golang-gvisor-gvisor-0.0~20240729.0/AUTHORS

# This is the list of gVisor authors for copyright purposes.
#
# This does not necessarily list everyone who has contributed code, since in
# some cases, their employer may be the copyright holder. To see the full list
# of contributors, see the revision history in source control.
#
# Please send a patch if you would like to be included in this list.

Google LLC

golang-gvisor-gvisor-0.0~20240729.0/LICENSE

Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner.
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) 
The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

------------------

Some files carry the following license, noted at the top of each file:

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

golang-gvisor-gvisor-0.0~20240729.0/README.md

# gVisor

This branch is a synthetic branch, containing only Go sources, that is compatible with standard Go tools. See the master branch for authoritative sources and tests.
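To illustrate the compatibility claim in the README, here is a minimal sketch of consumer code (hypothetical, not a file in this archive) that imports one of the packages shipped in this branch. It assumes `go get gvisor.dev/gvisor` resolves this module in the usual way; the `pkg/abi` package used here appears later in this snapshot.

```go
// main.go: hypothetical consumer of the synthetic gvisor.dev/gvisor branch.
package main

import (
	"fmt"

	// pkg/abi is one of the Go-only packages included in this snapshot.
	"gvisor.dev/gvisor/pkg/abi"
)

func main() {
	// abi.Linux implements fmt.Stringer, so this prints "linux".
	fmt.Println(abi.Linux)
}
```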
golang-gvisor-gvisor-0.0~20240729.0/go.mod

module gvisor.dev/gvisor

go 1.22.0

require (
	github.com/BurntSushi/toml v1.2.1
	github.com/bazelbuild/rules_go v0.44.2
	github.com/cenkalti/backoff v2.2.1+incompatible
	github.com/cilium/ebpf v0.12.3
	github.com/containerd/cgroups v1.0.1
	github.com/containerd/console v1.0.1
	github.com/containerd/containerd v1.4.13
	github.com/containerd/fifo v1.0.0
	github.com/containerd/go-runc v1.0.0
	github.com/containerd/typeurl v1.0.2
	github.com/coreos/go-systemd/v22 v22.5.0
	github.com/godbus/dbus/v5 v5.1.0
	github.com/gofrs/flock v0.8.0
	github.com/gogo/protobuf v1.3.2
	github.com/google/btree v1.1.2
	github.com/google/subcommands v1.0.2-0.20190508160503-636abe8753b8
	github.com/kr/pty v1.1.1
	github.com/mattbaird/jsonpatch v0.0.0-20171005235357-81af80346b1a
	github.com/mohae/deepcopy v0.0.0-20170308212314-bb9b5e7adda9
	github.com/opencontainers/runtime-spec v1.1.0-rc.1
	github.com/sirupsen/logrus v1.9.3
	github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
	github.com/vishvananda/netlink v1.1.1-0.20211118161826-650dca95af54
	golang.org/x/mod v0.14.0
	golang.org/x/sync v0.6.0
	golang.org/x/sys v0.17.0
	golang.org/x/time v0.5.0
	golang.org/x/tools v0.16.1
	google.golang.org/protobuf v1.32.0
	k8s.io/api v0.23.16
	k8s.io/apimachinery v0.23.16
	k8s.io/client-go v0.23.16
)

require (
	github.com/Microsoft/go-winio v0.6.0 // indirect
	github.com/Microsoft/hcsshim v0.8.14 // indirect
	github.com/containerd/continuity v0.3.0 // indirect
	github.com/containerd/ttrpc v1.1.0 // indirect
	github.com/davecgh/go-spew v1.1.1 // indirect
	github.com/docker/go-units v0.4.0 // indirect
	github.com/go-logr/logr v1.2.0 // indirect
	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
	github.com/golang/mock v1.7.0-rc.1 // indirect
	github.com/golang/protobuf v1.5.3 // indirect
	github.com/google/go-cmp v0.5.9 // indirect
	github.com/google/go-github/v56 v56.0.0 // indirect
	github.com/google/gofuzz v1.1.0 // indirect
	github.com/googleapis/gnostic v0.5.5 // indirect
	github.com/hanwen/go-fuse/v2 v2.3.0 // indirect
	github.com/hashicorp/errwrap v1.0.0 // indirect
	github.com/hashicorp/go-multierror v1.1.0 // indirect
	github.com/json-iterator/go v1.1.12 // indirect
	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
	github.com/modern-go/reflect2 v1.0.2 // indirect
	github.com/opencontainers/go-digest v1.0.0 // indirect
	github.com/pkg/errors v0.9.1 // indirect
	github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae // indirect
	go.opencensus.io v0.24.0 // indirect
	golang.org/x/exp v0.0.0-20230725093048-515e97ebf090 // indirect
	golang.org/x/net v0.20.0 // indirect
	golang.org/x/oauth2 v0.4.0 // indirect
	golang.org/x/term v0.16.0 // indirect
	golang.org/x/text v0.14.0 // indirect
	golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
	google.golang.org/appengine v1.6.7 // indirect
	google.golang.org/genproto v0.0.0-20230110181048-76db0878b65f // indirect
	google.golang.org/grpc v1.53.0-dev.0.20230123225046-4075ef07c5d5 // indirect
	google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.3.0 // indirect
	gopkg.in/inf.v0 v0.9.1 // indirect
	gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
	gopkg.in/yaml.v2 v2.4.0 // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
	gotest.tools/v3 v3.4.0 // indirect
	honnef.co/go/tools v0.4.2 // indirect
	k8s.io/klog/v2 v2.30.0 // indirect
	k8s.io/kube-openapi v0.0.0-20211115234752-e816edb12b65 // indirect
	k8s.io/utils v0.0.0-20211116205334-6203023598ed // indirect
	sigs.k8s.io/json v0.0.0-20211020170558-c049b76a60c6 // indirect
	sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
	sigs.k8s.io/yaml v1.2.0 // indirect
)

golang-gvisor-gvisor-0.0~20240729.0/go.sum

cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak= github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= github.com/Microsoft/go-winio v0.4.16-0.20201130162521-d1ffc52c7331/go.mod h1:XB6nPKklQyQ7GC9LdcBEcBl8PF76WugXOPRXwdLnMv0= github.com/Microsoft/go-winio v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg= github.com/Microsoft/go-winio v0.6.0/go.mod h1:cTAf44im0RAYeL23bpB+fzCyDH2MJiz2BO69KH/soAE= github.com/Microsoft/hcsshim v0.8.14 h1:lbPVK25c1cu5xTLITwpUcxoA9vKrKErASPYygvouJns= github.com/Microsoft/hcsshim v0.8.14/go.mod h1:NtVKoYxQuTLx6gEq0L96c9Ju4JbRJ4nY2ow3VK6a9Lg= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/bazelbuild/rules_go v0.27.0 h1:KViqR7qKXwz+LrNdIauCDU21kneCk+4DnYjpvlJwH50= github.com/bazelbuild/rules_go v0.27.0/go.mod h1:MC23Dc/wkXEyk3Wpq6lCqz0ZAYOZDw2DR5y3N1q2i7M= github.com/bazelbuild/rules_go v0.38.1 h1:YGNsLhWe18Ielebav7cClP3GMwBxBE+xEArLHtmXDx8= github.com/bazelbuild/rules_go v0.38.1/go.mod h1:TMHmtfpvyfsxaqfL9WnahCsXMWDMICTw7XeK9yVb+YU= github.com/bazelbuild/rules_go v0.44.2 h1:H2nzlC9VLKeVW1D90bahFSszpDE5qvtKr95Nz7BN0WQ= github.com/bazelbuild/rules_go v0.44.2/go.mod h1:Dhcz716Kqg1RHNWos+N6MlXNkjNP2EwZQ0LukRKJfMs= github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cilium/ebpf v0.0.0-20200110133405-4032b1d8aae3/go.mod h1:MA5e5Lr8slmEg9bt0VpxxWqJlO4iwu3FBdHUzV7wQVg= github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= github.com/cilium/ebpf v0.9.3 h1:5KtxXZU+scyERvkJMEm16TbScVvuuMrlhPly78ZMbSc= github.com/cilium/ebpf v0.9.3/go.mod h1:w27N4UjpaQ9X/DGrSugxUG+H+NhgntDuPb5lCzxCn8A= github.com/cilium/ebpf v0.12.3 h1:8ht6F9MquybnY97at+VDZb3eQQr8ev79RueWeVaEcG4= github.com/cilium/ebpf v0.12.3/go.mod h1:TctK1ivibvI3znr66ljgi4hqOT8EYQjz1KWBfb1UVgM= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/containerd/cgroups v0.0.0-20200531161412-0dbf7f05ba59/go.mod h1:pA0z1pT8KYB3TCXK/ocprsh7MAkoW8bZVzPdih9snmM= github.com/containerd/cgroups v1.0.1 h1:iJnMvco9XGvKUvNQkv88bE4uJXxRQH18efbKo9w5vHQ= github.com/containerd/cgroups v1.0.1/go.mod
h1:0SJrPIenamHDcZhEcJMNBB85rHcUsw4f25ZfBiPYRkU= github.com/containerd/console v0.0.0-20180822173158-c12b1e7919c1/go.mod h1:Tj/on1eG8kiEhd0+fhSDzsPAFESxzBBvdyEgyryXffw= github.com/containerd/console v1.0.1 h1:u7SFAJyRqWcG6ogaMAx3KjSTy1e3hT9QxqX7Jco7dRc= github.com/containerd/console v1.0.1/go.mod h1:XUsP6YE/mKtz6bxc+I8UiKKTP04qjQL4qcS3XoQ5xkw= github.com/containerd/containerd v1.3.2/go.mod h1:bC6axHOhabU15QhwfG7w5PipXdVtMXFTttgp+kVtyUA= github.com/containerd/containerd v1.4.13 h1:Z0CbagVdn9VN4K6htOCY/jApSw8YKP+RdLZ5dkXF8PM= github.com/containerd/containerd v1.4.13/go.mod h1:bC6axHOhabU15QhwfG7w5PipXdVtMXFTttgp+kVtyUA= github.com/containerd/continuity v0.0.0-20190426062206-aaeac12a7ffc/go.mod h1:GL3xCUCBDV3CZiTSEKksMWbLE66hEyuu9qyDOOqM47Y= github.com/containerd/continuity v0.3.0 h1:nisirsYROK15TAMVukJOUyGJjz4BNQJBVsNvAXZJ/eg= github.com/containerd/continuity v0.3.0/go.mod h1:wJEAIwKOm/pBZuBd0JmeTvnLquTB1Ag8espWhkykbPM= github.com/containerd/fifo v0.0.0-20190226154929-a9fb20d87448/go.mod h1:ODA38xgv3Kuk8dQz2ZQXpnv/UZZUHUCL7pnLehbXgQI= github.com/containerd/fifo v1.0.0 h1:6PirWBr9/L7GDamKr+XM0IeUFXu5mf3M/BPpH9gaLBU= github.com/containerd/fifo v1.0.0/go.mod h1:ocF/ME1SX5b1AOlWi9r677YJmCPSwwWnQ9O123vzpE4= github.com/containerd/go-runc v0.0.0-20180907222934-5a6d9f37cfa3/go.mod h1:IV7qH3hrUgRmyYrtgEeGWJfWbgcHL9CSRruz2Vqcph0= github.com/containerd/go-runc v1.0.0 h1:oU+lLv1ULm5taqgV/CJivypVODI4SUz1znWjv3nNYS0= github.com/containerd/go-runc v1.0.0/go.mod h1:cNU0ZbCgCQVZK4lgG3P+9tn9/PaJNmoDXPpoJhDR+Ok= github.com/containerd/ttrpc v0.0.0-20190828154514-0e0f228740de/go.mod h1:PvCDdDGpgqzQIzDW1TphrGLssLDZp2GuS+X5DkEJB8o= github.com/containerd/ttrpc v1.1.0 h1:GbtyLRxb0gOLR0TYQWt3O6B0NvT8tMdorEHqIQo/lWI= github.com/containerd/ttrpc v1.1.0/go.mod h1:XX4ZTnoOId4HklF4edwc4DcqskFZuvXB1Evzy5KFQpQ= github.com/containerd/typeurl v0.0.0-20180627222232-a93fcdb778cd/go.mod h1:Cm3kwCdlkCfMSHURc+r6fwoGH6/F1hH3S4sg0rLFWPc= github.com/containerd/typeurl v1.0.2 h1:Chlt8zIieDbzQFzXzAeBEF92KhExuE4p9p92/QmY7aY= github.com/containerd/typeurl v1.0.2/go.mod h1:9trJWW2sRlGub4wZJRTW83VtbOLS6hwcDZXTn6oPz9s= github.com/coreos/go-systemd/v22 v22.0.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk= github.com/coreos/go-systemd/v22 v22.1.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk= github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= 
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= github.com/frankban/quicktest v1.14.0 h1:+cqqvzZV87b4adx/5ayVOaYZ2CrvM4ejQvUdBzPPUss= github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA= github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= github.com/getkin/kin-openapi v0.76.0/go.mod h1:660oXbgy5JFMKreazJaQTw7o+X00qeSyhcnluiMv+Xg= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= github.com/go-logr/logr v0.2.0/go.mod h1:z6/tIYblkpsD+a4lm/fGIIU9mZ+XfAiaFtq7xTgseGU= github.com/go-logr/logr v1.2.0 h1:QK40JKJyMdUDz+h+xvCsru/bJhvG0UxvePV0ufL/AcE= github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= github.com/go-openapi/jsonreference v0.19.3/go.mod h1:rjx6GuL8TTa9VaixXglHmQmIL98+wF9xc8zWvFonSJ8= github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.0.4 h1:9349emZab16e7zQvpmsbtjc18ykshndd8y2PG3sgJbA= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gofrs/flock v0.8.0 h1:MSdYClljsF3PbENUUEx85nkWfJSGfzYI9yEBZOJz6CY= github.com/gofrs/flock v0.8.0/go.mod h1:F1TvTiK9OcQqauNUHlbJvyl9Qa1QvF/gOUDKA14jxHU= github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.7.0-rc.1 h1:YojYx61/OLFsiv6Rw1Z96LpldJIy31o+UHmwAUMJ6/U= github.com/golang/mock v1.7.0-rc.1/go.mod h1:s42URUywIqd+OcERslBJvOjepvNymP31m3q8d/GkuRs= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf 
v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/google/btree v1.0.1 h1:gK4Kx5IaGY9CD5sPJ36FHiBJ6ZXl0kilRiiCj+jdYp4= github.com/google/btree v1.0.1/go.mod h1:xXMiIv4Fb/0kKde4SpL7qlzvu5cMJDRkFDxJfI9uaxA= github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU= github.com/google/btree v1.1.2/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-github/v56 v56.0.0 h1:TysL7dMa/r7wsQi44BjqlwaHvwlFlqkK8CtBWCX3gb4= github.com/google/go-github/v56 v56.0.0/go.mod h1:D8cdcX98YWJvi7TLo7zM4/h8ZTx6u6fwGEkCdisopo0= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g= github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/subcommands v1.0.2-0.20190508160503-636abe8753b8 h1:8nlgEAjIalk6uj/CGKCdOO8CQqTeysvcW4RFZ6HbkGM= github.com/google/subcommands v1.0.2-0.20190508160503-636abe8753b8/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gnostic v0.5.1/go.mod h1:6U4PtQXGIEt/Z3h5MAT7FNofLnw9vXk2cUuW7uA/OeU= github.com/googleapis/gnostic v0.5.5 h1:9fHAtK0uDfpveeqqo1hkEZJcFvYXAiCN3UutL8F9xHw= github.com/googleapis/gnostic v0.5.5/go.mod h1:7+EbHbldMins07ALC74bsA81Ovc97DwqyJO1AENw9kA= github.com/gorilla/mux v1.8.0/go.mod 
h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/hanwen/go-fuse/v2 v2.3.0 h1:t5ivNIH2PK+zw4OBul/iJjsoG9K6kXo4nMDoBpciC8A= github.com/hanwen/go-fuse/v2 v2.3.0/go.mod h1:xKwi1cF7nXAOBCXujD5ie0ZKsxc8GGSA1rlMJc+8IJs= github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-multierror v1.1.0 h1:B9UzwGQJehnUY1yNrnwREHc3fGbC2xefo8g4TbElacI= github.com/hashicorp/go-multierror v1.1.0/go.mod h1:spPvp8C1qA32ftKqdAHm4hHTbPw+vmowP0z+KUhOZdA= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pty v1.1.1 h1:VkoXIwSboBpnk99O/KFauAEILuNHv5DVFKZMBN/gUgw= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kylelemons/godebug v0.0.0-20170820004349-d65d576e9348/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mattbaird/jsonpatch v0.0.0-20171005235357-81af80346b1a h1:+J2gw7Bw77w/fbK7wnNJJDKmw1IbWft2Ul5BzrG1Qm8= github.com/mattbaird/jsonpatch v0.0.0-20171005235357-81af80346b1a/go.mod h1:M1qoD/MqPgTZIk0EWKB38wE28ACRfVcn+cU08jyArI0= github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/moby/sys/mountinfo v0.6.2/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= 
github.com/mohae/deepcopy v0.0.0-20170308212314-bb9b5e7adda9 h1:Sha2bQdoWE5YQPTlJOL31rmce94/tYi113SlFo1xQ2c= github.com/mohae/deepcopy v0.0.0-20170308212314-bb9b5e7adda9/go.mod h1:TaXosZuwdSHYgviHp1DAtfrULt5eUgsSMsZf+YrPgl8= github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78= github.com/onsi/ginkgo v0.0.0-20170829012221-11459a886d9c/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.14.0 h1:2mOpI4JVVPBN+WQRa0WKH2eXR+Ey+uK4n7Zj0aYpIQA= github.com/onsi/gomega v0.0.0-20170829124025-dcabb60a477c/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA= github.com/onsi/gomega v1.10.1 h1:o0+MgICZLuZ7xjH7Vx6zS/zcu93/BEp1VwkIW1mEXCE= github.com/opencontainers/go-digest v0.0.0-20180430190053-c9281466c8b2/go.mod h1:cMLVZDEM3+U2I4VmLI6N8jQYUd2OVphdqWwCJHrFt2s= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/runc v0.0.0-20190115041553-12f6a991201f/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U= github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.1.0-rc.1 h1:wHa9jroFfKGQqFHj0I1fMRKLl0pfj+ynAqBxo3v6u9w= github.com/opencontainers/runtime-spec v1.1.0-rc.1/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/procfs v0.0.0-20180125133057-cb4147076ac7/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.6.0 h1:mxy4L2jP6qMonqmq+aTtOx1ifVWUgG/TAmntgbh3xv4= github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/sirupsen/logrus v1.4.1/go.mod h1:ni0Sbl8bgC9z8RoU9G6nDWqqs/fq4eDPysMBDgk/93Q= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE= github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod 
h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= github.com/urfave/cli v1.22.2/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= github.com/vishvananda/netlink v1.1.1-0.20211118161826-650dca95af54 h1:8mhqcHPqTMhSPoslhGYihEgSfc77+7La1P6kiB6+9So= github.com/vishvananda/netlink v1.1.1-0.20211118161826-650dca95af54/go.mod h1:twkDnbuQxJYemMlGd4JFIcuhgX83tXhKS2B/PRMpOho= github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae h1:4hwBBUfQCFe3Cym0ZtKyq7L16eZUtYKs+BaHDN6mAns= github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae/go.mod h1:DD4vA1DwXk04H54A1oHXtwZmA0grkVMdPxx/VGLCah0= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.1/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20230725093048-515e97ebf090 h1:Di6/M8l0O2lCLc6VVRWhgCiApHV8MnQurBnFSHsQtNY= golang.org/x/exp 
v0.0.0-20230725093048-515e97ebf090/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.5.1/go.mod h1:5OXOZSfqPIIbmVBIIKWRFfZjPR0E5r58TLhUjH0a2Ro= golang.org/x/mod v0.7.0 h1:LapD9S96VoQRhi/GrNTqeBJFrUjs5UHCAtTlgwA5oZA= golang.org/x/mod v0.7.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.11.0 h1:bUO06HqtnRcc/7l71XBe4WcqTZ+3AH1J59zWDDwLKgU= golang.org/x/mod v0.11.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.12.0 h1:rmsUpXtvNzj340zd98LZ4KntptpfRHwpFOHG188oHXc= golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY= golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/mod v0.14.0 h1:dGoOF9QVLYng8IHTm7BAyWqCqSheQ5pYWGhzW00YJr0= golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191004110552-13f9640d40b9/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.0.0-20211015210444-4f30a5c0130f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.5.0 h1:GyT4nK/YDHSqa1c4753ouYCDajOYKTja9Xb/OHtgvSw= golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= golang.org/x/net v0.15.0 h1:ugBLEUaxABaB5AJqW9enI0ACdci2RUd4eP51NTBvuJ8= golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.16.0 h1:7eBu7KsSvFDtSXUIDbh3aqlK4DPsZ1rByC8PFfBThos= golang.org/x/net v0.16.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= 
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= golang.org/x/net v0.20.0 h1:aCL9BSgETF1k+blQaYUBx9hJ9LOGP3gAVemcZlf1Kpo= golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.4.0 h1:NF0gk8LVPg1Ml7SSbGyySuoxdsXitj7TvgvuRxIMc/M= golang.org/x/oauth2 v0.4.0/go.mod h1:RznEsdpjGAINPTOF0UH/t+xJ75L18YO3Ho6Pyn+uRec= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.4.0 h1:zxkM55ReGkDlKSM+Fu41A+zmbZuaPVbGMzvvdUPznYQ= golang.org/x/sync v0.4.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191022100944-742c48ecaeb7/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200120151820-655fe14d7479/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200217220822-9197077df867/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200728102440-3e129f6d46b1/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200916030750-2334cc1a136f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201201145000-ef89a241ccb3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211019181941-9d821ace8654/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.4.0 h1:Zr2JFtRQNX3BCZ8YtxRE9hNJYC8J6I1MVbMg6owUp18= golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.4.0 h1:O7UWfv5+A2qiuulQk30kVinPoMtoIPeVaKLEgLpVkvg= golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.12.0 h1:/ZfYdc3zq+q02Rv9vGqTeSItdzZTSNDmfTi0mBAuidU= golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek= golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= golang.org/x/term v0.16.0 h1:m+B6fahuftsE9qjo0VWp2FW0mB3MTJvR0BaMQrq0pmE= golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.6.0 h1:3XmdazWV+ubf7QgHSTWeykHOci5oeekaGJBLkrkaw4k= golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text 
v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44= golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200505023115-26f46d2f7ef8/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.8/go.mod h1:nABZi5QlRsZVlzPpHl034qft6wpY4eDcsTt5AaioBiU= golang.org/x/tools v0.5.0 h1:+bSpV5HIeWkuvgaMfI3UmKRThoTA5ODJTUd8T17NO+4= golang.org/x/tools v0.5.0/go.mod h1:N+Kgy78s5I24c24dU8OfWNEotWjutIs8SnJvn5IDq+k= golang.org/x/tools v0.13.0 h1:Iey4qkscZuv0VvIt8E0neZjtPVQFSc870HQ448QgEmQ= golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= golang.org/x/tools v0.14.0 h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc= golang.org/x/tools v0.14.0/go.mod h1:uYBEerGOWcJyEORxN+Ek8+TT266gXkNlHdJBwexUsBg= golang.org/x/tools v0.16.1 h1:TLyB3WofjdOEepBHAU20JdNC1Zbg87elYofWYAY5oZA= golang.org/x/tools v0.16.1/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk= golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine 
v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= google.golang.org/genproto v0.0.0-20201019141844-1ed22bb0c154/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20230110181048-76db0878b65f h1:BWUVssLB0HVOSY78gIdvk1dTVYtT1y8SBWtPYuTJ/6w= google.golang.org/genproto v0.0.0-20230110181048-76db0878b65f/go.mod h1:RGgjbofJ8xD9Sq1VVhDM1Vok1vRONV+rg+CjzG4SZKM= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.23.1/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= google.golang.org/grpc v1.53.0-dev.0.20230123225046-4075ef07c5d5 h1:qq9WB3Dez2tMAKtZTVtZsZSmTkDgPeXx+FRPt5kLEkM= google.golang.org/grpc v1.53.0-dev.0.20230123225046-4075ef07c5d5/go.mod h1:OnIrk0ipVdj4N5d9IUoFUx72/VlD7+jUsHwZgwSMQpw= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.3.0 h1:rNBFJjBCOgVr9pWD7rs/knKL4FRTKgpZmsRfV214zcA= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.3.0/go.mod h1:Dk1tviKTvMCz5tvh7t+fh94dhmQVHuCt2OzJB3CTW9Y= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= 
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.2-0.20230118093459-a9481185b34d h1:qp0AnQCvRCMlu9jBjtdbTaaEmThIgZOrbVyDEOcmKhQ= google.golang.org/protobuf v1.28.2-0.20230118093459-a9481185b34d/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.32.0 h1:pPC6BG5ex8PDFnkbrGU3EixyhKcQ2aDuBS36lqK/C7I= google.golang.org/protobuf v1.32.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o= gotest.tools/v3 v3.4.0/go.mod h1:CtbdzLSsqVhDgMtKsx03ird5YTGB3ar27v0u/yKBW5g= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.4.2 h1:6qXr+R5w+ktL5UkwEbPp+fEvfyoMPche6GkOpGHZcLc= honnef.co/go/tools v0.4.2/go.mod h1:36ZgoUOrqOk1GxwHhyryEkq8FQWkUO2xGuSMhUCcdvA= k8s.io/api v0.23.16 h1:op+yeqZLQxDt2tEnrOP9Y+WA7l4Lxh+7R0IWEzyuk2I= k8s.io/api v0.23.16/go.mod h1:Fk/eWEGf3ZYZTCVLbsgzlxekG6AtnT3QItT3eOSyFRE= k8s.io/apimachinery v0.23.16 h1:f6Q+3qYv3qWvbDZp2iUhwC2rzMRBkSb7JYBhmeVK5pc= k8s.io/apimachinery v0.23.16/go.mod h1:RMMUoABRwnjoljQXKJ86jT5FkTZPPnZsNv70cMsKIP0= k8s.io/client-go v0.23.16 h1:9NyRabEbkE9/7Rc3ZI8kMYfH3kocUD+wEBifaTn6lyU= k8s.io/client-go v0.23.16/go.mod h1:CUfIIQL+hpzxnD9nxiVGb99BNTp00mPFp3Pk26sTFys= 
k8s.io/gengo v0.0.0-20210813121822-485abfe95c7c/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAEV2be7d5xI0vBa/VySYy3E= k8s.io/klog/v2 v2.0.0/go.mod h1:PBfzABfn139FHAV07az/IF9Wp1bkk3vpT2XSJ76fSDE= k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y= k8s.io/klog/v2 v2.30.0 h1:bUO6drIvCIsvZ/XFgfxoGFQU/a4Qkh0iAlvUR7vlHJw= k8s.io/klog/v2 v2.30.0/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= k8s.io/kube-openapi v0.0.0-20211115234752-e816edb12b65 h1:E3J9oCLlaobFUqsjG9DfKbP2BmgwBL2p7pn0A3dG9W4= k8s.io/kube-openapi v0.0.0-20211115234752-e816edb12b65/go.mod h1:sX9MT8g7NVZM5lVL/j8QyCCJe8YSMW30QvGZWaCIDIk= k8s.io/utils v0.0.0-20210802155522-efc7438f0176/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= k8s.io/utils v0.0.0-20211116205334-6203023598ed h1:ck1fRPWPJWsMd8ZRFsWc6mh/zHp5fZ/shhbrgPUxDAE= k8s.io/utils v0.0.0-20211116205334-6203023598ed/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= sigs.k8s.io/json v0.0.0-20211020170558-c049b76a60c6 h1:fD1pz4yfdADVNfFmcP2aBEtudwUQ1AlLnRBALr33v3s= sigs.k8s.io/json v0.0.0-20211020170558-c049b76a60c6/go.mod h1:p4QtZmO4uMYipTQNzagwnNoseA6OxSUutVw05NhYDRs= sigs.k8s.io/structured-merge-diff/v4 v4.0.2/go.mod h1:bJZC9H9iH24zzfZ/41RGcq60oK1F7G282QMXDPYydCw= sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= sigs.k8s.io/yaml v1.2.0 h1:kr/MCeFWJWTwyaHoR9c8EjH9OumOmoF9YGiZd7lFm/Q= sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc= golang-gvisor-gvisor-0.0~20240729.0/pkg/000077500000000000000000000000001465435605700174245ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/000077500000000000000000000000001465435605700201575ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/abi.go000066400000000000000000000022311465435605700212370ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package abi describes the interface between a kernel and userspace. package abi import ( "fmt" ) // OS describes the target operating system for an ABI. // // Note that OS is architecture-independent. The details of the OS ABI will // vary between architectures. type OS int const ( // Linux is the Linux ABI. Linux OS = iota ) // String implements fmt.Stringer. func (o OS) String() string { switch o { case Linux: return "linux" default: return fmt.Sprintf("OS(%d)", o) } } // ABI is an interface that defines OS-specific interactions. type ABI interface { } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/abi_linux.go000066400000000000000000000012671465435605700224660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package abi // Host specifies the host ABI. const Host = Linux golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/abi_linux_state_autogen.go000066400000000000000000000001271465435605700254020ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux // +build linux package abi golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/abi_state_autogen.go000066400000000000000000000000651465435605700241640ustar00rootroot00000000000000// automatically generated by stateify. package abi golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/flag.go000066400000000000000000000040571465435605700214250ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package abi import ( "fmt" "math" "strconv" "strings" ) // A FlagSet is a slice of bit-flags and their name. type FlagSet []struct { Flag uint64 Name string } // Parse returns a pretty version of val, using the flag names for known flags. // Unknown flags remain numeric. func (s FlagSet) Parse(val uint64) string { var flags []string for _, f := range s { if val&f.Flag == f.Flag { flags = append(flags, f.Name) val &^= f.Flag } } if val != 0 { flags = append(flags, "0x"+strconv.FormatUint(val, 16)) } if len(flags) == 0 { // Prefer 0 to an empty string. return "0x0" } return strings.Join(flags, "|") } // ValueSet is a map of syscall values to their name. Parse will use the name // or the value if unknown. type ValueSet map[uint64]string // Parse returns the name of the value associated with `val`. Unknown values // are converted to hex. func (s ValueSet) Parse(val uint64) string { if v, ok := s[val]; ok { return v } return fmt.Sprintf("%#x", val) } // ParseDecimal returns the name of the value associated with `val`. Unknown // values are converted to decimal. func (s ValueSet) ParseDecimal(val uint64) string { if v, ok := s[val]; ok { return v } return fmt.Sprintf("%d", val) } // ParseName returns the flag value associated with 'name'. Returns false // if no value is found. func (s ValueSet) ParseName(name string) (uint64, bool) { for k, v := range s { if v == name { return k, true } } return math.MaxUint64, false } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/gasket/000077500000000000000000000000001465435605700214355ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/gasket/gasket.go000066400000000000000000000115161465435605700232460ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package gasket describes the userspace interface for Gasket devices. package gasket import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" ) // Ioctl represents a gasket ioctl command. type Ioctl uint32 // From https://github.com/tensorflow/tpu/blob/master/tools/driver/include/linux/google/gasket.h var ( GASKET_IOCTL_BASE = uint32(0xDC) GASKET_IOCTL_RESET = Ioctl(linux.IOW(GASKET_IOCTL_BASE, 0, SizeOfUnsignedLong)) GASKET_IOCTL_SET_EVENTFD = Ioctl(linux.IOW(GASKET_IOCTL_BASE, 1, SizeofGasketInterruptEventFd)) GASKET_IOCTL_CLEAR_EVENTFD = Ioctl(linux.IOW(GASKET_IOCTL_BASE, 2, SizeOfUnsignedLong)) GASKET_IOCTL_NUMBER_PAGE_TABLES = Ioctl(linux.IOR(GASKET_IOCTL_BASE, 4, SizeOfUnsignedLong)) GASKET_IOCTL_PAGE_TABLE_SIZE = Ioctl(linux.IOWR(GASKET_IOCTL_BASE, 5, SizeofGasketPageTableIoctl)) GASKET_IOCTL_SIMPLE_PAGE_TABLE_SIZE = Ioctl(linux.IOWR(GASKET_IOCTL_BASE, 6, SizeofGasketPageTableIoctl)) GASKET_IOCTL_PARTITION_PAGE_TABLE = Ioctl(linux.IOW(GASKET_IOCTL_BASE, 7, SizeofGasketPageTableIoctl)) GASKET_IOCTL_MAP_BUFFER = Ioctl(linux.IOW(GASKET_IOCTL_BASE, 8, SizeofGasketPageTableIoctl)) GASKET_IOCTL_UNMAP_BUFFER = Ioctl(linux.IOW(GASKET_IOCTL_BASE, 9, SizeofGasketPageTableIoctl)) GASKET_IOCTL_CLEAR_INTERRUPT_COUNTS = Ioctl(linux.IO(GASKET_IOCTL_BASE, 10)) GASKET_IOCTL_REGISTER_INTERRUPT = Ioctl(linux.IOW(GASKET_IOCTL_BASE, 11, SizeofGasketInterruptMapping)) GASKET_IOCTL_UNREGISTER_INTERRUPT = Ioctl(linux.IOW(GASKET_IOCTL_BASE, 12, SizeOfUnsignedLong)) GASKET_IOCTL_MAP_DMA_BUF = Ioctl(linux.IOW(GASKET_IOCTL_BASE, 13, SizeofGasketPageTableDmaBufIoctl)) ) func (i Ioctl) String() string { switch i { case GASKET_IOCTL_RESET: return "GASKET_IOCTL_RESET" case GASKET_IOCTL_SET_EVENTFD: return "GASKET_IOCTL_SET_EVENTFD" case GASKET_IOCTL_CLEAR_EVENTFD: return "GASKET_IOCTL_CLEAR_EVENTFD" case GASKET_IOCTL_NUMBER_PAGE_TABLES: return "GASKET_IOCTL_NUMBER_PAGE_TABLES" case GASKET_IOCTL_PAGE_TABLE_SIZE: return "GASKET_IOCTL_PAGE_TABLE_SIZE" case GASKET_IOCTL_SIMPLE_PAGE_TABLE_SIZE: return "GASKET_IOCTL_SIMPLE_PAGE_TABLE_SIZE" case GASKET_IOCTL_PARTITION_PAGE_TABLE: return "GASKET_IOCTL_PARTITION_PAGE_TABLE" case GASKET_IOCTL_MAP_BUFFER: return "GASKET_IOCTL_MAP_BUFFER" case GASKET_IOCTL_UNMAP_BUFFER: return "GASKET_IOCTL_UNMAP_BUFFER" case GASKET_IOCTL_CLEAR_INTERRUPT_COUNTS: return "GASKET_IOCTL_CLEAR_INTERRUPT_COUNTS" case GASKET_IOCTL_REGISTER_INTERRUPT: return "GASKET_IOCTL_REGISTER_INTERRUPT" case GASKET_IOCTL_UNREGISTER_INTERRUPT: return "GASKET_IOCTL_UNREGISTER_INTERRUPT" case GASKET_IOCTL_MAP_DMA_BUF: return "GASKET_IOCTL_MAP_DMA_BUF" default: return fmt.Sprintf("UNKNOWN GASKET COMMAND %d", uint32(i)) } } // GasketInterruptEventFd is the common structure for ioctls associating an // eventfd with a device interrupt, when using the Gasket interrupt module. // // +marshal type GasketInterruptEventFd struct { Interrupt uint64 EventFD uint64 } // GasketPageTableIoctl is a common structure for ioctls mapping and unmapping // buffers when using the Gasket page_table module. 
// // +marshal type GasketPageTableIoctl struct { PageTableIndex uint64 Size uint64 HostAddress uint64 DeviceAddress uint64 } // GasketInterruptMapping is a structure for ioctls associating an eventfd and // interrupt controlling bar register with a device interrupt, when using the // Gasket interrupt module. // // +marshal type GasketInterruptMapping struct { Interrupt uint64 EventFD uint64 BarIndex uint64 RegOffset uint64 } // GasketPageTableDmaBufIoctl is a structure for dma_buf mapping ioctl // parameters. // // +marshal type GasketPageTableDmaBufIoctl struct { PageTableIndex uint64 DeviceAddress uint64 DMABufID int32 `marshal:"unaligned"` // Struct ends mid 64bit word. } // Ioctl parameter struct sizes. var ( SizeofGasketInterruptEventFd = uint32((*GasketInterruptEventFd)(nil).SizeBytes()) SizeofGasketPageTableIoctl = uint32((*GasketPageTableIoctl)(nil).SizeBytes()) SizeofGasketInterruptMapping = uint32((*GasketInterruptMapping)(nil).SizeBytes()) SizeofGasketPageTableDmaBufIoctl = uint32((*GasketPageTableDmaBufIoctl)(nil).SizeBytes()) SizeOfUnsignedLong = uint32(8) ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/gasket/gasket_abi_autogen_unsafe.go000066400000000000000000000371531465435605700271510ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package gasket import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*GasketInterruptEventFd)(nil) var _ marshal.Marshallable = (*GasketInterruptMapping)(nil) var _ marshal.Marshallable = (*GasketPageTableDmaBufIoctl)(nil) var _ marshal.Marshallable = (*GasketPageTableIoctl)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. func (g *GasketInterruptEventFd) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (g *GasketInterruptEventFd) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(g.Interrupt)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(g.EventFD)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (g *GasketInterruptEventFd) UnmarshalBytes(src []byte) []byte { g.Interrupt = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] g.EventFD = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (g *GasketInterruptEventFd) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (g *GasketInterruptEventFd) MarshalUnsafe(dst []byte) []byte { size := g.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(g), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (g *GasketInterruptEventFd) UnmarshalUnsafe(src []byte) []byte { size := g.SizeBytes() gohacks.Memmove(unsafe.Pointer(g), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (g *GasketInterruptEventFd) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(g))) hdr.Len = g.SizeBytes() hdr.Cap = g.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that g // must live until the use above. runtime.KeepAlive(g) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (g *GasketInterruptEventFd) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return g.CopyOutN(cc, addr, g.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (g *GasketInterruptEventFd) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(g))) hdr.Len = g.SizeBytes() hdr.Cap = g.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that g // must live until the use above. runtime.KeepAlive(g) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (g *GasketInterruptEventFd) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return g.CopyInN(cc, addr, g.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (g *GasketInterruptEventFd) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(g))) hdr.Len = g.SizeBytes() hdr.Cap = g.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that g // must live until the use above. runtime.KeepAlive(g) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (g *GasketInterruptMapping) SizeBytes() int { return 32 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (g *GasketInterruptMapping) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(g.Interrupt)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(g.EventFD)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(g.BarIndex)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(g.RegOffset)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (g *GasketInterruptMapping) UnmarshalBytes(src []byte) []byte { g.Interrupt = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] g.EventFD = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] g.BarIndex = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] g.RegOffset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (g *GasketInterruptMapping) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (g *GasketInterruptMapping) MarshalUnsafe(dst []byte) []byte { size := g.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(g), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (g *GasketInterruptMapping) UnmarshalUnsafe(src []byte) []byte { size := g.SizeBytes() gohacks.Memmove(unsafe.Pointer(g), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (g *GasketInterruptMapping) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(g))) hdr.Len = g.SizeBytes() hdr.Cap = g.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that g // must live until the use above. runtime.KeepAlive(g) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (g *GasketInterruptMapping) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return g.CopyOutN(cc, addr, g.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (g *GasketInterruptMapping) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(g))) hdr.Len = g.SizeBytes() hdr.Cap = g.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that g // must live until the use above. runtime.KeepAlive(g) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (g *GasketInterruptMapping) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return g.CopyInN(cc, addr, g.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (g *GasketInterruptMapping) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(g))) hdr.Len = g.SizeBytes() hdr.Cap = g.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that g // must live until the use above. runtime.KeepAlive(g) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (g *GasketPageTableDmaBufIoctl) SizeBytes() int { return 20 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (g *GasketPageTableDmaBufIoctl) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(g.PageTableIndex)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(g.DeviceAddress)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(g.DMABufID)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (g *GasketPageTableDmaBufIoctl) UnmarshalBytes(src []byte) []byte { g.PageTableIndex = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] g.DeviceAddress = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] g.DMABufID = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (g *GasketPageTableDmaBufIoctl) Packed() bool { return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (g *GasketPageTableDmaBufIoctl) MarshalUnsafe(dst []byte) []byte { // Type GasketPageTableDmaBufIoctl doesn't have a packed layout in memory, fallback to MarshalBytes. return g.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (g *GasketPageTableDmaBufIoctl) UnmarshalUnsafe(src []byte) []byte { // Type GasketPageTableDmaBufIoctl doesn't have a packed layout in memory, fallback to UnmarshalBytes. return g.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (g *GasketPageTableDmaBufIoctl) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type GasketPageTableDmaBufIoctl doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(g.SizeBytes()) // escapes: okay. g.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (g *GasketPageTableDmaBufIoctl) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return g.CopyOutN(cc, addr, g.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (g *GasketPageTableDmaBufIoctl) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type GasketPageTableDmaBufIoctl doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(g.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. g.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (g *GasketPageTableDmaBufIoctl) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return g.CopyInN(cc, addr, g.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (g *GasketPageTableDmaBufIoctl) WriteTo(writer io.Writer) (int64, error) { // Type GasketPageTableDmaBufIoctl doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, g.SizeBytes()) g.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (g *GasketPageTableIoctl) SizeBytes() int { return 32 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (g *GasketPageTableIoctl) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(g.PageTableIndex)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(g.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(g.HostAddress)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(g.DeviceAddress)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (g *GasketPageTableIoctl) UnmarshalBytes(src []byte) []byte { g.PageTableIndex = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] g.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] g.HostAddress = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] g.DeviceAddress = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (g *GasketPageTableIoctl) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (g *GasketPageTableIoctl) MarshalUnsafe(dst []byte) []byte { size := g.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(g), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (g *GasketPageTableIoctl) UnmarshalUnsafe(src []byte) []byte { size := g.SizeBytes() gohacks.Memmove(unsafe.Pointer(g), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (g *GasketPageTableIoctl) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(g))) hdr.Len = g.SizeBytes() hdr.Cap = g.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that g // must live until the use above. runtime.KeepAlive(g) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (g *GasketPageTableIoctl) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return g.CopyOutN(cc, addr, g.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (g *GasketPageTableIoctl) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(g))) hdr.Len = g.SizeBytes() hdr.Cap = g.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that g // must live until the use above. runtime.KeepAlive(g) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (g *GasketPageTableIoctl) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return g.CopyInN(cc, addr, g.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (g *GasketPageTableIoctl) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(g))) hdr.Len = g.SizeBytes() hdr.Cap = g.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that g // must live until the use above. runtime.KeepAlive(g) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/gasket/gasket_state_autogen.go000066400000000000000000000000701465435605700261610ustar00rootroot00000000000000// automatically generated by stateify. package gasket golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/000077500000000000000000000000001465435605700213165ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/aio.go000066400000000000000000000033021465435605700224130ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package linux import "encoding/binary" // AIORingSize is sizeof(struct aio_ring). const AIORingSize = 32 // I/O commands. const ( IOCB_CMD_PREAD = 0 IOCB_CMD_PWRITE = 1 IOCB_CMD_FSYNC = 2 IOCB_CMD_FDSYNC = 3 // 4 was the experimental IOCB_CMD_PREADX. IOCB_CMD_POLL = 5 IOCB_CMD_NOOP = 6 IOCB_CMD_PREADV = 7 IOCB_CMD_PWRITEV = 8 ) // I/O flags. const ( IOCB_FLAG_RESFD = 1 IOCB_FLAG_IOPRIO = 2 ) // IOCallback describes an I/O request. // // The priority field is currently ignored in the implementation below. Also // note that the IOCB_FLAG_RESFD feature is not supported. // // +marshal type IOCallback struct { Data uint64 Key uint32 _ uint32 OpCode uint16 ReqPrio int16 FD int32 Buf uint64 Bytes uint64 Offset int64 Reserved2 uint64 Flags uint32 // eventfd to signal if IOCB_FLAG_RESFD is set in flags. ResFD int32 } // IOEvent describes an I/O result. // // +marshal // +stateify savable type IOEvent struct { Data uint64 Obj uint64 Result int64 Result2 int64 } // IOEventSize is the size of an ioEvent encoded. var IOEventSize = binary.Size(IOEvent{}) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/arch_amd64.go000066400000000000000000000014351465435605700235600ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package linux // Start and end addresses of the vsyscall page. const ( VSyscallStartAddr uint64 = 0xffffffffff600000 VSyscallEndAddr uint64 = 0xffffffffff601000 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/audit.go000066400000000000000000000015201465435605700227510ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Audit numbers identify different system call APIs, from const ( // AUDIT_ARCH_X86_64 identifies AMD64. AUDIT_ARCH_X86_64 = 0xc000003e // AUDIT_ARCH_AARCH64 identifies ARM64. AUDIT_ARCH_AARCH64 = 0xc00000b7 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/bpf.go000066400000000000000000000022511465435605700224140ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // BPFInstruction is a raw BPF virtual machine instruction. // // +marshal slice:BPFInstructionSlice // +stateify savable type BPFInstruction struct { // OpCode is the operation to execute. OpCode uint16 // JumpIfTrue is the number of instructions to skip if OpCode is a // conditional instruction and the condition is true. JumpIfTrue uint8 // JumpIfFalse is the number of instructions to skip if OpCode is a // conditional instruction and the condition is false. JumpIfFalse uint8 // K is a constant parameter. The meaning depends on the value of OpCode. K uint32 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/capability.go000066400000000000000000000210621465435605700237670ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "strings" ) // A Capability represents the ability to perform a privileged operation. type Capability int // Capabilities defined by Linux. Taken from the kernel's // include/uapi/linux/capability.h. See capabilities(7) or that file for more // detailed capability descriptions. const ( CAP_CHOWN = Capability(0) CAP_DAC_OVERRIDE = Capability(1) CAP_DAC_READ_SEARCH = Capability(2) CAP_FOWNER = Capability(3) CAP_FSETID = Capability(4) CAP_KILL = Capability(5) CAP_SETGID = Capability(6) CAP_SETUID = Capability(7) CAP_SETPCAP = Capability(8) CAP_LINUX_IMMUTABLE = Capability(9) CAP_NET_BIND_SERVICE = Capability(10) CAP_NET_BROADCAST = Capability(11) CAP_NET_ADMIN = Capability(12) CAP_NET_RAW = Capability(13) CAP_IPC_LOCK = Capability(14) CAP_IPC_OWNER = Capability(15) CAP_SYS_MODULE = Capability(16) CAP_SYS_RAWIO = Capability(17) CAP_SYS_CHROOT = Capability(18) CAP_SYS_PTRACE = Capability(19) CAP_SYS_PACCT = Capability(20) CAP_SYS_ADMIN = Capability(21) CAP_SYS_BOOT = Capability(22) CAP_SYS_NICE = Capability(23) CAP_SYS_RESOURCE = Capability(24) CAP_SYS_TIME = Capability(25) CAP_SYS_TTY_CONFIG = Capability(26) CAP_MKNOD = Capability(27) CAP_LEASE = Capability(28) CAP_AUDIT_WRITE = Capability(29) CAP_AUDIT_CONTROL = Capability(30) CAP_SETFCAP = Capability(31) CAP_MAC_OVERRIDE = Capability(32) CAP_MAC_ADMIN = Capability(33) CAP_SYSLOG = Capability(34) CAP_WAKE_ALARM = Capability(35) CAP_BLOCK_SUSPEND = Capability(36) CAP_AUDIT_READ = Capability(37) CAP_PERFMON = Capability(38) CAP_BPF = Capability(39) CAP_CHECKPOINT_RESTORE = Capability(40) // CAP_LAST_CAP is the highest-numbered capability. // Search for "CAP_LAST_CAP" to find other places that need to change. CAP_LAST_CAP = CAP_CHECKPOINT_RESTORE ) // Ok returns true if cp is a supported capability. 
func (cp Capability) Ok() bool { return cp >= 0 && cp <= CAP_LAST_CAP } // String returns the capability name. func (cp Capability) String() string { switch cp { case CAP_CHOWN: return "CAP_CHOWN" case CAP_DAC_OVERRIDE: return "CAP_DAC_OVERRIDE" case CAP_DAC_READ_SEARCH: return "CAP_DAC_READ_SEARCH" case CAP_FOWNER: return "CAP_FOWNER" case CAP_FSETID: return "CAP_FSETID" case CAP_KILL: return "CAP_KILL" case CAP_SETGID: return "CAP_SETGID" case CAP_SETUID: return "CAP_SETUID" case CAP_SETPCAP: return "CAP_SETPCAP" case CAP_LINUX_IMMUTABLE: return "CAP_LINUX_IMMUTABLE" case CAP_NET_BIND_SERVICE: return "CAP_NET_BIND_SERVICE" case CAP_NET_BROADCAST: return "CAP_NET_BROADCAST" case CAP_NET_ADMIN: return "CAP_NET_ADMIN" case CAP_NET_RAW: return "CAP_NET_RAW" case CAP_IPC_LOCK: return "CAP_IPC_LOCK" case CAP_IPC_OWNER: return "CAP_IPC_OWNER" case CAP_SYS_MODULE: return "CAP_SYS_MODULE" case CAP_SYS_RAWIO: return "CAP_SYS_RAWIO" case CAP_SYS_CHROOT: return "CAP_SYS_CHROOT" case CAP_SYS_PTRACE: return "CAP_SYS_PTRACE" case CAP_SYS_PACCT: return "CAP_SYS_PACCT" case CAP_SYS_ADMIN: return "CAP_SYS_ADMIN" case CAP_SYS_BOOT: return "CAP_SYS_BOOT" case CAP_SYS_NICE: return "CAP_SYS_NICE" case CAP_SYS_RESOURCE: return "CAP_SYS_RESOURCE" case CAP_SYS_TIME: return "CAP_SYS_TIME" case CAP_SYS_TTY_CONFIG: return "CAP_SYS_TTY_CONFIG" case CAP_MKNOD: return "CAP_MKNOD" case CAP_LEASE: return "CAP_LEASE" case CAP_AUDIT_WRITE: return "CAP_AUDIT_WRITE" case CAP_AUDIT_CONTROL: return "CAP_AUDIT_CONTROL" case CAP_SETFCAP: return "CAP_SETFCAP" case CAP_MAC_OVERRIDE: return "CAP_MAC_OVERRIDE" case CAP_MAC_ADMIN: return "CAP_MAC_ADMIN" case CAP_SYSLOG: return "CAP_SYSLOG" case CAP_WAKE_ALARM: return "CAP_WAKE_ALARM" case CAP_BLOCK_SUSPEND: return "CAP_BLOCK_SUSPEND" case CAP_AUDIT_READ: return "CAP_AUDIT_READ" default: return "UNKNOWN" } } // TrimmedString returns the capability name without the "CAP_" prefix. func (cp Capability) TrimmedString() string { const capPrefix = "CAP_" s := cp.String() if !strings.HasPrefix(s, capPrefix) { return s } // This could use strings.TrimPrefix, but that function doesn't guarantee // that it won't allocate a new string, whereas string slicing does. // In the case of this function, since Capability.String returns a constant // string, the underlying set of bytes backing that string will never be // garbage-collected. Therefore, we always want to use a string slice that // points to this same constant set of bytes, rather than risking // allocating a new string. return s[len(capPrefix):] } // CapabilityFromString converts a string to a capability. // If the capability doesn't exist, its second return value is `false`. // The capability name is expected to include the "CAP_" prefix. func CapabilityFromString(capability string) (Capability, bool) { for cp := Capability(0); cp <= CAP_LAST_CAP; cp++ { if !cp.Ok() { continue } if cp.String() == capability { return cp, true } } return -1, false } // AllCapabilities returns a list of all defined capabilities. func AllCapabilities() []Capability { allCapapabilities := make([]Capability, 0, CAP_LAST_CAP+1) for cp := Capability(0); cp <= CAP_LAST_CAP; cp++ { if !cp.Ok() { continue } allCapapabilities = append(allCapapabilities, cp) } return allCapapabilities } // Version numbers used by the capget/capset syscalls, defined in Linux's // include/uapi/linux/capability.h. const ( // LINUX_CAPABILITY_VERSION_1 causes the data pointer to be // interpreted as a pointer to a single cap_user_data_t. 
Since capability // sets are 64 bits and the "capability sets" in cap_user_data_t are 32 // bits only, this causes the upper 32 bits to be implicitly 0. LINUX_CAPABILITY_VERSION_1 = 0x19980330 // LINUX_CAPABILITY_VERSION_2 and LINUX_CAPABILITY_VERSION_3 cause the // data pointer to be interpreted as a pointer to an array of 2 // cap_user_data_t, using the second to store the 32 MSB of each capability // set. Versions 2 and 3 are identical, but Linux printk's a warning on use // of version 2 due to a userspace API defect. LINUX_CAPABILITY_VERSION_2 = 0x20071026 LINUX_CAPABILITY_VERSION_3 = 0x20080522 // HighestCapabilityVersion is the highest supported // LINUX_CAPABILITY_VERSION_* version. HighestCapabilityVersion = LINUX_CAPABILITY_VERSION_3 ) // Constants that are used by file capability extended attributes, defined // in Linux's include/uapi/linux/capability.h. const ( // The flag decides the value of effective file capabilit VFS_CAP_FLAGS_EFFECTIVE = 0x000001 // VFS_CAP_REVISION_1 was the original file capability implementation, // which supported 32-bit masks for file capabilities. VFS_CAP_REVISION_1 = 0x01000000 // VFS_CAP_REVISION_2 allows for file capability masks that are 64 // bits in size, and was necessary as the number of supported // capabilities grew beyond 32. VFS_CAP_REVISION_2 = 0x02000000 // VFS_CAP_REVISION_3 are provided to support namespaced file capabilities. // As with version 2 file capabilities, version 3 capability // masks are 64 bits in size. But in addition, the root user // ID of namespace is encoded in the security.capability // extended attribute. VFS_CAP_REVISION_3 = 0x03000000 VFS_CAP_REVISION_MASK = 0xFF000000 // The encoded VFS_CAP_REVISION_1 data's number of bytes. XATTR_CAPS_SZ_1 = 12 // The encoded VFS_CAP_REVISION_2 data's number of bytes. XATTR_CAPS_SZ_2 = 20 // The encoded VFS_CAP_REVISION_3 data's number of bytes. XATTR_CAPS_SZ_3 = 24 ) // CapUserHeader is equivalent to Linux's cap_user_header_t. // // +marshal type CapUserHeader struct { Version uint32 Pid int32 } // CapUserData is equivalent to Linux's cap_user_data_t. // // +marshal slice:CapUserDataSlice type CapUserData struct { Effective uint32 Permitted uint32 Inheritable uint32 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/clone.go000066400000000000000000000037201465435605700227470ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Clone constants per clone(2). 
const ( CSIGNAL = 0xff CLONE_VM = 0x100 CLONE_FS = 0x200 CLONE_FILES = 0x400 CLONE_SIGHAND = 0x800 CLONE_PIDFD = 0x1000 CLONE_PTRACE = 0x2000 CLONE_VFORK = 0x4000 CLONE_PARENT = 0x8000 CLONE_THREAD = 0x10000 CLONE_NEWNS = 0x20000 CLONE_SYSVSEM = 0x40000 CLONE_SETTLS = 0x80000 CLONE_PARENT_SETTID = 0x100000 CLONE_CHILD_CLEARTID = 0x200000 CLONE_DETACHED = 0x400000 CLONE_UNTRACED = 0x800000 CLONE_CHILD_SETTID = 0x1000000 CLONE_NEWCGROUP = 0x2000000 CLONE_NEWUTS = 0x4000000 CLONE_NEWIPC = 0x8000000 CLONE_NEWUSER = 0x10000000 CLONE_NEWPID = 0x20000000 CLONE_NEWNET = 0x40000000 CLONE_IO = 0x80000000 // Only passable via clone3(2). CLONE_CLEAR_SIGHAND = 0x100000000 CLONE_INTO_CGROUP = 0x200000000 // Sizeof first published struct. CLONE_ARGS_SIZE_VER0 = 64 // Sizeof third published struct. CLONE_ARGS_SIZE_VER2 = 88 ) // CloneArgs is struct clone_args, from include/uapi/linux/sched.h. // // +marshal type CloneArgs struct { Flags uint64 Pidfd uint64 ChildTID uint64 ParentTID uint64 ExitSignal uint64 Stack uint64 StackSize uint64 TLS uint64 SetTID uint64 SetTIDSize uint64 Cgroup uint64 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/context.go000066400000000000000000000022241465435605700233310ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/context" ) // contextID is the linux package's type for context.Context.Value keys. type contextID int const ( // CtxSignalNoInfoFunc is a Context.Value key for a function to send signals. CtxSignalNoInfoFunc contextID = iota ) // SignalNoInfoFuncFromContext returns a callback function that can be used to send a // signal to the given context. func SignalNoInfoFuncFromContext(ctx context.Context) func(Signal) error { if f := ctx.Value(CtxSignalNoInfoFunc); f != nil { return f.(func(Signal) error) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/dev.go000066400000000000000000000046601465435605700224310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // MakeDeviceID encodes a major and minor device number into a single device ID. 
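//
// As a non-normative usage sketch only (the encoding layout is spelled out
// below; TTYAUX_MAJOR and PTMX_MINOR are constants defined later in this
// file, chosen purely for illustration):
//
//	rdev := MakeDeviceID(TTYAUX_MAJOR, PTMX_MINOR) // /dev/ptmx is 5:2 -> 0x502
//	major, minor := DecodeDeviceID(rdev)           // round-trips to (5, 2)
//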
// // Format (see linux/kdev_t.h:new_encode_dev): // // Bits 7:0 - minor bits 7:0 // Bits 19:8 - major bits 11:0 // Bits 31:20 - minor bits 19:8 func MakeDeviceID(major uint16, minor uint32) uint32 { return (minor & 0xff) | ((uint32(major) & 0xfff) << 8) | ((minor >> 8) << 20) } // DecodeDeviceID decodes a device ID into major and minor device numbers. func DecodeDeviceID(rdev uint32) (uint16, uint32) { major := uint16((rdev >> 8) & 0xfff) minor := (rdev & 0xff) | ((rdev >> 20) << 8) return major, minor } // Character device IDs. // // See Documentations/devices.txt and uapi/linux/major.h. const ( // UNNAMED_MAJOR is the major device number for "unnamed" devices, whose // minor numbers are dynamically allocated by the kernel. UNNAMED_MAJOR = 0 // MEM_MAJOR is the major device number for "memory" character devices. MEM_MAJOR = 1 // TTYAUX_MAJOR is the major device number for alternate TTY devices. TTYAUX_MAJOR = 5 // MISC_MAJOR is the major device number for non-serial mice, misc feature // devices. MISC_MAJOR = 10 // UNIX98_PTY_MASTER_MAJOR is the initial major device number for // Unix98 PTY masters. UNIX98_PTY_MASTER_MAJOR = 128 // UNIX98_PTY_REPLICA_MAJOR is the initial major device number for // Unix98 PTY replicas. UNIX98_PTY_REPLICA_MAJOR = 136 ) // Minor device numbers for TTYAUX_MAJOR. const ( // PTMX_MINOR is the minor device number for /dev/ptmx. PTMX_MINOR = 2 ) // from Linux include/drm/drm_accel.h const ( // ACCEL_MAJOR is the major device number for compute accelerator devices. ACCEL_MAJOR = 121 ) // Major device numbers for VFIO-based TPU. const ( // Major devices number between 243 and 254 are usually reserved for local use. // The device number 245 is used by VFIO based TPU in GCP. VFIO_MAJOR = 245 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/elf.go000066400000000000000000000103001465435605700224050ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Linux auxiliary vector entry types. const ( // AT_NULL is the end of the auxiliary vector. AT_NULL = 0 // AT_IGNORE should be ignored. AT_IGNORE = 1 // AT_EXECFD is the file descriptor of the program. AT_EXECFD = 2 // AT_PHDR points to the program headers. AT_PHDR = 3 // AT_PHENT is the size of a program header entry. AT_PHENT = 4 // AT_PHNUM is the number of program headers. AT_PHNUM = 5 // AT_PAGESZ is the system page size. AT_PAGESZ = 6 // AT_BASE is the base address of the interpreter. AT_BASE = 7 // AT_FLAGS are flags. AT_FLAGS = 8 // AT_ENTRY is the program entry point. AT_ENTRY = 9 // AT_NOTELF indicates that the program is not an ELF binary. AT_NOTELF = 10 // AT_UID is the real UID. AT_UID = 11 // AT_EUID is the effective UID. AT_EUID = 12 // AT_GID is the real GID. AT_GID = 13 // AT_EGID is the effective GID. AT_EGID = 14 // AT_PLATFORM is a string identifying the CPU. AT_PLATFORM = 15 // AT_HWCAP are arch-dependent CPU capabilities. AT_HWCAP = 16 // AT_CLKTCK is the frequency used by times(2). 
AT_CLKTCK = 17 // AT_SECURE indicate secure mode. AT_SECURE = 23 // AT_BASE_PLATFORM is a string identifying the "real" platform. It may // differ from AT_PLATFORM. AT_BASE_PLATFORM = 24 // AT_RANDOM points to 16-bytes of random data. AT_RANDOM = 25 // AT_HWCAP2 is an extension of AT_HWCAP. AT_HWCAP2 = 26 // AT_EXECFN is the path used to execute the program. AT_EXECFN = 31 // AT_SYSINFO_EHDR is the address of the VDSO. AT_SYSINFO_EHDR = 33 ) // ELF ET_CORE and ptrace GETREGSET/SETREGSET register set types. // // See include/uapi/linux/elf.h. const ( // NT_PRSTATUS is for general purpose register. NT_PRSTATUS = 0x1 // NT_PRFPREG is for float point register. NT_PRFPREG = 0x2 // NT_X86_XSTATE is for x86 extended state using xsave. NT_X86_XSTATE = 0x202 // NT_ARM_TLS is for ARM TLS register. NT_ARM_TLS = 0x401 ) // ElfHeader64 is the ELF64 file header. // // +marshal type ElfHeader64 struct { Ident [16]byte // File identification. Type uint16 // File type. Machine uint16 // Machine architecture. Version uint32 // ELF format version. Entry uint64 // Entry point. Phoff uint64 // Program header file offset. Shoff uint64 // Section header file offset. Flags uint32 // Architecture-specific flags. Ehsize uint16 // Size of ELF header in bytes. Phentsize uint16 // Size of program header entry. Phnum uint16 // Number of program header entries. Shentsize uint16 // Size of section header entry. Shnum uint16 // Number of section header entries. Shstrndx uint16 // Section name strings section. } // ElfSection64 is the ELF64 Section header. // // +marshal type ElfSection64 struct { Name uint32 // Section name (index into the section header string table). Type uint32 // Section type. Flags uint64 // Section flags. Addr uint64 // Address in memory image. Off uint64 // Offset in file. Size uint64 // Size in bytes. Link uint32 // Index of a related section. Info uint32 // Depends on section type. Addralign uint64 // Alignment in bytes. Entsize uint64 // Size of each entry in section. } // ElfProg64 is the ELF64 Program header. // // +marshal type ElfProg64 struct { Type uint32 // Entry type. Flags uint32 // Access permission flags. Off uint64 // File offset of contents. Vaddr uint64 // Virtual address in memory image. Paddr uint64 // Physical address (not used). Filesz uint64 // Size of contents in file. Memsz uint64 // Size of contents in memory. Align uint64 // Alignment in memory and file. } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/epoll.go000066400000000000000000000027541465435605700227700ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Event masks. const ( EPOLLIN = 0x1 EPOLLPRI = 0x2 EPOLLOUT = 0x4 EPOLLERR = 0x8 EPOLLHUP = 0x10 EPOLLRDNORM = 0x40 EPOLLRDBAND = 0x80 EPOLLWRNORM = 0x100 EPOLLWRBAND = 0x200 EPOLLMSG = 0x400 EPOLLRDHUP = 0x2000 ) // Per-file descriptor flags. 
const ( EPOLLEXCLUSIVE = 1 << 28 EPOLLWAKEUP = 1 << 29 EPOLLONESHOT = 1 << 30 EPOLLET = 1 << 31 // EP_PRIVATE_BITS is fs/eventpoll.c:EP_PRIVATE_BITS, the set of all bits // in an epoll event mask that correspond to flags rather than I/O events. EP_PRIVATE_BITS = EPOLLEXCLUSIVE | EPOLLWAKEUP | EPOLLONESHOT | EPOLLET ) // Operation flags. const ( EPOLL_CLOEXEC = 0x80000 EPOLL_NONBLOCK = 0x800 ) // Control operations. const ( EPOLL_CTL_ADD = 0x1 EPOLL_CTL_DEL = 0x2 EPOLL_CTL_MOD = 0x3 ) // SizeOfEpollEvent is the size of EpollEvent struct. var SizeOfEpollEvent = (*EpollEvent)(nil).SizeBytes() golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/epoll_amd64.go000066400000000000000000000020061465435605700237510ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package linux // EpollEvent is equivalent to struct epoll_event from epoll(2). // // +marshal slice:EpollEventSlice type EpollEvent struct { Events uint32 // Linux makes struct epoll_event::data a __u64. We represent it as // [2]int32 because, on amd64, Linux also makes struct epoll_event // __attribute__((packed)), such that there is no padding between Events // and Data. Data [2]int32 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/epoll_arm64.go000066400000000000000000000016101465435605700237670ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package linux // EpollEvent is equivalent to struct epoll_event from epoll(2). // // +marshal slice:EpollEventSlice type EpollEvent struct { Events uint32 // Linux makes struct epoll_event a __u64, necessitating 4 bytes of padding // here. _ int32 Data [2]int32 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/errno/000077500000000000000000000000001465435605700224435ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/errno/errno.go000066400000000000000000000062731465435605700241270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package errno holds errno codes for abi/linux. package errno // Errno represents a Linux errno value. type Errno uint32 // Errno values from include/uapi/asm-generic/errno-base.h. const ( NOERRNO = iota EPERM ENOENT ESRCH EINTR EIO ENXIO E2BIG ENOEXEC EBADF ECHILD // 10 EAGAIN ENOMEM EACCES EFAULT ENOTBLK EBUSY EEXIST EXDEV ENODEV ENOTDIR // 20 EISDIR EINVAL ENFILE EMFILE ENOTTY ETXTBSY EFBIG ENOSPC ESPIPE EROFS // 30 EMLINK EPIPE EDOM ERANGE // Errno values from include/uapi/asm-generic/errno.h. EDEADLK ENAMETOOLONG ENOLCK ENOSYS ENOTEMPTY ELOOP // 40 _ // Skip for EWOULDBLOCK = EAGAIN. ENOMSG //42 EIDRM ECHRNG EL2NSYNC EL3HLT EL3RST ELNRNG EUNATCH ENOCSI EL2HLT // 50 EBADE EBADR EXFULL ENOANO EBADRQC EBADSLT _ // Skip for EDEADLOCK = EDEADLK. EBFONT ENOSTR // 60 ENODATA ETIME ENOSR ENONET ENOPKG EREMOTE ENOLINK EADV ESRMNT ECOMM // 70 EPROTO EMULTIHOP EDOTDOT EBADMSG EOVERFLOW ENOTUNIQ EBADFD EREMCHG ELIBACC ELIBBAD // 80 ELIBSCN ELIBMAX ELIBEXEC EILSEQ ERESTART ESTRPIPE EUSERS ENOTSOCK EDESTADDRREQ EMSGSIZE // 90 EPROTOTYPE ENOPROTOOPT EPROTONOSUPPORT ESOCKTNOSUPPORT EOPNOTSUPP EPFNOSUPPORT EAFNOSUPPORT EADDRINUSE EADDRNOTAVAIL ENETDOWN // 100 ENETUNREACH ENETRESET ECONNABORTED ECONNRESET ENOBUFS EISCONN ENOTCONN ESHUTDOWN ETOOMANYREFS ETIMEDOUT // 110 ECONNREFUSED EHOSTDOWN EHOSTUNREACH EALREADY EINPROGRESS ESTALE EUCLEAN ENOTNAM ENAVAIL EISNAM // 120 EREMOTEIO EDQUOT ENOMEDIUM EMEDIUMTYPE ECANCELED ENOKEY EKEYEXPIRED EKEYREVOKED EKEYREJECTED EOWNERDEAD // 130 ENOTRECOVERABLE ERFKILL EHWPOISON ) // errnos derived from other errnos. const ( EWOULDBLOCK = EAGAIN EDEADLOCK = EDEADLK ) // errnos for internal errors. const ( // ERESTARTSYS is returned by an interrupted syscall to indicate that it // should be converted to EINTR if interrupted by a signal delivered to a // user handler without SA_RESTART set, and restarted otherwise. ERESTARTSYS = 512 // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it // should always be restarted. ERESTARTNOINTR = 513 // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it // should be converted to EINTR if interrupted by a signal delivered to a // user handler, and restarted otherwise. ERESTARTNOHAND = 514 // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate // that it should be restarted using a custom function. The interrupted // syscall must register a custom restart function by calling // Task.SetRestartSyscallFn. ERESTART_RESTARTBLOCK = 516 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/errno/errno_state_autogen.go000066400000000000000000000000671465435605700270440ustar00rootroot00000000000000// automatically generated by stateify. package errno golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/errqueue.go000066400000000000000000000044601465435605700235060ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/marshal" ) // Socket error origin codes as defined in include/uapi/linux/errqueue.h. const ( SO_EE_ORIGIN_NONE = 0 SO_EE_ORIGIN_LOCAL = 1 SO_EE_ORIGIN_ICMP = 2 SO_EE_ORIGIN_ICMP6 = 3 ) // SockExtendedErr represents struct sock_extended_err in Linux defined in // include/uapi/linux/errqueue.h. // // +marshal type SockExtendedErr struct { Errno uint32 Origin uint8 Type uint8 Code uint8 Pad uint8 Info uint32 Data uint32 } // SockErrCMsg represents the IP*_RECVERR control message. type SockErrCMsg interface { marshal.Marshallable CMsgLevel() uint32 CMsgType() uint32 } // SockErrCMsgIPv4 is the IP_RECVERR control message used in // recvmsg(MSG_ERRQUEUE) by ipv4 sockets. This is equilavent to `struct errhdr` // defined in net/ipv4/ip_sockglue.c:ip_recv_error(). // // +marshal type SockErrCMsgIPv4 struct { SockExtendedErr Offender SockAddrInet } var _ SockErrCMsg = (*SockErrCMsgIPv4)(nil) // CMsgLevel implements SockErrCMsg.CMsgLevel. func (*SockErrCMsgIPv4) CMsgLevel() uint32 { return SOL_IP } // CMsgType implements SockErrCMsg.CMsgType. func (*SockErrCMsgIPv4) CMsgType() uint32 { return IP_RECVERR } // SockErrCMsgIPv6 is the IPV6_RECVERR control message used in // recvmsg(MSG_ERRQUEUE) by ipv6 sockets. This is equilavent to `struct errhdr` // defined in net/ipv6/datagram.c:ipv6_recv_error(). // // +marshal type SockErrCMsgIPv6 struct { SockExtendedErr Offender SockAddrInet6 } var _ SockErrCMsg = (*SockErrCMsgIPv6)(nil) // CMsgLevel implements SockErrCMsg.CMsgLevel. func (*SockErrCMsgIPv6) CMsgLevel() uint32 { return SOL_IPV6 } // CMsgType implements SockErrCMsg.CMsgType. func (*SockErrCMsgIPv6) CMsgType() uint32 { return IPV6_RECVERR } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/eventfd.go000066400000000000000000000013301465435605700232750ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Constants for eventfd2(2). const ( EFD_SEMAPHORE = 0x1 EFD_CLOEXEC = O_CLOEXEC EFD_NONBLOCK = O_NONBLOCK ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/exec.go000066400000000000000000000012571465435605700225760ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // TASK_COMM_LEN is the task command name length. const TASK_COMM_LEN = 16 golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/fadvise.go000066400000000000000000000014461465435605700232730ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Fadvise constants. const ( POSIX_FADV_NORMAL = 0 POSIX_FADV_RANDOM = 1 POSIX_FADV_SEQUENTIAL = 2 POSIX_FADV_WILLNEED = 3 POSIX_FADV_DONTNEED = 4 POSIX_FADV_NOREUSE = 5 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/fcntl.go000066400000000000000000000031751465435605700227610ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Commands from linux/fcntl.h. const ( F_DUPFD = 0 F_GETFD = 1 F_SETFD = 2 F_GETFL = 3 F_SETFL = 4 F_GETLK = 5 F_SETLK = 6 F_SETLKW = 7 F_SETOWN = 8 F_GETOWN = 9 F_SETSIG = 10 F_GETSIG = 11 F_SETOWN_EX = 15 F_GETOWN_EX = 16 F_OFD_GETLK = 36 F_OFD_SETLK = 37 F_OFD_SETLKW = 38 F_DUPFD_CLOEXEC = 1024 + 6 F_SETPIPE_SZ = 1024 + 7 F_GETPIPE_SZ = 1024 + 8 ) // Commands for F_SETLK. const ( F_RDLCK = 0 F_WRLCK = 1 F_UNLCK = 2 ) // Flags for fcntl. const ( FD_CLOEXEC = 00000001 ) // Flock is the lock structure for F_SETLK. // // +marshal type Flock struct { Type int16 Whence int16 _ [4]byte Start int64 Len int64 PID int32 _ [4]byte } // Owner types for F_SETOWN_EX and F_GETOWN_EX. const ( F_OWNER_TID = 0 F_OWNER_PID = 1 F_OWNER_PGRP = 2 ) // FOwnerEx is the owner structure for F_SETOWN_EX and F_GETOWN_EX. // // +marshal type FOwnerEx struct { Type int32 PID int32 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/file.go000066400000000000000000000243751465435605700225770ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" "strings" "gvisor.dev/gvisor/pkg/abi" ) // Constants for open(2). const ( O_ACCMODE = 000000003 O_RDONLY = 000000000 O_WRONLY = 000000001 O_RDWR = 000000002 O_CREAT = 000000100 O_EXCL = 000000200 O_NOCTTY = 000000400 O_TRUNC = 000001000 O_APPEND = 000002000 O_NONBLOCK = 000004000 O_DSYNC = 000010000 O_ASYNC = 000020000 O_NOATIME = 001000000 O_CLOEXEC = 002000000 O_SYNC = 004000000 // __O_SYNC in Linux O_PATH = 010000000 O_TMPFILE = 020000000 // __O_TMPFILE in Linux ) // Constants for fstatat(2). const ( AT_SYMLINK_NOFOLLOW = 0x100 ) // Constants for mount(2). const ( MS_RDONLY = 0x1 MS_NOSUID = 0x2 MS_NODEV = 0x4 MS_NOEXEC = 0x8 MS_SYNCHRONOUS = 0x10 MS_REMOUNT = 0x20 MS_MANDLOCK = 0x40 MS_DIRSYNC = 0x80 MS_NOATIME = 0x400 MS_NODIRATIME = 0x800 MS_BIND = 0x1000 MS_MOVE = 0x2000 MS_REC = 0x4000 MS_POSIXACL = 0x10000 MS_UNBINDABLE = 0x20000 MS_PRIVATE = 0x40000 MS_SLAVE = 0x80000 MS_SHARED = 0x100000 MS_RELATIME = 0x200000 MS_KERNMOUNT = 0x400000 MS_I_VERSION = 0x800000 MS_STRICTATIME = 0x1000000 MS_MGC_VAL = 0xC0ED0000 MS_MGC_MSK = 0xffff0000 ) // Constants for umount2(2). const ( MNT_FORCE = 0x1 MNT_DETACH = 0x2 MNT_EXPIRE = 0x4 UMOUNT_NOFOLLOW = 0x8 ) // Constants for unlinkat(2). const ( AT_REMOVEDIR = 0x200 ) // Constants for linkat(2) and fchownat(2). const ( AT_SYMLINK_FOLLOW = 0x400 AT_EMPTY_PATH = 0x1000 ) // Constants for faccessat2(2). const ( AT_EACCESS = 0x200 ) // Constants for all file-related ...at(2) syscalls. const ( AT_FDCWD = -100 ) // Special values for the ns field in utimensat(2). const ( UTIME_NOW = ((1 << 30) - 1) UTIME_OMIT = ((1 << 30) - 2) ) // MaxSymlinkTraversals is the maximum number of links that will be followed by // the kernel to resolve a symlink. const MaxSymlinkTraversals = 40 // Constants for flock(2). const ( LOCK_SH = 1 // shared lock LOCK_EX = 2 // exclusive lock LOCK_NB = 4 // or'd with one of the above to prevent blocking LOCK_UN = 8 // remove lock ) // Values for mode_t. const ( S_IFMT = 0170000 S_IFSOCK = 0140000 S_IFLNK = 0120000 S_IFREG = 0100000 S_IFBLK = 060000 S_IFDIR = 040000 S_IFCHR = 020000 S_IFIFO = 010000 FileTypeMask = S_IFMT ModeSocket = S_IFSOCK ModeSymlink = S_IFLNK ModeRegular = S_IFREG ModeBlockDevice = S_IFBLK ModeDirectory = S_IFDIR ModeCharacterDevice = S_IFCHR ModeNamedPipe = S_IFIFO S_ISUID = 04000 S_ISGID = 02000 S_ISVTX = 01000 ModeSetUID = S_ISUID ModeSetGID = S_ISGID ModeSticky = S_ISVTX ModeUserAll = 0700 ModeUserRead = 0400 ModeUserWrite = 0200 ModeUserExec = 0100 ModeGroupAll = 0070 ModeGroupRead = 0040 ModeGroupWrite = 0020 ModeGroupExec = 0010 ModeOtherAll = 0007 ModeOtherRead = 0004 ModeOtherWrite = 0002 ModeOtherExec = 0001 PermissionsMask = 0777 ) // Values for linux_dirent64.d_type. const ( DT_UNKNOWN = 0 DT_FIFO = 1 DT_CHR = 2 DT_DIR = 4 DT_BLK = 6 DT_REG = 8 DT_LNK = 10 DT_SOCK = 12 DT_WHT = 14 ) // DirentType are the friendly strings for linux_dirent64.d_type. 
var DirentType = abi.ValueSet{ DT_UNKNOWN: "DT_UNKNOWN", DT_FIFO: "DT_FIFO", DT_CHR: "DT_CHR", DT_DIR: "DT_DIR", DT_BLK: "DT_BLK", DT_REG: "DT_REG", DT_LNK: "DT_LNK", DT_SOCK: "DT_SOCK", DT_WHT: "DT_WHT", } // Values for fs on-disk file types. const ( FT_UNKNOWN = 0 FT_REG_FILE = 1 FT_DIR = 2 FT_CHRDEV = 3 FT_BLKDEV = 4 FT_FIFO = 5 FT_SOCK = 6 FT_SYMLINK = 7 FT_MAX = 8 ) // Conversion from fs on-disk file type to dirent type. var direntTypeByFileType = [FT_MAX]uint8{ FT_UNKNOWN: DT_UNKNOWN, FT_REG_FILE: DT_REG, FT_DIR: DT_DIR, FT_CHRDEV: DT_CHR, FT_BLKDEV: DT_BLK, FT_FIFO: DT_FIFO, FT_SOCK: DT_SOCK, FT_SYMLINK: DT_LNK, } // FileTypeToDirentType converts the on-disk file type (FT_*) to the directory // entry type (DT_*). func FileTypeToDirentType(filetype uint8) uint8 { if filetype >= FT_MAX { return DT_UNKNOWN } return direntTypeByFileType[filetype] } // Values for preadv2/pwritev2. const ( // NOTE(b/120162627): gVisor does not implement the RWF_HIPRI feature, but // the flag is accepted as a valid flag argument for preadv2/pwritev2 and // silently ignored. RWF_HIPRI = 0x00000001 RWF_DSYNC = 0x00000002 RWF_SYNC = 0x00000004 RWF_VALID = RWF_HIPRI | RWF_DSYNC | RWF_SYNC ) // SizeOfStat is the size of a Stat struct. var SizeOfStat = (*Stat)(nil).SizeBytes() // Flags for statx. const ( AT_NO_AUTOMOUNT = 0x800 AT_STATX_SYNC_TYPE = 0x6000 AT_STATX_SYNC_AS_STAT = 0x0000 AT_STATX_FORCE_SYNC = 0x2000 AT_STATX_DONT_SYNC = 0x4000 ) // Mask values for statx. const ( STATX_TYPE = 0x00000001 STATX_MODE = 0x00000002 STATX_NLINK = 0x00000004 STATX_UID = 0x00000008 STATX_GID = 0x00000010 STATX_ATIME = 0x00000020 STATX_MTIME = 0x00000040 STATX_CTIME = 0x00000080 STATX_INO = 0x00000100 STATX_SIZE = 0x00000200 STATX_BLOCKS = 0x00000400 STATX_BASIC_STATS = 0x000007ff STATX_BTIME = 0x00000800 STATX_ALL = 0x00000fff STATX__RESERVED = 0x80000000 ) // Bitmasks for Statx.Attributes and Statx.AttributesMask, from // include/uapi/linux/stat.h. const ( STATX_ATTR_COMPRESSED = 0x00000004 STATX_ATTR_IMMUTABLE = 0x00000010 STATX_ATTR_APPEND = 0x00000020 STATX_ATTR_NODUMP = 0x00000040 STATX_ATTR_ENCRYPTED = 0x00000800 STATX_ATTR_AUTOMOUNT = 0x00001000 ) // Statx represents struct statx. // // +marshal boundCheck slice:StatxSlice type Statx struct { Mask uint32 Blksize uint32 Attributes uint64 Nlink uint32 UID uint32 GID uint32 Mode uint16 _ uint16 Ino uint64 Size uint64 Blocks uint64 AttributesMask uint64 Atime StatxTimestamp Btime StatxTimestamp Ctime StatxTimestamp Mtime StatxTimestamp RdevMajor uint32 RdevMinor uint32 DevMajor uint32 DevMinor uint32 } // String implements fmt.Stringer.String. func (s *Statx) String() string { return fmt.Sprintf("Statx{Mask: %#x, Mode: %s, UID: %d, GID: %d, Ino: %d, DevMajor: %d, DevMinor: %d, Size: %d, Blocks: %d, Blksize: %d, Nlink: %d, Atime: %s, Btime: %s, Ctime: %s, Mtime: %s, Attributes: %d, AttributesMask: %d, RdevMajor: %d, RdevMinor: %d}", s.Mask, FileMode(s.Mode), s.UID, s.GID, s.Ino, s.DevMajor, s.DevMinor, s.Size, s.Blocks, s.Blksize, s.Nlink, s.Atime.ToTime(), s.Btime.ToTime(), s.Ctime.ToTime(), s.Mtime.ToTime(), s.Attributes, s.AttributesMask, s.RdevMajor, s.RdevMinor) } // SizeOfStatx is the size of a Statx struct. var SizeOfStatx = (*Statx)(nil).SizeBytes() // FileMode represents a mode_t. // // +marshal type FileMode uint16 // Permissions returns just the permission bits. func (m FileMode) Permissions() FileMode { return m & PermissionsMask } // FileType returns just the file type bits. 
func (m FileMode) FileType() FileMode { return m & FileTypeMask } // ExtraBits returns everything but the file type and permission bits. func (m FileMode) ExtraBits() FileMode { return m &^ (PermissionsMask | FileTypeMask) } // IsDir returns true if file type represents a directory. func (m FileMode) IsDir() bool { return m.FileType() == S_IFDIR } // String returns a string representation of m. func (m FileMode) String() string { var s []string if ft := m.FileType(); ft != 0 { s = append(s, fileType.Parse(uint64(ft))) } if eb := m.ExtraBits(); eb != 0 { s = append(s, modeExtraBits.Parse(uint64(eb))) } s = append(s, fmt.Sprintf("0o%o", m.Permissions())) return strings.Join(s, "|") } // DirentType maps file types to dirent types appropriate for (struct // dirent)::d_type. func (m FileMode) DirentType() uint8 { switch m.FileType() { case ModeSocket: return DT_SOCK case ModeSymlink: return DT_LNK case ModeRegular: return DT_REG case ModeBlockDevice: return DT_BLK case ModeDirectory: return DT_DIR case ModeCharacterDevice: return DT_CHR case ModeNamedPipe: return DT_FIFO default: return DT_UNKNOWN } } var modeExtraBits = abi.FlagSet{ { Flag: ModeSetUID, Name: "S_ISUID", }, { Flag: ModeSetGID, Name: "S_ISGID", }, { Flag: ModeSticky, Name: "S_ISVTX", }, } var fileType = abi.ValueSet{ ModeSocket: "S_IFSOCK", ModeSymlink: "S_IFLINK", ModeRegular: "S_IFREG", ModeBlockDevice: "S_IFBLK", ModeDirectory: "S_IFDIR", ModeCharacterDevice: "S_IFCHR", ModeNamedPipe: "S_IFIFO", } // Constants for memfd_create(2). Source: include/uapi/linux/memfd.h const ( MFD_CLOEXEC = 0x0001 MFD_ALLOW_SEALING = 0x0002 ) // Constants related to file seals. Source: include/uapi/{asm-generic,linux}/fcntl.h const ( F_LINUX_SPECIFIC_BASE = 1024 F_ADD_SEALS = F_LINUX_SPECIFIC_BASE + 9 F_GET_SEALS = F_LINUX_SPECIFIC_BASE + 10 F_SEAL_SEAL = 0x0001 // Prevent further seals from being set. F_SEAL_SHRINK = 0x0002 // Prevent file from shrinking. F_SEAL_GROW = 0x0004 // Prevent file from growing. F_SEAL_WRITE = 0x0008 // Prevent writes. ) // Constants related to fallocate(2). Source: include/uapi/linux/falloc.h const ( FALLOC_FL_KEEP_SIZE = 0x01 FALLOC_FL_PUNCH_HOLE = 0x02 FALLOC_FL_NO_HIDE_STALE = 0x04 FALLOC_FL_COLLAPSE_RANGE = 0x08 FALLOC_FL_ZERO_RANGE = 0x10 FALLOC_FL_INSERT_RANGE = 0x20 FALLOC_FL_UNSHARE_RANGE = 0x40 ) // Constants related to close_range(2). Source: /include/uapi/linux/close_range.h const ( CLOSE_RANGE_UNSHARE = uint32(1 << 1) CLOSE_RANGE_CLOEXEC = uint32(1 << 2) ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/file_amd64.go000066400000000000000000000021051465435605700235550ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package linux // Constants for open(2). const ( O_DIRECT = 000040000 O_LARGEFILE = 000100000 O_DIRECTORY = 000200000 O_NOFOLLOW = 000400000 ) // Stat represents struct stat. 
// // +marshal type Stat struct { Dev uint64 Ino uint64 Nlink uint64 Mode uint32 UID uint32 GID uint32 _ int32 Rdev uint64 Size int64 Blksize int64 Blocks int64 ATime Timespec MTime Timespec CTime Timespec _ [3]int64 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/file_arm64.go000066400000000000000000000021251465435605700235750ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package linux // Constants for open(2). const ( O_DIRECTORY = 000040000 O_NOFOLLOW = 000100000 O_DIRECT = 000200000 O_LARGEFILE = 000400000 ) // Stat represents struct stat. // // +marshal type Stat struct { Dev uint64 Ino uint64 Mode uint32 Nlink uint32 UID uint32 GID uint32 Rdev uint64 _ uint64 Size int64 Blksize int32 _ int32 Blocks int64 ATime Timespec MTime Timespec CTime Timespec _ [2]int32 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/fs.go000066400000000000000000000062711465435605700222630ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Filesystem types used in statfs(2). // // See linux/magic.h. const ( ANON_INODE_FS_MAGIC = 0x09041934 CGROUP_SUPER_MAGIC = 0x27e0eb DEVPTS_SUPER_MAGIC = 0x00001cd1 EXT_SUPER_MAGIC = 0xef53 FUSE_SUPER_MAGIC = 0x65735546 MQUEUE_MAGIC = 0x19800202 NSFS_MAGIC = 0x6e736673 OVERLAYFS_SUPER_MAGIC = 0x794c7630 PIPEFS_MAGIC = 0x50495045 PROC_SUPER_MAGIC = 0x9fa0 RAMFS_MAGIC = 0x09041934 SOCKFS_MAGIC = 0x534F434B SYSFS_MAGIC = 0x62656572 TMPFS_MAGIC = 0x01021994 V9FS_MAGIC = 0x01021997 ) // Filesystem path limits, from uapi/linux/limits.h. const ( NAME_MAX = 255 PATH_MAX = 4096 ) // The bit mask f_flags in struct statfs, from include/linux/statfs.h const ( ST_RDONLY = 0x0001 ST_NOSUID = 0x0002 ST_NODEV = 0x0004 ST_NOEXEC = 0x0008 ST_SYNCHRONOUS = 0x0010 ST_VALID = 0x0020 ST_MANDLOCK = 0x0040 ST_NOATIME = 0x0400 ST_NODIRATIME = 0x0800 ST_RELATIME = 0x1000 ST_NOSYMFOLLOW = 0x2000 ) // Statfs is struct statfs, from uapi/asm-generic/statfs.h. // // +marshal type Statfs struct { // Type is one of the filesystem magic values, defined above. Type uint64 // BlockSize is the optimal transfer block size in bytes. BlockSize int64 // Blocks is the maximum number of data blocks the filesystem may store, in // units of BlockSize. Blocks uint64 // BlocksFree is the number of free data blocks, in units of BlockSize. 
BlocksFree uint64 // BlocksAvailable is the number of data blocks free for use by // unprivileged users, in units of BlockSize. BlocksAvailable uint64 // Files is the number of used file nodes on the filesystem. Files uint64 // FileFress is the number of free file nodes on the filesystem. FilesFree uint64 // FSID is the filesystem ID. FSID [2]int32 // NameLength is the maximum file name length. NameLength uint64 // FragmentSize is equivalent to BlockSize. FragmentSize int64 // Flags is the set of filesystem mount flags. Flags uint64 // Spare is unused. Spare [4]uint64 } // Whence argument to lseek(2), from include/uapi/linux/fs.h. const ( SEEK_SET = 0 SEEK_CUR = 1 SEEK_END = 2 SEEK_DATA = 3 SEEK_HOLE = 4 ) // Sync_file_range flags, from include/uapi/linux/fs.h const ( SYNC_FILE_RANGE_WAIT_BEFORE = 1 SYNC_FILE_RANGE_WRITE = 2 SYNC_FILE_RANGE_WAIT_AFTER = 4 ) // Flag argument to renameat2(2), from include/uapi/linux/fs.h. const ( RENAME_NOREPLACE = (1 << 0) // Don't overwrite target. RENAME_EXCHANGE = (1 << 1) // Exchange src and dst. RENAME_WHITEOUT = (1 << 2) // Whiteout src. ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/fuse.go000066400000000000000000000720241465435605700226140ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "time" "gvisor.dev/gvisor/pkg/marshal/primitive" ) // FUSEOpcode is a FUSE operation code. // // +marshal type FUSEOpcode uint32 // FUSEOpID is a FUSE operation ID. // // +marshal type FUSEOpID uint64 // FUSE_ROOT_ID is the id of root inode. const FUSE_ROOT_ID = 1 // Opcodes for FUSE operations. // // Analogous to the opcodes in include/linux/fuse.h. const ( FUSE_LOOKUP FUSEOpcode = 1 FUSE_FORGET = 2 /* no reply */ FUSE_GETATTR = 3 FUSE_SETATTR = 4 FUSE_READLINK = 5 FUSE_SYMLINK = 6 _ FUSE_MKNOD = 8 FUSE_MKDIR = 9 FUSE_UNLINK = 10 FUSE_RMDIR = 11 FUSE_RENAME = 12 FUSE_LINK = 13 FUSE_OPEN = 14 FUSE_READ = 15 FUSE_WRITE = 16 FUSE_STATFS = 17 FUSE_RELEASE = 18 _ FUSE_FSYNC = 20 FUSE_SETXATTR = 21 FUSE_GETXATTR = 22 FUSE_LISTXATTR = 23 FUSE_REMOVEXATTR = 24 FUSE_FLUSH = 25 FUSE_INIT = 26 FUSE_OPENDIR = 27 FUSE_READDIR = 28 FUSE_RELEASEDIR = 29 FUSE_FSYNCDIR = 30 FUSE_GETLK = 31 FUSE_SETLK = 32 FUSE_SETLKW = 33 FUSE_ACCESS = 34 FUSE_CREATE = 35 FUSE_INTERRUPT = 36 FUSE_BMAP = 37 FUSE_DESTROY = 38 FUSE_IOCTL = 39 FUSE_POLL = 40 FUSE_NOTIFY_REPLY = 41 FUSE_BATCH_FORGET = 42 FUSE_FALLOCATE = 43 ) const ( // FUSE_MIN_READ_BUFFER is the minimum size the read can be for any FUSE filesystem. // This is the minimum size Linux supports. See linux.fuse.h. FUSE_MIN_READ_BUFFER uint32 = 8192 ) // FUSEHeaderIn is the header read by the daemon with each request. // // +marshal // +stateify savable type FUSEHeaderIn struct { // Len specifies the total length of the data, including this header. Len uint32 // Opcode specifies the kind of operation of the request. Opcode FUSEOpcode // Unique specifies the unique identifier for this request. 
Unique FUSEOpID // NodeID is the ID of the filesystem object being operated on. NodeID uint64 // UID is the UID of the requesting process. UID uint32 // GID is the GID of the requesting process. GID uint32 // PID is the PID of the requesting process. PID uint32 _ uint32 } // SizeOfFUSEHeaderIn is the size of the FUSEHeaderIn struct. var SizeOfFUSEHeaderIn = uint32((*FUSEHeaderIn)(nil).SizeBytes()) // FUSEHeaderOut is the header written by the daemon when it processes // a request and wants to send a reply (almost all operations require a // reply; if they do not, this will be explicitly documented). // // +marshal // +stateify savable type FUSEHeaderOut struct { // Len specifies the total length of the data, including this header. Len uint32 // Error specifies the error that occurred (0 if none). Error int32 // Unique specifies the unique identifier of the corresponding request. Unique FUSEOpID } // SizeOfFUSEHeaderOut is the size of the FUSEHeaderOut struct. var SizeOfFUSEHeaderOut = uint32((*FUSEHeaderOut)(nil).SizeBytes()) // FUSE_INIT flags, consistent with the ones in include/uapi/linux/fuse.h. // Our target version is 7.23 but we have few implemented in advance. const ( FUSE_ASYNC_READ = 1 << 0 FUSE_POSIX_LOCKS = 1 << 1 FUSE_FILE_OPS = 1 << 2 FUSE_ATOMIC_O_TRUNC = 1 << 3 FUSE_EXPORT_SUPPORT = 1 << 4 FUSE_BIG_WRITES = 1 << 5 FUSE_DONT_MASK = 1 << 6 FUSE_SPLICE_WRITE = 1 << 7 FUSE_SPLICE_MOVE = 1 << 8 FUSE_SPLICE_READ = 1 << 9 FUSE_FLOCK_LOCKS = 1 << 10 FUSE_HAS_IOCTL_DIR = 1 << 11 FUSE_AUTO_INVAL_DATA = 1 << 12 FUSE_DO_READDIRPLUS = 1 << 13 FUSE_READDIRPLUS_AUTO = 1 << 14 FUSE_ASYNC_DIO = 1 << 15 FUSE_WRITEBACK_CACHE = 1 << 16 FUSE_NO_OPEN_SUPPORT = 1 << 17 FUSE_MAX_PAGES = 1 << 22 // From FUSE 7.28 ) // currently supported FUSE protocol version numbers. const ( FUSE_KERNEL_VERSION = 7 FUSE_KERNEL_MINOR_VERSION = 31 ) // Constants relevant to FUSE operations. const ( FUSE_NAME_MAX = 1024 FUSE_PAGE_SIZE = 4096 FUSE_DIRENT_ALIGN = 8 ) // FUSEInitIn is the request sent by the kernel to the daemon, // to negotiate the version and flags. // // +marshal type FUSEInitIn struct { // Major version supported by kernel. Major uint32 // Minor version supported by the kernel. Minor uint32 // MaxReadahead is the maximum number of bytes to read-ahead // decided by the kernel. MaxReadahead uint32 // Flags of this init request. Flags uint32 } // FUSEInitOut is the reply sent by the daemon to the kernel // for FUSEInitIn. We target FUSE 7.23; this struct supports 7.28. // // +marshal type FUSEInitOut struct { // Major version supported by daemon. Major uint32 // Minor version supported by daemon. Minor uint32 // MaxReadahead is the maximum number of bytes to read-ahead. // Decided by the daemon, after receiving the value from kernel. MaxReadahead uint32 // Flags of this init reply. Flags uint32 // MaxBackground is the maximum number of pending background requests // that the daemon wants. MaxBackground uint16 // CongestionThreshold is the daemon-decided threshold for // the number of the pending background requests. CongestionThreshold uint16 // MaxWrite is the daemon's maximum size of a write buffer. // Kernel adjusts it to the minimum (fuse/init.go:fuseMinMaxWrite). // if the value from daemon is too small. MaxWrite uint32 // TimeGran is the daemon's time granularity for mtime and ctime metadata. // The unit is nanosecond. // Value should be power of 10. // 1 indicates full nanosecond granularity support. TimeGran uint32 // MaxPages is the daemon's maximum number of pages for one write operation. 
// Kernel adjusts it to the maximum (fuse/init.go:FUSE_MAX_MAX_PAGES). // if the value from daemon is too large. MaxPages uint16 _ uint16 _ [8]uint32 } // FUSEStatfsOut is the reply sent by the daemon to the kernel // for FUSE_STATFS. // from https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/fuse.h#L252 // // +marshal type FUSEStatfsOut struct { // Blocks is the maximum number of data blocks the filesystem may store, in // units of BlockSize. Blocks uint64 // BlocksFree is the number of free data blocks, in units of BlockSize. BlocksFree uint64 // BlocksAvailable is the number of data blocks free for use by // unprivileged users, in units of BlockSize. BlocksAvailable uint64 // Files is the number of used file nodes on the filesystem. Files uint64 // FilesFree is the number of free file nodes on the filesystem. FilesFree uint64 // BlockSize is the optimal transfer block size in bytes. BlockSize uint32 // NameLength is the maximum file name length. NameLength uint32 // FragmentSize is equivalent to BlockSize. FragmentSize uint32 _ uint32 Spare [6]uint32 } // FUSE_GETATTR_FH is currently the only flag of FUSEGetAttrIn.GetAttrFlags. // If it is set, the file handle (FUSEGetAttrIn.Fh) is used to indicate the // object instead of the node id attribute in the request header. const FUSE_GETATTR_FH = (1 << 0) // FUSEGetAttrIn is the request sent by the kernel to the daemon, // to get the attributes of an inode. // // +marshal type FUSEGetAttrIn struct { // GetAttrFlags specifies whether the getattr request is sent with a nodeid or // with a file handle. GetAttrFlags uint32 _ uint32 // Fh is the file handler when GetAttrFlags has FUSE_GETATTR_FH bit. If // used, the operation is analogous to fstat(2). Fh uint64 } // FUSEAttr is the struct used in the response FUSEGetAttrOut. // // +marshal type FUSEAttr struct { // Ino is the inode number of this file. Ino uint64 // Size is the size of this file. Size uint64 // Blocks is the number of the 512B blocks allocated by this file. Blocks uint64 // Atime is the time of last access. Atime uint64 // Mtime is the time of last modification. Mtime uint64 // Ctime is the time of last status change. Ctime uint64 // AtimeNsec is the nanosecond part of Atime. AtimeNsec uint32 // MtimeNsec is the nanosecond part of Mtime. MtimeNsec uint32 // CtimeNsec is the nanosecond part of Ctime. CtimeNsec uint32 // Mode contains the file type and mode. Mode uint32 // Nlink is the number of the hard links. Nlink uint32 // UID is user ID of the owner. UID uint32 // GID is group ID of the owner. GID uint32 // Rdev is the device ID if this is a special file. Rdev uint32 // BlkSize is the block size for filesystem I/O. BlkSize uint32 _ uint32 } // ATimeNsec returns the last access time as the total time since the unix epoch // in nanoseconds. func (a FUSEAttr) ATimeNsec() int64 { return int64(a.Atime)*time.Second.Nanoseconds() + int64(a.AtimeNsec) } // MTimeNsec returns the last modification time as the total time since the unix // epoch in nanoseconds. func (a FUSEAttr) MTimeNsec() int64 { return int64(a.Mtime)*time.Second.Nanoseconds() + int64(a.MtimeNsec) } // CTimeNsec returns the last change time as the total time since the unix epoch // in nanoseconds. func (a FUSEAttr) CTimeNsec() int64 { return int64(a.Ctime)*time.Second.Nanoseconds() + int64(a.CtimeNsec) }
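// The sketch below is illustrative and not part of the original fuse.go: it
// shows how FUSE_GETATTR_FH selects what a FUSE_GETATTR request refers to.
// When the flag is set, the server uses the file handle Fh from the request
// body (analogous to fstat(2)); otherwise it uses the node ID carried in the
// FUSEHeaderIn of the request. The helper name is hypothetical.
func newGetAttrForHandle(fh uint64) FUSEGetAttrIn {
	return FUSEGetAttrIn{
		GetAttrFlags: FUSE_GETATTR_FH,
		Fh:           fh,
	}
}

// FUSEAttrOut is the reply sent by the daemon to the kernel // for FUSEGetAttrIn and FUSESetAttrIn.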
// // +marshal type FUSEAttrOut struct { // AttrValid and AttrValidNsec describe the attribute cache duration. AttrValid uint64 // AttrValidNsec is the nanosecond part of the attribute cache duration. AttrValidNsec uint32 _ uint32 // Attr contains the metadata returned from the FUSE server. Attr FUSEAttr } // FUSEEntryOut is the reply sent by the daemon to the kernel // for FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and // FUSE_LOOKUP. // // +marshal type FUSEEntryOut struct { // NodeID is the ID for current inode. NodeID uint64 // Generation is the generation number of inode. // Used to identify an inode that has different IDs at different times. Generation uint64 // EntryValid indicates timeout for an entry. EntryValid uint64 // AttrValid indicates timeout for an entry's attributes. AttrValid uint64 // EntryValidNsec indicates timeout for an entry in nanosecond. EntryValidNSec uint32 // AttrValidNsec indicates timeout for an entry's attributes in nanosecond. AttrValidNSec uint32 // Attr contains the attributes of an entry. Attr FUSEAttr } // CString represents a null-terminated string which can be marshalled. // // +marshal dynamic type CString string // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *CString) MarshalBytes(buf []byte) []byte { copy(buf, *s) buf[len(*s)] = 0 // null char return buf[s.SizeBytes():] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *CString) UnmarshalBytes(buf []byte) []byte { panic("Unimplemented, CString is never unmarshalled") } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *CString) SizeBytes() int { // 1 extra byte for null-terminated string. return len(*s) + 1 } // FUSELookupIn is the request sent by the kernel to the daemon // to look up a file name. // // +marshal dynamic type FUSELookupIn struct { // Name is a file name to be looked up. Name CString } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *FUSELookupIn) UnmarshalBytes(buf []byte) []byte { panic("Unimplemented, FUSELookupIn is never unmarshalled") } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FUSELookupIn) MarshalBytes(buf []byte) []byte { return r.Name.MarshalBytes(buf) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSELookupIn) SizeBytes() int { return r.Name.SizeBytes() } // MAX_NON_LFS indicates the maximum offset without large file support. const MAX_NON_LFS = ((1 << 31) - 1) // Flags returned by the OPEN request. const ( // FOPEN_DIRECT_IO indicates bypassing page cache for this opened file. FOPEN_DIRECT_IO = 1 << 0 // FOPEN_KEEP_CACHE avoids invalidating the data cache on open. FOPEN_KEEP_CACHE = 1 << 1 // FOPEN_NONSEEKABLE indicates the file is not seekable. FOPEN_NONSEEKABLE = 1 << 2 ) // FUSEOpenIn is the request sent by the kernel to the daemon, // to negotiate flags and get file handle. // // +marshal type FUSEOpenIn struct { // Flags of this open request. Flags uint32 _ uint32 } // FUSEOpenOut is the reply sent by the daemon to the kernel // for FUSEOpenIn. // // +marshal type FUSEOpenOut struct { // Fh is the file handler for opened files. Fh uint64 // OpenFlag for the opened files. OpenFlag uint32 _ uint32 } // FUSECreateOut is the reply sent by the daemon to the kernel // for FUSECreateMeta. // // +marshal type FUSECreateOut struct { FUSEEntryOut FUSEOpenOut }
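// The sketch below is illustrative and not part of the original fuse.go: it
// shows how the pieces above fit together on the wire for FUSE_LOOKUP. A
// request is a FUSEHeaderIn whose Len covers the header plus the
// null-terminated name, followed by the name marshalled as a CString. The
// helper name is hypothetical, and it relies on the MarshalBytes helpers
// generated from the +marshal annotations.
func encodeLookupRequest(unique FUSEOpID, dirNodeID uint64, name string) []byte {
	body := FUSELookupIn{Name: CString(name)}
	hdr := FUSEHeaderIn{
		Len:    SizeOfFUSEHeaderIn + uint32(body.SizeBytes()),
		Opcode: FUSE_LOOKUP,
		Unique: unique,
		NodeID: dirNodeID,
	}
	buf := make([]byte, hdr.Len)
	rest := hdr.MarshalBytes(buf)
	body.MarshalBytes(rest)
	return buf
}

// FUSE_READ flags, consistent with the ones in include/uapi/linux/fuse.h.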
const ( FUSE_READ_LOCKOWNER = 1 << 1 ) // FUSEReadIn is the request sent by the kernel to the daemon // for FUSE_READ. // // +marshal type FUSEReadIn struct { // Fh is the file handle in userspace. Fh uint64 // Offset is the read offset. Offset uint64 // Size is the number of bytes to read. Size uint32 // ReadFlags for this FUSE_READ request. // Currently only contains FUSE_READ_LOCKOWNER. ReadFlags uint32 // LockOwner is the id of the lock owner if there is one. LockOwner uint64 // Flags for the underlying file. Flags uint32 _ uint32 } // FUSEWriteIn is the first part of the payload of the // request sent by the kernel to the daemon // for FUSE_WRITE (struct for FUSE version >= 7.9). // // The second part of the payload is the // binary bytes of the data to be written. // See FUSEWritePayloadIn that combines header & payload. // // +marshal type FUSEWriteIn struct { // Fh is the file handle in userspace. Fh uint64 // Offset is the write offset. Offset uint64 // Size is the number of bytes to write. Size uint32 // WriteFlags for this FUSE_WRITE request. WriteFlags uint32 // LockOwner is the id of the lock owner if there is one. LockOwner uint64 // Flags for the underlying file. Flags uint32 _ uint32 } // SizeOfFUSEWriteIn is the size of the FUSEWriteIn struct. var SizeOfFUSEWriteIn = uint32((*FUSEWriteIn)(nil).SizeBytes()) // FUSEWritePayloadIn combines the header (FUSEWriteIn) and the payload // in a single marshallable struct when sending a request from the // kernel to the daemon. // // +marshal dynamic type FUSEWritePayloadIn struct { Header FUSEWriteIn Payload primitive.ByteSlice } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSEWritePayloadIn) SizeBytes() int { if r == nil { return (*FUSEWriteIn)(nil).SizeBytes() } return r.Header.SizeBytes() + r.Payload.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FUSEWritePayloadIn) MarshalBytes(dst []byte) []byte { dst = r.Header.MarshalUnsafe(dst) dst = r.Payload.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *FUSEWritePayloadIn) UnmarshalBytes(src []byte) []byte { panic("Unimplemented, FUSEWritePayloadIn is never unmarshalled") } // FUSEWriteOut is the payload of the reply sent by the daemon to the kernel // for a FUSE_WRITE request. // // +marshal type FUSEWriteOut struct { // Size is the number of bytes written. Size uint32 _ uint32 } // FUSEReleaseIn is the request sent by the kernel to the daemon // when there is no more reference to a file. // // +marshal type FUSEReleaseIn struct { // Fh is the file handler for the file to be released. Fh uint64 // Flags of the file. Flags uint32 // ReleaseFlags of this release request. ReleaseFlags uint32 // LockOwner is the id of the lock owner if there is one. LockOwner uint64 } // FUSECreateMeta contains all the static fields of FUSECreateIn, // which is used for FUSE_CREATE. // // +marshal type FUSECreateMeta struct { // Flags of the creating file. Flags uint32 // Mode is the mode of the creating file. Mode uint32 // Umask is the current file mode creation mask. Umask uint32 _ uint32 } // FUSERenameIn is sent by the kernel for FUSE_RENAME. // // +marshal dynamic type FUSERenameIn struct { Newdir primitive.Uint64 Oldname CString Newname CString } // MarshalBytes implements marshal.Marshallable.MarshalBytes.
func (r *FUSERenameIn) MarshalBytes(dst []byte) []byte { dst = r.Newdir.MarshalBytes(dst) dst = r.Oldname.MarshalBytes(dst) return r.Newname.MarshalBytes(dst) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *FUSERenameIn) UnmarshalBytes(buf []byte) []byte { panic("Unimplemented, FUSERenameIn is never unmarshalled") } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSERenameIn) SizeBytes() int { return r.Newdir.SizeBytes() + r.Oldname.SizeBytes() + r.Newname.SizeBytes() } // FUSECreateIn contains all the arguments sent by the kernel to the daemon, to // atomically create and open a new regular file. // // +marshal dynamic type FUSECreateIn struct { // CreateMeta contains mode, rdev and umask fields for FUSE_CREATE. CreateMeta FUSECreateMeta // Name is the name of the node to create. Name CString } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FUSECreateIn) MarshalBytes(buf []byte) []byte { buf = r.CreateMeta.MarshalBytes(buf) return r.Name.MarshalBytes(buf) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *FUSECreateIn) UnmarshalBytes(buf []byte) []byte { panic("Unimplemented, FUSECreateIn is never unmarshalled") } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSECreateIn) SizeBytes() int { return r.CreateMeta.SizeBytes() + r.Name.SizeBytes() } // FUSEMknodMeta contains all the static fields of FUSEMknodIn, // which is used for FUSE_MKNOD. // // +marshal type FUSEMknodMeta struct { // Mode of the inode to create. Mode uint32 // Rdev encodes device major and minor information. Rdev uint32 // Umask is the current file mode creation mask. Umask uint32 _ uint32 } // FUSEMknodIn contains all the arguments sent by the kernel // to the daemon, to create a new file node. // // +marshal dynamic type FUSEMknodIn struct { // MknodMeta contains mode, rdev and umask fields for FUSE_MKNOD. MknodMeta FUSEMknodMeta // Name is the name of the node to create. Name CString } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FUSEMknodIn) MarshalBytes(buf []byte) []byte { buf = r.MknodMeta.MarshalBytes(buf) return r.Name.MarshalBytes(buf) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *FUSEMknodIn) UnmarshalBytes(buf []byte) []byte { panic("Unimplemented, FUSEMknodIn is never unmarshalled") } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSEMknodIn) SizeBytes() int { return r.MknodMeta.SizeBytes() + r.Name.SizeBytes() } // FUSESymlinkIn is the request sent by the kernel to the daemon, // to create a symbolic link. // // +marshal dynamic type FUSESymlinkIn struct { // Name of symlink to create. Name CString // Target of the symlink. Target CString } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FUSESymlinkIn) MarshalBytes(buf []byte) []byte { buf = r.Name.MarshalBytes(buf) return r.Target.MarshalBytes(buf) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *FUSESymlinkIn) UnmarshalBytes(buf []byte) []byte { panic("Unimplemented, FUSESymlinkIn is never unmarshalled") } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSESymlinkIn) SizeBytes() int { return r.Name.SizeBytes() + r.Target.SizeBytes() } // FUSELinkIn is the request sent by the kernel to create a hard link. // // +marshal dynamic type FUSELinkIn struct { // OldNodeID is the ID of the inode that is being linked to. OldNodeID primitive.Uint64 // Name of the new hard link to create.
Name CString } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FUSELinkIn) MarshalBytes(buf []byte) []byte { buf = r.OldNodeID.MarshalBytes(buf) return r.Name.MarshalBytes(buf) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *FUSELinkIn) UnmarshalBytes(buf []byte) []byte { panic("Unimplemented, FUSELinkIn is never unmarshalled") } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSELinkIn) SizeBytes() int { return r.OldNodeID.SizeBytes() + r.Name.SizeBytes() } // FUSEEmptyIn is used by operations without request body. // // +marshal dynamic type FUSEEmptyIn struct{} // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FUSEEmptyIn) MarshalBytes(buf []byte) []byte { return buf } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *FUSEEmptyIn) UnmarshalBytes(buf []byte) []byte { panic("Unimplemented, FUSEEmptyIn is never unmarshalled") } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSEEmptyIn) SizeBytes() int { return 0 } // FUSEMkdirMeta contains all the static fields of FUSEMkdirIn, // which is used for FUSE_MKDIR. // // +marshal type FUSEMkdirMeta struct { // Mode of the directory of create. Mode uint32 // Umask is the user file creation mask. Umask uint32 } // FUSEMkdirIn contains all the arguments sent by the kernel // to the daemon, to create a new directory. // // +marshal dynamic type FUSEMkdirIn struct { // MkdirMeta contains Mode and Umask of the directory to create. MkdirMeta FUSEMkdirMeta // Name of the directory to create. Name CString } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FUSEMkdirIn) MarshalBytes(buf []byte) []byte { buf = r.MkdirMeta.MarshalBytes(buf) return r.Name.MarshalBytes(buf) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *FUSEMkdirIn) UnmarshalBytes(buf []byte) []byte { panic("Unimplemented, FUSEMkdirIn is never unmarshalled") } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSEMkdirIn) SizeBytes() int { return r.MkdirMeta.SizeBytes() + r.Name.SizeBytes() } // FUSERmDirIn is the request sent by the kernel to the daemon // when trying to remove a directory. // // +marshal dynamic type FUSERmDirIn struct { // Name is a directory name to be removed. Name CString } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FUSERmDirIn) MarshalBytes(buf []byte) []byte { return r.Name.MarshalBytes(buf) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *FUSERmDirIn) UnmarshalBytes(buf []byte) []byte { panic("Unimplemented, FUSERmDirIn is never unmarshalled") } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSERmDirIn) SizeBytes() int { return r.Name.SizeBytes() } // FUSEDirents is a list of Dirents received from the FUSE daemon server. // It is used for FUSE_READDIR. // // +marshal dynamic type FUSEDirents struct { Dirents []*FUSEDirent } // FUSEDirent is a Dirent received from the FUSE daemon server. // It is used for FUSE_READDIR. // // +marshal dynamic type FUSEDirent struct { // Meta contains all the static fields of FUSEDirent. Meta FUSEDirentMeta // Name is the filename of the dirent. Name string } // FUSEDirentMeta contains all the static fields of FUSEDirent. // It is used for FUSE_READDIR. // // +marshal type FUSEDirentMeta struct { // Inode of the dirent. Ino uint64 // Offset of the dirent. Off uint64 // NameLen is the length of the dirent name. NameLen uint32 // Type of the dirent. 
Type uint32 } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSEDirents) SizeBytes() int { var sizeBytes int for _, dirent := range r.Dirents { sizeBytes += dirent.SizeBytes() } return sizeBytes } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FUSEDirents) MarshalBytes(buf []byte) []byte { panic("Unimplemented, FUSEDirents is never marshalled") } // UnmarshalBytes deserializes FUSEDirents from the src buffer. func (r *FUSEDirents) UnmarshalBytes(src []byte) []byte { for { if len(src) <= (*FUSEDirentMeta)(nil).SizeBytes() { break } // Its unclear how many dirents there are in src. Each dirent is dynamically // sized and so we can't make assumptions about how many dirents we can allocate. if r.Dirents == nil { r.Dirents = make([]*FUSEDirent, 0) } // We have to allocate a struct for each dirent - there must be a better way // to do this. Linux allocates 1 page to store all the dirents and then // simply reads them from the page. var dirent FUSEDirent src = dirent.UnmarshalBytes(src) r.Dirents = append(r.Dirents, &dirent) } return src } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSEDirent) SizeBytes() int { dataSize := r.Meta.SizeBytes() + len(r.Name) // Each Dirent must be padded such that its size is a multiple // of FUSE_DIRENT_ALIGN. Similar to the fuse dirent alignment // in linux/fuse.h. return (dataSize + (FUSE_DIRENT_ALIGN - 1)) & ^(FUSE_DIRENT_ALIGN - 1) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FUSEDirent) MarshalBytes(buf []byte) []byte { panic("Unimplemented, FUSEDirent is never marshalled") } // shiftNextDirent advances buf to the start of the next dirent, per // FUSE ABI. buf should begin at the start of a dirent. func (r *FUSEDirent) shiftNextDirent(buf []byte) []byte { nextOff := r.SizeBytes() if nextOff > len(buf) { // Handle overflow. return buf[len(buf):] } return buf[nextOff:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *FUSEDirent) UnmarshalBytes(src []byte) []byte { srcP := r.Meta.UnmarshalBytes(src) if r.Meta.NameLen > FUSE_NAME_MAX || r.Meta.NameLen > uint32(len(srcP)) { // The name is too long and therefore invalid. We don't // need to unmarshal the name since it'll be thrown away. return r.shiftNextDirent(src) } buf := make([]byte, r.Meta.NameLen) name := primitive.ByteSlice(buf) name.UnmarshalBytes(srcP[:r.Meta.NameLen]) r.Name = string(name) return r.shiftNextDirent(src) } // FATTR_* consts are the attribute flags defined in include/uapi/linux/fuse.h. // These should be or-ed together for setattr to know what has been changed. const ( FATTR_MODE = (1 << 0) FATTR_UID = (1 << 1) FATTR_GID = (1 << 2) FATTR_SIZE = (1 << 3) FATTR_ATIME = (1 << 4) FATTR_MTIME = (1 << 5) FATTR_FH = (1 << 6) FATTR_ATIME_NOW = (1 << 7) FATTR_MTIME_NOW = (1 << 8) FATTR_LOCKOWNER = (1 << 9) FATTR_CTIME = (1 << 10) ) // FUSESetAttrIn is the request sent by the kernel to the daemon, // to set the attribute(s) of a file. // // +marshal type FUSESetAttrIn struct { // Valid indicates which attributes are modified by this request. Valid uint32 _ uint32 // Fh is used to identify the file if FATTR_FH is set in Valid. Fh uint64 // Size is the size that the request wants to change to. Size uint64 // LockOwner is the owner of the lock that the request wants to change to. LockOwner uint64 // Atime is the access time that the request wants to change to. Atime uint64 // Mtime is the modification time that the request wants to change to. 
Mtime uint64 // Ctime is the status change time that the request wants to change to. Ctime uint64 // AtimeNsec is the nanosecond part of Atime. AtimeNsec uint32 // MtimeNsec is the nanosecond part of Mtime. MtimeNsec uint32 // CtimeNsec is the nanosecond part of Ctime. CtimeNsec uint32 // Mode is the file mode that the request wants to change to. Mode uint32 _ uint32 // UID is the user ID of the owner that the request wants to change to. UID uint32 // GID is the group ID of the owner that the request wants to change to. GID uint32 _ uint32 } // FUSEUnlinkIn is the request sent by the kernel to the daemon // when trying to unlink a node. // // +marshal dynamic type FUSEUnlinkIn struct { // Name of the node to unlink. Name CString } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FUSEUnlinkIn) MarshalBytes(buf []byte) []byte { return r.Name.MarshalBytes(buf) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *FUSEUnlinkIn) UnmarshalBytes(buf []byte) []byte { panic("Unimplemented, FUSEUnlinkIn is never unmarshalled") } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FUSEUnlinkIn) SizeBytes() int { return r.Name.SizeBytes() } // FUSEFsyncIn is the request sent by the kernel to the daemon // when trying to fsync a file. // // +marshal type FUSEFsyncIn struct { Fh uint64 FsyncFlags uint32 // padding _ uint32 } // FUSEAccessIn is the request sent by the kernel to the daemon when checking // permissions on a file. // // +marshal type FUSEAccessIn struct { Mask uint32 // padding _ uint32 } // FUSEFallocateIn is the request sent by the kernel to the daemon to perform // a fallocate operation. // // +marshal type FUSEFallocateIn struct { Fh uint64 Offset uint64 Length uint64 Mode uint32 // padding _ uint32 } // FUSEFlushIn is the request sent by the kernel to the daemon after a file is // closed. // // +marshal type FUSEFlushIn struct { Fh uint64 _ uint32 // unused _ uint32 // padding LockOwner uint64 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/futex.go000066400000000000000000000043121465435605700230000ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // From <linux/futex.h> and <sys/time.h>. // Flags are used in syscall futex(2). const ( FUTEX_WAIT = 0 FUTEX_WAKE = 1 FUTEX_FD = 2 FUTEX_REQUEUE = 3 FUTEX_CMP_REQUEUE = 4 FUTEX_WAKE_OP = 5 FUTEX_LOCK_PI = 6 FUTEX_UNLOCK_PI = 7 FUTEX_TRYLOCK_PI = 8 FUTEX_WAIT_BITSET = 9 FUTEX_WAKE_BITSET = 10 FUTEX_WAIT_REQUEUE_PI = 11 FUTEX_CMP_REQUEUE_PI = 12 FUTEX_PRIVATE_FLAG = 128 FUTEX_CLOCK_REALTIME = 256 ) // These flags are from <linux/futex.h> and are used in FUTEX_WAKE_OP // to define the operations. const ( FUTEX_OP_SET = 0 FUTEX_OP_ADD = 1 FUTEX_OP_OR = 2 FUTEX_OP_ANDN = 3 FUTEX_OP_XOR = 4 FUTEX_OP_OPARG_SHIFT = 8 FUTEX_OP_CMP_EQ = 0 FUTEX_OP_CMP_NE = 1 FUTEX_OP_CMP_LT = 2 FUTEX_OP_CMP_LE = 3 FUTEX_OP_CMP_GT = 4 FUTEX_OP_CMP_GE = 5 )
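// The sketch below is illustrative and not part of the original futex.go: it
// mirrors the FUTEX_OP macro from include/uapi/linux/futex.h, which packs the
// FUTEX_OP_* and FUTEX_OP_CMP_* values above into the val3 argument of
// FUTEX_WAKE_OP as a 4-bit operation, a 4-bit comparison, a 12-bit operation
// argument and a 12-bit comparison argument. The helper name is hypothetical.
func futexWakeOpArg(op, oparg, cmp, cmparg uint32) uint32 {
	return (op&0xf)<<28 | (cmp&0xf)<<24 | (oparg&0xfff)<<12 | cmparg&0xfff
}

// For example, futexWakeOpArg(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0) encodes
// "add 1 to the second futex word, then wake its waiters if the old value was
// greater than zero".

// FUTEX_TID_MASK is the TID portion of a PI futex word.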
const FUTEX_TID_MASK = 0x3fffffff // Constants used for priority-inheritance futexes. const ( FUTEX_WAITERS = 0x80000000 FUTEX_OWNER_DIED = 0x40000000 ) // FUTEX_BITSET_MATCH_ANY has all bits set. const FUTEX_BITSET_MATCH_ANY = 0xffffffff // ROBUST_LIST_LIMIT protects against a deliberately circular list. const ROBUST_LIST_LIMIT = 2048 // RobustListHead corresponds to Linux's struct robust_list_head. // // +marshal type RobustListHead struct { List uint64 FutexOffset uint64 ListOpPending uint64 } // SizeOfRobustListHead is the size of a RobustListHead struct. var SizeOfRobustListHead = (*RobustListHead)(nil).SizeBytes() golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/inotify.go000066400000000000000000000073611465435605700233350ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Inotify events observable by userspace. These directly correspond to // filesystem operations and there may only be a single of them per inotify // event read from an inotify fd. const ( // IN_ACCESS indicates a file was accessed. IN_ACCESS = 0x00000001 // IN_MODIFY indicates a file was modified. IN_MODIFY = 0x00000002 // IN_ATTRIB indicates a watch target's metadata changed. IN_ATTRIB = 0x00000004 // IN_CLOSE_WRITE indicates a writable file was closed. IN_CLOSE_WRITE = 0x00000008 // IN_CLOSE_NOWRITE indicates a non-writable file was closed. IN_CLOSE_NOWRITE = 0x00000010 // IN_OPEN indicates a file was opened. IN_OPEN = 0x00000020 // IN_MOVED_FROM indicates a file was moved from X. IN_MOVED_FROM = 0x00000040 // IN_MOVED_TO indicates a file was moved to Y. IN_MOVED_TO = 0x00000080 // IN_CREATE indicates a file was created in a watched directory. IN_CREATE = 0x00000100 // IN_DELETE indicates a file was deleted in a watched directory. IN_DELETE = 0x00000200 // IN_DELETE_SELF indicates a watch target itself was deleted. IN_DELETE_SELF = 0x00000400 // IN_MOVE_SELF indicates a watch target itself was moved. IN_MOVE_SELF = 0x00000800 // IN_ALL_EVENTS is a mask for all observable userspace events. IN_ALL_EVENTS = 0x00000fff ) // Inotify control events. These may be present in their own events, or ORed // with other observable events. const ( // IN_UNMOUNT indicates the backing filesystem was unmounted. IN_UNMOUNT = 0x00002000 // IN_Q_OVERFLOW indicates the event queued overflowed. IN_Q_OVERFLOW = 0x00004000 // IN_IGNORED indicates a watch was removed, either implicitly or through // inotify_rm_watch(2). IN_IGNORED = 0x00008000 // IN_ISDIR indicates the subject of an event was a directory. IN_ISDIR = 0x40000000 ) // Feature flags for inotify_add_watch(2). const ( // IN_ONLYDIR indicates that a path should be watched only if it's a // directory. IN_ONLYDIR = 0x01000000 // IN_DONT_FOLLOW indicates that the watch path shouldn't be resolved if // it's a symlink. IN_DONT_FOLLOW = 0x02000000 // IN_EXCL_UNLINK indicates events to this watch from unlinked objects // should be filtered out. 
IN_EXCL_UNLINK = 0x04000000 // IN_MASK_ADD indicates the provided mask should be ORed into any existing // watch on the provided path. IN_MASK_ADD = 0x20000000 // IN_ONESHOT indicates the watch should be removed after one event. IN_ONESHOT = 0x80000000 ) // Feature flags for inotify_init1(2). const ( // IN_CLOEXEC is an alias for O_CLOEXEC. It indicates that the inotify // fd should be closed on exec(2) and friends. IN_CLOEXEC = 0x00080000 // IN_NONBLOCK is an alias for O_NONBLOCK. It indicates I/O syscall on the // inotify fd should not block. IN_NONBLOCK = 0x00000800 ) // ALL_INOTIFY_BITS contains all the bits for all possible inotify events. It's // defined in the Linux source at "include/linux/inotify.h". const ALL_INOTIFY_BITS = IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | IN_MOVED_TO | IN_CREATE | IN_DELETE | IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT | IN_Q_OVERFLOW | IN_IGNORED | IN_ONLYDIR | IN_DONT_FOLLOW | IN_EXCL_UNLINK | IN_MASK_ADD | IN_ISDIR | IN_ONESHOT golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/ioctl.go000066400000000000000000000107271465435605700227660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // ioctl(2) requests provided by asm-generic/ioctls.h // // These are ordered by request number (low byte). const ( TCGETS = 0x00005401 TCSETS = 0x00005402 TCSETSW = 0x00005403 TCSETSF = 0x00005404 TCSBRK = 0x00005409 TIOCEXCL = 0x0000540c TIOCNXCL = 0x0000540d TIOCSCTTY = 0x0000540e TIOCGPGRP = 0x0000540f TIOCSPGRP = 0x00005410 TIOCOUTQ = 0x00005411 TIOCSTI = 0x00005412 TIOCGWINSZ = 0x00005413 TIOCSWINSZ = 0x00005414 TIOCMGET = 0x00005415 TIOCMBIS = 0x00005416 TIOCMBIC = 0x00005417 TIOCMSET = 0x00005418 TIOCINQ = 0x0000541b FIONREAD = TIOCINQ FIONBIO = 0x00005421 TIOCSETD = 0x00005423 TIOCNOTTY = 0x00005422 TIOCGETD = 0x00005424 TCSBRKP = 0x00005425 TIOCSBRK = 0x00005427 TIOCCBRK = 0x00005428 TIOCGSID = 0x00005429 TIOCGPTN = 0x80045430 TIOCSPTLCK = 0x40045431 TIOCGDEV = 0x80045432 TIOCVHANGUP = 0x00005437 TCFLSH = 0x0000540b TIOCCONS = 0x0000541d TIOCSSERIAL = 0x0000541f TIOCGEXCL = 0x80045440 TIOCGPTPEER = 0x80045441 TIOCGICOUNT = 0x0000545d FIONCLEX = 0x00005450 FIOCLEX = 0x00005451 FIOASYNC = 0x00005452 FIOSETOWN = 0x00008901 SIOCSPGRP = 0x00008902 FIOGETOWN = 0x00008903 SIOCGPGRP = 0x00008904 ) // ioctl(2) requests provided by uapi/linux/sockios.h const ( SIOCGIFNAME = 0x8910 SIOCGIFCONF = 0x8912 SIOCGIFFLAGS = 0x8913 SIOCGIFADDR = 0x8915 SIOCGIFDSTADDR = 0x8917 SIOCGIFBRDADDR = 0x8919 SIOCGIFNETMASK = 0x891b SIOCGIFMETRIC = 0x891d SIOCGIFMTU = 0x8921 SIOCGIFMEM = 0x891f SIOCGIFHWADDR = 0x8927 SIOCGIFINDEX = 0x8933 SIOCGIFPFLAGS = 0x8935 SIOCGIFTXQLEN = 0x8942 SIOCETHTOOL = 0x8946 SIOCGMIIPHY = 0x8947 SIOCGMIIREG = 0x8948 SIOCGIFMAP = 0x8970 ) // ioctl(2) requests provided by uapi/asm-generic/sockios.h const ( SIOCGSTAMP = 0x8906 ) // ioctl(2) directions. Used to calculate requests number. // Constants from asm-generic/ioctl.h. 
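// Illustrative sketch (hypothetical helper, not a gVisor API): an ioctl(2)
// request number packs four fields -- nr (bits 0-7), type (bits 8-15), size
// (bits 16-29) and direction (bits 30-31) -- using the IOC_* layout constants
// defined just below. The package's IOC_NR and IOC_SIZE helpers extract two
// of these fields in exactly this way.
func decodeIoctlRequest(req uint32) (dir, typ, nr, size uint32) {
	dir = (req >> IOC_DIRSHIFT) & ((1 << IOC_DIRBITS) - 1)
	typ = (req >> IOC_TYPESHIFT) & ((1 << IOC_TYPEBITS) - 1)
	nr = (req >> IOC_NRSHIFT) & ((1 << IOC_NRBITS) - 1)
	size = (req >> IOC_SIZESHIFT) & ((1 << IOC_SIZEBITS) - 1)
	return dir, typ, nr, size
}

// For example, TIOCGPTN (0x80045430) decodes to dir=IOC_READ, typ='T' (0x54),
// nr=0x30 and size=4, i.e. "read a 4-byte value".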
const ( IOC_NONE = 0 IOC_WRITE = 1 IOC_READ = 2 ) // Constants from asm-generic/ioctl.h. const ( IOC_NRBITS = 8 IOC_TYPEBITS = 8 IOC_SIZEBITS = 14 IOC_DIRBITS = 2 IOC_NRSHIFT = 0 IOC_TYPESHIFT = IOC_NRSHIFT + IOC_NRBITS IOC_SIZESHIFT = IOC_TYPESHIFT + IOC_TYPEBITS IOC_DIRSHIFT = IOC_SIZESHIFT + IOC_SIZEBITS ) // IOC outputs the result of _IOC macro in include/uapi/asm-generic/ioctl.h. func IOC(dir, typ, nr, size uint32) uint32 { return uint32(dir)<> IOC_NRSHIFT) & ((1 << IOC_NRBITS) - 1) } // IOC_SIZE outputs the result of IOC_SIZE macro in // include/uapi/asm-generic/ioctl.h. func IOC_SIZE(nr uint32) uint32 { return (nr >> IOC_SIZESHIFT) & ((1 << IOC_SIZEBITS) - 1) } // Kcov ioctls from include/uapi/linux/kcov.h. var ( KCOV_INIT_TRACE = IOR('c', 1, 8) KCOV_ENABLE = IO('c', 100) KCOV_DISABLE = IO('c', 101) ) // Kcov trace types from include/uapi/linux/kcov.h. const ( KCOV_TRACE_PC = 0 KCOV_TRACE_CMP = 1 ) // Kcov state constants from include/uapi/linux/kcov.h. const ( KCOV_MODE_DISABLED = 0 KCOV_MODE_INIT = 1 KCOV_MODE_TRACE_PC = 2 KCOV_MODE_TRACE_CMP = 3 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/ioctl_tun.go000066400000000000000000000016621465435605700236520ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // ioctl(2) request numbers from linux/if_tun.h var ( TUNSETIFF = IOW('T', 202, 4) TUNGETIFF = IOR('T', 210, 4) ) // Flags from net/if_tun.h const ( IFF_TUN = 0x0001 IFF_TAP = 0x0002 IFF_NO_PI = 0x1000 IFF_NOFILTER = 0x1000 // According to linux/if_tun.h "This flag has no real effect" IFF_ONE_QUEUE = 0x2000 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/iouring.go000066400000000000000000000170231465435605700233240ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Constants for io_uring_setup(2). See include/uapi/linux/io_uring.h. const ( IORING_SETUP_IOPOLL = (1 << 0) IORING_SETUP_SQPOLL = (1 << 1) IORING_SETUP_SQ_AFF = (1 << 2) IORING_SETUP_CQSIZE = (1 << 3) IORING_SETUP_CLAMP = (1 << 4) IORING_SETUP_ATTACH_WQ = (1 << 5) IORING_SETUP_R_DISABLED = (1 << 6) IORING_SETUP_SUBMIT_ALL = (1 << 7) ) // Constants for io_uring_enter(2). See include/uapi/linux/io_uring.h. const ( IORING_ENTER_GETEVENTS = (1 << 0) ) // Constants for IoUringParams.Features. See include/uapi/linux/io_uring.h. const ( IORING_FEAT_SINGLE_MMAP = (1 << 0) ) // Constants for IO_URING. See include/uapi/linux/io_uring.h. 
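// Illustrative sketch (hypothetical helper; the set of supported flags is an
// assumption for this example, not gVisor's actual policy): io_uring_setup(2)
// receives the IORING_SETUP_* bits above in IOUringParams.Flags, and an
// implementation normally rejects any bits it does not support.
func ioUringSetupFlagsSupported(flags uint32) bool {
	const supported = IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP
	return flags&^uint32(supported) == 0
}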
const ( IORING_SETUP_COOP_TASKRUN = (1 << 8) IORING_SETUP_TASKRUN_FLAG = (1 << 9) IORING_SETUP_SQE128 = (1 << 10) IORING_SETUP_CQE32 = (1 << 11) ) // Constants for IO_URING. See io_uring/io_uring.c. const ( IORING_MAX_ENTRIES = (1 << 15) // 32768 IORING_MAX_CQ_ENTRIES = (2 * IORING_MAX_ENTRIES) ) // Constants for the offsets for the application to mmap the data it needs. // See include/uapi/linux/io_uring.h. const ( IORING_OFF_SQ_RING = 0 IORING_OFF_CQ_RING = 0x8000000 IORING_OFF_SQES = 0x10000000 ) // Constants for the IO_URING opcodes. See include/uapi/linux/io_uring.h. const ( IORING_OP_NOP = 0 IORING_OP_READV = 1 ) // IORingIndex represents SQE array indexes. // // +marshal type IORingIndex uint32 // IOSqRingOffsets implements io_sqring_offsets struct. // IOSqRingOffsets represents offsets into IORings. // See struct io_sqring_offsets in include/uapi/linux/io_uring.h. // // +marshal type IOSqRingOffsets struct { Head uint32 // Offset to io_rings.sq.head Tail uint32 // Offset to io_rings.sq.tail RingMask uint32 // Offset to io_rings.sq_ring_mask RingEntries uint32 // Offset to io_rings.sq_ring_entries Flags uint32 // Offset to io_rings.sq_flags Dropped uint32 // Offset to io_rings.sq_dropped Array uint32 // Offset to an array of SQE indices Resv1 uint32 // Currently reserved and expected to be zero Resv2 uint64 // Currently reserved and expected to be zero } // IOCqRingOffsets implements io_cqring_offsets struct. // IOCqRingOffsets represents offsets into IORings. // See struct io_cqring_offsets in include/uapi/linux/io_uring.h. // // +marshal type IOCqRingOffsets struct { Head uint32 // Offset to io_rings.cq.head Tail uint32 // Offset to io_rings.cq.tail RingMask uint32 // Offset to io_rings.cq_ring_mask RingEntries uint32 // Offset to io_rings.cq_ring_entries Overflow uint32 // Offset to io_rings.cq_overflow Cqes uint32 // Offset to io_rings.cqes Flags uint32 // Offset to io_rings.cq_flags Resv1 uint32 // Currently reserved and expected to be zero Resv2 uint64 // Currently reserved and expected to be zero } // IOUringParams implements io_uring_params struct. // See struct io_uring_params in include/uapi/linux/io_uring.h. // // +marshal type IOUringParams struct { SqEntries uint32 CqEntries uint32 Flags uint32 SqThreadCPU uint32 SqThreadIdle uint32 Features uint32 WqFd uint32 Resv [3]uint32 SqOff IOSqRingOffsets CqOff IOCqRingOffsets } // IOUringCqe implements IO completion data structure (Completion Queue Entry) // io_uring_cqe struct. As we don't currently support IORING_SETUP_CQE32 flag // its size is 16 bytes. // See struct io_uring_cqe in include/uapi/linux/io_uring.h. // // +marshal // +stateify savable type IOUringCqe struct { UserData uint64 Res int32 Flags uint32 } // IOUring implements io_uring struct. // See struct io_uring in io_uring/io_uring.c. // // +marshal // +stateify savable type IOUring struct { // Both head and tail should be cacheline aligned. And we assume that // cacheline size is 64 bytes. Head uint32 _ [60]byte Tail uint32 _ [60]byte } // IORings implements io_rings struct. // This struct describes layout of the mapped region backed by the ringBuffersFile. // See struct io_rings in io_uring/io_uring.c. 
// // +marshal // +stateify savable type IORings struct { Sq IOUring Cq IOUring SqRingMask uint32 CqRingMask uint32 SqRingEntries uint32 CqRingEntries uint32 sqDropped uint32 sqFlags int32 cqFlags uint32 CqOverflow uint32 _ [32]byte // Padding so cqes is cacheline aligned // Linux has an additional field struct io_uring_cqe cqes[], which represents // a dynamic array. We don't include it here in order to enable marshalling. } // IOUringSqe implements io_uring_sqe struct. // This struct represents IO submission data structure (Submission Queue Entry). As we don't yet // support IORING_SETUP_SQE128 flag, its size is 64 bytes with no extra padding at the end. // See include/uapi/linux/io_uring.h. // // +marshal // +stateify savable type IOUringSqe struct { Opcode uint8 Flags uint8 IoPrio uint16 Fd int32 OffOrAddrOrCmdOp uint64 AddrOrSpliceOff uint64 Len uint32 specialFlags uint32 UserData uint64 BufIndexOrGroup uint16 personality uint16 spliceFDOrFileIndex int32 addr3 uint64 _ uint64 } const ( _IOSqRingOffset = 0 // +checkoffset . IORings.Sq _IOSqRingOffsetHead = 0 // +checkoffset . IOUring.Head _IOSqRingOffsetTail = 64 // +checkoffset . IOUring.Tail _IOSqRingOffsetMask = 256 // +checkoffset . IORings.SqRingMask _IOSqRingOffsetEntries = 264 // +checkoffset . IORings.SqRingEntries _IOSqRingOffsetFlags = 276 // +checkoffset . IORings.sqFlags _IOSqRingOffsetDropped = 272 // +checkoffset . IORings.sqDropped ) // PreComputedIOSqRingOffsets returns precomputed values for IOSqRingOffsets. func PreComputedIOSqRingOffsets() IOSqRingOffsets { return IOSqRingOffsets{ Head: _IOSqRingOffset + _IOSqRingOffsetHead, Tail: _IOSqRingOffset + _IOSqRingOffsetTail, RingMask: _IOSqRingOffsetMask, RingEntries: _IOSqRingOffsetEntries, Flags: _IOSqRingOffsetFlags, Dropped: _IOSqRingOffsetDropped, } } const ( _IOCqRingOffset = 128 // +checkoffset . IORings.Cq _IOCqRingOffsetHead = 0 // +checkoffset . IOUring.Head _IOCqRingOffsetTail = 64 // +checkoffset . IOUring.Tail _IOCqRingOffsetMask = 260 // +checkoffset . IORings.CqRingMask _IOCqRingOffsetEntries = 268 // +checkoffset . IORings.CqRingEntries _IOCqRingOffsetFlags = 280 // +checkoffset . IORings.cqFlags _IOCqRingOffsetOverflow = 284 // +checkoffset . IORings.CqOverflow ) // PreComputedIOCqRingOffsets returns precomputed values for IOCqRingOffsets. func PreComputedIOCqRingOffsets() IOCqRingOffsets { return IOCqRingOffsets{ Head: _IOCqRingOffset + _IOCqRingOffsetHead, Tail: _IOCqRingOffset + _IOCqRingOffsetTail, RingMask: _IOCqRingOffsetMask, RingEntries: _IOCqRingOffsetEntries, Overflow: _IOCqRingOffsetOverflow, Flags: _IOCqRingOffsetFlags, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/ip.go000066400000000000000000000107241465435605700222610ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
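// Illustrative sketch (hypothetical helper, not a gVisor API): the
// precomputed offsets above are what userspace sees in IOUringParams.SqOff
// and CqOff after io_uring_setup(2). With IORING_FEAT_SINGLE_MMAP the whole
// IORings area is mapped once at IORING_OFF_SQ_RING, and each ring field
// lives at mapping base + offset. The check below just restates the layout:
// Sq occupies the first two 64-byte cachelines, so Sq.Tail sits at byte 64
// and Cq.Head begins at byte 128.
func ringOffsetsMatchLayout() bool {
	sq := PreComputedIOSqRingOffsets()
	cq := PreComputedIOCqRingOffsets()
	return sq.Tail == 64 && cq.Head == 128
}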
package linux // IP protocols const ( IPPROTO_IP = 0 IPPROTO_ICMP = 1 IPPROTO_IGMP = 2 IPPROTO_IPIP = 4 IPPROTO_TCP = 6 IPPROTO_EGP = 8 IPPROTO_PUP = 12 IPPROTO_UDP = 17 IPPROTO_IDP = 22 IPPROTO_TP = 29 IPPROTO_DCCP = 33 IPPROTO_IPV6 = 41 IPPROTO_RSVP = 46 IPPROTO_GRE = 47 IPPROTO_ESP = 50 IPPROTO_AH = 51 IPPROTO_ICMPV6 = 58 IPPROTO_MTP = 92 IPPROTO_BEETPH = 94 IPPROTO_ENCAP = 98 IPPROTO_PIM = 103 IPPROTO_COMP = 108 IPPROTO_SCTP = 132 IPPROTO_UDPLITE = 136 IPPROTO_MPLS = 137 IPPROTO_RAW = 255 ) // Socket options from uapi/linux/in.h const ( IP_TOS = 1 IP_TTL = 2 IP_HDRINCL = 3 IP_OPTIONS = 4 IP_ROUTER_ALERT = 5 IP_RECVOPTS = 6 IP_RETOPTS = 7 IP_PKTINFO = 8 IP_PKTOPTIONS = 9 IP_MTU_DISCOVER = 10 IP_RECVERR = 11 IP_RECVTTL = 12 IP_RECVTOS = 13 IP_MTU = 14 IP_FREEBIND = 15 IP_IPSEC_POLICY = 16 IP_XFRM_POLICY = 17 IP_PASSSEC = 18 IP_TRANSPARENT = 19 IP_ORIGDSTADDR = 20 IP_RECVORIGDSTADDR = IP_ORIGDSTADDR IP_MINTTL = 21 IP_NODEFRAG = 22 IP_CHECKSUM = 23 IP_BIND_ADDRESS_NO_PORT = 24 IP_RECVFRAGSIZE = 25 IP_MULTICAST_IF = 32 IP_MULTICAST_TTL = 33 IP_MULTICAST_LOOP = 34 IP_ADD_MEMBERSHIP = 35 IP_DROP_MEMBERSHIP = 36 IP_UNBLOCK_SOURCE = 37 IP_BLOCK_SOURCE = 38 IP_ADD_SOURCE_MEMBERSHIP = 39 IP_DROP_SOURCE_MEMBERSHIP = 40 IP_MSFILTER = 41 MCAST_JOIN_GROUP = 42 MCAST_BLOCK_SOURCE = 43 MCAST_UNBLOCK_SOURCE = 44 MCAST_LEAVE_GROUP = 45 MCAST_JOIN_SOURCE_GROUP = 46 MCAST_LEAVE_SOURCE_GROUP = 47 MCAST_MSFILTER = 48 IP_MULTICAST_ALL = 49 IP_UNICAST_IF = 50 ) // IP_MTU_DISCOVER values from uapi/linux/in.h const ( IP_PMTUDISC_DONT = 0 IP_PMTUDISC_WANT = 1 IP_PMTUDISC_DO = 2 IP_PMTUDISC_PROBE = 3 IP_PMTUDISC_INTERFACE = 4 IP_PMTUDISC_OMIT = 5 ) // Socket options from uapi/linux/in6.h const ( IPV6_ADDRFORM = 1 IPV6_2292PKTINFO = 2 IPV6_2292HOPOPTS = 3 IPV6_2292DSTOPTS = 4 IPV6_2292RTHDR = 5 IPV6_2292PKTOPTIONS = 6 IPV6_CHECKSUM = 7 IPV6_2292HOPLIMIT = 8 IPV6_NEXTHOP = 9 IPV6_FLOWINFO = 11 IPV6_UNICAST_HOPS = 16 IPV6_MULTICAST_IF = 17 IPV6_MULTICAST_HOPS = 18 IPV6_MULTICAST_LOOP = 19 IPV6_ADD_MEMBERSHIP = 20 IPV6_DROP_MEMBERSHIP = 21 IPV6_ROUTER_ALERT = 22 IPV6_MTU_DISCOVER = 23 IPV6_MTU = 24 IPV6_RECVERR = 25 IPV6_V6ONLY = 26 IPV6_JOIN_ANYCAST = 27 IPV6_LEAVE_ANYCAST = 28 IPV6_MULTICAST_ALL = 29 IPV6_FLOWLABEL_MGR = 32 IPV6_FLOWINFO_SEND = 33 IPV6_IPSEC_POLICY = 34 IPV6_XFRM_POLICY = 35 IPV6_HDRINCL = 36 IPV6_RECVPKTINFO = 49 IPV6_PKTINFO = 50 IPV6_RECVHOPLIMIT = 51 IPV6_HOPLIMIT = 52 IPV6_RECVHOPOPTS = 53 IPV6_HOPOPTS = 54 IPV6_RTHDRDSTOPTS = 55 IPV6_RECVRTHDR = 56 IPV6_RTHDR = 57 IPV6_RECVDSTOPTS = 58 IPV6_DSTOPTS = 59 IPV6_RECVPATHMTU = 60 IPV6_PATHMTU = 61 IPV6_DONTFRAG = 62 IPV6_RECVTCLASS = 66 IPV6_TCLASS = 67 IPV6_AUTOFLOWLABEL = 70 IPV6_ADDR_PREFERENCES = 72 IPV6_MINHOPCOUNT = 73 IPV6_ORIGDSTADDR = 74 IPV6_RECVORIGDSTADDR = IPV6_ORIGDSTADDR IPV6_TRANSPARENT = 75 IPV6_UNICAST_IF = 76 IPV6_RECVFRAGSIZE = 77 IPV6_FREEBIND = 78 ) // Socket options from uapi/linux/icmpv6.h const ( ICMPV6_FILTER = 1 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/ipc.go000066400000000000000000000026311465435605700224220ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Control commands used with semctl, shmctl, and msgctl. // // Source: include/uapi/linux/ipc.h. const ( IPC_RMID = 0 IPC_SET = 1 IPC_STAT = 2 IPC_INFO = 3 ) // Resource get request flags. // // Source: include/uapi/linux/ipc.h const ( IPC_CREAT = 00001000 IPC_EXCL = 00002000 IPC_NOWAIT = 00004000 ) // IPC flags. const ( IPC_PRIVATE = 0 ) // In Linux, amd64 does not enable CONFIG_ARCH_WANT_IPC_PARSE_VERSION, so SysV // IPC unconditionally uses the "new" 64-bit structures that are needed for // features like 32-bit UIDs. // IPCPerm is equivalent to struct ipc64_perm. // // +marshal type IPCPerm struct { Key uint32 UID uint32 GID uint32 CUID uint32 CGID uint32 Mode uint16 _ uint16 Seq uint16 _ uint16 _ uint32 unused1 uint64 unused2 uint64 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/keyctl.go000066400000000000000000000016101465435605700231360ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Constants used by keyctl(2) and other keyrings-related syscalls. // Source: include/uapi/linux/keyctl.h const ( KEY_SPEC_SESSION_KEYRING = -3 ) const ( KEYCTL_GET_KEYRING_ID = 0 KEYCTL_JOIN_SESSION_KEYRING = 1 KEYCTL_SETPERM = 5 KEYCTL_DESCRIBE = 6 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/limits.go000066400000000000000000000054161465435605700231540ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Resources for getrlimit(2)/setrlimit(2)/prlimit(2). const ( RLIMIT_CPU = 0 RLIMIT_FSIZE = 1 RLIMIT_DATA = 2 RLIMIT_STACK = 3 RLIMIT_CORE = 4 RLIMIT_RSS = 5 RLIMIT_NPROC = 6 RLIMIT_NOFILE = 7 RLIMIT_MEMLOCK = 8 RLIMIT_AS = 9 RLIMIT_LOCKS = 10 RLIMIT_SIGPENDING = 11 RLIMIT_MSGQUEUE = 12 RLIMIT_NICE = 13 RLIMIT_RTPRIO = 14 RLIMIT_RTTIME = 15 ) // RLimit corresponds to Linux's struct rlimit. type RLimit struct { // Cur specifies the soft limit. Cur uint64 // Max specifies the hard limit. Max uint64 } const ( // RLimInfinity is RLIM_INFINITY on Linux. 
RLimInfinity = ^uint64(0) // DefaultStackSoftLimit is called _STK_LIM in Linux. DefaultStackSoftLimit = 8 * 1024 * 1024 // DefaultNprocLimit is defined in kernel/fork.c:set_max_threads, and // called MAX_THREADS / 2 in Linux. DefaultNprocLimit = FUTEX_TID_MASK / 2 // DefaultNofileSoftLimit is called INR_OPEN_CUR in Linux. DefaultNofileSoftLimit = 1024 // DefaultNofileHardLimit is called INR_OPEN_MAX in Linux. DefaultNofileHardLimit = 4096 // DefaultMemlockLimit is called MLOCK_LIMIT in Linux. DefaultMemlockLimit = 64 * 1024 // DefaultMsgqueueLimit is called MQ_BYTES_MAX in Linux. DefaultMsgqueueLimit = 819200 ) // InitRLimits is a map of initial rlimits set by Linux in // include/asm-generic/resource.h. var InitRLimits = map[int]RLimit{ RLIMIT_CPU: {RLimInfinity, RLimInfinity}, RLIMIT_FSIZE: {RLimInfinity, RLimInfinity}, RLIMIT_DATA: {RLimInfinity, RLimInfinity}, RLIMIT_STACK: {DefaultStackSoftLimit, RLimInfinity}, RLIMIT_CORE: {0, RLimInfinity}, RLIMIT_RSS: {RLimInfinity, RLimInfinity}, RLIMIT_NPROC: {DefaultNprocLimit, DefaultNprocLimit}, RLIMIT_NOFILE: {DefaultNofileSoftLimit, DefaultNofileHardLimit}, RLIMIT_MEMLOCK: {DefaultMemlockLimit, DefaultMemlockLimit}, RLIMIT_AS: {RLimInfinity, RLimInfinity}, RLIMIT_LOCKS: {RLimInfinity, RLimInfinity}, RLIMIT_SIGPENDING: {0, 0}, RLIMIT_MSGQUEUE: {DefaultMsgqueueLimit, DefaultMsgqueueLimit}, RLIMIT_NICE: {0, 0}, RLIMIT_RTPRIO: {0, 0}, RLIMIT_RTTIME: {RLimInfinity, RLimInfinity}, } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/linux.go000066400000000000000000000024671465435605700230150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package linux contains the constants and types needed to interface with a // Linux kernel. package linux // NumSoftIRQ is the number of software IRQs, exposed via /proc/stat. // // Defined in linux/interrupt.h. const NumSoftIRQ = 10 // Sysinfo is the structure provided by sysinfo on linux versions > 2.3.48. // // +marshal type Sysinfo struct { Uptime int64 Loads [3]uint64 TotalRAM uint64 FreeRAM uint64 SharedRAM uint64 BufferRAM uint64 TotalSwap uint64 FreeSwap uint64 Procs uint16 _ [6]byte // Pad Procs to 64bits. TotalHigh uint64 FreeHigh uint64 Unit uint32 `marshal:"unaligned"` // Struct ends mid-64-bit-word. // The _f field in the glibc version of Sysinfo has size 0 on AMD64. } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/linux_abi_autogen_unsafe.go000066400000000000000000030077231465435605700267160ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package linux import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. 
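// Illustrative sketch (invented interface and type names, shown only to
// explain the idiom): each "var _" line below is a compile-time assertion
// that a type implements marshal.Marshallable. Assigning a typed nil pointer
// to the blank identifier costs nothing at runtime but makes the build fail
// if any interface method is missing. The same idiom in miniature:
type stringer interface{ String() string }

type widget struct{}

func (widget) String() string { return "widget" }

// Fails to compile if widget ever stops implementing stringer.
var _ stringer = (*widget)(nil)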
var _ marshal.Marshallable = (*BPFInstruction)(nil) var _ marshal.Marshallable = (*CString)(nil) var _ marshal.Marshallable = (*CapUserData)(nil) var _ marshal.Marshallable = (*CapUserHeader)(nil) var _ marshal.Marshallable = (*ClockT)(nil) var _ marshal.Marshallable = (*CloneArgs)(nil) var _ marshal.Marshallable = (*ControlMessageCredentials)(nil) var _ marshal.Marshallable = (*ControlMessageHeader)(nil) var _ marshal.Marshallable = (*ControlMessageIPPacketInfo)(nil) var _ marshal.Marshallable = (*ControlMessageIPv6PacketInfo)(nil) var _ marshal.Marshallable = (*ElfHeader64)(nil) var _ marshal.Marshallable = (*ElfProg64)(nil) var _ marshal.Marshallable = (*ElfSection64)(nil) var _ marshal.Marshallable = (*ErrorName)(nil) var _ marshal.Marshallable = (*EthtoolCmd)(nil) var _ marshal.Marshallable = (*EthtoolGFeatures)(nil) var _ marshal.Marshallable = (*EthtoolGetFeaturesBlock)(nil) var _ marshal.Marshallable = (*ExtensionName)(nil) var _ marshal.Marshallable = (*FOwnerEx)(nil) var _ marshal.Marshallable = (*FUSEAccessIn)(nil) var _ marshal.Marshallable = (*FUSEAttr)(nil) var _ marshal.Marshallable = (*FUSEAttrOut)(nil) var _ marshal.Marshallable = (*FUSECreateIn)(nil) var _ marshal.Marshallable = (*FUSECreateMeta)(nil) var _ marshal.Marshallable = (*FUSECreateOut)(nil) var _ marshal.Marshallable = (*FUSEDirent)(nil) var _ marshal.Marshallable = (*FUSEDirentMeta)(nil) var _ marshal.Marshallable = (*FUSEDirents)(nil) var _ marshal.Marshallable = (*FUSEEmptyIn)(nil) var _ marshal.Marshallable = (*FUSEEntryOut)(nil) var _ marshal.Marshallable = (*FUSEFallocateIn)(nil) var _ marshal.Marshallable = (*FUSEFlushIn)(nil) var _ marshal.Marshallable = (*FUSEFsyncIn)(nil) var _ marshal.Marshallable = (*FUSEGetAttrIn)(nil) var _ marshal.Marshallable = (*FUSEHeaderIn)(nil) var _ marshal.Marshallable = (*FUSEHeaderOut)(nil) var _ marshal.Marshallable = (*FUSEInitIn)(nil) var _ marshal.Marshallable = (*FUSEInitOut)(nil) var _ marshal.Marshallable = (*FUSELinkIn)(nil) var _ marshal.Marshallable = (*FUSELookupIn)(nil) var _ marshal.Marshallable = (*FUSEMkdirIn)(nil) var _ marshal.Marshallable = (*FUSEMkdirMeta)(nil) var _ marshal.Marshallable = (*FUSEMknodIn)(nil) var _ marshal.Marshallable = (*FUSEMknodMeta)(nil) var _ marshal.Marshallable = (*FUSEOpID)(nil) var _ marshal.Marshallable = (*FUSEOpcode)(nil) var _ marshal.Marshallable = (*FUSEOpenIn)(nil) var _ marshal.Marshallable = (*FUSEOpenOut)(nil) var _ marshal.Marshallable = (*FUSEReadIn)(nil) var _ marshal.Marshallable = (*FUSEReleaseIn)(nil) var _ marshal.Marshallable = (*FUSERenameIn)(nil) var _ marshal.Marshallable = (*FUSERmDirIn)(nil) var _ marshal.Marshallable = (*FUSESetAttrIn)(nil) var _ marshal.Marshallable = (*FUSEStatfsOut)(nil) var _ marshal.Marshallable = (*FUSESymlinkIn)(nil) var _ marshal.Marshallable = (*FUSEUnlinkIn)(nil) var _ marshal.Marshallable = (*FUSEWriteIn)(nil) var _ marshal.Marshallable = (*FUSEWriteOut)(nil) var _ marshal.Marshallable = (*FUSEWritePayloadIn)(nil) var _ marshal.Marshallable = (*FileMode)(nil) var _ marshal.Marshallable = (*Flock)(nil) var _ marshal.Marshallable = (*ICMP6Filter)(nil) var _ marshal.Marshallable = (*IFConf)(nil) var _ marshal.Marshallable = (*IFReq)(nil) var _ marshal.Marshallable = (*IOCallback)(nil) var _ marshal.Marshallable = (*IOCqRingOffsets)(nil) var _ marshal.Marshallable = (*IOEvent)(nil) var _ marshal.Marshallable = (*IORingIndex)(nil) var _ marshal.Marshallable = (*IORings)(nil) var _ marshal.Marshallable = (*IOSqRingOffsets)(nil) var _ marshal.Marshallable = (*IOUring)(nil) var _ 
marshal.Marshallable = (*IOUringCqe)(nil) var _ marshal.Marshallable = (*IOUringParams)(nil) var _ marshal.Marshallable = (*IOUringSqe)(nil) var _ marshal.Marshallable = (*IP6TEntry)(nil) var _ marshal.Marshallable = (*IP6TIP)(nil) var _ marshal.Marshallable = (*IP6TReplace)(nil) var _ marshal.Marshallable = (*IPCPerm)(nil) var _ marshal.Marshallable = (*IPTEntry)(nil) var _ marshal.Marshallable = (*IPTGetEntries)(nil) var _ marshal.Marshallable = (*IPTGetinfo)(nil) var _ marshal.Marshallable = (*IPTIP)(nil) var _ marshal.Marshallable = (*IPTOwnerInfo)(nil) var _ marshal.Marshallable = (*IPTReplace)(nil) var _ marshal.Marshallable = (*Inet6Addr)(nil) var _ marshal.Marshallable = (*Inet6MulticastRequest)(nil) var _ marshal.Marshallable = (*InetAddr)(nil) var _ marshal.Marshallable = (*InetMulticastRequest)(nil) var _ marshal.Marshallable = (*InetMulticastRequestWithNIC)(nil) var _ marshal.Marshallable = (*InterfaceAddrMessage)(nil) var _ marshal.Marshallable = (*InterfaceInfoMessage)(nil) var _ marshal.Marshallable = (*ItimerVal)(nil) var _ marshal.Marshallable = (*Itimerspec)(nil) var _ marshal.Marshallable = (*KernelIP6TEntry)(nil) var _ marshal.Marshallable = (*KernelIP6TGetEntries)(nil) var _ marshal.Marshallable = (*KernelIPTEntry)(nil) var _ marshal.Marshallable = (*KernelIPTGetEntries)(nil) var _ marshal.Marshallable = (*Linger)(nil) var _ marshal.Marshallable = (*MqAttr)(nil) var _ marshal.Marshallable = (*MsgBuf)(nil) var _ marshal.Marshallable = (*MsgInfo)(nil) var _ marshal.Marshallable = (*MsqidDS)(nil) var _ marshal.Marshallable = (*NFNATRange)(nil) var _ marshal.Marshallable = (*NFNATRange2)(nil) var _ marshal.Marshallable = (*NetlinkAttrHeader)(nil) var _ marshal.Marshallable = (*NetlinkErrorMessage)(nil) var _ marshal.Marshallable = (*NetlinkMessageHeader)(nil) var _ marshal.Marshallable = (*NfNATIPV4MultiRangeCompat)(nil) var _ marshal.Marshallable = (*NfNATIPV4Range)(nil) var _ marshal.Marshallable = (*NumaPolicy)(nil) var _ marshal.Marshallable = (*PollFD)(nil) var _ marshal.Marshallable = (*RSeqCriticalSection)(nil) var _ marshal.Marshallable = (*RobustListHead)(nil) var _ marshal.Marshallable = (*RouteMessage)(nil) var _ marshal.Marshallable = (*RtAttr)(nil) var _ marshal.Marshallable = (*Rusage)(nil) var _ marshal.Marshallable = (*SeccompData)(nil) var _ marshal.Marshallable = (*SeccompNotif)(nil) var _ marshal.Marshallable = (*SeccompNotifResp)(nil) var _ marshal.Marshallable = (*SeccompNotifSizes)(nil) var _ marshal.Marshallable = (*SemInfo)(nil) var _ marshal.Marshallable = (*Sembuf)(nil) var _ marshal.Marshallable = (*ShmInfo)(nil) var _ marshal.Marshallable = (*ShmParams)(nil) var _ marshal.Marshallable = (*ShmidDS)(nil) var _ marshal.Marshallable = (*SigAction)(nil) var _ marshal.Marshallable = (*Sigevent)(nil) var _ marshal.Marshallable = (*SignalInfo)(nil) var _ marshal.Marshallable = (*SignalSet)(nil) var _ marshal.Marshallable = (*SignalStack)(nil) var _ marshal.Marshallable = (*SignalfdSiginfo)(nil) var _ marshal.Marshallable = (*SockAddrInet)(nil) var _ marshal.Marshallable = (*SockAddrInet6)(nil) var _ marshal.Marshallable = (*SockAddrLink)(nil) var _ marshal.Marshallable = (*SockAddrNetlink)(nil) var _ marshal.Marshallable = (*SockAddrUnix)(nil) var _ marshal.Marshallable = (*SockErrCMsgIPv4)(nil) var _ marshal.Marshallable = (*SockErrCMsgIPv6)(nil) var _ marshal.Marshallable = (*SockExtendedErr)(nil) var _ marshal.Marshallable = (*Statfs)(nil) var _ marshal.Marshallable = (*Statx)(nil) var _ marshal.Marshallable = (*StatxTimestamp)(nil) var _ 
marshal.Marshallable = (*Sysinfo)(nil) var _ marshal.Marshallable = (*TCPInfo)(nil) var _ marshal.Marshallable = (*TableName)(nil) var _ marshal.Marshallable = (*Termios)(nil) var _ marshal.Marshallable = (*TimeT)(nil) var _ marshal.Marshallable = (*TimerID)(nil) var _ marshal.Marshallable = (*Timespec)(nil) var _ marshal.Marshallable = (*Timeval)(nil) var _ marshal.Marshallable = (*Tms)(nil) var _ marshal.Marshallable = (*Utime)(nil) var _ marshal.Marshallable = (*UtsName)(nil) var _ marshal.Marshallable = (*VFIODeviceInfo)(nil) var _ marshal.Marshallable = (*VFIOIommuType1DmaMap)(nil) var _ marshal.Marshallable = (*VFIOIommuType1DmaUnmap)(nil) var _ marshal.Marshallable = (*VFIOIrqInfo)(nil) var _ marshal.Marshallable = (*VFIOIrqSet)(nil) var _ marshal.Marshallable = (*VFIORegionInfo)(nil) var _ marshal.Marshallable = (*WindowSize)(nil) var _ marshal.Marshallable = (*Winsize)(nil) var _ marshal.Marshallable = (*XTCounters)(nil) var _ marshal.Marshallable = (*XTEntryMatch)(nil) var _ marshal.Marshallable = (*XTEntryTarget)(nil) var _ marshal.Marshallable = (*XTErrorTarget)(nil) var _ marshal.Marshallable = (*XTGetRevision)(nil) var _ marshal.Marshallable = (*XTNATTargetV0)(nil) var _ marshal.Marshallable = (*XTNATTargetV1)(nil) var _ marshal.Marshallable = (*XTNATTargetV2)(nil) var _ marshal.Marshallable = (*XTOwnerMatchInfo)(nil) var _ marshal.Marshallable = (*XTRedirectTarget)(nil) var _ marshal.Marshallable = (*XTStandardTarget)(nil) var _ marshal.Marshallable = (*XTTCP)(nil) var _ marshal.Marshallable = (*XTUDP)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IOCallback) SizeBytes() int { return 64 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IOCallback) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Data)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Key)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.OpCode)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.ReqPrio)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.FD)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Buf)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Bytes)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Offset)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Reserved2)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.ResFD)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
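// Illustrative sketch (hypothetical struct, not part of the generated code):
// the generated MarshalBytes methods all follow the same pattern -- write each
// field in declaration order with hostarch.ByteOrder, slice off the bytes
// just consumed, and account for padding explicitly so the output matches the
// C struct layout byte for byte. A hand-written equivalent for a small
// two-field struct:
type examplePair struct {
	ID    uint32
	_     uint32 // explicit padding keeps Value 8-byte aligned
	Value uint64
}

func (p *examplePair) marshalBytes(dst []byte) []byte {
	hostarch.ByteOrder.PutUint32(dst[:4], p.ID)
	dst = dst[4:]
	hostarch.ByteOrder.PutUint32(dst[:4], 0) // zero the padding bytes
	dst = dst[4:]
	hostarch.ByteOrder.PutUint64(dst[:8], p.Value)
	return dst[8:]
}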
func (i *IOCallback) UnmarshalBytes(src []byte) []byte { i.Data = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.Key = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] i.OpCode = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.ReqPrio = int16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.FD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Buf = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.Bytes = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.Offset = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.Reserved2 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.ResFD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IOCallback) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IOCallback) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IOCallback) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IOCallback) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IOCallback) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IOCallback) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IOCallback) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IOCallback) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. 
runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IOEvent) SizeBytes() int { return 32 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IOEvent) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Data)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Obj)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Result)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Result2)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IOEvent) UnmarshalBytes(src []byte) []byte { i.Data = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.Obj = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.Result = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.Result2 = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IOEvent) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IOEvent) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IOEvent) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IOEvent) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IOEvent) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IOEvent) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IOEvent) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IOEvent) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (b *BPFInstruction) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (b *BPFInstruction) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(b.OpCode)) dst = dst[2:] dst[0] = byte(b.JumpIfTrue) dst = dst[1:] dst[0] = byte(b.JumpIfFalse) dst = dst[1:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(b.K)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (b *BPFInstruction) UnmarshalBytes(src []byte) []byte { b.OpCode = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] b.JumpIfTrue = uint8(src[0]) src = src[1:] b.JumpIfFalse = uint8(src[0]) src = src[1:] b.K = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (b *BPFInstruction) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (b *BPFInstruction) MarshalUnsafe(dst []byte) []byte { size := b.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(b), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (b *BPFInstruction) UnmarshalUnsafe(src []byte) []byte { size := b.SizeBytes() gohacks.Memmove(unsafe.Pointer(b), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (b *BPFInstruction) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(b))) hdr.Len = b.SizeBytes() hdr.Cap = b.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that b // must live until the use above. runtime.KeepAlive(b) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (b *BPFInstruction) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return b.CopyOutN(cc, addr, b.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (b *BPFInstruction) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(b))) hdr.Len = b.SizeBytes() hdr.Cap = b.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that b // must live until the use above. runtime.KeepAlive(b) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (b *BPFInstruction) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return b.CopyInN(cc, addr, b.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. 
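// Illustrative sketch (hypothetical helper, not part of the generated code):
// Packed reports whether a type has no implicit padding holes, which is what
// makes the memmove-based MarshalUnsafe/UnmarshalUnsafe above safe. Generic
// callers typically branch on it and fall back to the field-by-field encoder
// when a type is not packed:
func marshalAny(m marshal.Marshallable, dst []byte) []byte {
	if m.Packed() {
		// Safe to copy the in-memory representation directly.
		return m.MarshalUnsafe(dst)
	}
	// Not packed: serialize field by field to get the exact ABI layout.
	return m.MarshalBytes(dst)
}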
func (b *BPFInstruction) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(b))) hdr.Len = b.SizeBytes() hdr.Cap = b.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that b // must live until the use above. runtime.KeepAlive(b) // escapes: replaced by intrinsic. return int64(length), err } // CopyBPFInstructionSliceIn copies in a slice of BPFInstruction objects from the task's memory. func CopyBPFInstructionSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []BPFInstruction) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*BPFInstruction)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyBPFInstructionSliceOut copies a slice of BPFInstruction objects to the task's memory. func CopyBPFInstructionSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []BPFInstruction) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*BPFInstruction)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeBPFInstructionSlice is like BPFInstruction.MarshalUnsafe, but for a []BPFInstruction. func MarshalUnsafeBPFInstructionSlice(src []BPFInstruction, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*BPFInstruction)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeBPFInstructionSlice is like BPFInstruction.UnmarshalUnsafe, but for a []BPFInstruction. func UnmarshalUnsafeBPFInstructionSlice(dst []BPFInstruction, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*BPFInstruction)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (c *CapUserData) SizeBytes() int { return 12 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *CapUserData) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.Effective)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.Permitted)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.Inheritable)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
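// Illustrative sketch (invented function name and instruction values, not
// part of the generated code): the *Slice helpers above marshal a whole
// []BPFInstruction program at once. Serializing a two-instruction classic BPF
// filter into a caller-provided buffer looks like this:
func encodeTinyFilter() []byte {
	prog := []BPFInstruction{
		{OpCode: 0x20, K: 0},          // BPF_LD|BPF_W|BPF_ABS: load the word at offset 0.
		{OpCode: 0x06, K: 0x7fff0000}, // BPF_RET|BPF_K: return a constant.
	}
	buf := make([]byte, len(prog)*(*BPFInstruction)(nil).SizeBytes())
	MarshalUnsafeBPFInstructionSlice(prog, buf)
	return buf
}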
func (c *CapUserData) UnmarshalBytes(src []byte) []byte { c.Effective = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] c.Permitted = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] c.Inheritable = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (c *CapUserData) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (c *CapUserData) MarshalUnsafe(dst []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(c), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (c *CapUserData) UnmarshalUnsafe(src []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(c), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (c *CapUserData) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (c *CapUserData) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyOutN(cc, addr, c.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (c *CapUserData) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (c *CapUserData) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyInN(cc, addr, c.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (c *CapUserData) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return int64(length), err } // CopyCapUserDataSliceIn copies in a slice of CapUserData objects from the task's memory. func CopyCapUserDataSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []CapUserData) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*CapUserData)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyCapUserDataSliceOut copies a slice of CapUserData objects to the task's memory. func CopyCapUserDataSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []CapUserData) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*CapUserData)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeCapUserDataSlice is like CapUserData.MarshalUnsafe, but for a []CapUserData. func MarshalUnsafeCapUserDataSlice(src []CapUserData, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*CapUserData)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeCapUserDataSlice is like CapUserData.UnmarshalUnsafe, but for a []CapUserData. func UnmarshalUnsafeCapUserDataSlice(dst []CapUserData, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*CapUserData)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (c *CapUserHeader) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *CapUserHeader) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.Version)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.Pid)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (c *CapUserHeader) UnmarshalBytes(src []byte) []byte { c.Version = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] c.Pid = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (c *CapUserHeader) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (c *CapUserHeader) MarshalUnsafe(dst []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(c), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (c *CapUserHeader) UnmarshalUnsafe(src []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(c), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (c *CapUserHeader) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (c *CapUserHeader) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyOutN(cc, addr, c.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (c *CapUserHeader) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (c *CapUserHeader) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyInN(cc, addr, c.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (c *CapUserHeader) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (c *CloneArgs) SizeBytes() int { return 88 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *CloneArgs) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.Flags)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.Pidfd)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.ChildTID)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.ParentTID)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.ExitSignal)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.Stack)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.StackSize)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.TLS)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.SetTID)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.SetTIDSize)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.Cgroup)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
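// Illustrative sketch (hypothetical function; the signal number is standard
// Linux SIGCHLD): CloneArgs mirrors the 88-byte struct clone_args passed to
// clone3(2), and MarshalBytes above lays out its eleven uint64 fields in
// order. Building the argument block for a simple fork-like clone3 request:
func encodeCloneArgs() []byte {
	args := CloneArgs{
		ExitSignal: 17, // SIGCHLD on Linux, delivered to the parent on exit.
	}
	buf := make([]byte, args.SizeBytes())
	args.MarshalBytes(buf)
	return buf
}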
func (c *CloneArgs) UnmarshalBytes(src []byte) []byte { c.Flags = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] c.Pidfd = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] c.ChildTID = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] c.ParentTID = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] c.ExitSignal = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] c.Stack = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] c.StackSize = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] c.TLS = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] c.SetTID = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] c.SetTIDSize = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] c.Cgroup = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (c *CloneArgs) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (c *CloneArgs) MarshalUnsafe(dst []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(c), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (c *CloneArgs) UnmarshalUnsafe(src []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(c), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (c *CloneArgs) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (c *CloneArgs) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyOutN(cc, addr, c.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (c *CloneArgs) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (c *CloneArgs) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyInN(cc, addr, c.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (c *CloneArgs) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. 
runtime.KeepAlive(c) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (e *ElfHeader64) SizeBytes() int { return 48 + 1*16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (e *ElfHeader64) MarshalBytes(dst []byte) []byte { for idx := 0; idx < 16; idx++ { dst[0] = byte(e.Ident[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint16(dst[:2], uint16(e.Type)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(e.Machine)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Version)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Entry)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Phoff)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Shoff)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(e.Ehsize)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(e.Phentsize)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(e.Phnum)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(e.Shentsize)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(e.Shnum)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(e.Shstrndx)) dst = dst[2:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (e *ElfHeader64) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < 16; idx++ { e.Ident[idx] = src[0] src = src[1:] } e.Type = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] e.Machine = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] e.Version = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] e.Entry = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Phoff = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Shoff = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] e.Ehsize = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] e.Phentsize = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] e.Phnum = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] e.Shentsize = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] e.Shnum = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] e.Shstrndx = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (e *ElfHeader64) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (e *ElfHeader64) MarshalUnsafe(dst []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(e), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (e *ElfHeader64) UnmarshalUnsafe(src []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(e), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (e *ElfHeader64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (e *ElfHeader64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyOutN(cc, addr, e.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (e *ElfHeader64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (e *ElfHeader64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyInN(cc, addr, e.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (e *ElfHeader64) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (e *ElfProg64) SizeBytes() int { return 56 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (e *ElfProg64) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Type)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Off)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Vaddr)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Paddr)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Filesz)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Memsz)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Align)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (e *ElfProg64) UnmarshalBytes(src []byte) []byte { e.Type = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] e.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] e.Off = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Vaddr = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Paddr = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Filesz = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Memsz = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Align = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (e *ElfProg64) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (e *ElfProg64) MarshalUnsafe(dst []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(e), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (e *ElfProg64) UnmarshalUnsafe(src []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(e), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (e *ElfProg64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (e *ElfProg64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyOutN(cc, addr, e.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (e *ElfProg64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (e *ElfProg64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyInN(cc, addr, e.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (e *ElfProg64) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (e *ElfSection64) SizeBytes() int { return 64 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (e *ElfSection64) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Name)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Type)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Flags)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Addr)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Off)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Link)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Info)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Addralign)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(e.Entsize)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (e *ElfSection64) UnmarshalBytes(src []byte) []byte { e.Name = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] e.Type = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] e.Flags = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Addr = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Off = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Link = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] e.Info = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] e.Addralign = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] e.Entsize = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (e *ElfSection64) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (e *ElfSection64) MarshalUnsafe(dst []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(e), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (e *ElfSection64) UnmarshalUnsafe(src []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(e), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (e *ElfSection64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (e *ElfSection64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyOutN(cc, addr, e.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (e *ElfSection64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. 
runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (e *ElfSection64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyInN(cc, addr, e.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (e *ElfSection64) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SockErrCMsgIPv4) SizeBytes() int { return 0 + (*SockExtendedErr)(nil).SizeBytes() + (*SockAddrInet)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SockErrCMsgIPv4) MarshalBytes(dst []byte) []byte { dst = s.SockExtendedErr.MarshalUnsafe(dst) dst = s.Offender.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SockErrCMsgIPv4) UnmarshalBytes(src []byte) []byte { src = s.SockExtendedErr.UnmarshalUnsafe(src) src = s.Offender.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SockErrCMsgIPv4) Packed() bool { return s.Offender.Packed() && s.SockExtendedErr.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SockErrCMsgIPv4) MarshalUnsafe(dst []byte) []byte { if s.Offender.Packed() && s.SockExtendedErr.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type SockErrCMsgIPv4 doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SockErrCMsgIPv4) UnmarshalUnsafe(src []byte) []byte { if s.Offender.Packed() && s.SockExtendedErr.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type SockErrCMsgIPv4 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SockErrCMsgIPv4) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Offender.Packed() && s.SockExtendedErr.Packed() { // Type SockErrCMsgIPv4 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (s *SockErrCMsgIPv4) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SockErrCMsgIPv4) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Offender.Packed() && s.SockExtendedErr.Packed() { // Type SockErrCMsgIPv4 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SockErrCMsgIPv4) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SockErrCMsgIPv4) WriteTo(writer io.Writer) (int64, error) { if !s.Offender.Packed() && s.SockExtendedErr.Packed() { // Type SockErrCMsgIPv4 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SockErrCMsgIPv6) SizeBytes() int { return 0 + (*SockExtendedErr)(nil).SizeBytes() + (*SockAddrInet6)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SockErrCMsgIPv6) MarshalBytes(dst []byte) []byte { dst = s.SockExtendedErr.MarshalUnsafe(dst) dst = s.Offender.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SockErrCMsgIPv6) UnmarshalBytes(src []byte) []byte { src = s.SockExtendedErr.UnmarshalUnsafe(src) src = s.Offender.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SockErrCMsgIPv6) Packed() bool { return s.Offender.Packed() && s.SockExtendedErr.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SockErrCMsgIPv6) MarshalUnsafe(dst []byte) []byte { if s.Offender.Packed() && s.SockExtendedErr.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type SockErrCMsgIPv6 doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (s *SockErrCMsgIPv6) UnmarshalUnsafe(src []byte) []byte { if s.Offender.Packed() && s.SockExtendedErr.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type SockErrCMsgIPv6 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SockErrCMsgIPv6) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Offender.Packed() && s.SockExtendedErr.Packed() { // Type SockErrCMsgIPv6 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SockErrCMsgIPv6) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SockErrCMsgIPv6) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Offender.Packed() && s.SockExtendedErr.Packed() { // Type SockErrCMsgIPv6 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SockErrCMsgIPv6) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SockErrCMsgIPv6) WriteTo(writer io.Writer) (int64, error) { if !s.Offender.Packed() && s.SockExtendedErr.Packed() { // Type SockErrCMsgIPv6 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. 
return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SockExtendedErr) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SockExtendedErr) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Errno)) dst = dst[4:] dst[0] = byte(s.Origin) dst = dst[1:] dst[0] = byte(s.Type) dst = dst[1:] dst[0] = byte(s.Code) dst = dst[1:] dst[0] = byte(s.Pad) dst = dst[1:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Info)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Data)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SockExtendedErr) UnmarshalBytes(src []byte) []byte { s.Errno = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Origin = uint8(src[0]) src = src[1:] s.Type = uint8(src[0]) src = src[1:] s.Code = uint8(src[0]) src = src[1:] s.Pad = uint8(src[0]) src = src[1:] s.Info = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Data = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SockExtendedErr) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SockExtendedErr) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SockExtendedErr) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SockExtendedErr) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SockExtendedErr) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SockExtendedErr) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SockExtendedErr) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SockExtendedErr) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
	var buf []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))
	hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s)))
	hdr.Len = s.SizeBytes()
	hdr.Cap = s.SizeBytes()
	length, err := writer.Write(buf)
	// Since we bypassed the compiler's escape analysis, indicate that s
	// must live until the use above.
	runtime.KeepAlive(s) // escapes: replaced by intrinsic.
	return int64(length), err
}

// SizeBytes implements marshal.Marshallable.SizeBytes.
func (f *FOwnerEx) SizeBytes() int {
	return 8
}

// MarshalBytes implements marshal.Marshallable.MarshalBytes.
func (f *FOwnerEx) MarshalBytes(dst []byte) []byte {
	hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Type))
	dst = dst[4:]
	hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.PID))
	dst = dst[4:]
	return dst
}

// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
func (f *FOwnerEx) UnmarshalBytes(src []byte) []byte {
	f.Type = int32(hostarch.ByteOrder.Uint32(src[:4]))
	src = src[4:]
	f.PID = int32(hostarch.ByteOrder.Uint32(src[:4]))
	src = src[4:]
	return src
}

// Packed implements marshal.Marshallable.Packed.
//go:nosplit
func (f *FOwnerEx) Packed() bool {
	return true
}

// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
func (f *FOwnerEx) MarshalUnsafe(dst []byte) []byte {
	size := f.SizeBytes()
	gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size))
	return dst[size:]
}

// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
func (f *FOwnerEx) UnmarshalUnsafe(src []byte) []byte {
	size := f.SizeBytes()
	gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size))
	return src[size:]
}

// CopyOutN implements marshal.Marshallable.CopyOutN.
func (f *FOwnerEx) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) {
	// Construct a slice backed by dst's underlying memory.
	var buf []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))
	hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f)))
	hdr.Len = f.SizeBytes()
	hdr.Cap = f.SizeBytes()
	length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay.
	// Since we bypassed the compiler's escape analysis, indicate that f
	// must live until the use above.
	runtime.KeepAlive(f) // escapes: replaced by intrinsic.
	return length, err
}

// CopyOut implements marshal.Marshallable.CopyOut.
func (f *FOwnerEx) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {
	return f.CopyOutN(cc, addr, f.SizeBytes())
}

// CopyInN implements marshal.Marshallable.CopyInN.
func (f *FOwnerEx) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) {
	// Construct a slice backed by dst's underlying memory.
	var buf []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))
	hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f)))
	hdr.Len = f.SizeBytes()
	hdr.Cap = f.SizeBytes()
	length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay.
	// Since we bypassed the compiler's escape analysis, indicate that f
	// must live until the use above.
	runtime.KeepAlive(f) // escapes: replaced by intrinsic.
	return length, err
}

// CopyIn implements marshal.Marshallable.CopyIn.
func (f *FOwnerEx) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {
	return f.CopyInN(cc, addr, f.SizeBytes())
}

// WriteTo implements io.WriterTo.WriteTo.
func (f *FOwnerEx) WriteTo(writer io.Writer) (int64, error) {
	// Construct a slice backed by dst's underlying memory.
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *Flock) SizeBytes() int { return 24 + 1*4 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *Flock) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(f.Type)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(f.Whence)) dst = dst[2:] // Padding: dst[:sizeof(byte)*4] ~= [4]byte{0} dst = dst[1*(4):] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Start)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Len)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.PID)) dst = dst[4:] // Padding: dst[:sizeof(byte)*4] ~= [4]byte{0} dst = dst[1*(4):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *Flock) UnmarshalBytes(src []byte) []byte { f.Type = int16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] f.Whence = int16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] // Padding: ~ copy([4]byte(f._), src[:sizeof(byte)*4]) src = src[1*(4):] f.Start = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Len = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.PID = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: ~ copy([4]byte(f._), src[:sizeof(byte)*4]) src = src[1*(4):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *Flock) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *Flock) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *Flock) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *Flock) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *Flock) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *Flock) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
	// Since we bypassed the compiler's escape analysis, indicate that f
	// must live until the use above.
	runtime.KeepAlive(f) // escapes: replaced by intrinsic.
	return length, err
}

// CopyIn implements marshal.Marshallable.CopyIn.
func (f *Flock) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {
	return f.CopyInN(cc, addr, f.SizeBytes())
}

// WriteTo implements io.WriterTo.WriteTo.
func (f *Flock) WriteTo(writer io.Writer) (int64, error) {
	// Construct a slice backed by dst's underlying memory.
	var buf []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))
	hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f)))
	hdr.Len = f.SizeBytes()
	hdr.Cap = f.SizeBytes()
	length, err := writer.Write(buf)
	// Since we bypassed the compiler's escape analysis, indicate that f
	// must live until the use above.
	runtime.KeepAlive(f) // escapes: replaced by intrinsic.
	return int64(length), err
}

// SizeBytes implements marshal.Marshallable.SizeBytes.
//go:nosplit
func (m *FileMode) SizeBytes() int {
	return 2
}

// MarshalBytes implements marshal.Marshallable.MarshalBytes.
func (m *FileMode) MarshalBytes(dst []byte) []byte {
	hostarch.ByteOrder.PutUint16(dst[:2], uint16(*m))
	return dst[2:]
}

// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
func (m *FileMode) UnmarshalBytes(src []byte) []byte {
	*m = FileMode(uint16(hostarch.ByteOrder.Uint16(src[:2])))
	return src[2:]
}

// Packed implements marshal.Marshallable.Packed.
//go:nosplit
func (m *FileMode) Packed() bool {
	// Scalar newtypes are always packed.
	return true
}

// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
func (m *FileMode) MarshalUnsafe(dst []byte) []byte {
	size := m.SizeBytes()
	gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(m), uintptr(size))
	return dst[size:]
}

// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
func (m *FileMode) UnmarshalUnsafe(src []byte) []byte {
	size := m.SizeBytes()
	gohacks.Memmove(unsafe.Pointer(m), unsafe.Pointer(&src[0]), uintptr(size))
	return src[size:]
}

// CopyOutN implements marshal.Marshallable.CopyOutN.
func (m *FileMode) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) {
	// Construct a slice backed by dst's underlying memory.
	var buf []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))
	hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m)))
	hdr.Len = m.SizeBytes()
	hdr.Cap = m.SizeBytes()
	length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay.
	// Since we bypassed the compiler's escape analysis, indicate that m
	// must live until the use above.
	runtime.KeepAlive(m) // escapes: replaced by intrinsic.
	return length, err
}

// CopyOut implements marshal.Marshallable.CopyOut.
func (m *FileMode) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {
	return m.CopyOutN(cc, addr, m.SizeBytes())
}

// CopyInN implements marshal.Marshallable.CopyInN.
func (m *FileMode) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) {
	// Construct a slice backed by dst's underlying memory.
	var buf []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))
	hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m)))
	hdr.Len = m.SizeBytes()
	hdr.Cap = m.SizeBytes()
	length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay.
	// Since we bypassed the compiler's escape analysis, indicate that m
	// must live until the use above.
	runtime.KeepAlive(m) // escapes: replaced by intrinsic.
	return length, err
}

// CopyIn implements marshal.Marshallable.CopyIn.
func (m *FileMode) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyInN(cc, addr, m.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (m *FileMode) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *Statx) SizeBytes() int { return 80 + (*StatxTimestamp)(nil).SizeBytes() + (*StatxTimestamp)(nil).SizeBytes() + (*StatxTimestamp)(nil).SizeBytes() + (*StatxTimestamp)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *Statx) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Mask)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Blksize)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Attributes)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Nlink)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.UID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.GID)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Mode)) dst = dst[2:] // Padding: dst[:sizeof(uint16)] ~= uint16(0) dst = dst[2:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Ino)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Blocks)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.AttributesMask)) dst = dst[8:] dst = s.Atime.MarshalUnsafe(dst) dst = s.Btime.MarshalUnsafe(dst) dst = s.Ctime.MarshalUnsafe(dst) dst = s.Mtime.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.RdevMajor)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.RdevMinor)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.DevMajor)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.DevMinor)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (s *Statx) UnmarshalBytes(src []byte) []byte { s.Mask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Blksize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Attributes = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Nlink = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.UID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.GID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Mode = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] // Padding: var _ uint16 ~= src[:sizeof(uint16)] src = src[2:] s.Ino = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Blocks = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.AttributesMask = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = s.Atime.UnmarshalUnsafe(src) src = s.Btime.UnmarshalUnsafe(src) src = s.Ctime.UnmarshalUnsafe(src) src = s.Mtime.UnmarshalUnsafe(src) s.RdevMajor = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.RdevMinor = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.DevMajor = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.DevMinor = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *Statx) Packed() bool { return s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *Statx) MarshalUnsafe(dst []byte) []byte { if s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type Statx doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *Statx) UnmarshalUnsafe(src []byte) []byte { if s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type Statx doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *Statx) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { // Type Statx doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *Statx) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (s *Statx) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { // Type Statx doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *Statx) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *Statx) WriteTo(writer io.Writer) (int64, error) { if !s.Atime.Packed() && s.Btime.Packed() && s.Ctime.Packed() && s.Mtime.Packed() { // Type Statx doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (s *Statx) CheckedMarshal(dst []byte) ([]byte, bool) { if s.SizeBytes() > len(dst) { return dst, false } return s.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (s *Statx) CheckedUnmarshal(src []byte) ([]byte, bool) { if s.SizeBytes() > len(src) { return src, false } return s.UnmarshalUnsafe(src), true } // CopyStatxSliceIn copies in a slice of Statx objects from the task's memory. func CopyStatxSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []Statx) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Statx)(nil).SizeBytes() if !dst[0].Packed() { // Type Statx doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(size * count) length, err := cc.CopyInBytes(addr, buf) // Unmarshal as much as possible, even on error. First handle full objects. limit := length/size for idx := 0; idx < limit; idx++ { buf = dst[idx].UnmarshalBytes(buf) } // Handle any final partial object. buf is guaranteed to be long enough for the // final element, but may not contain valid data for the entire range. This may // result in unmarshalling zero values for some parts of the object. 
		if length%size != 0 {
			dst[limit].UnmarshalBytes(buf)
		}
		return length, err
	}

	ptr := unsafe.Pointer(&dst)
	val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data))
	// Construct a slice backed by dst's underlying memory.
	var buf []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))
	hdr.Data = uintptr(val)
	hdr.Len = size * count
	hdr.Cap = size * count
	length, err := cc.CopyInBytes(addr, buf)
	// Since we bypassed the compiler's escape analysis, indicate that dst
	// must live until the use above.
	runtime.KeepAlive(dst) // escapes: replaced by intrinsic.
	return length, err
}

// CopyStatxSliceOut copies a slice of Statx objects to the task's memory.
func CopyStatxSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []Statx) (int, error) {
	count := len(src)
	if count == 0 {
		return 0, nil
	}
	size := (*Statx)(nil).SizeBytes()

	if !src[0].Packed() {
		// Type Statx doesn't have a packed layout in memory, fall back to MarshalBytes.
		buf := cc.CopyScratchBuffer(size * count)
		curBuf := buf
		for idx := 0; idx < count; idx++ {
			curBuf = src[idx].MarshalBytes(curBuf)
		}
		return cc.CopyOutBytes(addr, buf)
	}

	ptr := unsafe.Pointer(&src)
	val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data))
	// Construct a slice backed by dst's underlying memory.
	var buf []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))
	hdr.Data = uintptr(val)
	hdr.Len = size * count
	hdr.Cap = size * count
	length, err := cc.CopyOutBytes(addr, buf)
	// Since we bypassed the compiler's escape analysis, indicate that src
	// must live until the use above.
	runtime.KeepAlive(src) // escapes: replaced by intrinsic.
	return length, err
}

// MarshalUnsafeStatxSlice is like Statx.MarshalUnsafe, but for a []Statx.
func MarshalUnsafeStatxSlice(src []Statx, dst []byte) []byte {
	count := len(src)
	if count == 0 {
		return dst
	}

	if !src[0].Packed() {
		// Type Statx doesn't have a packed layout in memory, fall back to MarshalBytes.
		for idx := 0; idx < count; idx++ {
			dst = src[idx].MarshalBytes(dst)
		}
		return dst
	}

	size := (*Statx)(nil).SizeBytes()
	buf := dst[:size*count]
	gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf)))
	return dst[size*count:]
}

// UnmarshalUnsafeStatxSlice is like Statx.UnmarshalUnsafe, but for a []Statx.
func UnmarshalUnsafeStatxSlice(dst []Statx, src []byte) []byte {
	count := len(dst)
	if count == 0 {
		return src
	}

	if !dst[0].Packed() {
		// Type Statx doesn't have a packed layout in memory, fall back to UnmarshalBytes.
		for idx := 0; idx < count; idx++ {
			src = dst[idx].UnmarshalBytes(src)
		}
		return src
	}

	size := (*Statx)(nil).SizeBytes()
	buf := src[:size*count]
	gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf)))
	return src[size*count:]
}

// SizeBytes implements marshal.Marshallable.SizeBytes.
func (s *Statfs) SizeBytes() int {
	return 80 + 4*2 + 8*4
}

// MarshalBytes implements marshal.Marshallable.MarshalBytes.
func (s *Statfs) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Type)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.BlockSize)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Blocks)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.BlocksFree)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.BlocksAvailable)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Files)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.FilesFree)) dst = dst[8:] for idx := 0; idx < 2; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.FSID[idx])) dst = dst[4:] } hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.NameLength)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.FragmentSize)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Flags)) dst = dst[8:] for idx := 0; idx < 4; idx++ { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Spare[idx])) dst = dst[8:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *Statfs) UnmarshalBytes(src []byte) []byte { s.Type = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.BlockSize = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Blocks = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.BlocksFree = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.BlocksAvailable = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Files = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.FilesFree = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] for idx := 0; idx < 2; idx++ { s.FSID[idx] = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } s.NameLength = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.FragmentSize = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Flags = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] for idx := 0; idx < 4; idx++ { s.Spare[idx] = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *Statfs) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *Statfs) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *Statfs) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *Statfs) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *Statfs) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (s *Statfs) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *Statfs) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *Statfs) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *CString) Packed() bool { // Type CString is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *CString) MarshalUnsafe(dst []byte) []byte { // Type CString doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *CString) UnmarshalUnsafe(src []byte) []byte { // Type CString doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (s *CString) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type CString doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (s *CString) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (s *CString) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type CString doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *CString) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *CString) WriteTo(writer io.Writer) (int64, error) { // Type CString doesn't have a packed layout in memory, fall back to MarshalBytes. 
buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEAccessIn) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEAccessIn) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Mask)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEAccessIn) UnmarshalBytes(src []byte) []byte { f.Mask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEAccessIn) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEAccessIn) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEAccessIn) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEAccessIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEAccessIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEAccessIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEAccessIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEAccessIn) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. 
return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (a *FUSEAttr) SizeBytes() int { return 88 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (a *FUSEAttr) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(a.Ino)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(a.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(a.Blocks)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(a.Atime)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(a.Mtime)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(a.Ctime)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(a.AtimeNsec)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(a.MtimeNsec)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(a.CtimeNsec)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(a.Mode)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(a.Nlink)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(a.UID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(a.GID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(a.Rdev)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(a.BlkSize)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (a *FUSEAttr) UnmarshalBytes(src []byte) []byte { a.Ino = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] a.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] a.Blocks = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] a.Atime = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] a.Mtime = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] a.Ctime = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] a.AtimeNsec = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] a.MtimeNsec = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] a.CtimeNsec = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] a.Mode = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] a.Nlink = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] a.UID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] a.GID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] a.Rdev = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] a.BlkSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (a *FUSEAttr) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (a *FUSEAttr) MarshalUnsafe(dst []byte) []byte { size := a.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(a), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (a *FUSEAttr) UnmarshalUnsafe(src []byte) []byte { size := a.SizeBytes() gohacks.Memmove(unsafe.Pointer(a), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (a *FUSEAttr) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(a))) hdr.Len = a.SizeBytes() hdr.Cap = a.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that a // must live until the use above. runtime.KeepAlive(a) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (a *FUSEAttr) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return a.CopyOutN(cc, addr, a.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (a *FUSEAttr) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(a))) hdr.Len = a.SizeBytes() hdr.Cap = a.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that a // must live until the use above. runtime.KeepAlive(a) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (a *FUSEAttr) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return a.CopyInN(cc, addr, a.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (a *FUSEAttr) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(a))) hdr.Len = a.SizeBytes() hdr.Cap = a.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that a // must live until the use above. runtime.KeepAlive(a) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEAttrOut) SizeBytes() int { return 16 + (*FUSEAttr)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEAttrOut) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.AttrValid)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.AttrValidNsec)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] dst = f.Attr.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEAttrOut) UnmarshalBytes(src []byte) []byte { f.AttrValid = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.AttrValidNsec = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] src = f.Attr.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEAttrOut) Packed() bool { return f.Attr.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEAttrOut) MarshalUnsafe(dst []byte) []byte { if f.Attr.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // Type FUSEAttrOut doesn't have a packed layout in memory, fallback to MarshalBytes. return f.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (f *FUSEAttrOut) UnmarshalUnsafe(src []byte) []byte { if f.Attr.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type FUSEAttrOut doesn't have a packed layout in memory, fallback to UnmarshalBytes. return f.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEAttrOut) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.Attr.Packed() { // Type FUSEAttrOut doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. f.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEAttrOut) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEAttrOut) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.Attr.Packed() { // Type FUSEAttrOut doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. f.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEAttrOut) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEAttrOut) WriteTo(writer io.Writer) (int64, error) { if !f.Attr.Packed() { // Type FUSEAttrOut doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, f.SizeBytes()) f.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. 
//go:nosplit func (r *FUSECreateIn) Packed() bool { // Type FUSECreateIn is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *FUSECreateIn) MarshalUnsafe(dst []byte) []byte { // Type FUSECreateIn doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSECreateIn) UnmarshalUnsafe(src []byte) []byte { // Type FUSECreateIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *FUSECreateIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSECreateIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSECreateIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (r *FUSECreateIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSECreateIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *FUSECreateIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSECreateIn) WriteTo(writer io.Writer) (int64, error) { // Type FUSECreateIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSECreateMeta) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSECreateMeta) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Mode)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Umask)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSECreateMeta) UnmarshalBytes(src []byte) []byte { f.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Mode = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Umask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSECreateMeta) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
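
// Editor's illustrative sketch, not part of the generated file: the generated
// MarshalUnsafe bodies in this file all apply the same rule, written out here
// once for any marshal.Marshallable value. Packed types are copied as a
// single block of memory; dynamic types such as FUSECreateIn (Packed() ==
// false above) must take the field-by-field MarshalBytes path. The helper
// name is hypothetical.
func exampleMarshalAny(m marshal.Marshallable) []byte {
	buf := make([]byte, m.SizeBytes())
	if m.Packed() {
		m.MarshalUnsafe(buf) // one memmove of the in-memory representation.
	} else {
		m.MarshalBytes(buf) // field-by-field encoding, as FUSECreateIn requires.
	}
	return buf
}
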
func (f *FUSECreateMeta) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSECreateMeta) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSECreateMeta) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSECreateMeta) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSECreateMeta) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSECreateMeta) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSECreateMeta) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSECreateOut) SizeBytes() int { return 0 + (*FUSEEntryOut)(nil).SizeBytes() + (*FUSEOpenOut)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSECreateOut) MarshalBytes(dst []byte) []byte { dst = f.FUSEEntryOut.MarshalUnsafe(dst) dst = f.FUSEOpenOut.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSECreateOut) UnmarshalBytes(src []byte) []byte { src = f.FUSEEntryOut.UnmarshalUnsafe(src) src = f.FUSEOpenOut.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSECreateOut) Packed() bool { return f.FUSEEntryOut.Packed() && f.FUSEOpenOut.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (f *FUSECreateOut) MarshalUnsafe(dst []byte) []byte { if f.FUSEEntryOut.Packed() && f.FUSEOpenOut.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // Type FUSECreateOut doesn't have a packed layout in memory, fallback to MarshalBytes. return f.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSECreateOut) UnmarshalUnsafe(src []byte) []byte { if f.FUSEEntryOut.Packed() && f.FUSEOpenOut.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type FUSECreateOut doesn't have a packed layout in memory, fallback to UnmarshalBytes. return f.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSECreateOut) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.FUSEEntryOut.Packed() && f.FUSEOpenOut.Packed() { // Type FUSECreateOut doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. f.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSECreateOut) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSECreateOut) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.FUSEEntryOut.Packed() && f.FUSEOpenOut.Packed() { // Type FUSECreateOut doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. f.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSECreateOut) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSECreateOut) WriteTo(writer io.Writer) (int64, error) { if !f.FUSEEntryOut.Packed() && f.FUSEOpenOut.Packed() { // Type FUSECreateOut doesn't have a packed layout in memory, fall back to MarshalBytes. 
buf := make([]byte, f.SizeBytes()) f.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *FUSEDirent) Packed() bool { // Type FUSEDirent is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *FUSEDirent) MarshalUnsafe(dst []byte) []byte { // Type FUSEDirent doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSEDirent) UnmarshalUnsafe(src []byte) []byte { // Type FUSEDirent doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *FUSEDirent) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEDirent doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSEDirent) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (r *FUSEDirent) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEDirent doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *FUSEDirent) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSEDirent) WriteTo(writer io.Writer) (int64, error) { // Type FUSEDirent doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEDirentMeta) SizeBytes() int { return 24 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEDirentMeta) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Ino)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Off)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.NameLen)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Type)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (f *FUSEDirentMeta) UnmarshalBytes(src []byte) []byte { f.Ino = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Off = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.NameLen = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Type = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEDirentMeta) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEDirentMeta) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEDirentMeta) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEDirentMeta) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEDirentMeta) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEDirentMeta) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEDirentMeta) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEDirentMeta) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *FUSEDirents) Packed() bool { // Type FUSEDirents is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
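
// Editor's illustrative sketch, not part of the generated file: the 24-byte
// FUSEDirentMeta wire layout produced by MarshalBytes above, decoded by hand
// with hostarch.ByteOrder to show where each field lands. The offsets follow
// directly from the Put* calls above; the function name is hypothetical.
func exampleDirentMetaLayout(m *FUSEDirentMeta) (ino, off uint64, nameLen, typ uint32) {
	buf := make([]byte, m.SizeBytes()) // 24 bytes.
	m.MarshalBytes(buf)
	ino = hostarch.ByteOrder.Uint64(buf[0:8])       // Ino
	off = hostarch.ByteOrder.Uint64(buf[8:16])      // Off
	nameLen = hostarch.ByteOrder.Uint32(buf[16:20]) // NameLen
	typ = hostarch.ByteOrder.Uint32(buf[20:24])     // Type
	return ino, off, nameLen, typ
}
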
func (r *FUSEDirents) MarshalUnsafe(dst []byte) []byte { // Type FUSEDirents doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSEDirents) UnmarshalUnsafe(src []byte) []byte { // Type FUSEDirents doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *FUSEDirents) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEDirents doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSEDirents) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (r *FUSEDirents) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEDirents doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *FUSEDirents) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSEDirents) WriteTo(writer io.Writer) (int64, error) { // Type FUSEDirents doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *FUSEEmptyIn) Packed() bool { // Type FUSEEmptyIn is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *FUSEEmptyIn) MarshalUnsafe(dst []byte) []byte { // Type FUSEEmptyIn doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSEEmptyIn) UnmarshalUnsafe(src []byte) []byte { // Type FUSEEmptyIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *FUSEEmptyIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEEmptyIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSEEmptyIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
//go:nosplit func (r *FUSEEmptyIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEEmptyIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *FUSEEmptyIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSEEmptyIn) WriteTo(writer io.Writer) (int64, error) { // Type FUSEEmptyIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEEntryOut) SizeBytes() int { return 40 + (*FUSEAttr)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEEntryOut) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.NodeID)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Generation)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.EntryValid)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.AttrValid)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.EntryValidNSec)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.AttrValidNSec)) dst = dst[4:] dst = f.Attr.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEEntryOut) UnmarshalBytes(src []byte) []byte { f.NodeID = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Generation = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.EntryValid = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.AttrValid = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.EntryValidNSec = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.AttrValidNSec = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = f.Attr.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEEntryOut) Packed() bool { return f.Attr.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEEntryOut) MarshalUnsafe(dst []byte) []byte { if f.Attr.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // Type FUSEEntryOut doesn't have a packed layout in memory, fallback to MarshalBytes. return f.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEEntryOut) UnmarshalUnsafe(src []byte) []byte { if f.Attr.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type FUSEEntryOut doesn't have a packed layout in memory, fallback to UnmarshalBytes. return f.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEEntryOut) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.Attr.Packed() { // Type FUSEEntryOut doesn't have a packed layout in memory, fall back to MarshalBytes. 
buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. f.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEEntryOut) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEEntryOut) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.Attr.Packed() { // Type FUSEEntryOut doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. f.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEEntryOut) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEEntryOut) WriteTo(writer io.Writer) (int64, error) { if !f.Attr.Packed() { // Type FUSEEntryOut doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, f.SizeBytes()) f.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEFallocateIn) SizeBytes() int { return 32 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEFallocateIn) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Offset)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Length)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Mode)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (f *FUSEFallocateIn) UnmarshalBytes(src []byte) []byte { f.Fh = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Offset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Mode = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEFallocateIn) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEFallocateIn) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEFallocateIn) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEFallocateIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEFallocateIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEFallocateIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEFallocateIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEFallocateIn) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEFlushIn) SizeBytes() int { return 24 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
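
// Editor's illustrative sketch, not part of the generated file: how a caller
// holding a marshal.CopyContext (in gVisor this is typically a kernel task)
// might use the FUSEFallocateIn CopyOut/CopyOutN methods defined above.
// CopyOut writes the full SizeBytes() (32 bytes here); CopyOutN truncates at
// the given limit, so a limit of 8 copies only the leading Fh field. The
// function and parameter names are hypothetical.
func exampleCopyOutFallocate(cc marshal.CopyContext, addr hostarch.Addr, in *FUSEFallocateIn) error {
	// Full copy: all 32 bytes of FUSEFallocateIn land at addr.
	if _, err := in.CopyOut(cc, addr); err != nil {
		return err
	}
	// Truncated copy: only the first 8 bytes (the Fh field) reach addr.
	_, err := in.CopyOutN(cc, addr, 8)
	return err
}
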
func (f *FUSEFlushIn) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) dst = dst[8:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.LockOwner)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEFlushIn) UnmarshalBytes(src []byte) []byte { f.Fh = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] f.LockOwner = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEFlushIn) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEFlushIn) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEFlushIn) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEFlushIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEFlushIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEFlushIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEFlushIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEFlushIn) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. 
return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEFsyncIn) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEFsyncIn) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.FsyncFlags)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEFsyncIn) UnmarshalBytes(src []byte) []byte { f.Fh = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.FsyncFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEFsyncIn) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEFsyncIn) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEFsyncIn) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEFsyncIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEFsyncIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEFsyncIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEFsyncIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEFsyncIn) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. 
return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEGetAttrIn) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEGetAttrIn) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.GetAttrFlags)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEGetAttrIn) UnmarshalBytes(src []byte) []byte { f.GetAttrFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] f.Fh = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEGetAttrIn) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEGetAttrIn) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEGetAttrIn) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEGetAttrIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEGetAttrIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEGetAttrIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEGetAttrIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEGetAttrIn) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. 
runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEHeaderIn) SizeBytes() int { return 28 + (*FUSEOpcode)(nil).SizeBytes() + (*FUSEOpID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEHeaderIn) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Len)) dst = dst[4:] dst = f.Opcode.MarshalUnsafe(dst) dst = f.Unique.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.NodeID)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.UID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.GID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.PID)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEHeaderIn) UnmarshalBytes(src []byte) []byte { f.Len = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = f.Opcode.UnmarshalUnsafe(src) src = f.Unique.UnmarshalUnsafe(src) f.NodeID = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.UID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.GID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.PID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEHeaderIn) Packed() bool { return f.Opcode.Packed() && f.Unique.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEHeaderIn) MarshalUnsafe(dst []byte) []byte { if f.Opcode.Packed() && f.Unique.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // Type FUSEHeaderIn doesn't have a packed layout in memory, fallback to MarshalBytes. return f.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEHeaderIn) UnmarshalUnsafe(src []byte) []byte { if f.Opcode.Packed() && f.Unique.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type FUSEHeaderIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. return f.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEHeaderIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.Opcode.Packed() && f.Unique.Packed() { // Type FUSEHeaderIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. f.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (f *FUSEHeaderIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEHeaderIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.Opcode.Packed() && f.Unique.Packed() { // Type FUSEHeaderIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. f.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEHeaderIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEHeaderIn) WriteTo(writer io.Writer) (int64, error) { if !f.Opcode.Packed() && f.Unique.Packed() { // Type FUSEHeaderIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, f.SizeBytes()) f.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEHeaderOut) SizeBytes() int { return 8 + (*FUSEOpID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEHeaderOut) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Len)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Error)) dst = dst[4:] dst = f.Unique.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEHeaderOut) UnmarshalBytes(src []byte) []byte { f.Len = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Error = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = f.Unique.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEHeaderOut) Packed() bool { return f.Unique.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEHeaderOut) MarshalUnsafe(dst []byte) []byte { if f.Unique.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // Type FUSEHeaderOut doesn't have a packed layout in memory, fallback to MarshalBytes. return f.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
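//
// Illustrative sketch (not part of the generated code): decoding a response
// header from an in-memory byte slice and checking its Error field. The
// resp slice is hypothetical and must hold at least out.SizeBytes() bytes;
// in the FUSE protocol Error typically carries a negated errno.
//
//	var out FUSEHeaderOut
//	out.UnmarshalUnsafe(resp)
//	if out.Error != 0 {
//		// The daemon failed the request identified by out.Unique.
//	}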
func (f *FUSEHeaderOut) UnmarshalUnsafe(src []byte) []byte { if f.Unique.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type FUSEHeaderOut doesn't have a packed layout in memory, fallback to UnmarshalBytes. return f.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEHeaderOut) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.Unique.Packed() { // Type FUSEHeaderOut doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. f.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEHeaderOut) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEHeaderOut) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.Unique.Packed() { // Type FUSEHeaderOut doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. f.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEHeaderOut) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEHeaderOut) WriteTo(writer io.Writer) (int64, error) { if !f.Unique.Packed() { // Type FUSEHeaderOut doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, f.SizeBytes()) f.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. 
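//
// The 16 bytes break down into four uint32 fields, in the order the
// MarshalBytes implementation that follows writes them:
//
//	offset 0:  Major
//	offset 4:  Minor
//	offset 8:  MaxReadahead
//	offset 12: Flags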
func (f *FUSEInitIn) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEInitIn) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Major)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Minor)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.MaxReadahead)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEInitIn) UnmarshalBytes(src []byte) []byte { f.Major = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Minor = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.MaxReadahead = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEInitIn) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEInitIn) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEInitIn) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEInitIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEInitIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEInitIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEInitIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEInitIn) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEInitOut) SizeBytes() int { return 32 + 4*8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEInitOut) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Major)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Minor)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.MaxReadahead)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(f.MaxBackground)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(f.CongestionThreshold)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.MaxWrite)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.TimeGran)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(f.MaxPages)) dst = dst[2:] // Padding: dst[:sizeof(uint16)] ~= uint16(0) dst = dst[2:] // Padding: dst[:sizeof(uint32)*8] ~= [8]uint32{0} dst = dst[4*(8):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEInitOut) UnmarshalBytes(src []byte) []byte { f.Major = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Minor = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.MaxReadahead = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.MaxBackground = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] f.CongestionThreshold = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] f.MaxWrite = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.TimeGran = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.MaxPages = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] // Padding: var _ uint16 ~= src[:sizeof(uint16)] src = src[2:] // Padding: ~ copy([8]uint32(f._), src[:sizeof(uint32)*8]) src = src[4*(8):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEInitOut) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEInitOut) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEInitOut) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEInitOut) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. 
return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEInitOut) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEInitOut) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEInitOut) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEInitOut) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *FUSELinkIn) Packed() bool { // Type FUSELinkIn is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *FUSELinkIn) MarshalUnsafe(dst []byte) []byte { // Type FUSELinkIn doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSELinkIn) UnmarshalUnsafe(src []byte) []byte { // Type FUSELinkIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *FUSELinkIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSELinkIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSELinkIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (r *FUSELinkIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSELinkIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
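//
// Because FUSELinkIn is dynamically sized, this copy goes through a scratch
// buffer and the UnmarshalBytes fallback rather than the zero-copy
// slice-header path used by packed types above. An illustrative caller (cc
// and addr are hypothetical here) looks the same either way:
//
//	// in is assumed to already be sized for the incoming data (for
//	// dynamic types SizeBytes depends on the value's current contents).
//	var in FUSELinkIn
//	if _, err := in.CopyIn(cc, addr); err != nil {
//		return err
//	}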
func (r *FUSELinkIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSELinkIn) WriteTo(writer io.Writer) (int64, error) { // Type FUSELinkIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *FUSELookupIn) Packed() bool { // Type FUSELookupIn is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *FUSELookupIn) MarshalUnsafe(dst []byte) []byte { // Type FUSELookupIn doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSELookupIn) UnmarshalUnsafe(src []byte) []byte { // Type FUSELookupIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *FUSELookupIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSELookupIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSELookupIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (r *FUSELookupIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSELookupIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *FUSELookupIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSELookupIn) WriteTo(writer io.Writer) (int64, error) { // Type FUSELookupIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *FUSEMkdirIn) Packed() bool { // Type FUSEMkdirIn is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *FUSEMkdirIn) MarshalUnsafe(dst []byte) []byte { // Type FUSEMkdirIn doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSEMkdirIn) UnmarshalUnsafe(src []byte) []byte { // Type FUSEMkdirIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. 
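//
// The limit argument bounds how many marshalled bytes reach guest memory:
// the value is serialized in full into a scratch buffer, but only
// buf[:limit] is copied out. A hypothetical caller interested only in the
// first 8 bytes could therefore write:
//
//	n, err := r.CopyOutN(cc, addr, 8) // copies at most the first 8 bytes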
//go:nosplit func (r *FUSEMkdirIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEMkdirIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSEMkdirIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (r *FUSEMkdirIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEMkdirIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *FUSEMkdirIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSEMkdirIn) WriteTo(writer io.Writer) (int64, error) { // Type FUSEMkdirIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEMkdirMeta) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEMkdirMeta) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Mode)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Umask)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEMkdirMeta) UnmarshalBytes(src []byte) []byte { f.Mode = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Umask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEMkdirMeta) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEMkdirMeta) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEMkdirMeta) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEMkdirMeta) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
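//
// FUSEMkdirMeta marshals to exactly 8 bytes: Mode at offset 0 and Umask at
// offset 4, as written by MarshalBytes above. A minimal in-memory round
// trip (no guest memory involved, purely illustrative) is:
//
//	var m, m2 FUSEMkdirMeta
//	buf := make([]byte, m.SizeBytes())
//	m.MarshalBytes(buf)
//	m2.UnmarshalBytes(buf)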
func (f *FUSEMkdirMeta) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEMkdirMeta) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEMkdirMeta) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEMkdirMeta) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *FUSEMknodIn) Packed() bool { // Type FUSEMknodIn is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *FUSEMknodIn) MarshalUnsafe(dst []byte) []byte { // Type FUSEMknodIn doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSEMknodIn) UnmarshalUnsafe(src []byte) []byte { // Type FUSEMknodIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *FUSEMknodIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEMknodIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSEMknodIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (r *FUSEMknodIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEMknodIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
func (r *FUSEMknodIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSEMknodIn) WriteTo(writer io.Writer) (int64, error) { // Type FUSEMknodIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEMknodMeta) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEMknodMeta) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Mode)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Rdev)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Umask)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEMknodMeta) UnmarshalBytes(src []byte) []byte { f.Mode = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Rdev = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Umask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEMknodMeta) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEMknodMeta) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEMknodMeta) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEMknodMeta) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEMknodMeta) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEMknodMeta) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
func (f *FUSEMknodMeta) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEMknodMeta) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (f *FUSEOpID) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEOpID) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(*f)) return dst[8:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEOpID) UnmarshalBytes(src []byte) []byte { *f = FUSEOpID(uint64(hostarch.ByteOrder.Uint64(src[:8]))) return src[8:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEOpID) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEOpID) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEOpID) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEOpID) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEOpID) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEOpID) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEOpID) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. 
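//
// FUSEOpID is a scalar newtype over a 64-bit value, so WriteTo emits
// exactly 8 bytes in hostarch.ByteOrder. Illustrative use with a
// bytes.Buffer as the (hypothetical) destination writer:
//
//	id := FUSEOpID(42)
//	var b bytes.Buffer
//	n, err := id.WriteTo(&b) // n == 8 when the write succeeds in full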
func (f *FUSEOpID) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (f *FUSEOpcode) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEOpcode) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*f)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEOpcode) UnmarshalBytes(src []byte) []byte { *f = FUSEOpcode(uint32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEOpcode) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEOpcode) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEOpcode) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEOpcode) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEOpcode) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEOpcode) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEOpcode) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEOpcode) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEOpenIn) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEOpenIn) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEOpenIn) UnmarshalBytes(src []byte) []byte { f.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEOpenIn) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEOpenIn) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEOpenIn) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEOpenIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEOpenIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEOpenIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEOpenIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEOpenIn) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEOpenOut) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEOpenOut) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.OpenFlag)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEOpenOut) UnmarshalBytes(src []byte) []byte { f.Fh = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.OpenFlag = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEOpenOut) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEOpenOut) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEOpenOut) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEOpenOut) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEOpenOut) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEOpenOut) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEOpenOut) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEOpenOut) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEReadIn) SizeBytes() int { return 40 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEReadIn) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Offset)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Size)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.ReadFlags)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.LockOwner)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEReadIn) UnmarshalBytes(src []byte) []byte { f.Fh = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Offset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Size = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.ReadFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.LockOwner = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEReadIn) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEReadIn) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEReadIn) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEReadIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEReadIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEReadIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEReadIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEReadIn) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEReleaseIn) SizeBytes() int { return 24 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEReleaseIn) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.ReleaseFlags)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.LockOwner)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEReleaseIn) UnmarshalBytes(src []byte) []byte { f.Fh = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.ReleaseFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.LockOwner = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEReleaseIn) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEReleaseIn) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEReleaseIn) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEReleaseIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
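//
// CopyOut is shorthand for CopyOutN with the full marshalled size, i.e. all
// 24 bytes of the struct (Fh, Flags, ReleaseFlags, LockOwner). The two
// calls below are equivalent (cc and addr are hypothetical):
//
//	n, err := f.CopyOut(cc, addr)
//	n, err = f.CopyOutN(cc, addr, f.SizeBytes())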
func (f *FUSEReleaseIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEReleaseIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEReleaseIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEReleaseIn) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *FUSERenameIn) Packed() bool { // Type FUSERenameIn is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *FUSERenameIn) MarshalUnsafe(dst []byte) []byte { // Type FUSERenameIn doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSERenameIn) UnmarshalUnsafe(src []byte) []byte { // Type FUSERenameIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *FUSERenameIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSERenameIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSERenameIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (r *FUSERenameIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSERenameIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
func (r *FUSERenameIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSERenameIn) WriteTo(writer io.Writer) (int64, error) { // Type FUSERenameIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *FUSERmDirIn) Packed() bool { // Type FUSERmDirIn is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *FUSERmDirIn) MarshalUnsafe(dst []byte) []byte { // Type FUSERmDirIn doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSERmDirIn) UnmarshalUnsafe(src []byte) []byte { // Type FUSERmDirIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *FUSERmDirIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSERmDirIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSERmDirIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (r *FUSERmDirIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSERmDirIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *FUSERmDirIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSERmDirIn) WriteTo(writer io.Writer) (int64, error) { // Type FUSERmDirIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSESetAttrIn) SizeBytes() int { return 88 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
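//
// The marshalled form is 88 bytes; note the three 4-byte padding words the
// code below emits (after Valid, after Mode, and after GID), presumably to
// mirror the kernel's struct layout. A minimal encode of a zero value is
// simply:
//
//	var in FUSESetAttrIn
//	buf := make([]byte, in.SizeBytes()) // 88 bytes
//	in.MarshalBytes(buf)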
func (f *FUSESetAttrIn) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Valid)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.LockOwner)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Atime)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Mtime)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Ctime)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.AtimeNsec)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.MtimeNsec)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.CtimeNsec)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Mode)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.UID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.GID)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSESetAttrIn) UnmarshalBytes(src []byte) []byte { f.Valid = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] f.Fh = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.LockOwner = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Atime = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Mtime = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Ctime = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.AtimeNsec = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.MtimeNsec = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.CtimeNsec = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Mode = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] f.UID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.GID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSESetAttrIn) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSESetAttrIn) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSESetAttrIn) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSESetAttrIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. 
runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSESetAttrIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSESetAttrIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSESetAttrIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSESetAttrIn) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEStatfsOut) SizeBytes() int { return 56 + 4*6 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEStatfsOut) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Blocks)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.BlocksFree)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.BlocksAvailable)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Files)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.FilesFree)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.BlockSize)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.NameLength)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.FragmentSize)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] for idx := 0; idx < 6; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Spare[idx])) dst = dst[4:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
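// Illustrative sketch, not emitted by the go_marshal generator: a round trip
// through the MarshalBytes/UnmarshalBytes pair for FUSESetAttrIn defined
// above. The helper name is hypothetical and exists only for illustration.
func exampleFUSESetAttrInRoundTrip(in *FUSESetAttrIn) FUSESetAttrIn {
    // SizeBytes reports the wire size (88 bytes), so the scratch buffer is
    // exactly one marshalled struct long.
    buf := make([]byte, in.SizeBytes())
    in.MarshalBytes(buf)
    // UnmarshalBytes decodes the same fixed layout back into a fresh value.
    var out FUSESetAttrIn
    out.UnmarshalBytes(buf)
    return out
}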
func (f *FUSEStatfsOut) UnmarshalBytes(src []byte) []byte { f.Blocks = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.BlocksFree = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.BlocksAvailable = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Files = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.FilesFree = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.BlockSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.NameLength = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.FragmentSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] for idx := 0; idx < 6; idx++ { f.Spare[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEStatfsOut) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEStatfsOut) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEStatfsOut) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEStatfsOut) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEStatfsOut) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEStatfsOut) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEStatfsOut) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEStatfsOut) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. 
runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *FUSESymlinkIn) Packed() bool { // Type FUSESymlinkIn is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *FUSESymlinkIn) MarshalUnsafe(dst []byte) []byte { // Type FUSESymlinkIn doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSESymlinkIn) UnmarshalUnsafe(src []byte) []byte { // Type FUSESymlinkIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *FUSESymlinkIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSESymlinkIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSESymlinkIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (r *FUSESymlinkIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSESymlinkIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *FUSESymlinkIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSESymlinkIn) WriteTo(writer io.Writer) (int64, error) { // Type FUSESymlinkIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *FUSEUnlinkIn) Packed() bool { // Type FUSEUnlinkIn is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *FUSEUnlinkIn) MarshalUnsafe(dst []byte) []byte { // Type FUSEUnlinkIn doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSEUnlinkIn) UnmarshalUnsafe(src []byte) []byte { // Type FUSEUnlinkIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *FUSEUnlinkIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEUnlinkIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. 
return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSEUnlinkIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (r *FUSEUnlinkIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEUnlinkIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *FUSEUnlinkIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSEUnlinkIn) WriteTo(writer io.Writer) (int64, error) { // Type FUSEUnlinkIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEWriteIn) SizeBytes() int { return 40 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEWriteIn) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Fh)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Offset)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Size)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.WriteFlags)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.LockOwner)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Flags)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEWriteIn) UnmarshalBytes(src []byte) []byte { f.Fh = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Offset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Size = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.WriteFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.LockOwner = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEWriteIn) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEWriteIn) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEWriteIn) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEWriteIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEWriteIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEWriteIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEWriteIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEWriteIn) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FUSEWriteOut) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FUSEWriteOut) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Size)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FUSEWriteOut) UnmarshalBytes(src []byte) []byte { f.Size = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FUSEWriteOut) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FUSEWriteOut) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FUSEWriteOut) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FUSEWriteOut) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FUSEWriteOut) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FUSEWriteOut) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FUSEWriteOut) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FUSEWriteOut) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *FUSEWritePayloadIn) Packed() bool { // Type FUSEWritePayloadIn is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *FUSEWritePayloadIn) MarshalUnsafe(dst []byte) []byte { // Type FUSEWritePayloadIn doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *FUSEWritePayloadIn) UnmarshalUnsafe(src []byte) []byte { // Type FUSEWritePayloadIn doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *FUSEWritePayloadIn) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEWritePayloadIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *FUSEWritePayloadIn) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
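// Illustrative sketch, not part of the generated output: FUSEWriteOut
// implements io.WriterTo, so a write reply can be streamed to any io.Writer.
// The helper name and its caller are hypothetical.
func exampleWriteFUSEWriteOut(w io.Writer, size uint32) (int64, error) {
    out := FUSEWriteOut{Size: size}
    // WriteTo serializes the 8-byte struct (Size plus 4 bytes of padding)
    // and reports how many bytes reached the writer.
    return out.WriteTo(w)
}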
//go:nosplit func (r *FUSEWritePayloadIn) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type FUSEWritePayloadIn doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *FUSEWritePayloadIn) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *FUSEWritePayloadIn) WriteTo(writer io.Writer) (int64, error) { // Type FUSEWritePayloadIn doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *RobustListHead) SizeBytes() int { return 24 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *RobustListHead) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.List)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.FutexOffset)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.ListOpPending)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *RobustListHead) UnmarshalBytes(src []byte) []byte { r.List = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.FutexOffset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.ListOpPending = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *RobustListHead) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *RobustListHead) MarshalUnsafe(dst []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(r), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *RobustListHead) UnmarshalUnsafe(src []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(r), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (r *RobustListHead) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (r *RobustListHead) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (r *RobustListHead) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *RobustListHead) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *RobustListHead) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IOCqRingOffsets) SizeBytes() int { return 40 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IOCqRingOffsets) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Head)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Tail)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.RingMask)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.RingEntries)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Overflow)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Cqes)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Resv1)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Resv2)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IOCqRingOffsets) UnmarshalBytes(src []byte) []byte { i.Head = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Tail = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.RingMask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.RingEntries = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Overflow = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Cqes = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Resv1 = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Resv2 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IOCqRingOffsets) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IOCqRingOffsets) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IOCqRingOffsets) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
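// Illustrative sketch (hypothetical helper, not generated): read a robust
// futex list head from a user address via the CopyIn method above. cc may be
// any marshal.CopyContext implementation, e.g. the calling task.
func exampleReadRobustListHead(cc marshal.CopyContext, addr hostarch.Addr) (RobustListHead, error) {
    var head RobustListHead
    // CopyIn copies SizeBytes() == 24 bytes from addr into head.
    _, err := head.CopyIn(cc, addr)
    return head, err
}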
func (i *IOCqRingOffsets) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IOCqRingOffsets) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IOCqRingOffsets) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IOCqRingOffsets) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IOCqRingOffsets) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (i *IORingIndex) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IORingIndex) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*i)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IORingIndex) UnmarshalBytes(src []byte) []byte { *i = IORingIndex(uint32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IORingIndex) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IORingIndex) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IORingIndex) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IORingIndex) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IORingIndex) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IORingIndex) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IORingIndex) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IORingIndex) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IORings) SizeBytes() int { return 32 + (*IOUring)(nil).SizeBytes() + (*IOUring)(nil).SizeBytes() + 1*32 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IORings) MarshalBytes(dst []byte) []byte { dst = i.Sq.MarshalUnsafe(dst) dst = i.Cq.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.SqRingMask)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.CqRingMask)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.SqRingEntries)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.CqRingEntries)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.sqDropped)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.sqFlags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.cqFlags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.CqOverflow)) dst = dst[4:] // Padding: dst[:sizeof(byte)*32] ~= [32]byte{0} dst = dst[1*(32):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (i *IORings) UnmarshalBytes(src []byte) []byte { src = i.Sq.UnmarshalUnsafe(src) src = i.Cq.UnmarshalUnsafe(src) i.SqRingMask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.CqRingMask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.SqRingEntries = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.CqRingEntries = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.sqDropped = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.sqFlags = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.cqFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.CqOverflow = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: ~ copy([32]byte(i._), src[:sizeof(byte)*32]) src = src[1*(32):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IORings) Packed() bool { return i.Cq.Packed() && i.Sq.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IORings) MarshalUnsafe(dst []byte) []byte { if i.Cq.Packed() && i.Sq.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type IORings doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IORings) UnmarshalUnsafe(src []byte) []byte { if i.Cq.Packed() && i.Sq.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IORings doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IORings) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Cq.Packed() && i.Sq.Packed() { // Type IORings doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IORings) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IORings) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Cq.Packed() && i.Sq.Packed() { // Type IORings doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IORings) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IORings) WriteTo(writer io.Writer) (int64, error) { if !i.Cq.Packed() && i.Sq.Packed() { // Type IORings doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IOSqRingOffsets) SizeBytes() int { return 40 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IOSqRingOffsets) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Head)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Tail)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.RingMask)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.RingEntries)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Dropped)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Array)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Resv1)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Resv2)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IOSqRingOffsets) UnmarshalBytes(src []byte) []byte { i.Head = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Tail = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.RingMask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.RingEntries = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Dropped = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Array = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Resv1 = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Resv2 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IOSqRingOffsets) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IOSqRingOffsets) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (i *IOSqRingOffsets) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IOSqRingOffsets) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IOSqRingOffsets) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IOSqRingOffsets) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IOSqRingOffsets) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IOSqRingOffsets) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IOUring) SizeBytes() int { return 8 + 1*60 + 1*60 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IOUring) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Head)) dst = dst[4:] // Padding: dst[:sizeof(byte)*60] ~= [60]byte{0} dst = dst[1*(60):] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Tail)) dst = dst[4:] // Padding: dst[:sizeof(byte)*60] ~= [60]byte{0} dst = dst[1*(60):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IOUring) UnmarshalBytes(src []byte) []byte { i.Head = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: ~ copy([60]byte(i._), src[:sizeof(byte)*60]) src = src[1*(60):] i.Tail = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: ~ copy([60]byte(i._), src[:sizeof(byte)*60]) src = src[1*(60):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IOUring) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (i *IOUring) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IOUring) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IOUring) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IOUring) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IOUring) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IOUring) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IOUring) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IOUringCqe) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IOUringCqe) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.UserData)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Res)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Flags)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IOUringCqe) UnmarshalBytes(src []byte) []byte { i.UserData = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.Res = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IOUringCqe) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
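// Illustrative sketch (hypothetical helper, not generated): decode one
// completion-queue entry from a byte slice using the UnmarshalBytes method
// above. The caller is assumed to have bounds-checked the slot.
func exampleDecodeIOUringCqe(slot []byte) IOUringCqe {
    var cqe IOUringCqe
    // UnmarshalBytes consumes SizeBytes() == 16 bytes of the slot.
    cqe.UnmarshalBytes(slot)
    return cqe
}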
func (i *IOUringCqe) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IOUringCqe) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IOUringCqe) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IOUringCqe) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IOUringCqe) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IOUringCqe) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IOUringCqe) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IOUringParams) SizeBytes() int { return 28 + 4*3 + (*IOSqRingOffsets)(nil).SizeBytes() + (*IOCqRingOffsets)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (i *IOUringParams) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.SqEntries)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.CqEntries)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.SqThreadCPU)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.SqThreadIdle)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Features)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.WqFd)) dst = dst[4:] for idx := 0; idx < 3; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Resv[idx])) dst = dst[4:] } dst = i.SqOff.MarshalUnsafe(dst) dst = i.CqOff.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IOUringParams) UnmarshalBytes(src []byte) []byte { i.SqEntries = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.CqEntries = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.SqThreadCPU = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.SqThreadIdle = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Features = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.WqFd = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 3; idx++ { i.Resv[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } src = i.SqOff.UnmarshalUnsafe(src) src = i.CqOff.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IOUringParams) Packed() bool { return i.CqOff.Packed() && i.SqOff.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IOUringParams) MarshalUnsafe(dst []byte) []byte { if i.CqOff.Packed() && i.SqOff.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type IOUringParams doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IOUringParams) UnmarshalUnsafe(src []byte) []byte { if i.CqOff.Packed() && i.SqOff.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IOUringParams doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IOUringParams) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.CqOff.Packed() && i.SqOff.Packed() { // Type IOUringParams doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (i *IOUringParams) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IOUringParams) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.CqOff.Packed() && i.SqOff.Packed() { // Type IOUringParams doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IOUringParams) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IOUringParams) WriteTo(writer io.Writer) (int64, error) { if !i.CqOff.Packed() && i.SqOff.Packed() { // Type IOUringParams doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IOUringSqe) SizeBytes() int { return 64 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IOUringSqe) MarshalBytes(dst []byte) []byte { dst[0] = byte(i.Opcode) dst = dst[1:] dst[0] = byte(i.Flags) dst = dst[1:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.IoPrio)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Fd)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.OffOrAddrOrCmdOp)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.AddrOrSpliceOff)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Len)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.specialFlags)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.UserData)) dst = dst[8:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.BufIndexOrGroup)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.personality)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.spliceFDOrFileIndex)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.addr3)) dst = dst[8:] // Padding: dst[:sizeof(uint64)] ~= uint64(0) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (i *IOUringSqe) UnmarshalBytes(src []byte) []byte { i.Opcode = uint8(src[0]) src = src[1:] i.Flags = uint8(src[0]) src = src[1:] i.IoPrio = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.Fd = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.OffOrAddrOrCmdOp = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.AddrOrSpliceOff = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.Len = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.specialFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.UserData = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.BufIndexOrGroup = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.personality = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.spliceFDOrFileIndex = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.addr3 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] // Padding: var _ uint64 ~= src[:sizeof(uint64)] src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IOUringSqe) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IOUringSqe) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IOUringSqe) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IOUringSqe) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IOUringSqe) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IOUringSqe) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IOUringSqe) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IOUringSqe) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
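// The slice header constructed below is pointed directly at i's memory, so
// writer.Write serializes the struct without an intermediate copy.
// gohacks.Noescape hides the pointer from the compiler's escape analysis,
// which is why the explicit runtime.KeepAlive call afterwards is needed to
// keep i alive until the write has completed.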
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IPCPerm) SizeBytes() int { return 48 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IPCPerm) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Key)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.UID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.GID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.CUID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.CGID)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.Mode)) dst = dst[2:] // Padding: dst[:sizeof(uint16)] ~= uint16(0) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.Seq)) dst = dst[2:] // Padding: dst[:sizeof(uint16)] ~= uint16(0) dst = dst[2:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.unused1)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.unused2)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IPCPerm) UnmarshalBytes(src []byte) []byte { i.Key = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.UID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.GID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.CUID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.CGID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Mode = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] // Padding: var _ uint16 ~= src[:sizeof(uint16)] src = src[2:] i.Seq = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] // Padding: var _ uint16 ~= src[:sizeof(uint16)] src = src[2:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] i.unused1 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.unused2 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IPCPerm) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IPCPerm) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IPCPerm) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IPCPerm) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. 
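// length is the number of bytes CopyOutBytes actually wrote; on error this
// may be smaller than limit if only part of the destination range was
// accessible.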
return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IPCPerm) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IPCPerm) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IPCPerm) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IPCPerm) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *Sysinfo) SizeBytes() int { return 78 + 8*3 + 1*6 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *Sysinfo) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Uptime)) dst = dst[8:] for idx := 0; idx < 3; idx++ { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Loads[idx])) dst = dst[8:] } hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.TotalRAM)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.FreeRAM)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.SharedRAM)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.BufferRAM)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.TotalSwap)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.FreeSwap)) dst = dst[8:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Procs)) dst = dst[2:] // Padding: dst[:sizeof(byte)*6] ~= [6]byte{0} dst = dst[1*(6):] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.TotalHigh)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.FreeHigh)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Unit)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
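// sysinfoRoundTripSketch is an illustrative, hypothetical helper, not part of
// the generated code. It round-trips a Sysinfo through MarshalBytes and
// UnmarshalBytes; note that the six padding bytes after Procs are merely
// skipped (left untouched in the buffer) when marshalling and ignored when
// unmarshalling.
func sysinfoRoundTripSketch(in *Sysinfo) Sysinfo {
	buf := make([]byte, in.SizeBytes())
	in.MarshalBytes(buf)
	var out Sysinfo
	out.UnmarshalBytes(buf)
	return out
}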
func (s *Sysinfo) UnmarshalBytes(src []byte) []byte { s.Uptime = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] for idx := 0; idx < 3; idx++ { s.Loads[idx] = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] } s.TotalRAM = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.FreeRAM = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.SharedRAM = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.BufferRAM = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.TotalSwap = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.FreeSwap = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Procs = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] // Padding: ~ copy([6]byte(s._), src[:sizeof(byte)*6]) src = src[1*(6):] s.TotalHigh = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.FreeHigh = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Unit = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *Sysinfo) Packed() bool { return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *Sysinfo) MarshalUnsafe(dst []byte) []byte { // Type Sysinfo doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *Sysinfo) UnmarshalUnsafe(src []byte) []byte { // Type Sysinfo doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *Sysinfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type Sysinfo doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (s *Sysinfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *Sysinfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type Sysinfo doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *Sysinfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *Sysinfo) WriteTo(writer io.Writer) (int64, error) { // Type Sysinfo doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (n *NumaPolicy) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NumaPolicy) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*n)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
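// numaPolicyBytesSketch is an illustrative, hypothetical helper, not part of
// the generated code. NumaPolicy is a 4-byte scalar newtype, so it is always
// packed and marshals with a single byte-order store.
func numaPolicyBytesSketch(p NumaPolicy) []byte {
	buf := make([]byte, p.SizeBytes())
	p.MarshalBytes(buf)
	return buf
}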
func (n *NumaPolicy) UnmarshalBytes(src []byte) []byte { *n = NumaPolicy(int32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NumaPolicy) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NumaPolicy) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NumaPolicy) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NumaPolicy) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NumaPolicy) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NumaPolicy) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NumaPolicy) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NumaPolicy) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (m *MqAttr) SizeBytes() int { return 32 + 8*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
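// mqAttrSizeSketch is an illustrative, hypothetical helper, not part of the
// generated code. MqAttr's wire size is its four int64 fields (32 bytes) plus
// four reserved int64s of padding (8*4 bytes), matching SizeBytes above.
func mqAttrSizeSketch() int {
	var a MqAttr
	return a.SizeBytes() // 32 + 8*4 = 64 bytes.
}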
func (m *MqAttr) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MqFlags)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MqMaxmsg)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MqMsgsize)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MqCurmsgs)) dst = dst[8:] // Padding: dst[:sizeof(int64)*4] ~= [4]int64{0} dst = dst[8*(4):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (m *MqAttr) UnmarshalBytes(src []byte) []byte { m.MqFlags = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] m.MqMaxmsg = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] m.MqMsgsize = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] m.MqCurmsgs = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] // Padding: ~ copy([4]int64(m._), src[:sizeof(int64)*4]) src = src[8*(4):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (m *MqAttr) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (m *MqAttr) MarshalUnsafe(dst []byte) []byte { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(m), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (m *MqAttr) UnmarshalUnsafe(src []byte) []byte { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(m), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (m *MqAttr) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (m *MqAttr) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyOutN(cc, addr, m.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (m *MqAttr) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (m *MqAttr) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyInN(cc, addr, m.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (m *MqAttr) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (b *MsgBuf) Packed() bool { // Type MsgBuf is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (b *MsgBuf) MarshalUnsafe(dst []byte) []byte { // Type MsgBuf doesn't have a packed layout in memory, fallback to MarshalBytes. return b.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (b *MsgBuf) UnmarshalUnsafe(src []byte) []byte { // Type MsgBuf doesn't have a packed layout in memory, fallback to UnmarshalBytes. return b.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (b *MsgBuf) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type MsgBuf doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(b.SizeBytes()) // escapes: okay. b.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (b *MsgBuf) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return b.CopyOutN(cc, addr, b.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (b *MsgBuf) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type MsgBuf doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(b.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. b.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (b *MsgBuf) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return b.CopyInN(cc, addr, b.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (b *MsgBuf) WriteTo(writer io.Writer) (int64, error) { // Type MsgBuf doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, b.SizeBytes()) b.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (m *MsgInfo) SizeBytes() int { return 30 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (m *MsgInfo) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.MsgPool)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.MsgMap)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.MsgMax)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.MsgMnb)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.MsgMni)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.MsgSsz)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.MsgTql)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(m.MsgSeg)) dst = dst[2:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
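// msgInfoWireSizeSketch is an illustrative, hypothetical helper, not part of
// the generated code. MsgInfo's wire format is seven int32 fields followed by
// one uint16, i.e. 30 bytes, which is presumably why the in-memory struct
// (padded to an aligned size) is not treated as packed and the unsafe paths
// fall back to MarshalBytes/UnmarshalBytes.
func msgInfoWireSizeSketch() int {
	var m MsgInfo
	return m.SizeBytes() // 7*4 + 2 = 30 bytes on the wire.
}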
func (m *MsgInfo) UnmarshalBytes(src []byte) []byte { m.MsgPool = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] m.MsgMap = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] m.MsgMax = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] m.MsgMnb = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] m.MsgMni = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] m.MsgSsz = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] m.MsgTql = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] m.MsgSeg = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (m *MsgInfo) Packed() bool { return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (m *MsgInfo) MarshalUnsafe(dst []byte) []byte { // Type MsgInfo doesn't have a packed layout in memory, fallback to MarshalBytes. return m.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (m *MsgInfo) UnmarshalUnsafe(src []byte) []byte { // Type MsgInfo doesn't have a packed layout in memory, fallback to UnmarshalBytes. return m.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (m *MsgInfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type MsgInfo doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(m.SizeBytes()) // escapes: okay. m.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (m *MsgInfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyOutN(cc, addr, m.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (m *MsgInfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type MsgInfo doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(m.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. m.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (m *MsgInfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyInN(cc, addr, m.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (m *MsgInfo) WriteTo(writer io.Writer) (int64, error) { // Type MsgInfo doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, m.SizeBytes()) m.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (m *MsqidDS) SizeBytes() int { return 48 + (*IPCPerm)(nil).SizeBytes() + (*TimeT)(nil).SizeBytes() + (*TimeT)(nil).SizeBytes() + (*TimeT)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (m *MsqidDS) MarshalBytes(dst []byte) []byte { dst = m.MsgPerm.MarshalUnsafe(dst) dst = m.MsgStime.MarshalUnsafe(dst) dst = m.MsgRtime.MarshalUnsafe(dst) dst = m.MsgCtime.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MsgCbytes)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MsgQnum)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MsgQbytes)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.MsgLspid)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.MsgLrpid)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.unused4)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.unused5)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (m *MsqidDS) UnmarshalBytes(src []byte) []byte { src = m.MsgPerm.UnmarshalUnsafe(src) src = m.MsgStime.UnmarshalUnsafe(src) src = m.MsgRtime.UnmarshalUnsafe(src) src = m.MsgCtime.UnmarshalUnsafe(src) m.MsgCbytes = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] m.MsgQnum = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] m.MsgQbytes = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] m.MsgLspid = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] m.MsgLrpid = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] m.unused4 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] m.unused5 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (m *MsqidDS) Packed() bool { return m.MsgCtime.Packed() && m.MsgPerm.Packed() && m.MsgRtime.Packed() && m.MsgStime.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (m *MsqidDS) MarshalUnsafe(dst []byte) []byte { if m.MsgCtime.Packed() && m.MsgPerm.Packed() && m.MsgRtime.Packed() && m.MsgStime.Packed() { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(m), uintptr(size)) return dst[size:] } // Type MsqidDS doesn't have a packed layout in memory, fallback to MarshalBytes. return m.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (m *MsqidDS) UnmarshalUnsafe(src []byte) []byte { if m.MsgCtime.Packed() && m.MsgPerm.Packed() && m.MsgRtime.Packed() && m.MsgStime.Packed() { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(m), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type MsqidDS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return m.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (m *MsqidDS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !m.MsgCtime.Packed() && m.MsgPerm.Packed() && m.MsgRtime.Packed() && m.MsgStime.Packed() { // Type MsqidDS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(m.SizeBytes()) // escapes: okay. m.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. 
return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (m *MsqidDS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyOutN(cc, addr, m.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (m *MsqidDS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !m.MsgCtime.Packed() && m.MsgPerm.Packed() && m.MsgRtime.Packed() && m.MsgStime.Packed() { // Type MsqidDS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(m.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. m.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (m *MsqidDS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyInN(cc, addr, m.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (m *MsqidDS) WriteTo(writer io.Writer) (int64, error) { if !m.MsgCtime.Packed() && m.MsgPerm.Packed() && m.MsgRtime.Packed() && m.MsgStime.Packed() { // Type MsqidDS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, m.SizeBytes()) m.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (e *EthtoolCmd) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (e *EthtoolCmd) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*e)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (e *EthtoolCmd) UnmarshalBytes(src []byte) []byte { *e = EthtoolCmd(uint32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (e *EthtoolCmd) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (e *EthtoolCmd) MarshalUnsafe(dst []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(e), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (e *EthtoolCmd) UnmarshalUnsafe(src []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(e), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
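// ethtoolCmdFromBytesSketch is an illustrative, hypothetical helper, not part
// of the generated code. EthtoolCmd is a uint32 newtype, so unmarshalling is
// a single 4-byte byte-order load; src must hold at least 4 bytes.
func ethtoolCmdFromBytesSketch(src []byte) EthtoolCmd {
	var cmd EthtoolCmd
	cmd.UnmarshalBytes(src[:4])
	return cmd
}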
func (e *EthtoolCmd) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (e *EthtoolCmd) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyOutN(cc, addr, e.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (e *EthtoolCmd) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (e *EthtoolCmd) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyInN(cc, addr, e.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (e *EthtoolCmd) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (e *EthtoolGFeatures) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (e *EthtoolGFeatures) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Cmd)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Size)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (e *EthtoolGFeatures) UnmarshalBytes(src []byte) []byte { e.Cmd = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] e.Size = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (e *EthtoolGFeatures) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (e *EthtoolGFeatures) MarshalUnsafe(dst []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(e), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (e *EthtoolGFeatures) UnmarshalUnsafe(src []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(e), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (e *EthtoolGFeatures) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (e *EthtoolGFeatures) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyOutN(cc, addr, e.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (e *EthtoolGFeatures) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (e *EthtoolGFeatures) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyInN(cc, addr, e.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (e *EthtoolGFeatures) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (e *EthtoolGetFeaturesBlock) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (e *EthtoolGetFeaturesBlock) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Available)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Requested)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Active)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.NeverChanged)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (e *EthtoolGetFeaturesBlock) UnmarshalBytes(src []byte) []byte { e.Available = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] e.Requested = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] e.Active = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] e.NeverChanged = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (e *EthtoolGetFeaturesBlock) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
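// ethtoolFeaturesSketch is an illustrative, hypothetical helper, not part of
// the generated code. It shows the chaining convention used throughout this
// file: MarshalUnsafe returns the unused remainder of dst, so an
// EthtoolGFeatures header and one EthtoolGetFeaturesBlock can be laid out
// back-to-back in a single buffer.
func ethtoolFeaturesSketch(hdr *EthtoolGFeatures, blk *EthtoolGetFeaturesBlock) []byte {
	buf := make([]byte, hdr.SizeBytes()+blk.SizeBytes())
	rest := hdr.MarshalUnsafe(buf)
	blk.MarshalUnsafe(rest)
	return buf
}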
func (e *EthtoolGetFeaturesBlock) MarshalUnsafe(dst []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(e), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (e *EthtoolGetFeaturesBlock) UnmarshalUnsafe(src []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(e), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (e *EthtoolGetFeaturesBlock) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (e *EthtoolGetFeaturesBlock) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyOutN(cc, addr, e.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (e *EthtoolGetFeaturesBlock) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (e *EthtoolGetFeaturesBlock) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyInN(cc, addr, e.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (e *EthtoolGetFeaturesBlock) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IFConf) SizeBytes() int { return 12 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IFConf) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Len)) dst = dst[4:] // Padding: dst[:sizeof(byte)*4] ~= [4]byte{0} dst = dst[1*(4):] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Ptr)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IFConf) UnmarshalBytes(src []byte) []byte { i.Len = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: ~ copy([4]byte(i._), src[:sizeof(byte)*4]) src = src[1*(4):] i.Ptr = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. 
//go:nosplit func (i *IFConf) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IFConf) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IFConf) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IFConf) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IFConf) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IFConf) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IFConf) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IFConf) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (ifr *IFReq) SizeBytes() int { return 0 + 1*IFNAMSIZ + 1*24 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (ifr *IFReq) MarshalBytes(dst []byte) []byte { for idx := 0; idx < IFNAMSIZ; idx++ { dst[0] = byte(ifr.IFName[idx]) dst = dst[1:] } for idx := 0; idx < 24; idx++ { dst[0] = byte(ifr.Data[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (ifr *IFReq) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < IFNAMSIZ; idx++ { ifr.IFName[idx] = src[0] src = src[1:] } for idx := 0; idx < 24; idx++ { ifr.Data[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. 
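// ifReqBytesSketch is an illustrative, hypothetical helper, not part of the
// generated code. IFReq is a fixed-size structure of IFNAMSIZ name bytes plus
// a 24-byte data union, so the marshalled form always fits in a buffer of
// SizeBytes() bytes.
func ifReqBytesSketch(ifr *IFReq) []byte {
	buf := make([]byte, ifr.SizeBytes())
	ifr.MarshalUnsafe(buf)
	return buf
}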
//go:nosplit func (ifr *IFReq) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (ifr *IFReq) MarshalUnsafe(dst []byte) []byte { size := ifr.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(ifr), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (ifr *IFReq) UnmarshalUnsafe(src []byte) []byte { size := ifr.SizeBytes() gohacks.Memmove(unsafe.Pointer(ifr), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (ifr *IFReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(ifr))) hdr.Len = ifr.SizeBytes() hdr.Cap = ifr.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that ifr // must live until the use above. runtime.KeepAlive(ifr) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (ifr *IFReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return ifr.CopyOutN(cc, addr, ifr.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (ifr *IFReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(ifr))) hdr.Len = ifr.SizeBytes() hdr.Cap = ifr.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that ifr // must live until the use above. runtime.KeepAlive(ifr) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (ifr *IFReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return ifr.CopyInN(cc, addr, ifr.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (ifr *IFReq) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(ifr))) hdr.Len = ifr.SizeBytes() hdr.Cap = ifr.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that ifr // must live until the use above. runtime.KeepAlive(ifr) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (en *ErrorName) SizeBytes() int { return 1 * XT_FUNCTION_MAXNAMELEN } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (en *ErrorName) MarshalBytes(dst []byte) []byte { for idx := 0; idx < XT_FUNCTION_MAXNAMELEN; idx++ { dst[0] = byte(en[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (en *ErrorName) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < XT_FUNCTION_MAXNAMELEN; idx++ { en[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (en *ErrorName) Packed() bool { // Array newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
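// Array newtypes such as ErrorName here and ExtensionName below have no
// internal structure, so their unsafe marshalling paths reduce to a single
// memmove over the whole array and no byte-by-byte fallback is ever needed.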
func (en *ErrorName) MarshalUnsafe(dst []byte) []byte { size := en.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&en[0]), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (en *ErrorName) UnmarshalUnsafe(src []byte) []byte { size := en.SizeBytes() gohacks.Memmove(unsafe.Pointer(en), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (en *ErrorName) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(en))) hdr.Len = en.SizeBytes() hdr.Cap = en.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that en // must live until the use above. runtime.KeepAlive(en) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (en *ErrorName) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return en.CopyOutN(cc, addr, en.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (en *ErrorName) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(en))) hdr.Len = en.SizeBytes() hdr.Cap = en.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that en // must live until the use above. runtime.KeepAlive(en) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (en *ErrorName) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return en.CopyInN(cc, addr, en.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (en *ErrorName) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(en))) hdr.Len = en.SizeBytes() hdr.Cap = en.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that en // must live until the use above. runtime.KeepAlive(en) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (en *ExtensionName) SizeBytes() int { return 1 * XT_EXTENSION_MAXNAMELEN } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (en *ExtensionName) MarshalBytes(dst []byte) []byte { for idx := 0; idx < XT_EXTENSION_MAXNAMELEN; idx++ { dst[0] = byte(en[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (en *ExtensionName) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < XT_EXTENSION_MAXNAMELEN; idx++ { en[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (en *ExtensionName) Packed() bool { // Array newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (en *ExtensionName) MarshalUnsafe(dst []byte) []byte { size := en.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&en[0]), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (en *ExtensionName) UnmarshalUnsafe(src []byte) []byte { size := en.SizeBytes() gohacks.Memmove(unsafe.Pointer(en), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (en *ExtensionName) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(en))) hdr.Len = en.SizeBytes() hdr.Cap = en.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that en // must live until the use above. runtime.KeepAlive(en) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (en *ExtensionName) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return en.CopyOutN(cc, addr, en.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (en *ExtensionName) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(en))) hdr.Len = en.SizeBytes() hdr.Cap = en.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that en // must live until the use above. runtime.KeepAlive(en) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (en *ExtensionName) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return en.CopyInN(cc, addr, en.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (en *ExtensionName) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(en))) hdr.Len = en.SizeBytes() hdr.Cap = en.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that en // must live until the use above. runtime.KeepAlive(en) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IPTEntry) SizeBytes() int { return 12 + (*IPTIP)(nil).SizeBytes() + (*XTCounters)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IPTEntry) MarshalBytes(dst []byte) []byte { dst = i.IP.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.NFCache)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.TargetOffset)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.NextOffset)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Comeback)) dst = dst[4:] dst = i.Counters.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
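// Illustrative sketch (not part of the generated file): ExtensionName is a
// fixed-size byte array, so a Go string can be loaded with copy before
// marshalling. exampleExtensionNameFromString is hypothetical; it assumes the
// usual NUL-padded convention and silently truncates names longer than
// XT_EXTENSION_MAXNAMELEN-1 bytes.
func exampleExtensionNameFromString(s string) ExtensionName {
	var en ExtensionName
	// copy stops at the shorter operand and leaves the remainder zeroed,
	// keeping at least one trailing NUL byte.
	copy(en[:len(en)-1], s)
	return en
}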
func (i *IPTEntry) UnmarshalBytes(src []byte) []byte { src = i.IP.UnmarshalUnsafe(src) i.NFCache = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.TargetOffset = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.NextOffset = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.Comeback = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = i.Counters.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IPTEntry) Packed() bool { return i.Counters.Packed() && i.IP.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IPTEntry) MarshalUnsafe(dst []byte) []byte { if i.Counters.Packed() && i.IP.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type IPTEntry doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IPTEntry) UnmarshalUnsafe(src []byte) []byte { if i.Counters.Packed() && i.IP.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IPTEntry doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IPTEntry) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Counters.Packed() && i.IP.Packed() { // Type IPTEntry doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IPTEntry) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IPTEntry) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Counters.Packed() && i.IP.Packed() { // Type IPTEntry doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. 
return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IPTEntry) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IPTEntry) WriteTo(writer io.Writer) (int64, error) { if !i.Counters.Packed() && i.IP.Packed() { // Type IPTEntry doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IPTGetEntries) SizeBytes() int { return 4 + (*TableName)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IPTGetEntries) MarshalBytes(dst []byte) []byte { dst = i.Name.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Size)) dst = dst[4:] // Padding: dst[:sizeof(byte)*4] ~= [4]byte{0} dst = dst[1*(4):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IPTGetEntries) UnmarshalBytes(src []byte) []byte { src = i.Name.UnmarshalUnsafe(src) i.Size = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: ~ copy([4]byte(i._), src[:sizeof(byte)*4]) src = src[1*(4):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IPTGetEntries) Packed() bool { return i.Name.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IPTGetEntries) MarshalUnsafe(dst []byte) []byte { if i.Name.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type IPTGetEntries doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IPTGetEntries) UnmarshalUnsafe(src []byte) []byte { if i.Name.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IPTGetEntries doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IPTGetEntries) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Name.Packed() { // Type IPTGetEntries doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. 
return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IPTGetEntries) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IPTGetEntries) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Name.Packed() { // Type IPTGetEntries doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IPTGetEntries) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IPTGetEntries) WriteTo(writer io.Writer) (int64, error) { if !i.Name.Packed() { // Type IPTGetEntries doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IPTGetinfo) SizeBytes() int { return 12 + (*TableName)(nil).SizeBytes() + 4*NF_INET_NUMHOOKS + 4*NF_INET_NUMHOOKS } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IPTGetinfo) MarshalBytes(dst []byte) []byte { dst = i.Name.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.ValidHooks)) dst = dst[4:] for idx := 0; idx < NF_INET_NUMHOOKS; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.HookEntry[idx])) dst = dst[4:] } for idx := 0; idx < NF_INET_NUMHOOKS; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Underflow[idx])) dst = dst[4:] } hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.NumEntries)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Size)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
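// Illustrative sketch (not part of the generated file): IPTGetinfo has a
// fixed wire size, so SizeBytes can size an output buffer up front. The
// helper below is hypothetical and only exercises methods generated in this
// file.
func exampleMarshalIPTGetinfo(info *IPTGetinfo) []byte {
	// 12 scalar bytes (ValidHooks, NumEntries, Size) plus the table name
	// and two NF_INET_NUMHOOKS-sized uint32 arrays, per SizeBytes above.
	buf := make([]byte, info.SizeBytes())
	info.MarshalUnsafe(buf) // single Memmove when Name is packed
	return buf
}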
func (i *IPTGetinfo) UnmarshalBytes(src []byte) []byte { src = i.Name.UnmarshalUnsafe(src) i.ValidHooks = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < NF_INET_NUMHOOKS; idx++ { i.HookEntry[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } for idx := 0; idx < NF_INET_NUMHOOKS; idx++ { i.Underflow[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } i.NumEntries = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Size = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IPTGetinfo) Packed() bool { return i.Name.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IPTGetinfo) MarshalUnsafe(dst []byte) []byte { if i.Name.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type IPTGetinfo doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IPTGetinfo) UnmarshalUnsafe(src []byte) []byte { if i.Name.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IPTGetinfo doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IPTGetinfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Name.Packed() { // Type IPTGetinfo doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IPTGetinfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IPTGetinfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Name.Packed() { // Type IPTGetinfo doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. 
return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IPTGetinfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IPTGetinfo) WriteTo(writer io.Writer) (int64, error) { if !i.Name.Packed() { // Type IPTGetinfo doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IPTIP) SizeBytes() int { return 4 + (*InetAddr)(nil).SizeBytes() + (*InetAddr)(nil).SizeBytes() + (*InetAddr)(nil).SizeBytes() + (*InetAddr)(nil).SizeBytes() + 1*IFNAMSIZ + 1*IFNAMSIZ + 1*IFNAMSIZ + 1*IFNAMSIZ } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IPTIP) MarshalBytes(dst []byte) []byte { dst = i.Src.MarshalUnsafe(dst) dst = i.Dst.MarshalUnsafe(dst) dst = i.SrcMask.MarshalUnsafe(dst) dst = i.DstMask.MarshalUnsafe(dst) for idx := 0; idx < IFNAMSIZ; idx++ { dst[0] = byte(i.InputInterface[idx]) dst = dst[1:] } for idx := 0; idx < IFNAMSIZ; idx++ { dst[0] = byte(i.OutputInterface[idx]) dst = dst[1:] } for idx := 0; idx < IFNAMSIZ; idx++ { dst[0] = byte(i.InputInterfaceMask[idx]) dst = dst[1:] } for idx := 0; idx < IFNAMSIZ; idx++ { dst[0] = byte(i.OutputInterfaceMask[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.Protocol)) dst = dst[2:] dst[0] = byte(i.Flags) dst = dst[1:] dst[0] = byte(i.InverseFlags) dst = dst[1:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IPTIP) UnmarshalBytes(src []byte) []byte { src = i.Src.UnmarshalUnsafe(src) src = i.Dst.UnmarshalUnsafe(src) src = i.SrcMask.UnmarshalUnsafe(src) src = i.DstMask.UnmarshalUnsafe(src) for idx := 0; idx < IFNAMSIZ; idx++ { i.InputInterface[idx] = src[0] src = src[1:] } for idx := 0; idx < IFNAMSIZ; idx++ { i.OutputInterface[idx] = src[0] src = src[1:] } for idx := 0; idx < IFNAMSIZ; idx++ { i.InputInterfaceMask[idx] = src[0] src = src[1:] } for idx := 0; idx < IFNAMSIZ; idx++ { i.OutputInterfaceMask[idx] = src[0] src = src[1:] } i.Protocol = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.Flags = uint8(src[0]) src = src[1:] i.InverseFlags = uint8(src[0]) src = src[1:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IPTIP) Packed() bool { return i.Dst.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.SrcMask.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IPTIP) MarshalUnsafe(dst []byte) []byte { if i.Dst.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.SrcMask.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type IPTIP doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
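// Illustrative sketch (not part of the generated file): MarshalBytes and
// UnmarshalBytes above are inverses for IPTIP. The hypothetical check below
// makes that explicit with reflect.DeepEqual; reflect is already imported by
// this file for SliceHeader.
func exampleIPTIPRoundTrips(in *IPTIP) bool {
	buf := make([]byte, in.SizeBytes())
	in.MarshalBytes(buf)
	var out IPTIP
	out.UnmarshalBytes(buf)
	// Every field of IPTIP is marshalled, so a round trip must reproduce
	// the original value exactly.
	return reflect.DeepEqual(*in, out)
}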
func (i *IPTIP) UnmarshalUnsafe(src []byte) []byte { if i.Dst.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.SrcMask.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IPTIP doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IPTIP) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Dst.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.SrcMask.Packed() { // Type IPTIP doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IPTIP) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IPTIP) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Dst.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.SrcMask.Packed() { // Type IPTIP doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IPTIP) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IPTIP) WriteTo(writer io.Writer) (int64, error) { if !i.Dst.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.SrcMask.Packed() { // Type IPTIP doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. 
runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IPTOwnerInfo) SizeBytes() int { return 18 + 1*16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IPTOwnerInfo) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.UID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.GID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.PID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.SID)) dst = dst[4:] for idx := 0; idx < 16; idx++ { dst[0] = byte(i.Comm[idx]) dst = dst[1:] } dst[0] = byte(i.Match) dst = dst[1:] dst[0] = byte(i.Invert) dst = dst[1:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IPTOwnerInfo) UnmarshalBytes(src []byte) []byte { i.UID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.GID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.PID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.SID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 16; idx++ { i.Comm[idx] = src[0] src = src[1:] } i.Match = uint8(src[0]) src = src[1:] i.Invert = uint8(src[0]) src = src[1:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IPTOwnerInfo) Packed() bool { return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IPTOwnerInfo) MarshalUnsafe(dst []byte) []byte { // Type IPTOwnerInfo doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IPTOwnerInfo) UnmarshalUnsafe(src []byte) []byte { // Type IPTOwnerInfo doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IPTOwnerInfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type IPTOwnerInfo doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IPTOwnerInfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IPTOwnerInfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type IPTOwnerInfo doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IPTOwnerInfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IPTOwnerInfo) WriteTo(writer io.Writer) (int64, error) { // Type IPTOwnerInfo doesn't have a packed layout in memory, fall back to MarshalBytes. 
buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IPTReplace) SizeBytes() int { return 24 + (*TableName)(nil).SizeBytes() + 4*NF_INET_NUMHOOKS + 4*NF_INET_NUMHOOKS } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IPTReplace) MarshalBytes(dst []byte) []byte { dst = i.Name.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.ValidHooks)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.NumEntries)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Size)) dst = dst[4:] for idx := 0; idx < NF_INET_NUMHOOKS; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.HookEntry[idx])) dst = dst[4:] } for idx := 0; idx < NF_INET_NUMHOOKS; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Underflow[idx])) dst = dst[4:] } hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.NumCounters)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Counters)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IPTReplace) UnmarshalBytes(src []byte) []byte { src = i.Name.UnmarshalUnsafe(src) i.ValidHooks = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.NumEntries = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Size = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < NF_INET_NUMHOOKS; idx++ { i.HookEntry[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } for idx := 0; idx < NF_INET_NUMHOOKS; idx++ { i.Underflow[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } i.NumCounters = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Counters = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IPTReplace) Packed() bool { return i.Name.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IPTReplace) MarshalUnsafe(dst []byte) []byte { if i.Name.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type IPTReplace doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IPTReplace) UnmarshalUnsafe(src []byte) []byte { if i.Name.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IPTReplace doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IPTReplace) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Name.Packed() { // Type IPTReplace doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. 
runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IPTReplace) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IPTReplace) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Name.Packed() { // Type IPTReplace doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IPTReplace) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IPTReplace) WriteTo(writer io.Writer) (int64, error) { if !i.Name.Packed() { // Type IPTReplace doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (ke *KernelIPTEntry) Packed() bool { // Type KernelIPTEntry is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (ke *KernelIPTEntry) MarshalUnsafe(dst []byte) []byte { // Type KernelIPTEntry doesn't have a packed layout in memory, fallback to MarshalBytes. return ke.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (ke *KernelIPTEntry) UnmarshalUnsafe(src []byte) []byte { // Type KernelIPTEntry doesn't have a packed layout in memory, fallback to UnmarshalBytes. return ke.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (ke *KernelIPTEntry) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type KernelIPTEntry doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. ke.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. 
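// Illustrative sketch (not part of the generated file): dynamic types such as
// KernelIPTEntry report Packed() == false, so the methods above always fall
// back to MarshalBytes. A caller that only depends on the
// marshal.Marshallable interface does not need to distinguish the two cases;
// exampleMarshalAny is a hypothetical helper that works for both.
func exampleMarshalAny(m marshal.Marshallable) []byte {
	buf := make([]byte, m.SizeBytes())
	// MarshalUnsafe is a single Memmove for packed types and a
	// field-by-field MarshalBytes for dynamic ones.
	m.MarshalUnsafe(buf)
	return buf
}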
func (ke *KernelIPTEntry) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return ke.CopyOutN(cc, addr, ke.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (ke *KernelIPTEntry) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type KernelIPTEntry doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. ke.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (ke *KernelIPTEntry) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return ke.CopyInN(cc, addr, ke.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (ke *KernelIPTEntry) WriteTo(writer io.Writer) (int64, error) { // Type KernelIPTEntry doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, ke.SizeBytes()) ke.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (ke *KernelIPTGetEntries) Packed() bool { // Type KernelIPTGetEntries is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (ke *KernelIPTGetEntries) MarshalUnsafe(dst []byte) []byte { // Type KernelIPTGetEntries doesn't have a packed layout in memory, fallback to MarshalBytes. return ke.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (ke *KernelIPTGetEntries) UnmarshalUnsafe(src []byte) []byte { // Type KernelIPTGetEntries doesn't have a packed layout in memory, fallback to UnmarshalBytes. return ke.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (ke *KernelIPTGetEntries) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type KernelIPTGetEntries doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. ke.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (ke *KernelIPTGetEntries) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return ke.CopyOutN(cc, addr, ke.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (ke *KernelIPTGetEntries) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type KernelIPTGetEntries doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. ke.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (ke *KernelIPTGetEntries) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return ke.CopyInN(cc, addr, ke.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. 
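// Illustrative sketch (not part of the generated file): CopyOut is simply
// CopyOutN with limit equal to SizeBytes. Code that already holds a
// marshal.CopyContext and a target address (cc and addr are assumed to come
// from the surrounding syscall handling) can copy out only a prefix of the
// marshalled form by passing a smaller limit, as in this hypothetical helper.
func exampleCopyOutPrefix(cc marshal.CopyContext, addr hostarch.Addr, ke *KernelIPTGetEntries, limit int) (int, error) {
	if limit > ke.SizeBytes() {
		limit = ke.SizeBytes()
	}
	// Only the first limit bytes of the marshalled form reach addr.
	return ke.CopyOutN(cc, addr, limit)
}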
func (ke *KernelIPTGetEntries) WriteTo(writer io.Writer) (int64, error) { // Type KernelIPTGetEntries doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, ke.SizeBytes()) ke.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NfNATIPV4MultiRangeCompat) SizeBytes() int { return 4 + (*NfNATIPV4Range)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NfNATIPV4MultiRangeCompat) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.RangeSize)) dst = dst[4:] dst = n.RangeIPV4.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NfNATIPV4MultiRangeCompat) UnmarshalBytes(src []byte) []byte { n.RangeSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.RangeIPV4.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NfNATIPV4MultiRangeCompat) Packed() bool { return n.RangeIPV4.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NfNATIPV4MultiRangeCompat) MarshalUnsafe(dst []byte) []byte { if n.RangeIPV4.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NfNATIPV4MultiRangeCompat doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NfNATIPV4MultiRangeCompat) UnmarshalUnsafe(src []byte) []byte { if n.RangeIPV4.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NfNATIPV4MultiRangeCompat doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NfNATIPV4MultiRangeCompat) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.RangeIPV4.Packed() { // Type NfNATIPV4MultiRangeCompat doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NfNATIPV4MultiRangeCompat) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NfNATIPV4MultiRangeCompat) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.RangeIPV4.Packed() { // Type NfNATIPV4MultiRangeCompat doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. 
If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NfNATIPV4MultiRangeCompat) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NfNATIPV4MultiRangeCompat) WriteTo(writer io.Writer) (int64, error) { if !n.RangeIPV4.Packed() { // Type NfNATIPV4MultiRangeCompat doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NfNATIPV4Range) SizeBytes() int { return 8 + 1*4 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NfNATIPV4Range) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(n.MinIP[idx]) dst = dst[1:] } for idx := 0; idx < 4; idx++ { dst[0] = byte(n.MaxIP[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.MinPort)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.MaxPort)) dst = dst[2:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NfNATIPV4Range) UnmarshalBytes(src []byte) []byte { n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { n.MinIP[idx] = src[0] src = src[1:] } for idx := 0; idx < 4; idx++ { n.MaxIP[idx] = src[0] src = src[1:] } n.MinPort = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] n.MaxPort = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NfNATIPV4Range) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NfNATIPV4Range) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NfNATIPV4Range) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NfNATIPV4Range) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NfNATIPV4Range) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NfNATIPV4Range) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NfNATIPV4Range) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NfNATIPV4Range) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (tn *TableName) SizeBytes() int { return 1 * XT_TABLE_MAXNAMELEN } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (tn *TableName) MarshalBytes(dst []byte) []byte { for idx := 0; idx < XT_TABLE_MAXNAMELEN; idx++ { dst[0] = byte(tn[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (tn *TableName) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < XT_TABLE_MAXNAMELEN; idx++ { tn[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (tn *TableName) Packed() bool { // Array newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (tn *TableName) MarshalUnsafe(dst []byte) []byte { size := tn.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&tn[0]), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (tn *TableName) UnmarshalUnsafe(src []byte) []byte { size := tn.SizeBytes() gohacks.Memmove(unsafe.Pointer(tn), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (tn *TableName) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(tn))) hdr.Len = tn.SizeBytes() hdr.Cap = tn.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that tn // must live until the use above. runtime.KeepAlive(tn) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (tn *TableName) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return tn.CopyOutN(cc, addr, tn.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (tn *TableName) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(tn))) hdr.Len = tn.SizeBytes() hdr.Cap = tn.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that tn // must live until the use above. runtime.KeepAlive(tn) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (tn *TableName) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return tn.CopyInN(cc, addr, tn.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (tn *TableName) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(tn))) hdr.Len = tn.SizeBytes() hdr.Cap = tn.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that tn // must live until the use above. runtime.KeepAlive(tn) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (x *XTCounters) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (x *XTCounters) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(x.Pcnt)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(x.Bcnt)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTCounters) UnmarshalBytes(src []byte) []byte { x.Pcnt = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] x.Bcnt = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTCounters) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (x *XTCounters) MarshalUnsafe(dst []byte) []byte { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (x *XTCounters) UnmarshalUnsafe(src []byte) []byte { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTCounters) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTCounters) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (x *XTCounters) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTCounters) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTCounters) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (x *XTEntryMatch) SizeBytes() int { return 3 + (*ExtensionName)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (x *XTEntryMatch) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(x.MatchSize)) dst = dst[2:] dst = x.Name.MarshalUnsafe(dst) dst[0] = byte(x.Revision) dst = dst[1:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTEntryMatch) UnmarshalBytes(src []byte) []byte { x.MatchSize = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] src = x.Name.UnmarshalUnsafe(src) x.Revision = uint8(src[0]) src = src[1:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTEntryMatch) Packed() bool { return x.Name.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (x *XTEntryMatch) MarshalUnsafe(dst []byte) []byte { if x.Name.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // Type XTEntryMatch doesn't have a packed layout in memory, fallback to MarshalBytes. return x.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (x *XTEntryMatch) UnmarshalUnsafe(src []byte) []byte { if x.Name.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type XTEntryMatch doesn't have a packed layout in memory, fallback to UnmarshalBytes. 
return x.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTEntryMatch) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Name.Packed() { // Type XTEntryMatch doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. x.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTEntryMatch) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (x *XTEntryMatch) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Name.Packed() { // Type XTEntryMatch doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. x.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTEntryMatch) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTEntryMatch) WriteTo(writer io.Writer) (int64, error) { if !x.Name.Packed() { // Type XTEntryMatch doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, x.SizeBytes()) x.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (x *XTEntryTarget) SizeBytes() int { return 3 + (*ExtensionName)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
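// Illustrative sketch (not part of the generated file): XTEntryTarget's
// SizeBytes composes 2 bytes of TargetSize, the extension name, and 1 byte of
// Revision, i.e. 3 + (*ExtensionName)(nil).SizeBytes(). The hypothetical
// check below restates that composition.
func exampleXTEntryTargetSize() bool {
	const scalarBytes = 2 + 1 // TargetSize uint16 + Revision uint8
	want := scalarBytes + (*ExtensionName)(nil).SizeBytes()
	return (*XTEntryTarget)(nil).SizeBytes() == want
}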
func (x *XTEntryTarget) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(x.TargetSize)) dst = dst[2:] dst = x.Name.MarshalUnsafe(dst) dst[0] = byte(x.Revision) dst = dst[1:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTEntryTarget) UnmarshalBytes(src []byte) []byte { x.TargetSize = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] src = x.Name.UnmarshalUnsafe(src) x.Revision = uint8(src[0]) src = src[1:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTEntryTarget) Packed() bool { return x.Name.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (x *XTEntryTarget) MarshalUnsafe(dst []byte) []byte { if x.Name.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // Type XTEntryTarget doesn't have a packed layout in memory, fallback to MarshalBytes. return x.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (x *XTEntryTarget) UnmarshalUnsafe(src []byte) []byte { if x.Name.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type XTEntryTarget doesn't have a packed layout in memory, fallback to UnmarshalBytes. return x.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTEntryTarget) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Name.Packed() { // Type XTEntryTarget doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. x.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTEntryTarget) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (x *XTEntryTarget) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Name.Packed() { // Type XTEntryTarget doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. x.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. 
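// Caller-side usage sketch (not generated code; assumes imports of
// gvisor.dev/gvisor/pkg/abi/linux, pkg/hostarch and pkg/marshal): CopyIn
// returns the byte count and an error, and as noted above a short copy-in
// leaves the struct only partially unmarshalled, so the value should only be
// trusted when err is nil.
//
//	func readEntryTarget(cc marshal.CopyContext, addr hostarch.Addr) (linux.XTEntryTarget, error) {
//		var t linux.XTEntryTarget
//		if _, err := t.CopyIn(cc, addr); err != nil {
//			return linux.XTEntryTarget{}, err // t may be partially populated here.
//		}
//		return t, nil
//	}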
return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTEntryTarget) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTEntryTarget) WriteTo(writer io.Writer) (int64, error) { if !x.Name.Packed() { // Type XTEntryTarget doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, x.SizeBytes()) x.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (x *XTErrorTarget) SizeBytes() int { return 0 + (*XTEntryTarget)(nil).SizeBytes() + (*ErrorName)(nil).SizeBytes() + 1*2 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (x *XTErrorTarget) MarshalBytes(dst []byte) []byte { dst = x.Target.MarshalUnsafe(dst) dst = x.Name.MarshalUnsafe(dst) // Padding: dst[:sizeof(byte)*2] ~= [2]byte{0} dst = dst[1*(2):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTErrorTarget) UnmarshalBytes(src []byte) []byte { src = x.Target.UnmarshalUnsafe(src) src = x.Name.UnmarshalUnsafe(src) // Padding: ~ copy([2]byte(x._), src[:sizeof(byte)*2]) src = src[1*(2):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTErrorTarget) Packed() bool { return x.Name.Packed() && x.Target.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (x *XTErrorTarget) MarshalUnsafe(dst []byte) []byte { if x.Name.Packed() && x.Target.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // Type XTErrorTarget doesn't have a packed layout in memory, fallback to MarshalBytes. return x.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (x *XTErrorTarget) UnmarshalUnsafe(src []byte) []byte { if x.Name.Packed() && x.Target.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type XTErrorTarget doesn't have a packed layout in memory, fallback to UnmarshalBytes. return x.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTErrorTarget) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Name.Packed() && x.Target.Packed() { // Type XTErrorTarget doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. x.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. 
runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTErrorTarget) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (x *XTErrorTarget) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Name.Packed() && x.Target.Packed() { // Type XTErrorTarget doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. x.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTErrorTarget) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTErrorTarget) WriteTo(writer io.Writer) (int64, error) { if !x.Name.Packed() && x.Target.Packed() { // Type XTErrorTarget doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, x.SizeBytes()) x.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (x *XTGetRevision) SizeBytes() int { return 1 + (*ExtensionName)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (x *XTGetRevision) MarshalBytes(dst []byte) []byte { dst = x.Name.MarshalUnsafe(dst) dst[0] = byte(x.Revision) dst = dst[1:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTGetRevision) UnmarshalBytes(src []byte) []byte { src = x.Name.UnmarshalUnsafe(src) x.Revision = uint8(src[0]) src = src[1:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTGetRevision) Packed() bool { return x.Name.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (x *XTGetRevision) MarshalUnsafe(dst []byte) []byte { if x.Name.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // Type XTGetRevision doesn't have a packed layout in memory, fallback to MarshalBytes. return x.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (x *XTGetRevision) UnmarshalUnsafe(src []byte) []byte { if x.Name.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type XTGetRevision doesn't have a packed layout in memory, fallback to UnmarshalBytes. return x.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTGetRevision) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Name.Packed() { // Type XTGetRevision doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. x.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTGetRevision) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (x *XTGetRevision) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Name.Packed() { // Type XTGetRevision doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. x.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTGetRevision) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTGetRevision) WriteTo(writer io.Writer) (int64, error) { if !x.Name.Packed() { // Type XTGetRevision doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, x.SizeBytes()) x.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. 
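// Decoding sketch (illustrative, not generated code): UnmarshalUnsafe assumes
// src holds at least SizeBytes() bytes and returns the unread tail, so a
// caller decoding from an arbitrary buffer should bounds-check first.
//
//	func parseGetRevision(src []byte) (linux.XTGetRevision, []byte, bool) {
//		var rev linux.XTGetRevision
//		if len(src) < rev.SizeBytes() {
//			return rev, src, false
//		}
//		rest := rev.UnmarshalUnsafe(src)
//		return rev, rest, true
//	}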
func (x *XTNATTargetV0) SizeBytes() int { return 0 + (*XTEntryTarget)(nil).SizeBytes() + (*NfNATIPV4MultiRangeCompat)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (x *XTNATTargetV0) MarshalBytes(dst []byte) []byte { dst = x.Target.MarshalUnsafe(dst) dst = x.NfRange.MarshalUnsafe(dst) // Padding: dst[:sizeof(byte)*4] ~= [4]byte{0} dst = dst[1*(4):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTNATTargetV0) UnmarshalBytes(src []byte) []byte { src = x.Target.UnmarshalUnsafe(src) src = x.NfRange.UnmarshalUnsafe(src) // Padding: ~ copy([4]byte(x._), src[:sizeof(byte)*4]) src = src[1*(4):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTNATTargetV0) Packed() bool { return x.NfRange.Packed() && x.Target.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (x *XTNATTargetV0) MarshalUnsafe(dst []byte) []byte { if x.NfRange.Packed() && x.Target.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // Type XTNATTargetV0 doesn't have a packed layout in memory, fallback to MarshalBytes. return x.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (x *XTNATTargetV0) UnmarshalUnsafe(src []byte) []byte { if x.NfRange.Packed() && x.Target.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type XTNATTargetV0 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return x.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTNATTargetV0) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.NfRange.Packed() && x.Target.Packed() { // Type XTNATTargetV0 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. x.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTNATTargetV0) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (x *XTNATTargetV0) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.NfRange.Packed() && x.Target.Packed() { // Type XTNATTargetV0 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. x.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTNATTargetV0) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTNATTargetV0) WriteTo(writer io.Writer) (int64, error) { if !x.NfRange.Packed() && x.Target.Packed() { // Type XTNATTargetV0 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, x.SizeBytes()) x.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (x *XTNATTargetV1) SizeBytes() int { return 0 + (*XTEntryTarget)(nil).SizeBytes() + (*NFNATRange)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (x *XTNATTargetV1) MarshalBytes(dst []byte) []byte { dst = x.Target.MarshalUnsafe(dst) dst = x.Range.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTNATTargetV1) UnmarshalBytes(src []byte) []byte { src = x.Target.UnmarshalUnsafe(src) src = x.Range.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTNATTargetV1) Packed() bool { return x.Range.Packed() && x.Target.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (x *XTNATTargetV1) MarshalUnsafe(dst []byte) []byte { if x.Range.Packed() && x.Target.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // Type XTNATTargetV1 doesn't have a packed layout in memory, fallback to MarshalBytes. return x.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (x *XTNATTargetV1) UnmarshalUnsafe(src []byte) []byte { if x.Range.Packed() && x.Target.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type XTNATTargetV1 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return x.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTNATTargetV1) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Range.Packed() && x.Target.Packed() { // Type XTNATTargetV1 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. x.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTNATTargetV1) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (x *XTNATTargetV1) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Range.Packed() && x.Target.Packed() { // Type XTNATTargetV1 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. x.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTNATTargetV1) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTNATTargetV1) WriteTo(writer io.Writer) (int64, error) { if !x.Range.Packed() && x.Target.Packed() { // Type XTNATTargetV1 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, x.SizeBytes()) x.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (x *XTNATTargetV2) SizeBytes() int { return 0 + (*XTEntryTarget)(nil).SizeBytes() + (*NFNATRange2)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (x *XTNATTargetV2) MarshalBytes(dst []byte) []byte { dst = x.Target.MarshalUnsafe(dst) dst = x.Range.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTNATTargetV2) UnmarshalBytes(src []byte) []byte { src = x.Target.UnmarshalUnsafe(src) src = x.Range.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTNATTargetV2) Packed() bool { return x.Range.Packed() && x.Target.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (x *XTNATTargetV2) MarshalUnsafe(dst []byte) []byte { if x.Range.Packed() && x.Target.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // Type XTNATTargetV2 doesn't have a packed layout in memory, fallback to MarshalBytes. return x.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (x *XTNATTargetV2) UnmarshalUnsafe(src []byte) []byte { if x.Range.Packed() && x.Target.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type XTNATTargetV2 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return x.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTNATTargetV2) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Range.Packed() && x.Target.Packed() { // Type XTNATTargetV2 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. x.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTNATTargetV2) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (x *XTNATTargetV2) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Range.Packed() && x.Target.Packed() { // Type XTNATTargetV2 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. x.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTNATTargetV2) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTNATTargetV2) WriteTo(writer io.Writer) (int64, error) { if !x.Range.Packed() && x.Target.Packed() { // Type XTNATTargetV2 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, x.SizeBytes()) x.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (x *XTOwnerMatchInfo) SizeBytes() int { return 18 + 1*2 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (x *XTOwnerMatchInfo) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(x.UIDMin)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(x.UIDMax)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(x.GIDMin)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(x.GIDMax)) dst = dst[4:] dst[0] = byte(x.Match) dst = dst[1:] dst[0] = byte(x.Invert) dst = dst[1:] // Padding: dst[:sizeof(byte)*2] ~= [2]byte{0} dst = dst[1*(2):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTOwnerMatchInfo) UnmarshalBytes(src []byte) []byte { x.UIDMin = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] x.UIDMax = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] x.GIDMin = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] x.GIDMax = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] x.Match = uint8(src[0]) src = src[1:] x.Invert = uint8(src[0]) src = src[1:] // Padding: ~ copy([2]byte(x._), src[:sizeof(byte)*2]) src = src[1*(2):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTOwnerMatchInfo) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (x *XTOwnerMatchInfo) MarshalUnsafe(dst []byte) []byte { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (x *XTOwnerMatchInfo) UnmarshalUnsafe(src []byte) []byte { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTOwnerMatchInfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTOwnerMatchInfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (x *XTOwnerMatchInfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTOwnerMatchInfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTOwnerMatchInfo) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (x *XTRedirectTarget) SizeBytes() int { return 0 + (*XTEntryTarget)(nil).SizeBytes() + (*NfNATIPV4MultiRangeCompat)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (x *XTRedirectTarget) MarshalBytes(dst []byte) []byte { dst = x.Target.MarshalUnsafe(dst) dst = x.NfRange.MarshalUnsafe(dst) // Padding: dst[:sizeof(byte)*4] ~= [4]byte{0} dst = dst[1*(4):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTRedirectTarget) UnmarshalBytes(src []byte) []byte { src = x.Target.UnmarshalUnsafe(src) src = x.NfRange.UnmarshalUnsafe(src) // Padding: ~ copy([4]byte(x._), src[:sizeof(byte)*4]) src = src[1*(4):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTRedirectTarget) Packed() bool { return x.NfRange.Packed() && x.Target.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (x *XTRedirectTarget) MarshalUnsafe(dst []byte) []byte { if x.NfRange.Packed() && x.Target.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // Type XTRedirectTarget doesn't have a packed layout in memory, fallback to MarshalBytes. return x.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (x *XTRedirectTarget) UnmarshalUnsafe(src []byte) []byte { if x.NfRange.Packed() && x.Target.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type XTRedirectTarget doesn't have a packed layout in memory, fallback to UnmarshalBytes. return x.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTRedirectTarget) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.NfRange.Packed() && x.Target.Packed() { // Type XTRedirectTarget doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. x.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. 
runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTRedirectTarget) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (x *XTRedirectTarget) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.NfRange.Packed() && x.Target.Packed() { // Type XTRedirectTarget doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. x.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTRedirectTarget) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTRedirectTarget) WriteTo(writer io.Writer) (int64, error) { if !x.NfRange.Packed() && x.Target.Packed() { // Type XTRedirectTarget doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, x.SizeBytes()) x.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (x *XTStandardTarget) SizeBytes() int { return 4 + (*XTEntryTarget)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (x *XTStandardTarget) MarshalBytes(dst []byte) []byte { dst = x.Target.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(x.Verdict)) dst = dst[4:] // Padding: dst[:sizeof(byte)*4] ~= [4]byte{0} dst = dst[1*(4):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTStandardTarget) UnmarshalBytes(src []byte) []byte { src = x.Target.UnmarshalUnsafe(src) x.Verdict = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: ~ copy([4]byte(x._), src[:sizeof(byte)*4]) src = src[1*(4):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTStandardTarget) Packed() bool { return x.Target.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (x *XTStandardTarget) MarshalUnsafe(dst []byte) []byte { if x.Target.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // Type XTStandardTarget doesn't have a packed layout in memory, fallback to MarshalBytes. return x.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (x *XTStandardTarget) UnmarshalUnsafe(src []byte) []byte { if x.Target.Packed() { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type XTStandardTarget doesn't have a packed layout in memory, fallback to UnmarshalBytes. return x.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTStandardTarget) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Target.Packed() { // Type XTStandardTarget doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. x.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTStandardTarget) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (x *XTStandardTarget) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !x.Target.Packed() { // Type XTStandardTarget doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(x.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. x.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTStandardTarget) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTStandardTarget) WriteTo(writer io.Writer) (int64, error) { if !x.Target.Packed() { // Type XTStandardTarget doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, x.SizeBytes()) x.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
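// Usage sketch (illustrative): the limit argument of CopyOutN selects a
// prefix of the marshalled form. Because Target is marshalled first, copying
// out only the embedded XTEntryTarget header of an XTStandardTarget could be
// written as:
//
//	func copyOutTargetHeader(cc marshal.CopyContext, addr hostarch.Addr, t *linux.XTStandardTarget) (int, error) {
//		return t.CopyOutN(cc, addr, t.Target.SizeBytes())
//	}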
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (x *XTTCP) SizeBytes() int { return 12 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (x *XTTCP) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(x.SourcePortStart)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(x.SourcePortEnd)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(x.DestinationPortStart)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(x.DestinationPortEnd)) dst = dst[2:] dst[0] = byte(x.Option) dst = dst[1:] dst[0] = byte(x.FlagMask) dst = dst[1:] dst[0] = byte(x.FlagCompare) dst = dst[1:] dst[0] = byte(x.InverseFlags) dst = dst[1:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTTCP) UnmarshalBytes(src []byte) []byte { x.SourcePortStart = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] x.SourcePortEnd = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] x.DestinationPortStart = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] x.DestinationPortEnd = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] x.Option = uint8(src[0]) src = src[1:] x.FlagMask = uint8(src[0]) src = src[1:] x.FlagCompare = uint8(src[0]) src = src[1:] x.InverseFlags = uint8(src[0]) src = src[1:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTTCP) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (x *XTTCP) MarshalUnsafe(dst []byte) []byte { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (x *XTTCP) UnmarshalUnsafe(src []byte) []byte { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTTCP) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTTCP) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (x *XTTCP) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
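// Encoding sketch (illustrative): XTTCP is packed, so MarshalUnsafe encodes
// it with a single memmove and returns the unused tail of dst; MarshalBytes
// would produce the same 12 host-order bytes field by field.
//
//	func encodeTCPMatch(m *linux.XTTCP) []byte {
//		dst := make([]byte, m.SizeBytes())
//		_ = m.MarshalUnsafe(dst) // returns dst[m.SizeBytes():], empty here.
//		return dst
//	}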
// Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTTCP) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTTCP) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (x *XTUDP) SizeBytes() int { return 10 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (x *XTUDP) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(x.SourcePortStart)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(x.SourcePortEnd)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(x.DestinationPortStart)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(x.DestinationPortEnd)) dst = dst[2:] dst[0] = byte(x.InverseFlags) dst = dst[1:] // Padding: dst[:sizeof(uint8)] ~= uint8(0) dst = dst[1:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (x *XTUDP) UnmarshalBytes(src []byte) []byte { x.SourcePortStart = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] x.SourcePortEnd = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] x.DestinationPortStart = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] x.DestinationPortEnd = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] x.InverseFlags = uint8(src[0]) src = src[1:] // Padding: var _ uint8 ~= src[:sizeof(uint8)] src = src[1:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (x *XTUDP) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (x *XTUDP) MarshalUnsafe(dst []byte) []byte { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(x), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (x *XTUDP) UnmarshalUnsafe(src []byte) []byte { size := x.SizeBytes() gohacks.Memmove(unsafe.Pointer(x), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (x *XTUDP) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (x *XTUDP) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyOutN(cc, addr, x.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (x *XTUDP) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (x *XTUDP) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return x.CopyInN(cc, addr, x.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (x *XTUDP) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(x))) hdr.Len = x.SizeBytes() hdr.Cap = x.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that x // must live until the use above. runtime.KeepAlive(x) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IP6TEntry) SizeBytes() int { return 12 + (*IP6TIP)(nil).SizeBytes() + 1*4 + (*XTCounters)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IP6TEntry) MarshalBytes(dst []byte) []byte { dst = i.IPv6.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.NFCache)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.TargetOffset)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.NextOffset)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Comeback)) dst = dst[4:] // Padding: dst[:sizeof(byte)*4] ~= [4]byte{0} dst = dst[1*(4):] dst = i.Counters.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IP6TEntry) UnmarshalBytes(src []byte) []byte { src = i.IPv6.UnmarshalUnsafe(src) i.NFCache = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.TargetOffset = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.NextOffset = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.Comeback = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: ~ copy([4]byte(i._), src[:sizeof(byte)*4]) src = src[1*(4):] src = i.Counters.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IP6TEntry) Packed() bool { return i.Counters.Packed() && i.IPv6.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IP6TEntry) MarshalUnsafe(dst []byte) []byte { if i.Counters.Packed() && i.IPv6.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type IP6TEntry doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IP6TEntry) UnmarshalUnsafe(src []byte) []byte { if i.Counters.Packed() && i.IPv6.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IP6TEntry doesn't have a packed layout in memory, fallback to UnmarshalBytes. 
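// Usage sketch (illustrative; assumes the bytes package): WriteTo accepts any
// io.Writer, so writing an XTUDP into a bytes.Buffer yields exactly
// SizeBytes() bytes, the same encoding MarshalBytes produces.
//
//	func udpMatchBytes(m *linux.XTUDP) ([]byte, error) {
//		var buf bytes.Buffer
//		if _, err := m.WriteTo(&buf); err != nil {
//			return nil, err
//		}
//		return buf.Bytes(), nil // len == m.SizeBytes() == 10
//	}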
return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IP6TEntry) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Counters.Packed() && i.IPv6.Packed() { // Type IP6TEntry doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IP6TEntry) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IP6TEntry) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Counters.Packed() && i.IPv6.Packed() { // Type IP6TEntry doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IP6TEntry) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IP6TEntry) WriteTo(writer io.Writer) (int64, error) { if !i.Counters.Packed() && i.IPv6.Packed() { // Type IP6TEntry doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IP6TIP) SizeBytes() int { return 5 + (*Inet6Addr)(nil).SizeBytes() + (*Inet6Addr)(nil).SizeBytes() + (*Inet6Addr)(nil).SizeBytes() + (*Inet6Addr)(nil).SizeBytes() + 1*IFNAMSIZ + 1*IFNAMSIZ + 1*IFNAMSIZ + 1*IFNAMSIZ + 1*3 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (i *IP6TIP) MarshalBytes(dst []byte) []byte { dst = i.Src.MarshalUnsafe(dst) dst = i.Dst.MarshalUnsafe(dst) dst = i.SrcMask.MarshalUnsafe(dst) dst = i.DstMask.MarshalUnsafe(dst) for idx := 0; idx < IFNAMSIZ; idx++ { dst[0] = byte(i.InputInterface[idx]) dst = dst[1:] } for idx := 0; idx < IFNAMSIZ; idx++ { dst[0] = byte(i.OutputInterface[idx]) dst = dst[1:] } for idx := 0; idx < IFNAMSIZ; idx++ { dst[0] = byte(i.InputInterfaceMask[idx]) dst = dst[1:] } for idx := 0; idx < IFNAMSIZ; idx++ { dst[0] = byte(i.OutputInterfaceMask[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.Protocol)) dst = dst[2:] dst[0] = byte(i.TOS) dst = dst[1:] dst[0] = byte(i.Flags) dst = dst[1:] dst[0] = byte(i.InverseFlags) dst = dst[1:] // Padding: dst[:sizeof(byte)*3] ~= [3]byte{0} dst = dst[1*(3):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IP6TIP) UnmarshalBytes(src []byte) []byte { src = i.Src.UnmarshalUnsafe(src) src = i.Dst.UnmarshalUnsafe(src) src = i.SrcMask.UnmarshalUnsafe(src) src = i.DstMask.UnmarshalUnsafe(src) for idx := 0; idx < IFNAMSIZ; idx++ { i.InputInterface[idx] = src[0] src = src[1:] } for idx := 0; idx < IFNAMSIZ; idx++ { i.OutputInterface[idx] = src[0] src = src[1:] } for idx := 0; idx < IFNAMSIZ; idx++ { i.InputInterfaceMask[idx] = src[0] src = src[1:] } for idx := 0; idx < IFNAMSIZ; idx++ { i.OutputInterfaceMask[idx] = src[0] src = src[1:] } i.Protocol = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.TOS = uint8(src[0]) src = src[1:] i.Flags = uint8(src[0]) src = src[1:] i.InverseFlags = uint8(src[0]) src = src[1:] // Padding: ~ copy([3]byte(i._), src[:sizeof(byte)*3]) src = src[1*(3):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IP6TIP) Packed() bool { return i.Dst.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.SrcMask.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IP6TIP) MarshalUnsafe(dst []byte) []byte { if i.Dst.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.SrcMask.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type IP6TIP doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IP6TIP) UnmarshalUnsafe(src []byte) []byte { if i.Dst.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.SrcMask.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IP6TIP doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IP6TIP) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Dst.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.SrcMask.Packed() { // Type IP6TIP doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. 
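// Sketch (illustrative): for types with nested marshallable fields, Packed()
// is the conjunction of the fields' Packed() results, and MarshalUnsafe
// already branches on it internally, so callers may invoke it
// unconditionally; the explicit equivalent is:
//
//	func encodeIP6TIP(ip *linux.IP6TIP, dst []byte) []byte {
//		if ip.Packed() {
//			return ip.MarshalUnsafe(dst) // one memmove of SizeBytes() bytes
//		}
//		return ip.MarshalBytes(dst) // field-by-field fallback
//	}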
// Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IP6TIP) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IP6TIP) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Dst.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.SrcMask.Packed() { // Type IP6TIP doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IP6TIP) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IP6TIP) WriteTo(writer io.Writer) (int64, error) { if !i.Dst.Packed() && i.DstMask.Packed() && i.Src.Packed() && i.SrcMask.Packed() { // Type IP6TIP doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IP6TReplace) SizeBytes() int { return 24 + (*TableName)(nil).SizeBytes() + 4*NF_INET_NUMHOOKS + 4*NF_INET_NUMHOOKS } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IP6TReplace) MarshalBytes(dst []byte) []byte { dst = i.Name.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.ValidHooks)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.NumEntries)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Size)) dst = dst[4:] for idx := 0; idx < NF_INET_NUMHOOKS; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.HookEntry[idx])) dst = dst[4:] } for idx := 0; idx < NF_INET_NUMHOOKS; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Underflow[idx])) dst = dst[4:] } hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.NumCounters)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Counters)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (i *IP6TReplace) UnmarshalBytes(src []byte) []byte { src = i.Name.UnmarshalUnsafe(src) i.ValidHooks = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.NumEntries = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Size = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < NF_INET_NUMHOOKS; idx++ { i.HookEntry[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } for idx := 0; idx < NF_INET_NUMHOOKS; idx++ { i.Underflow[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } i.NumCounters = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Counters = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IP6TReplace) Packed() bool { return i.Name.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IP6TReplace) MarshalUnsafe(dst []byte) []byte { if i.Name.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type IP6TReplace doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IP6TReplace) UnmarshalUnsafe(src []byte) []byte { if i.Name.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IP6TReplace doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IP6TReplace) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Name.Packed() { // Type IP6TReplace doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IP6TReplace) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IP6TReplace) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Name.Packed() { // Type IP6TReplace doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
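// Illustrative note (not part of the generated code): this is the packed fast
// path. Because i.Name is packed, the user bytes are copied straight into i's
// memory with a single CopyInBytes and no intermediate buffer; the branch above
// is the slow path, which reads into a scratch buffer and then decodes field by
// field with UnmarshalBytes. Callers normally go through CopyIn rather than
// CopyInN; assuming cc is a marshal.CopyContext (for example a kernel task), a
// hedged usage sketch would be:
//
//	var replace IP6TReplace
//	if _, err := replace.CopyIn(cc, addr); err != nil {
//		// handle the failed or partial copy
//	}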
// Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IP6TReplace) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IP6TReplace) WriteTo(writer io.Writer) (int64, error) { if !i.Name.Packed() { // Type IP6TReplace doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (ke *KernelIP6TEntry) Packed() bool { // Type KernelIP6TEntry is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (ke *KernelIP6TEntry) MarshalUnsafe(dst []byte) []byte { // Type KernelIP6TEntry doesn't have a packed layout in memory, fallback to MarshalBytes. return ke.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (ke *KernelIP6TEntry) UnmarshalUnsafe(src []byte) []byte { // Type KernelIP6TEntry doesn't have a packed layout in memory, fallback to UnmarshalBytes. return ke.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (ke *KernelIP6TEntry) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type KernelIP6TEntry doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. ke.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (ke *KernelIP6TEntry) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return ke.CopyOutN(cc, addr, ke.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (ke *KernelIP6TEntry) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type KernelIP6TEntry doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. ke.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (ke *KernelIP6TEntry) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return ke.CopyInN(cc, addr, ke.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (ke *KernelIP6TEntry) WriteTo(writer io.Writer) (int64, error) { // Type KernelIP6TEntry doesn't have a packed layout in memory, fall back to MarshalBytes. 
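// Illustrative note (not part of the generated code): KernelIP6TEntry is a
// dynamic type (the generator assumes it may contain slice or string headers),
// so its Packed() above is hard-coded to false and every path, including this
// WriteTo, serializes field by field through MarshalBytes instead of copying
// the struct's raw memory.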
buf := make([]byte, ke.SizeBytes()) ke.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (ke *KernelIP6TGetEntries) Packed() bool { // Type KernelIP6TGetEntries is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (ke *KernelIP6TGetEntries) MarshalUnsafe(dst []byte) []byte { // Type KernelIP6TGetEntries doesn't have a packed layout in memory, fallback to MarshalBytes. return ke.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (ke *KernelIP6TGetEntries) UnmarshalUnsafe(src []byte) []byte { // Type KernelIP6TGetEntries doesn't have a packed layout in memory, fallback to UnmarshalBytes. return ke.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (ke *KernelIP6TGetEntries) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type KernelIP6TGetEntries doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. ke.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (ke *KernelIP6TGetEntries) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return ke.CopyOutN(cc, addr, ke.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (ke *KernelIP6TGetEntries) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type KernelIP6TGetEntries doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(ke.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. ke.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (ke *KernelIP6TGetEntries) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return ke.CopyInN(cc, addr, ke.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (ke *KernelIP6TGetEntries) WriteTo(writer io.Writer) (int64, error) { // Type KernelIP6TGetEntries doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, ke.SizeBytes()) ke.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NFNATRange) SizeBytes() int { return 8 + (*Inet6Addr)(nil).SizeBytes() + (*Inet6Addr)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NFNATRange) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] dst = n.MinAddr.MarshalUnsafe(dst) dst = n.MaxAddr.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.MinProto)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.MaxProto)) dst = dst[2:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (n *NFNATRange) UnmarshalBytes(src []byte) []byte { n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.MinAddr.UnmarshalUnsafe(src) src = n.MaxAddr.UnmarshalUnsafe(src) n.MinProto = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] n.MaxProto = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NFNATRange) Packed() bool { return n.MaxAddr.Packed() && n.MinAddr.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NFNATRange) MarshalUnsafe(dst []byte) []byte { if n.MaxAddr.Packed() && n.MinAddr.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NFNATRange doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NFNATRange) UnmarshalUnsafe(src []byte) []byte { if n.MaxAddr.Packed() && n.MinAddr.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NFNATRange doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NFNATRange) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.MaxAddr.Packed() && n.MinAddr.Packed() { // Type NFNATRange doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NFNATRange) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NFNATRange) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.MaxAddr.Packed() && n.MinAddr.Packed() { // Type NFNATRange doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
func (n *NFNATRange) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NFNATRange) WriteTo(writer io.Writer) (int64, error) { if !n.MaxAddr.Packed() && n.MinAddr.Packed() { // Type NFNATRange doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NFNATRange2) SizeBytes() int { return 10 + (*Inet6Addr)(nil).SizeBytes() + (*Inet6Addr)(nil).SizeBytes() + 1*6 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NFNATRange2) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] dst = n.MinAddr.MarshalUnsafe(dst) dst = n.MaxAddr.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.MinProto)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.MaxProto)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.BaseProto)) dst = dst[2:] // Padding: dst[:sizeof(byte)*6] ~= [6]byte{0} dst = dst[1*(6):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NFNATRange2) UnmarshalBytes(src []byte) []byte { n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.MinAddr.UnmarshalUnsafe(src) src = n.MaxAddr.UnmarshalUnsafe(src) n.MinProto = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] n.MaxProto = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] n.BaseProto = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] // Padding: ~ copy([6]byte(n._), src[:sizeof(byte)*6]) src = src[1*(6):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NFNATRange2) Packed() bool { return n.MaxAddr.Packed() && n.MinAddr.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NFNATRange2) MarshalUnsafe(dst []byte) []byte { if n.MaxAddr.Packed() && n.MinAddr.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NFNATRange2 doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NFNATRange2) UnmarshalUnsafe(src []byte) []byte { if n.MaxAddr.Packed() && n.MinAddr.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NFNATRange2 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NFNATRange2) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.MaxAddr.Packed() && n.MinAddr.Packed() { // Type NFNATRange2 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. 
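// Illustrative note (not part of the generated code): in this fallback path
// CopyScratchBuffer returns a reusable scratch buffer of SizeBytes() bytes;
// MarshalBytes then encodes n into it field by field, and only the first
// limit bytes are copied out to user memory below.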
n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NFNATRange2) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NFNATRange2) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.MaxAddr.Packed() && n.MinAddr.Packed() { // Type NFNATRange2 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NFNATRange2) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NFNATRange2) WriteTo(writer io.Writer) (int64, error) { if !n.MaxAddr.Packed() && n.MinAddr.Packed() { // Type NFNATRange2 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NetlinkAttrHeader) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NetlinkAttrHeader) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.Length)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.Type)) dst = dst[2:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NetlinkAttrHeader) UnmarshalBytes(src []byte) []byte { n.Length = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] n.Type = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] return src } // Packed implements marshal.Marshallable.Packed. 
//go:nosplit func (n *NetlinkAttrHeader) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NetlinkAttrHeader) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NetlinkAttrHeader) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NetlinkAttrHeader) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NetlinkAttrHeader) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NetlinkAttrHeader) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NetlinkAttrHeader) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NetlinkAttrHeader) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NetlinkErrorMessage) SizeBytes() int { return 4 + (*NetlinkMessageHeader)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NetlinkErrorMessage) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Error)) dst = dst[4:] dst = n.Header.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NetlinkErrorMessage) UnmarshalBytes(src []byte) []byte { n.Error = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.Header.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. 
//go:nosplit func (n *NetlinkErrorMessage) Packed() bool { return n.Header.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NetlinkErrorMessage) MarshalUnsafe(dst []byte) []byte { if n.Header.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NetlinkErrorMessage doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NetlinkErrorMessage) UnmarshalUnsafe(src []byte) []byte { if n.Header.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NetlinkErrorMessage doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NetlinkErrorMessage) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.Header.Packed() { // Type NetlinkErrorMessage doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NetlinkErrorMessage) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NetlinkErrorMessage) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.Header.Packed() { // Type NetlinkErrorMessage doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NetlinkErrorMessage) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NetlinkErrorMessage) WriteTo(writer io.Writer) (int64, error) { if !n.Header.Packed() { // Type NetlinkErrorMessage doesn't have a packed layout in memory, fall back to MarshalBytes. 
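// Illustrative note (not part of the generated code): WriteTo has no
// marshal.CopyContext to borrow a scratch buffer from, so this fallback
// allocates a fresh one. A hedged usage sketch, assuming the bytes package is
// imported and writing the message into an in-memory buffer:
//
//	var msg NetlinkErrorMessage
//	var out bytes.Buffer
//	if _, err := msg.WriteTo(&out); err != nil {
//		// handle the error
//	}
//	// out.Bytes() now holds the serialized message.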
buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NetlinkMessageHeader) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NetlinkMessageHeader) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Length)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.Type)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.Flags)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Seq)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.PortID)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NetlinkMessageHeader) UnmarshalBytes(src []byte) []byte { n.Length = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Type = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] n.Flags = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] n.Seq = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.PortID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NetlinkMessageHeader) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NetlinkMessageHeader) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NetlinkMessageHeader) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NetlinkMessageHeader) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NetlinkMessageHeader) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NetlinkMessageHeader) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NetlinkMessageHeader) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NetlinkMessageHeader) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SockAddrNetlink) SizeBytes() int { return 12 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SockAddrNetlink) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Family)) dst = dst[2:] // Padding: dst[:sizeof(uint16)] ~= uint16(0) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.PortID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Groups)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SockAddrNetlink) UnmarshalBytes(src []byte) []byte { s.Family = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] // Padding: var _ uint16 ~= src[:sizeof(uint16)] src = src[2:] s.PortID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Groups = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SockAddrNetlink) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SockAddrNetlink) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SockAddrNetlink) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SockAddrNetlink) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (s *SockAddrNetlink) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SockAddrNetlink) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SockAddrNetlink) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SockAddrNetlink) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *InterfaceAddrMessage) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *InterfaceAddrMessage) MarshalBytes(dst []byte) []byte { dst[0] = byte(i.Family) dst = dst[1:] dst[0] = byte(i.PrefixLen) dst = dst[1:] dst[0] = byte(i.Flags) dst = dst[1:] dst[0] = byte(i.Scope) dst = dst[1:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Index)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *InterfaceAddrMessage) UnmarshalBytes(src []byte) []byte { i.Family = uint8(src[0]) src = src[1:] i.PrefixLen = uint8(src[0]) src = src[1:] i.Flags = uint8(src[0]) src = src[1:] i.Scope = uint8(src[0]) src = src[1:] i.Index = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *InterfaceAddrMessage) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *InterfaceAddrMessage) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *InterfaceAddrMessage) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *InterfaceAddrMessage) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. 
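// Illustrative note (not part of the generated code): because the slice header
// above was built from a raw uintptr, the garbage collector no longer sees a
// live reference to i through buf. runtime.KeepAlive(i) below keeps i reachable
// until after CopyOutBytes has used that memory, so it cannot be freed or
// reused while the raw pointer is still in flight.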
runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *InterfaceAddrMessage) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *InterfaceAddrMessage) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *InterfaceAddrMessage) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *InterfaceAddrMessage) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *InterfaceInfoMessage) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *InterfaceInfoMessage) MarshalBytes(dst []byte) []byte { dst[0] = byte(i.Family) dst = dst[1:] // Padding: dst[:sizeof(uint8)] ~= uint8(0) dst = dst[1:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.Type)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Index)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Change)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *InterfaceInfoMessage) UnmarshalBytes(src []byte) []byte { i.Family = uint8(src[0]) src = src[1:] // Padding: var _ uint8 ~= src[:sizeof(uint8)] src = src[1:] i.Type = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.Index = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Change = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *InterfaceInfoMessage) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *InterfaceInfoMessage) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *InterfaceInfoMessage) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (i *InterfaceInfoMessage) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *InterfaceInfoMessage) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *InterfaceInfoMessage) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *InterfaceInfoMessage) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *InterfaceInfoMessage) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *RouteMessage) SizeBytes() int { return 12 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *RouteMessage) MarshalBytes(dst []byte) []byte { dst[0] = byte(r.Family) dst = dst[1:] dst[0] = byte(r.DstLen) dst = dst[1:] dst[0] = byte(r.SrcLen) dst = dst[1:] dst[0] = byte(r.TOS) dst = dst[1:] dst[0] = byte(r.Table) dst = dst[1:] dst[0] = byte(r.Protocol) dst = dst[1:] dst[0] = byte(r.Scope) dst = dst[1:] dst[0] = byte(r.Type) dst = dst[1:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(r.Flags)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *RouteMessage) UnmarshalBytes(src []byte) []byte { r.Family = uint8(src[0]) src = src[1:] r.DstLen = uint8(src[0]) src = src[1:] r.SrcLen = uint8(src[0]) src = src[1:] r.TOS = uint8(src[0]) src = src[1:] r.Table = uint8(src[0]) src = src[1:] r.Protocol = uint8(src[0]) src = src[1:] r.Scope = uint8(src[0]) src = src[1:] r.Type = uint8(src[0]) src = src[1:] r.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *RouteMessage) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (r *RouteMessage) MarshalUnsafe(dst []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(r), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *RouteMessage) UnmarshalUnsafe(src []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(r), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (r *RouteMessage) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (r *RouteMessage) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (r *RouteMessage) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *RouteMessage) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *RouteMessage) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *RtAttr) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *RtAttr) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(r.Len)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(r.Type)) dst = dst[2:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *RtAttr) UnmarshalBytes(src []byte) []byte { r.Len = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] r.Type = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *RtAttr) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (r *RtAttr) MarshalUnsafe(dst []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(r), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *RtAttr) UnmarshalUnsafe(src []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(r), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (r *RtAttr) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (r *RtAttr) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (r *RtAttr) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *RtAttr) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *RtAttr) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *PollFD) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *PollFD) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.FD)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(p.Events)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(p.REvents)) dst = dst[2:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *PollFD) UnmarshalBytes(src []byte) []byte { p.FD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] p.Events = int16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] p.REvents = int16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *PollFD) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (p *PollFD) MarshalUnsafe(dst []byte) []byte { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *PollFD) UnmarshalUnsafe(src []byte) []byte { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *PollFD) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *PollFD) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *PollFD) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *PollFD) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *PollFD) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // CopyPollFDSliceIn copies in a slice of PollFD objects from the task's memory. func CopyPollFDSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []PollFD) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*PollFD)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyPollFDSliceOut copies a slice of PollFD objects to the task's memory. 
func CopyPollFDSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []PollFD) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*PollFD)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafePollFDSlice is like PollFD.MarshalUnsafe, but for a []PollFD. func MarshalUnsafePollFDSlice(src []PollFD, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*PollFD)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafePollFDSlice is like PollFD.UnmarshalUnsafe, but for a []PollFD. func UnmarshalUnsafePollFDSlice(dst []PollFD, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*PollFD)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *RSeqCriticalSection) SizeBytes() int { return 32 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *RSeqCriticalSection) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(r.Version)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(r.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.Start)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.PostCommitOffset)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.Abort)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *RSeqCriticalSection) UnmarshalBytes(src []byte) []byte { r.Version = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] r.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] r.Start = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.PostCommitOffset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.Abort = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *RSeqCriticalSection) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *RSeqCriticalSection) MarshalUnsafe(dst []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(r), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *RSeqCriticalSection) UnmarshalUnsafe(src []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(r), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (r *RSeqCriticalSection) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (r *RSeqCriticalSection) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (r *RSeqCriticalSection) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *RSeqCriticalSection) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *RSeqCriticalSection) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *Rusage) SizeBytes() int { return 112 + (*Timeval)(nil).SizeBytes() + (*Timeval)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *Rusage) MarshalBytes(dst []byte) []byte { dst = r.UTime.MarshalUnsafe(dst) dst = r.STime.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.MaxRSS)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.IXRSS)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.IDRSS)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.ISRSS)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.MinFlt)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.MajFlt)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.NSwap)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.InBlock)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.OuBlock)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.MsgSnd)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.MsgRcv)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.NSignals)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.NVCSw)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.NIvCSw)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
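// Illustrative sketch (not part of the generated code): round-tripping an
// Rusage through a plain byte buffer using the generated methods. The function
// name rusageRoundTrip is hypothetical.
func rusageRoundTrip(r *Rusage) Rusage {
	// MarshalUnsafe uses a single memmove when the layout is packed and falls
	// back to the field-by-field MarshalBytes path otherwise; UnmarshalUnsafe
	// mirrors that on the way back in.
	buf := make([]byte, r.SizeBytes())
	r.MarshalUnsafe(buf)
	var out Rusage
	out.UnmarshalUnsafe(buf)
	return out
}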
func (r *Rusage) UnmarshalBytes(src []byte) []byte { src = r.UTime.UnmarshalUnsafe(src) src = r.STime.UnmarshalUnsafe(src) r.MaxRSS = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.IXRSS = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.IDRSS = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.ISRSS = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.MinFlt = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.MajFlt = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.NSwap = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.InBlock = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.OuBlock = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.MsgSnd = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.MsgRcv = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.NSignals = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.NVCSw = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.NIvCSw = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *Rusage) Packed() bool { return r.STime.Packed() && r.UTime.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *Rusage) MarshalUnsafe(dst []byte) []byte { if r.STime.Packed() && r.UTime.Packed() { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(r), uintptr(size)) return dst[size:] } // Type Rusage doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *Rusage) UnmarshalUnsafe(src []byte) []byte { if r.STime.Packed() && r.UTime.Packed() { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(r), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type Rusage doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (r *Rusage) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !r.STime.Packed() && r.UTime.Packed() { // Type Rusage doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (r *Rusage) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (r *Rusage) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !r.STime.Packed() && r.UTime.Packed() { // Type Rusage doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. 
If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *Rusage) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *Rusage) WriteTo(writer io.Writer) (int64, error) { if !r.STime.Packed() && r.UTime.Packed() { // Type Rusage doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (sd *SeccompData) SizeBytes() int { return 16 + 8*6 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (sd *SeccompData) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(sd.Nr)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(sd.Arch)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(sd.InstructionPointer)) dst = dst[8:] for idx := 0; idx < 6; idx++ { hostarch.ByteOrder.PutUint64(dst[:8], uint64(sd.Args[idx])) dst = dst[8:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (sd *SeccompData) UnmarshalBytes(src []byte) []byte { sd.Nr = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] sd.Arch = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] sd.InstructionPointer = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] for idx := 0; idx < 6; idx++ { sd.Args[idx] = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (sd *SeccompData) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (sd *SeccompData) MarshalUnsafe(dst []byte) []byte { size := sd.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(sd), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (sd *SeccompData) UnmarshalUnsafe(src []byte) []byte { size := sd.SizeBytes() gohacks.Memmove(unsafe.Pointer(sd), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (sd *SeccompData) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(sd))) hdr.Len = sd.SizeBytes() hdr.Cap = sd.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that sd // must live until the use above. runtime.KeepAlive(sd) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (sd *SeccompData) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return sd.CopyOutN(cc, addr, sd.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (sd *SeccompData) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(sd))) hdr.Len = sd.SizeBytes() hdr.Cap = sd.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that sd // must live until the use above. runtime.KeepAlive(sd) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (sd *SeccompData) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return sd.CopyInN(cc, addr, sd.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (sd *SeccompData) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(sd))) hdr.Len = sd.SizeBytes() hdr.Cap = sd.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that sd // must live until the use above. runtime.KeepAlive(sd) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SeccompNotif) SizeBytes() int { return 16 + (*SeccompData)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SeccompNotif) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.ID)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Pid)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Flags)) dst = dst[4:] dst = s.Data.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SeccompNotif) UnmarshalBytes(src []byte) []byte { s.ID = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Pid = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = s.Data.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SeccompNotif) Packed() bool { return s.Data.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SeccompNotif) MarshalUnsafe(dst []byte) []byte { if s.Data.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type SeccompNotif doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (s *SeccompNotif) UnmarshalUnsafe(src []byte) []byte { if s.Data.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type SeccompNotif doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SeccompNotif) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Data.Packed() { // Type SeccompNotif doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SeccompNotif) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SeccompNotif) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Data.Packed() { // Type SeccompNotif doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SeccompNotif) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SeccompNotif) WriteTo(writer io.Writer) (int64, error) { if !s.Data.Packed() { // Type SeccompNotif doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. 
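// Illustrative sketch (not part of the generated code): reading a seccomp user
// notification payload from task memory via the generated CopyIn. The function
// name copySeccompNotifIn is hypothetical.
func copySeccompNotifIn(cc marshal.CopyContext, addr hostarch.Addr) (SeccompNotif, error) {
	// CopyIn takes the memmove fast path when the embedded Data is packed and
	// the scratch-buffer + UnmarshalBytes fallback otherwise.
	var n SeccompNotif
	_, err := n.CopyIn(cc, addr)
	return n, err
}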
func (s *SeccompNotifResp) SizeBytes() int { return 24 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SeccompNotifResp) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.ID)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Val)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Error)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Flags)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SeccompNotifResp) UnmarshalBytes(src []byte) []byte { s.ID = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Val = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Error = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SeccompNotifResp) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SeccompNotifResp) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SeccompNotifResp) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SeccompNotifResp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SeccompNotifResp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SeccompNotifResp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SeccompNotifResp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SeccompNotifResp) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SeccompNotifSizes) SizeBytes() int { return 6 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SeccompNotifSizes) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Notif)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Notif_resp)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Data)) dst = dst[2:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SeccompNotifSizes) UnmarshalBytes(src []byte) []byte { s.Notif = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.Notif_resp = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.Data = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SeccompNotifSizes) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SeccompNotifSizes) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SeccompNotifSizes) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SeccompNotifSizes) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SeccompNotifSizes) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SeccompNotifSizes) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SeccompNotifSizes) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. 
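// Illustrative sketch (not part of the generated code): serializing a
// SeccompNotifSizes to an arbitrary io.Writer with the generated WriteTo
// method. The function name writeSizes is hypothetical.
func writeSizes(w io.Writer, sz *SeccompNotifSizes) error {
	// WriteTo emits the 6-byte wire representation directly from sz's memory,
	// since the type is packed.
	_, err := sz.WriteTo(w)
	return err
}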
func (s *SeccompNotifSizes) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SemInfo) SizeBytes() int { return 40 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SemInfo) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.SemMap)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.SemMni)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.SemMns)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.SemMnu)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.SemMsl)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.SemOpm)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.SemUme)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.SemUsz)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.SemVmx)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.SemAem)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SemInfo) UnmarshalBytes(src []byte) []byte { s.SemMap = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.SemMni = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.SemMns = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.SemMnu = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.SemMsl = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.SemOpm = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.SemUme = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.SemUsz = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.SemVmx = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.SemAem = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SemInfo) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SemInfo) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SemInfo) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SemInfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (s *SemInfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SemInfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SemInfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SemInfo) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *Sembuf) SizeBytes() int { return 6 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *Sembuf) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.SemNum)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.SemOp)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.SemFlg)) dst = dst[2:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *Sembuf) UnmarshalBytes(src []byte) []byte { s.SemNum = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.SemOp = int16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.SemFlg = int16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *Sembuf) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *Sembuf) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *Sembuf) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *Sembuf) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (s *Sembuf) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *Sembuf) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *Sembuf) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *Sembuf) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // CopySembufSliceIn copies in a slice of Sembuf objects from the task's memory. func CopySembufSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []Sembuf) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Sembuf)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopySembufSliceOut copies a slice of Sembuf objects to the task's memory. func CopySembufSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []Sembuf) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*Sembuf)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeSembufSlice is like Sembuf.MarshalUnsafe, but for a []Sembuf. func MarshalUnsafeSembufSlice(src []Sembuf, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*Sembuf)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeSembufSlice is like Sembuf.UnmarshalUnsafe, but for a []Sembuf. 
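// Illustrative sketch (not part of the generated code): round-tripping a
// []Sembuf through the slice helpers above. The function name
// sembufSliceRoundTrip is hypothetical.
func sembufSliceRoundTrip(ops []Sembuf) []Sembuf {
	// Each Sembuf is 6 bytes, so the wire form of the slice occupies
	// 6*len(ops) bytes, marshalled with a single memmove.
	buf := make([]byte, (*Sembuf)(nil).SizeBytes()*len(ops))
	MarshalUnsafeSembufSlice(ops, buf)
	out := make([]Sembuf, len(ops))
	UnmarshalUnsafeSembufSlice(out, buf)
	return out
}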
func UnmarshalUnsafeSembufSlice(dst []Sembuf, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*Sembuf)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *ShmInfo) SizeBytes() int { return 44 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *ShmInfo) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.UsedIDs)) dst = dst[4:] // Padding: dst[:sizeof(byte)*4] ~= [4]byte{0} dst = dst[1*(4):] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.ShmTot)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.ShmRss)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.ShmSwp)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.SwapAttempts)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.SwapSuccesses)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *ShmInfo) UnmarshalBytes(src []byte) []byte { s.UsedIDs = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: ~ copy([4]byte(s._), src[:sizeof(byte)*4]) src = src[1*(4):] s.ShmTot = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.ShmRss = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.ShmSwp = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.SwapAttempts = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.SwapSuccesses = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *ShmInfo) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *ShmInfo) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *ShmInfo) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *ShmInfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *ShmInfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *ShmInfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *ShmInfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *ShmInfo) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *ShmParams) SizeBytes() int { return 40 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *ShmParams) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.ShmMax)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.ShmMin)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.ShmMni)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.ShmSeg)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.ShmAll)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *ShmParams) UnmarshalBytes(src []byte) []byte { s.ShmMax = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.ShmMin = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.ShmMni = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.ShmSeg = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.ShmAll = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *ShmParams) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *ShmParams) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *ShmParams) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *ShmParams) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *ShmParams) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (s *ShmParams) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *ShmParams) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *ShmParams) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *ShmidDS) SizeBytes() int { return 40 + (*IPCPerm)(nil).SizeBytes() + (*TimeT)(nil).SizeBytes() + (*TimeT)(nil).SizeBytes() + (*TimeT)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *ShmidDS) MarshalBytes(dst []byte) []byte { dst = s.ShmPerm.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.ShmSegsz)) dst = dst[8:] dst = s.ShmAtime.MarshalUnsafe(dst) dst = s.ShmDtime.MarshalUnsafe(dst) dst = s.ShmCtime.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.ShmCpid)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.ShmLpid)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.ShmNattach)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Unused4)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Unused5)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *ShmidDS) UnmarshalBytes(src []byte) []byte { src = s.ShmPerm.UnmarshalUnsafe(src) s.ShmSegsz = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = s.ShmAtime.UnmarshalUnsafe(src) src = s.ShmDtime.UnmarshalUnsafe(src) src = s.ShmCtime.UnmarshalUnsafe(src) s.ShmCpid = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.ShmLpid = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.ShmNattach = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Unused4 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Unused5 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *ShmidDS) Packed() bool { return s.ShmAtime.Packed() && s.ShmCtime.Packed() && s.ShmDtime.Packed() && s.ShmPerm.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *ShmidDS) MarshalUnsafe(dst []byte) []byte { if s.ShmAtime.Packed() && s.ShmCtime.Packed() && s.ShmDtime.Packed() && s.ShmPerm.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type ShmidDS doesn't have a packed layout in memory, fallback to MarshalBytes. 
return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *ShmidDS) UnmarshalUnsafe(src []byte) []byte { if s.ShmAtime.Packed() && s.ShmCtime.Packed() && s.ShmDtime.Packed() && s.ShmPerm.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type ShmidDS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *ShmidDS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.ShmAtime.Packed() && s.ShmCtime.Packed() && s.ShmDtime.Packed() && s.ShmPerm.Packed() { // Type ShmidDS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *ShmidDS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *ShmidDS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.ShmAtime.Packed() && s.ShmCtime.Packed() && s.ShmDtime.Packed() && s.ShmPerm.Packed() { // Type ShmidDS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *ShmidDS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *ShmidDS) WriteTo(writer io.Writer) (int64, error) { if !s.ShmAtime.Packed() && s.ShmCtime.Packed() && s.ShmDtime.Packed() && s.ShmPerm.Packed() { // Type ShmidDS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SigAction) SizeBytes() int { return 24 + (*SignalSet)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SigAction) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Handler)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Flags)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Restorer)) dst = dst[8:] dst = s.Mask.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SigAction) UnmarshalBytes(src []byte) []byte { s.Handler = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Flags = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Restorer = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = s.Mask.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SigAction) Packed() bool { return s.Mask.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SigAction) MarshalUnsafe(dst []byte) []byte { if s.Mask.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type SigAction doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SigAction) UnmarshalUnsafe(src []byte) []byte { if s.Mask.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type SigAction doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SigAction) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Mask.Packed() { // Type SigAction doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SigAction) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SigAction) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Mask.Packed() { // Type SigAction doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. 
length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SigAction) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SigAction) WriteTo(writer io.Writer) (int64, error) { if !s.Mask.Packed() { // Type SigAction doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *Sigevent) SizeBytes() int { return 20 + 1*44 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *Sigevent) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Value)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Signo)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Notify)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Tid)) dst = dst[4:] for idx := 0; idx < 44; idx++ { dst[0] = byte(s.UnRemainder[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *Sigevent) UnmarshalBytes(src []byte) []byte { s.Value = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Signo = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Notify = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Tid = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 44; idx++ { s.UnRemainder[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *Sigevent) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *Sigevent) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *Sigevent) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *Sigevent) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *Sigevent) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *Sigevent) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *Sigevent) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *Sigevent) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SignalInfo) SizeBytes() int { return 16 + 1*(128-16) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SignalInfo) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Signo)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Errno)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Code)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] for idx := 0; idx < (128-16); idx++ { dst[0] = byte(s.Fields[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SignalInfo) UnmarshalBytes(src []byte) []byte { s.Signo = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Errno = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Code = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] for idx := 0; idx < (128-16); idx++ { s.Fields[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SignalInfo) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SignalInfo) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (s *SignalInfo) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SignalInfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SignalInfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SignalInfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SignalInfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SignalInfo) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (s *SignalSet) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SignalSet) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(*s)) return dst[8:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SignalSet) UnmarshalBytes(src []byte) []byte { *s = SignalSet(uint64(hostarch.ByteOrder.Uint64(src[:8]))) return src[8:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SignalSet) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SignalSet) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SignalSet) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (s *SignalSet) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SignalSet) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SignalSet) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SignalSet) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SignalSet) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SignalStack) SizeBytes() int { return 24 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SignalStack) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Addr)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Flags)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Size)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SignalStack) UnmarshalBytes(src []byte) []byte { s.Addr = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] s.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SignalStack) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SignalStack) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (s *SignalStack) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SignalStack) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SignalStack) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SignalStack) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SignalStack) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SignalStack) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SignalfdSiginfo) SizeBytes() int { return 82 + 1*48 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (s *SignalfdSiginfo) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Signo)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Errno)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Code)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.PID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.UID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.FD)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.TID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Band)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Overrun)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.TrapNo)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Status)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Int)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Ptr)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.UTime)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.STime)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Addr)) dst = dst[8:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.AddrLSB)) dst = dst[2:] // Padding: dst[:sizeof(uint8)*48] ~= [48]uint8{0} dst = dst[1*(48):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SignalfdSiginfo) UnmarshalBytes(src []byte) []byte { s.Signo = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Errno = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Code = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.PID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.UID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.FD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.TID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Band = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Overrun = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.TrapNo = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Status = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Int = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Ptr = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.UTime = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.STime = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Addr = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.AddrLSB = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] // Padding: ~ copy([48]uint8(s._), src[:sizeof(uint8)*48]) src = src[1*(48):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SignalfdSiginfo) Packed() bool { return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SignalfdSiginfo) MarshalUnsafe(dst []byte) []byte { // Type SignalfdSiginfo doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SignalfdSiginfo) UnmarshalUnsafe(src []byte) []byte { // Type SignalfdSiginfo doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SignalfdSiginfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type SignalfdSiginfo doesn't have a packed layout in memory, fall back to MarshalBytes. 
buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SignalfdSiginfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SignalfdSiginfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type SignalfdSiginfo doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SignalfdSiginfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SignalfdSiginfo) WriteTo(writer io.Writer) (int64, error) { // Type SignalfdSiginfo doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (c *ControlMessageCredentials) SizeBytes() int { return 12 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *ControlMessageCredentials) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.PID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.UID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.GID)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (c *ControlMessageCredentials) UnmarshalBytes(src []byte) []byte { c.PID = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] c.UID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] c.GID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (c *ControlMessageCredentials) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (c *ControlMessageCredentials) MarshalUnsafe(dst []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(c), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (c *ControlMessageCredentials) UnmarshalUnsafe(src []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(c), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (c *ControlMessageCredentials) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. 
return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (c *ControlMessageCredentials) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyOutN(cc, addr, c.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (c *ControlMessageCredentials) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (c *ControlMessageCredentials) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyInN(cc, addr, c.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (c *ControlMessageCredentials) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (c *ControlMessageHeader) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *ControlMessageHeader) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.Length)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.Level)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.Type)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (c *ControlMessageHeader) UnmarshalBytes(src []byte) []byte { c.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] c.Level = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] c.Type = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (c *ControlMessageHeader) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (c *ControlMessageHeader) MarshalUnsafe(dst []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(c), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (c *ControlMessageHeader) UnmarshalUnsafe(src []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(c), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (c *ControlMessageHeader) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (c *ControlMessageHeader) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyOutN(cc, addr, c.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (c *ControlMessageHeader) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (c *ControlMessageHeader) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyInN(cc, addr, c.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (c *ControlMessageHeader) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (c *ControlMessageIPPacketInfo) SizeBytes() int { return 4 + (*InetAddr)(nil).SizeBytes() + (*InetAddr)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *ControlMessageIPPacketInfo) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.NIC)) dst = dst[4:] dst = c.LocalAddr.MarshalUnsafe(dst) dst = c.DestinationAddr.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (c *ControlMessageIPPacketInfo) UnmarshalBytes(src []byte) []byte { c.NIC = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = c.LocalAddr.UnmarshalUnsafe(src) src = c.DestinationAddr.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (c *ControlMessageIPPacketInfo) Packed() bool { return c.DestinationAddr.Packed() && c.LocalAddr.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (c *ControlMessageIPPacketInfo) MarshalUnsafe(dst []byte) []byte { if c.DestinationAddr.Packed() && c.LocalAddr.Packed() { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(c), uintptr(size)) return dst[size:] } // Type ControlMessageIPPacketInfo doesn't have a packed layout in memory, fallback to MarshalBytes. return c.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (c *ControlMessageIPPacketInfo) UnmarshalUnsafe(src []byte) []byte { if c.DestinationAddr.Packed() && c.LocalAddr.Packed() { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(c), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type ControlMessageIPPacketInfo doesn't have a packed layout in memory, fallback to UnmarshalBytes. return c.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (c *ControlMessageIPPacketInfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !c.DestinationAddr.Packed() && c.LocalAddr.Packed() { // Type ControlMessageIPPacketInfo doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(c.SizeBytes()) // escapes: okay. c.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (c *ControlMessageIPPacketInfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyOutN(cc, addr, c.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (c *ControlMessageIPPacketInfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !c.DestinationAddr.Packed() && c.LocalAddr.Packed() { // Type ControlMessageIPPacketInfo doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(c.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. c.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (c *ControlMessageIPPacketInfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyInN(cc, addr, c.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (c *ControlMessageIPPacketInfo) WriteTo(writer io.Writer) (int64, error) { if !c.DestinationAddr.Packed() && c.LocalAddr.Packed() { // Type ControlMessageIPPacketInfo doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, c.SizeBytes()) c.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (c *ControlMessageIPv6PacketInfo) SizeBytes() int { return 4 + (*Inet6Addr)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *ControlMessageIPv6PacketInfo) MarshalBytes(dst []byte) []byte { dst = c.Addr.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.NIC)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (c *ControlMessageIPv6PacketInfo) UnmarshalBytes(src []byte) []byte { src = c.Addr.UnmarshalUnsafe(src) c.NIC = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (c *ControlMessageIPv6PacketInfo) Packed() bool { return c.Addr.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (c *ControlMessageIPv6PacketInfo) MarshalUnsafe(dst []byte) []byte { if c.Addr.Packed() { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(c), uintptr(size)) return dst[size:] } // Type ControlMessageIPv6PacketInfo doesn't have a packed layout in memory, fallback to MarshalBytes. return c.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (c *ControlMessageIPv6PacketInfo) UnmarshalUnsafe(src []byte) []byte { if c.Addr.Packed() { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(c), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type ControlMessageIPv6PacketInfo doesn't have a packed layout in memory, fallback to UnmarshalBytes. return c.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (c *ControlMessageIPv6PacketInfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !c.Addr.Packed() { // Type ControlMessageIPv6PacketInfo doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(c.SizeBytes()) // escapes: okay. c.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (c *ControlMessageIPv6PacketInfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyOutN(cc, addr, c.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (c *ControlMessageIPv6PacketInfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !c.Addr.Packed() { // Type ControlMessageIPv6PacketInfo doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(c.SizeBytes()) // escapes: okay. 
length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. c.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (c *ControlMessageIPv6PacketInfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyInN(cc, addr, c.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (c *ControlMessageIPv6PacketInfo) WriteTo(writer io.Writer) (int64, error) { if !c.Addr.Packed() { // Type ControlMessageIPv6PacketInfo doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, c.SizeBytes()) c.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *ICMP6Filter) SizeBytes() int { return 0 + 4*8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *ICMP6Filter) MarshalBytes(dst []byte) []byte { for idx := 0; idx < 8; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Filter[idx])) dst = dst[4:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *ICMP6Filter) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < 8; idx++ { i.Filter[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *ICMP6Filter) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *ICMP6Filter) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *ICMP6Filter) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *ICMP6Filter) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. 
return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *ICMP6Filter) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *ICMP6Filter) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *ICMP6Filter) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *ICMP6Filter) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (i *Inet6Addr) SizeBytes() int { return 1 * 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *Inet6Addr) MarshalBytes(dst []byte) []byte { for idx := 0; idx < 16; idx++ { dst[0] = byte(i[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *Inet6Addr) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < 16; idx++ { i[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *Inet6Addr) Packed() bool { // Array newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *Inet6Addr) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&i[0]), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *Inet6Addr) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *Inet6Addr) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (i *Inet6Addr) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *Inet6Addr) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *Inet6Addr) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *Inet6Addr) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *Inet6MulticastRequest) SizeBytes() int { return 4 + (*Inet6Addr)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *Inet6MulticastRequest) MarshalBytes(dst []byte) []byte { dst = i.MulticastAddr.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.InterfaceIndex)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *Inet6MulticastRequest) UnmarshalBytes(src []byte) []byte { src = i.MulticastAddr.UnmarshalUnsafe(src) i.InterfaceIndex = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *Inet6MulticastRequest) Packed() bool { return i.MulticastAddr.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *Inet6MulticastRequest) MarshalUnsafe(dst []byte) []byte { if i.MulticastAddr.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type Inet6MulticastRequest doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *Inet6MulticastRequest) UnmarshalUnsafe(src []byte) []byte { if i.MulticastAddr.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type Inet6MulticastRequest doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *Inet6MulticastRequest) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.MulticastAddr.Packed() { // Type Inet6MulticastRequest doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. 
return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *Inet6MulticastRequest) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *Inet6MulticastRequest) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.MulticastAddr.Packed() { // Type Inet6MulticastRequest doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *Inet6MulticastRequest) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *Inet6MulticastRequest) WriteTo(writer io.Writer) (int64, error) { if !i.MulticastAddr.Packed() { // Type Inet6MulticastRequest doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (i *InetAddr) SizeBytes() int { return 1 * 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *InetAddr) MarshalBytes(dst []byte) []byte { for idx := 0; idx < 4; idx++ { dst[0] = byte(i[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *InetAddr) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < 4; idx++ { i[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *InetAddr) Packed() bool { // Array newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (i *InetAddr) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&i[0]), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *InetAddr) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *InetAddr) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *InetAddr) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *InetAddr) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *InetAddr) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *InetAddr) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *InetMulticastRequest) SizeBytes() int { return 0 + (*InetAddr)(nil).SizeBytes() + (*InetAddr)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *InetMulticastRequest) MarshalBytes(dst []byte) []byte { dst = i.MulticastAddr.MarshalUnsafe(dst) dst = i.InterfaceAddr.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *InetMulticastRequest) UnmarshalBytes(src []byte) []byte { src = i.MulticastAddr.UnmarshalUnsafe(src) src = i.InterfaceAddr.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *InetMulticastRequest) Packed() bool { return i.InterfaceAddr.Packed() && i.MulticastAddr.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (i *InetMulticastRequest) MarshalUnsafe(dst []byte) []byte { if i.InterfaceAddr.Packed() && i.MulticastAddr.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type InetMulticastRequest doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *InetMulticastRequest) UnmarshalUnsafe(src []byte) []byte { if i.InterfaceAddr.Packed() && i.MulticastAddr.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type InetMulticastRequest doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *InetMulticastRequest) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.InterfaceAddr.Packed() && i.MulticastAddr.Packed() { // Type InetMulticastRequest doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *InetMulticastRequest) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *InetMulticastRequest) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.InterfaceAddr.Packed() && i.MulticastAddr.Packed() { // Type InetMulticastRequest doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *InetMulticastRequest) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *InetMulticastRequest) WriteTo(writer io.Writer) (int64, error) { if !i.InterfaceAddr.Packed() && i.MulticastAddr.Packed() { // Type InetMulticastRequest doesn't have a packed layout in memory, fall back to MarshalBytes. 
buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *InetMulticastRequestWithNIC) SizeBytes() int { return 4 + (*InetMulticastRequest)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *InetMulticastRequestWithNIC) MarshalBytes(dst []byte) []byte { dst = i.InetMulticastRequest.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.InterfaceIndex)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *InetMulticastRequestWithNIC) UnmarshalBytes(src []byte) []byte { src = i.InetMulticastRequest.UnmarshalUnsafe(src) i.InterfaceIndex = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *InetMulticastRequestWithNIC) Packed() bool { return i.InetMulticastRequest.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *InetMulticastRequestWithNIC) MarshalUnsafe(dst []byte) []byte { if i.InetMulticastRequest.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type InetMulticastRequestWithNIC doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *InetMulticastRequestWithNIC) UnmarshalUnsafe(src []byte) []byte { if i.InetMulticastRequest.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type InetMulticastRequestWithNIC doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *InetMulticastRequestWithNIC) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.InetMulticastRequest.Packed() { // Type InetMulticastRequestWithNIC doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *InetMulticastRequestWithNIC) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (i *InetMulticastRequestWithNIC) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.InetMulticastRequest.Packed() { // Type InetMulticastRequestWithNIC doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *InetMulticastRequestWithNIC) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *InetMulticastRequestWithNIC) WriteTo(writer io.Writer) (int64, error) { if !i.InetMulticastRequest.Packed() { // Type InetMulticastRequestWithNIC doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (l *Linger) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (l *Linger) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(l.OnOff)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(l.Linger)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (l *Linger) UnmarshalBytes(src []byte) []byte { l.OnOff = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] l.Linger = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (l *Linger) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (l *Linger) MarshalUnsafe(dst []byte) []byte { size := l.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(l), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (l *Linger) UnmarshalUnsafe(src []byte) []byte { size := l.SizeBytes() gohacks.Memmove(unsafe.Pointer(l), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (l *Linger) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
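// The lines below avoid a temporary buffer entirely: they build a []byte view
// that aliases l's own memory, hide the pointer from escape analysis so l can
// stay on the caller's stack, and then pin l until the copy has finished.
// Schematically (a sketch of the same pattern, not additional generated code):
//
//	var view []byte
//	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&view))
//	hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) // compiler no longer sees l escape
//	hdr.Len, hdr.Cap = l.SizeBytes(), l.SizeBytes()
//	// ... hand view to the copy routine ...
//	runtime.KeepAlive(l) // l must outlive the raw pointer manufactured above
//
// The runtime.KeepAlive call is what makes the trick safe: having bypassed
// escape analysis, nothing else tells the garbage collector that l is still
// in use while the copy reads through the fabricated slice header.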
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) hdr.Len = l.SizeBytes() hdr.Cap = l.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that l // must live until the use above. runtime.KeepAlive(l) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (l *Linger) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return l.CopyOutN(cc, addr, l.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (l *Linger) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) hdr.Len = l.SizeBytes() hdr.Cap = l.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that l // must live until the use above. runtime.KeepAlive(l) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (l *Linger) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return l.CopyInN(cc, addr, l.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (l *Linger) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) hdr.Len = l.SizeBytes() hdr.Cap = l.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that l // must live until the use above. runtime.KeepAlive(l) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SockAddrInet) SizeBytes() int { return 4 + (*InetAddr)(nil).SizeBytes() + 1*8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SockAddrInet) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Family)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Port)) dst = dst[2:] dst = s.Addr.MarshalUnsafe(dst) // Padding: dst[:sizeof(uint8)*8] ~= [8]uint8{0} dst = dst[1*(8):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SockAddrInet) UnmarshalBytes(src []byte) []byte { s.Family = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.Port = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] src = s.Addr.UnmarshalUnsafe(src) // Padding: ~ copy([8]uint8(s._), src[:sizeof(uint8)*8]) src = src[1*(8):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SockAddrInet) Packed() bool { return s.Addr.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SockAddrInet) MarshalUnsafe(dst []byte) []byte { if s.Addr.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type SockAddrInet doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (s *SockAddrInet) UnmarshalUnsafe(src []byte) []byte { if s.Addr.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type SockAddrInet doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SockAddrInet) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Addr.Packed() { // Type SockAddrInet doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SockAddrInet) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SockAddrInet) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Addr.Packed() { // Type SockAddrInet doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SockAddrInet) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SockAddrInet) WriteTo(writer io.Writer) (int64, error) { if !s.Addr.Packed() { // Type SockAddrInet doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. 
func (s *SockAddrInet6) SizeBytes() int { return 12 + 1*16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SockAddrInet6) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Family)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Port)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Flowinfo)) dst = dst[4:] for idx := 0; idx < 16; idx++ { dst[0] = byte(s.Addr[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Scope_id)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SockAddrInet6) UnmarshalBytes(src []byte) []byte { s.Family = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.Port = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.Flowinfo = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 16; idx++ { s.Addr[idx] = src[0] src = src[1:] } s.Scope_id = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SockAddrInet6) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SockAddrInet6) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SockAddrInet6) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SockAddrInet6) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SockAddrInet6) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SockAddrInet6) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SockAddrInet6) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SockAddrInet6) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
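// SockAddrInet6 mirrors Linux's 28-byte sockaddr_in6 (2+2+4+16+4 bytes), and
// these routines copy its fields in hostarch.ByteOrder; any network-order
// handling of Port is the caller's responsibility. Illustrative sketch only
// (assumes this package's AF_INET6 constant and the standard net package):
//
//	var sa SockAddrInet6
//	sa.Family = AF_INET6
//	sa.Port = 8080 // already byte-swapped by the caller if required
//	copy(sa.Addr[:], net.ParseIP("::1").To16())
//	buf := make([]byte, sa.SizeBytes()) // 28 bytes
//	sa.MarshalUnsafe(buf)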
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SockAddrLink) SizeBytes() int { return 12 + 1*8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SockAddrLink) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Family)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Protocol)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.InterfaceIndex)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.ARPHardwareType)) dst = dst[2:] dst[0] = byte(s.PacketType) dst = dst[1:] dst[0] = byte(s.HardwareAddrLen) dst = dst[1:] for idx := 0; idx < 8; idx++ { dst[0] = byte(s.HardwareAddr[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SockAddrLink) UnmarshalBytes(src []byte) []byte { s.Family = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.Protocol = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.InterfaceIndex = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.ARPHardwareType = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.PacketType = src[0] src = src[1:] s.HardwareAddrLen = src[0] src = src[1:] for idx := 0; idx < 8; idx++ { s.HardwareAddr[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SockAddrLink) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SockAddrLink) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SockAddrLink) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SockAddrLink) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SockAddrLink) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SockAddrLink) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
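// Note that CopyInN (and CopyOutN above) honor the caller-supplied limit: only
// the first limit bytes of the layout are transferred, and the rest of the
// struct is left untouched on this fast path. CopyIn/CopyOut are simply the
// limit == SizeBytes() case. Caller-side sketch (cc is any marshal.CopyContext
// and addr a user address; values are illustrative):
//
//	// Read just Family and Protocol, the first 4 bytes of sockaddr_ll.
//	n, err := s.CopyInN(cc, addr, 4)
//	_ = n // n <= 4; err reports a faulting or partial copy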
// Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SockAddrLink) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SockAddrLink) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SockAddrUnix) SizeBytes() int { return 2 + 1*UnixPathMax } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SockAddrUnix) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Family)) dst = dst[2:] for idx := 0; idx < UnixPathMax; idx++ { dst[0] = byte(s.Path[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SockAddrUnix) UnmarshalBytes(src []byte) []byte { s.Family = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] for idx := 0; idx < UnixPathMax; idx++ { s.Path[idx] = int8(src[0]) src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SockAddrUnix) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SockAddrUnix) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SockAddrUnix) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SockAddrUnix) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SockAddrUnix) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SockAddrUnix) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
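// SockAddrUnix's Path is a fixed [UnixPathMax]int8 (108 bytes, as on Linux),
// which is why it is marshalled element by element with int8<->byte
// conversions above. Recovering the pathname means stopping at the first NUL.
// Sketch only:
//
//	n := 0
//	for n < len(s.Path) && s.Path[n] != 0 {
//		n++
//	}
//	b := make([]byte, n)
//	for i := 0; i < n; i++ {
//		b[i] = byte(s.Path[i])
//	}
//	path := string(b) // empty for an unnamed socket
//	_ = path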
// Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SockAddrUnix) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SockAddrUnix) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (t *TCPInfo) SizeBytes() int { return 224 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (t *TCPInfo) MarshalBytes(dst []byte) []byte { dst[0] = byte(t.State) dst = dst[1:] dst[0] = byte(t.CaState) dst = dst[1:] dst[0] = byte(t.Retransmits) dst = dst[1:] dst[0] = byte(t.Probes) dst = dst[1:] dst[0] = byte(t.Backoff) dst = dst[1:] dst[0] = byte(t.Options) dst = dst[1:] dst[0] = byte(t.WindowScale) dst = dst[1:] dst[0] = byte(t.DeliveryRateAppLimited) dst = dst[1:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.RTO)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.ATO)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.SndMss)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.RcvMss)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.Unacked)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.Sacked)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.Lost)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.Retrans)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.Fackets)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.LastDataSent)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.LastAckSent)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.LastDataRecv)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.LastAckRecv)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.PMTU)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.RcvSsthresh)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.RTT)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.RTTVar)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.SndSsthresh)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.SndCwnd)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.Advmss)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.Reordering)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.RcvRTT)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.RcvSpace)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.TotalRetrans)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(t.PacingRate)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(t.MaxPacingRate)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(t.BytesAcked)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(t.BytesReceived)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.SegsOut)) dst = 
dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.SegsIn)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.NotSentBytes)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.MinRTT)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.DataSegsIn)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.DataSegsOut)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(t.DeliveryRate)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(t.BusyTime)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(t.RwndLimited)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(t.SndBufLimited)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.Delivered)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.DeliveredCE)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(t.BytesSent)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(t.BytesRetrans)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.DSACKDups)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.ReordSeen)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (t *TCPInfo) UnmarshalBytes(src []byte) []byte { t.State = uint8(src[0]) src = src[1:] t.CaState = uint8(src[0]) src = src[1:] t.Retransmits = uint8(src[0]) src = src[1:] t.Probes = uint8(src[0]) src = src[1:] t.Backoff = uint8(src[0]) src = src[1:] t.Options = uint8(src[0]) src = src[1:] t.WindowScale = uint8(src[0]) src = src[1:] t.DeliveryRateAppLimited = uint8(src[0]) src = src[1:] t.RTO = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.ATO = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.SndMss = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.RcvMss = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.Unacked = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.Sacked = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.Lost = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.Retrans = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.Fackets = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.LastDataSent = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.LastAckSent = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.LastDataRecv = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.LastAckRecv = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.PMTU = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.RcvSsthresh = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.RTT = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.RTTVar = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.SndSsthresh = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.SndCwnd = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.Advmss = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.Reordering = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.RcvRTT = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.RcvSpace = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.TotalRetrans = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.PacingRate = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] t.MaxPacingRate = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] t.BytesAcked = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] t.BytesReceived = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] 
t.SegsOut = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.SegsIn = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.NotSentBytes = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.MinRTT = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.DataSegsIn = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.DataSegsOut = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.DeliveryRate = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] t.BusyTime = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] t.RwndLimited = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] t.SndBufLimited = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] t.Delivered = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.DeliveredCE = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.BytesSent = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] t.BytesRetrans = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] t.DSACKDups = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.ReordSeen = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (t *TCPInfo) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (t *TCPInfo) MarshalUnsafe(dst []byte) []byte { size := t.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(t), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (t *TCPInfo) UnmarshalUnsafe(src []byte) []byte { size := t.SizeBytes() gohacks.Memmove(unsafe.Pointer(t), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (t *TCPInfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (t *TCPInfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return t.CopyOutN(cc, addr, t.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (t *TCPInfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (t *TCPInfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return t.CopyInN(cc, addr, t.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (t *TCPInfo) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
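// TCPInfo is the sentry's view of Linux's struct tcp_info: 224 bytes of
// fixed-width counters, so the type is packed and MarshalUnsafe above is a
// single memmove. It is typically marshalled into a scratch buffer and copied
// out as the getsockopt(TCP_INFO) result. Sketch (cc is a marshal.CopyContext):
//
//	var info TCPInfo
//	buf := cc.CopyScratchBuffer(info.SizeBytes()) // 224 bytes
//	info.MarshalUnsafe(buf)
//	// buf is now ready to be copied out to the application.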
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (c *ClockT) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *ClockT) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(*c)) return dst[8:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (c *ClockT) UnmarshalBytes(src []byte) []byte { *c = ClockT(int64(hostarch.ByteOrder.Uint64(src[:8]))) return src[8:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (c *ClockT) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (c *ClockT) MarshalUnsafe(dst []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(c), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (c *ClockT) UnmarshalUnsafe(src []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(c), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (c *ClockT) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (c *ClockT) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyOutN(cc, addr, c.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (c *ClockT) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (c *ClockT) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyInN(cc, addr, c.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (c *ClockT) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
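// ClockT, like TimeT and TimerID further down, is a scalar newtype: Packed()
// is unconditionally true and marshalling is a single fixed-width store or
// load in hostarch.ByteOrder. Round-trip sketch:
//
//	c := ClockT(100)
//	buf := make([]byte, c.SizeBytes()) // 8 bytes
//	c.MarshalBytes(buf)
//	var back ClockT
//	back.UnmarshalBytes(buf)
//	// back == ClockT(100)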
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *ItimerVal) SizeBytes() int { return 0 + (*Timeval)(nil).SizeBytes() + (*Timeval)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *ItimerVal) MarshalBytes(dst []byte) []byte { dst = i.Interval.MarshalUnsafe(dst) dst = i.Value.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *ItimerVal) UnmarshalBytes(src []byte) []byte { src = i.Interval.UnmarshalUnsafe(src) src = i.Value.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *ItimerVal) Packed() bool { return i.Interval.Packed() && i.Value.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *ItimerVal) MarshalUnsafe(dst []byte) []byte { if i.Interval.Packed() && i.Value.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type ItimerVal doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *ItimerVal) UnmarshalUnsafe(src []byte) []byte { if i.Interval.Packed() && i.Value.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type ItimerVal doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *ItimerVal) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Interval.Packed() && i.Value.Packed() { // Type ItimerVal doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *ItimerVal) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *ItimerVal) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Interval.Packed() && i.Value.Packed() { // Type ItimerVal doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. 
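// Because the scratch buffer is unmarshalled even when the copy failed or was
// short, the caller decides what a partial read means. Typical caller-side
// sketch (names illustrative; cc is a marshal.CopyContext such as a task):
//
//	var itv ItimerVal
//	if _, err := itv.CopyIn(cc, addr); err != nil {
//		// surface the fault (usually EFAULT) instead of using itv
//	}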
i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *ItimerVal) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *ItimerVal) WriteTo(writer io.Writer) (int64, error) { if !i.Interval.Packed() && i.Value.Packed() { // Type ItimerVal doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *Itimerspec) SizeBytes() int { return 0 + (*Timespec)(nil).SizeBytes() + (*Timespec)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *Itimerspec) MarshalBytes(dst []byte) []byte { dst = i.Interval.MarshalUnsafe(dst) dst = i.Value.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *Itimerspec) UnmarshalBytes(src []byte) []byte { src = i.Interval.UnmarshalUnsafe(src) src = i.Value.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *Itimerspec) Packed() bool { return i.Interval.Packed() && i.Value.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *Itimerspec) MarshalUnsafe(dst []byte) []byte { if i.Interval.Packed() && i.Value.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type Itimerspec doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *Itimerspec) UnmarshalUnsafe(src []byte) []byte { if i.Interval.Packed() && i.Value.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type Itimerspec doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *Itimerspec) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Interval.Packed() && i.Value.Packed() { // Type Itimerspec doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. 
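// Itimerspec is just two Timespec values (Interval, Value), matching the
// struct itimerspec used by timer_settime(2) and timerfd_settime(2); its
// Packed() is the conjunction of the fields' Packed(). Sketch of copying one
// out to userspace (cc is a marshal.CopyContext, addr a user address):
//
//	its := Itimerspec{
//		Interval: Timespec{Sec: 1},                  // re-arm every second
//		Value:    Timespec{Nsec: 500 * 1000 * 1000}, // first expiry after 500ms
//	}
//	if _, err := its.CopyOut(cc, addr); err != nil {
//		// handle the fault
//	}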
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *Itimerspec) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *Itimerspec) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Interval.Packed() && i.Value.Packed() { // Type Itimerspec doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *Itimerspec) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *Itimerspec) WriteTo(writer io.Writer) (int64, error) { if !i.Interval.Packed() && i.Value.Packed() { // Type Itimerspec doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (sxts *StatxTimestamp) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (sxts *StatxTimestamp) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(sxts.Sec)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(sxts.Nsec)) dst = dst[4:] // Padding: dst[:sizeof(int32)] ~= int32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (sxts *StatxTimestamp) UnmarshalBytes(src []byte) []byte { sxts.Sec = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] sxts.Nsec = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ int32 ~= src[:sizeof(int32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. 
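// StatxTimestamp carries Linux's statx_timestamp layout: 8 bytes of Sec,
// 4 bytes of Nsec and 4 reserved bytes. MarshalBytes above simply skips the
// reserved tail (it stays zero in a freshly allocated buffer) and
// UnmarshalBytes discards it. Sketch:
//
//	ts := StatxTimestamp{Sec: 1, Nsec: 5}
//	buf := make([]byte, ts.SizeBytes()) // 16 bytes
//	ts.MarshalBytes(buf)
//	// buf[12:16] remains zero; only the first 12 bytes are meaningful.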
//go:nosplit func (sxts *StatxTimestamp) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (sxts *StatxTimestamp) MarshalUnsafe(dst []byte) []byte { size := sxts.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(sxts), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (sxts *StatxTimestamp) UnmarshalUnsafe(src []byte) []byte { size := sxts.SizeBytes() gohacks.Memmove(unsafe.Pointer(sxts), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (sxts *StatxTimestamp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(sxts))) hdr.Len = sxts.SizeBytes() hdr.Cap = sxts.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that sxts // must live until the use above. runtime.KeepAlive(sxts) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (sxts *StatxTimestamp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return sxts.CopyOutN(cc, addr, sxts.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (sxts *StatxTimestamp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(sxts))) hdr.Len = sxts.SizeBytes() hdr.Cap = sxts.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that sxts // must live until the use above. runtime.KeepAlive(sxts) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (sxts *StatxTimestamp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return sxts.CopyInN(cc, addr, sxts.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (sxts *StatxTimestamp) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(sxts))) hdr.Len = sxts.SizeBytes() hdr.Cap = sxts.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that sxts // must live until the use above. runtime.KeepAlive(sxts) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (t *TimeT) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (t *TimeT) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(*t)) return dst[8:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (t *TimeT) UnmarshalBytes(src []byte) []byte { *t = TimeT(int64(hostarch.ByteOrder.Uint64(src[:8]))) return src[8:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (t *TimeT) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (t *TimeT) MarshalUnsafe(dst []byte) []byte { size := t.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(t), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (t *TimeT) UnmarshalUnsafe(src []byte) []byte { size := t.SizeBytes() gohacks.Memmove(unsafe.Pointer(t), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (t *TimeT) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (t *TimeT) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return t.CopyOutN(cc, addr, t.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (t *TimeT) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (t *TimeT) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return t.CopyInN(cc, addr, t.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (t *TimeT) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (t *TimerID) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (t *TimerID) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*t)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (t *TimerID) UnmarshalBytes(src []byte) []byte { *t = TimerID(int32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (t *TimerID) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (t *TimerID) MarshalUnsafe(dst []byte) []byte { size := t.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(t), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (t *TimerID) UnmarshalUnsafe(src []byte) []byte { size := t.SizeBytes() gohacks.Memmove(unsafe.Pointer(t), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (t *TimerID) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (t *TimerID) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return t.CopyOutN(cc, addr, t.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (t *TimerID) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (t *TimerID) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return t.CopyInN(cc, addr, t.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (t *TimerID) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (ts *Timespec) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (ts *Timespec) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(ts.Sec)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(ts.Nsec)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (ts *Timespec) UnmarshalBytes(src []byte) []byte { ts.Sec = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] ts.Nsec = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (ts *Timespec) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (ts *Timespec) MarshalUnsafe(dst []byte) []byte { size := ts.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(ts), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (ts *Timespec) UnmarshalUnsafe(src []byte) []byte { size := ts.SizeBytes() gohacks.Memmove(unsafe.Pointer(ts), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (ts *Timespec) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(ts))) hdr.Len = ts.SizeBytes() hdr.Cap = ts.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that ts // must live until the use above. runtime.KeepAlive(ts) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (ts *Timespec) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return ts.CopyOutN(cc, addr, ts.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (ts *Timespec) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(ts))) hdr.Len = ts.SizeBytes() hdr.Cap = ts.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that ts // must live until the use above. runtime.KeepAlive(ts) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (ts *Timespec) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return ts.CopyInN(cc, addr, ts.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (ts *Timespec) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(ts))) hdr.Len = ts.SizeBytes() hdr.Cap = ts.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that ts // must live until the use above. runtime.KeepAlive(ts) // escapes: replaced by intrinsic. return int64(length), err } // CopyTimespecSliceIn copies in a slice of Timespec objects from the task's memory. func CopyTimespecSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []Timespec) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Timespec)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyTimespecSliceOut copies a slice of Timespec objects to the task's memory. 
func CopyTimespecSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []Timespec) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*Timespec)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeTimespecSlice is like Timespec.MarshalUnsafe, but for a []Timespec. func MarshalUnsafeTimespecSlice(src []Timespec, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*Timespec)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeTimespecSlice is like Timespec.UnmarshalUnsafe, but for a []Timespec. func UnmarshalUnsafeTimespecSlice(dst []Timespec, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*Timespec)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (tv *Timeval) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (tv *Timeval) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(tv.Sec)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(tv.Usec)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (tv *Timeval) UnmarshalBytes(src []byte) []byte { tv.Sec = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] tv.Usec = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (tv *Timeval) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (tv *Timeval) MarshalUnsafe(dst []byte) []byte { size := tv.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(tv), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (tv *Timeval) UnmarshalUnsafe(src []byte) []byte { size := tv.SizeBytes() gohacks.Memmove(unsafe.Pointer(tv), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (tv *Timeval) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(tv))) hdr.Len = tv.SizeBytes() hdr.Cap = tv.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that tv // must live until the use above. runtime.KeepAlive(tv) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (tv *Timeval) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return tv.CopyOutN(cc, addr, tv.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
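// Illustrative usage sketch, not part of the generated code: the
// *TimespecSlice helpers above treat a whole []Timespec as one contiguous
// byte region, so a batch of packed elements moves with a single memmove
// rather than one call per element. Assumes the public import path
// gvisor.dev/gvisor/pkg/abi/linux.
//
//	package main
//
//	import (
//		"fmt"
//
//		"gvisor.dev/gvisor/pkg/abi/linux"
//	)
//
//	func main() {
//		tss := []linux.Timespec{{Sec: 1}, {Sec: 2, Nsec: 3}}
//		elem := (*linux.Timespec)(nil).SizeBytes() // 16; SizeBytes reads no receiver state
//		buf := make([]byte, elem*len(tss))
//		linux.MarshalUnsafeTimespecSlice(tss, buf) // both elements, back to back
//
//		var back [2]linux.Timespec
//		linux.UnmarshalUnsafeTimespecSlice(back[:], buf)
//		fmt.Println(back[0].Sec, back[1].Nsec) // 1 3
//	}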
func (tv *Timeval) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(tv))) hdr.Len = tv.SizeBytes() hdr.Cap = tv.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that tv // must live until the use above. runtime.KeepAlive(tv) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (tv *Timeval) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return tv.CopyInN(cc, addr, tv.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (tv *Timeval) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(tv))) hdr.Len = tv.SizeBytes() hdr.Cap = tv.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that tv // must live until the use above. runtime.KeepAlive(tv) // escapes: replaced by intrinsic. return int64(length), err } // CopyTimevalSliceIn copies in a slice of Timeval objects from the task's memory. func CopyTimevalSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []Timeval) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Timeval)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyTimevalSliceOut copies a slice of Timeval objects to the task's memory. func CopyTimevalSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []Timeval) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*Timeval)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeTimevalSlice is like Timeval.MarshalUnsafe, but for a []Timeval. func MarshalUnsafeTimevalSlice(src []Timeval, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*Timeval)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeTimevalSlice is like Timeval.UnmarshalUnsafe, but for a []Timeval. 
func UnmarshalUnsafeTimevalSlice(dst []Timeval, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*Timeval)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (t *Tms) SizeBytes() int { return 0 + (*ClockT)(nil).SizeBytes() + (*ClockT)(nil).SizeBytes() + (*ClockT)(nil).SizeBytes() + (*ClockT)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (t *Tms) MarshalBytes(dst []byte) []byte { dst = t.UTime.MarshalUnsafe(dst) dst = t.STime.MarshalUnsafe(dst) dst = t.CUTime.MarshalUnsafe(dst) dst = t.CSTime.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (t *Tms) UnmarshalBytes(src []byte) []byte { src = t.UTime.UnmarshalUnsafe(src) src = t.STime.UnmarshalUnsafe(src) src = t.CUTime.UnmarshalUnsafe(src) src = t.CSTime.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (t *Tms) Packed() bool { return t.CSTime.Packed() && t.CUTime.Packed() && t.STime.Packed() && t.UTime.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (t *Tms) MarshalUnsafe(dst []byte) []byte { if t.CSTime.Packed() && t.CUTime.Packed() && t.STime.Packed() && t.UTime.Packed() { size := t.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(t), uintptr(size)) return dst[size:] } // Type Tms doesn't have a packed layout in memory, fallback to MarshalBytes. return t.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (t *Tms) UnmarshalUnsafe(src []byte) []byte { if t.CSTime.Packed() && t.CUTime.Packed() && t.STime.Packed() && t.UTime.Packed() { size := t.SizeBytes() gohacks.Memmove(unsafe.Pointer(t), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type Tms doesn't have a packed layout in memory, fallback to UnmarshalBytes. return t.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (t *Tms) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !t.CSTime.Packed() && t.CUTime.Packed() && t.STime.Packed() && t.UTime.Packed() { // Type Tms doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(t.SizeBytes()) // escapes: okay. t.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (t *Tms) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return t.CopyOutN(cc, addr, t.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (t *Tms) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !t.CSTime.Packed() && t.CUTime.Packed() && t.STime.Packed() && t.UTime.Packed() { // Type Tms doesn't have a packed layout in memory, fall back to UnmarshalBytes. 
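// This scratch-buffer path is the generic non-packed fallback that
// tools/go_marshal emits: a type counts as packed when its in-memory layout
// has no implicit padding, in which case the whole struct can be moved with
// one gohacks.Memmove; otherwise the bytes are staged in a scratch buffer and
// decoded field by field with UnmarshalBytes. The dispatch in miniature, as a
// sketch written against the marshal.Marshallable interface used throughout
// this file:
//
//	func unmarshalFrom(m marshal.Marshallable, src []byte) {
//		if m.Packed() {
//			m.UnmarshalUnsafe(src) // one memmove over the whole struct
//		} else {
//			m.UnmarshalBytes(src) // field-by-field, padding-aware
//		}
//	}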
buf := cc.CopyScratchBuffer(t.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. t.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (t *Tms) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return t.CopyInN(cc, addr, t.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (t *Tms) WriteTo(writer io.Writer) (int64, error) { if !t.CSTime.Packed() && t.CUTime.Packed() && t.STime.Packed() && t.UTime.Packed() { // Type Tms doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, t.SizeBytes()) t.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *Utime) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *Utime) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Actime)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Modtime)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *Utime) UnmarshalBytes(src []byte) []byte { u.Actime = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Modtime = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *Utime) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *Utime) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *Utime) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *Utime) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. 
runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *Utime) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *Utime) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *Utime) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *Utime) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (t *Termios) SizeBytes() int { return 17 + 1*NumControlCharacters } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (t *Termios) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.InputFlags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.OutputFlags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.ControlFlags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(t.LocalFlags)) dst = dst[4:] dst[0] = byte(t.LineDiscipline) dst = dst[1:] for idx := 0; idx < NumControlCharacters; idx++ { dst[0] = byte(t.ControlCharacters[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (t *Termios) UnmarshalBytes(src []byte) []byte { t.InputFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.OutputFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.ControlFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.LocalFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] t.LineDiscipline = uint8(src[0]) src = src[1:] for idx := 0; idx < NumControlCharacters; idx++ { t.ControlCharacters[idx] = uint8(src[0]) src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (t *Termios) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (t *Termios) MarshalUnsafe(dst []byte) []byte { size := t.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(t), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (t *Termios) UnmarshalUnsafe(src []byte) []byte { size := t.SizeBytes() gohacks.Memmove(unsafe.Pointer(t), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
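// Termios above follows the asm-generic kernel termios layout: four 32-bit
// flag words, a one-byte line discipline, and NumControlCharacters (19)
// control characters, so SizeBytes() = 17 + 19 = 36 with no padding and
// Packed() stays true. A fragment sketch (assumed to sit inside a function,
// with the gvisor.dev/gvisor/pkg/abi/linux import path):
//
//	var t linux.Termios
//	t.ControlCharacters[6] = 1 // index 6 is VMIN in the Linux numbering (hypothetical tweak)
//	buf := make([]byte, t.SizeBytes())
//	t.MarshalUnsafe(buf) // 36 bytes, ready for a TCSETS-style ioctl payload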
func (t *Termios) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (t *Termios) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return t.CopyOutN(cc, addr, t.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (t *Termios) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (t *Termios) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return t.CopyInN(cc, addr, t.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (t *Termios) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(t))) hdr.Len = t.SizeBytes() hdr.Cap = t.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that t // must live until the use above. runtime.KeepAlive(t) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (w *WindowSize) SizeBytes() int { return 4 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (w *WindowSize) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(w.Rows)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(w.Cols)) dst = dst[2:] // Padding: dst[:sizeof(byte)*4] ~= [4]byte{0} dst = dst[1*(4):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (w *WindowSize) UnmarshalBytes(src []byte) []byte { w.Rows = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] w.Cols = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] // Padding: ~ copy([4]byte(w._), src[:sizeof(byte)*4]) src = src[1*(4):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (w *WindowSize) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (w *WindowSize) MarshalUnsafe(dst []byte) []byte { size := w.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(w), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (w *WindowSize) UnmarshalUnsafe(src []byte) []byte { size := w.SizeBytes() gohacks.Memmove(unsafe.Pointer(w), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
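// A note on the padding handling above: WindowSize carries 4 trailing padding
// bytes, and MarshalBytes/UnmarshalBytes step over them explicitly
// (dst = dst[1*(4):]) so the hand-written encoders stay in lockstep with
// SizeBytes() = 4 + 1*4 = 8. A quick self-check fragment, assuming the
// gvisor.dev/gvisor/pkg/abi/linux import path:
//
//	w := linux.WindowSize{Rows: 24, Cols: 80}
//	rest := w.MarshalBytes(make([]byte, w.SizeBytes()))
//	if len(rest) != 0 { // MarshalBytes returns the unconsumed tail
//		panic("encoded length disagrees with SizeBytes")
//	}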
func (w *WindowSize) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(w))) hdr.Len = w.SizeBytes() hdr.Cap = w.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that w // must live until the use above. runtime.KeepAlive(w) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (w *WindowSize) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return w.CopyOutN(cc, addr, w.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (w *WindowSize) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(w))) hdr.Len = w.SizeBytes() hdr.Cap = w.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that w // must live until the use above. runtime.KeepAlive(w) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (w *WindowSize) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return w.CopyInN(cc, addr, w.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (w *WindowSize) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(w))) hdr.Len = w.SizeBytes() hdr.Cap = w.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that w // must live until the use above. runtime.KeepAlive(w) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (w *Winsize) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (w *Winsize) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(w.Row)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(w.Col)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(w.Xpixel)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(w.Ypixel)) dst = dst[2:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (w *Winsize) UnmarshalBytes(src []byte) []byte { w.Row = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] w.Col = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] w.Xpixel = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] w.Ypixel = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (w *Winsize) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (w *Winsize) MarshalUnsafe(dst []byte) []byte { size := w.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(w), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (w *Winsize) UnmarshalUnsafe(src []byte) []byte { size := w.SizeBytes() gohacks.Memmove(unsafe.Pointer(w), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (w *Winsize) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(w))) hdr.Len = w.SizeBytes() hdr.Cap = w.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that w // must live until the use above. runtime.KeepAlive(w) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (w *Winsize) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return w.CopyOutN(cc, addr, w.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (w *Winsize) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(w))) hdr.Len = w.SizeBytes() hdr.Cap = w.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that w // must live until the use above. runtime.KeepAlive(w) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (w *Winsize) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return w.CopyInN(cc, addr, w.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (w *Winsize) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(w))) hdr.Len = w.SizeBytes() hdr.Cap = w.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that w // must live until the use above. runtime.KeepAlive(w) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UtsName) SizeBytes() int { return 0 + 1*(UTSLen+1) + 1*(UTSLen+1) + 1*(UTSLen+1) + 1*(UTSLen+1) + 1*(UTSLen+1) + 1*(UTSLen+1) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UtsName) MarshalBytes(dst []byte) []byte { for idx := 0; idx < (UTSLen+1); idx++ { dst[0] = byte(u.Sysname[idx]) dst = dst[1:] } for idx := 0; idx < (UTSLen+1); idx++ { dst[0] = byte(u.Nodename[idx]) dst = dst[1:] } for idx := 0; idx < (UTSLen+1); idx++ { dst[0] = byte(u.Release[idx]) dst = dst[1:] } for idx := 0; idx < (UTSLen+1); idx++ { dst[0] = byte(u.Version[idx]) dst = dst[1:] } for idx := 0; idx < (UTSLen+1); idx++ { dst[0] = byte(u.Machine[idx]) dst = dst[1:] } for idx := 0; idx < (UTSLen+1); idx++ { dst[0] = byte(u.Domainname[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
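// UtsName above mirrors the kernel's struct new_utsname: six fixed-width,
// NUL-padded character arrays of UTSLen+1 (65) bytes each, 390 bytes in
// total, which is why marshalling is a plain byte-for-byte copy. A small
// sketch for turning one field back into a Go string (assumes the
// gvisor.dev/gvisor/pkg/abi/linux import path and the bytes package):
//
//	func utsField(b [linux.UTSLen + 1]byte) string {
//		if i := bytes.IndexByte(b[:], 0); i >= 0 {
//			return string(b[:i])
//		}
//		return string(b[:])
//	}
//
// For a typical record, utsField(u.Sysname) would yield something like "Linux".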
func (u *UtsName) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < (UTSLen+1); idx++ { u.Sysname[idx] = src[0] src = src[1:] } for idx := 0; idx < (UTSLen+1); idx++ { u.Nodename[idx] = src[0] src = src[1:] } for idx := 0; idx < (UTSLen+1); idx++ { u.Release[idx] = src[0] src = src[1:] } for idx := 0; idx < (UTSLen+1); idx++ { u.Version[idx] = src[0] src = src[1:] } for idx := 0; idx < (UTSLen+1); idx++ { u.Machine[idx] = src[0] src = src[1:] } for idx := 0; idx < (UTSLen+1); idx++ { u.Domainname[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UtsName) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UtsName) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UtsName) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UtsName) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UtsName) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UtsName) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UtsName) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UtsName) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (v *VFIODeviceInfo) SizeBytes() int { return 24 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (v *VFIODeviceInfo) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Argsz)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.NumRegions)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.NumIrqs)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.CapOffset)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.pad)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (v *VFIODeviceInfo) UnmarshalBytes(src []byte) []byte { v.Argsz = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.NumRegions = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.NumIrqs = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.CapOffset = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.pad = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (v *VFIODeviceInfo) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (v *VFIODeviceInfo) MarshalUnsafe(dst []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(v), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (v *VFIODeviceInfo) UnmarshalUnsafe(src []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(v), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (v *VFIODeviceInfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (v *VFIODeviceInfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyOutN(cc, addr, v.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (v *VFIODeviceInfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (v *VFIODeviceInfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyInN(cc, addr, v.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (v *VFIODeviceInfo) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (v *VFIOIommuType1DmaMap) SizeBytes() int { return 32 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (v *VFIOIommuType1DmaMap) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Argsz)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.Vaddr)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.IOVa)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.Size)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (v *VFIOIommuType1DmaMap) UnmarshalBytes(src []byte) []byte { v.Argsz = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Vaddr = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] v.IOVa = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] v.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (v *VFIOIommuType1DmaMap) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (v *VFIOIommuType1DmaMap) MarshalUnsafe(dst []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(v), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (v *VFIOIommuType1DmaMap) UnmarshalUnsafe(src []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(v), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (v *VFIOIommuType1DmaMap) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (v *VFIOIommuType1DmaMap) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyOutN(cc, addr, v.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (v *VFIOIommuType1DmaMap) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. 
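// Why the KeepAlive just above is load-bearing: hdr.Data holds only a
// uintptr, which the garbage collector does not treat as a reference, and
// gohacks.Noescape has already hidden the pointer from escape analysis, so
// without runtime.KeepAlive the object could in principle be reclaimed before
// CopyInBytes finished reading through the aliased buffer. The hazard in
// miniature, using a hypothetical helper readThrough that consumes a raw
// address:
//
//	p := new(VFIOIommuType1DmaMap)
//	raw := uintptr(unsafe.Pointer(p)) // p is no longer referenced after this line
//	readThrough(raw)                  // may observe reclaimed memory if p was collected
//	runtime.KeepAlive(p)              // pins p until this point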
return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (v *VFIOIommuType1DmaMap) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyInN(cc, addr, v.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (v *VFIOIommuType1DmaMap) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (v *VFIOIommuType1DmaUnmap) SizeBytes() int { return 24 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (v *VFIOIommuType1DmaUnmap) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Argsz)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.IOVa)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.Size)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (v *VFIOIommuType1DmaUnmap) UnmarshalBytes(src []byte) []byte { v.Argsz = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.IOVa = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] v.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (v *VFIOIommuType1DmaUnmap) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (v *VFIOIommuType1DmaUnmap) MarshalUnsafe(dst []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(v), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (v *VFIOIommuType1DmaUnmap) UnmarshalUnsafe(src []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(v), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (v *VFIOIommuType1DmaUnmap) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (v *VFIOIommuType1DmaUnmap) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyOutN(cc, addr, v.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (v *VFIOIommuType1DmaUnmap) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (v *VFIOIommuType1DmaUnmap) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyInN(cc, addr, v.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (v *VFIOIommuType1DmaUnmap) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (v *VFIOIrqInfo) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (v *VFIOIrqInfo) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Argsz)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Index)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Count)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (v *VFIOIrqInfo) UnmarshalBytes(src []byte) []byte { v.Argsz = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Index = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Count = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (v *VFIOIrqInfo) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (v *VFIOIrqInfo) MarshalUnsafe(dst []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(v), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (v *VFIOIrqInfo) UnmarshalUnsafe(src []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(v), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (v *VFIOIrqInfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
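// Usage note, illustrative only: the VFIO ioctl ABI expects the Argsz field
// of these structs to be pre-set to the caller's struct size so the kernel
// knows how many bytes were supplied. A minimal fragment (assumes the
// gvisor.dev/gvisor/pkg/abi/linux import path; the ioctl plumbing itself is
// out of scope here):
//
//	var info linux.VFIOIrqInfo
//	info.Argsz = uint32(info.SizeBytes()) // 16 for this layout
//	info.Index = 0                        // hypothetical: query IRQ index 0
//	buf := make([]byte, info.SizeBytes())
//	info.MarshalUnsafe(buf) // payload for a VFIO_DEVICE_GET_IRQ_INFO ioctl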
func (v *VFIOIrqInfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyOutN(cc, addr, v.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (v *VFIOIrqInfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (v *VFIOIrqInfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyInN(cc, addr, v.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (v *VFIOIrqInfo) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (v *VFIOIrqSet) SizeBytes() int { return 20 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (v *VFIOIrqSet) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Argsz)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Index)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Start)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Count)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (v *VFIOIrqSet) UnmarshalBytes(src []byte) []byte { v.Argsz = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Index = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Start = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Count = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (v *VFIOIrqSet) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (v *VFIOIrqSet) MarshalUnsafe(dst []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(v), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (v *VFIOIrqSet) UnmarshalUnsafe(src []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(v), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (v *VFIOIrqSet) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (v *VFIOIrqSet) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyOutN(cc, addr, v.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (v *VFIOIrqSet) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (v *VFIOIrqSet) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyInN(cc, addr, v.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (v *VFIOIrqSet) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (v *VFIORegionInfo) SizeBytes() int { return 32 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (v *VFIORegionInfo) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Argsz)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.Index)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(v.capOffset)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.Offset)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (v *VFIORegionInfo) UnmarshalBytes(src []byte) []byte { v.Argsz = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Index = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.capOffset = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] v.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] v.Offset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (v *VFIORegionInfo) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (v *VFIORegionInfo) MarshalUnsafe(dst []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(v), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (v *VFIORegionInfo) UnmarshalUnsafe(src []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(v), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (v *VFIORegionInfo) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (v *VFIORegionInfo) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyOutN(cc, addr, v.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (v *VFIORegionInfo) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (v *VFIORegionInfo) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyInN(cc, addr, v.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (v *VFIORegionInfo) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/linux_amd64_abi_autogen_unsafe.go000066400000000000000000000664611465435605700277120ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build amd64 && amd64 && amd64 && amd64 && amd64 && amd64 // +build amd64,amd64,amd64,amd64,amd64,amd64 package linux import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. 
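// The `var _ marshal.Marshallable = (*T)(nil)` lines that follow are
// compile-time interface assertions: they cost nothing at runtime but break
// the build if any listed type stops satisfying marshal.Marshallable. The
// same idiom in miniature, with a hypothetical type Foo:
//
//	type Foo struct{ X uint32 }
//
//	var _ fmt.Stringer = (*Foo)(nil) // fails to compile until *Foo has String() string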
var _ marshal.Marshallable = (*EpollEvent)(nil) var _ marshal.Marshallable = (*IPCPerm)(nil) var _ marshal.Marshallable = (*PtraceRegs)(nil) var _ marshal.Marshallable = (*SemidDS)(nil) var _ marshal.Marshallable = (*Stat)(nil) var _ marshal.Marshallable = (*TimeT)(nil) var _ marshal.Marshallable = (*Timespec)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. func (e *EpollEvent) SizeBytes() int { return 4 + 4*2 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (e *EpollEvent) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Events)) dst = dst[4:] for idx := 0; idx < 2; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Data[idx])) dst = dst[4:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (e *EpollEvent) UnmarshalBytes(src []byte) []byte { e.Events = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 2; idx++ { e.Data[idx] = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (e *EpollEvent) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (e *EpollEvent) MarshalUnsafe(dst []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(e), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (e *EpollEvent) UnmarshalUnsafe(src []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(e), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (e *EpollEvent) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (e *EpollEvent) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyOutN(cc, addr, e.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (e *EpollEvent) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (e *EpollEvent) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyInN(cc, addr, e.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (e *EpollEvent) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
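// The three header assignments below build a []byte that aliases the struct's
// own memory: a zero-value slice header is overwritten so Data points at the
// object and Len/Cap equal SizeBytes(). Nothing is allocated or copied; the
// Write that follows reads straight out of the struct. On Go 1.17 and later,
// unsafe.Slice expresses the same view in one line; shown here only for
// comparison, the generated code keeps the reflect.SliceHeader form:
//
//	buf := unsafe.Slice((*byte)(unsafe.Pointer(e)), e.SizeBytes())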
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return int64(length), err } // CopyEpollEventSliceIn copies in a slice of EpollEvent objects from the task's memory. func CopyEpollEventSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []EpollEvent) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*EpollEvent)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyEpollEventSliceOut copies a slice of EpollEvent objects to the task's memory. func CopyEpollEventSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []EpollEvent) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*EpollEvent)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeEpollEventSlice is like EpollEvent.MarshalUnsafe, but for a []EpollEvent. func MarshalUnsafeEpollEventSlice(src []EpollEvent, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*EpollEvent)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeEpollEventSlice is like EpollEvent.UnmarshalUnsafe, but for a []EpollEvent. func UnmarshalUnsafeEpollEventSlice(dst []EpollEvent, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*EpollEvent)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *Stat) SizeBytes() int { return 72 + (*Timespec)(nil).SizeBytes() + (*Timespec)(nil).SizeBytes() + (*Timespec)(nil).SizeBytes() + 8*3 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (s *Stat) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Dev)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Ino)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Nlink)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Mode)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.UID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.GID)) dst = dst[4:] // Padding: dst[:sizeof(int32)] ~= int32(0) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Rdev)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Blksize)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Blocks)) dst = dst[8:] dst = s.ATime.MarshalUnsafe(dst) dst = s.MTime.MarshalUnsafe(dst) dst = s.CTime.MarshalUnsafe(dst) // Padding: dst[:sizeof(int64)*3] ~= [3]int64{0} dst = dst[8*(3):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *Stat) UnmarshalBytes(src []byte) []byte { s.Dev = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Ino = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Nlink = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Mode = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.UID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.GID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ int32 ~= src[:sizeof(int32)] src = src[4:] s.Rdev = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Size = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Blksize = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Blocks = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = s.ATime.UnmarshalUnsafe(src) src = s.MTime.UnmarshalUnsafe(src) src = s.CTime.UnmarshalUnsafe(src) // Padding: ~ copy([3]int64(s._), src[:sizeof(int64)*3]) src = src[8*(3):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *Stat) Packed() bool { return s.ATime.Packed() && s.CTime.Packed() && s.MTime.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *Stat) MarshalUnsafe(dst []byte) []byte { if s.ATime.Packed() && s.CTime.Packed() && s.MTime.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type Stat doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *Stat) UnmarshalUnsafe(src []byte) []byte { if s.ATime.Packed() && s.CTime.Packed() && s.MTime.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type Stat doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *Stat) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.ATime.Packed() && s.CTime.Packed() && s.MTime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *Stat) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *Stat) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.ATime.Packed() && s.CTime.Packed() && s.MTime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *Stat) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *Stat) WriteTo(writer io.Writer) (int64, error) { if !s.ATime.Packed() && s.CTime.Packed() && s.MTime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *PtraceRegs) SizeBytes() int { return 216 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (p *PtraceRegs) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.R15)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.R14)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.R13)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.R12)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Rbp)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Rbx)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.R11)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.R10)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.R9)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.R8)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Rax)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Rcx)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Rdx)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Rsi)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Rdi)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Orig_rax)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Rip)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Cs)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Eflags)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Rsp)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Ss)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Fs_base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Gs_base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Ds)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Es)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Fs)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Gs)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (p *PtraceRegs) UnmarshalBytes(src []byte) []byte { p.R15 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.R14 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.R13 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.R12 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Rbp = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Rbx = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.R11 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.R10 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.R9 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.R8 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Rax = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Rcx = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Rdx = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Rsi = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Rdi = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Orig_rax = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Rip = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Cs = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Eflags = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Rsp = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Ss = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Fs_base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Gs_base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Ds = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Es = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Fs = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Gs = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *PtraceRegs) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *PtraceRegs) MarshalUnsafe(dst []byte) []byte { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *PtraceRegs) UnmarshalUnsafe(src []byte) []byte { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *PtraceRegs) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *PtraceRegs) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *PtraceRegs) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *PtraceRegs) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *PtraceRegs) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SemidDS) SizeBytes() int { return 40 + (*IPCPerm)(nil).SizeBytes() + (*TimeT)(nil).SizeBytes() + (*TimeT)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SemidDS) MarshalBytes(dst []byte) []byte { dst = s.SemPerm.MarshalUnsafe(dst) dst = s.SemOTime.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.unused1)) dst = dst[8:] dst = s.SemCTime.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.unused2)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.SemNSems)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.unused3)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.unused4)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SemidDS) UnmarshalBytes(src []byte) []byte { src = s.SemPerm.UnmarshalUnsafe(src) src = s.SemOTime.UnmarshalUnsafe(src) s.unused1 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = s.SemCTime.UnmarshalUnsafe(src) s.unused2 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.SemNSems = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.unused3 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.unused4 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SemidDS) Packed() bool { return s.SemCTime.Packed() && s.SemOTime.Packed() && s.SemPerm.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SemidDS) MarshalUnsafe(dst []byte) []byte { if s.SemCTime.Packed() && s.SemOTime.Packed() && s.SemPerm.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type SemidDS doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SemidDS) UnmarshalUnsafe(src []byte) []byte { if s.SemCTime.Packed() && s.SemOTime.Packed() && s.SemPerm.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type SemidDS doesn't have a packed layout in memory, fallback to UnmarshalBytes. 
return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SemidDS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.SemCTime.Packed() && s.SemOTime.Packed() && s.SemPerm.Packed() { // Type SemidDS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SemidDS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SemidDS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.SemCTime.Packed() && s.SemOTime.Packed() && s.SemPerm.Packed() { // Type SemidDS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SemidDS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SemidDS) WriteTo(writer io.Writer) (int64, error) { if !s.SemCTime.Packed() && s.SemOTime.Packed() && s.SemPerm.Packed() { // Type SemidDS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/linux_amd64_state_autogen.go000066400000000000000000000054601465435605700267260ustar00rootroot00000000000000// automatically generated by stateify. 
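// stateify emits the StateTypeName/StateFields/StateSave/StateLoad methods
// consumed by gVisor's save/restore (checkpoint) machinery in pkg/state.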
//go:build amd64 && amd64 && amd64 && amd64 && amd64 && amd64 // +build amd64,amd64,amd64,amd64,amd64,amd64 package linux import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *PtraceRegs) StateTypeName() string { return "pkg/abi/linux.PtraceRegs" } func (p *PtraceRegs) StateFields() []string { return []string{ "R15", "R14", "R13", "R12", "Rbp", "Rbx", "R11", "R10", "R9", "R8", "Rax", "Rcx", "Rdx", "Rsi", "Rdi", "Orig_rax", "Rip", "Cs", "Eflags", "Rsp", "Ss", "Fs_base", "Gs_base", "Ds", "Es", "Fs", "Gs", } } func (p *PtraceRegs) beforeSave() {} // +checklocksignore func (p *PtraceRegs) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.R15) stateSinkObject.Save(1, &p.R14) stateSinkObject.Save(2, &p.R13) stateSinkObject.Save(3, &p.R12) stateSinkObject.Save(4, &p.Rbp) stateSinkObject.Save(5, &p.Rbx) stateSinkObject.Save(6, &p.R11) stateSinkObject.Save(7, &p.R10) stateSinkObject.Save(8, &p.R9) stateSinkObject.Save(9, &p.R8) stateSinkObject.Save(10, &p.Rax) stateSinkObject.Save(11, &p.Rcx) stateSinkObject.Save(12, &p.Rdx) stateSinkObject.Save(13, &p.Rsi) stateSinkObject.Save(14, &p.Rdi) stateSinkObject.Save(15, &p.Orig_rax) stateSinkObject.Save(16, &p.Rip) stateSinkObject.Save(17, &p.Cs) stateSinkObject.Save(18, &p.Eflags) stateSinkObject.Save(19, &p.Rsp) stateSinkObject.Save(20, &p.Ss) stateSinkObject.Save(21, &p.Fs_base) stateSinkObject.Save(22, &p.Gs_base) stateSinkObject.Save(23, &p.Ds) stateSinkObject.Save(24, &p.Es) stateSinkObject.Save(25, &p.Fs) stateSinkObject.Save(26, &p.Gs) } func (p *PtraceRegs) afterLoad(context.Context) {} // +checklocksignore func (p *PtraceRegs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.R15) stateSourceObject.Load(1, &p.R14) stateSourceObject.Load(2, &p.R13) stateSourceObject.Load(3, &p.R12) stateSourceObject.Load(4, &p.Rbp) stateSourceObject.Load(5, &p.Rbx) stateSourceObject.Load(6, &p.R11) stateSourceObject.Load(7, &p.R10) stateSourceObject.Load(8, &p.R9) stateSourceObject.Load(9, &p.R8) stateSourceObject.Load(10, &p.Rax) stateSourceObject.Load(11, &p.Rcx) stateSourceObject.Load(12, &p.Rdx) stateSourceObject.Load(13, &p.Rsi) stateSourceObject.Load(14, &p.Rdi) stateSourceObject.Load(15, &p.Orig_rax) stateSourceObject.Load(16, &p.Rip) stateSourceObject.Load(17, &p.Cs) stateSourceObject.Load(18, &p.Eflags) stateSourceObject.Load(19, &p.Rsp) stateSourceObject.Load(20, &p.Ss) stateSourceObject.Load(21, &p.Fs_base) stateSourceObject.Load(22, &p.Gs_base) stateSourceObject.Load(23, &p.Ds) stateSourceObject.Load(24, &p.Es) stateSourceObject.Load(25, &p.Fs) stateSourceObject.Load(26, &p.Gs) } func init() { state.Register((*PtraceRegs)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/linux_arm64_abi_autogen_unsafe.go000066400000000000000000000600401465435605700277130ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). 
//go:build arm64 && arm64 && arm64 && arm64 && arm64 // +build arm64,arm64,arm64,arm64,arm64 package linux import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*EpollEvent)(nil) var _ marshal.Marshallable = (*IPCPerm)(nil) var _ marshal.Marshallable = (*PtraceRegs)(nil) var _ marshal.Marshallable = (*SemidDS)(nil) var _ marshal.Marshallable = (*Stat)(nil) var _ marshal.Marshallable = (*TimeT)(nil) var _ marshal.Marshallable = (*Timespec)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. func (e *EpollEvent) SizeBytes() int { return 8 + 4*2 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (e *EpollEvent) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Events)) dst = dst[4:] // Padding: dst[:sizeof(int32)] ~= int32(0) dst = dst[4:] for idx := 0; idx < 2; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.Data[idx])) dst = dst[4:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (e *EpollEvent) UnmarshalBytes(src []byte) []byte { e.Events = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ int32 ~= src[:sizeof(int32)] src = src[4:] for idx := 0; idx < 2; idx++ { e.Data[idx] = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (e *EpollEvent) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (e *EpollEvent) MarshalUnsafe(dst []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(e), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (e *EpollEvent) UnmarshalUnsafe(src []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(e), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (e *EpollEvent) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (e *EpollEvent) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyOutN(cc, addr, e.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (e *EpollEvent) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
func (e *EpollEvent) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyInN(cc, addr, e.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (e *EpollEvent) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return int64(length), err } // CopyEpollEventSliceIn copies in a slice of EpollEvent objects from the task's memory. func CopyEpollEventSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []EpollEvent) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*EpollEvent)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyEpollEventSliceOut copies a slice of EpollEvent objects to the task's memory. func CopyEpollEventSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []EpollEvent) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*EpollEvent)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeEpollEventSlice is like EpollEvent.MarshalUnsafe, but for a []EpollEvent. func MarshalUnsafeEpollEventSlice(src []EpollEvent, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*EpollEvent)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeEpollEventSlice is like EpollEvent.UnmarshalUnsafe, but for a []EpollEvent. func UnmarshalUnsafeEpollEventSlice(dst []EpollEvent, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*EpollEvent)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *Stat) SizeBytes() int { return 72 + (*Timespec)(nil).SizeBytes() + (*Timespec)(nil).SizeBytes() + (*Timespec)(nil).SizeBytes() + 4*2 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (s *Stat) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Dev)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Ino)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Mode)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Nlink)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.UID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.GID)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Rdev)) dst = dst[8:] // Padding: dst[:sizeof(uint64)] ~= uint64(0) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Blksize)) dst = dst[4:] // Padding: dst[:sizeof(int32)] ~= int32(0) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Blocks)) dst = dst[8:] dst = s.ATime.MarshalUnsafe(dst) dst = s.MTime.MarshalUnsafe(dst) dst = s.CTime.MarshalUnsafe(dst) // Padding: dst[:sizeof(int32)*2] ~= [2]int32{0} dst = dst[4*(2):] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *Stat) UnmarshalBytes(src []byte) []byte { s.Dev = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Ino = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Mode = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Nlink = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.UID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.GID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Rdev = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] // Padding: var _ uint64 ~= src[:sizeof(uint64)] src = src[8:] s.Size = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Blksize = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ int32 ~= src[:sizeof(int32)] src = src[4:] s.Blocks = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = s.ATime.UnmarshalUnsafe(src) src = s.MTime.UnmarshalUnsafe(src) src = s.CTime.UnmarshalUnsafe(src) // Padding: ~ copy([2]int32(s._), src[:sizeof(int32)*2]) src = src[4*(2):] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *Stat) Packed() bool { return s.ATime.Packed() && s.CTime.Packed() && s.MTime.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *Stat) MarshalUnsafe(dst []byte) []byte { if s.ATime.Packed() && s.CTime.Packed() && s.MTime.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type Stat doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *Stat) UnmarshalUnsafe(src []byte) []byte { if s.ATime.Packed() && s.CTime.Packed() && s.MTime.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type Stat doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *Stat) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.ATime.Packed() && s.CTime.Packed() && s.MTime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. 
} // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *Stat) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *Stat) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.ATime.Packed() && s.CTime.Packed() && s.MTime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *Stat) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *Stat) WriteTo(writer io.Writer) (int64, error) { if !s.ATime.Packed() && s.CTime.Packed() && s.MTime.Packed() { // Type Stat doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *PtraceRegs) SizeBytes() int { return 24 + 8*31 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *PtraceRegs) MarshalBytes(dst []byte) []byte { for idx := 0; idx < 31; idx++ { hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Regs[idx])) dst = dst[8:] } hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Sp)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Pc)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Pstate)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (p *PtraceRegs) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < 31; idx++ { p.Regs[idx] = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] } p.Sp = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Pc = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Pstate = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *PtraceRegs) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *PtraceRegs) MarshalUnsafe(dst []byte) []byte { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *PtraceRegs) UnmarshalUnsafe(src []byte) []byte { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *PtraceRegs) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *PtraceRegs) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *PtraceRegs) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *PtraceRegs) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *PtraceRegs) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SemidDS) SizeBytes() int { return 24 + (*IPCPerm)(nil).SizeBytes() + (*TimeT)(nil).SizeBytes() + (*TimeT)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (s *SemidDS) MarshalBytes(dst []byte) []byte { dst = s.SemPerm.MarshalUnsafe(dst) dst = s.SemOTime.MarshalUnsafe(dst) dst = s.SemCTime.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.SemNSems)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.unused3)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.unused4)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SemidDS) UnmarshalBytes(src []byte) []byte { src = s.SemPerm.UnmarshalUnsafe(src) src = s.SemOTime.UnmarshalUnsafe(src) src = s.SemCTime.UnmarshalUnsafe(src) s.SemNSems = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.unused3 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.unused4 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SemidDS) Packed() bool { return s.SemCTime.Packed() && s.SemOTime.Packed() && s.SemPerm.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SemidDS) MarshalUnsafe(dst []byte) []byte { if s.SemCTime.Packed() && s.SemOTime.Packed() && s.SemPerm.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type SemidDS doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SemidDS) UnmarshalUnsafe(src []byte) []byte { if s.SemCTime.Packed() && s.SemOTime.Packed() && s.SemPerm.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type SemidDS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SemidDS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.SemCTime.Packed() && s.SemOTime.Packed() && s.SemPerm.Packed() { // Type SemidDS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SemidDS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SemidDS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.SemCTime.Packed() && s.SemOTime.Packed() && s.SemPerm.Packed() { // Type SemidDS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. 
s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SemidDS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SemidDS) WriteTo(writer io.Writer) (int64, error) { if !s.SemCTime.Packed() && s.SemOTime.Packed() && s.SemPerm.Packed() { // Type SemidDS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/linux_arm64_state_autogen.go000066400000000000000000000020021465435605700267310ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 && arm64 && arm64 && arm64 && arm64 // +build arm64,arm64,arm64,arm64,arm64 package linux import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *PtraceRegs) StateTypeName() string { return "pkg/abi/linux.PtraceRegs" } func (p *PtraceRegs) StateFields() []string { return []string{ "Regs", "Sp", "Pc", "Pstate", } } func (p *PtraceRegs) beforeSave() {} // +checklocksignore func (p *PtraceRegs) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.Regs) stateSinkObject.Save(1, &p.Sp) stateSinkObject.Save(2, &p.Pc) stateSinkObject.Save(3, &p.Pstate) } func (p *PtraceRegs) afterLoad(context.Context) {} // +checklocksignore func (p *PtraceRegs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.Regs) stateSourceObject.Load(1, &p.Sp) stateSourceObject.Load(2, &p.Pc) stateSourceObject.Load(3, &p.Pstate) } func init() { state.Register((*PtraceRegs)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/linux_state_autogen.go000066400000000000000000000347471465435605700257450ustar00rootroot00000000000000// automatically generated by stateify. 
package linux import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (i *IOEvent) StateTypeName() string { return "pkg/abi/linux.IOEvent" } func (i *IOEvent) StateFields() []string { return []string{ "Data", "Obj", "Result", "Result2", } } func (i *IOEvent) beforeSave() {} // +checklocksignore func (i *IOEvent) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Data) stateSinkObject.Save(1, &i.Obj) stateSinkObject.Save(2, &i.Result) stateSinkObject.Save(3, &i.Result2) } func (i *IOEvent) afterLoad(context.Context) {} // +checklocksignore func (i *IOEvent) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Data) stateSourceObject.Load(1, &i.Obj) stateSourceObject.Load(2, &i.Result) stateSourceObject.Load(3, &i.Result2) } func (b *BPFInstruction) StateTypeName() string { return "pkg/abi/linux.BPFInstruction" } func (b *BPFInstruction) StateFields() []string { return []string{ "OpCode", "JumpIfTrue", "JumpIfFalse", "K", } } func (b *BPFInstruction) beforeSave() {} // +checklocksignore func (b *BPFInstruction) StateSave(stateSinkObject state.Sink) { b.beforeSave() stateSinkObject.Save(0, &b.OpCode) stateSinkObject.Save(1, &b.JumpIfTrue) stateSinkObject.Save(2, &b.JumpIfFalse) stateSinkObject.Save(3, &b.K) } func (b *BPFInstruction) afterLoad(context.Context) {} // +checklocksignore func (b *BPFInstruction) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &b.OpCode) stateSourceObject.Load(1, &b.JumpIfTrue) stateSourceObject.Load(2, &b.JumpIfFalse) stateSourceObject.Load(3, &b.K) } func (f *FUSEHeaderIn) StateTypeName() string { return "pkg/abi/linux.FUSEHeaderIn" } func (f *FUSEHeaderIn) StateFields() []string { return []string{ "Len", "Opcode", "Unique", "NodeID", "UID", "GID", "PID", } } func (f *FUSEHeaderIn) beforeSave() {} // +checklocksignore func (f *FUSEHeaderIn) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.Len) stateSinkObject.Save(1, &f.Opcode) stateSinkObject.Save(2, &f.Unique) stateSinkObject.Save(3, &f.NodeID) stateSinkObject.Save(4, &f.UID) stateSinkObject.Save(5, &f.GID) stateSinkObject.Save(6, &f.PID) } func (f *FUSEHeaderIn) afterLoad(context.Context) {} // +checklocksignore func (f *FUSEHeaderIn) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.Len) stateSourceObject.Load(1, &f.Opcode) stateSourceObject.Load(2, &f.Unique) stateSourceObject.Load(3, &f.NodeID) stateSourceObject.Load(4, &f.UID) stateSourceObject.Load(5, &f.GID) stateSourceObject.Load(6, &f.PID) } func (f *FUSEHeaderOut) StateTypeName() string { return "pkg/abi/linux.FUSEHeaderOut" } func (f *FUSEHeaderOut) StateFields() []string { return []string{ "Len", "Error", "Unique", } } func (f *FUSEHeaderOut) beforeSave() {} // +checklocksignore func (f *FUSEHeaderOut) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.Len) stateSinkObject.Save(1, &f.Error) stateSinkObject.Save(2, &f.Unique) } func (f *FUSEHeaderOut) afterLoad(context.Context) {} // +checklocksignore func (f *FUSEHeaderOut) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.Len) stateSourceObject.Load(1, &f.Error) stateSourceObject.Load(2, &f.Unique) } func (i *IOUringCqe) StateTypeName() string { return "pkg/abi/linux.IOUringCqe" } func (i *IOUringCqe) StateFields() []string { return []string{ "UserData", "Res", "Flags", } } func (i *IOUringCqe) beforeSave() {} // +checklocksignore func 
(i *IOUringCqe) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.UserData) stateSinkObject.Save(1, &i.Res) stateSinkObject.Save(2, &i.Flags) } func (i *IOUringCqe) afterLoad(context.Context) {} // +checklocksignore func (i *IOUringCqe) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.UserData) stateSourceObject.Load(1, &i.Res) stateSourceObject.Load(2, &i.Flags) } func (i *IOUring) StateTypeName() string { return "pkg/abi/linux.IOUring" } func (i *IOUring) StateFields() []string { return []string{ "Head", "Tail", } } func (i *IOUring) beforeSave() {} // +checklocksignore func (i *IOUring) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Head) stateSinkObject.Save(1, &i.Tail) } func (i *IOUring) afterLoad(context.Context) {} // +checklocksignore func (i *IOUring) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Head) stateSourceObject.Load(1, &i.Tail) } func (i *IORings) StateTypeName() string { return "pkg/abi/linux.IORings" } func (i *IORings) StateFields() []string { return []string{ "Sq", "Cq", "SqRingMask", "CqRingMask", "SqRingEntries", "CqRingEntries", "sqDropped", "sqFlags", "cqFlags", "CqOverflow", } } func (i *IORings) beforeSave() {} // +checklocksignore func (i *IORings) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Sq) stateSinkObject.Save(1, &i.Cq) stateSinkObject.Save(2, &i.SqRingMask) stateSinkObject.Save(3, &i.CqRingMask) stateSinkObject.Save(4, &i.SqRingEntries) stateSinkObject.Save(5, &i.CqRingEntries) stateSinkObject.Save(6, &i.sqDropped) stateSinkObject.Save(7, &i.sqFlags) stateSinkObject.Save(8, &i.cqFlags) stateSinkObject.Save(9, &i.CqOverflow) } func (i *IORings) afterLoad(context.Context) {} // +checklocksignore func (i *IORings) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Sq) stateSourceObject.Load(1, &i.Cq) stateSourceObject.Load(2, &i.SqRingMask) stateSourceObject.Load(3, &i.CqRingMask) stateSourceObject.Load(4, &i.SqRingEntries) stateSourceObject.Load(5, &i.CqRingEntries) stateSourceObject.Load(6, &i.sqDropped) stateSourceObject.Load(7, &i.sqFlags) stateSourceObject.Load(8, &i.cqFlags) stateSourceObject.Load(9, &i.CqOverflow) } func (i *IOUringSqe) StateTypeName() string { return "pkg/abi/linux.IOUringSqe" } func (i *IOUringSqe) StateFields() []string { return []string{ "Opcode", "Flags", "IoPrio", "Fd", "OffOrAddrOrCmdOp", "AddrOrSpliceOff", "Len", "specialFlags", "UserData", "BufIndexOrGroup", "personality", "spliceFDOrFileIndex", "addr3", } } func (i *IOUringSqe) beforeSave() {} // +checklocksignore func (i *IOUringSqe) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Opcode) stateSinkObject.Save(1, &i.Flags) stateSinkObject.Save(2, &i.IoPrio) stateSinkObject.Save(3, &i.Fd) stateSinkObject.Save(4, &i.OffOrAddrOrCmdOp) stateSinkObject.Save(5, &i.AddrOrSpliceOff) stateSinkObject.Save(6, &i.Len) stateSinkObject.Save(7, &i.specialFlags) stateSinkObject.Save(8, &i.UserData) stateSinkObject.Save(9, &i.BufIndexOrGroup) stateSinkObject.Save(10, &i.personality) stateSinkObject.Save(11, &i.spliceFDOrFileIndex) stateSinkObject.Save(12, &i.addr3) } func (i *IOUringSqe) afterLoad(context.Context) {} // +checklocksignore func (i *IOUringSqe) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Opcode) stateSourceObject.Load(1, &i.Flags) stateSourceObject.Load(2, 
&i.IoPrio) stateSourceObject.Load(3, &i.Fd) stateSourceObject.Load(4, &i.OffOrAddrOrCmdOp) stateSourceObject.Load(5, &i.AddrOrSpliceOff) stateSourceObject.Load(6, &i.Len) stateSourceObject.Load(7, &i.specialFlags) stateSourceObject.Load(8, &i.UserData) stateSourceObject.Load(9, &i.BufIndexOrGroup) stateSourceObject.Load(10, &i.personality) stateSourceObject.Load(11, &i.spliceFDOrFileIndex) stateSourceObject.Load(12, &i.addr3) } func (s *SigAction) StateTypeName() string { return "pkg/abi/linux.SigAction" } func (s *SigAction) StateFields() []string { return []string{ "Handler", "Flags", "Restorer", "Mask", } } func (s *SigAction) beforeSave() {} // +checklocksignore func (s *SigAction) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.Handler) stateSinkObject.Save(1, &s.Flags) stateSinkObject.Save(2, &s.Restorer) stateSinkObject.Save(3, &s.Mask) } func (s *SigAction) afterLoad(context.Context) {} // +checklocksignore func (s *SigAction) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.Handler) stateSourceObject.Load(1, &s.Flags) stateSourceObject.Load(2, &s.Restorer) stateSourceObject.Load(3, &s.Mask) } func (s *SignalStack) StateTypeName() string { return "pkg/abi/linux.SignalStack" } func (s *SignalStack) StateFields() []string { return []string{ "Addr", "Flags", "Size", } } func (s *SignalStack) beforeSave() {} // +checklocksignore func (s *SignalStack) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.Addr) stateSinkObject.Save(1, &s.Flags) stateSinkObject.Save(2, &s.Size) } func (s *SignalStack) afterLoad(context.Context) {} // +checklocksignore func (s *SignalStack) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.Addr) stateSourceObject.Load(1, &s.Flags) stateSourceObject.Load(2, &s.Size) } func (s *SignalInfo) StateTypeName() string { return "pkg/abi/linux.SignalInfo" } func (s *SignalInfo) StateFields() []string { return []string{ "Signo", "Errno", "Code", "Fields", } } func (s *SignalInfo) beforeSave() {} // +checklocksignore func (s *SignalInfo) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.Signo) stateSinkObject.Save(1, &s.Errno) stateSinkObject.Save(2, &s.Code) stateSinkObject.Save(3, &s.Fields) } func (s *SignalInfo) afterLoad(context.Context) {} // +checklocksignore func (s *SignalInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.Signo) stateSourceObject.Load(1, &s.Errno) stateSourceObject.Load(2, &s.Code) stateSourceObject.Load(3, &s.Fields) } func (c *ControlMessageIPPacketInfo) StateTypeName() string { return "pkg/abi/linux.ControlMessageIPPacketInfo" } func (c *ControlMessageIPPacketInfo) StateFields() []string { return []string{ "NIC", "LocalAddr", "DestinationAddr", } } func (c *ControlMessageIPPacketInfo) beforeSave() {} // +checklocksignore func (c *ControlMessageIPPacketInfo) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.NIC) stateSinkObject.Save(1, &c.LocalAddr) stateSinkObject.Save(2, &c.DestinationAddr) } func (c *ControlMessageIPPacketInfo) afterLoad(context.Context) {} // +checklocksignore func (c *ControlMessageIPPacketInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.NIC) stateSourceObject.Load(1, &c.LocalAddr) stateSourceObject.Load(2, &c.DestinationAddr) } func (c *ControlMessageIPv6PacketInfo) StateTypeName() string { return 
"pkg/abi/linux.ControlMessageIPv6PacketInfo" } func (c *ControlMessageIPv6PacketInfo) StateFields() []string { return []string{ "Addr", "NIC", } } func (c *ControlMessageIPv6PacketInfo) beforeSave() {} // +checklocksignore func (c *ControlMessageIPv6PacketInfo) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.Addr) stateSinkObject.Save(1, &c.NIC) } func (c *ControlMessageIPv6PacketInfo) afterLoad(context.Context) {} // +checklocksignore func (c *ControlMessageIPv6PacketInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.Addr) stateSourceObject.Load(1, &c.NIC) } func (i *ICMP6Filter) StateTypeName() string { return "pkg/abi/linux.ICMP6Filter" } func (i *ICMP6Filter) StateFields() []string { return []string{ "Filter", } } func (i *ICMP6Filter) beforeSave() {} // +checklocksignore func (i *ICMP6Filter) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Filter) } func (i *ICMP6Filter) afterLoad(context.Context) {} // +checklocksignore func (i *ICMP6Filter) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Filter) } func (t *KernelTermios) StateTypeName() string { return "pkg/abi/linux.KernelTermios" } func (t *KernelTermios) StateFields() []string { return []string{ "InputFlags", "OutputFlags", "ControlFlags", "LocalFlags", "LineDiscipline", "ControlCharacters", "InputSpeed", "OutputSpeed", } } func (t *KernelTermios) beforeSave() {} // +checklocksignore func (t *KernelTermios) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.InputFlags) stateSinkObject.Save(1, &t.OutputFlags) stateSinkObject.Save(2, &t.ControlFlags) stateSinkObject.Save(3, &t.LocalFlags) stateSinkObject.Save(4, &t.LineDiscipline) stateSinkObject.Save(5, &t.ControlCharacters) stateSinkObject.Save(6, &t.InputSpeed) stateSinkObject.Save(7, &t.OutputSpeed) } func (t *KernelTermios) afterLoad(context.Context) {} // +checklocksignore func (t *KernelTermios) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.InputFlags) stateSourceObject.Load(1, &t.OutputFlags) stateSourceObject.Load(2, &t.ControlFlags) stateSourceObject.Load(3, &t.LocalFlags) stateSourceObject.Load(4, &t.LineDiscipline) stateSourceObject.Load(5, &t.ControlCharacters) stateSourceObject.Load(6, &t.InputSpeed) stateSourceObject.Load(7, &t.OutputSpeed) } func (w *WindowSize) StateTypeName() string { return "pkg/abi/linux.WindowSize" } func (w *WindowSize) StateFields() []string { return []string{ "Rows", "Cols", } } func (w *WindowSize) beforeSave() {} // +checklocksignore func (w *WindowSize) StateSave(stateSinkObject state.Sink) { w.beforeSave() stateSinkObject.Save(0, &w.Rows) stateSinkObject.Save(1, &w.Cols) } func (w *WindowSize) afterLoad(context.Context) {} // +checklocksignore func (w *WindowSize) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &w.Rows) stateSourceObject.Load(1, &w.Cols) } func init() { state.Register((*IOEvent)(nil)) state.Register((*BPFInstruction)(nil)) state.Register((*FUSEHeaderIn)(nil)) state.Register((*FUSEHeaderOut)(nil)) state.Register((*IOUringCqe)(nil)) state.Register((*IOUring)(nil)) state.Register((*IORings)(nil)) state.Register((*IOUringSqe)(nil)) state.Register((*SigAction)(nil)) state.Register((*SignalStack)(nil)) state.Register((*SignalInfo)(nil)) state.Register((*ControlMessageIPPacketInfo)(nil)) state.Register((*ControlMessageIPv6PacketInfo)(nil)) 
state.Register((*ICMP6Filter)(nil)) state.Register((*KernelTermios)(nil)) state.Register((*WindowSize)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/linux_unsafe_abi_autogen_unsafe.go000066400000000000000000000001451465435605700302430ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package linux import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/linux_unsafe_state_autogen.go000066400000000000000000000000671465435605700272720ustar00rootroot00000000000000// automatically generated by stateify. package linux golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/membarrier.go000066400000000000000000000026251465435605700237770ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // membarrier(2) commands, from include/uapi/linux/membarrier.h. const ( MEMBARRIER_CMD_QUERY = 0 MEMBARRIER_CMD_GLOBAL = (1 << 0) MEMBARRIER_CMD_GLOBAL_EXPEDITED = (1 << 1) MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED = (1 << 2) MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3) MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4) MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5) MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6) MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = (1 << 7) MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = (1 << 8) ) // membarrier(2) flags, from include/uapi/linux/membarrier.h. const ( MEMBARRIER_CMD_FLAG_CPU = (1 << 0) ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/mm.go000066400000000000000000000070731465435605700222650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" "golang.org/x/sys/unix" ) // Protections for mmap(2). const ( PROT_NONE = 0 PROT_READ = 1 << 0 PROT_WRITE = 1 << 1 PROT_EXEC = 1 << 2 PROT_SEM = 1 << 3 PROT_GROWSDOWN = 1 << 24 PROT_GROWSUP = 1 << 25 ) // Flags for mmap(2). const ( MAP_SHARED = 1 << 0 MAP_PRIVATE = 1 << 1 MAP_FIXED = 1 << 4 MAP_ANONYMOUS = 1 << 5 MAP_32BIT = 1 << 6 // arch/x86/include/uapi/asm/mman.h MAP_GROWSDOWN = 1 << 8 MAP_DENYWRITE = 1 << 11 MAP_EXECUTABLE = 1 << 12 MAP_LOCKED = 1 << 13 MAP_NORESERVE = 1 << 14 MAP_POPULATE = 1 << 15 MAP_NONBLOCK = 1 << 16 MAP_STACK = 1 << 17 MAP_HUGETLB = 1 << 18 ) // Flags for mremap(2). const ( MREMAP_MAYMOVE = 1 << 0 MREMAP_FIXED = 1 << 1 ) // Flags for mlock2(2). const ( MLOCK_ONFAULT = 0x01 ) // Flags for mlockall(2). 
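// A minimal illustrative sketch (not part of the upstream file; the helper
// name is hypothetical): mlockall(2) takes an OR of the MCL_* bits defined
// just below, e.g. locking current and future mappings while letting pages
// fault in lazily.
func exampleMlockallFlags() int32 {
	return MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT
}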
const ( MCL_CURRENT = 1 MCL_FUTURE = 2 MCL_ONFAULT = 4 ) // Advice for madvise(2). const ( MADV_NORMAL = 0 MADV_RANDOM = 1 MADV_SEQUENTIAL = 2 MADV_WILLNEED = 3 MADV_DONTNEED = 4 MADV_REMOVE = 9 MADV_DONTFORK = 10 MADV_DOFORK = 11 MADV_MERGEABLE = 12 MADV_UNMERGEABLE = 13 MADV_HUGEPAGE = 14 MADV_NOHUGEPAGE = 15 MADV_DONTDUMP = 16 MADV_DODUMP = 17 MADV_HWPOISON = 100 MADV_SOFT_OFFLINE = 101 MADV_NOMAJFAULT = 200 MADV_DONTCHGME = 201 ) // Flags for msync(2). const ( MS_ASYNC = 1 << 0 MS_INVALIDATE = 1 << 1 MS_SYNC = 1 << 2 ) // NumaPolicy is the NUMA memory policy for a memory range. See numa(7). // // +marshal type NumaPolicy int32 // Policies for get_mempolicy(2)/set_mempolicy(2). const ( MPOL_DEFAULT NumaPolicy = 0 MPOL_PREFERRED NumaPolicy = 1 MPOL_BIND NumaPolicy = 2 MPOL_INTERLEAVE NumaPolicy = 3 MPOL_LOCAL NumaPolicy = 4 MPOL_MAX NumaPolicy = 5 ) // Flags for get_mempolicy(2). const ( MPOL_F_NODE = 1 << 0 MPOL_F_ADDR = 1 << 1 MPOL_F_MEMS_ALLOWED = 1 << 2 ) // Flags for set_mempolicy(2). const ( MPOL_F_RELATIVE_NODES = 1 << 14 MPOL_F_STATIC_NODES = 1 << 15 MPOL_MODE_FLAGS = (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES) ) // Flags for mbind(2). const ( MPOL_MF_STRICT = 1 << 0 MPOL_MF_MOVE = 1 << 1 MPOL_MF_MOVE_ALL = 1 << 2 MPOL_MF_VALID = MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL ) // TaskSize is the address space size. var TaskSize = func() uintptr { pageSize := uintptr(unix.Getpagesize()) for _, s := range feasibleTaskSizes { // mmap returns ENOMEM if addr is greater than TASK_SIZE, // otherwise it returns EINVAL, because addr isn't aligned to // the page size. _, _, errno := unix.RawSyscall6( unix.SYS_MMAP, s-pageSize-1, 512, uintptr(unix.PROT_NONE), uintptr(unix.MAP_ANONYMOUS|unix.MAP_PRIVATE|unix.MAP_FIXED), 0, 0) if errno == unix.EINVAL { return s } if errno != unix.ENOMEM { panic(fmt.Sprintf("mmap returned unexpected error: %d", errno)) } } panic("None of the address space sizes could be successfully mmaped") }() golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/mm_amd64.go000066400000000000000000000015271465435605700232560ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package linux // TASK_SIZE can be one of two values, corresponding to 4-level and 5-level // paging. // // The array has to be sorted in decreasing order. var feasibleTaskSizes = []uintptr{0xfffffffffff000, 0x7ffffffff000} golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/mm_arm64.go000066400000000000000000000016251465435605700232730ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package linux // Only 4K page size is supported on arm64. In this case, TASK_SIZE can // be one of three values, corresponding to 3-level, 4-level and // 5-level paging. // // The array has to be sorted in decreasing order. var feasibleTaskSizes = []uintptr{1 << 52, 1 << 48, 1 << 39} golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/mqueue.go000066400000000000000000000031541465435605700231510ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Default values for POSIX message queues. Source: // include/linux/ipc_namespace.h const ( DFLT_QUEUESMAX = 256 MIN_MSGMAX = 1 DFLT_MSG uint = 10 DFLT_MSGMAX = 10 HARD_MSGMAX = 65536 MIN_MSGSIZEMAX = 128 DFLT_MSGSIZE uint = 8192 DFLT_MSGSIZEMAX = 8192 HARD_MSGSIZEMAX = (16 * 1024 * 1024) ) // Maximum values for a message queue. Source: include/uapi/linux/mqueue.h const ( MQ_PRIO_MAX = 32768 MQ_BYTES_MAX = 819200 ) // Codes used by mq_notify. Source: include/uapi/linux/mqueue.h const ( NOTIFY_NONE = 0 NOTIFY_WOKENUP = 1 NOTIFY_REMOVED = 2 NOTIFY_COOKIE_LEN = 32 ) // MqAttr is equivalent to struct mq_attr. Source: include/uapi/linux/mqueue.h // // +marshal type MqAttr struct { MqFlags int64 // Message queue flags. MqMaxmsg int64 // Maximum number of messages. MqMsgsize int64 // Maximum message size. MqCurmsgs int64 // Number of messages currently queued. _ [4]int64 // Ignored for input, zeroed for output. } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/msgqueue.go000066400000000000000000000060301465435605700234770ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/marshal/primitive" ) // Linux-specific control commands. Source: include/uapi/linux/msg.h const ( MSG_STAT = 11 MSG_INFO = 12 MSG_STAT_ANY = 13 ) // msgrcv(2) options. Source: include/uapi/linux/msg.h const ( MSG_NOERROR = 010000 // No error if message is too big. MSG_EXCEPT = 020000 // Receive any message except of specified type. 
MSG_COPY = 040000 // Copy (not remove) all queue messages. ) // System-wide limits for message queues. Source: include/uapi/linux/msg.h const ( MSGMNI = 32000 // Maximum number of message queue identifiers. MSGMAX = 8192 // Maximum size of message (bytes). MSGMNB = 16384 // Default max size of a message queue. ) // System-wide limits. Unused. Source: include/uapi/linux/msg.h const ( MSGPOOL = (MSGMNI * MSGMNB / 1024) MSGTQL = MSGMNB MSGMAP = MSGMNB MSGSSZ = 16 // MSGSEG is simplified due to the inexistance of a ternary operator. MSGSEG = 0xffff ) // MsqidDS is equivalent to struct msqid64_ds. Source: // include/uapi/asm-generic/shmbuf.h // // +marshal type MsqidDS struct { MsgPerm IPCPerm // IPC permissions. MsgStime TimeT // Last msgsnd time. MsgRtime TimeT // Last msgrcv time. MsgCtime TimeT // Last change time. MsgCbytes uint64 // Current number of bytes on the queue. MsgQnum uint64 // Number of messages in the queue. MsgQbytes uint64 // Max number of bytes in the queue. MsgLspid int32 // PID of last msgsnd. MsgLrpid int32 // PID of last msgrcv. unused4 uint64 unused5 uint64 } // MsgBuf is equivalent to struct msgbuf. Source: include/uapi/linux/msg.h // // +marshal dynamic type MsgBuf struct { Type primitive.Int64 Text primitive.ByteSlice } // SizeBytes implements marshal.Marshallable.SizeBytes. func (b *MsgBuf) SizeBytes() int { return b.Type.SizeBytes() + b.Text.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (b *MsgBuf) MarshalBytes(dst []byte) []byte { dst = b.Type.MarshalUnsafe(dst) return b.Text.MarshalBytes(dst) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (b *MsgBuf) UnmarshalBytes(src []byte) []byte { src = b.Type.UnmarshalUnsafe(src) return b.Text.UnmarshalBytes(src) } // MsgInfo is equivalent to struct msginfo. Source: include/uapi/linux/msg.h // // +marshal type MsgInfo struct { MsgPool int32 MsgMap int32 MsgMax int32 MsgMnb int32 MsgMni int32 MsgSsz int32 MsgTql int32 MsgSeg uint16 `marshal:"unaligned"` } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/netdevice.go000066400000000000000000000060601465435605700236150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux const ( // IFNAMSIZ is the size of the name field for IFReq. IFNAMSIZ = 16 ) // IFReq is an interface request. // // +marshal type IFReq struct { // IFName is an encoded name, normally null-terminated. This should be // accessed via the Name and SetName functions. IFName [IFNAMSIZ]byte // Data is the union of the following structures: // // struct sockaddr ifr_addr; // struct sockaddr ifr_dstaddr; // struct sockaddr ifr_broadaddr; // struct sockaddr ifr_netmask; // struct sockaddr ifr_hwaddr; // short ifr_flags; // int ifr_ifindex; // int ifr_metric; // int ifr_mtu; // struct ifmap ifr_map; // char ifr_slave[IFNAMSIZ]; // char ifr_newname[IFNAMSIZ]; // char *ifr_data; Data [24]byte } // Name returns the name. 
func (ifr *IFReq) Name() string { for c := 0; c < len(ifr.IFName); c++ { if ifr.IFName[c] == 0 { return string(ifr.IFName[:c]) } } return string(ifr.IFName[:]) } // SetName sets the name. func (ifr *IFReq) SetName(name string) { n := copy(ifr.IFName[:], []byte(name)) clear(ifr.IFName[n:]) } // SizeOfIFReq is the binary size of an IFReq struct (40 bytes). var SizeOfIFReq = (*IFReq)(nil).SizeBytes() // IFMap contains interface hardware parameters. type IFMap struct { MemStart uint64 MemEnd uint64 BaseAddr int16 IRQ byte DMA byte Port byte _ [3]byte // Pad to sizeof(struct ifmap). } // IFConf is used to return a list of interfaces and their addresses. See // netdevice(7) and struct ifconf for more detail on its use. // // +marshal type IFConf struct { Len int32 _ [4]byte // Pad to sizeof(struct ifconf). Ptr uint64 } // EthtoolCmd is a marshallable type to be able to easily copyin the // the command for an SIOCETHTOOL ioctl. // // +marshal type EthtoolCmd uint32 const ( // ETHTOOL_GFEATURES is the command to SIOCETHTOOL to query device // features. // See: ETHTOOL_GFEATURES EthtoolCmd = 0x3a ) // EthtoolGFeatures is used to return a list of device features. // See: // // +marshal type EthtoolGFeatures struct { Cmd uint32 Size uint32 } // EthtoolGetFeaturesBlock is used to return state of upto 32 device // features. // See: // // +marshal type EthtoolGetFeaturesBlock struct { Available uint32 Requested uint32 Active uint32 NeverChanged uint32 } const ( // LOOPBACK_IFINDEX is defined in include/net/flow.h. LOOPBACK_IFINDEX = 1 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/netfilter.go000066400000000000000000000475071465435605700236560ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" ) // This file contains structures required to support netfilter, specifically // the iptables tool. // Hooks into the network stack. These correspond to values in // include/uapi/linux/netfilter.h. const ( NF_INET_PRE_ROUTING = 0 NF_INET_LOCAL_IN = 1 NF_INET_FORWARD = 2 NF_INET_LOCAL_OUT = 3 NF_INET_POST_ROUTING = 4 NF_INET_NUMHOOKS = 5 ) // Verdicts that can be returned by targets. These correspond to values in // include/uapi/linux/netfilter.h const ( NF_DROP = 0 NF_ACCEPT = 1 NF_STOLEN = 2 NF_QUEUE = 3 NF_REPEAT = 4 NF_STOP = 5 NF_MAX_VERDICT = NF_STOP // NF_RETURN is defined in include/uapi/linux/netfilter/x_tables.h. NF_RETURN = -NF_REPEAT - 1 ) // VerdictStrings maps int verdicts to the strings they represent. It is used // for debugging. var VerdictStrings = map[int32]string{ -NF_DROP - 1: "DROP", -NF_ACCEPT - 1: "ACCEPT", -NF_QUEUE - 1: "QUEUE", NF_RETURN: "RETURN", } // Socket options for SOL_SOCKET. These correspond to values in // include/uapi/linux/netfilter_ipv4/ip_tables.h. 
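// A minimal illustrative sketch (not part of the upstream file; the helper
// name is hypothetical): the iptables tool passes these optnames to
// getsockopt(2)/setsockopt(2), reading table metadata with IPT_SO_GET_INFO
// (an IPTGetinfo buffer) and the rules themselves with IPT_SO_GET_ENTRIES
// (an IPTGetEntries buffer).
func exampleTableReadOptname(wantEntries bool) int {
	if wantEntries {
		return IPT_SO_GET_ENTRIES
	}
	return IPT_SO_GET_INFO
}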
const ( IPT_BASE_CTL = 64 IPT_SO_SET_REPLACE = IPT_BASE_CTL IPT_SO_SET_ADD_COUNTERS = IPT_BASE_CTL + 1 IPT_SO_SET_MAX = IPT_SO_SET_ADD_COUNTERS IPT_SO_GET_INFO = IPT_BASE_CTL IPT_SO_GET_ENTRIES = IPT_BASE_CTL + 1 IPT_SO_GET_REVISION_MATCH = IPT_BASE_CTL + 2 IPT_SO_GET_REVISION_TARGET = IPT_BASE_CTL + 3 IPT_SO_GET_MAX = IPT_SO_GET_REVISION_TARGET ) // Socket option for SOL_IP. This corresponds to the value in // include/uapi/linux/netfilter_ipv4.h. const ( SO_ORIGINAL_DST = 80 ) // Name lengths. These correspond to values in // include/uapi/linux/netfilter/x_tables.h. const ( XT_FUNCTION_MAXNAMELEN = 30 XT_EXTENSION_MAXNAMELEN = 29 XT_TABLE_MAXNAMELEN = 32 ) // IPTEntry is an iptable rule. It corresponds to struct ipt_entry in // include/uapi/linux/netfilter_ipv4/ip_tables.h. // // +marshal type IPTEntry struct { // IP is used to filter packets based on the IP header. IP IPTIP // NFCache relates to kernel-internal caching and isn't used by // userspace. NFCache uint32 // TargetOffset is the byte offset from the beginning of this IPTEntry // to the start of the entry's target. TargetOffset uint16 // NextOffset is the byte offset from the beginning of this IPTEntry to // the start of the next entry. It is thus also the size of the entry. NextOffset uint16 // Comeback is a return pointer. It is not used by userspace. Comeback uint32 // Counters holds the packet and byte counts for this rule. Counters XTCounters // Elems holds the data for all this rule's matches followed by the // target. It is variable length -- users have to iterate over any // matches and use TargetOffset and NextOffset to make sense of the // data. // // Elems is omitted here because it would cause IPTEntry to be an extra // byte larger (see http://www.catb.org/esr/structure-packing/). // // Elems [0]byte } // SizeOfIPTEntry is the size of an IPTEntry. const SizeOfIPTEntry = 112 // KernelIPTEntry is identical to IPTEntry, but includes the Elems field. // // +marshal dynamic type KernelIPTEntry struct { Entry IPTEntry // Elems holds the data for all this rule's matches followed by the // target. It is variable length -- users have to iterate over any // matches and use TargetOffset and NextOffset to make sense of the // data. Elems primitive.ByteSlice } // SizeBytes implements marshal.Marshallable.SizeBytes. func (ke *KernelIPTEntry) SizeBytes() int { return ke.Entry.SizeBytes() + ke.Elems.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (ke *KernelIPTEntry) MarshalBytes(dst []byte) []byte { dst = ke.Entry.MarshalUnsafe(dst) return ke.Elems.MarshalBytes(dst) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (ke *KernelIPTEntry) UnmarshalBytes(src []byte) []byte { src = ke.Entry.UnmarshalUnsafe(src) return ke.Elems.UnmarshalBytes(src) } var _ marshal.Marshallable = (*KernelIPTEntry)(nil) // IPTIP contains information for matching a packet's IP header. // It corresponds to struct ipt_ip in // include/uapi/linux/netfilter_ipv4/ip_tables.h. // // +marshal type IPTIP struct { // Src is the source IP address. Src InetAddr // Dst is the destination IP address. Dst InetAddr // SrcMask is the source IP mask. SrcMask InetAddr // DstMask is the destination IP mask. DstMask InetAddr // InputInterface is the input network interface. InputInterface [IFNAMSIZ]byte // OutputInterface is the output network interface. OutputInterface [IFNAMSIZ]byte // InputInterfaceMask is the input interface mask. InputInterfaceMask [IFNAMSIZ]byte // OuputInterfaceMask is the output interface mask. 
OutputInterfaceMask [IFNAMSIZ]byte // Protocol is the transport protocol. Protocol uint16 // Flags define matching behavior for the IP header. Flags uint8 // InverseFlags invert the meaning of fields in struct IPTIP. See the // IPT_INV_* flags. InverseFlags uint8 } // Flags in IPTIP.InverseFlags. Corresponding constants are in // include/uapi/linux/netfilter_ipv4/ip_tables.h. const ( // Invert the meaning of InputInterface. IPT_INV_VIA_IN = 0x01 // Invert the meaning of OutputInterface. IPT_INV_VIA_OUT = 0x02 // Unclear what this is, as no references to it exist in the kernel. IPT_INV_TOS = 0x04 // Invert the meaning of Src. IPT_INV_SRCIP = 0x08 // Invert the meaning of Dst. IPT_INV_DSTIP = 0x10 // Invert the meaning of the IPT_F_FRAG flag. IPT_INV_FRAG = 0x20 // Invert the meaning of the Protocol field. IPT_INV_PROTO = 0x40 // Enable all flags. IPT_INV_MASK = 0x7F ) // SizeOfIPTIP is the size of an IPTIP. const SizeOfIPTIP = 84 // XTCounters holds packet and byte counts for a rule. It corresponds to struct // xt_counters in include/uapi/linux/netfilter/x_tables.h. // // +marshal type XTCounters struct { // Pcnt is the packet count. Pcnt uint64 // Bcnt is the byte count. Bcnt uint64 } // SizeOfXTCounters is the size of an XTCounters. const SizeOfXTCounters = 16 // XTEntryMatch holds a match for a rule. For example, a user using the // addrtype iptables match extension would put the data for that match into an // XTEntryMatch. iptables-extensions(8) has a list of possible matches. // // XTEntryMatch corresponds to struct xt_entry_match in // include/uapi/linux/netfilter/x_tables.h. That struct contains a union // exposing different data to the user and kernel, but this struct holds only // the user data. // // +marshal type XTEntryMatch struct { MatchSize uint16 Name ExtensionName Revision uint8 // Data is omitted here because it would cause XTEntryMatch to be an // extra byte larger (see http://www.catb.org/esr/structure-packing/). // Data [0]byte } // SizeOfXTEntryMatch is the size of an XTEntryMatch. const SizeOfXTEntryMatch = 32 // KernelXTEntryMatch is identical to XTEntryMatch, but contains // variable-length Data field. type KernelXTEntryMatch struct { XTEntryMatch Data []byte } // XTGetRevision corresponds to xt_get_revision in // include/uapi/linux/netfilter/x_tables.h // // +marshal type XTGetRevision struct { Name ExtensionName Revision uint8 } // SizeOfXTGetRevision is the size of an XTGetRevision. const SizeOfXTGetRevision = 30 // XTEntryTarget holds a target for a rule. For example, it can specify that // packets matching the rule should DROP, ACCEPT, or use an extension target. // iptables-extension(8) has a list of possible targets. // // XTEntryTarget corresponds to struct xt_entry_target in // include/uapi/linux/netfilter/x_tables.h. That struct contains a union // exposing different data to the user and kernel, but this struct holds only // the user data. // // +marshal type XTEntryTarget struct { TargetSize uint16 Name ExtensionName Revision uint8 // Data is omitted here because it would cause XTEntryTarget to be an // extra byte larger (see http://www.catb.org/esr/structure-packing/). // Data [0]byte } // SizeOfXTEntryTarget is the size of an XTEntryTarget. const SizeOfXTEntryTarget = 32 // KernelXTEntryTarget is identical to XTEntryTarget, but contains a // variable-length Data field. type KernelXTEntryTarget struct { XTEntryTarget Data []byte } // XTStandardTarget is a built-in target, one of ACCEPT, DROP, JUMP, QUEUE, // RETURN, or jump. 
It corresponds to struct xt_standard_target in // include/uapi/linux/netfilter/x_tables.h. // // +marshal type XTStandardTarget struct { Target XTEntryTarget // A positive verdict indicates a jump, and is the offset from the // start of the table to jump to. A negative value means one of the // other built-in targets. Verdict int32 _ [4]byte } // SizeOfXTStandardTarget is the size of an XTStandardTarget. const SizeOfXTStandardTarget = 40 // XTErrorTarget triggers an error when reached. It is also used to mark the // beginning of user-defined chains by putting the name of the chain in // ErrorName. It corresponds to struct xt_error_target in // include/uapi/linux/netfilter/x_tables.h. // // +marshal type XTErrorTarget struct { Target XTEntryTarget Name ErrorName _ [2]byte } // SizeOfXTErrorTarget is the size of an XTErrorTarget. const SizeOfXTErrorTarget = 64 // Flag values for NfNATIPV4Range. The values indicate whether to map // protocol specific part(ports) or IPs. It corresponds to values in // include/uapi/linux/netfilter/nf_nat.h. const ( NF_NAT_RANGE_MAP_IPS = 1 << 0 NF_NAT_RANGE_PROTO_SPECIFIED = 1 << 1 NF_NAT_RANGE_PROTO_RANDOM = 1 << 2 NF_NAT_RANGE_PERSISTENT = 1 << 3 NF_NAT_RANGE_PROTO_RANDOM_FULLY = 1 << 4 NF_NAT_RANGE_PROTO_RANDOM_ALL = (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) NF_NAT_RANGE_MASK = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | NF_NAT_RANGE_PROTO_RANDOM_FULLY) ) // NfNATIPV4Range corresponds to struct nf_nat_ipv4_range // in include/uapi/linux/netfilter/nf_nat.h. The fields are in // network byte order. // // +marshal type NfNATIPV4Range struct { Flags uint32 MinIP [4]byte MaxIP [4]byte MinPort uint16 MaxPort uint16 } // NfNATIPV4MultiRangeCompat corresponds to struct // nf_nat_ipv4_multi_range_compat in include/uapi/linux/netfilter/nf_nat.h. // // +marshal type NfNATIPV4MultiRangeCompat struct { RangeSize uint32 RangeIPV4 NfNATIPV4Range } // XTRedirectTarget triggers a redirect when reached. // Adding 4 bytes of padding to make the struct 8 byte aligned. // // +marshal type XTRedirectTarget struct { Target XTEntryTarget NfRange NfNATIPV4MultiRangeCompat _ [4]byte } // SizeOfXTRedirectTarget is the size of an XTRedirectTarget. const SizeOfXTRedirectTarget = 56 // XTNATTargetV0 triggers NAT when reached. // Adding 4 bytes of padding to make the struct 8 byte aligned. // // +marshal type XTNATTargetV0 struct { Target XTEntryTarget NfRange NfNATIPV4MultiRangeCompat _ [4]byte } // SizeOfXTNATTargetV0 is the size of an XTNATTargetV0. const SizeOfXTNATTargetV0 = 56 // XTNATTargetV1 triggers NAT when reached. // // +marshal type XTNATTargetV1 struct { Target XTEntryTarget Range NFNATRange } // SizeOfXTNATTargetV1 is the size of an XTNATTargetV1. const SizeOfXTNATTargetV1 = SizeOfXTEntryTarget + SizeOfNFNATRange // XTNATTargetV2 triggers NAT when reached. // // +marshal type XTNATTargetV2 struct { Target XTEntryTarget Range NFNATRange2 } // SizeOfXTNATTargetV2 is the size of an XTNATTargetV2. const SizeOfXTNATTargetV2 = SizeOfXTEntryTarget + SizeOfNFNATRange2 // IPTGetinfo is the argument for the IPT_SO_GET_INFO sockopt. It corresponds // to struct ipt_getinfo in include/uapi/linux/netfilter_ipv4/ip_tables.h. // // +marshal type IPTGetinfo struct { Name TableName ValidHooks uint32 HookEntry [NF_INET_NUMHOOKS]uint32 Underflow [NF_INET_NUMHOOKS]uint32 NumEntries uint32 Size uint32 } // SizeOfIPTGetinfo is the size of an IPTGetinfo. 
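// A minimal illustrative sketch (not part of the upstream file; the helper
// name is hypothetical): ValidHooks in IPTGetinfo is a bitmask with one bit
// per NF_INET_* hook, so a table that attaches to PREROUTING can be detected
// like this.
func exampleHooksPrerouting(info IPTGetinfo) bool {
	return info.ValidHooks&(1<<NF_INET_PRE_ROUTING) != 0
}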
const SizeOfIPTGetinfo = 84 // IPTGetEntries is the argument for the IPT_SO_GET_ENTRIES sockopt. It // corresponds to struct ipt_get_entries in // include/uapi/linux/netfilter_ipv4/ip_tables.h. // // +marshal type IPTGetEntries struct { Name TableName Size uint32 _ [4]byte // Entrytable is omitted here because it would cause IPTGetEntries to // be an extra byte longer (see // http://www.catb.org/esr/structure-packing/). // Entrytable [0]IPTEntry } // SizeOfIPTGetEntries is the size of an IPTGetEntries. const SizeOfIPTGetEntries = 40 // KernelIPTGetEntries is identical to IPTGetEntries, but includes the // Entrytable field. // // +marshal dynamic type KernelIPTGetEntries struct { IPTGetEntries Entrytable []KernelIPTEntry } // SizeBytes implements marshal.Marshallable.SizeBytes. func (ke *KernelIPTGetEntries) SizeBytes() int { res := ke.IPTGetEntries.SizeBytes() for _, entry := range ke.Entrytable { res += entry.SizeBytes() } return res } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (ke *KernelIPTGetEntries) MarshalBytes(dst []byte) []byte { dst = ke.IPTGetEntries.MarshalUnsafe(dst) for i := range ke.Entrytable { dst = ke.Entrytable[i].MarshalBytes(dst) } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (ke *KernelIPTGetEntries) UnmarshalBytes(src []byte) []byte { src = ke.IPTGetEntries.UnmarshalUnsafe(src) for i := range ke.Entrytable { src = ke.Entrytable[i].UnmarshalBytes(src) } return src } var _ marshal.Marshallable = (*KernelIPTGetEntries)(nil) // IPTReplace is the argument for the IPT_SO_SET_REPLACE sockopt. It // corresponds to struct ipt_replace in // include/uapi/linux/netfilter_ipv4/ip_tables.h. // // +marshal type IPTReplace struct { Name TableName ValidHooks uint32 NumEntries uint32 Size uint32 HookEntry [NF_INET_NUMHOOKS]uint32 Underflow [NF_INET_NUMHOOKS]uint32 NumCounters uint32 Counters uint64 // This is really a *XTCounters. // Entries is omitted here because it would cause IPTReplace to be an // extra byte longer (see http://www.catb.org/esr/structure-packing/). // Entries [0]IPTEntry } // SizeOfIPTReplace is the size of an IPTReplace. const SizeOfIPTReplace = 96 // ExtensionName holds the name of a netfilter extension. // // +marshal type ExtensionName [XT_EXTENSION_MAXNAMELEN]byte // String implements fmt.Stringer. func (en ExtensionName) String() string { return goString(en[:]) } // TableName holds the name of a netfilter table. // // +marshal type TableName [XT_TABLE_MAXNAMELEN]byte // String implements fmt.Stringer. func (tn TableName) String() string { return goString(tn[:]) } // ErrorName holds the name of a netfilter error. These can also hold // user-defined chains. // // +marshal type ErrorName [XT_FUNCTION_MAXNAMELEN]byte // String implements fmt.Stringer. func (en ErrorName) String() string { return goString(en[:]) } func goString(cstring []byte) string { for i, c := range cstring { if c == 0 { return string(cstring[:i]) } } return string(cstring) } // XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp // in include/uapi/linux/netfilter/xt_tcpudp.h. // // +marshal type XTTCP struct { // SourcePortStart specifies the inclusive start of the range of source // ports to which the matcher applies. SourcePortStart uint16 // SourcePortEnd specifies the inclusive end of the range of source ports // to which the matcher applies. SourcePortEnd uint16 // DestinationPortStart specifies the start of the destination port // range to which the matcher applies. 
DestinationPortStart uint16 // DestinationPortEnd specifies the end of the destination port // range to which the matcher applies. DestinationPortEnd uint16 // Option specifies that a particular TCP option must be set. Option uint8 // FlagMask masks TCP flags when comparing to the FlagCompare byte. It allows // for specification of which flags are important to the matcher. FlagMask uint8 // FlagCompare, in combination with FlagMask, is used to match only packets // that have certain flags set. FlagCompare uint8 // InverseFlags flips the meaning of certain fields. See the // TX_TCP_INV_* flags. InverseFlags uint8 } // SizeOfXTTCP is the size of an XTTCP. const SizeOfXTTCP = 12 // Flags in XTTCP.InverseFlags. Corresponding constants are in // include/uapi/linux/netfilter/xt_tcpudp.h. const ( // Invert the meaning of SourcePortStart/End. XT_TCP_INV_SRCPT = 0x01 // Invert the meaning of DestinationPortStart/End. XT_TCP_INV_DSTPT = 0x02 // Invert the meaning of FlagCompare. XT_TCP_INV_FLAGS = 0x04 // Invert the meaning of Option. XT_TCP_INV_OPTION = 0x08 // Enable all flags. XT_TCP_INV_MASK = 0x0F ) // XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp // in include/uapi/linux/netfilter/xt_tcpudp.h. // // +marshal type XTUDP struct { // SourcePortStart is the inclusive start of the range of source ports // to which the matcher applies. SourcePortStart uint16 // SourcePortEnd is the inclusive end of the range of source ports to // which the matcher applies. SourcePortEnd uint16 // DestinationPortStart is the inclusive start of the destination port // range to which the matcher applies. DestinationPortStart uint16 // DestinationPortEnd is the inclusive end of the destination port // range to which the matcher applies. DestinationPortEnd uint16 // InverseFlags flips the meaning of certain fields. See the // TX_UDP_INV_* flags. InverseFlags uint8 _ uint8 } // SizeOfXTUDP is the size of an XTUDP. const SizeOfXTUDP = 10 // Flags in XTUDP.InverseFlags. Corresponding constants are in // include/uapi/linux/netfilter/xt_tcpudp.h. const ( // Invert the meaning of SourcePortStart/End. XT_UDP_INV_SRCPT = 0x01 // Invert the meaning of DestinationPortStart/End. XT_UDP_INV_DSTPT = 0x02 // Enable all flags. XT_UDP_INV_MASK = 0x03 ) // IPTOwnerInfo holds data for matching packets with the owner v0 matcher. It // corresponds to struct ipt_owner_info in libxt_owner.c of iptables binary. // // +marshal type IPTOwnerInfo struct { // UID is user id which created the packet. UID uint32 // GID is group id which created the packet. GID uint32 // PID is process id of the process which created the packet. PID uint32 // SID is session id which created the packet. SID uint32 // Comm is the command name which created the packet. Comm [16]byte // Match is used to match UID/GID of the socket. See the // XT_OWNER_* flags below. Match uint8 // Invert flips the meaning of Match field. Invert uint8 `marshal:"unaligned"` } // SizeOfIPTOwnerInfo is the size of an IPTOwnerInfo. const SizeOfIPTOwnerInfo = 34 // XTOwnerMatchInfo holds data for matching packets with the owner v1 matcher. // It corresponds to struct xt_owner_match_info in // include/uapi/linux/netfilter/xt_owner.h // // +marshal type XTOwnerMatchInfo struct { UIDMin uint32 UIDMax uint32 GIDMin uint32 GIDMax uint32 Match uint8 Invert uint8 _ [2]byte } // SizeOfXTOwnerMatchInfo is the size of an XTOwnerMatchInfo. const SizeOfXTOwnerMatchInfo = 20 // Flags in IPTOwnerInfo.Match and XTOwnerMatchInfo.Match. 
Corresponding // constants are in include/uapi/linux/netfilter/xt_owner.h. const ( // Match the UID of the packet. XT_OWNER_UID = 1 << 0 // Match the GID of the packet. XT_OWNER_GID = 1 << 1 // Match if the socket exists for the packet. Forwarded // packets do not have an associated socket. XT_OWNER_SOCKET = 1 << 2 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/netfilter_bridge.go000066400000000000000000000017521465435605700251620ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import "math" // Netfilter Bridge Standard Hook Priorities, from // uapi/linux/netfilter_bridge.h. const ( NF_BR_PRI_FIRST = math.MinInt NF_BR_PRI_NAT_DST_BRIDGED = -300 NF_BR_PRI_FILTER_BRIDGED = -200 NF_BR_PRI_BRNF = 0 NF_BR_PRI_NAT_DST_OTHER = 100 NF_BR_PRI_FILTER_OTHER = 200 NF_BR_PRI_NAT_SRC = 300 NF_BR_PRI_LAST = math.MaxInt ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/netfilter_ipv4.go000066400000000000000000000023641465435605700246100ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import "math" // Netfilter IPv4 Standard Hook Priorities, from uapi/linux/netfilter_ipv4.h. const ( NF_IP_PRI_FIRST = math.MinInt NF_IP_PRI_RAW_BEFORE_DEFRAG = -450 NF_IP_PRI_CONNTRACK_DEFRAG = -400 NF_IP_PRI_RAW = -300 NF_IP_PRI_SELINUX_FIRST = -225 NF_IP_PRI_CONNTRACK = -200 NF_IP_PRI_MANGLE = -150 NF_IP_PRI_NAT_DST = -100 NF_IP_PRI_FILTER = 0 NF_IP_PRI_SECURITY = 50 NF_IP_PRI_NAT_SRC = 100 NF_IP_PRI_SELINUX_LAST = 225 NF_IP_PRI_CONNTRACK_HELPER = 300 NF_IP_PRI_CONNTRACK_CONFIRM = math.MaxInt NF_IP_PRI_LAST = math.MaxInt ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/netfilter_ipv6.go000066400000000000000000000220711465435605700246070ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package linux import ( "math" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" ) // This file contains structures required to support IPv6 netfilter and // ip6tables. Some constants and structs are equal to their IPv4 analogues, and // are only distinguished by context (e.g. whether used on an IPv4 of IPv6 // socket). // Netfilter IPv6 Standard Hook Priorities, from uapi/linux/netfilter_ipv6.h. const ( NF_IP6_PRI_FIRST = math.MinInt NF_IP6_PRI_RAW_BEFORE_DEFRAG = -450 NF_IP6_PRI_CONNTRACK_DEFRAG = -400 NF_IP6_PRI_RAW = -300 NF_IP6_PRI_SELINUX_FIRST = -225 NF_IP6_PRI_CONNTRACK = -200 NF_IP6_PRI_MANGLE = -150 NF_IP6_PRI_NAT_DST = -100 NF_IP6_PRI_FILTER = 0 NF_IP6_PRI_SECURITY = 50 NF_IP6_PRI_NAT_SRC = 100 NF_IP6_PRI_SELINUX_LAST = 225 NF_IP6_PRI_CONNTRACK_HELPER = 300 NF_IP6_PRI_LAST = math.MaxInt ) // Socket options for SOL_SOCLET. These correspond to values in // include/uapi/linux/netfilter_ipv6/ip6_tables.h. const ( IP6T_BASE_CTL = 64 IP6T_SO_SET_REPLACE = IPT_BASE_CTL IP6T_SO_SET_ADD_COUNTERS = IPT_BASE_CTL + 1 IP6T_SO_SET_MAX = IPT_SO_SET_ADD_COUNTERS IP6T_SO_GET_INFO = IPT_BASE_CTL IP6T_SO_GET_ENTRIES = IPT_BASE_CTL + 1 IP6T_SO_GET_REVISION_MATCH = IPT_BASE_CTL + 4 IP6T_SO_GET_REVISION_TARGET = IPT_BASE_CTL + 5 IP6T_SO_GET_MAX = IP6T_SO_GET_REVISION_TARGET ) // IP6T_ORIGINAL_DST is the ip6tables SOL_IPV6 socket option. Corresponds to // the value in include/uapi/linux/netfilter_ipv6/ip6_tables.h. const IP6T_ORIGINAL_DST = 80 // IP6TReplace is the argument for the IP6T_SO_SET_REPLACE sockopt. It // corresponds to struct ip6t_replace in // include/uapi/linux/netfilter_ipv6/ip6_tables.h. // // +marshal type IP6TReplace struct { Name TableName ValidHooks uint32 NumEntries uint32 Size uint32 HookEntry [NF_INET_NUMHOOKS]uint32 Underflow [NF_INET_NUMHOOKS]uint32 NumCounters uint32 Counters uint64 // This is really a *XTCounters. // Entries is omitted here because it would cause IP6TReplace to be an // extra byte longer (see http://www.catb.org/esr/structure-packing/). // Entries [0]IP6TEntry } // SizeOfIP6TReplace is the size of an IP6TReplace. const SizeOfIP6TReplace = 96 // KernelIP6TGetEntries is identical to IP6TGetEntries, but includes the // Entrytable field. // // +marshal dynamic type KernelIP6TGetEntries struct { IPTGetEntries Entrytable []KernelIP6TEntry } // SizeBytes implements marshal.Marshallable.SizeBytes. func (ke *KernelIP6TGetEntries) SizeBytes() int { res := ke.IPTGetEntries.SizeBytes() for _, entry := range ke.Entrytable { res += entry.SizeBytes() } return res } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (ke *KernelIP6TGetEntries) MarshalBytes(dst []byte) []byte { dst = ke.IPTGetEntries.MarshalUnsafe(dst) for i := range ke.Entrytable { dst = ke.Entrytable[i].MarshalBytes(dst) } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (ke *KernelIP6TGetEntries) UnmarshalBytes(src []byte) []byte { src = ke.IPTGetEntries.UnmarshalUnsafe(src) for i := range ke.Entrytable { src = ke.Entrytable[i].UnmarshalBytes(src) } return src } var _ marshal.Marshallable = (*KernelIP6TGetEntries)(nil) // IP6TEntry is an iptables rule. It corresponds to struct ip6t_entry in // include/uapi/linux/netfilter_ipv6/ip6_tables.h. // // +marshal type IP6TEntry struct { // IPv6 is used to filter packets based on the IPv6 header. IPv6 IP6TIP // NFCache relates to kernel-internal caching and isn't used by // userspace. 
NFCache uint32 // TargetOffset is the byte offset from the beginning of this IPTEntry // to the start of the entry's target. TargetOffset uint16 // NextOffset is the byte offset from the beginning of this IPTEntry to // the start of the next entry. It is thus also the size of the entry. NextOffset uint16 // Comeback is a return pointer. It is not used by userspace. Comeback uint32 _ [4]byte // Counters holds the packet and byte counts for this rule. Counters XTCounters // Elems holds the data for all this rule's matches followed by the // target. It is variable length -- users have to iterate over any // matches and use TargetOffset and NextOffset to make sense of the // data. // // Elems is omitted here because it would cause IPTEntry to be an extra // byte larger (see http://www.catb.org/esr/structure-packing/). // // Elems [0]byte } // SizeOfIP6TEntry is the size of an IP6TEntry. const SizeOfIP6TEntry = 168 // KernelIP6TEntry is identical to IP6TEntry, but includes the Elems field. // // +marshal dynamic type KernelIP6TEntry struct { Entry IP6TEntry // Elems holds the data for all this rule's matches followed by the // target. It is variable length -- users have to iterate over any // matches and use TargetOffset and NextOffset to make sense of the // data. Elems primitive.ByteSlice } // SizeBytes implements marshal.Marshallable.SizeBytes. func (ke *KernelIP6TEntry) SizeBytes() int { return ke.Entry.SizeBytes() + ke.Elems.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (ke *KernelIP6TEntry) MarshalBytes(dst []byte) []byte { dst = ke.Entry.MarshalUnsafe(dst) return ke.Elems.MarshalBytes(dst) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (ke *KernelIP6TEntry) UnmarshalBytes(src []byte) []byte { src = ke.Entry.UnmarshalUnsafe(src) return ke.Elems.UnmarshalBytes(src) } var _ marshal.Marshallable = (*KernelIP6TEntry)(nil) // IP6TIP contains information for matching a packet's IP header. // It corresponds to struct ip6t_ip6 in // include/uapi/linux/netfilter_ipv6/ip6_tables.h. // // +marshal type IP6TIP struct { // Src is the source IP address. Src Inet6Addr // Dst is the destination IP address. Dst Inet6Addr // SrcMask is the source IP mask. SrcMask Inet6Addr // DstMask is the destination IP mask. DstMask Inet6Addr // InputInterface is the input network interface. InputInterface [IFNAMSIZ]byte // OutputInterface is the output network interface. OutputInterface [IFNAMSIZ]byte // InputInterfaceMask is the input interface mask. InputInterfaceMask [IFNAMSIZ]byte // OuputInterfaceMask is the output interface mask. OutputInterfaceMask [IFNAMSIZ]byte // Protocol is the transport protocol. Protocol uint16 // TOS matches TOS flags when Flags indicates filtering by TOS. TOS uint8 // Flags define matching behavior for the IP header. Flags uint8 // InverseFlags invert the meaning of fields in struct IPTIP. See the // IP6T_INV_* flags. InverseFlags uint8 // Linux defines in6_addr (Inet6Addr for us) as the union of a // 16-element byte array and a 4-element 32-bit integer array, so the // whole struct is 4-byte aligned. _ [3]byte } // SizeOfIP6TIP is the size of an IP6 header. const SizeOfIP6TIP = 136 // Flags in IP6TIP.Flags. Corresponding constants are in // include/uapi/linux/netfilter_ipv6/ip6_tables.h. const ( // Whether to check the Protocol field. IP6T_F_PROTO = 0x01 // Whether to match the TOS field. IP6T_F_TOS = 0x02 // Indicates that the jump target is an absolute GOTO, not an offset. IP6T_F_GOTO = 0x04 // Enables all flags. 
IP6T_F_MASK = 0x07 ) // Flags in IP6TIP.InverseFlags. Corresponding constants are in // include/uapi/linux/netfilter_ipv6/ip6_tables.h. const ( // Invert the meaning of InputInterface. IP6T_INV_VIA_IN = 0x01 // Invert the meaning of OutputInterface. IP6T_INV_VIA_OUT = 0x02 // Invert the meaning of TOS. IP6T_INV_TOS = 0x04 // Invert the meaning of Src. IP6T_INV_SRCIP = 0x08 // Invert the meaning of Dst. IP6T_INV_DSTIP = 0x10 // Invert the meaning of the IPT_F_FRAG flag. IP6T_INV_FRAG = 0x20 // Enable all flags. IP6T_INV_MASK = 0x7F ) // NFNATRange corresponds to struct nf_nat_range in // include/uapi/linux/netfilter/nf_nat.h. // // +marshal type NFNATRange struct { Flags uint32 MinAddr Inet6Addr MaxAddr Inet6Addr MinProto uint16 // Network byte order. MaxProto uint16 // Network byte order. } // SizeOfNFNATRange is the size of NFNATRange. const SizeOfNFNATRange = 40 // NFNATRange2 corresponds to struct nf_nat_range2 in // include/uapi/linux/netfilter/nf_nat.h. // // +marshal type NFNATRange2 struct { Flags uint32 MinAddr Inet6Addr MaxAddr Inet6Addr MinProto uint16 // Network byte order. MaxProto uint16 // Network byte order. BaseProto uint16 // Network byte order. _ [6]byte } // SizeOfNFNATRange2 is the size of NFNATRange2. const SizeOfNFNATRange2 = 48 golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/netlink.go000066400000000000000000000067211465435605700233170ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Netlink protocols, from uapi/linux/netlink.h. const ( NETLINK_ROUTE = 0 NETLINK_UNUSED = 1 NETLINK_USERSOCK = 2 NETLINK_FIREWALL = 3 NETLINK_SOCK_DIAG = 4 NETLINK_NFLOG = 5 NETLINK_XFRM = 6 NETLINK_SELINUX = 7 NETLINK_ISCSI = 8 NETLINK_AUDIT = 9 NETLINK_FIB_LOOKUP = 10 NETLINK_CONNECTOR = 11 NETLINK_NETFILTER = 12 NETLINK_IP6_FW = 13 NETLINK_DNRTMSG = 14 NETLINK_KOBJECT_UEVENT = 15 NETLINK_GENERIC = 16 NETLINK_SCSITRANSPORT = 18 NETLINK_ECRYPTFS = 19 NETLINK_RDMA = 20 NETLINK_CRYPTO = 21 ) // SockAddrNetlink is struct sockaddr_nl, from uapi/linux/netlink.h. // // +marshal type SockAddrNetlink struct { Family uint16 _ uint16 PortID uint32 Groups uint32 } // SockAddrNetlinkSize is the size of SockAddrNetlink. const SockAddrNetlinkSize = 12 // NetlinkMessageHeader is struct nlmsghdr, from uapi/linux/netlink.h. // // +marshal type NetlinkMessageHeader struct { Length uint32 Type uint16 Flags uint16 Seq uint32 PortID uint32 } // NetlinkMessageHeaderSize is the size of NetlinkMessageHeader. const NetlinkMessageHeaderSize = 16 // Netlink message header flags, from uapi/linux/netlink.h. const ( NLM_F_REQUEST = 0x1 NLM_F_MULTI = 0x2 NLM_F_ACK = 0x4 NLM_F_ECHO = 0x8 NLM_F_DUMP_INTR = 0x10 NLM_F_ROOT = 0x100 NLM_F_MATCH = 0x200 NLM_F_ATOMIC = 0x400 NLM_F_DUMP = NLM_F_ROOT | NLM_F_MATCH NLM_F_REPLACE = 0x100 NLM_F_EXCL = 0x200 NLM_F_CREATE = 0x400 NLM_F_APPEND = 0x800 ) // Standard netlink message types, from uapi/linux/netlink.h. 
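// A minimal illustrative sketch (not part of the upstream file; the helper
// name and sequence number are hypothetical): a netlink dump request sets
// NLM_F_REQUEST|NLM_F_DUMP in the header flags, with Type carrying one of the
// message type constants (NLMSG_* below, or the RTM_* values used on
// NETLINK_ROUTE sockets).
func exampleDumpHeader(msgType uint16) NetlinkMessageHeader {
	return NetlinkMessageHeader{
		Length: NetlinkMessageHeaderSize,
		Type:   msgType,
		Flags:  NLM_F_REQUEST | NLM_F_DUMP,
		Seq:    1,
	}
}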
const ( NLMSG_NOOP = 0x1 NLMSG_ERROR = 0x2 NLMSG_DONE = 0x3 NLMSG_OVERRUN = 0x4 // NLMSG_MIN_TYPE is the first value for protocol-level types. NLMSG_MIN_TYPE = 0x10 ) // NLMSG_ALIGNTO is the alignment of netlink messages, from // uapi/linux/netlink.h. const NLMSG_ALIGNTO = 4 // NetlinkAttrHeader is the header of a netlink attribute, followed by payload. // // This is struct nlattr, from uapi/linux/netlink.h. // // +marshal type NetlinkAttrHeader struct { Length uint16 Type uint16 } // NetlinkAttrHeaderSize is the size of NetlinkAttrHeader. const NetlinkAttrHeaderSize = 4 // NLA_ALIGNTO is the alignment of netlink attributes, from // uapi/linux/netlink.h. const NLA_ALIGNTO = 4 // Socket options, from uapi/linux/netlink.h. const ( NETLINK_ADD_MEMBERSHIP = 1 NETLINK_DROP_MEMBERSHIP = 2 NETLINK_PKTINFO = 3 NETLINK_BROADCAST_ERROR = 4 NETLINK_NO_ENOBUFS = 5 NETLINK_LISTEN_ALL_NSID = 8 NETLINK_LIST_MEMBERSHIPS = 9 NETLINK_CAP_ACK = 10 NETLINK_EXT_ACK = 11 NETLINK_DUMP_STRICT_CHK = 12 ) // NetlinkErrorMessage is struct nlmsgerr, from uapi/linux/netlink.h. // // +marshal type NetlinkErrorMessage struct { Error int32 Header NetlinkMessageHeader } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/netlink_route.go000066400000000000000000000176751465435605700245470ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Netlink message types for NETLINK_ROUTE sockets, from uapi/linux/rtnetlink.h. const ( RTM_NEWLINK = 16 RTM_DELLINK = 17 RTM_GETLINK = 18 RTM_SETLINK = 19 RTM_NEWADDR = 20 RTM_DELADDR = 21 RTM_GETADDR = 22 RTM_NEWROUTE = 24 RTM_DELROUTE = 25 RTM_GETROUTE = 26 RTM_NEWNEIGH = 28 RTM_DELNEIGH = 29 RTM_GETNEIGH = 30 RTM_NEWRULE = 32 RTM_DELRULE = 33 RTM_GETRULE = 34 RTM_NEWQDISC = 36 RTM_DELQDISC = 37 RTM_GETQDISC = 38 RTM_NEWTCLASS = 40 RTM_DELTCLASS = 41 RTM_GETTCLASS = 42 RTM_NEWTFILTER = 44 RTM_DELTFILTER = 45 RTM_GETTFILTER = 46 RTM_NEWACTION = 48 RTM_DELACTION = 49 RTM_GETACTION = 50 RTM_NEWPREFIX = 52 RTM_GETMULTICAST = 58 RTM_GETANYCAST = 62 RTM_NEWNEIGHTBL = 64 RTM_GETNEIGHTBL = 66 RTM_SETNEIGHTBL = 67 RTM_NEWNDUSEROPT = 68 RTM_NEWADDRLABEL = 72 RTM_DELADDRLABEL = 73 RTM_GETADDRLABEL = 74 RTM_GETDCB = 78 RTM_SETDCB = 79 RTM_NEWNETCONF = 80 RTM_GETNETCONF = 82 RTM_NEWMDB = 84 RTM_DELMDB = 85 RTM_GETMDB = 86 RTM_NEWNSID = 88 RTM_DELNSID = 89 RTM_GETNSID = 90 ) // InterfaceInfoMessage is struct ifinfomsg, from uapi/linux/rtnetlink.h. // // +marshal type InterfaceInfoMessage struct { Family uint8 _ uint8 Type uint16 Index int32 Flags uint32 Change uint32 } // InterfaceInfoMessageSize is the size of InterfaceInfoMessage. const InterfaceInfoMessageSize = 16 // Interface flags, from uapi/linux/if.h. 
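// A minimal illustrative sketch (not part of the upstream file; the helper
// name is hypothetical): the Flags field of InterfaceInfoMessage carries
// these IFF_* bits, so a link that is administratively up and operational
// has both IFF_UP and IFF_RUNNING set.
func exampleLinkIsUp(msg InterfaceInfoMessage) bool {
	return msg.Flags&(IFF_UP|IFF_RUNNING) == (IFF_UP | IFF_RUNNING)
}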
const ( IFF_UP = 1 << 0 IFF_BROADCAST = 1 << 1 IFF_DEBUG = 1 << 2 IFF_LOOPBACK = 1 << 3 IFF_POINTOPOINT = 1 << 4 IFF_NOTRAILERS = 1 << 5 IFF_RUNNING = 1 << 6 IFF_NOARP = 1 << 7 IFF_PROMISC = 1 << 8 IFF_ALLMULTI = 1 << 9 IFF_MASTER = 1 << 10 IFF_SLAVE = 1 << 11 IFF_MULTICAST = 1 << 12 IFF_PORTSEL = 1 << 13 IFF_AUTOMEDIA = 1 << 14 IFF_DYNAMIC = 1 << 15 IFF_LOWER_UP = 1 << 16 IFF_DORMANT = 1 << 17 IFF_ECHO = 1 << 18 ) // Interface link attributes, from uapi/linux/if_link.h. const ( IFLA_UNSPEC = 0 IFLA_ADDRESS = 1 IFLA_BROADCAST = 2 IFLA_IFNAME = 3 IFLA_MTU = 4 IFLA_LINK = 5 IFLA_QDISC = 6 IFLA_STATS = 7 IFLA_COST = 8 IFLA_PRIORITY = 9 IFLA_MASTER = 10 IFLA_WIRELESS = 11 IFLA_PROTINFO = 12 IFLA_TXQLEN = 13 IFLA_MAP = 14 IFLA_WEIGHT = 15 IFLA_OPERSTATE = 16 IFLA_LINKMODE = 17 IFLA_LINKINFO = 18 IFLA_NET_NS_PID = 19 IFLA_IFALIAS = 20 IFLA_NUM_VF = 21 IFLA_VFINFO_LIST = 22 IFLA_STATS64 = 23 IFLA_VF_PORTS = 24 IFLA_PORT_SELF = 25 IFLA_AF_SPEC = 26 IFLA_GROUP = 27 IFLA_NET_NS_FD = 28 IFLA_EXT_MASK = 29 IFLA_PROMISCUITY = 30 IFLA_NUM_TX_QUEUES = 31 IFLA_NUM_RX_QUEUES = 32 IFLA_CARRIER = 33 IFLA_PHYS_PORT_ID = 34 IFLA_CARRIER_CHANGES = 35 IFLA_PHYS_SWITCH_ID = 36 IFLA_LINK_NETNSID = 37 IFLA_PHYS_PORT_NAME = 38 IFLA_PROTO_DOWN = 39 IFLA_GSO_MAX_SEGS = 40 IFLA_GSO_MAX_SIZE = 41 ) // Interface link info attributes, from uapi/linux/if_link.h. const ( IFLA_INFO_UNSPEC = 0 IFLA_INFO_KIND = 1 IFLA_INFO_DATA = 2 IFLA_INFO_XSTATS = 3 IFLA_INFO_SLAVE_KIND = 4 IFLA_INFO_SLAVE_DATA = 5 ) // Virtuall ethernet attributes, from uapi/linux/veth.h. const ( VETH_INFO_PEER = 1 ) // InterfaceAddrMessage is struct ifaddrmsg, from uapi/linux/if_addr.h. // // +marshal type InterfaceAddrMessage struct { Family uint8 PrefixLen uint8 Flags uint8 Scope uint8 Index uint32 } // InterfaceAddrMessageSize is the size of InterfaceAddrMessage. const InterfaceAddrMessageSize = 8 // Interface attributes, from uapi/linux/if_addr.h. const ( IFA_UNSPEC = 0 IFA_ADDRESS = 1 IFA_LOCAL = 2 IFA_LABEL = 3 IFA_BROADCAST = 4 IFA_ANYCAST = 5 IFA_CACHEINFO = 6 IFA_MULTICAST = 7 IFA_FLAGS = 8 ) // Device types, from uapi/linux/if_arp.h. const ( ARPHRD_NONE = 65534 ARPHRD_ETHER = 1 ARPHRD_LOOPBACK = 772 ) // RouteMessage is struct rtmsg, from uapi/linux/rtnetlink.h. // // +marshal type RouteMessage struct { Family uint8 DstLen uint8 SrcLen uint8 TOS uint8 Table uint8 Protocol uint8 Scope uint8 Type uint8 Flags uint32 } // SizeOfRouteMessage is the size of RouteMessage. const SizeOfRouteMessage = 12 // Route types, from uapi/linux/rtnetlink.h. const ( // RTN_UNSPEC represents an unspecified route type. RTN_UNSPEC = 0 // RTN_UNICAST represents a unicast route. RTN_UNICAST = 1 // RTN_LOCAL represents a route that is accepted locally. RTN_LOCAL = 2 // RTN_BROADCAST represents a broadcast route (Traffic is accepted locally // as broadcast, and sent as broadcast). RTN_BROADCAST = 3 // RTN_ANYCAST represents a anycast route (Traffic is accepted locally as // broadcast but sent as unicast). RTN_ANYCAST = 6 // RTN_MULTICAST represents a multicast route. RTN_MULTICAST = 5 // RTN_BLACKHOLE represents a route where all traffic is dropped. RTN_BLACKHOLE = 6 // RTN_UNREACHABLE represents a route where the destination is unreachable. RTN_UNREACHABLE = 7 RTN_PROHIBIT = 8 RTN_THROW = 9 RTN_NAT = 10 RTN_XRESOLVE = 11 ) // Route protocols/origins, from uapi/linux/rtnetlink.h. 
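// A minimal illustrative sketch (not part of the upstream file; the helper
// name is hypothetical, and the address family is left to the caller): a
// RouteMessage describing a static unicast route in the main table combines
// the RTN_*, RTPROT_*, RT_SCOPE_* and RT_TABLE_* constants in this file.
func exampleStaticUnicastRoute(family, dstLen uint8) RouteMessage {
	return RouteMessage{
		Family:   family,
		DstLen:   dstLen,
		Table:    RT_TABLE_MAIN,
		Protocol: RTPROT_STATIC,
		Scope:    RT_SCOPE_UNIVERSE,
		Type:     RTN_UNICAST,
	}
}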
const ( RTPROT_UNSPEC = 0 RTPROT_REDIRECT = 1 RTPROT_KERNEL = 2 RTPROT_BOOT = 3 RTPROT_STATIC = 4 RTPROT_GATED = 8 RTPROT_RA = 9 RTPROT_MRT = 10 RTPROT_ZEBRA = 11 RTPROT_BIRD = 12 RTPROT_DNROUTED = 13 RTPROT_XORP = 14 RTPROT_NTK = 15 RTPROT_DHCP = 16 RTPROT_MROUTED = 17 RTPROT_BABEL = 42 RTPROT_BGP = 186 RTPROT_ISIS = 187 RTPROT_OSPF = 188 RTPROT_RIP = 189 RTPROT_EIGRP = 192 ) // Route scopes, from uapi/linux/rtnetlink.h. const ( RT_SCOPE_UNIVERSE = 0 RT_SCOPE_SITE = 200 RT_SCOPE_LINK = 253 RT_SCOPE_HOST = 254 RT_SCOPE_NOWHERE = 255 ) // Route flags, from uapi/linux/rtnetlink.h. const ( RTM_F_NOTIFY = 0x100 RTM_F_CLONED = 0x200 RTM_F_EQUALIZE = 0x400 RTM_F_PREFIX = 0x800 RTM_F_LOOKUP_TABLE = 0x1000 RTM_F_FIB_MATCH = 0x2000 ) // Route tables, from uapi/linux/rtnetlink.h. const ( RT_TABLE_UNSPEC = 0 RT_TABLE_COMPAT = 252 RT_TABLE_DEFAULT = 253 RT_TABLE_MAIN = 254 RT_TABLE_LOCAL = 255 ) // Route attributes, from uapi/linux/rtnetlink.h. const ( RTA_UNSPEC = 0 RTA_DST = 1 RTA_SRC = 2 RTA_IIF = 3 RTA_OIF = 4 RTA_GATEWAY = 5 RTA_PRIORITY = 6 RTA_PREFSRC = 7 RTA_METRICS = 8 RTA_MULTIPATH = 9 RTA_PROTOINFO = 10 RTA_FLOW = 11 RTA_CACHEINFO = 12 RTA_SESSION = 13 RTA_MP_ALGO = 14 RTA_TABLE = 15 RTA_MARK = 16 RTA_MFC_STATS = 17 RTA_VIA = 18 RTA_NEWDST = 19 RTA_PREF = 20 RTA_ENCAP_TYPE = 21 RTA_ENCAP = 22 RTA_EXPIRES = 23 RTA_PAD = 24 RTA_UID = 25 RTA_TTL_PROPAGATE = 26 RTA_IP_PROTO = 27 RTA_SPORT = 28 RTA_DPORT = 29 ) // Route flags, from include/uapi/linux/route.h. const ( RTF_GATEWAY = 0x2 RTF_UP = 0x1 ) // RtAttr is the header of optional addition route information, as a netlink // attribute. From include/uapi/linux/rtnetlink.h. // // +marshal type RtAttr struct { Len uint16 Type uint16 } // SizeOfRtAttr is the size of RtAttr. const SizeOfRtAttr = 4 golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/nf_tables.go000066400000000000000000000047071465435605700236120ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // This file contains constants required to support nf_tables. // 16-byte Registers that can be used to maintain state for rules. // These correspond to values in include/uapi/linux/netfilter/nf_tables.h. const ( NFT_REG_VERDICT = iota NFT_REG_1 NFT_REG_2 NFT_REG_3 NFT_REG_4 __NFT_REG_MAX ) // 4-byte Registers that can be used to maintain state for rules. // Note that these overlap with the 16-byte registers in memory. // These correspond to values in include/uapi/linux/netfilter/nf_tables.h. const ( NFT_REG32_00 = 8 + iota NFT_REG32_01 NFT_REG32_02 NFT_REG32_03 NFT_REG32_04 NFT_REG32_05 NFT_REG32_06 NFT_REG32_07 NFT_REG32_08 NFT_REG32_09 NFT_REG32_10 NFT_REG32_11 NFT_REG32_12 NFT_REG32_13 NFT_REG32_14 NFT_REG32_15 ) // Other register constants, corresponding to values in // include/uapi/linux/netfilter/nf_tables.h. 
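// An illustrative note (not part of the original gVisor source): as stated
// above, the 4-byte registers overlap the 16-byte registers in memory, so
// both views cover the same amount of register space. The constant name
// nftRegisterSpaceBytes is an assumption made for this example only.
const (
	// nftRegisterSpaceBytes is the register space visible through the 4-byte
	// view: 16 registers * 4 bytes = 64 bytes. The four 16-byte registers
	// NFT_REG_1..NFT_REG_4 span the same 64 bytes.
	nftRegisterSpaceBytes = NFT_REG32_COUNT * NFT_REG32_SIZE
)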
const ( NFT_REG_MAX = __NFT_REG_MAX - 1 // Maximum register value NFT_REG_SIZE = 16 // Size of NFT_REG NFT_REG32_SIZE = 4 // Size of NFT_REG32 NFT_REG32_COUNT = NFT_REG32_15 - NFT_REG32_00 + 1 // Count of 4-byte registers ) // Internal nf table verdicts. These are used for ruleset evaluation and // are not returned to userspace. // // These also share their numeric name space with the netfilter verdicts. When // used these values are converted to uint32 (purposefully overflowing the int). // These correspond to values in include/uapi/linux/netfilter/nf_tables.h. const ( // Continue evaluation of the current rule. NFT_CONTINUE int32 = -1 // Terminate evaluation of the current rule. NFT_BREAK = -2 // Push the current chain on the jump stack and jump to a chain. NFT_JUMP = -3 // Jump to a chain without pushing the current chain on the jump stack. NFT_GOTO = -4 // Return to the topmost chain on the jump stack. NFT_RETURN = -5 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/poll.go000066400000000000000000000024211465435605700226120ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // PollFD is struct pollfd, used by poll(2)/ppoll(2), from uapi/asm-generic/poll.h. // // +marshal slice:PollFDSlice type PollFD struct { FD int32 Events int16 REvents int16 } // Poll event flags, used by poll(2)/ppoll(2) and/or // epoll_ctl(2)/epoll_wait(2), from uapi/asm-generic/poll.h. const ( POLLIN = 0x0001 POLLPRI = 0x0002 POLLOUT = 0x0004 POLLERR = 0x0008 POLLHUP = 0x0010 POLLNVAL = 0x0020 POLLRDNORM = 0x0040 POLLRDBAND = 0x0080 POLLWRNORM = 0x0100 POLLWRBAND = 0x0200 POLLMSG = 0x0400 POLLREMOVE = 0x1000 POLLRDHUP = 0x2000 POLLFREE = 0x4000 POLL_BUSY_LOOP = 0x8000 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/prctl.go000066400000000000000000000120121465435605700227650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // PR_* flags, from for prctl(2). const ( // PR_SET_PDEATHSIG sets the process' death signal. PR_SET_PDEATHSIG = 1 // PR_GET_PDEATHSIG gets the process' death signal. PR_GET_PDEATHSIG = 2 // PR_GET_DUMPABLE gets the process' dumpable flag. PR_GET_DUMPABLE = 3 // PR_SET_DUMPABLE sets the process' dumpable flag. PR_SET_DUMPABLE = 4 // PR_GET_KEEPCAPS gets the value of the keep capabilities flag. PR_GET_KEEPCAPS = 7 // PR_SET_KEEPCAPS sets the value of the keep capabilities flag. 
PR_SET_KEEPCAPS = 8 // PR_GET_TIMING gets the process' timing method. PR_GET_TIMING = 13 // PR_SET_TIMING sets the process' timing method. PR_SET_TIMING = 14 // PR_SET_NAME sets the process' name. PR_SET_NAME = 15 // PR_GET_NAME gets the process' name. PR_GET_NAME = 16 // PR_GET_SECCOMP gets a process' seccomp mode. PR_GET_SECCOMP = 21 // PR_SET_SECCOMP sets a process' seccomp mode. PR_SET_SECCOMP = 22 // PR_CAPBSET_READ gets the capability bounding set. PR_CAPBSET_READ = 23 // PR_CAPBSET_DROP sets the capability bounding set. PR_CAPBSET_DROP = 24 // PR_GET_TSC gets the value of the flag determining whether the // timestamp counter can be read. PR_GET_TSC = 25 // PR_SET_TSC sets the value of the flag determining whether the // timestamp counter can be read. PR_SET_TSC = 26 // PR_SET_TIMERSLACK sets the process' time slack. PR_SET_TIMERSLACK = 29 // PR_GET_TIMERSLACK gets the process' time slack. PR_GET_TIMERSLACK = 30 // PR_TASK_PERF_EVENTS_DISABLE disables all performance counters // attached to the calling process. PR_TASK_PERF_EVENTS_DISABLE = 31 // PR_TASK_PERF_EVENTS_ENABLE enables all performance counters attached // to the calling process. PR_TASK_PERF_EVENTS_ENABLE = 32 // PR_MCE_KILL sets the machine check memory corruption kill policy for // the calling thread. PR_MCE_KILL = 33 // PR_MCE_KILL_GET gets the machine check memory corruption kill policy // for the calling thread. PR_MCE_KILL_GET = 34 // PR_SET_MM modifies certain kernel memory map descriptor fields of // the calling process. See prctl(2) for more information. PR_SET_MM = 35 PR_SET_MM_START_CODE = 1 PR_SET_MM_END_CODE = 2 PR_SET_MM_START_DATA = 3 PR_SET_MM_END_DATA = 4 PR_SET_MM_START_STACK = 5 PR_SET_MM_START_BRK = 6 PR_SET_MM_BRK = 7 PR_SET_MM_ARG_START = 8 PR_SET_MM_ARG_END = 9 PR_SET_MM_ENV_START = 10 PR_SET_MM_ENV_END = 11 PR_SET_MM_AUXV = 12 // PR_SET_MM_EXE_FILE supersedes the /proc/pid/exe symbolic link with a // new one pointing to a new executable file identified by the file // descriptor provided in arg3 argument. See prctl(2) for more // information. PR_SET_MM_EXE_FILE = 13 PR_SET_MM_MAP = 14 PR_SET_MM_MAP_SIZE = 15 // PR_SET_CHILD_SUBREAPER sets the "child subreaper" attribute of the // calling process. PR_SET_CHILD_SUBREAPER = 36 // PR_GET_CHILD_SUBREAPER gets the "child subreaper" attribute of the // calling process. PR_GET_CHILD_SUBREAPER = 37 // PR_SET_NO_NEW_PRIVS sets the calling thread's no_new_privs bit. PR_SET_NO_NEW_PRIVS = 38 // PR_GET_NO_NEW_PRIVS gets the calling thread's no_new_privs bit. PR_GET_NO_NEW_PRIVS = 39 // PR_GET_TID_ADDRESS retrieves the clear_child_tid address. PR_GET_TID_ADDRESS = 40 // PR_SET_THP_DISABLE sets the state of the "THP disable" flag for the // calling thread. PR_SET_THP_DISABLE = 41 // PR_GET_THP_DISABLE gets the state of the "THP disable" flag for the // calling thread. PR_GET_THP_DISABLE = 42 // PR_MPX_ENABLE_MANAGEMENT enables kernel management of Memory // Protection eXtensions (MPX) bounds tables. PR_MPX_ENABLE_MANAGEMENT = 43 // PR_MPX_DISABLE_MANAGEMENT disables kernel management of Memory // Protection eXtensions (MPX) bounds tables. PR_MPX_DISABLE_MANAGEMENT = 44 // The following constants are used to control thread scheduling on cores. PR_SCHED_CORE_SCOPE_THREAD = 0 PR_SCHED_CORE_SCOPE_THREAD_GROUP = 1 // PR_SET_PTRACER allows a specific process (or any, if PR_SET_PTRACER_ANY is // specified) to ptrace the current task. PR_SET_PTRACER = 0x59616d61 PR_SET_PTRACER_ANY = -1 ) // From // Flags are used in syscall arch_prctl(2). 
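// An illustrative sketch (not part of the original gVisor source): a syscall
// emulator handling arch_prctl(2) might first separate the FS/GS segment base
// operations below from CPUID faulting control. The helper name
// isSegmentBaseOp is an assumption made for this example only.
func isSegmentBaseOp(op int) bool {
	switch op {
	case ARCH_SET_FS, ARCH_GET_FS, ARCH_SET_GS, ARCH_GET_GS:
		return true
	default:
		return false
	}
}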
const ( ARCH_SET_GS = 0x1001 ARCH_SET_FS = 0x1002 ARCH_GET_FS = 0x1003 ARCH_GET_GS = 0x1004 ARCH_SET_CPUID = 0x1012 ) // Flags for prctl(PR_SET_DUMPABLE), defined in include/linux/sched/coredump.h. const ( SUID_DUMP_DISABLE = 0 SUID_DUMP_USER = 1 SUID_DUMP_ROOT = 2 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/ptrace.go000066400000000000000000000060161465435605700231260ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // ptrace commands from include/uapi/linux/ptrace.h. const ( PTRACE_TRACEME = 0 PTRACE_PEEKTEXT = 1 PTRACE_PEEKDATA = 2 PTRACE_PEEKUSR = 3 PTRACE_POKETEXT = 4 PTRACE_POKEDATA = 5 PTRACE_POKEUSR = 6 PTRACE_CONT = 7 PTRACE_KILL = 8 PTRACE_SINGLESTEP = 9 PTRACE_ATTACH = 16 PTRACE_DETACH = 17 PTRACE_SYSCALL = 24 PTRACE_SETOPTIONS = 0x4200 PTRACE_GETEVENTMSG = 0x4201 PTRACE_GETSIGINFO = 0x4202 PTRACE_SETSIGINFO = 0x4203 PTRACE_GETREGSET = 0x4204 PTRACE_SETREGSET = 0x4205 PTRACE_SEIZE = 0x4206 PTRACE_INTERRUPT = 0x4207 PTRACE_LISTEN = 0x4208 PTRACE_PEEKSIGINFO = 0x4209 PTRACE_GETSIGMASK = 0x420a PTRACE_SETSIGMASK = 0x420b PTRACE_SECCOMP_GET_FILTER = 0x420c PTRACE_SECCOMP_GET_METADATA = 0x420d ) // ptrace commands from arch/x86/include/uapi/asm/ptrace-abi.h. const ( PTRACE_GETREGS = 12 PTRACE_SETREGS = 13 PTRACE_GETFPREGS = 14 PTRACE_SETFPREGS = 15 PTRACE_GETFPXREGS = 18 PTRACE_SETFPXREGS = 19 PTRACE_OLDSETOPTIONS = 21 PTRACE_GET_THREAD_AREA = 25 PTRACE_SET_THREAD_AREA = 26 PTRACE_ARCH_PRCTL = 30 PTRACE_SYSEMU = 31 PTRACE_SYSEMU_SINGLESTEP = 32 PTRACE_SINGLEBLOCK = 33 ) // ptrace event codes from include/uapi/linux/ptrace.h. const ( PTRACE_EVENT_FORK = 1 PTRACE_EVENT_VFORK = 2 PTRACE_EVENT_CLONE = 3 PTRACE_EVENT_EXEC = 4 PTRACE_EVENT_VFORK_DONE = 5 PTRACE_EVENT_EXIT = 6 PTRACE_EVENT_SECCOMP = 7 PTRACE_EVENT_STOP = 128 ) // PTRACE_SETOPTIONS options from include/uapi/linux/ptrace.h. const ( PTRACE_O_TRACESYSGOOD = 1 PTRACE_O_TRACEFORK = 1 << PTRACE_EVENT_FORK PTRACE_O_TRACEVFORK = 1 << PTRACE_EVENT_VFORK PTRACE_O_TRACECLONE = 1 << PTRACE_EVENT_CLONE PTRACE_O_TRACEEXEC = 1 << PTRACE_EVENT_EXEC PTRACE_O_TRACEVFORKDONE = 1 << PTRACE_EVENT_VFORK_DONE PTRACE_O_TRACEEXIT = 1 << PTRACE_EVENT_EXIT PTRACE_O_TRACESECCOMP = 1 << PTRACE_EVENT_SECCOMP PTRACE_O_EXITKILL = 1 << 20 PTRACE_O_SUSPEND_SECCOMP = 1 << 21 ) // YAMA ptrace_scope levels from security/yama/yama_lsm.c. const ( YAMA_SCOPE_DISABLED = 0 YAMA_SCOPE_RELATIONAL = 1 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/ptrace_amd64.go000066400000000000000000000032001465435605700241110ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package linux // PtraceRegs is the set of CPU registers exposed by ptrace. Source: // syscall.PtraceRegs. // // +marshal // +stateify savable type PtraceRegs struct { R15 uint64 R14 uint64 R13 uint64 R12 uint64 Rbp uint64 Rbx uint64 R11 uint64 R10 uint64 R9 uint64 R8 uint64 Rax uint64 Rcx uint64 Rdx uint64 Rsi uint64 Rdi uint64 Orig_rax uint64 Rip uint64 Cs uint64 Eflags uint64 Rsp uint64 Ss uint64 Fs_base uint64 Gs_base uint64 Ds uint64 Es uint64 Fs uint64 Gs uint64 } // InstructionPointer returns the address of the next instruction to // be executed. func (p *PtraceRegs) InstructionPointer() uint64 { return p.Rip } // StackPointer returns the address of the Stack pointer. func (p *PtraceRegs) StackPointer() uint64 { return p.Rsp } // SetStackPointer sets the stack pointer to the specified value. func (p *PtraceRegs) SetStackPointer(sp uint64) { p.Rsp = sp } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/ptrace_arm64.go000066400000000000000000000037121465435605700241370ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package linux const ( //PSR bits PSR_MODE_EL0t = 0x00000000 PSR_MODE_EL1t = 0x00000004 PSR_MODE_EL1h = 0x00000005 PSR_MODE_EL2t = 0x00000008 PSR_MODE_EL2h = 0x00000009 PSR_MODE_EL3t = 0x0000000c PSR_MODE_EL3h = 0x0000000d PSR_MODE_MASK = 0x0000000f // AArch32 CPSR bits PSR_MODE32_BIT = 0x00000010 // AArch64 SPSR bits PSR_F_BIT = 0x00000040 PSR_I_BIT = 0x00000080 PSR_A_BIT = 0x00000100 PSR_D_BIT = 0x00000200 PSR_BTYPE_MASK = 0x00000c00 PSR_SSBS_BIT = 0x00001000 PSR_PAN_BIT = 0x00400000 PSR_UAO_BIT = 0x00800000 PSR_DIT_BIT = 0x01000000 PSR_TCO_BIT = 0x02000000 PSR_V_BIT = 0x10000000 PSR_C_BIT = 0x20000000 PSR_Z_BIT = 0x40000000 PSR_N_BIT = 0x80000000 ) // PtraceRegs is the set of CPU registers exposed by ptrace. Source: // syscall.PtraceRegs. // // +marshal // +stateify savable type PtraceRegs struct { Regs [31]uint64 Sp uint64 Pc uint64 Pstate uint64 } // InstructionPointer returns the address of the next instruction to be // executed. func (p *PtraceRegs) InstructionPointer() uint64 { return p.Pc } // StackPointer returns the address of the Stack pointer. func (p *PtraceRegs) StackPointer() uint64 { return p.Sp } // SetStackPointer sets the stack pointer to the specified value. func (p *PtraceRegs) SetStackPointer(sp uint64) { p.Sp = sp } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/rseq.go000066400000000000000000000101411465435605700226140ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Flags passed to rseq(2). // // Defined in include/uapi/linux/rseq.h. const ( // RSEQ_FLAG_UNREGISTER unregisters the current thread. RSEQ_FLAG_UNREGISTER = 1 << 0 ) // Critical section flags used in RSeqCriticalSection.Flags and RSeq.Flags. // // Defined in include/uapi/linux/rseq.h. const ( // RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT inhibits restart on preemption. RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = 1 << 0 // RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL inhibits restart on signal // delivery. RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = 1 << 1 // RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE inhibits restart on CPU // migration. RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = 1 << 2 ) // RSeqCriticalSection describes a restartable sequences critical section. It // is equivalent to struct rseq_cs, defined in include/uapi/linux/rseq.h. // // In userspace, this structure is always aligned to 32 bytes. // // +marshal type RSeqCriticalSection struct { // Version is the version of this structure. Version 0 is defined here. Version uint32 // Flags are the critical section flags, defined above. Flags uint32 // Start is the start address of the critical section. Start uint64 // PostCommitOffset is the offset from Start of the first instruction // outside of the critical section. PostCommitOffset uint64 // Abort is the abort address. It must be outside the critical section, // and the 4 bytes prior must match the abort signature. Abort uint64 } const ( // SizeOfRSeqCriticalSection is the size of RSeqCriticalSection. SizeOfRSeqCriticalSection = 32 // SizeOfRSeqSignature is the size of the signature immediately // preceding RSeqCriticalSection.Abort. SizeOfRSeqSignature = 4 ) // Special values for RSeq.CPUID, defined in include/uapi/linux/rseq.h. const ( // RSEQ_CPU_ID_UNINITIALIZED indicates that this thread has not // performed rseq initialization. RSEQ_CPU_ID_UNINITIALIZED = ^uint32(0) // -1 // RSEQ_CPU_ID_REGISTRATION_FAILED indicates that rseq initialization // failed. RSEQ_CPU_ID_REGISTRATION_FAILED = ^uint32(1) // -2 ) // RSeq is the thread-local restartable sequences config/status. It // is equivalent to struct rseq, defined in include/uapi/linux/rseq.h. // // In userspace, this structure is always aligned to 32 bytes. type RSeq struct { // CPUIDStart contains the current CPU ID if rseq is initialized. // // This field should only be read by the thread which registered this // structure, and must be read atomically. CPUIDStart uint32 // CPUID contains the current CPU ID or one of the CPU ID special // values defined above. // // This field should only be read by the thread which registered this // structure, and must be read atomically. CPUID uint32 // RSeqCriticalSection is a pointer to the current RSeqCriticalSection // block, or NULL. It is reset to NULL by the kernel on restart or // non-restarting preempt/signal. // // This field should only be written by the thread which registered // this structure, and must be written atomically. 
RSeqCriticalSection uint64 // Flags are the critical section flags that apply to all critical // sections on this thread, defined above. Flags uint32 } const ( // SizeOfRSeq is the size of RSeq. // // Note that RSeq is naively 24 bytes. However, it has 32-byte // alignment, which in C increases sizeof to 32. That is the size that // the Linux kernel uses. SizeOfRSeq = 32 // AlignOfRSeq is the standard alignment of RSeq. AlignOfRSeq = 32 // OffsetOfRSeqCriticalSection is the offset of RSeqCriticalSection in RSeq. OffsetOfRSeqCriticalSection = 8 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/rusage.go000066400000000000000000000022711465435605700231350ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Flags that may be used with wait4(2) and getrusage(2). const ( // wait4(2) uses this to aggregate RUSAGE_SELF and RUSAGE_CHILDREN. RUSAGE_BOTH = -0x2 // getrusage(2) flags. RUSAGE_CHILDREN = -0x1 RUSAGE_SELF = 0x0 RUSAGE_THREAD = 0x1 ) // Rusage represents the Linux struct rusage. // // +marshal type Rusage struct { UTime Timeval STime Timeval MaxRSS int64 IXRSS int64 IDRSS int64 ISRSS int64 MinFlt int64 MajFlt int64 NSwap int64 InBlock int64 OuBlock int64 MsgSnd int64 MsgRcv int64 NSignals int64 NVCSw int64 NIvCSw int64 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/sched.go000066400000000000000000000021151465435605700227320ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Scheduling policies, exposed by sched_getscheduler(2)/sched_setscheduler(2). const ( SCHED_NORMAL = 0 SCHED_FIFO = 1 SCHED_RR = 2 SCHED_BATCH = 3 SCHED_IDLE = 5 SCHED_DEADLINE = 6 SCHED_MICROQ = 16 // SCHED_RESET_ON_FORK is a flag that indicates that the process is // reverted back to SCHED_NORMAL on fork. SCHED_RESET_ON_FORK = 0x40000000 ) // Scheduling priority group selectors. const ( PRIO_PGRP = 0x1 PRIO_PROCESS = 0x0 PRIO_USER = 0x2 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/seccomp.go000066400000000000000000000107021465435605700232760ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import "fmt" // Seccomp constants taken from . const ( SECCOMP_MODE_NONE = 0 SECCOMP_MODE_FILTER = 2 SECCOMP_RET_ACTION_FULL = 0xffff0000 SECCOMP_RET_ACTION = 0x7fff0000 SECCOMP_RET_DATA = 0x0000ffff SECCOMP_SET_MODE_FILTER = 1 SECCOMP_GET_ACTION_AVAIL = 2 SECCOMP_GET_NOTIF_SIZES = 3 SECCOMP_FILTER_FLAG_TSYNC = 1 SECCOMP_FILTER_FLAG_NEW_LISTENER = 1 << 3 SECCOMP_USER_NOTIF_FLAG_CONTINUE = 1 SECCOMP_IOCTL_NOTIF_RECV = 0xc0502100 SECCOMP_IOCTL_NOTIF_SEND = 0xc0182101 SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104 SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP = 1 ) // BPFAction is an action for a BPF filter. type BPFAction uint32 // BPFAction definitions. const ( SECCOMP_RET_KILL_PROCESS BPFAction = 0x80000000 SECCOMP_RET_KILL_THREAD BPFAction = 0x00000000 SECCOMP_RET_TRAP BPFAction = 0x00030000 SECCOMP_RET_ERRNO BPFAction = 0x00050000 SECCOMP_RET_TRACE BPFAction = 0x7ff00000 SECCOMP_RET_USER_NOTIF BPFAction = 0x7fc00000 SECCOMP_RET_ALLOW BPFAction = 0x7fff0000 ) func (a BPFAction) String() string { switch a & SECCOMP_RET_ACTION_FULL { case SECCOMP_RET_KILL_PROCESS: return "kill process" case SECCOMP_RET_KILL_THREAD: return "kill thread" case SECCOMP_RET_TRAP: data := a.Data() if data == 0 { return "trap" } return fmt.Sprintf("trap (data=%#x)", data) case SECCOMP_RET_ERRNO: return fmt.Sprintf("return errno=%#x", a.Data()) case SECCOMP_RET_TRACE: data := a.Data() if data == 0 { return "trace" } return fmt.Sprintf("trace (data=%#x)", data) case SECCOMP_RET_ALLOW: return "allow" case SECCOMP_RET_USER_NOTIF: return "unotify" } return fmt.Sprintf("invalid action: %#x", a) } // Data returns the SECCOMP_RET_DATA portion of the action. func (a BPFAction) Data() uint16 { return uint16(a & SECCOMP_RET_DATA) } // WithReturnCode sets the lower 16 bits of the SECCOMP_RET_ERRNO or // SECCOMP_RET_TRACE actions to the provided return code, overwriting the previous // action, and returns a new BPFAction. If not SECCOMP_RET_ERRNO or // SECCOMP_RET_TRACE then this panics. func (a BPFAction) WithReturnCode(code uint16) BPFAction { // mask out the previous return value baseAction := a & SECCOMP_RET_ACTION_FULL if baseAction == SECCOMP_RET_ERRNO || baseAction == SECCOMP_RET_TRACE { return BPFAction(uint32(baseAction) | uint32(code)) } panic("WithReturnCode only valid for SECCOMP_RET_ERRNO and SECCOMP_RET_TRACE") } // SockFprog is sock_fprog taken from . type SockFprog struct { Len uint16 pad [6]byte Filter *BPFInstruction } // SeccompData is equivalent to struct seccomp_data, which contains the data // passed to seccomp-bpf filters. // // +marshal type SeccompData struct { // Nr is the system call number. Nr int32 // Arch is an AUDIT_ARCH_* value indicating the system call convention. Arch uint32 // InstructionPointer is the value of the instruction pointer at the time // of the system call. InstructionPointer uint64 // Args contains the first 6 system call arguments. Args [6]uint64 } // SeccompNotifResp is equivalent to struct seccomp_notif_resp. // // +marshal type SeccompNotifResp struct { ID uint64 Val int64 Error int32 Flags uint32 } // SeccompNotifSizes is equivalent to struct seccomp_notif_sizes. 
// // +marshal type SeccompNotifSizes struct { Notif uint16 Notif_resp uint16 Data uint16 } // SeccompNotif is equivalent to struct seccomp_notif. // // +marshal type SeccompNotif struct { ID uint64 Pid int32 Flags uint32 Data SeccompData } // String returns a human-friendly representation of this `SeccompData`. func (sd SeccompData) String() string { return fmt.Sprintf( "sysno=%d arch=%#x rip=%#x args=[%#x %#x %#x %#x %#x %#x]", sd.Nr, sd.Arch, sd.InstructionPointer, sd.Args[0], sd.Args[1], sd.Args[2], sd.Args[3], sd.Args[4], sd.Args[5], ) } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/sem.go000066400000000000000000000031661465435605700224370ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // semctl Command Definitions. Source: include/uapi/linux/sem.h const ( GETPID = 11 GETVAL = 12 GETALL = 13 GETNCNT = 14 GETZCNT = 15 SETVAL = 16 SETALL = 17 ) // ipcs ctl cmds. Source: include/uapi/linux/sem.h const ( SEM_STAT = 18 SEM_INFO = 19 SEM_STAT_ANY = 20 ) // Information about system-wide semaphore limits and parameters. // // Source: include/uapi/linux/sem.h const ( SEMMNI = 32000 SEMMSL = 32000 SEMMNS = SEMMNI * SEMMSL SEMOPM = 500 SEMVMX = 32767 SEMAEM = SEMVMX SEMUME = SEMOPM SEMMNU = SEMMNS SEMMAP = SEMMNS SEMUSZ = 20 ) // Semaphore flags. const ( SEM_UNDO = 0x1000 ) // Sembuf is equivalent to struct sembuf. // // +marshal slice:SembufSlice type Sembuf struct { SemNum uint16 SemOp int16 SemFlg int16 } // SemInfo is equivalent to struct seminfo. // // Source: include/uapi/linux/sem.h // // +marshal type SemInfo struct { SemMap uint32 SemMni uint32 SemMns uint32 SemMnu uint32 SemMsl uint32 SemOpm uint32 SemUme uint32 SemUsz uint32 SemVmx uint32 SemAem uint32 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/sem_amd64.go000066400000000000000000000016241465435605700234270ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package linux // SemidDS is equivalent to struct semid64_ds. // // Source: arch/x86/include/uapi/asm/sembuf.h // // +marshal type SemidDS struct { SemPerm IPCPerm SemOTime TimeT unused1 uint64 SemCTime TimeT unused2 uint64 SemNSems uint64 unused3 uint64 unused4 uint64 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/sem_arm64.go000066400000000000000000000015611465435605700234450ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package linux // SemidDS is equivalent to struct semid64_ds. // // Source: include/uapi/asm-generic/sembuf.h // // +marshal type SemidDS struct { SemPerm IPCPerm SemOTime TimeT SemCTime TimeT SemNSems uint64 unused3 uint64 unused4 uint64 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/shm.go000066400000000000000000000050001465435605700224270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import "math" // shmat(2) flags. Source: include/uapi/linux/shm.h const ( SHM_RDONLY = 010000 // Read-only access. SHM_RND = 020000 // Round attach address to SHMLBA boundary. SHM_REMAP = 040000 // Take-over region on attach. SHM_EXEC = 0100000 // Execution access. ) // IPCPerm.Mode upper byte flags. Source: include/linux/shm.h const ( SHM_DEST = 01000 // Segment will be destroyed on last detach. SHM_LOCKED = 02000 // Segment will not be swapped. SHM_HUGETLB = 04000 // Segment will use huge TLB pages. SHM_NORESERVE = 010000 // Don't check for reservations. ) // Additional Linux-only flags for shmctl(2). Source: include/uapi/linux/shm.h const ( SHM_LOCK = 11 SHM_UNLOCK = 12 SHM_STAT = 13 SHM_INFO = 14 ) // SHM defaults as specified by linux. Source: include/uapi/linux/shm.h const ( SHMMIN = 1 SHMMNI = 4096 SHMMAX = math.MaxUint64 - 1<<24 SHMALL = math.MaxUint64 - 1<<24 SHMSEG = 4096 ) // ShmidDS is equivalent to struct shmid64_ds. Source: // include/uapi/asm-generic/shmbuf.h // // +marshal type ShmidDS struct { ShmPerm IPCPerm ShmSegsz uint64 ShmAtime TimeT ShmDtime TimeT ShmCtime TimeT ShmCpid int32 ShmLpid int32 ShmNattach uint64 Unused4 uint64 Unused5 uint64 } // ShmParams is equivalent to struct shminfo. Source: include/uapi/linux/shm.h // // +marshal type ShmParams struct { ShmMax uint64 ShmMin uint64 ShmMni uint64 ShmSeg uint64 ShmAll uint64 } // ShmInfo is equivalent to struct shm_info. Source: include/uapi/linux/shm.h // // +marshal type ShmInfo struct { UsedIDs int32 // Number of currently existing segments. _ [4]byte ShmTot uint64 // Total number of shared memory pages. ShmRss uint64 // Number of resident shared memory pages. ShmSwp uint64 // Number of swapped shared memory pages. SwapAttempts uint64 // Unused since Linux 2.4. SwapSuccesses uint64 // Unused since Linux 2.4. 
} golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/signal.go000066400000000000000000000334541465435605700231330ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/hostarch" ) const ( // SignalMaximum is the highest valid signal number. SignalMaximum = 64 // FirstStdSignal is the lowest standard signal number. FirstStdSignal = 1 // LastStdSignal is the highest standard signal number. LastStdSignal = 31 // FirstRTSignal is the lowest real-time signal number. // // 32 (SIGCANCEL) and 33 (SIGSETXID) are used internally by glibc. FirstRTSignal = 32 // LastRTSignal is the highest real-time signal number. LastRTSignal = 64 // NumStdSignals is the number of standard signals. NumStdSignals = LastStdSignal - FirstStdSignal + 1 // NumRTSignals is the number of realtime signals. NumRTSignals = LastRTSignal - FirstRTSignal + 1 ) // Signal is a signal number. type Signal int // IsValid returns true if s is a valid standard or realtime signal. (0 is not // considered valid; interfaces special-casing signal number 0 should check for // 0 first before asserting validity.) func (s Signal) IsValid() bool { return s > 0 && s <= SignalMaximum } // IsStandard returns true if s is a standard signal. // // Preconditions: s.IsValid(). func (s Signal) IsStandard() bool { return s <= LastStdSignal } // IsRealtime returns true if s is a realtime signal. // // Preconditions: s.IsValid(). func (s Signal) IsRealtime() bool { return s >= FirstRTSignal } // Index returns the index for signal s into arrays of both standard and // realtime signals (e.g. signal masks). // // Preconditions: s.IsValid(). func (s Signal) Index() int { return int(s - 1) } // Signals. const ( SIGABRT = Signal(6) SIGALRM = Signal(14) SIGBUS = Signal(7) SIGCHLD = Signal(17) SIGCLD = Signal(17) SIGCONT = Signal(18) SIGFPE = Signal(8) SIGHUP = Signal(1) SIGILL = Signal(4) SIGINT = Signal(2) SIGIO = Signal(29) SIGIOT = Signal(6) SIGKILL = Signal(9) SIGPIPE = Signal(13) SIGPOLL = Signal(29) SIGPROF = Signal(27) SIGPWR = Signal(30) SIGQUIT = Signal(3) SIGSEGV = Signal(11) SIGSTKFLT = Signal(16) SIGSTOP = Signal(19) SIGSYS = Signal(31) SIGTERM = Signal(15) SIGTRAP = Signal(5) SIGTSTP = Signal(20) SIGTTIN = Signal(21) SIGTTOU = Signal(22) SIGUNUSED = Signal(31) SIGURG = Signal(23) SIGUSR1 = Signal(10) SIGUSR2 = Signal(12) SIGVTALRM = Signal(26) SIGWINCH = Signal(28) SIGXCPU = Signal(24) SIGXFSZ = Signal(25) ) // SignalSet is a signal mask with a bit corresponding to each signal. // // +marshal type SignalSet uint64 // SignalSetSize is the size in bytes of a SignalSet. const SignalSetSize = 8 // MakeSignalSet returns SignalSet with the bit corresponding to each of the // given signals set. 
func MakeSignalSet(sigs ...Signal) SignalSet { indices := make([]int, len(sigs)) for i, sig := range sigs { indices[i] = sig.Index() } return SignalSet(bits.Mask64(indices...)) } // SignalSetOf returns a SignalSet with a single signal set. func SignalSetOf(sig Signal) SignalSet { return SignalSet(bits.MaskOf64(sig.Index())) } // ForEachSignal invokes f for each signal that is set in the given mask. func ForEachSignal(mask SignalSet, f func(sig Signal)) { bits.ForEachSetBit64(uint64(mask), func(i int) { f(Signal(i + 1)) }) } // 'how' values for rt_sigprocmask(2). const ( // SIG_BLOCK blocks the signals in the set. SIG_BLOCK = 0 // SIG_UNBLOCK unblocks the signals in the set. SIG_UNBLOCK = 1 // SIG_SETMASK sets the signal mask to the given set. SIG_SETMASK = 2 ) // Signal actions for rt_sigaction(2), from uapi/asm-generic/signal-defs.h. const ( // SIG_DFL performs the default action. SIG_DFL = 0 // SIG_IGN ignores the signal. SIG_IGN = 1 ) // Signal action flags for rt_sigaction(2), from uapi/asm-generic/signal.h. const ( SA_NOCLDSTOP = 0x00000001 SA_NOCLDWAIT = 0x00000002 SA_SIGINFO = 0x00000004 SA_RESTORER = 0x04000000 SA_ONSTACK = 0x08000000 SA_RESTART = 0x10000000 SA_NODEFER = 0x40000000 SA_RESETHAND = 0x80000000 SA_NOMASK = SA_NODEFER SA_ONESHOT = SA_RESETHAND ) // Signal stack flags for sigaltstack(2), from include/uapi/linux/signal.h. const ( SS_ONSTACK = 1 SS_DISABLE = 2 ) // SIGPOLL si_codes. const ( // SI_POLL is defined as __SI_POLL in Linux 2.6. SI_POLL = 2 << 16 // POLL_IN indicates that data input is available. POLL_IN = SI_POLL | 1 // POLL_OUT indicates that output buffers are available. POLL_OUT = SI_POLL | 2 // POLL_MSG indicates that an input message is available. POLL_MSG = SI_POLL | 3 // POLL_ERR indicates that there was an i/o error. POLL_ERR = SI_POLL | 4 // POLL_PRI indicates that high priority input is available. POLL_PRI = SI_POLL | 5 // POLL_HUP indicates that a device was disconnected. POLL_HUP = SI_POLL | 6 ) // Possible values for si_code. const ( // SI_USER is sent by kill, sigsend, raise. SI_USER = 0 // SI_KERNEL is sent by the kernel from somewhere. SI_KERNEL = 0x80 // SI_QUEUE is sent by sigqueue. SI_QUEUE = -1 // SI_TIMER is sent by timer expiration. SI_TIMER = -2 // SI_MESGQ is sent by real time mesq state change. SI_MESGQ = -3 // SI_ASYNCIO is sent by AIO completion. SI_ASYNCIO = -4 // SI_SIGIO is sent by queued SIGIO. SI_SIGIO = -5 // SI_TKILL is sent by tkill system call. SI_TKILL = -6 // SI_DETHREAD is sent by execve() killing subsidiary threads. SI_DETHREAD = -7 // SI_ASYNCNL is sent by glibc async name lookup completion. SI_ASYNCNL = -60 ) // CLD_* codes are only meaningful for SIGCHLD. const ( // CLD_EXITED indicates that a task exited. CLD_EXITED = 1 // CLD_KILLED indicates that a task was killed by a signal. CLD_KILLED = 2 // CLD_DUMPED indicates that a task was killed by a signal and then dumped // core. CLD_DUMPED = 3 // CLD_TRAPPED indicates that a task was stopped by ptrace. CLD_TRAPPED = 4 // CLD_STOPPED indicates that a thread group completed a group stop. CLD_STOPPED = 5 // CLD_CONTINUED indicates that a group-stopped thread group was continued. CLD_CONTINUED = 6 ) // SYS_* codes are only meaningful for SIGSYS. const ( // SYS_SECCOMP indicates that a signal originates from seccomp. SYS_SECCOMP = 1 ) // Possible values for Sigevent.Notify, aka struct sigevent::sigev_notify. const ( SIGEV_SIGNAL = 0 SIGEV_NONE = 1 SIGEV_THREAD = 2 SIGEV_THREAD_ID = 4 ) // Sigevent represents struct sigevent.
// // +marshal type Sigevent struct { Value uint64 // union sigval {int, void*} Signo int32 Notify int32 // struct sigevent here contains 48-byte union _sigev_un. However, only // member _tid is significant to the kernel. Tid int32 UnRemainder [44]byte } // SigAction represents struct sigaction. // // +marshal // +stateify savable type SigAction struct { Handler uint64 Flags uint64 Restorer uint64 Mask SignalSet } // SignalStack represents information about a user stack, and is equivalent to // stack_t. // // +marshal // +stateify savable type SignalStack struct { Addr uint64 Flags uint32 _ uint32 Size uint64 } // Contains checks if the stack pointer is within this stack. func (s *SignalStack) Contains(sp hostarch.Addr) bool { return hostarch.Addr(s.Addr) < sp && sp <= hostarch.Addr(s.Addr+s.Size) } // Top returns the stack's top address. func (s *SignalStack) Top() hostarch.Addr { return hostarch.Addr(s.Addr + s.Size) } // IsEnabled returns true iff this signal stack is marked as enabled. func (s *SignalStack) IsEnabled() bool { return s.Flags&SS_DISABLE == 0 } // SignalInfo represents information about a signal being delivered, and is // equivalent to struct siginfo in linux kernel(linux/include/uapi/asm-generic/siginfo.h). // // +marshal // +stateify savable type SignalInfo struct { Signo int32 // Signal number Errno int32 // Errno value Code int32 // Signal code _ uint32 // struct siginfo::_sifields is a union. In SignalInfo, fields in the union // are accessed through methods. // // For reference, here is the definition of _sifields: (_sigfault._trapno, // which does not exist on x86, omitted for clarity) // // union { // int _pad[SI_PAD_SIZE]; // // /* kill() */ // struct { // __kernel_pid_t _pid; /* sender's pid */ // __ARCH_SI_UID_T _uid; /* sender's uid */ // } _kill; // // /* POSIX.1b timers */ // struct { // __kernel_timer_t _tid; /* timer id */ // int _overrun; /* overrun count */ // char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)]; // sigval_t _sigval; /* same as below */ // int _sys_private; /* not to be passed to user */ // } _timer; // // /* POSIX.1b signals */ // struct { // __kernel_pid_t _pid; /* sender's pid */ // __ARCH_SI_UID_T _uid; /* sender's uid */ // sigval_t _sigval; // } _rt; // // /* SIGCHLD */ // struct { // __kernel_pid_t _pid; /* which child */ // __ARCH_SI_UID_T _uid; /* sender's uid */ // int _status; /* exit code */ // __ARCH_SI_CLOCK_T _utime; // __ARCH_SI_CLOCK_T _stime; // } _sigchld; // // /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ // struct { // void *_addr; /* faulting insn/memory ref. */ // short _addr_lsb; /* LSB of the reported address */ // } _sigfault; // // /* SIGPOLL */ // struct { // __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */ // int _fd; // } _sigpoll; // // /* SIGSYS */ // struct { // void *_call_addr; /* calling user insn */ // int _syscall; /* triggering system call number */ // unsigned int _arch; /* AUDIT_ARCH_* of syscall */ // } _sigsys; // } _sifields; // // _sifields is padded so that the size of siginfo is SI_MAX_SIZE = 128 // bytes. Fields [128 - 16]byte } // FixSignalCodeForUser fixes up si_code. // // The si_code we get from Linux may contain the kernel-specific code in the // top 16 bits if it's positive (e.g., from ptrace). Linux's // copy_siginfo_to_user does: // err |= __put_user((short)from->si_code, &to->si_code); // to mask out those bits and we need to do the same. func (s *SignalInfo) FixSignalCodeForUser() { if s.Code > 0 { s.Code &= 0x0000ffff } } // PID returns the si_pid field. 
func (s *SignalInfo) PID() int32 { return int32(hostarch.ByteOrder.Uint32(s.Fields[0:4])) } // SetPID mutates the si_pid field. func (s *SignalInfo) SetPID(val int32) { hostarch.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) } // UID returns the si_uid field. func (s *SignalInfo) UID() int32 { return int32(hostarch.ByteOrder.Uint32(s.Fields[4:8])) } // SetUID mutates the si_uid field. func (s *SignalInfo) SetUID(val int32) { hostarch.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) } // Sigval returns the sigval field, which is aliased to both si_int and si_ptr. func (s *SignalInfo) Sigval() uint64 { return hostarch.ByteOrder.Uint64(s.Fields[8:16]) } // SetSigval mutates the sigval field. func (s *SignalInfo) SetSigval(val uint64) { hostarch.ByteOrder.PutUint64(s.Fields[8:16], val) } // TimerID returns the si_timerid field. func (s *SignalInfo) TimerID() TimerID { return TimerID(hostarch.ByteOrder.Uint32(s.Fields[0:4])) } // SetTimerID sets the si_timerid field. func (s *SignalInfo) SetTimerID(val TimerID) { hostarch.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) } // Overrun returns the si_overrun field. func (s *SignalInfo) Overrun() int32 { return int32(hostarch.ByteOrder.Uint32(s.Fields[4:8])) } // SetOverrun sets the si_overrun field. func (s *SignalInfo) SetOverrun(val int32) { hostarch.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) } // Addr returns the si_addr field. func (s *SignalInfo) Addr() uint64 { return hostarch.ByteOrder.Uint64(s.Fields[0:8]) } // SetAddr sets the si_addr field. func (s *SignalInfo) SetAddr(val uint64) { hostarch.ByteOrder.PutUint64(s.Fields[0:8], val) } // Status returns the si_status field. func (s *SignalInfo) Status() int32 { return int32(hostarch.ByteOrder.Uint32(s.Fields[8:12])) } // SetStatus mutates the si_status field. func (s *SignalInfo) SetStatus(val int32) { hostarch.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) } // CallAddr returns the si_call_addr field. func (s *SignalInfo) CallAddr() uint64 { return hostarch.ByteOrder.Uint64(s.Fields[0:8]) } // SetCallAddr mutates the si_call_addr field. func (s *SignalInfo) SetCallAddr(val uint64) { hostarch.ByteOrder.PutUint64(s.Fields[0:8], val) } // Syscall returns the si_syscall field. func (s *SignalInfo) Syscall() int32 { return int32(hostarch.ByteOrder.Uint32(s.Fields[8:12])) } // SetSyscall mutates the si_syscall field. func (s *SignalInfo) SetSyscall(val int32) { hostarch.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) } // Arch returns the si_arch field. func (s *SignalInfo) Arch() uint32 { return hostarch.ByteOrder.Uint32(s.Fields[12:16]) } // SetArch mutates the si_arch field. func (s *SignalInfo) SetArch(val uint32) { hostarch.ByteOrder.PutUint32(s.Fields[12:16], val) } // Band returns the si_band field. func (s *SignalInfo) Band() int64 { return int64(hostarch.ByteOrder.Uint64(s.Fields[0:8])) } // SetBand mutates the si_band field. func (s *SignalInfo) SetBand(val int64) { // Note: this assumes the platform uses `long` as `__ARCH_SI_BAND_T`. // On some platforms, which gVisor doesn't support, `__ARCH_SI_BAND_T` is // `int`. See siginfo.h. hostarch.ByteOrder.PutUint64(s.Fields[0:8], uint64(val)) } // FD returns the si_fd field. func (s *SignalInfo) FD() uint32 { return hostarch.ByteOrder.Uint32(s.Fields[8:12]) } // SetFD mutates the si_fd field. 
func (s *SignalInfo) SetFD(val uint32) { hostarch.ByteOrder.PutUint32(s.Fields[8:12], val) } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/signalfd.go000066400000000000000000000022151465435605700234340ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux const ( // SFD_NONBLOCK is a signalfd(2) flag. SFD_NONBLOCK = 00004000 // SFD_CLOEXEC is a signalfd(2) flag. SFD_CLOEXEC = 02000000 ) // SignalfdSiginfo is the siginfo encoding for signalfds. // // +marshal type SignalfdSiginfo struct { Signo uint32 Errno int32 Code int32 PID uint32 UID uint32 FD int32 TID uint32 Band uint32 Overrun uint32 TrapNo uint32 Status int32 Int int32 Ptr uint64 UTime uint64 STime uint64 Addr uint64 AddrLSB uint16 _ [48]uint8 `marshal:"unaligned"` } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/socket.go000066400000000000000000000362741465435605700231510ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/marshal" ) // Address families, from linux/socket.h. const ( AF_UNSPEC = 0 AF_UNIX = 1 AF_INET = 2 AF_AX25 = 3 AF_IPX = 4 AF_APPLETALK = 5 AF_NETROM = 6 AF_BRIDGE = 7 AF_ATMPVC = 8 AF_X25 = 9 AF_INET6 = 10 AF_ROSE = 11 AF_DECnet = 12 AF_NETBEUI = 13 AF_SECURITY = 14 AF_KEY = 15 AF_NETLINK = 16 AF_PACKET = 17 AF_ASH = 18 AF_ECONET = 19 AF_ATMSVC = 20 AF_RDS = 21 AF_SNA = 22 AF_IRDA = 23 AF_PPPOX = 24 AF_WANPIPE = 25 AF_LLC = 26 AF_IB = 27 AF_MPLS = 28 AF_CAN = 29 AF_TIPC = 30 AF_BLUETOOTH = 31 AF_IUCV = 32 AF_RXRPC = 33 AF_ISDN = 34 AF_PHONET = 35 AF_IEEE802154 = 36 AF_CAIF = 37 AF_ALG = 38 AF_NFC = 39 AF_VSOCK = 40 ) // sendmsg(2)/recvmsg(2) flags, from linux/socket.h. const ( MSG_OOB = 0x1 MSG_PEEK = 0x2 MSG_DONTROUTE = 0x4 MSG_TRYHARD = 0x4 MSG_CTRUNC = 0x8 MSG_PROBE = 0x10 MSG_TRUNC = 0x20 MSG_DONTWAIT = 0x40 MSG_EOR = 0x80 MSG_WAITALL = 0x100 MSG_FIN = 0x200 MSG_EOF = MSG_FIN MSG_SYN = 0x400 MSG_CONFIRM = 0x800 MSG_RST = 0x1000 MSG_ERRQUEUE = 0x2000 MSG_NOSIGNAL = 0x4000 MSG_MORE = 0x8000 MSG_WAITFORONE = 0x10000 MSG_SENDPAGE_NOTLAST = 0x20000 MSG_ZEROCOPY = 0x4000000 MSG_FASTOPEN = 0x20000000 MSG_CMSG_CLOEXEC = 0x40000000 ) // Set/get socket option levels, from socket.h. const ( SOL_IP = 0 SOL_SOCKET = 1 SOL_TCP = 6 SOL_UDP = 17 SOL_IPV6 = 41 SOL_ICMPV6 = 58 SOL_RAW = 255 SOL_PACKET = 263 SOL_NETLINK = 270 ) // A SockType is a type (as opposed to family) of sockets. These are enumerated // below as SOCK_* constants. 
type SockType int // Socket types, from linux/net.h. const ( SOCK_STREAM SockType = 1 SOCK_DGRAM SockType = 2 SOCK_RAW SockType = 3 SOCK_RDM SockType = 4 SOCK_SEQPACKET SockType = 5 SOCK_DCCP SockType = 6 SOCK_PACKET SockType = 10 ) // SOCK_TYPE_MASK covers all of the above socket types. The remaining bits are // flags. From linux/net.h. const SOCK_TYPE_MASK = 0xf // socket(2)/socketpair(2)/accept4(2) flags, from linux/net.h. const ( SOCK_CLOEXEC = O_CLOEXEC SOCK_NONBLOCK = O_NONBLOCK ) // shutdown(2) how commands, from . const ( SHUT_RD = 0 SHUT_WR = 1 SHUT_RDWR = 2 ) // Packet types from const ( PACKET_HOST = 0 // To us PACKET_BROADCAST = 1 // To all PACKET_MULTICAST = 2 // To group PACKET_OTHERHOST = 3 // To someone else PACKET_OUTGOING = 4 // Outgoing of any type ) // Socket options from socket.h. const ( SO_DEBUG = 1 SO_REUSEADDR = 2 SO_TYPE = 3 SO_ERROR = 4 SO_DONTROUTE = 5 SO_BROADCAST = 6 SO_SNDBUF = 7 SO_RCVBUF = 8 SO_KEEPALIVE = 9 SO_OOBINLINE = 10 SO_NO_CHECK = 11 SO_PRIORITY = 12 SO_LINGER = 13 SO_BSDCOMPAT = 14 SO_REUSEPORT = 15 SO_PASSCRED = 16 SO_PEERCRED = 17 SO_RCVLOWAT = 18 SO_SNDLOWAT = 19 SO_RCVTIMEO = 20 SO_SNDTIMEO = 21 SO_BINDTODEVICE = 25 SO_ATTACH_FILTER = 26 SO_DETACH_FILTER = 27 SO_GET_FILTER = SO_ATTACH_FILTER SO_PEERNAME = 28 SO_TIMESTAMP = 29 SO_ACCEPTCONN = 30 SO_PEERSEC = 31 SO_SNDBUFFORCE = 32 SO_RCVBUFFORCE = 33 SO_PASSSEC = 34 SO_TIMESTAMPNS = 35 SO_MARK = 36 SO_TIMESTAMPING = 37 SO_PROTOCOL = 38 SO_DOMAIN = 39 SO_RXQ_OVFL = 40 SO_WIFI_STATUS = 41 SO_PEEK_OFF = 42 SO_NOFCS = 43 SO_LOCK_FILTER = 44 SO_SELECT_ERR_QUEUE = 45 SO_BUSY_POLL = 46 SO_MAX_PACING_RATE = 47 SO_BPF_EXTENSIONS = 48 SO_INCOMING_CPU = 49 SO_ATTACH_BPF = 50 SO_ATTACH_REUSEPORT_CBPF = 51 SO_ATTACH_REUSEPORT_EBPF = 52 SO_CNX_ADVICE = 53 SO_MEMINFO = 55 SO_INCOMING_NAPI_ID = 56 SO_COOKIE = 57 SO_PEERGROUPS = 59 SO_ZEROCOPY = 60 SO_TXTIME = 61 ) // enum socket_state, from uapi/linux/net.h. const ( SS_FREE = 0 // Not allocated. SS_UNCONNECTED = 1 // Unconnected to any socket. SS_CONNECTING = 2 // In process of connecting. SS_CONNECTED = 3 // Connected to socket. SS_DISCONNECTING = 4 // In process of disconnecting. ) // TCP protocol states, from include/net/tcp_states.h. const ( TCP_ESTABLISHED uint32 = iota + 1 TCP_SYN_SENT TCP_SYN_RECV TCP_FIN_WAIT1 TCP_FIN_WAIT2 TCP_TIME_WAIT TCP_CLOSE TCP_CLOSE_WAIT TCP_LAST_ACK TCP_LISTEN TCP_CLOSING TCP_NEW_SYN_RECV ) // SockAddrMax is the maximum size of a struct sockaddr, from // uapi/linux/socket.h. const SockAddrMax = 128 // InetAddr is struct in_addr, from uapi/linux/in.h. // // +marshal type InetAddr [4]byte // SizeOfInetAddr is the size of InetAddr. var SizeOfInetAddr = uint32((*InetAddr)(nil).SizeBytes()) // SockAddrInet is struct sockaddr_in, from uapi/linux/in.h. // // +marshal type SockAddrInet struct { Family uint16 Port uint16 Addr InetAddr _ [8]uint8 // pad to sizeof(struct sockaddr). } // Inet6MulticastRequest is struct ipv6_mreq, from uapi/linux/in6.h. // // +marshal type Inet6MulticastRequest struct { MulticastAddr Inet6Addr InterfaceIndex int32 } // InetMulticastRequest is struct ip_mreq, from uapi/linux/in.h. // // +marshal type InetMulticastRequest struct { MulticastAddr InetAddr InterfaceAddr InetAddr } // InetMulticastRequestWithNIC is struct ip_mreqn, from uapi/linux/in.h. // // +marshal type InetMulticastRequestWithNIC struct { InetMulticastRequest InterfaceIndex int32 } // Inet6Addr is struct in6_addr, from uapi/linux/in6.h. // // +marshal type Inet6Addr [16]byte // SockAddrInet6 is struct sockaddr_in6, from uapi/linux/in6.h. 
// // +marshal type SockAddrInet6 struct { Family uint16 Port uint16 Flowinfo uint32 Addr [16]byte Scope_id uint32 } // SockAddrLink is a struct sockaddr_ll, from uapi/linux/if_packet.h. // // +marshal type SockAddrLink struct { Family uint16 Protocol uint16 InterfaceIndex int32 ARPHardwareType uint16 PacketType byte HardwareAddrLen byte HardwareAddr [8]byte } // UnixPathMax is the maximum length of the path in an AF_UNIX socket. // // From uapi/linux/un.h. const UnixPathMax = 108 // SockAddrUnix is struct sockaddr_un, from uapi/linux/un.h. // // +marshal type SockAddrUnix struct { Family uint16 Path [UnixPathMax]int8 } // SockAddr represents a union of valid socket address types. This is logically // equivalent to struct sockaddr. SockAddr ensures that a well-defined set of // types can be used as socket addresses. type SockAddr interface { marshal.Marshallable // implementsSockAddr exists purely to allow a type to indicate that they // implement this interface. This method is a no-op and shouldn't be called. implementsSockAddr() } func (s *SockAddrInet) implementsSockAddr() {} func (s *SockAddrInet6) implementsSockAddr() {} func (s *SockAddrLink) implementsSockAddr() {} func (s *SockAddrUnix) implementsSockAddr() {} func (s *SockAddrNetlink) implementsSockAddr() {} // Linger is struct linger, from include/linux/socket.h. // // +marshal type Linger struct { OnOff int32 Linger int32 } // SizeOfLinger is the binary size of a Linger struct. const SizeOfLinger = 8 // TCPInfo is a collection of TCP statistics. // // From uapi/linux/tcp.h. Newer versions of Linux continue to add new fields to // the end of this struct or within existing unused space, so its size grows // over time. The current iteration is based on linux v4.17. New versions are // always backwards compatible. // // +marshal type TCPInfo struct { // State is the state of the connection. State uint8 // CaState is the congestion control state. CaState uint8 // Retransmits is the number of retransmissions triggered by RTO. Retransmits uint8 // Probes is the number of unanswered zero window probes. Probes uint8 // BackOff indicates exponential backoff. Backoff uint8 // Options indicates the options enabled for the connection. Options uint8 // WindowScale is the combination of snd_wscale (first 4 bits) and // rcv_wscale (second 4 bits) WindowScale uint8 // DeliveryRateAppLimited is a boolean and only the first bit is // meaningful. DeliveryRateAppLimited uint8 // RTO is the retransmission timeout. RTO uint32 // ATO is the acknowledgement timeout interval. ATO uint32 // SndMss is the send maximum segment size. SndMss uint32 // RcvMss is the receive maximum segment size. RcvMss uint32 // Unacked is the number of packets sent but not acknowledged. Unacked uint32 // Sacked is the number of packets which are selectively acknowledged. Sacked uint32 // Lost is the number of packets marked as lost. Lost uint32 // Retrans is the number of retransmitted packets. Retrans uint32 // Fackets is not used and is always zero. Fackets uint32 // Times. LastDataSent uint32 LastAckSent uint32 LastDataRecv uint32 LastAckRecv uint32 // Metrics. PMTU uint32 RcvSsthresh uint32 RTT uint32 RTTVar uint32 SndSsthresh uint32 SndCwnd uint32 Advmss uint32 Reordering uint32 // RcvRTT is the receiver round trip time. RcvRTT uint32 // RcvSpace is the current buffer space available for receiving data. RcvSpace uint32 // TotalRetrans is the total number of retransmits seen since the start // of the connection. 
TotalRetrans uint32 // PacingRate is the pacing rate in bytes per second. PacingRate uint64 // MaxPacingRate is the maximum pacing rate. MaxPacingRate uint64 // BytesAcked is RFC4898 tcpEStatsAppHCThruOctetsAcked. BytesAcked uint64 // BytesReceived is RFC4898 tcpEStatsAppHCThruOctetsReceived. BytesReceived uint64 // SegsOut is RFC4898 tcpEStatsPerfSegsOut. SegsOut uint32 // SegsIn is RFC4898 tcpEStatsPerfSegsIn. SegsIn uint32 // NotSentBytes is the amount of bytes in the write queue that are not // yet sent. NotSentBytes uint32 // MinRTT is the minimum round trip time seen in the connection. MinRTT uint32 // DataSegsIn is RFC4898 tcpEStatsDataSegsIn. DataSegsIn uint32 // DataSegsOut is RFC4898 tcpEStatsDataSegsOut. DataSegsOut uint32 // DeliveryRate is the most recent delivery rate in bytes per second. DeliveryRate uint64 // BusyTime is the time in microseconds busy sending data. BusyTime uint64 // RwndLimited is the time in microseconds limited by receive window. RwndLimited uint64 // SndBufLimited is the time in microseconds limited by send buffer. SndBufLimited uint64 // Delivered is the total data packets delivered including retransmits. Delivered uint32 // DeliveredCE is the total ECE marked data packets delivered including // retransmits. DeliveredCE uint32 // BytesSent is RFC4898 tcpEStatsPerfHCDataOctetsOut. BytesSent uint64 // BytesRetrans is RFC4898 tcpEStatsPerfOctetsRetrans. BytesRetrans uint64 // DSACKDups is RFC4898 tcpEStatsStackDSACKDups. DSACKDups uint32 // ReordSeen is the number of reordering events seen since the start of // the connection. ReordSeen uint32 } // SizeOfTCPInfo is the binary size of a TCPInfo struct. var SizeOfTCPInfo = (*TCPInfo)(nil).SizeBytes() // Control message types, from linux/socket.h. const ( SCM_CREDENTIALS = 0x2 SCM_RIGHTS = 0x1 ) // A ControlMessageHeader is the header for a socket control message. // // ControlMessageHeader represents struct cmsghdr from linux/socket.h. // // +marshal type ControlMessageHeader struct { Length uint64 Level int32 Type int32 } // SizeOfControlMessageHeader is the binary size of a ControlMessageHeader // struct. var SizeOfControlMessageHeader = (*ControlMessageHeader)(nil).SizeBytes() // A ControlMessageCredentials is an SCM_CREDENTIALS socket control message. // // ControlMessageCredentials represents struct ucred from linux/socket.h. // // +marshal type ControlMessageCredentials struct { PID int32 UID uint32 GID uint32 } // A ControlMessageIPPacketInfo is IP_PKTINFO socket control message. // // ControlMessageIPPacketInfo represents struct in_pktinfo from linux/in.h. // // +marshal // +stateify savable type ControlMessageIPPacketInfo struct { NIC int32 LocalAddr InetAddr DestinationAddr InetAddr } // ControlMessageIPv6PacketInfo represents struct in6_pktinfo from linux/ipv6.h. // // +marshal // +stateify savable type ControlMessageIPv6PacketInfo struct { Addr Inet6Addr NIC uint32 } // SizeOfControlMessageCredentials is the binary size of a // ControlMessageCredentials struct. var SizeOfControlMessageCredentials = (*ControlMessageCredentials)(nil).SizeBytes() // SizeOfControlMessageRight is the size of a single element in // ControlMessageRights. const SizeOfControlMessageRight = 4 // SizeOfControlMessageInq is the size of a TCP_INQ control message. const SizeOfControlMessageInq = 4 // SizeOfControlMessageTOS is the size of an IP_TOS control message. const SizeOfControlMessageTOS = 1 // SizeOfControlMessageTTL is the size of an IP_TTL control message. 
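// A minimal sketch (not from the upstream source; the example* helper below is
// hypothetical) of how the control message definitions above compose: the
// header for an SCM_RIGHTS message carrying a single file descriptor covers
// the cmsghdr itself plus one 4-byte FD slot. SOL_SOCKET is assumed to be the
// socket protocol level constant defined earlier in this package.
func exampleSCMRightsHeader() ControlMessageHeader {
	return ControlMessageHeader{
		Length: uint64(SizeOfControlMessageHeader) + SizeOfControlMessageRight,
		Level:  SOL_SOCKET,
		Type:   SCM_RIGHTS,
	}
}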
const SizeOfControlMessageTTL = 4 // SizeOfControlMessageTClass is the size of an IPV6_TCLASS control message. const SizeOfControlMessageTClass = 4 // SizeOfControlMessageHopLimit is the size of an IPV6_HOPLIMIT control message. const SizeOfControlMessageHopLimit = 4 // SizeOfControlMessageIPPacketInfo is the size of an IP_PKTINFO control // message. const SizeOfControlMessageIPPacketInfo = 12 // SizeOfControlMessageIPv6PacketInfo is the size of a // ControlMessageIPv6PacketInfo. const SizeOfControlMessageIPv6PacketInfo = 20 // SCM_MAX_FD is the maximum number of FDs accepted in a single sendmsg call. // From net/scm.h. const SCM_MAX_FD = 253 // SO_ACCEPTCON is defined as __SO_ACCEPTCON in // include/uapi/linux/net.h, which represents a listening socket // state. Note that this is distinct from SO_ACCEPTCONN, which is a // socket option for querying whether a socket is in a listening // state. const SO_ACCEPTCON = 1 << 16 // ICMP6Filter represents struct icmp6_filter from linux/icmpv6.h. // // +marshal // +stateify savable type ICMP6Filter struct { Filter [8]uint32 } // SizeOfICMP6Filter is the size of ICMP6Filter struct. var SizeOfICMP6Filter = uint32((*ICMP6Filter)(nil).SizeBytes()) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/splice.go000066400000000000000000000013561465435605700231310ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Constants for splice(2), sendfile(2) and tee(2). const ( SPLICE_F_MOVE = 1 << iota SPLICE_F_NONBLOCK SPLICE_F_MORE SPLICE_F_GIFT ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/tcp.go000066400000000000000000000040011465435605700224260ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Socket options from uapi/linux/tcp.h. 
const ( TCP_NODELAY = 1 TCP_MAXSEG = 2 TCP_CORK = 3 TCP_KEEPIDLE = 4 TCP_KEEPINTVL = 5 TCP_KEEPCNT = 6 TCP_SYNCNT = 7 TCP_LINGER2 = 8 TCP_DEFER_ACCEPT = 9 TCP_WINDOW_CLAMP = 10 TCP_INFO = 11 TCP_QUICKACK = 12 TCP_CONGESTION = 13 TCP_MD5SIG = 14 TCP_THIN_LINEAR_TIMEOUTS = 16 TCP_THIN_DUPACK = 17 TCP_USER_TIMEOUT = 18 TCP_REPAIR = 19 TCP_REPAIR_QUEUE = 20 TCP_QUEUE_SEQ = 21 TCP_REPAIR_OPTIONS = 22 TCP_FASTOPEN = 23 TCP_TIMESTAMP = 24 TCP_NOTSENT_LOWAT = 25 TCP_CC_INFO = 26 TCP_SAVE_SYN = 27 TCP_SAVED_SYN = 28 TCP_REPAIR_WINDOW = 29 TCP_FASTOPEN_CONNECT = 30 TCP_ULP = 31 TCP_MD5SIG_EXT = 32 TCP_FASTOPEN_KEY = 33 TCP_FASTOPEN_NO_COOKIE = 34 TCP_ZEROCOPY_RECEIVE = 35 TCP_INQ = 36 ) // Socket constants from include/net/tcp.h. const ( MAX_TCP_KEEPIDLE = 32767 MAX_TCP_KEEPINTVL = 32767 MAX_TCP_KEEPCNT = 127 ) // Congestion control states from include/uapi/linux/tcp.h. const ( TCP_CA_Open = 0 TCP_CA_Disorder = 1 TCP_CA_CWR = 2 TCP_CA_Recovery = 3 TCP_CA_Loss = 4 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/time.go000066400000000000000000000162211465435605700226050ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "math" "time" ) const ( // ClockTick is the length of time represented by a single clock tick, as // used by times(2) and /proc/[pid]/stat. ClockTick = time.Second / CLOCKS_PER_SEC // CLOCKS_PER_SEC is the number of ClockTicks per second. // // Linux defines this to be 100 on most architectures, irrespective of // CONFIG_HZ. Userspace obtains the value through sysconf(_SC_CLK_TCK), // which uses the AT_CLKTCK entry in the auxiliary vector if one is // provided, and assumes 100 otherwise (glibc: // sysdeps/posix/sysconf.c:__sysconf() => // sysdeps/unix/sysv/linux/getclktck.c, elf/dl-support.c:_dl_aux_init()). // // Not to be confused with POSIX CLOCKS_PER_SEC, as used by clock(3); "XSI // requires that [POSIX] CLOCKS_PER_SEC equals 1000000 independent of the // actual resolution" - clock(3). CLOCKS_PER_SEC = 100 ) // CPU clock types for use with clock_gettime(2) et al. // // The 29 most significant bits of a 32 bit clock ID are either a PID or a FD. // // Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3. // // Bit 2 indicates whether a cpu clock refers to a thread or a process. const ( CPUCLOCK_PROF = 0 CPUCLOCK_VIRT = 1 CPUCLOCK_SCHED = 2 CPUCLOCK_MAX = 3 CLOCKFD = CPUCLOCK_MAX CPUCLOCK_CLOCK_MASK = 3 CPUCLOCK_PERTHREAD_MASK = 4 ) // Clock identifiers for use with clock_gettime(2), clock_getres(2), // clock_nanosleep(2). const ( CLOCK_REALTIME = 0 CLOCK_MONOTONIC = 1 CLOCK_PROCESS_CPUTIME_ID = 2 CLOCK_THREAD_CPUTIME_ID = 3 CLOCK_MONOTONIC_RAW = 4 CLOCK_REALTIME_COARSE = 5 CLOCK_MONOTONIC_COARSE = 6 CLOCK_BOOTTIME = 7 CLOCK_REALTIME_ALARM = 8 CLOCK_BOOTTIME_ALARM = 9 ) // Flags for clock_nanosleep(2). const ( TIMER_ABSTIME = 1 ) // Flags for timerfd syscalls (timerfd_create(2), timerfd_settime(2)). const ( // TFD_CLOEXEC is a timerfd_create flag. 
TFD_CLOEXEC = O_CLOEXEC // TFD_NONBLOCK is a timerfd_create flag. TFD_NONBLOCK = O_NONBLOCK // TFD_TIMER_ABSTIME is a timerfd_settime flag. TFD_TIMER_ABSTIME = 1 ) // The safe number of seconds you can represent by int64. const maxSecInDuration = math.MaxInt64 / int64(time.Second) // TimeT represents time_t in . It represents time in seconds. // // +marshal type TimeT int64 // NsecToTimeT translates nanoseconds to TimeT (seconds). func NsecToTimeT(nsec int64) TimeT { return TimeT(nsec / 1e9) } // Timespec represents struct timespec in . // // +marshal slice:TimespecSlice type Timespec struct { Sec int64 Nsec int64 } // Unix returns the second and nanosecond. func (ts Timespec) Unix() (sec int64, nsec int64) { return int64(ts.Sec), int64(ts.Nsec) } // ToTime returns the Go time.Time representation. func (ts Timespec) ToTime() time.Time { return time.Unix(ts.Sec, ts.Nsec) } // ToNsec returns the nanosecond representation. func (ts Timespec) ToNsec() int64 { return int64(ts.Sec)*1e9 + int64(ts.Nsec) } // ToNsecCapped returns the safe nanosecond representation. func (ts Timespec) ToNsecCapped() int64 { if ts.Sec > maxSecInDuration { return math.MaxInt64 } return ts.ToNsec() } // ToDuration returns the safe nanosecond representation as time.Duration. func (ts Timespec) ToDuration() time.Duration { return time.Duration(ts.ToNsecCapped()) } // Valid returns whether the timespec contains valid values. func (ts Timespec) Valid() bool { return !(ts.Sec < 0 || ts.Nsec < 0 || ts.Nsec >= int64(time.Second)) } // NsecToTimespec translates nanoseconds to Timespec. func NsecToTimespec(nsec int64) (ts Timespec) { ts.Sec = nsec / 1e9 ts.Nsec = nsec % 1e9 return } // DurationToTimespec translates time.Duration to Timespec. func DurationToTimespec(dur time.Duration) Timespec { return NsecToTimespec(dur.Nanoseconds()) } // SizeOfTimeval is the size of a Timeval struct in bytes. const SizeOfTimeval = 16 // Timeval represents struct timeval in . // // +marshal slice:TimevalSlice type Timeval struct { Sec int64 Usec int64 } // ToNsecCapped returns the safe nanosecond representation. func (tv Timeval) ToNsecCapped() int64 { if tv.Sec > maxSecInDuration { return math.MaxInt64 } return int64(tv.Sec)*1e9 + int64(tv.Usec)*1e3 } // ToDuration returns the safe nanosecond representation as a time.Duration. func (tv Timeval) ToDuration() time.Duration { return time.Duration(tv.ToNsecCapped()) } // ToTime returns the Go time.Time representation. func (tv Timeval) ToTime() time.Time { return time.Unix(tv.Sec, tv.Usec*1e3) } // NsecToTimeval translates nanosecond to Timeval. func NsecToTimeval(nsec int64) (tv Timeval) { nsec += 999 // round up to microsecond tv.Sec = nsec / 1e9 tv.Usec = nsec % 1e9 / 1e3 return } // DurationToTimeval translates time.Duration to Timeval. func DurationToTimeval(dur time.Duration) Timeval { return NsecToTimeval(dur.Nanoseconds()) } // Itimerspec represents struct itimerspec in . // // +marshal type Itimerspec struct { Interval Timespec Value Timespec } // ItimerVal mimics the following struct in // // struct itimerval { // struct timeval it_interval; /* next value */ // struct timeval it_value; /* current value */ // }; // // +marshal type ItimerVal struct { Interval Timeval Value Timeval } // ClockT represents type clock_t. // // +marshal type ClockT int64 // ClockTFromDuration converts time.Duration to clock_t. func ClockTFromDuration(d time.Duration) ClockT { return ClockT(d / ClockTick) } // Tms represents struct tms, used by times(2). 
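// A minimal sketch (not from the upstream source) showing the conversion
// helpers above round-tripping a Go duration through the Timespec wire
// format, e.g. when building a nanosleep(2)-style argument.
func exampleTimespecRoundTrip() time.Duration {
	ts := DurationToTimespec(1500 * time.Millisecond) // Sec == 1, Nsec == 500000000.
	return ts.ToDuration()                            // 1.5s again.
}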
// // +marshal type Tms struct { UTime ClockT STime ClockT CUTime ClockT CSTime ClockT } // TimerID represents type timer_t, which identifies a POSIX per-process // interval timer. // // +marshal type TimerID int32 // StatxTimestamp represents struct statx_timestamp. // // +marshal type StatxTimestamp struct { Sec int64 Nsec uint32 _ int32 } // ToNsec returns the nanosecond representation. func (sxts StatxTimestamp) ToNsec() int64 { return int64(sxts.Sec)*1e9 + int64(sxts.Nsec) } // ToNsecCapped returns the safe nanosecond representation. func (sxts StatxTimestamp) ToNsecCapped() int64 { if sxts.Sec > maxSecInDuration { return math.MaxInt64 } return sxts.ToNsec() } // NsecToStatxTimestamp translates nanoseconds to StatxTimestamp. func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) { return StatxTimestamp{ Sec: nsec / 1e9, Nsec: uint32(nsec % 1e9), } } // ToTime returns the Go time.Time representation. func (sxts StatxTimestamp) ToTime() time.Time { return time.Unix(sxts.Sec, int64(sxts.Nsec)) } // Utime represents struct utimbuf used by utimes(2). // // +marshal type Utime struct { Actime int64 Modtime int64 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/timer.go000066400000000000000000000014001465435605700227600ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // itimer types for getitimer(2) and setitimer(2), from // include/uapi/linux/time.h. const ( ITIMER_REAL = 0 ITIMER_VIRTUAL = 1 ITIMER_PROF = 2 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/tty.go000066400000000000000000000203021465435605700224620ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux const ( // NumControlCharacters is the number of control characters in Termios. NumControlCharacters = 19 // disabledChar is used to indicate that a control character is // disabled. disabledChar = 0 ) // Winsize is struct winsize, defined in uapi/asm-generic/termios.h. // // +marshal type Winsize struct { Row uint16 Col uint16 Xpixel uint16 Ypixel uint16 } // Termios is struct termios, defined in uapi/asm-generic/termbits.h. // // +marshal type Termios struct { InputFlags uint32 OutputFlags uint32 ControlFlags uint32 LocalFlags uint32 LineDiscipline uint8 ControlCharacters [NumControlCharacters]uint8 } // KernelTermios is struct ktermios/struct termios2, defined in // uapi/asm-generic/termbits.h. 
// // +stateify savable type KernelTermios struct { InputFlags uint32 OutputFlags uint32 ControlFlags uint32 LocalFlags uint32 LineDiscipline uint8 ControlCharacters [NumControlCharacters]uint8 InputSpeed uint32 OutputSpeed uint32 } // IEnabled returns whether flag is enabled in termios input flags. func (t *KernelTermios) IEnabled(flag uint32) bool { return t.InputFlags&flag == flag } // OEnabled returns whether flag is enabled in termios output flags. func (t *KernelTermios) OEnabled(flag uint32) bool { return t.OutputFlags&flag == flag } // CEnabled returns whether flag is enabled in termios control flags. func (t *KernelTermios) CEnabled(flag uint32) bool { return t.ControlFlags&flag == flag } // LEnabled returns whether flag is enabled in termios local flags. func (t *KernelTermios) LEnabled(flag uint32) bool { return t.LocalFlags&flag == flag } // ToTermios copies fields that are shared with Termios into a new Termios // struct. func (t *KernelTermios) ToTermios() Termios { return Termios{ InputFlags: t.InputFlags, OutputFlags: t.OutputFlags, ControlFlags: t.ControlFlags, LocalFlags: t.LocalFlags, LineDiscipline: t.LineDiscipline, ControlCharacters: t.ControlCharacters, } } // FromTermios copies fields that are shared with Termios into this // KernelTermios struct. func (t *KernelTermios) FromTermios(term Termios) { t.InputFlags = term.InputFlags t.OutputFlags = term.OutputFlags t.ControlFlags = term.ControlFlags t.LocalFlags = term.LocalFlags t.LineDiscipline = term.LineDiscipline t.ControlCharacters = term.ControlCharacters } // IsTerminating returns whether c is a line terminating character. func (t *KernelTermios) IsTerminating(cBytes []byte) bool { // All terminating characters are 1 byte. if len(cBytes) != 1 { return false } c := cBytes[0] // Is this the user-set EOF character? if t.IsEOF(c) { return true } switch c { case disabledChar: return false case '\n', t.ControlCharacters[VEOL]: return true case t.ControlCharacters[VEOL2]: return t.LEnabled(IEXTEN) } return false } // IsEOF returns whether c is the EOF character. func (t *KernelTermios) IsEOF(c byte) bool { return c == t.ControlCharacters[VEOF] && t.ControlCharacters[VEOF] != disabledChar } // Input flags. const ( IGNBRK = 0000001 BRKINT = 0000002 IGNPAR = 0000004 PARMRK = 0000010 INPCK = 0000020 ISTRIP = 0000040 INLCR = 0000100 IGNCR = 0000200 ICRNL = 0000400 IUCLC = 0001000 IXON = 0002000 IXANY = 0004000 IXOFF = 0010000 IMAXBEL = 0020000 IUTF8 = 0040000 ) // Output flags. const ( OPOST = 0000001 OLCUC = 0000002 ONLCR = 0000004 OCRNL = 0000010 ONOCR = 0000020 ONLRET = 0000040 OFILL = 0000100 OFDEL = 0000200 NLDLY = 0000400 NL0 = 0000000 NL1 = 0000400 CRDLY = 0003000 CR0 = 0000000 CR1 = 0001000 CR2 = 0002000 CR3 = 0003000 TABDLY = 0014000 TAB0 = 0000000 TAB1 = 0004000 TAB2 = 0010000 TAB3 = 0014000 XTABS = 0014000 BSDLY = 0020000 BS0 = 0000000 BS1 = 0020000 VTDLY = 0040000 VT0 = 0000000 VT1 = 0040000 FFDLY = 0100000 FF0 = 0000000 FF1 = 0100000 ) // Control flags. 
const ( CBAUD = 0010017 B0 = 0000000 B50 = 0000001 B75 = 0000002 B110 = 0000003 B134 = 0000004 B150 = 0000005 B200 = 0000006 B300 = 0000007 B600 = 0000010 B1200 = 0000011 B1800 = 0000012 B2400 = 0000013 B4800 = 0000014 B9600 = 0000015 B19200 = 0000016 B38400 = 0000017 EXTA = B19200 EXTB = B38400 CSIZE = 0000060 CS5 = 0000000 CS6 = 0000020 CS7 = 0000040 CS8 = 0000060 CSTOPB = 0000100 CREAD = 0000200 PARENB = 0000400 PARODD = 0001000 HUPCL = 0002000 CLOCAL = 0004000 CBAUDEX = 0010000 BOTHER = 0010000 B57600 = 0010001 B115200 = 0010002 B230400 = 0010003 B460800 = 0010004 B500000 = 0010005 B576000 = 0010006 B921600 = 0010007 B1000000 = 0010010 B1152000 = 0010011 B1500000 = 0010012 B2000000 = 0010013 B2500000 = 0010014 B3000000 = 0010015 B3500000 = 0010016 B4000000 = 0010017 CIBAUD = 002003600000 CMSPAR = 010000000000 CRTSCTS = 020000000000 // IBSHIFT is the shift from CBAUD to CIBAUD. IBSHIFT = 16 ) // Local flags. const ( ISIG = 0000001 ICANON = 0000002 XCASE = 0000004 ECHO = 0000010 ECHOE = 0000020 ECHOK = 0000040 ECHONL = 0000100 NOFLSH = 0000200 TOSTOP = 0000400 ECHOCTL = 0001000 ECHOPRT = 0002000 ECHOKE = 0004000 FLUSHO = 0010000 PENDIN = 0040000 IEXTEN = 0100000 EXTPROC = 0200000 ) // Control Character indices. const ( VINTR = 0 VQUIT = 1 VERASE = 2 VKILL = 3 VEOF = 4 VTIME = 5 VMIN = 6 VSWTC = 7 VSTART = 8 VSTOP = 9 VSUSP = 10 VEOL = 11 VREPRINT = 12 VDISCARD = 13 VWERASE = 14 VLNEXT = 15 VEOL2 = 16 ) // ControlCharacter returns the termios-style control character for the passed // character. // // e.g., for Ctrl-C, i.e., ^C, call ControlCharacter('C'). // // Standard control characters are ASCII bytes 0 through 31. func ControlCharacter(c byte) uint8 { // A is 1, B is 2, etc. return uint8(c - 'A' + 1) } // DefaultControlCharacters is the default set of Termios control characters. var DefaultControlCharacters = [NumControlCharacters]uint8{ ControlCharacter('C'), // VINTR = ^C ControlCharacter('\\'), // VQUIT = ^\ '\x7f', // VERASE = DEL ControlCharacter('U'), // VKILL = ^U ControlCharacter('D'), // VEOF = ^D 0, // VTIME 1, // VMIN 0, // VSWTC ControlCharacter('Q'), // VSTART = ^Q ControlCharacter('S'), // VSTOP = ^S ControlCharacter('Z'), // VSUSP = ^Z 0, // VEOL ControlCharacter('R'), // VREPRINT = ^R ControlCharacter('O'), // VDISCARD = ^O ControlCharacter('W'), // VWERASE = ^W ControlCharacter('V'), // VLNEXT = ^V 0, // VEOL2 } // MasterTermios is the terminal configuration of the master end of a Unix98 // pseudoterminal. var MasterTermios = KernelTermios{ ControlFlags: B38400 | CS8 | CREAD, ControlCharacters: DefaultControlCharacters, InputSpeed: 38400, OutputSpeed: 38400, } // DefaultReplicaTermios is the default terminal configuration of the replica // end of a Unix98 pseudoterminal. var DefaultReplicaTermios = KernelTermios{ InputFlags: ICRNL | IXON, OutputFlags: OPOST | ONLCR, ControlFlags: B38400 | CS8 | CREAD, LocalFlags: ISIG | ICANON | ECHO | ECHOE | ECHOK | ECHOCTL | ECHOKE | IEXTEN, ControlCharacters: DefaultControlCharacters, InputSpeed: 38400, OutputSpeed: 38400, } // WindowSize corresponds to struct winsize defined in // include/uapi/asm-generic/termios.h. // // +stateify savable // +marshal type WindowSize struct { Rows uint16 Cols uint16 _ [4]byte // Padding for 2 unused shorts. } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/uio.go000066400000000000000000000013121465435605700224360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
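// A minimal sketch (not from the upstream source) of putting a terminal into
// raw mode with the KernelTermios type and the flag constants defined in
// tty.go above, mirroring what cfmakeraw(3) does to a struct termios.
func exampleMakeRaw(t *KernelTermios) {
	t.InputFlags &^= IGNBRK | BRKINT | PARMRK | ISTRIP | INLCR | IGNCR | ICRNL | IXON
	t.OutputFlags &^= OPOST
	t.LocalFlags &^= ECHO | ECHONL | ICANON | ISIG | IEXTEN
	t.ControlFlags &^= CSIZE | PARENB
	t.ControlFlags |= CS8
	t.ControlCharacters[VMIN] = 1  // Reads return after one byte...
	t.ControlCharacters[VTIME] = 0 // ...with no inter-byte timeout.
}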
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // UIO_MAXIOV is the maximum number of struct iovecs in a struct iovec array. const UIO_MAXIOV = 1024 golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/utsname.go000066400000000000000000000030701465435605700233210ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "bytes" "fmt" ) const ( // UTSLen is the maximum length of strings contained in fields of // UtsName. UTSLen = 64 ) // UtsName represents struct utsname, the struct returned by uname(2). // // +marshal type UtsName struct { Sysname [UTSLen + 1]byte Nodename [UTSLen + 1]byte Release [UTSLen + 1]byte Version [UTSLen + 1]byte Machine [UTSLen + 1]byte Domainname [UTSLen + 1]byte } // utsNameString converts a UtsName entry to a string without NULs. func utsNameString(s [UTSLen + 1]byte) string { // The NUL bytes will remain even in a cast to string. We must // explicitly strip them. return string(bytes.TrimRight(s[:], "\x00")) } func (u UtsName) String() string { return fmt.Sprintf("{Sysname: %s, Nodename: %s, Release: %s, Version: %s, Machine: %s, Domainname: %s}", utsNameString(u.Sysname), utsNameString(u.Nodename), utsNameString(u.Release), utsNameString(u.Version), utsNameString(u.Machine), utsNameString(u.Domainname)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/vfio.go000066400000000000000000000122511465435605700226110ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // The package implements VFIOuserspace driver interface. package linux // For IOCTLs requests from include/uapi/linux/vfio.h. const ( VFIO_TYPE = ';' VFIO_BASE = 100 // VFIO extensions. VFIO_TYPE1_IOMMU = 1 VFIO_SPAPR_TCE_IOMMU = 2 VFIO_TYPE1v2_IOMMU = 3 ) // VFIO device info flags. const ( // Device supports reset. VFIO_DEVICE_FLAGS_RESET = 1 << iota // VFIO-pci device. VFIO_DEVICE_FLAGS_PCI // VFIO-platform device. 
VFIO_DEVICE_FLAGS_PLATFORM // VFIO-amba device. VFIO_DEVICE_FLAGS_AMBA // VFIO-ccw device. VFIO_DEVICE_FLAGS_CCW // VFIO-ap device. VFIO_DEVICE_FLAGS_AP // VFIO-fsl-mc device. VFIO_DEVICE_FLAGS_FSL_MC // Info supports caps. VFIO_DEVICE_FLAGS_CAPS // VFIO-cdx device. VFIO_DEVICE_FLAGS_CDX ) // VFIO region info flags. const ( // Region supports read. VFIO_REGION_INFO_FLAG_READ = 1 << iota // Region supports write. VFIO_REGION_INFO_FLAG_WRITE // Region supports mmap. VFIO_REGION_INFO_FLAG_MMAP // Info supports caps. VFIO_REGION_INFO_FLAG_CAPS ) // VFIOIrqInfo flags. const ( VFIO_IRQ_INFO_EVENTFD = 1 << iota VFIO_IRQ_INFO_MASKABLE VFIO_IRQ_INFO_AUTOMASKED VFIO_IRQ_INFO_NORESIZE ) // VFIOIrqSet flags. const ( VFIO_IRQ_SET_DATA_NONE = 1 << iota VFIO_IRQ_SET_DATA_BOOL VFIO_IRQ_SET_DATA_EVENTFD VFIO_IRQ_SET_ACTION_MASK VFIO_IRQ_SET_ACTION_UNMASK VFIO_IRQ_SET_ACTION_TRIGGER VFIO_IRQ_SET_DATA_TYPE_MASK = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_DATA_BOOL | VFIO_IRQ_SET_DATA_EVENTFD VFIO_IRQ_SET_ACTION_TYPE_MASK = VFIO_IRQ_SET_ACTION_MASK | VFIO_IRQ_SET_ACTION_UNMASK | VFIO_IRQ_SET_ACTION_TRIGGER ) // VFIOIrqSet index. const ( VFIO_PCI_INTX_IRQ_INDEX = iota VFIO_PCI_MSI_IRQ_INDEX VFIO_PCI_MSIX_IRQ_INDEX VFIO_PCI_ERR_IRQ_INDEX VFIO_PCI_REQ_IRQ_INDEX VFIO_PCI_NUM_IRQS ) // VFIOIommuType1DmaMap flags. const ( // Readable from device. VFIO_DMA_MAP_FLAG_READ = 1 << iota // Writable from device. VFIO_DMA_MAP_FLAG_WRITE // Update the device's virtual address. VFIO_DMA_MAP_FLAG_VADDR ) const ( VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP = 1 ) // IOCTLs for VFIO file descriptor from include/uapi/linux/vfio.h. var ( VFIO_CHECK_EXTENSION = IO(VFIO_TYPE, VFIO_BASE+1) VFIO_SET_IOMMU = IO(VFIO_TYPE, VFIO_BASE+2) VFIO_GROUP_SET_CONTAINER = IO(VFIO_TYPE, VFIO_BASE+4) VFIO_GROUP_GET_DEVICE_FD = IO(VFIO_TYPE, VFIO_BASE+6) VFIO_DEVICE_GET_INFO = IO(VFIO_TYPE, VFIO_BASE+7) VFIO_DEVICE_GET_REGION_INFO = IO(VFIO_TYPE, VFIO_BASE+8) VFIO_DEVICE_GET_IRQ_INFO = IO(VFIO_TYPE, VFIO_BASE+9) VFIO_DEVICE_SET_IRQS = IO(VFIO_TYPE, VFIO_BASE+10) VFIO_DEVICE_RESET = IO(VFIO_TYPE, VFIO_BASE+11) VFIO_IOMMU_MAP_DMA = IO(VFIO_TYPE, VFIO_BASE+13) VFIO_IOMMU_UNMAP_DMA = IO(VFIO_TYPE, VFIO_BASE+14) ) // VFIODeviceInfo is analogous to vfio_device_info // from include/uapi/linux/vfio.h. // // +marshal type VFIODeviceInfo struct { Argsz uint32 Flags uint32 // The total amount of regions. NumRegions uint32 // The maximum number of IRQ. NumIrqs uint32 // Offset within info struct of first cap. CapOffset uint32 pad uint32 } // VFIORegionInfo is analogous to vfio_region_info // from include/uapi/linux/vfio.h. // // +marshal type VFIORegionInfo struct { Argsz uint32 Flags uint32 Index uint32 // Offset within info struct of first cap. capOffset uint32 // Region size in bytes. Size uint64 // Region offset from start of device fd. Offset uint64 } // VFIOIrqInfo is analogous to vfio_irq_info // from include/uapi/linux/vfio.h. // // +marshal type VFIOIrqInfo struct { Argsz uint32 Flags uint32 Index uint32 Count uint32 } // VFIOIrqSet is analogous to vfio_irq_set // from include/uapi/linux/vfio.h. // The last field `data` from vfio_irq_set is omitted which is an // flexible array member. It will be handled separately. // // +marshal type VFIOIrqSet struct { Argsz uint32 Flags uint32 Index uint32 Start uint32 Count uint32 } // VFIOIommuType1DmaMap is analogous to vfio_iommu_type1_dma_map // from include/uapi/linux/vfio.h. // // +marshal type VFIOIommuType1DmaMap struct { Argsz uint32 Flags uint32 // Process virtual address. Vaddr uint64 // IO virtual address. 
IOVa uint64 // Size of mapping in bytes. Size uint64 } // VFIOIommuType1DmaUnmap is analogous to vfio_iommu_type1_dma_unmap // from include/uapi/linux/vfio.h. // // +marshal type VFIOIommuType1DmaUnmap struct { Argsz uint32 Flags uint32 // IO virtual address. IOVa uint64 // Size of mapping in bytes. Size uint64 // The `data` field from vfio_iommu_type1_dma_unmap is omitted. The // field is a flexible array member, and is needed only if the flag // VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is enabled. } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/vfio_unsafe.go000066400000000000000000000014131465435605700241500ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import "unsafe" // Size returns the number of bytes for a VFIOIrqSet object. func (vfioIrqSet VFIOIrqSet) Size() uint64 { return uint64(unsafe.Sizeof(vfioIrqSet)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/wait.go000066400000000000000000000106031465435605700226110ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" ) // Options for waitpid(2), wait4(2), and/or waitid(2), from // include/uapi/linux/wait.h. const ( WNOHANG = 0x00000001 WUNTRACED = 0x00000002 WSTOPPED = WUNTRACED WEXITED = 0x00000004 WCONTINUED = 0x00000008 WNOWAIT = 0x01000000 WNOTHREAD = 0x20000000 WALL = 0x40000000 WCLONE = 0x80000000 ) // ID types for waitid(2), from include/uapi/linux/wait.h. const ( P_ALL = 0x0 P_PID = 0x1 P_PGID = 0x2 ) // WaitStatus represents a thread status, as returned by the wait* family of // syscalls. type WaitStatus uint32 // WaitStatusExit returns a WaitStatus representing the given exit status. func WaitStatusExit(status int32) WaitStatus { return WaitStatus(uint32(status) << 8) } // WaitStatusTerminationSignal returns a WaitStatus representing termination by // the given signal. func WaitStatusTerminationSignal(sig Signal) WaitStatus { return WaitStatus(uint32(sig)) } // WaitStatusStopped returns a WaitStatus representing stoppage by the given // signal or ptrace trap code. func WaitStatusStopped(code uint32) WaitStatus { return WaitStatus(code<<8 | 0x7f) } // WaitStatusContinued returns a WaitStatus representing continuation by // SIGCONT. func WaitStatusContinued() WaitStatus { return WaitStatus(0xffff) } // WithCoreDump returns a copy of ws that indicates that a core dump was // generated. // // Preconditions: ws.Signaled(). 
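// A minimal sketch (not from the upstream source) of how the constructors
// above and the predicates below round-trip a wait status, mirroring the
// WIFEXITED/WEXITSTATUS and WIFSIGNALED/WTERMSIG macros. SIGKILL is assumed
// to be the Signal constant defined elsewhere in this package.
func exampleWaitStatus() (string, string) {
	exited := WaitStatusExit(1)                    // 0x0100
	killed := WaitStatusTerminationSignal(SIGKILL) // 0x0009
	return exited.String(), killed.String()        // "exit status 1", "killed by signal 9"
}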
func (ws WaitStatus) WithCoreDump() WaitStatus { return ws | 0x80 } // Exited returns true if ws represents an exit status, consistent with // WIFEXITED. func (ws WaitStatus) Exited() bool { return ws&0x7f == 0 } // Signaled returns true if ws represents a termination by signal, consistent // with WIFSIGNALED. func (ws WaitStatus) Signaled() bool { // ws&0x7f != 0 (exited) and ws&0x7f != 0x7f (stopped or continued) return ((ws&0x7f)+1)>>1 != 0 } // CoreDumped returns true if ws indicates that a core dump was produced, // consistent with WCOREDUMP. // // Preconditions: ws.Signaled(). func (ws WaitStatus) CoreDumped() bool { return ws&0x80 != 0 } // Stopped returns true if ws represents a stoppage, consistent with // WIFSTOPPED. func (ws WaitStatus) Stopped() bool { return ws&0xff == 0x7f } // Continued returns true if ws represents a continuation by SIGCONT, // consistent with WIFCONTINUED. func (ws WaitStatus) Continued() bool { return ws == 0xffff } // ExitStatus returns the lower 8 bits of the exit status represented by ws, // consistent with WEXITSTATUS. // // Preconditions: ws.Exited(). func (ws WaitStatus) ExitStatus() uint32 { return uint32((ws & 0xff00) >> 8) } // TerminationSignal returns the termination signal represented by ws, // consistent with WTERMSIG. // // Preconditions: ws.Signaled(). func (ws WaitStatus) TerminationSignal() Signal { return Signal(ws & 0x7f) } // StopSignal returns the stop signal represented by ws, consistent with // WSTOPSIG. // // Preconditions: ws.Stopped(). func (ws WaitStatus) StopSignal() Signal { return Signal((ws & 0xff00) >> 8) } // PtraceEvent returns the PTRACE_EVENT_* field in ws. // // Preconditions: ws.Stopped(). func (ws WaitStatus) PtraceEvent() uint32 { return uint32(ws >> 16) } // String implements fmt.Stringer.String. func (ws WaitStatus) String() string { switch { case ws.Exited(): return fmt.Sprintf("exit status %d", ws.ExitStatus()) case ws.Signaled(): if ws.CoreDumped() { return fmt.Sprintf("killed by signal %d (core dumped)", ws.TerminationSignal()) } return fmt.Sprintf("killed by signal %d", ws.TerminationSignal()) case ws.Stopped(): if ev := ws.PtraceEvent(); ev != 0 { return fmt.Sprintf("stopped by signal %d (PTRACE_EVENT %d)", ws.StopSignal(), ev) } return fmt.Sprintf("stopped by signal %d", ws.StopSignal()) case ws.Continued(): return "continued" default: return fmt.Sprintf("unknown status %#x", uint32(ws)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/linux/xattr.go000066400000000000000000000021551465435605700230120ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // Constants for extended attributes. const ( XATTR_NAME_MAX = 255 XATTR_SIZE_MAX = 65536 XATTR_LIST_MAX = 65536 XATTR_CREATE = 1 XATTR_REPLACE = 2 XATTR_SECURITY_PREFIX = "security." XATTR_SECURITY_PREFIX_LEN = len(XATTR_SECURITY_PREFIX) XATTR_SYSTEM_PREFIX = "system." XATTR_SYSTEM_PREFIX_LEN = len(XATTR_SYSTEM_PREFIX) XATTR_TRUSTED_PREFIX = "trusted." 
XATTR_TRUSTED_PREFIX_LEN = len(XATTR_TRUSTED_PREFIX) XATTR_USER_PREFIX = "user." XATTR_USER_PREFIX_LEN = len(XATTR_USER_PREFIX) ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/nvgpu/000077500000000000000000000000001465435605700213165ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/nvgpu/classes.go000066400000000000000000000262211465435605700233050ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvgpu import ( "fmt" ) // ClassID is a client class ID, in the sense of // src/nvidia/src/kernel/rmapi/resource_desc.h:RS_RESOURCE_DESC::externalClassID. // // +marshal type ClassID uint32 // String implements fmt.Stringer.String. func (id ClassID) String() string { // Include leading zeroes for easier searchability, both here and in // g_allclasses.h. return fmt.Sprintf("0x%08x", uint32(id)) } // Class IDs, from src/nvidia/generated/g_allclasses.h. const ( NV01_ROOT = 0x00000000 NV01_ROOT_NON_PRIV = 0x00000001 NV01_MEMORY_SYSTEM = 0x0000003e NV01_MEMORY_LOCAL_USER = 0x00000040 NV01_ROOT_CLIENT = 0x00000041 NV01_MEMORY_SYSTEM_OS_DESCRIPTOR = 0x00000071 NV01_EVENT_OS_EVENT = 0x00000079 NV01_DEVICE_0 = 0x00000080 RM_USER_SHARED_DATA = 0x000000de NV_MEMORY_FABRIC = 0x000000f8 NV_MEMORY_MULTICAST_FABRIC = 0x000000fd NV20_SUBDEVICE_0 = 0x00002080 NV2081_BINAPI = 0x00002081 NV50_P2P = 0x0000503b NV50_THIRD_PARTY_P2P = 0x0000503c NV50_MEMORY_VIRTUAL = 0x000050a0 GT200_DEBUGGER = 0x000083de GF100_SUBDEVICE_MASTER = 0x000090e6 FERMI_CONTEXT_SHARE_A = 0x00009067 FERMI_VASPACE_A = 0x000090f1 KEPLER_CHANNEL_GROUP_A = 0x0000a06c TURING_USERMODE_A = 0x0000c461 TURING_CHANNEL_GPFIFO_A = 0x0000c46f AMPERE_CHANNEL_GPFIFO_A = 0x0000c56f TURING_DMA_COPY_A = 0x0000c5b5 TURING_COMPUTE_A = 0x0000c5c0 HOPPER_USERMODE_A = 0x0000c661 AMPERE_DMA_COPY_A = 0x0000c6b5 AMPERE_COMPUTE_A = 0x0000c6c0 AMPERE_DMA_COPY_B = 0x0000c7b5 AMPERE_COMPUTE_B = 0x0000c7c0 HOPPER_CHANNEL_GPFIFO_A = 0x0000c86f HOPPER_DMA_COPY_A = 0x0000c8b5 ADA_COMPUTE_A = 0x0000c9c0 NV_CONFIDENTIAL_COMPUTE = 0x0000cb33 HOPPER_SEC2_WORK_LAUNCH_A = 0x0000cba2 HOPPER_COMPUTE_A = 0x0000cbc0 ) // NV2081_ALLOC_PARAMETERS is the alloc params type for NV2081_BINAPI, from // src/common/sdk/nvidia/inc/class/cl2081.h. // // +marshal type NV2081_ALLOC_PARAMETERS struct { Reserved uint32 } // NV0005_ALLOC_PARAMETERS is the alloc params type for NV01_EVENT_OS_EVENT, // from src/common/sdk/nvidia/inc/class/cl0005.h. // // +marshal type NV0005_ALLOC_PARAMETERS struct { HParentClient Handle HSrcResource Handle HClass uint32 NotifyIndex uint32 Data P64 // actually FD for NV01_EVENT_OS_EVENT, see src/nvidia/src/kernel/rmapi/event.c:eventConstruct_IMPL() => src/nvidia/arch/nvalloc/unix/src/os.c:osUserHandleToKernelPtr() } // NV0080_ALLOC_PARAMETERS is the alloc params type for NV01_DEVICE_0, from // src/common/sdk/nvidia/inc/class/cl0080.h. 
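// A minimal sketch (not from the upstream source): ClassID.String above
// zero-pads to eight hex digits so that the rendered value can be searched
// for verbatim in the driver's g_allclasses.h.
func exampleClassIDString() string {
	return ClassID(TURING_USERMODE_A).String() // "0x0000c461"
}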
// // +marshal type NV0080_ALLOC_PARAMETERS struct { DeviceID uint32 HClientShare Handle HTargetClient Handle HTargetDevice Handle Flags uint32 Pad0 [4]byte VASpaceSize uint64 VAStartInternal uint64 VALimitInternal uint64 VAMode uint32 Pad1 [4]byte } // NV2080_ALLOC_PARAMETERS is the alloc params type for NV20_SUBDEVICE_0, from // src/common/sdk/nvidia/inc/class/cl2080.h. // // +marshal type NV2080_ALLOC_PARAMETERS struct { SubDeviceID uint32 } // NV_MEMORY_ALLOCATION_PARAMS is the alloc params type for various NV*_MEMORY* // allocation classes, from src/common/sdk/nvidia/inc/nvos.h. // // +marshal type NV_MEMORY_ALLOCATION_PARAMS struct { Owner uint32 Type uint32 Flags uint32 Width uint32 Height uint32 Pitch int32 Attr uint32 Attr2 uint32 Format uint32 ComprCovg uint32 ZcullCovg uint32 _ uint32 RangeLo uint64 RangeHi uint64 Size uint64 Alignment uint64 Offset uint64 Limit uint64 Address P64 CtagOffset uint32 HVASpace Handle InternalFlags uint32 Tag uint32 } // NV_MEMORY_ALLOCATION_PARAMS_V545 is the updated version of // NV_MEMORY_ALLOCATION_PARAMS since 545.23.06. // // +marshal type NV_MEMORY_ALLOCATION_PARAMS_V545 struct { NV_MEMORY_ALLOCATION_PARAMS NumaNode int32 _ uint32 } // NV503B_BAR1_P2P_DMA_INFO from src/common/sdk/nvidia/inc/class/cl503b.h. // // +marshal type NV503B_BAR1_P2P_DMA_INFO struct { DmaAddress uint64 DmaSize uint64 } // NV503B_ALLOC_PARAMETERS is the alloc params type for NV50_P2P, from // src/common/sdk/nvidia/inc/class/cl503b.h. // // +marshal type NV503B_ALLOC_PARAMETERS struct { HSubDevice Handle HPeerSubDevice Handle SubDevicePeerIDMask uint32 PeerSubDevicePeerIDMask uint32 MailboxBar1Addr uint64 MailboxTotalSize uint32 Flags uint32 SubDeviceEgmPeerIDMask uint32 PeerSubDeviceEgmPeerIDMask uint32 L2pBar1P2PDmaInfo NV503B_BAR1_P2P_DMA_INFO P2lBar1P2PDmaInfo NV503B_BAR1_P2P_DMA_INFO } // NV503C_ALLOC_PARAMETERS is the alloc params type for NV50_THIRD_PARTY_P2P, // from src/common/sdk/nvidia/inc/class/cl503c.h. // // +marshal type NV503C_ALLOC_PARAMETERS struct { Flags uint32 } // NV83DE_ALLOC_PARAMETERS is the alloc params type for GT200_DEBUGGER, // from src/common/sdk/nvidia/inc/class/cl83de.h. // // +marshal type NV83DE_ALLOC_PARAMETERS struct { HDebuggerClient_Obsolete Handle HAppClient Handle HClass3DObject Handle } // NV_CTXSHARE_ALLOCATION_PARAMETERS is the alloc params type for // FERMI_CONTEXT_SHARE_A, from src/common/sdk/nvidia/inc/nvos.h. // // +marshal type NV_CTXSHARE_ALLOCATION_PARAMETERS struct { HVASpace Handle Flags uint32 SubctxID uint32 } // NV_VASPACE_ALLOCATION_PARAMETERS is the alloc params type for // FERMI_VASPACE_A, from src/common/sdk/nvidia/inc/nvos.h. // // +marshal type NV_VASPACE_ALLOCATION_PARAMETERS struct { Index uint32 Flags uint32 VASize uint64 VAStartInternal uint64 VALimitInternal uint64 BigPageSize uint32 Pad0 [4]byte VABase uint64 } // NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS is the alloc params type for // KEPLER_CHANNEL_GROUP_A, from src/common/sdk/nvidia/inc/nvos.h. // // +marshal type NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS struct { HObjectError Handle HObjectECCError Handle HVASpace Handle EngineType uint32 BIsCallingContextVgpuPlugin uint8 Pad0 [3]byte } // NV_MEMORY_DESC_PARAMS is from // src/common/sdk/nvidia/inc/alloc/alloc_channel.h. // // +marshal type NV_MEMORY_DESC_PARAMS struct { Base uint64 Size uint64 AddressSpace uint32 CacheAttrib uint32 } // NV_CHANNEL_ALLOC_PARAMS is the alloc params type for TURING_CHANNEL_GPFIFO_A // and AMPERE_CHANNEL_GPFIFO_A, from // src/common/sdk/nvidia/inc/alloc/alloc_channel.h. 
// // +marshal type NV_CHANNEL_ALLOC_PARAMS struct { HObjectError Handle HObjectBuffer Handle GPFIFOOffset uint64 GPFIFOEntries uint32 Flags uint32 HContextShare Handle HVASpace Handle HUserdMemory [NV_MAX_SUBDEVICES]Handle UserdOffset [NV_MAX_SUBDEVICES]uint64 EngineType uint32 CID uint32 SubDeviceID uint32 HObjectECCError Handle InstanceMem NV_MEMORY_DESC_PARAMS UserdMem NV_MEMORY_DESC_PARAMS RamfcMem NV_MEMORY_DESC_PARAMS MthdbufMem NV_MEMORY_DESC_PARAMS HPhysChannelGroup Handle InternalFlags uint32 ErrorNotifierMem NV_MEMORY_DESC_PARAMS ECCErrorNotifierMem NV_MEMORY_DESC_PARAMS ProcessID uint32 SubProcessID uint32 EncryptIv [CC_CHAN_ALLOC_IV_SIZE_DWORD]uint32 DecryptIv [CC_CHAN_ALLOC_IV_SIZE_DWORD]uint32 HmacNonce [CC_CHAN_ALLOC_NONCE_SIZE_DWORD]uint32 } // NVB0B5_ALLOCATION_PARAMETERS is the alloc param type for TURING_DMA_COPY_A, // AMPERE_DMA_COPY_A, and AMPERE_DMA_COPY_B from // src/common/sdk/nvidia/inc/class/clb0b5sw.h. // // +marshal type NVB0B5_ALLOCATION_PARAMETERS struct { Version uint32 EngineType uint32 } // NV_GR_ALLOCATION_PARAMETERS is the alloc param type for TURING_COMPUTE_A, // AMPERE_COMPUTE_A, and ADA_COMPUTE_A, from src/common/sdk/nvidia/inc/nvos.h. // // +marshal type NV_GR_ALLOCATION_PARAMETERS struct { Version uint32 Flags uint32 Size uint32 Caps uint32 } // NV_HOPPER_USERMODE_A_PARAMS is the alloc param type for HOPPER_USERMODE_A, // from src/common/sdk/nvidia/inc/nvos.h. // // +marshal type NV_HOPPER_USERMODE_A_PARAMS struct { Bar1Mapping uint8 Priv uint8 } // NV00DE_ALLOC_PARAMETERS is the alloc param type for RM_USER_SHARED_DATA, // from src/common/sdk/nvidia/inc/class/cl00de.h. // // +marshal type NV00DE_ALLOC_PARAMETERS struct { Reserved uint32 } // NV00DE_ALLOC_PARAMETERS_V545 is the updated version of // NV00DE_ALLOC_PARAMETERS since 545.23.06. // // +marshal type NV00DE_ALLOC_PARAMETERS_V545 struct { PolledDataMask uint64 } // +marshal type nv00f8Map struct { offset uint64 hVidMem Handle flags uint32 } // NV00F8_ALLOCATION_PARAMETERS is the alloc param type for NV_MEMORY_FABRIC, // from src/common/sdk/nvidia/inc/class/cl00f8.h. // // +marshal type NV00F8_ALLOCATION_PARAMETERS struct { Alignment uint64 AllocSize uint64 PageSize uint64 AllocFlags uint32 _ uint32 Map nv00f8Map } // From src/common/sdk/nvidia/inc/class/cl00e0.h const ( NV_MEM_EXPORT_UUID_LEN = 16 ) // NV_EXPORT_MEM_PACKET is from // src/common/sdk/nvidia/inc/class/cl00e0.h // // +marshal type NV_EXPORT_MEM_PACKET struct { UUID [NV_MEM_EXPORT_UUID_LEN]uint8 Opaque [16]uint8 } // NV00FD_ALLOCATION_PARAMETERS is the alloc param type for NV_MEMORY_MULTICAST_FABRIC // from src/common/sdk/nvidia/inc/class/cl00fd.h // // +marshal type NV00FD_ALLOCATION_PARAMETERS struct { Alignment uint64 AllocSize uint64 PageSize uint32 AllocFlags uint32 NumGPUs uint32 _ uint32 POsEvent P64 } // NV00FD_ALLOCATION_PARAMETERS_V545 is the updated version of // NV00FD_ALLOCATION_PARAMETERS since 545.23.06. // // +marshal type NV00FD_ALLOCATION_PARAMETERS_V545 struct { ExpPacket NV_EXPORT_MEM_PACKET Index uint16 _ [6]byte NV00FD_ALLOCATION_PARAMETERS } // NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS is the alloc param type for // NV_CONFIDENTIAL_COMPUTE, from src/common/sdk/nvidia/inc/class/clcb33.h. // // +marshal type NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS struct { Handle Handle _ uint32 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/nvgpu/ctrl.go000066400000000000000000000373011465435605700226150ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvgpu // From src/nvidia/interface/deprecated/rmapi_deprecated.h: const ( RM_GSS_LEGACY_MASK = 0x00008000 ) // From src/nvidia/inc/kernel/rmapi/param_copy.h: const ( // RMAPI_PARAM_COPY_MAX_PARAMS_SIZE is the size limit imposed while copying // "embedded pointers" in rmapi parameter structs. // See src/nvidia/src/kernel/rmapi/param_copy.c:rmapiParamsAcquire(). RMAPI_PARAM_COPY_MAX_PARAMS_SIZE = 1 * 1024 * 1024 ) // From src/common/sdk/nvidia/inc/ctrl/ctrlxxxx.h: // NVXXXX_CTRL_XXX_INFO is typedef-ed as the following in the driver: // - NV2080_CTRL_GR_INFO // - NV2080_CTRL_BIOS_INFO // - NV0041_CTRL_SURFACE_INFO // // +marshal slice:CtrlXxxInfoSlice type NVXXXX_CTRL_XXX_INFO struct { Index uint32 Data uint32 } // CtrlXxxInfoSize is sizeof(NVXXXX_CTRL_XXX_INFO). var CtrlXxxInfoSize = uint32((*NVXXXX_CTRL_XXX_INFO)(nil).SizeBytes()) // HasCtrlInfoList is a type constraint for parameter structs containing a list // of NVXXXX_CTRL_XXX_INFO and are simple otherwise. type HasCtrlInfoList interface { ListSize() uint32 SetCtrlInfoList(ptr P64) CtrlInfoList() P64 } // From src/common/sdk/nvidia/inc/ctrl/ctrl0000/ctrl0000client.h: const ( NV0000_CTRL_CMD_CLIENT_GET_ADDR_SPACE_TYPE = 0xd01 NV0000_CTRL_CMD_CLIENT_SET_INHERITED_SHARE_POLICY = 0xd04 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl0000/ctrl0000gpu.h: const ( NV0000_CTRL_CMD_GPU_GET_ATTACHED_IDS = 0x201 NV0000_CTRL_CMD_GPU_GET_ID_INFO = 0x202 NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2 = 0x205 NV0000_CTRL_CMD_GPU_GET_PROBED_IDS = 0x214 NV0000_CTRL_CMD_GPU_ATTACH_IDS = 0x215 NV0000_CTRL_CMD_GPU_DETACH_IDS = 0x216 NV0000_CTRL_CMD_GPU_GET_PCI_INFO = 0x21b NV0000_CTRL_CMD_GPU_QUERY_DRAIN_STATE = 0x279 NV0000_CTRL_CMD_GPU_GET_MEMOP_ENABLE = 0x27b NV0000_CTRL_CMD_GPU_ASYNC_ATTACH_ID = 0x289 NV0000_CTRL_CMD_GPU_WAIT_ATTACH_ID = 0x290 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl0000/ctrl0000syncgpuboost.h: const ( NV0000_CTRL_CMD_SYNC_GPU_BOOST_GROUP_INFO = 0xa04 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl0000/ctrl0000system.h: const ( NV0000_CTRL_CMD_SYSTEM_GET_BUILD_VERSION = 0x101 NV0000_CTRL_CMD_SYSTEM_GET_P2P_CAPS = 0x127 NV0000_CTRL_CMD_SYSTEM_GET_P2P_CAPS_V2 = 0x12b NV0000_CTRL_CMD_SYSTEM_GET_FABRIC_STATUS = 0x136 NV0000_CTRL_CMD_SYSTEM_GET_P2P_CAPS_MATRIX = 0x13a NV0000_CTRL_CMD_SYSTEM_GET_FEATURES = 0x1f0 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl0000/ctrl0000unix.h: const ( NV0000_CTRL_CMD_OS_UNIX_EXPORT_OBJECT_TO_FD = 0x3d05 NV0000_CTRL_CMD_OS_UNIX_IMPORT_OBJECT_FROM_FD = 0x3d06 NV0000_CTRL_CMD_OS_UNIX_GET_EXPORT_OBJECT_INFO = 0x3d08 NV0000_OS_UNIX_EXPORT_OBJECT_FD_BUFFER_SIZE = 64 ) // +marshal type NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS struct { FD int32 DeviceInstance uint32 MaxObjects uint16 Pad [2]byte Metadata [NV0000_OS_UNIX_EXPORT_OBJECT_FD_BUFFER_SIZE]uint8 } // GetFrontendFD implements HasFrontendFD.GetFrontendFD. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) GetFrontendFD() int32 { return p.FD } // SetFrontendFD implements HasFrontendFD.SetFrontendFD. 
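// A minimal sketch (not from the upstream source) of why these accessors are
// useful: any parameter struct carrying a frontend FD can have that FD
// rewritten generically, e.g. when translating between an application-side FD
// number and a host-side one. HasFrontendFD is assumed to be the interface
// declared elsewhere in this package.
func exampleSwapFrontendFD(params HasFrontendFD, hostFD int32) int32 {
	origFD := params.GetFrontendFD()
	params.SetFrontendFD(hostFD)
	return origFD
}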
func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) SetFrontendFD(fd int32) { p.FD = fd } // +marshal type NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545 struct { FD int32 DeviceInstance uint32 GpuInstanceID uint32 MaxObjects uint16 Pad [2]byte Metadata [NV0000_OS_UNIX_EXPORT_OBJECT_FD_BUFFER_SIZE]uint8 } // GetFrontendFD implements HasFrontendFD.GetFrontendFD. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) GetFrontendFD() int32 { return p.FD } // SetFrontendFD implements HasFrontendFD.SetFrontendFD. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) SetFrontendFD(fd int32) { p.FD = fd } // +marshal type NV0000_CTRL_OS_UNIX_EXPORT_OBJECT struct { Type uint32 // enum NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TYPE // These fields are inside union `data`, in struct `rmObject`. HDevice Handle HParent Handle HObject Handle } // +marshal type NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS struct { Object NV0000_CTRL_OS_UNIX_EXPORT_OBJECT FD int32 Flags uint32 } // GetFrontendFD implements HasFrontendFD.GetFrontendFD. func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) GetFrontendFD() int32 { return p.FD } // SetFrontendFD implements HasFrontendFD.SetFrontendFD. func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) SetFrontendFD(fd int32) { p.FD = fd } // +marshal type NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS struct { FD int32 Object NV0000_CTRL_OS_UNIX_EXPORT_OBJECT } // GetFrontendFD implements HasFrontendFD.GetFrontendFD. func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) GetFrontendFD() int32 { return p.FD } // SetFrontendFD implements HasFrontendFD.SetFrontendFD. func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) SetFrontendFD(fd int32) { p.FD = fd } // +marshal type NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS struct { SizeOfStrings uint32 Pad [4]byte PDriverVersionBuffer P64 PVersionBuffer P64 PTitleBuffer P64 ChangelistNumber uint32 OfficialChangelistNumber uint32 } // From src/common/sdk/nvidia/inc/ctrl/ctrl0041.h const ( NV0041_CTRL_CMD_GET_SURFACE_INFO = 0x410110 ) // +marshal type NV0041_CTRL_GET_SURFACE_INFO_PARAMS struct { SurfaceInfoListSize uint32 Pad [4]byte SurfaceInfoList P64 } // ListSize implements HasCtrlInfoList.ListSize. func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) ListSize() uint32 { return p.SurfaceInfoListSize } // SetCtrlInfoList implements HasCtrlInfoList.SetCtrlInfoList. func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) SetCtrlInfoList(ptr P64) { p.SurfaceInfoList = ptr } // CtrlInfoList implements HasCtrlInfoList.CtrlInfoList. 
func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) CtrlInfoList() P64 { return p.SurfaceInfoList } // From src/common/sdk/nvidia/inc/ctrl/ctrl0080/ctrl0080fb.h: const ( NV0080_CTRL_CMD_FB_GET_CAPS_V2 = 0x801307 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl0080/ctrl0080fifo.h: const ( NV0080_CTRL_CMD_FIFO_GET_CHANNELLIST = 0x80170d ) // +marshal type NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS struct { NumChannels uint32 Pad [4]byte PChannelHandleList P64 PChannelList P64 } // From src/common/sdk/nvidia/inc/ctrl/ctrl0080/ctrl0080gpu.h: const ( NV0080_CTRL_CMD_GPU_GET_CLASSLIST = 0x800201 NV0080_CTRL_CMD_GPU_GET_NUM_SUBDEVICES = 0x800280 NV0080_CTRL_CMD_GPU_QUERY_SW_STATE_PERSISTENCE = 0x800288 NV0080_CTRL_CMD_GPU_GET_VIRTUALIZATION_MODE = 0x800289 NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2 = 0x800292 ) // +marshal type NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS struct { NumClasses uint32 Pad [4]byte ClassList P64 } // From src/common/sdk/nvidia/inc/ctrl/ctrl0080/ctrl0080gr.h: // +marshal type NV0080_CTRL_GR_ROUTE_INFO struct { Flags uint32 Pad [4]byte Route uint64 } // From src/common/sdk/nvidia/inc/ctrl/ctrl0080/ctrl0080host.h: const ( NV0080_CTRL_CMD_HOST_GET_CAPS_V2 = 0x801402 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl0080/ctrl0080perf.h: const ( NV0080_CTRL_CMD_PERF_CUDA_LIMIT_SET_CONTROL = 0x801909 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl00fd.h: const ( NV00FD_CTRL_CMD_GET_INFO = 0xfd0101 NV00FD_CTRL_CMD_ATTACH_MEM = 0xfd0102 NV00FD_CTRL_CMD_ATTACH_GPU = 0xfd0104 NV00FD_CTRL_CMD_DETACH_MEM = 0xfd0105 ) // +marshal type NV00FD_CTRL_ATTACH_GPU_PARAMS struct { HSubDevice Handle Flags uint32 DevDescriptor uint64 } // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080bios.h: const ( NV2080_CTRL_CMD_BIOS_GET_INFO = 0x20800802 ) // +marshal type NV2080_CTRL_BIOS_GET_INFO_PARAMS struct { BiosInfoListSize uint32 Pad [4]byte BiosInfoList P64 } // ListSize implements HasCtrlInfoList.ListSize. func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) ListSize() uint32 { return p.BiosInfoListSize } // SetCtrlInfoList implements HasCtrlInfoList.SetCtrlInfoList. func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) SetCtrlInfoList(ptr P64) { p.BiosInfoList = ptr } // CtrlInfoList implements HasCtrlInfoList.CtrlInfoList. 
func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) CtrlInfoList() P64 { return p.BiosInfoList } // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080bus.h: const ( NV2080_CTRL_CMD_BUS_GET_PCI_INFO = 0x20801801 NV2080_CTRL_CMD_BUS_GET_PCI_BAR_INFO = 0x20801803 NV2080_CTRL_CMD_BUS_GET_INFO_V2 = 0x20801823 NV2080_CTRL_CMD_BUS_GET_PCIE_SUPPORTED_GPU_ATOMICS = 0x2080182a NV2080_CTRL_CMD_BUS_GET_C2C_INFO = 0x2080182b ) // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080ce.h: const ( NV2080_CTRL_CMD_CE_GET_ALL_CAPS = 0x20802a0a ) // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080event.h: const ( NV2080_CTRL_CMD_EVENT_SET_NOTIFICATION = 0x20800301 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080fb.h: const ( NV2080_CTRL_CMD_FB_GET_INFO_V2 = 0x20801303 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080fifo.h: const ( NV2080_CTRL_CMD_FIFO_DISABLE_CHANNELS = 0x2080110b NV2080_CTRL_FIFO_DISABLE_CHANNELS_MAX_ENTRIES = 64 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080flcn.h: const ( NV2080_CTRL_CMD_FLCN_GET_CTX_BUFFER_SIZE = 0x20803125 ) // +marshal type NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS struct { BDisable uint8 Pad1 [3]byte NumChannels uint32 BOnlyDisableScheduling uint8 BRewindGpPut uint8 Pad2 [6]byte PRunlistPreemptEvent P64 HClientList [NV2080_CTRL_FIFO_DISABLE_CHANNELS_MAX_ENTRIES]Handle HChannelList [NV2080_CTRL_FIFO_DISABLE_CHANNELS_MAX_ENTRIES]Handle } // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080gpu.h: const ( NV2080_CTRL_CMD_GPU_GET_INFO_V2 = 0x20800102 NV2080_CTRL_CMD_GPU_GET_NAME_STRING = 0x20800110 NV2080_CTRL_CMD_GPU_GET_SHORT_NAME_STRING = 0x20800111 NV2080_CTRL_CMD_GPU_GET_SIMULATION_INFO = 0x20800119 NV2080_CTRL_CMD_GPU_QUERY_ECC_STATUS = 0x2080012f NV2080_CTRL_CMD_GPU_QUERY_COMPUTE_MODE_RULES = 0x20800131 NV2080_CTRL_CMD_GPU_QUERY_ECC_CONFIGURATION = 0x20800133 NV2080_CTRL_CMD_GPU_GET_OEM_BOARD_INFO = 0x2080013f NV2080_CTRL_CMD_GPU_ACQUIRE_COMPUTE_MODE_RESERVATION = 0x20800145 // undocumented; paramSize == 0 NV2080_CTRL_CMD_GPU_RELEASE_COMPUTE_MODE_RESERVATION = 0x20800146 // undocumented; paramSize == 0 NV2080_CTRL_CMD_GPU_GET_GID_INFO = 0x2080014a NV2080_CTRL_CMD_GPU_GET_INFOROM_OBJECT_VERSION = 0x2080014b NV2080_CTRL_CMD_GPU_GET_INFOROM_IMAGE_VERSION = 0x20800156 NV2080_CTRL_CMD_GPU_QUERY_INFOROM_ECC_SUPPORT = 0x20800157 NV2080_CTRL_CMD_GPU_GET_ENGINES_V2 = 0x20800170 NV2080_CTRL_CMD_GPU_GET_ACTIVE_PARTITION_IDS = 0x2080018b NV2080_CTRL_CMD_GPU_GET_PIDS = 0x2080018d NV2080_CTRL_CMD_GPU_GET_PID_INFO = 0x2080018e NV2080_CTRL_CMD_GPU_GET_COMPUTE_POLICY_CONFIG = 0x20800195 NV2080_CTRL_CMD_GET_GPU_FABRIC_PROBE_INFO = 0x208001a3 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080gr.h: const ( NV2080_CTRL_CMD_GR_GET_INFO = 0x20801201 NV2080_CTRL_CMD_GR_SET_CTXSW_PREEMPTION_MODE = 0x20801210 NV2080_CTRL_CMD_GR_GET_CTX_BUFFER_SIZE = 0x20801218 NV2080_CTRL_CMD_GR_GET_GLOBAL_SM_ORDER = 0x2080121b NV2080_CTRL_CMD_GR_GET_CAPS_V2 = 0x20801227 NV2080_CTRL_CMD_GR_GET_GPC_MASK = 0x2080122a NV2080_CTRL_CMD_GR_GET_TPC_MASK = 0x2080122b NV2080_CTRL_CMD_GR_GET_SM_ISSUE_RATE_MODIFIER = 0x20801230 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080grmgr.h: const ( NV2080_CTRL_CMD_GRMGR_GET_GR_FS_INFO = 0x20803801 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080gsp.h: const ( NV2080_CTRL_CMD_GSP_GET_FEATURES = 0x20803601 ) // +marshal type NV2080_CTRL_GR_GET_INFO_PARAMS struct { GRInfoListSize uint32 // in elements Pad [4]byte GRInfoList P64 GRRouteInfo NV0080_CTRL_GR_ROUTE_INFO } // ListSize implements HasCtrlInfoList.ListSize. 
func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) ListSize() uint32 { return p.GRInfoListSize } // SetCtrlInfoList implements HasCtrlInfoList.SetCtrlInfoList. func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) SetCtrlInfoList(ptr P64) { p.GRInfoList = ptr } // CtrlInfoList implements HasCtrlInfoList.CtrlInfoList. func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) CtrlInfoList() P64 { return p.GRInfoList } // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080mc.h: const ( NV2080_CTRL_CMD_MC_GET_ARCH_INFO = 0x20801701 NV2080_CTRL_CMD_MC_SERVICE_INTERRUPTS = 0x20801702 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080nvlink.h: const ( NV2080_CTRL_CMD_NVLINK_GET_NVLINK_CAPS = 0x20803001 NV2080_CTRL_CMD_NVLINK_GET_NVLINK_STATUS = 0x20803002 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080perf.h: const ( NV2080_CTRL_CMD_PERF_BOOST = 0x2080200a ) // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080rc.h: const ( NV2080_CTRL_CMD_RC_GET_WATCHDOG_INFO = 0x20802209 NV2080_CTRL_CMD_RC_RELEASE_WATCHDOG_REQUESTS = 0x2080220c NV2080_CTRL_CMD_RC_SOFT_DISABLE_WATCHDOG = 0x20802210 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080tmr.h: const ( NV2080_CTRL_CMD_TIMER_GET_GPU_CPU_TIME_CORRELATION_INFO = 0x20800406 NV2080_CTRL_CMD_PERF_GET_CURRENT_PSTATE = 0x20802068 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl503c.h: const ( NV503C_CTRL_CMD_REGISTER_VA_SPACE = 0x503c0102 NV503C_CTRL_CMD_REGISTER_VIDMEM = 0x503c0104 NV503C_CTRL_CMD_UNREGISTER_VIDMEM = 0x503c0105 ) // +marshal type NV503C_CTRL_REGISTER_VA_SPACE_PARAMS struct { HVASpace Handle Pad [4]byte VASpaceToken uint64 } // From src/common/sdk/nvidia/inc/ctrl/ctrl83de/ctrl83dedebug.h: const ( NV83DE_CTRL_CMD_DEBUG_SET_EXCEPTION_MASK = 0x83de0309 NV83DE_CTRL_CMD_DEBUG_READ_ALL_SM_ERROR_STATES = 0x83de030c NV83DE_CTRL_CMD_DEBUG_CLEAR_ALL_SM_ERROR_STATES = 0x83de0310 ) // From src/common/sdk/nvidia/inc/ctrl/ctrlc36f.h: const ( NVC36F_CTRL_GET_CLASS_ENGINEID = 0xc36f0101 NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN = 0xc36f0108 ) // From src/common/sdk/nvidia/inc/ctrl/ctrlc56f.h: const ( NVC56F_CTRL_CMD_GET_KMB = 0xc56f010b ) // From src/common/sdk/nvidia/inc/ctrl/ctrl906f.h: const ( NV906F_CTRL_GET_CLASS_ENGINEID = 0x906f0101 NV906F_CTRL_CMD_RESET_CHANNEL = 0x906f0102 ) // From src/common/sdk/nvidia/inc/ctrl/ctrl90e6.h: const ( NV90E6_CTRL_CMD_MASTER_GET_VIRTUAL_FUNCTION_ERROR_CONT_INTR_MASK = 0x90e60102 ) // From src/common/sdk/nvidia/inc/ctrl/ctrla06c.h: const ( NVA06C_CTRL_CMD_GPFIFO_SCHEDULE = 0xa06c0101 NVA06C_CTRL_CMD_SET_TIMESLICE = 0xa06c0103 NVA06C_CTRL_CMD_PREEMPT = 0xa06c0105 ) // From src/common/sdk/nvidia/inc/ctrl/ctrla06f/ctrla06fgpfifo.h: const ( NVA06F_CTRL_CMD_GPFIFO_SCHEDULE = 0xa06f0103 ) // From src/common/sdk/nvidia/inc/ctrl/ctrlcb33.h: const ( NV_CONF_COMPUTE_CTRL_CMD_SYSTEM_GET_CAPABILITIES = 0xcb330101 NV_CONF_COMPUTE_CTRL_CMD_SYSTEM_GET_GPUS_STATE = 0xcb330104 NV_CONF_COMPUTE_CTRL_CMD_GPU_GET_NUM_SECURE_CHANNELS = 0xcb33010b ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/nvgpu/frontend.go000066400000000000000000000314541465435605700234730ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvgpu import ( "gvisor.dev/gvisor/pkg/marshal" ) // NV_IOCTL_MAGIC is the "canonical" IOC_TYPE for frontend ioctls. // The driver ignores IOC_TYPE, allowing any value to be passed. const NV_IOCTL_MAGIC = uint32('F') // Frontend ioctl numbers. // Note that these are only the IOC_NR part of the ioctl command. const ( // From kernel-open/common/inc/nv-ioctl-numbers.h: NV_IOCTL_BASE = 200 NV_ESC_CARD_INFO = NV_IOCTL_BASE + 0 NV_ESC_REGISTER_FD = NV_IOCTL_BASE + 1 NV_ESC_ALLOC_OS_EVENT = NV_IOCTL_BASE + 6 NV_ESC_FREE_OS_EVENT = NV_IOCTL_BASE + 7 NV_ESC_CHECK_VERSION_STR = NV_IOCTL_BASE + 10 NV_ESC_ATTACH_GPUS_TO_FD = NV_IOCTL_BASE + 12 NV_ESC_SYS_PARAMS = NV_IOCTL_BASE + 14 NV_ESC_WAIT_OPEN_COMPLETE = NV_IOCTL_BASE + 18 // From kernel-open/common/inc/nv-ioctl-numa.h: NV_ESC_NUMA_INFO = NV_IOCTL_BASE + 15 // From src/nvidia/arch/nvalloc/unix/include/nv_escape.h: NV_ESC_RM_ALLOC_MEMORY = 0x27 NV_ESC_RM_FREE = 0x29 NV_ESC_RM_CONTROL = 0x2a NV_ESC_RM_ALLOC = 0x2b NV_ESC_RM_DUP_OBJECT = 0x34 NV_ESC_RM_SHARE = 0x35 NV_ESC_RM_VID_HEAP_CONTROL = 0x4a NV_ESC_RM_MAP_MEMORY = 0x4e NV_ESC_RM_UNMAP_MEMORY = 0x4f NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO = 0x5e ) // Frontend ioctl parameter structs, from src/common/sdk/nvidia/inc/nvos.h or // kernel-open/common/inc/nv-ioctl.h. // IoctlRegisterFD is nv_ioctl_register_fd_t, the parameter type for // NV_ESC_REGISTER_FD. // // +marshal type IoctlRegisterFD struct { CtlFD int32 } // IoctlAllocOSEvent is nv_ioctl_alloc_os_event_t, the parameter type for // NV_ESC_ALLOC_OS_EVENT. // // +marshal type IoctlAllocOSEvent struct { HClient Handle HDevice Handle FD uint32 Status uint32 } // GetFrontendFD implements HasFrontendFD.GetFrontendFD. func (p *IoctlAllocOSEvent) GetFrontendFD() int32 { return int32(p.FD) } // SetFrontendFD implements HasFrontendFD.SetFrontendFD. func (p *IoctlAllocOSEvent) SetFrontendFD(fd int32) { p.FD = uint32(fd) } // IoctlFreeOSEvent is nv_ioctl_free_os_event_t, the parameter type for // NV_ESC_FREE_OS_EVENT. // // +marshal type IoctlFreeOSEvent struct { HClient Handle HDevice Handle FD uint32 Status uint32 } // GetFrontendFD implements HasFrontendFD.GetFrontendFD. func (p *IoctlFreeOSEvent) GetFrontendFD() int32 { return int32(p.FD) } // SetFrontendFD implements HasFrontendFD.SetFrontendFD. func (p *IoctlFreeOSEvent) SetFrontendFD(fd int32) { p.FD = uint32(fd) } // RMAPIVersion is nv_rm_api_version_t, the parameter type for // NV_ESC_CHECK_VERSION_STR. // // +marshal type RMAPIVersion struct { Cmd uint32 Reply uint32 VersionString [64]byte } // IoctlSysParams is nv_ioctl_sys_params_t, the parameter type for // NV_ESC_SYS_PARAMS. // // +marshal type IoctlSysParams struct { MemblockSize uint64 } // IoctlWaitOpenComplete is nv_ioctl_wait_open_complete_t, the parameter type // for NV_ESC_WAIT_OPEN_COMPLETE. // // +marshal type IoctlWaitOpenComplete struct { Rc int32 AdapterStatus uint32 } // IoctlNVOS02ParametersWithFD is nv_ioctl_nvos2_parameters_with_fd, the // parameter type for NV_ESC_RM_ALLOC_MEMORY. 
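// Illustrative sketch, not part of the upstream definitions: the NV_ESC_*
// values above are only the IOC_NR portion of an ioctl request. Userspace
// typically builds the full request with _IOWR(NV_IOCTL_MAGIC, nr, size),
// which on x86-64 and arm64 uses the standard asm-generic bit layout shown
// below (nr in bits 0-7, type in bits 8-15, a 14-bit size field, then the
// direction bits), even though, per the note on NV_IOCTL_MAGIC above, the
// driver ignores the IOC_TYPE bits. The helper name is hypothetical.
func frontendIoctlRequest(nr, paramSize uint32) uint32 {
	const (
		iocWrite = 1 // _IOC_WRITE
		iocRead  = 2 // _IOC_READ
	)
	return (iocRead|iocWrite)<<30 | paramSize<<16 | NV_IOCTL_MAGIC<<8 | nr
}

// For example, NV_ESC_CHECK_VERSION_STR with an RMAPIVersion payload would be
// frontendIoctlRequest(NV_ESC_CHECK_VERSION_STR, SizeofRMAPIVersion), using
// the size value declared at the end of this file.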
// // +marshal type IoctlNVOS02ParametersWithFD struct { Params NVOS02Parameters FD int32 Pad0 [4]byte } // +marshal type NVOS02Parameters struct { HRoot Handle HObjectParent Handle HObjectNew Handle HClass ClassID Flags uint32 Pad0 [4]byte PMemory P64 // address of application mapping, without indirection Limit uint64 Status uint32 Pad1 [4]byte } // NVOS00Parameters is NVOS00_PARAMETERS, the parameter type for // NV_ESC_RM_FREE. // // +marshal type NVOS00Parameters struct { HRoot Handle HObjectParent Handle HObjectOld Handle Status uint32 } // RmAllocParamType should be implemented by all possible parameter types for // NV_ESC_RM_ALLOC. type RmAllocParamType interface { GetHClass() ClassID GetPAllocParms() P64 GetPRightsRequested() P64 SetPAllocParms(p P64) SetPRightsRequested(p P64) FromOS64(other NVOS64Parameters) ToOS64() NVOS64Parameters GetPointer() uintptr marshal.Marshallable } // GetRmAllocParamObj returns the appropriate implementation of // RmAllocParamType based on passed parameters. func GetRmAllocParamObj(isNVOS64 bool) RmAllocParamType { if isNVOS64 { return &NVOS64Parameters{} } return &NVOS21Parameters{} } // NVOS21Parameters is NVOS21_PARAMETERS, one possible parameter type for // NV_ESC_RM_ALLOC. // // +marshal type NVOS21Parameters struct { HRoot Handle HObjectParent Handle HObjectNew Handle HClass ClassID PAllocParms P64 ParamsSize uint32 Status uint32 } // GetHClass implements RmAllocParamType.GetHClass. func (n *NVOS21Parameters) GetHClass() ClassID { return n.HClass } // GetPAllocParms implements RmAllocParamType.GetPAllocParms. func (n *NVOS21Parameters) GetPAllocParms() P64 { return n.PAllocParms } // GetPRightsRequested implements RmAllocParamType.GetPRightsRequested. func (n *NVOS21Parameters) GetPRightsRequested() P64 { return 0 } // SetPAllocParms implements RmAllocParamType.SetPAllocParms. func (n *NVOS21Parameters) SetPAllocParms(p P64) { n.PAllocParms = p } // SetPRightsRequested implements RmAllocParamType.SetPRightsRequested. func (n *NVOS21Parameters) SetPRightsRequested(p P64) { panic("impossible") } // FromOS64 implements RmAllocParamType.FromOS64. func (n *NVOS21Parameters) FromOS64(other NVOS64Parameters) { n.HRoot = other.HRoot n.HObjectParent = other.HObjectParent n.HObjectNew = other.HObjectNew n.HClass = other.HClass n.PAllocParms = other.PAllocParms n.ParamsSize = other.ParamsSize n.Status = other.Status } // ToOS64 implements RmAllocParamType.ToOS64. func (n *NVOS21Parameters) ToOS64() NVOS64Parameters { return NVOS64Parameters{ HRoot: n.HRoot, HObjectParent: n.HObjectParent, HObjectNew: n.HObjectNew, HClass: n.HClass, PAllocParms: n.PAllocParms, ParamsSize: n.ParamsSize, Status: n.Status, } } // NVOS55Parameters is NVOS55_PARAMETERS, the parameter type for // NV_ESC_RM_DUP_OBJECT. // // +marshal type NVOS55Parameters struct { HClient Handle HParent Handle HObject Handle HClientSrc Handle HObjectSrc Handle Flags uint32 Status uint32 } // NVOS57Parameters is NVOS57_PARAMETERS, the parameter type for // NV_ESC_RM_SHARE. // // +marshal type NVOS57Parameters struct { HClient Handle HObject Handle SharePolicy RS_SHARE_POLICY Status uint32 } // NVOS32Parameters is NVOS32_PARAMETERS, the parameter type for // NV_ESC_RM_VID_HEAP_CONTROL. 
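// Illustrative sketch, not part of the upstream definitions: NV_ESC_RM_ALLOC
// accepts either NVOS21_PARAMETERS or NVOS64_PARAMETERS, and
// GetRmAllocParamObj above returns the matching concrete type behind the
// common RmAllocParamType interface. A hypothetical dispatcher could select
// the variant by the parameter size carried in the ioctl request, using the
// Sizeof* values declared at the end of this file:
func rmAllocParamsForSize(size uint32) (RmAllocParamType, bool) {
	switch size {
	case SizeofNVOS21Parameters:
		return GetRmAllocParamObj(false), true
	case SizeofNVOS64Parameters:
		return GetRmAllocParamObj(true), true
	default:
		return nil, false // unrecognized NV_ESC_RM_ALLOC parameter size
	}
}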
// // +marshal type NVOS32Parameters struct { HRoot Handle HObjectParent Handle Function uint32 HVASpace Handle IVCHeapNumber int16 Pad [2]byte Status uint32 Total uint64 Free uint64 Data [144]byte // union } // Possible values for NVOS32Parameters.Function: const ( NVOS32_FUNCTION_ALLOC_SIZE = 2 ) // NVOS32AllocSize is the type of NVOS32Parameters.Data for // NVOS32_FUNCTION_ALLOC_SIZE. type NVOS32AllocSize struct { Owner uint32 HMemory Handle Type uint32 Flags uint32 Attr uint32 Format uint32 ComprCovg uint32 ZcullCovg uint32 PartitionStride uint32 Width uint32 Height uint32 Pad0 [4]byte Size uint64 Alignment uint64 Offset uint64 Limit uint64 Address P64 RangeBegin uint64 RangeEnd uint64 Attr2 uint32 CtagOffset uint32 } // IoctlNVOS33ParametersWithFD is nv_ioctl_nvos33_parameters_with_fd, the // parameter type for NV_ESC_RM_MAP_MEMORY, from // src/nvidia/arch/nvalloc/unix/include/nv-unix-nvos-params-wrappers.h. // // +marshal type IoctlNVOS33ParametersWithFD struct { Params NVOS33Parameters FD int32 Pad0 [4]byte } // +marshal type NVOS33Parameters struct { HClient Handle HDevice Handle HMemory Handle Pad0 [4]byte Offset uint64 Length uint64 PLinearAddress P64 // address of application mapping, without indirection Status uint32 Flags uint32 } // NVOS34Parameters is NVOS34_PARAMETERS, the parameter type for // NV_ESC_RM_UNMAP_MEMORY. // // +marshal type NVOS34Parameters struct { HClient Handle HDevice Handle HMemory Handle Pad0 [4]byte PLinearAddress P64 // address of application mapping, without indirection Status uint32 Flags uint32 } // NVOS54Parameters is NVOS54_PARAMETERS, the parameter type for // NV_ESC_RM_CONTROL. // // +marshal type NVOS54Parameters struct { HClient Handle HObject Handle Cmd uint32 Flags uint32 Params P64 ParamsSize uint32 Status uint32 } // NVOS56Parameters is NVOS56_PARAMETERS, the parameter type for // NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO. // // +marshal type NVOS56Parameters struct { HClient Handle HDevice Handle HMemory Handle Pad0 [4]byte POldCPUAddress P64 PNewCPUAddress P64 Status uint32 Pad1 [4]byte } // NVOS64Parameters is NVOS64_PARAMETERS, one possible parameter type for // NV_ESC_RM_ALLOC. // // +marshal // +stateify savable type NVOS64Parameters struct { HRoot Handle HObjectParent Handle HObjectNew Handle HClass ClassID PAllocParms P64 PRightsRequested P64 ParamsSize uint32 Flags uint32 Status uint32 _ uint32 } // GetHClass implements RmAllocParamType.GetHClass. func (n *NVOS64Parameters) GetHClass() ClassID { return n.HClass } // GetPAllocParms implements RmAllocParamType.GetPAllocParms. func (n *NVOS64Parameters) GetPAllocParms() P64 { return n.PAllocParms } // GetPRightsRequested implements RmAllocParamType.GetPRightsRequested. func (n *NVOS64Parameters) GetPRightsRequested() P64 { return n.PRightsRequested } // SetPAllocParms implements RmAllocParamType.SetPAllocParms. func (n *NVOS64Parameters) SetPAllocParms(p P64) { n.PAllocParms = p } // SetPRightsRequested implements RmAllocParamType.SetPRightsRequested. func (n *NVOS64Parameters) SetPRightsRequested(p P64) { n.PRightsRequested = p } // FromOS64 implements RmAllocParamType.FromOS64. func (n *NVOS64Parameters) FromOS64(other NVOS64Parameters) { *n = other } // ToOS64 implements RmAllocParamType.ToOS64. func (n *NVOS64Parameters) ToOS64() NVOS64Parameters { return *n } // HasFrontendFD is a type constraint for parameter structs containing a // frontend FD field. This is necessary because, as of this writing (Go 1.20), // there is no way to enable field access using a Go type constraint. 
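// Illustrative sketch, not part of the upstream definitions: both
// IoctlAllocOSEvent and IoctlFreeOSEvent above carry an FD field and provide
// the getter/setter pair described here, so generic code can swap the file
// descriptor number in the parameter struct without knowing the concrete
// type. The helper name is hypothetical.
func replaceFrontendFD(p interface {
	GetFrontendFD() int32
	SetFrontendFD(int32)
}, newFD int32) (oldFD int32) {
	oldFD = p.GetFrontendFD()
	p.SetFrontendFD(newFD)
	return oldFD
}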
type HasFrontendFD interface { GetFrontendFD() int32 SetFrontendFD(int32) } // Frontend ioctl parameter struct sizes. var ( SizeofIoctlRegisterFD = uint32((*IoctlRegisterFD)(nil).SizeBytes()) SizeofIoctlAllocOSEvent = uint32((*IoctlAllocOSEvent)(nil).SizeBytes()) SizeofIoctlFreeOSEvent = uint32((*IoctlFreeOSEvent)(nil).SizeBytes()) SizeofRMAPIVersion = uint32((*RMAPIVersion)(nil).SizeBytes()) SizeofIoctlSysParams = uint32((*IoctlSysParams)(nil).SizeBytes()) SizeofIoctlWaitOpenComplete = uint32((*IoctlWaitOpenComplete)(nil).SizeBytes()) SizeofIoctlNVOS02ParametersWithFD = uint32((*IoctlNVOS02ParametersWithFD)(nil).SizeBytes()) SizeofNVOS00Parameters = uint32((*NVOS00Parameters)(nil).SizeBytes()) SizeofNVOS21Parameters = uint32((*NVOS21Parameters)(nil).SizeBytes()) SizeofIoctlNVOS33ParametersWithFD = uint32((*IoctlNVOS33ParametersWithFD)(nil).SizeBytes()) SizeofNVOS55Parameters = uint32((*NVOS55Parameters)(nil).SizeBytes()) SizeofNVOS57Parameters = uint32((*NVOS57Parameters)(nil).SizeBytes()) SizeofNVOS32Parameters = uint32((*NVOS32Parameters)(nil).SizeBytes()) SizeofNVOS34Parameters = uint32((*NVOS34Parameters)(nil).SizeBytes()) SizeofNVOS54Parameters = uint32((*NVOS54Parameters)(nil).SizeBytes()) SizeofNVOS56Parameters = uint32((*NVOS56Parameters)(nil).SizeBytes()) SizeofNVOS64Parameters = uint32((*NVOS64Parameters)(nil).SizeBytes()) ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/nvgpu/frontend_unsafe.go000066400000000000000000000016201465435605700250240ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvgpu import "unsafe" // GetPointer implements RmAllocParamType.GetPointer. func (n *NVOS21Parameters) GetPointer() uintptr { return uintptr(unsafe.Pointer(n)) } // GetPointer implements RmAllocParamType.GetPointer. func (n *NVOS64Parameters) GetPointer() uintptr { return uintptr(unsafe.Pointer(n)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/nvgpu/nvgpu.go000066400000000000000000000042011465435605700230010ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package nvgpu tracks the ABI of the Nvidia GPU Linux kernel driver: // https://github.com/NVIDIA/open-gpu-kernel-modules package nvgpu import ( "fmt" ) // Device numbers. 
const ( NV_MAJOR_DEVICE_NUMBER = 195 // from kernel-open/common/inc/nv.h NV_CONTROL_DEVICE_MINOR = 255 // from kernel-open/common/inc/nv-linux.h NVIDIA_UVM_PRIMARY_MINOR_NUMBER = 0 // from kernel-open/nvidia-uvm/uvm_common.h ) // Handle is NvHandle, from src/common/sdk/nvidia/inc/nvtypes.h. // // +marshal // +stateify savable type Handle struct { Val uint32 } // String implements fmt.Stringer.String. func (h Handle) String() string { return fmt.Sprintf("%#x", h.Val) } // P64 is NvP64, from src/common/sdk/nvidia/inc/nvtypes.h. // // +marshal type P64 uint64 // From src/common/sdk/nvidia/inc/nvlimits.h: const ( NV_MAX_DEVICES = 32 NV_MAX_SUBDEVICES = 8 ) // From src/common/sdk/nvidia/inc/alloc/alloc_channel.h. const ( CC_CHAN_ALLOC_IV_SIZE_DWORD = 3 CC_CHAN_ALLOC_NONCE_SIZE_DWORD = 8 ) // RS_ACCESS_MASK is RS_ACCESS_MASK, from // src/common/sdk/nvidia/inc/rs_access.h. // // +marshal // +stateify savable type RS_ACCESS_MASK struct { Limbs [SDK_RS_ACCESS_MAX_LIMBS]uint32 // RsAccessLimb } const SDK_RS_ACCESS_MAX_LIMBS = 1 // RS_SHARE_POLICY is RS_SHARE_POLICY, from // src/common/sdk/nvidia/inc/rs_access.h. // // +marshal type RS_SHARE_POLICY struct { Target uint32 AccessMask RS_ACCESS_MASK Type uint16 Action uint8 Pad [1]byte } // NvUUID is defined in src/common/inc/nvCpuUuid.h. // // +marshal type NvUUID [16]uint8 golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/nvgpu/nvgpu_abi_autogen_unsafe.go000066400000000000000000020066601465435605700267140ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package nvgpu import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*ClassID)(nil) var _ marshal.Marshallable = (*Handle)(nil) var _ marshal.Marshallable = (*IoctlAllocOSEvent)(nil) var _ marshal.Marshallable = (*IoctlFreeOSEvent)(nil) var _ marshal.Marshallable = (*IoctlNVOS02ParametersWithFD)(nil) var _ marshal.Marshallable = (*IoctlNVOS33ParametersWithFD)(nil) var _ marshal.Marshallable = (*IoctlRegisterFD)(nil) var _ marshal.Marshallable = (*IoctlSysParams)(nil) var _ marshal.Marshallable = (*IoctlWaitOpenComplete)(nil) var _ marshal.Marshallable = (*NV0000_CTRL_OS_UNIX_EXPORT_OBJECT)(nil) var _ marshal.Marshallable = (*NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS)(nil) var _ marshal.Marshallable = (*NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS)(nil) var _ marshal.Marshallable = (*NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545)(nil) var _ marshal.Marshallable = (*NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS)(nil) var _ marshal.Marshallable = (*NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS)(nil) var _ marshal.Marshallable = (*NV0005_ALLOC_PARAMETERS)(nil) var _ marshal.Marshallable = (*NV0041_CTRL_GET_SURFACE_INFO_PARAMS)(nil) var _ marshal.Marshallable = (*NV0080_ALLOC_PARAMETERS)(nil) var _ marshal.Marshallable = (*NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS)(nil) var _ marshal.Marshallable = (*NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS)(nil) var _ marshal.Marshallable = (*NV0080_CTRL_GR_ROUTE_INFO)(nil) var _ marshal.Marshallable = (*NV00DE_ALLOC_PARAMETERS)(nil) var _ marshal.Marshallable = (*NV00DE_ALLOC_PARAMETERS_V545)(nil) var _ marshal.Marshallable = (*NV00F8_ALLOCATION_PARAMETERS)(nil) var _ marshal.Marshallable = (*NV00FD_ALLOCATION_PARAMETERS)(nil) var _ marshal.Marshallable = (*NV00FD_ALLOCATION_PARAMETERS_V545)(nil) var _ marshal.Marshallable = 
(*NV00FD_CTRL_ATTACH_GPU_PARAMS)(nil) var _ marshal.Marshallable = (*NV2080_ALLOC_PARAMETERS)(nil) var _ marshal.Marshallable = (*NV2080_CTRL_BIOS_GET_INFO_PARAMS)(nil) var _ marshal.Marshallable = (*NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS)(nil) var _ marshal.Marshallable = (*NV2080_CTRL_GR_GET_INFO_PARAMS)(nil) var _ marshal.Marshallable = (*NV2081_ALLOC_PARAMETERS)(nil) var _ marshal.Marshallable = (*NV503B_ALLOC_PARAMETERS)(nil) var _ marshal.Marshallable = (*NV503B_BAR1_P2P_DMA_INFO)(nil) var _ marshal.Marshallable = (*NV503C_ALLOC_PARAMETERS)(nil) var _ marshal.Marshallable = (*NV503C_CTRL_REGISTER_VA_SPACE_PARAMS)(nil) var _ marshal.Marshallable = (*NV83DE_ALLOC_PARAMETERS)(nil) var _ marshal.Marshallable = (*NVB0B5_ALLOCATION_PARAMETERS)(nil) var _ marshal.Marshallable = (*NVOS00Parameters)(nil) var _ marshal.Marshallable = (*NVOS02Parameters)(nil) var _ marshal.Marshallable = (*NVOS21Parameters)(nil) var _ marshal.Marshallable = (*NVOS32Parameters)(nil) var _ marshal.Marshallable = (*NVOS33Parameters)(nil) var _ marshal.Marshallable = (*NVOS34Parameters)(nil) var _ marshal.Marshallable = (*NVOS54Parameters)(nil) var _ marshal.Marshallable = (*NVOS55Parameters)(nil) var _ marshal.Marshallable = (*NVOS56Parameters)(nil) var _ marshal.Marshallable = (*NVOS57Parameters)(nil) var _ marshal.Marshallable = (*NVOS64Parameters)(nil) var _ marshal.Marshallable = (*NVXXXX_CTRL_XXX_INFO)(nil) var _ marshal.Marshallable = (*NV_CHANNEL_ALLOC_PARAMS)(nil) var _ marshal.Marshallable = (*NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS)(nil) var _ marshal.Marshallable = (*NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS)(nil) var _ marshal.Marshallable = (*NV_CTXSHARE_ALLOCATION_PARAMETERS)(nil) var _ marshal.Marshallable = (*NV_EXPORT_MEM_PACKET)(nil) var _ marshal.Marshallable = (*NV_GR_ALLOCATION_PARAMETERS)(nil) var _ marshal.Marshallable = (*NV_HOPPER_USERMODE_A_PARAMS)(nil) var _ marshal.Marshallable = (*NV_MEMORY_ALLOCATION_PARAMS)(nil) var _ marshal.Marshallable = (*NV_MEMORY_ALLOCATION_PARAMS_V545)(nil) var _ marshal.Marshallable = (*NV_MEMORY_DESC_PARAMS)(nil) var _ marshal.Marshallable = (*NV_VASPACE_ALLOCATION_PARAMETERS)(nil) var _ marshal.Marshallable = (*NvUUID)(nil) var _ marshal.Marshallable = (*P64)(nil) var _ marshal.Marshallable = (*RMAPIVersion)(nil) var _ marshal.Marshallable = (*RS_ACCESS_MASK)(nil) var _ marshal.Marshallable = (*RS_SHARE_POLICY)(nil) var _ marshal.Marshallable = (*UVM_ALLOC_SEMAPHORE_POOL_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550)(nil) var _ marshal.Marshallable = (*UVM_CREATE_EXTERNAL_RANGE_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_CREATE_RANGE_GROUP_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_DESTROY_RANGE_GROUP_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_DISABLE_PEER_ACCESS_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_DISABLE_READ_DUPLICATION_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_ENABLE_PEER_ACCESS_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_FREE_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_INITIALIZE_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_MAP_EXTERNAL_ALLOCATION_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550)(nil) var _ marshal.Marshallable = (*UVM_MIGRATE_RANGE_GROUP_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_MM_INITIALIZE_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_PAGEABLE_MEM_ACCESS_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_REGISTER_CHANNEL_PARAMS)(nil) var _ 
marshal.Marshallable = (*UVM_REGISTER_GPU_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_REGISTER_GPU_VASPACE_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_SET_PREFERRED_LOCATION_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_SET_PREFERRED_LOCATION_PARAMS_V550)(nil) var _ marshal.Marshallable = (*UVM_SET_RANGE_GROUP_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_UNMAP_EXTERNAL_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_UNREGISTER_CHANNEL_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_UNREGISTER_GPU_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_UNREGISTER_GPU_VASPACE_PARAMS)(nil) var _ marshal.Marshallable = (*UVM_VALIDATE_VA_RANGE_PARAMS)(nil) var _ marshal.Marshallable = (*UvmGpuMappingAttributes)(nil) var _ marshal.Marshallable = (*nv00f8Map)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (id *ClassID) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (id *ClassID) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*id)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (id *ClassID) UnmarshalBytes(src []byte) []byte { *id = ClassID(uint32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (id *ClassID) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (id *ClassID) MarshalUnsafe(dst []byte) []byte { size := id.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(id), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (id *ClassID) UnmarshalUnsafe(src []byte) []byte { size := id.SizeBytes() gohacks.Memmove(unsafe.Pointer(id), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (id *ClassID) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(id))) hdr.Len = id.SizeBytes() hdr.Cap = id.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that id // must live until the use above. runtime.KeepAlive(id) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (id *ClassID) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return id.CopyOutN(cc, addr, id.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (id *ClassID) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(id))) hdr.Len = id.SizeBytes() hdr.Cap = id.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that id // must live until the use above. runtime.KeepAlive(id) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
func (id *ClassID) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return id.CopyInN(cc, addr, id.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (id *ClassID) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(id))) hdr.Len = id.SizeBytes() hdr.Cap = id.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that id // must live until the use above. runtime.KeepAlive(id) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV0005_ALLOC_PARAMETERS) SizeBytes() int { return 8 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*P64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV0005_ALLOC_PARAMETERS) MarshalBytes(dst []byte) []byte { dst = n.HParentClient.MarshalUnsafe(dst) dst = n.HSrcResource.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.HClass)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.NotifyIndex)) dst = dst[4:] dst = n.Data.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV0005_ALLOC_PARAMETERS) UnmarshalBytes(src []byte) []byte { src = n.HParentClient.UnmarshalUnsafe(src) src = n.HSrcResource.UnmarshalUnsafe(src) n.HClass = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.NotifyIndex = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.Data.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV0005_ALLOC_PARAMETERS) Packed() bool { return n.Data.Packed() && n.HParentClient.Packed() && n.HSrcResource.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV0005_ALLOC_PARAMETERS) MarshalUnsafe(dst []byte) []byte { if n.Data.Packed() && n.HParentClient.Packed() && n.HSrcResource.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV0005_ALLOC_PARAMETERS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV0005_ALLOC_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { if n.Data.Packed() && n.HParentClient.Packed() && n.HSrcResource.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV0005_ALLOC_PARAMETERS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV0005_ALLOC_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.Data.Packed() && n.HParentClient.Packed() && n.HSrcResource.Packed() { // Type NV0005_ALLOC_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV0005_ALLOC_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV0005_ALLOC_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.Data.Packed() && n.HParentClient.Packed() && n.HSrcResource.Packed() { // Type NV0005_ALLOC_PARAMETERS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV0005_ALLOC_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV0005_ALLOC_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { if !n.Data.Packed() && n.HParentClient.Packed() && n.HSrcResource.Packed() { // Type NV0005_ALLOC_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV0080_ALLOC_PARAMETERS) SizeBytes() int { return 36 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + 1*4 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (n *NV0080_ALLOC_PARAMETERS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.DeviceID)) dst = dst[4:] dst = n.HClientShare.MarshalUnsafe(dst) dst = n.HTargetClient.MarshalUnsafe(dst) dst = n.HTargetDevice.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad0[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.VASpaceSize)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.VAStartInternal)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.VALimitInternal)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.VAMode)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad1[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV0080_ALLOC_PARAMETERS) UnmarshalBytes(src []byte) []byte { n.DeviceID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.HClientShare.UnmarshalUnsafe(src) src = n.HTargetClient.UnmarshalUnsafe(src) src = n.HTargetDevice.UnmarshalUnsafe(src) n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { n.Pad0[idx] = src[0] src = src[1:] } n.VASpaceSize = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.VAStartInternal = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.VALimitInternal = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.VAMode = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { n.Pad1[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV0080_ALLOC_PARAMETERS) Packed() bool { return n.HClientShare.Packed() && n.HTargetClient.Packed() && n.HTargetDevice.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV0080_ALLOC_PARAMETERS) MarshalUnsafe(dst []byte) []byte { if n.HClientShare.Packed() && n.HTargetClient.Packed() && n.HTargetDevice.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV0080_ALLOC_PARAMETERS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV0080_ALLOC_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { if n.HClientShare.Packed() && n.HTargetClient.Packed() && n.HTargetDevice.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV0080_ALLOC_PARAMETERS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV0080_ALLOC_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClientShare.Packed() && n.HTargetClient.Packed() && n.HTargetDevice.Packed() { // Type NV0080_ALLOC_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV0080_ALLOC_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV0080_ALLOC_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClientShare.Packed() && n.HTargetClient.Packed() && n.HTargetDevice.Packed() { // Type NV0080_ALLOC_PARAMETERS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV0080_ALLOC_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV0080_ALLOC_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { if !n.HClientShare.Packed() && n.HTargetClient.Packed() && n.HTargetDevice.Packed() { // Type NV0080_ALLOC_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV00DE_ALLOC_PARAMETERS) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV00DE_ALLOC_PARAMETERS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Reserved)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV00DE_ALLOC_PARAMETERS) UnmarshalBytes(src []byte) []byte { n.Reserved = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV00DE_ALLOC_PARAMETERS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (n *NV00DE_ALLOC_PARAMETERS) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV00DE_ALLOC_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV00DE_ALLOC_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV00DE_ALLOC_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV00DE_ALLOC_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV00DE_ALLOC_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV00DE_ALLOC_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV00DE_ALLOC_PARAMETERS_V545) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV00DE_ALLOC_PARAMETERS_V545) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.PolledDataMask)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV00DE_ALLOC_PARAMETERS_V545) UnmarshalBytes(src []byte) []byte { n.PolledDataMask = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV00DE_ALLOC_PARAMETERS_V545) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (n *NV00DE_ALLOC_PARAMETERS_V545) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV00DE_ALLOC_PARAMETERS_V545) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV00DE_ALLOC_PARAMETERS_V545) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV00DE_ALLOC_PARAMETERS_V545) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV00DE_ALLOC_PARAMETERS_V545) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV00DE_ALLOC_PARAMETERS_V545) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV00DE_ALLOC_PARAMETERS_V545) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV00F8_ALLOCATION_PARAMETERS) SizeBytes() int { return 32 + (*nv00f8Map)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV00F8_ALLOCATION_PARAMETERS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Alignment)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.AllocSize)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.PageSize)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.AllocFlags)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] dst = n.Map.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (n *NV00F8_ALLOCATION_PARAMETERS) UnmarshalBytes(src []byte) []byte { n.Alignment = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.AllocSize = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.PageSize = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.AllocFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] src = n.Map.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV00F8_ALLOCATION_PARAMETERS) Packed() bool { return n.Map.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV00F8_ALLOCATION_PARAMETERS) MarshalUnsafe(dst []byte) []byte { if n.Map.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV00F8_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV00F8_ALLOCATION_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { if n.Map.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV00F8_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV00F8_ALLOCATION_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.Map.Packed() { // Type NV00F8_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV00F8_ALLOCATION_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV00F8_ALLOCATION_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.Map.Packed() { // Type NV00F8_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV00F8_ALLOCATION_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV00F8_ALLOCATION_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { if !n.Map.Packed() { // Type NV00F8_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV00FD_ALLOCATION_PARAMETERS) SizeBytes() int { return 32 + (*P64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV00FD_ALLOCATION_PARAMETERS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Alignment)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.AllocSize)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.PageSize)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.AllocFlags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.NumGPUs)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] dst = n.POsEvent.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV00FD_ALLOCATION_PARAMETERS) UnmarshalBytes(src []byte) []byte { n.Alignment = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.AllocSize = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.PageSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.AllocFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.NumGPUs = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] src = n.POsEvent.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV00FD_ALLOCATION_PARAMETERS) Packed() bool { return n.POsEvent.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV00FD_ALLOCATION_PARAMETERS) MarshalUnsafe(dst []byte) []byte { if n.POsEvent.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV00FD_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV00FD_ALLOCATION_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { if n.POsEvent.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV00FD_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fallback to UnmarshalBytes. 
return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV00FD_ALLOCATION_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.POsEvent.Packed() { // Type NV00FD_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV00FD_ALLOCATION_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV00FD_ALLOCATION_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.POsEvent.Packed() { // Type NV00FD_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV00FD_ALLOCATION_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV00FD_ALLOCATION_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { if !n.POsEvent.Packed() { // Type NV00FD_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. 
func (n *NV00FD_ALLOCATION_PARAMETERS_V545) SizeBytes() int { return 2 + (*NV_EXPORT_MEM_PACKET)(nil).SizeBytes() + 1*6 + (*NV00FD_ALLOCATION_PARAMETERS)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV00FD_ALLOCATION_PARAMETERS_V545) MarshalBytes(dst []byte) []byte { dst = n.ExpPacket.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.Index)) dst = dst[2:] // Padding: dst[:sizeof(byte)*6] ~= [6]byte{0} dst = dst[1*(6):] dst = n.NV00FD_ALLOCATION_PARAMETERS.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV00FD_ALLOCATION_PARAMETERS_V545) UnmarshalBytes(src []byte) []byte { src = n.ExpPacket.UnmarshalUnsafe(src) n.Index = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] // Padding: ~ copy([6]byte(n._), src[:sizeof(byte)*6]) src = src[1*(6):] src = n.NV00FD_ALLOCATION_PARAMETERS.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV00FD_ALLOCATION_PARAMETERS_V545) Packed() bool { return n.ExpPacket.Packed() && n.NV00FD_ALLOCATION_PARAMETERS.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV00FD_ALLOCATION_PARAMETERS_V545) MarshalUnsafe(dst []byte) []byte { if n.ExpPacket.Packed() && n.NV00FD_ALLOCATION_PARAMETERS.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV00FD_ALLOCATION_PARAMETERS_V545 doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV00FD_ALLOCATION_PARAMETERS_V545) UnmarshalUnsafe(src []byte) []byte { if n.ExpPacket.Packed() && n.NV00FD_ALLOCATION_PARAMETERS.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV00FD_ALLOCATION_PARAMETERS_V545 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV00FD_ALLOCATION_PARAMETERS_V545) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.ExpPacket.Packed() && n.NV00FD_ALLOCATION_PARAMETERS.Packed() { // Type NV00FD_ALLOCATION_PARAMETERS_V545 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV00FD_ALLOCATION_PARAMETERS_V545) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (n *NV00FD_ALLOCATION_PARAMETERS_V545) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.ExpPacket.Packed() && n.NV00FD_ALLOCATION_PARAMETERS.Packed() { // Type NV00FD_ALLOCATION_PARAMETERS_V545 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV00FD_ALLOCATION_PARAMETERS_V545) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV00FD_ALLOCATION_PARAMETERS_V545) WriteTo(writer io.Writer) (int64, error) { if !n.ExpPacket.Packed() && n.NV00FD_ALLOCATION_PARAMETERS.Packed() { // Type NV00FD_ALLOCATION_PARAMETERS_V545 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV2080_ALLOC_PARAMETERS) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV2080_ALLOC_PARAMETERS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.SubDeviceID)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV2080_ALLOC_PARAMETERS) UnmarshalBytes(src []byte) []byte { n.SubDeviceID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV2080_ALLOC_PARAMETERS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV2080_ALLOC_PARAMETERS) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV2080_ALLOC_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (n *NV2080_ALLOC_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV2080_ALLOC_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV2080_ALLOC_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV2080_ALLOC_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV2080_ALLOC_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV2081_ALLOC_PARAMETERS) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV2081_ALLOC_PARAMETERS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Reserved)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV2081_ALLOC_PARAMETERS) UnmarshalBytes(src []byte) []byte { n.Reserved = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV2081_ALLOC_PARAMETERS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV2081_ALLOC_PARAMETERS) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV2081_ALLOC_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (n *NV2081_ALLOC_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV2081_ALLOC_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV2081_ALLOC_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV2081_ALLOC_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV2081_ALLOC_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV503B_ALLOC_PARAMETERS) SizeBytes() int { return 32 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*NV503B_BAR1_P2P_DMA_INFO)(nil).SizeBytes() + (*NV503B_BAR1_P2P_DMA_INFO)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV503B_ALLOC_PARAMETERS) MarshalBytes(dst []byte) []byte { dst = n.HSubDevice.MarshalUnsafe(dst) dst = n.HPeerSubDevice.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.SubDevicePeerIDMask)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.PeerSubDevicePeerIDMask)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.MailboxBar1Addr)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.MailboxTotalSize)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.SubDeviceEgmPeerIDMask)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.PeerSubDeviceEgmPeerIDMask)) dst = dst[4:] dst = n.L2pBar1P2PDmaInfo.MarshalUnsafe(dst) dst = n.P2lBar1P2PDmaInfo.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (n *NV503B_ALLOC_PARAMETERS) UnmarshalBytes(src []byte) []byte { src = n.HSubDevice.UnmarshalUnsafe(src) src = n.HPeerSubDevice.UnmarshalUnsafe(src) n.SubDevicePeerIDMask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.PeerSubDevicePeerIDMask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.MailboxBar1Addr = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.MailboxTotalSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.SubDeviceEgmPeerIDMask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.PeerSubDeviceEgmPeerIDMask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.L2pBar1P2PDmaInfo.UnmarshalUnsafe(src) src = n.P2lBar1P2PDmaInfo.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV503B_ALLOC_PARAMETERS) Packed() bool { return n.HPeerSubDevice.Packed() && n.HSubDevice.Packed() && n.L2pBar1P2PDmaInfo.Packed() && n.P2lBar1P2PDmaInfo.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV503B_ALLOC_PARAMETERS) MarshalUnsafe(dst []byte) []byte { if n.HPeerSubDevice.Packed() && n.HSubDevice.Packed() && n.L2pBar1P2PDmaInfo.Packed() && n.P2lBar1P2PDmaInfo.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV503B_ALLOC_PARAMETERS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV503B_ALLOC_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { if n.HPeerSubDevice.Packed() && n.HSubDevice.Packed() && n.L2pBar1P2PDmaInfo.Packed() && n.P2lBar1P2PDmaInfo.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV503B_ALLOC_PARAMETERS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV503B_ALLOC_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HPeerSubDevice.Packed() && n.HSubDevice.Packed() && n.L2pBar1P2PDmaInfo.Packed() && n.P2lBar1P2PDmaInfo.Packed() { // Type NV503B_ALLOC_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV503B_ALLOC_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (n *NV503B_ALLOC_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HPeerSubDevice.Packed() && n.HSubDevice.Packed() && n.L2pBar1P2PDmaInfo.Packed() && n.P2lBar1P2PDmaInfo.Packed() { // Type NV503B_ALLOC_PARAMETERS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV503B_ALLOC_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV503B_ALLOC_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { if !n.HPeerSubDevice.Packed() && n.HSubDevice.Packed() && n.L2pBar1P2PDmaInfo.Packed() && n.P2lBar1P2PDmaInfo.Packed() { // Type NV503B_ALLOC_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV503B_BAR1_P2P_DMA_INFO) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV503B_BAR1_P2P_DMA_INFO) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.DmaAddress)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.DmaSize)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV503B_BAR1_P2P_DMA_INFO) UnmarshalBytes(src []byte) []byte { n.DmaAddress = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.DmaSize = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV503B_BAR1_P2P_DMA_INFO) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV503B_BAR1_P2P_DMA_INFO) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (n *NV503B_BAR1_P2P_DMA_INFO) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV503B_BAR1_P2P_DMA_INFO) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV503B_BAR1_P2P_DMA_INFO) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV503B_BAR1_P2P_DMA_INFO) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV503B_BAR1_P2P_DMA_INFO) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV503B_BAR1_P2P_DMA_INFO) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV503C_ALLOC_PARAMETERS) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV503C_ALLOC_PARAMETERS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV503C_ALLOC_PARAMETERS) UnmarshalBytes(src []byte) []byte { n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV503C_ALLOC_PARAMETERS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV503C_ALLOC_PARAMETERS) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (n *NV503C_ALLOC_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV503C_ALLOC_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV503C_ALLOC_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV503C_ALLOC_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV503C_ALLOC_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV503C_ALLOC_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV83DE_ALLOC_PARAMETERS) SizeBytes() int { return 0 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV83DE_ALLOC_PARAMETERS) MarshalBytes(dst []byte) []byte { dst = n.HDebuggerClient_Obsolete.MarshalUnsafe(dst) dst = n.HAppClient.MarshalUnsafe(dst) dst = n.HClass3DObject.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV83DE_ALLOC_PARAMETERS) UnmarshalBytes(src []byte) []byte { src = n.HDebuggerClient_Obsolete.UnmarshalUnsafe(src) src = n.HAppClient.UnmarshalUnsafe(src) src = n.HClass3DObject.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV83DE_ALLOC_PARAMETERS) Packed() bool { return n.HAppClient.Packed() && n.HClass3DObject.Packed() && n.HDebuggerClient_Obsolete.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (n *NV83DE_ALLOC_PARAMETERS) MarshalUnsafe(dst []byte) []byte { if n.HAppClient.Packed() && n.HClass3DObject.Packed() && n.HDebuggerClient_Obsolete.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV83DE_ALLOC_PARAMETERS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV83DE_ALLOC_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { if n.HAppClient.Packed() && n.HClass3DObject.Packed() && n.HDebuggerClient_Obsolete.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV83DE_ALLOC_PARAMETERS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV83DE_ALLOC_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HAppClient.Packed() && n.HClass3DObject.Packed() && n.HDebuggerClient_Obsolete.Packed() { // Type NV83DE_ALLOC_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV83DE_ALLOC_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV83DE_ALLOC_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HAppClient.Packed() && n.HClass3DObject.Packed() && n.HDebuggerClient_Obsolete.Packed() { // Type NV83DE_ALLOC_PARAMETERS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV83DE_ALLOC_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. 
func (n *NV83DE_ALLOC_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { if !n.HAppClient.Packed() && n.HClass3DObject.Packed() && n.HDebuggerClient_Obsolete.Packed() { // Type NV83DE_ALLOC_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NVB0B5_ALLOCATION_PARAMETERS) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NVB0B5_ALLOCATION_PARAMETERS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Version)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.EngineType)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NVB0B5_ALLOCATION_PARAMETERS) UnmarshalBytes(src []byte) []byte { n.Version = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.EngineType = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVB0B5_ALLOCATION_PARAMETERS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NVB0B5_ALLOCATION_PARAMETERS) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NVB0B5_ALLOCATION_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NVB0B5_ALLOCATION_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NVB0B5_ALLOCATION_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVB0B5_ALLOCATION_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
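// Note (illustrative, not generated): the reflect.SliceHeader construction above is an
// older idiom for viewing the struct's own bytes without allocating. A minimal
// stdlib-only sketch of the same idea, assuming a hypothetical fixed-size struct value p
// and an io.Reader r (both placeholders, not names from this package), would be:
//
//	buf := unsafe.Slice((*byte)(unsafe.Pointer(&p)), unsafe.Sizeof(p)) // reinterpret p's memory as raw bytes.
//	_, err := io.ReadFull(r, buf)                                      // fill p's memory in place.
//	runtime.KeepAlive(&p)                                              // p must outlive the raw-pointer use.
//
// unsafe.Slice (Go 1.17+) expresses the same view that the manual slice header builds here.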
// Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NVB0B5_ALLOCATION_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NVB0B5_ALLOCATION_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV_CHANNEL_ALLOC_PARAMS) SizeBytes() int { return 40 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes()*NV_MAX_SUBDEVICES + 8*NV_MAX_SUBDEVICES + (*Handle)(nil).SizeBytes() + (*NV_MEMORY_DESC_PARAMS)(nil).SizeBytes() + (*NV_MEMORY_DESC_PARAMS)(nil).SizeBytes() + (*NV_MEMORY_DESC_PARAMS)(nil).SizeBytes() + (*NV_MEMORY_DESC_PARAMS)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*NV_MEMORY_DESC_PARAMS)(nil).SizeBytes() + (*NV_MEMORY_DESC_PARAMS)(nil).SizeBytes() + 4*CC_CHAN_ALLOC_IV_SIZE_DWORD + 4*CC_CHAN_ALLOC_IV_SIZE_DWORD + 4*CC_CHAN_ALLOC_NONCE_SIZE_DWORD } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV_CHANNEL_ALLOC_PARAMS) MarshalBytes(dst []byte) []byte { dst = n.HObjectError.MarshalUnsafe(dst) dst = n.HObjectBuffer.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.GPFIFOOffset)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.GPFIFOEntries)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] dst = n.HContextShare.MarshalUnsafe(dst) dst = n.HVASpace.MarshalUnsafe(dst) for idx := 0; idx < NV_MAX_SUBDEVICES; idx++ { dst = n.HUserdMemory[idx].MarshalUnsafe(dst) } for idx := 0; idx < NV_MAX_SUBDEVICES; idx++ { hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.UserdOffset[idx])) dst = dst[8:] } hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.EngineType)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.CID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.SubDeviceID)) dst = dst[4:] dst = n.HObjectECCError.MarshalUnsafe(dst) dst = n.InstanceMem.MarshalUnsafe(dst) dst = n.UserdMem.MarshalUnsafe(dst) dst = n.RamfcMem.MarshalUnsafe(dst) dst = n.MthdbufMem.MarshalUnsafe(dst) dst = n.HPhysChannelGroup.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.InternalFlags)) dst = dst[4:] dst = n.ErrorNotifierMem.MarshalUnsafe(dst) dst = n.ECCErrorNotifierMem.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.ProcessID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.SubProcessID)) dst = dst[4:] for idx := 0; idx < CC_CHAN_ALLOC_IV_SIZE_DWORD; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.EncryptIv[idx])) dst = dst[4:] } for idx := 0; idx < CC_CHAN_ALLOC_IV_SIZE_DWORD; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.DecryptIv[idx])) dst = dst[4:] } for idx := 0; idx < CC_CHAN_ALLOC_NONCE_SIZE_DWORD; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], 
uint32(n.HmacNonce[idx])) dst = dst[4:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV_CHANNEL_ALLOC_PARAMS) UnmarshalBytes(src []byte) []byte { src = n.HObjectError.UnmarshalUnsafe(src) src = n.HObjectBuffer.UnmarshalUnsafe(src) n.GPFIFOOffset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.GPFIFOEntries = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.HContextShare.UnmarshalUnsafe(src) src = n.HVASpace.UnmarshalUnsafe(src) for idx := 0; idx < NV_MAX_SUBDEVICES; idx++ { src = n.HUserdMemory[idx].UnmarshalUnsafe(src) } for idx := 0; idx < NV_MAX_SUBDEVICES; idx++ { n.UserdOffset[idx] = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] } n.EngineType = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.CID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.SubDeviceID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.HObjectECCError.UnmarshalUnsafe(src) src = n.InstanceMem.UnmarshalUnsafe(src) src = n.UserdMem.UnmarshalUnsafe(src) src = n.RamfcMem.UnmarshalUnsafe(src) src = n.MthdbufMem.UnmarshalUnsafe(src) src = n.HPhysChannelGroup.UnmarshalUnsafe(src) n.InternalFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.ErrorNotifierMem.UnmarshalUnsafe(src) src = n.ECCErrorNotifierMem.UnmarshalUnsafe(src) n.ProcessID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.SubProcessID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < CC_CHAN_ALLOC_IV_SIZE_DWORD; idx++ { n.EncryptIv[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } for idx := 0; idx < CC_CHAN_ALLOC_IV_SIZE_DWORD; idx++ { n.DecryptIv[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } for idx := 0; idx < CC_CHAN_ALLOC_NONCE_SIZE_DWORD; idx++ { n.HmacNonce[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV_CHANNEL_ALLOC_PARAMS) Packed() bool { return n.ECCErrorNotifierMem.Packed() && n.ErrorNotifierMem.Packed() && n.HContextShare.Packed() && n.HObjectBuffer.Packed() && n.HObjectECCError.Packed() && n.HObjectError.Packed() && n.HPhysChannelGroup.Packed() && n.HUserdMemory[0].Packed() && n.HVASpace.Packed() && n.InstanceMem.Packed() && n.MthdbufMem.Packed() && n.RamfcMem.Packed() && n.UserdMem.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV_CHANNEL_ALLOC_PARAMS) MarshalUnsafe(dst []byte) []byte { if n.ECCErrorNotifierMem.Packed() && n.ErrorNotifierMem.Packed() && n.HContextShare.Packed() && n.HObjectBuffer.Packed() && n.HObjectECCError.Packed() && n.HObjectError.Packed() && n.HPhysChannelGroup.Packed() && n.HUserdMemory[0].Packed() && n.HVASpace.Packed() && n.InstanceMem.Packed() && n.MthdbufMem.Packed() && n.RamfcMem.Packed() && n.UserdMem.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV_CHANNEL_ALLOC_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (n *NV_CHANNEL_ALLOC_PARAMS) UnmarshalUnsafe(src []byte) []byte { if n.ECCErrorNotifierMem.Packed() && n.ErrorNotifierMem.Packed() && n.HContextShare.Packed() && n.HObjectBuffer.Packed() && n.HObjectECCError.Packed() && n.HObjectError.Packed() && n.HPhysChannelGroup.Packed() && n.HUserdMemory[0].Packed() && n.HVASpace.Packed() && n.InstanceMem.Packed() && n.MthdbufMem.Packed() && n.RamfcMem.Packed() && n.UserdMem.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV_CHANNEL_ALLOC_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV_CHANNEL_ALLOC_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.ECCErrorNotifierMem.Packed() && n.ErrorNotifierMem.Packed() && n.HContextShare.Packed() && n.HObjectBuffer.Packed() && n.HObjectECCError.Packed() && n.HObjectError.Packed() && n.HPhysChannelGroup.Packed() && n.HUserdMemory[0].Packed() && n.HVASpace.Packed() && n.InstanceMem.Packed() && n.MthdbufMem.Packed() && n.RamfcMem.Packed() && n.UserdMem.Packed() { // Type NV_CHANNEL_ALLOC_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV_CHANNEL_ALLOC_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV_CHANNEL_ALLOC_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.ECCErrorNotifierMem.Packed() && n.ErrorNotifierMem.Packed() && n.HContextShare.Packed() && n.HObjectBuffer.Packed() && n.HObjectECCError.Packed() && n.HObjectError.Packed() && n.HPhysChannelGroup.Packed() && n.HUserdMemory[0].Packed() && n.HVASpace.Packed() && n.InstanceMem.Packed() && n.MthdbufMem.Packed() && n.RamfcMem.Packed() && n.UserdMem.Packed() { // Type NV_CHANNEL_ALLOC_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. 
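// Note: the KeepAlive call is load-bearing. gohacks.Noescape hides n from the compiler's
// escape analysis, so without it the garbage collector could consider n dead and reclaim
// its memory while CopyInBytes is still writing through the raw pointer in the slice header.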
return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV_CHANNEL_ALLOC_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV_CHANNEL_ALLOC_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !n.ECCErrorNotifierMem.Packed() && n.ErrorNotifierMem.Packed() && n.HContextShare.Packed() && n.HObjectBuffer.Packed() && n.HObjectECCError.Packed() && n.HObjectError.Packed() && n.HPhysChannelGroup.Packed() && n.HUserdMemory[0].Packed() && n.HVASpace.Packed() && n.InstanceMem.Packed() && n.MthdbufMem.Packed() && n.RamfcMem.Packed() && n.UserdMem.Packed() { // Type NV_CHANNEL_ALLOC_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS) SizeBytes() int { return 5 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + 1*3 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS) MarshalBytes(dst []byte) []byte { dst = n.HObjectError.MarshalUnsafe(dst) dst = n.HObjectECCError.MarshalUnsafe(dst) dst = n.HVASpace.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.EngineType)) dst = dst[4:] dst[0] = byte(n.BIsCallingContextVgpuPlugin) dst = dst[1:] for idx := 0; idx < 3; idx++ { dst[0] = byte(n.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS) UnmarshalBytes(src []byte) []byte { src = n.HObjectError.UnmarshalUnsafe(src) src = n.HObjectECCError.UnmarshalUnsafe(src) src = n.HVASpace.UnmarshalUnsafe(src) n.EngineType = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.BIsCallingContextVgpuPlugin = uint8(src[0]) src = src[1:] for idx := 0; idx < 3; idx++ { n.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS) Packed() bool { return n.HObjectECCError.Packed() && n.HObjectError.Packed() && n.HVASpace.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS) MarshalUnsafe(dst []byte) []byte { if n.HObjectECCError.Packed() && n.HObjectError.Packed() && n.HVASpace.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (n *NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { if n.HObjectECCError.Packed() && n.HObjectError.Packed() && n.HVASpace.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HObjectECCError.Packed() && n.HObjectError.Packed() && n.HVASpace.Packed() { // Type NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HObjectECCError.Packed() && n.HObjectError.Packed() && n.HVASpace.Packed() { // Type NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { if !n.HObjectECCError.Packed() && n.HObjectError.Packed() && n.HVASpace.Packed() { // Type NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
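// Note (illustrative, not generated): in WriteTo there is no destination buffer; the slice
// below is backed by n's own memory and its bytes are handed straight to writer.Write.
// A hedged usage sketch, assuming an addressable value params of this type:
//
//	var sink bytes.Buffer
//	if _, err := params.WriteTo(&sink); err != nil {
//		// handle the short or failed write.
//	}
//	// sink.Bytes() now holds the ABI encoding of params.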
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS) SizeBytes() int { return 4 + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS) MarshalBytes(dst []byte) []byte { dst = n.Handle.MarshalUnsafe(dst) // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS) UnmarshalBytes(src []byte) []byte { src = n.Handle.UnmarshalUnsafe(src) // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS) Packed() bool { return n.Handle.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS) MarshalUnsafe(dst []byte) []byte { if n.Handle.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS) UnmarshalUnsafe(src []byte) []byte { if n.Handle.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.Handle.Packed() { // Type NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (n *NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.Handle.Packed() { // Type NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !n.Handle.Packed() { // Type NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV_CTXSHARE_ALLOCATION_PARAMETERS) SizeBytes() int { return 8 + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV_CTXSHARE_ALLOCATION_PARAMETERS) MarshalBytes(dst []byte) []byte { dst = n.HVASpace.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.SubctxID)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV_CTXSHARE_ALLOCATION_PARAMETERS) UnmarshalBytes(src []byte) []byte { src = n.HVASpace.UnmarshalUnsafe(src) n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.SubctxID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV_CTXSHARE_ALLOCATION_PARAMETERS) Packed() bool { return n.HVASpace.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV_CTXSHARE_ALLOCATION_PARAMETERS) MarshalUnsafe(dst []byte) []byte { if n.HVASpace.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV_CTXSHARE_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (n *NV_CTXSHARE_ALLOCATION_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { if n.HVASpace.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV_CTXSHARE_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV_CTXSHARE_ALLOCATION_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HVASpace.Packed() { // Type NV_CTXSHARE_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV_CTXSHARE_ALLOCATION_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV_CTXSHARE_ALLOCATION_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HVASpace.Packed() { // Type NV_CTXSHARE_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV_CTXSHARE_ALLOCATION_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV_CTXSHARE_ALLOCATION_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { if !n.HVASpace.Packed() { // Type NV_CTXSHARE_ALLOCATION_PARAMETERS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
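	// The three hdr assignments below alias buf directly to n's memory via
	// reflect.SliceHeader, so writer.Write reads the struct's bytes in place
	// without copying. A minimal sketch of the same pattern (illustrative
	// only, assuming some packed struct value v; not part of the generated
	// code):
	//
	//	var b []byte
	//	h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
	//	h.Data = uintptr(unsafe.Pointer(&v))
	//	h.Len = int(unsafe.Sizeof(v))
	//	h.Cap = int(unsafe.Sizeof(v))
	//	// b now views v's bytes; keep v reachable while b is in use.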
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV_EXPORT_MEM_PACKET) SizeBytes() int { return 0 + 1*NV_MEM_EXPORT_UUID_LEN + 1*16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV_EXPORT_MEM_PACKET) MarshalBytes(dst []byte) []byte { for idx := 0; idx < NV_MEM_EXPORT_UUID_LEN; idx++ { dst[0] = byte(n.UUID[idx]) dst = dst[1:] } for idx := 0; idx < 16; idx++ { dst[0] = byte(n.Opaque[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV_EXPORT_MEM_PACKET) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < NV_MEM_EXPORT_UUID_LEN; idx++ { n.UUID[idx] = uint8(src[0]) src = src[1:] } for idx := 0; idx < 16; idx++ { n.Opaque[idx] = uint8(src[0]) src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV_EXPORT_MEM_PACKET) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV_EXPORT_MEM_PACKET) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV_EXPORT_MEM_PACKET) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV_EXPORT_MEM_PACKET) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV_EXPORT_MEM_PACKET) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV_EXPORT_MEM_PACKET) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV_EXPORT_MEM_PACKET) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. 
func (n *NV_EXPORT_MEM_PACKET) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV_GR_ALLOCATION_PARAMETERS) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV_GR_ALLOCATION_PARAMETERS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Version)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Size)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Caps)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV_GR_ALLOCATION_PARAMETERS) UnmarshalBytes(src []byte) []byte { n.Version = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Size = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Caps = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV_GR_ALLOCATION_PARAMETERS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV_GR_ALLOCATION_PARAMETERS) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV_GR_ALLOCATION_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV_GR_ALLOCATION_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV_GR_ALLOCATION_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV_GR_ALLOCATION_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. 
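	// buf's Data field is a bare uintptr that the garbage collector does not
	// trace, so n's last tracked use is the slice-header setup above; the
	// runtime.KeepAlive that follows marks n as live until this point, so it
	// cannot be reclaimed while the CopyInBytes call above is writing into
	// its memory.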
runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV_GR_ALLOCATION_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV_GR_ALLOCATION_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV_HOPPER_USERMODE_A_PARAMS) SizeBytes() int { return 2 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV_HOPPER_USERMODE_A_PARAMS) MarshalBytes(dst []byte) []byte { dst[0] = byte(n.Bar1Mapping) dst = dst[1:] dst[0] = byte(n.Priv) dst = dst[1:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV_HOPPER_USERMODE_A_PARAMS) UnmarshalBytes(src []byte) []byte { n.Bar1Mapping = uint8(src[0]) src = src[1:] n.Priv = uint8(src[0]) src = src[1:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV_HOPPER_USERMODE_A_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV_HOPPER_USERMODE_A_PARAMS) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV_HOPPER_USERMODE_A_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV_HOPPER_USERMODE_A_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV_HOPPER_USERMODE_A_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV_HOPPER_USERMODE_A_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. 
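	// Note that length can be smaller than limit if the copy faults partway
	// through user memory; err then describes the failure, and only the first
	// length bytes of n have been updated.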
return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV_HOPPER_USERMODE_A_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV_HOPPER_USERMODE_A_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV_MEMORY_ALLOCATION_PARAMS) SizeBytes() int { return 108 + (*P64)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV_MEMORY_ALLOCATION_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Owner)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Type)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Width)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Height)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Pitch)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Attr)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Attr2)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Format)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.ComprCovg)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.ZcullCovg)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.RangeLo)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.RangeHi)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Alignment)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Offset)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Limit)) dst = dst[8:] dst = n.Address.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.CtagOffset)) dst = dst[4:] dst = n.HVASpace.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.InternalFlags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Tag)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (n *NV_MEMORY_ALLOCATION_PARAMS) UnmarshalBytes(src []byte) []byte { n.Owner = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Type = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Width = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Height = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Pitch = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Attr = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Attr2 = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Format = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.ComprCovg = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.ZcullCovg = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] n.RangeLo = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.RangeHi = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.Alignment = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.Offset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.Limit = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = n.Address.UnmarshalUnsafe(src) n.CtagOffset = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.HVASpace.UnmarshalUnsafe(src) n.InternalFlags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Tag = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV_MEMORY_ALLOCATION_PARAMS) Packed() bool { return n.Address.Packed() && n.HVASpace.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV_MEMORY_ALLOCATION_PARAMS) MarshalUnsafe(dst []byte) []byte { if n.Address.Packed() && n.HVASpace.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV_MEMORY_ALLOCATION_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV_MEMORY_ALLOCATION_PARAMS) UnmarshalUnsafe(src []byte) []byte { if n.Address.Packed() && n.HVASpace.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV_MEMORY_ALLOCATION_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV_MEMORY_ALLOCATION_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.Address.Packed() && n.HVASpace.Packed() { // Type NV_MEMORY_ALLOCATION_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. 
runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV_MEMORY_ALLOCATION_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV_MEMORY_ALLOCATION_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.Address.Packed() && n.HVASpace.Packed() { // Type NV_MEMORY_ALLOCATION_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV_MEMORY_ALLOCATION_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV_MEMORY_ALLOCATION_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !n.Address.Packed() && n.HVASpace.Packed() { // Type NV_MEMORY_ALLOCATION_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV_MEMORY_ALLOCATION_PARAMS_V545) SizeBytes() int { return 8 + (*NV_MEMORY_ALLOCATION_PARAMS)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV_MEMORY_ALLOCATION_PARAMS_V545) MarshalBytes(dst []byte) []byte { dst = n.NV_MEMORY_ALLOCATION_PARAMS.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.NumaNode)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV_MEMORY_ALLOCATION_PARAMS_V545) UnmarshalBytes(src []byte) []byte { src = n.NV_MEMORY_ALLOCATION_PARAMS.UnmarshalUnsafe(src) n.NumaNode = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV_MEMORY_ALLOCATION_PARAMS_V545) Packed() bool { return n.NV_MEMORY_ALLOCATION_PARAMS.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (n *NV_MEMORY_ALLOCATION_PARAMS_V545) MarshalUnsafe(dst []byte) []byte { if n.NV_MEMORY_ALLOCATION_PARAMS.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV_MEMORY_ALLOCATION_PARAMS_V545 doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV_MEMORY_ALLOCATION_PARAMS_V545) UnmarshalUnsafe(src []byte) []byte { if n.NV_MEMORY_ALLOCATION_PARAMS.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV_MEMORY_ALLOCATION_PARAMS_V545 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV_MEMORY_ALLOCATION_PARAMS_V545) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.NV_MEMORY_ALLOCATION_PARAMS.Packed() { // Type NV_MEMORY_ALLOCATION_PARAMS_V545 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV_MEMORY_ALLOCATION_PARAMS_V545) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV_MEMORY_ALLOCATION_PARAMS_V545) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.NV_MEMORY_ALLOCATION_PARAMS.Packed() { // Type NV_MEMORY_ALLOCATION_PARAMS_V545 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV_MEMORY_ALLOCATION_PARAMS_V545) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. 
func (n *NV_MEMORY_ALLOCATION_PARAMS_V545) WriteTo(writer io.Writer) (int64, error) { if !n.NV_MEMORY_ALLOCATION_PARAMS.Packed() { // Type NV_MEMORY_ALLOCATION_PARAMS_V545 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV_MEMORY_DESC_PARAMS) SizeBytes() int { return 24 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV_MEMORY_DESC_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.AddressSpace)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.CacheAttrib)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV_MEMORY_DESC_PARAMS) UnmarshalBytes(src []byte) []byte { n.Base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.AddressSpace = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.CacheAttrib = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV_MEMORY_DESC_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV_MEMORY_DESC_PARAMS) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV_MEMORY_DESC_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV_MEMORY_DESC_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV_MEMORY_DESC_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV_MEMORY_DESC_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
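	// limit caps how many of n's bytes are copied in; it is expected to be at
	// most n.SizeBytes(), since buf below is capped at exactly that size and
	// buf[:limit] would panic on anything larger (CopyIn passes SizeBytes).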
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV_MEMORY_DESC_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV_MEMORY_DESC_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV_VASPACE_ALLOCATION_PARAMETERS) SizeBytes() int { return 44 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV_VASPACE_ALLOCATION_PARAMETERS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Index)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.VASize)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.VAStartInternal)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.VALimitInternal)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.BigPageSize)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad0[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.VABase)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV_VASPACE_ALLOCATION_PARAMETERS) UnmarshalBytes(src []byte) []byte { n.Index = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.VASize = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.VAStartInternal = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.VALimitInternal = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.BigPageSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { n.Pad0[idx] = src[0] src = src[1:] } n.VABase = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV_VASPACE_ALLOCATION_PARAMETERS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV_VASPACE_ALLOCATION_PARAMETERS) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV_VASPACE_ALLOCATION_PARAMETERS) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (n *NV_VASPACE_ALLOCATION_PARAMETERS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV_VASPACE_ALLOCATION_PARAMETERS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV_VASPACE_ALLOCATION_PARAMETERS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV_VASPACE_ALLOCATION_PARAMETERS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV_VASPACE_ALLOCATION_PARAMETERS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *nv00f8Map) SizeBytes() int { return 12 + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *nv00f8Map) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.offset)) dst = dst[8:] dst = n.hVidMem.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.flags)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *nv00f8Map) UnmarshalBytes(src []byte) []byte { n.offset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = n.hVidMem.UnmarshalUnsafe(src) n.flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *nv00f8Map) Packed() bool { return n.hVidMem.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *nv00f8Map) MarshalUnsafe(dst []byte) []byte { if n.hVidMem.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type nv00f8Map doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (n *nv00f8Map) UnmarshalUnsafe(src []byte) []byte { if n.hVidMem.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type nv00f8Map doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *nv00f8Map) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.hVidMem.Packed() { // Type nv00f8Map doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *nv00f8Map) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *nv00f8Map) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.hVidMem.Packed() { // Type nv00f8Map doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *nv00f8Map) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *nv00f8Map) WriteTo(writer io.Writer) (int64, error) { if !n.hVidMem.Packed() { // Type nv00f8Map doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. 
func (n *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT) SizeBytes() int { return 4 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Type)) dst = dst[4:] dst = n.HDevice.MarshalUnsafe(dst) dst = n.HParent.MarshalUnsafe(dst) dst = n.HObject.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT) UnmarshalBytes(src []byte) []byte { n.Type = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.HDevice.UnmarshalUnsafe(src) src = n.HParent.UnmarshalUnsafe(src) src = n.HObject.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT) Packed() bool { return n.HDevice.Packed() && n.HObject.Packed() && n.HParent.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT) MarshalUnsafe(dst []byte) []byte { if n.HDevice.Packed() && n.HObject.Packed() && n.HParent.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV0000_CTRL_OS_UNIX_EXPORT_OBJECT doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT) UnmarshalUnsafe(src []byte) []byte { if n.HDevice.Packed() && n.HObject.Packed() && n.HParent.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV0000_CTRL_OS_UNIX_EXPORT_OBJECT doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HDevice.Packed() && n.HObject.Packed() && n.HParent.Packed() { // Type NV0000_CTRL_OS_UNIX_EXPORT_OBJECT doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HDevice.Packed() && n.HObject.Packed() && n.HParent.Packed() { // Type NV0000_CTRL_OS_UNIX_EXPORT_OBJECT doesn't have a packed layout in memory, fall back to UnmarshalBytes. 
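	// Fallback path: the raw bytes are first copied into a scratch buffer
	// obtained from the CopyContext (typically a reusable per-task buffer,
	// so this avoids a fresh allocation), and UnmarshalBytes then decodes the
	// struct field by field, which handles any layout that does not match the
	// wire format byte for byte.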
buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT) WriteTo(writer io.Writer) (int64, error) { if !n.HDevice.Packed() && n.HObject.Packed() && n.HParent.Packed() { // Type NV0000_CTRL_OS_UNIX_EXPORT_OBJECT doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) SizeBytes() int { return 8 + (*NV0000_CTRL_OS_UNIX_EXPORT_OBJECT)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) MarshalBytes(dst []byte) []byte { dst = p.Object.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.FD)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.Flags)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) UnmarshalBytes(src []byte) []byte { src = p.Object.UnmarshalUnsafe(src) p.FD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] p.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) Packed() bool { return p.Object.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) MarshalUnsafe(dst []byte) []byte { if p.Object.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // Type NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return p.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) UnmarshalUnsafe(src []byte) []byte { if p.Object.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return p.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.Object.Packed() { // Type NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. p.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.Object.Packed() { // Type NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. p.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !p.Object.Packed() { // Type NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, p.SizeBytes()) p.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
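	// Fast path: p's in-memory bytes are streamed straight to writer. A
	// minimal usage sketch (illustrative only; bytes.Buffer stands in for any
	// io.Writer and is not imported by this file):
	//
	//	var params NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS
	//	var sink bytes.Buffer
	//	if _, err := params.WriteTo(&sink); err != nil {
	//		// handle a short or failed write
	//	}
	//	// sink.Bytes() now holds the marshalled struct.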
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) SizeBytes() int { return 10 + 1*2 + 1*NV0000_OS_UNIX_EXPORT_OBJECT_FD_BUFFER_SIZE } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.FD)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.DeviceInstance)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(p.MaxObjects)) dst = dst[2:] for idx := 0; idx < 2; idx++ { dst[0] = byte(p.Pad[idx]) dst = dst[1:] } for idx := 0; idx < NV0000_OS_UNIX_EXPORT_OBJECT_FD_BUFFER_SIZE; idx++ { dst[0] = byte(p.Metadata[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) UnmarshalBytes(src []byte) []byte { p.FD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] p.DeviceInstance = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] p.MaxObjects = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] for idx := 0; idx < 2; idx++ { p.Pad[idx] = src[0] src = src[1:] } for idx := 0; idx < NV0000_OS_UNIX_EXPORT_OBJECT_FD_BUFFER_SIZE; idx++ { p.Metadata[idx] = uint8(src[0]) src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) MarshalUnsafe(dst []byte) []byte { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) SizeBytes() int { return 14 + 1*2 + 1*NV0000_OS_UNIX_EXPORT_OBJECT_FD_BUFFER_SIZE } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.FD)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.DeviceInstance)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.GpuInstanceID)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(p.MaxObjects)) dst = dst[2:] for idx := 0; idx < 2; idx++ { dst[0] = byte(p.Pad[idx]) dst = dst[1:] } for idx := 0; idx < NV0000_OS_UNIX_EXPORT_OBJECT_FD_BUFFER_SIZE; idx++ { dst[0] = byte(p.Metadata[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) UnmarshalBytes(src []byte) []byte { p.FD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] p.DeviceInstance = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] p.GpuInstanceID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] p.MaxObjects = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] for idx := 0; idx < 2; idx++ { p.Pad[idx] = src[0] src = src[1:] } for idx := 0; idx < NV0000_OS_UNIX_EXPORT_OBJECT_FD_BUFFER_SIZE; idx++ { p.Metadata[idx] = uint8(src[0]) src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) MarshalUnsafe(dst []byte) []byte { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) UnmarshalUnsafe(src []byte) []byte { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) SizeBytes() int { return 4 + (*NV0000_CTRL_OS_UNIX_EXPORT_OBJECT)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.FD)) dst = dst[4:] dst = p.Object.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) UnmarshalBytes(src []byte) []byte { p.FD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = p.Object.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. 
//go:nosplit func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) Packed() bool { return p.Object.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) MarshalUnsafe(dst []byte) []byte { if p.Object.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // Type NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return p.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) UnmarshalUnsafe(src []byte) []byte { if p.Object.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return p.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.Object.Packed() { // Type NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. p.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.Object.Packed() { // Type NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. p.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !p.Object.Packed() { // Type NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, p.SizeBytes()) p.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS) SizeBytes() int { return 12 + 1*4 + (*P64)(nil).SizeBytes() + (*P64)(nil).SizeBytes() + (*P64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.SizeOfStrings)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad[idx]) dst = dst[1:] } dst = n.PDriverVersionBuffer.MarshalUnsafe(dst) dst = n.PVersionBuffer.MarshalUnsafe(dst) dst = n.PTitleBuffer.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.ChangelistNumber)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.OfficialChangelistNumber)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS) UnmarshalBytes(src []byte) []byte { n.SizeOfStrings = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { n.Pad[idx] = src[0] src = src[1:] } src = n.PDriverVersionBuffer.UnmarshalUnsafe(src) src = n.PVersionBuffer.UnmarshalUnsafe(src) src = n.PTitleBuffer.UnmarshalUnsafe(src) n.ChangelistNumber = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.OfficialChangelistNumber = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS) Packed() bool { return n.PDriverVersionBuffer.Packed() && n.PTitleBuffer.Packed() && n.PVersionBuffer.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS) MarshalUnsafe(dst []byte) []byte { if n.PDriverVersionBuffer.Packed() && n.PTitleBuffer.Packed() && n.PVersionBuffer.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (n *NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS) UnmarshalUnsafe(src []byte) []byte { if n.PDriverVersionBuffer.Packed() && n.PTitleBuffer.Packed() && n.PVersionBuffer.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.PDriverVersionBuffer.Packed() && n.PTitleBuffer.Packed() && n.PVersionBuffer.Packed() { // Type NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.PDriverVersionBuffer.Packed() && n.PTitleBuffer.Packed() && n.PVersionBuffer.Packed() { // Type NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !n.PDriverVersionBuffer.Packed() && n.PTitleBuffer.Packed() && n.PVersionBuffer.Packed() { // Type NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. 
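// In this fallback branch the struct is marshalled field by field into a
// freshly allocated buffer and that buffer is handed to the writer, so the
// writer never observes n's memory directly; only the branch below aliases
// n's own memory to avoid the copy.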
buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) SizeBytes() int { return 4 + 1*4 + (*P64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.SurfaceInfoListSize)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(p.Pad[idx]) dst = dst[1:] } dst = p.SurfaceInfoList.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) UnmarshalBytes(src []byte) []byte { p.SurfaceInfoListSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { p.Pad[idx] = src[0] src = src[1:] } src = p.SurfaceInfoList.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) Packed() bool { return p.SurfaceInfoList.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) MarshalUnsafe(dst []byte) []byte { if p.SurfaceInfoList.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // Type NV0041_CTRL_GET_SURFACE_INFO_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return p.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) UnmarshalUnsafe(src []byte) []byte { if p.SurfaceInfoList.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV0041_CTRL_GET_SURFACE_INFO_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return p.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.SurfaceInfoList.Packed() { // Type NV0041_CTRL_GET_SURFACE_INFO_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. p.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.SurfaceInfoList.Packed() { // Type NV0041_CTRL_GET_SURFACE_INFO_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. p.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *NV0041_CTRL_GET_SURFACE_INFO_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !p.SurfaceInfoList.Packed() { // Type NV0041_CTRL_GET_SURFACE_INFO_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, p.SizeBytes()) p.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS) SizeBytes() int { return 4 + 1*4 + (*P64)(nil).SizeBytes() + (*P64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.NumChannels)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad[idx]) dst = dst[1:] } dst = n.PChannelHandleList.MarshalUnsafe(dst) dst = n.PChannelList.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS) UnmarshalBytes(src []byte) []byte { n.NumChannels = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { n.Pad[idx] = src[0] src = src[1:] } src = n.PChannelHandleList.UnmarshalUnsafe(src) src = n.PChannelList.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS) Packed() bool { return n.PChannelHandleList.Packed() && n.PChannelList.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (n *NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS) MarshalUnsafe(dst []byte) []byte { if n.PChannelHandleList.Packed() && n.PChannelList.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS) UnmarshalUnsafe(src []byte) []byte { if n.PChannelHandleList.Packed() && n.PChannelList.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.PChannelHandleList.Packed() && n.PChannelList.Packed() { // Type NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.PChannelHandleList.Packed() && n.PChannelList.Packed() { // Type NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. 
func (n *NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !n.PChannelHandleList.Packed() && n.PChannelList.Packed() { // Type NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS) SizeBytes() int { return 4 + 1*4 + (*P64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.NumClasses)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad[idx]) dst = dst[1:] } dst = n.ClassList.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS) UnmarshalBytes(src []byte) []byte { n.NumClasses = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { n.Pad[idx] = src[0] src = src[1:] } src = n.ClassList.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS) Packed() bool { return n.ClassList.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS) MarshalUnsafe(dst []byte) []byte { if n.ClassList.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS) UnmarshalUnsafe(src []byte) []byte { if n.ClassList.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.ClassList.Packed() { // Type NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. 
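// CopyOutBytes copies at most limit bytes of buf (which aliases n's memory)
// to the task address addr; length is the number of bytes actually written,
// which can be short if the copy faults partway through.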
// Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.ClassList.Packed() { // Type NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !n.ClassList.Packed() { // Type NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV0080_CTRL_GR_ROUTE_INFO) SizeBytes() int { return 12 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV0080_CTRL_GR_ROUTE_INFO) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Route)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV0080_CTRL_GR_ROUTE_INFO) UnmarshalBytes(src []byte) []byte { n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { n.Pad[idx] = src[0] src = src[1:] } n.Route = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. 
//go:nosplit func (n *NV0080_CTRL_GR_ROUTE_INFO) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV0080_CTRL_GR_ROUTE_INFO) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV0080_CTRL_GR_ROUTE_INFO) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV0080_CTRL_GR_ROUTE_INFO) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV0080_CTRL_GR_ROUTE_INFO) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV0080_CTRL_GR_ROUTE_INFO) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV0080_CTRL_GR_ROUTE_INFO) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV0080_CTRL_GR_ROUTE_INFO) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV00FD_CTRL_ATTACH_GPU_PARAMS) SizeBytes() int { return 12 + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV00FD_CTRL_ATTACH_GPU_PARAMS) MarshalBytes(dst []byte) []byte { dst = n.HSubDevice.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.DevDescriptor)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (n *NV00FD_CTRL_ATTACH_GPU_PARAMS) UnmarshalBytes(src []byte) []byte { src = n.HSubDevice.UnmarshalUnsafe(src) n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.DevDescriptor = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV00FD_CTRL_ATTACH_GPU_PARAMS) Packed() bool { return n.HSubDevice.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV00FD_CTRL_ATTACH_GPU_PARAMS) MarshalUnsafe(dst []byte) []byte { if n.HSubDevice.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV00FD_CTRL_ATTACH_GPU_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV00FD_CTRL_ATTACH_GPU_PARAMS) UnmarshalUnsafe(src []byte) []byte { if n.HSubDevice.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV00FD_CTRL_ATTACH_GPU_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV00FD_CTRL_ATTACH_GPU_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HSubDevice.Packed() { // Type NV00FD_CTRL_ATTACH_GPU_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV00FD_CTRL_ATTACH_GPU_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV00FD_CTRL_ATTACH_GPU_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HSubDevice.Packed() { // Type NV00FD_CTRL_ATTACH_GPU_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
func (n *NV00FD_CTRL_ATTACH_GPU_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV00FD_CTRL_ATTACH_GPU_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !n.HSubDevice.Packed() { // Type NV00FD_CTRL_ATTACH_GPU_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) SizeBytes() int { return 4 + 1*4 + (*P64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.BiosInfoListSize)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(p.Pad[idx]) dst = dst[1:] } dst = p.BiosInfoList.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) UnmarshalBytes(src []byte) []byte { p.BiosInfoListSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { p.Pad[idx] = src[0] src = src[1:] } src = p.BiosInfoList.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) Packed() bool { return p.BiosInfoList.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) MarshalUnsafe(dst []byte) []byte { if p.BiosInfoList.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // Type NV2080_CTRL_BIOS_GET_INFO_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return p.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) UnmarshalUnsafe(src []byte) []byte { if p.BiosInfoList.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV2080_CTRL_BIOS_GET_INFO_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return p.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.BiosInfoList.Packed() { // Type NV2080_CTRL_BIOS_GET_INFO_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. p.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. 
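// The slice header is populated by hand so that buf points directly at p:
// Data holds p's address (passed through gohacks.Noescape so the compiler
// does not force p onto the heap) and Len/Cap are both the marshalled size.
// Conceptually this is the same as
// unsafe.Slice((*byte)(unsafe.Pointer(p)), p.SizeBytes()), just without
// letting p escape.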
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.BiosInfoList.Packed() { // Type NV2080_CTRL_BIOS_GET_INFO_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. p.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *NV2080_CTRL_BIOS_GET_INFO_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !p.BiosInfoList.Packed() { // Type NV2080_CTRL_BIOS_GET_INFO_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, p.SizeBytes()) p.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS) SizeBytes() int { return 7 + 1*3 + 1*6 + (*P64)(nil).SizeBytes() + (*Handle)(nil).SizeBytes()*NV2080_CTRL_FIFO_DISABLE_CHANNELS_MAX_ENTRIES + (*Handle)(nil).SizeBytes()*NV2080_CTRL_FIFO_DISABLE_CHANNELS_MAX_ENTRIES } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (n *NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS) MarshalBytes(dst []byte) []byte { dst[0] = byte(n.BDisable) dst = dst[1:] for idx := 0; idx < 3; idx++ { dst[0] = byte(n.Pad1[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.NumChannels)) dst = dst[4:] dst[0] = byte(n.BOnlyDisableScheduling) dst = dst[1:] dst[0] = byte(n.BRewindGpPut) dst = dst[1:] for idx := 0; idx < 6; idx++ { dst[0] = byte(n.Pad2[idx]) dst = dst[1:] } dst = n.PRunlistPreemptEvent.MarshalUnsafe(dst) for idx := 0; idx < NV2080_CTRL_FIFO_DISABLE_CHANNELS_MAX_ENTRIES; idx++ { dst = n.HClientList[idx].MarshalUnsafe(dst) } for idx := 0; idx < NV2080_CTRL_FIFO_DISABLE_CHANNELS_MAX_ENTRIES; idx++ { dst = n.HChannelList[idx].MarshalUnsafe(dst) } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS) UnmarshalBytes(src []byte) []byte { n.BDisable = uint8(src[0]) src = src[1:] for idx := 0; idx < 3; idx++ { n.Pad1[idx] = src[0] src = src[1:] } n.NumChannels = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.BOnlyDisableScheduling = uint8(src[0]) src = src[1:] n.BRewindGpPut = uint8(src[0]) src = src[1:] for idx := 0; idx < 6; idx++ { n.Pad2[idx] = src[0] src = src[1:] } src = n.PRunlistPreemptEvent.UnmarshalUnsafe(src) for idx := 0; idx < NV2080_CTRL_FIFO_DISABLE_CHANNELS_MAX_ENTRIES; idx++ { src = n.HClientList[idx].UnmarshalUnsafe(src) } for idx := 0; idx < NV2080_CTRL_FIFO_DISABLE_CHANNELS_MAX_ENTRIES; idx++ { src = n.HChannelList[idx].UnmarshalUnsafe(src) } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS) Packed() bool { return n.HChannelList[0].Packed() && n.HClientList[0].Packed() && n.PRunlistPreemptEvent.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS) MarshalUnsafe(dst []byte) []byte { if n.HChannelList[0].Packed() && n.HClientList[0].Packed() && n.PRunlistPreemptEvent.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS) UnmarshalUnsafe(src []byte) []byte { if n.HChannelList[0].Packed() && n.HClientList[0].Packed() && n.PRunlistPreemptEvent.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HChannelList[0].Packed() && n.HClientList[0].Packed() && n.PRunlistPreemptEvent.Packed() { // Type NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. 
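// buf below spans the full marshalled size of n, while only buf[:limit] is
// handed to CopyOutBytes, so CopyOutN can be used to copy out just a prefix
// of the structure.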
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HChannelList[0].Packed() && n.HClientList[0].Packed() && n.PRunlistPreemptEvent.Packed() { // Type NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !n.HChannelList[0].Packed() && n.HClientList[0].Packed() && n.PRunlistPreemptEvent.Packed() { // Type NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) SizeBytes() int { return 4 + 1*4 + (*P64)(nil).SizeBytes() + (*NV0080_CTRL_GR_ROUTE_INFO)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.GRInfoListSize)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(p.Pad[idx]) dst = dst[1:] } dst = p.GRInfoList.MarshalUnsafe(dst) dst = p.GRRouteInfo.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) UnmarshalBytes(src []byte) []byte { p.GRInfoListSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { p.Pad[idx] = src[0] src = src[1:] } src = p.GRInfoList.UnmarshalUnsafe(src) src = p.GRRouteInfo.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) Packed() bool { return p.GRInfoList.Packed() && p.GRRouteInfo.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) MarshalUnsafe(dst []byte) []byte { if p.GRInfoList.Packed() && p.GRRouteInfo.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // Type NV2080_CTRL_GR_GET_INFO_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return p.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) UnmarshalUnsafe(src []byte) []byte { if p.GRInfoList.Packed() && p.GRRouteInfo.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV2080_CTRL_GR_GET_INFO_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return p.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.GRInfoList.Packed() && p.GRRouteInfo.Packed() { // Type NV2080_CTRL_GR_GET_INFO_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. p.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.GRInfoList.Packed() && p.GRRouteInfo.Packed() { // Type NV2080_CTRL_GR_GET_INFO_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. 
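// Callers should therefore check the returned length and error before
// relying on the contents of p.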
p.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *NV2080_CTRL_GR_GET_INFO_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !p.GRInfoList.Packed() && p.GRRouteInfo.Packed() { // Type NV2080_CTRL_GR_GET_INFO_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, p.SizeBytes()) p.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NV503C_CTRL_REGISTER_VA_SPACE_PARAMS) SizeBytes() int { return 8 + (*Handle)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NV503C_CTRL_REGISTER_VA_SPACE_PARAMS) MarshalBytes(dst []byte) []byte { dst = n.HVASpace.MarshalUnsafe(dst) for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.VASpaceToken)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NV503C_CTRL_REGISTER_VA_SPACE_PARAMS) UnmarshalBytes(src []byte) []byte { src = n.HVASpace.UnmarshalUnsafe(src) for idx := 0; idx < 4; idx++ { n.Pad[idx] = src[0] src = src[1:] } n.VASpaceToken = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NV503C_CTRL_REGISTER_VA_SPACE_PARAMS) Packed() bool { return n.HVASpace.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NV503C_CTRL_REGISTER_VA_SPACE_PARAMS) MarshalUnsafe(dst []byte) []byte { if n.HVASpace.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NV503C_CTRL_REGISTER_VA_SPACE_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NV503C_CTRL_REGISTER_VA_SPACE_PARAMS) UnmarshalUnsafe(src []byte) []byte { if n.HVASpace.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NV503C_CTRL_REGISTER_VA_SPACE_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (n *NV503C_CTRL_REGISTER_VA_SPACE_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HVASpace.Packed() { // Type NV503C_CTRL_REGISTER_VA_SPACE_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NV503C_CTRL_REGISTER_VA_SPACE_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NV503C_CTRL_REGISTER_VA_SPACE_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HVASpace.Packed() { // Type NV503C_CTRL_REGISTER_VA_SPACE_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NV503C_CTRL_REGISTER_VA_SPACE_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NV503C_CTRL_REGISTER_VA_SPACE_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !n.HVASpace.Packed() { // Type NV503C_CTRL_REGISTER_VA_SPACE_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NVXXXX_CTRL_XXX_INFO) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
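//
// The wire format is simply the two uint32 fields, Index followed by Data,
// written in hostarch.ByteOrder for a total of 8 bytes (matching SizeBytes
// above).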
func (n *NVXXXX_CTRL_XXX_INFO) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Index)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Data)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NVXXXX_CTRL_XXX_INFO) UnmarshalBytes(src []byte) []byte { n.Index = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Data = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVXXXX_CTRL_XXX_INFO) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NVXXXX_CTRL_XXX_INFO) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NVXXXX_CTRL_XXX_INFO) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NVXXXX_CTRL_XXX_INFO) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NVXXXX_CTRL_XXX_INFO) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVXXXX_CTRL_XXX_INFO) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NVXXXX_CTRL_XXX_INFO) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NVXXXX_CTRL_XXX_INFO) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // CopyCtrlXxxInfoSliceIn copies in a slice of NVXXXX_CTRL_XXX_INFO objects from the task's memory. 
func CopyCtrlXxxInfoSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []NVXXXX_CTRL_XXX_INFO) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*NVXXXX_CTRL_XXX_INFO)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyCtrlXxxInfoSliceOut copies a slice of NVXXXX_CTRL_XXX_INFO objects to the task's memory. func CopyCtrlXxxInfoSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []NVXXXX_CTRL_XXX_INFO) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*NVXXXX_CTRL_XXX_INFO)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeCtrlXxxInfoSlice is like NVXXXX_CTRL_XXX_INFO.MarshalUnsafe, but for a []NVXXXX_CTRL_XXX_INFO. func MarshalUnsafeCtrlXxxInfoSlice(src []NVXXXX_CTRL_XXX_INFO, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*NVXXXX_CTRL_XXX_INFO)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeCtrlXxxInfoSlice is like NVXXXX_CTRL_XXX_INFO.UnmarshalUnsafe, but for a []NVXXXX_CTRL_XXX_INFO. func UnmarshalUnsafeCtrlXxxInfoSlice(dst []NVXXXX_CTRL_XXX_INFO, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*NVXXXX_CTRL_XXX_INFO)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *IoctlAllocOSEvent) SizeBytes() int { return 8 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *IoctlAllocOSEvent) MarshalBytes(dst []byte) []byte { dst = p.HClient.MarshalUnsafe(dst) dst = p.HDevice.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.FD)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.Status)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *IoctlAllocOSEvent) UnmarshalBytes(src []byte) []byte { src = p.HClient.UnmarshalUnsafe(src) src = p.HDevice.UnmarshalUnsafe(src) p.FD = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] p.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *IoctlAllocOSEvent) Packed() bool { return p.HClient.Packed() && p.HDevice.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
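//
// When both embedded Handles report a packed layout, the struct is copied
// with a single gohacks.Memmove of SizeBytes bytes; otherwise it falls back
// to the field-by-field MarshalBytes path above.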
func (p *IoctlAllocOSEvent) MarshalUnsafe(dst []byte) []byte { if p.HClient.Packed() && p.HDevice.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // Type IoctlAllocOSEvent doesn't have a packed layout in memory, fallback to MarshalBytes. return p.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *IoctlAllocOSEvent) UnmarshalUnsafe(src []byte) []byte { if p.HClient.Packed() && p.HDevice.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IoctlAllocOSEvent doesn't have a packed layout in memory, fallback to UnmarshalBytes. return p.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *IoctlAllocOSEvent) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.HClient.Packed() && p.HDevice.Packed() { // Type IoctlAllocOSEvent doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. p.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *IoctlAllocOSEvent) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *IoctlAllocOSEvent) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.HClient.Packed() && p.HDevice.Packed() { // Type IoctlAllocOSEvent doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. p.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *IoctlAllocOSEvent) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *IoctlAllocOSEvent) WriteTo(writer io.Writer) (int64, error) { if !p.HClient.Packed() && p.HDevice.Packed() { // Type IoctlAllocOSEvent doesn't have a packed layout in memory, fall back to MarshalBytes. 
buf := make([]byte, p.SizeBytes()) p.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *IoctlFreeOSEvent) SizeBytes() int { return 8 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *IoctlFreeOSEvent) MarshalBytes(dst []byte) []byte { dst = p.HClient.MarshalUnsafe(dst) dst = p.HDevice.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.FD)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.Status)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *IoctlFreeOSEvent) UnmarshalBytes(src []byte) []byte { src = p.HClient.UnmarshalUnsafe(src) src = p.HDevice.UnmarshalUnsafe(src) p.FD = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] p.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *IoctlFreeOSEvent) Packed() bool { return p.HClient.Packed() && p.HDevice.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *IoctlFreeOSEvent) MarshalUnsafe(dst []byte) []byte { if p.HClient.Packed() && p.HDevice.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // Type IoctlFreeOSEvent doesn't have a packed layout in memory, fallback to MarshalBytes. return p.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *IoctlFreeOSEvent) UnmarshalUnsafe(src []byte) []byte { if p.HClient.Packed() && p.HDevice.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IoctlFreeOSEvent doesn't have a packed layout in memory, fallback to UnmarshalBytes. return p.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *IoctlFreeOSEvent) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.HClient.Packed() && p.HDevice.Packed() { // Type IoctlFreeOSEvent doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. p.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
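//
// Illustrative usage sketch (not part of the generated code): a handler that
// has filled in an IoctlFreeOSEvent value p would typically write it back to
// the guest parameter area with
//
//	if _, err := p.CopyOut(cc, paramsAddr); err != nil {
//		return err
//	}
//
// where cc and paramsAddr are assumed to come from the surrounding ioctl
// handling code.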
func (p *IoctlFreeOSEvent) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *IoctlFreeOSEvent) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.HClient.Packed() && p.HDevice.Packed() { // Type IoctlFreeOSEvent doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. p.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *IoctlFreeOSEvent) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *IoctlFreeOSEvent) WriteTo(writer io.Writer) (int64, error) { if !p.HClient.Packed() && p.HDevice.Packed() { // Type IoctlFreeOSEvent doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, p.SizeBytes()) p.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IoctlNVOS02ParametersWithFD) SizeBytes() int { return 4 + (*NVOS02Parameters)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IoctlNVOS02ParametersWithFD) MarshalBytes(dst []byte) []byte { dst = i.Params.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.FD)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(i.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IoctlNVOS02ParametersWithFD) UnmarshalBytes(src []byte) []byte { src = i.Params.UnmarshalUnsafe(src) i.FD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { i.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IoctlNVOS02ParametersWithFD) Packed() bool { return i.Params.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IoctlNVOS02ParametersWithFD) MarshalUnsafe(dst []byte) []byte { if i.Params.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type IoctlNVOS02ParametersWithFD doesn't have a packed layout in memory, fallback to MarshalBytes. 
return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IoctlNVOS02ParametersWithFD) UnmarshalUnsafe(src []byte) []byte { if i.Params.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IoctlNVOS02ParametersWithFD doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IoctlNVOS02ParametersWithFD) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Params.Packed() { // Type IoctlNVOS02ParametersWithFD doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IoctlNVOS02ParametersWithFD) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IoctlNVOS02ParametersWithFD) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Params.Packed() { // Type IoctlNVOS02ParametersWithFD doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IoctlNVOS02ParametersWithFD) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IoctlNVOS02ParametersWithFD) WriteTo(writer io.Writer) (int64, error) { if !i.Params.Packed() { // Type IoctlNVOS02ParametersWithFD doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IoctlNVOS33ParametersWithFD) SizeBytes() int { return 4 + (*NVOS33Parameters)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IoctlNVOS33ParametersWithFD) MarshalBytes(dst []byte) []byte { dst = i.Params.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.FD)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(i.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IoctlNVOS33ParametersWithFD) UnmarshalBytes(src []byte) []byte { src = i.Params.UnmarshalUnsafe(src) i.FD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { i.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IoctlNVOS33ParametersWithFD) Packed() bool { return i.Params.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IoctlNVOS33ParametersWithFD) MarshalUnsafe(dst []byte) []byte { if i.Params.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type IoctlNVOS33ParametersWithFD doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IoctlNVOS33ParametersWithFD) UnmarshalUnsafe(src []byte) []byte { if i.Params.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type IoctlNVOS33ParametersWithFD doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IoctlNVOS33ParametersWithFD) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Params.Packed() { // Type IoctlNVOS33ParametersWithFD doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IoctlNVOS33ParametersWithFD) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
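//
// CopyInN copies at most limit bytes from the task memory at addr into i and
// returns the number of bytes copied; CopyIn below is the common case of
// limit == i.SizeBytes().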
func (i *IoctlNVOS33ParametersWithFD) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.Params.Packed() { // Type IoctlNVOS33ParametersWithFD doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IoctlNVOS33ParametersWithFD) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IoctlNVOS33ParametersWithFD) WriteTo(writer io.Writer) (int64, error) { if !i.Params.Packed() { // Type IoctlNVOS33ParametersWithFD doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IoctlRegisterFD) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IoctlRegisterFD) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.CtlFD)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IoctlRegisterFD) UnmarshalBytes(src []byte) []byte { i.CtlFD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IoctlRegisterFD) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IoctlRegisterFD) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IoctlRegisterFD) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IoctlRegisterFD) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IoctlRegisterFD) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IoctlRegisterFD) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IoctlRegisterFD) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IoctlRegisterFD) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IoctlSysParams) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IoctlSysParams) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.MemblockSize)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IoctlSysParams) UnmarshalBytes(src []byte) []byte { i.MemblockSize = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IoctlSysParams) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IoctlSysParams) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IoctlSysParams) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IoctlSysParams) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
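	// The slice header below is pointed directly at i's memory so that
	// CopyOutBytes can write the struct to the task without an intermediate
	// buffer; gohacks.Noescape hides the pointer from escape analysis, and
	// the runtime.KeepAlive further down keeps i live until the copy is done.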
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IoctlSysParams) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IoctlSysParams) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IoctlSysParams) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IoctlSysParams) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *IoctlWaitOpenComplete) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *IoctlWaitOpenComplete) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Rc)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.AdapterStatus)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *IoctlWaitOpenComplete) UnmarshalBytes(src []byte) []byte { i.Rc = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.AdapterStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *IoctlWaitOpenComplete) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *IoctlWaitOpenComplete) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *IoctlWaitOpenComplete) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *IoctlWaitOpenComplete) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *IoctlWaitOpenComplete) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *IoctlWaitOpenComplete) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *IoctlWaitOpenComplete) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *IoctlWaitOpenComplete) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NVOS00Parameters) SizeBytes() int { return 4 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NVOS00Parameters) MarshalBytes(dst []byte) []byte { dst = n.HRoot.MarshalUnsafe(dst) dst = n.HObjectParent.MarshalUnsafe(dst) dst = n.HObjectOld.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Status)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NVOS00Parameters) UnmarshalBytes(src []byte) []byte { src = n.HRoot.UnmarshalUnsafe(src) src = n.HObjectParent.UnmarshalUnsafe(src) src = n.HObjectOld.UnmarshalUnsafe(src) n.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVOS00Parameters) Packed() bool { return n.HObjectOld.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NVOS00Parameters) MarshalUnsafe(dst []byte) []byte { if n.HObjectOld.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NVOS00Parameters doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (n *NVOS00Parameters) UnmarshalUnsafe(src []byte) []byte { if n.HObjectOld.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NVOS00Parameters doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NVOS00Parameters) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HObjectOld.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() { // Type NVOS00Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NVOS00Parameters) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVOS00Parameters) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HObjectOld.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() { // Type NVOS00Parameters doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NVOS00Parameters) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NVOS00Parameters) WriteTo(writer io.Writer) (int64, error) { if !n.HObjectOld.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() { // Type NVOS00Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NVOS02Parameters) SizeBytes() int { return 16 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*ClassID)(nil).SizeBytes() + 1*4 + (*P64)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NVOS02Parameters) MarshalBytes(dst []byte) []byte { dst = n.HRoot.MarshalUnsafe(dst) dst = n.HObjectParent.MarshalUnsafe(dst) dst = n.HObjectNew.MarshalUnsafe(dst) dst = n.HClass.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad0[idx]) dst = dst[1:] } dst = n.PMemory.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Limit)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Status)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad1[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NVOS02Parameters) UnmarshalBytes(src []byte) []byte { src = n.HRoot.UnmarshalUnsafe(src) src = n.HObjectParent.UnmarshalUnsafe(src) src = n.HObjectNew.UnmarshalUnsafe(src) src = n.HClass.UnmarshalUnsafe(src) n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { n.Pad0[idx] = src[0] src = src[1:] } src = n.PMemory.UnmarshalUnsafe(src) n.Limit = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { n.Pad1[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVOS02Parameters) Packed() bool { return n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PMemory.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NVOS02Parameters) MarshalUnsafe(dst []byte) []byte { if n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PMemory.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NVOS02Parameters doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NVOS02Parameters) UnmarshalUnsafe(src []byte) []byte { if n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PMemory.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NVOS02Parameters doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (n *NVOS02Parameters) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PMemory.Packed() { // Type NVOS02Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NVOS02Parameters) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVOS02Parameters) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PMemory.Packed() { // Type NVOS02Parameters doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NVOS02Parameters) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NVOS02Parameters) WriteTo(writer io.Writer) (int64, error) { if !n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PMemory.Packed() { // Type NVOS02Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. 
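//
// The constant 8 accounts for the two trailing 4-byte fields (ParamsSize and
// Status); the remaining terms are the sizes of the three embedded Handles,
// the ClassID and the P64 field.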
func (n *NVOS21Parameters) SizeBytes() int { return 8 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*ClassID)(nil).SizeBytes() + (*P64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NVOS21Parameters) MarshalBytes(dst []byte) []byte { dst = n.HRoot.MarshalUnsafe(dst) dst = n.HObjectParent.MarshalUnsafe(dst) dst = n.HObjectNew.MarshalUnsafe(dst) dst = n.HClass.MarshalUnsafe(dst) dst = n.PAllocParms.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.ParamsSize)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Status)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NVOS21Parameters) UnmarshalBytes(src []byte) []byte { src = n.HRoot.UnmarshalUnsafe(src) src = n.HObjectParent.UnmarshalUnsafe(src) src = n.HObjectNew.UnmarshalUnsafe(src) src = n.HClass.UnmarshalUnsafe(src) src = n.PAllocParms.UnmarshalUnsafe(src) n.ParamsSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVOS21Parameters) Packed() bool { return n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PAllocParms.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NVOS21Parameters) MarshalUnsafe(dst []byte) []byte { if n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PAllocParms.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NVOS21Parameters doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NVOS21Parameters) UnmarshalUnsafe(src []byte) []byte { if n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PAllocParms.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NVOS21Parameters doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NVOS21Parameters) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PAllocParms.Packed() { // Type NVOS21Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (n *NVOS21Parameters) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVOS21Parameters) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PAllocParms.Packed() { // Type NVOS21Parameters doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NVOS21Parameters) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NVOS21Parameters) WriteTo(writer io.Writer) (int64, error) { if !n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PAllocParms.Packed() { // Type NVOS21Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NVOS32Parameters) SizeBytes() int { return 26 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + 1*2 + 1*144 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NVOS32Parameters) MarshalBytes(dst []byte) []byte { dst = n.HRoot.MarshalUnsafe(dst) dst = n.HObjectParent.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Function)) dst = dst[4:] dst = n.HVASpace.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint16(dst[:2], uint16(n.IVCHeapNumber)) dst = dst[2:] for idx := 0; idx < 2; idx++ { dst[0] = byte(n.Pad[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Status)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Total)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Free)) dst = dst[8:] for idx := 0; idx < 144; idx++ { dst[0] = byte(n.Data[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (n *NVOS32Parameters) UnmarshalBytes(src []byte) []byte { src = n.HRoot.UnmarshalUnsafe(src) src = n.HObjectParent.UnmarshalUnsafe(src) n.Function = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.HVASpace.UnmarshalUnsafe(src) n.IVCHeapNumber = int16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] for idx := 0; idx < 2; idx++ { n.Pad[idx] = src[0] src = src[1:] } n.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Total = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.Free = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] for idx := 0; idx < 144; idx++ { n.Data[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVOS32Parameters) Packed() bool { return n.HObjectParent.Packed() && n.HRoot.Packed() && n.HVASpace.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NVOS32Parameters) MarshalUnsafe(dst []byte) []byte { if n.HObjectParent.Packed() && n.HRoot.Packed() && n.HVASpace.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NVOS32Parameters doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NVOS32Parameters) UnmarshalUnsafe(src []byte) []byte { if n.HObjectParent.Packed() && n.HRoot.Packed() && n.HVASpace.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NVOS32Parameters doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NVOS32Parameters) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HObjectParent.Packed() && n.HRoot.Packed() && n.HVASpace.Packed() { // Type NVOS32Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NVOS32Parameters) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVOS32Parameters) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HObjectParent.Packed() && n.HRoot.Packed() && n.HVASpace.Packed() { // Type NVOS32Parameters doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. 
return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NVOS32Parameters) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NVOS32Parameters) WriteTo(writer io.Writer) (int64, error) { if !n.HObjectParent.Packed() && n.HRoot.Packed() && n.HVASpace.Packed() { // Type NVOS32Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NVOS33Parameters) SizeBytes() int { return 24 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + 1*4 + (*P64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NVOS33Parameters) MarshalBytes(dst []byte) []byte { dst = n.HClient.MarshalUnsafe(dst) dst = n.HDevice.MarshalUnsafe(dst) dst = n.HMemory.MarshalUnsafe(dst) for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad0[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Offset)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(n.Length)) dst = dst[8:] dst = n.PLinearAddress.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Status)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NVOS33Parameters) UnmarshalBytes(src []byte) []byte { src = n.HClient.UnmarshalUnsafe(src) src = n.HDevice.UnmarshalUnsafe(src) src = n.HMemory.UnmarshalUnsafe(src) for idx := 0; idx < 4; idx++ { n.Pad0[idx] = src[0] src = src[1:] } n.Offset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] n.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = n.PLinearAddress.UnmarshalUnsafe(src) n.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVOS33Parameters) Packed() bool { return n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PLinearAddress.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (n *NVOS33Parameters) MarshalUnsafe(dst []byte) []byte { if n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PLinearAddress.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NVOS33Parameters doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NVOS33Parameters) UnmarshalUnsafe(src []byte) []byte { if n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PLinearAddress.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NVOS33Parameters doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NVOS33Parameters) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PLinearAddress.Packed() { // Type NVOS33Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NVOS33Parameters) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVOS33Parameters) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PLinearAddress.Packed() { // Type NVOS33Parameters doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NVOS33Parameters) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. 
func (n *NVOS33Parameters) WriteTo(writer io.Writer) (int64, error) { if !n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PLinearAddress.Packed() { // Type NVOS33Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NVOS34Parameters) SizeBytes() int { return 8 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + 1*4 + (*P64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NVOS34Parameters) MarshalBytes(dst []byte) []byte { dst = n.HClient.MarshalUnsafe(dst) dst = n.HDevice.MarshalUnsafe(dst) dst = n.HMemory.MarshalUnsafe(dst) for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad0[idx]) dst = dst[1:] } dst = n.PLinearAddress.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Status)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NVOS34Parameters) UnmarshalBytes(src []byte) []byte { src = n.HClient.UnmarshalUnsafe(src) src = n.HDevice.UnmarshalUnsafe(src) src = n.HMemory.UnmarshalUnsafe(src) for idx := 0; idx < 4; idx++ { n.Pad0[idx] = src[0] src = src[1:] } src = n.PLinearAddress.UnmarshalUnsafe(src) n.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVOS34Parameters) Packed() bool { return n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PLinearAddress.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NVOS34Parameters) MarshalUnsafe(dst []byte) []byte { if n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PLinearAddress.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NVOS34Parameters doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NVOS34Parameters) UnmarshalUnsafe(src []byte) []byte { if n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PLinearAddress.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NVOS34Parameters doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NVOS34Parameters) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PLinearAddress.Packed() { // Type NVOS34Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. 
buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NVOS34Parameters) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVOS34Parameters) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PLinearAddress.Packed() { // Type NVOS34Parameters doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NVOS34Parameters) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NVOS34Parameters) WriteTo(writer io.Writer) (int64, error) { if !n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PLinearAddress.Packed() { // Type NVOS34Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NVOS54Parameters) SizeBytes() int { return 16 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*P64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (n *NVOS54Parameters) MarshalBytes(dst []byte) []byte { dst = n.HClient.MarshalUnsafe(dst) dst = n.HObject.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Cmd)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] dst = n.Params.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.ParamsSize)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Status)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NVOS54Parameters) UnmarshalBytes(src []byte) []byte { src = n.HClient.UnmarshalUnsafe(src) src = n.HObject.UnmarshalUnsafe(src) n.Cmd = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = n.Params.UnmarshalUnsafe(src) n.ParamsSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVOS54Parameters) Packed() bool { return n.HClient.Packed() && n.HObject.Packed() && n.Params.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NVOS54Parameters) MarshalUnsafe(dst []byte) []byte { if n.HClient.Packed() && n.HObject.Packed() && n.Params.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NVOS54Parameters doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NVOS54Parameters) UnmarshalUnsafe(src []byte) []byte { if n.HClient.Packed() && n.HObject.Packed() && n.Params.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NVOS54Parameters doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NVOS54Parameters) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClient.Packed() && n.HObject.Packed() && n.Params.Packed() { // Type NVOS54Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NVOS54Parameters) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVOS54Parameters) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClient.Packed() && n.HObject.Packed() && n.Params.Packed() { // Type NVOS54Parameters doesn't have a packed layout in memory, fall back to UnmarshalBytes. 
buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NVOS54Parameters) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NVOS54Parameters) WriteTo(writer io.Writer) (int64, error) { if !n.HClient.Packed() && n.HObject.Packed() && n.Params.Packed() { // Type NVOS54Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NVOS55Parameters) SizeBytes() int { return 8 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NVOS55Parameters) MarshalBytes(dst []byte) []byte { dst = n.HClient.MarshalUnsafe(dst) dst = n.HParent.MarshalUnsafe(dst) dst = n.HObject.MarshalUnsafe(dst) dst = n.HClientSrc.MarshalUnsafe(dst) dst = n.HObjectSrc.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Status)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NVOS55Parameters) UnmarshalBytes(src []byte) []byte { src = n.HClient.UnmarshalUnsafe(src) src = n.HParent.UnmarshalUnsafe(src) src = n.HObject.UnmarshalUnsafe(src) src = n.HClientSrc.UnmarshalUnsafe(src) src = n.HObjectSrc.UnmarshalUnsafe(src) n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVOS55Parameters) Packed() bool { return n.HClient.Packed() && n.HClientSrc.Packed() && n.HObject.Packed() && n.HObjectSrc.Packed() && n.HParent.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
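// Editor's note (illustrative sketch, not part of the generated marshalling
// code): the intended use of CopyIn/CopyOut for NVOS54Parameters (typically
// the rm-control ioctl argument) is to read the struct from task memory, act
// on it, and write the updated struct back. The helper name, the status
// value, and the assumption that the caller supplies cc and addr are for the
// example only.
func exampleControlRoundTrip(cc marshal.CopyContext, addr hostarch.Addr) error {
	var params NVOS54Parameters
	if _, err := params.CopyIn(cc, addr); err != nil {
		return err
	}
	// ... the control request would be handled here ...
	params.Status = 0 // hypothetical success status for illustration.
	_, err := params.CopyOut(cc, addr)
	return err
}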
func (n *NVOS55Parameters) MarshalUnsafe(dst []byte) []byte { if n.HClient.Packed() && n.HClientSrc.Packed() && n.HObject.Packed() && n.HObjectSrc.Packed() && n.HParent.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NVOS55Parameters doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NVOS55Parameters) UnmarshalUnsafe(src []byte) []byte { if n.HClient.Packed() && n.HClientSrc.Packed() && n.HObject.Packed() && n.HObjectSrc.Packed() && n.HParent.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NVOS55Parameters doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NVOS55Parameters) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClient.Packed() && n.HClientSrc.Packed() && n.HObject.Packed() && n.HObjectSrc.Packed() && n.HParent.Packed() { // Type NVOS55Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NVOS55Parameters) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVOS55Parameters) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClient.Packed() && n.HClientSrc.Packed() && n.HObject.Packed() && n.HObjectSrc.Packed() && n.HParent.Packed() { // Type NVOS55Parameters doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NVOS55Parameters) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. 
func (n *NVOS55Parameters) WriteTo(writer io.Writer) (int64, error) { if !n.HClient.Packed() && n.HClientSrc.Packed() && n.HObject.Packed() && n.HObjectSrc.Packed() && n.HParent.Packed() { // Type NVOS55Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NVOS56Parameters) SizeBytes() int { return 4 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + 1*4 + (*P64)(nil).SizeBytes() + (*P64)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NVOS56Parameters) MarshalBytes(dst []byte) []byte { dst = n.HClient.MarshalUnsafe(dst) dst = n.HDevice.MarshalUnsafe(dst) dst = n.HMemory.MarshalUnsafe(dst) for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad0[idx]) dst = dst[1:] } dst = n.POldCPUAddress.MarshalUnsafe(dst) dst = n.PNewCPUAddress.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Status)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(n.Pad1[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NVOS56Parameters) UnmarshalBytes(src []byte) []byte { src = n.HClient.UnmarshalUnsafe(src) src = n.HDevice.UnmarshalUnsafe(src) src = n.HMemory.UnmarshalUnsafe(src) for idx := 0; idx < 4; idx++ { n.Pad0[idx] = src[0] src = src[1:] } src = n.POldCPUAddress.UnmarshalUnsafe(src) src = n.PNewCPUAddress.UnmarshalUnsafe(src) n.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { n.Pad1[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVOS56Parameters) Packed() bool { return n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PNewCPUAddress.Packed() && n.POldCPUAddress.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NVOS56Parameters) MarshalUnsafe(dst []byte) []byte { if n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PNewCPUAddress.Packed() && n.POldCPUAddress.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NVOS56Parameters doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NVOS56Parameters) UnmarshalUnsafe(src []byte) []byte { if n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PNewCPUAddress.Packed() && n.POldCPUAddress.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NVOS56Parameters doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (n *NVOS56Parameters) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PNewCPUAddress.Packed() && n.POldCPUAddress.Packed() { // Type NVOS56Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NVOS56Parameters) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVOS56Parameters) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PNewCPUAddress.Packed() && n.POldCPUAddress.Packed() { // Type NVOS56Parameters doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NVOS56Parameters) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NVOS56Parameters) WriteTo(writer io.Writer) (int64, error) { if !n.HClient.Packed() && n.HDevice.Packed() && n.HMemory.Packed() && n.PNewCPUAddress.Packed() && n.POldCPUAddress.Packed() { // Type NVOS56Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. 
func (n *NVOS57Parameters) SizeBytes() int { return 4 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*RS_SHARE_POLICY)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NVOS57Parameters) MarshalBytes(dst []byte) []byte { dst = n.HClient.MarshalUnsafe(dst) dst = n.HObject.MarshalUnsafe(dst) dst = n.SharePolicy.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Status)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NVOS57Parameters) UnmarshalBytes(src []byte) []byte { src = n.HClient.UnmarshalUnsafe(src) src = n.HObject.UnmarshalUnsafe(src) src = n.SharePolicy.UnmarshalUnsafe(src) n.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVOS57Parameters) Packed() bool { return n.HClient.Packed() && n.HObject.Packed() && n.SharePolicy.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NVOS57Parameters) MarshalUnsafe(dst []byte) []byte { if n.HClient.Packed() && n.HObject.Packed() && n.SharePolicy.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NVOS57Parameters doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NVOS57Parameters) UnmarshalUnsafe(src []byte) []byte { if n.HClient.Packed() && n.HObject.Packed() && n.SharePolicy.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NVOS57Parameters doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NVOS57Parameters) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClient.Packed() && n.HObject.Packed() && n.SharePolicy.Packed() { // Type NVOS57Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NVOS57Parameters) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVOS57Parameters) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClient.Packed() && n.HObject.Packed() && n.SharePolicy.Packed() { // Type NVOS57Parameters doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. 
If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (n *NVOS57Parameters) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NVOS57Parameters) WriteTo(writer io.Writer) (int64, error) { if !n.HClient.Packed() && n.HObject.Packed() && n.SharePolicy.Packed() { // Type NVOS57Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (n *NVOS64Parameters) SizeBytes() int { return 16 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*ClassID)(nil).SizeBytes() + (*P64)(nil).SizeBytes() + (*P64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NVOS64Parameters) MarshalBytes(dst []byte) []byte { dst = n.HRoot.MarshalUnsafe(dst) dst = n.HObjectParent.MarshalUnsafe(dst) dst = n.HObjectNew.MarshalUnsafe(dst) dst = n.HClass.MarshalUnsafe(dst) dst = n.PAllocParms.MarshalUnsafe(dst) dst = n.PRightsRequested.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.ParamsSize)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Flags)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(n.Status)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NVOS64Parameters) UnmarshalBytes(src []byte) []byte { src = n.HRoot.UnmarshalUnsafe(src) src = n.HObjectParent.UnmarshalUnsafe(src) src = n.HObjectNew.UnmarshalUnsafe(src) src = n.HClass.UnmarshalUnsafe(src) src = n.PAllocParms.UnmarshalUnsafe(src) src = n.PRightsRequested.UnmarshalUnsafe(src) n.ParamsSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] n.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NVOS64Parameters) Packed() bool { return n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PAllocParms.Packed() && n.PRightsRequested.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (n *NVOS64Parameters) MarshalUnsafe(dst []byte) []byte { if n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PAllocParms.Packed() && n.PRightsRequested.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(n), uintptr(size)) return dst[size:] } // Type NVOS64Parameters doesn't have a packed layout in memory, fallback to MarshalBytes. return n.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NVOS64Parameters) UnmarshalUnsafe(src []byte) []byte { if n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PAllocParms.Packed() && n.PRightsRequested.Packed() { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type NVOS64Parameters doesn't have a packed layout in memory, fallback to UnmarshalBytes. return n.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NVOS64Parameters) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PAllocParms.Packed() && n.PRightsRequested.Packed() { // Type NVOS64Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. n.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NVOS64Parameters) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NVOS64Parameters) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PAllocParms.Packed() && n.PRightsRequested.Packed() { // Type NVOS64Parameters doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(n.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. n.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
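// Editor's note (illustrative sketch, not part of the generated marshalling
// code): NVOS64Parameters also satisfies io.WriterTo (see WriteTo below), so
// it can be streamed to any io.Writer. The helper name and the defensive
// short-write check are example choices, not part of the generated API.
func exampleWriteNVOS64(w io.Writer, p *NVOS64Parameters) error {
	n, err := p.WriteTo(w)
	if err != nil {
		return err
	}
	if n != int64(p.SizeBytes()) {
		return io.ErrShortWrite
	}
	return nil
}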
func (n *NVOS64Parameters) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NVOS64Parameters) WriteTo(writer io.Writer) (int64, error) { if !n.HClass.Packed() && n.HObjectNew.Packed() && n.HObjectParent.Packed() && n.HRoot.Packed() && n.PAllocParms.Packed() && n.PRightsRequested.Packed() { // Type NVOS64Parameters doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, n.SizeBytes()) n.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *RMAPIVersion) SizeBytes() int { return 8 + 1*64 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *RMAPIVersion) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(r.Cmd)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(r.Reply)) dst = dst[4:] for idx := 0; idx < 64; idx++ { dst[0] = byte(r.VersionString[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *RMAPIVersion) UnmarshalBytes(src []byte) []byte { r.Cmd = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] r.Reply = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 64; idx++ { r.VersionString[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *RMAPIVersion) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *RMAPIVersion) MarshalUnsafe(dst []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(r), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *RMAPIVersion) UnmarshalUnsafe(src []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(r), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (r *RMAPIVersion) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (r *RMAPIVersion) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (r *RMAPIVersion) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *RMAPIVersion) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *RMAPIVersion) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (h *Handle) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (h *Handle) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(h.Val)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (h *Handle) UnmarshalBytes(src []byte) []byte { h.Val = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (h *Handle) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (h *Handle) MarshalUnsafe(dst []byte) []byte { size := h.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(h), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (h *Handle) UnmarshalUnsafe(src []byte) []byte { size := h.SizeBytes() gohacks.Memmove(unsafe.Pointer(h), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (h *Handle) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(h))) hdr.Len = h.SizeBytes() hdr.Cap = h.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that h // must live until the use above. runtime.KeepAlive(h) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (h *Handle) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return h.CopyOutN(cc, addr, h.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (h *Handle) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(h))) hdr.Len = h.SizeBytes() hdr.Cap = h.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that h // must live until the use above. runtime.KeepAlive(h) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (h *Handle) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return h.CopyInN(cc, addr, h.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (h *Handle) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(h))) hdr.Len = h.SizeBytes() hdr.Cap = h.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that h // must live until the use above. runtime.KeepAlive(h) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (n *NvUUID) SizeBytes() int { return 1 * 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (n *NvUUID) MarshalBytes(dst []byte) []byte { for idx := 0; idx < 16; idx++ { dst[0] = byte(n[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (n *NvUUID) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < 16; idx++ { n[idx] = uint8(src[0]) src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (n *NvUUID) Packed() bool { // Array newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (n *NvUUID) MarshalUnsafe(dst []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&n[0]), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (n *NvUUID) UnmarshalUnsafe(src []byte) []byte { size := n.SizeBytes() gohacks.Memmove(unsafe.Pointer(n), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (n *NvUUID) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (n *NvUUID) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyOutN(cc, addr, n.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (n *NvUUID) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
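// Editor's note (illustrative sketch, not part of the generated marshalling
// code): Handle is packed (Packed returns true), so the memmove fast path in
// MarshalUnsafe and the explicit field encoding in MarshalBytes emit the same
// bytes, since both use the host's byte order. The helper name is an example
// only.
func exampleHandleEncodingsAgree(h Handle) bool {
	a := make([]byte, h.SizeBytes())
	b := make([]byte, h.SizeBytes())
	h.MarshalUnsafe(a)
	h.MarshalBytes(b)
	for i := range a {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}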
func (n *NvUUID) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return n.CopyInN(cc, addr, n.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (n *NvUUID) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(n))) hdr.Len = n.SizeBytes() hdr.Cap = n.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that n // must live until the use above. runtime.KeepAlive(n) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (p *P64) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *P64) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(*p)) return dst[8:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *P64) UnmarshalBytes(src []byte) []byte { *p = P64(uint64(hostarch.ByteOrder.Uint64(src[:8]))) return src[8:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *P64) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *P64) MarshalUnsafe(dst []byte) []byte { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *P64) UnmarshalUnsafe(src []byte) []byte { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *P64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *P64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *P64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *P64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *P64) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *RS_ACCESS_MASK) SizeBytes() int { return 0 + 4*SDK_RS_ACCESS_MAX_LIMBS } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *RS_ACCESS_MASK) MarshalBytes(dst []byte) []byte { for idx := 0; idx < SDK_RS_ACCESS_MAX_LIMBS; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(r.Limbs[idx])) dst = dst[4:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *RS_ACCESS_MASK) UnmarshalBytes(src []byte) []byte { for idx := 0; idx < SDK_RS_ACCESS_MAX_LIMBS; idx++ { r.Limbs[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *RS_ACCESS_MASK) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *RS_ACCESS_MASK) MarshalUnsafe(dst []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(r), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *RS_ACCESS_MASK) UnmarshalUnsafe(src []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(r), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (r *RS_ACCESS_MASK) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (r *RS_ACCESS_MASK) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (r *RS_ACCESS_MASK) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *RS_ACCESS_MASK) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *RS_ACCESS_MASK) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *RS_SHARE_POLICY) SizeBytes() int { return 7 + (*RS_ACCESS_MASK)(nil).SizeBytes() + 1*1 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *RS_SHARE_POLICY) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(r.Target)) dst = dst[4:] dst = r.AccessMask.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint16(dst[:2], uint16(r.Type)) dst = dst[2:] dst[0] = byte(r.Action) dst = dst[1:] for idx := 0; idx < 1; idx++ { dst[0] = byte(r.Pad[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *RS_SHARE_POLICY) UnmarshalBytes(src []byte) []byte { r.Target = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = r.AccessMask.UnmarshalUnsafe(src) r.Type = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] r.Action = uint8(src[0]) src = src[1:] for idx := 0; idx < 1; idx++ { r.Pad[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *RS_SHARE_POLICY) Packed() bool { return r.AccessMask.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *RS_SHARE_POLICY) MarshalUnsafe(dst []byte) []byte { if r.AccessMask.Packed() { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(r), uintptr(size)) return dst[size:] } // Type RS_SHARE_POLICY doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *RS_SHARE_POLICY) UnmarshalUnsafe(src []byte) []byte { if r.AccessMask.Packed() { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(r), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type RS_SHARE_POLICY doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (r *RS_SHARE_POLICY) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !r.AccessMask.Packed() { // Type RS_SHARE_POLICY doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (r *RS_SHARE_POLICY) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (r *RS_SHARE_POLICY) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !r.AccessMask.Packed() { // Type RS_SHARE_POLICY doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *RS_SHARE_POLICY) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *RS_SHARE_POLICY) WriteTo(writer io.Writer) (int64, error) { if !r.AccessMask.Packed() { // Type RS_SHARE_POLICY doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS) SizeBytes() int { return 28 + (*UvmGpuMappingAttributes)(nil).SizeBytes()*UVM_MAX_GPUS + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Length)) dst = dst[8:] for idx := 0; idx < UVM_MAX_GPUS; idx++ { dst = u.PerGPUAttributes[idx].MarshalUnsafe(dst) } hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.GPUAttributesCount)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS) UnmarshalBytes(src []byte) []byte { u.Base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] for idx := 0; idx < UVM_MAX_GPUS; idx++ { src = u.PerGPUAttributes[idx].UnmarshalUnsafe(src) } u.GPUAttributesCount = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. 
//go:nosplit func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS) Packed() bool { return u.PerGPUAttributes[0].Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS) MarshalUnsafe(dst []byte) []byte { if u.PerGPUAttributes[0].Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UVM_ALLOC_SEMAPHORE_POOL_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS) UnmarshalUnsafe(src []byte) []byte { if u.PerGPUAttributes[0].Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_ALLOC_SEMAPHORE_POOL_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.PerGPUAttributes[0].Packed() { // Type UVM_ALLOC_SEMAPHORE_POOL_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.PerGPUAttributes[0].Packed() { // Type UVM_ALLOC_SEMAPHORE_POOL_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. 
func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !u.PerGPUAttributes[0].Packed() { // Type UVM_ALLOC_SEMAPHORE_POOL_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550) SizeBytes() int { return 28 + (*UvmGpuMappingAttributes)(nil).SizeBytes()*UVM_MAX_GPUS_V2 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Length)) dst = dst[8:] for idx := 0; idx < UVM_MAX_GPUS_V2; idx++ { dst = u.PerGPUAttributes[idx].MarshalUnsafe(dst) } hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.GPUAttributesCount)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550) UnmarshalBytes(src []byte) []byte { u.Base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] for idx := 0; idx < UVM_MAX_GPUS_V2; idx++ { src = u.PerGPUAttributes[idx].UnmarshalUnsafe(src) } u.GPUAttributesCount = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550) Packed() bool { return u.PerGPUAttributes[0].Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550) MarshalUnsafe(dst []byte) []byte { if u.PerGPUAttributes[0].Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550 doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550) UnmarshalUnsafe(src []byte) []byte { if u.PerGPUAttributes[0].Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.PerGPUAttributes[0].Packed() { // Type UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.PerGPUAttributes[0].Packed() { // Type UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550) WriteTo(writer io.Writer) (int64, error) { if !u.PerGPUAttributes[0].Packed() { // Type UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_CREATE_EXTERNAL_RANGE_PARAMS) SizeBytes() int { return 20 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (u *UVM_CREATE_EXTERNAL_RANGE_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Length)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_CREATE_EXTERNAL_RANGE_PARAMS) UnmarshalBytes(src []byte) []byte { u.Base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_CREATE_EXTERNAL_RANGE_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_CREATE_EXTERNAL_RANGE_PARAMS) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_CREATE_EXTERNAL_RANGE_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_CREATE_EXTERNAL_RANGE_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_CREATE_EXTERNAL_RANGE_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_CREATE_EXTERNAL_RANGE_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_CREATE_EXTERNAL_RANGE_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_CREATE_EXTERNAL_RANGE_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_CREATE_RANGE_GROUP_PARAMS) SizeBytes() int { return 12 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_CREATE_RANGE_GROUP_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.RangeGroupID)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_CREATE_RANGE_GROUP_PARAMS) UnmarshalBytes(src []byte) []byte { u.RangeGroupID = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_CREATE_RANGE_GROUP_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_CREATE_RANGE_GROUP_PARAMS) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_CREATE_RANGE_GROUP_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_CREATE_RANGE_GROUP_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_CREATE_RANGE_GROUP_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_CREATE_RANGE_GROUP_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
func (u *UVM_CREATE_RANGE_GROUP_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_CREATE_RANGE_GROUP_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_DESTROY_RANGE_GROUP_PARAMS) SizeBytes() int { return 12 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_DESTROY_RANGE_GROUP_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.RangeGroupID)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_DESTROY_RANGE_GROUP_PARAMS) UnmarshalBytes(src []byte) []byte { u.RangeGroupID = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_DESTROY_RANGE_GROUP_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_DESTROY_RANGE_GROUP_PARAMS) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_DESTROY_RANGE_GROUP_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_DESTROY_RANGE_GROUP_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_DESTROY_RANGE_GROUP_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_DESTROY_RANGE_GROUP_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_DESTROY_RANGE_GROUP_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_DESTROY_RANGE_GROUP_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_DISABLE_PEER_ACCESS_PARAMS) SizeBytes() int { return 4 + (*NvUUID)(nil).SizeBytes() + (*NvUUID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_DISABLE_PEER_ACCESS_PARAMS) MarshalBytes(dst []byte) []byte { dst = u.GPUUUIDA.MarshalUnsafe(dst) dst = u.GPUUUIDB.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_DISABLE_PEER_ACCESS_PARAMS) UnmarshalBytes(src []byte) []byte { src = u.GPUUUIDA.UnmarshalUnsafe(src) src = u.GPUUUIDB.UnmarshalUnsafe(src) u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_DISABLE_PEER_ACCESS_PARAMS) Packed() bool { return u.GPUUUIDA.Packed() && u.GPUUUIDB.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_DISABLE_PEER_ACCESS_PARAMS) MarshalUnsafe(dst []byte) []byte { if u.GPUUUIDA.Packed() && u.GPUUUIDB.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UVM_DISABLE_PEER_ACCESS_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_DISABLE_PEER_ACCESS_PARAMS) UnmarshalUnsafe(src []byte) []byte { if u.GPUUUIDA.Packed() && u.GPUUUIDB.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_DISABLE_PEER_ACCESS_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_DISABLE_PEER_ACCESS_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUIDA.Packed() && u.GPUUUIDB.Packed() { // Type UVM_DISABLE_PEER_ACCESS_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. 
} // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_DISABLE_PEER_ACCESS_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_DISABLE_PEER_ACCESS_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUIDA.Packed() && u.GPUUUIDB.Packed() { // Type UVM_DISABLE_PEER_ACCESS_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_DISABLE_PEER_ACCESS_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_DISABLE_PEER_ACCESS_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !u.GPUUUIDA.Packed() && u.GPUUUIDB.Packed() { // Type UVM_DISABLE_PEER_ACCESS_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_DISABLE_READ_DUPLICATION_PARAMS) SizeBytes() int { return 20 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_DISABLE_READ_DUPLICATION_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.RequestedBase)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Length)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (u *UVM_DISABLE_READ_DUPLICATION_PARAMS) UnmarshalBytes(src []byte) []byte { u.RequestedBase = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_DISABLE_READ_DUPLICATION_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_DISABLE_READ_DUPLICATION_PARAMS) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_DISABLE_READ_DUPLICATION_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_DISABLE_READ_DUPLICATION_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_DISABLE_READ_DUPLICATION_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_DISABLE_READ_DUPLICATION_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_DISABLE_READ_DUPLICATION_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_DISABLE_READ_DUPLICATION_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. 
func (u *UVM_ENABLE_PEER_ACCESS_PARAMS) SizeBytes() int { return 4 + (*NvUUID)(nil).SizeBytes() + (*NvUUID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_ENABLE_PEER_ACCESS_PARAMS) MarshalBytes(dst []byte) []byte { dst = u.GPUUUIDA.MarshalUnsafe(dst) dst = u.GPUUUIDB.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_ENABLE_PEER_ACCESS_PARAMS) UnmarshalBytes(src []byte) []byte { src = u.GPUUUIDA.UnmarshalUnsafe(src) src = u.GPUUUIDB.UnmarshalUnsafe(src) u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_ENABLE_PEER_ACCESS_PARAMS) Packed() bool { return u.GPUUUIDA.Packed() && u.GPUUUIDB.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_ENABLE_PEER_ACCESS_PARAMS) MarshalUnsafe(dst []byte) []byte { if u.GPUUUIDA.Packed() && u.GPUUUIDB.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UVM_ENABLE_PEER_ACCESS_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_ENABLE_PEER_ACCESS_PARAMS) UnmarshalUnsafe(src []byte) []byte { if u.GPUUUIDA.Packed() && u.GPUUUIDB.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_ENABLE_PEER_ACCESS_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_ENABLE_PEER_ACCESS_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUIDA.Packed() && u.GPUUUIDB.Packed() { // Type UVM_ENABLE_PEER_ACCESS_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_ENABLE_PEER_ACCESS_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_ENABLE_PEER_ACCESS_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUIDA.Packed() && u.GPUUUIDB.Packed() { // Type UVM_ENABLE_PEER_ACCESS_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. 
return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_ENABLE_PEER_ACCESS_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_ENABLE_PEER_ACCESS_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !u.GPUUUIDA.Packed() && u.GPUUUIDB.Packed() { // Type UVM_ENABLE_PEER_ACCESS_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_FREE_PARAMS) SizeBytes() int { return 20 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_FREE_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Length)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_FREE_PARAMS) UnmarshalBytes(src []byte) []byte { u.Base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_FREE_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_FREE_PARAMS) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_FREE_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_FREE_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_FREE_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_FREE_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_FREE_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_FREE_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_INITIALIZE_PARAMS) SizeBytes() int { return 12 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_INITIALIZE_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Flags)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_INITIALIZE_PARAMS) UnmarshalBytes(src []byte) []byte { u.Flags = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_INITIALIZE_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_INITIALIZE_PARAMS) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_INITIALIZE_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_INITIALIZE_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_INITIALIZE_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_INITIALIZE_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_INITIALIZE_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_INITIALIZE_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS) SizeBytes() int { return 20 + (*NvUUID)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Length)) dst = dst[8:] dst = u.GPUUUID.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS) UnmarshalBytes(src []byte) []byte { u.Base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = u.GPUUUID.UnmarshalUnsafe(src) u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS) Packed() bool { return u.GPUUUID.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (u *UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS) MarshalUnsafe(dst []byte) []byte { if u.GPUUUID.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS) UnmarshalUnsafe(src []byte) []byte { if u.GPUUUID.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUID.Packed() { // Type UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUID.Packed() { // Type UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. 
func (u *UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !u.GPUUUID.Packed() { // Type UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) SizeBytes() int { return 40 + (*UvmGpuMappingAttributes)(nil).SizeBytes()*UVM_MAX_GPUS + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Length)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Offset)) dst = dst[8:] for idx := 0; idx < UVM_MAX_GPUS; idx++ { dst = p.PerGPUAttributes[idx].MarshalUnsafe(dst) } hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.GPUAttributesCount)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.RMCtrlFD)) dst = dst[4:] dst = p.HClient.MarshalUnsafe(dst) dst = p.HMemory.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.RMStatus)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) UnmarshalBytes(src []byte) []byte { p.Base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Offset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] for idx := 0; idx < UVM_MAX_GPUS; idx++ { src = p.PerGPUAttributes[idx].UnmarshalUnsafe(src) } p.GPUAttributesCount = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.RMCtrlFD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = p.HClient.UnmarshalUnsafe(src) src = p.HMemory.UnmarshalUnsafe(src) p.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) Packed() bool { return p.HClient.Packed() && p.HMemory.Packed() && p.PerGPUAttributes[0].Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) MarshalUnsafe(dst []byte) []byte { if p.HClient.Packed() && p.HMemory.Packed() && p.PerGPUAttributes[0].Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // Type UVM_MAP_EXTERNAL_ALLOCATION_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return p.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) UnmarshalUnsafe(src []byte) []byte { if p.HClient.Packed() && p.HMemory.Packed() && p.PerGPUAttributes[0].Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_MAP_EXTERNAL_ALLOCATION_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return p.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.HClient.Packed() && p.HMemory.Packed() && p.PerGPUAttributes[0].Packed() { // Type UVM_MAP_EXTERNAL_ALLOCATION_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. p.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.HClient.Packed() && p.HMemory.Packed() && p.PerGPUAttributes[0].Packed() { // Type UVM_MAP_EXTERNAL_ALLOCATION_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. p.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !p.HClient.Packed() && p.HMemory.Packed() && p.PerGPUAttributes[0].Packed() { // Type UVM_MAP_EXTERNAL_ALLOCATION_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, p.SizeBytes()) p.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
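// The slice header below is pointed directly at p's memory, so the write
// below emits the struct's in-memory representation without any intermediate
// buffer. This is only done on the packed fast path, where the in-memory
// layout matches the ABI layout byte for byte.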
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) SizeBytes() int { return 40 + (*UvmGpuMappingAttributes)(nil).SizeBytes()*UVM_MAX_GPUS_V2 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Length)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Offset)) dst = dst[8:] for idx := 0; idx < UVM_MAX_GPUS_V2; idx++ { dst = p.PerGPUAttributes[idx].MarshalUnsafe(dst) } hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.GPUAttributesCount)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.RMCtrlFD)) dst = dst[4:] dst = p.HClient.MarshalUnsafe(dst) dst = p.HMemory.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.RMStatus)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) UnmarshalBytes(src []byte) []byte { p.Base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Offset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] for idx := 0; idx < UVM_MAX_GPUS_V2; idx++ { src = p.PerGPUAttributes[idx].UnmarshalUnsafe(src) } p.GPUAttributesCount = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.RMCtrlFD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = p.HClient.UnmarshalUnsafe(src) src = p.HMemory.UnmarshalUnsafe(src) p.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) Packed() bool { return p.HClient.Packed() && p.HMemory.Packed() && p.PerGPUAttributes[0].Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) MarshalUnsafe(dst []byte) []byte { if p.HClient.Packed() && p.HMemory.Packed() && p.PerGPUAttributes[0].Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // Type UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550 doesn't have a packed layout in memory, fallback to MarshalBytes. return p.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) UnmarshalUnsafe(src []byte) []byte { if p.HClient.Packed() && p.HMemory.Packed() && p.PerGPUAttributes[0].Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return p.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.HClient.Packed() && p.HMemory.Packed() && p.PerGPUAttributes[0].Packed() { // Type UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. p.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.HClient.Packed() && p.HMemory.Packed() && p.PerGPUAttributes[0].Packed() { // Type UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. p.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) WriteTo(writer io.Writer) (int64, error) { if !p.HClient.Packed() && p.HMemory.Packed() && p.PerGPUAttributes[0].Packed() { // Type UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, p.SizeBytes()) p.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. 
func (u *UVM_MIGRATE_RANGE_GROUP_PARAMS) SizeBytes() int { return 12 + (*NvUUID)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_MIGRATE_RANGE_GROUP_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.RangeGroupID)) dst = dst[8:] dst = u.DestinationUUID.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_MIGRATE_RANGE_GROUP_PARAMS) UnmarshalBytes(src []byte) []byte { u.RangeGroupID = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = u.DestinationUUID.UnmarshalUnsafe(src) u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_MIGRATE_RANGE_GROUP_PARAMS) Packed() bool { return u.DestinationUUID.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_MIGRATE_RANGE_GROUP_PARAMS) MarshalUnsafe(dst []byte) []byte { if u.DestinationUUID.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UVM_MIGRATE_RANGE_GROUP_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_MIGRATE_RANGE_GROUP_PARAMS) UnmarshalUnsafe(src []byte) []byte { if u.DestinationUUID.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_MIGRATE_RANGE_GROUP_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_MIGRATE_RANGE_GROUP_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.DestinationUUID.Packed() { // Type UVM_MIGRATE_RANGE_GROUP_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_MIGRATE_RANGE_GROUP_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_MIGRATE_RANGE_GROUP_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.DestinationUUID.Packed() { // Type UVM_MIGRATE_RANGE_GROUP_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
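// At most limit bytes are requested from user memory; length reports how
// many bytes were actually copied into the scratch buffer.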
// Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_MIGRATE_RANGE_GROUP_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_MIGRATE_RANGE_GROUP_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !u.DestinationUUID.Packed() { // Type UVM_MIGRATE_RANGE_GROUP_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_MM_INITIALIZE_PARAMS) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_MM_INITIALIZE_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.UvmFD)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.Status)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_MM_INITIALIZE_PARAMS) UnmarshalBytes(src []byte) []byte { u.UvmFD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] u.Status = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_MM_INITIALIZE_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_MM_INITIALIZE_PARAMS) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_MM_INITIALIZE_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_MM_INITIALIZE_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. 
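// CopyOut passes limit = u.SizeBytes(), so the entire struct is written to
// user memory; a smaller limit truncates the copy-out to the first limit bytes.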
// Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_MM_INITIALIZE_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_MM_INITIALIZE_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_MM_INITIALIZE_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_MM_INITIALIZE_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_PAGEABLE_MEM_ACCESS_PARAMS) SizeBytes() int { return 5 + 1*3 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_PAGEABLE_MEM_ACCESS_PARAMS) MarshalBytes(dst []byte) []byte { dst[0] = byte(u.PageableMemAccess) dst = dst[1:] for idx := 0; idx < 3; idx++ { dst[0] = byte(u.Pad[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_PAGEABLE_MEM_ACCESS_PARAMS) UnmarshalBytes(src []byte) []byte { u.PageableMemAccess = uint8(src[0]) src = src[1:] for idx := 0; idx < 3; idx++ { u.Pad[idx] = src[0] src = src[1:] } u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_PAGEABLE_MEM_ACCESS_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_PAGEABLE_MEM_ACCESS_PARAMS) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_PAGEABLE_MEM_ACCESS_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_PAGEABLE_MEM_ACCESS_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_PAGEABLE_MEM_ACCESS_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_PAGEABLE_MEM_ACCESS_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_PAGEABLE_MEM_ACCESS_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_PAGEABLE_MEM_ACCESS_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *UVM_REGISTER_CHANNEL_PARAMS) SizeBytes() int { return 24 + (*NvUUID)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + 1*4 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *UVM_REGISTER_CHANNEL_PARAMS) MarshalBytes(dst []byte) []byte { dst = p.GPUUUID.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.RMCtrlFD)) dst = dst[4:] dst = p.HClient.MarshalUnsafe(dst) dst = p.HChannel.MarshalUnsafe(dst) for idx := 0; idx < 4; idx++ { dst[0] = byte(p.Pad[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(p.Length)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(p.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (p *UVM_REGISTER_CHANNEL_PARAMS) UnmarshalBytes(src []byte) []byte { src = p.GPUUUID.UnmarshalUnsafe(src) p.RMCtrlFD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = p.HClient.UnmarshalUnsafe(src) src = p.HChannel.UnmarshalUnsafe(src) for idx := 0; idx < 4; idx++ { p.Pad[idx] = src[0] src = src[1:] } p.Base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] p.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { p.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *UVM_REGISTER_CHANNEL_PARAMS) Packed() bool { return p.GPUUUID.Packed() && p.HChannel.Packed() && p.HClient.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *UVM_REGISTER_CHANNEL_PARAMS) MarshalUnsafe(dst []byte) []byte { if p.GPUUUID.Packed() && p.HChannel.Packed() && p.HClient.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // Type UVM_REGISTER_CHANNEL_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return p.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *UVM_REGISTER_CHANNEL_PARAMS) UnmarshalUnsafe(src []byte) []byte { if p.GPUUUID.Packed() && p.HChannel.Packed() && p.HClient.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_REGISTER_CHANNEL_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return p.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *UVM_REGISTER_CHANNEL_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.GPUUUID.Packed() && p.HChannel.Packed() && p.HClient.Packed() { // Type UVM_REGISTER_CHANNEL_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. p.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *UVM_REGISTER_CHANNEL_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *UVM_REGISTER_CHANNEL_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.GPUUUID.Packed() && p.HChannel.Packed() && p.HClient.Packed() { // Type UVM_REGISTER_CHANNEL_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. p.UnmarshalBytes(buf) // escapes: fallback. 
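// The scratch buffer used above is provided by the CopyContext and, for a
// struct this small, is typically reused between calls rather than freshly
// allocated, so the fallback path usually avoids heap allocation.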
return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *UVM_REGISTER_CHANNEL_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *UVM_REGISTER_CHANNEL_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !p.GPUUUID.Packed() && p.HChannel.Packed() && p.HClient.Packed() { // Type UVM_REGISTER_CHANNEL_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, p.SizeBytes()) p.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *UVM_REGISTER_GPU_PARAMS) SizeBytes() int { return 13 + (*NvUUID)(nil).SizeBytes() + 1*3 + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *UVM_REGISTER_GPU_PARAMS) MarshalBytes(dst []byte) []byte { dst = p.GPUUUID.MarshalUnsafe(dst) dst[0] = byte(p.NumaEnabled) dst = dst[1:] for idx := 0; idx < 3; idx++ { dst[0] = byte(p.Pad[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.NumaNodeID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.RMCtrlFD)) dst = dst[4:] dst = p.HClient.MarshalUnsafe(dst) dst = p.HSMCPartRef.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.RMStatus)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *UVM_REGISTER_GPU_PARAMS) UnmarshalBytes(src []byte) []byte { src = p.GPUUUID.UnmarshalUnsafe(src) p.NumaEnabled = uint8(src[0]) src = src[1:] for idx := 0; idx < 3; idx++ { p.Pad[idx] = src[0] src = src[1:] } p.NumaNodeID = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] p.RMCtrlFD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = p.HClient.UnmarshalUnsafe(src) src = p.HSMCPartRef.UnmarshalUnsafe(src) p.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *UVM_REGISTER_GPU_PARAMS) Packed() bool { return p.GPUUUID.Packed() && p.HClient.Packed() && p.HSMCPartRef.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *UVM_REGISTER_GPU_PARAMS) MarshalUnsafe(dst []byte) []byte { if p.GPUUUID.Packed() && p.HClient.Packed() && p.HSMCPartRef.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // Type UVM_REGISTER_GPU_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. 
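// When every embedded field reports a packed layout, the branch above copies
// the struct with a single memmove; otherwise this field-by-field path is
// taken, which writes each field in declaration order and is independent of
// any compiler-inserted padding in the Go struct layout.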
return p.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *UVM_REGISTER_GPU_PARAMS) UnmarshalUnsafe(src []byte) []byte { if p.GPUUUID.Packed() && p.HClient.Packed() && p.HSMCPartRef.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_REGISTER_GPU_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return p.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *UVM_REGISTER_GPU_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.GPUUUID.Packed() && p.HClient.Packed() && p.HSMCPartRef.Packed() { // Type UVM_REGISTER_GPU_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. p.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *UVM_REGISTER_GPU_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *UVM_REGISTER_GPU_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.GPUUUID.Packed() && p.HClient.Packed() && p.HSMCPartRef.Packed() { // Type UVM_REGISTER_GPU_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. p.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *UVM_REGISTER_GPU_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *UVM_REGISTER_GPU_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !p.GPUUUID.Packed() && p.HClient.Packed() && p.HSMCPartRef.Packed() { // Type UVM_REGISTER_GPU_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, p.SizeBytes()) p.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) SizeBytes() int { return 8 + (*NvUUID)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) MarshalBytes(dst []byte) []byte { dst = p.GPUUUID.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.RMCtrlFD)) dst = dst[4:] dst = p.HClient.MarshalUnsafe(dst) dst = p.HVASpace.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(p.RMStatus)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) UnmarshalBytes(src []byte) []byte { src = p.GPUUUID.UnmarshalUnsafe(src) p.RMCtrlFD = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = p.HClient.UnmarshalUnsafe(src) src = p.HVASpace.UnmarshalUnsafe(src) p.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) Packed() bool { return p.GPUUUID.Packed() && p.HClient.Packed() && p.HVASpace.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) MarshalUnsafe(dst []byte) []byte { if p.GPUUUID.Packed() && p.HClient.Packed() && p.HVASpace.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(p), uintptr(size)) return dst[size:] } // Type UVM_REGISTER_GPU_VASPACE_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return p.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) UnmarshalUnsafe(src []byte) []byte { if p.GPUUUID.Packed() && p.HClient.Packed() && p.HVASpace.Packed() { size := p.SizeBytes() gohacks.Memmove(unsafe.Pointer(p), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_REGISTER_GPU_VASPACE_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return p.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.GPUUUID.Packed() && p.HClient.Packed() && p.HVASpace.Packed() { // Type UVM_REGISTER_GPU_VASPACE_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. p.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. 
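// Without the KeepAlive, the only remaining reference to p during
// CopyOutBytes would be the raw uintptr stored in hdr.Data, which the garbage
// collector does not trace.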
return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyOutN(cc, addr, p.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !p.GPUUUID.Packed() && p.HClient.Packed() && p.HVASpace.Packed() { // Type UVM_REGISTER_GPU_VASPACE_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(p.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. p.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return p.CopyInN(cc, addr, p.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !p.GPUUUID.Packed() && p.HClient.Packed() && p.HVASpace.Packed() { // Type UVM_REGISTER_GPU_VASPACE_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, p.SizeBytes()) p.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(p))) hdr.Len = p.SizeBytes() hdr.Cap = p.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that p // must live until the use above. runtime.KeepAlive(p) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS) SizeBytes() int { return 20 + (*NvUUID)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.RequestedBase)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Length)) dst = dst[8:] dst = u.PreferredLocation.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (u *UVM_SET_PREFERRED_LOCATION_PARAMS) UnmarshalBytes(src []byte) []byte { u.RequestedBase = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = u.PreferredLocation.UnmarshalUnsafe(src) u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_SET_PREFERRED_LOCATION_PARAMS) Packed() bool { return u.PreferredLocation.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS) MarshalUnsafe(dst []byte) []byte { if u.PreferredLocation.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UVM_SET_PREFERRED_LOCATION_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS) UnmarshalUnsafe(src []byte) []byte { if u.PreferredLocation.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_SET_PREFERRED_LOCATION_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.PreferredLocation.Packed() { // Type UVM_SET_PREFERRED_LOCATION_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.PreferredLocation.Packed() { // Type UVM_SET_PREFERRED_LOCATION_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
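// The "// escapes:" annotations in this file are informational notes on how
// each statement interacts with escape analysis; they do not affect
// compilation.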
// Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !u.PreferredLocation.Packed() { // Type UVM_SET_PREFERRED_LOCATION_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS_V550) SizeBytes() int { return 24 + (*NvUUID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS_V550) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.RequestedBase)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Length)) dst = dst[8:] dst = u.PreferredLocation.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.PreferredCPUNumaNode)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS_V550) UnmarshalBytes(src []byte) []byte { u.RequestedBase = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = u.PreferredLocation.UnmarshalUnsafe(src) u.PreferredCPUNumaNode = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_SET_PREFERRED_LOCATION_PARAMS_V550) Packed() bool { return u.PreferredLocation.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS_V550) MarshalUnsafe(dst []byte) []byte { if u.PreferredLocation.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UVM_SET_PREFERRED_LOCATION_PARAMS_V550 doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS_V550) UnmarshalUnsafe(src []byte) []byte { if u.PreferredLocation.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_SET_PREFERRED_LOCATION_PARAMS_V550 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (u *UVM_SET_PREFERRED_LOCATION_PARAMS_V550) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.PreferredLocation.Packed() { // Type UVM_SET_PREFERRED_LOCATION_PARAMS_V550 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS_V550) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS_V550) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.PreferredLocation.Packed() { // Type UVM_SET_PREFERRED_LOCATION_PARAMS_V550 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS_V550) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_SET_PREFERRED_LOCATION_PARAMS_V550) WriteTo(writer io.Writer) (int64, error) { if !u.PreferredLocation.Packed() { // Type UVM_SET_PREFERRED_LOCATION_PARAMS_V550 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_SET_RANGE_GROUP_PARAMS) SizeBytes() int { return 28 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (u *UVM_SET_RANGE_GROUP_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.RangeGroupID)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.RequestedBase)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Length)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_SET_RANGE_GROUP_PARAMS) UnmarshalBytes(src []byte) []byte { u.RangeGroupID = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RequestedBase = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_SET_RANGE_GROUP_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_SET_RANGE_GROUP_PARAMS) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_SET_RANGE_GROUP_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_SET_RANGE_GROUP_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_SET_RANGE_GROUP_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_SET_RANGE_GROUP_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_SET_RANGE_GROUP_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_SET_RANGE_GROUP_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS) SizeBytes() int { return 36 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Buffer)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.TargetVA)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.BytesRead)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS) UnmarshalBytes(src []byte) []byte { u.Buffer = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.TargetVA = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.BytesRead = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
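// Note: limit is assumed to be at most u.SizeBytes(); buf[:limit] below
// would panic otherwise. On a short copy-in, only the leading bytes of u are
// overwritten and the remaining bytes keep their previous values.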
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS) SizeBytes() int { return 36 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Buffer)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.TargetVA)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.BytesWritten)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS) UnmarshalBytes(src []byte) []byte { u.Buffer = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.TargetVA = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.BytesWritten = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_UNMAP_EXTERNAL_PARAMS) SizeBytes() int { return 20 + (*NvUUID)(nil).SizeBytes() + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_UNMAP_EXTERNAL_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Length)) dst = dst[8:] dst = u.GPUUUID.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_UNMAP_EXTERNAL_PARAMS) UnmarshalBytes(src []byte) []byte { u.Base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = u.GPUUUID.UnmarshalUnsafe(src) u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_UNMAP_EXTERNAL_PARAMS) Packed() bool { return u.GPUUUID.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (u *UVM_UNMAP_EXTERNAL_PARAMS) MarshalUnsafe(dst []byte) []byte { if u.GPUUUID.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UVM_UNMAP_EXTERNAL_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_UNMAP_EXTERNAL_PARAMS) UnmarshalUnsafe(src []byte) []byte { if u.GPUUUID.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_UNMAP_EXTERNAL_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_UNMAP_EXTERNAL_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUID.Packed() { // Type UVM_UNMAP_EXTERNAL_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_UNMAP_EXTERNAL_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_UNMAP_EXTERNAL_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUID.Packed() { // Type UVM_UNMAP_EXTERNAL_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_UNMAP_EXTERNAL_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_UNMAP_EXTERNAL_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !u.GPUUUID.Packed() { // Type UVM_UNMAP_EXTERNAL_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. 
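// Unlike CopyOutN, WriteTo has no CopyContext to supply a reusable scratch
// buffer, so the non-packed fallback allocates a temporary buffer for
// MarshalBytes.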
buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_UNREGISTER_CHANNEL_PARAMS) SizeBytes() int { return 4 + (*NvUUID)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() + (*Handle)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_UNREGISTER_CHANNEL_PARAMS) MarshalBytes(dst []byte) []byte { dst = u.GPUUUID.MarshalUnsafe(dst) dst = u.HClient.MarshalUnsafe(dst) dst = u.HChannel.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_UNREGISTER_CHANNEL_PARAMS) UnmarshalBytes(src []byte) []byte { src = u.GPUUUID.UnmarshalUnsafe(src) src = u.HClient.UnmarshalUnsafe(src) src = u.HChannel.UnmarshalUnsafe(src) u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_UNREGISTER_CHANNEL_PARAMS) Packed() bool { return u.GPUUUID.Packed() && u.HChannel.Packed() && u.HClient.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_UNREGISTER_CHANNEL_PARAMS) MarshalUnsafe(dst []byte) []byte { if u.GPUUUID.Packed() && u.HChannel.Packed() && u.HClient.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UVM_UNREGISTER_CHANNEL_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_UNREGISTER_CHANNEL_PARAMS) UnmarshalUnsafe(src []byte) []byte { if u.GPUUUID.Packed() && u.HChannel.Packed() && u.HClient.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_UNREGISTER_CHANNEL_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_UNREGISTER_CHANNEL_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUID.Packed() && u.HChannel.Packed() && u.HClient.Packed() { // Type UVM_UNREGISTER_CHANNEL_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. 
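// runtime.KeepAlive costs nothing at runtime (the compiler treats it as an
// intrinsic); it is needed because hdr.Data holds u only as a uintptr, which
// the garbage collector does not trace, so without it u could in principle be
// reclaimed while CopyOutBytes is still reading the aliased buffer.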
return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_UNREGISTER_CHANNEL_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_UNREGISTER_CHANNEL_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUID.Packed() && u.HChannel.Packed() && u.HClient.Packed() { // Type UVM_UNREGISTER_CHANNEL_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_UNREGISTER_CHANNEL_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_UNREGISTER_CHANNEL_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !u.GPUUUID.Packed() && u.HChannel.Packed() && u.HClient.Packed() { // Type UVM_UNREGISTER_CHANNEL_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_UNREGISTER_GPU_PARAMS) SizeBytes() int { return 4 + (*NvUUID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_UNREGISTER_GPU_PARAMS) MarshalBytes(dst []byte) []byte { dst = u.GPUUUID.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_UNREGISTER_GPU_PARAMS) UnmarshalBytes(src []byte) []byte { src = u.GPUUUID.UnmarshalUnsafe(src) u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_UNREGISTER_GPU_PARAMS) Packed() bool { return u.GPUUUID.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
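// When the layout is packed, MarshalUnsafe serializes the struct with a
// single gohacks.Memmove of its in-memory representation; otherwise it falls
// back to the field-by-field MarshalBytes above.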
func (u *UVM_UNREGISTER_GPU_PARAMS) MarshalUnsafe(dst []byte) []byte { if u.GPUUUID.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UVM_UNREGISTER_GPU_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_UNREGISTER_GPU_PARAMS) UnmarshalUnsafe(src []byte) []byte { if u.GPUUUID.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_UNREGISTER_GPU_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_UNREGISTER_GPU_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUID.Packed() { // Type UVM_UNREGISTER_GPU_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_UNREGISTER_GPU_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_UNREGISTER_GPU_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUID.Packed() { // Type UVM_UNREGISTER_GPU_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_UNREGISTER_GPU_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_UNREGISTER_GPU_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !u.GPUUUID.Packed() { // Type UVM_UNREGISTER_GPU_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. 
buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_UNREGISTER_GPU_VASPACE_PARAMS) SizeBytes() int { return 4 + (*NvUUID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_UNREGISTER_GPU_VASPACE_PARAMS) MarshalBytes(dst []byte) []byte { dst = u.GPUUUID.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_UNREGISTER_GPU_VASPACE_PARAMS) UnmarshalBytes(src []byte) []byte { src = u.GPUUUID.UnmarshalUnsafe(src) u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_UNREGISTER_GPU_VASPACE_PARAMS) Packed() bool { return u.GPUUUID.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_UNREGISTER_GPU_VASPACE_PARAMS) MarshalUnsafe(dst []byte) []byte { if u.GPUUUID.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UVM_UNREGISTER_GPU_VASPACE_PARAMS doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UVM_UNREGISTER_GPU_VASPACE_PARAMS) UnmarshalUnsafe(src []byte) []byte { if u.GPUUUID.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UVM_UNREGISTER_GPU_VASPACE_PARAMS doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_UNREGISTER_GPU_VASPACE_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUID.Packed() { // Type UVM_UNREGISTER_GPU_VASPACE_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_UNREGISTER_GPU_VASPACE_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (u *UVM_UNREGISTER_GPU_VASPACE_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUID.Packed() { // Type UVM_UNREGISTER_GPU_VASPACE_PARAMS doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_UNREGISTER_GPU_VASPACE_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_UNREGISTER_GPU_VASPACE_PARAMS) WriteTo(writer io.Writer) (int64, error) { if !u.GPUUUID.Packed() { // Type UVM_UNREGISTER_GPU_VASPACE_PARAMS doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UVM_VALIDATE_VA_RANGE_PARAMS) SizeBytes() int { return 20 + 1*4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UVM_VALIDATE_VA_RANGE_PARAMS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Base)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Length)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.RMStatus)) dst = dst[4:] for idx := 0; idx < 4; idx++ { dst[0] = byte(u.Pad0[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UVM_VALIDATE_VA_RANGE_PARAMS) UnmarshalBytes(src []byte) []byte { u.Base = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.RMStatus = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 4; idx++ { u.Pad0[idx] = src[0] src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UVM_VALIDATE_VA_RANGE_PARAMS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UVM_VALIDATE_VA_RANGE_PARAMS) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (u *UVM_VALIDATE_VA_RANGE_PARAMS) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UVM_VALIDATE_VA_RANGE_PARAMS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UVM_VALIDATE_VA_RANGE_PARAMS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UVM_VALIDATE_VA_RANGE_PARAMS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UVM_VALIDATE_VA_RANGE_PARAMS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UVM_VALIDATE_VA_RANGE_PARAMS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UvmGpuMappingAttributes) SizeBytes() int { return 20 + (*NvUUID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UvmGpuMappingAttributes) MarshalBytes(dst []byte) []byte { dst = u.GPUUUID.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.GPUMappingType)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.GPUCachingType)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.GPUFormatType)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.GPUElementBits)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(u.GPUCompressionType)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (u *UvmGpuMappingAttributes) UnmarshalBytes(src []byte) []byte { src = u.GPUUUID.UnmarshalUnsafe(src) u.GPUMappingType = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] u.GPUCachingType = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] u.GPUFormatType = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] u.GPUElementBits = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] u.GPUCompressionType = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UvmGpuMappingAttributes) Packed() bool { return u.GPUUUID.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UvmGpuMappingAttributes) MarshalUnsafe(dst []byte) []byte { if u.GPUUUID.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UvmGpuMappingAttributes doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UvmGpuMappingAttributes) UnmarshalUnsafe(src []byte) []byte { if u.GPUUUID.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UvmGpuMappingAttributes doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UvmGpuMappingAttributes) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUID.Packed() { // Type UvmGpuMappingAttributes doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UvmGpuMappingAttributes) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UvmGpuMappingAttributes) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.GPUUUID.Packed() { // Type UvmGpuMappingAttributes doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
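// gohacks.Noescape hides the pointer to u from escape analysis so that
// building this aliasing slice does not force u onto the heap; CopyInBytes
// then writes task memory directly into u, avoiding the scratch-buffer copy
// used on the non-packed fallback path.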
// Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UvmGpuMappingAttributes) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UvmGpuMappingAttributes) WriteTo(writer io.Writer) (int64, error) { if !u.GPUUUID.Packed() { // Type UvmGpuMappingAttributes doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/nvgpu/nvgpu_state_autogen.go000066400000000000000000000051521465435605700257310ustar00rootroot00000000000000// automatically generated by stateify. package nvgpu import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (n *NVOS64Parameters) StateTypeName() string { return "pkg/abi/nvgpu.NVOS64Parameters" } func (n *NVOS64Parameters) StateFields() []string { return []string{ "HRoot", "HObjectParent", "HObjectNew", "HClass", "PAllocParms", "PRightsRequested", "ParamsSize", "Flags", "Status", } } func (n *NVOS64Parameters) beforeSave() {} // +checklocksignore func (n *NVOS64Parameters) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.HRoot) stateSinkObject.Save(1, &n.HObjectParent) stateSinkObject.Save(2, &n.HObjectNew) stateSinkObject.Save(3, &n.HClass) stateSinkObject.Save(4, &n.PAllocParms) stateSinkObject.Save(5, &n.PRightsRequested) stateSinkObject.Save(6, &n.ParamsSize) stateSinkObject.Save(7, &n.Flags) stateSinkObject.Save(8, &n.Status) } func (n *NVOS64Parameters) afterLoad(context.Context) {} // +checklocksignore func (n *NVOS64Parameters) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.HRoot) stateSourceObject.Load(1, &n.HObjectParent) stateSourceObject.Load(2, &n.HObjectNew) stateSourceObject.Load(3, &n.HClass) stateSourceObject.Load(4, &n.PAllocParms) stateSourceObject.Load(5, &n.PRightsRequested) stateSourceObject.Load(6, &n.ParamsSize) stateSourceObject.Load(7, &n.Flags) stateSourceObject.Load(8, &n.Status) } func (h *Handle) StateTypeName() string { return "pkg/abi/nvgpu.Handle" } func (h *Handle) StateFields() []string { return []string{ "Val", } } func (h *Handle) beforeSave() {} // +checklocksignore func (h *Handle) StateSave(stateSinkObject state.Sink) { h.beforeSave() stateSinkObject.Save(0, &h.Val) } func (h *Handle) afterLoad(context.Context) {} // +checklocksignore func (h *Handle) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &h.Val) } func (r *RS_ACCESS_MASK) StateTypeName() string { return "pkg/abi/nvgpu.RS_ACCESS_MASK" } func (r *RS_ACCESS_MASK) StateFields() []string { return []string{ "Limbs", } } func (r *RS_ACCESS_MASK) beforeSave() {} // +checklocksignore func (r *RS_ACCESS_MASK) StateSave(stateSinkObject state.Sink) { r.beforeSave() 
stateSinkObject.Save(0, &r.Limbs) } func (r *RS_ACCESS_MASK) afterLoad(context.Context) {} // +checklocksignore func (r *RS_ACCESS_MASK) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Limbs) } func init() { state.Register((*NVOS64Parameters)(nil)) state.Register((*Handle)(nil)) state.Register((*RS_ACCESS_MASK)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/nvgpu/nvgpu_unsafe_abi_autogen_unsafe.go000066400000000000000000000001451465435605700302430ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package nvgpu import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/nvgpu/nvgpu_unsafe_state_autogen.go000066400000000000000000000000671465435605700272720ustar00rootroot00000000000000// automatically generated by stateify. package nvgpu golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/nvgpu/status.go000066400000000000000000000016231465435605700231720ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvgpu // Status codes, from src/common/sdk/nvidia/inc/nvstatuscodes.h. const ( NV_OK = 0x00000000 NV_ERR_INVALID_ADDRESS = 0x0000001e NV_ERR_INVALID_ARGUMENT = 0x0000001f NV_ERR_INVALID_CLASS = 0x00000022 NV_ERR_INVALID_LIMIT = 0x0000002e NV_ERR_NOT_SUPPORTED = 0x00000056 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/nvgpu/uvm.go000066400000000000000000000205641465435605700224630ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvgpu // UVM ioctl commands. 
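// Each constant below is the raw request number passed to ioctl(2) on the
// UVM device; most commands have a corresponding +marshal *_PARAMS struct
// later in this file, whose explicit Pad fields reproduce the C struct
// padding so the generated marshalling code can treat the layout as packed.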
const ( // From kernel-open/nvidia-uvm/uvm_linux_ioctl.h: UVM_INITIALIZE = 0x30000001 UVM_DEINITIALIZE = 0x30000002 // From kernel-open/nvidia-uvm/uvm_ioctl.h: UVM_CREATE_RANGE_GROUP = 23 UVM_DESTROY_RANGE_GROUP = 24 UVM_REGISTER_GPU_VASPACE = 25 UVM_UNREGISTER_GPU_VASPACE = 26 UVM_REGISTER_CHANNEL = 27 UVM_UNREGISTER_CHANNEL = 28 UVM_ENABLE_PEER_ACCESS = 29 UVM_DISABLE_PEER_ACCESS = 30 UVM_SET_RANGE_GROUP = 31 UVM_MAP_EXTERNAL_ALLOCATION = 33 UVM_FREE = 34 UVM_REGISTER_GPU = 37 UVM_UNREGISTER_GPU = 38 UVM_PAGEABLE_MEM_ACCESS = 39 UVM_SET_PREFERRED_LOCATION = 42 UVM_DISABLE_READ_DUPLICATION = 45 UVM_MIGRATE_RANGE_GROUP = 53 UVM_TOOLS_READ_PROCESS_MEMORY = 62 UVM_TOOLS_WRITE_PROCESS_MEMORY = 63 UVM_MAP_DYNAMIC_PARALLELISM_REGION = 65 UVM_UNMAP_EXTERNAL = 66 UVM_ALLOC_SEMAPHORE_POOL = 68 UVM_VALIDATE_VA_RANGE = 72 UVM_CREATE_EXTERNAL_RANGE = 73 UVM_MM_INITIALIZE = 75 ) // +marshal type UVM_INITIALIZE_PARAMS struct { Flags uint64 RMStatus uint32 Pad0 [4]byte } // UVM_INITIALIZE_PARAMS flags, from kernel-open/nvidia-uvm/uvm_types.h. const ( UVM_INIT_FLAGS_MULTI_PROCESS_SHARING_MODE = 0x2 ) // +marshal type UVM_CREATE_RANGE_GROUP_PARAMS struct { RangeGroupID uint64 RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_DESTROY_RANGE_GROUP_PARAMS struct { RangeGroupID uint64 RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_REGISTER_GPU_VASPACE_PARAMS struct { GPUUUID NvUUID RMCtrlFD int32 HClient Handle HVASpace Handle RMStatus uint32 } // GetFrontendFD implements HasFrontendFD.GetFrontendFD. func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) GetFrontendFD() int32 { return p.RMCtrlFD } // SetFrontendFD implements HasFrontendFD.SetFrontendFD. func (p *UVM_REGISTER_GPU_VASPACE_PARAMS) SetFrontendFD(fd int32) { p.RMCtrlFD = fd } // +marshal type UVM_UNREGISTER_GPU_VASPACE_PARAMS struct { GPUUUID NvUUID RMStatus uint32 } // +marshal type UVM_REGISTER_CHANNEL_PARAMS struct { GPUUUID NvUUID RMCtrlFD int32 HClient Handle HChannel Handle Pad [4]byte Base uint64 Length uint64 RMStatus uint32 Pad0 [4]byte } // GetFrontendFD implements HasFrontendFD.GetFrontendFD. func (p *UVM_REGISTER_CHANNEL_PARAMS) GetFrontendFD() int32 { return p.RMCtrlFD } // SetFrontendFD implements HasFrontendFD.SetFrontendFD. func (p *UVM_REGISTER_CHANNEL_PARAMS) SetFrontendFD(fd int32) { p.RMCtrlFD = fd } // +marshal type UVM_UNREGISTER_CHANNEL_PARAMS struct { GPUUUID NvUUID HClient Handle HChannel Handle RMStatus uint32 } // +marshal type UVM_ENABLE_PEER_ACCESS_PARAMS struct { GPUUUIDA NvUUID GPUUUIDB NvUUID RMStatus uint32 } // +marshal type UVM_DISABLE_PEER_ACCESS_PARAMS struct { GPUUUIDA NvUUID GPUUUIDB NvUUID RMStatus uint32 } // +marshal type UVM_SET_RANGE_GROUP_PARAMS struct { RangeGroupID uint64 RequestedBase uint64 Length uint64 RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_MAP_EXTERNAL_ALLOCATION_PARAMS struct { Base uint64 Length uint64 Offset uint64 PerGPUAttributes [UVM_MAX_GPUS]UvmGpuMappingAttributes GPUAttributesCount uint64 RMCtrlFD int32 HClient Handle HMemory Handle RMStatus uint32 } // GetFrontendFD implements HasFrontendFD.GetFrontendFD. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) GetFrontendFD() int32 { return p.RMCtrlFD } // SetFrontendFD implements HasFrontendFD.SetFrontendFD. 
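// The RMCtrlFD carried by these params refers to a frontend control FD;
// GetFrontendFD/SetFrontendFD presumably exist so that the proxying code can
// translate the application's FD number to the corresponding host FD before
// forwarding the ioctl (an inference from the interface name; the call sites
// are not in this file).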
func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS) SetFrontendFD(fd int32) { p.RMCtrlFD = fd } // +marshal type UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550 struct { Base uint64 Length uint64 Offset uint64 PerGPUAttributes [UVM_MAX_GPUS_V2]UvmGpuMappingAttributes GPUAttributesCount uint64 RMCtrlFD int32 HClient Handle HMemory Handle RMStatus uint32 } // GetFrontendFD implements HasFrontendFD.GetFrontendFD. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) GetFrontendFD() int32 { return p.RMCtrlFD } // SetFrontendFD implements HasFrontendFD.SetFrontendFD. func (p *UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550) SetFrontendFD(fd int32) { p.RMCtrlFD = fd } // +marshal type UVM_FREE_PARAMS struct { Base uint64 Length uint64 RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_REGISTER_GPU_PARAMS struct { GPUUUID NvUUID NumaEnabled uint8 Pad [3]byte NumaNodeID int32 RMCtrlFD int32 HClient Handle HSMCPartRef Handle RMStatus uint32 } // GetFrontendFD implements HasFrontendFD.GetFrontendFD. func (p *UVM_REGISTER_GPU_PARAMS) GetFrontendFD() int32 { return p.RMCtrlFD } // SetFrontendFD implements HasFrontendFD.SetFrontendFD. func (p *UVM_REGISTER_GPU_PARAMS) SetFrontendFD(fd int32) { p.RMCtrlFD = fd } // +marshal type UVM_UNREGISTER_GPU_PARAMS struct { GPUUUID NvUUID RMStatus uint32 } // +marshal type UVM_PAGEABLE_MEM_ACCESS_PARAMS struct { PageableMemAccess uint8 Pad [3]byte RMStatus uint32 } // +marshal type UVM_SET_PREFERRED_LOCATION_PARAMS struct { RequestedBase uint64 Length uint64 PreferredLocation NvUUID RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_SET_PREFERRED_LOCATION_PARAMS_V550 struct { RequestedBase uint64 Length uint64 PreferredLocation NvUUID PreferredCPUNumaNode int32 RMStatus uint32 } // +marshal type UVM_DISABLE_READ_DUPLICATION_PARAMS struct { RequestedBase uint64 Length uint64 RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_MIGRATE_RANGE_GROUP_PARAMS struct { RangeGroupID uint64 DestinationUUID NvUUID RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS struct { Buffer uint64 Size uint64 TargetVA uint64 BytesRead uint64 RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS struct { Buffer uint64 Size uint64 TargetVA uint64 BytesWritten uint64 RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS struct { Base uint64 Length uint64 GPUUUID NvUUID RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_UNMAP_EXTERNAL_PARAMS struct { Base uint64 Length uint64 GPUUUID NvUUID RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_ALLOC_SEMAPHORE_POOL_PARAMS struct { Base uint64 Length uint64 PerGPUAttributes [UVM_MAX_GPUS]UvmGpuMappingAttributes GPUAttributesCount uint64 RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550 struct { Base uint64 Length uint64 PerGPUAttributes [UVM_MAX_GPUS_V2]UvmGpuMappingAttributes GPUAttributesCount uint64 RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_VALIDATE_VA_RANGE_PARAMS struct { Base uint64 Length uint64 RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_CREATE_EXTERNAL_RANGE_PARAMS struct { Base uint64 Length uint64 RMStatus uint32 Pad0 [4]byte } // +marshal type UVM_MM_INITIALIZE_PARAMS struct { UvmFD int32 Status uint32 } // From kernel-open/nvidia-uvm/uvm_types.h: const ( UVM_MAX_GPUS = NV_MAX_DEVICES UVM_MAX_GPUS_V2 = NV_MAX_DEVICES * NV_MAX_SUBDEVICES ) // +marshal type UvmGpuMappingAttributes struct { GPUUUID NvUUID GPUMappingType uint32 GPUCachingType uint32 GPUFormatType uint32 GPUElementBits uint32 
GPUCompressionType uint32 } golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/sentry/000077500000000000000000000000001465435605700215035ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/sentry/sentry.go000066400000000000000000000012541465435605700233600ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package sentry contains ABI-related constants for the gVisor sentry. package sentry golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/sentry/sentry_state_autogen.go000066400000000000000000000000701465435605700262750ustar00rootroot00000000000000// automatically generated by stateify. package sentry golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/sentry/syscall.go000066400000000000000000000015021465435605700235020ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sentry // MaxSyscallNum is the largest-numbered syscall that is supported. // Having this as a constant allows allocating per-syscall data structures // that are of fixed size throughout the codebase. const MaxSyscallNum = 2000 golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/tpu/000077500000000000000000000000001465435605700207675ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/tpu/tpu.go000066400000000000000000000101301465435605700221210ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tpu defines constants used to interact with TPUs. The constants are // derived from those listed in https://github.com/tensorflow/tpu/blob/master/tools/driver/drivers/char/tpu_common package tpu const ( // SizeOfTPUV4InterruptList is the total number of valid // (BAR Index, Register Offset) pairs. SizeOfTPUV4InterruptList = uint64(45) // NumberOfTPUV4PageTables is the number of gasket page tables used by the // TPU V4 driver. NumberOfTPUV4PageTables = uint64(1) // TPUV4DeviceID is the PCI device ID of TPU V4 hardware. 
TPUV4DeviceID = 0x005E // SizeOfTPUV4liteInterruptList is the total number of valid // (BAR Index, Register Offset) pairs. SizeOfTPUV4liteInterruptList = uint64(37) // NumberOfTPUV4litePageTables is the number of gasket page tables used by the // TPU V4 driver NumberOfTPUV4litePageTables = uint64(1) // TPUV4liteDeviceID is the PCI device ID of TPU V4lite hardware. TPUV4liteDeviceID = 0x0056 // TPUV5eDeviceID is the PCI device ID of TPU V5e hardware. TPUV5eDeviceID = 0x0063 // TPUV5pDeviceID is the PCI device ID of TPU V5p hardware. TPUV5pDeviceID = 0x0062 ) // TPUV4InterruptsMap maps BAR indices to valid register offsets. var ( TPUV4InterruptsMap = map[uint64]map[uint64]struct{}{ 2: map[uint64]struct{}{ 0x15b0008: struct{}{}, 0x15b0000: struct{}{}, 0x16b0008: struct{}{}, 0x16b0000: struct{}{}, 0x17b0008: struct{}{}, 0x17b0000: struct{}{}, 0x18b0008: struct{}{}, 0x18b0000: struct{}{}, 0x19b0020: struct{}{}, 0x19b0000: struct{}{}, 0x19b0008: struct{}{}, 0x19b0010: struct{}{}, 0x19b0018: struct{}{}, 0x1ab0020: struct{}{}, 0x1ab0000: struct{}{}, 0x1ab0008: struct{}{}, 0x1ab0010: struct{}{}, 0x1ab0018: struct{}{}, 0x4720000: struct{}{}, 0x1bb0000: struct{}{}, 0x1bb0008: struct{}{}, 0x1bb0010: struct{}{}, 0x1bb0018: struct{}{}, 0x90000: struct{}{}, 0xb0000: struct{}{}, 0xd0000: struct{}{}, 0xf0000: struct{}{}, 0x110000: struct{}{}, 0x130000: struct{}{}, 0x150000: struct{}{}, 0x170000: struct{}{}, 0x190000: struct{}{}, 0x1b0000: struct{}{}, 0x1d0000: struct{}{}, 0x1f0000: struct{}{}, 0x210000: struct{}{}, 0x230000: struct{}{}, 0x250000: struct{}{}, 0x270000: struct{}{}, 0x290000: struct{}{}, 0x2b0000: struct{}{}, 0x2d0000: struct{}{}, 0x2f0000: struct{}{}, 0x310000: struct{}{}, 0x4720018: struct{}{}, }, } // TPUV4liteInterruptsMap maps BAR indices to valid register offsets. TPUV4liteInterruptsMap = map[uint64]map[uint64]struct{}{ 2: map[uint64]struct{}{ 0x19b0020: struct{}{}, 0x19b0000: struct{}{}, 0x19b0008: struct{}{}, 0x19b0010: struct{}{}, 0x19b0018: struct{}{}, 0x1ab0020: struct{}{}, 0x1ab0000: struct{}{}, 0x1ab0008: struct{}{}, 0x1ab0010: struct{}{}, 0x1ab0018: struct{}{}, 0x4720000: struct{}{}, 0x1bb0000: struct{}{}, 0x1bb0008: struct{}{}, 0x1bb0010: struct{}{}, 0x1bb0018: struct{}{}, 0x90000: struct{}{}, 0xb0000: struct{}{}, 0xd0000: struct{}{}, 0xf0000: struct{}{}, 0x110000: struct{}{}, 0x130000: struct{}{}, 0x150000: struct{}{}, 0x170000: struct{}{}, 0x190000: struct{}{}, 0x1b0000: struct{}{}, 0x1d0000: struct{}{}, 0x1f0000: struct{}{}, 0x210000: struct{}{}, 0x230000: struct{}{}, 0x250000: struct{}{}, 0x270000: struct{}{}, 0x290000: struct{}{}, 0x2b0000: struct{}{}, 0x2d0000: struct{}{}, 0x2f0000: struct{}{}, 0x310000: struct{}{}, 0x4720018: struct{}{}, }, } ) golang-gvisor-gvisor-0.0~20240729.0/pkg/abi/tpu/tpu_state_autogen.go000066400000000000000000000000651465435605700250510ustar00rootroot00000000000000// automatically generated by stateify. package tpu golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/000077500000000000000000000000001465435605700221215ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/32b_32bit.go000066400000000000000000000143171465435605700240470ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm || mips || mipsle || 386 // +build arm mips mipsle 386 package atomicbitops import ( "sync/atomic" "gvisor.dev/gvisor/pkg/sync" ) // Note that this file is *identical* to 32b_64bit.go, as go_stateify gets // confused about build tags if these are not separated. // LINT.IfChange // Int32 is an atomic int32. // // The default value is zero. // // Don't add fields to this struct. It is important that it remain the same // size as its builtin analogue. // // +stateify savable type Int32 struct { _ sync.NoCopy value int32 } // FromInt32 returns an Int32 initialized to value v. // //go:nosplit func FromInt32(v int32) Int32 { return Int32{value: v} } // Load is analogous to atomic.LoadInt32. // //go:nosplit func (i *Int32) Load() int32 { return atomic.LoadInt32(&i.value) } // RacyLoad is analogous to reading an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (i *Int32) RacyLoad() int32 { return i.value } // Store is analogous to atomic.StoreInt32. // //go:nosplit func (i *Int32) Store(v int32) { atomic.StoreInt32(&i.value, v) } // RacyStore is analogous to setting an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (i *Int32) RacyStore(v int32) { i.value = v } // Add is analogous to atomic.AddInt32. // //go:nosplit func (i *Int32) Add(v int32) int32 { return atomic.AddInt32(&i.value, v) } // RacyAdd is analogous to adding to an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (i *Int32) RacyAdd(v int32) int32 { i.value += v return i.value } // Swap is analogous to atomic.SwapInt32. // //go:nosplit func (i *Int32) Swap(v int32) int32 { return atomic.SwapInt32(&i.value, v) } // CompareAndSwap is analogous to atomic.CompareAndSwapInt32. // //go:nosplit func (i *Int32) CompareAndSwap(oldVal, newVal int32) bool { return atomic.CompareAndSwapInt32(&i.value, oldVal, newVal) } //go:nosplit func (i *Int32) ptr() *int32 { return &i.value } // Uint32 is an atomic uint32. // // Don't add fields to this struct. It is important that it remain the same // size as its builtin analogue. // // See aligned_unsafe.go in this directory for justification. // // +stateify savable type Uint32 struct { _ sync.NoCopy value uint32 } // FromUint32 returns an Uint32 initialized to value v. // //go:nosplit func FromUint32(v uint32) Uint32 { return Uint32{value: v} } // Load is analogous to atomic.LoadUint32. // //go:nosplit func (u *Uint32) Load() uint32 { return atomic.LoadUint32(&u.value) } // RacyLoad is analogous to reading an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (u *Uint32) RacyLoad() uint32 { return u.value } // Store is analogous to atomic.StoreUint32. // //go:nosplit func (u *Uint32) Store(v uint32) { atomic.StoreUint32(&u.value, v) } // RacyStore is analogous to setting an atomic value without using // synchronization. 
// // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (u *Uint32) RacyStore(v uint32) { u.value = v } // Add is analogous to atomic.AddUint32. // //go:nosplit func (u *Uint32) Add(v uint32) uint32 { return atomic.AddUint32(&u.value, v) } // RacyAdd is analogous to adding to an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (u *Uint32) RacyAdd(v uint32) uint32 { u.value += v return u.value } // Swap is analogous to atomic.SwapUint32. // //go:nosplit func (u *Uint32) Swap(v uint32) uint32 { return atomic.SwapUint32(&u.value, v) } // CompareAndSwap is analogous to atomic.CompareAndSwapUint32. // //go:nosplit func (u *Uint32) CompareAndSwap(oldVal, newVal uint32) bool { return atomic.CompareAndSwapUint32(&u.value, oldVal, newVal) } //go:nosplit func (u *Uint32) ptr() *uint32 { return &u.value } // Bool is an atomic Boolean. // // It is implemented by a Uint32, with value 0 indicating false, and 1 // indicating true. // // +stateify savable type Bool struct { Uint32 } // b32 returns a uint32 0 or 1 representing b. func b32(b bool) uint32 { if b { return 1 } return 0 } // FromBool returns a Bool initialized to value val. // //go:nosplit func FromBool(val bool) Bool { return Bool{ Uint32: FromUint32(b32(val)), } } // Load is analogous to atomic.LoadBool, if such a thing existed. // //go:nosplit func (b *Bool) Load() bool { return b.Uint32.Load() != 0 } // RacyLoad is analogous to reading an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (b *Bool) RacyLoad() bool { return b.Uint32.RacyLoad() != 0 } // Store is analogous to atomic.StoreBool, if such a thing existed. // //go:nosplit func (b *Bool) Store(val bool) { b.Uint32.Store(b32(val)) } // RacyStore is analogous to setting an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (b *Bool) RacyStore(val bool) { b.Uint32.RacyStore(b32(val)) } // Swap is analogous to atomic.SwapBool, if such a thing existed. // //go:nosplit func (b *Bool) Swap(val bool) bool { return b.Uint32.Swap(b32(val)) != 0 } // CompareAndSwap is analogous to atomic.CompareAndSwapBool, if such a thing // existed. // //go:nosplit func (b *Bool) CompareAndSwap(oldVal, newVal bool) bool { return b.Uint32.CompareAndSwap(b32(oldVal), b32(newVal)) } // LINT.ThenChange(32b_64bit.go) golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/32b_64bit.go000066400000000000000000000143271465435605700240550ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
//go:build !arm && !mips && !mipsle && !386 // +build !arm,!mips,!mipsle,!386 package atomicbitops import ( "sync/atomic" "gvisor.dev/gvisor/pkg/sync" ) // Note that this file is *identical* to 32b_32bit.go, as go_stateify gets // confused about build tags if these are not separated. // LINT.IfChange // Int32 is an atomic int32. // // The default value is zero. // // Don't add fields to this struct. It is important that it remain the same // size as its builtin analogue. // // +stateify savable type Int32 struct { _ sync.NoCopy value int32 } // FromInt32 returns an Int32 initialized to value v. // //go:nosplit func FromInt32(v int32) Int32 { return Int32{value: v} } // Load is analogous to atomic.LoadInt32. // //go:nosplit func (i *Int32) Load() int32 { return atomic.LoadInt32(&i.value) } // RacyLoad is analogous to reading an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (i *Int32) RacyLoad() int32 { return i.value } // Store is analogous to atomic.StoreInt32. // //go:nosplit func (i *Int32) Store(v int32) { atomic.StoreInt32(&i.value, v) } // RacyStore is analogous to setting an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (i *Int32) RacyStore(v int32) { i.value = v } // Add is analogous to atomic.AddInt32. // //go:nosplit func (i *Int32) Add(v int32) int32 { return atomic.AddInt32(&i.value, v) } // RacyAdd is analogous to adding to an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (i *Int32) RacyAdd(v int32) int32 { i.value += v return i.value } // Swap is analogous to atomic.SwapInt32. // //go:nosplit func (i *Int32) Swap(v int32) int32 { return atomic.SwapInt32(&i.value, v) } // CompareAndSwap is analogous to atomic.CompareAndSwapInt32. // //go:nosplit func (i *Int32) CompareAndSwap(oldVal, newVal int32) bool { return atomic.CompareAndSwapInt32(&i.value, oldVal, newVal) } //go:nosplit func (i *Int32) ptr() *int32 { return &i.value } // Uint32 is an atomic uint32. // // Don't add fields to this struct. It is important that it remain the same // size as its builtin analogue. // // See aligned_unsafe.go in this directory for justification. // // +stateify savable type Uint32 struct { _ sync.NoCopy value uint32 } // FromUint32 returns an Uint32 initialized to value v. // //go:nosplit func FromUint32(v uint32) Uint32 { return Uint32{value: v} } // Load is analogous to atomic.LoadUint32. // //go:nosplit func (u *Uint32) Load() uint32 { return atomic.LoadUint32(&u.value) } // RacyLoad is analogous to reading an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (u *Uint32) RacyLoad() uint32 { return u.value } // Store is analogous to atomic.StoreUint32. // //go:nosplit func (u *Uint32) Store(v uint32) { atomic.StoreUint32(&u.value, v) } // RacyStore is analogous to setting an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (u *Uint32) RacyStore(v uint32) { u.value = v } // Add is analogous to atomic.AddUint32. // //go:nosplit func (u *Uint32) Add(v uint32) uint32 { return atomic.AddUint32(&u.value, v) } // RacyAdd is analogous to adding to an atomic value without using // synchronization. 
// // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (u *Uint32) RacyAdd(v uint32) uint32 { u.value += v return u.value } // Swap is analogous to atomic.SwapUint32. // //go:nosplit func (u *Uint32) Swap(v uint32) uint32 { return atomic.SwapUint32(&u.value, v) } // CompareAndSwap is analogous to atomic.CompareAndSwapUint32. // //go:nosplit func (u *Uint32) CompareAndSwap(oldVal, newVal uint32) bool { return atomic.CompareAndSwapUint32(&u.value, oldVal, newVal) } //go:nosplit func (u *Uint32) ptr() *uint32 { return &u.value } // Bool is an atomic Boolean. // // It is implemented by a Uint32, with value 0 indicating false, and 1 // indicating true. // // +stateify savable type Bool struct { Uint32 } // b32 returns a uint32 0 or 1 representing b. func b32(b bool) uint32 { if b { return 1 } return 0 } // FromBool returns a Bool initialized to value val. // //go:nosplit func FromBool(val bool) Bool { return Bool{ Uint32: FromUint32(b32(val)), } } // Load is analogous to atomic.LoadBool, if such a thing existed. // //go:nosplit func (b *Bool) Load() bool { return b.Uint32.Load() != 0 } // RacyLoad is analogous to reading an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (b *Bool) RacyLoad() bool { return b.Uint32.RacyLoad() != 0 } // Store is analogous to atomic.StoreBool, if such a thing existed. // //go:nosplit func (b *Bool) Store(val bool) { b.Uint32.Store(b32(val)) } // RacyStore is analogous to setting an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (b *Bool) RacyStore(val bool) { b.Uint32.RacyStore(b32(val)) } // Swap is analogous to atomic.SwapBool, if such a thing existed. // //go:nosplit func (b *Bool) Swap(val bool) bool { return b.Uint32.Swap(b32(val)) != 0 } // CompareAndSwap is analogous to atomic.CompareAndSwapBool, if such a thing // existed. // //go:nosplit func (b *Bool) CompareAndSwap(oldVal, newVal bool) bool { return b.Uint32.CompareAndSwap(b32(oldVal), b32(newVal)) } // LINT.ThenChange(32b_32bit.go) golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/aligned_32bit_unsafe.go000066400000000000000000000132671465435605700264300ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm || mips || mipsle || 386 // +build arm mips mipsle 386 package atomicbitops import ( "sync/atomic" "unsafe" "gvisor.dev/gvisor/pkg/sync" ) // Int64 is an atomic int64 that is guaranteed to be 64-bit // aligned, even on 32-bit systems. // // Don't add fields to this struct. It is important that it remain the same // size as its builtin analogue. // // Per https://golang.org/pkg/sync/atomic/#pkg-note-BUG: // // "On ARM, 386, and 32-bit MIPS, it is the caller's responsibility to arrange // for 64-bit alignment of 64-bit words accessed atomically. 
The first word in // a variable or in an allocated struct, array, or slice can be relied upon to // be 64-bit aligned." // // +stateify savable type Int64 struct { _ sync.NoCopy value int64 value32 int32 } //go:nosplit func (i *Int64) ptr() *int64 { // On 32-bit systems, i.value is guaranteed to be 32-bit aligned. It means // that in the 12-byte i.value, there are guaranteed to be 8 contiguous bytes // with 64-bit alignment. return (*int64)(unsafe.Pointer((uintptr(unsafe.Pointer(&i.value)) + 4) &^ 7)) } // FromInt64 returns an Int64 initialized to value v. // //go:nosplit func FromInt64(v int64) Int64 { var i Int64 *i.ptr() = v return i } // Load is analogous to atomic.LoadInt64. // //go:nosplit func (i *Int64) Load() int64 { return atomic.LoadInt64(i.ptr()) } // RacyLoad is analogous to reading an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (i *Int64) RacyLoad() int64 { return *i.ptr() } // Store is analogous to atomic.StoreInt64. // //go:nosplit func (i *Int64) Store(v int64) { atomic.StoreInt64(i.ptr(), v) } // RacyStore is analogous to setting an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (i *Int64) RacyStore(v int64) { *i.ptr() = v } // Add is analogous to atomic.AddInt64. // //go:nosplit func (i *Int64) Add(v int64) int64 { return atomic.AddInt64(i.ptr(), v) } // RacyAdd is analogous to adding to an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (i *Int64) RacyAdd(v int64) int64 { *i.ptr() += v return *i.ptr() } // Swap is analogous to atomic.SwapInt64. // //go:nosplit func (i *Int64) Swap(v int64) int64 { return atomic.SwapInt64(i.ptr(), v) } // CompareAndSwap is analogous to atomic.CompareAndSwapInt64. // //go:nosplit func (i *Int64) CompareAndSwap(oldVal, newVal int64) bool { return atomic.CompareAndSwapInt64(&i.value, oldVal, newVal) } // Uint64 is an atomic uint64 that is guaranteed to be 64-bit // aligned, even on 32-bit systems. // // Don't add fields to this struct. It is important that it remain the same // size as its builtin analogue. // // Per https://golang.org/pkg/sync/atomic/#pkg-note-BUG: // // "On ARM, 386, and 32-bit MIPS, it is the caller's responsibility to arrange // for 64-bit alignment of 64-bit words accessed atomically. The first word in // a variable or in an allocated struct, array, or slice can be relied upon to // be 64-bit aligned." // // +stateify savable type Uint64 struct { _ sync.NoCopy value uint64 value32 uint32 } //go:nosplit func (u *Uint64) ptr() *uint64 { // On 32-bit systems, i.value is guaranteed to be 32-bit aligned. It means // that in the 12-byte i.value, there are guaranteed to be 8 contiguous bytes // with 64-bit alignment. return (*uint64)(unsafe.Pointer((uintptr(unsafe.Pointer(&u.value)) + 4) &^ 7)) } // FromUint64 returns an Uint64 initialized to value v. // //go:nosplit func FromUint64(v uint64) Uint64 { var u Uint64 *u.ptr() = v return u } // Load is analogous to atomic.LoadUint64. // //go:nosplit func (u *Uint64) Load() uint64 { return atomic.LoadUint64(u.ptr()) } // RacyLoad is analogous to reading an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (u *Uint64) RacyLoad() uint64 { return *u.ptr() } // Store is analogous to atomic.StoreUint64. 
// //go:nosplit func (u *Uint64) Store(v uint64) { atomic.StoreUint64(u.ptr(), v) } // RacyStore is analogous to setting an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (u *Uint64) RacyStore(v uint64) { *u.ptr() = v } // Add is analogous to atomic.AddUint64. // //go:nosplit func (u *Uint64) Add(v uint64) uint64 { return atomic.AddUint64(u.ptr(), v) } // RacyAdd is analogous to adding to an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (u *Uint64) RacyAdd(v uint64) uint64 { *u.ptr() += v return *u.ptr() } // Swap is analogous to atomic.SwapUint64. // //go:nosplit func (u *Uint64) Swap(v uint64) uint64 { return atomic.SwapUint64(u.ptr(), v) } // CompareAndSwap is analogous to atomic.CompareAndSwapUint64. // //go:nosplit func (u *Uint64) CompareAndSwap(oldVal, newVal uint64) bool { return atomic.CompareAndSwapUint64(u.ptr(), oldVal, newVal) } golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/aligned_64bit.go000066400000000000000000000114321465435605700250640ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !arm && !mips && !mipsle && !386 // +build !arm,!mips,!mipsle,!386 package atomicbitops import ( "sync/atomic" "gvisor.dev/gvisor/pkg/sync" ) // Int64 is an atomic int64 that is guaranteed to be 64-bit // aligned, even on 32-bit systems. On most architectures, it's just a regular // int64. // // The default value is zero. // // Don't add fields to this struct. It is important that it remain the same // size as its builtin analogue. // // See aligned_32bit_unsafe.go in this directory for justification. // // +stateify savable type Int64 struct { _ sync.NoCopy value int64 } // FromInt64 returns an Int64 initialized to value v. // //go:nosplit func FromInt64(v int64) Int64 { return Int64{value: v} } // Load is analogous to atomic.LoadInt64. // //go:nosplit func (i *Int64) Load() int64 { return atomic.LoadInt64(&i.value) } // RacyLoad is analogous to reading an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (i *Int64) RacyLoad() int64 { return i.value } // Store is analogous to atomic.StoreInt64. // //go:nosplit func (i *Int64) Store(v int64) { atomic.StoreInt64(&i.value, v) } // RacyStore is analogous to setting an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (i *Int64) RacyStore(v int64) { i.value = v } // Add is analogous to atomic.AddInt64. // //go:nosplit func (i *Int64) Add(v int64) int64 { return atomic.AddInt64(&i.value, v) } // RacyAdd is analogous to adding to an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. 
// //go:nosplit func (i *Int64) RacyAdd(v int64) int64 { i.value += v return i.value } // Swap is analogous to atomic.SwapInt64. // //go:nosplit func (i *Int64) Swap(v int64) int64 { return atomic.SwapInt64(&i.value, v) } // CompareAndSwap is analogous to atomic.CompareAndSwapInt64. // //go:nosplit func (i *Int64) CompareAndSwap(oldVal, newVal int64) bool { return atomic.CompareAndSwapInt64(&i.value, oldVal, newVal) } //go:nosplit func (i *Int64) ptr() *int64 { return &i.value } // Uint64 is an atomic uint64 that is guaranteed to be 64-bit // aligned, even on 32-bit systems. On most architectures, it's just a regular // uint64. // // Don't add fields to this struct. It is important that it remain the same // size as its builtin analogue. // // See aligned_unsafe.go in this directory for justification. // // +stateify savable type Uint64 struct { _ sync.NoCopy value uint64 } // FromUint64 returns an Uint64 initialized to value v. // //go:nosplit func FromUint64(v uint64) Uint64 { return Uint64{value: v} } // Load is analogous to atomic.LoadUint64. // //go:nosplit func (u *Uint64) Load() uint64 { return atomic.LoadUint64(&u.value) } // RacyLoad is analogous to reading an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (u *Uint64) RacyLoad() uint64 { return u.value } // Store is analogous to atomic.StoreUint64. // //go:nosplit func (u *Uint64) Store(v uint64) { atomic.StoreUint64(&u.value, v) } // RacyStore is analogous to setting an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (u *Uint64) RacyStore(v uint64) { u.value = v } // Add is analogous to atomic.AddUint64. // //go:nosplit func (u *Uint64) Add(v uint64) uint64 { return atomic.AddUint64(&u.value, v) } // RacyAdd is analogous to adding to an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (u *Uint64) RacyAdd(v uint64) uint64 { u.value += v return u.value } // Swap is analogous to atomic.SwapUint64. // //go:nosplit func (u *Uint64) Swap(v uint64) uint64 { return atomic.SwapUint64(&u.value, v) } // CompareAndSwap is analogous to atomic.CompareAndSwapUint64. // //go:nosplit func (u *Uint64) CompareAndSwap(oldVal, newVal uint64) bool { return atomic.CompareAndSwapUint64(&u.value, oldVal, newVal) } //go:nosplit func (u *Uint64) ptr() *uint64 { return &u.value } golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/atomicbitops.go000066400000000000000000000051061465435605700251470ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || arm64 // +build amd64 arm64 // Package atomicbitops provides extensions to the sync/atomic package. // // All read-modify-write operations implemented by this package have // acquire-release memory ordering (like sync/atomic). 
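// A minimal illustrative sketch (added for exposition; not part of the
// upstream source) of how the helpers declared below combine with the atomic
// integer types defined elsewhere in this package:
//
//	var flags Uint32                 // the zero value is valid
//	OrUint32(&flags, 1<<2)           // atomically set bit 2
//	AndUint32(&flags, ^uint32(1<<0)) // atomically clear bit 0
//	prev := CompareAndSwapUint32(&flags, 1<<2, 0) // returns the previous value
//	_, _ = prev, flags.Load()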
// // +checkalignedignore package atomicbitops // AndUint32 atomically applies bitwise AND operation to *addr with val. func AndUint32(addr *Uint32, val uint32) { andUint32(&addr.value, val) } func andUint32(addr *uint32, val uint32) // OrUint32 atomically applies bitwise OR operation to *addr with val. func OrUint32(addr *Uint32, val uint32) { orUint32(&addr.value, val) } func orUint32(addr *uint32, val uint32) // XorUint32 atomically applies bitwise XOR operation to *addr with val. func XorUint32(addr *Uint32, val uint32) { xorUint32(&addr.value, val) } func xorUint32(addr *uint32, val uint32) // CompareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns // the value previously stored at addr. func CompareAndSwapUint32(addr *Uint32, old, new uint32) uint32 { return compareAndSwapUint32(&addr.value, old, new) } func compareAndSwapUint32(addr *uint32, old, new uint32) uint32 // AndUint64 atomically applies bitwise AND operation to *addr with val. func AndUint64(addr *Uint64, val uint64) { andUint64(&addr.value, val) } func andUint64(addr *uint64, val uint64) // OrUint64 atomically applies bitwise OR operation to *addr with val. func OrUint64(addr *Uint64, val uint64) { orUint64(&addr.value, val) } func orUint64(addr *uint64, val uint64) // XorUint64 atomically applies bitwise XOR operation to *addr with val. func XorUint64(addr *Uint64, val uint64) { xorUint64(&addr.value, val) } func xorUint64(addr *uint64, val uint64) // CompareAndSwapUint64 is like sync/atomic.CompareAndSwapUint64, but returns // the value previously stored at addr. func CompareAndSwapUint64(addr *Uint64, old, new uint64) uint64 { return compareAndSwapUint64(&addr.value, old, new) } func compareAndSwapUint64(addr *uint64, old, new uint64) uint64 golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/atomicbitops_32bit_state_autogen.go000066400000000000000000000034331465435605700310750ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build arm || mips || mipsle || 386 // +build arm mips mipsle 386 package atomicbitops import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (i *Int32) StateTypeName() string { return "pkg/atomicbitops.Int32" } func (i *Int32) StateFields() []string { return []string{ "value", } } func (i *Int32) beforeSave() {} // +checklocksignore func (i *Int32) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.value) } func (i *Int32) afterLoad(context.Context) {} // +checklocksignore func (i *Int32) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.value) } func (u *Uint32) StateTypeName() string { return "pkg/atomicbitops.Uint32" } func (u *Uint32) StateFields() []string { return []string{ "value", } } func (u *Uint32) beforeSave() {} // +checklocksignore func (u *Uint32) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.value) } func (u *Uint32) afterLoad(context.Context) {} // +checklocksignore func (u *Uint32) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.value) } func (b *Bool) StateTypeName() string { return "pkg/atomicbitops.Bool" } func (b *Bool) StateFields() []string { return []string{ "Uint32", } } func (b *Bool) beforeSave() {} // +checklocksignore func (b *Bool) StateSave(stateSinkObject state.Sink) { b.beforeSave() stateSinkObject.Save(0, &b.Uint32) } func (b *Bool) afterLoad(context.Context) {} // +checklocksignore func (b *Bool) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &b.Uint32) } func init() { state.Register((*Int32)(nil)) state.Register((*Uint32)(nil)) state.Register((*Bool)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/atomicbitops_32bit_unsafe_state_autogen.go000066400000000000000000000026711465435605700324410ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm || mips || mipsle || 386 // +build arm mips mipsle 386 package atomicbitops import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (i *Int64) StateTypeName() string { return "pkg/atomicbitops.Int64" } func (i *Int64) StateFields() []string { return []string{ "value", "value32", } } func (i *Int64) beforeSave() {} // +checklocksignore func (i *Int64) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.value) stateSinkObject.Save(1, &i.value32) } func (i *Int64) afterLoad(context.Context) {} // +checklocksignore func (i *Int64) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.value) stateSourceObject.Load(1, &i.value32) } func (u *Uint64) StateTypeName() string { return "pkg/atomicbitops.Uint64" } func (u *Uint64) StateFields() []string { return []string{ "value", "value32", } } func (u *Uint64) beforeSave() {} // +checklocksignore func (u *Uint64) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.value) stateSinkObject.Save(1, &u.value32) } func (u *Uint64) afterLoad(context.Context) {} // +checklocksignore func (u *Uint64) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.value) stateSourceObject.Load(1, &u.value32) } func init() { state.Register((*Int64)(nil)) state.Register((*Uint64)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/atomicbitops_64bit_state_autogen.go000066400000000000000000000056311465435605700311040ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build !arm && !mips && !mipsle && !386 && !arm && !mips && !mipsle && !386 // +build !arm,!mips,!mipsle,!386,!arm,!mips,!mipsle,!386 package atomicbitops import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (i *Int32) StateTypeName() string { return "pkg/atomicbitops.Int32" } func (i *Int32) StateFields() []string { return []string{ "value", } } func (i *Int32) beforeSave() {} // +checklocksignore func (i *Int32) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.value) } func (i *Int32) afterLoad(context.Context) {} // +checklocksignore func (i *Int32) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.value) } func (u *Uint32) StateTypeName() string { return "pkg/atomicbitops.Uint32" } func (u *Uint32) StateFields() []string { return []string{ "value", } } func (u *Uint32) beforeSave() {} // +checklocksignore func (u *Uint32) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.value) } func (u *Uint32) afterLoad(context.Context) {} // +checklocksignore func (u *Uint32) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.value) } func (b *Bool) StateTypeName() string { return "pkg/atomicbitops.Bool" } func (b *Bool) StateFields() []string { return []string{ "Uint32", } } func (b *Bool) beforeSave() {} // +checklocksignore func (b *Bool) StateSave(stateSinkObject state.Sink) { b.beforeSave() stateSinkObject.Save(0, &b.Uint32) } func (b *Bool) afterLoad(context.Context) {} // +checklocksignore func (b *Bool) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &b.Uint32) } func (i *Int64) StateTypeName() string { return "pkg/atomicbitops.Int64" } func (i *Int64) StateFields() []string { return []string{ "value", } } func (i *Int64) beforeSave() {} // +checklocksignore func (i *Int64) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.value) } func (i *Int64) afterLoad(context.Context) {} // +checklocksignore func (i *Int64) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.value) } func (u *Uint64) StateTypeName() string { return "pkg/atomicbitops.Uint64" } func (u *Uint64) StateFields() []string { return []string{ "value", } } func (u *Uint64) beforeSave() {} // +checklocksignore func (u *Uint64) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.value) } func (u *Uint64) afterLoad(context.Context) {} // +checklocksignore func (u *Uint64) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.value) } func init() { state.Register((*Int32)(nil)) state.Register((*Uint32)(nil)) state.Register((*Bool)(nil)) state.Register((*Int64)(nil)) state.Register((*Uint64)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/atomicbitops_amd64.s000066400000000000000000000032661465435605700260040ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. // +build amd64 #include "textflag.h" TEXT ·andUint32(SB),NOSPLIT|NOFRAME,$0-12 MOVQ addr+0(FP), BX MOVL val+8(FP), AX LOCK ANDL AX, 0(BX) RET TEXT ·orUint32(SB),NOSPLIT|NOFRAME,$0-12 MOVQ addr+0(FP), BX MOVL val+8(FP), AX LOCK ORL AX, 0(BX) RET TEXT ·xorUint32(SB),NOSPLIT|NOFRAME,$0-12 MOVQ addr+0(FP), BX MOVL val+8(FP), AX LOCK XORL AX, 0(BX) RET TEXT ·compareAndSwapUint32(SB),NOSPLIT|NOFRAME,$0-20 MOVQ addr+0(FP), DI MOVL old+8(FP), AX MOVL new+12(FP), DX LOCK CMPXCHGL DX, 0(DI) MOVL AX, ret+16(FP) RET TEXT ·andUint64(SB),NOSPLIT|NOFRAME,$0-16 MOVQ addr+0(FP), BX MOVQ val+8(FP), AX LOCK ANDQ AX, 0(BX) RET TEXT ·orUint64(SB),NOSPLIT|NOFRAME,$0-16 MOVQ addr+0(FP), BX MOVQ val+8(FP), AX LOCK ORQ AX, 0(BX) RET TEXT ·xorUint64(SB),NOSPLIT|NOFRAME,$0-16 MOVQ addr+0(FP), BX MOVQ val+8(FP), AX LOCK XORQ AX, 0(BX) RET TEXT ·compareAndSwapUint64(SB),NOSPLIT|NOFRAME,$0-32 MOVQ addr+0(FP), DI MOVQ old+8(FP), AX MOVQ new+16(FP), DX LOCK CMPXCHGQ DX, 0(DI) MOVQ AX, ret+24(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/atomicbitops_arm64.go000066400000000000000000000022021465435605700261520ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package atomicbitops import ( "runtime" "golang.org/x/sys/cpu" "gvisor.dev/gvisor/pkg/cpuid" ) var arm64HasATOMICS bool func init() { // The gvisor cpuid package only works on Linux. // For all other operating systems, use Go's x/sys/cpu package // to get the one bit we care about here. // // See https://github.com/google/gvisor/issues/7849. if runtime.GOOS == "linux" { arm64HasATOMICS = cpuid.HostFeatureSet().HasFeature(cpuid.ARM64FeatureATOMICS) } else { arm64HasATOMICS = cpu.ARM64.HasATOMICS } } golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/atomicbitops_arm64.s000066400000000000000000000062321465435605700260160ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// +build arm64 #include "textflag.h" TEXT ·andUint32(SB),NOSPLIT,$0-12 MOVD addr+0(FP), R0 MOVW val+8(FP), R1 MOVBU ·arm64HasATOMICS(SB), R4 CBZ R4, load_store_loop MVN R1, R2 LDCLRALW R2, (R0), R3 RET load_store_loop: LDAXRW (R0), R2 ANDW R1, R2 STLXRW R2, (R0), R3 CBNZ R3, load_store_loop RET TEXT ·orUint32(SB),NOSPLIT,$0-12 MOVD addr+0(FP), R0 MOVW val+8(FP), R1 MOVBU ·arm64HasATOMICS(SB), R4 CBZ R4, load_store_loop LDORALW R1, (R0), R2 RET load_store_loop: LDAXRW (R0), R2 ORRW R1, R2 STLXRW R2, (R0), R3 CBNZ R3, load_store_loop RET TEXT ·xorUint32(SB),NOSPLIT,$0-12 MOVD addr+0(FP), R0 MOVW val+8(FP), R1 MOVBU ·arm64HasATOMICS(SB), R4 CBZ R4, load_store_loop LDEORALW R1, (R0), R2 RET load_store_loop: LDAXRW (R0), R2 EORW R1, R2 STLXRW R2, (R0), R3 CBNZ R3, load_store_loop RET TEXT ·compareAndSwapUint32(SB),NOSPLIT,$0-20 MOVD addr+0(FP), R0 MOVW old+8(FP), R1 MOVW new+12(FP), R2 MOVBU ·arm64HasATOMICS(SB), R4 CBZ R4, load_store_loop CASALW R1, (R0), R2 MOVW R1, ret+16(FP) RET load_store_loop: LDAXRW (R0), R3 CMPW R1, R3 BNE ok STLXRW R2, (R0), R4 CBNZ R4, load_store_loop ok: MOVW R3, ret+16(FP) RET TEXT ·andUint64(SB),NOSPLIT,$0-16 MOVD addr+0(FP), R0 MOVD val+8(FP), R1 MOVBU ·arm64HasATOMICS(SB), R4 CBZ R4, load_store_loop MVN R1, R2 LDCLRALD R2, (R0), R3 RET load_store_loop: LDAXR (R0), R2 AND R1, R2 STLXR R2, (R0), R3 CBNZ R3, load_store_loop RET TEXT ·orUint64(SB),NOSPLIT,$0-16 MOVD addr+0(FP), R0 MOVD val+8(FP), R1 MOVBU ·arm64HasATOMICS(SB), R4 CBZ R4, load_store_loop LDORALD R1, (R0), R2 RET load_store_loop: LDAXR (R0), R2 ORR R1, R2 STLXR R2, (R0), R3 CBNZ R3, load_store_loop RET TEXT ·xorUint64(SB),NOSPLIT,$0-16 MOVD addr+0(FP), R0 MOVD val+8(FP), R1 MOVBU ·arm64HasATOMICS(SB), R4 CBZ R4, load_store_loop LDEORALD R1, (R0), R2 RET load_store_loop: LDAXR (R0), R2 EOR R1, R2 STLXR R2, (R0), R3 CBNZ R3, load_store_loop RET TEXT ·compareAndSwapUint64(SB),NOSPLIT,$0-32 MOVD addr+0(FP), R0 MOVD old+8(FP), R1 MOVD new+16(FP), R2 MOVBU ·arm64HasATOMICS(SB), R4 CBZ R4, load_store_loop CASALD R1, (R0), R2 MOVD R1, ret+24(FP) RET load_store_loop: LDAXR (R0), R3 CMP R1, R3 BNE ok STLXR R2, (R0), R4 CBNZ R4, load_store_loop ok: MOVD R3, ret+24(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/atomicbitops_arm64_state_autogen.go000066400000000000000000000001401465435605700310730ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package atomicbitops golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/atomicbitops_float64.go000066400000000000000000000060231465435605700265050ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package atomicbitops import ( "math" "gvisor.dev/gvisor/pkg/sync" ) // Float64 is an atomic 64-bit floating-point number. // // +stateify savable type Float64 struct { _ sync.NoCopy // bits stores the bit of a 64-bit floating point number. // It is not (and should not be interpreted as) a real uint64. 
bits Uint64 } // FromFloat64 returns a Float64 initialized to value v. // //go:nosplit func FromFloat64(v float64) Float64 { return Float64{bits: FromUint64(math.Float64bits(v))} } // Load loads the floating-point value. // //go:nosplit func (f *Float64) Load() float64 { return math.Float64frombits(f.bits.Load()) } // RacyLoad is analogous to reading an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (f *Float64) RacyLoad() float64 { return math.Float64frombits(f.bits.RacyLoad()) } // Store stores the given floating-point value in the Float64. // //go:nosplit func (f *Float64) Store(v float64) { f.bits.Store(math.Float64bits(v)) } // RacyStore is analogous to setting an atomic value without using // synchronization. // // It may be helpful to document why a racy operation is permitted. // //go:nosplit func (f *Float64) RacyStore(v float64) { f.bits.RacyStore(math.Float64bits(v)) } // Swap stores the given value and returns the previously-stored one. // //go:nosplit func (f *Float64) Swap(v float64) float64 { return math.Float64frombits(f.bits.Swap(math.Float64bits(v))) } // CompareAndSwap does a compare-and-swap operation on the float64 value. // Note that unlike typical IEEE 754 semantics, this function will treat NaN // as equal to itself if all of its bits exactly match. // //go:nosplit func (f *Float64) CompareAndSwap(oldVal, newVal float64) bool { return f.bits.CompareAndSwap(math.Float64bits(oldVal), math.Float64bits(newVal)) } // Add increments the float by the given value. // Note that unlike an atomic integer, this requires spin-looping until we win // the compare-and-swap race, so this may take an indeterminate amount of time. // //go:nosplit func (f *Float64) Add(v float64) { // We do a racy load here because we optimistically think it may pass the // compare-and-swap operation. If it doesn't, we'll load it safely, so this // is OK and not a race for the overall intent of the user to add a number. sync.RaceDisable() oldVal := f.RacyLoad() for !f.CompareAndSwap(oldVal, oldVal+v) { oldVal = f.Load() } sync.RaceEnable() } golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/atomicbitops_noasm.go000066400000000000000000000042651465435605700263510ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
//go:build !amd64 && !arm64 // +build !amd64,!arm64 package atomicbitops import "sync/atomic" //go:nosplit func AndUint32(addr *Uint32, val uint32) { for { o := addr.Load() n := o & val if atomic.CompareAndSwapUint32(&addr.value, o, n) { break } } } //go:nosplit func OrUint32(addr *Uint32, val uint32) { for { o := addr.Load() n := o | val if atomic.CompareAndSwapUint32(&addr.value, o, n) { break } } } //go:nosplit func XorUint32(addr *Uint32, val uint32) { for { o := addr.Load() n := o ^ val if atomic.CompareAndSwapUint32(&addr.value, o, n) { break } } } //go:nosplit func CompareAndSwapUint32(addr *Uint32, old, new uint32) (prev uint32) { for { prev = addr.Load() if prev != old { return } if atomic.CompareAndSwapUint32(&addr.value, old, new) { return } } } //go:nosplit func AndUint64(addr *Uint64, val uint64) { for { o := atomic.LoadUint64(addr.ptr()) n := o & val if atomic.CompareAndSwapUint64(addr.ptr(), o, n) { break } } } //go:nosplit func OrUint64(addr *Uint64, val uint64) { for { o := atomic.LoadUint64(addr.ptr()) n := o | val if atomic.CompareAndSwapUint64(addr.ptr(), o, n) { break } } } //go:nosplit func XorUint64(addr *Uint64, val uint64) { for { o := atomic.LoadUint64(addr.ptr()) n := o ^ val if atomic.CompareAndSwapUint64(addr.ptr(), o, n) { break } } } //go:nosplit func CompareAndSwapUint64(addr *Uint64, old, new uint64) (prev uint64) { for { prev = atomic.LoadUint64(addr.ptr()) if prev != old { return } if atomic.CompareAndSwapUint64(addr.ptr(), old, new) { return } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/atomicbitops/atomicbitops_state_autogen.go000066400000000000000000000014251465435605700300710ustar00rootroot00000000000000// automatically generated by stateify. //go:build (amd64 || arm64) && !amd64 && !arm64 // +build amd64 arm64 // +build !amd64 // +build !arm64 package atomicbitops import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (f *Float64) StateTypeName() string { return "pkg/atomicbitops.Float64" } func (f *Float64) StateFields() []string { return []string{ "bits", } } func (f *Float64) beforeSave() {} // +checklocksignore func (f *Float64) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.bits) } func (f *Float64) afterLoad(context.Context) {} // +checklocksignore func (f *Float64) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.bits) } func init() { state.Register((*Float64)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/binary/000077500000000000000000000000001465435605700207105ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/binary/binary.go000066400000000000000000000162631465435605700225330ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package binary translates between select fixed-sized types and a binary // representation. package binary import ( "encoding/binary" "fmt" "io" "reflect" ) // LittleEndian is the same as encoding/binary.LittleEndian. 
// // It is included here as a convenience. var LittleEndian = binary.LittleEndian // BigEndian is the same as encoding/binary.BigEndian. // // It is included here as a convenience. var BigEndian = binary.BigEndian // AppendUint16 appends the binary representation of a uint16 to buf. func AppendUint16(buf []byte, order binary.ByteOrder, num uint16) []byte { buf = append(buf, make([]byte, 2)...) order.PutUint16(buf[len(buf)-2:], num) return buf } // AppendUint32 appends the binary representation of a uint32 to buf. func AppendUint32(buf []byte, order binary.ByteOrder, num uint32) []byte { buf = append(buf, make([]byte, 4)...) order.PutUint32(buf[len(buf)-4:], num) return buf } // AppendUint64 appends the binary representation of a uint64 to buf. func AppendUint64(buf []byte, order binary.ByteOrder, num uint64) []byte { buf = append(buf, make([]byte, 8)...) order.PutUint64(buf[len(buf)-8:], num) return buf } // Marshal appends a binary representation of data to buf. // // data must only contain fixed-length signed and unsigned ints, arrays, // slices, structs and compositions of said types. data may be a pointer, // but cannot contain pointers. func Marshal(buf []byte, order binary.ByteOrder, data any) []byte { return marshal(buf, order, reflect.Indirect(reflect.ValueOf(data))) } func marshal(buf []byte, order binary.ByteOrder, data reflect.Value) []byte { switch data.Kind() { case reflect.Int8: buf = append(buf, byte(int8(data.Int()))) case reflect.Int16: buf = AppendUint16(buf, order, uint16(int16(data.Int()))) case reflect.Int32: buf = AppendUint32(buf, order, uint32(int32(data.Int()))) case reflect.Int64: buf = AppendUint64(buf, order, uint64(data.Int())) case reflect.Uint8: buf = append(buf, byte(data.Uint())) case reflect.Uint16: buf = AppendUint16(buf, order, uint16(data.Uint())) case reflect.Uint32: buf = AppendUint32(buf, order, uint32(data.Uint())) case reflect.Uint64: buf = AppendUint64(buf, order, data.Uint()) case reflect.Array, reflect.Slice: for i, l := 0, data.Len(); i < l; i++ { buf = marshal(buf, order, data.Index(i)) } case reflect.Struct: for i, l := 0, data.NumField(); i < l; i++ { buf = marshal(buf, order, data.Field(i)) } default: panic("invalid type: " + data.Type().String()) } return buf } // Unmarshal unpacks buf into data. // // data must be a slice or a pointer and buf must have a length of exactly // Size(data). data must only contain fixed-length signed and unsigned ints, // arrays, slices, structs and compositions of said types. 
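// An illustrative round-trip sketch (added for exposition; not part of the
// upstream source), using a hypothetical hdr struct together with Marshal,
// Size and LittleEndian from this file:
//
//	type hdr struct {
//		Typ uint16
//		Len uint32
//	}
//	in := hdr{Typ: 1, Len: 42}
//	buf := Marshal(nil, LittleEndian, &in) // 6 bytes; Size(in) == 6
//	var out hdr
//	Unmarshal(buf, LittleEndian, &out) // out == in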
func Unmarshal(buf []byte, order binary.ByteOrder, data any) { value := reflect.ValueOf(data) switch value.Kind() { case reflect.Ptr: value = value.Elem() case reflect.Slice: default: panic("invalid type: " + value.Type().String()) } buf = unmarshal(buf, order, value) if len(buf) != 0 { panic(fmt.Sprintf("buffer too long by %d bytes", len(buf))) } } func unmarshal(buf []byte, order binary.ByteOrder, data reflect.Value) []byte { switch data.Kind() { case reflect.Int8: data.SetInt(int64(int8(buf[0]))) buf = buf[1:] case reflect.Int16: data.SetInt(int64(int16(order.Uint16(buf)))) buf = buf[2:] case reflect.Int32: data.SetInt(int64(int32(order.Uint32(buf)))) buf = buf[4:] case reflect.Int64: data.SetInt(int64(order.Uint64(buf))) buf = buf[8:] case reflect.Uint8: data.SetUint(uint64(buf[0])) buf = buf[1:] case reflect.Uint16: data.SetUint(uint64(order.Uint16(buf))) buf = buf[2:] case reflect.Uint32: data.SetUint(uint64(order.Uint32(buf))) buf = buf[4:] case reflect.Uint64: data.SetUint(order.Uint64(buf)) buf = buf[8:] case reflect.Array, reflect.Slice: for i, l := 0, data.Len(); i < l; i++ { buf = unmarshal(buf, order, data.Index(i)) } case reflect.Struct: for i, l := 0, data.NumField(); i < l; i++ { if field := data.Field(i); field.CanSet() { buf = unmarshal(buf, order, field) } else { buf = buf[sizeof(field):] } } default: panic("invalid type: " + data.Type().String()) } return buf } // Size calculates the buffer sized needed by Marshal or Unmarshal. // // Size only support the types supported by Marshal. func Size(v any) uintptr { return sizeof(reflect.Indirect(reflect.ValueOf(v))) } func sizeof(data reflect.Value) uintptr { switch data.Kind() { case reflect.Int8, reflect.Uint8: return 1 case reflect.Int16, reflect.Uint16: return 2 case reflect.Int32, reflect.Uint32: return 4 case reflect.Int64, reflect.Uint64: return 8 case reflect.Array, reflect.Slice: var size uintptr for i, l := 0, data.Len(); i < l; i++ { size += sizeof(data.Index(i)) } return size case reflect.Struct: var size uintptr for i, l := 0, data.NumField(); i < l; i++ { size += sizeof(data.Field(i)) } return size default: panic("invalid type: " + data.Type().String()) } } // ReadUint16 reads a uint16 from r. func ReadUint16(r io.Reader, order binary.ByteOrder) (uint16, error) { buf := make([]byte, 2) if _, err := io.ReadFull(r, buf); err != nil { return 0, err } return order.Uint16(buf), nil } // ReadUint32 reads a uint32 from r. func ReadUint32(r io.Reader, order binary.ByteOrder) (uint32, error) { buf := make([]byte, 4) if _, err := io.ReadFull(r, buf); err != nil { return 0, err } return order.Uint32(buf), nil } // ReadUint64 reads a uint64 from r. func ReadUint64(r io.Reader, order binary.ByteOrder) (uint64, error) { buf := make([]byte, 8) if _, err := io.ReadFull(r, buf); err != nil { return 0, err } return order.Uint64(buf), nil } // WriteUint16 writes a uint16 to w. func WriteUint16(w io.Writer, order binary.ByteOrder, num uint16) error { buf := make([]byte, 2) order.PutUint16(buf, num) _, err := w.Write(buf) return err } // WriteUint32 writes a uint32 to w. func WriteUint32(w io.Writer, order binary.ByteOrder, num uint32) error { buf := make([]byte, 4) order.PutUint32(buf, num) _, err := w.Write(buf) return err } // WriteUint64 writes a uint64 to w. func WriteUint64(w io.Writer, order binary.ByteOrder, num uint64) error { buf := make([]byte, 8) order.PutUint64(buf, num) _, err := w.Write(buf) return err } // AlignUp rounds a length up to an alignment. align must be a power of 2. 
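// A short worked example (added for exposition; not part of the upstream
// source) for the two alignment helpers defined below:
//
//	AlignUp(10, 8)   // == 16
//	AlignDown(10, 8) // == 8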
func AlignUp(length int, align uint) int { return (length + int(align) - 1) & ^(int(align) - 1) } // AlignDown rounds a length down to an alignment. align must be a power of 2. func AlignDown(length int, align uint) int { return length & ^(int(align) - 1) } golang-gvisor-gvisor-0.0~20240729.0/pkg/binary/binary_state_autogen.go000066400000000000000000000000701465435605700254420ustar00rootroot00000000000000// automatically generated by stateify. package binary golang-gvisor-gvisor-0.0~20240729.0/pkg/bitmap/000077500000000000000000000000001465435605700207005ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/bitmap/bitmap.go000066400000000000000000000221041465435605700225020ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package bitmap provides the implementation of bitmap. package bitmap import ( "fmt" "math" "math/bits" ) // MaxBitEntryLimit defines the upper limit on how many bit entries are supported by this Bitmap // implementation. const MaxBitEntryLimit uint32 = math.MaxInt32 // Bitmap implements an efficient bitmap. // // +stateify savable type Bitmap struct { // numOnes is the number of ones in the bitmap. numOnes uint32 // bitBlock holds the bits. The type of bitBlock is uint64 which means // each number in bitBlock contains 64 entries. bitBlock []uint64 } // New create a new empty Bitmap. func New(size uint32) Bitmap { b := Bitmap{} bSize := (size + 63) / 64 b.bitBlock = make([]uint64, bSize) return b } // IsEmpty verifies whether the Bitmap is empty. func (b *Bitmap) IsEmpty() bool { return b.numOnes == 0 } // Size returns the total number of bits in the bitmap. func (b *Bitmap) Size() int { return len(b.bitBlock) * 64 } // Grow grows the bitmap by at least toGrow bits. func (b *Bitmap) Grow(toGrow uint32) error { newbitBlockSize := uint32(len(b.bitBlock)) + ((toGrow + 63) / 64) if newbitBlockSize > MaxBitEntryLimit/8 { return fmt.Errorf("requested bitmap size %d too large", newbitBlockSize*64) } bits := make([]uint64, (toGrow+63)/64) b.bitBlock = append(b.bitBlock, bits...) return nil } // Minimum return the smallest value in the Bitmap. func (b *Bitmap) Minimum() uint32 { for i := 0; i < len(b.bitBlock); i++ { if w := b.bitBlock[i]; w != 0 { r := bits.TrailingZeros64(w) return uint32(r + i*64) } } return MaxBitEntryLimit } // FirstZero returns the first unset bit from the range [start, ). 
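// An illustrative sketch (added for exposition; not part of the upstream
// source) of FirstZero together with New and Add from this package:
//
//	b := New(8) // rounded up to one 64-bit block
//	b.Add(0)
//	b.Add(1)
//	bit, err := b.FirstZero(0) // bit == 2, err == nil
//	_, _ = bit, err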
func (b *Bitmap) FirstZero(start uint32) (bit uint32, err error) { i, nbit := int(start/64), start%64 n := len(b.bitBlock) if i >= n { return MaxBitEntryLimit, fmt.Errorf("given start of range exceeds bitmap size") } w := b.bitBlock[i] | ((1 << nbit) - 1) for { if w != ^uint64(0) { r := bits.TrailingZeros64(^w) return uint32(r + i*64), nil } i++ if i == n { break } w = b.bitBlock[i] } return MaxBitEntryLimit, fmt.Errorf("bitmap has no unset bits") } // FirstOne returns the first set bit from the range [start, ) func (b *Bitmap) FirstOne(start uint32) (bit uint32, err error) { i, nbit := int(start/64), start%64 n := len(b.bitBlock) if i >= n { return MaxBitEntryLimit, fmt.Errorf("given start of range exceeds bitmap size") } w := b.bitBlock[i] & (math.MaxUint64 << nbit) for { if w != uint64(0) { r := bits.TrailingZeros64(w) return uint32(r + i*64), nil } i++ if i == n { break } w = b.bitBlock[i] } return MaxBitEntryLimit, fmt.Errorf("bitmap has no set bits") } // Maximum return the largest value in the Bitmap. func (b *Bitmap) Maximum() uint32 { for i := len(b.bitBlock) - 1; i >= 0; i-- { if w := b.bitBlock[i]; w != 0 { r := bits.LeadingZeros64(w) return uint32(i*64 + 63 - r) } } return uint32(0) } // Add add i to the Bitmap. func (b *Bitmap) Add(i uint32) { blockNum, mask := i/64, uint64(1)<<(i%64) // if blockNum is out of range, extend b.bitBlock if x, y := int(blockNum), len(b.bitBlock); x >= y { b.bitBlock = append(b.bitBlock, make([]uint64, x-y+1)...) } oldBlock := b.bitBlock[blockNum] newBlock := oldBlock | mask if oldBlock != newBlock { b.bitBlock[blockNum] = newBlock b.numOnes++ } } // Remove i from the Bitmap. func (b *Bitmap) Remove(i uint32) { blockNum, mask := i/64, uint64(1)<<(i%64) oldBlock := b.bitBlock[blockNum] newBlock := oldBlock &^ mask if oldBlock != newBlock { b.bitBlock[blockNum] = newBlock b.numOnes-- } } // Clone the Bitmap. func (b *Bitmap) Clone() Bitmap { bitmap := Bitmap{b.numOnes, make([]uint64, len(b.bitBlock))} copy(bitmap.bitBlock, b.bitBlock[:]) return bitmap } // countOnesForBlocks count all 1 bits within b.bitBlock of begin and that of end. // The begin block and end block are inclusive. func (b *Bitmap) countOnesForBlocks(begin, end uint32) uint64 { ones := uint64(0) beginBlock := begin / 64 endBlock := end / 64 for i := beginBlock; i <= endBlock; i++ { ones += uint64(bits.OnesCount64(b.bitBlock[i])) } return ones } // countOnesForAllBlocks count all 1 bits in b.bitBlock. func (b *Bitmap) countOnesForAllBlocks() uint64 { ones := uint64(0) for i := 0; i < len(b.bitBlock); i++ { ones += uint64(bits.OnesCount64(b.bitBlock[i])) } return ones } // flipRange flip the bits within range (begin and end). begin is inclusive and end is exclusive. func (b *Bitmap) flipRange(begin, end uint32) { end-- beginBlock := begin / 64 endBlock := end / 64 if beginBlock == endBlock { b.bitBlock[endBlock] ^= ((^uint64(0) << uint(begin%64)) & ((uint64(1) << (uint(end)%64 + 1)) - 1)) } else { b.bitBlock[beginBlock] ^= ^(^uint64(0) << uint(begin%64)) for i := beginBlock; i < endBlock; i++ { b.bitBlock[i] = ^b.bitBlock[i] } b.bitBlock[endBlock] ^= ((uint64(1) << (uint(end)%64 + 1)) - 1) } } // clearRange clear the bits within range (begin and end). begin is inclusive and end is exclusive. 
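// An illustrative sketch (added for exposition; not part of the upstream
// source) of the exported ClearRange wrapper defined below:
//
//	b := New(8)
//	for _, i := range []uint32{1, 2, 3} {
//		b.Add(i)
//	}
//	b.ClearRange(1, 3) // clears bits 1 and 2; end is exclusive
//	_ = b.ToSlice()    // == []uint32{3}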
func (b *Bitmap) clearRange(begin, end uint32) { end-- beginBlock := begin / 64 endBlock := end / 64 if beginBlock == endBlock { b.bitBlock[beginBlock] &= (((uint64(1) << uint(begin%64)) - 1) | ^((uint64(1) << (uint(end)%64 + 1)) - 1)) } else { b.bitBlock[beginBlock] &= ((uint64(1) << uint(begin%64)) - 1) for i := beginBlock + 1; i < endBlock; i++ { b.bitBlock[i] &= ^b.bitBlock[i] } b.bitBlock[endBlock] &= ^((uint64(1) << (uint(end)%64 + 1)) - 1) } } // ClearRange clear bits within range (begin and end) for the Bitmap. begin is inclusive and end is exclusive. func (b *Bitmap) ClearRange(begin, end uint32) { blockRange := end/64 - begin/64 // When the number of cleared blocks is larger than half of the length of b.bitBlock, // counting 1s for the entire bitmap has better performance. if blockRange > uint32(len(b.bitBlock)/2) { b.clearRange(begin, end) b.numOnes = uint32(b.countOnesForAllBlocks()) } else { oldRangeOnes := b.countOnesForBlocks(begin, end) b.clearRange(begin, end) newRangeOnes := b.countOnesForBlocks(begin, end) b.numOnes += uint32(newRangeOnes - oldRangeOnes) } } // FlipRange flip bits within range (begin and end) for the Bitmap. begin is inclusive and end is exclusive. func (b *Bitmap) FlipRange(begin, end uint32) { blockRange := end/64 - begin/64 // When the number of flipped blocks is larger than half of the length of b.bitBlock, // counting 1s for the entire bitmap has better performance. if blockRange > uint32(len(b.bitBlock)/2) { b.flipRange(begin, end) b.numOnes = uint32(b.countOnesForAllBlocks()) } else { oldRangeOnes := b.countOnesForBlocks(begin, end) b.flipRange(begin, end) newRangeOnes := b.countOnesForBlocks(begin, end) b.numOnes += uint32(newRangeOnes - oldRangeOnes) } } // Reset zeroes the entire bitmap. func (b *Bitmap) Reset() { b.numOnes = 0 clear(b.bitBlock) } // ForEach calls `f` for each set bit in the range [start, end). // // If f returns false, ForEach stops the iteration. func (b *Bitmap) ForEach(start, end uint32, f func(idx uint32) bool) { blockEnd := (end + 63) / 64 if blockEnd > uint32(len(b.bitBlock)) { blockEnd = uint32(len(b.bitBlock)) } // base is the start number of a bitBlock base := start / 64 * 64 blockMask := ^((uint64(1) << (start % 64)) - 1) for i := start / 64; i < blockEnd; i++ { if i == end/64 { blockMask &= (uint64(1) << (end % 64)) - 1 } bitBlock := b.bitBlock[i] & blockMask blockMask = ^uint64(0) // Iterate through all the numbers held by this bit block. for bitBlock != 0 { // Extract the lowest set 1 bit. j := bitBlock & -bitBlock // Interpret the bit as the in32 number it represents and add it to result. idx := base + uint32(bits.OnesCount64(j-1)) if !f(idx) { return } bitBlock ^= j } base += 64 } } // ToSlice transform the Bitmap into slice. For example, a bitmap of [0, 1, 0, 1] // will return the slice [1, 3]. func (b *Bitmap) ToSlice() []uint32 { bitmapSlice := make([]uint32, 0, b.numOnes) // base is the start number of a bitBlock base := 0 for i := 0; i < len(b.bitBlock); i++ { bitBlock := b.bitBlock[i] // Iterate through all the numbers held by this bit block. for bitBlock != 0 { // Extract the lowest set 1 bit. j := bitBlock & -bitBlock // Interpret the bit as the in32 number it represents and add it to result. bitmapSlice = append(bitmapSlice, uint32((base + int(bits.OnesCount64(j-1))))) bitBlock ^= j } base += 64 } return bitmapSlice } // GetNumOnes return the number of ones in the Bitmap. 
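// For illustration, a short sketch combining it with FlipRange (illustrative
// values only):
//
//	b := New(64)
//	b.FlipRange(0, 8)   // set bits 0-7
//	n := b.GetNumOnes() // n == 8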
func (b *Bitmap) GetNumOnes() uint32 { return b.numOnes } golang-gvisor-gvisor-0.0~20240729.0/pkg/bitmap/bitmap_state_autogen.go000066400000000000000000000013751465435605700254330ustar00rootroot00000000000000// automatically generated by stateify. package bitmap import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (b *Bitmap) StateTypeName() string { return "pkg/bitmap.Bitmap" } func (b *Bitmap) StateFields() []string { return []string{ "numOnes", "bitBlock", } } func (b *Bitmap) beforeSave() {} // +checklocksignore func (b *Bitmap) StateSave(stateSinkObject state.Sink) { b.beforeSave() stateSinkObject.Save(0, &b.numOnes) stateSinkObject.Save(1, &b.bitBlock) } func (b *Bitmap) afterLoad(context.Context) {} // +checklocksignore func (b *Bitmap) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &b.numOnes) stateSourceObject.Load(1, &b.bitBlock) } func init() { state.Register((*Bitmap)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/bits/000077500000000000000000000000001465435605700203655ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/bits/bits.go000066400000000000000000000017621465435605700216630ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package bits includes all bit related types and operations. package bits // AlignUp rounds a length up to an alignment. align must be a power of 2. func AlignUp(length int, align uint) int { return (length + int(align) - 1) & ^(int(align) - 1) } // AlignDown rounds a length down to an alignment. align must be a power of 2. func AlignDown(length int, align uint) int { return length & ^(int(align) - 1) } golang-gvisor-gvisor-0.0~20240729.0/pkg/bits/bits32.go000066400000000000000000000013251465435605700220230ustar00rootroot00000000000000package bits // IsOn returns true if *all* bits set in 'bits' are set in 'mask'. func IsOn32(mask, bits uint32) bool { return mask&bits == bits } // IsAnyOn returns true if *any* bit set in 'bits' is set in 'mask'. func IsAnyOn32(mask, bits uint32) bool { return mask&bits != 0 } // Mask returns a T with all of the given bits set. func Mask32(is ...int) uint32 { ret := uint32(0) for _, i := range is { ret |= MaskOf32(i) } return ret } // MaskOf is like Mask, but sets only a single bit (more efficiently). func MaskOf32(i int) uint32 { return uint32(1) << uint32(i) } // IsPowerOfTwo returns true if v is power of 2. func IsPowerOfTwo32(v uint32) bool { if v == 0 { return false } return v&(v-1) == 0 } golang-gvisor-gvisor-0.0~20240729.0/pkg/bits/bits64.go000066400000000000000000000013251465435605700220300ustar00rootroot00000000000000package bits // IsOn returns true if *all* bits set in 'bits' are set in 'mask'. func IsOn64(mask, bits uint64) bool { return mask&bits == bits } // IsAnyOn returns true if *any* bit set in 'bits' is set in 'mask'. func IsAnyOn64(mask, bits uint64) bool { return mask&bits != 0 } // Mask returns a T with all of the given bits set. 
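// For example (illustrative values):
//
//	Mask64(0, 3)               // == 0x9
//	IsOn64(0xff, Mask64(0, 3)) // == true: bits 0 and 3 are both set in 0xff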
func Mask64(is ...int) uint64 { ret := uint64(0) for _, i := range is { ret |= MaskOf64(i) } return ret } // MaskOf is like Mask, but sets only a single bit (more efficiently). func MaskOf64(i int) uint64 { return uint64(1) << uint64(i) } // IsPowerOfTwo returns true if v is power of 2. func IsPowerOfTwo64(v uint64) bool { if v == 0 { return false } return v&(v-1) == 0 } golang-gvisor-gvisor-0.0~20240729.0/pkg/bits/bits_state_autogen.go000066400000000000000000000002371465435605700246010ustar00rootroot00000000000000// automatically generated by stateify. //go:build (amd64 || arm64) && !amd64 && !arm64 // +build amd64 arm64 // +build !amd64 // +build !arm64 package bits golang-gvisor-gvisor-0.0~20240729.0/pkg/bits/uint64_arch.go000066400000000000000000000024141465435605700230430ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || arm64 // +build amd64 arm64 package bits // TrailingZeros64 returns the number of bits before the least significant 1 // bit in x; in other words, it returns the index of the least significant 1 // bit in x. If x is 0, TrailingZeros64 returns 64. func TrailingZeros64(x uint64) int // MostSignificantOne64 returns the index of the most significant 1 bit in // x. If x is 0, MostSignificantOne64 returns 64. func MostSignificantOne64(x uint64) int // ForEachSetBit64 calls f once for each set bit in x, with argument i equal to // the set bit's index. func ForEachSetBit64(x uint64, f func(i int)) { for x != 0 { i := TrailingZeros64(x) f(i) x &^= MaskOf64(i) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/bits/uint64_arch_amd64_asm.s000066400000000000000000000015411465435605700245330ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 TEXT ·TrailingZeros64(SB),$0-16 BSFQ x+0(FP), AX JNZ end MOVQ $64, AX end: MOVQ AX, ret+8(FP) RET TEXT ·MostSignificantOne64(SB),$0-16 BSRQ x+0(FP), AX JNZ end MOVQ $64, AX end: MOVQ AX, ret+8(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/bits/uint64_arch_arm64_asm.s000066400000000000000000000017611465435605700245550ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 TEXT ·TrailingZeros64(SB),$0-16 MOVD x+0(FP), R0 RBIT R0, R0 CLZ R0, R0 // return 64 if x == 0 MOVD R0, ret+8(FP) RET TEXT ·MostSignificantOne64(SB),$0-16 MOVD x+0(FP), R0 CLZ R0, R0 // return 64 if x == 0 MOVD $63, R1 SUBS R0, R1, R0 // ret = 63 - CLZ BPL end MOVD $64, R0 // x == 0 end: MOVD R0, ret+8(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/bits/uint64_arch_generic.go000066400000000000000000000027011465435605700245360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !amd64 && !arm64 // +build !amd64,!arm64 package bits // TrailingZeros64 returns the number of bits before the least significant 1 // bit in x; in other words, it returns the index of the least significant 1 // bit in x. If x is 0, TrailingZeros64 returns 64. func TrailingZeros64(x uint64) int { if x == 0 { return 64 } i := 0 for ; x&1 == 0; i++ { x >>= 1 } return i } // MostSignificantOne64 returns the index of the most significant 1 bit in // x. If x is 0, MostSignificantOne64 returns 64. func MostSignificantOne64(x uint64) int { if x == 0 { return 64 } i := 63 for ; x&(1<<63) == 0; i-- { x <<= 1 } return i } // ForEachSetBit64 calls f once for each set bit in x, with argument i equal to // the set bit's index. func ForEachSetBit64(x uint64, f func(i int)) { for i := 0; x != 0; i++ { if x&1 != 0 { f(i) } x >>= 1 } } golang-gvisor-gvisor-0.0~20240729.0/pkg/bpf/000077500000000000000000000000001465435605700201735ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/bpf/bpf.go000066400000000000000000000201501465435605700212670ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package bpf provides tools for working with Berkeley Packet Filter (BPF) // programs. More information on BPF can be found at // https://www.freebsd.org/cgi/man.cgi?bpf(4) package bpf import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" ) const ( // MaxInstructions is the maximum number of instructions in a BPF program, // and is equal to Linux's BPF_MAXINSNS. 
MaxInstructions = 4096 // ScratchMemRegisters is the number of M registers in a BPF virtual machine, // and is equal to Linux's BPF_MEMWORDS. ScratchMemRegisters = 16 ) // Parts of a linux.BPFInstruction.OpCode. Compare to the Linux kernel's // include/uapi/linux/filter.h. // // In the comments below: // // - A, X, and M[] are BPF virtual machine registers. // // - K refers to the instruction field linux.BPFInstruction.K. // // - Bits are counted from the LSB position. const ( // Instruction class, stored in bits 0-2. Ld = 0x00 // load into A Ldx = 0x01 // load into X St = 0x02 // store from A Stx = 0x03 // store from X Alu = 0x04 // arithmetic Jmp = 0x05 // jump Ret = 0x06 // return Misc = 0x07 instructionClassMask = 0x07 // Size of a load, stored in bits 3-4. W = 0x00 // 32 bits H = 0x08 // 16 bits B = 0x10 // 8 bits loadSizeMask = 0x18 // Source operand for a load, stored in bits 5-7. // Address mode numbers in the comments come from Linux's // Documentation/networking/filter.txt. Imm = 0x00 // immediate value K (mode 4) Abs = 0x20 // data in input at byte offset K (mode 1) Ind = 0x40 // data in input at byte offset X+K (mode 2) Mem = 0x60 // M[K] (mode 3) Len = 0x80 // length of the input in bytes ("BPF extension len") Msh = 0xa0 // 4 * lower nibble of input at byte offset K (mode 5) loadModeMask = 0xe0 // Source operands for arithmetic, jump, and return instructions. // Arithmetic and jump instructions can use K or X as source operands. // Return instructions can use K or A as source operands. K = 0x00 // still mode 4 X = 0x08 // mode 0 A = 0x10 // mode 9 operandMask = K | X | A srcAluJmpMask = 0x08 srcRetMask = 0x18 // Arithmetic instructions, stored in bits 4-7. Add = 0x00 Sub = 0x10 // A - src Mul = 0x20 Div = 0x30 // A / src Or = 0x40 And = 0x50 Lsh = 0x60 // A << src Rsh = 0x70 // A >> src Neg = 0x80 // -A (src ignored) Mod = 0x90 // A % src Xor = 0xa0 aluMask = 0xf0 // Jump instructions, stored in bits 4-7. Ja = 0x00 // unconditional (uses K for jump offset) Jeq = 0x10 // if A == src Jgt = 0x20 // if A > src Jge = 0x30 // if A >= src Jset = 0x40 // if (A & src) != 0 jmpMask = 0xf0 // Miscellaneous instructions, stored in bits 3-7. Tax = 0x00 // A = X Txa = 0x80 // X = A miscMask = 0xf8 // Masks for bits that should be zero. unusedBitsMask = 0xff00 // all valid instructions use only bits 0-7 storeUnusedBitsMask = 0xf8 // stores only use instruction class retUnusedBitsMask = 0xe0 // returns only use instruction class and source operand ) // Instruction is a type alias for linux.BPFInstruction. // It adds a human-readable stringification and other helper functions. // // +marshal slice:InstructionSlice // +stateify savable // +stateify identtype type Instruction linux.BPFInstruction // String returns a human-readable version of the instruction. func (ins *Instruction) String() string { s, err := Decode(*ins) if err != nil { return fmt.Sprintf("[invalid %v: %v]", (*linux.BPFInstruction)(ins), err) } return s } // Stmt returns an Instruction representing a BPF non-jump instruction. func Stmt(code uint16, k uint32) Instruction { return Instruction{ OpCode: code, K: k, } } // Jump returns an Instruction representing a BPF jump instruction. func Jump(code uint16, k uint32, jt, jf uint8) Instruction { return Instruction{ OpCode: code, JumpIfTrue: jt, JumpIfFalse: jf, K: k, } } // Equal returns whether this instruction is equivalent to `other`. 
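// For example (illustrative comparisons):
//
//	Stmt(Ld|Len|W, 0).Equal(Stmt(Ld|Len|W, 7)) // true: length loads ignore K
//	Stmt(Ld|Abs|W, 0).Equal(Stmt(Ld|Abs|W, 4)) // false: different load offsets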
func (ins Instruction) Equal(other Instruction) bool { if ins.OpCode != other.OpCode { // If instructions don't have the same opcode, they are not equal. return false } switch ins.OpCode & instructionClassMask { case Ld, Ldx: if ins.OpCode&loadModeMask == Len { // Length instructions are independent of the K register. return true } // Two load instructions are the same if they load from the same offset. return ins.K == other.K case St, Stx: // Two store instructions are the same if they store at the same offset. return ins.K == other.K case Alu: if ins.OpCode == Alu|Neg { return true // The negation instruction has no operands. } if ins.OpCode&operandMask == X { // If we use X, no need to check anything. return true } if ins.OpCode&operandMask == K { // If use K, check that it's the same. return ins.K == other.K } // Otherwise, we use the whole instruction. case Ret: switch ins.OpCode { case Ret | A: // All instructions that return the A register are equivalent. return true case Ret | K: // All instructions that return the same value are equivalent. return ins.K == other.K } case Jmp: if ins.IsUnconditionalJump() { // Unconditional jumps to the same offset are equivalent. return ins.K == other.K } if ins.OpCode&operandMask == X { // If we use X as the operand, check the conditional jump targets only. return ins.JumpIfTrue == other.JumpIfTrue && ins.JumpIfFalse == other.JumpIfFalse } // Otherwise, we use the whole instruction. case Misc: if ins.OpCode == Misc|Tax || ins.OpCode == Misc|Txa { // Swapping X and A, we don't care about the other fields. return true } } // All other instructions need full bit-for-bit comparison. return ins == other } // IsReturn returns true if `ins` is a return instruction. func (ins Instruction) IsReturn() bool { return ins.OpCode&instructionClassMask == Ret } // IsJump returns true if `ins` is a jump instruction. func (ins Instruction) IsJump() bool { return ins.OpCode&instructionClassMask == Jmp } // IsConditionalJump returns true if `ins` is a conditional jump instruction. func (ins Instruction) IsConditionalJump() bool { return ins.IsJump() && ins.OpCode&jmpMask != Ja } // IsUnconditionalJump returns true if `ins` is a conditional jump instruction. func (ins Instruction) IsUnconditionalJump() bool { return ins.IsJump() && ins.OpCode&jmpMask == Ja } // JumpOffset is a possible jump offset that an instruction may jump to. type JumpOffset struct { // Type is the type of jump that an instruction may execute. Type JumpType // Offset is the number of instructions that the jump skips over. Offset uint32 } // JumpOffsets returns the set of instruction offsets that this instruction // may jump to. Returns a nil slice if this is not a jump instruction. func (ins Instruction) JumpOffsets() []JumpOffset { if !ins.IsJump() { return nil } if ins.IsConditionalJump() { return []JumpOffset{ {JumpTrue, uint32(ins.JumpIfTrue)}, {JumpFalse, uint32(ins.JumpIfFalse)}, } } return []JumpOffset{{JumpDirect, ins.K}} } // ModifiesRegisterA returns true iff this instruction modifies the value // of the "A" register. func (ins Instruction) ModifiesRegisterA() bool { switch ins.OpCode & instructionClassMask { case Ld: return true case Alu: return true case Misc: return ins.OpCode == Misc|Tax default: return false } } golang-gvisor-gvisor-0.0~20240729.0/pkg/bpf/bpf_state_autogen.go000066400000000000000000000023651465435605700242210ustar00rootroot00000000000000// automatically generated by stateify. 
package bpf import ( "context" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/state" ) func (ins *Instruction) StateTypeName() string { return "pkg/bpf.Instruction" } func (ins *Instruction) StateFields() []string { return (*linux.BPFInstruction)(ins).StateFields() } // +checklocksignore func (ins *Instruction) StateSave(stateSinkObject state.Sink) { (*linux.BPFInstruction)(ins).StateSave(stateSinkObject) } // +checklocksignore func (ins *Instruction) StateLoad(ctx context.Context, stateSourceObject state.Source) { (*linux.BPFInstruction)(ins).StateLoad(ctx, stateSourceObject) } func (p *Program) StateTypeName() string { return "pkg/bpf.Program" } func (p *Program) StateFields() []string { return []string{ "instructions", } } func (p *Program) beforeSave() {} // +checklocksignore func (p *Program) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.instructions) } func (p *Program) afterLoad(context.Context) {} // +checklocksignore func (p *Program) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.instructions) } func init() { state.Register((*Instruction)(nil)) state.Register((*Program)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/bpf/bpf_unsafe.go000066400000000000000000000031151465435605700226320ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package bpf import ( "fmt" "unsafe" ) // sizeOfInstruction is the size of a BPF instruction struct. const sizeOfInstruction = int(unsafe.Sizeof(Instruction{})) // ToBytecode converts BPF instructions into raw BPF bytecode. func ToBytecode(insns []Instruction) []byte { return ([]byte)(unsafe.Slice((*byte)(unsafe.Pointer(&insns[0])), len(insns)*sizeOfInstruction)) } // ParseBytecode converts raw BPF bytecode into BPF instructions. // It verifies that the resulting set of instructions is a valid program. func ParseBytecode(bytecode []byte) ([]Instruction, error) { if len(bytecode)%sizeOfInstruction != 0 { return nil, fmt.Errorf("bytecode size (%d bytes) is not a multiple of BPF instruction size of %d bytes", len(bytecode), sizeOfInstruction) } insns := ([]Instruction)(unsafe.Slice((*Instruction)(unsafe.Pointer(&bytecode[0])), len(bytecode)/sizeOfInstruction)) if _, err := Compile(insns, false); err != nil { return nil, fmt.Errorf("not a valid BPF program: %v", err) } return insns, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/bpf/bpf_unsafe_state_autogen.go000066400000000000000000000000651465435605700255550ustar00rootroot00000000000000// automatically generated by stateify. package bpf golang-gvisor-gvisor-0.0~20240729.0/pkg/bpf/decoder.go000066400000000000000000000134601465435605700221330ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package bpf import ( "bytes" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" ) // DecodeProgram translates a compiled BPF program into text format. func DecodeProgram(p Program) (string, error) { return DecodeInstructions(p.instructions) } // DecodeInstructions translates an array of BPF instructions into text format. func DecodeInstructions(instns []Instruction) (string, error) { var ret bytes.Buffer for line, s := range instns { ret.WriteString(fmt.Sprintf("%v: ", line)) if err := decode(s, line, &ret); err != nil { return "", err } ret.WriteString("\n") } return ret.String(), nil } // Decode translates a single BPF instruction into text format. func Decode(ins Instruction) (string, error) { var ret bytes.Buffer err := decode(ins, -1, &ret) return ret.String(), err } func decode(inst Instruction, line int, w *bytes.Buffer) error { var err error switch inst.OpCode & instructionClassMask { case Ld: err = decodeLd(inst, w) case Ldx: err = decodeLdx(inst, w) case St: w.WriteString(fmt.Sprintf("M[%v] <- A", inst.K)) case Stx: w.WriteString(fmt.Sprintf("M[%v] <- X", inst.K)) case Alu: err = decodeAlu(inst, w) case Jmp: err = decodeJmp(inst, line, w) case Ret: err = decodeRet(inst, w) case Misc: err = decodeMisc(inst, w) default: return fmt.Errorf("invalid BPF instruction: %v", linux.BPFInstruction(inst)) } return err } // A <- P[k:4] func decodeLd(inst Instruction, w *bytes.Buffer) error { w.WriteString("A <- ") switch inst.OpCode & loadModeMask { case Imm: w.WriteString(fmt.Sprintf("%v", inst.K)) case Abs: w.WriteString(fmt.Sprintf("P[%v:", inst.K)) if err := decodeLdSize(inst, w); err != nil { return err } w.WriteString("]") case Ind: w.WriteString(fmt.Sprintf("P[X+%v:", inst.K)) if err := decodeLdSize(inst, w); err != nil { return err } w.WriteString("]") case Mem: w.WriteString(fmt.Sprintf("M[%v]", inst.K)) case Len: w.WriteString("len") default: return fmt.Errorf("invalid BPF LD instruction: %v", linux.BPFInstruction(inst)) } return nil } func decodeLdSize(inst Instruction, w *bytes.Buffer) error { switch inst.OpCode & loadSizeMask { case W: w.WriteString("4") case H: w.WriteString("2") case B: w.WriteString("1") default: return fmt.Errorf("invalid BPF LD size: %v", linux.BPFInstruction(inst)) } return nil } // X <- P[k:4] func decodeLdx(inst Instruction, w *bytes.Buffer) error { w.WriteString("X <- ") switch inst.OpCode & loadModeMask { case Imm: w.WriteString(fmt.Sprintf("%v", inst.K)) case Mem: w.WriteString(fmt.Sprintf("M[%v]", inst.K)) case Len: w.WriteString("len") case Msh: w.WriteString(fmt.Sprintf("4*(P[%v:1]&0xf)", inst.K)) default: return fmt.Errorf("invalid BPF LDX instruction: %v", linux.BPFInstruction(inst)) } return nil } // A <- A + k func decodeAlu(inst Instruction, w *bytes.Buffer) error { code := inst.OpCode & aluMask if code == Neg { w.WriteString("A <- -A") return nil } w.WriteString("A <- A ") switch code { case Add: w.WriteString("+ ") case Sub: w.WriteString("- ") case Mul: w.WriteString("* ") case Div: w.WriteString("/ ") case Or: w.WriteString("| ") case And: w.WriteString("& ") case Lsh: w.WriteString("<< ") case Rsh: w.WriteString(">> ") case Mod: 
w.WriteString("% ") case Xor: w.WriteString("^ ") default: return fmt.Errorf("invalid BPF ALU instruction: %v", linux.BPFInstruction(inst)) } return decodeSource(inst, w) } func decodeSource(inst Instruction, w *bytes.Buffer) error { switch inst.OpCode & srcAluJmpMask { case K: w.WriteString(fmt.Sprintf("%v", inst.K)) case X: w.WriteString("X") default: return fmt.Errorf("invalid BPF ALU/JMP source instruction: %v", linux.BPFInstruction(inst)) } return nil } // pc += (A > k) ? jt : jf func decodeJmp(inst Instruction, line int, w *bytes.Buffer) error { code := inst.OpCode & jmpMask w.WriteString("pc += ") if code == Ja { w.WriteString(printJmpTarget(inst.K, line)) } else { w.WriteString("(A ") switch code { case Jeq: w.WriteString("== ") case Jgt: w.WriteString("> ") case Jge: w.WriteString(">= ") case Jset: w.WriteString("& ") default: return fmt.Errorf("invalid BPF ALU instruction: %v", linux.BPFInstruction(inst)) } if err := decodeSource(inst, w); err != nil { return err } w.WriteString( fmt.Sprintf(") ? %s : %s", printJmpTarget(uint32(inst.JumpIfTrue), line), printJmpTarget(uint32(inst.JumpIfFalse), line))) } return nil } func printJmpTarget(target uint32, line int) string { if line == -1 { return fmt.Sprintf("%v", target) } return fmt.Sprintf("%v [%v]", target, int(target)+line+1) } // ret k func decodeRet(inst Instruction, w *bytes.Buffer) error { w.WriteString("ret ") code := inst.OpCode & srcRetMask switch code { case K: w.WriteString(fmt.Sprintf("%v", inst.K)) case A: w.WriteString("A") default: return fmt.Errorf("invalid BPF RET source instruction: %v", linux.BPFInstruction(inst)) } return nil } func decodeMisc(inst Instruction, w *bytes.Buffer) error { code := inst.OpCode & miscMask switch code { case Tax: w.WriteString("X <- A") case Txa: w.WriteString("A <- X") default: return fmt.Errorf("invalid BPF ALU/JMP source instruction: %v", linux.BPFInstruction(inst)) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/bpf/input_bytes.go000066400000000000000000000060221465435605700230670ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package bpf import ( "encoding/binary" ) // Input represents a source of input data for a BPF program. (BPF // documentation sometimes refers to the input data as the "packet" due to its // origins as a packet processing DSL.) // Unaligned loads are supported. type Input []byte // These type definitions must have different GC shapes to ensure that // the Go compiler generates distinct code paths for them. // These do not have anything to do with the bit sizes of the loads // later on; all that matters is that these types have distinct sizes // from one another. type ( // BigEndian uses big-endian byte ordering. BigEndian uint8 // LittleEndian uses little-endian byte ordering. LittleEndian uint16 // NativeEndian uses native byte ordering. NativeEndian uint32 ) // Endianness represents a byte order. 
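// For example, seccomp(2) filter programs read their input (struct
// seccomp_data) in the host's native byte order, so such programs are
// typically executed as Exec[NativeEndian](p, in).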
type Endianness interface { BigEndian | LittleEndian | NativeEndian } // load32 loads a 32-bit value. // //go:nosplit func load32[endian Endianness](in Input, off uint32) (uint32, bool) { if uint64(off)+4 > uint64(len(in)) { return 0, false } // Casting to any is needed here to avoid a compilation error: // https://go.googlesource.com/proposal/+/refs/heads/master/design/43651-type-parameters.md#why-not-permit-type-assertions-on-values-whose-type-is-a-type-parameter var e endian switch any(e).(type) { case BigEndian: return binary.BigEndian.Uint32(in[int(off):]), true case LittleEndian: return binary.LittleEndian.Uint32(in[int(off):]), true case NativeEndian: return binary.NativeEndian.Uint32(in[int(off):]), true default: panic("unreachable") } } // load16 loads a 16-bit value. // //go:nosplit func load16[endian Endianness](in Input, off uint32) (uint16, bool) { if uint64(off)+2 > uint64(len(in)) { return 0, false } // Casting to any is needed here to avoid a compilation error: // https://go.googlesource.com/proposal/+/refs/heads/master/design/43651-type-parameters.md#why-not-permit-type-assertions-on-values-whose-type-is-a-type-parameter var e endian switch any(e).(type) { case BigEndian: return binary.BigEndian.Uint16(in[int(off):]), true case LittleEndian: return binary.LittleEndian.Uint16(in[int(off):]), true case NativeEndian: return binary.NativeEndian.Uint16(in[int(off):]), true default: panic("unreachable") } } // load8 loads a single byte. // //go:nosplit func load8(in Input, off uint32) (uint8, bool) { if uint64(off)+1 > uint64(len(in)) { return 0, false } return in[int(off)], true } golang-gvisor-gvisor-0.0~20240729.0/pkg/bpf/interpreter.go000066400000000000000000000415541465435605700230760ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package bpf import ( "fmt" "strconv" "strings" ) // Possible values for ProgramError.Code. const ( // DivisionByZero indicates that a program contains, or executed, a // division or modulo by zero. DivisionByZero = iota // InvalidEndOfProgram indicates that the last instruction of a program is // not a return. InvalidEndOfProgram // InvalidInstructionCount indicates that a program has zero instructions // or more than MaxInstructions instructions. InvalidInstructionCount // InvalidJumpTarget indicates that a program contains a jump whose target // is outside of the program's bounds. InvalidJumpTarget // InvalidLoad indicates that a program executed an invalid load of input // data. InvalidLoad // InvalidOpcode indicates that a program contains an instruction with an // invalid opcode. InvalidOpcode // InvalidRegister indicates that a program contains a load from, or store // to, a non-existent M register (index >= ScratchMemRegisters). InvalidRegister ) // Error is an error encountered while compiling or executing a BPF program. type Error struct { // Code indicates the kind of error that occurred. 
Code int // PC is the program counter (index into the list of instructions) at which // the error occurred. PC int } func (e Error) codeString() string { switch e.Code { case DivisionByZero: return "division by zero" case InvalidEndOfProgram: return "last instruction must be a return" case InvalidInstructionCount: return "invalid number of instructions" case InvalidJumpTarget: return "jump target out of bounds" case InvalidLoad: return "load out of bounds or violates input alignment requirements" case InvalidOpcode: return "invalid instruction opcode" case InvalidRegister: return "invalid M register" default: return "unknown error" } } // Error implements error.Error. func (e Error) Error() string { return fmt.Sprintf("at l%d: %s", e.PC, e.codeString()) } // Program is a BPF program that has been validated for consistency. // // +stateify savable type Program struct { instructions []Instruction } // Length returns the number of instructions in the program. func (p Program) Length() int { return len(p.instructions) } // Compile performs validation and optimization on a sequence of BPF // instructions before wrapping them in a Program. func Compile(insns []Instruction, optimize bool) (Program, error) { if len(insns) == 0 || len(insns) > MaxInstructions { return Program{}, Error{InvalidInstructionCount, len(insns)} } // The last instruction must be a return. if last := insns[len(insns)-1]; last.OpCode != (Ret|K) && last.OpCode != (Ret|A) { return Program{}, Error{InvalidEndOfProgram, len(insns) - 1} } // Validate each instruction. Note that we skip a validation Linux does: // Linux additionally verifies that every load from an M register is // preceded, in every path, by a store to the same M register, in order to // avoid having to clear M between programs // (net/core/filter.c:check_load_and_stores). We always start with a zeroed // M array. for pc, i := range insns { if i.OpCode&unusedBitsMask != 0 { return Program{}, Error{InvalidOpcode, pc} } switch i.OpCode & instructionClassMask { case Ld: mode := i.OpCode & loadModeMask switch i.OpCode & loadSizeMask { case W: if mode != Imm && mode != Abs && mode != Ind && mode != Mem && mode != Len { return Program{}, Error{InvalidOpcode, pc} } if mode == Mem && i.K >= ScratchMemRegisters { return Program{}, Error{InvalidRegister, pc} } case H, B: if mode != Abs && mode != Ind { return Program{}, Error{InvalidOpcode, pc} } default: return Program{}, Error{InvalidOpcode, pc} } case Ldx: mode := i.OpCode & loadModeMask switch i.OpCode & loadSizeMask { case W: if mode != Imm && mode != Mem && mode != Len { return Program{}, Error{InvalidOpcode, pc} } if mode == Mem && i.K >= ScratchMemRegisters { return Program{}, Error{InvalidRegister, pc} } case B: if mode != Msh { return Program{}, Error{InvalidOpcode, pc} } default: return Program{}, Error{InvalidOpcode, pc} } case St, Stx: if i.OpCode&storeUnusedBitsMask != 0 { return Program{}, Error{InvalidOpcode, pc} } if i.K >= ScratchMemRegisters { return Program{}, Error{InvalidRegister, pc} } case Alu: switch i.OpCode & aluMask { case Add, Sub, Mul, Or, And, Lsh, Rsh, Xor: break case Div, Mod: if src := i.OpCode & srcAluJmpMask; src == K && i.K == 0 { return Program{}, Error{DivisionByZero, pc} } case Neg: // Negation doesn't take a source operand. if i.OpCode&srcAluJmpMask != 0 { return Program{}, Error{InvalidOpcode, pc} } default: return Program{}, Error{InvalidOpcode, pc} } case Jmp: switch i.OpCode & jmpMask { case Ja: // Unconditional jump doesn't take a source operand. 
if i.OpCode&srcAluJmpMask != 0 { return Program{}, Error{InvalidOpcode, pc} } // Do the comparison in 64 bits to avoid the possibility of // overflow from a very large i.K. if uint64(pc)+uint64(i.K)+1 >= uint64(len(insns)) { return Program{}, Error{InvalidJumpTarget, pc} } case Jeq, Jgt, Jge, Jset: // jt and jf are uint16s, so there's no threat of overflow. if pc+int(i.JumpIfTrue)+1 >= len(insns) { return Program{}, Error{InvalidJumpTarget, pc} } if pc+int(i.JumpIfFalse)+1 >= len(insns) { return Program{}, Error{InvalidJumpTarget, pc} } default: return Program{}, Error{InvalidOpcode, pc} } case Ret: if i.OpCode&retUnusedBitsMask != 0 { return Program{}, Error{InvalidOpcode, pc} } if src := i.OpCode & srcRetMask; src != K && src != A { return Program{}, Error{InvalidOpcode, pc} } case Misc: if misc := i.OpCode & miscMask; misc != Tax && misc != Txa { return Program{}, Error{InvalidOpcode, pc} } } } if optimize { insns = Optimize(insns) } return Program{insns}, nil } // machine represents the state of a BPF virtual machine. type machine struct { A uint32 X uint32 M [ScratchMemRegisters]uint32 } func conditionalJumpOffset(insn Instruction, cond bool) int { if cond { return int(insn.JumpIfTrue) } return int(insn.JumpIfFalse) } // Exec executes a BPF program over the given input and returns its return // value. func Exec[endian Endianness](p Program, in Input) (uint32, error) { var m machine var pc int for ; pc < len(p.instructions); pc++ { i := p.instructions[pc] switch i.OpCode { case Ld | Imm | W: m.A = i.K case Ld | Abs | W: val, ok := load32[endian](in, i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.A = val case Ld | Abs | H: val, ok := load16[endian](in, i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.A = uint32(val) case Ld | Abs | B: val, ok := load8(in, i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.A = uint32(val) case Ld | Ind | W: val, ok := load32[endian](in, m.X+i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.A = val case Ld | Ind | H: val, ok := load16[endian](in, m.X+i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.A = uint32(val) case Ld | Ind | B: val, ok := load8(in, m.X+i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.A = uint32(val) case Ld | Mem | W: m.A = m.M[int(i.K)] case Ld | Len | W: m.A = uint32(len(in)) case Ldx | Imm | W: m.X = i.K case Ldx | Mem | W: m.X = m.M[int(i.K)] case Ldx | Len | W: m.X = uint32(len(in)) case Ldx | Msh | B: val, ok := load8(in, i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.X = 4 * uint32(val&0xf) case St: m.M[int(i.K)] = m.A case Stx: m.M[int(i.K)] = m.X case Alu | Add | K: m.A += i.K case Alu | Add | X: m.A += m.X case Alu | Sub | K: m.A -= i.K case Alu | Sub | X: m.A -= m.X case Alu | Mul | K: m.A *= i.K case Alu | Mul | X: m.A *= m.X case Alu | Div | K: // K != 0 already checked by Compile. m.A /= i.K case Alu | Div | X: if m.X == 0 { return 0, Error{DivisionByZero, pc} } m.A /= m.X case Alu | Or | K: m.A |= i.K case Alu | Or | X: m.A |= m.X case Alu | And | K: m.A &= i.K case Alu | And | X: m.A &= m.X case Alu | Lsh | K: m.A <<= i.K case Alu | Lsh | X: m.A <<= m.X case Alu | Rsh | K: m.A >>= i.K case Alu | Rsh | X: m.A >>= m.X case Alu | Neg: m.A = uint32(-int32(m.A)) case Alu | Mod | K: // K != 0 already checked by Compile. 
m.A %= i.K case Alu | Mod | X: if m.X == 0 { return 0, Error{DivisionByZero, pc} } m.A %= m.X case Alu | Xor | K: m.A ^= i.K case Alu | Xor | X: m.A ^= m.X case Jmp | Ja: pc += int(i.K) case Jmp | Jeq | K: pc += conditionalJumpOffset(i, m.A == i.K) case Jmp | Jeq | X: pc += conditionalJumpOffset(i, m.A == m.X) case Jmp | Jgt | K: pc += conditionalJumpOffset(i, m.A > i.K) case Jmp | Jgt | X: pc += conditionalJumpOffset(i, m.A > m.X) case Jmp | Jge | K: pc += conditionalJumpOffset(i, m.A >= i.K) case Jmp | Jge | X: pc += conditionalJumpOffset(i, m.A >= m.X) case Jmp | Jset | K: pc += conditionalJumpOffset(i, (m.A&i.K) != 0) case Jmp | Jset | X: pc += conditionalJumpOffset(i, (m.A&m.X) != 0) case Ret | K: return i.K, nil case Ret | A: return m.A, nil case Misc | Tax: m.A = m.X case Misc | Txa: m.X = m.A default: return 0, Error{InvalidOpcode, pc} } } return 0, Error{InvalidEndOfProgram, pc} } // ExecutionMetrics represents the result of executing a BPF program. type ExecutionMetrics struct { // ReturnValue is the result of the program execution. ReturnValue uint32 // Coverage maps instruction indexes to whether or not they were executed. // This slice has the same size as the number of instructions as the BPF // program that was run, so it can be used as a way to get the program size. // Since an instruction can never run twice in BPF, this can also be used // to determine how many instructions were executed. Coverage []bool // InputAccessed maps input byte offsets to whether or not they were // read by the program during execution. InputAccessed []bool } // String returns a human-readable view of an `Execution`. func (e *ExecutionMetrics) String() string { type intRange struct { from, to int } // addRangeString formats an `intRange` and writes it to `sb`. addRangeString := func(sb *strings.Builder, rng intRange) { if rng.from == rng.to { sb.WriteString(strconv.Itoa(rng.from)) } else { sb.WriteString(strconv.Itoa(rng.from)) sb.WriteRune('-') sb.WriteString(strconv.Itoa(rng.to)) } } // `getRanges` takes a slice of booleans and returns ranges of all-true // indexes. getRanges := func(s []bool) []intRange { var ranges []intRange firstTrueIndex := -1 for i, covered := range s { if covered { if firstTrueIndex == -1 { firstTrueIndex = i } continue } if firstTrueIndex != -1 { ranges = append(ranges, intRange{firstTrueIndex, i - 1}) firstTrueIndex = -1 } } if firstTrueIndex != -1 { ranges = append(ranges, intRange{firstTrueIndex, len(s) - 1}) } return ranges } // ranges returns a human-friendly representation of the // ranges of items in `s` that are contiguously `true`. ranges := func(s []bool) string { if len(s) == 0 { return "empty" } allFalse := true allTrue := true for _, v := range s { if v { allFalse = false } else { allTrue = false } } if allFalse { return "none" } if allTrue { return "all" } ranges := getRanges(s) var sb strings.Builder for i, rng := range ranges { if i != 0 { sb.WriteRune(',') } addRangeString(&sb, rng) } return sb.String() } executedInstructions := 0 for _, covered := range e.Coverage { if covered { executedInstructions++ } } return fmt.Sprintf("returned %d, covered %d/%d instructions (%s), read input bytes %s (%d total input bytes)", e.ReturnValue, executedInstructions, len(e.Coverage), ranges(e.Coverage), ranges(e.InputAccessed), len(e.InputAccessed)) } // markInputRead marks the `bytesRead` bytes starting at `offset` as having // been read from the input. This function assumes that the offset and number // of bytes have already been verified as valid. 
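
// exampleRunOnInput is a hypothetical helper (illustrative only, not part of
// this package's API) sketching the usual Compile/Exec flow: build a tiny
// program that returns the input length, compile it with optimizations
// enabled, and run it over raw input in native byte order.
func exampleRunOnInput(in Input) (uint32, error) {
	p, err := Compile([]Instruction{
		Stmt(Ld|Len|W, 0), // A <- len(input)
		Stmt(Ret|A, 0),    // return A
	}, true)
	if err != nil {
		return 0, err
	}
	return Exec[NativeEndian](p, in)
}
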
func (e *ExecutionMetrics) markInputRead(offset uint32, bytesRead int) { if int(offset)+bytesRead > len(e.InputAccessed) { panic(fmt.Sprintf("invalid offset or number of bytes read: offset=%d bytesRead=%d len=%d", offset, bytesRead, len(e.InputAccessed))) } for i := 0; i < bytesRead; i++ { e.InputAccessed[int(offset)+i] = true } } // InstrumentedExec executes a BPF program over the given input while // instrumenting it: recording memory accesses and lines executed. // This is slower than Exec, but should return equivalent results. func InstrumentedExec[endian Endianness](p Program, in Input) (ExecutionMetrics, error) { ret := ExecutionMetrics{ Coverage: make([]bool, len(p.instructions)), InputAccessed: make([]bool, len(in)), } var m machine var pc int for ; pc < len(p.instructions); pc++ { ret.Coverage[pc] = true i := p.instructions[pc] switch i.OpCode { case Ld | Imm | W: m.A = i.K case Ld | Abs | W: val, ok := load32[endian](in, i.K) if !ok { return ret, Error{InvalidLoad, pc} } ret.markInputRead(i.K, 4) m.A = val case Ld | Abs | H: val, ok := load16[endian](in, i.K) if !ok { return ret, Error{InvalidLoad, pc} } ret.markInputRead(i.K, 2) m.A = uint32(val) case Ld | Abs | B: val, ok := load8(in, i.K) if !ok { return ret, Error{InvalidLoad, pc} } ret.markInputRead(i.K, 1) m.A = uint32(val) case Ld | Ind | W: val, ok := load32[endian](in, m.X+i.K) if !ok { return ret, Error{InvalidLoad, pc} } ret.markInputRead(m.X+i.K, 4) m.A = val case Ld | Ind | H: val, ok := load16[endian](in, m.X+i.K) if !ok { return ret, Error{InvalidLoad, pc} } ret.markInputRead(m.X+i.K, 2) m.A = uint32(val) case Ld | Ind | B: val, ok := load8(in, m.X+i.K) if !ok { return ret, Error{InvalidLoad, pc} } ret.markInputRead(m.X+i.K, 1) m.A = uint32(val) case Ld | Mem | W: m.A = m.M[int(i.K)] case Ld | Len | W: m.A = uint32(len(in)) case Ldx | Imm | W: m.X = i.K case Ldx | Mem | W: m.X = m.M[int(i.K)] case Ldx | Len | W: m.X = uint32(len(in)) case Ldx | Msh | B: val, ok := load8(in, i.K) if !ok { return ret, Error{InvalidLoad, pc} } ret.markInputRead(i.K, 1) m.X = 4 * uint32(val&0xf) case St: m.M[int(i.K)] = m.A case Stx: m.M[int(i.K)] = m.X case Alu | Add | K: m.A += i.K case Alu | Add | X: m.A += m.X case Alu | Sub | K: m.A -= i.K case Alu | Sub | X: m.A -= m.X case Alu | Mul | K: m.A *= i.K case Alu | Mul | X: m.A *= m.X case Alu | Div | K: // K != 0 already checked by Compile. m.A /= i.K case Alu | Div | X: if m.X == 0 { return ret, Error{DivisionByZero, pc} } m.A /= m.X case Alu | Or | K: m.A |= i.K case Alu | Or | X: m.A |= m.X case Alu | And | K: m.A &= i.K case Alu | And | X: m.A &= m.X case Alu | Lsh | K: m.A <<= i.K case Alu | Lsh | X: m.A <<= m.X case Alu | Rsh | K: m.A >>= i.K case Alu | Rsh | X: m.A >>= m.X case Alu | Neg: m.A = uint32(-int32(m.A)) case Alu | Mod | K: // K != 0 already checked by Compile. 
m.A %= i.K case Alu | Mod | X: if m.X == 0 { return ret, Error{DivisionByZero, pc} } m.A %= m.X case Alu | Xor | K: m.A ^= i.K case Alu | Xor | X: m.A ^= m.X case Jmp | Ja: pc += int(i.K) case Jmp | Jeq | K: pc += conditionalJumpOffset(i, m.A == i.K) case Jmp | Jeq | X: pc += conditionalJumpOffset(i, m.A == m.X) case Jmp | Jgt | K: pc += conditionalJumpOffset(i, m.A > i.K) case Jmp | Jgt | X: pc += conditionalJumpOffset(i, m.A > m.X) case Jmp | Jge | K: pc += conditionalJumpOffset(i, m.A >= i.K) case Jmp | Jge | X: pc += conditionalJumpOffset(i, m.A >= m.X) case Jmp | Jset | K: pc += conditionalJumpOffset(i, (m.A&i.K) != 0) case Jmp | Jset | X: pc += conditionalJumpOffset(i, (m.A&m.X) != 0) case Ret | K: ret.ReturnValue = i.K return ret, nil case Ret | A: ret.ReturnValue = m.A return ret, nil case Misc | Tax: m.A = m.X case Misc | Txa: m.X = m.A default: return ret, Error{InvalidOpcode, pc} } } return ret, Error{InvalidEndOfProgram, pc} } golang-gvisor-gvisor-0.0~20240729.0/pkg/bpf/optimizer.go000066400000000000000000000572141465435605700225550ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package bpf import ( "fmt" "sort" ) const ( // maxConditionalJumpOffset is the maximum offset of a conditional // jump instruction. Conditional jump offsets are specified as an // unsigned 8-bit integer. maxConditionalJumpOffset = (1 << 8) - 1 // maxUnconditionalJumpOffset is the maximum offset of an unconditional // jump instruction. // Unconditional jumps are stored in an uint32, but here we limit it to // what would fit in a uint16. // BPF programs (once uploaded into the kernel) are limited to // `BPF_MAXINSNS`, which is 4096 in Linux as of this writing. // We need a value larger than `BPF_MAXINSNS` here in order to support // optimizing programs that are initially larger than `BPF_MAXINSNS` but // that can be optimized to fit within that limit. However, programs that // jump 2^32-1 instructions are probably not optimizable enough to fit // regardless. // This number is a middle ground that should be plenty given the type of // program we expect to optimize, while also not trying too hard to // optimize unoptimizable programs. maxUnconditionalJumpOffset = (1 << 16) - 1 ) // optimizerFunc is a function type that can optimize a BPF program. // It returns the updated set of instructions, along with whether any // modification was made. type optimizerFunc func(insns []Instruction) ([]Instruction, bool) // optimizeConditionalJumps looks for conditional jumps which go to an // unconditional jump that goes to a final target fewer than // `maxConditionalJumpOffset` instructions away. // These can safely be rewritten to not require the extra unconditional jump. // It returns the optimized set of instructions, along with whether any change // was made. func optimizeConditionalJumps(insns []Instruction) ([]Instruction, bool) { changed := false for pc, ins := range insns { if !ins.IsConditionalJump() { continue // Not a conditional jump instruction. 
} // Take care of "true" target: { jumpTrueOffset := pc + int(ins.JumpIfTrue) + 1 jumpTrueIns := insns[jumpTrueOffset] if jumpTrueIns.OpCode&instructionClassMask == Jmp && jumpTrueIns.OpCode&jmpMask == Ja { if finalJumpTrueOffset := int(ins.JumpIfTrue) + 1 + int(jumpTrueIns.K); finalJumpTrueOffset <= maxConditionalJumpOffset { // We can optimize the "true" target. ins.JumpIfTrue = uint8(finalJumpTrueOffset) changed = true } } } // Take care of "false" target: { jumpFalseOffset := pc + int(ins.JumpIfFalse) + 1 jumpFalseIns := insns[jumpFalseOffset] if jumpFalseIns.OpCode&instructionClassMask == Jmp && jumpFalseIns.OpCode&jmpMask == Ja { if finalJumpFalseOffset := int(ins.JumpIfFalse) + 1 + int(jumpFalseIns.K); finalJumpFalseOffset <= maxConditionalJumpOffset { // We can optimize the "false" target. ins.JumpIfFalse = uint8(finalJumpFalseOffset) changed = true } } } insns[pc] = ins } return insns, changed } // optimizeSameTargetConditionalJumps looks for conditional jumps where both // the "true" and "false" targets go to the same place, and rewrites them to // an unconditional jump to that place. // This can happen even for legitimate programs when resolving the target of // indirect jumps ends up at the same place. // It returns the optimized set of instructions, along with whether any change // was made. func optimizeSameTargetConditionalJumps(insns []Instruction) ([]Instruction, bool) { changed := false for pc, ins := range insns { if !ins.IsConditionalJump() { continue // Not a conditional jump instruction. } if ins.JumpIfTrue != ins.JumpIfFalse { continue // Not the same target. } insns[pc] = Jump(Jmp|Ja, uint32(ins.JumpIfTrue), 0, 0) changed = true } return insns, changed } // optimizeUnconditionalJumps looks for conditional jumps which go to another // unconditional jump. func optimizeUnconditionalJumps(insns []Instruction) ([]Instruction, bool) { changed := false for pc, ins := range insns { if !ins.IsUnconditionalJump() { continue // Not an unconditional jump instruction. } jumpOffset := pc + int(ins.K) + 1 jumpIns := insns[jumpOffset] if !jumpIns.IsUnconditionalJump() { // Not jumping to an unconditional jump. continue } finalJumpOffset := int(ins.K) + 1 + int(jumpIns.K) if finalJumpOffset > maxUnconditionalJumpOffset { // Final jump offset too large to fit in a single unconditional jump. continue } // We can optimize the final target. ins.K = uint32(finalJumpOffset) insns[pc] = ins changed = true } return insns, changed } // codeRemoval efficiently tracks indexes to remove from instructions. type codeRemoval struct { insns []Instruction toRemove []int } // MarkRemoved adds a new instruction index to be removed. func (cr *codeRemoval) MarkRemoved(index int) { if cr.toRemove == nil { cr.toRemove = make([]int, 0, len(cr.insns)) } cr.toRemove = append(cr.toRemove, index) } // Apply returns the set of instructions after removing marked indexes, // along with a boolean representing whether any instruction was removed. func (cr *codeRemoval) Apply() ([]Instruction, bool) { if len(cr.toRemove) == 0 { return cr.insns, false } sort.Ints(cr.toRemove) for i := len(cr.toRemove) - 1; i >= 0; i-- { pc := cr.toRemove[i] cr.insns = append(cr.insns[:pc], cr.insns[pc+1:]...) decrementJumps(cr.insns, pc) } return cr.insns, true } // decrementJumps decrements all jumps within `insns` that are jumping to an // instruction with index larger than `target`, the index of an // instruction that just got removed (i.e. 
`target` now points to the // instruction that was directly following the removed instruction). // Jumps that targeted `target` itself will not be affected, i.e. they will // point to the instruction that directly followed the removed instruction. // `insns` is modified in-place. func decrementJumps(insns []Instruction, target int) { for pc := 0; pc < target; pc++ { ins := insns[pc] if !ins.IsJump() { continue } if ins.IsUnconditionalJump() { // Unconditional jump, check K: if pc+int(ins.K)+1 > target { ins.K-- } } else { // Conditional jump, check true target: if pc+int(ins.JumpIfTrue)+1 > target { ins.JumpIfTrue-- } // ... And check false target: if pc+int(ins.JumpIfFalse)+1 > target { ins.JumpIfFalse-- } } insns[pc] = ins } } // removeZeroInstructionJumps removes unconditional jumps that jump zero // instructions forward. This may seem silly but it can happen due to other // optimizations in this file which decrement jump target indexes. func removeZeroInstructionJumps(insns []Instruction) ([]Instruction, bool) { removal := codeRemoval{insns: insns} for pc, ins := range insns { if !ins.IsUnconditionalJump() || ins.K != 0 { continue } removal.MarkRemoved(pc) } return removal.Apply() } // removeDeadCode removes instructions which are unreachable. // This can happen due to the other optimizations in this file, // e.g. optimizeConditionalJumps. // In addition, removing dead code means the program is shorter, // which in turn may make further jump optimizations possible. func removeDeadCode(insns []Instruction) ([]Instruction, bool) { if len(insns) == 0 { return insns, false } // Keep track of which lines are reachable from all instructions in the program. reachable := make([]bool, len(insns)) cursors := make([]int, 1, len(insns)) cursors[0] = 0 for len(cursors) > 0 { cursor := cursors[0] cursors = cursors[1:] if reachable[cursor] { continue } reachable[cursor] = true ins := insns[cursor] switch ins.OpCode & instructionClassMask { case Ret: // Return instructions are terminal, add no new cursor. case Jmp: // Add a new cursor wherever the jump can go. if ins.IsUnconditionalJump() { // Unconditional jump: cursors = append(cursors, cursor+int(ins.K)+1) } else { // Conditional jump: cursors = append(cursors, cursor+int(ins.JumpIfTrue)+1, cursor+int(ins.JumpIfFalse)+1) } default: // Other instructions simply flow forward. cursors = append(cursors, cursor+1) } } // Now remove unreachable code. removal := codeRemoval{insns: insns} for pc := range insns { if !reachable[pc] { removal.MarkRemoved(pc) } } return removal.Apply() } // optimizeJumpsToReturn replaces unconditional jumps that go to return // statements by a copy of that return statement. func optimizeJumpsToReturn(insns []Instruction) ([]Instruction, bool) { changed := false for pc, ins := range insns { if !ins.IsUnconditionalJump() { continue // Not an unconditional jump instruction. } targetIns := insns[pc+int(ins.K)+1] if targetIns.OpCode&instructionClassMask != Ret { continue // Not jumping to a return instruction. } insns[pc] = targetIns changed = true } return insns, changed } // removeRedundantLoads removes some redundant load instructions // when the value in register A is already the same value as what is // being loaded. func removeRedundantLoads(insns []Instruction) ([]Instruction, bool) { // reverseWalk maps instruction indexes I to the set of instruction indexes // that, after their execution, may result in the control flow jumping to I. 
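	// For instance, if instruction 3 is a conditional jump with a true-offset
	// of 1 and a false-offset of 2, then reverseWalk[5] and reverseWalk[6]
	// both contain 3 (a jump target is pc + offset + 1).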
reverseWalk := make([]map[int]struct{}, len(insns)) for pc := range insns { reverseWalk[pc] = make(map[int]struct{}) } for pc, ins := range insns { if ins.IsReturn() { continue // Return instructions are terminal. } if ins.IsJump() { for _, offset := range ins.JumpOffsets() { reverseWalk[pc+int(offset.Offset)+1][pc] = struct{}{} } continue } // All other instructions flow through. reverseWalk[pc+1][pc] = struct{}{} } // Now look for redundant load instructions. removal := codeRemoval{insns: insns} for pc, ins := range insns { if ins.OpCode&instructionClassMask != Ld { continue } // Walk backwards until either we've reached the beginning of the program, // or we've reached an operation which modifies register A. lastModifiedA := -1 beforePCs := reverseWalk[pc] walk: for { switch len(beforePCs) { case 0: // We've reached the beginning of the program without modifying A. break walk case 1: var beforePC int for bpc := range beforePCs { // Note: we know that this map only has one element. beforePC = bpc } if !insns[beforePC].ModifiesRegisterA() { beforePCs = reverseWalk[beforePC] continue walk } lastModifiedA = beforePC break walk default: // Multiple ways to get to `pc`. // For simplicity, we only support the single-branch case right now. break walk } } if lastModifiedA != -1 && insns[pc].Equal(insns[lastModifiedA]) { removal.MarkRemoved(pc) } } return removal.Apply() } // jumpRewriteOperation rewrites a jump target. type jumpRewriteOperation struct { pc int // Rewrite instruction at this offset. jumpType JumpType // Rewrite this type of jump. rewriteTo int // Rewrite the jump offset to this value. } // rewriteAllJumpsToReturn rewrites *all* jump instructions that go to // `fromPC` to go to `toPC` instead, if possible without converting jumps // from conditional to unconditional. `fromPC` and `toPC` must point to // identical return instructions. // It is all-or-nothing: either all jump instructions must be rewritable // (in which case they will all be rewritten, and this function will // return true), or no jump instructions will be rewritten, and this // function will return false. // This function also returns false in the vacuous case (i.e. there are // no jump instructions that go to `fromPC` in the first place). // This function is used in `optimizeJumpsToSmallestSetOfReturns`. // As a sanity check, it verifies that `fromPC` and `toPC` are functionally // identical return instruction, and panics otherwise. // `rewriteOps` is a buffer of jump rewrite operations meant to be // efficiently reusable across calls to this function. func rewriteAllJumpsToReturn(insns []Instruction, fromPC, toPC int, rewriteOps []jumpRewriteOperation) bool { fromIns, toIns := insns[fromPC], insns[toPC] if !fromIns.IsReturn() { panic(fmt.Sprintf("attempted to rewrite jumps from {pc=%d: %v} which is not a return instruction", fromPC, fromIns)) } if !toIns.IsReturn() { panic(fmt.Sprintf("attempted to rewrite jumps to {pc=%d: %v} which is not a return instruction", toPC, toIns)) } if !fromIns.Equal(toIns) { panic(fmt.Sprintf("attempted to rewrite jump target to a different return instruction: from={pc=%d: %v}, to={pc=%d: %v}", fromPC, fromIns, toPC, toIns)) } // Scan once, and populate `rewriteOps` as a list of rewrite operations // that should be run if the rewrite is feasible. 
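	// (Collecting the operations first and applying them only after the whole
	// scan succeeds is what makes the rewrite all-or-nothing: if any single
	// jump cannot encode the new offset, the scan below returns early without
	// having modified any instruction.)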
rewriteOps = rewriteOps[:0] for pc := 0; pc < fromPC; pc++ { ins := insns[pc] // Note: `neededOffset` may be negative, in case where we are rewriting // the jump target to go to an earlier instruction, and we are dealing // with the instructions that come after that. // This isn't necessarily a dealbreaker, we just need to make sure that // `ins` is either not a jump statement, or it is a jump statement that // doesn't go to `fromPC` (otherwise, only then would it need to jump // backwards). neededOffset := toPC - pc - 1 if ins.IsConditionalJump() { if jumpTrueTarget := pc + int(ins.JumpIfTrue) + 1; jumpTrueTarget == fromPC { if neededOffset < 0 || neededOffset > maxConditionalJumpOffset { return false } rewriteOps = append(rewriteOps, jumpRewriteOperation{ pc: pc, jumpType: JumpTrue, rewriteTo: neededOffset, }) } if jumpFalseTarget := pc + int(ins.JumpIfFalse) + 1; jumpFalseTarget == fromPC { if neededOffset < 0 || neededOffset > maxConditionalJumpOffset { return false } rewriteOps = append(rewriteOps, jumpRewriteOperation{ pc: pc, jumpType: JumpFalse, rewriteTo: neededOffset, }) } } else if ins.IsUnconditionalJump() { if jumpTarget := pc + int(ins.K) + 1; jumpTarget == fromPC { if neededOffset < 0 || neededOffset > maxUnconditionalJumpOffset { return false } rewriteOps = append(rewriteOps, jumpRewriteOperation{ pc: pc, jumpType: JumpDirect, rewriteTo: neededOffset, }) } } } if len(rewriteOps) == 0 { return false // No jump statements to rewrite. } // Rewrite is feasible, so do it. for _, op := range rewriteOps { ins := insns[op.pc] switch op.jumpType { case JumpTrue: ins.JumpIfTrue = uint8(op.rewriteTo) case JumpFalse: ins.JumpIfFalse = uint8(op.rewriteTo) case JumpDirect: ins.K = uint32(op.rewriteTo) } insns[op.pc] = ins } return true } // optimizeJumpsToSmallestSetOfReturns modifies jump targets that go to // return statements to go to an identical return statement (which still // fits within the maximum jump offsets), with the goal of minimizing the // total number of such return statements needed within the program overall. // The return statements that are skipped this way can then be removed by // the `removeDeadCode` optimizer, which should come earlier in the // optimizer list to ensure this optimizer only runs on instructions with // no dead code in them. // Within binary search trees, this allows deduplicating return statements // across multiple conditions and makes them much shorter. In turn, this // allows pruning these redundant return instructions as // they become dead, and therefore makes the code shorter. // (Essentially, we create a common "jump to return" doormat that everyone in // Office Space^W^W^W^W any instruction in range can jump to.) // // Conceptually: // // .. if (foo) goto A else goto B // A: return rejected // B: if (bar) goto C else goto D // C: return rejected // D: if (baz) goto E else goto F // E: return rejected // F: return accepted // ... // (Another set of rules in the program): // .. if (foo2) goto G else goto H // G: return accepted // H: if (bar2) goto I else goto J // I: return accepted // J: return rejected // // becomes (after the dead code removal optimizer runs as well): // // .. if (foo) goto J else goto B // B: if (bar) goto J else goto D // D: if (baz) goto J else goto I // ... // .. 
if (foo2) goto I else goto H // H: if (bar2) goto I else goto J // I: return accepted // J: return rejected func optimizeJumpsToSmallestSetOfReturns(insns []Instruction) ([]Instruction, bool) { // This is probably an NP-complete problem, so this approach does not // attempt to be optimal. Not being optimal is OK, we just end up with // a program that's slightly longer than necessary. // Rough sketch of the algorithm: // For each return instruction in the program: // Count the number of jump instructions that flow to it ("popularity"). // Also add `len(insns)` to the count if the instruction just before // the return instruction is neither a jump or a return instruction, // as the program can also flow through to it. This makes the return // instruction non-removable, but that in turn means that it is a very // good target for other jumps to jump to. // Build a map of lists of return instructions sorted by how many other // instructions flow to it, in ascending order. // The map key is the return value of the return instruction. // Iterate over this map (for each possible return value): // Iterate over the list of return instructions that return this value: // If the return instruction is unreachable, skip it. // If the return instruction is reachable by fallthrough (i.e. the // instruction just before it is not a jump nor a return), skip it. // Otherwise, see if it's possible to move all jump targets of this // instruction to any other return instruction in the list (starting // from the end of the sorted list, i.e. the "most popular" return // instruction that returns the same value), without needing to // convert conditional jumps into unconditional ones. // If it's possible, move all jump targets to it. // We may redundantly update multiple jump targets in one go which may be // optimized further in later passes (e.g. if unconditional jumps can be // removed and trim the program further, expanding the set of possible // rewrites beyond what we considered in this pass), but that's OK. // This pass will run again afterwards and eventually pick them up, and this // is still more efficient over running this (expensive) pass after each // single rewrite happens. changed := false // retPopularity maps offsets (pc) of return instructions to the number of // jump targets that point to them, +numInstructions if the program can also // fall through to it. numInstructions := len(insns) retPopularity := make([]int, numInstructions) // retCanBeFallenThrough maps offsets (pc) of return instructions to whether // or not they can be fallen through (i.e. not jumped to). retCanBeFallenThrough := make([]bool, numInstructions) // retValueToPC maps return values to a set of instructions that return // that value. // In BPF, the value of the K register is part of the return instruction // itself ("immediate" in assembly parlance), whereas the A register is // more of a regular register (previous operations may store/load/modify // it). So any return statement that returns the value of the A register // is functionally identical to any other, but any return statement that // returns the value of the K register must have the same value of K in // the return instruction for it to be functionally equivalent. // So, for return instructions that return K, we use the immediate value // of the K register (which is a uint32), and for return instructions // that return the A register, we use the stand-in value // "0xaaaaaaaaaaaaaaaa" (which doesn't fit in uint32, so it can't conflict // with an immediate value of K). 
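	// For example (illustrative values): two `Ret|K` instructions with
	// K=0x7fff0000 map to the same key and are candidates for merging with
	// each other, but not with a `Ret|K` instruction whose K is 0; all
	// `Ret|A` instructions share the single stand-in key, regardless of what
	// A happens to hold at runtime.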
const retRegisterA = 0xaaaaaaaaaaaaaaaa retValueToPC := make(map[uint64][]int) for pc, ins := range insns { if !ins.IsReturn() { continue // Not a conditional jump instruction. } var retValue uint64 switch ins.OpCode - Ret { case A: retValue = retRegisterA case K: retValue = uint64(ins.K) default: panic(fmt.Sprintf("unknown return value in instruction at pc=%d: %v", pc, ins)) } popularity := 0 canBeFallenThrough := false for pc2 := 0; pc2 < pc; pc2++ { ins2 := insns[pc2] switch ins2.OpCode & instructionClassMask { case Ret: // Do nothing. case Jmp: if ins2.IsConditionalJump() { // Note that the optimizeSameTargetConditionalJumps should make it // such that it's not possible for there to be a conditional jump // with identical "true" and "false" targets, so this should not // result in adding 2 to `popularity`. if jumpTrueTarget := pc2 + int(ins2.JumpIfTrue) + 1; jumpTrueTarget == pc { popularity++ } if jumpFalseTarget := pc2 + int(ins2.JumpIfFalse) + 1; jumpFalseTarget == pc { popularity++ } } else { if jumpTarget := pc2 + int(ins2.K) + 1; jumpTarget == pc { popularity++ } } default: if pc2 == pc-1 { // This return instruction can be fallen through to. popularity += numInstructions canBeFallenThrough = true } } } retValueToPC[retValue] = append(retValueToPC[retValue], pc) retPopularity[pc] = popularity retCanBeFallenThrough[pc] = canBeFallenThrough } rewriteOps := make([]jumpRewriteOperation, 0, len(insns)) for _, pcs := range retValueToPC { sort.Slice(pcs, func(i, j int) bool { // Sort `pcs` in order of ascending popularity. // If the popularity is the same, sort by PC. if retPopularity[pcs[i]] != retPopularity[pcs[j]] { return retPopularity[pcs[i]] < retPopularity[pcs[j]] } return pcs[i] < pcs[j] }) for i, unpopularPC := range pcs { if retCanBeFallenThrough[unpopularPC] { // Can't remove this return instruction, so no need to try // to check if we can rewrite other instructions that jump to it. continue } for j := len(pcs) - 1; j > i; j-- { popularPC := pcs[j] // Check if we can rewrite all instructions that jump to `unpopularPC` // to instead jump to `popularPC`. if rewriteAllJumpsToReturn(insns, unpopularPC, popularPC, rewriteOps) { changed = true break } } } } return insns, changed } // Optimize losslessly optimizes a BPF program using the given optimization // functions. // Optimizers should be ranked in order of importance, with the most // important first. // An optimizer will be exhausted before the next one is ever run. // Earlier optimizers are re-exhausted if later optimizers cause change. // The BPF instructions are assumed to have been checked for validity and // consistency. // The instructions in `insns` may be modified in-place. func optimize(insns []Instruction, funcs []optimizerFunc) []Instruction { for changed := true; changed; { for _, fn := range funcs { if insns, changed = fn(insns); changed { break } } } return insns } // Optimize losslessly optimizes a BPF program. // The BPF instructions are assumed to have been checked for validity and // consistency. // The instructions in `insns` may be modified in-place. 
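//
// A typical call site looks like the following (illustrative only; `builder`
// is a hypothetical *ProgramBuilder):
//
//	insns, err := builder.Instructions()
//	if err != nil {
//		return nil, err
//	}
//	insns = Optimize(insns)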
func Optimize(insns []Instruction) []Instruction { return optimize(insns, []optimizerFunc{ optimizeConditionalJumps, optimizeSameTargetConditionalJumps, optimizeUnconditionalJumps, optimizeJumpsToReturn, removeZeroInstructionJumps, removeDeadCode, removeRedundantLoads, optimizeJumpsToSmallestSetOfReturns, }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/bpf/program_builder.go000066400000000000000000000310051465435605700236760ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package bpf import ( "fmt" "math" "sort" "strings" "gvisor.dev/gvisor/pkg/abi/linux" ) const ( labelTarget = math.MaxUint8 labelDirectTarget = math.MaxUint32 ) // ProgramBuilder assists with building a BPF program with jump // labels that are resolved to their proper offsets. type ProgramBuilder struct { // Maps label names to label objects. labels map[string]*label // Maps label sources to the label name it references. jumpSourceToLabel map[source]string // unusableLabels are labels that are added before being referenced in a // jump. Any labels added this way cannot be referenced later in order to // avoid backwards references. unusableLabels map[string]bool // Array of BPF instructions that makes up the program. instructions []Instruction } // NewProgramBuilder creates a new ProgramBuilder instance. func NewProgramBuilder() *ProgramBuilder { return &ProgramBuilder{ labels: map[string]*label{}, jumpSourceToLabel: map[source]string{}, unusableLabels: map[string]bool{}, } } // label contains information to resolve a label to an offset. type label struct { // List of locations that reference the label in the program. sources []source // Program line when the label is located. target int } // JumpType is the type of jump target that an instruction may use. type JumpType int // Types of jump that an instruction may use. const ( JumpDirect JumpType = iota JumpTrue JumpFalse ) // source contains information about a single reference to a label. type source struct { // Program line where the label reference is present. line int // Which type of jump is referencing this label. jt JumpType } // AddStmt adds a new statement to the program. func (b *ProgramBuilder) AddStmt(code uint16, k uint32) { b.instructions = append(b.instructions, Stmt(code, k)) } // AddJump adds a new jump to the program. func (b *ProgramBuilder) AddJump(code uint16, k uint32, jt, jf uint8) { b.instructions = append(b.instructions, Jump(code, k, jt, jf)) } // AddDirectJumpLabel adds a new jump to the program where is labelled. func (b *ProgramBuilder) AddDirectJumpLabel(labelName string) { b.addLabelSource(labelName, JumpDirect) b.AddJump(Jmp|Ja, labelDirectTarget, 0, 0) } // AddJumpTrueLabel adds a new jump to the program where 'jump if true' is a label. 
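//
// For example (illustrative; "reject" is a hypothetical label name):
//
//	b.AddJumpTrueLabel(Jmp|Jeq|K, 42, "reject", 0)
//	// ... instructions executed when the comparison is false ...
//	b.AddLabel("reject")
//	b.AddStmt(Ret|K, 0)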
func (b *ProgramBuilder) AddJumpTrueLabel(code uint16, k uint32, jtLabel string, jf uint8) { b.addLabelSource(jtLabel, JumpTrue) b.AddJump(code, k, labelTarget, jf) } // AddJumpFalseLabel adds a new jump to the program where 'jump if false' is a label. func (b *ProgramBuilder) AddJumpFalseLabel(code uint16, k uint32, jt uint8, jfLabel string) { b.addLabelSource(jfLabel, JumpFalse) b.AddJump(code, k, jt, labelTarget) } // AddJumpLabels adds a new jump to the program where both jump targets are labels. func (b *ProgramBuilder) AddJumpLabels(code uint16, k uint32, jtLabel, jfLabel string) { b.addLabelSource(jtLabel, JumpTrue) b.addLabelSource(jfLabel, JumpFalse) b.AddJump(code, k, labelTarget, labelTarget) } // AddLabel sets the given label name at the current location. The next instruction is executed // when the any code jumps to this label. More than one label can be added to the same location. func (b *ProgramBuilder) AddLabel(name string) error { l, ok := b.labels[name] if !ok { if _, ok = b.unusableLabels[name]; ok { return fmt.Errorf("label %q already set", name) } // Mark the label as unusable. This is done to catch backwards jumps. b.unusableLabels[name] = true return nil } if l.target != -1 { return fmt.Errorf("label %q target already set: %v", name, l.target) } l.target = len(b.instructions) return nil } // Instructions returns an array of BPF instructions representing the program with all labels // resolved. Return error in case label resolution failed due to an invalid program. // // N.B. Partial results will be returned in the error case, which is useful for debugging. func (b *ProgramBuilder) Instructions() ([]Instruction, error) { if err := b.resolveLabels(); err != nil { return b.instructions, err } return b.instructions, nil } func (b *ProgramBuilder) addLabelSource(labelName string, t JumpType) { l, ok := b.labels[labelName] if !ok { l = &label{sources: make([]source, 0), target: -1} b.labels[labelName] = l } src := source{line: len(b.instructions), jt: t} l.sources = append(l.sources, src) if existingLabel, found := b.jumpSourceToLabel[src]; found { panic(fmt.Sprintf("label %q already present at source %v; one source may only have one label", existingLabel, src)) } b.jumpSourceToLabel[src] = labelName } func (b *ProgramBuilder) resolveLabels() error { for key, v := range b.labels { if _, ok := b.unusableLabels[key]; ok { return fmt.Errorf("backwards reference detected for label: %q", key) } if v.target == -1 { return fmt.Errorf("label target not set: %v", key) } if v.target >= len(b.instructions) { return fmt.Errorf("target is beyond end of ProgramBuilder") } for _, s := range v.sources { // Finds jump instruction that references the label. inst := b.instructions[s.line] if s.line >= v.target { return fmt.Errorf("cannot jump backwards") } // Calculates the jump offset from current line. offset := v.target - s.line - 1 // Sets offset into jump instruction. 
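			// The offset must fit in the field used by this jump type: direct
			// jumps store it in the 32-bit K field, while conditional jumps
			// store it in the 8-bit JumpIfTrue/JumpIfFalse fields. Each case
			// below rejects offsets that do not fit, and also checks that the
			// field still holds the label placeholder before overwriting it.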
switch s.jt { case JumpDirect: if offset > labelDirectTarget { return fmt.Errorf("jump offset to label '%v' is too large: %v, inst: %v, lineno: %v", key, offset, inst, s.line) } if inst.K != labelDirectTarget { return fmt.Errorf("jump target is not a label") } inst.K = uint32(offset) case JumpTrue: if offset > labelTarget { return fmt.Errorf("jump offset to label '%v' is too large: %v, inst: %v, lineno: %v", key, offset, inst, s.line) } if inst.JumpIfTrue != labelTarget { return fmt.Errorf("jump target is not a label") } inst.JumpIfTrue = uint8(offset) case JumpFalse: if offset > labelTarget { return fmt.Errorf("jump offset to label '%v' is too large: %v, inst: %v, lineno: %v", key, offset, inst, s.line) } if inst.JumpIfFalse != labelTarget { return fmt.Errorf("jump target is not a label") } inst.JumpIfFalse = uint8(offset) } b.instructions[s.line] = inst } } clear(b.labels) return nil } // ProgramFragment is a set of not-compiled instructions that were added to // a ProgramBuilder from the moment the `Record` function was called on it. type ProgramFragment struct { // b is a reference to the ProgramBuilder that this is a fragment from. b *ProgramBuilder // fromPC is the index of the first instruction that was recorded. // If no instruction was recorded, this index will be equal to `toPC`. fromPC int // toPC is the index *after* the last instruction that was recorded. // This means that right after recording, the program will not have // any instruction at index `toPC`. toPC int } // Record starts recording the instructions being added to the ProgramBuilder // until the returned function is called. // The returned function returns a ProgramFragment which represents the // recorded instructions. It may be called repeatedly. func (b *ProgramBuilder) Record() func() ProgramFragment { currentPC := len(b.instructions) return func() ProgramFragment { return ProgramFragment{ b: b, fromPC: currentPC, toPC: len(b.instructions), } } } // String returns a string version of the fragment. func (f ProgramFragment) String() string { return fmt.Sprintf("fromPC=%d toPC=%d", f.fromPC, f.toPC) } // FragmentOutcomes represents the set of outcomes that a ProgramFragment // execution may result into. type FragmentOutcomes struct { // MayFallThrough is true if executing the fragment may cause it to start // executing the program instruction that comes right after the last // instruction in this fragment (i.e. at `Fragment.toPC`). MayFallThrough bool // MayJumpToKnownOffsetBeyondFragment is true if executing the fragment may // jump to a fixed offset (or resolved label) that is not within the range // of the fragment itself, nor does it point to the instruction that would // come right after this fragment. // If the fragment jumps to an unresolved label, this will instead be // indicated in `MayJumpToUnresolvedLabels`. MayJumpToKnownOffsetBeyondFragment bool // MayJumpToUnresolvedLabels is the set of named labels that have not yet // been added to the program (the labels are not resolvable) but that the // fragment may jump to. MayJumpToUnresolvedLabels map[string]struct{} // MayReturnImmediate contains the set of possible immediate return values // that the fragment may return. MayReturnImmediate map[linux.BPFAction]struct{} // MayReturnRegisterA is true if the fragment may return the value of // register A. MayReturnRegisterA bool } // String returns a list of possible human-readable outcomes. 
func (o FragmentOutcomes) String() string { var s []string if o.MayJumpToKnownOffsetBeyondFragment { s = append(s, "may jump to known offset beyond fragment") } sortedLabels := make([]string, 0, len(o.MayJumpToUnresolvedLabels)) for lbl := range o.MayJumpToUnresolvedLabels { sortedLabels = append(sortedLabels, lbl) } sort.Strings(sortedLabels) for _, lbl := range sortedLabels { s = append(s, fmt.Sprintf("may jump to unresolved label %q", lbl)) } if o.MayFallThrough { s = append(s, "may fall through") } sortedReturnValues := make([]uint32, 0, len(o.MayReturnImmediate)) for v := range o.MayReturnImmediate { sortedReturnValues = append(sortedReturnValues, uint32(v)) } sort.Slice(sortedReturnValues, func(i, j int) bool { return sortedReturnValues[i] < sortedReturnValues[j] }) for _, v := range sortedReturnValues { s = append(s, fmt.Sprintf("may return '0x%x'", v)) } if o.MayReturnRegisterA { s = append(s, "may return register A") } if len(s) == 0 { return "no outcomes (this should never happen)" } return strings.Join(s, ", ") } // MayReturn returns whether the fragment may return for any reason. func (o FragmentOutcomes) MayReturn() bool { return len(o.MayReturnImmediate) > 0 || o.MayReturnRegisterA } // Outcomes returns the set of possible outcomes that executing this fragment // may result into. func (f ProgramFragment) Outcomes() FragmentOutcomes { if f.fromPC == f.toPC { // No instructions, this just falls through. return FragmentOutcomes{ MayFallThrough: true, } } outcomes := FragmentOutcomes{ MayJumpToUnresolvedLabels: make(map[string]struct{}), MayReturnImmediate: make(map[linux.BPFAction]struct{}), } for pc := f.fromPC; pc < f.toPC; pc++ { ins := f.b.instructions[pc] isLastInstruction := pc == f.toPC-1 switch ins.OpCode & instructionClassMask { case Ret: switch ins.OpCode { case Ret | K: outcomes.MayReturnImmediate[linux.BPFAction(ins.K)] = struct{}{} case Ret | A: outcomes.MayReturnRegisterA = true } case Jmp: for _, offset := range ins.JumpOffsets() { var foundLabel *label foundLabelName, found := f.b.jumpSourceToLabel[source{line: pc, jt: offset.Type}] if found { foundLabel = f.b.labels[foundLabelName] if foundLabel.target == -1 { outcomes.MayJumpToUnresolvedLabels[foundLabelName] = struct{}{} continue } } var target int if foundLabel != nil { target = foundLabel.target } else { target = pc + int(offset.Offset) + 1 } if target == f.toPC { outcomes.MayFallThrough = true } else if target > f.toPC { outcomes.MayJumpToKnownOffsetBeyondFragment = true } } default: if isLastInstruction { outcomes.MayFallThrough = true } } } return outcomes } // MayModifyRegisterA returns whether this fragment may modify register A. // A value of "true" does not necessarily mean that A *will* be modified, // as the control flow of this fragment may skip over instructions that // modify the A register. func (f ProgramFragment) MayModifyRegisterA() bool { for pc := f.fromPC; pc < f.toPC; pc++ { if f.b.instructions[pc].ModifiesRegisterA() { return true } } return false } golang-gvisor-gvisor-0.0~20240729.0/pkg/buffer/000077500000000000000000000000001465435605700206755ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/buffer/buffer.go000066400000000000000000000364601465435605700225060ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package buffer provides the implementation of a non-contiguous buffer that // is reference counted, pooled, and copy-on-write. It allows O(1) append, // and prepend operations. package buffer import ( "fmt" "io" "gvisor.dev/gvisor/pkg/tcpip/checksum" ) // Buffer is a non-linear buffer. // // +stateify savable type Buffer struct { data ViewList `state:".([]byte)"` size int64 } func (b *Buffer) removeView(v *View) { b.data.Remove(v) v.Release() } // MakeWithData creates a new Buffer initialized with given data. This function // should be used with caution to avoid unnecessary []byte allocations. When in // doubt use NewWithView to maximize chunk reuse. func MakeWithData(b []byte) Buffer { buf := Buffer{} if len(b) == 0 { return buf } v := NewViewWithData(b) buf.Append(v) return buf } // MakeWithView creates a new Buffer initialized with given view. This function // takes ownership of v. func MakeWithView(v *View) Buffer { if v == nil { return Buffer{} } b := Buffer{ size: int64(v.Size()), } if b.size == 0 { v.Release() return b } b.data.PushBack(v) return b } // Release frees all resources held by b. func (b *Buffer) Release() { for v := b.data.Front(); v != nil; v = b.data.Front() { b.removeView(v) } b.size = 0 } // TrimFront removes the first count bytes from the buffer. func (b *Buffer) TrimFront(count int64) { if count >= b.size { b.advanceRead(b.size) } else { b.advanceRead(count) } } // ReadAt implements io.ReaderAt.ReadAt. func (b *Buffer) ReadAt(p []byte, offset int64) (int, error) { var ( skipped int64 done int64 ) for v := b.data.Front(); v != nil && done < int64(len(p)); v = v.Next() { needToSkip := int(offset - skipped) if sz := v.Size(); sz <= needToSkip { skipped += int64(sz) continue } // Actually read data. n := copy(p[done:], v.AsSlice()[needToSkip:]) skipped += int64(needToSkip) done += int64(n) } if int(done) < len(p) || offset+done == b.size { return int(done), io.EOF } return int(done), nil } // advanceRead advances the Buffer's read index. // // Precondition: there must be sufficient bytes in the buffer. func (b *Buffer) advanceRead(count int64) { for v := b.data.Front(); v != nil && count > 0; { sz := int64(v.Size()) if sz > count { // There is still data for reading. v.TrimFront(int(count)) b.size -= count count = 0 return } // Consume the whole view. oldView := v v = v.Next() // Iterate. b.removeView(oldView) // Update counts. count -= sz b.size -= sz } if count > 0 { panic(fmt.Sprintf("advanceRead still has %d bytes remaining", count)) } } // Truncate truncates the Buffer to the given length. // // This will not grow the Buffer, only shrink it. If a length is passed that is // greater than the current size of the Buffer, then nothing will happen. // // Precondition: length must be >= 0. func (b *Buffer) Truncate(length int64) { if length < 0 { panic("negative length provided") } if length >= b.size { return // Nothing to do. } for v := b.data.Back(); v != nil && b.size > length; v = b.data.Back() { sz := int64(v.Size()) if after := b.size - sz; after < length { // Truncate the buffer locally. 
left := (length - after) v.write = v.read + int(left) b.size = length break } // Drop the buffer completely; see above. b.removeView(v) b.size -= sz } } // GrowTo grows the given Buffer to the number of bytes, which will be appended. // If zero is true, all these bytes will be zero. If zero is false, then this is // the caller's responsibility. // // Precondition: length must be >= 0. func (b *Buffer) GrowTo(length int64, zero bool) { if length < 0 { panic("negative length provided") } for b.size < length { v := b.data.Back() // Is there some space in the last buffer? if v.Full() { v = NewView(int(length - b.size)) b.data.PushBack(v) } // Write up to length bytes. sz := v.AvailableSize() if int64(sz) > length-b.size { sz = int(length - b.size) } // Zero the written section. if zero { clear(v.chunk.data[v.write : v.write+sz]) } // Advance the index. v.Grow(sz) b.size += int64(sz) } } // Prepend prepends the given data. Prepend takes ownership of src. func (b *Buffer) Prepend(src *View) error { if src == nil { return nil } if src.Size() == 0 { src.Release() return nil } // If the first buffer does not have room just prepend the view. v := b.data.Front() if v == nil || v.read == 0 { b.prependOwned(src) return nil } // If there's room at the front and we won't incur a copy by writing to this // view, fill in the extra room first. if !v.sharesChunk() { avail := v.read vStart := 0 srcStart := src.Size() - avail if avail > src.Size() { vStart = avail - src.Size() srcStart = 0 } // Save the write index and restore it after. old := v.write v.read = vStart n, err := v.WriteAt(src.AsSlice()[srcStart:], 0) if err != nil { return fmt.Errorf("could not write to view during append: %w", err) } b.size += int64(n) v.write = old src.write = srcStart // If there's no more to be written, then we're done. if src.Size() == 0 { src.Release() return nil } } // Otherwise, just prepend the view. b.prependOwned(src) return nil } // Append appends the given data. Append takes ownership of src. func (b *Buffer) Append(src *View) error { if src == nil { return nil } if src.Size() == 0 { src.Release() return nil } // If the last buffer is full, just append the view. v := b.data.Back() if v.Full() { b.appendOwned(src) return nil } // If a write won't incur a copy, then fill the back of the existing last // chunk. if !v.sharesChunk() { writeSz := src.Size() if src.Size() > v.AvailableSize() { writeSz = v.AvailableSize() } done, err := v.Write(src.AsSlice()[:writeSz]) if err != nil { return fmt.Errorf("could not write to view during append: %w", err) } src.TrimFront(done) b.size += int64(done) if src.Size() == 0 { src.Release() return nil } } // If there is still data left just append the src. b.appendOwned(src) return nil } func (b *Buffer) appendOwned(v *View) { b.data.PushBack(v) b.size += int64(v.Size()) } func (b *Buffer) prependOwned(v *View) { b.data.PushFront(v) b.size += int64(v.Size()) } // PullUp makes the specified range contiguous and returns the backing memory. func (b *Buffer) PullUp(offset, length int) (View, bool) { if length == 0 { return View{}, true } tgt := Range{begin: offset, end: offset + length} if tgt.Intersect(Range{end: int(b.size)}).Len() != length { return View{}, false } curr := Range{} v := b.data.Front() for ; v != nil; v = v.Next() { origLen := v.Size() curr.end = curr.begin + origLen if x := curr.Intersect(tgt); x.Len() == tgt.Len() { // buf covers the whole requested target range. sub := x.Offset(-curr.begin) // Don't increment the reference count of the underlying chunk. 
Views // returned by PullUp are explicitly unowned and read only new := View{ read: v.read + sub.begin, write: v.read + sub.end, chunk: v.chunk, } return new, true } else if x.Len() > 0 { // buf is pointing at the starting buffer we want to merge. break } curr.begin += origLen } // Calculate the total merged length. totLen := 0 for n := v; n != nil; n = n.Next() { totLen += n.Size() if curr.begin+totLen >= tgt.end { break } } // Merge the buffers. merged := NewViewSize(totLen) off := 0 for n := v; n != nil && off < totLen; { merged.WriteAt(n.AsSlice(), off) off += n.Size() // Remove buffers except for the first one, which will be reused. if n == v { n = n.Next() } else { old := n n = n.Next() b.removeView(old) } } // Make data the first buffer. b.data.InsertBefore(v, merged) b.removeView(v) r := tgt.Offset(-curr.begin) pulled := View{ read: r.begin, write: r.end, chunk: merged.chunk, } return pulled, true } // Flatten returns a flattened copy of this data. // // This method should not be used in any performance-sensitive paths. It may // allocate a fresh byte slice sufficiently large to contain all the data in // the buffer. This is principally for debugging. // // N.B. Tee data still belongs to this Buffer, as if there is a single buffer // present, then it will be returned directly. This should be used for // temporary use only, and a reference to the given slice should not be held. func (b *Buffer) Flatten() []byte { if v := b.data.Front(); v == nil { return nil // No data at all. } data := make([]byte, 0, b.size) // Need to flatten. for v := b.data.Front(); v != nil; v = v.Next() { // Copy to the allocated slice. data = append(data, v.AsSlice()...) } return data } // Size indicates the total amount of data available in this Buffer. func (b *Buffer) Size() int64 { return b.size } // AsViewList returns the ViewList backing b. Users may not save or modify the // ViewList returned. func (b *Buffer) AsViewList() ViewList { return b.data } // Clone creates a copy-on-write clone of b. The underlying chunks are shared // until they are written to. func (b *Buffer) Clone() Buffer { other := Buffer{ size: b.size, } for v := b.data.Front(); v != nil; v = v.Next() { newView := v.Clone() other.data.PushBack(newView) } return other } // DeepClone creates a deep clone of b, copying data such that no bytes are // shared with any other Buffers. func (b *Buffer) DeepClone() Buffer { newBuf := Buffer{} buf := b.Clone() reader := buf.AsBufferReader() newBuf.WriteFromReader(&reader, b.size) return newBuf } // Apply applies the given function across all valid data. func (b *Buffer) Apply(fn func(*View)) { for v := b.data.Front(); v != nil; v = v.Next() { d := v.Clone() fn(d) d.Release() } } // SubApply applies fn to a given range of data in b. Any part of the range // outside of b is ignored. func (b *Buffer) SubApply(offset, length int, fn func(*View)) { for v := b.data.Front(); length > 0 && v != nil; v = v.Next() { if offset >= v.Size() { offset -= v.Size() continue } d := v.Clone() if offset > 0 { d.TrimFront(offset) offset = 0 } if length < d.Size() { d.write = d.read + length } fn(d) length -= d.Size() d.Release() } } // Checksum calculates a checksum over the buffer's payload starting at offset. 
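//
// The checksum is accumulated view by view with a checksum.Checksumer, so the
// buffer does not need to be flattened or copied first. If offset is at or
// beyond the end of the buffer, the result is 0.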
func (b *Buffer) Checksum(offset int) uint16 { if offset >= int(b.size) { return 0 } var v *View for v = b.data.Front(); v != nil && offset >= v.Size(); v = v.Next() { offset -= v.Size() } var cs checksum.Checksumer cs.Add(v.AsSlice()[offset:]) for v = v.Next(); v != nil; v = v.Next() { cs.Add(v.AsSlice()) } return cs.Checksum() } // Merge merges the provided Buffer with this one. // // The other Buffer will be appended to v, and other will be empty after this // operation completes. func (b *Buffer) Merge(other *Buffer) { b.data.PushBackList(&other.data) other.data = ViewList{} // Adjust sizes. b.size += other.size other.size = 0 } // WriteFromReader writes to the buffer from an io.Reader. A maximum read size // of MaxChunkSize is enforced to prevent allocating views from the heap. func (b *Buffer) WriteFromReader(r io.Reader, count int64) (int64, error) { return b.WriteFromReaderAndLimitedReader(r, count, nil) } // WriteFromReaderAndLimitedReader is the same as WriteFromReader, but // optimized to avoid allocations if a LimitedReader is passed in. // // This function clobbers the values of lr. func (b *Buffer) WriteFromReaderAndLimitedReader(r io.Reader, count int64, lr *io.LimitedReader) (int64, error) { if lr == nil { lr = &io.LimitedReader{} } var done int64 for done < count { vsize := count - done if vsize > MaxChunkSize { vsize = MaxChunkSize } v := NewView(int(vsize)) lr.R = r lr.N = vsize n, err := io.Copy(v, lr) b.Append(v) done += n if err == io.EOF { break } if err != nil { return done, err } } return done, nil } // ReadToWriter reads from the buffer into an io.Writer. // // N.B. This does not consume the bytes read. TrimFront should // be called appropriately after this call in order to do so. func (b *Buffer) ReadToWriter(w io.Writer, count int64) (int64, error) { bytesLeft := int(count) for v := b.data.Front(); v != nil && bytesLeft > 0; v = v.Next() { view := v.Clone() if view.Size() > bytesLeft { view.CapLength(bytesLeft) } n, err := io.Copy(w, view) bytesLeft -= int(n) view.Release() if err != nil { return count - int64(bytesLeft), err } } return count - int64(bytesLeft), nil } // read implements the io.Reader interface. This method is used by BufferReader // to consume its underlying buffer. To perform io operations on buffers // directly, use ReadToWriter or WriteToReader. func (b *Buffer) read(p []byte) (int, error) { if len(p) == 0 { return 0, nil } if b.Size() == 0 { return 0, io.EOF } done := 0 v := b.data.Front() for v != nil && done < len(p) { n, err := v.Read(p[done:]) done += n next := v.Next() if v.Size() == 0 { b.removeView(v) } b.size -= int64(n) if err != nil && err != io.EOF { return done, err } v = next } return done, nil } // readByte implements the io.ByteReader interface. This method is used by // BufferReader to consume its underlying buffer. To perform io operations on // buffers directly, use ReadToWriter or WriteToReader. func (b *Buffer) readByte() (byte, error) { if b.Size() == 0 { return 0, io.EOF } v := b.data.Front() bt := v.AsSlice()[0] b.TrimFront(1) return bt, nil } // AsBufferReader returns the Buffer as a BufferReader capable of io methods. // The new BufferReader takes ownership of b. func (b *Buffer) AsBufferReader() BufferReader { return BufferReader{b} } // BufferReader implements io methods on Buffer. Users must call Close() // when finished with the buffer to free the underlying memory. type BufferReader struct { b *Buffer } // Read implements the io.Reader interface. 
func (br *BufferReader) Read(p []byte) (int, error) { return br.b.read(p) } // ReadByte implements the io.ByteReader interface. func (br *BufferReader) ReadByte() (byte, error) { return br.b.readByte() } // Close implements the io.Closer interface. func (br *BufferReader) Close() { br.b.Release() } // Len returns the number of bytes in the unread portion of the buffer. func (br *BufferReader) Len() int { return int(br.b.Size()) } // Range specifies a range of buffer. type Range struct { begin int end int } // Intersect returns the intersection of x and y. func (x Range) Intersect(y Range) Range { if x.begin < y.begin { x.begin = y.begin } if x.end > y.end { x.end = y.end } if x.begin >= x.end { return Range{} } return x } // Offset returns x offset by off. func (x Range) Offset(off int) Range { x.begin += off x.end += off return x } // Len returns the length of x. func (x Range) Len() int { l := x.end - x.begin if l < 0 { l = 0 } return l } golang-gvisor-gvisor-0.0~20240729.0/pkg/buffer/buffer_state.go000066400000000000000000000015271465435605700237020ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package buffer import ( "context" ) // saveData is invoked by stateify. func (b *Buffer) saveData() []byte { return b.Flatten() } // loadData is invoked by stateify. func (b *Buffer) loadData(_ context.Context, data []byte) { *b = MakeWithData(data) } golang-gvisor-gvisor-0.0~20240729.0/pkg/buffer/buffer_state_autogen.go000066400000000000000000000076551465435605700254340ustar00rootroot00000000000000// automatically generated by stateify. 
package buffer import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (b *Buffer) StateTypeName() string { return "pkg/buffer.Buffer" } func (b *Buffer) StateFields() []string { return []string{ "data", "size", } } func (b *Buffer) beforeSave() {} // +checklocksignore func (b *Buffer) StateSave(stateSinkObject state.Sink) { b.beforeSave() var dataValue []byte dataValue = b.saveData() stateSinkObject.SaveValue(0, dataValue) stateSinkObject.Save(1, &b.size) } func (b *Buffer) afterLoad(context.Context) {} // +checklocksignore func (b *Buffer) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(1, &b.size) stateSourceObject.LoadValue(0, new([]byte), func(y any) { b.loadData(ctx, y.([]byte)) }) } func (c *chunk) StateTypeName() string { return "pkg/buffer.chunk" } func (c *chunk) StateFields() []string { return []string{ "chunkRefs", "data", } } func (c *chunk) beforeSave() {} // +checklocksignore func (c *chunk) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.chunkRefs) stateSinkObject.Save(1, &c.data) } func (c *chunk) afterLoad(context.Context) {} // +checklocksignore func (c *chunk) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.chunkRefs) stateSourceObject.Load(1, &c.data) } func (r *chunkRefs) StateTypeName() string { return "pkg/buffer.chunkRefs" } func (r *chunkRefs) StateFields() []string { return []string{ "refCount", } } func (r *chunkRefs) beforeSave() {} // +checklocksignore func (r *chunkRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *chunkRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (v *View) StateTypeName() string { return "pkg/buffer.View" } func (v *View) StateFields() []string { return []string{ "read", "write", "chunk", } } func (v *View) beforeSave() {} // +checklocksignore func (v *View) StateSave(stateSinkObject state.Sink) { v.beforeSave() stateSinkObject.Save(0, &v.read) stateSinkObject.Save(1, &v.write) stateSinkObject.Save(2, &v.chunk) } func (v *View) afterLoad(context.Context) {} // +checklocksignore func (v *View) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &v.read) stateSourceObject.Load(1, &v.write) stateSourceObject.Load(2, &v.chunk) } func (l *ViewList) StateTypeName() string { return "pkg/buffer.ViewList" } func (l *ViewList) StateFields() []string { return []string{ "head", "tail", } } func (l *ViewList) beforeSave() {} // +checklocksignore func (l *ViewList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *ViewList) afterLoad(context.Context) {} // +checklocksignore func (l *ViewList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *ViewEntry) StateTypeName() string { return "pkg/buffer.ViewEntry" } func (e *ViewEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *ViewEntry) beforeSave() {} // +checklocksignore func (e *ViewEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *ViewEntry) afterLoad(context.Context) {} // +checklocksignore func (e *ViewEntry) StateLoad(ctx context.Context, 
stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func init() { state.Register((*Buffer)(nil)) state.Register((*chunk)(nil)) state.Register((*chunkRefs)(nil)) state.Register((*View)(nil)) state.Register((*ViewList)(nil)) state.Register((*ViewEntry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/buffer/buffer_unsafe_state_autogen.go000066400000000000000000000000701465435605700267550ustar00rootroot00000000000000// automatically generated by stateify. package buffer golang-gvisor-gvisor-0.0~20240729.0/pkg/buffer/chunk.go000066400000000000000000000053451465435605700223430ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package buffer import ( "fmt" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/sync" ) const ( // This is log2(baseChunkSize). This number is used to calculate which pool // to use for a payload size by right shifting the payload size by this // number and passing the result to MostSignificantOne64. baseChunkSizeLog2 = 6 // This is the size of the buffers in the first pool. Each subsequent pool // creates payloads 2^(pool index) times larger than the first pool's // payloads. baseChunkSize = 1 << baseChunkSizeLog2 // 64 // MaxChunkSize is largest payload size that we pool. Payloads larger than // this will be allocated from the heap and garbage collected as normal. MaxChunkSize = baseChunkSize << (numPools - 1) // 64k // The number of chunk pools we have for use. numPools = 11 ) // chunkPools is a collection of pools for payloads of different sizes. The // size of the payloads doubles in each successive pool. var chunkPools [numPools]sync.Pool func init() { for i := 0; i < numPools; i++ { chunkSize := baseChunkSize * (1 << i) chunkPools[i].New = func() any { return &chunk{ data: make([]byte, chunkSize), } } } } // Precondition: 0 <= size <= maxChunkSize func getChunkPool(size int) *sync.Pool { idx := 0 if size > baseChunkSize { idx = bits.MostSignificantOne64(uint64(size) >> baseChunkSizeLog2) if size > 1<<(idx+baseChunkSizeLog2) { idx++ } } if idx >= numPools { panic(fmt.Sprintf("pool for chunk size %d does not exist", size)) } return &chunkPools[idx] } // Chunk represents a slice of pooled memory. 
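//
// A chunk is reference counted (see chunkRefs). Views that share a chunk are
// copy-on-write: a View clones the chunk before writing to it whenever the
// reference count is greater than one, so other Views holding the same chunk
// never observe the write.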
// // +stateify savable type chunk struct { chunkRefs data []byte } func newChunk(size int) *chunk { var c *chunk if size > MaxChunkSize { c = &chunk{ data: make([]byte, size), } } else { pool := getChunkPool(size) c = pool.Get().(*chunk) clear(c.data) } c.InitRefs() return c } func (c *chunk) destroy() { if len(c.data) > MaxChunkSize { c.data = nil return } pool := getChunkPool(len(c.data)) pool.Put(c) } func (c *chunk) DecRef() { c.chunkRefs.DecRef(c.destroy) } func (c *chunk) Clone() *chunk { cpy := newChunk(len(c.data)) copy(cpy.data, c.data) return cpy } golang-gvisor-gvisor-0.0~20240729.0/pkg/buffer/chunk_refs.go000066400000000000000000000100671465435605700233570ustar00rootroot00000000000000package buffer import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const chunkenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var chunkobj *chunk // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type chunkRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *chunkRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *chunkRefs) RefType() string { return fmt.Sprintf("%T", chunkobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *chunkRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *chunkRefs) LogRefs() bool { return chunkenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *chunkRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *chunkRefs) IncRef() { v := r.refCount.Add(1) if chunkenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. 
This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *chunkRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if chunkenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *chunkRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if chunkenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *chunkRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/buffer/view.go000066400000000000000000000215001465435605700221740ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package buffer import ( "fmt" "io" "gvisor.dev/gvisor/pkg/sync" ) // ReadSize is the default amount that a View's size is increased by when an // io.Reader has more data than a View can hold during calls to ReadFrom. const ReadSize = 512 var viewPool = sync.Pool{ New: func() any { return &View{} }, } // View is a window into a shared chunk. Views are held by Buffers in // viewLists to represent contiguous memory. // // A View must be created with NewView, NewViewWithData, or Clone. Owners are // responsible for maintaining ownership over their views. When Views need to be // shared or copied, the owner should create a new View with Clone. Clone must // only ever be called on a owned View, not a borrowed one. // // Users are responsible for calling Release when finished with their View so // that its resources can be returned to the pool. // // Users must not write directly to slices returned by AsSlice. Instead, they // must use Write/WriteAt/CopyIn to modify the underlying View. This preserves // the safety guarantees of copy-on-write. // // +stateify savable type View struct { ViewEntry `state:"nosave"` read int write int chunk *chunk } // NewView creates a new view with capacity at least as big as cap. It is // analogous to make([]byte, 0, cap). func NewView(cap int) *View { c := newChunk(cap) v := viewPool.Get().(*View) *v = View{chunk: c} return v } // NewViewSize creates a new view with capacity at least as big as size and // length that is exactly size. It is analogous to make([]byte, size). 
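//
// For example (illustrative):
//
//	v := NewViewSize(16) // len(v.AsSlice()) == 16; capacity may be larger.
//	defer v.Release()    // Returns the backing chunk to its pool once unreferenced.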
func NewViewSize(size int) *View { v := NewView(size) v.Grow(size) return v } // NewViewWithData creates a new view and initializes it with data. This // function should be used with caution to avoid unnecessary []byte allocations. // When in doubt use NewWithView to maximize chunk reuse in production // environments. func NewViewWithData(data []byte) *View { c := newChunk(len(data)) v := viewPool.Get().(*View) *v = View{chunk: c} v.Write(data) return v } // Clone creates a shallow clone of v where the underlying chunk is shared. // // The caller must own the View to call Clone. It is not safe to call Clone // on a borrowed or shared View because it can race with other View methods. func (v *View) Clone() *View { if v == nil { panic("cannot clone a nil view") } v.chunk.IncRef() newV := viewPool.Get().(*View) newV.chunk = v.chunk newV.read = v.read newV.write = v.write return newV } // Release releases the chunk held by v and returns v to the pool. func (v *View) Release() { if v == nil { panic("cannot release a nil view") } v.chunk.DecRef() *v = View{} viewPool.Put(v) } // Reset sets the view's read and write indices back to zero. func (v *View) Reset() { if v == nil { panic("cannot reset a nil view") } v.read = 0 v.write = 0 } func (v *View) sharesChunk() bool { return v.chunk.refCount.Load() > 1 } // Full indicates the chunk is full. // // This indicates there is no capacity left to write. func (v *View) Full() bool { return v == nil || v.write == len(v.chunk.data) } // Capacity returns the total size of this view's chunk. func (v *View) Capacity() int { if v == nil { return 0 } return len(v.chunk.data) } // Size returns the size of data written to the view. func (v *View) Size() int { if v == nil { return 0 } return v.write - v.read } // TrimFront advances the read index by the given amount. func (v *View) TrimFront(n int) { if v.read+n > v.write { panic("cannot trim past the end of a view") } v.read += n } // AsSlice returns a slice of the data written to this view. func (v *View) AsSlice() []byte { if v.Size() == 0 { return nil } return v.chunk.data[v.read:v.write] } // ToSlice returns an owned copy of the data in this view. func (v *View) ToSlice() []byte { if v.Size() == 0 { return nil } s := make([]byte, v.Size()) copy(s, v.AsSlice()) return s } // AvailableSize returns the number of bytes available for writing. func (v *View) AvailableSize() int { if v == nil { return 0 } return len(v.chunk.data) - v.write } // Read reads v's data into p. // // Implements the io.Reader interface. func (v *View) Read(p []byte) (int, error) { if len(p) == 0 { return 0, nil } if v.Size() == 0 { return 0, io.EOF } n := copy(p, v.AsSlice()) v.TrimFront(n) return n, nil } // ReadByte implements the io.ByteReader interface. func (v *View) ReadByte() (byte, error) { if v.Size() == 0 { return 0, io.EOF } b := v.AsSlice()[0] v.read++ return b, nil } // WriteTo writes data to w until the view is empty or an error occurs. The // return value n is the number of bytes written. // // WriteTo implements the io.WriterTo interface. func (v *View) WriteTo(w io.Writer) (n int64, err error) { if v.Size() > 0 { sz := v.Size() m, e := w.Write(v.AsSlice()) v.TrimFront(m) n = int64(m) if e != nil { return n, e } if m != sz { return n, io.ErrShortWrite } } return n, nil } // ReadAt reads data to the p starting at offset. // // Implements the io.ReaderAt interface. 
func (v *View) ReadAt(p []byte, off int) (int, error) { if off < 0 || off > v.Size() { return 0, fmt.Errorf("ReadAt(): offset out of bounds: want 0 < off < %d, got off=%d", v.Size(), off) } n := copy(p, v.AsSlice()[off:]) return n, nil } // Write writes data to the view's chunk starting at the v.write index. If the // view's chunk has a reference count greater than 1, the chunk is copied first // and then written to. // // Implements the io.Writer interface. func (v *View) Write(p []byte) (int, error) { if v == nil { panic("cannot write to a nil view") } if v.AvailableSize() < len(p) { v.growCap(len(p) - v.AvailableSize()) } else if v.sharesChunk() { defer v.chunk.DecRef() v.chunk = v.chunk.Clone() } n := copy(v.chunk.data[v.write:], p) v.write += n if n < len(p) { return n, io.ErrShortWrite } return n, nil } // ReadFrom reads data from r until EOF and appends it to the buffer, growing // the buffer as needed. The return value n is the number of bytes read. Any // error except io.EOF encountered during the read is also returned. // // ReadFrom implements the io.ReaderFrom interface. func (v *View) ReadFrom(r io.Reader) (n int64, err error) { if v == nil { panic("cannot write to a nil view") } if v.sharesChunk() { defer v.chunk.DecRef() v.chunk = v.chunk.Clone() } for { // Check for EOF to avoid an unnnecesary allocation. if _, e := r.Read(nil); e == io.EOF { return n, nil } if v.AvailableSize() == 0 { v.growCap(ReadSize) } m, e := r.Read(v.availableSlice()) v.write += m n += int64(m) if e == io.EOF { return n, nil } if e != nil { return n, e } } } // WriteAt writes data to the views's chunk starting at start. If the // view's chunk has a reference count greater than 1, the chunk is copied first // and then written to. // // Implements the io.WriterAt interface. func (v *View) WriteAt(p []byte, off int) (int, error) { if v == nil { panic("cannot write to a nil view") } if off < 0 || off > v.Size() { return 0, fmt.Errorf("write offset out of bounds: want 0 < off < %d, got off=%d", v.Size(), off) } if v.sharesChunk() { defer v.chunk.DecRef() v.chunk = v.chunk.Clone() } n := copy(v.AsSlice()[off:], p) if n < len(p) { return n, io.ErrShortWrite } return n, nil } // Grow increases the size of the view. If the new size is greater than the // view's current capacity, Grow will reallocate the view with an increased // capacity. func (v *View) Grow(n int) { if v == nil { panic("cannot grow a nil view") } if v.write+n > v.Capacity() { v.growCap(n) } v.write += n } // growCap increases the capacity of the view by at least n. func (v *View) growCap(n int) { if v == nil { panic("cannot grow a nil view") } defer v.chunk.DecRef() old := v.AsSlice() v.chunk = newChunk(v.Capacity() + n) copy(v.chunk.data, old) v.read = 0 v.write = len(old) } // CapLength caps the length of the view's read slice to n. If n > v.Size(), // the function is a no-op. func (v *View) CapLength(n int) { if v == nil { panic("cannot resize a nil view") } if n < 0 { panic("n must be >= 0") } if n > v.Size() { n = v.Size() } v.write = v.read + n } func (v *View) availableSlice() []byte { if v.sharesChunk() { defer v.chunk.DecRef() c := v.chunk.Clone() v.chunk = c } return v.chunk.data[v.write:] } golang-gvisor-gvisor-0.0~20240729.0/pkg/buffer/view_list.go000066400000000000000000000116221465435605700232330ustar00rootroot00000000000000package buffer // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. 
An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type ViewElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (ViewElementMapper) linkerFor(elem *View) *View { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type ViewList struct { head *View tail *View } // Reset resets list l to the empty state. func (l *ViewList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *ViewList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *ViewList) Front() *View { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *ViewList) Back() *View { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *ViewList) Len() (count int) { for e := l.Front(); e != nil; e = (ViewElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *ViewList) PushFront(e *View) { linker := ViewElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { ViewElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *ViewList) PushFrontList(m *ViewList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { ViewElementMapper{}.linkerFor(l.head).SetPrev(m.tail) ViewElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *ViewList) PushBack(e *View) { linker := ViewElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { ViewElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *ViewList) PushBackList(m *ViewList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { ViewElementMapper{}.linkerFor(l.tail).SetNext(m.head) ViewElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *ViewList) InsertAfter(b, e *View) { bLinker := ViewElementMapper{}.linkerFor(b) eLinker := ViewElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { ViewElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *ViewList) InsertBefore(a, e *View) { aLinker := ViewElementMapper{}.linkerFor(a) eLinker := ViewElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { ViewElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. 
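// Illustrative sketch of using the intrusive list (added for exposition; not
// part of the original source). Because View embeds ViewEntry, a View can be
// linked into a ViewList directly, with no per-element allocation. v1 and v2
// are placeholders for owned *View values.
//
//	var l buffer.ViewList // the zero value is an empty, ready-to-use list
//	l.PushBack(v1)
//	l.PushBack(v2)
//	for e := l.Front(); e != nil; e = e.Next() {
//		_ = e.Size() // e is a *View
//	}
//	l.Remove(v1)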
// //go:nosplit func (l *ViewList) Remove(e *View) { linker := ViewElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { ViewElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { ViewElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type ViewEntry struct { next *View prev *View } // Next returns the entry that follows e in the list. // //go:nosplit func (e *ViewEntry) Next() *View { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *ViewEntry) Prev() *View { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *ViewEntry) SetNext(elem *View) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *ViewEntry) SetPrev(elem *View) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/buffer/view_unsafe.go000066400000000000000000000015031465435605700235360ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package buffer import ( "reflect" "unsafe" ) // BasePtr returns a pointer to the view's chunk. func (v *View) BasePtr() *byte { hdr := (*reflect.SliceHeader)(unsafe.Pointer(&v.chunk.data)) return (*byte)(unsafe.Pointer(hdr.Data)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/cleanup/000077500000000000000000000000001465435605700210535ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/cleanup/cleanup.go000066400000000000000000000035161465435605700230360ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package cleanup provides utilities to clean "stuff" on defers. package cleanup // Cleanup allows defers to be aborted when cleanup needs to happen // conditionally. Usage: // // cu := cleanup.Make(func() { f.Close() }) // defer cu.Clean() // failure before release is called will close the file. // ... // cu.Add(func() { f2.Close() }) // Adds another cleanup function // ... // cu.Release() // on success, aborts closing the file. // return f type Cleanup struct { cleaners []func() } // Make creates a new Cleanup object. 
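// Illustrative sketch of transferring cleanup ownership (added for
// exposition; not part of the original source). Release aborts the deferred
// Clean and hands back a function that runs the registered cleanups later,
// which is useful when responsibility for the resources moves to another
// object. f, g and obj are placeholders for caller-owned resources.
//
//	cu := cleanup.Make(func() { f.Close() })
//	defer cu.Clean()             // runs unless Release is called below
//	cu.Add(func() { g.Close() })
//	closeAll := cu.Release()     // the deferred Clean is now a no-op
//	obj.cleanup = closeAll       // the new owner calls closeAll() when done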
func Make(f func()) Cleanup { return Cleanup{cleaners: []func(){f}} } // Add adds a new function to be called on Clean(). func (c *Cleanup) Add(f func()) { c.cleaners = append(c.cleaners, f) } // Clean calls all cleanup functions in reverse order. func (c *Cleanup) Clean() { clean(c.cleaners) c.cleaners = nil } // Release releases the cleanup from its duties, i.e. cleanup functions are not // called after this point. Returns a function that calls all registered // functions in case the caller has use for them. func (c *Cleanup) Release() func() { old := c.cleaners c.cleaners = nil return func() { clean(old) } } func clean(cleaners []func()) { for i := len(cleaners) - 1; i >= 0; i-- { cleaners[i]() } } golang-gvisor-gvisor-0.0~20240729.0/pkg/cleanup/cleanup_state_autogen.go000066400000000000000000000000711465435605700257510ustar00rootroot00000000000000// automatically generated by stateify. package cleanup golang-gvisor-gvisor-0.0~20240729.0/pkg/compressio/000077500000000000000000000000001465435605700216075ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/compressio/compressio.go000066400000000000000000000510441465435605700243250ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package compressio provides parallel compression and decompression, as well // as optional SHA-256 hashing. It also provides another storage variant // (nocompressio) that does not compress data but tracks its integrity. // // The stream format is defined as follows. // // /------------------------------------------------------\ // | chunk size (4-bytes) | // +------------------------------------------------------+ // | (optional) hash (32-bytes) | // +------------------------------------------------------+ // | compressed data size (4-bytes) | // +------------------------------------------------------+ // | compressed data | // +------------------------------------------------------+ // | (optional) hash (32-bytes) | // +------------------------------------------------------+ // | compressed data size (4-bytes) | // +------------------------------------------------------+ // | ...... | // \------------------------------------------------------/ // // where each subsequent hash is calculated from the following items in order // // compressed data // compressed data size // previous hash // // so the stream integrity cannot be compromised by switching and mixing // compressed chunks. package compressio import ( "bytes" "compress/flate" "crypto/hmac" "crypto/sha256" "encoding/binary" "errors" "hash" "io" "runtime" "gvisor.dev/gvisor/pkg/sync" ) var bufPool = sync.Pool{ New: func() any { return bytes.NewBuffer(nil) }, } var chunkPool = sync.Pool{ New: func() any { return new(chunk) }, } // chunk is a unit of work. type chunk struct { // compressed is compressed data. // // This will always be returned to the bufPool directly when work has // finished (in schedule) and therefore must be allocated. 
compressed *bytes.Buffer // uncompressed is the uncompressed data. // // This is not returned to the bufPool automatically, since it may // correspond to a inline slice (provided directly to Read or Write). uncompressed *bytes.Buffer // The current hash object. Only used in compress mode. h hash.Hash // The hash from previous chunks. Only used in uncompress mode. lastSum []byte // The expected hash after current chunk. Only used in uncompress mode. sum []byte } // newChunk allocates a new chunk object (or pulls one from the pool). Buffers // will be allocated if nil is provided for compressed or uncompressed. func newChunk(lastSum []byte, sum []byte, compressed *bytes.Buffer, uncompressed *bytes.Buffer) *chunk { c := chunkPool.Get().(*chunk) c.lastSum = lastSum c.sum = sum if compressed != nil { c.compressed = compressed } else { c.compressed = bufPool.Get().(*bytes.Buffer) } if uncompressed != nil { c.uncompressed = uncompressed } else { c.uncompressed = bufPool.Get().(*bytes.Buffer) } return c } // result is the result of some work; it includes the original chunk. type result struct { *chunk err error } // worker is a compression/decompression worker. // // The associated worker goroutine reads in uncompressed buffers from input and // writes compressed buffers to its output. Alternatively, the worker reads // compressed buffers from input and writes uncompressed buffers to its output. // // The goroutine will exit when input is closed, and the goroutine will close // output. type worker struct { hashPool *hashPool input chan *chunk output chan result // scratch is a temporary buffer used for marshalling. This is declared // unfront here to avoid reallocation. scratch [4]byte } // work is the main work routine; see worker. func (w *worker) work(compress bool, level int) { defer close(w.output) var h hash.Hash for c := range w.input { if h == nil && w.hashPool != nil { h = w.hashPool.getHash() } if compress { mw := io.Writer(c.compressed) if h != nil { mw = io.MultiWriter(mw, h) } // Encode this slice. fw, err := flate.NewWriter(mw, level) if err != nil { w.output <- result{c, err} continue } // Encode the input. if _, err := io.CopyN(fw, c.uncompressed, int64(c.uncompressed.Len())); err != nil { w.output <- result{c, err} continue } if err := fw.Close(); err != nil { w.output <- result{c, err} continue } // Write the hash, if enabled. if h != nil { binary.BigEndian.PutUint32(w.scratch[:], uint32(c.compressed.Len())) h.Write(w.scratch[:4]) c.h = h h = nil } } else { // Check the hash of the compressed contents. if h != nil { h.Write(c.compressed.Bytes()) binary.BigEndian.PutUint32(w.scratch[:], uint32(c.compressed.Len())) h.Write(w.scratch[:4]) io.CopyN(h, bytes.NewReader(c.lastSum), int64(len(c.lastSum))) sum := h.Sum(nil) h.Reset() if !hmac.Equal(c.sum, sum) { w.output <- result{c, ErrHashMismatch} continue } } // Decode this slice. fr := flate.NewReader(c.compressed) // Decode the input. if _, err := io.Copy(c.uncompressed, fr); err != nil { w.output <- result{c, err} continue } } // Send the output. w.output <- result{c, nil} } } type hashPool struct { // mu protects the hash list. mu sync.Mutex // key is the key used to create hash objects. key []byte // hashes is the hash object free list. Note that this cannot be // globally shared across readers or writers, as it is key-specific. hashes []hash.Hash } // getHash gets a hash object for the pool. It should only be called when the // pool key is non-nil. 
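// Illustrative sketch of the per-chunk hash chain described in the package
// comment (added for exposition; not part of the original source). Each
// chunk's hash is an HMAC-SHA256, keyed with the stream key, over the
// compressed bytes, then the 4-byte big-endian compressed length, then the
// previous chunk's hash (for the first chunk, the hash of the chunk-size
// header), so chunks cannot be reordered or substituted without detection. A
// standalone recomputation might look like:
//
//	h := hmac.New(sha256.New, key)
//	h.Write(compressedData)
//	var lenBuf [4]byte
//	binary.BigEndian.PutUint32(lenBuf[:], uint32(len(compressedData)))
//	h.Write(lenBuf[:])
//	h.Write(prevSum) // hash of the previous chunk (or of the stream header for the first chunk)
//	sum := h.Sum(nil)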
func (p *hashPool) getHash() hash.Hash { p.mu.Lock() defer p.mu.Unlock() if len(p.hashes) == 0 { return hmac.New(sha256.New, p.key) } h := p.hashes[len(p.hashes)-1] p.hashes = p.hashes[:len(p.hashes)-1] return h } func (p *hashPool) putHash(h hash.Hash) { h.Reset() p.mu.Lock() defer p.mu.Unlock() p.hashes = append(p.hashes, h) } // pool is common functionality for reader/writers. type pool struct { // workers are the compression/decompression workers. workers []worker // chunkSize is the chunk size. This is the first four bytes in the // stream and is shared across both the reader and writer. chunkSize uint32 // mu protects below; it is generally the responsibility of users to // acquire this mutex before calling any methods on the pool. mu sync.Mutex // nextInput is the next worker for input (scheduling). nextInput int // nextOutput is the next worker for output (result). nextOutput int // buf is the current active buffer; the exact semantics of this buffer // depending on whether this is a reader or a writer. buf *bytes.Buffer // lasSum records the hash of the last chunk processed. lastSum []byte // hashPool is the hash object pool. It cannot be embedded into pool // itself as worker refers to it and that would stop pool from being // GCed. hashPool *hashPool } // init initializes the worker pool. // // This should only be called once. func (p *pool) init(key []byte, workers int, compress bool, level int) { if key != nil { p.hashPool = &hashPool{key: key} } p.workers = make([]worker, workers) for i := 0; i < len(p.workers); i++ { p.workers[i] = worker{ hashPool: p.hashPool, input: make(chan *chunk, 1), output: make(chan result, 1), } go p.workers[i].work(compress, level) // S/R-SAFE: In save path only. } runtime.SetFinalizer(p, (*pool).stop) } // stop stops all workers. func (p *pool) stop() { for i := 0; i < len(p.workers); i++ { close(p.workers[i].input) } p.workers = nil p.hashPool = nil } // handleResult calls the callback. func handleResult(r result, callback func(*chunk) error) error { defer func() { r.chunk.compressed.Reset() bufPool.Put(r.chunk.compressed) chunkPool.Put(r.chunk) }() if r.err != nil { return r.err } return callback(r.chunk) } // schedule schedules the given buffers. // // If c is non-nil, then it will return as soon as the chunk is scheduled. If c // is nil, then it will return only when no more work is left to do. // // If no callback function is provided, then the output channel will be // ignored. You must be sure that the input is schedulable in this case. func (p *pool) schedule(c *chunk, callback func(*chunk) error) error { for { var ( inputChan chan *chunk outputChan chan result ) if c != nil && len(p.workers) != 0 { inputChan = p.workers[(p.nextInput+1)%len(p.workers)].input } if callback != nil && p.nextOutput != p.nextInput && len(p.workers) != 0 { outputChan = p.workers[(p.nextOutput+1)%len(p.workers)].output } if inputChan == nil && outputChan == nil { return nil } select { case inputChan <- c: p.nextInput++ return nil case r := <-outputChan: p.nextOutput++ if err := handleResult(r, callback); err != nil { return err } } } } // Reader is a compressed reader. type Reader struct { pool // in is the source. in io.Reader // scratch is a temporary buffer used for marshalling. This is declared // unfront here to avoid reallocation. scratch [4]byte } var _ io.Reader = (*Reader)(nil) // NewReader returns a new compressed reader. 
If key is non-nil, the data stream // is assumed to contain expected hash values, which will be compared against // hash values computed from the compressed bytes. See package comments for // details. func NewReader(in io.Reader, key []byte) (*Reader, error) { r := &Reader{ in: in, } // Use double buffering for read. r.init(key, 2*runtime.GOMAXPROCS(0), false, 0) if _, err := io.ReadFull(in, r.scratch[:4]); err != nil { return nil, err } r.chunkSize = binary.BigEndian.Uint32(r.scratch[:4]) if r.hashPool != nil { h := r.hashPool.getHash() binary.BigEndian.PutUint32(r.scratch[:], r.chunkSize) h.Write(r.scratch[:4]) r.lastSum = h.Sum(nil) r.hashPool.putHash(h) sum := make([]byte, len(r.lastSum)) if _, err := io.ReadFull(r.in, sum); err != nil { return nil, err } if !hmac.Equal(r.lastSum, sum) { return nil, ErrHashMismatch } } return r, nil } // errNewBuffer is returned when a new buffer is completed. var errNewBuffer = errors.New("buffer ready") // ErrHashMismatch is returned if the hash does not match. var ErrHashMismatch = errors.New("hash mismatch") // Read implements io.Reader.Read. func (r *Reader) Read(p []byte) (int, error) { r.mu.Lock() defer r.mu.Unlock() // Total bytes completed; this is declared up front because it must be // adjustable by the callback below. done := 0 // Total bytes pending in the asynchronous workers for buffers. This is // used to process the proper regions of the input as inline buffers. var ( pendingPre = r.nextInput - r.nextOutput pendingInline = 0 ) // Define our callback for completed work. callback := func(c *chunk) error { // Check for an inline buffer. if pendingPre == 0 && pendingInline > 0 { pendingInline-- done += c.uncompressed.Len() return nil } // Copy the resulting buffer to our intermediate one, and // return errNewBuffer to ensure that we aren't called a second // time. This error code is handled specially below. // // c.buf will be freed and return to the pool when it is done. if pendingPre > 0 { pendingPre-- } r.buf = c.uncompressed return errNewBuffer } for done < len(p) { // Do we have buffered data available? if r.buf != nil { n, err := r.buf.Read(p[done:]) done += n if err == io.EOF { // This is the uncompressed buffer, it can be // returned to the pool at this point. r.buf.Reset() bufPool.Put(r.buf) r.buf = nil } else if err != nil { // Should never happen. defer r.stop() return done, err } continue } // Read the length of the next chunk and reset the // reader. The length is used to limit the reader. // // See writer.flush. if _, err := io.ReadFull(r.in, r.scratch[:4]); err != nil { // This is generally okay as long as there // are still buffers outstanding. We actually // just wait for completion of those buffers here // and continue our loop. if err := r.schedule(nil, callback); err == nil { // We've actually finished all buffers; this is // the normal EOF exit path. defer r.stop() return done, io.EOF } else if err == errNewBuffer { // A new buffer is now available. continue } else { // Some other error occurred; we cannot // process any further. defer r.stop() return done, err } } l := binary.BigEndian.Uint32(r.scratch[:4]) // Read this chunk and schedule decompression. compressed := bufPool.Get().(*bytes.Buffer) if _, err := io.CopyN(compressed, r.in, int64(l)); err != nil { // Some other error occurred; see above. 
if err == io.EOF { err = io.ErrUnexpectedEOF } return done, err } var sum []byte if r.hashPool != nil { sum = make([]byte, len(r.lastSum)) if _, err := io.ReadFull(r.in, sum); err != nil { if err == io.EOF { err = io.ErrUnexpectedEOF } return done, err } } // Are we doing inline decoding? // // Note that we need to check the length here against // bytes.MinRead, since the bytes library will choose to grow // the slice if the available capacity is not at least // bytes.MinRead. This limits inline decoding to chunkSizes // that are at least bytes.MinRead (which is not unreasonable). var c *chunk start := done + ((pendingPre + pendingInline) * int(r.chunkSize)) if len(p) >= start+int(r.chunkSize) && len(p) >= start+bytes.MinRead { c = newChunk(r.lastSum, sum, compressed, bytes.NewBuffer(p[start:start])) pendingInline++ } else { c = newChunk(r.lastSum, sum, compressed, nil) } r.lastSum = sum if err := r.schedule(c, callback); err == errNewBuffer { // A new buffer was completed while we were reading. // That's great, but we need to force schedule the // current buffer so that it does not get lost. // // It is safe to pass nil as an output function here, // because we know that we just freed up a slot above. r.schedule(c, nil) } else if err != nil { // Some other error occurred; see above. defer r.stop() return done, err } } // Make sure that everything has been decoded successfully, otherwise // parts of p may not actually have completed. for pendingInline > 0 { if err := r.schedule(nil, func(c *chunk) error { if err := callback(c); err != nil { return err } // The nil case means that an inline buffer has // completed. The callback will have already removed // the inline buffer from the map, so we just return an // error to check the top of the loop again. return errNewBuffer }); err != errNewBuffer { // Some other error occurred; see above. return done, err } } // Need to return done here, since it may have been adjusted by the // callback to compensation for partial reads on some inline buffer. return done, nil } // Writer is a compressed writer. type Writer struct { pool // out is the underlying writer. out io.Writer // closed indicates whether the file has been closed. closed bool // scratch is a temporary buffer used for marshalling. This is declared // unfront here to avoid reallocation. scratch [4]byte } var _ io.Writer = (*Writer)(nil) // NewWriter returns a new compressed writer. If key is non-nil, hash values are // generated and written out for compressed bytes. See package comments for // details. // // The recommended chunkSize is on the order of 1M. Extra memory may be // buffered (in the form of read-ahead, or buffered writes), and is limited to // O(chunkSize * [1+GOMAXPROCS]). func NewWriter(out io.Writer, key []byte, chunkSize uint32, level int) (*Writer, error) { w := &Writer{ pool: pool{ chunkSize: chunkSize, buf: bufPool.Get().(*bytes.Buffer), }, out: out, } w.init(key, 1+runtime.GOMAXPROCS(0), true, level) binary.BigEndian.PutUint32(w.scratch[:], chunkSize) if _, err := w.out.Write(w.scratch[:4]); err != nil { return nil, err } if w.hashPool != nil { h := w.hashPool.getHash() binary.BigEndian.PutUint32(w.scratch[:], chunkSize) h.Write(w.scratch[:4]) w.lastSum = h.Sum(nil) w.hashPool.putHash(h) if _, err := io.CopyN(w.out, bytes.NewReader(w.lastSum), int64(len(w.lastSum))); err != nil { return nil, err } } return w, nil } // flush writes a single buffer. 
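// Illustrative round-trip sketch (added for exposition; not part of the
// original source), using only the exported API defined in this file. The
// key, chunk size and payload are arbitrary example values; error handling is
// elided.
//
//	var buf bytes.Buffer
//	key := []byte("integrity key")
//	w, _ := compressio.NewWriter(&buf, key, 1024*1024, flate.BestSpeed)
//	_, _ = w.Write(payload)
//	_ = w.Close()
//
//	r, _ := compressio.NewReader(&buf, key)
//	out, _ := io.ReadAll(r)
//	// out now equals payload; a tampered stream yields ErrHashMismatch instead.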
func (w *Writer) flush(c *chunk) error { // Prefix each chunk with a length; this allows the reader to safely // limit reads while buffering. l := uint32(c.compressed.Len()) binary.BigEndian.PutUint32(w.scratch[:], l) if _, err := w.out.Write(w.scratch[:4]); err != nil { return err } // Write out to the stream. if _, err := io.CopyN(w.out, c.compressed, int64(c.compressed.Len())); err != nil { return err } if w.hashPool != nil { io.CopyN(c.h, bytes.NewReader(w.lastSum), int64(len(w.lastSum))) sum := c.h.Sum(nil) w.hashPool.putHash(c.h) c.h = nil if _, err := io.CopyN(w.out, bytes.NewReader(sum), int64(len(sum))); err != nil { return err } w.lastSum = sum } return nil } // Write implements io.Writer.Write. func (w *Writer) Write(p []byte) (int, error) { w.mu.Lock() defer w.mu.Unlock() // Did we close already? if w.closed { return 0, io.ErrUnexpectedEOF } // See above; we need to track in the same way. var ( pendingPre = w.nextInput - w.nextOutput pendingInline = 0 ) callback := func(c *chunk) error { if pendingPre > 0 { pendingPre-- err := w.flush(c) c.uncompressed.Reset() bufPool.Put(c.uncompressed) return err } if pendingInline > 0 { pendingInline-- return w.flush(c) } panic("both pendingPre and pendingInline exhausted") } for done := 0; done < len(p); { // Construct an inline buffer if we're doing an inline // encoding; see above regarding the bytes.MinRead constraint. inline := false if w.buf.Len() == 0 && len(p) >= done+int(w.chunkSize) && len(p) >= done+bytes.MinRead { bufPool.Put(w.buf) // Return to the pool; never scheduled. w.buf = bytes.NewBuffer(p[done : done+int(w.chunkSize)]) done += int(w.chunkSize) pendingInline++ inline = true } // Do we need to flush w.buf? Note that this case should be hit // immediately following the inline case above. left := int(w.chunkSize) - w.buf.Len() if left == 0 { if err := w.schedule(newChunk(nil, nil, nil, w.buf), callback); err != nil { return done, err } if !inline { pendingPre++ } // Reset the buffer, since this has now been scheduled // for compression. Note that this may be trampled // immediately by the bufPool.Put(w.buf) above if the // next buffer happens to be inline, but that's okay. w.buf = bufPool.Get().(*bytes.Buffer) continue } // Read from p into w.buf. toWrite := len(p) - done if toWrite > left { toWrite = left } n, err := w.buf.Write(p[done : done+toWrite]) done += n if err != nil { return done, err } } // Make sure that everything has been flushed, we can't return until // all the contents from p have been used. for pendingInline > 0 { if err := w.schedule(nil, func(c *chunk) error { if err := callback(c); err != nil { return err } // The flush was successful, return errNewBuffer here // to break from the loop and check the condition // again. return errNewBuffer }); err != errNewBuffer { return len(p), err } } return len(p), nil } // Close implements io.Closer.Close. func (w *Writer) Close() error { w.mu.Lock() defer w.mu.Unlock() // Did we already close? After the call to Close, we always mark as // closed, regardless of whether the flush is successful. if w.closed { return io.ErrUnexpectedEOF } w.closed = true defer w.stop() // Schedule any remaining partial buffer; we pass w.flush directly here // because the final buffer is guaranteed to not be an inline buffer. if w.buf.Len() > 0 { if err := w.schedule(newChunk(nil, nil, nil, w.buf), w.flush); err != nil { return err } } // Flush all scheduled buffers; see above. if err := w.schedule(nil, w.flush); err != nil { return err } // Close the underlying writer (if necessary). 
if closer, ok := w.out.(io.Closer); ok { return closer.Close() } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/compressio/compressio_state_autogen.go000066400000000000000000000000741465435605700272440ustar00rootroot00000000000000// automatically generated by stateify. package compressio golang-gvisor-gvisor-0.0~20240729.0/pkg/compressio/nocompressio.go000066400000000000000000000134241465435605700246620ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package compressio import ( "bufio" "bytes" "crypto/hmac" "crypto/sha256" "encoding/binary" "hash" "io" ) // nocompressio provides data storage that does not use data compression but // offers optional data integrity via SHA-256 hashing. // // The stream format is defined as follows. // // /------------------------------------------------------\ // | data size (4-bytes) | // +------------------------------------------------------+ // | data | // +------------------------------------------------------+ // | (optional) hash (32-bytes) | // +------------------------------------------------------+ // | data size (4-bytes) | // +------------------------------------------------------+ // | ...... | // \------------------------------------------------------/ // // where each hash is calculated from the following items in order // // data // data size // SimpleReader is a reader from uncompressed image. type SimpleReader struct { // in is the source. in io.Reader // key is the key used to create hash objects. key []byte // h is the hash object. h hash.Hash // current data chunk size chunkSize uint32 // current chunk position done uint32 } var _ io.Reader = (*SimpleReader)(nil) const ( defaultBufSize = 256 * 1024 ) // NewSimpleReader returns a new (uncompressed) reader. If key is non-nil, the data stream // is assumed to contain expected hash values. See package comments for // details. func NewSimpleReader(in io.Reader, key []byte) (*SimpleReader, error) { r := &SimpleReader{ in: bufio.NewReaderSize(in, defaultBufSize), key: key, } if key != nil { r.h = hmac.New(sha256.New, key) } return r, nil } // Read implements io.Reader.Read. func (r *SimpleReader) Read(p []byte) (int, error) { var scratch [4]byte if len(p) == 0 { return r.in.Read(p) } // need next chunk? if r.done >= r.chunkSize { if _, err := io.ReadFull(r.in, scratch[:]); err != nil { return 0, err } r.chunkSize = binary.BigEndian.Uint32(scratch[:]) r.done = 0 if r.key != nil { r.h.Reset() } if r.chunkSize == 0 { // this must not happen return 0, io.ErrNoProgress } } toRead := uint32(len(p)) // can't read more than what's left if toRead > r.chunkSize-r.done { toRead = r.chunkSize - r.done } n, err := r.in.Read(p[:toRead]) if err != nil { if err == io.EOF { // this only can happen if storage or data size is corrupted, // but we have no other means to detect it earlier as we store // hash after the data block. 
return n, ErrHashMismatch } return n, err } if r.key != nil { _, _ = r.h.Write(p[:n]) } r.done += uint32(n) if r.done >= r.chunkSize { if r.key != nil { binary.BigEndian.PutUint32(scratch[:], r.chunkSize) r.h.Write(scratch[:4]) sum := r.h.Sum(nil) readerSum := make([]byte, len(sum)) if _, err := io.ReadFull(r.in, readerSum); err != nil { if err == io.EOF { return n, io.ErrUnexpectedEOF } return n, err } if !hmac.Equal(readerSum, sum) { return n, ErrHashMismatch } } r.done = 0 r.chunkSize = 0 } return n, nil } // SimpleWriter is a writer that does not compress. type SimpleWriter struct { // base is the underlying writer. base io.Writer // out is a buffered writer. out *bufio.Writer // key is the key used to create hash objects. key []byte // closed indicates whether the file has been closed. closed bool } var _ io.Writer = (*SimpleWriter)(nil) var _ io.Closer = (*SimpleWriter)(nil) // NewSimpleWriter returns a new non-compressing writer. If key is non-nil, hash values are // generated and written out for compressed bytes. See package comments for // details. func NewSimpleWriter(out io.Writer, key []byte) (*SimpleWriter, error) { return &SimpleWriter{ base: out, out: bufio.NewWriterSize(out, defaultBufSize), key: key, }, nil } // Write implements io.Writer.Write. func (w *SimpleWriter) Write(p []byte) (int, error) { var scratch [4]byte // Did we close already? if w.closed { return 0, io.ErrUnexpectedEOF } l := uint32(len(p)) // chunk length binary.BigEndian.PutUint32(scratch[:], l) if _, err := w.out.Write(scratch[:4]); err != nil { return 0, err } // Write out to the stream. n, err := w.out.Write(p) if err != nil { return n, err } if w.key != nil { h := hmac.New(sha256.New, w.key) // chunk data _, _ = h.Write(p) // chunk length binary.BigEndian.PutUint32(scratch[:], l) h.Write(scratch[:4]) sum := h.Sum(nil) if _, err := io.CopyN(w.out, bytes.NewReader(sum), int64(len(sum))); err != nil { return n, err } } return n, nil } // Close implements io.Closer.Close. func (w *SimpleWriter) Close() error { // Did we already close? After the call to Close, we always mark as // closed, regardless of whether the flush is successful. if w.closed { return io.ErrUnexpectedEOF } w.closed = true // Flush buffered writer if err := w.out.Flush(); err != nil { return err } // Close the underlying writer (if necessary). if closer, ok := w.base.(io.Closer); ok { return closer.Close() } w.out = nil w.base = nil return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/context/000077500000000000000000000000001465435605700211105ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/context/context.go000066400000000000000000000156371465435605700231370ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package context defines an internal context type. // // The given Context conforms to the standard Go context, but mandates // additional methods that are specific to the kernel internals. 
Note however, // that the Context described by this package carries additional constraints // regarding concurrent access and retaining beyond the scope of a call. // // See the Context type for complete details. package context import ( "context" "errors" "sync" "time" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/waiter" ) // Blocker represents an object with control flow hooks. // // These may be used to perform blocking operations, sleep or otherwise // wait, since there may be asynchronous events that require processing. type Blocker interface { // Interrupt interrupts any Block operations. Interrupt() // Interrupted notes whether this context is Interrupted. Interrupted() bool // BlockOn blocks until one of the previously registered events occurs, // or some external interrupt (cancellation). // // The return value should indicate whether the wake-up occurred as a // result of the requested event (versus an external interrupt). BlockOn(waiter.Waitable, waiter.EventMask) bool // Block blocks until an event is received from C, or some external // interrupt. It returns nil if an event is received from C and an err if t // is interrupted. Block(C <-chan struct{}) error // BlockWithTimeoutOn blocks until either the conditions of Block are // satisfied, or the timeout is hit. Note that deadlines are not supported // since the notion of "with respect to what clock" is not resolved. // // The return value is per BlockOn. BlockWithTimeoutOn(waiter.Waitable, waiter.EventMask, time.Duration) (time.Duration, bool) // UninterruptibleSleepStart indicates the beginning of an uninterruptible // sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate // is true and the Context represents a Task, the Task's AddressSpace is // deactivated. UninterruptibleSleepStart(deactivate bool) // UninterruptibleSleepFinish indicates the end of an uninterruptible sleep // state that was begun by a previous call to UninterruptibleSleepStart. If // activate is true and the Context represents a Task, the Task's // AddressSpace is activated. Normally activate is the same value as the // deactivate parameter passed to UninterruptibleSleepStart. UninterruptibleSleepFinish(activate bool) } // NoTask is an implementation of Blocker that does not block. type NoTask struct { cancel chan struct{} } // Interrupt implements Blocker.Interrupt. func (nt *NoTask) Interrupt() { select { case nt.cancel <- struct{}{}: default: } } // Interrupted implements Blocker.Interrupted. func (nt *NoTask) Interrupted() bool { return nt.cancel != nil && len(nt.cancel) > 0 } // Block implements Blocker.Block. func (nt *NoTask) Block(C <-chan struct{}) error { if nt.cancel == nil { nt.cancel = make(chan struct{}, 1) } select { case <-nt.cancel: return errors.New("interrupted system call") // Interrupted. case <-C: return nil } } // BlockOn implements Blocker.BlockOn. func (nt *NoTask) BlockOn(w waiter.Waitable, mask waiter.EventMask) bool { if nt.cancel == nil { nt.cancel = make(chan struct{}, 1) } e, ch := waiter.NewChannelEntry(mask) w.EventRegister(&e) defer w.EventUnregister(&e) select { case <-nt.cancel: return false // Interrupted. case _, ok := <-ch: return ok } } // BlockWithTimeoutOn implements Blocker.BlockWithTimeoutOn. 
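// Illustrative sketch of the Blocker contract (added for exposition; not part
// of the original source), using the NoTask implementation above. Block waits
// for the channel unless Interrupt fires first; a non-nil error means the
// wait was interrupted rather than completed. doWork is a placeholder for
// whatever produces the event.
//
//	var nt context.NoTask // gVisor's context package, not the standard library's
//	done := make(chan struct{}, 1)
//	go func() { doWork(); done <- struct{}{} }()
//	if err := nt.Block(done); err != nil {
//		// Interrupted before the event arrived.
//	}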
func (nt *NoTask) BlockWithTimeoutOn(w waiter.Waitable, mask waiter.EventMask, duration time.Duration) (time.Duration, bool) { if nt.cancel == nil { nt.cancel = make(chan struct{}, 1) } e, ch := waiter.NewChannelEntry(mask) w.EventRegister(&e) defer w.EventUnregister(&e) start := time.Now() // In system time. t := time.AfterFunc(duration, func() { ch <- struct{}{} }) select { case <-nt.cancel: return time.Since(start), false // Interrupted. case _, ok := <-ch: if ok && t.Stop() { // Timer never fired. return time.Since(start), ok } // Timer fired, remain is zero. return time.Duration(0), ok } } // UninterruptibleSleepStart implmenents Blocker.UninterruptedSleepStart. func (*NoTask) UninterruptibleSleepStart(bool) {} // UninterruptibleSleepFinish implmenents Blocker.UninterruptibleSleepFinish. func (*NoTask) UninterruptibleSleepFinish(bool) {} // Context represents a thread of execution (hereafter "goroutine" to reflect // Go idiosyncrasy). It carries state associated with the goroutine across API // boundaries. // // While Context exists for essentially the same reasons as Go's standard // context.Context, the standard type represents the state of an operation // rather than that of a goroutine. This is a critical distinction: // // - Unlike context.Context, which "may be passed to functions running in // different goroutines", it is *not safe* to use the same Context in multiple // concurrent goroutines. // // - It is *not safe* to retain a Context passed to a function beyond the scope // of that function call. // // In both cases, values extracted from the Context should be used instead. type Context interface { context.Context log.Logger Blocker } // logContext implements basic logging. type logContext struct { NoTask log.Logger context.Context } // bgContext is the context returned by context.Background. var bgContext Context var bgOnce sync.Once // Background returns an empty context using the default logger. // Generally, one should use the Task as their context when available, or avoid // having to use a context in places where a Task is unavailable. // // Using a Background context for tests is fine, as long as no values are // needed from the context in the tested code paths. // // The global log.SetTarget() must be called before context.Background() func Background() Context { bgOnce.Do(func() { bgContext = &logContext{ Context: context.Background(), Logger: log.Log(), } }) return bgContext } // WithValue returns a copy of parent in which the value associated with key is // val. func WithValue(parent Context, key, val any) Context { return &withValue{ Context: parent, key: key, val: val, } } type withValue struct { Context key any val any } // Value implements Context.Value. func (ctx *withValue) Value(key any) any { if key == ctx.key { return ctx.val } return ctx.Context.Value(key) } golang-gvisor-gvisor-0.0~20240729.0/pkg/context/context_state_autogen.go000066400000000000000000000000711465435605700260430ustar00rootroot00000000000000// automatically generated by stateify. package context golang-gvisor-gvisor-0.0~20240729.0/pkg/control/000077500000000000000000000000001465435605700211045ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/control/client/000077500000000000000000000000001465435605700223625ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/control/client/client.go000066400000000000000000000020221465435605700241630ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package client provides a basic control client interface. package client import ( "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/urpc" ) // ConnectTo attempts to connect to the sandbox with the given address. func ConnectTo(addr string) (*urpc.Client, error) { // Connect to the server. conn, err := unet.Connect(addr, false) if err != nil { return nil, err } // Wrap in our stream codec. return urpc.NewClient(conn), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/control/client/client_state_autogen.go000066400000000000000000000000701465435605700271060ustar00rootroot00000000000000// automatically generated by stateify. package client golang-gvisor-gvisor-0.0~20240729.0/pkg/control/server/000077500000000000000000000000001465435605700224125ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/control/server/server.go000066400000000000000000000117261465435605700242560ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. /* Package server provides a basic control server interface. Note that no objects are registered by default. Users must provide their own implementations of the control interface. */ package server import ( "fmt" "os" "path/filepath" "sync/atomic" "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/urpc" ) // curUID is the unix user ID of the user that the control server is running as. var curUID = os.Getuid() // Server is a basic control server. type Server struct { // socket is our bound socket. socket *unet.ServerSocket // server is our rpc server. server atomic.Pointer[urpc.Server] // wg waits for the accept loop to terminate. wg sync.WaitGroup } // New returns a new bound control server. func New(socket *unet.ServerSocket) *Server { s := &Server{ socket: socket, } s.server.Store(urpc.NewServer()) return s } // ResetServer resets the server, clearing all registered objects. It stops the // old server asynchronously. func (s *Server) ResetServer() { if old := s.server.Swap(urpc.NewServer()); old != nil { go old.Stop(0) } } // FD returns the file descriptor that the server is running on. func (s *Server) FD() int { return s.socket.FD() } // Wait waits for the main server goroutine to exit. This should be // called after a call to Serve. func (s *Server) Wait() { s.wg.Wait() } // Stop stops the server. 
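// Illustrative lifecycle sketch for the control server (added for exposition;
// not part of the original source): create a server on an abstract socket,
// register a controller object, serve, and eventually stop. The socket name
// and myController are placeholders; error handling is elided.
//
//	srv, _ := server.Create("\x00gvisor-ctrl") // example abstract socket name
//	srv.Register(myController)                 // exposes its methods over urpc
//	_ = srv.StartServing()                     // non-blocking; use srv.Wait() to block
//	...
//	srv.Stop(5 * time.Second)                  // closes the socket and stops handling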
Note that this function should only be called once // and the server should not be used afterwards. func (s *Server) Stop(timeout time.Duration) { s.socket.Close() s.Wait() // This will cause existing clients to be terminated safely. If the // registered handlers have a Stop callback, it will be called. s.server.Load().Stop(timeout) } // StartServing starts listening for connect and spawns the main service // goroutine for handling incoming control requests. StartServing does not // block; to wait for the control server to exit, call Wait. func (s *Server) StartServing() error { // Actually start listening. if err := s.socket.Listen(); err != nil { return err } s.wg.Add(1) go func() { // S/R-SAFE: does not impact state directly. s.serve() s.wg.Done() }() return nil } // serve is the body of the main service goroutine. It handles incoming control // connections and dispatches requests to registered objects. func (s *Server) serve() { for { // Accept clients. conn, err := s.socket.Accept() if err != nil { return } // Handle the connection non-blockingly. s.server.Load().StartHandling(conn) } } // Register registers a specific control interface with the server. func (s *Server) Register(obj any) { s.server.Load().Register(obj) } // CreateFromFD creates a new control bound to the given 'fd'. It has no // registered interfaces and will not start serving until StartServing is // called. func CreateFromFD(fd int) (*Server, error) { socket, err := unet.NewServerSocket(fd) if err != nil { return nil, err } return New(socket), nil } // Create creates a new control server with an abstract unix socket // with the given address, which must must be unique and a valid // abstract socket name. func Create(addr string) (*Server, error) { socket, err := CreateSocket(addr) if err != nil { return nil, err } return CreateFromFD(socket) } // CreateSocket creates a socket that can be used with control server, // but doesn't start control server. 'addr' must be a valid and unique // abstract socket name. Returns socket's FD, -1 in case of error. func CreateSocket(addr string) (int, error) { if addr[0] != 0 && len(addr) >= linux.UnixPathMax { // This is not an abstract socket path. It is a filesystem path. // UDS bind fails when the len(socket path) >= UNIX_PATH_MAX. Instead // try opening the parent and attempt to shorten the path via procfs. dirFD, err := unix.Open(filepath.Dir(addr), unix.O_RDONLY|unix.O_DIRECTORY, 0) if err != nil { return -1, fmt.Errorf("failed to open parent directory of %q", addr) } defer unix.Close(dirFD) name := filepath.Base(addr) addr = fmt.Sprintf("/proc/self/fd/%d/%s", dirFD, name) if len(addr) >= linux.UnixPathMax { // Urgh... This is just doomed to fail. Ask caller to use a shorter name. return -1, fmt.Errorf("socket name %q is too long, use a shorter name", name) } } socket, err := unet.Bind(addr, false) if err != nil { return -1, err } return socket.Release() } golang-gvisor-gvisor-0.0~20240729.0/pkg/control/server/server_state_autogen.go000066400000000000000000000000701465435605700271660ustar00rootroot00000000000000// automatically generated by stateify. package server golang-gvisor-gvisor-0.0~20240729.0/pkg/coretag/000077500000000000000000000000001465435605700210505ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/coretag/coretag.go000066400000000000000000000052351465435605700230300ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package coretag implements core tagging. package coretag import ( "fmt" "io/ioutil" "strconv" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) // Enable core tagging. If this returns with no error, all threads in the // current thread group will be run in a core tagged thread. Only available on // linux kernel >= 5.14. func Enable() error { // Set core tag on current thread group. // prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, pid=0, // PR_SCHED_CORE_SCOPE_THREAD_GROUP, cookie=nullptr) // pid=0 means current pid. // cookie=nullptr is required for PR_SCHED_CORE_CREATE. if _, _, errno := unix.Syscall6(unix.SYS_PRCTL, unix.PR_SCHED_CORE, unix.PR_SCHED_CORE_CREATE, 0 /*pid*/, linux.PR_SCHED_CORE_SCOPE_THREAD_GROUP, 0, 0); errno != 0 { return fmt.Errorf("failed to core tag sentry: %w", errno) } return nil } // GetAllCoreTags returns the core tag of all the threads in the thread group. func GetAllCoreTags(pid int) ([]uint64, error) { // prctl(PR_SCHED_CORE_GET, PR_SCHED_CORE_SCOPE_THREAD_GROUP, ...) is not supported // in linux. So instead we get all threads from /proc//task and get all the // core tags individually. tagSet := make(map[uint64]struct{}) // Get current pid core tag. tag, err := getCoreTag(pid) if err != nil { return nil, err } tagSet[tag] = struct{}{} // Get core tags of tids. tids, err := getTids(pid) if err != nil { return nil, err } for tid := range tids { tag, err := getCoreTag(tid) if err != nil { return nil, err } tagSet[tag] = struct{}{} } // Return set of tags as a slice. tags := make([]uint64, 0, len(tagSet)) for t := range tagSet { tags = append(tags, t) } return tags, nil } // getTids returns set of tids as reported by /proc//task. func getTids(pid int) (map[int]struct{}, error) { tids := make(map[int]struct{}) files, err := ioutil.ReadDir("/proc/" + strconv.Itoa(pid) + "/task") if err != nil { return nil, err } for _, file := range files { tid, err := strconv.Atoi(file.Name()) if err != nil { return nil, err } tids[tid] = struct{}{} } return tids, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/coretag/coretag_state_autogen.go000066400000000000000000000000711465435605700257430ustar00rootroot00000000000000// automatically generated by stateify. package coretag golang-gvisor-gvisor-0.0~20240729.0/pkg/coretag/coretag_unsafe.go000066400000000000000000000022531465435605700243660ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
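// Illustrative usage sketch (added for exposition; not part of the original
// source): enable core tagging for the current thread group and confirm that
// all of its threads share a single core-scheduling cookie. Requires Linux
// 5.14 or later; error handling is elided.
//
//	if err := coretag.Enable(); err != nil {
//		// Kernel too old or prctl refused; core tagging is unavailable.
//	}
//	tags, _ := coretag.GetAllCoreTags(os.Getpid())
//	// With core tagging enabled, tags is expected to contain exactly one value.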
package coretag import ( "fmt" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) // getCoreTag returns the core tag of the tid. Only available on linux kernel >= 5.14. func getCoreTag(tid int) (uint64, error) { var cookie uint64 if _, _, errno := unix.Syscall6(unix.SYS_PRCTL, unix.PR_SCHED_CORE, unix.PR_SCHED_CORE_GET, uintptr(tid), linux.PR_SCHED_CORE_SCOPE_THREAD, uintptr(unsafe.Pointer(&cookie)), 0); errno != 0 { return 0, fmt.Errorf("prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, %d, PR_SCHED_CORE_SCOPE_THREAD) (errno=%d)", tid, errno) } return cookie, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/coretag/coretag_unsafe_state_autogen.go000066400000000000000000000000711465435605700273040ustar00rootroot00000000000000// automatically generated by stateify. package coretag golang-gvisor-gvisor-0.0~20240729.0/pkg/coverage/000077500000000000000000000000001465435605700212175ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/coverage/coverage.go000066400000000000000000000243171465435605700233500ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false // Package coverage provides an interface through which Go coverage data can // be collected, converted to kcov format, and exposed to userspace. // // Coverage can be enabled by calling bazel {build,test} with // --collect_coverage_data and --instrumentation_filter with the desired // coverage surface. This causes bazel to use the Go cover tool manually to // generate instrumented files. It injects a hook that registers all coverage // data with the coverdata package. // // Using coverdata.Counters requires sync/atomic integers. // +checkalignedignore package coverage import ( "fmt" "io" "sort" "sync/atomic" "testing" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sync" "github.com/bazelbuild/rules_go/go/tools/coverdata" ) var ( // coverageMu must be held while accessing coverdata.*. This prevents // concurrent reads/writes from multiple threads collecting coverage data. coverageMu sync.RWMutex // reportOutput is the place to write out a coverage report. It should be // closed after the report is written. It is protected by reportOutputMu. reportOutput io.WriteCloser reportOutputMu sync.Mutex ) // blockBitLength is the number of bits used to represent coverage block index // in a synthetic PC (the rest are used to represent the file index). Even // though a PC has 64 bits, we only use the lower 32 bits because some users // (e.g., syzkaller) may truncate that address to a 32-bit value. // // As of this writing, there are ~1200 files that can be instrumented and at // most ~1200 blocks per file, so 16 bits is more than enough to represent every // file and every block. const blockBitLength = 16 // Available returns whether any coverage data is available. func Available() bool { return len(coverdata.Blocks) > 0 } // EnableReport sets up coverage reporting. 
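// Illustrative sketch (added for exposition; not part of the original
// source): wiring up a coverage report destination, or streaming kcov-style
// synthetic PCs, using only functions defined in this file. reportFile is a
// placeholder io.WriteCloser; the two modes are mutually exclusive, since
// KcovSupported returns false once a report output is set.
//
//	if coverage.Available() {
//		coverage.EnableReport(reportFile) // written out when coverage.Report() runs
//	}
//
//	// Alternatively, consume covered blocks as 8-byte synthetic PCs:
//	var pcs bytes.Buffer
//	n := coverage.ConsumeCoverageData(&pcs) // n bytes written; counters are reset
//	_ = n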
func EnableReport(w io.WriteCloser) { reportOutputMu.Lock() defer reportOutputMu.Unlock() reportOutput = w } // KcovSupported returns whether the kcov interface should be made available. // // If coverage reporting is on, do not turn on kcov, which will consume // coverage data. func KcovSupported() bool { return (reportOutput == nil) && Available() } var globalData struct { // files is the set of covered files sorted by filename. It is calculated at // startup. files []string // syntheticPCs are a set of PCs calculated at startup, where the PC // at syntheticPCs[i][j] corresponds to file i, block j. syntheticPCs [][]uint64 // once ensures that globalData is only initialized once. once sync.Once } // ClearCoverageData clears existing coverage data. // //go:norace func ClearCoverageData() { coverageMu.Lock() defer coverageMu.Unlock() // We do not use atomic operations while reading/writing to the counters, // which would drastically degrade performance. Slight discrepancies due to // racing is okay for the purposes of kcov. for _, counters := range coverdata.Counters { clear(counters) } } var coveragePool = sync.Pool{ New: func() any { return make([]byte, 0) }, } // ConsumeCoverageData builds and writes the collection of covered PCs. It // returns the number of bytes written. // // In Linux, a kernel configuration is set that compiles the kernel with a // custom function that is called at the beginning of every basic block, which // updates the memory-mapped coverage information. The Go coverage tool does not // allow us to inject arbitrary instructions into basic blocks, but it does // provide data that we can convert to a kcov-like format and transfer them to // userspace through a memory mapping. // // Note that this is not a strict implementation of kcov, which is especially // tricky to do because we do not have the same coverage tools available in Go // that that are available for the actual Linux kernel. In Linux, a kernel // configuration is set that compiles the kernel with a custom function that is // called at the beginning of every basic block to write program counters to the // kcov memory mapping. In Go, however, coverage tools only give us a count of // basic blocks as they are executed. Every time we return to userspace, we // collect the coverage information and write out PCs for each block that was // executed, providing userspace with the illusion that the kcov data is always // up to date. For convenience, we also generate a unique synthetic PC for each // block instead of using actual PCs. Finally, we do not provide thread-specific // coverage data (each kcov instance only contains PCs executed by the thread // owning it); instead, we will supply data for any file specified by -- // instrumentation_filter. // // Note that we "consume", i.e. clear, coverdata when this function is run, to // ensure that each event is only reported once. Due to the limitations of Go // coverage tools, we reset the global coverage data every time this function is // run. // //go:norace func ConsumeCoverageData(w io.Writer) int { InitCoverageData() coverageMu.Lock() defer coverageMu.Unlock() total := 0 var pcBuffer [8]byte for fileNum, file := range globalData.files { counters := coverdata.Counters[file] for index := 0; index < len(counters); index++ { // We do not use atomic operations while reading/writing to the counters, // which would drastically degrade performance. Slight discrepancies due to // racing is okay for the purposes of kcov. 
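// Added commentary (not in the original source): each consumed counter is
// reported to userspace as a synthetic PC of the form
// (fileNum << blockBitLength) | blockNum. With blockBitLength == 16, a hit
// in file index 3, block index 7 would be emitted as 0x30007; see
// calculateSyntheticPC below. The indices here are made-up examples.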
if counters[index] == 0 { continue } // Non-zero coverage data found; consume it and report as a PC. counters[index] = 0 pc := globalData.syntheticPCs[fileNum][index] hostarch.ByteOrder.PutUint64(pcBuffer[:], pc) n, err := w.Write(pcBuffer[:]) if err != nil { if err == io.EOF { // Simply stop writing if we encounter EOF; it's ok if we attempted to // write more than we can hold. return total + n } panic(fmt.Sprintf("Internal error writing PCs to kcov area: %v", err)) } total += n } } return total } // InitCoverageData initializes globalData. It should be called before any kcov // data is written. func InitCoverageData() { globalData.once.Do(func() { // First, order all files. Then calculate synthetic PCs for every block // (using the well-defined ordering for files as well). for file := range coverdata.Blocks { globalData.files = append(globalData.files, file) } sort.Strings(globalData.files) for fileNum, file := range globalData.files { blocks := coverdata.Blocks[file] pcs := make([]uint64, 0, len(blocks)) for blockNum := range blocks { pcs = append(pcs, calculateSyntheticPC(fileNum, blockNum)) } globalData.syntheticPCs = append(globalData.syntheticPCs, pcs) } }) } // reportOnce ensures that a coverage report is written at most once. For a // complete coverage report, Report should be called during the sandbox teardown // process. Report is called from multiple places (which may overlap) so that a // coverage report is written in different sandbox exit scenarios. var reportOnce sync.Once // Report writes out a coverage report with all blocks that have been covered. // // TODO(b/144576401): Decide whether this should actually be in LCOV format func Report() error { if reportOutput == nil { return nil } var err error reportOnce.Do(func() { for file, counters := range coverdata.Counters { blocks := coverdata.Blocks[file] for i := 0; i < len(counters); i++ { if atomic.LoadUint32(&counters[i]) > 0 { err = writeBlock(reportOutput, file, blocks[i]) if err != nil { return } } } } reportOutput.Close() }) return err } // Symbolize prints information about the block corresponding to pc. func Symbolize(out io.Writer, pc uint64) error { fileNum, blockNum := syntheticPCToIndexes(pc) file, err := fileFromIndex(fileNum) if err != nil { return err } block, err := blockFromIndex(file, blockNum) if err != nil { return err } return writeBlockWithPC(out, pc, file, block) } // WriteAllBlocks prints all information about all blocks along with their // corresponding synthetic PCs. 
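// Added commentary: the output format sketched below is inferred from
// writeBlockWithPC and writeBlock further down, with a made-up file name and
// positions. Each block is printed as its synthetic PC in hex, followed by
// "file:startLine.startCol,endLine.endCol":
//
//	0x30007
//	pkg/foo/foo.go:10.2,12.16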
func WriteAllBlocks(out io.Writer) error { for fileNum, file := range globalData.files { for blockNum, block := range coverdata.Blocks[file] { if err := writeBlockWithPC(out, calculateSyntheticPC(fileNum, blockNum), file, block); err != nil { return err } } } return nil } func writeBlockWithPC(out io.Writer, pc uint64, file string, block testing.CoverBlock) error { if _, err := io.WriteString(out, fmt.Sprintf("%#x\n", pc)); err != nil { return err } return writeBlock(out, file, block) } func writeBlock(out io.Writer, file string, block testing.CoverBlock) error { _, err := io.WriteString(out, fmt.Sprintf("%s:%d.%d,%d.%d\n", file, block.Line0, block.Col0, block.Line1, block.Col1)) return err } func calculateSyntheticPC(fileNum int, blockNum int) uint64 { return (uint64(fileNum) << blockBitLength) + uint64(blockNum) } func syntheticPCToIndexes(pc uint64) (fileNum int, blockNum int) { return int(pc >> blockBitLength), int(pc & ((1 << blockBitLength) - 1)) } // fileFromIndex returns the name of the file in the sorted list of instrumented files. func fileFromIndex(i int) (string, error) { total := len(globalData.files) if i < 0 || i >= total { return "", fmt.Errorf("file index out of range: [%d] with length %d", i, total) } return globalData.files[i], nil } // blockFromIndex returns the i-th block in the given file. func blockFromIndex(file string, i int) (testing.CoverBlock, error) { blocks, ok := coverdata.Blocks[file] if !ok { return testing.CoverBlock{}, fmt.Errorf("instrumented file %s does not exist", file) } total := len(blocks) if i < 0 || i >= total { return testing.CoverBlock{}, fmt.Errorf("block index out of range: [%d] with length %d", i, total) } return blocks[i], nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/coverage/coverage_state_autogen.go000066400000000000000000000001361465435605700262630ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package coverage golang-gvisor-gvisor-0.0~20240729.0/pkg/cpuid/000077500000000000000000000000001465435605700205305ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/cpuid/cpuid.go000066400000000000000000000174471465435605700222000ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package cpuid provides basic functionality for creating and adjusting CPU // feature sets. // // Each architecture should define its own FeatureSet type, that must be // savable, along with an allFeatures map, appropriate arch hooks and a // HostFeatureSet function. This file contains common functionality to all // architectures, which is essentially string munging and some errors. // // Individual architectures may export methods on FeatureSet that are relevant, // e.g. FeatureSet.Vendor(). Common to all architectures, FeatureSets include // HasFeature, which provides a trivial mechanism to test for the presence of // specific hardware features. The hardware features are also defined on a // per-architecture basis. 
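// Illustrative usage sketch (added commentary, not part of the package
// documentation): a typical caller initializes the package once and then
// queries the host feature set. The specific x86 feature constant below is
// an arbitrary example.
//
//	cpuid.Initialize()
//	fs := cpuid.HostFeatureSet().Fixed()
//	if fs.HasFeature(cpuid.X86FeatureXSAVE) {
//		// XSAVE-based floating point state management is available.
//	}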
package cpuid import ( "encoding/binary" "fmt" "os" "runtime" "strings" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" ) // contextID is the package for anyContext.Context.Value keys. type contextID int const ( // CtxFeatureSet is the FeatureSet for the context. CtxFeatureSet contextID = iota // hardware capability bit vector. _AT_HWCAP = 16 // hardware capability bit vector 2. _AT_HWCAP2 = 26 ) // anyContext represents context.Context. type anyContext interface { Value(key any) any } // FromContext returns the FeatureSet from the context, if available. func FromContext(ctx anyContext) FeatureSet { v := ctx.Value(CtxFeatureSet) if v == nil { return FeatureSet{} // Panics if used. } return v.(FeatureSet) } // Feature is a unique identifier for a particular cpu feature. We just use an // int as a feature number on x86 and arm64. // // On x86, features are numbered according to "blocks". Each block is 32 bits, and // feature bits from the same source (cpuid leaf/level) are in the same block. // // On arm64, features are numbered according to the ELF HWCAP definition, from // arch/arm64/include/uapi/asm/hwcap.h. type Feature int // allFeatureInfo is the value for allFeatures. type allFeatureInfo struct { // displayName is the short display name for the feature. displayName string // shouldAppear indicates whether the feature normally appears in // cpuinfo. This affects FlagString only. shouldAppear bool } // String implements fmt.Stringer.String. func (f Feature) String() string { info, ok := allFeatures[f] if ok { return info.displayName } return fmt.Sprintf("[0x%x?]", int(f)) // No given name. } // reverseMap is a map from displayName to Feature. var reverseMap = func() map[string]Feature { m := make(map[string]Feature) for feature, info := range allFeatures { if info.displayName != "" { // Sanity check that the name is unique. if old, ok := m[info.displayName]; ok { panic(fmt.Sprintf("feature %v has conflicting values (0x%x vs 0x%x)", info.displayName, old, feature)) } m[info.displayName] = feature } } return m }() // FeatureFromString returns the Feature associated with the given feature // string plus a bool to indicate if it could find the feature. func FeatureFromString(s string) (Feature, bool) { feature, ok := reverseMap[s] return feature, ok } // AllFeatures returns the full set of all possible features. func AllFeatures() (features []Feature) { archFlagOrder(func(f Feature) { features = append(features, f) }) return } // Subtract returns the features present in fs that are not present in other. // If all features in fs are present in other, Subtract returns nil. // // This does not check for any kinds of incompatibility. func (fs FeatureSet) Subtract(other FeatureSet) (left map[Feature]struct{}) { for feature := range allFeatures { thisHas := fs.HasFeature(feature) otherHas := other.HasFeature(feature) if thisHas && !otherHas { if left == nil { left = make(map[Feature]struct{}) } left[feature] = struct{}{} } } return } // FlagString prints out supported CPU flags. func (fs FeatureSet) FlagString() string { var s []string archFlagOrder(func(feature Feature) { if !fs.HasFeature(feature) { return } info := allFeatures[feature] if !info.shouldAppear { return } s = append(s, info.displayName) }) return strings.Join(s, " ") } // ErrIncompatible is returned for incompatible feature sets. type ErrIncompatible struct { reason string } // Error implements error.Error. 
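//
// Added commentary: because CheckHostCompatible below returns an
// *ErrIncompatible on mismatch, a caller could distinguish that case with
// errors.As; this snippet is an illustrative assumption, not code from the
// package.
//
//	if err := fs.CheckHostCompatible(); err != nil {
//		var incompat *cpuid.ErrIncompatible
//		if errors.As(err, &incompat) {
//			// The saved FeatureSet cannot run on this host.
//		}
//	}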
func (e *ErrIncompatible) Error() string { return fmt.Sprintf("incompatible FeatureSet: %v", e.reason) } // CheckHostCompatible returns nil if fs is a subset of the host feature set. func (fs FeatureSet) CheckHostCompatible() error { hfs := HostFeatureSet() // Check that hfs is a superset of fs. if diff := fs.Subtract(hfs); len(diff) > 0 { return &ErrIncompatible{ reason: fmt.Sprintf("missing features: %v", diff), } } // Make arch-specific checks. return fs.archCheckHostCompatible(hfs) } // +stateify savable type hwCap struct { // hwCap1 stores HWCAP bits exposed through the elf auxiliary vector. hwCap1 uint64 // hwCap2 stores HWCAP2 bits exposed through the elf auxiliary vector. hwCap2 uint64 } // The auxiliary vector of a process on the Linux system can be read // from /proc/self/auxv, and tags and values are stored as 8-bytes // decimal key-value pairs on the 64-bit system. // // $ od -t d8 /proc/self/auxv // // 0000000 33 140734615224320 // 0000020 16 3219913727 // 0000040 6 4096 // 0000060 17 100 // 0000100 3 94665627353152 // 0000120 4 56 // 0000140 5 9 // 0000160 7 140425502162944 // 0000200 8 0 // 0000220 9 94665627365760 // 0000240 11 1000 // 0000260 12 1000 // 0000300 13 1000 // 0000320 14 1000 // 0000340 23 0 // 0000360 25 140734614619513 // 0000400 26 0 // 0000420 31 140734614626284 // 0000440 15 140734614619529 // 0000460 0 0 func readHWCap(auxvFilepath string) (hwCap, error) { c := hwCap{} if runtime.GOOS != "linux" { // Don't try to read Linux-specific /proc files. return c, fmt.Errorf("readHwCap only supported on linux, not %s", runtime.GOOS) } auxv, err := os.ReadFile(auxvFilepath) if err != nil { return c, fmt.Errorf("failed to read file %s: %w", auxvFilepath, err) } l := len(auxv) / 16 for i := 0; i < l; i++ { tag := binary.LittleEndian.Uint64(auxv[i*16:]) val := binary.LittleEndian.Uint64(auxv[i*16+8:]) if tag == _AT_HWCAP { c.hwCap1 = val } else if tag == _AT_HWCAP2 { c.hwCap2 = val } if (c.hwCap1 != 0) && (c.hwCap2 != 0) { break } } return c, nil } func initHWCap() { c, err := readHWCap("/proc/self/auxv") if err != nil { log.Warningf("cpuid HWCap not initialized: %w", err) } else { hostFeatureSet.hwCap = c } } var initOnce sync.Once // Initialize initializes the global data structures used by this package. // Must be called prior to using anything else in this package. func Initialize() { initOnce.Do(archInitialize) } golang-gvisor-gvisor-0.0~20240729.0/pkg/cpuid/cpuid_amd64.go000066400000000000000000000332231465435605700231610ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package cpuid import ( "context" "fmt" "io" ) // FeatureSet defines features in terms of CPUID leaves and bits. // The kernel also exposes the presence of features to userspace through // a set of flags(HWCAP/HWCAP2) bits, exposed in the auxiliary vector, which // are necessary to read for some features (e.g. FSGSBASE). 
// // Common references: // // Intel: // - Intel SDM Volume 2, Chapter 3.2 "CPUID" (more up-to-date) // - Intel Application Note 485 (more detailed) // // AMD: // - AMD64 APM Volume 3, Appendix 3 "Obtaining Processor Information ..." // // +stateify savable type FeatureSet struct { // Function is the underlying CPUID Function. // // This is exported to allow direct calls of the underlying CPUID // function, where required. Function `state:".(Static)"` // hwCap stores HWCAP1/2 exposed from the elf auxiliary vector. hwCap hwCap } // saveFunction saves the function as a static query. func (fs *FeatureSet) saveFunction() Static { if s, ok := fs.Function.(Static); ok { return s } return fs.ToStatic() } // loadFunction saves the function as a static query. func (fs *FeatureSet) loadFunction(_ context.Context, s Static) { fs.Function = s } // Helper to convert 3 regs into 12-byte vendor ID. // //go:nosplit func vendorIDFromRegs(bx, cx, dx uint32) (r [12]byte) { for i := uint(0); i < 4; i++ { b := byte(bx >> (i * 8)) r[i] = b } for i := uint(0); i < 4; i++ { b := byte(dx >> (i * 8)) r[4+i] = b } for i := uint(0); i < 4; i++ { b := byte(cx >> (i * 8)) r[8+i] = b } return r } // Helper to merge a 12-byte vendor ID back to registers. // // Used by static_amd64.go. func regsFromVendorID(r [12]byte) (bx, cx, dx uint32) { bx |= uint32(r[0]) bx |= uint32(r[1]) << 8 bx |= uint32(r[2]) << 16 bx |= uint32(r[3]) << 24 cx |= uint32(r[4]) cx |= uint32(r[5]) << 8 cx |= uint32(r[6]) << 16 cx |= uint32(r[7]) << 24 dx |= uint32(r[8]) dx |= uint32(r[9]) << 8 dx |= uint32(r[10]) << 16 dx |= uint32(r[10]) << 24 return } // VendorID is the 12-char string returned in ebx:edx:ecx for eax=0. // //go:nosplit func (fs FeatureSet) VendorID() [12]byte { _, bx, cx, dx := fs.query(vendorID) return vendorIDFromRegs(bx, cx, dx) } // Helper to deconstruct signature dword. // //go:nosplit func signatureSplit(v uint32) (ef, em, pt, f, m, sid uint8) { sid = uint8(v & 0xf) m = uint8(v>>4) & 0xf f = uint8(v>>8) & 0xf pt = uint8(v>>12) & 0x3 em = uint8(v>>16) & 0xf ef = uint8(v >> 20) return } // ExtendedFamily is part of the processor signature. // //go:nosplit func (fs FeatureSet) ExtendedFamily() uint8 { ax, _, _, _ := fs.query(featureInfo) ef, _, _, _, _, _ := signatureSplit(ax) return ef } // ExtendedModel is part of the processor signature. // //go:nosplit func (fs FeatureSet) ExtendedModel() uint8 { ax, _, _, _ := fs.query(featureInfo) _, em, _, _, _, _ := signatureSplit(ax) return em } // ProcessorType is part of the processor signature. // //go:nosplit func (fs FeatureSet) ProcessorType() uint8 { ax, _, _, _ := fs.query(featureInfo) _, _, pt, _, _, _ := signatureSplit(ax) return pt } // Family is part of the processor signature. // //go:nosplit func (fs FeatureSet) Family() uint8 { ax, _, _, _ := fs.query(featureInfo) _, _, _, f, _, _ := signatureSplit(ax) return f } // Model is part of the processor signature. // //go:nosplit func (fs FeatureSet) Model() uint8 { ax, _, _, _ := fs.query(featureInfo) _, _, _, _, m, _ := signatureSplit(ax) return m } // SteppingID is part of the processor signature. // //go:nosplit func (fs FeatureSet) SteppingID() uint8 { ax, _, _, _ := fs.query(featureInfo) _, _, _, _, _, sid := signatureSplit(ax) return sid } // VirtualAddressBits returns the number of bits available for virtual // addresses. 
// //go:nosplit func (fs FeatureSet) VirtualAddressBits() uint32 { ax, _, _, _ := fs.query(addressSizes) return (ax >> 8) & 0xff } // PhysicalAddressBits returns the number of bits available for physical // addresses. // //go:nosplit func (fs FeatureSet) PhysicalAddressBits() uint32 { ax, _, _, _ := fs.query(addressSizes) return ax & 0xff } // CacheType describes the type of a cache, as returned in eax[4:0] for eax=4. type CacheType uint8 const ( // cacheNull indicates that there are no more entries. cacheNull CacheType = iota // CacheData is a data cache. CacheData // CacheInstruction is an instruction cache. CacheInstruction // CacheUnified is a unified instruction and data cache. CacheUnified ) // Cache describes the parameters of a single cache on the system. // // This is returned by the Caches method on FeatureSet. type Cache struct { // Level is the hierarchical level of this cache (L1, L2, etc). Level uint32 // Type is the type of cache. Type CacheType // FullyAssociative indicates that entries may be placed in any block. FullyAssociative bool // Partitions is the number of physical partitions in the cache. Partitions uint32 // Ways is the number of ways of associativity in the cache. Ways uint32 // Sets is the number of sets in the cache. Sets uint32 // InvalidateHierarchical indicates that WBINVD/INVD from threads // sharing this cache acts upon lower level caches for threads sharing // this cache. InvalidateHierarchical bool // Inclusive indicates that this cache is inclusive of lower cache // levels. Inclusive bool // DirectMapped indicates that this cache is directly mapped from // address, rather than using a hash function. DirectMapped bool } // Caches describes the caches on the CPU. // // Only supported on Intel; requires allocation. func (fs FeatureSet) Caches() (caches []Cache) { if !fs.Intel() { return } // Check against the cache line, which should be consistent. cacheLine := fs.CacheLine() for i := uint32(0); ; i++ { out := fs.Query(In{ Eax: uint32(intelDeterministicCacheParams), Ecx: i, }) t := CacheType(out.Eax & 0xf) if t == cacheNull { break } lineSize := (out.Ebx & 0xfff) + 1 if lineSize != cacheLine { panic(fmt.Sprintf("Mismatched cache line size: %d vs %d", lineSize, cacheLine)) } caches = append(caches, Cache{ Type: t, Level: (out.Eax >> 5) & 0x7, FullyAssociative: ((out.Eax >> 9) & 1) == 1, Partitions: ((out.Ebx >> 12) & 0x3ff) + 1, Ways: ((out.Ebx >> 22) & 0x3ff) + 1, Sets: out.Ecx + 1, InvalidateHierarchical: (out.Edx & 1) == 0, Inclusive: ((out.Edx >> 1) & 1) == 1, DirectMapped: ((out.Edx >> 2) & 1) == 0, }) } return } // CacheLine is the size of a cache line in bytes. // // All caches use the same line size. This is not enforced in the CPUID // encoding, but is true on all known x86 processors. // //go:nosplit func (fs FeatureSet) CacheLine() uint32 { _, bx, _, _ := fs.query(featureInfo) return 8 * (bx >> 8) & 0xff } // HasFeature tests whether or not a feature is in the given feature set. // // This function is safe to call from a nosplit context, as long as the // FeatureSet does not have any masked features. // //go:nosplit func (fs FeatureSet) HasFeature(feature Feature) bool { return feature.check(fs) } // WriteCPUInfoTo is to generate a section of one cpu in /proc/cpuinfo. This is // a minimal /proc/cpuinfo, it is missing some fields like "microcode" that are // not always printed in Linux. The bogomips field is simply made up. 
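//
// An illustrative usage sketch (an assumption, not from the source):
// generating the cpuinfo text for a made-up four-CPU configuration into a
// strings.Builder.
//
//	var b strings.Builder
//	for cpu := uint(0); cpu < 4; cpu++ {
//		fs.WriteCPUInfoTo(cpu, 4, &b)
//	}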
func (fs FeatureSet) WriteCPUInfoTo(cpu, numCPU uint, w io.Writer) { // Avoid many redundant calls here, since this can occasionally appear // in the hot path. Read all basic information up front, see above. ax, _, _, _ := fs.query(featureInfo) ef, em, _, f, m, _ := signatureSplit(ax) vendor := fs.VendorID() fmt.Fprintf(w, "processor\t: %d\n", cpu) fmt.Fprintf(w, "vendor_id\t: %s\n", string(vendor[:])) fmt.Fprintf(w, "cpu family\t: %d\n", ((ef<<4)&0xff)|f) fmt.Fprintf(w, "model\t\t: %d\n", ((em<<4)&0xff)|m) fmt.Fprintf(w, "model name\t: %s\n", "unknown") // Unknown for now. fmt.Fprintf(w, "stepping\t: %s\n", "unknown") // Unknown for now. fmt.Fprintf(w, "cpu MHz\t\t: %.3f\n", cpuFreqMHz) fmt.Fprintf(w, "physical id\t: 0\n") // Pretend all CPUs are in the same socket. fmt.Fprintf(w, "siblings\t: %d\n", numCPU) fmt.Fprintf(w, "core id\t\t: %d\n", cpu) fmt.Fprintf(w, "cpu cores\t: %d\n", numCPU) // Pretend each CPU is a distinct core (rather than a hyperthread). fmt.Fprintf(w, "apicid\t\t: %d\n", cpu) fmt.Fprintf(w, "initial apicid\t: %d\n", cpu) fmt.Fprintf(w, "fpu\t\t: yes\n") fmt.Fprintf(w, "fpu_exception\t: yes\n") fmt.Fprintf(w, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID. fmt.Fprintf(w, "wp\t\t: yes\n") fmt.Fprintf(w, "flags\t\t: %s\n", fs.FlagString()) fmt.Fprintf(w, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway. fmt.Fprintf(w, "clflush size\t: %d\n", fs.CacheLine()) fmt.Fprintf(w, "cache_alignment\t: %d\n", fs.CacheLine()) fmt.Fprintf(w, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48) fmt.Fprintf(w, "power management:\n") // This is always here, but can be blank. fmt.Fprintf(w, "\n") // The /proc/cpuinfo file ends with an extra newline. } var ( authenticAMD = [12]byte{'A', 'u', 't', 'h', 'e', 'n', 't', 'i', 'c', 'A', 'M', 'D'} genuineIntel = [12]byte{'G', 'e', 'n', 'u', 'i', 'n', 'e', 'I', 'n', 't', 'e', 'l'} ) // AMD returns true if fs describes an AMD CPU. // //go:nosplit func (fs FeatureSet) AMD() bool { return fs.VendorID() == authenticAMD } // Intel returns true if fs describes an Intel CPU. // //go:nosplit func (fs FeatureSet) Intel() bool { return fs.VendorID() == genuineIntel } // Leaf 0 of xsaveinfo function returns the size for currently // enabled xsave features in ebx, the maximum size if all valid // features are saved with xsave in ecx, and valid XCR0 bits in // edx:eax. // // If xSaveInfo isn't supported, cpuid will not fault but will // return bogus values. var ( xsaveSize = native(In{Eax: uint32(xSaveInfo)}).Ebx maxXsaveSize = native(In{Eax: uint32(xSaveInfo)}).Ecx amxTileCfgSize = native(In{Eax: uint32(xSaveInfo), Ecx: 17}).Eax amxTileDataSize = native(In{Eax: uint32(xSaveInfo), Ecx: 18}).Eax ) const ( // XCR0AMXMask are the bits that enable xsave to operate on AMX TILECFG // and TILEDATA. // // Note: TILECFG and TILEDATA are always either both enabled or both // disabled. // // See Intel® 64 and IA-32 Architectures Software Developer’s Manual Vol.1 // section 13.3 for details. XCR0AMXMask = uint64((1 << 17) | (1 << 18)) ) // ExtendedStateSize returns the number of bytes needed to save the "extended // state" for the enabled features and the boundary it must be aligned to. // Extended state includes floating point registers, and other cpu state that's // not associated with the normal task context. // // Note: the return value matches the size of signal FP state frames. // Look at check_xstate_in_sigframe() in the kernel sources for more details. 
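//
// Added commentary: a caller allocating a floating point state area might
// use the result as sketched below; allocAligned is a hypothetical helper,
// not part of this package.
//
//	size, align := fs.ExtendedStateSize()
//	buf := allocAligned(size, align) // e.g. 512 bytes, 16-byte aligned, without XSAVE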
// //go:nosplit func (fs FeatureSet) ExtendedStateSize() (size, align uint) { if fs.UseXsave() { return uint(xsaveSize), 64 } // If we don't support xsave, we fall back to fxsave, which requires // 512 bytes aligned to 16 bytes. return 512, 16 } // AMXExtendedStateSize returns the number of bytes within the "extended state" // area that is used for AMX. func (fs FeatureSet) AMXExtendedStateSize() uint { if fs.UseXsave() { xcr0 := xgetbv(0) if (xcr0 & XCR0AMXMask) != 0 { return uint(amxTileCfgSize + amxTileDataSize) } } return 0 } // ValidXCR0Mask returns the valid bits in control register XCR0. // // Always exclude AMX bits, because we do not support it. // TODO(gvisor.dev/issues/9896): Implement AMX Support. // //go:nosplit func (fs FeatureSet) ValidXCR0Mask() uint64 { if !fs.HasFeature(X86FeatureXSAVE) { return 0 } ax, _, _, dx := fs.query(xSaveInfo) return (uint64(dx)<<32 | uint64(ax)) &^ XCR0AMXMask } // UseXsave returns the choice of fp state saving instruction. // //go:nosplit func (fs FeatureSet) UseXsave() bool { return fs.HasFeature(X86FeatureXSAVE) && fs.HasFeature(X86FeatureOSXSAVE) } // UseXsaveopt returns true if 'fs' supports the "xsaveopt" instruction. // //go:nosplit func (fs FeatureSet) UseXsaveopt() bool { return fs.UseXsave() && fs.HasFeature(X86FeatureXSAVEOPT) } // UseXsavec returns true if 'fs' supports the "xsavec" instruction. // //go:nosplit func (fs FeatureSet) UseXsavec() bool { return fs.UseXsaveopt() && fs.HasFeature(X86FeatureXSAVEC) } // UseFSGSBASE returns true if 'fs' supports the (RD|WR)(FS|GS)BASE instructions. func (fs FeatureSet) UseFSGSBASE() bool { HWCAP2_FSGSBASE := uint64(1) << 1 return fs.HasFeature(X86FeatureFSGSBase) && ((fs.hwCap.hwCap2 & HWCAP2_FSGSBASE) != 0) } // archCheckHostCompatible checks for compatibility. func (fs FeatureSet) archCheckHostCompatible(hfs FeatureSet) error { // The size of a cache line must match, as it is critical to correctly // utilizing CLFLUSH. Other cache properties are allowed to change, as // they are not important to correctness. fsCache := fs.CacheLine() hostCache := hfs.CacheLine() if fsCache != hostCache { return &ErrIncompatible{ reason: fmt.Sprintf("CPU cache line size %d incompatible with host cache line size %d", fsCache, hostCache), } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/cpuid/cpuid_amd64_state_autogen.go000066400000000000000000000044141465435605700261030ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build amd64 && amd64 && amd64 && amd64 // +build amd64,amd64,amd64,amd64 package cpuid import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (fs *FeatureSet) StateTypeName() string { return "pkg/cpuid.FeatureSet" } func (fs *FeatureSet) StateFields() []string { return []string{ "Function", "hwCap", } } func (fs *FeatureSet) beforeSave() {} // +checklocksignore func (fs *FeatureSet) StateSave(stateSinkObject state.Sink) { fs.beforeSave() var FunctionValue Static FunctionValue = fs.saveFunction() stateSinkObject.SaveValue(0, FunctionValue) stateSinkObject.Save(1, &fs.hwCap) } func (fs *FeatureSet) afterLoad(context.Context) {} // +checklocksignore func (fs *FeatureSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(1, &fs.hwCap) stateSourceObject.LoadValue(0, new(Static), func(y any) { fs.loadFunction(ctx, y.(Static)) }) } func (i *In) StateTypeName() string { return "pkg/cpuid.In" } func (i *In) StateFields() []string { return []string{ "Eax", "Ecx", } } func (i *In) beforeSave() {} // +checklocksignore func (i *In) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Eax) stateSinkObject.Save(1, &i.Ecx) } func (i *In) afterLoad(context.Context) {} // +checklocksignore func (i *In) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Eax) stateSourceObject.Load(1, &i.Ecx) } func (o *Out) StateTypeName() string { return "pkg/cpuid.Out" } func (o *Out) StateFields() []string { return []string{ "Eax", "Ebx", "Ecx", "Edx", } } func (o *Out) beforeSave() {} // +checklocksignore func (o *Out) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.Eax) stateSinkObject.Save(1, &o.Ebx) stateSinkObject.Save(2, &o.Ecx) stateSinkObject.Save(3, &o.Edx) } func (o *Out) afterLoad(context.Context) {} // +checklocksignore func (o *Out) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.Eax) stateSourceObject.Load(1, &o.Ebx) stateSourceObject.Load(2, &o.Ecx) stateSourceObject.Load(3, &o.Edx) } func init() { state.Register((*FeatureSet)(nil)) state.Register((*In)(nil)) state.Register((*Out)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/cpuid/cpuid_arm64.go000066400000000000000000000070661465435605700232050ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package cpuid import ( "fmt" "io" ) // FeatureSet for ARM64 is defined as a static set of bits. // // ARM64 doesn't have a CPUID equivalent, which means it has no architected // discovery mechanism for hardware features available to userspace code at // EL0. The kernel exposes the presence of these features to userspace through // a set of flags(HWCAP/HWCAP2) bits, exposed in the auxiliary vector. See // Documentation/arm64/elf_hwcaps.rst for more info. // // Currently, only the HWCAP bits are supported. 
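//
// Illustrative note (added commentary): since arm64 feature numbers follow
// the ELF HWCAP bit positions, a caller checks for a capability exactly as
// on x86; the feature constant below is an arbitrary example.
//
//	cpuid.Initialize()
//	if cpuid.HostFeatureSet().HasFeature(cpuid.ARM64FeatureATOMICS) {
//		// LSE atomic instructions (LDADD, LDCLR, ...) are available.
//	}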
// // +stateify savable type FeatureSet struct { hwCap hwCap cpuFreqMHz float64 cpuImplHex uint64 cpuArchDec uint64 cpuVarHex uint64 cpuPartHex uint64 cpuRevDec uint64 } // CPUImplementer is part of the processor signature. func (fs FeatureSet) CPUImplementer() uint8 { return uint8(fs.cpuImplHex) } // CPUArchitecture is part of the processor signature. func (fs FeatureSet) CPUArchitecture() uint8 { return uint8(fs.cpuArchDec) } // CPUVariant is part of the processor signature. func (fs FeatureSet) CPUVariant() uint8 { return uint8(fs.cpuVarHex) } // CPUPartnum is part of the processor signature. func (fs FeatureSet) CPUPartnum() uint16 { return uint16(fs.cpuPartHex) } // CPURevision is part of the processor signature. func (fs FeatureSet) CPURevision() uint8 { return uint8(fs.cpuRevDec) } // ExtendedStateSize returns the number of bytes needed to save the "extended // state" for this processor and the boundary it must be aligned to. Extended // state includes floating point(NEON) registers, and other cpu state that's not // associated with the normal task context. func (fs FeatureSet) ExtendedStateSize() (size, align uint) { // ARMv8 provide 32x128bits NEON registers. // // Ref arch/arm64/include/uapi/asm/ptrace.h // struct user_fpsimd_state { // __uint128_t vregs[32]; // __u32 fpsr; // __u32 fpcr; // __u32 __reserved[2]; // }; return 528, 16 } // HasFeature checks for the presence of a feature. func (fs FeatureSet) HasFeature(feature Feature) bool { return fs.hwCap.hwCap1&(1<= uint32(extendedFeatures) { _, _, cx, dx := fs.query(extendedFeatures) if f.block() == 5 { return (cx & f.bit()) != 0 } // Ignore features duplicated from block 1 on AMD. // These bits are reserved on Intel. return ((dx &^ block6DuplicateMask) & f.bit()) != 0 } return false case 7: _, _, _, dx := fs.query(extendedFeatureInfo) return (dx & f.bit()) != 0 default: return false } } // Block 0 constants are all of the "basic" feature bits returned by a cpuid in // ecx with eax=1. const ( X86FeatureSSE3 Feature = iota X86FeaturePCLMULDQ X86FeatureDTES64 X86FeatureMONITOR X86FeatureDSCPL X86FeatureVMX X86FeatureSMX X86FeatureEST X86FeatureTM2 X86FeatureSSSE3 // Not a typo, "supplemental" SSE3. X86FeatureCNXTID X86FeatureSDBG X86FeatureFMA X86FeatureCX16 X86FeatureXTPR X86FeaturePDCM _ // ecx bit 16 is reserved. X86FeaturePCID X86FeatureDCA X86FeatureSSE4_1 X86FeatureSSE4_2 X86FeatureX2APIC X86FeatureMOVBE X86FeaturePOPCNT X86FeatureTSCD X86FeatureAES X86FeatureXSAVE X86FeatureOSXSAVE X86FeatureAVX X86FeatureF16C X86FeatureRDRAND X86FeatureHypervisor ) // Block 1 constants are all of the "basic" feature bits returned by a cpuid in // edx with eax=1. const ( X86FeatureFPU Feature = 32 + iota X86FeatureVME X86FeatureDE X86FeaturePSE X86FeatureTSC X86FeatureMSR X86FeaturePAE X86FeatureMCE X86FeatureCX8 X86FeatureAPIC _ // edx bit 10 is reserved. X86FeatureSEP X86FeatureMTRR X86FeaturePGE X86FeatureMCA X86FeatureCMOV X86FeaturePAT X86FeaturePSE36 X86FeaturePSN X86FeatureCLFSH _ // edx bit 20 is reserved. X86FeatureDS X86FeatureACPI X86FeatureMMX X86FeatureFXSR X86FeatureSSE X86FeatureSSE2 X86FeatureSS X86FeatureHTT X86FeatureTM X86FeatureIA64 X86FeaturePBE ) // Block 2 bits are the "structured extended" features returned in ebx for // eax=7, ecx=0. const ( X86FeatureFSGSBase Feature = 2*32 + iota X86FeatureTSC_ADJUST _ // ebx bit 2 is reserved. 
X86FeatureBMI1 X86FeatureHLE X86FeatureAVX2 X86FeatureFDP_EXCPTN_ONLY X86FeatureSMEP X86FeatureBMI2 X86FeatureERMS X86FeatureINVPCID X86FeatureRTM X86FeatureCQM X86FeatureFPCSDS X86FeatureMPX X86FeatureRDT X86FeatureAVX512F X86FeatureAVX512DQ X86FeatureRDSEED X86FeatureADX X86FeatureSMAP X86FeatureAVX512IFMA X86FeaturePCOMMIT X86FeatureCLFLUSHOPT X86FeatureCLWB X86FeatureIPT // Intel processor trace. X86FeatureAVX512PF X86FeatureAVX512ER X86FeatureAVX512CD X86FeatureSHA X86FeatureAVX512BW X86FeatureAVX512VL ) // Block 3 bits are the "extended" features returned in ecx for eax=7, ecx=0. const ( X86FeaturePREFETCHWT1 Feature = 3*32 + iota X86FeatureAVX512VBMI X86FeatureUMIP X86FeaturePKU X86FeatureOSPKE X86FeatureWAITPKG X86FeatureAVX512_VBMI2 X86FeatureCET_SS X86FeatureGFNI X86FeatureVAES X86FeatureVPCLMULQDQ X86FeatureAVX512_VNNI X86FeatureAVX512_BITALG X86FeatureTME X86FeatureAVX512_VPOPCNTDQ _ // ecx bit 15 is reserved X86FeatureLA57 // ecx bits 17-21 are reserved _ _ _ _ _ X86FeatureRDPID // ecx bits 23-24 are reserved _ _ X86FeatureCLDEMOTE _ // ecx bit 26 is reserved X86FeatureMOVDIRI X86FeatureMOVDIR64B ) // Block 4 constants are for xsave capabilities in CPUID.(EAX=0DH,ECX=01H):EAX. // The CPUID leaf is available only if 'X86FeatureXSAVE' is present. const ( X86FeatureXSAVEOPT Feature = 4*32 + iota X86FeatureXSAVEC X86FeatureXGETBV1 X86FeatureXSAVES // EAX[31:4] are reserved. ) // Block 5 constants are the extended feature bits in // CPUID.(EAX=0x80000001):ECX. const ( X86FeatureLAHF64 Feature = 5*32 + iota X86FeatureCMP_LEGACY X86FeatureSVM X86FeatureEXTAPIC X86FeatureCR8_LEGACY X86FeatureLZCNT X86FeatureSSE4A X86FeatureMISALIGNSSE X86FeaturePREFETCHW X86FeatureOSVW X86FeatureIBS X86FeatureXOP X86FeatureSKINIT X86FeatureWDT _ // ecx bit 14 is reserved. X86FeatureLWP X86FeatureFMA4 X86FeatureTCE _ // ecx bit 18 is reserved. _ // ecx bit 19 is reserved. _ // ecx bit 20 is reserved. X86FeatureTBM X86FeatureTOPOLOGY X86FeaturePERFCTR_CORE X86FeaturePERFCTR_NB _ // ecx bit 25 is reserved. X86FeatureBPEXT X86FeaturePERFCTR_TSC X86FeaturePERFCTR_LLC X86FeatureMWAITX X86FeatureADMSKEXTN _ // ecx bit 31 is reserved. ) // Block 6 constants are the extended feature bits in // CPUID.(EAX=0x80000001):EDX. // // These are sparse, and so the bit positions are assigned manually. const ( // On AMD, EDX[24:23] | EDX[17:12] | EDX[9:0] are duplicate features // also defined in block 1 (in identical bit positions). Those features // are not listed here. block6DuplicateMask = 0x183f3ff X86FeatureSYSCALL Feature = 6*32 + 11 X86FeatureNX Feature = 6*32 + 20 X86FeatureMMXEXT Feature = 6*32 + 22 X86FeatureFXSR_OPT Feature = 6*32 + 25 X86FeatureGBPAGES Feature = 6*32 + 26 X86FeatureRDTSCP Feature = 6*32 + 27 X86FeatureLM Feature = 6*32 + 29 X86Feature3DNOWEXT Feature = 6*32 + 30 X86Feature3DNOW Feature = 6*32 + 31 ) // Block 7 constants are the extended features bits in // CPUID.(EAX=07H,ECX=0):EDX. const ( _ Feature = 7*32 + iota // edx bit 0 is reserved. _ // edx bit 1 is reserved. X86FeatureAVX512_4VNNIW X86FeatureAVX512_4FMAPS X86FeatureFSRM _ // edx bit 5 is not used in Linux. _ // edx bit 6 is reserved. _ // edx bit 7 is reserved. X86FeatureAVX512_VP2INTERSECT X86FeatureSRBDS_CTRL X86FeatureMD_CLEAR X86FeatureRTM_ALWAYS_ABORT _ // edx bit 12 is reserved. X86FeatureTSX_FORCE_ABORT X86FeatureSERIALIZE X86FeatureHYBRID_CPU X86FeatureTSXLDTRK _ // edx bit 17 is reserved. X86FeaturePCONFIG X86FeatureARCH_LBR X86FeatureIBT _ // edx bit 21 is reserved. 
X86FeatureAMX_BF16 X86FeatureAVX512_FP16 X86FeatureAMX_TILE X86FeatureAMX_INT8 X86FeatureSPEC_CTRL X86FeatureINTEL_STIBP X86FeatureFLUSH_L1D X86FeatureARCH_CAPABILITIES X86FeatureCORE_CAPABILITIES X86FeatureSPEC_CTRL_SSBD ) // These are the extended floating point state features. They are used to // enumerate floating point features in XCR0, XSTATE_BV, etc. const ( XSAVEFeatureX87 = 1 << 0 XSAVEFeatureSSE = 1 << 1 XSAVEFeatureAVX = 1 << 2 XSAVEFeatureBNDREGS = 1 << 3 XSAVEFeatureBNDCSR = 1 << 4 XSAVEFeatureAVX512op = 1 << 5 XSAVEFeatureAVX512zmm0 = 1 << 6 XSAVEFeatureAVX512zmm16 = 1 << 7 XSAVEFeaturePKRU = 1 << 9 ) // allFeatures is the set of allFeatures. // // These match names used in arch/x86/kernel/cpu/capflags.c. var allFeatures = map[Feature]allFeatureInfo{ // Block 0. X86FeatureSSE3: {"pni", true}, X86FeaturePCLMULDQ: {"pclmulqdq", true}, X86FeatureDTES64: {"dtes64", true}, X86FeatureMONITOR: {"monitor", true}, X86FeatureDSCPL: {"ds_cpl", true}, X86FeatureVMX: {"vmx", true}, X86FeatureSMX: {"smx", true}, X86FeatureEST: {"est", true}, X86FeatureTM2: {"tm2", true}, X86FeatureSSSE3: {"ssse3", true}, X86FeatureCNXTID: {"cid", true}, X86FeatureSDBG: {"sdbg", true}, X86FeatureFMA: {"fma", true}, X86FeatureCX16: {"cx16", true}, X86FeatureXTPR: {"xtpr", true}, X86FeaturePDCM: {"pdcm", true}, X86FeaturePCID: {"pcid", true}, X86FeatureDCA: {"dca", true}, X86FeatureSSE4_1: {"sse4_1", true}, X86FeatureSSE4_2: {"sse4_2", true}, X86FeatureX2APIC: {"x2apic", true}, X86FeatureMOVBE: {"movbe", true}, X86FeaturePOPCNT: {"popcnt", true}, X86FeatureTSCD: {"tsc_deadline_timer", true}, X86FeatureAES: {"aes", true}, X86FeatureXSAVE: {"xsave", true}, X86FeatureAVX: {"avx", true}, X86FeatureF16C: {"f16c", true}, X86FeatureRDRAND: {"rdrand", true}, X86FeatureHypervisor: {"hypervisor", true}, X86FeatureOSXSAVE: {"osxsave", false}, // Block 1. X86FeatureFPU: {"fpu", true}, X86FeatureVME: {"vme", true}, X86FeatureDE: {"de", true}, X86FeaturePSE: {"pse", true}, X86FeatureTSC: {"tsc", true}, X86FeatureMSR: {"msr", true}, X86FeaturePAE: {"pae", true}, X86FeatureMCE: {"mce", true}, X86FeatureCX8: {"cx8", true}, X86FeatureAPIC: {"apic", true}, X86FeatureSEP: {"sep", true}, X86FeatureMTRR: {"mtrr", true}, X86FeaturePGE: {"pge", true}, X86FeatureMCA: {"mca", true}, X86FeatureCMOV: {"cmov", true}, X86FeaturePAT: {"pat", true}, X86FeaturePSE36: {"pse36", true}, X86FeaturePSN: {"pn", true}, X86FeatureCLFSH: {"clflush", true}, X86FeatureDS: {"dts", true}, X86FeatureACPI: {"acpi", true}, X86FeatureMMX: {"mmx", true}, X86FeatureFXSR: {"fxsr", true}, X86FeatureSSE: {"sse", true}, X86FeatureSSE2: {"sse2", true}, X86FeatureSS: {"ss", true}, X86FeatureHTT: {"ht", true}, X86FeatureTM: {"tm", true}, X86FeatureIA64: {"ia64", true}, X86FeaturePBE: {"pbe", true}, // Block 2. 
X86FeatureFSGSBase: {"fsgsbase", true}, X86FeatureTSC_ADJUST: {"tsc_adjust", true}, X86FeatureBMI1: {"bmi1", true}, X86FeatureHLE: {"hle", true}, X86FeatureAVX2: {"avx2", true}, X86FeatureSMEP: {"smep", true}, X86FeatureBMI2: {"bmi2", true}, X86FeatureERMS: {"erms", true}, X86FeatureINVPCID: {"invpcid", true}, X86FeatureRTM: {"rtm", true}, X86FeatureCQM: {"cqm", true}, X86FeatureMPX: {"mpx", true}, X86FeatureRDT: {"rdt_a", true}, X86FeatureAVX512F: {"avx512f", true}, X86FeatureAVX512DQ: {"avx512dq", true}, X86FeatureRDSEED: {"rdseed", true}, X86FeatureADX: {"adx", true}, X86FeatureSMAP: {"smap", true}, X86FeatureCLWB: {"clwb", true}, X86FeatureAVX512PF: {"avx512pf", true}, X86FeatureAVX512ER: {"avx512er", true}, X86FeatureAVX512CD: {"avx512cd", true}, X86FeatureSHA: {"sha_ni", true}, X86FeatureAVX512BW: {"avx512bw", true}, X86FeatureAVX512VL: {"avx512vl", true}, X86FeatureFDP_EXCPTN_ONLY: {"fdp_excptn_only", false}, X86FeatureFPCSDS: {"fpcsds", false}, X86FeatureIPT: {"ipt", false}, X86FeatureCLFLUSHOPT: {"clfushopt", false}, // Block 3. X86FeatureAVX512VBMI: {"avx512vbmi", true}, X86FeatureUMIP: {"umip", true}, X86FeaturePKU: {"pku", true}, X86FeatureOSPKE: {"ospke", true}, X86FeatureWAITPKG: {"waitpkg", true}, X86FeatureAVX512_VBMI2: {"avx512_vbmi2", true}, X86FeatureGFNI: {"gfni", true}, X86FeatureCET_SS: {"cet_ss", false}, X86FeatureVAES: {"vaes", true}, X86FeatureVPCLMULQDQ: {"vpclmulqdq", true}, X86FeatureAVX512_VNNI: {"avx512_vnni", true}, X86FeatureAVX512_BITALG: {"avx512_bitalg", true}, X86FeatureTME: {"tme", true}, X86FeatureAVX512_VPOPCNTDQ: {"avx512_vpopcntdq", true}, X86FeatureLA57: {"la57", true}, X86FeatureRDPID: {"rdpid", true}, X86FeatureCLDEMOTE: {"cldemote", true}, X86FeatureMOVDIRI: {"movdiri", true}, X86FeatureMOVDIR64B: {"movdir64b", true}, X86FeaturePREFETCHWT1: {"prefetchwt1", false}, // Block 4. X86FeatureXSAVEOPT: {"xsaveopt", true}, X86FeatureXSAVEC: {"xsavec", true}, X86FeatureXGETBV1: {"xgetbv1", true}, X86FeatureXSAVES: {"xsaves", true}, // Block 5. X86FeatureLAHF64: {"lahf_lm", true}, // LAHF/SAHF in long mode. X86FeatureCMP_LEGACY: {"cmp_legacy", true}, X86FeatureSVM: {"svm", true}, X86FeatureEXTAPIC: {"extapic", true}, X86FeatureCR8_LEGACY: {"cr8_legacy", true}, X86FeatureLZCNT: {"abm", true}, // Advanced bit manipulation. X86FeatureSSE4A: {"sse4a", true}, X86FeatureMISALIGNSSE: {"misalignsse", true}, X86FeaturePREFETCHW: {"3dnowprefetch", true}, X86FeatureOSVW: {"osvw", true}, X86FeatureIBS: {"ibs", true}, X86FeatureXOP: {"xop", true}, X86FeatureSKINIT: {"skinit", true}, X86FeatureWDT: {"wdt", true}, X86FeatureLWP: {"lwp", true}, X86FeatureFMA4: {"fma4", true}, X86FeatureTCE: {"tce", true}, X86FeatureTBM: {"tbm", true}, X86FeatureTOPOLOGY: {"topoext", true}, X86FeaturePERFCTR_CORE: {"perfctr_core", true}, X86FeaturePERFCTR_NB: {"perfctr_nb", true}, X86FeatureBPEXT: {"bpext", true}, X86FeaturePERFCTR_TSC: {"ptsc", true}, X86FeaturePERFCTR_LLC: {"perfctr_llc", true}, X86FeatureMWAITX: {"mwaitx", true}, X86FeatureADMSKEXTN: {"ad_mask_extn", false}, // Block 6. X86FeatureSYSCALL: {"syscall", true}, X86FeatureNX: {"nx", true}, X86FeatureMMXEXT: {"mmxext", true}, X86FeatureFXSR_OPT: {"fxsr_opt", true}, X86FeatureGBPAGES: {"pdpe1gb", true}, X86FeatureRDTSCP: {"rdtscp", true}, X86FeatureLM: {"lm", true}, X86Feature3DNOWEXT: {"3dnowext", true}, X86Feature3DNOW: {"3dnow", true}, // Block 7. 
X86FeatureAVX512_4VNNIW: {"avx512_4vnniw", true}, X86FeatureAVX512_4FMAPS: {"avx512_4fmaps", true}, X86FeatureFSRM: {"fsrm", true}, X86FeatureAVX512_VP2INTERSECT: {"avx512_vp2intersect", true}, X86FeatureSRBDS_CTRL: {"srbds_ctrl", false}, X86FeatureMD_CLEAR: {"md_clear", true}, X86FeatureRTM_ALWAYS_ABORT: {"rtm_always_abort", false}, X86FeatureTSX_FORCE_ABORT: {"tsx_force_abort", false}, X86FeatureSERIALIZE: {"serialize", true}, X86FeatureHYBRID_CPU: {"hybrid_cpu", false}, X86FeatureTSXLDTRK: {"tsxldtrk", true}, X86FeaturePCONFIG: {"pconfig", true}, X86FeatureARCH_LBR: {"arch_lbr", true}, X86FeatureIBT: {"ibt", true}, X86FeatureAMX_BF16: {"amx_bf16", true}, X86FeatureAVX512_FP16: {"avx512_fp16", true}, X86FeatureAMX_TILE: {"amx_tile", true}, X86FeatureAMX_INT8: {"amx_int8", true}, X86FeatureSPEC_CTRL: {"spec_ctrl", false}, X86FeatureINTEL_STIBP: {"intel_stibp", false}, X86FeatureFLUSH_L1D: {"flush_l1d", true}, X86FeatureARCH_CAPABILITIES: {"arch_capabilities", true}, X86FeatureCORE_CAPABILITIES: {"core_capabilities", false}, X86FeatureSPEC_CTRL_SSBD: {"spec_ctrl_ssbd", false}, } // linuxBlockOrder defines the order in which linux organizes the feature // blocks. Linux also tracks feature bits in 32-bit blocks, but in an order // which doesn't match well here, so for the /proc/cpuinfo generation we simply // re-map the blocks to Linux's ordering and then go through the bits in each // block. var linuxBlockOrder = []block{1, 6, 0, 5, 2, 4, 3, 7} func archFlagOrder(fn func(Feature)) { for _, b := range linuxBlockOrder { for i := 0; i < blockSize; i++ { f := featureID(b, i) if _, ok := allFeatures[f]; ok { fn(f) } } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/cpuid/features_arm64.go000066400000000000000000000105221465435605700237060ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package cpuid const ( // ARM64FeatureFP indicates support for single and double precision // float point types. ARM64FeatureFP Feature = iota // ARM64FeatureASIMD indicates support for Advanced SIMD with single // and double precision float point arithmetic. ARM64FeatureASIMD // ARM64FeatureEVTSTRM indicates support for the generic timer // configured to generate events at a frequency of approximately // 100KHz. ARM64FeatureEVTSTRM // ARM64FeatureAES indicates support for AES instructions // (AESE/AESD/AESMC/AESIMC). ARM64FeatureAES // ARM64FeaturePMULL indicates support for AES instructions // (PMULL/PMULL2). ARM64FeaturePMULL // ARM64FeatureSHA1 indicates support for SHA1 instructions // (SHA1C/SHA1P/SHA1M etc). ARM64FeatureSHA1 // ARM64FeatureSHA2 indicates support for SHA2 instructions // (SHA256H/SHA256H2/SHA256SU0 etc). ARM64FeatureSHA2 // ARM64FeatureCRC32 indicates support for CRC32 instructions // (CRC32B/CRC32H/CRC32W etc). ARM64FeatureCRC32 // ARM64FeatureATOMICS indicates support for atomic instructions // (LDADD/LDCLR/LDEOR/LDSET etc). 
ARM64FeatureATOMICS // ARM64FeatureFPHP indicates support for half precision float point // arithmetic. ARM64FeatureFPHP // ARM64FeatureASIMDHP indicates support for ASIMD with half precision // float point arithmetic. ARM64FeatureASIMDHP // ARM64FeatureCPUID indicates support for EL0 access to certain ID // registers is available. ARM64FeatureCPUID // ARM64FeatureASIMDRDM indicates support for SQRDMLAH and SQRDMLSH // instructions. ARM64FeatureASIMDRDM // ARM64FeatureJSCVT indicates support for the FJCVTZS instruction. ARM64FeatureJSCVT // ARM64FeatureFCMA indicates support for the FCMLA and FCADD // instructions. ARM64FeatureFCMA // ARM64FeatureLRCPC indicates support for the LDAPRB/LDAPRH/LDAPR // instructions. ARM64FeatureLRCPC // ARM64FeatureDCPOP indicates support for DC instruction (DC CVAP). ARM64FeatureDCPOP // ARM64FeatureSHA3 indicates support for SHA3 instructions // (EOR3/RAX1/XAR/BCAX). ARM64FeatureSHA3 // ARM64FeatureSM3 indicates support for SM3 instructions // (SM3SS1/SM3TT1A/SM3TT1B). ARM64FeatureSM3 // ARM64FeatureSM4 indicates support for SM4 instructions // (SM4E/SM4EKEY). ARM64FeatureSM4 // ARM64FeatureASIMDDP indicates support for dot product instructions // (UDOT/SDOT). ARM64FeatureASIMDDP // ARM64FeatureSHA512 indicates support for SHA2 instructions // (SHA512H/SHA512H2/SHA512SU0). ARM64FeatureSHA512 // ARM64FeatureSVE indicates support for Scalable Vector Extension. ARM64FeatureSVE // ARM64FeatureASIMDFHM indicates support for FMLAL and FMLSL // instructions. ARM64FeatureASIMDFHM ) var allFeatures = map[Feature]allFeatureInfo{ ARM64FeatureFP: {"fp", true}, ARM64FeatureASIMD: {"asimd", true}, ARM64FeatureEVTSTRM: {"evtstrm", true}, ARM64FeatureAES: {"aes", true}, ARM64FeaturePMULL: {"pmull", true}, ARM64FeatureSHA1: {"sha1", true}, ARM64FeatureSHA2: {"sha2", true}, ARM64FeatureCRC32: {"crc32", true}, ARM64FeatureATOMICS: {"atomics", true}, ARM64FeatureFPHP: {"fphp", true}, ARM64FeatureASIMDHP: {"asimdhp", true}, ARM64FeatureCPUID: {"cpuid", true}, ARM64FeatureASIMDRDM: {"asimdrdm", true}, ARM64FeatureJSCVT: {"jscvt", true}, ARM64FeatureFCMA: {"fcma", true}, ARM64FeatureLRCPC: {"lrcpc", true}, ARM64FeatureDCPOP: {"dcpop", true}, ARM64FeatureSHA3: {"sha3", true}, ARM64FeatureSM3: {"sm3", true}, ARM64FeatureSM4: {"sm4", true}, ARM64FeatureASIMDDP: {"asimddp", true}, ARM64FeatureSHA512: {"sha512", true}, ARM64FeatureSVE: {"sve", true}, ARM64FeatureASIMDFHM: {"asimdfhm", true}, } func archFlagOrder(fn func(Feature)) { for i := 0; i < len(allFeatures); i++ { fn(Feature(i)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/cpuid/native_amd64.go000066400000000000000000000175531465435605700233530ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package cpuid import ( "io/ioutil" "strconv" "strings" "gvisor.dev/gvisor/pkg/log" ) // cpuididFunction is a useful type wrapper. The format is eax | (ecx << 32). 
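// Added commentary: with this encoding a sub-leaf query is a single value,
// e.g. CPUID leaf 0xd, sub-leaf 1 becomes 0xd | (0x1 << 32) == 0x1_0000_000d
// (see xSaveInfoSub below), from which eax() recovers 0xd and ecx() recovers
// 0x1.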
type cpuidFunction uint64 func (f cpuidFunction) eax() uint32 { return uint32(f) } func (f cpuidFunction) ecx() uint32 { return uint32(f >> 32) } // The constants below are the lower or "standard" cpuid functions, ordered as // defined by the hardware. Note that these may not be included in the standard // set of functions that we are allowed to execute, which are filtered in the // Native.Query function defined below. const ( vendorID cpuidFunction = 0x0 // Returns vendor ID and largest standard function. featureInfo cpuidFunction = 0x1 // Returns basic feature bits and processor signature. intelCacheDescriptors cpuidFunction = 0x2 // Returns list of cache descriptors. Intel only. intelSerialNumber cpuidFunction = 0x3 // Returns processor serial number (obsolete on new hardware). Intel only. intelDeterministicCacheParams cpuidFunction = 0x4 // Returns deterministic cache information. Intel only. monitorMwaitParams cpuidFunction = 0x5 // Returns information about monitor/mwait instructions. powerParams cpuidFunction = 0x6 // Returns information about power management and thermal sensors. extendedFeatureInfo cpuidFunction = 0x7 // Returns extended feature bits. _ // Function 0x8 is reserved. intelDCAParams cpuidFunction = 0x9 // Returns direct cache access information. Intel only. intelPMCInfo cpuidFunction = 0xa // Returns information about performance monitoring features. Intel only. intelX2APICInfo cpuidFunction = 0xb // Returns core/logical processor topology. Intel only. _ // Function 0xc is reserved. xSaveInfo cpuidFunction = 0xd // Returns information about extended state management. xSaveInfoSub cpuidFunction = 0xd | (0x1 << 32) // Returns information about extended state management (Sub-leaf). ) const xSaveInfoNumLeaves = 64 // Maximum number of xSaveInfo leaves. // The "extended" functions. const ( extendedStart cpuidFunction = 0x80000000 extendedFunctionInfo cpuidFunction = extendedStart + 0 // Returns highest available extended function in eax. extendedFeatures = extendedStart + 1 // Returns some extended feature bits in edx and ecx. processorBrandString2 = extendedStart + 2 // Processor Name String Identifier. processorBrandString3 = extendedStart + 3 // Processor Name String Identifier. processorBrandString4 = extendedStart + 4 // Processor Name String Identifier. l1CacheAndTLBInfo = extendedStart + 5 // Returns L2 cache information. l2CacheInfo = extendedStart + 6 // Returns L2 cache information. addressSizes = extendedStart + 8 // Physical and virtual address sizes. ) var allowedBasicFunctions = [...]bool{ vendorID: true, featureInfo: true, extendedFeatureInfo: true, intelCacheDescriptors: true, intelDeterministicCacheParams: true, xSaveInfo: true, } var allowedExtendedFunctions = [...]bool{ extendedFunctionInfo - extendedStart: true, extendedFeatures - extendedStart: true, addressSizes - extendedStart: true, processorBrandString2 - extendedStart: true, processorBrandString3 - extendedStart: true, processorBrandString4 - extendedStart: true, l1CacheAndTLBInfo - extendedStart: true, l2CacheInfo - extendedStart: true, } // Function executes a CPUID function. // // This is typically the native function or a Static definition. type Function interface { Query(In) Out } // Native is a native Function. // // This implements Function. type Native struct{} // In is input to the Query function. // // +stateify savable type In struct { Eax uint32 Ecx uint32 } // normalize drops irrelevant Ecx values. 
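// Added commentary: for example, In{Eax: 0x1, Ecx: 0x5} normalizes to
// In{Eax: 0x1, Ecx: 0} because the basic feature-information leaf ignores
// ECX, so queries that differ only in the ignored sub-leaf collapse to one
// canonical key. The Ecx value 0x5 is an arbitrary example.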
func (i *In) normalize() { switch cpuidFunction(i.Eax) { case vendorID, featureInfo, intelCacheDescriptors, extendedFunctionInfo, extendedFeatures: i.Ecx = 0 // Ignore. case processorBrandString2, processorBrandString3, processorBrandString4, l1CacheAndTLBInfo, l2CacheInfo: i.Ecx = 0 // Ignore. case intelDeterministicCacheParams, extendedFeatureInfo: // Preserve i.Ecx. } } // Out is output from the Query function. // // +stateify savable type Out struct { Eax uint32 Ebx uint32 Ecx uint32 Edx uint32 } // native is the native Query function. func native(In) Out // Query executes CPUID natively. // // This implements Function. // //go:nosplit func (*Native) Query(in In) Out { if int(in.Eax) < len(allowedBasicFunctions) && allowedBasicFunctions[in.Eax] { return native(in) } else if in.Eax >= uint32(extendedStart) { if l := int(in.Eax - uint32(extendedStart)); l < len(allowedExtendedFunctions) && allowedExtendedFunctions[l] { return native(in) } } return Out{} // All zeros. } // query is a internal wrapper. // //go:nosplit func (fs FeatureSet) query(fn cpuidFunction) (uint32, uint32, uint32, uint32) { out := fs.Query(In{Eax: fn.eax(), Ecx: fn.ecx()}) return out.Eax, out.Ebx, out.Ecx, out.Edx } var hostFeatureSet FeatureSet // HostFeatureSet returns a host CPUID. // //go:nosplit func HostFeatureSet() FeatureSet { return hostFeatureSet } var ( // cpuFreqMHz is the native CPU frequency. cpuFreqMHz float64 ) // Reads max cpu frequency from host /proc/cpuinfo. Must run before syscall // filter installation. This value is used to create the fake /proc/cpuinfo // from a FeatureSet. func readMaxCPUFreq() { cpuinfob, err := ioutil.ReadFile("/proc/cpuinfo") if err != nil { // Leave it as 0... the VDSO bails out in the same way. log.Warningf("Could not read /proc/cpuinfo: %v", err) return } cpuinfo := string(cpuinfob) // We get the value straight from host /proc/cpuinfo. On machines with // frequency scaling enabled, this will only get the current value // which will likely be inaccurate. This is fine on machines with // frequency scaling disabled. for _, line := range strings.Split(cpuinfo, "\n") { if strings.Contains(line, "cpu MHz") { splitMHz := strings.Split(line, ":") if len(splitMHz) < 2 { log.Warningf("Could not read /proc/cpuinfo: malformed cpu MHz line") return } // If there was a problem, leave cpuFreqMHz as 0. var err error cpuFreqMHz, err = strconv.ParseFloat(strings.TrimSpace(splitMHz[1]), 64) if err != nil { log.Warningf("Could not parse cpu MHz value %v: %v", splitMHz[1], err) cpuFreqMHz = 0 return } return } } log.Warningf("Could not parse /proc/cpuinfo, it is empty or does not contain cpu MHz") } // xgetbv reads an extended control register. func xgetbv(reg uintptr) uint64 // archInitialize initializes hostFeatureSet. func archInitialize() { hostFeatureSet = FeatureSet{ Function: &Native{}, }.Fixed() readMaxCPUFreq() initHWCap() } golang-gvisor-gvisor-0.0~20240729.0/pkg/cpuid/native_amd64.s000066400000000000000000000020411465435605700231720ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
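//
// Added commentary: the parser below keys off a /proc/cpuinfo line of the
// form shown here (the frequency value is a made-up example) and keeps the
// number after the colon:
//
//	cpu MHz		: 3000.000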
// See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" TEXT ·native(SB),NOSPLIT|NOFRAME,$0-24 MOVL arg_Eax+0(FP), AX MOVL arg_Ecx+4(FP), CX CPUID MOVL AX, ret_Eax+8(FP) MOVL BX, ret_Ebx+12(FP) MOVL CX, ret_Ecx+16(FP) MOVL DX, ret_Edx+20(FP) RET // xgetbv reads an extended control register. // // The code corresponds to: // // xgetbv // TEXT ·xgetbv(SB),NOSPLIT|NOFRAME,$0-16 MOVQ reg+0(FP), CX BYTE $0x0f; BYTE $0x01; BYTE $0xd0; MOVL AX, ret+8(FP) MOVL DX, ret+12(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/cpuid/native_arm64.go000066400000000000000000000113731465435605700233630ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package cpuid import ( "io/ioutil" "runtime" "strconv" "strings" "gvisor.dev/gvisor/pkg/log" ) // hostFeatureSet is initialized at startup. // // This is copied for HostFeatureSet, below. var hostFeatureSet FeatureSet // HostFeatureSet returns a copy of the host FeatureSet. func HostFeatureSet() FeatureSet { return hostFeatureSet } // Fixed returns the same feature set. func (fs FeatureSet) Fixed() FeatureSet { return fs } // Reads CPU information from host /proc/cpuinfo. // // Must run before syscall filter installation. This value is used to create // the fake /proc/cpuinfo from a FeatureSet. func initCPUInfo() { if runtime.GOOS != "linux" { // Don't try to read Linux-specific /proc files or // warn about them not existing. return } cpuinfob, err := ioutil.ReadFile("/proc/cpuinfo") if err != nil { // Leave everything at 0, nothing can be done. log.Warningf("Could not read /proc/cpuinfo: %v", err) return } cpuinfo := string(cpuinfob) // We get the value straight from host /proc/cpuinfo. for _, line := range strings.Split(cpuinfo, "\n") { switch { case strings.Contains(line, "BogoMIPS"): splitMHz := strings.Split(line, ":") if len(splitMHz) < 2 { log.Warningf("Could not read /proc/cpuinfo: malformed BogoMIPS") break } // If there was a problem, leave cpuFreqMHz as 0. var err error hostFeatureSet.cpuFreqMHz, err = strconv.ParseFloat(strings.TrimSpace(splitMHz[1]), 64) if err != nil { hostFeatureSet.cpuFreqMHz = 0.0 log.Warningf("Could not parse BogoMIPS value %v: %v", splitMHz[1], err) } case strings.Contains(line, "CPU implementer"): splitImpl := strings.Split(line, ":") if len(splitImpl) < 2 { log.Warningf("Could not read /proc/cpuinfo: malformed CPU implementer") break } // If there was a problem, leave cpuImplHex as 0. var err error hostFeatureSet.cpuImplHex, err = strconv.ParseUint(strings.TrimSpace(splitImpl[1]), 0, 64) if err != nil { hostFeatureSet.cpuImplHex = 0 log.Warningf("Could not parse CPU implementer value %v: %v", splitImpl[1], err) } case strings.Contains(line, "CPU architecture"): splitArch := strings.Split(line, ":") if len(splitArch) < 2 { log.Warningf("Could not read /proc/cpuinfo: malformed CPU architecture") break } // If there was a problem, leave cpuArchDec as 0. 
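// Illustrative sample of the host /proc/cpuinfo lines this parser consumes
// (values are made up and vary by machine; shown only to make the string
// splitting and trimming above concrete):
//
//	BogoMIPS        : 50.00
//	CPU implementer : 0x41
//	CPU architecture: 8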
var err error hostFeatureSet.cpuArchDec, err = strconv.ParseUint(strings.TrimSpace(splitArch[1]), 0, 64) if err != nil { hostFeatureSet.cpuArchDec = 0 log.Warningf("Could not parse CPU architecture value %v: %v", splitArch[1], err) } case strings.Contains(line, "CPU variant"): splitVar := strings.Split(line, ":") if len(splitVar) < 2 { log.Warningf("Could not read /proc/cpuinfo: malformed CPU variant") break } // If there was a problem, leave cpuVarHex as 0. var err error hostFeatureSet.cpuVarHex, err = strconv.ParseUint(strings.TrimSpace(splitVar[1]), 0, 64) if err != nil { hostFeatureSet.cpuVarHex = 0 log.Warningf("Could not parse CPU variant value %v: %v", splitVar[1], err) } case strings.Contains(line, "CPU part"): splitPart := strings.Split(line, ":") if len(splitPart) < 2 { log.Warningf("Could not read /proc/cpuinfo: malformed CPU part") break } // If there was a problem, leave cpuPartHex as 0. var err error hostFeatureSet.cpuPartHex, err = strconv.ParseUint(strings.TrimSpace(splitPart[1]), 0, 64) if err != nil { hostFeatureSet.cpuPartHex = 0 log.Warningf("Could not parse CPU part value %v: %v", splitPart[1], err) } case strings.Contains(line, "CPU revision"): splitRev := strings.Split(line, ":") if len(splitRev) < 2 { log.Warningf("Could not read /proc/cpuinfo: malformed CPU revision") break } // If there was a problem, leave cpuRevDec as 0. var err error hostFeatureSet.cpuRevDec, err = strconv.ParseUint(strings.TrimSpace(splitRev[1]), 0, 64) if err != nil { hostFeatureSet.cpuRevDec = 0 log.Warningf("Could not parse CPU revision value %v: %v", splitRev[1], err) } } } } // archInitialize initializes hostFeatureSet. func archInitialize() { initCPUInfo() initHWCap() } golang-gvisor-gvisor-0.0~20240729.0/pkg/cpuid/static_amd64.go000066400000000000000000000061011465435605700233370ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package cpuid import "context" // Static is a static CPUID function. // // +stateify savable type Static map[In]Out // Fixed converts the FeatureSet to a fixed set. func (fs FeatureSet) Fixed() FeatureSet { return fs.ToStatic().ToFeatureSet() } // ToStatic converts a FeatureSet to a Static function. // // You can create a new static feature set as: // // fs := otherFeatureSet.ToStatic().ToFeatureSet() func (fs FeatureSet) ToStatic() Static { s := make(Static) // Save all allowed top-level functions. for fn, allowed := range allowedBasicFunctions { if allowed { in := In{Eax: uint32(fn)} s[in] = fs.Query(in) } } // Save all allowed extended functions. for fn, allowed := range allowedExtendedFunctions { if allowed { in := In{Eax: uint32(fn) + uint32(extendedStart)} s[in] = fs.Query(in) } } // Save all features (may be redundant). for feature := range allFeatures { feature.set(s, fs.HasFeature(feature)) } // Processor Extended State Enumeration. 
for i := uint32(0); i < xSaveInfoNumLeaves; i++ { in := In{Eax: uint32(xSaveInfo), Ecx: i} s[in] = fs.Query(in) } // Save all cache information. out := fs.Query(In{Eax: uint32(featureInfo)}) for i := uint32(0); i < out.Ecx; i++ { in := In{Eax: uint32(intelDeterministicCacheParams), Ecx: i} out := fs.Query(in) s[in] = out if CacheType(out.Eax&0xf) == cacheNull { break } } return s } // ToFeatureSet converts a static specification to a FeatureSet. // // This overloads some local values, where required. func (s Static) ToFeatureSet() FeatureSet { // Make a copy. ns := make(Static) for k, v := range s { ns[k] = v } ns.normalize() return FeatureSet{ns, hwCap{}} } // afterLoad calls normalize. func (s Static) afterLoad(context.Context) { s.normalize() } // normalize normalizes FPU sizes. func (s Static) normalize() { // Override local FPU sizes, which must be fixed. fs := FeatureSet{s, hwCap{}} if fs.HasFeature(X86FeatureXSAVE) { in := In{Eax: uint32(xSaveInfo)} out := s[in] out.Ecx = maxXsaveSize out.Ebx = xsaveSize s[in] = out } } // Add adds a feature. func (s Static) Add(feature Feature) Static { feature.set(s, true) return s } // Remove removes a feature. func (s Static) Remove(feature Feature) Static { feature.set(s, false) return s } // Set implements ChangeableSet.Set. func (s Static) Set(in In, out Out) { s[in] = out } // Query implements Function.Query. func (s Static) Query(in In) Out { in.normalize() return s[in] } golang-gvisor-gvisor-0.0~20240729.0/pkg/devutil/000077500000000000000000000000001465435605700211005ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/devutil/context.go000066400000000000000000000026361465435605700231220ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package devutil import "context" // contextID is this package's type for context.Context.Value keys. type contextID int const ( // CtxDevGoferClient is a Context.Value key for a /dev gofer client. CtxDevGoferClient contextID = iota // CtxDevGoferClientProvider is a Context.Value key for GoferClientProvider. CtxDevGoferClientProvider ) // GoferClientFromContext returns the device gofer client used by ctx. func GoferClientFromContext(ctx context.Context) *GoferClient { if v := ctx.Value(CtxDevGoferClient); v != nil { return v.(*GoferClient) } return nil } // GoferClientProviderFromContext returns the GoferClientProvider used by ctx. func GoferClientProviderFromContext(ctx context.Context) GoferClientProvider { if v := ctx.Value(CtxDevGoferClientProvider); v != nil { return v.(GoferClientProvider) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/devutil/devutil.go000066400000000000000000000075241465435605700231130ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package devutil provides device specific utilities. package devutil import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fsutil" "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/unet" ) // GoferClient is the lisafs client for the /dev gofer connection. type GoferClient struct { clientFD lisafs.ClientFD hostFD int contName string } // NewGoferClient establishes the LISAFS connection to the dev gofer server. // It takes ownership of fd. contName is the owning container name. func NewGoferClient(ctx context.Context, contName string, fd int) (*GoferClient, error) { ctx.UninterruptibleSleepStart(false) defer ctx.UninterruptibleSleepFinish(false) sock, err := unet.NewSocket(fd) if err != nil { ctx.Warningf("failed to create socket for dev gofer client: %v", err) return nil, err } client, devInode, devHostFD, err := lisafs.NewClient(sock) if err != nil { ctx.Warningf("failed to create dev gofer client: %v", err) return nil, err } return &GoferClient{ clientFD: client.NewFD(devInode.ControlFD), hostFD: devHostFD, contName: contName, }, nil } // Close closes the LISAFS connection. func (g *GoferClient) Close() { // Close the connection to the server. This implicitly closes all FDs. g.clientFD.Client().Close() if g.hostFD >= 0 { _ = unix.Close(g.hostFD) } } // ContainerName returns the name of the container that owns this gofer. func (g *GoferClient) ContainerName() string { return g.contName } // DirentNames returns names of all the dirents for /dev on the gofer. func (g *GoferClient) DirentNames(ctx context.Context) ([]string, error) { if g.hostFD >= 0 { return fsutil.DirentNames(g.hostFD) } client := g.clientFD.Client() openFDID, _, err := g.clientFD.OpenAt(ctx, unix.O_RDONLY) if err != nil { return nil, fmt.Errorf("failed to open dev from gofer: %v", err) } defer client.CloseFD(ctx, openFDID, true /* flush */) openFD := client.NewFD(openFDID) const count = int32(64 * 1024) var names []string for { dirents, err := openFD.Getdents64(ctx, count) if err != nil { return nil, fmt.Errorf("Getdents64 RPC failed: %v", err) } if len(dirents) == 0 { break } for i := range dirents { names = append(names, string(dirents[i].Name)) } } return names, nil } // OpenAt opens the device file at /dev/{name} on the gofer. func (g *GoferClient) OpenAt(ctx context.Context, name string, flags uint32) (int, error) { flags &= unix.O_ACCMODE if g.hostFD >= 0 { return unix.Openat(g.hostFD, name, int(flags|unix.O_NOFOLLOW), 0) } childInode, err := g.clientFD.Walk(ctx, name) if err != nil { log.Infof("failed to walk %q from dev gofer FD", name) return 0, err } client := g.clientFD.Client() childFD := client.NewFD(childInode.ControlFD) childOpenFD, childHostFD, err := childFD.OpenAt(ctx, flags) if err != nil { log.Infof("failed to open %q from child FD", name) client.CloseFD(ctx, childFD.ID(), true /* flush */) return 0, err } client.CloseFD(ctx, childFD.ID(), false /* flush */) client.CloseFD(ctx, childOpenFD, true /* flush */) return childHostFD, nil } // GoferClientProvider provides a GoferClient for a given container. 
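// Example (illustrative sketch, not part of the original source): a typical
// GoferClient lifecycle as seen from a caller that owns the connection FD.
// The names "ctx" and "fd" are assumed to come from the caller's environment
// and most error handling is elided:
//
//	gc, err := NewGoferClient(ctx, "mycontainer", fd)
//	if err != nil {
//		return err
//	}
//	defer gc.Close()
//	names, _ := gc.DirentNames(ctx)                  // entries under /dev on the gofer
//	hostFD, _ := gc.OpenAt(ctx, "null", unix.O_RDWR) // host FD for /dev/null
//	_, _ = names, hostFD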
type GoferClientProvider interface { GetDevGoferClient(contName string) *GoferClient } golang-gvisor-gvisor-0.0~20240729.0/pkg/devutil/devutil_state_autogen.go000066400000000000000000000000711465435605700260230ustar00rootroot00000000000000// automatically generated by stateify. package devutil golang-gvisor-gvisor-0.0~20240729.0/pkg/erofs/000077500000000000000000000000001465435605700205425ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/erofs/erofs.go000066400000000000000000000613541465435605700222200ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package erofs provides the ability to access the contents in an EROFS [1] image. // // The design principle of this package is that, it will just provide the ability // to access the contents in the image, and it will never cache any objects internally. // The whole disk image is mapped via a read-only/shared mapping, and it relies on // host kernel to cache the blocks/pages transparently. // // [1] https://docs.kernel.org/filesystems/erofs.html package erofs import ( "bytes" "fmt" "hash/crc32" "os" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/safemem" ) const ( // Definitions for superblock. SuperBlockMagicV1 = 0xe0f5e1e2 SuperBlockOffset = 1024 // Inode slot size in bit shift. InodeSlotBits = 5 // Max file name length. MaxNameLen = 255 ) // Bit definitions for Inode*::Format. const ( InodeLayoutBit = 0 InodeLayoutBits = 1 InodeDataLayoutBit = 1 InodeDataLayoutBits = 3 ) // Inode layouts. const ( InodeLayoutCompact = 0 InodeLayoutExtended = 1 ) // Inode data layouts. const ( InodeDataLayoutFlatPlain = iota InodeDataLayoutFlatCompressionLegacy InodeDataLayoutFlatInline InodeDataLayoutFlatCompression InodeDataLayoutChunkBased InodeDataLayoutMax ) // Features w/ backward compatibility. // This is not exhaustive, unused features are not listed. const ( FeatureCompatSuperBlockChecksum = 0x00000001 ) // Features w/o backward compatibility. // // Any features that aren't in FeatureIncompatSupported are incompatible // with this implementation. // // This is not exhaustive, unused features are not listed. const ( FeatureIncompatSupported = 0x0 ) // Sizes of on-disk structures in bytes. const ( SuperBlockSize = 128 InodeCompactSize = 32 InodeExtendedSize = 64 DirentSize = 12 ) // SuperBlock represents on-disk superblock. 
// // +marshal // +stateify savable type SuperBlock struct { Magic uint32 Checksum uint32 FeatureCompat uint32 BlockSizeBits uint8 ExtSlots uint8 RootNid uint16 Inodes uint64 BuildTime uint64 BuildTimeNsec uint32 Blocks uint32 MetaBlockAddr uint32 XattrBlockAddr uint32 UUID [16]uint8 VolumeName [16]uint8 FeatureIncompat uint32 Union1 uint16 ExtraDevices uint16 DevTableSlotOff uint16 Reserved [38]uint8 } // BlockSize returns the block size. func (sb *SuperBlock) BlockSize() uint32 { return 1 << sb.BlockSizeBits } // BlockAddrToOffset converts block addr to the offset in image file. func (sb *SuperBlock) BlockAddrToOffset(addr uint32) uint64 { return uint64(addr) << sb.BlockSizeBits } // MetaOffset returns the offset of metadata area in image file. func (sb *SuperBlock) MetaOffset() uint64 { return sb.BlockAddrToOffset(sb.MetaBlockAddr) } // NidToOffset converts inode number to the offset in image file. func (sb *SuperBlock) NidToOffset(nid uint64) uint64 { return sb.MetaOffset() + (nid << InodeSlotBits) } // InodeCompact represents 32-byte reduced form of on-disk inode. // // +marshal type InodeCompact struct { Format uint16 XattrCount uint16 Mode uint16 Nlink uint16 Size uint32 Reserved uint32 RawBlockAddr uint32 Ino uint32 UID uint16 GID uint16 Reserved2 uint32 } // InodeExtended represents 64-byte complete form of on-disk inode. // // +marshal type InodeExtended struct { Format uint16 XattrCount uint16 Mode uint16 Reserved uint16 Size uint64 RawBlockAddr uint32 Ino uint32 UID uint32 GID uint32 Mtime uint64 MtimeNsec uint32 Nlink uint32 Reserved2 [16]uint8 } // Dirent represents on-disk directory entry. // // +marshal type Dirent struct { NidLow uint32 NidHigh uint32 NameOff uint16 FileType uint8 Reserved uint8 } // Nid returns the inode number of the inode referenced by this dirent. func (d *Dirent) Nid() uint64 { // EROFS on-disk structures are always in little endian. // TODO: This implementation does not support big endian yet. return (uint64(d.NidHigh) << 32) | uint64(d.NidLow) } // Image represents an open EROFS image. // // +stateify savable type Image struct { src *os.File `state:"nosave"` bytes []byte `state:"nosave"` sb SuperBlock } // OpenImage returns an Image providing access to the contents in the image file src. // // On success, the ownership of src is transferred to Image. func OpenImage(src *os.File) (*Image, error) { i := &Image{src: src} var cu cleanup.Cleanup defer cu.Clean() stat, err := i.src.Stat() if err != nil { return nil, err } i.bytes, err = unix.Mmap(int(i.src.Fd()), 0, int(stat.Size()), unix.PROT_READ, unix.MAP_SHARED) if err != nil { return nil, err } cu.Add(func() { unix.Munmap(i.bytes) }) if err := i.initSuperBlock(); err != nil { return nil, err } cu.Release() return i, nil } // Close closes the image. func (i *Image) Close() { unix.Munmap(i.bytes) i.src.Close() } // SuperBlock returns a copy of the image's superblock. func (i *Image) SuperBlock() SuperBlock { return i.sb } // BlockSize returns the block size of this image. func (i *Image) BlockSize() uint32 { return i.sb.BlockSize() } // Blocks returns the total blocks of this image. func (i *Image) Blocks() uint32 { return i.sb.Blocks } // RootNid returns the root inode number of this image. func (i *Image) RootNid() uint64 { return uint64(i.sb.RootNid) } // initSuperBlock initializes the superblock of this image. func (i *Image) initSuperBlock() error { // i.sb is used in the hot path. Let's save a copy of the superblock. 
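// Example (illustrative sketch, not part of the original source): opening an
// EROFS image file and reading a few superblock-derived values. Error
// handling is elided and "rootfs.img" is a made-up path:
//
//	f, _ := os.Open("rootfs.img")
//	img, err := OpenImage(f) // takes ownership of f
//	if err != nil {
//		return err
//	}
//	defer img.Close()
//	_ = img.RootNid()   // root inode number
//	_ = img.BlockSize() // filesystem block size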
if err := i.unmarshalAt(&i.sb, SuperBlockOffset); err != nil { return fmt.Errorf("image size is too small") } if i.sb.Magic != SuperBlockMagicV1 { return fmt.Errorf("unknown magic: 0x%x", i.sb.Magic) } if err := i.verifyChecksum(); err != nil { return err } if featureIncompat := i.sb.FeatureIncompat & ^uint32(FeatureIncompatSupported); featureIncompat != 0 { return fmt.Errorf("unsupported incompatible features detected: 0x%x", featureIncompat) } if i.BlockSize()%hostarch.PageSize != 0 { return fmt.Errorf("unsupported block size: 0x%x", i.BlockSize()) } return nil } // verifyChecksum verifies the checksum of the superblock. func (i *Image) verifyChecksum() error { if i.sb.FeatureCompat&FeatureCompatSuperBlockChecksum == 0 { return nil } sb := i.sb sb.Checksum = 0 table := crc32.MakeTable(crc32.Castagnoli) checksum := crc32.Checksum(marshal.Marshal(&sb), table) off := SuperBlockOffset + uint64(i.sb.SizeBytes()) if bytes, err := i.BytesAt(off, uint64(i.BlockSize())-off); err != nil { return fmt.Errorf("image size is too small") } else { checksum = ^crc32.Update(checksum, table, bytes) } if checksum != i.sb.Checksum { return fmt.Errorf("invalid checksum: 0x%x, expected: 0x%x", checksum, i.sb.Checksum) } return nil } // FD returns the host FD of underlying image file. func (i *Image) FD() int { return int(i.src.Fd()) } // checkRange checks whether the range [off, off+n) is valid. func (i *Image) checkRange(off, n uint64) bool { size := uint64(len(i.bytes)) end := off + n return off < size && off <= end && end <= size } // BytesAt returns the bytes at [off, off+n) of the image. func (i *Image) BytesAt(off, n uint64) ([]byte, error) { if ok := i.checkRange(off, n); !ok { log.Warningf("Invalid byte range (off: 0x%x, n: 0x%x) for image (size: 0x%x)", off, n, len(i.bytes)) return nil, linuxerr.EFAULT } return i.bytes[off : off+n], nil } // checkInodeAlignment checks whether off matches inode's alignment requirement. func checkInodeAlignment(off uint64) bool { // Each valid inode should be aligned with an inode slot, which is // a fixed value (32 bytes). return off&((1< blockSize-uint64(inodeSize) { log.Warningf("Inline data not found or cross block boundary at inode (nid=%v)", nid) return Inode{}, linuxerr.EUCLEAN } inode.idataOff = off + uint64(inodeSize) fallthrough case InodeDataLayoutFlatPlain: inode.dataOff = i.sb.BlockAddrToOffset(rawBlockAddr) default: log.Warningf("Unsupported data layout 0x%x at inode (nid=%v)", dataLayout, nid) return Inode{}, linuxerr.ENOTSUP } return inode, nil } // Inode represents in-memory inode object. // // +stateify savable type Inode struct { // image is the underlying image. Inode should not perform writable // operations (e.g. Close()) on the image. image *Image // dataOff points to the data of this inode in the data blocks. dataOff uint64 // idataOff points to the tail packing inline data of this inode // if it's not zero in the metadata block. idataOff uint64 // blocks indicates the count of blocks that store the data associated // with this inode. It will count in the metadata block that includes // the inline data as well. blocks uint64 // format is the format of this inode. format uint16 // Metadata. mode uint16 nid uint64 size uint64 mtime uint64 mtimeNsec uint32 uid uint32 gid uint32 nlink uint32 } // bitRange returns the bits within the range [bit, bit+bits) in value. func bitRange(value, bit, bits uint16) uint16 { return (value >> bit) & ((1 << bits) - 1) } // Layout returns the inode layout. 
func (i *Inode) Layout() uint16 { return bitRange(i.format, InodeLayoutBit, InodeLayoutBits) } // DataLayout returns the inode data layout. func (i *Inode) DataLayout() uint16 { return bitRange(i.format, InodeDataLayoutBit, InodeDataLayoutBits) } // IsRegular indicates whether i represents a regular file. func (i *Inode) IsRegular() bool { return i.mode&linux.S_IFMT == linux.S_IFREG } // IsDir indicates whether i represents a directory. func (i *Inode) IsDir() bool { return i.mode&linux.S_IFMT == linux.S_IFDIR } // IsCharDev indicates whether i represents a character device. func (i *Inode) IsCharDev() bool { return i.mode&linux.S_IFMT == linux.S_IFCHR } // IsBlockDev indicates whether i represents a block device. func (i *Inode) IsBlockDev() bool { return i.mode&linux.S_IFMT == linux.S_IFBLK } // IsFIFO indicates whether i represents a named pipe. func (i *Inode) IsFIFO() bool { return i.mode&linux.S_IFMT == linux.S_IFIFO } // IsSocket indicates whether i represents a socket. func (i *Inode) IsSocket() bool { return i.mode&linux.S_IFMT == linux.S_IFSOCK } // IsSymlink indicates whether i represents a symbolic link. func (i *Inode) IsSymlink() bool { return i.mode&linux.S_IFMT == linux.S_IFLNK } // Nid returns the inode number. func (i *Inode) Nid() uint64 { return i.nid } // Size returns the data size. func (i *Inode) Size() uint64 { return i.size } // Nlink returns the number of hard links. func (i *Inode) Nlink() uint32 { return i.nlink } // Mtime returns the time of last modification. func (i *Inode) Mtime() uint64 { return i.mtime } // MtimeNsec returns the nano second part of Mtime. func (i *Inode) MtimeNsec() uint32 { return i.mtimeNsec } // Mode returns the file type and permissions. func (i *Inode) Mode() uint16 { return i.mode } // UID returns the user ID of the owner. func (i *Inode) UID() uint32 { return i.uid } // GID returns the group ID of the owner. func (i *Inode) GID() uint32 { return i.gid } // DataOffset returns the data offset of this inode in image file. func (i *Inode) DataOffset() (uint64, error) { // TODO: We don't support regular files with inline data yet, which means the image // should be created with the "-E noinline_data" option. The "-E noinline_data" option // was introduced for the DAX feature support in Linux [1]. // [1] https://github.com/erofs/erofs-utils/commit/60549d52c3b636f0ddd1d51b0c1517c1dee22595 if dataLayout := i.DataLayout(); dataLayout != InodeDataLayoutFlatPlain { log.Warningf("Unsupported data layout 0x%x at inode (nid=%v)", dataLayout, i.Nid()) return 0, linuxerr.ENOTSUP } return i.dataOff, nil } // Data returns the read-only file data of this inode. 
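// Worked example (illustrative): with the bit layout defined earlier in this
// file, a raw format value of 0x05 (binary 0101) decodes as
//
//	Layout()     == bitRange(0x05, 0, 1) == 1 // InodeLayoutExtended
//	DataLayout() == bitRange(0x05, 1, 3) == 2 // InodeDataLayoutFlatInline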
func (i *Inode) Data() (safemem.BlockSeq, error) { switch dataLayout := i.DataLayout(); dataLayout { case InodeDataLayoutFlatPlain: bytes, err := i.image.BytesAt(i.dataOff, i.size) if err != nil { return safemem.BlockSeq{}, err } return safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bytes)), nil case InodeDataLayoutFlatInline: sl := make([]safemem.Block, 0, 2) idataSize := i.size & (uint64(i.image.BlockSize()) - 1) if i.size > idataSize { if bytes, err := i.image.BytesAt(i.dataOff, i.size-idataSize); err != nil { return safemem.BlockSeq{}, err } else { sl = append(sl, safemem.BlockFromSafeSlice(bytes)) } } if bytes, err := i.image.BytesAt(i.idataOff, idataSize); err != nil { return safemem.BlockSeq{}, err } else { sl = append(sl, safemem.BlockFromSafeSlice(bytes)) } return safemem.BlockSeqFromSlice(sl), nil default: log.Warningf("Unsupported data layout 0x%x at inode (nid=%v)", dataLayout, i.Nid()) return safemem.BlockSeq{}, linuxerr.ENOTSUP } } // blockData represents the information of the data in a block. type blockData struct { // base indicates the data offset within the image. base uint64 // size indicates the data size. size uint32 } // valid indicates whether this is valid information about the data in a block. func (b *blockData) valid() bool { // The data offset within the image will never be zero. return b.base > 0 } // getBlockDataInfo returns the information of the data in the block identified by // blockIdx of this inode. // // Precondition: blockIdx < i.blocks. func (i *Inode) getBlockDataInfo(blockIdx uint64) blockData { blockSize := i.image.BlockSize() lastBlock := blockIdx == i.blocks-1 base := i.idataOff if !lastBlock || base == 0 { base = i.dataOff + blockIdx*uint64(blockSize) } size := blockSize if lastBlock { if tailSize := uint32(i.size) & (blockSize - 1); tailSize != 0 { size = tailSize } } return blockData{base, size} } // getDirentName returns the name of dirent d in the given block of this inode. // // The on-disk format of one block looks like this: // // ___________________________ // / | // / ______________|________________ // / / | nameoff1 | nameoffN-1 // ____________.______________._______________v________________v__________ // | dirent | dirent | ... | dirent | filename | filename | ... | filename | // |___.0___|____1___|_____|___N-1__|____0_____|____1_____|_____|___N-1____| // \ ^ // \ | * could have // \ | trailing '\0' // \________________________| nameoff0 // Directory block // // The on-disk format of one directory looks like this: // // [ (block 1) dirent 1 | dirent 2 | dirent 3 | name 1 | name 2 | name 3 | optional padding ] // [ (block 2) dirent 4 | dirent 5 | name 4 | name 5 | optional padding ] // ... // [ (block N) dirent M | dirent M+1 | name M | name M+1 | optional padding ] // // [ (metadata block) inode | optional fields | dirent M+2 | dirent M+3 | name M+2 | name M+3 | optional padding ] // // Refer: https://docs.kernel.org/filesystems/erofs.html#directories func (i *Inode) getDirentName(d *Dirent, block blockData, lastDirent bool) ([]byte, error) { var nameLen uint32 if lastDirent { nameLen = block.size - uint32(d.NameOff) } else { nameLen = uint32(direntAfter(d).NameOff - d.NameOff) } if uint32(d.NameOff)+nameLen > block.size || nameLen > MaxNameLen || nameLen == 0 { log.Warningf("Corrupted dirent at inode (nid=%v)", i.Nid()) return nil, linuxerr.EUCLEAN } name, err := i.image.BytesAt(block.base+uint64(d.NameOff), uint64(nameLen)) if err != nil { return nil, err } if lastDirent { // Optional padding may exist at the end of a block. 
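// Worked example (illustrative): if a block holds three dirents, the name
// area starts at offset 36 (3 dirents * 12 bytes each). A dirent with
// NameOff 36 followed by one with NameOff 41 therefore has a 5-byte name,
// while the last dirent's name runs to block.size, minus any trailing '\0'
// padding handled below.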
n := bytes.IndexByte(name, 0) if n == 0 { log.Warningf("Corrupted dirent at inode (nid=%v)", i.Nid()) return nil, linuxerr.EUCLEAN } if n != -1 { name = name[:n] } } return name, nil } // getDirent0 returns a pointer to the first dirent in the given block of this inode. func (i *Inode) getDirent0(block blockData) (*Dirent, error) { d0, err := i.image.direntAt(block.base) if err != nil { return nil, err } if d0.NameOff < DirentSize || uint32(d0.NameOff) >= block.size { log.Warningf("Invalid nameOff0 %v at inode (nid=%v)", d0.NameOff, i.Nid()) return nil, linuxerr.EUCLEAN } return d0, nil } // Lookup looks up a child by the name. The child inode number will be returned on success. func (i *Inode) Lookup(name string) (uint64, error) { if !i.IsDir() { return 0, linuxerr.ENOTDIR } // Currently (Go 1.21), there is no safe and efficient way to do three-way // string comparisons, so let's convert the string to a byte slice first. nameBytes := gohacks.ImmutableBytesFromString(name) // In EROFS, all directory entries are _strictly_ recorded in alphabetical // order. The lookup is done by directly performing binary search on the // disk data similar to what Linux does in fs/erofs/namei.c:erofs_namei(). var ( targetBlock blockData targetNumDirents uint16 ) // Find the block that may contain the target dirent first. bLeft, bRight := int64(0), int64(i.blocks)-1 for bLeft <= bRight { // Cast to uint64 to avoid overflow. mid := uint64(bLeft+bRight) >> 1 block := i.getBlockDataInfo(mid) d0, err := i.getDirent0(block) if err != nil { return 0, err } numDirents := d0.NameOff / DirentSize d0Name, err := i.getDirentName(d0, block, numDirents == 1) if err != nil { return 0, err } switch bytes.Compare(nameBytes, d0Name) { case 0: // Found the target dirent. return d0.Nid(), nil case 1: // name > d0Name, this block may contain the target dirent. targetBlock = block targetNumDirents = numDirents bLeft = int64(mid) + 1 case -1: // name < d0Name, this is not the block we're looking for. bRight = int64(mid) - 1 } } if !targetBlock.valid() { // The target block was not found. return 0, linuxerr.ENOENT } // Find the target dirent in the target block. Note that, as the 0th dirent // has already been checked during the block binary search, we don't need to // check it again and can define dLeft/dRight as unsigned types. dLeft, dRight := uint16(1), targetNumDirents-1 for dLeft <= dRight { // The sum will never lead to a uint16 overflow, as the maximum value of // the operands is MaxUint16/DirentSize. mid := (dLeft + dRight) >> 1 direntOff := targetBlock.base + uint64(mid)*DirentSize d, err := i.image.direntAt(direntOff) if err != nil { return 0, err } dName, err := i.getDirentName(d, targetBlock, mid == targetNumDirents-1) if err != nil { return 0, err } switch bytes.Compare(nameBytes, dName) { case 0: // Found the target dirent. return d.Nid(), nil case 1: // name > dName. dLeft = mid + 1 case -1: // name < dName. dRight = mid - 1 } } return 0, linuxerr.ENOENT } // IterDirents invokes cb on each entry in the directory represented by this inode. // The directory entries will be iterated in alphabetical order. func (i *Inode) IterDirents(cb func(name string, typ uint8, nid uint64) error) error { if !i.IsDir() { return linuxerr.ENOTDIR } // Iterate all the blocks which contain dirents. for blockIdx := uint64(0); blockIdx < i.blocks; blockIdx++ { block := i.getBlockDataInfo(blockIdx) d, err := i.getDirent0(block) if err != nil { return err } // Iterate all the dirents in this block. 
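// Example (illustrative sketch, not part of the original source): given a
// directory Inode "dir", children can be resolved or enumerated with the two
// methods above. Turning the returned nid back into an Inode is left to the
// caller:
//
//	nid, err := dir.Lookup("passwd") // binary search over sorted dirents
//	_ = dir.IterDirents(func(name string, typ uint8, nid uint64) error {
//		log.Infof("%s (type %d) -> nid %d", name, typ, nid)
//		return nil
//	})
//	_, _ = nid, err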
numDirents := d.NameOff / DirentSize for { name, err := i.getDirentName(d, block, numDirents == 1) if err != nil { return err } if err := cb(string(name), d.FileType, d.Nid()); err != nil { return err } if numDirents--; numDirents == 0 { break } d = direntAfter(d) } } return nil } // Readlink reads the link target. func (i *Inode) Readlink() (string, error) { if !i.IsSymlink() { return "", linuxerr.EINVAL } off := i.dataOff size := i.size if i.idataOff != 0 { // Inline symlink data shouldn't cross block boundary. if i.blocks > 1 { log.Warningf("Inline data cross block boundary at inode (nid=%v)", i.Nid()) return "", linuxerr.EUCLEAN } off = i.idataOff } else { // This matches Linux's behaviour in fs/namei.c:page_get_link() and // include/linux/namei.h:nd_terminate_link(). if size > hostarch.PageSize-1 { size = hostarch.PageSize - 1 } } target, err := i.image.BytesAt(off, size) if err != nil { return "", err } return string(target), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/erofs/erofs_abi_autogen_unsafe.go000066400000000000000000000517541465435605700261210ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package erofs import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*Dirent)(nil) var _ marshal.Marshallable = (*InodeCompact)(nil) var _ marshal.Marshallable = (*InodeExtended)(nil) var _ marshal.Marshallable = (*SuperBlock)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. func (d *Dirent) SizeBytes() int { return 12 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (d *Dirent) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(d.NidLow)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(d.NidHigh)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(d.NameOff)) dst = dst[2:] dst[0] = byte(d.FileType) dst = dst[1:] dst[0] = byte(d.Reserved) dst = dst[1:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (d *Dirent) UnmarshalBytes(src []byte) []byte { d.NidLow = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] d.NidHigh = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] d.NameOff = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] d.FileType = uint8(src[0]) src = src[1:] d.Reserved = uint8(src[0]) src = src[1:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (d *Dirent) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (d *Dirent) MarshalUnsafe(dst []byte) []byte { size := d.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(d), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (d *Dirent) UnmarshalUnsafe(src []byte) []byte { size := d.SizeBytes() gohacks.Memmove(unsafe.Pointer(d), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (d *Dirent) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(d))) hdr.Len = d.SizeBytes() hdr.Cap = d.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that d // must live until the use above. runtime.KeepAlive(d) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (d *Dirent) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return d.CopyOutN(cc, addr, d.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (d *Dirent) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(d))) hdr.Len = d.SizeBytes() hdr.Cap = d.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that d // must live until the use above. runtime.KeepAlive(d) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (d *Dirent) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return d.CopyInN(cc, addr, d.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (d *Dirent) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(d))) hdr.Len = d.SizeBytes() hdr.Cap = d.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that d // must live until the use above. runtime.KeepAlive(d) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *InodeCompact) SizeBytes() int { return 32 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *InodeCompact) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.Format)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.XattrCount)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.Mode)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.Nlink)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Size)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Reserved)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.RawBlockAddr)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Ino)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.UID)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.GID)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Reserved2)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (i *InodeCompact) UnmarshalBytes(src []byte) []byte { i.Format = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.XattrCount = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.Mode = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.Nlink = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.Size = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Reserved = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.RawBlockAddr = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Ino = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.UID = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.GID = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.Reserved2 = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *InodeCompact) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *InodeCompact) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *InodeCompact) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *InodeCompact) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *InodeCompact) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *InodeCompact) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *InodeCompact) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *InodeCompact) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. 
runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *InodeExtended) SizeBytes() int { return 48 + 1*16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *InodeExtended) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.Format)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.XattrCount)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.Mode)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(i.Reserved)) dst = dst[2:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Size)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.RawBlockAddr)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Ino)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.UID)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.GID)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(i.Mtime)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.MtimeNsec)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(i.Nlink)) dst = dst[4:] for idx := 0; idx < 16; idx++ { dst[0] = byte(i.Reserved2[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *InodeExtended) UnmarshalBytes(src []byte) []byte { i.Format = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.XattrCount = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.Mode = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.Reserved = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] i.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.RawBlockAddr = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Ino = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.UID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.GID = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Mtime = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] i.MtimeNsec = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] i.Nlink = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 16; idx++ { i.Reserved2[idx] = uint8(src[0]) src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *InodeExtended) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *InodeExtended) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *InodeExtended) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *InodeExtended) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. 
return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *InodeExtended) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *InodeExtended) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *InodeExtended) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *InodeExtended) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (sb *SuperBlock) SizeBytes() int { return 58 + 1*16 + 1*16 + 1*38 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (sb *SuperBlock) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(sb.Magic)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(sb.Checksum)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(sb.FeatureCompat)) dst = dst[4:] dst[0] = byte(sb.BlockSizeBits) dst = dst[1:] dst[0] = byte(sb.ExtSlots) dst = dst[1:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(sb.RootNid)) dst = dst[2:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(sb.Inodes)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(sb.BuildTime)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(sb.BuildTimeNsec)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(sb.Blocks)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(sb.MetaBlockAddr)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(sb.XattrBlockAddr)) dst = dst[4:] for idx := 0; idx < 16; idx++ { dst[0] = byte(sb.UUID[idx]) dst = dst[1:] } for idx := 0; idx < 16; idx++ { dst[0] = byte(sb.VolumeName[idx]) dst = dst[1:] } hostarch.ByteOrder.PutUint32(dst[:4], uint32(sb.FeatureIncompat)) dst = dst[4:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(sb.Union1)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(sb.ExtraDevices)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(sb.DevTableSlotOff)) dst = dst[2:] for idx := 0; idx < 38; idx++ { dst[0] = byte(sb.Reserved[idx]) dst = dst[1:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (sb *SuperBlock) UnmarshalBytes(src []byte) []byte { sb.Magic = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] sb.Checksum = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] sb.FeatureCompat = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] sb.BlockSizeBits = uint8(src[0]) src = src[1:] sb.ExtSlots = uint8(src[0]) src = src[1:] sb.RootNid = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] sb.Inodes = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] sb.BuildTime = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] sb.BuildTimeNsec = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] sb.Blocks = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] sb.MetaBlockAddr = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] sb.XattrBlockAddr = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 16; idx++ { sb.UUID[idx] = uint8(src[0]) src = src[1:] } for idx := 0; idx < 16; idx++ { sb.VolumeName[idx] = uint8(src[0]) src = src[1:] } sb.FeatureIncompat = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] sb.Union1 = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] sb.ExtraDevices = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] sb.DevTableSlotOff = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] for idx := 0; idx < 38; idx++ { sb.Reserved[idx] = uint8(src[0]) src = src[1:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (sb *SuperBlock) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (sb *SuperBlock) MarshalUnsafe(dst []byte) []byte { size := sb.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(sb), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (sb *SuperBlock) UnmarshalUnsafe(src []byte) []byte { size := sb.SizeBytes() gohacks.Memmove(unsafe.Pointer(sb), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (sb *SuperBlock) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(sb))) hdr.Len = sb.SizeBytes() hdr.Cap = sb.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that sb // must live until the use above. runtime.KeepAlive(sb) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (sb *SuperBlock) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return sb.CopyOutN(cc, addr, sb.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (sb *SuperBlock) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(sb))) hdr.Len = sb.SizeBytes() hdr.Cap = sb.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that sb // must live until the use above. runtime.KeepAlive(sb) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
func (sb *SuperBlock) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return sb.CopyInN(cc, addr, sb.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (sb *SuperBlock) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(sb))) hdr.Len = sb.SizeBytes() hdr.Cap = sb.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that sb // must live until the use above. runtime.KeepAlive(sb) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/erofs/erofs_state_autogen.go000066400000000000000000000107521465435605700251360ustar00rootroot00000000000000// automatically generated by stateify. package erofs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (sb *SuperBlock) StateTypeName() string { return "pkg/erofs.SuperBlock" } func (sb *SuperBlock) StateFields() []string { return []string{ "Magic", "Checksum", "FeatureCompat", "BlockSizeBits", "ExtSlots", "RootNid", "Inodes", "BuildTime", "BuildTimeNsec", "Blocks", "MetaBlockAddr", "XattrBlockAddr", "UUID", "VolumeName", "FeatureIncompat", "Union1", "ExtraDevices", "DevTableSlotOff", "Reserved", } } func (sb *SuperBlock) beforeSave() {} // +checklocksignore func (sb *SuperBlock) StateSave(stateSinkObject state.Sink) { sb.beforeSave() stateSinkObject.Save(0, &sb.Magic) stateSinkObject.Save(1, &sb.Checksum) stateSinkObject.Save(2, &sb.FeatureCompat) stateSinkObject.Save(3, &sb.BlockSizeBits) stateSinkObject.Save(4, &sb.ExtSlots) stateSinkObject.Save(5, &sb.RootNid) stateSinkObject.Save(6, &sb.Inodes) stateSinkObject.Save(7, &sb.BuildTime) stateSinkObject.Save(8, &sb.BuildTimeNsec) stateSinkObject.Save(9, &sb.Blocks) stateSinkObject.Save(10, &sb.MetaBlockAddr) stateSinkObject.Save(11, &sb.XattrBlockAddr) stateSinkObject.Save(12, &sb.UUID) stateSinkObject.Save(13, &sb.VolumeName) stateSinkObject.Save(14, &sb.FeatureIncompat) stateSinkObject.Save(15, &sb.Union1) stateSinkObject.Save(16, &sb.ExtraDevices) stateSinkObject.Save(17, &sb.DevTableSlotOff) stateSinkObject.Save(18, &sb.Reserved) } func (sb *SuperBlock) afterLoad(context.Context) {} // +checklocksignore func (sb *SuperBlock) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &sb.Magic) stateSourceObject.Load(1, &sb.Checksum) stateSourceObject.Load(2, &sb.FeatureCompat) stateSourceObject.Load(3, &sb.BlockSizeBits) stateSourceObject.Load(4, &sb.ExtSlots) stateSourceObject.Load(5, &sb.RootNid) stateSourceObject.Load(6, &sb.Inodes) stateSourceObject.Load(7, &sb.BuildTime) stateSourceObject.Load(8, &sb.BuildTimeNsec) stateSourceObject.Load(9, &sb.Blocks) stateSourceObject.Load(10, &sb.MetaBlockAddr) stateSourceObject.Load(11, &sb.XattrBlockAddr) stateSourceObject.Load(12, &sb.UUID) stateSourceObject.Load(13, &sb.VolumeName) stateSourceObject.Load(14, &sb.FeatureIncompat) stateSourceObject.Load(15, &sb.Union1) stateSourceObject.Load(16, &sb.ExtraDevices) stateSourceObject.Load(17, &sb.DevTableSlotOff) stateSourceObject.Load(18, &sb.Reserved) } func (i *Image) StateTypeName() string { return "pkg/erofs.Image" } func (i *Image) StateFields() []string { return []string{ "sb", } } func (i *Image) beforeSave() {} // +checklocksignore func (i *Image) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.sb) } func (i *Image) 
afterLoad(context.Context) {} // +checklocksignore func (i *Image) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.sb) } func (i *Inode) StateTypeName() string { return "pkg/erofs.Inode" } func (i *Inode) StateFields() []string { return []string{ "image", "dataOff", "idataOff", "blocks", "format", "mode", "nid", "size", "mtime", "mtimeNsec", "uid", "gid", "nlink", } } func (i *Inode) beforeSave() {} // +checklocksignore func (i *Inode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.image) stateSinkObject.Save(1, &i.dataOff) stateSinkObject.Save(2, &i.idataOff) stateSinkObject.Save(3, &i.blocks) stateSinkObject.Save(4, &i.format) stateSinkObject.Save(5, &i.mode) stateSinkObject.Save(6, &i.nid) stateSinkObject.Save(7, &i.size) stateSinkObject.Save(8, &i.mtime) stateSinkObject.Save(9, &i.mtimeNsec) stateSinkObject.Save(10, &i.uid) stateSinkObject.Save(11, &i.gid) stateSinkObject.Save(12, &i.nlink) } func (i *Inode) afterLoad(context.Context) {} // +checklocksignore func (i *Inode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.image) stateSourceObject.Load(1, &i.dataOff) stateSourceObject.Load(2, &i.idataOff) stateSourceObject.Load(3, &i.blocks) stateSourceObject.Load(4, &i.format) stateSourceObject.Load(5, &i.mode) stateSourceObject.Load(6, &i.nid) stateSourceObject.Load(7, &i.size) stateSourceObject.Load(8, &i.mtime) stateSourceObject.Load(9, &i.mtimeNsec) stateSourceObject.Load(10, &i.uid) stateSourceObject.Load(11, &i.gid) stateSourceObject.Load(12, &i.nlink) } func init() { state.Register((*SuperBlock)(nil)) state.Register((*Image)(nil)) state.Register((*Inode)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/erofs/erofs_unsafe.go000066400000000000000000000027141465435605700235540ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package erofs import "unsafe" // pointerAt returns a pointer to offset off within the memory backed by image. // // Precondition: Callers are responsible for the range check. func (i *Image) pointerAt(off uint64) unsafe.Pointer { // Although callers will always do the range check, there is no need to // bother with the slice's builtin range check below. Because this function // will be inlined into callers, and there are no redundant checks and // unnecessary out-of-range panic calls in the code generated by the compiler. return unsafe.Pointer(&i.bytes[off]) } // direntAfter returns a pointer to the next adjacent dirent after dirent d. // // Preconditions: // - d is a pointer to the memory backed by image. // - d is not the last dirent in its block. func direntAfter(d *Dirent) *Dirent { return (*Dirent)(unsafe.Pointer(uintptr(unsafe.Pointer(d)) + DirentSize)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/erofs/erofs_unsafe_abi_autogen_unsafe.go000066400000000000000000000001451465435605700274460ustar00rootroot00000000000000// Automatically generated marshal implementation. 
See tools/go_marshal. package erofs import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/erofs/erofs_unsafe_state_autogen.go000066400000000000000000000000671465435605700264750ustar00rootroot00000000000000// automatically generated by stateify. package erofs golang-gvisor-gvisor-0.0~20240729.0/pkg/errors/000077500000000000000000000000001465435605700207405ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/errors/errors.go000066400000000000000000000022461465435605700226070ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package errors holds the standardized error definition for gVisor. package errors import ( "gvisor.dev/gvisor/pkg/abi/linux/errno" ) // Error represents a syscall errno with a descriptive message. type Error struct { errno errno.Errno message string } // New creates a new *Error. func New(err errno.Errno, message string) *Error { return &Error{ errno: err, message: message, } } // Error implements error.Error. func (e *Error) Error() string { return e.message } // Errno returns the underlying errno.Errno value. func (e *Error) Errno() errno.Errno { return e.errno } golang-gvisor-gvisor-0.0~20240729.0/pkg/errors/errors_state_autogen.go000066400000000000000000000000701465435605700255220ustar00rootroot00000000000000// automatically generated by stateify. package errors golang-gvisor-gvisor-0.0~20240729.0/pkg/errors/linuxerr/000077500000000000000000000000001465435605700226105ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/errors/linuxerr/internal.go000066400000000000000000000112121465435605700247500ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"),; // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linuxerr import ( "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/errors" ) var ( // ErrWouldBlock is an internal error used to indicate that an operation // cannot be satisfied immediately, and should be retried at a later // time, possibly when the caller has received a notification that the // operation may be able to complete. It is used by implementations of // the kio.File interface. ErrWouldBlock = errors.New(errno.EWOULDBLOCK, "request would block") // ErrInterrupted is returned if a request is interrupted before it can // complete. ErrInterrupted = errors.New(errno.EINTR, "request was interrupted") // ErrExceedsFileSizeLimit is returned if a request would exceed the // file's size limit. 
ErrExceedsFileSizeLimit = errors.New(errno.E2BIG, "exceeds file size limit") ) var errorMap = map[error]*errors.Error{ ErrWouldBlock: EWOULDBLOCK, ErrInterrupted: EINTR, ErrExceedsFileSizeLimit: EFBIG, } // errorUnwrappers is an array of unwrap functions to extract typed errors. var errorUnwrappers = []func(error) (*errors.Error, bool){} // AddErrorUnwrapper registers an unwrap method that can extract a concrete error // from a typed, but not initialized, error. func AddErrorUnwrapper(unwrap func(e error) (*errors.Error, bool)) { errorUnwrappers = append(errorUnwrappers, unwrap) } // TranslateError translates errors to errnos, it will return false if // the error was not registered. func TranslateError(from error) (*errors.Error, bool) { if err, ok := errorMap[from]; ok { return err, true } // Try to unwrap the error if we couldn't match an error // exactly. This might mean that a package has its own // error type. for _, unwrap := range errorUnwrappers { if err, ok := unwrap(from); ok { return err, true } } return nil, false } // These errors are significant because ptrace syscall exit tracing can // observe them. // // For all of the following errors, if the syscall is not interrupted by a // signal delivered to a user handler, the syscall is restarted. var ( // ERESTARTSYS is returned by an interrupted syscall to indicate that it // should be converted to EINTR if interrupted by a signal delivered to a // user handler without SA_RESTART set, and restarted otherwise. ERESTARTSYS = errors.New(errno.ERESTARTSYS, "to be restarted if SA_RESTART is set") // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it // should always be restarted. ERESTARTNOINTR = errors.New(errno.ERESTARTNOINTR, "to be restarted") // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it // should be converted to EINTR if interrupted by a signal delivered to a // user handler, and restarted otherwise. ERESTARTNOHAND = errors.New(errno.ERESTARTNOHAND, "to be restarted if no handler") // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate // that it should be restarted using a custom function. The interrupted // syscall must register a custom restart function by calling // Task.SetRestartSyscallFn. ERESTART_RESTARTBLOCK = errors.New(errno.ERESTART_RESTARTBLOCK, "interrupted by signal") ) var restartMap = map[int]*errors.Error{ -int(errno.ERESTARTSYS): ERESTARTSYS, -int(errno.ERESTARTNOINTR): ERESTARTNOINTR, -int(errno.ERESTARTNOHAND): ERESTARTNOHAND, -int(errno.ERESTART_RESTARTBLOCK): ERESTART_RESTARTBLOCK, } // IsRestartError checks if a given error is a restart error. func IsRestartError(err error) bool { switch err { case ERESTARTSYS, ERESTARTNOINTR, ERESTARTNOHAND, ERESTART_RESTARTBLOCK: return true default: return false } } // SyscallRestartErrorFromReturn returns the SyscallRestartErrno represented by // rv, the value in a syscall return register. func SyscallRestartErrorFromReturn(rv uintptr) (*errors.Error, bool) { err, ok := restartMap[int(rv)] return err, ok } // ConvertIntr converts the provided error code (err) to another one (intr) if // the first error corresponds to an interrupted operation. func ConvertIntr(err, intr error) error { if err == ErrInterrupted { return intr } return err } golang-gvisor-gvisor-0.0~20240729.0/pkg/errors/linuxerr/linuxerr.go000066400000000000000000000414421465435605700250140ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"),; // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package linuxerr contains syscall error codes exported as an error interface // pointers. This allows for fast comparison and return operations comperable // to unix.Errno constants. package linuxerr import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/errors" ) const maxErrno uint32 = errno.EHWPOISON + 1 // The following errors are semantically identical to Errno of type unix.Errno // or sycall.Errno. However, since the type are distinct ( these are // *errors.Error), they are not directly comperable. However, the Errno method // returns an Errno number such that the error can be compared to unix/syscall.Errno // (e.g. unix.Errno(EPERM.Errno()) == unix.EPERM is true). Converting unix/syscall.Errno // to the errors should be done via the lookup methods provided. var ( noError *errors.Error = nil EPERM = errors.New(errno.EPERM, "operation not permitted") ENOENT = errors.New(errno.ENOENT, "no such file or directory") ESRCH = errors.New(errno.ESRCH, "no such process") EINTR = errors.New(errno.EINTR, "interrupted system call") EIO = errors.New(errno.EIO, "I/O error") ENXIO = errors.New(errno.ENXIO, "no such device or address") E2BIG = errors.New(errno.E2BIG, "argument list too long") ENOEXEC = errors.New(errno.ENOEXEC, "exec format error") EBADF = errors.New(errno.EBADF, "bad file number") ECHILD = errors.New(errno.ECHILD, "no child processes") EAGAIN = errors.New(errno.EAGAIN, "try again") ENOMEM = errors.New(errno.ENOMEM, "out of memory") EACCES = errors.New(errno.EACCES, "permission denied") EFAULT = errors.New(errno.EFAULT, "bad address") ENOTBLK = errors.New(errno.ENOTBLK, "block device required") EBUSY = errors.New(errno.EBUSY, "device or resource busy") EEXIST = errors.New(errno.EEXIST, "file exists") EXDEV = errors.New(errno.EXDEV, "cross-device link") ENODEV = errors.New(errno.ENODEV, "no such device") ENOTDIR = errors.New(errno.ENOTDIR, "not a directory") EISDIR = errors.New(errno.EISDIR, "is a directory") EINVAL = errors.New(errno.EINVAL, "invalid argument") ENFILE = errors.New(errno.ENFILE, "file table overflow") EMFILE = errors.New(errno.EMFILE, "too many open files") ENOTTY = errors.New(errno.ENOTTY, "not a typewriter") ETXTBSY = errors.New(errno.ETXTBSY, "text file busy") EFBIG = errors.New(errno.EFBIG, "file too large") ENOSPC = errors.New(errno.ENOSPC, "no space left on device") ESPIPE = errors.New(errno.ESPIPE, "illegal seek") EROFS = errors.New(errno.EROFS, "read-only file system") EMLINK = errors.New(errno.EMLINK, "too many links") EPIPE = errors.New(errno.EPIPE, "broken pipe") EDOM = errors.New(errno.EDOM, "math argument out of domain of func") ERANGE = errors.New(errno.ERANGE, "math result not representable") // Errno values from include/uapi/asm-generic/errno.h. 
EDEADLK = errors.New(errno.EDEADLK, "resource deadlock would occur") ENAMETOOLONG = errors.New(errno.ENAMETOOLONG, "file name too long") ENOLCK = errors.New(errno.ENOLCK, "no record locks available") ENOSYS = errors.New(errno.ENOSYS, "invalid system call number") ENOTEMPTY = errors.New(errno.ENOTEMPTY, "directory not empty") ELOOP = errors.New(errno.ELOOP, "too many symbolic links encountered") ENOMSG = errors.New(errno.ENOMSG, "no message of desired type") EIDRM = errors.New(errno.EIDRM, "identifier removed") ECHRNG = errors.New(errno.ECHRNG, "channel number out of range") EL2NSYNC = errors.New(errno.EL2NSYNC, "level 2 not synchronized") EL3HLT = errors.New(errno.EL3HLT, "level 3 halted") EL3RST = errors.New(errno.EL3RST, "level 3 reset") ELNRNG = errors.New(errno.ELNRNG, "link number out of range") EUNATCH = errors.New(errno.EUNATCH, "protocol driver not attached") ENOCSI = errors.New(errno.ENOCSI, "no CSI structure available") EL2HLT = errors.New(errno.EL2HLT, "level 2 halted") EBADE = errors.New(errno.EBADE, "invalid exchange") EBADR = errors.New(errno.EBADR, "invalid request descriptor") EXFULL = errors.New(errno.EXFULL, "exchange full") ENOANO = errors.New(errno.ENOANO, "no anode") EBADRQC = errors.New(errno.EBADRQC, "invalid request code") EBADSLT = errors.New(errno.EBADSLT, "invalid slot") EBFONT = errors.New(errno.EBFONT, "bad font file format") ENOSTR = errors.New(errno.ENOSTR, "device not a stream") ENODATA = errors.New(errno.ENODATA, "no data available") ETIME = errors.New(errno.ETIME, "timer expired") ENOSR = errors.New(errno.ENOSR, "out of streams resources") ENOPKG = errors.New(errno.ENOPKG, "package not installed") EREMOTE = errors.New(errno.EREMOTE, "object is remote") ENOLINK = errors.New(errno.ENOLINK, "link has been severed") EADV = errors.New(errno.EADV, "advertise error") ESRMNT = errors.New(errno.ESRMNT, "srmount error") ECOMM = errors.New(errno.ECOMM, "communication error on send") EPROTO = errors.New(errno.EPROTO, "protocol error") EMULTIHOP = errors.New(errno.EMULTIHOP, "multihop attempted") EDOTDOT = errors.New(errno.EDOTDOT, "RFS specific error") EBADMSG = errors.New(errno.EBADMSG, "not a data message") EOVERFLOW = errors.New(errno.EOVERFLOW, "value too large for defined data type") ENOTUNIQ = errors.New(errno.ENOTUNIQ, "name not unique on network") EBADFD = errors.New(errno.EBADFD, "file descriptor in bad state") EREMCHG = errors.New(errno.EREMCHG, "remote address changed") ELIBACC = errors.New(errno.ELIBACC, "can not access a needed shared library") ELIBBAD = errors.New(errno.ELIBBAD, "accessing a corrupted shared library") ELIBSCN = errors.New(errno.ELIBSCN, ".lib section in a.out corrupted") ELIBMAX = errors.New(errno.ELIBMAX, "attempting to link in too many shared libraries") ELIBEXEC = errors.New(errno.ELIBEXEC, "cannot exec a shared library directly") EILSEQ = errors.New(errno.EILSEQ, "illegal byte sequence") ERESTART = errors.New(errno.ERESTART, "interrupted system call should be restarted") ESTRPIPE = errors.New(errno.ESTRPIPE, "streams pipe error") EUSERS = errors.New(errno.EUSERS, "too many users") ENOTSOCK = errors.New(errno.ENOTSOCK, "socket operation on non-socket") EDESTADDRREQ = errors.New(errno.EDESTADDRREQ, "destination address required") EMSGSIZE = errors.New(errno.EMSGSIZE, "message too long") EPROTOTYPE = errors.New(errno.EPROTOTYPE, "protocol wrong type for socket") ENOPROTOOPT = errors.New(errno.ENOPROTOOPT, "protocol not available") EPROTONOSUPPORT = errors.New(errno.EPROTONOSUPPORT, "protocol not supported") ESOCKTNOSUPPORT = 
errors.New(errno.ESOCKTNOSUPPORT, "socket type not supported") EOPNOTSUPP = errors.New(errno.EOPNOTSUPP, "operation not supported on transport endpoint") EPFNOSUPPORT = errors.New(errno.EPFNOSUPPORT, "protocol family not supported") EAFNOSUPPORT = errors.New(errno.EAFNOSUPPORT, "address family not supported by protocol") EADDRINUSE = errors.New(errno.EADDRINUSE, "address already in use") EADDRNOTAVAIL = errors.New(errno.EADDRNOTAVAIL, "cannot assign requested address") ENETDOWN = errors.New(errno.ENETDOWN, "network is down") ENETUNREACH = errors.New(errno.ENETUNREACH, "network is unreachable") ENETRESET = errors.New(errno.ENETRESET, "network dropped connection because of reset") ECONNABORTED = errors.New(errno.ECONNABORTED, "software caused connection abort") ECONNRESET = errors.New(errno.ECONNRESET, "connection reset by peer") ENOBUFS = errors.New(errno.ENOBUFS, "no buffer space available") EISCONN = errors.New(errno.EISCONN, "transport endpoint is already connected") ENOTCONN = errors.New(errno.ENOTCONN, "transport endpoint is not connected") ESHUTDOWN = errors.New(errno.ESHUTDOWN, "cannot send after transport endpoint shutdown") ETOOMANYREFS = errors.New(errno.ETOOMANYREFS, "too many references: cannot splice") ETIMEDOUT = errors.New(errno.ETIMEDOUT, "connection timed out") ECONNREFUSED = errors.New(errno.ECONNREFUSED, "connection refused") EHOSTDOWN = errors.New(errno.EHOSTDOWN, "host is down") EHOSTUNREACH = errors.New(errno.EHOSTUNREACH, "no route to host") EALREADY = errors.New(errno.EALREADY, "operation already in progress") EINPROGRESS = errors.New(errno.EINPROGRESS, "operation now in progress") ESTALE = errors.New(errno.ESTALE, "stale file handle") EUCLEAN = errors.New(errno.EUCLEAN, "structure needs cleaning") ENOTNAM = errors.New(errno.ENOTNAM, "not a XENIX named type file") ENAVAIL = errors.New(errno.ENAVAIL, "no XENIX semaphores available") EISNAM = errors.New(errno.EISNAM, "is a named type file") EREMOTEIO = errors.New(errno.EREMOTEIO, "remote I/O error") EDQUOT = errors.New(errno.EDQUOT, "quota exceeded") ENOMEDIUM = errors.New(errno.ENOMEDIUM, "no medium found") EMEDIUMTYPE = errors.New(errno.EMEDIUMTYPE, "wrong medium type") ECANCELED = errors.New(errno.ECANCELED, "operation Canceled") ENOKEY = errors.New(errno.ENOKEY, "required key not available") EKEYEXPIRED = errors.New(errno.EKEYEXPIRED, "key has expired") EKEYREVOKED = errors.New(errno.EKEYREVOKED, "key has been revoked") EKEYREJECTED = errors.New(errno.EKEYREJECTED, "key was rejected by service") EOWNERDEAD = errors.New(errno.EOWNERDEAD, "owner died") ENOTRECOVERABLE = errors.New(errno.ENOTRECOVERABLE, "state not recoverable") ERFKILL = errors.New(errno.ERFKILL, "operation not possible due to RF-kill") EHWPOISON = errors.New(errno.EHWPOISON, "memory page has hardware error") // Errors equivalent to other errors. EWOULDBLOCK = EAGAIN EDEADLOCK = EDEADLK ENONET = ENOENT ENOATTR = ENODATA ENOTSUP = EOPNOTSUPP ) // A nil *errors.Error denotes no error and is placed at the 0 index of // errorSlice. Thus, any other empty index should not be nil or a valid error. // This marks that index as an invalid error so any comparison to nil or a // valid linuxerr fails. var errNotValidError = errors.New(errno.Errno(maxErrno), "not a valid error") // The following errorSlice holds errors by errno for fast translation between // errnos (especially uint32(sycall.Errno)) and *errors.Error. var errorSlice = []*errors.Error{ // Errno values from include/uapi/asm-generic/errno-base.h. 
errno.NOERRNO: noError, errno.EPERM: EPERM, errno.ENOENT: ENOENT, errno.ESRCH: ESRCH, errno.EINTR: EINTR, errno.EIO: EIO, errno.ENXIO: ENXIO, errno.E2BIG: E2BIG, errno.ENOEXEC: ENOEXEC, errno.EBADF: EBADF, errno.ECHILD: ECHILD, errno.EAGAIN: EAGAIN, errno.ENOMEM: ENOMEM, errno.EACCES: EACCES, errno.EFAULT: EFAULT, errno.ENOTBLK: ENOTBLK, errno.EBUSY: EBUSY, errno.EEXIST: EEXIST, errno.EXDEV: EXDEV, errno.ENODEV: ENODEV, errno.ENOTDIR: ENOTDIR, errno.EISDIR: EISDIR, errno.EINVAL: EINVAL, errno.ENFILE: ENFILE, errno.EMFILE: EMFILE, errno.ENOTTY: ENOTTY, errno.ETXTBSY: ETXTBSY, errno.EFBIG: EFBIG, errno.ENOSPC: ENOSPC, errno.ESPIPE: ESPIPE, errno.EROFS: EROFS, errno.EMLINK: EMLINK, errno.EPIPE: EPIPE, errno.EDOM: EDOM, errno.ERANGE: ERANGE, // Errno values from include/uapi/asm-generic/errno.h. errno.EDEADLK: EDEADLK, errno.ENAMETOOLONG: ENAMETOOLONG, errno.ENOLCK: ENOLCK, errno.ENOSYS: ENOSYS, errno.ENOTEMPTY: ENOTEMPTY, errno.ELOOP: ELOOP, errno.ELOOP + 1: errNotValidError, // No valid errno between ELOOP and ENOMSG. errno.ENOMSG: ENOMSG, errno.EIDRM: EIDRM, errno.ECHRNG: ECHRNG, errno.EL2NSYNC: EL2NSYNC, errno.EL3HLT: EL3HLT, errno.EL3RST: EL3RST, errno.ELNRNG: ELNRNG, errno.EUNATCH: EUNATCH, errno.ENOCSI: ENOCSI, errno.EL2HLT: EL2HLT, errno.EBADE: EBADE, errno.EBADR: EBADR, errno.EXFULL: EXFULL, errno.ENOANO: ENOANO, errno.EBADRQC: EBADRQC, errno.EBADSLT: EBADSLT, errno.EBADSLT + 1: errNotValidError, // No valid errno between EBADSLT and ENOPKG. errno.EBFONT: EBFONT, errno.ENOSTR: ENOSTR, errno.ENODATA: ENODATA, errno.ETIME: ETIME, errno.ENOSR: ENOSR, errno.ENOSR + 1: errNotValidError, // No valid errno between ENOSR and ENOPKG. errno.ENOPKG: ENOPKG, errno.EREMOTE: EREMOTE, errno.ENOLINK: ENOLINK, errno.EADV: EADV, errno.ESRMNT: ESRMNT, errno.ECOMM: ECOMM, errno.EPROTO: EPROTO, errno.EMULTIHOP: EMULTIHOP, errno.EDOTDOT: EDOTDOT, errno.EBADMSG: EBADMSG, errno.EOVERFLOW: EOVERFLOW, errno.ENOTUNIQ: ENOTUNIQ, errno.EBADFD: EBADFD, errno.EREMCHG: EREMCHG, errno.ELIBACC: ELIBACC, errno.ELIBBAD: ELIBBAD, errno.ELIBSCN: ELIBSCN, errno.ELIBMAX: ELIBMAX, errno.ELIBEXEC: ELIBEXEC, errno.EILSEQ: EILSEQ, errno.ERESTART: ERESTART, errno.ESTRPIPE: ESTRPIPE, errno.EUSERS: EUSERS, errno.ENOTSOCK: ENOTSOCK, errno.EDESTADDRREQ: EDESTADDRREQ, errno.EMSGSIZE: EMSGSIZE, errno.EPROTOTYPE: EPROTOTYPE, errno.ENOPROTOOPT: ENOPROTOOPT, errno.EPROTONOSUPPORT: EPROTONOSUPPORT, errno.ESOCKTNOSUPPORT: ESOCKTNOSUPPORT, errno.EOPNOTSUPP: EOPNOTSUPP, errno.EPFNOSUPPORT: EPFNOSUPPORT, errno.EAFNOSUPPORT: EAFNOSUPPORT, errno.EADDRINUSE: EADDRINUSE, errno.EADDRNOTAVAIL: EADDRNOTAVAIL, errno.ENETDOWN: ENETDOWN, errno.ENETUNREACH: ENETUNREACH, errno.ENETRESET: ENETRESET, errno.ECONNABORTED: ECONNABORTED, errno.ECONNRESET: ECONNRESET, errno.ENOBUFS: ENOBUFS, errno.EISCONN: EISCONN, errno.ENOTCONN: ENOTCONN, errno.ESHUTDOWN: ESHUTDOWN, errno.ETOOMANYREFS: ETOOMANYREFS, errno.ETIMEDOUT: ETIMEDOUT, errno.ECONNREFUSED: ECONNREFUSED, errno.EHOSTDOWN: EHOSTDOWN, errno.EHOSTUNREACH: EHOSTUNREACH, errno.EALREADY: EALREADY, errno.EINPROGRESS: EINPROGRESS, errno.ESTALE: ESTALE, errno.EUCLEAN: EUCLEAN, errno.ENOTNAM: ENOTNAM, errno.ENAVAIL: ENAVAIL, errno.EISNAM: EISNAM, errno.EREMOTEIO: EREMOTEIO, errno.EDQUOT: EDQUOT, errno.ENOMEDIUM: ENOMEDIUM, errno.EMEDIUMTYPE: EMEDIUMTYPE, errno.ECANCELED: ECANCELED, errno.ENOKEY: ENOKEY, errno.EKEYEXPIRED: EKEYEXPIRED, errno.EKEYREVOKED: EKEYREVOKED, errno.EKEYREJECTED: EKEYREJECTED, errno.EOWNERDEAD: EOWNERDEAD, errno.ENOTRECOVERABLE: ENOTRECOVERABLE, errno.ERFKILL: ERFKILL, errno.EHWPOISON: 
EHWPOISON, } // ErrorFromUnix returns a linuxerr from a unix.Errno. func ErrorFromUnix(err unix.Errno) error { if err == unix.Errno(0) { return nil } e := errorSlice[errno.Errno(err)] // Done this way because a single comparison in benchmarks is 2-3 faster // than something like ( if err == nil && err > 0 ). if e == errNotValidError { panic(fmt.Sprintf("invalid error requested with errno: %v", e)) } return e } // ToError converts a linuxerr to an error type. func ToError(err *errors.Error) error { if err == noError { return nil } return err } // ToUnix converts a linuxerr to a unix.Errno. func ToUnix(e *errors.Error) unix.Errno { var unixErr unix.Errno if e != noError { unixErr = unix.Errno(e.Errno()) } return unixErr } // Equals compars a linuxerr to a given error. func Equals(e *errors.Error, err error) bool { var unixErr unix.Errno if e != noError { unixErr = unix.Errno(e.Errno()) } if err == nil { err = noError } return e == err || unixErr == err } golang-gvisor-gvisor-0.0~20240729.0/pkg/errors/linuxerr/linuxerr_state_autogen.go000066400000000000000000000000721465435605700277300ustar00rootroot00000000000000// automatically generated by stateify. package linuxerr golang-gvisor-gvisor-0.0~20240729.0/pkg/eventchannel/000077500000000000000000000000001465435605700220765ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/eventchannel/event.go000066400000000000000000000131241465435605700235470ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package eventchannel contains functionality for sending any protobuf message // on a socketpair. // // The wire format is a uvarint length followed by a binary protobuf.Any // message. package eventchannel import ( "encoding/binary" "fmt" "google.golang.org/protobuf/encoding/prototext" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" "gvisor.dev/gvisor/pkg/errors/linuxerr" pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) // Emitter emits a proto message. type Emitter interface { // Emit writes a single eventchannel message to an emitter. Emit should // return hangup = true to indicate an emitter has "hung up" and no further // messages should be directed to it. Emit(msg proto.Message) (hangup bool, err error) // Close closes this emitter. Emit cannot be used after Close is called. Close() error } // DefaultEmitter is the default emitter. Calls to Emit and AddEmitter are sent // to this Emitter. var DefaultEmitter = &multiEmitter{} // Emit is a helper method that calls DefaultEmitter.Emit. func Emit(msg proto.Message) error { _, err := DefaultEmitter.Emit(msg) return err } // LogEmit is a helper method that calls DefaultEmitter.Emit. // It also logs a warning message when an error occurs. 
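//
// A minimal, hedged usage sketch from a caller's point of view; the eventFD
// value below is hypothetical and must be a connected socket fd owned by the
// caller:
//
//	emitter, err := eventchannel.SocketEmitter(eventFD) // eventFD: hypothetical fd
//	if err == nil {
//		eventchannel.AddEmitter(emitter)
//	}
//	// Every registered emitter then receives messages passed to Emit/LogEmit.
//	_ = eventchannel.LogEmit(&pb.DebugEvent{Name: "example", Text: "hello"})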
func LogEmit(msg proto.Message) error { _, err := DefaultEmitter.Emit(msg) if err != nil { log.Warningf("unable to emit event: %s", err) } return err } // AddEmitter is a helper method that calls DefaultEmitter.AddEmitter. func AddEmitter(e Emitter) { DefaultEmitter.AddEmitter(e) } // HaveEmitters indicates if any emitters have been registered to the // default emitter. func HaveEmitters() bool { DefaultEmitter.mu.Lock() defer DefaultEmitter.mu.Unlock() return len(DefaultEmitter.emitters) > 0 } // multiEmitter is an Emitter that forwards messages to multiple Emitters. type multiEmitter struct { // mu protects emitters. mu sync.Mutex // emitters is initialized lazily in AddEmitter. emitters map[Emitter]struct{} } // Emit emits a message using all added emitters. func (me *multiEmitter) Emit(msg proto.Message) (bool, error) { me.mu.Lock() defer me.mu.Unlock() var err error for e := range me.emitters { hangup, eerr := e.Emit(msg) if eerr != nil { if err == nil { err = fmt.Errorf("error emitting %v: on %v: %v", msg, e, eerr) } else { err = fmt.Errorf("%v; on %v: %v", err, e, eerr) } // Log as well, since most callers ignore the error. log.Warningf("Error emitting %v on %v: %v", msg, e, eerr) } if hangup { log.Infof("Hangup on eventchannel emitter %v.", e) delete(me.emitters, e) } } return false, err } // AddEmitter adds a new emitter. func (me *multiEmitter) AddEmitter(e Emitter) { me.mu.Lock() defer me.mu.Unlock() if me.emitters == nil { me.emitters = make(map[Emitter]struct{}) } me.emitters[e] = struct{}{} } // Close closes all emitters. If any Close call errors, it returns the first // one encountered. func (me *multiEmitter) Close() error { me.mu.Lock() defer me.mu.Unlock() var err error for e := range me.emitters { if eerr := e.Close(); err == nil && eerr != nil { err = eerr } delete(me.emitters, e) } return err } // socketEmitter emits proto messages on a socket. type socketEmitter struct { socket *unet.Socket } // SocketEmitter creates a new event channel based on the given fd. // // SocketEmitter takes ownership of fd. func SocketEmitter(fd int) (Emitter, error) { s, err := unet.NewSocket(fd) if err != nil { return nil, err } return &socketEmitter{ socket: s, }, nil } // Emit implements Emitter.Emit. func (s *socketEmitter) Emit(msg proto.Message) (bool, error) { any, err := anypb.New(msg) if err != nil { return false, err } bufMsg, err := proto.Marshal(any) if err != nil { return false, err } // Wire format is uvarint message length followed by binary proto. p := make([]byte, binary.MaxVarintLen64) n := binary.PutUvarint(p, uint64(len(bufMsg))) p = append(p[:n], bufMsg...) for done := 0; done < len(p); { n, err := s.socket.Write(p[done:]) if err != nil { return linuxerr.Equals(linuxerr.EPIPE, err), err } done += n } return false, nil } // Close implements Emitter.Emit. func (s *socketEmitter) Close() error { return s.socket.Close() } // debugEmitter wraps an emitter to emit stringified event messages. This is // useful for debugging -- when the messages are intended for humans. type debugEmitter struct { inner Emitter } // DebugEmitterFrom creates a new event channel emitter by wrapping an existing // raw emitter. 
func DebugEmitterFrom(inner Emitter) Emitter { return &debugEmitter{ inner: inner, } } func (d *debugEmitter) Emit(msg proto.Message) (bool, error) { text, err := prototext.Marshal(msg) if err != nil { return false, err } ev := &pb.DebugEvent{ Name: string(msg.ProtoReflect().Descriptor().FullName()), Text: string(text), } return d.inner.Emit(ev) } func (d *debugEmitter) Close() error { return d.inner.Close() } golang-gvisor-gvisor-0.0~20240729.0/pkg/eventchannel/eventchannel_go_proto/000077500000000000000000000000001465435605700264605ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/eventchannel/eventchannel_go_proto/event.pb.go000066400000000000000000000112461465435605700305340ustar00rootroot00000000000000// Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/eventchannel/event.proto package eventchannel_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type DebugEvent struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` Text string `protobuf:"bytes,2,opt,name=text,proto3" json:"text,omitempty"` } func (x *DebugEvent) Reset() { *x = DebugEvent{} if protoimpl.UnsafeEnabled { mi := &file_pkg_eventchannel_event_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *DebugEvent) String() string { return protoimpl.X.MessageStringOf(x) } func (*DebugEvent) ProtoMessage() {} func (x *DebugEvent) ProtoReflect() protoreflect.Message { mi := &file_pkg_eventchannel_event_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use DebugEvent.ProtoReflect.Descriptor instead. 
func (*DebugEvent) Descriptor() ([]byte, []int) { return file_pkg_eventchannel_event_proto_rawDescGZIP(), []int{0} } func (x *DebugEvent) GetName() string { if x != nil { return x.Name } return "" } func (x *DebugEvent) GetText() string { if x != nil { return x.Text } return "" } var File_pkg_eventchannel_event_proto protoreflect.FileDescriptor var file_pkg_eventchannel_event_proto_rawDesc = []byte{ 0x0a, 0x1c, 0x70, 0x6b, 0x67, 0x2f, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x63, 0x68, 0x61, 0x6e, 0x6e, 0x65, 0x6c, 0x2f, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x06, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x22, 0x34, 0x0a, 0x0a, 0x44, 0x65, 0x62, 0x75, 0x67, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x74, 0x65, 0x78, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x74, 0x65, 0x78, 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_eventchannel_event_proto_rawDescOnce sync.Once file_pkg_eventchannel_event_proto_rawDescData = file_pkg_eventchannel_event_proto_rawDesc ) func file_pkg_eventchannel_event_proto_rawDescGZIP() []byte { file_pkg_eventchannel_event_proto_rawDescOnce.Do(func() { file_pkg_eventchannel_event_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_eventchannel_event_proto_rawDescData) }) return file_pkg_eventchannel_event_proto_rawDescData } var file_pkg_eventchannel_event_proto_msgTypes = make([]protoimpl.MessageInfo, 1) var file_pkg_eventchannel_event_proto_goTypes = []interface{}{ (*DebugEvent)(nil), // 0: gvisor.DebugEvent } var file_pkg_eventchannel_event_proto_depIdxs = []int32{ 0, // [0:0] is the sub-list for method output_type 0, // [0:0] is the sub-list for method input_type 0, // [0:0] is the sub-list for extension type_name 0, // [0:0] is the sub-list for extension extendee 0, // [0:0] is the sub-list for field type_name } func init() { file_pkg_eventchannel_event_proto_init() } func file_pkg_eventchannel_event_proto_init() { if File_pkg_eventchannel_event_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_eventchannel_event_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*DebugEvent); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_eventchannel_event_proto_rawDesc, NumEnums: 0, NumMessages: 1, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_eventchannel_event_proto_goTypes, DependencyIndexes: file_pkg_eventchannel_event_proto_depIdxs, MessageInfos: file_pkg_eventchannel_event_proto_msgTypes, }.Build() File_pkg_eventchannel_event_proto = out.File file_pkg_eventchannel_event_proto_rawDesc = nil file_pkg_eventchannel_event_proto_goTypes = nil file_pkg_eventchannel_event_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/eventchannel/eventchannel_state_autogen.go000066400000000000000000000000761465435605700300240ustar00rootroot00000000000000// automatically generated by stateify. package eventchannel golang-gvisor-gvisor-0.0~20240729.0/pkg/eventchannel/processor.go000066400000000000000000000074101465435605700244460ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package eventchannel import ( "encoding/binary" "fmt" "io" "os" "time" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto" ) // eventProcessor carries display state across multiple events. type eventProcessor struct { filtering bool // filtered is the number of events omitted since printing the last matching // event. Only meaningful when filtering == true. filtered uint64 // allowlist is the set of event names to display. If empty, all events are // displayed. allowlist map[string]bool } // newEventProcessor creates a new EventProcessor with filters. func newEventProcessor(filters []string) *eventProcessor { e := &eventProcessor{ filtering: len(filters) > 0, allowlist: make(map[string]bool), } for _, f := range filters { e.allowlist[f] = true } return e } // processOne reads, parses and displays a single event from the event channel. // // The event channel is a stream of (msglen, payload) packets; this function // processes a single such packet. The msglen is a uvarint-encoded length for // the associated payload. The payload is a binary-encoded 'Any' protobuf, which // in turn encodes an arbitrary event protobuf. func (e *eventProcessor) processOne(src io.Reader, out *os.File) error { // Read and parse the msglen. lenbuf := make([]byte, binary.MaxVarintLen64) if _, err := io.ReadFull(src, lenbuf); err != nil { return err } msglen, consumed := binary.Uvarint(lenbuf) if consumed <= 0 { return fmt.Errorf("couldn't parse the message length") } // Read the payload. buf := make([]byte, msglen) // Copy any unused bytes from the len buffer into the payload buffer. These // bytes are actually part of the payload. extraBytes := copy(buf, lenbuf[consumed:]) if _, err := io.ReadFull(src, buf[extraBytes:]); err != nil { return err } // Unmarshal the payload into an "Any" protobuf, which encodes the actual // event. encodedEv := anypb.Any{} if err := proto.Unmarshal(buf, &encodedEv); err != nil { return fmt.Errorf("failed to unmarshal 'any' protobuf message: %v", err) } var ev pb.DebugEvent if err := encodedEv.UnmarshalTo(&ev); err != nil { return fmt.Errorf("failed to decode 'any' protobuf message: %v", err) } if e.filtering && e.allowlist[ev.Name] { e.filtered++ return nil } if e.filtering && e.filtered > 0 { if e.filtered == 1 { fmt.Fprintf(out, "... filtered %d event ...\n\n", e.filtered) } else { fmt.Fprintf(out, "... filtered %d events ...\n\n", e.filtered) } e.filtered = 0 } // Extract the inner event and display it. Example: // // 2017-10-04 14:35:05.316180374 -0700 PDT m=+1.132485846 // cloud_gvisor.MemoryUsage { // total: 23822336 // } fmt.Fprintf(out, "%v\n%v {\n", time.Now(), ev.Name) fmt.Fprintf(out, "%v", ev.Text) fmt.Fprintf(out, "}\n\n") return nil } // ProcessAll reads, parses and displays all events from src. The events are // displayed to out. 
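//
// A hedged sketch of a consumer draining an event stream; the source path and
// filter name are illustrative only:
//
//	src, err := os.Open("/var/run/example-events") // hypothetical path; any io.Reader works
//	if err != nil {
//		return err
//	}
//	defer src.Close()
//	return eventchannel.ProcessAll(src, []string{"gvisor.DebugEvent"}, os.Stdout)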
func ProcessAll(src io.Reader, filters []string, out *os.File) error { ep := newEventProcessor(filters) for { switch err := ep.processOne(src, out); err { case nil: continue case io.EOF: return nil default: return err } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/eventchannel/rate.go000066400000000000000000000034051465435605700233620ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package eventchannel import ( "golang.org/x/time/rate" "google.golang.org/protobuf/proto" ) // rateLimitedEmitter wraps an emitter and limits events to the given limits. // Events that would exceed the limit are discarded. type rateLimitedEmitter struct { inner Emitter limiter *rate.Limiter } // RateLimitedEmitterFrom creates a new event channel emitter that wraps the // existing emitter and enforces rate limits. The limits are imposed via a // token bucket, with `maxRate` events per second, with burst size of `burst` // events. See the golang.org/x/time/rate package and // https://en.wikipedia.org/wiki/Token_bucket for more information about token // buckets generally. func RateLimitedEmitterFrom(inner Emitter, maxRate float64, burst int) Emitter { return &rateLimitedEmitter{ inner: inner, limiter: rate.NewLimiter(rate.Limit(maxRate), burst), } } // Emit implements EventEmitter.Emit. func (rle *rateLimitedEmitter) Emit(msg proto.Message) (bool, error) { if !rle.limiter.Allow() { // Drop event. return false, nil } return rle.inner.Emit(msg) } // Close implements EventEmitter.Close. func (rle *rateLimitedEmitter) Close() error { return rle.inner.Close() } golang-gvisor-gvisor-0.0~20240729.0/pkg/eventfd/000077500000000000000000000000001465435605700210575ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/eventfd/eventfd.go000066400000000000000000000060361465435605700230460ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package eventfd wraps Linux's eventfd(2) syscall. package eventfd import ( "fmt" "io" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/rawfile" ) const sizeofUint64 = 8 // Eventfd represents a Linux eventfd object. type Eventfd struct { fd int } // Create returns an initialized eventfd. 
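//
// A small, hedged usage sketch (most error handling elided):
//
//	efd, err := eventfd.Create()
//	if err != nil {
//		return err
//	}
//	defer efd.Close()
//	_ = efd.Notify()   // adds 1 to the counter
//	v, _ := efd.Read() // retrieves and clears the counter; v == 1 here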
func Create() (Eventfd, error) { fd, _, err := unix.RawSyscall(unix.SYS_EVENTFD2, 0, 0, 0) if err != 0 { return Eventfd{}, fmt.Errorf("failed to create eventfd: %v", error(err)) } if err := unix.SetNonblock(int(fd), true); err != nil { unix.Close(int(fd)) return Eventfd{}, err } return Eventfd{int(fd)}, nil } // Wrap returns an initialized Eventfd using the provided fd. func Wrap(fd int) Eventfd { return Eventfd{fd} } // Close closes the eventfd, after which it should not be used. func (ev Eventfd) Close() error { return unix.Close(ev.fd) } // Dup copies the eventfd, calling dup(2) on the underlying file descriptor. func (ev Eventfd) Dup() (Eventfd, error) { other, err := unix.Dup(ev.fd) if err != nil { return Eventfd{}, fmt.Errorf("failed to dup: %v", other) } return Eventfd{other}, nil } // Notify alerts other users of the eventfd. Users can receive alerts by // calling Wait or Read. func (ev Eventfd) Notify() error { return ev.Write(1) } // Write writes a specific value to the eventfd. func (ev Eventfd) Write(val uint64) error { var buf [sizeofUint64]byte hostarch.ByteOrder.PutUint64(buf[:], val) for { n, err := nonBlockingWrite(ev.fd, buf[:]) if err == unix.EINTR { continue } if err != nil || n != sizeofUint64 { panic(fmt.Sprintf("bad write to eventfd: got %d bytes, wanted %d with error %v", n, sizeofUint64, err)) } return err } } // Wait blocks until eventfd is non-zero (i.e. someone calls Notify or Write). func (ev Eventfd) Wait() error { _, err := ev.Read() return err } // Read blocks until eventfd is non-zero (i.e. someone calls Notify or Write) // and returns the value read. func (ev Eventfd) Read() (uint64, error) { var tmp [sizeofUint64]byte n, errno := rawfile.BlockingRead(ev.fd, tmp[:]) if errno != 0 { return 0, errno } if n == 0 { return 0, io.EOF } if n != sizeofUint64 { panic(fmt.Sprintf("short read from eventfd: got %d bytes, wanted %d", n, sizeofUint64)) } return hostarch.ByteOrder.Uint64(tmp[:]), nil } // FD returns the underlying file descriptor. Use with care, as this breaks the // Eventfd abstraction. func (ev Eventfd) FD() int { return ev.fd } golang-gvisor-gvisor-0.0~20240729.0/pkg/eventfd/eventfd_state_autogen.go000066400000000000000000000000711465435605700257610ustar00rootroot00000000000000// automatically generated by stateify. package eventfd golang-gvisor-gvisor-0.0~20240729.0/pkg/eventfd/eventfd_unsafe.go000066400000000000000000000020741465435605700244050ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package eventfd import ( "unsafe" "golang.org/x/sys/unix" ) // nonBlockingWrite writes the given buffer to a file descriptor. It fails if // partial data is written. 
func nonBlockingWrite(fd int, buf []byte) (int, error) { var ptr unsafe.Pointer if len(buf) > 0 { ptr = unsafe.Pointer(&buf[0]) } nwritten, _, errno := unix.RawSyscall(unix.SYS_WRITE, uintptr(fd), uintptr(ptr), uintptr(len(buf))) if errno != 0 { return int(nwritten), errno } return int(nwritten), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/eventfd/eventfd_unsafe_state_autogen.go000066400000000000000000000000711465435605700273220ustar00rootroot00000000000000// automatically generated by stateify. package eventfd golang-gvisor-gvisor-0.0~20240729.0/pkg/fd/000077500000000000000000000000001465435605700200155ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/fd/fd.go000066400000000000000000000152421465435605700207410ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package fd provides types for working with file descriptors. package fd import ( "fmt" "io" "os" "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" ) // ReadWriter implements io.ReadWriter, io.ReaderAt, and io.WriterAt for fd. It // does not take ownership of fd. type ReadWriter struct { // fd is accessed atomically so FD.Close/Release can swap it. fd atomicbitops.Int64 } var _ io.ReadWriter = (*ReadWriter)(nil) var _ io.ReaderAt = (*ReadWriter)(nil) var _ io.WriterAt = (*ReadWriter)(nil) // NewReadWriter creates a ReadWriter for fd. func NewReadWriter(fd int) *ReadWriter { return &ReadWriter{ fd: atomicbitops.FromInt64(int64(fd)), } } func fixCount(n int, err error) (int, error) { if n < 0 { n = 0 } return n, err } // Read implements io.Reader. func (r *ReadWriter) Read(b []byte) (int, error) { c, err := fixCount(unix.Read(r.FD(), b)) if c == 0 && len(b) > 0 && err == nil { return 0, io.EOF } return c, err } // ReadAt implements io.ReaderAt. // // ReadAt always returns a non-nil error when c < len(b). func (r *ReadWriter) ReadAt(b []byte, off int64) (c int, err error) { for len(b) > 0 { var m int m, err = fixCount(unix.Pread(r.FD(), b, off)) if m == 0 && err == nil { return c, io.EOF } if err != nil { return c, err } c += m b = b[m:] off += int64(m) } return } // Write implements io.Writer. func (r *ReadWriter) Write(b []byte) (int, error) { var err error var n, remaining int for remaining = len(b); remaining > 0; { woff := len(b) - remaining n, err = unix.Write(r.FD(), b[woff:]) if n > 0 { // unix.Write wrote some bytes. This is the common case. remaining -= n } else { if err == nil { // unix.Write did not write anything nor did it return an error. // // There is no way to guarantee that a subsequent unix.Write will // make forward progress so just panic. panic(fmt.Sprintf("unix.Write returned %d with no error", n)) } if err != unix.EINTR { // If the write failed for anything other than a signal, bail out. break } } } return len(b) - remaining, err } // WriteAt implements io.WriterAt. 
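//
// Like Write, it keeps issuing pwrite(2) calls until the whole buffer has been
// written or an error occurs, so a short count is always accompanied by a
// non-nil error.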
func (r *ReadWriter) WriteAt(b []byte, off int64) (c int, err error) { for len(b) > 0 { var m int m, err = fixCount(unix.Pwrite(r.FD(), b, off)) if err != nil { break } c += m b = b[m:] off += int64(m) } return } // FD returns the owned file descriptor. Ownership remains unchanged. func (r *ReadWriter) FD() int { return int(r.fd.Load()) } // String implements Stringer.String(). func (r *ReadWriter) String() string { return fmt.Sprintf("FD: %d", r.FD()) } // FD owns a host file descriptor. // // It is similar to os.File, with a few important distinctions: // // FD provides a Release() method which relinquishes ownership. Like os.File, // FD adds a finalizer to close the backing FD. However, the finalizer cannot // be removed from os.File, forever pinning the lifetime of an FD to its // os.File. // // FD supports both blocking and non-blocking operation. os.File only // supports blocking operation. type FD struct { ReadWriter } // New creates a new FD. // // New takes ownership of fd. func New(fd int) *FD { if fd < 0 { return &FD{ ReadWriter: ReadWriter{ fd: atomicbitops.FromInt64(-1), }, } } f := &FD{ ReadWriter: ReadWriter{ fd: atomicbitops.FromInt64(int64(fd)), }, } runtime.SetFinalizer(f, (*FD).Close) return f } // NewFromFile creates a new FD from an os.File. // // NewFromFile does not transfer ownership of the file descriptor (it will be // duplicated, so both the os.File and FD will eventually need to be closed // and some (but not all) changes made to the FD will be applied to the // os.File as well). // // The returned FD is always blocking (Go 1.9+). func NewFromFile(file *os.File) (*FD, error) { fd, err := unix.Dup(int(file.Fd())) // Technically, the runtime may call the finalizer on file as soon as // Fd() returns. runtime.KeepAlive(file) if err != nil { return &FD{ ReadWriter: ReadWriter{ fd: atomicbitops.FromInt64(-1), }, }, err } return New(fd), nil } // NewFromFiles creates new FDs for each file in the slice. func NewFromFiles(files []*os.File) ([]*FD, error) { rv := make([]*FD, 0, len(files)) for _, f := range files { new, err := NewFromFile(f) if err != nil { // Cleanup on error. for _, fd := range rv { fd.Close() } return nil, err } rv = append(rv, new) } return rv, nil } // Open is equivalent to open(2). func Open(path string, openmode int, perm uint32) (*FD, error) { f, err := unix.Open(path, openmode|unix.O_LARGEFILE, perm) if err != nil { return nil, err } return New(f), nil } // OpenAt is equivalent to openat(2). func OpenAt(dir *FD, path string, flags int, mode uint32) (*FD, error) { f, err := unix.Openat(dir.FD(), path, flags, mode) if err != nil { return nil, err } return New(f), nil } // Close closes the file descriptor contained in the FD. // // Close is safe to call multiple times, but will return an error after the // first call. // // Concurrently calling Close and any other method is undefined. func (f *FD) Close() error { runtime.SetFinalizer(f, nil) return unix.Close(int(f.fd.Swap(-1))) } // Release relinquishes ownership of the contained file descriptor. // // Concurrently calling Release and any other method is undefined. func (f *FD) Release() int { runtime.SetFinalizer(f, nil) return int(f.fd.Swap(-1)) } // File converts the FD to an os.File. // // FD does not transfer ownership of the file descriptor (it will be // duplicated, so both the FD and os.File will eventually need to be closed // and some (but not all) changes made to the os.File will be applied to the // FD as well). 
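//
// A hedged usage sketch; the path below is hypothetical:
//
//	f, err := fd.Open("/tmp/example", unix.O_RDONLY, 0)
//	if err != nil {
//		return err
//	}
//	defer f.Close()
//	osFile, err := f.File() // duplicates the descriptor
//	if err != nil {
//		return err
//	}
//	defer osFile.Close() // the FD and the os.File are closed independently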
// // This operation is somewhat expensive, so care should be taken to minimize // its use. func (f *FD) File() (*os.File, error) { fd, err := unix.Dup(f.FD()) if err != nil { return nil, err } return os.NewFile(uintptr(fd), ""), nil } // ReleaseToFile returns an os.File that takes ownership of the FD. // // name is passed to os.NewFile. func (f *FD) ReleaseToFile(name string) *os.File { return os.NewFile(uintptr(f.Release()), name) } golang-gvisor-gvisor-0.0~20240729.0/pkg/fd/fd_state_autogen.go000066400000000000000000000000641465435605700236570ustar00rootroot00000000000000// automatically generated by stateify. package fd golang-gvisor-gvisor-0.0~20240729.0/pkg/fdchannel/000077500000000000000000000000001465435605700213465ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/fdchannel/fdchannel_unsafe.go000066400000000000000000000123541465435605700251650ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris // +build aix darwin dragonfly freebsd linux netbsd openbsd solaris // Package fdchannel implements passing file descriptors between processes over // Unix domain sockets. package fdchannel import ( "fmt" "unsafe" "golang.org/x/sys/unix" ) // int32 is the real type of a file descriptor. const sizeofInt32 = int(unsafe.Sizeof(int32(0))) // NewConnectedSockets returns a pair of file descriptors, owned by the caller, // representing connected sockets that may be passed to separate calls to // NewEndpoint to create connected Endpoints. func NewConnectedSockets() ([2]int, error) { return unix.Socketpair(unix.AF_UNIX, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) } // Endpoint sends file descriptors to, and receives them from, another // connected Endpoint. // // Endpoint is not copyable or movable by value. type Endpoint struct { sockfd int32 msghdr unix.Msghdr cmsg *unix.Cmsghdr // followed by sizeofInt32 bytes of data } // Init must be called on zero-value Endpoints before first use. sockfd must be // a blocking AF_UNIX SOCK_SEQPACKET socket. func (ep *Endpoint) Init(sockfd int) { // "Datagram sockets in various domains (e.g., the UNIX and Internet // domains) permit zero-length datagrams." - recv(2). Experimentally, // sendmsg+recvmsg for a zero-length datagram is slightly faster than // sendmsg+recvmsg for a single byte over a stream socket. cmsgSlice := make([]byte, unix.CmsgSpace(sizeofInt32)) ep.sockfd = int32(sockfd) ep.msghdr.Control = (*byte)(unsafe.Pointer(&cmsgSlice[0])) ep.cmsg = (*unix.Cmsghdr)(unsafe.Pointer(&cmsgSlice[0])) // ep.msghdr.Controllen and ep.cmsg.* are mutated by recvmsg(2), so they're // set before calling sendmsg/recvmsg. } // NewEndpoint is a convenience function that returns an initialized Endpoint // allocated on the heap. func NewEndpoint(sockfd int) *Endpoint { ep := &Endpoint{} ep.Init(sockfd) return ep } // Destroy releases resources owned by ep. 
No other Endpoint methods may be // called after Destroy. func (ep *Endpoint) Destroy() { unix.Close(int(ep.sockfd)) ep.sockfd = -1 } // Shutdown causes concurrent and future calls to ep.SendFD(), ep.RecvFD(), and // ep.RecvFDNonblock(), as well as the same calls in the connected Endpoint, to // unblock and return errors. It does not wait for concurrent calls to return. // // Shutdown is the only Endpoint method that may be called concurrently with // other methods. func (ep *Endpoint) Shutdown() { unix.Shutdown(int(ep.sockfd), unix.SHUT_RDWR) } // SendFD sends the open file description represented by the given file // descriptor to the connected Endpoint. func (ep *Endpoint) SendFD(fd int) error { cmsgLen := unix.CmsgLen(sizeofInt32) ep.cmsg.Level = unix.SOL_SOCKET ep.cmsg.Type = unix.SCM_RIGHTS ep.cmsg.SetLen(cmsgLen) *ep.cmsgData() = int32(fd) ep.msghdr.SetControllen(cmsgLen) _, _, e := unix.Syscall(unix.SYS_SENDMSG, uintptr(ep.sockfd), uintptr(unsafe.Pointer(&ep.msghdr)), 0) if e != 0 { return e } return nil } // RecvFD receives an open file description from the connected Endpoint and // returns a file descriptor representing it, owned by the caller. func (ep *Endpoint) RecvFD() (int, error) { return ep.recvFD(false) } // RecvFDNonblock receives an open file description from the connected Endpoint // and returns a file descriptor representing it, owned by the caller. If there // are no pending receivable open file descriptions, RecvFDNonblock returns // (, EAGAIN or EWOULDBLOCK). func (ep *Endpoint) RecvFDNonblock() (int, error) { return ep.recvFD(true) } func (ep *Endpoint) recvFD(nonblock bool) (int, error) { cmsgLen := unix.CmsgLen(sizeofInt32) ep.msghdr.SetControllen(cmsgLen) var e unix.Errno if nonblock { _, _, e = unix.RawSyscall(unix.SYS_RECVMSG, uintptr(ep.sockfd), uintptr(unsafe.Pointer(&ep.msghdr)), unix.MSG_TRUNC|unix.MSG_DONTWAIT) } else { _, _, e = unix.Syscall(unix.SYS_RECVMSG, uintptr(ep.sockfd), uintptr(unsafe.Pointer(&ep.msghdr)), unix.MSG_TRUNC) } if e != 0 { return -1, e } if int(ep.msghdr.Controllen) != cmsgLen { return -1, fmt.Errorf("received control message has incorrect length: got %d, wanted %d", ep.msghdr.Controllen, cmsgLen) } if ep.cmsg.Level != unix.SOL_SOCKET || ep.cmsg.Type != unix.SCM_RIGHTS { return -1, fmt.Errorf("received control message has incorrect (level, type): got (%v, %v), wanted (%v, %v)", ep.cmsg.Level, ep.cmsg.Type, unix.SOL_SOCKET, unix.SCM_RIGHTS) } return int(*ep.cmsgData()), nil } func (ep *Endpoint) cmsgData() *int32 { // unix.CmsgLen(0) == unix.cmsgAlignOf(unix.SizeofCmsghdr) return (*int32)(unsafe.Pointer(uintptr(unsafe.Pointer(ep.cmsg)) + uintptr(unix.CmsgLen(0)))) } golang-gvisor-gvisor-0.0~20240729.0/pkg/fdchannel/fdchannel_unsafe_state_autogen.go000066400000000000000000000003321465435605700301000ustar00rootroot00000000000000// automatically generated by stateify. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris // +build aix darwin dragonfly freebsd linux netbsd openbsd solaris package fdchannel golang-gvisor-gvisor-0.0~20240729.0/pkg/fdnotifier/000077500000000000000000000000001465435605700215555ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/fdnotifier/fdnotifier.go000066400000000000000000000121241465435605700242350ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build linux
// +build linux

// Package fdnotifier contains an adapter that translates IO events (e.g., a
// file became readable/writable) from native FDs to the notifications in the
// waiter package. It uses epoll in edge-triggered mode to receive notifications
// for registered FDs.
package fdnotifier

import (
	"fmt"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/waiter"
)

type fdInfo struct {
	queue   *waiter.Queue
	waiting bool
}

// notifier holds all the state necessary to issue notifications when IO events
// occur in the observed FDs.
type notifier struct {
	// epFD is the epoll file descriptor used to register for io
	// notifications.
	epFD int

	// mu protects fdMap.
	mu sync.Mutex

	// fdMap maps file descriptors to their notification queues and waiting
	// status.
	fdMap map[int32]*fdInfo
}

// newNotifier creates a new notifier object.
func newNotifier() (*notifier, error) {
	epfd, err := unix.EpollCreate1(0)
	if err != nil {
		return nil, err
	}

	w := &notifier{
		epFD:  epfd,
		fdMap: make(map[int32]*fdInfo),
	}

	go w.waitAndNotify() // S/R-SAFE: no waiter exists during save / load.

	return w, nil
}

// waitFD waits on mask for fd. The fdMap mutex must be held.
func (n *notifier) waitFD(fd int32, fi *fdInfo, mask waiter.EventMask) error {
	if !fi.waiting && mask == 0 {
		return nil
	}

	e := unix.EpollEvent{
		Events: mask.ToLinux() | unix.EPOLLET,
		Fd:     fd,
	}

	switch {
	case !fi.waiting && mask != 0:
		if err := unix.EpollCtl(n.epFD, unix.EPOLL_CTL_ADD, int(fd), &e); err != nil {
			return err
		}
		fi.waiting = true
	case fi.waiting && mask == 0:
		unix.EpollCtl(n.epFD, unix.EPOLL_CTL_DEL, int(fd), nil)
		fi.waiting = false
	case fi.waiting && mask != 0:
		if err := unix.EpollCtl(n.epFD, unix.EPOLL_CTL_MOD, int(fd), &e); err != nil {
			return err
		}
	}

	return nil
}

// addFD adds an FD to the list of FDs observed by n.
func (n *notifier) addFD(fd int32, queue *waiter.Queue) error {
	n.mu.Lock()
	defer n.mu.Unlock()

	// Panic if we're already notifying on this FD.
	if _, ok := n.fdMap[fd]; ok {
		panic(fmt.Sprintf("File descriptor %v added twice", fd))
	}

	info := &fdInfo{queue: queue}

	// We might already have something in queue to wait for.
	if err := n.waitFD(fd, info, queue.Events()); err != nil {
		return err
	}

	// Add it to the map.
	n.fdMap[fd] = info
	return nil
}

// updateFD updates the set of events the fd needs to be notified on.
func (n *notifier) updateFD(fd int32) error {
	n.mu.Lock()
	defer n.mu.Unlock()

	if fi, ok := n.fdMap[fd]; ok {
		return n.waitFD(fd, fi, fi.queue.Events())
	}
	return nil
}

// removeFD removes an FD from the list of FDs observed by n.
func (n *notifier) removeFD(fd int32) {
	n.mu.Lock()
	defer n.mu.Unlock()

	// Remove from map, then from epoll object.
	n.waitFD(fd, n.fdMap[fd], 0)
	delete(n.fdMap, fd)
}

// hasFD returns true if the fd is in the list of observed FDs.
func (n *notifier) hasFD(fd int32) bool {
	n.mu.Lock()
	defer n.mu.Unlock()

	_, ok := n.fdMap[fd]
	return ok
}

// waitAndNotify runs in its own goroutine and loops waiting for io event
// notifications from the epoll object. Once notifications arrive, they are
// dispatched to the registered queue.
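//
// For orientation, an illustrative sketch of the registration path that feeds
// this loop (hostFD is an arbitrary example descriptor; see the waiter
// package for the exact Queue/Entry registration API):
//
//    var q waiter.Queue
//    if err := AddFD(hostFD, &q); err != nil { /* handle error */ }
//    // ... later, after the set of waited-for events on q changes ...
//    _ = UpdateFD(hostFD)
//    // ... and once the descriptor is no longer watched ...
//    RemoveFD(hostFD)
//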
func (n *notifier) waitAndNotify() error { e := make([]unix.EpollEvent, 100) for { v, err := epollWait(n.epFD, e, -1) if err == unix.EINTR { continue } if err != nil { return err } notified := false n.mu.Lock() for i := 0; i < v; i++ { if fi, ok := n.fdMap[e[i].Fd]; ok { fi.queue.Notify(waiter.EventMaskFromLinux(e[i].Events)) notified = true } } n.mu.Unlock() if notified { // Let goroutines woken by Notify get a chance to run before we // epoll_wait again. sync.Goyield() } } } var shared struct { notifier *notifier once sync.Once initErr error } // AddFD adds an FD to the list of observed FDs. func AddFD(fd int32, queue *waiter.Queue) error { shared.once.Do(func() { shared.notifier, shared.initErr = newNotifier() }) if shared.initErr != nil { return shared.initErr } return shared.notifier.addFD(fd, queue) } // UpdateFD updates the set of events the fd needs to be notified on. func UpdateFD(fd int32) error { return shared.notifier.updateFD(fd) } // RemoveFD removes an FD from the list of observed FDs. func RemoveFD(fd int32) { shared.notifier.removeFD(fd) } // HasFD returns true if the FD is in the list of observed FDs. // // This should only be used by tests to assert that FDs are correctly registered. func HasFD(fd int32) bool { return shared.notifier.hasFD(fd) } golang-gvisor-gvisor-0.0~20240729.0/pkg/fdnotifier/fdnotifier_state_autogen.go000066400000000000000000000001361465435605700271570ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux // +build linux package fdnotifier golang-gvisor-gvisor-0.0~20240729.0/pkg/fdnotifier/fdnotifier_unsafe_state_autogen.go000066400000000000000000000001361465435605700305200ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux // +build linux package fdnotifier golang-gvisor-gvisor-0.0~20240729.0/pkg/fdnotifier/poll_unsafe.go000066400000000000000000000042231465435605700244140ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package fdnotifier import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/waiter" ) // NonBlockingPoll polls the given FD in non-blocking fashion. It is used just // to query the FD's current state. func NonBlockingPoll(fd int32, mask waiter.EventMask) waiter.EventMask { e := struct { fd int32 events int16 revents int16 }{ fd: fd, events: int16(mask.ToLinux()), } ts := unix.Timespec{ Sec: 0, Nsec: 0, } for { n, _, err := unix.RawSyscall6(unix.SYS_PPOLL, uintptr(unsafe.Pointer(&e)), 1, uintptr(unsafe.Pointer(&ts)), 0, 0, 0) // Interrupted by signal, try again. if err == unix.EINTR { continue } // If an error occur we'll conservatively say the FD is ready for // whatever is being checked. if err != 0 { return mask } // If no FDs were returned, it wasn't ready for anything. if n == 0 { return 0 } // Otherwise we got the ready events in the revents field. return waiter.EventMaskFromLinux(uint32(e.revents)) } } // epollWait performs a blocking wait on epfd. 
// // Preconditions: len(events) > 0 func epollWait(epfd int, events []unix.EpollEvent, msec int) (int, error) { if len(events) == 0 { panic("Empty events passed to EpollWait") } // We actually use epoll_pwait with NULL sigmask instead of epoll_wait // since that is what the Go >= 1.11 runtime prefers. r, _, e := unix.Syscall6(unix.SYS_EPOLL_PWAIT, uintptr(epfd), uintptr(unsafe.Pointer(&events[0])), uintptr(len(events)), uintptr(msec), 0, 0) if e != 0 { return 0, e } return int(r), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/flipcall/000077500000000000000000000000001465435605700212125ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/flipcall/ctrl_futex.go000066400000000000000000000135651465435605700237320ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package flipcall import ( "encoding/json" "fmt" "math" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/log" ) type endpointControlImpl struct { state atomicbitops.Int32 } // Bits in endpointControlImpl.state. const ( epsBlocked = 1 << iota epsShutdown ) func (ep *Endpoint) ctrlInit(opts ...EndpointOption) error { if len(opts) != 0 { return fmt.Errorf("unknown EndpointOption: %T", opts[0]) } return nil } func (ep *Endpoint) ctrlConnect() error { if err := ep.enterFutexWait(); err != nil { return err } defer ep.exitFutexWait() // Write the connection request. w := ep.NewWriter() if err := json.NewEncoder(w).Encode(struct{}{}); err != nil { return fmt.Errorf("error writing connection request: %v", err) } *ep.dataLen() = atomicbitops.FromUint32(w.Len()) // Exchange control with the server. if err := ep.futexSetPeerActive(); err != nil { return err } if err := ep.futexWakePeer(); err != nil { return err } if err := ep.futexWaitUntilActive(); err != nil { return err } // Read the connection response. var resp struct{} respLen := ep.dataLen().Load() if respLen > ep.dataCap { return fmt.Errorf("invalid connection response length %d (maximum %d)", respLen, ep.dataCap) } if err := json.NewDecoder(ep.NewReader(respLen)).Decode(&resp); err != nil { return fmt.Errorf("error reading connection response: %v", err) } return nil } func (ep *Endpoint) ctrlWaitFirst() error { if err := ep.enterFutexWait(); err != nil { return err } defer ep.exitFutexWait() // Wait for the connection request. if err := ep.futexWaitUntilActive(); err != nil { return err } // Read the connection request. reqLen := ep.dataLen().Load() if reqLen > ep.dataCap { return fmt.Errorf("invalid connection request length %d (maximum %d)", reqLen, ep.dataCap) } var req struct{} if err := json.NewDecoder(ep.NewReader(reqLen)).Decode(&req); err != nil { return fmt.Errorf("error reading connection request: %v", err) } // Write the connection response. 
w := ep.NewWriter() if err := json.NewEncoder(w).Encode(struct{}{}); err != nil { return fmt.Errorf("error writing connection response: %v", err) } *ep.dataLen() = atomicbitops.FromUint32(w.Len()) // Return control to the client. raceBecomeInactive() if err := ep.futexSetPeerActive(); err != nil { return err } if err := ep.futexWakePeer(); err != nil { return err } // Wait for the first non-connection message. return ep.futexWaitUntilActive() } func (ep *Endpoint) ctrlRoundTrip(mayRetainP bool) error { if err := ep.enterFutexWait(); err != nil { return err } defer ep.exitFutexWait() if err := ep.futexSetPeerActive(); err != nil { return err } if err := ep.futexWakePeer(); err != nil { return err } // Since we don't know if the peer Endpoint is in the same process as this // one (in which case it may need our P to run), we allow our P to be // retaken regardless of mayRetainP. return ep.futexWaitUntilActive() } func (ep *Endpoint) ctrlWakeLast() error { if err := ep.futexSetPeerActive(); err != nil { return err } return ep.futexWakePeer() } func (ep *Endpoint) enterFutexWait() error { switch eps := ep.ctrl.state.Add(epsBlocked); eps { case epsBlocked: return nil case epsBlocked | epsShutdown: ep.ctrl.state.Add(-epsBlocked) return ShutdownError{} default: // Most likely due to ep.enterFutexWait() being called concurrently // from multiple goroutines. panic(fmt.Sprintf("invalid flipcall.Endpoint.ctrl.state before flipcall.Endpoint.enterFutexWait(): %v", eps-epsBlocked)) } } func (ep *Endpoint) exitFutexWait() { switch eps := ep.ctrl.state.Add(-epsBlocked); eps { case 0: return case epsShutdown: // ep.ctrlShutdown() was called while we were blocked, so we are // responsible for indicating connection shutdown. ep.shutdownConn() default: panic(fmt.Sprintf("invalid flipcall.Endpoint.ctrl.state after flipcall.Endpoint.exitFutexWait(): %v", eps+epsBlocked)) } } func (ep *Endpoint) ctrlShutdown() { // Set epsShutdown to ensure that future calls to ep.enterFutexWait() fail. if ep.ctrl.state.Add(epsShutdown)&epsBlocked != 0 { // Wake the blocked thread. This must loop because it's possible that // FUTEX_WAKE occurs after the waiter sets epsBlocked, but before it // blocks in FUTEX_WAIT. for { // Wake MaxInt32 threads to prevent a broken or malicious peer from // swallowing our wakeup by FUTEX_WAITing from multiple threads. if err := ep.futexWakeConnState(math.MaxInt32); err != nil { log.Warningf("failed to FUTEX_WAKE Endpoints: %v", err) break } yieldThread() if ep.ctrl.state.Load()&epsBlocked == 0 { break } } } else { // There is no blocked thread, so we are responsible for indicating // connection shutdown. ep.shutdownConn() } } func (ep *Endpoint) shutdownConn() { switch cs := ep.connState().Swap(csShutdown); cs { case ep.activeState: if err := ep.futexWakeConnState(1); err != nil { log.Warningf("failed to FUTEX_WAKE peer Endpoint for shutdown: %v", err) } case ep.inactiveState: // The peer is currently active and will detect shutdown when it tries // to update the connection state. case csShutdown: // The peer also called Endpoint.Shutdown(). default: log.Warningf("unexpected connection state before Endpoint.shutdownConn(): %v", cs) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/flipcall/flipcall.go000066400000000000000000000233171465435605700233350ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package flipcall implements a protocol providing Fast Local Interprocess // Procedure Calls between mutually-distrusting processes. package flipcall import ( "fmt" "math" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/memutil" ) // An Endpoint provides the ability to synchronously transfer data and control // to a connected peer Endpoint, which may be in another process. // // Since the Endpoint control transfer model is synchronous, at any given time // one Endpoint "has control" (designated the active Endpoint), and the other // is "waiting for control" (designated the inactive Endpoint). Users of the // flipcall package designate one Endpoint as the client, which is initially // active, and the other as the server, which is initially inactive. See // flipcall_example_test.go for usage. type Endpoint struct { // packet is a pointer to the beginning of the packet window. (Since this // is a raw OS memory mapping and not a Go object, it does not need to be // represented as an unsafe.Pointer.) packet is immutable. packet uintptr // dataCap is the size of the datagram part of the packet window in bytes. // dataCap is immutable. dataCap uint32 // activeState is csClientActive if this is a client Endpoint and // csServerActive if this is a server Endpoint. activeState uint32 // inactiveState is csServerActive if this is a client Endpoint and // csClientActive if this is a server Endpoint. inactiveState uint32 // shutdown is non-zero if Endpoint.Shutdown() has been called, or if the // Endpoint has acknowledged shutdown initiated by the peer. shutdown atomicbitops.Uint32 ctrl endpointControlImpl } // EndpointSide indicates which side of a connection an Endpoint belongs to. type EndpointSide int const ( // ClientSide indicates that an Endpoint is a client (initially-active; // first method call should be Connect). ClientSide EndpointSide = iota // ServerSide indicates that an Endpoint is a server (initially-inactive; // first method call should be RecvFirst.) ServerSide ) // Init must be called on zero-value Endpoints before first use. If it // succeeds, ep.Destroy() must be called once the Endpoint is no longer in use. // // pwd represents the packet window used to exchange data with the peer // Endpoint. FD may differ between Endpoints if they are in different // processes, but must represent the same file. The packet window must // initially be filled with zero bytes. 
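//
// For orientation, an illustrative setup sketch (error handling elided; the
// 1024-byte data cap is an arbitrary example value):
//
//    pwa, _ := flipcall.NewPacketWindowAllocator()
//    pwd, _ := pwa.Allocate(flipcall.PacketWindowLengthForDataCap(1024))
//    clientEP, _ := flipcall.NewEndpoint(flipcall.ClientSide, pwd)
//    serverEP, _ := flipcall.NewEndpoint(flipcall.ServerSide, pwd)
//    // The two Endpoints may live in different processes as long as their
//    // PacketWindowDescriptors refer to the same underlying file.
//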
func (ep *Endpoint) Init(side EndpointSide, pwd PacketWindowDescriptor, opts ...EndpointOption) error { switch side { case ClientSide: ep.activeState = csClientActive ep.inactiveState = csServerActive case ServerSide: ep.activeState = csServerActive ep.inactiveState = csClientActive default: return fmt.Errorf("invalid EndpointSide: %v", side) } if pwd.Length < pageSize { return fmt.Errorf("packet window size (%d) less than minimum (%d)", pwd.Length, pageSize) } if pwd.Length > math.MaxUint32 { return fmt.Errorf("packet window size (%d) exceeds maximum (%d)", pwd.Length, math.MaxUint32) } m, err := memutil.MapFile(0, uintptr(pwd.Length), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, uintptr(pwd.FD), uintptr(pwd.Offset)) if err != nil { return fmt.Errorf("failed to mmap packet window: %v", err) } ep.packet = m ep.dataCap = uint32(pwd.Length) - uint32(PacketHeaderBytes) if err := ep.ctrlInit(opts...); err != nil { ep.unmapPacket() return err } return nil } // NewEndpoint is a convenience function that returns an initialized Endpoint // allocated on the heap. func NewEndpoint(side EndpointSide, pwd PacketWindowDescriptor, opts ...EndpointOption) (*Endpoint, error) { var ep Endpoint if err := ep.Init(side, pwd, opts...); err != nil { return nil, err } return &ep, nil } // An EndpointOption configures an Endpoint. type EndpointOption interface { isEndpointOption() } // Destroy releases resources owned by ep. No other Endpoint methods may be // called after Destroy. func (ep *Endpoint) Destroy() { ep.unmapPacket() } func (ep *Endpoint) unmapPacket() { unix.RawSyscall(unix.SYS_MUNMAP, ep.packet, uintptr(ep.dataCap)+PacketHeaderBytes, 0) ep.packet = 0 } // Shutdown causes concurrent and future calls to ep.Connect(), ep.SendRecv(), // ep.RecvFirst(), and ep.SendLast(), as well as the same calls in the peer // Endpoint, to unblock and return ShutdownErrors. It does not wait for // concurrent calls to return. Successive calls to Shutdown have no effect. // // Shutdown is the only Endpoint method that may be called concurrently with // other methods on the same Endpoint. func (ep *Endpoint) Shutdown() { if ep.shutdown.Swap(1) != 0 { // ep.Shutdown() has previously been called. return } ep.ctrlShutdown() } // isShutdownLocally returns true if ep.Shutdown() has been called. func (ep *Endpoint) isShutdownLocally() bool { return ep.shutdown.Load() != 0 } // ShutdownError is returned by most Endpoint methods after Endpoint.Shutdown() // has been called. type ShutdownError struct{} // Error implements error.Error. func (ShutdownError) Error() string { return "flipcall connection shutdown" } // DataCap returns the maximum datagram size supported by ep. Equivalently, // DataCap returns len(ep.Data()). func (ep *Endpoint) DataCap() uint32 { return ep.dataCap } // Connection state. const ( // The client is, by definition, initially active, so this must be 0. csClientActive = 0 csServerActive = 1 csShutdown = 2 ) // Connect blocks until the peer Endpoint has called Endpoint.RecvFirst(). // // Preconditions: // - ep is a client Endpoint. // - ep.Connect(), ep.RecvFirst(), ep.SendRecv(), and ep.SendLast() have never // been called. func (ep *Endpoint) Connect() error { err := ep.ctrlConnect() if err == nil { raceBecomeActive() } return err } // RecvFirst blocks until the peer Endpoint calls Endpoint.SendRecv(), then // returns the datagram length specified by that call. // // Preconditions: // - ep is a server Endpoint. // - ep.SendRecv(), ep.RecvFirst(), and ep.SendLast() have never been called. 
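//
// For orientation, an illustrative request/response sketch (clientEP and
// serverEP are assumed to be Endpoints over the same packet window, as in the
// setup sketch above; payload handling and the length variables are
// schematic):
//
//    // Server goroutine/process:
//    reqLen, _ := serverEP.RecvFirst() // returns once the client's first SendRecv arrives
//    for {
//        // ... consume reqLen bytes from serverEP.Data(), build a reply ...
//        reqLen, _ = serverEP.SendRecv(replyLen)
//    }
//
//    // Client goroutine/process:
//    _ = clientEP.Connect()
//    // ... write a request into clientEP.Data() ...
//    respLen, _ := clientEP.SendRecv(requestLen)
//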
func (ep *Endpoint) RecvFirst() (uint32, error) { if err := ep.ctrlWaitFirst(); err != nil { return 0, err } raceBecomeActive() recvDataLen := ep.dataLen().Load() if recvDataLen > ep.dataCap { return 0, fmt.Errorf("received packet with invalid datagram length %d (maximum %d)", recvDataLen, ep.dataCap) } return recvDataLen, nil } // SendRecv transfers control to the peer Endpoint, causing its call to // Endpoint.SendRecv() or Endpoint.RecvFirst() to return with the given // datagram length, then blocks until the peer Endpoint calls // Endpoint.SendRecv() or Endpoint.SendLast(). // // Preconditions: // - dataLen <= ep.DataCap(). // - No previous call to ep.SendRecv() or ep.RecvFirst() has returned an error. // - ep.SendLast() has never been called. // - If ep is a client Endpoint, ep.Connect() has previously been called and // returned nil. func (ep *Endpoint) SendRecv(dataLen uint32) (uint32, error) { return ep.sendRecv(dataLen, false /* mayRetainP */) } // SendRecvFast is equivalent to SendRecv, but may prevent the caller's runtime // P from being released, in which case the calling goroutine continues to // count against GOMAXPROCS while waiting for the peer Endpoint to return // control to the caller. // // SendRecvFast is appropriate if the peer Endpoint is expected to consistently // return control in a short amount of time (less than ~10ms). // // Preconditions: As for SendRecv. func (ep *Endpoint) SendRecvFast(dataLen uint32) (uint32, error) { return ep.sendRecv(dataLen, true /* mayRetainP */) } func (ep *Endpoint) sendRecv(dataLen uint32, mayRetainP bool) (uint32, error) { if dataLen > ep.dataCap { panic(fmt.Sprintf("attempting to send packet with datagram length %d (maximum %d)", dataLen, ep.dataCap)) } // This store can safely be non-atomic: Under correct operation we should // be the only thread writing ep.dataLen(), and ep.ctrlRoundTrip() will // synchronize with the receiver. We will not read from ep.dataLen() until // after ep.ctrlRoundTrip(), so if the peer is mutating it concurrently then // they can only shoot themselves in the foot. ep.dataLen().RacyStore(dataLen) raceBecomeInactive() if err := ep.ctrlRoundTrip(mayRetainP); err != nil { return 0, err } raceBecomeActive() recvDataLen := ep.dataLen().Load() if recvDataLen > ep.dataCap { return 0, fmt.Errorf("received packet with invalid datagram length %d (maximum %d)", recvDataLen, ep.dataCap) } return recvDataLen, nil } // SendLast causes the peer Endpoint's call to Endpoint.SendRecv() or // Endpoint.RecvFirst() to return with the given datagram length. // // Preconditions: // - dataLen <= ep.DataCap(). // - No previous call to ep.SendRecv() or ep.RecvFirst() has returned an error. // - ep.SendLast() has never been called. // - If ep is a client Endpoint, ep.Connect() has previously been called and // returned nil. func (ep *Endpoint) SendLast(dataLen uint32) error { if dataLen > ep.dataCap { panic(fmt.Sprintf("attempting to send packet with datagram length %d (maximum %d)", dataLen, ep.dataCap)) } ep.dataLen().RacyStore(dataLen) raceBecomeInactive() if err := ep.ctrlWakeLast(); err != nil { return err } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/flipcall/flipcall_linux_state_autogen.go000066400000000000000000000001341465435605700274660ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build linux // +build linux package flipcall golang-gvisor-gvisor-0.0~20240729.0/pkg/flipcall/flipcall_state_autogen.go000066400000000000000000000001361465435605700262510ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package flipcall golang-gvisor-gvisor-0.0~20240729.0/pkg/flipcall/flipcall_unsafe.go000066400000000000000000000053131465435605700246720ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package flipcall import ( "reflect" "unsafe" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sync" ) // Packets consist of a 16-byte header followed by an arbitrarily-sized // datagram. The header consists of: // // - A 4-byte native-endian connection state. // // - A 4-byte native-endian datagram length in bytes. // // - 8 reserved bytes. const ( // PacketHeaderBytes is the size of a flipcall packet header in bytes. The // maximum datagram size supported by a flipcall connection is equal to the // length of the packet window minus PacketHeaderBytes. // // PacketHeaderBytes is exported to support its use in constant // expressions. Non-constant expressions may prefer to use // PacketWindowLengthForDataCap(). PacketHeaderBytes = 16 ) func (ep *Endpoint) connState() *atomicbitops.Uint32 { return (*atomicbitops.Uint32)(unsafe.Pointer(ep.packet)) } func (ep *Endpoint) dataLen() *atomicbitops.Uint32 { return (*atomicbitops.Uint32)(unsafe.Pointer(ep.packet + 4)) } // Data returns the datagram part of ep's packet window as a byte slice. // // Note that the packet window is shared with the potentially-untrusted peer // Endpoint, which may concurrently mutate the contents of the packet window. // Thus: // // - Readers must not assume that two reads of the same byte in Data() will // return the same result. In other words, readers should read any given byte // in Data() at most once. // // - Writers must not assume that they will read back the same data that they // have written. In other words, writers should avoid reading from Data() at // all. func (ep *Endpoint) Data() (bs []byte) { bshdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) bshdr.Data = ep.packet + PacketHeaderBytes bshdr.Len = int(ep.dataCap) bshdr.Cap = int(ep.dataCap) return } // ioSync is a dummy variable used to indicate synchronization to the Go race // detector. Compare syscall.ioSync. var ioSync int64 func raceBecomeActive() { if sync.RaceEnabled { sync.RaceAcquire(unsafe.Pointer(&ioSync)) } } func raceBecomeInactive() { if sync.RaceEnabled { sync.RaceReleaseMerge(unsafe.Pointer(&ioSync)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/flipcall/flipcall_unsafe_state_autogen.go000066400000000000000000000000721465435605700276110ustar00rootroot00000000000000// automatically generated by stateify. package flipcall golang-gvisor-gvisor-0.0~20240729.0/pkg/flipcall/futex_linux.go000066400000000000000000000045651465435605700241250ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package flipcall import ( "fmt" "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) func (ep *Endpoint) futexSetPeerActive() error { if ep.connState().CompareAndSwap(ep.activeState, ep.inactiveState) { return nil } switch cs := ep.connState().Load(); cs { case csShutdown: return ShutdownError{} default: return fmt.Errorf("unexpected connection state before FUTEX_WAKE: %v", cs) } } func (ep *Endpoint) futexWakePeer() error { if err := ep.futexWakeConnState(1); err != nil { return fmt.Errorf("failed to FUTEX_WAKE peer Endpoint: %v", err) } return nil } func (ep *Endpoint) futexWaitUntilActive() error { for { switch cs := ep.connState().Load(); cs { case ep.activeState: return nil case ep.inactiveState: if ep.isShutdownLocally() { return ShutdownError{} } if err := ep.futexWaitConnState(ep.inactiveState); err != nil { return fmt.Errorf("failed to FUTEX_WAIT for peer Endpoint: %v", err) } continue case csShutdown: return ShutdownError{} default: return fmt.Errorf("unexpected connection state before FUTEX_WAIT: %v", cs) } } } func (ep *Endpoint) futexWakeConnState(numThreads int32) error { if _, _, e := unix.RawSyscall(unix.SYS_FUTEX, ep.packet, linux.FUTEX_WAKE, uintptr(numThreads)); e != 0 { return e } return nil } func (ep *Endpoint) futexWaitConnState(curState uint32) error { _, _, e := unix.Syscall6(unix.SYS_FUTEX, ep.packet, linux.FUTEX_WAIT, uintptr(curState), 0, 0, 0) if e != 0 && e != unix.EAGAIN && e != unix.EINTR { return e } return nil } func yieldThread() { unix.Syscall(unix.SYS_SCHED_YIELD, 0, 0, 0) // The thread we're trying to yield to may be waiting for a Go runtime P. // runtime.Gosched() will hand off ours if necessary. runtime.Gosched() } golang-gvisor-gvisor-0.0~20240729.0/pkg/flipcall/io.go000066400000000000000000000063211465435605700221520ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package flipcall import ( "fmt" "io" ) // DatagramReader implements io.Reader by reading a datagram from an Endpoint's // packet window. Its use is optional; users that can use Endpoint.Data() more // efficiently are advised to do so. type DatagramReader struct { ep *Endpoint off uint32 end uint32 } // Init must be called on zero-value DatagramReaders before first use. // // Preconditions: dataLen is 0, or was returned by a previous call to // ep.RecvFirst() or ep.SendRecv(). 
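//
// For orientation, an illustrative encode/exchange/decode sketch over a
// connected Endpoint ep (req and resp are arbitrary example values; the shape
// mirrors the JSON connection handshake in ctrl_futex.go):
//
//    w := ep.NewWriter()
//    _ = json.NewEncoder(w).Encode(&req)
//    respLen, _ := ep.SendRecv(w.Len())
//    _ = json.NewDecoder(ep.NewReader(respLen)).Decode(&resp)
//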
func (r *DatagramReader) Init(ep *Endpoint, dataLen uint32) { r.ep = ep r.Reset(dataLen) } // Reset causes r to begin reading a new datagram of the given length from the // associated Endpoint. // // Preconditions: dataLen is 0, or was returned by a previous call to the // associated Endpoint's RecvFirst() or SendRecv() methods. func (r *DatagramReader) Reset(dataLen uint32) { if dataLen > r.ep.dataCap { panic(fmt.Sprintf("invalid dataLen (%d) > ep.dataCap (%d)", dataLen, r.ep.dataCap)) } r.off = 0 r.end = dataLen } // NewReader is a convenience function that returns an initialized // DatagramReader allocated on the heap. // // Preconditions: dataLen was returned by a previous call to ep.RecvFirst() or // ep.SendRecv(). func (ep *Endpoint) NewReader(dataLen uint32) *DatagramReader { r := &DatagramReader{} r.Init(ep, dataLen) return r } // Read implements io.Reader.Read. func (r *DatagramReader) Read(dst []byte) (int, error) { n := copy(dst, r.ep.Data()[r.off:r.end]) r.off += uint32(n) if r.off == r.end { return n, io.EOF } return n, nil } // DatagramWriter implements io.Writer by writing a datagram to an Endpoint's // packet window. Its use is optional; users that can use Endpoint.Data() more // efficiently are advised to do so. type DatagramWriter struct { ep *Endpoint off uint32 } // Init must be called on zero-value DatagramWriters before first use. func (w *DatagramWriter) Init(ep *Endpoint) { w.ep = ep } // Reset causes w to begin writing a new datagram to the associated Endpoint. func (w *DatagramWriter) Reset() { w.off = 0 } // NewWriter is a convenience function that returns an initialized // DatagramWriter allocated on the heap. func (ep *Endpoint) NewWriter() *DatagramWriter { w := &DatagramWriter{} w.Init(ep) return w } // Write implements io.Writer.Write. func (w *DatagramWriter) Write(src []byte) (int, error) { n := copy(w.ep.Data()[w.off:w.ep.dataCap], src) w.off += uint32(n) if n != len(src) { return n, fmt.Errorf("datagram would exceed maximum size of %d bytes", w.ep.dataCap) } return n, nil } // Len returns the length of the written datagram. func (w *DatagramWriter) Len() uint32 { return w.off } golang-gvisor-gvisor-0.0~20240729.0/pkg/flipcall/packet_window.go000066400000000000000000000117611465435605700244050ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package flipcall import ( "fmt" "math/bits" "os" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/memutil" ) var ( pageSize = os.Getpagesize() pageMask = pageSize - 1 ) func init() { if bits.OnesCount(uint(pageSize)) != 1 { // This is depended on by roundUpToPage(). panic(fmt.Sprintf("system page size (%d) is not a power of 2", pageSize)) } if uintptr(pageSize) < PacketHeaderBytes { // This is required since Endpoint.Init() imposes a minimum packet // window size of 1 page. 
panic(fmt.Sprintf("system page size (%d) is less than packet header size (%d)", pageSize, PacketHeaderBytes)) } } // PacketWindowDescriptor represents a packet window, a range of pages in a // shared memory file that is used to exchange packets between partner // Endpoints. type PacketWindowDescriptor struct { // FD is the file descriptor representing the shared memory file. FD int // Offset is the offset into the shared memory file at which the packet // window begins. Offset int64 // Length is the size of the packet window in bytes. Length int } // PacketWindowLengthForDataCap returns the minimum packet window size required // to accommodate datagrams of the given size in bytes. func PacketWindowLengthForDataCap(dataCap uint32) int { return roundUpToPage(int(dataCap) + int(PacketHeaderBytes)) } func roundUpToPage(x int) int { return (x + pageMask) &^ pageMask } // A PacketWindowAllocator owns a shared memory file, and allocates packet // windows from it. type PacketWindowAllocator struct { fd int nextAlloc int64 fileSize int64 } // Init must be called on zero-value PacketWindowAllocators before first use. // If it succeeds, Destroy() must be called once the PacketWindowAllocator is // no longer in use. func (pwa *PacketWindowAllocator) Init() error { fd, err := memutil.CreateMemFD("flipcall_packet_windows", linux.MFD_CLOEXEC|linux.MFD_ALLOW_SEALING) if err != nil { return fmt.Errorf("failed to create memfd: %v", err) } // Apply F_SEAL_SHRINK to prevent either party from causing SIGBUS in the // other by truncating the file, and F_SEAL_SEAL to prevent either party // from applying F_SEAL_GROW or F_SEAL_WRITE. if _, _, e := unix.RawSyscall(unix.SYS_FCNTL, uintptr(fd), linux.F_ADD_SEALS, linux.F_SEAL_SHRINK|linux.F_SEAL_SEAL); e != 0 { unix.Close(fd) return fmt.Errorf("failed to apply memfd seals: %v", e) } pwa.fd = fd return nil } // NewPacketWindowAllocator is a convenience function that returns an // initialized PacketWindowAllocator allocated on the heap. func NewPacketWindowAllocator() (*PacketWindowAllocator, error) { var pwa PacketWindowAllocator if err := pwa.Init(); err != nil { return nil, err } return &pwa, nil } // Destroy releases resources owned by pwa. This invalidates file descriptors // previously returned by pwa.FD() and pwd.Allocate(). func (pwa *PacketWindowAllocator) Destroy() { unix.Close(pwa.fd) } // FD represents the file descriptor of the shared memory file backing pwa. func (pwa *PacketWindowAllocator) FD() int { return pwa.fd } // Allocate allocates a new packet window of at least the given size and // returns a PacketWindowDescriptor representing it. // // Preconditions: size > 0. func (pwa *PacketWindowAllocator) Allocate(size int) (PacketWindowDescriptor, error) { if size <= 0 { return PacketWindowDescriptor{}, fmt.Errorf("invalid size: %d", size) } // Page-align size to ensure that pwa.nextAlloc remains page-aligned. 
size = roundUpToPage(size) if size <= 0 { return PacketWindowDescriptor{}, fmt.Errorf("size %d overflows after rounding up to page size", size) } end := pwa.nextAlloc + int64(size) // overflow checked by ensureFileSize if err := pwa.ensureFileSize(end); err != nil { return PacketWindowDescriptor{}, err } start := pwa.nextAlloc pwa.nextAlloc = end return PacketWindowDescriptor{ FD: pwa.FD(), Offset: start, Length: size, }, nil } func (pwa *PacketWindowAllocator) ensureFileSize(min int64) error { if min <= 0 { return fmt.Errorf("file size would overflow") } if pwa.fileSize >= min { return nil } newSize := 2 * pwa.fileSize if newSize == 0 { newSize = int64(pageSize) } for newSize < min { newNewSize := newSize * 2 if newNewSize <= 0 { return fmt.Errorf("file size would overflow") } newSize = newNewSize } if err := unix.Ftruncate(pwa.FD(), newSize); err != nil { return fmt.Errorf("ftruncate failed: %v", err) } pwa.fileSize = newSize return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/fspath/000077500000000000000000000000001465435605700207115ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/fspath/builder.go000066400000000000000000000055221465435605700226720ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fspath import ( "fmt" "gvisor.dev/gvisor/pkg/gohacks" ) // Builder is similar to strings.Builder, but is used to produce pathnames // given path components in reverse order (from leaf to root). This is useful // in the common case where a filesystem is represented by a tree of named // nodes, and the path to a given node must be produced by walking upward from // that node to a given root. type Builder struct { buf []byte start int needSep bool } // Reset resets the Builder to be empty. func (b *Builder) Reset() { b.start = len(b.buf) b.needSep = false } // Len returns the number of accumulated bytes. func (b *Builder) Len() int { return len(b.buf) - b.start } func (b *Builder) needToGrow(n int) bool { return b.start < n } func (b *Builder) grow(n int) { newLen := b.Len() + n var newCap int if len(b.buf) == 0 { newCap = 64 // arbitrary } else { newCap = 2 * len(b.buf) } for newCap < newLen { newCap *= 2 if newCap == 0 { panic(fmt.Sprintf("required length (%d) causes buffer size to overflow", newLen)) } } newBuf := make([]byte, newCap) copy(newBuf[newCap-b.Len():], b.buf[b.start:]) b.start += newCap - len(b.buf) b.buf = newBuf } // PrependComponent prepends the given path component to b's buffer. A path // separator is automatically inserted if appropriate. func (b *Builder) PrependComponent(pc string) { if b.needSep { b.PrependByte('/') } b.PrependString(pc) b.needSep = true } // PrependString prepends the given string to b's buffer. func (b *Builder) PrependString(str string) { if b.needToGrow(len(str)) { b.grow(len(str)) } b.start -= len(str) copy(b.buf[b.start:], str) } // PrependByte prepends the given byte to b's buffer. 
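//
// For context, an illustrative sketch of how a Builder assembles a pathname
// while walking from a leaf toward the root (the component names are
// arbitrary examples):
//
//    var b fspath.Builder
//    b.PrependComponent("file.txt") // leaf first
//    b.PrependComponent("dir")
//    b.PrependByte('/')             // finally mark the path as absolute
//    _ = b.String()                 // "/dir/file.txt"
//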
func (b *Builder) PrependByte(c byte) { if b.needToGrow(1) { b.grow(1) } b.start-- b.buf[b.start] = c } // AppendString appends the given string to b's buffer. func (b *Builder) AppendString(str string) { if b.needToGrow(len(str)) { b.grow(len(str)) } oldStart := b.start b.start -= len(str) copy(b.buf[b.start:], b.buf[oldStart:]) copy(b.buf[len(b.buf)-len(str):], str) } // String returns the accumulated string. No other methods should be called // after String. func (b *Builder) String() string { return gohacks.StringFromImmutableBytes(b.buf[b.start:]) } golang-gvisor-gvisor-0.0~20240729.0/pkg/fspath/fspath.go000066400000000000000000000126041465435605700225300ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package fspath provides efficient tools for working with file paths in // Linux-compatible filesystem implementations. package fspath import ( "strings" ) const pathSep = '/' // Parse parses a pathname as described by path_resolution(7), except that // empty pathnames will be parsed successfully to a Path for which // Path.Absolute == Path.Dir == Path.HasComponents() == false. (This is // necessary to support AT_EMPTY_PATH.) func Parse(pathname string) Path { if len(pathname) == 0 { return Path{} } // Skip leading path separators. i := 0 for pathname[i] == pathSep { i++ if i == len(pathname) { // pathname consists entirely of path separators. return Path{ Absolute: true, Dir: true, } } } // Skip trailing path separators. This is required by Iterator.Next. This // loop is guaranteed to terminate with j >= 0 because otherwise the // pathname would consist entirely of path separators, so we would have // returned above. j := len(pathname) - 1 for pathname[j] == pathSep { j-- } // Find the end of the first path component. firstEnd := i + 1 for firstEnd != len(pathname) && pathname[firstEnd] != pathSep { firstEnd++ } return Path{ Begin: Iterator{ partialPathname: pathname[i : j+1], end: firstEnd - i, }, Absolute: i != 0, Dir: j != len(pathname)-1, } } // Path contains the information contained in a pathname string. // // Path is copyable by value. The zero value for Path is equivalent to // fspath.Parse(""), i.e. the empty path. type Path struct { // Begin is an iterator to the first path component in the relative part of // the path. // // Path doesn't store information about path components after the first // since this would require allocation. Begin Iterator // If true, the path is absolute, such that lookup should begin at the // filesystem root. If false, the path is relative, such that where lookup // begins is unspecified. Absolute bool // If true, the pathname contains trailing path separators, so the last // path component must exist and resolve to a directory. Dir bool } // String returns a pathname string equivalent to p. Note that the returned // string is not necessarily equal to the string p was parsed from; in // particular, redundant path separators will not be present. 
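//
// For orientation, an illustrative parse-and-walk sketch (the input pathname
// is an arbitrary example):
//
//    p := fspath.Parse("//usr//bin/")
//    // p.Absolute == true, p.Dir == true
//    for it := p.Begin; it.Ok(); it = it.Next() {
//        _ = it.String() // "usr", then "bin"
//    }
//    _ = p.String() // "/usr/bin/"
//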
func (p Path) String() string { var b strings.Builder if p.Absolute { b.WriteByte(pathSep) } sep := false for pit := p.Begin; pit.Ok(); pit = pit.Next() { if sep { b.WriteByte(pathSep) } b.WriteString(pit.String()) sep = true } // Don't return "//" for Parse("/"). if p.Dir && p.Begin.Ok() { b.WriteByte(pathSep) } return b.String() } // HasComponents returns true if p contains a non-zero number of path // components. func (p Path) HasComponents() bool { return p.Begin.Ok() } // An Iterator represents either a path component in a Path or a terminal // iterator indicating that the end of the path has been reached. // // Iterator is immutable and copyable by value. The zero value of Iterator is // valid, and represents a terminal iterator. type Iterator struct { // partialPathname is a substring of the original pathname beginning at the // start of the represented path component and ending immediately after the // end of the last path component in the pathname. If partialPathname is // empty, the PathnameIterator is terminal. // // See TestParseIteratorPartialPathnames in fspath_test.go for a worked // example. partialPathname string // end is the offset into partialPathname of the first byte after the end // of the represented path component. end int } // Ok returns true if it is not terminal. func (it Iterator) Ok() bool { return len(it.partialPathname) != 0 } // String returns the path component represented by it. // // Preconditions: it.Ok(). func (it Iterator) String() string { return it.partialPathname[:it.end] } // Next returns an iterator to the path component after it. If it is the last // component in the path, Next returns a terminal iterator. // // Preconditions: it.Ok(). func (it Iterator) Next() Iterator { if it.end == len(it.partialPathname) { // End of the path. return Iterator{} } // Skip path separators. Since Parse trims trailing path separators, if we // aren't at the end of the path, there is definitely another path // component. i := it.end + 1 for { if it.partialPathname[i] != pathSep { break } i++ } nextPartialPathname := it.partialPathname[i:] // Find the end of this path component. nextEnd := 1 for nextEnd < len(nextPartialPathname) && nextPartialPathname[nextEnd] != pathSep { nextEnd++ } return Iterator{ partialPathname: nextPartialPathname, end: nextEnd, } } // NextOk is equivalent to it.Next().Ok(), but is faster. // // Preconditions: it.Ok(). func (it Iterator) NextOk() bool { return it.end != len(it.partialPathname) } golang-gvisor-gvisor-0.0~20240729.0/pkg/fspath/fspath_state_autogen.go000066400000000000000000000000701465435605700254440ustar00rootroot00000000000000// automatically generated by stateify. package fspath golang-gvisor-gvisor-0.0~20240729.0/pkg/fsutil/000077500000000000000000000000001465435605700207325ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/fsutil/fsutil.go000066400000000000000000000031051465435605700225660ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// Package fsutil contains filesystem utilities that can be shared between the // sentry and other sandbox components. package fsutil import "golang.org/x/sys/unix" // DirentHandler is a function that handles a dirent. type DirentHandler func(ino uint64, off int64, ftype uint8, name string, reclen uint16) // ForEachDirent retrieves all dirents from dirfd using getdents64(2) and // invokes handleDirent on them. func ForEachDirent(dirfd int, handleDirent DirentHandler) error { var direntsBuf [8192]byte for { n, err := unix.Getdents(dirfd, direntsBuf[:]) if err != nil { return err } if n <= 0 { return nil } ParseDirents(direntsBuf[:n], handleDirent) } } // DirentNames retrieves all dirents from dirfd using getdents64(2) and returns // all the recorded dirent names. func DirentNames(dirfd int) ([]string, error) { var names []string err := ForEachDirent(dirfd, func(_ uint64, _ int64, _ uint8, name string, _ uint16) { names = append(names, name) }) return names, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/fsutil/fsutil_amd64_unsafe.go000066400000000000000000000024031465435605700251220ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package fsutil import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/syserr" ) // StatAt is a convenience wrapper around newfstatat(2). func StatAt(dirFd int, name string) (unix.Stat_t, error) { nameBytes, err := unix.BytePtrFromString(name) if err != nil { return unix.Stat_t{}, err } namePtr := unsafe.Pointer(nameBytes) var stat unix.Stat_t statPtr := unsafe.Pointer(&stat) if _, _, errno := unix.Syscall6( unix.SYS_NEWFSTATAT, uintptr(dirFd), uintptr(namePtr), uintptr(statPtr), unix.AT_SYMLINK_NOFOLLOW, 0, 0); errno != 0 { return unix.Stat_t{}, syserr.FromHost(errno).ToError() } return stat, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/fsutil/fsutil_amd64_unsafe_state_autogen.go000066400000000000000000000001321465435605700300410ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 // +build amd64 package fsutil golang-gvisor-gvisor-0.0~20240729.0/pkg/fsutil/fsutil_arm64_unsafe.go000066400000000000000000000023751465435605700251500ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package fsutil import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/syserr" ) // StatAt is a convenience wrapper around fstatat(2). 
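// The lookup does not follow symlinks (AT_SYMLINK_NOFOLLOW).
//
// For orientation, an illustrative sketch (dirFD and the child name are
// arbitrary examples):
//
//    stat, err := fsutil.StatAt(dirFD, "child")
//    if err != nil { /* handle error */ }
//    _ = stat.Ino
//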
func StatAt(dirFd int, name string) (unix.Stat_t, error) { nameBytes, err := unix.BytePtrFromString(name) if err != nil { return unix.Stat_t{}, err } namePtr := unsafe.Pointer(nameBytes) var stat unix.Stat_t statPtr := unsafe.Pointer(&stat) if _, _, errno := unix.Syscall6( unix.SYS_FSTATAT, uintptr(dirFd), uintptr(namePtr), uintptr(statPtr), unix.AT_SYMLINK_NOFOLLOW, 0, 0); errno != 0 { return unix.Stat_t{}, syserr.FromHost(errno).ToError() } return stat, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/fsutil/fsutil_arm64_unsafe_state_autogen.go000066400000000000000000000001321465435605700300570ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package fsutil golang-gvisor-gvisor-0.0~20240729.0/pkg/fsutil/fsutil_state_autogen.go000066400000000000000000000000701465435605700255060ustar00rootroot00000000000000// automatically generated by stateify. package fsutil golang-gvisor-gvisor-0.0~20240729.0/pkg/fsutil/fsutil_unsafe.go000066400000000000000000000064701465435605700241370ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fsutil import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/syserr" ) // UnixDirentMaxSize is the maximum size of unix.Dirent in bytes. var UnixDirentMaxSize = int(unsafe.Sizeof(unix.Dirent{})) // Utimensat is a convenience wrapper to make the utimensat(2) syscall. It // additionally handles empty name. func Utimensat(dirFd int, name string, times [2]unix.Timespec, flags int) error { // utimensat(2) doesn't accept empty name, instead name must be nil to make it // operate directly on 'dirFd' unlike other *at syscalls. var namePtr unsafe.Pointer if name != "" { nameBytes, err := unix.BytePtrFromString(name) if err != nil { return err } namePtr = unsafe.Pointer(nameBytes) } timesPtr := unsafe.Pointer(×[0]) if _, _, errno := unix.Syscall6( unix.SYS_UTIMENSAT, uintptr(dirFd), uintptr(namePtr), uintptr(timesPtr), uintptr(flags), 0, 0); errno != 0 { return syserr.FromHost(errno).ToError() } return nil } // RenameAt is a convenience wrapper to make the renameat(2) syscall. It // additionally handles empty names. func RenameAt(oldDirFD int, oldName string, newDirFD int, newName string) error { var oldNamePtr unsafe.Pointer if oldName != "" { nameBytes, err := unix.BytePtrFromString(oldName) if err != nil { return err } oldNamePtr = unsafe.Pointer(nameBytes) } var newNamePtr unsafe.Pointer if newName != "" { nameBytes, err := unix.BytePtrFromString(newName) if err != nil { return err } newNamePtr = unsafe.Pointer(nameBytes) } if _, _, errno := unix.Syscall6( unix.SYS_RENAMEAT, uintptr(oldDirFD), uintptr(oldNamePtr), uintptr(newDirFD), uintptr(newNamePtr), 0, 0); errno != 0 { return syserr.FromHost(errno).ToError() } return nil } // ParseDirents parses dirents from buf. buf must have been populated by // getdents64(2) syscall. It calls the handleDirent callback for each dirent. 
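//
// For orientation, an illustrative sketch of the shared DirentHandler shape,
// shown here via ForEachDirent (dirFD is an arbitrary example descriptor):
//
//    err := fsutil.ForEachDirent(dirFD, func(ino uint64, off int64, ftype uint8, name string, reclen uint16) {
//        // Inspect each entry; "." and ".." are filtered out before the
//        // handler is invoked.
//    })
//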
func ParseDirents(buf []byte, handleDirent DirentHandler) { for len(buf) > 0 { // Interpret the buf populated by unix.Getdents as unix.Dirent. dirent := *(*unix.Dirent)(unsafe.Pointer(&buf[0])) // Advance buf for the next dirent. buf = buf[dirent.Reclen:] // Extracting the name is pretty tedious... var nameBuf [unix.NAME_MAX]byte var nameLen int for i := 0; i < len(dirent.Name); i++ { // The name is null terminated. if dirent.Name[i] == 0 { nameLen = i break } nameBuf[i] = byte(dirent.Name[i]) } name := string(nameBuf[:nameLen]) // Skip `.` and `..` entries. It is anyways ignored by the client. We also // don't want to leak information about `..`. if name == "." || name == ".." { continue } // Deliver results to caller. handleDirent(dirent.Ino, dirent.Off, dirent.Type, name, dirent.Reclen) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/fsutil/fsutil_unsafe_state_autogen.go000066400000000000000000000000701465435605700270470ustar00rootroot00000000000000// automatically generated by stateify. package fsutil golang-gvisor-gvisor-0.0~20240729.0/pkg/gohacks/000077500000000000000000000000001465435605700210435ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/gohacks/linkname_go113_unsafe.go000066400000000000000000000027471465435605700254550ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build go1.13 // //go:linkname directives type-checked by checklinkname. Any other // non-linkname assumptions outside the Go 1 compatibility guarantee should // have an accompanied vet check or version guard build tag. // Package gohacks contains utilities for subverting the Go compiler. package gohacks import ( "unsafe" ) // Note that go:linkname silently doesn't work if the local name is exported, // necessitating an indirection for exported functions. // Memmove is runtime.memmove, exported for SeqAtomicLoad/SeqAtomicTryLoad. // //go:nosplit func Memmove(to, from unsafe.Pointer, n uintptr) { memmove(to, from, n) } //go:linkname memmove runtime.memmove //go:noescape func memmove(to, from unsafe.Pointer, n uintptr) // Nanotime is runtime.nanotime. // //go:nosplit func Nanotime() int64 { return nanotime() } //go:linkname nanotime runtime.nanotime //go:noescape func nanotime() int64 golang-gvisor-gvisor-0.0~20240729.0/pkg/gohacks/noescape_unsafe.go000066400000000000000000000023041465435605700245270ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package gohacks import ( "unsafe" ) // Noescape hides a pointer from escape analysis. Noescape is the identity // function but escape analysis doesn't think the output depends on the input. // Noescape is inlined and currently compiles down to zero instructions. // USE CAREFULLY! // // Noescape is copy/pasted from Go's runtime/stubs.go:noescape(), and is valid // as of Go 1.20. It is possible that this approach stops working in future // versions of the toolchain, at which point `p` may still escape. // //go:nosplit func Noescape(p unsafe.Pointer) unsafe.Pointer { x := uintptr(p) return unsafe.Pointer(x ^ 0) } golang-gvisor-gvisor-0.0~20240729.0/pkg/gohacks/slice_go113_unsafe.go000066400000000000000000000025521465435605700247500ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build go1.13 && !go1.20 // +build go1.13,!go1.20 // TODO(go.dev/issue/8422): Remove this once Go 1.19 is no longer supported, // and update callers to use unsafe.Slice directly. package gohacks import ( "unsafe" ) // sliceHeader is equivalent to reflect.SliceHeader, but represents the pointer // to the underlying array as unsafe.Pointer rather than uintptr, allowing // sliceHeaders to be directly converted to slice objects. type sliceHeader struct { Data unsafe.Pointer Len int Cap int } // Slice returns a slice whose underlying array starts at ptr an which length // and capacity are len. func Slice[T any](ptr *T, length int) []T { var s []T hdr := (*sliceHeader)(unsafe.Pointer(&s)) hdr.Data = unsafe.Pointer(ptr) hdr.Len = length hdr.Cap = length return s } golang-gvisor-gvisor-0.0~20240729.0/pkg/gohacks/slice_go120_unsafe.go000066400000000000000000000016501465435605700247440ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build go1.20 package gohacks import ( "unsafe" ) // Slice returns a slice whose underlying array starts at ptr an which length // and capacity are len. // // Slice is a wrapper around unsafe.Slice. Prefer to use unsafe.Slice directly // if possible. func Slice[T any](ptr *T, length int) []T { return unsafe.Slice(ptr, length) } golang-gvisor-gvisor-0.0~20240729.0/pkg/gohacks/string_go113_unsafe.go000066400000000000000000000035471465435605700251640ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build go1.13 && !go1.20 // +build go1.13,!go1.20 // TODO(go.dev/issue/8422): Remove this file once Go 1.19 is no longer // supported. package gohacks import ( "unsafe" ) // stringHeader is equivalent to reflect.StringHeader, but represents the // pointer to the underlying array as unsafe.Pointer rather than uintptr, // allowing StringHeaders to be directly converted to strings. type stringHeader struct { Data unsafe.Pointer Len int } // ImmutableBytesFromString is equivalent to []byte(s), except that it uses the // same memory backing s instead of making a heap-allocated copy. This is only // valid if the returned slice is never mutated. func ImmutableBytesFromString(s string) []byte { shdr := (*stringHeader)(unsafe.Pointer(&s)) return Slice((*byte)(shdr.Data), shdr.Len) } // StringFromImmutableBytes is equivalent to string(bs), except that it uses // the same memory backing bs instead of making a heap-allocated copy. This is // only valid if bs is never mutated after StringFromImmutableBytes returns. func StringFromImmutableBytes(bs []byte) string { // This is cheaper than messing with StringHeader and SliceHeader, which as // of this writing produces many dead stores of zeroes. Compare // strings.Builder.String(). return *(*string)(unsafe.Pointer(&bs)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/gohacks/string_go120_unsafe.go000066400000000000000000000024641465435605700251570ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build go1.20 package gohacks import ( "unsafe" ) // ImmutableBytesFromString is equivalent to []byte(s), except that it uses the // same memory backing s instead of making a heap-allocated copy. This is only // valid if the returned slice is never mutated. func ImmutableBytesFromString(s string) []byte { b := unsafe.StringData(s) return unsafe.Slice(b, len(s)) } // StringFromImmutableBytes is equivalent to string(bs), except that it uses // the same memory backing bs instead of making a heap-allocated copy. This is // only valid if bs is never mutated after StringFromImmutableBytes returns. func StringFromImmutableBytes(bs []byte) string { if len(bs) == 0 { return "" } return unsafe.String(&bs[0], len(bs)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/goid/000077500000000000000000000000001465435605700203465ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/goid/goid.go000066400000000000000000000015531465435605700216230ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package goid provides the Get function. package goid import ( _ "runtime" // For facts in assembly files. ) // goid returns the current goid, it is defined in assembly. func goid() int64 // Get returns the ID of the current goroutine. func Get() int64 { return goid() } golang-gvisor-gvisor-0.0~20240729.0/pkg/goid/goid_122_amd64.s000066400000000000000000000014751465435605700230420ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !go1.23 #include "textflag.h" #define GOID_OFFSET 152 // +checkoffset runtime g.goid // func goid() int64 TEXT ·goid(SB),NOSPLIT|NOFRAME,$0-8 MOVQ (TLS), R14 MOVQ GOID_OFFSET(R14), R14 MOVQ R14, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/goid/goid_122_arm64.s000066400000000000000000000015171465435605700230550ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !go1.23 #include "textflag.h" #define GOID_OFFSET 152 // +checkoffset runtime g.goid // func goid() int64 TEXT ·goid(SB),NOSPLIT,$0-8 MOVD g, R0 // g MOVD GOID_OFFSET(R0), R0 MOVD R0, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/goid/goid_123_amd64.s000066400000000000000000000014741465435605700230420ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
//go:build go1.23 #include "textflag.h" #define GOID_OFFSET 160 // +checkoffset runtime g.goid // func goid() int64 TEXT ·goid(SB),NOSPLIT|NOFRAME,$0-8 MOVQ (TLS), R14 MOVQ GOID_OFFSET(R14), R14 MOVQ R14, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/goid/goid_123_arm64.s000066400000000000000000000015161465435605700230550ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build go1.23 #include "textflag.h" #define GOID_OFFSET 160 // +checkoffset runtime g.goid // func goid() int64 TEXT ·goid(SB),NOSPLIT,$0-8 MOVD g, R0 // g MOVD GOID_OFFSET(R0), R0 MOVD R0, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/000077500000000000000000000000001465435605700212375ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/access_type.go000066400000000000000000000064271465435605700241010ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hostarch import "golang.org/x/sys/unix" // AccessType specifies memory access types. This is used for // setting mapping permissions, as well as communicating faults. // // +stateify savable type AccessType struct { // Read is read access. Read bool // Write is write access. Write bool // Execute is executable access. Execute bool } // String returns a pretty representation of access. This looks like the // familiar r-x, rw-, etc. and can be relied on as such. func (a AccessType) String() string { bits := [3]byte{'-', '-', '-'} if a.Read { bits[0] = 'r' } if a.Write { bits[1] = 'w' } if a.Execute { bits[2] = 'x' } return string(bits[:]) } // Any returns true iff at least one of Read, Write or Execute is true. func (a AccessType) Any() bool { return a.Read || a.Write || a.Execute } // Prot returns the system prot (unix.PROT_READ, etc.) for this access. func (a AccessType) Prot() int { var prot int if a.Read { prot |= unix.PROT_READ } if a.Write { prot |= unix.PROT_WRITE } if a.Execute { prot |= unix.PROT_EXEC } return prot } // SupersetOf returns true iff the access types in a are a superset of the // access types in other. func (a AccessType) SupersetOf(other AccessType) bool { if !a.Read && other.Read { return false } if !a.Write && other.Write { return false } if !a.Execute && other.Execute { return false } return true } // Intersect returns the access types set in both a and other. 
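//
// For example, a small sketch using the package-level convenience values
// defined at the end of this file:
//
//	requested := hostarch.ReadWrite
//	allowed := hostarch.ReadExecute
//	both := requested.Intersect(allowed) // only Read is set in both: "r--"
//	either := requested.Union(allowed)   // "rwx"
//	_ = both.SupersetOf(hostarch.Read)   // true
//	_ = either.Effective()               // still "rwx"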
func (a AccessType) Intersect(other AccessType) AccessType { return AccessType{ Read: a.Read && other.Read, Write: a.Write && other.Write, Execute: a.Execute && other.Execute, } } // Union returns the access types set in either a or other. func (a AccessType) Union(other AccessType) AccessType { return AccessType{ Read: a.Read || other.Read, Write: a.Write || other.Write, Execute: a.Execute || other.Execute, } } // Effective returns the set of effective access types allowed by a, even if // some types are not explicitly allowed. func (a AccessType) Effective() AccessType { // In Linux, Write and Execute access generally imply Read access. See // mm/mmap.c:protection_map. // // The notable exception is get_user_pages, which only checks against // the original vma flags. That said, most user memory accesses do not // use GUP. if a.Write || a.Execute { a.Read = true } return a } // Convenient access types. var ( NoAccess = AccessType{} Read = AccessType{Read: true} Write = AccessType{Write: true} Execute = AccessType{Execute: true} ReadWrite = AccessType{Read: true, Write: true} ReadExecute = AccessType{Read: true, Execute: true} AnyAccess = AccessType{Read: true, Write: true, Execute: true} ) golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/addr.go000066400000000000000000000071221465435605700225020ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hostarch import ( "fmt" ) // Addr represents an address in an unspecified address space. // // +stateify savable type Addr uintptr // AddLength adds the given length to start and returns the result. ok is true // iff adding the length did not overflow the range of Addr. // // Note: This function is usually used to get the end of an address range // defined by its start address and length. Since the resulting end is // exclusive, end == 0 is technically valid, and corresponds to a range that // extends to the end of the address space, but ok will be false. This isn't // expected to ever come up in practice. func (v Addr) AddLength(length uint64) (end Addr, ok bool) { end = v + Addr(length) // As of this writing (Go 1.21), addrAtLeast64b is required to prevent the // compiler from generating a tautological `length <= MaxUint64` check on // 64-bit architectures. ok = end >= v && (addrAtLeast64b || length <= uint64(^Addr(0))) return } // RoundDown is equivalent to function PageRoundDown. func (v Addr) RoundDown() Addr { return PageRoundDown(v) } // RoundUp is equivalent to function PageRoundUp. func (v Addr) RoundUp() (Addr, bool) { return PageRoundUp(v) } // MustRoundUp is equivalent to function MustPageRoundUp. func (v Addr) MustRoundUp() Addr { return MustPageRoundUp(v) } // HugeRoundDown is equivalent to function HugePageRoundDown. func (v Addr) HugeRoundDown() Addr { return HugePageRoundDown(v) } // HugeRoundUp is equivalent to function HugePageRoundUp. 
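//
// A brief sketch of the overflow-aware Addr helpers defined in this file,
// assuming the 4KiB base page size used by this package:
//
//	addr := hostarch.Addr(0x1001)
//	end, ok := addr.AddLength(0x2000) // end == 0x3001, ok == true
//	up, ok2 := addr.RoundUp()         // up == 0x2000, ok2 == true
//	down := addr.RoundDown()          // down == 0x1000
//	_, _, _ = end, up, down
//	_ = ok && ok2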
func (v Addr) HugeRoundUp() (Addr, bool) { return HugePageRoundUp(v) } // MustHugeRoundUp is equivalent to function MustHugePageRoundUp. func (v Addr) MustHugeRoundUp() Addr { return MustHugePageRoundUp(v) } // PageOffset is equivalent to function PageOffset, except that it casts the // result to uint64. func (v Addr) PageOffset() uint64 { return uint64(PageOffset(v)) } // IsPageAligned is equivalent to function IsPageAligned. func (v Addr) IsPageAligned() bool { return IsPageAligned(v) } // HugePageOffset is equivalent to function HugePageOffset. func (v Addr) HugePageOffset() uint64 { return uint64(HugePageOffset(v)) } // IsHugePageAligned is equivalent to function IsHugePageAligned. func (v Addr) IsHugePageAligned() bool { return IsHugePageAligned(v) } // AddrRange is a range of Addrs. // // type AddrRange // ToRange returns [v, v+length). func (v Addr) ToRange(length uint64) (AddrRange, bool) { end, ok := v.AddLength(length) return AddrRange{v, end}, ok } // IsPageAligned returns true if ar.Start.IsPageAligned() and // ar.End.IsPageAligned(). func (ar AddrRange) IsPageAligned() bool { return ar.Start.IsPageAligned() && ar.End.IsPageAligned() } // IsHugePageAligned returns true if ar.Start.IsHugePageAligned() and // ar.End.IsHugePageAligned(). func (ar AddrRange) IsHugePageAligned() bool { return ar.Start.IsHugePageAligned() && ar.End.IsHugePageAligned() } // String implements fmt.Stringer.String. func (ar AddrRange) String() string { return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End) } golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/addr_range.go000066400000000000000000000033401465435605700236540ustar00rootroot00000000000000package hostarch // A Range represents a contiguous range of T. // // +stateify savable type AddrRange struct { // Start is the inclusive start of the range. Start Addr // End is the exclusive end of the range. End Addr } // WellFormed returns true if r.Start <= r.End. All other methods on a Range // require that the Range is well-formed. // //go:nosplit func (r AddrRange) WellFormed() bool { return r.Start <= r.End } // Length returns the length of the range. // //go:nosplit func (r AddrRange) Length() Addr { return r.End - r.Start } // Contains returns true if r contains x. // //go:nosplit func (r AddrRange) Contains(x Addr) bool { return r.Start <= x && x < r.End } // Overlaps returns true if r and r2 overlap. // //go:nosplit func (r AddrRange) Overlaps(r2 AddrRange) bool { return r.Start < r2.End && r2.Start < r.End } // IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is // contained within r. // //go:nosplit func (r AddrRange) IsSupersetOf(r2 AddrRange) bool { return r.Start <= r2.Start && r.End >= r2.End } // Intersect returns a range consisting of the intersection between r and r2. // If r and r2 do not overlap, Intersect returns a range with unspecified // bounds, but for which Length() == 0. // //go:nosplit func (r AddrRange) Intersect(r2 AddrRange) AddrRange { if r.Start < r2.Start { r.Start = r2.Start } if r.End > r2.End { r.End = r2.End } if r.End < r.Start { r.End = r.Start } return r } // CanSplitAt returns true if it is legal to split a segment spanning the range // r at x; that is, splitting at x would produce two ranges, both of which have // non-zero length. 
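//
// For instance (a small sketch; AddrRanges are half-open [Start, End)):
//
//	a := hostarch.AddrRange{Start: 0x1000, End: 0x3000}
//	b := hostarch.AddrRange{Start: 0x2000, End: 0x4000}
//	_ = a.Overlaps(b)        // true
//	_ = a.Intersect(b)       // [0x2000, 0x3000)
//	_ = a.Contains(0x3000)   // false: End is exclusive
//	_ = a.CanSplitAt(0x2000) // true: both halves would be non-empty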
// //go:nosplit func (r AddrRange) CanSplitAt(x Addr) bool { return r.Contains(x) && r.Start < x } golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/addr_range_seq_unsafe.go000066400000000000000000000167271465435605700261020ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hostarch import ( "bytes" "fmt" "unsafe" "gvisor.dev/gvisor/pkg/gohacks" ) // An AddrRangeSeq represents a sequence of AddrRanges. // // AddrRangeSeqs are immutable and may be copied by value. The zero value of // AddrRangeSeq represents an empty sequence. // // An AddrRangeSeq may contain AddrRanges with a length of 0. This is necessary // since zero-length AddrRanges are significant to MM bounds checks. type AddrRangeSeq struct { // If length is 0, then the AddrRangeSeq represents no AddrRanges. // Invariants: data == 0; offset == 0; limit == 0. // // If length is 1, then the AddrRangeSeq represents the single // AddrRange{offset, offset+limit}. Invariants: data == 0. // // Otherwise, length >= 2, and the AddrRangeSeq represents the `length` // AddrRanges in the array of AddrRanges starting at address `data`, // starting at `offset` bytes into the first AddrRange and limited to the // following `limit` bytes. (AddrRanges after `limit` are still iterated, // but are truncated to a length of 0.) Invariants: data != 0; offset <= // data[0].Length(); limit > 0; offset+limit <= the combined length of all // AddrRanges in the array. data unsafe.Pointer length int offset Addr limit Addr } // AddrRangeSeqOf returns an AddrRangeSeq representing the single AddrRange ar. func AddrRangeSeqOf(ar AddrRange) AddrRangeSeq { return AddrRangeSeq{ length: 1, offset: ar.Start, limit: ar.Length(), } } // AddrRangeSeqFromSlice returns an AddrRangeSeq representing all AddrRanges in // slice. // // Whether the returned AddrRangeSeq shares memory with slice is unspecified; // clients should avoid mutating slices passed to AddrRangeSeqFromSlice. // // Preconditions: The combined length of all AddrRanges in slice <= // math.MaxInt64. func AddrRangeSeqFromSlice(slice []AddrRange) AddrRangeSeq { var limit int64 for _, ar := range slice { len64 := int64(ar.Length()) if len64 < 0 { panic(fmt.Sprintf("Length of AddrRange %v overflows int64", ar)) } sum := limit + len64 if sum < limit { panic(fmt.Sprintf("Total length of AddrRanges %v overflows int64", slice)) } limit = sum } return addrRangeSeqFromSliceLimited(slice, limit) } // Preconditions: // - The combined length of all AddrRanges in slice <= limit. // - limit >= 0. // - If len(slice) != 0, then limit > 0. func addrRangeSeqFromSliceLimited(slice []AddrRange, limit int64) AddrRangeSeq { switch len(slice) { case 0: return AddrRangeSeq{} case 1: return AddrRangeSeq{ length: 1, offset: slice[0].Start, limit: Addr(limit), } default: return AddrRangeSeq{ data: unsafe.Pointer(&slice[0]), length: len(slice), limit: Addr(limit), } } } // IsEmpty returns true if ars.NumRanges() == 0. 
// // Note that since AddrRangeSeq may contain AddrRanges with a length of zero, // an AddrRange representing 0 bytes (AddrRangeSeq.NumBytes() == 0) is not // necessarily empty. func (ars AddrRangeSeq) IsEmpty() bool { return ars.length == 0 } // NumRanges returns the number of AddrRanges in ars. func (ars AddrRangeSeq) NumRanges() int { return ars.length } // NumBytes returns the number of bytes represented by ars. func (ars AddrRangeSeq) NumBytes() int64 { return int64(ars.limit) } // Head returns the first AddrRange in ars. // // Preconditions: !ars.IsEmpty(). func (ars AddrRangeSeq) Head() AddrRange { if ars.length == 0 { panic("empty AddrRangeSeq") } if ars.length == 1 { return AddrRange{ars.offset, ars.offset + ars.limit} } ar := *(*AddrRange)(ars.data) ar.Start += ars.offset if ar.Length() > ars.limit { ar.End = ar.Start + ars.limit } return ar } // Tail returns an AddrRangeSeq consisting of all AddrRanges in ars after the // first. // // Preconditions: !ars.IsEmpty(). func (ars AddrRangeSeq) Tail() AddrRangeSeq { if ars.length == 0 { panic("empty AddrRangeSeq") } if ars.length == 1 { return AddrRangeSeq{} } return ars.externalTail() } // Preconditions: ars.length >= 2. func (ars AddrRangeSeq) externalTail() AddrRangeSeq { data := (*AddrRange)(ars.data) headLen := data.Length() - ars.offset var tailLimit int64 if ars.limit > headLen { tailLimit = int64(ars.limit - headLen) } extSlice := gohacks.Slice(data, ars.length) return addrRangeSeqFromSliceLimited(extSlice[1:], tailLimit) } // DropFirst returns an AddrRangeSeq equivalent to ars, but with the first n // bytes omitted. If n > ars.NumBytes(), DropFirst returns an empty // AddrRangeSeq. // // If !ars.IsEmpty() and ars.Head().Length() == 0, DropFirst will always omit // at least ars.Head(), even if n == 0. This guarantees that the basic pattern // of: // // for !ars.IsEmpty() { // n, err = doIOWith(ars.Head()) // if err != nil { // return err // } // ars = ars.DropFirst(n) // } // // works even in the presence of zero-length AddrRanges. // // Preconditions: n >= 0. func (ars AddrRangeSeq) DropFirst(n int) AddrRangeSeq { if n < 0 { panic(fmt.Sprintf("invalid n: %d", n)) } return ars.DropFirst64(int64(n)) } // DropFirst64 is equivalent to DropFirst but takes an int64. func (ars AddrRangeSeq) DropFirst64(n int64) AddrRangeSeq { if n < 0 { panic(fmt.Sprintf("invalid n: %d", n)) } if Addr(n) > ars.limit { return AddrRangeSeq{} } // Handle initial empty AddrRange. switch ars.length { case 0: return AddrRangeSeq{} case 1: if ars.limit == 0 { return AddrRangeSeq{} } default: if rawHeadLen := (*AddrRange)(ars.data).Length(); ars.offset == rawHeadLen { ars = ars.externalTail() } } for n != 0 { // Calling ars.Head() here is surprisingly expensive, so inline getting // the head's length. var headLen Addr if ars.length == 1 { headLen = ars.limit } else { headLen = (*AddrRange)(ars.data).Length() - ars.offset } if Addr(n) < headLen { // Dropping ends partway through the head AddrRange. ars.offset += Addr(n) ars.limit -= Addr(n) return ars } n -= int64(headLen) ars = ars.Tail() } return ars } // TakeFirst returns an AddrRangeSeq equivalent to ars, but iterating at most n // bytes. TakeFirst never removes AddrRanges from ars; AddrRanges beyond the // first n bytes are reduced to a length of zero, but will still be iterated. // // Preconditions: n >= 0. 
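//
// A small sketch tying the sequence helpers together (the addresses are
// arbitrary):
//
//	ars := hostarch.AddrRangeSeqFromSlice([]hostarch.AddrRange{
//		{Start: 0x1000, End: 0x2000}, // 0x1000 bytes
//		{Start: 0x8000, End: 0x8800}, // 0x800 bytes
//	})
//	ars = ars.TakeFirst(0x1400) // second range is now limited to 0x400 bytes
//	for !ars.IsEmpty() {
//		ar := ars.Head()
//		// ... use ar ...
//		ars = ars.DropFirst64(int64(ar.Length()))
//	}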
func (ars AddrRangeSeq) TakeFirst(n int) AddrRangeSeq { if n < 0 { panic(fmt.Sprintf("invalid n: %d", n)) } return ars.TakeFirst64(int64(n)) } // TakeFirst64 is equivalent to TakeFirst but takes an int64. func (ars AddrRangeSeq) TakeFirst64(n int64) AddrRangeSeq { if n < 0 { panic(fmt.Sprintf("invalid n: %d", n)) } if ars.limit > Addr(n) { ars.limit = Addr(n) } return ars } // String implements fmt.Stringer.String. func (ars AddrRangeSeq) String() string { // This is deliberately chosen to be the same as fmt's automatic stringer // for []AddrRange. var buf bytes.Buffer buf.WriteByte('[') var sep string for !ars.IsEmpty() { buf.WriteString(sep) sep = " " buf.WriteString(ars.Head().String()) ars = ars.Tail() } buf.WriteByte(']') return buf.String() } golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/addr_unsafe.go000066400000000000000000000013351465435605700240430ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hostarch import ( "unsafe" ) // This is used in addr.go:Addr.AddLength(). const addrAtLeast64b = unsafe.Sizeof(Addr(0)) >= 8 golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/hostarch.go000066400000000000000000000003561465435605700234050ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Package hostarch contains host arch address operations for user memory. package hostarch golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/hostarch_arm64.go000066400000000000000000000032471465435605700244200ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package hostarch import ( "encoding/binary" "golang.org/x/sys/unix" ) const ( // PageSize is the system page size. // arm64 support 4K/16K/64K page size, // which can be get by unix.Getpagesize(). // Currently, only 4K page size is supported. PageSize = 1 << PageShift // HugePageSize is the system huge page size. HugePageSize = 1 << HugePageShift // CacheLineSize is the size of the cache line. CacheLineSize = 1 << CacheLineShift // PageShift is the binary log of the system page size. PageShift = 12 // HugePageShift is the binary log of the system huge page size. // Should be calculated by "PageShift + (PageShift - 3)" // when multiple page size support is ready. HugePageShift = 21 // CacheLineShift is the binary log of the cache line size. 
CacheLineShift = 6 ) var ( // ByteOrder is the native byte order (little endian). ByteOrder = binary.LittleEndian ) func init() { // Make sure the page size is 4K on arm64 platform. if size := unix.Getpagesize(); size != PageSize { panic("Only 4K page size is supported on arm64!") } } golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/hostarch_arm64_state_autogen.go000066400000000000000000000001341465435605700273320ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package hostarch golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/hostarch_state_autogen.go000066400000000000000000000032061465435605700263240ustar00rootroot00000000000000// automatically generated by stateify. package hostarch import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (a *AccessType) StateTypeName() string { return "pkg/hostarch.AccessType" } func (a *AccessType) StateFields() []string { return []string{ "Read", "Write", "Execute", } } func (a *AccessType) beforeSave() {} // +checklocksignore func (a *AccessType) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.Read) stateSinkObject.Save(1, &a.Write) stateSinkObject.Save(2, &a.Execute) } func (a *AccessType) afterLoad(context.Context) {} // +checklocksignore func (a *AccessType) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.Read) stateSourceObject.Load(1, &a.Write) stateSourceObject.Load(2, &a.Execute) } func (v *Addr) StateTypeName() string { return "pkg/hostarch.Addr" } func (v *Addr) StateFields() []string { return nil } func (r *AddrRange) StateTypeName() string { return "pkg/hostarch.AddrRange" } func (r *AddrRange) StateFields() []string { return []string{ "Start", "End", } } func (r *AddrRange) beforeSave() {} // +checklocksignore func (r *AddrRange) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Start) stateSinkObject.Save(1, &r.End) } func (r *AddrRange) afterLoad(context.Context) {} // +checklocksignore func (r *AddrRange) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Start) stateSourceObject.Load(1, &r.End) } func init() { state.Register((*AccessType)(nil)) state.Register((*Addr)(nil)) state.Register((*AddrRange)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/hostarch_unsafe_state_autogen.go000066400000000000000000000000721465435605700276630ustar00rootroot00000000000000// automatically generated by stateify. package hostarch golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/hostarch_x86.go000066400000000000000000000023701465435605700241100ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || 386 // +build amd64 386 package hostarch import "encoding/binary" const ( // PageSize is the system page size. PageSize = 1 << PageShift // HugePageSize is the system huge page size. HugePageSize = 1 << HugePageShift // CacheLineSize is the size of the cache line. 
CacheLineSize = 1 << CacheLineShift // PageShift is the binary log of the system page size. PageShift = 12 // HugePageShift is the binary log of the system huge page size. HugePageShift = 21 // CacheLineShift is the binary log of the cache line size. CacheLineShift = 6 ) var ( // ByteOrder is the native byte order (little endian). ByteOrder = binary.LittleEndian ) golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/hostarch_x86_state_autogen.go000066400000000000000000000001471465435605700270320ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 || 386 // +build amd64 386 package hostarch golang-gvisor-gvisor-0.0~20240729.0/pkg/hostarch/sizes_util.go000066400000000000000000000061211465435605700237600ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package hostarch // Masks often used when working with alignment in constant expressions. const ( PageMask = PageSize - 1 HugePageMask = HugePageSize - 1 CacheLineMask = CacheLineSize - 1 ) type bytecount interface { ~uint | ~uint16 | ~uint32 | ~uint64 | ~uintptr } type hugebytecount interface { ~uint | ~uint32 | ~uint64 | ~uintptr } // PageRoundDown returns x rounded down to the nearest multiple of PageSize. func PageRoundDown[T bytecount](x T) T { return x &^ PageMask } // PageRoundUp returns x rounded up to the nearest multiple of PageSize. ok is // true iff rounding up does not overflow the range of T. func PageRoundUp[T bytecount](x T) (val T, ok bool) { val = PageRoundDown(x + PageMask) ok = val >= x return } // MustPageRoundUp is equivalent to PageRoundUp, but panics if rounding up // overflows. func MustPageRoundUp[T bytecount](x T) T { val, ok := PageRoundUp(x) if !ok { panic("PageRoundUp overflows") } return val } // PageOffset returns the offset of x into its containing page. func PageOffset[T bytecount](x T) T { return x & PageMask } // IsPageAligned returns true if x is a multiple of PageSize. func IsPageAligned[T bytecount](x T) bool { return PageOffset(x) == 0 } // ToPagesRoundUp returns (the number of pages equal to x bytes rounded up, // true). If rounding x up to a multiple of PageSize overflows the range of T, // ToPagesRoundUp returns (unspecified, false). func ToPagesRoundUp[T bytecount](x T) (T, bool) { y := x + PageMask if y < x { return x, false } return y / PageSize, true } // HugePageRoundDown returns x rounded down to the nearest multiple of // HugePageSize. func HugePageRoundDown[T hugebytecount](x T) T { return x &^ HugePageMask } // HugePageRoundUp returns x rounded up to the nearest multiple of // HugePageSize. ok is true iff rounding up does not overflow the range of T. func HugePageRoundUp[T hugebytecount](x T) (val T, ok bool) { val = HugePageRoundDown(x + HugePageMask) ok = val >= x return } // MustHugePageRoundUp is equivalent to HugePageRoundUp, but panics if rounding // up overflows. func MustHugePageRoundUp[T hugebytecount](x T) T { val, ok := HugePageRoundUp(x) if !ok { panic("HugePageRoundUp overflows") } return val } // HugePageOffset returns the offset of x into its containing page. func HugePageOffset[T hugebytecount](x T) T { return x & HugePageMask } // IsHugePageAligned returns true if x is a multiple of HugePageSize. func IsHugePageAligned[T hugebytecount](x T) bool { return HugePageOffset(x) == 0 } // CacheLineRoundDown returns the offset rounded down to the nearest multiple // of CacheLineSize. 
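//
// Concrete values for the 4KiB page size and 64-byte cache line size defined
// by this package:
//
//	hostarch.PageRoundDown(uint64(0x1234))          // 0x1000
//	val, ok := hostarch.PageRoundUp(uint64(0x1001)) // val == 0x2000, ok == true
//	hostarch.IsPageAligned(uint64(0x3000))          // true
//	n, ok2 := hostarch.ToPagesRoundUp(uint64(8193)) // n == 3, ok2 == true
//	hostarch.CacheLineRoundDown(uint32(100))        // 64
//	_, _ = val, n
//	_ = ok && ok2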
func CacheLineRoundDown[T bytecount](x T) T { return x &^ CacheLineMask } // CacheLineRoundUp returns the offset rounded up to the nearest multiple of // CacheLineSize. ok is true iff rounding up does not overflow the range of T. func CacheLineRoundUp[T bytecount](x T) (val T, ok bool) { val = CacheLineRoundDown(x + CacheLineMask) ok = val >= x return } golang-gvisor-gvisor-0.0~20240729.0/pkg/hostos/000077500000000000000000000000001465435605700207435ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/hostos/hostos.go000066400000000000000000000043661465435605700226220ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package hostos contains utility functions for getting information about the host OS. package hostos import ( "fmt" "regexp" "strings" "sync" "golang.org/x/mod/semver" "golang.org/x/sys/unix" ) // Version represents a semantic version of the form "%d.%d[.%d]". type Version struct { version string } // AtLeast returns whether vr is at least version major.minor. func (vr Version) AtLeast(major, minor int) bool { return semver.Compare(vr.version, fmt.Sprintf("v%d.%d", major, minor)) >= 0 } // LessThan returns whether vr is less than version major.minor. func (vr Version) LessThan(major, minor int) bool { return !vr.AtLeast(major, minor) } // String implements fmt.Stringer. func (vr Version) String() string { if vr.version == "" { return "unknown" } // Omit the "v" prefix required by semver. return vr.version[1:] } // These values are effectively local to KernelVersion, but kept here so as to // work with sync.Once. var ( semVersion Version unameErr error once sync.Once ) // KernelVersion returns the version of the kernel using uname(). func KernelVersion() (Version, error) { once.Do(func() { var utsname unix.Utsname if err := unix.Uname(&utsname); err != nil { unameErr = err return } var sb strings.Builder for _, b := range utsname.Release { if b == 0 { break } sb.WriteByte(byte(b)) } versionRegexp := regexp.MustCompile(`[0-9]+\.[0-9]+(\.[0-9]+)?`) version := "v" + string(versionRegexp.Find([]byte(sb.String()))) if !semver.IsValid(version) { unameErr = fmt.Errorf("invalid version found in release %q", sb.String()) return } semVersion.version = version }) return semVersion, unameErr } golang-gvisor-gvisor-0.0~20240729.0/pkg/hostos/hostos_state_autogen.go000066400000000000000000000000701465435605700255300ustar00rootroot00000000000000// automatically generated by stateify. package hostos golang-gvisor-gvisor-0.0~20240729.0/pkg/hosttid/000077500000000000000000000000001465435605700211025ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/hosttid/hosttid.go000066400000000000000000000020231465435605700231040ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package hosttid provides the Current function. package hosttid import ( "runtime" ) // Dummy references for facts. const _ = runtime.Compiler // Current returns the caller's host thread ID. Unless runtime.LockOSThread() // is in effect, this function is inherently racy since the Go runtime may // migrate the calling goroutine to another thread at any time. // // Current is equivalent to unix.Gettid(), but faster. func Current() uint64 golang-gvisor-gvisor-0.0~20240729.0/pkg/hosttid/hosttid_amd64.s000066400000000000000000000017201465435605700237370ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 #include "textflag.h" #define M_OFFSET 48 // +checkoffset runtime g.m #define PROCID_OFFSET 72 // +checkoffset runtime m.procid TEXT ·Current(SB),NOSPLIT|NOFRAME,$0-8 // procid is in getg().m.procid. MOVQ TLS, AX MOVQ 0(AX)(TLS*1), AX MOVQ M_OFFSET(AX), AX // gp.m MOVQ PROCID_OFFSET(AX), AX // mp.procid MOVQ AX, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/hosttid/hosttid_arm64.s000066400000000000000000000016671465435605700237670ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 #include "textflag.h" #define M_OFFSET 48 // +checkoffset runtime g.m #define PROCID_OFFSET 72 // +checkoffset runtime m.procid TEXT ·Current(SB),NOSPLIT,$0-8 // procid is in getg().m.procid. MOVD g, R0 // g MOVD M_OFFSET(R0), R0 // gp.m MOVD PROCID_OFFSET(R0), R0 // mp.procid MOVD R0, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/hosttid/hosttid_state_autogen.go000066400000000000000000000000711465435605700260270ustar00rootroot00000000000000// automatically generated by stateify. package hosttid golang-gvisor-gvisor-0.0~20240729.0/pkg/ilist/000077500000000000000000000000001465435605700205505ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/ilist/ilist_state_autogen.go000066400000000000000000000024651465435605700251540ustar00rootroot00000000000000// automatically generated by stateify. 
package ilist import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (l *List) StateTypeName() string { return "pkg/ilist.List" } func (l *List) StateFields() []string { return []string{ "head", "tail", } } func (l *List) beforeSave() {} // +checklocksignore func (l *List) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *List) afterLoad(context.Context) {} // +checklocksignore func (l *List) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *Entry) StateTypeName() string { return "pkg/ilist.Entry" } func (e *Entry) StateFields() []string { return []string{ "next", "prev", } } func (e *Entry) beforeSave() {} // +checklocksignore func (e *Entry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *Entry) afterLoad(context.Context) {} // +checklocksignore func (e *Entry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func init() { state.Register((*List)(nil)) state.Register((*Entry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/ilist/interface_list.go000066400000000000000000000124131465435605700240730ustar00rootroot00000000000000package ilist // Linker is the interface that objects must implement if they want to be added // to and/or removed from List objects. // // N.B. When substituted in a template instantiation, Linker doesn't need to // be an interface, and in most cases won't be. type Linker interface { Next() Element Prev() Element SetNext(Element) SetPrev(Element) } // Element the item that is used at the API level. // // N.B. Like Linker, this is unlikely to be an interface in most cases. type Element interface { Linker } // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type ElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (ElementMapper) linkerFor(elem Element) Linker { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type List struct { head Element tail Element } // Reset resets list l to the empty state. func (l *List) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *List) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *List) Front() Element { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *List) Back() Element { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. 
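//
// A short usage sketch (the waiter type is hypothetical): a struct embeds
// Entry so its pointers can be linked into a List, iterated, and counted.
//
//	type waiter struct {
//		ilist.Entry
//		id int
//	}
//
//	var q ilist.List
//	q.PushBack(&waiter{id: 1})
//	q.PushBack(&waiter{id: 2})
//	n := q.Len() // 2, computed by walking the list
//	for e := q.Front(); e != nil; e = e.Next() {
//		_ = e.(*waiter).id
//	}
//	_ = n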
// //go:nosplit func (l *List) Len() (count int) { for e := l.Front(); e != nil; e = (ElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *List) PushFront(e Element) { linker := ElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { ElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *List) PushFrontList(m *List) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { ElementMapper{}.linkerFor(l.head).SetPrev(m.tail) ElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *List) PushBack(e Element) { linker := ElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { ElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *List) PushBackList(m *List) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { ElementMapper{}.linkerFor(l.tail).SetNext(m.head) ElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *List) InsertAfter(b, e Element) { bLinker := ElementMapper{}.linkerFor(b) eLinker := ElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { ElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *List) InsertBefore(a, e Element) { aLinker := ElementMapper{}.linkerFor(a) eLinker := ElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { ElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *List) Remove(e Element) { linker := ElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { ElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { ElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type Entry struct { next Element prev Element } // Next returns the entry that follows e in the list. // //go:nosplit func (e *Entry) Next() Element { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *Entry) Prev() Element { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *Entry) SetNext(elem Element) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. 
// //go:nosplit func (e *Entry) SetPrev(elem Element) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/linewriter/000077500000000000000000000000001465435605700216105ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/linewriter/linewriter.go000066400000000000000000000035541465435605700243320ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package linewriter provides an io.Writer which calls an emitter on each line. package linewriter import ( "bytes" "gvisor.dev/gvisor/pkg/sync" ) // Writer is an io.Writer which buffers input, flushing // individual lines through an emitter function. type Writer struct { // the mutex locks buf. sync.Mutex // buf holds the data we haven't emitted yet. buf bytes.Buffer // emit is used to flush individual lines. emit func(p []byte) } // NewWriter creates a Writer which emits using emitter. // The emitter must not retain p. It may change after emitter returns. func NewWriter(emitter func(p []byte)) *Writer { return &Writer{emit: emitter} } // Write implements io.Writer.Write. // It calls emit on each line of input, not including the newline. // Write may be called concurrently. func (w *Writer) Write(p []byte) (int, error) { w.Lock() defer w.Unlock() total := 0 for len(p) > 0 { emit := true i := bytes.IndexByte(p, '\n') if i < 0 { // No newline, we will buffer everything. i = len(p) emit = false } n, err := w.buf.Write(p[:i]) if err != nil { return total, err } total += n p = p[i:] if emit { // Skip the newline, but still count it. p = p[1:] total++ w.emit(w.buf.Bytes()) w.buf.Reset() } } return total, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/000077500000000000000000000000001465435605700207055ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/bound_socket_fd_refs.go000066400000000000000000000103071465435605700254040ustar00rootroot00000000000000package lisafs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const boundSocketFDenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var boundSocketFDobj *BoundSocketFD // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. 
If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type boundSocketFDRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *boundSocketFDRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *boundSocketFDRefs) RefType() string { return fmt.Sprintf("%T", boundSocketFDobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *boundSocketFDRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *boundSocketFDRefs) LogRefs() bool { return boundSocketFDenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *boundSocketFDRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *boundSocketFDRefs) IncRef() { v := r.refCount.Add(1) if boundSocketFDenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *boundSocketFDRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if boundSocketFDenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *boundSocketFDRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if boundSocketFDenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *boundSocketFDRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/channel.go000066400000000000000000000134551465435605700226540ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lisafs import ( "fmt" "math" "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/fdchannel" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" ) var ( chanHeaderLen = uint32((*channelHeader)(nil).SizeBytes()) ) // maxChannels returns the number of channels a client can create. // // The server will reject channel creation requests beyond this (per client). // Note that we don't want the number of channels to be too large, because each // accounts for a large region of shared memory. // TODO(gvisor.dev/issue/6313): Tune the number of channels. func maxChannels() int { maxChans := runtime.GOMAXPROCS(0) if maxChans < 2 { maxChans = 2 } if maxChans > 4 { maxChans = 4 } return maxChans } // channel implements Communicator and represents the communication endpoint // for the client and server and is used to perform fast IPC. Apart from // communicating data, a channel is also capable of donating file descriptors. type channel struct { fdTracker dead bool data flipcall.Endpoint fdChan fdchannel.Endpoint } var _ Communicator = (*channel)(nil) // PayloadBuf implements Communicator.PayloadBuf. func (ch *channel) PayloadBuf(size uint32) []byte { return ch.data.Data()[chanHeaderLen : chanHeaderLen+size] } // SndRcvMessage implements Communicator.SndRcvMessage. func (ch *channel) SndRcvMessage(m MID, payloadLen uint32, wantFDs uint8) (MID, uint32, error) { // Write header. Requests can not donate FDs. ch.marshalHdr(m, 0 /* numFDs */) // One-shot communication. RPCs are expected to be quick rather than block. rcvDataLen, err := ch.data.SendRecvFast(chanHeaderLen + payloadLen) if err != nil { // This channel is now unusable. ch.dead = true // Map the transport errors to EIO, but also log the real error. log.Warningf("channel.SndRcvMessage: flipcall.Endpoint.SendRecv failed: %v", err) return 0, 0, unix.EIO } return ch.rcvMsg(rcvDataLen) } // String implements fmt.Stringer.String. func (ch *channel) String() string { return fmt.Sprintf("channel %p", ch) } func (ch *channel) shutdown() { ch.data.Shutdown() } func (ch *channel) destroy() { ch.dead = true ch.fdChan.Destroy() ch.data.Destroy() } // createChannel creates a server side channel. It returns a packet window // descriptor (for the data channel) and an open socket for the FD channel. func (c *Connection) createChannel(maxMessageSize uint32) (*channel, flipcall.PacketWindowDescriptor, int, error) { c.channelsMu.Lock() defer c.channelsMu.Unlock() // If c.channels is nil, the connection has closed. if c.channels == nil { return nil, flipcall.PacketWindowDescriptor{}, -1, unix.ENOSYS } // Return ENOMEM to indicate that the server has hit its max channels limit. if len(c.channels) >= maxChannels() { return nil, flipcall.PacketWindowDescriptor{}, -1, unix.ENOMEM } ch := &channel{} // Set up data channel. desc, err := c.channelAlloc.Allocate(flipcall.PacketHeaderBytes + int(chanHeaderLen+maxMessageSize)) if err != nil { return nil, flipcall.PacketWindowDescriptor{}, -1, err } if err := ch.data.Init(flipcall.ServerSide, desc); err != nil { return nil, flipcall.PacketWindowDescriptor{}, -1, err } // Set up FD channel. 
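// Hedged aside: exampleWindowSize is an assumed helper (not part of the
// package) that restates the sizing used by the Allocate call above. Each
// channel's data window must hold the flipcall packet header, the lisafs
// channel header and the largest possible payload, which is why the number
// of channels per connection is kept small.
func exampleWindowSize(maxMessageSize uint32) int {
	return flipcall.PacketHeaderBytes + int(chanHeaderLen+maxMessageSize)
}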
fdSocks, err := fdchannel.NewConnectedSockets() if err != nil { ch.data.Destroy() return nil, flipcall.PacketWindowDescriptor{}, -1, err } ch.fdChan.Init(fdSocks[0]) clientFDSock := fdSocks[1] c.channels = append(c.channels, ch) return ch, desc, clientFDSock, nil } // sendFDs sends as many FDs as it can. The failure to send an FD does not // cause an error and fail the entire RPC. FDs are considered supplementary // responses that are not critical to the RPC response itself. The failure to // send the (i)th FD will cause all the following FDs to not be sent as well // because the order in which FDs are donated is important. func (ch *channel) sendFDs(fds []int) uint8 { numFDs := len(fds) if numFDs == 0 { return 0 } if numFDs > math.MaxUint8 { log.Warningf("dropping all FDs because too many FDs to donate: %v", numFDs) return 0 } for i, fd := range fds { if err := ch.fdChan.SendFD(fd); err != nil { log.Warningf("error occurred while sending (%d/%d)th FD on channel(%p): %v", i+1, numFDs, ch, err) return uint8(i) } } return uint8(numFDs) } // channelHeader is the header present in front of each message received on // flipcall endpoint when the protocol version being used is 1. // // +marshal type channelHeader struct { message MID numFDs uint8 _ uint8 // Need to make struct packed. } func (ch *channel) marshalHdr(m MID, numFDs uint8) { header := &channelHeader{ message: m, numFDs: numFDs, } header.MarshalUnsafe(ch.data.Data()) } func (ch *channel) rcvMsg(dataLen uint32) (MID, uint32, error) { if dataLen < chanHeaderLen { log.Warningf("received data has size smaller than header length: %d", dataLen) return 0, 0, unix.EIO } // Read header first. var header channelHeader header.UnmarshalUnsafe(ch.data.Data()) // Read any FDs. for i := 0; i < int(header.numFDs); i++ { fd, err := ch.fdChan.RecvFDNonblock() if err != nil { log.Warningf("expected %d FDs, received %d successfully, got err after that: %v", header.numFDs, i, err) break } ch.TrackFD(fd) } return header.message, dataLen - chanHeaderLen, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/client.go000066400000000000000000000334261465435605700225220ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lisafs import ( "fmt" "math" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) const ( // fdsToCloseBatchSize is the number of closed FDs batched before an Close // RPC is made to close them all. fdsToCloseBatchSize is immutable. fdsToCloseBatchSize = 100 ) // Client helps manage a connection to the lisafs server and pass messages // efficiently. There is a 1:1 mapping between a Connection and a Client. type Client struct { // sockComm is the main socket by which this connections is established. // Communication over the socket is synchronized by sockMu. sockMu sync.Mutex sockComm *sockCommunicator // channelsMu protects channels and availableChannels. 
channelsMu sync.Mutex // channels tracks all the channels. channels []*channel // availableChannels is a LIFO (stack) of channels available to be used. availableChannels []*channel // activeWg represents active channels. activeWg sync.WaitGroup // watchdogWg only holds the watchdog goroutine. watchdogWg sync.WaitGroup // supported caches information about which messages are supported. It is // indexed by MID. An MID is supported if supported[MID] is true. supported []bool // maxMessageSize is the maximum payload length (in bytes) that can be sent. // It is initialized on Mount and is immutable. maxMessageSize uint32 // fdsToClose tracks the FDs to close. It caches the FDs no longer being used // by the client and closes them in one shot. It is not preserved across // checkpoint/restore as FDIDs are not preserved. fdsMu sync.Mutex fdsToClose []FDID } // NewClient creates a new client for communication with the server. It mounts // the server and creates channels for fast IPC. NewClient takes ownership over // the passed socket. On success, it returns the initialized client along with // the root Inode. func NewClient(sock *unet.Socket) (*Client, Inode, int, error) { c := &Client{ sockComm: newSockComm(sock), maxMessageSize: 1 << 20, // 1 MB for now. fdsToClose: make([]FDID, 0, fdsToCloseBatchSize), } // Start a goroutine to check socket health. This goroutine is also // responsible for client cleanup. c.watchdogWg.Add(1) go c.watchdog() // Mount the server first. Assume Mount is supported so that we can make the // Mount RPC below. c.supported = make([]bool, Mount+1) c.supported[Mount] = true var ( mountReq MountReq mountResp MountResp mountHostFD = [1]int{-1} ) if err := c.SndRcvMessage(Mount, uint32(mountReq.SizeBytes()), mountReq.MarshalBytes, mountResp.CheckedUnmarshal, mountHostFD[:], mountReq.String, mountResp.String); err != nil { c.Close() return nil, Inode{}, -1, err } // Initialize client. c.maxMessageSize = uint32(mountResp.MaxMessageSize) var maxSuppMID MID for _, suppMID := range mountResp.SupportedMs { if suppMID > maxSuppMID { maxSuppMID = suppMID } } c.supported = make([]bool, maxSuppMID+1) for _, suppMID := range mountResp.SupportedMs { c.supported[suppMID] = true } return c, mountResp.Root, mountHostFD[0], nil } // StartChannels starts maxChannels() channel communicators. func (c *Client) StartChannels() error { maxChans := maxChannels() c.channelsMu.Lock() c.channels = make([]*channel, 0, maxChans) c.availableChannels = make([]*channel, 0, maxChans) c.channelsMu.Unlock() // Create channels parallelly so that channels can be used to create more // channels and costly initialization like flipcall.Endpoint.Connect can // proceed parallelly. var channelsWg sync.WaitGroup for i := 0; i < maxChans; i++ { channelsWg.Add(1) go func() { defer channelsWg.Done() ch, err := c.createChannel() if err != nil { if err == unix.ENOMEM { log.Debugf("channel creation failed because server hit max channels limit") } else { log.Warningf("channel creation failed: %v", err) } return } c.channelsMu.Lock() c.channels = append(c.channels, ch) c.availableChannels = append(c.availableChannels, ch) c.channelsMu.Unlock() }() } channelsWg.Wait() // Check that atleast 1 channel is created. This is not required by lisafs // protocol. It exists to flag server side issues in channel creation. 
c.channelsMu.Lock() numChannels := len(c.channels) c.channelsMu.Unlock() if maxChans > 0 && numChannels == 0 { log.Warningf("all channel RPCs failed") return unix.ENOMEM } return nil } func (c *Client) watchdog() { defer c.watchdogWg.Done() events := []unix.PollFd{ { Fd: int32(c.sockComm.FD()), Events: unix.POLLHUP | unix.POLLRDHUP, }, } // Wait for a shutdown event. for { n, err := unix.Ppoll(events, nil, nil) if err == unix.EINTR || err == unix.EAGAIN { continue } if err != nil { log.Warningf("lisafs.Client.watch(): %v", err) } else if n != 1 { log.Warningf("lisafs.Client.watch(): got %d events, wanted 1", n) } break } // Shutdown all active channels and wait for them to complete. c.shutdownActiveChans() c.activeWg.Wait() // Close all channels. c.channelsMu.Lock() for _, ch := range c.channels { ch.destroy() } c.channelsMu.Unlock() // Close main socket. c.sockComm.destroy() } func (c *Client) shutdownActiveChans() { c.channelsMu.Lock() defer c.channelsMu.Unlock() availableChans := make(map[*channel]bool) for _, ch := range c.availableChannels { availableChans[ch] = true } for _, ch := range c.channels { // A channel that is not available is active. if _, ok := availableChans[ch]; !ok { log.Debugf("shutting down active channel@%p...", ch) ch.shutdown() } } // Prevent channels from becoming available and serving new requests. c.availableChannels = nil } // Close shuts down the main socket and waits for the watchdog to clean up. func (c *Client) Close() { // This shutdown has no effect if the watchdog has already fired and closed // the main socket. c.sockComm.shutdown() c.watchdogWg.Wait() } func (c *Client) createChannel() (*channel, error) { var ( chanReq ChannelReq chanResp ChannelResp ) var fds [2]int if err := c.SndRcvMessage(Channel, uint32(chanReq.SizeBytes()), chanReq.MarshalBytes, chanResp.CheckedUnmarshal, fds[:], chanReq.String, chanResp.String); err != nil { return nil, err } if fds[0] < 0 || fds[1] < 0 { closeFDs(fds[:]) return nil, fmt.Errorf("insufficient FDs provided in Channel response: %v", fds) } // Lets create the channel. defer closeFDs(fds[:1]) // The data FD is not needed after this. desc := flipcall.PacketWindowDescriptor{ FD: fds[0], Offset: chanResp.dataOffset, Length: int(chanResp.dataLength), } ch := &channel{} if err := ch.data.Init(flipcall.ClientSide, desc); err != nil { closeFDs(fds[1:]) return nil, err } ch.fdChan.Init(fds[1]) // fdChan now owns this FD. // Only a connected channel is usable. if err := ch.data.Connect(); err != nil { ch.destroy() return nil, err } return ch, nil } // IsSupported returns true if this connection supports the passed message. func (c *Client) IsSupported(m MID) bool { return int(m) < len(c.supported) && c.supported[m] } // CloseFD either queues the passed FD to be closed or makes a batch // RPC to close all the accumulated FDs-to-close. If flush is true, the RPC // is made immediately. func (c *Client) CloseFD(ctx context.Context, fd FDID, flush bool) { c.fdsMu.Lock() c.fdsToClose = append(c.fdsToClose, fd) if !flush && len(c.fdsToClose) < fdsToCloseBatchSize { // We can continue batching. c.fdsMu.Unlock() return } // Flush the cache. We should not hold fdsMu while making an RPC, so be sure // to copy the fdsToClose to another buffer before unlocking fdsMu. var toCloseArr [fdsToCloseBatchSize]FDID toClose := toCloseArr[:len(c.fdsToClose)] copy(toClose, c.fdsToClose) // Clear fdsToClose so other FDIDs can be appended. 
c.fdsToClose = c.fdsToClose[:0] c.fdsMu.Unlock() req := CloseReq{FDs: toClose} var resp CloseResp ctx.UninterruptibleSleepStart(false) err := c.SndRcvMessage(Close, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) if err != nil { log.Warningf("lisafs: batch closing FDs returned error: %v", err) } } // SyncFDs makes a Fsync RPC to sync multiple FDs. func (c *Client) SyncFDs(ctx context.Context, fds []FDID) error { if len(fds) == 0 { return nil } req := FsyncReq{FDs: fds} var resp FsyncResp ctx.UninterruptibleSleepStart(false) err := c.SndRcvMessage(FSync, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return err } // SndRcvMessage invokes reqMarshal to marshal the request onto the payload // buffer, wakes up the server to process the request, waits for the response // and invokes respUnmarshal with the response payload. respFDs is populated // with the received FDs, extra fields are set to -1. // // See messages.go to understand why function arguments are used instead of // combining these functions into an interface type. // // Precondition: function arguments must be non-nil. func (c *Client) SndRcvMessage(m MID, payloadLen uint32, reqMarshal marshalFunc, respUnmarshal unmarshalFunc, respFDs []int, reqString debugStringer, respString debugStringer) error { if !c.IsSupported(m) { return unix.EOPNOTSUPP } if payloadLen > c.maxMessageSize { log.Warningf("message %d has payload which is too large: %d bytes", m, payloadLen) return unix.EIO } wantFDs := len(respFDs) if wantFDs > math.MaxUint8 { log.Warningf("want too many FDs: %d", wantFDs) return unix.EINVAL } // Acquire a communicator. comm := c.acquireCommunicator() defer c.releaseCommunicator(comm) debugf("send", comm, reqString) // Marshal the request into comm's payload buffer and make the RPC. reqMarshal(comm.PayloadBuf(payloadLen)) respM, respPayloadLen, err := comm.SndRcvMessage(m, payloadLen, uint8(wantFDs)) // Handle FD donation. rcvFDs := comm.ReleaseFDs() if numRcvFDs := len(rcvFDs); numRcvFDs+wantFDs > 0 { // releasedFDs is memory owned by comm which can not be returned to caller. // Copy it into the caller's buffer. numFDCopied := copy(respFDs, rcvFDs) if numFDCopied < numRcvFDs { log.Warningf("%d unexpected FDs were donated by the server, wanted", numRcvFDs-numFDCopied, wantFDs) closeFDs(rcvFDs[numFDCopied:]) } if numFDCopied < wantFDs { for i := numFDCopied; i < wantFDs; i++ { respFDs[i] = -1 } } } // Error cases. if err != nil { closeFDs(respFDs) return err } if respPayloadLen > c.maxMessageSize { log.Warningf("server response for message %d is too large: %d bytes", respM, respPayloadLen) closeFDs(respFDs) return unix.EIO } if respM == Error { closeFDs(respFDs) var resp ErrorResp resp.UnmarshalUnsafe(comm.PayloadBuf(respPayloadLen)) debugf("recv", comm, resp.String) return unix.Errno(resp.errno) } if respM != m { closeFDs(respFDs) log.Warningf("sent %d message but got %d in response", m, respM) return unix.EINVAL } // Success. The payload must be unmarshalled *before* comm is released. if _, ok := respUnmarshal(comm.PayloadBuf(respPayloadLen)); !ok { log.Warningf("server response unmarshalling for %d message failed", respM) return unix.EIO } debugf("recv", comm, respString) return nil } func debugf(action string, comm Communicator, debugMsg debugStringer) { // Replicate the log.IsLogging(log.Debug) check to avoid having to call // debugMsg() on the hot path. 
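// Hedged usage sketch: typical client bring-up with NewClient and
// StartChannels defined above. exampleMount is an assumed name for
// illustration only; sock is assumed to be the client end of the
// connection's socket pair.
func exampleMount(sock *unet.Socket) (*Client, Inode, error) {
	c, root, rootHostFD, err := NewClient(sock)
	if err != nil {
		return nil, Inode{}, err
	}
	if rootHostFD >= 0 {
		// A real caller would keep the donated host FD for the mount root;
		// it is closed here only to keep the sketch self-contained.
		unix.Close(rootHostFD)
	}
	// Channels are an optional fast path; RPCs fall back to the main socket
	// when no channel is available, so treating this error as fatal is a
	// choice, not a requirement.
	if err := c.StartChannels(); err != nil {
		c.Close()
		return nil, Inode{}, err
	}
	return c, root, nil
}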
if log.IsLogging(log.Debug) { log.Debugf("%s [%s] %s", action, comm, debugMsg()) } } // Postcondition: releaseCommunicator() must be called on the returned value. func (c *Client) acquireCommunicator() Communicator { // Prefer using channel over socket because: // - Channel uses a shared memory region for passing messages. IO from shared // memory is faster and does not involve making a syscall. // - No intermediate buffer allocation needed. With a channel, the message // can be directly pasted into the shared memory region. if ch := c.getChannel(); ch != nil { return ch } c.sockMu.Lock() return c.sockComm } // Precondition: comm must have been acquired via acquireCommunicator(). func (c *Client) releaseCommunicator(comm Communicator) { switch t := comm.(type) { case *sockCommunicator: c.sockMu.Unlock() // +checklocksforce: locked in acquireCommunicator(). case *channel: c.releaseChannel(t) default: panic(fmt.Sprintf("unknown communicator type %T", t)) } } // getChannel pops a channel from the available channels stack. The caller must // release the channel after use. func (c *Client) getChannel() *channel { c.channelsMu.Lock() defer c.channelsMu.Unlock() if len(c.availableChannels) == 0 { return nil } idx := len(c.availableChannels) - 1 ch := c.availableChannels[idx] c.availableChannels = c.availableChannels[:idx] c.activeWg.Add(1) return ch } // releaseChannel pushes the passed channel onto the available channel stack if // reinsert is true. func (c *Client) releaseChannel(ch *channel) { c.channelsMu.Lock() defer c.channelsMu.Unlock() // If availableChannels is nil, then watchdog has fired and the client is // shutting down. So don't make this channel available again. if !ch.dead && c.availableChannels != nil { c.availableChannels = append(c.availableChannels, ch) } c.activeWg.Done() } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/client_file.go000066400000000000000000000511571465435605700235220ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lisafs import ( "fmt" "io" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" ) // ClientFD is a wrapper around FDID that provides client-side utilities // so that RPC making is easier. type ClientFD struct { fd FDID client *Client } // ID returns the underlying FDID. func (f *ClientFD) ID() FDID { return f.fd } // Client returns the backing Client. func (f *ClientFD) Client() *Client { return f.client } // NewFD initializes a new ClientFD. func (c *Client) NewFD(fd FDID) ClientFD { return ClientFD{ client: c, fd: fd, } } // Ok returns true if the underlying FD is ok. func (f *ClientFD) Ok() bool { return f.fd.Ok() } // Close queues this FD to be closed on the server and resets f.fd. // This maybe invoke the Close RPC if the queue is full. If flush is true, then // the Close RPC is made immediately. 
Consider setting flush to false if // closing this FD on remote right away is not critical. func (f *ClientFD) Close(ctx context.Context, flush bool) { f.client.CloseFD(ctx, f.fd, flush) f.fd = InvalidFDID } // OpenAt makes the OpenAt RPC. func (f *ClientFD) OpenAt(ctx context.Context, flags uint32) (FDID, int, error) { req := OpenAtReq{ FD: f.fd, Flags: flags, } var respFD [1]int var resp OpenAtResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(OpenAt, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.CheckedUnmarshal, respFD[:], req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return resp.OpenFD, respFD[0], err } // OpenCreateAt makes the OpenCreateAt RPC. func (f *ClientFD) OpenCreateAt(ctx context.Context, name string, flags uint32, mode linux.FileMode, uid UID, gid GID) (Inode, FDID, int, error) { var req OpenCreateAtReq req.DirFD = f.fd req.Name = SizedString(name) req.Flags = primitive.Uint32(flags) req.Mode = mode req.UID = uid req.GID = gid var respFD [1]int var resp OpenCreateAtResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(OpenCreateAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, respFD[:], req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return resp.Child, resp.NewFD, respFD[0], err } // StatTo makes the Fstat RPC and populates stat with the result. func (f *ClientFD) StatTo(ctx context.Context, stat *linux.Statx) error { req := StatReq{FD: f.fd} ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(FStat, uint32(req.SizeBytes()), req.MarshalUnsafe, stat.CheckedUnmarshal, nil, req.String, stat.String) ctx.UninterruptibleSleepFinish(false) return err } // Sync makes the Fsync RPC. func (f *ClientFD) Sync(ctx context.Context) error { req := FsyncReq{FDs: []FDID{f.fd}} var resp FsyncResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(FSync, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return err } // chunkify applies fn to buf in chunks based on chunkSize. func chunkify(chunkSize uint64, buf []byte, fn func([]byte, uint64) (uint64, error)) (uint64, error) { toProcess := uint64(len(buf)) var ( totalProcessed uint64 curProcessed uint64 off uint64 err error ) for { if totalProcessed == toProcess { return totalProcessed, nil } if totalProcessed+chunkSize > toProcess { curProcessed, err = fn(buf[totalProcessed:], off) } else { curProcessed, err = fn(buf[totalProcessed:totalProcessed+chunkSize], off) } totalProcessed += curProcessed off += curProcessed if err != nil { return totalProcessed, err } // Return partial result immediately. if curProcessed < chunkSize { return totalProcessed, nil } // If we received more bytes than we ever requested, this is a problem. if totalProcessed > toProcess { panic(fmt.Sprintf("bytes completed (%d)) > requested (%d)", totalProcessed, toProcess)) } } } // Read makes the PRead RPC. func (f *ClientFD) Read(ctx context.Context, dst []byte, offset uint64) (uint64, error) { var resp PReadResp // maxDataReadSize represents the maximum amount of data we can read at once // (maximum message size - metadata size present in resp). Uninitialized // resp.SizeBytes() correctly returns the metadata size only (since the read // buffer is empty). 
maxDataReadSize := uint64(f.client.maxMessageSize) - uint64(resp.SizeBytes()) return chunkify(maxDataReadSize, dst, func(buf []byte, curOff uint64) (uint64, error) { req := PReadReq{ Offset: offset + curOff, FD: f.fd, Count: uint32(len(buf)), } // This will be unmarshalled into. Already set Buf so that we don't need to // allocate a temporary buffer during unmarshalling. // PReadResp.CheckedUnmarshal expects this to be set. resp.Buf = buf ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(PRead, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) if err != nil { return 0, err } // io.EOF is not an error that a lisafs server can return. Use POSIX // semantics to return io.EOF manually: zero bytes were returned and a // non-zero buffer was used. // NOTE(b/237442794): Some callers like splice really depend on a non-nil // error being returned in such a case. This is consistent with P9. if resp.NumBytes == 0 && len(buf) > 0 { return 0, io.EOF } return uint64(resp.NumBytes), nil }) } // Write makes the PWrite RPC. func (f *ClientFD) Write(ctx context.Context, src []byte, offset uint64) (uint64, error) { var req PWriteReq // maxDataWriteSize represents the maximum amount of data we can write at // once (maximum message size - metadata size present in req). Uninitialized // req.SizeBytes() correctly returns the metadata size only (since the write // buffer is empty). maxDataWriteSize := uint64(f.client.maxMessageSize) - uint64(req.SizeBytes()) return chunkify(maxDataWriteSize, src, func(buf []byte, curOff uint64) (uint64, error) { req = PWriteReq{ Offset: primitive.Uint64(offset + curOff), FD: f.fd, NumBytes: primitive.Uint32(len(buf)), Buf: buf, } var resp PWriteResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(PWrite, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return resp.Count, err }) } // MkdirAt makes the MkdirAt RPC. func (f *ClientFD) MkdirAt(ctx context.Context, name string, mode linux.FileMode, uid UID, gid GID) (Inode, error) { var req MkdirAtReq req.DirFD = f.fd req.Name = SizedString(name) req.Mode = mode req.UID = uid req.GID = gid var resp MkdirAtResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(MkdirAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return resp.ChildDir, err } // SymlinkAt makes the SymlinkAt RPC. func (f *ClientFD) SymlinkAt(ctx context.Context, name, target string, uid UID, gid GID) (Inode, error) { req := SymlinkAtReq{ DirFD: f.fd, Name: SizedString(name), Target: SizedString(target), UID: uid, GID: gid, } var resp SymlinkAtResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(SymlinkAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return resp.Symlink, err } // LinkAt makes the LinkAt RPC. func (f *ClientFD) LinkAt(ctx context.Context, targetFD FDID, name string) (Inode, error) { req := LinkAtReq{ DirFD: f.fd, Target: targetFD, Name: SizedString(name), } var resp LinkAtResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(LinkAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return resp.Link, err } // MknodAt makes the MknodAt RPC. 
func (f *ClientFD) MknodAt(ctx context.Context, name string, mode linux.FileMode, uid UID, gid GID, minor, major uint32) (Inode, error) { var req MknodAtReq req.DirFD = f.fd req.Name = SizedString(name) req.Mode = mode req.UID = uid req.GID = gid req.Minor = primitive.Uint32(minor) req.Major = primitive.Uint32(major) var resp MknodAtResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(MknodAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return resp.Child, err } // SetStat makes the SetStat RPC. func (f *ClientFD) SetStat(ctx context.Context, stat *linux.Statx) (uint32, error, error) { req := SetStatReq{ FD: f.fd, Mask: stat.Mask, Mode: uint32(stat.Mode), UID: UID(stat.UID), GID: GID(stat.GID), Size: stat.Size, Atime: linux.Timespec{ Sec: stat.Atime.Sec, Nsec: int64(stat.Atime.Nsec), }, Mtime: linux.Timespec{ Sec: stat.Mtime.Sec, Nsec: int64(stat.Mtime.Nsec), }, } var resp SetStatResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(SetStat, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) if err != nil { return 0, nil, err } if resp.FailureMask == 0 { return 0, nil, nil } return resp.FailureMask, unix.Errno(resp.FailureErrNo), nil } // WalkMultiple makes the Walk RPC with multiple path components. func (f *ClientFD) WalkMultiple(ctx context.Context, names []string) (WalkStatus, []Inode, error) { req := WalkReq{ DirFD: f.fd, Path: StringArray(names), } var resp WalkResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(Walk, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return resp.Status, resp.Inodes, err } // Walk makes the Walk RPC with just one path component to walk. func (f *ClientFD) Walk(ctx context.Context, name string) (Inode, error) { req := WalkReq{ DirFD: f.fd, Path: []string{name}, } var inode [1]Inode resp := WalkResp{Inodes: inode[:]} ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(Walk, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) if err != nil { return Inode{}, err } switch resp.Status { case WalkComponentDoesNotExist: return Inode{}, unix.ENOENT case WalkComponentSymlink: // f is not a directory which can be walked on. return Inode{}, unix.ENOTDIR } if n := len(resp.Inodes); n > 1 { for i := range resp.Inodes { f.client.CloseFD(ctx, resp.Inodes[i].ControlFD, false /* flush */) } log.Warningf("requested to walk one component, but got %d results", n) return Inode{}, unix.EIO } else if n == 0 { log.Warningf("walk has success status but no results returned") return Inode{}, unix.ENOENT } return inode[0], err } // WalkStat makes the WalkStat RPC with multiple path components to walk. func (f *ClientFD) WalkStat(ctx context.Context, names []string) ([]linux.Statx, error) { req := WalkReq{ DirFD: f.fd, Path: StringArray(names), } var resp WalkStatResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(WalkStat, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return resp.Stats, err } // StatFSTo makes the FStatFS RPC and populates statFS with the result. 
func (f *ClientFD) StatFSTo(ctx context.Context, statFS *StatFS) error { req := FStatFSReq{FD: f.fd} ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(FStatFS, uint32(req.SizeBytes()), req.MarshalUnsafe, statFS.CheckedUnmarshal, nil, req.String, statFS.String) ctx.UninterruptibleSleepFinish(false) return err } // Allocate makes the FAllocate RPC. func (f *ClientFD) Allocate(ctx context.Context, mode, offset, length uint64) error { req := FAllocateReq{ FD: f.fd, Mode: mode, Offset: offset, Length: length, } var resp FAllocateResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(FAllocate, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return err } // ReadLinkAt makes the ReadLinkAt RPC. func (f *ClientFD) ReadLinkAt(ctx context.Context) (string, error) { req := ReadLinkAtReq{FD: f.fd} var resp ReadLinkAtResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(ReadLinkAt, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return string(resp.Target), err } // Flush makes the Flush RPC. func (f *ClientFD) Flush(ctx context.Context) error { if !f.client.IsSupported(Flush) { // If Flush is not supported, it probably means that it would be a noop. return nil } req := FlushReq{FD: f.fd} var resp FlushResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(Flush, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return err } // BindAt makes the BindAt RPC. func (f *ClientFD) BindAt(ctx context.Context, sockType linux.SockType, name string, mode linux.FileMode, uid UID, gid GID) (Inode, *ClientBoundSocketFD, error) { var ( req BindAtReq resp BindAtResp hostSocketFD [1]int ) req.DirFD = f.fd req.SockType = primitive.Uint32(sockType) req.Name = SizedString(name) req.Mode = mode req.UID = uid req.GID = gid ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(BindAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, hostSocketFD[:], req.String, resp.String) ctx.UninterruptibleSleepFinish(false) if err == nil && hostSocketFD[0] < 0 { // No host socket fd? We can't proceed. // Clean up any resources the gofer sent to us. if resp.Child.ControlFD.Ok() { f.client.CloseFD(ctx, resp.Child.ControlFD, false /* flush */) } if resp.BoundSocketFD.Ok() { f.client.CloseFD(ctx, resp.BoundSocketFD, false /* flush */) } err = unix.EBADF } if err != nil { return Inode{}, nil, err } cbsFD := &ClientBoundSocketFD{ fd: resp.BoundSocketFD, notificationFD: int32(hostSocketFD[0]), client: f.client, } return resp.Child, cbsFD, err } // Connect makes the Connect RPC. func (f *ClientFD) Connect(ctx context.Context, sockType linux.SockType) (int, error) { req := ConnectReq{FD: f.fd, SockType: uint32(sockType)} var resp ConnectResp var sockFD [1]int ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(Connect, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.CheckedUnmarshal, sockFD[:], req.String, resp.String) ctx.UninterruptibleSleepFinish(false) if err == nil && sockFD[0] < 0 { err = unix.EBADF } return sockFD[0], err } // UnlinkAt makes the UnlinkAt RPC. 
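// Hedged usage sketch showing how the ClientFD wrappers above compose.
// exampleReadChild is an assumed name for illustration; a real caller would
// normally keep the host FD donated by OpenAt instead of closing it.
func exampleReadChild(ctx context.Context, c *Client, root Inode, name string) ([]byte, error) {
	rootFD := c.NewFD(root.ControlFD)

	// Walk a single path component to obtain the child's control FD.
	child, err := rootFD.Walk(ctx, name)
	if err != nil {
		return nil, err
	}
	childFD := c.NewFD(child.ControlFD)
	defer childFD.Close(ctx, false /* flush */)

	// Open the child read-only. The server may also donate a host FD.
	openFDID, hostFD, err := childFD.OpenAt(ctx, unix.O_RDONLY)
	if err != nil {
		return nil, err
	}
	if hostFD >= 0 {
		unix.Close(hostFD) // Sketch only; see the note above.
	}
	openFD := c.NewFD(openFDID)
	defer openFD.Close(ctx, false /* flush */)

	// Read is chunked internally, so dst may exceed the maximum message size.
	buf := make([]byte, 4096)
	n, err := openFD.Read(ctx, buf, 0)
	if err != nil && err != io.EOF {
		return nil, err
	}
	return buf[:n], nil
}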
func (f *ClientFD) UnlinkAt(ctx context.Context, name string, flags uint32) error { req := UnlinkAtReq{ DirFD: f.fd, Name: SizedString(name), Flags: primitive.Uint32(flags), } var resp UnlinkAtResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(UnlinkAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return err } // RenameAt makes the RenameAt RPC which renames oldName inside directory f to // newDirFD directory with name newName. func (f *ClientFD) RenameAt(ctx context.Context, oldName string, newDirFD FDID, newName string) error { req := RenameAtReq{ OldDir: f.fd, OldName: SizedString(oldName), NewDir: newDirFD, NewName: SizedString(newName), } var resp RenameAtResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(RenameAt, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return err } // Getdents64 makes the Getdents64 RPC. func (f *ClientFD) Getdents64(ctx context.Context, count int32) ([]Dirent64, error) { req := Getdents64Req{ DirFD: f.fd, Count: count, } var resp Getdents64Resp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(Getdents64, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return resp.Dirents, err } // ListXattr makes the FListXattr RPC. func (f *ClientFD) ListXattr(ctx context.Context, size uint64) ([]string, error) { req := FListXattrReq{ FD: f.fd, Size: size, } var resp FListXattrResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(FListXattr, uint32(req.SizeBytes()), req.MarshalUnsafe, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return resp.Xattrs, err } // GetXattr makes the FGetXattr RPC. func (f *ClientFD) GetXattr(ctx context.Context, name string, size uint64) (string, error) { req := FGetXattrReq{ FD: f.fd, Name: SizedString(name), BufSize: primitive.Uint32(size), } var resp FGetXattrResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(FGetXattr, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return string(resp.Value), err } // SetXattr makes the FSetXattr RPC. func (f *ClientFD) SetXattr(ctx context.Context, name string, value string, flags uint32) error { req := FSetXattrReq{ FD: f.fd, Name: SizedString(name), Value: SizedString(value), Flags: primitive.Uint32(flags), } var resp FSetXattrResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(FSetXattr, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return err } // RemoveXattr makes the FRemoveXattr RPC. func (f *ClientFD) RemoveXattr(ctx context.Context, name string) error { req := FRemoveXattrReq{ FD: f.fd, Name: SizedString(name), } var resp FRemoveXattrResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(FRemoveXattr, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return err } // ClientBoundSocketFD corresponds to a bound socket on the server. It // implements transport.BoundSocketFD. // // All fields are immutable. type ClientBoundSocketFD struct { // fd is the FDID of the bound socket on the server. 
fd FDID // notificationFD is the host FD that can be used to notify when new // clients connect to the socket. notificationFD int32 client *Client } // Close implements transport.BoundSocketFD.Close. func (f *ClientBoundSocketFD) Close(ctx context.Context) { _ = unix.Close(int(f.notificationFD)) // flush is true because the socket FD must be closed immediately on the // server. close(2) on socket FD impacts application behavior. f.client.CloseFD(ctx, f.fd, true /* flush */) } // NotificationFD implements transport.BoundSocketFD.NotificationFD. func (f *ClientBoundSocketFD) NotificationFD() int32 { return f.notificationFD } // Listen implements transport.BoundSocketFD.Listen. func (f *ClientBoundSocketFD) Listen(ctx context.Context, backlog int32) error { req := ListenReq{ FD: f.fd, Backlog: backlog, } var resp ListenResp ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(Listen, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, nil, req.String, resp.String) ctx.UninterruptibleSleepFinish(false) return err } // Accept implements transport.BoundSocketFD.Accept. func (f *ClientBoundSocketFD) Accept(ctx context.Context) (int, error) { req := AcceptReq{ FD: f.fd, } var resp AcceptResp var hostSocketFD [1]int ctx.UninterruptibleSleepStart(false) err := f.client.SndRcvMessage(Accept, uint32(req.SizeBytes()), req.MarshalBytes, resp.CheckedUnmarshal, hostSocketFD[:], req.String, resp.String) ctx.UninterruptibleSleepFinish(false) if err == nil && hostSocketFD[0] < 0 { err = unix.EBADF } return hostSocketFD[0], err } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/communicator.go000066400000000000000000000061231465435605700237360ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lisafs import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" ) // Communicator is a server side utility which represents exactly how the // server is communicating with the client. type Communicator interface { fmt.Stringer // PayloadBuf returns a slice to the payload section of its internal buffer // where the message can be marshalled. The handlers should use this to // populate the payload buffer with the message. // // The payload buffer contents *should* be preserved across calls with // different sizes. Note that this is not a guarantee, because a compromised // owner of a "shared" payload buffer can tamper with its contents anytime, // even when it's not its turn to do so. PayloadBuf(size uint32) []byte // SndRcvMessage sends message m. The caller must have populated PayloadBuf() // with payloadLen bytes. The caller expects to receive wantFDs FDs. // Any received FDs must be accessible via ReleaseFDs(). It returns the // response message along with the response payload length. SndRcvMessage(m MID, payloadLen uint32, wantFDs uint8) (MID, uint32, error) // DonateFD attempts to make fd non-blocking and starts tracking it. The next // call to ReleaseFDs will include fd in the order it was added. 
Communicator // takes ownership of fd. Server side should call this. DonateFD(fd int) // Track starts tracking fd. The next call to ReleaseFDs will include fd in // the order it was added. Communicator takes ownership of fd. Client side // should use this for accumulating received FDs. TrackFD(fd int) // ReleaseFDs returns the accumulated FDs and stops tracking them. The // ownership of the FDs is transferred to the caller. ReleaseFDs() []int } // fdTracker is a partial implementation of Communicator. It can be embedded in // Communicator implementations to keep track of FD donations. type fdTracker struct { fds []int } // DonateFD implements Communicator.DonateFD. func (d *fdTracker) DonateFD(fd int) { // Try to make the FD non-blocking. if err := unix.SetNonblock(fd, true); err != nil && err != unix.EBADF { // This may fail if fd was opened with O_PATH, because fcntl(F_SETFL) fails // with EBADF on O_PATH FDs. log.Warningf("DonateFD: unix.SetNonblock() failed on FD %d: %v", fd, err) } d.TrackFD(fd) } // TrackFD implements Communicator.TrackFD. func (d *fdTracker) TrackFD(fd int) { d.fds = append(d.fds, fd) } // ReleaseFDs implements Communicator.ReleaseFDs. func (d *fdTracker) ReleaseFDs() []int { ret := d.fds d.fds = d.fds[:0] return ret } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/connection.go000066400000000000000000000250571465435605700234040ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lisafs import ( "path" "path/filepath" "runtime/debug" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) // Connection represents a connection between a mount point in the client and a // mount point in the server. It is owned by the server on which it was started // and facilitates communication with the client mount. // // Each connection is set up using a unix domain socket. One end is owned by // the server and the other end is owned by the client. The connection may // spawn additional comunicational channels for the same mount for increased // RPC concurrency. // // Reference model: // - When any FD is created, the connection takes a ref on it which represents // the client's ref on the FD. // - The client can drop its ref via the Close RPC which will in turn make the // connection drop its ref. type Connection struct { // server is the server on which this connection was created. It is immutably // associated with it for its entire lifetime. server *Server // mountPath is the path to a file inside the server that is served to this // connection as its root FD. IOW, this connection is mounted at this path. // mountPath is trusted because it is configured by the server (trusted) as // per the user's sandbox configuration. mountPath is immutable. mountPath string // maxMessageSize is the cached value of server.impl.MaxMessageSize(). 
maxMessageSize uint32 // readonly indicates if this connection is readonly. All write operations // will fail with EROFS. readonly bool // sockComm is the main socket by which this connections is established. sockComm *sockCommunicator // channelsMu protects channels. channelsMu sync.Mutex // channels keeps track of all open channels. channels []*channel // activeWg represents active channels. activeWg sync.WaitGroup // reqGate counts requests that are still being handled. reqGate sync.Gate // channelAlloc is used to allocate memory for channels. channelAlloc *flipcall.PacketWindowAllocator fdsMu sync.RWMutex // fds keeps tracks of open FDs on this server. It is protected by fdsMu. fds map[FDID]genericFD // nextFDID is the next available FDID. It is protected by fdsMu. nextFDID FDID } // CreateConnection initializes a new connection which will be mounted at // mountPath. The connection must be started separately. func (s *Server) CreateConnection(sock *unet.Socket, mountPath string, readonly bool) (*Connection, error) { mountPath = path.Clean(mountPath) if !filepath.IsAbs(mountPath) { log.Warningf("mountPath %q is not absolute", mountPath) return nil, unix.EINVAL } c := &Connection{ sockComm: newSockComm(sock), server: s, maxMessageSize: s.impl.MaxMessageSize(), mountPath: mountPath, readonly: readonly, channels: make([]*channel, 0, maxChannels()), fds: make(map[FDID]genericFD), nextFDID: InvalidFDID + 1, } alloc, err := flipcall.NewPacketWindowAllocator() if err != nil { return nil, err } c.channelAlloc = alloc return c, nil } // ServerImpl returns the associated server implementation. func (c *Connection) ServerImpl() ServerImpl { return c.server.impl } // Run defines the lifecycle of a connection. func (c *Connection) Run() { defer c.close() // Start handling requests on this connection. for { m, payloadLen, err := c.sockComm.rcvMsg(0 /* wantFDs */) if err != nil { log.Debugf("sock read failed, closing connection: %v", err) return } respM, respPayloadLen, respFDs := c.handleMsg(c.sockComm, m, payloadLen) err = c.sockComm.sndPrepopulatedMsg(respM, respPayloadLen, respFDs) closeFDs(respFDs) if err != nil { log.Debugf("sock write failed, closing connection: %v", err) return } } } // service starts servicing the passed channel until the channel is shutdown. // This is a blocking method and hence must be called in a separate goroutine. func (c *Connection) service(ch *channel) error { rcvDataLen, err := ch.data.RecvFirst() if err != nil { return err } for rcvDataLen > 0 { m, payloadLen, err := ch.rcvMsg(rcvDataLen) if err != nil { return err } respM, respPayloadLen, respFDs := c.handleMsg(ch, m, payloadLen) numFDs := ch.sendFDs(respFDs) closeFDs(respFDs) ch.marshalHdr(respM, numFDs) rcvDataLen, err = ch.data.SendRecv(respPayloadLen + chanHeaderLen) if err != nil { return err } } return nil } func (c *Connection) respondError(comm Communicator, err unix.Errno) (MID, uint32, []int) { resp := &ErrorResp{errno: uint32(err)} respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return Error, respLen, nil } func (c *Connection) handleMsg(comm Communicator, m MID, payloadLen uint32) (retM MID, retPayloadLen uint32, retFDs []int) { if payloadLen > c.maxMessageSize { log.Warningf("received payload is too large: %d bytes", payloadLen) return c.respondError(comm, unix.EIO) } if !c.reqGate.Enter() { // c.close() has been called; the connection is shutting down. return c.respondError(comm, unix.ECONNRESET) } defer func() { c.reqGate.Leave() // Don't allow a panic to propagate. 
if err := recover(); err != nil { // Include a useful log message. log.Warningf("panic in handler: %v\n%s", err, debug.Stack()) // Wrap in an EREMOTEIO error; we don't really have a better way to // describe this kind of error. EREMOTEIO is appropriate for a generic // failed RPC message. retM, retPayloadLen, retFDs = c.respondError(comm, unix.EREMOTEIO) } }() // Check if the message is supported for forward compatibility. if int(m) >= len(c.server.handlers) || c.server.handlers[m] == nil { log.Warningf("received request which is not supported by the server, MID = %d", m) return c.respondError(comm, unix.EOPNOTSUPP) } // Try handling the request. respPayloadLen, err := c.server.handlers[m](c, comm, payloadLen) fds := comm.ReleaseFDs() if err != nil { closeFDs(fds) return c.respondError(comm, p9.ExtractErrno(err)) } if respPayloadLen > c.maxMessageSize { log.Warningf("handler for message %d responded with payload which is too large: %d bytes", m, respPayloadLen) closeFDs(fds) return c.respondError(comm, unix.EIO) } return m, respPayloadLen, fds } func (c *Connection) close() { // Wait for completion of all inflight requests. This is mostly so that if // a request is stuck, the sandbox supervisor has the opportunity to kill // us with SIGABRT to get a stack dump of the offending handler. c.reqGate.Close() // Shutdown and clean up channels. c.channelsMu.Lock() for _, ch := range c.channels { ch.shutdown() } c.activeWg.Wait() for _, ch := range c.channels { ch.destroy() } // This is to prevent additional channels from being created. c.channels = nil c.channelsMu.Unlock() // Free the channel memory. if c.channelAlloc != nil { c.channelAlloc.Destroy() } // Ensure the connection is closed. c.sockComm.destroy() // Cleanup all FDs. c.fdsMu.Lock() defer c.fdsMu.Unlock() for fdid := range c.fds { fd := c.stopTrackingFD(fdid) fd.DecRef(nil) // Drop the ref held by c. } } // Postcondition: The caller gains a ref on the FD on success. func (c *Connection) lookupFD(id FDID) (genericFD, error) { c.fdsMu.RLock() defer c.fdsMu.RUnlock() fd, ok := c.fds[id] if !ok { return nil, unix.EBADF } fd.IncRef() return fd, nil } // lookupControlFD retrieves the control FD identified by id on this // connection. On success, the caller gains a ref on the FD. func (c *Connection) lookupControlFD(id FDID) (*ControlFD, error) { fd, err := c.lookupFD(id) if err != nil { return nil, err } cfd, ok := fd.(*ControlFD) if !ok { fd.DecRef(nil) return nil, unix.EINVAL } return cfd, nil } // lookupOpenFD retrieves the open FD identified by id on this // connection. On success, the caller gains a ref on the FD. func (c *Connection) lookupOpenFD(id FDID) (*OpenFD, error) { fd, err := c.lookupFD(id) if err != nil { return nil, err } ofd, ok := fd.(*OpenFD) if !ok { fd.DecRef(nil) return nil, unix.EINVAL } return ofd, nil } // lookupBoundSocketFD retrieves the boundSockedFD identified by id on this // connection. On success, the caller gains a ref on the FD. func (c *Connection) lookupBoundSocketFD(id FDID) (*BoundSocketFD, error) { fd, err := c.lookupFD(id) if err != nil { return nil, err } bsfd, ok := fd.(*BoundSocketFD) if !ok { fd.DecRef(nil) return nil, unix.EINVAL } return bsfd, nil } // insertFD inserts the passed fd into the internal datastructure to track FDs. // The caller must hold a ref on fd which is transferred to the connection. 
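// Hedged sketch of how a request handler typically uses the lookup helpers
// above. handleExample is an assumed name; real handlers live in the
// server's handler table and receive the request payload as well.
func handleExample(c *Connection, id FDID) error {
	fd, err := c.lookupControlFD(id)
	if err != nil {
		return err
	}
	// lookupControlFD returned with a ref held on our behalf; drop it once
	// the FD is no longer needed so the connection can release it later.
	defer fd.DecRef(nil)

	// ... operate on fd while the ref pins it against concurrent removal ...
	return nil
}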
func (c *Connection) insertFD(fd genericFD) FDID { c.fdsMu.Lock() defer c.fdsMu.Unlock() res := c.nextFDID c.nextFDID++ if c.nextFDID < res { panic("ran out of FDIDs") } c.fds[res] = fd return res } // removeFD makes c stop tracking the passed FDID and drops its ref on it. func (c *Connection) removeFD(id FDID) { c.fdsMu.Lock() fd := c.stopTrackingFD(id) c.fdsMu.Unlock() if fd != nil { // Drop the ref held by c. This can take arbitrarily long. So do not hold // c.fdsMu while calling it. fd.DecRef(nil) } } // removeControlFDLocked is the same as removeFD with added preconditions. // // Preconditions: // - server's rename mutex must at least be read locked. // - id must be pointing to a control FD. func (c *Connection) removeControlFDLocked(id FDID) { c.fdsMu.Lock() fd := c.stopTrackingFD(id) c.fdsMu.Unlock() if fd != nil { // Drop the ref held by c. This can take arbitrarily long. So do not hold // c.fdsMu while calling it. fd.(*ControlFD).decRefLocked() } } // stopTrackingFD makes c stop tracking the passed FDID. Note that the caller // must drop ref on the returned fd (preferably without holding c.fdsMu). // // Precondition: c.fdsMu is locked. func (c *Connection) stopTrackingFD(id FDID) genericFD { fd := c.fds[id] if fd == nil { log.Warningf("removeFDLocked called on non-existent FDID %d", id) return nil } delete(c.fds, id) return fd } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/control_fd_list.go000066400000000000000000000122571465435605700244270ustar00rootroot00000000000000package lisafs // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type controlFDElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (controlFDElementMapper) linkerFor(elem *ControlFD) *ControlFD { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type controlFDList struct { head *ControlFD tail *ControlFD } // Reset resets list l to the empty state. func (l *controlFDList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *controlFDList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *controlFDList) Front() *ControlFD { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *controlFDList) Back() *ControlFD { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *controlFDList) Len() (count int) { for e := l.Front(); e != nil; e = (controlFDElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. 
// //go:nosplit func (l *controlFDList) PushFront(e *ControlFD) { linker := controlFDElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { controlFDElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *controlFDList) PushFrontList(m *controlFDList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { controlFDElementMapper{}.linkerFor(l.head).SetPrev(m.tail) controlFDElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *controlFDList) PushBack(e *ControlFD) { linker := controlFDElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { controlFDElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *controlFDList) PushBackList(m *controlFDList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { controlFDElementMapper{}.linkerFor(l.tail).SetNext(m.head) controlFDElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *controlFDList) InsertAfter(b, e *ControlFD) { bLinker := controlFDElementMapper{}.linkerFor(b) eLinker := controlFDElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { controlFDElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *controlFDList) InsertBefore(a, e *ControlFD) { aLinker := controlFDElementMapper{}.linkerFor(a) eLinker := controlFDElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { controlFDElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *controlFDList) Remove(e *ControlFD) { linker := controlFDElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { controlFDElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { controlFDElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type controlFDEntry struct { next *ControlFD prev *ControlFD } // Next returns the entry that follows e in the list. // //go:nosplit func (e *controlFDEntry) Next() *ControlFD { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *controlFDEntry) Prev() *ControlFD { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *controlFDEntry) SetNext(elem *ControlFD) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. 
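// Illustrative sketch, not the generated gVisor code: a hand-written
// equivalent of the intrusive list above. The element embeds its own link
// entry, so push and remove are O(1) and never allocate. All names here are
// hypothetical.
package sketch

type entry struct {
	next, prev *elem
}

type elem struct {
	entry // intrusive links live inside the element itself
	value int
}

type list struct {
	head, tail *elem
}

func (l *list) pushBack(e *elem) {
	e.prev = l.tail
	e.next = nil
	if l.tail != nil {
		l.tail.next = e
	} else {
		l.head = e
	}
	l.tail = e
}

func (l *list) remove(e *elem) {
	if e.prev != nil {
		e.prev.next = e.next
	} else if l.head == e {
		l.head = e.next
	}
	if e.next != nil {
		e.next.prev = e.prev
	} else if l.tail == e {
		l.tail = e.prev
	}
	e.next, e.prev = nil, nil
}

// Iteration follows the documented pattern:
//
//	for e := l.head; e != nil; e = e.next { ... }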
// //go:nosplit func (e *controlFDEntry) SetPrev(elem *ControlFD) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/control_fd_refs.go000066400000000000000000000101771465435605700244120ustar00rootroot00000000000000package lisafs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const controlFDenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var controlFDobj *ControlFD // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type controlFDRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *controlFDRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *controlFDRefs) RefType() string { return fmt.Sprintf("%T", controlFDobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *controlFDRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *controlFDRefs) LogRefs() bool { return controlFDenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *controlFDRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *controlFDRefs) IncRef() { v := r.refCount.Add(1) if controlFDenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *controlFDRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if controlFDenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. 
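// Illustrative sketch, standalone and separate from the gVisor refs template:
// the speculative-reference arithmetic used by TryIncRef above. One int64
// packs [32-bit speculative refs]:[32-bit real refs], so TryIncRef can avoid
// a compare-and-swap loop.
package sketch

import "sync/atomic"

const speculativeRef = 1 << 32

type count struct {
	v atomic.Int64
}

func (c *count) incRef() { c.v.Add(1) }

func (c *count) decRef() int64 { return c.v.Add(-1) }

// tryIncRef succeeds only if at least one real reference is still held.
func (c *count) tryIncRef() bool {
	// Take a speculative reference first. If the low 32 bits are zero, the
	// object already has no real references left.
	if v := c.v.Add(speculativeRef); int32(v) == 0 {
		c.v.Add(-speculativeRef) // Back out; the object is unreferenced.
		return false
	}
	// Convert the speculative reference into a real one.
	c.v.Add(-speculativeRef + 1)
	return true
}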
// // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *controlFDRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if controlFDenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *controlFDRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/fd.go000066400000000000000000000566011465435605700216350ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lisafs import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" ) // FDID (file descriptor identifier) is used to identify FDs on a connection. // Each connection has its own FDID namespace. // // +marshal boundCheck slice:FDIDSlice type FDID uint64 // InvalidFDID represents an invalid FDID. const InvalidFDID FDID = 0 // Ok returns true if f is a valid FDID. func (f FDID) Ok() bool { return f != InvalidFDID } // genericFD can represent any type of FD. type genericFD interface { refs.RefCounter } // A ControlFD is the gateway to the backing filesystem tree node. It is an // unusual concept. This exists to provide a safe way to do path-based // operations on the file. It performs operations that can modify the // filesystem tree and synchronizes these operations. See ControlFDImpl for // supported operations. // // It is not an inode, because multiple control FDs are allowed to exist on the // same file. It is not a file descriptor because it is not tied to any access // mode, i.e. a control FD can change its access mode based on the operation // being performed. // // Reference Model: // - Each control FD holds a ref on its Node for its entire lifetime. type ControlFD struct { controlFDRefs controlFDEntry // node is the filesystem node this FD is immutably associated with. node *Node // openFDs is a linked list of all FDs opened on this FD. As per reference // model, all open FDs hold a ref on this FD. openFDsMu sync.RWMutex openFDs openFDList // All the following fields are immutable. // id is the unique FD identifier which identifies this FD on its connection. id FDID // conn is the backing connection owning this FD. conn *Connection // ftype is the file type of the backing inode. ftype.FileType() == ftype. ftype linux.FileMode // impl is the control FD implementation which embeds this struct. It // contains all the implementation specific details. 
impl ControlFDImpl } var _ genericFD = (*ControlFD)(nil) // DecRef implements refs.RefCounter.DecRef. Note that the context // parameter should never be used. It exists solely to comply with the // refs.RefCounter interface. func (fd *ControlFD) DecRef(context.Context) { fd.controlFDRefs.DecRef(func() { fd.conn.server.renameMu.RLock() defer fd.conn.server.renameMu.RUnlock() fd.destroyLocked() }) } // decRefLocked is the same as DecRef except the added precondition. // // Precondition: server's rename mutex must be at least read locked. func (fd *ControlFD) decRefLocked() { fd.controlFDRefs.DecRef(func() { fd.destroyLocked() }) } // Precondition: server's rename mutex must be at least read locked. func (fd *ControlFD) destroyLocked() { // Update node's control FD list. fd.node.removeFD(fd) // Drop ref on node. fd.node.DecRef(nil) // Let the FD implementation clean up. fd.impl.Close() } // Init must be called before first use of fd. It inserts fd into the // filesystem tree. // // Preconditions: // - server's rename mutex must be at least read locked. // - The caller must take a ref on node which is transferred to fd. func (fd *ControlFD) Init(c *Connection, node *Node, mode linux.FileMode, impl ControlFDImpl) { fd.conn = c fd.node = node fd.impl = impl fd.ftype = mode.FileType() // Initialize fd with 1 ref which is transferred to c via c.insertFD(). fd.controlFDRefs.InitRefs() // Make fd reachable/discoverable. fd.id = c.insertFD(fd) node.insertFD(fd) } // Conn returns the fd's owning connection. func (fd *ControlFD) Conn() *Connection { return fd.conn } // FileType returns the file mode only containing the file type bits. func (fd *ControlFD) FileType() linux.FileMode { return fd.ftype } // IsDir indicates whether fd represents a directory. func (fd *ControlFD) IsDir() bool { return fd.ftype == unix.S_IFDIR } // IsRegular indicates whether fd represents a regular file. func (fd *ControlFD) IsRegular() bool { return fd.ftype == unix.S_IFREG } // IsSymlink indicates whether fd represents a symbolic link. func (fd *ControlFD) IsSymlink() bool { return fd.ftype == unix.S_IFLNK } // IsSocket indicates whether fd represents a socket. func (fd *ControlFD) IsSocket() bool { return fd.ftype == unix.S_IFSOCK } // Node returns the node this FD was opened on. func (fd *ControlFD) Node() *Node { return fd.node } // RemoveFromConn removes this control FD from its owning connection. // // Preconditions: // - fd should not have been returned to the client. Otherwise the client can // still refer to it. // - server's rename mutex must at least be read locked. func (fd *ControlFD) RemoveFromConn() { fd.conn.removeControlFDLocked(fd.id) } // safelyRead executes the given operation with the local path node locked. // This guarantees that fd's path will not change. fn may not any change paths. func (fd *ControlFD) safelyRead(fn func() error) error { fd.conn.server.renameMu.RLock() defer fd.conn.server.renameMu.RUnlock() fd.node.opMu.RLock() defer fd.node.opMu.RUnlock() return fn() } // safelyWrite executes the given operation with the local path node locked in // a writable fashion. This guarantees that no other operation is executing on // this path node. fn may change paths inside fd.node. func (fd *ControlFD) safelyWrite(fn func() error) error { fd.conn.server.renameMu.RLock() defer fd.conn.server.renameMu.RUnlock() fd.node.opMu.Lock() defer fd.node.opMu.Unlock() return fn() } // safelyGlobal executes the given operation with the global path lock held. 
// This guarantees that no other operations is executing concurrently on this // server. fn may change any path. func (fd *ControlFD) safelyGlobal(fn func() error) (err error) { fd.conn.server.renameMu.Lock() defer fd.conn.server.renameMu.Unlock() return fn() } // forEachOpenFD executes fn on each FD opened on fd. func (fd *ControlFD) forEachOpenFD(fn func(ofd *OpenFD)) { fd.openFDsMu.RLock() defer fd.openFDsMu.RUnlock() for ofd := fd.openFDs.Front(); ofd != nil; ofd = ofd.Next() { fn(ofd) } } // OpenFD represents an open file descriptor on the protocol. It resonates // closely with a Linux file descriptor. Its operations are limited to the // file. Its operations are not allowed to modify or traverse the filesystem // tree. See OpenFDImpl for the supported operations. // // Reference Model: // - An OpenFD takes a reference on the control FD it was opened on. type OpenFD struct { openFDRefs openFDEntry // All the following fields are immutable. // controlFD is the ControlFD on which this FD was opened. OpenFD holds a ref // on controlFD for its entire lifetime. controlFD *ControlFD // id is the unique FD identifier which identifies this FD on its connection. id FDID // Access mode for this FD. readable bool writable bool // impl is the open FD implementation which embeds this struct. It // contains all the implementation specific details. impl OpenFDImpl } var _ genericFD = (*OpenFD)(nil) // ControlFD returns the control FD on which this FD was opened. func (fd *OpenFD) ControlFD() ControlFDImpl { return fd.controlFD.impl } // DecRef implements refs.RefCounter.DecRef. Note that the context // parameter should never be used. It exists solely to comply with the // refs.RefCounter interface. func (fd *OpenFD) DecRef(context.Context) { fd.openFDRefs.DecRef(func() { fd.controlFD.openFDsMu.Lock() fd.controlFD.openFDs.Remove(fd) fd.controlFD.openFDsMu.Unlock() fd.controlFD.DecRef(nil) // Drop the ref on the control FD. fd.impl.Close() }) } // Init must be called before first use of fd. func (fd *OpenFD) Init(cfd *ControlFD, flags uint32, impl OpenFDImpl) { // Initialize fd with 1 ref which is transferred to c via c.insertFD(). fd.openFDRefs.InitRefs() fd.controlFD = cfd fd.id = cfd.conn.insertFD(fd) accessMode := flags & unix.O_ACCMODE fd.readable = accessMode == unix.O_RDONLY || accessMode == unix.O_RDWR fd.writable = accessMode == unix.O_WRONLY || accessMode == unix.O_RDWR fd.impl = impl cfd.IncRef() // Holds a ref on cfd for its lifetime. cfd.openFDsMu.Lock() cfd.openFDs.PushBack(fd) cfd.openFDsMu.Unlock() } // BoundSocketFD represents a bound socket on the server. // // Reference Model: // - A BoundSocketFD takes a reference on the control FD it is bound to. type BoundSocketFD struct { boundSocketFDRefs // All the following fields are immutable. // controlFD is the ControlFD on which this FD was bound. BoundSocketFD // holds a ref on controlFD for its entire lifetime. controlFD *ControlFD // id is the unique FD identifier which identifies this FD on its connection. id FDID // impl is the socket FD implementation which embeds this struct. It // contains all the implementation specific details. impl BoundSocketFDImpl } var _ genericFD = (*BoundSocketFD)(nil) // ControlFD returns the control FD on which this FD was bound. func (fd *BoundSocketFD) ControlFD() ControlFDImpl { return fd.controlFD.impl } // DecRef implements refs.RefCounter.DecRef. Note that the context // parameter should never be used. It exists solely to comply with the // refs.RefCounter interface. 
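// Illustrative sketch with hypothetical types: the two-level locking scheme
// behind safelyRead/safelyWrite/safelyGlobal above. A server-wide rename
// mutex serializes renames against everything else, and a per-node operation
// mutex orders operations on a single path node.
package sketch

import "sync"

type server struct {
	renameMu sync.RWMutex
}

type node struct {
	srv  *server
	opMu sync.RWMutex
}

// Read-style operation: the node's path cannot change, other readers may run.
func (n *node) safelyRead(fn func() error) error {
	n.srv.renameMu.RLock()
	defer n.srv.renameMu.RUnlock()
	n.opMu.RLock()
	defer n.opMu.RUnlock()
	return fn()
}

// Write-style operation: exclusive on this node; renames are still excluded.
func (n *node) safelyWrite(fn func() error) error {
	n.srv.renameMu.RLock()
	defer n.srv.renameMu.RUnlock()
	n.opMu.Lock()
	defer n.opMu.Unlock()
	return fn()
}

// Global operation (e.g. rename): exclusive on the entire tree, so no
// per-node lock is needed.
func (n *node) safelyGlobal(fn func() error) error {
	n.srv.renameMu.Lock()
	defer n.srv.renameMu.Unlock()
	return fn()
}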
func (fd *BoundSocketFD) DecRef(context.Context) { fd.boundSocketFDRefs.DecRef(func() { fd.controlFD.DecRef(nil) // Drop the ref on the control FD. fd.impl.Close() }) } // Init must be called before first use of fd. func (fd *BoundSocketFD) Init(cfd *ControlFD, impl BoundSocketFDImpl) { // Initialize fd with 1 ref which is transferred to c via c.insertFD(). fd.boundSocketFDRefs.InitRefs() fd.controlFD = cfd fd.id = cfd.conn.insertFD(fd) fd.impl = impl cfd.IncRef() // Holds a ref on cfd for its lifetime. } // There are four different types of guarantees provided: // // none: There is no concurrency guarantee. The method may be invoked // concurrently with any other method on any other FD. // // read: The method is guaranteed to be exclusive of any write or global // operation that is mutating the state of the directory tree starting at this // node. For example, this means creating new files, symlinks, directories or // renaming a directory entry (or renaming in to this target), but the method // may be called concurrently with other read methods. // // write: The method is guaranteed to be exclusive of any read, write or global // operation that is mutating the state of the directory tree starting at this // node, as described in read above. There may however, be other write // operations executing concurrently on other components in the directory tree. // // global: The method is guaranteed to be exclusive of any read, write or // global operation. // ControlFDImpl contains implementation details for a ControlFD. // Implementations of ControlFDImpl should contain their associated ControlFD // by value as their first field. // // The operations that perform path traversal or any modification to the // filesystem tree must synchronize those modifications with the server's // rename mutex. type ControlFDImpl interface { // FD returns a pointer to the embedded ControlFD. FD() *ControlFD // Close should clean up resources used by the control FD implementation. // Close is called after all references on the FD have been dropped and its // FDID has been released. // // On the server, Close has no concurrency guarantee. Close() // Stat returns the stat(2) results for this FD. // // On the server, Stat has a read concurrency guarantee. Stat() (linux.Statx, error) // SetStat sets file attributes on the backing file. This does not correspond // to any one Linux syscall. On Linux, this operation is performed using // multiple syscalls like fchmod(2), fchown(2), ftruncate(2), futimesat(2) // and so on. The implementation must only set attributes for fields // indicated by stat.Mask. Failure to set an attribute may or may not // terminate the entire operation. SetStat must return a uint32 which is // interpreted as a stat mask to indicate which attribute setting attempts // failed. If multiple attribute setting attempts failed, the returned error // may be from any one of them. // // On the server, SetStat has a write concurrency guarantee. SetStat(stat SetStatReq) (uint32, error) // Walk walks one path component from the directory represented by this FD. // Walk must open a ControlFD on the walked file. // // On the server, Walk has a read concurrency guarantee. Walk(name string) (*ControlFD, linux.Statx, error) // WalkStat is capable of walking multiple path components and returning the // stat results for each path component walked via recordStat. Stat results // must be returned in the order of walk. 
// // In case a symlink is encountered, the walk must terminate successfully on // the symlink including its stat result. // // The first path component of path may be "" which indicates that the first // stat result returned must be of this starting directory. // // On the server, WalkStat has a read concurrency guarantee. WalkStat(path StringArray, recordStat func(linux.Statx)) error // Open opens the control FD with the flags passed. The flags should be // interpreted as open(2) flags. // // Open may also optionally return a host FD for the opened file whose // lifecycle is independent of the OpenFD. Returns -1 if not available. // // N.B. The server must resolve any lazy paths when open is called. // After this point, read and write may be called on files with no // deletion check, so resolving in the data path is not viable. // // On the server, Open has a read concurrency guarantee. Open(flags uint32) (*OpenFD, int, error) // OpenCreate creates a regular file inside the directory represented by this // FD and then also opens the file. The created file has perms as specified // by mode and owners as specified by uid and gid. The file is opened with // the specified flags. // // OpenCreate may also optionally return a host FD for the opened file whose // lifecycle is independent of the OpenFD. Returns -1 if not available. // // N.B. The server must resolve any lazy paths when open is called. // After this point, read and write may be called on files with no // deletion check, so resolving in the data path is not viable. // // On the server, OpenCreate has a write concurrency guarantee. OpenCreate(mode linux.FileMode, uid UID, gid GID, name string, flags uint32) (*ControlFD, linux.Statx, *OpenFD, int, error) // Mkdir creates a directory inside the directory represented by this FD. The // created directory has perms as specified by mode and owners as specified // by uid and gid. // // On the server, Mkdir has a write concurrency guarantee. Mkdir(mode linux.FileMode, uid UID, gid GID, name string) (*ControlFD, linux.Statx, error) // Mknod creates a file inside the directory represented by this FD. The file // type and perms are specified by mode and owners are specified by uid and // gid. If the newly created file is a character or block device, minor and // major specify its device number. // // On the server, Mkdir has a write concurrency guarantee. Mknod(mode linux.FileMode, uid UID, gid GID, name string, minor uint32, major uint32) (*ControlFD, linux.Statx, error) // Symlink creates a symlink inside the directory represented by this FD. The // symlink has owners as specified by uid and gid and points to target. // // On the server, Symlink has a write concurrency guarantee. Symlink(name string, target string, uid UID, gid GID) (*ControlFD, linux.Statx, error) // Link creates a hard link to the file represented by this FD. The hard link // is created inside dir with the specified name. // // On the server, Link has a write concurrency guarantee for dir and read // concurrency guarantee for this file. Link(dir ControlFDImpl, name string) (*ControlFD, linux.Statx, error) // StatFS returns information about the file system associated with // this file. // // On the server, StatFS has read concurrency guarantee. StatFS() (StatFS, error) // Readlink reads the symlink's target and writes the string into the buffer // returned by getLinkBuf which can be used to request buffer for some size. // It returns the number of bytes written into the buffer. 
// // On the server, Readlink has a read concurrency guarantee. Readlink(getLinkBuf func(uint32) []byte) (uint16, error) // Connect establishes a new host-socket backed connection with a unix domain // socket. On success it returns a non-blocking host socket FD whose // lifecycle is independent of this ControlFD. // // sockType indicates the requested type of socket and can be passed as type // argument to socket(2). // // On the server, Connect has a read concurrency guarantee. Connect(sockType uint32) (int, error) // BindAt creates a host unix domain socket of type sockType, bound to // the given namt of type sockType, bound to the given name. It returns // a ControlFD that can be used for path operations on the socket, a // BoundSocketFD that can be used to Accept/Listen on the socket, and a // host FD that can be used for event notifications (like new // connections). // // On the server, BindAt has a write concurrency guarantee. BindAt(name string, sockType uint32, mode linux.FileMode, uid UID, gid GID) (*ControlFD, linux.Statx, *BoundSocketFD, int, error) // UnlinkAt the file identified by name in this directory. // // Flags are Linux unlinkat(2) flags. // // On the server, UnlinkAt has a write concurrency guarantee. Unlink(name string, flags uint32) error // RenameAt renames a given file to a new name in a potentially new directory. // // oldName must be a name relative to this file, which must be a directory. // newName is a name relative to newDir. // // On the server, RenameAt has a global concurrency guarantee. RenameAt(oldName string, newDir ControlFDImpl, newName string) error // Renamed is called to notify the FD implementation that the file has been // renamed. FD implementation may update its state accordingly. // // On the server, Renamed has a global concurrency guarantee. Renamed() // GetXattr returns extended attributes of this file. It returns the number // of bytes written into the buffer returned by getValueBuf which can be used // to request buffer for some size. // // If the value is larger than size, implementations may return ERANGE to // indicate that the buffer is too small. // // N.B. size may be 0, in which can the implementation must first find out // the attribute value size using getxattr(2) by passing size=0. Then request // a buffer large enough using getValueBuf and write the value there. // // On the server, GetXattr has a read concurrency guarantee. GetXattr(name string, size uint32, getValueBuf func(uint32) []byte) (uint16, error) // SetXattr sets extended attributes on this file. // // On the server, SetXattr has a write concurrency guarantee. SetXattr(name string, value string, flags uint32) error // ListXattr lists the names of the extended attributes on this file. // // Size indicates the size of the buffer that has been allocated to hold the // attribute list. If the list would be larger than size, implementations may // return ERANGE to indicate that the buffer is too small, but they are also // free to ignore the hint entirely (i.e. the value returned may be larger // than size). All size checking is done independently at the syscall layer. // // On the server, ListXattr has a read concurrency guarantee. ListXattr(size uint64) (StringArray, error) // RemoveXattr removes extended attributes on this file. // // On the server, RemoveXattr has a write concurrency guarantee. RemoveXattr(name string) error } // OpenFDImpl contains implementation details for a OpenFD. 
Implementations of // OpenFDImpl should contain their associated OpenFD by value as their first // field. // // Since these operations do not perform any path traversal or any modification // to the filesystem tree, there is no need to synchronize with rename // operations. type OpenFDImpl interface { // FD returns a pointer to the embedded OpenFD. FD() *OpenFD // Close should clean up resources used by the open FD implementation. // Close is called after all references on the FD have been dropped and its // FDID has been released. // // On the server, Close has no concurrency guarantee. Close() // Stat returns the stat(2) results for this FD. // // On the server, Stat has a read concurrency guarantee. Stat() (linux.Statx, error) // Sync is similar to fsync(2). // // On the server, Sync has a read concurrency guarantee. Sync() error // Write writes buf at offset off to the backing file via this open FD. Write // attempts to write len(buf) bytes and returns the number of bytes written. // // On the server, Write has a write concurrency guarantee. See Open for // additional requirements regarding lazy path resolution. Write(buf []byte, off uint64) (uint64, error) // Read reads at offset off into buf from the backing file via this open FD. // Read attempts to read len(buf) bytes and returns the number of bytes read. // // On the server, Read has a read concurrency guarantee. See Open for // additional requirements regarding lazy path resolution. Read(buf []byte, off uint64) (uint64, error) // Allocate allows the caller to directly manipulate the allocated disk space // for the file. See fallocate(2) for more details. // // On the server, Allocate has a write concurrency guarantee. Allocate(mode, off, length uint64) error // Flush can be used to clean up the file state. Behavior is // implementation-specific. // // On the server, Flush has a read concurrency guarantee. Flush() error // Getdent64 fetches directory entries for this directory and calls // recordDirent for each dirent read. If seek0 is true, then the directory FD // is seeked to 0 and iteration starts from the beginning. // // On the server, Getdent64 has a read concurrency guarantee. Getdent64(count uint32, seek0 bool, recordDirent func(Dirent64)) error // Renamed is called to notify the FD implementation that the file has been // renamed. FD implementation may update its state accordingly. // // On the server, Renamed has a global concurrency guarantee. Renamed() } // BoundSocketFDImpl represents a socket on the host filesystem that has been // created by the sandboxed application via Bind. type BoundSocketFDImpl interface { // FD returns a pointer to the embedded BoundSocketFD. FD() *BoundSocketFD // Listen marks the socket as accepting incoming connections. // // On the server, Listen has a read concurrency guarantee. Listen(backlog int32) error // Accept takes the first pending connection and creates a new socket // for it. The new socket FD is returned along with the peer address of // the connecting socket (which may be empty string). // // On the server, Accept has a read concurrency guarantee. Accept() (int, string, error) // Close should clean up resources used by the bound socket FD // implementation. // // Close is called after all references on the FD have been dropped and its // FDID has been released. // // On the server, Close has no concurrency guarantee. 
Close() } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/handlers.go000066400000000000000000001147371465435605700230510ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lisafs import ( "fmt" "math" "strings" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/p9" ) const ( allowedOpenFlags = unix.O_ACCMODE | unix.O_TRUNC setStatSupportedMask = unix.STATX_MODE | unix.STATX_UID | unix.STATX_GID | unix.STATX_SIZE | unix.STATX_ATIME | unix.STATX_MTIME // unixDirentMaxSize is the maximum size of unix.Dirent for amd64. unixDirentMaxSize = 280 ) // RPCHandler defines a handler that is invoked when the associated message is // received. The handler is responsible for: // // - Unmarshalling the request from the passed payload and interpreting it. // - Marshalling the response into the communicator's payload buffer. // - Return the number of payload bytes written. // - Donate any FDs (if needed) to comm which will in turn donate it to client. type RPCHandler func(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) var handlers = [...]RPCHandler{ Error: ErrorHandler, Mount: MountHandler, Channel: ChannelHandler, FStat: FStatHandler, SetStat: SetStatHandler, Walk: WalkHandler, WalkStat: WalkStatHandler, OpenAt: OpenAtHandler, OpenCreateAt: OpenCreateAtHandler, Close: CloseHandler, FSync: FSyncHandler, PWrite: PWriteHandler, PRead: PReadHandler, MkdirAt: MkdirAtHandler, MknodAt: MknodAtHandler, SymlinkAt: SymlinkAtHandler, LinkAt: LinkAtHandler, FStatFS: FStatFSHandler, FAllocate: FAllocateHandler, ReadLinkAt: ReadLinkAtHandler, Flush: FlushHandler, UnlinkAt: UnlinkAtHandler, RenameAt: RenameAtHandler, Getdents64: Getdents64Handler, FGetXattr: FGetXattrHandler, FSetXattr: FSetXattrHandler, FListXattr: FListXattrHandler, FRemoveXattr: FRemoveXattrHandler, Connect: ConnectHandler, BindAt: BindAtHandler, Listen: ListenHandler, Accept: AcceptHandler, } // ErrorHandler handles Error message. func ErrorHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { // Client should never send Error. return 0, unix.EINVAL } // MountHandler handles the Mount RPC. Note that there can not be concurrent // executions of MountHandler on a connection because the connection enforces // that Mount is the first message on the connection. Only after the connection // has been successfully mounted can other channels be created. func MountHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var ( mountPointFD *ControlFD mountPointHostFD = -1 mountPointStat linux.Statx mountNode = c.server.root ) if err := c.server.withRenameReadLock(func() (err error) { // Maintain extra ref on mountNode to ensure existence during walk. mountNode.IncRef() defer func() { // Drop extra ref on mountNode. 
Wrap the defer call with a func so that // mountNode is evaluated on execution, not on defer itself. mountNode.DecRef(nil) }() // Walk to the mountpoint. pit := fspath.Parse(c.mountPath).Begin for pit.Ok() { curName := pit.String() if err := checkSafeName(curName); err != nil { return err } mountNode.opMu.RLock() if mountNode.isDeleted() { mountNode.opMu.RUnlock() return unix.ENOENT } mountNode.childrenMu.Lock() next := mountNode.LookupChildLocked(curName) if next == nil { next = &Node{} next.InitLocked(curName, mountNode) } else { next.IncRef() } mountNode.childrenMu.Unlock() mountNode.opMu.RUnlock() // next has an extra ref as needed. Drop extra ref on mountNode. mountNode.DecRef(nil) pit = pit.Next() mountNode = next } // Provide Mount with read concurrency guarantee. mountNode.opMu.RLock() defer mountNode.opMu.RUnlock() if mountNode.isDeleted() { return unix.ENOENT } mountPointFD, mountPointStat, mountPointHostFD, err = c.ServerImpl().Mount(c, mountNode) return err }); err != nil { return 0, err } if mountPointHostFD >= 0 { comm.DonateFD(mountPointHostFD) } resp := MountResp{ Root: Inode{ ControlFD: mountPointFD.id, Stat: mountPointStat, }, SupportedMs: c.ServerImpl().SupportedMessages(), MaxMessageSize: primitive.Uint32(c.ServerImpl().MaxMessageSize()), } respPayloadLen := uint32(resp.SizeBytes()) resp.MarshalBytes(comm.PayloadBuf(respPayloadLen)) return respPayloadLen, nil } // ChannelHandler handles the Channel RPC. func ChannelHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { ch, desc, fdSock, err := c.createChannel(c.ServerImpl().MaxMessageSize()) if err != nil { return 0, err } // Start servicing the channel in a separate goroutine. c.activeWg.Add(1) go func() { if err := c.service(ch); err != nil { // Don't log shutdown error which is expected during server shutdown. if _, ok := err.(flipcall.ShutdownError); !ok { log.Warningf("lisafs.Connection.service(channel = @%p): %v", ch, err) } } c.activeWg.Done() }() clientDataFD, err := unix.Dup(desc.FD) if err != nil { unix.Close(fdSock) ch.shutdown() return 0, err } // Respond to client with successful channel creation message. comm.DonateFD(clientDataFD) comm.DonateFD(fdSock) resp := ChannelResp{ dataOffset: desc.Offset, dataLength: uint64(desc.Length), } respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } // FStatHandler handles the FStat RPC. func FStatHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req StatReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } fd, err := c.lookupFD(req.FD) if err != nil { return 0, err } defer fd.DecRef(nil) var resp linux.Statx switch t := fd.(type) { case *ControlFD: t.safelyRead(func() error { resp, err = t.impl.Stat() return err }) case *OpenFD: t.controlFD.safelyRead(func() error { resp, err = t.impl.Stat() return err }) default: panic(fmt.Sprintf("unknown fd type %T", t)) } if err != nil { return 0, err } respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } // SetStatHandler handles the SetStat RPC. 
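// Illustrative sketch with hypothetical request/response types, simplified
// from the handlers above: the common shape of a lisafs-style RPC handler.
// Decode the request from the communicator's payload buffer, do the work,
// encode the reply back into the payload buffer, and return the reply's
// payload length.
package sketch

import (
	"encoding/binary"
	"errors"
)

var errEIO = errors.New("EIO") // stands in for unix.EIO

// communicator mirrors only the PayloadBuf usage seen above.
type communicator interface {
	PayloadBuf(size uint32) []byte
}

type statReq struct{ fd uint64 }

func (r *statReq) unmarshal(b []byte) bool {
	if len(b) < 8 {
		return false
	}
	r.fd = binary.LittleEndian.Uint64(b)
	return true
}

type statResp struct{ size uint64 }

func (r *statResp) marshal(b []byte) uint32 {
	binary.LittleEndian.PutUint64(b, r.size)
	return 8
}

// statHandler is the skeleton: the stat callback stands in for the FD lookup
// and locking that a real handler performs.
func statHandler(comm communicator, payloadLen uint32, stat func(fd uint64) (uint64, error)) (uint32, error) {
	var req statReq
	if !req.unmarshal(comm.PayloadBuf(payloadLen)) {
		return 0, errEIO
	}
	size, err := stat(req.fd)
	if err != nil {
		return 0, err
	}
	resp := statResp{size: size}
	return resp.marshal(comm.PayloadBuf(8)), nil
}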
func SetStatHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { if c.readonly { return 0, unix.EROFS } var req SetStatReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } fd, err := c.lookupControlFD(req.FD) if err != nil { return 0, err } defer fd.DecRef(nil) if req.Mask&^setStatSupportedMask != 0 { return 0, unix.EPERM } var resp SetStatResp if err := fd.safelyWrite(func() error { if fd.node.isDeleted() && !c.server.opts.SetAttrOnDeleted { return unix.EINVAL } failureMask, failureErr := fd.impl.SetStat(req) resp.FailureMask = failureMask if failureErr != nil { resp.FailureErrNo = uint32(p9.ExtractErrno(failureErr)) } return nil }); err != nil { return 0, err } respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } // WalkHandler handles the Walk RPC. func WalkHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req WalkReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } startDir, err := c.lookupControlFD(req.DirFD) if err != nil { return 0, err } defer startDir.DecRef(nil) if !startDir.IsDir() { return 0, unix.ENOTDIR } // Manually marshal the inodes into the payload buffer during walk to avoid // the slice allocation. The memory format should be WalkResp's. var ( numInodes primitive.Uint16 status = WalkSuccess ) respMetaSize := status.SizeBytes() + numInodes.SizeBytes() maxPayloadSize := respMetaSize + (len(req.Path) * (*Inode)(nil).SizeBytes()) if maxPayloadSize > math.MaxUint32 { // Too much to walk, can't do. return 0, unix.EIO } payloadBuf := comm.PayloadBuf(uint32(maxPayloadSize)) payloadPos := respMetaSize if err := c.server.withRenameReadLock(func() error { curDir := startDir cu := cleanup.Make(func() { // Destroy all newly created FDs until now. Read the new FDIDs from the // payload buffer. buf := comm.PayloadBuf(uint32(maxPayloadSize))[respMetaSize:] var curIno Inode for i := 0; i < int(numInodes); i++ { buf = curIno.UnmarshalBytes(buf) c.removeControlFDLocked(curIno.ControlFD) } }) defer cu.Clean() for _, name := range req.Path { if err := checkSafeName(name); err != nil { return err } // Symlinks terminate walk. This client gets the symlink inode, but will // have to invoke Walk again with the resolved path. if curDir.IsSymlink() { status = WalkComponentSymlink break } curDir.node.opMu.RLock() if curDir.node.isDeleted() { // It is not safe to walk on a deleted directory. It could have been // replaced with a malicious symlink. curDir.node.opMu.RUnlock() status = WalkComponentDoesNotExist break } child, childStat, err := curDir.impl.Walk(name) curDir.node.opMu.RUnlock() if err == unix.ENOENT { status = WalkComponentDoesNotExist break } if err != nil { return err } // Write inode into payload buffer. i := Inode{ControlFD: child.id, Stat: childStat} i.MarshalUnsafe(payloadBuf[payloadPos:]) payloadPos += i.SizeBytes() numInodes++ curDir = child } cu.Release() return nil }); err != nil { return 0, err } // WalkResp writes the walk status followed by the number of inodes in the // beginning. payloadBuf = status.MarshalUnsafe(payloadBuf) numInodes.MarshalUnsafe(payloadBuf) return uint32(payloadPos), nil } // WalkStatHandler handles the WalkStat RPC. 
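// Illustrative sketch with a hypothetical record layout: the in-place
// marshalling pattern WalkHandler uses above. Space for the count is reserved
// at the front of the payload buffer, fixed-size records are appended as they
// are produced, and the count is written last, avoiding an intermediate
// slice allocation.
package sketch

import "encoding/binary"

const recordSize = 16 // hypothetical fixed-size record: two uint64 fields

// marshalRecords writes records produced by next into buf until next is
// exhausted or buf is full, and returns the total payload length. Layout:
// uint16 count followed by the records back to back.
func marshalRecords(buf []byte, next func() (a, b uint64, ok bool)) uint32 {
	const metaSize = 2 // room reserved for the uint16 count
	if len(buf) < metaSize {
		return 0
	}
	pos := metaSize
	var count uint16
	for {
		a, b, ok := next()
		if !ok || pos+recordSize > len(buf) {
			break
		}
		binary.LittleEndian.PutUint64(buf[pos:], a)
		binary.LittleEndian.PutUint64(buf[pos+8:], b)
		pos += recordSize
		count++
	}
	binary.LittleEndian.PutUint16(buf, count)
	return uint32(pos)
}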
func WalkStatHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req WalkReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } startDir, err := c.lookupControlFD(req.DirFD) if err != nil { return 0, err } defer startDir.DecRef(nil) // Note that this fd is allowed to not actually be a directory when the // only path component to walk is "" (self). if !startDir.IsDir() { if len(req.Path) > 1 || (len(req.Path) == 1 && len(req.Path[0]) > 0) { return 0, unix.ENOTDIR } } for i, name := range req.Path { // First component is allowed to be "". if i == 0 && len(name) == 0 { continue } if err := checkSafeName(name); err != nil { return 0, err } } // We will manually marshal the statx results into the payload buffer as they // are generated to avoid the slice allocation. The memory format should be // the same as WalkStatResp's. var numStats primitive.Uint16 maxPayloadSize := numStats.SizeBytes() + (len(req.Path) * linux.SizeOfStatx) if maxPayloadSize > math.MaxUint32 { // Too much to walk, can't do. return 0, unix.EIO } payloadBuf := comm.PayloadBuf(uint32(maxPayloadSize)) payloadPos := numStats.SizeBytes() if c.server.opts.WalkStatSupported { if err = startDir.safelyRead(func() error { return startDir.impl.WalkStat(req.Path, func(s linux.Statx) { s.MarshalUnsafe(payloadBuf[payloadPos:]) payloadPos += s.SizeBytes() numStats++ }) }); err != nil { return 0, err } // WalkStatResp writes the number of stats in the beginning. numStats.MarshalUnsafe(payloadBuf) return uint32(payloadPos), nil } if err = c.server.withRenameReadLock(func() error { if len(req.Path) > 0 && len(req.Path[0]) == 0 { startDir.node.opMu.RLock() stat, err := startDir.impl.Stat() startDir.node.opMu.RUnlock() if err != nil { return err } stat.MarshalUnsafe(payloadBuf[payloadPos:]) payloadPos += stat.SizeBytes() numStats++ req.Path = req.Path[1:] } parent := startDir closeParent := func() { if parent != startDir { c.removeControlFDLocked(parent.id) } } defer closeParent() for _, name := range req.Path { parent.node.opMu.RLock() if parent.node.isDeleted() { // It is not safe to walk on a deleted directory. It could have been // replaced with a malicious symlink. parent.node.opMu.RUnlock() break } child, childStat, err := parent.impl.Walk(name) parent.node.opMu.RUnlock() if err != nil { if err == unix.ENOENT { break } return err } // Update with next generation. closeParent() parent = child // Write results. childStat.MarshalUnsafe(payloadBuf[payloadPos:]) payloadPos += childStat.SizeBytes() numStats++ // Symlinks terminate walk. This client gets the symlink stat result, but // will have to invoke Walk again with the resolved path. if childStat.Mode&unix.S_IFMT == unix.S_IFLNK { break } } return nil }); err != nil { return 0, err } // WalkStatResp writes the number of stats in the beginning. numStats.MarshalUnsafe(payloadBuf) return uint32(payloadPos), nil } // OpenAtHandler handles the OpenAt RPC. func OpenAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req OpenAtReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } // Only keep allowed open flags. 
if allowedFlags := req.Flags & allowedOpenFlags; allowedFlags != req.Flags { log.Debugf("discarding open flags that are not allowed: old open flags = %d, new open flags = %d", req.Flags, allowedFlags) req.Flags = allowedFlags } accessMode := req.Flags & unix.O_ACCMODE trunc := req.Flags&unix.O_TRUNC != 0 if c.readonly && (accessMode != unix.O_RDONLY || trunc) { return 0, unix.EROFS } fd, err := c.lookupControlFD(req.FD) if err != nil { return 0, err } defer fd.DecRef(nil) if fd.IsDir() { // Directory is not truncatable and must be opened with O_RDONLY. if accessMode != unix.O_RDONLY || trunc { return 0, unix.EISDIR } } var ( openFD *OpenFD hostOpenFD int ) if err := fd.safelyRead(func() error { if fd.node.isDeleted() || fd.IsSymlink() { return unix.EINVAL } openFD, hostOpenFD, err = fd.impl.Open(req.Flags) return err }); err != nil { return 0, err } if hostOpenFD >= 0 { comm.DonateFD(hostOpenFD) } resp := OpenAtResp{OpenFD: openFD.id} respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } // OpenCreateAtHandler handles the OpenCreateAt RPC. func OpenCreateAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { if c.readonly { return 0, unix.EROFS } var req OpenCreateAtReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } // Only keep allowed open flags. if allowedFlags := req.Flags & allowedOpenFlags; allowedFlags != req.Flags { log.Debugf("discarding open flags that are not allowed: old open flags = %d, new open flags = %d", req.Flags, allowedFlags) req.Flags = allowedFlags } name := string(req.Name) if err := checkSafeName(name); err != nil { return 0, err } fd, err := c.lookupControlFD(req.DirFD) if err != nil { return 0, err } defer fd.DecRef(nil) if !fd.IsDir() { return 0, unix.ENOTDIR } var ( childFD *ControlFD childStat linux.Statx openFD *OpenFD hostOpenFD int ) if err := fd.safelyWrite(func() error { if fd.node.isDeleted() { return unix.EINVAL } childFD, childStat, openFD, hostOpenFD, err = fd.impl.OpenCreate(req.Mode, req.UID, req.GID, name, uint32(req.Flags)) return err }); err != nil { return 0, err } if hostOpenFD >= 0 { comm.DonateFD(hostOpenFD) } resp := OpenCreateAtResp{ NewFD: openFD.id, Child: Inode{ ControlFD: childFD.id, Stat: childStat, }, } respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } // CloseHandler handles the Close RPC. func CloseHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req CloseReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } for _, fd := range req.FDs { c.removeFD(fd) } // There is no response message for this. return 0, nil } // FSyncHandler handles the FSync RPC. func FSyncHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req FsyncReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } // Return the first error we encounter, but sync everything we can // regardless. var retErr error for _, fdid := range req.FDs { if err := c.fsyncFD(fdid); err != nil && retErr == nil { retErr = err } } // There is no response message for this. return 0, retErr } func (c *Connection) fsyncFD(id FDID) error { fd, err := c.lookupOpenFD(id) if err != nil { return err } defer fd.DecRef(nil) return fd.controlFD.safelyRead(func() error { return fd.impl.Sync() }) } // PWriteHandler handles the PWrite RPC. 
func PWriteHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { if c.readonly { return 0, unix.EROFS } var req PWriteReq // Note that it is an optimized Unmarshal operation which avoids any buffer // allocation and copying. req.Buf just points to payload. This is safe to do // as the handler owns payload and req's lifetime is limited to the handler. if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } fd, err := c.lookupOpenFD(req.FD) if err != nil { return 0, err } defer fd.DecRef(nil) if !fd.writable { return 0, unix.EBADF } var count uint64 if err := fd.controlFD.safelyWrite(func() error { count, err = fd.impl.Write(req.Buf, uint64(req.Offset)) return err }); err != nil { return 0, err } resp := PWriteResp{Count: count} respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } // PReadHandler handles the PRead RPC. func PReadHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req PReadReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } fd, err := c.lookupOpenFD(req.FD) if err != nil { return 0, err } defer fd.DecRef(nil) if !fd.readable { return 0, unix.EBADF } // To save an allocation and a copy, we directly read into the payload // buffer. The rest of the response message is manually marshalled. var resp PReadResp respMetaSize := uint32(resp.NumBytes.SizeBytes()) respPayloadLen := respMetaSize + req.Count if respPayloadLen > c.maxMessageSize { return 0, unix.ENOBUFS } payloadBuf := comm.PayloadBuf(respPayloadLen) var n uint64 if err := fd.controlFD.safelyRead(func() error { n, err = fd.impl.Read(payloadBuf[respMetaSize:], req.Offset) return err }); err != nil { return 0, err } // Write the response metadata onto the payload buffer. The response contents // already have been written immediately after it. resp.NumBytes = primitive.Uint64(n) resp.NumBytes.MarshalUnsafe(payloadBuf) return respMetaSize + uint32(n), nil } // MkdirAtHandler handles the MkdirAt RPC. func MkdirAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { if c.readonly { return 0, unix.EROFS } var req MkdirAtReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } name := string(req.Name) if err := checkSafeName(name); err != nil { return 0, err } fd, err := c.lookupControlFD(req.DirFD) if err != nil { return 0, err } defer fd.DecRef(nil) if !fd.IsDir() { return 0, unix.ENOTDIR } var ( childDir *ControlFD childDirStat linux.Statx ) if err := fd.safelyWrite(func() error { if fd.node.isDeleted() { return unix.EINVAL } childDir, childDirStat, err = fd.impl.Mkdir(req.Mode, req.UID, req.GID, name) return err }); err != nil { return 0, err } resp := MkdirAtResp{ ChildDir: Inode{ ControlFD: childDir.id, Stat: childDirStat, }, } respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } // MknodAtHandler handles the MknodAt RPC. 
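// Illustrative sketch with a hypothetical reader: the zero-copy reply layout
// used by PReadHandler above. File bytes are read straight into the payload
// buffer after a reserved length field, and the length is filled in
// afterwards, so the data never passes through an intermediate slice.
package sketch

import (
	"encoding/binary"
	"io"
)

// readIntoPayload returns the total reply length: an 8-byte count followed by
// up to count bytes read from r at offset off.
func readIntoPayload(payload []byte, r io.ReaderAt, off int64, count uint32) (uint32, error) {
	const metaSize = 8
	if uint32(len(payload)) < metaSize+count {
		return 0, io.ErrShortBuffer
	}
	n, err := r.ReadAt(payload[metaSize:metaSize+count], off)
	if err != nil && err != io.EOF {
		return 0, err
	}
	binary.LittleEndian.PutUint64(payload, uint64(n))
	return metaSize + uint32(n), nil
}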
func MknodAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { if c.readonly { return 0, unix.EROFS } var req MknodAtReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } name := string(req.Name) if err := checkSafeName(name); err != nil { return 0, err } fd, err := c.lookupControlFD(req.DirFD) if err != nil { return 0, err } defer fd.DecRef(nil) if !fd.IsDir() { return 0, unix.ENOTDIR } var ( child *ControlFD childStat linux.Statx ) if err := fd.safelyWrite(func() error { if fd.node.isDeleted() { return unix.EINVAL } child, childStat, err = fd.impl.Mknod(req.Mode, req.UID, req.GID, name, uint32(req.Minor), uint32(req.Major)) return err }); err != nil { return 0, err } resp := MknodAtResp{ Child: Inode{ ControlFD: child.id, Stat: childStat, }, } respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } // SymlinkAtHandler handles the SymlinkAt RPC. func SymlinkAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { if c.readonly { return 0, unix.EROFS } var req SymlinkAtReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } name := string(req.Name) if err := checkSafeName(name); err != nil { return 0, err } fd, err := c.lookupControlFD(req.DirFD) if err != nil { return 0, err } defer fd.DecRef(nil) if !fd.IsDir() { return 0, unix.ENOTDIR } var ( symlink *ControlFD symlinkStat linux.Statx ) if err := fd.safelyWrite(func() error { if fd.node.isDeleted() { return unix.EINVAL } symlink, symlinkStat, err = fd.impl.Symlink(name, string(req.Target), req.UID, req.GID) return err }); err != nil { return 0, err } resp := SymlinkAtResp{ Symlink: Inode{ ControlFD: symlink.id, Stat: symlinkStat, }, } respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } // LinkAtHandler handles the LinkAt RPC. func LinkAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { if c.readonly { return 0, unix.EROFS } var req LinkAtReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } name := string(req.Name) if err := checkSafeName(name); err != nil { return 0, err } fd, err := c.lookupControlFD(req.DirFD) if err != nil { return 0, err } defer fd.DecRef(nil) if !fd.IsDir() { return 0, unix.ENOTDIR } targetFD, err := c.lookupControlFD(req.Target) if err != nil { return 0, err } defer targetFD.DecRef(nil) if targetFD.IsDir() { // Can not create hard link to directory. return 0, unix.EPERM } var ( link *ControlFD linkStat linux.Statx ) if err := fd.safelyWrite(func() error { if fd.node.isDeleted() { return unix.EINVAL } // This is a lock ordering issue. Need to provide safe read guarantee for // targetFD. We know targetFD is not a directory while fd is a directory. // So targetFD would either be a descendant of fd or exist elsewhere in the // tree. So locking fd first and targetFD later should not lead to cycles. targetFD.node.opMu.RLock() defer targetFD.node.opMu.RUnlock() if targetFD.node.isDeleted() { return unix.EINVAL } link, linkStat, err = targetFD.impl.Link(fd.impl, name) return err }); err != nil { return 0, err } resp := LinkAtResp{ Link: Inode{ ControlFD: link.id, Stat: linkStat, }, } respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } // FStatFSHandler handles the FStatFS RPC. 
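// Illustrative sketch: a plausible stand-in for checkSafeName, which the
// handlers above apply to every client-supplied path component. The real
// implementation lives elsewhere in this package; what follows is an
// assumption about what such a check must reject for a single component, not
// the actual gVisor code.
package sketch

import (
	"errors"
	"strings"
)

var errUnsafeName = errors.New("unsafe path component") // stands in for unix.EINVAL

func checkSafeNameSketch(name string) error {
	if name == "" || name == "." || name == ".." {
		return errUnsafeName
	}
	if strings.ContainsAny(name, "/\x00") {
		return errUnsafeName
	}
	return nil
}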
func FStatFSHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req FStatFSReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } fd, err := c.lookupControlFD(req.FD) if err != nil { return 0, err } defer fd.DecRef(nil) var resp StatFS if err := fd.safelyRead(func() error { resp, err = fd.impl.StatFS() return err }); err != nil { return 0, err } respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } // FAllocateHandler handles the FAllocate RPC. func FAllocateHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { if c.readonly { return 0, unix.EROFS } var req FAllocateReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } fd, err := c.lookupOpenFD(req.FD) if err != nil { return 0, err } defer fd.DecRef(nil) if !fd.writable { return 0, unix.EBADF } return 0, fd.controlFD.safelyWrite(func() error { if fd.controlFD.node.isDeleted() && !c.server.opts.AllocateOnDeleted { return unix.EINVAL } return fd.impl.Allocate(req.Mode, req.Offset, req.Length) }) } // ReadLinkAtHandler handles the ReadLinkAt RPC. func ReadLinkAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req ReadLinkAtReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } fd, err := c.lookupControlFD(req.FD) if err != nil { return 0, err } defer fd.DecRef(nil) if !fd.IsSymlink() { return 0, unix.EINVAL } // We will manually marshal ReadLinkAtResp, which just contains a // SizedString. Let Readlinkat directly write into the payload buffer and // manually write the string size before it. var ( linkLen primitive.Uint16 n uint16 ) respMetaSize := uint32(linkLen.SizeBytes()) if fd.safelyRead(func() error { if fd.node.isDeleted() { return unix.EINVAL } n, err = fd.impl.Readlink(func(dataLen uint32) []byte { return comm.PayloadBuf(dataLen + respMetaSize)[respMetaSize:] }) return err }); err != nil { return 0, err } linkLen = primitive.Uint16(n) linkLen.MarshalUnsafe(comm.PayloadBuf(respMetaSize)) return respMetaSize + uint32(n), nil } // FlushHandler handles the Flush RPC. func FlushHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req FlushReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } fd, err := c.lookupOpenFD(req.FD) if err != nil { return 0, err } defer fd.DecRef(nil) return 0, fd.controlFD.safelyRead(func() error { return fd.impl.Flush() }) } // ConnectHandler handles the Connect RPC. func ConnectHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req ConnectReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } fd, err := c.lookupControlFD(req.FD) if err != nil { return 0, err } defer fd.DecRef(nil) if !fd.IsSocket() { return 0, unix.ENOTSOCK } var sock int if err := fd.safelyRead(func() error { if fd.node.isDeleted() { return unix.EINVAL } sock, err = fd.impl.Connect(req.SockType) return err }); err != nil { return 0, err } comm.DonateFD(sock) return 0, nil } // BindAtHandler handles the BindAt RPC. 
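// Illustrative sketch of the general mechanism, not the lisafs transport
// itself: handlers above hand host FDs back to the client via comm.DonateFD,
// and over a Unix domain socket such FDs travel as SCM_RIGHTS ancillary data.
// The helper names below are hypothetical.
package sketch

import "golang.org/x/sys/unix"

// sendFD transmits fd over the connected unix-domain socket sockFD along with
// a one-byte payload (ancillary data generally needs some payload to ride on).
func sendFD(sockFD, fd int) error {
	rights := unix.UnixRights(fd)
	return unix.Sendmsg(sockFD, []byte{0}, rights, nil, 0)
}

// recvFD receives a single FD sent by sendFD and returns it.
func recvFD(sockFD int) (int, error) {
	buf := make([]byte, 1)
	oob := make([]byte, unix.CmsgSpace(4)) // space for one 32-bit FD
	_, oobn, _, _, err := unix.Recvmsg(sockFD, buf, oob, 0)
	if err != nil {
		return -1, err
	}
	msgs, err := unix.ParseSocketControlMessage(oob[:oobn])
	if err != nil || len(msgs) == 0 {
		return -1, unix.EINVAL
	}
	fds, err := unix.ParseUnixRights(&msgs[0])
	if err != nil || len(fds) == 0 {
		return -1, unix.EINVAL
	}
	return fds[0], nil
}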
func BindAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req BindAtReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } name := string(req.Name) if err := checkSafeName(name); err != nil { return 0, err } dir, err := c.lookupControlFD(req.DirFD) if err != nil { return 0, err } defer dir.DecRef(nil) if !dir.IsDir() { return 0, unix.ENOTDIR } var ( childFD *ControlFD childStat linux.Statx boundSocketFD *BoundSocketFD hostSocketFD int ) if err := dir.safelyWrite(func() error { if dir.node.isDeleted() { return unix.EINVAL } childFD, childStat, boundSocketFD, hostSocketFD, err = dir.impl.BindAt(name, uint32(req.SockType), req.Mode, req.UID, req.GID) return err }); err != nil { return 0, err } comm.DonateFD(hostSocketFD) resp := BindAtResp{ Child: Inode{ ControlFD: childFD.id, Stat: childStat, }, BoundSocketFD: boundSocketFD.id, } respLen := uint32(resp.SizeBytes()) resp.MarshalUnsafe(comm.PayloadBuf(respLen)) return respLen, nil } // ListenHandler handles the Listen RPC. func ListenHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req ListenReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } sock, err := c.lookupBoundSocketFD(req.FD) if err != nil { return 0, err } if err := sock.controlFD.safelyRead(func() error { if sock.controlFD.node.isDeleted() { return unix.EINVAL } return sock.impl.Listen(req.Backlog) }); err != nil { return 0, err } return 0, nil } // AcceptHandler handles the Accept RPC. func AcceptHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req AcceptReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } sock, err := c.lookupBoundSocketFD(req.FD) if err != nil { return 0, err } var ( newSock int peerAddr string ) if err := sock.controlFD.safelyRead(func() error { if sock.controlFD.node.isDeleted() { return unix.EINVAL } var err error newSock, peerAddr, err = sock.impl.Accept() return err }); err != nil { return 0, err } comm.DonateFD(newSock) resp := AcceptResp{ PeerAddr: SizedString(peerAddr), } respLen := uint32(resp.SizeBytes()) resp.MarshalBytes(comm.PayloadBuf(respLen)) return respLen, nil } // UnlinkAtHandler handles the UnlinkAt RPC. func UnlinkAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { if c.readonly { return 0, unix.EROFS } var req UnlinkAtReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } name := string(req.Name) if err := checkSafeName(name); err != nil { return 0, err } fd, err := c.lookupControlFD(req.DirFD) if err != nil { return 0, err } defer fd.DecRef(nil) if !fd.IsDir() { return 0, unix.ENOTDIR } return 0, fd.safelyWrite(func() error { if fd.node.isDeleted() { return unix.EINVAL } fd.node.childrenMu.Lock() childNode := fd.node.LookupChildLocked(name) fd.node.childrenMu.Unlock() if childNode != nil { // Before we do the unlink itself, we need to ensure that there // are no operations in flight on associated path node. // // This is another case of a lock ordering issue, but since we always // acquire deeper in the hierarchy, we know that we are free of cycles. childNode.opMu.Lock() defer childNode.opMu.Unlock() } if err := fd.impl.Unlink(name, uint32(req.Flags)); err != nil { return err } // Since fd.node.opMu is locked for writing, there will not be a concurrent // creation of a node at that position if childNode == nil. So only remove // node if one existed. 
if childNode != nil { fd.node.childrenMu.Lock() fd.node.removeChildLocked(name) fd.node.childrenMu.Unlock() childNode.markDeletedRecursive() } return nil }) } // RenameAtHandler handles the RenameAt RPC. func RenameAtHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { if c.readonly { return 0, unix.EROFS } var req RenameAtReq if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } oldName := string(req.OldName) if err := checkSafeName(oldName); err != nil { return 0, err } newName := string(req.NewName) if err := checkSafeName(newName); err != nil { return 0, err } oldDir, err := c.lookupControlFD(req.OldDir) if err != nil { return 0, err } defer oldDir.DecRef(nil) newDir, err := c.lookupControlFD(req.NewDir) if err != nil { return 0, err } defer newDir.DecRef(nil) if !oldDir.IsDir() || !newDir.IsDir() { return 0, unix.ENOTDIR } // Hold RenameMu for writing during rename, this is important. return 0, oldDir.safelyGlobal(func() error { if oldDir.node.isDeleted() || newDir.node.isDeleted() { return unix.EINVAL } if oldDir.node == newDir.node && oldName == newName { // Nothing to do. return nil } // Attempt the actual rename. if err := oldDir.impl.RenameAt(oldName, newDir.impl, newName); err != nil { return err } // Successful, so update the node tree. Note that since we have global // concurrency guarantee here, the node tree can not be modified // concurrently in any way. // First see if a file was deleted by being replaced by the rename. If so, // detach it from node tree and mark it as deleted. newDir.node.childrenMu.Lock() replaced := newDir.node.removeChildLocked(newName) newDir.node.childrenMu.Unlock() if replaced != nil { replaced.opMu.Lock() replaced.markDeletedRecursive() replaced.opMu.Unlock() } // Now move the renamed node to the right position. oldDir.node.childrenMu.Lock() renamed := oldDir.node.removeChildLocked(oldName) oldDir.node.childrenMu.Unlock() if renamed != nil { renamed.parent.DecRef(nil) renamed.parent = newDir.node renamed.parent.IncRef() renamed.name = newName newDir.node.childrenMu.Lock() newDir.node.insertChildLocked(newName, renamed) newDir.node.childrenMu.Unlock() // Now update all FDs under the subtree rooted at renamed. notifyRenameRecursive(renamed) } return nil }) } func notifyRenameRecursive(n *Node) { n.forEachFD(func(cfd *ControlFD) { cfd.impl.Renamed() cfd.forEachOpenFD(func(ofd *OpenFD) { ofd.impl.Renamed() }) }) n.forEachChild(func(child *Node) { notifyRenameRecursive(child) }) } // Getdents64Handler handles the Getdents64 RPC. func Getdents64Handler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) { var req Getdents64Req if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok { return 0, unix.EIO } fd, err := c.lookupOpenFD(req.DirFD) if err != nil { return 0, err } defer fd.DecRef(nil) if !fd.controlFD.IsDir() { return 0, unix.ENOTDIR } seek0 := false if req.Count < 0 { seek0 = true req.Count = -req.Count } // We will manually marshal the response Getdents64Resp. // numDirents is the number of dirents marshalled into the payload. var numDirents primitive.Uint16 // The payload starts with numDirents, dirents go right after that. // payloadBufPos represents the position at which to write the next dirent. payloadBufPos := uint32(numDirents.SizeBytes()) // Request enough payloadBuf for 10 dirents, we will extend when needed. // unix.Dirent is 280 bytes for amd64. 
	payloadBuf := comm.PayloadBuf(payloadBufPos + 10*unixDirentMaxSize)
	if err := fd.controlFD.safelyRead(func() error {
		if fd.controlFD.node.isDeleted() {
			return unix.EINVAL
		}
		return fd.impl.Getdent64(uint32(req.Count), seek0, func(dirent Dirent64) {
			// Paste the dirent into the payload buffer without having the dirent
			// escape. Request a larger buffer if needed.
			if int(payloadBufPos)+dirent.SizeBytes() > len(payloadBuf) {
				// Ask for 10 large dirents worth of more space.
				payloadBuf = comm.PayloadBuf(payloadBufPos + 10*unixDirentMaxSize)
			}
			dirent.MarshalBytes(payloadBuf[payloadBufPos:])
			payloadBufPos += uint32(dirent.SizeBytes())
			numDirents++
		})
	}); err != nil {
		return 0, err
	}

	// The number of dirents goes at the beginning of the payload.
	numDirents.MarshalUnsafe(payloadBuf)
	return payloadBufPos, nil
}

// FGetXattrHandler handles the FGetXattr RPC.
func FGetXattrHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) {
	var req FGetXattrReq
	if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok {
		return 0, unix.EIO
	}

	fd, err := c.lookupControlFD(req.FD)
	if err != nil {
		return 0, err
	}
	defer fd.DecRef(nil)

	// Manually marshal FGetXattrResp to avoid allocations and copying.
	// FGetXattrResp simply is a wrapper around SizedString.
	var valueLen primitive.Uint16
	respMetaSize := uint32(valueLen.SizeBytes())
	var n uint16
	if err := fd.safelyRead(func() error {
		if fd.node.isDeleted() {
			return unix.EINVAL
		}
		n, err = fd.impl.GetXattr(string(req.Name), uint32(req.BufSize), func(dataLen uint32) []byte {
			return comm.PayloadBuf(dataLen + respMetaSize)[respMetaSize:]
		})
		return err
	}); err != nil {
		return 0, err
	}
	payloadBuf := comm.PayloadBuf(respMetaSize)
	valueLen = primitive.Uint16(n)
	valueLen.MarshalBytes(payloadBuf)
	return respMetaSize + uint32(n), nil
}

// FSetXattrHandler handles the FSetXattr RPC.
func FSetXattrHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) {
	if c.readonly {
		return 0, unix.EROFS
	}
	var req FSetXattrReq
	if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok {
		return 0, unix.EIO
	}

	fd, err := c.lookupControlFD(req.FD)
	if err != nil {
		return 0, err
	}
	defer fd.DecRef(nil)
	return 0, fd.safelyWrite(func() error {
		if fd.node.isDeleted() {
			return unix.EINVAL
		}
		return fd.impl.SetXattr(string(req.Name), string(req.Value), uint32(req.Flags))
	})
}

// FListXattrHandler handles the FListXattr RPC.
func FListXattrHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) {
	var req FListXattrReq
	if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok {
		return 0, unix.EIO
	}

	fd, err := c.lookupControlFD(req.FD)
	if err != nil {
		return 0, err
	}
	defer fd.DecRef(nil)

	var resp FListXattrResp
	if err := fd.safelyRead(func() error {
		if fd.node.isDeleted() {
			return unix.EINVAL
		}
		resp.Xattrs, err = fd.impl.ListXattr(req.Size)
		return err
	}); err != nil {
		return 0, err
	}
	respLen := uint32(resp.SizeBytes())
	resp.MarshalBytes(comm.PayloadBuf(respLen))
	return respLen, nil
}

// FRemoveXattrHandler handles the FRemoveXattr RPC.
func FRemoveXattrHandler(c *Connection, comm Communicator, payloadLen uint32) (uint32, error) {
	if c.readonly {
		return 0, unix.EROFS
	}
	var req FRemoveXattrReq
	if _, ok := req.CheckedUnmarshal(comm.PayloadBuf(payloadLen)); !ok {
		return 0, unix.EIO
	}

	fd, err := c.lookupControlFD(req.FD)
	if err != nil {
		return 0, err
	}
	defer fd.DecRef(nil)
	return 0, fd.safelyWrite(func() error {
		return fd.impl.RemoveXattr(string(req.Name))
	})
}

// checkSafeName validates the name and returns nil if it is safe, or an error otherwise.
func checkSafeName(name string) error {
	if name != "" && !strings.Contains(name, "/") && name != "." && name != ".." {
		return nil
	}
	return unix.EINVAL
}
golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/lisafs.go000066400000000000000000000027511465435605700225220ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package lisafs (LInux SAndbox FileSystem) defines the protocol for
// filesystem RPCs between an untrusted Sandbox (client) and a trusted
// filesystem server.
//
// Lock ordering:
//
//	Server.renameMu
//	  Node.opMu
//	    Node.childrenMu
//	    Node.controlFDsMu
//
// Locking rules:
//   - Node.childrenMu can be simultaneously held on multiple nodes, ancestors
//     before descendants.
//   - Node.opMu can be simultaneously held on multiple nodes, ancestors before
//     descendants.
//   - Node.opMu can be simultaneously held on two nodes that do not have an
//     ancestor-descendant relationship. One node must be an internal (directory)
//     node and the other a leaf (non-directory) node. Directory must be locked
//     before non-directories.
//   - "Ancestors before descendants" requires that Server.renameMu is locked to
//     ensure that this ordering remains satisfied.
package lisafs
golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/lisafs_abi_autogen_unsafe.go000066400000000000000000005570121465435605700264250ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal.
package lisafs

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/gohacks"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal"
	"io"
	"reflect"
	"runtime"
	"unsafe"
)

// Marshallable types used by this file.
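// The lisafs package comment above prescribes the lock order Server.renameMu,
// then Node.opMu, then Node.childrenMu, with ancestors locked before
// descendants. As an illustrative sketch only (not part of this generated
// file; lockForChildMutation, its parameters, and the exported mutex fields
// are hypothetical), a helper honoring that order would acquire the locks
// top-down and release them in reverse:
//
//	func lockForChildMutation(renameMu *sync.RWMutex, dir *Node) (unlock func()) {
//		renameMu.RLock()      // Server.renameMu is the outermost lock; read mode suffices here.
//		dir.opMu.Lock()       // Then the directory's opMu.
//		dir.childrenMu.Lock() // childrenMu is the innermost lock taken by this helper.
//		return func() {
//			dir.childrenMu.Unlock() // Release in reverse acquisition order.
//			dir.opMu.Unlock()
//			renameMu.RUnlock()
//		}
//	}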
var _ marshal.Marshallable = (*AcceptReq)(nil) var _ marshal.Marshallable = (*BindAtResp)(nil) var _ marshal.Marshallable = (*ChannelResp)(nil) var _ marshal.Marshallable = (*ConnectReq)(nil) var _ marshal.Marshallable = (*ErrorResp)(nil) var _ marshal.Marshallable = (*FAllocateReq)(nil) var _ marshal.Marshallable = (*FDID)(nil) var _ marshal.Marshallable = (*FListXattrReq)(nil) var _ marshal.Marshallable = (*FStatFSReq)(nil) var _ marshal.Marshallable = (*FlushReq)(nil) var _ marshal.Marshallable = (*GID)(nil) var _ marshal.Marshallable = (*Getdents64Req)(nil) var _ marshal.Marshallable = (*Inode)(nil) var _ marshal.Marshallable = (*LinkAtResp)(nil) var _ marshal.Marshallable = (*ListenReq)(nil) var _ marshal.Marshallable = (*MID)(nil) var _ marshal.Marshallable = (*MkdirAtResp)(nil) var _ marshal.Marshallable = (*MknodAtResp)(nil) var _ marshal.Marshallable = (*MsgDynamic)(nil) var _ marshal.Marshallable = (*MsgSimple)(nil) var _ marshal.Marshallable = (*OpenAtReq)(nil) var _ marshal.Marshallable = (*OpenAtResp)(nil) var _ marshal.Marshallable = (*OpenCreateAtResp)(nil) var _ marshal.Marshallable = (*PReadReq)(nil) var _ marshal.Marshallable = (*PWriteResp)(nil) var _ marshal.Marshallable = (*ReadLinkAtReq)(nil) var _ marshal.Marshallable = (*SetStatReq)(nil) var _ marshal.Marshallable = (*SetStatResp)(nil) var _ marshal.Marshallable = (*StatFS)(nil) var _ marshal.Marshallable = (*StatReq)(nil) var _ marshal.Marshallable = (*SymlinkAtResp)(nil) var _ marshal.Marshallable = (*UID)(nil) var _ marshal.Marshallable = (*channelHeader)(nil) var _ marshal.Marshallable = (*createCommon)(nil) var _ marshal.Marshallable = (*linux.FileMode)(nil) var _ marshal.Marshallable = (*linux.Statx)(nil) var _ marshal.Marshallable = (*linux.Timespec)(nil) var _ marshal.Marshallable = (*sockHeader)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. func (c *channelHeader) SizeBytes() int { return 2 + (*MID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *channelHeader) MarshalBytes(dst []byte) []byte { dst = c.message.MarshalUnsafe(dst) dst[0] = byte(c.numFDs) dst = dst[1:] // Padding: dst[:sizeof(uint8)] ~= uint8(0) dst = dst[1:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (c *channelHeader) UnmarshalBytes(src []byte) []byte { src = c.message.UnmarshalUnsafe(src) c.numFDs = uint8(src[0]) src = src[1:] // Padding: var _ uint8 ~= src[:sizeof(uint8)] src = src[1:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (c *channelHeader) Packed() bool { return c.message.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (c *channelHeader) MarshalUnsafe(dst []byte) []byte { if c.message.Packed() { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(c), uintptr(size)) return dst[size:] } // Type channelHeader doesn't have a packed layout in memory, fallback to MarshalBytes. return c.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (c *channelHeader) UnmarshalUnsafe(src []byte) []byte { if c.message.Packed() { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(c), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type channelHeader doesn't have a packed layout in memory, fallback to UnmarshalBytes. return c.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (c *channelHeader) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !c.message.Packed() { // Type channelHeader doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(c.SizeBytes()) // escapes: okay. c.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (c *channelHeader) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyOutN(cc, addr, c.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (c *channelHeader) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !c.message.Packed() { // Type channelHeader doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(c.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. c.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (c *channelHeader) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyInN(cc, addr, c.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (c *channelHeader) WriteTo(writer io.Writer) (int64, error) { if !c.message.Packed() { // Type channelHeader doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, c.SizeBytes()) c.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (f *FDID) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FDID) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(*f)) return dst[8:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (f *FDID) UnmarshalBytes(src []byte) []byte { *f = FDID(uint64(hostarch.ByteOrder.Uint64(src[:8]))) return src[8:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FDID) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FDID) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FDID) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FDID) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FDID) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FDID) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FDID) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FDID) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (f *FDID) CheckedMarshal(dst []byte) ([]byte, bool) { size := f.SizeBytes() if size > len(dst) { return dst, false } gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:], true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (f *FDID) CheckedUnmarshal(src []byte) ([]byte, bool) { size := f.SizeBytes() if size > len(src) { return src, false } gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:], true } // CopyFDIDSliceIn copies in a slice of FDID objects from the task's memory. 
func CopyFDIDSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []FDID) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*FDID)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyFDIDSliceOut copies a slice of FDID objects to the task's memory. func CopyFDIDSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []FDID) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*FDID)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeFDIDSlice is like FDID.MarshalUnsafe, but for a []FDID. func MarshalUnsafeFDIDSlice(src []FDID, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*FDID)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeFDIDSlice is like FDID.UnmarshalUnsafe, but for a []FDID. func UnmarshalUnsafeFDIDSlice(dst []FDID, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*FDID)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (a *AcceptReq) SizeBytes() int { return 0 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (a *AcceptReq) MarshalBytes(dst []byte) []byte { dst = a.FD.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (a *AcceptReq) UnmarshalBytes(src []byte) []byte { src = a.FD.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (a *AcceptReq) Packed() bool { return a.FD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (a *AcceptReq) MarshalUnsafe(dst []byte) []byte { if a.FD.Packed() { size := a.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(a), uintptr(size)) return dst[size:] } // Type AcceptReq doesn't have a packed layout in memory, fallback to MarshalBytes. return a.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (a *AcceptReq) UnmarshalUnsafe(src []byte) []byte { if a.FD.Packed() { size := a.SizeBytes() gohacks.Memmove(unsafe.Pointer(a), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type AcceptReq doesn't have a packed layout in memory, fallback to UnmarshalBytes. 
return a.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (a *AcceptReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !a.FD.Packed() { // Type AcceptReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(a.SizeBytes()) // escapes: okay. a.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(a))) hdr.Len = a.SizeBytes() hdr.Cap = a.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that a // must live until the use above. runtime.KeepAlive(a) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (a *AcceptReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return a.CopyOutN(cc, addr, a.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (a *AcceptReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !a.FD.Packed() { // Type AcceptReq doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(a.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. a.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(a))) hdr.Len = a.SizeBytes() hdr.Cap = a.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that a // must live until the use above. runtime.KeepAlive(a) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (a *AcceptReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return a.CopyInN(cc, addr, a.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (a *AcceptReq) WriteTo(writer io.Writer) (int64, error) { if !a.FD.Packed() { // Type AcceptReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, a.SizeBytes()) a.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(a))) hdr.Len = a.SizeBytes() hdr.Cap = a.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that a // must live until the use above. runtime.KeepAlive(a) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (a *AcceptReq) CheckedMarshal(dst []byte) ([]byte, bool) { if a.SizeBytes() > len(dst) { return dst, false } return a.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. 
func (a *AcceptReq) CheckedUnmarshal(src []byte) ([]byte, bool) { if a.SizeBytes() > len(src) { return src, false } return a.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (b *BindAtResp) SizeBytes() int { return 0 + (*Inode)(nil).SizeBytes() + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (b *BindAtResp) MarshalBytes(dst []byte) []byte { dst = b.Child.MarshalUnsafe(dst) dst = b.BoundSocketFD.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (b *BindAtResp) UnmarshalBytes(src []byte) []byte { src = b.Child.UnmarshalUnsafe(src) src = b.BoundSocketFD.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (b *BindAtResp) Packed() bool { return b.BoundSocketFD.Packed() && b.Child.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (b *BindAtResp) MarshalUnsafe(dst []byte) []byte { if b.BoundSocketFD.Packed() && b.Child.Packed() { size := b.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(b), uintptr(size)) return dst[size:] } // Type BindAtResp doesn't have a packed layout in memory, fallback to MarshalBytes. return b.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (b *BindAtResp) UnmarshalUnsafe(src []byte) []byte { if b.BoundSocketFD.Packed() && b.Child.Packed() { size := b.SizeBytes() gohacks.Memmove(unsafe.Pointer(b), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type BindAtResp doesn't have a packed layout in memory, fallback to UnmarshalBytes. return b.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (b *BindAtResp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !b.BoundSocketFD.Packed() && b.Child.Packed() { // Type BindAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(b.SizeBytes()) // escapes: okay. b.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(b))) hdr.Len = b.SizeBytes() hdr.Cap = b.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that b // must live until the use above. runtime.KeepAlive(b) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (b *BindAtResp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return b.CopyOutN(cc, addr, b.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (b *BindAtResp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !b.BoundSocketFD.Packed() && b.Child.Packed() { // Type BindAtResp doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(b.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. b.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(b))) hdr.Len = b.SizeBytes() hdr.Cap = b.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that b // must live until the use above. runtime.KeepAlive(b) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (b *BindAtResp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return b.CopyInN(cc, addr, b.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (b *BindAtResp) WriteTo(writer io.Writer) (int64, error) { if !b.BoundSocketFD.Packed() && b.Child.Packed() { // Type BindAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, b.SizeBytes()) b.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(b))) hdr.Len = b.SizeBytes() hdr.Cap = b.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that b // must live until the use above. runtime.KeepAlive(b) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (b *BindAtResp) CheckedMarshal(dst []byte) ([]byte, bool) { if b.SizeBytes() > len(dst) { return dst, false } return b.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (b *BindAtResp) CheckedUnmarshal(src []byte) ([]byte, bool) { if b.SizeBytes() > len(src) { return src, false } return b.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (c *ChannelResp) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *ChannelResp) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.dataOffset)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(c.dataLength)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (c *ChannelResp) UnmarshalBytes(src []byte) []byte { c.dataOffset = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] c.dataLength = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (c *ChannelResp) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (c *ChannelResp) MarshalUnsafe(dst []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(c), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (c *ChannelResp) UnmarshalUnsafe(src []byte) []byte { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(c), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (c *ChannelResp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (c *ChannelResp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyOutN(cc, addr, c.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (c *ChannelResp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (c *ChannelResp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyInN(cc, addr, c.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (c *ChannelResp) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (c *ChannelResp) CheckedMarshal(dst []byte) ([]byte, bool) { if c.SizeBytes() > len(dst) { return dst, false } return c.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (c *ChannelResp) CheckedUnmarshal(src []byte) ([]byte, bool) { if c.SizeBytes() > len(src) { return src, false } return c.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (c *ConnectReq) SizeBytes() int { return 8 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *ConnectReq) MarshalBytes(dst []byte) []byte { dst = c.FD.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(c.SockType)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (c *ConnectReq) UnmarshalBytes(src []byte) []byte { src = c.FD.UnmarshalUnsafe(src) c.SockType = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (c *ConnectReq) Packed() bool { return c.FD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (c *ConnectReq) MarshalUnsafe(dst []byte) []byte { if c.FD.Packed() { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(c), uintptr(size)) return dst[size:] } // Type ConnectReq doesn't have a packed layout in memory, fallback to MarshalBytes. return c.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (c *ConnectReq) UnmarshalUnsafe(src []byte) []byte { if c.FD.Packed() { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(c), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type ConnectReq doesn't have a packed layout in memory, fallback to UnmarshalBytes. return c.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (c *ConnectReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !c.FD.Packed() { // Type ConnectReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(c.SizeBytes()) // escapes: okay. c.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (c *ConnectReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyOutN(cc, addr, c.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (c *ConnectReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !c.FD.Packed() { // Type ConnectReq doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(c.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. c.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (c *ConnectReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyInN(cc, addr, c.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (c *ConnectReq) WriteTo(writer io.Writer) (int64, error) { if !c.FD.Packed() { // Type ConnectReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, c.SizeBytes()) c.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (c *ConnectReq) CheckedMarshal(dst []byte) ([]byte, bool) { if c.SizeBytes() > len(dst) { return dst, false } return c.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (c *ConnectReq) CheckedUnmarshal(src []byte) ([]byte, bool) { if c.SizeBytes() > len(src) { return src, false } return c.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (e *ErrorResp) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (e *ErrorResp) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(e.errno)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (e *ErrorResp) UnmarshalBytes(src []byte) []byte { e.errno = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (e *ErrorResp) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (e *ErrorResp) MarshalUnsafe(dst []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(e), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (e *ErrorResp) UnmarshalUnsafe(src []byte) []byte { size := e.SizeBytes() gohacks.Memmove(unsafe.Pointer(e), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (e *ErrorResp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (e *ErrorResp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyOutN(cc, addr, e.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (e *ErrorResp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
func (e *ErrorResp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return e.CopyInN(cc, addr, e.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (e *ErrorResp) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(e))) hdr.Len = e.SizeBytes() hdr.Cap = e.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that e // must live until the use above. runtime.KeepAlive(e) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (a *FAllocateReq) SizeBytes() int { return 24 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (a *FAllocateReq) MarshalBytes(dst []byte) []byte { dst = a.FD.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(a.Mode)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(a.Offset)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(a.Length)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (a *FAllocateReq) UnmarshalBytes(src []byte) []byte { src = a.FD.UnmarshalUnsafe(src) a.Mode = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] a.Offset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] a.Length = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (a *FAllocateReq) Packed() bool { return a.FD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (a *FAllocateReq) MarshalUnsafe(dst []byte) []byte { if a.FD.Packed() { size := a.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(a), uintptr(size)) return dst[size:] } // Type FAllocateReq doesn't have a packed layout in memory, fallback to MarshalBytes. return a.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (a *FAllocateReq) UnmarshalUnsafe(src []byte) []byte { if a.FD.Packed() { size := a.SizeBytes() gohacks.Memmove(unsafe.Pointer(a), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type FAllocateReq doesn't have a packed layout in memory, fallback to UnmarshalBytes. return a.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (a *FAllocateReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !a.FD.Packed() { // Type FAllocateReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(a.SizeBytes()) // escapes: okay. a.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(a))) hdr.Len = a.SizeBytes() hdr.Cap = a.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that a // must live until the use above. runtime.KeepAlive(a) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (a *FAllocateReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return a.CopyOutN(cc, addr, a.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (a *FAllocateReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !a.FD.Packed() { // Type FAllocateReq doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(a.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. a.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(a))) hdr.Len = a.SizeBytes() hdr.Cap = a.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that a // must live until the use above. runtime.KeepAlive(a) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (a *FAllocateReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return a.CopyInN(cc, addr, a.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (a *FAllocateReq) WriteTo(writer io.Writer) (int64, error) { if !a.FD.Packed() { // Type FAllocateReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, a.SizeBytes()) a.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(a))) hdr.Len = a.SizeBytes() hdr.Cap = a.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that a // must live until the use above. runtime.KeepAlive(a) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (a *FAllocateReq) CheckedMarshal(dst []byte) ([]byte, bool) { if a.SizeBytes() > len(dst) { return dst, false } return a.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (a *FAllocateReq) CheckedUnmarshal(src []byte) ([]byte, bool) { if a.SizeBytes() > len(src) { return src, false } return a.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (l *FListXattrReq) SizeBytes() int { return 8 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (l *FListXattrReq) MarshalBytes(dst []byte) []byte { dst = l.FD.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(l.Size)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (l *FListXattrReq) UnmarshalBytes(src []byte) []byte { src = l.FD.UnmarshalUnsafe(src) l.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (l *FListXattrReq) Packed() bool { return l.FD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
func (l *FListXattrReq) MarshalUnsafe(dst []byte) []byte { if l.FD.Packed() { size := l.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(l), uintptr(size)) return dst[size:] } // Type FListXattrReq doesn't have a packed layout in memory, fallback to MarshalBytes. return l.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (l *FListXattrReq) UnmarshalUnsafe(src []byte) []byte { if l.FD.Packed() { size := l.SizeBytes() gohacks.Memmove(unsafe.Pointer(l), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type FListXattrReq doesn't have a packed layout in memory, fallback to UnmarshalBytes. return l.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (l *FListXattrReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !l.FD.Packed() { // Type FListXattrReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(l.SizeBytes()) // escapes: okay. l.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) hdr.Len = l.SizeBytes() hdr.Cap = l.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that l // must live until the use above. runtime.KeepAlive(l) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (l *FListXattrReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return l.CopyOutN(cc, addr, l.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (l *FListXattrReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !l.FD.Packed() { // Type FListXattrReq doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(l.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. l.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) hdr.Len = l.SizeBytes() hdr.Cap = l.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that l // must live until the use above. runtime.KeepAlive(l) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (l *FListXattrReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return l.CopyInN(cc, addr, l.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (l *FListXattrReq) WriteTo(writer io.Writer) (int64, error) { if !l.FD.Packed() { // Type FListXattrReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, l.SizeBytes()) l.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) hdr.Len = l.SizeBytes() hdr.Cap = l.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that l // must live until the use above. runtime.KeepAlive(l) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (l *FListXattrReq) CheckedMarshal(dst []byte) ([]byte, bool) { if l.SizeBytes() > len(dst) { return dst, false } return l.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (l *FListXattrReq) CheckedUnmarshal(src []byte) ([]byte, bool) { if l.SizeBytes() > len(src) { return src, false } return l.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *FStatFSReq) SizeBytes() int { return 0 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *FStatFSReq) MarshalBytes(dst []byte) []byte { dst = s.FD.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *FStatFSReq) UnmarshalBytes(src []byte) []byte { src = s.FD.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *FStatFSReq) Packed() bool { return s.FD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *FStatFSReq) MarshalUnsafe(dst []byte) []byte { if s.FD.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type FStatFSReq doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *FStatFSReq) UnmarshalUnsafe(src []byte) []byte { if s.FD.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type FStatFSReq doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *FStatFSReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.FD.Packed() { // Type FStatFSReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *FStatFSReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *FStatFSReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.FD.Packed() { // Type FStatFSReq doesn't have a packed layout in memory, fall back to UnmarshalBytes. 
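// In this fallback the task's bytes are first copied into a reusable scratch
// buffer from cc.CopyScratchBuffer and only then decoded field by field with
// UnmarshalBytes, so no unsafe aliasing of s is needed.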
buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *FStatFSReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *FStatFSReq) WriteTo(writer io.Writer) (int64, error) { if !s.FD.Packed() { // Type FStatFSReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (s *FStatFSReq) CheckedMarshal(dst []byte) ([]byte, bool) { if s.SizeBytes() > len(dst) { return dst, false } return s.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (s *FStatFSReq) CheckedUnmarshal(src []byte) ([]byte, bool) { if s.SizeBytes() > len(src) { return src, false } return s.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FlushReq) SizeBytes() int { return 0 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FlushReq) MarshalBytes(dst []byte) []byte { dst = f.FD.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FlushReq) UnmarshalBytes(src []byte) []byte { src = f.FD.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FlushReq) Packed() bool { return f.FD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FlushReq) MarshalUnsafe(dst []byte) []byte { if f.FD.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // Type FlushReq doesn't have a packed layout in memory, fallback to MarshalBytes. return f.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FlushReq) UnmarshalUnsafe(src []byte) []byte { if f.FD.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type FlushReq doesn't have a packed layout in memory, fallback to UnmarshalBytes. 
return f.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FlushReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.FD.Packed() { // Type FlushReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. f.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FlushReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FlushReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.FD.Packed() { // Type FlushReq doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. f.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FlushReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FlushReq) WriteTo(writer io.Writer) (int64, error) { if !f.FD.Packed() { // Type FlushReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, f.SizeBytes()) f.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (f *FlushReq) CheckedMarshal(dst []byte) ([]byte, bool) { if f.SizeBytes() > len(dst) { return dst, false } return f.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. 
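// Unlike the Unsafe variants, the Checked variants first verify that the
// buffer is at least SizeBytes() long and report false instead of reading or
// writing out of bounds when it is too short.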
func (f *FlushReq) CheckedUnmarshal(src []byte) ([]byte, bool) { if f.SizeBytes() > len(src) { return src, false } return f.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (gid *GID) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (gid *GID) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*gid)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (gid *GID) UnmarshalBytes(src []byte) []byte { *gid = GID(uint32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (gid *GID) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (gid *GID) MarshalUnsafe(dst []byte) []byte { size := gid.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(gid), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (gid *GID) UnmarshalUnsafe(src []byte) []byte { size := gid.SizeBytes() gohacks.Memmove(unsafe.Pointer(gid), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (gid *GID) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(gid))) hdr.Len = gid.SizeBytes() hdr.Cap = gid.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that gid // must live until the use above. runtime.KeepAlive(gid) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (gid *GID) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return gid.CopyOutN(cc, addr, gid.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (gid *GID) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(gid))) hdr.Len = gid.SizeBytes() hdr.Cap = gid.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that gid // must live until the use above. runtime.KeepAlive(gid) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (gid *GID) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return gid.CopyInN(cc, addr, gid.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (gid *GID) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(gid))) hdr.Len = gid.SizeBytes() hdr.Cap = gid.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that gid // must live until the use above. runtime.KeepAlive(gid) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. 
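// The fixed 8 bytes cover the 4-byte Count field plus 4 bytes of explicit
// padding written by MarshalBytes below; the FDID's size is added separately.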
func (g *Getdents64Req) SizeBytes() int { return 8 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (g *Getdents64Req) MarshalBytes(dst []byte) []byte { dst = g.DirFD.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(g.Count)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (g *Getdents64Req) UnmarshalBytes(src []byte) []byte { src = g.DirFD.UnmarshalUnsafe(src) g.Count = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (g *Getdents64Req) Packed() bool { return g.DirFD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (g *Getdents64Req) MarshalUnsafe(dst []byte) []byte { if g.DirFD.Packed() { size := g.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(g), uintptr(size)) return dst[size:] } // Type Getdents64Req doesn't have a packed layout in memory, fallback to MarshalBytes. return g.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (g *Getdents64Req) UnmarshalUnsafe(src []byte) []byte { if g.DirFD.Packed() { size := g.SizeBytes() gohacks.Memmove(unsafe.Pointer(g), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type Getdents64Req doesn't have a packed layout in memory, fallback to UnmarshalBytes. return g.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (g *Getdents64Req) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !g.DirFD.Packed() { // Type Getdents64Req doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(g.SizeBytes()) // escapes: okay. g.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(g))) hdr.Len = g.SizeBytes() hdr.Cap = g.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that g // must live until the use above. runtime.KeepAlive(g) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (g *Getdents64Req) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return g.CopyOutN(cc, addr, g.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (g *Getdents64Req) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !g.DirFD.Packed() { // Type Getdents64Req doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(g.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. g.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(g))) hdr.Len = g.SizeBytes() hdr.Cap = g.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
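// CopyInBytes fills g in place through the aliased buffer, so a successful
// return leaves the struct populated with no further decoding step; the
// KeepAlive below pins g until that call has returned.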
// Since we bypassed the compiler's escape analysis, indicate that g // must live until the use above. runtime.KeepAlive(g) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (g *Getdents64Req) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return g.CopyInN(cc, addr, g.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (g *Getdents64Req) WriteTo(writer io.Writer) (int64, error) { if !g.DirFD.Packed() { // Type Getdents64Req doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, g.SizeBytes()) g.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(g))) hdr.Len = g.SizeBytes() hdr.Cap = g.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that g // must live until the use above. runtime.KeepAlive(g) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (g *Getdents64Req) CheckedMarshal(dst []byte) ([]byte, bool) { if g.SizeBytes() > len(dst) { return dst, false } return g.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (g *Getdents64Req) CheckedUnmarshal(src []byte) ([]byte, bool) { if g.SizeBytes() > len(src) { return src, false } return g.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (i *Inode) SizeBytes() int { return 0 + (*FDID)(nil).SizeBytes() + (*linux.Statx)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *Inode) MarshalBytes(dst []byte) []byte { dst = i.ControlFD.MarshalUnsafe(dst) dst = i.Stat.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *Inode) UnmarshalBytes(src []byte) []byte { src = i.ControlFD.UnmarshalUnsafe(src) src = i.Stat.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *Inode) Packed() bool { return i.ControlFD.Packed() && i.Stat.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *Inode) MarshalUnsafe(dst []byte) []byte { if i.ControlFD.Packed() && i.Stat.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // Type Inode doesn't have a packed layout in memory, fallback to MarshalBytes. return i.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *Inode) UnmarshalUnsafe(src []byte) []byte { if i.ControlFD.Packed() && i.Stat.Packed() { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type Inode doesn't have a packed layout in memory, fallback to UnmarshalBytes. return i.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *Inode) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.ControlFD.Packed() && i.Stat.Packed() { // Type Inode doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. i.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. 
} // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *Inode) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *Inode) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !i.ControlFD.Packed() && i.Stat.Packed() { // Type Inode doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(i.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. i.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *Inode) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *Inode) WriteTo(writer io.Writer) (int64, error) { if !i.ControlFD.Packed() && i.Stat.Packed() { // Type Inode doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, i.SizeBytes()) i.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // CopyInodeSliceIn copies in a slice of Inode objects from the task's memory. func CopyInodeSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []Inode) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Inode)(nil).SizeBytes() if !dst[0].Packed() { // Type Inode doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(size * count) length, err := cc.CopyInBytes(addr, buf) // Unmarshal as much as possible, even on error. First handle full objects. limit := length/size for idx := 0; idx < limit; idx++ { buf = dst[idx].UnmarshalBytes(buf) } // Handle any final partial object. buf is guaranteed to be long enough for the // final element, but may not contain valid data for the entire range. This may // result in unmarshalling zero values for some parts of the object. 
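// length/size full elements were decoded above; if the copy-in stopped partway
// through an element, the trailing element is still decoded from the scratch
// buffer, and the returned length tells the caller how many bytes are valid.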
if length%size != 0 { dst[limit].UnmarshalBytes(buf) } return length, err } ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyInodeSliceOut copies a slice of Inode objects to the task's memory. func CopyInodeSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []Inode) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*Inode)(nil).SizeBytes() if !src[0].Packed() { // Type Inode doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(size * count) curBuf := buf for idx := 0; idx < count; idx++ { curBuf = src[idx].MarshalBytes(curBuf) } return cc.CopyOutBytes(addr, buf) } ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeInodeSlice is like Inode.MarshalUnsafe, but for a []Inode. func MarshalUnsafeInodeSlice(src []Inode, dst []byte) []byte { count := len(src) if count == 0 { return dst } if !src[0].Packed() { // Type Inode doesn't have a packed layout in memory, fall back to MarshalBytes. for idx := 0; idx < count; idx++ { dst = src[idx].MarshalBytes(dst) } return dst } size := (*Inode)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeInodeSlice is like Inode.UnmarshalUnsafe, but for a []Inode. func UnmarshalUnsafeInodeSlice(dst []Inode, src []byte) []byte { count := len(dst) if count == 0 { return src } if !dst[0].Packed() { // Type Inode doesn't have a packed layout in memory, fall back to UnmarshalBytes. for idx := 0; idx < count; idx++ { src = dst[idx].UnmarshalBytes(src) } return src } size := (*Inode)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (l *LinkAtResp) SizeBytes() int { return 0 + (*Inode)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (l *LinkAtResp) MarshalBytes(dst []byte) []byte { dst = l.Link.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (l *LinkAtResp) UnmarshalBytes(src []byte) []byte { src = l.Link.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (l *LinkAtResp) Packed() bool { return l.Link.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. 
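// The Inode slice helpers above follow the same two paths as the per-object
// methods: a single gohacks.Memmove over size*count bytes when Inode is
// packed, and an element-by-element MarshalBytes/UnmarshalBytes loop when not.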
func (l *LinkAtResp) MarshalUnsafe(dst []byte) []byte { if l.Link.Packed() { size := l.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(l), uintptr(size)) return dst[size:] } // Type LinkAtResp doesn't have a packed layout in memory, fallback to MarshalBytes. return l.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (l *LinkAtResp) UnmarshalUnsafe(src []byte) []byte { if l.Link.Packed() { size := l.SizeBytes() gohacks.Memmove(unsafe.Pointer(l), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type LinkAtResp doesn't have a packed layout in memory, fallback to UnmarshalBytes. return l.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (l *LinkAtResp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !l.Link.Packed() { // Type LinkAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(l.SizeBytes()) // escapes: okay. l.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) hdr.Len = l.SizeBytes() hdr.Cap = l.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that l // must live until the use above. runtime.KeepAlive(l) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (l *LinkAtResp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return l.CopyOutN(cc, addr, l.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (l *LinkAtResp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !l.Link.Packed() { // Type LinkAtResp doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(l.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. l.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) hdr.Len = l.SizeBytes() hdr.Cap = l.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that l // must live until the use above. runtime.KeepAlive(l) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (l *LinkAtResp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return l.CopyInN(cc, addr, l.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (l *LinkAtResp) WriteTo(writer io.Writer) (int64, error) { if !l.Link.Packed() { // Type LinkAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, l.SizeBytes()) l.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) hdr.Len = l.SizeBytes() hdr.Cap = l.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that l // must live until the use above. runtime.KeepAlive(l) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (l *LinkAtResp) CheckedMarshal(dst []byte) ([]byte, bool) { if l.SizeBytes() > len(dst) { return dst, false } return l.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (l *LinkAtResp) CheckedUnmarshal(src []byte) ([]byte, bool) { if l.SizeBytes() > len(src) { return src, false } return l.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (l *ListenReq) SizeBytes() int { return 8 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (l *ListenReq) MarshalBytes(dst []byte) []byte { dst = l.FD.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(l.Backlog)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (l *ListenReq) UnmarshalBytes(src []byte) []byte { src = l.FD.UnmarshalUnsafe(src) l.Backlog = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (l *ListenReq) Packed() bool { return l.FD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (l *ListenReq) MarshalUnsafe(dst []byte) []byte { if l.FD.Packed() { size := l.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(l), uintptr(size)) return dst[size:] } // Type ListenReq doesn't have a packed layout in memory, fallback to MarshalBytes. return l.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (l *ListenReq) UnmarshalUnsafe(src []byte) []byte { if l.FD.Packed() { size := l.SizeBytes() gohacks.Memmove(unsafe.Pointer(l), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type ListenReq doesn't have a packed layout in memory, fallback to UnmarshalBytes. return l.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (l *ListenReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !l.FD.Packed() { // Type ListenReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(l.SizeBytes()) // escapes: okay. l.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) hdr.Len = l.SizeBytes() hdr.Cap = l.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that l // must live until the use above. runtime.KeepAlive(l) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (l *ListenReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return l.CopyOutN(cc, addr, l.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (l *ListenReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !l.FD.Packed() { // Type ListenReq doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(l.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. l.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) hdr.Len = l.SizeBytes() hdr.Cap = l.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that l // must live until the use above. runtime.KeepAlive(l) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (l *ListenReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return l.CopyInN(cc, addr, l.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (l *ListenReq) WriteTo(writer io.Writer) (int64, error) { if !l.FD.Packed() { // Type ListenReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, l.SizeBytes()) l.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(l))) hdr.Len = l.SizeBytes() hdr.Cap = l.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that l // must live until the use above. runtime.KeepAlive(l) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (l *ListenReq) CheckedMarshal(dst []byte) ([]byte, bool) { if l.SizeBytes() > len(dst) { return dst, false } return l.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (l *ListenReq) CheckedUnmarshal(src []byte) ([]byte, bool) { if l.SizeBytes() > len(src) { return src, false } return l.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (m *MID) SizeBytes() int { return 2 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (m *MID) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(*m)) return dst[2:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (m *MID) UnmarshalBytes(src []byte) []byte { *m = MID(uint16(hostarch.ByteOrder.Uint16(src[:2]))) return src[2:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (m *MID) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (m *MID) MarshalUnsafe(dst []byte) []byte { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(m), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
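// MID, like GID above, is a scalar newtype: it is always packed, so the
// unsafe marshal/unmarshal paths never need a byte-by-byte fallback and simply
// move the 2-byte value directly.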
func (m *MID) UnmarshalUnsafe(src []byte) []byte { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(m), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (m *MID) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (m *MID) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyOutN(cc, addr, m.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (m *MID) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (m *MID) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyInN(cc, addr, m.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (m *MID) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return int64(length), err } // CopyMIDSliceIn copies in a slice of MID objects from the task's memory. func CopyMIDSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []MID) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*MID)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyMIDSliceOut copies a slice of MID objects to the task's memory. func CopyMIDSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []MID) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*MID)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. 
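// Because MID is always packed, the slice helpers here go straight to the
// aliased-buffer path; there is no per-element fallback loop like the one the
// Inode slice helpers carry.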
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeMIDSlice is like MID.MarshalUnsafe, but for a []MID. func MarshalUnsafeMIDSlice(src []MID, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*MID)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeMIDSlice is like MID.UnmarshalUnsafe, but for a []MID. func UnmarshalUnsafeMIDSlice(dst []MID, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*MID)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (m *MkdirAtResp) SizeBytes() int { return 0 + (*Inode)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (m *MkdirAtResp) MarshalBytes(dst []byte) []byte { dst = m.ChildDir.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (m *MkdirAtResp) UnmarshalBytes(src []byte) []byte { src = m.ChildDir.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (m *MkdirAtResp) Packed() bool { return m.ChildDir.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (m *MkdirAtResp) MarshalUnsafe(dst []byte) []byte { if m.ChildDir.Packed() { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(m), uintptr(size)) return dst[size:] } // Type MkdirAtResp doesn't have a packed layout in memory, fallback to MarshalBytes. return m.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (m *MkdirAtResp) UnmarshalUnsafe(src []byte) []byte { if m.ChildDir.Packed() { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(m), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type MkdirAtResp doesn't have a packed layout in memory, fallback to UnmarshalBytes. return m.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (m *MkdirAtResp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !m.ChildDir.Packed() { // Type MkdirAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(m.SizeBytes()) // escapes: okay. m.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (m *MkdirAtResp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyOutN(cc, addr, m.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (m *MkdirAtResp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !m.ChildDir.Packed() { // Type MkdirAtResp doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(m.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. m.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (m *MkdirAtResp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyInN(cc, addr, m.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (m *MkdirAtResp) WriteTo(writer io.Writer) (int64, error) { if !m.ChildDir.Packed() { // Type MkdirAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, m.SizeBytes()) m.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (m *MkdirAtResp) CheckedMarshal(dst []byte) ([]byte, bool) { if m.SizeBytes() > len(dst) { return dst, false } return m.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (m *MkdirAtResp) CheckedUnmarshal(src []byte) ([]byte, bool) { if m.SizeBytes() > len(src) { return src, false } return m.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (m *MknodAtResp) SizeBytes() int { return 0 + (*Inode)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (m *MknodAtResp) MarshalBytes(dst []byte) []byte { dst = m.Child.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (m *MknodAtResp) UnmarshalBytes(src []byte) []byte { src = m.Child.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (m *MknodAtResp) Packed() bool { return m.Child.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (m *MknodAtResp) MarshalUnsafe(dst []byte) []byte { if m.Child.Packed() { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(m), uintptr(size)) return dst[size:] } // Type MknodAtResp doesn't have a packed layout in memory, fallback to MarshalBytes. 
return m.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (m *MknodAtResp) UnmarshalUnsafe(src []byte) []byte { if m.Child.Packed() { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(m), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type MknodAtResp doesn't have a packed layout in memory, fallback to UnmarshalBytes. return m.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (m *MknodAtResp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !m.Child.Packed() { // Type MknodAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(m.SizeBytes()) // escapes: okay. m.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (m *MknodAtResp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyOutN(cc, addr, m.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (m *MknodAtResp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !m.Child.Packed() { // Type MknodAtResp doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(m.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. m.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (m *MknodAtResp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyInN(cc, addr, m.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (m *MknodAtResp) WriteTo(writer io.Writer) (int64, error) { if !m.Child.Packed() { // Type MknodAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, m.SizeBytes()) m.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. 
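// As noted above, runtime.KeepAlive is lowered to a compiler intrinsic, so
// keeping m live through the Write adds no real run-time work.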
return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (m *MknodAtResp) CheckedMarshal(dst []byte) ([]byte, bool) { if m.SizeBytes() > len(dst) { return dst, false } return m.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (m *MknodAtResp) CheckedUnmarshal(src []byte) ([]byte, bool) { if m.SizeBytes() > len(src) { return src, false } return m.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (o *OpenAtReq) SizeBytes() int { return 8 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (o *OpenAtReq) MarshalBytes(dst []byte) []byte { dst = o.FD.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(o.Flags)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (o *OpenAtReq) UnmarshalBytes(src []byte) []byte { src = o.FD.UnmarshalUnsafe(src) o.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (o *OpenAtReq) Packed() bool { return o.FD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (o *OpenAtReq) MarshalUnsafe(dst []byte) []byte { if o.FD.Packed() { size := o.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(o), uintptr(size)) return dst[size:] } // Type OpenAtReq doesn't have a packed layout in memory, fallback to MarshalBytes. return o.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (o *OpenAtReq) UnmarshalUnsafe(src []byte) []byte { if o.FD.Packed() { size := o.SizeBytes() gohacks.Memmove(unsafe.Pointer(o), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type OpenAtReq doesn't have a packed layout in memory, fallback to UnmarshalBytes. return o.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (o *OpenAtReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !o.FD.Packed() { // Type OpenAtReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(o.SizeBytes()) // escapes: okay. o.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(o))) hdr.Len = o.SizeBytes() hdr.Cap = o.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that o // must live until the use above. runtime.KeepAlive(o) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (o *OpenAtReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return o.CopyOutN(cc, addr, o.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (o *OpenAtReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !o.FD.Packed() { // Type OpenAtReq doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(o.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
// Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. o.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(o))) hdr.Len = o.SizeBytes() hdr.Cap = o.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that o // must live until the use above. runtime.KeepAlive(o) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (o *OpenAtReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return o.CopyInN(cc, addr, o.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (o *OpenAtReq) WriteTo(writer io.Writer) (int64, error) { if !o.FD.Packed() { // Type OpenAtReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, o.SizeBytes()) o.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(o))) hdr.Len = o.SizeBytes() hdr.Cap = o.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that o // must live until the use above. runtime.KeepAlive(o) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (o *OpenAtReq) CheckedMarshal(dst []byte) ([]byte, bool) { if o.SizeBytes() > len(dst) { return dst, false } return o.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (o *OpenAtReq) CheckedUnmarshal(src []byte) ([]byte, bool) { if o.SizeBytes() > len(src) { return src, false } return o.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (o *OpenAtResp) SizeBytes() int { return 0 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (o *OpenAtResp) MarshalBytes(dst []byte) []byte { dst = o.OpenFD.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (o *OpenAtResp) UnmarshalBytes(src []byte) []byte { src = o.OpenFD.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (o *OpenAtResp) Packed() bool { return o.OpenFD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (o *OpenAtResp) MarshalUnsafe(dst []byte) []byte { if o.OpenFD.Packed() { size := o.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(o), uintptr(size)) return dst[size:] } // Type OpenAtResp doesn't have a packed layout in memory, fallback to MarshalBytes. return o.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (o *OpenAtResp) UnmarshalUnsafe(src []byte) []byte { if o.OpenFD.Packed() { size := o.SizeBytes() gohacks.Memmove(unsafe.Pointer(o), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type OpenAtResp doesn't have a packed layout in memory, fallback to UnmarshalBytes. return o.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. 
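// CopyOutN writes at most limit bytes of the marshalled representation (via
// buf[:limit]); CopyOut passes SizeBytes() so the whole struct is copied out.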
func (o *OpenAtResp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !o.OpenFD.Packed() { // Type OpenAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(o.SizeBytes()) // escapes: okay. o.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(o))) hdr.Len = o.SizeBytes() hdr.Cap = o.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that o // must live until the use above. runtime.KeepAlive(o) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (o *OpenAtResp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return o.CopyOutN(cc, addr, o.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (o *OpenAtResp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !o.OpenFD.Packed() { // Type OpenAtResp doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(o.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. o.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(o))) hdr.Len = o.SizeBytes() hdr.Cap = o.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that o // must live until the use above. runtime.KeepAlive(o) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (o *OpenAtResp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return o.CopyInN(cc, addr, o.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (o *OpenAtResp) WriteTo(writer io.Writer) (int64, error) { if !o.OpenFD.Packed() { // Type OpenAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, o.SizeBytes()) o.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(o))) hdr.Len = o.SizeBytes() hdr.Cap = o.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that o // must live until the use above. runtime.KeepAlive(o) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (o *OpenAtResp) CheckedMarshal(dst []byte) ([]byte, bool) { if o.SizeBytes() > len(dst) { return dst, false } return o.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (o *OpenAtResp) CheckedUnmarshal(src []byte) ([]byte, bool) { if o.SizeBytes() > len(src) { return src, false } return o.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. 
func (o *OpenCreateAtResp) SizeBytes() int { return 0 + (*Inode)(nil).SizeBytes() + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (o *OpenCreateAtResp) MarshalBytes(dst []byte) []byte { dst = o.Child.MarshalUnsafe(dst) dst = o.NewFD.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (o *OpenCreateAtResp) UnmarshalBytes(src []byte) []byte { src = o.Child.UnmarshalUnsafe(src) src = o.NewFD.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (o *OpenCreateAtResp) Packed() bool { return o.Child.Packed() && o.NewFD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (o *OpenCreateAtResp) MarshalUnsafe(dst []byte) []byte { if o.Child.Packed() && o.NewFD.Packed() { size := o.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(o), uintptr(size)) return dst[size:] } // Type OpenCreateAtResp doesn't have a packed layout in memory, fallback to MarshalBytes. return o.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (o *OpenCreateAtResp) UnmarshalUnsafe(src []byte) []byte { if o.Child.Packed() && o.NewFD.Packed() { size := o.SizeBytes() gohacks.Memmove(unsafe.Pointer(o), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type OpenCreateAtResp doesn't have a packed layout in memory, fallback to UnmarshalBytes. return o.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (o *OpenCreateAtResp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !o.Child.Packed() && o.NewFD.Packed() { // Type OpenCreateAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(o.SizeBytes()) // escapes: okay. o.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(o))) hdr.Len = o.SizeBytes() hdr.Cap = o.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that o // must live until the use above. runtime.KeepAlive(o) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (o *OpenCreateAtResp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return o.CopyOutN(cc, addr, o.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (o *OpenCreateAtResp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !o.Child.Packed() && o.NewFD.Packed() { // Type OpenCreateAtResp doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(o.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. o.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(o))) hdr.Len = o.SizeBytes() hdr.Cap = o.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
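// The slice constructed above aliases o's own memory, so CopyInBytes writes
// directly into the struct with no intermediate buffer. gohacks.Noescape hides
// the pointer from escape analysis so that taking its address here does not
// force o onto the heap. On Go 1.17+ a roughly equivalent construction (a
// sketch only, not what the generator emits) would be:
//
//	buf := unsafe.Slice((*byte)(unsafe.Pointer(o)), o.SizeBytes())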
// Since we bypassed the compiler's escape analysis, indicate that o // must live until the use above. runtime.KeepAlive(o) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (o *OpenCreateAtResp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return o.CopyInN(cc, addr, o.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (o *OpenCreateAtResp) WriteTo(writer io.Writer) (int64, error) { if !o.Child.Packed() && o.NewFD.Packed() { // Type OpenCreateAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, o.SizeBytes()) o.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(o))) hdr.Len = o.SizeBytes() hdr.Cap = o.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that o // must live until the use above. runtime.KeepAlive(o) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (o *OpenCreateAtResp) CheckedMarshal(dst []byte) ([]byte, bool) { if o.SizeBytes() > len(dst) { return dst, false } return o.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (o *OpenCreateAtResp) CheckedUnmarshal(src []byte) ([]byte, bool) { if o.SizeBytes() > len(src) { return src, false } return o.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *PReadReq) SizeBytes() int { return 16 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *PReadReq) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.Offset)) dst = dst[8:] dst = r.FD.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(r.Count)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *PReadReq) UnmarshalBytes(src []byte) []byte { r.Offset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = r.FD.UnmarshalUnsafe(src) r.Count = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *PReadReq) Packed() bool { return r.FD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *PReadReq) MarshalUnsafe(dst []byte) []byte { if r.FD.Packed() { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(r), uintptr(size)) return dst[size:] } // Type PReadReq doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *PReadReq) UnmarshalUnsafe(src []byte) []byte { if r.FD.Packed() { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(r), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type PReadReq doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (r *PReadReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !r.FD.Packed() { // Type PReadReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (r *PReadReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (r *PReadReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !r.FD.Packed() { // Type PReadReq doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *PReadReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *PReadReq) WriteTo(writer io.Writer) (int64, error) { if !r.FD.Packed() { // Type PReadReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (r *PReadReq) CheckedMarshal(dst []byte) ([]byte, bool) { if r.SizeBytes() > len(dst) { return dst, false } return r.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (r *PReadReq) CheckedUnmarshal(src []byte) ([]byte, bool) { if r.SizeBytes() > len(src) { return src, false } return r.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. 
func (w *PWriteResp) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (w *PWriteResp) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(w.Count)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (w *PWriteResp) UnmarshalBytes(src []byte) []byte { w.Count = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (w *PWriteResp) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (w *PWriteResp) MarshalUnsafe(dst []byte) []byte { size := w.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(w), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (w *PWriteResp) UnmarshalUnsafe(src []byte) []byte { size := w.SizeBytes() gohacks.Memmove(unsafe.Pointer(w), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (w *PWriteResp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(w))) hdr.Len = w.SizeBytes() hdr.Cap = w.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that w // must live until the use above. runtime.KeepAlive(w) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (w *PWriteResp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return w.CopyOutN(cc, addr, w.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (w *PWriteResp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(w))) hdr.Len = w.SizeBytes() hdr.Cap = w.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that w // must live until the use above. runtime.KeepAlive(w) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (w *PWriteResp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return w.CopyInN(cc, addr, w.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (w *PWriteResp) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(w))) hdr.Len = w.SizeBytes() hdr.Cap = w.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that w // must live until the use above. runtime.KeepAlive(w) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (w *PWriteResp) CheckedMarshal(dst []byte) ([]byte, bool) { if w.SizeBytes() > len(dst) { return dst, false } return w.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. 
func (w *PWriteResp) CheckedUnmarshal(src []byte) ([]byte, bool) { if w.SizeBytes() > len(src) { return src, false } return w.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *ReadLinkAtReq) SizeBytes() int { return 0 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *ReadLinkAtReq) MarshalBytes(dst []byte) []byte { dst = r.FD.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *ReadLinkAtReq) UnmarshalBytes(src []byte) []byte { src = r.FD.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *ReadLinkAtReq) Packed() bool { return r.FD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *ReadLinkAtReq) MarshalUnsafe(dst []byte) []byte { if r.FD.Packed() { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(r), uintptr(size)) return dst[size:] } // Type ReadLinkAtReq doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *ReadLinkAtReq) UnmarshalUnsafe(src []byte) []byte { if r.FD.Packed() { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(r), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type ReadLinkAtReq doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (r *ReadLinkAtReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !r.FD.Packed() { // Type ReadLinkAtReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (r *ReadLinkAtReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (r *ReadLinkAtReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !r.FD.Packed() { // Type ReadLinkAtReq doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
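// Note that buf[:limit] allows a deliberate partial copy when limit is smaller
// than r.SizeBytes(); the plain CopyIn wrapper below simply calls
// CopyInN(cc, addr, r.SizeBytes()), so the common case copies the full struct:
//
//	_, err := r.CopyIn(cc, addr) // equivalent to r.CopyInN(cc, addr, r.SizeBytes())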
// Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *ReadLinkAtReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *ReadLinkAtReq) WriteTo(writer io.Writer) (int64, error) { if !r.FD.Packed() { // Type ReadLinkAtReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (r *ReadLinkAtReq) CheckedMarshal(dst []byte) ([]byte, bool) { if r.SizeBytes() > len(dst) { return dst, false } return r.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (r *ReadLinkAtReq) CheckedUnmarshal(src []byte) ([]byte, bool) { if r.SizeBytes() > len(src) { return src, false } return r.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SetStatReq) SizeBytes() int { return 16 + (*FDID)(nil).SizeBytes() + (*UID)(nil).SizeBytes() + (*GID)(nil).SizeBytes() + (*linux.Timespec)(nil).SizeBytes() + (*linux.Timespec)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SetStatReq) MarshalBytes(dst []byte) []byte { dst = s.FD.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Mask)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.Mode)) dst = dst[4:] dst = s.UID.MarshalUnsafe(dst) dst = s.GID.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Size)) dst = dst[8:] dst = s.Atime.MarshalUnsafe(dst) dst = s.Mtime.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SetStatReq) UnmarshalBytes(src []byte) []byte { src = s.FD.UnmarshalUnsafe(src) s.Mask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.Mode = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = s.UID.UnmarshalUnsafe(src) src = s.GID.UnmarshalUnsafe(src) s.Size = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = s.Atime.UnmarshalUnsafe(src) src = s.Mtime.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SetStatReq) Packed() bool { return s.Atime.Packed() && s.FD.Packed() && s.GID.Packed() && s.Mtime.Packed() && s.UID.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SetStatReq) MarshalUnsafe(dst []byte) []byte { if s.Atime.Packed() && s.FD.Packed() && s.GID.Packed() && s.Mtime.Packed() && s.UID.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type SetStatReq doesn't have a packed layout in memory, fallback to MarshalBytes. 
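// SetStatReq.Packed() is the conjunction of Packed() over its marshallable
// fields (FD, UID, GID, Atime, Mtime), so this fallback is only reachable if
// one of those nested types reports an unpacked layout. For reference, the 16
// fixed bytes in SizeBytes() above cover the plain scalar fields: Mask (4),
// Mode (4) and Size (8).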
return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SetStatReq) UnmarshalUnsafe(src []byte) []byte { if s.Atime.Packed() && s.FD.Packed() && s.GID.Packed() && s.Mtime.Packed() && s.UID.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type SetStatReq doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SetStatReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Atime.Packed() && s.FD.Packed() && s.GID.Packed() && s.Mtime.Packed() && s.UID.Packed() { // Type SetStatReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SetStatReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SetStatReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Atime.Packed() && s.FD.Packed() && s.GID.Packed() && s.Mtime.Packed() && s.UID.Packed() { // Type SetStatReq doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SetStatReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SetStatReq) WriteTo(writer io.Writer) (int64, error) { if !s.Atime.Packed() && s.FD.Packed() && s.GID.Packed() && s.Mtime.Packed() && s.UID.Packed() { // Type SetStatReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
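// WriteTo streams the marshalled SetStatReq into any io.Writer without a
// caller-managed buffer. A minimal usage sketch (conn is any io.Writer, for
// example a net.Conn; illustrative only, not part of this file):
//
//	if _, err := s.WriteTo(conn); err != nil {
//		// handle the failed or partial write
//	}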
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (s *SetStatReq) CheckedMarshal(dst []byte) ([]byte, bool) { if s.SizeBytes() > len(dst) { return dst, false } return s.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (s *SetStatReq) CheckedUnmarshal(src []byte) ([]byte, bool) { if s.SizeBytes() > len(src) { return src, false } return s.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SetStatResp) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SetStatResp) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.FailureMask)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.FailureErrNo)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SetStatResp) UnmarshalBytes(src []byte) []byte { s.FailureMask = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] s.FailureErrNo = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SetStatResp) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SetStatResp) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SetStatResp) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SetStatResp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SetStatResp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SetStatResp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. 
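// Without the KeepAlive above, the only reference to s while CopyInBytes runs
// is the uintptr stored in the slice header, which the garbage collector does
// not treat as a pointer; KeepAlive keeps s reachable until the copy finishes.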
return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SetStatResp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SetStatResp) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (s *SetStatResp) CheckedMarshal(dst []byte) ([]byte, bool) { if s.SizeBytes() > len(dst) { return dst, false } return s.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (s *SetStatResp) CheckedUnmarshal(src []byte) ([]byte, bool) { if s.SizeBytes() > len(src) { return src, false } return s.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *StatFS) SizeBytes() int { return 64 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *StatFS) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Type)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.BlockSize)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Blocks)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.BlocksFree)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.BlocksAvailable)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Files)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.FilesFree)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.NameLength)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *StatFS) UnmarshalBytes(src []byte) []byte { s.Type = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.BlockSize = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Blocks = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.BlocksFree = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.BlocksAvailable = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Files = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.FilesFree = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.NameLength = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *StatFS) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *StatFS) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *StatFS) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *StatFS) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
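// StatFS consists solely of fixed-width integer fields (eight 8-byte words, 64
// bytes total), so Packed() is unconditionally true and the generator omits
// the MarshalBytes fallback branch from this method entirely.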
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *StatFS) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *StatFS) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *StatFS) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *StatFS) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (s *StatFS) CheckedMarshal(dst []byte) ([]byte, bool) { if s.SizeBytes() > len(dst) { return dst, false } return s.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (s *StatFS) CheckedUnmarshal(src []byte) ([]byte, bool) { if s.SizeBytes() > len(src) { return src, false } return s.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *StatReq) SizeBytes() int { return 0 + (*FDID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *StatReq) MarshalBytes(dst []byte) []byte { dst = s.FD.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *StatReq) UnmarshalBytes(src []byte) []byte { src = s.FD.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *StatReq) Packed() bool { return s.FD.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *StatReq) MarshalUnsafe(dst []byte) []byte { if s.FD.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type StatReq doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (s *StatReq) UnmarshalUnsafe(src []byte) []byte { if s.FD.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type StatReq doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *StatReq) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.FD.Packed() { // Type StatReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *StatReq) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *StatReq) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.FD.Packed() { // Type StatReq doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *StatReq) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *StatReq) WriteTo(writer io.Writer) (int64, error) { if !s.FD.Packed() { // Type StatReq doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. 
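// The Checked variants below are the bounds-checked entry points for callers
// working with buffers that may be too small, such as a fixed-size wire
// buffer. A minimal caller sketch (buf is hypothetical):
//
//	var req StatReq
//	if rest, ok := req.CheckedMarshal(buf); !ok {
//		// buf was smaller than req.SizeBytes(); nothing was written
//	} else {
//		_ = rest // unconsumed tail of buf
//	}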
func (s *StatReq) CheckedMarshal(dst []byte) ([]byte, bool) { if s.SizeBytes() > len(dst) { return dst, false } return s.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (s *StatReq) CheckedUnmarshal(src []byte) ([]byte, bool) { if s.SizeBytes() > len(src) { return src, false } return s.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SymlinkAtResp) SizeBytes() int { return 0 + (*Inode)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SymlinkAtResp) MarshalBytes(dst []byte) []byte { dst = s.Symlink.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SymlinkAtResp) UnmarshalBytes(src []byte) []byte { src = s.Symlink.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SymlinkAtResp) Packed() bool { return s.Symlink.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SymlinkAtResp) MarshalUnsafe(dst []byte) []byte { if s.Symlink.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type SymlinkAtResp doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SymlinkAtResp) UnmarshalUnsafe(src []byte) []byte { if s.Symlink.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type SymlinkAtResp doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SymlinkAtResp) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Symlink.Packed() { // Type SymlinkAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SymlinkAtResp) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SymlinkAtResp) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Symlink.Packed() { // Type SymlinkAtResp doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SymlinkAtResp) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SymlinkAtResp) WriteTo(writer io.Writer) (int64, error) { if !s.Symlink.Packed() { // Type SymlinkAtResp doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (s *SymlinkAtResp) CheckedMarshal(dst []byte) ([]byte, bool) { if s.SizeBytes() > len(dst) { return dst, false } return s.MarshalUnsafe(dst), true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (s *SymlinkAtResp) CheckedUnmarshal(src []byte) ([]byte, bool) { if s.SizeBytes() > len(src) { return src, false } return s.UnmarshalUnsafe(src), true } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (uid *UID) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (uid *UID) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*uid)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (uid *UID) UnmarshalBytes(src []byte) []byte { *uid = UID(uint32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (uid *UID) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (uid *UID) MarshalUnsafe(dst []byte) []byte { size := uid.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(uid), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (uid *UID) UnmarshalUnsafe(src []byte) []byte { size := uid.SizeBytes() gohacks.Memmove(unsafe.Pointer(uid), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (uid *UID) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(uid))) hdr.Len = uid.SizeBytes() hdr.Cap = uid.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that uid // must live until the use above. 
runtime.KeepAlive(uid) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (uid *UID) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return uid.CopyOutN(cc, addr, uid.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (uid *UID) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(uid))) hdr.Len = uid.SizeBytes() hdr.Cap = uid.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that uid // must live until the use above. runtime.KeepAlive(uid) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (uid *UID) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return uid.CopyInN(cc, addr, uid.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (uid *UID) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(uid))) hdr.Len = uid.SizeBytes() hdr.Cap = uid.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that uid // must live until the use above. runtime.KeepAlive(uid) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (c *createCommon) SizeBytes() int { return 6 + (*FDID)(nil).SizeBytes() + (*UID)(nil).SizeBytes() + (*GID)(nil).SizeBytes() + (*linux.FileMode)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *createCommon) MarshalBytes(dst []byte) []byte { dst = c.DirFD.MarshalUnsafe(dst) dst = c.UID.MarshalUnsafe(dst) dst = c.GID.MarshalUnsafe(dst) dst = c.Mode.MarshalUnsafe(dst) // Padding: dst[:sizeof(uint16)] ~= uint16(0) dst = dst[2:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (c *createCommon) UnmarshalBytes(src []byte) []byte { src = c.DirFD.UnmarshalUnsafe(src) src = c.UID.UnmarshalUnsafe(src) src = c.GID.UnmarshalUnsafe(src) src = c.Mode.UnmarshalUnsafe(src) // Padding: var _ uint16 ~= src[:sizeof(uint16)] src = src[2:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (c *createCommon) Packed() bool { return c.DirFD.Packed() && c.GID.Packed() && c.Mode.Packed() && c.UID.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (c *createCommon) MarshalUnsafe(dst []byte) []byte { if c.DirFD.Packed() && c.GID.Packed() && c.Mode.Packed() && c.UID.Packed() { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(c), uintptr(size)) return dst[size:] } // Type createCommon doesn't have a packed layout in memory, fallback to MarshalBytes. return c.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (c *createCommon) UnmarshalUnsafe(src []byte) []byte { if c.DirFD.Packed() && c.GID.Packed() && c.Mode.Packed() && c.UID.Packed() { size := c.SizeBytes() gohacks.Memmove(unsafe.Pointer(c), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type createCommon doesn't have a packed layout in memory, fallback to UnmarshalBytes. return c.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (c *createCommon) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !c.DirFD.Packed() && c.GID.Packed() && c.Mode.Packed() && c.UID.Packed() { // Type createCommon doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(c.SizeBytes()) // escapes: okay. c.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (c *createCommon) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyOutN(cc, addr, c.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (c *createCommon) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !c.DirFD.Packed() && c.GID.Packed() && c.Mode.Packed() && c.UID.Packed() { // Type createCommon doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(c.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. c.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (c *createCommon) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return c.CopyInN(cc, addr, c.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (c *createCommon) WriteTo(writer io.Writer) (int64, error) { if !c.DirFD.Packed() && c.GID.Packed() && c.Mode.Packed() && c.UID.Packed() { // Type createCommon doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, c.SizeBytes()) c.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. 
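// Because createCommon.MarshalBytes writes its explicit padding runs (2 bytes
// after Mode, then 4 more), the byte image it produces lines up with the
// padded in-memory layout that this packed path hands to writer.Write
// verbatim; assuming the padding bytes in memory are zero, as they are in a
// freshly allocated struct, both paths emit the same bytes.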
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(c))) hdr.Len = c.SizeBytes() hdr.Cap = c.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that c // must live until the use above. runtime.KeepAlive(c) // escapes: replaced by intrinsic. return int64(length), err } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (m *MsgDynamic) Packed() bool { // Type MsgDynamic is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (m *MsgDynamic) MarshalUnsafe(dst []byte) []byte { // Type MsgDynamic doesn't have a packed layout in memory, fallback to MarshalBytes. return m.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (m *MsgDynamic) UnmarshalUnsafe(src []byte) []byte { // Type MsgDynamic doesn't have a packed layout in memory, fallback to UnmarshalBytes. return m.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (m *MsgDynamic) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type MsgDynamic doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(m.SizeBytes()) // escapes: okay. m.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (m *MsgDynamic) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyOutN(cc, addr, m.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (m *MsgDynamic) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type MsgDynamic doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(m.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. m.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (m *MsgDynamic) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyInN(cc, addr, m.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (m *MsgDynamic) WriteTo(writer io.Writer) (int64, error) { // Type MsgDynamic doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, m.SizeBytes()) m.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (m *MsgSimple) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (m *MsgSimple) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(m.A)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(m.B)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.C)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.D)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (m *MsgSimple) UnmarshalBytes(src []byte) []byte { m.A = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] m.B = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] m.C = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] m.D = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (m *MsgSimple) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (m *MsgSimple) MarshalUnsafe(dst []byte) []byte { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(m), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (m *MsgSimple) UnmarshalUnsafe(src []byte) []byte { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(m), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (m *MsgSimple) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (m *MsgSimple) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyOutN(cc, addr, m.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (m *MsgSimple) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (m *MsgSimple) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyInN(cc, addr, m.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (m *MsgSimple) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return int64(length), err } // CopyMsg1SliceIn copies in a slice of MsgSimple objects from the task's memory. func CopyMsg1SliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []MsgSimple) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*MsgSimple)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyMsg1SliceOut copies a slice of MsgSimple objects to the task's memory. func CopyMsg1SliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []MsgSimple) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*MsgSimple)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeMsg1Slice is like MsgSimple.MarshalUnsafe, but for a []MsgSimple. func MarshalUnsafeMsg1Slice(src []MsgSimple, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*MsgSimple)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeMsg1Slice is like MsgSimple.UnmarshalUnsafe, but for a []MsgSimple. func UnmarshalUnsafeMsg1Slice(dst []MsgSimple, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*MsgSimple)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *sockHeader) SizeBytes() int { return 6 + (*MID)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *sockHeader) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.payloadLen)) dst = dst[4:] dst = s.message.MarshalUnsafe(dst) // Padding: dst[:sizeof(uint16)] ~= uint16(0) dst = dst[2:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *sockHeader) UnmarshalBytes(src []byte) []byte { s.payloadLen = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] src = s.message.UnmarshalUnsafe(src) // Padding: var _ uint16 ~= src[:sizeof(uint16)] src = src[2:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *sockHeader) Packed() bool { return s.message.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *sockHeader) MarshalUnsafe(dst []byte) []byte { if s.message.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type sockHeader doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *sockHeader) UnmarshalUnsafe(src []byte) []byte { if s.message.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type sockHeader doesn't have a packed layout in memory, fallback to UnmarshalBytes. 
return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *sockHeader) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.message.Packed() { // Type sockHeader doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *sockHeader) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *sockHeader) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.message.Packed() { // Type sockHeader doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *sockHeader) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *sockHeader) WriteTo(writer io.Writer) (int64, error) { if !s.message.Packed() { // Type sockHeader doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/lisafs_state_autogen.go000066400000000000000000000121001465435605700254310ustar00rootroot00000000000000// automatically generated by stateify. 
package lisafs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *boundSocketFDRefs) StateTypeName() string { return "pkg/lisafs.boundSocketFDRefs" } func (r *boundSocketFDRefs) StateFields() []string { return []string{ "refCount", } } func (r *boundSocketFDRefs) beforeSave() {} // +checklocksignore func (r *boundSocketFDRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *boundSocketFDRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (l *controlFDList) StateTypeName() string { return "pkg/lisafs.controlFDList" } func (l *controlFDList) StateFields() []string { return []string{ "head", "tail", } } func (l *controlFDList) beforeSave() {} // +checklocksignore func (l *controlFDList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *controlFDList) afterLoad(context.Context) {} // +checklocksignore func (l *controlFDList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *controlFDEntry) StateTypeName() string { return "pkg/lisafs.controlFDEntry" } func (e *controlFDEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *controlFDEntry) beforeSave() {} // +checklocksignore func (e *controlFDEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *controlFDEntry) afterLoad(context.Context) {} // +checklocksignore func (e *controlFDEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (r *controlFDRefs) StateTypeName() string { return "pkg/lisafs.controlFDRefs" } func (r *controlFDRefs) StateFields() []string { return []string{ "refCount", } } func (r *controlFDRefs) beforeSave() {} // +checklocksignore func (r *controlFDRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *controlFDRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (r *nodeRefs) StateTypeName() string { return "pkg/lisafs.nodeRefs" } func (r *nodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *nodeRefs) beforeSave() {} // +checklocksignore func (r *nodeRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *nodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (l *openFDList) StateTypeName() string { return "pkg/lisafs.openFDList" } func (l *openFDList) StateFields() []string { return []string{ "head", "tail", } } func (l *openFDList) beforeSave() {} // +checklocksignore func (l *openFDList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *openFDList) afterLoad(context.Context) {} // +checklocksignore func (l *openFDList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } 
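// Every stateify-generated type in this file follows the same shape: StateTypeName
// and StateFields describe the type, StateSave writes each field at a fixed index,
// and StateLoad reads the same indices back, optionally scheduling afterLoad work.
// The hand-written equivalent for a hypothetical two-field type (illustration only,
// not generated, never registered or used) would look roughly like this:
type examplePair struct {
	head uint64
	tail uint64
}

func (p *examplePair) StateTypeName() string { return "pkg/lisafs.examplePair" }

func (p *examplePair) StateFields() []string { return []string{"head", "tail"} }

func (p *examplePair) StateSave(stateSinkObject state.Sink) {
	stateSinkObject.Save(0, &p.head)
	stateSinkObject.Save(1, &p.tail)
}

func (p *examplePair) StateLoad(ctx context.Context, stateSourceObject state.Source) {
	stateSourceObject.Load(0, &p.head)
	stateSourceObject.Load(1, &p.tail)
}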
func (e *openFDEntry) StateTypeName() string { return "pkg/lisafs.openFDEntry" } func (e *openFDEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *openFDEntry) beforeSave() {} // +checklocksignore func (e *openFDEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *openFDEntry) afterLoad(context.Context) {} // +checklocksignore func (e *openFDEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (r *openFDRefs) StateTypeName() string { return "pkg/lisafs.openFDRefs" } func (r *openFDRefs) StateFields() []string { return []string{ "refCount", } } func (r *openFDRefs) beforeSave() {} // +checklocksignore func (r *openFDRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *openFDRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func init() { state.Register((*boundSocketFDRefs)(nil)) state.Register((*controlFDList)(nil)) state.Register((*controlFDEntry)(nil)) state.Register((*controlFDRefs)(nil)) state.Register((*nodeRefs)(nil)) state.Register((*openFDList)(nil)) state.Register((*openFDEntry)(nil)) state.Register((*openFDRefs)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/message.go000066400000000000000000001515631465435605700226730ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lisafs import ( "fmt" "math" "os" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" ) // Messages have two parts: // * A transport header used to decipher received messages. // * A byte array referred to as "payload" which contains the actual message. // "dataLen" refers to the size of both combined. // // All messages must implement the following functions: // * marshal.Marshallable.SizeBytes // * marshal.Marshallable.Marshal{Unsafe/Bytes} // * marshal.CheckedMarshallable.CheckedUnmarshal // * fmt.Stringer.String // // There is no explicit interface definition for this because that definition // will not be used anywhere. If a concrete type is passed into a function // which receives it as an interface, the struct is moved to the heap. This // erodes memory performance. Message structs are be short lived - they are // initialized, marshalled into a buffer and not used after that. So heap // allocating these message structs is wasteful. Don't define Message interface // so it's not used. Instead use function arguments. See Client.SndRcvMessage. // // Unmarshalling code should use the Checked variant of the Unmarshal functions // because a malicious encoder could have manipulated payload bytes to make the // unchecked unmarshal variants panic due to the lack of bound checking. 
// Marshalling code does not need additional bound checking because the caller // itself initializes the struct being marshalled, so it is trusted. // // String() implementations must ensure that the message struct doesn't escape. // For instance, directly passing the struct to fmt.Sprintf() escapes it // because of the implicit conversion to any. type marshalFunc func([]byte) []byte type unmarshalFunc func([]byte) ([]byte, bool) type debugStringer func() string // MID (message ID) is used to identify messages to parse from payload. // // +marshal slice:MIDSlice type MID uint16 // These constants are used to identify their corresponding message types. const ( // Error is only used in responses to pass errors to client. Error MID = 0 // Mount is used to establish connection between the client and server mount // point. lisafs requires that the client makes a successful Mount RPC before // making other RPCs. Mount MID = 1 // Channel requests to start a new communicational channel. Channel MID = 2 // FStat requests the stat(2) results for a specified file. FStat MID = 3 // SetStat requests to change file attributes. Note that there is no one // corresponding Linux syscall. This is a conglomeration of fchmod(2), // fchown(2), ftruncate(2) and futimesat(2). SetStat MID = 4 // Walk requests to walk the specified path starting from the specified // directory. Server-side path traversal is terminated preemptively on // symlinks entries because they can cause non-linear traversal. Walk MID = 5 // WalkStat is the same as Walk, except the following differences: // * If the first path component is "", then it also returns stat results // for the directory where the walk starts. // * Does not return Inode, just the Stat results for each path component. WalkStat MID = 6 // OpenAt is analogous to openat(2). It does not perform any walk. It merely // duplicates the control FD with the open flags passed. OpenAt MID = 7 // OpenCreateAt is analogous to openat(2) with O_CREAT|O_EXCL added to flags. // It also returns the newly created file inode. OpenCreateAt MID = 8 // Close is analogous to close(2) but can work on multiple FDs. Close MID = 9 // FSync is analogous to fsync(2) but can work on multiple FDs. FSync MID = 10 // PWrite is analogous to pwrite(2). PWrite MID = 11 // PRead is analogous to pread(2). PRead MID = 12 // MkdirAt is analogous to mkdirat(2). MkdirAt MID = 13 // MknodAt is analogous to mknodat(2). MknodAt MID = 14 // SymlinkAt is analogous to symlinkat(2). SymlinkAt MID = 15 // LinkAt is analogous to linkat(2). LinkAt MID = 16 // FStatFS is analogous to fstatfs(2). FStatFS MID = 17 // FAllocate is analogous to fallocate(2). FAllocate MID = 18 // ReadLinkAt is analogous to readlinkat(2). ReadLinkAt MID = 19 // Flush cleans up the file state. Its behavior is implementation // dependent and might not even be supported in server implementations. Flush MID = 20 // Connect is loosely analogous to connect(2). Connect MID = 21 // UnlinkAt is analogous to unlinkat(2). UnlinkAt MID = 22 // RenameAt is loosely analogous to renameat(2). RenameAt MID = 23 // Getdents64 is analogous to getdents64(2). Getdents64 MID = 24 // FGetXattr is analogous to fgetxattr(2). FGetXattr MID = 25 // FSetXattr is analogous to fsetxattr(2). FSetXattr MID = 26 // FListXattr is analogous to flistxattr(2). FListXattr MID = 27 // FRemoveXattr is analogous to fremovexattr(2). FRemoveXattr MID = 28 // BindAt is analogous to bind(2). BindAt MID = 29 // Listen is analogous to listen(2). 
Listen MID = 30 // Accept is analogous to accept4(2). Accept MID = 31 ) const ( // NoUID is a sentinel used to indicate no valid UID. NoUID UID = math.MaxUint32 // NoGID is a sentinel used to indicate no valid GID. NoGID GID = math.MaxUint32 ) // MaxMessageSize is the recommended max message size that can be used by // connections. Server implementations may choose to use other values. func MaxMessageSize() uint32 { // Return HugePageSize - PageSize so that when flipcall packet window is // created with MaxMessageSize() + flipcall header size + channel header // size, HugePageSize is allocated and can be backed by a single huge page // if supported by the underlying memfd. return uint32(hostarch.HugePageSize - os.Getpagesize()) } // UID represents a user ID. // // +marshal type UID uint32 // Ok returns true if uid is not NoUID. func (uid UID) Ok() bool { return uid != NoUID } // GID represents a group ID. // // +marshal type GID uint32 // Ok returns true if gid is not NoGID. func (gid GID) Ok() bool { return gid != NoGID } // EmptyMessage is an empty message. type EmptyMessage struct{} // String implements fmt.Stringer.String. func (*EmptyMessage) String() string { return "EmptyMessage{}" } // SizeBytes implements marshal.Marshallable.SizeBytes. func (*EmptyMessage) SizeBytes() int { return 0 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (*EmptyMessage) MarshalBytes(dst []byte) []byte { return dst } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (*EmptyMessage) CheckedUnmarshal(src []byte) ([]byte, bool) { return src, true } // SizedString represents a string in memory. The marshalled string bytes are // preceded by a uint16 signifying the string length. type SizedString string // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SizedString) SizeBytes() int { return (*primitive.Uint16)(nil).SizeBytes() + len(*s) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SizedString) MarshalBytes(dst []byte) []byte { strLen := primitive.Uint16(len(*s)) dst = strLen.MarshalUnsafe(dst) // Copy without any allocation. return dst[copy(dst[:strLen], *s):] } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (s *SizedString) CheckedUnmarshal(src []byte) ([]byte, bool) { var strLen primitive.Uint16 srcRemain, ok := strLen.CheckedUnmarshal(src) if !ok || len(srcRemain) < int(strLen) { return src, false } // Take the hit, this leads to an allocation + memcpy. No way around it. *s = SizedString(srcRemain[:strLen]) return srcRemain[strLen:], true } // StringArray represents an array of SizedStrings in memory. The marshalled // array data is preceded by a uint16 signifying the array length. type StringArray []string // String implements fmt.Stringer.String. This ensures that the string slice is // not escaped so that callers that use a statically sized string array do not // incur an unnecessary allocation. func (s *StringArray) String() string { var b strings.Builder b.WriteString("[") b.WriteString(strings.Join(*s, ", ")) b.WriteString("]") return b.String() } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *StringArray) SizeBytes() int { size := (*primitive.Uint16)(nil).SizeBytes() for _, str := range *s { sstr := SizedString(str) size += sstr.SizeBytes() } return size } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
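// A SizedString marshals as a 2-byte length prefix (in hostarch.ByteOrder) followed
// by the raw string bytes, so "abc" occupies 5 bytes on the wire. The function below
// is a hypothetical, never-called sketch of that layout and of why handlers prefer
// the Checked variants on untrusted payloads; it assumes nothing beyond the
// SizedString methods defined above.
func exampleSizedStringLayout() {
	s := SizedString("abc")
	buf := make([]byte, s.SizeBytes()) // 2 (length prefix) + 3 (payload) = 5 bytes.
	s.MarshalBytes(buf)
	// buf[0:2] now holds uint16(3) in hostarch.ByteOrder; buf[2:5] holds "abc".
	var got SizedString
	if _, ok := got.CheckedUnmarshal(buf); !ok || got != s {
		panic("round trip failed") // not expected for a well-formed buffer.
	}
	// A truncated (or maliciously shortened) buffer is rejected rather than causing
	// an out-of-bounds panic, which is why unmarshalling code uses CheckedUnmarshal
	// on payload bytes.
	if _, ok := got.CheckedUnmarshal(buf[:3]); ok {
		panic("truncated buffer unexpectedly accepted")
	}
}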
func (s *StringArray) MarshalBytes(dst []byte) []byte { arrLen := primitive.Uint16(len(*s)) dst = arrLen.MarshalUnsafe(dst) for _, str := range *s { sstr := SizedString(str) dst = sstr.MarshalBytes(dst) } return dst } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (s *StringArray) CheckedUnmarshal(src []byte) ([]byte, bool) { var arrLen primitive.Uint16 srcRemain, ok := arrLen.CheckedUnmarshal(src) if !ok { return src, false } if cap(*s) < int(arrLen) { *s = make([]string, arrLen) } else { *s = (*s)[:arrLen] } for i := primitive.Uint16(0); i < arrLen; i++ { var sstr SizedString srcRemain, ok = sstr.CheckedUnmarshal(srcRemain) if !ok { return src, false } (*s)[i] = string(sstr) } return srcRemain, true } // Inode represents an inode on the remote filesystem. // // +marshal slice:InodeSlice type Inode struct { ControlFD FDID Stat linux.Statx } func (i *Inode) String() string { return fmt.Sprintf("Inode{ControlFD: %d, Stat: %s}", i.ControlFD, i.Stat.String()) } // MountReq is an empty request to Mount on the connection. type MountReq struct{ EmptyMessage } // String implements fmt.Stringer.String. func (*MountReq) String() string { return "MountReq{}" } // MountResp represents a Mount response. type MountResp struct { Root Inode // MaxMessageSize is the maximum size of messages communicated between the // client and server in bytes. This includes the communication header. MaxMessageSize primitive.Uint32 // SupportedMs holds all the supported messages. SupportedMs []MID } // String implements fmt.Stringer.String. func (m *MountResp) String() string { return fmt.Sprintf("MountResp{Root: %s, MaxMessageSize: %d, SupportedMs: %+v}", m.Root.String(), m.MaxMessageSize, m.SupportedMs) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (m *MountResp) SizeBytes() int { return m.Root.SizeBytes() + m.MaxMessageSize.SizeBytes() + (*primitive.Uint16)(nil).SizeBytes() + (len(m.SupportedMs) * (*MID)(nil).SizeBytes()) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (m *MountResp) MarshalBytes(dst []byte) []byte { dst = m.Root.MarshalUnsafe(dst) dst = m.MaxMessageSize.MarshalUnsafe(dst) numSupported := primitive.Uint16(len(m.SupportedMs)) dst = numSupported.MarshalBytes(dst) return MarshalUnsafeMIDSlice(m.SupportedMs, dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (m *MountResp) CheckedUnmarshal(src []byte) ([]byte, bool) { m.SupportedMs = m.SupportedMs[:0] if m.SizeBytes() > len(src) { return src, false } srcRemain := m.Root.UnmarshalUnsafe(src) srcRemain = m.MaxMessageSize.UnmarshalUnsafe(srcRemain) var numSupported primitive.Uint16 srcRemain = numSupported.UnmarshalBytes(srcRemain) if int(numSupported)*(*MID)(nil).SizeBytes() > len(srcRemain) { return src, false } if cap(m.SupportedMs) < int(numSupported) { m.SupportedMs = make([]MID, numSupported) } else { m.SupportedMs = m.SupportedMs[:numSupported] } return UnmarshalUnsafeMIDSlice(m.SupportedMs, srcRemain), true } // ChannelReq is an empty requent to create a Channel. type ChannelReq struct{ EmptyMessage } // String implements fmt.Stringer.String. func (*ChannelReq) String() string { return "ChannelReq{}" } // ChannelResp is the response to the create channel request. // // +marshal boundCheck type ChannelResp struct { dataOffset int64 dataLength uint64 } // String implements fmt.Stringer.String. 
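// MountResp tells the client which message IDs the server understands (SupportedMs)
// and the negotiated maximum message size. A hypothetical, never-called helper that
// a client could use before issuing an optional RPC:
func mountRespSupports(resp *MountResp, m MID) bool {
	for _, supported := range resp.SupportedMs {
		if supported == m {
			return true
		}
	}
	return false
}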
func (c *ChannelResp) String() string { return fmt.Sprintf("ChannelResp{dataOffset: %d, dataLength: %d}", c.dataOffset, c.dataLength) } // ErrorResp is returned to represent an error while handling a request. // // +marshal type ErrorResp struct { errno uint32 } // String implements fmt.Stringer.String. func (e *ErrorResp) String() string { return fmt.Sprintf("ErrorResp{errno: %d}", e.errno) } // StatReq requests the stat results for the specified FD. // // +marshal boundCheck type StatReq struct { FD FDID } // String implements fmt.Stringer.String. func (s *StatReq) String() string { return fmt.Sprintf("StatReq{FD: %d}", s.FD) } // SetStatReq is used to set attributeds on FDs. // // +marshal boundCheck type SetStatReq struct { FD FDID Mask uint32 Mode uint32 // Only permissions part is settable. UID UID GID GID Size uint64 Atime linux.Timespec Mtime linux.Timespec } // String implements fmt.Stringer.String. func (s *SetStatReq) String() string { return fmt.Sprintf("SetStatReq{FD: %d, Mask: %#x, Mode: %d, UID: %d, GID: %d, Size: %d, Atime: %s, Mtime: %s}", s.FD, s.Mask, s.Mode, s.UID, s.GID, s.Size, s.Atime.ToTime(), s.Mtime.ToTime()) } // SetStatResp is used to communicate SetStat results. It contains a mask // representing the failed changes. It also contains the errno of the failed // set attribute operation. If multiple operations failed then any of those // errnos can be returned. // // +marshal boundCheck type SetStatResp struct { FailureMask uint32 FailureErrNo uint32 } // String implements fmt.Stringer.String. func (s *SetStatResp) String() string { return fmt.Sprintf("SetStatResp{FailureMask: %#x, FailureErrNo: %d}", s.FailureMask, s.FailureErrNo) } // WalkReq is used to request to walk multiple path components at once. This // is used for both Walk and WalkStat. type WalkReq struct { DirFD FDID Path StringArray } // String implements fmt.Stringer.String. func (w *WalkReq) String() string { return fmt.Sprintf("WalkReq{DirFD: %d, Path: %s}", w.DirFD, w.Path.String()) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (w *WalkReq) SizeBytes() int { return w.DirFD.SizeBytes() + w.Path.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (w *WalkReq) MarshalBytes(dst []byte) []byte { dst = w.DirFD.MarshalUnsafe(dst) return w.Path.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (w *WalkReq) CheckedUnmarshal(src []byte) ([]byte, bool) { w.Path = w.Path[:0] if w.SizeBytes() > len(src) { return src, false } srcRemain := w.DirFD.UnmarshalUnsafe(src) if srcRemain, ok := w.Path.CheckedUnmarshal(srcRemain); ok { return srcRemain, true } return src, false } // WalkStatus is used to indicate the reason for partial/unsuccessful server // side Walk operations. Please note that partial/unsuccessful walk operations // do not necessarily fail the RPC. The RPC is successful with a failure hint // which can be used by the client to infer server-side state. type WalkStatus = primitive.Uint8 const ( // WalkSuccess indicates that all path components were successfully walked. WalkSuccess WalkStatus = iota // WalkComponentDoesNotExist indicates that the walk was prematurely // terminated because an intermediate path component does not exist on // server. The results of all previous existing path components is returned. WalkComponentDoesNotExist // WalkComponentSymlink indicates that the walk was prematurely // terminated because an intermediate path component was a symlink. 
It is not // safe to resolve symlinks remotely (unaware of mount points). WalkComponentSymlink ) func walkStatusToString(ws WalkStatus) string { switch ws { case WalkSuccess: return "Success" case WalkComponentDoesNotExist: return "ComponentDoesNotExist" case WalkComponentSymlink: return "ComponentSymlink" default: panic(fmt.Sprintf("Unknown WalkStatus: %d", ws)) } } // WalkResp is used to communicate the inodes walked by the server. In memory, // the inode array is preceded by a uint16 integer denoting array length. type WalkResp struct { Status WalkStatus Inodes []Inode } // String implements fmt.Stringer.String. This ensures that the Inode slice is // not escaped so that callers that use a statically sized Inode array do not // incur an unnecessary allocation. func (w *WalkResp) String() string { var arrB strings.Builder arrB.WriteString("[") for i := range w.Inodes { if i > 0 { arrB.WriteString(", ") } arrB.WriteString(w.Inodes[i].String()) } arrB.WriteString("]") return fmt.Sprintf("WalkResp{Status: %s, Inodes: %s}", walkStatusToString(w.Status), arrB.String()) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (w *WalkResp) SizeBytes() int { return w.Status.SizeBytes() + (*primitive.Uint16)(nil).SizeBytes() + (len(w.Inodes) * (*Inode)(nil).SizeBytes()) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (w *WalkResp) MarshalBytes(dst []byte) []byte { dst = w.Status.MarshalUnsafe(dst) numInodes := primitive.Uint16(len(w.Inodes)) dst = numInodes.MarshalUnsafe(dst) return MarshalUnsafeInodeSlice(w.Inodes, dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (w *WalkResp) CheckedUnmarshal(src []byte) ([]byte, bool) { w.Inodes = w.Inodes[:0] if w.SizeBytes() > len(src) { return src, false } srcRemain := w.Status.UnmarshalUnsafe(src) var numInodes primitive.Uint16 srcRemain = numInodes.UnmarshalUnsafe(srcRemain) if int(numInodes)*(*Inode)(nil).SizeBytes() > len(srcRemain) { return src, false } if cap(w.Inodes) < int(numInodes) { w.Inodes = make([]Inode, numInodes) } else { w.Inodes = w.Inodes[:numInodes] } return UnmarshalUnsafeInodeSlice(w.Inodes, srcRemain), true } // WalkStatResp is used to communicate stat results for WalkStat. In memory, // the array data is preceded by a uint16 denoting the array length. type WalkStatResp struct { Stats []linux.Statx } // String implements fmt.Stringer.String. func (w *WalkStatResp) String() string { var arrB strings.Builder arrB.WriteString("[") for i := range w.Stats { if i > 0 { arrB.WriteString(", ") } arrB.WriteString(w.Stats[i].String()) } arrB.WriteString("]") return fmt.Sprintf("WalkStatResp{Stats: %s}", arrB.String()) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (w *WalkStatResp) SizeBytes() int { return (*primitive.Uint16)(nil).SizeBytes() + (len(w.Stats) * linux.SizeOfStatx) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (w *WalkStatResp) MarshalBytes(dst []byte) []byte { numStats := primitive.Uint16(len(w.Stats)) dst = numStats.MarshalUnsafe(dst) return linux.MarshalUnsafeStatxSlice(w.Stats, dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. 
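// A Walk RPC can succeed while only walking a prefix of the requested path:
// WalkResp.Status records why the server stopped and WalkResp.Inodes holds the
// components that were actually walked. The function below is a hypothetical,
// never-called sketch of how a client might interpret a partial walk; the wording
// of the returned strings is illustrative only.
func describeWalk(req *WalkReq, resp *WalkResp) string {
	switch resp.Status {
	case WalkSuccess:
		return fmt.Sprintf("walked all %d components", len(resp.Inodes))
	case WalkComponentDoesNotExist:
		// Only the components that exist on the server were walked.
		return fmt.Sprintf("stopped after %d of %d components: missing entry", len(resp.Inodes), len(req.Path))
	case WalkComponentSymlink:
		// The server stops at symlinks because resolving them remotely is not safe
		// (the server is unaware of the client's mount points).
		return fmt.Sprintf("stopped after %d of %d components: symlink", len(resp.Inodes), len(req.Path))
	default:
		return "unknown walk status"
	}
}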
func (w *WalkStatResp) CheckedUnmarshal(src []byte) ([]byte, bool) { w.Stats = w.Stats[:0] if w.SizeBytes() > len(src) { return src, false } var numStats primitive.Uint16 srcRemain := numStats.UnmarshalUnsafe(src) if int(numStats)*linux.SizeOfStatx > len(srcRemain) { return src, false } if cap(w.Stats) < int(numStats) { w.Stats = make([]linux.Statx, numStats) } else { w.Stats = w.Stats[:numStats] } return linux.UnmarshalUnsafeStatxSlice(w.Stats, srcRemain), true } // OpenAtReq is used to open existing FDs with the specified flags. // // +marshal boundCheck type OpenAtReq struct { FD FDID Flags uint32 _ uint32 // Need to make struct packed. } // String implements fmt.Stringer.String. func (o *OpenAtReq) String() string { return fmt.Sprintf("OpenAtReq{FD: %d, Flags: %#o}", o.FD, o.Flags) } // OpenAtResp is used to communicate the newly created FD. // // +marshal boundCheck type OpenAtResp struct { OpenFD FDID } // String implements fmt.Stringer.String. func (o *OpenAtResp) String() string { return fmt.Sprintf("OpenAtResp{OpenFD: %d}", o.OpenFD) } // +marshal type createCommon struct { DirFD FDID UID UID GID GID Mode linux.FileMode // The following are needed to make the struct packed. _ uint16 _ uint32 } // OpenCreateAtReq is used to make OpenCreateAt requests. type OpenCreateAtReq struct { createCommon Flags primitive.Uint32 Name SizedString } // String implements fmt.Stringer.String. func (o *OpenCreateAtReq) String() string { return fmt.Sprintf("OpenCreateAtReq{DirFD: %d, Mode: %s, UID: %d, GID: %d, Flags: %#o, Name: %s}", o.DirFD, o.Mode, o.UID, o.GID, o.Flags, o.Name) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (o *OpenCreateAtReq) SizeBytes() int { return o.createCommon.SizeBytes() + o.Flags.SizeBytes() + o.Name.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (o *OpenCreateAtReq) MarshalBytes(dst []byte) []byte { dst = o.createCommon.MarshalUnsafe(dst) dst = o.Flags.MarshalUnsafe(dst) return o.Name.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (o *OpenCreateAtReq) CheckedUnmarshal(src []byte) ([]byte, bool) { o.Name = "" if o.SizeBytes() > len(src) { return src, false } srcRemain := o.createCommon.UnmarshalUnsafe(src) srcRemain = o.Flags.UnmarshalUnsafe(srcRemain) if srcRemain, ok := o.Name.CheckedUnmarshal(srcRemain); ok { return srcRemain, true } return src, false } // OpenCreateAtResp is used to communicate successful OpenCreateAt results. // // +marshal boundCheck type OpenCreateAtResp struct { Child Inode NewFD FDID } // String implements fmt.Stringer.String. func (o *OpenCreateAtResp) String() string { return fmt.Sprintf("OpenCreateAtResp{Child: %s, NewFD: %d}", o.Child.String(), o.NewFD) } // FdArray is a utility struct which implements a marshallable type for // communicating an array of FDIDs. In memory, the array data is preceded by a // uint16 denoting the array length. type FdArray []FDID // String implements fmt.Stringer.String. This ensures that the FDID slice is // not escaped so that callers that use a statically sized FDID array do not // incur an unnecessary allocation. func (f *FdArray) String() string { var b strings.Builder b.WriteString("[") for i, fd := range *f { if i > 0 { b.WriteString(", ") } b.WriteString(fmt.Sprintf("%d", fd)) } b.WriteString("]") return b.String() } // SizeBytes implements marshal.Marshallable.SizeBytes. 
func (f *FdArray) SizeBytes() int { return (*primitive.Uint16)(nil).SizeBytes() + (len(*f) * (*FDID)(nil).SizeBytes()) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FdArray) MarshalBytes(dst []byte) []byte { arrLen := primitive.Uint16(len(*f)) dst = arrLen.MarshalUnsafe(dst) return MarshalUnsafeFDIDSlice(*f, dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (f *FdArray) CheckedUnmarshal(src []byte) ([]byte, bool) { *f = (*f)[:0] if f.SizeBytes() > len(src) { return src, false } var arrLen primitive.Uint16 srcRemain := arrLen.UnmarshalUnsafe(src) if int(arrLen)*(*FDID)(nil).SizeBytes() > len(srcRemain) { return src, false } if cap(*f) < int(arrLen) { *f = make(FdArray, arrLen) } else { *f = (*f)[:arrLen] } return UnmarshalUnsafeFDIDSlice(*f, srcRemain), true } // CloseReq is used to close(2) FDs. type CloseReq struct { FDs FdArray } // String implements fmt.Stringer.String. func (c *CloseReq) String() string { return fmt.Sprintf("CloseReq{FDs: %s}", c.FDs.String()) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (c *CloseReq) SizeBytes() int { return c.FDs.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (c *CloseReq) MarshalBytes(dst []byte) []byte { return c.FDs.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (c *CloseReq) CheckedUnmarshal(src []byte) ([]byte, bool) { return c.FDs.CheckedUnmarshal(src) } // CloseResp is an empty response to CloseReq. type CloseResp struct{ EmptyMessage } // String implements fmt.Stringer.String. func (*CloseResp) String() string { return "CloseResp{}" } // FsyncReq is used to fsync(2) FDs. type FsyncReq struct { FDs FdArray } // String implements fmt.Stringer.String. func (f *FsyncReq) String() string { return fmt.Sprintf("FsyncReq{FDs: %s}", f.FDs.String()) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FsyncReq) SizeBytes() int { return f.FDs.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FsyncReq) MarshalBytes(dst []byte) []byte { return f.FDs.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (f *FsyncReq) CheckedUnmarshal(src []byte) ([]byte, bool) { return f.FDs.CheckedUnmarshal(src) } // FsyncResp is an empty response to FsyncReq. type FsyncResp struct{ EmptyMessage } // String implements fmt.Stringer.String. func (*FsyncResp) String() string { return "FsyncResp{}" } // PReadReq is used to pread(2) on an FD. // // +marshal boundCheck type PReadReq struct { Offset uint64 FD FDID Count uint32 _ uint32 // Need to make struct packed. } // String implements fmt.Stringer.String. func (r *PReadReq) String() string { return fmt.Sprintf("PReadReq{Offset: %d, FD: %d, Count: %d}", r.Offset, r.FD, r.Count) } // PReadResp is used to return the result of pread(2). type PReadResp struct { NumBytes primitive.Uint64 Buf []byte } // String implements fmt.Stringer.String. func (r *PReadResp) String() string { return fmt.Sprintf("PReadResp{NumBytes: %d, Buf: [...%d bytes...]}", r.NumBytes, len(r.Buf)) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *PReadResp) SizeBytes() int { return r.NumBytes.SizeBytes() + int(r.NumBytes) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
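// PReadResp.CheckedUnmarshal (defined below) copies the payload directly into
// whatever r.Buf already points at, so the caller allocates the destination buffer
// before unmarshalling. The function below is a hypothetical, never-called sketch
// of that calling pattern; "payload" stands for the raw response bytes received
// from the server.
func examplePReadInto(dst []byte, payload []byte) (int, bool) {
	resp := PReadResp{Buf: dst} // dst is pre-allocated by the caller, e.g. usermem.
	if _, ok := resp.CheckedUnmarshal(payload); !ok {
		return 0, false // short or malformed payload.
	}
	return int(resp.NumBytes), true
}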
func (r *PReadResp) MarshalBytes(dst []byte) []byte { dst = r.NumBytes.MarshalUnsafe(dst) return dst[copy(dst[:r.NumBytes], r.Buf[:r.NumBytes]):] } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (r *PReadResp) CheckedUnmarshal(src []byte) ([]byte, bool) { srcRemain, ok := r.NumBytes.CheckedUnmarshal(src) if !ok || uint32(r.NumBytes) > uint32(len(srcRemain)) || uint32(r.NumBytes) > uint32(len(r.Buf)) { return src, false } // We expect the client to have already allocated r.Buf. r.Buf probably // (optimally) points to usermem. Directly copy into that. r.Buf = r.Buf[:r.NumBytes] return srcRemain[copy(r.Buf, srcRemain[:r.NumBytes]):], true } // PWriteReq is used to pwrite(2) on an FD. type PWriteReq struct { Offset primitive.Uint64 FD FDID NumBytes primitive.Uint32 Buf []byte } // String implements fmt.Stringer.String. func (w *PWriteReq) String() string { return fmt.Sprintf("PWriteReq{Offset: %d, FD: %d, NumBytes: %d, Buf: [...%d bytes...]}", w.Offset, w.FD, w.NumBytes, len(w.Buf)) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (w *PWriteReq) SizeBytes() int { return w.Offset.SizeBytes() + w.FD.SizeBytes() + w.NumBytes.SizeBytes() + int(w.NumBytes) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (w *PWriteReq) MarshalBytes(dst []byte) []byte { dst = w.Offset.MarshalUnsafe(dst) dst = w.FD.MarshalUnsafe(dst) dst = w.NumBytes.MarshalUnsafe(dst) return dst[copy(dst[:w.NumBytes], w.Buf[:w.NumBytes]):] } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (w *PWriteReq) CheckedUnmarshal(src []byte) ([]byte, bool) { w.NumBytes = 0 if w.SizeBytes() > len(src) { return src, false } srcRemain := w.Offset.UnmarshalUnsafe(src) srcRemain = w.FD.UnmarshalUnsafe(srcRemain) srcRemain = w.NumBytes.UnmarshalUnsafe(srcRemain) // This is an optimization. Assuming that the server is making this call, it // is safe to just point to src rather than allocating and copying. if uint32(w.NumBytes) > uint32(len(srcRemain)) { return src, false } w.Buf = srcRemain[:w.NumBytes] return srcRemain[w.NumBytes:], true } // PWriteResp is used to return the result of pwrite(2). // // +marshal boundCheck type PWriteResp struct { Count uint64 } // String implements fmt.Stringer.String. func (w *PWriteResp) String() string { return fmt.Sprintf("PWriteResp{Count: %d}", w.Count) } // MkdirAtReq is used to make MkdirAt requests. type MkdirAtReq struct { createCommon Name SizedString } // String implements fmt.Stringer.String. func (m *MkdirAtReq) String() string { return fmt.Sprintf("MkdirAtReq{DirFD: %d, Mode: %s, UID: %d, GID: %d, Name: %s}", m.DirFD, m.Mode, m.UID, m.GID, m.Name) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (m *MkdirAtReq) SizeBytes() int { return m.createCommon.SizeBytes() + m.Name.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (m *MkdirAtReq) MarshalBytes(dst []byte) []byte { dst = m.createCommon.MarshalUnsafe(dst) return m.Name.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (m *MkdirAtReq) CheckedUnmarshal(src []byte) ([]byte, bool) { m.Name = "" if m.SizeBytes() > len(src) { return src, false } srcRemain := m.createCommon.UnmarshalUnsafe(src) if srcRemain, ok := m.Name.CheckedUnmarshal(srcRemain); ok { return srcRemain, true } return src, false } // MkdirAtResp is the response to a successful MkdirAt request. 
// // +marshal boundCheck type MkdirAtResp struct { ChildDir Inode } // String implements fmt.Stringer.String. func (m *MkdirAtResp) String() string { return fmt.Sprintf("MkdirAtResp{ChildDir: %s}", m.ChildDir.String()) } // MknodAtReq is used to make MknodAt requests. type MknodAtReq struct { createCommon Minor primitive.Uint32 Major primitive.Uint32 Name SizedString } // String implements fmt.Stringer.String. func (m *MknodAtReq) String() string { return fmt.Sprintf("MknodAtReq{DirFD: %d, Mode: %s, UID: %d, GID: %d, Minor: %d, Major: %d, Name: %s}", m.DirFD, m.Mode, m.UID, m.GID, m.Minor, m.Major, m.Name) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (m *MknodAtReq) SizeBytes() int { return m.createCommon.SizeBytes() + m.Minor.SizeBytes() + m.Major.SizeBytes() + m.Name.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (m *MknodAtReq) MarshalBytes(dst []byte) []byte { dst = m.createCommon.MarshalUnsafe(dst) dst = m.Minor.MarshalUnsafe(dst) dst = m.Major.MarshalUnsafe(dst) return m.Name.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (m *MknodAtReq) CheckedUnmarshal(src []byte) ([]byte, bool) { m.Name = "" if m.SizeBytes() > len(src) { return src, false } srcRemain := m.createCommon.UnmarshalUnsafe(src) srcRemain = m.Minor.UnmarshalUnsafe(srcRemain) srcRemain = m.Major.UnmarshalUnsafe(srcRemain) if srcRemain, ok := m.Name.CheckedUnmarshal(srcRemain); ok { return srcRemain, true } return src, false } // MknodAtResp is the response to a successful MknodAt request. // // +marshal boundCheck type MknodAtResp struct { Child Inode } // String implements fmt.Stringer.String. func (m *MknodAtResp) String() string { return fmt.Sprintf("MknodAtResp{Child: %s}", m.Child.String()) } // SymlinkAtReq is used to make SymlinkAt request. type SymlinkAtReq struct { DirFD FDID UID UID GID GID Name SizedString Target SizedString } // String implements fmt.Stringer.String. func (s *SymlinkAtReq) String() string { return fmt.Sprintf("SymlinkAtReq{DirFD: %d, UID: %d, GID: %d, Name: %s, Target: %s}", s.DirFD, s.UID, s.GID, s.Name, s.Target) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SymlinkAtReq) SizeBytes() int { return s.DirFD.SizeBytes() + s.UID.SizeBytes() + s.GID.SizeBytes() + s.Name.SizeBytes() + s.Target.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SymlinkAtReq) MarshalBytes(dst []byte) []byte { dst = s.DirFD.MarshalUnsafe(dst) dst = s.UID.MarshalUnsafe(dst) dst = s.GID.MarshalUnsafe(dst) dst = s.Name.MarshalBytes(dst) return s.Target.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (s *SymlinkAtReq) CheckedUnmarshal(src []byte) ([]byte, bool) { s.Name = "" s.Target = "" if s.SizeBytes() > len(src) { return src, false } srcRemain := s.DirFD.UnmarshalUnsafe(src) srcRemain = s.UID.UnmarshalUnsafe(srcRemain) srcRemain = s.GID.UnmarshalUnsafe(srcRemain) var ok bool if srcRemain, ok = s.Name.CheckedUnmarshal(srcRemain); !ok { return src, false } if srcRemain, ok = s.Target.CheckedUnmarshal(srcRemain); !ok { return src, false } return srcRemain, true } // SymlinkAtResp is the response to a successful SymlinkAt request. // // +marshal boundCheck type SymlinkAtResp struct { Symlink Inode } // String implements fmt.Stringer.String. func (s *SymlinkAtResp) String() string { return fmt.Sprintf("SymlinkAtResp{Symlink: %s}", s.Symlink.String()) } // LinkAtReq is used to make LinkAt requests. 
type LinkAtReq struct { DirFD FDID Target FDID Name SizedString } // String implements fmt.Stringer.String. func (l *LinkAtReq) String() string { return fmt.Sprintf("LinkAtReq{DirFD: %d, Target: %d, Name: %s}", l.DirFD, l.Target, l.Name) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (l *LinkAtReq) SizeBytes() int { return l.DirFD.SizeBytes() + l.Target.SizeBytes() + l.Name.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (l *LinkAtReq) MarshalBytes(dst []byte) []byte { dst = l.DirFD.MarshalUnsafe(dst) dst = l.Target.MarshalUnsafe(dst) return l.Name.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (l *LinkAtReq) CheckedUnmarshal(src []byte) ([]byte, bool) { l.Name = "" if l.SizeBytes() > len(src) { return src, false } srcRemain := l.DirFD.UnmarshalUnsafe(src) srcRemain = l.Target.UnmarshalUnsafe(srcRemain) if srcRemain, ok := l.Name.CheckedUnmarshal(srcRemain); ok { return srcRemain, true } return src, false } // LinkAtResp is used to respond to a successful LinkAt request. // // +marshal boundCheck type LinkAtResp struct { Link Inode } // String implements fmt.Stringer.String. func (l *LinkAtResp) String() string { return fmt.Sprintf("LinkAtResp{Link: %s}", l.Link.String()) } // FStatFSReq is used to request StatFS results for the specified FD. // // +marshal boundCheck type FStatFSReq struct { FD FDID } // String implements fmt.Stringer.String. func (s *FStatFSReq) String() string { return fmt.Sprintf("FStatFSReq{FD: %d}", s.FD) } // StatFS is responded to a successful FStatFS request. // // +marshal boundCheck type StatFS struct { Type uint64 BlockSize int64 Blocks uint64 BlocksFree uint64 BlocksAvailable uint64 Files uint64 FilesFree uint64 NameLength uint64 } // String implements fmt.Stringer.String. func (s *StatFS) String() string { return fmt.Sprintf("StatFS{Type: %d, BlockSize: %d, Blocks: %d, BlocksFree: %d, BlocksAvailable: %d, Files: %d, FilesFree: %d, NameLength: %d}", s.Type, s.BlockSize, s.Blocks, s.BlocksFree, s.BlocksAvailable, s.Files, s.FilesFree, s.NameLength) } // FAllocateReq is used to request to fallocate(2) an FD. This has no response. // // +marshal boundCheck type FAllocateReq struct { FD FDID Mode uint64 Offset uint64 Length uint64 } // String implements fmt.Stringer.String. func (a *FAllocateReq) String() string { return fmt.Sprintf("FAllocateReq{FD: %d, Mode: %d, Offset: %d, Length: %d}", a.FD, a.Mode, a.Offset, a.Length) } // FAllocateResp is an empty response to FAllocateReq. type FAllocateResp struct{ EmptyMessage } // String implements fmt.Stringer.String. func (*FAllocateResp) String() string { return "FAllocateResp{}" } // ReadLinkAtReq is used to readlinkat(2) at the specified FD. // // +marshal boundCheck type ReadLinkAtReq struct { FD FDID } // String implements fmt.Stringer.String. func (r *ReadLinkAtReq) String() string { return fmt.Sprintf("ReadLinkAtReq{FD: %d}", r.FD) } // ReadLinkAtResp is used to communicate ReadLinkAt results. type ReadLinkAtResp struct { Target SizedString } // String implements fmt.Stringer.String. func (r *ReadLinkAtResp) String() string { return fmt.Sprintf("ReadLinkAtResp{Target: %s}", r.Target) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *ReadLinkAtResp) SizeBytes() int { return r.Target.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (r *ReadLinkAtResp) MarshalBytes(dst []byte) []byte { return r.Target.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (r *ReadLinkAtResp) CheckedUnmarshal(src []byte) ([]byte, bool) { return r.Target.CheckedUnmarshal(src) } // FlushReq is used to make Flush requests. // // +marshal boundCheck type FlushReq struct { FD FDID } // String implements fmt.Stringer.String. func (f *FlushReq) String() string { return fmt.Sprintf("FlushReq{FD: %d}", f.FD) } // FlushResp is an empty response to FlushReq. type FlushResp struct{ EmptyMessage } // String implements fmt.Stringer.String. func (*FlushResp) String() string { return "FlushResp{}" } // ConnectReq is used to make a Connect request. // // +marshal boundCheck type ConnectReq struct { FD FDID // SockType is used to specify the socket type to connect to. As a special // case, SockType = 0 means that the socket type does not matter and the // requester will accept any socket type. SockType uint32 _ uint32 // Need to make struct packed. } // String implements fmt.Stringer.String. func (c *ConnectReq) String() string { return fmt.Sprintf("ConnectReq{FD: %d, SockType: %d}", c.FD, c.SockType) } // ConnectResp is an empty response to ConnectReq. type ConnectResp struct{ EmptyMessage } // String implements fmt.Stringer.String. func (*ConnectResp) String() string { return "ConnectResp{}" } // BindAtReq is used to make BindAt requests. type BindAtReq struct { createCommon SockType primitive.Uint32 Name SizedString } // SizeBytes implements marshal.Marshallable.SizeBytes. func (b *BindAtReq) SizeBytes() int { return b.createCommon.SizeBytes() + b.SockType.SizeBytes() + b.Name.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (b *BindAtReq) MarshalBytes(dst []byte) []byte { dst = b.createCommon.MarshalUnsafe(dst) dst = b.SockType.MarshalUnsafe(dst) return b.Name.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (b *BindAtReq) CheckedUnmarshal(src []byte) ([]byte, bool) { b.Name = "" if b.SizeBytes() > len(src) { return src, false } srcRemain := b.createCommon.UnmarshalUnsafe(src) srcRemain = b.SockType.UnmarshalUnsafe(srcRemain) if srcRemain, ok := b.Name.CheckedUnmarshal(srcRemain); ok { return srcRemain, ok } return src, false } // String implements fmt.Stringer.String. func (b *BindAtReq) String() string { return fmt.Sprintf("BindAtReq{DirFD: %d, Mode: %s, UID: %d, GID: %d, SockType: %d, Name: %q}", b.DirFD, b.Mode, b.UID, b.GID, b.SockType, b.Name) } // BindAtResp is used to communicate the BindAt response. // // +marshal boundCheck type BindAtResp struct { Child Inode BoundSocketFD FDID } // String implements fmt.Stringer.String. func (b *BindAtResp) String() string { return fmt.Sprintf("BindAtResp{Child: %s, BoundSocketFD: %d}", b.Child.String(), b.BoundSocketFD) } // ListenReq is used to make Listen requests. // // +marshal boundCheck type ListenReq struct { FD FDID Backlog int32 _ uint32 } // String implements fmt.Stringer.String. func (l *ListenReq) String() string { return fmt.Sprintf("ListenReq{FD: %v, Backlog: %d}", l.FD, l.Backlog) } // ListenResp is an empty response to ListenReq. type ListenResp struct{ EmptyMessage } // String implements fmt.Stringer.String. func (*ListenResp) String() string { return "ListenResp{}" } // AcceptReq is used to make Accept requests. // // +marshal boundCheck type AcceptReq struct { FD FDID } // String implements fmt.Stringer.String.
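// ConnectReq.SockType == 0 acts as a wildcard: the requester accepts whatever
// socket type the server-side endpoint has. A hypothetical, never-called helper
// capturing that rule as a server-side check:
func sockTypeAcceptable(requested, actual uint32) bool {
	return requested == 0 || requested == actual
}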
func (a *AcceptReq) String() string { return fmt.Sprintf("AcceptReq{FD: %v}", a.FD) } // AcceptResp is the response to AcceptReq. type AcceptResp struct { PeerAddr SizedString } // String implements fmt.Stringer.String. func (a *AcceptResp) String() string { return fmt.Sprintf("AcceptResp{PeerAddr: %s}", a.PeerAddr) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (a *AcceptResp) SizeBytes() int { return a.PeerAddr.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (a *AcceptResp) MarshalBytes(dst []byte) []byte { return a.PeerAddr.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (a *AcceptResp) CheckedUnmarshal(src []byte) ([]byte, bool) { return a.PeerAddr.CheckedUnmarshal(src) } // UnlinkAtReq is used to make UnlinkAt requests. type UnlinkAtReq struct { DirFD FDID Flags primitive.Uint32 Name SizedString } // String implements fmt.Stringer.String. func (u *UnlinkAtReq) String() string { return fmt.Sprintf("UnlinkAtReq{DirFD: %d, Flags: %#x, Name: %s}", u.DirFD, u.Flags, u.Name) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UnlinkAtReq) SizeBytes() int { return u.DirFD.SizeBytes() + u.Flags.SizeBytes() + u.Name.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UnlinkAtReq) MarshalBytes(dst []byte) []byte { dst = u.DirFD.MarshalUnsafe(dst) dst = u.Flags.MarshalUnsafe(dst) return u.Name.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (u *UnlinkAtReq) CheckedUnmarshal(src []byte) ([]byte, bool) { u.Name = "" if u.SizeBytes() > len(src) { return src, false } srcRemain := u.DirFD.UnmarshalUnsafe(src) srcRemain = u.Flags.UnmarshalUnsafe(srcRemain) if srcRemain, ok := u.Name.CheckedUnmarshal(srcRemain); ok { return srcRemain, true } return src, false } // UnlinkAtResp is an empty response to UnlinkAtReq. type UnlinkAtResp struct{ EmptyMessage } // String implements fmt.Stringer.String. func (*UnlinkAtResp) String() string { return "UnlinkAtResp{}" } // RenameAtReq is used to make RenameAt requests. Like renameat(2), it specifies // the old directory FD and name along with the new directory FD and name. type RenameAtReq struct { OldDir FDID NewDir FDID OldName SizedString NewName SizedString } // String implements fmt.Stringer.String. func (r *RenameAtReq) String() string { return fmt.Sprintf("RenameAtReq{OldDir: %d, NewDir: %d, OldName: %s, NewName: %s}", r.OldDir, r.NewDir, r.OldName, r.NewName) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *RenameAtReq) SizeBytes() int { return r.OldDir.SizeBytes() + r.NewDir.SizeBytes() + r.OldName.SizeBytes() + r.NewName.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *RenameAtReq) MarshalBytes(dst []byte) []byte { dst = r.OldDir.MarshalUnsafe(dst) dst = r.NewDir.MarshalUnsafe(dst) dst = r.OldName.MarshalBytes(dst) return r.NewName.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal.
func (r *RenameAtReq) CheckedUnmarshal(src []byte) ([]byte, bool) { r.OldName = "" r.NewName = "" if r.SizeBytes() > len(src) { return src, false } srcRemain := r.OldDir.UnmarshalUnsafe(src) srcRemain = r.NewDir.UnmarshalUnsafe(srcRemain) var ok bool if srcRemain, ok = r.OldName.CheckedUnmarshal(srcRemain); !ok { return src, false } if srcRemain, ok = r.NewName.CheckedUnmarshal(srcRemain); !ok { return src, false } return srcRemain, true } // RenameAtResp is an empty response to RenameAtReq. type RenameAtResp struct{ EmptyMessage } // String implements fmt.Stringer.String. func (*RenameAtResp) String() string { return "RenameAtResp{}" } // Getdents64Req is used to make Getdents64 requests. // // +marshal boundCheck type Getdents64Req struct { DirFD FDID // Count is the number of bytes to read. A negative value of Count is used to // indicate that the implementation must lseek(0, SEEK_SET) before calling // getdents64(2). Implementations must use the absolute value of Count to // determine the number of bytes to read. Count int32 _ uint32 // Need to make struct packed. } // String implements fmt.Stringer.String. func (g *Getdents64Req) String() string { return fmt.Sprintf("Getdents64Req{DirFD: %d, Count: %d}", g.DirFD, g.Count) } // Dirent64 is analogous to struct linux_dirent64. type Dirent64 struct { Ino primitive.Uint64 DevMinor primitive.Uint32 DevMajor primitive.Uint32 Off primitive.Uint64 Type primitive.Uint8 Name SizedString } // String implements fmt.Stringer.String. func (d *Dirent64) String() string { return fmt.Sprintf("Dirent64{Ino: %d, DevMinor: %d, DevMajor: %d, Off: %d, Type: %d, Name: %s}", d.Ino, d.DevMinor, d.DevMajor, d.Off, d.Type, d.Name) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (d *Dirent64) SizeBytes() int { return d.Ino.SizeBytes() + d.DevMinor.SizeBytes() + d.DevMajor.SizeBytes() + d.Off.SizeBytes() + d.Type.SizeBytes() + d.Name.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (d *Dirent64) MarshalBytes(dst []byte) []byte { dst = d.Ino.MarshalUnsafe(dst) dst = d.DevMinor.MarshalUnsafe(dst) dst = d.DevMajor.MarshalUnsafe(dst) dst = d.Off.MarshalUnsafe(dst) dst = d.Type.MarshalUnsafe(dst) return d.Name.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (d *Dirent64) CheckedUnmarshal(src []byte) ([]byte, bool) { d.Name = "" if d.SizeBytes() > len(src) { return src, false } srcRemain := d.Ino.UnmarshalUnsafe(src) srcRemain = d.DevMinor.UnmarshalUnsafe(srcRemain) srcRemain = d.DevMajor.UnmarshalUnsafe(srcRemain) srcRemain = d.Off.UnmarshalUnsafe(srcRemain) srcRemain = d.Type.UnmarshalUnsafe(srcRemain) if srcRemain, ok := d.Name.CheckedUnmarshal(srcRemain); ok { return srcRemain, true } return src, false } // Getdents64Resp is used to communicate getdents64 results. In memory, the // dirents array is preceded by a uint16 integer denoting array length. type Getdents64Resp struct { Dirents []Dirent64 } // String implements fmt.Stringer.String. func (g *Getdents64Resp) String() string { var b strings.Builder b.WriteString("[") for i, dirent := range g.Dirents { if i > 0 { b.WriteString(", ") } b.WriteString(dirent.String()) } b.WriteString("]") return fmt.Sprintf("Getdents64Resp{Dirents: %s}", b.String()) } // SizeBytes implements marshal.Marshallable.SizeBytes. 
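// Illustrative note: Getdents64Resp below is wired as a uint16 element count
// followed by that many variable-length dirents, and its CheckedUnmarshal
// reuses the destination slice when it is already big enough. The standalone
// sketch below shows the same "count prefix + slice reuse" decode with a
// hypothetical fixed-size element so it stays self-contained; the widths are
// illustrative, not the lisafs ABI. Assumes "encoding/binary".
type demoDirent struct {
	Ino uint64
	Typ uint8
}

const demoDirentSize = 9 // 8-byte Ino + 1-byte Typ, packed.

func decodeDemoDirents(dst []demoDirent, src []byte) ([]demoDirent, bool) {
	if len(src) < 2 {
		return dst, false
	}
	count := int(binary.LittleEndian.Uint16(src))
	src = src[2:]
	if len(src) < count*demoDirentSize {
		return dst, false
	}
	// Reuse the existing backing array when possible, mirroring
	// Getdents64Resp.CheckedUnmarshal below.
	if cap(dst) < count {
		dst = make([]demoDirent, count)
	} else {
		dst = dst[:count]
	}
	for i := range dst {
		dst[i].Ino = binary.LittleEndian.Uint64(src)
		dst[i].Typ = src[8]
		src = src[demoDirentSize:]
	}
	return dst, true
}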
func (g *Getdents64Resp) SizeBytes() int { ret := (*primitive.Uint16)(nil).SizeBytes() for i := range g.Dirents { ret += g.Dirents[i].SizeBytes() } return ret } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (g *Getdents64Resp) MarshalBytes(dst []byte) []byte { numDirents := primitive.Uint16(len(g.Dirents)) dst = numDirents.MarshalUnsafe(dst) for i := range g.Dirents { dst = g.Dirents[i].MarshalBytes(dst) } return dst } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (g *Getdents64Resp) CheckedUnmarshal(src []byte) ([]byte, bool) { g.Dirents = g.Dirents[:0] if g.SizeBytes() > len(src) { return src, false } var numDirents primitive.Uint16 srcRemain := numDirents.UnmarshalUnsafe(src) if cap(g.Dirents) < int(numDirents) { g.Dirents = make([]Dirent64, numDirents) } else { g.Dirents = g.Dirents[:numDirents] } var ok bool for i := range g.Dirents { if srcRemain, ok = g.Dirents[i].CheckedUnmarshal(srcRemain); !ok { return src, false } } return srcRemain, true } // FGetXattrReq is used to make FGetXattr requests. The response to this is // just a SizedString containing the xattr value. type FGetXattrReq struct { FD FDID BufSize primitive.Uint32 Name SizedString } // String implements fmt.Stringer.String. func (g *FGetXattrReq) String() string { return fmt.Sprintf("FGetXattrReq{FD: %d, BufSize: %d, Name: %s}", g.FD, g.BufSize, g.Name) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (g *FGetXattrReq) SizeBytes() int { return g.FD.SizeBytes() + g.BufSize.SizeBytes() + g.Name.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (g *FGetXattrReq) MarshalBytes(dst []byte) []byte { dst = g.FD.MarshalUnsafe(dst) dst = g.BufSize.MarshalUnsafe(dst) return g.Name.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (g *FGetXattrReq) CheckedUnmarshal(src []byte) ([]byte, bool) { g.Name = "" if g.SizeBytes() > len(src) { return src, false } srcRemain := g.FD.UnmarshalUnsafe(src) srcRemain = g.BufSize.UnmarshalUnsafe(srcRemain) if srcRemain, ok := g.Name.CheckedUnmarshal(srcRemain); ok { return srcRemain, true } return src, false } // FGetXattrResp is used to respond to FGetXattr request. type FGetXattrResp struct { Value SizedString } // String implements fmt.Stringer.String. func (g *FGetXattrResp) String() string { return fmt.Sprintf("FGetXattrResp{Value: %s}", g.Value) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (g *FGetXattrResp) SizeBytes() int { return g.Value.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (g *FGetXattrResp) MarshalBytes(dst []byte) []byte { return g.Value.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (g *FGetXattrResp) CheckedUnmarshal(src []byte) ([]byte, bool) { return g.Value.CheckedUnmarshal(src) } // FSetXattrReq is used to make FSetXattr requests. It has no response. type FSetXattrReq struct { FD FDID Flags primitive.Uint32 Name SizedString Value SizedString } // String implements fmt.Stringer.String. func (s *FSetXattrReq) String() string { return fmt.Sprintf("FSetXattrReq{FD: %d, Flags: %#x, Name: %s, Value: %s}", s.FD, s.Flags, s.Name, s.Value) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *FSetXattrReq) SizeBytes() int { return s.FD.SizeBytes() + s.Flags.SizeBytes() + s.Name.SizeBytes() + s.Value.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (s *FSetXattrReq) MarshalBytes(dst []byte) []byte { dst = s.FD.MarshalUnsafe(dst) dst = s.Flags.MarshalUnsafe(dst) dst = s.Name.MarshalBytes(dst) return s.Value.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (s *FSetXattrReq) CheckedUnmarshal(src []byte) ([]byte, bool) { s.Name = "" s.Value = "" if s.SizeBytes() > len(src) { return src, false } srcRemain := s.FD.UnmarshalUnsafe(src) srcRemain = s.Flags.UnmarshalUnsafe(srcRemain) var ok bool if srcRemain, ok = s.Name.CheckedUnmarshal(srcRemain); !ok { return src, false } if srcRemain, ok = s.Value.CheckedUnmarshal(srcRemain); !ok { return src, false } return srcRemain, true } // FSetXattrResp is an empty response to FSetXattrReq. type FSetXattrResp struct{ EmptyMessage } // String implements fmt.Stringer.String. func (*FSetXattrResp) String() string { return "FSetXattrResp{}" } // FRemoveXattrReq is used to make FRemoveXattr requests. It has no response. type FRemoveXattrReq struct { FD FDID Name SizedString } // String implements fmt.Stringer.String. func (r *FRemoveXattrReq) String() string { return fmt.Sprintf("FRemoveXattrReq{FD: %d, Name: %s}", r.FD, r.Name) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *FRemoveXattrReq) SizeBytes() int { return r.FD.SizeBytes() + r.Name.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *FRemoveXattrReq) MarshalBytes(dst []byte) []byte { dst = r.FD.MarshalUnsafe(dst) return r.Name.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (r *FRemoveXattrReq) CheckedUnmarshal(src []byte) ([]byte, bool) { r.Name = "" if r.SizeBytes() > len(src) { return src, false } srcRemain := r.FD.UnmarshalUnsafe(src) if srcRemain, ok := r.Name.CheckedUnmarshal(srcRemain); ok { return srcRemain, true } return src, false } // FRemoveXattrResp is an empty response to FRemoveXattrReq. type FRemoveXattrResp struct{ EmptyMessage } // String implements fmt.Stringer.String. func (*FRemoveXattrResp) String() string { return "FRemoveXattrResp{}" } // FListXattrReq is used to make FListXattr requests. // // +marshal boundCheck type FListXattrReq struct { FD FDID Size uint64 } // String implements fmt.Stringer.String. func (l *FListXattrReq) String() string { return fmt.Sprintf("FListXattrReq{FD: %d, Size: %d}", l.FD, l.Size) } // FListXattrResp is used to respond to FListXattr requests. type FListXattrResp struct { Xattrs StringArray } // String implements fmt.Stringer.String. func (l *FListXattrResp) String() string { return fmt.Sprintf("FListXattrResp{Xattrs: %s}", l.Xattrs.String()) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (l *FListXattrResp) SizeBytes() int { return l.Xattrs.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (l *FListXattrResp) MarshalBytes(dst []byte) []byte { return l.Xattrs.MarshalBytes(dst) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (l *FListXattrResp) CheckedUnmarshal(src []byte) ([]byte, bool) { return l.Xattrs.CheckedUnmarshal(src) } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/node.go000066400000000000000000000233421465435605700221650ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lisafs import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sync" ) // numStaticChildren is the number of static children tracked by each node. // Sampling certain filesystem heavy workloads showed that a majority of // directories store at most 5 children in their map. This should be kept low // to minimize the memory overhead for each node. 5 is fairly low and at the // same time helps avoid map allocations for majority of nodes. Benchmarking // also showed that static arrays are faster than maps for lookups until n=8. const numStaticChildren = 5 // Node is a node on the filesystem tree. A Node is shared by all the // ControlFDs opened on that position. For a given Server, there will only be // one Node for a given filesystem position. // // Reference Model: // - Each node holds a ref on its parent for its entire lifetime. type Node struct { // node's ref count is protected by its parent's childrenMu. nodeRefs // opMu synchronizes high level operations on this path. // // It is used to ensure the following which are important for security: // * This node's data is protected by opMu. So all operations that change its // data should hold opMu for writing. For example: write, setstat, setxattr, // etc. This entails that if this node represents a directory, creation and // deletion operations happening directly under this directory must lock // opMu for writing. All operations accessing data must hold opMu for // reading. This is to avoid the can of worms that open when creation and // deletion are allowed to race. This prevents any walks from occurring // during creation or deletion. // * When this node is being deleted, the deletion handler must hold opMu for // writing. This ensures that there are no concurrent operations going on // this node while it is being deleted and potentially being replaced with // something hazardous. // // A useful consequence of the above is that holding opMu for reading // guarantees that the Server can not change Nodes on the path until this // Node. For instance, if the grandparent needs to be renamed or deleted, // the client must first delete this node to avoid ENOTEMPTY error. Deleting // this node is not possible while opMu is read locked. opMu sync.RWMutex // deleted indicates whether the backing file has been unlinked. This can be // used to deny operations on FDs on this Node after deletion because it is // not safe for FD implementations to do host walks up to this position // anymore. This node may have been replaced with something hazardous. // deleted is protected by opMu. deleted must only be accessed/mutated using // atomics; see markDeletedRecursive for more details. deleted atomicbitops.Uint32 // name is the name of the file represented by this Node in parent. If this // FD represents the root directory, then name is an empty string. name is // protected by the backing server's rename mutex. name string // parent is this parent node which tracks this node as a child. parent is // protected by the backing server's rename mutex. 
parent *Node // controlFDs is a linked list of all the ControlFDs opened on this node. // Prefer this over a slice to avoid additional allocations. Each ControlFD // is an implicit linked list node so there are no additional allocations // needed to maintain the linked list. controlFDsMu sync.Mutex controlFDs controlFDList // Here is a performance hack. Past experience has shown that map allocations // on each node for tracking children costs a lot of memory. More small // allocations also fragment memory. To save allocations, statically track // upto numStaticChildren children using hardcoded pointers. If more children // are inserted then move to a map. Use dynamicChildren iff it is non-nil. // The following fields are protected by childrenMu. childrenMu sync.Mutex staticChildren [numStaticChildren]struct { name string node *Node } dynamicChildren map[string]*Node } // DecRef implements refs.RefCounter.DecRef. Note that the context // parameter should never be used. It exists solely to comply with the // refs.RefCounter interface. // // Precondition: server's rename mutex must be at least read locked. func (n *Node) DecRef(context.Context) { if n.parent == nil { n.nodeRefs.DecRef(nil) return } // If this is the only ref on node then it will need to be destroyed. n.parent.childrenMu.Lock() deleted := false n.nodeRefs.DecRef(func() { n.parent.removeChildLocked(n.name) deleted = true }) n.parent.childrenMu.Unlock() if deleted { // Drop ref on parent. Keep Decref call lock free for scalability. n.parent.DecRef(nil) } } // InitLocked must be called before first use of fd. // // Precondition: parent.childrenMu is locked. // // Postconditions: A ref on n is transferred to the caller. func (n *Node) InitLocked(name string, parent *Node) { n.nodeRefs.InitRefs() n.name = name n.parent = parent if parent != nil { parent.IncRef() parent.insertChildLocked(name, n) } } // LookupChildLocked looks up for a child with given name. Returns nil if child // does not exist. // // Preconditions: childrenMu is locked. func (n *Node) LookupChildLocked(name string) *Node { if n.dynamicChildren != nil { return n.dynamicChildren[name] } for i := 0; i < numStaticChildren; i++ { if n.staticChildren[i].name == name { return n.staticChildren[i].node } } return nil } // WithChildrenMu executes fn with n.childrenMu locked. func (n *Node) WithChildrenMu(fn func()) { n.childrenMu.Lock() defer n.childrenMu.Unlock() fn() } // FilePath returns the absolute path of the backing file. This is an expensive // operation. The returned path should be free of any intermediate symlinks // because all internal (non-leaf) nodes are directories. // // Precondition: // - server's rename mutex must be at least read locked. Calling handlers must // at least have read concurrency guarantee from the server. func (n *Node) FilePath() string { // Walk upwards and prepend name to res. var res fspath.Builder for n.parent != nil { res.PrependComponent(n.name) n = n.parent } // n is the root node. 
res.PrependByte('/') return res.String() } func (n *Node) isDeleted() bool { return n.deleted.Load() != 0 } func (n *Node) removeFD(fd *ControlFD) { n.controlFDsMu.Lock() defer n.controlFDsMu.Unlock() n.controlFDs.Remove(fd) } func (n *Node) insertFD(fd *ControlFD) { n.controlFDsMu.Lock() defer n.controlFDsMu.Unlock() n.controlFDs.PushBack(fd) } func (n *Node) forEachFD(fn func(*ControlFD)) { n.controlFDsMu.Lock() defer n.controlFDsMu.Unlock() for fd := n.controlFDs.Front(); fd != nil; fd = fd.Next() { fn(fd) } } // removeChildLocked removes child with given name from n and returns the // removed child. Returns nil if no such child existed. // // Precondition: childrenMu is locked. func (n *Node) removeChildLocked(name string) *Node { if n.dynamicChildren != nil { toRemove := n.dynamicChildren[name] delete(n.dynamicChildren, name) return toRemove } for i := 0; i < numStaticChildren; i++ { if n.staticChildren[i].name == name { toRemove := n.staticChildren[i].node n.staticChildren[i].name = "" n.staticChildren[i].node = nil return toRemove } } return nil } // insertChildLocked inserts child into n. It does not check for duplicates. // // Precondition: childrenMu is locked. func (n *Node) insertChildLocked(name string, child *Node) { // Try to insert statically first if staticChildren is still being used. if n.dynamicChildren == nil { for i := 0; i < numStaticChildren; i++ { if n.staticChildren[i].node == nil { n.staticChildren[i].node = child n.staticChildren[i].name = name return } } // Ran out of static space. Need to start inserting dynamically. // Shift everything to the map. n.dynamicChildren = make(map[string]*Node) for i := 0; i < numStaticChildren; i++ { // From above loop we know all staticChildren entries are non-nil. n.dynamicChildren[n.staticChildren[i].name] = n.staticChildren[i].node n.staticChildren[i].name = "" n.staticChildren[i].node = nil } } n.dynamicChildren[name] = child } func (n *Node) forEachChild(fn func(*Node)) { n.childrenMu.Lock() defer n.childrenMu.Unlock() if n.dynamicChildren != nil { for _, child := range n.dynamicChildren { fn(child) } return } for i := 0; i < numStaticChildren; i++ { if n.staticChildren[i].node != nil { fn(n.staticChildren[i].node) } } } // Precondition: opMu must be locked for writing on the root node being marked // as deleted. func (n *Node) markDeletedRecursive() { n.deleted.Store(1) // No need to hold opMu for children as it introduces lock ordering issues // because forEachChild locks childrenMu. Locking opMu after childrenMu // violates the lock ordering. Anyway if a directory is being deleted, it // must not have children. The client must have already deleted the entire // subtree. If the client did not delete this subtree nodes, then the subtree // was deleted externally and there is not much we can do. This is best // effort work to mark the subtree as deleted. n.forEachChild(func(child *Node) { child.markDeletedRecursive() }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/node_fd_refs.go000066400000000000000000000100451465435605700236510ustar00rootroot00000000000000package lisafs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const nodeenableLogging = false // obj is used to customize logging. 
Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var nodeobj *Node // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type nodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *nodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *nodeRefs) RefType() string { return fmt.Sprintf("%T", nodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *nodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *nodeRefs) LogRefs() bool { return nodeenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *nodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *nodeRefs) IncRef() { v := r.refCount.Add(1) if nodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *nodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if nodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. 
In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *nodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if nodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *nodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/open_fd_list.go000066400000000000000000000120041465435605700236760ustar00rootroot00000000000000package lisafs // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type openFDElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (openFDElementMapper) linkerFor(elem *OpenFD) *OpenFD { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type openFDList struct { head *OpenFD tail *OpenFD } // Reset resets list l to the empty state. func (l *openFDList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *openFDList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *openFDList) Front() *OpenFD { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *openFDList) Back() *OpenFD { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *openFDList) Len() (count int) { for e := l.Front(); e != nil; e = (openFDElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *openFDList) PushFront(e *OpenFD) { linker := openFDElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { openFDElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *openFDList) PushFrontList(m *openFDList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { openFDElementMapper{}.linkerFor(l.head).SetPrev(m.tail) openFDElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *openFDList) PushBack(e *OpenFD) { linker := openFDElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { openFDElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. 
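// Illustrative note: nodeRefs above (and openFDRefs below) pack two 32-bit
// counters into one atomic int64 -- the high half counts speculative
// references taken by TryIncRef, the low half counts real references.
// TryIncRef adds 1<<32, checks that the low half is non-zero, and then folds
// the speculative reference into a real one with a single further add. The
// standalone sketch below walks through that arithmetic with a plain int64;
// the real code uses atomicbitops.Int64 and is not reproduced here.
const sketchSpeculativeRef = int64(1) << 32

func tryIncRefSketch(refCount *int64) bool {
	// Step 1: take a speculative reference in the high 32 bits.
	*refCount += sketchSpeculativeRef
	// int32(...) isolates the low 32 bits: the real reference count.
	if int32(*refCount) == 0 {
		// No real references remain; back the speculative one out.
		*refCount -= sketchSpeculativeRef
		return false
	}
	// Step 2: convert the speculative reference into a real one.
	*refCount += -sketchSpeculativeRef + 1
	return true
}

// Worked example: starting from one real reference, refCount is
// 0x0000000000000001. After step 1 it is 0x0000000100000001 (1 speculative,
// 1 real); the low half is non-zero, so step 2 leaves 0x0000000000000002,
// i.e. two real references.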
// //go:nosplit func (l *openFDList) PushBackList(m *openFDList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { openFDElementMapper{}.linkerFor(l.tail).SetNext(m.head) openFDElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *openFDList) InsertAfter(b, e *OpenFD) { bLinker := openFDElementMapper{}.linkerFor(b) eLinker := openFDElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { openFDElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *openFDList) InsertBefore(a, e *OpenFD) { aLinker := openFDElementMapper{}.linkerFor(a) eLinker := openFDElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { openFDElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *openFDList) Remove(e *OpenFD) { linker := openFDElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { openFDElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { openFDElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type openFDEntry struct { next *OpenFD prev *OpenFD } // Next returns the entry that follows e in the list. // //go:nosplit func (e *openFDEntry) Next() *OpenFD { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *openFDEntry) Prev() *OpenFD { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *openFDEntry) SetNext(elem *OpenFD) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *openFDEntry) SetPrev(elem *OpenFD) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/open_fd_refs.go000066400000000000000000000101111465435605700236570ustar00rootroot00000000000000package lisafs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const openFDenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var openFDobj *OpenFD // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. 
If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type openFDRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *openFDRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *openFDRefs) RefType() string { return fmt.Sprintf("%T", openFDobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *openFDRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *openFDRefs) LogRefs() bool { return openFDenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *openFDRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *openFDRefs) IncRef() { v := r.refCount.Add(1) if openFDenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *openFDRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if openFDenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *openFDRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if openFDenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *openFDRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/sample_message.go000066400000000000000000000074201465435605700242240ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lisafs import ( "fmt" "math/rand" "gvisor.dev/gvisor/pkg/marshal/primitive" ) // MsgSimple is a sample packed struct which can be used to test message passing. // // +marshal slice:Msg1Slice type MsgSimple struct { A uint16 B uint16 C uint32 D uint64 } // Randomize randomizes the contents of m. func (m *MsgSimple) Randomize() { m.A = uint16(rand.Uint32()) m.B = uint16(rand.Uint32()) m.C = rand.Uint32() m.D = rand.Uint64() } // MsgDynamic is a sample dynamic struct which can be used to test message passing. // // +marshal dynamic type MsgDynamic struct { N primitive.Uint32 Arr []MsgSimple } // String implements fmt.Stringer.String. func (m *MsgDynamic) String() string { return fmt.Sprintf("MsgDynamic{N: %d, Arr: %v}", m.N, m.Arr) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (m *MsgDynamic) SizeBytes() int { return m.N.SizeBytes() + (int(m.N) * (*MsgSimple)(nil).SizeBytes()) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (m *MsgDynamic) MarshalBytes(dst []byte) []byte { dst = m.N.MarshalUnsafe(dst) return MarshalUnsafeMsg1Slice(m.Arr, dst) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (m *MsgDynamic) UnmarshalBytes(src []byte) []byte { src = m.N.UnmarshalUnsafe(src) m.Arr = make([]MsgSimple, m.N) return UnmarshalUnsafeMsg1Slice(m.Arr, src) } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (m *MsgDynamic) CheckedUnmarshal(src []byte) ([]byte, bool) { m.Arr = m.Arr[:0] if m.SizeBytes() > len(src) { return nil, false } src = m.N.UnmarshalUnsafe(src) if int(m.N) > cap(m.Arr) { m.Arr = make([]MsgSimple, m.N) } else { m.Arr = m.Arr[:m.N] } if int(m.N)*(*MsgSimple)(nil).SizeBytes() > len(src) { return nil, false } return UnmarshalUnsafeMsg1Slice(m.Arr, src), true } // Randomize randomizes the contents of m. func (m *MsgDynamic) Randomize(arrLen int) { m.N = primitive.Uint32(arrLen) m.Arr = make([]MsgSimple, arrLen) for i := 0; i < arrLen; i++ { m.Arr[i].Randomize() } } // P9Version mimics p9.TVersion and p9.Rversion. type P9Version struct { MSize primitive.Uint32 Version string } // String implements fmt.Stringer.String. func (v *P9Version) String() string { return fmt.Sprintf("P9Version{MSize: %d, Version: %s}", v.MSize, v.Version) } // SizeBytes implements marshal.Marshallable.SizeBytes. func (v *P9Version) SizeBytes() int { return (*primitive.Uint32)(nil).SizeBytes() + (*primitive.Uint16)(nil).SizeBytes() + len(v.Version) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (v *P9Version) MarshalBytes(dst []byte) []byte { dst = v.MSize.MarshalUnsafe(dst) versionLen := primitive.Uint16(len(v.Version)) dst = versionLen.MarshalUnsafe(dst) return dst[copy(dst, v.Version):] } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. 
func (v *P9Version) CheckedUnmarshal(src []byte) ([]byte, bool) { v.Version = "" if v.SizeBytes() > len(src) { return nil, false } src = v.MSize.UnmarshalUnsafe(src) var versionLen primitive.Uint16 src = versionLen.UnmarshalUnsafe(src) if int(versionLen) > len(src) { return nil, false } v.Version = string(src[:versionLen]) return src[versionLen:], true } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/server.go000066400000000000000000000102051465435605700225400ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lisafs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sync" ) // Server serves a filesystem tree. Multiple connections on different mount // points can be started on a server. The server provides utilities to safely // modify the filesystem tree across its connections (mount points). Note that // it does not support synchronizing filesystem tree mutations across other // servers serving the same filesystem subtree. Server also manages the // lifecycle of all connections. type Server struct { // connWg counts the number of active connections being tracked. connWg sync.WaitGroup // renameMu synchronizes rename operations within this filesystem tree. renameMu sync.RWMutex // handlers is a list of RPC handlers which can be indexed by the handler's // corresponding MID. handlers []RPCHandler // root is the root of the filesystem tree being managed by this server. // root is immutable. Server holds a ref on root for its entire lifetime. root *Node // impl is the server implementation which embeds this server. impl ServerImpl // opts is the server specific options. This dictates how some of the // messages are handled. opts ServerOpts } // ServerOpts defines some server implementation specific behavior. type ServerOpts struct { // WalkStatSupported is set to true if it's safe to call // ControlFDImpl.WalkStat and let the file implementation perform the walk // without holding locks on any of the descendant's Nodes. WalkStatSupported bool // SetAttrOnDeleted is set to true if it's safe to call ControlFDImpl.SetStat // for deleted files. SetAttrOnDeleted bool // AllocateOnDeleted is set to true if it's safe to call OpenFDImpl.Allocate // for deleted files. AllocateOnDeleted bool } // Init must be called before first use of the server. func (s *Server) Init(impl ServerImpl, opts ServerOpts) { s.impl = impl s.opts = opts s.handlers = handlers[:] s.root = &Node{} // s owns the ref on s.root. s.root.InitLocked("", nil) } // SetHandlers overrides the server's RPC handlers. Mainly should only be used // for tests. func (s *Server) SetHandlers(handlers []RPCHandler) { s.handlers = handlers } // withRenameReadLock invokes fn with the server's rename mutex locked for // reading. This ensures that no rename operations occur concurrently. 
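// Illustrative note: a Server is initialized once with its ServerImpl and
// ServerOpts, each accepted connection is handed to StartConnection, Wait
// reaps them, and Destroy drops the ref on the root node. The sketch below
// shows that lifecycle only; srvImpl is some ServerImpl implementation and
// acceptConnSketch is a hypothetical caller-provided helper that produces
// *Connection values (connection construction is defined elsewhere in this
// package, and real implementations embed the Server by value rather than
// declaring it separately as done here).
func serveSketch(srvImpl ServerImpl, acceptConnSketch func() *Connection) {
	var s Server
	s.Init(srvImpl, ServerOpts{
		WalkStatSupported: true, // Only if the impl tolerates lockless walks.
	})
	defer s.Destroy()

	// Each connection runs on its own goroutine and is tracked by the
	// server's WaitGroup.
	for i := 0; i < 4; i++ {
		if c := acceptConnSketch(); c != nil {
			s.StartConnection(c)
		}
	}
	// Block until every started connection has terminated.
	s.Wait()
}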
func (s *Server) withRenameReadLock(fn func() error) error { s.renameMu.RLock() defer s.renameMu.RUnlock() return fn() } // StartConnection starts the connection on a separate goroutine and tracks it. func (s *Server) StartConnection(c *Connection) { s.connWg.Add(1) go func() { c.Run() s.connWg.Done() }() } // Wait waits for all connections started via StartConnection() to terminate. func (s *Server) Wait() { s.connWg.Wait() } // Destroy releases resources being used by this server. func (s *Server) Destroy() { s.root.DecRef(nil) } // ServerImpl contains the implementation details for a Server. // Implementations of ServerImpl should contain their associated Server by // value as their first field. type ServerImpl interface { // Mount is called when a Mount RPC is made. It mounts the connection on // mountNode. Mount may optionally donate a host FD to the mount point. // // Mount has a read concurrency guarantee on mountNode. Mount(c *Connection, mountNode *Node) (*ControlFD, linux.Statx, int, error) // SupportedMessages returns a list of messages that the server // implementation supports. SupportedMessages() []MID // MaxMessageSize is the maximum payload length (in bytes) that can be sent // to this server implementation. MaxMessageSize() uint32 } golang-gvisor-gvisor-0.0~20240729.0/pkg/lisafs/sock.go000066400000000000000000000122631465435605700221770ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lisafs import ( "fmt" "io" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/unet" ) var ( sockHeaderLen = uint32((*sockHeader)(nil).SizeBytes()) ) // sockHeader is the header present in front of each message received on a UDS. // // +marshal type sockHeader struct { payloadLen uint32 message MID _ uint16 // Need to make struct packed. } // sockCommunicator implements Communicator. This is not thread safe. type sockCommunicator struct { fdTracker sock *unet.Socket buf []byte } var _ Communicator = (*sockCommunicator)(nil) func newSockComm(sock *unet.Socket) *sockCommunicator { return &sockCommunicator{ sock: sock, buf: make([]byte, sockHeaderLen), } } func (s *sockCommunicator) FD() int { return s.sock.FD() } func (s *sockCommunicator) destroy() { s.sock.Close() } func (s *sockCommunicator) shutdown() { if err := s.sock.Shutdown(); err != nil { log.Warningf("Socket.Shutdown() failed (FD: %d): %v", s.sock.FD(), err) } } func (s *sockCommunicator) resizeBuf(size uint32) { if cap(s.buf) < int(size) { s.buf = s.buf[:cap(s.buf)] s.buf = append(s.buf, make([]byte, int(size)-cap(s.buf))...) } else { s.buf = s.buf[:size] } } // PayloadBuf implements Communicator.PayloadBuf. func (s *sockCommunicator) PayloadBuf(size uint32) []byte { s.resizeBuf(sockHeaderLen + size) return s.buf[sockHeaderLen : sockHeaderLen+size] } // SndRcvMessage implements Communicator.SndRcvMessage. 
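// Illustrative note: every message on the UDS is preceded by the 8-byte
// sockHeader above -- a uint32 payload length, a 16-bit message ID and 16
// bits of padding -- followed immediately by payloadLen bytes of payload
// (see sndPrepopulatedMsg and rcvMsg below). The standalone sketch frames a
// payload by hand to make that layout explicit; it assumes a 16-bit MID and
// little-endian encoding, and is not a substitute for the generated
// MarshalUnsafe. Assumes "encoding/binary".
func frameMessageSketch(m uint16, payload []byte) []byte {
	const headerLen = 8 // uint32 payloadLen + uint16 MID + uint16 padding.
	buf := make([]byte, headerLen+len(payload))
	binary.LittleEndian.PutUint32(buf[0:4], uint32(len(payload)))
	binary.LittleEndian.PutUint16(buf[4:6], m)
	// buf[6:8] is the padding that keeps the header 8 bytes long.
	copy(buf[headerLen:], payload)
	return buf
}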
func (s *sockCommunicator) SndRcvMessage(m MID, payloadLen uint32, wantFDs uint8) (MID, uint32, error) { // Map the transport errors to EIO, but also log the real error. if err := s.sndPrepopulatedMsg(m, payloadLen, nil); err != nil { log.Warningf("socketCommunicator.SndRcvMessage: sndPrepopulatedMsg failed: %v", err) return 0, 0, unix.EIO } respM, respPayloadLen, err := s.rcvMsg(wantFDs) if err != nil { log.Warningf("socketCommunicator.SndRcvMessage: rcvMsg failed: %v", err) return 0, 0, unix.EIO } return respM, respPayloadLen, nil } // String implements fmt.Stringer.String. func (s *sockCommunicator) String() string { return fmt.Sprintf("sockComm %d", s.sock.FD()) } // sndPrepopulatedMsg assumes that s.buf has already been populated with // `payloadLen` bytes of data. func (s *sockCommunicator) sndPrepopulatedMsg(m MID, payloadLen uint32, fds []int) error { header := sockHeader{payloadLen: payloadLen, message: m} header.MarshalUnsafe(s.buf) dataLen := sockHeaderLen + payloadLen return writeTo(s.sock, [][]byte{s.buf[:dataLen]}, int(dataLen), fds) } // writeTo writes the passed iovec to the UDS and donates any passed FDs. func writeTo(sock *unet.Socket, iovec [][]byte, dataLen int, fds []int) error { w := sock.Writer(true) if len(fds) > 0 { w.PackFDs(fds...) } fdsUnpacked := false for n := 0; n < dataLen; { cur, err := w.WriteVec(iovec) if err != nil { return err } n += cur // Fast common path. if n >= dataLen { break } // Consume iovecs. for consumed := 0; consumed < cur; { if len(iovec[0]) <= cur-consumed { consumed += len(iovec[0]) iovec = iovec[1:] } else { iovec[0] = iovec[0][cur-consumed:] break } } if n > 0 && !fdsUnpacked { // Don't resend any control message. fdsUnpacked = true w.UnpackFDs() } } return nil } // rcvMsg reads the message header and payload from the UDS. It also populates // fds with any donated FDs. func (s *sockCommunicator) rcvMsg(wantFDs uint8) (MID, uint32, error) { fds, err := readFrom(s.sock, s.buf[:sockHeaderLen], wantFDs) if err != nil { return 0, 0, err } for _, fd := range fds { s.TrackFD(fd) } var header sockHeader header.UnmarshalUnsafe(s.buf) // No payload? We are done. if header.payloadLen == 0 { return header.message, 0, nil } if _, err := readFrom(s.sock, s.PayloadBuf(header.payloadLen), 0); err != nil { return 0, 0, err } return header.message, header.payloadLen, nil } // readFrom fills the passed buffer with data from the socket. It also returns // any donated FDs. func readFrom(sock *unet.Socket, buf []byte, wantFDs uint8) ([]int, error) { r := sock.Reader(true) r.EnableFDs(int(wantFDs)) var ( fds []int fdInit bool ) n := len(buf) for got := 0; got < n; { cur, err := r.ReadVec([][]byte{buf[got:]}) // Ignore EOF if cur > 0. if err != nil && (err != io.EOF || cur == 0) { r.CloseFDs() return nil, err } if !fdInit && cur > 0 { fds, err = r.ExtractFDs() if err != nil { return nil, err } fdInit = true r.EnableFDs(0) } got += cur } return fds, nil } func closeFDs(fds []int) { for _, fd := range fds { if fd >= 0 { unix.Close(fd) } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/log/000077500000000000000000000000001465435605700202055ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/log/glog.go000066400000000000000000000046451465435605700214750ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "fmt" "os" "runtime" "strings" "time" ) // GoogleEmitter is a wrapper that emits logs in a format compatible with // package github.com/golang/glog. type GoogleEmitter struct { *Writer } // pid is used for the threadid component of the header. var pid = os.Getpid() // Emit emits the message, google-style. // // Log lines have this form: // // Lmmdd hh:mm:ss.uuuuuu threadid file:line] msg... // // where the fields are defined as follows: // // L A single character, representing the log level (eg 'I' for INFO) // mm The month (zero padded; ie May is '05') // dd The day (zero padded) // hh:mm:ss.uuuuuu Time in hours, minutes and fractional seconds // threadid The space-padded thread ID as returned by GetTID() // file The file name // line The line number // msg The user-supplied message func (g GoogleEmitter) Emit(depth int, level Level, timestamp time.Time, format string, args ...any) { // Log level. prefix := byte('?') switch level { case Debug: prefix = byte('D') case Info: prefix = byte('I') case Warning: prefix = byte('W') } // Timestamp. _, month, day := timestamp.Date() hour, minute, second := timestamp.Clock() microsecond := int(timestamp.Nanosecond() / 1000) // 0 = this frame. _, file, line, ok := runtime.Caller(depth + 1) if ok { // Trim any directory path from the file. slash := strings.LastIndexByte(file, byte('/')) if slash >= 0 { file = file[slash+1:] } } else { // We don't have a filename. file = "???" line = 0 } // Generate the message. message := fmt.Sprintf(format, args...) // Emit the formatted result. fmt.Fprintf(g.Writer, "%c%02d%02d %02d:%02d:%02d.%06d % 7d %s:%d] %s\n", prefix, int(month), day, hour, minute, second, microsecond, pid, file, line, message) } golang-gvisor-gvisor-0.0~20240729.0/pkg/log/json.go000066400000000000000000000041361465435605700215110ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "encoding/json" "fmt" "runtime" "strings" "time" ) type jsonLog struct { Msg string `json:"msg"` Level Level `json:"level"` Time time.Time `json:"time"` } // MarshalJSON implements json.Marshaler.MarashalJSON. func (l Level) MarshalJSON() ([]byte, error) { switch l { case Warning: return []byte(`"warning"`), nil case Info: return []byte(`"info"`), nil case Debug: return []byte(`"debug"`), nil default: return nil, fmt.Errorf("unknown level %v", l) } } // UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON. It can unmarshal // from both string names and integers. 
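// Illustrative note: GoogleEmitter.Emit above renders each record with the
// glog-style header "Lmmdd hh:mm:ss.uuuuuu threadid file:line] msg". The
// sketch below reproduces a line with the same format verb so the output
// shape is concrete; the pid, file and line values are invented samples.
// Assumes "fmt" and "time".
func sampleGlogLineSketch() string {
	ts := time.Date(2024, time.July, 29, 13, 4, 5, 123456000, time.UTC)
	_, month, day := ts.Date()
	hour, minute, second := ts.Clock()
	microsecond := ts.Nanosecond() / 1000
	return fmt.Sprintf("%c%02d%02d %02d:%02d:%02d.%06d % 7d %s:%d] %s\n",
		'I', int(month), day, hour, minute, second, microsecond,
		1234, "server.go", 42, "connection started")
	// => "I0729 13:04:05.123456    1234 server.go:42] connection started\n"
}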
func (l *Level) UnmarshalJSON(b []byte) error { switch s := string(b); s { case "0", `"warning"`: *l = Warning case "1", `"info"`: *l = Info case "2", `"debug"`: *l = Debug default: return fmt.Errorf("unknown level %q", s) } return nil } // JSONEmitter logs messages in json format. type JSONEmitter struct { *Writer } // Emit implements Emitter.Emit. func (e JSONEmitter) Emit(depth int, level Level, timestamp time.Time, format string, v ...any) { logLine := fmt.Sprintf(format, v...) if _, file, line, ok := runtime.Caller(depth + 1); ok { if slash := strings.LastIndexByte(file, byte('/')); slash >= 0 { file = file[slash+1:] // Trim any directory path from the file. } logLine = fmt.Sprintf("%s:%d] %s", file, line, logLine) } j := jsonLog{ Msg: logLine, Level: level, Time: timestamp, } b, err := json.Marshal(j) if err != nil { panic(err) } e.Writer.Write(b) } golang-gvisor-gvisor-0.0~20240729.0/pkg/log/json_k8s.go000066400000000000000000000027711465435605700223010ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "encoding/json" "fmt" "runtime" "strings" "time" ) type k8sJSONLog struct { Log string `json:"log"` Level Level `json:"level"` Time time.Time `json:"time"` } // K8sJSONEmitter logs messages in json format that is compatible with // Kubernetes fluent configuration. type K8sJSONEmitter struct { *Writer } // Emit implements Emitter.Emit. func (e K8sJSONEmitter) Emit(depth int, level Level, timestamp time.Time, format string, v ...any) { logLine := fmt.Sprintf(format, v...) if _, file, line, ok := runtime.Caller(depth + 1); ok { if slash := strings.LastIndexByte(file, byte('/')); slash >= 0 { file = file[slash+1:] // Trim any directory path from the file. } logLine = fmt.Sprintf("%s:%d] %s", file, line, logLine) } j := k8sJSONLog{ Log: logLine, Level: level, Time: timestamp, } b, err := json.Marshal(j) if err != nil { panic(err) } e.Writer.Write(b) } golang-gvisor-gvisor-0.0~20240729.0/pkg/log/log.go000066400000000000000000000252671465435605700213310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package log implements a library for logging. // // This is separate from the standard logging package because logging may be a // high-impact activity, and therefore we wanted to provide as much flexibility // as possible in the underlying implementation. 
// // Note that logging should still be considered high-impact, and should not be // done in the hot path. If necessary, logging statements should be protected // with guards regarding the logging level. For example, // // if log.IsLogging(log.Debug) { // log.Debugf(...) // } // // This is because the log.Debugf(...) statement alone will generate a // significant amount of garbage and churn in many cases, even if no log // message is ultimately emitted. // // +checkalignedignore package log import ( "fmt" "io" stdlog "log" "os" "regexp" "runtime" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/linewriter" "gvisor.dev/gvisor/pkg/sync" ) // Level is the log level. type Level uint32 // The following levels are fixed, and can never be changed. Since some control // RPCs allow for changing the level as an integer, it is only possible to add // additional levels, and the existing one cannot be removed. const ( // Warning indicates that output should always be emitted. Warning Level = iota // Info indicates that output should normally be emitted. Info // Debug indicates that output should not normally be emitted. Debug ) func (l Level) String() string { switch l { case Warning: return "Warning" case Info: return "Info" case Debug: return "Debug" default: return fmt.Sprintf("Invalid level: %d", l) } } // Emitter is the final destination for logs. type Emitter interface { // Emit emits the given log statement. This allows for control over the // timestamp used for logging. Emit(depth int, level Level, timestamp time.Time, format string, v ...any) } // Writer writes the output to the given writer. type Writer struct { // Next is where output is written. Next io.Writer // mu protects fields below. mu sync.Mutex // errors counts failures to write log messages so it can be reported // when writer start to work again. Needs to be accessed using atomics // to make race detector happy because it's read outside the mutex. // +checklocks atomicErrors int32 } // Write writes out the given bytes, handling non-blocking sockets. func (l *Writer) Write(data []byte) (int, error) { n := 0 for n < len(data) { w, err := l.Next.Write(data[n:]) n += w // Is it a non-blocking socket? if pathErr, ok := err.(*os.PathError); ok && pathErr.Timeout() { runtime.Gosched() continue } // Some other error? if err != nil { l.mu.Lock() atomic.AddInt32(&l.atomicErrors, 1) l.mu.Unlock() return n, err } } // Do we need to end with a '\n'? if len(data) == 0 || data[len(data)-1] != '\n' { l.Write([]byte{'\n'}) } // Dirty read in case there were errors (rare). if atomic.LoadInt32(&l.atomicErrors) > 0 { l.mu.Lock() defer l.mu.Unlock() // Recheck condition under lock. if e := atomic.LoadInt32(&l.atomicErrors); e > 0 { msg := fmt.Sprintf("\n*** Dropped %d log messages ***\n", e) if _, err := l.Next.Write([]byte(msg)); err == nil { atomic.StoreInt32(&l.atomicErrors, 0) } } } return n, nil } // Emit emits the message. func (l *Writer) Emit(_ int, _ Level, _ time.Time, format string, args ...any) { fmt.Fprintf(l, format, args...) } // MultiEmitter is an emitter that emits to multiple Emitters. type MultiEmitter []Emitter // Emit emits to all emitters. func (m *MultiEmitter) Emit(depth int, level Level, timestamp time.Time, format string, v ...any) { for _, e := range *m { e.Emit(1+depth, level, timestamp, format, v...) } } // TestLogger is implemented by testing.T and testing.B. type TestLogger interface { Logf(format string, v ...any) } // TestEmitter may be used for wrapping tests. 
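// Illustrative note: the emitters in this package (GoogleEmitter,
// JSONEmitter, K8sJSONEmitter, MultiEmitter) all wrap a *Writer and can be
// combined and installed as the global target via SetTarget below. The setup
// sketch below is for a caller wanting glog-style output on stderr plus
// Kubernetes-friendly JSON in a file; the file's lifetime is the caller's
// concern, and the code is written as if inside this package (external
// callers would qualify the names with the log package). Assumes "os".
func setUpLoggingSketch(logFile *os.File) {
	target := MultiEmitter{
		GoogleEmitter{&Writer{Next: os.Stderr}}, // Human-readable, glog-style.
		K8sJSONEmitter{&Writer{Next: logFile}},  // Fluentd/Kubernetes friendly.
	}
	SetTarget(&target)
	SetLevel(Debug)

	// Guard potentially expensive debug logging, as the package doc advises.
	if IsLogging(Debug) {
		Debugf("logging initialized with %d emitters", len(target))
	}
}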
type TestEmitter struct { TestLogger } // Emit emits to the TestLogger. func (t *TestEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...any) { t.Logf(format, v...) } // Logger is a high-level logging interface. It is in fact, not used within the // log package. Rather it is provided for others to provide contextual loggers // that may append some addition information to log statement. BasicLogger // satisfies this interface, and may be passed around as a Logger. type Logger interface { // Debugf logs a debug statement. Debugf(format string, v ...any) // Infof logs at an info level. Infof(format string, v ...any) // Warningf logs at a warning level. Warningf(format string, v ...any) // IsLogging returns true iff this level is being logged. This may be // used to short-circuit expensive operations for debugging calls. IsLogging(level Level) bool } // BasicLogger is the default implementation of Logger. type BasicLogger struct { Level Emitter } // Debugf implements logger.Debugf. func (l *BasicLogger) Debugf(format string, v ...any) { l.DebugfAtDepth(1, format, v...) } // Infof implements logger.Infof. func (l *BasicLogger) Infof(format string, v ...any) { l.InfofAtDepth(1, format, v...) } // Warningf implements logger.Warningf. func (l *BasicLogger) Warningf(format string, v ...any) { l.WarningfAtDepth(1, format, v...) } // DebugfAtDepth logs at a specific depth. func (l *BasicLogger) DebugfAtDepth(depth int, format string, v ...any) { if l.IsLogging(Debug) { l.Emit(1+depth, Debug, time.Now(), format, v...) } } // InfofAtDepth logs at a specific depth. func (l *BasicLogger) InfofAtDepth(depth int, format string, v ...any) { if l.IsLogging(Info) { l.Emit(1+depth, Info, time.Now(), format, v...) } } // WarningfAtDepth logs at a specific depth. func (l *BasicLogger) WarningfAtDepth(depth int, format string, v ...any) { if l.IsLogging(Warning) { l.Emit(1+depth, Warning, time.Now(), format, v...) } } // IsLogging implements logger.IsLogging. func (l *BasicLogger) IsLogging(level Level) bool { return atomic.LoadUint32((*uint32)(&l.Level)) >= uint32(level) } // SetLevel sets the logging level. func (l *BasicLogger) SetLevel(level Level) { atomic.StoreUint32((*uint32)(&l.Level), uint32(level)) } // logMu protects Log below. We use atomic operations to read the value, but // updates require logMu to ensure consistency. var logMu sync.Mutex // log is the default logger. var log atomic.Pointer[BasicLogger] // Log retrieves the global logger. func Log() *BasicLogger { return log.Load() } // SetTarget sets the log target. // // This is not thread safe and shouldn't be called concurrently with any // logging calls. // // SetTarget should be called before any instances of log.Log() to avoid race conditions func SetTarget(target Emitter) { logMu.Lock() defer logMu.Unlock() oldLog := Log() log.Store(&BasicLogger{Level: oldLog.Level, Emitter: target}) } // SetLevel sets the log level. func SetLevel(newLevel Level) { Log().SetLevel(newLevel) } // Debugf logs to the global logger. func Debugf(format string, v ...any) { Log().DebugfAtDepth(1, format, v...) } // Infof logs to the global logger. func Infof(format string, v ...any) { Log().InfofAtDepth(1, format, v...) } // Warningf logs to the global logger. func Warningf(format string, v ...any) { Log().WarningfAtDepth(1, format, v...) } // DebugfAtDepth logs to the global logger. func DebugfAtDepth(depth int, format string, v ...any) { Log().DebugfAtDepth(1+depth, format, v...) } // InfofAtDepth logs to the global logger. 
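//
// The depth argument tells emitters how many extra stack frames to skip when
// they record the caller, so a wrapper can attribute the log line to its own
// caller. A minimal sketch (logRequest is an assumed helper, not part of this
// package):
//
//	func logRequest(format string, v ...any) {
//		// Skip logRequest's own frame so file:line points at its caller.
//		log.InfofAtDepth(1, format, v...)
//	}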
func InfofAtDepth(depth int, format string, v ...any) { Log().InfofAtDepth(1+depth, format, v...) } // WarningfAtDepth logs to the global logger. func WarningfAtDepth(depth int, format string, v ...any) { Log().WarningfAtDepth(1+depth, format, v...) } // defaultStackSize is the default buffer size to allocate for stack traces. const defaultStackSize = 1 << 16 // 64KB // maxStackSize is the maximum buffer size to allocate for stack traces. const maxStackSize = 1 << 26 // 64MB // Stacks returns goroutine stacks, like panic. func Stacks(all bool) []byte { var trace []byte for s := defaultStackSize; s <= maxStackSize; s *= 4 { trace = make([]byte, s) nbytes := runtime.Stack(trace, all) if nbytes == s { continue } return trace[:nbytes] } trace = append(trace, []byte("\n\n...")...) return trace } // stackRegexp matches one level within a stack trace. var stackRegexp = regexp.MustCompile("(?m)^\\S+\\(.*\\)$\\r?\\n^\\t\\S+:\\d+.*$\\r?\\n") // LocalStack returns the local goroutine stack, excluding the top N entries. // LocalStack's own entry is excluded by default and does not need to be counted in excludeTopN. func LocalStack(excludeTopN int) []byte { replaceNext := excludeTopN + 1 return stackRegexp.ReplaceAllFunc(Stacks(false), func(s []byte) []byte { if replaceNext > 0 { replaceNext-- return nil } return s }) } // Traceback logs the given message and dumps a stacktrace of the current // goroutine. // // This will be print a traceback, tb, as Warningf(format+":\n%s", v..., tb). func Traceback(format string, v ...any) { v = append(v, Stacks(false)) Warningf(format+":\n%s", v...) } // TracebackAll logs the given message and dumps a stacktrace of all goroutines. // // This will be print a traceback, tb, as Warningf(format+":\n%s", v..., tb). func TracebackAll(format string, v ...any) { v = append(v, Stacks(true)) Warningf(format+":\n%s", v...) } // IsLogging returns whether the global logger is logging. func IsLogging(level Level) bool { return Log().IsLogging(level) } // CopyStandardLogTo redirects the stdlib log package global output to the global // logger for the specified level. func CopyStandardLogTo(l Level) error { var f func(string, ...any) switch l { case Debug: f = Debugf case Info: f = Infof case Warning: f = Warningf default: return fmt.Errorf("unknown log level %v", l) } stdlog.SetOutput(linewriter.NewWriter(func(p []byte) { // We must not retain p, but log formatting is not required to // be synchronous (though the in-package implementations are), // so we must make a copy. b := make([]byte, len(p)) copy(b, p) f("%s", b) })) return nil } func init() { // Store the initial value for the log. log.Store(&BasicLogger{Level: Info, Emitter: GoogleEmitter{&Writer{Next: os.Stderr}}}) } golang-gvisor-gvisor-0.0~20240729.0/pkg/log/rate_limited.go000066400000000000000000000032611465435605700232000ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package log import ( "time" "golang.org/x/time/rate" ) type rateLimitedLogger struct { logger Logger limit *rate.Limiter } func (rl *rateLimitedLogger) Debugf(format string, v ...any) { if rl.limit.Allow() { rl.logger.Debugf(format, v...) } } func (rl *rateLimitedLogger) Infof(format string, v ...any) { if rl.limit.Allow() { rl.logger.Infof(format, v...) } } func (rl *rateLimitedLogger) Warningf(format string, v ...any) { if rl.limit.Allow() { rl.logger.Warningf(format, v...) } } func (rl *rateLimitedLogger) IsLogging(level Level) bool { return rl.logger.IsLogging(level) } // BasicRateLimitedLogger returns a Logger that logs to the global logger no // more than once per the provided duration. func BasicRateLimitedLogger(every time.Duration) Logger { return RateLimitedLogger(Log(), every) } // RateLimitedLogger returns a Logger that logs to the provided logger no more // than once per the provided duration. func RateLimitedLogger(logger Logger, every time.Duration) Logger { return &rateLimitedLogger{ logger: logger, limit: rate.NewLimiter(rate.Every(every), 1), } } golang-gvisor-gvisor-0.0~20240729.0/pkg/marshal/000077500000000000000000000000001465435605700210535ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/marshal/marshal.go000066400000000000000000000234061465435605700230360ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package marshal defines the Marshallable interface for // serialize/deserializing go data structures to/from memory, according to the // Linux ABI. // // Implementations of this interface are typically automatically generated by // tools/go_marshal. See the go_marshal README for details. package marshal import ( "io" "gvisor.dev/gvisor/pkg/hostarch" ) // CopyContext defines the memory operations required to marshal to and from // user memory. Typically, kernel.Task is used to provide implementations for // these operations. type CopyContext interface { // CopyScratchBuffer provides a task goroutine-local scratch buffer. See // kernel.CopyScratchBuffer. CopyScratchBuffer(size int) []byte // CopyOutBytes writes the contents of b to the task's memory. See // kernel.CopyOutBytes. CopyOutBytes(addr hostarch.Addr, b []byte) (int, error) // CopyInBytes reads the contents of the task's memory to b. See // kernel.CopyInBytes. CopyInBytes(addr hostarch.Addr, b []byte) (int, error) } // Marshallable represents operations on a type that can be marshalled to and // from memory. // // go-marshal automatically generates implementations for this interface for // types marked as '+marshal'. type Marshallable interface { io.WriterTo // SizeBytes is the size of the memory representation of a type in // marshalled form. // // SizeBytes must handle a nil receiver. Practically, this means SizeBytes // cannot deference any fields on the object implementing it (but will // likely make use of the type of these fields). 
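	//
	// One consequence is that callers can query the size without an instance
	// at hand, e.g. (illustrative sketch; Foo is a hypothetical +marshal type):
	//
	//	elemSize := (*Foo)(nil).SizeBytes()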
SizeBytes() int // MarshalBytes serializes a copy of a type to dst and returns the remaining // buffer. // Precondition: dst must be at least SizeBytes() in length. MarshalBytes(dst []byte) []byte // UnmarshalBytes deserializes a type from src and returns the remaining // buffer. // Precondition: src must be at least SizeBytes() in length. UnmarshalBytes(src []byte) []byte // Packed returns true if the marshalled size of the type is the same as the // size it occupies in memory. This happens when the type has no fields // starting at unaligned addresses (should always be true by default for ABI // structs, verified by automatically generated tests when using // go_marshal), and has no fields marked `marshal:"unaligned"`. // // Packed must return the same result for all possible values of the type // implementing it. Violating this constraint implies the type doesn't have // a static memory layout, and will lead to memory corruption. // Go-marshal-generated code reuses the result of Packed for multiple values // of the same type. Packed() bool // MarshalUnsafe serializes a type by bulk copying its in-memory // representation to the dst buffer. This is only safe to do when the type // has no implicit padding, see Marshallable.Packed. When Packed would // return false, MarshalUnsafe should fall back to the safer but slower // MarshalBytes. // Precondition: dst must be at least SizeBytes() in length. MarshalUnsafe(dst []byte) []byte // UnmarshalUnsafe deserializes a type by directly copying to the underlying // memory allocated for the object by the runtime. // // This allows much faster unmarshalling of types which have no implicit // padding, see Marshallable.Packed. When Packed would return false, // UnmarshalUnsafe should fall back to the safer but slower unmarshal // mechanism implemented in UnmarshalBytes. // Precondition: src must be at least SizeBytes() in length. UnmarshalUnsafe(src []byte) []byte // CopyIn deserializes a Marshallable type from a task's memory. This may // only be called from a task goroutine. This is more efficient than calling // UnmarshalUnsafe on Marshallable.Packed types, as the type being // marshalled does not escape. The implementation should avoid creating // extra copies in memory by directly deserializing to the object's // underlying memory. // // If the copy-in from the task memory is only partially successful, CopyIn // should still attempt to deserialize as much data as possible. See comment // for UnmarshalBytes. CopyIn(cc CopyContext, addr hostarch.Addr) (int, error) // CopyInN is like CopyIn, but explicitly requests a partial // copy-in. Note that this may yield unexpected results for non-packed // types and the caller may only want to allow this for packed types. See // comment on UnmarshalBytes. // // The limit must be less than or equal to SizeBytes(). CopyInN(cc CopyContext, addr hostarch.Addr, limit int) (int, error) // CopyOut serializes a Marshallable type to a task's memory. This may only // be called from a task goroutine. This is more efficient than calling // MarshalUnsafe on Marshallable.Packed types, as the type being serialized // does not escape. The implementation should avoid creating extra copies in // memory by directly serializing from the object's underlying memory. // // The copy-out to the task memory may be partially successful, in which // case CopyOut returns how much data was serialized. See comment for // MarshalBytes for implications. 
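	//
	// A typical call site in a syscall handler looks like the following
	// (illustrative sketch; stat, statAddr and t are assumptions, with t
	// being a CopyContext such as a kernel.Task):
	//
	//	if _, err := stat.CopyOut(t, statAddr); err != nil {
	//		return err
	//	}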
CopyOut(cc CopyContext, addr hostarch.Addr) (int, error) // CopyOutN is like CopyOut, but explicitly requests a partial // copy-out. Note that this may yield unexpected results for non-packed // types and the caller may only want to allow this for packed types. See // comment on MarshalBytes. // // The limit must be less than or equal to SizeBytes(). CopyOutN(cc CopyContext, addr hostarch.Addr, limit int) (int, error) } // CheckedMarshallable represents operations on a type that can be marshalled // to and from memory and additionally does bound checking. type CheckedMarshallable interface { // CheckedMarshal is the same as Marshallable.MarshalUnsafe but without the // precondition that dst must at least have some appropriate length. Similar // to Marshallable.MarshalBytes, it returns a shifted slice according to how // much data is consumed. Additionally it returns a bool indicating whether // marshalling was successful. Unsuccessful marshalling doesn't consume any // data. CheckedMarshal(dst []byte) ([]byte, bool) // CheckedUnmarshal is the same as Marshallable.UnmarshalUnsafe but without // the precondition that src must at least have some appropriate length. // Similar to Marshallable.UnmarshalBytes, it returns a shifted slice // according to how much data is consumed. Additionally it returns a bool // indicating whether marshalling was successful. Unsuccessful marshalling // doesn't consume any data. CheckedUnmarshal(src []byte) ([]byte, bool) } // go-marshal generates additional functions for a type based on additional // clauses to the +marshal directive. They are documented below. // // Slice API // ========= // // Adding a "slice" clause to the +marshal directive for structs or newtypes on // primitives like this: // // // +marshal slice:FooSlice // type Foo struct { ... } // // Generates four additional functions for marshalling slices of Foos like this: // // // MarshalUnsafeFooSlice is like Foo.MarshalUnsafe, but for a []Foo. It // // might be more efficient than repeatedly calling Foo.MarshalUnsafe // // over a []Foo in a loop if the type is Packed. // // Preconditions: dst must be at least len(src)*Foo.SizeBytes() in length. // func MarshalUnsafeFooSlice(src []Foo, dst []byte) []byte { ... } // // // UnmarshalUnsafeFooSlice is like Foo.UnmarshalUnsafe, but for a []Foo. It // // might be more efficient than repeatedly calling Foo.UnmarshalUnsafe // // over a []Foo in a loop if the type is Packed. // // Preconditions: src must be at least len(dst)*Foo.SizeBytes() in length. // func UnmarshalUnsafeFooSlice(dst []Foo, src []byte) []byte { ... } // // // CopyFooSliceIn copies in a slice of Foo objects from the task's memory. // func CopyFooSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []Foo) (int, error) { ... } // // // CopyFooSliceOut copies out a slice of Foo objects to the task's memory. // func CopyFooSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []Foo) (int, error) { ... } // // The names of the functions are of the format "Copy%sIn" and "Copy%sOut", where // %s is the first argument to the slice clause. This directive is not supported // for newtypes on arrays. // // Note: Partial copies are not supported for Slice API UnmarshalUnsafe and // MarshalUnsafe.
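//
// As an illustrative sketch (not part of this package; cc, foos, addr and
// count are assumed names), a caller holding a marshal.CopyContext could read
// a user-provided array of Foo values in a single call:
//
//	foos := make([]Foo, count)
//	if _, err := CopyFooSliceIn(cc, addr, foos); err != nil {
//		return err
//	}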
// // The slice clause also takes an optional second argument, which must be the // value "inner": // // // +marshal slice:Int32Slice:inner // type Int32 int32 // // This is only valid on newtypes on primitives, and causes the generated // functions to accept slices of the inner type instead: // // func CopyInt32SliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []int32) (int, error) { ... } // // Without "inner", they would instead be: // // func CopyInt32SliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []Int32) (int, error) { ... } // // This may help avoid a cast depending on how the generated functions are used. // // Bound Checking // ============== // // Some users might want to do bound checking on marshal and unmarshal. This is // is useful when the user does not control the buffer size. To prevent // repeated bound checking code around Marshallable, users can add a // "boundCheck" clause to the +marshal directive. go_marshal will generate the // CheckedMarshallable interface methods on the type. golang-gvisor-gvisor-0.0~20240729.0/pkg/marshal/marshal_state_autogen.go000066400000000000000000000000711465435605700257510ustar00rootroot00000000000000// automatically generated by stateify. package marshal golang-gvisor-gvisor-0.0~20240729.0/pkg/marshal/primitive/000077500000000000000000000000001465435605700230635ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/marshal/primitive/primitive.go000066400000000000000000000262171465435605700254320ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package primitive defines marshal.Marshallable implementations for primitive // types. package primitive import ( "io" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" ) // Int8 is a marshal.Marshallable implementation for int8. // // +marshal boundCheck slice:Int8Slice:inner type Int8 int8 // Uint8 is a marshal.Marshallable implementation for uint8. // // +marshal boundCheck slice:Uint8Slice:inner type Uint8 uint8 // Int16 is a marshal.Marshallable implementation for int16. // // +marshal boundCheck slice:Int16Slice:inner type Int16 int16 // Uint16 is a marshal.Marshallable implementation for uint16. // // +marshal boundCheck slice:Uint16Slice:inner type Uint16 uint16 // Int32 is a marshal.Marshallable implementation for int32. // // +marshal boundCheck slice:Int32Slice:inner type Int32 int32 // Uint32 is a marshal.Marshallable implementation for uint32. // // +marshal boundCheck slice:Uint32Slice:inner type Uint32 uint32 // Int64 is a marshal.Marshallable implementation for int64. // // +marshal boundCheck slice:Int64Slice:inner type Int64 int64 // Uint64 is a marshal.Marshallable implementation for uint64. // // +marshal boundCheck slice:Uint64Slice:inner type Uint64 uint64 // ByteSlice is a marshal.Marshallable implementation for []byte. 
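//
// A short usage sketch for reading n bytes from a task's memory (n, cc and
// addr are assumed names, not defined here):
//
//	buf := primitive.ByteSlice(make([]byte, n))
//	if _, err := buf.CopyIn(cc, addr); err != nil {
//		return err
//	}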
// This is a convenience wrapper around a dynamically sized type, and can't be // embedded in other marshallable types because it breaks assumptions made by // go-marshal internals. It violates the "no dynamically-sized types" // constraint of the go-marshal library. type ByteSlice []byte // SizeBytes implements marshal.Marshallable.SizeBytes. func (b *ByteSlice) SizeBytes() int { return len(*b) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (b *ByteSlice) MarshalBytes(dst []byte) []byte { return dst[copy(dst, *b):] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (b *ByteSlice) UnmarshalBytes(src []byte) []byte { return src[copy(*b, src):] } // Packed implements marshal.Marshallable.Packed. func (b *ByteSlice) Packed() bool { return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (b *ByteSlice) MarshalUnsafe(dst []byte) []byte { return b.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (b *ByteSlice) UnmarshalUnsafe(src []byte) []byte { return b.UnmarshalBytes(src) } // CopyIn implements marshal.Marshallable.CopyIn. func (b *ByteSlice) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return cc.CopyInBytes(addr, *b) } // CopyInN implements marshal.Marshallable.CopyInN. func (b *ByteSlice) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { return cc.CopyInBytes(addr, (*b)[:limit]) } // CopyOut implements marshal.Marshallable.CopyOut. func (b *ByteSlice) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return cc.CopyOutBytes(addr, *b) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (b *ByteSlice) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { return cc.CopyOutBytes(addr, (*b)[:limit]) } // WriteTo implements io.WriterTo.WriteTo. func (b *ByteSlice) WriteTo(w io.Writer) (int64, error) { n, err := w.Write(*b) return int64(n), err } var _ marshal.Marshallable = (*ByteSlice)(nil) // The following set of functions are convenient shorthands for wrapping a // built-in type in a marshallable primitive type. For example: // // func useMarshallable(m marshal.Marshallable) { ... } // // // Compare: // // buf = []byte{...} // // useMarshallable(&primitive.ByteSlice(buf)) // Not allowed, can't address temp value. // bufP := primitive.ByteSlice(buf) // useMarshallable(&bufP) // // // Vs: // // useMarshallable(AsByteSlice(buf)) // // Note that the argument to these function escapes, so avoid using them on very // hot code paths. But generally if a function accepts an interface as an // argument, the argument escapes anyways. // AllocateInt8 returns x as a marshallable. func AllocateInt8(x int8) marshal.Marshallable { p := Int8(x) return &p } // AllocateUint8 returns x as a marshallable. func AllocateUint8(x uint8) marshal.Marshallable { p := Uint8(x) return &p } // AllocateInt16 returns x as a marshallable. func AllocateInt16(x int16) marshal.Marshallable { p := Int16(x) return &p } // AllocateUint16 returns x as a marshallable. func AllocateUint16(x uint16) marshal.Marshallable { p := Uint16(x) return &p } // AllocateInt32 returns x as a marshallable. func AllocateInt32(x int32) marshal.Marshallable { p := Int32(x) return &p } // AllocateUint32 returns x as a marshallable. func AllocateUint32(x uint32) marshal.Marshallable { p := Uint32(x) return &p } // AllocateInt64 returns x as a marshallable. 
func AllocateInt64(x int64) marshal.Marshallable { p := Int64(x) return &p } // AllocateUint64 returns x as a marshallable. func AllocateUint64(x uint64) marshal.Marshallable { p := Uint64(x) return &p } // AsByteSlice returns b as a marshallable. Note that this allocates a new slice // header, but does not copy the slice contents. func AsByteSlice(b []byte) marshal.Marshallable { bs := ByteSlice(b) return &bs } // Below, we define some convenience functions for marshalling primitive types // using the newtypes above, without requiring superfluous casts. // 8-bit integers // CopyInt8In is a convenient wrapper for copying in an int8 from the task's // memory. func CopyInt8In(cc marshal.CopyContext, addr hostarch.Addr, dst *int8) (int, error) { var buf Int8 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = int8(buf) return n, nil } // CopyInt8Out is a convenient wrapper for copying out an int8 to the task's // memory. func CopyInt8Out(cc marshal.CopyContext, addr hostarch.Addr, src int8) (int, error) { srcP := Int8(src) return srcP.CopyOut(cc, addr) } // CopyUint8In is a convenient wrapper for copying in a uint8 from the task's // memory. func CopyUint8In(cc marshal.CopyContext, addr hostarch.Addr, dst *uint8) (int, error) { var buf Uint8 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = uint8(buf) return n, nil } // CopyUint8Out is a convenient wrapper for copying out a uint8 to the task's // memory. func CopyUint8Out(cc marshal.CopyContext, addr hostarch.Addr, src uint8) (int, error) { srcP := Uint8(src) return srcP.CopyOut(cc, addr) } // 16-bit integers // CopyInt16In is a convenient wrapper for copying in an int16 from the task's // memory. func CopyInt16In(cc marshal.CopyContext, addr hostarch.Addr, dst *int16) (int, error) { var buf Int16 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = int16(buf) return n, nil } // CopyInt16Out is a convenient wrapper for copying out an int16 to the task's // memory. func CopyInt16Out(cc marshal.CopyContext, addr hostarch.Addr, src int16) (int, error) { srcP := Int16(src) return srcP.CopyOut(cc, addr) } // CopyUint16In is a convenient wrapper for copying in a uint16 from the task's // memory. func CopyUint16In(cc marshal.CopyContext, addr hostarch.Addr, dst *uint16) (int, error) { var buf Uint16 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = uint16(buf) return n, nil } // CopyUint16Out is a convenient wrapper for copying out a uint16 to the task's // memory. func CopyUint16Out(cc marshal.CopyContext, addr hostarch.Addr, src uint16) (int, error) { srcP := Uint16(src) return srcP.CopyOut(cc, addr) } // 32-bit integers // CopyInt32In is a convenient wrapper for copying in an int32 from the task's // memory. func CopyInt32In(cc marshal.CopyContext, addr hostarch.Addr, dst *int32) (int, error) { var buf Int32 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = int32(buf) return n, nil } // CopyInt32Out is a convenient wrapper for copying out an int32 to the task's // memory. func CopyInt32Out(cc marshal.CopyContext, addr hostarch.Addr, src int32) (int, error) { srcP := Int32(src) return srcP.CopyOut(cc, addr) } // CopyUint32In is a convenient wrapper for copying in a uint32 from the task's // memory. 
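//
// For example, a handler that decodes a uint32 flag word at addr might use it
// like this (illustrative sketch; cc, addr and the error handling are
// assumptions):
//
//	var flags uint32
//	if _, err := primitive.CopyUint32In(cc, addr, &flags); err != nil {
//		return err
//	}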
func CopyUint32In(cc marshal.CopyContext, addr hostarch.Addr, dst *uint32) (int, error) { var buf Uint32 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = uint32(buf) return n, nil } // CopyUint32Out is a convenient wrapper for copying out a uint32 to the task's // memory. func CopyUint32Out(cc marshal.CopyContext, addr hostarch.Addr, src uint32) (int, error) { srcP := Uint32(src) return srcP.CopyOut(cc, addr) } // 64-bit integers // CopyInt64In is a convenient wrapper for copying in an int64 from the task's // memory. func CopyInt64In(cc marshal.CopyContext, addr hostarch.Addr, dst *int64) (int, error) { var buf Int64 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = int64(buf) return n, nil } // CopyInt64Out is a convenient wrapper for copying out an int64 to the task's // memory. func CopyInt64Out(cc marshal.CopyContext, addr hostarch.Addr, src int64) (int, error) { srcP := Int64(src) return srcP.CopyOut(cc, addr) } // CopyUint64In is a convenient wrapper for copying in a uint64 from the task's // memory. func CopyUint64In(cc marshal.CopyContext, addr hostarch.Addr, dst *uint64) (int, error) { var buf Uint64 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = uint64(buf) return n, nil } // CopyUint64Out is a convenient wrapper for copying out a uint64 to the task's // memory. func CopyUint64Out(cc marshal.CopyContext, addr hostarch.Addr, src uint64) (int, error) { srcP := Uint64(src) return srcP.CopyOut(cc, addr) } // CopyByteSliceIn is a convenient wrapper for copying in a []byte from the // task's memory. func CopyByteSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst *[]byte) (int, error) { var buf ByteSlice n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = []byte(buf) return n, nil } // CopyByteSliceOut is a convenient wrapper for copying out a []byte to the // task's memory. func CopyByteSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []byte) (int, error) { srcP := ByteSlice(src) return srcP.CopyOut(cc, addr) } // CopyStringIn is a convenient wrapper for copying in a string from the // task's memory. func CopyStringIn(cc marshal.CopyContext, addr hostarch.Addr, dst *string) (int, error) { var buf ByteSlice n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = string(buf) return n, nil } // CopyStringOut is a convenient wrapper for copying out a string to the task's // memory. func CopyStringOut(cc marshal.CopyContext, addr hostarch.Addr, src string) (int, error) { srcP := ByteSlice(src) return srcP.CopyOut(cc, addr) } golang-gvisor-gvisor-0.0~20240729.0/pkg/marshal/primitive/primitive_abi_autogen_unsafe.go000066400000000000000000001510561465435605700313300ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package primitive import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*Int16)(nil) var _ marshal.Marshallable = (*Int32)(nil) var _ marshal.Marshallable = (*Int64)(nil) var _ marshal.Marshallable = (*Int8)(nil) var _ marshal.Marshallable = (*Uint16)(nil) var _ marshal.Marshallable = (*Uint32)(nil) var _ marshal.Marshallable = (*Uint64)(nil) var _ marshal.Marshallable = (*Uint8)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (i *Int16) SizeBytes() int { return 2 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (i *Int16) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(*i)) return dst[2:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *Int16) UnmarshalBytes(src []byte) []byte { *i = Int16(int16(hostarch.ByteOrder.Uint16(src[:2]))) return src[2:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *Int16) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *Int16) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *Int16) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *Int16) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *Int16) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *Int16) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *Int16) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *Int16) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (i *Int16) CheckedMarshal(dst []byte) ([]byte, bool) { size := i.SizeBytes() if size > len(dst) { return dst, false } gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:], true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. 
func (i *Int16) CheckedUnmarshal(src []byte) ([]byte, bool) { size := i.SizeBytes() if size > len(src) { return src, false } gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:], true } // CopyInt16SliceIn copies in a slice of int16 objects from the task's memory. func CopyInt16SliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []int16) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Int16)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyInt16SliceOut copies a slice of int16 objects to the task's memory. func CopyInt16SliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []int16) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*Int16)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeInt16Slice is like Int16.MarshalUnsafe, but for a []Int16. func MarshalUnsafeInt16Slice(src []Int16, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*Int16)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeInt16Slice is like Int16.UnmarshalUnsafe, but for a []Int16. func UnmarshalUnsafeInt16Slice(dst []Int16, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*Int16)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (i *Int32) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *Int32) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*i)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *Int32) UnmarshalBytes(src []byte) []byte { *i = Int32(int32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *Int32) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *Int32) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (i *Int32) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *Int32) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *Int32) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *Int32) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *Int32) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *Int32) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (i *Int32) CheckedMarshal(dst []byte) ([]byte, bool) { size := i.SizeBytes() if size > len(dst) { return dst, false } gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:], true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (i *Int32) CheckedUnmarshal(src []byte) ([]byte, bool) { size := i.SizeBytes() if size > len(src) { return src, false } gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:], true } // CopyInt32SliceIn copies in a slice of int32 objects from the task's memory. func CopyInt32SliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []int32) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Int32)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyInt32SliceOut copies a slice of int32 objects to the task's memory. func CopyInt32SliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []int32) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*Int32)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeInt32Slice is like Int32.MarshalUnsafe, but for a []Int32. func MarshalUnsafeInt32Slice(src []Int32, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*Int32)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeInt32Slice is like Int32.UnmarshalUnsafe, but for a []Int32. func UnmarshalUnsafeInt32Slice(dst []Int32, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*Int32)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (i *Int64) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *Int64) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(*i)) return dst[8:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *Int64) UnmarshalBytes(src []byte) []byte { *i = Int64(int64(hostarch.ByteOrder.Uint64(src[:8]))) return src[8:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *Int64) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *Int64) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *Int64) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *Int64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. 
func (i *Int64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *Int64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (i *Int64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *Int64) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (i *Int64) CheckedMarshal(dst []byte) ([]byte, bool) { size := i.SizeBytes() if size > len(dst) { return dst, false } gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:], true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (i *Int64) CheckedUnmarshal(src []byte) ([]byte, bool) { size := i.SizeBytes() if size > len(src) { return src, false } gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:], true } // CopyInt64SliceIn copies in a slice of int64 objects from the task's memory. func CopyInt64SliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []int64) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Int64)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyInt64SliceOut copies a slice of int64 objects to the task's memory. func CopyInt64SliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []int64) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*Int64)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeInt64Slice is like Int64.MarshalUnsafe, but for a []Int64. func MarshalUnsafeInt64Slice(src []Int64, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*Int64)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeInt64Slice is like Int64.UnmarshalUnsafe, but for a []Int64. func UnmarshalUnsafeInt64Slice(dst []Int64, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*Int64)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (i *Int8) SizeBytes() int { return 1 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (i *Int8) MarshalBytes(dst []byte) []byte { dst[0] = byte(*i) return dst[1:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (i *Int8) UnmarshalBytes(src []byte) []byte { *i = Int8(int8(src[0])) return src[1:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (i *Int8) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (i *Int8) MarshalUnsafe(dst []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (i *Int8) UnmarshalUnsafe(src []byte) []byte { size := i.SizeBytes() gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (i *Int8) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (i *Int8) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyOutN(cc, addr, i.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (i *Int8) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. 
func (i *Int8) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return i.CopyInN(cc, addr, i.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (i *Int8) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(i))) hdr.Len = i.SizeBytes() hdr.Cap = i.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that i // must live until the use above. runtime.KeepAlive(i) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (i *Int8) CheckedMarshal(dst []byte) ([]byte, bool) { size := i.SizeBytes() if size > len(dst) { return dst, false } gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(i), uintptr(size)) return dst[size:], true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (i *Int8) CheckedUnmarshal(src []byte) ([]byte, bool) { size := i.SizeBytes() if size > len(src) { return src, false } gohacks.Memmove(unsafe.Pointer(i), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:], true } // CopyInt8SliceIn copies in a slice of int8 objects from the task's memory. func CopyInt8SliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []int8) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Int8)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyInt8SliceOut copies a slice of int8 objects to the task's memory. func CopyInt8SliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []int8) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*Int8)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeInt8Slice is like Int8.MarshalUnsafe, but for a []Int8. func MarshalUnsafeInt8Slice(src []Int8, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*Int8)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeInt8Slice is like Int8.UnmarshalUnsafe, but for a []Int8. 
func UnmarshalUnsafeInt8Slice(dst []Int8, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*Int8)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (u *Uint16) SizeBytes() int { return 2 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *Uint16) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(*u)) return dst[2:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *Uint16) UnmarshalBytes(src []byte) []byte { *u = Uint16(uint16(hostarch.ByteOrder.Uint16(src[:2]))) return src[2:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *Uint16) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *Uint16) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *Uint16) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *Uint16) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *Uint16) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *Uint16) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *Uint16) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *Uint16) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. 
return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (u *Uint16) CheckedMarshal(dst []byte) ([]byte, bool) { size := u.SizeBytes() if size > len(dst) { return dst, false } gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:], true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (u *Uint16) CheckedUnmarshal(src []byte) ([]byte, bool) { size := u.SizeBytes() if size > len(src) { return src, false } gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:], true } // CopyUint16SliceIn copies in a slice of uint16 objects from the task's memory. func CopyUint16SliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []uint16) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Uint16)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyUint16SliceOut copies a slice of uint16 objects to the task's memory. func CopyUint16SliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []uint16) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*Uint16)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeUint16Slice is like Uint16.MarshalUnsafe, but for a []Uint16. func MarshalUnsafeUint16Slice(src []Uint16, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*Uint16)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeUint16Slice is like Uint16.UnmarshalUnsafe, but for a []Uint16. func UnmarshalUnsafeUint16Slice(dst []Uint16, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*Uint16)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (u *Uint32) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *Uint32) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*u)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *Uint32) UnmarshalBytes(src []byte) []byte { *u = Uint32(uint32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. 
//go:nosplit func (u *Uint32) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *Uint32) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *Uint32) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *Uint32) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *Uint32) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *Uint32) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *Uint32) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *Uint32) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (u *Uint32) CheckedMarshal(dst []byte) ([]byte, bool) { size := u.SizeBytes() if size > len(dst) { return dst, false } gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:], true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (u *Uint32) CheckedUnmarshal(src []byte) ([]byte, bool) { size := u.SizeBytes() if size > len(src) { return src, false } gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:], true } // CopyUint32SliceIn copies in a slice of uint32 objects from the task's memory. 
func CopyUint32SliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []uint32) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Uint32)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyUint32SliceOut copies a slice of uint32 objects to the task's memory. func CopyUint32SliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []uint32) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*Uint32)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeUint32Slice is like Uint32.MarshalUnsafe, but for a []Uint32. func MarshalUnsafeUint32Slice(src []Uint32, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*Uint32)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeUint32Slice is like Uint32.UnmarshalUnsafe, but for a []Uint32. func UnmarshalUnsafeUint32Slice(dst []Uint32, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*Uint32)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (u *Uint64) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *Uint64) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(*u)) return dst[8:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *Uint64) UnmarshalBytes(src []byte) []byte { *u = Uint64(uint64(hostarch.ByteOrder.Uint64(src[:8]))) return src[8:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *Uint64) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *Uint64) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *Uint64) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
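// Editor's note (illustrative sketch, not part of the generated code): the
// *Uint32Slice helpers above convert between []Uint32 and raw bytes without
// going through task memory. A hypothetical in-package caller:
//
//	vals := []Uint32{1, 2, 3}
//	buf := make([]byte, len(vals)*(*Uint32)(nil).SizeBytes())
//	_ = MarshalUnsafeUint32Slice(vals, buf)  // buf now holds the 12 serialized bytes.
//	out := make([]Uint32, len(vals))
//	_ = UnmarshalUnsafeUint32Slice(out, buf) // out now equals vals.
//
// CopyUint32SliceIn/CopyUint32SliceOut perform the same conversion but copy
// the bytes from/to an address in the task's memory via a marshal.CopyContext.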
func (u *Uint64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *Uint64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *Uint64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *Uint64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *Uint64) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (u *Uint64) CheckedMarshal(dst []byte) ([]byte, bool) { size := u.SizeBytes() if size > len(dst) { return dst, false } gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:], true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (u *Uint64) CheckedUnmarshal(src []byte) ([]byte, bool) { size := u.SizeBytes() if size > len(src) { return src, false } gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:], true } // CopyUint64SliceIn copies in a slice of uint64 objects from the task's memory. func CopyUint64SliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []uint64) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Uint64)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. 
return length, err } // CopyUint64SliceOut copies a slice of uint64 objects to the task's memory. func CopyUint64SliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []uint64) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*Uint64)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeUint64Slice is like Uint64.MarshalUnsafe, but for a []Uint64. func MarshalUnsafeUint64Slice(src []Uint64, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*Uint64)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeUint64Slice is like Uint64.UnmarshalUnsafe, but for a []Uint64. func UnmarshalUnsafeUint64Slice(dst []Uint64, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*Uint64)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (u *Uint8) SizeBytes() int { return 1 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *Uint8) MarshalBytes(dst []byte) []byte { dst[0] = byte(*u) return dst[1:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *Uint8) UnmarshalBytes(src []byte) []byte { *u = Uint8(uint8(src[0])) return src[1:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *Uint8) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *Uint8) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *Uint8) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *Uint8) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *Uint8) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (u *Uint8) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *Uint8) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *Uint8) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal. func (u *Uint8) CheckedMarshal(dst []byte) ([]byte, bool) { size := u.SizeBytes() if size > len(dst) { return dst, false } gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:], true } // CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal. func (u *Uint8) CheckedUnmarshal(src []byte) ([]byte, bool) { size := u.SizeBytes() if size > len(src) { return src, false } gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:], true } // CopyUint8SliceIn copies in a slice of uint8 objects from the task's memory. func CopyUint8SliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []uint8) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*Uint8)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyUint8SliceOut copies a slice of uint8 objects to the task's memory. func CopyUint8SliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []uint8) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*Uint8)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. 
return length, err } // MarshalUnsafeUint8Slice is like Uint8.MarshalUnsafe, but for a []Uint8. func MarshalUnsafeUint8Slice(src []Uint8, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*Uint8)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeUint8Slice is like Uint8.UnmarshalUnsafe, but for a []Uint8. func UnmarshalUnsafeUint8Slice(dst []Uint8, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*Uint8)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } golang-gvisor-gvisor-0.0~20240729.0/pkg/marshal/primitive/primitive_state_autogen.go000066400000000000000000000000731465435605700303440ustar00rootroot00000000000000// automatically generated by stateify. package primitive golang-gvisor-gvisor-0.0~20240729.0/pkg/marshal/util.go000066400000000000000000000023631465435605700223630ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package marshal // Marshal returns the serialized contents of m in a newly allocated // byte slice. func Marshal(m Marshallable) []byte { buf := make([]byte, m.SizeBytes()) m.MarshalUnsafe(buf) return buf } // MarshalAll returns the serialized contents of all ms in a newly allocated // byte slice. func MarshalAll(ms []Marshallable) []byte { buf := make([]byte, TotalSize(ms)) var written int for _, m := range ms { m.MarshalUnsafe(buf[written:]) written += m.SizeBytes() } return buf } // TotalSize returns the total size of all ms. func TotalSize(ms []Marshallable) int { var size int for _, m := range ms { size += m.SizeBytes() } return size } golang-gvisor-gvisor-0.0~20240729.0/pkg/memutil/000077500000000000000000000000001465435605700211005ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/memutil/memfd_linux_unsafe.go000066400000000000000000000022271465435605700253020ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package memutil import ( "fmt" "unsafe" "golang.org/x/sys/unix" ) // CreateMemFD creates a memfd file and returns the fd. 
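//
// Editor's note, a sketch of a hypothetical caller: a fresh memfd has size
// zero, so it is typically sized with ftruncate before being mapped, for
// example together with MapSlice from this package:
//
//	fd, err := CreateMemFD("example", 0)
//	if err != nil {
//		return err
//	}
//	if err := unix.Ftruncate(fd, 4096); err != nil {
//		return err
//	}
//	mem, err := MapSlice(0, 4096, unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, uintptr(fd), 0)
//	if err != nil {
//		return err
//	}
//	defer UnmapSlice(mem)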
func CreateMemFD(name string, flags int) (int, error) { p, err := unix.BytePtrFromString(name) if err != nil { return -1, err } fd, _, e := unix.Syscall(unix.SYS_MEMFD_CREATE, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) if e != 0 { if e == unix.ENOSYS { return -1, fmt.Errorf("memfd_create(2) is not implemented. Check that you have Linux 3.17 or higher") } return -1, e } return int(fd), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/memutil/memutil_linux_unsafe_state_autogen.go000066400000000000000000000001331465435605700306020ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux // +build linux package memutil golang-gvisor-gvisor-0.0~20240729.0/pkg/memutil/memutil_state_autogen.go000066400000000000000000000001351465435605700260240ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package memutil golang-gvisor-gvisor-0.0~20240729.0/pkg/memutil/memutil_unsafe.go000066400000000000000000000026151465435605700244500ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package memutil provides utilities for working with shared memory files. package memutil import ( "reflect" "unsafe" "golang.org/x/sys/unix" ) // MapSlice is like MapFile, but returns a slice instead of a uintptr. func MapSlice(addr, size, prot, flags, fd, offset uintptr) ([]byte, error) { addr, err := MapFile(addr, size, prot, flags, fd, offset) if err != nil { return nil, err } var slice []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) hdr.Data = addr hdr.Len = int(size) hdr.Cap = int(size) return slice, nil } // UnmapSlice unmaps a mapping returned by MapSlice. func UnmapSlice(slice []byte) error { hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) _, _, err := unix.RawSyscall6(unix.SYS_MUNMAP, uintptr(unsafe.Pointer(hdr.Data)), uintptr(hdr.Cap), 0, 0, 0, 0) return err } golang-gvisor-gvisor-0.0~20240729.0/pkg/memutil/memutil_unsafe_state_autogen.go000066400000000000000000000000711465435605700273640ustar00rootroot00000000000000// automatically generated by stateify. package memutil golang-gvisor-gvisor-0.0~20240729.0/pkg/memutil/mmap.go000066400000000000000000000017231465435605700223640ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package memutil import ( "golang.org/x/sys/unix" ) // MapFile returns a memory mapping configured by the given options as per // mmap(2). 
func MapFile(addr, size, prot, flags, fd, offset uintptr) (uintptr, error) { m, _, e := unix.RawSyscall6(unix.SYS_MMAP, addr, size, prot, flags, fd, offset) if e != 0 { return 0, e } return m, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/metric/000077500000000000000000000000001465435605700207075ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/metric/condmetric.go000066400000000000000000000143141465435605700233700ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package metric import ( "fmt" pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" ) // FakeUint64Metric is a type that implements all the methods of a Uint64Metric // as a no-op. type FakeUint64Metric struct{} // FakeDistributionMetric is a type that implements all the methods of a // DistributionMetric as a no-op. type FakeDistributionMetric struct{} // FakeTimerMetric is a type that implements all the methods of a TimerMetric // as a no-op. type FakeTimerMetric struct{} // FakeTimedOperation is a type that implements all the methods of a // TimedOperation as a no-op. type FakeTimedOperation struct{} // Value from a FakeUint64Metric always returns a meaningless value. // //go:nosplit func (m *FakeUint64Metric) Value(fieldValues ...*FieldValue) uint64 { return 0 } // Increment on a FakeUint64Metric does nothing. // //go:nosplit func (m *FakeUint64Metric) Increment(fieldValues ...*FieldValue) {} // Decrement on a FakeUint64Metric does nothing. // //go:nosplit func (m *FakeUint64Metric) Decrement(fieldValues ...*FieldValue) {} // IncrementBy on a FakeUint64Metric does nothing. // //go:nosplit func (m *FakeUint64Metric) IncrementBy(v uint64, fieldValues ...*FieldValue) {} // Set on a FakeUint64Metric does nothing. // //go:nosplit func (m *FakeUint64Metric) Set(v uint64, fieldValues ...*FieldValue) {} // AddSample on a FakeUint64Metric does nothing. // //go:nosplit func (d *FakeDistributionMetric) AddSample(sample int64, fields ...*FieldValue) {} // Start on a FakeUint64Metric returns a FakeTimedOperation struct, which does // nothing and does not keep the time. // //go:nosplit func (t *FakeTimerMetric) Start(fields ...*FieldValue) FakeTimedOperation { return FakeTimedOperation{} } // Finish on a FakeTimedOperation does nothing. // //go:nosplit func (o FakeTimedOperation) Finish(extraFields ...*FieldValue) {} // FakeMetricBuilder is a type used to produce conditionally compiled metrics. // Methods of this struct produce fake, inactive metrics. type FakeMetricBuilder struct{} // NewUint64Metric creates a fake Uint64 metric. func (b *FakeMetricBuilder) NewUint64Metric(name string, metadata Uint64Metadata) (*FakeUint64Metric, error) { return &FakeUint64Metric{}, nil } // MustCreateNewUint64Metric creates a fake Uint64 metric. func (b *FakeMetricBuilder) MustCreateNewUint64Metric(name string, metadata Uint64Metadata) *FakeUint64Metric { return &FakeUint64Metric{} } // NewDistributionMetric creates a fake distribution metric. 
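// Editor's note (sketch only, not code from this repository): FakeMetricBuilder
// and RealMetricBuilder (below) expose the same method names, and the fake and
// real Uint64 metric types share the same call surface (Increment, IncrementBy,
// Set, Value), so a package can choose between them at build time. A
// hypothetical pair of files, using a made-up build tag:
//
//	//go:build exampleprofiling
//	var builder metric.RealMetricBuilder
//
//	//go:build !exampleprofiling
//	var builder metric.FakeMetricBuilder
//
// Call sites such as builder.MustCreateNewUint64Metric(...) then compile in
// both configurations, and the fake methods compile down to no-ops in the
// non-profiling build.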
func (b *FakeMetricBuilder) NewDistributionMetric(name string, sync bool, bucketer Bucketer, unit pb.MetricMetadata_Units, description string, fields ...Field) (*FakeDistributionMetric, error) { return &FakeDistributionMetric{}, nil } // MustCreateNewDistributionMetric creates a fake distribution metric. func (b *FakeMetricBuilder) MustCreateNewDistributionMetric(name string, sync bool, bucketer Bucketer, unit pb.MetricMetadata_Units, description string, fields ...Field) *FakeDistributionMetric { return &FakeDistributionMetric{} } // NewTimerMetric creates a fake timer metric. func (b *FakeMetricBuilder) NewTimerMetric(name string, nanoBucketer Bucketer, description string, fields ...Field) (*FakeTimerMetric, error) { return &FakeTimerMetric{}, nil } // MustCreateNewTimerMetric creates a fake timer metric. func (b *FakeMetricBuilder) MustCreateNewTimerMetric(name string, nanoBucketer Bucketer, description string, fields ...Field) *FakeTimerMetric { return &FakeTimerMetric{} } // RealMetricBuilder is a type used to produce conditionally compiled metrics. // Methods of this struct produce real active metrics. type RealMetricBuilder struct{} // NewUint64Metric calls the generic metric.NewUint64Metric to produce a real // Uint64 metric. func (b *RealMetricBuilder) NewUint64Metric(name string, metadata Uint64Metadata) (*Uint64Metric, error) { m, err := NewUint64Metric(name, metadata) if err != nil { return m, err } definedProfilingMetrics = append(definedProfilingMetrics, m.name) return m, err } // MustCreateNewUint64Metric creates a real Uint64 metric or panics if unable to // do so. func (b *RealMetricBuilder) MustCreateNewUint64Metric(name string, metadata Uint64Metadata) *Uint64Metric { m, err := b.NewUint64Metric(name, metadata) if err != nil { panic(fmt.Sprintf("Unable to create metric %q: %s", name, err)) } return m } // NewDistributionMetric calls the generic metric.NewDistributionMetric to // produce a real distribution metric. func (b *RealMetricBuilder) NewDistributionMetric(name string, sync bool, bucketer Bucketer, unit pb.MetricMetadata_Units, description string, fields ...Field) (*DistributionMetric, error) { return NewDistributionMetric(name, sync, bucketer, unit, description, fields...) } // MustCreateNewDistributionMetric creates a real distribution metric or panics // if unable to do so. func (b *RealMetricBuilder) MustCreateNewDistributionMetric(name string, sync bool, bucketer Bucketer, unit pb.MetricMetadata_Units, description string, fields ...Field) *DistributionMetric { return MustCreateNewDistributionMetric(name, sync, bucketer, unit, description, fields...) } // NewTimerMetric calls the generic metric.NewTimerMetric to produce a real timer // metric. func (b *RealMetricBuilder) NewTimerMetric(name string, nanoBucketer Bucketer, description string, fields ...Field) (*TimerMetric, error) { return NewTimerMetric(name, nanoBucketer, description, fields...) } // MustCreateNewTimerMetric creates a real timer metric or panics if unable to // do so. func (b *RealMetricBuilder) MustCreateNewTimerMetric(name string, nanoBucketer Bucketer, description string, fields ...Field) *TimerMetric { return MustCreateNewTimerMetric(name, nanoBucketer, description, fields...) } golang-gvisor-gvisor-0.0~20240729.0/pkg/metric/metric.go000066400000000000000000001622751465435605700225360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package metric provides primitives for collecting metrics. package metric import ( "errors" "fmt" "math" re "regexp" "sort" "strings" "time" "google.golang.org/protobuf/types/known/timestamppb" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" "gvisor.dev/gvisor/pkg/prometheus" "gvisor.dev/gvisor/pkg/sync" ) var ( // ErrNameInUse indicates that another metric is already defined for // the given name. ErrNameInUse = errors.New("metric name already in use") // ErrInitializationDone indicates that the caller tried to create a // new metric after initialization. ErrInitializationDone = errors.New("metric cannot be created after initialization is complete") // ErrFieldValueContainsIllegalChar indicates that the value of a metric // field had an invalid character in it. ErrFieldValueContainsIllegalChar = errors.New("metric field value contains illegal character") // ErrFieldHasNoAllowedValues indicates that the field needs to define some // allowed values to be a valid and useful field. ErrFieldHasNoAllowedValues = errors.New("metric field does not define any allowed values") // ErrTooManyFieldCombinations indicates that the number of unique // combinations of fields is too large to support. ErrTooManyFieldCombinations = errors.New("metric has too many combinations of allowed field values") ) // Weirdness metric type constants. var ( WeirdnessTypeTimeFallback = FieldValue{"time_fallback"} WeirdnessTypePartialResult = FieldValue{"partial_result"} WeirdnessTypeVsyscallCount = FieldValue{"vsyscall_count"} WeirdnessTypeWatchdogStuckStartup = FieldValue{"watchdog_stuck_startup"} WeirdnessTypeWatchdogStuckTasks = FieldValue{"watchdog_stuck_tasks"} ) // Suspicious operations metric type constants. var ( SuspiciousOperationsTypeOpenedWriteExecuteFile = FieldValue{"opened_write_execute_file"} ) // List of global metrics that are used in multiple places. var ( // WeirdnessMetric is a metric with fields created to track the number // of weird occurrences such as time fallback, partial_result, vsyscall // count, watchdog startup timeouts and stuck tasks. WeirdnessMetric = MustCreateNewUint64Metric( "/weirdness", Uint64Metadata{ Cumulative: true, Sync: true, Description: "Increment for weird occurrences of problems such as time fallback, partial result, vsyscalls invoked in the sandbox, watchdog startup timeouts and stuck tasks.", Fields: []Field{ NewField("weirdness_type", &WeirdnessTypeTimeFallback, &WeirdnessTypePartialResult, &WeirdnessTypeVsyscallCount, &WeirdnessTypeWatchdogStuckStartup, &WeirdnessTypeWatchdogStuckTasks, ), }, }) // SuspiciousOperationsMetric is a metric with fields created to detect // operations such as opening an executable file to write from a gofer. 
SuspiciousOperationsMetric = MustCreateNewUint64Metric( "/suspicious_operations", Uint64Metadata{ Cumulative: true, Sync: true, Description: "Increment for suspicious operations such as opening an executable file to write from a gofer.", Fields: []Field{ NewField("operation_type", &SuspiciousOperationsTypeOpenedWriteExecuteFile, ), }, }) ) // InitStage is the name of a Sentry initialization stage. type InitStage string // List of all Sentry initialization stages. var ( InitRestoreConfig InitStage = "restore_config" InitExecConfig InitStage = "exec_config" InitRestore InitStage = "restore" InitCreateProcess InitStage = "create_process" InitTaskStart InitStage = "task_start" // allStages is the list of allowed stages. allStages = []InitStage{ InitRestoreConfig, InitExecConfig, InitRestore, InitCreateProcess, InitTaskStart, } ) // Uint64Metric encapsulates a uint64 that represents some kind of metric to be // monitored. // // Metrics are not saved across save/restore and thus reset to zero on restore. type Uint64Metric struct { name string // fields is the map of field-value combination index keys to Uint64 counters. fields []atomicbitops.Uint64 // fieldMapper is used to generate index keys for the fields array (above) // based on field value combinations, and vice-versa. fieldMapper fieldMapper } var ( // initialized indicates that all metrics are registered. allMetrics is // immutable once initialized is true. initialized atomicbitops.Bool // allMetrics are the registered metrics. allMetrics = makeMetricSet() ) // Initialize sends a metric registration event over the event channel. // // Precondition: // - All metrics are registered. // - Initialize/Disable has not been called. func Initialize() error { if initialized.Load() { return errors.New("metric.Initialize called after metric.Initialize or metric.Disable") } m := pb.MetricRegistration{} for _, v := range allMetrics.uint64Metrics { m.Metrics = append(m.Metrics, v.metadata) } for _, v := range allMetrics.distributionMetrics { m.Metrics = append(m.Metrics, v.metadata) } m.Stages = make([]string, 0, len(allStages)) for _, s := range allStages { m.Stages = append(m.Stages, string(s)) } allMetrics.registration = &m if err := eventchannel.Emit(&m); err != nil { return fmt.Errorf("unable to emit metric initialize event: %w", err) } if initialized.Swap(true) { return errors.New("raced with another call to metric.Initialize or metric.Disable") } return nil } // ErrNotYetInitialized is returned by GetMetricRegistration if metrics are not yet initialized. var ErrNotYetInitialized = errors.New("metrics are not yet initialized") // GetMetricRegistration returns the metric registration data for all registered metrics. // Must be called after Initialize(). // Returns ErrNotYetInitialized if metrics are not yet initialized. func GetMetricRegistration() (*pb.MetricRegistration, error) { if !initialized.Load() { return nil, ErrNotYetInitialized } if allMetrics.registration == nil { return nil, errors.New("metrics are disabled") } return allMetrics.registration, nil } // Disable sends an empty metric registration event over the event channel, // disabling metric collection. // // Precondition: // - All metrics are registered. // - Initialize/Disable has not been called. 
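// Editor's note (hypothetical caller, sketch only): exactly one of Initialize
// or Disable is expected to run, after all metrics have been registered:
//
//	if metricsEnabled {
//		if err := metric.Initialize(); err != nil {
//			return err
//		}
//	} else if err := metric.Disable(); err != nil {
//		return err
//	}
//
// Both calls flip the same initialization flag, which is why calling either
// one twice, or calling both, returns an error.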
func Disable() error { if initialized.Load() { return errors.New("metric.Disable called after metric.Initialize or metric.Disable") } m := pb.MetricRegistration{} if err := eventchannel.Emit(&m); err != nil { return fmt.Errorf("unable to emit empty metric registration event (metrics disabled): %w", err) } if initialized.Swap(true) { return errors.New("raced with another call to metric.Initialize or metric.Disable") } return nil } // Uint64Metadata is the metadata for a uint64 metric. type Uint64Metadata struct { Cumulative bool Sync bool Unit pb.MetricMetadata_Units Description string Fields []Field } type customUint64Metric struct { // metadata describes the metric. It is immutable. metadata *pb.MetricMetadata // prometheusMetric describes the metric in Prometheus format. It is immutable. prometheusMetric *prometheus.Metric // fields is the set of fields of the metric. fields []Field // value returns the current value of the metric for the given set of // fields. It takes a variadic number of field values as argument. value func(fieldValues ...*FieldValue) uint64 // forEachNonZero calls the given function on each possible field value of // the metric where the metric's value is non-zero. // The passed-in function should not allocate new memory, and may not save // or modify `fields` directly, as the slice memory is reused across calls. // `forEachNonZero` does not guarantee that it will be called on a // consistent snapshot of this metric's values. // `forEachNonZero` may be nil. forEachNonZero func(f func(fields []*FieldValue, val uint64)) } // FieldValue is a string that can be used as a value for a Field. // It must be referred to by address when the Field is created and when its // metric value is modified. This ensures that the same FieldValue reference // is used, which in turn enables the metric code to use the address of a // FieldValue as comparison operator, rather than doing string comparisons. type FieldValue struct { Value string } // fieldMapperMapThreshold is the number of field values after which we switch // to using map lookups when looking up field values. // This value was determined using benchmarks to see which is fastest. const fieldMapperMapThreshold = 48 // Field contains the field name and allowed values for the metric which is // used in registration of the metric. type Field struct { // name is the metric field name. name string // values is the list of values for the field. // `values` is always populated but not always used for lookup. It depends // on the number of allowed field values. `values` is used for lookups on // fields with small numbers of field values. values []*FieldValue // valuesPtrMap is a map version of `values`. For each item in `values`, // its pointer is mapped to its index within `values`. // `valuesPtrMap` is used for fields with large numbers of possible values. // For fields with small numbers of field values, it is nil. // This map allows doing faster string matching than a normal string map, // as it avoids the string hashing step that normal string maps need to do. valuesPtrMap map[*FieldValue]int } // toProto returns the proto definition of this field, for use in metric // metadata. func (f Field) toProto() *pb.MetricMetadata_Field { allowedValues := make([]string, len(f.values)) for i, v := range f.values { allowedValues[i] = v.Value } return &pb.MetricMetadata_Field{ FieldName: f.name, AllowedValues: allowedValues, } } // NewField defines a new Field that can be used to break down a metric. 
// The set of allowedValues must be unique strings wrapped with `FieldValue`. // The *same* `FieldValue` pointers must be used during metric modifications. // In practice, in most cases, this means you should declare these // `FieldValue`s as package-level `var`s, and always use the address of these // package-level `var`s during metric modifications. func NewField(name string, allowedValues ...*FieldValue) Field { // Verify that all string values have a unique value. strMap := make(map[string]bool, len(allowedValues)) for _, v := range allowedValues { if strMap[v.Value] { panic(fmt.Sprintf("found duplicate field value: %q", v)) } strMap[v.Value] = true } if useMap := len(allowedValues) > fieldMapperMapThreshold; !useMap { return Field{ name: name, values: allowedValues, } } valuesPtrMap := make(map[*FieldValue]int, len(allowedValues)) for i, v := range allowedValues { valuesPtrMap[v] = i } return Field{ name: name, values: allowedValues, valuesPtrMap: valuesPtrMap, } } // fieldMapper provides multi-dimensional fields to a single unique integer key type fieldMapper struct { // fields is a list of Field objects, which importantly include individual // Field names which are used to perform the keyToMultiField function; and // allowedValues for each field type which are used to perform the lookup // function. fields []Field // numFieldCombinations is the number of unique keys for all possible field // combinations. numFieldCombinations int } // newFieldMapper returns a new fieldMapper for the given set of fields. func newFieldMapper(fields ...Field) (fieldMapper, error) { numFieldCombinations := 1 for _, f := range fields { // Disallow fields with no possible values. We could also ignore them // instead, but passing in a no-allowed-values field is probably a mistake. if len(f.values) == 0 { return fieldMapper{nil, 0}, ErrFieldHasNoAllowedValues } numFieldCombinations *= len(f.values) // Sanity check, could be useful in case someone dynamically generates too // many fields accidentally. if numFieldCombinations > math.MaxUint32 || numFieldCombinations < 0 { return fieldMapper{nil, 0}, ErrTooManyFieldCombinations } } return fieldMapper{ fields: fields, numFieldCombinations: numFieldCombinations, }, nil } // lookupSingle looks up a single key for a single field within fieldMapper. // It is used internally within lookupConcat. // It returns the updated `idx` and `remainingCombinationBucket` values. // +checkescape:all // //go:nosplit func (m fieldMapper) lookupSingle(fieldIndex int, fieldValue *FieldValue, idx, remainingCombinationBucket int) (int, int) { field := m.fields[fieldIndex] numValues := len(field.values) // Are we doing a linear search? if field.valuesPtrMap == nil { // We scan by pointers only. This means the caller must pass the same // FieldValue pointer as the one used in `NewField`. for valIdx, allowedVal := range field.values { if fieldValue == allowedVal { remainingCombinationBucket /= numValues idx += remainingCombinationBucket * valIdx return idx, remainingCombinationBucket } } panic("invalid field value or did not reuse the same FieldValue pointer as passed in NewField") } // Use map lookup instead. // Match using FieldValue pointer. // This avoids the string hashing step that string maps otherwise do. 
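// Editor's note: both the linear scan above and the map lookup below compare
// *FieldValue pointers, never strings. This only works because callers are
// required to pass the exact same *FieldValue that was registered via
// NewField. Sketch of a hypothetical caller:
//
//	var statusOK = metric.FieldValue{Value: "ok"}
//	var requests = metric.MustCreateNewUint64Metric("/example/requests", metric.Uint64Metadata{
//		Cumulative:  true,
//		Description: "example counter",
//		Fields:      []metric.Field{metric.NewField("status", &statusOK)},
//	})
//	...
//	requests.Increment(&statusOK) // same pointer as registered above
//
// Passing a freshly constructed &metric.FieldValue{Value: "ok"} instead would
// not be found, and the lookup would panic.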
valIdx, found := field.valuesPtrMap[fieldValue] if found { remainingCombinationBucket /= numValues idx += remainingCombinationBucket * valIdx return idx, remainingCombinationBucket } panic("invalid field value or did not reuse the same FieldValue pointer as passed in NewField") } // lookupConcat looks up a key within the fieldMapper where the fields are // the concatenation of two list of fields. // The returned key is an index that can be used to access to map created by // makeMap(). // This *must* be called with the correct number of fields, or it will panic. // +checkescape:all // //go:nosplit func (m fieldMapper) lookupConcat(fields1, fields2 []*FieldValue) int { if (len(fields1) + len(fields2)) != len(m.fields) { panic("invalid field lookup depth") } idx := 0 remainingCombinationBucket := m.numFieldCombinations for i, val := range fields1 { idx, remainingCombinationBucket = m.lookupSingle(i, val, idx, remainingCombinationBucket) } numFields1 := len(fields1) for i, val := range fields2 { idx, remainingCombinationBucket = m.lookupSingle(i+numFields1, val, idx, remainingCombinationBucket) } return idx } // lookup looks up a key within the fieldMapper. // The returned key is an index that can be used to access to map created by // makeMap(). // This *must* be called with the correct number of fields, or it will panic. // +checkescape:all // //go:nosplit func (m fieldMapper) lookup(fields ...*FieldValue) int { return m.lookupConcat(fields, nil) } // numKeys returns the total number of key-to-field-combinations mappings // defined by the fieldMapper. // //go:nosplit func (m fieldMapper) numKeys() int { return m.numFieldCombinations } // makeDistributionSampleMap creates a two dimensional array, where: // - The first level corresponds to unique field value combinations and is // accessed using index "keys" made by fieldMapper. // - The second level corresponds to buckets within a metric. The number of // buckets is specified by numBuckets. func (m fieldMapper) makeDistributionSampleMap(numBuckets int) [][]atomicbitops.Uint64 { samples := make([][]atomicbitops.Uint64, m.numKeys()) for i := range samples { samples[i] = make([]atomicbitops.Uint64, numBuckets) } return samples } // keyToMultiField is the reverse of lookup/lookupConcat. The returned list of // field values corresponds to the same order of fields that were passed in to // newFieldMapper. func (m fieldMapper) keyToMultiField(key int) []string { depth := len(m.fields) if depth == 0 && key == 0 { return nil } fieldValues := make([]string, depth) remainingCombinationBucket := m.numFieldCombinations for i := 0; i < depth; i++ { remainingCombinationBucket /= len(m.fields[i].values) fieldValues[i] = m.fields[i].values[key/remainingCombinationBucket].Value key = key % remainingCombinationBucket } return fieldValues } // keyToMultiFieldInPlace does the operation described in `keyToMultiField` // but modifies `fieldValues` in-place. It must already be of size // `len(m.fields)`. // //go:nosplit func (m fieldMapper) keyToMultiFieldInPlace(key int, fieldValues []*FieldValue) { if len(m.fields) == 0 { return } depth := len(m.fields) remainingCombinationBucket := m.numFieldCombinations for i := 0; i < depth; i++ { remainingCombinationBucket /= len(m.fields[i].values) fieldValues[i] = m.fields[i].values[key/remainingCombinationBucket] key = key % remainingCombinationBucket } } // nameToPrometheusName transforms a path-style metric name (/foo/bar) into a Prometheus-style // metric name (foo_bar). 
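// Editor's note on the fieldMapper encoding above (worked example): keys are a
// mixed-radix encoding of the per-field value indices. For two fields with 2
// and 3 allowed values respectively, numFieldCombinations is 6,
// lookup(field0.values[i], field1.values[j]) yields the key i*3 + j, and
// keyToMultiField(5) decodes back to {field0.values[1], field1.values[2]}.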
func nameToPrometheusName(name string) string { return strings.ReplaceAll(strings.TrimPrefix(name, "/"), "/", "_") } var validMetricNameRegexp = re.MustCompile("^(?:/[_\\w]+)+$") // verifyName verifies that the given metric name is a valid path-style metric // name. func verifyName(name string) error { if !strings.HasPrefix(name, "/") { return fmt.Errorf("metric name must start with a '/': %q", name) } if !validMetricNameRegexp.MatchString(name) { return fmt.Errorf("invalid metric name: %q", name) } return nil } // RegisterCustomUint64Metric registers a metric with the given name. // // Register must only be called at init and will return and error if called // after Initialized. // // Preconditions: // - name must be globally unique. // - Initialize/Disable have not been called. // - value is expected to accept exactly len(fields) arguments. func RegisterCustomUint64Metric(name string, metadata Uint64Metadata, value func(...*FieldValue) uint64) error { if initialized.Load() { return ErrInitializationDone } if _, ok := allMetrics.uint64Metrics[name]; ok { return ErrNameInUse } if _, ok := allMetrics.distributionMetrics[name]; ok { return ErrNameInUse } promType := prometheus.TypeGauge if metadata.Cumulative { promType = prometheus.TypeCounter } allMetrics.uint64Metrics[name] = customUint64Metric{ metadata: &pb.MetricMetadata{ Name: name, PrometheusName: nameToPrometheusName(name), Description: metadata.Description, Cumulative: metadata.Cumulative, Sync: metadata.Sync, Type: pb.MetricMetadata_TYPE_UINT64, Units: metadata.Unit, }, prometheusMetric: &prometheus.Metric{ Name: nameToPrometheusName(name), Help: metadata.Description, Type: promType, }, fields: metadata.Fields, value: value, } // Metrics can exist without fields. if l := len(metadata.Fields); l > 1 { return fmt.Errorf("%d fields provided, must be <= 1", l) } for _, field := range metadata.Fields { allMetrics.uint64Metrics[name].metadata.Fields = append(allMetrics.uint64Metrics[name].metadata.Fields, field.toProto()) } return nil } // MustRegisterCustomUint64Metric calls RegisterCustomUint64Metric for metrics // without fields and panics if it returns an error. func MustRegisterCustomUint64Metric(name string, metadata Uint64Metadata, value func(...*FieldValue) uint64) { if err := RegisterCustomUint64Metric(name, metadata, value); err != nil { panic(fmt.Sprintf("Unable to register metric %q: %s", name, err)) } } // NewUint64Metric creates and registers a new cumulative metric with the given // name. // // Metrics must be statically defined (i.e., at init). func NewUint64Metric(name string, metadata Uint64Metadata) (*Uint64Metric, error) { if err := verifyName(name); err != nil { return nil, err } f, err := newFieldMapper(metadata.Fields...) if err != nil { return nil, err } m := Uint64Metric{ name: name, fieldMapper: f, fields: make([]atomicbitops.Uint64, f.numKeys()), } if err := RegisterCustomUint64Metric(name, metadata, m.Value); err != nil { return nil, err } cm := allMetrics.uint64Metrics[name] cm.forEachNonZero = m.forEachNonZero allMetrics.uint64Metrics[name] = cm return &m, nil } // MustCreateNewUint64Metric calls NewUint64Metric and panics if it returns // an error. func MustCreateNewUint64Metric(name string, metadata Uint64Metadata) *Uint64Metric { m, err := NewUint64Metric(name, metadata) if err != nil { panic(fmt.Sprintf("Unable to create metric %q: %s", name, err)) } return m } // Value returns the current value of the metric for the given set of fields. 
// This must be called with the correct number of field values or it will panic. // //go:nosplit func (m *Uint64Metric) Value(fieldValues ...*FieldValue) uint64 { key := m.fieldMapper.lookupConcat(fieldValues, nil) return m.fields[key].Load() } // forEachNonZero iterates over each field combination and calls the given // function whenever this metric's value is not zero. func (m *Uint64Metric) forEachNonZero(f func(fieldValues []*FieldValue, value uint64)) { numCombinations := m.fieldMapper.numKeys() if len(m.fieldMapper.fields) == 0 { // Special-case the "there are no fields" case for speed and to avoid // allocating a slice. if val := m.fields[0].Load(); val != 0 { f(nil, val) } return } var fieldValues []*FieldValue for k := 0; k < numCombinations; k++ { val := m.fields[k].Load() if val == 0 { continue } if fieldValues == nil { fieldValues = make([]*FieldValue, len(m.fieldMapper.fields)) } m.fieldMapper.keyToMultiFieldInPlace(k, fieldValues) f(fieldValues, val) } } // Increment increments the metric by 1. // This must be called with the correct number of field values or it will panic. // //go:nosplit func (m *Uint64Metric) Increment(fieldValues ...*FieldValue) { m.IncrementBy(1, fieldValues...) } // Decrement decrements the metric by 1. // This must be called with the correct number of field values or it will panic. // //go:nosplit func (m *Uint64Metric) Decrement(fieldValues ...*FieldValue) { m.IncrementBy(0xFFFFFFFFFFFFFFFF, fieldValues...) } // IncrementBy increments the metric by v. // It is also possible to use this function to decrement the metric by using // a two's-complement int64 representation of the negative number to add. // This must be called with the correct number of field values or it will panic. // //go:nosplit func (m *Uint64Metric) IncrementBy(v uint64, fieldValues ...*FieldValue) { key := m.fieldMapper.lookupConcat(fieldValues, nil) m.fields[key].Add(v) } // Set sets the metric to v. // This must be called with the correct number of field values or it will panic. // //go:nosplit func (m *Uint64Metric) Set(v uint64, fieldValues ...*FieldValue) { key := m.fieldMapper.lookupConcat(fieldValues, nil) m.fields[key].Store(v) } // Bucketer is an interface to bucket values into finite, distinct buckets. type Bucketer interface { // NumFiniteBuckets is the number of finite buckets in the distribution. // This is only called once and never expected to return a different value. NumFiniteBuckets() int // LowerBound takes the index of a bucket (within [0, NumBuckets()]) and // returns the inclusive lower bound of that bucket. // In other words, the lowest value of `x` for which `BucketIndex(x) == i` // should be `x = LowerBound(i)`. // The upper bound of a bucket is the lower bound of the next bucket. // The last bucket (with `bucketIndex == NumFiniteBuckets()`) is infinite, // i.e. it has no upper bound (but it still has a lower bound). LowerBound(bucketIndex int) int64 // BucketIndex takes a sample and returns the index of the bucket that the // sample should fall into. // Must return either: // - A value within [0, NumBuckets() -1] if the sample falls within a // finite bucket // - NumBuckets() if the sample falls within the last (infinite) bucket // - '-1' if the sample is lower than what any bucket can represent, i.e. // the sample should be in the implicit "underflow" bucket. // This function must be go:nosplit-compatible and have no escapes. 
// +checkescape:all BucketIndex(sample int64) int } // ExponentialBucketer implements Bucketer, with the first bucket starting // with 0 as lowest bound with `Width` width, and each subsequent bucket being // wider by a scaled exponentially-growing series, until `NumFiniteBuckets` // buckets exist. type ExponentialBucketer struct { // numFinitebuckets is the total number of finite buckets in the scheme. numFiniteBuckets int // width is the size of the first (0-th) finite bucket. width float64 // scale is a factor applied uniformly to the exponential growth portion // of the bucket size. scale float64 // growth is the exponential growth factor for finite buckets. // The n-th bucket is `growth` times wider than the (n-1)-th bucket. // Bucket sizes are floored, so `width` and `growth` must be large enough // such that the second bucket is actually wider than the first after // flooring (unless, of course, fixed-width buckets are what's desired). growth float64 // growthLog is math.Log(growth). growthLog float64 // maxSample is the max sample value which can be represented in a finite // bucket. maxSample int64 // lowerbounds is a precomputed set of lower bounds of the buckets. // The "underflow" bucket has no lower bound, so it is not included here. // lowerBounds[0] is the lower bound of the first finite bucket, which is // also the upper bound of the underflow bucket. // lowerBounds[numFiniteBuckets] is the lower bound of the overflow bucket. lowerBounds []int64 } // Minimum/maximum finite buckets for exponential bucketers. const ( exponentialMinBuckets = 1 exponentialMaxBuckets = 100 ) // NewExponentialBucketer returns a new Bucketer with exponential buckets. func NewExponentialBucketer(numFiniteBuckets int, width uint64, scale, growth float64) *ExponentialBucketer { if numFiniteBuckets < exponentialMinBuckets || numFiniteBuckets > exponentialMaxBuckets { panic(fmt.Sprintf("number of finite buckets must be in [%d, %d]", exponentialMinBuckets, exponentialMaxBuckets)) } if scale < 0 || growth < 0 { panic(fmt.Sprintf("scale and growth for exponential buckets must be >0, got scale=%f and growth=%f", scale, growth)) } b := &ExponentialBucketer{ numFiniteBuckets: numFiniteBuckets, width: float64(width), scale: scale, growth: growth, growthLog: math.Log(growth), lowerBounds: make([]int64, numFiniteBuckets+1), } b.lowerBounds[0] = 0 for i := 1; i <= numFiniteBuckets; i++ { b.lowerBounds[i] = int64(b.width*float64(i) + b.scale*math.Pow(b.growth, float64(i-1))) if b.lowerBounds[i] < 0 { panic(fmt.Sprintf("encountered bucket width overflow at bucket %d", i)) } } b.maxSample = b.lowerBounds[numFiniteBuckets] - 1 return b } // NumFiniteBuckets implements Bucketer.NumFiniteBuckets. func (b *ExponentialBucketer) NumFiniteBuckets() int { return int(b.numFiniteBuckets) } // LowerBound implements Bucketer.LowerBound. func (b *ExponentialBucketer) LowerBound(bucketIndex int) int64 { return b.lowerBounds[bucketIndex] } // BucketIndex implements Bucketer.BucketIndex. // +checkescape:all // //go:nosplit func (b *ExponentialBucketer) BucketIndex(sample int64) int { if sample < 0 { return -1 } if sample == 0 { return 0 } if sample > b.maxSample { return b.numFiniteBuckets } // Do a binary search. For the number of buckets we expect to deal with in // this code (a few dozen at most), this may be faster than computing a // logarithm. We can't use recursion because this would violate go:nosplit. 
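// The invariant maintained by the search below is:
//   lowerBounds[lowIndex] <= sample < lowerBounds[highIndex]
// It holds on entry (0 < sample <= maxSample) and each iteration either
// returns the matching bucket or strictly narrows [lowIndex, highIndex],
// so the loop is guaranteed to terminate.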
lowIndex := 0 highIndex := b.numFiniteBuckets for { pivotIndex := (highIndex + lowIndex) >> 1 lowerBound := b.lowerBounds[pivotIndex] if sample < lowerBound { highIndex = pivotIndex continue } upperBound := b.lowerBounds[pivotIndex+1] if sample >= upperBound { lowIndex = pivotIndex continue } return pivotIndex } } // Verify that ExponentialBucketer implements Bucketer. var _ = (Bucketer)((*ExponentialBucketer)(nil)) // DistributionMetric represents a distribution of values in finite buckets. // It also separately keeps track of min/max in order to ascertain whether the // buckets can faithfully represent the range of values encountered in the // distribution. type DistributionMetric struct { // exponentialBucketer is the bucketing scheme used for this metric. // Because we need DistributionMetric.AddSample to be go:nosplit-compatible, // we cannot use an interface reference here, as we would not be able to call // it in AddSample. Instead, we need one field per Bucketer implementation, // and we call whichever one is in use in AddSample. exponentialBucketer *ExponentialBucketer // metadata is the metadata about this metric. It is immutable. metadata *pb.MetricMetadata // prometheusMetric describes the metric in Prometheus format. It is immutable. prometheusMetric *prometheus.Metric // fieldsToKey converts a multi-dimensional fields to a single string to use // as key for `samples`. fieldsToKey fieldMapper // samples is the number of samples that fell within each bucket. // It is mapped by the concatenation of the fields using `fieldsToKey`. // The value is a list of bucket sample counts, with the 0-th being the // "underflow bucket", i.e. the bucket of samples which cannot fall into // any bucket that the bucketer supports. // The i-th value is the number of samples that fell into the bucketer's // (i-1)-th finite bucket. // The last value is the number of samples that fell into the bucketer's // last (i.e. infinite) bucket. samples [][]atomicbitops.Uint64 // statistics is a set of statistics about each distribution. // It is mapped by the concatenation of the fields using `fieldsToKey`. statistics []distributionStatistics } // NewDistributionMetric creates and registers a new distribution metric. func NewDistributionMetric(name string, sync bool, bucketer Bucketer, unit pb.MetricMetadata_Units, description string, fields ...Field) (*DistributionMetric, error) { if err := verifyName(name); err != nil { return nil, err } if initialized.Load() { return nil, ErrInitializationDone } if _, ok := allMetrics.uint64Metrics[name]; ok { return nil, ErrNameInUse } if _, ok := allMetrics.distributionMetrics[name]; ok { return nil, ErrNameInUse } var exponentialBucketer *ExponentialBucketer if expBucketer, ok := bucketer.(*ExponentialBucketer); ok { exponentialBucketer = expBucketer } else { return nil, fmt.Errorf("unsupported bucketer implementation: %T", bucketer) } fieldsToKey, err := newFieldMapper(fields...) 
if err != nil { return nil, err } numFiniteBuckets := bucketer.NumFiniteBuckets() samples := fieldsToKey.makeDistributionSampleMap(numFiniteBuckets + 2) protoFields := make([]*pb.MetricMetadata_Field, len(fields)) for i, f := range fields { protoFields[i] = f.toProto() } lowerBounds := make([]int64, numFiniteBuckets+1) for i := 0; i <= numFiniteBuckets; i++ { lowerBounds[i] = bucketer.LowerBound(i) } allMetrics.distributionMetrics[name] = &DistributionMetric{ exponentialBucketer: exponentialBucketer, fieldsToKey: fieldsToKey, samples: samples, statistics: make([]distributionStatistics, fieldsToKey.numKeys()), metadata: &pb.MetricMetadata{ Name: name, PrometheusName: nameToPrometheusName(name), Description: description, Cumulative: false, Sync: sync, Type: pb.MetricMetadata_TYPE_DISTRIBUTION, Units: unit, Fields: protoFields, DistributionBucketLowerBounds: lowerBounds, }, prometheusMetric: &prometheus.Metric{ Name: nameToPrometheusName(name), Type: prometheus.TypeHistogram, Help: description, }, } return allMetrics.distributionMetrics[name], nil } // MustCreateNewDistributionMetric creates and registers a distribution metric. // If an error occurs, it panics. func MustCreateNewDistributionMetric(name string, sync bool, bucketer Bucketer, unit pb.MetricMetadata_Units, description string, fields ...Field) *DistributionMetric { distrib, err := NewDistributionMetric(name, sync, bucketer, unit, description, fields...) if err != nil { panic(err) } return distrib } // distributionStatistics is a set of useful statistics for a distribution. // As metric update operations must be non-blocking, this uses a bunch of // atomic numbers rather than a mutex. type distributionStatistics struct { // sampleCount is the total number of samples. sampleCount atomicbitops.Uint64 // sampleSum is the sum of samples. sampleSum atomicbitops.Int64 // sumOfSquaredDeviations is the running sum of squared deviations from the // mean of each sample. // This quantity is useful as part of Welford's online algorithm: // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm sumOfSquaredDeviations atomicbitops.Float64 // min and max are the minimum and maximum samples ever recorded. min, max atomicbitops.Int64 } // Update updates the distribution statistics with the given sample. // This function must be non-blocking, i.e. no mutexes. // As a result, it is not entirely accurate when it races with itself, // though the imprecision should be fairly small and should not practically // matter for distributions with more than a handful of records. func (s *distributionStatistics) Update(sample int64) { newSampleCount := s.sampleCount.Add(1) newSampleSum := s.sampleSum.Add(sample) if newSampleCount > 1 { // Not the first sample of the distribution. floatSample := float64(sample) oldMean := float64(newSampleSum-sample) / float64(newSampleCount-1) newMean := float64(newSampleSum) / float64(newSampleCount) devSquared := (floatSample - oldMean) * (floatSample - newMean) s.sumOfSquaredDeviations.Add(devSquared) // Update min and max. // We optimistically load racily here in the hope that it passes the CaS // operation. If it doesn't, we'll load it atomically, so this is not a // race. 
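// Each retry loop below exits as soon as the stored extremum is already at
// least as extreme as the sample, or as soon as the CompareAndSwap succeeds
// in installing the sample as the new minimum/maximum.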
sync.RaceDisable() for oldMin := s.min.RacyLoad(); sample < oldMin && !s.min.CompareAndSwap(oldMin, sample); oldMin = s.min.Load() { } for oldMax := s.max.RacyLoad(); sample > oldMax && !s.max.CompareAndSwap(oldMax, sample); oldMax = s.max.Load() { } sync.RaceEnable() } else { // We are the first sample, so set the min and max to the current sample. // See above for why disabling race detection is safe here as well. sync.RaceDisable() if !s.min.CompareAndSwap(0, sample) { for oldMin := s.min.RacyLoad(); sample < oldMin && !s.min.CompareAndSwap(oldMin, sample); oldMin = s.min.Load() { } } if !s.max.CompareAndSwap(0, sample) { for oldMax := s.max.RacyLoad(); sample > oldMax && !s.max.CompareAndSwap(oldMax, sample); oldMax = s.max.Load() { } } sync.RaceEnable() } } // distributionStatisticsSnapshot an atomically-loaded snapshot of // distributionStatistics. type distributionStatisticsSnapshot struct { // sampleCount is the total number of samples. sampleCount uint64 // sampleSum is the sum of samples. sampleSum int64 // sumOfSquaredDeviations is the running sum of squared deviations from the // mean of each sample. // This quantity is useful as part of Welford's online algorithm: // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm sumOfSquaredDeviations float64 // min and max are the minimum and maximum samples ever recorded. min, max int64 } // Load generates a consistent snapshot of the distribution statistics. func (s *distributionStatistics) Load() distributionStatisticsSnapshot { // We start out reading things racily, but will verify each of them // atomically later in this function, so this is OK. Disable the race // checker for this part of the function. sync.RaceDisable() snapshot := distributionStatisticsSnapshot{ sampleCount: s.sampleCount.RacyLoad(), sampleSum: s.sampleSum.RacyLoad(), sumOfSquaredDeviations: s.sumOfSquaredDeviations.RacyLoad(), min: s.min.RacyLoad(), max: s.max.RacyLoad(), } sync.RaceEnable() // Now verify that we loaded an atomic snapshot of the statistics. // This relies on the fact that each update should at least change the // count statistic, so we should be able to tell if anything changed based // on whether we have an exact match with the currently-loaded values. // If not, we reload that value and try again until all is consistent. retry: if sampleCount := s.sampleCount.Load(); sampleCount != snapshot.sampleCount { snapshot.sampleCount = sampleCount goto retry } if sampleSum := s.sampleSum.Load(); sampleSum != snapshot.sampleSum { snapshot.sampleSum = sampleSum goto retry } if ssd := s.sumOfSquaredDeviations.Load(); ssd != snapshot.sumOfSquaredDeviations { snapshot.sumOfSquaredDeviations = ssd goto retry } if min := s.min.Load(); min != snapshot.min { snapshot.min = min goto retry } if max := s.max.Load(); max != snapshot.max { snapshot.max = max goto retry } return snapshot } // AddSample adds a sample to the distribution. // This *must* be called with the correct number of fields, or it will panic. // +checkescape:all // //go:nosplit func (d *DistributionMetric) AddSample(sample int64, fields ...*FieldValue) { d.addSampleByKey(sample, d.fieldsToKey.lookup(fields...)) } // addSampleByKey works like AddSample, with the field key already known. 
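// As a rough sketch of the public path that ends up in this helper (the
// metric name, bucket parameters, and sample below are hypothetical):
//
//	// At init time, before Initialize/Disable are called:
//	requestBytes := MustCreateNewDistributionMetric("/example/request_bytes",
//		false /* sync */, NewExponentialBucketer(16, 8, 1, 2),
//		pb.MetricMetadata_UNITS_NONE, "Distribution of example request sizes.")
//	// On the hot path, record one observation:
//	requestBytes.AddSample(512)
//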
// +checkescape:all // //go:nosplit func (d *DistributionMetric) addSampleByKey(sample int64, key int) { bucket := d.exponentialBucketer.BucketIndex(sample) d.samples[key][bucket+1].Add(1) d.statistics[key].Update(sample) } // Minimum number of buckets for NewDurationBucket. const durationMinBuckets = 3 // NewDurationBucketer returns a Bucketer well-suited for measuring durations in // nanoseconds. Useful for NewTimerMetric. // minDuration and maxDuration are conservative estimates of the minimum and // maximum durations expected to be accurately measured by the Bucketer. func NewDurationBucketer(numFiniteBuckets int, minDuration, maxDuration time.Duration) Bucketer { if numFiniteBuckets < durationMinBuckets { panic(fmt.Sprintf("duration bucketer must have at least %d buckets, got %d", durationMinBuckets, numFiniteBuckets)) } minNs := minDuration.Nanoseconds() exponentCoversNs := float64(maxDuration.Nanoseconds()-int64(numFiniteBuckets-durationMinBuckets)*minNs) / float64(minNs) exponent := math.Log(exponentCoversNs) / math.Log(float64(numFiniteBuckets-durationMinBuckets)) minNs = int64(float64(minNs) / exponent) return NewExponentialBucketer(numFiniteBuckets, uint64(minNs), float64(minNs), exponent) } // TimerMetric wraps a distribution metric with convenience functions for // latency measurements, which is a popular specialization of distribution // metrics. type TimerMetric struct { DistributionMetric } // NewTimerMetric provides a convenient way to measure latencies. // The arguments are the same as `NewDistributionMetric`, except: // - `nanoBucketer`: Same as `NewDistribution`'s `bucketer`, expected to hold // durations in nanoseconds. Adjust parameters accordingly. // NewDurationBucketer may be helpful here. func NewTimerMetric(name string, nanoBucketer Bucketer, description string, fields ...Field) (*TimerMetric, error) { distrib, err := NewDistributionMetric(name, false, nanoBucketer, pb.MetricMetadata_UNITS_NANOSECONDS, description, fields...) if err != nil { return nil, err } return &TimerMetric{ DistributionMetric: *distrib, }, nil } // MustCreateNewTimerMetric creates and registers a timer metric. // If an error occurs, it panics. func MustCreateNewTimerMetric(name string, nanoBucketer Bucketer, description string, fields ...Field) *TimerMetric { timer, err := NewTimerMetric(name, nanoBucketer, description, fields...) if err != nil { panic(err) } return timer } // TimedOperation is used by TimerMetric to keep track of the time elapsed // between an operation starting and stopping. type TimedOperation struct { // metric is a reference to the timer metric for the operation. metric *TimerMetric // partialFields is a prefix of the fields used in this operation. // The rest of the fields is provided in TimedOperation.Finish. partialFields []*FieldValue // startedNs is the number of nanoseconds measured in TimerMetric.Start(). startedNs int64 } // Start starts a timer measurement for the given combination of fields. // It returns a TimedOperation which can be passed around as necessary to // measure the duration of the operation. // Once the operation is finished, call Finish on the TimedOperation. // The fields passed to Start may be partially specified; if so, the remaining // fields must be passed to TimedOperation.Finish. This is useful for cases // where which path an operation took is only known after it happens. This // path can be part of the fields passed to Finish. 
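// A rough sketch of the intended pattern (the metric name, description,
// bucketer parameters, and doFoo below are hypothetical):
//
//	// At init time:
//	fooLatency := MustCreateNewTimerMetric("/example/foo_latency",
//		NewDurationBucketer(16, time.Microsecond, 10*time.Second),
//		"Latency of example foo operations.")
//	// Around the operation being measured:
//	op := fooLatency.Start()
//	doFoo()
//	op.Finish()
//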
// +checkescape:all // //go:nosplit func (t *TimerMetric) Start(fields ...*FieldValue) TimedOperation { return TimedOperation{ metric: t, partialFields: fields, startedNs: CheapNowNano(), } } // Finish marks an operation as finished and records its duration. // `extraFields` is the rest of the fields appended to the fields passed to // `TimerMetric.Start`. The concatenation of these two must be the exact // number of fields that the underlying metric has. // +checkescape:all // //go:nosplit func (o TimedOperation) Finish(extraFields ...*FieldValue) { ended := CheapNowNano() fieldKey := o.metric.fieldsToKey.lookupConcat(o.partialFields, extraFields) o.metric.addSampleByKey(ended-o.startedNs, fieldKey) } // stageTiming contains timing data for an initialization stage. type stageTiming struct { stage InitStage started time.Time // ended is the zero time when the stage has not ended yet. ended time.Time } // inProgress returns whether this stage hasn't ended yet. func (s stageTiming) inProgress() bool { return !s.started.IsZero() && s.ended.IsZero() } // metricSet holds metric data. type metricSet struct { // Metric registration data for all the metrics below. registration *pb.MetricRegistration // Map of uint64 metrics. uint64Metrics map[string]customUint64Metric // Map of distribution metrics. distributionMetrics map[string]*DistributionMetric // mu protects the fields below. mu sync.RWMutex // Information about the stages reached by the Sentry. Only appended to, so // reading a shallow copy of the slice header concurrently is safe. finished []stageTiming // The current stage in progress. currentStage stageTiming } // makeMetricSet returns a new metricSet. func makeMetricSet() *metricSet { return &metricSet{ uint64Metrics: make(map[string]customUint64Metric), distributionMetrics: make(map[string]*DistributionMetric), finished: make([]stageTiming, 0, len(allStages)), } } // Values returns a snapshot of all values in m. func (m *metricSet) Values() metricValues { m.mu.Lock() stages := m.finished[:] m.mu.Unlock() vals := metricValues{ uint64Metrics: make(map[string]any, len(m.uint64Metrics)), distributionMetrics: make(map[string][][]uint64, len(m.distributionMetrics)), distributionTotalSamples: make(map[string][]uint64, len(m.distributionMetrics)), distributionStatistics: make(map[string][]distributionStatisticsSnapshot, len(m.distributionMetrics)), stages: stages, } for k, v := range m.uint64Metrics { fields := v.fields switch len(fields) { case 0: vals.uint64Metrics[k] = v.value() case 1: fieldsMap := make(map[*FieldValue]uint64) if v.forEachNonZero != nil { v.forEachNonZero(func(fieldValues []*FieldValue, val uint64) { fieldsMap[fieldValues[0]] = val }) } else { for _, fieldValue := range fields[0].values { fieldsMap[fieldValue] = v.value(fieldValue) } } vals.uint64Metrics[k] = fieldsMap default: panic(fmt.Sprintf("Unsupported number of metric fields: %d", len(fields))) } } for name, metric := range m.distributionMetrics { fieldKeysToValues := make([][]uint64, len(metric.samples)) fieldKeysToTotalSamples := make([]uint64, len(metric.samples)) fieldKeysToStatistics := make([]distributionStatisticsSnapshot, len(metric.samples)) for fieldKey, samples := range metric.samples { samplesSnapshot := snapshotDistribution(samples) totalSamples := uint64(0) for _, bucket := range samplesSnapshot { totalSamples += bucket } if totalSamples == 0 { // No samples recorded for this combination of field, so leave // the maps for this fieldKey as nil. 
This lessens the memory cost // of distributions with unused field combinations. fieldKeysToTotalSamples[fieldKey] = 0 fieldKeysToStatistics[fieldKey] = distributionStatisticsSnapshot{} fieldKeysToValues[fieldKey] = nil } else { fieldKeysToTotalSamples[fieldKey] = totalSamples fieldKeysToStatistics[fieldKey] = metric.statistics[fieldKey].Load() fieldKeysToValues[fieldKey] = samplesSnapshot } } vals.distributionMetrics[name] = fieldKeysToValues vals.distributionTotalSamples[name] = fieldKeysToTotalSamples vals.distributionStatistics[name] = fieldKeysToStatistics } return vals } // metricValues contains a copy of the values of all metrics. type metricValues struct { // uint64Metrics is a map of uint64 metrics, // with key as metric name. Value can be either uint64, or map[*FieldValue]uint64 // to support metrics with one field. uint64Metrics map[string]any // distributionMetrics is a map of distribution metrics. // The first key level is the metric name. // The second key level is an index ID corresponding to the combination of // field values. The index is decoded to field strings using keyToMultiField. // The slice value is the number of samples in each bucket of the // distribution, with the first (0-th) element being the underflow bucket // and the last element being the "infinite" (overflow) bucket. // The slice value may also be nil for field combinations with no samples. // This saves memory by avoiding storing anything for unused field // combinations. distributionMetrics map[string][][]uint64 // distributionTotalSamples is the total number of samples for each // distribution metric and field values. // It allows performing a quick diff between snapshots without having to // iterate over all the buckets individually, so that distributions with // no new samples are not retransmitted. distributionTotalSamples map[string][]uint64 // distributionStatistics is a set of statistics about the samples. distributionStatistics map[string][]distributionStatisticsSnapshot // Information on when initialization stages were reached. Does not include // the currently-ongoing stage, if any. stages []stageTiming } var ( // emitMu protects metricsAtLastEmit and ensures that all emitted // metrics are strongly ordered (older metrics are never emitted after // newer metrics). emitMu sync.Mutex // metricsAtLastEmit contains the state of the metrics at the last emit event. metricsAtLastEmit metricValues ) // EmitMetricUpdate emits a MetricUpdate over the event channel. // // Only metrics that have changed since the last call are emitted. // // EmitMetricUpdate is thread-safe. // // Preconditions: // - Initialize has been called. func EmitMetricUpdate() { emitMu.Lock() defer emitMu.Unlock() snapshot := allMetrics.Values() m := pb.MetricUpdate{} // On the first call metricsAtLastEmit will be empty. Include all // metrics then. for k, v := range snapshot.uint64Metrics { prev, ok := metricsAtLastEmit.uint64Metrics[k] switch t := v.(type) { case uint64: // Metric exists and value did not change. if ok && prev.(uint64) == t { continue } m.Metrics = append(m.Metrics, &pb.MetricValue{ Name: k, Value: &pb.MetricValue_Uint64Value{Uint64Value: t}, }) case map[*FieldValue]uint64: for fieldValue, metricValue := range t { // Emit data on the first call only if the field // value has been incremented. For all other // calls, emit data if the field value has been // changed from the previous emit. 
if (!ok && metricValue == 0) || (ok && prev.(map[*FieldValue]uint64)[fieldValue] == metricValue) { continue } m.Metrics = append(m.Metrics, &pb.MetricValue{ Name: k, FieldValues: []string{fieldValue.Value}, Value: &pb.MetricValue_Uint64Value{Uint64Value: metricValue}, }) } default: panic(fmt.Sprintf("unsupported type in uint64Metrics: %T (%v)", v, v)) } } for name, dist := range snapshot.distributionTotalSamples { prev, ok := metricsAtLastEmit.distributionTotalSamples[name] for fieldKey, currentTotal := range dist { if currentTotal == 0 { continue } if ok { if prevTotal := prev[fieldKey]; prevTotal == currentTotal { continue } } oldSamples := metricsAtLastEmit.distributionMetrics[name] var newSamples []uint64 if oldSamples != nil && oldSamples[fieldKey] != nil { currentSamples := snapshot.distributionMetrics[name][fieldKey] numBuckets := len(currentSamples) newSamples = make([]uint64, numBuckets) for i := 0; i < numBuckets; i++ { newSamples[i] = currentSamples[i] - oldSamples[fieldKey][i] } } else { // oldSamples == nil means that the previous snapshot has no samples. // This means the delta is the current number of samples, no need for // a copy. newSamples = snapshot.distributionMetrics[name][fieldKey] } m.Metrics = append(m.Metrics, &pb.MetricValue{ Name: name, FieldValues: allMetrics.distributionMetrics[name].fieldsToKey.keyToMultiField(fieldKey), Value: &pb.MetricValue_DistributionValue{ DistributionValue: &pb.Samples{ NewSamples: newSamples, }, }, }) } } for s := len(metricsAtLastEmit.stages); s < len(snapshot.stages); s++ { newStage := snapshot.stages[s] m.StageTiming = append(m.StageTiming, &pb.StageTiming{ Stage: string(newStage.stage), Started: ×tamppb.Timestamp{ Seconds: newStage.started.Unix(), Nanos: int32(newStage.started.Nanosecond()), }, Ended: ×tamppb.Timestamp{ Seconds: newStage.ended.Unix(), Nanos: int32(newStage.ended.Nanosecond()), }, }) } metricsAtLastEmit = snapshot if len(m.Metrics) == 0 && len(m.StageTiming) == 0 { return } if log.IsLogging(log.Debug) { sort.Slice(m.Metrics, func(i, j int) bool { return m.Metrics[i].GetName() < m.Metrics[j].GetName() }) log.Debugf("Emitting metrics:") for _, metric := range m.Metrics { var valueStr string switch metric.GetValue().(type) { case *pb.MetricValue_Uint64Value: valueStr = fmt.Sprintf("%d", metric.GetUint64Value()) case *pb.MetricValue_DistributionValue: valueStr = fmt.Sprintf("new distribution samples: %+v", metric.GetDistributionValue()) default: valueStr = "unsupported type" } if len(metric.GetFieldValues()) > 0 { var foundMetadata *pb.MetricMetadata if metricObj, found := allMetrics.uint64Metrics[metric.GetName()]; found { foundMetadata = metricObj.metadata } else if metricObj, found := allMetrics.distributionMetrics[metric.GetName()]; found { foundMetadata = metricObj.metadata } if foundMetadata == nil || len(foundMetadata.GetFields()) != len(metric.GetFieldValues()) { // This should never happen, but if it somehow does, we don't want to crash here, as // this is debug output that may already be printed in the context of panic. 
log.Debugf("%s%v (cannot find metric definition!): %s", metric.GetName(), metric.GetFieldValues(), valueStr) continue } var sb strings.Builder for i, fieldValue := range metric.GetFieldValues() { if i > 0 { sb.WriteRune(',') } sb.WriteString(foundMetadata.GetFields()[i].GetFieldName()) sb.WriteRune('=') sb.WriteString(fieldValue) } log.Debugf(" Metric %s[%s]: %s", metric.GetName(), sb.String(), valueStr) } else { log.Debugf(" Metric %s: %s", metric.GetName(), valueStr) } } for _, stage := range m.StageTiming { duration := time.Duration(stage.Ended.Seconds-stage.Started.Seconds)*time.Second + time.Duration(stage.Ended.Nanos-stage.Started.Nanos)*time.Nanosecond log.Debugf("Stage %s took %v", stage.GetStage(), duration) } } if err := eventchannel.Emit(&m); err != nil { log.Warningf("Unable to emit metrics: %s", err) } } // SnapshotOptions controls how snapshots are exported in GetSnapshot. type SnapshotOptions struct { // Filter, if set, should return true for metrics that should be written to // the snapshot. If unset, all metrics are written to the snapshot. Filter func(*prometheus.Metric) bool } // GetSnapshot returns a Prometheus snapshot of the metric data. // Returns ErrNotYetInitialized if metrics have not yet been initialized. func GetSnapshot(options SnapshotOptions) (*prometheus.Snapshot, error) { if !initialized.Load() { return nil, ErrNotYetInitialized } values := allMetrics.Values() snapshot := prometheus.NewSnapshot() for k, v := range values.uint64Metrics { m := allMetrics.uint64Metrics[k] if options.Filter != nil && !options.Filter(m.prometheusMetric) { continue } switch t := v.(type) { case uint64: if m.metadata.GetCumulative() && t == 0 { // Zero-valued counter, ignore. continue } snapshot.Add(prometheus.NewIntData(m.prometheusMetric, int64(t))) case map[*FieldValue]uint64: for fieldValue, metricValue := range t { if m.metadata.GetCumulative() && metricValue == 0 { // Zero-valued counter, ignore. continue } snapshot.Add(prometheus.LabeledIntData(m.prometheusMetric, map[string]string{ // uint64 metrics currently only support at most one field name. m.metadata.Fields[0].GetFieldName(): fieldValue.Value, }, int64(metricValue))) } default: panic(fmt.Sprintf("unsupported type in uint64Metrics: %T (%v)", v, v)) } } for k, dists := range values.distributionTotalSamples { m := allMetrics.distributionMetrics[k] if options.Filter != nil && !options.Filter(m.prometheusMetric) { continue } distributionSamples := values.distributionMetrics[k] numFiniteBuckets := m.exponentialBucketer.NumFiniteBuckets() statistics := values.distributionStatistics[k] for fieldKey := range dists { var labels map[string]string if numFields := m.fieldsToKey.numKeys(); numFields > 0 { labels = make(map[string]string, numFields) for fieldIndex, field := range m.fieldsToKey.keyToMultiField(fieldKey) { labels[m.metadata.Fields[fieldIndex].GetFieldName()] = field } } currentSamples := distributionSamples[fieldKey] buckets := make([]prometheus.Bucket, numFiniteBuckets+2) samplesForFieldKey := uint64(0) for b := 0; b < numFiniteBuckets+2; b++ { var upperBound prometheus.Number if b == numFiniteBuckets+1 { upperBound = prometheus.Number{Float: math.Inf(1)} // Overflow bucket. 
} else { upperBound = prometheus.Number{Int: m.exponentialBucketer.LowerBound(b)} } samples := uint64(0) if currentSamples != nil { samples = currentSamples[b] samplesForFieldKey += samples } buckets[b] = prometheus.Bucket{ Samples: samples, UpperBound: upperBound, } } if samplesForFieldKey == 0 { // Zero-valued distribution (no samples in any bucket for this field // combination). Ignore. continue } snapshot.Add(&prometheus.Data{ Metric: m.prometheusMetric, Labels: labels, HistogramValue: &prometheus.Histogram{ Total: prometheus.Number{Int: statistics[fieldKey].sampleSum}, SumOfSquaredDeviations: prometheus.Number{Float: statistics[fieldKey].sumOfSquaredDeviations}, Min: prometheus.Number{Int: statistics[fieldKey].min}, Max: prometheus.Number{Int: statistics[fieldKey].max}, Buckets: buckets, }, }) } } return snapshot, nil } // StartStage should be called when an initialization stage is started. // It returns a function that must be called to indicate that the stage ended. // Alternatively, future calls to StartStage will implicitly indicate that the // previous stage ended. // Stage information will be emitted in the next call to EmitMetricUpdate after // a stage has ended. // // This function may (and is expected to) be called prior to final // initialization of this metric library, as it has to capture early stages // of Sentry initialization. func StartStage(stage InitStage) func() { now := time.Now() allMetrics.mu.Lock() defer allMetrics.mu.Unlock() if allMetrics.currentStage.inProgress() { endStage(now) } allMetrics.currentStage.stage = stage allMetrics.currentStage.started = now return func() { now := time.Now() allMetrics.mu.Lock() defer allMetrics.mu.Unlock() // The current stage may have been ended by another call to StartStage, so // double-check prior to clearing the current stage. if allMetrics.currentStage.inProgress() && allMetrics.currentStage.stage == stage { endStage(now) } } } // endStage marks allMetrics.currentStage as ended, adding it to the list of // finished stages. It assumes allMetrics.mu is locked. func endStage(when time.Time) { allMetrics.currentStage.ended = when allMetrics.finished = append(allMetrics.finished, allMetrics.currentStage) allMetrics.currentStage = stageTiming{} } golang-gvisor-gvisor-0.0~20240729.0/pkg/metric/metric_go_proto/000077500000000000000000000000001465435605700241025ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/metric/metric_go_proto/metric.pb.go000066400000000000000000000701031465435605700263150ustar00rootroot00000000000000// Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/metric/metric.proto package metric_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" timestamppb "google.golang.org/protobuf/types/known/timestamppb" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type MetricMetadata_Type int32 const ( MetricMetadata_TYPE_UINT64 MetricMetadata_Type = 0 MetricMetadata_TYPE_DISTRIBUTION MetricMetadata_Type = 1 ) // Enum value maps for MetricMetadata_Type. 
var ( MetricMetadata_Type_name = map[int32]string{ 0: "TYPE_UINT64", 1: "TYPE_DISTRIBUTION", } MetricMetadata_Type_value = map[string]int32{ "TYPE_UINT64": 0, "TYPE_DISTRIBUTION": 1, } ) func (x MetricMetadata_Type) Enum() *MetricMetadata_Type { p := new(MetricMetadata_Type) *p = x return p } func (x MetricMetadata_Type) String() string { return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) } func (MetricMetadata_Type) Descriptor() protoreflect.EnumDescriptor { return file_pkg_metric_metric_proto_enumTypes[0].Descriptor() } func (MetricMetadata_Type) Type() protoreflect.EnumType { return &file_pkg_metric_metric_proto_enumTypes[0] } func (x MetricMetadata_Type) Number() protoreflect.EnumNumber { return protoreflect.EnumNumber(x) } // Deprecated: Use MetricMetadata_Type.Descriptor instead. func (MetricMetadata_Type) EnumDescriptor() ([]byte, []int) { return file_pkg_metric_metric_proto_rawDescGZIP(), []int{0, 0} } type MetricMetadata_Units int32 const ( MetricMetadata_UNITS_NONE MetricMetadata_Units = 0 MetricMetadata_UNITS_NANOSECONDS MetricMetadata_Units = 1 ) // Enum value maps for MetricMetadata_Units. var ( MetricMetadata_Units_name = map[int32]string{ 0: "UNITS_NONE", 1: "UNITS_NANOSECONDS", } MetricMetadata_Units_value = map[string]int32{ "UNITS_NONE": 0, "UNITS_NANOSECONDS": 1, } ) func (x MetricMetadata_Units) Enum() *MetricMetadata_Units { p := new(MetricMetadata_Units) *p = x return p } func (x MetricMetadata_Units) String() string { return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) } func (MetricMetadata_Units) Descriptor() protoreflect.EnumDescriptor { return file_pkg_metric_metric_proto_enumTypes[1].Descriptor() } func (MetricMetadata_Units) Type() protoreflect.EnumType { return &file_pkg_metric_metric_proto_enumTypes[1] } func (x MetricMetadata_Units) Number() protoreflect.EnumNumber { return protoreflect.EnumNumber(x) } // Deprecated: Use MetricMetadata_Units.Descriptor instead. 
func (MetricMetadata_Units) EnumDescriptor() ([]byte, []int) { return file_pkg_metric_metric_proto_rawDescGZIP(), []int{0, 1} } type MetricMetadata struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` PrometheusName string `protobuf:"bytes,9,opt,name=prometheus_name,json=prometheusName,proto3" json:"prometheus_name,omitempty"` Description string `protobuf:"bytes,2,opt,name=description,proto3" json:"description,omitempty"` Cumulative bool `protobuf:"varint,3,opt,name=cumulative,proto3" json:"cumulative,omitempty"` Sync bool `protobuf:"varint,4,opt,name=sync,proto3" json:"sync,omitempty"` Type MetricMetadata_Type `protobuf:"varint,5,opt,name=type,proto3,enum=gvisor.MetricMetadata_Type" json:"type,omitempty"` Units MetricMetadata_Units `protobuf:"varint,6,opt,name=units,proto3,enum=gvisor.MetricMetadata_Units" json:"units,omitempty"` Fields []*MetricMetadata_Field `protobuf:"bytes,7,rep,name=fields,proto3" json:"fields,omitempty"` DistributionBucketLowerBounds []int64 `protobuf:"varint,8,rep,packed,name=distribution_bucket_lower_bounds,json=distributionBucketLowerBounds,proto3" json:"distribution_bucket_lower_bounds,omitempty"` } func (x *MetricMetadata) Reset() { *x = MetricMetadata{} if protoimpl.UnsafeEnabled { mi := &file_pkg_metric_metric_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *MetricMetadata) String() string { return protoimpl.X.MessageStringOf(x) } func (*MetricMetadata) ProtoMessage() {} func (x *MetricMetadata) ProtoReflect() protoreflect.Message { mi := &file_pkg_metric_metric_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use MetricMetadata.ProtoReflect.Descriptor instead. 
func (*MetricMetadata) Descriptor() ([]byte, []int) { return file_pkg_metric_metric_proto_rawDescGZIP(), []int{0} } func (x *MetricMetadata) GetName() string { if x != nil { return x.Name } return "" } func (x *MetricMetadata) GetPrometheusName() string { if x != nil { return x.PrometheusName } return "" } func (x *MetricMetadata) GetDescription() string { if x != nil { return x.Description } return "" } func (x *MetricMetadata) GetCumulative() bool { if x != nil { return x.Cumulative } return false } func (x *MetricMetadata) GetSync() bool { if x != nil { return x.Sync } return false } func (x *MetricMetadata) GetType() MetricMetadata_Type { if x != nil { return x.Type } return MetricMetadata_TYPE_UINT64 } func (x *MetricMetadata) GetUnits() MetricMetadata_Units { if x != nil { return x.Units } return MetricMetadata_UNITS_NONE } func (x *MetricMetadata) GetFields() []*MetricMetadata_Field { if x != nil { return x.Fields } return nil } func (x *MetricMetadata) GetDistributionBucketLowerBounds() []int64 { if x != nil { return x.DistributionBucketLowerBounds } return nil } type MetricRegistration struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Metrics []*MetricMetadata `protobuf:"bytes,1,rep,name=metrics,proto3" json:"metrics,omitempty"` Stages []string `protobuf:"bytes,2,rep,name=stages,proto3" json:"stages,omitempty"` } func (x *MetricRegistration) Reset() { *x = MetricRegistration{} if protoimpl.UnsafeEnabled { mi := &file_pkg_metric_metric_proto_msgTypes[1] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *MetricRegistration) String() string { return protoimpl.X.MessageStringOf(x) } func (*MetricRegistration) ProtoMessage() {} func (x *MetricRegistration) ProtoReflect() protoreflect.Message { mi := &file_pkg_metric_metric_proto_msgTypes[1] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use MetricRegistration.ProtoReflect.Descriptor instead. func (*MetricRegistration) Descriptor() ([]byte, []int) { return file_pkg_metric_metric_proto_rawDescGZIP(), []int{1} } func (x *MetricRegistration) GetMetrics() []*MetricMetadata { if x != nil { return x.Metrics } return nil } func (x *MetricRegistration) GetStages() []string { if x != nil { return x.Stages } return nil } type Samples struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields NewSamples []uint64 `protobuf:"varint,1,rep,packed,name=new_samples,json=newSamples,proto3" json:"new_samples,omitempty"` } func (x *Samples) Reset() { *x = Samples{} if protoimpl.UnsafeEnabled { mi := &file_pkg_metric_metric_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Samples) String() string { return protoimpl.X.MessageStringOf(x) } func (*Samples) ProtoMessage() {} func (x *Samples) ProtoReflect() protoreflect.Message { mi := &file_pkg_metric_metric_proto_msgTypes[2] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Samples.ProtoReflect.Descriptor instead. 
func (*Samples) Descriptor() ([]byte, []int) { return file_pkg_metric_metric_proto_rawDescGZIP(), []int{2} } func (x *Samples) GetNewSamples() []uint64 { if x != nil { return x.NewSamples } return nil } type MetricValue struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` // Types that are assignable to Value: // // *MetricValue_Uint64Value // *MetricValue_DistributionValue Value isMetricValue_Value `protobuf_oneof:"value"` FieldValues []string `protobuf:"bytes,4,rep,name=field_values,json=fieldValues,proto3" json:"field_values,omitempty"` } func (x *MetricValue) Reset() { *x = MetricValue{} if protoimpl.UnsafeEnabled { mi := &file_pkg_metric_metric_proto_msgTypes[3] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *MetricValue) String() string { return protoimpl.X.MessageStringOf(x) } func (*MetricValue) ProtoMessage() {} func (x *MetricValue) ProtoReflect() protoreflect.Message { mi := &file_pkg_metric_metric_proto_msgTypes[3] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use MetricValue.ProtoReflect.Descriptor instead. func (*MetricValue) Descriptor() ([]byte, []int) { return file_pkg_metric_metric_proto_rawDescGZIP(), []int{3} } func (x *MetricValue) GetName() string { if x != nil { return x.Name } return "" } func (m *MetricValue) GetValue() isMetricValue_Value { if m != nil { return m.Value } return nil } func (x *MetricValue) GetUint64Value() uint64 { if x, ok := x.GetValue().(*MetricValue_Uint64Value); ok { return x.Uint64Value } return 0 } func (x *MetricValue) GetDistributionValue() *Samples { if x, ok := x.GetValue().(*MetricValue_DistributionValue); ok { return x.DistributionValue } return nil } func (x *MetricValue) GetFieldValues() []string { if x != nil { return x.FieldValues } return nil } type isMetricValue_Value interface { isMetricValue_Value() } type MetricValue_Uint64Value struct { Uint64Value uint64 `protobuf:"varint,2,opt,name=uint64_value,json=uint64Value,proto3,oneof"` } type MetricValue_DistributionValue struct { DistributionValue *Samples `protobuf:"bytes,3,opt,name=distribution_value,json=distributionValue,proto3,oneof"` } func (*MetricValue_Uint64Value) isMetricValue_Value() {} func (*MetricValue_DistributionValue) isMetricValue_Value() {} type StageTiming struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Stage string `protobuf:"bytes,1,opt,name=stage,proto3" json:"stage,omitempty"` Started *timestamppb.Timestamp `protobuf:"bytes,2,opt,name=started,proto3" json:"started,omitempty"` Ended *timestamppb.Timestamp `protobuf:"bytes,3,opt,name=ended,proto3" json:"ended,omitempty"` } func (x *StageTiming) Reset() { *x = StageTiming{} if protoimpl.UnsafeEnabled { mi := &file_pkg_metric_metric_proto_msgTypes[4] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *StageTiming) String() string { return protoimpl.X.MessageStringOf(x) } func (*StageTiming) ProtoMessage() {} func (x *StageTiming) ProtoReflect() protoreflect.Message { mi := &file_pkg_metric_metric_proto_msgTypes[4] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return 
mi.MessageOf(x) } // Deprecated: Use StageTiming.ProtoReflect.Descriptor instead. func (*StageTiming) Descriptor() ([]byte, []int) { return file_pkg_metric_metric_proto_rawDescGZIP(), []int{4} } func (x *StageTiming) GetStage() string { if x != nil { return x.Stage } return "" } func (x *StageTiming) GetStarted() *timestamppb.Timestamp { if x != nil { return x.Started } return nil } func (x *StageTiming) GetEnded() *timestamppb.Timestamp { if x != nil { return x.Ended } return nil } type MetricUpdate struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Metrics []*MetricValue `protobuf:"bytes,1,rep,name=metrics,proto3" json:"metrics,omitempty"` StageTiming []*StageTiming `protobuf:"bytes,2,rep,name=stage_timing,json=stageTiming,proto3" json:"stage_timing,omitempty"` } func (x *MetricUpdate) Reset() { *x = MetricUpdate{} if protoimpl.UnsafeEnabled { mi := &file_pkg_metric_metric_proto_msgTypes[5] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *MetricUpdate) String() string { return protoimpl.X.MessageStringOf(x) } func (*MetricUpdate) ProtoMessage() {} func (x *MetricUpdate) ProtoReflect() protoreflect.Message { mi := &file_pkg_metric_metric_proto_msgTypes[5] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use MetricUpdate.ProtoReflect.Descriptor instead. func (*MetricUpdate) Descriptor() ([]byte, []int) { return file_pkg_metric_metric_proto_rawDescGZIP(), []int{5} } func (x *MetricUpdate) GetMetrics() []*MetricValue { if x != nil { return x.Metrics } return nil } func (x *MetricUpdate) GetStageTiming() []*StageTiming { if x != nil { return x.StageTiming } return nil } type MetricMetadata_Field struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields FieldName string `protobuf:"bytes,1,opt,name=field_name,json=fieldName,proto3" json:"field_name,omitempty"` AllowedValues []string `protobuf:"bytes,2,rep,name=allowed_values,json=allowedValues,proto3" json:"allowed_values,omitempty"` } func (x *MetricMetadata_Field) Reset() { *x = MetricMetadata_Field{} if protoimpl.UnsafeEnabled { mi := &file_pkg_metric_metric_proto_msgTypes[6] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *MetricMetadata_Field) String() string { return protoimpl.X.MessageStringOf(x) } func (*MetricMetadata_Field) ProtoMessage() {} func (x *MetricMetadata_Field) ProtoReflect() protoreflect.Message { mi := &file_pkg_metric_metric_proto_msgTypes[6] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use MetricMetadata_Field.ProtoReflect.Descriptor instead. 
func (*MetricMetadata_Field) Descriptor() ([]byte, []int) { return file_pkg_metric_metric_proto_rawDescGZIP(), []int{0, 0} } func (x *MetricMetadata_Field) GetFieldName() string { if x != nil { return x.FieldName } return "" } func (x *MetricMetadata_Field) GetAllowedValues() []string { if x != nil { return x.AllowedValues } return nil } var File_pkg_metric_metric_proto protoreflect.FileDescriptor var file_pkg_metric_metric_proto_rawDesc = []byte{ 0x0a, 0x17, 0x70, 0x6b, 0x67, 0x2f, 0x6d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x2f, 0x6d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x06, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x1a, 0x1f, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2f, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xb6, 0x04, 0x0a, 0x0e, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x27, 0x0a, 0x0f, 0x70, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x68, 0x65, 0x75, 0x73, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x09, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0e, 0x70, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x68, 0x65, 0x75, 0x73, 0x4e, 0x61, 0x6d, 0x65, 0x12, 0x20, 0x0a, 0x0b, 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x1e, 0x0a, 0x0a, 0x63, 0x75, 0x6d, 0x75, 0x6c, 0x61, 0x74, 0x69, 0x76, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0a, 0x63, 0x75, 0x6d, 0x75, 0x6c, 0x61, 0x74, 0x69, 0x76, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x73, 0x79, 0x6e, 0x63, 0x18, 0x04, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, 0x73, 0x79, 0x6e, 0x63, 0x12, 0x2f, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x1b, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x54, 0x79, 0x70, 0x65, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x12, 0x32, 0x0a, 0x05, 0x75, 0x6e, 0x69, 0x74, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x1c, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x55, 0x6e, 0x69, 0x74, 0x73, 0x52, 0x05, 0x75, 0x6e, 0x69, 0x74, 0x73, 0x12, 0x34, 0x0a, 0x06, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x18, 0x07, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x1c, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x52, 0x06, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x12, 0x47, 0x0a, 0x20, 0x64, 0x69, 0x73, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x62, 0x75, 0x63, 0x6b, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x77, 0x65, 0x72, 0x5f, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x73, 0x18, 0x08, 0x20, 0x03, 0x28, 0x03, 0x52, 0x1d, 0x64, 0x69, 0x73, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x69, 0x6f, 0x6e, 0x42, 0x75, 0x63, 0x6b, 0x65, 0x74, 0x4c, 0x6f, 0x77, 0x65, 0x72, 0x42, 0x6f, 0x75, 0x6e, 0x64, 0x73, 0x1a, 0x4d, 0x0a, 0x05, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x12, 0x1d, 0x0a, 0x0a, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x4e, 0x61, 0x6d, 0x65, 0x12, 0x25, 0x0a, 0x0e, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, 0x64, 0x5f, 0x76, 0x61, 0x6c, 0x75, 0x65, 
0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0d, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, 0x64, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x22, 0x2e, 0x0a, 0x04, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0f, 0x0a, 0x0b, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x55, 0x49, 0x4e, 0x54, 0x36, 0x34, 0x10, 0x00, 0x12, 0x15, 0x0a, 0x11, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x44, 0x49, 0x53, 0x54, 0x52, 0x49, 0x42, 0x55, 0x54, 0x49, 0x4f, 0x4e, 0x10, 0x01, 0x22, 0x2e, 0x0a, 0x05, 0x55, 0x6e, 0x69, 0x74, 0x73, 0x12, 0x0e, 0x0a, 0x0a, 0x55, 0x4e, 0x49, 0x54, 0x53, 0x5f, 0x4e, 0x4f, 0x4e, 0x45, 0x10, 0x00, 0x12, 0x15, 0x0a, 0x11, 0x55, 0x4e, 0x49, 0x54, 0x53, 0x5f, 0x4e, 0x41, 0x4e, 0x4f, 0x53, 0x45, 0x43, 0x4f, 0x4e, 0x44, 0x53, 0x10, 0x01, 0x22, 0x5e, 0x0a, 0x12, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x52, 0x65, 0x67, 0x69, 0x73, 0x74, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x30, 0x0a, 0x07, 0x6d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x16, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x52, 0x07, 0x6d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x67, 0x65, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x09, 0x52, 0x06, 0x73, 0x74, 0x61, 0x67, 0x65, 0x73, 0x22, 0x2a, 0x0a, 0x07, 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x73, 0x12, 0x1f, 0x0a, 0x0b, 0x6e, 0x65, 0x77, 0x5f, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x04, 0x52, 0x0a, 0x6e, 0x65, 0x77, 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x73, 0x22, 0xb4, 0x01, 0x0a, 0x0b, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x23, 0x0a, 0x0c, 0x75, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x48, 0x00, 0x52, 0x0b, 0x75, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x12, 0x40, 0x0a, 0x12, 0x64, 0x69, 0x73, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0f, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x73, 0x48, 0x00, 0x52, 0x11, 0x64, 0x69, 0x73, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x69, 0x6f, 0x6e, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x12, 0x21, 0x0a, 0x0c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x5f, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0b, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x42, 0x07, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x8b, 0x01, 0x0a, 0x0b, 0x53, 0x74, 0x61, 0x67, 0x65, 0x54, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x67, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x73, 0x74, 0x61, 0x67, 0x65, 0x12, 0x34, 0x0a, 0x07, 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x07, 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x12, 0x30, 0x0a, 0x05, 0x65, 0x6e, 0x64, 0x65, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x05, 0x65, 0x6e, 0x64, 0x65, 0x64, 0x22, 0x75, 0x0a, 0x0c, 0x4d, 0x65, 0x74, 0x72, 0x69, 
0x63, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x12, 0x2d, 0x0a, 0x07, 0x6d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x52, 0x07, 0x6d, 0x65, 0x74, 0x72, 0x69, 0x63, 0x73, 0x12, 0x36, 0x0a, 0x0c, 0x73, 0x74, 0x61, 0x67, 0x65, 0x5f, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x53, 0x74, 0x61, 0x67, 0x65, 0x54, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x52, 0x0b, 0x73, 0x74, 0x61, 0x67, 0x65, 0x54, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_metric_metric_proto_rawDescOnce sync.Once file_pkg_metric_metric_proto_rawDescData = file_pkg_metric_metric_proto_rawDesc ) func file_pkg_metric_metric_proto_rawDescGZIP() []byte { file_pkg_metric_metric_proto_rawDescOnce.Do(func() { file_pkg_metric_metric_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_metric_metric_proto_rawDescData) }) return file_pkg_metric_metric_proto_rawDescData } var file_pkg_metric_metric_proto_enumTypes = make([]protoimpl.EnumInfo, 2) var file_pkg_metric_metric_proto_msgTypes = make([]protoimpl.MessageInfo, 7) var file_pkg_metric_metric_proto_goTypes = []interface{}{ (MetricMetadata_Type)(0), // 0: gvisor.MetricMetadata.Type (MetricMetadata_Units)(0), // 1: gvisor.MetricMetadata.Units (*MetricMetadata)(nil), // 2: gvisor.MetricMetadata (*MetricRegistration)(nil), // 3: gvisor.MetricRegistration (*Samples)(nil), // 4: gvisor.Samples (*MetricValue)(nil), // 5: gvisor.MetricValue (*StageTiming)(nil), // 6: gvisor.StageTiming (*MetricUpdate)(nil), // 7: gvisor.MetricUpdate (*MetricMetadata_Field)(nil), // 8: gvisor.MetricMetadata.Field (*timestamppb.Timestamp)(nil), // 9: google.protobuf.Timestamp } var file_pkg_metric_metric_proto_depIdxs = []int32{ 0, // 0: gvisor.MetricMetadata.type:type_name -> gvisor.MetricMetadata.Type 1, // 1: gvisor.MetricMetadata.units:type_name -> gvisor.MetricMetadata.Units 8, // 2: gvisor.MetricMetadata.fields:type_name -> gvisor.MetricMetadata.Field 2, // 3: gvisor.MetricRegistration.metrics:type_name -> gvisor.MetricMetadata 4, // 4: gvisor.MetricValue.distribution_value:type_name -> gvisor.Samples 9, // 5: gvisor.StageTiming.started:type_name -> google.protobuf.Timestamp 9, // 6: gvisor.StageTiming.ended:type_name -> google.protobuf.Timestamp 5, // 7: gvisor.MetricUpdate.metrics:type_name -> gvisor.MetricValue 6, // 8: gvisor.MetricUpdate.stage_timing:type_name -> gvisor.StageTiming 9, // [9:9] is the sub-list for method output_type 9, // [9:9] is the sub-list for method input_type 9, // [9:9] is the sub-list for extension type_name 9, // [9:9] is the sub-list for extension extendee 0, // [0:9] is the sub-list for field type_name } func init() { file_pkg_metric_metric_proto_init() } func file_pkg_metric_metric_proto_init() { if File_pkg_metric_metric_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_metric_metric_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*MetricMetadata); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_metric_metric_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*MetricRegistration); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } 
file_pkg_metric_metric_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Samples); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_metric_metric_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*MetricValue); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_metric_metric_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*StageTiming); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_metric_metric_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*MetricUpdate); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_metric_metric_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*MetricMetadata_Field); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } file_pkg_metric_metric_proto_msgTypes[3].OneofWrappers = []interface{}{ (*MetricValue_Uint64Value)(nil), (*MetricValue_DistributionValue)(nil), } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_metric_metric_proto_rawDesc, NumEnums: 2, NumMessages: 7, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_metric_metric_proto_goTypes, DependencyIndexes: file_pkg_metric_metric_proto_depIdxs, EnumInfos: file_pkg_metric_metric_proto_enumTypes, MessageInfos: file_pkg_metric_metric_proto_msgTypes, }.Build() File_pkg_metric_metric_proto = out.File file_pkg_metric_metric_proto_rawDesc = nil file_pkg_metric_metric_proto_goTypes = nil file_pkg_metric_metric_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/metric/metric_unsafe.go000066400000000000000000000040141465435605700240610ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package metric import ( "unsafe" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sync" ) // snapshotDistribution snapshots the sample data of distribution metrics in // a non-consistent manner. // Distribution metrics don't need to be read consistently, because any // inconsistency (i.e. increments that race with the snapshot) will simply be // detected during the next snapshot instead. Reading them consistently would // require more synchronization during increments, which we need to be cheap. func snapshotDistribution(samples []atomicbitops.Uint64) []uint64 { // The number of buckets within a distribution never changes, so there is // no race condition from getting the number of buckets upfront. 
numBuckets := len(samples) snapshot := make([]uint64, numBuckets) if sync.RaceEnabled { // runtime.RaceDisable() doesn't actually stop the race detector, so it // can't help us here. Instead, call runtime.memmove directly, which is // not instrumented by the race detector. gohacks.Memmove(unsafe.Pointer(&snapshot[0]), unsafe.Pointer(&samples[0]), unsafe.Sizeof(uint64(0))*uintptr(numBuckets)) } else { for i := range samples { snapshot[i] = samples[i].RacyLoad() } } return snapshot } // CheapNowNano returns the a timestamp in nanoseconds. // It is *NOT* measured from the Unix epoch. // It is monotonic. // //go:nosplit func CheapNowNano() int64 { return gohacks.Nanotime() } golang-gvisor-gvisor-0.0~20240729.0/pkg/metric/profiling_metric.go000066400000000000000000000705321465435605700246010ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package metric import ( "bytes" "encoding/json" "errors" "fmt" "hash" "hash/adler32" "io" "os" "runtime" "strings" "time" "google.golang.org/protobuf/encoding/protojson" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/prometheus" "gvisor.dev/gvisor/pkg/sync" ) const ( // snapshotBufferSize is the number of snapshots within one item of the // ringbuffer. Increasing this number means less context-switching // overhead between collector and writer goroutines, but worse time // precision, as the precise time is refreshed every this many snapshots. snapshotBufferSize = 1024 // snapshotRingbufferSize is the number of items in the ringbuffer. // Increasing this number means the writer has more slack to catch up // if it falls behind, but it also means that the collector may need // to wait for longer intervals when the writer does fall behind, // adding more variance to the time gaps between collections. snapshotRingbufferSize = 128 // MetricsPrefix is prepended before every metrics line. MetricsPrefix = "GVISOR_METRICS\t" // MetricsHashIndicator is prepended before the hash of the metrics // data at the end of the metrics stream. MetricsHashIndicator = "ADLER32\t" // TimeColumn is the column header for the time column. TimeColumn = "Time (ns)" // MetricsMetaIndicator is prepended before every metrics metadata line // after metricsPrefix. MetricsMetaIndicator = "META\t" // MetricsStartTimeIndicator is prepended before the start time of the // metrics collection. MetricsStartTimeIndicator = "START_TIME\t" // MetricsStatsIndicator is prepended before the stats of the metrics // collection process. MetricsStatsIndicator = "STATS\t" ) // CollectionStats contains statistics about the profiling metrics collection // process itself. type CollectionStats struct { // mu protects the fields below. mu sync.Mutex `json:"-"` // CollectionRate is the rate at which the metrics are meant to be // collected. CollectionRateNanos uint64 `json:"collection_rate"` // CheapStartNanos is the time at which the collector started in nanoseconds, // as returned by CheapNowNano. 
CheapStartNanos uint64 `json:"cheap_start_nanos"` // CheapLastCollectionNanos is the time at which the last collection was // meant to be performed, in nanoseconds as returned by CheapNowNano. CheapLastCollectionNanos uint64 `json:"cheap_last_collection_nanos"` // TotalSnapshots is the total number of snapshots successfully taken. TotalSnapshots uint64 `json:"total_snapshots"` // TotalSleepTimingError is the running sum of absolute difference in timing // between when the collector was meant to start collecting a metric snapshot // vs when it actually collected it. It should be divided by TotalSnapshots // to get the average sleep timing error. TotalSleepTimingErrorNanos uint64 `json:"total_sleep_timing_error"` // TotalCollectionTimingError is the running sum of time spent doing actual // metric collection, i.e. retrieving numerical values from metrics. // The larger this duration is, the more time gap there is between the first // metric being collected and the last within a single metric collection // cycle. This can cause the metric data to be less accurate because all of // these points will be recorded as having the same timestamp despite this // not actually being the case. // It should be divided by TotalSnapshots to get the average // per-collection-cycle collection timing error. TotalCollectionTimingErrorNanos uint64 `json:"total_collection_timing_error"` // NumBackoffSleeps is the number of times the collector had to back off // because the writer was too slow. NumBackoffSleeps uint64 `json:"num_backoff_sleeps"` // TotalBackoffSleep is the running sum of time the collector had to back // off because the writer was too slow. TotalBackoffSleepNanos uint64 `json:"total_backoff_sleep"` } var ( // profilingMetricsStarted indicates whether StartProfilingMetrics has // been called. profilingMetricsStarted atomicbitops.Bool // stopProfilingMetrics is used to signal to the profiling metrics // goroutine to stop recording and writing metrics. stopProfilingMetrics atomicbitops.Bool // doneProfilingMetrics is used to signal that the profiling metrics // goroutines are finished. It carries information about the stats of // the profiling metrics collection process. doneProfilingMetrics chan *CollectionStats // definedProfilingMetrics is the set of metrics known to be created for // profiling (see condmetric_profiling.go). definedProfilingMetrics []string ) // snapshots is used to as temporary storage of metric data // before it's written to the writer. type snapshots struct { numMetrics int // startTime is the time at which collection started in nanoseconds. startTime int64 // ringbuffer is used to store metric data. ringbuffer [][]uint64 // curWriterIndex is the ringbuffer index currently being read by the // writer. It should not be used by the collector. curWriterIndex atomicbitops.Int32 } // writeReq is the message sent between from the collector to the writer. type writeReq struct { ringbufferIdx int // numLines indicates how many data lines are filled in the buffer. numLines int } // ProfilingMetricsWriter is the interface for profiling metrics sinks. type ProfilingMetricsWriter interface { // Write from the io.Writer interface. io.Writer // WriteString from the io.StringWriter interface. io.StringWriter // Truncate truncates the underlying writer, if possible. Truncate(size int64) error // Close closes the writer. Close() error } // ProfilingMetricsOptions is the set of options to profile metrics. 
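// A minimal usage sketch (illustrative only; the metric names below are
// hypothetical, and an *os.File is used as the sink since it satisfies
// ProfilingMetricsWriter):
//
//	sink, err := os.Create("/tmp/profiling-metrics.log")
//	if err != nil {
//		return err
//	}
//	if err := StartProfilingMetrics(ProfilingMetricsOptions[*os.File]{
//		Sink:    sink,
//		Lossy:   false, // plain TSV output, no per-line checksums
//		Metrics: "example_metric_a,example_metric_b", // hypothetical registered Uint64 metrics
//		Rate:    100 * time.Microsecond,
//	}); err != nil {
//		return err
//	}
//	// ... run the workload ...
//	StopProfilingMetrics() // blocks until all buffered samples are flushed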
type ProfilingMetricsOptions[T ProfilingMetricsWriter] struct { // Sink is the sink to write the profiling metrics data to. Sink T // Lossy specifies whether the sink is lossy, i.e. data may be dropped from // too large logging volume. In this case, data integrity is desirable at the // expense of extra CPU cost at data-writing time. The data will be prefixed // with `MetricsPrefix` and the hash of the data will be appended at the end. Lossy bool // Metrics is the comma-separated list of metrics to profile. Metrics string // Rate is the rate at which the metrics are collected. Rate time.Duration } // StartProfilingMetrics checks the ProfilingMetrics runsc flags and creates // goroutines responsible for outputting the profiling metric data. // // Preconditions: // - All metrics are registered. // - Initialize/Disable has been called. func StartProfilingMetrics[T ProfilingMetricsWriter](opts ProfilingMetricsOptions[T]) error { if !initialized.Load() { // Wait for initialization to complete to make sure that all // metrics are registered. return errors.New("metric initialization is not complete") } var values []func(fieldValues ...*FieldValue) uint64 var headers []string var columnHeaders strings.Builder columnHeaders.WriteString(TimeColumn) numMetrics := 0 if len(opts.Metrics) > 0 { metrics := strings.Split(opts.Metrics, ",") numMetrics = len(metrics) for _, name := range metrics { name := strings.TrimSpace(name) m, ok := allMetrics.uint64Metrics[name] if !ok { return fmt.Errorf("given profiling metric name '%s' does not correspond to a registered Uint64 metric", name) } if len(m.fields) > 0 { // TODO(b/240280155): Add support for field values. return fmt.Errorf("will not profile metric '%s' because it has metric fields which are not supported", name) } var metricMetadataHeader strings.Builder metricMetadataHeader.WriteString(MetricsMetaIndicator) metricMetadataHeader.WriteString(name) metricMetadataHeader.WriteRune('\t') metricMetadata, err := protojson.MarshalOptions{Multiline: false}.Marshal(m.metadata) if err != nil { return fmt.Errorf("failed to marshal metric schema for metric %q: %w", name, err) } metricMetadataHeader.Write(metricMetadata) headers = append(headers, metricMetadataHeader.String()) columnHeaders.WriteRune('\t') columnHeaders.WriteString(name) values = append(values, m.value) } if opts.Lossy { columnHeaders.WriteString("\tChecksum") } } else { if len(definedProfilingMetrics) > 0 { return fmt.Errorf("a value for --profiling-metrics was not specified; consider using a subset of '--profiling-metrics=%s'", strings.Join(definedProfilingMetrics, ",")) } return fmt.Errorf("a value for --profiling-metrics was not specified; also no conditionally compiled metrics found, consider compiling runsc with --go_tag=condmetric_profiling") } headers = append( headers, fmt.Sprintf("%s%d", MetricsStartTimeIndicator, time.Now().UnixNano()), columnHeaders.String(), ) if !profilingMetricsStarted.CompareAndSwap(false, true) { return errors.New("profiling metrics have already been started") } s := snapshots{ numMetrics: numMetrics, ringbuffer: make([][]uint64, snapshotRingbufferSize), // curWriterIndex is initialized to a valid index so that the // collector cannot use up all indices before the writer even has // a chance to start (as unlikely as that is). 
curWriterIndex: atomicbitops.FromInt32(snapshotRingbufferSize - 1), } for i := 0; i < snapshotRingbufferSize; i++ { s.ringbuffer[i] = make([]uint64, snapshotBufferSize*(numMetrics+1)) } // Truncate the underlying sink if possible to delete any past profiling // data in the file, if any, as it makes no sense to concatenate them or // to overwrite them in-place. // We ignore errors here because the sink may not be truncatable, // e.g. when it is pointing to the stdout FD. _ = opts.Sink.Truncate(0) stopProfilingMetrics = atomicbitops.FromBool(false) doneProfilingMetrics = make(chan *CollectionStats, 1) writeCh := make(chan writeReq, snapshotRingbufferSize) s.startTime = time.Now().UnixNano() cheapStartTime := CheapNowNano() stats := CollectionStats{ CollectionRateNanos: uint64(opts.Rate.Nanoseconds()), CheapStartNanos: uint64(cheapStartTime), } go collectProfilingMetrics(&s, values, cheapStartTime, opts.Rate, writeCh, &stats) if opts.Lossy { lossySink := newLossyBufferedWriter(opts.Sink) go writeProfilingMetrics[*lossyBufferedWriter[T]](lossySink, &s, headers, writeCh, &stats) } else { bufferedSink := newBufferedWriter(opts.Sink) go writeProfilingMetrics[*bufferedWriter[T]](bufferedSink, &s, headers, writeCh, &stats) } log.Infof("Profiling metrics started.") return nil } // collectProfilingMetrics will send metrics to the writeCh until it receives a // signal via the stopProfilingMetrics channel. func collectProfilingMetrics(s *snapshots, values []func(fieldValues ...*FieldValue) uint64, cheapStartTime int64, profilingRate time.Duration, writeCh chan<- writeReq, stats *CollectionStats) { defer close(writeCh) stats.mu.Lock() defer stats.mu.Unlock() numEntries := s.numMetrics + 1 // to account for the timestamp ringbufferIdx := 0 curSnapshot := 0 var beforeCollectionTimestamp int64 // If we write faster than the writer can keep up, we back off. // The backoff factor starts small but increases exponentially // each time we find that we are still faster than the writer. const ( // How much slower than the profiling rate we sleep for, as a // multiplier for the profiling rate. initialBackoffFactor = 1.0 // The exponential factor by which the backoff factor increases. backoffFactorGrowth = 1.125 // The maximum backoff factor, i.e. the maximum multiplier of // the profiling rate for which we sleep. backoffFactorMax = 256.0 ) backoffFactor := initialBackoffFactor stopCollecting := false for nextCollection := cheapStartTime; !stopCollecting; nextCollection += profilingRate.Nanoseconds() { if stopProfilingMetrics.Load() { stopCollecting = true stats.CheapLastCollectionNanos = uint64(nextCollection) // Collect one last time before stopping. } // For small durations, just spin (and maybe yield). Otherwise sleep. for { const ( // When the next collection time is closer than `spinMaxNanos` away, // we will spin in place waiting for the collection time to come. // If it is further away, see `yieldMaxNanos`. spinMaxNanos = 50_000 // When the next collection time is closer than `yieldMaxNanos` away, // we will continuously call `runtime.Gosched` until the collection // time comes. // If it is further away, we will call `time.Sleep` (but see // `wakeUpNanos`). // Look at your kernel's CONFIG_HZ configuration to see what a good // lower bound for this value should be. yieldMaxNanos = 2_500_000 // When we decide to call `time.Sleep`, `wakeUpNanos` is the amount of // time to *undersleep* by passed to `time.Sleep`, such that we are // likely to wake up a bit before the actual next collection time. 
wakeUpNanos = 100_000 ) beforeCollectionTimestamp = CheapNowNano() nanosToNextCollection := nextCollection - beforeCollectionTimestamp if nanosToNextCollection <= 0 { // Collect now. break } if nanosToNextCollection < spinMaxNanos { continue // Spin. } if nanosToNextCollection < yieldMaxNanos { // Yield then spin. runtime.Gosched() continue } // Sleep. time.Sleep(time.Duration(nanosToNextCollection-wakeUpNanos) * time.Nanosecond) } ringBuf := s.ringbuffer[ringbufferIdx] base := curSnapshot * numEntries for i := 1; i < numEntries; i++ { ringBuf[base+i] = values[i-1]() } afterCollectionTimestamp := CheapNowNano() middleCollectionTimestamp := (beforeCollectionTimestamp + afterCollectionTimestamp) / 2 ringBuf[base] = uint64(middleCollectionTimestamp - cheapStartTime) curSnapshot++ stats.TotalSnapshots++ stats.TotalSleepTimingErrorNanos += uint64(max(nextCollection-beforeCollectionTimestamp, beforeCollectionTimestamp-nextCollection)) stats.TotalCollectionTimingErrorNanos += uint64(afterCollectionTimestamp - beforeCollectionTimestamp) if curSnapshot == snapshotBufferSize { writeCh <- writeReq{ringbufferIdx: ringbufferIdx, numLines: curSnapshot} curSnapshot = 0 // Block until the writer indicates that this part of the ringbuffer // is available for writing. for ringbufferIdx = (ringbufferIdx + 1) % snapshotRingbufferSize; ringbufferIdx == int(s.curWriterIndex.Load()); { // Going too fast, stop collecting for a bit. backoffSleep := profilingRate * time.Duration(backoffFactor) log.Warningf("Profiling metrics collector exhausted the entire ringbuffer... backing off for %v to let writer catch up.", backoffSleep) stats.NumBackoffSleeps++ stats.TotalBackoffSleepNanos += uint64(backoffSleep.Nanoseconds()) time.Sleep(backoffSleep) backoffFactor = min(backoffFactor*backoffFactorGrowth, backoffFactorMax) } } } if curSnapshot != 0 { writeCh <- writeReq{ringbufferIdx: ringbufferIdx, numLines: curSnapshot} } } // bufferedMetricsWriter is a ProfilingMetricsWriter that buffers data // before writing it to some underlying writer. type bufferedMetricsWriter interface { // We inherit from the ProfilingMetricsWriter interface. // Note however that calls to WriteString should *not* contain any // newline character, unless called through NewLine. ProfilingMetricsWriter // NewLine writes a newline character to the buffer. // The writer may decide to flush the buffer at this point. NewLine() // Flush flushes the buffer to the underlying writer. Flush() } const ( // Buffer size reasonable to use for a single line of metric data. lineBufSize = 4 * 1024 // 4 KiB // Buffer size for a buffered write to an underlying sink. bufSize = 984 * 1024 // 984 KiB // Number of lines to buffer before flushing to the underlying sink // by a line-buffered writer. bufferedLines = bufSize / lineBufSize ) // bufferedWriter is a buffered metrics writer that wraps an underlying // ProfilingMetricsWriter. // It implements `bufferedMetricsWriter`. type bufferedWriter[T ProfilingMetricsWriter] struct { buf bytes.Buffer underlying T } func newBufferedWriter[T ProfilingMetricsWriter](underlying T) *bufferedWriter[T] { w := &bufferedWriter[T]{underlying: underlying} w.buf.Grow(bufSize + lineBufSize) return w } // Write implements bufferedMetricsWriter.Write. func (w *bufferedWriter[T]) Write(s []byte) (int, error) { return w.buf.Write(s) } // WriteString implements bufferedMetricsWriter.WriteString. func (w *bufferedWriter[T]) WriteString(s string) (int, error) { return w.buf.WriteString(s) } // NewLine implements bufferedMetricsWriter.NewLine. 
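// The writer goroutine emits one metric snapshot per line by interleaving
// WriteString and NewLine calls; a rough sketch of that pattern (illustrative
// values only, the real writer formats integers via prometheus.WriteInteger):
//
//	w.WriteString("123456789") // timestamp column, nanoseconds since start
//	w.WriteString("\t42")      // one tab-separated column per profiled metric
//	w.NewLine()                // terminates the line; flushes once bufSize is reached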
func (w *bufferedWriter[T]) NewLine() { w.buf.WriteString("\n") if w.buf.Len() >= bufSize { w.Flush() } } // Flush implements bufferedMetricsWriter.Flush. func (w *bufferedWriter[T]) Flush() { w.underlying.WriteString(w.buf.String()) w.buf.Reset() } // Truncate implements bufferedMetricsWriter.Truncate. func (w *bufferedWriter[T]) Truncate(size int64) error { return w.underlying.Truncate(size) } // Close implements bufferedMetricsWriter.Close. func (w *bufferedWriter[T]) Close() error { w.Flush() return w.underlying.Close() } // lossyBufferedWriter writes to an underlying ProfilingMetricsWriter // and buffers data on a per-line basis. It adds a prefix to every line, // and keeps track of the checksum of the data it has written (which is then // also written to the underlying writer on `Close()`). // The checksum covers all of the per-line data written after the line prefix, // including the newline character of these lines, with the exception of // the checksum data line itself. // All lines are also checksummed individually, with the checksum covering // the contents of the line after the line prefix but before the tab and // line checksum itself at the end of the line. // `lossyBufferedWriter` implements `bufferedMetricsWriter`. type lossyBufferedWriter[T ProfilingMetricsWriter] struct { lineBuf bytes.Buffer flushBuf bytes.Buffer lineHasher hash.Hash32 overallHasher hash.Hash32 lines int longestLine int underlying T } // newLossyBufferedWriter creates a new lossyBufferedWriter. func newLossyBufferedWriter[T ProfilingMetricsWriter](underlying T) *lossyBufferedWriter[T] { w := &lossyBufferedWriter[T]{ underlying: underlying, lineHasher: adler32.New(), overallHasher: adler32.New(), longestLine: lineBufSize, } w.lineBuf.Grow(lineBufSize) // `lineBufSize + 1` to account for the newline at the end of each line. // `+ 2` to account for the newline at the beginning and end of each flush. w.flushBuf.Grow((lineBufSize+1)*bufferedLines + 2) w.flushBuf.WriteString("\n") return w } // Write implements bufferedMetricsWriter.Write. func (w *lossyBufferedWriter[T]) Write(s []byte) (int, error) { return w.lineBuf.Write(s) } // WriteString implements bufferedMetricsWriter.WriteString. func (w *lossyBufferedWriter[T]) WriteString(s string) (int, error) { return w.lineBuf.WriteString(s) } // Flush implements bufferedMetricsWriter.Flush. func (w *lossyBufferedWriter[T]) Flush() { if w.lines > 0 { // Ensure that we write a complete line atomically, as this // may get parsed while being mixed with other logs that may not // have clean line endings a the time we print this. w.flushBuf.WriteString("\n") w.underlying.WriteString(w.flushBuf.String()) if f, isFile := any(w.underlying).(*os.File); isFile { // If we're dealing with a file, also call `sync(2)`. f.Sync() } w.flushBuf.Reset() w.flushBuf.WriteString("\n") w.lines = 0 } } // NewLine implements bufferedMetricsWriter.NewLine. 
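// In the lossy format, each call therefore emits one self-describing line of
// the form (tab-separated, illustrative values only):
//
//	GVISOR_METRICS\t<time>\t<metric1>\t...\t0x<adler32 of the line content>
//
// where the per-line checksum covers the bytes between the prefix and the
// trailing "\t0x..." field. A reader-side verification sketch (assuming the
// hexadecimal checksum has already been parsed into parsedLineHash):
//
//	payload := line[len(MetricsPrefix):strings.LastIndex(line, "\t0x")]
//	ok := adler32.Checksum([]byte(payload)) == parsedLineHash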
func (w *lossyBufferedWriter[T]) NewLine() { if lineLen := w.lineBuf.Len(); lineLen > w.longestLine { wantTotalSize := (lineLen+1)*bufferedLines + 2 if growBy := wantTotalSize - w.flushBuf.Len(); growBy > 0 { w.flushBuf.Grow(growBy) } w.longestLine = lineLen } line := w.lineBuf.String() w.lineHasher.Reset() w.lineHasher.Write([]byte(line)) lineHash := w.lineHasher.Sum32() w.lineBuf.Reset() w.flushBuf.WriteString(MetricsPrefix) beforeLineIndex := w.flushBuf.Len() w.flushBuf.WriteString(line) w.flushBuf.WriteString("\t0x") prometheus.WriteHex(&w.flushBuf, uint64(lineHash)) w.flushBuf.WriteString("\n") afterLineIndex := w.flushBuf.Len() // We ignore the effects that partial writes on the underlying writer // would have on the hash computation here. // This is OK because the goal of this writer is speed over correctness, // and correctness is enforced by the reader of this data checking the // hash at the end. w.overallHasher.Write(w.flushBuf.Bytes()[beforeLineIndex:afterLineIndex]) w.lineBuf.Reset() w.lines++ if w.lines >= bufferedLines || w.flushBuf.Len() >= bufSize { w.Flush() } } // Truncate implements bufferedMetricsWriter.Truncate. func (w *lossyBufferedWriter[T]) Truncate(size int64) error { return w.underlying.Truncate(size) } // Close implements bufferedMetricsWriter.Close. // It writes the checksum of the data written to the underlying writer. func (w *lossyBufferedWriter[T]) Close() error { w.Flush() w.flushBuf.WriteString(MetricsPrefix) w.flushBuf.WriteString(MetricsHashIndicator) w.flushBuf.WriteString("0x") prometheus.WriteHex(&w.flushBuf, uint64(w.overallHasher.Sum32())) w.flushBuf.WriteString("\n") w.underlying.WriteString(w.flushBuf.String()) w.overallHasher.Reset() w.lineBuf.Reset() w.flushBuf.Reset() return w.underlying.Close() } // writeProfilingMetrics will write to the ProfilingMetricsWriter on every // request via writeReqs, until writeReqs is closed. func writeProfilingMetrics[T bufferedMetricsWriter](sink T, s *snapshots, headers []string, writeReqs <-chan writeReq, stats *CollectionStats) { numEntries := s.numMetrics + 1 for _, header := range headers { sink.WriteString(header) sink.NewLine() } for req := range writeReqs { s.curWriterIndex.Store(int32(req.ringbufferIdx)) ringBuf := s.ringbuffer[req.ringbufferIdx] for i := 0; i < req.numLines; i++ { base := i * numEntries // Write the time prometheus.WriteInteger(sink, int64(ringBuf[base])) // Then everything else for j := 1; j < numEntries; j++ { sink.WriteString("\t") prometheus.WriteInteger(sink, int64(ringBuf[base+j])) } sink.NewLine() } } sink.WriteString(MetricsStatsIndicator) stats.WriteTo(sink) sink.NewLine() sink.Close() doneProfilingMetrics <- stats close(doneProfilingMetrics) profilingMetricsStarted.Store(false) } // Log logs some statistics about the profiling metrics collection process. 
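// As a worked example (illustrative numbers): a 10s capture at a 100µs
// collection rate expects 100,000 snapshots; if only 95,000 were taken, the
// capture rate is 95%, which is below the 99% threshold, so the slow-writer
// warnings below are emitted.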
func (s *CollectionStats) Log(infoFn func(format string, val ...any), warningFn func(format string, val ...any)) { s.mu.Lock() defer s.mu.Unlock() captureDuration := time.Duration(s.CheapLastCollectionNanos-s.CheapStartNanos) * time.Nanosecond collectionRate := time.Duration(s.CollectionRateNanos) * time.Nanosecond totalSleepTimingError := time.Duration(s.TotalSleepTimingErrorNanos) * time.Nanosecond totalCollectionTimingError := time.Duration(s.TotalCollectionTimingErrorNanos) * time.Nanosecond totalBackoffSleep := time.Duration(s.TotalBackoffSleepNanos) * time.Nanosecond expectedSnapshots := uint64(captureDuration / collectionRate) if s.TotalSnapshots == expectedSnapshots+1 { // Depending on the timing of when the stop signal was sent, the // collection goroutine is expected to do an extra collection cycle, // so we add one to the expected number of snapshots here to make it not // look like the capture rate is >100%. expectedSnapshots = s.TotalSnapshots } captureRate := 0.0 if expectedSnapshots > 0 { captureRate = float64(s.TotalSnapshots) / float64(expectedSnapshots) } if captureRate < .99 { warningFn("Captured %d snapshots out of %d expected (%.2f%% capture rate) over %v.", s.TotalSnapshots, expectedSnapshots, captureRate*100.0, captureDuration) warningFn("This indicates that the profiling metrics writer is not keeping up with the metrics collection rate.") warningFn("Ensure that the profiling metrics log is stored on a fast storage device, or consider reducing the metric profiling rate.") } else { infoFn("Captured %d snapshots out of %d expected (%.2f%% capture rate) over %v. This is acceptable.", s.TotalSnapshots, expectedSnapshots, captureRate*100.0, captureDuration) } averageSleepTimingError := totalSleepTimingError / time.Duration(s.TotalSnapshots) sleepTimingErrorVsRate := float64(averageSleepTimingError) / float64(collectionRate) if sleepTimingErrorVsRate > .1 { warningFn("Average sleep timing error is high: %v (%.2f%% of the collection interval).", averageSleepTimingError, sleepTimingErrorVsRate*100.0) warningFn("This means the profiling metrics collector is not waking up at the correct time to collect the next snapshot.") warningFn("This may mean that the CPU is overloaded (e.g. from other processes running on the same machine or from the workload itself taking up all the cores).") warningFn("Consider using a slower profiling rate, removing other background processes, or tweaking the sleep consts in profiling_metric.go.") } else { infoFn("Average sleep timing error: %v (%.2f%% of the collection interval). 
This is acceptable.", averageSleepTimingError, sleepTimingErrorVsRate*100.0) } averageCollectionTimingError := totalCollectionTimingError / time.Duration(s.TotalSnapshots) collectionTimingErrorVsRate := float64(averageCollectionTimingError) / float64(collectionRate) if collectionTimingErrorVsRate > .1 { warningFn("Average collection timing error is high: %v (%.2f%% of the collection interval).", averageCollectionTimingError, collectionTimingErrorVsRate*100.0) warningFn("This means the time between getting the value of the first metric vs the last metric within a single collection cycle is a too large fraction of the profiling rate.") warningFn("This means there is significant drift between the time a value is reported as having vs the time it was actually scraped, relative to the profiling interval.") warningFn("Consider using a slower profiling rate or profiling fewer metrics at a time.") } else { infoFn("Average collection timing error: %v (%.2f%% of the collection interval). This is acceptable.", averageCollectionTimingError, collectionTimingErrorVsRate*100.0) } if s.NumBackoffSleeps > 0 { ratioLostToBackoff := float64(totalBackoffSleep) / float64(captureDuration) if ratioLostToBackoff > .05 { warningFn("Backed off %d times due to slow writer; total %v spent in backoff sleep (%.2f%% of the capture duration).") warningFn("This indicates that the profiling metrics writer is not keeping up with the metrics collection rate.") warningFn("Ensure that the profiling metrics log is stored on a fast storage device, or consider reducing the metric profiling rate.") } else { infoFn("Backed off %d times due to slow writer; total %v spent in backoff sleep (%.2f%% of the capture duration). This is acceptable.", s.NumBackoffSleeps, totalBackoffSleep, ratioLostToBackoff*100.0) } } } // WriteTo writes the statistics about the profiling metrics collection process // to the given io.Writer. func (s *CollectionStats) WriteTo(w io.Writer) (int64, error) { s.mu.Lock() marshalled, err := json.Marshal(s) s.mu.Unlock() if err != nil { return 0, err } n, err := w.Write(marshalled) return int64(n), err } // ParseCollectionStats parses the profiling metrics collection stats from the // given line. func ParseCollectionStats(line string) (*CollectionStats, error) { line = strings.TrimPrefix(line, MetricsStatsIndicator) var stats CollectionStats if err := json.Unmarshal([]byte(line), &stats); err != nil { return nil, err } return &stats, nil } // StopProfilingMetrics stops the profiling metrics goroutines. Call to make sure // all metric data has been flushed. // Note that calling this function prior to StartProfilingMetrics has no effect. func StopProfilingMetrics() { if !profilingMetricsStarted.Load() { return } if !stopProfilingMetrics.CompareAndSwap(false, true) { // If the CAS fails, this means the signal was already sent, // so don't wait on doneProfilingMetrics. return } stats := <-doneProfilingMetrics log.Infof("Profiling metrics stopped.") stats.Log(func(format string, val ...any) { log.Infof("Profiling metrics: "+format, val...) }, func(format string, val ...any) { log.Warningf("Profiling metrics: "+format, val...) }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/metric/sentry_profiling.go000066400000000000000000000016251465435605700246370ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build sentry_profiling // +build sentry_profiling package metric // SentryProfiling is a builder that produces conditionally compiled metrics. // Metrics made from this are compiled and active at runtime when the // "sentry_profiling" go-tag is specified at compilation. var SentryProfiling = RealMetricBuilder{} golang-gvisor-gvisor-0.0~20240729.0/pkg/metric/sentry_profiling_fake.go000066400000000000000000000016271465435605700256270ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !sentry_profiling // +build !sentry_profiling package metric // SentryProfiling is a builder that produces conditionally compiled metrics. // Metrics made from this are compiled and active at runtime when the // "sentry_profiling" go-tag is specified at compilation. var SentryProfiling = FakeMetricBuilder{} golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/000077500000000000000000000000001465435605700177545ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/buffer.go000066400000000000000000000136041465435605700215600ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( "encoding/binary" ) // encoder is used for messages and 9P primitives. type encoder interface { // decode decodes from the given buffer. decode may be called more than once // to reuse the instance. It must clear any previous state. // // This may not fail, exhaustion will be recorded in the buffer. decode(b *buffer) // encode encodes to the given buffer. // // This may not fail. encode(b *buffer) } // order is the byte order used for encoding. var order = binary.LittleEndian // buffer is a slice that is consumed. // // This is passed to the encoder methods. type buffer struct { // data is the underlying data. This may grow during encode. data []byte // overflow indicates whether an overflow has occurred. overflow bool } // append appends n bytes to the buffer and returns a slice pointing to the // newly appended bytes. 
func (b *buffer) append(n int) []byte { b.data = append(b.data, make([]byte, n)...) return b.data[len(b.data)-n:] } // consume consumes n bytes from the buffer. func (b *buffer) consume(n int) ([]byte, bool) { if !b.has(n) { b.markOverrun() return nil, false } rval := b.data[:n] b.data = b.data[n:] return rval, true } // has returns true if n bytes are available. func (b *buffer) has(n int) bool { return len(b.data) >= n } // markOverrun immediately marks this buffer as overrun. // // This is used by ReadString, since some invalid data implies the rest of the // buffer is no longer valid either. func (b *buffer) markOverrun() { b.overflow = true } // isOverrun returns true if this buffer has run past the end. func (b *buffer) isOverrun() bool { return b.overflow } // Read8 reads a byte from the buffer. func (b *buffer) Read8() uint8 { v, ok := b.consume(1) if !ok { return 0 } return uint8(v[0]) } // Read16 reads a 16-bit value from the buffer. func (b *buffer) Read16() uint16 { v, ok := b.consume(2) if !ok { return 0 } return order.Uint16(v) } // Read32 reads a 32-bit value from the buffer. func (b *buffer) Read32() uint32 { v, ok := b.consume(4) if !ok { return 0 } return order.Uint32(v) } // Read64 reads a 64-bit value from the buffer. func (b *buffer) Read64() uint64 { v, ok := b.consume(8) if !ok { return 0 } return order.Uint64(v) } // ReadQIDType reads a QIDType value. func (b *buffer) ReadQIDType() QIDType { return QIDType(b.Read8()) } // ReadTag reads a Tag value. func (b *buffer) ReadTag() Tag { return Tag(b.Read16()) } // ReadFID reads a FID value. func (b *buffer) ReadFID() FID { return FID(b.Read32()) } // ReadUID reads a UID value. func (b *buffer) ReadUID() UID { return UID(b.Read32()) } // ReadGID reads a GID value. func (b *buffer) ReadGID() GID { return GID(b.Read32()) } // ReadPermissions reads a file mode value and applies the mask for permissions. func (b *buffer) ReadPermissions() FileMode { return b.ReadFileMode() & permissionsMask } // ReadFileMode reads a file mode value. func (b *buffer) ReadFileMode() FileMode { return FileMode(b.Read32()) } // ReadOpenFlags reads an OpenFlags. func (b *buffer) ReadOpenFlags() OpenFlags { return OpenFlags(b.Read32()) } // ReadSocketType reads a SocketType. func (b *buffer) ReadSocketType() SocketType { return SocketType(b.Read32()) } // ReadMsgType writes a MsgType. func (b *buffer) ReadMsgType() MsgType { return MsgType(b.Read8()) } // ReadString deserializes a string. func (b *buffer) ReadString() string { l := b.Read16() if !b.has(int(l)) { // Mark the buffer as corrupted. b.markOverrun() return "" } bs := make([]byte, l) for i := 0; i < int(l); i++ { bs[i] = byte(b.Read8()) } return string(bs) } // Write8 writes a byte to the buffer. func (b *buffer) Write8(v uint8) { b.append(1)[0] = byte(v) } // Write16 writes a 16-bit value to the buffer. func (b *buffer) Write16(v uint16) { order.PutUint16(b.append(2), v) } // Write32 writes a 32-bit value to the buffer. func (b *buffer) Write32(v uint32) { order.PutUint32(b.append(4), v) } // Write64 writes a 64-bit value to the buffer. func (b *buffer) Write64(v uint64) { order.PutUint64(b.append(8), v) } // WriteQIDType writes a QIDType value. func (b *buffer) WriteQIDType(qidType QIDType) { b.Write8(uint8(qidType)) } // WriteTag writes a Tag value. func (b *buffer) WriteTag(tag Tag) { b.Write16(uint16(tag)) } // WriteFID writes a FID value. func (b *buffer) WriteFID(fid FID) { b.Write32(uint32(fid)) } // WriteUID writes a UID value. 
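// Like the other fixed-width helpers in this file it has a matching reader,
// so a minimal round-trip sketch looks like:
//
//	var b buffer
//	b.WriteUID(UID(1000))
//	if got := b.ReadUID(); got != UID(1000) || b.isOverrun() {
//		// the encoding/decoding pair is out of sync
//	}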
func (b *buffer) WriteUID(uid UID) { b.Write32(uint32(uid)) } // WriteGID writes a GID value. func (b *buffer) WriteGID(gid GID) { b.Write32(uint32(gid)) } // WritePermissions applies a permissions mask and writes the FileMode. func (b *buffer) WritePermissions(perm FileMode) { b.WriteFileMode(perm & permissionsMask) } // WriteFileMode writes a FileMode. func (b *buffer) WriteFileMode(mode FileMode) { b.Write32(uint32(mode)) } // WriteOpenFlags writes an OpenFlags. func (b *buffer) WriteOpenFlags(flags OpenFlags) { b.Write32(uint32(flags)) } // WriteSocketType writes a SocketType. func (b *buffer) WriteSocketType(flags SocketType) { b.Write32(uint32(flags)) } // WriteMsgType writes a MsgType. func (b *buffer) WriteMsgType(t MsgType) { b.Write8(uint8(t)) } // WriteString serializes the given string. func (b *buffer) WriteString(s string) { b.Write16(uint16(len(s))) for i := 0; i < len(s); i++ { b.Write8(byte(s[i])) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/client.go000066400000000000000000000370401465435605700215650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( "errors" "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/pool" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) // ErrOutOfTags indicates no tags are available. var ErrOutOfTags = errors.New("out of tags -- messages lost?") // ErrOutOfFIDs indicates no more FIDs are available. var ErrOutOfFIDs = errors.New("out of FIDs -- messages lost?") // ErrUnexpectedTag indicates a response with an unexpected tag was received. var ErrUnexpectedTag = errors.New("unexpected tag in response") // ErrVersionsExhausted indicates that all versions to negotiate have been exhausted. var ErrVersionsExhausted = errors.New("exhausted all versions to negotiate") // ErrBadVersionString indicates that the version string is malformed or unsupported. var ErrBadVersionString = errors.New("bad version string") // ErrBadResponse indicates the response didn't match the request. type ErrBadResponse struct { Got MsgType Want MsgType } // Error returns a highly descriptive error. func (e *ErrBadResponse) Error() string { return fmt.Sprintf("unexpected message type: got %v, want %v", e.Got, e.Want) } // response is the asynchronous return from recv. // // This is used in the pending map below. type response struct { r message done chan error } var responsePool = sync.Pool{ New: func() any { return &response{ done: make(chan error, 1), } }, } // Client is at least a 9P2000.L client. type Client struct { // socket is the connected socket. socket *unet.Socket // tagPool is the collection of available tags. tagPool pool.Pool // fidPool is the collection of available fids. fidPool pool.Pool // messageSize is the maximum total size of a message. messageSize uint32 // payloadSize is the maximum payload size of a read or write. 
// // For large reads and writes this means that the read or write is // broken up into buffer-size/payloadSize requests. payloadSize uint32 // version is the agreed upon version X of 9P2000.L.Google.X. // version 0 implies 9P2000.L. version uint32 // closedWg is marked as done when the Client.watch() goroutine, which is // responsible for closing channels and the socket fd, returns. closedWg sync.WaitGroup // sendRecv is the transport function. // // This is determined dynamically based on whether or not the server // supports flipcall channels (preferred as it is faster and more // efficient, and does not require tags). sendRecv func(message, message) error // -- below corresponds to sendRecvChannel -- // channelsMu protects channels. channelsMu sync.Mutex // channelsWg counts the number of channels for which channel.active == // true. channelsWg sync.WaitGroup // channels is the set of all initialized channels. channels []*channel // availableChannels is a LIFO of inactive channels. availableChannels []*channel // -- below corresponds to sendRecvLegacy -- // pending is the set of pending messages. pending map[Tag]*response pendingMu sync.Mutex // sendMu is the lock for sending a request. sendMu sync.Mutex // recvr is essentially a mutex for calling recv. // // Whoever writes to this channel is permitted to call recv. When // finished calling recv, this channel should be emptied. recvr chan bool } // NewClient creates a new client. It performs a Tversion exchange with // the server to assert that messageSize is ok to use. // // If NewClient succeeds, ownership of socket is transferred to the new Client. func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client, error) { // Need at least one byte of payload. if messageSize <= msgRegistry.largestFixedSize { return nil, &ErrMessageTooLarge{ size: messageSize, msize: msgRegistry.largestFixedSize, } } // Compute a payload size and round to 512 (normal block size) // if it's larger than a single block. payloadSize := messageSize - msgRegistry.largestFixedSize if payloadSize > 512 && payloadSize%512 != 0 { payloadSize -= (payloadSize % 512) } c := &Client{ socket: socket, tagPool: pool.Pool{Start: 1, Limit: uint64(NoTag)}, fidPool: pool.Pool{Start: 1, Limit: uint64(NoFID)}, pending: make(map[Tag]*response), recvr: make(chan bool, 1), messageSize: messageSize, payloadSize: payloadSize, } // Agree upon a version. requested, ok := parseVersion(version) if !ok { return nil, ErrBadVersionString } for { // Always exchange the version using the legacy version of the // protocol. If the protocol supports flipcall, then we switch // our sendRecv function to use that functionality. Otherwise, // we stick to sendRecvLegacy. rversion := Rversion{} _, err := c.sendRecvLegacy(&Tversion{ Version: versionString(requested), MSize: messageSize, }, &rversion) // The server told us to try again with a lower version. if err == unix.EAGAIN { if requested == lowestSupportedVersion { return nil, ErrVersionsExhausted } requested-- continue } // We requested an impossible version or our other parameters were bogus. if err != nil { return nil, err } // Parse the version. version, ok := parseVersion(rversion.Version) if !ok { // The server gave us a bad version. We return a generically worrisome error. log.Warningf("server returned bad version string %q", rversion.Version) return nil, ErrBadVersionString } c.version = version break } // Can we switch to use the more advanced channels and create // independent channels for communication? 
Prefer it if possible. if versionSupportsFlipcall(c.version) { // Attempt to initialize IPC-based communication. for i := 0; i < channelsPerClient; i++ { if err := c.openChannel(i); err != nil { log.Warningf("error opening flipcall channel: %v", err) break // Stop. } } if len(c.channels) >= 1 { // At least one channel created. c.sendRecv = c.sendRecvChannel } else { // Channel setup failed; fallback. c.sendRecv = c.sendRecvLegacySyscallErr } } else { // No channels available: use the legacy mechanism. c.sendRecv = c.sendRecvLegacySyscallErr } // Ensure that the socket and channels are closed when the socket is shut // down. c.closedWg.Add(1) go c.watch(socket) // S/R-SAFE: not relevant. return c, nil } // watch watches the given socket and releases resources on hangup events. // // This is intended to be called as a goroutine. func (c *Client) watch(socket *unet.Socket) { defer c.closedWg.Done() events := []unix.PollFd{ { Fd: int32(socket.FD()), Events: unix.POLLHUP | unix.POLLRDHUP, }, } // Wait for a shutdown event. for { n, err := unix.Ppoll(events, nil, nil) if err == unix.EINTR || err == unix.EAGAIN { continue } if err != nil { log.Warningf("p9.Client.watch(): %v", err) break } if n != 1 { log.Warningf("p9.Client.watch(): got %d events, wanted 1", n) } break } // Set availableChannels to nil so that future calls to c.sendRecvChannel() // don't attempt to activate a channel, and concurrent calls to // c.sendRecvChannel() don't mark released channels as available. c.channelsMu.Lock() c.availableChannels = nil // Shut down all active channels. for _, ch := range c.channels { if ch.active { log.Debugf("shutting down active channel@%p...", ch) ch.Shutdown() } } c.channelsMu.Unlock() // Wait for active channels to become inactive. c.channelsWg.Wait() // Close all channels. c.channelsMu.Lock() for _, ch := range c.channels { ch.Close() } c.channelsMu.Unlock() // Close the main socket. c.socket.Close() } // openChannel attempts to open a client channel. // // Note that this function returns naked errors which should not be propagated // directly to a caller. It is expected that the errors will be logged and a // fallback path will be used instead. func (c *Client) openChannel(id int) error { var ( rchannel0 Rchannel rchannel1 Rchannel res = new(channel) ) // Open the data channel. if _, err := c.sendRecvLegacy(&Tchannel{ ID: uint32(id), Control: 0, }, &rchannel0); err != nil { return fmt.Errorf("error handling Tchannel message: %v", err) } if rchannel0.FilePayload() == nil { return fmt.Errorf("missing file descriptor on primary channel") } // We don't need to hold this. defer rchannel0.FilePayload().Close() // Open the channel for file descriptors. if _, err := c.sendRecvLegacy(&Tchannel{ ID: uint32(id), Control: 1, }, &rchannel1); err != nil { return err } if rchannel1.FilePayload() == nil { return fmt.Errorf("missing file descriptor on file descriptor channel") } // Construct the endpoints. res.desc = flipcall.PacketWindowDescriptor{ FD: rchannel0.FilePayload().FD(), Offset: int64(rchannel0.Offset), Length: int(rchannel0.Length), } if err := res.data.Init(flipcall.ClientSide, res.desc); err != nil { rchannel1.FilePayload().Close() return err } // The fds channel owns the control payload, and it will be closed when // the channel object is closed. res.fds.Init(rchannel1.FilePayload().Release()) // Save the channel. 
c.channelsMu.Lock() defer c.channelsMu.Unlock() c.channels = append(c.channels, res) c.availableChannels = append(c.availableChannels, res) return nil } // handleOne handles a single incoming message. // // This should only be called with the token from recvr. Note that the received // tag will automatically be cleared from pending. func (c *Client) handleOne() { tag, r, err := recv(c.socket, c.messageSize, func(tag Tag, t MsgType) (message, error) { c.pendingMu.Lock() resp := c.pending[tag] c.pendingMu.Unlock() // Not expecting this message? if resp == nil { log.Warningf("client received unexpected tag %v, ignoring", tag) return nil, ErrUnexpectedTag } // Is it an error? We specifically allow this to // go through, and then we deserialize below. if t == MsgRlerror { return &Rlerror{}, nil } // Does it match expectations? if t != resp.r.Type() { return nil, &ErrBadResponse{Got: t, Want: resp.r.Type()} } // Return the response. return resp.r, nil }) if err != nil { // No tag was extracted (probably a socket error). // // Likely catastrophic. Notify all waiters and clear pending. c.pendingMu.Lock() for _, resp := range c.pending { resp.done <- err } clear(c.pending) c.pendingMu.Unlock() } else { // Process the tag. // // We know that is is contained in the map because our lookup function // above must have succeeded (found the tag) to return nil err. c.pendingMu.Lock() resp := c.pending[tag] delete(c.pending, tag) c.pendingMu.Unlock() resp.r = r resp.done <- err } } // waitAndRecv coordinates with other receivers to handle responses. func (c *Client) waitAndRecv(done chan error) error { for { select { case err := <-done: return err case c.recvr <- true: select { case err := <-done: // It's possible that we got the token, despite // done also being available. Check for that. <-c.recvr return err default: // Handle receiving one tag. c.handleOne() // Return the token. <-c.recvr } } } } // sendRecvLegacySyscallErr is a wrapper for sendRecvLegacy that converts all // non-syscall errors to EIO. func (c *Client) sendRecvLegacySyscallErr(t message, r message) error { received, err := c.sendRecvLegacy(t, r) if !received { log.Warningf("p9.Client.sendRecvChannel: %v", err) return unix.EIO } return err } // sendRecvLegacy performs a roundtrip message exchange. // // sendRecvLegacy returns true if a message was received. This allows us to // differentiate between failed receives and successful receives where the // response was an error message. // // This is called by internal functions. func (c *Client) sendRecvLegacy(t message, r message) (bool, error) { tag, ok := c.tagPool.Get() if !ok { return false, ErrOutOfTags } defer c.tagPool.Put(tag) // Indicate we're expecting a response. // // Note that the tag will be cleared from pending // automatically (see handleOne for details). resp := responsePool.Get().(*response) defer responsePool.Put(resp) resp.r = r c.pendingMu.Lock() c.pending[Tag(tag)] = resp c.pendingMu.Unlock() // Send the request over the wire. c.sendMu.Lock() err := send(c.socket, Tag(tag), t) c.sendMu.Unlock() if err != nil { return false, err } // Coordinate with other receivers. if err := c.waitAndRecv(resp.done); err != nil { return false, err } // Is it an error message? // // For convenience, we transform these directly // into errors. Handlers need not handle this case. if rlerr, ok := resp.r.(*Rlerror); ok { return true, unix.Errno(rlerr.Error) } // At this point, we know it matches. 
// // Per recv call above, we will only allow a type // match (and give our r) or an instance of Rlerror. return true, nil } // sendRecvChannel uses channels to send a message. func (c *Client) sendRecvChannel(t message, r message) error { // Acquire an available channel. c.channelsMu.Lock() if len(c.availableChannels) == 0 { c.channelsMu.Unlock() return c.sendRecvLegacySyscallErr(t, r) } idx := len(c.availableChannels) - 1 ch := c.availableChannels[idx] c.availableChannels = c.availableChannels[:idx] ch.active = true c.channelsWg.Add(1) c.channelsMu.Unlock() // Ensure that it's connected. if !ch.connected { ch.connected = true if err := ch.data.Connect(); err != nil { // The channel is unusable, so don't return it to // c.availableChannels. However, we still have to mark it as // inactive so c.watch() doesn't wait for it. c.channelsMu.Lock() ch.active = false c.channelsMu.Unlock() c.channelsWg.Done() // Map all transport errors to EIO, but ensure that the real error // is logged. log.Warningf("p9.Client.sendRecvChannel: flipcall.Endpoint.Connect: %v", err) return unix.EIO } } // Send the request and receive the server's response. rsz, err := ch.send(t, false /* isServer */) if err != nil { // See above. c.channelsMu.Lock() ch.active = false c.channelsMu.Unlock() c.channelsWg.Done() log.Warningf("p9.Client.sendRecvChannel: p9.channel.send: %v", err) return unix.EIO } // Parse the server's response. resp, retErr := ch.recv(r, rsz) if resp == nil { log.Warningf("p9.Client.sendRecvChannel: p9.channel.recv: %v", retErr) retErr = unix.EIO } // Release the channel. c.channelsMu.Lock() ch.active = false // If c.availableChannels is nil, c.watch() has fired and we should not // mark this channel as available. if c.availableChannels != nil { c.availableChannels = append(c.availableChannels, ch) } c.channelsMu.Unlock() c.channelsWg.Done() return retErr } // Version returns the negotiated 9P2000.L.Google version number. func (c *Client) Version() uint32 { return c.version } // Close closes the underlying socket and channels. func (c *Client) Close() { // unet.Socket.Shutdown() has no effect if unet.Socket.Close() has already // been called (by c.watch()). if err := c.socket.Shutdown(); err != nil { log.Warningf("Socket.Shutdown() failed (FD: %d): %v", c.socket.FD(), err) } c.closedWg.Wait() } golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/client_file.go000066400000000000000000000505571465435605700225740ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( "errors" "fmt" "io" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" ) // Attach attaches to a server. // // Note that authentication is not currently supported. 
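// A minimal sketch of typical usage (assuming `socket` is a *unet.Socket that
// is already connected to a 9P server, and using the package's highest
// supported version string):
//
//	client, err := NewClient(socket, 1024*1024 /* messageSize */, HighestVersionString())
//	if err != nil {
//		return err
//	}
//	root, err := client.Attach("/")
//	if err != nil {
//		return err
//	}
//	defer root.Close()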
func (c *Client) Attach(name string) (File, error) { fid, ok := c.fidPool.Get() if !ok { return nil, ErrOutOfFIDs } rattach := Rattach{} if err := c.sendRecv(&Tattach{FID: FID(fid), Auth: Tauth{AttachName: name, AuthenticationFID: NoFID, UID: NoUID}}, &rattach); err != nil { c.fidPool.Put(fid) return nil, err } return c.newFile(FID(fid)), nil } // newFile returns a new client file. func (c *Client) newFile(fid FID) *clientFile { return &clientFile{ client: c, fid: fid, } } // clientFile is provided to clients. // // This proxies all of the interfaces found in file.go. type clientFile struct { DisallowServerCalls // client is the originating client. client *Client // fid is the FID for this file. fid FID // closed indicates whether this file has been closed. closed atomicbitops.Uint32 } // Walk implements File.Walk. func (c *clientFile) Walk(names []string) ([]QID, File, error) { if c.closed.Load() != 0 { return nil, nil, unix.EBADF } fid, ok := c.client.fidPool.Get() if !ok { return nil, nil, ErrOutOfFIDs } rwalk := Rwalk{} if err := c.client.sendRecv(&Twalk{FID: c.fid, NewFID: FID(fid), Names: names}, &rwalk); err != nil { c.client.fidPool.Put(fid) return nil, nil, err } // Return a new client file. return rwalk.QIDs, c.client.newFile(FID(fid)), nil } // WalkGetAttr implements File.WalkGetAttr. func (c *clientFile) WalkGetAttr(components []string) ([]QID, File, AttrMask, Attr, error) { if c.closed.Load() != 0 { return nil, nil, AttrMask{}, Attr{}, unix.EBADF } if !versionSupportsTwalkgetattr(c.client.version) { qids, file, err := c.Walk(components) if err != nil { return nil, nil, AttrMask{}, Attr{}, err } _, valid, attr, err := file.GetAttr(AttrMaskAll()) if err != nil { file.Close() return nil, nil, AttrMask{}, Attr{}, err } return qids, file, valid, attr, nil } fid, ok := c.client.fidPool.Get() if !ok { return nil, nil, AttrMask{}, Attr{}, ErrOutOfFIDs } rwalkgetattr := Rwalkgetattr{} if err := c.client.sendRecv(&Twalkgetattr{FID: c.fid, NewFID: FID(fid), Names: components}, &rwalkgetattr); err != nil { c.client.fidPool.Put(fid) return nil, nil, AttrMask{}, Attr{}, err } // Return a new client file. return rwalkgetattr.QIDs, c.client.newFile(FID(fid)), rwalkgetattr.Valid, rwalkgetattr.Attr, nil } func (c *clientFile) MultiGetAttr(names []string) ([]FullStat, error) { if c.closed.Load() != 0 { return nil, unix.EBADF } if versionSupportsTmultiGetAttr(c.client.version) { rmultigetattr := Rmultigetattr{} if err := c.client.sendRecv(&Tmultigetattr{FID: c.fid, Names: names}, &rmultigetattr); err != nil { return nil, err } return rmultigetattr.Stats, nil } stats := make([]FullStat, 0, len(names)) var start File = c parent := start closeParent := func() { if parent != start { _ = parent.Close() } } defer closeParent() mask := AttrMaskAll() for i, name := range names { if len(name) == 0 && i == 0 { qid, valid, attr, err := parent.GetAttr(mask) if err != nil { return nil, err } stats = append(stats, FullStat{ QID: qid, Valid: valid, Attr: attr, }) continue } qids, child, valid, attr, err := parent.WalkGetAttr([]string{name}) if err != nil { if errors.Is(err, unix.ENOENT) { return stats, nil } return nil, err } closeParent() parent = child stats = append(stats, FullStat{ QID: qids[0], Valid: valid, Attr: attr, }) if attr.Mode.FileType() != ModeDirectory { // Doesn't need to continue if entry is not a dir. Including symlinks // that cannot be followed. break } } return stats, nil } // StatFS implements File.StatFS. 
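// A minimal sketch (assuming `root` is a File returned by Client.Attach):
//
//	_, tmpDir, err := root.Walk([]string{"tmp"})
//	if err != nil {
//		return err
//	}
//	defer tmpDir.Close()
//	fsStat, err := tmpDir.StatFS() // filesystem-wide statistics for that mount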
func (c *clientFile) StatFS() (FSStat, error) { if c.closed.Load() != 0 { return FSStat{}, unix.EBADF } rstatfs := Rstatfs{} if err := c.client.sendRecv(&Tstatfs{FID: c.fid}, &rstatfs); err != nil { return FSStat{}, err } return rstatfs.FSStat, nil } // FSync implements File.FSync. func (c *clientFile) FSync() error { if c.closed.Load() != 0 { return unix.EBADF } return c.client.sendRecv(&Tfsync{FID: c.fid}, &Rfsync{}) } // GetAttr implements File.GetAttr. func (c *clientFile) GetAttr(req AttrMask) (QID, AttrMask, Attr, error) { if c.closed.Load() != 0 { return QID{}, AttrMask{}, Attr{}, unix.EBADF } rgetattr := Rgetattr{} if err := c.client.sendRecv(&Tgetattr{FID: c.fid, AttrMask: req}, &rgetattr); err != nil { return QID{}, AttrMask{}, Attr{}, err } return rgetattr.QID, rgetattr.Valid, rgetattr.Attr, nil } // SetAttr implements File.SetAttr. func (c *clientFile) SetAttr(valid SetAttrMask, attr SetAttr) error { if c.closed.Load() != 0 { return unix.EBADF } return c.client.sendRecv(&Tsetattr{FID: c.fid, Valid: valid, SetAttr: attr}, &Rsetattr{}) } // GetXattr implements File.GetXattr. func (c *clientFile) GetXattr(name string, size uint64) (string, error) { if c.closed.Load() != 0 { return "", unix.EBADF } if !versionSupportsGetSetXattr(c.client.version) { return "", unix.EOPNOTSUPP } rgetxattr := Rgetxattr{} if err := c.client.sendRecv(&Tgetxattr{FID: c.fid, Name: name, Size: size}, &rgetxattr); err != nil { return "", err } return rgetxattr.Value, nil } // SetXattr implements File.SetXattr. func (c *clientFile) SetXattr(name, value string, flags uint32) error { if c.closed.Load() != 0 { return unix.EBADF } if !versionSupportsGetSetXattr(c.client.version) { return unix.EOPNOTSUPP } return c.client.sendRecv(&Tsetxattr{FID: c.fid, Name: name, Value: value, Flags: flags}, &Rsetxattr{}) } // ListXattr implements File.ListXattr. func (c *clientFile) ListXattr(size uint64) (map[string]struct{}, error) { if c.closed.Load() != 0 { return nil, unix.EBADF } if !versionSupportsListRemoveXattr(c.client.version) { return nil, unix.EOPNOTSUPP } rlistxattr := Rlistxattr{} if err := c.client.sendRecv(&Tlistxattr{FID: c.fid, Size: size}, &rlistxattr); err != nil { return nil, err } xattrs := make(map[string]struct{}, len(rlistxattr.Xattrs)) for _, x := range rlistxattr.Xattrs { xattrs[x] = struct{}{} } return xattrs, nil } // RemoveXattr implements File.RemoveXattr. func (c *clientFile) RemoveXattr(name string) error { if c.closed.Load() != 0 { return unix.EBADF } if !versionSupportsListRemoveXattr(c.client.version) { return unix.EOPNOTSUPP } return c.client.sendRecv(&Tremovexattr{FID: c.fid, Name: name}, &Rremovexattr{}) } // Allocate implements File.Allocate. func (c *clientFile) Allocate(mode AllocateMode, offset, length uint64) error { if c.closed.Load() != 0 { return unix.EBADF } if !versionSupportsTallocate(c.client.version) { return unix.EOPNOTSUPP } return c.client.sendRecv(&Tallocate{FID: c.fid, Mode: mode, Offset: offset, Length: length}, &Rallocate{}) } // Remove implements File.Remove. // // N.B. This method is no longer part of the file interface and should be // considered deprecated. func (c *clientFile) Remove() error { // Avoid double close. if !c.closed.CompareAndSwap(0, 1) { return unix.EBADF } // Send the remove message. if err := c.client.sendRecv(&Tremove{FID: c.fid}, &Rremove{}); err != nil { return err } // "It is correct to consider remove to be a clunk with the side effect // of removing the file if permissions allow." 
// https://swtch.com/plan9port/man/man9/remove.html // Return the FID to the pool. c.client.fidPool.Put(uint64(c.fid)) return nil } // Close implements File.Close. func (c *clientFile) Close() error { // Avoid double close. if !c.closed.CompareAndSwap(0, 1) { return unix.EBADF } // Send the close message. if err := c.client.sendRecv(&Tclunk{FID: c.fid}, &Rclunk{}); err != nil { // If an error occurred, we toss away the FID. This isn't ideal, // but I'm not sure what else makes sense in this context. log.Warningf("Tclunk failed, losing FID %v: %v", c.fid, err) return err } // Return the FID to the pool. c.client.fidPool.Put(uint64(c.fid)) return nil } // SetAttrClose implements File.SetAttrClose. func (c *clientFile) SetAttrClose(valid SetAttrMask, attr SetAttr) error { if !versionSupportsTsetattrclunk(c.client.version) { setAttrErr := c.SetAttr(valid, attr) // Try to close file even in case of failure above. Since the state of the // file is unknown to the caller, it will not attempt to close the file // again. if err := c.Close(); err != nil { return err } return setAttrErr } // Avoid double close. if !c.closed.CompareAndSwap(0, 1) { return unix.EBADF } // Send the message. if err := c.client.sendRecv(&Tsetattrclunk{FID: c.fid, Valid: valid, SetAttr: attr}, &Rsetattrclunk{}); err != nil { // If an error occurred, we toss away the FID. This isn't ideal, // but I'm not sure what else makes sense in this context. log.Warningf("Tsetattrclunk failed, losing FID %v: %v", c.fid, err) return err } // Return the FID to the pool. c.client.fidPool.Put(uint64(c.fid)) return nil } // Open implements File.Open. func (c *clientFile) Open(flags OpenFlags) (*fd.FD, QID, uint32, error) { if c.closed.Load() != 0 { return nil, QID{}, 0, unix.EBADF } rlopen := Rlopen{} if err := c.client.sendRecv(&Tlopen{FID: c.fid, Flags: flags}, &rlopen); err != nil { return nil, QID{}, 0, err } return rlopen.File, rlopen.QID, rlopen.IoUnit, nil } func (c *clientFile) Bind(sockType uint32, sockName string, uid UID, gid GID) (File, QID, AttrMask, Attr, error) { if c.closed.Load() != 0 { return nil, QID{}, AttrMask{}, Attr{}, unix.EBADF } if !versionSupportsBind(c.client.version) { return nil, QID{}, AttrMask{}, Attr{}, unix.EOPNOTSUPP } fid, ok := c.client.fidPool.Get() if !ok { return nil, QID{}, AttrMask{}, Attr{}, ErrOutOfFIDs } tbind := Tbind{ SockType: sockType, SockName: sockName, UID: uid, GID: gid, Directory: c.fid, NewFID: FID(fid), } rbind := Rbind{} if err := c.client.sendRecv(&tbind, &rbind); err != nil { c.client.fidPool.Put(fid) return nil, QID{}, AttrMask{}, Attr{}, err } return c.client.newFile(FID(fid)), rbind.QID, rbind.Valid, rbind.Attr, nil } // Connect implements File.Connect. func (c *clientFile) Connect(socketType SocketType) (*fd.FD, error) { if c.closed.Load() != 0 { return nil, unix.EBADF } if !VersionSupportsConnect(c.client.version) { return nil, unix.ECONNREFUSED } rlconnect := Rlconnect{} if err := c.client.sendRecv(&Tlconnect{FID: c.fid, SocketType: socketType}, &rlconnect); err != nil { return nil, err } return rlconnect.File, nil } // chunk applies fn to p in chunkSize-sized chunks until fn returns a partial result, p is // exhausted, or an error is encountered (which may be io.EOF). func chunk(chunkSize uint32, fn func([]byte, uint64) (int, error), p []byte, offset uint64) (int, error) { // Some p9.Clients depend on executing fn on zero-byte buffers. Handle this // as a special case (normally it is fine to short-circuit and return (0, nil)). 
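	// Editor's note on the behavior implemented below: with chunkSize = 4 and
	// len(p) = 10, fn is applied to p[0:4], p[4:8] and p[8:10], with offset
	// advanced by the byte count each call reports; a short count from fn ends
	// the loop early, and a zero-length p is passed straight through to fn
	// exactly once (the special case immediately below).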
if len(p) == 0 { return fn(p, offset) } // total is the cumulative bytes processed. var total int for { var n int var err error // We're done, don't bother trying to do anything more. if total == len(p) { return total, nil } // Apply fn to a chunkSize-sized (or less) chunk of p. if len(p) < total+int(chunkSize) { n, err = fn(p[total:], offset) } else { n, err = fn(p[total:total+int(chunkSize)], offset) } total += n offset += uint64(n) // Return whatever we have processed if we encounter an error. This error // could be io.EOF. if err != nil { return total, err } // Did we get a partial result? If so, return it immediately. if n < int(chunkSize) { return total, nil } // If we received more bytes than we ever requested, this is a problem. if total > len(p) { panic(fmt.Sprintf("bytes completed (%d)) > requested (%d)", total, len(p))) } } } // ReadAt proxies File.ReadAt. func (c *clientFile) ReadAt(p []byte, offset uint64) (int, error) { return chunk(c.client.payloadSize, c.readAt, p, offset) } func (c *clientFile) readAt(p []byte, offset uint64) (int, error) { if c.closed.Load() != 0 { return 0, unix.EBADF } rread := Rread{Data: p} if err := c.client.sendRecv(&Tread{FID: c.fid, Offset: offset, Count: uint32(len(p))}, &rread); err != nil { return 0, err } // The message may have been truncated, or for some reason a new buffer // allocated. This isn't the common path, but we make sure that if the // payload has changed we copy it. See transport.go for more information. if len(p) > 0 && len(rread.Data) > 0 && &rread.Data[0] != &p[0] { copy(p, rread.Data) } // io.EOF is not an error that a p9 server can return. Use POSIX semantics to // return io.EOF manually: zero bytes were returned and a non-zero buffer was used. if len(rread.Data) == 0 && len(p) > 0 { return 0, io.EOF } return len(rread.Data), nil } // WriteAt proxies File.WriteAt. func (c *clientFile) WriteAt(p []byte, offset uint64) (int, error) { return chunk(c.client.payloadSize, c.writeAt, p, offset) } func (c *clientFile) writeAt(p []byte, offset uint64) (int, error) { if c.closed.Load() != 0 { return 0, unix.EBADF } rwrite := Rwrite{} if err := c.client.sendRecv(&Twrite{FID: c.fid, Offset: offset, Data: p}, &rwrite); err != nil { return 0, err } return int(rwrite.Count), nil } // ReadWriterFile wraps a File and implements io.ReadWriter, io.ReaderAt, and io.WriterAt. type ReadWriterFile struct { File File Offset uint64 } // Read implements part of the io.ReadWriter interface. func (r *ReadWriterFile) Read(p []byte) (int, error) { n, err := r.File.ReadAt(p, r.Offset) r.Offset += uint64(n) if err != nil { return n, err } if n == 0 && len(p) > 0 { return n, io.EOF } return n, nil } // ReadAt implements the io.ReaderAt interface. func (r *ReadWriterFile) ReadAt(p []byte, offset int64) (int, error) { n, err := r.File.ReadAt(p, uint64(offset)) if err != nil { return 0, err } if n == 0 && len(p) > 0 { return n, io.EOF } return n, nil } // Write implements part of the io.ReadWriter interface. // // Note that this may return a short write with a nil error. This violates the // contract of io.Writer, but is more consistent with gVisor's pattern of // returning errors that correspond to Linux errnos. Since short writes without // error are common in Linux, returning a nil error is appropriate. func (r *ReadWriterFile) Write(p []byte) (int, error) { n, err := r.File.WriteAt(p, r.Offset) r.Offset += uint64(n) return n, err } // WriteAt implements the io.WriteAt interface. // // Note that this may return a short write with a nil error. 
This violates the // contract of io.WriterAt. See comment on Write for justification. func (r *ReadWriterFile) WriteAt(p []byte, offset int64) (int, error) { return r.File.WriteAt(p, uint64(offset)) } // Rename implements File.Rename. func (c *clientFile) Rename(dir File, name string) error { if c.closed.Load() != 0 { return unix.EBADF } clientDir, ok := dir.(*clientFile) if !ok { return unix.EBADF } return c.client.sendRecv(&Trename{FID: c.fid, Directory: clientDir.fid, Name: name}, &Rrename{}) } // Create implements File.Create. func (c *clientFile) Create(name string, openFlags OpenFlags, permissions FileMode, uid UID, gid GID) (*fd.FD, File, QID, uint32, error) { if c.closed.Load() != 0 { return nil, nil, QID{}, 0, unix.EBADF } msg := Tlcreate{ FID: c.fid, Name: name, OpenFlags: openFlags, Permissions: permissions, GID: NoGID, } if versionSupportsTucreation(c.client.version) { msg.GID = gid rucreate := Rucreate{} if err := c.client.sendRecv(&Tucreate{Tlcreate: msg, UID: uid}, &rucreate); err != nil { return nil, nil, QID{}, 0, err } return rucreate.File, c, rucreate.QID, rucreate.IoUnit, nil } rlcreate := Rlcreate{} if err := c.client.sendRecv(&msg, &rlcreate); err != nil { return nil, nil, QID{}, 0, err } return rlcreate.File, c, rlcreate.QID, rlcreate.IoUnit, nil } // Mkdir implements File.Mkdir. func (c *clientFile) Mkdir(name string, permissions FileMode, uid UID, gid GID) (QID, error) { if c.closed.Load() != 0 { return QID{}, unix.EBADF } msg := Tmkdir{ Directory: c.fid, Name: name, Permissions: permissions, GID: NoGID, } if versionSupportsTucreation(c.client.version) { msg.GID = gid rumkdir := Rumkdir{} if err := c.client.sendRecv(&Tumkdir{Tmkdir: msg, UID: uid}, &rumkdir); err != nil { return QID{}, err } return rumkdir.QID, nil } rmkdir := Rmkdir{} if err := c.client.sendRecv(&msg, &rmkdir); err != nil { return QID{}, err } return rmkdir.QID, nil } // Symlink implements File.Symlink. func (c *clientFile) Symlink(oldname string, newname string, uid UID, gid GID) (QID, error) { if c.closed.Load() != 0 { return QID{}, unix.EBADF } msg := Tsymlink{ Directory: c.fid, Name: newname, Target: oldname, GID: NoGID, } if versionSupportsTucreation(c.client.version) { msg.GID = gid rusymlink := Rusymlink{} if err := c.client.sendRecv(&Tusymlink{Tsymlink: msg, UID: uid}, &rusymlink); err != nil { return QID{}, err } return rusymlink.QID, nil } rsymlink := Rsymlink{} if err := c.client.sendRecv(&msg, &rsymlink); err != nil { return QID{}, err } return rsymlink.QID, nil } // Link implements File.Link. func (c *clientFile) Link(target File, newname string) error { if c.closed.Load() != 0 { return unix.EBADF } targetFile, ok := target.(*clientFile) if !ok { return unix.EBADF } return c.client.sendRecv(&Tlink{Directory: c.fid, Name: newname, Target: targetFile.fid}, &Rlink{}) } // Mknod implements File.Mknod. func (c *clientFile) Mknod(name string, mode FileMode, major uint32, minor uint32, uid UID, gid GID) (QID, error) { if c.closed.Load() != 0 { return QID{}, unix.EBADF } msg := Tmknod{ Directory: c.fid, Name: name, Mode: mode, Major: major, Minor: minor, GID: NoGID, } if versionSupportsTucreation(c.client.version) { msg.GID = gid rumknod := Rumknod{} if err := c.client.sendRecv(&Tumknod{Tmknod: msg, UID: uid}, &rumknod); err != nil { return QID{}, err } return rumknod.QID, nil } rmknod := Rmknod{} if err := c.client.sendRecv(&msg, &rmknod); err != nil { return QID{}, err } return rmknod.QID, nil } // RenameAt implements File.RenameAt. 
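//
// For example (editor's note), dir.RenameAt("old.txt", otherDir, "new.txt")
// moves the entry "old.txt" under dir to the name "new.txt" under otherDir.
// Note that otherDir must itself be a *clientFile; the implementation
// type-asserts it and returns EBADF otherwise.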
func (c *clientFile) RenameAt(oldname string, newdir File, newname string) error { if c.closed.Load() != 0 { return unix.EBADF } clientNewDir, ok := newdir.(*clientFile) if !ok { return unix.EBADF } return c.client.sendRecv(&Trenameat{OldDirectory: c.fid, OldName: oldname, NewDirectory: clientNewDir.fid, NewName: newname}, &Rrenameat{}) } // UnlinkAt implements File.UnlinkAt. func (c *clientFile) UnlinkAt(name string, flags uint32) error { if c.closed.Load() != 0 { return unix.EBADF } return c.client.sendRecv(&Tunlinkat{Directory: c.fid, Name: name, Flags: flags}, &Runlinkat{}) } // Readdir implements File.Readdir. func (c *clientFile) Readdir(direntOffset uint64, count uint32) ([]Dirent, error) { if c.closed.Load() != 0 { return nil, unix.EBADF } rreaddir := Rreaddir{} if err := c.client.sendRecv(&Treaddir{Directory: c.fid, DirentOffset: direntOffset, Count: count}, &rreaddir); err != nil { return nil, err } return rreaddir.Entries, nil } // Readlink implements File.Readlink. func (c *clientFile) Readlink() (string, error) { if c.closed.Load() != 0 { return "", unix.EBADF } rreadlink := Rreadlink{} if err := c.client.sendRecv(&Treadlink{FID: c.fid}, &rreadlink); err != nil { return "", err } return rreadlink.Target, nil } // Flush implements File.Flush. func (c *clientFile) Flush() error { if c.closed.Load() != 0 { return unix.EBADF } if !VersionSupportsTflushf(c.client.version) { return nil } return c.client.sendRecv(&Tflushf{FID: c.fid}, &Rflushf{}) } golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/file.go000066400000000000000000000341351465435605700212300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/fd" ) // AttacherOptions contains Attacher configuration. type AttacherOptions struct { // SetAttrOnDeleted is set to true if it's safe to call File.SetAttr for // deleted files. SetAttrOnDeleted bool // AllocateOnDeleted is set to true if it's safe to call File.Allocate for // deleted files. AllocateOnDeleted bool // MultiGetAttrSupported is set to true if it's safe to call // File.MultiGetAttr with read concurrency guarantee only on start directory. MultiGetAttrSupported bool } // NoServerOptions partially implements Attacher with empty AttacherOptions. type NoServerOptions struct{} // ServerOptions implements Attacher. func (*NoServerOptions) ServerOptions() AttacherOptions { return AttacherOptions{} } // Attacher is provided by the server. type Attacher interface { // Attach returns a new File. // // The client-side attach will be translated to a series of walks from // the file returned by this Attach call. Attach() (File, error) // ServerOptions returns configuration options for this attach point. // // This is never caller in the client-side. ServerOptions() AttacherOptions } // File is a set of operations corresponding to a single node. // // Note that on the server side, the server logic places constraints on // concurrent operations to make things easier. 
This may reduce the need for // complex, error-prone locking and logic in the backend. These are documented // for each method. // // There are three different types of guarantees provided: // // none: There is no concurrency guarantee. The method may be invoked // concurrently with any other method on any other file. // // read: The method is guaranteed to be exclusive of any write or global // operation that is mutating the state of the directory tree starting at this // node. For example, this means creating new files, symlinks, directories or // renaming a directory entry (or renaming in to this target), but the method // may be called concurrently with other read methods. // // write: The method is guaranteed to be exclusive of any read, write or global // operation that is mutating the state of the directory tree starting at this // node, as described in read above. There may however, be other write // operations executing concurrently on other components in the directory tree. // // global: The method is guaranteed to be exclusive of any read, write or // global operation. type File interface { // Walk walks to the path components given in names. // // Walk returns QIDs in the same order that the names were passed in. // // An empty list of arguments should return a copy of the current file. // // On the server, Walk has a read concurrency guarantee. Walk(names []string) ([]QID, File, error) // WalkGetAttr walks to the next file and returns its maximal set of // attributes. // // Server-side p9.Files may return unix.ENOSYS to indicate that Walk // and GetAttr should be used separately to satisfy this request. // // On the server, WalkGetAttr has a read concurrency guarantee. WalkGetAttr([]string) ([]QID, File, AttrMask, Attr, error) // MultiGetAttr batches up multiple calls to GetAttr(). names is a list of // path components similar to Walk(). If the first component name is empty, // the current file is stat'd and included in the results. If the walk reaches // a file that doesn't exist or not a directory, MultiGetAttr returns the // partial result with no error. // // On the server, MultiGetAttr has a read concurrency guarantee. MultiGetAttr(names []string) ([]FullStat, error) // StatFS returns information about the file system associated with // this file. // // On the server, StatFS has no concurrency guarantee. StatFS() (FSStat, error) // GetAttr returns attributes of this node. // // On the server, GetAttr has a read concurrency guarantee. GetAttr(req AttrMask) (QID, AttrMask, Attr, error) // SetAttr sets attributes on this node. // // On the server, SetAttr has a write concurrency guarantee. SetAttr(valid SetAttrMask, attr SetAttr) error // GetXattr returns extended attributes of this node. // // Size indicates the size of the buffer that has been allocated to hold the // attribute value. If the value is larger than size, implementations may // return ERANGE to indicate that the buffer is too small, but they are also // free to ignore the hint entirely (i.e. the value returned may be larger // than size). All size checking is done independently at the syscall layer. // // On the server, GetXattr has a read concurrency guarantee. GetXattr(name string, size uint64) (string, error) // SetXattr sets extended attributes on this node. // // On the server, SetXattr has a write concurrency guarantee. SetXattr(name, value string, flags uint32) error // ListXattr lists the names of the extended attributes on this node. 
// // Size indicates the size of the buffer that has been allocated to hold the // attribute list. If the list would be larger than size, implementations may // return ERANGE to indicate that the buffer is too small, but they are also // free to ignore the hint entirely (i.e. the value returned may be larger // than size). All size checking is done independently at the syscall layer. // // On the server, ListXattr has a read concurrency guarantee. ListXattr(size uint64) (map[string]struct{}, error) // RemoveXattr removes extended attributes on this node. // // On the server, RemoveXattr has a write concurrency guarantee. RemoveXattr(name string) error // Allocate allows the caller to directly manipulate the allocated disk space // for the file. See fallocate(2) for more details. Allocate(mode AllocateMode, offset, length uint64) error // Close is called when all references are dropped on the server side, // and Close should be called by the client to drop all references. // // For server-side implementations of Close, the error is ignored. // // Close must be called even when Open has not been called. // // On the server, Close has no concurrency guarantee. Close() error // SetAttrClose is the equivalent of calling SetAttr() followed by Close(). // This can be used to set file times before closing the file in a single // operation. // // On the server, SetAttr has a write concurrency guarantee. // On the server, Close has no concurrency guarantee. SetAttrClose(valid SetAttrMask, attr SetAttr) error // Open must be called prior to using Read, Write or Readdir. Once Open // is called, some operations, such as Walk, will no longer work. // // On the client, Open should be called only once. The fd return is // optional, and may be nil. // // On the server, Open has a read concurrency guarantee. If an *fd.FD // is provided, ownership now belongs to the caller. Open is guaranteed // to be called only once. // // N.B. The server must resolve any lazy paths when open is called. // After this point, read and write may be called on files with no // deletion check, so resolving in the data path is not viable. Open(flags OpenFlags) (*fd.FD, QID, uint32, error) // Read reads from this file. Open must be called first. // // This may return io.EOF in addition to unix.Errno values. // // On the server, ReadAt has a read concurrency guarantee. See Open for // additional requirements regarding lazy path resolution. ReadAt(p []byte, offset uint64) (int, error) // Write writes to this file. Open must be called first. // // This may return io.EOF in addition to unix.Errno values. // // On the server, WriteAt has a read concurrency guarantee. See Open // for additional requirements regarding lazy path resolution. WriteAt(p []byte, offset uint64) (int, error) // FSync syncs this node. Open must be called first. // // On the server, FSync has a read concurrency guarantee. FSync() error // Create creates a new regular file and opens it according to the // flags given. This file is already Open. // // N.B. On the client, the returned file is a reference to the current // file, which now represents the created file. This is not the case on // the server. These semantics are very subtle and can easily lead to // bugs, but are a consequence of the 9P create operation. // // See p9.File.Open for a description of *fd.FD. // // On the server, Create has a write concurrency guarantee. 
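	//
	// For example (editor's note), from a caller outside the package:
	//
	//	hostFD, f, _, _, err := dir.Create("data.txt", p9.ReadWrite, 0644, uid, gid)
	//
	// On success, the client-side f is the same object as dir, but it now
	// refers to the newly created and opened "data.txt" rather than to the
	// directory (uid and gid here are placeholder p9.UID/p9.GID values).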
Create(name string, flags OpenFlags, permissions FileMode, uid UID, gid GID) (*fd.FD, File, QID, uint32, error) // Mkdir creates a subdirectory. // // On the server, Mkdir has a write concurrency guarantee. Mkdir(name string, permissions FileMode, uid UID, gid GID) (QID, error) // Symlink makes a new symbolic link. // // On the server, Symlink has a write concurrency guarantee. Symlink(oldName string, newName string, uid UID, gid GID) (QID, error) // Link makes a new hard link. // // On the server, Link has a write concurrency guarantee. Link(target File, newName string) error // Mknod makes a new device node. // // On the server, Mknod has a write concurrency guarantee. Mknod(name string, mode FileMode, major uint32, minor uint32, uid UID, gid GID) (QID, error) // Rename renames the file. // // Rename will never be called on the server, and RenameAt will always // be used instead. Rename(newDir File, newName string) error // RenameAt renames a given file to a new name in a potentially new // directory. // // oldName must be a name relative to this file, which must be a // directory. newName is a name relative to newDir. // // On the server, RenameAt has a global concurrency guarantee. RenameAt(oldName string, newDir File, newName string) error // UnlinkAt the given named file. // // name must be a file relative to this directory. // // Flags are implementation-specific (e.g. O_DIRECTORY), but are // generally Linux unlinkat(2) flags. // // On the server, UnlinkAt has a write concurrency guarantee. UnlinkAt(name string, flags uint32) error // Readdir reads directory entries. // // This may return io.EOF in addition to unix.Errno values. count is the // number of bytes to read. // // direntOffset is the directory offset at which the read should happen. // direntOffset can be set to 0 to start reading the directory from start. // direntOffset is used more like a cookie. The unit of direntOffset is // unspecified. Gofers can choose their own unit. The client must set it // to one of the values returned in Dirent.Offset, preferably the last offset // returned, which should cause the readdir to continue from where it was // left off. // // On the server, Readdir has a read concurrency guarantee. Readdir(direntOffset uint64, count uint32) ([]Dirent, error) // Readlink reads the link target. // // On the server, Readlink has a read concurrency guarantee. Readlink() (string, error) // Flush is called prior to Close. // // Whereas Close drops all references to the file, Flush cleans up the // file state. Behavior is implementation-specific. // // Flush is not related to flush(9p). Flush is an extension to 9P2000.L, // see version.go. // // On the server, Flush has a read concurrency guarantee. Flush() error // Bind binds to a host unix domain socket. If successful, it creates a // socket file on the host filesystem and returns a File for the newly // created socket file. The File implementation must save the bound socket // FD so that subsequent Listen and Accept operations on the File can be // served. // // Bind is an extension to 9P2000.L, see version.go. // // On the server, Bind has a write concurrency guarantee. Bind(sockType uint32, sockName string, uid UID, gid GID) (File, QID, AttrMask, Attr, error) // Connect establishes a new host-socket backed connection with a // socket. A File does not need to be opened before it can be connected // and it can be connected to multiple times resulting in a unique // *fd.FD each time. 
In addition, the lifetime of the *fd.FD is // independent from the lifetime of the p9.File and must be managed by // the caller. // // The returned FD must be non-blocking. // // Flags indicates the requested type of socket. // // On the server, Connect has a read concurrency guarantee. Connect(socketType SocketType) (*fd.FD, error) // Renamed is called when this node is renamed. // // This may not fail. The file will hold a reference to its parent // within the p9 package, and is therefore safe to use for the lifetime // of this File (until Close is called). // // This method should not be called by clients, who should use the // relevant Rename methods. (Although the method will be a no-op.) // // On the server, Renamed has a global concurrency guarantee. Renamed(newDir File, newName string) } // DefaultWalkGetAttr implements File.WalkGetAttr to return ENOSYS for server-side Files. type DefaultWalkGetAttr struct{} // WalkGetAttr implements File.WalkGetAttr. func (*DefaultWalkGetAttr) WalkGetAttr([]string) ([]QID, File, AttrMask, Attr, error) { return nil, nil, AttrMask{}, Attr{}, unix.ENOSYS } // DisallowClientCalls panics if a client-only function is called. type DisallowClientCalls struct{} // SetAttrClose implements File.SetAttrClose. func (*DisallowClientCalls) SetAttrClose(SetAttrMask, SetAttr) error { panic("SetAttrClose should not be called on the server") } // DisallowServerCalls panics if a server-only function is called. type DisallowServerCalls struct{} // Renamed implements File.Renamed. func (*DisallowServerCalls) Renamed(File, string) { panic("Renamed should not be called on the client") } // ServerOptions implements Attacher. func (*DisallowServerCalls) ServerOptions() AttacherOptions { panic("ServerOptions should not be called on the client") } golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/handlers.go000066400000000000000000001130461465435605700221100ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( errors2 "errors" "fmt" "io" "os" "path" "strings" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/errors" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" ) // ExtractErrno extracts a unix.Errno from a error, best effort. func ExtractErrno(err error) unix.Errno { switch err { case os.ErrNotExist: return unix.ENOENT case os.ErrExist: return unix.EEXIST case os.ErrPermission: return unix.EACCES case os.ErrInvalid: return unix.EINVAL } // Attempt to unwrap. switch e := err.(type) { case *errors.Error: return linuxerr.ToUnix(e) case unix.Errno: return e case *os.PathError: return ExtractErrno(e.Err) case *os.SyscallError: return ExtractErrno(e.Err) case *os.LinkError: return ExtractErrno(e.Err) } // Default case. log.Warningf("unknown error: %v", err) return unix.EIO } // newErr returns a new error message from an error. 
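//
// For example (editor's note), newErr(os.ErrNotExist) produces an Rlerror
// carrying unix.ENOENT via ExtractErrno above, while an unrecognized error is
// logged and mapped to unix.EIO.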
func newErr(err error) *Rlerror { return &Rlerror{Error: uint32(ExtractErrno(err))} } // ExtractLinuxerrErrno extracts a *errors.Error from a error, best effort. // TODO(b/34162363): Merge this with ExtractErrno. func ExtractLinuxerrErrno(err error) error { switch err { case os.ErrNotExist: return linuxerr.ENOENT case os.ErrExist: return linuxerr.EEXIST case os.ErrPermission: return linuxerr.EACCES case os.ErrInvalid: return linuxerr.EINVAL } // Attempt to unwrap. switch e := err.(type) { case *errors.Error: return linuxerr.ToError(e) case unix.Errno: return linuxerr.ErrorFromUnix(e) case *os.PathError: return ExtractLinuxerrErrno(e.Err) case *os.SyscallError: return ExtractLinuxerrErrno(e.Err) case *os.LinkError: return ExtractLinuxerrErrno(e.Err) } // Default case. log.Warningf("unknown error: %v", err) return linuxerr.EIO } // newErrFromLinuxerr returns an Rlerror from the linuxerr list. // TODO(b/34162363): Merge this with newErr. func newErrFromLinuxerr(err error) *Rlerror { return &Rlerror{Error: uint32(ExtractErrno(err))} } // handler is implemented for server-handled messages. // // See server.go for call information. type handler interface { // Handle handles the given message. // // This may modify the server state. The handle function must return a // message which will be sent back to the client. It may be useful to // use newErr to automatically extract an error message. handle(cs *connState) message } // handle implements handler.handle. func (t *Tversion) handle(cs *connState) message { if t.MSize == 0 { return newErr(unix.EINVAL) } if t.MSize > maximumLength { return newErr(unix.EINVAL) } cs.messageSize.Store(t.MSize) requested, ok := parseVersion(t.Version) if !ok { return newErr(unix.EINVAL) } // The server cannot support newer versions that it doesn't know about. In this // case we return EAGAIN to tell the client to try again with a lower version. if requested > highestSupportedVersion { return newErr(unix.EAGAIN) } // From Tversion(9P): "The server may respond with the client’s version // string, or a version string identifying an earlier defined protocol version". cs.version.Store(requested) return &Rversion{ MSize: t.MSize, Version: t.Version, } } // handle implements handler.handle. func (t *Tflush) handle(cs *connState) message { cs.WaitTag(t.OldTag) return &Rflush{} } // checkSafeName validates the name and returns nil or returns an error. func checkSafeName(name string) error { if name != "" && !strings.Contains(name, "/") && name != "." && name != ".." { return nil } return unix.EINVAL } // handle implements handler.handle. func (t *Tclunk) handle(cs *connState) message { if !cs.DeleteFID(t.FID) { return newErr(unix.EBADF) } return &Rclunk{} } func (t *Tsetattrclunk) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() setAttrErr := ref.safelyWrite(func() error { // We don't allow setattr on files that have been deleted. // This might be technically incorrect, as it's possible that // there were multiple links and you can still change the // corresponding inode information. if !cs.server.options.SetAttrOnDeleted && ref.isDeleted() { return unix.EINVAL } // Set the attributes. return ref.file.SetAttr(t.Valid, t.SetAttr) }) // Try to delete FID even in case of failure above. Since the state of the // file is unknown to the caller, it will not attempt to close the file again. 
if !cs.DeleteFID(t.FID) { return newErr(unix.EBADF) } if setAttrErr != nil { return newErr(setAttrErr) } return &Rsetattrclunk{} } // handle implements handler.handle. func (t *Tremove) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() // Frustratingly, because we can't be guaranteed that a rename is not // occurring simultaneously with this removal, we need to acquire the // global rename lock for this kind of remove operation to ensure that // ref.parent does not change out from underneath us. // // This is why Tremove is a bad idea, and clients should generally use // Tunlinkat. All p9 clients will use Tunlinkat. err := ref.safelyGlobal(func() error { // Is this a root? Can't remove that. if ref.isRoot() { return unix.EINVAL } // N.B. this remove operation is permitted, even if the file is open. // See also rename below for reasoning. // Is this file already deleted? if ref.isDeleted() { return unix.EINVAL } // Retrieve the file's proper name. name := ref.parent.pathNode.nameFor(ref) // Attempt the removal. if err := ref.parent.file.UnlinkAt(name, 0); err != nil { return err } // Mark all relevant fids as deleted. We don't need to lock any // individual nodes because we already hold the global lock. ref.parent.markChildDeleted(name) return nil }) // "The remove request asks the file server both to remove the file // represented by fid and to clunk the fid, even if the remove fails." // // "It is correct to consider remove to be a clunk with the side effect // of removing the file if permissions allow." // https://swtch.com/plan9port/man/man9/remove.html if !cs.DeleteFID(t.FID) { return newErr(unix.EBADF) } if err != nil { return newErr(err) } return &Rremove{} } // handle implements handler.handle. // // We don't support authentication, so this just returns ENOSYS. func (t *Tauth) handle(cs *connState) message { return newErr(unix.ENOSYS) } // handle implements handler.handle. func (t *Tattach) handle(cs *connState) message { // Ensure no authentication FID is provided. if t.Auth.AuthenticationFID != NoFID { return newErr(unix.EINVAL) } // Must provide an absolute path. if path.IsAbs(t.Auth.AttachName) { // Trim off the leading / if the path is absolute. We always // treat attach paths as absolute and call attach with the root // argument on the server file for clarity. t.Auth.AttachName = t.Auth.AttachName[1:] } // Do the attach on the root. sf, err := cs.server.attacher.Attach() if err != nil { return newErr(err) } qid, valid, attr, err := sf.GetAttr(AttrMaskAll()) if err != nil { sf.Close() // Drop file. return newErr(err) } if !valid.Mode { sf.Close() // Drop file. return newErr(unix.EINVAL) } // Build a transient reference. root := &fidRef{ server: cs.server, parent: nil, file: sf, refs: atomicbitops.FromInt64(1), mode: attr.Mode.FileType(), pathNode: cs.server.pathTree, } defer root.DecRef() // Attach the root? if len(t.Auth.AttachName) == 0 { cs.InsertFID(t.FID, root) return &Rattach{QID: qid} } // We want the same traversal checks to apply on attach, so always // attach at the root and use the regular walk paths. names := strings.Split(t.Auth.AttachName, "/") _, newRef, _, _, err := doWalk(cs, root, names, false) if err != nil { return newErr(err) } defer newRef.DecRef() // Insert the FID. cs.InsertFID(t.FID, newRef) return &Rattach{QID: qid} } // CanOpen returns whether this file open can be opened, read and written to. // // This includes everything except symlinks and sockets. 
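//
// For example (editor's note), CanOpen(ModeRegular) and CanOpen(ModeDirectory)
// are true, while CanOpen(ModeSymlink) and CanOpen(ModeSocket) are false.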
func CanOpen(mode FileMode) bool { return mode.IsRegular() || mode.IsDir() || mode.IsNamedPipe() || mode.IsBlockDevice() || mode.IsCharacterDevice() } // handle implements handler.handle. func (t *Tlopen) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() var ( qid QID ioUnit uint32 osFile *fd.FD ) if err := ref.safelyRead(func() (err error) { // Has it been deleted already? if ref.isDeleted() { return unix.EINVAL } // Has it been opened already? if ref.opened || !CanOpen(ref.mode) { return unix.EINVAL } if ref.mode.IsDir() { // Directory must be opened ReadOnly. if t.Flags&OpenFlagsModeMask != ReadOnly { return unix.EISDIR } // Directory not truncatable. if t.Flags&OpenTruncate != 0 { return unix.EISDIR } } osFile, qid, ioUnit, err = ref.file.Open(t.Flags) return err }); err != nil { return newErr(err) } // Mark file as opened and set open mode. ref.opened = true ref.openFlags = t.Flags rlopen := &Rlopen{QID: qid, IoUnit: ioUnit} rlopen.SetFilePayload(osFile) return rlopen } func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) { if err := checkSafeName(t.Name); err != nil { return nil, err } ref, ok := cs.LookupFID(t.FID) if !ok { return nil, unix.EBADF } defer ref.DecRef() var ( osFile *fd.FD nsf File qid QID ioUnit uint32 newRef *fidRef ) if err := ref.safelyWrite(func() (err error) { // Don't allow creation from non-directories or deleted directories. if ref.isDeleted() || !ref.mode.IsDir() { return unix.EINVAL } // Not allowed on open directories. if ref.opened { return unix.EINVAL } // Do the create. osFile, nsf, qid, ioUnit, err = ref.file.Create(t.Name, t.OpenFlags, t.Permissions, uid, t.GID) if err != nil { return err } newRef = &fidRef{ server: cs.server, parent: ref, file: nsf, opened: true, openFlags: t.OpenFlags, mode: ModeRegular, pathNode: ref.pathNode.pathNodeFor(t.Name), } ref.pathNode.addChild(newRef, t.Name) ref.IncRef() // Acquire parent reference. return nil }); err != nil { return nil, err } // Replace the FID reference. cs.InsertFID(t.FID, newRef) rlcreate := &Rlcreate{Rlopen: Rlopen{QID: qid, IoUnit: ioUnit}} rlcreate.SetFilePayload(osFile) return rlcreate, nil } // handle implements handler.handle. func (t *Tlcreate) handle(cs *connState) message { rlcreate, err := t.do(cs, NoUID) if err != nil { return newErr(err) } return rlcreate } // handle implements handler.handle. func (t *Tsymlink) handle(cs *connState) message { rsymlink, err := t.do(cs, NoUID) if err != nil { return newErr(err) } return rsymlink } func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) { if err := checkSafeName(t.Name); err != nil { return nil, err } ref, ok := cs.LookupFID(t.Directory) if !ok { return nil, unix.EBADF } defer ref.DecRef() var qid QID if err := ref.safelyWrite(func() (err error) { // Don't allow symlinks from non-directories or deleted directories. if ref.isDeleted() || !ref.mode.IsDir() { return unix.EINVAL } // Not allowed on open directories. if ref.opened { return unix.EINVAL } // Do the symlink. qid, err = ref.file.Symlink(t.Target, t.Name, uid, t.GID) return err }); err != nil { return nil, err } return &Rsymlink{QID: qid}, nil } // handle implements handler.handle. 
func (t *Tlink) handle(cs *connState) message { if err := checkSafeName(t.Name); err != nil { return newErr(err) } ref, ok := cs.LookupFID(t.Directory) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() refTarget, ok := cs.LookupFID(t.Target) if !ok { return newErr(unix.EBADF) } defer refTarget.DecRef() if err := ref.safelyWrite(func() (err error) { // Don't allow create links from non-directories or deleted directories. if ref.isDeleted() || !ref.mode.IsDir() { return unix.EINVAL } // Not allowed on open directories. if ref.opened { return unix.EINVAL } // Do the link. return ref.file.Link(refTarget.file, t.Name) }); err != nil { return newErr(err) } return &Rlink{} } // handle implements handler.handle. func (t *Trenameat) handle(cs *connState) message { if err := checkSafeName(t.OldName); err != nil { return newErr(err) } if err := checkSafeName(t.NewName); err != nil { return newErr(err) } ref, ok := cs.LookupFID(t.OldDirectory) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() refTarget, ok := cs.LookupFID(t.NewDirectory) if !ok { return newErr(unix.EBADF) } defer refTarget.DecRef() // Perform the rename holding the global lock. if err := ref.safelyGlobal(func() (err error) { // Don't allow renaming across deleted directories. if ref.isDeleted() || !ref.mode.IsDir() || refTarget.isDeleted() || !refTarget.mode.IsDir() { return unix.EINVAL } // Not allowed on open directories. if ref.opened { return unix.EINVAL } // Is this the same file? If yes, short-circuit and return success. if ref.pathNode == refTarget.pathNode && t.OldName == t.NewName { return nil } // Attempt the actual rename. if err := ref.file.RenameAt(t.OldName, refTarget.file, t.NewName); err != nil { return err } // Update the path tree. ref.renameChildTo(t.OldName, refTarget, t.NewName) return nil }); err != nil { return newErr(err) } return &Rrenameat{} } // handle implements handler.handle. func (t *Tunlinkat) handle(cs *connState) message { if err := checkSafeName(t.Name); err != nil { return newErr(err) } ref, ok := cs.LookupFID(t.Directory) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() if err := ref.safelyWrite(func() (err error) { // Don't allow deletion from non-directories or deleted directories. if ref.isDeleted() || !ref.mode.IsDir() { return unix.EINVAL } // Not allowed on open directories. if ref.opened { return unix.EINVAL } // Before we do the unlink itself, we need to ensure that there // are no operations in flight on associated path node. The // child's path node lock must be held to ensure that the // unlinkat marking the child deleted below is atomic with // respect to any other read or write operations. // // This is one case where we have a lock ordering issue, but // since we always acquire deeper in the hierarchy, we know // that we are free of lock cycles. childPathNode := ref.pathNode.pathNodeFor(t.Name) childPathNode.opMu.Lock() defer childPathNode.opMu.Unlock() // Do the unlink. err = ref.file.UnlinkAt(t.Name, t.Flags) if err != nil { return err } // Mark the path as deleted. ref.markChildDeleted(t.Name) return nil }); err != nil { return newErr(err) } return &Runlinkat{} } // handle implements handler.handle. 
func (t *Trename) handle(cs *connState) message { if err := checkSafeName(t.Name); err != nil { return newErr(err) } ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() refTarget, ok := cs.LookupFID(t.Directory) if !ok { return newErr(unix.EBADF) } defer refTarget.DecRef() if err := ref.safelyGlobal(func() (err error) { // Don't allow a root rename. if ref.isRoot() { return unix.EINVAL } // Don't allow renaming deleting entries, or target non-directories. if ref.isDeleted() || refTarget.isDeleted() || !refTarget.mode.IsDir() { return unix.EINVAL } // If the parent is deleted, but we not, something is seriously wrong. // It's fail to die at this point with an assertion failure. if ref.parent.isDeleted() { panic(fmt.Sprintf("parent %+v deleted, child %+v is not", ref.parent, ref)) } // N.B. The rename operation is allowed to proceed on open files. It // does impact the state of its parent, but this is merely a sanity // check in any case, and the operation is safe. There may be other // files corresponding to the same path that are renamed anyways. // Check for the exact same file and short-circuit. oldName := ref.parent.pathNode.nameFor(ref) if ref.parent.pathNode == refTarget.pathNode && oldName == t.Name { return nil } // Call the rename method on the parent. if err := ref.parent.file.RenameAt(oldName, refTarget.file, t.Name); err != nil { return err } // Update the path tree. ref.parent.renameChildTo(oldName, refTarget, t.Name) return nil }); err != nil { return newErr(err) } return &Rrename{} } // handle implements handler.handle. func (t *Treadlink) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() var target string if err := ref.safelyRead(func() (err error) { // Don't allow readlink on deleted files. There is no need to // check if this file is opened because symlinks cannot be // opened. if ref.isDeleted() || !ref.mode.IsSymlink() { return unix.EINVAL } // Do the read. target, err = ref.file.Readlink() return err }); err != nil { return newErr(err) } return &Rreadlink{target} } // handle implements handler.handle. func (t *Tread) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() // Constrain the size of the read buffer. if int(t.Count) > int(maximumLength) { return newErr(unix.ENOBUFS) } var ( data = make([]byte, t.Count) n int ) if err := ref.safelyRead(func() (err error) { // Has it been opened already? if !ref.opened { return unix.EINVAL } // Can it be read? Check permissions. if ref.openFlags&OpenFlagsModeMask == WriteOnly { return unix.EPERM } n, err = ref.file.ReadAt(data, t.Offset) return err }); err != nil && err != io.EOF { return newErr(err) } return &Rread{Data: data[:n]} } // handle implements handler.handle. func (t *Twrite) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() var n int if err := ref.safelyRead(func() (err error) { // Has it been opened already? if !ref.opened { return unix.EINVAL } // Can it be written? Check permissions. if ref.openFlags&OpenFlagsModeMask == ReadOnly { return unix.EPERM } n, err = ref.file.WriteAt(t.Data, t.Offset) return err }); err != nil { return newErr(err) } return &Rwrite{Count: uint32(n)} } // handle implements handler.handle. 
func (t *Tmknod) handle(cs *connState) message { rmknod, err := t.do(cs, NoUID) if err != nil { return newErr(err) } return rmknod } func (t *Tmknod) do(cs *connState, uid UID) (*Rmknod, error) { if err := checkSafeName(t.Name); err != nil { return nil, err } ref, ok := cs.LookupFID(t.Directory) if !ok { return nil, unix.EBADF } defer ref.DecRef() var qid QID if err := ref.safelyWrite(func() (err error) { // Don't allow mknod on deleted files. if ref.isDeleted() || !ref.mode.IsDir() { return unix.EINVAL } // Not allowed on open directories. if ref.opened { return unix.EINVAL } // Do the mknod. qid, err = ref.file.Mknod(t.Name, t.Mode, t.Major, t.Minor, uid, t.GID) return err }); err != nil { return nil, err } return &Rmknod{QID: qid}, nil } // handle implements handler.handle. func (t *Tmkdir) handle(cs *connState) message { rmkdir, err := t.do(cs, NoUID) if err != nil { return newErr(err) } return rmkdir } func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) { if err := checkSafeName(t.Name); err != nil { return nil, err } ref, ok := cs.LookupFID(t.Directory) if !ok { return nil, unix.EBADF } defer ref.DecRef() var qid QID if err := ref.safelyWrite(func() (err error) { // Don't allow mkdir on deleted files. if ref.isDeleted() || !ref.mode.IsDir() { return unix.EINVAL } // Not allowed on open directories. if ref.opened { return unix.EINVAL } // Do the mkdir. qid, err = ref.file.Mkdir(t.Name, t.Permissions, uid, t.GID) return err }); err != nil { return nil, err } return &Rmkdir{QID: qid}, nil } // handle implements handler.handle. func (t *Tgetattr) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() // We allow getattr on deleted files. Depending on the backing // implementation, it's possible that races exist that might allow // fetching attributes of other files. But we need to generally allow // refreshing attributes and this is a minor leak, if at all. var ( qid QID valid AttrMask attr Attr ) if err := ref.safelyRead(func() (err error) { qid, valid, attr, err = ref.file.GetAttr(t.AttrMask) return err }); err != nil { return newErr(err) } return &Rgetattr{QID: qid, Valid: valid, Attr: attr} } // handle implements handler.handle. func (t *Tsetattr) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() if err := ref.safelyWrite(func() error { // We don't allow setattr on files that have been deleted. // This might be technically incorrect, as it's possible that // there were multiple links and you can still change the // corresponding inode information. if !cs.server.options.SetAttrOnDeleted && ref.isDeleted() { return unix.EINVAL } // Set the attributes. return ref.file.SetAttr(t.Valid, t.SetAttr) }); err != nil { return newErr(err) } return &Rsetattr{} } // handle implements handler.handle. func (t *Tallocate) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() if err := ref.safelyWrite(func() error { // Has it been opened already? if !ref.opened { return unix.EINVAL } // Can it be written? Check permissions. if ref.openFlags&OpenFlagsModeMask == ReadOnly { return unix.EBADF } // We don't allow allocate on files that have been deleted. if !cs.server.options.AllocateOnDeleted && ref.isDeleted() { return unix.EINVAL } return ref.file.Allocate(t.Mode, t.Offset, t.Length) }); err != nil { return newErr(err) } return &Rallocate{} } // handle implements handler.handle. 
func (t *Txattrwalk) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() // We don't support extended attributes. return newErr(unix.ENODATA) } // handle implements handler.handle. func (t *Txattrcreate) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() // We don't support extended attributes. return newErr(unix.ENOSYS) } // handle implements handler.handle. func (t *Tgetxattr) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() var val string if err := ref.safelyRead(func() (err error) { // Don't allow getxattr on files that have been deleted. if ref.isDeleted() { return unix.EINVAL } val, err = ref.file.GetXattr(t.Name, t.Size) return err }); err != nil { return newErr(err) } return &Rgetxattr{Value: val} } // handle implements handler.handle. func (t *Tsetxattr) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() if err := ref.safelyWrite(func() error { // Don't allow setxattr on files that have been deleted. if ref.isDeleted() { return unix.EINVAL } return ref.file.SetXattr(t.Name, t.Value, t.Flags) }); err != nil { return newErr(err) } return &Rsetxattr{} } // handle implements handler.handle. func (t *Tlistxattr) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() var xattrs map[string]struct{} if err := ref.safelyRead(func() (err error) { // Don't allow listxattr on files that have been deleted. if ref.isDeleted() { return unix.EINVAL } xattrs, err = ref.file.ListXattr(t.Size) return err }); err != nil { return newErr(err) } xattrList := make([]string, 0, len(xattrs)) for x := range xattrs { xattrList = append(xattrList, x) } return &Rlistxattr{Xattrs: xattrList} } // handle implements handler.handle. func (t *Tremovexattr) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() if err := ref.safelyWrite(func() error { // Don't allow removexattr on files that have been deleted. if ref.isDeleted() { return unix.EINVAL } return ref.file.RemoveXattr(t.Name) }); err != nil { return newErr(err) } return &Rremovexattr{} } // handle implements handler.handle. func (t *Treaddir) handle(cs *connState) message { ref, ok := cs.LookupFID(t.Directory) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() var entries []Dirent if err := ref.safelyRead(func() (err error) { // Don't allow reading deleted directories. if ref.isDeleted() || !ref.mode.IsDir() { return unix.EINVAL } // Has it been opened yet? if !ref.opened { return unix.EINVAL } // Read the entries. entries, err = ref.file.Readdir(t.DirentOffset, t.Count) if err != nil && err != io.EOF { return err } return nil }); err != nil { return newErr(err) } return &Rreaddir{Count: t.Count, Entries: entries} } // handle implements handler.handle. func (t *Tfsync) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() if err := ref.safelyRead(func() (err error) { // Has it been opened yet? if !ref.opened { return unix.EINVAL } // Perform the sync. return ref.file.FSync() }); err != nil { return newErr(err) } return &Rfsync{} } // handle implements handler.handle. 
func (t *Tstatfs) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() st, err := ref.file.StatFS() if err != nil { return newErr(err) } return &Rstatfs{st} } // handle implements handler.handle. func (t *Tflushf) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() if err := ref.safelyRead(ref.file.Flush); err != nil { return newErr(err) } return &Rflushf{} } // walkOne walks zero or one path elements. // // The slice passed as qids is append and returned. func walkOne(qids []QID, from File, names []string, getattr bool) ([]QID, File, AttrMask, Attr, error) { if len(names) > 1 { // We require exactly zero or one elements. return nil, nil, AttrMask{}, Attr{}, unix.EINVAL } var ( localQIDs []QID sf File valid AttrMask attr Attr err error ) switch { case getattr: localQIDs, sf, valid, attr, err = from.WalkGetAttr(names) // Can't put fallthrough in the if because Go. if err != unix.ENOSYS { break } fallthrough default: localQIDs, sf, err = from.Walk(names) if err != nil { // No way to walk this element. break } if getattr { _, valid, attr, err = sf.GetAttr(AttrMaskAll()) if err != nil { // Don't leak the file. sf.Close() } } } if err != nil { // Error walking, don't return anything. return nil, nil, AttrMask{}, Attr{}, err } if len(localQIDs) != 1 { // Expected a single QID. sf.Close() return nil, nil, AttrMask{}, Attr{}, unix.EINVAL } return append(qids, localQIDs...), sf, valid, attr, nil } // doWalk walks from a given fidRef. // // This enforces that all intermediate nodes are walkable (directories). The // fidRef returned (newRef) has a reference associated with it that is now // owned by the caller and must be handled appropriately. func doWalk(cs *connState, ref *fidRef, names []string, getattr bool) (qids []QID, newRef *fidRef, valid AttrMask, attr Attr, err error) { // Check the names. for _, name := range names { err = checkSafeName(name) if err != nil { return } } // Has it been opened already? err = ref.safelyRead(func() (err error) { if ref.opened { return unix.EBUSY } return nil }) if err != nil { return } // Is this an empty list? Handle specially. We don't actually need to // validate anything since this is always permitted. if len(names) == 0 { var sf File // Temporary. if err := ref.maybeParent().safelyRead(func() (err error) { // Clone the single element. qids, sf, valid, attr, err = walkOne(nil, ref.file, nil, getattr) if err != nil { return err } newRef = &fidRef{ server: cs.server, parent: ref.parent, file: sf, mode: ref.mode, pathNode: ref.pathNode, } if !ref.isRoot() { if !newRef.isDeleted() { // Add only if a non-root node; the same node. ref.parent.pathNode.addChild(newRef, ref.parent.pathNode.nameFor(ref)) } ref.parent.IncRef() // Acquire parent reference. } // doWalk returns a reference. newRef.IncRef() return nil }); err != nil { return nil, nil, AttrMask{}, Attr{}, err } // Do not return the new QID. return nil, newRef, valid, attr, nil } // Do the walk, one element at a time. walkRef := ref walkRef.IncRef() for i := 0; i < len(names); i++ { // We won't allow beyond past symlinks; stop here if this isn't // a proper directory and we have additional paths to walk. if !walkRef.mode.IsDir() { walkRef.DecRef() // Drop walk reference; no lock required. return nil, nil, AttrMask{}, Attr{}, unix.EINVAL } var sf File // Temporary. if err := walkRef.safelyRead(func() (err error) { // It is not safe to walk on a deleted directory. 
It could have been // replaced with a malicious symlink. if walkRef.isDeleted() { // Fail this operation as the result will not be meaningful if walkRef // is deleted. return unix.ENOENT } // Pass getattr = true to walkOne since we need the file type for // newRef. qids, sf, valid, attr, err = walkOne(qids, walkRef.file, names[i:i+1], true) if err != nil { return err } // Note that we don't need to acquire a lock on any of // these individual instances. That's because they are // not actually addressable via a FID. They are // anonymous. They exist in the tree for tracking // purposes. newRef := &fidRef{ server: cs.server, parent: walkRef, file: sf, mode: attr.Mode.FileType(), pathNode: walkRef.pathNode.pathNodeFor(names[i]), } walkRef.pathNode.addChild(newRef, names[i]) // We allow our walk reference to become the new parent // reference here and so we don't IncRef. Instead, just // set walkRef to the newRef above and acquire a new // walk reference. walkRef = newRef walkRef.IncRef() return nil }); err != nil { walkRef.DecRef() // Drop the old walkRef. return nil, nil, AttrMask{}, Attr{}, err } } // Success. return qids, walkRef, valid, attr, nil } // handle implements handler.handle. func (t *Twalk) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() // Do the walk. qids, newRef, _, _, err := doWalk(cs, ref, t.Names, false) if err != nil { return newErr(err) } defer newRef.DecRef() // Install the new FID. cs.InsertFID(t.NewFID, newRef) return &Rwalk{QIDs: qids} } // handle implements handler.handle. func (t *Twalkgetattr) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() // Do the walk. qids, newRef, valid, attr, err := doWalk(cs, ref, t.Names, true) if err != nil { return newErr(err) } defer newRef.DecRef() // Install the new FID. cs.InsertFID(t.NewFID, newRef) return &Rwalkgetattr{QIDs: qids, Valid: valid, Attr: attr} } // handle implements handler.handle. func (t *Tucreate) handle(cs *connState) message { rlcreate, err := t.Tlcreate.do(cs, t.UID) if err != nil { return newErr(err) } return &Rucreate{*rlcreate} } // handle implements handler.handle. func (t *Tumkdir) handle(cs *connState) message { rmkdir, err := t.Tmkdir.do(cs, t.UID) if err != nil { return newErr(err) } return &Rumkdir{*rmkdir} } // handle implements handler.handle. func (t *Tusymlink) handle(cs *connState) message { rsymlink, err := t.Tsymlink.do(cs, t.UID) if err != nil { return newErr(err) } return &Rusymlink{*rsymlink} } // handle implements handler.handle. func (t *Tumknod) handle(cs *connState) message { rmknod, err := t.Tmknod.do(cs, t.UID) if err != nil { return newErr(err) } return &Rumknod{*rmknod} } // handle implements handler.handle. func (t *Tbind) handle(cs *connState) message { if err := checkSafeName(t.SockName); err != nil { return newErr(err) } ref, ok := cs.LookupFID(t.Directory) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() var ( sockRef *fidRef qid QID valid AttrMask attr Attr ) if err := ref.safelyWrite(func() (err error) { // Don't allow creation from non-directories or deleted directories. if ref.isDeleted() || !ref.mode.IsDir() { return unix.EINVAL } // Not allowed on open directories. 
if ref.opened { return unix.EINVAL } var sockF File sockF, qid, valid, attr, err = ref.file.Bind(t.SockType, t.SockName, t.UID, t.GID) if err != nil { return err } sockRef = &fidRef{ server: cs.server, parent: ref, file: sockF, mode: ModeSocket, pathNode: ref.pathNode.pathNodeFor(t.SockName), } ref.pathNode.addChild(sockRef, t.SockName) ref.IncRef() // Acquire parent reference. return nil }); err != nil { return newErr(err) } cs.InsertFID(t.NewFID, sockRef) return &Rbind{QID: qid, Valid: valid, Attr: attr} } // handle implements handler.handle. func (t *Tlconnect) handle(cs *connState) message { ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() var osFile *fd.FD if err := ref.safelyRead(func() (err error) { // Don't allow connecting to deleted files. if ref.isDeleted() || !ref.mode.IsSocket() { return unix.EINVAL } // Do the connect. osFile, err = ref.file.Connect(t.SocketType) return err }); err != nil { return newErr(err) } rlconnect := &Rlconnect{} rlconnect.SetFilePayload(osFile) return rlconnect } // handle implements handler.handle. func (t *Tchannel) handle(cs *connState) message { // Ensure that channels are enabled. if err := cs.initializeChannels(); err != nil { return newErr(err) } ch := cs.lookupChannel(t.ID) if ch == nil { return newErr(unix.ENOSYS) } // Return the payload. Note that we need to duplicate the file // descriptor for the channel allocator, because sending is a // destructive operation between sendRecvLegacy (and now the newer // channel send operations). Same goes for the client FD. rchannel := &Rchannel{ Offset: uint64(ch.desc.Offset), Length: uint64(ch.desc.Length), } switch t.Control { case 0: // Open the main data channel. mfd, err := unix.Dup(int(cs.channelAlloc.FD())) if err != nil { return newErr(err) } rchannel.SetFilePayload(fd.New(mfd)) case 1: cfd, err := unix.Dup(ch.client.FD()) if err != nil { return newErr(err) } rchannel.SetFilePayload(fd.New(cfd)) default: return newErr(unix.EINVAL) } return rchannel } // handle implements handler.handle. func (t *Tmultigetattr) handle(cs *connState) message { for i, name := range t.Names { if len(name) == 0 && i == 0 { // Empty name is allowed on the first entry to indicate that the current // FID needs to be included in the result. 
continue } if err := checkSafeName(name); err != nil { return newErr(err) } } ref, ok := cs.LookupFID(t.FID) if !ok { return newErr(unix.EBADF) } defer ref.DecRef() if cs.server.options.MultiGetAttrSupported { var stats []FullStat if err := ref.safelyRead(func() (err error) { stats, err = ref.file.MultiGetAttr(t.Names) return err }); err != nil { return newErr(err) } return &Rmultigetattr{Stats: stats} } stats := make([]FullStat, 0, len(t.Names)) mask := AttrMaskAll() start := ref.file startNode := ref.pathNode parent := start parentNode := startNode closeParent := func() { if parent != start { _ = parent.Close() } } defer closeParent() cs.server.renameMu.RLock() defer cs.server.renameMu.RUnlock() for i, name := range t.Names { if len(name) == 0 && i == 0 { startNode.opMu.RLock() qid, valid, attr, err := start.GetAttr(mask) startNode.opMu.RUnlock() if err != nil { return newErr(err) } stats = append(stats, FullStat{ QID: qid, Valid: valid, Attr: attr, }) continue } parentNode.opMu.RLock() if parentNode.deleted.Load() != 0 { parentNode.opMu.RUnlock() break } qids, child, valid, attr, err := parent.WalkGetAttr([]string{name}) if err != nil { parentNode.opMu.RUnlock() if errors2.Is(err, unix.ENOENT) { break } return newErr(err) } stats = append(stats, FullStat{ QID: qids[0], Valid: valid, Attr: attr, }) // Update with next generation. closeParent() parent = child childNode := parentNode.pathNodeFor(name) parentNode.opMu.RUnlock() parentNode = childNode if attr.Mode.FileType() != ModeDirectory { // Doesn't need to continue if entry is not a dir. Including symlinks // that cannot be followed. break } } return &Rmultigetattr{Stats: stats} } golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/messages.go000066400000000000000000002010121465435605700221060ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( "fmt" "math" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" ) // ErrInvalidMsgType is returned when an unsupported message type is found. type ErrInvalidMsgType struct { MsgType } // Error returns a useful string. func (e *ErrInvalidMsgType) Error() string { return fmt.Sprintf("invalid message type: %d", e.MsgType) } // message is a generic 9P message. type message interface { encoder fmt.Stringer // Type returns the message type number. Type() MsgType } // payloader is a special message which may include an inline payload. type payloader interface { // FixedSize returns the size of the fixed portion of this message. FixedSize() uint32 // Payload returns the payload for sending. Payload() []byte // SetPayload returns the decoded message. // // This is going to be total message size - FixedSize. But this should // be validated during decode, which will be called after SetPayload. SetPayload([]byte) } // filer is a message capable of passing a file. type filer interface { // FilePayload returns the file payload. FilePayload() *fd.FD // SetFilePayload sets the file payload. 
SetFilePayload(*fd.FD) } // filePayload embeds a File object. type filePayload struct { File *fd.FD } // FilePayload returns the file payload. func (f *filePayload) FilePayload() *fd.FD { return f.File } // SetFilePayload sets the received file. func (f *filePayload) SetFilePayload(file *fd.FD) { f.File = file } // Tversion is a version request. type Tversion struct { // MSize is the message size to use. MSize uint32 // Version is the version string. // // For this implementation, this must be 9P2000.L. Version string } // decode implements encoder.decode. func (t *Tversion) decode(b *buffer) { t.MSize = b.Read32() t.Version = b.ReadString() } // encode implements encoder.encode. func (t *Tversion) encode(b *buffer) { b.Write32(t.MSize) b.WriteString(t.Version) } // Type implements message.Type. func (*Tversion) Type() MsgType { return MsgTversion } // String implements fmt.Stringer. func (t *Tversion) String() string { return fmt.Sprintf("Tversion{MSize: %d, Version: %s}", t.MSize, t.Version) } // Rversion is a version response. type Rversion struct { // MSize is the negotiated size. MSize uint32 // Version is the negotiated version. Version string } // decode implements encoder.decode. func (r *Rversion) decode(b *buffer) { r.MSize = b.Read32() r.Version = b.ReadString() } // encode implements encoder.encode. func (r *Rversion) encode(b *buffer) { b.Write32(r.MSize) b.WriteString(r.Version) } // Type implements message.Type. func (*Rversion) Type() MsgType { return MsgRversion } // String implements fmt.Stringer. func (r *Rversion) String() string { return fmt.Sprintf("Rversion{MSize: %d, Version: %s}", r.MSize, r.Version) } // Tflush is a flush request. type Tflush struct { // OldTag is the tag to wait on. OldTag Tag } // decode implements encoder.decode. func (t *Tflush) decode(b *buffer) { t.OldTag = b.ReadTag() } // encode implements encoder.encode. func (t *Tflush) encode(b *buffer) { b.WriteTag(t.OldTag) } // Type implements message.Type. func (*Tflush) Type() MsgType { return MsgTflush } // String implements fmt.Stringer. func (t *Tflush) String() string { return fmt.Sprintf("Tflush{OldTag: %d}", t.OldTag) } // Rflush is a flush response. type Rflush struct { } // decode implements encoder.decode. func (*Rflush) decode(*buffer) { } // encode implements encoder.encode. func (*Rflush) encode(*buffer) { } // Type implements message.Type. func (*Rflush) Type() MsgType { return MsgRflush } // String implements fmt.Stringer. func (r *Rflush) String() string { return "RFlush{}" } // Twalk is a walk request. type Twalk struct { // FID is the FID to be walked. FID FID // NewFID is the resulting FID. NewFID FID // Names are the set of names to be walked. Names []string } // decode implements encoder.decode. func (t *Twalk) decode(b *buffer) { t.FID = b.ReadFID() t.NewFID = b.ReadFID() n := b.Read16() t.Names = t.Names[:0] for i := 0; i < int(n); i++ { t.Names = append(t.Names, b.ReadString()) } } // encode implements encoder.encode. func (t *Twalk) encode(b *buffer) { b.WriteFID(t.FID) b.WriteFID(t.NewFID) b.Write16(uint16(len(t.Names))) for _, name := range t.Names { b.WriteString(name) } } // Type implements message.Type. func (*Twalk) Type() MsgType { return MsgTwalk } // String implements fmt.Stringer. func (t *Twalk) String() string { return fmt.Sprintf("Twalk{FID: %d, NewFID: %d, Names: %v}", t.FID, t.NewFID, t.Names) } // Rwalk is a walk response. type Rwalk struct { // QIDs are the set of QIDs returned. QIDs []QID } // decode implements encoder.decode. 
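// Illustration (not upstream code): encode and decode are exact mirrors of
// each other, so a message survives a round trip through a buffer. A sketch
// for Twalk, reusing the buffer pattern that Rreaddir uses below:
//
//	var enc buffer
//	in := Twalk{FID: 1, NewFID: 2, Names: []string{"usr", "bin"}}
//	in.encode(&enc) // FID, NewFID, 16-bit name count, then each name
//	dec := buffer{data: enc.data}
//	var out Twalk
//	out.decode(&dec) // out now mirrors in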
func (r *Rwalk) decode(b *buffer) { n := b.Read16() r.QIDs = r.QIDs[:0] for i := 0; i < int(n); i++ { var q QID q.decode(b) r.QIDs = append(r.QIDs, q) } } // encode implements encoder.encode. func (r *Rwalk) encode(b *buffer) { b.Write16(uint16(len(r.QIDs))) for i := range r.QIDs { r.QIDs[i].encode(b) } } // Type implements message.Type. func (*Rwalk) Type() MsgType { return MsgRwalk } // String implements fmt.Stringer. func (r *Rwalk) String() string { return fmt.Sprintf("Rwalk{QIDs: %v}", r.QIDs) } // Tclunk is a close request. type Tclunk struct { // FID is the FID to be closed. FID FID } // decode implements encoder.decode. func (t *Tclunk) decode(b *buffer) { t.FID = b.ReadFID() } // encode implements encoder.encode. func (t *Tclunk) encode(b *buffer) { b.WriteFID(t.FID) } // Type implements message.Type. func (*Tclunk) Type() MsgType { return MsgTclunk } // String implements fmt.Stringer. func (t *Tclunk) String() string { return fmt.Sprintf("Tclunk{FID: %d}", t.FID) } // Rclunk is a close response. type Rclunk struct { } // decode implements encoder.decode. func (*Rclunk) decode(*buffer) { } // encode implements encoder.encode. func (*Rclunk) encode(*buffer) { } // Type implements message.Type. func (*Rclunk) Type() MsgType { return MsgRclunk } // String implements fmt.Stringer. func (r *Rclunk) String() string { return "Rclunk{}" } // Tsetattrclunk is a setattr+close request. type Tsetattrclunk struct { // FID is the FID to change. FID FID // Valid is the set of bits which will be used. Valid SetAttrMask // SetAttr is the set request. SetAttr SetAttr } // decode implements encoder.decode. func (t *Tsetattrclunk) decode(b *buffer) { t.FID = b.ReadFID() t.Valid.decode(b) t.SetAttr.decode(b) } // encode implements encoder.encode. func (t *Tsetattrclunk) encode(b *buffer) { b.WriteFID(t.FID) t.Valid.encode(b) t.SetAttr.encode(b) } // Type implements message.Type. func (*Tsetattrclunk) Type() MsgType { return MsgTsetattrclunk } // String implements fmt.Stringer. func (t *Tsetattrclunk) String() string { return fmt.Sprintf("Tsetattrclunk{FID: %d, Valid: %v, SetAttr: %s}", t.FID, t.Valid, t.SetAttr) } // Rsetattrclunk is a setattr+close response. type Rsetattrclunk struct { } // decode implements encoder.decode. func (*Rsetattrclunk) decode(*buffer) { } // encode implements encoder.encode. func (*Rsetattrclunk) encode(*buffer) { } // Type implements message.Type. func (*Rsetattrclunk) Type() MsgType { return MsgRsetattrclunk } // String implements fmt.Stringer. func (r *Rsetattrclunk) String() string { return "Rsetattrclunk{}" } // Tremove is a remove request. // // This will eventually be replaced by Tunlinkat. type Tremove struct { // FID is the FID to be removed. FID FID } // decode implements encoder.decode. func (t *Tremove) decode(b *buffer) { t.FID = b.ReadFID() } // encode implements encoder.encode. func (t *Tremove) encode(b *buffer) { b.WriteFID(t.FID) } // Type implements message.Type. func (*Tremove) Type() MsgType { return MsgTremove } // String implements fmt.Stringer. func (t *Tremove) String() string { return fmt.Sprintf("Tremove{FID: %d}", t.FID) } // Rremove is a remove response. type Rremove struct { } // decode implements encoder.decode. func (*Rremove) decode(*buffer) { } // encode implements encoder.encode. func (*Rremove) encode(*buffer) { } // Type implements message.Type. func (*Rremove) Type() MsgType { return MsgRremove } // String implements fmt.Stringer. func (r *Rremove) String() string { return "Rremove{}" } // Rlerror is an error response. 
// // Note that this replaces the error code used in 9p. type Rlerror struct { Error uint32 } // decode implements encoder.decode. func (r *Rlerror) decode(b *buffer) { r.Error = b.Read32() } // encode implements encoder.encode. func (r *Rlerror) encode(b *buffer) { b.Write32(r.Error) } // Type implements message.Type. func (*Rlerror) Type() MsgType { return MsgRlerror } // String implements fmt.Stringer. func (r *Rlerror) String() string { return fmt.Sprintf("Rlerror{Error: %d}", r.Error) } // Tauth is an authentication request. type Tauth struct { // AuthenticationFID is the FID to attach the authentication result. AuthenticationFID FID // UserName is the user to attach. UserName string // AttachName is the attach name. AttachName string // UserID is the numeric identifier for UserName. UID UID } // decode implements encoder.decode. func (t *Tauth) decode(b *buffer) { t.AuthenticationFID = b.ReadFID() t.UserName = b.ReadString() t.AttachName = b.ReadString() t.UID = b.ReadUID() } // encode implements encoder.encode. func (t *Tauth) encode(b *buffer) { b.WriteFID(t.AuthenticationFID) b.WriteString(t.UserName) b.WriteString(t.AttachName) b.WriteUID(t.UID) } // Type implements message.Type. func (*Tauth) Type() MsgType { return MsgTauth } // String implements fmt.Stringer. func (t *Tauth) String() string { return fmt.Sprintf("Tauth{AuthFID: %d, UserName: %s, AttachName: %s, UID: %d", t.AuthenticationFID, t.UserName, t.AttachName, t.UID) } // Rauth is an authentication response. // // encode and decode are inherited directly from QID. type Rauth struct { QID } // Type implements message.Type. func (*Rauth) Type() MsgType { return MsgRauth } // String implements fmt.Stringer. func (r *Rauth) String() string { return fmt.Sprintf("Rauth{QID: %s}", r.QID) } // Tattach is an attach request. type Tattach struct { // FID is the FID to be attached. FID FID // Auth is the embedded authentication request. // // See client.Attach for information regarding authentication. Auth Tauth } // decode implements encoder.decode. func (t *Tattach) decode(b *buffer) { t.FID = b.ReadFID() t.Auth.decode(b) } // encode implements encoder.encode. func (t *Tattach) encode(b *buffer) { b.WriteFID(t.FID) t.Auth.encode(b) } // Type implements message.Type. func (*Tattach) Type() MsgType { return MsgTattach } // String implements fmt.Stringer. func (t *Tattach) String() string { return fmt.Sprintf("Tattach{FID: %d, AuthFID: %d, UserName: %s, AttachName: %s, UID: %d}", t.FID, t.Auth.AuthenticationFID, t.Auth.UserName, t.Auth.AttachName, t.Auth.UID) } // Rattach is an attach response. type Rattach struct { QID } // Type implements message.Type. func (*Rattach) Type() MsgType { return MsgRattach } // String implements fmt.Stringer. func (r *Rattach) String() string { return fmt.Sprintf("Rattach{QID: %s}", r.QID) } // Tlopen is an open request. type Tlopen struct { // FID is the FID to be opened. FID FID // Flags are the open flags. Flags OpenFlags } // decode implements encoder.decode. func (t *Tlopen) decode(b *buffer) { t.FID = b.ReadFID() t.Flags = b.ReadOpenFlags() } // encode implements encoder.encode. func (t *Tlopen) encode(b *buffer) { b.WriteFID(t.FID) b.WriteOpenFlags(t.Flags) } // Type implements message.Type. func (*Tlopen) Type() MsgType { return MsgTlopen } // String implements fmt.Stringer. func (t *Tlopen) String() string { return fmt.Sprintf("Tlopen{FID: %d, Flags: %v}", t.FID, t.Flags) } // Rlopen is a open response. type Rlopen struct { // QID is the file's QID. 
QID QID // IoUnit is the recommended I/O unit. IoUnit uint32 filePayload } // decode implements encoder.decode. func (r *Rlopen) decode(b *buffer) { r.QID.decode(b) r.IoUnit = b.Read32() } // encode implements encoder.encode. func (r *Rlopen) encode(b *buffer) { r.QID.encode(b) b.Write32(r.IoUnit) } // Type implements message.Type. func (*Rlopen) Type() MsgType { return MsgRlopen } // String implements fmt.Stringer. func (r *Rlopen) String() string { return fmt.Sprintf("Rlopen{QID: %s, IoUnit: %d, File: %v}", r.QID, r.IoUnit, r.File) } // Tlcreate is a create request. type Tlcreate struct { // FID is the parent FID. // // This becomes the new file. FID FID // Name is the file name to create. Name string // Mode is the open mode (O_RDWR, etc.). // // Note that flags like O_TRUNC are ignored, as is O_EXCL. All // create operations are exclusive. OpenFlags OpenFlags // Permissions is the set of permission bits. Permissions FileMode // GID is the group ID to use for creating the file. GID GID } // decode implements encoder.decode. func (t *Tlcreate) decode(b *buffer) { t.FID = b.ReadFID() t.Name = b.ReadString() t.OpenFlags = b.ReadOpenFlags() t.Permissions = b.ReadPermissions() t.GID = b.ReadGID() } // encode implements encoder.encode. func (t *Tlcreate) encode(b *buffer) { b.WriteFID(t.FID) b.WriteString(t.Name) b.WriteOpenFlags(t.OpenFlags) b.WritePermissions(t.Permissions) b.WriteGID(t.GID) } // Type implements message.Type. func (*Tlcreate) Type() MsgType { return MsgTlcreate } // String implements fmt.Stringer. func (t *Tlcreate) String() string { return fmt.Sprintf("Tlcreate{FID: %d, Name: %s, OpenFlags: %s, Permissions: 0o%o, GID: %d}", t.FID, t.Name, t.OpenFlags, t.Permissions, t.GID) } // Rlcreate is a create response. // // The encode, decode, etc. methods are inherited from Rlopen. type Rlcreate struct { Rlopen } // Type implements message.Type. func (*Rlcreate) Type() MsgType { return MsgRlcreate } // String implements fmt.Stringer. func (r *Rlcreate) String() string { return fmt.Sprintf("Rlcreate{QID: %s, IoUnit: %d, File: %v}", r.QID, r.IoUnit, r.File) } // Tsymlink is a symlink request. type Tsymlink struct { // Directory is the directory FID. Directory FID // Name is the new in the directory. Name string // Target is the symlink target. Target string // GID is the owning group. GID GID } // decode implements encoder.decode. func (t *Tsymlink) decode(b *buffer) { t.Directory = b.ReadFID() t.Name = b.ReadString() t.Target = b.ReadString() t.GID = b.ReadGID() } // encode implements encoder.encode. func (t *Tsymlink) encode(b *buffer) { b.WriteFID(t.Directory) b.WriteString(t.Name) b.WriteString(t.Target) b.WriteGID(t.GID) } // Type implements message.Type. func (*Tsymlink) Type() MsgType { return MsgTsymlink } // String implements fmt.Stringer. func (t *Tsymlink) String() string { return fmt.Sprintf("Tsymlink{DirectoryFID: %d, Name: %s, Target: %s, GID: %d}", t.Directory, t.Name, t.Target, t.GID) } // Rsymlink is a symlink response. type Rsymlink struct { // QID is the new symlink's QID. QID QID } // decode implements encoder.decode. func (r *Rsymlink) decode(b *buffer) { r.QID.decode(b) } // encode implements encoder.encode. func (r *Rsymlink) encode(b *buffer) { r.QID.encode(b) } // Type implements message.Type. func (*Rsymlink) Type() MsgType { return MsgRsymlink } // String implements fmt.Stringer. func (r *Rsymlink) String() string { return fmt.Sprintf("Rsymlink{QID: %s}", r.QID) } // Tlink is a link request. 
type Tlink struct { // Directory is the directory to contain the link. Directory FID // FID is the target. Target FID // Name is the new source name. Name string } // decode implements encoder.decode. func (t *Tlink) decode(b *buffer) { t.Directory = b.ReadFID() t.Target = b.ReadFID() t.Name = b.ReadString() } // encode implements encoder.encode. func (t *Tlink) encode(b *buffer) { b.WriteFID(t.Directory) b.WriteFID(t.Target) b.WriteString(t.Name) } // Type implements message.Type. func (*Tlink) Type() MsgType { return MsgTlink } // String implements fmt.Stringer. func (t *Tlink) String() string { return fmt.Sprintf("Tlink{DirectoryFID: %d, TargetFID: %d, Name: %s}", t.Directory, t.Target, t.Name) } // Rlink is a link response. type Rlink struct { } // Type implements message.Type. func (*Rlink) Type() MsgType { return MsgRlink } // decode implements encoder.decode. func (*Rlink) decode(*buffer) { } // encode implements encoder.encode. func (*Rlink) encode(*buffer) { } // String implements fmt.Stringer. func (r *Rlink) String() string { return "Rlink{}" } // Trenameat is a rename request. type Trenameat struct { // OldDirectory is the source directory. OldDirectory FID // OldName is the source file name. OldName string // NewDirectory is the target directory. NewDirectory FID // NewName is the new file name. NewName string } // decode implements encoder.decode. func (t *Trenameat) decode(b *buffer) { t.OldDirectory = b.ReadFID() t.OldName = b.ReadString() t.NewDirectory = b.ReadFID() t.NewName = b.ReadString() } // encode implements encoder.encode. func (t *Trenameat) encode(b *buffer) { b.WriteFID(t.OldDirectory) b.WriteString(t.OldName) b.WriteFID(t.NewDirectory) b.WriteString(t.NewName) } // Type implements message.Type. func (*Trenameat) Type() MsgType { return MsgTrenameat } // String implements fmt.Stringer. func (t *Trenameat) String() string { return fmt.Sprintf("TrenameAt{OldDirectoryFID: %d, OldName: %s, NewDirectoryFID: %d, NewName: %s}", t.OldDirectory, t.OldName, t.NewDirectory, t.NewName) } // Rrenameat is a rename response. type Rrenameat struct { } // decode implements encoder.decode. func (*Rrenameat) decode(*buffer) { } // encode implements encoder.encode. func (*Rrenameat) encode(*buffer) { } // Type implements message.Type. func (*Rrenameat) Type() MsgType { return MsgRrenameat } // String implements fmt.Stringer. func (r *Rrenameat) String() string { return "Rrenameat{}" } // Tunlinkat is an unlink request. type Tunlinkat struct { // Directory is the originating directory. Directory FID // Name is the name of the entry to unlink. Name string // Flags are extra flags (e.g. O_DIRECTORY). These are not interpreted by p9. Flags uint32 } // decode implements encoder.decode. func (t *Tunlinkat) decode(b *buffer) { t.Directory = b.ReadFID() t.Name = b.ReadString() t.Flags = b.Read32() } // encode implements encoder.encode. func (t *Tunlinkat) encode(b *buffer) { b.WriteFID(t.Directory) b.WriteString(t.Name) b.Write32(t.Flags) } // Type implements message.Type. func (*Tunlinkat) Type() MsgType { return MsgTunlinkat } // String implements fmt.Stringer. func (t *Tunlinkat) String() string { return fmt.Sprintf("Tunlinkat{DirectoryFID: %d, Name: %s, Flags: 0x%X}", t.Directory, t.Name, t.Flags) } // Runlinkat is an unlink response. type Runlinkat struct { } // decode implements encoder.decode. func (*Runlinkat) decode(*buffer) { } // encode implements encoder.encode. func (*Runlinkat) encode(*buffer) { } // Type implements message.Type. 
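// Illustration (src and dst are hypothetical FIDs): Trenameat and Tunlinkat
// address their targets by parent-directory FID plus name rather than by a
// FID for the file itself:
//
//	mv := Trenameat{OldDirectory: src, OldName: "a", NewDirectory: dst, NewName: "b"}
//	rm := Tunlinkat{Directory: dst, Name: "b", Flags: 0}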
func (*Runlinkat) Type() MsgType { return MsgRunlinkat } // String implements fmt.Stringer. func (r *Runlinkat) String() string { return "Runlinkat{}" } // Trename is a rename request. // // Note that this generally isn't used anymore, and ideally all rename calls // should Trenameat below. type Trename struct { // FID is the FID to rename. FID FID // Directory is the target directory. Directory FID // Name is the new file name. Name string } // decode implements encoder.decode. func (t *Trename) decode(b *buffer) { t.FID = b.ReadFID() t.Directory = b.ReadFID() t.Name = b.ReadString() } // encode implements encoder.encode. func (t *Trename) encode(b *buffer) { b.WriteFID(t.FID) b.WriteFID(t.Directory) b.WriteString(t.Name) } // Type implements message.Type. func (*Trename) Type() MsgType { return MsgTrename } // String implements fmt.Stringer. func (t *Trename) String() string { return fmt.Sprintf("Trename{FID: %d, DirectoryFID: %d, Name: %s}", t.FID, t.Directory, t.Name) } // Rrename is a rename response. type Rrename struct { } // decode implements encoder.decode. func (*Rrename) decode(*buffer) { } // encode implements encoder.encode. func (*Rrename) encode(*buffer) { } // Type implements message.Type. func (*Rrename) Type() MsgType { return MsgRrename } // String implements fmt.Stringer. func (r *Rrename) String() string { return "Rrename{}" } // Treadlink is a readlink request. type Treadlink struct { // FID is the symlink. FID FID } // decode implements encoder.decode. func (t *Treadlink) decode(b *buffer) { t.FID = b.ReadFID() } // encode implements encoder.encode. func (t *Treadlink) encode(b *buffer) { b.WriteFID(t.FID) } // Type implements message.Type. func (*Treadlink) Type() MsgType { return MsgTreadlink } // String implements fmt.Stringer. func (t *Treadlink) String() string { return fmt.Sprintf("Treadlink{FID: %d}", t.FID) } // Rreadlink is a readlink response. type Rreadlink struct { // Target is the symlink target. Target string } // decode implements encoder.decode. func (r *Rreadlink) decode(b *buffer) { r.Target = b.ReadString() } // encode implements encoder.encode. func (r *Rreadlink) encode(b *buffer) { b.WriteString(r.Target) } // Type implements message.Type. func (*Rreadlink) Type() MsgType { return MsgRreadlink } // String implements fmt.Stringer. func (r *Rreadlink) String() string { return fmt.Sprintf("Rreadlink{Target: %s}", r.Target) } // Tread is a read request. type Tread struct { // FID is the FID to read. FID FID // Offset indicates the file offset. Offset uint64 // Count indicates the number of bytes to read. Count uint32 } // decode implements encoder.decode. func (t *Tread) decode(b *buffer) { t.FID = b.ReadFID() t.Offset = b.Read64() t.Count = b.Read32() } // encode implements encoder.encode. func (t *Tread) encode(b *buffer) { b.WriteFID(t.FID) b.Write64(t.Offset) b.Write32(t.Count) } // Type implements message.Type. func (*Tread) Type() MsgType { return MsgTread } // String implements fmt.Stringer. func (t *Tread) String() string { return fmt.Sprintf("Tread{FID: %d, Offset: %d, Count: %d}", t.FID, t.Offset, t.Count) } // Rread is the response for a Tread. type Rread struct { // Data is the resulting data. Data []byte } // decode implements encoder.decode. // // Data is automatically decoded via Payload. func (r *Rread) decode(b *buffer) { count := b.Read32() if count != uint32(len(r.Data)) { b.markOverrun() } } // encode implements encoder.encode. // // Data is automatically encoded via Payload. 
func (r *Rread) encode(b *buffer) { b.Write32(uint32(len(r.Data))) } // Type implements message.Type. func (*Rread) Type() MsgType { return MsgRread } // FixedSize implements payloader.FixedSize. func (*Rread) FixedSize() uint32 { return 4 } // Payload implements payloader.Payload. func (r *Rread) Payload() []byte { return r.Data } // SetPayload implements payloader.SetPayload. func (r *Rread) SetPayload(p []byte) { r.Data = p } // String implements fmt.Stringer. func (r *Rread) String() string { return fmt.Sprintf("Rread{len(Data): %d}", len(r.Data)) } // Twrite is a write request. type Twrite struct { // FID is the FID to read. FID FID // Offset indicates the file offset. Offset uint64 // Data is the data to be written. Data []byte } // decode implements encoder.decode. func (t *Twrite) decode(b *buffer) { t.FID = b.ReadFID() t.Offset = b.Read64() count := b.Read32() if count != uint32(len(t.Data)) { b.markOverrun() } } // encode implements encoder.encode. // // This uses the buffer payload to avoid a copy. func (t *Twrite) encode(b *buffer) { b.WriteFID(t.FID) b.Write64(t.Offset) b.Write32(uint32(len(t.Data))) } // Type implements message.Type. func (*Twrite) Type() MsgType { return MsgTwrite } // FixedSize implements payloader.FixedSize. func (*Twrite) FixedSize() uint32 { return 16 } // Payload implements payloader.Payload. func (t *Twrite) Payload() []byte { return t.Data } // SetPayload implements payloader.SetPayload. func (t *Twrite) SetPayload(p []byte) { t.Data = p } // String implements fmt.Stringer. func (t *Twrite) String() string { return fmt.Sprintf("Twrite{FID: %v, Offset %d, len(Data): %d}", t.FID, t.Offset, len(t.Data)) } // Rwrite is the response for a Twrite. type Rwrite struct { // Count indicates the number of bytes successfully written. Count uint32 } // decode implements encoder.decode. func (r *Rwrite) decode(b *buffer) { r.Count = b.Read32() } // encode implements encoder.encode. func (r *Rwrite) encode(b *buffer) { b.Write32(r.Count) } // Type implements message.Type. func (*Rwrite) Type() MsgType { return MsgRwrite } // String implements fmt.Stringer. func (r *Rwrite) String() string { return fmt.Sprintf("Rwrite{Count: %d}", r.Count) } // Tmknod is a mknod request. type Tmknod struct { // Directory is the parent directory. Directory FID // Name is the device name. Name string // Mode is the device mode and permissions. Mode FileMode // Major is the device major number. Major uint32 // Minor is the device minor number. Minor uint32 // GID is the device GID. GID GID } // decode implements encoder.decode. func (t *Tmknod) decode(b *buffer) { t.Directory = b.ReadFID() t.Name = b.ReadString() t.Mode = b.ReadFileMode() t.Major = b.Read32() t.Minor = b.Read32() t.GID = b.ReadGID() } // encode implements encoder.encode. func (t *Tmknod) encode(b *buffer) { b.WriteFID(t.Directory) b.WriteString(t.Name) b.WriteFileMode(t.Mode) b.Write32(t.Major) b.Write32(t.Minor) b.WriteGID(t.GID) } // Type implements message.Type. func (*Tmknod) Type() MsgType { return MsgTmknod } // String implements fmt.Stringer. func (t *Tmknod) String() string { return fmt.Sprintf("Tmknod{DirectoryFID: %d, Name: %s, Mode: 0o%o, Major: %d, Minor: %d, GID: %d}", t.Directory, t.Name, t.Mode, t.Major, t.Minor, t.GID) } // Rmknod is a mknod response. type Rmknod struct { // QID is the resulting QID. QID QID } // decode implements encoder.decode. func (r *Rmknod) decode(b *buffer) { r.QID.decode(b) } // encode implements encoder.encode. 
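// Note: Twrite and Rread are the two payload-carrying messages. Their encode
// methods write only the fixed fields plus a 32-bit count; the data itself
// travels via Payload/SetPayload, so the bytes carried per message are
// bounded by the negotiated message size minus the fixed portion (FixedSize
// is 16 for Twrite and 4 for Rread). A hypothetical write (f, off and buf are
// assumptions, not upstream identifiers):
//
//	t := Twrite{FID: f, Offset: off, Data: buf}
//	// t.Payload() returns buf; t.encode writes FID, Offset and len(buf).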
func (r *Rmknod) encode(b *buffer) { r.QID.encode(b) } // Type implements message.Type. func (*Rmknod) Type() MsgType { return MsgRmknod } // String implements fmt.Stringer. func (r *Rmknod) String() string { return fmt.Sprintf("Rmknod{QID: %s}", r.QID) } // Tmkdir is a mkdir request. type Tmkdir struct { // Directory is the parent directory. Directory FID // Name is the new directory name. Name string // Permissions is the set of permission bits. Permissions FileMode // GID is the owning group. GID GID } // decode implements encoder.decode. func (t *Tmkdir) decode(b *buffer) { t.Directory = b.ReadFID() t.Name = b.ReadString() t.Permissions = b.ReadPermissions() t.GID = b.ReadGID() } // encode implements encoder.encode. func (t *Tmkdir) encode(b *buffer) { b.WriteFID(t.Directory) b.WriteString(t.Name) b.WritePermissions(t.Permissions) b.WriteGID(t.GID) } // Type implements message.Type. func (*Tmkdir) Type() MsgType { return MsgTmkdir } // String implements fmt.Stringer. func (t *Tmkdir) String() string { return fmt.Sprintf("Tmkdir{DirectoryFID: %d, Name: %s, Permissions: 0o%o, GID: %d}", t.Directory, t.Name, t.Permissions, t.GID) } // Rmkdir is a mkdir response. type Rmkdir struct { // QID is the resulting QID. QID QID } // decode implements encoder.decode. func (r *Rmkdir) decode(b *buffer) { r.QID.decode(b) } // encode implements encoder.encode. func (r *Rmkdir) encode(b *buffer) { r.QID.encode(b) } // Type implements message.Type. func (*Rmkdir) Type() MsgType { return MsgRmkdir } // String implements fmt.Stringer. func (r *Rmkdir) String() string { return fmt.Sprintf("Rmkdir{QID: %s}", r.QID) } // Tgetattr is a getattr request. type Tgetattr struct { // FID is the FID to get attributes for. FID FID // AttrMask is the set of attributes to get. AttrMask AttrMask } // decode implements encoder.decode. func (t *Tgetattr) decode(b *buffer) { t.FID = b.ReadFID() t.AttrMask.decode(b) } // encode implements encoder.encode. func (t *Tgetattr) encode(b *buffer) { b.WriteFID(t.FID) t.AttrMask.encode(b) } // Type implements message.Type. func (*Tgetattr) Type() MsgType { return MsgTgetattr } // String implements fmt.Stringer. func (t *Tgetattr) String() string { return fmt.Sprintf("Tgetattr{FID: %d, AttrMask: %s}", t.FID, t.AttrMask) } // Rgetattr is a getattr response. type Rgetattr struct { // Valid indicates which fields are valid. Valid AttrMask // QID is the QID for this file. QID // Attr is the set of attributes. Attr Attr } // decode implements encoder.decode. func (r *Rgetattr) decode(b *buffer) { r.Valid.decode(b) r.QID.decode(b) r.Attr.decode(b) } // encode implements encoder.encode. func (r *Rgetattr) encode(b *buffer) { r.Valid.encode(b) r.QID.encode(b) r.Attr.encode(b) } // Type implements message.Type. func (*Rgetattr) Type() MsgType { return MsgRgetattr } // String implements fmt.Stringer. func (r *Rgetattr) String() string { return fmt.Sprintf("Rgetattr{Valid: %v, QID: %s, Attr: %s}", r.Valid, r.QID, r.Attr) } // Tsetattr is a setattr request. type Tsetattr struct { // FID is the FID to change. FID FID // Valid is the set of bits which will be used. Valid SetAttrMask // SetAttr is the set request. SetAttr SetAttr } // decode implements encoder.decode. func (t *Tsetattr) decode(b *buffer) { t.FID = b.ReadFID() t.Valid.decode(b) t.SetAttr.decode(b) } // encode implements encoder.encode. func (t *Tsetattr) encode(b *buffer) { b.WriteFID(t.FID) t.Valid.encode(b) t.SetAttr.encode(b) } // Type implements message.Type. 
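// Illustration (f is a hypothetical FID): Tgetattr lets a caller request only
// a subset of attributes, and Rgetattr.Valid reports which of them the server
// actually filled in. Requesting everything, as the handlers above do, looks
// like:
//
//	req := Tgetattr{FID: f, AttrMask: AttrMaskAll()}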
func (*Tsetattr) Type() MsgType { return MsgTsetattr } // String implements fmt.Stringer. func (t *Tsetattr) String() string { return fmt.Sprintf("Tsetattr{FID: %d, Valid: %v, SetAttr: %s}", t.FID, t.Valid, t.SetAttr) } // Rsetattr is a setattr response. type Rsetattr struct { } // decode implements encoder.decode. func (*Rsetattr) decode(*buffer) { } // encode implements encoder.encode. func (*Rsetattr) encode(*buffer) { } // Type implements message.Type. func (*Rsetattr) Type() MsgType { return MsgRsetattr } // String implements fmt.Stringer. func (r *Rsetattr) String() string { return "Rsetattr{}" } // Tallocate is an allocate request. This is an extension to 9P protocol, not // present in the 9P2000.L standard. type Tallocate struct { FID FID Mode AllocateMode Offset uint64 Length uint64 } // decode implements encoder.decode. func (t *Tallocate) decode(b *buffer) { t.FID = b.ReadFID() t.Mode.decode(b) t.Offset = b.Read64() t.Length = b.Read64() } // encode implements encoder.encode. func (t *Tallocate) encode(b *buffer) { b.WriteFID(t.FID) t.Mode.encode(b) b.Write64(t.Offset) b.Write64(t.Length) } // Type implements message.Type. func (*Tallocate) Type() MsgType { return MsgTallocate } // String implements fmt.Stringer. func (t *Tallocate) String() string { return fmt.Sprintf("Tallocate{FID: %d, Offset: %d, Length: %d}", t.FID, t.Offset, t.Length) } // Rallocate is an allocate response. type Rallocate struct { } // decode implements encoder.decode. func (*Rallocate) decode(*buffer) { } // encode implements encoder.encode. func (*Rallocate) encode(*buffer) { } // Type implements message.Type. func (*Rallocate) Type() MsgType { return MsgRallocate } // String implements fmt.Stringer. func (r *Rallocate) String() string { return "Rallocate{}" } // Tlistxattr is a listxattr request. type Tlistxattr struct { // FID refers to the file on which to list xattrs. FID FID // Size is the buffer size for the xattr list. Size uint64 } // decode implements encoder.decode. func (t *Tlistxattr) decode(b *buffer) { t.FID = b.ReadFID() t.Size = b.Read64() } // encode implements encoder.encode. func (t *Tlistxattr) encode(b *buffer) { b.WriteFID(t.FID) b.Write64(t.Size) } // Type implements message.Type. func (*Tlistxattr) Type() MsgType { return MsgTlistxattr } // String implements fmt.Stringer. func (t *Tlistxattr) String() string { return fmt.Sprintf("Tlistxattr{FID: %d, Size: %d}", t.FID, t.Size) } // Rlistxattr is a listxattr response. type Rlistxattr struct { // Xattrs is a list of extended attribute names. Xattrs []string } // decode implements encoder.decode. func (r *Rlistxattr) decode(b *buffer) { n := b.Read16() r.Xattrs = r.Xattrs[:0] for i := 0; i < int(n); i++ { r.Xattrs = append(r.Xattrs, b.ReadString()) } } // encode implements encoder.encode. func (r *Rlistxattr) encode(b *buffer) { b.Write16(uint16(len(r.Xattrs))) for _, x := range r.Xattrs { b.WriteString(x) } } // Type implements message.Type. func (*Rlistxattr) Type() MsgType { return MsgRlistxattr } // String implements fmt.Stringer. func (r *Rlistxattr) String() string { return fmt.Sprintf("Rlistxattr{Xattrs: %v}", r.Xattrs) } // Txattrwalk walks extended attributes. type Txattrwalk struct { // FID is the FID to check for attributes. FID FID // NewFID is the new FID associated with the attributes. NewFID FID // Name is the attribute name. Name string } // decode implements encoder.decode. func (t *Txattrwalk) decode(b *buffer) { t.FID = b.ReadFID() t.NewFID = b.ReadFID() t.Name = b.ReadString() } // encode implements encoder.encode. 
func (t *Txattrwalk) encode(b *buffer) { b.WriteFID(t.FID) b.WriteFID(t.NewFID) b.WriteString(t.Name) } // Type implements message.Type. func (*Txattrwalk) Type() MsgType { return MsgTxattrwalk } // String implements fmt.Stringer. func (t *Txattrwalk) String() string { return fmt.Sprintf("Txattrwalk{FID: %d, NewFID: %d, Name: %s}", t.FID, t.NewFID, t.Name) } // Rxattrwalk is a xattrwalk response. type Rxattrwalk struct { // Size is the size of the extended attribute. Size uint64 } // decode implements encoder.decode. func (r *Rxattrwalk) decode(b *buffer) { r.Size = b.Read64() } // encode implements encoder.encode. func (r *Rxattrwalk) encode(b *buffer) { b.Write64(r.Size) } // Type implements message.Type. func (*Rxattrwalk) Type() MsgType { return MsgRxattrwalk } // String implements fmt.Stringer. func (r *Rxattrwalk) String() string { return fmt.Sprintf("Rxattrwalk{Size: %d}", r.Size) } // Txattrcreate prepare to set extended attributes. type Txattrcreate struct { // FID is input/output parameter, it identifies the file on which // extended attributes will be set but after successful Rxattrcreate // it is used to write the extended attribute value. FID FID // Name is the attribute name. Name string // Size of the attribute value. When the FID is clunked it has to match // the number of bytes written to the FID. AttrSize uint64 // Linux setxattr(2) flags. Flags uint32 } // decode implements encoder.decode. func (t *Txattrcreate) decode(b *buffer) { t.FID = b.ReadFID() t.Name = b.ReadString() t.AttrSize = b.Read64() t.Flags = b.Read32() } // encode implements encoder.encode. func (t *Txattrcreate) encode(b *buffer) { b.WriteFID(t.FID) b.WriteString(t.Name) b.Write64(t.AttrSize) b.Write32(t.Flags) } // Type implements message.Type. func (*Txattrcreate) Type() MsgType { return MsgTxattrcreate } // String implements fmt.Stringer. func (t *Txattrcreate) String() string { return fmt.Sprintf("Txattrcreate{FID: %d, Name: %s, AttrSize: %d, Flags: %d}", t.FID, t.Name, t.AttrSize, t.Flags) } // Rxattrcreate is a xattrcreate response. type Rxattrcreate struct { } // decode implements encoder.decode. func (r *Rxattrcreate) decode(*buffer) { } // encode implements encoder.encode. func (r *Rxattrcreate) encode(*buffer) { } // Type implements message.Type. func (*Rxattrcreate) Type() MsgType { return MsgRxattrcreate } // String implements fmt.Stringer. func (r *Rxattrcreate) String() string { return "Rxattrcreate{}" } // Tgetxattr is a getxattr request. type Tgetxattr struct { // FID refers to the file for which to get xattrs. FID FID // Name is the xattr to get. Name string // Size is the buffer size for the xattr to get. Size uint64 } // decode implements encoder.decode. func (t *Tgetxattr) decode(b *buffer) { t.FID = b.ReadFID() t.Name = b.ReadString() t.Size = b.Read64() } // encode implements encoder.encode. func (t *Tgetxattr) encode(b *buffer) { b.WriteFID(t.FID) b.WriteString(t.Name) b.Write64(t.Size) } // Type implements message.Type. func (*Tgetxattr) Type() MsgType { return MsgTgetxattr } // String implements fmt.Stringer. func (t *Tgetxattr) String() string { return fmt.Sprintf("Tgetxattr{FID: %d, Name: %s, Size: %d}", t.FID, t.Name, t.Size) } // Rgetxattr is a getxattr response. type Rgetxattr struct { // Value is the extended attribute value. Value string } // decode implements encoder.decode. func (r *Rgetxattr) decode(b *buffer) { r.Value = b.ReadString() } // encode implements encoder.encode. 
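// Note: Txattrwalk and Txattrcreate form the two-step 9P2000.L extended
// attribute protocol (walk to an attribute FID, then read or write through
// it). The handlers earlier in this package decline those with
// ENODATA/ENOSYS and instead serve the direct Tgetxattr, Tsetxattr,
// Tlistxattr and Tremovexattr messages, e.g. (f is a hypothetical FID):
//
//	get := Tgetxattr{FID: f, Name: "user.test", Size: 4096}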
func (r *Rgetxattr) encode(b *buffer) { b.WriteString(r.Value) } // Type implements message.Type. func (*Rgetxattr) Type() MsgType { return MsgRgetxattr } // String implements fmt.Stringer. func (r *Rgetxattr) String() string { return fmt.Sprintf("Rgetxattr{Value: %s}", r.Value) } // Tsetxattr sets extended attributes. type Tsetxattr struct { // FID refers to the file on which to set xattrs. FID FID // Name is the attribute name. Name string // Value is the attribute value. Value string // Linux setxattr(2) flags. Flags uint32 } // decode implements encoder.decode. func (t *Tsetxattr) decode(b *buffer) { t.FID = b.ReadFID() t.Name = b.ReadString() t.Value = b.ReadString() t.Flags = b.Read32() } // encode implements encoder.encode. func (t *Tsetxattr) encode(b *buffer) { b.WriteFID(t.FID) b.WriteString(t.Name) b.WriteString(t.Value) b.Write32(t.Flags) } // Type implements message.Type. func (*Tsetxattr) Type() MsgType { return MsgTsetxattr } // String implements fmt.Stringer. func (t *Tsetxattr) String() string { return fmt.Sprintf("Tsetxattr{FID: %d, Name: %s, Value: %s, Flags: %d}", t.FID, t.Name, t.Value, t.Flags) } // Rsetxattr is a setxattr response. type Rsetxattr struct { } // decode implements encoder.decode. func (r *Rsetxattr) decode(*buffer) { } // encode implements encoder.encode. func (r *Rsetxattr) encode(*buffer) { } // Type implements message.Type. func (*Rsetxattr) Type() MsgType { return MsgRsetxattr } // String implements fmt.Stringer. func (r *Rsetxattr) String() string { return "Rsetxattr{}" } // Tremovexattr is a removexattr request. type Tremovexattr struct { // FID refers to the file on which to set xattrs. FID FID // Name is the attribute name. Name string } // decode implements encoder.decode. func (t *Tremovexattr) decode(b *buffer) { t.FID = b.ReadFID() t.Name = b.ReadString() } // encode implements encoder.encode. func (t *Tremovexattr) encode(b *buffer) { b.WriteFID(t.FID) b.WriteString(t.Name) } // Type implements message.Type. func (*Tremovexattr) Type() MsgType { return MsgTremovexattr } // String implements fmt.Stringer. func (t *Tremovexattr) String() string { return fmt.Sprintf("Tremovexattr{FID: %d, Name: %s}", t.FID, t.Name) } // Rremovexattr is a removexattr response. type Rremovexattr struct { } // decode implements encoder.decode. func (r *Rremovexattr) decode(*buffer) { } // encode implements encoder.encode. func (r *Rremovexattr) encode(*buffer) { } // Type implements message.Type. func (*Rremovexattr) Type() MsgType { return MsgRremovexattr } // String implements fmt.Stringer. func (r *Rremovexattr) String() string { return "Rremovexattr{}" } // Treaddir is a readdir request. type Treaddir struct { // Directory is the directory FID to read. Directory FID // DirentOffset is the dirent offset to read at. DirentOffset uint64 // Count is the number of bytes to read. Count uint32 } // decode implements encoder.decode. func (t *Treaddir) decode(b *buffer) { t.Directory = b.ReadFID() t.DirentOffset = b.Read64() t.Count = b.Read32() } // encode implements encoder.encode. func (t *Treaddir) encode(b *buffer) { b.WriteFID(t.Directory) b.Write64(t.DirentOffset) b.Write32(t.Count) } // Type implements message.Type. func (*Treaddir) Type() MsgType { return MsgTreaddir } // String implements fmt.Stringer. func (t *Treaddir) String() string { return fmt.Sprintf("Treaddir{DirectoryFID: %d, DirentOffset: %d, Count: %d}", t.Directory, t.DirentOffset, t.Count) } // Rreaddir is a readdir response. type Rreaddir struct { // Count is the byte limit. 
// // This should always be set from the Treaddir request. Count uint32 // Entries are the resulting entries. // // This may be constructed in decode. Entries []Dirent // payload is the encoded payload. // // This is constructed by encode. payload []byte } // decode implements encoder.decode. func (r *Rreaddir) decode(b *buffer) { r.Count = b.Read32() entriesBuf := buffer{data: r.payload} r.Entries = r.Entries[:0] for { var d Dirent d.decode(&entriesBuf) if entriesBuf.isOverrun() { // Couldn't decode a complete entry. break } r.Entries = append(r.Entries, d) } } // encode implements encoder.encode. func (r *Rreaddir) encode(b *buffer) { entriesBuf := buffer{} payloadSize := 0 for i, d := range r.Entries { d.encode(&entriesBuf) if len(entriesBuf.data) > int(r.Count) { log.Warningf("hit Rreaddir.Count limit while encoding dirents, discarding %d dirents", len(r.Entries)-i) break } payloadSize = len(entriesBuf.data) } r.Count = uint32(payloadSize) r.payload = entriesBuf.data[:payloadSize] b.Write32(r.Count) } // Type implements message.Type. func (*Rreaddir) Type() MsgType { return MsgRreaddir } // FixedSize implements payloader.FixedSize. func (*Rreaddir) FixedSize() uint32 { return 4 } // Payload implements payloader.Payload. func (r *Rreaddir) Payload() []byte { return r.payload } // SetPayload implements payloader.SetPayload. func (r *Rreaddir) SetPayload(p []byte) { r.payload = p } // String implements fmt.Stringer. func (r *Rreaddir) String() string { return fmt.Sprintf("Rreaddir{Count: %d, Entries: %s}", r.Count, r.Entries) } // Tfsync is an fsync request. type Tfsync struct { // FID is the fid to sync. FID FID } // decode implements encoder.decode. func (t *Tfsync) decode(b *buffer) { t.FID = b.ReadFID() } // encode implements encoder.encode. func (t *Tfsync) encode(b *buffer) { b.WriteFID(t.FID) } // Type implements message.Type. func (*Tfsync) Type() MsgType { return MsgTfsync } // String implements fmt.Stringer. func (t *Tfsync) String() string { return fmt.Sprintf("Tfsync{FID: %d}", t.FID) } // Rfsync is an fsync response. type Rfsync struct { } // decode implements encoder.decode. func (*Rfsync) decode(*buffer) { } // encode implements encoder.encode. func (*Rfsync) encode(*buffer) { } // Type implements message.Type. func (*Rfsync) Type() MsgType { return MsgRfsync } // String implements fmt.Stringer. func (r *Rfsync) String() string { return "Rfsync{}" } // Tstatfs is a stat request. type Tstatfs struct { // FID is the root. FID FID } // decode implements encoder.decode. func (t *Tstatfs) decode(b *buffer) { t.FID = b.ReadFID() } // encode implements encoder.encode. func (t *Tstatfs) encode(b *buffer) { b.WriteFID(t.FID) } // Type implements message.Type. func (*Tstatfs) Type() MsgType { return MsgTstatfs } // String implements fmt.Stringer. func (t *Tstatfs) String() string { return fmt.Sprintf("Tstatfs{FID: %d}", t.FID) } // Rstatfs is the response for a Tstatfs. type Rstatfs struct { // FSStat is the stat result. FSStat FSStat } // decode implements encoder.decode. func (r *Rstatfs) decode(b *buffer) { r.FSStat.decode(b) } // encode implements encoder.encode. func (r *Rstatfs) encode(b *buffer) { r.FSStat.encode(b) } // Type implements message.Type. func (*Rstatfs) Type() MsgType { return MsgRstatfs } // String implements fmt.Stringer. func (r *Rstatfs) String() string { return fmt.Sprintf("Rstatfs{FSStat: %v}", r.FSStat) } // Tflushf is a flush file request, not to be confused with Tflush. type Tflushf struct { // FID is the FID to be flushed. 
FID FID } // decode implements encoder.decode. func (t *Tflushf) decode(b *buffer) { t.FID = b.ReadFID() } // encode implements encoder.encode. func (t *Tflushf) encode(b *buffer) { b.WriteFID(t.FID) } // Type implements message.Type. func (*Tflushf) Type() MsgType { return MsgTflushf } // String implements fmt.Stringer. func (t *Tflushf) String() string { return fmt.Sprintf("Tflushf{FID: %d}", t.FID) } // Rflushf is a flush file response. type Rflushf struct { } // decode implements encoder.decode. func (*Rflushf) decode(*buffer) { } // encode implements encoder.encode. func (*Rflushf) encode(*buffer) { } // Type implements message.Type. func (*Rflushf) Type() MsgType { return MsgRflushf } // String implements fmt.Stringer. func (*Rflushf) String() string { return "Rflushf{}" } // Twalkgetattr is a walk request. type Twalkgetattr struct { // FID is the FID to be walked. FID FID // NewFID is the resulting FID. NewFID FID // Names are the set of names to be walked. Names []string } // decode implements encoder.decode. func (t *Twalkgetattr) decode(b *buffer) { t.FID = b.ReadFID() t.NewFID = b.ReadFID() n := b.Read16() t.Names = t.Names[:0] for i := 0; i < int(n); i++ { t.Names = append(t.Names, b.ReadString()) } } // encode implements encoder.encode. func (t *Twalkgetattr) encode(b *buffer) { b.WriteFID(t.FID) b.WriteFID(t.NewFID) b.Write16(uint16(len(t.Names))) for _, name := range t.Names { b.WriteString(name) } } // Type implements message.Type. func (*Twalkgetattr) Type() MsgType { return MsgTwalkgetattr } // String implements fmt.Stringer. func (t *Twalkgetattr) String() string { return fmt.Sprintf("Twalkgetattr{FID: %d, NewFID: %d, Names: %v}", t.FID, t.NewFID, t.Names) } // Rwalkgetattr is a walk response. type Rwalkgetattr struct { // Valid indicates which fields are valid in the Attr below. Valid AttrMask // Attr is the set of attributes for the last QID (the file walked to). Attr Attr // QIDs are the set of QIDs returned. QIDs []QID } // decode implements encoder.decode. func (r *Rwalkgetattr) decode(b *buffer) { r.Valid.decode(b) r.Attr.decode(b) n := b.Read16() r.QIDs = r.QIDs[:0] for i := 0; i < int(n); i++ { var q QID q.decode(b) r.QIDs = append(r.QIDs, q) } } // encode implements encoder.encode. func (r *Rwalkgetattr) encode(b *buffer) { r.Valid.encode(b) r.Attr.encode(b) b.Write16(uint16(len(r.QIDs))) for i := range r.QIDs { r.QIDs[i].encode(b) } } // Type implements message.Type. func (*Rwalkgetattr) Type() MsgType { return MsgRwalkgetattr } // String implements fmt.Stringer. func (r *Rwalkgetattr) String() string { return fmt.Sprintf("Rwalkgetattr{Valid: %s, Attr: %s, QIDs: %v}", r.Valid, r.Attr, r.QIDs) } // Tucreate is a Tlcreate message that includes a UID. type Tucreate struct { Tlcreate // UID is the UID to use as the effective UID in creation messages. UID UID } // decode implements encoder.decode. func (t *Tucreate) decode(b *buffer) { t.Tlcreate.decode(b) t.UID = b.ReadUID() } // encode implements encoder.encode. func (t *Tucreate) encode(b *buffer) { t.Tlcreate.encode(b) b.WriteUID(t.UID) } // Type implements message.Type. func (t *Tucreate) Type() MsgType { return MsgTucreate } // String implements fmt.Stringer. func (t *Tucreate) String() string { return fmt.Sprintf("Tucreate{Tlcreate: %v, UID: %d}", &t.Tlcreate, t.UID) } // Rucreate is a file creation response. type Rucreate struct { Rlcreate } // Type implements message.Type. func (*Rucreate) Type() MsgType { return MsgRucreate } // String implements fmt.Stringer. 
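// Note: Twalkgetattr behaves like Twalk but also returns attributes, saving a
// separate Tgetattr round trip; Rwalkgetattr.Valid and Attr describe only the
// last component walked to, while QIDs covers every component. A hypothetical
// request (f and nf are assumed FIDs):
//
//	req := Twalkgetattr{FID: f, NewFID: nf, Names: []string{"etc", "hosts"}}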
func (r *Rucreate) String() string { return fmt.Sprintf("Rucreate{%v}", &r.Rlcreate) } // Tumkdir is a Tmkdir message that includes a UID. type Tumkdir struct { Tmkdir // UID is the UID to use as the effective UID in creation messages. UID UID } // decode implements encoder.decode. func (t *Tumkdir) decode(b *buffer) { t.Tmkdir.decode(b) t.UID = b.ReadUID() } // encode implements encoder.encode. func (t *Tumkdir) encode(b *buffer) { t.Tmkdir.encode(b) b.WriteUID(t.UID) } // Type implements message.Type. func (t *Tumkdir) Type() MsgType { return MsgTumkdir } // String implements fmt.Stringer. func (t *Tumkdir) String() string { return fmt.Sprintf("Tumkdir{Tmkdir: %v, UID: %d}", &t.Tmkdir, t.UID) } // Rumkdir is a umkdir response. type Rumkdir struct { Rmkdir } // Type implements message.Type. func (*Rumkdir) Type() MsgType { return MsgRumkdir } // String implements fmt.Stringer. func (r *Rumkdir) String() string { return fmt.Sprintf("Rumkdir{%v}", &r.Rmkdir) } // Tumknod is a Tmknod message that includes a UID. type Tumknod struct { Tmknod // UID is the UID to use as the effective UID in creation messages. UID UID } // decode implements encoder.decode. func (t *Tumknod) decode(b *buffer) { t.Tmknod.decode(b) t.UID = b.ReadUID() } // encode implements encoder.encode. func (t *Tumknod) encode(b *buffer) { t.Tmknod.encode(b) b.WriteUID(t.UID) } // Type implements message.Type. func (t *Tumknod) Type() MsgType { return MsgTumknod } // String implements fmt.Stringer. func (t *Tumknod) String() string { return fmt.Sprintf("Tumknod{Tmknod: %v, UID: %d}", &t.Tmknod, t.UID) } // Rumknod is a umknod response. type Rumknod struct { Rmknod } // Type implements message.Type. func (*Rumknod) Type() MsgType { return MsgRumknod } // String implements fmt.Stringer. func (r *Rumknod) String() string { return fmt.Sprintf("Rumknod{%v}", &r.Rmknod) } // Tusymlink is a Tsymlink message that includes a UID. type Tusymlink struct { Tsymlink // UID is the UID to use as the effective UID in creation messages. UID UID } // decode implements encoder.decode. func (t *Tusymlink) decode(b *buffer) { t.Tsymlink.decode(b) t.UID = b.ReadUID() } // encode implements encoder.encode. func (t *Tusymlink) encode(b *buffer) { t.Tsymlink.encode(b) b.WriteUID(t.UID) } // Type implements message.Type. func (t *Tusymlink) Type() MsgType { return MsgTusymlink } // String implements fmt.Stringer. func (t *Tusymlink) String() string { return fmt.Sprintf("Tusymlink{Tsymlink: %v, UID: %d}", &t.Tsymlink, t.UID) } // Rusymlink is a usymlink response. type Rusymlink struct { Rsymlink } // Type implements message.Type. func (*Rusymlink) Type() MsgType { return MsgRusymlink } // String implements fmt.Stringer. func (r *Rusymlink) String() string { return fmt.Sprintf("Rusymlink{%v}", &r.Rsymlink) } // Tbind is a bind request. type Tbind struct { // Directory is the directory inside which the bound socket file should be // created. Directory FID // SockType is the type of socket to be used. This is passed as an argument // to socket(2). SockType uint32 // SockName is the name of the socket file to be created. SockName string // UID is the owning user. UID UID // GID is the owning group. GID GID // NewFID is the resulting FID for the socket file. NewFID FID } // decode implements encoder.decode. func (t *Tbind) decode(b *buffer) { t.Directory = b.ReadFID() t.SockType = b.Read32() t.SockName = b.ReadString() t.UID = b.ReadUID() t.GID = b.ReadGID() t.NewFID = b.ReadFID() } // encode implements encoder.encode. 
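// Note: the Tu* messages reuse their base requests through struct embedding;
// encode emits the embedded message first and then the effective UID.
// Constructing one just wraps the base request (d, g and u are hypothetical
// values):
//
//	req := Tumkdir{Tmkdir: Tmkdir{Directory: d, Name: "x", Permissions: 0o755, GID: g}, UID: u}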
func (t *Tbind) encode(b *buffer) { b.WriteFID(t.Directory) b.Write32(t.SockType) b.WriteString(t.SockName) b.WriteUID(t.UID) b.WriteGID(t.GID) b.WriteFID(t.NewFID) } // Type implements message.Type. func (*Tbind) Type() MsgType { return MsgTbind } // String implements fmt.Stringer. func (t *Tbind) String() string { return fmt.Sprintf("Tbind{Directory: %d, SockType: %d, SockName: %s, UID: %d, GID: %d, NewFID: %d}", t.Directory, t.SockType, t.SockName, t.UID, t.GID, t.NewFID) } // Rbind is a bind response. type Rbind struct { // QID is the resulting QID of the created socket file. QID QID // Valid indicates which fields are valid. Valid AttrMask // Attr is the set of attributes of the created socket file. Attr Attr } // decode implements encoder.decode. func (r *Rbind) decode(b *buffer) { r.QID.decode(b) r.Valid.decode(b) r.Attr.decode(b) } // encode implements encoder.encode. func (r *Rbind) encode(b *buffer) { r.QID.encode(b) r.Valid.encode(b) r.Attr.encode(b) } // Type implements message.Type. func (*Rbind) Type() MsgType { return MsgRbind } // String implements fmt.Stringer. func (r *Rbind) String() string { return fmt.Sprintf("Rbind{QID: %s, Valid: %v, Attr: %s}", r.QID, r.Valid, r.Attr) } // Tlconnect is a connect request. type Tlconnect struct { // FID is the FID to be connected. FID FID // SocketType is the socket type to be connected to. SocketType SocketType } // decode implements encoder.decode. func (t *Tlconnect) decode(b *buffer) { t.FID = b.ReadFID() t.SocketType = b.ReadSocketType() } // encode implements encoder.encode. func (t *Tlconnect) encode(b *buffer) { b.WriteFID(t.FID) b.WriteSocketType(t.SocketType) } // Type implements message.Type. func (*Tlconnect) Type() MsgType { return MsgTlconnect } // String implements fmt.Stringer. func (t *Tlconnect) String() string { return fmt.Sprintf("Tlconnect{FID: %d, SocketType: %v}", t.FID, t.SocketType) } // Rlconnect is a connect response. type Rlconnect struct { filePayload } // decode implements encoder.decode. func (r *Rlconnect) decode(*buffer) {} // encode implements encoder.encode. func (r *Rlconnect) encode(*buffer) {} // Type implements message.Type. func (*Rlconnect) Type() MsgType { return MsgRlconnect } // String implements fmt.Stringer. func (r *Rlconnect) String() string { return fmt.Sprintf("Rlconnect{File: %v}", r.File) } // Tchannel creates a new channel. type Tchannel struct { // ID is the channel ID. ID uint32 // Control is 0 if the Rchannel response should provide the flipcall // component of the channel, and 1 if the Rchannel response should // provide the fdchannel component of the channel. Control uint32 } // decode implements encoder.decode. func (t *Tchannel) decode(b *buffer) { t.ID = b.Read32() t.Control = b.Read32() } // encode implements encoder.encode. func (t *Tchannel) encode(b *buffer) { b.Write32(t.ID) b.Write32(t.Control) } // Type implements message.Type. func (*Tchannel) Type() MsgType { return MsgTchannel } // String implements fmt.Stringer. func (t *Tchannel) String() string { return fmt.Sprintf("Tchannel{ID: %d, Control: %d}", t.ID, t.Control) } // Rchannel is the channel response. type Rchannel struct { Offset uint64 Length uint64 filePayload } // decode implements encoder.decode. func (r *Rchannel) decode(b *buffer) { r.Offset = b.Read64() r.Length = b.Read64() } // encode implements encoder.encode. func (r *Rchannel) encode(b *buffer) { b.Write64(r.Offset) b.Write64(r.Length) } // Type implements message.Type. 
func (*Rchannel) Type() MsgType { return MsgRchannel } // String implements fmt.Stringer. func (r *Rchannel) String() string { return fmt.Sprintf("Rchannel{Offset: %d, Length: %d}", r.Offset, r.Length) } // Tmultigetattr is a multi-getattr request. type Tmultigetattr struct { // FID is the FID to be walked. FID FID // Names are the set of names to be walked. Names []string } // decode implements encoder.decode. func (t *Tmultigetattr) decode(b *buffer) { t.FID = b.ReadFID() n := b.Read16() t.Names = t.Names[:0] for i := 0; i < int(n); i++ { t.Names = append(t.Names, b.ReadString()) } } // encode implements encoder.encode. func (t *Tmultigetattr) encode(b *buffer) { b.WriteFID(t.FID) b.Write16(uint16(len(t.Names))) for _, name := range t.Names { b.WriteString(name) } } // Type implements message.Type. func (*Tmultigetattr) Type() MsgType { return MsgTmultigetattr } // String implements fmt.Stringer. func (t *Tmultigetattr) String() string { return fmt.Sprintf("Tmultigetattr{FID: %d, Names: %v}", t.FID, t.Names) } // Rmultigetattr is a multi-getattr response. type Rmultigetattr struct { // Stats are the set of FullStat returned for each of the names in the // request. Stats []FullStat } // decode implements encoder.decode. func (r *Rmultigetattr) decode(b *buffer) { n := b.Read16() r.Stats = r.Stats[:0] for i := 0; i < int(n); i++ { var fs FullStat fs.decode(b) r.Stats = append(r.Stats, fs) } } // encode implements encoder.encode. func (r *Rmultigetattr) encode(b *buffer) { b.Write16(uint16(len(r.Stats))) for i := range r.Stats { r.Stats[i].encode(b) } } // Type implements message.Type. func (*Rmultigetattr) Type() MsgType { return MsgRmultigetattr } // String implements fmt.Stringer. func (r *Rmultigetattr) String() string { return fmt.Sprintf("Rmultigetattr{Stats: %v}", r.Stats) } const maxCacheSize = 3 // msgFactory is used to reduce allocations by caching messages for reuse. type msgFactory struct { create func() message cache chan message } // msgRegistry indexes all message factories by type. var msgRegistry registry type registry struct { factories [math.MaxUint8 + 1]msgFactory // largestFixedSize is computed so that given some message size M, you can // compute the maximum payload size (e.g. for Twrite, Rread) with // M-largestFixedSize. You could do this individual on a per-message basis, // but it's easier to compute a single maximum safe payload. largestFixedSize uint32 } // get returns a new message by type. // // An error is returned in the case of an unknown message. // // This takes, and ignores, a message tag so that it may be used directly as a // lookupTagAndType function for recv (by design). func (r *registry) get(_ Tag, t MsgType) (message, error) { entry := &r.factories[t] if entry.create == nil { return nil, &ErrInvalidMsgType{t} } select { case msg := <-entry.cache: return msg, nil default: return entry.create(), nil } } func (r *registry) put(msg message) { if p, ok := msg.(payloader); ok { p.SetPayload(nil) } if f, ok := msg.(filer); ok { f.SetFilePayload(nil) } entry := &r.factories[msg.Type()] select { case entry.cache <- msg: default: } } // register registers the given message type. // // This may cause panic on failure and should only be used from init. func (r *registry) register(t MsgType, fn func() message) { if int(t) >= len(r.factories) { panic(fmt.Sprintf("message type %d is too large. 
It must be smaller than %d", t, len(r.factories))) } if r.factories[t].create != nil { panic(fmt.Sprintf("duplicate message type %d: first is %T, second is %T", t, r.factories[t].create(), fn())) } r.factories[t] = msgFactory{ create: fn, cache: make(chan message, maxCacheSize), } if size := calculateSize(fn()); size > r.largestFixedSize { r.largestFixedSize = size } } func calculateSize(m message) uint32 { if p, ok := m.(payloader); ok { return p.FixedSize() } var dataBuf buffer m.encode(&dataBuf) return uint32(len(dataBuf.data)) } func init() { msgRegistry.register(MsgRlerror, func() message { return &Rlerror{} }) msgRegistry.register(MsgTstatfs, func() message { return &Tstatfs{} }) msgRegistry.register(MsgRstatfs, func() message { return &Rstatfs{} }) msgRegistry.register(MsgTlopen, func() message { return &Tlopen{} }) msgRegistry.register(MsgRlopen, func() message { return &Rlopen{} }) msgRegistry.register(MsgTlcreate, func() message { return &Tlcreate{} }) msgRegistry.register(MsgRlcreate, func() message { return &Rlcreate{} }) msgRegistry.register(MsgTsymlink, func() message { return &Tsymlink{} }) msgRegistry.register(MsgRsymlink, func() message { return &Rsymlink{} }) msgRegistry.register(MsgTmknod, func() message { return &Tmknod{} }) msgRegistry.register(MsgRmknod, func() message { return &Rmknod{} }) msgRegistry.register(MsgTrename, func() message { return &Trename{} }) msgRegistry.register(MsgRrename, func() message { return &Rrename{} }) msgRegistry.register(MsgTreadlink, func() message { return &Treadlink{} }) msgRegistry.register(MsgRreadlink, func() message { return &Rreadlink{} }) msgRegistry.register(MsgTgetattr, func() message { return &Tgetattr{} }) msgRegistry.register(MsgRgetattr, func() message { return &Rgetattr{} }) msgRegistry.register(MsgTsetattr, func() message { return &Tsetattr{} }) msgRegistry.register(MsgRsetattr, func() message { return &Rsetattr{} }) msgRegistry.register(MsgTlistxattr, func() message { return &Tlistxattr{} }) msgRegistry.register(MsgRlistxattr, func() message { return &Rlistxattr{} }) msgRegistry.register(MsgTxattrwalk, func() message { return &Txattrwalk{} }) msgRegistry.register(MsgRxattrwalk, func() message { return &Rxattrwalk{} }) msgRegistry.register(MsgTxattrcreate, func() message { return &Txattrcreate{} }) msgRegistry.register(MsgRxattrcreate, func() message { return &Rxattrcreate{} }) msgRegistry.register(MsgTgetxattr, func() message { return &Tgetxattr{} }) msgRegistry.register(MsgRgetxattr, func() message { return &Rgetxattr{} }) msgRegistry.register(MsgTsetxattr, func() message { return &Tsetxattr{} }) msgRegistry.register(MsgRsetxattr, func() message { return &Rsetxattr{} }) msgRegistry.register(MsgTremovexattr, func() message { return &Tremovexattr{} }) msgRegistry.register(MsgRremovexattr, func() message { return &Rremovexattr{} }) msgRegistry.register(MsgTreaddir, func() message { return &Treaddir{} }) msgRegistry.register(MsgRreaddir, func() message { return &Rreaddir{} }) msgRegistry.register(MsgTfsync, func() message { return &Tfsync{} }) msgRegistry.register(MsgRfsync, func() message { return &Rfsync{} }) msgRegistry.register(MsgTlink, func() message { return &Tlink{} }) msgRegistry.register(MsgRlink, func() message { return &Rlink{} }) msgRegistry.register(MsgTmkdir, func() message { return &Tmkdir{} }) msgRegistry.register(MsgRmkdir, func() message { return &Rmkdir{} }) msgRegistry.register(MsgTrenameat, func() message { return &Trenameat{} }) msgRegistry.register(MsgRrenameat, func() message { return &Rrenameat{} 
}) msgRegistry.register(MsgTunlinkat, func() message { return &Tunlinkat{} }) msgRegistry.register(MsgRunlinkat, func() message { return &Runlinkat{} }) msgRegistry.register(MsgTversion, func() message { return &Tversion{} }) msgRegistry.register(MsgRversion, func() message { return &Rversion{} }) msgRegistry.register(MsgTauth, func() message { return &Tauth{} }) msgRegistry.register(MsgRauth, func() message { return &Rauth{} }) msgRegistry.register(MsgTattach, func() message { return &Tattach{} }) msgRegistry.register(MsgRattach, func() message { return &Rattach{} }) msgRegistry.register(MsgTflush, func() message { return &Tflush{} }) msgRegistry.register(MsgRflush, func() message { return &Rflush{} }) msgRegistry.register(MsgTwalk, func() message { return &Twalk{} }) msgRegistry.register(MsgRwalk, func() message { return &Rwalk{} }) msgRegistry.register(MsgTread, func() message { return &Tread{} }) msgRegistry.register(MsgRread, func() message { return &Rread{} }) msgRegistry.register(MsgTwrite, func() message { return &Twrite{} }) msgRegistry.register(MsgRwrite, func() message { return &Rwrite{} }) msgRegistry.register(MsgTclunk, func() message { return &Tclunk{} }) msgRegistry.register(MsgRclunk, func() message { return &Rclunk{} }) msgRegistry.register(MsgTremove, func() message { return &Tremove{} }) msgRegistry.register(MsgRremove, func() message { return &Rremove{} }) msgRegistry.register(MsgTflushf, func() message { return &Tflushf{} }) msgRegistry.register(MsgRflushf, func() message { return &Rflushf{} }) msgRegistry.register(MsgTwalkgetattr, func() message { return &Twalkgetattr{} }) msgRegistry.register(MsgRwalkgetattr, func() message { return &Rwalkgetattr{} }) msgRegistry.register(MsgTucreate, func() message { return &Tucreate{} }) msgRegistry.register(MsgRucreate, func() message { return &Rucreate{} }) msgRegistry.register(MsgTumkdir, func() message { return &Tumkdir{} }) msgRegistry.register(MsgRumkdir, func() message { return &Rumkdir{} }) msgRegistry.register(MsgTumknod, func() message { return &Tumknod{} }) msgRegistry.register(MsgRumknod, func() message { return &Rumknod{} }) msgRegistry.register(MsgTusymlink, func() message { return &Tusymlink{} }) msgRegistry.register(MsgRusymlink, func() message { return &Rusymlink{} }) msgRegistry.register(MsgTbind, func() message { return &Tbind{} }) msgRegistry.register(MsgRbind, func() message { return &Rbind{} }) msgRegistry.register(MsgTlconnect, func() message { return &Tlconnect{} }) msgRegistry.register(MsgRlconnect, func() message { return &Rlconnect{} }) msgRegistry.register(MsgTallocate, func() message { return &Tallocate{} }) msgRegistry.register(MsgRallocate, func() message { return &Rallocate{} }) msgRegistry.register(MsgTsetattrclunk, func() message { return &Tsetattrclunk{} }) msgRegistry.register(MsgRsetattrclunk, func() message { return &Rsetattrclunk{} }) msgRegistry.register(MsgTmultigetattr, func() message { return &Tmultigetattr{} }) msgRegistry.register(MsgRmultigetattr, func() message { return &Rmultigetattr{} }) msgRegistry.register(MsgTchannel, func() message { return &Tchannel{} }) msgRegistry.register(MsgRchannel, func() message { return &Rchannel{} }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/p9.go000066400000000000000000000715541465435605700206470ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package p9 is a 9P2000.L implementation. package p9 import ( "fmt" "math" "os" "strings" "syscall" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" ) // OpenFlags is the mode passed to Open and Create operations. // // These correspond to bits sent over the wire. type OpenFlags uint32 const ( // ReadOnly is a Tlopen and Tlcreate flag indicating read-only mode. ReadOnly OpenFlags = 0 // WriteOnly is a Tlopen and Tlcreate flag indicating write-only mode. WriteOnly OpenFlags = 1 // ReadWrite is a Tlopen flag indicates read-write mode. ReadWrite OpenFlags = 2 // OpenFlagsModeMask is a mask of valid OpenFlags mode bits. OpenFlagsModeMask OpenFlags = 3 // OpenTruncate is a Tlopen flag indicating that the opened file should be // truncated. OpenTruncate OpenFlags = 01000 ) // SocketType is the socket type passed in Connect and Bind operations. // // These correspond to bits sent over the wire. type SocketType uint32 const ( // StreamSocket indicates SOCK_STREAM mode. StreamSocket SocketType = 0 // DgramSocket indicates SOCK_DGRAM mode. DgramSocket SocketType = 1 // SeqpacketSocket indicates SOCK_SEQPACKET mode. SeqpacketSocket SocketType = 2 // AnonymousSocket is only valid for Connect calls, and indicates that // the caller will accept any socket type. AnonymousSocket SocketType = 3 ) // ToLinux maps the SocketType to a Linux socket type. func (st SocketType) ToLinux() (linux.SockType, bool) { switch st { case StreamSocket: return linux.SOCK_STREAM, true case DgramSocket: return linux.SOCK_DGRAM, true case SeqpacketSocket: return linux.SOCK_SEQPACKET, true default: return 0, false } } // SocketTypeFromLinux maps a Linux socket type to a SocketType. func SocketTypeFromLinux(st linux.SockType) (SocketType, bool) { switch st { case linux.SOCK_STREAM: return StreamSocket, true case linux.SOCK_DGRAM: return DgramSocket, true case linux.SOCK_SEQPACKET: return SeqpacketSocket, true default: return 0, false } } // OSFlags converts a p9.OpenFlags to an int compatible with open(2). func (o OpenFlags) OSFlags() int { // "flags contains Linux open(2) flags bits" - 9P2000.L return int(o) } // String implements fmt.Stringer. func (o OpenFlags) String() string { var buf strings.Builder switch mode := o & OpenFlagsModeMask; mode { case ReadOnly: buf.WriteString("ReadOnly") case WriteOnly: buf.WriteString("WriteOnly") case ReadWrite: buf.WriteString("ReadWrite") default: fmt.Fprintf(&buf, "%#o", mode) } otherFlags := o &^ OpenFlagsModeMask if otherFlags&OpenTruncate != 0 { buf.WriteString("|OpenTruncate") otherFlags &^= OpenTruncate } if otherFlags != 0 { fmt.Fprintf(&buf, "|%#o", otherFlags) } return buf.String() } // Tag is a message tag. type Tag uint16 // FID is a file identifier. type FID uint64 // FileMode are flags corresponding to file modes. // // These correspond to bits sent over the wire. // These also correspond to mode_t bits. type FileMode uint32 const ( // FileModeMask is a mask of all the file mode bits of FileMode. FileModeMask FileMode = 0170000 // ModeSocket is an (unused) mode bit for a socket. 
ModeSocket FileMode = 0140000 // ModeSymlink is a mode bit for a symlink. ModeSymlink FileMode = 0120000 // ModeRegular is a mode bit for regular files. ModeRegular FileMode = 0100000 // ModeBlockDevice is a mode bit for block devices. ModeBlockDevice FileMode = 060000 // ModeDirectory is a mode bit for directories. ModeDirectory FileMode = 040000 // ModeCharacterDevice is a mode bit for a character device. ModeCharacterDevice FileMode = 020000 // ModeNamedPipe is a mode bit for a named pipe. ModeNamedPipe FileMode = 010000 // Read is a mode bit indicating read permission. Read FileMode = 04 // Write is a mode bit indicating write permission. Write FileMode = 02 // Exec is a mode bit indicating exec permission. Exec FileMode = 01 // AllPermissions is a mask with rwx bits set for user, group and others. AllPermissions FileMode = 0777 // Sticky is a mode bit indicating sticky directories. Sticky FileMode = 01000 // SetGID is the set group ID bit. SetGID FileMode = 02000 // SetUID is the set user ID bit. SetUID FileMode = 04000 // permissionsMask is the mask to apply to FileModes for permissions. It // includes rwx bits for user, group, and others, as well as the sticky // bit, setuid bit, and setgid bit. permissionsMask FileMode = 07777 ) // QIDType is the most significant byte of the FileMode word, to be used as the // Type field of p9.QID. func (m FileMode) QIDType() QIDType { switch { case m.IsDir(): return TypeDir case m.IsSocket(), m.IsNamedPipe(), m.IsCharacterDevice(): // Best approximation. return TypeAppendOnly case m.IsSymlink(): return TypeSymlink default: return TypeRegular } } // FileType returns the file mode without the permission bits. func (m FileMode) FileType() FileMode { return m & FileModeMask } // Permissions returns just the permission bits of the mode. func (m FileMode) Permissions() FileMode { return m & permissionsMask } // Writable returns the mode with write bits added. func (m FileMode) Writable() FileMode { return m | 0222 } // IsReadable returns true if m represents a file that can be read. func (m FileMode) IsReadable() bool { return m&0444 != 0 } // IsWritable returns true if m represents a file that can be written to. func (m FileMode) IsWritable() bool { return m&0222 != 0 } // IsExecutable returns true if m represents a file that can be executed. func (m FileMode) IsExecutable() bool { return m&0111 != 0 } // IsRegular returns true if m is a regular file. func (m FileMode) IsRegular() bool { return m&FileModeMask == ModeRegular } // IsDir returns true if m represents a directory. func (m FileMode) IsDir() bool { return m&FileModeMask == ModeDirectory } // IsNamedPipe returns true if m represents a named pipe. func (m FileMode) IsNamedPipe() bool { return m&FileModeMask == ModeNamedPipe } // IsCharacterDevice returns true if m represents a character device. func (m FileMode) IsCharacterDevice() bool { return m&FileModeMask == ModeCharacterDevice } // IsBlockDevice returns true if m represents a character device. func (m FileMode) IsBlockDevice() bool { return m&FileModeMask == ModeBlockDevice } // IsSocket returns true if m represents a socket. func (m FileMode) IsSocket() bool { return m&FileModeMask == ModeSocket } // IsSymlink returns true if m represents a symlink. func (m FileMode) IsSymlink() bool { return m&FileModeMask == ModeSymlink } // ModeFromOS returns a FileMode from an os.FileMode. 
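//
// A minimal illustrative sketch (hypothetical values): converting a plain
// directory mode,
//
//	m := ModeFromOS(os.ModeDir | 0755)
//	_ = m.IsDir()       // true
//	_ = m.Permissions() // 0755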
func ModeFromOS(mode os.FileMode) FileMode { m := FileMode(mode.Perm()) switch { case mode.IsDir(): m |= ModeDirectory case mode&os.ModeSymlink != 0: m |= ModeSymlink case mode&os.ModeSocket != 0: m |= ModeSocket case mode&os.ModeNamedPipe != 0: m |= ModeNamedPipe case mode&os.ModeCharDevice != 0: m |= ModeCharacterDevice case mode&os.ModeDevice != 0: m |= ModeBlockDevice default: m |= ModeRegular } return m } // OSMode converts a p9.FileMode to an os.FileMode. func (m FileMode) OSMode() os.FileMode { var osMode os.FileMode osMode |= os.FileMode(m.Permissions()) switch { case m.IsDir(): osMode |= os.ModeDir case m.IsSymlink(): osMode |= os.ModeSymlink case m.IsSocket(): osMode |= os.ModeSocket case m.IsNamedPipe(): osMode |= os.ModeNamedPipe case m.IsCharacterDevice(): osMode |= os.ModeCharDevice | os.ModeDevice case m.IsBlockDevice(): osMode |= os.ModeDevice } return osMode } // UID represents a user ID. type UID uint32 // Ok returns true if uid is not NoUID. func (uid UID) Ok() bool { return uid != NoUID } // GID represents a group ID. type GID uint32 // Ok returns true if gid is not NoGID. func (gid GID) Ok() bool { return gid != NoGID } const ( // NoTag is a sentinel used to indicate no valid tag. NoTag Tag = math.MaxUint16 // NoFID is a sentinel used to indicate no valid FID. NoFID FID = math.MaxUint32 // NoUID is a sentinel used to indicate no valid UID. NoUID UID = math.MaxUint32 // NoGID is a sentinel used to indicate no valid GID. NoGID GID = math.MaxUint32 ) // MsgType is a type identifier. type MsgType uint8 // MsgType declarations. const ( MsgTlerror MsgType = 6 MsgRlerror MsgType = 7 MsgTstatfs MsgType = 8 MsgRstatfs MsgType = 9 MsgTlopen MsgType = 12 MsgRlopen MsgType = 13 MsgTlcreate MsgType = 14 MsgRlcreate MsgType = 15 MsgTsymlink MsgType = 16 MsgRsymlink MsgType = 17 MsgTmknod MsgType = 18 MsgRmknod MsgType = 19 MsgTrename MsgType = 20 MsgRrename MsgType = 21 MsgTreadlink MsgType = 22 MsgRreadlink MsgType = 23 MsgTgetattr MsgType = 24 MsgRgetattr MsgType = 25 MsgTsetattr MsgType = 26 MsgRsetattr MsgType = 27 MsgTlistxattr MsgType = 28 MsgRlistxattr MsgType = 29 MsgTxattrwalk MsgType = 30 MsgRxattrwalk MsgType = 31 MsgTxattrcreate MsgType = 32 MsgRxattrcreate MsgType = 33 MsgTgetxattr MsgType = 34 MsgRgetxattr MsgType = 35 MsgTsetxattr MsgType = 36 MsgRsetxattr MsgType = 37 MsgTremovexattr MsgType = 38 MsgRremovexattr MsgType = 39 MsgTreaddir MsgType = 40 MsgRreaddir MsgType = 41 MsgTfsync MsgType = 50 MsgRfsync MsgType = 51 MsgTlink MsgType = 70 MsgRlink MsgType = 71 MsgTmkdir MsgType = 72 MsgRmkdir MsgType = 73 MsgTrenameat MsgType = 74 MsgRrenameat MsgType = 75 MsgTunlinkat MsgType = 76 MsgRunlinkat MsgType = 77 MsgTversion MsgType = 100 MsgRversion MsgType = 101 MsgTauth MsgType = 102 MsgRauth MsgType = 103 MsgTattach MsgType = 104 MsgRattach MsgType = 105 MsgTflush MsgType = 108 MsgRflush MsgType = 109 MsgTwalk MsgType = 110 MsgRwalk MsgType = 111 MsgTread MsgType = 116 MsgRread MsgType = 117 MsgTwrite MsgType = 118 MsgRwrite MsgType = 119 MsgTclunk MsgType = 120 MsgRclunk MsgType = 121 MsgTremove MsgType = 122 MsgRremove MsgType = 123 MsgTflushf MsgType = 124 MsgRflushf MsgType = 125 MsgTwalkgetattr MsgType = 126 MsgRwalkgetattr MsgType = 127 MsgTucreate MsgType = 128 MsgRucreate MsgType = 129 MsgTumkdir MsgType = 130 MsgRumkdir MsgType = 131 MsgTumknod MsgType = 132 MsgRumknod MsgType = 133 MsgTusymlink MsgType = 134 MsgRusymlink MsgType = 135 MsgTlconnect MsgType = 136 MsgRlconnect MsgType = 137 MsgTallocate MsgType = 138 MsgRallocate MsgType = 139 MsgTsetattrclunk 
MsgType = 140 MsgRsetattrclunk MsgType = 141 MsgTmultigetattr MsgType = 142 MsgRmultigetattr MsgType = 143 MsgTbind MsgType = 144 MsgRbind MsgType = 145 MsgTchannel MsgType = 250 MsgRchannel MsgType = 251 ) // QIDType represents the file type for QIDs. // // QIDType corresponds to the high 8 bits of a Plan 9 file mode. type QIDType uint8 const ( // TypeDir represents a directory type. TypeDir QIDType = 0x80 // TypeAppendOnly represents an append only file. TypeAppendOnly QIDType = 0x40 // TypeExclusive represents an exclusive-use file. TypeExclusive QIDType = 0x20 // TypeMount represents a mounted channel. TypeMount QIDType = 0x10 // TypeAuth represents an authentication file. TypeAuth QIDType = 0x08 // TypeTemporary represents a temporary file. TypeTemporary QIDType = 0x04 // TypeSymlink represents a symlink. TypeSymlink QIDType = 0x02 // TypeLink represents a hard link. TypeLink QIDType = 0x01 // TypeRegular represents a regular file. TypeRegular QIDType = 0x00 ) // QID is a unique file identifier. // // This may be embedded in other requests and responses. type QID struct { // Type is the highest order byte of the file mode. Type QIDType // Version is an arbitrary server version number. Version uint32 // Path is a unique server identifier for this path (e.g. inode). Path uint64 } // String implements fmt.Stringer. func (q QID) String() string { return fmt.Sprintf("QID{Type: %d, Version: %d, Path: %d}", q.Type, q.Version, q.Path) } // decode implements encoder.decode. func (q *QID) decode(b *buffer) { q.Type = b.ReadQIDType() q.Version = b.Read32() q.Path = b.Read64() } // encode implements encoder.encode. func (q *QID) encode(b *buffer) { b.WriteQIDType(q.Type) b.Write32(q.Version) b.Write64(q.Path) } // QIDGenerator is a simple generator for QIDs that atomically increments Path // values. type QIDGenerator struct { // uids is an ever increasing value that can be atomically incremented // to provide unique Path values for QIDs. uids atomicbitops.Uint64 } // Get returns a new 9P unique ID with a unique Path given a QID type. // // While the 9P spec allows Version to be incremented every time the file is // modified, we currently do not use the Version member for anything. Hence, // it is set to 0. func (q *QIDGenerator) Get(t QIDType) QID { return QID{ Type: t, Version: 0, Path: q.uids.Add(1), } } // FSStat is used by statfs. type FSStat struct { // Type is the filesystem type. Type uint32 // BlockSize is the blocksize. BlockSize uint32 // Blocks is the number of blocks. Blocks uint64 // BlocksFree is the number of free blocks. BlocksFree uint64 // BlocksAvailable is the number of blocks *available*. BlocksAvailable uint64 // Files is the number of files available. Files uint64 // FilesFree is the number of free file nodes. FilesFree uint64 // FSID is the filesystem ID. FSID uint64 // NameLength is the maximum name length. NameLength uint32 } // decode implements encoder.decode. func (f *FSStat) decode(b *buffer) { f.Type = b.Read32() f.BlockSize = b.Read32() f.Blocks = b.Read64() f.BlocksFree = b.Read64() f.BlocksAvailable = b.Read64() f.Files = b.Read64() f.FilesFree = b.Read64() f.FSID = b.Read64() f.NameLength = b.Read32() } // encode implements encoder.encode. func (f *FSStat) encode(b *buffer) { b.Write32(f.Type) b.Write32(f.BlockSize) b.Write64(f.Blocks) b.Write64(f.BlocksFree) b.Write64(f.BlocksAvailable) b.Write64(f.Files) b.Write64(f.FilesFree) b.Write64(f.FSID) b.Write32(f.NameLength) } // AttrMask is a mask of attributes for getattr. 
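//
// A minimal illustrative sketch (hypothetical request): a caller interested
// only in the file mode and size would send
//
//	mask := AttrMask{Mode: true, Size: true}
//
// which the encode method below transmits as the 64-bit bitmask 0x201.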
type AttrMask struct { Mode bool NLink bool UID bool GID bool RDev bool ATime bool MTime bool CTime bool INo bool Size bool Blocks bool BTime bool Gen bool DataVersion bool } // Contains returns true if a contains all of the attributes masked as b. func (a AttrMask) Contains(b AttrMask) bool { if b.Mode && !a.Mode { return false } if b.NLink && !a.NLink { return false } if b.UID && !a.UID { return false } if b.GID && !a.GID { return false } if b.RDev && !a.RDev { return false } if b.ATime && !a.ATime { return false } if b.MTime && !a.MTime { return false } if b.CTime && !a.CTime { return false } if b.INo && !a.INo { return false } if b.Size && !a.Size { return false } if b.Blocks && !a.Blocks { return false } if b.BTime && !a.BTime { return false } if b.Gen && !a.Gen { return false } if b.DataVersion && !a.DataVersion { return false } return true } // Empty returns true if no fields are masked. func (a AttrMask) Empty() bool { return !a.Mode && !a.NLink && !a.UID && !a.GID && !a.RDev && !a.ATime && !a.MTime && !a.CTime && !a.INo && !a.Size && !a.Blocks && !a.BTime && !a.Gen && !a.DataVersion } // AttrMaskAll returns an AttrMask with all fields masked. func AttrMaskAll() AttrMask { return AttrMask{ Mode: true, NLink: true, UID: true, GID: true, RDev: true, ATime: true, MTime: true, CTime: true, INo: true, Size: true, Blocks: true, BTime: true, Gen: true, DataVersion: true, } } // String implements fmt.Stringer. func (a AttrMask) String() string { var masks []string if a.Mode { masks = append(masks, "Mode") } if a.NLink { masks = append(masks, "NLink") } if a.UID { masks = append(masks, "UID") } if a.GID { masks = append(masks, "GID") } if a.RDev { masks = append(masks, "RDev") } if a.ATime { masks = append(masks, "ATime") } if a.MTime { masks = append(masks, "MTime") } if a.CTime { masks = append(masks, "CTime") } if a.INo { masks = append(masks, "INo") } if a.Size { masks = append(masks, "Size") } if a.Blocks { masks = append(masks, "Blocks") } if a.BTime { masks = append(masks, "BTime") } if a.Gen { masks = append(masks, "Gen") } if a.DataVersion { masks = append(masks, "DataVersion") } return fmt.Sprintf("AttrMask{with: %s}", strings.Join(masks, " ")) } // decode implements encoder.decode. func (a *AttrMask) decode(b *buffer) { mask := b.Read64() a.Mode = mask&0x00000001 != 0 a.NLink = mask&0x00000002 != 0 a.UID = mask&0x00000004 != 0 a.GID = mask&0x00000008 != 0 a.RDev = mask&0x00000010 != 0 a.ATime = mask&0x00000020 != 0 a.MTime = mask&0x00000040 != 0 a.CTime = mask&0x00000080 != 0 a.INo = mask&0x00000100 != 0 a.Size = mask&0x00000200 != 0 a.Blocks = mask&0x00000400 != 0 a.BTime = mask&0x00000800 != 0 a.Gen = mask&0x00001000 != 0 a.DataVersion = mask&0x00002000 != 0 } // encode implements encoder.encode. func (a *AttrMask) encode(b *buffer) { var mask uint64 if a.Mode { mask |= 0x00000001 } if a.NLink { mask |= 0x00000002 } if a.UID { mask |= 0x00000004 } if a.GID { mask |= 0x00000008 } if a.RDev { mask |= 0x00000010 } if a.ATime { mask |= 0x00000020 } if a.MTime { mask |= 0x00000040 } if a.CTime { mask |= 0x00000080 } if a.INo { mask |= 0x00000100 } if a.Size { mask |= 0x00000200 } if a.Blocks { mask |= 0x00000400 } if a.BTime { mask |= 0x00000800 } if a.Gen { mask |= 0x00001000 } if a.DataVersion { mask |= 0x00002000 } b.Write64(mask) } // Attr is a set of attributes for getattr. 
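//
// A minimal illustrative sketch (hypothetical values): a getattr-style
// response pairs an Attr with the AttrMask describing which fields were
// actually filled in, e.g.
//
//	valid := AttrMask{Mode: true, Size: true}
//	attr := Attr{Mode: ModeRegular | 0644, Size: 4096}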
type Attr struct { Mode FileMode UID UID GID GID NLink uint64 RDev uint64 Size uint64 BlockSize uint64 Blocks uint64 ATimeSeconds uint64 ATimeNanoSeconds uint64 MTimeSeconds uint64 MTimeNanoSeconds uint64 CTimeSeconds uint64 CTimeNanoSeconds uint64 BTimeSeconds uint64 BTimeNanoSeconds uint64 Gen uint64 DataVersion uint64 } // String implements fmt.Stringer. func (a Attr) String() string { return fmt.Sprintf("Attr{Mode: 0o%o, UID: %d, GID: %d, NLink: %d, RDev: %d, Size: %d, BlockSize: %d, Blocks: %d, ATime: {Sec: %d, NanoSec: %d}, MTime: {Sec: %d, NanoSec: %d}, CTime: {Sec: %d, NanoSec: %d}, BTime: {Sec: %d, NanoSec: %d}, Gen: %d, DataVersion: %d}", a.Mode, a.UID, a.GID, a.NLink, a.RDev, a.Size, a.BlockSize, a.Blocks, a.ATimeSeconds, a.ATimeNanoSeconds, a.MTimeSeconds, a.MTimeNanoSeconds, a.CTimeSeconds, a.CTimeNanoSeconds, a.BTimeSeconds, a.BTimeNanoSeconds, a.Gen, a.DataVersion) } // encode implements encoder.encode. func (a *Attr) encode(b *buffer) { b.WriteFileMode(a.Mode) b.WriteUID(a.UID) b.WriteGID(a.GID) b.Write64(a.NLink) b.Write64(a.RDev) b.Write64(a.Size) b.Write64(a.BlockSize) b.Write64(a.Blocks) b.Write64(a.ATimeSeconds) b.Write64(a.ATimeNanoSeconds) b.Write64(a.MTimeSeconds) b.Write64(a.MTimeNanoSeconds) b.Write64(a.CTimeSeconds) b.Write64(a.CTimeNanoSeconds) b.Write64(a.BTimeSeconds) b.Write64(a.BTimeNanoSeconds) b.Write64(a.Gen) b.Write64(a.DataVersion) } // decode implements encoder.decode. func (a *Attr) decode(b *buffer) { a.Mode = b.ReadFileMode() a.UID = b.ReadUID() a.GID = b.ReadGID() a.NLink = b.Read64() a.RDev = b.Read64() a.Size = b.Read64() a.BlockSize = b.Read64() a.Blocks = b.Read64() a.ATimeSeconds = b.Read64() a.ATimeNanoSeconds = b.Read64() a.MTimeSeconds = b.Read64() a.MTimeNanoSeconds = b.Read64() a.CTimeSeconds = b.Read64() a.CTimeNanoSeconds = b.Read64() a.BTimeSeconds = b.Read64() a.BTimeNanoSeconds = b.Read64() a.Gen = b.Read64() a.DataVersion = b.Read64() } // StatToAttr converts a Linux syscall stat structure to an Attr. func StatToAttr(s *syscall.Stat_t, req AttrMask) (Attr, AttrMask) { attr := Attr{ UID: NoUID, GID: NoGID, } if req.Mode { // p9.FileMode corresponds to Linux mode_t. attr.Mode = FileMode(s.Mode) } if req.NLink { attr.NLink = uint64(s.Nlink) } if req.UID { attr.UID = UID(s.Uid) } if req.GID { attr.GID = GID(s.Gid) } if req.RDev { attr.RDev = s.Dev } if req.ATime { attr.ATimeSeconds = uint64(s.Atim.Sec) attr.ATimeNanoSeconds = uint64(s.Atim.Nsec) } if req.MTime { attr.MTimeSeconds = uint64(s.Mtim.Sec) attr.MTimeNanoSeconds = uint64(s.Mtim.Nsec) } if req.CTime { attr.CTimeSeconds = uint64(s.Ctim.Sec) attr.CTimeNanoSeconds = uint64(s.Ctim.Nsec) } if req.Size { attr.Size = uint64(s.Size) } if req.Blocks { attr.BlockSize = uint64(s.Blksize) attr.Blocks = uint64(s.Blocks) } // Use the req field because we already have it. req.BTime = false req.Gen = false req.DataVersion = false return attr, req } // SetAttrMask specifies a valid mask for setattr. type SetAttrMask struct { Permissions bool UID bool GID bool Size bool ATime bool MTime bool CTime bool ATimeNotSystemTime bool MTimeNotSystemTime bool } // IsSubsetOf returns whether s is a subset of m. func (s SetAttrMask) IsSubsetOf(m SetAttrMask) bool { sb := s.bitmask() sm := m.bitmask() return sm|sb == sm } // String implements fmt.Stringer. 
func (s SetAttrMask) String() string { var masks []string if s.Permissions { masks = append(masks, "Permissions") } if s.UID { masks = append(masks, "UID") } if s.GID { masks = append(masks, "GID") } if s.Size { masks = append(masks, "Size") } if s.ATime { masks = append(masks, "ATime") } if s.MTime { masks = append(masks, "MTime") } if s.CTime { masks = append(masks, "CTime") } if s.ATimeNotSystemTime { masks = append(masks, "ATimeNotSystemTime") } if s.MTimeNotSystemTime { masks = append(masks, "MTimeNotSystemTime") } return fmt.Sprintf("SetAttrMask{with: %s}", strings.Join(masks, " ")) } // Empty returns true if no fields are masked. func (s SetAttrMask) Empty() bool { return !s.Permissions && !s.UID && !s.GID && !s.Size && !s.ATime && !s.MTime && !s.CTime && !s.ATimeNotSystemTime && !s.MTimeNotSystemTime } // decode implements encoder.decode. func (s *SetAttrMask) decode(b *buffer) { mask := b.Read32() s.Permissions = mask&0x00000001 != 0 s.UID = mask&0x00000002 != 0 s.GID = mask&0x00000004 != 0 s.Size = mask&0x00000008 != 0 s.ATime = mask&0x00000010 != 0 s.MTime = mask&0x00000020 != 0 s.CTime = mask&0x00000040 != 0 s.ATimeNotSystemTime = mask&0x00000080 != 0 s.MTimeNotSystemTime = mask&0x00000100 != 0 } func (s SetAttrMask) bitmask() uint32 { var mask uint32 if s.Permissions { mask |= 0x00000001 } if s.UID { mask |= 0x00000002 } if s.GID { mask |= 0x00000004 } if s.Size { mask |= 0x00000008 } if s.ATime { mask |= 0x00000010 } if s.MTime { mask |= 0x00000020 } if s.CTime { mask |= 0x00000040 } if s.ATimeNotSystemTime { mask |= 0x00000080 } if s.MTimeNotSystemTime { mask |= 0x00000100 } return mask } // encode implements encoder.encode. func (s *SetAttrMask) encode(b *buffer) { b.Write32(s.bitmask()) } // SetAttr specifies a set of attributes for a setattr. type SetAttr struct { Permissions FileMode UID UID GID GID Size uint64 ATimeSeconds uint64 ATimeNanoSeconds uint64 MTimeSeconds uint64 MTimeNanoSeconds uint64 } // String implements fmt.Stringer. func (s SetAttr) String() string { return fmt.Sprintf("SetAttr{Permissions: 0o%o, UID: %d, GID: %d, Size: %d, ATime: {Sec: %d, NanoSec: %d}, MTime: {Sec: %d, NanoSec: %d}}", s.Permissions, s.UID, s.GID, s.Size, s.ATimeSeconds, s.ATimeNanoSeconds, s.MTimeSeconds, s.MTimeNanoSeconds) } // decode implements encoder.decode. func (s *SetAttr) decode(b *buffer) { s.Permissions = b.ReadPermissions() s.UID = b.ReadUID() s.GID = b.ReadGID() s.Size = b.Read64() s.ATimeSeconds = b.Read64() s.ATimeNanoSeconds = b.Read64() s.MTimeSeconds = b.Read64() s.MTimeNanoSeconds = b.Read64() } // encode implements encoder.encode. func (s *SetAttr) encode(b *buffer) { b.WritePermissions(s.Permissions) b.WriteUID(s.UID) b.WriteGID(s.GID) b.Write64(s.Size) b.Write64(s.ATimeSeconds) b.Write64(s.ATimeNanoSeconds) b.Write64(s.MTimeSeconds) b.Write64(s.MTimeNanoSeconds) } // Apply applies this to the given Attr. func (a *Attr) Apply(mask SetAttrMask, attr SetAttr) { if mask.Permissions { a.Mode = a.Mode&^permissionsMask | (attr.Permissions & permissionsMask) } if mask.UID { a.UID = attr.UID } if mask.GID { a.GID = attr.GID } if mask.Size { a.Size = attr.Size } if mask.ATime { a.ATimeSeconds = attr.ATimeSeconds a.ATimeNanoSeconds = attr.ATimeNanoSeconds } if mask.MTime { a.MTimeSeconds = attr.MTimeSeconds a.MTimeNanoSeconds = attr.MTimeNanoSeconds } } // DirentSizeStatic is the number of bytes required to encode a p9.Dirent // with an empty name. In other words, it is the static part of its size. const DirentSizeStatic = 24 // Dirent is used for readdir. 
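//
// A minimal illustrative sketch (hypothetical entry): one directory entry as
// it might appear in an Rreaddir payload,
//
//	d := Dirent{
//		QID:    QID{Type: TypeRegular, Path: 42},
//		Offset: 1, // offset at which the next readdir may resume
//		Type:   TypeRegular,
//		Name:   "file.txt",
//	}
//
// whose encoded form occupies DirentSizeStatic + len(d.Name) bytes.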
type Dirent struct { // QID is the entry QID. QID QID // Offset is the offset in the directory. // // This will be communicated back the original caller. Offset uint64 // Type is the 9P type. Type QIDType // Name is the name of the entry (i.e. basename). Name string } // String implements fmt.Stringer. func (d Dirent) String() string { return fmt.Sprintf("Dirent{QID: %d, Offset: %d, Type: 0x%X, Name: %s}", d.QID, d.Offset, d.Type, d.Name) } // decode implements encoder.decode. func (d *Dirent) decode(b *buffer) { d.QID.decode(b) d.Offset = b.Read64() d.Type = b.ReadQIDType() d.Name = b.ReadString() } // encode implements encoder.encode. func (d *Dirent) encode(b *buffer) { d.QID.encode(b) b.Write64(d.Offset) b.WriteQIDType(d.Type) b.WriteString(d.Name) } // AllocateMode are possible modes to p9.File.Allocate(). type AllocateMode struct { KeepSize bool PunchHole bool NoHideStale bool CollapseRange bool ZeroRange bool InsertRange bool Unshare bool } // ToAllocateMode returns an AllocateMode from a fallocate(2) mode. func ToAllocateMode(mode uint64) AllocateMode { return AllocateMode{ KeepSize: mode&unix.FALLOC_FL_KEEP_SIZE != 0, PunchHole: mode&unix.FALLOC_FL_PUNCH_HOLE != 0, NoHideStale: mode&unix.FALLOC_FL_NO_HIDE_STALE != 0, CollapseRange: mode&unix.FALLOC_FL_COLLAPSE_RANGE != 0, ZeroRange: mode&unix.FALLOC_FL_ZERO_RANGE != 0, InsertRange: mode&unix.FALLOC_FL_INSERT_RANGE != 0, Unshare: mode&unix.FALLOC_FL_UNSHARE_RANGE != 0, } } // ToLinux converts to a value compatible with fallocate(2)'s mode. func (a *AllocateMode) ToLinux() uint32 { rv := uint32(0) if a.KeepSize { rv |= unix.FALLOC_FL_KEEP_SIZE } if a.PunchHole { rv |= unix.FALLOC_FL_PUNCH_HOLE } if a.NoHideStale { rv |= unix.FALLOC_FL_NO_HIDE_STALE } if a.CollapseRange { rv |= unix.FALLOC_FL_COLLAPSE_RANGE } if a.ZeroRange { rv |= unix.FALLOC_FL_ZERO_RANGE } if a.InsertRange { rv |= unix.FALLOC_FL_INSERT_RANGE } if a.Unshare { rv |= unix.FALLOC_FL_UNSHARE_RANGE } return rv } // decode implements encoder.decode. func (a *AllocateMode) decode(b *buffer) { mask := b.Read32() a.KeepSize = mask&0x01 != 0 a.PunchHole = mask&0x02 != 0 a.NoHideStale = mask&0x04 != 0 a.CollapseRange = mask&0x08 != 0 a.ZeroRange = mask&0x10 != 0 a.InsertRange = mask&0x20 != 0 a.Unshare = mask&0x40 != 0 } // encode implements encoder.encode. func (a *AllocateMode) encode(b *buffer) { mask := uint32(0) if a.KeepSize { mask |= 0x01 } if a.PunchHole { mask |= 0x02 } if a.NoHideStale { mask |= 0x04 } if a.CollapseRange { mask |= 0x08 } if a.ZeroRange { mask |= 0x10 } if a.InsertRange { mask |= 0x20 } if a.Unshare { mask |= 0x40 } b.Write32(mask) } // FullStat is used in the result of a MultiGetAttr call. type FullStat struct { QID QID Valid AttrMask Attr Attr } // String implements fmt.Stringer. func (f *FullStat) String() string { return fmt.Sprintf("FullStat{QID: %v, Valid: %v, Attr: %v}", f.QID, f.Valid, f.Attr) } // decode implements encoder.decode. func (f *FullStat) decode(b *buffer) { f.QID.decode(b) f.Valid.decode(b) f.Attr.decode(b) } // encode implements encoder.encode. func (f *FullStat) encode(b *buffer) { f.QID.encode(b) f.Valid.encode(b) f.Attr.encode(b) } golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/p9_state_autogen.go000066400000000000000000000000641465435605700235550ustar00rootroot00000000000000// automatically generated by stateify. package p9 golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/path_tree.go000066400000000000000000000144701465435605700222640ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sync" ) // pathNode is a single node in a path traversal. // // These are shared by all fidRefs that point to the same path. // // Lock ordering: // // opMu // childMu // // Two different pathNodes may only be locked if Server.renameMu is held for // write, in which case they can be acquired in any order. type pathNode struct { // opMu synchronizes high-level, semantic operations, such as the // simultaneous creation and deletion of a file. opMu sync.RWMutex // deleted indicates that the backing file has been deleted. We stop many // operations at the API level if they are incompatible with a file that has // already been unlinked. deleted is protected by opMu. However, it may be // changed without opMu if this node is deleted as part of an entire subtree // on unlink. So deleted must only be accessed/mutated using atomics. deleted atomicbitops.Uint32 // childMu protects the fields below. childMu sync.RWMutex // childNodes maps child path component names to their pathNode. childNodes map[string]*pathNode // childRefs maps child path component names to all of the their // references. childRefs map[string]map[*fidRef]struct{} // childRefNames maps child references back to their path component // name. childRefNames map[*fidRef]string } func newPathNode() *pathNode { return &pathNode{ childNodes: make(map[string]*pathNode), childRefs: make(map[string]map[*fidRef]struct{}), childRefNames: make(map[*fidRef]string), } } // forEachChildRef calls fn for each child reference. func (p *pathNode) forEachChildRef(fn func(ref *fidRef, name string)) { p.childMu.RLock() defer p.childMu.RUnlock() for name, m := range p.childRefs { for ref := range m { fn(ref, name) } } } // forEachChildNode calls fn for each child pathNode. func (p *pathNode) forEachChildNode(fn func(pn *pathNode)) { p.childMu.RLock() defer p.childMu.RUnlock() for _, pn := range p.childNodes { fn(pn) } } // pathNodeFor returns the path node for the given name, or a new one. func (p *pathNode) pathNodeFor(name string) *pathNode { p.childMu.RLock() // Fast path, node already exists. if pn, ok := p.childNodes[name]; ok { p.childMu.RUnlock() return pn } p.childMu.RUnlock() // Slow path, create a new pathNode for shared use. p.childMu.Lock() // Re-check after re-lock. if pn, ok := p.childNodes[name]; ok { p.childMu.Unlock() return pn } pn := newPathNode() p.childNodes[name] = pn p.childMu.Unlock() return pn } // nameFor returns the name for the given fidRef. // // Precondition: addChild is called for ref before nameFor. func (p *pathNode) nameFor(ref *fidRef) string { p.childMu.RLock() n, ok := p.childRefNames[ref] p.childMu.RUnlock() if !ok { // This should not happen, don't proceed. panic(fmt.Sprintf("expected name for %+v, none found", ref)) } return n } // addChildLocked adds a child reference to p. // // Precondition: As addChild, plus childMu is locked for write. 
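//
// For orientation, a sketch of the typical pairing (call sites live elsewhere
// in the package): a reference is registered under its component name when a
// child FID comes into existence and is removed again when that reference is
// dropped,
//
//	parent.pathNode.addChild(ref, name) // addChild locks childMu and calls this
//	...
//	parent.pathNode.removeChild(ref)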
func (p *pathNode) addChildLocked(ref *fidRef, name string) { if n, ok := p.childRefNames[ref]; ok { // This should not happen, don't proceed. panic(fmt.Sprintf("unexpected fidRef %+v with path %q, wanted %q", ref, n, name)) } p.childRefNames[ref] = name m, ok := p.childRefs[name] if !ok { m = make(map[*fidRef]struct{}) p.childRefs[name] = m } m[ref] = struct{}{} } // addChild adds a child reference to p. // // Precondition: ref may only be added once at a time. func (p *pathNode) addChild(ref *fidRef, name string) { p.childMu.Lock() p.addChildLocked(ref, name) p.childMu.Unlock() } // removeChild removes the given child. // // This applies only to an individual fidRef, which is not required to exist. func (p *pathNode) removeChild(ref *fidRef) { p.childMu.Lock() // This ref may not exist anymore. This can occur, e.g., in unlink, // where a removeWithName removes the ref, and then a DecRef on the ref // attempts to remove again. if name, ok := p.childRefNames[ref]; ok { m, ok := p.childRefs[name] if !ok { // This should not happen, don't proceed. p.childMu.Unlock() panic(fmt.Sprintf("name %s missing from childfidRefs", name)) } delete(m, ref) if len(m) == 0 { delete(p.childRefs, name) } } delete(p.childRefNames, ref) p.childMu.Unlock() } // addPathNodeFor adds an existing pathNode as the node for name. // // Preconditions: newName does not exist. func (p *pathNode) addPathNodeFor(name string, pn *pathNode) { p.childMu.Lock() if opn, ok := p.childNodes[name]; ok { p.childMu.Unlock() panic(fmt.Sprintf("unexpected pathNode %+v with path %q", opn, name)) } p.childNodes[name] = pn p.childMu.Unlock() } // removeWithName removes all references with the given name. // // The provided function is executed after reference removal. The only method // it may (transitively) call on this pathNode is addChildLocked. // // If a child pathNode for name exists, it is removed from this pathNode and // returned by this function. Any operations on the removed tree must use this // value. func (p *pathNode) removeWithName(name string, fn func(ref *fidRef)) *pathNode { p.childMu.Lock() defer p.childMu.Unlock() if m, ok := p.childRefs[name]; ok { for ref := range m { delete(m, ref) delete(p.childRefNames, ref) if fn == nil { // No callback provided. continue } // Attempt to hold a reference while calling fn() to // prevent concurrent destruction of the child, which // can lead to data races. If the child has already // been destroyed, then we can skip the callback. if ref.TryIncRef() { fn(ref) ref.DecRef() } } } // Return the original path node, if it exists. origPathNode := p.childNodes[name] delete(p.childNodes, name) return origPathNode } golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/server.go000066400000000000000000000436011465435605700216150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
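//
// A minimal usage sketch (hypothetical names; the Attacher implementation and
// the packet-mode unet listener are created elsewhere):
//
//	srv := NewServer(myAttacher)
//	if err := srv.Serve(listener); err != nil {
//		log.Warningf("9P server exited: %v", err)
//	}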
package p9 import ( "io" "runtime/debug" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdchannel" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) // Server is a 9p2000.L server. type Server struct { // attacher provides the attach function. attacher Attacher options AttacherOptions // pathTree is the full set of paths opened on this server. // // These may be across different connections, but rename operations // must be serialized globally for safely. There is a single pathTree // for the entire server, and not per connection. pathTree *pathNode // renameMu is a global lock protecting rename operations. With this // lock, we can be certain that any given rename operation can safely // acquire two path nodes in any order, as all other concurrent // operations acquire at most a single node. renameMu sync.RWMutex } // NewServer returns a new server. attacher may be nil. func NewServer(attacher Attacher) *Server { opts := AttacherOptions{} if attacher != nil { opts = attacher.ServerOptions() } return &Server{ attacher: attacher, options: opts, pathTree: newPathNode(), } } // connState is the state for a single connection. type connState struct { // server is the backing server. server *Server // fids is the set of active FIDs. // // This is used to find FIDs for files. fidMu sync.Mutex fids map[FID]*fidRef // tags is the set of active tags. // // The given channel is closed when the // tag is finished with processing. tagMu sync.Mutex tags map[Tag]chan struct{} // messageSize is the maximum message size. The server does not // do automatic splitting of messages. messageSize atomicbitops.Uint32 // version is the agreed upon version X of 9P2000.L.Google.X. // version 0 implies 9P2000.L. version atomicbitops.Uint32 // reqGate counts requests that are still being handled. reqGate sync.Gate // -- below relates to the legacy handler -- // recvMu serializes receiving from conn. recvMu sync.Mutex // recvIdle is the number of goroutines in handleRequests() attempting to // lock recvMu so that they can receive from conn. recvIdle atomicbitops.Int32 // If recvShutdown is true, at least one goroutine has observed a // connection error while receiving from conn, and all goroutines in // handleRequests() should exit immediately. recvShutdown is protected by // recvMu. recvShutdown bool // sendMu serializes sending to conn. sendMu sync.Mutex // conn is the connection used by the legacy transport. conn *unet.Socket // -- below relates to the flipcall handler -- // channelMu protects below. channelMu sync.Mutex // channelWg represents active workers. channelWg sync.WaitGroup // channelAlloc allocates channel memory. channelAlloc *flipcall.PacketWindowAllocator // channels are the set of initialized channels. channels []*channel } // fidRef wraps a node and tracks references. type fidRef struct { // server is the associated server. server *Server // file is the associated File. file File // refs is an active reference count. // // The node above will be closed only when refs reaches zero. refs atomicbitops.Int64 // opened indicates whether this has been opened already. // // This is updated in handlers.go. // // opened is protected by pathNode.opMu or renameMu (for write). opened bool // mode is the fidRef's mode from the walk. Only the type bits are // valid, the permissions may change. 
This is used to sanity check // operations on this element, and prevent walks across // non-directories. mode FileMode // openFlags is the mode used in the open. // // This is updated in handlers.go. // // openFlags is protected by pathNode.opMu or renameMu (for write). openFlags OpenFlags // pathNode is the current pathNode for this FID. pathNode *pathNode // parent is the parent fidRef. We hold on to a parent reference to // ensure that hooks, such as Renamed, can be executed safely by the // server code. // // Note that parent cannot be changed without holding both the global // rename lock and a writable lock on the associated pathNode for this // fidRef. Holding either of these locks is sufficient to examine // parent safely. // // The parent will be nil for root fidRefs, and non-nil otherwise. The // method maybeParent can be used to return a cyclical reference, and // isRoot should be used to check for root over looking at parent // directly. parent *fidRef } // IncRef increases the references on a fid. func (f *fidRef) IncRef() { f.refs.Add(1) } // DecRef should be called when you're finished with a fid. func (f *fidRef) DecRef() { if f.refs.Add(-1) == 0 { f.file.Close() // Drop the parent reference. // // Since this fidRef is guaranteed to be non-discoverable when // the references reach zero, we don't need to worry about // clearing the parent. if f.parent != nil { // If we've been previously deleted, this removing this // ref is a no-op. That's expected. f.parent.pathNode.removeChild(f) f.parent.DecRef() } } } // TryIncRef returns true if a new reference is taken on the fid, and false if // the fid has been destroyed. func (f *fidRef) TryIncRef() bool { for { r := f.refs.Load() if r <= 0 { return false } if f.refs.CompareAndSwap(r, r+1) { return true } } } // isDeleted returns true if this fidRef has been deleted. // // Precondition: this must be called via safelyRead, safelyWrite or // safelyGlobal. func (f *fidRef) isDeleted() bool { return f.pathNode.deleted.Load() != 0 } // isRoot indicates whether this is a root fid. func (f *fidRef) isRoot() bool { return f.parent == nil } // maybeParent returns a cyclic reference for roots, and the parent otherwise. func (f *fidRef) maybeParent() *fidRef { if f.parent != nil { return f.parent } return f // Root has itself. } // notifyDelete marks all fidRefs as deleted. // // Precondition: this must be called via safelyWrite or safelyGlobal. func notifyDelete(pn *pathNode) { pn.deleted.Store(1) // Call on all subtrees. pn.forEachChildNode(func(pn *pathNode) { notifyDelete(pn) }) } // markChildDeleted marks all children below the given name as deleted. // // Precondition: this must be called via safelyWrite or safelyGlobal. func (f *fidRef) markChildDeleted(name string) { if origPathNode := f.pathNode.removeWithName(name, nil); origPathNode != nil { // Mark all children as deleted. notifyDelete(origPathNode) } } // notifyNameChange calls the relevant Renamed method on all nodes in the path, // recursively. Note that this applies only for subtrees, as these // notifications do not apply to the actual file whose name has changed. // // Precondition: this must be called via safelyGlobal. func notifyNameChange(pn *pathNode) { // Call on all local references. pn.forEachChildRef(func(ref *fidRef, name string) { ref.file.Renamed(ref.parent.file, name) }) // Call on all subtrees. pn.forEachChildNode(func(pn *pathNode) { notifyNameChange(pn) }) } // renameChildTo renames the given child to the target. 
// // Precondition: this must be called via safelyGlobal. func (f *fidRef) renameChildTo(oldName string, target *fidRef, newName string) { target.markChildDeleted(newName) origPathNode := f.pathNode.removeWithName(oldName, func(ref *fidRef) { // N.B. DecRef can take f.pathNode's parent's childMu. This is // allowed because renameMu is held for write via safelyGlobal. ref.parent.DecRef() // Drop original reference. ref.parent = target // Change parent. ref.parent.IncRef() // Acquire new one. if f.pathNode == target.pathNode { target.pathNode.addChildLocked(ref, newName) } else { target.pathNode.addChild(ref, newName) } ref.file.Renamed(target.file, newName) }) if origPathNode != nil { // Replace the previous (now deleted) path node. target.pathNode.addPathNodeFor(newName, origPathNode) // Call Renamed on all children. notifyNameChange(origPathNode) } } // safelyRead executes the given operation with the local path node locked. // This implies that paths will not change during the operation. func (f *fidRef) safelyRead(fn func() error) (err error) { f.server.renameMu.RLock() defer f.server.renameMu.RUnlock() f.pathNode.opMu.RLock() defer f.pathNode.opMu.RUnlock() return fn() } // safelyWrite executes the given operation with the local path node locked in // a writable fashion. This implies some paths may change. func (f *fidRef) safelyWrite(fn func() error) (err error) { f.server.renameMu.RLock() defer f.server.renameMu.RUnlock() f.pathNode.opMu.Lock() defer f.pathNode.opMu.Unlock() return fn() } // safelyGlobal executes the given operation with the global path lock held. func (f *fidRef) safelyGlobal(fn func() error) (err error) { f.server.renameMu.Lock() defer f.server.renameMu.Unlock() return fn() } // LookupFID finds the given FID. // // You should call fid.DecRef when you are finished using the fid. func (cs *connState) LookupFID(fid FID) (*fidRef, bool) { cs.fidMu.Lock() defer cs.fidMu.Unlock() fidRef, ok := cs.fids[fid] if ok { fidRef.IncRef() return fidRef, true } return nil, false } // InsertFID installs the given FID. // // This fid starts with a reference count of one. If a FID exists in // the slot already it is closed, per the specification. func (cs *connState) InsertFID(fid FID, newRef *fidRef) { cs.fidMu.Lock() defer cs.fidMu.Unlock() origRef, ok := cs.fids[fid] if ok { defer origRef.DecRef() } newRef.IncRef() cs.fids[fid] = newRef } // DeleteFID removes the given FID. // // This simply removes it from the map and drops a reference. func (cs *connState) DeleteFID(fid FID) bool { cs.fidMu.Lock() defer cs.fidMu.Unlock() fidRef, ok := cs.fids[fid] if !ok { return false } delete(cs.fids, fid) fidRef.DecRef() return true } // StartTag starts handling the tag. // // False is returned if this tag is already active. func (cs *connState) StartTag(t Tag) bool { cs.tagMu.Lock() defer cs.tagMu.Unlock() _, ok := cs.tags[t] if ok { return false } cs.tags[t] = make(chan struct{}) return true } // ClearTag finishes handling a tag. func (cs *connState) ClearTag(t Tag) { cs.tagMu.Lock() defer cs.tagMu.Unlock() ch, ok := cs.tags[t] if !ok { // Should never happen. panic("unused tag cleared") } delete(cs.tags, t) // Notify. close(ch) } // WaitTag waits for a tag to finish. func (cs *connState) WaitTag(t Tag) { cs.tagMu.Lock() ch, ok := cs.tags[t] cs.tagMu.Unlock() if !ok { return } // Wait for close. <-ch } // initializeChannels initializes all channels. // // This is a no-op if channels are already initialized. 
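//
// For orientation: each channel built below couples a flipcall packet window
// (message bytes exchanged through shared memory) with an fdchannel socket
// pair (for passing file descriptors), so a client that negotiates channels
// can service most requests without the legacy socket transport.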
func (cs *connState) initializeChannels() (err error) { cs.channelMu.Lock() defer cs.channelMu.Unlock() // Initialize our channel allocator. if cs.channelAlloc == nil { alloc, err := flipcall.NewPacketWindowAllocator() if err != nil { return err } cs.channelAlloc = alloc } // Create all the channels. for len(cs.channels) < channelsPerClient { res := &channel{ done: make(chan struct{}), } res.desc, err = cs.channelAlloc.Allocate(channelSize) if err != nil { return err } if err := res.data.Init(flipcall.ServerSide, res.desc); err != nil { return err } socks, err := fdchannel.NewConnectedSockets() if err != nil { res.data.Destroy() // Cleanup. return err } res.fds.Init(socks[0]) res.client = fd.New(socks[1]) cs.channels = append(cs.channels, res) // Start servicing the channel. // // When we call stop, we will close all the channels and these // routines should finish. We need the wait group to ensure // that active handlers are actually finished before cleanup. cs.channelWg.Add(1) go func() { // S/R-SAFE: Server side. defer cs.channelWg.Done() if err := res.service(cs); err != nil { // Don't log flipcall.ShutdownErrors, which we expect to be // returned during server shutdown. if _, ok := err.(flipcall.ShutdownError); !ok { log.Warningf("p9.channel.service: %v", err) } } }() } return nil } // lookupChannel looks up the channel with given id. // // The function returns nil if no such channel is available. func (cs *connState) lookupChannel(id uint32) *channel { cs.channelMu.Lock() defer cs.channelMu.Unlock() if id >= uint32(len(cs.channels)) { return nil } return cs.channels[id] } // handle handles a single message. func (cs *connState) handle(m message) (r message) { if !cs.reqGate.Enter() { // connState.stop() has been called; the connection is shutting down. r = newErrFromLinuxerr(linuxerr.ECONNRESET) return } defer func() { cs.reqGate.Leave() if r == nil { // Don't allow a panic to propagate. err := recover() // Include a useful log message. log.Warningf("panic in handler: %v\n%s", err, debug.Stack()) // Wrap in an EREMOTEIO error; we don't really have a // better way to describe this kind of error. It will // usually manifest as a result of the test framework. r = newErrFromLinuxerr(linuxerr.EREMOTEIO) } }() if handler, ok := m.(handler); ok { // Call the message handler. r = handler.handle(cs) // TODO(b/34162363):This is only here to make sure the server works with // only linuxerr Errors, as the handlers work with both client and server. // It will be removed a followup, when all the unix.Errno errors are // replaced with linuxerr. if rlError, ok := r.(*Rlerror); ok { e := linuxerr.ErrorFromUnix(unix.Errno(rlError.Error)) r = newErrFromLinuxerr(e) } } else { // Produce an ENOSYS error. r = newErrFromLinuxerr(linuxerr.ENOSYS) } return } // handleRequest handles a single request. It returns true if the caller should // continue handling requests and false if it should terminate. func (cs *connState) handleRequest() bool { // Obtain the right to receive a message from cs.conn. cs.recvIdle.Add(1) cs.recvMu.Lock() cs.recvIdle.Add(-1) if cs.recvShutdown { // Another goroutine already detected a connection problem; exit // immediately. cs.recvMu.Unlock() return false } messageSize := cs.messageSize.Load() if messageSize == 0 { // Default or not yet negotiated. messageSize = maximumLength } // Receive a message. tag, m, err := recv(cs.conn, messageSize, msgRegistry.get) if errSocket, ok := err.(ErrSocket); ok { // Connection problem; stop serving. 
log.Debugf("p9.recv: %v", errSocket.error) cs.recvShutdown = true cs.recvMu.Unlock() return false } // Ensure that another goroutine is available to receive from cs.conn. if cs.recvIdle.Load() == 0 { go cs.handleRequests() // S/R-SAFE: Irrelevant. } cs.recvMu.Unlock() // Deal with other errors. if err != nil && err != io.EOF { // If it's not a connection error, but some other protocol error, // we can send a response immediately. cs.sendMu.Lock() err := send(cs.conn, tag, newErrFromLinuxerr(err)) cs.sendMu.Unlock() if err != nil { log.Debugf("p9.send: %v", err) } return true } // Try to start the tag. if !cs.StartTag(tag) { // Nothing we can do at this point; client is bogus. log.Debugf("no valid tag [%05d]", tag) return true } // Handle the message. r := cs.handle(m) // Clear the tag before sending. That's because as soon as this hits // the wire, the client can legally send the same tag. cs.ClearTag(tag) // Send back the result. cs.sendMu.Lock() err = send(cs.conn, tag, r) cs.sendMu.Unlock() if err != nil { log.Debugf("p9.send: %v", err) } // Return the message to the cache. msgRegistry.put(m) return true } func (cs *connState) handleRequests() { for { if !cs.handleRequest() { return } } } func (cs *connState) stop() { // Stop new requests from proceeding, and wait for completion of all // inflight requests. This is mostly so that if a request is stuck, the // sandbox supervisor has the opportunity to kill us with SIGABRT to get a // stack dump of the offending handler. cs.reqGate.Close() // Free the channels. cs.channelMu.Lock() for _, ch := range cs.channels { ch.Shutdown() } cs.channelWg.Wait() for _, ch := range cs.channels { ch.Close() } cs.channels = nil // Clear. cs.channelMu.Unlock() // Free the channel memory. if cs.channelAlloc != nil { cs.channelAlloc.Destroy() } // Ensure the connection is closed. cs.conn.Close() // Close all remaining fids. for fid, fidRef := range cs.fids { delete(cs.fids, fid) // Drop final reference in the FID table. Note this should // always close the file, since we've ensured that there are no // handlers running via the wait for Pending => 0 below. fidRef.DecRef() } } // Handle handles a single connection. func (s *Server) Handle(conn *unet.Socket) error { cs := &connState{ server: s, fids: make(map[FID]*fidRef), tags: make(map[Tag]chan struct{}), conn: conn, } defer cs.stop() // Serve requests from conn in the current goroutine; handleRequests() will // create more goroutines as needed. cs.handleRequests() return nil } // Serve handles requests from the bound socket. // // The passed serverSocket _must_ be created in packet mode. func (s *Server) Serve(serverSocket *unet.ServerSocket) error { var wg sync.WaitGroup defer wg.Wait() for { conn, err := serverSocket.Accept() if err != nil { // Something went wrong. // // Socket closed? return err } wg.Add(1) go func(conn *unet.Socket) { // S/R-SAFE: Irrelevant. s.Handle(conn) wg.Done() }(conn) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/transport.go000066400000000000000000000206051465435605700223420ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( "errors" "fmt" "io" "io/ioutil" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) // ErrSocket is returned in cases of a socket issue. // // This may be treated differently than other errors. type ErrSocket struct { // error is the socket error. error } // ErrMessageTooLarge indicates the size was larger than reasonable. type ErrMessageTooLarge struct { size uint32 msize uint32 } // Error returns a sensible error. func (e *ErrMessageTooLarge) Error() string { return fmt.Sprintf("message too large for fixed buffer: size is %d, limit is %d", e.size, e.msize) } // ErrNoValidMessage indicates no valid message could be decoded. var ErrNoValidMessage = errors.New("buffer contained no valid message") const ( // headerLength is the number of bytes required for a header. headerLength uint32 = 7 // maximumLength is the largest possible message. maximumLength uint32 = 1 << 20 // DefaultMessageSize is a sensible default. DefaultMessageSize uint32 = 64 << 10 // initialBufferLength is the initial data buffer we allocate. initialBufferLength uint32 = 64 ) var dataPool = sync.Pool{ New: func() any { // These buffers are used for decoding without a payload. // We need to return a pointer to avoid unnecessary allocations // (see https://staticcheck.io/docs/checks#SA6002). b := make([]byte, initialBufferLength) return &b }, } // send sends the given message over the socket. func send(s *unet.Socket, tag Tag, m message) error { data := dataPool.Get().(*[]byte) dataBuf := buffer{data: (*data)[:0]} if log.IsLogging(log.Debug) { log.Debugf("send [FD %d] [Tag %06d] %s", s.FD(), tag, m.String()) } // Encode the message. The buffer will grow automatically. m.encode(&dataBuf) // Get our vectors to send. var hdr [headerLength]byte vecs := make([][]byte, 0, 3) vecs = append(vecs, hdr[:]) if len(dataBuf.data) > 0 { vecs = append(vecs, dataBuf.data) } totalLength := headerLength + uint32(len(dataBuf.data)) // Is there a payload? if payloader, ok := m.(payloader); ok { p := payloader.Payload() if len(p) > 0 { vecs = append(vecs, p) totalLength += uint32(len(p)) } } // Construct the header. headerBuf := buffer{data: hdr[:0]} headerBuf.Write32(totalLength) headerBuf.WriteMsgType(m.Type()) headerBuf.WriteTag(tag) // Pack any files if necessary. w := s.Writer(true) if filer, ok := m.(filer); ok { if f := filer.FilePayload(); f != nil { defer f.Close() // Pack the file into the message. w.PackFDs(f.FD()) } } for n := 0; n < int(totalLength); { cur, err := w.WriteVec(vecs) if err != nil { return ErrSocket{err} } n += cur // Consume iovecs. for consumed := 0; consumed < cur; { if len(vecs[0]) <= cur-consumed { consumed += len(vecs[0]) vecs = vecs[1:] } else { vecs[0] = vecs[0][cur-consumed:] break } } if n > 0 && n < int(totalLength) { // Don't resend any control message. w.UnpackFDs() } } // All set. dataPool.Put(&dataBuf.data) return nil } // lookupTagAndType looks up an existing message or creates a new one. // // This is called by recv after decoding the header. 
Any error returned will be // propagating back to the caller. You may use messageByType directly as a // lookupTagAndType function (by design). type lookupTagAndType func(tag Tag, t MsgType) (message, error) // recv decodes a message from the socket. // // This is done in two parts, and is thus not safe for multiple callers. // // On a socket error, the special error type ErrSocket is returned. // // The tag value NoTag will always be returned if err is non-nil. func recv(s *unet.Socket, msize uint32, lookup lookupTagAndType) (Tag, message, error) { // Read a header. // // Since the send above is atomic, we must always receive control // messages along with the header. This means we need to be careful // about closing FDs during errors to prevent leaks. var hdr [headerLength]byte r := s.Reader(true) r.EnableFDs(1) n, err := r.ReadVec([][]byte{hdr[:]}) if err != nil && (n == 0 || err != io.EOF) { r.CloseFDs() return NoTag, nil, ErrSocket{err} } fds, err := r.ExtractFDs() if err != nil { return NoTag, nil, ErrSocket{err} } defer func() { // Close anything left open. The case where // fds are caught and used is handled below, // and the fds variable will be set to nil. for _, fd := range fds { unix.Close(fd) } }() r.EnableFDs(0) // Continuing reading for a short header. for n < int(headerLength) { cur, err := r.ReadVec([][]byte{hdr[n:]}) if err != nil && (cur == 0 || err != io.EOF) { return NoTag, nil, ErrSocket{err} } n += cur } // Decode the header. headerBuf := buffer{data: hdr[:]} size := headerBuf.Read32() t := headerBuf.ReadMsgType() tag := headerBuf.ReadTag() if size < headerLength { // The message is too small. // // See above: it's probably screwed. return NoTag, nil, ErrSocket{ErrNoValidMessage} } if size > maximumLength || size > msize { // The message is too big. return NoTag, nil, ErrSocket{&ErrMessageTooLarge{size, msize}} } remaining := size - headerLength // Find our message to decode. m, err := lookup(tag, t) if err != nil { // Throw away the contents of this message. if remaining > 0 { io.Copy(ioutil.Discard, &io.LimitedReader{R: s, N: int64(remaining)}) } return tag, nil, err } // Not yet initialized. var dataBuf buffer var vecs [][]byte appendBuffer := func(size int) *[]byte { // Pull a data buffer from the pool. datap := dataPool.Get().(*[]byte) data := *datap if size > len(data) { // Create a larger data buffer. data = make([]byte, size) datap = &data } else { // Limit the data buffer. data = data[:size] } dataBuf = buffer{data: data} vecs = append(vecs, data) return datap } // Read the rest of the payload. // // This requires some special care to ensure that the vectors all line // up the way they should. We do this to minimize copying data around. if payloader, ok := m.(payloader); ok { fixedSize := payloader.FixedSize() // Do we need more than there is? if fixedSize > remaining { // This is not a valid message. if remaining > 0 { io.Copy(ioutil.Discard, &io.LimitedReader{R: s, N: int64(remaining)}) } return NoTag, nil, ErrNoValidMessage } if fixedSize != 0 { datap := appendBuffer(int(fixedSize)) defer dataPool.Put(datap) } // Include the payload. p := payloader.Payload() if p == nil || len(p) != int(remaining-fixedSize) { p = make([]byte, remaining-fixedSize) payloader.SetPayload(p) } if len(p) > 0 { vecs = append(vecs, p) } } else if remaining != 0 { datap := appendBuffer(int(remaining)) defer dataPool.Put(datap) } if len(vecs) > 0 { // Read the rest of the message. // // No need to handle a control message. 
r := s.Reader(true) for n := 0; n < int(remaining); { cur, err := r.ReadVec(vecs) if err != nil && (cur == 0 || err != io.EOF) { return NoTag, nil, ErrSocket{err} } n += cur // Consume iovecs. for consumed := 0; consumed < cur; { if len(vecs[0]) <= cur-consumed { consumed += len(vecs[0]) vecs = vecs[1:] } else { vecs[0] = vecs[0][cur-consumed:] break } } } } // Decode the message data. m.decode(&dataBuf) if dataBuf.isOverrun() { // No need to drain the socket. return NoTag, nil, ErrNoValidMessage } // Save the file, if any came out. if filer, ok := m.(filer); ok && len(fds) > 0 { // Set the file object. filer.SetFilePayload(fd.New(fds[0])) // Close the rest. We support only one. for i := 1; i < len(fds); i++ { unix.Close(fds[i]) } // Don't close in the defer. fds = nil } if log.IsLogging(log.Debug) { log.Debugf("recv [FD %d] [Tag %06d] %s", s.FD(), tag, m.String()) } // All set. return tag, m, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/transport_flipcall.go000066400000000000000000000146571465435605700242220ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdchannel" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" ) // channelsPerClient is the number of channels to create per client. // // While the client and server will generally agree on this number, in reality // it's completely up to the server. We simply define a minimum of 2, and a // maximum of 4, and select the number of available processes as a tie-breaker. // Note that we don't want the number of channels to be too large, because each // will account for channelSize memory used, which can be large. var channelsPerClient = func() int { n := runtime.NumCPU() if n < 2 { return 2 } if n > 4 { return 4 } return n }() // channelSize is the channel size to create. // // We simply ensure that this is larger than the largest possible message size, // plus the flipcall packet header, plus the two bytes we write below. const channelSize = int(2 + flipcall.PacketHeaderBytes + 2 + maximumLength) // channel is a fast IPC channel. // // The same object is used by both the server and client implementations. In // general, the client will use only the send and recv methods. type channel struct { desc flipcall.PacketWindowDescriptor data flipcall.Endpoint fds fdchannel.Endpoint buf buffer // -- client only -- connected bool active bool // -- server only -- client *fd.FD done chan struct{} } // reset resets the channel buffer. func (ch *channel) reset(sz uint32) { ch.buf.data = ch.data.Data()[:sz] } // service services the channel. func (ch *channel) service(cs *connState) error { rsz, err := ch.data.RecvFirst() if err != nil { return err } for rsz > 0 { m, err := ch.recv(nil, rsz) if err != nil { return err } r := cs.handle(m) msgRegistry.put(m) rsz, err = ch.send(r, true /* isServer */) if err != nil { return err } } return nil // Done. 
} // Shutdown shuts down the channel. // // This must be called before Close. func (ch *channel) Shutdown() { ch.data.Shutdown() } // Close closes the channel. // // This must only be called once, and cannot return an error. Note that // synchronization for this method is provided at a high-level, depending on // whether it is the client or server. This cannot be called while there are // active callers in either service or sendRecv. // // Precondition: the channel should be shutdown. func (ch *channel) Close() error { // Close all backing transports. ch.fds.Destroy() ch.data.Destroy() if ch.client != nil { ch.client.Close() } return nil } // send sends the given message. // // The return value is the size of the received response. Not that in the // server case, this is the size of the next request. func (ch *channel) send(m message, isServer bool) (uint32, error) { if log.IsLogging(log.Debug) { log.Debugf("send [channel @%p] %s", ch, m.String()) } // Send any file payload. sentFD := false if filer, ok := m.(filer); ok { if f := filer.FilePayload(); f != nil { if err := ch.fds.SendFD(f.FD()); err != nil { return 0, err } f.Close() // Per sendRecvLegacy. sentFD = true // To mark below. } } // Encode the message. // // Note that IPC itself encodes the length of messages, so we don't // need to encode a standard 9P header. We write only the message type. ch.reset(0) ch.buf.WriteMsgType(m.Type()) if sentFD { ch.buf.Write8(1) // Incoming FD. } else { ch.buf.Write8(0) // No incoming FD. } m.encode(&ch.buf) ssz := uint32(len(ch.buf.data)) // Updated below. // Is there a payload? if payloader, ok := m.(payloader); ok { p := payloader.Payload() copy(ch.data.Data()[ssz:], p) ssz += uint32(len(p)) } // Perform the one-shot communication. if isServer { return ch.data.SendRecv(ssz) } // RPCs are expected to return quickly rather than block. return ch.data.SendRecvFast(ssz) } // recv decodes a message that exists on the channel. // // If the passed r is non-nil, then the type must match or an error will be // generated. If the passed r is nil, then a new message will be created and // returned. func (ch *channel) recv(r message, rsz uint32) (message, error) { // Decode the response from the inline buffer. ch.reset(rsz) t := ch.buf.ReadMsgType() hasFD := ch.buf.Read8() != 0 if t == MsgRlerror { // Change the message type. We check for this special case // after decoding below, and transform into an error. r = &Rlerror{} } else if r == nil { nr, err := msgRegistry.get(0, t) if err != nil { return nil, err } r = nr // New message. } else if t != r.Type() { // Not an error and not the expected response; propagate. return nil, &ErrBadResponse{Got: t, Want: r.Type()} } // Is there a payload? Copy from the latter portion. if payloader, ok := r.(payloader); ok { fs := payloader.FixedSize() p := payloader.Payload() payloadData := ch.buf.data[fs:] if len(p) < len(payloadData) { p = make([]byte, len(payloadData)) copy(p, payloadData) payloader.SetPayload(p) } else if n := copy(p, payloadData); n < len(p) { payloader.SetPayload(p[:n]) } ch.buf.data = ch.buf.data[:fs] } r.decode(&ch.buf) if ch.buf.isOverrun() { // Nothing valid was available. log.Debugf("recv [got %d bytes, needed more]", rsz) return nil, ErrNoValidMessage } // Read any FD result. if hasFD { if rfd, err := ch.fds.RecvFDNonblock(); err == nil { f := fd.New(rfd) if filer, ok := r.(filer); ok { // Set the payload. filer.SetFilePayload(f) } else { // Don't want the FD. f.Close() } } else { // The header bit was set but nothing came in. 
log.Warningf("expected FD, got err: %v", err) } } // Log a message. if log.IsLogging(log.Debug) { log.Debugf("recv [channel @%p] %s", ch, r.String()) } // Convert errors appropriately; see above. if rlerr, ok := r.(*Rlerror); ok { return r, unix.Errno(rlerr.Error) } return r, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/p9/version.go000066400000000000000000000155351465435605700220010ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( "fmt" "strconv" "strings" ) const ( // highestSupportedVersion is the highest supported version X in a // version string of the format 9P2000.L.Google.X. // // Clients are expected to start requesting this version number and // to continuously decrement it until a Tversion request succeeds. highestSupportedVersion uint32 = 13 // lowestSupportedVersion is the lowest supported version X in a // version string of the format 9P2000.L.Google.X. // // Clients are free to send a Tversion request at a version below this // value but are expected to encounter an Rlerror in response. lowestSupportedVersion uint32 = 0 // baseVersion is the base version of 9P that this package must always // support. It is equivalent to 9P2000.L.Google.0. baseVersion = "9P2000.L" ) // HighestVersionString returns the highest possible version string that a client // may request or a server may support. func HighestVersionString() string { return versionString(highestSupportedVersion) } // parseVersion parses a Tversion version string into a numeric version number // if the version string is supported by p9. Otherwise returns (0, false). // // From Tversion(9P): "Version strings are defined such that, if the client string // contains one or more period characters, the initial substring up to but not // including any single period in the version string defines a version of the protocol." // // p9 intentionally diverges from this and always requires that the version string // start with 9P2000.L to express that it is always compatible with 9P2000.L. The // only supported versions extensions are of the format 9p2000.L.Google.X where X // is an ever increasing version counter. // // Version 9P2000.L.Google.0 implies 9P2000.L. // // New versions must always be a strict superset of 9P2000.L. A version increase must // define a predicate representing the feature extension introduced by that version. The // predicate must be commented and should take the format: // // // VersionSupportsX returns true if version v supports X and must be checked when ... // // func VersionSupportsX(v int32) bool { // ... // } func parseVersion(str string) (uint32, bool) { // Special case the base version which lacks the ".Google.X" suffix. This // version always means version 0. 
if str == baseVersion { return 0, true } substr := strings.Split(str, ".") if len(substr) != 4 { return 0, false } if substr[0] != "9P2000" || substr[1] != "L" || substr[2] != "Google" || len(substr[3]) == 0 { return 0, false } version, err := strconv.ParseUint(substr[3], 10, 32) if err != nil { return 0, false } return uint32(version), true } // versionString formats a p9 version number into a Tversion version string. func versionString(version uint32) string { // Special case the base version so that clients expecting this string // instead of the 9P2000.L.Google.0 equivalent get it. This is important // for backwards compatibility with legacy servers that check for exactly // the baseVersion and allow nothing else. if version == 0 { return baseVersion } return fmt.Sprintf("9P2000.L.Google.%d", version) } // VersionSupportsTflushf returns true if version v supports the Tflushf message. // This predicate must be checked by clients before attempting to make a Tflushf // request. If this predicate returns false, then clients may safely no-op. func VersionSupportsTflushf(v uint32) bool { return v >= 1 } // versionSupportsTwalkgetattr returns true if version v supports the // Twalkgetattr message. This predicate must be checked by clients before // attempting to make a Twalkgetattr request. func versionSupportsTwalkgetattr(v uint32) bool { return v >= 2 } // versionSupportsTucreation returns true if version v supports the Tucreation // messages (Tucreate, Tusymlink, Tumkdir, Tumknod). This predicate must be // checked by clients before attempting to make a Tucreation request. // If Tucreation messages are not supported, their non-UID supporting // counterparts (Tlcreate, Tsymlink, Tmkdir, Tmknod) should be used. func versionSupportsTucreation(v uint32) bool { return v >= 3 } // VersionSupportsConnect returns true if version v supports the Tlconnect // message. This predicate must be checked by clients // before attempting to make a Tlconnect request. If Tlconnect messages are not // supported, Tlopen should be used. func VersionSupportsConnect(v uint32) bool { return v >= 4 } // VersionSupportsAnonymous returns true if version v supports Tlconnect // with the AnonymousSocket mode. This predicate must be checked by clients // before attempting to use the AnonymousSocket Tlconnect mode. func VersionSupportsAnonymous(v uint32) bool { return v >= 5 } // VersionSupportsMultiUser returns true if version v supports multi-user fake // directory permissions and ID values. func VersionSupportsMultiUser(v uint32) bool { return v >= 6 } // versionSupportsTallocate returns true if version v supports Allocate(). func versionSupportsTallocate(v uint32) bool { return v >= 7 } // versionSupportsFlipcall returns true if version v supports IPC channels from // the flipcall package. Note that these must be negotiated, but this version // string indicates that such a facility exists. func versionSupportsFlipcall(v uint32) bool { return v >= 8 } // VersionSupportsOpenTruncateFlag returns true if version v supports // passing the OpenTruncate flag to Tlopen. func VersionSupportsOpenTruncateFlag(v uint32) bool { return v >= 9 } // versionSupportsGetSetXattr returns true if version v supports // the Tgetxattr and Tsetxattr messages. func versionSupportsGetSetXattr(v uint32) bool { return v >= 10 } // versionSupportsListRemoveXattr returns true if version v supports // the Tlistxattr and Tremovexattr messages. 
func versionSupportsListRemoveXattr(v uint32) bool { return v >= 11 } // versionSupportsTsetattrclunk returns true if version v supports // the Tsetattrclunk message. func versionSupportsTsetattrclunk(v uint32) bool { return v >= 12 } // versionSupportsTmultiGetAttr returns true if version v supports // the TmultiGetAttr message. func versionSupportsTmultiGetAttr(v uint32) bool { return v >= 13 } // versionSupportsBind returns true if version v supports the Tbind message. func versionSupportsBind(v uint32) bool { // TODO(b/194709873): Bump version and gate with that. return false } golang-gvisor-gvisor-0.0~20240729.0/pkg/pool/000077500000000000000000000000001465435605700203755ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/pool/pool.go000066400000000000000000000027221465435605700217000ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package pool provides a trivial integer pool. package pool import ( "gvisor.dev/gvisor/pkg/sync" ) // Pool is a simple allocator. type Pool struct { mu sync.Mutex // cache is the set of returned values. cache []uint64 // Start is the starting value (if needed). Start uint64 // max is the current maximum issued. max uint64 // Limit is the upper limit. Limit uint64 } // Get gets a value from the pool. func (p *Pool) Get() (uint64, bool) { p.mu.Lock() defer p.mu.Unlock() // Anything cached? if len(p.cache) > 0 { v := p.cache[len(p.cache)-1] p.cache = p.cache[:len(p.cache)-1] return v, true } // Over the limit? if p.Start == p.Limit { return 0, false } // Generate a new value. v := p.Start p.Start++ return v, true } // Put returns a value to the pool. func (p *Pool) Put(v uint64) { p.mu.Lock() p.cache = append(p.cache, v) p.mu.Unlock() } golang-gvisor-gvisor-0.0~20240729.0/pkg/pool/pool_state_autogen.go000066400000000000000000000000661465435605700246210ustar00rootroot00000000000000// automatically generated by stateify. package pool golang-gvisor-gvisor-0.0~20240729.0/pkg/prometheus/000077500000000000000000000000001465435605700216175ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/prometheus/prometheus.go000066400000000000000000000765451465435605700243620ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package prometheus contains Prometheus-compliant metric data structures and utilities. 
// It can export data in Prometheus data format, documented at: // https://prometheus.io/docs/instrumenting/exposition_formats/ package prometheus import ( "bytes" "errors" "fmt" "io" "math" "reflect" "sort" "strings" "time" ) // timeNow is the time.Now() function. Can be mocked in tests. var timeNow = time.Now // Prometheus label names used to identify each sandbox. const ( SandboxIDLabel = "sandbox" PodNameLabel = "pod_name" NamespaceLabel = "namespace_name" IterationIDLabel = "iteration" ) // Type is a Prometheus metric type. type Type int // List of supported Prometheus metric types. const ( TypeUntyped = Type(iota) TypeGauge TypeCounter TypeHistogram ) // Metric is a Prometheus metric metadata. type Metric struct { // Name is the Prometheus metric name. Name string `json:"name"` // Type is the type of the metric. Type Type `json:"type"` // Help is an optional helpful string explaining what the metric is about. Help string `json:"help"` } // writeMetricHeaderTo writes the metric comment header to the given writer. func writeMetricHeaderTo[T io.StringWriter](w T, m *Metric, options SnapshotExportOptions) error { if m.Help != "" { // This writes each string component one by one (rather than using fmt.Sprintf) // in order to avoid allocating strings for each metric. if _, err := w.WriteString("# HELP "); err != nil { return err } if _, err := w.WriteString(options.ExporterPrefix); err != nil { return err } if _, err := w.WriteString(m.Name); err != nil { return err } if _, err := w.WriteString(" "); err != nil { return err } if _, err := writeEscapedString(w, m.Help, false); err != nil { return err } if _, err := w.WriteString("\n"); err != nil { return err } } var metricType string switch m.Type { case TypeGauge: metricType = "gauge" case TypeCounter: metricType = "counter" case TypeHistogram: metricType = "histogram" case TypeUntyped: metricType = "untyped" } if metricType != "" { if _, err := w.WriteString("# TYPE "); err != nil { return err } if _, err := w.WriteString(options.ExporterPrefix); err != nil { return err } if _, err := w.WriteString(m.Name); err != nil { return err } if _, err := w.WriteString(" "); err != nil { return err } if _, err := w.WriteString(metricType); err != nil { return err } if _, err := w.WriteString("\n"); err != nil { return err } } return nil } // Number represents a numerical value. // In Prometheus, all numbers are float64s. // However, for the purpose of usage of this library, we support expressing numbers as integers, // which makes things like counters much easier and more precise. // At data export time (i.e. when written out in Prometheus data format), it is coalesced into // a float. type Number struct { // Float is the float value of this number. // Mutually exclusive with Int. Float float64 `json:"float,omitempty"` // Int is the integer value of this number. // Mutually exclusive with Float. Int int64 `json:"int,omitempty"` } // Common numbers which are reused and don't need their own memory allocations. var ( zero = Number{} intOne = Number{Int: 1} floatOne = Number{Float: 1.0} floatNaN = Number{Float: math.NaN()} floatInf = Number{Float: math.Inf(1)} floatNegInf = Number{Float: math.Inf(-1)} ) // NewInt returns a new integer Number. func NewInt(val int64) *Number { switch val { case 0: return &zero case 1: return &intOne default: return &Number{Int: val} } } // NewFloat returns a new floating-point Number. 
func NewFloat(val float64) *Number { if math.IsNaN(val) { return &floatNaN } switch val { case 0: return &zero case 1.0: return &floatOne case math.Inf(1.0): return &floatInf case math.Inf(-1.0): return &floatNegInf default: return &Number{Float: val} } } // IsInteger returns whether this number contains an integer value. // This is defined as either having the `Float` part set to zero (in which case the `Int` part takes // precedence), or having `Float` be a value equal to its own rounding and not a special float. // //go:nosplit func (n *Number) IsInteger() bool { if n.Float == 0 { return true } if math.IsNaN(n.Float) || n.Float == math.Inf(-1) || n.Float == math.Inf(1) { return false } return n.Float < float64(math.MaxInt64) && n.Float > float64(math.MinInt64) && math.Round(n.Float) == n.Float } // ToFloat returns this number as a floating-point number, regardless of which // type the number was encoded as. An integer Number will have its value cast // to a float, while a floating-point Number will have its value returned // as-is. // //go:nosplit func (n *Number) ToFloat() float64 { if n.Int != 0 { return float64(n.Int) } return n.Float } // String returns a string representation of this number. func (n *Number) String() string { var s strings.Builder if err := writeNumberTo(&s, n); err != nil { panic(err) } return s.String() } // SameType returns true if `n` and `other` are either both floating-point or both integers. // If a `Number` is zero, it is considered of the same type as any other zero `Number`. // //go:nosplit func (n *Number) SameType(other *Number) bool { // Within `n` and `other`, at least one of `Int` or `Float` must be set to zero. // Therefore, this verifies that there is at least one shared zero between the two. return n.Float == other.Float || n.Int == other.Int } // GreaterThan returns true if n > other. // Precondition: n.SameType(other) is true. Panics otherwise. // //go:nosplit func (n *Number) GreaterThan(other *Number) bool { if !n.SameType(other) { panic("tried to compare two numbers of different types") } if n.IsInteger() { return n.Int > other.Int } return n.Float > other.Float } // WriteInteger writes the given integer to a writer without allocating strings. // //go:nosplit func WriteInteger[T io.StringWriter](w T, val int64) (int, error) { const decimalDigits = "0123456789" if val == 0 { return w.WriteString(decimalDigits[0:1]) } var written int if val < 0 { n, err := w.WriteString("-") written += n if err != nil { return written, err } val = -val } decimal := int64(1) for ; val/decimal != 0; decimal *= 10 { } for decimal /= 10; decimal > 0; decimal /= 10 { digit := (val / decimal) % 10 n, err := w.WriteString(decimalDigits[digit : digit+1]) written += n if err != nil { return written, err } } return written, nil } // WriteHex writes the given integer as hex to a writer // without allocating strings. // //go:nosplit func WriteHex[T io.StringWriter](w T, val uint64) (int, error) { const hexDigits = "0123456789abcdef" if val == 0 { return w.WriteString(hexDigits[0:1]) } var written int hex := uint64(16) for ; val/hex != 0; hex <<= 4 { } for hex >>= 4; hex > 0; hex >>= 4 { digit := (val / hex) % 16 n, err := w.WriteString(hexDigits[digit : digit+1]) written += n if err != nil { return written, err } } return written, nil } // writeNumberTo writes the number to the given writer. // This only causes heap allocations when the number is a non-zero, non-special float. 
func writeNumberTo[T io.StringWriter](w T, n *Number) error { var s string switch { // Zero case: case n.Int == 0 && n.Float == 0: s = "0" // Integer case: case n.Int != 0: _, err := WriteInteger(w, n.Int) return err // Special float cases: case n.Float == math.Inf(-1): s = "-Inf" case n.Float == math.Inf(1): s = "+Inf" case math.IsNaN(n.Float): s = "NaN" // Regular float case: default: s = fmt.Sprintf("%f", n.Float) } _, err := w.WriteString(s) return err } // Bucket is a single histogram bucket. type Bucket struct { // UpperBound is the upper bound of the bucket. // The lower bound of the bucket is the largest UpperBound within other Histogram Buckets that // is smaller than this bucket's UpperBound. // The bucket with the smallest UpperBound within a Histogram implicitly has -Inf as lower bound. // This should be set to +Inf to mark the "last" bucket. UpperBound Number `json:"le"` // Samples is the number of samples in the bucket. // Note: When exported to Prometheus, they are exported cumulatively, i.e. the count of samples // exported in Bucket i is actually sum(histogram.Buckets[j].Samples for 0 <= j <= i). Samples uint64 `json:"n,omitempty"` } // Histogram contains data about histogram values. type Histogram struct { // Total is the sum of sample values across all buckets. Total Number `json:"total"` // Min is the minimum sample ever recorded in this histogram. Min Number `json:"min"` // Max is the maximum sample ever recorded in this histogram. Max Number `json:"max"` // SumOfSquaredDeviations is the number of squared deviations of all samples. SumOfSquaredDeviations Number `json:"ssd"` // Buckets contains per-bucket data. // A distribution with n finite-boundary buckets should have n+2 entries here. // The 0th entry is the underflow bucket (i.e. the one with -inf as lower bound), // and the last aka (n+1)th entry is the overflow bucket (i.e. the one with +inf as upper bound). Buckets []Bucket `json:"buckets,omitempty"` } // Data is an observation of the value of a single metric at a certain point in time. type Data struct { // Metric is the metric for which the value is being reported. Metric *Metric `json:"metric"` // Labels is a key-value pair representing the labels set on this metric. // This may be merged with other labels during export. Labels map[string]string `json:"labels,omitempty"` // ExternalLabels are more labels merged together with `Labels`. // They can be set using SetExternalLabels. // They are useful in the case where a single Data needs labels from two sources: // labels specific to this data point (which should be in `Labels`), and labels // that are shared between multiple data points (stored in `ExternalLabels`). // This avoids allocating unique `Labels` maps for each Data struct, when // most of the actual labels would be shared between them. ExternalLabels map[string]string `json:"external_labels,omitempty"` // At most one of the fields below may be set. // Which one depends on the type of the metric. // Number is used for all numerical types. Number *Number `json:"val,omitempty"` // Histogram is used for histogram-typed metrics. HistogramValue *Histogram `json:"histogram,omitempty"` } // NewIntData returns a new Data struct with the given metric and value. func NewIntData(metric *Metric, val int64) *Data { return LabeledIntData(metric, nil, val) } // LabeledIntData returns a new Data struct with the given metric, labels, and value. 
func LabeledIntData(metric *Metric, labels map[string]string, val int64) *Data { return &Data{Metric: metric, Labels: labels, Number: NewInt(val)} } // NewFloatData returns a new Data struct with the given metric and value. func NewFloatData(metric *Metric, val float64) *Data { return LabeledFloatData(metric, nil, val) } // LabeledFloatData returns a new Data struct with the given metric, labels, and value. func LabeledFloatData(metric *Metric, labels map[string]string, val float64) *Data { return &Data{Metric: metric, Labels: labels, Number: NewFloat(val)} } // SetExternalLabels sets d.ExternalLabels. See its docstring for more information. // Returns `d` for chainability. func (d *Data) SetExternalLabels(externalLabels map[string]string) *Data { d.ExternalLabels = externalLabels return d } // ExportOptions contains options that control how metric data is exported in Prometheus format. type ExportOptions struct { // CommentHeader is prepended as a comment before any metric data is exported. CommentHeader string // MetricsWritten memoizes written metric preambles (help/type comments) // by metric name. // If specified, this map can be used to avoid duplicate preambles across multiple snapshots. // Note that this map is modified in-place during the writing process. MetricsWritten map[string]bool } // SnapshotExportOptions contains options that control how metric data is exported for an // individual Snapshot. type SnapshotExportOptions struct { // ExporterPrefix is prepended to all metric names. ExporterPrefix string // ExtraLabels is added as labels for all metric values. ExtraLabels map[string]string } // writeEscapedString writes the given string in quotation marks and with some characters escaped, // per Prometheus spec. It does this without string allocations. // If `quoted` is true, quote characters will surround the string, and quote characters within `s` // will also be escaped. func writeEscapedString[T io.StringWriter](w T, s string, quoted bool) (int, error) { const ( quote = '"' backslash = '\\' newline = '\n' quoteStr = `"` escapedQuote = `\\"` escapedBackslash = "\\\\" escapedNewline = "\\\n" ) written := 0 var n int var err error if quoted { n, err = w.WriteString(quoteStr) written += n if err != nil { return written, err } } for _, r := range s { switch r { case quote: if quoted { n, err = w.WriteString(escapedQuote) } else { n, err = w.WriteString(quoteStr) } case backslash: n, err = w.WriteString(escapedBackslash) case newline: n, err = w.WriteString(escapedNewline) default: n, err = w.WriteString(string(r)) } written += n if err != nil { return written, err } } if quoted { n, err = w.WriteString(quoteStr) written += n if err != nil { return written, err } } return written, nil } // writeMetricPreambleTo writes the metric name to the writer. It may also // write unwritten help and type comments of the metric if they haven't been // written to the writer yet. func writeMetricPreambleTo[T io.StringWriter](w T, d *Data, options SnapshotExportOptions, metricsWritten map[string]bool) error { // Metric header, if we haven't printed it yet. if !metricsWritten[d.Metric.Name] { // Extra newline before each preamble for aesthetic reasons. if _, err := w.WriteString("\n"); err != nil { return err } if err := writeMetricHeaderTo(w, d.Metric, options); err != nil { return err } metricsWritten[d.Metric.Name] = true } // Metric name. 
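// The exporter prefix, when configured, is written immediately before the
// metric name with no separator, so the exported series name is
// options.ExporterPrefix followed by d.Metric.Name.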
if options.ExporterPrefix != "" { if _, err := w.WriteString(options.ExporterPrefix); err != nil { return err } } if _, err := w.WriteString(d.Metric.Name); err != nil { return err } return nil } // keyVal is a key-value pair used in the function below. type keyVal struct{ Key, Value string } // sortedIterateLabels iterates through labels and outputs them to `out` in sorted key order, // or stops when cancelCh is written to. It runs in O(n^2) time but makes no heap allocations. func sortedIterateLabels(labels map[string]string, out chan<- keyVal, cancelCh <-chan struct{}) { defer close(out) if len(labels) == 0 { return } // smallestKey is the smallest key that we've already sent to `out`. // It starts as the empty string, which means we haven't sent anything to `out` yet. smallestKey := "" // Find the smallest key of the whole set and send it out. for k := range labels { if smallestKey == "" || k < smallestKey { smallestKey = k } } select { case out <- keyVal{smallestKey, labels[smallestKey]}: case <-cancelCh: return } // Iterate until we've sent as many items as we have as input to the output channel. // We start at 1 because the loop above already sent out the smallest key to `out`. for numOutput := 1; numOutput < len(labels); numOutput++ { // nextSmallestKey is the smallest key that is strictly larger than `smallestKey`. nextSmallestKey := "" for k := range labels { if k > smallestKey && (nextSmallestKey == "" || k < nextSmallestKey) { nextSmallestKey = k } } // Update smallestKey and send it out. smallestKey = nextSmallestKey select { case out <- keyVal{smallestKey, labels[smallestKey]}: case <-cancelCh: return } } } // LabelOrError is used in OrderedLabels. // It represents either a key-value pair, or an error. type LabelOrError struct { Key, Value string Error error } // OrderedLabels streams the list of 'label_key="label_value"' in sorted order, except "le" which is // a reserved Prometheus label name and should go last. // If an error is encountered, it is returned as the Error field of LabelOrError, and no further // messages will be sent on the channel. func OrderedLabels(labels ...map[string]string) <-chan LabelOrError { // This function is quite hot on the metric-rendering path, and its naive "just put all the // strings in one map to ensure no dupes it, then in one slice and sort it" approach is very // allocation-heavy. This approach is more computation-heavy (it runs in // O(len(labels) * len(largest label map))), but the only heap allocations it does is for the // following tiny slices and channels. In practice, the number of label maps and the size of // each label map is tiny, so this is worth doing despite the theoretically-longer run time. // Initialize the channels we'll use. mapChannels := make([]chan keyVal, 0, len(labels)) lastKeyVal := make([]keyVal, len(labels)) resultCh := make(chan LabelOrError) var cancelCh chan struct{} // outputError is a helper function for when we have encountered an error mid-way. outputError := func(err error) { if cancelCh != nil { for range mapChannels { cancelCh <- struct{}{} } close(cancelCh) } resultCh <- LabelOrError{Error: err} close(resultCh) } // Verify that no label is the empty string. It's not a valid label name, // and we use the empty string later on in the function as a marker of having // finished processing all labels from a given label map. 
for _, labelMap := range labels { for label := range labelMap { if label == "" { go outputError(errors.New("got empty-string label")) return resultCh } } } // Each label map is processed in its own goroutine, // which will stream it back to this function in sorted order. cancelCh = make(chan struct{}, len(labels)) for _, labelMap := range labels { ch := make(chan keyVal) mapChannels = append(mapChannels, ch) go sortedIterateLabels(labelMap, ch, cancelCh) } // This goroutine is the meat of this function; it iterates through // the results being streamed from each `sortedIterateLabels` goroutine // that we spawned earlier, until all of them are exhausted or until we // hit an error. go func() { // The "le" label is special and goes last, not in sorted order. // gotLe is the empty string if there is no "le" label, // otherwise it's the value of the "le" label. var gotLe string // numChannelsLeft tracks the number of channels that are still live. for numChannelsLeft := len(mapChannels); numChannelsLeft > 0; { // Iterate over all channels and ensure we have the freshest (smallest) // label from each of them. for i, ch := range mapChannels { // A nil channel is one that has been closed. if ch == nil { continue } // If we already have the latest value from this channel, // keep it there instead of getting a new one, if lastKeyVal[i].Key != "" { continue } // Otherwise, get a new label. kv, open := <-ch if !open { // Channel has been closed, no more to read from this one. numChannelsLeft-- mapChannels[i] = nil continue } if kv.Key == "le" { if gotLe != "" { outputError(errors.New("got duplicate 'le' label")) return } gotLe = kv.Value continue } lastKeyVal[i] = kv } // We have one key-value pair from each still-active channel now. // Find the smallest one between them. smallestKey := "" indexForSmallest := -1 for i, kv := range lastKeyVal { if kv.Key == "" { continue } if smallestKey == "" || kv.Key < smallestKey { smallestKey = kv.Key indexForSmallest = i } else if kv.Key == smallestKey { outputError(fmt.Errorf("got duplicate label %q", smallestKey)) return } } if indexForSmallest == -1 { // There are no more key-value pairs to output. We're done. break } // Output the smallest key-value pairs out of all the channels. resultCh <- LabelOrError{ Key: smallestKey, Value: lastKeyVal[indexForSmallest].Value, } // Mark the last key-value pair from the channel that gave us the // smallest key-value pair as no longer present, so that we get a new // key-value pair from it in the next iteration. lastKeyVal[indexForSmallest] = keyVal{} } // Output the "le" label last. if gotLe != "" { resultCh <- LabelOrError{ Key: "le", Value: gotLe, } } close(resultCh) close(cancelCh) }() return resultCh } // writeLabelsTo writes a set of metric labels. 
func writeLabelsTo[T io.StringWriter](w T, d *Data, extraLabels map[string]string, leLabel *Number) error { if len(d.Labels)+len(d.ExternalLabels)+len(extraLabels) != 0 || leLabel != nil { if _, err := w.WriteString("{"); err != nil { return err } var orderedLabels <-chan LabelOrError if leLabel != nil { orderedLabels = OrderedLabels(d.Labels, d.ExternalLabels, extraLabels, map[string]string{"le": leLabel.String()}) } else { orderedLabels = OrderedLabels(d.Labels, d.ExternalLabels, extraLabels) } firstLabel := true var foundError error for labelOrError := range orderedLabels { if foundError != nil { continue } if labelOrError.Error != nil { foundError = labelOrError.Error continue } if !firstLabel { if _, err := w.WriteString(","); err != nil { return err } } firstLabel = false if _, err := w.WriteString(labelOrError.Key); err != nil { return err } if _, err := w.WriteString("="); err != nil { return err } if _, err := writeEscapedString(w, labelOrError.Value, true); err != nil { return err } } if foundError != nil { return foundError } if _, err := w.WriteString("}"); err != nil { return err } } return nil } // writeMetricLine writes a single Data line with a single number (val) to w. func writeMetricLine[T io.StringWriter](w T, d *Data, metricSuffix string, val *Number, when time.Time, options SnapshotExportOptions, leLabel *Number, metricsWritten map[string]bool) error { if err := writeMetricPreambleTo(w, d, options, metricsWritten); err != nil { return err } if metricSuffix != "" { if _, err := w.WriteString(metricSuffix); err != nil { return err } } if err := writeLabelsTo(w, d, options.ExtraLabels, leLabel); err != nil { return err } if _, err := w.WriteString(" "); err != nil { return err } if err := writeNumberTo(w, val); err != nil { return err } if _, err := w.WriteString(" "); err != nil { return err } if _, err := WriteInteger(w, when.UnixMilli()); err != nil { return err } if _, err := w.WriteString("\n"); err != nil { return err } return nil } // writeDataTo writes the Data to the given writer in Prometheus format. func writeDataTo[T io.StringWriter](w T, d *Data, when time.Time, options SnapshotExportOptions, metricsWritten map[string]bool) error { switch d.Metric.Type { case TypeUntyped, TypeGauge, TypeCounter: return writeMetricLine(w, d, "", d.Number, when, options, nil, metricsWritten) case TypeHistogram: // Write an empty line before and after histograms to easily distinguish them from // other metric lines. if _, err := w.WriteString("\n"); err != nil { return err } var numSamples uint64 var samples Number for _, bucket := range d.HistogramValue.Buckets { numSamples += bucket.Samples samples.Int = int64(numSamples) // Prometheus distribution bucket counts are cumulative. 
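// Each bucket is emitted as a "<name>_bucket" sample whose "le" label holds
// the bucket's upper bound, matching the Prometheus histogram exposition
// format.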
if err := writeMetricLine(w, d, "_bucket", &samples, when, options, &bucket.UpperBound, metricsWritten); err != nil { return err } } if err := writeMetricLine(w, d, "_sum", &d.HistogramValue.Total, when, options, nil, metricsWritten); err != nil { return err } samples.Int = int64(numSamples) if err := writeMetricLine(w, d, "_count", &samples, when, options, nil, metricsWritten); err != nil { return err } if err := writeMetricLine(w, d, "_min", &d.HistogramValue.Min, when, options, nil, metricsWritten); err != nil { return err } if err := writeMetricLine(w, d, "_max", &d.HistogramValue.Max, when, options, nil, metricsWritten); err != nil { return err } if err := writeMetricLine(w, d, "_ssd", &d.HistogramValue.SumOfSquaredDeviations, when, options, nil, metricsWritten); err != nil { return err } // Empty line after the histogram. if _, err := w.WriteString("\n"); err != nil { return err } return nil default: return fmt.Errorf("unknown metric type for metric %s: %v", d.Metric.Name, d.Metric.Type) } } // Snapshot is a snapshot of the values of all the metrics at a certain point in time. type Snapshot struct { // When is the timestamp at which the snapshot was taken. // Note that Prometheus ultimately encodes timestamps as millisecond-precision int64s from epoch. When time.Time `json:"when,omitempty"` // Data is the whole snapshot data. // Each Data must be a unique combination of (Metric, Labels) within a Snapshot. Data []*Data `json:"data,omitempty"` } // NewSnapshot returns a new Snapshot at the current time. func NewSnapshot() *Snapshot { return &Snapshot{When: timeNow()} } // Add data point(s) to the snapshot. // Returns itself for chainability. func (s *Snapshot) Add(data ...*Data) *Snapshot { s.Data = append(s.Data, data...) return s } const counterWriterBufSize = 32768 // countingWriter implements io.StringWriter, and counts the number of bytes // written to it. // Useful in this file to keep track of total number of bytes without having // to plumb this everywhere in the writeX() functions in this file. type countingWriter[T io.StringWriter] struct { buf *bytes.Buffer underlying T written int } // WriteString implements io.StringWriter.WriteString. // This avoids going into the slow, allocation-heavy path of io.WriteString. func (w *countingWriter[T]) WriteString(s string) (int, error) { written, err := w.buf.WriteString(s) w.written += written if w.buf.Len() >= counterWriterBufSize { w.Flush() } return written, err } func (w *countingWriter[T]) Flush() error { if w.buf.Len() > 0 { _, err := w.underlying.WriteString(w.buf.String()) w.buf.Reset() return err } return nil } // Written returns the number of bytes written to the underlying writer (minus buffered writes). func (w *countingWriter[T]) Written() int { return w.written - w.buf.Len() } // writeSnapshotSingleMetric writes a single metric data from a snapshot to // the given writer in Prometheus format. // It returns the number of bytes written. func writeSnapshotSingleMetric[T io.StringWriter](w T, s *Snapshot, options SnapshotExportOptions, metricName string, metricsWritten map[string]bool) error { if !strings.HasPrefix(metricName, options.ExporterPrefix) { return nil } wantMetricName := strings.TrimPrefix(metricName, options.ExporterPrefix) for _, d := range s.Data { if d.Metric.Name != wantMetricName { continue } if err := writeDataTo(w, d, s.When, options, metricsWritten); err != nil { return err } } return nil } // ReusableWriter is a writer that can be reused to efficiently write // successive snapshots. 
type ReusableWriter[T io.StringWriter] struct { // buf is the reusable buffer used for buffering writes. // It is reset after each write, but keeps the underlying byte buffer, // avoiding allocations on successive snapshot writes. buf bytes.Buffer } // Write writes one or more snapshots to the writer. // This method may not be used concurrently for the same `ReusableWriter`. func (rw *ReusableWriter[T]) Write(w T, options ExportOptions, snapshotsToOptions map[*Snapshot]SnapshotExportOptions) (int, error) { rw.buf.Reset() cw := &countingWriter[T]{ buf: &rw.buf, underlying: w, } return write(cw, options, snapshotsToOptions) } // Write writes one or more snapshots to the writer. // This ensures same-name metrics across different snapshots are printed together, per spec. // If the caller will call `Write` successively for multiple snapshots, it is more efficient // to use the `ReusableWriter` type instead of this function. func Write[T io.StringWriter](w T, options ExportOptions, snapshotsToOptions map[*Snapshot]SnapshotExportOptions) (int, error) { var b bytes.Buffer // Sane default buffer size. b.Grow(counterWriterBufSize) cw := &countingWriter[T]{ buf: &b, underlying: w, } return write(cw, options, snapshotsToOptions) } func write[T io.StringWriter](cw *countingWriter[T], options ExportOptions, snapshotsToOptions map[*Snapshot]SnapshotExportOptions) (int, error) { if len(snapshotsToOptions) == 0 { return 0, nil } if options.CommentHeader != "" { for _, commentLine := range strings.Split(options.CommentHeader, "\n") { if _, err := cw.WriteString("# "); err != nil { return cw.Written(), err } if _, err := cw.WriteString(commentLine); err != nil { return cw.Written(), err } if _, err := cw.WriteString("\n"); err != nil { return cw.Written(), err } } } snapshots := make([]*Snapshot, 0, len(snapshotsToOptions)) for snapshot := range snapshotsToOptions { snapshots = append(snapshots, snapshot) } switch len(snapshots) { case 1: // Single-snapshot case. if _, err := cw.WriteString(fmt.Sprintf("# Writing data from snapshot containing %d data points taken at %v.\n", len(snapshots[0].Data), snapshots[0].When)); err != nil { return cw.Written(), err } default: // Multi-snapshot case. // Provide a consistent ordering of snapshots. 
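// Note (added for clarity): the sort below orders snapshots by their pointer
// addresses. That order is arbitrary but stable for the lifetime of the snapshots,
// which is enough to keep a single export internally consistent.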
sort.Slice(snapshots, func(i, j int) bool { return reflect.ValueOf(snapshots[i]).Pointer() < reflect.ValueOf(snapshots[j]).Pointer() }) if _, err := cw.WriteString(fmt.Sprintf("# Writing data from %d snapshots:\n", len(snapshots))); err != nil { return cw.Written(), err } for _, snapshot := range snapshots { if _, err := cw.WriteString(fmt.Sprintf("# - Snapshot with %d data points taken at %v: %v\n", len(snapshot.Data), snapshot.When, snapshotsToOptions[snapshot].ExtraLabels)); err != nil { return cw.Written(), err } } } if _, err := cw.WriteString("\n"); err != nil { return cw.Written(), err } if options.MetricsWritten == nil { options.MetricsWritten = make(map[string]bool) } metricNamesMap := make(map[string]bool, len(options.MetricsWritten)) metricNames := make([]string, 0, len(options.MetricsWritten)) for _, snapshot := range snapshots { for _, data := range snapshot.Data { metricName := snapshotsToOptions[snapshot].ExporterPrefix + data.Metric.Name if !metricNamesMap[metricName] { metricNamesMap[metricName] = true metricNames = append(metricNames, metricName) } } } sort.Strings(metricNames) for _, metricName := range metricNames { for _, snapshot := range snapshots { writeSnapshotSingleMetric(cw, snapshot, snapshotsToOptions[snapshot], metricName, options.MetricsWritten) } } if _, err := cw.WriteString("\n# End of metric data.\n"); err != nil { return cw.Written(), err } if err := cw.Flush(); err != nil { return cw.Written(), err } return cw.Written(), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/prometheus/prometheus_verify.go000066400000000000000000001017171465435605700257340ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package prometheus import ( "errors" "fmt" "math" "strings" "sync" "time" "unicode" pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" ) const ( // maxExportStaleness is the maximum allowed age of a snapshot when it is verified. // Used to avoid exporting snapshots from bogus times from ages past. maxExportStaleness = 10 * time.Second // MetaMetricPrefix is a prefix used for metrics defined by the metric server, // as opposed to metrics generated by each sandbox. // For this reason, this prefix is not allowed to be used in sandbox metrics. MetaMetricPrefix = "meta_" ) // Prometheus process-level metric names and definitions. // These are not necessarily exported, but we enforce that sandboxes may not // export metrics sharing the same names. 
// https://prometheus.io/docs/instrumenting/writing_clientlibs/#process-metrics var ( ProcessCPUSecondsTotal = Metric{ Name: "process_cpu_seconds_total", Type: TypeGauge, Help: "Total user and system CPU time spent in seconds.", } ProcessOpenFDs = Metric{ Name: "process_open_fds", Type: TypeGauge, Help: "Number of open file descriptors.", } ProcessMaxFDs = Metric{ Name: "process_max_fds", Type: TypeGauge, Help: "Maximum number of open file descriptors.", } ProcessVirtualMemoryBytes = Metric{ Name: "process_virtual_memory_bytes", Type: TypeGauge, Help: "Virtual memory size in bytes.", } ProcessVirtualMemoryMaxBytes = Metric{ Name: "process_virtual_memory_max_bytes", Type: TypeGauge, Help: "Maximum amount of virtual memory available in bytes.", } ProcessResidentMemoryBytes = Metric{ Name: "process_resident_memory_bytes", Type: TypeGauge, Help: "Resident memory size in bytes.", } ProcessHeapBytes = Metric{ Name: "process_heap_bytes", Type: TypeGauge, Help: "Process heap size in bytes.", } ProcessStartTimeSeconds = Metric{ Name: "process_start_time_seconds", Type: TypeGauge, Help: "Start time of the process since unix epoch in seconds.", } ProcessThreads = Metric{ Name: "process_threads", Type: TypeGauge, Help: "Number of OS threads in the process.", } ) // processMetrics is the set of process-level metrics. var processMetrics = [9]*Metric{ &ProcessCPUSecondsTotal, &ProcessOpenFDs, &ProcessMaxFDs, &ProcessVirtualMemoryBytes, &ProcessVirtualMemoryMaxBytes, &ProcessResidentMemoryBytes, &ProcessHeapBytes, &ProcessStartTimeSeconds, &ProcessThreads, } // internedStringMap allows for interning strings. type internedStringMap map[string]*string // Intern returns the interned version of the given string. // If it is not already interned in the map, this function interns it. func (m internedStringMap) Intern(s string) string { if existing, found := m[s]; found { return *existing } m[s] = &s return s } // globalInternMap is a string intern map used for globally-relevant data that repeats across // verifiers, such as metric names and field names, but not field values or combinations of field // values. var ( globalInternMu sync.Mutex verifierCount uint64 globalInternMap = make(internedStringMap) ) // globalIntern returns the interned version of the given string. // If it is not already interned in the map, this function interns it. func globalIntern(s string) string { globalInternMu.Lock() defer globalInternMu.Unlock() return globalInternMap.Intern(s) } func globalInternVerifierCreated() { globalInternMu.Lock() defer globalInternMu.Unlock() verifierCount++ } func globalInternVerifierReleased() { globalInternMu.Lock() defer globalInternMu.Unlock() verifierCount-- if verifierCount <= 0 { verifierCount = 0 // No more verifiers active, so release the global map to not keep consuming needless resources. globalInternMap = make(internedStringMap) } } // numberPacker holds packedNumber data. It is useful to store large amounts of Number structs in a // small memory footprint. type numberPacker struct { // `data` *must* be pre-allocated if there is any number to be stored in it. // Attempts to pack a number that cannot fit into the existing space // allocated for this slice will cause a panic. // Callers may use `needsIndirection` to determine whether a number needs // space in this slice or not ahead of packing it. data []uint64 } // packedNumber is a non-serializable but smaller-memory-footprint container for a numerical value. // It can be unpacked out to a Number struct. 
// This contains 4 bytes where we try to pack as much as possible. // For the overwhelmingly-common case of non-negative integers that fit in 30 bits (i.e. the first // 2 bits of their 32-bit representation are zero), we store them directly here. Otherwise, we store the offset of a 64-bit number // within numberPacker. // Layout, going from highest to lowest bit: // Bit 0 is the type: 0 for integer, 1 for float. // Bit 1 is 0 if the number's value is stored within the next 30 bits, or 1 if the next 30 bits // refer to an offset within numberPacker instead. // In the case of a float, the next two bits (bits 2 and 3) may be used to encode a special value: // - 00 means not a special value // - 01 means NaN // - 10 means -infinity // - 11 means +infinity // // When not using a special value, the float32 exponent must fit in 5 bits, and is encoded using a // bias of 15, meaning it ranges from -15 (encoded as 0b00000) to 16 (encoded as 0b11111), and an // exponent of 0 is encoded as 0b01111. // Floats that do not fit within this range must be encoded indirectly as float64s, similar to // integers that don't fit in 30 bits. type packedNumber uint32 // Useful masks and other bit-twiddling stuff for packedNumber. const ( typeField = uint32(1 << 31) typeFieldInteger = uint32(0) typeFieldFloat = uint32(typeField) storageField = uint32(1 << 30) storageFieldDirect = uint32(0) storageFieldIndirect = uint32(storageField) valueField = uint32(1<<30 - 1) maxDirectUint = uint64(valueField) float32ExponentField = uint32(0x7f800000) float32ExponentShift = uint32(23) float32ExponentBias = uint32(127) float32FractionField = uint32(0x7fffff) packedFloatExponentField = uint32(0x0f800000) packedFloatExponentBias = uint32(15) packedFloatNaN = packedNumber(typeFieldFloat | storageFieldDirect | 0x10000000) packedFloatNegInf = packedNumber(typeFieldFloat | storageFieldDirect | 0x20000000) packedFloatInf = packedNumber(typeFieldFloat | storageFieldDirect | 0x30000000) ) // needsPackerStorage returns 0 for numbers that can be // stored directly into the 32 bits of a packedNumber, or 1 for numbers that // need more bits and would need to be stored into a numberPacker's `data` // field. // //go:nosplit func needsPackerStorage(n *Number) uint64 { if n.Float == 0.0 { v := n.Int if v >= 0 && v <= int64(valueField) { return 0 } return 1 } // n is a float. v := n.Float if math.IsNaN(v) || v == math.Inf(-1) || v == math.Inf(1) { return 0 } if v >= 0.0 && float64(float32(v)) == v { float32Bits := math.Float32bits(float32(v)) exponent := (float32Bits&float32ExponentField)>>float32ExponentShift - float32ExponentBias packedExponent := (exponent + packedFloatExponentBias) << float32ExponentShift if packedExponent&packedFloatExponentField == packedExponent { return 0 } } return 1 } // isIndirect returns 1 iff this packedNumber needs storage in a numberPacker. // //go:nosplit func (n packedNumber) isIndirect() uint64 { if uint32(n)&storageField == storageFieldIndirect { return 1 } return 0 } // errOutOfPackerMemory is emitted when the number cannot be packed into a numberPacker. var errOutOfPackerMemory = errors.New("out of numberPacker memory") // pack packs a Number into a packedNumber. // //go:nosplit func (p *numberPacker) pack(n *Number) packedNumber { if n.Float == 0.0 { v := n.Int if v >= 0 && v <= int64(maxDirectUint) { // We can store the integer value directly.
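// Worked example (added for illustration): packing the integer 42 yields
// packedNumber(42): the type bit (integer) and storage bit (direct) are both zero
// and the low 30 bits hold the value itself, so no numberPacker slot is consumed.
// Packing 1<<40 instead falls through to the indirect path below and stores the
// full value in p.data.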
return packedNumber(typeFieldInteger | storageFieldDirect | uint32(v)) } if len(p.data) == cap(p.data) { panic(errOutOfPackerMemory) } p.data = append(p.data, uint64(v)) return packedNumber(typeFieldInteger | storageFieldIndirect | uint32(len(p.data)-1)) } // n is a float. v := n.Float if math.IsNaN(v) { return packedFloatNaN } if v == math.Inf(-1) { return packedFloatNegInf } if v == math.Inf(1) { return packedFloatInf } if v >= 0.0 && float64(float32(v)) == v { float32Bits := math.Float32bits(float32(v)) exponent := (float32Bits&float32ExponentField)>>float32ExponentShift - float32ExponentBias packedExponent := (exponent + packedFloatExponentBias) << float32ExponentShift if packedExponent&packedFloatExponentField == packedExponent { float32Fraction := float32Bits & float32FractionField return packedNumber(typeFieldFloat | storageFieldDirect | packedExponent | float32Fraction) } } if len(p.data) == cap(p.data) { panic(errOutOfPackerMemory) } p.data = append(p.data, math.Float64bits(v)) return packedNumber(typeFieldFloat | storageFieldIndirect | uint32(len(p.data)-1)) } // packInt packs an integer. // //go:nosplit func (p *numberPacker) packInt(val int64) packedNumber { n := Number{Int: val} return p.pack(&n) } // packFloat packs a floating-point number. // //go:nosplit func (p *numberPacker) packFloat(val float64) packedNumber { n := Number{Float: val} return p.pack(&n) } // unpack unpacks a packedNumber back into a Number. func (p *numberPacker) unpack(n packedNumber) *Number { switch uint32(n) & typeField { case typeFieldInteger: switch uint32(n) & storageField { case storageFieldDirect: return NewInt(int64(uint32(n) & valueField)) case storageFieldIndirect: return NewInt(int64(p.data[uint32(n)&valueField])) } case typeFieldFloat: switch uint32(n) & storageField { case storageFieldDirect: switch n { case packedFloatNaN: return NewFloat(math.NaN()) case packedFloatNegInf: return NewFloat(math.Inf(-1)) case packedFloatInf: return NewFloat(math.Inf(1)) default: exponent := ((uint32(n) & packedFloatExponentField) >> float32ExponentShift) - packedFloatExponentBias float32Bits := ((exponent + float32ExponentBias) << float32ExponentShift) | (uint32(n) & float32FractionField) return NewFloat(float64(math.Float32frombits(float32Bits))) } case storageFieldIndirect: return NewFloat(math.Float64frombits(p.data[uint32(n)&valueField])) } } panic("unreachable") } // mustUnpackInt unpacks an integer. // It panics if the packedNumber is not an integer. func (p *numberPacker) mustUnpackInt(n packedNumber) int64 { num := p.unpack(n) if !num.IsInteger() { panic("not an integer") } return num.Int } // mustUnpackFloat unpacks a floating-point number. // It panics if the packedNumber is not an floating-point number. func (p *numberPacker) mustUnpackFloat(n packedNumber) float64 { num := p.unpack(n) if *num == zero { return 0.0 } if num.IsInteger() { panic("not a float") } return num.Float } // portTo ports over a packedNumber from this numberPacker to a new one. // It is equivalent to `p.pack(other.unpack(n))` but avoids // allocations in the overwhelmingly-common case where the number is direct. func (p *numberPacker) portTo(other *numberPacker, n packedNumber) packedNumber { if uint32(n)&storageField == storageFieldDirect { // `n` is self-contained, just return as-is. 
return n } if len(other.data) == cap(other.data) { panic(errOutOfPackerMemory) } other.data = append(other.data, p.data[uint32(n)&valueField]) return packedNumber(uint32(n)&(typeField|storageField) | uint32(len(other.data)-1)) } // distributionSnapshot contains the data for a single field combination of a // distribution ("histogram") metric. type distributionSnapshot struct { // sum is the sum of all samples across all buckets. sum packedNumber // count is the number of samples across all buckets. count packedNumber // min is the lowest-recorded sample in the distribution. // It is only meaningful when count >= 1. min packedNumber // max is the highest-recorded sample in the distribution. // It is only meaningful when count >= 1. max packedNumber // ssd is the sum-of-squared-deviations computation of the distribution. // If non-zero, it is always a floating-point number. // It is only meaningful when count >= 2. ssd packedNumber // numSamples is the number of samples in each bucket. numSamples []packedNumber } // verifiableMetric verifies a single metric within a Verifier. type verifiableMetric struct { metadata *pb.MetricMetadata wantMetric Metric numFields uint32 verifier *Verifier allowedFieldValues map[string]map[string]struct{} wantBucketUpperBounds []Number // The following fields are used to verify that values are actually increasing monotonically. // They are only read and modified when the parent Verifier.mu is held. // They are mapped by their combination of field values. // lastCounterValue is used for counter metrics. lastCounterValue map[string]packedNumber // lastDistributionSnapshot is used for distribution ("histogram") metrics. lastDistributionSnapshot map[string]*distributionSnapshot } // newVerifiableMetric creates a new verifiableMetric that can verify the // values of a metric with the given metadata. 
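// Illustrative examples (added; not part of the original source) of how the name
// checks in this function behave, for a few hypothetical Prometheus names:
//
//	"fs_opens"         -> accepted (lowercase letters, digits and '_' only)
//	"Fs_opens"         -> rejected (must start with a lowercase letter)
//	"meta_fs_opens"    -> rejected (reserved "meta_" prefix)
//	"process_open_fds" -> rejected (collides with a reserved process-level metric)
//	"fs.opens"         -> rejected ('.' is not an allowed character)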
func newVerifiableMetric(metadata *pb.MetricMetadata, verifier *Verifier) (*verifiableMetric, error) { promName := metadata.GetPrometheusName() if metadata.GetName() == "" || promName == "" { return nil, errors.New("metric has no name") } for _, processMetric := range processMetrics { if promName == processMetric.Name { return nil, fmt.Errorf("metric name %q is reserved by Prometheus for process-level metrics", promName) } } if strings.HasPrefix(promName, MetaMetricPrefix) { return nil, fmt.Errorf("metric name %q starts with %q which is a reserved prefix", promName, "meta_") } if !unicode.IsLower(rune(promName[0])) { return nil, fmt.Errorf("invalid initial character in prometheus metric name: %q", promName) } for _, r := range promName { if !unicode.IsLower(r) && !unicode.IsDigit(r) && r != '_' { return nil, fmt.Errorf("invalid character %c in prometheus metric name %q", r, promName) } } numFields := uint32(len(metadata.GetFields())) var allowedFieldValues map[string]map[string]struct{} if numFields > 0 { seenFields := make(map[string]struct{}, numFields) allowedFieldValues = make(map[string]map[string]struct{}, numFields) for _, field := range metadata.GetFields() { fieldName := field.GetFieldName() if _, alreadyExists := seenFields[fieldName]; alreadyExists { return nil, fmt.Errorf("field %s is defined twice", fieldName) } seenFields[fieldName] = struct{}{} if len(field.GetAllowedValues()) == 0 { return nil, fmt.Errorf("field %s has no allowed values", fieldName) } fieldValues := make(map[string]struct{}, len(field.GetAllowedValues())) for _, value := range field.GetAllowedValues() { if _, alreadyExists := fieldValues[value]; alreadyExists { return nil, fmt.Errorf("field %s has duplicate allowed value %q", fieldName, value) } fieldValues[globalIntern(value)] = struct{}{} } allowedFieldValues[globalIntern(fieldName)] = fieldValues } } v := &verifiableMetric{ metadata: metadata, verifier: verifier, wantMetric: Metric{ Name: globalIntern(promName), Help: globalIntern(metadata.GetDescription()), }, numFields: numFields, allowedFieldValues: allowedFieldValues, } numFieldCombinations := len(allowedFieldValues) switch metadata.GetType() { case pb.MetricMetadata_TYPE_UINT64: v.wantMetric.Type = TypeGauge if metadata.GetCumulative() { v.wantMetric.Type = TypeCounter v.lastCounterValue = make(map[string]packedNumber, numFieldCombinations) } case pb.MetricMetadata_TYPE_DISTRIBUTION: v.wantMetric.Type = TypeHistogram numBuckets := len(metadata.GetDistributionBucketLowerBounds()) + 1 if numBuckets <= 1 || numBuckets > 256 { return nil, fmt.Errorf("unsupported number of buckets: %d", numBuckets) } v.wantBucketUpperBounds = make([]Number, numBuckets) for i, boundary := range metadata.GetDistributionBucketLowerBounds() { v.wantBucketUpperBounds[i] = Number{Int: boundary} } v.wantBucketUpperBounds[numBuckets-1] = Number{Float: math.Inf(1)} v.lastDistributionSnapshot = make(map[string]*distributionSnapshot, numFieldCombinations) default: return nil, fmt.Errorf("invalid type: %v", metadata.GetType()) } return v, nil } func (v *verifiableMetric) numFieldCombinations() int { return len(v.allowedFieldValues) } // verify does read-only checks on `data`. // `metricFieldsSeen` is passed across calls to `verify`. It is used to track the set of metric // field values that have already been seen. `verify` should populate this. // `dataToFieldsSeen` is passed across calls to `verify` and other methods of `verifiableMetric`. // It is used to store the canonical representation of the field values seen for each *Data. 
// // Precondition: `Verifier.mu` is held. func (v *verifiableMetric) verify(data *Data, metricFieldsSeen map[string]struct{}, dataToFieldsSeen map[*Data]string) error { if *data.Metric != v.wantMetric { return fmt.Errorf("invalid metric definition: got %+v want %+v", data.Metric, v.wantMetric) } // Verify fields. if uint32(len(data.Labels)) != v.numFields { return fmt.Errorf("invalid number of fields: got %d want %d", len(data.Labels), v.numFields) } var fieldValues strings.Builder firstField := true for _, field := range v.metadata.GetFields() { fieldName := field.GetFieldName() value, found := data.Labels[fieldName] if !found { return fmt.Errorf("did not specify field %q", fieldName) } if _, allowed := v.allowedFieldValues[fieldName][value]; !allowed { return fmt.Errorf("value %q is not allowed for field %s", value, fieldName) } if !firstField { fieldValues.WriteRune(',') } fieldValues.WriteString(value) firstField = false } fieldValuesStr := fieldValues.String() if _, alreadySeen := metricFieldsSeen[fieldValuesStr]; alreadySeen { return fmt.Errorf("combination of field values %q was already seen", fieldValuesStr) } // Verify value. gotNumber := data.Number != nil gotHistogram := data.HistogramValue != nil numSpecified := 0 if gotNumber { numSpecified++ } if gotHistogram { numSpecified++ } if numSpecified != 1 { return fmt.Errorf("invalid number of value fields specified: %d", numSpecified) } switch v.metadata.GetType() { case pb.MetricMetadata_TYPE_UINT64: if !gotNumber { return errors.New("expected number value for gauge or counter") } if !data.Number.IsInteger() { return fmt.Errorf("integer metric got non-integer value: %v", data.Number) } case pb.MetricMetadata_TYPE_DISTRIBUTION: if !gotHistogram { return errors.New("expected histogram value for histogram") } if len(data.HistogramValue.Buckets) != len(v.wantBucketUpperBounds) { return fmt.Errorf("invalid number of buckets: got %d want %d", len(data.HistogramValue.Buckets), len(v.wantBucketUpperBounds)) } if data.HistogramValue.SumOfSquaredDeviations.IsInteger() && data.HistogramValue.SumOfSquaredDeviations.Int != 0 { return fmt.Errorf("sum of squared deviations must be a floating-point value, got %v", data.HistogramValue.SumOfSquaredDeviations) } for i, b := range data.HistogramValue.Buckets { if want := v.wantBucketUpperBounds[i]; b.UpperBound != want { return fmt.Errorf("invalid upper bound for bucket %d (0-based): got %v want %v", i, b.UpperBound, want) } } default: return fmt.Errorf("invalid metric type: %v", v.wantMetric.Type) } // All passed. Update the maps that are shared across calls. fieldValuesStr = v.verifier.internMap.Intern(fieldValuesStr) dataToFieldsSeen[data] = fieldValuesStr metricFieldsSeen[fieldValuesStr] = struct{}{} return nil } // verifyIncrement verifies that incremental metrics are monotonically increasing. // // Preconditions: `verify` has succeeded on the given `data`, and `Verifier.mu` is held. 
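// Illustrative examples (added; not part of the original source): given two
// consecutive snapshots containing the same field-value combination, the checks
// below reject, for instance:
//
//	a counter going from 10 to 7             (counters may not decrease)
//	a histogram minimum going from 3 to 5    (the running minimum may not increase)
//	a bucket sample count going from 12 to 9 (per-bucket counts may not decrease)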
func (v *verifiableMetric) verifyIncrement(data *Data, fieldValues string, packer *numberPacker) error { switch v.wantMetric.Type { case TypeCounter: last := packer.unpack(v.lastCounterValue[v.verifier.internMap.Intern(fieldValues)]) if !last.SameType(data.Number) { return fmt.Errorf("counter number type changed: %v vs %v", last, data.Number) } if last.GreaterThan(data.Number) { return fmt.Errorf("counter value decreased from %v to %v", last, data.Number) } case TypeHistogram: lastDistributionSnapshot := v.lastDistributionSnapshot[v.verifier.internMap.Intern(fieldValues)] if lastDistributionSnapshot == nil { lastDistributionSnapshot = &distributionSnapshot{ numSamples: make([]packedNumber, len(v.wantBucketUpperBounds)), } v.lastDistributionSnapshot[v.verifier.internMap.Intern(fieldValues)] = lastDistributionSnapshot } lastCount := packer.mustUnpackInt(lastDistributionSnapshot.count) if lastCount >= 1 { lastMin := packer.unpack(lastDistributionSnapshot.min) if !lastMin.SameType(&data.HistogramValue.Min) { return fmt.Errorf("minimum value type changed: %v vs %v", lastMin, data.HistogramValue.Min) } if data.HistogramValue.Min.GreaterThan(lastMin) { return fmt.Errorf("minimum value strictly increased: from %v to %v", lastMin, data.HistogramValue.Min) } lastMax := packer.unpack(lastDistributionSnapshot.max) if !lastMax.SameType(&data.HistogramValue.Max) { return fmt.Errorf("maximum value type changed: %v vs %v", lastMax, data.HistogramValue.Max) } if lastMax.GreaterThan(&data.HistogramValue.Max) { return fmt.Errorf("maximum value strictly decreased: from %v to %v", lastMax, data.HistogramValue.Max) } } if lastCount >= 2 { // We already verified that the new data is a floating-point number // earlier, no need to double-check here. lastSSD := packer.mustUnpackFloat(lastDistributionSnapshot.ssd) if data.HistogramValue.SumOfSquaredDeviations.Float < lastSSD { return fmt.Errorf("sum of squared deviations decreased from %v to %v", lastSSD, data.HistogramValue.SumOfSquaredDeviations.Float) } } numSamples := lastDistributionSnapshot.numSamples for i, b := range data.HistogramValue.Buckets { if uint64(packer.mustUnpackInt(numSamples[i])) > b.Samples { return fmt.Errorf("number of samples in bucket %d (0-based) decreased from %d to %d", i, packer.mustUnpackInt(numSamples[i]), b.Samples) } } } return nil } // packerCapacityNeeded returns the `numberPacker` capacity to store `Data`. func (v *verifiableMetric) packerCapacityNeededForData(data *Data, fieldValues string) uint64 { switch v.wantMetric.Type { case TypeCounter: return needsPackerStorage(data.Number) case TypeHistogram: var toPack uint64 var totalSamples uint64 var buf Number for _, b := range data.HistogramValue.Buckets { buf = Number{Int: int64(b.Samples)} toPack += needsPackerStorage(&buf) totalSamples += b.Samples } toPack += needsPackerStorage(&data.HistogramValue.Total) toPack += needsPackerStorage(&data.HistogramValue.Min) toPack += needsPackerStorage(&data.HistogramValue.Max) toPack += needsPackerStorage(&data.HistogramValue.SumOfSquaredDeviations) buf = Number{Int: int64(totalSamples)} toPack += needsPackerStorage(&buf) return toPack default: return 0 } } // packerCapacityNeededForLast returns the `numberPacker` capacity needed to // store the last snapshot's data that was not seen in the current snapshot // (aka not in metricFieldsSeen). 
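// Illustration of how capacity is counted (added; not part of the original source):
// only numbers that cannot be encoded directly in the 32 bits of a packedNumber
// consume a slot in the numberPacker. For example:
//
//	5       -> 0 slots (small non-negative integer, stored inline)
//	1 << 40 -> 1 slot  (integer too large for 30 bits)
//	2.5     -> 0 slots (representable as a float32 with a small exponent)
//	1e300   -> 1 slot  (must be kept as full float64 bits)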
func (v *verifiableMetric) packerCapacityNeededForLast(metricFieldsSeen map[string]struct{}) uint64 { var capacity uint64 switch v.wantMetric.Type { case TypeCounter: for fieldValues, lastCounterValue := range v.lastCounterValue { if _, found := metricFieldsSeen[fieldValues]; found { continue } capacity += lastCounterValue.isIndirect() } case TypeHistogram: for fieldValues, distributionSnapshot := range v.lastDistributionSnapshot { if _, found := metricFieldsSeen[fieldValues]; found { continue } for _, b := range distributionSnapshot.numSamples { capacity += b.isIndirect() } capacity += distributionSnapshot.sum.isIndirect() capacity += distributionSnapshot.count.isIndirect() capacity += distributionSnapshot.min.isIndirect() capacity += distributionSnapshot.max.isIndirect() capacity += distributionSnapshot.ssd.isIndirect() } } return capacity } // update updates incremental metrics' "last seen" data. // // Preconditions: `verifyIncrement` has succeeded on the given `data`, `Verifier.mu` is held, // and `packer` is guaranteed to have enough room to store all numbers. func (v *verifiableMetric) update(data *Data, fieldValues string, packer *numberPacker) { switch v.wantMetric.Type { case TypeCounter: v.lastCounterValue[v.verifier.internMap.Intern(fieldValues)] = packer.pack(data.Number) case TypeHistogram: lastDistributionSnapshot := v.lastDistributionSnapshot[v.verifier.internMap.Intern(fieldValues)] lastBucketSamples := lastDistributionSnapshot.numSamples var count uint64 for i, b := range data.HistogramValue.Buckets { lastBucketSamples[i] = packer.packInt(int64(b.Samples)) count += b.Samples } lastDistributionSnapshot.sum = packer.pack(&data.HistogramValue.Total) lastDistributionSnapshot.count = packer.packInt(int64(count)) lastDistributionSnapshot.min = packer.pack(&data.HistogramValue.Min) lastDistributionSnapshot.max = packer.pack(&data.HistogramValue.Max) lastDistributionSnapshot.ssd = packer.pack(&data.HistogramValue.SumOfSquaredDeviations) } } // repackUnseen packs all numbers that must be carried over from snapshot to snapshot and which were // not seen in the latest snapshot's data. // This function should carry over all numbers typically packed in `v.update` but for all metric // field combinations that are not in `metricFieldsSeen`. // // Preconditions: `verifyIncrement` has succeeded on the given `data`, // and `newPacker` is guaranteed to have enough room to store all numbers. 
func (v *verifiableMetric) repackUnseen(metricFieldsSeen map[string]struct{}, oldPacker, newPacker *numberPacker) { switch v.wantMetric.Type { case TypeCounter: for fieldValues, lastCounterValue := range v.lastCounterValue { if _, found := metricFieldsSeen[fieldValues]; found { continue } v.lastCounterValue[fieldValues] = oldPacker.portTo(newPacker, lastCounterValue) } case TypeHistogram: for fieldValues, lastDistributionSnapshot := range v.lastDistributionSnapshot { if _, found := metricFieldsSeen[fieldValues]; found { continue } lastBucketSamples := lastDistributionSnapshot.numSamples for i, b := range lastBucketSamples { lastBucketSamples[i] = oldPacker.portTo(newPacker, b) } lastDistributionSnapshot.sum = oldPacker.portTo(newPacker, lastDistributionSnapshot.sum) lastDistributionSnapshot.count = oldPacker.portTo(newPacker, lastDistributionSnapshot.count) lastDistributionSnapshot.min = oldPacker.portTo(newPacker, lastDistributionSnapshot.min) lastDistributionSnapshot.max = oldPacker.portTo(newPacker, lastDistributionSnapshot.max) lastDistributionSnapshot.ssd = oldPacker.portTo(newPacker, lastDistributionSnapshot.ssd) } } } // Verifier allows verifying metric snapshot against metric registration data. // The aim is to prevent a compromised Sentry from emitting bogus data or DoS'ing metric ingestion. // A single Verifier should be used per sandbox. It is expected to be reused across exports such // that it can enforce the export snapshot timestamp is strictly monotonically increasing. type Verifier struct { knownMetrics map[string]*verifiableMetric // mu protects the fields below. mu sync.Mutex // internMap is used to intern strings relevant to this verifier only. // Globally-relevant strings should be interned in globalInternMap. internMap internedStringMap // lastPacker is a reference to the numberPacker used to pack numbers in the last successful // verification round. lastPacker *numberPacker // lastTimestamp is the snapshot timestamp of the last successfully-verified snapshot. lastTimestamp time.Time } // NewVerifier returns a new metric verifier that can verify the integrity of snapshots against // the given metric registration data. // It returns a cleanup function that must be called when the Verifier is no longer needed. func NewVerifier(registration *pb.MetricRegistration) (*Verifier, func(), error) { globalInternVerifierCreated() verifier := &Verifier{ knownMetrics: make(map[string]*verifiableMetric), internMap: make(internedStringMap), } for _, metric := range registration.GetMetrics() { metricName := metric.GetPrometheusName() if _, alreadyExists := verifier.knownMetrics[metricName]; alreadyExists { globalInternVerifierReleased() return nil, func() {}, fmt.Errorf("metric %q registered twice", metricName) } verifiableM, err := newVerifiableMetric(metric, verifier) if err != nil { globalInternVerifierReleased() return nil, func() {}, fmt.Errorf("metric %q: %v", metricName, err) } verifier.knownMetrics[globalIntern(metricName)] = verifiableM } return verifier, globalInternVerifierReleased, nil } // Verify verifies the integrity of a snapshot against the metric registration data of the Verifier. // It assumes that it will be called on snapshots obtained chronologically over time. func (v *Verifier) Verify(snapshot *Snapshot) error { var err error // Basic timestamp checks. 
now := timeNow() if snapshot.When.After(now) { return errors.New("snapshot is from the future") } if snapshot.When.Before(now.Add(-maxExportStaleness)) { return fmt.Errorf("snapshot is too old; it is from %v, expected at least %v (%v from now)", snapshot.When, now.Add(-maxExportStaleness), maxExportStaleness) } // Start critical section. v.mu.Lock() defer v.mu.Unlock() // Metrics checks. fieldsSeen := make(map[string]map[string]struct{}, len(v.knownMetrics)) dataToFieldsSeen := make(map[*Data]string, len(snapshot.Data)) for _, data := range snapshot.Data { metricName := data.Metric.Name verifiableM, found := v.knownMetrics[metricName] if !found { return fmt.Errorf("snapshot contains unknown metric %q", metricName) } metricName = globalIntern(metricName) metricFieldsSeen, found := fieldsSeen[metricName] if !found { metricFieldsSeen = make(map[string]struct{}, verifiableM.numFieldCombinations()) fieldsSeen[metricName] = metricFieldsSeen } if err = verifiableM.verify(data, metricFieldsSeen, dataToFieldsSeen); err != nil { return fmt.Errorf("metric %q: %v", metricName, err) } } if v.lastTimestamp.After(snapshot.When) { return fmt.Errorf("consecutive snapshots are not chronologically ordered: last verified snapshot was exported at %v, this one is from %v", v.lastTimestamp, snapshot.When) } for _, data := range snapshot.Data { if err := v.knownMetrics[data.Metric.Name].verifyIncrement(data, dataToFieldsSeen[data], v.lastPacker); err != nil { return fmt.Errorf("metric %q: %v", data.Metric.Name, err) } } var neededPackerCapacity uint64 for _, data := range snapshot.Data { neededPackerCapacity += v.knownMetrics[data.Metric.Name].packerCapacityNeededForData(data, dataToFieldsSeen[data]) } for name, metric := range v.knownMetrics { neededPackerCapacity += metric.packerCapacityNeededForLast(fieldsSeen[name]) } if neededPackerCapacity > uint64(valueField) { return fmt.Errorf("snapshot contains too many large numbers to fit into packer memory (%d numbers needing indirection)", neededPackerCapacity) } // All checks succeeded, update last-seen data. // We need to be guaranteed to not fail past this point in the function. newPacker := &numberPacker{} if neededPackerCapacity != 0 { newPacker.data = make([]uint64, 0, neededPackerCapacity) } v.lastTimestamp = snapshot.When for _, data := range snapshot.Data { v.knownMetrics[globalIntern(data.Metric.Name)].update(data, v.internMap.Intern(dataToFieldsSeen[data]), newPacker) } if uint64(len(newPacker.data)) != neededPackerCapacity { for name, metric := range v.knownMetrics { metric.repackUnseen(fieldsSeen[name], v.lastPacker, newPacker) } } if uint64(len(newPacker.data)) != neededPackerCapacity { // We panic here because this represents an internal logic error, // not something the user did wrong. panic(fmt.Sprintf("did not pack the expected number of numbers in numberPacker: packed %d, expected %d; this indicates a logic error in verifyIncrement", len(newPacker.data), neededPackerCapacity)) } v.lastPacker = newPacker return nil } // AllMetrics returns the metadata of all the metrics that were declared as // part of this Verifier. 
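// End-to-end sketch (added; not part of the original source) of how a Verifier is
// typically driven; `registration` and `nextSnapshot` are hypothetical:
//
//	verifier, cleanup, err := NewVerifier(registration)
//	if err != nil {
//		return err
//	}
//	defer cleanup()
//	for {
//		snapshot := nextSnapshot() // e.g. parsed from the sandbox's metric export
//		if err := verifier.Verify(snapshot); err != nil {
//			return err // reject the snapshot rather than exporting bogus data
//		}
//		// ... export the now-verified snapshot ...
//	}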
func (v *Verifier) AllMetrics() []*pb.MetricMetadata { metrics := make([]*pb.MetricMetadata, 0, len(v.knownMetrics)) for _, m := range v.knownMetrics { metrics = append(metrics, m.metadata) } return metrics } golang-gvisor-gvisor-0.0~20240729.0/pkg/rand/000077500000000000000000000000001465435605700203505ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/rand/rand.go000066400000000000000000000014651465435605700216310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !linux // +build !linux package rand import "crypto/rand" // Reader is the default reader. var Reader = rand.Reader // Read implements io.Reader.Read. func Read(b []byte) (int, error) { return rand.Read(b) } golang-gvisor-gvisor-0.0~20240729.0/pkg/rand/rand_linux.go000066400000000000000000000040441465435605700230440ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package rand import ( "bufio" "crypto/rand" "io" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sync" ) // reader implements an io.Reader that returns pseudorandom bytes. type reader struct { once sync.Once useGetrandom bool } // Read implements io.Reader.Read. func (r *reader) Read(p []byte) (int, error) { r.once.Do(func() { _, err := unix.Getrandom(p, 0) if err != unix.ENOSYS { r.useGetrandom = true } }) if r.useGetrandom { return unix.Getrandom(p, 0) } return rand.Read(p) } // bufferedReader implements a threadsafe buffered io.Reader. type bufferedReader struct { mu sync.Mutex r *bufio.Reader } // Read implements io.Reader.Read. func (b *bufferedReader) Read(p []byte) (int, error) { // In Linux, reads of up to page size bytes will always complete fully. // See drivers/char/random.c:get_random_bytes_user(). // NOTE(gvisor.dev/issue/9445): Some applications rely on this behavior. const pageSize = 4096 min := len(p) if min > pageSize { min = pageSize } b.mu.Lock() defer b.mu.Unlock() return io.ReadAtLeast(b.r, p, min) } // Reader is the default reader. var Reader io.Reader = &bufferedReader{r: bufio.NewReader(&reader{})} // Read reads from the default reader. func Read(b []byte) (int, error) { return io.ReadFull(Reader, b) } // Init can be called to make sure /dev/urandom is pre-opened on kernels that // do not support getrandom(2). 
func Init() error { p := make([]byte, 1) _, err := Read(p) return err } golang-gvisor-gvisor-0.0~20240729.0/pkg/rand/rand_linux_state_autogen.go000066400000000000000000000000661465435605700257660ustar00rootroot00000000000000// automatically generated by stateify. package rand golang-gvisor-gvisor-0.0~20240729.0/pkg/rand/rand_state_autogen.go000066400000000000000000000001321465435605700245410ustar00rootroot00000000000000// automatically generated by stateify. //go:build !linux // +build !linux package rand golang-gvisor-gvisor-0.0~20240729.0/pkg/rand/rng.go000066400000000000000000000077461465435605700215030ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package rand implements a cryptographically secure pseudorandom number // generator. package rand import ( "encoding/binary" "fmt" "io" ) // RNG exposes convenience functions based on a cryptographically secure // io.Reader. type RNG struct { Reader io.Reader } // RNGFrom returns a new RNG. r must be a cryptographically secure io.Reader. func RNGFrom(r io.Reader) RNG { return RNG{Reader: r} } // Uint16 is analogous to the standard library's math/rand.Uint16. func (rg *RNG) Uint16() uint16 { var data [2]byte if _, err := rg.Reader.Read(data[:]); err != nil { panic(fmt.Sprintf("Read() failed: %v", err)) } return binary.NativeEndian.Uint16(data[:]) } // Uint32 is analogous to the standard library's math/rand.Uint32. func (rg *RNG) Uint32() uint32 { var data [4]byte if _, err := rg.Reader.Read(data[:]); err != nil { panic(fmt.Sprintf("Read() failed: %v", err)) } return binary.NativeEndian.Uint32(data[:]) } // Int63n is analogous to the standard library's math/rand.Int63n. func (rg *RNG) Int63n(n int64) int64 { // Based on Go's rand package implementation, but using // cryptographically secure random numbers. if n <= 0 { panic(fmt.Sprintf("n must be positive, but got %d", n)) } // This can be done quickly when n is a power of 2. if n&(n-1) == 0 { return int64(rg.Uint64()) & (n - 1) } // The naive approach would be to return rg.Int63()%n, but we need the // random number to be fair. It shouldn't be biased towards certain // results, but simple modular math can be very biased. For example, if // n is 40% of the maximum int64, then the output values of rg.Int63 // map to return values as follows: // // - The first 40% of values map to themselves. // - The second 40% map to themselves - maximum int64. // - The remaining 20% map to the themselves - 2 * (maximum int64), // i.e. the first half of possible output values. // // And thus 60% of results map the first half of possible output // values, and 40% map the second half. Oops! // // We use the same trick as Go to deal with this: shave off the last // segment (the 20% in our example) to make the RNG more fair. // // In the worst case, n is just over half of maximum int64, meaning // that the upper half of rg.Int63 return values are bad. So each call // to rg.Int63 has, at worst, a 50% chance of needing a retry. 
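// Worked example (added for illustration): for n = 10, (1<<63) % 10 == 8, so
// maximum = (1<<63) - 1 - 8 = 9223372036854775799. The range [0, maximum] then
// contains exactly 9223372036854775800 values, a multiple of 10, so rejecting
// draws above maximum makes the final ret % n unbiased.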
maximum := int64((1 << 63) - 1 - (1<<63)%uint64(n)) ret := rg.Int63() for ret > maximum { ret = rg.Int63() } return ret % n } // Int63 is analogous to the standard library's math/rand.Int63. func (rg *RNG) Int63() int64 { return ((1 << 63) - 1) & int64(rg.Uint64()) } // Uint64 is analogous to the standard library's math/rand.Uint64. func (rg *RNG) Uint64() uint64 { var data [8]byte if _, err := rg.Reader.Read(data[:]); err != nil { panic(fmt.Sprintf("Read() failed: %v", err)) } return binary.NativeEndian.Uint64(data[:]) } // Uint32 is analogous to the standard library's math/rand.Uint32. func Uint32() uint32 { rng := RNG{Reader: Reader} return rng.Uint32() } // Int63n is analogous to the standard library's math/rand.Int63n. func Int63n(n int64) int64 { rng := RNG{Reader: Reader} return rng.Int63n(n) } // Int63 is analogous to the standard library's math/rand.Int63. func Int63() int64 { rng := RNG{Reader: Reader} return rng.Int63() } // Uint64 is analogous to the standard library's math/rand.Uint64. func Uint64() uint64 { rng := RNG{Reader: Reader} return rng.Uint64() } golang-gvisor-gvisor-0.0~20240729.0/pkg/rawfile/000077500000000000000000000000001465435605700210555ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/rawfile/blockingpoll_amd64.s000066400000000000000000000025541465435605700247210ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" // BlockingPoll makes the ppoll() syscall while calling the version of // entersyscall that relinquishes the P so that other Gs can run. This is meant // to be called in cases when the syscall is expected to block. // // func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (n int, err syscall.Errno) TEXT ·BlockingPoll(SB),NOSPLIT|NOFRAME,$0-40 CALL ·callEntersyscallblock(SB) MOVQ fds+0(FP), DI MOVQ nfds+8(FP), SI MOVQ timeout+16(FP), DX MOVQ $0x0, R10 // sigmask parameter which isn't used here MOVQ $0x10f, AX // SYS_PPOLL SYSCALL CMPQ AX, $0xfffffffffffff002 JLS ok MOVQ $-1, ret+24(FP) NEGQ AX MOVQ AX, ret1+32(FP) CALL ·callExitsyscall(SB) RET ok: MOVQ AX, ret+24(FP) MOVQ $0, ret1+32(FP) CALL ·callExitsyscall(SB) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/rawfile/blockingpoll_arm64.s000066400000000000000000000025471465435605700247410ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#include "textflag.h" // BlockingPoll makes the ppoll() syscall while calling the version of // entersyscall that relinquishes the P so that other Gs can run. This is meant // to be called in cases when the syscall is expected to block. // // func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (n int, err syscall.Errno) TEXT ·BlockingPoll(SB),NOSPLIT,$0-40 BL ·callEntersyscallblock(SB) MOVD fds+0(FP), R0 MOVD nfds+8(FP), R1 MOVD timeout+16(FP), R2 MOVD $0x0, R3 // sigmask parameter which isn't used here MOVD $0x49, R8 // SYS_PPOLL SVC CMP $0xfffffffffffff002, R0 BLS ok MOVD $-1, R1 MOVD R1, ret+24(FP) NEG R0, R0 MOVD R0, ret1+32(FP) BL ·callExitsyscall(SB) RET ok: MOVD R0, ret+24(FP) MOVD $0, ret1+32(FP) BL ·callExitsyscall(SB) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/rawfile/blockingpoll_noyield_unsafe.go000066400000000000000000000021051465435605700271450ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux && !amd64 && !arm64 // +build linux,!amd64,!arm64 package rawfile import ( "unsafe" "golang.org/x/sys/unix" ) // BlockingPoll is just a stub function that forwards to the ppoll() system call // on non-amd64 and non-arm64 platforms. func BlockingPoll(fds *PollEvent, nfds int, timeout *unix.Timespec) (int, unix.Errno) { n, _, e := unix.Syscall6(unix.SYS_PPOLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), 0, 0, 0) return int(n), e } golang-gvisor-gvisor-0.0~20240729.0/pkg/rawfile/blockingpoll_yield_unsafe.go000066400000000000000000000045141465435605700266160ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build ((linux && amd64) || (linux && arm64)) && go1.18 // +build linux,amd64 linux,arm64 // +build go1.18 // //go:linkname directives type-checked by checklinkname. Any other // non-linkname assumptions outside the Go 1 compatibility guarantee should // have an accompanied vet check or version guard build tag. package rawfile import ( _ "unsafe" // for go:linkname "golang.org/x/sys/unix" ) // BlockingPoll on amd64/arm64 makes the ppoll() syscall while calling the // version of entersyscall that relinquishes the P so that other Gs can // run. This is meant to be called in cases when the syscall is expected to // block. On non amd64/arm64 platforms it just forwards to the ppoll() system // call. 
// //go:noescape func BlockingPoll(fds *PollEvent, nfds int, timeout *unix.Timespec) (int, unix.Errno) // Use go:linkname to call into the runtime. As of Go 1.13 this has to // be done from Go code so that we make an ABIInternal call to an // ABIInternal function; see https://golang.org/issue/27539. // We need to call both entersyscallblock and exitsyscall this way so // that the runtime's check on the stack pointer lines up. // Note that calling an unexported function in the runtime package is // unsafe and this hack is likely to break in future Go releases. //go:linkname entersyscallblock runtime.entersyscallblock func entersyscallblock() //go:linkname exitsyscall runtime.exitsyscall func exitsyscall() // These forwarding functions must be nosplit because 1) we must // disallow preemption between entersyscallblock and exitsyscall, and // 2) we have an untyped assembly frame on the stack which can not be // grown or moved. //go:nosplit func callEntersyscallblock() { entersyscallblock() } //go:nosplit func callExitsyscall() { exitsyscall() } golang-gvisor-gvisor-0.0~20240729.0/pkg/rawfile/rawfile_unsafe.go000066400000000000000000000154331465435605700244040ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux // Package rawfile contains utilities for using raw host files on Linux hosts. package rawfile import ( "reflect" "unsafe" "golang.org/x/sys/unix" ) // SizeofIovec is the size of a unix.Iovec in bytes. const SizeofIovec = unsafe.Sizeof(unix.Iovec{}) // MaxIovs is UIO_MAXIOV, the maximum number of iovecs that may be passed to a // host system call in a single array. const MaxIovs = 1024 // IovecFromBytes returns a unix.Iovec representing bs. // // Preconditions: len(bs) > 0. func IovecFromBytes(bs []byte) unix.Iovec { iov := unix.Iovec{ Base: &bs[0], } iov.SetLen(len(bs)) return iov } func bytesFromIovec(iov unix.Iovec) (bs []byte) { sh := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) sh.Data = uintptr(unsafe.Pointer(iov.Base)) sh.Len = int(iov.Len) sh.Cap = int(iov.Len) return } // AppendIovecFromBytes returns append(iovs, IovecFromBytes(bs)). If len(bs) == // 0, AppendIovecFromBytes returns iovs without modification. If len(iovs) >= // max, AppendIovecFromBytes replaces the final iovec in iovs with one that // also includes the contents of bs. Note that this implies that // AppendIovecFromBytes is only usable when the returned iovec slice is used as // the source of a write. func AppendIovecFromBytes(iovs []unix.Iovec, bs []byte, max int) []unix.Iovec { if len(bs) == 0 { return iovs } if len(iovs) < max { return append(iovs, IovecFromBytes(bs)) } iovs[len(iovs)-1] = IovecFromBytes(append(bytesFromIovec(iovs[len(iovs)-1]), bs...)) return iovs } // MMsgHdr represents the mmsg_hdr structure required by recvmmsg() on linux. type MMsgHdr struct { Msg unix.Msghdr Len uint32 _ [4]byte } // SizeofMMsgHdr is the size of a MMsgHdr in bytes. 
const SizeofMMsgHdr = unsafe.Sizeof(MMsgHdr{}) // GetMTU determines the MTU of a network interface device. func GetMTU(name string) (uint32, error) { fd, err := unix.Socket(unix.AF_UNIX, unix.SOCK_DGRAM, 0) if err != nil { return 0, err } defer unix.Close(fd) var ifreq struct { name [16]byte mtu int32 _ [20]byte } copy(ifreq.name[:], name) _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), unix.SIOCGIFMTU, uintptr(unsafe.Pointer(&ifreq))) if errno != 0 { return 0, errno } return uint32(ifreq.mtu), nil } // NonBlockingWrite writes the given buffer to a file descriptor. It fails if // partial data is written. func NonBlockingWrite(fd int, buf []byte) unix.Errno { var ptr unsafe.Pointer if len(buf) > 0 { ptr = unsafe.Pointer(&buf[0]) } _, _, e := unix.RawSyscall(unix.SYS_WRITE, uintptr(fd), uintptr(ptr), uintptr(len(buf))) return e } // NonBlockingWriteIovec writes iovec to a file descriptor in a single unix. // It fails if partial data is written. func NonBlockingWriteIovec(fd int, iovec []unix.Iovec) unix.Errno { iovecLen := uintptr(len(iovec)) _, _, e := unix.RawSyscall(unix.SYS_WRITEV, uintptr(fd), uintptr(unsafe.Pointer(&iovec[0])), iovecLen) return e } // NonBlockingSendMMsg sends multiple messages on a socket. func NonBlockingSendMMsg(fd int, msgHdrs []MMsgHdr) (int, unix.Errno) { n, _, e := unix.RawSyscall6(unix.SYS_SENDMMSG, uintptr(fd), uintptr(unsafe.Pointer(&msgHdrs[0])), uintptr(len(msgHdrs)), unix.MSG_DONTWAIT, 0, 0) return int(n), e } // PollEvent represents the pollfd structure passed to a poll() system call. type PollEvent struct { FD int32 Events int16 Revents int16 } // BlockingRead reads from a file descriptor that is set up as non-blocking. // If no data is available, it will block in a poll() syscall until the file // descriptor becomes readable. func BlockingRead(fd int, b []byte) (int, unix.Errno) { for { n, _, e := unix.RawSyscall(unix.SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(&b[0])), uintptr(len(b))) if e == 0 { return int(n), 0 } event := PollEvent{ FD: int32(fd), Events: 1, // POLLIN } _, e = BlockingPoll(&event, 1, nil) if e != 0 && e != unix.EINTR { return 0, e } } } // BlockingReadvUntilStopped reads from a file descriptor that is set up as // non-blocking and stores the data in a list of iovecs buffers. If no data is // available, it will block in a poll() syscall until the file descriptor // becomes readable or stop is signalled (efd becomes readable). Returns -1 in // the latter case. func BlockingReadvUntilStopped(efd int, fd int, iovecs []unix.Iovec) (int, unix.Errno) { for { n, _, e := unix.RawSyscall(unix.SYS_READV, uintptr(fd), uintptr(unsafe.Pointer(&iovecs[0])), uintptr(len(iovecs))) if e == 0 { return int(n), 0 } if e != 0 && e != unix.EWOULDBLOCK { return 0, e } stopped, e := BlockingPollUntilStopped(efd, fd, unix.POLLIN) if stopped { return -1, e } if e != 0 && e != unix.EINTR { return 0, e } } } // BlockingRecvMMsgUntilStopped reads from a file descriptor that is set up as // non-blocking and stores the received messages in a slice of MMsgHdr // structures. If no data is available, it will block in a poll() syscall until // the file descriptor becomes readable or stop is signalled (efd becomes // readable). Returns -1 in the latter case. 
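// Sketch (added; not part of the original source) of the stop-fd pattern these
// helpers assume: the caller owns a nonblocking eventfd, passes it as efd, and
// writes to it to wake up and stop a blocked reader. `stopCh`, `sockFD` and
// `msgHdrs` are hypothetical:
//
//	efd, _ := unix.Eventfd(0, unix.EFD_NONBLOCK)
//	go func() {
//		<-stopCh
//		unix.Write(efd, []byte{1, 0, 0, 0, 0, 0, 0, 0}) // increment the eventfd counter
//	}()
//	for {
//		n, errno := BlockingRecvMMsgUntilStopped(efd, sockFD, msgHdrs)
//		if n == -1 {
//			return // stop was signalled on efd
//		}
//		if errno != 0 { /* handle error */ }
//		// ... process the n received messages ...
//	}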
func BlockingRecvMMsgUntilStopped(efd int, fd int, msgHdrs []MMsgHdr) (int, unix.Errno) { for { n, _, e := unix.RawSyscall6(unix.SYS_RECVMMSG, uintptr(fd), uintptr(unsafe.Pointer(&msgHdrs[0])), uintptr(len(msgHdrs)), unix.MSG_DONTWAIT, 0, 0) if e == 0 { return int(n), e } if e != 0 && e != unix.EWOULDBLOCK { return 0, e } stopped, e := BlockingPollUntilStopped(efd, fd, unix.POLLIN) if stopped { return -1, e } if e != 0 && e != unix.EINTR { return 0, e } } } // BlockingPollUntilStopped polls for events on fd or until a stop is signalled // on the event fd efd. Returns true if stopped, i.e., efd has event POLLIN. func BlockingPollUntilStopped(efd int, fd int, events int16) (bool, unix.Errno) { pevents := [...]PollEvent{ { FD: int32(efd), Events: unix.POLLIN, }, { FD: int32(fd), Events: events, }, } _, _, errno := unix.Syscall6(unix.SYS_PPOLL, uintptr(unsafe.Pointer(&pevents[0])), uintptr(len(pevents)), 0, 0, 0, 0) if errno != 0 { return pevents[0].Revents&unix.POLLIN != 0, errno } if pevents[1].Revents&unix.POLLHUP != 0 || pevents[1].Revents&unix.POLLERR != 0 { errno = unix.ECONNRESET } return pevents[0].Revents&unix.POLLIN != 0, errno } golang-gvisor-gvisor-0.0~20240729.0/pkg/rawfile/rawfile_unsafe_state_autogen.go000066400000000000000000000004211465435605700273150ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux && !amd64 && !arm64 && ((linux && amd64) || (linux && arm64)) && go1.18 && linux // +build linux // +build !amd64 // +build !arm64 // +build linux,amd64 linux,arm64 // +build go1.18 // +build linux package rawfile golang-gvisor-gvisor-0.0~20240729.0/pkg/refs/000077500000000000000000000000001465435605700203635ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/refs/refcounter.go000066400000000000000000000115531465435605700230730ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package refs defines an interface for reference counted objects. package refs import ( "bytes" "fmt" "runtime" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) // RefCounter is the interface to be implemented by objects that are reference // counted. type RefCounter interface { // IncRef increments the reference counter on the object. IncRef() // DecRef decrements the object's reference count. Users of refs_template.Refs // may specify a destructor to be called once the reference count reaches zero. DecRef(ctx context.Context) } // TryRefCounter is like RefCounter but allow the ref increment to be tried. type TryRefCounter interface { RefCounter // TryIncRef attempts to increment the reference count, but may fail if all // references have already been dropped, in which case it returns false. If // true is returned, then a valid reference is now held on the object. TryIncRef() bool } // LeakMode configures the leak checker. type LeakMode uint32 const ( // NoLeakChecking indicates that no effort should be made to check for // leaks. 
NoLeakChecking LeakMode = iota // LeaksLogWarning indicates that a warning should be logged when leaks // are found. LeaksLogWarning // LeaksPanic indidcates that a panic should be issued when leaks are found. LeaksPanic ) // Set implements flag.Value. func (l *LeakMode) Set(v string) error { switch v { case "disabled": *l = NoLeakChecking case "log-names": *l = LeaksLogWarning case "panic": *l = LeaksPanic default: return fmt.Errorf("invalid ref leak mode %q", v) } return nil } // Get implements flag.Value. func (l *LeakMode) Get() any { return *l } // String implements flag.Value. func (l LeakMode) String() string { switch l { case NoLeakChecking: return "disabled" case LeaksLogWarning: return "log-names" case LeaksPanic: return "panic" default: panic(fmt.Sprintf("invalid ref leak mode %d", l)) } } // leakMode stores the current mode for the reference leak checker. // // Values must be one of the LeakMode values. // // leakMode must be accessed atomically. var leakMode atomicbitops.Uint32 // SetLeakMode configures the reference leak checker. func SetLeakMode(mode LeakMode) { leakMode.Store(uint32(mode)) } // GetLeakMode returns the current leak mode. func GetLeakMode() LeakMode { return LeakMode(leakMode.Load()) } const maxStackFrames = 40 type fileLine struct { file string line int } // A stackKey is a representation of a stack frame for use as a map key. // // The fileLine type is used as PC values seem to vary across collections, even // for the same call stack. type stackKey [maxStackFrames]fileLine var stackCache = struct { sync.Mutex entries map[stackKey][]uintptr }{entries: map[stackKey][]uintptr{}} func makeStackKey(pcs []uintptr) stackKey { frames := runtime.CallersFrames(pcs) var key stackKey keySlice := key[:0] for { frame, more := frames.Next() keySlice = append(keySlice, fileLine{frame.File, frame.Line}) if !more || len(keySlice) == len(key) { break } } return key } // RecordStack constructs and returns the PCs on the current stack. func RecordStack() []uintptr { pcs := make([]uintptr, maxStackFrames) n := runtime.Callers(1, pcs) if n == 0 { // No pcs available. Stop now. // // This can happen if the first argument to runtime.Callers // is large. return nil } pcs = pcs[:n] key := makeStackKey(pcs) stackCache.Lock() v, ok := stackCache.entries[key] if !ok { // Reallocate to prevent pcs from escaping. v = append([]uintptr(nil), pcs...) stackCache.entries[key] = v } stackCache.Unlock() return v } // FormatStack converts the given stack into a readable format. func FormatStack(pcs []uintptr) string { frames := runtime.CallersFrames(pcs) var trace bytes.Buffer for { frame, more := frames.Next() fmt.Fprintf(&trace, "%s:%d: %s\n", frame.File, frame.Line, frame.Function) if !more { break } } return trace.String() } // OnExit is called on sandbox exit. It runs GC to enqueue refcount finalizers, // which check for reference leaks. There is no way to guarantee that every // finalizer will run before exiting, but this at least ensures that they will // be discovered/enqueued by GC. func OnExit() { if LeakMode(leakMode.Load()) != NoLeakChecking { runtime.GC() } } golang-gvisor-gvisor-0.0~20240729.0/pkg/refs/refs_map.go000066400000000000000000000117331465435605700225130ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package refs import ( "fmt" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" ) var ( // liveObjects is a global map of reference-counted objects. Objects are // inserted when leak check is enabled, and they are removed when they are // destroyed. It is protected by liveObjectsMu. liveObjects map[CheckedObject]struct{} liveObjectsMu sync.Mutex ) // CheckedObject represents a reference-counted object with an informative // leak detection message. type CheckedObject interface { // RefType is the type of the reference-counted object. RefType() string // LeakMessage supplies a warning to be printed upon leak detection. LeakMessage() string // LogRefs indicates whether reference-related events should be logged. LogRefs() bool } func init() { liveObjects = make(map[CheckedObject]struct{}) } // LeakCheckEnabled returns whether leak checking is enabled. The following // functions should only be called if it returns true. func LeakCheckEnabled() bool { mode := GetLeakMode() return mode != NoLeakChecking } // leakCheckPanicEnabled returns whether DoLeakCheck() should panic when leaks // are detected. func leakCheckPanicEnabled() bool { return GetLeakMode() == LeaksPanic } // Register adds obj to the live object map. func Register(obj CheckedObject) { if LeakCheckEnabled() { liveObjectsMu.Lock() if _, ok := liveObjects[obj]; ok { panic(fmt.Sprintf("Unexpected entry in leak checking map: reference %p already added", obj)) } liveObjects[obj] = struct{}{} liveObjectsMu.Unlock() if LeakCheckEnabled() && obj.LogRefs() { logEvent(obj, "registered") } } } // Unregister removes obj from the live object map. func Unregister(obj CheckedObject) { if LeakCheckEnabled() { liveObjectsMu.Lock() defer liveObjectsMu.Unlock() if _, ok := liveObjects[obj]; !ok { panic(fmt.Sprintf("Expected to find entry in leak checking map for reference %p", obj)) } delete(liveObjects, obj) if LeakCheckEnabled() && obj.LogRefs() { logEvent(obj, "unregistered") } } } // LogIncRef logs a reference increment. func LogIncRef(obj CheckedObject, refs int64) { if LeakCheckEnabled() && obj.LogRefs() { logEvent(obj, fmt.Sprintf("IncRef to %d", refs)) } } // LogTryIncRef logs a successful TryIncRef call. func LogTryIncRef(obj CheckedObject, refs int64) { if LeakCheckEnabled() && obj.LogRefs() { logEvent(obj, fmt.Sprintf("TryIncRef to %d", refs)) } } // LogDecRef logs a reference decrement. func LogDecRef(obj CheckedObject, refs int64) { if LeakCheckEnabled() && obj.LogRefs() { logEvent(obj, fmt.Sprintf("DecRef to %d", refs)) } } // logEvent logs a message for the given reference-counted object. // // obj.LogRefs() should be checked before calling logEvent, in order to avoid // calling any text processing needed to evaluate msg. func logEvent(obj CheckedObject, msg string) { log.Infof("[%s %p] %s:\n%s", obj.RefType(), obj, msg, FormatStack(RecordStack())) } // checkOnce makes sure that leak checking is only done once. DoLeakCheck is // called from multiple places (which may overlap) to cover different sandbox // exit scenarios. 
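// The sketch below is illustrative only (an assumption, not part of the
// original source): it shows how a reference-counted type typically plugs
// into the leak checker above, registering itself on creation and
// unregistering when the last reference is dropped. Real gVisor objects use
// the generated refs_template code rather than hand-written logic like this.
type exampleCounted struct {
	mu   sync.Mutex
	refs int64
}

// RefType implements CheckedObject.RefType.
func (e *exampleCounted) RefType() string { return "exampleCounted" }

// LeakMessage implements CheckedObject.LeakMessage.
func (e *exampleCounted) LeakMessage() string {
	return fmt.Sprintf("[exampleCounted %p] reference count is not zero at exit", e)
}

// LogRefs implements CheckedObject.LogRefs.
func (e *exampleCounted) LogRefs() bool { return false }

// newExampleCounted returns an object with a single reference held.
func newExampleCounted() *exampleCounted {
	e := &exampleCounted{refs: 1}
	Register(e)
	return e
}

// decRef drops a reference and unregisters the object when none remain.
func (e *exampleCounted) decRef() {
	e.mu.Lock()
	defer e.mu.Unlock()
	e.refs--
	if e.refs == 0 {
		Unregister(e)
	}
}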
var checkOnce sync.Once // DoLeakCheck iterates through the live object map and logs a message for each // object. It should be called when no reference-counted objects are reachable // anymore, at which point anything left in the map is considered a leak. On // multiple calls, only the first call will perform the leak check. func DoLeakCheck() { if LeakCheckEnabled() { checkOnce.Do(doLeakCheck) } } // DoRepeatedLeakCheck is the same as DoLeakCheck except that it can be called // multiple times by the caller to incrementally perform leak checking. func DoRepeatedLeakCheck() { if LeakCheckEnabled() { doLeakCheck() } } type leakCheckDisabled interface { LeakCheckDisabled() bool } // CleanupSync is used to wait for async cleanup actions. var CleanupSync sync.WaitGroup func doLeakCheck() { CleanupSync.Wait() liveObjectsMu.Lock() defer liveObjectsMu.Unlock() leaked := len(liveObjects) if leaked > 0 { n := 0 msg := fmt.Sprintf("Leak checking detected %d leaked objects:\n", leaked) for obj := range liveObjects { skip := false if o, ok := obj.(leakCheckDisabled); ok { skip = o.LeakCheckDisabled() } if skip { log.Debugf(obj.LeakMessage()) continue } msg += obj.LeakMessage() + "\n" n++ } if n == 0 { return } if leakCheckPanicEnabled() { panic(msg) } log.Warningf(msg) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/refs/refs_state_autogen.go000066400000000000000000000000661465435605700245750ustar00rootroot00000000000000// automatically generated by stateify. package refs golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/000077500000000000000000000000001465435605700204435ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/aarch64.go000066400000000000000000000044761465435605700222350ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package ring0 // Useful bits. const ( _PGD_PGT_BASE = 0x1000 _PGD_PGT_SIZE = 0x1000 _PUD_PGT_BASE = 0x2000 _PUD_PGT_SIZE = 0x1000 _PMD_PGT_BASE = 0x3000 _PMD_PGT_SIZE = 0x4000 _PTE_PGT_BASE = 0x7000 _PTE_PGT_SIZE = 0x1000 ) const ( // VirtualAddressBits is fixed at 48. VirtualAddressBits = 48 // PhysicalAddressBits is fixed at 40. PhysicalAddressBits = 40 // DAIF bits:debug, sError, IRQ, FIQ. _PSR_D_BIT = 0x00000200 _PSR_A_BIT = 0x00000100 _PSR_I_BIT = 0x00000080 _PSR_F_BIT = 0x00000040 _PSR_DAIF_SHIFT = 6 _PSR_DAIF_MASK = 0xf << _PSR_DAIF_SHIFT // PSR bits. _PSR_MODE_EL0t = 0x00000000 _PSR_MODE_EL1t = 0x00000004 _PSR_MODE_EL1h = 0x00000005 _PSR_MODE_MASK = 0x0000000f PsrFlagsClear = _PSR_MODE_MASK | _PSR_DAIF_MASK PsrModeMask = _PSR_MODE_MASK // KernelFlagsSet should always be set in the kernel. KernelFlagsSet = _PSR_MODE_EL1h | _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT // UserFlagsSet are always set in userspace. UserFlagsSet = _PSR_MODE_EL0t ) // Vector is an exception vector. type Vector uintptr // Exception vectors. 
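// isUserModePstate is an illustrative sketch (an assumption, not part of the
// original source): given a saved PSTATE value, it reports whether the
// exception was taken from EL0 (userspace) by comparing the mode bits against
// UserFlagsSet, mirroring the meaning of the flag constants above.
func isUserModePstate(pstate uint64) bool {
	return pstate&PsrModeMask == UserFlagsSet
}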
const ( El1InvSync = iota El1InvIrq El1InvFiq El1InvError El1Sync El1Irq El1Fiq El1Err El0Sync El0Irq El0Fiq El0Err El0InvSync El0InvIrq El0InvFiq El0InvErr El1SyncDa El1SyncIa El1SyncSpPc El1SyncUndef El1SyncDbg El1SyncInv El0SyncSVC El0SyncDa El0SyncIa El0SyncFpsimdAcc El0SyncSveAcc El0SyncFpsimdExc El0SyncSys El0SyncSpPc El0SyncUndef El0SyncDbg El0SyncWfx El0SyncInv El0ErrNMI El0ErrBounce _NR_INTERRUPTS ) // System call vectors. const ( Syscall Vector = El0SyncSVC PageFault Vector = El0SyncDa VirtualizationException Vector = El0ErrBounce ) golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/defs.go000066400000000000000000000072471465435605700217250ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ring0 import ( "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" ) // Kernel is a global kernel object. // // This contains global state, shared by multiple CPUs. type Kernel struct { // PageTables are the kernel pagetables; this must be provided. PageTables *pagetables.PageTables KernelArchState } // Hooks are hooks for kernel functions. type Hooks interface { // KernelSyscall is called for kernel system calls. // // Return from this call will restore registers and return to the kernel: the // registers must be modified directly. // // If this function is not provided, a kernel exception results in halt. // // This must be go:nosplit, as this will be on the interrupt stack. // Closures are permitted, as the pointer to the closure frame is not // passed on the stack. KernelSyscall() // KernelException handles an exception during kernel execution. // // Return from this call will restore registers and return to the kernel: the // registers must be modified directly. // // If this function is not provided, a kernel exception results in halt. // // This must be go:nosplit, as this will be on the interrupt stack. // Closures are permitted, as the pointer to the closure frame is not // passed on the stack. KernelException(Vector) } // CPU is the per-CPU struct. type CPU struct { // self is a self reference. // // This is always guaranteed to be at offset zero. self *CPU // kernel is reference to the kernel that this CPU was initialized // with. This reference is kept for garbage collection purposes: CPU // registers may refer to objects within the Kernel object that cannot // be safely freed. kernel *Kernel // CPUArchState is architecture-specific state. CPUArchState // registers is a set of registers; these may be used on kernel system // calls and exceptions via the Registers function. registers arch.Registers // floatingPointState holds floating point state. floatingPointState fpu.State // hooks are kernel hooks. hooks Hooks } // Registers returns a modifiable-copy of the kernel registers. // // This is explicitly safe to call during KernelException and KernelSyscall. 
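// exampleHooks is an illustrative sketch (an assumption, not part of the
// original source) of a Hooks implementation. As the interface comments above
// require, both methods are go:nosplit because they run on the interrupt
// stack; this sketch simply falls back to the documented default behavior of
// halting (via the package's Halt routine).
type exampleHooks struct{}

// KernelSyscall implements Hooks.KernelSyscall.
//
//go:nosplit
func (exampleHooks) KernelSyscall() {
	Halt()
}

// KernelException implements Hooks.KernelException.
//
//go:nosplit
func (exampleHooks) KernelException(Vector) {
	Halt()
}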
// //go:nosplit func (c *CPU) Registers() *arch.Registers { return &c.registers } // FloatingPointState returns the kernel floating point state. // // This is explicitly safe to call during KernelException and KernelSyscall. // //go:nosplit func (c *CPU) FloatingPointState() *fpu.State { return &c.floatingPointState } // SwitchOpts are passed to the Switch function. type SwitchOpts struct { // Registers are the user register state. Registers *arch.Registers // FloatingPointState is a byte pointer where floating point state is // saved and restored. FloatingPointState *fpu.State // PageTables are the application page tables. PageTables *pagetables.PageTables // Flush indicates that a TLB flush should be forced on switch. Flush bool // FullRestore indicates that an iret-based restore should be used. FullRestore bool // SwitchArchOpts are architecture-specific options. SwitchArchOpts } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/defs_amd64.go000066400000000000000000000125561465435605700227170ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package ring0 var ( // VirtualAddressBits is the number of bits available in the virtual // address space. // // Initialized by ring0.Init. VirtualAddressBits uintptr // PhysicalAddressBits is the number of bits available in the physical // address space. // // Initialized by ring0.Init. PhysicalAddressBits uintptr // UserspaceSize is the total size of userspace. // // Initialized by ring0.Init. UserspaceSize uintptr // MaximumUserAddress is the largest possible user address. // // Initialized by ring0.Init. MaximumUserAddress uintptr // KernelStartAddress is the starting kernel address. // // Initialized by ring0.Init. KernelStartAddress uintptr ) // Segment indices and Selectors. const ( // Index into GDT array. _ = iota // Null descriptor first. _ // Reserved (Linux is kernel 32). segKcode // Kernel code (64-bit). segKdata // Kernel data. segUcode32 // User code (32-bit). segUdata // User data. segUcode64 // User code (64-bit). segTss // Task segment descriptor. segTssHi // Upper bits for TSS. segLast // Last segment (terminal, not included). ) // Selectors. const ( Kcode Selector = segKcode << 3 Kdata Selector = segKdata << 3 Ucode32 Selector = (segUcode32 << 3) | 3 Udata Selector = (segUdata << 3) | 3 Ucode64 Selector = (segUcode64 << 3) | 3 Tss Selector = segTss << 3 ) // Standard segments. var ( UserCodeSegment32 SegmentDescriptor UserDataSegment SegmentDescriptor UserCodeSegment64 SegmentDescriptor KernelCodeSegment SegmentDescriptor KernelDataSegment SegmentDescriptor ) // KernelArchState contains architecture-specific state. type KernelArchState struct { // cpuEntries is array of kernelEntry for all cpus. cpuEntries []kernelEntry // globalIDT is our set of interrupt gates. globalIDT *idt64 } // kernelEntry contains minimal CPU-specific arch state // that can be mapped at the upper of the address space. 
// Malicious APP might steal info from it via CPU bugs. type kernelEntry struct { // stack is the stack used for interrupts on this CPU. stack [256]byte // scratch space for temporary usage. scratch0 uint64 // stackTop is the top of the stack. stackTop uint64 // cpuSelf is back reference to CPU. cpuSelf *CPU // kernelCR3 is the cr3 used for sentry kernel. kernelCR3 uintptr // gdt is the CPU's descriptor table. gdt descriptorTable // tss is the CPU's task state. tss TaskState64 } // CPUArchState contains CPU-specific arch state. type CPUArchState struct { // errorCode is the error code from the last exception. errorCode uintptr // errorType indicates the type of error code here, it is always set // along with the errorCode value above. // // It will either by 1, which indicates a user error, or 0 indicating a // kernel error. If the error code below returns false (kernel error), // then it cannot provide relevant information about the last // exception. errorType uintptr // vector is the vector of the last exception. vector uintptr // faultAddr is the value of the cr2 register. faultAddr uintptr *kernelEntry appGsBase uint64 // Copies of global variables, stored in CPU so that they can be used by // syscall and exception handlers (in the upper address space). hasXSAVE bool hasXSAVEOPT bool hasFSGSBASE bool } // ErrorCode returns the last error code. // // The returned boolean indicates whether the error code corresponds to the // last user error or not. If it does not, then fault information must be // ignored. This is generally the result of a kernel fault while servicing a // user fault. // //go:nosplit func (c *CPU) ErrorCode() (value uintptr, user bool) { return c.errorCode, c.errorType != 0 } // ClearErrorCode resets the error code. // //go:nosplit func (c *CPU) ClearErrorCode() { c.errorCode = 0 // No code. c.errorType = 1 // User mode. } // Vector returns the vector of the last exception. // //go:nosplit func (c *CPU) Vector() uintptr { return c.vector } // FaultAddr returns the last fault address. // //go:nosplit func (c *CPU) FaultAddr() uintptr { return c.faultAddr } // SwitchArchOpts are embedded in SwitchOpts. type SwitchArchOpts struct { // UserPCID indicates that the application PCID to be used on switch, // assuming that PCIDs are supported. // // Per pagetables_x86.go, a zero PCID implies a flush. UserPCID uint16 // KernelPCID indicates that the kernel PCID to be used on return, // assuming that PCIDs are supported. // // Per pagetables_x86.go, a zero PCID implies a flush. KernelPCID uint16 } func init() { KernelCodeSegment.setCode64(0, 0, 0) KernelDataSegment.setData(0, 0xffffffff, 0) UserCodeSegment32.setCode64(0, 0, 3) UserDataSegment.setData(0, 0xffffffff, 3) UserCodeSegment64.setCode64(0, 0, 3) } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/defs_arm64.go000066400000000000000000000070071465435605700227300ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
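// Illustrative note (an assumption, not part of the original source): the
// amd64 definitions above and the arm64 definitions below expose the same
// ErrorCode/ClearErrorCode/FaultAddr accessors, and a platform typically
// consumes them after a fault along these lines, where handleUserFault is a
// hypothetical helper:
//
//	if code, user := c.ErrorCode(); user {
//		handleUserFault(c.FaultAddr(), code)
//	}
//	c.ClearErrorCode()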
//go:build arm64 // +build arm64 package ring0 import ( "gvisor.dev/gvisor/pkg/hostarch" ) const ( // UserspaceSize is the total size of userspace. UserspaceSize = uintptr(1) << VirtualAddressBits // MaximumUserAddress is the largest possible user address. MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(hostarch.PageSize-1) // KernelStartAddress is the starting kernel address. KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) ) // KernelArchState contains architecture-specific state. type KernelArchState struct { } // CPUArchState contains CPU-specific arch state. type CPUArchState struct { // stack is the stack used for interrupts on this CPU. stack [128]byte // errorCode is the error code from the last exception. errorCode uintptr // errorType indicates the type of error code here, it is always set // along with the errorCode value above. // // It will either by 1, which indicates a user error, or 0 indicating a // kernel error. If the error code below returns false (kernel error), // then it cannot provide relevant information about the last // exception. errorType uintptr // faultAddr is the value of far_el1. faultAddr uintptr // el0Fp is the address of application's fpstate. el0Fp uintptr // ttbr0Kvm is the value of ttbr0_el1 for sentry. ttbr0Kvm uintptr // ttbr0App is the value of ttbr0_el1 for application. ttbr0App uintptr // exception vector. vecCode Vector // application context pointer. appAddr uintptr // lazyVFP is the value of cpacr_el1. lazyVFP uintptr // appASID is the asid value of guest application. appASID uintptr } // ErrorCode returns the last error code. // // The returned boolean indicates whether the error code corresponds to the // last user error or not. If it does not, then fault information must be // ignored. This is generally the result of a kernel fault while servicing a // user fault. // //go:nosplit func (c *CPU) ErrorCode() (value uintptr, user bool) { return c.errorCode, c.errorType != 0 } // ClearErrorCode resets the error code. // //go:nosplit func (c *CPU) ClearErrorCode() { c.errorCode = 0 // No code. c.errorType = 1 // User mode. } // FaultAddr returns the last fault address. // //go:nosplit func (c *CPU) FaultAddr() (value uintptr) { return c.faultAddr } //go:nosplit func (c *CPU) SetTtbr0Kvm(value uintptr) { c.ttbr0Kvm = value } //go:nosplit func (c *CPU) SetTtbr0App(value uintptr) { c.ttbr0App = value } //go:nosplit func (c *CPU) GetVector() (value Vector) { return c.vecCode } //go:nosplit func (c *CPU) SetAppAddr(value uintptr) { c.appAddr = value } // GetLazyVFP returns the value of cpacr_el1. // //go:nosplit func (c *CPU) GetLazyVFP() (value uintptr) { return c.lazyVFP } // SwitchArchOpts are embedded in SwitchOpts. type SwitchArchOpts struct { // UserASID indicates that the application ASID to be used on switch, UserASID uint16 // KernelASID indicates that the kernel ASID to be used on return, KernelASID uint16 } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/entry_amd64.go000066400000000000000000000136001465435605700231260ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/arch" ) // This is an assembly function. // // The sysenter function is invoked in two situations: // // (1) The guest kernel has executed a system call. // (2) The guest application has executed a system call. // // The interrupt flag is examined to determine whether the system call was // executed from kernel mode or not and the appropriate stub is called. func sysenter() // addrOfSysenter returns the start address of sysenter. // // In Go 1.17+, Go references to assembly functions resolve to an ABIInternal // wrapper function rather than the function itself. We must reference from // assembly to get the ABI0 (i.e., primary) address. func addrOfSysenter() uintptr // jumpToKernel jumps to the kernel version of the current RIP. func jumpToKernel() // jumpToUser jumps to the user version of the current RIP. func jumpToUser() // sysret returns to userspace from a system call. // // The return code is the vector that interrupted execution. // // See stubs.go for a note regarding the frame size of this function. func sysret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector // "iret is the cadillac of CPL switching." // // -- Neel Natu // // iret is nearly identical to sysret, except an iret is used to fully restore // all user state. This must be called in cases where all registers need to be // restored. func iret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector // exception is the generic exception entry. // // This is called by the individual stub definitions. func exception() // resume is a stub that restores the CPU kernel registers. // // This is used when processing kernel exceptions and syscalls. func resume() // start is the CPU entrypoint. // // See requirements below. func start() // AddrOfStart return the address of the CPU entrypoint. // // The following start conditions must be satisfied: // // - AX should contain the CPU pointer. // - c.GDT() should be loaded as the GDT. // - c.IDT() should be loaded as the IDT. // - c.CR0() should be the current CR0 value. // - c.CR3() should be set to the kernel PageTables. // - c.CR4() should be the current CR4 value. // - c.EFER() should be the current EFER value. // // The CPU state will be set to c.Registers(). // // In Go 1.17+, Go references to assembly functions resolve to an ABIInternal // wrapper function rather than the function itself. We must reference from // assembly to get the ABI0 (i.e., primary) address. func AddrOfStart() uintptr // Exception stubs. func divideByZero() func debug() func nmi() func breakpoint() func overflow() func boundRangeExceeded() func invalidOpcode() func deviceNotAvailable() func doubleFault() func coprocessorSegmentOverrun() func invalidTSS() func segmentNotPresent() func stackSegmentFault() func generalProtectionFault() func pageFault() func x87FloatingPointException() func alignmentCheck() func machineCheck() func simdFloatingPointException() func virtualizationException() func securityException() func syscallInt80() // These returns the start address of the functions above. // // In Go 1.17+, Go references to assembly functions resolve to an ABIInternal // wrapper function rather than the function itself. We must reference from // assembly to get the ABI0 (i.e., primary) address. 
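// exampleVectorAddress is an illustrative sketch (an assumption, not part of
// the original source): code that wires up an IDT gate needs the raw ABI0
// entry point of a stub, which is exactly what the addrOf* helpers below (and
// the handlers table built from them) provide, rather than the address of a
// Go function value.
func exampleVectorAddress() uintptr {
	return addrOfPageFault()
}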
func addrOfDivideByZero() uintptr func addrOfDebug() uintptr func addrOfNMI() uintptr func addrOfBreakpoint() uintptr func addrOfOverflow() uintptr func addrOfBoundRangeExceeded() uintptr func addrOfInvalidOpcode() uintptr func addrOfDeviceNotAvailable() uintptr func addrOfDoubleFault() uintptr func addrOfCoprocessorSegmentOverrun() uintptr func addrOfInvalidTSS() uintptr func addrOfSegmentNotPresent() uintptr func addrOfStackSegmentFault() uintptr func addrOfGeneralProtectionFault() uintptr func addrOfPageFault() uintptr func addrOfX87FloatingPointException() uintptr func addrOfAlignmentCheck() uintptr func addrOfMachineCheck() uintptr func addrOfSimdFloatingPointException() uintptr func addrOfVirtualizationException() uintptr func addrOfSecurityException() uintptr func addrOfSyscallInt80() uintptr // Exception handler index. var handlers = map[Vector]uintptr{ DivideByZero: addrOfDivideByZero(), Debug: addrOfDebug(), NMI: addrOfNMI(), Breakpoint: addrOfBreakpoint(), Overflow: addrOfOverflow(), BoundRangeExceeded: addrOfBoundRangeExceeded(), InvalidOpcode: addrOfInvalidOpcode(), DeviceNotAvailable: addrOfDeviceNotAvailable(), DoubleFault: addrOfDoubleFault(), CoprocessorSegmentOverrun: addrOfCoprocessorSegmentOverrun(), InvalidTSS: addrOfInvalidTSS(), SegmentNotPresent: addrOfSegmentNotPresent(), StackSegmentFault: addrOfStackSegmentFault(), GeneralProtectionFault: addrOfGeneralProtectionFault(), PageFault: addrOfPageFault(), X87FloatingPointException: addrOfX87FloatingPointException(), AlignmentCheck: addrOfAlignmentCheck(), MachineCheck: addrOfMachineCheck(), SIMDFloatingPointException: addrOfSimdFloatingPointException(), VirtualizationException: addrOfVirtualizationException(), SecurityException: addrOfSecurityException(), SyscallInt80: addrOfSyscallInt80(), } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/entry_amd64.s000066400000000000000000000630571465435605700227760ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "funcdata.h" #include "textflag.h" // CPU offsets. #define CPU_REGISTERS 72 // +checkoffset . CPU.registers #define CPU_FPU_STATE 288 // +checkoffset . CPU.floatingPointState #define CPU_ARCH_STATE 16 // +checkoffset . CPU.CPUArchState #define CPU_ERROR_CODE CPU_ARCH_STATE+0 // +checkoffset . CPUArchState.errorCode #define CPU_ERROR_TYPE CPU_ARCH_STATE+8 // +checkoffset . CPUArchState.errorType #define CPU_VECTOR CPU_ARCH_STATE+16 // +checkoffset . CPUArchState.vector #define CPU_FAULT_ADDR CPU_ARCH_STATE+24 // +checkoffset . CPUArchState.faultAddr #define CPU_ENTRY CPU_ARCH_STATE+32 // +checkoffset . CPUArchState.kernelEntry #define CPU_APP_GS_BASE CPU_ARCH_STATE+40 // +checkoffset . CPUArchState.appGsBase #define CPU_HAS_XSAVE CPU_ARCH_STATE+48 // +checkoffset . CPUArchState.hasXSAVE #define CPU_HAS_XSAVEOPT CPU_ARCH_STATE+49 // +checkoffset . CPUArchState.hasXSAVEOPT #define CPU_HAS_FSGSBASE CPU_ARCH_STATE+50 // +checkoffset . CPUArchState.hasFSGSBASE #define ENTRY_SCRATCH0 256 // +checkoffset . 
kernelEntry.scratch0 #define ENTRY_STACK_TOP 264 // +checkoffset . kernelEntry.stackTop #define ENTRY_CPU_SELF 272 // +checkoffset . kernelEntry.cpuSelf #define ENTRY_KERNEL_CR3 280 // +checkoffset . kernelEntry.kernelCR3 // Bits. #define _RFLAGS_IF 512 // +checkconst . _RFLAGS_IF #define _RFLAGS_IOPL0 4096 // +checkconst . _RFLAGS_IOPL0 #define _KERNEL_FLAGS 2 // +checkconst . KernelFlagsSet // Vectors. #define DivideByZero 0 // +checkconst . DivideByZero #define Debug 1 // +checkconst . Debug #define NMI 2 // +checkconst . NMI #define Breakpoint 3 // +checkconst . Breakpoint #define Overflow 4 // +checkconst . Overflow #define BoundRangeExceeded 5 // +checkconst . BoundRangeExceeded #define InvalidOpcode 6 // +checkconst . InvalidOpcode #define DeviceNotAvailable 7 // +checkconst . DeviceNotAvailable #define DoubleFault 8 // +checkconst . DoubleFault #define CoprocessorSegmentOverrun 9 // +checkconst . CoprocessorSegmentOverrun #define InvalidTSS 10 // +checkconst . InvalidTSS #define SegmentNotPresent 11 // +checkconst . SegmentNotPresent #define StackSegmentFault 12 // +checkconst . StackSegmentFault #define GeneralProtectionFault 13 // +checkconst . GeneralProtectionFault #define PageFault 14 // +checkconst . PageFault #define X87FloatingPointException 16 // +checkconst . X87FloatingPointException #define AlignmentCheck 17 // +checkconst . AlignmentCheck #define MachineCheck 18 // +checkconst . MachineCheck #define SIMDFloatingPointException 19 // +checkconst . SIMDFloatingPointException #define VirtualizationException 20 // +checkconst . VirtualizationException #define SecurityException 30 // +checkconst . SecurityException #define SyscallInt80 128 // +checkconst . SyscallInt80 #define Syscall 256 // +checkconst . Syscall #define PTRACE_R15 0 // +checkoffset linux PtraceRegs.R15 #define PTRACE_R14 8 // +checkoffset linux PtraceRegs.R14 #define PTRACE_R13 16 // +checkoffset linux PtraceRegs.R13 #define PTRACE_R12 24 // +checkoffset linux PtraceRegs.R12 #define PTRACE_RBP 32 // +checkoffset linux PtraceRegs.Rbp #define PTRACE_RBX 40 // +checkoffset linux PtraceRegs.Rbx #define PTRACE_R11 48 // +checkoffset linux PtraceRegs.R11 #define PTRACE_R10 56 // +checkoffset linux PtraceRegs.R10 #define PTRACE_R9 64 // +checkoffset linux PtraceRegs.R9 #define PTRACE_R8 72 // +checkoffset linux PtraceRegs.R8 #define PTRACE_RAX 80 // +checkoffset linux PtraceRegs.Rax #define PTRACE_RCX 88 // +checkoffset linux PtraceRegs.Rcx #define PTRACE_RDX 96 // +checkoffset linux PtraceRegs.Rdx #define PTRACE_RSI 104 // +checkoffset linux PtraceRegs.Rsi #define PTRACE_RDI 112 // +checkoffset linux PtraceRegs.Rdi #define PTRACE_ORIGRAX 120 // +checkoffset linux PtraceRegs.Orig_rax #define PTRACE_RIP 128 // +checkoffset linux PtraceRegs.Rip #define PTRACE_CS 136 // +checkoffset linux PtraceRegs.Cs #define PTRACE_FLAGS 144 // +checkoffset linux PtraceRegs.Eflags #define PTRACE_RSP 152 // +checkoffset linux PtraceRegs.Rsp #define PTRACE_SS 160 // +checkoffset linux PtraceRegs.Ss #define PTRACE_FS_BASE 168 // +checkoffset linux PtraceRegs.Fs_base #define PTRACE_GS_BASE 176 // +checkoffset linux PtraceRegs.Gs_base // The value for XCR0 is defined to xsave/xrstor everything except for PKRU and // AMX regions. // TODO(gvisor.dev/issues/9896): Implement AMX support. // TODO(gvisor.dev/issues/10087): Implement PKRU support. #define XCR0_DISABLED_MASK ((1 << 9) | (1 << 17) | (1 << 18)) #define XCR0_EAX (0xffffffff ^ XCR0_DISABLED_MASK) #define XCR0_EDX 0xffffffff // Saves a register set. 
// // This is a macro because it may need to executed in contents where a stack is // not available for calls. // // The following registers are not saved: AX, SP, IP, FLAGS, all segments. #define REGISTERS_SAVE(reg, offset) \ MOVQ R15, offset+PTRACE_R15(reg); \ MOVQ R14, offset+PTRACE_R14(reg); \ MOVQ R13, offset+PTRACE_R13(reg); \ MOVQ R12, offset+PTRACE_R12(reg); \ MOVQ BP, offset+PTRACE_RBP(reg); \ MOVQ BX, offset+PTRACE_RBX(reg); \ MOVQ CX, offset+PTRACE_RCX(reg); \ MOVQ DX, offset+PTRACE_RDX(reg); \ MOVQ R11, offset+PTRACE_R11(reg); \ MOVQ R10, offset+PTRACE_R10(reg); \ MOVQ R9, offset+PTRACE_R9(reg); \ MOVQ R8, offset+PTRACE_R8(reg); \ MOVQ SI, offset+PTRACE_RSI(reg); \ MOVQ DI, offset+PTRACE_RDI(reg); // Loads a register set. // // This is a macro because it may need to executed in contents where a stack is // not available for calls. // // The following registers are not loaded: AX, SP, IP, FLAGS, all segments. #define REGISTERS_LOAD(reg, offset) \ MOVQ offset+PTRACE_R15(reg), R15; \ MOVQ offset+PTRACE_R14(reg), R14; \ MOVQ offset+PTRACE_R13(reg), R13; \ MOVQ offset+PTRACE_R12(reg), R12; \ MOVQ offset+PTRACE_RBP(reg), BP; \ MOVQ offset+PTRACE_RBX(reg), BX; \ MOVQ offset+PTRACE_RCX(reg), CX; \ MOVQ offset+PTRACE_RDX(reg), DX; \ MOVQ offset+PTRACE_R11(reg), R11; \ MOVQ offset+PTRACE_R10(reg), R10; \ MOVQ offset+PTRACE_R9(reg), R9; \ MOVQ offset+PTRACE_R8(reg), R8; \ MOVQ offset+PTRACE_RSI(reg), SI; \ MOVQ offset+PTRACE_RDI(reg), DI; // WRITE_CR3() writes the given CR3 value. // // The code corresponds to: // // mov %rax, %cr3 // #define WRITE_CR3() \ BYTE $0x0f; BYTE $0x22; BYTE $0xd8; // SWAP_GS swaps the kernel GS (CPU). #define SWAP_GS() \ BYTE $0x0F; BYTE $0x01; BYTE $0xf8; // IRET returns from an interrupt frame. #define IRET() \ BYTE $0x48; BYTE $0xcf; // SYSRET64 executes the sysret instruction. #define SYSRET64() \ BYTE $0x48; BYTE $0x0f; BYTE $0x07; // LOAD_KERNEL_STACK loads the kernel stack. #define LOAD_KERNEL_STACK(entry) \ MOVQ ENTRY_STACK_TOP(entry), SP; // ADDR_OF_FUNC defines a function named 'name' that returns the address of // 'symbol'. #define ADDR_OF_FUNC(name, symbol) \ TEXT name,$0-8; \ MOVQ $symbol, AX; \ MOVQ AX, ret+0(FP); \ RET // See kernel.go. TEXT ·Halt(SB),NOSPLIT|NOFRAME,$0 HLT RET // See kernel_amd64.go. TEXT ·HaltAndWriteFSBase(SB),NOSPLIT,$8-8 HLT // Restore FS_BASE. MOVQ regs+0(FP), AX MOVQ PTRACE_FS_BASE(AX), AX PUSHQ AX // First argument (FS_BASE) CALL ·writeFS(SB) POPQ AX RET // jumpToKernel changes execution to the kernel address space. // // This works by changing the return value to the kernel version. TEXT ·jumpToKernel(SB),NOSPLIT|NOFRAME,$0 MOVQ 0(SP), AX ORQ ·KernelStartAddress(SB), AX // Future return value. MOVQ AX, 0(SP) RET // jumpToUser changes execution to the user address space. // // This works by changing the return value to the user version. TEXT ·jumpToUser(SB),NOSPLIT|NOFRAME,$0 // N.B. we can't access KernelStartAddress from the upper half (data // pages not available), so just naively clear all the upper bits. // We are assuming a 47-bit virtual address space. MOVQ $0x00007fffffffffff, AX MOVQ 0(SP), BX ANDQ BX, AX // Future return value. MOVQ AX, 0(SP) RET // See kernel_amd64.go. // // The 16-byte frame size is for the saved values of MXCSR and the x87 control // word. TEXT ·doSwitchToUser(SB),NOSPLIT,$16-48 // We are passed pointers to heap objects, but do not store them in our // local frame. 
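	// NO_LOCAL_POINTERS (from funcdata.h) attaches an empty locals pointer map
	// to this frame, so the garbage collector's stack scanner treats its local
	// slots as containing no live Go pointers.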
NO_LOCAL_POINTERS // MXCSR and the x87 control word are the only floating point state // that is callee-save and thus we must save. STMXCSR mxcsr-0(SP) FSTCW cw-8(SP) // Restore application floating point state. MOVQ cpu+0(FP), SI MOVQ fpState+16(FP), DI MOVB ·hasXSAVE(SB), BX TESTB BX, BX JZ no_xrstor // Use xrstor to restore all available fp state. MOVL $XCR0_EAX, AX MOVL $XCR0_EDX, DX BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f // XRSTOR64 0(DI) JMP fprestore_done no_xrstor: // Fall back to fxrstor if xsave is not available. FXRSTOR64 0(DI) fprestore_done: // Set application GS. MOVQ regs+8(FP), R8 SWAP_GS() MOVQ PTRACE_GS_BASE(R8), AX CMPQ AX, CPU_APP_GS_BASE(SI) JE skip_gs MOVQ AX, CPU_APP_GS_BASE(SI) PUSHQ AX CALL ·writeGS(SB) POPQ AX skip_gs: // Call sysret() or iret(). MOVQ userCR3+24(FP), CX MOVQ needIRET+32(FP), R9 ADDQ $-32, SP MOVQ SI, 0(SP) // cpu MOVQ R8, 8(SP) // regs MOVQ CX, 16(SP) // userCR3 TESTQ R9, R9 JNZ do_iret CALL ·sysret(SB) JMP done_sysret_or_iret do_iret: CALL ·iret(SB) done_sysret_or_iret: MOVQ 24(SP), AX // vector ADDQ $32, SP MOVQ AX, ret+40(FP) // Save application floating point state. MOVQ fpState+16(FP), DI MOVB ·hasXSAVE(SB), BX MOVB ·hasXSAVEOPT(SB), CX TESTB BX, BX JZ no_xsave // Use xsave/xsaveopt to save all extended state. MOVL $XCR0_EAX, AX MOVL $XCR0_EDX, DX TESTB CX, CX JZ no_xsaveopt BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI) JMP fpsave_done no_xsaveopt: BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI) JMP fpsave_done no_xsave: FXSAVE64 0(DI) fpsave_done: // Restore MXCSR and the x87 control word after one of the two floating // point save cases above, to ensure the application versions are saved // before being clobbered here. LDMXCSR mxcsr-0(SP) // FLDCW is a "waiting" x87 instruction, meaning it checks for pending // unmasked exceptions before executing. Thus if userspace has unmasked // an exception and has one pending, it can be raised by FLDCW even // though the new control word will mask exceptions. To prevent this, // we must first clear pending exceptions (which will be restored by // XRSTOR, et al). BYTE $0xDB; BYTE $0xE2; // FNCLEX FLDCW cw-8(SP) RET // See entry_amd64.go. TEXT ·sysret(SB),NOSPLIT|NOFRAME,$0-32 // Set application FS. We can't do this in Go because Go code needs FS. MOVQ regs+8(FP), AX MOVQ PTRACE_FS_BASE(AX), AX PUSHQ AX CALL ·writeFS(SB) POPQ AX CALL ·jumpToKernel(SB) // Save original state and stack. sysenter() or exception() // from APP(gr3) will switch to this stack, set the return // value (vector: 32(SP)) and then do RET, which will also // automatically return to the lower half. MOVQ cpu+0(FP), BX MOVQ regs+8(FP), AX MOVQ userCR3+16(FP), CX MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) // save SP AX userCR3 on the kernel stack. MOVQ CPU_ENTRY(BX), BX LOAD_KERNEL_STACK(BX) PUSHQ PTRACE_RSP(AX) PUSHQ PTRACE_RAX(AX) PUSHQ CX // Restore user register state. REGISTERS_LOAD(AX, 0) MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET. MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET. // restore userCR3, AX, SP. POPQ AX // Get userCR3. WRITE_CR3() // Switch to userCR3. POPQ AX // Restore AX. POPQ SP // Restore SP. SYSRET64() // sysenter or exception will write our return value and return to our // caller. // See entry_amd64.go. TEXT ·iret(SB),NOSPLIT|NOFRAME,$0-32 // Set application FS. We can't do this in Go because Go code needs FS. 
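	// The application's FS base is read from the saved user registers and
	// installed with writeFS below; Go code, which keeps its TLS in FS, cannot
	// safely run again until the kernel FS base is restored on the way back in.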
MOVQ regs+8(FP), AX MOVQ PTRACE_FS_BASE(AX), AX PUSHQ AX // First argument (FS_BASE) CALL ·writeFS(SB) POPQ AX CALL ·jumpToKernel(SB) // Save original state and stack. sysenter() or exception() // from APP(gr3) will switch to this stack, set the return // value (vector: 32(SP)) and then do RET, which will also // automatically return to the lower half. MOVQ cpu+0(FP), BX MOVQ regs+8(FP), AX MOVQ userCR3+16(FP), CX MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) // Build an IRET frame & restore state. MOVQ CPU_ENTRY(BX), BX LOAD_KERNEL_STACK(BX) PUSHQ PTRACE_SS(AX) PUSHQ PTRACE_RSP(AX) PUSHQ PTRACE_FLAGS(AX) PUSHQ PTRACE_CS(AX) PUSHQ PTRACE_RIP(AX) PUSHQ PTRACE_RAX(AX) // Save AX on kernel stack. PUSHQ CX // Save userCR3 on kernel stack. REGISTERS_LOAD(AX, 0) // Restore most registers. POPQ AX // Get userCR3. WRITE_CR3() // Switch to userCR3. POPQ AX // Restore AX. IRET() // sysenter or exception will write our return value and return to our // caller. // See entry_amd64.go. TEXT ·resume(SB),NOSPLIT|NOFRAME,$0 // See iret, above. MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. PUSHQ CPU_REGISTERS+PTRACE_SS(AX) PUSHQ CPU_REGISTERS+PTRACE_RSP(AX) PUSHQ CPU_REGISTERS+PTRACE_FLAGS(AX) PUSHQ CPU_REGISTERS+PTRACE_CS(AX) PUSHQ CPU_REGISTERS+PTRACE_RIP(AX) REGISTERS_LOAD(AX, CPU_REGISTERS) MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX IRET() // See entry_amd64.go. TEXT ·start(SB),NOSPLIT|NOFRAME,$0 // N.B. This is the vCPU entrypoint. It is not called from Go code and // thus pushes and pops values on the stack until calling into Go // (startGo) because we aren't usually a typical Go assembly frame. PUSHQ $0x0 // Previous frame pointer. MOVQ SP, BP // Set frame pointer. PUSHQ AX // Save CPU. // Set up environment required by Go before calling startGo: Go needs // FS_BASE and floating point initialized. MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX PUSHQ BX // First argument (FS_BASE) CALL ·writeFS(SB) POPQ BX MOVQ CPU_APP_GS_BASE(AX),BX PUSHQ BX CALL ·writeGS(SB) POPQ BX SWAP_GS() // First argument (CPU) already at bottom of stack. CALL ·startGo(SB) // Call Go hook. JMP ·resume(SB) // Restore to registers. ADDR_OF_FUNC(·AddrOfStart(SB), ·start(SB)); // See entry_amd64.go. TEXT ·sysenter(SB),NOSPLIT|NOFRAME,$0 // _RFLAGS_IOPL0 is always set in the user mode and it is never set in // the kernel mode. See the comment of UserFlagsSet for more details. TESTL $_RFLAGS_IOPL0, R11 JZ kernel user: SWAP_GS() MOVQ AX, ENTRY_SCRATCH0(GS) // Save user AX on scratch. MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. WRITE_CR3() // Switch to kernel cr3. MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs. REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. MOVQ CX, PTRACE_RIP(AX) MOVQ R11, PTRACE_FLAGS(AX) MOVQ SP, PTRACE_RSP(AX) MOVQ ENTRY_SCRATCH0(GS), CX // Load saved user AX value. MOVQ CX, PTRACE_RAX(AX) // Save everything else. MOVQ CX, PTRACE_ORIGRAX(AX) CMPB CPU_HAS_FSGSBASE(GS), $1 JNE sysenter_skip_gs SWAP_GS() BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xcb; // rdgsbase rbx MOVQ BX, PTRACE_GS_BASE(AX) SWAP_GS() sysenter_skip_gs: MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Get stacks. MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. CALL ·jumpToUser(SB) // Restore kernel FS_BASE. MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. 
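	// CPU.registers holds the sentry's own FS base; writeFS below reinstalls it
	// so that it is safe to call back into Go code.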
MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX PUSHQ BX // First argument (FS_BASE) CALL ·writeFS(SB) POPQ BX MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. // Return to the kernel, where the frame is: // // vector (sp+32) // userCR3 (sp+24) // regs (sp+16) // cpu (sp+8) // vcpu.Switch (sp+0) // MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer. MOVQ $Syscall, 32(SP) // Output vector. RET kernel: // We can't restore the original stack, but we can access the registers // in the CPU state directly. No need for temporary juggling. MOVQ AX, ENTRY_SCRATCH0(GS) MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. REGISTERS_SAVE(AX, CPU_REGISTERS) MOVQ CX, CPU_REGISTERS+PTRACE_RIP(AX) MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX) MOVQ SP, CPU_REGISTERS+PTRACE_RSP(AX) MOVQ ENTRY_SCRATCH0(GS), BX MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX) MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX) MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. MOVQ $0xffffffffffffffff, CPU_VECTOR(AX) // Set error type to kernel. // Save floating point state. CPU.floatingPointState is a slice, so the // first word of CPU.floatingPointState is a pointer to the destination // array. MOVQ CPU_FPU_STATE(AX), DI MOVB CPU_HAS_XSAVE(AX), BX MOVB CPU_HAS_XSAVEOPT(AX), CX TESTB BX, BX JZ no_xsave // Use xsave/xsaveopt to save all extended state. MOVL $XCR0_EAX, AX MOVL $XCR0_EDX, DX TESTB CX, CX JZ no_xsaveopt BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI) JMP fpsave_done no_xsaveopt: BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI) JMP fpsave_done no_xsave: FXSAVE64 0(DI) fpsave_done: // Call the syscall trampoline. LOAD_KERNEL_STACK(GS) MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU. PUSHQ AX // First argument (vCPU). CALL ·kernelSyscall(SB) // Call the trampoline. POPQ AX // Pop vCPU. // We only trigger a bluepill entry in the bluepill function, and can // therefore be guaranteed that there is no floating point state to be // loaded on resuming from halt. JMP ·resume(SB) ADDR_OF_FUNC(·addrOfSysenter(SB), ·sysenter(SB)); // exception is a generic exception handler. // // There are two cases handled: // // 1) An exception in kernel mode: this results in saving the state at the time // of the exception and calling the defined hook. // // 2) An exception in guest mode: the original kernel frame is restored, and // the vector & error codes are pushed as return values. // // See below for the stubs that call exception. TEXT ·exception(SB),NOSPLIT|NOFRAME,$0 // Determine whether the exception occurred in kernel mode or user // mode, based on the flags. We expect the following stack: // // SS (sp+48) // SP (sp+40) // FLAGS (sp+32) // CS (sp+24) // IP (sp+16) // ERROR_CODE (sp+8) // VECTOR (sp+0) // TESTL $_RFLAGS_IOPL0, 32(SP) JZ kernel user: SWAP_GS() ADDQ $-8, SP // Adjust for flags. MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ). PUSHQ AX // Save user AX on stack. MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. WRITE_CR3() // Switch to kernel cr3. MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs. REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. POPQ BX // Restore original AX. MOVQ BX, PTRACE_RAX(AX) // Save it. 
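	// Orig_rax is kept equal to the value RAX had on entry, mirroring what the
	// syscall entry path records.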
MOVQ BX, PTRACE_ORIGRAX(AX) CMPB CPU_HAS_FSGSBASE(GS), $1 JNE exception_skip_gs SWAP_GS() BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xcb; // rdgsbase rbx MOVQ BX, PTRACE_GS_BASE(AX) SWAP_GS() exception_skip_gs: MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX) MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX) MOVQ 32(SP), DX; MOVQ DX, PTRACE_FLAGS(AX) MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX) MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX) CALL ·jumpToUser(SB) // Restore kernel FS_BASE. MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX PUSHQ BX // First argument (FS_BASE) CALL ·writeFS(SB) POPQ BX // Copy out and return. MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ 0(SP), BX // Load vector. MOVQ 8(SP), CX // Load error code. MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Original stack (kernel version). MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer. MOVQ CX, CPU_ERROR_CODE(AX) // Set error code. MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. MOVQ BX, 32(SP) // Output vector. RET kernel: // As per above, we can save directly. PUSHQ AX MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. REGISTERS_SAVE(AX, CPU_REGISTERS) POPQ BX MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX) MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX) MOVQ 16(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RIP(AX) MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(AX) MOVQ 40(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RSP(AX) // Set the error code and adjust the stack. MOVQ 8(SP), BX // Load the error code. MOVQ BX, CPU_ERROR_CODE(AX) // Copy out to the CPU. MOVQ 0(SP), BX // Load the error code. MOVQ BX, CPU_VECTOR(AX) // Copy out to the CPU. BYTE $0x0f; BYTE $0x20; BYTE $0xd3; // MOV CR2, RBX MOVQ BX, CPU_FAULT_ADDR(AX) MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. // Save floating point state. CPU.floatingPointState is a slice, so the // first word of CPU.floatingPointState is a pointer to the destination // array. MOVQ CPU_FPU_STATE(AX), DI MOVB CPU_HAS_XSAVE(AX), BX MOVB CPU_HAS_XSAVEOPT(AX), CX TESTB BX, BX JZ no_xsave // Use xsave/xsaveopt to save all extended state. MOVL $XCR0_EAX, AX MOVL $XCR0_EDX, DX TESTB CX, CX JZ no_xsaveopt BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI) JMP fpsave_done no_xsaveopt: BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI) JMP fpsave_done no_xsave: FXSAVE64 0(DI) fpsave_done: // Call the exception trampoline. MOVQ 0(SP), BX // BX contains the vector. LOAD_KERNEL_STACK(GS) MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU. PUSHQ BX // Second argument (vector). PUSHQ AX // First argument (vCPU). CALL ·kernelException(SB) // Call the trampoline. POPQ BX // Pop vector. POPQ AX // Pop vCPU. // We only trigger a bluepill entry in the bluepill function, and can // therefore be guaranteed that there is no floating point state to be // loaded on resuming from halt. 
JMP ·resume(SB) #define EXCEPTION_WITH_ERROR(value, symbol, addr) \ ADDR_OF_FUNC(addr, symbol); \ TEXT symbol,NOSPLIT|NOFRAME,$0; \ PUSHQ $value; \ JMP ·exception(SB); #define EXCEPTION_WITHOUT_ERROR(value, symbol, addr) \ ADDR_OF_FUNC(addr, symbol); \ TEXT symbol,NOSPLIT|NOFRAME,$0; \ PUSHQ $0x0; \ PUSHQ $value; \ JMP ·exception(SB); EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB), ·addrOfDivideByZero(SB)) EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB), ·addrOfDebug(SB)) EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB), ·addrOfNMI(SB)) EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB), ·addrOfBreakpoint(SB)) EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB), ·addrOfOverflow(SB)) EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB), ·addrOfBoundRangeExceeded(SB)) EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB), ·addrOfInvalidOpcode(SB)) EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB), ·addrOfDeviceNotAvailable(SB)) EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB), ·addrOfDoubleFault(SB)) EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB), ·addrOfCoprocessorSegmentOverrun(SB)) EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB), ·addrOfInvalidTSS(SB)) EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB), ·addrOfSegmentNotPresent(SB)) EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB), ·addrOfStackSegmentFault(SB)) EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB), ·addrOfGeneralProtectionFault(SB)) EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB), ·addrOfPageFault(SB)) EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB), ·addrOfX87FloatingPointException(SB)) EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB), ·addrOfAlignmentCheck(SB)) EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB), ·addrOfMachineCheck(SB)) EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB), ·addrOfSimdFloatingPointException(SB)) EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB), ·addrOfVirtualizationException(SB)) EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB), ·addrOfSecurityException(SB)) EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB), ·addrOfSyscallInt80(SB)) golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/entry_arm64.go000066400000000000000000000030471465435605700231500ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package ring0 // This is an assembly function. // // The sysenter function is invoked in two situations: // // (1) The guest kernel has executed a system call. // (2) The guest application has executed a system call. // // The interrupt flag is examined to determine whether the system call was // executed from kernel mode or not and the appropriate stub is called. 
func El1_sync_invalid() func El1_irq_invalid() func El1_fiq_invalid() func El1_error_invalid() func El1_sync() func El1_irq() func El1_fiq() func El1_error() func El0_sync() func El0_irq() func El0_fiq() func El0_error() func El0_sync_invalid() func El0_irq_invalid() func El0_fiq_invalid() func El0_error_invalid() func vectors() func AddrOfVectors() uintptr // start is the CPU entrypoint. // // The CPU state will be set to c.Registers(). func start() func AddrOfStart() uintptr func kernelExitToEl1() func kernelExitToEl0() // Shutdown execution func Shutdown() golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/entry_arm64.s000066400000000000000000000657511465435605700230170ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "funcdata.h" #include "textflag.h" #define CPU_SELF 0 // +checkoffset . CPU.self #define CPU_REGISTERS 224 // +checkoffset . CPU.registers #define CPU_ARCH_STATE 16 // +checkoffset . CPU.CPUArchState #define CPU_STACK_BOTTOM CPU_ARCH_STATE+0 // +checkoffset . CPUArchState.stack #define CPU_STACK_TOP CPU_STACK_BOTTOM+128 // +checksize . CPUArchState.stack #define CPU_ERROR_CODE CPU_ARCH_STATE+128 // +checkoffset . CPUArchState.errorCode #define CPU_ERROR_TYPE CPU_ARCH_STATE+136 // +checkoffset . CPUArchState.errorType #define CPU_FAULT_ADDR CPU_ARCH_STATE+144 // +checkoffset . CPUArchState.faultAddr #define CPU_FPSTATE_EL0 CPU_ARCH_STATE+152 // +checkoffset . CPUArchState.el0Fp #define CPU_TTBR0_KVM CPU_ARCH_STATE+160 // +checkoffset . CPUArchState.ttbr0Kvm #define CPU_TTBR0_APP CPU_ARCH_STATE+168 // +checkoffset . CPUArchState.ttbr0App #define CPU_VECTOR_CODE CPU_ARCH_STATE+176 // +checkoffset . CPUArchState.vecCode #define CPU_APP_ADDR CPU_ARCH_STATE+184 // +checkoffset . CPUArchState.appAddr #define CPU_LAZY_VFP CPU_ARCH_STATE+192 // +checkoffset . CPUArchState.lazyVFP #define CPU_APP_ASID CPU_ARCH_STATE+200 // +checkoffset . CPUArchState.appASID // Bits. #define _KERNEL_FLAGS 965 // +checkconst . KernelFlagsSet // Vectors. #define El1Sync 4 // +checkconst . El1Sync #define El1Irq 5 // +checkconst . El1Irq #define El1Fiq 6 // +checkconst . El1Fiq #define El1Err 7 // +checkconst . El1Err #define El0Sync 8 // +checkconst . El0Sync #define El0Irq 9 // +checkconst . El0Irq #define El0Fiq 10 // +checkconst . El0Fiq #define El0Err 11 // +checkconst . El0Err #define El1SyncDa 16 // +checkconst . El1SyncDa #define El1SyncIa 17 // +checkconst . El1SyncIa #define El1SyncSpPc 18 // +checkconst . El1SyncSpPc #define El1SyncUndef 19 // +checkconst . El1SyncUndef #define El1SyncDbg 20 // +checkconst . El1SyncDbg #define El1SyncInv 21 // +checkconst . El1SyncInv #define El0SyncSVC 22 // +checkconst . El0SyncSVC #define El0SyncDa 23 // +checkconst . El0SyncDa #define El0SyncIa 24 // +checkconst . El0SyncIa #define El0SyncFpsimdAcc 25 // +checkconst . El0SyncFpsimdAcc #define El0SyncSveAcc 26 // +checkconst . El0SyncSveAcc #define El0SyncFpsimdExc 27 // +checkconst . El0SyncFpsimdExc #define El0SyncSys 28 // +checkconst . 
El0SyncSys #define El0SyncSpPc 29 // +checkconst . El0SyncSpPc #define El0SyncUndef 30 // +checkconst . El0SyncUndef #define El0SyncDbg 31 // +checkconst . El0SyncDbg #define El0SyncWfx 32 // +checkconst . El0SyncWfx #define El0SyncInv 33 // +checkconst . El0SyncInv #define El0ErrNMI 34 // +checkconst . El0ErrNMI #define PageFault 23 // +checkconst . PageFault #define Syscall 22 // +checkconst . Syscall #define VirtualizationException 35 // +checkconst . VirtualizationException #define PTRACE_REGS 0 // +checkoffset linux PtraceRegs.Regs #define PTRACE_R0 (PTRACE_REGS + 0*8) #define PTRACE_R1 (PTRACE_REGS + 1*8) #define PTRACE_R2 (PTRACE_REGS + 2*8) #define PTRACE_R3 (PTRACE_REGS + 3*8) #define PTRACE_R4 (PTRACE_REGS + 4*8) #define PTRACE_R5 (PTRACE_REGS + 5*8) #define PTRACE_R6 (PTRACE_REGS + 6*8) #define PTRACE_R7 (PTRACE_REGS + 7*8) #define PTRACE_R8 (PTRACE_REGS + 8*8) #define PTRACE_R9 (PTRACE_REGS + 9*8) #define PTRACE_R10 (PTRACE_REGS + 10*8) #define PTRACE_R11 (PTRACE_REGS + 11*8) #define PTRACE_R12 (PTRACE_REGS + 12*8) #define PTRACE_R13 (PTRACE_REGS + 13*8) #define PTRACE_R14 (PTRACE_REGS + 14*8) #define PTRACE_R15 (PTRACE_REGS + 15*8) #define PTRACE_R16 (PTRACE_REGS + 16*8) #define PTRACE_R17 (PTRACE_REGS + 17*8) #define PTRACE_R18 (PTRACE_REGS + 18*8) #define PTRACE_R19 (PTRACE_REGS + 19*8) #define PTRACE_R20 (PTRACE_REGS + 20*8) #define PTRACE_R21 (PTRACE_REGS + 21*8) #define PTRACE_R22 (PTRACE_REGS + 22*8) #define PTRACE_R23 (PTRACE_REGS + 23*8) #define PTRACE_R24 (PTRACE_REGS + 24*8) #define PTRACE_R25 (PTRACE_REGS + 25*8) #define PTRACE_R26 (PTRACE_REGS + 26*8) #define PTRACE_R27 (PTRACE_REGS + 27*8) #define PTRACE_R28 (PTRACE_REGS + 28*8) #define PTRACE_R29 (PTRACE_REGS + 29*8) #define PTRACE_R30 (PTRACE_REGS + 30*8) #define PTRACE_SP 248 // +checkoffset linux PtraceRegs.Sp #define PTRACE_PC 256 // +checkoffset linux PtraceRegs.Pc #define PTRACE_PSTATE 264 // +checkoffset linux PtraceRegs.Pstate #define PTRACE_TLS 272 // +checkoffset arch Registers.TPIDR_EL0 // Saves a register set. // // This is a macro because it may need to executed in contents where a stack is // not available for calls. // // ERET returns using the ELR and SPSR for the current exception level. #define ERET() \ WORD $0xd69f03e0; \ DSB $7; \ ISB $15; // RSV_REG is a register that holds el1 information temporarily. #define RSV_REG R18_PLATFORM // RSV_REG_APP is a register that holds el0 information temporarily. #define RSV_REG_APP R19 #define FPEN_NOTRAP 0x3 #define FPEN_SHIFT 20 #define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT) // Saves a register set. // // This is a macro because it may need to executed in contents where a stack is // not available for calls. // // The following registers are not saved: R18, R19. #define REGISTERS_SAVE(reg, offset) \ STP (R0, R1), offset+PTRACE_R0(reg); \ STP (R2, R3), offset+PTRACE_R2(reg); \ STP (R4, R5), offset+PTRACE_R4(reg); \ STP (R6, R7), offset+PTRACE_R6(reg); \ STP (R8, R9), offset+PTRACE_R8(reg); \ STP (R10, R11), offset+PTRACE_R10(reg); \ STP (R12, R13), offset+PTRACE_R12(reg); \ STP (R14, R15), offset+PTRACE_R14(reg); \ STP (R16, R17), offset+PTRACE_R16(reg); \ STP (R20, R21), offset+PTRACE_R20(reg); \ STP (R22, R23), offset+PTRACE_R22(reg); \ STP (R24, R25), offset+PTRACE_R24(reg); \ STP (R26, R27), offset+PTRACE_R26(reg); \ STP (g, R29), offset+PTRACE_R28(reg); \ MOVD R30, offset+PTRACE_R30(reg); // Loads a register set. // // This is a macro because it may need to executed in contents where a stack is // not available for calls. 
// // The following registers are not loaded: R18, R19. #define REGISTERS_LOAD(reg, offset) \ LDP offset+PTRACE_R0(reg), (R0, R1); \ LDP offset+PTRACE_R2(reg), (R2, R3); \ LDP offset+PTRACE_R4(reg), (R4, R5); \ LDP offset+PTRACE_R6(reg), (R6, R7); \ LDP offset+PTRACE_R8(reg), (R8, R9); \ LDP offset+PTRACE_R10(reg), (R10, R11); \ LDP offset+PTRACE_R12(reg), (R12, R13); \ LDP offset+PTRACE_R14(reg), (R14, R15); \ LDP offset+PTRACE_R16(reg), (R16, R17); \ LDP offset+PTRACE_R20(reg), (R20, R21); \ LDP offset+PTRACE_R22(reg), (R22, R23); \ LDP offset+PTRACE_R24(reg), (R24, R25); \ LDP offset+PTRACE_R26(reg), (R26, R27); \ LDP offset+PTRACE_R28(reg), (g, R29); \ MOVD offset+PTRACE_R30(reg), R30; // Loads the application's fpstate. #define FPSTATE_EL0_LOAD() \ MRS TPIDR_EL1, RSV_REG; \ MOVD CPU_FPSTATE_EL0(RSV_REG), RSV_REG; \ MOVD 0(RSV_REG), RSV_REG_APP; \ MOVD RSV_REG_APP, FPSR; \ MOVD 8(RSV_REG), RSV_REG_APP; \ MOVD RSV_REG_APP, FPCR; \ ADD $16, RSV_REG, RSV_REG; \ WORD $0xad400640; \ // ldp q0, q1, [x18] WORD $0xad410e42; \ WORD $0xad421644; \ WORD $0xad431e46; \ WORD $0xad442648; \ WORD $0xad452e4a; \ WORD $0xad46364c; \ WORD $0xad473e4e; \ WORD $0xad484650; \ WORD $0xad494e52; \ WORD $0xad4a5654; \ WORD $0xad4b5e56; \ WORD $0xad4c6658; \ WORD $0xad4d6e5a; \ WORD $0xad4e765c; \ WORD $0xad4f7e5e; #define ESR_ELx_EC_UNKNOWN (0x00) #define ESR_ELx_EC_WFx (0x01) /* Unallocated EC: 0x02 */ #define ESR_ELx_EC_CP15_32 (0x03) #define ESR_ELx_EC_CP15_64 (0x04) #define ESR_ELx_EC_CP14_MR (0x05) #define ESR_ELx_EC_CP14_LS (0x06) #define ESR_ELx_EC_FP_ASIMD (0x07) #define ESR_ELx_EC_CP10_ID (0x08) /* EL2 only */ #define ESR_ELx_EC_PAC (0x09) /* EL2 and above */ /* Unallocated EC: 0x0A - 0x0B */ #define ESR_ELx_EC_CP14_64 (0x0C) /* Unallocated EC: 0x0d */ #define ESR_ELx_EC_ILL (0x0E) /* Unallocated EC: 0x0F - 0x10 */ #define ESR_ELx_EC_SVC32 (0x11) #define ESR_ELx_EC_HVC32 (0x12) /* EL2 only */ #define ESR_ELx_EC_SMC32 (0x13) /* EL2 and above */ /* Unallocated EC: 0x14 */ #define ESR_ELx_EC_SVC64 (0x15) #define ESR_ELx_EC_HVC64 (0x16) /* EL2 and above */ #define ESR_ELx_EC_SMC64 (0x17) /* EL2 and above */ #define ESR_ELx_EC_SYS64 (0x18) #define ESR_ELx_EC_SVE (0x19) /* Unallocated EC: 0x1A - 0x1E */ #define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */ #define ESR_ELx_EC_IABT_LOW (0x20) #define ESR_ELx_EC_IABT_CUR (0x21) #define ESR_ELx_EC_PC_ALIGN (0x22) /* Unallocated EC: 0x23 */ #define ESR_ELx_EC_DABT_LOW (0x24) #define ESR_ELx_EC_DABT_CUR (0x25) #define ESR_ELx_EC_SP_ALIGN (0x26) /* Unallocated EC: 0x27 */ #define ESR_ELx_EC_FP_EXC32 (0x28) /* Unallocated EC: 0x29 - 0x2B */ #define ESR_ELx_EC_FP_EXC64 (0x2C) /* Unallocated EC: 0x2D - 0x2E */ #define ESR_ELx_EC_SERROR (0x2F) #define ESR_ELx_EC_BREAKPT_LOW (0x30) #define ESR_ELx_EC_BREAKPT_CUR (0x31) #define ESR_ELx_EC_SOFTSTP_LOW (0x32) #define ESR_ELx_EC_SOFTSTP_CUR (0x33) #define ESR_ELx_EC_WATCHPT_LOW (0x34) #define ESR_ELx_EC_WATCHPT_CUR (0x35) /* Unallocated EC: 0x36 - 0x37 */ #define ESR_ELx_EC_BKPT32 (0x38) /* Unallocated EC: 0x39 */ #define ESR_ELx_EC_VECTOR32 (0x3A) /* EL2 only */ /* Unallocted EC: 0x3B */ #define ESR_ELx_EC_BRK64 (0x3C) /* Unallocated EC: 0x3D - 0x3F */ #define ESR_ELx_EC_MAX (0x3F) #define ESR_ELx_EC_SHIFT (26) #define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) #define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) #define ESR_ELx_IL_SHIFT (25) #define ESR_ELx_IL (UL(1) << ESR_ELx_IL_SHIFT) #define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) /* ISS field definitions shared by different classes */ #define 
ESR_ELx_WNR_SHIFT (6) #define ESR_ELx_WNR (UL(1) << ESR_ELx_WNR_SHIFT) /* Asynchronous Error Type */ #define ESR_ELx_IDS_SHIFT (24) #define ESR_ELx_IDS (UL(1) << ESR_ELx_IDS_SHIFT) #define ESR_ELx_AET_SHIFT (10) #define ESR_ELx_AET (UL(0x7) << ESR_ELx_AET_SHIFT) #define ESR_ELx_AET_UC (UL(0) << ESR_ELx_AET_SHIFT) #define ESR_ELx_AET_UEU (UL(1) << ESR_ELx_AET_SHIFT) #define ESR_ELx_AET_UEO (UL(2) << ESR_ELx_AET_SHIFT) #define ESR_ELx_AET_UER (UL(3) << ESR_ELx_AET_SHIFT) #define ESR_ELx_AET_CE (UL(6) << ESR_ELx_AET_SHIFT) /* Shared ISS field definitions for Data/Instruction aborts */ #define ESR_ELx_SET_SHIFT (11) #define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT) #define ESR_ELx_FnV_SHIFT (10) #define ESR_ELx_FnV (UL(1) << ESR_ELx_FnV_SHIFT) #define ESR_ELx_EA_SHIFT (9) #define ESR_ELx_EA (UL(1) << ESR_ELx_EA_SHIFT) #define ESR_ELx_S1PTW_SHIFT (7) #define ESR_ELx_S1PTW (UL(1) << ESR_ELx_S1PTW_SHIFT) /* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */ #define ESR_ELx_FSC (0x3F) #define ESR_ELx_FSC_TYPE (0x3C) #define ESR_ELx_FSC_EXTABT (0x10) #define ESR_ELx_FSC_SERROR (0x11) #define ESR_ELx_FSC_ACCESS (0x08) #define ESR_ELx_FSC_FAULT (0x04) #define ESR_ELx_FSC_PERM (0x0C) /* ISS field definitions for Data Aborts */ #define ESR_ELx_ISV_SHIFT (24) #define ESR_ELx_ISV (UL(1) << ESR_ELx_ISV_SHIFT) #define ESR_ELx_SAS_SHIFT (22) #define ESR_ELx_SAS (UL(3) << ESR_ELx_SAS_SHIFT) #define ESR_ELx_SSE_SHIFT (21) #define ESR_ELx_SSE (UL(1) << ESR_ELx_SSE_SHIFT) #define ESR_ELx_SRT_SHIFT (16) #define ESR_ELx_SRT_MASK (UL(0x1F) << ESR_ELx_SRT_SHIFT) #define ESR_ELx_SF_SHIFT (15) #define ESR_ELx_SF (UL(1) << ESR_ELx_SF_SHIFT) #define ESR_ELx_AR_SHIFT (14) #define ESR_ELx_AR (UL(1) << ESR_ELx_AR_SHIFT) #define ESR_ELx_CM_SHIFT (8) #define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) /* ISS field definitions for exceptions taken in to Hyp */ #define ESR_ELx_CV (UL(1) << 24) #define ESR_ELx_COND_SHIFT (20) #define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) #define ESR_ELx_WFx_ISS_TI (UL(1) << 0) #define ESR_ELx_WFx_ISS_WFI (UL(0) << 0) #define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) #define ESR_ELx_xVC_IMM_MASK ((1UL << 16) - 1) /* ISS field definitions for system error */ #define ESR_ELx_SERR_MASK (0x1) #define ESR_ELx_SERR_NMI (0x1) // LOAD_KERNEL_ADDRESS loads a kernel address. #define LOAD_KERNEL_ADDRESS(from, to) \ MOVD from, to; \ ORR $0xffff000000000000, to, to; // LOAD_KERNEL_STACK loads the kernel temporary stack. #define LOAD_KERNEL_STACK(from) \ LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \ MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \ MOVD RSV_REG, RSP; \ WORD $0xd538d092; //MRS TPIDR_EL1, R18 // SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application. #define SWITCH_TO_APP_PAGETABLE() \ MOVD CPU_APP_ASID(RSV_REG), RSV_REG_APP; \ MOVD CPU_TTBR0_APP(RSV_REG), RSV_REG; \ BFI $48, RSV_REG_APP, $16, RSV_REG; \ MSR RSV_REG, TTBR0_EL1; \ ISB $15; // SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable. #define SWITCH_TO_KVM_PAGETABLE() \ MOVD CPU_TTBR0_KVM(RSV_REG), RSV_REG; \ MOVD $1, RSV_REG_APP; \ BFI $48, RSV_REG_APP, $16, RSV_REG; \ MSR RSV_REG, TTBR0_EL1; \ ISB $15; // FPSIMDDisableTrap disables the trap for accessing fpsimd. TEXT ·FPSIMDDisableTrap(SB),NOSPLIT,$0 MOVD $FPEN_ENABLE, R0 MSR R0, CPACR_EL1 ISB $15 RET // FPSIMDEnableTrap enables the trap for accessing fpsimd. TEXT ·FPSIMDEnableTrap(SB),NOSPLIT,$0 MSR $0, CPACR_EL1 ISB $15 RET // FPSIMD_DISABLE_TRAP disables the trap for accessing fpsimd. 
#define FPSIMD_DISABLE_TRAP(reg) \ MOVD $FPEN_ENABLE, reg; \ MSR reg, CPACR_EL1; \ ISB $15; // FPSIMD_ENABLE_TRAP enables the trap for accessing fpsimd. #define FPSIMD_ENABLE_TRAP(reg) \ MSR $0, CPACR_EL1; \ ISB $15; // KERNEL_ENTRY_FROM_EL0 is the entry code of the vcpu from el0 to el1. #define KERNEL_ENTRY_FROM_EL0 \ SUB $16, RSP, RSP; \ // step1, save r18, r19 into kernel temporary stack. STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \ WORD $0xd538d092; \ // MRS TPIDR_EL1, R18 MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step2, load app context pointer. REGISTERS_SAVE(RSV_REG_APP, 0); \ // step3, save app context. MOVD RSV_REG_APP, R20; \ LDP 16*0(RSP), (RSV_REG, RSV_REG_APP); \ ADD $16, RSP, RSP; \ STP (RSV_REG, RSV_REG_APP), PTRACE_R18(R20); \ MRS TPIDR_EL0, R3; \ MOVD R3, PTRACE_TLS(R20); \ WORD $0xd5384003; \ // MRS SPSR_EL1, R3 MOVD R3, PTRACE_PSTATE(R20); \ MRS ELR_EL1, R3; \ MOVD R3, PTRACE_PC(R20); \ WORD $0xd5384103; \ // MRS SP_EL0, R3 MOVD R3, PTRACE_SP(R20); // KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1. #define KERNEL_ENTRY_FROM_EL1 \ WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \ // Save sentry context. MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R19(RSV_REG); \ MRS TPIDR_EL0, R4; \ MOVD R4, CPU_REGISTERS+PTRACE_TLS(RSV_REG); \ WORD $0xd5384004; \ // MRS SPSR_EL1, R4 MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \ MRS ELR_EL1, R4; \ MOVD R4, CPU_REGISTERS+PTRACE_PC(RSV_REG); \ MOVD RSP, R4; \ MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \ LOAD_KERNEL_STACK(RSV_REG); // Load the temporary stack. // EXCEPTION_EL0 is a common el0 exception handler function. #define EXCEPTION_EL0(vector) \ WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 WORD $0xd538601a; \ //MRS FAR_EL1, R26 MOVD R26, CPU_FAULT_ADDR(RSV_REG); \ MOVD $1, R3; \ MOVD R3, CPU_ERROR_TYPE(RSV_REG); \ // Set error type to user. MOVD $vector, R3; \ MOVD R3, CPU_VECTOR_CODE(RSV_REG); \ MRS ESR_EL1, R3; \ MOVD R3, CPU_ERROR_CODE(RSV_REG); \ B ·kernelExitToEl1(SB); // EXCEPTION_EL1 is a common el1 exception handler function. #define EXCEPTION_EL1(vector) \ MOVD $vector, R3; \ MOVD R3, 8(RSP); \ B ·HaltEl1ExceptionAndResume(SB); // storeEl0Fpstate writes the address of application's fpstate. TEXT ·storeEl0Fpstate(SB),NOSPLIT,$0-8 MOVD value+0(FP), R1 ORR $0xffff000000000000, R1, R1 MRS TPIDR_EL1, RSV_REG MOVD R1, CPU_FPSTATE_EL0(RSV_REG) RET // storeAppASID writes the application's asid value. TEXT ·storeAppASID(SB),NOSPLIT,$0-8 MOVD asid+0(FP), R1 MRS TPIDR_EL1, RSV_REG MOVD R1, CPU_APP_ASID(RSV_REG) RET // Halt halts execution. TEXT ·Halt(SB),NOSPLIT,$0 // Disable fpsimd. WORD $0xd5381041 // MRS CPACR_EL1, R1 MOVD R1, CPU_LAZY_VFP(RSV_REG) DSB $15 FPSIMD_ENABLE_TRAP(RSV_REG) // Trigger MMIO_EXIT/_KVM_HYPERCALL_VMEXIT. // // To keep it simple, I used the address of exception table as the // MMIO base address, so that I can trigger a MMIO-EXIT by forcibly writing // a read-only space. // Also, the length is engough to match a sufficient number of hypercall ID. // Then, in host user space, I can calculate this address to find out // which hypercall. MRS VBAR_EL1, R9 MOVD R0, 0x0(R9) RET // HaltAndResume halts execution and point the pointer to the resume function. TEXT ·HaltAndResume(SB),NOSPLIT,$0 BL ·Halt(SB) B ·kernelExitToEl1(SB) // Resume. // HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume. TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0 WORD $0xd538d092 // MRS TPIDR_EL1, R18 MOVD CPU_SELF(RSV_REG), R3 // Load vCPU. MOVD R3, 8(RSP) // First argument (vCPU). 
CALL ·kernelSyscall(SB) // Call the trampoline. B ·kernelExitToEl1(SB) // Resume. // HaltEl1ExceptionAndResume calls Hooks.KernelException and resume. TEXT ·HaltEl1ExceptionAndResume(SB),NOSPLIT,$0 WORD $0xd538d092 // MRS TPIDR_EL1, R18 MOVD CPU_SELF(RSV_REG), R3 // Load vCPU. MOVD R3, 8(RSP) // First argument (vCPU). MOVD vector+0(FP), R3 MOVD R3, 16(RSP) // Second argument (vector). CALL ·kernelException(SB) // Call the trampoline. B ·kernelExitToEl1(SB) // Resume. // Shutdown stops the guest. TEXT ·Shutdown(SB),NOSPLIT,$0 // PSCI EVENT. MOVD $0x84000009, R0 HVC $0 #define STACK_FRAME_SIZE 32 // kernelExitToEl0 is the entrypoint for application in guest_el0. // Prepare the vcpu environment for container application. TEXT ·kernelExitToEl0(SB),NOSPLIT,$0 // Step1, save sentry context into memory. MRS TPIDR_EL1, RSV_REG REGISTERS_SAVE(RSV_REG, CPU_REGISTERS) MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R19(RSV_REG) MRS TPIDR_EL0, R3 MOVD R3, CPU_REGISTERS+PTRACE_TLS(RSV_REG) WORD $0xd5384003 // MRS SPSR_EL1, R3 MOVD R3, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG) MOVD R30, CPU_REGISTERS+PTRACE_PC(RSV_REG) MOVD RSP, R3 MOVD R3, CPU_REGISTERS+PTRACE_SP(RSV_REG) MOVD CPU_REGISTERS+PTRACE_R3(RSV_REG), R3 // Step2, switch to temporary stack. LOAD_KERNEL_STACK(RSV_REG) // Step3, load app context pointer. MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP // Step4, prepare the environment for container application. // set sp_el0. MOVD PTRACE_SP(RSV_REG_APP), R1 WORD $0xd5184101 //MSR R1, SP_EL0 // set pc. MOVD PTRACE_PC(RSV_REG_APP), R1 MSR R1, ELR_EL1 // set pstate. MOVD PTRACE_PSTATE(RSV_REG_APP), R1 WORD $0xd5184001 //MSR R1, SPSR_EL1 // need use kernel space address to execute below code, since // after SWITCH_TO_APP_PAGETABLE the ASID is changed to app's // ASID. WORD $0x10000061 // ADR R1, do_exit_to_el0 ORR $0xffff000000000000, R1, R1 JMP (R1) do_exit_to_el0: // RSV_REG & RSV_REG_APP will be loaded at the end. REGISTERS_LOAD(RSV_REG_APP, 0) MOVD PTRACE_TLS(RSV_REG_APP), RSV_REG MSR RSV_REG, TPIDR_EL0 // switch to user pagetable. LDP PTRACE_R18(RSV_REG_APP), (RSV_REG, RSV_REG_APP) SUB $STACK_FRAME_SIZE, RSP, RSP STP (RSV_REG, RSV_REG_APP), 16*0(RSP) STP (R0, R1), 16*1(RSP) WORD $0xd538d092 //MRS TPIDR_EL1, R18 SWITCH_TO_APP_PAGETABLE() LDP 16*1(RSP), (R0, R1) LDP 16*0(RSP), (RSV_REG, RSV_REG_APP) ADD $STACK_FRAME_SIZE, RSP, RSP ERET() // kernelExitToEl1 is the entrypoint for sentry in guest_el1. // Prepare the vcpu environment for sentry. TEXT ·kernelExitToEl1(SB),NOSPLIT,$0 WORD $0xd538d092 //MRS TPIDR_EL1, R18 MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R1 WORD $0xd5184001 //MSR R1, SPSR_EL1 MOVD CPU_REGISTERS+PTRACE_PC(RSV_REG), R1 MSR R1, ELR_EL1 // restore sentry's tls. MOVD CPU_REGISTERS+PTRACE_TLS(RSV_REG), R1 MSR R1, TPIDR_EL0 MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R1 MOVD R1, RSP REGISTERS_LOAD(RSV_REG, CPU_REGISTERS) SWITCH_TO_KVM_PAGETABLE() MRS TPIDR_EL1, RSV_REG MOVD CPU_REGISTERS+PTRACE_R19(RSV_REG), RSV_REG_APP ERET() TEXT ·start(SB),NOSPLIT,$0 DSB $7 // dsb(nsh) ISB $15 B ·kernelExitToEl1(SB) // func AddrOfStart() uintptr TEXT ·AddrOfStart(SB), $0-8 MOVD $·start(SB), R0 MOVD R0, ret+0(FP) RET // El1_sync_invalid is the handler for an invalid EL1_sync. TEXT ·El1_sync_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) // El1_irq_invalid is the handler for an invalid El1_irq. TEXT ·El1_irq_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) // El1_fiq_invalid is the handler for an invalid El1_fiq. TEXT ·El1_fiq_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) // El1_error_invalid is the handler for an invalid El1_error. 
TEXT ·El1_error_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) // El1_sync is the handler for El1_sync. TEXT ·El1_sync(SB),NOSPLIT,$0 KERNEL_ENTRY_FROM_EL1 MRS ESR_EL1, R25 // read the syndrome register LSR $ESR_ELx_EC_SHIFT, R25, R24 // exception class CMP $ESR_ELx_EC_DABT_CUR, R24 BEQ el1_da // data abort in EL1 CMP $ESR_ELx_EC_IABT_CUR, R24 BEQ el1_ia // instruction abort in EL1 CMP $ESR_ELx_EC_FP_ASIMD, R24 BEQ el1_fpsimd_acc // FP/ASIMD access CMP $ESR_ELx_EC_SVE, R24 BEQ el1_sve_acc // SVE access CMP $ESR_ELx_EC_SP_ALIGN, R24 BEQ el1_sp_pc // stack alignment exception CMP $ESR_ELx_EC_PC_ALIGN, R24 BEQ el1_sp_pc // pc alignment exception CMP $ESR_ELx_EC_UNKNOWN, R24 BEQ el1_undef // unknown exception in EL1 CMP $ESR_ELx_EC_SVC64, R24 BEQ el1_svc // SVC in 64-bit state CMP $ESR_ELx_EC_BREAKPT_CUR, R24 BEQ el1_dbg // debug exception in EL1 B el1_invalid el1_da: EXCEPTION_EL1(El1SyncDa) el1_ia: EXCEPTION_EL1(El1SyncIa) el1_sp_pc: EXCEPTION_EL1(El1SyncSpPc) el1_undef: EXCEPTION_EL1(El1SyncUndef) el1_svc: B ·HaltEl1SvcAndResume(SB) el1_dbg: EXCEPTION_EL1(El1SyncDbg) el1_fpsimd_acc: el1_sve_acc: FPSIMD_DISABLE_TRAP(RSV_REG) // Restore context. MRS TPIDR_EL1, RSV_REG // Restore sp. MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R1 MOVD R1, RSP // Restore common registers. REGISTERS_LOAD(RSV_REG, CPU_REGISTERS) MOVD CPU_REGISTERS+PTRACE_R19(RSV_REG), RSV_REG_APP ERET() // return to el1. el1_invalid: EXCEPTION_EL1(El1SyncInv) // El1_irq is the handler for El1_irq. TEXT ·El1_irq(SB),NOSPLIT,$0 B ·Shutdown(SB) // El1_fiq is the handler for El1_fiq. TEXT ·El1_fiq(SB),NOSPLIT,$0 B ·Shutdown(SB) // El1_error is the handler for El1_error. TEXT ·El1_error(SB),NOSPLIT,$0 B ·Shutdown(SB) // El0_sync is the handler for El0_sync. TEXT ·El0_sync(SB),NOSPLIT,$0 KERNEL_ENTRY_FROM_EL0 MRS ESR_EL1, R25 // read the syndrome register LSR $ESR_ELx_EC_SHIFT, R25, R24 // exception class CMP $ESR_ELx_EC_SVC64, R24 BEQ el0_svc // SVC in 64-bit state CMP $ESR_ELx_EC_DABT_LOW, R24 BEQ el0_da // data abort in EL0 CMP $ESR_ELx_EC_IABT_LOW, R24 BEQ el0_ia // instruction abort in EL0 CMP $ESR_ELx_EC_FP_ASIMD, R24 BEQ el0_fpsimd_acc // FP/ASIMD access CMP $ESR_ELx_EC_SVE, R24 BEQ el0_sve_acc // SVE access CMP $ESR_ELx_EC_FP_EXC64, R24 BEQ el0_fpsimd_exc // FP/ASIMD exception CMP $ESR_ELx_EC_SP_ALIGN, R24 BEQ el0_sp_pc // stack alignment exception CMP $ESR_ELx_EC_PC_ALIGN, R24 BEQ el0_sp_pc // pc alignment exception CMP $ESR_ELx_EC_UNKNOWN, R24 BEQ el0_undef // unknown exception in EL0 CMP $ESR_ELx_EC_BREAKPT_LOW, R24 BEQ el0_dbg // debug exception in EL0 CMP $ESR_ELx_EC_SYS64, R24 BEQ el0_sys // configurable trap CMP $ESR_ELx_EC_WFx, R24 BEQ el0_wfx // WFX trap B el0_invalid el0_svc: WORD $0xd538d092 //MRS TPIDR_EL1, R18 MOVD $0, CPU_ERROR_CODE(RSV_REG) // Clear error code. MOVD $1, R3 MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user. MOVD $Syscall, R3 MOVD R3, CPU_VECTOR_CODE(RSV_REG) B ·kernelExitToEl1(SB) el0_da: el0_ia: EXCEPTION_EL0(PageFault) el0_fpsimd_acc: el0_sve_acc: FPSIMD_DISABLE_TRAP(RSV_REG) FPSTATE_EL0_LOAD() // Restore context. MRS TPIDR_EL1, RSV_REG MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP // Restore R0-R30 REGISTERS_LOAD(RSV_REG_APP, 0) MOVD PTRACE_R18(RSV_REG_APP), RSV_REG MOVD PTRACE_R19(RSV_REG_APP), RSV_REG_APP ERET() // return to el0. 
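// The fpsimd/sve access path above implements lazy floating point restore:
// SwitchToUser (kernel_arm64.go) enters the application with the FPSIMD trap
// enabled, so the application's first FP/SIMD instruction traps here. The
// handler disables the trap, reloads the application's saved fpstate via
// FPSTATE_EL0_LOAD, restores the general purpose registers and returns
// directly to EL0 without bouncing through the sentry.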
el0_fpsimd_exc: EXCEPTION_EL0(El0SyncFpsimdExc) el0_sp_pc: EXCEPTION_EL0(El0SyncSpPc) el0_undef: EXCEPTION_EL0(El0SyncUndef) el0_dbg: EXCEPTION_EL0(El0SyncDbg) el0_sys: EXCEPTION_EL0(El0SyncSys) el0_wfx: EXCEPTION_EL0(El0SyncWfx) el0_invalid: EXCEPTION_EL0(El0SyncInv) TEXT ·El0_irq(SB),NOSPLIT,$0 B ·Shutdown(SB) TEXT ·El0_fiq(SB),NOSPLIT,$0 B ·Shutdown(SB) TEXT ·El0_error(SB),NOSPLIT,$0 KERNEL_ENTRY_FROM_EL0 WORD $0xd5385219 // MRS ESR_EL1, R25 AND $ESR_ELx_SERR_MASK, R25, R24 CMP $ESR_ELx_SERR_NMI, R24 BEQ el0_nmi B el0_bounce el0_nmi: EXCEPTION_EL0(El0ErrNMI) el0_bounce: EXCEPTION_EL0(VirtualizationException) TEXT ·El0_sync_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) TEXT ·El0_irq_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) TEXT ·El0_fiq_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) TEXT ·El0_error_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) // vectors implements exception vector table. // The start address of exception vector table should be 11-bits aligned. // For detail, please refer to arm developer document: // https://developer.arm.com/documentation/100933/0100/AArch64-exception-vector-table // Also can refer to the code in linux kernel: arch/arm64/kernel/entry.S TEXT ·vectors(SB),NOSPLIT,$0 PCALIGN $2048 B ·El1_sync_invalid(SB) PCALIGN $128 B ·El1_irq_invalid(SB) PCALIGN $128 B ·El1_fiq_invalid(SB) PCALIGN $128 B ·El1_error_invalid(SB) PCALIGN $128 B ·El1_sync(SB) PCALIGN $128 B ·El1_irq(SB) PCALIGN $128 B ·El1_fiq(SB) PCALIGN $128 B ·El1_error(SB) PCALIGN $128 B ·El0_sync(SB) PCALIGN $128 B ·El0_irq(SB) PCALIGN $128 B ·El0_fiq(SB) PCALIGN $128 B ·El0_error(SB) PCALIGN $128 B ·El0_sync_invalid(SB) PCALIGN $128 B ·El0_irq_invalid(SB) PCALIGN $128 B ·El0_fiq_invalid(SB) PCALIGN $128 B ·El0_error_invalid(SB) // func AddrOfVectors() uintptr TEXT ·AddrOfVectors(SB), $0-8 MOVD $·vectors(SB), R0 MOVD R0, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/kernel.go000066400000000000000000000045061465435605700222570ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/arch/fpu" ) // Init initializes a new kernel. // //go:nosplit func (k *Kernel) Init(maxCPUs int) { k.init(maxCPUs) } // Halt halts execution. func Halt() // defaultHooks implements hooks. type defaultHooks struct{} // KernelSyscall implements Hooks.KernelSyscall. // // +checkescape:all // //go:nosplit func (defaultHooks) KernelSyscall() { Halt() } // KernelException implements Hooks.KernelException. // // +checkescape:all // //go:nosplit func (defaultHooks) KernelException(Vector) { Halt() } // kernelSyscall is a trampoline. // // When in amd64, it is called with %rip on the upper half, so it can // NOT access to any global data which is not mapped on upper and must // call to function pointers or interfaces to switch to the lower half // so that callee can access to global data. // // +checkescape:hard,stack // //go:nosplit func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() } // kernelException is a trampoline. 
// // When in amd64, it is called with %rip on the upper half, so it can // NOT access to any global data which is not mapped on upper and must // call to function pointers or interfaces to switch to the lower half // so that callee can access to global data. // // +checkescape:hard,stack // //go:nosplit func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) } // Init initializes a new CPU. // // Init allows embedding in other objects. func (c *CPU) Init(k *Kernel, cpuID int, hooks Hooks) { c.self = c // Set self reference. c.kernel = k // Set kernel reference. c.init(cpuID) // Perform architectural init. c.floatingPointState = fpu.NewState() // Require hooks. if hooks != nil { c.hooks = hooks } else { c.hooks = defaultHooks{} } } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/kernel_amd64.go000066400000000000000000000242451465435605700232540ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package ring0 import ( "encoding/binary" "reflect" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" ) // HaltAndWriteFSBase halts execution. On resume, it sets FS_BASE from the // value in regs. func HaltAndWriteFSBase(regs *arch.Registers) // init initializes architecture-specific state. func (k *Kernel) init(maxCPUs int) { entrySize := reflect.TypeOf(kernelEntry{}).Size() var ( entries []kernelEntry padding = 1 ) for { entries = make([]kernelEntry, maxCPUs+padding-1) totalSize := entrySize * uintptr(maxCPUs+padding-1) addr := reflect.ValueOf(&entries[0]).Pointer() if addr&(hostarch.PageSize-1) == 0 && totalSize >= hostarch.PageSize { // The runtime forces power-of-2 alignment for allocations, and we are therefore // safe once the first address is aligned and the chunk is at least a full page. break } padding = padding << 1 } k.cpuEntries = entries k.globalIDT = &idt64{} if reflect.TypeOf(idt64{}).Size() != hostarch.PageSize { panic("Size of globalIDT should be PageSize") } if reflect.ValueOf(k.globalIDT).Pointer()&(hostarch.PageSize-1) != 0 { panic("Allocated globalIDT should be page aligned") } // Setup the IDT, which is uniform. for v, handler := range handlers { // Allow Breakpoint and Overflow to be called from all // privilege levels. dpl := 0 if v == Breakpoint || v == Overflow { dpl = 3 } // Note that we set all traps to use the interrupt stack, this // is defined below when setting up the TSS. k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), dpl, 1 /* ist */) } } // EntryRegions returns the set of kernel entry regions (must be mapped). 
func (k *Kernel) EntryRegions() map[uintptr]uintptr { regions := make(map[uintptr]uintptr) addr := reflect.ValueOf(&k.cpuEntries[0]).Pointer() size := reflect.TypeOf(kernelEntry{}).Size() * uintptr(len(k.cpuEntries)) end, _ := hostarch.Addr(addr + size).RoundUp() regions[uintptr(hostarch.Addr(addr).RoundDown())] = uintptr(end) addr = reflect.ValueOf(k.globalIDT).Pointer() size = reflect.TypeOf(idt64{}).Size() end, _ = hostarch.Addr(addr + size).RoundUp() regions[uintptr(hostarch.Addr(addr).RoundDown())] = uintptr(end) return regions } // init initializes architecture-specific state. func (c *CPU) init(cpuID int) { c.kernelEntry = &c.kernel.cpuEntries[cpuID] c.cpuSelf = c // Null segment. c.gdt[0].setNull() // Kernel & user segments. c.gdt[segKcode] = KernelCodeSegment c.gdt[segKdata] = KernelDataSegment c.gdt[segUcode32] = UserCodeSegment32 c.gdt[segUdata] = UserDataSegment c.gdt[segUcode64] = UserCodeSegment64 // The task segment, this spans two entries. tssBase, tssLimit, _ := c.TSS() c.gdt[segTss].set( uint32(tssBase), uint32(tssLimit), 0, // Privilege level zero. SegmentDescriptorPresent| SegmentDescriptorAccess| SegmentDescriptorWrite| SegmentDescriptorExecute) c.gdt[segTssHi].setHi(uint32((tssBase) >> 32)) // Set the kernel stack pointer in the TSS (virtual address). stackAddr := c.StackTop() c.stackTop = stackAddr c.tss.rsp0Lo = uint32(stackAddr) c.tss.rsp0Hi = uint32(stackAddr >> 32) c.tss.ist1Lo = uint32(stackAddr) c.tss.ist1Hi = uint32(stackAddr >> 32) // Set the I/O bitmap base address beyond the last byte in the TSS // to block access to the entire I/O address range. // // From section 18.5.2 "I/O Permission Bit Map" from Intel SDM vol1: // I/O addresses not spanned by the map are treated as if they had set // bits in the map. c.tss.ioPerm = tssLimit + 1 // Permanently set the kernel segments. c.registers.Cs = uint64(Kcode) c.registers.Ds = uint64(Kdata) c.registers.Es = uint64(Kdata) c.registers.Ss = uint64(Kdata) c.registers.Fs = uint64(Kdata) c.registers.Gs = uint64(Kdata) // Set mandatory flags. c.registers.Eflags = KernelFlagsSet c.hasXSAVE = hasXSAVE c.hasXSAVEOPT = hasXSAVEOPT c.hasFSGSBASE = hasFSGSBASE } // StackTop returns the kernel's stack address. // //go:nosplit func (c *CPU) StackTop() uint64 { return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack)) } // IDT returns the CPU's IDT base and limit. // //go:nosplit func (c *CPU) IDT() (uint64, uint16) { return uint64(kernelAddr(&c.kernel.globalIDT[0])), uint16(binary.Size(&c.kernel.globalIDT) - 1) } // GDT returns the CPU's GDT base and limit. // //go:nosplit func (c *CPU) GDT() (uint64, uint16) { return uint64(kernelAddr(&c.gdt[0])), uint16(8*segLast - 1) } // TSS returns the CPU's TSS base, limit and value. // //go:nosplit func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) { return uint64(kernelAddr(&c.tss)), uint16(binary.Size(&c.tss) - 1), &c.gdt[segTss] } // CR0 returns the CPU's CR0 value. // //go:nosplit func (c *CPU) CR0() uint64 { return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET | _CR0_NE } // CR4 returns the CPU's CR4 value. // //go:nosplit func (c *CPU) CR4() uint64 { cr4 := uint64(_CR4_PAE | _CR4_PSE | _CR4_OSFXSR | _CR4_OSXMMEXCPT) if hasPCID { cr4 |= _CR4_PCIDE } if hasXSAVE { cr4 |= _CR4_OSXSAVE } if hasSMEP { cr4 |= _CR4_SMEP } if hasSMAP { cr4 |= _CR4_SMAP } if hasFSGSBASE { cr4 |= _CR4_FSGSBASE } return cr4 } // EFER returns the CPU's EFER value. 
// //go:nosplit func (c *CPU) EFER() uint64 { return _EFER_LME | _EFER_LMA | _EFER_SCE | _EFER_NX } // IsCanonical indicates whether addr is canonical per the amd64 spec. // //go:nosplit func IsCanonical(addr uint64) bool { return addr <= 0x00007fffffffffff || addr >= 0xffff800000000000 } // SwitchToUser performs either a sysret or an iret. // // The return value is the vector that interrupted execution. // // This function will not split the stack. Callers will probably want to call // runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to // calling this function. // // When this is done, this region is quite sensitive to things like system // calls. After calling entersyscall, any memory used must have been allocated // and no function calls without go:nosplit are permitted. Any calls made here // are protected appropriately (e.g. IsCanonical and CR3). // // Also note that this function transitively depends on the compiler generating // code that uses IP-relative addressing inside of absolute addresses. That's // the case for amd64, but may not be the case for other architectures. // // Precondition: the Rip, Rsp, Fs and Gs registers must be canonical. // // +checkescape:all // //go:nosplit func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID) c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID)) // Sanitize registers. regs := switchOpts.Registers regs.Eflags &= ^uint64(UserFlagsClear) regs.Eflags |= UserFlagsSet regs.Cs = uint64(Ucode64) // Required for iret. regs.Ss = uint64(Udata) // Ditto. // Perform the switch. needIRET := uint64(0) if switchOpts.FullRestore { needIRET = 1 } vector = doSwitchToUser(c, regs, switchOpts.FloatingPointState.BytePointer(), userCR3, needIRET) // escapes: no. return } func doSwitchToUser( cpu *CPU, // +0(FP) regs *arch.Registers, // +8(FP) fpState *byte, // +16(FP) userCR3 uint64, // +24(FP) needIRET uint64) Vector // +32(FP), +40(FP) // startGo is the CPU entrypoint. // // This is called from the start asm stub (see entry_amd64.go); on return the // registers in c.registers will be restored (not segments). // // Note that any code written in Go should adhere to Go expected environment: // - Initialized floating point state (required for optimizations using // floating point instructions). // - Go TLS in FS_BASE (this is required by splittable functions, calls into // the runtime, calls to assembly functions (Go 1.17+ ABI wrappers access // TLS)). // //go:nosplit func startGo(c *CPU) { // Save per-cpu. writeGS(kernelAddr(c.kernelEntry)) // // TODO(mpratt): Note that per the note above, this should be done // before entering Go code. However for simplicity we leave it here for // now, since the small critical sections with undefined FPU state // should only contain very limited use of floating point instructions // (notably, use of XMM15 as a zero register). fninit() // Need to sync XCR0 with the host, because xsave and xrstor can be // called from different contexts. if hasXSAVE { // Exclude MPX bits. MPX has been deprecated and we have seen // cases when it isn't supported in VM. xcr0 := localXCR0 &^ (cpuid.XSAVEFeatureBNDCSR | cpuid.XSAVEFeatureBNDREGS) xsetbv(0, xcr0) } // Set the syscall target. wrmsr(_MSR_LSTAR, kernelFunc(addrOfSysenter())) wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF) // NOTE: This depends on having the 64-bit segments immediately // following the 32-bit user segments. 
This is simply the way the // sysret instruction is designed to work (it assumes they follow). wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48)) wrmsr(_MSR_CSTAR, kernelFunc(addrOfSysenter())) } // SetCPUIDFaulting sets CPUID faulting per the boolean value. // // True is returned if faulting could be set. // //go:nosplit func SetCPUIDFaulting(on bool) bool { // Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support // for CPUID faulting, and we enable and disable via the MISC_FEATURES MSR. if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT != 0 { features := rdmsr(_MSR_MISC_FEATURES) if on { features |= _MISC_FEATURE_CPUID_TRAP } else { features &^= _MISC_FEATURE_CPUID_TRAP } wrmsr(_MSR_MISC_FEATURES, features) return true // Setting successful. } return false } // ReadCR2 reads the current CR2 value. // //go:nosplit func ReadCR2() uintptr { return readCR2() } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/kernel_arm64.go000066400000000000000000000044361465435605700232720ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package ring0 // HaltAndResume halts execution and point the pointer to the resume function. // //go:nosplit func HaltAndResume() // HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume. // //go:nosplit func HaltEl1SvcAndResume() // HaltEl1ExceptionAndResume calls Hooks.KernelException and resume. // //go:nosplit func HaltEl1ExceptionAndResume() // init initializes architecture-specific state. func (k *Kernel) init(maxCPUs int) { } // init initializes architecture-specific state. func (c *CPU) init(cpuID int) { // Set the kernel stack pointer(virtual address). c.registers.Sp = uint64(c.StackTop()) } // StackTop returns the kernel's stack address. // //go:nosplit func (c *CPU) StackTop() uint64 { return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack)) } // IsCanonical indicates whether addr is canonical per the arm64 spec. // //go:nosplit func IsCanonical(addr uint64) bool { return addr <= 0x0000ffffffffffff || addr >= 0xffff000000000000 } // SwitchToUser performs an eret. // // The return value is the exception vector. // // +checkescape:all // //go:nosplit func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { storeAppASID(uintptr(switchOpts.UserASID)) storeEl0Fpstate(switchOpts.FloatingPointState.BytePointer()) if switchOpts.Flush { LocalFlushTlbByASID(uintptr(switchOpts.UserASID)) } regs := switchOpts.Registers regs.Pstate &= ^uint64(PsrFlagsClear) regs.Pstate |= UserFlagsSet fpDisableTrap := CPACREL1() if fpDisableTrap != 0 { FPSIMDEnableTrap() } kernelExitToEl0() fpDisableTrap = CPACREL1() if fpDisableTrap != 0 { SaveFloatingPoint(switchOpts.FloatingPointState.BytePointer()) } vector = c.vecCode return } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/kernel_unsafe.go000066400000000000000000000020641465435605700236150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ring0 import ( "unsafe" ) // eface mirrors runtime.eface. type eface struct { typ uintptr data unsafe.Pointer } // kernelAddr returns the kernel virtual address for the given object. // //go:nosplit func kernelAddr(obj any) uintptr { e := (*eface)(unsafe.Pointer(&obj)) return KernelStartAddress | uintptr(e.data) } // kernelFunc returns the address of the given function. // //go:nosplit func kernelFunc(fn uintptr) uintptr { return KernelStartAddress | fn } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/lib_amd64.go000066400000000000000000000072611465435605700225410ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package ring0 import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/hostarch" ) // fxrstor restores floating point state. func fxrstor(addr uintptr) // xrstor restores floating point state. func xrstor(addr uintptr) // fxsave saves floating point state. func fxsave(addr uintptr) // xsave saves floating point state. func xsave(addr uintptr) // xsaveopt saves floating point state. func xsaveopt(addr uintptr) // writeFS sets the FS base address (selects one of wrfsbase or wrfsmsr). func writeFS(addr uintptr) // wrfsbase writes to the FS base address. func wrfsbase(addr uintptr) // wrfsmsr writes to the FS_BASE MSR. func wrfsmsr(addr uintptr) // writeGS sets the GS address (selects one of wrgsbase or wrgsmsr). func writeGS(addr uintptr) // wrgsbase writes to the GS base address. func wrgsbase(addr uintptr) // wrgsmsr writes to the GS_BASE MSR. func wrgsmsr(addr uintptr) // stmxcsr reads the MXCSR control and status register. func stmxcsr(addr *uint32) // ldmxcsr writes to the MXCSR control and status register. func ldmxcsr(addr *uint32) // readCR2 reads the current CR2 value. func readCR2() uintptr // fninit initializes the floating point unit. func fninit() // xsetbv writes to an extended control register. func xsetbv(reg, value uintptr) // xgetbv reads an extended control register. func xgetbv(reg uintptr) uintptr // wrmsr writes to the given MSR. func wrmsr(reg, value uintptr) // rdmsr reads the given MSR. func rdmsr(reg uintptr) uintptr // Mostly-constants set by Init. var ( hasSMEP bool hasSMAP bool hasPCID bool hasXSAVEOPT bool hasXSAVE bool hasFSGSBASE bool validXCR0Mask uintptr localXCR0 uintptr ) // Init sets function pointers based on architectural features. // // This must be called prior to using ring0.
It may be called with the // auto-detected feature set using InitDefault. It may also be called at // another time with a different FeatureSet. func Init(fs cpuid.FeatureSet) { // Initialize all sizes. VirtualAddressBits = uintptr(fs.VirtualAddressBits()) // TODO(gvisor.dev/issue/7349): introduce support for 5-level paging. // Four-level page tables allows to address up to 48-bit virtual // addresses. if VirtualAddressBits > 48 { VirtualAddressBits = 48 } PhysicalAddressBits = uintptr(fs.PhysicalAddressBits()) UserspaceSize = uintptr(1) << (VirtualAddressBits - 1) MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(hostarch.PageSize-1) KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) // Initialize all functions. hasSMEP = fs.HasFeature(cpuid.X86FeatureSMEP) hasSMAP = fs.HasFeature(cpuid.X86FeatureSMAP) hasPCID = fs.HasFeature(cpuid.X86FeaturePCID) hasXSAVEOPT = fs.UseXsaveopt() hasXSAVE = fs.UseXsave() hasFSGSBASE = fs.HasFeature(cpuid.X86FeatureFSGSBase) validXCR0Mask = uintptr(fs.ValidXCR0Mask()) if hasXSAVE { XCR0DisabledMask := uintptr((1 << 9) | (1 << 17) | (1 << 18)) localXCR0 = xgetbv(0) &^ XCR0DisabledMask } } // InitDefault initializes ring0 with the auto-detected host feature set. func InitDefault() { cpuid.Initialize() Init(cpuid.HostFeatureSet()) } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/lib_amd64.s000066400000000000000000000123111465435605700223660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "funcdata.h" #include "textflag.h" // fxrstor loads floating point state. // // The code corresponds to: // // fxrstor64 (%rbx) // TEXT ·fxrstor(SB),NOSPLIT|NOFRAME,$0-8 MOVQ addr+0(FP), BX MOVL $0xffffffff, AX MOVL $0xffffffff, DX BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x0b; RET // xrstor loads floating point state. // // The code corresponds to: // // xrstor (%rdi) // TEXT ·xrstor(SB),NOSPLIT|NOFRAME,$0-8 MOVQ addr+0(FP), DI MOVL $0xffffffff, AX MOVL $0xffffffff, DX BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f; RET // fxsave saves floating point state. // // The code corresponds to: // // fxsave64 (%rbx) // TEXT ·fxsave(SB),NOSPLIT|NOFRAME,$0-8 MOVQ addr+0(FP), BX MOVL $0xffffffff, AX MOVL $0xffffffff, DX BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x03; RET // xsave saves floating point state. // // The code corresponds to: // // xsave (%rdi) // TEXT ·xsave(SB),NOSPLIT|NOFRAME,$0-8 MOVQ addr+0(FP), DI MOVL $0xffffffff, AX MOVL $0xffffffff, DX BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; RET // xsaveopt saves floating point state. // // The code corresponds to: // // xsaveopt (%rdi) // TEXT ·xsaveopt(SB),NOSPLIT|NOFRAME,$0-8 MOVQ addr+0(FP), DI MOVL $0xffffffff, AX MOVL $0xffffffff, DX BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; RET // writeFS writes to the FS base. // // This is written in assembly because it must be safe to call before the Go // environment is set up. See comment on start(). // // Preconditions: must be running in the lower address space, as it accesses // global data. 
TEXT ·writeFS(SB),NOSPLIT,$8-8 MOVQ addr+0(FP), AX CMPB ·hasFSGSBASE(SB), $1 JNE msr PUSHQ AX CALL ·wrfsbase(SB) POPQ AX RET msr: PUSHQ AX CALL ·wrfsmsr(SB) POPQ AX RET // wrfsbase writes to the FS base. // // The code corresponds to: // // wrfsbase %rax // TEXT ·wrfsbase(SB),NOSPLIT|NOFRAME,$0-8 MOVQ addr+0(FP), AX BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd0; RET // wrfsmsr writes to the FSBASE MSR. // // The code corresponds to: // // wrmsr (writes EDX:EAX to the MSR in ECX) // TEXT ·wrfsmsr(SB),NOSPLIT|NOFRAME,$0-8 MOVQ addr+0(FP), AX MOVQ AX, DX SHRQ $32, DX MOVQ $0xc0000100, CX // MSR_FS_BASE BYTE $0x0f; BYTE $0x30; RET // writeGS writes to the GS base. // // This is written in assembly because it must be callable from assembly (ABI0) // without an intermediate transition to ABIInternal. // // Preconditions: must be running in the lower address space, as it accesses // global data. TEXT ·writeGS(SB),NOSPLIT,$8-8 MOVQ addr+0(FP), AX CMPB ·hasFSGSBASE(SB), $1 JNE msr PUSHQ AX CALL ·wrgsbase(SB) POPQ AX RET msr: PUSHQ AX CALL ·wrgsmsr(SB) POPQ AX RET // wrgsbase writes to the GS base. // // The code corresponds to: // // wrgsbase %rax // TEXT ·wrgsbase(SB),NOSPLIT|NOFRAME,$0-8 MOVQ addr+0(FP), AX BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd8; RET // wrgsmsr writes to the GSBASE MSR. // // See wrfsmsr. TEXT ·wrgsmsr(SB),NOSPLIT|NOFRAME,$0-8 MOVQ addr+0(FP), AX MOVQ AX, DX SHRQ $32, DX MOVQ $0xc0000101, CX // MSR_GS_BASE BYTE $0x0f; BYTE $0x30; // WRMSR RET // readCR2 reads the current CR2 value. // // The code corresponds to: // // mov %cr2, %rax // TEXT ·readCR2(SB),NOSPLIT|NOFRAME,$0-8 BYTE $0x0f; BYTE $0x20; BYTE $0xd0; MOVQ AX, ret+0(FP) RET // fninit initializes the floating point unit. // // The code corresponds to: // // fninit TEXT ·fninit(SB),NOSPLIT|NOFRAME,$0 BYTE $0xdb; BYTE $0xe3; RET // xsetbv writes to an extended control register. // // The code corresponds to: // // xsetbv // TEXT ·xsetbv(SB),NOSPLIT|NOFRAME,$0-16 MOVQ reg+0(FP), CX MOVL value+8(FP), AX MOVL value+12(FP), DX BYTE $0x0f; BYTE $0x01; BYTE $0xd1; RET // xgetbv reads an extended control register. // // The code corresponds to: // // xgetbv // TEXT ·xgetbv(SB),NOSPLIT|NOFRAME,$0-16 MOVQ reg+0(FP), CX BYTE $0x0f; BYTE $0x01; BYTE $0xd0; MOVL AX, ret+8(FP) MOVL DX, ret+12(FP) RET // wrmsr writes to a control register. // // The code corresponds to: // // wrmsr // TEXT ·wrmsr(SB),NOSPLIT|NOFRAME,$0-16 MOVQ reg+0(FP), CX MOVL value+8(FP), AX MOVL value+12(FP), DX BYTE $0x0f; BYTE $0x30; RET // rdmsr reads a control register. // // The code corresponds to: // // rdmsr // TEXT ·rdmsr(SB),NOSPLIT|NOFRAME,$0-16 MOVQ reg+0(FP), CX BYTE $0x0f; BYTE $0x32; MOVL AX, ret+8(FP) MOVL DX, ret+12(FP) RET // stmxcsr reads the MXCSR control and status register. TEXT ·stmxcsr(SB),NOSPLIT|NOFRAME,$0-8 MOVQ addr+0(FP), SI STMXCSR (SI) RET // ldmxcsr writes to the MXCSR control and status register. TEXT ·ldmxcsr(SB),NOSPLIT|NOFRAME,$0-8 MOVQ addr+0(FP), SI LDMXCSR (SI) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/lib_arm64.go000066400000000000000000000044451465435605700225600ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package ring0 // storeEl0Fpstate writes the address of application's fpstate. func storeEl0Fpstate(value *byte) // storeAppASID writes the application's asid value. func storeAppASID(asid uintptr) // LocalFlushTlbAll same as FlushTlbAll, but only applies to the calling CPU. func LocalFlushTlbAll() // FlushTlbByVA invalidates tlb by VA/Last-level/Inner-Shareable. func FlushTlbByVA(addr uintptr) // FlushTlbByASID invalidates tlb by ASID/Inner-Shareable. func FlushTlbByASID(asid uintptr) // LocalFlushTlbByASID invalidates tlb by ASID. func LocalFlushTlbByASID(asid uintptr) // FlushTlbAll invalidates all tlb. func FlushTlbAll() // CPACREL1 returns the value of the CPACR_EL1 register. func CPACREL1() (value uintptr) // GetFPCR returns the value of FPCR register. func GetFPCR() (value uintptr) // SetFPCR writes the FPCR value. func SetFPCR(value uintptr) // GetFPSR returns the value of FPSR register. func GetFPSR() (value uintptr) // SetFPSR writes the FPSR value. func SetFPSR(value uintptr) // SaveVRegs saves V0-V31 registers. // V0-V31: 32 128-bit registers for floating point and simd. func SaveVRegs(*byte) // LoadVRegs loads V0-V31 registers. func LoadVRegs(*byte) // LoadFloatingPoint loads floating point state. func LoadFloatingPoint(*byte) // SaveFloatingPoint saves floating point state. func SaveFloatingPoint(*byte) // FPSIMDDisableTrap disables fpsimd. func FPSIMDDisableTrap() // FPSIMDEnableTrap enables fpsimd. func FPSIMDEnableTrap() // Init sets function pointers based on architectural features. // // This must be called prior to using ring0. func Init() {} // InitDefault calls Init with default parameters. // On ARM, this is not much. func InitDefault() {} golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/lib_arm64.s000066400000000000000000000131101465435605700224020ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
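// Assembly implementations for arm64 helpers declared in lib_arm64.go: TLB
// invalidation (FlushTlbAll, FlushTlbByASID, LocalFlushTlbByASID, ...), the
// CPACR_EL1/FPCR/FPSR accessors, and bulk save/restore of the V0-V31 vector
// registers (SaveVRegs/LoadVRegs, LoadFloatingPoint/SaveFloatingPoint).
//
// Illustrative usage sketch from Go (not part of the upstream source; state
// is assumed to be the caller's fpu.State, as in kernel_arm64.go):
//
//	SaveFloatingPoint(state.BytePointer()) // stores FPSR, FPCR, then Q0-Q31
//	// ... run something that clobbers the vector registers ...
//	LoadFloatingPoint(state.BytePointer())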
#include "funcdata.h" #include "textflag.h" #define TLBI_ASID_SHIFT 48 TEXT ·FlushTlbByVA(SB),NOSPLIT,$0-8 MOVD addr+0(FP), R1 DSB $10 // dsb(ishst) WORD $0xd50883a1 // tlbi vale1is, x1 DSB $11 // dsb(ish) RET TEXT ·FlushTlbByASID(SB),NOSPLIT,$0-8 MOVD asid+0(FP), R1 LSL $TLBI_ASID_SHIFT, R1, R1 DSB $10 // dsb(ishst) WORD $0xd5088341 // tlbi aside1is, x1 DSB $11 // dsb(ish) RET TEXT ·LocalFlushTlbByASID(SB),NOSPLIT,$0-8 MOVD asid+0(FP), R1 LSL $TLBI_ASID_SHIFT, R1, R1 DSB $10 // dsb(ishst) WORD $0xd5088741 // tlbi aside1, x1 DSB $11 // dsb(ish) RET TEXT ·LocalFlushTlbAll(SB),NOSPLIT,$0 DSB $6 // dsb(nshst) WORD $0xd508871f // __tlbi(vmalle1) DSB $7 // dsb(nsh) ISB $15 RET TEXT ·FlushTlbAll(SB),NOSPLIT,$0 DSB $10 // dsb(ishst) WORD $0xd508831f // __tlbi(vmalle1is) DSB $11 // dsb(ish) ISB $15 RET TEXT ·CPACREL1(SB),NOSPLIT,$0-8 WORD $0xd5381041 // MRS CPACR_EL1, R1 MOVD R1, value+0(FP) RET TEXT ·GetFPCR(SB),NOSPLIT,$0-8 MOVD FPCR, R1 MOVD R1, value+0(FP) RET TEXT ·GetFPSR(SB),NOSPLIT,$0-8 MOVD FPSR, R1 MOVD R1, value+0(FP) RET TEXT ·SetFPCR(SB),NOSPLIT,$0-8 MOVD value+0(FP), R1 MOVD R1, FPCR RET TEXT ·SetFPSR(SB),NOSPLIT,$0-8 MOVD value+0(FP), R1 MOVD R1, FPSR RET TEXT ·SaveVRegs(SB),NOSPLIT,$0-8 MOVD arg+0(FP), R0 // Skip aarch64_ctx, fpsr, fpcr. ADD $16, R0, R0 WORD $0xad000400 // stp q0, q1, [x0] WORD $0xad010c02 // stp q2, q3, [x0, #32] WORD $0xad021404 // stp q4, q5, [x0, #64] WORD $0xad031c06 // stp q6, q7, [x0, #96] WORD $0xad042408 // stp q8, q9, [x0, #128] WORD $0xad052c0a // stp q10, q11, [x0, #160] WORD $0xad06340c // stp q12, q13, [x0, #192] WORD $0xad073c0e // stp q14, q15, [x0, #224] WORD $0xad084410 // stp q16, q17, [x0, #256] WORD $0xad094c12 // stp q18, q19, [x0, #288] WORD $0xad0a5414 // stp q20, q21, [x0, #320] WORD $0xad0b5c16 // stp q22, q23, [x0, #352] WORD $0xad0c6418 // stp q24, q25, [x0, #384] WORD $0xad0d6c1a // stp q26, q27, [x0, #416] WORD $0xad0e741c // stp q28, q29, [x0, #448] WORD $0xad0f7c1e // stp q30, q31, [x0, #480] RET TEXT ·LoadVRegs(SB),NOSPLIT,$0-8 MOVD arg+0(FP), R0 // Skip aarch64_ctx, fpsr, fpcr. 
ADD $16, R0, R0 WORD $0xad400400 // ldp q0, q1, [x0] WORD $0xad410c02 // ldp q2, q3, [x0, #32] WORD $0xad421404 // ldp q4, q5, [x0, #64] WORD $0xad431c06 // ldp q6, q7, [x0, #96] WORD $0xad442408 // ldp q8, q9, [x0, #128] WORD $0xad452c0a // ldp q10, q11, [x0, #160] WORD $0xad46340c // ldp q12, q13, [x0, #192] WORD $0xad473c0e // ldp q14, q15, [x0, #224] WORD $0xad484410 // ldp q16, q17, [x0, #256] WORD $0xad494c12 // ldp q18, q19, [x0, #288] WORD $0xad4a5414 // ldp q20, q21, [x0, #320] WORD $0xad4b5c16 // ldp q22, q23, [x0, #352] WORD $0xad4c6418 // ldp q24, q25, [x0, #384] WORD $0xad4d6c1a // ldp q26, q27, [x0, #416] WORD $0xad4e741c // ldp q28, q29, [x0, #448] WORD $0xad4f7c1e // ldp q30, q31, [x0, #480] RET TEXT ·LoadFloatingPoint(SB),NOSPLIT,$0-8 MOVD arg+0(FP), R0 MOVD 0(R0), R1 MOVD R1, FPSR MOVD 8(R0), R1 MOVD R1, FPCR ADD $16, R0, R0 WORD $0xad400400 // ldp q0, q1, [x0] WORD $0xad410c02 // ldp q2, q3, [x0, #32] WORD $0xad421404 // ldp q4, q5, [x0, #64] WORD $0xad431c06 // ldp q6, q7, [x0, #96] WORD $0xad442408 // ldp q8, q9, [x0, #128] WORD $0xad452c0a // ldp q10, q11, [x0, #160] WORD $0xad46340c // ldp q12, q13, [x0, #192] WORD $0xad473c0e // ldp q14, q15, [x0, #224] WORD $0xad484410 // ldp q16, q17, [x0, #256] WORD $0xad494c12 // ldp q18, q19, [x0, #288] WORD $0xad4a5414 // ldp q20, q21, [x0, #320] WORD $0xad4b5c16 // ldp q22, q23, [x0, #352] WORD $0xad4c6418 // ldp q24, q25, [x0, #384] WORD $0xad4d6c1a // ldp q26, q27, [x0, #416] WORD $0xad4e741c // ldp q28, q29, [x0, #448] WORD $0xad4f7c1e // ldp q30, q31, [x0, #480] RET TEXT ·SaveFloatingPoint(SB),NOSPLIT,$0-8 MOVD arg+0(FP), R0 MOVD FPSR, R1 MOVD R1, 0(R0) MOVD FPCR, R1 MOVD R1, 8(R0) ADD $16, R0, R0 WORD $0xad000400 // stp q0, q1, [x0] WORD $0xad010c02 // stp q2, q3, [x0, #32] WORD $0xad021404 // stp q4, q5, [x0, #64] WORD $0xad031c06 // stp q6, q7, [x0, #96] WORD $0xad042408 // stp q8, q9, [x0, #128] WORD $0xad052c0a // stp q10, q11, [x0, #160] WORD $0xad06340c // stp q12, q13, [x0, #192] WORD $0xad073c0e // stp q14, q15, [x0, #224] WORD $0xad084410 // stp q16, q17, [x0, #256] WORD $0xad094c12 // stp q18, q19, [x0, #288] WORD $0xad0a5414 // stp q20, q21, [x0, #320] WORD $0xad0b5c16 // stp q22, q23, [x0, #352] WORD $0xad0c6418 // stp q24, q25, [x0, #384] WORD $0xad0d6c1a // stp q26, q27, [x0, #416] WORD $0xad0e741c // stp q28, q29, [x0, #448] WORD $0xad0f7c1e // stp q30, q31, [x0, #480] RET golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/000077500000000000000000000000001465435605700225525ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/allocator.go000066400000000000000000000067311465435605700250700ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pagetables // Allocator is used to allocate and map PTEs. // // Note that allocators may be called concurrently. type Allocator interface { // NewPTEs returns a new set of PTEs and their physical address. 
NewPTEs() *PTEs // PhysicalFor gives the physical address for a set of PTEs. PhysicalFor(ptes *PTEs) uintptr // LookupPTEs looks up PTEs by physical address. LookupPTEs(physical uintptr) *PTEs // FreePTEs marks a set of PTEs a freed, although they may not be available // for use again until Recycle is called, below. FreePTEs(ptes *PTEs) // Recycle makes freed PTEs available for use again. Recycle() } // RuntimeAllocator is a trivial allocator. type RuntimeAllocator struct { // used is the set of PTEs that have been allocated. This includes any // PTEs that may be in the pool below. PTEs are only freed from this // map by the Drain call. // // This exists to prevent accidental garbage collection. used map[*PTEs]struct{} // pool is the set of free-to-use PTEs. pool []*PTEs // freed is the set of recently-freed PTEs. freed []*PTEs } // NewRuntimeAllocator returns an allocator that uses runtime allocation. func NewRuntimeAllocator() *RuntimeAllocator { r := new(RuntimeAllocator) r.Init() return r } // Init initializes a RuntimeAllocator. func (r *RuntimeAllocator) Init() { r.used = make(map[*PTEs]struct{}) } // Recycle returns freed pages to the pool. func (r *RuntimeAllocator) Recycle() { r.pool = append(r.pool, r.freed...) r.freed = r.freed[:0] } // Drain empties the pool. func (r *RuntimeAllocator) Drain() { r.Recycle() for i, ptes := range r.pool { // Zap the entry in the underlying array to ensure that it can // be properly garbage collected. r.pool[i] = nil // Similarly, free the reference held by the used map (these // also apply for the pool entries). delete(r.used, ptes) } r.pool = r.pool[:0] } // NewPTEs implements Allocator.NewPTEs. // // Note that the "physical" address here is actually the virtual address of the // PTEs structure. The entries are tracked only to avoid garbage collection. // // This is guaranteed not to split as long as the pool is sufficiently full. // //go:nosplit func (r *RuntimeAllocator) NewPTEs() *PTEs { // Pull from the pool if we can. if len(r.pool) > 0 { ptes := r.pool[len(r.pool)-1] r.pool = r.pool[:len(r.pool)-1] return ptes } // Allocate a new entry. ptes := newAlignedPTEs() r.used[ptes] = struct{}{} return ptes } // PhysicalFor returns the physical address for the given PTEs. // //go:nosplit func (r *RuntimeAllocator) PhysicalFor(ptes *PTEs) uintptr { return physicalFor(ptes) } // LookupPTEs implements Allocator.LookupPTEs. // //go:nosplit func (r *RuntimeAllocator) LookupPTEs(physical uintptr) *PTEs { return fromPhysical(physical) } // FreePTEs implements Allocator.FreePTEs. // //go:nosplit func (r *RuntimeAllocator) FreePTEs(ptes *PTEs) { r.freed = append(r.freed, ptes) } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/allocator_unsafe.go000066400000000000000000000027151465435605700264270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pagetables import ( "unsafe" "gvisor.dev/gvisor/pkg/hostarch" ) // newAlignedPTEs returns a set of aligned PTEs. 
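// If the runtime allocation is already page-aligned it is returned as is;
// otherwise the function over-allocates (2*PageSize - 1 bytes) and returns a
// pointer rounded up to the next page boundary. For example, with 4KiB pages
// a backing array starting at 0x...1010 gives offset 0xff0, so the returned
// PTEs begin at 0x...2000.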
func newAlignedPTEs() *PTEs { ptes := new(PTEs) offset := physicalFor(ptes) & (hostarch.PageSize - 1) if offset == 0 { // Already aligned. return ptes } // Need to force an aligned allocation. unaligned := make([]byte, (2*hostarch.PageSize)-1) offset = uintptr(unsafe.Pointer(&unaligned[0])) & (hostarch.PageSize - 1) if offset != 0 { offset = hostarch.PageSize - offset } return (*PTEs)(unsafe.Pointer(&unaligned[offset])) } // physicalFor returns the "physical" address for PTEs. // //go:nosplit func physicalFor(ptes *PTEs) uintptr { return uintptr(unsafe.Pointer(ptes)) } // fromPhysical returns the PTEs from the "physical" address. // //go:nosplit func fromPhysical(physical uintptr) *PTEs { return (*PTEs)(unsafe.Pointer(physical)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/pagetables.go000066400000000000000000000216051465435605700252140ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package pagetables provides a generic implementation of pagetables. // // The core functions must be safe to call from a nosplit context. Furthermore, // this pagetables implementation goes to lengths to ensure that all functions // are free from runtime allocation. Calls to NewPTEs/FreePTEs may be made // during walks, but these can be cached elsewhere if required. package pagetables import ( "gvisor.dev/gvisor/pkg/hostarch" ) // PageTables is a set of page tables. type PageTables struct { // Allocator is used to allocate nodes. Allocator Allocator // root is the pagetable root. // // For same archs such as amd64, the upper of the PTEs is cloned // from and owned by upperSharedPageTables which are shared among // many PageTables if upperSharedPageTables is not nil. root *PTEs // rootPhysical is the cached physical address of the root. // // This is saved only to prevent constant translation. rootPhysical uintptr // archPageTables includes architecture-specific features. archPageTables // upperSharedPageTables represents a read-only shared upper // of the Pagetable. When it is not nil, the upper is not // allowed to be modified. upperSharedPageTables *PageTables // upperStart is the start address of the upper portion that // are shared from upperSharedPageTables upperStart uintptr // readOnlyShared indicates the Pagetables are read-only and // own the ranges that are shared with other Pagetables. readOnlyShared bool } // Init initializes a set of PageTables. // // +checkescape:hard,stack // //go:nosplit func (p *PageTables) Init(allocator Allocator) { p.Allocator = allocator p.root = p.Allocator.NewPTEs() p.rootPhysical = p.Allocator.PhysicalFor(p.root) } // NewWithUpper returns new PageTables. // // upperSharedPageTables are used for mapping the upper of addresses, // starting at upperStart. These pageTables should not be touched (as // invalidations may be incorrect) after they are passed as an // upperSharedPageTables. Only when all dependent PageTables are gone // may they be used. 
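// A typical construction (a sketch only; the mapping step is elided and the
// allocator choice is illustrative, not mandated by this package) is to
// populate one set of tables, mark it read-only shared, and then derive
// per-address-space tables from it:
//
//	kernel := New(NewRuntimeAllocator())
//	// ... map the kernel's upper-half ranges ...
//	kernel.MarkReadOnlyShared()
//	user := NewWithUpper(NewRuntimeAllocator(), kernel, upperBottom)
//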
The intenteded use case is for kernel page tables, // which are static and fixed. // // Precondition: upperStart must be between canonical ranges. // Precondition: upperStart must be pgdSize aligned. // precondition: upperSharedPageTables must be marked read-only shared. func NewWithUpper(a Allocator, upperSharedPageTables *PageTables, upperStart uintptr) *PageTables { p := new(PageTables) p.Init(a) if upperSharedPageTables != nil { if !upperSharedPageTables.readOnlyShared { panic("Only read-only shared pagetables can be used as upper") } p.upperSharedPageTables = upperSharedPageTables p.upperStart = upperStart } p.InitArch(a) return p } // New returns new PageTables. func New(a Allocator) *PageTables { return NewWithUpper(a, nil, 0) } // mapVisitor is used for map. type mapVisitor struct { target uintptr // Input. physical uintptr // Input. opts MapOpts // Input. prev bool // Output. } // visit is used for map. // //go:nosplit func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) bool { p := v.physical + (start - uintptr(v.target)) if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) { v.prev = true } if p&align != 0 { // We will install entries at a smaller granulaity if we don't // install a valid entry here, however we must zap any existing // entry to ensure this happens. pte.Clear() return true } pte.Set(p, v.opts) return true } //go:nosplit func (*mapVisitor) requiresAlloc() bool { return true } //go:nosplit func (*mapVisitor) requiresSplit() bool { return true } // Map installs a mapping with the given physical address. // // True is returned iff there was a previous mapping in the range. // // Precondition: addr & length must be page-aligned, their sum must not overflow. // // +checkescape:hard,stack // //go:nosplit func (p *PageTables) Map(addr hostarch.Addr, length uintptr, opts MapOpts, physical uintptr) bool { if p.readOnlyShared { panic("Should not modify read-only shared pagetables.") } if uintptr(addr)+length < uintptr(addr) { panic("addr & length overflow") } if p.upperSharedPageTables != nil { // ignore change to the read-only upper shared portion. if uintptr(addr) >= p.upperStart { return false } if uintptr(addr)+length > p.upperStart { length = p.upperStart - uintptr(addr) } } w := mapWalker{ pageTables: p, visitor: mapVisitor{ target: uintptr(addr), physical: physical, opts: opts, }, } w.iterateRange(uintptr(addr), uintptr(addr)+length) return w.visitor.prev } // unmapVisitor is used for unmap. type unmapVisitor struct { count int } //go:nosplit func (*unmapVisitor) requiresAlloc() bool { return false } //go:nosplit func (*unmapVisitor) requiresSplit() bool { return true } // visit unmaps the given entry. // //go:nosplit func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) bool { pte.Clear() v.count++ return true } // Unmap unmaps the given range. // // True is returned iff there was a previous mapping in the range. // // Precondition: addr & length must be page-aligned, their sum must not overflow. // // +checkescape:hard,stack // //go:nosplit func (p *PageTables) Unmap(addr hostarch.Addr, length uintptr) bool { if p.readOnlyShared { panic("Should not modify read-only shared pagetables.") } if uintptr(addr)+length < uintptr(addr) { panic("addr & length overflow") } if p.upperSharedPageTables != nil { // ignore change to the read-only upper shared portion. 
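// As in Map above, addresses at or above upperStart belong to the shared
// upper tables and are left untouched: a request entirely within the upper
// half is a no-op, and one that crosses upperStart is truncated at that
// boundary so only the lower portion is unmapped.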
if uintptr(addr) >= p.upperStart { return false } if uintptr(addr)+length > p.upperStart { length = p.upperStart - uintptr(addr) } } w := unmapWalker{ pageTables: p, visitor: unmapVisitor{ count: 0, }, } w.iterateRange(uintptr(addr), uintptr(addr)+length) return w.visitor.count > 0 } // emptyVisitor is used for emptiness checks. type emptyVisitor struct { count int } //go:nosplit func (*emptyVisitor) requiresAlloc() bool { return false } //go:nosplit func (*emptyVisitor) requiresSplit() bool { return false } // visit unmaps the given entry. // //go:nosplit func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) bool { v.count++ return true } // IsEmpty checks if the given range is empty. // // Precondition: addr & length must be page-aligned. // // +checkescape:hard,stack // //go:nosplit func (p *PageTables) IsEmpty(addr hostarch.Addr, length uintptr) bool { w := emptyWalker{ pageTables: p, } w.iterateRange(uintptr(addr), uintptr(addr)+length) return w.visitor.count == 0 } // lookupVisitor is used for lookup. type lookupVisitor struct { target uintptr // Input & Output. findFirst bool // Input. physical uintptr // Output. size uintptr // Output. opts MapOpts // Output. } // visit matches the given address. // //go:nosplit func (v *lookupVisitor) visit(start uintptr, pte *PTE, align uintptr) bool { if !pte.Valid() { // If looking for the first, then we just keep iterating until // we find a valid entry. return v.findFirst } // Is this within the current range? v.target = start v.physical = pte.Address() v.size = (align + 1) v.opts = pte.Opts() return false } //go:nosplit func (*lookupVisitor) requiresAlloc() bool { return false } //go:nosplit func (*lookupVisitor) requiresSplit() bool { return false } // Lookup returns the physical address for the given virtual address. // // If findFirst is true, then the next valid address after addr is returned. // If findFirst is false, then only a mapping for addr will be returned. // // Note that if size is zero, then no matching entry was found. // // +checkescape:hard,stack // //go:nosplit func (p *PageTables) Lookup(addr hostarch.Addr, findFirst bool) (virtual hostarch.Addr, physical, size uintptr, opts MapOpts) { mask := uintptr(hostarch.PageSize - 1) addr &^= hostarch.Addr(mask) w := lookupWalker{ pageTables: p, visitor: lookupVisitor{ target: uintptr(addr), findFirst: findFirst, }, } end := ^hostarch.Addr(0) &^ hostarch.Addr(mask) if !findFirst { end = addr + 1 } w.iterateRange(uintptr(addr), uintptr(end)) return hostarch.Addr(w.visitor.target), w.visitor.physical, w.visitor.size, w.visitor.opts } // MarkReadOnlyShared marks the pagetables read-only and can be shared. // // It is usually used on the pagetables that are used as the upper func (p *PageTables) MarkReadOnlyShared() { p.readOnlyShared = true } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/pagetables_aarch64.go000066400000000000000000000115371465435605700265270ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package pagetables import ( "sync/atomic" "gvisor.dev/gvisor/pkg/hostarch" ) // archPageTables is architecture-specific data. type archPageTables struct { // root is the pagetable root for kernel space. root *PTEs // rootPhysical is the cached physical address of the root. // // This is saved only to prevent constant translation. rootPhysical uintptr asid uint16 } // TTBR0_EL1 returns the translation table base register 0. // //go:nosplit func (p *PageTables) TTBR0_EL1(noFlush bool, asid uint16) uint64 { return uint64(p.rootPhysical) | (uint64(asid)&ttbrASIDMask)<= upperBottom { return entriesPerPage/2 + (upperStart-upperBottom)/pgdSize } if upperStart < lowerTop { return upperStart / pgdSize } panic("upperStart should be in canonical range") } // cloneUpperShared clone the upper from the upper shared page tables. // //go:nosplit func (p *PageTables) cloneUpperShared() { start := pgdIndex(p.upperStart) copy(p.root[start:entriesPerPage], p.upperSharedPageTables.root[start:entriesPerPage]) } // PTEs is a collection of entries. type PTEs [entriesPerPage]PTE golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/pagetables_amd64_state_autogen.go000066400000000000000000000001361465435605700311250ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 // +build amd64 package pagetables golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/pagetables_arm64.go000066400000000000000000000040151465435605700262210ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pagetables // Address constraints. // // The lowerTop and upperBottom currently apply to four-level pagetables; // additional refactoring would be necessary to support five-level pagetables. const ( lowerTop = 0x0000ffffffffffff upperBottom = 0xffff000000000000 pteShift = 12 pmdShift = 21 pudShift = 30 pgdShift = 39 pteMask = 0x1ff << pteShift pmdMask = 0x1ff << pmdShift pudMask = 0x1ff << pudShift pgdMask = 0x1ff << pgdShift pteSize = 1 << pteShift pmdSize = 1 << pmdShift pudSize = 1 << pudShift pgdSize = 1 << pgdShift ttbrASIDOffset = 48 ttbrASIDMask = 0xff entriesPerPage = 512 ) // InitArch does some additional initialization related to the architecture. // // +checkescape:hard,stack // //go:nosplit func (p *PageTables) InitArch(allocator Allocator) { if p.upperSharedPageTables != nil { p.cloneUpperShared() } else { p.archPageTables.root = p.Allocator.NewPTEs() p.archPageTables.rootPhysical = p.Allocator.PhysicalFor(p.archPageTables.root) } } // cloneUpperShared clone the upper from the upper shared page tables. 
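// (Unlike the amd64 variant above, which copies the shared pgd entries into
// its single root page, the arm64 kernel half lives in a separate
// archPageTables.root that is installed via TTBR1_EL1, so sharing the upper
// simply aliases that root and its cached physical address.)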
// //go:nosplit func (p *PageTables) cloneUpperShared() { if p.upperStart != upperBottom { panic("upperStart should be the same as upperBottom") } p.archPageTables.root = p.upperSharedPageTables.archPageTables.root p.archPageTables.rootPhysical = p.upperSharedPageTables.archPageTables.rootPhysical } // PTEs is a collection of entries. type PTEs [entriesPerPage]PTE golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/pagetables_arm64_state_autogen.go000066400000000000000000000001361465435605700311430ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package pagetables golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/pagetables_state_autogen.go000066400000000000000000000000741465435605700301330ustar00rootroot00000000000000// automatically generated by stateify. package pagetables golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/pagetables_unsafe_state_autogen.go000066400000000000000000000000741465435605700314740ustar00rootroot00000000000000// automatically generated by stateify. package pagetables golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/pagetables_x86.go000066400000000000000000000105221465435605700257150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build 386 || amd64 // +build 386 amd64 package pagetables import ( "sync/atomic" "gvisor.dev/gvisor/pkg/hostarch" ) // archPageTables is architecture-specific data. type archPageTables struct { // pcid is the value assigned by PCIDs.Assign. // // Note that zero is a valid PCID. pcid uint16 } // CR3 returns the CR3 value for these tables. // // This may be called in interrupt contexts. A PCID of zero always implies a // flush and should be passed when PCIDs are not enabled. See pcids_x86.go for // more information. // //go:nosplit func (p *PageTables) CR3(noFlush bool, pcid uint16) uint64 { // Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1). const noFlushBit uint64 = 0x8000000000000000 if noFlush && pcid != 0 { return noFlushBit | uint64(p.rootPhysical) | uint64(pcid) } return uint64(p.rootPhysical) | uint64(pcid) } // Bits in page table entries. const ( present = 0x001 writable = 0x002 user = 0x004 writeThrough = 0x008 cacheDisable = 0x010 accessed = 0x020 dirty = 0x040 super = 0x080 global = 0x100 optionMask = executeDisable | 0xfff ) // MapOpts are x86 options. type MapOpts struct { // AccessType defines permissions. AccessType hostarch.AccessType // Global indicates the page is globally accessible. Global bool // User indicates the page is a user page. User bool } // PTE is a page table entry. type PTE uintptr // Clear clears this PTE, including super page information. // //go:nosplit func (p *PTE) Clear() { atomic.StoreUintptr((*uintptr)(p), 0) } // Valid returns true iff this entry is valid. // //go:nosplit func (p *PTE) Valid() bool { return atomic.LoadUintptr((*uintptr)(p))&present != 0 } // Opts returns the PTE options. // // These are all options except Valid and Super. 
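// The decoding below inverts the NX bit (executeDisable, defined with the
// other amd64-specific constants outside this file) to report Execute, maps
// present/writable to Read/Write, and reports the user and global bits
// directly.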
// //go:nosplit func (p *PTE) Opts() MapOpts { v := atomic.LoadUintptr((*uintptr)(p)) return MapOpts{ AccessType: hostarch.AccessType{ Read: v&present != 0, Write: v&writable != 0, Execute: v&executeDisable == 0, }, Global: v&global != 0, User: v&user != 0, } } // SetSuper sets this page as a super page. // // The page must not be valid or a panic will result. // //go:nosplit func (p *PTE) SetSuper() { if p.Valid() { // This is not allowed. panic("SetSuper called on valid page!") } atomic.StoreUintptr((*uintptr)(p), super) } // IsSuper returns true iff this page is a super page. // //go:nosplit func (p *PTE) IsSuper() bool { return atomic.LoadUintptr((*uintptr)(p))&super != 0 } // Set sets this PTE value. // // This does not change the super page property. // //go:nosplit func (p *PTE) Set(addr uintptr, opts MapOpts) { if !opts.AccessType.Any() { p.Clear() return } v := (addr &^ optionMask) if opts.AccessType.Any() { v |= present | accessed } if opts.User { v |= user } if opts.Global { v |= global } if !opts.AccessType.Execute { v |= executeDisable } if opts.AccessType.Write { v |= writable | dirty } if p.IsSuper() { // Note that this is inherited from the previous instance. Set // does not change the value of Super. See above. v |= super } atomic.StoreUintptr((*uintptr)(p), v) } // setPageTable sets this PTE value and forces the write bit and super bit to // be cleared. This is used explicitly for breaking super pages. // //go:nosplit func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) { addr := pt.Allocator.PhysicalFor(ptes) if addr&^optionMask != addr { // This should never happen. panic("unaligned physical address!") } v := addr | present | user | writable | accessed | dirty atomic.StoreUintptr((*uintptr)(p), v) } // Address extracts the address. This should only be used if Valid returns true. // //go:nosplit func (p *PTE) Address() uintptr { return atomic.LoadUintptr((*uintptr)(p)) &^ optionMask } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/pagetables_x86_state_autogen.go000066400000000000000000000002231465435605700306340ustar00rootroot00000000000000// automatically generated by stateify. //go:build (386 || amd64) && (i386 || amd64) // +build 386 amd64 // +build i386 amd64 package pagetables golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/pcids.go000066400000000000000000000053631465435605700242120ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pagetables import ( "gvisor.dev/gvisor/pkg/sync" ) // PCIDs is a simple PCID database. // // This is not protected by locks and is thus suitable for use only with a // single CPU at a time. type PCIDs struct { // mu protects below. mu sync.Mutex // cache are the assigned page tables. cache map[*PageTables]uint16 // avail are available PCIDs. avail []uint16 } // NewPCIDs returns a new PCID database. // // start is the first index to assign. Typically this will be one, as the zero // pcid will always be flushed on transition (see pagetables_x86.go). 
This may // be more than one if specific PCIDs are reserved. // // Nil is returned iff the start and size are out of range. func NewPCIDs(start, size uint16) *PCIDs { if start+uint16(size) > limitPCID { return nil // See comment. } p := &PCIDs{ cache: make(map[*PageTables]uint16), } for pcid := start; pcid < start+size; pcid++ { p.avail = append(p.avail, pcid) } return p } // Assign assigns a PCID to the given PageTables. // // This may overwrite any previous assignment provided. If this in the case, // true is returned to indicate that the PCID should be flushed. func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) { p.mu.Lock() if pcid, ok := p.cache[pt]; ok { p.mu.Unlock() return pcid, false // No flush. } // Is there something available? if len(p.avail) > 0 { pcid := p.avail[len(p.avail)-1] p.avail = p.avail[:len(p.avail)-1] p.cache[pt] = pcid // We need to flush because while this is in the available // pool, it may have been used previously. p.mu.Unlock() return pcid, true } // Evict an existing table. for old, pcid := range p.cache { delete(p.cache, old) p.cache[pt] = pcid // A flush is definitely required in this case, these page // tables may still be active. (They will just be assigned some // other PCID if and when they hit the given CPU again.) p.mu.Unlock() return pcid, true } // No PCID. p.mu.Unlock() return 0, false } // Drop drops references to a set of page tables. func (p *PCIDs) Drop(pt *PageTables) { p.mu.Lock() if pcid, ok := p.cache[pt]; ok { delete(p.cache, pt) p.avail = append(p.avail, pcid) } p.mu.Unlock() } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/pcids_aarch64.go000066400000000000000000000021621465435605700255140ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package pagetables // limitPCID is the maximum value of PCIDs. // // In VMSAv8-64, the PCID(ASID) size is an IMPLEMENTATION DEFINED choice // of 8 bits or 16 bits, and ID_AA64MMFR0_EL1.ASIDBits identifies the // supported size. When an implementation supports a 16-bit ASID, TCR_ELx.AS // selects whether the top 8 bits of the ASID are used. var limitPCID uint16 // GetASIDBits return the system ASID bits, 8 or 16 bits. func GetASIDBits() uint8 func init() { limitPCID = uint16(1)<> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { var ( pgdEntry = &w.pageTables.root[pgdIndex] pudEntries *PTEs ) if !pgdEntry.Valid() { if !w.visitor.requiresAlloc() { // Skip over this entry. start = next(start, pgdSize) continue } // Allocate a new pgd. pudEntries = w.pageTables.Allocator.NewPTEs() // escapes: depends on allocator. pgdEntry.setPageTable(w.pageTables, pudEntries) } else { pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) // escapes: see above. } // Map the next level. 
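// Each valid pgd entry is expanded into its table of pud entries;
// clearPUDEntries counts how many of them are (or become) invalid so that a
// fully empty pud page can be freed and its pgd entry cleared on the way back
// out. At this level the walker can also install or visit 1-GB super pages
// directly when the range is suitably aligned and large enough.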
clearPUDEntries := uint16(0) for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { var ( pudEntry = &pudEntries[pudIndex] pmdEntries *PTEs ) if !pudEntry.Valid() { if !w.visitor.requiresAlloc() { // Skip over this entry. clearPUDEntries++ start = next(start, pudSize) continue } // This level has 1-GB super pages. Is this // entire region at least as large as a single // PUD entry? If so, we can skip allocating a // new page for the pmd. if start&(pudSize-1) == 0 && end-start >= pudSize { pudEntry.SetSuper() if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) { return false } if pudEntry.Valid() { start = next(start, pudSize) continue } } // Allocate a new pud. pmdEntries = w.pageTables.Allocator.NewPTEs() // escapes: see above. pudEntry.setPageTable(w.pageTables, pmdEntries) } else if pudEntry.IsSuper() { // Does this page need to be split? if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < next(start, pudSize)) { // Install the relevant entries. pmdEntries = w.pageTables.Allocator.NewPTEs() // escapes: see above. for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSuper() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), pudEntry.Opts()) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { // A super page to be checked directly. if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) { return false } // Might have been cleared. if !pudEntry.Valid() { clearPUDEntries++ } // Note that the super page was changed. start = next(start, pudSize) continue } } else { pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) // escapes: see above. } // Map the next level, since this is valid. clearPMDEntries := uint16(0) for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { var ( pmdEntry = &pmdEntries[pmdIndex] pteEntries *PTEs ) if !pmdEntry.Valid() { if !w.visitor.requiresAlloc() { // Skip over this entry. clearPMDEntries++ start = next(start, pmdSize) continue } // This level has 2-MB huge pages. If this // region is continued in a single PMD entry? // As above, we can skip allocating a new page. if start&(pmdSize-1) == 0 && end-start >= pmdSize { pmdEntry.SetSuper() if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) { return false } if pmdEntry.Valid() { start = next(start, pmdSize) continue } } // Allocate a new pmd. pteEntries = w.pageTables.Allocator.NewPTEs() // escapes: see above. pmdEntry.setPageTable(w.pageTables, pteEntries) } else if pmdEntry.IsSuper() { // Does this page need to be split? if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < next(start, pmdSize)) { // Install the relevant entries. pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), pmdEntry.Opts()) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { // A huge page to be checked directly. if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) { return false } // Might have been cleared. if !pmdEntry.Valid() { clearPMDEntries++ } // Note that the huge page was changed. start = next(start, pmdSize) continue } } else { pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) // escapes: see above. } // Map the next level, since this is valid. 
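// Finally the 4-KB leaf level: entries are visited one page at a time, and
// clearPTEEntries tracks how many remain invalid so that an entirely empty
// pte page can be freed and its pmd entry cleared below.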
clearPTEEntries := uint16(0) for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { var ( pteEntry = &pteEntries[pteIndex] ) if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ start += pteSize continue } // At this point, we are guaranteed that start%pteSize == 0. if !w.visitor.visit(uintptr(start&^(pteSize-1)), pteEntry, pteSize-1) { return false } if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ } // Note that the pte was changed. start += pteSize continue } // Check if we no longer need this page. if clearPTEEntries == entriesPerPage { pmdEntry.Clear() w.pageTables.Allocator.FreePTEs(pteEntries) // escapes: see above. clearPMDEntries++ } } // Check if we no longer need this page. if clearPMDEntries == entriesPerPage { pudEntry.Clear() w.pageTables.Allocator.FreePTEs(pmdEntries) // escapes: see above. clearPUDEntries++ } } // Check if we no longer need this page. if clearPUDEntries == entriesPerPage { pgdEntry.Clear() w.pageTables.Allocator.FreePTEs(pudEntries) // escapes: see above. } } return true } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/walker_arm64.go000066400000000000000000000147771465435605700254170ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package pagetables // iterateRangeCanonical walks a canonical range. // //go:nosplit func (w *Walker) iterateRangeCanonical(start, end uintptr) bool { pgdEntryIndex := w.pageTables.root if start >= upperBottom { pgdEntryIndex = w.pageTables.archPageTables.root } for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ { var ( pgdEntry = &pgdEntryIndex[pgdIndex] pudEntries *PTEs ) if !pgdEntry.Valid() { if !w.visitor.requiresAlloc() { // Skip over this entry. start = next(start, pgdSize) continue } // Allocate a new pgd. pudEntries = w.pageTables.Allocator.NewPTEs() pgdEntry.setPageTable(w.pageTables, pudEntries) } else { pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) } // Map the next level. clearPUDEntries := uint16(0) for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { var ( pudEntry = &pudEntries[pudIndex] pmdEntries *PTEs ) if !pudEntry.Valid() { if !w.visitor.requiresAlloc() { // Skip over this entry. clearPUDEntries++ start = next(start, pudSize) continue } // This level has 1-GB sect pages. Is this // entire region at least as large as a single // PUD entry? If so, we can skip allocating a // new page for the pmd. if start&(pudSize-1) == 0 && end-start >= pudSize { pudEntry.SetSect() if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) { return false } if pudEntry.Valid() { start = next(start, pudSize) continue } } // Allocate a new pud. pmdEntries = w.pageTables.Allocator.NewPTEs() pudEntry.setPageTable(w.pageTables, pmdEntries) } else if pudEntry.IsSect() { // Does this page need to be split? 
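// Splitting happens when the visitor requires it and the request covers only
// part of the 1-GB section: the section is rewritten as a freshly allocated
// table of 2-MB sect entries carrying the same options, and the walk then
// descends into that table. Otherwise the section is visited as a whole.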
if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < next(start, pudSize)) { // Install the relevant entries. pmdEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSect() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), pudEntry.Opts()) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { // A sect page to be checked directly. if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) { return false } // Might have been cleared. if !pudEntry.Valid() { clearPUDEntries++ } // Note that the sect page was changed. start = next(start, pudSize) continue } } else { pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) } // Map the next level, since this is valid. clearPMDEntries := uint16(0) for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { var ( pmdEntry = &pmdEntries[pmdIndex] pteEntries *PTEs ) if !pmdEntry.Valid() { if !w.visitor.requiresAlloc() { // Skip over this entry. clearPMDEntries++ start = next(start, pmdSize) continue } // This level has 2-MB huge pages. If this // region is continued in a single PMD entry? // As above, we can skip allocating a new page. if start&(pmdSize-1) == 0 && end-start >= pmdSize { pmdEntry.SetSect() if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) { return false } if pmdEntry.Valid() { start = next(start, pmdSize) continue } } // Allocate a new pmd. pteEntries = w.pageTables.Allocator.NewPTEs() pmdEntry.setPageTable(w.pageTables, pteEntries) } else if pmdEntry.IsSect() { // Does this page need to be split? if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < next(start, pmdSize)) { // Install the relevant entries. pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), pmdEntry.Opts()) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { // A huge page to be checked directly. if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) { return false } // Might have been cleared. if !pmdEntry.Valid() { clearPMDEntries++ } // Note that the huge page was changed. start = next(start, pmdSize) continue } } else { pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) } // Map the next level, since this is valid. clearPTEEntries := uint16(0) for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { var ( pteEntry = &pteEntries[pteIndex] ) if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ start += pteSize continue } // At this point, we are guaranteed that start%pteSize == 0. if !w.visitor.visit(uintptr(start), pteEntry, pteSize-1) { return false } if !pteEntry.Valid() { if w.visitor.requiresAlloc() { panic("PTE not set after iteration with requiresAlloc!") } clearPTEEntries++ } // Note that the pte was changed. start += pteSize continue } // Check if we no longer need this page. if clearPTEEntries == entriesPerPage { pmdEntry.Clear() w.pageTables.Allocator.FreePTEs(pteEntries) clearPMDEntries++ } } // Check if we no longer need this page. if clearPMDEntries == entriesPerPage { pudEntry.Clear() w.pageTables.Allocator.FreePTEs(pmdEntries) clearPUDEntries++ } } // Check if we no longer need this page. 
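// If every pud entry beneath this pgd entry was cleared above, the
// intermediate pud page is returned to the allocator and the pgd entry is
// zapped, mirroring the pmd- and pte-level cleanup earlier in the loop.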
if clearPUDEntries == entriesPerPage { pgdEntry.Clear() w.pageTables.Allocator.FreePTEs(pudEntries) } } return true } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/walker_empty_amd64.go000066400000000000000000000156441465435605700266110ustar00rootroot00000000000000//go:build amd64 // +build amd64 package pagetables // iterateRangeCanonical walks a canonical range. // //go:nosplit func (w *emptyWalker) iterateRangeCanonical(start, end uintptr) bool { for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { var ( pgdEntry = &w.pageTables.root[pgdIndex] pudEntries *PTEs ) if !pgdEntry.Valid() { if !w.visitor.requiresAlloc() { start = emptynext(start, pgdSize) continue } pudEntries = w.pageTables.Allocator.NewPTEs() pgdEntry.setPageTable(w.pageTables, pudEntries) } else { pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) } clearPUDEntries := uint16(0) for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { var ( pudEntry = &pudEntries[pudIndex] pmdEntries *PTEs ) if !pudEntry.Valid() { if !w.visitor.requiresAlloc() { clearPUDEntries++ start = emptynext(start, pudSize) continue } if start&(pudSize-1) == 0 && end-start >= pudSize { pudEntry.SetSuper() if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) { return false } if pudEntry.Valid() { start = emptynext(start, pudSize) continue } } pmdEntries = w.pageTables.Allocator.NewPTEs() pudEntry.setPageTable(w.pageTables, pmdEntries) } else if pudEntry.IsSuper() { if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < emptynext(start, pudSize)) { pmdEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSuper() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), pudEntry.Opts()) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) { return false } if !pudEntry.Valid() { clearPUDEntries++ } start = emptynext(start, pudSize) continue } } else { pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) } clearPMDEntries := uint16(0) for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { var ( pmdEntry = &pmdEntries[pmdIndex] pteEntries *PTEs ) if !pmdEntry.Valid() { if !w.visitor.requiresAlloc() { clearPMDEntries++ start = emptynext(start, pmdSize) continue } if start&(pmdSize-1) == 0 && end-start >= pmdSize { pmdEntry.SetSuper() if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) { return false } if pmdEntry.Valid() { start = emptynext(start, pmdSize) continue } } pteEntries = w.pageTables.Allocator.NewPTEs() pmdEntry.setPageTable(w.pageTables, pteEntries) } else if pmdEntry.IsSuper() { if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < emptynext(start, pmdSize)) { pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), pmdEntry.Opts()) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) { return false } if !pmdEntry.Valid() { clearPMDEntries++ } start = emptynext(start, pmdSize) continue } } else { pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) } clearPTEEntries := uint16(0) for pteIndex := uint16((start & pteMask) >> pteShift); 
start < end && pteIndex < entriesPerPage; pteIndex++ { var ( pteEntry = &pteEntries[pteIndex] ) if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ start += pteSize continue } if !w.visitor.visit(uintptr(start&^(pteSize-1)), pteEntry, pteSize-1) { return false } if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ } start += pteSize continue } if clearPTEEntries == entriesPerPage { pmdEntry.Clear() w.pageTables.Allocator.FreePTEs(pteEntries) clearPMDEntries++ } } if clearPMDEntries == entriesPerPage { pudEntry.Clear() w.pageTables.Allocator.FreePTEs(pmdEntries) clearPUDEntries++ } } if clearPUDEntries == entriesPerPage { pgdEntry.Clear() w.pageTables.Allocator.FreePTEs(pudEntries) } } return true } // Walker walks page tables. type emptyWalker struct { // pageTables are the tables to walk. pageTables *PageTables // Visitor is the set of arguments. visitor emptyVisitor } // iterateRange iterates over all appropriate levels of page tables for the given range. // // If requiresAlloc is true, then Set _must_ be called on all given PTEs. The // exception is super pages. If a valid super page (huge or jumbo) cannot be // installed, then the walk will continue to individual entries. // // This algorithm will attempt to maximize the use of super/sect pages whenever // possible. Whether a super page is provided will be clear through the range // provided in the callback. // // Note that if requiresAlloc is true, then no gaps will be present. However, // if alloc is not set, then the iteration will likely be full of gaps. // // Note that this function should generally be avoided in favor of Map, Unmap, // etc. when not necessary. // // Precondition: start must be page-aligned. // Precondition: start must be less than end. // Precondition: If requiresAlloc is true, then start and end should not span // non-canonical ranges. If they do, a panic will result. // //go:nosplit func (w *emptyWalker) iterateRange(start, end uintptr) { if start%pteSize != 0 { panic("unaligned start") } if end < start { panic("start > end") } if start < lowerTop { if end <= lowerTop { w.iterateRangeCanonical(start, end) } else if end > lowerTop && end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(start, lowerTop) } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } if !w.iterateRangeCanonical(start, lowerTop) { return } w.iterateRangeCanonical(upperBottom, end) } } else if start < upperBottom { if end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(upperBottom, end) } } else { w.iterateRangeCanonical(start, end) } } // next returns the next address quantized by the given size. // //go:nosplit func emptynext(start uintptr, size uintptr) uintptr { start &= ^(size - 1) start += size return start } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/walker_empty_arm64.go000066400000000000000000000160431465435605700266210ustar00rootroot00000000000000//go:build arm64 // +build arm64 package pagetables // iterateRangeCanonical walks a canonical range. 
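// This file mirrors the generic walker (see walker_generic.go further on) but
// is specialized for emptyVisitor, the visitor behind PageTables.IsEmpty;
// presumably it is generated from the generic template so that the nosplit
// walk uses direct calls instead of interface dispatch.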
// //go:nosplit func (w *emptyWalker) iterateRangeCanonical(start, end uintptr) bool { pgdEntryIndex := w.pageTables.root if start >= upperBottom { pgdEntryIndex = w.pageTables.archPageTables.root } for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ { var ( pgdEntry = &pgdEntryIndex[pgdIndex] pudEntries *PTEs ) if !pgdEntry.Valid() { if !w.visitor.requiresAlloc() { start = emptynext(start, pgdSize) continue } pudEntries = w.pageTables.Allocator.NewPTEs() pgdEntry.setPageTable(w.pageTables, pudEntries) } else { pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) } clearPUDEntries := uint16(0) for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { var ( pudEntry = &pudEntries[pudIndex] pmdEntries *PTEs ) if !pudEntry.Valid() { if !w.visitor.requiresAlloc() { clearPUDEntries++ start = emptynext(start, pudSize) continue } if start&(pudSize-1) == 0 && end-start >= pudSize { pudEntry.SetSect() if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) { return false } if pudEntry.Valid() { start = emptynext(start, pudSize) continue } } pmdEntries = w.pageTables.Allocator.NewPTEs() pudEntry.setPageTable(w.pageTables, pmdEntries) } else if pudEntry.IsSect() { if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < emptynext(start, pudSize)) { pmdEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSect() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), pudEntry.Opts()) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) { return false } if !pudEntry.Valid() { clearPUDEntries++ } start = emptynext(start, pudSize) continue } } else { pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) } clearPMDEntries := uint16(0) for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { var ( pmdEntry = &pmdEntries[pmdIndex] pteEntries *PTEs ) if !pmdEntry.Valid() { if !w.visitor.requiresAlloc() { clearPMDEntries++ start = emptynext(start, pmdSize) continue } if start&(pmdSize-1) == 0 && end-start >= pmdSize { pmdEntry.SetSect() if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) { return false } if pmdEntry.Valid() { start = emptynext(start, pmdSize) continue } } pteEntries = w.pageTables.Allocator.NewPTEs() pmdEntry.setPageTable(w.pageTables, pteEntries) } else if pmdEntry.IsSect() { if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < emptynext(start, pmdSize)) { pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), pmdEntry.Opts()) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) { return false } if !pmdEntry.Valid() { clearPMDEntries++ } start = emptynext(start, pmdSize) continue } } else { pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) } clearPTEEntries := uint16(0) for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { var ( pteEntry = &pteEntries[pteIndex] ) if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ start += pteSize continue } if !w.visitor.visit(uintptr(start), pteEntry, pteSize-1) { return false } if !pteEntry.Valid() { if w.visitor.requiresAlloc() { 
panic("PTE not set after iteration with requiresAlloc!") } clearPTEEntries++ } start += pteSize continue } if clearPTEEntries == entriesPerPage { pmdEntry.Clear() w.pageTables.Allocator.FreePTEs(pteEntries) clearPMDEntries++ } } if clearPMDEntries == entriesPerPage { pudEntry.Clear() w.pageTables.Allocator.FreePTEs(pmdEntries) clearPUDEntries++ } } if clearPUDEntries == entriesPerPage { pgdEntry.Clear() w.pageTables.Allocator.FreePTEs(pudEntries) } } return true } // Walker walks page tables. type emptyWalker struct { // pageTables are the tables to walk. pageTables *PageTables // Visitor is the set of arguments. visitor emptyVisitor } // iterateRange iterates over all appropriate levels of page tables for the given range. // // If requiresAlloc is true, then Set _must_ be called on all given PTEs. The // exception is super pages. If a valid super page (huge or jumbo) cannot be // installed, then the walk will continue to individual entries. // // This algorithm will attempt to maximize the use of super/sect pages whenever // possible. Whether a super page is provided will be clear through the range // provided in the callback. // // Note that if requiresAlloc is true, then no gaps will be present. However, // if alloc is not set, then the iteration will likely be full of gaps. // // Note that this function should generally be avoided in favor of Map, Unmap, // etc. when not necessary. // // Precondition: start must be page-aligned. // Precondition: start must be less than end. // Precondition: If requiresAlloc is true, then start and end should not span // non-canonical ranges. If they do, a panic will result. // //go:nosplit func (w *emptyWalker) iterateRange(start, end uintptr) { if start%pteSize != 0 { panic("unaligned start") } if end < start { panic("start > end") } if start < lowerTop { if end <= lowerTop { w.iterateRangeCanonical(start, end) } else if end > lowerTop && end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(start, lowerTop) } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } if !w.iterateRangeCanonical(start, lowerTop) { return } w.iterateRangeCanonical(upperBottom, end) } } else if start < upperBottom { if end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(upperBottom, end) } } else { w.iterateRangeCanonical(start, end) } } // next returns the next address quantized by the given size. // //go:nosplit func emptynext(start uintptr, size uintptr) uintptr { start &= ^(size - 1) start += size return start } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/walker_generic.go000066400000000000000000000065221465435605700260670ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pagetables // Visitor is a generic type. 
type Visitor interface { // visit is called on each PTE. The returned boolean indicates whether // the walk should continue. visit(start uintptr, pte *PTE, align uintptr) bool // requiresAlloc indicates that new entries should be allocated within // the walked range. requiresAlloc() bool // requiresSplit indicates that entries in the given range should be // split if they are huge or jumbo pages. requiresSplit() bool } // Walker walks page tables. type Walker struct { // pageTables are the tables to walk. pageTables *PageTables // Visitor is the set of arguments. visitor Visitor } // iterateRange iterates over all appropriate levels of page tables for the given range. // // If requiresAlloc is true, then Set _must_ be called on all given PTEs. The // exception is super pages. If a valid super page (huge or jumbo) cannot be // installed, then the walk will continue to individual entries. // // This algorithm will attempt to maximize the use of super/sect pages whenever // possible. Whether a super page is provided will be clear through the range // provided in the callback. // // Note that if requiresAlloc is true, then no gaps will be present. However, // if alloc is not set, then the iteration will likely be full of gaps. // // Note that this function should generally be avoided in favor of Map, Unmap, // etc. when not necessary. // // Precondition: start must be page-aligned. // Precondition: start must be less than end. // Precondition: If requiresAlloc is true, then start and end should not span // non-canonical ranges. If they do, a panic will result. // //go:nosplit func (w *Walker) iterateRange(start, end uintptr) { if start%pteSize != 0 { panic("unaligned start") } if end < start { panic("start > end") } if start < lowerTop { if end <= lowerTop { w.iterateRangeCanonical(start, end) } else if end > lowerTop && end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(start, lowerTop) } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } if !w.iterateRangeCanonical(start, lowerTop) { return } w.iterateRangeCanonical(upperBottom, end) } } else if start < upperBottom { if end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(upperBottom, end) } } else { w.iterateRangeCanonical(start, end) } } // next returns the next address quantized by the given size. // //go:nosplit func next(start uintptr, size uintptr) uintptr { start &= ^(size - 1) start += size return start } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/walker_lookup_amd64.go000066400000000000000000000156621465435605700267640ustar00rootroot00000000000000//go:build amd64 // +build amd64 package pagetables // iterateRangeCanonical walks a canonical range. 
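// lookupWalker is the specialization behind PageTables.Lookup. Because
// lookupVisitor.requiresAlloc reports false, the walk never allocates and
// simply skips invalid entries (or, when findFirst is set, keeps scanning
// until it reaches the first valid one).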
// //go:nosplit func (w *lookupWalker) iterateRangeCanonical(start, end uintptr) bool { for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { var ( pgdEntry = &w.pageTables.root[pgdIndex] pudEntries *PTEs ) if !pgdEntry.Valid() { if !w.visitor.requiresAlloc() { start = lookupnext(start, pgdSize) continue } pudEntries = w.pageTables.Allocator.NewPTEs() pgdEntry.setPageTable(w.pageTables, pudEntries) } else { pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) } clearPUDEntries := uint16(0) for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { var ( pudEntry = &pudEntries[pudIndex] pmdEntries *PTEs ) if !pudEntry.Valid() { if !w.visitor.requiresAlloc() { clearPUDEntries++ start = lookupnext(start, pudSize) continue } if start&(pudSize-1) == 0 && end-start >= pudSize { pudEntry.SetSuper() if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) { return false } if pudEntry.Valid() { start = lookupnext(start, pudSize) continue } } pmdEntries = w.pageTables.Allocator.NewPTEs() pudEntry.setPageTable(w.pageTables, pmdEntries) } else if pudEntry.IsSuper() { if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < lookupnext(start, pudSize)) { pmdEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSuper() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), pudEntry.Opts()) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) { return false } if !pudEntry.Valid() { clearPUDEntries++ } start = lookupnext(start, pudSize) continue } } else { pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) } clearPMDEntries := uint16(0) for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { var ( pmdEntry = &pmdEntries[pmdIndex] pteEntries *PTEs ) if !pmdEntry.Valid() { if !w.visitor.requiresAlloc() { clearPMDEntries++ start = lookupnext(start, pmdSize) continue } if start&(pmdSize-1) == 0 && end-start >= pmdSize { pmdEntry.SetSuper() if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) { return false } if pmdEntry.Valid() { start = lookupnext(start, pmdSize) continue } } pteEntries = w.pageTables.Allocator.NewPTEs() pmdEntry.setPageTable(w.pageTables, pteEntries) } else if pmdEntry.IsSuper() { if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < lookupnext(start, pmdSize)) { pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), pmdEntry.Opts()) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) { return false } if !pmdEntry.Valid() { clearPMDEntries++ } start = lookupnext(start, pmdSize) continue } } else { pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) } clearPTEEntries := uint16(0) for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { var ( pteEntry = &pteEntries[pteIndex] ) if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ start += pteSize continue } if !w.visitor.visit(uintptr(start&^(pteSize-1)), pteEntry, pteSize-1) { return false } if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ } start += pteSize 
continue } if clearPTEEntries == entriesPerPage { pmdEntry.Clear() w.pageTables.Allocator.FreePTEs(pteEntries) clearPMDEntries++ } } if clearPMDEntries == entriesPerPage { pudEntry.Clear() w.pageTables.Allocator.FreePTEs(pmdEntries) clearPUDEntries++ } } if clearPUDEntries == entriesPerPage { pgdEntry.Clear() w.pageTables.Allocator.FreePTEs(pudEntries) } } return true } // Walker walks page tables. type lookupWalker struct { // pageTables are the tables to walk. pageTables *PageTables // Visitor is the set of arguments. visitor lookupVisitor } // iterateRange iterates over all appropriate levels of page tables for the given range. // // If requiresAlloc is true, then Set _must_ be called on all given PTEs. The // exception is super pages. If a valid super page (huge or jumbo) cannot be // installed, then the walk will continue to individual entries. // // This algorithm will attempt to maximize the use of super/sect pages whenever // possible. Whether a super page is provided will be clear through the range // provided in the callback. // // Note that if requiresAlloc is true, then no gaps will be present. However, // if alloc is not set, then the iteration will likely be full of gaps. // // Note that this function should generally be avoided in favor of Map, Unmap, // etc. when not necessary. // // Precondition: start must be page-aligned. // Precondition: start must be less than end. // Precondition: If requiresAlloc is true, then start and end should not span // non-canonical ranges. If they do, a panic will result. // //go:nosplit func (w *lookupWalker) iterateRange(start, end uintptr) { if start%pteSize != 0 { panic("unaligned start") } if end < start { panic("start > end") } if start < lowerTop { if end <= lowerTop { w.iterateRangeCanonical(start, end) } else if end > lowerTop && end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(start, lowerTop) } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } if !w.iterateRangeCanonical(start, lowerTop) { return } w.iterateRangeCanonical(upperBottom, end) } } else if start < upperBottom { if end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(upperBottom, end) } } else { w.iterateRangeCanonical(start, end) } } // next returns the next address quantized by the given size. // //go:nosplit func lookupnext(start uintptr, size uintptr) uintptr { start &= ^(size - 1) start += size return start } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/walker_lookup_arm64.go000066400000000000000000000160611465435605700267740ustar00rootroot00000000000000//go:build arm64 // +build arm64 package pagetables // iterateRangeCanonical walks a canonical range. 
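//
// Unlike the amd64 walker, the arm64 walk selects its page-table root by
// address: ranges starting at or above upperBottom use the upper (kernel)
// root in w.pageTables.archPageTables, while lower ranges use
// w.pageTables.root.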
// //go:nosplit func (w *lookupWalker) iterateRangeCanonical(start, end uintptr) bool { pgdEntryIndex := w.pageTables.root if start >= upperBottom { pgdEntryIndex = w.pageTables.archPageTables.root } for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ { var ( pgdEntry = &pgdEntryIndex[pgdIndex] pudEntries *PTEs ) if !pgdEntry.Valid() { if !w.visitor.requiresAlloc() { start = lookupnext(start, pgdSize) continue } pudEntries = w.pageTables.Allocator.NewPTEs() pgdEntry.setPageTable(w.pageTables, pudEntries) } else { pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) } clearPUDEntries := uint16(0) for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { var ( pudEntry = &pudEntries[pudIndex] pmdEntries *PTEs ) if !pudEntry.Valid() { if !w.visitor.requiresAlloc() { clearPUDEntries++ start = lookupnext(start, pudSize) continue } if start&(pudSize-1) == 0 && end-start >= pudSize { pudEntry.SetSect() if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) { return false } if pudEntry.Valid() { start = lookupnext(start, pudSize) continue } } pmdEntries = w.pageTables.Allocator.NewPTEs() pudEntry.setPageTable(w.pageTables, pmdEntries) } else if pudEntry.IsSect() { if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < lookupnext(start, pudSize)) { pmdEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSect() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), pudEntry.Opts()) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) { return false } if !pudEntry.Valid() { clearPUDEntries++ } start = lookupnext(start, pudSize) continue } } else { pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) } clearPMDEntries := uint16(0) for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { var ( pmdEntry = &pmdEntries[pmdIndex] pteEntries *PTEs ) if !pmdEntry.Valid() { if !w.visitor.requiresAlloc() { clearPMDEntries++ start = lookupnext(start, pmdSize) continue } if start&(pmdSize-1) == 0 && end-start >= pmdSize { pmdEntry.SetSect() if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) { return false } if pmdEntry.Valid() { start = lookupnext(start, pmdSize) continue } } pteEntries = w.pageTables.Allocator.NewPTEs() pmdEntry.setPageTable(w.pageTables, pteEntries) } else if pmdEntry.IsSect() { if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < lookupnext(start, pmdSize)) { pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), pmdEntry.Opts()) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) { return false } if !pmdEntry.Valid() { clearPMDEntries++ } start = lookupnext(start, pmdSize) continue } } else { pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) } clearPTEEntries := uint16(0) for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { var ( pteEntry = &pteEntries[pteIndex] ) if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ start += pteSize continue } if !w.visitor.visit(uintptr(start), pteEntry, pteSize-1) { return false } if !pteEntry.Valid() { if 
w.visitor.requiresAlloc() { panic("PTE not set after iteration with requiresAlloc!") } clearPTEEntries++ } start += pteSize continue } if clearPTEEntries == entriesPerPage { pmdEntry.Clear() w.pageTables.Allocator.FreePTEs(pteEntries) clearPMDEntries++ } } if clearPMDEntries == entriesPerPage { pudEntry.Clear() w.pageTables.Allocator.FreePTEs(pmdEntries) clearPUDEntries++ } } if clearPUDEntries == entriesPerPage { pgdEntry.Clear() w.pageTables.Allocator.FreePTEs(pudEntries) } } return true } // Walker walks page tables. type lookupWalker struct { // pageTables are the tables to walk. pageTables *PageTables // Visitor is the set of arguments. visitor lookupVisitor } // iterateRange iterates over all appropriate levels of page tables for the given range. // // If requiresAlloc is true, then Set _must_ be called on all given PTEs. The // exception is super pages. If a valid super page (huge or jumbo) cannot be // installed, then the walk will continue to individual entries. // // This algorithm will attempt to maximize the use of super/sect pages whenever // possible. Whether a super page is provided will be clear through the range // provided in the callback. // // Note that if requiresAlloc is true, then no gaps will be present. However, // if alloc is not set, then the iteration will likely be full of gaps. // // Note that this function should generally be avoided in favor of Map, Unmap, // etc. when not necessary. // // Precondition: start must be page-aligned. // Precondition: start must be less than end. // Precondition: If requiresAlloc is true, then start and end should not span // non-canonical ranges. If they do, a panic will result. // //go:nosplit func (w *lookupWalker) iterateRange(start, end uintptr) { if start%pteSize != 0 { panic("unaligned start") } if end < start { panic("start > end") } if start < lowerTop { if end <= lowerTop { w.iterateRangeCanonical(start, end) } else if end > lowerTop && end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(start, lowerTop) } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } if !w.iterateRangeCanonical(start, lowerTop) { return } w.iterateRangeCanonical(upperBottom, end) } } else if start < upperBottom { if end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(upperBottom, end) } } else { w.iterateRangeCanonical(start, end) } } // next returns the next address quantized by the given size. // //go:nosplit func lookupnext(start uintptr, size uintptr) uintptr { start &= ^(size - 1) start += size return start } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/walker_map_amd64.go000066400000000000000000000156101465435605700262210ustar00rootroot00000000000000//go:build amd64 // +build amd64 package pagetables // iterateRangeCanonical walks a canonical range. 
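//
// The walk descends PGD -> PUD -> PMD -> PTE, where each level is a table of
// entriesPerPage entries and a suitably aligned and sized range may be
// satisfied directly by a super (huge) page entry.
//
// As a rough sketch only (not part of the generated walker set), a visitor
// compatible with the calls below would provide:
//
//	requiresAlloc() bool                 // allocate missing tables during the walk?
//	requiresSplit() bool                 // split super pages that straddle the range?
//	visit(start uintptr, pte *PTE, align uintptr) bool // returning false aborts the walk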
// //go:nosplit func (w *mapWalker) iterateRangeCanonical(start, end uintptr) bool { for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { var ( pgdEntry = &w.pageTables.root[pgdIndex] pudEntries *PTEs ) if !pgdEntry.Valid() { if !w.visitor.requiresAlloc() { start = mapnext(start, pgdSize) continue } pudEntries = w.pageTables.Allocator.NewPTEs() pgdEntry.setPageTable(w.pageTables, pudEntries) } else { pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) } clearPUDEntries := uint16(0) for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { var ( pudEntry = &pudEntries[pudIndex] pmdEntries *PTEs ) if !pudEntry.Valid() { if !w.visitor.requiresAlloc() { clearPUDEntries++ start = mapnext(start, pudSize) continue } if start&(pudSize-1) == 0 && end-start >= pudSize { pudEntry.SetSuper() if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) { return false } if pudEntry.Valid() { start = mapnext(start, pudSize) continue } } pmdEntries = w.pageTables.Allocator.NewPTEs() pudEntry.setPageTable(w.pageTables, pmdEntries) } else if pudEntry.IsSuper() { if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < mapnext(start, pudSize)) { pmdEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSuper() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), pudEntry.Opts()) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) { return false } if !pudEntry.Valid() { clearPUDEntries++ } start = mapnext(start, pudSize) continue } } else { pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) } clearPMDEntries := uint16(0) for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { var ( pmdEntry = &pmdEntries[pmdIndex] pteEntries *PTEs ) if !pmdEntry.Valid() { if !w.visitor.requiresAlloc() { clearPMDEntries++ start = mapnext(start, pmdSize) continue } if start&(pmdSize-1) == 0 && end-start >= pmdSize { pmdEntry.SetSuper() if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) { return false } if pmdEntry.Valid() { start = mapnext(start, pmdSize) continue } } pteEntries = w.pageTables.Allocator.NewPTEs() pmdEntry.setPageTable(w.pageTables, pteEntries) } else if pmdEntry.IsSuper() { if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < mapnext(start, pmdSize)) { pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), pmdEntry.Opts()) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) { return false } if !pmdEntry.Valid() { clearPMDEntries++ } start = mapnext(start, pmdSize) continue } } else { pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) } clearPTEEntries := uint16(0) for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { var ( pteEntry = &pteEntries[pteIndex] ) if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ start += pteSize continue } if !w.visitor.visit(uintptr(start&^(pteSize-1)), pteEntry, pteSize-1) { return false } if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ } start += pteSize continue } if clearPTEEntries 
== entriesPerPage { pmdEntry.Clear() w.pageTables.Allocator.FreePTEs(pteEntries) clearPMDEntries++ } } if clearPMDEntries == entriesPerPage { pudEntry.Clear() w.pageTables.Allocator.FreePTEs(pmdEntries) clearPUDEntries++ } } if clearPUDEntries == entriesPerPage { pgdEntry.Clear() w.pageTables.Allocator.FreePTEs(pudEntries) } } return true } // Walker walks page tables. type mapWalker struct { // pageTables are the tables to walk. pageTables *PageTables // Visitor is the set of arguments. visitor mapVisitor } // iterateRange iterates over all appropriate levels of page tables for the given range. // // If requiresAlloc is true, then Set _must_ be called on all given PTEs. The // exception is super pages. If a valid super page (huge or jumbo) cannot be // installed, then the walk will continue to individual entries. // // This algorithm will attempt to maximize the use of super/sect pages whenever // possible. Whether a super page is provided will be clear through the range // provided in the callback. // // Note that if requiresAlloc is true, then no gaps will be present. However, // if alloc is not set, then the iteration will likely be full of gaps. // // Note that this function should generally be avoided in favor of Map, Unmap, // etc. when not necessary. // // Precondition: start must be page-aligned. // Precondition: start must be less than end. // Precondition: If requiresAlloc is true, then start and end should not span // non-canonical ranges. If they do, a panic will result. // //go:nosplit func (w *mapWalker) iterateRange(start, end uintptr) { if start%pteSize != 0 { panic("unaligned start") } if end < start { panic("start > end") } if start < lowerTop { if end <= lowerTop { w.iterateRangeCanonical(start, end) } else if end > lowerTop && end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(start, lowerTop) } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } if !w.iterateRangeCanonical(start, lowerTop) { return } w.iterateRangeCanonical(upperBottom, end) } } else if start < upperBottom { if end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(upperBottom, end) } } else { w.iterateRangeCanonical(start, end) } } // next returns the next address quantized by the given size. // //go:nosplit func mapnext(start uintptr, size uintptr) uintptr { start &= ^(size - 1) start += size return start } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/walker_map_arm64.go000066400000000000000000000160071465435605700262400ustar00rootroot00000000000000//go:build arm64 // +build arm64 package pagetables // iterateRangeCanonical walks a canonical range. 
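//
// This is the arm64 counterpart of the map walker above: section mappings are
// tested and installed via IsSect/SetSect rather than IsSuper/SetSuper, and
// the visit callbacks receive the unmasked start address.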
// //go:nosplit func (w *mapWalker) iterateRangeCanonical(start, end uintptr) bool { pgdEntryIndex := w.pageTables.root if start >= upperBottom { pgdEntryIndex = w.pageTables.archPageTables.root } for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ { var ( pgdEntry = &pgdEntryIndex[pgdIndex] pudEntries *PTEs ) if !pgdEntry.Valid() { if !w.visitor.requiresAlloc() { start = mapnext(start, pgdSize) continue } pudEntries = w.pageTables.Allocator.NewPTEs() pgdEntry.setPageTable(w.pageTables, pudEntries) } else { pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) } clearPUDEntries := uint16(0) for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { var ( pudEntry = &pudEntries[pudIndex] pmdEntries *PTEs ) if !pudEntry.Valid() { if !w.visitor.requiresAlloc() { clearPUDEntries++ start = mapnext(start, pudSize) continue } if start&(pudSize-1) == 0 && end-start >= pudSize { pudEntry.SetSect() if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) { return false } if pudEntry.Valid() { start = mapnext(start, pudSize) continue } } pmdEntries = w.pageTables.Allocator.NewPTEs() pudEntry.setPageTable(w.pageTables, pmdEntries) } else if pudEntry.IsSect() { if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < mapnext(start, pudSize)) { pmdEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSect() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), pudEntry.Opts()) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) { return false } if !pudEntry.Valid() { clearPUDEntries++ } start = mapnext(start, pudSize) continue } } else { pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) } clearPMDEntries := uint16(0) for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { var ( pmdEntry = &pmdEntries[pmdIndex] pteEntries *PTEs ) if !pmdEntry.Valid() { if !w.visitor.requiresAlloc() { clearPMDEntries++ start = mapnext(start, pmdSize) continue } if start&(pmdSize-1) == 0 && end-start >= pmdSize { pmdEntry.SetSect() if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) { return false } if pmdEntry.Valid() { start = mapnext(start, pmdSize) continue } } pteEntries = w.pageTables.Allocator.NewPTEs() pmdEntry.setPageTable(w.pageTables, pteEntries) } else if pmdEntry.IsSect() { if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < mapnext(start, pmdSize)) { pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), pmdEntry.Opts()) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) { return false } if !pmdEntry.Valid() { clearPMDEntries++ } start = mapnext(start, pmdSize) continue } } else { pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) } clearPTEEntries := uint16(0) for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { var ( pteEntry = &pteEntries[pteIndex] ) if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ start += pteSize continue } if !w.visitor.visit(uintptr(start), pteEntry, pteSize-1) { return false } if !pteEntry.Valid() { if w.visitor.requiresAlloc() { panic("PTE not set after 
iteration with requiresAlloc!") } clearPTEEntries++ } start += pteSize continue } if clearPTEEntries == entriesPerPage { pmdEntry.Clear() w.pageTables.Allocator.FreePTEs(pteEntries) clearPMDEntries++ } } if clearPMDEntries == entriesPerPage { pudEntry.Clear() w.pageTables.Allocator.FreePTEs(pmdEntries) clearPUDEntries++ } } if clearPUDEntries == entriesPerPage { pgdEntry.Clear() w.pageTables.Allocator.FreePTEs(pudEntries) } } return true } // Walker walks page tables. type mapWalker struct { // pageTables are the tables to walk. pageTables *PageTables // Visitor is the set of arguments. visitor mapVisitor } // iterateRange iterates over all appropriate levels of page tables for the given range. // // If requiresAlloc is true, then Set _must_ be called on all given PTEs. The // exception is super pages. If a valid super page (huge or jumbo) cannot be // installed, then the walk will continue to individual entries. // // This algorithm will attempt to maximize the use of super/sect pages whenever // possible. Whether a super page is provided will be clear through the range // provided in the callback. // // Note that if requiresAlloc is true, then no gaps will be present. However, // if alloc is not set, then the iteration will likely be full of gaps. // // Note that this function should generally be avoided in favor of Map, Unmap, // etc. when not necessary. // // Precondition: start must be page-aligned. // Precondition: start must be less than end. // Precondition: If requiresAlloc is true, then start and end should not span // non-canonical ranges. If they do, a panic will result. // //go:nosplit func (w *mapWalker) iterateRange(start, end uintptr) { if start%pteSize != 0 { panic("unaligned start") } if end < start { panic("start > end") } if start < lowerTop { if end <= lowerTop { w.iterateRangeCanonical(start, end) } else if end > lowerTop && end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(start, lowerTop) } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } if !w.iterateRangeCanonical(start, lowerTop) { return } w.iterateRangeCanonical(upperBottom, end) } } else if start < upperBottom { if end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(upperBottom, end) } } else { w.iterateRangeCanonical(start, end) } } // next returns the next address quantized by the given size. // //go:nosplit func mapnext(start uintptr, size uintptr) uintptr { start &= ^(size - 1) start += size return start } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/walker_unmap_amd64.go000066400000000000000000000156441465435605700265730ustar00rootroot00000000000000//go:build amd64 // +build amd64 package pagetables // iterateRangeCanonical walks a canonical range. 
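//
// Intermediate tables are reclaimed as the walk unwinds: once every entry of
// a PTE, PMD, or PUD table has been cleared, the table is released back to
// the Allocator with FreePTEs and the parent entry is cleared as well.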
// //go:nosplit func (w *unmapWalker) iterateRangeCanonical(start, end uintptr) bool { for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { var ( pgdEntry = &w.pageTables.root[pgdIndex] pudEntries *PTEs ) if !pgdEntry.Valid() { if !w.visitor.requiresAlloc() { start = unmapnext(start, pgdSize) continue } pudEntries = w.pageTables.Allocator.NewPTEs() pgdEntry.setPageTable(w.pageTables, pudEntries) } else { pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) } clearPUDEntries := uint16(0) for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { var ( pudEntry = &pudEntries[pudIndex] pmdEntries *PTEs ) if !pudEntry.Valid() { if !w.visitor.requiresAlloc() { clearPUDEntries++ start = unmapnext(start, pudSize) continue } if start&(pudSize-1) == 0 && end-start >= pudSize { pudEntry.SetSuper() if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) { return false } if pudEntry.Valid() { start = unmapnext(start, pudSize) continue } } pmdEntries = w.pageTables.Allocator.NewPTEs() pudEntry.setPageTable(w.pageTables, pmdEntries) } else if pudEntry.IsSuper() { if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < unmapnext(start, pudSize)) { pmdEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSuper() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), pudEntry.Opts()) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) { return false } if !pudEntry.Valid() { clearPUDEntries++ } start = unmapnext(start, pudSize) continue } } else { pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) } clearPMDEntries := uint16(0) for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { var ( pmdEntry = &pmdEntries[pmdIndex] pteEntries *PTEs ) if !pmdEntry.Valid() { if !w.visitor.requiresAlloc() { clearPMDEntries++ start = unmapnext(start, pmdSize) continue } if start&(pmdSize-1) == 0 && end-start >= pmdSize { pmdEntry.SetSuper() if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) { return false } if pmdEntry.Valid() { start = unmapnext(start, pmdSize) continue } } pteEntries = w.pageTables.Allocator.NewPTEs() pmdEntry.setPageTable(w.pageTables, pteEntries) } else if pmdEntry.IsSuper() { if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < unmapnext(start, pmdSize)) { pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), pmdEntry.Opts()) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) { return false } if !pmdEntry.Valid() { clearPMDEntries++ } start = unmapnext(start, pmdSize) continue } } else { pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) } clearPTEEntries := uint16(0) for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { var ( pteEntry = &pteEntries[pteIndex] ) if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ start += pteSize continue } if !w.visitor.visit(uintptr(start&^(pteSize-1)), pteEntry, pteSize-1) { return false } if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ } start += pteSize continue 
} if clearPTEEntries == entriesPerPage { pmdEntry.Clear() w.pageTables.Allocator.FreePTEs(pteEntries) clearPMDEntries++ } } if clearPMDEntries == entriesPerPage { pudEntry.Clear() w.pageTables.Allocator.FreePTEs(pmdEntries) clearPUDEntries++ } } if clearPUDEntries == entriesPerPage { pgdEntry.Clear() w.pageTables.Allocator.FreePTEs(pudEntries) } } return true } // Walker walks page tables. type unmapWalker struct { // pageTables are the tables to walk. pageTables *PageTables // Visitor is the set of arguments. visitor unmapVisitor } // iterateRange iterates over all appropriate levels of page tables for the given range. // // If requiresAlloc is true, then Set _must_ be called on all given PTEs. The // exception is super pages. If a valid super page (huge or jumbo) cannot be // installed, then the walk will continue to individual entries. // // This algorithm will attempt to maximize the use of super/sect pages whenever // possible. Whether a super page is provided will be clear through the range // provided in the callback. // // Note that if requiresAlloc is true, then no gaps will be present. However, // if alloc is not set, then the iteration will likely be full of gaps. // // Note that this function should generally be avoided in favor of Map, Unmap, // etc. when not necessary. // // Precondition: start must be page-aligned. // Precondition: start must be less than end. // Precondition: If requiresAlloc is true, then start and end should not span // non-canonical ranges. If they do, a panic will result. // //go:nosplit func (w *unmapWalker) iterateRange(start, end uintptr) { if start%pteSize != 0 { panic("unaligned start") } if end < start { panic("start > end") } if start < lowerTop { if end <= lowerTop { w.iterateRangeCanonical(start, end) } else if end > lowerTop && end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(start, lowerTop) } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } if !w.iterateRangeCanonical(start, lowerTop) { return } w.iterateRangeCanonical(upperBottom, end) } } else if start < upperBottom { if end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(upperBottom, end) } } else { w.iterateRangeCanonical(start, end) } } // next returns the next address quantized by the given size. // //go:nosplit func unmapnext(start uintptr, size uintptr) uintptr { start &= ^(size - 1) start += size return start } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/pagetables/walker_unmap_arm64.go000066400000000000000000000160431465435605700266030ustar00rootroot00000000000000//go:build arm64 // +build arm64 package pagetables // iterateRangeCanonical walks a canonical range. 
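//
// Unlike its amd64 counterpart, this generated walker also verifies that a
// visitor with requiresAlloc left every visited PTE valid, panicking if one
// remains unset after the visit.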
// //go:nosplit func (w *unmapWalker) iterateRangeCanonical(start, end uintptr) bool { pgdEntryIndex := w.pageTables.root if start >= upperBottom { pgdEntryIndex = w.pageTables.archPageTables.root } for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ { var ( pgdEntry = &pgdEntryIndex[pgdIndex] pudEntries *PTEs ) if !pgdEntry.Valid() { if !w.visitor.requiresAlloc() { start = unmapnext(start, pgdSize) continue } pudEntries = w.pageTables.Allocator.NewPTEs() pgdEntry.setPageTable(w.pageTables, pudEntries) } else { pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) } clearPUDEntries := uint16(0) for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { var ( pudEntry = &pudEntries[pudIndex] pmdEntries *PTEs ) if !pudEntry.Valid() { if !w.visitor.requiresAlloc() { clearPUDEntries++ start = unmapnext(start, pudSize) continue } if start&(pudSize-1) == 0 && end-start >= pudSize { pudEntry.SetSect() if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) { return false } if pudEntry.Valid() { start = unmapnext(start, pudSize) continue } } pmdEntries = w.pageTables.Allocator.NewPTEs() pudEntry.setPageTable(w.pageTables, pmdEntries) } else if pudEntry.IsSect() { if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < unmapnext(start, pudSize)) { pmdEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pmdEntries[index].SetSect() pmdEntries[index].Set( pudEntry.Address()+(pmdSize*uintptr(index)), pudEntry.Opts()) } pudEntry.setPageTable(w.pageTables, pmdEntries) } else { if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) { return false } if !pudEntry.Valid() { clearPUDEntries++ } start = unmapnext(start, pudSize) continue } } else { pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) } clearPMDEntries := uint16(0) for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { var ( pmdEntry = &pmdEntries[pmdIndex] pteEntries *PTEs ) if !pmdEntry.Valid() { if !w.visitor.requiresAlloc() { clearPMDEntries++ start = unmapnext(start, pmdSize) continue } if start&(pmdSize-1) == 0 && end-start >= pmdSize { pmdEntry.SetSect() if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) { return false } if pmdEntry.Valid() { start = unmapnext(start, pmdSize) continue } } pteEntries = w.pageTables.Allocator.NewPTEs() pmdEntry.setPageTable(w.pageTables, pteEntries) } else if pmdEntry.IsSect() { if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < unmapnext(start, pmdSize)) { pteEntries = w.pageTables.Allocator.NewPTEs() for index := uint16(0); index < entriesPerPage; index++ { pteEntries[index].Set( pmdEntry.Address()+(pteSize*uintptr(index)), pmdEntry.Opts()) } pmdEntry.setPageTable(w.pageTables, pteEntries) } else { if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) { return false } if !pmdEntry.Valid() { clearPMDEntries++ } start = unmapnext(start, pmdSize) continue } } else { pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) } clearPTEEntries := uint16(0) for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { var ( pteEntry = &pteEntries[pteIndex] ) if !pteEntry.Valid() && !w.visitor.requiresAlloc() { clearPTEEntries++ start += pteSize continue } if !w.visitor.visit(uintptr(start), pteEntry, pteSize-1) { return false } if !pteEntry.Valid() { if w.visitor.requiresAlloc() { 
panic("PTE not set after iteration with requiresAlloc!") } clearPTEEntries++ } start += pteSize continue } if clearPTEEntries == entriesPerPage { pmdEntry.Clear() w.pageTables.Allocator.FreePTEs(pteEntries) clearPMDEntries++ } } if clearPMDEntries == entriesPerPage { pudEntry.Clear() w.pageTables.Allocator.FreePTEs(pmdEntries) clearPUDEntries++ } } if clearPUDEntries == entriesPerPage { pgdEntry.Clear() w.pageTables.Allocator.FreePTEs(pudEntries) } } return true } // Walker walks page tables. type unmapWalker struct { // pageTables are the tables to walk. pageTables *PageTables // Visitor is the set of arguments. visitor unmapVisitor } // iterateRange iterates over all appropriate levels of page tables for the given range. // // If requiresAlloc is true, then Set _must_ be called on all given PTEs. The // exception is super pages. If a valid super page (huge or jumbo) cannot be // installed, then the walk will continue to individual entries. // // This algorithm will attempt to maximize the use of super/sect pages whenever // possible. Whether a super page is provided will be clear through the range // provided in the callback. // // Note that if requiresAlloc is true, then no gaps will be present. However, // if alloc is not set, then the iteration will likely be full of gaps. // // Note that this function should generally be avoided in favor of Map, Unmap, // etc. when not necessary. // // Precondition: start must be page-aligned. // Precondition: start must be less than end. // Precondition: If requiresAlloc is true, then start and end should not span // non-canonical ranges. If they do, a panic will result. // //go:nosplit func (w *unmapWalker) iterateRange(start, end uintptr) { if start%pteSize != 0 { panic("unaligned start") } if end < start { panic("start > end") } if start < lowerTop { if end <= lowerTop { w.iterateRangeCanonical(start, end) } else if end > lowerTop && end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(start, lowerTop) } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } if !w.iterateRangeCanonical(start, lowerTop) { return } w.iterateRangeCanonical(upperBottom, end) } } else if start < upperBottom { if end <= upperBottom { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } } else { if w.visitor.requiresAlloc() { panic("alloc spans non-canonical range") } w.iterateRangeCanonical(upperBottom, end) } } else { w.iterateRangeCanonical(start, end) } } // next returns the next address quantized by the given size. // //go:nosplit func unmapnext(start uintptr, size uintptr) uintptr { start &= ^(size - 1) start += size return start } golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/ring0.go000066400000000000000000000013611465435605700220120ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package ring0 provides basic operating system-level stubs. 
package ring0 import ( // Required for facts checks. _ "gvisor.dev/gvisor/pkg/abi/linux" ) golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/ring0_amd64_state_autogen.go000066400000000000000000000002061465435605700257240ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 && amd64 && amd64 && amd64 // +build amd64,amd64,amd64,amd64 package ring0 golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/ring0_arm64_state_autogen.go000066400000000000000000000002061465435605700257420ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 && arm64 && arm64 && arm64 // +build arm64,arm64,arm64,arm64 package ring0 golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/ring0_state_autogen.go000066400000000000000000000001771465435605700247400ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 && (386 || amd64) // +build arm64 // +build 386 amd64 package ring0 golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/ring0_unsafe_state_autogen.go000066400000000000000000000000671465435605700262770ustar00rootroot00000000000000// automatically generated by stateify. package ring0 golang-gvisor-gvisor-0.0~20240729.0/pkg/ring0/x86.go000066400000000000000000000175661465435605700214360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build 386 || amd64 // +build 386 amd64 package ring0 // Useful bits. const ( _CR0_PE = 1 << 0 _CR0_ET = 1 << 4 _CR0_NE = 1 << 5 _CR0_AM = 1 << 18 _CR0_PG = 1 << 31 _CR4_PSE = 1 << 4 _CR4_PAE = 1 << 5 _CR4_PGE = 1 << 7 _CR4_OSFXSR = 1 << 9 _CR4_OSXMMEXCPT = 1 << 10 _CR4_FSGSBASE = 1 << 16 _CR4_PCIDE = 1 << 17 _CR4_OSXSAVE = 1 << 18 _CR4_SMEP = 1 << 20 _CR4_SMAP = 1 << 21 _RFLAGS_AC = 1 << 18 _RFLAGS_NT = 1 << 14 _RFLAGS_IOPL0 = 1 << 12 _RFLAGS_IOPL1 = 1 << 13 _RFLAGS_IOPL = _RFLAGS_IOPL0 | _RFLAGS_IOPL1 _RFLAGS_DF = 1 << 10 _RFLAGS_IF = 1 << 9 _RFLAGS_STEP = 1 << 8 _RFLAGS_RESERVED = 1 << 1 _EFER_SCE = 0x001 _EFER_LME = 0x100 _EFER_LMA = 0x400 _EFER_NX = 0x800 _MSR_STAR = 0xc0000081 _MSR_LSTAR = 0xc0000082 _MSR_CSTAR = 0xc0000083 _MSR_SYSCALL_MASK = 0xc0000084 _MSR_PLATFORM_INFO = 0xce _MSR_MISC_FEATURES = 0x140 _PLATFORM_INFO_CPUID_FAULT = 1 << 31 _MISC_FEATURE_CPUID_TRAP = 0x1 ) const ( // KernelFlagsSet should always be set in the kernel. KernelFlagsSet = _RFLAGS_RESERVED // UserFlagsSet are always set in userspace. // // _RFLAGS_IOPL is a set of two bits and it shows the I/O privilege // level. The Current Privilege Level (CPL) of the task must be less // than or equal to the IOPL in order for the task or program to access // I/O ports. // // Here, _RFLAGS_IOPL0 is used only to determine whether the task is // running in the kernel or userspace mode. In the user mode, the CPL is // always 3 and it doesn't matter what IOPL is set if it is below CPL. // // We need to have one bit which will be always different in user and // kernel modes. 
And we have to remember that even though we have // KernelFlagsClear, we still can see some of these flags in the kernel // mode. This can happen when the goruntime switches on a goroutine // which has been saved in the host mode. On restore, the popf // instruction is used to restore flags and this means that all flags // what the goroutine has in the host mode will be restored in the // kernel mode. // // _RFLAGS_IOPL0 is never set in host and kernel modes and we always set // it in the user mode. So if this flag is set, the task is running in // the user mode and if it isn't set, the task is running in the kernel // mode. UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF | _RFLAGS_IOPL0 // KernelFlagsClear should always be clear in the kernel. KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT // UserFlagsClear are always cleared in userspace. UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL1 ) // IsKernelFlags returns true if rflags corresponds to the kernel mode. // //go:nosplit func IsKernelFlags(rflags uint64) bool { return rflags&_RFLAGS_IOPL0 == 0 } // Vector is an exception vector. type Vector uintptr // Exception vectors. const ( DivideByZero Vector = iota Debug NMI Breakpoint Overflow BoundRangeExceeded InvalidOpcode DeviceNotAvailable DoubleFault CoprocessorSegmentOverrun InvalidTSS SegmentNotPresent StackSegmentFault GeneralProtectionFault PageFault _ X87FloatingPointException AlignmentCheck MachineCheck SIMDFloatingPointException VirtualizationException SecurityException = 0x1e SyscallInt80 = 0x80 _NR_INTERRUPTS = 0x100 ) // System call vectors. const ( Syscall Vector = _NR_INTERRUPTS ) // Selector is a segment Selector. type Selector uint16 // SegmentDescriptor is a segment descriptor. type SegmentDescriptor struct { bits [2]uint32 } // descriptorTable is a collection of descriptors. type descriptorTable [32]SegmentDescriptor // SegmentDescriptorFlags are typed flags within a descriptor. type SegmentDescriptorFlags uint32 // SegmentDescriptorFlag declarations. const ( SegmentDescriptorAccess SegmentDescriptorFlags = 1 << 8 // Access bit (always set). SegmentDescriptorWrite = 1 << 9 // Write permission. SegmentDescriptorExpandDown = 1 << 10 // Grows down, not used. SegmentDescriptorExecute = 1 << 11 // Execute permission. SegmentDescriptorSystem = 1 << 12 // Zero => system, 1 => user code/data. SegmentDescriptorPresent = 1 << 15 // Present. SegmentDescriptorAVL = 1 << 20 // Available. SegmentDescriptorLong = 1 << 21 // Long mode. SegmentDescriptorDB = 1 << 22 // 16 or 32-bit. SegmentDescriptorG = 1 << 23 // Granularity: page or byte. ) // Base returns the descriptor's base linear address. func (d *SegmentDescriptor) Base() uint32 { return d.bits[1]&0xFF000000 | (d.bits[1]&0x000000FF)<<16 | d.bits[0]>>16 } // Limit returns the descriptor size. func (d *SegmentDescriptor) Limit() uint32 { l := d.bits[0]&0xFFFF | d.bits[1]&0xF0000 if d.bits[1]&uint32(SegmentDescriptorG) != 0 { l <<= 12 l |= 0xFFF } return l } // Flags returns descriptor flags. func (d *SegmentDescriptor) Flags() SegmentDescriptorFlags { return SegmentDescriptorFlags(d.bits[1] & 0x00F09F00) } // DPL returns the descriptor privilege level. 
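// The DPL occupies bits 13 and 14 of the descriptor's high word: 0 is the
// most privileged (kernel) level and 3 the least privileged (user) level.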
func (d *SegmentDescriptor) DPL() int { return int((d.bits[1] >> 13) & 3) } func (d *SegmentDescriptor) setNull() { d.bits[0] = 0 d.bits[1] = 0 } func (d *SegmentDescriptor) set(base, limit uint32, dpl int, flags SegmentDescriptorFlags) { flags |= SegmentDescriptorPresent if limit>>12 != 0 { limit >>= 12 flags |= SegmentDescriptorG } d.bits[0] = base<<16 | limit&0xFFFF d.bits[1] = base&0xFF000000 | (base>>16)&0xFF | limit&0x000F0000 | uint32(flags) | uint32(dpl)<<13 } func (d *SegmentDescriptor) setCode32(base, limit uint32, dpl int) { d.set(base, limit, dpl, SegmentDescriptorDB| SegmentDescriptorExecute| SegmentDescriptorSystem) } func (d *SegmentDescriptor) setCode64(base, limit uint32, dpl int) { d.set(base, limit, dpl, SegmentDescriptorG| SegmentDescriptorLong| SegmentDescriptorExecute| SegmentDescriptorSystem) } func (d *SegmentDescriptor) setData(base, limit uint32, dpl int) { d.set(base, limit, dpl, SegmentDescriptorWrite| SegmentDescriptorSystem) } // setHi is only used for the TSS segment, which is magically 64-bits. func (d *SegmentDescriptor) setHi(base uint32) { d.bits[0] = base d.bits[1] = 0 } // Gate64 is a 64-bit task, trap, or interrupt gate. type Gate64 struct { bits [4]uint32 } // idt64 is a 64-bit interrupt descriptor table. type idt64 [_NR_INTERRUPTS]Gate64 func (g *Gate64) setInterrupt(cs Selector, rip uint64, dpl int, ist int) { g.bits[0] = uint32(cs)<<16 | uint32(rip)&0xFFFF g.bits[1] = uint32(rip)&0xFFFF0000 | SegmentDescriptorPresent | uint32(dpl)<<13 | 14<<8 | uint32(ist)&0x7 g.bits[2] = uint32(rip >> 32) } func (g *Gate64) setTrap(cs Selector, rip uint64, dpl int, ist int) { g.setInterrupt(cs, rip, dpl, ist) g.bits[1] |= 1 << 8 } // TaskState64 is a 64-bit task state structure. type TaskState64 struct { _ uint32 rsp0Lo, rsp0Hi uint32 rsp1Lo, rsp1Hi uint32 rsp2Lo, rsp2Hi uint32 _ [2]uint32 ist1Lo, ist1Hi uint32 ist2Lo, ist2Hi uint32 ist3Lo, ist3Hi uint32 ist4Lo, ist4Hi uint32 ist5Lo, ist5Hi uint32 ist6Lo, ist6Hi uint32 ist7Lo, ist7Hi uint32 _ [2]uint32 _ uint16 ioPerm uint16 } golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/000077500000000000000000000000001465435605700212355ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/atomic_amd64.s000066400000000000000000000133441465435605700236750ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" // handleSwapUint32Fault returns the value stored in DI. Control is transferred // to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal // number stored in DI. // // It must have the same frame configuration as swapUint32 so that it can undo // any potential call frame set up by the assembler. TEXT handleSwapUint32Fault(SB), NOSPLIT|NOFRAME, $0-24 MOVL DI, sig+20(FP) RET // swapUint32 atomically stores new into *ptr and returns (the previous ptr* // value, 0). 
If a SIGSEGV or SIGBUS signal is received during the swap, the // value of old is unspecified, and sig is the number of the signal that was // received. // // Preconditions: ptr must be aligned to a 4-byte boundary. // //func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) TEXT ·swapUint32(SB), NOSPLIT|NOFRAME, $0-24 // Store 0 as the returned signal number. If we run to completion, // this is the value the caller will see; if a signal is received, // handleSwapUint32Fault will store a different value in this address. MOVL $0, sig+20(FP) MOVQ ptr+0(FP), DI MOVL new+8(FP), AX XCHGL AX, 0(DI) MOVL AX, old+16(FP) RET // func addrOfSwapUint32() uintptr TEXT ·addrOfSwapUint32(SB), $0-8 MOVQ $·swapUint32(SB), AX MOVQ AX, ret+0(FP) RET // handleSwapUint64Fault returns the value stored in DI. Control is transferred // to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal // number stored in DI. // // It must have the same frame configuration as swapUint64 so that it can undo // any potential call frame set up by the assembler. TEXT handleSwapUint64Fault(SB), NOSPLIT|NOFRAME, $0-28 MOVL DI, sig+24(FP) RET // swapUint64 atomically stores new into *ptr and returns (the previous *ptr // value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the // value of old is unspecified, and sig is the number of the signal that was // received. // // Preconditions: ptr must be aligned to a 8-byte boundary. // //func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) TEXT ·swapUint64(SB), NOSPLIT|NOFRAME, $0-28 // Store 0 as the returned signal number. If we run to completion, // this is the value the caller will see; if a signal is received, // handleSwapUint64Fault will store a different value in this address. MOVL $0, sig+24(FP) MOVQ ptr+0(FP), DI MOVQ new+8(FP), AX XCHGQ AX, 0(DI) MOVQ AX, old+16(FP) RET // func addrOfSwapUint64() uintptr TEXT ·addrOfSwapUint64(SB), NOSPLIT|NOFRAME, $0-8 MOVQ $·swapUint64(SB), AX MOVQ AX, ret+0(FP) RET // handleCompareAndSwapUint32Fault returns the value stored in DI. Control is // transferred to it when swapUint64 below receives SIGSEGV or SIGBUS, with the // signal number stored in DI. // // It must have the same frame configuration as compareAndSwapUint32 so that it // can undo any potential call frame set up by the assembler. TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT|NOFRAME, $0-24 MOVL DI, sig+20(FP) RET // compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns // (the value previously stored at ptr, 0). If a SIGSEGV or SIGBUS signal is // received during the operation, the value of prev is unspecified, and sig is // the number of the signal that was received. // // Preconditions: ptr must be aligned to a 4-byte boundary. // //func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) TEXT ·compareAndSwapUint32(SB), NOSPLIT|NOFRAME, $0-24 // Store 0 as the returned signal number. If we run to completion, this is // the value the caller will see; if a signal is received, // handleCompareAndSwapUint32Fault will store a different value in this // address. MOVL $0, sig+20(FP) MOVQ ptr+0(FP), DI MOVL old+8(FP), AX MOVL new+12(FP), DX LOCK CMPXCHGL DX, 0(DI) MOVL AX, prev+16(FP) RET // func addrOfCompareAndSwapUint32() uintptr TEXT ·addrOfCompareAndSwapUint32(SB), NOSPLIT|NOFRAME, $0-8 MOVQ $·compareAndSwapUint32(SB), AX MOVQ AX, ret+0(FP) RET // handleLoadUint32Fault returns the value stored in DI. 
Control is transferred // to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal // number stored in DI. // // It must have the same frame configuration as loadUint32 so that it can undo // any potential call frame set up by the assembler. TEXT handleLoadUint32Fault(SB), NOSPLIT|NOFRAME, $0-16 MOVL DI, sig+12(FP) RET // loadUint32 atomically loads *ptr and returns it. If a SIGSEGV or SIGBUS // signal is received, the value returned is unspecified, and sig is the number // of the signal that was received. // // Preconditions: ptr must be aligned to a 4-byte boundary. // //func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) TEXT ·loadUint32(SB), NOSPLIT|NOFRAME, $0-16 // Store 0 as the returned signal number. If we run to completion, // this is the value the caller will see; if a signal is received, // handleLoadUint32Fault will store a different value in this address. MOVL $0, sig+12(FP) MOVQ ptr+0(FP), AX MOVL (AX), BX MOVL BX, val+8(FP) RET // func addrOfLoadUint32() uintptr TEXT ·addrOfLoadUint32(SB), NOSPLIT|NOFRAME, $0-8 MOVQ $·loadUint32(SB), AX MOVQ AX, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/atomic_arm64.s000066400000000000000000000113221465435605700237050ustar00rootroot00000000000000// Copyright 2014 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #include "textflag.h" // handleSwapUint32Fault returns the value stored in R1. Control is transferred // to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal // number stored in R1. // // It must have the same frame configuration as swapUint32 so that it can undo // any potential call frame set up by the assembler. TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24 MOVW R1, sig+20(FP) RET // See the corresponding doc in safecopy_unsafe.go // // The code is derived from Go source runtime/internal/atomic.Xchg. // //func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) TEXT ·swapUint32(SB), NOSPLIT, $0-24 // Store 0 as the returned signal number. If we run to completion, // this is the value the caller will see; if a signal is received, // handleSwapUint32Fault will store a different value in this address. MOVW $0, sig+20(FP) again: MOVD ptr+0(FP), R0 MOVW new+8(FP), R1 LDAXRW (R0), R2 STLXRW R1, (R0), R3 CBNZ R3, again MOVW R2, old+16(FP) RET // func addrOfSwapUint32() uintptr TEXT ·addrOfSwapUint32(SB), $0-8 MOVD $·swapUint32(SB), R0 MOVD R0, ret+0(FP) RET // handleSwapUint64Fault returns the value stored in R1. Control is transferred // to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal // number stored in R1. // // It must have the same frame configuration as swapUint64 so that it can undo // any potential call frame set up by the assembler. TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28 MOVW R1, sig+24(FP) RET // See the corresponding doc in safecopy_unsafe.go // // The code is derived from Go source runtime/internal/atomic.Xchg64. // //func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) TEXT ·swapUint64(SB), NOSPLIT, $0-28 // Store 0 as the returned signal number. If we run to completion, // this is the value the caller will see; if a signal is received, // handleSwapUint64Fault will store a different value in this address. 
MOVW $0, sig+24(FP) again: MOVD ptr+0(FP), R0 MOVD new+8(FP), R1 LDAXR (R0), R2 STLXR R1, (R0), R3 CBNZ R3, again MOVD R2, old+16(FP) RET // func addrOfSwapUint64() uintptr TEXT ·addrOfSwapUint64(SB), $0-8 MOVD $·swapUint64(SB), R0 MOVD R0, ret+0(FP) RET // handleCompareAndSwapUint32Fault returns the value stored in R1. Control is // transferred to it when compareAndSwapUint32 below receives SIGSEGV or SIGBUS, // with the signal number stored in R1. // // It must have the same frame configuration as compareAndSwapUint32 so that it // can undo any potential call frame set up by the assembler. TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24 MOVW R1, sig+20(FP) RET // See the corresponding doc in safecopy_unsafe.go // // The code is derived from Go source runtime/internal/atomic.Cas. // //func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 // Store 0 as the returned signal number. If we run to completion, this is // the value the caller will see; if a signal is received, // handleCompareAndSwapUint32Fault will store a different value in this // address. MOVW $0, sig+20(FP) MOVD ptr+0(FP), R0 MOVW old+8(FP), R1 MOVW new+12(FP), R2 again: LDAXRW (R0), R3 CMPW R1, R3 BNE done STLXRW R2, (R0), R4 CBNZ R4, again done: MOVW R3, prev+16(FP) RET // func addrOfCompareAndSwapUint32() uintptr TEXT ·addrOfCompareAndSwapUint32(SB), $0-8 MOVD $·compareAndSwapUint32(SB), R0 MOVD R0, ret+0(FP) RET // handleLoadUint32Fault returns the value stored in DI. Control is transferred // to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal // number stored in DI. // // It must have the same frame configuration as loadUint32 so that it can undo // any potential call frame set up by the assembler. TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 MOVW R1, sig+12(FP) RET // loadUint32 atomically loads *ptr and returns it. If a SIGSEGV or SIGBUS // signal is received, the value returned is unspecified, and sig is the number // of the signal that was received. // // Preconditions: ptr must be aligned to a 4-byte boundary. // //func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) TEXT ·loadUint32(SB), NOSPLIT, $0-16 // Store 0 as the returned signal number. If we run to completion, // this is the value the caller will see; if a signal is received, // handleLoadUint32Fault will store a different value in this address. MOVW $0, sig+12(FP) MOVD ptr+0(FP), R0 LDARW (R0), R1 MOVW R1, val+8(FP) RET // func addrOfLoadUint32() uintptr TEXT ·addrOfLoadUint32(SB), $0-8 MOVD $·loadUint32(SB), R0 MOVD R0, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/memclr_amd64.s000066400000000000000000000067601465435605700237040ustar00rootroot00000000000000// Copyright 2014 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #include "textflag.h" // handleMemclrFault returns (the value stored in AX, the value stored in DI). // Control is transferred to it when memclr below receives SIGSEGV or SIGBUS, // with the faulting address stored in AX and the signal number stored in DI. // // It must have the same frame configuration as memclr so that it can undo any // potential call frame set up by the assembler. TEXT handleMemclrFault(SB), NOSPLIT|NOFRAME, $0-28 MOVQ AX, addr+16(FP) MOVL DI, sig+24(FP) RET // memclr sets the n bytes following ptr to zeroes. 
If a SIGSEGV or SIGBUS // signal is received during the write, it returns the address that caused the // fault and the number of the signal that was received. Otherwise, it returns // an unspecified address and a signal number of 0. // // Data is written in order, such that if a fault happens at address p, it is // safe to assume that all data before p-maxRegisterSize has already been // successfully written. // // The code is derived from runtime.memclrNoHeapPointers. // // func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) TEXT ·memclr(SB), NOSPLIT|NOFRAME, $0-28 // Store 0 as the returned signal number. If we run to completion, // this is the value the caller will see; if a signal is received, // handleMemclrFault will store a different value in this address. MOVL $0, sig+24(FP) MOVQ ptr+0(FP), DI MOVQ n+8(FP), BX XORQ AX, AX // MOVOU seems always faster than REP STOSQ. tail: TESTQ BX, BX JEQ _0 CMPQ BX, $2 JBE _1or2 CMPQ BX, $4 JBE _3or4 CMPQ BX, $8 JB _5through7 JE _8 CMPQ BX, $16 JBE _9through16 PXOR X0, X0 CMPQ BX, $32 JBE _17through32 CMPQ BX, $64 JBE _33through64 CMPQ BX, $128 JBE _65through128 CMPQ BX, $256 JBE _129through256 // TODO: use branch table and BSR to make this just a single dispatch // TODO: for really big clears, use MOVNTDQ, even without AVX2. loop: MOVOU X0, 0(DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) MOVOU X0, 48(DI) MOVOU X0, 64(DI) MOVOU X0, 80(DI) MOVOU X0, 96(DI) MOVOU X0, 112(DI) MOVOU X0, 128(DI) MOVOU X0, 144(DI) MOVOU X0, 160(DI) MOVOU X0, 176(DI) MOVOU X0, 192(DI) MOVOU X0, 208(DI) MOVOU X0, 224(DI) MOVOU X0, 240(DI) SUBQ $256, BX ADDQ $256, DI CMPQ BX, $256 JAE loop JMP tail _1or2: MOVB AX, (DI) MOVB AX, -1(DI)(BX*1) RET _0: RET _3or4: MOVW AX, (DI) MOVW AX, -2(DI)(BX*1) RET _5through7: MOVL AX, (DI) MOVL AX, -4(DI)(BX*1) RET _8: // We need a separate case for 8 to make sure we clear pointers atomically. MOVQ AX, (DI) RET _9through16: MOVQ AX, (DI) MOVQ AX, -8(DI)(BX*1) RET _17through32: MOVOU X0, (DI) MOVOU X0, -16(DI)(BX*1) RET _33through64: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, -32(DI)(BX*1) MOVOU X0, -16(DI)(BX*1) RET _65through128: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) MOVOU X0, 48(DI) MOVOU X0, -64(DI)(BX*1) MOVOU X0, -48(DI)(BX*1) MOVOU X0, -32(DI)(BX*1) MOVOU X0, -16(DI)(BX*1) RET _129through256: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) MOVOU X0, 48(DI) MOVOU X0, 64(DI) MOVOU X0, 80(DI) MOVOU X0, 96(DI) MOVOU X0, 112(DI) MOVOU X0, -128(DI)(BX*1) MOVOU X0, -112(DI)(BX*1) MOVOU X0, -96(DI)(BX*1) MOVOU X0, -80(DI)(BX*1) MOVOU X0, -64(DI)(BX*1) MOVOU X0, -48(DI)(BX*1) MOVOU X0, -32(DI)(BX*1) MOVOU X0, -16(DI)(BX*1) RET // func addrOfMemclr() uintptr TEXT ·addrOfMemclr(SB), $0-8 MOVQ $·memclr(SB), AX MOVQ AX, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/memclr_arm64.s000066400000000000000000000041361465435605700237150ustar00rootroot00000000000000// Copyright 2014 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #include "textflag.h" // handleMemclrFault returns (the value stored in R0, the value stored in R1). // Control is transferred to it when memclr below receives SIGSEGV or SIGBUS, // with the faulting address stored in R0 and the signal number stored in R1. // // It must have the same frame configuration as memclr so that it can undo any // potential call frame set up by the assembler. 
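//
// As with the other fault handlers in this package, the pairing is presumed
// to work as follows: safecopy registers the address of memclr together with
// this handler, and the signal handler redirects a faulting PC inside memclr
// here, with the fault address and signal number already loaded into R0 and
// R1 as described above.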
TEXT handleMemclrFault(SB), NOSPLIT, $0-28 MOVD R0, addr+16(FP) MOVW R1, sig+24(FP) RET // See the corresponding doc in safecopy_unsafe.go // // The code is derived from runtime.memclrNoHeapPointers. // // func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) TEXT ·memclr(SB), NOSPLIT, $0-28 // Store 0 as the returned signal number. If we run to completion, // this is the value the caller will see; if a signal is received, // handleMemclrFault will store a different value in this address. MOVW $0, sig+24(FP) MOVD ptr+0(FP), R0 MOVD n+8(FP), R1 // If size is less than 16 bytes, use tail_zero to zero what remains CMP $16, R1 BLT tail_zero // Get buffer offset into 16 byte aligned address for better performance ANDS $15, R0, ZR BNE unaligned_to_16 aligned_to_16: LSR $4, R1, R2 zero_by_16: STP.P (ZR, ZR), 16(R0) // Store pair with post index. SUBS $1, R2, R2 BNE zero_by_16 ANDS $15, R1, R1 BEQ end // Zero buffer with size=R1 < 16 tail_zero: TBZ $3, R1, tail_zero_4 MOVD.P ZR, 8(R0) tail_zero_4: TBZ $2, R1, tail_zero_2 MOVW.P ZR, 4(R0) tail_zero_2: TBZ $1, R1, tail_zero_1 MOVH.P ZR, 2(R0) tail_zero_1: TBZ $0, R1, end MOVB ZR, (R0) end: RET unaligned_to_16: MOVD R0, R2 head_loop: MOVBU.P ZR, 1(R0) ANDS $15, R0, ZR BNE head_loop // Adjust length for what remains SUB R2, R0, R3 SUB R3, R1 // If size is less than 16 bytes, use tail_zero to zero what remains CMP $16, R1 BLT tail_zero B aligned_to_16 // func addrOfMemclr() uintptr TEXT ·addrOfMemclr(SB), $0-8 MOVD $·memclr(SB), R0 MOVD R0, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/memcpy_amd64.s000066400000000000000000000133661465435605700237170ustar00rootroot00000000000000// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. // Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. // Portions Copyright 2009 The Go Authors. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "textflag.h" // handleMemcpyFault returns (the value stored in AX, the value stored in DI). // Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS, // with the faulting address stored in AX and the signal number stored in DI. // // It must have the same frame configuration as memcpy so that it can undo any // potential call frame set up by the assembler. TEXT handleMemcpyFault(SB), NOSPLIT|NOFRAME, $0-36 MOVQ AX, addr+24(FP) MOVL DI, sig+32(FP) RET // memcpy copies data from src to dst. 
If a SIGSEGV or SIGBUS signal is received // during the copy, it returns the address that caused the fault and the number // of the signal that was received. Otherwise, it returns an unspecified address // and a signal number of 0. // // Data is copied in order, such that if a fault happens at address p, it is // safe to assume that all data before p-maxRegisterSize has already been // successfully copied. // // The code is derived from the forward copying part of runtime.memmove. // // func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) TEXT ·memcpy(SB), NOSPLIT|NOFRAME, $0-36 // Store 0 as the returned signal number. If we run to completion, // this is the value the caller will see; if a signal is received, // handleMemcpyFault will store a different value in this address. MOVL $0, sig+32(FP) MOVQ dst+0(FP), DI MOVQ src+8(FP), SI MOVQ n+16(FP), BX tail: // BSR+branch table make almost all memmove/memclr benchmarks worse. Not // worth doing. TESTQ BX, BX JEQ move_0 CMPQ BX, $2 JBE move_1or2 CMPQ BX, $4 JBE move_3or4 CMPQ BX, $8 JB move_5through7 JE move_8 CMPQ BX, $16 JBE move_9through16 CMPQ BX, $32 JBE move_17through32 CMPQ BX, $64 JBE move_33through64 CMPQ BX, $128 JBE move_65through128 CMPQ BX, $256 JBE move_129through256 move_257plus: SUBQ $256, BX MOVOU (SI), X0 MOVOU X0, (DI) MOVOU 16(SI), X1 MOVOU X1, 16(DI) MOVOU 32(SI), X2 MOVOU X2, 32(DI) MOVOU 48(SI), X3 MOVOU X3, 48(DI) MOVOU 64(SI), X4 MOVOU X4, 64(DI) MOVOU 80(SI), X5 MOVOU X5, 80(DI) MOVOU 96(SI), X6 MOVOU X6, 96(DI) MOVOU 112(SI), X7 MOVOU X7, 112(DI) MOVOU 128(SI), X8 MOVOU X8, 128(DI) MOVOU 144(SI), X9 MOVOU X9, 144(DI) MOVOU 160(SI), X10 MOVOU X10, 160(DI) MOVOU 176(SI), X11 MOVOU X11, 176(DI) MOVOU 192(SI), X12 MOVOU X12, 192(DI) MOVOU 208(SI), X13 MOVOU X13, 208(DI) MOVOU 224(SI), X14 MOVOU X14, 224(DI) MOVOU 240(SI), X15 MOVOU X15, 240(DI) CMPQ BX, $256 LEAQ 256(SI), SI LEAQ 256(DI), DI JGE move_257plus JMP tail move_1or2: MOVB (SI), AX MOVB AX, (DI) MOVB -1(SI)(BX*1), CX MOVB CX, -1(DI)(BX*1) RET move_0: RET move_3or4: MOVW (SI), AX MOVW AX, (DI) MOVW -2(SI)(BX*1), CX MOVW CX, -2(DI)(BX*1) RET move_5through7: MOVL (SI), AX MOVL AX, (DI) MOVL -4(SI)(BX*1), CX MOVL CX, -4(DI)(BX*1) RET move_8: // We need a separate case for 8 to make sure we write pointers atomically. 
MOVQ (SI), AX MOVQ AX, (DI) RET move_9through16: MOVQ (SI), AX MOVQ AX, (DI) MOVQ -8(SI)(BX*1), CX MOVQ CX, -8(DI)(BX*1) RET move_17through32: MOVOU (SI), X0 MOVOU X0, (DI) MOVOU -16(SI)(BX*1), X1 MOVOU X1, -16(DI)(BX*1) RET move_33through64: MOVOU (SI), X0 MOVOU X0, (DI) MOVOU 16(SI), X1 MOVOU X1, 16(DI) MOVOU -32(SI)(BX*1), X2 MOVOU X2, -32(DI)(BX*1) MOVOU -16(SI)(BX*1), X3 MOVOU X3, -16(DI)(BX*1) RET move_65through128: MOVOU (SI), X0 MOVOU X0, (DI) MOVOU 16(SI), X1 MOVOU X1, 16(DI) MOVOU 32(SI), X2 MOVOU X2, 32(DI) MOVOU 48(SI), X3 MOVOU X3, 48(DI) MOVOU -64(SI)(BX*1), X4 MOVOU X4, -64(DI)(BX*1) MOVOU -48(SI)(BX*1), X5 MOVOU X5, -48(DI)(BX*1) MOVOU -32(SI)(BX*1), X6 MOVOU X6, -32(DI)(BX*1) MOVOU -16(SI)(BX*1), X7 MOVOU X7, -16(DI)(BX*1) RET move_129through256: MOVOU (SI), X0 MOVOU X0, (DI) MOVOU 16(SI), X1 MOVOU X1, 16(DI) MOVOU 32(SI), X2 MOVOU X2, 32(DI) MOVOU 48(SI), X3 MOVOU X3, 48(DI) MOVOU 64(SI), X4 MOVOU X4, 64(DI) MOVOU 80(SI), X5 MOVOU X5, 80(DI) MOVOU 96(SI), X6 MOVOU X6, 96(DI) MOVOU 112(SI), X7 MOVOU X7, 112(DI) MOVOU -128(SI)(BX*1), X8 MOVOU X8, -128(DI)(BX*1) MOVOU -112(SI)(BX*1), X9 MOVOU X9, -112(DI)(BX*1) MOVOU -96(SI)(BX*1), X10 MOVOU X10, -96(DI)(BX*1) MOVOU -80(SI)(BX*1), X11 MOVOU X11, -80(DI)(BX*1) MOVOU -64(SI)(BX*1), X12 MOVOU X12, -64(DI)(BX*1) MOVOU -48(SI)(BX*1), X13 MOVOU X13, -48(DI)(BX*1) MOVOU -32(SI)(BX*1), X14 MOVOU X14, -32(DI)(BX*1) MOVOU -16(SI)(BX*1), X15 MOVOU X15, -16(DI)(BX*1) RET // func addrOfMemcpy() uintptr TEXT ·addrOfMemcpy(SB), $0-8 MOVQ $·memcpy(SB), AX MOVQ AX, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/memcpy_arm64.s000066400000000000000000000052211465435605700237240ustar00rootroot00000000000000// Copyright 2014 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #include "textflag.h" // handleMemcpyFault returns (the value stored in R0, the value stored in R1). // Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS, // with the faulting address stored in R0 and the signal number stored in R1. // // It must have the same frame configuration as memcpy so that it can undo any // potential call frame set up by the assembler. TEXT handleMemcpyFault(SB), NOSPLIT, $0-36 MOVD R0, addr+24(FP) MOVW R1, sig+32(FP) RET // memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received // during the copy, it returns the address that caused the fault and the number // of the signal that was received. Otherwise, it returns an unspecified address // and a signal number of 0. // // Data is copied in order, such that if a fault happens at address p, it is // safe to assume that all data before p-maxRegisterSize has already been // successfully copied. // // The code is derived from the Go source runtime.memmove. // // func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) TEXT ·memcpy(SB), NOSPLIT, $-8-36 // Store 0 as the returned signal number. If we run to completion, // this is the value the caller will see; if a signal is received, // handleMemcpyFault will store a different value in this address. MOVW $0, sig+32(FP) MOVD dst+0(FP), R3 MOVD src+8(FP), R4 MOVD n+16(FP), R5 CMP $0, R5 BNE check RET check: AND $~7, R5, R7 // R7 is N&~7. SUB R7, R5, R6 // R6 is N&7. // Copying forward proceeds by copying R7/8 words then copying R6 bytes. // R3 and R4 are advanced as we copy. 
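// For illustration (hypothetical n): with n = 27, the AND/SUB split above
// leaves R7 = 24 (three 8-byte words copied by forwardlargeloop) and
// R6 = 3 (tail bytes copied by forwardtailloop).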
// (There may be implementations of armv8 where copying by bytes until // at least one of source or dest is word aligned is a worthwhile // optimization, but the on the one tested so far (xgene) it did not // make a significance difference.) CMP $0, R7 // Do we need to do any word-by-word copying? BEQ noforwardlarge ADD R3, R7, R9 // R9 points just past where we copy by word. forwardlargeloop: MOVD.P 8(R4), R8 // R8 is just a scratch register. MOVD.P R8, 8(R3) CMP R3, R9 BNE forwardlargeloop noforwardlarge: CMP $0, R6 // Do we need to do any byte-by-byte copying? BNE forwardtail RET forwardtail: ADD R3, R6, R9 // R9 points just past the destination memory. forwardtailloop: MOVBU.P 1(R4), R8 MOVBU.P R8, 1(R3) CMP R3, R9 BNE forwardtailloop RET // func addrOfMemcpy() uintptr TEXT ·addrOfMemcpy(SB), $0-8 MOVD $·memcpy(SB), R0 MOVD R0, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/safecopy.go000066400000000000000000000113151465435605700233760ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package safecopy provides an efficient implementation of functions to access // memory that may result in SIGSEGV or SIGBUS being sent to the accessor. package safecopy import ( "fmt" "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/errors" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sighandling" ) // SegvError is returned when a safecopy function receives SIGSEGV. type SegvError struct { // Addr is the address at which the SIGSEGV occurred. Addr uintptr } // Error implements error.Error. func (e SegvError) Error() string { return fmt.Sprintf("SIGSEGV at %#x", e.Addr) } // BusError is returned when a safecopy function receives SIGBUS. type BusError struct { // Addr is the address at which the SIGBUS occurred. Addr uintptr } // Error implements error.Error. func (e BusError) Error() string { return fmt.Sprintf("SIGBUS at %#x", e.Addr) } // AlignmentError is returned when a safecopy function is passed an address // that does not meet alignment requirements. type AlignmentError struct { // Addr is the invalid address. Addr uintptr // Alignment is the required alignment. Alignment uintptr } // Error implements error.Error. func (e AlignmentError) Error() string { return fmt.Sprintf("address %#x is not aligned to a %d-byte boundary", e.Addr, e.Alignment) } var ( // The begin and end addresses below are for the functions that are // checked by the signal handler. memcpyBegin uintptr memcpyEnd uintptr memclrBegin uintptr memclrEnd uintptr swapUint32Begin uintptr swapUint32End uintptr swapUint64Begin uintptr swapUint64End uintptr compareAndSwapUint32Begin uintptr compareAndSwapUint32End uintptr loadUint32Begin uintptr loadUint32End uintptr // savedSigSegVHandler is a pointer to the SIGSEGV handler that was // configured before we replaced it with our own. We still call into it // when we get a SIGSEGV that is not interesting to us. savedSigSegVHandler uintptr // Same as above, but for SIGBUS signals. 
savedSigBusHandler uintptr ) // signalHandler is our replacement signal handler for SIGSEGV and SIGBUS // signals. func signalHandler() // addrOfSignalHandler returns the start address of signalHandler. // // See comment on addrOfMemcpy for more details. func addrOfSignalHandler() uintptr // FindEndAddress returns the end address (one byte beyond the last) of the // function that contains the specified address (begin). func FindEndAddress(begin uintptr) uintptr { f := runtime.FuncForPC(begin) if f != nil { for p := begin; ; p++ { g := runtime.FuncForPC(p) if f != g { return p } } } return begin } // initializeAddresses initializes the addresses used by the signal handler. func initializeAddresses() { // The following functions are written in assembly language, so they won't // be inlined by the existing compiler/linker. Tests will fail if this // assumption is violated. memcpyBegin = addrOfMemcpy() memcpyEnd = FindEndAddress(memcpyBegin) memclrBegin = addrOfMemclr() memclrEnd = FindEndAddress(memclrBegin) swapUint32Begin = addrOfSwapUint32() swapUint32End = FindEndAddress(swapUint32Begin) swapUint64Begin = addrOfSwapUint64() swapUint64End = FindEndAddress(swapUint64Begin) compareAndSwapUint32Begin = addrOfCompareAndSwapUint32() compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin) loadUint32Begin = addrOfLoadUint32() loadUint32End = FindEndAddress(loadUint32Begin) initializeArchAddresses() } func init() { initializeAddresses() if err := sighandling.ReplaceSignalHandler(unix.SIGSEGV, addrOfSignalHandler(), &savedSigSegVHandler); err != nil { panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err)) } if err := sighandling.ReplaceSignalHandler(unix.SIGBUS, addrOfSignalHandler(), &savedSigBusHandler); err != nil { panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err)) } linuxerr.AddErrorUnwrapper(func(e error) (*errors.Error, bool) { switch e.(type) { case SegvError, BusError, AlignmentError: return linuxerr.EFAULT, true default: return nil, false } }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/safecopy_amd64_unsafe.go000066400000000000000000000023411465435605700257310ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || i386 // +build amd64 i386 package safecopy import ( "unsafe" ) var ( checkXstateBegin uintptr checkXstateEnd uintptr ) func initializeArchAddresses() { checkXstateBegin = addrOfCheckXstate() checkXstateEnd = FindEndAddress(checkXstateBegin) } //go:noescape func checkXstate(addr uintptr) (fault uintptr, sig int32, mxcsr uint32, cw uint16) func addrOfCheckXstate() uintptr // CheckXstate verifies that xstate can be restored by the xrstor instruction. 
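// A minimal calling sketch (illustrative only; the buffer below is a
// hypothetical caller-provided XSAVE area, and its size and contents are
// assumptions, not requirements stated by this package):
//
//	var xstateArea [4096]byte // assumed to hold a valid XSAVE image
//	if err := CheckXstate(&xstateArea[0]); err != nil {
//		// err is non-nil if the state could not be loaded by XRSTOR.
//	}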
func CheckXstate(state *byte) error { _, sig, _, _ := checkXstate(uintptr(unsafe.Pointer(state))) return errorFromFaultSignal(uintptr(unsafe.Pointer(state)), sig) } golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/safecopy_amd64_unsafe_state_autogen.go000066400000000000000000000001511465435605700306500ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 || i386 // +build amd64 i386 package safecopy golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/safecopy_arm64.go000066400000000000000000000012541465435605700244100ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package safecopy func initializeArchAddresses() { } golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/safecopy_arm64_state_autogen.go000066400000000000000000000001341465435605700273260ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package safecopy golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/safecopy_state_autogen.go000066400000000000000000000000721465435605700263160ustar00rootroot00000000000000// automatically generated by stateify. package safecopy golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/safecopy_unsafe.go000066400000000000000000000262031465435605700247410ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package safecopy import ( "fmt" "runtime" "unsafe" "golang.org/x/sys/unix" ) // maxRegisterSize is the maximum register size used in memcpy and memclr. It // is used to decide by how much to rewind the copy (for memcpy) or zeroing // (for memclr) before proceeding. const maxRegisterSize = 16 // memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received // during the copy, it returns the address that caused the fault and the number // of the signal that was received. Otherwise, it returns an unspecified address // and a signal number of 0. // // Data is copied in order, such that if a fault happens at address p, it is // safe to assume that all data before p-maxRegisterSize has already been // successfully copied. // //go:noescape func memcpy(dst, src uintptr, n uintptr) (fault uintptr, sig int32) // memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS // signal is received during the write, it returns the address that caused the // fault and the number of the signal that was received. 
Otherwise, it returns // an unspecified address and a signal number of 0. // // Data is written in order, such that if a fault happens at address p, it is // safe to assume that all data before p-maxRegisterSize has already been // successfully written. // //go:noescape func memclr(ptr uintptr, n uintptr) (fault uintptr, sig int32) // swapUint32 atomically stores new into *ptr and returns (the previous *ptr // value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the // value of old is unspecified, and sig is the number of the signal that was // received. // // Preconditions: ptr must be aligned to a 4-byte boundary. // //go:noescape func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) // swapUint64 atomically stores new into *ptr and returns (the previous *ptr // value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the // value of old is unspecified, and sig is the number of the signal that was // received. // // Preconditions: ptr must be aligned to a 8-byte boundary. // //go:noescape func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) // compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns // (the value previously stored at ptr, 0). If a SIGSEGV or SIGBUS signal is // received during the operation, the value of prev is unspecified, and sig is // the number of the signal that was received. // // Preconditions: ptr must be aligned to a 4-byte boundary. // //go:noescape func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) // LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It // may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. // // Preconditions: ptr must be aligned to a 4-byte boundary. // //go:noescape func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) // Return the start address of the functions above. // // In Go 1.17+, Go references to assembly functions resolve to an ABIInternal // wrapper function rather than the function itself. We must reference from // assembly to get the ABI0 (i.e., primary) address. func addrOfMemcpy() uintptr func addrOfMemclr() uintptr func addrOfSwapUint32() uintptr func addrOfSwapUint64() uintptr func addrOfCompareAndSwapUint32() uintptr func addrOfLoadUint32() uintptr // CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes // copied and an error if SIGSEGV or SIGBUS is received while reading from src. func CopyIn(dst []byte, src unsafe.Pointer) (int, error) { n, err := copyIn(dst, uintptr(src)) runtime.KeepAlive(src) return n, err } // copyIn is the underlying definition for CopyIn. func copyIn(dst []byte, src uintptr) (int, error) { toCopy := uintptr(len(dst)) if len(dst) == 0 { return 0, nil } fault, sig := memcpy(uintptr(unsafe.Pointer(&dst[0])), src, toCopy) if sig == 0 { return len(dst), nil } if fault < src || fault >= src+toCopy { panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, fault, src, src+toCopy)) } // memcpy might have ended the copy up to maxRegisterSize bytes before // fault, if an instruction caused a memory access that straddled two // pages, and the second one faulted. Try to copy up to the fault. 
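	// For illustration (hypothetical values): with src = 0x1000, toCopy = 0x100,
	// and fault = 0x1080, done is rewound to 0x70 and the recursive call below
	// retries dst[0x70:0x80] from src+0x70 before the fault error is returned.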
var done int if fault-src > maxRegisterSize { done = int(fault - src - maxRegisterSize) } n, err := copyIn(dst[done:int(fault-src)], src+uintptr(done)) done += n if err != nil { return done, err } return done, errorFromFaultSignal(fault, sig) } // CopyOut copies len(src) bytes from src to dst. If returns the number of // bytes done and an error if SIGSEGV or SIGBUS is received while writing to // dst. func CopyOut(dst unsafe.Pointer, src []byte) (int, error) { n, err := copyOut(uintptr(dst), src) runtime.KeepAlive(dst) return n, err } // copyOut is the underlying definition for CopyOut. func copyOut(dst uintptr, src []byte) (int, error) { toCopy := uintptr(len(src)) if toCopy == 0 { return 0, nil } fault, sig := memcpy(dst, uintptr(unsafe.Pointer(&src[0])), toCopy) if sig == 0 { return len(src), nil } if fault < dst || fault >= dst+toCopy { panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, fault, dst, dst+toCopy)) } // memcpy might have ended the copy up to maxRegisterSize bytes before // fault, if an instruction caused a memory access that straddled two // pages, and the second one faulted. Try to copy up to the fault. var done int if fault-dst > maxRegisterSize { done = int(fault - dst - maxRegisterSize) } n, err := copyOut(dst+uintptr(done), src[done:int(fault-dst)]) done += n if err != nil { return done, err } return done, errorFromFaultSignal(fault, sig) } // Copy copies toCopy bytes from src to dst. It returns the number of bytes // copied and an error if SIGSEGV or SIGBUS is received while reading from src // or writing to dst. // // Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap, // the resulting contents of dst are unspecified. func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) { n, err := copyN(uintptr(dst), uintptr(src), toCopy) runtime.KeepAlive(dst) runtime.KeepAlive(src) return n, err } // copyN is the underlying definition for Copy. func copyN(dst, src uintptr, toCopy uintptr) (uintptr, error) { if toCopy == 0 { return 0, nil } fault, sig := memcpy(dst, src, toCopy) if sig == 0 { return toCopy, nil } // Did the fault occur while reading from src or writing to dst? faultAfterSrc := ^uintptr(0) if fault >= src { faultAfterSrc = fault - src } faultAfterDst := ^uintptr(0) if fault >= dst { faultAfterDst = fault - dst } if faultAfterSrc >= toCopy && faultAfterDst >= toCopy { panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, fault, src, src+toCopy, dst, dst+toCopy)) } faultedAfter := faultAfterSrc if faultedAfter > faultAfterDst { faultedAfter = faultAfterDst } // memcpy might have ended the copy up to maxRegisterSize bytes before // fault, if an instruction caused a memory access that straddled two // pages, and the second one faulted. Try to copy up to the fault. var done uintptr if faultedAfter > maxRegisterSize { done = faultedAfter - maxRegisterSize } n, err := copyN(dst+done, src+done, faultedAfter-done) done += n if err != nil { return done, err } return done, errorFromFaultSignal(fault, sig) } // ZeroOut writes toZero zero bytes to dst. It returns the number of bytes // written and an error if SIGSEGV or SIGBUS is received while writing to dst. func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) { n, err := zeroOut(uintptr(dst), toZero) runtime.KeepAlive(dst) return n, err } // zeroOut is the underlying definition for ZeroOut. 
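// For illustration (hypothetical values): with dst = 0x2000, toZero = 0x100,
// and a fault at 0x2080, done is rewound to 0x70 and the [dst+0x70, dst+0x80)
// range is retried before the fault error is returned.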
func zeroOut(dst uintptr, toZero uintptr) (uintptr, error) { if toZero == 0 { return 0, nil } fault, sig := memclr(dst, toZero) if sig == 0 { return toZero, nil } if fault < dst || fault >= dst+toZero { panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, fault, dst, dst+toZero)) } // memclr might have ended the write up to maxRegisterSize bytes before // fault, if an instruction caused a memory access that straddled two // pages, and the second one faulted. Try to write up to the fault. var done uintptr if fault-dst > maxRegisterSize { done = fault - dst - maxRegisterSize } n, err := zeroOut(dst+done, fault-dst-done) done += n if err != nil { return done, err } return done, errorFromFaultSignal(fault, sig) } // SwapUint32 is equivalent to sync/atomic.SwapUint32, except that it returns // an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is // not aligned to a 4-byte boundary. func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) { if addr := uintptr(ptr); addr&3 != 0 { return 0, AlignmentError{addr, 4} } old, sig := swapUint32(ptr, new) return old, errorFromFaultSignal(uintptr(ptr), sig) } // SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns // an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is // not aligned to an 8-byte boundary. func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) { if addr := uintptr(ptr); addr&7 != 0 { return 0, AlignmentError{addr, 8} } old, sig := swapUint64(ptr, new) return old, errorFromFaultSignal(uintptr(ptr), sig) } // CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32, // except that it returns an error if SIGSEGV or SIGBUS is received while // accessing ptr, or if ptr is not aligned to a 4-byte boundary. func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) { if addr := uintptr(ptr); addr&3 != 0 { return 0, AlignmentError{addr, 4} } prev, sig := compareAndSwapUint32(ptr, old, new) return prev, errorFromFaultSignal(uintptr(ptr), sig) } // LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It // may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. // // Preconditions: ptr must be aligned to a 4-byte boundary. func LoadUint32(ptr unsafe.Pointer) (uint32, error) { if addr := uintptr(ptr); addr&3 != 0 { return 0, AlignmentError{addr, 4} } val, sig := loadUint32(ptr) return val, errorFromFaultSignal(uintptr(ptr), sig) } func errorFromFaultSignal(addr uintptr, sig int32) error { switch sig { case 0: return nil case int32(unix.SIGSEGV): return SegvError{addr} case int32(unix.SIGBUS): return BusError{addr} default: panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/safecopy_unsafe_state_autogen.go000066400000000000000000000000721465435605700276570ustar00rootroot00000000000000// automatically generated by stateify. package safecopy golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/sighandler_amd64.s000066400000000000000000000075171465435605700245460ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" // The signals handled by sigHandler. #define SIGBUS 7 #define SIGSEGV 11 // Offsets to the registers in context->uc_mcontext.gregs[]. #define REG_RDI 0x68 #define REG_RAX 0x90 #define REG_IP 0xa8 // Offset to the si_addr field of siginfo. #define SI_CODE 0x08 #define SI_ADDR 0x10 // signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must // not be set up as a handler to any other signals. // // If the instruction causing the signal is within a safecopy-protected // function, the signal is handled such that execution resumes in the // appropriate fault handling stub with AX containing the faulting address and // DI containing the signal number. Otherwise control is transferred to the // previously configured signal handler (savedSigSegvHandler or // savedSigBusHandler). // // This function cannot be written in go because it runs whenever a signal is // received by the thread (preempting whatever was running), which includes when // garbage collector has stopped or isn't expecting any interactions (like // barriers). // // The arguments are the following: // DI - The signal number. // SI - Pointer to siginfo_t structure. // DX - Pointer to ucontext structure. TEXT ·signalHandler(SB),NOSPLIT|NOFRAME,$0 // Check if the signal is from the kernel. MOVQ $0x0, CX CMPL CX, SI_CODE(SI) JGE original_handler // Check if RIP is within the area we care about. MOVQ REG_IP(DX), CX CMPQ CX, ·memcpyBegin(SB) JB not_memcpy CMPQ CX, ·memcpyEnd(SB) JAE not_memcpy // Modify the context such that execution will resume in the fault // handler. LEAQ handleMemcpyFault(SB), CX JMP handle_fault not_memcpy: CMPQ CX, ·memclrBegin(SB) JB not_memclr CMPQ CX, ·memclrEnd(SB) JAE not_memclr LEAQ handleMemclrFault(SB), CX JMP handle_fault not_memclr: CMPQ CX, ·swapUint32Begin(SB) JB not_swapuint32 CMPQ CX, ·swapUint32End(SB) JAE not_swapuint32 LEAQ handleSwapUint32Fault(SB), CX JMP handle_fault not_swapuint32: CMPQ CX, ·swapUint64Begin(SB) JB not_swapuint64 CMPQ CX, ·swapUint64End(SB) JAE not_swapuint64 LEAQ handleSwapUint64Fault(SB), CX JMP handle_fault not_swapuint64: CMPQ CX, ·compareAndSwapUint32Begin(SB) JB not_casuint32 CMPQ CX, ·compareAndSwapUint32End(SB) JAE not_casuint32 LEAQ handleCompareAndSwapUint32Fault(SB), CX JMP handle_fault not_casuint32: CMPQ CX, ·loadUint32Begin(SB) JB not_loaduint32 CMPQ CX, ·loadUint32End(SB) JAE not_loaduint32 LEAQ handleLoadUint32Fault(SB), CX JMP handle_fault not_loaduint32: CMPQ CX, ·checkXstateBegin(SB) JB not_checkXstate CMPQ CX, ·checkXstateEnd(SB) JAE not_checkXstate LEAQ handleCheckXstateFault(SB), CX JMP handle_fault not_checkXstate: original_handler: // Jump to the previous signal handler, which is likely the golang one. XORQ CX, CX MOVQ ·savedSigBusHandler(SB), AX CMPL DI, $SIGSEGV CMOVQEQ ·savedSigSegVHandler(SB), AX JMP AX handle_fault: // Entered with the address of the fault handler in RCX; store it in // RIP. MOVQ CX, REG_IP(DX) // Store the faulting address in RAX. MOVQ SI_ADDR(SI), CX MOVQ CX, REG_RAX(DX) // Store the signal number in EDI. 
MOVL DI, REG_RDI(DX) RET // func addrOfSignalHandler() uintptr TEXT ·addrOfSignalHandler(SB), $0-8 MOVQ $·signalHandler(SB), AX MOVQ AX, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/sighandler_arm64.s000066400000000000000000000074731465435605700245650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" // The signals handled by sigHandler. #define SIGBUS 7 #define SIGSEGV 11 // Offsets to the registers in context->uc_mcontext.gregs[]. #define REG_R0 0xB8 #define REG_R1 0xC0 #define REG_PC 0x1B8 // Offset to the si_addr field of siginfo. #define SI_CODE 0x08 #define SI_ADDR 0x10 // signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must // not be set up as a handler to any other signals. // // If the instruction causing the signal is within a safecopy-protected // function, the signal is handled such that execution resumes in the // appropriate fault handling stub with R0 containing the faulting address and // R1 containing the signal number. Otherwise control is transferred to the // previously configured signal handler (savedSigSegvHandler or // savedSigBusHandler). // // This function cannot be written in go because it runs whenever a signal is // received by the thread (preempting whatever was running), which includes when // garbage collector has stopped or isn't expecting any interactions (like // barriers). // // The arguments are the following: // R0 - The signal number. // R1 - Pointer to siginfo_t structure. // R2 - Pointer to ucontext structure. TEXT ·signalHandler(SB),NOSPLIT,$0 // Check if the signal is from the kernel, si_code > 0 means a kernel signal. MOVD SI_CODE(R1), R7 CMPW $0x0, R7 BLE original_handler // Check if PC is within the area we care about. MOVD REG_PC(R2), R7 MOVD ·memcpyBegin(SB), R8 CMP R8, R7 BLO not_memcpy MOVD ·memcpyEnd(SB), R8 CMP R8, R7 BHS not_memcpy // Modify the context such that execution will resume in the fault handler. 
MOVD $handleMemcpyFault(SB), R7 B handle_fault not_memcpy: MOVD ·memclrBegin(SB), R8 CMP R8, R7 BLO not_memclr MOVD ·memclrEnd(SB), R8 CMP R8, R7 BHS not_memclr MOVD $handleMemclrFault(SB), R7 B handle_fault not_memclr: MOVD ·swapUint32Begin(SB), R8 CMP R8, R7 BLO not_swapuint32 MOVD ·swapUint32End(SB), R8 CMP R8, R7 BHS not_swapuint32 MOVD $handleSwapUint32Fault(SB), R7 B handle_fault not_swapuint32: MOVD ·swapUint64Begin(SB), R8 CMP R8, R7 BLO not_swapuint64 MOVD ·swapUint64End(SB), R8 CMP R8, R7 BHS not_swapuint64 MOVD $handleSwapUint64Fault(SB), R7 B handle_fault not_swapuint64: MOVD ·compareAndSwapUint32Begin(SB), R8 CMP R8, R7 BLO not_casuint32 MOVD ·compareAndSwapUint32End(SB), R8 CMP R8, R7 BHS not_casuint32 MOVD $handleCompareAndSwapUint32Fault(SB), R7 B handle_fault not_casuint32: MOVD ·loadUint32Begin(SB), R8 CMP R8, R7 BLO not_loaduint32 MOVD ·loadUint32End(SB), R8 CMP R8, R7 BHS not_loaduint32 MOVD $handleLoadUint32Fault(SB), R7 B handle_fault not_loaduint32: original_handler: // Jump to the previous signal handler, which is likely the golang one. MOVD ·savedSigBusHandler(SB), R7 MOVD ·savedSigSegVHandler(SB), R8 CMPW $SIGSEGV, R0 CSEL EQ, R8, R7, R7 B (R7) handle_fault: // Entered with the address of the fault handler in R7; store it in PC. MOVD R7, REG_PC(R2) // Store the faulting address in R0. MOVD SI_ADDR(R1), R7 MOVD R7, REG_R0(R2) // Store the signal number in R1. MOVW R0, REG_R1(R2) RET // func addrOfSignalHandler() uintptr TEXT ·addrOfSignalHandler(SB), $0-8 MOVD $·signalHandler(SB), R0 MOVD R0, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/safecopy/xrstor_amd64.s000066400000000000000000000037351465435605700237650ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" // handleCheckXstateFault returns (the value stored in AX, the value stored in DI). // Control is transferred to it when checkXstate below receives SIGSEGV or SIGBUS, // with the faulting address stored in AX and the signal number stored in DI. // // It must have the same frame configuration as memcpy so that it can undo any // potential call frame set up by the assembler. TEXT handleCheckXstateFault(SB), NOSPLIT|NOFRAME, $0-26 MOVQ AX, addr+8(FP) MOVL DI, sig+16(FP) LDMXCSR mxcsr+20(FP) BYTE $0xDB; BYTE $0xE2; // FNCLEX FLDCW cw+24(FP) RET // ·checkXstate verifies that the specified floating point state can be loaded. TEXT ·checkXstate(SB),NOSPLIT|NOFRAME,$0-26 // Store 0 as the returned signal number. If we run to completion, // this is the value the caller will see; if a signal is received, // handleMemcpyFault will store a different value in this address. MOVL $0, sig+16(FP) // MXCSR and the x87 control word are the only floating point state // that is callee-save and thus we must save. STMXCSR mxcsr+20(FP) FSTCW cw+24(FP) MOVQ addr+0(FP), DI MOVL $0xffffffff, AX MOVL $0xffffffff, DX XRSTOR64 (DI) // Restore MXCSR and the x87 control word. 
LDMXCSR mxcsr+20(FP) BYTE $0xDB; BYTE $0xE2; // FNCLEX FLDCW cw+24(FP) RET // func addrOfCheckXstate() uintptr TEXT ·addrOfCheckXstate(SB), $0-8 MOVQ $·checkXstate(SB), AX MOVQ AX, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/safemem/000077500000000000000000000000001465435605700210415ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/safemem/block_unsafe.go000066400000000000000000000176331465435605700240350ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package safemem import ( "fmt" "unsafe" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/sync" ) // A Block is a range of contiguous bytes, similar to []byte but with the // following differences: // // - The memory represented by a Block may require the use of safecopy to // access. // // - Block does not carry a capacity and cannot be expanded. // // Blocks are immutable and may be copied by value. The zero value of Block // represents an empty range, analogous to a nil []byte. type Block struct { // [start, start+length) is the represented memory. // // start is an unsafe.Pointer to ensure that Block prevents the represented // memory from being garbage-collected. start unsafe.Pointer length int // needSafecopy is true if accessing the represented memory requires the // use of safecopy. needSafecopy bool } // BlockFromSafeSlice returns a Block equivalent to slice, which is safe to // access without safecopy. func BlockFromSafeSlice(slice []byte) Block { return blockFromSlice(slice, false) } // BlockFromUnsafeSlice returns a Block equivalent to bs, which is not safe to // access without safecopy. func BlockFromUnsafeSlice(slice []byte) Block { return blockFromSlice(slice, true) } func blockFromSlice(slice []byte, needSafecopy bool) Block { if len(slice) == 0 { return Block{} } return Block{ start: unsafe.Pointer(&slice[0]), length: len(slice), needSafecopy: needSafecopy, } } // BlockFromSafePointer returns a Block equivalent to [ptr, ptr+length), which is // safe to access without safecopy. // // Preconditions: ptr+length does not overflow. func BlockFromSafePointer(ptr unsafe.Pointer, length int) Block { return blockFromPointer(ptr, length, false) } // BlockFromUnsafePointer returns a Block equivalent to [ptr, ptr+len), which // is not safe to access without safecopy. // // Preconditions: ptr+len does not overflow. func BlockFromUnsafePointer(ptr unsafe.Pointer, length int) Block { return blockFromPointer(ptr, length, true) } func blockFromPointer(ptr unsafe.Pointer, length int, needSafecopy bool) Block { if uptr := uintptr(ptr); uptr+uintptr(length) < uptr { panic(fmt.Sprintf("ptr %#x + len %#x overflows", uptr, length)) } return Block{ start: ptr, length: length, needSafecopy: needSafecopy, } } // DropFirst returns a Block equivalent to b, but with the first n bytes // omitted. 
It is analogous to the [n:] operation on a slice, except that if n // > b.Len(), DropFirst returns an empty Block instead of panicking. // // Preconditions: n >= 0. func (b Block) DropFirst(n int) Block { if n < 0 { panic(fmt.Sprintf("invalid n: %d", n)) } return b.DropFirst64(uint64(n)) } // DropFirst64 is equivalent to DropFirst but takes a uint64. func (b Block) DropFirst64(n uint64) Block { if n >= uint64(b.length) { return Block{} } return Block{ start: unsafe.Pointer(uintptr(b.start) + uintptr(n)), length: b.length - int(n), needSafecopy: b.needSafecopy, } } // TakeFirst returns a Block equivalent to the first n bytes of b. It is // analogous to the [:n] operation on a slice, except that if n > b.Len(), // TakeFirst returns a copy of b instead of panicking. // // Preconditions: n >= 0. func (b Block) TakeFirst(n int) Block { if n < 0 { panic(fmt.Sprintf("invalid n: %d", n)) } return b.TakeFirst64(uint64(n)) } // TakeFirst64 is equivalent to TakeFirst but takes a uint64. func (b Block) TakeFirst64(n uint64) Block { if n == 0 { return Block{} } if n >= uint64(b.length) { return b } return Block{ start: b.start, length: int(n), needSafecopy: b.needSafecopy, } } // ToSlice returns a []byte equivalent to b. func (b Block) ToSlice() []byte { return gohacks.Slice((*byte)(b.start), b.length) } // Addr returns b's start address as a uintptr. It returns uintptr instead of // unsafe.Pointer so that code using safemem cannot obtain unsafe.Pointers // without importing the unsafe package explicitly. // // Note that a uintptr is not recognized as a pointer by the garbage collector, // such that if there are no uses of b after a call to b.Addr() and the address // is to Go-managed memory, the returned uintptr does not prevent garbage // collection of the pointee. func (b Block) Addr() uintptr { return uintptr(b.start) } // Len returns b's length in bytes. func (b Block) Len() int { return b.length } // NeedSafecopy returns true if accessing b.ToSlice() requires the use of safecopy. func (b Block) NeedSafecopy() bool { return b.needSafecopy } // String implements fmt.Stringer.String. func (b Block) String() string { if uintptr(b.start) == 0 && b.length == 0 { return "" } var suffix string if b.needSafecopy { suffix = "*" } return fmt.Sprintf("[%#x-%#x)%s", uintptr(b.start), uintptr(b.start)+uintptr(b.length), suffix) } // Copy copies src.Len() or dst.Len() bytes, whichever is less, from src // to dst and returns the number of bytes copied. // // If src and dst overlap, the data stored in dst is unspecified. func Copy(dst, src Block) (int, error) { if !dst.needSafecopy && !src.needSafecopy { return copy(dst.ToSlice(), src.ToSlice()), nil } n := dst.length if n > src.length { n = src.length } if n == 0 { return 0, nil } switch { case dst.needSafecopy && !src.needSafecopy: return safecopy.CopyOut(dst.start, src.TakeFirst(n).ToSlice()) case !dst.needSafecopy && src.needSafecopy: return safecopy.CopyIn(dst.TakeFirst(n).ToSlice(), src.start) case dst.needSafecopy && src.needSafecopy: n64, err := safecopy.Copy(dst.start, src.start, uintptr(n)) return int(n64), err default: panic("unreachable") } } // Zero sets all bytes in dst to 0 and returns the number of bytes zeroed. 
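// A minimal usage sketch (illustrative only; buf is a hypothetical
// caller-owned slice, not part of this package):
//
//	buf := []byte{1, 2, 3, 4}
//	n, err := Zero(BlockFromSafeSlice(buf))
//	// On success n == len(buf), err == nil, and buf is all zeroes.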
func Zero(dst Block) (int, error) { if !dst.needSafecopy { bs := dst.ToSlice() if !sync.RaceEnabled { clear(bs) } else { bsLen := len(bs) if bsLen == 0 { return 0, nil } bs[0] = 0 for i := 1; i < bsLen; i *= 2 { copy(bs[i:], bs[:i]) } } return len(bs), nil } n64, err := safecopy.ZeroOut(dst.start, uintptr(dst.length)) return int(n64), err } // Safecopy atomics are no slower than non-safecopy atomics, so use the former // even when !b.needSafecopy to get consistent alignment checking. // SwapUint32 invokes safecopy.SwapUint32 on the first 4 bytes of b. // // Preconditions: b.Len() >= 4. func SwapUint32(b Block, new uint32) (uint32, error) { if b.length < 4 { panic(fmt.Sprintf("insufficient length: %d", b.length)) } return safecopy.SwapUint32(b.start, new) } // SwapUint64 invokes safecopy.SwapUint64 on the first 8 bytes of b. // // Preconditions: b.Len() >= 8. func SwapUint64(b Block, new uint64) (uint64, error) { if b.length < 8 { panic(fmt.Sprintf("insufficient length: %d", b.length)) } return safecopy.SwapUint64(b.start, new) } // CompareAndSwapUint32 invokes safecopy.CompareAndSwapUint32 on the first 4 // bytes of b. // // Preconditions: b.Len() >= 4. func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) { if b.length < 4 { panic(fmt.Sprintf("insufficient length: %d", b.length)) } return safecopy.CompareAndSwapUint32(b.start, old, new) } // LoadUint32 invokes safecopy.LoadUint32 on the first 4 bytes of b. // // Preconditions: b.Len() >= 4. func LoadUint32(b Block) (uint32, error) { if b.length < 4 { panic(fmt.Sprintf("insufficient length: %d", b.length)) } return safecopy.LoadUint32(b.start) } golang-gvisor-gvisor-0.0~20240729.0/pkg/safemem/io.go000066400000000000000000000224211465435605700220000ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package safemem import ( "errors" "io" "math" ) // ErrEndOfBlockSeq is returned by BlockSeqWriter when attempting to write // beyond the end of the BlockSeq. var ErrEndOfBlockSeq = errors.New("write beyond end of BlockSeq") // Reader represents a streaming byte source like io.Reader. type Reader interface { // ReadToBlocks reads up to dsts.NumBytes() bytes into dsts and returns the // number of bytes read. It may return a partial read without an error // (i.e. (n, nil) where 0 < n < dsts.NumBytes()). It should not return a // full read with an error (i.e. (dsts.NumBytes(), err) where err != nil); // note that this differs from io.Reader.Read (in particular, io.EOF should // not be returned if ReadToBlocks successfully reads dsts.NumBytes() // bytes.) ReadToBlocks(dsts BlockSeq) (uint64, error) } // Writer represents a streaming byte sink like io.Writer. type Writer interface { // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns // the number of bytes written. It may return a partial write without an // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not // return a full write with an error (i.e. 
srcs.NumBytes(), err) where err // != nil). WriteFromBlocks(srcs BlockSeq) (uint64, error) } // ReadFullToBlocks repeatedly invokes r until dsts.NumBytes() bytes have been // read or r returns an error. Note that we avoid a Reader interface receiver // to avoid heap allocation. func ReadFullToBlocks(r ReaderFunc, dsts BlockSeq) (uint64, error) { var done uint64 for !dsts.IsEmpty() { n, err := r(dsts) done += n if err != nil { return done, err } dsts = dsts.DropFirst64(n) } return done, nil } // WriteFullFromBlocks repeatedly invokes w until srcs.NumBytes() bytes have // been written or w returns an error. Note that we avoid a Writer interface // receiver to avoid heap allocation. func WriteFullFromBlocks(w WriterFunc, srcs BlockSeq) (uint64, error) { var done uint64 for !srcs.IsEmpty() { n, err := w(srcs) done += n if err != nil { return done, err } srcs = srcs.DropFirst64(n) } return done, nil } // BlockSeqReader implements Reader by reading from a BlockSeq. type BlockSeqReader struct { Blocks BlockSeq } // ReadToBlocks implements Reader.ReadToBlocks. func (r *BlockSeqReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { n, err := CopySeq(dsts, r.Blocks) r.Blocks = r.Blocks.DropFirst64(n) if err != nil { return n, err } if n < dsts.NumBytes() { return n, io.EOF } return n, nil } // BlockSeqWriter implements Writer by writing to a BlockSeq. type BlockSeqWriter struct { Blocks BlockSeq } // WriteFromBlocks implements Writer.WriteFromBlocks. func (w *BlockSeqWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { n, err := CopySeq(w.Blocks, srcs) w.Blocks = w.Blocks.DropFirst64(n) if err != nil { return n, err } if n < srcs.NumBytes() { return n, ErrEndOfBlockSeq } return n, nil } // ReaderFunc implements Reader for a function with the semantics of // Reader.ReadToBlocks. type ReaderFunc func(dsts BlockSeq) (uint64, error) // ReadToBlocks implements Reader.ReadToBlocks. func (f ReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { return f(dsts) } // WriterFunc implements Writer for a function with the semantics of // Writer.WriteFromBlocks. type WriterFunc func(srcs BlockSeq) (uint64, error) // WriteFromBlocks implements Writer.WriteFromBlocks. func (f WriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { return f(srcs) } // ToIOReader implements io.Reader for a (safemem.)Reader. // // ToIOReader will return a successful partial read iff Reader.ReadToBlocks does // so. type ToIOReader struct { Reader Reader } // Read implements io.Reader.Read. func (r ToIOReader) Read(dst []byte) (int, error) { n, err := r.Reader.ReadToBlocks(BlockSeqOf(BlockFromSafeSlice(dst))) return int(n), err } // FromIOReader implements Reader for an io.Reader by repeatedly invoking // io.Reader.Read until it returns an error or partial read. This is not // thread-safe. // // FromIOReader will return a successful partial read iff Reader.Read does so. type FromIOReader struct { Reader io.Reader } // ReadToBlocks implements Reader.ReadToBlocks. func (r FromIOReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { var buf []byte var done uint64 for !dsts.IsEmpty() { dst := dsts.Head() var n int var err error n, buf, err = r.readToBlock(dst, buf) done += uint64(n) if n != dst.Len() { return done, err } dsts = dsts.Tail() if err != nil { if dsts.IsEmpty() && err == io.EOF { return done, nil } return done, err } } return done, nil } func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) { // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require // safecopy. 
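	// For illustration (hypothetical size): for a 4096-byte Block that needs
	// safecopy, the data is first read into the ordinary Go slice buf via
	// r.Reader.Read and then moved into dst with Copy, which performs the
	// fault-tolerant write.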
if !dst.NeedSafecopy() { n, err := r.Reader.Read(dst.ToSlice()) return n, buf, err } if len(buf) < dst.Len() { buf = make([]byte, dst.Len()) } rn, rerr := r.Reader.Read(buf[:dst.Len()]) wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) if wberr != nil { return wbn, buf, wberr } return wbn, buf, rerr } // FromIOWriter implements Writer for an io.Writer by repeatedly invoking // io.Writer.Write until it returns an error or partial write. // // FromIOWriter will tolerate implementations of io.Writer.Write that return // partial writes with a nil error in contravention of io.Writer's // requirements, since Writer is permitted to do so. FromIOWriter will return a // successful partial write iff Writer.Write does so. type FromIOWriter struct { Writer io.Writer } // WriteFromBlocks implements Writer.WriteFromBlocks. func (w FromIOWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { var buf []byte var done uint64 for !srcs.IsEmpty() { src := srcs.Head() var n int var err error n, buf, err = w.writeFromBlock(src, buf) done += uint64(n) if n != src.Len() || err != nil { return done, err } srcs = srcs.Tail() } return done, nil } func (w FromIOWriter) writeFromBlock(src Block, buf []byte) (int, []byte, error) { // io.Writer isn't safecopy-aware, so we have to buffer Blocks that require // safecopy. if !src.NeedSafecopy() { n, err := w.Writer.Write(src.ToSlice()) return n, buf, err } if len(buf) < src.Len() { buf = make([]byte, src.Len()) } bufn, buferr := Copy(BlockFromSafeSlice(buf[:src.Len()]), src) wn, werr := w.Writer.Write(buf[:bufn]) if werr != nil { return wn, buf, werr } return wn, buf, buferr } // FromVecReaderFunc implements Reader for a function that reads data into a // [][]byte and returns the number of bytes read as an int64. type FromVecReaderFunc struct { ReadVec func(dsts [][]byte) (int64, error) } // ReadToBlocks implements Reader.ReadToBlocks. // // ReadToBlocks calls r.ReadVec at most once. func (r FromVecReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { if dsts.IsEmpty() { return 0, nil } // Ensure that we don't pass a [][]byte with a total length > MaxInt64. dsts = dsts.TakeFirst64(uint64(math.MaxInt64)) dstSlices := make([][]byte, 0, dsts.NumBlocks()) // Buffer Blocks that require safecopy. for tmp := dsts; !tmp.IsEmpty(); tmp = tmp.Tail() { dst := tmp.Head() if dst.NeedSafecopy() { dstSlices = append(dstSlices, make([]byte, dst.Len())) } else { dstSlices = append(dstSlices, dst.ToSlice()) } } rn, rerr := r.ReadVec(dstSlices) dsts = dsts.TakeFirst64(uint64(rn)) var done uint64 var i int for !dsts.IsEmpty() { dst := dsts.Head() if dst.NeedSafecopy() { n, err := Copy(dst, BlockFromSafeSlice(dstSlices[i])) done += uint64(n) if err != nil { return done, err } } else { done += uint64(dst.Len()) } dsts = dsts.Tail() i++ } return done, rerr } // FromVecWriterFunc implements Writer for a function that writes data from a // [][]byte and returns the number of bytes written. type FromVecWriterFunc struct { WriteVec func(srcs [][]byte) (int64, error) } // WriteFromBlocks implements Writer.WriteFromBlocks. // // WriteFromBlocks calls w.WriteVec at most once. func (w FromVecWriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { if srcs.IsEmpty() { return 0, nil } // Ensure that we don't pass a [][]byte with a total length > MaxInt64. srcs = srcs.TakeFirst64(uint64(math.MaxInt64)) srcSlices := make([][]byte, 0, srcs.NumBlocks()) // Buffer Blocks that require safecopy. 
var buferr error for tmp := srcs; !tmp.IsEmpty(); tmp = tmp.Tail() { src := tmp.Head() if src.NeedSafecopy() { slice := make([]byte, src.Len()) n, err := Copy(BlockFromSafeSlice(slice), src) srcSlices = append(srcSlices, slice[:n]) if err != nil { buferr = err break } } else { srcSlices = append(srcSlices, src.ToSlice()) } } n, err := w.WriteVec(srcSlices) if err != nil { return uint64(n), err } return uint64(n), buferr } golang-gvisor-gvisor-0.0~20240729.0/pkg/safemem/safemem.go000066400000000000000000000012371465435605700230100ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package safemem provides the Block and BlockSeq types. package safemem golang-gvisor-gvisor-0.0~20240729.0/pkg/safemem/safemem_state_autogen.go000066400000000000000000000000711465435605700257250ustar00rootroot00000000000000// automatically generated by stateify. package safemem golang-gvisor-gvisor-0.0~20240729.0/pkg/safemem/safemem_unsafe_state_autogen.go000066400000000000000000000000711465435605700272660ustar00rootroot00000000000000// automatically generated by stateify. package safemem golang-gvisor-gvisor-0.0~20240729.0/pkg/safemem/seq_unsafe.go000066400000000000000000000204021465435605700235170ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package safemem import ( "bytes" "fmt" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/gohacks" ) // A BlockSeq represents a sequence of Blocks, each of which has non-zero // length. // // BlockSeqs are immutable and may be copied by value. The zero value of // BlockSeq represents an empty sequence. type BlockSeq struct { // If length is 0, then the BlockSeq is empty. Invariants: data == 0; // offset == 0; limit == 0. // // If length is -1, then the BlockSeq represents the single Block{data, // limit, false}. Invariants: offset == 0; limit > 0; limit does not // overflow the range of an int. // // If length is -2, then the BlockSeq represents the single Block{data, // limit, true}. Invariants: offset == 0; limit > 0; limit does not // overflow the range of an int. // // Otherwise, length >= 2, and the BlockSeq represents the `length` Blocks // in the array of Blocks starting at address `data`, starting at `offset` // bytes into the first Block and limited to the following `limit` bytes. 
// Invariants: data != 0; offset < len(data[0]); limit > 0; offset+limit <= // the combined length of all Blocks in the array; the first Block in the // array has non-zero length. // // length is never 1; sequences consisting of a single Block are always // stored inline (with length < 0). data unsafe.Pointer length int offset int limit uint64 } // BlockSeqOf returns a BlockSeq representing the single Block b. func BlockSeqOf(b Block) BlockSeq { if b.length == 0 { return BlockSeq{} } bs := BlockSeq{ data: b.start, length: -1, limit: uint64(b.length), } if b.needSafecopy { bs.length = -2 } return bs } // BlockSeqFromSlice returns a BlockSeq representing all Blocks in slice. // If slice contains Blocks with zero length, BlockSeq will skip them during // iteration. // // Whether the returned BlockSeq shares memory with slice is unspecified; // clients should avoid mutating slices passed to BlockSeqFromSlice. // // Preconditions: The combined length of all Blocks in slice <= math.MaxUint64. func BlockSeqFromSlice(slice []Block) BlockSeq { slice = skipEmpty(slice) var limit uint64 for _, b := range slice { sum := limit + uint64(b.Len()) if sum < limit { panic("BlockSeq length overflows uint64") } limit = sum } return blockSeqFromSliceLimited(slice, limit) } // Preconditions: // - The combined length of all Blocks in slice <= limit. // - If len(slice) != 0, the first Block in slice has non-zero length and // limit > 0. func blockSeqFromSliceLimited(slice []Block, limit uint64) BlockSeq { switch len(slice) { case 0: return BlockSeq{} case 1: return BlockSeqOf(slice[0].TakeFirst64(limit)) default: return BlockSeq{ data: unsafe.Pointer(&slice[0]), length: len(slice), limit: limit, } } } func skipEmpty(slice []Block) []Block { for i, b := range slice { if b.Len() != 0 { return slice[i:] } } return nil } // IsEmpty returns true if bs contains no Blocks. // // Invariants: bs.IsEmpty() == (bs.NumBlocks() == 0) == (bs.NumBytes() == 0). // (Of these, prefer to use bs.IsEmpty().) func (bs BlockSeq) IsEmpty() bool { return bs.length == 0 } // NumBlocks returns the number of Blocks in bs. func (bs BlockSeq) NumBlocks() int { // In general, we have to count: if bs represents a windowed slice then the // slice may contain Blocks with zero length, and bs.length may be larger // than the actual number of Blocks due to bs.limit. var n int for !bs.IsEmpty() { n++ bs = bs.Tail() } return n } // NumBytes returns the sum of Block.Len() for all Blocks in bs. func (bs BlockSeq) NumBytes() uint64 { return bs.limit } // Head returns the first Block in bs. // // Preconditions: !bs.IsEmpty(). func (bs BlockSeq) Head() Block { if bs.length == 0 { panic("empty BlockSeq") } if bs.length < 0 { return bs.internalBlock() } return (*Block)(bs.data).DropFirst(bs.offset).TakeFirst64(bs.limit) } // Preconditions: bs.length < 0. func (bs BlockSeq) internalBlock() Block { return Block{ start: bs.data, length: int(bs.limit), needSafecopy: bs.length == -2, } } // Tail returns a BlockSeq consisting of all Blocks in bs after the first. // // Preconditions: !bs.IsEmpty(). func (bs BlockSeq) Tail() BlockSeq { if bs.length == 0 { panic("empty BlockSeq") } if bs.length < 0 { return BlockSeq{} } data := (*Block)(bs.data) head := data.DropFirst(bs.offset) headLen := uint64(head.Len()) if headLen >= bs.limit { // The head Block exhausts the limit, so the tail is empty. 
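// (Aside, not part of the original source: IsEmpty, Head and Tail together
// form the canonical iteration pattern over a BlockSeq, e.g.:
//
//	for !seq.IsEmpty() {
//		b := seq.Head()
//		// ... operate on b ...
//		seq = seq.Tail()
//	}
//
// End of aside.)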
return BlockSeq{} } extSlice := gohacks.Slice(data, bs.length) tailSlice := skipEmpty(extSlice[1:]) tailLimit := bs.limit - headLen return blockSeqFromSliceLimited(tailSlice, tailLimit) } // DropFirst returns a BlockSeq equivalent to bs, but with the first n bytes // omitted. If n > bs.NumBytes(), DropFirst returns an empty BlockSeq. // // Preconditions: n >= 0. func (bs BlockSeq) DropFirst(n int) BlockSeq { if n < 0 { panic(fmt.Sprintf("invalid n: %d", n)) } return bs.DropFirst64(uint64(n)) } // DropFirst64 is equivalent to DropFirst but takes an uint64. func (bs BlockSeq) DropFirst64(n uint64) BlockSeq { if n >= bs.limit { return BlockSeq{} } for { // Calling bs.Head() here is surprisingly expensive, so inline getting // the head's length. var headLen uint64 if bs.length < 0 { headLen = bs.limit } else { headLen = uint64((*Block)(bs.data).Len() - bs.offset) } if n < headLen { // Dropping ends partway through the head Block. if bs.length < 0 { return BlockSeqOf(bs.internalBlock().DropFirst64(n)) } bs.offset += int(n) bs.limit -= n return bs } n -= headLen bs = bs.Tail() } } // TakeFirst returns a BlockSeq equivalent to the first n bytes of bs. If n > // bs.NumBytes(), TakeFirst returns a BlockSeq equivalent to bs. // // Preconditions: n >= 0. func (bs BlockSeq) TakeFirst(n int) BlockSeq { if n < 0 { panic(fmt.Sprintf("invalid n: %d", n)) } return bs.TakeFirst64(uint64(n)) } // TakeFirst64 is equivalent to TakeFirst but takes a uint64. func (bs BlockSeq) TakeFirst64(n uint64) BlockSeq { if n == 0 { return BlockSeq{} } if bs.limit > n { bs.limit = n } return bs } // String implements fmt.Stringer.String. func (bs BlockSeq) String() string { var buf bytes.Buffer buf.WriteByte('[') var sep string for !bs.IsEmpty() { buf.WriteString(sep) sep = " " buf.WriteString(bs.Head().String()) bs = bs.Tail() } buf.WriteByte(']') return buf.String() } // CopySeq copies srcs.NumBytes() or dsts.NumBytes() bytes, whichever is less, // from srcs to dsts and returns the number of bytes copied. // // If srcs and dsts overlap, the data stored in dsts is unspecified. func CopySeq(dsts, srcs BlockSeq) (uint64, error) { var done uint64 for !dsts.IsEmpty() && !srcs.IsEmpty() { dst := dsts.Head() src := srcs.Head() n, err := Copy(dst, src) done += uint64(n) if err != nil { return done, err } dsts = dsts.DropFirst(n) srcs = srcs.DropFirst(n) } return done, nil } // ZeroSeq sets all bytes in dsts to 0 and returns the number of bytes zeroed. func ZeroSeq(dsts BlockSeq) (uint64, error) { var done uint64 for !dsts.IsEmpty() { n, err := Zero(dsts.Head()) done += uint64(n) if err != nil { return done, err } dsts = dsts.DropFirst(n) } return done, nil } // IovecsFromBlockSeq returns a []unix.Iovec representing seq. func IovecsFromBlockSeq(bs BlockSeq) []unix.Iovec { iovs := make([]unix.Iovec, 0, bs.NumBlocks()) for ; !bs.IsEmpty(); bs = bs.Tail() { b := bs.Head() iovs = append(iovs, unix.Iovec{ Base: &b.ToSlice()[0], Len: uint64(b.Len()), }) // We don't need to care about b.NeedSafecopy(), because the host // kernel will handle such address ranges just fine (by returning // EFAULT). 
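// (Aside, not part of the original source: the returned iovecs are typically
// handed straight to a vectorized host syscall; a sketch, assuming an open
// host file descriptor `fd` and a non-empty sequence:
//
//	iovs := IovecsFromBlockSeq(dsts)
//	n, _, errno := unix.RawSyscall(unix.SYS_READV, uintptr(fd),
//		uintptr(unsafe.Pointer(&iovs[0])), uintptr(len(iovs)))
//
// End of aside.)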
} return iovs } golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/000077500000000000000000000000001465435605700210555ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/precompiledseccomp/000077500000000000000000000000001465435605700247325ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/precompiledseccomp/precompiledseccomp.go000066400000000000000000000314271465435605700311450ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package precompiledseccomp provides tooling to precompile seccomp-bpf // programs that can be embedded inside Go source code. package precompiledseccomp import ( "encoding/binary" "fmt" "sort" "strings" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" ) // ProgramDesc describes a program to be compiled. type ProgramDesc struct { // Rules contains the seccomp-bpf rulesets to compile. Rules []seccomp.RuleSet // SeccompOptions is the seccomp-bpf program options used in compilation. SeccompOptions seccomp.ProgramOptions } // Program is a precompiled seccomp-bpf program. // To get actual BPF instructions, call the `RenderInstructions` function. type Program struct { // Name is the name of this program within a set of embedded programs. Name string // Bytecode32 is the raw BPF bytecode represented as a sequence of uint32s. Bytecode32 []uint32 // VarOffsets maps variable names to the uint32-based offsets where these // variables show up in `Bytecode32`. VarOffsets map[string][]int } // Values is an assignment of variables to uint32 values. // It is used when rendering seccomp-bpf program instructions. type Values map[string]uint32 const ( uint64VarSuffixHigh = "_high32bits" uint64VarSuffixLow = "_low32bits" ) // SetUint64 sets the value of a 64-bit variable in `v`. // Under the hood, this is stored as two 32-bit variables. // Use `Values.GetUint64` to retrieve the 64-bit variable. func (v Values) SetUint64(varName string, value uint64) { v[varName+uint64VarSuffixHigh] = uint32(value >> 32) v[varName+uint64VarSuffixLow] = uint32(value) } // GetUint64 retrieves the value of a 64-bit variable set using // `Values.SetUint64(varName)`. func (v Values) GetUint64(varName string) uint64 { return uint64(v[varName+"_high32bits"])<<32 | uint64(v[varName+"_low32bits"]) } // Precompile compiles a `ProgramDesc` with the given values. // It supports the notion of "variables", which are named in `vars`. // Variables are uint32s which are only known at runtime, and whose value // shows up in the BPF bytecode. // // `fn` takes in a mapping of variable names to their assigned values, // and should return a `ProgramDesc` describing the seccomp-bpf program // to be compiled. 
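//
// For illustration only (not part of the original source; the program name,
// the variable name "fd", the rules shown, and the use of MakeSyscallRules as
// the SyscallRules constructor are all assumptions):
//
//	prog, err := precompiledseccomp.Precompile("example", []string{"fd"},
//		func(v precompiledseccomp.Values) precompiledseccomp.ProgramDesc {
//			return precompiledseccomp.ProgramDesc{
//				Rules: []seccomp.RuleSet{{
//					Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{
//						unix.SYS_READ: seccomp.PerArg{seccomp.EqualTo(v["fd"])},
//					}),
//					Action: linux.SECCOMP_RET_ALLOW,
//				}},
//				SeccompOptions: seccomp.DefaultProgramOptions(),
//			}
//		})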
// // Precompile verifies that all variables in `vars` show up consistently in // the bytecode by compiling the program twice, ensures that the offsets at // which some stand-in values is consistent across these two compilation // attempts, and that nothing else about the BPF bytecode is different. func Precompile(name string, varNames []string, fn func(Values) ProgramDesc) (Program, error) { vars := make(map[string]struct{}, len(varNames)) for _, varName := range varNames { vars[varName] = struct{}{} } if len(vars) != len(varNames) { return Program{}, fmt.Errorf("non-unique variable names: %q", varNames) } // These constants are chosen to be recognizable and unique within // seccomp-bpf programs. // These could of course show up in seccomp-bpf programs for legitimate // reasons other than being part the variable being matched against (e.g. a // jump of this many instructions forward, or a static equality match that // happens to check against this exact value), but it is very unlikely that // integers this large actually occur. // If it does happen, we'll catch it here because one compilation attempt // will find its placeholder values show up less often than the other. // Assuming that the reason this occurred is legitimate, update these // constants to even-less-likely values in order to fix this issue. const ( varStart1 uint32 = 0x13371337 varStart2 uint32 = 0x42424243 ) // Render the program with one set of values. // Remember at which offsets we saw these values show up in the bytecode. values1 := Values(make(map[string]uint32, len(vars))) v := varStart1 for varName := range vars { values1[varName] = v v += 2 } program1, err := precompile(name, values1, fn) if err != nil { return Program{}, err } // Do the same, but with a different set of values. values2 := Values(make(map[string]uint32, len(vars))) v = varStart2 for _, varName := range varNames { values2[varName] = v v += 2 } program2, err := precompile(name, values2, fn) if err != nil { return Program{}, err } // Ensure that the offsets we got is consistent. for _, varName := range varNames { offsets1 := program1.VarOffsets[varName] offsets2 := program2.VarOffsets[varName] if len(offsets1) != len(offsets2) { return Program{}, fmt.Errorf("var %q has different number of offsets depending on its value: with value 0x%08x it showed up %d times, but with value %d it showed up %d times", varName, values1[varName], len(offsets1), values2[varName], len(offsets2)) } for i := 0; i < len(offsets1); i++ { if offsets1[i] != offsets2[i] { return Program{}, fmt.Errorf("var %q has different offsets depending on its value: with value 0x%08x it showed up at offsets %v, but with value %d it showed up at offsets %v", varName, values1[varName], offsets1, values2[varName], offsets2) } } } // Ensure that the rest of the bytecode is exactly equal. 
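// Apart from the recorded variable offsets, the two compilations must be
// byte-for-byte identical; any other difference would mean the program's
// shape depends on the placeholder values themselves, which could not be
// patched reliably at RenderInstructions time.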
if len(program1.Bytecode32) != len(program2.Bytecode32) { return Program{}, fmt.Errorf("compiled programs do not have the same bytecode size: %d vs %d", len(program1.Bytecode32), len(program2.Bytecode32)) } knownOffsets := map[int]struct{}{} for _, varName := range varNames { for _, offset := range program1.VarOffsets[varName] { knownOffsets[offset] = struct{}{} } } for i := 0; i < len(program1.Bytecode32); i++ { if _, isVarOffset := knownOffsets[i]; isVarOffset { continue } if program1.Bytecode32[i] != program2.Bytecode32[i] { return Program{}, fmt.Errorf("compiled programs do not have the same bytecode at uint32 offset %d (which is not any of the offsets where a variable shows up: %v)", i, knownOffsets) } } return program1, nil } // precompile compiles a `ProgramDesc` with the given values. func precompile(name string, values Values, fn func(Values) ProgramDesc) (Program, error) { precompileOpts := fn(values) insns, _, err := seccomp.BuildProgram(precompileOpts.Rules, precompileOpts.SeccompOptions) if err != nil { return Program{}, err } if log.IsLogging(log.Debug) { log.Debugf("Compiled program with values %v (%d instructions):", values, len(insns)) for i, insn := range insns { log.Debugf(" %04d: %s\n", i, insn.String()) } } bytecode32 := instructionsToUint32Slice(insns) varOffsets := getVarOffsets(bytecode32, values) // nonOptimizedOffsets stores the offsets at which each variable shows up // in the non-optimized version of the program. It is only computed when // a variable doesn't show up in the optimized version of the program. var nonOptimizedOffsets map[string][]int computeNonOptimizedOffsets := func() error { if nonOptimizedOffsets != nil { return nil } if !precompileOpts.SeccompOptions.Optimize { nonOptimizedOffsets = varOffsets return nil } nonOptimizedOpts := precompileOpts.SeccompOptions nonOptimizedOpts.Optimize = false nonOptInsns, _, err := seccomp.BuildProgram(precompileOpts.Rules, nonOptimizedOpts) if err != nil { return fmt.Errorf("cannot build seccomp program with optimizations disabled: %w", err) } nonOptimizedOffsets = getVarOffsets(instructionsToUint32Slice(nonOptInsns), values) return nil } for varName := range values { if len(varOffsets[varName]) == 0 { // If the variable doesn't show up in the optimized program but does // show up in the non-optimized program, then it is not unused. // It is being optimized away, e.g. as a result of being OR'd with a // `MatchAll` rule. // Only report an error if the variable shows up in neither optimized // nor non-optimized bytecode. if err := computeNonOptimizedOffsets(); err != nil { return Program{}, fmt.Errorf("cannot compute variable offsets for the non-optimized version of the program: %v", err) } if len(nonOptimizedOffsets[varName]) == 0 { return Program{}, fmt.Errorf("var %q does not show up in the BPF bytecode", varName) } // We set the offset slice for this variable to a nil slice, so that // it gets properly serialized (as opposed to omitted entirely) in the // generated Go code. varOffsets[varName] = nil } } return Program{ Name: name, Bytecode32: bytecode32, VarOffsets: varOffsets, }, nil } // getVarOffsets returns the uint32-based offsets at which the values of each // variable in `values` shows up. 
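// For example (illustrative values only): if a variable was assigned the
// placeholder 0x13371337 and that word appears at bytecode32[5] and
// bytecode32[9], the returned map contains {"varname": [5, 9]}.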
func getVarOffsets(bytecode32 []uint32, values Values) map[string][]int { varOffsets := make(map[string][]int, len(values)) for varName, value := range values { for i, v := range bytecode32 { if v == value { varOffsets[varName] = append(varOffsets[varName], i) } } } return varOffsets } // Expr renders a Go expression encoding this `Program`. // It is used when embedding a precompiled `Program` into a Go library file. // `pkgName` is the package name under which the precompiledseccomp package is // imported. func (program Program) Expr(indentPrefix, pkgName string) string { var sb strings.Builder sb.WriteString(fmt.Sprintf("%s.Program{\n", pkgName)) sb.WriteString(fmt.Sprintf("%s\tName: %q,\n", indentPrefix, program.Name)) sb.WriteString(fmt.Sprintf("%s\tBytecode32: []uint32{\n", indentPrefix)) for _, v := range program.Bytecode32 { sb.WriteString(fmt.Sprintf("%s\t\t0x%08x,\n", indentPrefix, v)) } sb.WriteString(fmt.Sprintf("%s\t},\n", indentPrefix)) sb.WriteString(fmt.Sprintf("%s\tVarOffsets: map[string][]int{\n", indentPrefix)) varNames := make([]string, 0, len(program.VarOffsets)) for varName := range program.VarOffsets { varNames = append(varNames, varName) } sort.Strings(varNames) for _, varName := range varNames { if len(program.VarOffsets[varName]) == 0 { sb.WriteString(fmt.Sprintf("%s\t\t%q: nil,\n", indentPrefix, varName)) continue } sb.WriteString(fmt.Sprintf("%s\t\t%q: []int{\n", indentPrefix, varName)) for _, v := range program.VarOffsets[varName] { sb.WriteString(fmt.Sprintf("%s\t\t\t%d,\n", indentPrefix, v)) } sb.WriteString(fmt.Sprintf("%s\t\t},\n", indentPrefix)) } sb.WriteString(fmt.Sprintf("%s\t},\n", indentPrefix)) sb.WriteString(fmt.Sprintf("%s}", indentPrefix)) return sb.String() } // RenderInstructions builds the set of precompiled BPF instructions, // replacing the variables with their values as given in `values`. // This must be called with the exact same set of variable names as was used // during `Precompile`. func (program Program) RenderInstructions(values Values) ([]bpf.Instruction, error) { if len(values) != len(program.VarOffsets) { return nil, fmt.Errorf("called with inconsistent vars: got %v expected %v", values, program.VarOffsets) } for varName, value := range values { offsets, found := program.VarOffsets[varName] if !found { return nil, fmt.Errorf("var %q was not defined in precompiled instructions (defined: %v)", varName, program.VarOffsets) } for _, offset := range offsets { program.Bytecode32[offset] = value } } return uint32SliceToInstructions(program.Bytecode32) } // instructionsToUint32Slice converts a slice of BPF instructions into a slice // of uint32s containing the same binary data. func instructionsToUint32Slice(insns []bpf.Instruction) []uint32 { bytecode := bpf.ToBytecode(insns) bytecode32 := make([]uint32, len(bytecode)/4) for i := 0; i < len(bytecode); i += 4 { bytecode32[i/4] = binary.NativeEndian.Uint32(bytecode[i : i+4]) } return bytecode32 } // uint32SliceToInstructions converts a slice of uint32s into a slice of // BPF instructions containing the same binary data. func uint32SliceToInstructions(bytecode32 []uint32) ([]bpf.Instruction, error) { bytecode := make([]byte, len(bytecode32)*4) for i, v := range bytecode32 { binary.NativeEndian.PutUint32(bytecode[i*4:], v) } return bpf.ParseBytecode(bytecode) } // Registration outputs Go code that registers this programs in a // `map[string]Program` variable named `programsMapVarName` which maps // programs names to their `Program` struct. 
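// For example (illustrative only), a program named "filter" registered into a
// map variable named "programs", with this package imported as
// "precompiledseccomp", yields a line of the form:
//
//	programs["filter"] = precompiledseccomp.Program{...}
//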
// It is used when embedding precompiled programs into a Go library file. func (program Program) Registration(indentPrefix, pkgName, programsMapVarName string) string { return fmt.Sprintf("%s%s[%q] = %s\n", indentPrefix, programsMapVarName, program.Name, program.Expr(indentPrefix, pkgName)) } precompiledseccomp_state_autogen.go000066400000000000000000000001041465435605700337740ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/precompiledseccomp// automatically generated by stateify. package precompiledseccomp golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/seccomp.go000066400000000000000000001073701465435605700230450ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package seccomp provides generation of basic seccomp filters. Currently, // only little endian systems are supported. package seccomp import ( "fmt" "sort" "strings" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/log" ) const ( // skipOneInst is the offset to take for skipping one instruction. skipOneInst = 1 // defaultLabel is the label for the default action. defaultLabel = label("default_action") // vsyscallPageIPMask is the bit we expect to see in the instruction // pointer of a vsyscall call. vsyscallPageIPMask = 1 << 31 ) // Install generates BPF code based on the set of syscalls provided. It only // allows syscalls that conform to the specification. Syscalls that violate the // specification will trigger RET_KILL_PROCESS. If RET_KILL_PROCESS is not // supported, violations will trigger RET_TRAP instead. RET_KILL_THREAD is not // used because it only kills the offending thread and often keeps the sentry // hanging. // // denyRules describes forbidden syscalls. rules describes allowed syscalls. // denyRules is executed before rules. // // Be aware that RET_TRAP sends SIGSYS to the process and it may be ignored, // making it possible for the process to continue running after a violation. // However, it will leave a SECCOMP audit event trail behind. In any case, the // syscall is still blocked from executing. func Install(rules SyscallRules, denyRules SyscallRules, options ProgramOptions) error { // *** DEBUG TIP *** // If you suspect the Sentry is getting killed due to a seccomp violation, // look for the `debugFilter` boolean in `//runsc/boot/filter/filter.go`. log.Infof("Installing seccomp filters for %d syscalls (action=%v)", rules.Size(), options.DefaultAction) instrs, _, err := BuildProgram([]RuleSet{ { Rules: denyRules, Action: options.DefaultAction, }, { Rules: rules, Action: linux.SECCOMP_RET_ALLOW, }, }, options) if log.IsLogging(log.Debug) { programStr, errDecode := bpf.DecodeInstructions(instrs) if errDecode != nil { programStr = fmt.Sprintf("Error: %v\n%s", errDecode, programStr) } log.Debugf("Seccomp program dump:\n%s", programStr) } if err != nil { return err } // Perform the actual installation. 
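// (Aside, not part of the original source: a minimal caller-side sketch,
// assuming allowRules and denyRules are SyscallRules assembled elsewhere:
//
//	opts := seccomp.DefaultProgramOptions()
//	if err := seccomp.Install(allowRules, denyRules, opts); err != nil {
//		return fmt.Errorf("installing seccomp filters: %w", err)
//	}
//
// End of aside.)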
if err := SetFilter(instrs); err != nil { return fmt.Errorf("failed to set filter: %v", err) } log.Infof("Seccomp filters installed.") return nil } // DefaultAction returns a sane default for a failure to match // a seccomp-bpf filter. Either kill the process, or trap. func DefaultAction() (linux.BPFAction, error) { available, err := isKillProcessAvailable() if err != nil { return 0, err } if available { return linux.SECCOMP_RET_KILL_PROCESS, nil } return linux.SECCOMP_RET_TRAP, nil } // RuleSet is a set of rules and associated action. type RuleSet struct { Rules SyscallRules Action linux.BPFAction // Vsyscall indicates that a check is made for a function being called // from kernel mappings. This is where the vsyscall page is located // (and typically) emulated, so this RuleSet will not match any // functions not dispatched from the vsyscall page. Vsyscall bool } // SyscallName gives names to system calls. It is used purely for debugging purposes. // // An alternate namer can be provided to the package at initialization time. var SyscallName = func(sysno uintptr) string { return fmt.Sprintf("syscall_%d", sysno) } // syscallProgram builds a BPF program for applying syscall rules. // It is a stateful struct that is updated as the program is built. type syscallProgram struct { // program is the underlying BPF program being built. program *bpf.ProgramBuilder } // Stmt adds a statement to the program. func (s *syscallProgram) Stmt(code uint16, k uint32) { s.program.AddStmt(code, k) } // label is a custom label type which is returned by `labelSet`. type label string // JumpTo adds a jump instruction to the program, jumping to the given label. func (s *syscallProgram) JumpTo(label label) { s.program.AddDirectJumpLabel(string(label)) } // If checks a condition and jumps to a label if the condition is true. // If the condition is false, the program continues executing (no jumping). func (s *syscallProgram) If(code uint16, k uint32, jt label) { s.program.AddJump(code, k, 0, skipOneInst) s.JumpTo(jt) } // IfNot checks a condition and jumps to a label if the condition is false. // If the condition is true, the program continues executing (no jumping). func (s *syscallProgram) IfNot(code uint16, k uint32, jf label) { s.program.AddJump(code, k, skipOneInst, 0) s.JumpTo(jf) } // Ret adds a return instruction to the program. func (s *syscallProgram) Ret(action linux.BPFAction) { s.Stmt(bpf.Ret|bpf.K, uint32(action)) } // Label adds a label to the program. // It panics if this label has already been added to the program. func (s *syscallProgram) Label(label label) { if err := s.program.AddLabel(string(label)); err != nil { panic(fmt.Sprintf("cannot add label %q to program: %v", label, err)) } } // Record starts recording the instructions added to the program from now on. // It returns a syscallFragment which can be used to perform assertions on the // possible set of outcomes of the set of instruction that has been added // since `Record` was called. func (s *syscallProgram) Record() syscallProgramFragment { return syscallProgramFragment{s.program.Record()} } // syscallProgramFragment represents a fragment of the syscall program. type syscallProgramFragment struct { getFragment func() bpf.ProgramFragment } // MustHaveJumpedTo asserts that the fragment must jump to one of the // given labels. // The fragment may not jump to any other label, nor return, nor fall through. 
func (f syscallProgramFragment) MustHaveJumpedTo(labels ...label) { f.MustHaveJumpedToOrReturned(labels, nil) } // MustHaveJumpedToOrReturned asserts that the fragment must jump to one of // the given labels, or have returned one of the given return values. // The fragment may not jump to any other label, nor fall through, // nor return a non-deterministic value. func (f syscallProgramFragment) MustHaveJumpedToOrReturned(possibleLabels []label, possibleReturnValues map[linux.BPFAction]struct{}) { fragment := f.getFragment() outcomes := fragment.Outcomes() if outcomes.MayFallThrough { panic(fmt.Sprintf("fragment %v may fall through", fragment)) } if len(possibleReturnValues) == 0 && outcomes.MayReturn() { panic(fmt.Sprintf("fragment %v may return", fragment)) } if outcomes.MayReturnRegisterA { panic(fmt.Sprintf("fragment %v may return register A", fragment)) } if outcomes.MayJumpToKnownOffsetBeyondFragment { panic(fmt.Sprintf("fragment %v may jump to an offset beyond the fragment", fragment)) } for jumpLabel := range outcomes.MayJumpToUnresolvedLabels { found := false for _, wantLabel := range possibleLabels { if jumpLabel == string(wantLabel) { found = true break } } if !found { panic(fmt.Sprintf("fragment %v may jump to a label %q which is not one of %v", fragment, jumpLabel, possibleLabels)) } } for returnValue := range outcomes.MayReturnImmediate { if _, found := possibleReturnValues[returnValue]; !found { panic(fmt.Sprintf("fragment %v may return a value %q which is not one of %v", fragment, returnValue, possibleReturnValues)) } } } // labelSet keeps track of labels that individual rules may jump to if they // either match or mismatch. // It can generate unique label names, and can be used recursively within // rules. type labelSet struct { // prefix is a label prefix used when generating label names. prefix string // labelCounter is used to generate unique label names. labelCounter int // ruleMatched is the label that a rule should jump to if it matches. ruleMatched label // ruleMismatched is the label that a rule should jump to if it doesn't // match. ruleMismatched label } // NewLabel returns a new unique label. func (l *labelSet) NewLabel() label { newLabel := label(fmt.Sprintf("%s#%d", l.prefix, l.labelCounter)) l.labelCounter++ return newLabel } // Matched returns the label to jump to if the rule matches. func (l *labelSet) Matched() label { return l.ruleMatched } // Mismatched returns the label to jump to if the rule does not match. func (l *labelSet) Mismatched() label { return l.ruleMismatched } // Push creates a new labelSet meant to be used in a recursive context of the // rule currently being rendered. // Labels generated by this new labelSet will have `labelSuffix` appended to // this labelSet's current prefix, and will have its matched/mismatched labels // point to the given labels. func (l *labelSet) Push(labelSuffix string, newRuleMatch, newRuleMismatch label) *labelSet { newPrefix := labelSuffix if l.prefix != "" { newPrefix = fmt.Sprintf("%s_%s", l.prefix, labelSuffix) } return &labelSet{ prefix: newPrefix, ruleMatched: newRuleMatch, ruleMismatched: newRuleMismatch, } } // matchedValue keeps track of BPF instructions needed to load a 64-bit value // being matched against. Since BPF can only do operations on 32-bit // instructions, value-matching code needs to selectively load one or the // other half of the 64-bit value. 
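//
// Concretely (descriptive note, not part of the original source): for an
// equality match against a 64-bit constant V, the generated code loads the
// high half and tests it against uint32(V >> 32), then loads the low half
// and tests it against uint32(V); a mismatch in either half jumps to the
// rule's mismatch label.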
type matchedValue struct { program *syscallProgram dataOffsetHigh uint32 dataOffsetLow uint32 } // LoadHigh32Bits loads the high 32-bit of the 64-bit value into register A. func (m matchedValue) LoadHigh32Bits() { m.program.Stmt(bpf.Ld|bpf.Abs|bpf.W, m.dataOffsetHigh) } // LoadLow32Bits loads the low 32-bit of the 64-bit value into register A. func (m matchedValue) LoadLow32Bits() { m.program.Stmt(bpf.Ld|bpf.Abs|bpf.W, m.dataOffsetLow) } // ProgramOptions configure a seccomp program. type ProgramOptions struct { // DefaultAction is the action returned when none of the rules match. DefaultAction linux.BPFAction // BadArchAction is the action returned when the architecture of the // syscall structure input doesn't match the one the program expects. BadArchAction linux.BPFAction // Optimize specifies whether optimizations should be applied to the // syscall rules and generated BPF bytecode. Optimize bool // HotSyscalls is the set of syscall numbers that are the hottest, // where "hotness" refers to frequency (regardless of the amount of // computation that the kernel will do handling them, and regardless of // the complexity of the syscall rule for this). // It should only contain very hot syscalls (i.e. any syscall that is // called >10% of the time out of all syscalls made). // It is ordered from most frequent to least frequent. HotSyscalls []uintptr } // DefaultProgramOptions returns the default program options. func DefaultProgramOptions() ProgramOptions { action, err := DefaultAction() if err != nil { panic(fmt.Sprintf("cannot determine default seccomp action: %v", err)) } return ProgramOptions{ DefaultAction: action, BadArchAction: action, Optimize: true, } } // BuildStats contains information about seccomp program generation. type BuildStats struct { // SizeBeforeOptimizations and SizeAfterOptimizations correspond to the // number of instructions in the program before vs after optimization. SizeBeforeOptimizations, SizeAfterOptimizations int // BuildDuration is the amount of time it took to build the program (before // BPF bytecode optimizations). BuildDuration time.Duration // RuleOptimizeDuration is the amount of time it took to run SyscallRule // optimizations. RuleOptimizeDuration time.Duration // BPFOptimizeDuration is the amount of time it took to run BPF bytecode // optimizations. BPFOptimizeDuration time.Duration } // BuildProgram builds a BPF program from the given map of actions to matching // SyscallRules. The single generated program covers all provided RuleSets. func BuildProgram(rules []RuleSet, options ProgramOptions) ([]bpf.Instruction, BuildStats, error) { start := time.Now() // Make a copy of the syscall rules and maybe optimize them. ors, ruleOptimizeDuration, err := orderRuleSets(rules, options) if err != nil { return nil, BuildStats{}, err } possibleActions := make(map[linux.BPFAction]struct{}) for _, ruleSet := range rules { possibleActions[ruleSet.Action] = struct{}{} } program := &syscallProgram{ program: bpf.NewProgramBuilder(), } // Be paranoid and check that syscall is done in the expected architecture. // // A = seccomp_data.arch // if (A != AUDIT_ARCH) goto badArchLabel. 
badArchLabel := label("badarch") program.Stmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetArch) program.IfNot(bpf.Jmp|bpf.Jeq|bpf.K, LINUX_AUDIT_ARCH, badArchLabel) orsFrag := program.Record() if err := ors.render(program); err != nil { return nil, BuildStats{}, err } orsFrag.MustHaveJumpedToOrReturned([]label{defaultLabel}, possibleActions) // Default label if none of the rules matched: program.Label(defaultLabel) program.Ret(options.DefaultAction) // Label if the architecture didn't match: program.Label(badArchLabel) program.Ret(options.BadArchAction) insns, err := program.program.Instructions() if err != nil { return nil, BuildStats{}, err } beforeOpt := len(insns) buildDuration := time.Since(start) - ruleOptimizeDuration var bpfOptimizeDuration time.Duration afterOpt := beforeOpt if options.Optimize { insns = bpf.Optimize(insns) bpfOptimizeDuration = time.Since(start) - buildDuration - ruleOptimizeDuration afterOpt = len(insns) log.Debugf("Seccomp program optimized from %d to %d instructions; took %v to build and %v to optimize", beforeOpt, afterOpt, buildDuration, bpfOptimizeDuration) } return insns, BuildStats{ SizeBeforeOptimizations: beforeOpt, SizeAfterOptimizations: afterOpt, BuildDuration: buildDuration, RuleOptimizeDuration: ruleOptimizeDuration, BPFOptimizeDuration: bpfOptimizeDuration, }, nil } // singleSyscallRuleSet represents what to do for a single syscall. // It is used inside `orderedRules`. type singleSyscallRuleSet struct { sysno uintptr rules []syscallRuleAction vsyscall bool } // Render renders the ruleset for this syscall. func (ssrs singleSyscallRuleSet) Render(program *syscallProgram, ls *labelSet, noMatch label) { frag := program.Record() if ssrs.vsyscall { // Emit a vsyscall check. // This rule ensures that the top bit is set in the // instruction pointer, which is where the vsyscall page // will be mapped. program.Stmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetIPHigh) program.IfNot(bpf.Jmp|bpf.Jset|bpf.K, vsyscallPageIPMask, noMatch) } var nextRule label actions := make(map[linux.BPFAction]struct{}) for i, ra := range ssrs.rules { actions[ra.action] = struct{}{} // Render the rule. nextRule = ls.NewLabel() ruleLabels := ls.Push(fmt.Sprintf("sysno%d_rule%d", ssrs.sysno, i), ls.NewLabel(), nextRule) ruleFrag := program.Record() ra.rule.Render(program, ruleLabels) program.Label(ruleLabels.Matched()) program.Ret(ra.action) ruleFrag.MustHaveJumpedToOrReturned( []label{nextRule}, map[linux.BPFAction]struct{}{ ra.action: struct{}{}, }) program.Label(nextRule) } program.JumpTo(noMatch) frag.MustHaveJumpedToOrReturned([]label{noMatch}, actions) } // String returns a human-friendly representation of the // `singleSyscallRuleSet`. func (ssrs singleSyscallRuleSet) String() string { var sb strings.Builder if ssrs.vsyscall { sb.WriteString("Vsyscall ") } else { sb.WriteString("Syscall ") } sb.WriteString(fmt.Sprintf("%3d: ", ssrs.sysno)) switch len(ssrs.rules) { case 0: sb.WriteString("(no rules)") case 1: sb.WriteString(ssrs.rules[0].String()) default: sb.WriteRune('{') for i, r := range ssrs.rules { if i != 0 { sb.WriteString("; ") } sb.WriteString(r.String()) } sb.WriteRune('}') } return sb.String() } // syscallRuleAction groups a `SyscallRule` and an action that should be // returned if the rule matches. type syscallRuleAction struct { rule SyscallRule action linux.BPFAction } // String returns a human-friendly representation of the `syscallRuleAction`. 
func (sra syscallRuleAction) String() string { if _, isMatchAll := sra.rule.(MatchAll); isMatchAll { return sra.action.String() } return fmt.Sprintf("(%v) => %v", sra.rule.String(), sra.action) } // orderedRules contains an ordering of syscall rules used to render a // program. It is derived from a list of `RuleSet`s and `ProgramOptions`. // Its fields represent the order in which rulesets are rendered. // There are three categorization criteria: // - "Hot" vs "cold": hot syscalls go first and are checked linearly, cold // syscalls go later. // - "Trivial" vs "non-trivial": A "trivial" syscall rule means one that // does not require checking any argument or RIP data. This basically // means a syscall mapped to `MatchAll{}`. // If a syscall shows up in multiple RuleSets where any of them is // non-trivial, the whole syscall is considered non-trivial. // - "vsyscall" vs "non-vsyscall": A syscall that needs vsyscall checking // checks that the function is dispatched from the vsyscall page by // checking RIP. This inherently makes it non-trivial. All trivial // rules are non-vsyscall, but not all non-vsyscall rules are trivial. type orderedRuleSets struct { // hotNonTrivial is the set of hot syscalls that are non-trivial // and may or may not require vsyscall checking. // They come first and are checked linearly using `hotNonTrivialOrder`. hotNonTrivial map[uintptr]singleSyscallRuleSet // hotNonTrivial is the set of hot syscalls that are non-trivial // and may or may not require vsyscall checking. // They come first and are checked linearly using `hotNonTrivialOrder`. hotNonTrivialOrder []uintptr // coldNonTrivial is the set of non-hot syscalls that are non-trivial. // They may or may not require vsyscall checking. // They come second. coldNonTrivial map[uintptr]singleSyscallRuleSet // trivial is the set of syscalls that are trivial. They may or may not be // hot, but they may not require vsyscall checking (otherwise they would // be non-trivial). // They come last. This is because the host kernel will cache the results // of these system calls, and will never execute them on the hot path. trivial map[uintptr]singleSyscallRuleSet } // orderRuleSets converts a set of `RuleSet`s into an `orderedRuleSets`. // It orders the rulesets, along with the time to optimize the // rules (if any). func orderRuleSets(rules []RuleSet, options ProgramOptions) (orderedRuleSets, time.Duration, error) { // Do a pass to determine if vsyscall is consistent across syscall numbers. vsyscallBySysno := make(map[uintptr]bool) for _, rs := range rules { for sysno := range rs.Rules.rules { if prevVsyscall, ok := vsyscallBySysno[sysno]; ok { if prevVsyscall != rs.Vsyscall { return orderedRuleSets{}, 0, fmt.Errorf("syscall %d has conflicting vsyscall checking rules", sysno) } } else { vsyscallBySysno[sysno] = rs.Vsyscall } } } // Build a single map of per-syscall syscallRuleActions. // We will split this map up later. allSyscallRuleActions := make(map[uintptr][]syscallRuleAction) for _, rs := range rules { for sysno, rule := range rs.Rules.rules { existing, found := allSyscallRuleActions[sysno] if !found { allSyscallRuleActions[sysno] = []syscallRuleAction{{ rule: rule, action: rs.Action, }} continue } if existing[len(existing)-1].action == rs.Action { // If the last action for this syscall is the same, union the rules. existing[len(existing)-1].rule = Or{existing[len(existing)-1].rule, rule} } else { // Otherwise, add it as a new ruleset. 
existing = append(existing, syscallRuleAction{ rule: rule, action: rs.Action, }) } allSyscallRuleActions[sysno] = existing } } // Optimize all rules. var optimizeDuration time.Duration if options.Optimize { optimizeStart := time.Now() for _, ruleActions := range allSyscallRuleActions { for i, ra := range ruleActions { ra.rule = optimizeSyscallRule(ra.rule) ruleActions[i] = ra } } optimizeDuration = time.Since(optimizeStart) } // Do a pass that checks which syscall numbers are trivial. isTrivial := make(map[uintptr]bool) for sysno, ruleActions := range allSyscallRuleActions { for _, ra := range ruleActions { _, isMatchAll := ra.rule.(MatchAll) isVsyscall := vsyscallBySysno[sysno] trivial := isMatchAll && !isVsyscall if prevTrivial, ok := isTrivial[sysno]; ok { isTrivial[sysno] = prevTrivial && trivial } else { isTrivial[sysno] = trivial } } } // Compute the set of non-trivial hot syscalls and their order. hotNonTrivialSyscallsIndex := make(map[uintptr]int, len(options.HotSyscalls)) for i, sysno := range options.HotSyscalls { if _, hasRule := allSyscallRuleActions[sysno]; !hasRule { continue } if isTrivial[sysno] { continue } if _, ok := hotNonTrivialSyscallsIndex[sysno]; ok { continue } hotNonTrivialSyscallsIndex[sysno] = i } hotNonTrivialOrder := make([]uintptr, 0, len(hotNonTrivialSyscallsIndex)) for sysno := range hotNonTrivialSyscallsIndex { hotNonTrivialOrder = append(hotNonTrivialOrder, sysno) } sort.Slice(hotNonTrivialOrder, func(i, j int) bool { return hotNonTrivialSyscallsIndex[hotNonTrivialOrder[i]] < hotNonTrivialSyscallsIndex[hotNonTrivialOrder[j]] }) // Now split up the map and build the `orderedRuleSets`. ors := orderedRuleSets{ hotNonTrivial: make(map[uintptr]singleSyscallRuleSet), hotNonTrivialOrder: hotNonTrivialOrder, coldNonTrivial: make(map[uintptr]singleSyscallRuleSet), trivial: make(map[uintptr]singleSyscallRuleSet), } for sysno, ruleActions := range allSyscallRuleActions { _, hot := hotNonTrivialSyscallsIndex[sysno] trivial := isTrivial[sysno] var subMap map[uintptr]singleSyscallRuleSet switch { case trivial: subMap = ors.trivial case hot: subMap = ors.hotNonTrivial default: subMap = ors.coldNonTrivial } subMap[sysno] = singleSyscallRuleSet{ sysno: sysno, vsyscall: vsyscallBySysno[sysno], rules: ruleActions, } } // Log our findings. if log.IsLogging(log.Debug) { ors.log(log.Debugf) } return ors, optimizeDuration, nil } // log logs the set of seccomp rules to the given logger. func (ors orderedRuleSets) log(logFn func(string, ...any)) { logFn("Ordered seccomp rules:") for _, sm := range []struct { name string m map[uintptr]singleSyscallRuleSet }{ {"Hot non-trivial", ors.hotNonTrivial}, {"Cold non-trivial", ors.coldNonTrivial}, {"Trivial", ors.trivial}, } { if len(sm.m) == 0 { logFn(" %s syscalls: None.", sm.name) continue } logFn(" %s syscalls:", sm.name) orderedSysnos := make([]int, 0, len(sm.m)) for sysno := range sm.m { orderedSysnos = append(orderedSysnos, int(sysno)) } sort.Ints(orderedSysnos) for _, sysno := range orderedSysnos { logFn(" - %s", sm.m[uintptr(sysno)].String()) } } logFn("End of ordered seccomp rules.") } // render renders all rulesets in the given program. func (ors orderedRuleSets) render(program *syscallProgram) error { ls := &labelSet{prefix: string("ors")} // totalFrag wraps the entire output of the `render` function. totalFrag := program.Record() // Load syscall number into register A. program.Stmt(bpf.Ld|bpf.Abs|bpf.W, seccompDataOffsetNR) // Keep track of which syscalls we've already looked for. 
sysnosChecked := make(map[uintptr]struct{}) // First render hot syscalls linearly. if len(ors.hotNonTrivialOrder) > 0 { notHotLabel := ls.NewLabel() // hotFrag wraps the "hot syscalls" part of the program. // It must either return one of `hotActions`, or jump to `defaultLabel` if // the syscall number matched but the vsyscall match failed, or // `notHotLabel` if none of the hot syscall numbers matched. hotFrag := program.Record() possibleActions := ors.renderLinear(program, ls, sysnosChecked, ors.hotNonTrivial, ors.hotNonTrivialOrder, notHotLabel) hotFrag.MustHaveJumpedToOrReturned([]label{notHotLabel, defaultLabel}, possibleActions) program.Label(notHotLabel) } // Now render the cold non-trivial rules as a binary search tree: if len(ors.coldNonTrivial) > 0 { frag := program.Record() noSycallNumberMatch := ls.NewLabel() possibleActions, err := ors.renderBST(program, ls, sysnosChecked, ors.coldNonTrivial, noSycallNumberMatch) if err != nil { return err } frag.MustHaveJumpedToOrReturned([]label{noSycallNumberMatch, defaultLabel}, possibleActions) program.Label(noSycallNumberMatch) } // Finally render the trivial rules as a binary search tree: if len(ors.trivial) > 0 { frag := program.Record() noSycallNumberMatch := ls.NewLabel() possibleActions, err := ors.renderBST(program, ls, sysnosChecked, ors.trivial, noSycallNumberMatch) if err != nil { return err } frag.MustHaveJumpedToOrReturned([]label{noSycallNumberMatch, defaultLabel}, possibleActions) program.Label(noSycallNumberMatch) } program.JumpTo(defaultLabel) // Reached the end of the program. // Independently verify the set of all possible actions. allPossibleActions := make(map[linux.BPFAction]struct{}) for _, mapping := range []map[uintptr]singleSyscallRuleSet{ ors.hotNonTrivial, ors.coldNonTrivial, ors.trivial, } { for _, ssrs := range mapping { for _, ra := range ssrs.rules { allPossibleActions[ra.action] = struct{}{} } } } totalFrag.MustHaveJumpedToOrReturned([]label{defaultLabel}, allPossibleActions) return nil } // renderLinear renders linear search code that searches for syscall matches // in the given order. It assumes the syscall number is loaded into register // A. Rulesets for all syscall numbers in `order` must exist in `syscallMap`. // It returns the list of possible actions the generated code may return. // `alreadyChecked` will be updated with the syscalls that have been checked. func (ors orderedRuleSets) renderLinear(program *syscallProgram, ls *labelSet, alreadyChecked map[uintptr]struct{}, syscallMap map[uintptr]singleSyscallRuleSet, order []uintptr, noSycallNumberMatch label) map[linux.BPFAction]struct{} { allActions := make(map[linux.BPFAction]struct{}) for _, sysno := range order { ssrs, found := syscallMap[sysno] if !found { panic(fmt.Sprintf("syscall %d found in linear order but not map", sysno)) } nextSyscall := ls.NewLabel() // sysnoFrag wraps the "statements about this syscall number" part of // the program. It must either return one of the actions specified in // that syscall number's rules (`sysnoActions`), or jump to // `nextSyscall`. 
sysnoFrag := program.Record() sysnoActions := make(map[linux.BPFAction]struct{}) for _, ra := range ssrs.rules { sysnoActions[ra.action] = struct{}{} allActions[ra.action] = struct{}{} } program.IfNot(bpf.Jmp|bpf.Jeq|bpf.K, uint32(ssrs.sysno), nextSyscall) ssrs.Render(program, ls, defaultLabel) sysnoFrag.MustHaveJumpedToOrReturned([]label{nextSyscall, defaultLabel}, sysnoActions) program.Label(nextSyscall) } program.JumpTo(noSycallNumberMatch) for _, sysno := range order { alreadyChecked[sysno] = struct{}{} } return allActions } // renderBST renders a binary search tree that searches the given map of // syscalls. It assumes the syscall number is loaded into register A. // It returns the list of possible actions the generated code may return. // `alreadyChecked` will be updated with the syscalls that the BST has // searched. func (ors orderedRuleSets) renderBST(program *syscallProgram, ls *labelSet, alreadyChecked map[uintptr]struct{}, syscallMap map[uintptr]singleSyscallRuleSet, noSycallNumberMatch label) (map[linux.BPFAction]struct{}, error) { possibleActions := make(map[linux.BPFAction]struct{}) orderedSysnos := make([]uintptr, 0, len(syscallMap)) for sysno, ruleActions := range syscallMap { orderedSysnos = append(orderedSysnos, sysno) for _, ra := range ruleActions.rules { possibleActions[ra.action] = struct{}{} } } sort.Slice(orderedSysnos, func(i, j int) bool { return orderedSysnos[i] < orderedSysnos[j] }) frag := program.Record() root := createBST(orderedSysnos) root.root = true knownRng := knownRange{ lowerBoundExclusive: -1, // sysno fits in 32 bits, so this is definitely out of bounds: upperBoundExclusive: 1 << 32, previouslyChecked: alreadyChecked, } if err := root.traverse(renderBSTTraversal, knownRng, syscallMap, program, noSycallNumberMatch); err != nil { return nil, err } if err := root.traverse(renderBSTRules, knownRng, syscallMap, program, noSycallNumberMatch); err != nil { return nil, err } frag.MustHaveJumpedToOrReturned([]label{noSycallNumberMatch, defaultLabel}, possibleActions) for sysno := range syscallMap { alreadyChecked[sysno] = struct{}{} } return possibleActions, nil } // createBST converts sorted syscall slice into a balanced BST. // Panics if syscalls is empty. func createBST(syscalls []uintptr) *node { i := len(syscalls) / 2 parent := node{value: syscalls[i]} if i > 0 { parent.left = createBST(syscalls[:i]) } if i+1 < len(syscalls) { parent.right = createBST(syscalls[i+1:]) } return &parent } // renderBSTTraversal renders the traversal bytecode for a binary search tree. // The outline of the code is as follows, given a BST with: // // 22 // / \ // 9 24 // / / \ // 8 23 50 // // index_22: // SYS_PIPE(22), root // (A < 22) ? goto index_9 : continue // (A > 22) ? goto index_24 : continue // goto checkArgs_22 // // index_9: // SYS_MMAP(9), single child // (A < 9) ? goto index_8 : continue // (A == 9) ? continue : goto defaultLabel // goto checkArgs_9 // // index_8: // SYS_LSEEK(8), leaf // (A == 8) ? continue : goto defaultLabel // goto checkArgs_8 // // index_24: // SYS_SCHED_YIELD(24) // (A < 24) ? goto index_23 : continue // (A > 22) ? goto index_50 : continue // goto checkArgs_24 // // index_23: // SYS_SELECT(23), leaf with parent nodes adjacent in value // # Notice that we do not check for equality at all here, since we've // # already established that the syscall number is 23 from the // # two parent nodes that we've already traversed. // # This is tracked in the `rng knownRange` argument during traversal. 
// goto rules_23 // // index_50: // SYS_LISTEN(50), leaf // (A == 50) ? continue : goto defaultLabel // goto checkArgs_50 // // All of the "checkArgs_XYZ" labels are not defined in this function; they // are created using the `renderBSTRules` function, which is expected to be // called after this one on the entire BST. func renderBSTTraversal(n *node, rng knownRange, syscallMap map[uintptr]singleSyscallRuleSet, program *syscallProgram, searchFailed label) error { // Root node is never referenced by label, skip it. if !n.root { program.Label(n.label()) } sysno := n.value nodeFrag := program.Record() checkArgsLabel := label(fmt.Sprintf("checkArgs_%d", sysno)) if n.left != nil { program.IfNot(bpf.Jmp|bpf.Jge|bpf.K, uint32(sysno), n.left.label()) rng.lowerBoundExclusive = int(sysno - 1) } if n.right != nil { program.If(bpf.Jmp|bpf.Jgt|bpf.K, uint32(sysno), n.right.label()) rng.upperBoundExclusive = int(sysno + 1) } if rng.lowerBoundExclusive != int(sysno-1) || rng.upperBoundExclusive != int(sysno+1) { // If the previous BST nodes we traversed haven't fully established // that the current node's syscall value is exactly `sysno`, we still // need to verify it. program.IfNot(bpf.Jmp|bpf.Jeq|bpf.K, uint32(sysno), searchFailed) } program.JumpTo(checkArgsLabel) nodeFrag.MustHaveJumpedTo(n.left.label(), n.right.label(), checkArgsLabel, searchFailed) return nil } // renderBSTRules renders the `checkArgs_XYZ` labels that `renderBSTTraversal` // jumps to as part of the BST traversal code. It contains all the // argument-specific syscall rules for each syscall number. func renderBSTRules(n *node, rng knownRange, syscallMap map[uintptr]singleSyscallRuleSet, program *syscallProgram, searchFailed label) error { sysno := n.value checkArgsLabel := label(fmt.Sprintf("checkArgs_%d", sysno)) program.Label(checkArgsLabel) ruleSetsFrag := program.Record() possibleActions := make(map[linux.BPFAction]struct{}) for _, ra := range syscallMap[sysno].rules { possibleActions[ra.action] = struct{}{} } nodeLabelSet := &labelSet{prefix: string(n.label())} syscallMap[sysno].Render(program, nodeLabelSet, defaultLabel) ruleSetsFrag.MustHaveJumpedToOrReturned( []label{ defaultLabel, // Either we jumped to the default label (if the rules didn't match)... }, possibleActions, // ... or we returned one of the actions of the rulesets. ) return nil } // node represents a tree node. type node struct { value uintptr left *node right *node root bool } // label returns the label corresponding to this node. // // If n is nil, then the defaultLabel is returned. func (n *node) label() label { if n == nil { return defaultLabel } return label(fmt.Sprintf("node_%d", n.value)) } // knownRange represents the known set of node numbers that we've // already checked. This is used as part of BST traversal. type knownRange struct { lowerBoundExclusive int upperBoundExclusive int // alreadyChecked is a set of node values that were already checked // earlier in the program (prior to the BST being built). // It is *not* updated during BST traversal. previouslyChecked map[uintptr]struct{} } // withLowerBoundExclusive returns an updated `knownRange` with the given // new exclusive lower bound. The actual exclusive lower bound of the // returned `knownRange` may be higher, in case `previouslyChecked` covers // more numbers. 
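// For example (illustrative): with a new exclusive lower bound of 21 and
// previouslyChecked containing 22 and 23, the resulting exclusive lower
// bound is 23, since syscalls 22 and 23 were already handled earlier in the
// program and cannot match here.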
func (kr knownRange) withLowerBoundExclusive(newLowerBoundExclusive int) knownRange { nkr := knownRange{ lowerBoundExclusive: newLowerBoundExclusive, upperBoundExclusive: kr.upperBoundExclusive, previouslyChecked: kr.previouslyChecked, } for ; nkr.lowerBoundExclusive < nkr.upperBoundExclusive; nkr.lowerBoundExclusive++ { if _, ok := nkr.previouslyChecked[uintptr(nkr.lowerBoundExclusive+1)]; !ok { break } } return nkr } // withUpperBoundExclusive returns an updated `knownRange` with the given // new exclusive upper bound. The actual exclusive upper bound of the // returned `knownRange` may be lower, in case `previouslyChecked` covers // more numbers. func (kr knownRange) withUpperBoundExclusive(newUpperBoundExclusive int) knownRange { nkr := knownRange{ lowerBoundExclusive: kr.lowerBoundExclusive, upperBoundExclusive: newUpperBoundExclusive, previouslyChecked: kr.previouslyChecked, } for ; nkr.lowerBoundExclusive < nkr.upperBoundExclusive; nkr.upperBoundExclusive-- { if _, ok := nkr.previouslyChecked[uintptr(nkr.upperBoundExclusive-1)]; !ok { break } } return nkr } // traverseFunc is called as the BST is traversed. type traverseFunc func(*node, knownRange, map[uintptr]singleSyscallRuleSet, *syscallProgram, label) error func (n *node) traverse(fn traverseFunc, kr knownRange, syscallMap map[uintptr]singleSyscallRuleSet, program *syscallProgram, searchFailed label) error { if n == nil { return nil } if err := fn(n, kr, syscallMap, program, searchFailed); err != nil { return err } if err := n.left.traverse( fn, kr.withUpperBoundExclusive(int(n.value)), syscallMap, program, searchFailed, ); err != nil { return err } return n.right.traverse( fn, kr.withLowerBoundExclusive(int(n.value)), syscallMap, program, searchFailed, ) } // DataAsBPFInput converts a linux.SeccompData to a bpf.Input. // It uses `buf` as scratch buffer; this buffer must be wide enough // to accommodate a mashaled version of `d`. func DataAsBPFInput(d *linux.SeccompData, buf []byte) bpf.Input { if len(buf) < d.SizeBytes() { panic(fmt.Sprintf("buffer must be at least %d bytes long", d.SizeBytes())) } d.MarshalUnsafe(buf) return buf[:d.SizeBytes()] } golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/seccomp_amd64.go000066400000000000000000000014051465435605700240300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package seccomp import ( "gvisor.dev/gvisor/pkg/abi/linux" ) const ( LINUX_AUDIT_ARCH = linux.AUDIT_ARCH_X86_64 SYS_SECCOMP = 317 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/seccomp_amd64_state_autogen.go000066400000000000000000000001331465435605700267470ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 // +build amd64 package seccomp golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/seccomp_arm64.go000066400000000000000000000014061465435605700240470ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package seccomp import ( "gvisor.dev/gvisor/pkg/abi/linux" ) const ( LINUX_AUDIT_ARCH = linux.AUDIT_ARCH_AARCH64 SYS_SECCOMP = 277 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/seccomp_arm64_state_autogen.go000066400000000000000000000001331465435605700267650ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package seccomp golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/seccomp_fuzz_helpers.go000066400000000000000000000147661465435605700256530ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package seccomp // This file contains helpers to generate fuzz tests for seccomp rules. // It contains the `InterestingValues` implementations for all matchers, // and a helper function to generate test cases based on `RuleSet`s. import ( "sort" "gvisor.dev/gvisor/pkg/abi/linux" ) // UsefulTestCases returns a best-effort list of test cases that may be // useful in fuzzing this set of rules. func (sr SyscallRules) UsefulTestCases() []linux.SeccompData { var testCases []linux.SeccompData for sysno, r := range sr.rules { // valueMatchers maps argument indexes to value matchers // seen for that argument index. // valueMatchersRepr tracks the `Repr()` of those // `ValueMatcher`s in order to avoid inserting duplicates. valueMatchers := make(map[int][]ValueMatcher) valueMatchersRepr := make(map[int]map[string]struct{}) // Find all unique `ValueMatcher`s for each argument. var processRule func(SyscallRule) SyscallRule processRule = func(r SyscallRule) SyscallRule { r.Recurse(processRule) pa, isPerArg := r.(PerArg) if !isPerArg { return r } for argNum, arg := range pa { if arg == nil { arg = AnyValue{} } valueMatchersReprMap, ok := valueMatchersRepr[argNum] if !ok { valueMatchersReprMap = make(map[string]struct{}) valueMatchersRepr[argNum] = valueMatchersReprMap } repr := arg.Repr() if _, seen := valueMatchersReprMap[repr]; seen { continue } valueMatchersReprMap[repr] = struct{}{} valueMatchers[argNum] = append(valueMatchers[argNum], arg) } return r } processRule(r) // Now compute the combination of all interesting values for them. sysnoCases := []linux.SeccompData{{ Nr: int32(sysno), Arch: LINUX_AUDIT_ARCH, }} for argNum, vms := range valueMatchers { // Deduplicate interesting values across value matchers. 
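// (For instance, if two matchers on the same argument both report the value
// 4 as interesting, it is generated only once; the zero value is always
// included as a baseline.)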
interestingValuesMap := make(map[uint64]struct{}) interestingValuesMap[0] = struct{}{} // The zero value is always interesting. for _, vm := range vms { for _, interestingValue := range vm.InterestingValues() { interestingValuesMap[interestingValue] = struct{}{} } } // Convert to sorted slice of integers. interestingValues := make([]uint64, 0, len(interestingValuesMap)) for interestingValue := range interestingValuesMap { interestingValues = append(interestingValues, interestingValue) } sort.Slice(interestingValues, func(i, j int) bool { return interestingValues[i] < interestingValues[j] }) // Generate test cases. newSysnoCases := make([]linux.SeccompData, 0, len(sysnoCases)*len(interestingValues)) for _, sysnoCase := range sysnoCases { for _, interestingValue := range interestingValues { if argNum == RuleIP { sysnoCase.InstructionPointer = interestingValue } else { sysnoCase.Args[argNum] = interestingValue } newSysnoCases = append(newSysnoCases, sysnoCase) } } sysnoCases = newSysnoCases } testCases = append(testCases, sysnoCases...) } return testCases } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (halfAnyValue) InterestingValues() []uint32 { return []uint32{0} } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (heq halfEqualTo) InterestingValues() []uint32 { return []uint32{uint32(heq), uint32(heq + 1)} } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (hns halfNotSet) InterestingValues() []uint32 { return []uint32{uint32(hns), uint32(hns + 1)} } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (hmeq halfMaskedEqual) InterestingValues() []uint32 { return []uint32{uint32(hmeq.mask), uint32(hmeq.mask + 1)} } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (sm splitMatcher) InterestingValues() []uint64 { interestingHigh := sm.highMatcher.InterestingValues() interestingLow := sm.lowMatcher.InterestingValues() interesting := make([]uint64, 0, len(interestingHigh)*len(interestingLow)) for _, high := range interestingHigh { for _, low := range interestingLow { interesting = append(interesting, (uint64(high)<<32)|uint64(low)) } } return interesting } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (av AnyValue) InterestingValues() []uint64 { return []uint64{0} } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (eq EqualTo) InterestingValues() []uint64 { return eq.split().InterestingValues() } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (ne NotEqual) InterestingValues() []uint64 { return EqualTo(ne).InterestingValues() } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (gt GreaterThan) InterestingValues() []uint64 { return []uint64{ uint64(high32Bits(uintptr(gt))+1) << 32, uint64(high32Bits(uintptr(gt))-1) << 32, uint64(high32Bits(uintptr(gt))) << 32, (uint64(high32Bits(uintptr(gt))) << 32) + uint64(low32Bits(uintptr(gt))), (uint64(high32Bits(uintptr(gt))) << 32) + uint64(low32Bits(uintptr(gt))) + 1, (uint64(high32Bits(uintptr(gt))) << 32) + uint64(low32Bits(uintptr(gt))) - 1, } } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (ge GreaterThanOrEqual) InterestingValues() []uint64 { return GreaterThan(ge).InterestingValues() } // InterestingValues implements `halfValueMatcher.InterestingValues`. 
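// An assumed, illustrative expansion of the fuzz values defined above (not a
// test case from the original suite): EqualTo(0x100000002).InterestingValues()
// is the cartesian product of its per-half values, i.e. high halves {0x1, 0x2}
// combined with low halves {0x2, 0x3}, packed back into 64-bit integers.
var _ []uint64 = EqualTo(0x100000002).InterestingValues()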
func (lt LessThan) InterestingValues() []uint64 { return GreaterThan(lt).InterestingValues() } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (le LessThanOrEqual) InterestingValues() []uint64 { return GreaterThan(le).InterestingValues() } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (nnfd NonNegativeFD) InterestingValues() []uint64 { return nnfd.split().InterestingValues() } // InterestingValues implements `halfValueMatcher.InterestingValues`. func (me maskedEqual) InterestingValues() []uint64 { return me.split().InterestingValues() } golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/seccomp_optimizer.go000066400000000000000000000600451465435605700251440ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package seccomp import ( "fmt" "strings" ) // ruleOptimizerFunc is a function type that can optimize a SyscallRule. // It returns the updated SyscallRule, along with whether any modification // was made. type ruleOptimizerFunc func(SyscallRule) (SyscallRule, bool) // convertSingleCompoundRuleToThatRule replaces `Or` or `And` rules with a // single branch to just that branch. func convertSingleCompoundRuleToThatRule[T Or | And](rule SyscallRule) (SyscallRule, bool) { if tRule, isT := rule.(T); isT && len(tRule) == 1 { return tRule[0], true } return rule, false } // flattenCompoundRules turns compound rules (Or or And) embedded inside // compound rules of the same type into a flat rule of that type. func flattenCompoundRules[T Or | And](rule SyscallRule) (SyscallRule, bool) { tRule, isT := rule.(T) if !isT { return rule, false } anySubT := false for _, subRule := range tRule { if _, subIsT := subRule.(T); subIsT { anySubT = true break } } if !anySubT { return rule, false } var newRules []SyscallRule for _, subRule := range tRule { if subT, subIsT := subRule.(T); subIsT { newRules = append(newRules, subT...) } else { newRules = append(newRules, subRule) } } return SyscallRule(T(newRules)), true } // convertMatchAllOrXToMatchAll an Or rule that contains MatchAll to MatchAll. func convertMatchAllOrXToMatchAll(rule SyscallRule) (SyscallRule, bool) { orRule, isOr := rule.(Or) if !isOr { return rule, false } for _, subRule := range orRule { if _, subIsMatchAll := subRule.(MatchAll); subIsMatchAll { return MatchAll{}, true } } return orRule, false } // convertMatchAllAndXToX removes MatchAll clauses from And rules. func convertMatchAllAndXToX(rule SyscallRule) (SyscallRule, bool) { andRule, isAnd := rule.(And) if !isAnd { return rule, false } hasMatchAll := false for _, subRule := range andRule { if _, subIsMatchAll := subRule.(MatchAll); subIsMatchAll { hasMatchAll = true break } } if !hasMatchAll { return rule, false } var newRules []SyscallRule for _, subRule := range andRule { if _, subIsAny := subRule.(MatchAll); !subIsAny { newRules = append(newRules, subRule) } } if len(newRules) == 0 { // An `And` rule with zero rules inside is invalid. 
return MatchAll{}, true } return And(newRules), true } // nilInPerArgToAnyValue replaces `nil` values in `PerArg` rules with // `AnyValue`. This isn't really an optimization, but it simplifies the // logic of other `PerArg` optimizers to not have to handle the `nil` case // separately from the `AnyValue` case. func nilInPerArgToAnyValue(rule SyscallRule) (SyscallRule, bool) { perArg, isPerArg := rule.(PerArg) if !isPerArg { return rule, false } changed := false for argNum, valueMatcher := range perArg { if valueMatcher == nil { perArg[argNum] = AnyValue{} changed = true } } return perArg, changed } // convertUselessPerArgToMatchAll looks for `PerArg` rules that match // anything and replaces them with `MatchAll`. func convertUselessPerArgToMatchAll(rule SyscallRule) (SyscallRule, bool) { perArg, isPerArg := rule.(PerArg) if !isPerArg { return rule, false } for _, valueMatcher := range perArg { if _, isAnyValue := valueMatcher.(AnyValue); !isAnyValue { return rule, false } } return MatchAll{}, true } // signature returns a string signature of this `PerArg`. // This string can be used to identify the behavior of this `PerArg` rule. func (pa PerArg) signature() string { var sb strings.Builder for _, valueMatcher := range pa { repr := valueMatcher.Repr() if strings.ContainsRune(repr, ';') { panic(fmt.Sprintf("ValueMatcher %v (type %T) returned representation %q containing illegal character ';'", valueMatcher, valueMatcher, repr)) } sb.WriteString(repr) sb.WriteRune(';') } return sb.String() } // deduplicatePerArgs deduplicates PerArg rules with identical matchers. // This can happen during filter construction, when rules are added across // multiple files. func deduplicatePerArgs[T Or | And](rule SyscallRule) (SyscallRule, bool) { tRule, isT := rule.(T) if !isT || len(tRule) < 2 { return rule, false } knownPerArgs := make(map[string]struct{}, len(tRule)) newRules := make([]SyscallRule, 0, len(tRule)) changed := false for _, subRule := range tRule { subPerArg, subIsPerArg := subRule.(PerArg) if !subIsPerArg { newRules = append(newRules, subRule) continue } sig := subPerArg.signature() if _, isDupe := knownPerArgs[sig]; isDupe { changed = true continue } knownPerArgs[sig] = struct{}{} newRules = append(newRules, subPerArg) } if !changed { return rule, false } return SyscallRule(T(newRules)), true } // splitMatchers replaces every `splittableValueMatcher` with a // `splitMatcher` value matcher instead. // This enables optimizations that are split-aware to run without // the need to have logic handling this conversion. func splitMatchers(rule SyscallRule) (SyscallRule, bool) { perArg, isPerArg := rule.(PerArg) if !isPerArg { return rule, false } changed := false for argNum, valueMatcher := range perArg { if _, isAlreadySplit := valueMatcher.(splitMatcher); isAlreadySplit { continue } splittableMatcher, isSplittableMatcher := valueMatcher.(splittableValueMatcher) if !isSplittableMatcher { continue } perArg[argNum] = splittableMatcher.split() changed = true } return perArg, changed } // simplifyHalfValueMatcher may convert a `halfValueMatcher` to a simpler // (and potentially faster) representation. 
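// optimizerDedupSketch is a hypothetical example (a sketch, not a rule taken
// from gVisor's real filters) of the passes above working together: both
// branches of the Or share the same PerArg signature, so deduplicatePerArgs
// drops one of them, and convertSingleCompoundRuleToThatRule then unwraps the
// remaining single-branch Or. The exact final shape also depends on the later
// passes (nil filling and matcher splitting) assembled in optimizeSyscallRule.
func optimizerDedupSketch() SyscallRule {
	redundant := Or{
		PerArg{EqualTo(0xff), AnyValue{}},
		PerArg{EqualTo(0xff), AnyValue{}},
	}
	// After optimization this behaves like the single branch
	// PerArg{EqualTo(0xff), AnyValue{}, ...}.
	return optimizeSyscallRule(redundant)
}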
func simplifyHalfValueMatcher(hvm halfValueMatcher) halfValueMatcher { switch v := hvm.(type) { case halfNotSet: if v == 0 { return halfAnyValue{} } case halfMaskedEqual: switch { case v.mask == 0 && v.value == 0: return halfAnyValue{} case v.mask == 0xffffffff: return halfEqualTo(v.value) case v.value == 0: return halfNotSet(v.mask) } } return hvm } // simplifyHalfValueMatchers replace `halfValueMatcher`s with their simplified // version. func simplifyHalfValueMatchers(rule SyscallRule) (SyscallRule, bool) { perArg, isPerArg := rule.(PerArg) if !isPerArg { return rule, false } changed := false for i, valueMatcher := range perArg { sm, isSplitMatcher := valueMatcher.(splitMatcher) if !isSplitMatcher { continue } if newHigh := simplifyHalfValueMatcher(sm.highMatcher); newHigh.Repr() != sm.highMatcher.Repr() { sm.highMatcher = newHigh perArg[i] = sm changed = true } if newLow := simplifyHalfValueMatcher(sm.lowMatcher); newLow.Repr() != sm.lowMatcher.Repr() { sm.lowMatcher = newLow perArg[i] = sm changed = true } } return perArg, changed } // anySplitMatchersToAnyValue converts `splitMatcher`s where both halves // match any value to a single AnyValue{} rule. func anySplitMatchersToAnyValue(rule SyscallRule) (SyscallRule, bool) { perArg, isPerArg := rule.(PerArg) if !isPerArg { return rule, false } changed := false for argNum, valueMatcher := range perArg { sm, isSplitMatcher := valueMatcher.(splitMatcher) if !isSplitMatcher { continue } _, highIsAny := sm.highMatcher.(halfAnyValue) _, lowIsAny := sm.lowMatcher.(halfAnyValue) if highIsAny && lowIsAny { perArg[argNum] = AnyValue{} changed = true } } return perArg, changed } // invalidValueMatcher is a stand-in `ValueMatcher` with a unique // representation that doesn't look like any legitimate `ValueMatcher`. // Calling any method other than `Repr` will panic. // It is used as an intermediate step for some optimizers. type invalidValueMatcher struct { ValueMatcher } // Repr implements `ValueMatcher.Repr`. func (invalidValueMatcher) Repr() string { return "invalidValueMatcher" } // invalidHalfValueMatcher is a stand-in `HalfValueMatcher` with a unique // representation that doesn't look like any legitimate `HalfValueMatcher`. // Calling any method other than `Repr` will panic. // It is used as an intermediate step for some optimizers. type invalidHalfValueMatcher struct { halfValueMatcher } // Repr implements `HalfValueMatcher.Repr`. func (invalidHalfValueMatcher) Repr() string { return "invalidHalfValueMatcher" } // sameStringSet returns whether the given string sets are equal. func sameStringSet(m1, m2 map[string]struct{}) bool { if len(m1) != len(m2) { return false } for k := range m1 { if _, found := m2[k]; !found { return false } } return true } // extractRepeatedMatchers looks for common argument matchers that are // repeated across all combinations of *other* argument matchers in branches // of an `Or` rule that contains only `PerArg` rules. // It removes them from these `PerArg` rules, creates an `Or` of the // matchers that are repeated across all combinations, and `And`s that // rule to the rewritten `Or` rule. 
// In other words (simplifying `PerArg` to 4 items for simplicity): // // Or{ // PerArg{A1, B1, C1, D}, // PerArg{A2, B1, C1, D}, // PerArg{A1, B2, C2, D}, // PerArg{A2, B2, C2, D}, // PerArg{A1, B3, C3, D}, // PerArg{A2, B3, C3, D}, // } // // becomes (after one pass): // // And{ // Or{ // # Note: These will get deduplicated by deduplicatePerArgs // PerArg{A1, AnyValue{}, AnyValue{}, AnyValue{}}, // PerArg{A2, AnyValue{}, AnyValue{}, AnyValue{}}, // PerArg{A1, AnyValue{}, AnyValue{}, AnyValue{}}, // PerArg{A2, AnyValue{}, AnyValue{}, AnyValue{}}, // PerArg{A1, AnyValue{}, AnyValue{}, AnyValue{}}, // PerArg{A2, AnyValue{}, AnyValue{}, AnyValue{}}, // }, // Or{ // # Note: These will also get deduplicated by deduplicatePerArgs // PerArg{AnyValue{}, B1, C1, D}, // PerArg{AnyValue{}, B1, C1, D}, // PerArg{AnyValue{}, B2, C2, D}, // PerArg{AnyValue{}, B2, C2, D}, // PerArg{AnyValue{}, B3, C3, D}, // PerArg{AnyValue{}, B3, C3, D}, // }, // } // // ... then, on the second pass (after deduplication), // the second inner `Or` rule gets recursively optimized to: // // And{ // Or{ // PerArg{A1, AnyValue{}, AnyValue{}, AnyValue{}}, // PerArg{A2, AnyValue{}, AnyValue{}, AnyValue{}}, // }, // And{ // Or{ // PerArg{AnyValue{}, AnyValue{}, AnyValue{}, D}, // PerArg{AnyValue{}, AnyValue{}, AnyValue{}, D}, // PerArg{AnyValue{}, AnyValue{}, AnyValue{}, D}, // }, // Or{ // PerArg{AnyValue{}, B1, C1, AnyValue{}}, // PerArg{AnyValue{}, B2, C2, AnyValue{}}, // PerArg{AnyValue{}, B3, C3, AnyValue{}}, // }, // }, // } // // ... which (after other optimizers clean this all up), finally becomes: // // And{ // Or{ // PerArg{A1, AnyValue{}, AnyValue{}, AnyValue{}}, // PerArg{A2, AnyValue{}, AnyValue{}, AnyValue{}}, // }, // PerArg{AnyValue{}, AnyValue{}, AnyValue{}, D}, // Or{ // PerArg{AnyValue{}, B1, C1, AnyValue{}}, // PerArg{AnyValue{}, B2, C2, AnyValue{}}, // PerArg{AnyValue{}, B3, C3, AnyValue{}}, // }, // } // // ... Turning 24 comparisons into just 9. func extractRepeatedMatchers(rule SyscallRule) (SyscallRule, bool) { orRule, isOr := rule.(Or) if !isOr || len(orRule) < 2 { return rule, false } for _, subRule := range orRule { if _, subIsPerArg := subRule.(PerArg); !subIsPerArg { return rule, false } } // extractData is the result of extracting a matcher at `argNum`. type extractData struct { // extractedMatcher is the extracted matcher that should be AND'd // with the rest. extractedMatcher ValueMatcher // otherMatchers represents the rest of the matchers after // `extractedMatcher` is extracted from a `PerArg`. // The matcher that was extracted should be replaced with something // that matches any value (i.e. either `AnyValue` or `halfAnyValue`). otherMatchers PerArg // otherMatchersSig represents the signature of other matchers, with // the extracted matcher being replaced with an "invalid" matcher. // The "invalid" matcher acts as a token that is equal across all // instances of `otherMatchersSig` for the other `PerArg` rules of the // `Or` expression. // `otherMatchersSig` isn't the same as `otherMatchers.Signature()`, // as `otherMatchers` does not contain this "invalid" matcher (it // contains a matcher that matches any value instead). otherMatchersSig string // extractedMatcherIsAnyValue is true iff `extractedMatcher` would // match any value thrown at it. // If this is the case across all branches of the `Or` expression, // the optimization is skipped. extractedMatcherIsAnyValue bool // otherMatchersAreAllAnyValue is true iff all matchers in // `otherMatchers` would match any value thrown at them. 
// If this is the case across all branches of the `Or` expression, // the optimization is skipped. otherMatchersAreAllAnyValue bool } allOtherMatchersSigs := make(map[string]struct{}, len(orRule)) argExprToOtherMatchersSigs := make(map[string]map[string]struct{}, len(orRule)) for argNum := 0; argNum < len(orRule[0].(PerArg)); argNum++ { // Check if `argNum` takes on a set of matchers common for all // combinations of all other matchers. // We try to extract a common matcher by three ways, which we // iterate over here. // Each of them returns the result of their extraction attempt, // along with a boolean representing whether extraction was // possible at all. // To "extract" a matcher means to replace it with an "invalid" // matcher in the PerArg expression, and checking if their set of // signatures is identical for each unique `Repr()` of the extracted // matcher. For splittable matcher, we try each half as well. // Conceptually (simplify PerArg to 3 arguments for simplicity), // if we have: // // Or{ // PerArg{A, B, C}, // PerArg{D, E, F}, // } // // ... then first, we will try: // // Or{ // PerArg{invalid, B, C} // PerArg{invalid, E, F} // } // // ... then, assuming both A and D are `splitMatcher`s: // we will try: // // Or{ // PerArg{splitMatcher{invalid, A.lowMatcher}, B, C} // PerArg{splitMatcher{invalid, D.lowMatcher}, E, F} // } // // ... and finally we will try: // // Or{ // PerArg{splitMatcher{A.highMatcher, invalid}, B, C} // PerArg{splitMatcher{D.highMatcher, invalid}, E, F} // } for _, extractFn := range []func(PerArg) (extractData, bool){ // Return whole ValueMatcher at a time: func(pa PerArg) (extractData, bool) { extractedMatcher := pa[argNum] _, extractedMatcherIsAnyValue := extractedMatcher.(AnyValue) otherMatchers := pa.clone() otherMatchers[argNum] = invalidValueMatcher{} otherMatchersSig := otherMatchers.signature() otherMatchers[argNum] = AnyValue{} otherMatchersAreAllAnyValue := true for _, valueMatcher := range otherMatchers { if _, isAnyValue := valueMatcher.(AnyValue); !isAnyValue { otherMatchersAreAllAnyValue = false break } } return extractData{ extractedMatcher: extractedMatcher, otherMatchers: otherMatchers, otherMatchersSig: otherMatchersSig, extractedMatcherIsAnyValue: extractedMatcherIsAnyValue, otherMatchersAreAllAnyValue: otherMatchersAreAllAnyValue, }, true }, // Extract a matcher for the high bits only: func(pa PerArg) (extractData, bool) { split, isSplit := pa[argNum].(splitMatcher) if !isSplit { return extractData{}, false } _, extractedMatcherIsAnyValue := split.highMatcher.(halfAnyValue) _, lowMatcherIsAnyValue := split.lowMatcher.(halfAnyValue) extractedMatcher := high32BitsMatch(split.highMatcher) otherMatchers := pa.clone() otherMatchers[argNum] = splitMatcher{ highMatcher: invalidHalfValueMatcher{}, lowMatcher: split.lowMatcher, } otherMatchersSig := otherMatchers.signature() otherMatchers[argNum] = low32BitsMatch(split.lowMatcher) otherMatchersAreAllAnyValue := lowMatcherIsAnyValue for i, valueMatcher := range otherMatchers { if i == argNum { continue } if _, isAnyValue := valueMatcher.(AnyValue); !isAnyValue { otherMatchersAreAllAnyValue = false break } } return extractData{ extractedMatcher: extractedMatcher, otherMatchers: otherMatchers, otherMatchersSig: otherMatchersSig, extractedMatcherIsAnyValue: extractedMatcherIsAnyValue, otherMatchersAreAllAnyValue: otherMatchersAreAllAnyValue, }, true }, // Extract a matcher for the low bits only: func(pa PerArg) (extractData, bool) { split, isSplit := pa[argNum].(splitMatcher) if !isSplit { return 
extractData{}, false } _, extractedMatcherIsAnyValue := split.lowMatcher.(halfAnyValue) _, highMatcherIsAnyValue := split.highMatcher.(halfAnyValue) extractedMatcher := low32BitsMatch(split.lowMatcher) otherMatchers := pa.clone() otherMatchers[argNum] = splitMatcher{ highMatcher: split.highMatcher, lowMatcher: invalidHalfValueMatcher{}, } otherMatchersSig := otherMatchers.signature() otherMatchers[argNum] = high32BitsMatch(split.highMatcher) otherMatchersAreAllAnyValue := highMatcherIsAnyValue for i, valueMatcher := range otherMatchers { if i == argNum { continue } if _, isAnyValue := valueMatcher.(AnyValue); !isAnyValue { otherMatchersAreAllAnyValue = false break } } return extractData{ extractedMatcher: extractedMatcher, otherMatchers: otherMatchers, otherMatchersSig: otherMatchersSig, extractedMatcherIsAnyValue: extractedMatcherIsAnyValue, otherMatchersAreAllAnyValue: otherMatchersAreAllAnyValue, }, true }, } { clear(allOtherMatchersSigs) clear(argExprToOtherMatchersSigs) allExtractable := true allArgNumMatchersAreAnyValue := true allOtherMatchersAreAnyValue := true for _, subRule := range orRule { ed, extractable := extractFn(subRule.(PerArg)) if allExtractable = allExtractable && extractable; !allExtractable { break } allArgNumMatchersAreAnyValue = allArgNumMatchersAreAnyValue && ed.extractedMatcherIsAnyValue allOtherMatchersAreAnyValue = allOtherMatchersAreAnyValue && ed.otherMatchersAreAllAnyValue repr := ed.extractedMatcher.Repr() allOtherMatchersSigs[ed.otherMatchersSig] = struct{}{} if _, reprSeen := argExprToOtherMatchersSigs[repr]; !reprSeen { argExprToOtherMatchersSigs[repr] = make(map[string]struct{}, len(orRule)) } argExprToOtherMatchersSigs[repr][ed.otherMatchersSig] = struct{}{} } if !allExtractable || allArgNumMatchersAreAnyValue || allOtherMatchersAreAnyValue { // Cannot optimize. continue } // Now check if each possible repr of `argNum` got the same set of // signatures for other matchers as `allOtherMatchersSigs`. sameOtherMatchers := true for _, omsigs := range argExprToOtherMatchersSigs { if !sameStringSet(omsigs, allOtherMatchersSigs) { sameOtherMatchers = false break } } if !sameOtherMatchers { continue } // We can simplify the rule by extracting `argNum` out. // Create two copies of `orRule`: One with only `argNum`, // and the other one with all arguments except `argNum`. // This will likely contain many duplicates but that's OK, // they'll be optimized out by `deduplicatePerArgs`. argNumMatch := Or(make([]SyscallRule, len(orRule))) otherArgsMatch := Or(make([]SyscallRule, len(orRule))) for i, subRule := range orRule { ed, _ := extractFn(subRule.(PerArg)) onlyArg := PerArg{AnyValue{}, AnyValue{}, AnyValue{}, AnyValue{}, AnyValue{}, AnyValue{}, AnyValue{}} onlyArg[argNum] = ed.extractedMatcher argNumMatch[i] = onlyArg otherArgsMatch[i] = ed.otherMatchers } // Attempt to optimize the "other" arguments: otherArgsMatchOpt, _ := extractRepeatedMatchers(otherArgsMatch) return And{argNumMatch, otherArgsMatchOpt}, true } } return rule, false } // optimizationRun is a stateful struct tracking the state of an optimization // over a rule. It may not be used concurrently. type optimizationRun struct { // funcs is the list of optimizer functions to run on the rules. // Optimizers should be ranked in order of importance, with the most // important first. // An optimizer will be exhausted before the next one is ever run. // Earlier optimizers are re-exhausted if later optimizers cause change. 
funcs []ruleOptimizerFunc // recurseFuncs is a list of closures that correspond one-to-one to `funcs` // and are suitable for passing to `SyscallRule.Recurse`. They are stored // here in order to be allocated once, as opposed to escaping if they were // specified directly as argument to `SyscallRule.Recurse`. recurseFuncs []func(subRule SyscallRule) SyscallRule // changed tracks whether any change has been made in the current pass. // It is updated as the optimizer runs. changed bool } // apply recursively applies `opt.funcs[funcIndex]` to the given `rule`. // It sets `opt.changed` to true if there has been any change. func (opt *optimizationRun) apply(rule SyscallRule, funcIndex int) SyscallRule { rule.Recurse(opt.recurseFuncs[funcIndex]) if opt.changed { return rule } rule, opt.changed = opt.funcs[funcIndex](rule) return rule } // optimize losslessly optimizes a SyscallRule using the `optimizationRun`'s // optimizer functions. // It may not be called concurrently. func (opt *optimizationRun) optimize(rule SyscallRule) SyscallRule { opt.recurseFuncs = make([]func(SyscallRule) SyscallRule, len(opt.funcs)) for i := range opt.funcs { funcIndex := i opt.recurseFuncs[funcIndex] = func(subRule SyscallRule) SyscallRule { return opt.apply(subRule, funcIndex) } } for opt.changed = true; opt.changed; { for i := range opt.funcs { opt.changed = false rule = opt.apply(rule, i) if opt.changed { break } } } return rule } // optimizeSyscallRule losslessly optimizes a `SyscallRule`. func optimizeSyscallRule(rule SyscallRule) SyscallRule { return (&optimizationRun{ funcs: []ruleOptimizerFunc{ // Convert Or / And rules with a single rule into that single rule. convertSingleCompoundRuleToThatRule[Or], convertSingleCompoundRuleToThatRule[And], // Flatten Or/And rules. flattenCompoundRules[Or], flattenCompoundRules[And], // Handle MatchAll. This is best done after flattening so that we // effectively traverse the whole tree to find a MatchAll by just // linearly scanning through the first (and only) level of rules. convertMatchAllOrXToMatchAll, convertMatchAllAndXToX, // Replace all `nil` values in `PerArg` to `AnyValue`, to simplify // the `PerArg` matchers below. nilInPerArgToAnyValue, // Deduplicate redundant `PerArg`s in Or and And. // This must come after `nilInPerArgToAnyValue` because it does not // handle the nil case. deduplicatePerArgs[Or], deduplicatePerArgs[And], // Remove useless `PerArg` matchers. // This must come after `nilInPerArgToAnyValue` because it does not // handle the nil case. convertUselessPerArgToMatchAll, // Replace `ValueMatcher`s that are splittable into their split version. // Like `nilInPerArgToAnyValue`, this isn't so much an optimization, // but allows the matchers below (which are `splitMatcher`-aware) to not // have to carry logic to split the matchers they encounter. splitMatchers, // Replace `halfValueMatcher`s with their simplified version. simplifyHalfValueMatchers, // Replace `splitMatchers` that match any value with `AnyValue`. anySplitMatchersToAnyValue, // Extract repeated argument matchers out of `Or` expressions. // This must come after `nilInPerArgToAnyValue` because it does not // handle the nil case. // This should ideally run late in the list because it does a bunch // of memory allocations (even in the non-optimizable case), which // should be avoided unless there is nothing else left to optimize. 
extractRepeatedMatchers, }, }).optimize(rule) } golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/seccomp_rules.go000066400000000000000000000761211465435605700242560ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package seccomp import ( "fmt" "sort" "strings" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/bpf" ) // The offsets are based on the following struct in include/linux/seccomp.h. // // struct seccomp_data { // int nr; // __u32 arch; // __u64 instruction_pointer; // __u64 args[6]; // }; const ( seccompDataOffsetNR = 0 seccompDataOffsetArch = 4 seccompDataOffsetIPLow = 8 seccompDataOffsetIPHigh = 12 seccompDataOffsetArgs = 16 ) func seccompDataOffsetArgLow(i int) uint32 { return uint32(seccompDataOffsetArgs + i*8) } func seccompDataOffsetArgHigh(i int) uint32 { return seccompDataOffsetArgLow(i) + 4 } // ValueMatcher verifies a numerical value, typically a syscall argument // or RIP value. type ValueMatcher interface { // String returns a human-readable representation of the match rule. // If the returned string contains "VAL", it will be replaced with // the symbolic name of the value being matched against. String() string // Repr returns a string that will be used for asserting equality between // two `ValueMatcher` instances. It must therefore be unique to the // `ValueMatcher` implementation and to its parameters. // It must not contain the character ";". Repr() string // Render should add rules to the given program that verify the value // loadable from `value` matches this rule or not. // The rules should indicate this by either jumping to `labelSet.Matched()` // or `labelSet.Mismatched()`. They may not fall through. Render(program *syscallProgram, labelSet *labelSet, value matchedValue) // InterestingValues returns a list of values that may be interesting to // test this `ValueMatcher` against. InterestingValues() []uint64 } // halfValueMatcher verifies a 32-bit value. type halfValueMatcher interface { // String returns a human-friendly representation of the check being done // against the 32-bit value. // The string "x.(high|low) {{halfValueMatcher.String()}}" should read well, // e.g. "x.low == 0xffff". String() string // Repr returns a string that will be used for asserting equality between // two `halfValueMatcher` instances. It must therefore be unique to the // `halfValueMatcher` implementation and to its parameters. // It must not contain the character ";". Repr() string // HalfRender should add rules to the given program that verify the value // loaded into the "A" register matches this 32-bit value or not. // The rules should indicate this by either jumping to `labelSet.Matched()` // or `labelSet.Mismatched()`. They may not fall through. HalfRender(program *syscallProgram, labelSet *labelSet) // InterestingValues returns a list of values that may be interesting to // test this `halfValueMatcher` against. InterestingValues() []uint32 } // halfAnyValue implements `halfValueMatcher` and matches any value. 
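// A small worked example of the offset math above (values assumed for
// illustration): syscall argument 1 starts at byte 16+8*1 = 24 of
// seccomp_data, with its low 32 bits at offset 24 and its high 32 bits
// 4 bytes later at offset 28.
var _ = [2]uint32{
	seccompDataOffsetArgLow(1),  // == 24
	seccompDataOffsetArgHigh(1), // == 28
}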
type halfAnyValue struct{} // String implements `halfValueMatcher.String`. func (halfAnyValue) String() string { return "== *" } // Repr implements `halfValueMatcher.Repr`. func (halfAnyValue) Repr() string { return "halfAnyValue" } // HalfRender implements `halfValueMatcher.HalfRender`. func (halfAnyValue) HalfRender(program *syscallProgram, labelSet *labelSet) { program.JumpTo(labelSet.Matched()) } // halfEqualTo implements `halfValueMatcher` and matches a specific 32-bit value. type halfEqualTo uint32 // String implements `halfValueMatcher.String`. func (heq halfEqualTo) String() string { if heq == 0 { return "== 0" } return fmt.Sprintf("== %#x", uint32(heq)) } // Repr implements `halfValueMatcher.Repr`. func (heq halfEqualTo) Repr() string { return fmt.Sprintf("halfEq(%#x)", uint32(heq)) } // HalfRender implements `halfValueMatcher.HalfRender`. func (heq halfEqualTo) HalfRender(program *syscallProgram, labelSet *labelSet) { program.If(bpf.Jmp|bpf.Jeq|bpf.K, uint32(heq), labelSet.Matched()) program.JumpTo(labelSet.Mismatched()) } // halfNotSet implements `halfValueMatcher` and matches using the "set" // bitwise operation. type halfNotSet uint32 // String implements `halfValueMatcher.String`. func (hns halfNotSet) String() string { return fmt.Sprintf("& %#x == 0", uint32(hns)) } // Repr implements `halfValueMatcher.Repr`. func (hns halfNotSet) Repr() string { return fmt.Sprintf("halfNotSet(%#x)", uint32(hns)) } // HalfRender implements `halfValueMatcher.HalfRender`. func (hns halfNotSet) HalfRender(program *syscallProgram, labelSet *labelSet) { program.If(bpf.Jmp|bpf.Jset|bpf.K, uint32(hns), labelSet.Mismatched()) program.JumpTo(labelSet.Matched()) } // halfMaskedEqual implements `halfValueMatcher` and verifies that the value // is equal after applying a bit mask. type halfMaskedEqual struct { mask uint32 value uint32 } // String implements `halfValueMatcher.String`. func (hmeq halfMaskedEqual) String() string { if hmeq.value == 0 { return fmt.Sprintf("& %#x == 0", hmeq.mask) } return fmt.Sprintf("& %#x == %#x", hmeq.mask, hmeq.value) } // Repr implements `halfValueMatcher.Repr`. func (hmeq halfMaskedEqual) Repr() string { return fmt.Sprintf("halfMaskedEqual(%#x, %#x)", hmeq.mask, hmeq.value) } // HalfRender implements `halfValueMatcher.HalfRender`. func (hmeq halfMaskedEqual) HalfRender(program *syscallProgram, labelSet *labelSet) { program.Stmt(bpf.Alu|bpf.And|bpf.K, hmeq.mask) program.IfNot(bpf.Jmp|bpf.Jeq|bpf.K, hmeq.value, labelSet.Mismatched()) program.JumpTo(labelSet.Matched()) } // splitMatcher implements `ValueMatcher` and verifies each half of the 64-bit // value independently (with AND semantics). // It implements `ValueMatcher`, but is never used directly in seccomp filter // rules. Rather, it acts as an intermediate representation for the rules that // can be expressed as an AND of two 32-bit values. type splitMatcher struct { // repr is the `Repr()` of the original `ValueMatcher` (pre-split). repr string // highMatcher is the half-value matcher to verify the high 32 bits. highMatcher halfValueMatcher // lowMatcher is the half-value matcher to verify the low 32 bits. lowMatcher halfValueMatcher } // String implements `ValueMatcher.String`. 
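// An illustrative equivalence for the half matchers above (the mask value is
// assumed): checking "half & 0x4 == 0" can be written either way below; both
// jump to Matched only when bit 2 of the loaded 32-bit half is clear.
// halfNotSet renders to a single Jset test, halfMaskedEqual to an And followed
// by Jeq, which is why simplifyHalfValueMatcher rewrites the latter into the
// former.
var (
	_ halfValueMatcher = halfNotSet(0x4)
	_ halfValueMatcher = halfMaskedEqual{mask: 0x4, value: 0}
)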
func (sm splitMatcher) String() string { if sm.repr == "" { _, highIsAnyValue := sm.highMatcher.(halfAnyValue) _, lowIsAnyValue := sm.lowMatcher.(halfAnyValue) if highIsAnyValue && lowIsAnyValue { return "== *" } if highIsAnyValue { return fmt.Sprintf("VAL.low %s", sm.lowMatcher.String()) } if lowIsAnyValue { return fmt.Sprintf("VAL.high %s", sm.highMatcher.String()) } return fmt.Sprintf("(VAL.high %s && VAL.low %s)", sm.highMatcher.String(), sm.lowMatcher.String()) } return sm.repr } // Repr implements `ValueMatcher.Repr`. func (sm splitMatcher) Repr() string { if sm.repr == "" { _, highIsAnyValue := sm.highMatcher.(halfAnyValue) _, lowIsAnyValue := sm.lowMatcher.(halfAnyValue) if highIsAnyValue && lowIsAnyValue { return "split(*)" } if highIsAnyValue { return fmt.Sprintf("low=%s", sm.lowMatcher.Repr()) } if lowIsAnyValue { return fmt.Sprintf("high=%s", sm.highMatcher.Repr()) } return fmt.Sprintf("(high=%s && low=%s)", sm.highMatcher.Repr(), sm.lowMatcher.Repr()) } return sm.repr } // Render implements `ValueMatcher.Render`. func (sm splitMatcher) Render(program *syscallProgram, labelSet *labelSet, value matchedValue) { _, highIsAny := sm.highMatcher.(halfAnyValue) _, lowIsAny := sm.lowMatcher.(halfAnyValue) if highIsAny && lowIsAny { program.JumpTo(labelSet.Matched()) return } if highIsAny { value.LoadLow32Bits() sm.lowMatcher.HalfRender(program, labelSet) return } if lowIsAny { value.LoadHigh32Bits() sm.highMatcher.HalfRender(program, labelSet) return } // We render the "low" bits first on the assumption that most syscall // arguments fit within 32-bits, and those rules actually only care // about the value of the low 32 bits. This way, we only check the // high 32 bits if the low 32 bits have already matched. lowLabels := labelSet.Push("low", labelSet.NewLabel(), labelSet.Mismatched()) lowFrag := program.Record() value.LoadLow32Bits() sm.lowMatcher.HalfRender(program, lowLabels) lowFrag.MustHaveJumpedTo(lowLabels.Matched(), labelSet.Mismatched()) program.Label(lowLabels.Matched()) highFrag := program.Record() value.LoadHigh32Bits() sm.highMatcher.HalfRender(program, labelSet.Push("high", labelSet.Matched(), labelSet.Mismatched())) highFrag.MustHaveJumpedTo(labelSet.Matched(), labelSet.Mismatched()) } // high32BitsMatch returns a `splitMatcher` that only matches the high 32 bits // of a 64-bit value. func high32BitsMatch(hvm halfValueMatcher) splitMatcher { return splitMatcher{ highMatcher: hvm, lowMatcher: halfAnyValue{}, } } // low32BitsMatch returns a `splitMatcher` that only matches the low 32 bits // of a 64-bit value. func low32BitsMatch(hvm halfValueMatcher) splitMatcher { return splitMatcher{ highMatcher: halfAnyValue{}, lowMatcher: hvm, } } // splittableValueMatcher should be implemented by `ValueMatcher` that can // be expressed as a `splitMatcher`. type splittableValueMatcher interface { // split converts this `ValueMatcher` into a `splitMatcher`. split() splitMatcher } // renderSplittable is a helper function for the `ValueMatcher.Render` // implementation of `splittableValueMatcher`s. func renderSplittable(sm splittableValueMatcher, program *syscallProgram, labelSet *labelSet, value matchedValue) { sm.split().Render(program, labelSet, value) } // high32Bits returns the higher 32-bits of the given value. func high32Bits(val uintptr) uint32 { return uint32(val >> 32) } // low32Bits returns the lower 32-bits of the given value. func low32Bits(val uintptr) uint32 { return uint32(val) } // AnyValue is marker to indicate any value will be accepted. // It implements ValueMatcher. 
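// An assumed, illustrative example of the splitting machinery above:
// EqualTo(0x100000002) (see its split implementation below) becomes a
// splitMatcher that requires the high 32 bits to equal 0x1 AND the low
// 32 bits to equal 0x2. Per Render above, the low half is tested first, so
// the high half is only loaded once the low half has already matched.
var _ splitMatcher = EqualTo(0x100000002).split()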
type AnyValue struct{} // String implements `ValueMatcher.String`. func (AnyValue) String() string { return "== *" } // Repr implements `ValueMatcher.Repr`. func (av AnyValue) Repr() string { return av.String() } // Render implements `ValueMatcher.Render`. func (av AnyValue) Render(program *syscallProgram, labelSet *labelSet, value matchedValue) { program.JumpTo(labelSet.Matched()) } // EqualTo specifies a value that needs to be strictly matched. // It implements ValueMatcher. type EqualTo uintptr // String implements `ValueMatcher.String`. func (eq EqualTo) String() string { if eq == 0 { return "== 0" } return fmt.Sprintf("== %#x", uintptr(eq)) } // Repr implements `ValueMatcher.Repr`. func (eq EqualTo) Repr() string { return eq.String() } // Render implements `ValueMatcher.Render`. func (eq EqualTo) Render(program *syscallProgram, labelSet *labelSet, value matchedValue) { renderSplittable(eq, program, labelSet, value) } // split implements `splittableValueMatcher.split`. func (eq EqualTo) split() splitMatcher { return splitMatcher{ repr: eq.Repr(), highMatcher: halfEqualTo(high32Bits(uintptr(eq))), lowMatcher: halfEqualTo(low32Bits(uintptr(eq))), } } // NotEqual specifies a value that is strictly not equal. type NotEqual uintptr // String implements `ValueMatcher.String`. func (ne NotEqual) String() string { return fmt.Sprintf("!= %#x", uintptr(ne)) } // Repr implements `ValueMatcher.Repr`. func (ne NotEqual) Repr() string { return ne.String() } // Render implements `ValueMatcher.Render`. func (ne NotEqual) Render(program *syscallProgram, labelSet *labelSet, value matchedValue) { // Note that `NotEqual` is *not* a splittable rule by itself, because it is not the // conjunction of two `halfValueMatchers` (it is the *disjunction* of them). // However, it is also the exact inverse of `EqualTo`. // Therefore, we can use `EqualTo` here, and simply invert the // matched/mismatched labels. EqualTo(ne).Render(program, labelSet.Push("inverted", labelSet.Mismatched(), labelSet.Matched()), value) } // GreaterThan specifies a value that needs to be strictly smaller. type GreaterThan uintptr // String implements `ValueMatcher.String`. func (gt GreaterThan) String() string { return fmt.Sprintf("> %#x", uintptr(gt)) } // Repr implements `ValueMatcher.Repr`. func (gt GreaterThan) Repr() string { return gt.String() } // Render implements `ValueMatcher.Render`. func (gt GreaterThan) Render(program *syscallProgram, labelSet *labelSet, value matchedValue) { high := high32Bits(uintptr(gt)) // Assert the higher 32bits are greater than or equal. // arg_high >= high ? continue : violation (arg_high < high) value.LoadHigh32Bits() program.IfNot(bpf.Jmp|bpf.Jge|bpf.K, high, labelSet.Mismatched()) // arg_high == high ? continue : success (arg_high > high) program.IfNot(bpf.Jmp|bpf.Jeq|bpf.K, high, labelSet.Matched()) // Assert that the lower 32bits are greater. // arg_low > low ? continue/success : violation (arg_high == high and arg_low <= low) value.LoadLow32Bits() program.IfNot(bpf.Jmp|bpf.Jgt|bpf.K, low32Bits(uintptr(gt)), labelSet.Mismatched()) program.JumpTo(labelSet.Matched()) } // GreaterThanOrEqual specifies a value that needs to be smaller or equal. type GreaterThanOrEqual uintptr // String implements `ValueMatcher.String`. func (ge GreaterThanOrEqual) String() string { return fmt.Sprintf(">= %#x", uintptr(ge)) } // Repr implements `ValueMatcher.Repr`. func (ge GreaterThanOrEqual) Repr() string { return ge.String() } // Render implements `ValueMatcher.Render`. 
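// Worked example (bound value assumed) of the 64-bit comparison ladder used by
// GreaterThan above and by GreaterThanOrEqual below, for a bound of
// 0x1_00000005 against an argument X:
//
//	high32(X) <  0x1 -> mismatch
//	high32(X) != 0x1 -> match (it must then be greater than 0x1)
//	otherwise        -> compare low32(X) against 0x5
//	                    (">" for GreaterThan, ">=" for GreaterThanOrEqual)
//
// So 0x1_00000006 and 0x2_00000000 satisfy GreaterThan(0x100000005), while
// 0x1_00000005 satisfies only GreaterThanOrEqual(0x100000005).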
func (ge GreaterThanOrEqual) Render(program *syscallProgram, labelSet *labelSet, value matchedValue) { high := high32Bits(uintptr(ge)) // Assert the higher 32bits are greater than or equal. // arg_high >= high ? continue : violation (arg_high < high) value.LoadHigh32Bits() program.IfNot(bpf.Jmp|bpf.Jge|bpf.K, high, labelSet.Mismatched()) // arg_high == high ? continue : success (arg_high > high) program.IfNot(bpf.Jmp|bpf.Jeq|bpf.K, high, labelSet.Matched()) // Assert that the lower 32bits are greater or equal (assuming the // higher bits are equal). // arg_low >= low ? continue/success : violation (arg_high == high and arg_low < low) value.LoadLow32Bits() program.IfNot(bpf.Jmp|bpf.Jge|bpf.K, low32Bits(uintptr(ge)), labelSet.Mismatched()) program.JumpTo(labelSet.Matched()) } // LessThan specifies a value that needs to be strictly greater. type LessThan uintptr // String implements `ValueMatcher.String`. func (lt LessThan) String() string { return fmt.Sprintf("< %#x", uintptr(lt)) } // Repr implements `ValueMatcher.Repr`. func (lt LessThan) Repr() string { return lt.String() } // Render implements `ValueMatcher.Render`. func (lt LessThan) Render(program *syscallProgram, labelSet *labelSet, value matchedValue) { high := high32Bits(uintptr(lt)) // Assert the higher 32bits are less than or equal. // arg_high > high ? violation : continue value.LoadHigh32Bits() program.If(bpf.Jmp|bpf.Jgt|bpf.K, high, labelSet.Mismatched()) // arg_high == high ? continue : success (arg_high < high) program.IfNot(bpf.Jmp|bpf.Jeq|bpf.K, high, labelSet.Matched()) // Assert that the lower 32bits are less (assuming the // higher bits are equal). // arg_low >= low ? violation : continue value.LoadLow32Bits() program.If(bpf.Jmp|bpf.Jge|bpf.K, low32Bits(uintptr(lt)), labelSet.Mismatched()) program.JumpTo(labelSet.Matched()) } // LessThanOrEqual specifies a value that needs to be greater or equal. type LessThanOrEqual uintptr // String implements `ValueMatcher.String`. func (le LessThanOrEqual) String() string { return fmt.Sprintf("<= %#x", uintptr(le)) } // Repr implements `ValueMatcher.Repr`. func (le LessThanOrEqual) Repr() string { return le.String() } // Render implements `ValueMatcher.Render`. func (le LessThanOrEqual) Render(program *syscallProgram, labelSet *labelSet, value matchedValue) { high := high32Bits(uintptr(le)) // Assert the higher 32bits are less than or equal. // assert arg_high > high ? violation : continue value.LoadHigh32Bits() program.If(bpf.Jmp|bpf.Jgt|bpf.K, high, labelSet.Mismatched()) // arg_high == high ? continue : success program.IfNot(bpf.Jmp|bpf.Jeq|bpf.K, high, labelSet.Matched()) // Assert the lower bits are less than or equal (assuming // the higher bits are equal). // arg_low > low ? violation : success value.LoadLow32Bits() program.If(bpf.Jmp|bpf.Jgt|bpf.K, low32Bits(uintptr(le)), labelSet.Mismatched()) program.JumpTo(labelSet.Matched()) } // NonNegativeFD ensures that an FD argument is a non-negative int32. type NonNegativeFD struct{} // String implements `ValueMatcher.String`. func (NonNegativeFD) String() string { return "is non-negative FD" } // Repr implements `ValueMatcher.Repr`. func (NonNegativeFD) Repr() string { return "NonNegativeFD" } // Render implements `ValueMatcher.Render`. func (nnfd NonNegativeFD) Render(program *syscallProgram, labelSet *labelSet, value matchedValue) { renderSplittable(nnfd, program, labelSet, value) } // split implements `splittableValueMatcher.split`. 
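// An assumed, illustrative restatement of NonNegativeFD above: it accepts
// exactly the values whose upper 33 bits are all zero (i.e. a non-negative
// int32 zero-extended to 64 bits), which is also what this masked check
// expresses.
var _ ValueMatcher = MaskedEqual(^uintptr(0x7fffffff), 0)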
func (nnfd NonNegativeFD) split() splitMatcher { return splitMatcher{ repr: nnfd.Repr(), // FDs are 32 bits, so the high 32 bits must all be zero. // Negative int32 has the MSB (31st bit) set. // So the low 32bits of the FD value must not have the 31st bit set. highMatcher: halfEqualTo(0), lowMatcher: halfNotSet(1 << 31), } } // MaskedEqual specifies a value that matches the input after the input is // masked (bitwise &) against the given mask. It implements `ValueMatcher`. type maskedEqual struct { mask uintptr value uintptr } // String implements `ValueMatcher.String`. func (me maskedEqual) String() string { return fmt.Sprintf("& %#x == %#x", me.mask, me.value) } // Repr implements `ValueMatcher.Repr`. func (me maskedEqual) Repr() string { return me.String() } // Render implements `ValueMatcher.Render`. func (me maskedEqual) Render(program *syscallProgram, labelSet *labelSet, value matchedValue) { renderSplittable(me, program, labelSet, value) } // split implements `splittableValueMatcher.Split`. func (me maskedEqual) split() splitMatcher { return splitMatcher{ repr: me.Repr(), highMatcher: halfMaskedEqual{high32Bits(me.mask), high32Bits(me.value)}, lowMatcher: halfMaskedEqual{low32Bits(me.mask), low32Bits(me.value)}, } } // MaskedEqual specifies a value that matches the input after the input is // masked (bitwise &) against the given mask. Can be used to verify that input // only includes certain approved flags. func MaskedEqual(mask, value uintptr) ValueMatcher { return maskedEqual{ mask: mask, value: value, } } // BitsAllowlist specifies that a value can only have non-zero bits within // the mask specified in `allowlist`. It implements `ValueMatcher`. func BitsAllowlist(allowlist uintptr) ValueMatcher { return MaskedEqual(^allowlist, 0) } // SyscallRule expresses a set of rules to verify the arguments of a specific // syscall. type SyscallRule interface { // Render renders the syscall rule in the given `program`. // The emitted instructions **must** end up jumping to either // `labelSet.Matched()` or `labelSet.Mismatched()`; they may // not "fall through" to whatever instructions will be added // next into the program. Render(program *syscallProgram, labelSet *labelSet) // Copy returns a copy of this `SyscallRule`. Copy() SyscallRule // Recurse should call the given function on all `SyscallRule`s that are // part of this `SyscallRule`, and should replace them with the returned // `SyscallRule`. For example, conjunctive rules should call the given // function on each of the `SyscallRule`s that they are ANDing, replacing // them with the rule returned by the function. Recurse(func(SyscallRule) SyscallRule) // String returns a human-readable string representing what the rule does. String() string } // MatchAll implements `SyscallRule` and matches everything. type MatchAll struct{} // Render implements `SyscallRule.Render`. func (MatchAll) Render(program *syscallProgram, labelSet *labelSet) { program.JumpTo(labelSet.Matched()) } // Copy implements `SyscallRule.Copy`. func (MatchAll) Copy() SyscallRule { return MatchAll{} } // Recurse implements `SyscallRule.Recurse`. func (MatchAll) Recurse(func(SyscallRule) SyscallRule) {} // String implements `SyscallRule.String`. func (MatchAll) String() string { return "true" } // Or expresses an "OR" (a disjunction) over a set of `SyscallRule`s. // An `Or` may not be empty. type Or []SyscallRule // Render implements `SyscallRule.Render`. 
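// A hypothetical usage sketch for BitsAllowlist above (the flag choice is
// assumed, not taken from gVisor's real filters): accept a flags argument only
// if every set bit falls within MAP_SHARED|MAP_PRIVATE|MAP_FIXED.
var _ ValueMatcher = BitsAllowlist(unix.MAP_SHARED | unix.MAP_PRIVATE | unix.MAP_FIXED)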
func (or Or) Render(program *syscallProgram, labelSet *labelSet) { if len(or) == 0 { panic("Or expression cannot be empty") } // If `len(or) == 1`, this will be optimized away to be the same as // rendering the single rule in the disjunction. for i, rule := range or { frag := program.Record() nextRuleLabel := labelSet.NewLabel() rule.Render(program, labelSet.Push(fmt.Sprintf("or[%d]", i), labelSet.Matched(), nextRuleLabel)) frag.MustHaveJumpedTo(labelSet.Matched(), nextRuleLabel) program.Label(nextRuleLabel) } program.JumpTo(labelSet.Mismatched()) } // Copy implements `SyscallRule.Copy`. func (or Or) Copy() SyscallRule { orCopy := make([]SyscallRule, len(or)) for i, rule := range or { orCopy[i] = rule.Copy() } return Or(orCopy) } // Recurse implements `SyscallRule.Recurse`. func (or Or) Recurse(fn func(SyscallRule) SyscallRule) { for i, rule := range or { or[i] = fn(rule) } } // String implements `SyscallRule.String`. func (or Or) String() string { switch len(or) { case 0: return "invalid" case 1: return or[0].String() default: var sb strings.Builder sb.WriteRune('(') for i, rule := range or { if i != 0 { sb.WriteString(" || ") } sb.WriteString(rule.String()) } sb.WriteRune(')') return sb.String() } } // And expresses an "AND" (a conjunction) over a set of `SyscallRule`s. // An `And` may not be empty. type And []SyscallRule // Render implements `SyscallRule.Render`. func (and And) Render(program *syscallProgram, labelSet *labelSet) { if len(and) == 0 { panic("And expression cannot be empty") } // If `len(and) == 1`, this will be optimized away to be the same as // rendering the single rule in the conjunction. for i, rule := range and { frag := program.Record() nextRuleLabel := labelSet.NewLabel() rule.Render(program, labelSet.Push(fmt.Sprintf("and[%d]", i), nextRuleLabel, labelSet.Mismatched())) frag.MustHaveJumpedTo(nextRuleLabel, labelSet.Mismatched()) program.Label(nextRuleLabel) } program.JumpTo(labelSet.Matched()) } // Copy implements `SyscallRule.Copy`. func (and And) Copy() SyscallRule { andCopy := make([]SyscallRule, len(and)) for i, rule := range and { andCopy[i] = rule.Copy() } return And(andCopy) } // Recurse implements `SyscallRule.Recurse`. func (and And) Recurse(fn func(SyscallRule) SyscallRule) { for i, rule := range and { and[i] = fn(rule) } } // String implements `SyscallRule.String`. func (and And) String() string { switch len(and) { case 0: return "invalid" case 1: return and[0].String() default: var sb strings.Builder sb.WriteRune('(') for i, rule := range and { if i != 0 { sb.WriteString(" && ") } sb.WriteString(rule.String()) } sb.WriteRune(')') return sb.String() } } // PerArg implements SyscallRule and verifies the syscall arguments and RIP. // // For example: // // rule := PerArg{ // EqualTo(linux.ARCH_GET_FS | linux.ARCH_SET_FS), // arg0 // } type PerArg [7]ValueMatcher // 6 arguments + RIP // RuleIP indicates what rules in the Rule array have to be applied to // instruction pointer. const RuleIP = 6 // clone returns a copy of this `PerArg`. // It is more efficient than `Copy` because it returns a `PerArg` // directly, rather than a `SyscallRule` interface. func (pa PerArg) clone() PerArg { return PerArg{ pa[0], pa[1], pa[2], pa[3], pa[4], pa[5], pa[6], } } // Copy implements `SyscallRule.Copy`. func (pa PerArg) Copy() SyscallRule { return pa.clone() } // Render implements `SyscallRule.Render`. 
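// A hypothetical combined rule (syscall and values assumed, not taken from
// gVisor's real filters) showing how Or, And and PerArg compose: match an
// ioctl-style call whose request is either FIONREAD or FIONBIO, but only when
// the first argument is a non-negative FD.
var _ SyscallRule = And{
	PerArg{NonNegativeFD{}},
	Or{
		PerArg{AnyValue{}, EqualTo(unix.FIONREAD)},
		PerArg{AnyValue{}, EqualTo(unix.FIONBIO)},
	},
}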
func (pa PerArg) Render(program *syscallProgram, labelSet *labelSet) { for i, arg := range pa { if arg == nil { continue } frag := program.Record() nextArgLabel := labelSet.NewLabel() labelSuffix := fmt.Sprintf("arg[%d]", i) // Determine the data offset for low and high bits of input. dataOffsetLow := seccompDataOffsetArgLow(i) dataOffsetHigh := seccompDataOffsetArgHigh(i) if i == RuleIP { dataOffsetLow = seccompDataOffsetIPLow dataOffsetHigh = seccompDataOffsetIPHigh labelSuffix = "rip" } ls := labelSet.Push(labelSuffix, nextArgLabel, labelSet.Mismatched()) arg.Render(program, ls, matchedValue{ program: program, dataOffsetHigh: dataOffsetHigh, dataOffsetLow: dataOffsetLow, }) frag.MustHaveJumpedTo(ls.Matched(), ls.Mismatched()) program.Label(nextArgLabel) } // Matched all argument-wise rules, jump to the final rule matched label. program.JumpTo(labelSet.Matched()) } // Recurse implements `SyscallRule.Recurse`. func (PerArg) Recurse(fn func(SyscallRule) SyscallRule) {} // String implements `SyscallRule.String`. func (pa PerArg) String() string { var sb strings.Builder writtenArgs := 0 for i, arg := range pa { if arg == nil { continue } if _, isAny := arg.(AnyValue); isAny { continue } if writtenArgs != 0 { sb.WriteString(" && ") } str := arg.String() var varName string if i == RuleIP { varName = "rip" } else { varName = fmt.Sprintf("arg[%d]", i) } if strings.Contains(str, "VAL") { sb.WriteString(strings.ReplaceAll(str, "VAL", varName)) } else { sb.WriteString(varName) sb.WriteRune(' ') sb.WriteString(str) } writtenArgs++ } if writtenArgs == 0 { return "true" } if writtenArgs == 1 { return sb.String() } return "(" + sb.String() + ")" } // SyscallRules maps syscall numbers to their corresponding rules. // // For example: // // rules := MakeSyscallRules(map[uintptr]SyscallRule{ // syscall.SYS_FUTEX: Or{ // PerArg{ // AnyValue{}, // EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), // }, // PerArg{ // AnyValue{}, // EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), // }, // }, // syscall.SYS_GETPID: MatchAll{}, // }) type SyscallRules struct { rules map[uintptr]SyscallRule } // NewSyscallRules returns a new SyscallRules. func NewSyscallRules() SyscallRules { return MakeSyscallRules(nil) } // MakeSyscallRules returns a new SyscallRules with the given set of rules. func MakeSyscallRules(rules map[uintptr]SyscallRule) SyscallRules { if rules == nil { rules = make(map[uintptr]SyscallRule) } return SyscallRules{rules: rules} } // String returns a string representation of the syscall rules, one syscall // per line. func (sr SyscallRules) String() string { if len(sr.rules) == 0 { return "(no rules)" } sysnums := make([]uintptr, 0, len(sr.rules)) for sysno := range sr.rules { sysnums = append(sysnums, sysno) } sort.Slice(sysnums, func(i, j int) bool { return sysnums[i] < sysnums[j] }) var sb strings.Builder for _, sysno := range sysnums { sb.WriteString(fmt.Sprintf("syscall %d: %v\n", sysno, sr.rules[sysno])) } return strings.TrimSpace(sb.String()) } // Size returns the number of syscall numbers for which a rule is defined. func (sr SyscallRules) Size() int { return len(sr.rules) } // Get returns the rule defined for the given syscall number. func (sr SyscallRules) Get(sysno uintptr) SyscallRule { return sr.rules[sysno] } // Has returns whether there is a rule defined for the given syscall number. func (sr SyscallRules) Has(sysno uintptr) bool { _, has := sr.rules[sysno] return has } // Add adds the given rule. 
It will create a new entry for a new syscall, otherwise // it will append to the existing rules. // Returns itself for chainability. func (sr SyscallRules) Add(sysno uintptr, r SyscallRule) SyscallRules { if cur, ok := sr.rules[sysno]; ok { sr.rules[sysno] = Or{cur, r} } else { sr.rules[sysno] = r } return sr } // Set sets the rule for the given syscall number. // Panics if there is already a rule for this syscall number. // This is useful for deterministic rules where the set of syscall rules is // added in multiple chunks but is known to never overlap by syscall number. // Returns itself for chainability. func (sr SyscallRules) Set(sysno uintptr, r SyscallRule) SyscallRules { if cur, ok := sr.rules[sysno]; ok { panic(fmt.Sprintf("tried to set syscall rule for sysno=%d to %v but it is already set to %v", sysno, r, cur)) } sr.rules[sysno] = r return sr } // Remove clears the syscall rule for the given syscall number. // It will panic if there is no syscall rule for this syscall number. func (sr SyscallRules) Remove(sysno uintptr) { if !sr.Has(sysno) { panic(fmt.Sprintf("tried to remove syscall rule for sysno=%d but it is not set", sysno)) } delete(sr.rules, sysno) } // Merge merges the given SyscallRules. // Returns itself for chainability. func (sr SyscallRules) Merge(other SyscallRules) SyscallRules { for sysno, r := range other.rules { sr.Add(sysno, r) } return sr } // Copy returns a deep copy of these SyscallRules. func (sr SyscallRules) Copy() SyscallRules { rulesCopy := make(map[uintptr]SyscallRule, len(sr.rules)) for sysno, r := range sr.rules { rulesCopy[sysno] = r.Copy() } return MakeSyscallRules(rulesCopy) } // ForSingleArgument runs the given function on the `ValueMatcher` rules // for a single specific syscall argument of the given syscall number. // If the function returns an error, it will be propagated along with some // details as to which rule caused the error to be returned. // ForSingleArgument also returns an error if there are no rules defined for // the given syscall number, or if at least one rule for this syscall number // is not either a `PerArg` rule or a rule with children rules (as this would // indicate that the `PerArg` rules alone may not be a good representation of // the entire set of rules for this system call). func (sr SyscallRules) ForSingleArgument(sysno uintptr, argNum int, fn func(ValueMatcher) error) error { if argNum < 0 || argNum >= len(PerArg{}) { return fmt.Errorf("invalid argument number %d", argNum) } if !sr.Has(sysno) { return fmt.Errorf("syscall %d has no rules defined", sysno) } var err error var process func(SyscallRule) SyscallRule var callCount int process = func(r SyscallRule) SyscallRule { callCount++ pa, isPerArg := r.(PerArg) if isPerArg { if gotErr := fn(pa[argNum]); gotErr != nil && err == nil { err = fmt.Errorf("PerArg rule %v: arg[%d] = %v (type %T): %v", pa, argNum, pa[argNum], pa[argNum], gotErr) } } else { beforeRecurse := callCount r.Recurse(process) if callCount == beforeRecurse { err = fmt.Errorf("rule %v (type: %T) is not a PerArg or a recursive rule", r, r) } } return r } process(sr.rules[sysno]) return err } // DenyNewExecMappings is a set of rules that denies creating new executable // mappings and converting existing ones. 
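// exampleBuildRules is a hypothetical construction sketch (syscall choices
// assumed, not taken from gVisor's real filters) showing how the helpers above
// compose: Set installs a fresh rule and panics on conflicts, while the second
// Add for SYS_FCNTL ORs its rule onto the existing one, yielding
// Or{PerArg{...F_GETFD}, PerArg{...F_GETFL}} for that syscall.
func exampleBuildRules() SyscallRules {
	return NewSyscallRules().
		Set(unix.SYS_CLOSE, MatchAll{}).
		Add(unix.SYS_FCNTL, PerArg{NonNegativeFD{}, EqualTo(unix.F_GETFD)}).
		Add(unix.SYS_FCNTL, PerArg{NonNegativeFD{}, EqualTo(unix.F_GETFL)})
}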
var DenyNewExecMappings = MakeSyscallRules(map[uintptr]SyscallRule{ unix.SYS_MMAP: PerArg{ AnyValue{}, AnyValue{}, MaskedEqual(unix.PROT_EXEC, unix.PROT_EXEC), }, unix.SYS_MPROTECT: PerArg{ AnyValue{}, AnyValue{}, MaskedEqual(unix.PROT_EXEC, unix.PROT_EXEC), }, }) golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/seccomp_state_autogen.go000066400000000000000000000000711465435605700257550ustar00rootroot00000000000000// automatically generated by stateify. package seccomp golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/seccomp_unsafe.go000066400000000000000000000100551465435605700243770ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package seccomp import ( "fmt" "runtime" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" ) // SetFilter installs the given BPF program. func SetFilter(instrs []bpf.Instruction) error { // PR_SET_NO_NEW_PRIVS is required in order to enable seccomp. See // seccomp(2) for details. // // PR_SET_NO_NEW_PRIVS is specific to the calling thread, not the whole // thread group, so between PR_SET_NO_NEW_PRIVS and seccomp() below we must // remain on the same thread. no_new_privs will be propagated to other // threads in the thread group by seccomp(SECCOMP_FILTER_FLAG_TSYNC), in // kernel/seccomp.c:seccomp_sync_threads(). runtime.LockOSThread() defer runtime.UnlockOSThread() if _, _, errno := unix.RawSyscall6(unix.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0, 0); errno != 0 { return errno } sockProg := linux.SockFprog{ Len: uint16(len(instrs)), Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])), } tid, errno := seccomp(linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, unsafe.Pointer(&sockProg)) if errno != 0 { return errno } // "On error, if SECCOMP_FILTER_FLAG_TSYNC was used, the return value is // the ID of the thread that caused the synchronization failure. (This ID // is a kernel thread ID of the type returned by clone(2) and gettid(2).)" // - seccomp(2) if tid != 0 { return fmt.Errorf("couldn't synchronize filter to TID %d", tid) } return nil } // SetFilterInChild is equivalent to SetFilter, but: // // - It is safe to call after runtime.syscall_runtime_AfterForkInChild. // // - It requires that the calling goroutine cannot be moved to another thread, // which either requires that runtime.LockOSThread() is in effect or that the // caller is in fact in a fork()ed child process. // // - Since fork()ed child processes cannot perform heap allocation, it returns // a unix.Errno rather than an error. // // - The race instrumentation has to be disabled for all functions that are // called in a forked child. 
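//
// A rough calling sketch (hypothetical caller outside this package; insns is
// assumed to be a []bpf.Instruction built before the fork()):
//
//	if errno := seccomp.SetFilterInChild(insns); errno != 0 {
//		// No heap allocation or normal error handling is possible here;
//		// abort the child directly.
//		unix.RawSyscall(unix.SYS_EXIT_GROUP, uintptr(errno), 0, 0)
//	}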
// //go:norace //go:nosplit func SetFilterInChild(instrs []bpf.Instruction) unix.Errno { if _, _, errno := unix.RawSyscall6(unix.SYS_PRCTL, linux.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0, 0); errno != 0 { return errno } sockProg := linux.SockFprog{ Len: uint16(len(instrs)), Filter: (*linux.BPFInstruction)(unsafe.Pointer(&instrs[0])), } tid, errno := seccomp(linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_TSYNC, unsafe.Pointer(&sockProg)) if errno != 0 { return errno } if tid != 0 { // Return an errno that seccomp(2) doesn't to uniquely identify this // case. Since this case occurs if another thread has a conflicting // filter set, "name not unique on network" is at least suggestive? return unix.ENOTUNIQ } return 0 } func isKillProcessAvailable() (bool, error) { action := uint32(linux.SECCOMP_RET_KILL_PROCESS) if _, errno := seccomp(linux.SECCOMP_GET_ACTION_AVAIL, 0, unsafe.Pointer(&action)); errno != 0 { // EINVAL: SECCOMP_GET_ACTION_AVAIL not in this kernel yet. // EOPNOTSUPP: SECCOMP_RET_KILL_PROCESS not supported. if errno == unix.EINVAL || errno == unix.EOPNOTSUPP { return false, nil } return false, errno } return true, nil } // seccomp calls seccomp(2). This is safe to call from an afterFork context. // //go:nosplit func seccomp(op, flags uint32, ptr unsafe.Pointer) (uintptr, unix.Errno) { n, _, errno := unix.RawSyscall(SYS_SECCOMP, uintptr(op), uintptr(flags), uintptr(ptr)) return n, errno } golang-gvisor-gvisor-0.0~20240729.0/pkg/seccomp/seccomp_unsafe_state_autogen.go000066400000000000000000000000711465435605700273160ustar00rootroot00000000000000// automatically generated by stateify. package seccomp golang-gvisor-gvisor-0.0~20240729.0/pkg/secio/000077500000000000000000000000001465435605700205265ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/secio/full_reader.go000066400000000000000000000016761465435605700233530ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package secio import ( "io" ) // FullReader adapts an io.Reader to never return partial reads with a nil // error. type FullReader struct { Reader io.Reader } // Read implements io.Reader.Read. func (r FullReader) Read(dst []byte) (int, error) { n, err := io.ReadFull(r.Reader, dst) if err == io.ErrUnexpectedEOF { return n, io.EOF } return n, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/secio/secio.go000066400000000000000000000062361465435605700221660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. // Package secio provides support for sectioned I/O. package secio import ( "errors" "io" ) // ErrReachedLimit is returned when SectionReader.Read or SectionWriter.Write // reaches its limit. var ErrReachedLimit = errors.New("reached limit") // SectionReader implements io.Reader on a section of an underlying io.ReaderAt. // It is similar to io.SectionReader, but: // // - Reading beyond the limit returns ErrReachedLimit, not io.EOF. // // - Limit overflow is handled correctly. type SectionReader struct { r io.ReaderAt off int64 limit int64 } // Read implements io.Reader.Read. func (r *SectionReader) Read(dst []byte) (int, error) { if r.limit >= 0 { if max := r.limit - r.off; max < int64(len(dst)) { dst = dst[:max] } } n, err := r.r.ReadAt(dst, r.off) r.off += int64(n) if err == nil && r.off == r.limit { err = ErrReachedLimit } return n, err } // NewOffsetReader returns an io.Reader that reads from r starting at offset // off. func NewOffsetReader(r io.ReaderAt, off int64) *SectionReader { return &SectionReader{r, off, -1} } // NewSectionReader returns an io.Reader that reads from r starting at offset // off and stops with ErrReachedLimit after n bytes. func NewSectionReader(r io.ReaderAt, off int64, n int64) *SectionReader { // If off + n overflows, it will be < 0 such that no limit applies, but // this is the correct behavior as long as r prohibits reading at offsets // beyond MaxInt64. return &SectionReader{r, off, off + n} } // SectionWriter implements io.Writer on a section of an underlying // io.WriterAt. Writing beyond the limit returns ErrReachedLimit. type SectionWriter struct { w io.WriterAt off int64 limit int64 } // Write implements io.Writer.Write. func (w *SectionWriter) Write(src []byte) (int, error) { if w.limit >= 0 { if max := w.limit - w.off; max < int64(len(src)) { src = src[:max] } } n, err := w.w.WriteAt(src, w.off) w.off += int64(n) if err == nil && w.off == w.limit { err = ErrReachedLimit } return n, err } // NewOffsetWriter returns an io.Writer that writes to w starting at offset // off. func NewOffsetWriter(w io.WriterAt, off int64) *SectionWriter { return &SectionWriter{w, off, -1} } // NewSectionWriter returns an io.Writer that writes to w starting at offset // off and stops with ErrReachedLimit after n bytes. func NewSectionWriter(w io.WriterAt, off int64, n int64) *SectionWriter { // If off + n overflows, it will be < 0 such that no limit applies, but // this is the correct behavior as long as w prohibits writing at offsets // beyond MaxInt64. return &SectionWriter{w, off, off + n} } golang-gvisor-gvisor-0.0~20240729.0/pkg/secio/secio_state_autogen.go000066400000000000000000000000671465435605700251040ustar00rootroot00000000000000// automatically generated by stateify. package secio golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/000077500000000000000000000000001465435605700207505ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/000077500000000000000000000000001465435605700216655ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/aligned.go000066400000000000000000000021441465435605700236200ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package arch import ( "reflect" ) // alignedBytes returns a slice of size bytes, aligned in memory to the given // alignment. This is used because we require certain structures to be aligned // in a specific way (for example, the X86 floating point data). func alignedBytes(size, alignment uint) []byte { data := make([]byte, size+alignment-1) offset := uint(reflect.ValueOf(data).Index(0).Addr().Pointer() % uintptr(alignment)) if offset == 0 { return data[:size:size] } return data[alignment-offset:][:size:size] } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch.go000066400000000000000000000272221465435605700231360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package arch provides abstractions around architecture-dependent details, // such as syscall calling conventions, native types, etc. package arch import ( "fmt" "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/limits" ) // Arch describes an architecture. type Arch int const ( // AMD64 is the x86-64 architecture. AMD64 Arch = iota // ARM64 is the aarch64 architecture. ARM64 ) // String implements fmt.Stringer. func (a Arch) String() string { switch a { case AMD64: return "amd64" case ARM64: return "arm64" default: return fmt.Sprintf("Arch(%d)", a) } } // contextInterface provides architecture-dependent information for a thread. // This is currently not referenced, because there exists only one concrete // implementation of this interface (*Context64), which we reference directly // wherever this interface could otherwise be used in order to avoid the // overhead involved in calling functions on interfaces in Go. // This interface is still useful in order to see the entire // architecture-dependent call surface it must support, as this is difficult // to follow across the rest of this module due to the conditional compilation // of the files that make it up. // // NOTE(b/34169503): Currently we use uintptr here to refer to a generic native // register value. While this will work for the foreseeable future, it isn't // strictly correct. We may want to create some abstraction that makes this // more clear or enables us to store values of arbitrary widths. This is // particularly true for RegisterMap(). type contextInterface interface { // Arch returns the architecture for this Context. Arch() Arch // Native converts a generic type to a native value. 
// // Because the architecture is not specified here, we may be dealing // with return values of varying sizes (for example ARCH_GETFS). This // is a simple utility function to convert to the native size in these // cases, and then we can CopyOut. Native(val uintptr) marshal.Marshallable // Value converts a native type back to a generic value. // Once a value has been converted to native via the above call -- it // can be converted back here. Value(val marshal.Marshallable) uintptr // Width returns the number of bytes for a native value. Width() uint // Fork creates a clone of the context. Fork() *Context64 // SyscallNo returns the syscall number. SyscallNo() uintptr // SyscallSaveOrig saves the original register value. SyscallSaveOrig() // SyscallArgs returns the syscall arguments in an array. SyscallArgs() SyscallArguments // Return returns the return value for a system call. Return() uintptr // SetReturn sets the return value for a system call. SetReturn(value uintptr) // RestartSyscall reverses over the current syscall instruction, such that // when the application resumes execution the syscall will be re-attempted. RestartSyscall() // RestartSyscallWithRestartBlock reverses over the current syscall // instruction and overwrites the current syscall number with that of // restart_syscall(2). This causes the application to restart the current // syscall with a custom function when execution resumes. RestartSyscallWithRestartBlock() // IP returns the current instruction pointer. IP() uintptr // SetIP sets the current instruction pointer. SetIP(value uintptr) // Stack returns the current stack pointer. Stack() uintptr // SetStack sets the current stack pointer. SetStack(value uintptr) // TLS returns the current TLS pointer. TLS() uintptr // SetTLS sets the current TLS pointer. Returns false if value is invalid. SetTLS(value uintptr) bool // SetOldRSeqInterruptedIP sets the register that contains the old IP // when an "old rseq" restartable sequence is interrupted. SetOldRSeqInterruptedIP(value uintptr) // StateData returns a pointer to underlying architecture state. StateData() *State // RegisterMap returns a map of all registers. RegisterMap() (map[string]uintptr, error) // SignalSetup modifies the context in preparation for handling the // given signal. // // st is the stack where the signal handler frame should be // constructed. // // act is the SigAction that specifies how this signal is being // handled. // // info is the SignalInfo of the signal being delivered. // // alt is the alternate signal stack (even if the alternate signal // stack is not going to be used). // // sigset is the signal mask before entering the signal handler. // // featureSet is the application CPU feature set. SignalSetup(st *Stack, act *linux.SigAction, info *linux.SignalInfo, alt *linux.SignalStack, sigset linux.SignalSet, featureSet cpuid.FeatureSet) error // SignalRestore restores context after returning from a signal // handler. // // st is the current thread stack. // // rt is true if SignalRestore is being entered from rt_sigreturn and // false if SignalRestore is being entered from sigreturn. // // featureSet is the application CPU feature set. // // SignalRestore returns the thread's new signal mask. SignalRestore(st *Stack, rt bool, featureSet cpuid.FeatureSet) (linux.SignalSet, linux.SignalStack, error) // SingleStep returns true if single stepping is enabled. SingleStep() bool // SetSingleStep enables single stepping. SetSingleStep() // ClearSingleStep disables single stepping.
ClearSingleStep() // FloatingPointData will be passed to underlying save routines. FloatingPointData() *fpu.State // NewMmapLayout returns a layout for a new MM, where MinAddr for the // returned layout must be no lower than min, and MaxAddr for the returned // layout must be no higher than max. Repeated calls to NewMmapLayout may // return different layouts. NewMmapLayout(min, max hostarch.Addr, limits *limits.LimitSet) (MmapLayout, error) // PIELoadAddress returns a preferred load address for a // position-independent executable within l. PIELoadAddress(l MmapLayout) hostarch.Addr // Hack around our package dependencies being too broken to support the // equivalent of arch_ptrace(): // PtracePeekUser implements ptrace(PTRACE_PEEKUSR). PtracePeekUser(addr uintptr) (marshal.Marshallable, error) // PtracePokeUser implements ptrace(PTRACE_POKEUSR). PtracePokeUser(addr, data uintptr) error // PtraceGetRegs implements ptrace(PTRACE_GETREGS) by writing the // general-purpose registers represented by this Context to dst and // returning the number of bytes written. PtraceGetRegs(dst io.Writer) (int, error) // PtraceSetRegs implements ptrace(PTRACE_SETREGS) by reading // general-purpose registers from src into this Context and returning the // number of bytes read. PtraceSetRegs(src io.Reader) (int, error) // PtraceGetRegSet implements ptrace(PTRACE_GETREGSET) by writing the // register set given by architecture-defined value regset from this // Context to dst and returning the number of bytes written, which must be // less than or equal to maxlen. PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int, fs cpuid.FeatureSet) (int, error) // PtraceSetRegSet implements ptrace(PTRACE_SETREGSET) by reading the // register set given by architecture-defined value regset from src and // returning the number of bytes read, which must be less than or equal to // maxlen. PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int, fs cpuid.FeatureSet) (int, error) // FullRestore returns 'true' if all CPU registers must be restored // when switching to the untrusted application. Typically a task enters // and leaves the kernel via a system call. Platform.Switch() may // optimize for this by not saving/restoring all registers if allowed // by the ABI. For example, the amd64 ABI specifies that syscall clobbers // %rcx and %r11. If FullRestore returns true then these optimizations // must be disabled and all registers restored. FullRestore() bool } // Compile-time assertion that Context64 implements contextInterface. var _ = (contextInterface)((*Context64)(nil)) // MmapDirection is a search direction for mmaps. type MmapDirection int const ( // MmapBottomUp instructs mmap to prefer lower addresses. MmapBottomUp MmapDirection = iota // MmapTopDown instructs mmap to prefer higher addresses. MmapTopDown ) // MmapLayout defines the layout of the user address space for a particular // MemoryManager. // // Note that "highest address" below is always exclusive. // // +stateify savable type MmapLayout struct { // MinAddr is the lowest mappable address. MinAddr hostarch.Addr // MaxAddr is the highest mappable address. MaxAddr hostarch.Addr // BottomUpBase is the lowest address that may be returned for a // MmapBottomUp mmap. BottomUpBase hostarch.Addr // TopDownBase is the highest address that may be returned for a // MmapTopDown mmap. TopDownBase hostarch.Addr // DefaultDirection is the direction for most non-fixed mmaps in this // layout.
DefaultDirection MmapDirection // MaxStackRand is the maximum randomization to apply to stack // allocations to maintain a proper gap between the stack and // TopDownBase. MaxStackRand uint64 } // Valid returns true if this layout is valid. func (m *MmapLayout) Valid() bool { if m.MinAddr > m.MaxAddr { return false } if m.BottomUpBase < m.MinAddr { return false } if m.BottomUpBase > m.MaxAddr { return false } if m.TopDownBase < m.MinAddr { return false } if m.TopDownBase > m.MaxAddr { return false } return true } // SyscallArgument is an argument supplied to a syscall implementation. The // methods used to access the arguments are named after the ***C type name*** and // they convert to the closest Go type available. For example, Int() refers to a // 32-bit signed integer argument represented in Go as an int32. // // Using the accessor methods guarantees that the conversion between types is // correct, taking into account size and signedness (i.e., zero-extension vs // signed-extension). type SyscallArgument struct { // Prefer to use accessor methods instead of 'Value' directly. Value uintptr } // SyscallArguments represents the set of arguments passed to a syscall. type SyscallArguments [6]SyscallArgument // Pointer returns the hostarch.Addr representation of a pointer argument. func (a SyscallArgument) Pointer() hostarch.Addr { return hostarch.Addr(a.Value) } // Int returns the int32 representation of a 32-bit signed integer argument. func (a SyscallArgument) Int() int32 { return int32(a.Value) } // Uint returns the uint32 representation of a 32-bit unsigned integer argument. func (a SyscallArgument) Uint() uint32 { return uint32(a.Value) } // Int64 returns the int64 representation of a 64-bit signed integer argument. func (a SyscallArgument) Int64() int64 { return int64(a.Value) } // Uint64 returns the uint64 representation of a 64-bit unsigned integer argument. func (a SyscallArgument) Uint64() uint64 { return uint64(a.Value) } // SizeT returns the uint representation of a size_t argument. func (a SyscallArgument) SizeT() uint { return uint(a.Value) } // ModeT returns the int representation of a mode_t argument. func (a SyscallArgument) ModeT() uint { return uint(uint16(a.Value)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_aarch64.go000066400000000000000000000160211465435605700244410ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package arch import ( "fmt" "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" ) // Registers represents the CPU registers for this architecture. // // +stateify savable type Registers struct { linux.PtraceRegs // TPIDR_EL0 is the EL0 Read/Write Software Thread ID Register. TPIDR_EL0 uint64 } const ( // SyscallWidth is the width of instructions. 
SyscallWidth = 4 ) // ARMTrapFlag is the mask for the trap flag. const ARMTrapFlag = uint64(1) << 21 // State contains the common architecture bits for aarch64 (the build tag of this // file ensures it's only built on aarch64). // // +stateify savable type State struct { // The system registers. Regs Registers // Our floating point state. fpState fpu.State `state:"wait"` // OrigR0 stores the value of register R0. OrigR0 uint64 } // Proto returns a protobuf representation of the system registers in State. func (s State) Proto() *rpb.Registers { regs := &rpb.ARM64Registers{ R0: s.Regs.Regs[0], R1: s.Regs.Regs[1], R2: s.Regs.Regs[2], R3: s.Regs.Regs[3], R4: s.Regs.Regs[4], R5: s.Regs.Regs[5], R6: s.Regs.Regs[6], R7: s.Regs.Regs[7], R8: s.Regs.Regs[8], R9: s.Regs.Regs[9], R10: s.Regs.Regs[10], R11: s.Regs.Regs[11], R12: s.Regs.Regs[12], R13: s.Regs.Regs[13], R14: s.Regs.Regs[14], R15: s.Regs.Regs[15], R16: s.Regs.Regs[16], R17: s.Regs.Regs[17], R18: s.Regs.Regs[18], R19: s.Regs.Regs[19], R20: s.Regs.Regs[20], R21: s.Regs.Regs[21], R22: s.Regs.Regs[22], R23: s.Regs.Regs[23], R24: s.Regs.Regs[24], R25: s.Regs.Regs[25], R26: s.Regs.Regs[26], R27: s.Regs.Regs[27], R28: s.Regs.Regs[28], R29: s.Regs.Regs[29], R30: s.Regs.Regs[30], Sp: s.Regs.Sp, Pc: s.Regs.Pc, Pstate: s.Regs.Pstate, Tls: s.Regs.TPIDR_EL0, } return &rpb.Registers{Arch: &rpb.Registers_Arm64{Arm64: regs}} } // Fork creates and returns an identical copy of the state. func (s *State) Fork() State { return State{ Regs: s.Regs, fpState: s.fpState.Fork(), OrigR0: s.OrigR0, } } // StateData implements Context.StateData. func (s *State) StateData() *State { return s } // SingleStep implements Context.SingleStep. func (s *State) SingleStep() bool { return false } // SetSingleStep enables single stepping. func (s *State) SetSingleStep() { // Set the trap flag. // TODO(gvisor.dev/issue/1239): ptrace single-step is not supported. } // ClearSingleStep enables single stepping. func (s *State) ClearSingleStep() { // Clear the trap flag. // TODO(gvisor.dev/issue/1239): ptrace single-step is not supported. } // RegisterMap returns a map of all registers. func (s *State) RegisterMap() (map[string]uintptr, error) { return map[string]uintptr{ "R0": uintptr(s.Regs.Regs[0]), "R1": uintptr(s.Regs.Regs[1]), "R2": uintptr(s.Regs.Regs[2]), "R3": uintptr(s.Regs.Regs[3]), "R4": uintptr(s.Regs.Regs[4]), "R5": uintptr(s.Regs.Regs[5]), "R6": uintptr(s.Regs.Regs[6]), "R7": uintptr(s.Regs.Regs[7]), "R8": uintptr(s.Regs.Regs[8]), "R9": uintptr(s.Regs.Regs[9]), "R10": uintptr(s.Regs.Regs[10]), "R11": uintptr(s.Regs.Regs[11]), "R12": uintptr(s.Regs.Regs[12]), "R13": uintptr(s.Regs.Regs[13]), "R14": uintptr(s.Regs.Regs[14]), "R15": uintptr(s.Regs.Regs[15]), "R16": uintptr(s.Regs.Regs[16]), "R17": uintptr(s.Regs.Regs[17]), "R18": uintptr(s.Regs.Regs[18]), "R19": uintptr(s.Regs.Regs[19]), "R20": uintptr(s.Regs.Regs[20]), "R21": uintptr(s.Regs.Regs[21]), "R22": uintptr(s.Regs.Regs[22]), "R23": uintptr(s.Regs.Regs[23]), "R24": uintptr(s.Regs.Regs[24]), "R25": uintptr(s.Regs.Regs[25]), "R26": uintptr(s.Regs.Regs[26]), "R27": uintptr(s.Regs.Regs[27]), "R28": uintptr(s.Regs.Regs[28]), "R29": uintptr(s.Regs.Regs[29]), "R30": uintptr(s.Regs.Regs[30]), "Sp": uintptr(s.Regs.Sp), "Pc": uintptr(s.Regs.Pc), "Pstate": uintptr(s.Regs.Pstate), "Tls": uintptr(s.Regs.TPIDR_EL0), }, nil } // PtraceGetRegs implements Context.PtraceGetRegs. 
func (s *State) PtraceGetRegs(dst io.Writer) (int, error) { regs := s.ptraceGetRegs() n, err := regs.WriteTo(dst) return int(n), err } func (s *State) ptraceGetRegs() Registers { return s.Regs } var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes() // PtraceSetRegs implements Context.PtraceSetRegs. func (s *State) PtraceSetRegs(src io.Reader) (int, error) { var regs Registers buf := make([]byte, ptraceRegistersSize) if _, err := io.ReadFull(src, buf); err != nil { return 0, err } regs.UnmarshalUnsafe(buf) if !regs.validRegs() { return 0, linuxerr.EINVAL } s.Regs = regs return ptraceRegistersSize, nil } // PtraceGetFPRegs implements Context.PtraceGetFPRegs. func (s *State) PtraceGetFPRegs(dst io.Writer) (int, error) { // TODO(gvisor.dev/issue/1238): floating-point is not supported. return 0, nil } // PtraceSetFPRegs implements Context.PtraceSetFPRegs. func (s *State) PtraceSetFPRegs(src io.Reader) (int, error) { // TODO(gvisor.dev/issue/1238): floating-point is not supported. return 0, nil } // Register sets defined in include/uapi/linux/elf.h. const ( _NT_PRSTATUS = 1 _NT_PRFPREG = 2 _NT_ARM_TLS = 0x401 ) // PtraceGetRegSet implements Context.PtraceGetRegSet. func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int, _ cpuid.FeatureSet) (int, error) { switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { return 0, linuxerr.EFAULT } return s.PtraceGetRegs(dst) default: return 0, linuxerr.EINVAL } } // PtraceSetRegSet implements Context.PtraceSetRegSet. func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int, _ cpuid.FeatureSet) (int, error) { switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { return 0, linuxerr.EFAULT } return s.PtraceSetRegs(src) default: return 0, linuxerr.EINVAL } } // FullRestore indicates whether a full restore is required. func (s *State) FullRestore() bool { return false } // New returns a new architecture context. func New(arch Arch) *Context64 { switch arch { case ARM64: return &Context64{ State{ fpState: fpu.NewState(), }, []fpu.State(nil), } } panic(fmt.Sprintf("unknown architecture %v", arch)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_aarch64_abi_autogen_unsafe.go000066400000000000000000000007331465435605700303420ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build arm64 // +build arm64 package arch import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_aarch64_state_autogen.go000066400000000000000000000030161465435605700273630ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build arm64 // +build arm64 package arch import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *Registers) StateTypeName() string { return "pkg/sentry/arch.Registers" } func (r *Registers) StateFields() []string { return []string{ "PtraceRegs", "TPIDR_EL0", } } func (r *Registers) beforeSave() {} // +checklocksignore func (r *Registers) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.PtraceRegs) stateSinkObject.Save(1, &r.TPIDR_EL0) } func (r *Registers) afterLoad(context.Context) {} // +checklocksignore func (r *Registers) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.PtraceRegs) stateSourceObject.Load(1, &r.TPIDR_EL0) } func (s *State) StateTypeName() string { return "pkg/sentry/arch.State" } func (s *State) StateFields() []string { return []string{ "Regs", "fpState", "OrigR0", } } func (s *State) beforeSave() {} // +checklocksignore func (s *State) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.Regs) stateSinkObject.Save(1, &s.fpState) stateSinkObject.Save(2, &s.OrigR0) } func (s *State) afterLoad(context.Context) {} // +checklocksignore func (s *State) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.Regs) stateSourceObject.LoadWait(1, &s.fpState) stateSourceObject.Load(2, &s.OrigR0) } func init() { state.Register((*Registers)(nil)) state.Register((*State)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_abi_autogen_unsafe.go000066400000000000000000000001441465435605700270260ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package arch import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_amd64.go000066400000000000000000000227621465435605700241350ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package arch import ( "bytes" "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/limits" ) // Host specifies the host architecture. const Host = AMD64 // These constants come directly from Linux. const ( // maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux // for a 64-bit process. maxAddr64 hostarch.Addr = (1 << 47) - hostarch.PageSize // maxStackRand64 is the maximum randomization to apply to the stack. // It is defined by arch/x86/mm/mmap.c:stack_maxrandom_size in Linux. maxStackRand64 = 16 << 30 // 16 GB // maxMmapRand64 is the maximum randomization to apply to the mmap // layout. It is defined by arch/x86/mm/mmap.c:arch_mmap_rnd in Linux. maxMmapRand64 = (1 << 28) * hostarch.PageSize // minGap64 is the minimum gap to leave at the top of the address space // for the stack. It is defined by arch/x86/mm/mmap.c:MIN_GAP in Linux. 
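// (With maxStackRand64 = 16 GiB above, this gap works out to 128 MiB + 16 GiB.)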
minGap64 = (128 << 20) + maxStackRand64 // preferredPIELoadAddr is the standard Linux position-independent // executable base load address. It is ELF_ET_DYN_BASE in Linux. // // The Platform {Min,Max}UserAddress() may preclude loading at this // address. See other preferredFoo comments below. preferredPIELoadAddr hostarch.Addr = maxAddr64 / 3 * 2 ) // These constants are selected as heuristics to help make the Platform's // potentially limited address space conform as closely to Linux as possible. const ( // Select a preferred minimum TopDownBase address. // // Some applications (TSAN and other *SANs) are very particular about // the way the Linux mmap allocator layouts out the address space. // // TSAN in particular expects top down allocations to be made in the // range [0x7e8000000000, 0x800000000000). // // The minimum TopDownBase on Linux would be: // 0x800000000000 - minGap64 - maxMmapRand64 = 0x7efbf8000000. // // (minGap64 because TSAN uses a small RLIMIT_STACK.) // // 0x7e8000000000 is selected arbitrarily by TSAN to leave room for // allocations below TopDownBase. // // N.B. ASAN and MSAN are more forgiving; ASAN allows allocations all // the way down to 0x10007fff8000, and MSAN down to 0x700000000000. // // Of course, there is no hard minimum to allocation; an allocator can // search all the way from TopDownBase to Min. However, TSAN declared // their range "good enough". // // We would like to pick a TopDownBase such that it is unlikely that an // allocator will select an address below TSAN's minimum. We achieve // this by trying to leave a sizable gap below TopDownBase. // // This is all "preferred" because the layout min/max address may not // allow us to select such a TopDownBase, in which case we have to fall // back to a layout that TSAN may not be happy with. preferredTopDownAllocMin hostarch.Addr = 0x7e8000000000 preferredAllocationGap = 128 << 30 // 128 GB preferredTopDownBaseMin = preferredTopDownAllocMin + preferredAllocationGap // minMmapRand64 is the smallest we are willing to make the // randomization to stay above preferredTopDownBaseMin. minMmapRand64 = (1 << 26) * hostarch.PageSize ) // Context64 represents an AMD64 context. // // +stateify savable type Context64 struct { State } // Arch implements Context.Arch. func (c *Context64) Arch() Arch { return AMD64 } // FloatingPointData returns the state of the floating-point unit. func (c *Context64) FloatingPointData() *fpu.State { return &c.State.fpState } // Fork returns an exact copy of this context. func (c *Context64) Fork() *Context64 { return &Context64{ State: c.State.Fork(), } } // Return returns the current syscall return value. func (c *Context64) Return() uintptr { return uintptr(c.Regs.Rax) } // SetReturn sets the syscall return value. func (c *Context64) SetReturn(value uintptr) { c.Regs.Rax = uint64(value) } // IP returns the current instruction pointer. func (c *Context64) IP() uintptr { return uintptr(c.Regs.Rip) } // SetIP sets the current instruction pointer. func (c *Context64) SetIP(value uintptr) { c.Regs.Rip = uint64(value) } // Stack returns the current stack pointer. func (c *Context64) Stack() uintptr { return uintptr(c.Regs.Rsp) } // SetStack sets the current stack pointer. func (c *Context64) SetStack(value uintptr) { c.Regs.Rsp = uint64(value) } // TLS returns the current TLS pointer. func (c *Context64) TLS() uintptr { return uintptr(c.Regs.Fs_base) } // SetTLS sets the current TLS pointer. Returns false if value is invalid. 
func (c *Context64) SetTLS(value uintptr) bool { if !isValidSegmentBase(uint64(value)) { return false } c.Regs.Fs = 0 c.Regs.Fs_base = uint64(value) return true } // SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP. func (c *Context64) SetOldRSeqInterruptedIP(value uintptr) { c.Regs.R10 = uint64(value) } // Native returns the native type for the given val. func (c *Context64) Native(val uintptr) marshal.Marshallable { v := primitive.Uint64(val) return &v } // Value returns the generic val for the given native type. func (c *Context64) Value(val marshal.Marshallable) uintptr { return uintptr(*val.(*primitive.Uint64)) } // Width returns the byte width of this architecture. func (c *Context64) Width() uint { return 8 } // mmapRand returns a random adjustment for randomizing an mmap layout. func mmapRand(max uint64) hostarch.Addr { return hostarch.Addr(rand.Int63n(int64(max))).RoundDown() } // NewMmapLayout implements Context.NewMmapLayout consistently with Linux. func (c *Context64) NewMmapLayout(min, max hostarch.Addr, r *limits.LimitSet) (MmapLayout, error) { min, ok := min.RoundUp() if !ok { return MmapLayout{}, unix.EINVAL } if max > maxAddr64 { max = maxAddr64 } max = max.RoundDown() if min > max { return MmapLayout{}, unix.EINVAL } stackSize := r.Get(limits.Stack) // MAX_GAP in Linux. maxGap := (max / 6) * 5 gap := hostarch.Addr(stackSize.Cur) if gap < minGap64 { gap = minGap64 } if gap > maxGap { gap = maxGap } defaultDir := MmapTopDown if stackSize.Cur == limits.Infinity { defaultDir = MmapBottomUp } topDownMin := max - gap - maxMmapRand64 maxRand := hostarch.Addr(maxMmapRand64) if topDownMin < preferredTopDownBaseMin { // Try to keep TopDownBase above preferredTopDownBaseMin by // shrinking maxRand. maxAdjust := maxRand - minMmapRand64 needAdjust := preferredTopDownBaseMin - topDownMin if needAdjust <= maxAdjust { maxRand -= needAdjust } } rnd := mmapRand(uint64(maxRand)) l := MmapLayout{ MinAddr: min, MaxAddr: max, // TASK_UNMAPPED_BASE in Linux. BottomUpBase: (max/3 + rnd).RoundDown(), TopDownBase: (max - gap - rnd).RoundDown(), DefaultDirection: defaultDir, // We may have reduced the maximum randomization to keep // TopDownBase above preferredTopDownBaseMin while maintaining // our stack gap. Stack allocations must use that max // randomization to avoiding eating into the gap. MaxStackRand: uint64(maxRand), } // Final sanity check on the layout. if !l.Valid() { panic(fmt.Sprintf("Invalid MmapLayout: %+v", l)) } return l, nil } // PIELoadAddress implements Context.PIELoadAddress. func (c *Context64) PIELoadAddress(l MmapLayout) hostarch.Addr { base := preferredPIELoadAddr max, ok := base.AddLength(maxMmapRand64) if !ok { panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base)) } if max > l.MaxAddr { // preferredPIELoadAddr won't fit; fall back to the standard // Linux behavior of 2/3 of TopDownBase. TSAN won't like this. // // Don't bother trying to shrink the randomization for now. base = l.TopDownBase / 3 * 2 } return base + mmapRand(maxMmapRand64) } // userStructSize is the size in bytes of Linux's struct user on amd64. const userStructSize = 928 // PtracePeekUser implements Context.PtracePeekUser. func (c *Context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) { if addr&7 != 0 || addr >= userStructSize { return nil, unix.EIO } // PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and // u_debugreg, returning 0 or silently no-oping for other fields // respectively. 
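// Accordingly, only offsets that fall inside the saved register area (the
// first ptraceRegistersSize bytes of struct user) are served from real data
// below; any other permitted offset, including the debug registers, reads
// back as zero.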
if addr < uintptr(ptraceRegistersSize) { regs := c.ptraceGetRegs() buf := make([]byte, regs.SizeBytes()) regs.MarshalUnsafe(buf) return c.Native(uintptr(hostarch.ByteOrder.Uint64(buf[addr:]))), nil } // Note: x86 debug registers are missing. return c.Native(0), nil } // PtracePokeUser implements Context.PtracePokeUser. func (c *Context64) PtracePokeUser(addr, data uintptr) error { if addr&7 != 0 || addr >= userStructSize { return unix.EIO } if addr < uintptr(ptraceRegistersSize) { regs := c.ptraceGetRegs() buf := make([]byte, regs.SizeBytes()) regs.MarshalUnsafe(buf) hostarch.ByteOrder.PutUint64(buf[addr:], uint64(data)) _, err := c.PtraceSetRegs(bytes.NewBuffer(buf)) return err } // Note: x86 debug registers are missing. return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_amd64_abi_autogen_unsafe.go000066400000000000000000000365761465435605700300430ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build amd64 && amd64 && amd64 // +build amd64,amd64,amd64 package arch import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*SignalContext64)(nil) var _ marshal.Marshallable = (*UContext64)(nil) var _ marshal.Marshallable = (*linux.SignalSet)(nil) var _ marshal.Marshallable = (*linux.SignalStack)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SignalContext64) SizeBytes() int { return 184 + (*linux.SignalSet)(nil).SizeBytes() + 8*8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (s *SignalContext64) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.R8)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.R9)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.R10)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.R11)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.R12)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.R13)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.R14)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.R15)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Rdi)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Rsi)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Rbp)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Rbx)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Rdx)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Rax)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Rcx)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Rsp)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Rip)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Eflags)) dst = dst[8:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Cs)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Gs)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Fs)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(s.Ss)) dst = dst[2:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Err)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Trapno)) dst = dst[8:] dst = s.Oldmask.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Cr2)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Fpstate)) dst = dst[8:] for idx := 0; idx < 8; idx++ { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Reserved[idx])) dst = dst[8:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (s *SignalContext64) UnmarshalBytes(src []byte) []byte { s.R8 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.R9 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.R10 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.R11 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.R12 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.R13 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.R14 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.R15 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Rdi = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Rsi = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Rbp = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Rbx = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Rdx = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Rax = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Rcx = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Rsp = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Rip = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Eflags = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Cs = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.Gs = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.Fs = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.Ss = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] s.Err = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Trapno = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = s.Oldmask.UnmarshalUnsafe(src) s.Cr2 = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Fpstate = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] for idx := 0; idx < 8; idx++ { s.Reserved[idx] = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SignalContext64) Packed() bool { return s.Oldmask.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SignalContext64) MarshalUnsafe(dst []byte) []byte { if s.Oldmask.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type SignalContext64 doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SignalContext64) UnmarshalUnsafe(src []byte) []byte { if s.Oldmask.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type SignalContext64 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SignalContext64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Oldmask.Packed() { // Type SignalContext64 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SignalContext64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SignalContext64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Oldmask.Packed() { // Type SignalContext64 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SignalContext64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SignalContext64) WriteTo(writer io.Writer) (int64, error) { if !s.Oldmask.Packed() { // Type SignalContext64 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UContext64) SizeBytes() int { return 16 + (*linux.SignalStack)(nil).SizeBytes() + (*SignalContext64)(nil).SizeBytes() + (*linux.SignalSet)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UContext64) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Flags)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Link)) dst = dst[8:] dst = u.Stack.MarshalUnsafe(dst) dst = u.MContext.MarshalUnsafe(dst) dst = u.Sigset.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (u *UContext64) UnmarshalBytes(src []byte) []byte { u.Flags = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Link = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = u.Stack.UnmarshalUnsafe(src) src = u.MContext.UnmarshalUnsafe(src) src = u.Sigset.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UContext64) Packed() bool { return u.MContext.Packed() && u.Sigset.Packed() && u.Stack.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UContext64) MarshalUnsafe(dst []byte) []byte { if u.MContext.Packed() && u.Sigset.Packed() && u.Stack.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UContext64 doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UContext64) UnmarshalUnsafe(src []byte) []byte { if u.MContext.Packed() && u.Sigset.Packed() && u.Stack.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UContext64 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *UContext64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.MContext.Packed() && u.Sigset.Packed() && u.Stack.Packed() { // Type UContext64 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UContext64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UContext64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.MContext.Packed() && u.Sigset.Packed() && u.Stack.Packed() { // Type UContext64 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. 
return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UContext64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UContext64) WriteTo(writer io.Writer) (int64, error) { if !u.MContext.Packed() && u.Sigset.Packed() && u.Stack.Packed() { // Type UContext64 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_amd64_state_autogen.go000066400000000000000000000013661465435605700270540ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 && amd64 && amd64 // +build amd64,amd64,amd64 package arch import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (c *Context64) StateTypeName() string { return "pkg/sentry/arch.Context64" } func (c *Context64) StateFields() []string { return []string{ "State", } } func (c *Context64) beforeSave() {} // +checklocksignore func (c *Context64) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.State) } func (c *Context64) afterLoad(context.Context) {} // +checklocksignore func (c *Context64) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.State) } func init() { state.Register((*Context64)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_arm64.go000066400000000000000000000201651465435605700241460ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package arch import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/limits" ) // Host specifies the host architecture. const Host = ARM64 // These constants come directly from Linux. const ( // maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux // for a 64-bit process. maxAddr64 hostarch.Addr = (1 << 48) // maxStackRand64 is the maximum randomization to apply to the stack. // It is defined by arch/arm64/mm/mmap.c:(STACK_RND_MASK << PAGE_SHIFT) in Linux. maxStackRand64 = 0x3ffff << 12 // 16 GB // maxMmapRand64 is the maximum randomization to apply to the mmap // layout. It is defined by arch/arm64/mm/mmap.c:arch_mmap_rnd in Linux. 
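// (With 4 KiB pages, (1 << 33) pages is 2^45 bytes, i.e. 32 TiB of possible
// randomization.)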
maxMmapRand64 = (1 << 33) * hostarch.PageSize // minGap64 is the minimum gap to leave at the top of the address space // for the stack. It is defined by arch/arm64/mm/mmap.c:MIN_GAP in Linux. minGap64 = (128 << 20) + maxStackRand64 // preferredPIELoadAddr is the standard Linux position-independent // executable base load address. It is ELF_ET_DYN_BASE in Linux. // // The Platform {Min,Max}UserAddress() may preclude loading at this // address. See other preferredFoo comments below. preferredPIELoadAddr hostarch.Addr = maxAddr64 / 6 * 5 ) var ( // CPUIDInstruction doesn't exist on ARM64. CPUIDInstruction = []byte{} ) // These constants are selected as heuristics to help make the Platform's // potentially limited address space conform as closely to Linux as possible. const ( preferredTopDownAllocMin hostarch.Addr = 0x7e8000000000 preferredAllocationGap = 128 << 30 // 128 GB preferredTopDownBaseMin = preferredTopDownAllocMin + preferredAllocationGap // minMmapRand64 is the smallest we are willing to make the // randomization to stay above preferredTopDownBaseMin. minMmapRand64 = (1 << 18) * hostarch.PageSize ) // Context64 represents an ARM64 context. // // +stateify savable type Context64 struct { State sigFPState []fpu.State // fpstate to be restored on sigreturn. } // Arch implements Context.Arch. func (c *Context64) Arch() Arch { return ARM64 } func (c *Context64) copySigFPState() []fpu.State { var sigfps []fpu.State for _, s := range c.sigFPState { sigfps = append(sigfps, s.Fork()) } return sigfps } // Fork returns an exact copy of this context. func (c *Context64) Fork() *Context64 { return &Context64{ State: c.State.Fork(), sigFPState: c.copySigFPState(), } } // General purpose registers usage on Arm64: // R0...R7: parameter/result registers. // R8: indirect result location register. // R9...R15: temporary registers. // R16: the first intra-procedure-call scratch register. // R17: the second intra-procedure-call scratch register. // R18: the platform register. // R19...R28: callee-saved registers. // R29: the frame pointer. // R30: the link register. // Return returns the current syscall return value. func (c *Context64) Return() uintptr { return uintptr(c.Regs.Regs[0]) } // SetReturn sets the syscall return value. func (c *Context64) SetReturn(value uintptr) { c.Regs.Regs[0] = uint64(value) } // IP returns the current instruction pointer. func (c *Context64) IP() uintptr { return uintptr(c.Regs.Pc) } // SetIP sets the current instruction pointer. func (c *Context64) SetIP(value uintptr) { c.Regs.Pc = uint64(value) } // Stack returns the current stack pointer. func (c *Context64) Stack() uintptr { return uintptr(c.Regs.Sp) } // SetStack sets the current stack pointer. func (c *Context64) SetStack(value uintptr) { c.Regs.Sp = uint64(value) } // TLS returns the current TLS pointer. func (c *Context64) TLS() uintptr { return uintptr(c.Regs.TPIDR_EL0) } // SetTLS sets the current TLS pointer. Returns false if value is invalid. func (c *Context64) SetTLS(value uintptr) bool { if value >= uintptr(maxAddr64) { return false } c.Regs.TPIDR_EL0 = uint64(value) return true } // SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP. func (c *Context64) SetOldRSeqInterruptedIP(value uintptr) { c.Regs.Regs[3] = uint64(value) } // Native returns the native type for the given val. func (c *Context64) Native(val uintptr) marshal.Marshallable { v := primitive.Uint64(val) return &v } // Value returns the generic val for the given native type.
func (c *Context64) Value(val marshal.Marshallable) uintptr { return uintptr(*val.(*primitive.Uint64)) } // Width returns the byte width of this architecture. func (c *Context64) Width() uint { return 8 } // mmapRand returns a random adjustment for randomizing an mmap layout. func mmapRand(max uint64) hostarch.Addr { return hostarch.Addr(rand.Int63n(int64(max))).RoundDown() } // NewMmapLayout implements Context.NewMmapLayout consistently with Linux. func (c *Context64) NewMmapLayout(min, max hostarch.Addr, r *limits.LimitSet) (MmapLayout, error) { min, ok := min.RoundUp() if !ok { return MmapLayout{}, unix.EINVAL } if max > maxAddr64 { max = maxAddr64 } max = max.RoundDown() if min > max { return MmapLayout{}, unix.EINVAL } stackSize := r.Get(limits.Stack) // MAX_GAP in Linux. maxGap := (max / 6) * 5 gap := hostarch.Addr(stackSize.Cur) if gap < minGap64 { gap = minGap64 } if gap > maxGap { gap = maxGap } defaultDir := MmapTopDown if stackSize.Cur == limits.Infinity { defaultDir = MmapBottomUp } topDownMin := max - gap - maxMmapRand64 maxRand := hostarch.Addr(maxMmapRand64) if topDownMin < preferredTopDownBaseMin { // Try to keep TopDownBase above preferredTopDownBaseMin by // shrinking maxRand. maxAdjust := maxRand - minMmapRand64 needAdjust := preferredTopDownBaseMin - topDownMin if needAdjust <= maxAdjust { maxRand -= needAdjust } } rnd := mmapRand(uint64(maxRand)) l := MmapLayout{ MinAddr: min, MaxAddr: max, // TASK_UNMAPPED_BASE in Linux. BottomUpBase: (max/3 + rnd).RoundDown(), TopDownBase: (max - gap - rnd).RoundDown(), DefaultDirection: defaultDir, // We may have reduced the maximum randomization to keep // TopDownBase above preferredTopDownBaseMin while maintaining // our stack gap. Stack allocations must use that max // randomization to avoid eating into the gap. MaxStackRand: uint64(maxRand), } // Final sanity check on the layout. if !l.Valid() { panic(fmt.Sprintf("Invalid MmapLayout: %+v", l)) } return l, nil } // PIELoadAddress implements Context.PIELoadAddress. func (c *Context64) PIELoadAddress(l MmapLayout) hostarch.Addr { base := preferredPIELoadAddr max, ok := base.AddLength(maxMmapRand64) if !ok { panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base)) } if max > l.MaxAddr { // preferredPIELoadAddr won't fit; fall back to the standard // Linux behavior of 2/3 of TopDownBase. TSAN won't like this. // // Don't bother trying to shrink the randomization for now. base = l.TopDownBase / 3 * 2 } return base + mmapRand(maxMmapRand64) } // PtracePeekUser implements Context.PtracePeekUser. func (c *Context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) { // TODO(gvisor.dev/issue/1239): Full ptrace support for Arm64. return c.Native(0), nil } // PtracePokeUser implements Context.PtracePokeUser. func (c *Context64) PtracePokeUser(addr, data uintptr) error { // TODO(gvisor.dev/issue/1239): Full ptrace support for Arm64. return nil } // FloatingPointData returns the state of the floating-point unit. func (c *Context64) FloatingPointData() *fpu.State { return &c.State.fpState } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_arm64_abi_autogen_unsafe.go000066400000000000000000000534661465435605700300540ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file.
This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build arm64 && arm64 && arm64 // +build arm64,arm64,arm64 package arch import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*FpsimdContext)(nil) var _ marshal.Marshallable = (*SignalContext64)(nil) var _ marshal.Marshallable = (*UContext64)(nil) var _ marshal.Marshallable = (*aarch64Ctx)(nil) var _ marshal.Marshallable = (*linux.SignalSet)(nil) var _ marshal.Marshallable = (*linux.SignalStack)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FpsimdContext) SizeBytes() int { return 8 + (*aarch64Ctx)(nil).SizeBytes() + 8*64 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FpsimdContext) MarshalBytes(dst []byte) []byte { dst = f.Head.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Fpsr)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Fpcr)) dst = dst[4:] for idx := 0; idx < 64; idx++ { hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Vregs[idx])) dst = dst[8:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FpsimdContext) UnmarshalBytes(src []byte) []byte { src = f.Head.UnmarshalUnsafe(src) f.Fpsr = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Fpcr = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 64; idx++ { f.Vregs[idx] = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FpsimdContext) Packed() bool { return f.Head.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FpsimdContext) MarshalUnsafe(dst []byte) []byte { if f.Head.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // Type FpsimdContext doesn't have a packed layout in memory, fallback to MarshalBytes. return f.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FpsimdContext) UnmarshalUnsafe(src []byte) []byte { if f.Head.Packed() { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type FpsimdContext doesn't have a packed layout in memory, fallback to UnmarshalBytes. return f.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FpsimdContext) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.Head.Packed() { // Type FpsimdContext doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. f.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. 
return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FpsimdContext) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FpsimdContext) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !f.Head.Packed() { // Type FpsimdContext doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(f.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. f.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FpsimdContext) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FpsimdContext) WriteTo(writer io.Writer) (int64, error) { if !f.Head.Packed() { // Type FpsimdContext doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, f.SizeBytes()) f.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SignalContext64) SizeBytes() int { return 32 + 8*31 + 1*8 + (*FpsimdContext)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SignalContext64) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.FaultAddr)) dst = dst[8:] for idx := 0; idx < 31; idx++ { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Regs[idx])) dst = dst[8:] } hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Sp)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Pc)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.Pstate)) dst = dst[8:] for idx := 0; idx < 8; idx++ { dst[0] = byte(s._pad[idx]) dst = dst[1:] } dst = s.Fpsimd64.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (s *SignalContext64) UnmarshalBytes(src []byte) []byte { s.FaultAddr = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] for idx := 0; idx < 31; idx++ { s.Regs[idx] = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] } s.Sp = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Pc = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.Pstate = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] for idx := 0; idx < 8; idx++ { s._pad[idx] = src[0] src = src[1:] } src = s.Fpsimd64.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SignalContext64) Packed() bool { return s.Fpsimd64.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SignalContext64) MarshalUnsafe(dst []byte) []byte { if s.Fpsimd64.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // Type SignalContext64 doesn't have a packed layout in memory, fallback to MarshalBytes. return s.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SignalContext64) UnmarshalUnsafe(src []byte) []byte { if s.Fpsimd64.Packed() { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type SignalContext64 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return s.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *SignalContext64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Fpsimd64.Packed() { // Type SignalContext64 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. s.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SignalContext64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SignalContext64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !s.Fpsimd64.Packed() { // Type SignalContext64 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(s.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. s.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. 
runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SignalContext64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SignalContext64) WriteTo(writer io.Writer) (int64, error) { if !s.Fpsimd64.Packed() { // Type SignalContext64 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, s.SizeBytes()) s.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *UContext64) SizeBytes() int { return 16 + (*linux.SignalStack)(nil).SizeBytes() + (*linux.SignalSet)(nil).SizeBytes() + 1*120 + 1*8 + (*SignalContext64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *UContext64) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Flags)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Link)) dst = dst[8:] dst = u.Stack.MarshalUnsafe(dst) dst = u.Sigset.MarshalUnsafe(dst) for idx := 0; idx < 120; idx++ { dst[0] = byte(u._pad[idx]) dst = dst[1:] } for idx := 0; idx < 8; idx++ { dst[0] = byte(u._pad2[idx]) dst = dst[1:] } dst = u.MContext.MarshalUnsafe(dst) return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *UContext64) UnmarshalBytes(src []byte) []byte { u.Flags = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] u.Link = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] src = u.Stack.UnmarshalUnsafe(src) src = u.Sigset.UnmarshalUnsafe(src) for idx := 0; idx < 120; idx++ { u._pad[idx] = src[0] src = src[1:] } for idx := 0; idx < 8; idx++ { u._pad2[idx] = src[0] src = src[1:] } src = u.MContext.UnmarshalUnsafe(src) return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *UContext64) Packed() bool { return u.MContext.Packed() && u.Sigset.Packed() && u.Stack.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *UContext64) MarshalUnsafe(dst []byte) []byte { if u.MContext.Packed() && u.Sigset.Packed() && u.Stack.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // Type UContext64 doesn't have a packed layout in memory, fallback to MarshalBytes. return u.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (u *UContext64) UnmarshalUnsafe(src []byte) []byte { if u.MContext.Packed() && u.Sigset.Packed() && u.Stack.Packed() { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type UContext64 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return u.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (u *UContext64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.MContext.Packed() && u.Sigset.Packed() && u.Stack.Packed() { // Type UContext64 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. u.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *UContext64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *UContext64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !u.MContext.Packed() && u.Sigset.Packed() && u.Stack.Packed() { // Type UContext64 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(u.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. u.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *UContext64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *UContext64) WriteTo(writer io.Writer) (int64, error) { if !u.MContext.Packed() && u.Sigset.Packed() && u.Stack.Packed() { // Type UContext64 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, u.SizeBytes()) u.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (a *aarch64Ctx) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. 
func (a *aarch64Ctx) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(a.Magic)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(a.Size)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (a *aarch64Ctx) UnmarshalBytes(src []byte) []byte { a.Magic = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] a.Size = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (a *aarch64Ctx) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (a *aarch64Ctx) MarshalUnsafe(dst []byte) []byte { size := a.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(a), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (a *aarch64Ctx) UnmarshalUnsafe(src []byte) []byte { size := a.SizeBytes() gohacks.Memmove(unsafe.Pointer(a), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (a *aarch64Ctx) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(a))) hdr.Len = a.SizeBytes() hdr.Cap = a.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that a // must live until the use above. runtime.KeepAlive(a) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (a *aarch64Ctx) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return a.CopyOutN(cc, addr, a.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (a *aarch64Ctx) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(a))) hdr.Len = a.SizeBytes() hdr.Cap = a.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that a // must live until the use above. runtime.KeepAlive(a) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (a *aarch64Ctx) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return a.CopyInN(cc, addr, a.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (a *aarch64Ctx) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(a))) hdr.Len = a.SizeBytes() hdr.Cap = a.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that a // must live until the use above. runtime.KeepAlive(a) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_arm64_state_autogen.go000066400000000000000000000015301465435605700270630ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build arm64 && arm64 && arm64 // +build arm64,arm64,arm64 package arch import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (c *Context64) StateTypeName() string { return "pkg/sentry/arch.Context64" } func (c *Context64) StateFields() []string { return []string{ "State", "sigFPState", } } func (c *Context64) beforeSave() {} // +checklocksignore func (c *Context64) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.State) stateSinkObject.Save(1, &c.sigFPState) } func (c *Context64) afterLoad(context.Context) {} // +checklocksignore func (c *Context64) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.State) stateSourceObject.Load(1, &c.sigFPState) } func init() { state.Register((*Context64)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_state_autogen.go000066400000000000000000000035031465435605700260540ustar00rootroot00000000000000// automatically generated by stateify. package arch import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (m *MmapLayout) StateTypeName() string { return "pkg/sentry/arch.MmapLayout" } func (m *MmapLayout) StateFields() []string { return []string{ "MinAddr", "MaxAddr", "BottomUpBase", "TopDownBase", "DefaultDirection", "MaxStackRand", } } func (m *MmapLayout) beforeSave() {} // +checklocksignore func (m *MmapLayout) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.MinAddr) stateSinkObject.Save(1, &m.MaxAddr) stateSinkObject.Save(2, &m.BottomUpBase) stateSinkObject.Save(3, &m.TopDownBase) stateSinkObject.Save(4, &m.DefaultDirection) stateSinkObject.Save(5, &m.MaxStackRand) } func (m *MmapLayout) afterLoad(context.Context) {} // +checklocksignore func (m *MmapLayout) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.MinAddr) stateSourceObject.Load(1, &m.MaxAddr) stateSourceObject.Load(2, &m.BottomUpBase) stateSourceObject.Load(3, &m.TopDownBase) stateSourceObject.Load(4, &m.DefaultDirection) stateSourceObject.Load(5, &m.MaxStackRand) } func (a *AuxEntry) StateTypeName() string { return "pkg/sentry/arch.AuxEntry" } func (a *AuxEntry) StateFields() []string { return []string{ "Key", "Value", } } func (a *AuxEntry) beforeSave() {} // +checklocksignore func (a *AuxEntry) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.Key) stateSinkObject.Save(1, &a.Value) } func (a *AuxEntry) afterLoad(context.Context) {} // +checklocksignore func (a *AuxEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.Key) stateSourceObject.Load(1, &a.Value) } func init() { state.Register((*MmapLayout)(nil)) state.Register((*AuxEntry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_state_x86.go000066400000000000000000000013731465435605700250420ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
//go:build amd64 || 386 // +build amd64 386 package arch // afterLoadFPState is invoked by afterLoad. func (s *State) afterLoadFPState() { s.fpState.AfterLoad() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_unsafe_abi_autogen_unsafe.go000066400000000000000000000001441465435605700303670ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package arch import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_unsafe_state_autogen.go000066400000000000000000000000661465435605700274160ustar00rootroot00000000000000// automatically generated by stateify. package arch golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_x86.go000066400000000000000000000300001465435605700236270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || 386 // +build amd64 386 package arch import ( "fmt" "io" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" ) // Registers represents the CPU registers for this architecture. // // +stateify savable type Registers struct { linux.PtraceRegs } // System-related constants for x86. const ( // SyscallWidth is the width of syscall, sysenter, and int 80 instructions. SyscallWidth = 2 ) // EFLAGS register bits. const ( // eflagsCF is the mask for the carry flag. eflagsCF = uint64(1) << 0 // eflagsPF is the mask for the parity flag. eflagsPF = uint64(1) << 2 // eflagsAF is the mask for the auxiliary carry flag. eflagsAF = uint64(1) << 4 // eflagsZF is the mask for the zero flag. eflagsZF = uint64(1) << 6 // eflagsSF is the mask for the sign flag. eflagsSF = uint64(1) << 7 // eflagsTF is the mask for the trap flag. eflagsTF = uint64(1) << 8 // eflagsIF is the mask for the interrupt flag. eflagsIF = uint64(1) << 9 // eflagsDF is the mask for the direction flag. eflagsDF = uint64(1) << 10 // eflagsOF is the mask for the overflow flag. eflagsOF = uint64(1) << 11 // eflagsIOPL is the mask for the I/O privilege level. eflagsIOPL = uint64(3) << 12 // eflagsNT is the mask for the nested task bit. eflagsNT = uint64(1) << 14 // eflagsRF is the mask for the resume flag. eflagsRF = uint64(1) << 16 // eflagsVM is the mask for the virtual mode bit. eflagsVM = uint64(1) << 17 // eflagsAC is the mask for the alignment check / access control bit. eflagsAC = uint64(1) << 18 // eflagsVIF is the mask for the virtual interrupt flag. eflagsVIF = uint64(1) << 19 // eflagsVIP is the mask for the virtual interrupt pending bit. eflagsVIP = uint64(1) << 20 // eflagsID is the mask for the CPUID detection bit. eflagsID = uint64(1) << 21 // eflagsPtraceMutable is the mask for the set of EFLAGS that may be // changed by ptrace(PTRACE_SETREGS). eflagsPtraceMutable is analogous to // Linux's FLAG_MASK. 
eflagsPtraceMutable = eflagsCF | eflagsPF | eflagsAF | eflagsZF | eflagsSF | eflagsTF | eflagsDF | eflagsOF | eflagsRF | eflagsAC | eflagsNT // eflagsRestorable is the mask for the set of EFLAGS that may be changed by // SignalReturn. eflagsRestorable is analogous to Linux's FIX_EFLAGS. eflagsRestorable = eflagsAC | eflagsOF | eflagsDF | eflagsTF | eflagsSF | eflagsZF | eflagsAF | eflagsPF | eflagsCF | eflagsRF ) // Segment selectors. See arch/x86/include/asm/segment.h. const ( userCS = 0x33 // guest ring 3 code selector user32CS = 0x23 // guest ring 3 32 bit code selector userDS = 0x2b // guest ring 3 data selector _FS_TLS_SEL = 0x63 // Linux FS thread-local storage selector _GS_TLS_SEL = 0x6b // Linux GS thread-local storage selector ) var ( // TrapInstruction is the x86 trap instruction. TrapInstruction = [1]byte{0xcc} // CPUIDInstruction is the x86 CPUID instruction. CPUIDInstruction = [2]byte{0xf, 0xa2} // X86TrapFlag is an exported const for use by other packages. X86TrapFlag uint64 = (1 << 8) ) // Proto returns a protobuf representation of the system registers in State. func (s State) Proto() *rpb.Registers { regs := &rpb.AMD64Registers{ Rax: s.Regs.Rax, Rbx: s.Regs.Rbx, Rcx: s.Regs.Rcx, Rdx: s.Regs.Rdx, Rsi: s.Regs.Rsi, Rdi: s.Regs.Rdi, Rsp: s.Regs.Rsp, Rbp: s.Regs.Rbp, R8: s.Regs.R8, R9: s.Regs.R9, R10: s.Regs.R10, R11: s.Regs.R11, R12: s.Regs.R12, R13: s.Regs.R13, R14: s.Regs.R14, R15: s.Regs.R15, Rip: s.Regs.Rip, Rflags: s.Regs.Eflags, OrigRax: s.Regs.Orig_rax, Cs: s.Regs.Cs, Ds: s.Regs.Ds, Es: s.Regs.Es, Fs: s.Regs.Fs, Gs: s.Regs.Gs, Ss: s.Regs.Ss, FsBase: s.Regs.Fs_base, GsBase: s.Regs.Gs_base, } return &rpb.Registers{Arch: &rpb.Registers_Amd64{Amd64: regs}} } // Fork creates and returns an identical copy of the state. func (s *State) Fork() State { return State{ Regs: s.Regs, fpState: s.fpState.Fork(), } } // StateData implements Context.StateData. func (s *State) StateData() *State { return s } // SingleStep implements Context.SingleStep. func (s *State) SingleStep() bool { return s.Regs.Eflags&X86TrapFlag != 0 } // SetSingleStep enables single stepping. func (s *State) SetSingleStep() { // Set the trap flag. s.Regs.Eflags |= X86TrapFlag } // ClearSingleStep enables single stepping. func (s *State) ClearSingleStep() { // Clear the trap flag. s.Regs.Eflags &= ^X86TrapFlag } // RegisterMap returns a map of all registers. func (s *State) RegisterMap() (map[string]uintptr, error) { return map[string]uintptr{ "R15": uintptr(s.Regs.R15), "R14": uintptr(s.Regs.R14), "R13": uintptr(s.Regs.R13), "R12": uintptr(s.Regs.R12), "Rbp": uintptr(s.Regs.Rbp), "Rbx": uintptr(s.Regs.Rbx), "R11": uintptr(s.Regs.R11), "R10": uintptr(s.Regs.R10), "R9": uintptr(s.Regs.R9), "R8": uintptr(s.Regs.R8), "Rax": uintptr(s.Regs.Rax), "Rcx": uintptr(s.Regs.Rcx), "Rdx": uintptr(s.Regs.Rdx), "Rsi": uintptr(s.Regs.Rsi), "Rdi": uintptr(s.Regs.Rdi), "Orig_rax": uintptr(s.Regs.Orig_rax), "Rip": uintptr(s.Regs.Rip), "Cs": uintptr(s.Regs.Cs), "Eflags": uintptr(s.Regs.Eflags), "Rsp": uintptr(s.Regs.Rsp), "Ss": uintptr(s.Regs.Ss), "Fs_base": uintptr(s.Regs.Fs_base), "Gs_base": uintptr(s.Regs.Gs_base), "Ds": uintptr(s.Regs.Ds), "Es": uintptr(s.Regs.Es), "Fs": uintptr(s.Regs.Fs), "Gs": uintptr(s.Regs.Gs), }, nil } // PtraceGetRegs implements Context.PtraceGetRegs. 
func (s *State) PtraceGetRegs(dst io.Writer) (int, error) { regs := s.ptraceGetRegs() n, err := regs.WriteTo(dst) return int(n), err } func (s *State) ptraceGetRegs() Registers { regs := s.Regs // As an optimization, Linux <4.7 implements 32-bit fs_base/gs_base // addresses using reserved descriptors in the GDT instead of the MSRs, // with selector values FS_TLS_SEL and GS_TLS_SEL respectively. These // values are actually visible in struct user_regs_struct::fs/gs; // arch/x86/kernel/ptrace.c:getreg() doesn't attempt to sanitize struct // thread_struct::fsindex/gsindex. // // We always use fs == gs == 0 when fs_base/gs_base is in use, for // simplicity. // // Luckily, Linux <4.7 silently ignores setting fs/gs to 0 via // arch/x86/kernel/ptrace.c:set_segment_reg() when fs_base/gs_base is a // 32-bit value and fsindex/gsindex indicates that this optimization is // in use, as well as the reverse case of setting fs/gs to // FS/GS_TLS_SEL when fs_base/gs_base is a 64-bit value. (We do the // same in PtraceSetRegs.) // // TODO(gvisor.dev/issue/168): Remove this fixup since newer Linux // doesn't have this behavior anymore. if regs.Fs == 0 && regs.Fs_base <= 0xffffffff { regs.Fs = _FS_TLS_SEL } if regs.Gs == 0 && regs.Gs_base <= 0xffffffff { regs.Gs = _GS_TLS_SEL } return regs } var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes() // PtraceSetRegs implements Context.PtraceSetRegs. func (s *State) PtraceSetRegs(src io.Reader) (int, error) { var regs Registers buf := make([]byte, ptraceRegistersSize) if _, err := io.ReadFull(src, buf); err != nil { return 0, err } regs.UnmarshalUnsafe(buf) // Truncate segment registers to 16 bits. regs.Cs = uint64(uint16(regs.Cs)) regs.Ds = uint64(uint16(regs.Ds)) regs.Es = uint64(uint16(regs.Es)) regs.Fs = uint64(uint16(regs.Fs)) regs.Gs = uint64(uint16(regs.Gs)) regs.Ss = uint64(uint16(regs.Ss)) // In Linux this validation is via arch/x86/kernel/ptrace.c:putreg(). if !isUserSegmentSelector(regs.Cs) { return 0, unix.EIO } if regs.Ds != 0 && !isUserSegmentSelector(regs.Ds) { return 0, unix.EIO } if regs.Es != 0 && !isUserSegmentSelector(regs.Es) { return 0, unix.EIO } if regs.Fs != 0 && !isUserSegmentSelector(regs.Fs) { return 0, unix.EIO } if regs.Gs != 0 && !isUserSegmentSelector(regs.Gs) { return 0, unix.EIO } if !isUserSegmentSelector(regs.Ss) { return 0, unix.EIO } if !isValidSegmentBase(regs.Fs_base) { return 0, unix.EIO } if !isValidSegmentBase(regs.Gs_base) { return 0, unix.EIO } // CS and SS are validated, but changes to them are otherwise silently // ignored on amd64. regs.Cs = s.Regs.Cs regs.Ss = s.Regs.Ss // fs_base/gs_base changes reset fs/gs via do_arch_prctl() on Linux. if regs.Fs_base != s.Regs.Fs_base { regs.Fs = 0 } if regs.Gs_base != s.Regs.Gs_base { regs.Gs = 0 } // Ignore "stale" TLS segment selectors for FS and GS. See comment in // ptraceGetRegs. if regs.Fs == _FS_TLS_SEL && regs.Fs_base != 0 { regs.Fs = 0 } if regs.Gs == _GS_TLS_SEL && regs.Gs_base != 0 { regs.Gs = 0 } regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable) s.Regs = regs return ptraceRegistersSize, nil } // isUserSegmentSelector returns true if the given segment selector specifies a // privilege level of 3 (USER_RPL). func isUserSegmentSelector(reg uint64) bool { return reg&3 == 3 } // isValidSegmentBase returns true if the given segment base specifies a // canonical user address. func isValidSegmentBase(reg uint64) bool { return reg < uint64(maxAddr64) } // Register sets defined in include/uapi/linux/elf.h. 
const ( _NT_PRSTATUS = 1 _NT_PRFPREG = 2 _NT_X86_XSTATE = 0x202 ) // PtraceGetRegSet implements Context.PtraceGetRegSet. func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int, fs cpuid.FeatureSet) (int, error) { switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { return 0, linuxerr.EFAULT } return s.PtraceGetRegs(dst) case _NT_PRFPREG: return s.fpState.PtraceGetFPRegs(dst, maxlen) case _NT_X86_XSTATE: return s.fpState.PtraceGetXstateRegs(dst, maxlen, fs) default: return 0, linuxerr.EINVAL } } // PtraceSetRegSet implements Context.PtraceSetRegSet. func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int, fs cpuid.FeatureSet) (int, error) { switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { return 0, linuxerr.EFAULT } return s.PtraceSetRegs(src) case _NT_PRFPREG: return s.fpState.PtraceSetFPRegs(src, maxlen) case _NT_X86_XSTATE: return s.fpState.PtraceSetXstateRegs(src, maxlen, fs) default: return 0, linuxerr.EINVAL } } // FullRestore indicates whether a full restore is required. func (s *State) FullRestore() bool { // A fast system call return is possible only if // // * RCX matches the instruction pointer. // * R11 matches our flags value. // * Usermode does not expect to set either the resume flag or the // virtual mode flags (unlikely.) // * CS and SS are set to the standard selectors. // // That is, SYSRET results in the correct final state. fastRestore := s.Regs.Rcx == s.Regs.Rip && s.Regs.Eflags == s.Regs.R11 && (s.Regs.Eflags&eflagsRF == 0) && (s.Regs.Eflags&eflagsVM == 0) && s.Regs.Cs == userCS && s.Regs.Ss == userDS return !fastRestore } // New returns a new architecture context. func New(arch Arch) *Context64 { switch arch { case AMD64: return &Context64{ State{ fpState: fpu.NewState(), // Set initial registers for compatibility with Linux // (as done in arch/x86/kernel/process_64.c:start_thread()). Regs: Registers{ PtraceRegs: linux.PtraceRegs{ Eflags: eflagsIF, Cs: userCS, Ss: userDS, }, }, }, } } panic(fmt.Sprintf("unknown architecture %v", arch)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_x86_abi_autogen_unsafe.go000066400000000000000000000010161465435605700275320ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build (amd64 || 386) && (amd64 || 386) // +build amd64 386 // +build amd64 386 package arch import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_x86_impl.go000066400000000000000000000021511465435605700246560ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
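// A minimal usage sketch (illustrative only; it assumes nothing beyond the
// exported names of this package shown above, e.g. New and StateData in
// arch_x86.go): creating an amd64 context and reading the initial register
// state that New sets up.
//
//	ctx := arch.New(arch.AMD64)      // from outside the arch package
//	regs := ctx.StateData().Regs     // Eflags has IF set; Cs/Ss are the user selectors.
//	_ = regs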
//go:build (amd64 || 386) && !false // +build amd64 386 // +build !false package arch import ( "context" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" ) // State contains the common architecture bits for X86 (the build tag of this // file ensures it's only built on x86). // // +stateify savable type State struct { // The system registers. Regs Registers // Our floating point state. fpState fpu.State `state:"wait"` } // afterLoad is invoked by stateify. func (s *State) afterLoad(context.Context) { s.afterLoadFPState() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_x86_impl_abi_autogen_unsafe.go000066400000000000000000000010031465435605700305470ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build (amd64 || 386) && !false // +build amd64 386 // +build !false package arch import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_x86_impl_state_autogen.go000066400000000000000000000015051465435605700276020ustar00rootroot00000000000000// automatically generated by stateify. //go:build (amd64 || 386) && !false // +build amd64 386 // +build !false package arch import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (s *State) StateTypeName() string { return "pkg/sentry/arch.State" } func (s *State) StateFields() []string { return []string{ "Regs", "fpState", } } func (s *State) beforeSave() {} // +checklocksignore func (s *State) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.Regs) stateSinkObject.Save(1, &s.fpState) } // +checklocksignore func (s *State) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.Regs) stateSourceObject.LoadWait(1, &s.fpState) stateSourceObject.AfterLoad(func() { s.afterLoad(ctx) }) } func init() { state.Register((*State)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/arch_x86_state_autogen.go000066400000000000000000000014321465435605700265600ustar00rootroot00000000000000// automatically generated by stateify. //go:build (amd64 || 386) && (amd64 || 386) // +build amd64 386 // +build amd64 386 package arch import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *Registers) StateTypeName() string { return "pkg/sentry/arch.Registers" } func (r *Registers) StateFields() []string { return []string{ "PtraceRegs", } } func (r *Registers) beforeSave() {} // +checklocksignore func (r *Registers) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.PtraceRegs) } func (r *Registers) afterLoad(context.Context) {} // +checklocksignore func (r *Registers) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.PtraceRegs) } func init() { state.Register((*Registers)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/auxv.go000066400000000000000000000015511465435605700232010ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package arch import ( "gvisor.dev/gvisor/pkg/hostarch" ) // An AuxEntry represents an entry in an ELF auxiliary vector. // // +stateify savable type AuxEntry struct { Key uint64 Value hostarch.Addr } // An Auxv represents an ELF auxiliary vector. type Auxv []AuxEntry golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/000077500000000000000000000000001465435605700224575ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu.go000066400000000000000000000025111465435605700235770ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package fpu provides basic floating point helpers. package fpu import ( "fmt" ) // State represents floating point state. // // This is a simple byte slice, but may have architecture-specific methods // attached to it. type State []byte // ErrLoadingState indicates a failed restore due to unusable floating point // state. type ErrLoadingState struct { // supported is the supported floating point state. supportedFeatures uint64 // saved is the saved floating point state. savedFeatures uint64 } // Error returns a sensible description of the restore error. func (e ErrLoadingState) Error() string { return fmt.Sprintf("floating point state contains unsupported features; supported: %#x saved: %#x", e.supportedFeatures, e.savedFeatures) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_abi_autogen_unsafe.go000066400000000000000000000001431465435605700274740ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package fpu import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_amd64.go000066400000000000000000000330001465435605700245670ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
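// A minimal usage sketch of the amd64 FPU state API defined below
// (illustrative only; the names refer to functions in this file and fpu.go):
//
//	fpu.InitHostState()      // detect host XSAVE size/features once
//	s := fpu.NewState()      // allocate initialized, host-sized state
//	child := s.Fork()        // independent copy, e.g. for a forked task
//	s.SetMXCSR(s.GetMXCSR()) // MXCSR accessors read/write the FXSAVE area
//	_ = child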
//go:build amd64 || i386 // +build amd64 i386 package fpu import ( "fmt" "io" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/sync" ) // FPSoftwareFrame is equivalent to struct _fpx_sw_bytes, the data stored by // Linux in bytes 464:511 of the fxsave/xsave frame. // // +marshal type FPSoftwareFrame struct { Magic1 uint32 ExtendedSize uint32 Xfeatures uint64 XstateSize uint32 Padding [7]uint32 } // From Linux's arch/x86/include/uapi/asm/sigcontext.h. const ( // FP_XSTATE_MAGIC1 is the value of FPSoftwareFrame.Magic1. FP_XSTATE_MAGIC1 = 0x46505853 // FP_SW_FRAME_OFFSET is the offset of FPSoftwareFrame in the // fxsave/xsave area. FP_SW_FRAME_OFFSET = 464 // FP_XSTATE_MAGIC2 is the value written to the 4 bytes inserted by // Linux after the fxsave/xsave area in the signal frame. FP_XSTATE_MAGIC2 = 0x46505845 // FP_XSTATE_MAGIC2_SIZE is the size of FP_XSTATE_MAGIC2. FP_XSTATE_MAGIC2_SIZE = 4 ) // From Linux's arch/x86/include/asm/fpu/types.h. const ( // XFEATURE_MASK_FPSSE is xsave features that are always enabled in // signal frame fpstate. XFEATURE_MASK_FPSSE = 0x3 // FXSAVE_AREA_SIZE is the size of the FXSAVE area. FXSAVE_AREA_SIZE = 512 ) // initX86FPState (defined in asm files) sets up initial state. func initX86FPState(data *byte, useXsave bool) func newX86FPStateSlice() State { maxsize, align := cpuid.HostFeatureSet().ExtendedStateSize() // We need capacity to be large enough to hold AMX bytes because of // ptrace. PTRACE_SETREGSET/GETREGSET assume that AMX portions should // always be used. // TODO(gvisor.dev/issues/9896): Implement AMX Support. capacity := maxsize + FP_XSTATE_MAGIC2_SIZE size := maxsize - cpuid.HostFeatureSet().AMXExtendedStateSize() // Always use at least 4096 bytes. // // For the KVM platform, this state is a fixed 4096 bytes, so make sure // that the underlying array is at _least_ that size otherwise we will // corrupt random memory. This is not a pleasant thing to debug. if capacity < 4096 { capacity = 4096 } return alignedBytes(capacity, align)[:size+FP_XSTATE_MAGIC2_SIZE] } // Slice returns the byte array that contains only the fpu state. `s` has the // fpu state and FP_XSTATE_MAGIC2. func (s State) Slice() []byte { return s[:len(s)-FP_XSTATE_MAGIC2_SIZE] } // NewState returns an initialized floating point state. // // The returned state is large enough to store all floating point state // supported by host, even if the app won't use much of it due to a restricted // FeatureSet. Since they may still be able to see state not advertised by // CPUID we must ensure it does not contain any sentry state. func NewState() State { f := newX86FPStateSlice() initX86FPState(&f[0], cpuid.HostFeatureSet().UseXsave()) return f } // Fork creates and returns an identical copy of the x86 floating point state. func (s *State) Fork() State { n := newX86FPStateSlice() copy(n, *s) return n } // Reset resets s to its initial state. func (s *State) Reset() { f := *s clear(f) initX86FPState(&f[0], cpuid.HostFeatureSet().UseXsave()) } var ( hostXCR0Mask uint64 hostFPSize uint hostUseXsave bool initHostStateOnce sync.Once ) // InitHostState initializes host parameters. func InitHostState() { initHostStateOnce.Do(func() { featureSet := cpuid.HostFeatureSet() hostXCR0Mask = featureSet.ValidXCR0Mask() hostUseXsave = featureSet.UseXsave() hostFPSize, _ = featureSet.ExtendedStateSize() // TODO(gvisor.dev/issues/9896): Implement AMX Support. 
hostFPSize = hostFPSize - featureSet.AMXExtendedStateSize() }) } // ptraceFPRegsSize is the size in bytes of Linux's user_i387_struct, the type // manipulated by PTRACE_GETFPREGS and PTRACE_SETFPREGS on x86. Equivalently, // ptraceFPRegsSize is the size in bytes of the x86 FXSAVE area. const ptraceFPRegsSize = 512 // PtraceGetFPRegs implements Context.PtraceGetFPRegs. func (s *State) PtraceGetFPRegs(dst io.Writer, maxlen int) (int, error) { if maxlen < ptraceFPRegsSize { return 0, linuxerr.EFAULT } return dst.Write((*s)[:ptraceFPRegsSize]) } // PtraceSetFPRegs implements Context.PtraceSetFPRegs. func (s *State) PtraceSetFPRegs(src io.Reader, maxlen int) (int, error) { if maxlen < ptraceFPRegsSize { return 0, linuxerr.EFAULT } var f [ptraceFPRegsSize]byte n, err := io.ReadFull(src, f[:]) if err != nil { return 0, err } // Force reserved bits in MXCSR to 0. This is consistent with Linux. sanitizeMXCSR(State(f[:])) // N.B. this only copies the beginning of the FP state, which // corresponds to the FXSAVE area. copy(*s, f[:]) return n, nil } const ( // mxcsrOffset is the offset in bytes of the MXCSR field from the start of // the FXSAVE area. (Intel SDM Vol. 1, Table 10-2 "Format of an FXSAVE // Area") mxcsrOffset = 24 // mxcsrMaskOffset is the offset in bytes of the MXCSR_MASK field from the // start of the FXSAVE area. mxcsrMaskOffset = 28 ) const ( // minXstateBytes is the minimum size in bytes of an x86 XSAVE area, equal // to the size of the XSAVE legacy area (512 bytes) plus the size of the // XSAVE header (64 bytes). Equivalently, minXstateBytes is GDB's // X86_XSTATE_SSE_SIZE. minXstateBytes = 512 + 64 // userXstateXCR0Offset is the offset in bytes of the USER_XSTATE_XCR0_WORD // field in Linux's struct user_xstateregs, which is the type manipulated // by ptrace(PTRACE_GET/SETREGSET, NT_X86_XSTATE). Equivalently, // userXstateXCR0Offset is GDB's I386_LINUX_XSAVE_XCR0_OFFSET. userXstateXCR0Offset = 464 // xstateBVOffset is the offset in bytes of the XSTATE_BV field in an x86 // XSAVE area. xstateBVOffset = 512 xcompBVOffset = 520 // xsaveHeaderZeroedOffset and xsaveHeaderZeroedBytes indicate parts of the // XSAVE header that we coerce to zero: "Bytes 15:8 of the XSAVE header is // a state-component bitmap called XCOMP_BV. ... Bytes 63:16 of the XSAVE // header are reserved." - Intel SDM Vol. 1, Section 13.4.2 "XSAVE Header". // Linux ignores XCOMP_BV, but it's able to recover from XRSTOR #GP // exceptions resulting from invalid values; we aren't. Linux also never // uses the compacted format when doing XSAVE and doesn't even define the // compaction extensions to XSAVE as a CPU feature, so for simplicity we // assume no one is using them. xsaveHeaderZeroedOffset = 512 + 8 xsaveHeaderZeroedBytes = 64 - 8 ) // PtraceGetXstateRegs implements ptrace(PTRACE_GETREGS, NT_X86_XSTATE) by // writing the floating point registers from this state to dst and returning the // number of bytes written, which must be less than or equal to maxlen. func (s *State) PtraceGetXstateRegs(dst io.Writer, maxlen int, featureSet cpuid.FeatureSet) (int, error) { // N.B. s.x86FPState may contain more state than the application // expects. We only copy the subset that would be in their XSAVE area. ess, _ := featureSet.ExtendedStateSize() f := make([]byte, ess) copy(f, *s) // "The XSAVE feature set does not use bytes 511:416; bytes 463:416 are // reserved." - Intel SDM Vol 1., Section 13.4.1 "Legacy Region of an XSAVE // Area". Linux uses the first 8 bytes of this area to store the OS XSTATE // mask. 
GDB relies on this: see // gdb/x86-linux-nat.c:x86_linux_read_description(). hostarch.ByteOrder.PutUint64(f[userXstateXCR0Offset:], featureSet.ValidXCR0Mask()) if len(f) > maxlen { f = f[:maxlen] } return dst.Write(f) } // PtraceSetXstateRegs implements ptrace(PTRACE_SETREGS, NT_X86_XSTATE) by // reading floating point registers from src and returning the number of bytes // read, which must be less than or equal to maxlen. func (s *State) PtraceSetXstateRegs(src io.Reader, maxlen int, featureSet cpuid.FeatureSet) (int, error) { // Allow users to pass an xstate register set smaller than ours (they can // mask bits out of XSTATE_BV), as long as it's at least minXstateBytes. // Also allow users to pass a register set larger than ours; anything after // their ExtendedStateSize will be ignored. (I think Linux technically // permits setting a register set smaller than minXstateBytes, but it has // the same silent truncation behavior in kernel/ptrace.c:ptrace_regset().) if maxlen < minXstateBytes { return 0, unix.EFAULT } ess, _ := featureSet.ExtendedStateSize() if maxlen > int(ess) { maxlen = int(ess) } f := make([]byte, maxlen) if _, err := io.ReadFull(src, f); err != nil { return 0, err } n := copy(*s, f) s.SanitizeUser(featureSet) return n, nil } // SanitizeUser mutates s to ensure that restoring it is safe. func (s *State) SanitizeUser(featureSet cpuid.FeatureSet) { f := *s // Force reserved bits in MXCSR to 0. This is consistent with Linux. sanitizeMXCSR(f) if len(f) >= minXstateBytes { // Users can't enable *more* XCR0 bits than what we, and the CPU, support. xstateBV := hostarch.ByteOrder.Uint64(f[xstateBVOffset:]) xstateBV &= featureSet.ValidXCR0Mask() hostarch.ByteOrder.PutUint64(f[xstateBVOffset:], xstateBV) // Force XCOMP_BV and reserved bytes in the XSAVE header to 0. reserved := f[xsaveHeaderZeroedOffset : xsaveHeaderZeroedOffset+xsaveHeaderZeroedBytes] clear(reserved) } } var ( mxcsrMask uint32 initMXCSRMask sync.Once ) // sanitizeMXCSR coerces reserved bits in the MXCSR field of f to 0. ("FXRSTOR // generates a general-protection fault (#GP) in response to an attempt to set // any of the reserved bits of the MXCSR register." - Intel SDM Vol. 1, Section // 10.5.1.2 "SSE State") func sanitizeMXCSR(f State) { mxcsr := hostarch.ByteOrder.Uint32(f[mxcsrOffset:]) initMXCSRMask.Do(func() { temp := State(alignedBytes(uint(ptraceFPRegsSize), 16)) initX86FPState(&temp[0], false /* useXsave */) mxcsrMask = hostarch.ByteOrder.Uint32(temp[mxcsrMaskOffset:]) if mxcsrMask == 0 { // "If the value of the MXCSR_MASK field is 00000000H, then the // MXCSR_MASK value is the default value of 0000FFBFH." - Intel SDM // Vol. 1, Section 11.6.6 "Guidelines for Writing to the MXCSR // Register" mxcsrMask = 0xffbf } }) mxcsr &= mxcsrMask hostarch.ByteOrder.PutUint32(f[mxcsrOffset:], mxcsr) } // SetMXCSR sets the MXCSR control/status register in the state. func (s *State) SetMXCSR(mxcsr uint32) { hostarch.ByteOrder.PutUint32((*s)[mxcsrOffset:], mxcsr) } // GetMXCSR gets the MXCSR control/status register in the state. func (s *State) GetMXCSR() uint32 { return hostarch.ByteOrder.Uint32((*s)[mxcsrOffset:]) } // BytePointer returns a pointer to the first byte of the state. // //go:nosplit func (s *State) BytePointer() *byte { return &(*s)[0] } // XSTATE_BV does not exist if FXSAVE is used, but FXSAVE implicitly saves x87 // and SSE state, so this is the equivalent XSTATE_BV value. 
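// Concretely, XSAVEFeatureX87|XSAVEFeatureSSE matches XFEATURE_MASK_FPSSE
// (0x3) defined above: bits 0 (x87) and 1 (SSE) are the only state
// components an FXSAVE-only CPU can have in use, so they are treated as
// always present when no XSAVE header exists to say otherwise.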
const fxsaveBV uint64 = cpuid.XSAVEFeatureX87 | cpuid.XSAVEFeatureSSE // AfterLoad converts the loaded state to the format that compatible with the // current processor. func (s *State) AfterLoad() { old := s.Slice() // Recreate the slice. This is done to ensure that it is aligned // appropriately in memory, and large enough to accommodate any new // state that may be saved by the new CPU. Even if extraneous new state // is saved, the state we care about is guaranteed to be a subset of // new state. Later optimizations can use less space when using a // smaller state component bitmap. Intel SDM Volume 1 Chapter 13 has // more info. *s = NewState() // x86FPState always contains all the FP state supported by the host. // We may have come from a newer machine that supports additional state // which we cannot restore. // // The x86 FP state areas are backwards compatible, so we can simply // truncate the additional floating point state. // // Applications should not depend on the truncated state because it // should relate only to features that were not exposed in the app // FeatureSet. However, because we do not *prevent* them from using // this state, we must verify here that there is no in-use state // (according to XSTATE_BV) which we do not support. // What do we support? supportedBV := fxsaveBV hostFeatureSet := cpuid.HostFeatureSet() if hostFeatureSet.UseXsave() { supportedBV = hostFeatureSet.ValidXCR0Mask() } // What was in use? savedBV := fxsaveBV if len(old) >= xstateBVOffset+8 { savedBV = hostarch.ByteOrder.Uint64(old[xstateBVOffset:]) } // Supported features must be a superset of saved features. if savedBV&^supportedBV != 0 { panic(ErrLoadingState{supportedFeatures: supportedBV, savedFeatures: savedBV}) } // Copy to the new, aligned location. copy(*s, old) mxcsrBefore := s.GetMXCSR() sanitizeMXCSR(*s) mxcsrAfter := s.GetMXCSR() if mxcsrBefore != mxcsrAfter { panic(fmt.Sprintf("incompatible mxcsr value: %x (%x)", mxcsrBefore, mxcsrAfter)) } if hostFeatureSet.UseXsave() { if err := safecopy.CheckXstate(s.BytePointer()); err != nil { xcompBV := uint64(0) if len(old) >= xcompBVOffset+8 { xcompBV = hostarch.ByteOrder.Uint64(old[xcompBVOffset:]) } panic(fmt.Sprintf("incompatible state: %s\nlen(old)=%d len(new)=%d supportedBV=%#x XSTATE_BV=%#x XCOMP_BV=%#x", err, len(old), len(*s), supportedBV, savedBV, xcompBV)) } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_amd64.s000066400000000000000000000101031465435605700244230ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" // MXCSR_DEFAULT is the reset value of MXCSR (Intel SDM Vol. 2, Ch. 3.2 // "LDMXCSR") #define MXCSR_DEFAULT 0x1f80 // MXCSR_OFFSET is the offset in bytes of the MXCSR field from the start of the // FXSAVE/XSAVE area. (Intel SDM Vol. 1, Table 10-2 "Format of an FXSAVE Area") #define MXCSR_OFFSET 24 // The value for XCR0 is defined to xsave/xrstor everything except for PKRU and // AMX regions. 
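// Bit 9 of XCR0 is the PKRU state component and bits 17 and 18 are the AMX
// TILECFG and TILEDATA components; XCR0_EAX/XCR0_EDX below form the EDX:EAX
// requested-feature bitmap passed to XSAVE64/XRSTOR64, with those bits
// cleared.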
// TODO(gvisor.dev/issues/9896): Implement AMX support. // TODO(gvisor.dev/issues/10087): Implement PKRU support. #define XCR0_DISABLED_MASK ((1 << 9) | (1 << 17) | (1 << 18)) #define XCR0_EAX (0xffffffff ^ XCR0_DISABLED_MASK) #define XCR0_EDX 0xffffffff // initX86FPState initializes floating point state. // // func initX86FPState(data *FloatingPointData, useXsave bool) // // We need to clear out and initialize an empty fp state area since the sentry, // or any previous loader, may have left sensitive information in the floating // point registers. // // Preconditions: data is zeroed. TEXT ·initX86FPState(SB), $24-9 // Save MXCSR (callee-save) STMXCSR mxcsr-8(SP) // Save x87 CW (callee-save) FSTCW cw-16(SP) MOVQ data+0(FP), DI // Do we use xsave? MOVBQZX useXsave+8(FP), AX TESTQ AX, AX JZ no_xsave // Use XRSTOR to clear all FP state to an initial state. // // The fpState XSAVE area is zeroed on function entry, meaning // XSTATE_BV is zero. // // "If RFBM[i] = 1 and bit i is clear in the XSTATE_BV field in the // XSAVE header, XRSTOR initializes state component i." // // Initialization is defined in SDM Vol 1, Chapter 13.3. It puts all // the registers in a reasonable initial state, except MXCSR: // // "The MXCSR register is part of state component 1, SSE state (see // Section 13.5.2). However, the standard form of XRSTOR loads the // MXCSR register from memory whenever the RFBM[1] (SSE) or RFBM[2] // (AVX) is set, regardless of the values of XSTATE_BV[1] and // XSTATE_BV[2]." // Set MXCSR to the default value. MOVL $MXCSR_DEFAULT, MXCSR_OFFSET(DI) // Initialize registers with XRSTOR. MOVL $XCR0_EAX, AX MOVL $XCR0_EDX, DX BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f // XRSTOR64 0(DI) // Now that all the state has been reset, write it back out to the // XSAVE area. BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27 // XSAVE64 0(DI) JMP out no_xsave: // Clear out existing X values. PXOR X0, X0 MOVO X0, X1 MOVO X0, X2 MOVO X0, X3 MOVO X0, X4 MOVO X0, X5 MOVO X0, X6 MOVO X0, X7 MOVO X0, X8 MOVO X0, X9 MOVO X0, X10 MOVO X0, X11 MOVO X0, X12 MOVO X0, X13 MOVO X0, X14 MOVO X0, X15 // Zero out %rax and store into MMX registers. MMX registers are // an alias of 8x64 bits of the 8x80 bits used for the original // x87 registers. Storing zero into them will reset the FPU registers // to bits [63:0] = 0, [79:64] = 1. But the contents aren't too // important, just the fact that we have reset them to a known value. XORQ AX, AX MOVQ AX, M0 MOVQ AX, M1 MOVQ AX, M2 MOVQ AX, M3 MOVQ AX, M4 MOVQ AX, M5 MOVQ AX, M6 MOVQ AX, M7 // The Go assembler doesn't support FNINIT, so we use BYTE. // This will: // - Reset FPU control word to 0x037f // - Clear FPU status word // - Reset FPU tag word to 0xffff // - Clear FPU data pointer // - Clear FPU instruction pointer BYTE $0xDB; BYTE $0xE3; // FNINIT // Reset MXCSR. MOVL $MXCSR_DEFAULT, tmpmxcsr-24(SP) LDMXCSR tmpmxcsr-24(SP) // Save the floating point state with fxsave. FXSAVE64 0(DI) out: // Restore MXCSR. LDMXCSR mxcsr-8(SP) // Restore x87 CW. FLDCW cw-16(SP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_amd64_abi_autogen_unsafe.go000066400000000000000000000116731465435605700305010ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. 
This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build amd64 || i386 // +build amd64 i386 package fpu import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*FPSoftwareFrame)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. func (f *FPSoftwareFrame) SizeBytes() int { return 20 + 4*7 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (f *FPSoftwareFrame) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Magic1)) dst = dst[4:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.ExtendedSize)) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(f.Xfeatures)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.XstateSize)) dst = dst[4:] for idx := 0; idx < 7; idx++ { hostarch.ByteOrder.PutUint32(dst[:4], uint32(f.Padding[idx])) dst = dst[4:] } return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (f *FPSoftwareFrame) UnmarshalBytes(src []byte) []byte { f.Magic1 = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.ExtendedSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] f.Xfeatures = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] f.XstateSize = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] for idx := 0; idx < 7; idx++ { f.Padding[idx] = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (f *FPSoftwareFrame) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (f *FPSoftwareFrame) MarshalUnsafe(dst []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(f), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (f *FPSoftwareFrame) UnmarshalUnsafe(src []byte) []byte { size := f.SizeBytes() gohacks.Memmove(unsafe.Pointer(f), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (f *FPSoftwareFrame) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (f *FPSoftwareFrame) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyOutN(cc, addr, f.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (f *FPSoftwareFrame) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
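	// The reflect.SliceHeader above aliases f's memory without holding an
	// ordinary Go reference to it, so nothing here keeps f reachable by
	// itself; the runtime.KeepAlive below is what stops the garbage
	// collector from reclaiming f while the copy above may still be using
	// that memory.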
// Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (f *FPSoftwareFrame) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return f.CopyInN(cc, addr, f.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (f *FPSoftwareFrame) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(f))) hdr.Len = f.SizeBytes() hdr.Cap = f.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that f // must live until the use above. runtime.KeepAlive(f) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_amd64_state_autogen.go000066400000000000000000000001441465435605700275140ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 || i386 // +build amd64 i386 package fpu golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_amd64_unsafe.go000066400000000000000000000024731465435605700261420ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || i386 // +build amd64 i386 package fpu import ( "unsafe" ) // PrepForHostSigframe prepare the SW reserved portion of the fxsave memory // layout and adds FP_XSTATE_MAGIC2. It has to be called if the state is // restored by rt_sigreturn. // // Look at save_xstate_epilog in the kernel sources for more details. // //go:nosplit func (s State) PrepForHostSigframe() { fpsw := (*FPSoftwareFrame)(unsafe.Pointer(&s[FP_SW_FRAME_OFFSET])) fpsw.Magic1 = FP_XSTATE_MAGIC1 fpsw.ExtendedSize = uint32(hostFPSize) + FP_XSTATE_MAGIC2_SIZE fpsw.Xfeatures = XFEATURE_MASK_FPSSE | hostXCR0Mask fpsw.XstateSize = uint32(hostFPSize) if !hostUseXsave { return } *(*uint32)(unsafe.Pointer(&s[hostFPSize])) = FP_XSTATE_MAGIC2 } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_amd64_unsafe_abi_autogen_unsafe.go000066400000000000000000000007471465435605700320420ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build amd64 || i386 // +build amd64 i386 package fpu import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_amd64_unsafe_state_autogen.go000066400000000000000000000001441465435605700310550ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build amd64 || i386 // +build amd64 i386 package fpu golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_arm64.go000066400000000000000000000035161465435605700246160ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package fpu const ( // fpsimdMagic is the magic number which is used in fpsimd_context. fpsimdMagic = 0x46508001 // fpsimdContextSize is the size of fpsimd_context. fpsimdContextSize = 0x210 ) // initAarch64FPState sets up initial state. // // Related code in Linux kernel: fpsimd_flush_thread(). // FPCR = FPCR_RM_RN (0x0 << 22). // // Currently, aarch64FPState is only a space of 0x210 length for fpstate. // The fp head is useless in sentry/ptrace/kvm. func initAarch64FPState(data *State) { } func newAarch64FPStateSlice() []byte { return alignedBytes(4096, 16)[:fpsimdContextSize] } // NewState returns an initialized floating point state. // // The returned state is large enough to store all floating point state // supported by host, even if the app won't use much of it due to a restricted // FeatureSet. func NewState() State { f := State(newAarch64FPStateSlice()) initAarch64FPState(&f) return f } // Fork creates and returns an identical copy of the aarch64 floating point state. func (s *State) Fork() State { n := State(newAarch64FPStateSlice()) copy(n, *s) return n } // BytePointer returns a pointer to the first byte of the state. // //go:nosplit func (s *State) BytePointer() *byte { return &(*s)[0] } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_arm64_abi_autogen_unsafe.go000066400000000000000000000007321465435605700305110ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build arm64 // +build arm64 package fpu import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_arm64_state_autogen.go000066400000000000000000000001271465435605700275330ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package fpu golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_state_autogen.go000066400000000000000000000000651465435605700265230ustar00rootroot00000000000000// automatically generated by stateify. package fpu golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_unsafe.go000066400000000000000000000021241465435605700251400ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fpu import ( "unsafe" ) // alignedBytes returns a slice of size bytes, aligned in memory to the given // alignment. This is used because we require certain structures to be aligned // in a specific way (for example, the X86 floating point data). func alignedBytes(size, alignment uint) []byte { data := make([]byte, size+alignment-1) offset := uint(uintptr(unsafe.Pointer(&data[0])) % uintptr(alignment)) if offset == 0 { return data[:size:size] } return data[alignment-offset:][:size:size] } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_unsafe_abi_autogen_unsafe.go000066400000000000000000000001431465435605700310350ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package fpu import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/fpu/fpu_unsafe_state_autogen.go000066400000000000000000000000651465435605700300640ustar00rootroot00000000000000// automatically generated by stateify. package fpu golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/registers_go_proto/000077500000000000000000000000001465435605700256045ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/registers_go_proto/registers.pb.go000066400000000000000000000631311465435605700305460ustar00rootroot00000000000000// Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/sentry/arch/registers.proto package registers_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. 
_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type AMD64Registers struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Rax uint64 `protobuf:"varint,1,opt,name=rax,proto3" json:"rax,omitempty"` Rbx uint64 `protobuf:"varint,2,opt,name=rbx,proto3" json:"rbx,omitempty"` Rcx uint64 `protobuf:"varint,3,opt,name=rcx,proto3" json:"rcx,omitempty"` Rdx uint64 `protobuf:"varint,4,opt,name=rdx,proto3" json:"rdx,omitempty"` Rsi uint64 `protobuf:"varint,5,opt,name=rsi,proto3" json:"rsi,omitempty"` Rdi uint64 `protobuf:"varint,6,opt,name=rdi,proto3" json:"rdi,omitempty"` Rsp uint64 `protobuf:"varint,7,opt,name=rsp,proto3" json:"rsp,omitempty"` Rbp uint64 `protobuf:"varint,8,opt,name=rbp,proto3" json:"rbp,omitempty"` R8 uint64 `protobuf:"varint,9,opt,name=r8,proto3" json:"r8,omitempty"` R9 uint64 `protobuf:"varint,10,opt,name=r9,proto3" json:"r9,omitempty"` R10 uint64 `protobuf:"varint,11,opt,name=r10,proto3" json:"r10,omitempty"` R11 uint64 `protobuf:"varint,12,opt,name=r11,proto3" json:"r11,omitempty"` R12 uint64 `protobuf:"varint,13,opt,name=r12,proto3" json:"r12,omitempty"` R13 uint64 `protobuf:"varint,14,opt,name=r13,proto3" json:"r13,omitempty"` R14 uint64 `protobuf:"varint,15,opt,name=r14,proto3" json:"r14,omitempty"` R15 uint64 `protobuf:"varint,16,opt,name=r15,proto3" json:"r15,omitempty"` Rip uint64 `protobuf:"varint,17,opt,name=rip,proto3" json:"rip,omitempty"` Rflags uint64 `protobuf:"varint,18,opt,name=rflags,proto3" json:"rflags,omitempty"` OrigRax uint64 `protobuf:"varint,19,opt,name=orig_rax,json=origRax,proto3" json:"orig_rax,omitempty"` Cs uint64 `protobuf:"varint,20,opt,name=cs,proto3" json:"cs,omitempty"` Ds uint64 `protobuf:"varint,21,opt,name=ds,proto3" json:"ds,omitempty"` Es uint64 `protobuf:"varint,22,opt,name=es,proto3" json:"es,omitempty"` Fs uint64 `protobuf:"varint,23,opt,name=fs,proto3" json:"fs,omitempty"` Gs uint64 `protobuf:"varint,24,opt,name=gs,proto3" json:"gs,omitempty"` Ss uint64 `protobuf:"varint,25,opt,name=ss,proto3" json:"ss,omitempty"` FsBase uint64 `protobuf:"varint,26,opt,name=fs_base,json=fsBase,proto3" json:"fs_base,omitempty"` GsBase uint64 `protobuf:"varint,27,opt,name=gs_base,json=gsBase,proto3" json:"gs_base,omitempty"` } func (x *AMD64Registers) Reset() { *x = AMD64Registers{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_arch_registers_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *AMD64Registers) String() string { return protoimpl.X.MessageStringOf(x) } func (*AMD64Registers) ProtoMessage() {} func (x *AMD64Registers) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_arch_registers_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use AMD64Registers.ProtoReflect.Descriptor instead. 
func (*AMD64Registers) Descriptor() ([]byte, []int) { return file_pkg_sentry_arch_registers_proto_rawDescGZIP(), []int{0} } func (x *AMD64Registers) GetRax() uint64 { if x != nil { return x.Rax } return 0 } func (x *AMD64Registers) GetRbx() uint64 { if x != nil { return x.Rbx } return 0 } func (x *AMD64Registers) GetRcx() uint64 { if x != nil { return x.Rcx } return 0 } func (x *AMD64Registers) GetRdx() uint64 { if x != nil { return x.Rdx } return 0 } func (x *AMD64Registers) GetRsi() uint64 { if x != nil { return x.Rsi } return 0 } func (x *AMD64Registers) GetRdi() uint64 { if x != nil { return x.Rdi } return 0 } func (x *AMD64Registers) GetRsp() uint64 { if x != nil { return x.Rsp } return 0 } func (x *AMD64Registers) GetRbp() uint64 { if x != nil { return x.Rbp } return 0 } func (x *AMD64Registers) GetR8() uint64 { if x != nil { return x.R8 } return 0 } func (x *AMD64Registers) GetR9() uint64 { if x != nil { return x.R9 } return 0 } func (x *AMD64Registers) GetR10() uint64 { if x != nil { return x.R10 } return 0 } func (x *AMD64Registers) GetR11() uint64 { if x != nil { return x.R11 } return 0 } func (x *AMD64Registers) GetR12() uint64 { if x != nil { return x.R12 } return 0 } func (x *AMD64Registers) GetR13() uint64 { if x != nil { return x.R13 } return 0 } func (x *AMD64Registers) GetR14() uint64 { if x != nil { return x.R14 } return 0 } func (x *AMD64Registers) GetR15() uint64 { if x != nil { return x.R15 } return 0 } func (x *AMD64Registers) GetRip() uint64 { if x != nil { return x.Rip } return 0 } func (x *AMD64Registers) GetRflags() uint64 { if x != nil { return x.Rflags } return 0 } func (x *AMD64Registers) GetOrigRax() uint64 { if x != nil { return x.OrigRax } return 0 } func (x *AMD64Registers) GetCs() uint64 { if x != nil { return x.Cs } return 0 } func (x *AMD64Registers) GetDs() uint64 { if x != nil { return x.Ds } return 0 } func (x *AMD64Registers) GetEs() uint64 { if x != nil { return x.Es } return 0 } func (x *AMD64Registers) GetFs() uint64 { if x != nil { return x.Fs } return 0 } func (x *AMD64Registers) GetGs() uint64 { if x != nil { return x.Gs } return 0 } func (x *AMD64Registers) GetSs() uint64 { if x != nil { return x.Ss } return 0 } func (x *AMD64Registers) GetFsBase() uint64 { if x != nil { return x.FsBase } return 0 } func (x *AMD64Registers) GetGsBase() uint64 { if x != nil { return x.GsBase } return 0 } type ARM64Registers struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields R0 uint64 `protobuf:"varint,1,opt,name=r0,proto3" json:"r0,omitempty"` R1 uint64 `protobuf:"varint,2,opt,name=r1,proto3" json:"r1,omitempty"` R2 uint64 `protobuf:"varint,3,opt,name=r2,proto3" json:"r2,omitempty"` R3 uint64 `protobuf:"varint,4,opt,name=r3,proto3" json:"r3,omitempty"` R4 uint64 `protobuf:"varint,5,opt,name=r4,proto3" json:"r4,omitempty"` R5 uint64 `protobuf:"varint,6,opt,name=r5,proto3" json:"r5,omitempty"` R6 uint64 `protobuf:"varint,7,opt,name=r6,proto3" json:"r6,omitempty"` R7 uint64 `protobuf:"varint,8,opt,name=r7,proto3" json:"r7,omitempty"` R8 uint64 `protobuf:"varint,9,opt,name=r8,proto3" json:"r8,omitempty"` R9 uint64 `protobuf:"varint,10,opt,name=r9,proto3" json:"r9,omitempty"` R10 uint64 `protobuf:"varint,11,opt,name=r10,proto3" json:"r10,omitempty"` R11 uint64 `protobuf:"varint,12,opt,name=r11,proto3" json:"r11,omitempty"` R12 uint64 `protobuf:"varint,13,opt,name=r12,proto3" json:"r12,omitempty"` R13 uint64 `protobuf:"varint,14,opt,name=r13,proto3" json:"r13,omitempty"` R14 uint64 
`protobuf:"varint,15,opt,name=r14,proto3" json:"r14,omitempty"` R15 uint64 `protobuf:"varint,16,opt,name=r15,proto3" json:"r15,omitempty"` R16 uint64 `protobuf:"varint,17,opt,name=r16,proto3" json:"r16,omitempty"` R17 uint64 `protobuf:"varint,18,opt,name=r17,proto3" json:"r17,omitempty"` R18 uint64 `protobuf:"varint,19,opt,name=r18,proto3" json:"r18,omitempty"` R19 uint64 `protobuf:"varint,20,opt,name=r19,proto3" json:"r19,omitempty"` R20 uint64 `protobuf:"varint,21,opt,name=r20,proto3" json:"r20,omitempty"` R21 uint64 `protobuf:"varint,22,opt,name=r21,proto3" json:"r21,omitempty"` R22 uint64 `protobuf:"varint,23,opt,name=r22,proto3" json:"r22,omitempty"` R23 uint64 `protobuf:"varint,24,opt,name=r23,proto3" json:"r23,omitempty"` R24 uint64 `protobuf:"varint,25,opt,name=r24,proto3" json:"r24,omitempty"` R25 uint64 `protobuf:"varint,26,opt,name=r25,proto3" json:"r25,omitempty"` R26 uint64 `protobuf:"varint,27,opt,name=r26,proto3" json:"r26,omitempty"` R27 uint64 `protobuf:"varint,28,opt,name=r27,proto3" json:"r27,omitempty"` R28 uint64 `protobuf:"varint,29,opt,name=r28,proto3" json:"r28,omitempty"` R29 uint64 `protobuf:"varint,30,opt,name=r29,proto3" json:"r29,omitempty"` R30 uint64 `protobuf:"varint,31,opt,name=r30,proto3" json:"r30,omitempty"` Sp uint64 `protobuf:"varint,32,opt,name=sp,proto3" json:"sp,omitempty"` Pc uint64 `protobuf:"varint,33,opt,name=pc,proto3" json:"pc,omitempty"` Pstate uint64 `protobuf:"varint,34,opt,name=pstate,proto3" json:"pstate,omitempty"` Tls uint64 `protobuf:"varint,35,opt,name=tls,proto3" json:"tls,omitempty"` } func (x *ARM64Registers) Reset() { *x = ARM64Registers{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_arch_registers_proto_msgTypes[1] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *ARM64Registers) String() string { return protoimpl.X.MessageStringOf(x) } func (*ARM64Registers) ProtoMessage() {} func (x *ARM64Registers) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_arch_registers_proto_msgTypes[1] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use ARM64Registers.ProtoReflect.Descriptor instead. 
func (*ARM64Registers) Descriptor() ([]byte, []int) { return file_pkg_sentry_arch_registers_proto_rawDescGZIP(), []int{1} } func (x *ARM64Registers) GetR0() uint64 { if x != nil { return x.R0 } return 0 } func (x *ARM64Registers) GetR1() uint64 { if x != nil { return x.R1 } return 0 } func (x *ARM64Registers) GetR2() uint64 { if x != nil { return x.R2 } return 0 } func (x *ARM64Registers) GetR3() uint64 { if x != nil { return x.R3 } return 0 } func (x *ARM64Registers) GetR4() uint64 { if x != nil { return x.R4 } return 0 } func (x *ARM64Registers) GetR5() uint64 { if x != nil { return x.R5 } return 0 } func (x *ARM64Registers) GetR6() uint64 { if x != nil { return x.R6 } return 0 } func (x *ARM64Registers) GetR7() uint64 { if x != nil { return x.R7 } return 0 } func (x *ARM64Registers) GetR8() uint64 { if x != nil { return x.R8 } return 0 } func (x *ARM64Registers) GetR9() uint64 { if x != nil { return x.R9 } return 0 } func (x *ARM64Registers) GetR10() uint64 { if x != nil { return x.R10 } return 0 } func (x *ARM64Registers) GetR11() uint64 { if x != nil { return x.R11 } return 0 } func (x *ARM64Registers) GetR12() uint64 { if x != nil { return x.R12 } return 0 } func (x *ARM64Registers) GetR13() uint64 { if x != nil { return x.R13 } return 0 } func (x *ARM64Registers) GetR14() uint64 { if x != nil { return x.R14 } return 0 } func (x *ARM64Registers) GetR15() uint64 { if x != nil { return x.R15 } return 0 } func (x *ARM64Registers) GetR16() uint64 { if x != nil { return x.R16 } return 0 } func (x *ARM64Registers) GetR17() uint64 { if x != nil { return x.R17 } return 0 } func (x *ARM64Registers) GetR18() uint64 { if x != nil { return x.R18 } return 0 } func (x *ARM64Registers) GetR19() uint64 { if x != nil { return x.R19 } return 0 } func (x *ARM64Registers) GetR20() uint64 { if x != nil { return x.R20 } return 0 } func (x *ARM64Registers) GetR21() uint64 { if x != nil { return x.R21 } return 0 } func (x *ARM64Registers) GetR22() uint64 { if x != nil { return x.R22 } return 0 } func (x *ARM64Registers) GetR23() uint64 { if x != nil { return x.R23 } return 0 } func (x *ARM64Registers) GetR24() uint64 { if x != nil { return x.R24 } return 0 } func (x *ARM64Registers) GetR25() uint64 { if x != nil { return x.R25 } return 0 } func (x *ARM64Registers) GetR26() uint64 { if x != nil { return x.R26 } return 0 } func (x *ARM64Registers) GetR27() uint64 { if x != nil { return x.R27 } return 0 } func (x *ARM64Registers) GetR28() uint64 { if x != nil { return x.R28 } return 0 } func (x *ARM64Registers) GetR29() uint64 { if x != nil { return x.R29 } return 0 } func (x *ARM64Registers) GetR30() uint64 { if x != nil { return x.R30 } return 0 } func (x *ARM64Registers) GetSp() uint64 { if x != nil { return x.Sp } return 0 } func (x *ARM64Registers) GetPc() uint64 { if x != nil { return x.Pc } return 0 } func (x *ARM64Registers) GetPstate() uint64 { if x != nil { return x.Pstate } return 0 } func (x *ARM64Registers) GetTls() uint64 { if x != nil { return x.Tls } return 0 } type Registers struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // Types that are assignable to Arch: // // *Registers_Amd64 // *Registers_Arm64 Arch isRegisters_Arch `protobuf_oneof:"arch"` } func (x *Registers) Reset() { *x = Registers{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_arch_registers_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Registers) String() string { return protoimpl.X.MessageStringOf(x) } 
func (*Registers) ProtoMessage() {} func (x *Registers) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_arch_registers_proto_msgTypes[2] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Registers.ProtoReflect.Descriptor instead. func (*Registers) Descriptor() ([]byte, []int) { return file_pkg_sentry_arch_registers_proto_rawDescGZIP(), []int{2} } func (m *Registers) GetArch() isRegisters_Arch { if m != nil { return m.Arch } return nil } func (x *Registers) GetAmd64() *AMD64Registers { if x, ok := x.GetArch().(*Registers_Amd64); ok { return x.Amd64 } return nil } func (x *Registers) GetArm64() *ARM64Registers { if x, ok := x.GetArch().(*Registers_Arm64); ok { return x.Arm64 } return nil } type isRegisters_Arch interface { isRegisters_Arch() } type Registers_Amd64 struct { Amd64 *AMD64Registers `protobuf:"bytes,1,opt,name=amd64,proto3,oneof"` } type Registers_Arm64 struct { Arm64 *ARM64Registers `protobuf:"bytes,2,opt,name=arm64,proto3,oneof"` } func (*Registers_Amd64) isRegisters_Arch() {} func (*Registers_Arm64) isRegisters_Arch() {} var File_pkg_sentry_arch_registers_proto protoreflect.FileDescriptor var file_pkg_sentry_arch_registers_proto_rawDesc = []byte{ 0x0a, 0x1f, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x61, 0x72, 0x63, 0x68, 0x2f, 0x72, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x06, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x22, 0x83, 0x04, 0x0a, 0x0e, 0x41, 0x4d, 0x44, 0x36, 0x34, 0x52, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x61, 0x78, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x61, 0x78, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x62, 0x78, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x62, 0x78, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x63, 0x78, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x63, 0x78, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x64, 0x78, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x64, 0x78, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x73, 0x69, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x73, 0x69, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x64, 0x69, 0x18, 0x06, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x64, 0x69, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x73, 0x70, 0x18, 0x07, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x73, 0x70, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x62, 0x70, 0x18, 0x08, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x62, 0x70, 0x12, 0x0e, 0x0a, 0x02, 0x72, 0x38, 0x18, 0x09, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x72, 0x38, 0x12, 0x0e, 0x0a, 0x02, 0x72, 0x39, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x72, 0x39, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x30, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x30, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x31, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x31, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x32, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x32, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x33, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x33, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x34, 0x18, 0x0f, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x34, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x35, 0x18, 0x10, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x35, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x69, 0x70, 0x18, 0x11, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x69, 0x70, 0x12, 0x16, 0x0a, 0x06, 0x72, 
0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x12, 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, 0x72, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x12, 0x19, 0x0a, 0x08, 0x6f, 0x72, 0x69, 0x67, 0x5f, 0x72, 0x61, 0x78, 0x18, 0x13, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x6f, 0x72, 0x69, 0x67, 0x52, 0x61, 0x78, 0x12, 0x0e, 0x0a, 0x02, 0x63, 0x73, 0x18, 0x14, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x63, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x64, 0x73, 0x18, 0x15, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x64, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x65, 0x73, 0x18, 0x16, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x65, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x73, 0x18, 0x17, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x66, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x67, 0x73, 0x18, 0x18, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x67, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x73, 0x73, 0x18, 0x19, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x73, 0x73, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x73, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x18, 0x1a, 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, 0x66, 0x73, 0x42, 0x61, 0x73, 0x65, 0x12, 0x17, 0x0a, 0x07, 0x67, 0x73, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x18, 0x1b, 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, 0x67, 0x73, 0x42, 0x61, 0x73, 0x65, 0x22, 0xf4, 0x04, 0x0a, 0x0e, 0x41, 0x52, 0x4d, 0x36, 0x34, 0x52, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x12, 0x0e, 0x0a, 0x02, 0x72, 0x30, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x72, 0x30, 0x12, 0x0e, 0x0a, 0x02, 0x72, 0x31, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x72, 0x31, 0x12, 0x0e, 0x0a, 0x02, 0x72, 0x32, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x72, 0x32, 0x12, 0x0e, 0x0a, 0x02, 0x72, 0x33, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x72, 0x33, 0x12, 0x0e, 0x0a, 0x02, 0x72, 0x34, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x72, 0x34, 0x12, 0x0e, 0x0a, 0x02, 0x72, 0x35, 0x18, 0x06, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x72, 0x35, 0x12, 0x0e, 0x0a, 0x02, 0x72, 0x36, 0x18, 0x07, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x72, 0x36, 0x12, 0x0e, 0x0a, 0x02, 0x72, 0x37, 0x18, 0x08, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x72, 0x37, 0x12, 0x0e, 0x0a, 0x02, 0x72, 0x38, 0x18, 0x09, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x72, 0x38, 0x12, 0x0e, 0x0a, 0x02, 0x72, 0x39, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x72, 0x39, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x30, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x30, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x31, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x31, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x32, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x32, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x33, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x33, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x34, 0x18, 0x0f, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x34, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x35, 0x18, 0x10, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x35, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x36, 0x18, 0x11, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x36, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x37, 0x18, 0x12, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x37, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x38, 0x18, 0x13, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x38, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x31, 0x39, 0x18, 0x14, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x31, 0x39, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x32, 0x30, 0x18, 0x15, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x32, 0x30, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x32, 0x31, 0x18, 0x16, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x32, 0x31, 0x12, 0x10, 0x0a, 0x03, 0x72, 
0x32, 0x32, 0x18, 0x17, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x32, 0x32, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x32, 0x33, 0x18, 0x18, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x32, 0x33, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x32, 0x34, 0x18, 0x19, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x32, 0x34, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x32, 0x35, 0x18, 0x1a, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x32, 0x35, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x32, 0x36, 0x18, 0x1b, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x32, 0x36, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x32, 0x37, 0x18, 0x1c, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x32, 0x37, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x32, 0x38, 0x18, 0x1d, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x32, 0x38, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x32, 0x39, 0x18, 0x1e, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x32, 0x39, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x33, 0x30, 0x18, 0x1f, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x72, 0x33, 0x30, 0x12, 0x0e, 0x0a, 0x02, 0x73, 0x70, 0x18, 0x20, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x73, 0x70, 0x12, 0x0e, 0x0a, 0x02, 0x70, 0x63, 0x18, 0x21, 0x20, 0x01, 0x28, 0x04, 0x52, 0x02, 0x70, 0x63, 0x12, 0x16, 0x0a, 0x06, 0x70, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x22, 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, 0x70, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x10, 0x0a, 0x03, 0x74, 0x6c, 0x73, 0x18, 0x23, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x74, 0x6c, 0x73, 0x22, 0x73, 0x0a, 0x09, 0x52, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x12, 0x2e, 0x0a, 0x05, 0x61, 0x6d, 0x64, 0x36, 0x34, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x16, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x41, 0x4d, 0x44, 0x36, 0x34, 0x52, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x48, 0x00, 0x52, 0x05, 0x61, 0x6d, 0x64, 0x36, 0x34, 0x12, 0x2e, 0x0a, 0x05, 0x61, 0x72, 0x6d, 0x36, 0x34, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x16, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x41, 0x52, 0x4d, 0x36, 0x34, 0x52, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x48, 0x00, 0x52, 0x05, 0x61, 0x72, 0x6d, 0x36, 0x34, 0x42, 0x06, 0x0a, 0x04, 0x61, 0x72, 0x63, 0x68, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_sentry_arch_registers_proto_rawDescOnce sync.Once file_pkg_sentry_arch_registers_proto_rawDescData = file_pkg_sentry_arch_registers_proto_rawDesc ) func file_pkg_sentry_arch_registers_proto_rawDescGZIP() []byte { file_pkg_sentry_arch_registers_proto_rawDescOnce.Do(func() { file_pkg_sentry_arch_registers_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_sentry_arch_registers_proto_rawDescData) }) return file_pkg_sentry_arch_registers_proto_rawDescData } var file_pkg_sentry_arch_registers_proto_msgTypes = make([]protoimpl.MessageInfo, 3) var file_pkg_sentry_arch_registers_proto_goTypes = []interface{}{ (*AMD64Registers)(nil), // 0: gvisor.AMD64Registers (*ARM64Registers)(nil), // 1: gvisor.ARM64Registers (*Registers)(nil), // 2: gvisor.Registers } var file_pkg_sentry_arch_registers_proto_depIdxs = []int32{ 0, // 0: gvisor.Registers.amd64:type_name -> gvisor.AMD64Registers 1, // 1: gvisor.Registers.arm64:type_name -> gvisor.ARM64Registers 2, // [2:2] is the sub-list for method output_type 2, // [2:2] is the sub-list for method input_type 2, // [2:2] is the sub-list for extension type_name 2, // [2:2] is the sub-list for extension extendee 0, // [0:2] is the sub-list for field type_name } func init() { file_pkg_sentry_arch_registers_proto_init() } func file_pkg_sentry_arch_registers_proto_init() { if File_pkg_sentry_arch_registers_proto != nil { return } if 
!protoimpl.UnsafeEnabled { file_pkg_sentry_arch_registers_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*AMD64Registers); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_arch_registers_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*ARM64Registers); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_arch_registers_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Registers); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } file_pkg_sentry_arch_registers_proto_msgTypes[2].OneofWrappers = []interface{}{ (*Registers_Amd64)(nil), (*Registers_Arm64)(nil), } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_sentry_arch_registers_proto_rawDesc, NumEnums: 0, NumMessages: 3, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_sentry_arch_registers_proto_goTypes, DependencyIndexes: file_pkg_sentry_arch_registers_proto_depIdxs, MessageInfos: file_pkg_sentry_arch_registers_proto_msgTypes, }.Build() File_pkg_sentry_arch_registers_proto = out.File file_pkg_sentry_arch_registers_proto_rawDesc = nil file_pkg_sentry_arch_registers_proto_goTypes = nil file_pkg_sentry_arch_registers_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/signal_amd64.go000066400000000000000000000222321465435605700244650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package arch import ( "math" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/usermem" ) // SignalContext64 is equivalent to struct sigcontext, the type passed as the // second argument to signal handlers set by signal(2). // // +marshal type SignalContext64 struct { R8 uint64 R9 uint64 R10 uint64 R11 uint64 R12 uint64 R13 uint64 R14 uint64 R15 uint64 Rdi uint64 Rsi uint64 Rbp uint64 Rbx uint64 Rdx uint64 Rax uint64 Rcx uint64 Rsp uint64 Rip uint64 Eflags uint64 Cs uint16 Gs uint16 // always 0 on amd64. Fs uint16 // always 0 on amd64. Ss uint16 // only restored if _UC_STRICT_RESTORE_SS (unsupported). Err uint64 Trapno uint64 Oldmask linux.SignalSet Cr2 uint64 // Pointer to a struct _fpstate. Fpstate uint64 Reserved [8]uint64 } // Flags for UContext64.Flags. const ( _UC_FP_XSTATE = 1 _UC_SIGCONTEXT_SS = 2 _UC_STRICT_RESTORE_SS = 4 ) // UContext64 is equivalent to ucontext_t on 64-bit x86. 
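// As laid out by SignalSetup below, the user signal frame holds, from low to
// high addresses: the restorer return address at %rsp, this UContext64, and
// the siginfo, while the fxsave/xsave floating point area is placed
// separately above the frame and referenced via MContext.Fpstate.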
// // +marshal type UContext64 struct { Flags uint64 Link uint64 Stack linux.SignalStack MContext SignalContext64 Sigset linux.SignalSet } // SignalSetup implements Context.SignalSetup. (Compare to Linux's // arch/x86/kernel/signal.c:__setup_rt_frame().) func (c *Context64) SignalSetup(st *Stack, act *linux.SigAction, info *linux.SignalInfo, alt *linux.SignalStack, sigset linux.SignalSet, featureSet cpuid.FeatureSet) error { // "The 128-byte area beyond the location pointed to by %rsp is considered // to be reserved and shall not be modified by signal or interrupt // handlers. ... leaf functions may use this area for their entire stack // frame, rather than adjusting the stack pointer in the prologue and // epilogue." - AMD64 ABI // // (But this doesn't apply if we're starting at the top of the signal // stack, in which case there is no following stack frame.) sp := st.Bottom if !(alt.IsEnabled() && sp == alt.Top()) { sp -= 128 } // Allocate space for floating point state on the stack. _, fpAlign := featureSet.ExtendedStateSize() fpState := c.fpState.Slice() fpSize := len(fpState) + fpu.FP_XSTATE_MAGIC2_SIZE fpStart := (sp - hostarch.Addr(fpSize)) & ^hostarch.Addr(fpAlign-1) // Construct the UContext64 now since we need its size. uc := &UContext64{ // No _UC_STRICT_RESTORE_SS: we don't allow SS changes. Flags: _UC_SIGCONTEXT_SS, Stack: *alt, MContext: SignalContext64{ R8: c.Regs.R8, R9: c.Regs.R9, R10: c.Regs.R10, R11: c.Regs.R11, R12: c.Regs.R12, R13: c.Regs.R13, R14: c.Regs.R14, R15: c.Regs.R15, Rdi: c.Regs.Rdi, Rsi: c.Regs.Rsi, Rbp: c.Regs.Rbp, Rbx: c.Regs.Rbx, Rdx: c.Regs.Rdx, Rax: c.Regs.Rax, Rcx: c.Regs.Rcx, Rsp: c.Regs.Rsp, Rip: c.Regs.Rip, Eflags: c.Regs.Eflags, Cs: uint16(c.Regs.Cs), Ss: uint16(c.Regs.Ss), Oldmask: sigset, Fpstate: uint64(fpStart), }, Sigset: sigset, } if featureSet.UseXsave() { uc.Flags |= _UC_FP_XSTATE } // TODO(gvisor.dev/issue/159): Set SignalContext64.Err, Trapno, and Cr2 // based on the fault that caused the signal. For now, leave Err and // Trapno unset and assume CR2 == info.Addr() for SIGSEGVs and // SIGBUSes. if linux.Signal(info.Signo) == linux.SIGSEGV || linux.Signal(info.Signo) == linux.SIGBUS { uc.MContext.Cr2 = info.Addr() } // "... the value (%rsp+8) is always a multiple of 16 (...) when // control is transferred to the function entry point." - AMD64 ABI ucSize := uc.SizeBytes() // st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128. frameSize := int(st.Arch.Width()) + ucSize + 128 frameStart := (fpStart-hostarch.Addr(frameSize)) & ^hostarch.Addr(15) - 8 frameEnd := frameStart + hostarch.Addr(frameSize) // Prior to proceeding, figure out if the frame will exhaust the range // for the signal stack. This is not allowed, and should immediately // force signal delivery (reverting to the default handler). if act.Flags&linux.SA_ONSTACK != 0 && alt.IsEnabled() && !alt.Contains(frameStart) { return unix.EFAULT } // Set up floating point state on the stack. Compare Linux's // arch/x86/kernel/fpu/signal.c:copy_fpstate_to_sigframe(). 
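	// The region written at fpStart is: the 512-byte legacy FXSAVE area
	// (whose last 48 bytes, at FP_SW_FRAME_OFFSET, are overwritten by the
	// FPSoftwareFrame below), then any extended xsave state past byte 512,
	// and finally the 4-byte FP_XSTATE_MAGIC2 marker, fpSize bytes in all.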
if _, err := st.IO.CopyOut(context.Background(), fpStart, fpState[:fpu.FP_SW_FRAME_OFFSET], usermem.IOOpts{}); err != nil { return err } fpsw := fpu.FPSoftwareFrame{ Magic1: fpu.FP_XSTATE_MAGIC1, ExtendedSize: uint32(fpSize), Xfeatures: fpu.XFEATURE_MASK_FPSSE | featureSet.ValidXCR0Mask(), XstateSize: uint32(fpSize) - fpu.FP_XSTATE_MAGIC2_SIZE, } st.Bottom = fpStart + 512 if _, err := fpsw.CopyOut(st, StackBottomMagic); err != nil { return err } if len(fpState) > 512 { if _, err := st.IO.CopyOut(context.Background(), fpStart+512, fpState[512:], usermem.IOOpts{}); err != nil { return err } } st.Bottom = fpStart + hostarch.Addr(fpSize) if _, err := primitive.CopyUint32Out(st, StackBottomMagic, fpu.FP_XSTATE_MAGIC2); err != nil { return err } // Adjust the code. info.FixSignalCodeForUser() // Set up the stack frame. st.Bottom = frameEnd if _, err := info.CopyOut(st, StackBottomMagic); err != nil { return err } infoAddr := st.Bottom if _, err := uc.CopyOut(st, StackBottomMagic); err != nil { return err } ucAddr := st.Bottom if act.Flags&linux.SA_RESTORER != 0 { // Push the restorer return address. // Note that this doesn't need to be popped. if _, err := primitive.CopyUint64Out(st, StackBottomMagic, act.Restorer); err != nil { return err } } else { // amd64 requires a restorer. return unix.EFAULT } // Set up registers. c.Regs.Rip = act.Handler c.Regs.Rsp = uint64(st.Bottom) c.Regs.Rdi = uint64(info.Signo) c.Regs.Rsi = uint64(infoAddr) c.Regs.Rdx = uint64(ucAddr) c.Regs.Rax = 0 c.Regs.Eflags &^= eflagsDF | eflagsRF | eflagsTF c.Regs.Ds = userDS c.Regs.Es = userDS c.Regs.Cs = userCS c.Regs.Ss = userDS // Clear floating point registers. c.fpState.Reset() return nil } // SignalRestore implements Context.SignalRestore. (Compare to Linux's // arch/x86/kernel/signal.c:sys_rt_sigreturn().) func (c *Context64) SignalRestore(st *Stack, rt bool, featureSet cpuid.FeatureSet) (linux.SignalSet, linux.SignalStack, error) { // Copy out the stack frame. var uc UContext64 if _, err := uc.CopyIn(st, StackBottomMagic); err != nil { return 0, linux.SignalStack{}, err } var info linux.SignalInfo if _, err := info.CopyIn(st, StackBottomMagic); err != nil { return 0, linux.SignalStack{}, err } // Restore registers. c.Regs.R8 = uc.MContext.R8 c.Regs.R9 = uc.MContext.R9 c.Regs.R10 = uc.MContext.R10 c.Regs.R11 = uc.MContext.R11 c.Regs.R12 = uc.MContext.R12 c.Regs.R13 = uc.MContext.R13 c.Regs.R14 = uc.MContext.R14 c.Regs.R15 = uc.MContext.R15 c.Regs.Rdi = uc.MContext.Rdi c.Regs.Rsi = uc.MContext.Rsi c.Regs.Rbp = uc.MContext.Rbp c.Regs.Rbx = uc.MContext.Rbx c.Regs.Rdx = uc.MContext.Rdx c.Regs.Rax = uc.MContext.Rax c.Regs.Rcx = uc.MContext.Rcx c.Regs.Rsp = uc.MContext.Rsp c.Regs.Rip = uc.MContext.Rip c.Regs.Eflags = (c.Regs.Eflags & ^eflagsRestorable) | (uc.MContext.Eflags & eflagsRestorable) c.Regs.Cs = uint64(uc.MContext.Cs) | 3 // N.B. _UC_STRICT_RESTORE_SS not supported. c.Regs.Orig_rax = math.MaxUint64 // Restore floating point state. Compare Linux's // arch/x86/kernel/fpu/signal.c:fpu__restore_sig(). 
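	// When the frame carries an FP area, first read back the FPSoftwareFrame
	// stored at FP_SW_FRAME_OFFSET and sanity-check it: a bad Magic1, or an
	// XstateSize outside [FXSAVE_AREA_SIZE, ExtendedSize], indicates a
	// corrupt or truncated frame and is rejected with EFAULT.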
if uc.MContext.Fpstate == 0 { c.fpState.Reset() } else { fpsw := fpu.FPSoftwareFrame{} st.Bottom = hostarch.Addr(uc.MContext.Fpstate + fpu.FP_SW_FRAME_OFFSET) if _, err := fpsw.CopyIn(st, StackBottomMagic); err != nil { c.fpState.Reset() return 0, linux.SignalStack{}, err } if fpsw.Magic1 != fpu.FP_XSTATE_MAGIC1 || fpsw.XstateSize < fpu.FXSAVE_AREA_SIZE || fpsw.XstateSize > fpsw.ExtendedSize { c.fpState.Reset() return 0, linux.SignalStack{}, linuxerr.EFAULT } fpState := c.fpState.Slice() fpSize := fpsw.XstateSize if int(fpSize) < len(fpState) { // The signal frame FPU state is smaller than expected. This can happen after S/R. c.fpState.Reset() fpState = fpState[:fpSize] } if _, err := st.IO.CopyIn(context.Background(), hostarch.Addr(uc.MContext.Fpstate), fpState, usermem.IOOpts{}); err != nil { c.fpState.Reset() return 0, linux.SignalStack{}, err } c.fpState.SanitizeUser(featureSet) } return uc.Sigset, uc.Stack, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/signal_arm64.go000066400000000000000000000134231465435605700245050ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package arch import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" ) // SignalContext64 is equivalent to struct sigcontext, the type passed as the // second argument to signal handlers set by signal(2). // // +marshal type SignalContext64 struct { FaultAddr uint64 Regs [31]uint64 Sp uint64 Pc uint64 Pstate uint64 _pad [8]byte // __attribute__((__aligned__(16))) Fpsimd64 FpsimdContext // size = 528 } // +marshal type aarch64Ctx struct { Magic uint32 Size uint32 } // FpsimdContext is equivalent to struct fpsimd_context on arm64 // (arch/arm64/include/uapi/asm/sigcontext.h). // // +marshal type FpsimdContext struct { Head aarch64Ctx Fpsr uint32 Fpcr uint32 Vregs [64]uint64 // actually [32]uint128 } // UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h). // // +marshal type UContext64 struct { Flags uint64 Link uint64 Stack linux.SignalStack Sigset linux.SignalSet // glibc uses a 1024-bit sigset_t _pad [120]byte // (1024 - 64) / 8 = 120 // sigcontext must be aligned to 16-byte _pad2 [8]byte // last for future expansion MContext SignalContext64 } // SignalSetup implements Context.SignalSetup. func (c *Context64) SignalSetup(st *Stack, act *linux.SigAction, info *linux.SignalInfo, alt *linux.SignalStack, sigset linux.SignalSet, featureSet cpuid.FeatureSet) error { sp := st.Bottom // Construct the UContext64 now since we need its size. 
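	// Note that, unlike the amd64 path, the frame built below does not
	// serialize the FP/SIMD registers onto the user stack: the current
	// fpState is instead pushed onto c.sigFPState and the handler starts
	// with a fresh fpu.NewState(), to be popped again in SignalRestore.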
uc := &UContext64{ Flags: 0, Stack: *alt, MContext: SignalContext64{ Regs: c.Regs.Regs, Sp: c.Regs.Sp, Pc: c.Regs.Pc, Pstate: c.Regs.Pstate, }, Sigset: sigset, } if linux.Signal(info.Signo) == linux.SIGSEGV || linux.Signal(info.Signo) == linux.SIGBUS { uc.MContext.FaultAddr = info.Addr() } ucSize := uc.SizeBytes() // frameSize = ucSize + sizeof(siginfo). // sizeof(siginfo) == 128. // R30 stores the restorer address. frameSize := ucSize + 128 frameBottom := (sp - hostarch.Addr(frameSize)) & ^hostarch.Addr(15) sp = frameBottom + hostarch.Addr(frameSize) st.Bottom = sp // Prior to proceeding, figure out if the frame will exhaust the range // for the signal stack. This is not allowed, and should immediately // force signal delivery (reverting to the default handler). if act.Flags&linux.SA_ONSTACK != 0 && alt.IsEnabled() && !alt.Contains(frameBottom) { return unix.EFAULT } // Adjust the code. info.FixSignalCodeForUser() // Set up the stack frame. if _, err := info.CopyOut(st, StackBottomMagic); err != nil { return err } infoAddr := st.Bottom if _, err := uc.CopyOut(st, StackBottomMagic); err != nil { return err } ucAddr := st.Bottom // Set up registers. c.Regs.Sp = uint64(st.Bottom) c.Regs.Pc = act.Handler c.Regs.Regs[0] = uint64(info.Signo) c.Regs.Regs[1] = uint64(infoAddr) c.Regs.Regs[2] = uint64(ucAddr) c.Regs.Regs[30] = act.Restorer // Save the thread's floating point state. c.sigFPState = append(c.sigFPState, c.fpState) // Signal handler gets a clean floating point state. c.fpState = fpu.NewState() return nil } // SPSR_ELx bits which are always architecturally RES0 per ARM DDI 0487D.a. const _SPSR_EL1_AARCH64_RES0_BITS = uint64(0xffffffff0cdfe020) func (regs *Registers) userMode() bool { return (regs.Pstate & linux.PSR_MODE_MASK) == linux.PSR_MODE_EL0t } func (regs *Registers) validRegs() bool { regs.Pstate &= ^_SPSR_EL1_AARCH64_RES0_BITS if regs.userMode() && (regs.Pstate&linux.PSR_MODE32_BIT) == 0 && (regs.Pstate&linux.PSR_D_BIT) == 0 && (regs.Pstate&linux.PSR_A_BIT) == 0 && (regs.Pstate&linux.PSR_I_BIT) == 0 && (regs.Pstate&linux.PSR_F_BIT) == 0 { return true } // Force PSR to a valid 64-bit EL0t regs.Pstate &= linux.PSR_N_BIT | linux.PSR_Z_BIT | linux.PSR_C_BIT | linux.PSR_V_BIT return false } // SignalRestore implements Context.SignalRestore. func (c *Context64) SignalRestore(st *Stack, rt bool, featureSet cpuid.FeatureSet) (linux.SignalSet, linux.SignalStack, error) { // Copy out the stack frame. var uc UContext64 if _, err := uc.CopyIn(st, StackBottomMagic); err != nil { return 0, linux.SignalStack{}, err } var info linux.SignalInfo if _, err := info.CopyIn(st, StackBottomMagic); err != nil { return 0, linux.SignalStack{}, err } // Restore registers. c.Regs.Regs = uc.MContext.Regs c.Regs.Pc = uc.MContext.Pc c.Regs.Sp = uc.MContext.Sp c.Regs.Pstate = uc.MContext.Pstate if !c.Regs.validRegs() { return 0, linux.SignalStack{}, unix.EFAULT } // Restore floating point state. l := len(c.sigFPState) if l > 0 { c.fpState = c.sigFPState[l-1] // NOTE(cl/133042258): State save requires that any slice // elements from '[len:cap]' to be zero value. c.sigFPState[l-1] = nil c.sigFPState = c.sigFPState[0 : l-1] } else { // This might happen if sigreturn(2) calls are unbalanced with // respect to signal handler entries. This is not expected so // don't bother to do anything fancy with the floating point // state. 
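// Instead the mismatch is logged and the sigreturn is rejected with
// EFAULT.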
log.Warningf("sigreturn unable to restore application fpstate") return 0, linux.SignalStack{}, unix.EFAULT } return uc.Sigset, uc.Stack, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/stack.go000066400000000000000000000163321465435605700233260ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package arch import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/usermem" ) // Stack is a simple wrapper around a hostarch.IO and an address. Stack // implements marshal.CopyContext, and marshallable values can be pushed or // popped from the stack through the marshal.Marshallable interface. // // Stack is not thread-safe. type Stack struct { // Our arch info. // We use this for automatic Native conversion of hostarch.Addrs during // Push() and Pop(). Arch *Context64 // The interface used to actually copy user memory. IO usermem.IO // Our current stack bottom. Bottom hostarch.Addr // Scratch buffer used for marshalling to avoid having to repeatedly // allocate scratch memory. scratchBuf []byte } // scratchBufLen is the default length of Stack.scratchBuf. The // largest structs the stack regularly serializes are linux.SignalInfo // and arch.UContext64. We'll set the default size as the larger of // the two, arch.UContext64. var scratchBufLen = (*UContext64)(nil).SizeBytes() // CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. func (s *Stack) CopyScratchBuffer(size int) []byte { if len(s.scratchBuf) < size { s.scratchBuf = make([]byte, size) } return s.scratchBuf[:size] } // StackBottomMagic is the special address callers must past to all stack // marshalling operations to cause the src/dst address to be computed based on // the current end of the stack. const StackBottomMagic = ^hostarch.Addr(0) // hostarch.Addr(-1) // CopyOutBytes implements marshal.CopyContext.CopyOutBytes. CopyOutBytes // computes an appropriate address based on the current end of the // stack. Callers use the sentinel address StackBottomMagic to marshal methods // to indicate this. func (s *Stack) CopyOutBytes(sentinel hostarch.Addr, b []byte) (int, error) { if sentinel != StackBottomMagic { panic("Attempted to copy out to stack with absolute address") } c := len(b) n, err := s.IO.CopyOut(context.Background(), s.Bottom-hostarch.Addr(c), b, usermem.IOOpts{}) if err == nil && n == c { s.Bottom -= hostarch.Addr(n) } return n, err } // CopyInBytes implements marshal.CopyContext.CopyInBytes. CopyInBytes computes // an appropriate address based on the current end of the stack. Callers must // use the sentinel address StackBottomMagic to marshal methods to indicate // this. 
func (s *Stack) CopyInBytes(sentinel hostarch.Addr, b []byte) (int, error) { if sentinel != StackBottomMagic { panic("Attempted to copy in from stack with absolute address") } n, err := s.IO.CopyIn(context.Background(), s.Bottom, b, usermem.IOOpts{}) if err == nil { s.Bottom += hostarch.Addr(n) } return n, err } // Align aligns the stack to the given offset. func (s *Stack) Align(offset int) { if s.Bottom%hostarch.Addr(offset) != 0 { s.Bottom -= (s.Bottom % hostarch.Addr(offset)) } } // PushNullTerminatedByteSlice writes bs to the stack, followed by an extra null // byte at the end. On error, the contents of the stack and the bottom cursor // are undefined. func (s *Stack) PushNullTerminatedByteSlice(bs []byte) (int, error) { // Note: Stack grows up, so write the terminal null byte first. nNull, err := primitive.CopyUint8Out(s, StackBottomMagic, 0) if err != nil { return 0, err } n, err := primitive.CopyByteSliceOut(s, StackBottomMagic, bs) if err != nil { return 0, err } return n + nNull, nil } // StackLayout describes the location of the arguments and environment on the // stack. type StackLayout struct { // ArgvStart is the beginning of the argument vector. ArgvStart hostarch.Addr // ArgvEnd is the end of the argument vector. ArgvEnd hostarch.Addr // EnvvStart is the beginning of the environment vector. EnvvStart hostarch.Addr // EnvvEnd is the end of the environment vector. EnvvEnd hostarch.Addr } // Load pushes the given args, env and aux vector to the stack using the // well-known format for a new executable. It returns the start and end // of the argument and environment vectors. func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error) { l := StackLayout{} // Make sure we start with a 16-byte alignment. s.Align(16) // Push the environment vector so the end of the argument vector is adjacent to // the beginning of the environment vector. // While the System V abi for x86_64 does not specify an ordering to the // Information Block (the block holding the arg, env, and aux vectors), // support features like setproctitle(3) naturally expect these segments // to be in this order. See: https://www.uclibc.org/docs/psABI-x86_64.pdf // page 29. l.EnvvEnd = s.Bottom envAddrs := make([]hostarch.Addr, len(env)) for i := len(env) - 1; i >= 0; i-- { if _, err := s.PushNullTerminatedByteSlice([]byte(env[i])); err != nil { return StackLayout{}, err } envAddrs[i] = s.Bottom } l.EnvvStart = s.Bottom // Push our strings. l.ArgvEnd = s.Bottom argAddrs := make([]hostarch.Addr, len(args)) for i := len(args) - 1; i >= 0; i-- { if _, err := s.PushNullTerminatedByteSlice([]byte(args[i])); err != nil { return StackLayout{}, err } argAddrs[i] = s.Bottom } l.ArgvStart = s.Bottom // We need to align the arguments appropriately. // // We must finish on a 16-byte alignment, but we'll play it // conservatively and finish at 32-bytes. It would be nice to be able // to call Align here, but unfortunately we need to align the stack // with all the variable sized arrays pushed. So we just need to do // some calculations. argvSize := s.Arch.Width() * uint(len(args)+1) envvSize := s.Arch.Width() * uint(len(env)+1) auxvSize := s.Arch.Width() * 2 * uint(len(aux)+1) total := hostarch.Addr(argvSize) + hostarch.Addr(envvSize) + hostarch.Addr(auxvSize) + hostarch.Addr(s.Arch.Width()) expectedBottom := s.Bottom - total if expectedBottom%32 != 0 { s.Bottom -= expectedBottom % 32 } // Push our auxvec. // NOTE: We need an extra zero here per spec. 
// The Push function will automatically terminate // strings and arrays with a single null value. auxv := make([]hostarch.Addr, 0, len(aux)*2+1) for _, a := range aux { auxv = append(auxv, hostarch.Addr(a.Key), a.Value) } auxv = append(auxv, hostarch.Addr(0)) _, err := s.pushAddrSliceAndTerminator(auxv) if err != nil { return StackLayout{}, err } // Push environment. _, err = s.pushAddrSliceAndTerminator(envAddrs) if err != nil { return StackLayout{}, err } // Push args. _, err = s.pushAddrSliceAndTerminator(argAddrs) if err != nil { return StackLayout{}, err } // Push arg count. lenP := s.Arch.Native(uintptr(len(args))) if _, err = lenP.CopyOut(s, StackBottomMagic); err != nil { return StackLayout{}, err } return l, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/stack_unsafe.go000066400000000000000000000034751465435605700246730ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package arch import ( "unsafe" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" ) // pushAddrSliceAndTerminator copies a slices of addresses to the stack, and // also pushes an extra null address element at the end of the slice. // // Internally, we unsafely transmute the slice type from the arch-dependent // []hostarch.Addr type, to a slice of fixed-sized ints so that we can pass it to // go-marshal. // // On error, the contents of the stack and the bottom cursor are undefined. func (s *Stack) pushAddrSliceAndTerminator(src []hostarch.Addr) (int, error) { // Note: Stack grows upwards, so push the terminator first. switch s.Arch.Width() { case 8: nNull, err := primitive.CopyUint64Out(s, StackBottomMagic, 0) if err != nil { return 0, err } srcAsUint64 := *(*[]uint64)(unsafe.Pointer(&src)) n, err := primitive.CopyUint64SliceOut(s, StackBottomMagic, srcAsUint64) return n + nNull, err case 4: nNull, err := primitive.CopyUint32Out(s, StackBottomMagic, 0) if err != nil { return 0, err } srcAsUint32 := *(*[]uint32)(unsafe.Pointer(&src)) n, err := primitive.CopyUint32SliceOut(s, StackBottomMagic, srcAsUint32) return n + nNull, err default: panic("Unsupported arch width") } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/syscalls_amd64.go000066400000000000000000000037431465435605700250530ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
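// Illustrative sketch, not part of the source tree: how the Stack and
// pushAddrSliceAndTerminator machinery above is typically driven when
// setting up a new executable. The memory target m (any usermem.IO, e.g.
// the task's MemoryManager), the context ac and the initial stackTop are
// assumed to come from the surrounding loader code.
//
//	st := &arch.Stack{Arch: ac, IO: m, Bottom: stackTop}
//	layout, err := st.Load(argv, envv, auxv) // argv, envv []string; auxv arch.Auxv
//	if err != nil {
//		return err
//	}
//	// st.Bottom is now the initial SP; layout records where argv/envv landed.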
//go:build amd64 // +build amd64 package arch const restartSyscallNr = uintptr(219) // SyscallSaveOrig save the value of the register which is clobbered in // syscall handler(doSyscall()). // // Noop on x86. func (c *Context64) SyscallSaveOrig() { } // SyscallNo returns the syscall number according to the 64-bit convention. func (c *Context64) SyscallNo() uintptr { return uintptr(c.Regs.Orig_rax) } // SyscallArgs provides syscall arguments according to the 64-bit convention. // // Due to the way addresses are mapped for the sentry this binary *must* be // built in 64-bit mode. So we can just assume the syscall numbers that come // back match the expected host system call numbers. func (c *Context64) SyscallArgs() SyscallArguments { return SyscallArguments{ SyscallArgument{Value: uintptr(c.Regs.Rdi)}, SyscallArgument{Value: uintptr(c.Regs.Rsi)}, SyscallArgument{Value: uintptr(c.Regs.Rdx)}, SyscallArgument{Value: uintptr(c.Regs.R10)}, SyscallArgument{Value: uintptr(c.Regs.R8)}, SyscallArgument{Value: uintptr(c.Regs.R9)}, } } // RestartSyscall implements Context.RestartSyscall. func (c *Context64) RestartSyscall() { c.Regs.Rip -= SyscallWidth c.Regs.Rax = c.Regs.Orig_rax } // RestartSyscallWithRestartBlock implements Context.RestartSyscallWithRestartBlock. func (c *Context64) RestartSyscallWithRestartBlock() { c.Regs.Rip -= SyscallWidth c.Regs.Rax = uint64(restartSyscallNr) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/arch/syscalls_arm64.go000066400000000000000000000061071465435605700250660ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package arch const restartSyscallNr = uintptr(128) // SyscallSaveOrig save the value of the register R0 which is clobbered in // syscall handler(doSyscall()). // // In linux, at the entry of the syscall handler(el0_svc_common()), value of R0 // is saved to the pt_regs.orig_x0 in kernel code. But currently, the orig_x0 // was not accessible to the userspace application, so we have to do the same // operation in the sentry code to save the R0 value into the App context. func (c *Context64) SyscallSaveOrig() { c.OrigR0 = c.Regs.Regs[0] } // SyscallNo returns the syscall number according to the 64-bit convention. func (c *Context64) SyscallNo() uintptr { return uintptr(c.Regs.Regs[8]) } // SyscallArgs provides syscall arguments according to the 64-bit convention. // // Due to the way addresses are mapped for the sentry this binary *must* be // built in 64-bit mode. So we can just assume the syscall numbers that come // back match the expected host system call numbers. // General purpose registers usage on Arm64: // R0...R7: parameter/result registers. // R8: indirect result location register. // R9...R15: temporary registers. // R16: the first intra-procedure-call scratch register. // R17: the second intra-procedure-call scratch register. // R18: the platform register. // R19...R28: callee-saved registers. // R29: the frame pointer. // R30: the link register. 
func (c *Context64) SyscallArgs() SyscallArguments { return SyscallArguments{ SyscallArgument{Value: uintptr(c.OrigR0)}, SyscallArgument{Value: uintptr(c.Regs.Regs[1])}, SyscallArgument{Value: uintptr(c.Regs.Regs[2])}, SyscallArgument{Value: uintptr(c.Regs.Regs[3])}, SyscallArgument{Value: uintptr(c.Regs.Regs[4])}, SyscallArgument{Value: uintptr(c.Regs.Regs[5])}, } } // RestartSyscall implements Context.RestartSyscall. // Prepare for system call restart, OrigR0 will be restored to R0. // Please see the linux code as reference: // arch/arm64/kernel/signal.c:do_signal() func (c *Context64) RestartSyscall() { c.Regs.Pc -= SyscallWidth // R0 will be backed up into OrigR0 when entering doSyscall(). // Please see the linux code as reference: // arch/arm64/kernel/syscall.c:el0_svc_common(). // Here we restore it back. c.Regs.Regs[0] = uint64(c.OrigR0) } // RestartSyscallWithRestartBlock implements Context.RestartSyscallWithRestartBlock. func (c *Context64) RestartSyscallWithRestartBlock() { c.Regs.Pc -= SyscallWidth c.Regs.Regs[0] = uint64(c.OrigR0) c.Regs.Regs[8] = uint64(restartSyscallNr) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/000077500000000000000000000000001465435605700224305ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/cgroups.go000066400000000000000000000100441465435605700244400ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package control import ( "fmt" "strings" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // Cgroups contains the state for cgroupfs related control commands. type Cgroups struct { Kernel *kernel.Kernel } func (c *Cgroups) findCgroup(ctx context.Context, file CgroupControlFile) (kernel.Cgroup, error) { ctl, err := file.controller() if err != nil { return kernel.Cgroup{}, err } return c.Kernel.CgroupRegistry().FindCgroup(ctx, ctl, file.Path) } // CgroupControlFile identifies a specific control file within a // specific cgroup, for the hierarchy with a given controller. type CgroupControlFile struct { Controller string `json:"controller"` Path string `json:"path"` Name string `json:"name"` } func (f *CgroupControlFile) controller() (kernel.CgroupControllerType, error) { return kernel.ParseCgroupController(f.Controller) } // CgroupsResult represents the result of a cgroup operation. type CgroupsResult struct { Data string `json:"value"` IsError bool `json:"is_error"` } // AsError interprets the result as an error. func (r *CgroupsResult) AsError() error { if r.IsError { return fmt.Errorf(r.Data) } return nil } // Unpack splits CgroupsResult into a (value, error) tuple. 
func (r *CgroupsResult) Unpack() (string, error) { if r.IsError { return "", fmt.Errorf(r.Data) } return r.Data, nil } func newValue(val string) CgroupsResult { return CgroupsResult{ Data: strings.TrimSpace(val), } } func newError(err error) CgroupsResult { return CgroupsResult{ Data: err.Error(), IsError: true, } } // CgroupsResults represents the list of results for a batch command. type CgroupsResults struct { Results []CgroupsResult `json:"results"` } func (o *CgroupsResults) appendValue(val string) { o.Results = append(o.Results, newValue(val)) } func (o *CgroupsResults) appendError(err error) { o.Results = append(o.Results, newError(err)) } // CgroupsReadArg represents the arguments for a single read command. type CgroupsReadArg struct { File CgroupControlFile `json:"file"` } // CgroupsReadArgs represents the list of arguments for a batched read command. type CgroupsReadArgs struct { Args []CgroupsReadArg `json:"args"` } // ReadControlFiles is an RPC stub for batch-reading cgroupfs control files. func (c *Cgroups) ReadControlFiles(args *CgroupsReadArgs, out *CgroupsResults) error { ctx := c.Kernel.SupervisorContext() for _, arg := range args.Args { cg, err := c.findCgroup(ctx, arg.File) if err != nil { out.appendError(err) continue } val, err := cg.ReadControl(ctx, arg.File.Name) if err != nil { out.appendError(err) } else { out.appendValue(val) } } return nil } // CgroupsWriteArg represents the arguments for a single write command. type CgroupsWriteArg struct { File CgroupControlFile `json:"file"` Value string `json:"value"` } // CgroupsWriteArgs represents the lust of arguments for a batched write command. type CgroupsWriteArgs struct { Args []CgroupsWriteArg `json:"args"` } // WriteControlFiles is an RPC stub for batch-writing cgroupfs control files. func (c *Cgroups) WriteControlFiles(args *CgroupsWriteArgs, out *CgroupsResults) error { ctx := c.Kernel.SupervisorContext() for _, arg := range args.Args { cg, err := c.findCgroup(ctx, arg.File) if err != nil { out.appendError(err) continue } err = cg.WriteControl(ctx, arg.File.Name, arg.Value) if err != nil { out.appendError(err) } else { out.appendValue("") } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/control.go000066400000000000000000000013701465435605700244400ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package control contains types that expose control server methods, and can // be used to configure and interact with a running sandbox process. package control golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/control_go_proto/000077500000000000000000000000001465435605700260205ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/control_go_proto/control.pb.go000066400000000000000000000355031465435605700304350ustar00rootroot00000000000000// Code generated by protoc-gen-go. DO NOT EDIT. 
// versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/sentry/control/control.proto package control_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" timestamppb "google.golang.org/protobuf/types/known/timestamppb" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type ControlConfig_Endpoint int32 const ( ControlConfig_UNKNOWN ControlConfig_Endpoint = 0 ControlConfig_EVENTS ControlConfig_Endpoint = 1 ControlConfig_FS ControlConfig_Endpoint = 2 ControlConfig_LIFECYCLE ControlConfig_Endpoint = 3 ControlConfig_LOGGING ControlConfig_Endpoint = 4 ControlConfig_PROFILE ControlConfig_Endpoint = 5 ControlConfig_USAGE ControlConfig_Endpoint = 6 ControlConfig_PROC ControlConfig_Endpoint = 7 ControlConfig_STATE ControlConfig_Endpoint = 8 ControlConfig_DEBUG ControlConfig_Endpoint = 9 ControlConfig_CGROUPS ControlConfig_Endpoint = 10 ) // Enum value maps for ControlConfig_Endpoint. var ( ControlConfig_Endpoint_name = map[int32]string{ 0: "UNKNOWN", 1: "EVENTS", 2: "FS", 3: "LIFECYCLE", 4: "LOGGING", 5: "PROFILE", 6: "USAGE", 7: "PROC", 8: "STATE", 9: "DEBUG", 10: "CGROUPS", } ControlConfig_Endpoint_value = map[string]int32{ "UNKNOWN": 0, "EVENTS": 1, "FS": 2, "LIFECYCLE": 3, "LOGGING": 4, "PROFILE": 5, "USAGE": 6, "PROC": 7, "STATE": 8, "DEBUG": 9, "CGROUPS": 10, } ) func (x ControlConfig_Endpoint) Enum() *ControlConfig_Endpoint { p := new(ControlConfig_Endpoint) *p = x return p } func (x ControlConfig_Endpoint) String() string { return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) } func (ControlConfig_Endpoint) Descriptor() protoreflect.EnumDescriptor { return file_pkg_sentry_control_control_proto_enumTypes[0].Descriptor() } func (ControlConfig_Endpoint) Type() protoreflect.EnumType { return &file_pkg_sentry_control_control_proto_enumTypes[0] } func (x ControlConfig_Endpoint) Number() protoreflect.EnumNumber { return protoreflect.EnumNumber(x) } // Deprecated: Use ControlConfig_Endpoint.Descriptor instead. func (ControlConfig_Endpoint) EnumDescriptor() ([]byte, []int) { return file_pkg_sentry_control_control_proto_rawDescGZIP(), []int{0, 0} } type ControlConfig struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields AllowedControls []ControlConfig_Endpoint `protobuf:"varint,1,rep,packed,name=allowed_controls,json=allowedControls,proto3,enum=gvisor.ControlConfig_Endpoint" json:"allowed_controls,omitempty"` } func (x *ControlConfig) Reset() { *x = ControlConfig{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_control_control_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *ControlConfig) String() string { return protoimpl.X.MessageStringOf(x) } func (*ControlConfig) ProtoMessage() {} func (x *ControlConfig) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_control_control_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use ControlConfig.ProtoReflect.Descriptor instead. 
func (*ControlConfig) Descriptor() ([]byte, []int) { return file_pkg_sentry_control_control_proto_rawDescGZIP(), []int{0} } func (x *ControlConfig) GetAllowedControls() []ControlConfig_Endpoint { if x != nil { return x.AllowedControls } return nil } type ContainerStartedEvent struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Started bool `protobuf:"varint,1,opt,name=started,proto3" json:"started,omitempty"` ContainerId string `protobuf:"bytes,2,opt,name=container_id,json=containerId,proto3" json:"container_id,omitempty"` RequestReceived *timestamppb.Timestamp `protobuf:"bytes,3,opt,name=request_received,json=requestReceived,proto3" json:"request_received,omitempty"` RequestCompleted *timestamppb.Timestamp `protobuf:"bytes,4,opt,name=request_completed,json=requestCompleted,proto3" json:"request_completed,omitempty"` } func (x *ContainerStartedEvent) Reset() { *x = ContainerStartedEvent{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_control_control_proto_msgTypes[1] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *ContainerStartedEvent) String() string { return protoimpl.X.MessageStringOf(x) } func (*ContainerStartedEvent) ProtoMessage() {} func (x *ContainerStartedEvent) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_control_control_proto_msgTypes[1] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use ContainerStartedEvent.ProtoReflect.Descriptor instead. func (*ContainerStartedEvent) Descriptor() ([]byte, []int) { return file_pkg_sentry_control_control_proto_rawDescGZIP(), []int{1} } func (x *ContainerStartedEvent) GetStarted() bool { if x != nil { return x.Started } return false } func (x *ContainerStartedEvent) GetContainerId() string { if x != nil { return x.ContainerId } return "" } func (x *ContainerStartedEvent) GetRequestReceived() *timestamppb.Timestamp { if x != nil { return x.RequestReceived } return nil } func (x *ContainerStartedEvent) GetRequestCompleted() *timestamppb.Timestamp { if x != nil { return x.RequestCompleted } return nil } type ContainerExitEvent struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContainerId string `protobuf:"bytes,1,opt,name=container_id,json=containerId,proto3" json:"container_id,omitempty"` ExitStatus uint32 `protobuf:"varint,2,opt,name=exit_status,json=exitStatus,proto3" json:"exit_status,omitempty"` } func (x *ContainerExitEvent) Reset() { *x = ContainerExitEvent{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_control_control_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *ContainerExitEvent) String() string { return protoimpl.X.MessageStringOf(x) } func (*ContainerExitEvent) ProtoMessage() {} func (x *ContainerExitEvent) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_control_control_proto_msgTypes[2] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use ContainerExitEvent.ProtoReflect.Descriptor instead. 
func (*ContainerExitEvent) Descriptor() ([]byte, []int) { return file_pkg_sentry_control_control_proto_rawDescGZIP(), []int{2} } func (x *ContainerExitEvent) GetContainerId() string { if x != nil { return x.ContainerId } return "" } func (x *ContainerExitEvent) GetExitStatus() uint32 { if x != nil { return x.ExitStatus } return 0 } var File_pkg_sentry_control_control_proto protoreflect.FileDescriptor var file_pkg_sentry_control_control_proto_rawDesc = []byte{ 0x0a, 0x20, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x06, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x1a, 0x1f, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2f, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xe9, 0x01, 0x0a, 0x0d, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x12, 0x49, 0x0a, 0x10, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, 0x64, 0x5f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0e, 0x32, 0x1e, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x45, 0x6e, 0x64, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x52, 0x0f, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x73, 0x22, 0x8c, 0x01, 0x0a, 0x08, 0x45, 0x6e, 0x64, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x12, 0x0b, 0x0a, 0x07, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x0a, 0x0a, 0x06, 0x45, 0x56, 0x45, 0x4e, 0x54, 0x53, 0x10, 0x01, 0x12, 0x06, 0x0a, 0x02, 0x46, 0x53, 0x10, 0x02, 0x12, 0x0d, 0x0a, 0x09, 0x4c, 0x49, 0x46, 0x45, 0x43, 0x59, 0x43, 0x4c, 0x45, 0x10, 0x03, 0x12, 0x0b, 0x0a, 0x07, 0x4c, 0x4f, 0x47, 0x47, 0x49, 0x4e, 0x47, 0x10, 0x04, 0x12, 0x0b, 0x0a, 0x07, 0x50, 0x52, 0x4f, 0x46, 0x49, 0x4c, 0x45, 0x10, 0x05, 0x12, 0x09, 0x0a, 0x05, 0x55, 0x53, 0x41, 0x47, 0x45, 0x10, 0x06, 0x12, 0x08, 0x0a, 0x04, 0x50, 0x52, 0x4f, 0x43, 0x10, 0x07, 0x12, 0x09, 0x0a, 0x05, 0x53, 0x54, 0x41, 0x54, 0x45, 0x10, 0x08, 0x12, 0x09, 0x0a, 0x05, 0x44, 0x45, 0x42, 0x55, 0x47, 0x10, 0x09, 0x12, 0x0b, 0x0a, 0x07, 0x43, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x53, 0x10, 0x0a, 0x22, 0xe4, 0x01, 0x0a, 0x15, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x12, 0x18, 0x0a, 0x07, 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x49, 0x64, 0x12, 0x45, 0x0a, 0x10, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x5f, 0x72, 0x65, 0x63, 0x65, 0x69, 0x76, 0x65, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x0f, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x52, 0x65, 0x63, 0x65, 0x69, 0x76, 0x65, 0x64, 0x12, 0x47, 0x0a, 0x11, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x5f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 
0x54, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x10, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x22, 0x58, 0x0a, 0x12, 0x43, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x45, 0x78, 0x69, 0x74, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x5f, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x49, 0x64, 0x12, 0x1f, 0x0a, 0x0b, 0x65, 0x78, 0x69, 0x74, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0a, 0x65, 0x78, 0x69, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_sentry_control_control_proto_rawDescOnce sync.Once file_pkg_sentry_control_control_proto_rawDescData = file_pkg_sentry_control_control_proto_rawDesc ) func file_pkg_sentry_control_control_proto_rawDescGZIP() []byte { file_pkg_sentry_control_control_proto_rawDescOnce.Do(func() { file_pkg_sentry_control_control_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_sentry_control_control_proto_rawDescData) }) return file_pkg_sentry_control_control_proto_rawDescData } var file_pkg_sentry_control_control_proto_enumTypes = make([]protoimpl.EnumInfo, 1) var file_pkg_sentry_control_control_proto_msgTypes = make([]protoimpl.MessageInfo, 3) var file_pkg_sentry_control_control_proto_goTypes = []interface{}{ (ControlConfig_Endpoint)(0), // 0: gvisor.ControlConfig.Endpoint (*ControlConfig)(nil), // 1: gvisor.ControlConfig (*ContainerStartedEvent)(nil), // 2: gvisor.ContainerStartedEvent (*ContainerExitEvent)(nil), // 3: gvisor.ContainerExitEvent (*timestamppb.Timestamp)(nil), // 4: google.protobuf.Timestamp } var file_pkg_sentry_control_control_proto_depIdxs = []int32{ 0, // 0: gvisor.ControlConfig.allowed_controls:type_name -> gvisor.ControlConfig.Endpoint 4, // 1: gvisor.ContainerStartedEvent.request_received:type_name -> google.protobuf.Timestamp 4, // 2: gvisor.ContainerStartedEvent.request_completed:type_name -> google.protobuf.Timestamp 3, // [3:3] is the sub-list for method output_type 3, // [3:3] is the sub-list for method input_type 3, // [3:3] is the sub-list for extension type_name 3, // [3:3] is the sub-list for extension extendee 0, // [0:3] is the sub-list for field type_name } func init() { file_pkg_sentry_control_control_proto_init() } func file_pkg_sentry_control_control_proto_init() { if File_pkg_sentry_control_control_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_sentry_control_control_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*ControlConfig); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_control_control_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*ContainerStartedEvent); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_control_control_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*ContainerExitEvent); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_sentry_control_control_proto_rawDesc, NumEnums: 1, NumMessages: 3, NumExtensions: 
0, NumServices: 0, }, GoTypes: file_pkg_sentry_control_control_proto_goTypes, DependencyIndexes: file_pkg_sentry_control_control_proto_depIdxs, EnumInfos: file_pkg_sentry_control_control_proto_enumTypes, MessageInfos: file_pkg_sentry_control_control_proto_msgTypes, }.Build() File_pkg_sentry_control_control_proto = out.File file_pkg_sentry_control_control_proto_rawDesc = nil file_pkg_sentry_control_control_proto_goTypes = nil file_pkg_sentry_control_control_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/control_state_autogen.go000066400000000000000000000000711465435605700273570ustar00rootroot00000000000000// automatically generated by stateify. package control golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/events.go000066400000000000000000000035061465435605700242670ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package control import ( "errors" "fmt" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/urpc" ) // EventsOpts are the arguments for eventchannel-related commands. type EventsOpts struct { urpc.FilePayload } // Events is the control server state for eventchannel-related commands. type Events struct { emitter eventchannel.Emitter } // AttachDebugEmitter receives a connected unix domain socket FD from the client // and establishes it as a new emitter for the sentry eventchannel. Any existing // emitters are replaced on a subsequent attach. func (e *Events) AttachDebugEmitter(o *EventsOpts, _ *struct{}) error { if len(o.FilePayload.Files) < 1 { return errors.New("no output writer provided") } sock, err := o.ReleaseFD(0) if err != nil { return err } sockFD := sock.Release() // SocketEmitter takes ownership of sockFD. emitter, err := eventchannel.SocketEmitter(sockFD) if err != nil { return fmt.Errorf("failed to create SocketEmitter for FD %d: %v", sockFD, err) } // If there is already a debug emitter, close the old one. if e.emitter != nil { e.emitter.Close() } e.emitter = eventchannel.DebugEmitterFrom(emitter) // Register the new stream destination. eventchannel.AddEmitter(e.emitter) return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/fs.go000066400000000000000000000050331465435605700233700ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
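// Illustrative sketch (assumed caller, not part of the source tree): the
// Events.AttachDebugEmitter control in events.go above expects one end of
// a connected unix domain socket to be donated through the urpc
// FilePayload; the sentry takes ownership of the descriptor.
//
//	e := &control.Events{}
//	opts := &control.EventsOpts{
//		FilePayload: urpc.FilePayload{Files: []*os.File{sockFile}},
//	}
//	if err := e.AttachDebugEmitter(opts, nil); err != nil {
//		return err
//	}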
package control import ( "fmt" "io" "os" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/pkg/usermem" ) // CatOpts contains options for the Cat RPC call. type CatOpts struct { // Files are the filesystem paths for the files to cat. Files []string `json:"files"` // FilePayload contains the destination for output. urpc.FilePayload } // Fs includes fs-related functions. type Fs struct { Kernel *kernel.Kernel } // Cat is a RPC stub which prints out and returns the content of the files. func (f *Fs) Cat(o *CatOpts, _ *struct{}) error { // Create an output stream. if len(o.FilePayload.Files) != 1 { return ErrInvalidFiles } output := o.FilePayload.Files[0] for _, file := range o.Files { if err := cat(f.Kernel, file, output); err != nil { return fmt.Errorf("cannot read from file %s: %v", file, err) } } return nil } // fdReader provides an io.Reader interface for a vfs.FileDescription. type fdReader struct { ctx context.Context fd *vfs.FileDescription } // Read implements io.Reader.Read. func (f *fdReader) Read(p []byte) (int, error) { n, err := f.fd.Read(f.ctx, usermem.BytesIOSequence(p), vfs.ReadOptions{}) return int(n), err } func cat(k *kernel.Kernel, path string, output *os.File) error { ctx := k.SupervisorContext() creds := auth.NewRootCredentials(k.RootUserNamespace()) mns := k.GlobalInit().Leader().MountNamespace() root := mns.Root(ctx) defer root.DecRef(ctx) fd, err := k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(path), }, &vfs.OpenOptions{ Flags: linux.O_RDONLY, }) if err != nil { return fmt.Errorf("failed to open file %s: %v", path, err) } defer fd.DecRef(ctx) _, err = io.Copy(output, &fdReader{ctx: ctx, fd: fd}) return err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/lifecycle.go000066400000000000000000000373161465435605700247300ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package control import ( "encoding/json" "fmt" "time" "google.golang.org/protobuf/types/known/timestamppb" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" pb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto" "gvisor.dev/gvisor/pkg/sentry/fdimport" "gvisor.dev/gvisor/pkg/sentry/fsimpl/user" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/urpc" ) // Lifecycle provides functions related to starting and stopping tasks. type Lifecycle struct { // Kernel is the kernel where the tasks belong to. Kernel *kernel.Kernel // ShutdownCh is the channel used to signal the sentry to shutdown // the sentry/sandbox. 
ShutdownCh chan struct{} // mu protects the fields below. mu sync.RWMutex // MountNamespacesMap is a map of container id/names and the mount // namespaces. MountNamespacesMap map[string]*vfs.MountNamespace // containerMap is a map of the container id and the container. containerMap map[string]*Container } // containerState is the state of the container. type containerState int const ( // stateCreated is the state when the container was created. It is the // initial state. stateCreated containerState = iota // stateRunning is the state when the container/application is running. stateRunning // stateStopped is the state when the container has exited. stateStopped ) // Container contains the set of parameters to represent a container. type Container struct { // containerID. containerID string // tg is the init(PID 1) threadgroup of the container. tg *kernel.ThreadGroup // state is the current state of the container. state containerState } // StartContainerArgs is the set of arguments to start a container. type StartContainerArgs struct { // Filename is the filename to load. // // If this is provided as "", then the file will be guessed via Argv[0]. Filename string `json:"filename"` // Argv is a list of arguments. Argv []string `json:"argv"` // Envv is a list of environment variables. Envv []string `json:"envv"` // Secret_envv is a list of secret environment variables. // // NOTE: This field must never be logged! SecretEnvv []string `json:"secret_envv"` // WorkingDirectory defines the working directory for the new process. WorkingDirectory string `json:"wd"` // KUID is the UID to run with in the root user namespace. Defaults to // root if not set explicitly. KUID auth.KUID `json:"KUID"` // KGID is the GID to run with in the root user namespace. Defaults to // the root group if not set explicitly. KGID auth.KGID `json:"KGID"` // User is the user string used to retrieve UID/GID. User string `json:"user"` // ContainerID is the container for the process being executed. ContainerID string `json:"container_id"` // InitialCgroups is the set of cgroup controllers container needs to be initialised to. InitialCgroups map[kernel.CgroupControllerType]string `json:"initial_cgroups"` // Limits is the limit set for the process being executed. Limits map[string]limits.Limit `json:"limits"` // If HOME environment variable is not provided, and this flag is set, // then the HOME environment variable will be set inside the container // based on the user's home directory in /etc/passwd. ResolveHome bool `json:"resolve_home"` // If set, attempt to resolve the binary_path via the following procedure: // 1) If binary_path is absolute, it is used directly. // 2) If binary_path contains a slash, then it is resolved relative to the // working_directory (or the root it working_directory is not set). // 3) Otherwise, search the PATH environment variable for the first directory // that contains an executable file with name in binary_path. ResolveBinaryPath bool `json:"resolve_binary_path"` // DonatedFDs is the list of sentry-intrenal file descriptors that will // donated. They correspond to the donated files in FilePayload. DonatedFDs []int `json:"donated_fds"` // FilePayload determines the files to give to the new process. urpc.FilePayload } // String formats the StartContainerArgs without the SecretEnvv field. 
func (sca StartContainerArgs) String() string { sca.SecretEnvv = make([]string, len(sca.SecretEnvv)) for i := range sca.SecretEnvv { sca.SecretEnvv[i] = "(hidden)" } b, err := json.Marshal(sca) if err != nil { return fmt.Sprintf("error marshaling: %s", err) } return string(b) } func (l *Lifecycle) updateContainerState(containerID string, newState containerState) error { l.mu.Lock() defer l.mu.Unlock() c, ok := l.containerMap[containerID] if !ok { return fmt.Errorf("container %v not started", containerID) } switch newState { case stateCreated: // Impossible. panic(fmt.Sprintf("invalid state transition: %v => %v", c.state, newState)) case stateRunning: if c.state != stateCreated { // Impossible. panic(fmt.Sprintf("invalid state transition: %v => %v", c.state, newState)) } case stateStopped: // Valid state transition. default: // Invalid new state. panic(fmt.Sprintf("invalid new state: %v", newState)) } c.state = newState return nil } // StartContainer will start a new container in the sandbox. func (l *Lifecycle) StartContainer(args *StartContainerArgs, _ *uint32) error { timeRequested := time.Now() timeRequestReceived := ×tamppb.Timestamp{ Seconds: timeRequested.Unix(), Nanos: int32(timeRequested.Nanosecond()), } log.Infof("StartContainer: %v", args) if len(args.Files) != len(args.DonatedFDs) { return fmt.Errorf("FilePayload.Files and DonatedFDs must have same number of elements (%d != %d)", len(args.Files), len(args.DonatedFDs)) } l.mu.RLock() mntns, ok := l.MountNamespacesMap[args.ContainerID] if !ok { l.mu.RUnlock() return fmt.Errorf("mount namespace is nil for %s", args.ContainerID) } l.mu.RUnlock() uid := args.KUID gid := args.KGID if args.User != "" { if uid != 0 || gid != 0 { return fmt.Errorf("container spec specified both an explicit UID/GID and a user name, only one or the other may be provided") } var err error uid, gid, err = user.GetExecUIDGIDFromUser(l.Kernel.SupervisorContext(), mntns, args.User) if err != nil { return fmt.Errorf("couldn't retrieve UID and GID for user %v, err: %v", args.User, err) } } creds := auth.NewUserCredentials( uid, gid, nil, /* extraKGIDs */ nil, /* capabilities */ l.Kernel.RootUserNamespace()) ls, err := limits.NewLinuxDistroLimitSet() if err != nil { return fmt.Errorf("error creating default limit set: %w", err) } for name, limit := range args.Limits { lt, ok := limits.FromLinuxResourceName[name] if !ok { return fmt.Errorf("unknown limit %q", name) } ls.SetUnchecked(lt, limit) } // Create a new pid namespace for the container. Each container must run // in its own pid namespace. pidNs := l.Kernel.RootPIDNamespace().NewChild(l.Kernel.RootUserNamespace()) initArgs := kernel.CreateProcessArgs{ Filename: args.Filename, Argv: args.Argv, // Order Envv before SecretEnvv. Envv: append(args.Envv, args.SecretEnvv...), WorkingDirectory: args.WorkingDirectory, Credentials: creds, Umask: 0022, Limits: ls, MaxSymlinkTraversals: linux.MaxSymlinkTraversals, UTSNamespace: l.Kernel.RootUTSNamespace(), IPCNamespace: l.Kernel.RootIPCNamespace(), ContainerID: args.ContainerID, PIDNamespace: pidNs, } ctx := initArgs.NewContext(l.Kernel) // Import file descriptors. 
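// The donated host files in args.Files are wrapped into sentry-owned
// fd.FDs and imported into a fresh FDTable at the descriptor numbers
// requested in args.DonatedFDs, so the container's init process finds
// them at the FD numbers it expects.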
fdTable := l.Kernel.NewFDTable() defer fdTable.DecRef(ctx) hostFDs, err := fd.NewFromFiles(args.Files) if err != nil { return fmt.Errorf("error donating host files: %w", err) } defer func() { for _, hfd := range hostFDs { _ = hfd.Close() } }() fdMap := make(map[int]*fd.FD, len(args.DonatedFDs)) for i, appFD := range args.DonatedFDs { fdMap[appFD] = hostFDs[i] } // Use ContainerID since containers don't have names here. if _, err := fdimport.Import(ctx, fdTable, false, args.KUID, args.KGID, fdMap, initArgs.ContainerID); err != nil { return fmt.Errorf("error importing host files: %w", err) } initArgs.FDTable = fdTable initArgs.MountNamespace = mntns initArgs.MountNamespace.IncRef() if args.ResolveBinaryPath { resolved, err := user.ResolveExecutablePath(ctx, &initArgs) if err != nil { return fmt.Errorf("failed to resolve binary path: %w", err) } initArgs.Filename = resolved } if args.ResolveHome { envVars, err := user.MaybeAddExecUserHome(ctx, initArgs.MountNamespace, creds.RealKUID, initArgs.Envv) if err != nil { return fmt.Errorf("failed to get user home dir: %w", err) } initArgs.Envv = envVars } fds, err := fd.NewFromFiles(args.Files) if err != nil { return fmt.Errorf("duplicating payload files: %w", err) } defer func() { for _, fd := range fds { _ = fd.Close() } }() initialCgroups := make(map[kernel.Cgroup]struct{}, len(args.InitialCgroups)) cgroupRegistry := l.Kernel.CgroupRegistry() // path is relative to the container's cgroup controller of specified type. for initialCgroupController, path := range args.InitialCgroups { cg, err := cgroupRegistry.FindCgroup(ctx, initialCgroupController, path) if err != nil { return fmt.Errorf("FindCgroup can't locate cgroup controller: %v err: %v", initialCgroupController, err) } initialCgroups[cg] = struct{}{} } initArgs.InitialCgroups = initialCgroups tg, _, err := l.Kernel.CreateProcess(initArgs) if err != nil { return err } c := &Container{ containerID: initArgs.ContainerID, tg: tg, state: stateCreated, } l.mu.Lock() if l.containerMap == nil { l.containerMap = make(map[string]*Container) } if _, ok := l.containerMap[initArgs.ContainerID]; ok { l.mu.Unlock() return fmt.Errorf("container id: %v already exists", initArgs.ContainerID) } l.containerMap[initArgs.ContainerID] = c l.mu.Unlock() // Start the newly created process. l.Kernel.StartProcess(tg) log.Infof("Started the new container %v ", initArgs.ContainerID) if err := l.updateContainerState(initArgs.ContainerID, stateRunning); err != nil { // Sanity check: shouldn't fail to update the state at this point. panic(fmt.Sprintf("Failed to set running state: %v", err)) } timeRequestCompleted := time.Now() eventchannel.LogEmit(&pb.ContainerStartedEvent{ Started: true, ContainerId: initArgs.ContainerID, RequestReceived: timeRequestReceived, RequestCompleted: ×tamppb.Timestamp{ Seconds: timeRequestCompleted.Unix(), Nanos: int32(timeRequestCompleted.Nanosecond()), }, }) // TODO(b/251490950): reap thread needs to synchronize with Save, so the // container state update doesn't race with state serialization. go l.reap(initArgs.ContainerID, tg) // S/R-SAFE: see above. return nil } func (l *Lifecycle) reap(containerID string, tg *kernel.ThreadGroup) { tg.WaitExited() if err := l.updateContainerState(containerID, stateStopped); err != nil { panic(err) } eventchannel.LogEmit(&pb.ContainerExitEvent{ ContainerId: containerID, ExitStatus: uint32(tg.ExitStatus()), }) } // Shutdown sends signal to destroy the sentry/sandbox. 
func (l *Lifecycle) Shutdown(_, _ *struct{}) error { close(l.ShutdownCh) return nil } func (l *Lifecycle) getInitContainerProcess(containerID string) (*kernel.ThreadGroup, error) { l.mu.Lock() defer l.mu.Unlock() c, ok := l.containerMap[containerID] if !ok { return nil, fmt.Errorf("container %v not started", containerID) } return c.tg, nil } // ContainerArgs is the set of arguments for container related APIs after // starting the container. type ContainerArgs struct { ContainerID string `json:"container_id"` } // GetExitStatus returns the container exit status if it has stopped. func (l *Lifecycle) GetExitStatus(args *ContainerArgs, status *uint32) error { l.mu.Lock() defer l.mu.Unlock() c, ok := l.containerMap[args.ContainerID] if !ok { return fmt.Errorf("container %q doesn't exist, or has not been started", args.ContainerID) } if c.state != stateStopped { return fmt.Errorf("container %q hasn't exited yet", args.ContainerID) } *status = uint32(c.tg.ExitStatus()) eventchannel.LogEmit(&pb.ContainerExitEvent{ ContainerId: args.ContainerID, ExitStatus: *status, }) return nil } // Reap notifies the sandbox that the caller is interested in the exit status via // an exit event. The caller is responsible for handling any corresponding exit // events, especially if they're interested in waiting for the exit. func (l *Lifecycle) Reap(args *ContainerArgs, _ *struct{}) error { // Check if there are any real emitters registered. If there are no // emitters, the caller will never be notified, so fail immediately. if !eventchannel.HaveEmitters() { return fmt.Errorf("no event emitters configured") } l.mu.Lock() c, ok := l.containerMap[args.ContainerID] if !ok { l.mu.Unlock() return fmt.Errorf("no container with id %q", args.ContainerID) } // Once a container enters the stop state, the state never changes. It's // safe to cache a stopped state outside a l.mu critical section. isStopped := c.state == stateStopped l.mu.Unlock() if isStopped { // Already stopped, emit stop to ensure any callbacks registered after // the actual stop is called. This may be a duplicate event, but is // necessary in case the reap goroutine transitions the container to the // stop state before the caller starts observing the event channel. eventchannel.LogEmit(&pb.ContainerExitEvent{ ContainerId: args.ContainerID, ExitStatus: uint32(c.tg.ExitStatus()), }) } // Caller now responsible for blocking on the exit event. return nil } // IsContainerRunning returns true if the container is running. func (l *Lifecycle) IsContainerRunning(args *ContainerArgs, isRunning *bool) error { l.mu.Lock() defer l.mu.Unlock() c, ok := l.containerMap[args.ContainerID] // We may be racing with the reaper goroutine updating c.state, so also // check the number non-exited tasks. if !ok || c.state != stateRunning || c.tg.Count() == 0 { return nil } *isRunning = true return nil } // SignalContainerArgs is the set of arguments for signalling a container. type SignalContainerArgs struct { ContainerID string `json:"container_id"` Signo int32 `json:"signo"` SignalAll bool `json:"signalAll"` } // SignalContainer signals the container in multi-container mode. It returns error if the // container hasn't started or has exited. 
func (l *Lifecycle) SignalContainer(args *SignalContainerArgs, _ *struct{}) error { tg, err := l.getInitContainerProcess(args.ContainerID) if err != nil { return err } l.mu.Lock() c, ok := l.containerMap[args.ContainerID] if !ok || c.state != stateRunning { l.mu.Unlock() return fmt.Errorf("%v container not running", args.ContainerID) } l.mu.Unlock() // Signalling a single process is supported only for the init process. if !args.SignalAll { if tg == nil { return fmt.Errorf("no process exists in %v", tg) } return l.Kernel.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: args.Signo}) } l.Kernel.Pause() defer l.Kernel.Unpause() return l.Kernel.SendContainerSignal(args.ContainerID, &linux.SignalInfo{Signo: args.Signo}) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/logging.go000066400000000000000000000100121465435605700243770ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package control import ( "fmt" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/strace" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" ) // LoggingArgs are the arguments to use for changing the logging // level and strace list. type LoggingArgs struct { // SetLevel is a flag used to indicate that we should update // the logging level. We should be able to change the strace // list without affecting the logging level and vice versa. SetLevel bool // Level is the log level that will be set if SetLevel is true. Level log.Level // SetLogPackets indicates that we should update the log packets flag. SetLogPackets bool // LogPackets is the actual value to set for LogPackets. // SetLogPackets must be enabled to indicate that we're changing // the value. LogPackets bool // SetStrace is a flag used to indicate that strace related // arguments were passed in. SetStrace bool // EnableStrace is a flag from the CLI that specifies whether to // enable strace at all. If this flag is false then a completely // pristine copy of the syscall table will be swapped in. This // approach is used to remain consistent with an empty strace // allowlist meaning trace all system calls. EnableStrace bool // Strace is the allowlist of syscalls to trace to log. If this // and StraceEventAllowlist are empty trace all system calls. StraceAllowlist []string // SetEventStrace is a flag used to indicate that event strace // related arguments were passed in. SetEventStrace bool // StraceEventAllowlist is the allowlist of syscalls to trace // to event log. StraceEventAllowlist []string } // Logging provides functions related to logging. type Logging struct{} // Change will change the log level and strace arguments. Although // this functions signature requires an error it never actually // returns an error. It's required by the URPC interface. // Additionally, it may look odd that this is the only method // attached to an empty struct but this is also part of how // URPC dispatches. 
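//
// As an illustration, a request that raises the log level to debug, enables
// packet logging, and traces a couple of syscalls to the log sink would look
// roughly like this (a sketch; the particular syscall names are assumptions):
//
//	LoggingArgs{
//		SetLevel:        true,
//		Level:           log.Debug,
//		SetLogPackets:   true,
//		LogPackets:      true,
//		SetStrace:       true,
//		EnableStrace:    true,
//		StraceAllowlist: []string{"openat", "execve"},
//	}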
func (l *Logging) Change(args *LoggingArgs, code *int) error { if args.SetLevel { // Logging uses an atomic for the level so this is thread safe. log.SetLevel(args.Level) } if args.SetLogPackets { if args.LogPackets { sniffer.LogPackets.Store(1) } else { sniffer.LogPackets.Store(0) } log.Infof("LogPackets set to: %v", sniffer.LogPackets.Load()) } if args.SetStrace { if err := l.configureStrace(args); err != nil { return fmt.Errorf("error configuring strace: %v", err) } } if args.SetEventStrace { if err := l.configureEventStrace(args); err != nil { return fmt.Errorf("error configuring event strace: %v", err) } } return nil } func (l *Logging) configureStrace(args *LoggingArgs) error { if args.EnableStrace { // Install the allowlist specified. if len(args.StraceAllowlist) > 0 { if err := strace.Enable(args.StraceAllowlist, strace.SinkTypeLog); err != nil { return err } } else { // For convenience, if strace is enabled but allowlist // is empty, enable everything to log. strace.EnableAll(strace.SinkTypeLog) } } else { // Uninstall all strace functions. strace.Disable(strace.SinkTypeLog) } return nil } func (l *Logging) configureEventStrace(args *LoggingArgs) error { if len(args.StraceEventAllowlist) > 0 { if err := strace.Enable(args.StraceEventAllowlist, strace.SinkTypeEvent); err != nil { return err } } else { strace.Disable(strace.SinkTypeEvent) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/metrics.go000066400000000000000000000102001465435605700244160ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package control import ( "fmt" "regexp" "gvisor.dev/gvisor/pkg/metric" pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" "gvisor.dev/gvisor/pkg/prometheus" "gvisor.dev/gvisor/pkg/sync" ) // Metrics includes metrics-related RPC stubs. type Metrics struct{} // GetRegisteredMetricsOpts contains metric registration query options. type GetRegisteredMetricsOpts struct{} // MetricsRegistrationResponse contains metric registration data. type MetricsRegistrationResponse struct { RegisteredMetrics *pb.MetricRegistration } // GetRegisteredMetrics sets `out` to the metric registration information. // Meant to be called over the control channel, with `out` as return value. // This should be called during Sentry boot before any container starts. // Metric registration data is used by the processes querying sandbox metrics // to ensure the integrity of metrics exported from the untrusted sandbox. func (u *Metrics) GetRegisteredMetrics(_ *GetRegisteredMetricsOpts, out *MetricsRegistrationResponse) error { registration, err := metric.GetMetricRegistration() if err != nil { return err } out.RegisteredMetrics = registration return nil } // MetricsExportOpts contains metric exporting options. type MetricsExportOpts struct { // If set, this is a regular expression that is used to filter the set of // exported metrics. OnlyMetrics string `json:"only_metrics"` } var ( // lastOnlyMetricsMu protects the variables below. 
lastOnlyMetricsMu sync.Mutex // lastOnlyMetricsStr is the last value of the "only_metrics" parameter passed to // MetricsExport. It is used to avoid re-compiling the regular expression on every // request in the common case where a single metric scraper is scraping the sandbox // metrics using the same filter in each request. lastOnlyMetricsStr string // lastOnlyMetrics is the compiled version of lastOnlyMetricsStr. lastOnlyMetrics *regexp.Regexp ) // filterFunc returns a filter function to filter relevant Prometheus metric names. func (m *MetricsExportOpts) filterFunc() (func(*prometheus.Metric) bool, error) { if m.OnlyMetrics == "" { return nil, nil } lastOnlyMetricsMu.Lock() defer lastOnlyMetricsMu.Unlock() onlyMetricsReg := lastOnlyMetrics if m.OnlyMetrics != lastOnlyMetricsStr { reg, err := regexp.Compile(m.OnlyMetrics) if err != nil { return nil, fmt.Errorf("cannot compile regexp %q: %v", m.OnlyMetrics, err) } lastOnlyMetricsStr = m.OnlyMetrics lastOnlyMetrics = reg onlyMetricsReg = reg } return func(m *prometheus.Metric) bool { return onlyMetricsReg.MatchString(m.Name) }, nil } // Verify verifies that the given exported data is compliant with the export // options. This should be run client-side to double-check results. func (m *MetricsExportOpts) Verify(data *MetricsExportData) error { filterFunc, err := m.filterFunc() if err != nil { return err } if filterFunc != nil && data.Snapshot != nil { for _, data := range data.Snapshot.Data { if !filterFunc(data.Metric) { return fmt.Errorf("metric %v violated the filter set in export options", data.Metric) } } } return nil } // MetricsExportData contains data for all metrics being exported. type MetricsExportData struct { Snapshot *prometheus.Snapshot `json:"snapshot"` } // Export export metrics data into MetricsExportData. func (u *Metrics) Export(opts *MetricsExportOpts, out *MetricsExportData) error { filterFunc, err := opts.filterFunc() if err != nil { return err } snapshot, err := metric.GetSnapshot(metric.SnapshotOptions{ Filter: filterFunc, }) if err != nil { return err } out.Snapshot = snapshot return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/pprof.go000066400000000000000000000166251465435605700241170ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package control import ( "runtime" "runtime/pprof" "runtime/trace" "time" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/urpc" ) const ( // DefaultBlockProfileRate is the default profiling rate for block // profiles. // // The default here is 10%, which will record a stacktrace 10% of the // time when blocking occurs. Since these events should not be super // frequent, we expect this to achieve a reasonable balance between // collecting the data we need and imposing a high performance cost // (e.g. skewing even the CPU profile). DefaultBlockProfileRate = 10 // DefaultMutexProfileRate is the default profiling rate for mutex // profiles. 
Like the block rate above, we use a default rate of 10% // for the same reasons. DefaultMutexProfileRate = 10 ) // Profile includes profile-related RPC stubs. It provides a way to // control the built-in runtime profiling facilities. // // The profile object must be instantied via NewProfile. type Profile struct { // kernel is the kernel under profile. It's immutable. kernel *kernel.Kernel // cpuMu protects CPU profiling. cpuMu sync.Mutex // blockMu protects block profiling. blockMu sync.Mutex // mutexMu protects mutex profiling. mutexMu sync.Mutex // traceMu protects trace profiling. traceMu sync.Mutex // done is closed when profiling is done. done chan struct{} } // NewProfile returns a new Profile object. func NewProfile(k *kernel.Kernel) *Profile { return &Profile{ kernel: k, done: make(chan struct{}), } } // Stop implements urpc.Stopper.Stop. func (p *Profile) Stop() { close(p.done) } // CPUProfileOpts contains options specifically for CPU profiles. type CPUProfileOpts struct { // FilePayload is the destination for the profiling output. urpc.FilePayload // Duration is the duration of the profile. Duration time.Duration `json:"duration"` } // CPU is an RPC stub which collects a CPU profile. func (p *Profile) CPU(o *CPUProfileOpts, _ *struct{}) error { if len(o.FilePayload.Files) < 1 { return nil // Allowed. } output := o.FilePayload.Files[0] defer output.Close() p.cpuMu.Lock() defer p.cpuMu.Unlock() // Returns an error if profiling is already started. if err := pprof.StartCPUProfile(output); err != nil { return err } defer pprof.StopCPUProfile() // Collect the profile. select { case <-time.After(o.Duration): case <-p.done: } return nil } // HeapProfileOpts contains options specifically for heap profiles. type HeapProfileOpts struct { // FilePayload is the destination for the profiling output. urpc.FilePayload // Delay is the sleep time, similar to Duration. This may // not affect the data collected however, as the heap will // continue only the memory associated with the last alloc. Delay time.Duration `json:"delay"` } // Heap generates a heap profile. func (p *Profile) Heap(o *HeapProfileOpts, _ *struct{}) error { if len(o.FilePayload.Files) < 1 { return nil // Allowed. } output := o.FilePayload.Files[0] defer output.Close() // Wait for the given delay. select { case <-time.After(o.Delay): case <-p.done: } // Get up-to-date statistics. runtime.GC() // Write the given profile. return pprof.WriteHeapProfile(output) } // GoroutineProfileOpts contains options specifically for goroutine profiles. type GoroutineProfileOpts struct { // FilePayload is the destination for the profiling output. urpc.FilePayload } // Goroutine dumps out the stack trace for all running goroutines. func (p *Profile) Goroutine(o *GoroutineProfileOpts, _ *struct{}) error { if len(o.FilePayload.Files) < 1 { return nil // Allowed. } output := o.FilePayload.Files[0] defer output.Close() return pprof.Lookup("goroutine").WriteTo(output, 2) } // BlockProfileOpts contains options specifically for block profiles. type BlockProfileOpts struct { // FilePayload is the destination for the profiling output. urpc.FilePayload // Duration is the duration of the profile. Duration time.Duration `json:"duration"` // Rate is the block profile rate. Rate int `json:"rate"` } // Block dumps a blocking profile. func (p *Profile) Block(o *BlockProfileOpts, _ *struct{}) error { if len(o.FilePayload.Files) < 1 { return nil // Allowed. 
} output := o.FilePayload.Files[0] defer output.Close() p.blockMu.Lock() defer p.blockMu.Unlock() // Always set the rate. We then wait to collect a profile at this rate, // and disable when we're done. rate := DefaultBlockProfileRate if o.Rate != 0 { rate = o.Rate } runtime.SetBlockProfileRate(rate) defer runtime.SetBlockProfileRate(0) // Collect the profile. select { case <-time.After(o.Duration): case <-p.done: } return pprof.Lookup("block").WriteTo(output, 0) } // MutexProfileOpts contains options specifically for mutex profiles. type MutexProfileOpts struct { // FilePayload is the destination for the profiling output. urpc.FilePayload // Duration is the duration of the profile. Duration time.Duration `json:"duration"` // Fraction is the mutex profile fraction. Fraction int `json:"fraction"` } // Mutex dumps a mutex profile. func (p *Profile) Mutex(o *MutexProfileOpts, _ *struct{}) error { if len(o.FilePayload.Files) < 1 { return nil // Allowed. } output := o.FilePayload.Files[0] defer output.Close() p.mutexMu.Lock() defer p.mutexMu.Unlock() // Always set the fraction. fraction := DefaultMutexProfileRate if o.Fraction != 0 { fraction = o.Fraction } runtime.SetMutexProfileFraction(fraction) defer runtime.SetMutexProfileFraction(0) // Collect the profile. select { case <-time.After(o.Duration): case <-p.done: } return pprof.Lookup("mutex").WriteTo(output, 0) } // TraceProfileOpts contains options specifically for traces. type TraceProfileOpts struct { // FilePayload is the destination for the profiling output. urpc.FilePayload // Duration is the duration of the profile. Duration time.Duration `json:"duration"` } // Trace is an RPC stub which starts collection of an execution trace. func (p *Profile) Trace(o *TraceProfileOpts, _ *struct{}) error { if len(o.FilePayload.Files) < 1 { return nil // Allowed. } output, err := fd.NewFromFile(o.FilePayload.Files[0]) if err != nil { return err } defer output.Close() p.traceMu.Lock() defer p.traceMu.Unlock() // Returns an error if profiling is already started. if err := trace.Start(output); err != nil { output.Close() return err } defer trace.Stop() // Ensure all trace contexts are registered. p.kernel.RebuildTraceContexts() // Wait for the trace. select { case <-time.After(o.Duration): case <-p.done: } // Similarly to the case above, if tasks have not ended traces, we will // lose information. Thus we need to rebuild the tasks in order to have // complete information. This will not lose information if multiple // traces are overlapping. p.kernel.RebuildTraceContexts() return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/proc.go000066400000000000000000000364771465435605700237430ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package control import ( "bytes" "encoding/json" "fmt" "os" "sort" "strings" "text/tabwriter" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fdimport" "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/fsimpl/user" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/urpc" ) // Proc includes task-related functions. // // At the moment, this is limited to exec support. type Proc struct { Kernel *kernel.Kernel } // FilePayload aids to ensure that payload files and guest file descriptors are // consistent when instantiated through the NewFilePayload helper method. type FilePayload struct { // FilePayload is the file payload that is transferred via RPC. urpc.FilePayload // GuestFDs are the file descriptors in the file descriptor map of the // executed application. They correspond 1:1 to the files in the // urpc.FilePayload. If a program is executed from a host file descriptor, // the file payload may contain one additional file. In that case, the file // used for program execution is the last file in the Files array. GuestFDs []int } // NewFilePayload returns a FilePayload that maps file descriptors to files inside // the executed process and provides a file for execution. func NewFilePayload(fdMap map[int]*os.File, execFile *os.File) FilePayload { fileCount := len(fdMap) if execFile != nil { fileCount++ } files := make([]*os.File, 0, fileCount) guestFDs := make([]int, 0, len(fdMap)) // Make the map iteration order deterministic for the sake of testing. // Otherwise, the order is randomized and tests relying on the comparison // of equality will fail. for key := range fdMap { guestFDs = append(guestFDs, key) } sort.Ints(guestFDs) for _, guestFD := range guestFDs { files = append(files, fdMap[guestFD]) } if execFile != nil { files = append(files, execFile) } return FilePayload{ FilePayload: urpc.FilePayload{Files: files}, GuestFDs: guestFDs, } } // ExecArgs is the set of arguments to exec. type ExecArgs struct { // Filename is the filename to load. // // If this is provided as "", then the file will be guessed via Argv[0]. Filename string `json:"filename"` // Argv is a list of arguments. Argv []string `json:"argv"` // Envv is a list of environment variables. Envv []string `json:"envv"` // MountNamespace is the mount namespace to execute the new process in. // A reference on MountNamespace must be held for the lifetime of the // ExecArgs. If MountNamespace is nil, it will default to the init // process's MountNamespace. MountNamespace *vfs.MountNamespace // WorkingDirectory defines the working directory for the new process. WorkingDirectory string `json:"wd"` // KUID is the UID to run with in the root user namespace. Defaults to // root if not set explicitly. KUID auth.KUID // KGID is the GID to run with in the root user namespace. Defaults to // the root group if not set explicitly. KGID auth.KGID // ExtraKGIDs is the list of additional groups to which the user belongs. ExtraKGIDs []auth.KGID // Capabilities is the list of capabilities to give to the process. Capabilities *auth.TaskCapabilities // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host pty FD. StdioIsPty bool // FilePayload determines the files to give to the new process. 
FilePayload // ContainerID is the container for the process being executed. ContainerID string // PIDNamespace is the pid namespace for the process being executed. PIDNamespace *kernel.PIDNamespace // Limits is the limit set for the process being executed. Limits *limits.LimitSet } // String prints the arguments as a string. func (args *ExecArgs) String() string { if len(args.Argv) == 0 { return args.Filename } a := make([]string, len(args.Argv)) copy(a, args.Argv) if args.Filename != "" { a[0] = args.Filename } return strings.Join(a, " ") } // Exec runs a new task. func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { newTG, _, _, err := proc.execAsync(args) if err != nil { return err } // Wait for completion. newTG.WaitExited() *waitStatus = uint32(newTG.ExitStatus()) return nil } // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined // as a function rather than a method to avoid exposing execAsync as an RPC. func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileDescription, error) { return proc.execAsync(args) } // execAsync runs a new task, but doesn't wait for it to finish. It returns the // newly created thread group and its PID. If the stdio FDs are TTYs, then a // TTYFileOperations that wraps the TTY is also returned. func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileDescription, error) { // Import file descriptors. fdTable := proc.Kernel.NewFDTable() creds := auth.NewUserCredentials( args.KUID, args.KGID, args.ExtraKGIDs, args.Capabilities, proc.Kernel.RootUserNamespace()) pidns := args.PIDNamespace if pidns == nil { pidns = proc.Kernel.RootPIDNamespace() } limitSet := args.Limits if limitSet == nil { limitSet = limits.NewLimitSet() } initArgs := kernel.CreateProcessArgs{ Filename: args.Filename, Argv: args.Argv, Envv: args.Envv, WorkingDirectory: args.WorkingDirectory, MountNamespace: args.MountNamespace, Credentials: creds, FDTable: fdTable, Umask: 0022, Limits: limitSet, MaxSymlinkTraversals: linux.MaxSymlinkTraversals, UTSNamespace: proc.Kernel.RootUTSNamespace(), IPCNamespace: proc.Kernel.RootIPCNamespace(), ContainerID: args.ContainerID, PIDNamespace: pidns, Origin: kernel.OriginExec, } if initArgs.MountNamespace != nil { // initArgs must hold a reference on MountNamespace, which will // be donated to the new process in CreateProcess. initArgs.MountNamespace.IncRef() } ctx := initArgs.NewContext(proc.Kernel) defer fdTable.DecRef(ctx) // Get the full path to the filename from the PATH env variable. if initArgs.MountNamespace == nil { // Set initArgs so that 'ctx' returns the namespace. // // Add a reference to the namespace, which is transferred to the new process. 
initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace() initArgs.MountNamespace.IncRef() } fdMap, execFD, err := args.unpackFiles() if err != nil { return nil, 0, nil, fmt.Errorf("creating fd map: %w", err) } defer func() { for _, hostFD := range fdMap { _ = hostFD.Close() } }() if execFD != nil { if initArgs.Filename != "" { return nil, 0, nil, fmt.Errorf("process must either be started from a file or a filename, not both") } file, err := host.NewFD(ctx, proc.Kernel.HostMount(), execFD.FD(), &host.NewFDOptions{ Readonly: true, Savable: true, VirtualOwner: true, UID: args.KUID, GID: args.KGID, }) if err != nil { return nil, 0, nil, err } defer file.DecRef(ctx) execFD.Release() initArgs.File = file } else { resolved, err := user.ResolveExecutablePath(ctx, &initArgs) if err != nil { return nil, 0, nil, err } initArgs.Filename = resolved } // TODO(gvisor.dev/issue/1956): Container name is not really needed because // exec processes are not restored, but add it for completeness. ttyFile, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, args.KUID, args.KGID, fdMap, "") if err != nil { return nil, 0, nil, err } // Set cgroups to the new exec task if cgroups are mounted. cgroupRegistry := proc.Kernel.CgroupRegistry() initialCgrps := map[kernel.Cgroup]struct{}{} for _, ctrl := range kernel.CgroupCtrls { cg, err := cgroupRegistry.FindCgroup(ctx, ctrl, "/"+args.ContainerID) if err != nil { log.Warningf("cgroup mount for controller %v not found", ctrl) continue } initialCgrps[cg] = struct{}{} } if len(initialCgrps) > 0 { initArgs.InitialCgroups = initialCgrps } tg, tid, err := proc.Kernel.CreateProcess(initArgs) if err != nil { return nil, 0, nil, err } // Set the foreground process group on the TTY before starting the process. if ttyFile != nil { ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) } // Start the newly created process. proc.Kernel.StartProcess(tg) return tg, tid, ttyFile, nil } // PsArgs is the set of arguments to ps. type PsArgs struct { // JSON will force calls to Ps to return the result as a JSON payload. JSON bool } // Ps provides a process listing for the running kernel. func (proc *Proc) Ps(args *PsArgs, out *string) error { var p []*Process if e := Processes(proc.Kernel, "", &p); e != nil { return e } if !args.JSON { *out = ProcessListToTable(p) } else { s, e := ProcessListToJSON(p) if e != nil { return e } *out = s } return nil } // Process contains information about a single process in a Sandbox. type Process struct { UID auth.KUID `json:"uid"` PID kernel.ThreadID `json:"pid"` // Parent PID PPID kernel.ThreadID `json:"ppid"` Threads []kernel.ThreadID `json:"threads"` // Processor utilization C int32 `json:"c"` // TTY name of the process. Will be of the form "pts/N" if there is a // TTY, or "?" if there is not. TTY string `json:"tty"` // Start time STime string `json:"stime"` // CPU time Time string `json:"time"` // Executable shortname (e.g. "sh" for /bin/sh) Cmd string `json:"cmd"` } // ProcessListToTable prints a table with the following format: // UID PID PPID C TTY STIME TIME CMD // 0 1 0 0 pty/4 14:04 505262ns tail func ProcessListToTable(pl []*Process) string { var buf bytes.Buffer tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0) fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD") for _, d := range pl { fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s", d.UID, d.PID, d.PPID, d.C, d.TTY, d.STime, d.Time, d.Cmd) } tw.Flush() return buf.String() } // ProcessListToJSON will return the JSON representation of ps. 
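//
// It is typically combined with Processes, e.g. (a minimal sketch, assuming k
// is the sandbox's *kernel.Kernel):
//
//	var procs []*Process
//	// An empty container ID lists processes from all containers.
//	if err := Processes(k, "", &procs); err != nil {
//		return err
//	}
//	out, err := ProcessListToJSON(procs)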
func ProcessListToJSON(pl []*Process) (string, error) { b, err := json.MarshalIndent(pl, "", " ") if err != nil { return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err) } return string(b), nil } // PrintPIDsJSON prints a JSON object containing only the PIDs in pl. This // behavior is the same as runc's. func PrintPIDsJSON(pl []*Process) (string, error) { pids := make([]kernel.ThreadID, 0, len(pl)) for _, d := range pl { pids = append(pids, d.PID) } b, err := json.Marshal(pids) if err != nil { return "", fmt.Errorf("couldn't marshal PIDs %v: %v", pids, err) } return string(b), nil } // Processes retrieves information about processes running in the sandbox with // the given container id. All processes are returned if 'containerID' is empty. func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { ts := k.TaskSet() now := k.RealtimeClock().Now() pidns := ts.Root for _, tg := range pidns.ThreadGroups() { pid := pidns.IDOfThreadGroup(tg) // If tg has already been reaped ignore it. if pid == 0 { continue } if containerID != "" && containerID != tg.Leader().ContainerID() { continue } ppid := kernel.ThreadID(0) if p := tg.Leader().Parent(); p != nil { ppid = pidns.IDOfThreadGroup(p.ThreadGroup()) } threads := tg.MemberIDs(pidns) *out = append(*out, &Process{ UID: tg.Leader().Credentials().EffectiveKUID, PID: pid, PPID: ppid, Threads: threads, STime: formatStartTime(now, tg.Leader().StartTime()), C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), Time: tg.CPUStats().SysTime.String(), Cmd: tg.Leader().Name(), TTY: ttyName(tg.TTY()), }) } sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID }) return nil } // formatStartTime formats startTime depending on the current time: // - If startTime was today, HH:MM is used. // - If startTime was not today but was this year, MonDD is used (e.g. Jan02) // - If startTime was not this year, the year is used. func formatStartTime(now, startTime ktime.Time) string { nowS, nowNs := now.Unix() n := time.Unix(nowS, nowNs) startTimeS, startTimeNs := startTime.Unix() st := time.Unix(startTimeS, startTimeNs) format := "15:04" if st.YearDay() != n.YearDay() { format = "Jan02" } if st.Year() != n.Year() { format = "2006" } return st.Format(format) } func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 { // Note: In procps, there is an option to include child CPU stats. As // it is disabled by default, we do not include them. total := stats.UserTime + stats.SysTime lifetime := now.Sub(startTime) if lifetime <= 0 { return 0 } percentCPU := total * 100 / lifetime // Cap at 99% since procps does the same. if percentCPU > 99 { percentCPU = 99 } return int32(percentCPU) } func ttyName(tty *kernel.TTY) string { if tty == nil { return "?" } return fmt.Sprintf("pts/%d", tty.Index) } // ContainerUsage retrieves per-container CPU usage. func ContainerUsage(kr *kernel.Kernel) map[string]uint64 { cusage := make(map[string]uint64) for _, tg := range kr.TaskSet().Root.ThreadGroups() { // We want each tg's usage including reaped children. cid := tg.Leader().ContainerID() stats := tg.CPUStats() stats.Accumulate(tg.JoinedChildCPUStats()) cusage[cid] += uint64(stats.UserTime.Nanoseconds()) + uint64(stats.SysTime.Nanoseconds()) } return cusage } // unpackFiles unpacks the file descriptor map and, if applicable, the file // descriptor to be used for execution from the unmarshalled ExecArgs. 
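//
// The payload is expected to have been built with NewFilePayload, so that
// Files[i] corresponds to GuestFDs[i] and, if the program is started from a
// host file descriptor, the executable is appended as one extra trailing
// file. A minimal sketch of the producing side (the *os.File variables are
// assumptions):
//
//	payload := NewFilePayload(map[int]*os.File{
//		0: stdin,
//		1: stdout,
//		2: stderr,
//	}, execFile) // Pass nil instead of execFile when ExecArgs.Filename is set.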
func (args *ExecArgs) unpackFiles() (map[int]*fd.FD, *fd.FD, error) { var execFD *fd.FD var err error // If there is one additional file, the last file is used for program // execution. if len(args.Files) == len(args.GuestFDs)+1 { execFD, err = fd.NewFromFile(args.Files[len(args.Files)-1]) if err != nil { return nil, nil, fmt.Errorf("duplicating exec file: %w", err) } } else if len(args.Files) != len(args.GuestFDs) { return nil, nil, fmt.Errorf("length of payload files does not match length of file descriptor array") } // GuestFDs are the indexes of our FD map. fdMap := make(map[int]*fd.FD, len(args.GuestFDs)) for i, appFD := range args.GuestFDs { file := args.Files[i] if appFD < 0 { return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater") } hostFD, err := fd.NewFromFile(file) if err != nil { return nil, nil, fmt.Errorf("duplicating payload files: %w", err) } fdMap[appFD] = hostFD } return fdMap, execFD, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/state.go000066400000000000000000000062731465435605700241070ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package control import ( "errors" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/state" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/urpc" ) // ErrInvalidFiles is returned when the urpc call to Save does not include an // appropriate file payload (e.g. there is no output file!). var ErrInvalidFiles = errors.New("exactly one file must be provided") // State includes state-related functions. type State struct { Kernel *kernel.Kernel Watchdog *watchdog.Watchdog } // SaveOpts contains options for the Save RPC call. type SaveOpts struct { // Key is used for state integrity check. Key []byte `json:"key"` // Metadata is the set of metadata to prepend to the state file. Metadata map[string]string `json:"metadata"` // MemoryFileSaveOpts is passed to calls to pgalloc.MemoryFile.SaveTo(). MemoryFileSaveOpts pgalloc.SaveOpts // HavePagesFile indicates whether the pages file and its corresponding // metadata file is provided. HavePagesFile bool `json:"have_pages_file"` // FilePayload contains the following: // 1. checkpoint state file. // 2. optional checkpoint pages metadata file. // 3. optional checkpoint pages file. urpc.FilePayload // Resume indicates if the sandbox process should continue running // after checkpointing. Resume bool } // Save saves the running system. func (s *State) Save(o *SaveOpts, _ *struct{}) error { wantFiles := 1 if o.HavePagesFile { wantFiles += 2 } if gotFiles := len(o.FilePayload.Files); gotFiles != wantFiles { return fmt.Errorf("got %d files, wanted %d", gotFiles, wantFiles) } // Save to the first provided stream. 
stateFile, err := o.ReleaseFD(0) if err != nil { return err } defer stateFile.Close() saveOpts := state.SaveOpts{ Destination: stateFile, Key: o.Key, Metadata: o.Metadata, MemoryFileSaveOpts: o.MemoryFileSaveOpts, Callback: func(err error) { if err == nil { log.Infof("Save succeeded: exiting...") s.Kernel.SetSaveSuccess(false /* autosave */) } else { log.Warningf("Save failed: %v", err) s.Kernel.SetSaveError(err) } if !o.Resume { s.Kernel.Kill(linux.WaitStatusExit(0)) } }, } if o.HavePagesFile { saveOpts.PagesMetadata, err = o.ReleaseFD(1) if err != nil { return err } defer saveOpts.PagesMetadata.Close() saveOpts.PagesFile, err = o.ReleaseFD(2) if err != nil { return err } defer saveOpts.PagesFile.Close() } return saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/control/usage.go000066400000000000000000000147741465435605700241000ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package control import ( "encoding/json" "fmt" "os" "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/urpc" ) // Usage includes usage-related RPC stubs. type Usage struct { Kernel *kernel.Kernel } // MemoryUsageOpts contains usage options. type MemoryUsageOpts struct { // Full indicates that a full accounting should be done. If Full is not // specified, then a partial accounting will be done, and Unknown will // contain a majority of memory. See Collect for more information. Full bool `json:"Full"` } // MemoryUsage is a memory usage structure. type MemoryUsage struct { Unknown uint64 `json:"Unknown"` System uint64 `json:"System"` Anonymous uint64 `json:"Anonymous"` PageCache uint64 `json:"PageCache"` Mapped uint64 `json:"Mapped"` Tmpfs uint64 `json:"Tmpfs"` Ramdiskfs uint64 `json:"Ramdiskfs"` Total uint64 `json:"Total"` } // MemoryUsageFileOpts contains usage file options. type MemoryUsageFileOpts struct { // Version is used to ensure both sides agree on the format of the // shared memory buffer. Version uint64 `json:"Version"` } // MemoryUsageFile contains the file handle to the usage file. type MemoryUsageFile struct { urpc.FilePayload } // UsageFD returns the file that tracks the memory usage of the application. func (u *Usage) UsageFD(opts *MemoryUsageFileOpts, out *MemoryUsageFile) error { // Only support version 1 for now. if opts.Version != 1 { return fmt.Errorf("unsupported version requested: %d", opts.Version) } mf := u.Kernel.MemoryFile() *out = MemoryUsageFile{ FilePayload: urpc.FilePayload{ Files: []*os.File{ usage.MemoryAccounting.File, mf.File(), }, }, } return nil } // Collect returns memory used by the sandboxed application. func (u *Usage) Collect(opts *MemoryUsageOpts, out *MemoryUsage) error { if opts.Full { // Ensure everything is up to date. if err := u.Kernel.MemoryFile().UpdateUsage(nil); err != nil { return err } // Copy out a snapshot. 
snapshot, total := usage.MemoryAccounting.Copy() *out = MemoryUsage{ System: snapshot.System, Anonymous: snapshot.Anonymous, PageCache: snapshot.PageCache, Mapped: snapshot.Mapped, Tmpfs: snapshot.Tmpfs, Ramdiskfs: snapshot.Ramdiskfs, Total: total, } } else { // Get total usage from the MemoryFile implementation. total, err := u.Kernel.MemoryFile().TotalUsage() if err != nil { return err } // The memory accounting is guaranteed to be accurate only when // UpdateUsage is called. If UpdateUsage is not called, then only Mapped // will be up-to-date. snapshot, _ := usage.MemoryAccounting.Copy() *out = MemoryUsage{ Unknown: total, Mapped: snapshot.Mapped, Total: total + snapshot.Mapped, } } return nil } // UsageReduceOpts contains options to Usage.Reduce(). type UsageReduceOpts struct { // If Wait is `true`, Reduce blocks until all activity initiated by // Usage.Reduce() has completed. // If Wait is `false`, Go garbage collection is still performed and may // still block for some time, unless `DoNotGC` is `true`. Wait bool `json:"wait"` // If DoNotGC is true, Reduce does not explicitly run Go garbage collection. // Garbage collection may block for an indeterminate amount of time. // Note that the runtime Go may still perform routine garbage collection at // any time during program execution, so a routine GC is still possible even // when this option set to `true`. DoNotGC bool `json:"do_not_gc"` } // UsageReduceOutput contains output from Usage.Reduce(). type UsageReduceOutput struct{} // Reduce requests that the sentry attempt to reduce its memory usage. func (u *Usage) Reduce(opts *UsageReduceOpts, out *UsageReduceOutput) error { mf := u.Kernel.MemoryFile() mf.StartEvictions() if opts.Wait { mf.WaitForEvictions() } if !opts.DoNotGC { runtime.GC() } return nil } // MemoryUsageRecord contains the mapping and platform memory file. type MemoryUsageRecord struct { mmap uintptr stats *usage.RTMemoryStats mf os.File } // NewMemoryUsageRecord creates a new MemoryUsageRecord from usageFile and // platformFile. func NewMemoryUsageRecord(usageFile, platformFile os.File) (*MemoryUsageRecord, error) { mmap, _, e := unix.RawSyscall6(unix.SYS_MMAP, 0, usage.RTMemoryStatsSize, unix.PROT_READ, unix.MAP_SHARED, usageFile.Fd(), 0) if e != 0 { return nil, fmt.Errorf("mmap returned %d, want 0", e) } m := MemoryUsageRecord{ mmap: mmap, stats: usage.RTMemoryStatsPointer(mmap), mf: platformFile, } runtime.SetFinalizer(&m, finalizer) return &m, nil } // GetFileIoStats writes the read times in nanoseconds to out. func (*Usage) GetFileIoStats(_ *struct{}, out *string) error { fileIoStats := struct { // The total amount of time spent reading. The map maps gopher prefixes // to the total time spent reading. Times not included in a known prefix // are placed in the "/" prefix. ReadWait map[string]uint64 `json:"ReadWait"` // The total amount of time spent reading. The map maps gopher prefixes // to the total time spent reading. Times not included in a known prefix // are placed in the "/" prefix. ReadWait9P map[string]uint64 `json:"ReadWait9P"` }{ ReadWait: map[string]uint64{"/": fsmetric.ReadWait.Value()}, ReadWait9P: map[string]uint64{"/": fsmetric.GoferReadWait9P.Value()}, } m, err := json.Marshal(fileIoStats) if err != nil { return err } *out = string(m) return nil } func finalizer(m *MemoryUsageRecord) { unix.RawSyscall(unix.SYS_MUNMAP, m.mmap, usage.RTMemoryStatsSize, 0) } // Fetch fetches the usage info from a MemoryUsageRecord. 
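//
// The record is normally built from the usage and platform memory files handed
// out by Usage.UsageFD, e.g. (an illustrative sketch; usageFile and
// platformFile stand in for the files received over the control channel):
//
//	rec, err := NewMemoryUsageRecord(usageFile, platformFile)
//	if err != nil {
//		return err
//	}
//	mapped, unknown, total, err := rec.Fetch()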
func (m *MemoryUsageRecord) Fetch() (mapped, unknown, total uint64, err error) { var stat unix.Stat_t if err := unix.Fstat(int(m.mf.Fd()), &stat); err != nil { return 0, 0, 0, err } fmem := uint64(stat.Blocks) * 512 rtmapped := m.stats.RTMapped.Load() return rtmapped, fmem, rtmapped + fmem, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/000077500000000000000000000000001465435605700223725ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/accel/000077500000000000000000000000001465435605700234415ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/accel/accel_state_autogen.go000066400000000000000000000110611465435605700277600ustar00rootroot00000000000000// automatically generated by stateify. package accel import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *DevAddrRange) StateTypeName() string { return "pkg/sentry/devices/accel.DevAddrRange" } func (r *DevAddrRange) StateFields() []string { return []string{ "Start", "End", } } func (r *DevAddrRange) beforeSave() {} // +checklocksignore func (r *DevAddrRange) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Start) stateSinkObject.Save(1, &r.End) } func (r *DevAddrRange) afterLoad(context.Context) {} // +checklocksignore func (r *DevAddrRange) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Start) stateSourceObject.Load(1, &r.End) } func (s *DevAddrSet) StateTypeName() string { return "pkg/sentry/devices/accel.DevAddrSet" } func (s *DevAddrSet) StateFields() []string { return []string{ "root", } } func (s *DevAddrSet) beforeSave() {} // +checklocksignore func (s *DevAddrSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []DevAddrFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *DevAddrSet) afterLoad(context.Context) {} // +checklocksignore func (s *DevAddrSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]DevAddrFlatSegment), func(y any) { s.loadRoot(ctx, y.([]DevAddrFlatSegment)) }) } func (n *DevAddrnode) StateTypeName() string { return "pkg/sentry/devices/accel.DevAddrnode" } func (n *DevAddrnode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *DevAddrnode) beforeSave() {} // +checklocksignore func (n *DevAddrnode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *DevAddrnode) afterLoad(context.Context) {} // +checklocksignore func (n *DevAddrnode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (d *DevAddrFlatSegment) StateTypeName() string { return "pkg/sentry/devices/accel.DevAddrFlatSegment" } func (d *DevAddrFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (d *DevAddrFlatSegment) 
beforeSave() {} // +checklocksignore func (d *DevAddrFlatSegment) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.Start) stateSinkObject.Save(1, &d.End) stateSinkObject.Save(2, &d.Value) } func (d *DevAddrFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (d *DevAddrFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.Start) stateSourceObject.Load(1, &d.End) stateSourceObject.Load(2, &d.Value) } func (dev *tpuV4Device) StateTypeName() string { return "pkg/sentry/devices/accel.tpuV4Device" } func (dev *tpuV4Device) StateFields() []string { return []string{ "mu", "minor", "lite", "openWriteFDs", "devAddrSet", "owner", } } func (dev *tpuV4Device) beforeSave() {} // +checklocksignore func (dev *tpuV4Device) StateSave(stateSinkObject state.Sink) { dev.beforeSave() stateSinkObject.Save(0, &dev.mu) stateSinkObject.Save(1, &dev.minor) stateSinkObject.Save(2, &dev.lite) stateSinkObject.Save(3, &dev.openWriteFDs) stateSinkObject.Save(4, &dev.devAddrSet) stateSinkObject.Save(5, &dev.owner) } func (dev *tpuV4Device) afterLoad(context.Context) {} // +checklocksignore func (dev *tpuV4Device) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &dev.mu) stateSourceObject.Load(1, &dev.minor) stateSourceObject.Load(2, &dev.lite) stateSourceObject.Load(3, &dev.openWriteFDs) stateSourceObject.Load(4, &dev.devAddrSet) stateSourceObject.Load(5, &dev.owner) } func init() { state.Register((*DevAddrRange)(nil)) state.Register((*DevAddrSet)(nil)) state.Register((*DevAddrnode)(nil)) state.Register((*DevAddrFlatSegment)(nil)) state.Register((*tpuV4Device)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/accel/devaddr_range.go000066400000000000000000000034131465435605700265560ustar00rootroot00000000000000package accel // A Range represents a contiguous range of T. // // +stateify savable type DevAddrRange struct { // Start is the inclusive start of the range. Start uint64 // End is the exclusive end of the range. End uint64 } // WellFormed returns true if r.Start <= r.End. All other methods on a Range // require that the Range is well-formed. // //go:nosplit func (r DevAddrRange) WellFormed() bool { return r.Start <= r.End } // Length returns the length of the range. // //go:nosplit func (r DevAddrRange) Length() uint64 { return r.End - r.Start } // Contains returns true if r contains x. // //go:nosplit func (r DevAddrRange) Contains(x uint64) bool { return r.Start <= x && x < r.End } // Overlaps returns true if r and r2 overlap. // //go:nosplit func (r DevAddrRange) Overlaps(r2 DevAddrRange) bool { return r.Start < r2.End && r2.Start < r.End } // IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is // contained within r. // //go:nosplit func (r DevAddrRange) IsSupersetOf(r2 DevAddrRange) bool { return r.Start <= r2.Start && r.End >= r2.End } // Intersect returns a range consisting of the intersection between r and r2. // If r and r2 do not overlap, Intersect returns a range with unspecified // bounds, but for which Length() == 0. // //go:nosplit func (r DevAddrRange) Intersect(r2 DevAddrRange) DevAddrRange { if r.Start < r2.Start { r.Start = r2.Start } if r.End > r2.End { r.End = r2.End } if r.End < r.Start { r.End = r.Start } return r } // CanSplitAt returns true if it is legal to split a segment spanning the range // r at x; that is, splitting at x would produce two ranges, both of which have // non-zero length. 
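//
// As a worked example, for r = DevAddrRange{Start: 0x1000, End: 0x3000}:
//
//	r.Length()                                             // 0x2000
//	r.Contains(0x1000)                                     // true: the start is inclusive.
//	r.Contains(0x3000)                                     // false: the end is exclusive.
//	r.Overlaps(DevAddrRange{Start: 0x2000, End: 0x4000})   // true
//	r.Intersect(DevAddrRange{Start: 0x2000, End: 0x4000})  // DevAddrRange{Start: 0x2000, End: 0x3000}
//	r.CanSplitAt(0x2000)                                   // true
//	r.CanSplitAt(0x1000)                                   // false: the left half would be empty.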
// //go:nosplit func (r DevAddrRange) CanSplitAt(x uint64) bool { return r.Contains(x) && r.Start < x } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/accel/devaddr_set.go000066400000000000000000002025741465435605700262660ustar00rootroot00000000000000package accel import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const DevAddrtrackGaps = 0 var _ = uint8(DevAddrtrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type DevAddrdynamicGap [DevAddrtrackGaps]uint64 // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *DevAddrdynamicGap) Get() uint64 { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *DevAddrdynamicGap) Set(v uint64) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. DevAddrminDegree = 3 DevAddrmaxDegree = 2 * DevAddrminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type DevAddrSet struct { root DevAddrnode `state:".([]DevAddrFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *DevAddrSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *DevAddrSet) IsEmptyRange(r DevAddrRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *DevAddrSet) Span() uint64 { var sz uint64 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *DevAddrSet) SpanRange(r DevAddrRange) uint64 { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uint64 for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *DevAddrSet) FirstSegment() DevAddrIterator { if s.root.nrSegments == 0 { return DevAddrIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. 
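//
// Together with FirstSegment and NextSegment this enables the usual forward
// iteration pattern, e.g. (a sketch that sums the pinned ranges, equivalent
// to Span):
//
//	var total uint64
//	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
//		total += seg.Range().Length()
//	}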
func (s *DevAddrSet) LastSegment() DevAddrIterator { if s.root.nrSegments == 0 { return DevAddrIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *DevAddrSet) FirstGap() DevAddrGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return DevAddrGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *DevAddrSet) LastGap() DevAddrGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return DevAddrGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *DevAddrSet) Find(key uint64) (DevAddrIterator, DevAddrGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return DevAddrIterator{n, i}, DevAddrGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return DevAddrIterator{}, DevAddrGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *DevAddrSet) FindSegment(key uint64) DevAddrIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *DevAddrSet) LowerBoundSegment(min uint64) DevAddrIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *DevAddrSet) UpperBoundSegment(max uint64) DevAddrIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *DevAddrSet) FindGap(key uint64) DevAddrGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *DevAddrSet) LowerBoundGap(min uint64) DevAddrGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *DevAddrSet) UpperBoundGap(max uint64) DevAddrGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *DevAddrSet) FirstLargeEnoughGap(minSize uint64) DevAddrGapIterator { if DevAddrtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. 
// // Precondition: trackGaps must be 1. func (s *DevAddrSet) LastLargeEnoughGap(minSize uint64) DevAddrGapIterator { if DevAddrtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *DevAddrSet) LowerBoundLargeEnoughGap(min, minSize uint64) DevAddrGapIterator { if DevAddrtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *DevAddrSet) UpperBoundLargeEnoughGap(max, minSize uint64) DevAddrGapIterator { if DevAddrtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. 
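//
// A typical caller locates the gap covering the target range and then inserts
// into it, e.g. (an illustrative sketch; the range bounds and the zero
// pinnedAccelMem value are placeholders):
//
//	var val pinnedAccelMem // Zero value used purely for illustration.
//	r := DevAddrRange{Start: 0x10000, End: 0x20000}
//	if gap := s.FindGap(r.Start); gap.Ok() && gap.Range().IsSupersetOf(r) {
//		s.Insert(gap, r, val)
//	}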
func (s *DevAddrSet) Insert(gap DevAddrGapIterator, r DevAddrRange, val pinnedAccelMem) DevAddrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (devAddrSetFuncs{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := DevAddrtrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (devAddrSetFuncs{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (devAddrSetFuncs{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := DevAddrtrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *DevAddrSet) InsertWithoutMerging(gap DevAddrGapIterator, r DevAddrRange, val pinnedAccelMem) DevAddrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *DevAddrSet) InsertWithoutMergingUnchecked(gap DevAddrGapIterator, r DevAddrRange, val pinnedAccelMem) DevAddrIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := DevAddrtrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return DevAddrIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. // // InsertRange searches the set to find the gap to insert into. 
If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *DevAddrSet) InsertRange(r DevAddrRange, val pinnedAccelMem) DevAddrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *DevAddrSet) InsertWithoutMergingRange(r DevAddrRange, val pinnedAccelMem) DevAddrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *DevAddrSet) TryInsertRange(r DevAddrRange, val pinnedAccelMem) DevAddrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return DevAddrIterator{} } if gap.End() < r.End { return DevAddrIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. 
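//
// An illustrative usage sketch (not part of the generated code; assumes a
// *DevAddrSet s and a pinnedAccelMem value val are in scope; the range is
// arbitrary):
//
//	r := DevAddrRange{0x4000, 0x6000}
//	if seg := s.TryInsertWithoutMergingRange(r, val); !seg.Ok() {
//		// r overlapped an existing segment; fall back or report an error.
//	}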
func (s *DevAddrSet) TryInsertWithoutMergingRange(r DevAddrRange, val pinnedAccelMem) DevAddrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return DevAddrIterator{} } if gap.End() < r.End { return DevAddrIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *DevAddrSet) Remove(seg DevAddrIterator) DevAddrGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if DevAddrtrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) devAddrSetFuncs{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if DevAddrtrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(DevAddrGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *DevAddrSet) RemoveAll() { s.root = DevAddrnode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *DevAddrSet) RemoveRange(r DevAddrRange) DevAddrGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *DevAddrSet) RemoveFullRange(r DevAddrRange) DevAddrGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *DevAddrSet) Merge(first, second DevAddrIterator) DevAddrIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. Otherwise, MergeUnchecked returns a terminal // iterator. 
// // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *DevAddrSet) MergeUnchecked(first, second DevAddrIterator) DevAddrIterator { if first.End() == second.Start() { if mval, ok := (devAddrSetFuncs{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return DevAddrIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *DevAddrSet) MergePrev(seg DevAddrIterator) DevAddrIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *DevAddrSet) MergeNext(seg DevAddrIterator) DevAddrIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *DevAddrSet) Unisolate(seg DevAddrIterator) DevAddrIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *DevAddrSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. 
// // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *DevAddrSet) MergeInsideRange(r DevAddrRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *DevAddrSet) MergeOutsideRange(r DevAddrRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *DevAddrSet) Split(seg DevAddrIterator, split uint64) (DevAddrIterator, DevAddrIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *DevAddrSet) SplitUnchecked(seg DevAddrIterator, split uint64) (DevAddrIterator, DevAddrIterator) { val1, val2 := (devAddrSetFuncs{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), DevAddrRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: start < seg.End().
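//
// An illustrative mutation-loop sketch showing how SplitBefore and SplitAfter
// combine when iterating in order of increasing keys (not part of the
// generated code; assumes a *DevAddrSet s and a DevAddrRange r are in scope;
// this mirrors what MutateRange does internally):
//
//	seg := s.LowerBoundSegmentSplitBefore(r.Start)
//	for seg.Ok() && seg.Start() < r.End {
//		seg = s.SplitAfter(seg, r.End)
//		// ... mutate *seg.ValuePtr() here ...
//		seg = s.MergePrev(seg)
//		seg = seg.NextSegment()
//	}
//	if seg.Ok() {
//		s.MergePrev(seg)
//	}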
func (s *DevAddrSet) SplitBefore(seg DevAddrIterator, start uint64) DevAddrIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *DevAddrSet) SplitAfter(seg DevAddrIterator, end uint64) DevAddrIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *DevAddrSet) Isolate(seg DevAddrIterator, r DevAddrRange) DevAddrIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *DevAddrSet) LowerBoundSegmentSplitBefore(min uint64) DevAddrIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *DevAddrSet) UpperBoundSegmentSplitAfter(max uint64) DevAddrIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. 
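//
// An illustrative usage sketch (not part of the generated code; assumes a
// *DevAddrSet s is in scope; the range is arbitrary):
//
//	// Sum the lengths of all segments intersecting the range. Segments are
//	// not split, so the totals may include parts lying outside the range.
//	var total uint64
//	s.VisitRange(DevAddrRange{0, 0x100000}, func(seg DevAddrIterator) bool {
//		total += seg.Range().Length()
//		return true // continue iterating
//	})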
func (s *DevAddrSet) VisitRange(r DevAddrRange, f func(seg DevAddrIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *DevAddrSet) VisitFullRange(r DevAddrRange, f func(seg DevAddrIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *DevAddrSet) MutateRange(r DevAddrRange, f func(seg DevAddrIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *DevAddrSet) MutateFullRange(r DevAddrRange, f func(seg DevAddrIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type DevAddrnode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *DevAddrnode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. 
maxGap DevAddrdynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [DevAddrmaxDegree - 1]DevAddrRange values [DevAddrmaxDegree - 1]pinnedAccelMem children [DevAddrmaxDegree]*DevAddrnode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *DevAddrnode) firstSegment() DevAddrIterator { for n.hasChildren { n = n.children[0] } return DevAddrIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *DevAddrnode) lastSegment() DevAddrIterator { for n.hasChildren { n = n.children[n.nrSegments] } return DevAddrIterator{n, n.nrSegments - 1} } func (n *DevAddrnode) prevSibling() *DevAddrnode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *DevAddrnode) nextSibling() *DevAddrnode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *DevAddrnode) rebalanceBeforeInsert(gap DevAddrGapIterator) DevAddrGapIterator { if n.nrSegments < DevAddrmaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &DevAddrnode{ nrSegments: DevAddrminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &DevAddrnode{ nrSegments: DevAddrminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:DevAddrminDegree-1], n.keys[:DevAddrminDegree-1]) copy(left.values[:DevAddrminDegree-1], n.values[:DevAddrminDegree-1]) copy(right.keys[:DevAddrminDegree-1], n.keys[DevAddrminDegree:]) copy(right.values[:DevAddrminDegree-1], n.values[DevAddrminDegree:]) n.keys[0], n.values[0] = n.keys[DevAddrminDegree-1], n.values[DevAddrminDegree-1] DevAddrzeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:DevAddrminDegree], n.children[:DevAddrminDegree]) copy(right.children[:DevAddrminDegree], n.children[DevAddrminDegree:]) DevAddrzeroNodeSlice(n.children[2:]) for i := 0; i < DevAddrminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if DevAddrtrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < DevAddrminDegree { return DevAddrGapIterator{left, gap.index} } return DevAddrGapIterator{right, gap.index - DevAddrminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[DevAddrminDegree-1], n.values[DevAddrminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &DevAddrnode{ nrSegments: DevAddrminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ copy(sibling.keys[:DevAddrminDegree-1], 
n.keys[DevAddrminDegree:]) copy(sibling.values[:DevAddrminDegree-1], n.values[DevAddrminDegree:]) DevAddrzeroValueSlice(n.values[DevAddrminDegree-1:]) if n.hasChildren { copy(sibling.children[:DevAddrminDegree], n.children[DevAddrminDegree:]) DevAddrzeroNodeSlice(n.children[DevAddrminDegree:]) for i := 0; i < DevAddrminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = DevAddrminDegree - 1 if DevAddrtrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < DevAddrminDegree { return gap } return DevAddrGapIterator{sibling, gap.index - DevAddrminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *DevAddrnode) rebalanceAfterRemove(gap DevAddrGapIterator) DevAddrGapIterator { for { if n.nrSegments >= DevAddrminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= DevAddrminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] devAddrSetFuncs{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if DevAddrtrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return DevAddrGapIterator{n, 0} } if gap.node == n { return DevAddrGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= DevAddrminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) devAddrSetFuncs{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if DevAddrtrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return DevAddrGapIterator{n, n.nrSegments} } return DevAddrGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = left.hasChildren p.keys[left.nrSegments] = 
p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return DevAddrGapIterator{p, gap.index} } if gap.node == right { return DevAddrGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *DevAddrnode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = DevAddrGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) devAddrSetFuncs{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if DevAddrtrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *DevAddrnode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *DevAddrnode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. 
func (n *DevAddrnode) calculateMaxGapLeaf() uint64 { max := DevAddrGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (DevAddrGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. func (n *DevAddrnode) calculateMaxGapInternal() uint64 { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *DevAddrnode) searchFirstLargeEnoughGap(minSize uint64) DevAddrGapIterator { if n.maxGap.Get() < minSize { return DevAddrGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := DevAddrGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *DevAddrnode) searchLastLargeEnoughGap(minSize uint64) DevAddrGapIterator { if n.maxGap.Get() < minSize { return DevAddrGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := DevAddrGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type DevAddrIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *DevAddrnode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg DevAddrIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg DevAddrIterator) Range() DevAddrRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg DevAddrIterator) Start() uint64 { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg DevAddrIterator) End() uint64 { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. 
// - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg DevAddrIterator) SetRangeUnchecked(r DevAddrRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. func (seg DevAddrIterator) SetRange(r DevAddrRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg DevAddrIterator) SetStartUnchecked(start uint64) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg DevAddrIterator) SetStart(start uint64) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg DevAddrIterator) SetEndUnchecked(end uint64) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg DevAddrIterator) SetEnd(end uint64) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg DevAddrIterator) Value() pinnedAccelMem { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg DevAddrIterator) ValuePtr() *pinnedAccelMem { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. 
func (seg DevAddrIterator) SetValue(val pinnedAccelMem) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. func (seg DevAddrIterator) PrevSegment() DevAddrIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return DevAddrIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return DevAddrIterator{} } return DevAddrsegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg DevAddrIterator) NextSegment() DevAddrIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return DevAddrIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return DevAddrIterator{} } return DevAddrsegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg DevAddrIterator) PrevGap() DevAddrGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return DevAddrGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg DevAddrIterator) NextGap() DevAddrGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return DevAddrGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg DevAddrIterator) PrevNonEmpty() (DevAddrIterator, DevAddrGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, DevAddrGapIterator{} } return DevAddrIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg DevAddrIterator) NextNonEmpty() (DevAddrIterator, DevAddrGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, DevAddrGapIterator{} } return DevAddrIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. 
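//
// An illustrative usage sketch (not part of the generated code; assumes a
// *DevAddrSet s is in scope; the size is arbitrary): scan gaps in key order to
// find a free range of at least 0x10000 device addresses.
//
//	for gap := s.FirstGap(); gap.Ok(); gap = gap.NextGap() {
//		if gap.Range().Length() >= 0x10000 {
//			_ = gap.Start() // start of a sufficiently large free range
//			break
//		}
//	}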
type DevAddrGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *DevAddrnode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (gap DevAddrGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap DevAddrGapIterator) Range() DevAddrRange { return DevAddrRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap DevAddrGapIterator) Start() uint64 { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return devAddrSetFuncs{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap DevAddrGapIterator) End() uint64 { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return devAddrSetFuncs{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap DevAddrGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap DevAddrGapIterator) PrevSegment() DevAddrIterator { return DevAddrsegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap DevAddrGapIterator) NextSegment() DevAddrIterator { return DevAddrsegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap DevAddrGapIterator) PrevGap() DevAddrGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return DevAddrGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap DevAddrGapIterator) NextGap() DevAddrGapIterator { seg := gap.NextSegment() if !seg.Ok() { return DevAddrGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap DevAddrGapIterator) NextLargeEnoughGap(minSize uint64) DevAddrGapIterator { if DevAddrtrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. 
func (gap DevAddrGapIterator) nextLargeEnoughGapHelper(minSize uint64) DevAddrGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return DevAddrGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap DevAddrGapIterator) PrevLargeEnoughGap(minSize uint64) DevAddrGapIterator { if DevAddrtrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap DevAddrGapIterator) prevLargeEnoughGapHelper(minSize uint64) DevAddrGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return DevAddrGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func DevAddrsegmentBeforePosition(n *DevAddrnode, i int) DevAddrIterator { for i == 0 { if n.parent == nil { return DevAddrIterator{} } n, i = n.parent, n.parentIndex } return DevAddrIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func DevAddrsegmentAfterPosition(n *DevAddrnode, i int) DevAddrIterator { for i == n.nrSegments { if n.parent == nil { return DevAddrIterator{} } n, i = n.parent, n.parentIndex } return DevAddrIterator{n, i} } func DevAddrzeroValueSlice(slice []pinnedAccelMem) { for i := range slice { devAddrSetFuncs{}.ClearValue(&slice[i]) } } func DevAddrzeroNodeSlice(slice []*DevAddrnode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. 
func (s *DevAddrSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. func (n *DevAddrnode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *DevAddrnode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if DevAddrtrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type DevAddrFlatSegment struct { Start uint64 End uint64 Value pinnedAccelMem } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *DevAddrSet) ExportSlice() []DevAddrFlatSegment { var fs []DevAddrFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, DevAddrFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *DevAddrSet) ImportSlice(fs []DevAddrFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := DevAddrRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
func (s *DevAddrSet) segmentTestCheck(expectedSegments int, segFunc func(int, DevAddrRange, pinnedAccelMem) error) error { havePrev := false prev := uint64(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *DevAddrSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *DevAddrSet) saveRoot() []DevAddrFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *DevAddrSet) loadRoot(_ context.Context, fs []DevAddrFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/accel/device.go000066400000000000000000000052621465435605700252340ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package accel import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/devutil" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // tpuV4Device implements vfs.Device for /dev/accel[0-9]+. 
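//
// An illustrative registration sketch (not part of this file; assumes a
// *vfs.VirtualFilesystem vfsObj is in scope; the minor number and lite flag
// are arbitrary):
//
//	if err := RegisterTPUDevice(vfsObj, 0 /* minor */, false /* lite */); err != nil {
//		// handle registration failure
//	}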
// // +stateify savable type tpuV4Device struct { mu sync.Mutex minor uint32 lite bool // +checklocks:mu openWriteFDs uint32 // +checklocks:mu devAddrSet DevAddrSet // +checklocks:mu owner *kernel.ThreadGroup } func (dev *tpuV4Device) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { devClient := devutil.GoferClientFromContext(ctx) if devClient == nil { log.Warningf("devutil.CtxDevGoferClient is not set") return nil, linuxerr.ENOENT } dev.mu.Lock() defer dev.mu.Unlock() name := fmt.Sprintf("accel%d", dev.minor) hostFD, err := devClient.OpenAt(ctx, name, opts.Flags) if err != nil { ctx.Warningf("accelDevice: failed to open device %s: %v", name, err) return nil, err } fd := &tpuV4FD{ hostFD: int32(hostFD), device: dev, } if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { unix.Close(hostFD) return nil, err } if err := fdnotifier.AddFD(int32(hostFD), &fd.queue); err != nil { unix.Close(hostFD) return nil, err } fd.memmapFile.fd = fd if vfs.MayWriteFileWithOpenFlags(opts.Flags) { dev.openWriteFDs++ } if dev.owner == nil { t := kernel.TaskFromContext(ctx) if t == nil { return nil, linuxerr.ESRCH } dev.owner = t.ThreadGroup() } return &fd.vfsfd, nil } // RegisterTPUDevice registers all devices implemented by this package in vfsObj. func RegisterTPUDevice(vfsObj *vfs.VirtualFilesystem, minor uint32, lite bool) error { return vfsObj.RegisterDevice(vfs.CharDevice, linux.ACCEL_MAJOR, minor, &tpuV4Device{ lite: lite, minor: minor, }, &vfs.RegisterDeviceOptions{ GroupName: "accel", }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/accel/gasket.go000066400000000000000000000157531465435605700252610ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
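// Note (illustrative, not part of the original sources): RegisterTPUDevice is
// expected to be called once per accelerator during VFS setup, for example:
//
//	for minor := uint32(0); minor < numAccelDevices; minor++ {
//		if err := accel.RegisterTPUDevice(vfsObj, minor, lite); err != nil {
//			return fmt.Errorf("registering accel%d: %w", minor, err)
//		}
//	}
//
// where numAccelDevices and lite are assumed to come from host device
// discovery.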
package accel import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/gasket" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/abi/tpu" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy" "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" ) func gasketMapBufferIoctl(ctx context.Context, t *kernel.Task, hostFd int32, fd *tpuV4FD, paramsAddr hostarch.Addr) (uintptr, error) { var userIoctlParams gasket.GasketPageTableIoctl if _, err := userIoctlParams.CopyIn(t, paramsAddr); err != nil { return 0, err } numberOfPageTables := tpu.NumberOfTPUV4PageTables if fd.device.lite { numberOfPageTables = tpu.NumberOfTPUV4litePageTables } if userIoctlParams.PageTableIndex >= numberOfPageTables { return 0, linuxerr.EFAULT } tmm := t.MemoryManager() ar, ok := tmm.CheckIORange(hostarch.Addr(userIoctlParams.HostAddress), int64(userIoctlParams.Size)) if !ok { return 0, linuxerr.EFAULT } if !ar.IsPageAligned() || (userIoctlParams.Size/hostarch.PageSize) == 0 { return 0, linuxerr.EINVAL } devAddr := userIoctlParams.DeviceAddress // The kernel driver does not enforce page alignment on the device // address although it will be implicitly rounded down to a page // boundary. We do it explicitly because it simplifies tracking // of allocated ranges in 'devAddrSet'. devAddr &^= (hostarch.PageSize - 1) // Make sure that the device address range can be mapped. devar := DevAddrRange{ devAddr, devAddr + userIoctlParams.Size, } if !devar.WellFormed() { return 0, linuxerr.EINVAL } // Reserve a range in our address space. m, _, errno := unix.RawSyscall6(unix.SYS_MMAP, 0 /* addr */, uintptr(ar.Length()), unix.PROT_NONE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS, ^uintptr(0) /* fd */, 0 /* offset */) if errno != 0 { return 0, errno } cu := cleanup.Make(func() { unix.RawSyscall(unix.SYS_MUNMAP, m, uintptr(ar.Length()), 0) }) defer cu.Clean() // Mirror application mappings into the reserved range. prs, err := t.MemoryManager().Pin(ctx, ar, hostarch.ReadWrite, false /* ignorePermissions */) cu.Add(func() { mm.Unpin(prs) }) if err != nil { return 0, err } sentryAddr := uintptr(m) for _, pr := range prs { ims, err := pr.File.MapInternal(memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}, hostarch.ReadWrite) if err != nil { return 0, err } for !ims.IsEmpty() { im := ims.Head() if _, _, errno := unix.RawSyscall6(unix.SYS_MREMAP, im.Addr(), 0 /* old_size */, uintptr(im.Len()), linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED, sentryAddr, 0); errno != 0 { return 0, errno } sentryAddr += uintptr(im.Len()) ims = ims.Tail() } } sentryIoctlParams := userIoctlParams sentryIoctlParams.HostAddress = uint64(m) n, err := tpuproxy.IOCTLInvokePtrArg[gasket.Ioctl](hostFd, gasket.GASKET_IOCTL_MAP_BUFFER, &sentryIoctlParams) if err != nil { return n, err } cu.Release() // Unmap the reserved range, which is no longer required. 
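// Only the MAP_BUFFER ioctl above needed the contiguous sentry view of the
// pinned pages; the pages themselves remain pinned via prs and are recorded
// in devAddrSet below, so UNMAP_BUFFER or FD release can unpin them later.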
unix.RawSyscall(unix.SYS_MUNMAP, m, uintptr(ar.Length()), 0) fd.device.mu.Lock() defer fd.device.mu.Unlock() for _, pr := range prs { rlen := uint64(pr.Source.Length()) fd.device.devAddrSet.InsertRange(DevAddrRange{ devAddr, devAddr + rlen, }, pinnedAccelMem{pinnedRange: pr, pageTableIndex: userIoctlParams.PageTableIndex}) devAddr += rlen } return n, nil } func gasketUnmapBufferIoctl(ctx context.Context, t *kernel.Task, hostFd int32, fd *tpuV4FD, paramsAddr hostarch.Addr) (uintptr, error) { var userIoctlParams gasket.GasketPageTableIoctl if _, err := userIoctlParams.CopyIn(t, paramsAddr); err != nil { return 0, err } numberOfPageTables := tpu.NumberOfTPUV4PageTables if fd.device.lite { numberOfPageTables = tpu.NumberOfTPUV4litePageTables } if userIoctlParams.PageTableIndex >= numberOfPageTables { return 0, linuxerr.EFAULT } devAddr := userIoctlParams.DeviceAddress devAddr &^= (hostarch.PageSize - 1) devar := DevAddrRange{ devAddr, devAddr + userIoctlParams.Size, } if !devar.WellFormed() { return 0, linuxerr.EINVAL } sentryIoctlParams := userIoctlParams sentryIoctlParams.HostAddress = 0 // clobber this value, it's unused. n, err := tpuproxy.IOCTLInvokePtrArg[gasket.Ioctl](hostFd, gasket.GASKET_IOCTL_UNMAP_BUFFER, &sentryIoctlParams) if err != nil { return n, err } fd.device.mu.Lock() defer fd.device.mu.Unlock() s := &fd.device.devAddrSet r := DevAddrRange{userIoctlParams.DeviceAddress, userIoctlParams.DeviceAddress + userIoctlParams.Size} seg := s.LowerBoundSegment(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.Isolate(seg, r) v := seg.Value() mm.Unpin([]mm.PinnedRange{v.pinnedRange}) gap := s.Remove(seg) seg = gap.NextSegment() } return n, nil } func gasketInterruptMappingIoctl(ctx context.Context, t *kernel.Task, hostFd int32, paramsAddr hostarch.Addr, lite bool) (uintptr, error) { var userIoctlParams gasket.GasketInterruptMapping if _, err := userIoctlParams.CopyIn(t, paramsAddr); err != nil { return 0, err } sizeOfInterruptList := tpu.SizeOfTPUV4InterruptList interruptMap := tpu.TPUV4InterruptsMap if lite { sizeOfInterruptList = tpu.SizeOfTPUV4liteInterruptList interruptMap = tpu.TPUV4liteInterruptsMap } if userIoctlParams.Interrupt >= sizeOfInterruptList { return 0, linuxerr.EINVAL } barRegMap, ok := interruptMap[userIoctlParams.BarIndex] if !ok { return 0, linuxerr.EINVAL } if _, ok := barRegMap[userIoctlParams.RegOffset]; !ok { return 0, linuxerr.EINVAL } // Check that 'userEventFD.Eventfd' is an eventfd. eventFileGeneric, _ := t.FDTable().Get(int32(userIoctlParams.EventFD)) if eventFileGeneric == nil { return 0, linuxerr.EBADF } defer eventFileGeneric.DecRef(ctx) eventFile, ok := eventFileGeneric.Impl().(*eventfd.EventFileDescription) if !ok { return 0, linuxerr.EINVAL } eventfd, err := eventFile.HostFD() if err != nil { return 0, err } sentryIoctlParams := userIoctlParams sentryIoctlParams.EventFD = uint64(eventfd) n, err := tpuproxy.IOCTLInvokePtrArg[gasket.Ioctl](hostFd, gasket.GASKET_IOCTL_REGISTER_INTERRUPT, &sentryIoctlParams) if err != nil { return n, err } outIoctlParams := sentryIoctlParams outIoctlParams.EventFD = userIoctlParams.EventFD if _, err := outIoctlParams.CopyOut(t, paramsAddr); err != nil { return n, err } return n, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/accel/seccomp_filters.go000066400000000000000000000047631465435605700271630ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package accel import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/gasket" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" ) // Filters returns seccomp-bpf filters for this package. func Filters() seccomp.SyscallRules { return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_OPENAT: seccomp.PerArg{ // All paths that we openat() are absolute, so we pass a dirfd // of -1 (which is invalid for relative paths, but ignored for // absolute paths) to hedge against bugs involving AT_FDCWD or // real dirfds. seccomp.EqualTo(^uintptr(0)), seccomp.AnyValue{}, seccomp.MaskedEqual(unix.O_CREAT|unix.O_NOFOLLOW, unix.O_NOFOLLOW), seccomp.AnyValue{}, }, unix.SYS_GETDENTS64: seccomp.MatchAll{}, unix.SYS_IOCTL: seccomp.Or{ seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(gasket.GASKET_IOCTL_RESET), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(gasket.GASKET_IOCTL_MAP_BUFFER), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(gasket.GASKET_IOCTL_UNMAP_BUFFER), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(gasket.GASKET_IOCTL_CLEAR_INTERRUPT_COUNTS), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(gasket.GASKET_IOCTL_REGISTER_INTERRUPT), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(gasket.GASKET_IOCTL_UNREGISTER_INTERRUPT), }, }, unix.SYS_EVENTFD2: seccomp.Or{ seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(linux.EFD_NONBLOCK), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(linux.EFD_NONBLOCK | linux.EFD_SEMAPHORE), }, }, unix.SYS_MREMAP: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(0), /* old_size */ seccomp.AnyValue{}, seccomp.EqualTo(linux.MREMAP_MAYMOVE | linux.MREMAP_FIXED), seccomp.AnyValue{}, seccomp.EqualTo(0), }, }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/accel/tpu_v4.go000066400000000000000000000162021465435605700252120ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package accel implements proxying for hardware accelerators. 
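//
// Host syscalls issued on behalf of the application (openat, getdents64,
// ioctl, eventfd2, mremap) must be permitted by the sandbox's seccomp policy;
// the rules returned by Filters() in this package are meant to be merged into
// that policy. A rough sketch (installFilters is a hypothetical helper for
// whatever mechanism the sandbox uses to combine and install rule sets):
//
//	rules := accel.Filters()
//	if err := installFilters(rules); err != nil {
//		return err
//	}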
package accel import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/gasket" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // tpuV4FD implements vfs.FileDescriptionImpl for /dev/accel[0-9]+. // // accelFD is not savable; we do not implement save/restore of accelerator // state. type tpuV4FD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD hostFD int32 device *tpuV4Device queue waiter.Queue memmapFile accelFDMemmapFile } // Release implements vfs.FileDescriptionImpl.Release. func (fd *tpuV4FD) Release(context.Context) { fd.device.mu.Lock() defer fd.device.mu.Unlock() fd.device.openWriteFDs-- if fd.device.openWriteFDs == 0 { log.Infof("openWriteFDs is zero, unpinning all sentry memory mappings") s := &fd.device.devAddrSet seg := s.FirstSegment() for seg.Ok() { r, v := seg.Range(), seg.Value() gpti := gasket.GasketPageTableIoctl{ PageTableIndex: v.pageTableIndex, DeviceAddress: r.Start, Size: r.End - r.Start, HostAddress: 0, } _, err := tpuproxy.IOCTLInvokePtrArg[gasket.Ioctl](fd.hostFD, gasket.GASKET_IOCTL_UNMAP_BUFFER, &gpti) if err != nil { log.Warningf("could not unmap range [%#x, %#x) (index %d) on device: %v", r.Start, r.End, v.pageTableIndex, err) } mm.Unpin([]mm.PinnedRange{v.pinnedRange}) gap := s.Remove(seg) seg = gap.NextSegment() } fd.device.owner = nil } fdnotifier.RemoveFD(fd.hostFD) unix.Close(int(fd.hostFD)) } // EventRegister implements waiter.Waitable.EventRegister. func (fd *tpuV4FD) EventRegister(e *waiter.Entry) error { fd.queue.EventRegister(e) if err := fdnotifier.UpdateFD(fd.hostFD); err != nil { fd.queue.EventUnregister(e) return err } return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (fd *tpuV4FD) EventUnregister(e *waiter.Entry) { fd.queue.EventUnregister(e) if err := fdnotifier.UpdateFD(fd.hostFD); err != nil { panic(fmt.Sprint("UpdateFD:", err)) } } // Readiness implements waiter.Waitable.Readiness. func (fd *tpuV4FD) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(fd.hostFD, mask) } // Epollable implements vfs.FileDescriptionImpl.Epollable. func (fd *tpuV4FD) Epollable() bool { return true } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (fd *tpuV4FD) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { cmd := args[1].Uint() argPtr := args[2].Pointer() argSize := linux.IOC_SIZE(cmd) t := kernel.TaskFromContext(ctx) if t == nil { panic("Ioctl should be called from a task context") } if err := fd.checkPermission(t); err != nil { return 0, err } log.Infof("Accel ioctl %s called on fd %d with arg %v of size %d.", gasket.Ioctl(cmd), fd.hostFD, argPtr, argSize) switch gasket.Ioctl(cmd) { // Not yet implemented gasket ioctls. 
case gasket.GASKET_IOCTL_SET_EVENTFD, gasket.GASKET_IOCTL_CLEAR_EVENTFD, gasket.GASKET_IOCTL_NUMBER_PAGE_TABLES, gasket.GASKET_IOCTL_PAGE_TABLE_SIZE, gasket.GASKET_IOCTL_SIMPLE_PAGE_TABLE_SIZE, gasket.GASKET_IOCTL_PARTITION_PAGE_TABLE, gasket.GASKET_IOCTL_MAP_DMA_BUF: return 0, linuxerr.ENOSYS case gasket.GASKET_IOCTL_RESET: return tpuproxy.IOCTLInvoke[gasket.Ioctl, uint64](fd.hostFD, gasket.GASKET_IOCTL_RESET, args[2].Uint64()) case gasket.GASKET_IOCTL_MAP_BUFFER: return gasketMapBufferIoctl(ctx, t, fd.hostFD, fd, argPtr) case gasket.GASKET_IOCTL_UNMAP_BUFFER: return gasketUnmapBufferIoctl(ctx, t, fd.hostFD, fd, argPtr) case gasket.GASKET_IOCTL_CLEAR_INTERRUPT_COUNTS: return tpuproxy.IOCTLInvoke[gasket.Ioctl](fd.hostFD, gasket.GASKET_IOCTL_CLEAR_INTERRUPT_COUNTS, 0) case gasket.GASKET_IOCTL_REGISTER_INTERRUPT: return gasketInterruptMappingIoctl(ctx, t, fd.hostFD, argPtr, fd.device.lite) case gasket.GASKET_IOCTL_UNREGISTER_INTERRUPT: return tpuproxy.IOCTLInvoke[gasket.Ioctl, uint64](fd.hostFD, gasket.GASKET_IOCTL_UNREGISTER_INTERRUPT, args[2].Uint64()) default: return 0, linuxerr.EINVAL } } // checkPermission checks that the thread that owns this device is the only // one that can issue commands to the TPU. Other threads with access to // /dev/accel will not be able to issue commands to the device. func (fd *tpuV4FD) checkPermission(t *kernel.Task) error { fd.device.mu.Lock() defer fd.device.mu.Unlock() owner := fd.device.owner if t.ThreadGroup() != owner { return linuxerr.EPERM } return nil } type pinnedAccelMem struct { pinnedRange mm.PinnedRange pageTableIndex uint64 } // DevAddrSet tracks device address ranges that have been mapped. type devAddrSetFuncs struct{} func (devAddrSetFuncs) MinKey() uint64 { return 0 } func (devAddrSetFuncs) MaxKey() uint64 { return ^uint64(0) } func (devAddrSetFuncs) ClearValue(val *pinnedAccelMem) { *val = pinnedAccelMem{} } func (devAddrSetFuncs) Merge(r1 DevAddrRange, v1 pinnedAccelMem, r2 DevAddrRange, v2 pinnedAccelMem) (pinnedAccelMem, bool) { // Do we have the same backing file? if v1.pinnedRange.File != v2.pinnedRange.File { return pinnedAccelMem{}, false } // Do we have contiguous offsets in the backing file? if v1.pinnedRange.Offset+uint64(v1.pinnedRange.Source.Length()) != v2.pinnedRange.Offset { return pinnedAccelMem{}, false } // Are the virtual addresses contiguous? // // This check isn't strictly needed because 'mm.PinnedRange.Source' // is only used to track the size of the pinned region (this is // because the virtual address range can be unmapped or remapped // elsewhere). Regardless we require this for simplicity. if v1.pinnedRange.Source.End != v2.pinnedRange.Source.Start { return pinnedAccelMem{}, false } // Extend v1 to account for the adjacent PinnedRange. v1.pinnedRange.Source.End = v2.pinnedRange.Source.End return v1, true } func (devAddrSetFuncs) Split(r DevAddrRange, val pinnedAccelMem, split uint64) (pinnedAccelMem, pinnedAccelMem) { n := split - r.Start left := val left.pinnedRange.Source.End = left.pinnedRange.Source.Start + hostarch.Addr(n) right := val right.pinnedRange.Source.Start += hostarch.Addr(n) right.pinnedRange.Offset += n return left, right } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/accel/tpu_v4_mmap.go000066400000000000000000000054451465435605700262330ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package accel import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *tpuV4FD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) } // AddMapping implements memmap.Mappable.AddMapping. func (fd *tpuV4FD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (fd *tpuV4FD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. func (fd *tpuV4FD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return nil } // Translate implements memmap.Mappable.Translate. func (fd *tpuV4FD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { return []memmap.Translation{ { Source: optional, File: &fd.memmapFile, Offset: optional.Start, Perms: at, }, }, nil } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (fd *tpuV4FD) InvalidateUnsavable(ctx context.Context) error { return nil } type accelFDMemmapFile struct { memmap.NoBufferedIOFallback fd *tpuV4FD } // IncRef implements memmap.File.IncRef. func (mf *accelFDMemmapFile) IncRef(memmap.FileRange, uint32) { } // DecRef implements memmap.File.DecRef. func (mf *accelFDMemmapFile) DecRef(fr memmap.FileRange) { } // MapInternal implements memmap.File.MapInternal. func (mf *accelFDMemmapFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { log.Traceback("accel: rejecting accelFDMemmapFile.MapInternal") return safemem.BlockSeq{}, linuxerr.EINVAL } // FD implements memmap.File.FD. func (mf *accelFDMemmapFile) FD() int { return int(mf.fd.hostFD) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/memdev/000077500000000000000000000000001465435605700236475ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/memdev/full.go000066400000000000000000000047701465435605700251500ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package memdev import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) const fullDevMinor = 7 // fullDevice implements vfs.Device for /dev/full. // // +stateify savable type fullDevice struct{} // Open implements vfs.Device.Open. func (fullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &fullFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } // fullFD implements vfs.FileDescriptionImpl for /dev/full. // // +stateify savable type fullFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD } // Release implements vfs.FileDescriptionImpl.Release. func (fd *fullFD) Release(context.Context) { // noop } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *fullFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return dst.ZeroOut(ctx, dst.NumBytes()) } // Read implements vfs.FileDescriptionImpl.Read. func (fd *fullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { return dst.ZeroOut(ctx, dst.NumBytes()) } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *fullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return 0, linuxerr.ENOSPC } // Write implements vfs.FileDescriptionImpl.Write. func (fd *fullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { return 0, linuxerr.ENOSPC } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *fullFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return 0, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/memdev/memdev.go000066400000000000000000000027431465435605700254610ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package memdev implements "mem" character devices, as implemented in Linux // by drivers/char/mem.c and drivers/char/random.c. package memdev import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Register registers all devices implemented by this package in vfsObj. 
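//
// A typical caller wires this up once during VFS initialization, e.g.
// (sketch; surrounding setup and device file creation omitted):
//
//	if err := memdev.Register(vfsObj); err != nil {
//		return fmt.Errorf("registering memdev devices: %w", err)
//	}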
func Register(vfsObj *vfs.VirtualFilesystem) error { for minor, spec := range map[uint32]struct { dev vfs.Device pathname string }{ nullDevMinor: {nullDevice{}, "null"}, zeroDevMinor: {zeroDevice{}, "zero"}, fullDevMinor: {fullDevice{}, "full"}, randomDevMinor: {randomDevice{}, "random"}, urandomDevMinor: {randomDevice{}, "urandom"}, } { if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MEM_MAJOR, minor, spec.dev, &vfs.RegisterDeviceOptions{ GroupName: "mem", Pathname: spec.pathname, FilePerms: 0666, }); err != nil { return err } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/memdev/memdev_state_autogen.go000066400000000000000000000137731465435605700304100ustar00rootroot00000000000000// automatically generated by stateify. package memdev import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (f *fullDevice) StateTypeName() string { return "pkg/sentry/devices/memdev.fullDevice" } func (f *fullDevice) StateFields() []string { return []string{} } func (f *fullDevice) beforeSave() {} // +checklocksignore func (f *fullDevice) StateSave(stateSinkObject state.Sink) { f.beforeSave() } func (f *fullDevice) afterLoad(context.Context) {} // +checklocksignore func (f *fullDevice) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fd *fullFD) StateTypeName() string { return "pkg/sentry/devices/memdev.fullFD" } func (fd *fullFD) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", } } func (fd *fullFD) beforeSave() {} // +checklocksignore func (fd *fullFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &fd.NoLockFD) } func (fd *fullFD) afterLoad(context.Context) {} // +checklocksignore func (fd *fullFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &fd.NoLockFD) } func (n *nullDevice) StateTypeName() string { return "pkg/sentry/devices/memdev.nullDevice" } func (n *nullDevice) StateFields() []string { return []string{} } func (n *nullDevice) beforeSave() {} // +checklocksignore func (n *nullDevice) StateSave(stateSinkObject state.Sink) { n.beforeSave() } func (n *nullDevice) afterLoad(context.Context) {} // +checklocksignore func (n *nullDevice) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fd *nullFD) StateTypeName() string { return "pkg/sentry/devices/memdev.nullFD" } func (fd *nullFD) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", } } func (fd *nullFD) beforeSave() {} // +checklocksignore func (fd *nullFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &fd.NoLockFD) } func (fd *nullFD) afterLoad(context.Context) {} // +checklocksignore func (fd *nullFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, 
&fd.NoLockFD) } func (r *randomDevice) StateTypeName() string { return "pkg/sentry/devices/memdev.randomDevice" } func (r *randomDevice) StateFields() []string { return []string{} } func (r *randomDevice) beforeSave() {} // +checklocksignore func (r *randomDevice) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r *randomDevice) afterLoad(context.Context) {} // +checklocksignore func (r *randomDevice) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fd *randomFD) StateTypeName() string { return "pkg/sentry/devices/memdev.randomFD" } func (fd *randomFD) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", "off", } } func (fd *randomFD) beforeSave() {} // +checklocksignore func (fd *randomFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &fd.NoLockFD) stateSinkObject.Save(4, &fd.off) } func (fd *randomFD) afterLoad(context.Context) {} // +checklocksignore func (fd *randomFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &fd.NoLockFD) stateSourceObject.Load(4, &fd.off) } func (z *zeroDevice) StateTypeName() string { return "pkg/sentry/devices/memdev.zeroDevice" } func (z *zeroDevice) StateFields() []string { return []string{} } func (z *zeroDevice) beforeSave() {} // +checklocksignore func (z *zeroDevice) StateSave(stateSinkObject state.Sink) { z.beforeSave() } func (z *zeroDevice) afterLoad(context.Context) {} // +checklocksignore func (z *zeroDevice) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fd *zeroFD) StateTypeName() string { return "pkg/sentry/devices/memdev.zeroFD" } func (fd *zeroFD) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", } } func (fd *zeroFD) beforeSave() {} // +checklocksignore func (fd *zeroFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &fd.NoLockFD) } func (fd *zeroFD) afterLoad(context.Context) {} // +checklocksignore func (fd *zeroFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &fd.NoLockFD) } func init() { state.Register((*fullDevice)(nil)) state.Register((*fullFD)(nil)) state.Register((*nullDevice)(nil)) state.Register((*nullFD)(nil)) state.Register((*randomDevice)(nil)) state.Register((*randomFD)(nil)) state.Register((*zeroDevice)(nil)) state.Register((*zeroFD)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/memdev/null.go000066400000000000000000000046521465435605700251570ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package memdev import ( "io" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) const nullDevMinor = 3 // nullDevice implements vfs.Device for /dev/null. // // +stateify savable type nullDevice struct{} // Open implements vfs.Device.Open. func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &nullFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } // nullFD implements vfs.FileDescriptionImpl for /dev/null. // // +stateify savable type nullFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD } // Release implements vfs.FileDescriptionImpl.Release. func (fd *nullFD) Release(context.Context) { // noop } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *nullFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, io.EOF } // Read implements vfs.FileDescriptionImpl.Read. func (fd *nullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { return 0, io.EOF } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *nullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return src.NumBytes(), nil } // Write implements vfs.FileDescriptionImpl.Write. func (fd *nullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { return src.NumBytes(), nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *nullFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return 0, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/memdev/random.go000066400000000000000000000060371465435605700254640ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package memdev import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) const ( randomDevMinor = 8 urandomDevMinor = 9 ) // randomDevice implements vfs.Device for /dev/random and /dev/urandom. // // +stateify savable type randomDevice struct{} // Open implements vfs.Device.Open. 
func (randomDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &randomFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } // randomFD implements vfs.FileDescriptionImpl for /dev/random. // // +stateify savable type randomFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD // off is the "file offset". off is accessed using atomic memory // operations. off atomicbitops.Int64 } // Release implements vfs.FileDescriptionImpl.Release. func (fd *randomFD) Release(context.Context) { // noop } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *randomFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) } // Read implements vfs.FileDescriptionImpl.Read. func (fd *randomFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) fd.off.Add(n) return n, err } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *randomFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { // In Linux, this mixes the written bytes into the entropy pool; we just // throw them away. return src.NumBytes(), nil } // Write implements vfs.FileDescriptionImpl.Write. func (fd *randomFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.off.Add(src.NumBytes()) return src.NumBytes(), nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *randomFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Linux: drivers/char/random.c:random_fops.llseek == urandom_fops.llseek // == noop_llseek return fd.off.Load(), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/memdev/zero.go000066400000000000000000000070531465435605700251620ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package memdev import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) const zeroDevMinor = 5 // zeroDevice implements vfs.Device for /dev/zero. // // +stateify savable type zeroDevice struct{} // Open implements vfs.Device.Open. 
func (zeroDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &zeroFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } // zeroFD implements vfs.FileDescriptionImpl for /dev/zero. // // +stateify savable type zeroFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD } // Release implements vfs.FileDescriptionImpl.Release. func (fd *zeroFD) Release(context.Context) { // noop } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *zeroFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return dst.ZeroOut(ctx, dst.NumBytes()) } // Read implements vfs.FileDescriptionImpl.Read. func (fd *zeroFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { return dst.ZeroOut(ctx, dst.NumBytes()) } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *zeroFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return src.NumBytes(), nil } // Write implements vfs.FileDescriptionImpl.Write. func (fd *zeroFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { return src.NumBytes(), nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return 0, nil } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { if opts.Private || !opts.MaxPerms.Write { // This mapping will never permit writing to the "underlying file" (in // Linux terms, it isn't VM_SHARED), so implement it as an anonymous // mapping, but back it with fd; this is what Linux does, and is // actually application-visible because the resulting VMA will show up // in /proc/[pid]/maps with fd.vfsfd.VirtualDentry()'s path rather than // "/dev/zero (deleted)". opts.Offset = 0 opts.MappingIdentity = &fd.vfsfd opts.SentryOwnedContent = true opts.MappingIdentity.IncRef() return nil } tmpfsFD, err := tmpfs.NewZeroFile(ctx, auth.CredentialsFromContext(ctx), kernel.KernelFromContext(ctx).ShmMount(), opts.Length) if err != nil { return err } defer tmpfsFD.DecRef(ctx) return tmpfsFD.ConfigureMMap(ctx, opts) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/000077500000000000000000000000001465435605700241175ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/fds_mutex.go000066400000000000000000000030501465435605700264420ustar00rootroot00000000000000package nvproxy import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type fdsMutex struct { mu sync.Mutex } var fdsprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var fdslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type fdslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. 
// +checklocksignore func (m *fdsMutex) Lock() { locking.AddGLock(fdsprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *fdsMutex) NestedLock(i fdslockNameIndex) { locking.AddGLock(fdsprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *fdsMutex) Unlock() { locking.DelGLock(fdsprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *fdsMutex) NestedUnlock(i fdslockNameIndex) { locking.DelGLock(fdsprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func fdsinitLockNames() {} func init() { fdsinitLockNames() fdsprefixIndex = locking.NewMutexClass(reflect.TypeOf(fdsMutex{}), fdslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/frontend.go000066400000000000000000001116651465435605700262770ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvproxy import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/abi/nvgpu" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/devutil" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // frontendDevice implements vfs.Device for /dev/nvidia# and /dev/nvidiactl. // // +stateify savable type frontendDevice struct { nvp *nvproxy minor uint32 } func (dev *frontendDevice) basename() string { if dev.minor == nvgpu.NV_CONTROL_DEVICE_MINOR { return "nvidiactl" } return fmt.Sprintf("nvidia%d", dev.minor) } // Open implements vfs.Device.Open. 
func (dev *frontendDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { devClient := devutil.GoferClientFromContext(ctx) if devClient == nil { log.Warningf("devutil.CtxDevGoferClient is not set") return nil, linuxerr.ENOENT } basename := dev.basename() hostFD, err := devClient.OpenAt(ctx, basename, opts.Flags) if err != nil { ctx.Warningf("nvproxy: failed to open host %s: %v", basename, err) return nil, err } fd := &frontendFD{ dev: dev, containerName: devClient.ContainerName(), hostFD: int32(hostFD), } if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { unix.Close(hostFD) return nil, err } fd.internalEntry.Init(fd, waiter.AllEvents) fd.internalQueue.EventRegister(&fd.internalEntry) if err := fdnotifier.AddFD(int32(hostFD), &fd.internalQueue); err != nil { unix.Close(hostFD) return nil, err } fd.memmapFile.fd = fd fd.dev.nvp.fdsMu.Lock() defer fd.dev.nvp.fdsMu.Unlock() fd.dev.nvp.frontendFDs[fd] = struct{}{} return &fd.vfsfd, nil } // frontendFD implements vfs.FileDescriptionImpl for /dev/nvidia# and // /dev/nvidiactl. // // +stateify savable type frontendFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD dev *frontendDevice containerName string hostFD int32 memmapFile frontendFDMemmapFile // The driver's implementation of poll() for these files, // kernel-open/nvidia/nv.c:nvidia_poll(), unsets // nv_linux_file_private_t::dataless_event_pending if it's set. This makes // notifications from dataless_event_pending edge-triggered; a host poll() // or epoll_wait() that returns the notification consumes it, preventing // future calls to poll() or epoll_wait() from observing the same // notification again. // // This is problematic in gVisor: fdnotifier, which epoll_wait()s on an // epoll instance that includes our hostFD, will forward notifications to // registered waiters, but this typically only wakes up blocked task // goroutines which will later call vfs.FileDescription.Readiness() to get // the FD's most up-to-date state. If our implementation of Readiness() // just polls the underlying host FD, it will no longer observe the // consumed notification. // // To work around this, intercept all events from fdnotifier and cache them // for the first following call to Readiness(), essentially replicating the // driver's behavior. internalQueue waiter.Queue internalEntry waiter.Entry cachedEvents atomicbitops.Uint64 appQueue waiter.Queue haveMmapContext atomicbitops.Bool `state:"nosave"` // clients are handles of clients owned by this frontendFD. clients is // protected by dev.nvp.objsMu. clients map[nvgpu.Handle]struct{} } // Release implements vfs.FileDescriptionImpl.Release. func (fd *frontendFD) Release(ctx context.Context) { fdnotifier.RemoveFD(fd.hostFD) fd.appQueue.Notify(waiter.EventHUp) fd.dev.nvp.fdsMu.Lock() delete(fd.dev.nvp.frontendFDs, fd) fd.dev.nvp.fdsMu.Unlock() fd.dev.nvp.objsLock() defer fd.dev.nvp.objsUnlock() unix.Close(int(fd.hostFD)) // src/nvidia/arch/nvalloc/unix/src/osapi.c:rm_cleanup_file_private() => // RmFreeUnusedClients() for h := range fd.clients { fd.dev.nvp.objFree(ctx, h, h) } } // EventRegister implements waiter.Waitable.EventRegister. func (fd *frontendFD) EventRegister(e *waiter.Entry) error { fd.appQueue.EventRegister(e) return nil } // EventUnregister implements waiter.Waitable.EventUnregister. 
func (fd *frontendFD) EventUnregister(e *waiter.Entry) { fd.appQueue.EventUnregister(e) } // Readiness implements waiter.Waitable.Readiness. func (fd *frontendFD) Readiness(mask waiter.EventMask) waiter.EventMask { for { cachedEvents := waiter.EventMask(fd.cachedEvents.Load()) maskedEvents := cachedEvents & mask if maskedEvents == 0 { // Poll for all events and cache any not consumed by this call. events := fdnotifier.NonBlockingPoll(fd.hostFD, waiter.AllEvents) if unmaskedEvents := events &^ mask; unmaskedEvents != 0 { fd.cacheEvents(unmaskedEvents) } return events & mask } if fd.cachedEvents.CompareAndSwap(uint64(cachedEvents), uint64(cachedEvents&^maskedEvents)) { return maskedEvents } } } func (fd *frontendFD) cacheEvents(mask waiter.EventMask) { for { oldEvents := waiter.EventMask(fd.cachedEvents.Load()) newEvents := oldEvents | mask if oldEvents == newEvents { break } if fd.cachedEvents.CompareAndSwap(uint64(oldEvents), uint64(newEvents)) { break } } } // NotifyEvent implements waiter.EventListener.NotifyEvent. func (fd *frontendFD) NotifyEvent(mask waiter.EventMask) { // Events must be cached before notifying fd.appQueue, in order to ensure // that the first notified waiter to call fd.Readiness() sees the // newly-cached events. fd.cacheEvents(mask) fd.appQueue.Notify(mask) } // Epollable implements vfs.FileDescriptionImpl.Epollable. func (fd *frontendFD) Epollable() bool { return true } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (fd *frontendFD) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { cmd := args[1].Uint() nr := linux.IOC_NR(cmd) argPtr := args[2].Pointer() argSize := linux.IOC_SIZE(cmd) t := kernel.TaskFromContext(ctx) if t == nil { panic("Ioctl should be called from a task context") } if ctx.IsLogging(log.Debug) { ctx.Debugf("nvproxy: frontend ioctl: nr = %d = %#x, argSize = %d", nr, nr, argSize) } fi := frontendIoctlState{ fd: fd, ctx: ctx, t: t, nr: nr, ioctlParamsAddr: argPtr, ioctlParamsSize: argSize, } // nr determines the argument type. // Implementors: // - To map nr to a symbol, look in // src/nvidia/arch/nvalloc/unix/include/nv_escape.h, // kernel-open/common/inc/nv-ioctl-numbers.h, and // kernel-open/common/inc/nv-ioctl-numa.h. // - To determine the parameter type, find the implementation in // kernel-open/nvidia/nv.c:nvidia_ioctl() or // src/nvidia/arch/nvalloc/unix/src/escape.c:RmIoctl(). // - Add symbol and parameter type definitions to //pkg/abi/nvgpu. // - Add filter to seccomp_filters.go. // - Add handling below. handler := fd.dev.nvp.abi.frontendIoctl[nr] if handler == nil { ctx.Warningf("nvproxy: unknown frontend ioctl %d == %#x (argSize=%d, cmd=%#x)", nr, nr, argSize, cmd) return 0, linuxerr.EINVAL } return handler(&fi) } // IsNvidiaDeviceFD implements NvidiaDeviceFD.IsNvidiaDeviceFD. func (fd *frontendFD) IsNvidiaDeviceFD() {} func frontendIoctlCmd(nr, argSize uint32) uintptr { return uintptr(linux.IOWR(nvgpu.NV_IOCTL_MAGIC, nr, argSize)) } // frontendIoctlState holds the state of a call to frontendFD.Ioctl(). type frontendIoctlState struct { fd *frontendFD ctx context.Context t *kernel.Task nr uint32 ioctlParamsAddr hostarch.Addr ioctlParamsSize uint32 } // frontendIoctlSimple implements a frontend ioctl whose parameters don't // contain any pointers requiring translation, file descriptors, or special // cases or effects, and consequently don't need to be typed by the sentry. 
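//
// Handlers of this shape are looked up by ioctl number in the nvproxy ABI's
// frontendIoctl table (see fd.dev.nvp.abi.frontendIoctl in Ioctl above): the
// raw parameter bytes are copied in from the application, passed to the host
// driver unchanged, and copied back out.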
func frontendIoctlSimple(fi *frontendIoctlState) (uintptr, error) { if fi.ioctlParamsSize == 0 { return frontendIoctlInvoke[byte](fi, nil) } ioctlParams := make([]byte, fi.ioctlParamsSize) if _, err := fi.t.CopyInBytes(fi.ioctlParamsAddr, ioctlParams); err != nil { return 0, err } n, err := frontendIoctlInvoke(fi, &ioctlParams[0]) if err != nil { return n, err } if _, err := fi.t.CopyOutBytes(fi.ioctlParamsAddr, ioctlParams); err != nil { return n, err } return n, nil } func rmNumaInfo(fi *frontendIoctlState) (uintptr, error) { // The CPU topology seen by the host driver differs from the CPU // topology presented by the sentry to the application, so reject this // ioctl; doing so is non-fatal. log.Debugf("nvproxy: ignoring NV_ESC_NUMA_INFO") return 0, linuxerr.EINVAL } func frontendRegisterFD(fi *frontendIoctlState) (uintptr, error) { var ioctlParams nvgpu.IoctlRegisterFD if fi.ioctlParamsSize != nvgpu.SizeofIoctlRegisterFD { return 0, linuxerr.EINVAL } if _, err := ioctlParams.CopyIn(fi.t, fi.ioctlParamsAddr); err != nil { return 0, err } ctlFileGeneric, _ := fi.t.FDTable().Get(ioctlParams.CtlFD) if ctlFileGeneric == nil { return 0, linuxerr.EINVAL } defer ctlFileGeneric.DecRef(fi.ctx) ctlFile, ok := ctlFileGeneric.Impl().(*frontendFD) if !ok { return 0, linuxerr.EINVAL } ioctlParams.CtlFD = ctlFile.hostFD // The returned ctl_fd can't change, so skip copying out. return frontendIoctlInvoke(fi, &ioctlParams) } func frontendIoctHasFD[Params any, PtrParams hasFrontendFDPtr[Params]](fi *frontendIoctlState) (uintptr, error) { var ioctlParamsValue Params ioctlParams := PtrParams(&ioctlParamsValue) if int(fi.ioctlParamsSize) != ioctlParams.SizeBytes() { return 0, linuxerr.EINVAL } if _, err := ioctlParams.CopyIn(fi.t, fi.ioctlParamsAddr); err != nil { return 0, err } origFD := ioctlParams.GetFrontendFD() eventFileGeneric, _ := fi.t.FDTable().Get(origFD) if eventFileGeneric == nil { return 0, linuxerr.EINVAL } defer eventFileGeneric.DecRef(fi.ctx) eventFile, ok := eventFileGeneric.Impl().(*frontendFD) if !ok { return 0, linuxerr.EINVAL } ioctlParams.SetFrontendFD(eventFile.hostFD) n, err := frontendIoctlInvoke(fi, ioctlParams) ioctlParams.SetFrontendFD(origFD) if err != nil { return n, err } if _, err := ioctlParams.CopyOut(fi.t, fi.ioctlParamsAddr); err != nil { return n, err } return n, nil } func rmAllocMemory(fi *frontendIoctlState) (uintptr, error) { var ioctlParams nvgpu.IoctlNVOS02ParametersWithFD if fi.ioctlParamsSize != nvgpu.SizeofIoctlNVOS02ParametersWithFD { return 0, linuxerr.EINVAL } if _, err := ioctlParams.CopyIn(fi.t, fi.ioctlParamsAddr); err != nil { return 0, err } if log.IsLogging(log.Debug) { fi.ctx.Debugf("nvproxy: NV_ESC_RM_ALLOC_MEMORY class %v", ioctlParams.Params.HClass) } // See src/nvidia/arch/nvalloc/unix/src/escape.c:RmIoctl() and // src/nvidia/interface/deprecated/rmapi_deprecated_allocmemory.c:rmAllocMemoryTable // for implementation. switch ioctlParams.Params.HClass { case nvgpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR: return rmAllocOSDescriptor(fi, &ioctlParams) default: fi.ctx.Warningf("nvproxy: unknown NV_ESC_RM_ALLOC_MEMORY class %v", ioctlParams.Params.HClass) return 0, linuxerr.EINVAL } } func rmAllocOSDescriptor(fi *frontendIoctlState, ioctlParams *nvgpu.IoctlNVOS02ParametersWithFD) (uintptr, error) { // Compare src/nvidia/arch/nvalloc/unix/src/escape.c:RmAllocOsDescriptor() // => RmCreateOsDescriptor(). 
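// The overall flow mirrors gasketMapBufferIoctl in the accel package:
// validate the application-provided range, pin it, mirror the pinned pages
// into a temporary contiguous sentry mapping, and pass that mapping's address
// to the host driver in place of the application address.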
failWithStatus := func(status uint32) error { ioctlParams.Params.Status = status _, err := ioctlParams.CopyOut(fi.t, fi.ioctlParamsAddr) return err } appAddr := addrFromP64(ioctlParams.Params.PMemory) if !appAddr.IsPageAligned() { return 0, failWithStatus(nvgpu.NV_ERR_NOT_SUPPORTED) } arLen := ioctlParams.Params.Limit + 1 if arLen == 0 { // integer overflow return 0, failWithStatus(nvgpu.NV_ERR_INVALID_LIMIT) } var ok bool arLen, ok = hostarch.PageRoundUp(arLen) if !ok { return 0, failWithStatus(nvgpu.NV_ERR_INVALID_ADDRESS) } appAR, ok := appAddr.ToRange(arLen) if !ok { return 0, failWithStatus(nvgpu.NV_ERR_INVALID_ADDRESS) } // The host driver will collect pages from our address space starting at // PMemory, so we must assemble a contiguous mapping equivalent to the // application's. at := hostarch.Read if ((ioctlParams.Params.Flags >> 21) & 0x1) == 0 /* NVOS02_FLAGS_ALLOC_USER_READ_ONLY_NO */ { at.Write = true } // Reserve a range in our address space. m, _, errno := unix.RawSyscall6(unix.SYS_MMAP, 0 /* addr */, uintptr(arLen), unix.PROT_NONE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS, ^uintptr(0) /* fd */, 0 /* offset */) if errno != 0 { return 0, errno } defer unix.RawSyscall(unix.SYS_MUNMAP, m, uintptr(arLen), 0) // Mirror application mappings into the reserved range. prs, err := fi.t.MemoryManager().Pin(fi.ctx, appAR, at, false /* ignorePermissions */) unpinCleanup := cleanup.Make(func() { mm.Unpin(prs) }) defer unpinCleanup.Clean() if err != nil { return 0, err } sentryAddr := uintptr(m) for _, pr := range prs { ims, err := pr.File.MapInternal(memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}, at) if err != nil { return 0, err } for !ims.IsEmpty() { im := ims.Head() if _, _, errno := unix.RawSyscall6(unix.SYS_MREMAP, im.Addr(), 0 /* old_size */, uintptr(im.Len()), linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED, sentryAddr, 0); errno != 0 { return 0, errno } sentryAddr += uintptr(im.Len()) ims = ims.Tail() } } origPMemory := ioctlParams.Params.PMemory ioctlParams.Params.PMemory = nvgpu.P64(uint64(m)) // NV01_MEMORY_SYSTEM_OS_DESCRIPTOR shouldn't use ioctlParams.FD; clobber // it to be sure. origFD := ioctlParams.FD ioctlParams.FD = -1 fi.fd.dev.nvp.objsLock() n, err := frontendIoctlInvoke(fi, ioctlParams) if err == nil && ioctlParams.Params.Status == nvgpu.NV_OK { // Transfer ownership of pinned pages to an osDescMem object, to be // unpinned when the driver OsDescMem is freed. 
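// objAdd records the osDescMem under the client and object handles returned
// by the driver, so that a later NV_ESC_RM_FREE (see rmFree below) or release
// of the owning FD can find it and unpin the ranges.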
fi.fd.dev.nvp.objAdd(fi.ctx, ioctlParams.Params.HRoot, ioctlParams.Params.HObjectNew, nvgpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, &osDescMem{ pinnedRanges: prs, }, ioctlParams.Params.HObjectParent) unpinCleanup.Release() if fi.ctx.IsLogging(log.Debug) { fi.ctx.Debugf("nvproxy: pinned %d bytes for OS descriptor with handle %v", arLen, ioctlParams.Params.HObjectNew) } } fi.fd.dev.nvp.objsUnlock() ioctlParams.Params.PMemory = origPMemory ioctlParams.FD = origFD if err != nil { return n, err } if _, err := ioctlParams.CopyOut(fi.t, fi.ioctlParamsAddr); err != nil { return n, err } return n, nil } func rmFree(fi *frontendIoctlState) (uintptr, error) { var ioctlParams nvgpu.NVOS00Parameters if fi.ioctlParamsSize != nvgpu.SizeofNVOS00Parameters { return 0, linuxerr.EINVAL } if _, err := ioctlParams.CopyIn(fi.t, fi.ioctlParamsAddr); err != nil { return 0, err } fi.fd.dev.nvp.objsLock() n, err := frontendIoctlInvoke(fi, &ioctlParams) if err == nil && ioctlParams.Status == nvgpu.NV_OK { fi.fd.dev.nvp.objFree(fi.ctx, ioctlParams.HRoot, ioctlParams.HObjectOld) } fi.fd.dev.nvp.objsUnlock() if err != nil { return n, err } if _, err := ioctlParams.CopyOut(fi.t, fi.ioctlParamsAddr); err != nil { return n, err } return n, nil } func rmControl(fi *frontendIoctlState) (uintptr, error) { var ioctlParams nvgpu.NVOS54Parameters if fi.ioctlParamsSize != nvgpu.SizeofNVOS54Parameters { return 0, linuxerr.EINVAL } if _, err := ioctlParams.CopyIn(fi.t, fi.ioctlParamsAddr); err != nil { return 0, err } // Cmd determines the type of Params. if log.IsLogging(log.Debug) { fi.ctx.Debugf("nvproxy: control command %#x, object %#x", ioctlParams.Cmd, ioctlParams.HObject.Val) } if ioctlParams.Cmd&nvgpu.RM_GSS_LEGACY_MASK != 0 { // This is a "legacy GSS control" that is implemented by the GPU System // Processor (GSP). Conseqeuently, its parameters cannot reasonably // contain application pointers, and the control is in any case // undocumented. // See // src/nvidia/src/kernel/rmapi/entry_points.c:_nv04ControlWithSecInfo() // => // src/nvidia/interface/deprecated/rmapi_deprecated_control.c:RmDeprecatedGetControlHandler() // => // src/nvidia/interface/deprecated/rmapi_gss_legacy_control.c:RmGssLegacyRpcCmd(). return rmControlSimple(fi, &ioctlParams) } // Implementors: // - Top two bytes of Cmd specifies class; third byte specifies category; // fourth byte specifies "message ID" (command within class/category). // e.g. 0x800288: // - Class 0x0080 => look in // src/common/sdk/nvidia/inc/ctrl/ctrl0080/ctrl0080base.h for categories. // - Category 0x02 => NV0080_CTRL_GPU => look in // src/common/sdk/nvidia/inc/ctrl/ctrl0080/ctrl0080gpu.h for // `#define NV0080_CTRL_CMD_GPU_QUERY_SW_STATE_PERSISTENCE (0x800288)` // and accompanying documentation, parameter type. // - If this fails, or to find implementation, grep for `methodId=.*0x` to find entry in g_*_nvoc.c; // implementing function is is "pFunc". // - Add symbol definition to //pkg/abi/nvgpu. Parameter type definition is // only required for non-simple commands. // - Add handling below. 
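	// For illustration only (nvproxy dispatches on the full Cmd value below
	// and does not need such a helper), the decomposition described above
	// could be sketched as:
	//
	//	func splitControlCmd(cmd uint32) (class uint16, category, messageID uint8) {
	//		return uint16(cmd >> 16), uint8(cmd >> 8), uint8(cmd)
	//	}
	//
	// e.g. splitControlCmd(0x800288) yields (0x0080, 0x02, 0x88).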
handler := fi.fd.dev.nvp.abi.controlCmd[ioctlParams.Cmd] if handler == nil { fi.ctx.Warningf("nvproxy: unknown control command %#x (paramsSize=%d)", ioctlParams.Cmd, ioctlParams.ParamsSize) return 0, linuxerr.EINVAL } return handler(fi, &ioctlParams) } func rmControlSimple(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters) (uintptr, error) { if ioctlParams.ParamsSize == 0 { if ioctlParams.Params != 0 { return 0, linuxerr.EINVAL } return rmControlInvoke[byte](fi, ioctlParams, nil) } if ioctlParams.Params == 0 { return 0, linuxerr.EINVAL } ctrlParams := make([]byte, ioctlParams.ParamsSize) if _, err := fi.t.CopyInBytes(addrFromP64(ioctlParams.Params), ctrlParams); err != nil { return 0, err } n, err := rmControlInvoke(fi, ioctlParams, &ctrlParams[0]) if err != nil { return n, err } if _, err := fi.t.CopyOutBytes(addrFromP64(ioctlParams.Params), ctrlParams); err != nil { return n, err } return n, nil } func ctrlCmdFailWithStatus(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters, status uint32) error { ioctlParams.Status = status _, err := ioctlParams.CopyOut(fi.t, fi.ioctlParamsAddr) return err } func ctrlHasFrontendFD[Params any, PtrParams hasFrontendFDPtr[Params]](fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters) (uintptr, error) { var ctrlParamsValue Params ctrlParams := PtrParams(&ctrlParamsValue) if ctrlParams.SizeBytes() != int(ioctlParams.ParamsSize) { return 0, linuxerr.EINVAL } if _, err := ctrlParams.CopyIn(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return 0, err } origFD := ctrlParams.GetFrontendFD() ctlFileGeneric, _ := fi.t.FDTable().Get(origFD) if ctlFileGeneric == nil { return 0, linuxerr.EINVAL } defer ctlFileGeneric.DecRef(fi.ctx) ctlFile, ok := ctlFileGeneric.Impl().(*frontendFD) if !ok { return 0, linuxerr.EINVAL } ctrlParams.SetFrontendFD(ctlFile.hostFD) n, err := rmControlInvoke(fi, ioctlParams, ctrlParams) ctrlParams.SetFrontendFD(origFD) if err != nil { return n, err } if _, err := ctrlParams.CopyOut(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return n, err } return n, nil } func ctrlMemoryMulticastFabricAttachGPU(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters) (uintptr, error) { var ctrlParams nvgpu.NV00FD_CTRL_ATTACH_GPU_PARAMS if ctrlParams.SizeBytes() != int(ioctlParams.ParamsSize) { return 0, linuxerr.EINVAL } if _, err := ctrlParams.CopyIn(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return 0, err } origDevDescriptor := ctrlParams.DevDescriptor devDescriptor, _ := fi.t.FDTable().Get(int32(origDevDescriptor)) if devDescriptor == nil { return 0, linuxerr.EINVAL } defer devDescriptor.DecRef(fi.ctx) devDesc, ok := devDescriptor.Impl().(*frontendFD) if !ok { return 0, linuxerr.EINVAL } ctrlParams.DevDescriptor = uint64(devDesc.hostFD) n, err := rmControlInvoke(fi, ioctlParams, &ctrlParams) ctrlParams.DevDescriptor = origDevDescriptor // Note that ctrlParams.CopyOut() is not called here because // NV00FD_CTRL_ATTACH_GPU_PARAMS is an input-only parameter. return n, err } func ctrlClientSystemGetBuildVersion(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters) (uintptr, error) { var ctrlParams nvgpu.NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS if ctrlParams.SizeBytes() != int(ioctlParams.ParamsSize) { return 0, linuxerr.EINVAL } if _, err := ctrlParams.CopyIn(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return 0, err } if ctrlParams.PDriverVersionBuffer == 0 || ctrlParams.PVersionBuffer == 0 || ctrlParams.PTitleBuffer == 0 { // No strings are written if any are null. 
See // src/nvidia/interface/deprecated/rmapi_deprecated_control.c:V2_CONVERTER(_NV0000_CTRL_CMD_SYSTEM_GET_BUILD_VERSION). return ctrlClientSystemGetBuildVersionInvoke(fi, ioctlParams, &ctrlParams, nil, nil, nil) } // Need to buffer strings for copy-out. if ctrlParams.SizeOfStrings == 0 { return 0, linuxerr.EINVAL } driverVersionBuf := make([]byte, ctrlParams.SizeOfStrings) versionBuf := make([]byte, ctrlParams.SizeOfStrings) titleBuf := make([]byte, ctrlParams.SizeOfStrings) n, err := ctrlClientSystemGetBuildVersionInvoke(fi, ioctlParams, &ctrlParams, &driverVersionBuf[0], &versionBuf[0], &titleBuf[0]) if err != nil { return n, err } if _, err := fi.t.CopyOutBytes(addrFromP64(ctrlParams.PDriverVersionBuffer), driverVersionBuf); err != nil { return n, err } if _, err := fi.t.CopyOutBytes(addrFromP64(ctrlParams.PVersionBuffer), versionBuf); err != nil { return n, err } if _, err := fi.t.CopyOutBytes(addrFromP64(ctrlParams.PTitleBuffer), titleBuf); err != nil { return n, err } return n, nil } func ctrlDevGpuGetClasslist(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters) (uintptr, error) { var ctrlParams nvgpu.NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS if ctrlParams.SizeBytes() != int(ioctlParams.ParamsSize) { return 0, linuxerr.EINVAL } if _, err := ctrlParams.CopyIn(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return 0, err } // This command has two modes. If the classList pointer is NULL, only simple command handling // is required; see src/common/sdk/nvidia/inc/ctrl/ctrl0080gpu.h. if ctrlParams.ClassList == 0 { return rmControlSimple(fi, ioctlParams) } // classList pointer is not NULL. Check classList size against limit. See // src/nvidia/src/kernel/rmapi/embedded_param_copy.c:embeddedParamCopyIn() => // case NV0080_CTRL_CMD_GPU_GET_CLASSLIST => RMAPI_PARAM_COPY_INIT(). // paramCopy.paramsSize is initialized as numClasses * sizeof(NvU32). 
if ctrlParams.NumClasses*4 > nvgpu.RMAPI_PARAM_COPY_MAX_PARAMS_SIZE { return 0, ctrlCmdFailWithStatus(fi, ioctlParams, nvgpu.NV_ERR_INVALID_ARGUMENT) } classList := make([]uint32, ctrlParams.NumClasses) n, err := ctrlDevGpuGetClasslistInvoke(fi, ioctlParams, &ctrlParams, classList) if err != nil { return n, err } return n, nil } func ctrlRegisterVASpace(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters) (uintptr, error) { var ctrlParams nvgpu.NV503C_CTRL_REGISTER_VA_SPACE_PARAMS if ctrlParams.SizeBytes() != int(ioctlParams.ParamsSize) { return 0, linuxerr.EINVAL } if _, err := ctrlParams.CopyIn(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return 0, err } fi.fd.dev.nvp.objsLock() n, err := rmControlInvoke(fi, ioctlParams, &ctrlParams) if err == nil && ioctlParams.Status == nvgpu.NV_OK { // src/nvidia/src/kernel/gpu/bus/third_party_p2p.c:CliAddThirdPartyP2PVASpace() // => refAddDependant() fi.fd.dev.nvp.objAddDep(ioctlParams.HClient, ioctlParams.HObject, ctrlParams.HVASpace) } fi.fd.dev.nvp.objsUnlock() if err != nil { return n, err } if _, err := ctrlParams.CopyOut(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return n, err } return n, nil } func ctrlSubdevFIFODisableChannels(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters) (uintptr, error) { var ctrlParams nvgpu.NV2080_CTRL_FIFO_DISABLE_CHANNELS_PARAMS if ctrlParams.SizeBytes() != int(ioctlParams.ParamsSize) { return 0, linuxerr.EINVAL } if _, err := ctrlParams.CopyIn(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return 0, err } // This pointer must be NULL; see // src/nvidia/src/kernel/gpu/fifo/kernel_fifo_ctrl.c:subdeviceCtrlCmdFifoDisableChannels_IMPL(). // Consequently, we don't need to translate it, but we do want to ensure // that it actually is NULL. if ctrlParams.PRunlistPreemptEvent != 0 { return 0, linuxerr.EINVAL } n, err := rmControlInvoke(fi, ioctlParams, &ctrlParams) if err != nil { return n, err } if _, err := ctrlParams.CopyOut(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return n, err } return n, nil } func rmAlloc(fi *frontendIoctlState) (uintptr, error) { var isNVOS64 bool switch fi.ioctlParamsSize { case nvgpu.SizeofNVOS21Parameters: case nvgpu.SizeofNVOS64Parameters: isNVOS64 = true default: return 0, linuxerr.EINVAL } // Copy in parameters and convert to NVOS64Parameters, which is a super // set of all parameter types we support. buf := nvgpu.GetRmAllocParamObj(isNVOS64) if _, err := buf.CopyIn(fi.t, fi.ioctlParamsAddr); err != nil { return 0, err } ioctlParams := buf.ToOS64() // hClass determines the type of pAllocParms. if log.IsLogging(log.Debug) { fi.ctx.Debugf("nvproxy: allocation class %v", ioctlParams.HClass) } // Implementors: // - To map hClass to a symbol, look in // src/nvidia/generated/g_allclasses.h. // - See src/nvidia/src/kernel/rmapi/resource_list.h for table mapping class // ("External Class") to the type of pAllocParms ("Alloc Param Info") and // the class whose constructor interprets it ("Internal Class"). // - Add symbol and parameter type definitions to //pkg/abi/nvgpu. // - Check constructor for calls to refAddDependant(), // sessionAddDependant(), or sessionAddDependency(), which need to be // mirrored by dependencies in the call to nvproxy.objAddLocked(). // - Add handling below. 
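	// Conceptually, supporting a new allocation class amounts to mapping its
	// class ID to one of the rmAlloc* handlers in the per-driver-version ABI
	// table consulted below; the exact registration site and any wrapper are
	// version-specific, but the entry is roughly of the form (hypothetical
	// class and parameter type names):
	//
	//	abi.allocationClass[nvgpu.NVXXXX_EXAMPLE_CLASS] =
	//		rmAllocSimple[nvgpu.NVXXXX_EXAMPLE_ALLOCATION_PARAMETERS]
	//
	// where the parameter type must be defined in //pkg/abi/nvgpu.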
handler := fi.fd.dev.nvp.abi.allocationClass[ioctlParams.HClass] if handler == nil { fi.ctx.Warningf("nvproxy: unknown allocation class %v", ioctlParams.HClass) // Compare // src/nvidia/src/kernel/rmapi/alloc_free.c:serverAllocResourceUnderLock(), // when RsResInfoByExternalClassId() is null. ioctlParams.Status = nvgpu.NV_ERR_INVALID_CLASS outIoctlParams := nvgpu.GetRmAllocParamObj(isNVOS64) outIoctlParams.FromOS64(ioctlParams) // Any copy-out error from // src/nvidia/src/kernel/rmapi/alloc_free.c:serverAllocApiCopyOut() is // discarded. outIoctlParams.CopyOut(fi.t, fi.ioctlParamsAddr) return 0, nil } return handler(fi, &ioctlParams, isNVOS64) } // rmAllocSimple implements NV_ESC_RM_ALLOC for classes whose parameters don't // contain any pointers or file descriptors requiring translation, and whose // objects require no special handling and depend only on their parents. // // Unlike frontendIoctlSimple and rmControlSimple, rmAllocSimple requires the // parameter type since the parameter's size is otherwise unknown. func rmAllocSimple[Params any, PtrParams marshalPtr[Params]](fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, isNVOS64 bool) (uintptr, error) { return rmAllocSimpleParams[Params, PtrParams](fi, ioctlParams, isNVOS64, addSimpleObjDepParentLocked) } // addSimpleObjDepParentLocked implements rmAllocInvoke.addObjLocked for // classes that require no special handling and depend only on their parents. func addSimpleObjDepParentLocked[Params any](fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, rightsRequested nvgpu.RS_ACCESS_MASK, allocParams *Params) { fi.fd.dev.nvp.objAdd(fi.ctx, ioctlParams.HRoot, ioctlParams.HObjectNew, ioctlParams.HClass, newRmAllocObject(fi.fd, ioctlParams, rightsRequested, allocParams), ioctlParams.HObjectParent) } func rmAllocSimpleParams[Params any, PtrParams marshalPtr[Params]](fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, isNVOS64 bool, objAddLocked func(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, rightsRequested nvgpu.RS_ACCESS_MASK, allocParams *Params)) (uintptr, error) { if ioctlParams.PAllocParms == 0 { return rmAllocInvoke[Params](fi, ioctlParams, nil, isNVOS64, objAddLocked) } var allocParamsValue Params allocParams := PtrParams(&allocParamsValue) if _, err := allocParams.CopyIn(fi.t, addrFromP64(ioctlParams.PAllocParms)); err != nil { return 0, err } n, err := rmAllocInvoke(fi, ioctlParams, allocParams, isNVOS64, objAddLocked) if err != nil { return n, err } if _, err := allocParams.CopyOut(fi.t, addrFromP64(ioctlParams.PAllocParms)); err != nil { return n, err } return n, nil } func rmAllocNoParams(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, isNVOS64 bool) (uintptr, error) { return rmAllocInvoke[byte](fi, ioctlParams, nil, isNVOS64, addSimpleObjDepParentLocked) } func rmAllocRootClient(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, isNVOS64 bool) (uintptr, error) { return rmAllocSimpleParams(fi, ioctlParams, isNVOS64, func(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, rightsRequested nvgpu.RS_ACCESS_MASK, allocParams *nvgpu.Handle) { fi.fd.dev.nvp.objAdd(fi.ctx, ioctlParams.HRoot, ioctlParams.HObjectNew, ioctlParams.HClass, newRootClient(fi.fd, ioctlParams, rightsRequested, allocParams)) if fi.fd.clients == nil { fi.fd.clients = make(map[nvgpu.Handle]struct{}) } fi.fd.clients[ioctlParams.HObjectNew] = struct{}{} }) } func rmAllocEventOSEvent(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, isNVOS64 bool) (uintptr, error) { var 
allocParams nvgpu.NV0005_ALLOC_PARAMETERS if _, err := allocParams.CopyIn(fi.t, addrFromP64(ioctlParams.PAllocParms)); err != nil { return 0, err } eventFileGeneric, _ := fi.t.FDTable().Get(int32(allocParams.Data)) if eventFileGeneric == nil { return 0, linuxerr.EINVAL } defer eventFileGeneric.DecRef(fi.ctx) eventFile, ok := eventFileGeneric.Impl().(*frontendFD) if !ok { return 0, linuxerr.EINVAL } origData := allocParams.Data allocParams.Data = nvgpu.P64(uint64(eventFile.hostFD)) n, err := rmAllocInvoke(fi, ioctlParams, &allocParams, isNVOS64, func(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, rightsRequested nvgpu.RS_ACCESS_MASK, allocParams *nvgpu.NV0005_ALLOC_PARAMETERS) { fi.fd.dev.nvp.objAdd(fi.ctx, ioctlParams.HRoot, ioctlParams.HObjectNew, ioctlParams.HClass, &osEvent{}, ioctlParams.HObjectParent) }) if err != nil { return n, err } allocParams.Data = origData if _, err := allocParams.CopyOut(fi.t, addrFromP64(ioctlParams.PAllocParms)); err != nil { return n, err } return n, nil } func rmAllocSMDebuggerSession(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, isNVOS64 bool) (uintptr, error) { return rmAllocSimpleParams(fi, ioctlParams, isNVOS64, func(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, rightsRequested nvgpu.RS_ACCESS_MASK, allocParams *nvgpu.NV83DE_ALLOC_PARAMETERS) { // Compare // src/nvidia/src/kernel/gpu/gr/kernel_sm_debugger_session.c:ksmdbgssnConstruct_IMPL() // => _ShareDebugger() => sessionAddDependency/sessionAddDependant(); // the driver indirects through a per-KernelGraphicsObject // RmDebuggerSession, which we elide for dependency tracking. fi.fd.dev.nvp.objAdd(fi.ctx, ioctlParams.HRoot, ioctlParams.HObjectNew, ioctlParams.HClass, newRmAllocObject(fi.fd, ioctlParams, rightsRequested, allocParams), ioctlParams.HObjectParent, allocParams.HClass3DObject) }) } func rmAllocChannelGroup(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, isNVOS64 bool) (uintptr, error) { return rmAllocSimpleParams(fi, ioctlParams, isNVOS64, func(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, rightsRequested nvgpu.RS_ACCESS_MASK, allocParams *nvgpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS) { // See // src/nvidia/src/kernel/gpu/fifo/kernel_channel_group_api.c:kchangrpapiConstruct_IMPL() // => refAddDependant(). fi.fd.dev.nvp.objAdd(fi.ctx, ioctlParams.HRoot, ioctlParams.HObjectNew, ioctlParams.HClass, newRmAllocObject(fi.fd, ioctlParams, rightsRequested, allocParams), ioctlParams.HObjectParent, allocParams.HVASpace) // Note: When the channel group's engine type is GR, which is always // true unless MIG is enabled, kchangrpapiConstruct_IMPL() constructs a // KERNEL_GRAPHICS_CONTEXT whose lifetime is the same as the channel // group's (the graphics context is freed when the channel group is). // Channels, context shares, and graphics objects depend on this // graphics context rather than the channel group. Consequently, if MIG // is enabled, these might not depend on the channel group at all. // Since nvproxy currently does not support MIG, we represent these // dependencies as unconditionally on the channel group instead. 
}) } func rmAllocChannel(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, isNVOS64 bool) (uintptr, error) { return rmAllocSimpleParams(fi, ioctlParams, isNVOS64, func(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, rightsRequested nvgpu.RS_ACCESS_MASK, allocParams *nvgpu.NV_CHANNEL_ALLOC_PARAMS) { // See // src/nvidia/src/kernel/gpu/fifo/kernel_channel.c:kchannelConstruct_IMPL() // => refAddDependant(). The channel's parent may be a device or // channel group; if it is a channel group then the channel depends on // it via the parent relationship, and if it is not a channel group // then kchannelConstruct_IMPL() constructs one internally and frees it // when the channel is destroyed, so either way no separate dependency // is required. fi.fd.dev.nvp.objAdd(fi.ctx, ioctlParams.HRoot, ioctlParams.HObjectNew, ioctlParams.HClass, newRmAllocObject(fi.fd, ioctlParams, rightsRequested, allocParams), ioctlParams.HObjectParent, allocParams.HVASpace, allocParams.HContextShare) }) } func rmAllocContextShare(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, isNVOS64 bool) (uintptr, error) { return rmAllocSimpleParams(fi, ioctlParams, isNVOS64, func(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, rightsRequested nvgpu.RS_ACCESS_MASK, allocParams *nvgpu.NV_CTXSHARE_ALLOCATION_PARAMETERS) { // See // src/nvidia/src/kernel/gpu/fifo/kernel_ctxshare.c:kctxshareapiConstruct_IMPL() // => refAddDependant(). The context share's parent is the channel // group, so (given that we are representing graphics context // dependencies as channel group dependencies) no separate dependency // is required. fi.fd.dev.nvp.objAdd(fi.ctx, ioctlParams.HRoot, ioctlParams.HObjectNew, ioctlParams.HClass, newRmAllocObject(fi.fd, ioctlParams, rightsRequested, allocParams), ioctlParams.HObjectParent, allocParams.HVASpace) }) } func rmVidHeapControl(fi *frontendIoctlState) (uintptr, error) { var ioctlParams nvgpu.NVOS32Parameters if fi.ioctlParamsSize != nvgpu.SizeofNVOS32Parameters { return 0, linuxerr.EINVAL } if _, err := ioctlParams.CopyIn(fi.t, fi.ioctlParamsAddr); err != nil { return 0, err } // Function determines the type of Data. if fi.ctx.IsLogging(log.Debug) { fi.ctx.Debugf("nvproxy: VID_HEAP_CONTROL function %d", ioctlParams.Function) } // See // src/nvidia/interface/deprecated/rmapi_deprecated_vidheapctrl.c:rmVidHeapControlTable // for implementation. 
switch ioctlParams.Function { case nvgpu.NVOS32_FUNCTION_ALLOC_SIZE: return rmVidHeapControlAllocSize(fi, &ioctlParams) default: fi.ctx.Warningf("nvproxy: unknown VID_HEAP_CONTROL function %d", ioctlParams.Function) return 0, linuxerr.EINVAL } } func rmMapMemory(fi *frontendIoctlState) (uintptr, error) { var ioctlParams nvgpu.IoctlNVOS33ParametersWithFD if fi.ioctlParamsSize != nvgpu.SizeofIoctlNVOS33ParametersWithFD { return 0, linuxerr.EINVAL } if _, err := ioctlParams.CopyIn(fi.t, fi.ioctlParamsAddr); err != nil { return 0, err } mapFileGeneric, _ := fi.t.FDTable().Get(ioctlParams.FD) if mapFileGeneric == nil { return 0, linuxerr.EINVAL } defer mapFileGeneric.DecRef(fi.ctx) mapFile, ok := mapFileGeneric.Impl().(*frontendFD) if !ok { return 0, linuxerr.EINVAL } if mapFile.haveMmapContext.Load() || !mapFile.haveMmapContext.CompareAndSwap(false, true) { fi.ctx.Warningf("nvproxy: attempted to reuse FD %d for NV_ESC_RM_MAP_MEMORY", ioctlParams.FD) return 0, linuxerr.EINVAL } origFD := ioctlParams.FD ioctlParams.FD = mapFile.hostFD n, err := frontendIoctlInvoke(fi, &ioctlParams) if err != nil { return n, err } ioctlParams.FD = origFD if _, err := ioctlParams.CopyOut(fi.t, fi.ioctlParamsAddr); err != nil { return n, err } return n, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/frontend_mmap.go000066400000000000000000000056371465435605700273120ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvproxy import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *frontendFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) } // AddMapping implements memmap.Mappable.AddMapping. func (fd *frontendFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (fd *frontendFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. func (fd *frontendFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return nil } // Translate implements memmap.Mappable.Translate. func (fd *frontendFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { return []memmap.Translation{ { Source: optional, File: &fd.memmapFile, Offset: optional.Start, Perms: at, }, }, nil } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. 
func (fd *frontendFD) InvalidateUnsavable(ctx context.Context) error { return nil } // +stateify savable type frontendFDMemmapFile struct { memmap.NoBufferedIOFallback fd *frontendFD } // IncRef implements memmap.File.IncRef. func (mf *frontendFDMemmapFile) IncRef(fr memmap.FileRange, memCgID uint32) { } // DecRef implements memmap.File.DecRef. func (mf *frontendFDMemmapFile) DecRef(fr memmap.FileRange) { } // MapInternal implements memmap.File.MapInternal. func (mf *frontendFDMemmapFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { // FIXME(jamieliu): determine if this is safe log.Traceback("nvproxy: rejecting frontendFDMemmapFile.MapInternal") return safemem.BlockSeq{}, linuxerr.EINVAL } // FD implements memmap.File.FD. func (mf *frontendFDMemmapFile) FD() int { return int(mf.fd.hostFD) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/frontend_unsafe.go000066400000000000000000000231711465435605700276320ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvproxy import ( "runtime" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/nvgpu" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" ) func frontendIoctlInvoke[Params any](fi *frontendIoctlState, sentryParams *Params) (uintptr, error) { n, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(fi.fd.hostFD), frontendIoctlCmd(fi.nr, fi.ioctlParamsSize), uintptr(unsafe.Pointer(sentryParams))) if errno != 0 { return n, errno } return n, nil } func rmControlInvoke[Params any](fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters, ctrlParams *Params) (uintptr, error) { defer runtime.KeepAlive(ctrlParams) // since we convert to non-pointer-typed P64 origParams := ioctlParams.Params ioctlParams.Params = p64FromPtr(unsafe.Pointer(ctrlParams)) n, err := frontendIoctlInvoke(fi, ioctlParams) ioctlParams.Params = origParams if err != nil { return n, err } if _, err := ioctlParams.CopyOut(fi.t, fi.ioctlParamsAddr); err != nil { return n, err } return n, nil } func ctrlClientSystemGetBuildVersionInvoke(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters, ctrlParams *nvgpu.NV0000_CTRL_SYSTEM_GET_BUILD_VERSION_PARAMS, driverVersionBuf, versionBuf, titleBuf *byte) (uintptr, error) { // *Buf arguments don't need runtime.KeepAlive() since our caller // ctrlClientSystemGetBuildVersion() copies them out, keeping them alive // during this function. 
origPDriverVersionBuffer := ctrlParams.PDriverVersionBuffer origPVersionBuffer := ctrlParams.PVersionBuffer origPTitleBuffer := ctrlParams.PTitleBuffer ctrlParams.PDriverVersionBuffer = p64FromPtr(unsafe.Pointer(driverVersionBuf)) ctrlParams.PVersionBuffer = p64FromPtr(unsafe.Pointer(versionBuf)) ctrlParams.PTitleBuffer = p64FromPtr(unsafe.Pointer(titleBuf)) n, err := rmControlInvoke(fi, ioctlParams, ctrlParams) ctrlParams.PDriverVersionBuffer = origPDriverVersionBuffer ctrlParams.PVersionBuffer = origPVersionBuffer ctrlParams.PTitleBuffer = origPTitleBuffer if err != nil { return n, err } if _, err := ctrlParams.CopyOut(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return n, err } return n, nil } func ctrlIoctlHasInfoList[Params any, PtrParams hasCtrlInfoListPtr[Params]](fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters) (uintptr, error) { var ctrlParamsValue Params ctrlParams := PtrParams(&ctrlParamsValue) if ctrlParams.SizeBytes() != int(ioctlParams.ParamsSize) { return 0, linuxerr.EINVAL } if _, err := ctrlParams.CopyIn(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return 0, err } var infoList []byte if listSize := ctrlParams.ListSize(); listSize > 0 { infoList = make([]byte, listSize*nvgpu.CtrlXxxInfoSize) if _, err := fi.t.CopyInBytes(addrFromP64(ctrlParams.CtrlInfoList()), infoList); err != nil { return 0, err } } origInfoList := ctrlParams.CtrlInfoList() if infoList == nil { ctrlParams.SetCtrlInfoList(p64FromPtr(unsafe.Pointer(nil))) } else { ctrlParams.SetCtrlInfoList(p64FromPtr(unsafe.Pointer(&infoList[0]))) } n, err := rmControlInvoke(fi, ioctlParams, ctrlParams) ctrlParams.SetCtrlInfoList(origInfoList) if err != nil { return n, err } if infoList != nil { if _, err := fi.t.CopyOutBytes(addrFromP64(origInfoList), infoList); err != nil { return n, err } } if _, err := ctrlParams.CopyOut(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return n, err } return n, nil } func ctrlDevGpuGetClasslistInvoke(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters, ctrlParams *nvgpu.NV0080_CTRL_GPU_GET_CLASSLIST_PARAMS, classList []uint32) (uintptr, error) { origClassList := ctrlParams.ClassList ctrlParams.ClassList = p64FromPtr(unsafe.Pointer(&classList[0])) n, err := rmControlInvoke(fi, ioctlParams, ctrlParams) ctrlParams.ClassList = origClassList if err != nil { return n, err } if _, err := primitive.CopyUint32SliceOut(fi.t, addrFromP64(origClassList), classList); err != nil { return 0, err } if _, err := ctrlParams.CopyOut(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return n, err } return n, nil } func ctrlDevFIFOGetChannelList(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters) (uintptr, error) { var ctrlParams nvgpu.NV0080_CTRL_FIFO_GET_CHANNELLIST_PARAMS if ctrlParams.SizeBytes() != int(ioctlParams.ParamsSize) { return 0, linuxerr.EINVAL } if _, err := ctrlParams.CopyIn(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return 0, err } if ctrlParams.NumChannels == 0 { // Compare // src/nvidia/src/kernel/gpu/fifo/kernel_fifo_ctrl.c:deviceCtrlCmdFifoGetChannelList_IMPL(). 
return 0, linuxerr.EINVAL } channelHandleList := make([]uint32, ctrlParams.NumChannels) if _, err := primitive.CopyUint32SliceIn(fi.t, addrFromP64(ctrlParams.PChannelHandleList), channelHandleList); err != nil { return 0, err } channelList := make([]uint32, ctrlParams.NumChannels) if _, err := primitive.CopyUint32SliceIn(fi.t, addrFromP64(ctrlParams.PChannelList), channelList); err != nil { return 0, err } origPChannelHandleList := ctrlParams.PChannelHandleList origPChannelList := ctrlParams.PChannelList ctrlParams.PChannelHandleList = p64FromPtr(unsafe.Pointer(&channelHandleList[0])) ctrlParams.PChannelList = p64FromPtr(unsafe.Pointer(&channelList[0])) n, err := rmControlInvoke(fi, ioctlParams, &ctrlParams) ctrlParams.PChannelHandleList = origPChannelHandleList ctrlParams.PChannelList = origPChannelList if err != nil { return n, err } if _, err := primitive.CopyUint32SliceOut(fi.t, addrFromP64(origPChannelHandleList), channelHandleList); err != nil { return 0, err } if _, err := primitive.CopyUint32SliceOut(fi.t, addrFromP64(origPChannelList), channelList); err != nil { return 0, err } if _, err := ctrlParams.CopyOut(fi.t, addrFromP64(ioctlParams.Params)); err != nil { return n, err } return n, nil } func rmAllocInvoke[Params any](fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, allocParams *Params, isNVOS64 bool, addObjLocked func(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, rightsRequested nvgpu.RS_ACCESS_MASK, allocParams *Params)) (uintptr, error) { defer runtime.KeepAlive(allocParams) // since we convert to non-pointer-typed P64 // Temporarily replace application pointers with sentry pointers. origPAllocParms := ioctlParams.PAllocParms origPRightsRequested := ioctlParams.PRightsRequested var rightsRequested nvgpu.RS_ACCESS_MASK if ioctlParams.PRightsRequested != 0 { if _, err := rightsRequested.CopyIn(fi.t, addrFromP64(ioctlParams.PRightsRequested)); err != nil { return 0, err } ioctlParams.PRightsRequested = p64FromPtr(unsafe.Pointer(&rightsRequested)) } ioctlParams.PAllocParms = p64FromPtr(unsafe.Pointer(allocParams)) // Invoke the driver ioctl and restore application pointers. We always pass // NVOS64Parameters to the driver even if !isNVOS64, as this is handled // identically to the equivalent NVOS21Parameters; compare // src/nvidia/src/kernel/rmapi/entry_points.c:_nv04AllocWithSecInfo() and // _nv04AllocWithAccessSecInfo(). fi.fd.dev.nvp.objsLock() n, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(fi.fd.hostFD), frontendIoctlCmd(nvgpu.NV_ESC_RM_ALLOC, nvgpu.SizeofNVOS64Parameters), uintptr(unsafe.Pointer(ioctlParams))) if errno == 0 && ioctlParams.Status == nvgpu.NV_OK { addObjLocked(fi, ioctlParams, rightsRequested, allocParams) } fi.fd.dev.nvp.objsUnlock() ioctlParams.PAllocParms = origPAllocParms ioctlParams.PRightsRequested = origPRightsRequested if errno != 0 { return n, errno } // Copy updated params out to the application. 
outIoctlParams := nvgpu.GetRmAllocParamObj(isNVOS64) outIoctlParams.FromOS64(*ioctlParams) if ioctlParams.PRightsRequested != 0 { if _, err := rightsRequested.CopyOut(fi.t, addrFromP64(ioctlParams.PRightsRequested)); err != nil { return n, err } } if _, err := outIoctlParams.CopyOut(fi.t, fi.ioctlParamsAddr); err != nil { return n, err } return n, nil } func rmVidHeapControlAllocSize(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS32Parameters) (uintptr, error) { allocSizeParams := (*nvgpu.NVOS32AllocSize)(unsafe.Pointer(&ioctlParams.Data)) origAddress := allocSizeParams.Address var addr uint64 if allocSizeParams.Address != 0 { if _, err := primitive.CopyUint64In(fi.t, addrFromP64(allocSizeParams.Address), &addr); err != nil { return 0, err } allocSizeParams.Address = p64FromPtr(unsafe.Pointer(&addr)) } fi.fd.dev.nvp.objsLock() n, err := frontendIoctlInvoke(fi, ioctlParams) if err == nil && ioctlParams.Status == nvgpu.NV_OK { // src/nvidia/src/kernel/mem_mgr/virtual_mem.c:virtmemConstruct_IMPL() => refAddDependant() fi.fd.dev.nvp.objAdd(fi.ctx, ioctlParams.HRoot, allocSizeParams.HMemory, nvgpu.NV50_MEMORY_VIRTUAL, &virtMem{}, ioctlParams.HObjectParent, ioctlParams.HVASpace) } fi.fd.dev.nvp.objsUnlock() allocSizeParams.Address = origAddress if err != nil { return n, err } if allocSizeParams.Address != 0 { if _, err := primitive.CopyUint64Out(fi.t, addrFromP64(allocSizeParams.Address), addr); err != nil { return n, err } } if _, err := ioctlParams.CopyOut(fi.t, fi.ioctlParamsAddr); err != nil { return n, err } return n, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/nvproxy.go000066400000000000000000000065341465435605700262030ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package nvproxy implements proxying for the Nvidia GPU Linux kernel driver: // https://github.com/NVIDIA/open-gpu-kernel-modules. // // Supported Nvidia GPUs: T4, L4, A100, A10G and H100. package nvproxy import ( "fmt" "gvisor.dev/gvisor/pkg/abi/nvgpu" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Register registers all devices implemented by this package in vfsObj. func Register(vfsObj *vfs.VirtualFilesystem, versionStr string, uvmDevMajor uint32) error { // The kernel driver's interface is unstable, so only allow versions of the // driver that are known to be supported. 
log.Infof("NVIDIA driver version: %s", versionStr) version, err := DriverVersionFrom(versionStr) if err != nil { return fmt.Errorf("failed to parse Nvidia driver version %s: %w", versionStr, err) } abiCons, ok := abis[version] if !ok { return fmt.Errorf("unsupported Nvidia driver version: %s", versionStr) } nvp := &nvproxy{ abi: abiCons.cons(), version: version, frontendFDs: make(map[*frontendFD]struct{}), clients: make(map[nvgpu.Handle]*rootClient), objsFreeSet: make(map[*object]struct{}), } for minor := uint32(0); minor <= nvgpu.NV_CONTROL_DEVICE_MINOR; minor++ { if err := vfsObj.RegisterDevice(vfs.CharDevice, nvgpu.NV_MAJOR_DEVICE_NUMBER, minor, &frontendDevice{ nvp: nvp, minor: minor, }, &vfs.RegisterDeviceOptions{ GroupName: "nvidia-frontend", }); err != nil { return err } } if err := vfsObj.RegisterDevice(vfs.CharDevice, uvmDevMajor, nvgpu.NVIDIA_UVM_PRIMARY_MINOR_NUMBER, &uvmDevice{ nvp: nvp, }, &vfs.RegisterDeviceOptions{ GroupName: "nvidia-uvm", }); err != nil { return err } return nil } // +stateify savable type nvproxy struct { abi *driverABI `state:"nosave"` version DriverVersion fdsMu fdsMutex `state:"nosave"` frontendFDs map[*frontendFD]struct{} // See object.go. // Users should call nvproxy.objsLock/Unlock() rather than locking objsMu // directly. objsMu objsMutex `state:"nosave"` // These fields are protected by objsMu. clients map[nvgpu.Handle]*rootClient objsCleanup []func() `state:"nosave"` objsFreeList objectFreeList `state:"nosave"` objsFreeSet map[*object]struct{} `state:"nosave"` } type marshalPtr[T any] interface { *T marshal.Marshallable } func addrFromP64(p nvgpu.P64) hostarch.Addr { return hostarch.Addr(uintptr(uint64(p))) } type hasFrontendFDPtr[T any] interface { marshalPtr[T] nvgpu.HasFrontendFD } type hasCtrlInfoListPtr[T any] interface { marshalPtr[T] nvgpu.HasCtrlInfoList } // NvidiaDeviceFD is an interface that should be implemented by all // vfs.FileDescriptionImpl of Nvidia devices. type NvidiaDeviceFD interface { IsNvidiaDeviceFD() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/nvproxy_impl_state_autogen.go000066400000000000000000000001351465435605700321350ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package nvproxy golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/nvproxy_state_autogen.go000066400000000000000000000306061465435605700311220ustar00rootroot00000000000000// automatically generated by stateify. 
package nvproxy import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (dev *frontendDevice) StateTypeName() string { return "pkg/sentry/devices/nvproxy.frontendDevice" } func (dev *frontendDevice) StateFields() []string { return []string{ "nvp", "minor", } } func (dev *frontendDevice) beforeSave() {} // +checklocksignore func (dev *frontendDevice) StateSave(stateSinkObject state.Sink) { dev.beforeSave() stateSinkObject.Save(0, &dev.nvp) stateSinkObject.Save(1, &dev.minor) } func (dev *frontendDevice) afterLoad(context.Context) {} // +checklocksignore func (dev *frontendDevice) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &dev.nvp) stateSourceObject.Load(1, &dev.minor) } func (fd *frontendFD) StateTypeName() string { return "pkg/sentry/devices/nvproxy.frontendFD" } func (fd *frontendFD) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", "dev", "containerName", "hostFD", "memmapFile", "internalQueue", "internalEntry", "cachedEvents", "appQueue", "clients", } } // +checklocksignore func (fd *frontendFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &fd.NoLockFD) stateSinkObject.Save(4, &fd.dev) stateSinkObject.Save(5, &fd.containerName) stateSinkObject.Save(6, &fd.hostFD) stateSinkObject.Save(7, &fd.memmapFile) stateSinkObject.Save(8, &fd.internalQueue) stateSinkObject.Save(9, &fd.internalEntry) stateSinkObject.Save(10, &fd.cachedEvents) stateSinkObject.Save(11, &fd.appQueue) stateSinkObject.Save(12, &fd.clients) } // +checklocksignore func (fd *frontendFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &fd.NoLockFD) stateSourceObject.Load(4, &fd.dev) stateSourceObject.Load(5, &fd.containerName) stateSourceObject.Load(6, &fd.hostFD) stateSourceObject.Load(7, &fd.memmapFile) stateSourceObject.Load(8, &fd.internalQueue) stateSourceObject.Load(9, &fd.internalEntry) stateSourceObject.Load(10, &fd.cachedEvents) stateSourceObject.Load(11, &fd.appQueue) stateSourceObject.Load(12, &fd.clients) stateSourceObject.AfterLoad(func() { fd.afterLoad(ctx) }) } func (mf *frontendFDMemmapFile) StateTypeName() string { return "pkg/sentry/devices/nvproxy.frontendFDMemmapFile" } func (mf *frontendFDMemmapFile) StateFields() []string { return []string{ "NoBufferedIOFallback", "fd", } } func (mf *frontendFDMemmapFile) beforeSave() {} // +checklocksignore func (mf *frontendFDMemmapFile) StateSave(stateSinkObject state.Sink) { mf.beforeSave() stateSinkObject.Save(0, &mf.NoBufferedIOFallback) stateSinkObject.Save(1, &mf.fd) } func (mf *frontendFDMemmapFile) afterLoad(context.Context) {} // +checklocksignore func (mf *frontendFDMemmapFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mf.NoBufferedIOFallback) stateSourceObject.Load(1, &mf.fd) } func (nvp *nvproxy) StateTypeName() string { return "pkg/sentry/devices/nvproxy.nvproxy" } func (nvp *nvproxy) StateFields() []string { return []string{ "version", "frontendFDs", "clients", } } // +checklocksignore func (nvp *nvproxy) StateSave(stateSinkObject state.Sink) { nvp.beforeSave() stateSinkObject.Save(0, 
&nvp.version) stateSinkObject.Save(1, &nvp.frontendFDs) stateSinkObject.Save(2, &nvp.clients) } // +checklocksignore func (nvp *nvproxy) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &nvp.version) stateSourceObject.Load(1, &nvp.frontendFDs) stateSourceObject.Load(2, &nvp.clients) stateSourceObject.AfterLoad(func() { nvp.afterLoad(ctx) }) } func (o *object) StateTypeName() string { return "pkg/sentry/devices/nvproxy.object" } func (o *object) StateFields() []string { return []string{ "nvp", "client", "class", "handle", "impl", "deps", "rdeps", "objectFreeEntry", } } func (o *object) beforeSave() {} // +checklocksignore func (o *object) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.nvp) stateSinkObject.Save(1, &o.client) stateSinkObject.Save(2, &o.class) stateSinkObject.Save(3, &o.handle) stateSinkObject.Save(4, &o.impl) stateSinkObject.Save(5, &o.deps) stateSinkObject.Save(6, &o.rdeps) stateSinkObject.Save(7, &o.objectFreeEntry) } func (o *object) afterLoad(context.Context) {} // +checklocksignore func (o *object) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.nvp) stateSourceObject.Load(1, &o.client) stateSourceObject.Load(2, &o.class) stateSourceObject.Load(3, &o.handle) stateSourceObject.Load(4, &o.impl) stateSourceObject.Load(5, &o.deps) stateSourceObject.Load(6, &o.rdeps) stateSourceObject.Load(7, &o.objectFreeEntry) } func (c *capturedRmAllocParams) StateTypeName() string { return "pkg/sentry/devices/nvproxy.capturedRmAllocParams" } func (c *capturedRmAllocParams) StateFields() []string { return []string{ "fd", "ioctlParams", "rightsRequested", "allocParams", } } func (c *capturedRmAllocParams) beforeSave() {} // +checklocksignore func (c *capturedRmAllocParams) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.fd) stateSinkObject.Save(1, &c.ioctlParams) stateSinkObject.Save(2, &c.rightsRequested) stateSinkObject.Save(3, &c.allocParams) } func (c *capturedRmAllocParams) afterLoad(context.Context) {} // +checklocksignore func (c *capturedRmAllocParams) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.fd) stateSourceObject.Load(1, &c.ioctlParams) stateSourceObject.Load(2, &c.rightsRequested) stateSourceObject.Load(3, &c.allocParams) } func (o *rmAllocObject) StateTypeName() string { return "pkg/sentry/devices/nvproxy.rmAllocObject" } func (o *rmAllocObject) StateFields() []string { return []string{ "object", "params", } } func (o *rmAllocObject) beforeSave() {} // +checklocksignore func (o *rmAllocObject) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.object) stateSinkObject.Save(1, &o.params) } func (o *rmAllocObject) afterLoad(context.Context) {} // +checklocksignore func (o *rmAllocObject) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.object) stateSourceObject.Load(1, &o.params) } func (o *rootClient) StateTypeName() string { return "pkg/sentry/devices/nvproxy.rootClient" } func (o *rootClient) StateFields() []string { return []string{ "object", "resources", "params", } } func (o *rootClient) beforeSave() {} // +checklocksignore func (o *rootClient) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.object) stateSinkObject.Save(1, &o.resources) stateSinkObject.Save(2, &o.params) } func (o *rootClient) afterLoad(context.Context) {} // +checklocksignore func (o 
*rootClient) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.object) stateSourceObject.Load(1, &o.resources) stateSourceObject.Load(2, &o.params) } func (l *objectFreeList) StateTypeName() string { return "pkg/sentry/devices/nvproxy.objectFreeList" } func (l *objectFreeList) StateFields() []string { return []string{ "head", "tail", } } func (l *objectFreeList) beforeSave() {} // +checklocksignore func (l *objectFreeList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *objectFreeList) afterLoad(context.Context) {} // +checklocksignore func (l *objectFreeList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *objectFreeEntry) StateTypeName() string { return "pkg/sentry/devices/nvproxy.objectFreeEntry" } func (e *objectFreeEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *objectFreeEntry) beforeSave() {} // +checklocksignore func (e *objectFreeEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *objectFreeEntry) afterLoad(context.Context) {} // +checklocksignore func (e *objectFreeEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (dev *uvmDevice) StateTypeName() string { return "pkg/sentry/devices/nvproxy.uvmDevice" } func (dev *uvmDevice) StateFields() []string { return []string{ "nvp", } } func (dev *uvmDevice) beforeSave() {} // +checklocksignore func (dev *uvmDevice) StateSave(stateSinkObject state.Sink) { dev.beforeSave() stateSinkObject.Save(0, &dev.nvp) } func (dev *uvmDevice) afterLoad(context.Context) {} // +checklocksignore func (dev *uvmDevice) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &dev.nvp) } func (fd *uvmFD) StateTypeName() string { return "pkg/sentry/devices/nvproxy.uvmFD" } func (fd *uvmFD) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", "dev", "containerName", "hostFD", "memmapFile", "queue", } } // +checklocksignore func (fd *uvmFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &fd.NoLockFD) stateSinkObject.Save(4, &fd.dev) stateSinkObject.Save(5, &fd.containerName) stateSinkObject.Save(6, &fd.hostFD) stateSinkObject.Save(7, &fd.memmapFile) stateSinkObject.Save(8, &fd.queue) } // +checklocksignore func (fd *uvmFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &fd.NoLockFD) stateSourceObject.Load(4, &fd.dev) stateSourceObject.Load(5, &fd.containerName) stateSourceObject.Load(6, &fd.hostFD) stateSourceObject.Load(7, &fd.memmapFile) stateSourceObject.Load(8, &fd.queue) stateSourceObject.AfterLoad(func() { fd.afterLoad(ctx) }) } func (mf *uvmFDMemmapFile) StateTypeName() string { return "pkg/sentry/devices/nvproxy.uvmFDMemmapFile" } func (mf *uvmFDMemmapFile) StateFields() []string { return []string{ "fd", } } func (mf 
*uvmFDMemmapFile) beforeSave() {} // +checklocksignore func (mf *uvmFDMemmapFile) StateSave(stateSinkObject state.Sink) { mf.beforeSave() stateSinkObject.Save(0, &mf.fd) } func (mf *uvmFDMemmapFile) afterLoad(context.Context) {} // +checklocksignore func (mf *uvmFDMemmapFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mf.fd) } func (v *DriverVersion) StateTypeName() string { return "pkg/sentry/devices/nvproxy.DriverVersion" } func (v *DriverVersion) StateFields() []string { return []string{ "major", "minor", "patch", } } func (v *DriverVersion) beforeSave() {} // +checklocksignore func (v *DriverVersion) StateSave(stateSinkObject state.Sink) { v.beforeSave() stateSinkObject.Save(0, &v.major) stateSinkObject.Save(1, &v.minor) stateSinkObject.Save(2, &v.patch) } func (v *DriverVersion) afterLoad(context.Context) {} // +checklocksignore func (v *DriverVersion) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &v.major) stateSourceObject.Load(1, &v.minor) stateSourceObject.Load(2, &v.patch) } func init() { state.Register((*frontendDevice)(nil)) state.Register((*frontendFD)(nil)) state.Register((*frontendFDMemmapFile)(nil)) state.Register((*nvproxy)(nil)) state.Register((*object)(nil)) state.Register((*capturedRmAllocParams)(nil)) state.Register((*rmAllocObject)(nil)) state.Register((*rootClient)(nil)) state.Register((*objectFreeList)(nil)) state.Register((*objectFreeEntry)(nil)) state.Register((*uvmDevice)(nil)) state.Register((*uvmFD)(nil)) state.Register((*uvmFDMemmapFile)(nil)) state.Register((*DriverVersion)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/nvproxy_unsafe.go000066400000000000000000000060441465435605700275400ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvproxy import ( "bytes" "fmt" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/nvgpu" ) // HostDriverVersion returns the version of the host Nvidia driver. func HostDriverVersion() (string, error) { ctlFD, err := unix.Openat(-1, "/dev/nvidiactl", unix.O_RDONLY|unix.O_NOFOLLOW, 0) if err != nil { return "", fmt.Errorf("failed to open /dev/nvidiactl: %w", err) } defer unix.Close(ctlFD) // From src/nvidia/arch/nvalloc/unix/include/nv-ioctl.h: const NV_RM_API_VERSION_REPLY_RECOGNIZED = 1 // 530.30.02 and later versions of the host driver `#define // NV_RM_API_VERSION_CMD_QUERY '2'`, which causes this ioctl to return the // driver version without performing a check. Earlier versions of the // driver `#define NV_RM_API_VERSION_CMD_OVERRIDE '2'`, which causes the // ioctl to no-op. Try with Cmd '2' first, hoping that the driver // interprets it as _QUERY; if the returned string is empty, then it was // interpreted as _OVERRIDE and we need to perform an actual check (Cmd 0), // which has the downside of logging an error message. 
ioctlParams := nvgpu.RMAPIVersion{ Cmd: '2', } if _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(ctlFD), frontendIoctlCmd(nvgpu.NV_ESC_CHECK_VERSION_STR, uint32(unsafe.Sizeof(ioctlParams))), uintptr(unsafe.Pointer(&ioctlParams))); errno != 0 { return "", fmt.Errorf("NV_ESC_CHECK_VERSION_STR ioctl error: %w", errno) } if ioctlParams.Reply != NV_RM_API_VERSION_REPLY_RECOGNIZED { return "", fmt.Errorf("unknown NV_ESC_CHECK_VERSION_STR reply: %d", ioctlParams.Reply) } if ioctlParams.VersionString[0] == '\x00' { ioctlParams.Cmd = 0 ioctlParams.Reply = 0 // We expect the check to fail on our empty version string, so tolerate // EINVAL. if _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(ctlFD), frontendIoctlCmd(nvgpu.NV_ESC_CHECK_VERSION_STR, uint32(unsafe.Sizeof(ioctlParams))), uintptr(unsafe.Pointer(&ioctlParams))); errno != 0 && errno != unix.EINVAL { return "", fmt.Errorf("fallback NV_ESC_CHECK_VERSION_STR ioctl error: %w", errno) } if ioctlParams.Reply != NV_RM_API_VERSION_REPLY_RECOGNIZED { return "", fmt.Errorf("unknown fallback NV_ESC_CHECK_VERSION_STR reply: %d", ioctlParams.Reply) } } if i := bytes.IndexByte(ioctlParams.VersionString[:], '\x00'); i >= 0 { return string(ioctlParams.VersionString[:i]), nil } return string(ioctlParams.VersionString[:]), nil } func p64FromPtr(ptr unsafe.Pointer) nvgpu.P64 { return nvgpu.P64(uint64(uintptr(ptr))) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/nvproxy_unsafe_state_autogen.go000066400000000000000000000000711465435605700324540ustar00rootroot00000000000000// automatically generated by stateify. package nvproxy golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/object.go000066400000000000000000000241321465435605700257160ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvproxy import ( "gvisor.dev/gvisor/pkg/abi/nvgpu" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/mm" ) // object tracks a driver object. // // +stateify savable type object struct { // These fields are initialized by nvproxy.objAdd() and are immutable thereafter. nvp *nvproxy client *rootClient // may be == impl class nvgpu.ClassID handle nvgpu.Handle // in client.resources, and also nvp.clients if impl is rootClient impl objectImpl // The driver tracks parent/child relationships and "arbitrary dependency" // relationships between objects separately; we treat parent/child // relationships as equivalent to other dependencies. These fields are // protected by nvp.objsMu. deps map[*object]struct{} // objects that this object depends on rdeps map[*object]struct{} // objects that depend on this object objectFreeEntry } type objectImpl interface { // Object returns the object embedded in this objectImpl. Object() *object // Release is called when the driver object represented by this objectImpl // is freed. // // Preconditions: nvproxy.objsMu must be locked. 
Release(ctx context.Context) } // Object implements objectImpl.Object. func (o *object) Object() *object { return o } func (nvp *nvproxy) objsLock() { nvp.objsMu.Lock() } func (nvp *nvproxy) objsUnlock() { cleanup := nvp.objsCleanup nvp.objsCleanup = nil nvp.objsMu.Unlock() for _, f := range cleanup { f() } } // objAdd records the allocation of a driver object with class c and handle h, // in the client with handle clientH, represented by oi. Each non-zero handle // in deps is a dependency of the created object, such that the freeing of any // of those objects also results in the freeing of the recorded object. func (nvp *nvproxy) objAdd(ctx context.Context, clientH, h nvgpu.Handle, c nvgpu.ClassID, oi objectImpl, deps ...nvgpu.Handle) { if h.Val == 0 { log.Traceback("nvproxy: new object (class %v) has invalid handle 0", c) return } var client *rootClient // The driver forced NV01_ROOT and NV01_ROOT_NON_PRIV to NV01_ROOT_CLIENT, // so we only need to check for the latter. if c == nvgpu.NV01_ROOT_CLIENT { clientH = h client = oi.(*rootClient) if _, ok := nvp.clients[h]; ok { ctx.Warningf("nvproxy: client handle %v already in use", h) } nvp.clients[h] = client } else { var ok bool client, ok = nvp.clients[clientH] if !ok { log.Traceback("nvproxy: new object %v (class %v) has invalid client handle %v", h, c, clientH) return } } o := oi.Object() o.nvp = nvp o.client = client o.class = c o.handle = h o.impl = oi if _, ok := client.resources[h]; ok { ctx.Warningf("nvproxy: handle %v:%v already in use", clientH, h) } client.resources[h] = o for _, depH := range deps { if depH.Val == 0 /* aka NV01_NULL_OBJECT */ { continue } dep, ok := client.resources[depH] if !ok { log.Traceback("nvproxy: new object %v:%v (class %v) has invalid dependency handle %v", clientH, h, c, depH) continue } nvp.objDep(o, dep) } if ctx.IsLogging(log.Debug) { ctx.Debugf("nvproxy: added object %v:%v (class %v) with dependencies %v", clientH, h, c, deps) } } // objAddDep records a dependency between the existing object with handle h1 on // the existing object with handle h2, such that the freeing of the object with // handle h2 results in the freeing of object h1. Both h1 and h2 are handles in // the client with handle clientH. func (nvp *nvproxy) objAddDep(clientH, h1, h2 nvgpu.Handle) { if h1.Val == 0 || h2.Val == 0 { return } client, ok := nvp.clients[clientH] if !ok { log.Traceback("nvproxy: invalid client handle %v", clientH) return } o1, ok := client.resources[h1] if !ok { log.Traceback("nvproxy: invalid handle %v:%v", clientH, h1) return } o2, ok := client.resources[h2] if !ok { log.Traceback("nvproxy: invalid handle %v:%v", clientH, h2) return } nvp.objDep(o1, o2) } func (nvp *nvproxy) objDep(o1, o2 *object) { if o1.deps == nil { o1.deps = make(map[*object]struct{}) } o1.deps[o2] = struct{}{} if o2.rdeps == nil { o2.rdeps = make(map[*object]struct{}) } o2.rdeps[o1] = struct{}{} } // objFree marks an object and its transitive dependents as freed. // // Compare // src/nvidia/src/libraries/resserv/src/rs_server.c:serverFreeResourceTree(). func (nvp *nvproxy) objFree(ctx context.Context, clientH, h nvgpu.Handle) { // Check for recursive calls to objFree() (via objectImpl.Release()). // serverFreeResourceTree() permits this; we currently don't for // simplicity. 
if !nvp.objsFreeList.Empty() { panic("nvproxy.objFree called with non-empty free list (possible recursion?)") } client, ok := nvp.clients[clientH] if !ok { ctx.Warningf("nvproxy: freeing object handle %v with unknown client handle %v", h, clientH) return } o, ok := client.resources[h] if !ok { // When RS_COMPATABILITY_MODE is defined as true in the driver (as it // is in Linux), the driver permits NV_ESC_RM_FREE on nonexistent // handles as a no-op, and applications do this, so log at level INFO // rather than WARNING. ctx.Infof("nvproxy: freeing object with unknown handle %v:%v", clientH, h) return } nvp.prependFreedLockedRecursive(o) for !nvp.objsFreeList.Empty() { o2 := nvp.objsFreeList.Front() o2.impl.Release(ctx) for o3 := range o2.deps { delete(o3.rdeps, o2) } delete(o2.client.resources, o2.handle) if o2.class == nvgpu.NV01_ROOT_CLIENT { delete(nvp.clients, o2.handle) } nvp.objsFreeList.Remove(o2) delete(nvp.objsFreeSet, o2) if ctx.IsLogging(log.Debug) { ctx.Debugf("nvproxy: freed object %v:%v (class %v)", o2.client.handle, o2.handle, o2.class) } } } func (nvp *nvproxy) prependFreedLockedRecursive(o *object) { if _, ok := nvp.objsFreeSet[o]; ok { // o is already on the free list; move it to the front so that it // remains freed before our caller's o. nvp.objsFreeList.Remove(o) } else { nvp.objsFreeSet[o] = struct{}{} } nvp.objsFreeList.PushFront(o) // In the driver, freeing an object causes its children and dependents to // be freed first; see // src/nvidia/src/libraries/resserv/src/rs_server.c:serverFreeResourceTree() // => clientUpdatePendingFreeList_IMPL(). Replicate this freeing order. for o2 := range o.rdeps { nvp.prependFreedLockedRecursive(o2) } } // enqueueCleanup enqueues a cleanup function that will run after nvp.objsMu is // unlocked. func (nvp *nvproxy) enqueueCleanup(f func()) { nvp.objsCleanup = append(nvp.objsCleanup, f) } // +stateify savable type capturedRmAllocParams struct { fd *frontendFD ioctlParams nvgpu.NVOS64Parameters rightsRequested nvgpu.RS_ACCESS_MASK allocParams []byte } func captureRmAllocParams[Params any](fd *frontendFD, ioctlParams *nvgpu.NVOS64Parameters, rightsRequested nvgpu.RS_ACCESS_MASK, allocParams *Params) capturedRmAllocParams { var allocParamsBuf []byte if allocParams != nil { if allocParamsMarshal, ok := any(allocParams).(marshal.Marshallable); ok { allocParamsBuf = make([]byte, allocParamsMarshal.SizeBytes()) allocParamsMarshal.MarshalBytes(allocParamsBuf) } else { log.Traceback("nvproxy: allocParams %T is not marshalable", allocParams) } } return capturedRmAllocParams{ fd: fd, ioctlParams: *ioctlParams, rightsRequested: rightsRequested, allocParams: allocParamsBuf, } } // rmAllocObject is an objectImpl tracking a driver object allocated by an // invocation of NV_ESC_RM_ALLOC whose class is not represented by a more // specific type. // // +stateify savable type rmAllocObject struct { object params capturedRmAllocParams } func newRmAllocObject[Params any](fd *frontendFD, ioctlParams *nvgpu.NVOS64Parameters, rightsRequested nvgpu.RS_ACCESS_MASK, allocParams *Params) *rmAllocObject { return &rmAllocObject{ params: captureRmAllocParams(fd, ioctlParams, rightsRequested, allocParams), } } // Release implements objectImpl.Release. func (o *rmAllocObject) Release(ctx context.Context) { // no-op } // rootClient is an objectImpl tracking a NV01_ROOT_CLIENT. // // +stateify savable type rootClient struct { object // These fields are protected by nvproxy.objsMu.
resources map[nvgpu.Handle]*object params capturedRmAllocParams } func newRootClient(fd *frontendFD, ioctlParams *nvgpu.NVOS64Parameters, rightsRequested nvgpu.RS_ACCESS_MASK, allocParams *nvgpu.Handle) *rootClient { return &rootClient{ resources: make(map[nvgpu.Handle]*object), params: captureRmAllocParams(fd, ioctlParams, rightsRequested, allocParams), } } // Release implements objectImpl.Release. func (o *rootClient) Release(ctx context.Context) { delete(o.params.fd.clients, o.handle) } // osDescMem is an objectImpl tracking a NV01_MEMORY_SYSTEM_OS_DESCRIPTOR. type osDescMem struct { object pinnedRanges []mm.PinnedRange } // Release implements objectImpl.Release. func (o *osDescMem) Release(ctx context.Context) { // Unpin pages (which takes MM locks) without holding nvproxy locks. o.nvp.enqueueCleanup(func() { mm.Unpin(o.pinnedRanges) if ctx.IsLogging(log.Debug) { total := uint64(0) for _, pr := range o.pinnedRanges { total += uint64(pr.Source.Length()) } ctx.Debugf("nvproxy: unpinned %d bytes for released OS descriptor", total) } }) } // osEvent is an objectImpl tracking a NV01_EVENT_OS_EVENT. type osEvent struct { object } // Release implements objectImpl.Release. func (o *osEvent) Release(ctx context.Context) { // no-op } // virtMem is an objectImpl tracking a NV50_MEMORY_VIRTUAL. type virtMem struct { object } // Release implements objectImpl.Release. func (o *virtMem) Release(ctx context.Context) { // no-op } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/object_free_list.go000066400000000000000000000122451465435605700277540ustar00rootroot00000000000000package nvproxy // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type objectFreeElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (objectFreeElementMapper) linkerFor(elem *object) *object { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type objectFreeList struct { head *object tail *object } // Reset resets list l to the empty state. func (l *objectFreeList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *objectFreeList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *objectFreeList) Front() *object { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *objectFreeList) Back() *object { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *objectFreeList) Len() (count int) { for e := l.Front(); e != nil; e = (objectFreeElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. 
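// A minimal sketch (helper name hypothetical) of the deferred-cleanup pattern
// used by osDescMem.Release above: work that must not run under nvproxy.objsMu,
// such as unpinning memory (which takes MM locks), is queued with
// enqueueCleanup and only runs once objsUnlock has dropped the mutex.
func exampleDeferredUnpin(nvp *nvproxy, ranges []mm.PinnedRange) {
	nvp.objsLock()
	nvp.enqueueCleanup(func() {
		// Runs after objsUnlock releases objsMu, so taking MM locks here
		// cannot invert lock ordering with nvproxy locks.
		mm.Unpin(ranges)
	})
	nvp.objsUnlock()
}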
// //go:nosplit func (l *objectFreeList) PushFront(e *object) { linker := objectFreeElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { objectFreeElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *objectFreeList) PushFrontList(m *objectFreeList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { objectFreeElementMapper{}.linkerFor(l.head).SetPrev(m.tail) objectFreeElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *objectFreeList) PushBack(e *object) { linker := objectFreeElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { objectFreeElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *objectFreeList) PushBackList(m *objectFreeList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { objectFreeElementMapper{}.linkerFor(l.tail).SetNext(m.head) objectFreeElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *objectFreeList) InsertAfter(b, e *object) { bLinker := objectFreeElementMapper{}.linkerFor(b) eLinker := objectFreeElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { objectFreeElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *objectFreeList) InsertBefore(a, e *object) { aLinker := objectFreeElementMapper{}.linkerFor(a) eLinker := objectFreeElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { objectFreeElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *objectFreeList) Remove(e *object) { linker := objectFreeElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { objectFreeElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { objectFreeElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type objectFreeEntry struct { next *object prev *object } // Next returns the entry that follows e in the list. // //go:nosplit func (e *objectFreeEntry) Next() *object { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *objectFreeEntry) Prev() *object { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *objectFreeEntry) SetNext(elem *object) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. 
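// A minimal usage sketch (not part of the generated list template; the helper
// name is hypothetical) showing how the intrusive objectFreeList above is
// typically driven, mirroring the drain loop in nvproxy.objFree: entries are
// pushed to the front and popped from the front until the list is empty.
func exampleDrainFreeList(objs ...*object) {
	var pending objectFreeList // the zero value is an empty, ready-to-use list
	for _, o := range objs {
		pending.PushFront(o) // later pushes end up at the front
	}
	for !pending.Empty() {
		o := pending.Front()
		pending.Remove(o) // unlinks o; its next/prev pointers are cleared
		_ = o             // a real caller would release o here
	}
}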
// //go:nosplit func (e *objectFreeEntry) SetPrev(elem *object) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/objs_mutex.go000066400000000000000000000030731465435605700266300ustar00rootroot00000000000000package nvproxy import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type objsMutex struct { mu sync.Mutex } var objsprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var objslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type objslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *objsMutex) Lock() { locking.AddGLock(objsprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *objsMutex) NestedLock(i objslockNameIndex) { locking.AddGLock(objsprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *objsMutex) Unlock() { locking.DelGLock(objsprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *objsMutex) NestedUnlock(i objslockNameIndex) { locking.DelGLock(objsprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func objsinitLockNames() {} func init() { objsinitLockNames() objsprefixIndex = locking.NewMutexClass(reflect.TypeOf(objsMutex{}), objslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/save_restore.go000066400000000000000000000027361465435605700271570ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvproxy import ( goContext "context" "fmt" ) // beforeSave is invoked by stateify. func (nvp *nvproxy) beforeSave() { nvp.beforeSaveImpl() } // afterLoad is invoked by stateify. func (nvp *nvproxy) afterLoad(ctx goContext.Context) { Init() abiCons, ok := abis[nvp.version] if !ok { panic(fmt.Sprintf("driver version %q not found in abis map", nvp.version)) } nvp.abi = abiCons.cons() nvp.objsFreeSet = make(map[*object]struct{}) nvp.afterLoadImpl(ctx) } // beforeSave is invoked by stateify. func (fd *frontendFD) beforeSave() { fd.beforeSaveImpl() } // afterLoad is invoked by stateify. func (fd *frontendFD) afterLoad(ctx goContext.Context) { fd.afterLoadImpl(ctx) } // beforeSave is invoked by stateify. func (fd *uvmFD) beforeSave() { fd.beforeSaveImpl() } // afterLoad is invoked by stateify. 
func (fd *uvmFD) afterLoad(ctx goContext.Context) { fd.afterLoadImpl(ctx) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/save_restore_impl.go000066400000000000000000000024031465435605700301670ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package nvproxy import ( goContext "context" ) func (nvp *nvproxy) beforeSaveImpl() { nvp.objsLock() defer nvp.objsUnlock() if len(nvp.clients) != 0 { panic("can't save with live nvproxy clients") } } func (nvp *nvproxy) afterLoadImpl(goContext.Context) { // no-op } func (fd *frontendFD) beforeSaveImpl() { panic("nvproxy.frontendFD is not saveable") } func (fd *frontendFD) afterLoadImpl(goContext.Context) { panic("nvproxy.frontendFD is not restorable") } func (fd *uvmFD) beforeSaveImpl() { panic("nvproxy.uvmFD is not saveable") } func (fd *uvmFD) afterLoadImpl(goContext.Context) { panic("nvproxy.uvmFD is not restorable") } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/seccomp_filters.go000066400000000000000000000153471465435605700276410ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvproxy import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/abi/nvgpu" "gvisor.dev/gvisor/pkg/seccomp" ) // Filters returns seccomp-bpf filters for this package. 
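// An illustrative sketch (function name hypothetical; it reuses the same
// linux.IOC_* constants and frontendIoctlCmd helper that Filters below relies
// on) of why MaskedEqual with notIocSizeMask admits NV_ESC_CARD_INFO at any
// parameter size: clearing the IOC_SIZE bit-field makes every size encode to
// the same masked command value.
func exampleCardInfoMaskMatches(paramsSize uint32) bool {
	notIocSizeMask := ^(((uintptr(1) << linux.IOC_SIZEBITS) - 1) << linux.IOC_SIZESHIFT)
	want := frontendIoctlCmd(nvgpu.NV_ESC_CARD_INFO, 0) & notIocSizeMask
	got := frontendIoctlCmd(nvgpu.NV_ESC_CARD_INFO, paramsSize) & notIocSizeMask
	return got == want // true for any paramsSize
}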
func Filters() seccomp.SyscallRules { notIocSizeMask := ^(((uintptr(1) << linux.IOC_SIZEBITS) - 1) << linux.IOC_SIZESHIFT) // for ioctls taking arbitrary size return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_IOCTL: seccomp.Or{ seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.MaskedEqual(notIocSizeMask, frontendIoctlCmd(nvgpu.NV_ESC_CARD_INFO, 0)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_CHECK_VERSION_STR, nvgpu.SizeofRMAPIVersion)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.MaskedEqual(notIocSizeMask, frontendIoctlCmd(nvgpu.NV_ESC_ATTACH_GPUS_TO_FD, 0)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_REGISTER_FD, nvgpu.SizeofIoctlRegisterFD)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_ALLOC_OS_EVENT, nvgpu.SizeofIoctlAllocOSEvent)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_FREE_OS_EVENT, nvgpu.SizeofIoctlFreeOSEvent)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_SYS_PARAMS, nvgpu.SizeofIoctlSysParams)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_WAIT_OPEN_COMPLETE, nvgpu.SizeofIoctlWaitOpenComplete)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_RM_ALLOC_MEMORY, nvgpu.SizeofIoctlNVOS02ParametersWithFD)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_RM_FREE, nvgpu.SizeofNVOS00Parameters)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_RM_CONTROL, nvgpu.SizeofNVOS54Parameters)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_RM_ALLOC, nvgpu.SizeofNVOS64Parameters)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_RM_DUP_OBJECT, nvgpu.SizeofNVOS55Parameters)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_RM_SHARE, nvgpu.SizeofNVOS57Parameters)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_RM_VID_HEAP_CONTROL, nvgpu.SizeofNVOS32Parameters)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_RM_MAP_MEMORY, nvgpu.SizeofIoctlNVOS33ParametersWithFD)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_RM_UNMAP_MEMORY, nvgpu.SizeofNVOS34Parameters)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(frontendIoctlCmd(nvgpu.NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO, nvgpu.SizeofNVOS56Parameters)), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_INITIALIZE), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_MM_INITIALIZE), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_DEINITIALIZE), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_CREATE_RANGE_GROUP), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_DESTROY_RANGE_GROUP), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_REGISTER_GPU_VASPACE), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_UNREGISTER_GPU_VASPACE), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_REGISTER_CHANNEL), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_UNREGISTER_CHANNEL), }, 
seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_ENABLE_PEER_ACCESS), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_DISABLE_PEER_ACCESS), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_SET_RANGE_GROUP), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_MAP_EXTERNAL_ALLOCATION), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_FREE), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_REGISTER_GPU), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_UNREGISTER_GPU), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_PAGEABLE_MEM_ACCESS), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_SET_PREFERRED_LOCATION), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_DISABLE_READ_DUPLICATION), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_MIGRATE_RANGE_GROUP), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_TOOLS_READ_PROCESS_MEMORY), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_TOOLS_WRITE_PROCESS_MEMORY), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_MAP_DYNAMIC_PARALLELISM_REGION), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_UNMAP_EXTERNAL), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_ALLOC_SEMAPHORE_POOL), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_VALIDATE_VA_RANGE), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(nvgpu.UVM_CREATE_EXTERNAL_RANGE), }, }, unix.SYS_MREMAP: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(0), /* old_size */ seccomp.AnyValue{}, seccomp.EqualTo(linux.MREMAP_MAYMOVE | linux.MREMAP_FIXED), seccomp.AnyValue{}, seccomp.EqualTo(0), }, }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/uvm.go000066400000000000000000000173401465435605700252620ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvproxy import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/nvgpu" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/devutil" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // uvmDevice implements vfs.Device for /dev/nvidia-uvm. // // +stateify savable type uvmDevice struct { nvp *nvproxy } // Open implements vfs.Device.Open. 
func (dev *uvmDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { devClient := devutil.GoferClientFromContext(ctx) if devClient == nil { log.Warningf("devutil.CtxDevGoferClient is not set") return nil, linuxerr.ENOENT } hostFD, err := devClient.OpenAt(ctx, "nvidia-uvm", opts.Flags) if err != nil { ctx.Warningf("nvproxy: failed to open host /dev/nvidia-uvm: %v", err) return nil, err } fd := &uvmFD{ dev: dev, containerName: devClient.ContainerName(), hostFD: int32(hostFD), } if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { unix.Close(hostFD) return nil, err } if err := fdnotifier.AddFD(int32(hostFD), &fd.queue); err != nil { unix.Close(hostFD) return nil, err } fd.memmapFile.fd = fd return &fd.vfsfd, nil } // uvmFD implements vfs.FileDescriptionImpl for /dev/nvidia-uvm. // // +stateify savable type uvmFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD dev *uvmDevice containerName string hostFD int32 memmapFile uvmFDMemmapFile queue waiter.Queue } // Release implements vfs.FileDescriptionImpl.Release. func (fd *uvmFD) Release(context.Context) { fdnotifier.RemoveFD(fd.hostFD) fd.queue.Notify(waiter.EventHUp) unix.Close(int(fd.hostFD)) } // EventRegister implements waiter.Waitable.EventRegister. func (fd *uvmFD) EventRegister(e *waiter.Entry) error { fd.queue.EventRegister(e) if err := fdnotifier.UpdateFD(fd.hostFD); err != nil { fd.queue.EventUnregister(e) return err } return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (fd *uvmFD) EventUnregister(e *waiter.Entry) { fd.queue.EventUnregister(e) if err := fdnotifier.UpdateFD(fd.hostFD); err != nil { panic(fmt.Sprint("UpdateFD:", err)) } } // Readiness implements waiter.Waitable.Readiness. func (fd *uvmFD) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(fd.hostFD, mask) } // Epollable implements vfs.FileDescriptionImpl.Epollable. func (fd *uvmFD) Epollable() bool { return true } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (fd *uvmFD) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { cmd := args[1].Uint() argPtr := args[2].Pointer() t := kernel.TaskFromContext(ctx) if t == nil { panic("Ioctl should be called from a task context") } if ctx.IsLogging(log.Debug) { ctx.Debugf("nvproxy: uvm ioctl %d = %#x", cmd, cmd) } ui := uvmIoctlState{ fd: fd, ctx: ctx, t: t, cmd: cmd, ioctlParamsAddr: argPtr, } handler := fd.dev.nvp.abi.uvmIoctl[cmd] if handler == nil { ctx.Warningf("nvproxy: unknown uvm ioctl %d = %#x", cmd, cmd) return 0, linuxerr.EINVAL } return handler(&ui) } // IsNvidiaDeviceFD implements NvidiaDeviceFD.IsNvidiaDeviceFD. func (fd *uvmFD) IsNvidiaDeviceFD() {} // uvmIoctlState holds the state of a call to uvmFD.Ioctl(). 
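// A condensed sketch (helper name hypothetical) of the FD-translation pattern
// used by uvmMMInitialize and uvmIoctlHasFrontendFD below: the sentry swaps
// the application-level FD embedded in the ioctl parameters for the backing
// host FD before invoking the host ioctl, then restores the original value so
// the application never observes sentry-internal FD numbers.
func exampleTranslateUvmFD(ui *uvmIoctlState, ioctlParams *nvgpu.UVM_MM_INITIALIZE_PARAMS, hostFD int32) (uintptr, error) {
	origFD := ioctlParams.UvmFD
	ioctlParams.UvmFD = hostFD
	n, err := uvmIoctlInvoke(ui, ioctlParams)
	ioctlParams.UvmFD = origFD // restore before copying the struct back out
	return n, err
}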
type uvmIoctlState struct { fd *uvmFD ctx context.Context t *kernel.Task cmd uint32 ioctlParamsAddr hostarch.Addr } func uvmIoctlNoParams(ui *uvmIoctlState) (uintptr, error) { return uvmIoctlInvoke[byte](ui, nil) } func uvmIoctlSimple[Params any, PtrParams marshalPtr[Params]](ui *uvmIoctlState) (uintptr, error) { var ioctlParamsValue Params ioctlParams := PtrParams(&ioctlParamsValue) if _, err := ioctlParams.CopyIn(ui.t, ui.ioctlParamsAddr); err != nil { return 0, err } n, err := uvmIoctlInvoke(ui, ioctlParams) if err != nil { return n, err } if _, err := ioctlParams.CopyOut(ui.t, ui.ioctlParamsAddr); err != nil { return n, err } return n, nil } func uvmInitialize(ui *uvmIoctlState) (uintptr, error) { var ioctlParams nvgpu.UVM_INITIALIZE_PARAMS if _, err := ioctlParams.CopyIn(ui.t, ui.ioctlParamsAddr); err != nil { return 0, err } origFlags := ioctlParams.Flags // This is necessary to share the host UVM FD between sentry and // application processes. ioctlParams.Flags = ioctlParams.Flags | nvgpu.UVM_INIT_FLAGS_MULTI_PROCESS_SHARING_MODE n, err := uvmIoctlInvoke(ui, &ioctlParams) // Only expose the MULTI_PROCESS_SHARING_MODE flag if it was already present. ioctlParams.Flags &^= ^origFlags & nvgpu.UVM_INIT_FLAGS_MULTI_PROCESS_SHARING_MODE if err != nil { return n, err } if _, err := ioctlParams.CopyOut(ui.t, ui.ioctlParamsAddr); err != nil { return n, err } return n, nil } func uvmMMInitialize(ui *uvmIoctlState) (uintptr, error) { var ioctlParams nvgpu.UVM_MM_INITIALIZE_PARAMS if _, err := ioctlParams.CopyIn(ui.t, ui.ioctlParamsAddr); err != nil { return 0, err } failWithStatus := func(status uint32) error { outIoctlParams := ioctlParams outIoctlParams.Status = status _, err := outIoctlParams.CopyOut(ui.t, ui.ioctlParamsAddr) return err } uvmFileGeneric, _ := ui.t.FDTable().Get(ioctlParams.UvmFD) if uvmFileGeneric == nil { return 0, failWithStatus(nvgpu.NV_ERR_INVALID_ARGUMENT) } defer uvmFileGeneric.DecRef(ui.ctx) uvmFile, ok := uvmFileGeneric.Impl().(*uvmFD) if !ok { return 0, failWithStatus(nvgpu.NV_ERR_INVALID_ARGUMENT) } origFD := ioctlParams.UvmFD ioctlParams.UvmFD = uvmFile.hostFD n, err := uvmIoctlInvoke(ui, &ioctlParams) ioctlParams.UvmFD = origFD if err != nil { return n, err } if _, err := ioctlParams.CopyOut(ui.t, ui.ioctlParamsAddr); err != nil { return n, err } return n, nil } func uvmIoctlHasFrontendFD[Params any, PtrParams hasFrontendFDPtr[Params]](ui *uvmIoctlState) (uintptr, error) { var ioctlParamsValue Params ioctlParams := PtrParams(&ioctlParamsValue) if _, err := ioctlParams.CopyIn(ui.t, ui.ioctlParamsAddr); err != nil { return 0, err } origFD := ioctlParams.GetFrontendFD() if origFD < 0 { n, err := uvmIoctlInvoke(ui, ioctlParams) if err != nil { return n, err } if _, err := ioctlParams.CopyOut(ui.t, ui.ioctlParamsAddr); err != nil { return n, err } return n, nil } ctlFileGeneric, _ := ui.t.FDTable().Get(origFD) if ctlFileGeneric == nil { return 0, linuxerr.EINVAL } defer ctlFileGeneric.DecRef(ui.ctx) ctlFile, ok := ctlFileGeneric.Impl().(*frontendFD) if !ok { return 0, linuxerr.EINVAL } ioctlParams.SetFrontendFD(ctlFile.hostFD) n, err := uvmIoctlInvoke(ui, ioctlParams) ioctlParams.SetFrontendFD(origFD) if err != nil { return n, err } if _, err := ioctlParams.CopyOut(ui.t, ui.ioctlParamsAddr); err != nil { return n, err } return n, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/uvm_mmap.go000066400000000000000000000061211465435605700262670ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvproxy import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *uvmFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { // UVM_VALIDATE_VA_RANGE, and probably other ioctls, expect that // application mmaps of /dev/nvidia-uvm are immediately visible to the // driver. if opts.PlatformEffect < memmap.PlatformEffectPopulate { opts.PlatformEffect = memmap.PlatformEffectPopulate } return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) } // AddMapping implements memmap.Mappable.AddMapping. func (fd *uvmFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (fd *uvmFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. func (fd *uvmFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return nil } // Translate implements memmap.Mappable.Translate. func (fd *uvmFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { return []memmap.Translation{ { Source: optional, File: &fd.memmapFile, Offset: optional.Start, // kernel-open/nvidia-uvm/uvm.c:uvm_mmap() requires mappings to be // PROT_READ|PROT_WRITE. Perms: hostarch.ReadWrite, }, }, nil } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (fd *uvmFD) InvalidateUnsavable(ctx context.Context) error { return nil } // +stateify savable type uvmFDMemmapFile struct { fd *uvmFD } // IncRef implements memmap.File.IncRef. func (mf *uvmFDMemmapFile) IncRef(fr memmap.FileRange, memCgID uint32) { } // DecRef implements memmap.File.DecRef. func (mf *uvmFDMemmapFile) DecRef(fr memmap.FileRange) { } // MapInternal implements memmap.File.MapInternal. func (mf *uvmFDMemmapFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { // TODO(jamieliu): make an attempt with MAP_FIXED_NOREPLACE? return safemem.BlockSeq{}, memmap.BufferedIOFallbackErr{} } // FD implements memmap.File.FD. func (mf *uvmFDMemmapFile) FD() int { return int(mf.fd.hostFD) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/uvm_unsafe.go000066400000000000000000000064101465435605700266170ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvproxy import ( "runtime" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/nvgpu" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" ) func uvmIoctlInvoke[Params any](ui *uvmIoctlState, ioctlParams *Params) (uintptr, error) { n, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(ui.fd.hostFD), uintptr(ui.cmd), uintptr(unsafe.Pointer(ioctlParams))) if errno != 0 { return n, errno } return n, nil } // BufferReadAt implements memmap.File.BufferReadAt. func (mf *uvmFDMemmapFile) BufferReadAt(off uint64, dst []byte) (uint64, error) { // kernel-open/nvidia-uvm/uvm.c:uvm_fops.{read,read_iter,splice_read} == // NULL, so UVM data can only be read via ioctl. if len(dst) == 0 { return 0, nil } defer runtime.KeepAlive(dst) params := nvgpu.UVM_TOOLS_READ_PROCESS_MEMORY_PARAMS{ Buffer: uint64(uintptr(unsafe.Pointer(&dst[0]))), Size: uint64(len(dst)), TargetVA: off, } _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(mf.fd.hostFD), nvgpu.UVM_TOOLS_READ_PROCESS_MEMORY, uintptr(unsafe.Pointer(¶ms))) if errno != 0 { return 0, errno } if params.RMStatus != nvgpu.NV_OK { log.Warningf("nvproxy: UVM_TOOLS_READ_PROCESS_MEMORY(targetVa=%#x, len=%d) returned status %d", off, len(dst), params.RMStatus) return params.BytesRead, linuxerr.EINVAL } if params.BytesRead != uint64(len(dst)) { log.Warningf("nvproxy: UVM_TOOLS_READ_PROCESS_MEMORY(targetVa=%#x, len=%d) returned %d bytes", off, len(dst), params.BytesRead) return params.BytesRead, linuxerr.EINVAL } return params.BytesRead, nil } // BufferWriteAt implements memmap.File.BufferWriteAt. func (mf *uvmFDMemmapFile) BufferWriteAt(off uint64, src []byte) (uint64, error) { // kernel-open/nvidia-uvm/uvm.c:uvm_fops.{write,write_iter,splice_write} == // NULL, so UVM data can only be written via ioctl. if len(src) == 0 { return 0, nil } defer runtime.KeepAlive(src) params := nvgpu.UVM_TOOLS_WRITE_PROCESS_MEMORY_PARAMS{ Buffer: uint64(uintptr(unsafe.Pointer(&src[0]))), Size: uint64(len(src)), TargetVA: off, } _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(mf.fd.hostFD), nvgpu.UVM_TOOLS_WRITE_PROCESS_MEMORY, uintptr(unsafe.Pointer(¶ms))) if errno != 0 { return 0, errno } if params.RMStatus != nvgpu.NV_OK { log.Warningf("nvproxy: UVM_TOOLS_WRITE_PROCESS_MEMORY(targetVa=%#x, len=%d) returned status %d", off, len(src), params.RMStatus) return params.BytesWritten, linuxerr.EINVAL } if params.BytesWritten != uint64(len(src)) { log.Warningf("nvproxy: UVM_TOOLS_WRITE_PROCESS_MEMORY(targetVa=%#x, len=%d) returned %d bytes", off, len(src), params.BytesWritten) return params.BytesWritten, linuxerr.EINVAL } return params.BytesWritten, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/nvproxy/version.go000066400000000000000000000645561465435605700261530ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package nvproxy import ( "fmt" "strconv" "strings" "gvisor.dev/gvisor/pkg/abi/nvgpu" "gvisor.dev/gvisor/pkg/sync" ) // DriverVersion represents a NVIDIA driver version patch release. // // +stateify savable type DriverVersion struct { major int minor int patch int } // NewDriverVersion returns a new driver version. func NewDriverVersion(major, minor, patch int) DriverVersion { return DriverVersion{major, minor, patch} } // DriverVersionFrom returns a DriverVersion from a string. func DriverVersionFrom(version string) (DriverVersion, error) { parts := strings.Split(version, ".") if len(parts) != 3 { return DriverVersion{}, fmt.Errorf("invalid format of version string %q", version) } var ( res DriverVersion err error ) res.major, err = strconv.Atoi(parts[0]) if err != nil { return DriverVersion{}, fmt.Errorf("invalid format for major version %q: %v", version, err) } res.minor, err = strconv.Atoi(parts[1]) if err != nil { return DriverVersion{}, fmt.Errorf("invalid format for minor version %q: %v", version, err) } res.patch, err = strconv.Atoi(parts[2]) if err != nil { return DriverVersion{}, fmt.Errorf("invalid format for patch version %q: %v", version, err) } return res, nil } func (v DriverVersion) String() string { return fmt.Sprintf("%02d.%02d.%02d", v.major, v.minor, v.patch) } // Equals returns true if the two driver versions are equal. func (v DriverVersion) Equals(other DriverVersion) bool { return v.major == other.major && v.minor == other.minor && v.patch == other.patch } // isGreaterThan returns true if v is greater than other. // isGreaterThan returns true if v is more recent than other, assuming v and other are on the same // dev branch. func (v DriverVersion) isGreaterThan(other DriverVersion) bool { switch { case v.major > other.major: return true case other.major > v.major: return false case v.minor > other.minor: return true case other.minor > v.minor: return false case v.patch > other.patch: return true case other.patch > v.patch: return false default: return true } } type frontendIoctlHandler func(fi *frontendIoctlState) (uintptr, error) type controlCmdHandler func(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS54Parameters) (uintptr, error) type allocationClassHandler func(fi *frontendIoctlState, ioctlParams *nvgpu.NVOS64Parameters, isNVOS64 bool) (uintptr, error) type uvmIoctlHandler func(ui *uvmIoctlState) (uintptr, error) // A driverABIFunc constructs and returns a driverABI. // This indirection exists to avoid memory usage from unused driver ABIs. type driverABIFunc func() *driverABI // abiConAndChecksum couples the driver's abiConstructor to the SHA256 checksum of its linux .run // driver installer file from NVIDIA. type abiConAndChecksum struct { cons driverABIFunc checksum string } // driverABI defines the Nvidia kernel driver ABI proxied at a given version. // // The Nvidia driver's ioctl interface branches widely at various places in the // kernel driver. As for now, versioning is only supported for the following // points of branching: // 1. frontend device ioctls (based on IOC_NR(cmd)). // 2. uvm device ioctls (based on cmd). // 3. 
control commands within NV_ESC_RM_CONTROL in frontend device (based on // NVOS54_PARAMETERS.Cmd). Note that commands that have RM_GSS_LEGACY_MASK // set are not versioned. // 4. allocation classes within NV_ESC_RM_ALLOC in frontend device (based on // NVOS64_PARAMETERS.HClass). type driverABI struct { frontendIoctl map[uint32]frontendIoctlHandler uvmIoctl map[uint32]uvmIoctlHandler controlCmd map[uint32]controlCmdHandler allocationClass map[nvgpu.ClassID]allocationClassHandler } // abis is a global map containing all supported Nvidia driver ABIs. This is // initialized on Init() and is immutable henceforth. var abis map[DriverVersion]abiConAndChecksum var abisOnce sync.Once // Note: runfileChecksum is the checksum of the .run file of the driver installer for linux from // nvidia. // To add a new version, add in support as normal and add the "addDriverABI" call for your version. // Run `make sudo TARGETS=//tools/gpu:main ARGS="checksum --version={}"` to get checksum. func addDriverABI(major, minor, patch int, runfileChecksum string, cons driverABIFunc) driverABIFunc { if abis == nil { abis = make(map[DriverVersion]abiConAndChecksum) } version := NewDriverVersion(major, minor, patch) abis[version] = abiConAndChecksum{cons: cons, checksum: runfileChecksum} return cons } // Init initializes abis global map. func Init() { abisOnce.Do(func() { v535_104_05 := func() *driverABI { // Since there is no parent to inherit from, the driverABI needs to be // constructed with the entirety of the nvproxy functionality. return &driverABI{ frontendIoctl: map[uint32]frontendIoctlHandler{ nvgpu.NV_ESC_CARD_INFO: frontendIoctlSimple, // nv_ioctl_card_info_t array nvgpu.NV_ESC_CHECK_VERSION_STR: frontendIoctlSimple, // nv_rm_api_version_t nvgpu.NV_ESC_ATTACH_GPUS_TO_FD: frontendIoctlSimple, // NvU32 array containing GPU IDs nvgpu.NV_ESC_SYS_PARAMS: frontendIoctlSimple, // nv_ioctl_sys_params_t nvgpu.NV_ESC_RM_DUP_OBJECT: frontendIoctlSimple, // NVOS55_PARAMETERS nvgpu.NV_ESC_RM_SHARE: frontendIoctlSimple, // NVOS57_PARAMETERS nvgpu.NV_ESC_RM_UNMAP_MEMORY: frontendIoctlSimple, // NVOS34_PARAMETERS nvgpu.NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO: frontendIoctlSimple, // NVOS56_PARAMETERS nvgpu.NV_ESC_REGISTER_FD: frontendRegisterFD, nvgpu.NV_ESC_ALLOC_OS_EVENT: frontendIoctHasFD[nvgpu.IoctlAllocOSEvent], nvgpu.NV_ESC_FREE_OS_EVENT: frontendIoctHasFD[nvgpu.IoctlFreeOSEvent], nvgpu.NV_ESC_NUMA_INFO: rmNumaInfo, nvgpu.NV_ESC_RM_ALLOC_MEMORY: rmAllocMemory, nvgpu.NV_ESC_RM_FREE: rmFree, nvgpu.NV_ESC_RM_CONTROL: rmControl, nvgpu.NV_ESC_RM_ALLOC: rmAlloc, nvgpu.NV_ESC_RM_VID_HEAP_CONTROL: rmVidHeapControl, nvgpu.NV_ESC_RM_MAP_MEMORY: rmMapMemory, }, uvmIoctl: map[uint32]uvmIoctlHandler{ nvgpu.UVM_INITIALIZE: uvmInitialize, nvgpu.UVM_DEINITIALIZE: uvmIoctlNoParams, nvgpu.UVM_CREATE_RANGE_GROUP: uvmIoctlSimple[nvgpu.UVM_CREATE_RANGE_GROUP_PARAMS], nvgpu.UVM_DESTROY_RANGE_GROUP: uvmIoctlSimple[nvgpu.UVM_DESTROY_RANGE_GROUP_PARAMS], nvgpu.UVM_REGISTER_GPU_VASPACE: uvmIoctlHasFrontendFD[nvgpu.UVM_REGISTER_GPU_VASPACE_PARAMS], nvgpu.UVM_UNREGISTER_GPU_VASPACE: uvmIoctlSimple[nvgpu.UVM_UNREGISTER_GPU_VASPACE_PARAMS], nvgpu.UVM_REGISTER_CHANNEL: uvmIoctlHasFrontendFD[nvgpu.UVM_REGISTER_CHANNEL_PARAMS], nvgpu.UVM_UNREGISTER_CHANNEL: uvmIoctlSimple[nvgpu.UVM_UNREGISTER_CHANNEL_PARAMS], nvgpu.UVM_ENABLE_PEER_ACCESS: uvmIoctlSimple[nvgpu.UVM_ENABLE_PEER_ACCESS_PARAMS], nvgpu.UVM_DISABLE_PEER_ACCESS: uvmIoctlSimple[nvgpu.UVM_DISABLE_PEER_ACCESS_PARAMS], nvgpu.UVM_SET_RANGE_GROUP: uvmIoctlSimple[nvgpu.UVM_SET_RANGE_GROUP_PARAMS], 
nvgpu.UVM_MAP_EXTERNAL_ALLOCATION: uvmIoctlHasFrontendFD[nvgpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS], nvgpu.UVM_FREE: uvmIoctlSimple[nvgpu.UVM_FREE_PARAMS], nvgpu.UVM_REGISTER_GPU: uvmIoctlHasFrontendFD[nvgpu.UVM_REGISTER_GPU_PARAMS], nvgpu.UVM_UNREGISTER_GPU: uvmIoctlSimple[nvgpu.UVM_UNREGISTER_GPU_PARAMS], nvgpu.UVM_PAGEABLE_MEM_ACCESS: uvmIoctlSimple[nvgpu.UVM_PAGEABLE_MEM_ACCESS_PARAMS], nvgpu.UVM_SET_PREFERRED_LOCATION: uvmIoctlSimple[nvgpu.UVM_SET_PREFERRED_LOCATION_PARAMS], nvgpu.UVM_DISABLE_READ_DUPLICATION: uvmIoctlSimple[nvgpu.UVM_DISABLE_READ_DUPLICATION_PARAMS], nvgpu.UVM_MIGRATE_RANGE_GROUP: uvmIoctlSimple[nvgpu.UVM_MIGRATE_RANGE_GROUP_PARAMS], nvgpu.UVM_MAP_DYNAMIC_PARALLELISM_REGION: uvmIoctlSimple[nvgpu.UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS], nvgpu.UVM_UNMAP_EXTERNAL: uvmIoctlSimple[nvgpu.UVM_UNMAP_EXTERNAL_PARAMS], nvgpu.UVM_ALLOC_SEMAPHORE_POOL: uvmIoctlSimple[nvgpu.UVM_ALLOC_SEMAPHORE_POOL_PARAMS], nvgpu.UVM_VALIDATE_VA_RANGE: uvmIoctlSimple[nvgpu.UVM_VALIDATE_VA_RANGE_PARAMS], nvgpu.UVM_CREATE_EXTERNAL_RANGE: uvmIoctlSimple[nvgpu.UVM_CREATE_EXTERNAL_RANGE_PARAMS], nvgpu.UVM_MM_INITIALIZE: uvmMMInitialize, }, controlCmd: map[uint32]controlCmdHandler{ nvgpu.NV0000_CTRL_CMD_CLIENT_GET_ADDR_SPACE_TYPE: rmControlSimple, nvgpu.NV0000_CTRL_CMD_CLIENT_SET_INHERITED_SHARE_POLICY: rmControlSimple, nvgpu.NV0000_CTRL_CMD_GPU_GET_ATTACHED_IDS: rmControlSimple, nvgpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO: rmControlSimple, nvgpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2: rmControlSimple, nvgpu.NV0000_CTRL_CMD_GPU_GET_PROBED_IDS: rmControlSimple, nvgpu.NV0000_CTRL_CMD_GPU_ATTACH_IDS: rmControlSimple, nvgpu.NV0000_CTRL_CMD_GPU_DETACH_IDS: rmControlSimple, nvgpu.NV0000_CTRL_CMD_GPU_GET_PCI_INFO: rmControlSimple, nvgpu.NV0000_CTRL_CMD_GPU_QUERY_DRAIN_STATE: rmControlSimple, nvgpu.NV0000_CTRL_CMD_GPU_GET_MEMOP_ENABLE: rmControlSimple, nvgpu.NV0000_CTRL_CMD_SYNC_GPU_BOOST_GROUP_INFO: rmControlSimple, nvgpu.NV0000_CTRL_CMD_SYSTEM_GET_P2P_CAPS: rmControlSimple, nvgpu.NV0000_CTRL_CMD_SYSTEM_GET_P2P_CAPS_V2: rmControlSimple, nvgpu.NV0000_CTRL_CMD_SYSTEM_GET_FABRIC_STATUS: rmControlSimple, nvgpu.NV0000_CTRL_CMD_SYSTEM_GET_P2P_CAPS_MATRIX: rmControlSimple, nvgpu.NV0000_CTRL_CMD_SYSTEM_GET_FEATURES: rmControlSimple, nvgpu.NV0080_CTRL_CMD_FB_GET_CAPS_V2: rmControlSimple, nvgpu.NV0080_CTRL_CMD_GPU_GET_NUM_SUBDEVICES: rmControlSimple, nvgpu.NV0080_CTRL_CMD_GPU_QUERY_SW_STATE_PERSISTENCE: rmControlSimple, nvgpu.NV0080_CTRL_CMD_GPU_GET_VIRTUALIZATION_MODE: rmControlSimple, 0x80028b: rmControlSimple, // unknown, paramsSize == 1 nvgpu.NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2: rmControlSimple, nvgpu.NV0080_CTRL_CMD_HOST_GET_CAPS_V2: rmControlSimple, nvgpu.NV00FD_CTRL_CMD_GET_INFO: rmControlSimple, nvgpu.NV00FD_CTRL_CMD_ATTACH_MEM: rmControlSimple, nvgpu.NV00FD_CTRL_CMD_DETACH_MEM: rmControlSimple, nvgpu.NV2080_CTRL_CMD_BUS_GET_PCI_INFO: rmControlSimple, nvgpu.NV2080_CTRL_CMD_BUS_GET_PCI_BAR_INFO: rmControlSimple, nvgpu.NV2080_CTRL_CMD_BUS_GET_INFO_V2: rmControlSimple, nvgpu.NV2080_CTRL_CMD_BUS_GET_PCIE_SUPPORTED_GPU_ATOMICS: rmControlSimple, nvgpu.NV2080_CTRL_CMD_BUS_GET_C2C_INFO: rmControlSimple, nvgpu.NV2080_CTRL_CMD_CE_GET_ALL_CAPS: rmControlSimple, nvgpu.NV2080_CTRL_CMD_EVENT_SET_NOTIFICATION: rmControlSimple, nvgpu.NV2080_CTRL_CMD_FB_GET_INFO_V2: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_GET_INFO_V2: rmControlSimple, nvgpu.NV2080_CTRL_CMD_FLCN_GET_CTX_BUFFER_SIZE: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_GET_NAME_STRING: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_GET_SHORT_NAME_STRING: rmControlSimple, 
nvgpu.NV2080_CTRL_CMD_GPU_GET_SIMULATION_INFO: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_QUERY_ECC_STATUS: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_QUERY_COMPUTE_MODE_RULES: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_QUERY_ECC_CONFIGURATION: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_GET_OEM_BOARD_INFO: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_ACQUIRE_COMPUTE_MODE_RESERVATION: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_RELEASE_COMPUTE_MODE_RESERVATION: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_GET_GID_INFO: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_GET_INFOROM_OBJECT_VERSION: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_GET_INFOROM_IMAGE_VERSION: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_QUERY_INFOROM_ECC_SUPPORT: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_GET_ENGINES_V2: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_GET_ACTIVE_PARTITION_IDS: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_GET_PIDS: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_GET_PID_INFO: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GPU_GET_COMPUTE_POLICY_CONFIG: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GET_GPU_FABRIC_PROBE_INFO: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GR_SET_CTXSW_PREEMPTION_MODE: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GR_GET_CTX_BUFFER_SIZE: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GR_GET_GLOBAL_SM_ORDER: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GR_GET_CAPS_V2: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GR_GET_GPC_MASK: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GR_GET_TPC_MASK: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GR_GET_SM_ISSUE_RATE_MODIFIER: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GRMGR_GET_GR_FS_INFO: rmControlSimple, nvgpu.NV2080_CTRL_CMD_GSP_GET_FEATURES: rmControlSimple, nvgpu.NV2080_CTRL_CMD_MC_GET_ARCH_INFO: rmControlSimple, nvgpu.NV2080_CTRL_CMD_MC_SERVICE_INTERRUPTS: rmControlSimple, nvgpu.NV2080_CTRL_CMD_NVLINK_GET_NVLINK_CAPS: rmControlSimple, nvgpu.NV2080_CTRL_CMD_NVLINK_GET_NVLINK_STATUS: rmControlSimple, nvgpu.NV2080_CTRL_CMD_PERF_BOOST: rmControlSimple, nvgpu.NV2080_CTRL_CMD_RC_GET_WATCHDOG_INFO: rmControlSimple, nvgpu.NV2080_CTRL_CMD_RC_RELEASE_WATCHDOG_REQUESTS: rmControlSimple, nvgpu.NV2080_CTRL_CMD_RC_SOFT_DISABLE_WATCHDOG: rmControlSimple, nvgpu.NV2080_CTRL_CMD_TIMER_GET_GPU_CPU_TIME_CORRELATION_INFO: rmControlSimple, nvgpu.NV503C_CTRL_CMD_REGISTER_VIDMEM: rmControlSimple, nvgpu.NV503C_CTRL_CMD_UNREGISTER_VIDMEM: rmControlSimple, nvgpu.NV83DE_CTRL_CMD_DEBUG_SET_EXCEPTION_MASK: rmControlSimple, nvgpu.NV83DE_CTRL_CMD_DEBUG_READ_ALL_SM_ERROR_STATES: rmControlSimple, nvgpu.NV83DE_CTRL_CMD_DEBUG_CLEAR_ALL_SM_ERROR_STATES: rmControlSimple, nvgpu.NV906F_CTRL_GET_CLASS_ENGINEID: rmControlSimple, nvgpu.NV906F_CTRL_CMD_RESET_CHANNEL: rmControlSimple, nvgpu.NV90E6_CTRL_CMD_MASTER_GET_VIRTUAL_FUNCTION_ERROR_CONT_INTR_MASK: rmControlSimple, nvgpu.NVC36F_CTRL_GET_CLASS_ENGINEID: rmControlSimple, nvgpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN: rmControlSimple, nvgpu.NV_CONF_COMPUTE_CTRL_CMD_SYSTEM_GET_CAPABILITIES: rmControlSimple, nvgpu.NV_CONF_COMPUTE_CTRL_CMD_SYSTEM_GET_GPUS_STATE: rmControlSimple, nvgpu.NV_CONF_COMPUTE_CTRL_CMD_GPU_GET_NUM_SECURE_CHANNELS: rmControlSimple, nvgpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE: rmControlSimple, nvgpu.NVA06C_CTRL_CMD_SET_TIMESLICE: rmControlSimple, nvgpu.NVA06C_CTRL_CMD_PREEMPT: rmControlSimple, nvgpu.NVA06F_CTRL_CMD_GPFIFO_SCHEDULE: rmControlSimple, nvgpu.NVC56F_CTRL_CMD_GET_KMB: rmControlSimple, nvgpu.NV0000_CTRL_CMD_SYSTEM_GET_BUILD_VERSION: ctrlClientSystemGetBuildVersion, nvgpu.NV0000_CTRL_CMD_OS_UNIX_EXPORT_OBJECT_TO_FD: 
ctrlHasFrontendFD[nvgpu.NV0000_CTRL_OS_UNIX_EXPORT_OBJECT_TO_FD_PARAMS], nvgpu.NV0000_CTRL_CMD_OS_UNIX_IMPORT_OBJECT_FROM_FD: ctrlHasFrontendFD[nvgpu.NV0000_CTRL_OS_UNIX_IMPORT_OBJECT_FROM_FD_PARAMS], nvgpu.NV0000_CTRL_CMD_OS_UNIX_GET_EXPORT_OBJECT_INFO: ctrlHasFrontendFD[nvgpu.NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS], nvgpu.NV0041_CTRL_CMD_GET_SURFACE_INFO: ctrlIoctlHasInfoList[nvgpu.NV0041_CTRL_GET_SURFACE_INFO_PARAMS], nvgpu.NV0080_CTRL_CMD_FIFO_GET_CHANNELLIST: ctrlDevFIFOGetChannelList, nvgpu.NV00FD_CTRL_CMD_ATTACH_GPU: ctrlMemoryMulticastFabricAttachGPU, nvgpu.NV0080_CTRL_CMD_GPU_GET_CLASSLIST: ctrlDevGpuGetClasslist, nvgpu.NV2080_CTRL_CMD_FIFO_DISABLE_CHANNELS: ctrlSubdevFIFODisableChannels, nvgpu.NV2080_CTRL_CMD_BIOS_GET_INFO: ctrlIoctlHasInfoList[nvgpu.NV2080_CTRL_BIOS_GET_INFO_PARAMS], nvgpu.NV2080_CTRL_CMD_GR_GET_INFO: ctrlIoctlHasInfoList[nvgpu.NV2080_CTRL_GR_GET_INFO_PARAMS], nvgpu.NV503C_CTRL_CMD_REGISTER_VA_SPACE: ctrlRegisterVASpace, }, allocationClass: map[nvgpu.ClassID]allocationClassHandler{ nvgpu.NV01_ROOT: rmAllocRootClient, nvgpu.NV01_ROOT_NON_PRIV: rmAllocRootClient, nvgpu.NV01_MEMORY_SYSTEM: rmAllocSimple[nvgpu.NV_MEMORY_ALLOCATION_PARAMS], nvgpu.NV01_MEMORY_LOCAL_USER: rmAllocSimple[nvgpu.NV_MEMORY_ALLOCATION_PARAMS], nvgpu.NV01_ROOT_CLIENT: rmAllocRootClient, nvgpu.NV01_EVENT_OS_EVENT: rmAllocEventOSEvent, nvgpu.NV2081_BINAPI: rmAllocSimple[nvgpu.NV2081_ALLOC_PARAMETERS], nvgpu.NV01_DEVICE_0: rmAllocSimple[nvgpu.NV0080_ALLOC_PARAMETERS], nvgpu.RM_USER_SHARED_DATA: rmAllocSimple[nvgpu.NV00DE_ALLOC_PARAMETERS], nvgpu.NV_MEMORY_FABRIC: rmAllocSimple[nvgpu.NV00F8_ALLOCATION_PARAMETERS], nvgpu.NV_MEMORY_MULTICAST_FABRIC: rmAllocSimple[nvgpu.NV00FD_ALLOCATION_PARAMETERS], nvgpu.NV20_SUBDEVICE_0: rmAllocSimple[nvgpu.NV2080_ALLOC_PARAMETERS], nvgpu.NV50_MEMORY_VIRTUAL: rmAllocSimple[nvgpu.NV_MEMORY_ALLOCATION_PARAMS], nvgpu.NV50_P2P: rmAllocSimple[nvgpu.NV503B_ALLOC_PARAMETERS], nvgpu.NV50_THIRD_PARTY_P2P: rmAllocSimple[nvgpu.NV503C_ALLOC_PARAMETERS], nvgpu.GT200_DEBUGGER: rmAllocSMDebuggerSession, nvgpu.FERMI_CONTEXT_SHARE_A: rmAllocContextShare, nvgpu.FERMI_VASPACE_A: rmAllocSimple[nvgpu.NV_VASPACE_ALLOCATION_PARAMETERS], nvgpu.KEPLER_CHANNEL_GROUP_A: rmAllocChannelGroup, nvgpu.TURING_CHANNEL_GPFIFO_A: rmAllocChannel, nvgpu.AMPERE_CHANNEL_GPFIFO_A: rmAllocChannel, nvgpu.HOPPER_CHANNEL_GPFIFO_A: rmAllocChannel, nvgpu.TURING_DMA_COPY_A: rmAllocSimple[nvgpu.NVB0B5_ALLOCATION_PARAMETERS], nvgpu.AMPERE_DMA_COPY_A: rmAllocSimple[nvgpu.NVB0B5_ALLOCATION_PARAMETERS], nvgpu.AMPERE_DMA_COPY_B: rmAllocSimple[nvgpu.NVB0B5_ALLOCATION_PARAMETERS], nvgpu.HOPPER_DMA_COPY_A: rmAllocSimple[nvgpu.NVB0B5_ALLOCATION_PARAMETERS], nvgpu.TURING_COMPUTE_A: rmAllocSimple[nvgpu.NV_GR_ALLOCATION_PARAMETERS], nvgpu.AMPERE_COMPUTE_A: rmAllocSimple[nvgpu.NV_GR_ALLOCATION_PARAMETERS], nvgpu.AMPERE_COMPUTE_B: rmAllocSimple[nvgpu.NV_GR_ALLOCATION_PARAMETERS], nvgpu.ADA_COMPUTE_A: rmAllocSimple[nvgpu.NV_GR_ALLOCATION_PARAMETERS], nvgpu.NV_CONFIDENTIAL_COMPUTE: rmAllocSimple[nvgpu.NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS], nvgpu.HOPPER_COMPUTE_A: rmAllocSimple[nvgpu.NV_GR_ALLOCATION_PARAMETERS], nvgpu.HOPPER_USERMODE_A: rmAllocSimple[nvgpu.NV_HOPPER_USERMODE_A_PARAMS], nvgpu.GF100_SUBDEVICE_MASTER: rmAllocNoParams, nvgpu.TURING_USERMODE_A: rmAllocNoParams, nvgpu.HOPPER_SEC2_WORK_LAUNCH_A: rmAllocNoParams, }, } } // 535.104.12 exists on the "535.104.12" branch. It branched off the main // branch at 535.104.05. 
_ = addDriverABI(535, 104, 12, "ffc2d89e233d2427edb1ff5f436028a94b3ef86e78f97e088e11d905c82e8001", v535_104_05) // 535.113.01 is an intermediate unqualified version from the main branch. v535_113_01 := v535_104_05 // The following exist on the "535" branch. They branched off the main // branch at 535.113.01. v535_129_03 := addDriverABI(535, 129, 03, "e6dca5626a2608c6bb2a046cfcb7c1af338b9e961a7dd90ac09bb8a126ff002e", v535_113_01) v535_154_05 := addDriverABI(535, 154, 05, "7e95065caa6b82de926110f14827a61972eb12c200e863a29e9fb47866eaa898", v535_129_03) _ = addDriverABI(535, 161, 07, "edc527f1dcfa0212a3bf815ebf302d45ef9663834a41e11a851dd38da159a8cd", v535_154_05) // 545.23.06 is an intermediate unqualified version from the main branch. v545_23_06 := func() *driverABI { abi := v535_113_01() abi.controlCmd[nvgpu.NV0000_CTRL_CMD_OS_UNIX_GET_EXPORT_OBJECT_INFO] = ctrlHasFrontendFD[nvgpu.NV0000_CTRL_OS_UNIX_GET_EXPORT_OBJECT_INFO_PARAMS_V545] abi.allocationClass[nvgpu.RM_USER_SHARED_DATA] = rmAllocSimple[nvgpu.NV00DE_ALLOC_PARAMETERS_V545] abi.allocationClass[nvgpu.NV_MEMORY_MULTICAST_FABRIC] = rmAllocSimple[nvgpu.NV00FD_ALLOCATION_PARAMETERS_V545] abi.allocationClass[nvgpu.NV01_MEMORY_SYSTEM] = rmAllocSimple[nvgpu.NV_MEMORY_ALLOCATION_PARAMS_V545] abi.allocationClass[nvgpu.NV01_MEMORY_LOCAL_USER] = rmAllocSimple[nvgpu.NV_MEMORY_ALLOCATION_PARAMS_V545] abi.allocationClass[nvgpu.NV50_MEMORY_VIRTUAL] = rmAllocSimple[nvgpu.NV_MEMORY_ALLOCATION_PARAMS_V545] return abi } // 550.40.07 is an intermediate unqualified version from the main branch. v550_40_07 := func() *driverABI { abi := v545_23_06() abi.frontendIoctl[nvgpu.NV_ESC_WAIT_OPEN_COMPLETE] = frontendIoctlSimple // nv_ioctl_wait_open_complete_t abi.controlCmd[nvgpu.NV0000_CTRL_CMD_GPU_ASYNC_ATTACH_ID] = rmControlSimple abi.controlCmd[nvgpu.NV0000_CTRL_CMD_GPU_WAIT_ATTACH_ID] = rmControlSimple abi.controlCmd[nvgpu.NV0080_CTRL_CMD_PERF_CUDA_LIMIT_SET_CONTROL] = rmControlSimple // NV0080_CTRL_PERF_CUDA_LIMIT_CONTROL_PARAMS abi.controlCmd[nvgpu.NV2080_CTRL_CMD_PERF_GET_CURRENT_PSTATE] = rmControlSimple // NV2081_BINAPI forwards all control commands to the GSP in // src/nvidia/src/kernel/rmapi/binary_api.c:binapiControl_IMPL(). abi.controlCmd[(nvgpu.NV2081_BINAPI<<16)|0x0108] = rmControlSimple abi.uvmIoctl[nvgpu.UVM_SET_PREFERRED_LOCATION] = uvmIoctlSimple[nvgpu.UVM_SET_PREFERRED_LOCATION_PARAMS_V550] return abi } v550_54_14 := addDriverABI(550, 54, 14, "8c497ff1cfc7c310fb875149bc30faa4fd26d2237b2cba6cd2e8b0780157cfe3", func() *driverABI { abi := v550_40_07() abi.uvmIoctl[nvgpu.UVM_ALLOC_SEMAPHORE_POOL] = uvmIoctlSimple[nvgpu.UVM_ALLOC_SEMAPHORE_POOL_PARAMS_V550] abi.uvmIoctl[nvgpu.UVM_MAP_EXTERNAL_ALLOCATION] = uvmIoctlHasFrontendFD[nvgpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS_V550] return abi }) v550_54_15 := addDriverABI(550, 54, 15, "2e859ae5f912a9a47aaa9b2d40a94a14f6f486b5d3b67c0ddf8b72c1c9650385", v550_54_14) _ = addDriverABI(550, 90, 07, "51acf579d5a9884f573a1d3f522e7fafa5e7841e22a9cec0b4bbeae31b0b9733", v550_54_15) }) } // ForEachSupportDriver calls f on all supported drivers. // Precondition: Init() must have been called. func ForEachSupportDriver(f func(version DriverVersion, checksum string)) { for version, abi := range abis { f(version, abi.checksum) } } // LatestDriver returns the latest supported driver. // Precondition: Init() must have been called. 
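// A small usage sketch (helper name hypothetical) of how the registry built by
// Init is typically consulted: parse a version string, confirm the installer
// checksum is known, and count the frontend ioctls proxied for that version.
func exampleLookupDriver(versionStr string) (string, int, error) {
	Init()
	v, err := DriverVersionFrom(versionStr)
	if err != nil {
		return "", 0, err
	}
	checksum, ok := ExpectedDriverChecksum(v)
	if !ok {
		return "", 0, fmt.Errorf("driver %v is not supported", v)
	}
	frontendIoctls, _, _, _, _ := SupportedIoctls(v)
	return checksum, len(frontendIoctls), nil
}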
func LatestDriver() DriverVersion { var ret DriverVersion for version := range abis { if version.isGreaterThan(ret) { ret = version } } return ret } // ExpectedDriverChecksum returns the expected checksum for a given version. // Precondition: Init() must have been called. func ExpectedDriverChecksum(version DriverVersion) (string, bool) { abi, ok := abis[version] if !ok { return "", false } return abi.checksum, true } // SupportedIoctls returns the ioctl numbers that are supported by nvproxy at // a given version. func SupportedIoctls(version DriverVersion) (frontendIoctls map[uint32]struct{}, uvmIoctls map[uint32]struct{}, controlCmds map[uint32]struct{}, allocClasses map[uint32]struct{}, ok bool) { abiCons, ok := abis[version] if !ok { return nil, nil, nil, nil, false } abi := abiCons.cons() frontendIoctls = make(map[uint32]struct{}) for ioc := range abi.frontendIoctl { frontendIoctls[ioc] = struct{}{} } uvmIoctls = make(map[uint32]struct{}) for ioc := range abi.uvmIoctl { uvmIoctls[ioc] = struct{}{} } controlCmds = make(map[uint32]struct{}) for cmd := range abi.controlCmd { controlCmds[cmd] = struct{}{} } allocClasses = make(map[uint32]struct{}) for class := range abi.allocationClass { allocClasses[uint32(class)] = struct{}{} } return } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tpuproxy/000077500000000000000000000000001465435605700243045ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tpuproxy/devaddr_range.go000066400000000000000000000034161465435605700274240ustar00rootroot00000000000000package tpuproxy // A Range represents a contiguous range of T. // // +stateify savable type DevAddrRange struct { // Start is the inclusive start of the range. Start uint64 // End is the exclusive end of the range. End uint64 } // WellFormed returns true if r.Start <= r.End. All other methods on a Range // require that the Range is well-formed. // //go:nosplit func (r DevAddrRange) WellFormed() bool { return r.Start <= r.End } // Length returns the length of the range. // //go:nosplit func (r DevAddrRange) Length() uint64 { return r.End - r.Start } // Contains returns true if r contains x. // //go:nosplit func (r DevAddrRange) Contains(x uint64) bool { return r.Start <= x && x < r.End } // Overlaps returns true if r and r2 overlap. // //go:nosplit func (r DevAddrRange) Overlaps(r2 DevAddrRange) bool { return r.Start < r2.End && r2.Start < r.End } // IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is // contained within r. // //go:nosplit func (r DevAddrRange) IsSupersetOf(r2 DevAddrRange) bool { return r.Start <= r2.Start && r.End >= r2.End } // Intersect returns a range consisting of the intersection between r and r2. // If r and r2 do not overlap, Intersect returns a range with unspecified // bounds, but for which Length() == 0. // //go:nosplit func (r DevAddrRange) Intersect(r2 DevAddrRange) DevAddrRange { if r.Start < r2.Start { r.Start = r2.Start } if r.End > r2.End { r.End = r2.End } if r.End < r.Start { r.End = r.Start } return r } // CanSplitAt returns true if it is legal to split a segment spanning the range // r at x; that is, splitting at x would produce two ranges, both of which have // non-zero length. 
// //go:nosplit func (r DevAddrRange) CanSplitAt(x uint64) bool { return r.Contains(x) && r.Start < x } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tpuproxy/devaddr_set.go000066400000000000000000002032771465435605700271320ustar00rootroot00000000000000package tpuproxy import ( __generics_imported0 "gvisor.dev/gvisor/pkg/sentry/mm" ) import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const DevAddrtrackGaps = 0 var _ = uint8(DevAddrtrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type DevAddrdynamicGap [DevAddrtrackGaps]uint64 // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *DevAddrdynamicGap) Get() uint64 { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *DevAddrdynamicGap) Set(v uint64) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. DevAddrminDegree = 3 DevAddrmaxDegree = 2 * DevAddrminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type DevAddrSet struct { root DevAddrnode `state:".([]DevAddrFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *DevAddrSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *DevAddrSet) IsEmptyRange(r DevAddrRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *DevAddrSet) Span() uint64 { var sz uint64 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *DevAddrSet) SpanRange(r DevAddrRange) uint64 { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uint64 for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *DevAddrSet) FirstSegment() DevAddrIterator { if s.root.nrSegments == 0 { return DevAddrIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. 
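// exampleDevAddrRangeOps is an editorial sketch (not part of the generated
// set) exercising the DevAddrRange helpers from devaddr_range.go above:
// intersection of overlapping ranges and the keys at which a range may be
// split. The addresses are arbitrary.
func exampleDevAddrRangeOps() {
	a := DevAddrRange{Start: 0x1000, End: 0x3000}
	b := DevAddrRange{Start: 0x2000, End: 0x4000}
	if a.Overlaps(b) {
		mid := a.Intersect(b) // DevAddrRange{0x2000, 0x3000}
		_ = mid.Length()      // 0x1000
	}
	_ = a.CanSplitAt(0x2000) // true: both halves would be non-empty
	_ = a.CanSplitAt(0x1000) // false: splitting at Start would leave an empty half
}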
func (s *DevAddrSet) LastSegment() DevAddrIterator { if s.root.nrSegments == 0 { return DevAddrIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *DevAddrSet) FirstGap() DevAddrGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return DevAddrGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *DevAddrSet) LastGap() DevAddrGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return DevAddrGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *DevAddrSet) Find(key uint64) (DevAddrIterator, DevAddrGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return DevAddrIterator{n, i}, DevAddrGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return DevAddrIterator{}, DevAddrGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *DevAddrSet) FindSegment(key uint64) DevAddrIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *DevAddrSet) LowerBoundSegment(min uint64) DevAddrIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *DevAddrSet) UpperBoundSegment(max uint64) DevAddrIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *DevAddrSet) FindGap(key uint64) DevAddrGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *DevAddrSet) LowerBoundGap(min uint64) DevAddrGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *DevAddrSet) UpperBoundGap(max uint64) DevAddrGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *DevAddrSet) FirstLargeEnoughGap(minSize uint64) DevAddrGapIterator { if DevAddrtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. 
// // Precondition: trackGaps must be 1. func (s *DevAddrSet) LastLargeEnoughGap(minSize uint64) DevAddrGapIterator { if DevAddrtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *DevAddrSet) LowerBoundLargeEnoughGap(min, minSize uint64) DevAddrGapIterator { if DevAddrtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *DevAddrSet) UpperBoundLargeEnoughGap(max, minSize uint64) DevAddrGapIterator { if DevAddrtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. 
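// exampleInsertPinned is an editorial sketch of the usual Insert pattern:
// locate the gap containing r.Start, verify it can hold all of r, then Insert
// (which also merges with mergeable neighbors). This is essentially what
// TryInsertRange (defined below) packages up; the zero PinnedRange is used
// here purely as a placeholder value.
func exampleInsertPinned(s *DevAddrSet, r DevAddrRange) DevAddrIterator {
	var val __generics_imported0.PinnedRange
	seg, gap := s.Find(r.Start)
	if seg.Ok() || gap.End() < r.End {
		return DevAddrIterator{} // r overlaps an existing segment
	}
	return s.Insert(gap, r, val)
}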
func (s *DevAddrSet) Insert(gap DevAddrGapIterator, r DevAddrRange, val __generics_imported0.PinnedRange) DevAddrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (devAddrSetFuncs{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := DevAddrtrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (devAddrSetFuncs{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (devAddrSetFuncs{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := DevAddrtrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *DevAddrSet) InsertWithoutMerging(gap DevAddrGapIterator, r DevAddrRange, val __generics_imported0.PinnedRange) DevAddrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *DevAddrSet) InsertWithoutMergingUnchecked(gap DevAddrGapIterator, r DevAddrRange, val __generics_imported0.PinnedRange) DevAddrIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := DevAddrtrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return DevAddrIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. 
// // InsertRange searches the set to find the gap to insert into. If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *DevAddrSet) InsertRange(r DevAddrRange, val __generics_imported0.PinnedRange) DevAddrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *DevAddrSet) InsertWithoutMergingRange(r DevAddrRange, val __generics_imported0.PinnedRange) DevAddrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *DevAddrSet) TryInsertRange(r DevAddrRange, val __generics_imported0.PinnedRange) DevAddrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return DevAddrIterator{} } if gap.End() < r.End { return DevAddrIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. 
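// exampleReleaseWindow is an editorial sketch: remove every segment
// overlapping r in one call. RemoveRange (defined below) splits segments
// straddling r's boundaries so that only the parts inside r are removed, and
// returns an iterator to the gap left behind.
func exampleReleaseWindow(s *DevAddrSet, r DevAddrRange) DevAddrRange {
	gap := s.RemoveRange(r)
	return gap.Range() // the vacated window, possibly merged with neighboring gaps
}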
func (s *DevAddrSet) TryInsertWithoutMergingRange(r DevAddrRange, val __generics_imported0.PinnedRange) DevAddrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return DevAddrIterator{} } if gap.End() < r.End { return DevAddrIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *DevAddrSet) Remove(seg DevAddrIterator) DevAddrGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if DevAddrtrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) devAddrSetFuncs{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if DevAddrtrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(DevAddrGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *DevAddrSet) RemoveAll() { s.root = DevAddrnode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *DevAddrSet) RemoveRange(r DevAddrRange) DevAddrGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *DevAddrSet) RemoveFullRange(r DevAddrRange) DevAddrGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *DevAddrSet) Merge(first, second DevAddrIterator) DevAddrIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. Otherwise, MergeUnchecked returns a terminal // iterator. 
// // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *DevAddrSet) MergeUnchecked(first, second DevAddrIterator) DevAddrIterator { if first.End() == second.Start() { if mval, ok := (devAddrSetFuncs{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return DevAddrIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *DevAddrSet) MergePrev(seg DevAddrIterator) DevAddrIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *DevAddrSet) MergeNext(seg DevAddrIterator) DevAddrIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *DevAddrSet) Unisolate(seg DevAddrIterator) DevAddrIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *DevAddrSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. 
// // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *DevAddrSet) MergeInsideRange(r DevAddrRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *DevAddrSet) MergeOutsideRange(r DevAddrRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *DevAddrSet) Split(seg DevAddrIterator, split uint64) (DevAddrIterator, DevAddrIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *DevAddrSet) SplitUnchecked(seg DevAddrIterator, split uint64) (DevAddrIterator, DevAddrIterator) { val1, val2 := (devAddrSetFuncs{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), DevAddrRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: start < seg.End().
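// exampleMutateIncreasing is an editorial sketch of the pattern described
// above: when mutating the segments overlapping r in order of increasing
// keys, SplitBefore is only needed for the first segment (folded into
// LowerBoundSegmentSplitBefore, defined below), while SplitAfter is applied
// to every iterated segment so that f only ever sees segments inside r.
// Unlike MutateRange, this sketch does not re-merge the split segments
// afterwards, and f must only mutate values, not the set structure.
func exampleMutateIncreasing(s *DevAddrSet, r DevAddrRange, f func(seg DevAddrIterator)) {
	for seg := s.LowerBoundSegmentSplitBefore(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() {
		seg = s.SplitAfter(seg, r.End)
		f(seg)
	}
}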
func (s *DevAddrSet) SplitBefore(seg DevAddrIterator, start uint64) DevAddrIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *DevAddrSet) SplitAfter(seg DevAddrIterator, end uint64) DevAddrIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *DevAddrSet) Isolate(seg DevAddrIterator, r DevAddrRange) DevAddrIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *DevAddrSet) LowerBoundSegmentSplitBefore(min uint64) DevAddrIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *DevAddrSet) UpperBoundSegmentSplitAfter(max uint64) DevAddrIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. 
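// exampleCountSegments is an editorial sketch using VisitRange (defined just
// below) for a read-only walk: it counts how many segments intersect r
// without splitting or otherwise mutating them.
func exampleCountSegments(s *DevAddrSet, r DevAddrRange) int {
	n := 0
	s.VisitRange(r, func(seg DevAddrIterator) bool {
		n++
		return true // keep iterating
	})
	return n
}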
func (s *DevAddrSet) VisitRange(r DevAddrRange, f func(seg DevAddrIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *DevAddrSet) VisitFullRange(r DevAddrRange, f func(seg DevAddrIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *DevAddrSet) MutateRange(r DevAddrRange, f func(seg DevAddrIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *DevAddrSet) MutateFullRange(r DevAddrRange, f func(seg DevAddrIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type DevAddrnode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *DevAddrnode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. 
maxGap DevAddrdynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [DevAddrmaxDegree - 1]DevAddrRange values [DevAddrmaxDegree - 1]__generics_imported0.PinnedRange children [DevAddrmaxDegree]*DevAddrnode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *DevAddrnode) firstSegment() DevAddrIterator { for n.hasChildren { n = n.children[0] } return DevAddrIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *DevAddrnode) lastSegment() DevAddrIterator { for n.hasChildren { n = n.children[n.nrSegments] } return DevAddrIterator{n, n.nrSegments - 1} } func (n *DevAddrnode) prevSibling() *DevAddrnode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *DevAddrnode) nextSibling() *DevAddrnode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *DevAddrnode) rebalanceBeforeInsert(gap DevAddrGapIterator) DevAddrGapIterator { if n.nrSegments < DevAddrmaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &DevAddrnode{ nrSegments: DevAddrminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &DevAddrnode{ nrSegments: DevAddrminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:DevAddrminDegree-1], n.keys[:DevAddrminDegree-1]) copy(left.values[:DevAddrminDegree-1], n.values[:DevAddrminDegree-1]) copy(right.keys[:DevAddrminDegree-1], n.keys[DevAddrminDegree:]) copy(right.values[:DevAddrminDegree-1], n.values[DevAddrminDegree:]) n.keys[0], n.values[0] = n.keys[DevAddrminDegree-1], n.values[DevAddrminDegree-1] DevAddrzeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:DevAddrminDegree], n.children[:DevAddrminDegree]) copy(right.children[:DevAddrminDegree], n.children[DevAddrminDegree:]) DevAddrzeroNodeSlice(n.children[2:]) for i := 0; i < DevAddrminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if DevAddrtrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < DevAddrminDegree { return DevAddrGapIterator{left, gap.index} } return DevAddrGapIterator{right, gap.index - DevAddrminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[DevAddrminDegree-1], n.values[DevAddrminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &DevAddrnode{ nrSegments: DevAddrminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ 
copy(sibling.keys[:DevAddrminDegree-1], n.keys[DevAddrminDegree:]) copy(sibling.values[:DevAddrminDegree-1], n.values[DevAddrminDegree:]) DevAddrzeroValueSlice(n.values[DevAddrminDegree-1:]) if n.hasChildren { copy(sibling.children[:DevAddrminDegree], n.children[DevAddrminDegree:]) DevAddrzeroNodeSlice(n.children[DevAddrminDegree:]) for i := 0; i < DevAddrminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = DevAddrminDegree - 1 if DevAddrtrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < DevAddrminDegree { return gap } return DevAddrGapIterator{sibling, gap.index - DevAddrminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *DevAddrnode) rebalanceAfterRemove(gap DevAddrGapIterator) DevAddrGapIterator { for { if n.nrSegments >= DevAddrminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= DevAddrminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] devAddrSetFuncs{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if DevAddrtrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return DevAddrGapIterator{n, 0} } if gap.node == n { return DevAddrGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= DevAddrminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) devAddrSetFuncs{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if DevAddrtrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return DevAddrGapIterator{n, n.nrSegments} } return DevAddrGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = 
left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return DevAddrGapIterator{p, gap.index} } if gap.node == right { return DevAddrGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *DevAddrnode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = DevAddrGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) devAddrSetFuncs{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if DevAddrtrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *DevAddrnode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *DevAddrnode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. 
func (n *DevAddrnode) calculateMaxGapLeaf() uint64 { max := DevAddrGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (DevAddrGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. func (n *DevAddrnode) calculateMaxGapInternal() uint64 { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *DevAddrnode) searchFirstLargeEnoughGap(minSize uint64) DevAddrGapIterator { if n.maxGap.Get() < minSize { return DevAddrGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := DevAddrGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *DevAddrnode) searchLastLargeEnoughGap(minSize uint64) DevAddrGapIterator { if n.maxGap.Get() < minSize { return DevAddrGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := DevAddrGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type DevAddrIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *DevAddrnode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg DevAddrIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg DevAddrIterator) Range() DevAddrRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg DevAddrIterator) Start() uint64 { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg DevAddrIterator) End() uint64 { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. 
// - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg DevAddrIterator) SetRangeUnchecked(r DevAddrRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. func (seg DevAddrIterator) SetRange(r DevAddrRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg DevAddrIterator) SetStartUnchecked(start uint64) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg DevAddrIterator) SetStart(start uint64) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg DevAddrIterator) SetEndUnchecked(end uint64) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg DevAddrIterator) SetEnd(end uint64) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg DevAddrIterator) Value() __generics_imported0.PinnedRange { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg DevAddrIterator) ValuePtr() *__generics_imported0.PinnedRange { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. 
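// exampleRetargetValues is an editorial sketch: walk every segment in key
// order and overwrite its value in place. SetValue (defined just below) does
// not invalidate any iterators, so the loop variable stays valid. The
// replacement value is simply whatever the caller passes in.
func exampleRetargetValues(s *DevAddrSet, val __generics_imported0.PinnedRange) {
	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
		seg.SetValue(val)
	}
}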
func (seg DevAddrIterator) SetValue(val __generics_imported0.PinnedRange) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. func (seg DevAddrIterator) PrevSegment() DevAddrIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return DevAddrIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return DevAddrIterator{} } return DevAddrsegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg DevAddrIterator) NextSegment() DevAddrIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return DevAddrIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return DevAddrIterator{} } return DevAddrsegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg DevAddrIterator) PrevGap() DevAddrGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return DevAddrGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg DevAddrIterator) NextGap() DevAddrGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return DevAddrGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg DevAddrIterator) PrevNonEmpty() (DevAddrIterator, DevAddrGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, DevAddrGapIterator{} } return DevAddrIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg DevAddrIterator) NextNonEmpty() (DevAddrIterator, DevAddrGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, DevAddrGapIterator{} } return DevAddrIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. 
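// exampleFirstFreeWindow is an editorial sketch: scan the gaps in key order
// and return the first one at least size long. The {First,Next}LargeEnoughGap
// helpers cannot be used for this in the DevAddr instantiation because
// DevAddrtrackGaps is 0 above, so a plain linear scan over gaps is shown
// instead.
func exampleFirstFreeWindow(s *DevAddrSet, size uint64) (DevAddrRange, bool) {
	for gap := s.FirstGap(); gap.Ok(); gap = gap.NextGap() {
		if gap.Range().Length() >= size {
			return gap.Range(), true
		}
	}
	return DevAddrRange{}, false
}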
type DevAddrGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *DevAddrnode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (gap DevAddrGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap DevAddrGapIterator) Range() DevAddrRange { return DevAddrRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap DevAddrGapIterator) Start() uint64 { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return devAddrSetFuncs{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap DevAddrGapIterator) End() uint64 { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return devAddrSetFuncs{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap DevAddrGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap DevAddrGapIterator) PrevSegment() DevAddrIterator { return DevAddrsegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap DevAddrGapIterator) NextSegment() DevAddrIterator { return DevAddrsegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap DevAddrGapIterator) PrevGap() DevAddrGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return DevAddrGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap DevAddrGapIterator) NextGap() DevAddrGapIterator { seg := gap.NextSegment() if !seg.Ok() { return DevAddrGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap DevAddrGapIterator) NextLargeEnoughGap(minSize uint64) DevAddrGapIterator { if DevAddrtrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. 
func (gap DevAddrGapIterator) nextLargeEnoughGapHelper(minSize uint64) DevAddrGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return DevAddrGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap DevAddrGapIterator) PrevLargeEnoughGap(minSize uint64) DevAddrGapIterator { if DevAddrtrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap DevAddrGapIterator) prevLargeEnoughGapHelper(minSize uint64) DevAddrGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return DevAddrGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func DevAddrsegmentBeforePosition(n *DevAddrnode, i int) DevAddrIterator { for i == 0 { if n.parent == nil { return DevAddrIterator{} } n, i = n.parent, n.parentIndex } return DevAddrIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func DevAddrsegmentAfterPosition(n *DevAddrnode, i int) DevAddrIterator { for i == n.nrSegments { if n.parent == nil { return DevAddrIterator{} } n, i = n.parent, n.parentIndex } return DevAddrIterator{n, i} } func DevAddrzeroValueSlice(slice []__generics_imported0.PinnedRange) { for i := range slice { devAddrSetFuncs{}.ClearValue(&slice[i]) } } func DevAddrzeroNodeSlice(slice []*DevAddrnode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. 
func (s *DevAddrSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. func (n *DevAddrnode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *DevAddrnode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if DevAddrtrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type DevAddrFlatSegment struct { Start uint64 End uint64 Value __generics_imported0.PinnedRange } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *DevAddrSet) ExportSlice() []DevAddrFlatSegment { var fs []DevAddrFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, DevAddrFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *DevAddrSet) ImportSlice(fs []DevAddrFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := DevAddrRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
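// // Editorial sketch (illustrative only; the callback body is hypothetical): a test expecting the set s to hold exactly two segments might invoke it as: // //	err := s.segmentTestCheck(2, func(i int, r DevAddrRange, _ __generics_imported0.PinnedRange) error { //		if r.Length() <= 0 { //			return fmt.Errorf("segment %d has non-positive length", i) //		} //		return nil //	})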
func (s *DevAddrSet) segmentTestCheck(expectedSegments int, segFunc func(int, DevAddrRange, __generics_imported0.PinnedRange) error) error { havePrev := false prev := uint64(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *DevAddrSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *DevAddrSet) saveRoot() []DevAddrFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *DevAddrSet) loadRoot(_ context.Context, fs []DevAddrFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tpuproxy/device.go000066400000000000000000000100701465435605700260700ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tpuproxy import ( "path/filepath" "strconv" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/devutil" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) const ( // VFIO_MINOR is the VFIO minor number from include/linux/miscdevice.h. VFIO_MINOR = 196 // VFIOPath is the path to a VFIO device, it is usually used to // construct a VFIO container. VFIOPath = "/dev/vfio/vfio" tpuDeviceGroupName = "vfio" vfioDeviceGroupName = "vfio" ) // device implements TPU's vfs.Device for /dev/vfio/[0-9]+ // // +stateify savable type tpuDevice struct { mu sync.Mutex // minor is the device minor number. minor uint32 // num is the number of the device in the dev filesystem (e.g /dev/vfio/0). num uint32 } // Open implements vfs.Device.Open. 
func (dev *tpuDevice) Open(ctx context.Context, mnt *vfs.Mount, d *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { devClient := devutil.GoferClientFromContext(ctx) if devClient == nil { log.Warningf("devutil.CtxDevGoferClient is not set") return nil, linuxerr.ENOENT } dev.mu.Lock() defer dev.mu.Unlock() devName := filepath.Join("vfio", strconv.Itoa(int(dev.num))) hostFD, err := devClient.OpenAt(ctx, devName, opts.Flags) if err != nil { ctx.Warningf("tpuDevice: failed to open host %s: %v", devName, err) return nil, err } fd := &tpuFD{ hostFD: int32(hostFD), device: dev, } if err := fd.vfsfd.Init(fd, opts.Flags, mnt, d, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { unix.Close(hostFD) return nil, err } if err := fdnotifier.AddFD(int32(hostFD), &fd.queue); err != nil { unix.Close(hostFD) return nil, err } fd.memmapFile.fd = fd return &fd.vfsfd, nil } // device implements vfs.Device for /dev/vfio/vfio. type vfioDevice struct{} // Open implements vfs.Device.Open. func (dev *vfioDevice) Open(ctx context.Context, mnt *vfs.Mount, d *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { client := devutil.GoferClientFromContext(ctx) if client == nil { log.Warningf("devutil.CtxDevGoferClient is not set") return nil, linuxerr.ENOENT } name := filepath.Join("vfio", filepath.Base(VFIOPath)) hostFD, err := client.OpenAt(ctx, name, opts.Flags) if err != nil { ctx.Warningf("failed to open host file %s: %v", name, err) return nil, err } fd := &vfioFD{ hostFD: int32(hostFD), device: dev, } if err := fd.vfsfd.Init(fd, opts.Flags, mnt, d, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { unix.Close(hostFD) return nil, err } if err := fdnotifier.AddFD(int32(hostFD), &fd.queue); err != nil { unix.Close(hostFD) return nil, err } fd.memmapFile.fd = fd return &fd.vfsfd, nil } // RegisterTPUDevice registers devices implemented by this package in vfsObj. func RegisterTPUDevice(vfsObj *vfs.VirtualFilesystem, minor, deviceNum uint32) error { return vfsObj.RegisterDevice(vfs.CharDevice, linux.VFIO_MAJOR, minor, &tpuDevice{ minor: minor, num: deviceNum, }, &vfs.RegisterDeviceOptions{ GroupName: tpuDeviceGroupName, }) } // RegisterVfioDevice registers VFIO devices that are implemented by this package in vfsObj. func RegisterVfioDevice(vfsObj *vfs.VirtualFilesystem) error { return vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, VFIO_MINOR, &vfioDevice{}, &vfs.RegisterDeviceOptions{ GroupName: vfioDeviceGroupName, }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tpuproxy/ioctl_unsafe.go000066400000000000000000000024621465435605700273120ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License package tpuproxy import ( "unsafe" "golang.org/x/exp/constraints" "golang.org/x/sys/unix" ) // IOCTLInvokePtrArg makes ioctl syscalls with the command of the integer type // and the pointer to any given params. 
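// // Editorial sketch, modeled on how this helper is invoked elsewhere in this package (hostFD and the Argsz initialization are assumptions of the example, not requirements of this function): // //	var info linux.VFIODeviceInfo //	info.Argsz = uint32(info.SizeBytes()) // caller-supplied payload size //	ret, err := IOCTLInvokePtrArg[uint32](hostFD, linux.VFIO_DEVICE_GET_INFO, &info)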
func IOCTLInvokePtrArg[Cmd constraints.Integer, Params any](hostFd int32, cmd Cmd, params *Params) (uintptr, error) { return IOCTLInvoke[Cmd, uintptr](hostFd, cmd, uintptr(unsafe.Pointer(params))) } // IOCTLInvoke makes ioctl syscalls with the arg of the integer type. func IOCTLInvoke[Cmd, Arg constraints.Integer](hostFd int32, cmd Cmd, arg Arg) (uintptr, error) { n, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(hostFd), uintptr(cmd), uintptr(arg)) if errno != 0 { return n, errno } return n, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tpuproxy/seccomp_filter.go000066400000000000000000000062361465435605700276400ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tpuproxy import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" ) // Filters returns seccomp-bpf filters for this package. func Filters() seccomp.SyscallRules { return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_OPENAT: seccomp.PerArg{ // All paths that we openat() are absolute, so we pass a dirfd // of -1 (which is invalid for relative paths, but ignored for // absolute paths) to hedge against bugs involving AT_FDCWD or // real dirfds. seccomp.EqualTo(^uintptr(0)), seccomp.AnyValue{}, seccomp.MaskedEqual(unix.O_CREAT|unix.O_NOFOLLOW, unix.O_NOFOLLOW), seccomp.AnyValue{}, }, unix.SYS_GETDENTS64: seccomp.MatchAll{}, unix.SYS_EVENTFD2: seccomp.Or{ seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(linux.EFD_NONBLOCK), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(linux.EFD_NONBLOCK | linux.EFD_SEMAPHORE), }, }, unix.SYS_MREMAP: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(0), /* old_size */ seccomp.AnyValue{}, seccomp.EqualTo(linux.MREMAP_MAYMOVE | linux.MREMAP_FIXED), seccomp.AnyValue{}, seccomp.EqualTo(0), }, unix.SYS_MMAP: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(linux.PROT_READ | linux.PROT_WRITE), seccomp.EqualTo(linux.MAP_SHARED | linux.MAP_LOCKED), seccomp.NonNegativeFD{}, }, unix.SYS_MUNMAP: seccomp.MatchAll{}, unix.SYS_PREAD64: seccomp.MatchAll{}, unix.SYS_PWRITE64: seccomp.MatchAll{}, unix.SYS_IOCTL: seccomp.Or{ seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.VFIO_CHECK_EXTENSION), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.VFIO_DEVICE_GET_INFO), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.VFIO_DEVICE_GET_REGION_INFO), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.VFIO_DEVICE_GET_IRQ_INFO), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.VFIO_DEVICE_SET_IRQS), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.VFIO_GROUP_GET_DEVICE_FD), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.VFIO_GROUP_SET_CONTAINER), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.VFIO_IOMMU_MAP_DMA), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.VFIO_IOMMU_UNMAP_DMA), }, 
seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.VFIO_SET_IOMMU), }, }, }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tpuproxy/tpu.go000066400000000000000000000365221465435605700254530ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tpuproxy implements proxying for TPU devices. package tpuproxy import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const ( // A value of -1 can be used to either de-assign interrupts if already // assigned or skip un-assigned interrupts. disableInterrupt = -1 ) var ( // vfioDeviceInfoFlags contains all available flags for // IOCTL command VFIO_DEVICE_GET_INFO. vfioDeviceInfoFlags uint32 = linux.VFIO_DEVICE_FLAGS_RESET | linux.VFIO_DEVICE_FLAGS_PCI | linux.VFIO_DEVICE_FLAGS_PLATFORM | linux.VFIO_DEVICE_FLAGS_AMBA | linux.VFIO_DEVICE_FLAGS_CCW | linux.VFIO_DEVICE_FLAGS_AP | linux.VFIO_DEVICE_FLAGS_FSL_MC | linux.VFIO_DEVICE_FLAGS_CAPS | linux.VFIO_DEVICE_FLAGS_CDX // vfioIrqSetFlags includes all available flags for IOCTL command VFIO_DEVICE_SET_IRQS. vfioIrqSetFlags uint32 = linux.VFIO_IRQ_SET_DATA_TYPE_MASK | linux.VFIO_IRQ_SET_ACTION_TYPE_MASK ) // tpuFD implements vfs.FileDescriptionImpl for /dev/vfio/[0-9]+ // // tpuFD is not savable until TPU save/restore is needed. type tpuFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD hostFD int32 device *tpuDevice queue waiter.Queue memmapFile tpuFDMemmapFile } // Release implements vfs.FileDescriptionImpl.Release. func (fd *tpuFD) Release(context.Context) { fdnotifier.RemoveFD(fd.hostFD) fd.queue.Notify(waiter.EventHUp) unix.Close(int(fd.hostFD)) } // EventRegister implements waiter.Waitable.EventRegister. func (fd *tpuFD) EventRegister(e *waiter.Entry) error { fd.queue.EventRegister(e) if err := fdnotifier.UpdateFD(fd.hostFD); err != nil { fd.queue.EventUnregister(e) return err } return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (fd *tpuFD) EventUnregister(e *waiter.Entry) { fd.queue.EventUnregister(e) if err := fdnotifier.UpdateFD(fd.hostFD); err != nil { panic(fmt.Sprint("UpdateFD:", err)) } } // Readiness implements waiter.Waitable.Readiness. func (fd *tpuFD) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(fd.hostFD, mask) } // Epollable implements vfs.FileDescriptionImpl.Epollable. func (fd *tpuFD) Epollable() bool { return true } // Ioctl implements vfs.FileDescriptionImpl.Ioctl.
func (fd *tpuFD) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { cmd := args[1].Uint() t := kernel.TaskFromContext(ctx) if t == nil { panic("Ioctl should be called from a task context") } switch cmd { case linux.VFIO_GROUP_SET_CONTAINER: return fd.setContainer(ctx, t, args[2].Pointer()) case linux.VFIO_GROUP_GET_DEVICE_FD: ret, cleanup, err := fd.getPciDeviceFd(t, args[2].Pointer()) defer cleanup() return ret, err } return 0, linuxerr.ENOSYS } func (fd *tpuFD) setContainer(ctx context.Context, t *kernel.Task, arg hostarch.Addr) (uintptr, error) { var vfioContainerFD int32 if _, err := primitive.CopyInt32In(t, arg, &vfioContainerFD); err != nil { return 0, err } vfioContainerFile, _ := t.FDTable().Get(vfioContainerFD) if vfioContainerFile == nil { return 0, linuxerr.EBADF } defer vfioContainerFile.DecRef(ctx) vfioContainer, ok := vfioContainerFile.Impl().(*vfioFD) if !ok { return 0, linuxerr.EINVAL } return IOCTLInvokePtrArg[uint32](fd.hostFD, linux.VFIO_GROUP_SET_CONTAINER, &vfioContainer.hostFD) } // It will be the caller's responsibility to call the returned cleanup function. func (fd *tpuFD) getPciDeviceFd(t *kernel.Task, arg hostarch.Addr) (uintptr, func(), error) { pciAddress, err := t.CopyInString(arg, hostarch.PageSize) if err != nil { return 0, func() {}, err } // Build a NUL-terminated slice of bytes containing the PCI address. pciAddressBytes, err := unix.ByteSliceFromString(pciAddress) if err != nil { return 0, func() {}, err } // Pass the address of the PCI address' first byte which can be // recognized by the IOCTL syscall. hostFD, err := IOCTLInvokePtrArg[uint32](fd.hostFD, linux.VFIO_GROUP_GET_DEVICE_FD, &pciAddressBytes[0]) if err != nil { return 0, func() {}, err } pciDevFD := &pciDeviceFD{ hostFD: int32(hostFD), } cleanup := func() { unix.Close(int(hostFD)) } // See drivers/vfio/group.c:vfio_device_open_file(), the PCI device // is accessed for both reads and writes. vd := t.Kernel().VFS().NewAnonVirtualDentry("[vfio-device]") if err := pciDevFD.vfsfd.Init(pciDevFD, linux.O_RDWR, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { return 0, cleanup, err } if err := fdnotifier.AddFD(int32(hostFD), &fd.queue); err != nil { return 0, cleanup, err } newFD, err := t.NewFDFrom(0, &pciDevFD.vfsfd, kernel.FDFlags{}) if err != nil { return 0, cleanup, err } // Initialize a mapping that is backed by a host FD. pciDevFD.memmapFile.fd = pciDevFD return uintptr(newFD), func() {}, nil } // pciDeviceFD implements vfs.FileDescriptionImpl for TPU's PCI device. type pciDeviceFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD hostFD int32 queue waiter.Queue memmapFile pciDeviceFdMemmapFile } // Release implements vfs.FileDescriptionImpl.Release. func (fd *pciDeviceFD) Release(context.Context) { fdnotifier.RemoveFD(fd.hostFD) fd.queue.Notify(waiter.EventHUp) unix.Close(int(fd.hostFD)) } // EventRegister implements waiter.Waitable.EventRegister. func (fd *pciDeviceFD) EventRegister(e *waiter.Entry) error { fd.queue.EventRegister(e) if err := fdnotifier.UpdateFD(fd.hostFD); err != nil { fd.queue.EventUnregister(e) return err } return nil } // EventUnregister implements waiter.Waitable.EventUnregister. 
func (fd *pciDeviceFD) EventUnregister(e *waiter.Entry) { fd.queue.EventUnregister(e) if err := fdnotifier.UpdateFD(fd.hostFD); err != nil { panic(fmt.Sprint("UpdateFD:", err)) } } // Readiness implements waiter.Waitable.Readiness. func (fd *pciDeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(fd.hostFD, mask) } // Epollable implements vfs.FileDescriptionImpl.Epollable. func (fd *pciDeviceFD) Epollable() bool { return true } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (fd *pciDeviceFD) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { cmd := args[1].Uint() t := kernel.TaskFromContext(ctx) if t == nil { panic("Ioctl should be called from a task context") } switch cmd { // TODO(b/299303493): consider making VFIO's GET_INFO commands more generic. case linux.VFIO_DEVICE_GET_INFO: return fd.vfioDeviceInfo(ctx, t, args[2].Pointer()) case linux.VFIO_DEVICE_GET_REGION_INFO: return fd.vfioRegionInfo(ctx, t, args[2].Pointer()) case linux.VFIO_DEVICE_GET_IRQ_INFO: return fd.vfioIrqInfo(ctx, t, args[2].Pointer()) case linux.VFIO_DEVICE_SET_IRQS: return fd.vfioSetIrqs(ctx, t, args[2].Pointer()) case linux.VFIO_DEVICE_RESET: // VFIO_DEVICE_RESET is just a simple IOCTL command that carries no data. return IOCTLInvoke[uint32, uintptr](fd.hostFD, linux.VFIO_DEVICE_RESET, 0) } return 0, linuxerr.ENOSYS } // Retrieve the host TPU device's region information, which could be used by // vfio driver to setup mappings. func (fd *pciDeviceFD) vfioRegionInfo(ctx context.Context, t *kernel.Task, arg hostarch.Addr) (uintptr, error) { var regionInfo linux.VFIORegionInfo if _, err := regionInfo.CopyIn(t, arg); err != nil { return 0, err } if regionInfo.Argsz == 0 { return 0, linuxerr.EINVAL } ret, err := IOCTLInvokePtrArg[uint32](fd.hostFD, linux.VFIO_DEVICE_GET_REGION_INFO, ®ionInfo) if err != nil { return 0, err } if _, err := regionInfo.CopyOut(t, arg); err != nil { return 0, err } return ret, nil } // Retrieve the host TPU device's information. func (fd *pciDeviceFD) vfioDeviceInfo(ctx context.Context, t *kernel.Task, arg hostarch.Addr) (uintptr, error) { var deviceInfo linux.VFIODeviceInfo if _, err := deviceInfo.CopyIn(t, arg); err != nil { return 0, err } // Callers must set VFIODeviceInfo.Argsz. if deviceInfo.Argsz == 0 { return 0, linuxerr.EINVAL } if deviceInfo.Flags&^vfioDeviceInfoFlags != 0 { return 0, linuxerr.EINVAL } ret, err := IOCTLInvokePtrArg[uint32](fd.hostFD, linux.VFIO_DEVICE_GET_INFO, &deviceInfo) if err != nil { return 0, err } // gVisor is not supposed to change any device information that is // returned from the host since gVisor doesn't own the device. // Passing the device info back to the caller will be just fine. if _, err := deviceInfo.CopyOut(t, arg); err != nil { return 0, err } return ret, nil } // Retrieve the device's interrupt information. func (fd *pciDeviceFD) vfioIrqInfo(ctx context.Context, t *kernel.Task, arg hostarch.Addr) (uintptr, error) { var irqInfo linux.VFIOIrqInfo if _, err := irqInfo.CopyIn(t, arg); err != nil { return 0, err } // Callers must set the payload's size. 
if irqInfo.Argsz == 0 { return 0, linuxerr.EINVAL } ret, err := IOCTLInvokePtrArg[uint32](fd.hostFD, linux.VFIO_DEVICE_GET_IRQ_INFO, &irqInfo) if err != nil { return 0, err } if _, err := irqInfo.CopyOut(t, arg); err != nil { return 0, err } return ret, nil } func (fd *pciDeviceFD) vfioSetIrqs(ctx context.Context, t *kernel.Task, arg hostarch.Addr) (uintptr, error) { var irqSet linux.VFIOIrqSet if _, err := irqSet.CopyIn(t, arg); err != nil { return 0, err } // Callers must set the payload's size. if irqSet.Argsz == 0 { return 0, linuxerr.EINVAL } // Reject unknown flags. if irqSet.Flags&^vfioIrqSetFlags != 0 { return 0, linuxerr.EINVAL } // See drivers/vfio/vfio_main.c:vfio_set_irqs_validate_and_prepare: // VFIO uses the data type in the request's flags to determine // the memory layout of the data field. // // The struct vfio_irq_set includes a flexible array member, so a // vfio_irq_set object is backed by a single contiguous chunk of // memory. In order to mirror that behavior, gVisor allocates a // slice to store the underlying bytes and passes that through to // its host. switch irqSet.Flags & linux.VFIO_IRQ_SET_DATA_TYPE_MASK { // VFIO_IRQ_SET_DATA_NONE indicates there is no data field for // the IOCTL command. // It works with VFIO_IRQ_SET_ACTION_MASK, VFIO_IRQ_SET_ACTION_UNMASK, // or VFIO_IRQ_SET_ACTION_TRIGGER to mask an interrupt, unmask an // interrupt, or trigger an interrupt unconditionally. case linux.VFIO_IRQ_SET_DATA_NONE: // When there is no data, passing through the given payload // works just fine. return IOCTLInvokePtrArg[uint32](fd.hostFD, linux.VFIO_DEVICE_SET_IRQS, &irqSet) // VFIO_IRQ_SET_DATA_BOOL indicates that the data field is an array of uint8. // The action will be performed if the corresponding boolean is true. case linux.VFIO_IRQ_SET_DATA_BOOL: payloadSize := uint32(irqSet.Size()) + irqSet.Count payload := make([]uint8, payloadSize) if _, err := primitive.CopyUint8SliceIn(t, arg, payload); err != nil { return 0, err } return IOCTLInvokePtrArg[uint32](fd.hostFD, linux.VFIO_DEVICE_SET_IRQS, &payload[0]) // VFIO_IRQ_SET_DATA_EVENTFD indicates that the data field is an array // of int32 (or event file descriptors). These descriptors will be // signalled when an action in the flags happens. case linux.VFIO_IRQ_SET_DATA_EVENTFD: payloadSize := uint32(irqSet.Size())/4 + irqSet.Count payload := make([]int32, payloadSize) if _, err := primitive.CopyInt32SliceIn(t, arg, payload); err != nil { return 0, err } // Transform the input FDs to host FDs. for i := 0; i < int(irqSet.Count); i++ { index := len(payload) - 1 - i fd := payload[index] // Skip non-event FD. if fd == disableInterrupt { continue } eventFileGeneric, _ := t.FDTable().Get(fd) if eventFileGeneric == nil { return 0, linuxerr.EBADF } defer eventFileGeneric.DecRef(ctx) eventFile, ok := eventFileGeneric.Impl().(*eventfd.EventFileDescription) if !ok { return 0, linuxerr.EINVAL } eventfd, err := eventFile.HostFD() if err != nil { return 0, err } payload[index] = int32(eventfd) } return IOCTLInvokePtrArg[uint32](fd.hostFD, linux.VFIO_DEVICE_SET_IRQS, &payload[0]) } // No data type is specified or multiple data types are specified. return 0, linuxerr.EINVAL } // PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *pciDeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if offset < 0 { return 0, linuxerr.EINVAL } buf := make([]byte, dst.NumBytes()) _, err := unix.Pread(int(fd.hostFD), buf, offset) if err != nil { return 0, err } n, err := dst.CopyOut(ctx, buf) return int64(n), err } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *pciDeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { if offset < 0 { return 0, linuxerr.EINVAL } buf := make([]byte, src.NumBytes()) _, err := src.CopyIn(ctx, buf) if err != nil { return 0, err } n, err := unix.Pwrite(int(fd.hostFD), buf, offset) return int64(n), err } // DevAddrSet tracks device address ranges that have been mapped. type devAddrSetFuncs struct{} func (devAddrSetFuncs) MinKey() uint64 { return 0 } func (devAddrSetFuncs) MaxKey() uint64 { return ^uint64(0) } func (devAddrSetFuncs) ClearValue(val *mm.PinnedRange) { *val = mm.PinnedRange{} } func (devAddrSetFuncs) Merge(r1 DevAddrRange, v1 mm.PinnedRange, r2 DevAddrRange, v2 mm.PinnedRange) (mm.PinnedRange, bool) { // Do we have the same backing file? if v1.File != v2.File { return mm.PinnedRange{}, false } // Do we have contiguous offsets in the backing file? if v1.Offset+uint64(v1.Source.Length()) != v2.Offset { return mm.PinnedRange{}, false } // Are the virtual addresses contiguous? // // This check isn't strictly needed because 'mm.PinnedRange.Source' // is only used to track the size of the pinned region (this is // because the virtual address range can be unmapped or remapped // elsewhere). Regardless we require this for simplicity. if v1.Source.End != v2.Source.Start { return mm.PinnedRange{}, false } // Extend v1 to account for the adjacent PinnedRange. v1.Source.End = v2.Source.End return v1, true } func (devAddrSetFuncs) Split(r DevAddrRange, val mm.PinnedRange, split uint64) (mm.PinnedRange, mm.PinnedRange) { n := split - r.Start left := val left.Source.End = left.Source.Start + hostarch.Addr(n) right := val right.Source.Start += hostarch.Addr(n) right.Offset += n return left, right } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tpuproxy/tpu_mmap.go000066400000000000000000000114111465435605700264530ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tpuproxy import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *tpuFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) } // AddMapping implements memmap.Mappable.AddMapping. 
func (fd *tpuFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (fd *tpuFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. func (fd *tpuFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return nil } // Translate implements memmap.Mappable.Translate. func (fd *tpuFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { return []memmap.Translation{ { Source: optional, File: &fd.memmapFile, Offset: optional.Start, Perms: at, }, }, nil } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (fd *tpuFD) InvalidateUnsavable(ctx context.Context) error { return nil } type tpuFDMemmapFile struct { memmap.NoBufferedIOFallback fd *tpuFD } // IncRef implements memmap.File.IncRef. func (mf *tpuFDMemmapFile) IncRef(memmap.FileRange, uint32) { } // DecRef implements memmap.File.DecRef. func (mf *tpuFDMemmapFile) DecRef(fr memmap.FileRange) { } // MapInternal implements memmap.File.MapInternal. func (mf *tpuFDMemmapFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { log.Traceback("tpuproxy: rejecting tpuFdMemmapFile.MapInternal") return safemem.BlockSeq{}, linuxerr.EINVAL } // FD implements memmap.File.FD. func (mf *tpuFDMemmapFile) FD() int { return int(mf.fd.hostFD) } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *pciDeviceFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) } // AddMapping implements memmap.Mappable.AddMapping. func (fd *pciDeviceFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (fd *pciDeviceFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. func (fd *pciDeviceFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return nil } // Translate implements memmap.Mappable.Translate. func (fd *pciDeviceFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { return []memmap.Translation{ { Source: optional, File: &fd.memmapFile, Offset: optional.Start, Perms: at, }, }, nil } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (fd *pciDeviceFD) InvalidateUnsavable(ctx context.Context) error { return nil } type pciDeviceFdMemmapFile struct { memmap.NoBufferedIOFallback fd *pciDeviceFD } // IncRef implements memmap.File.IncRef. func (mf *pciDeviceFdMemmapFile) IncRef(memmap.FileRange, uint32) { } // DecRef implements memmap.File.DecRef. func (mf *pciDeviceFdMemmapFile) DecRef(fr memmap.FileRange) { } // MapInternal implements memmap.File.MapInternal. 
func (mf *pciDeviceFdMemmapFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { log.Traceback("tpuproxy: rejecting pciDeviceFdMemmapFile.MapInternal") return safemem.BlockSeq{}, linuxerr.EINVAL } // FD implements memmap.File.FD. func (mf *pciDeviceFdMemmapFile) FD() int { return int(mf.fd.hostFD) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tpuproxy/tpuproxy_state_autogen.go000066400000000000000000000104071465435605700314710ustar00rootroot00000000000000// automatically generated by stateify. package tpuproxy import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *DevAddrRange) StateTypeName() string { return "pkg/sentry/devices/tpuproxy.DevAddrRange" } func (r *DevAddrRange) StateFields() []string { return []string{ "Start", "End", } } func (r *DevAddrRange) beforeSave() {} // +checklocksignore func (r *DevAddrRange) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Start) stateSinkObject.Save(1, &r.End) } func (r *DevAddrRange) afterLoad(context.Context) {} // +checklocksignore func (r *DevAddrRange) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Start) stateSourceObject.Load(1, &r.End) } func (s *DevAddrSet) StateTypeName() string { return "pkg/sentry/devices/tpuproxy.DevAddrSet" } func (s *DevAddrSet) StateFields() []string { return []string{ "root", } } func (s *DevAddrSet) beforeSave() {} // +checklocksignore func (s *DevAddrSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []DevAddrFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *DevAddrSet) afterLoad(context.Context) {} // +checklocksignore func (s *DevAddrSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]DevAddrFlatSegment), func(y any) { s.loadRoot(ctx, y.([]DevAddrFlatSegment)) }) } func (n *DevAddrnode) StateTypeName() string { return "pkg/sentry/devices/tpuproxy.DevAddrnode" } func (n *DevAddrnode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *DevAddrnode) beforeSave() {} // +checklocksignore func (n *DevAddrnode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *DevAddrnode) afterLoad(context.Context) {} // +checklocksignore func (n *DevAddrnode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (d *DevAddrFlatSegment) StateTypeName() string { return "pkg/sentry/devices/tpuproxy.DevAddrFlatSegment" } func (d *DevAddrFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (d *DevAddrFlatSegment) beforeSave() {} // +checklocksignore func (d *DevAddrFlatSegment) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.Start) stateSinkObject.Save(1, &d.End) stateSinkObject.Save(2, &d.Value) } 
func (d *DevAddrFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (d *DevAddrFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.Start) stateSourceObject.Load(1, &d.End) stateSourceObject.Load(2, &d.Value) } func (dev *tpuDevice) StateTypeName() string { return "pkg/sentry/devices/tpuproxy.tpuDevice" } func (dev *tpuDevice) StateFields() []string { return []string{ "mu", "minor", "num", } } func (dev *tpuDevice) beforeSave() {} // +checklocksignore func (dev *tpuDevice) StateSave(stateSinkObject state.Sink) { dev.beforeSave() stateSinkObject.Save(0, &dev.mu) stateSinkObject.Save(1, &dev.minor) stateSinkObject.Save(2, &dev.num) } func (dev *tpuDevice) afterLoad(context.Context) {} // +checklocksignore func (dev *tpuDevice) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &dev.mu) stateSourceObject.Load(1, &dev.minor) stateSourceObject.Load(2, &dev.num) } func init() { state.Register((*DevAddrRange)(nil)) state.Register((*DevAddrSet)(nil)) state.Register((*DevAddrnode)(nil)) state.Register((*DevAddrFlatSegment)(nil)) state.Register((*tpuDevice)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tpuproxy/tpuproxy_unsafe_state_autogen.go000066400000000000000000000000721465435605700330270ustar00rootroot00000000000000// automatically generated by stateify. package tpuproxy golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tpuproxy/vfio.go000066400000000000000000000175301465435605700256040ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tpuproxy import ( "fmt" "sync" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // deviceFD implements vfs.FileDescriptionImpl for /dev/vfio/vfio. type vfioFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD hostFD int32 device *vfioDevice queue waiter.Queue memmapFile vfioFDMemmapFile mu sync.Mutex // +checklocks:mu devAddrSet DevAddrSet } // Release implements vfs.FileDescriptionImpl.Release. func (fd *vfioFD) Release(context.Context) { fd.unpinRange(DevAddrRange{0, ^uint64(0)}) fdnotifier.RemoveFD(fd.hostFD) fd.queue.Notify(waiter.EventHUp) unix.Close(int(fd.hostFD)) } // EventRegister implements waiter.Waitable.EventRegister. 
func (fd *vfioFD) EventRegister(e *waiter.Entry) error { fd.queue.EventRegister(e) if err := fdnotifier.UpdateFD(fd.hostFD); err != nil { fd.queue.EventUnregister(e) return err } return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (fd *vfioFD) EventUnregister(e *waiter.Entry) { fd.queue.EventUnregister(e) if err := fdnotifier.UpdateFD(fd.hostFD); err != nil { panic(fmt.Sprint("UpdateFD:", err)) } } // Readiness implements waiter.Waitable.Readiness. func (fd *vfioFD) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(fd.hostFD, mask) } // Epollable implements vfs.FileDescriptionImpl.Epollable. func (fd *vfioFD) Epollable() bool { return true } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (fd *vfioFD) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { cmd := args[1].Uint() t := kernel.TaskFromContext(ctx) if t == nil { panic("Ioctl should be called from a task context") } switch cmd { case linux.VFIO_CHECK_EXTENSION: return fd.checkExtension(extension(args[2].Int())) case linux.VFIO_SET_IOMMU: return fd.setIOMMU(extension(args[2].Int())) case linux.VFIO_IOMMU_MAP_DMA: return fd.iommuMapDma(ctx, t, args[2].Pointer()) case linux.VFIO_IOMMU_UNMAP_DMA: return fd.iommuUnmapDma(ctx, t, args[2].Pointer()) } return 0, linuxerr.ENOSYS } // checkExtension returns a positive integer when the given VFIO extension // is supported, otherwise, it returns 0. func (fd *vfioFD) checkExtension(ext extension) (uintptr, error) { switch ext { case linux.VFIO_TYPE1_IOMMU, linux.VFIO_SPAPR_TCE_IOMMU, linux.VFIO_TYPE1v2_IOMMU: ret, err := IOCTLInvoke[uint32, int32](fd.hostFD, linux.VFIO_CHECK_EXTENSION, int32(ext)) if err != nil { log.Warningf("check VFIO extension %s: %v", ext, err) return 0, err } return ret, nil } return 0, linuxerr.EINVAL } // Set the iommu to the given type. The type must be supported by an iommu // driver as verified by calling VFIO_CHECK_EXTENSION using the same type. func (fd *vfioFD) setIOMMU(ext extension) (uintptr, error) { switch ext { case linux.VFIO_TYPE1_IOMMU, linux.VFIO_SPAPR_TCE_IOMMU, linux.VFIO_TYPE1v2_IOMMU: ret, err := IOCTLInvoke[uint32, int32](fd.hostFD, linux.VFIO_SET_IOMMU, int32(ext)) if err != nil { log.Warningf("set the IOMMU group to %s: %v", ext, err) return 0, err } return ret, nil } return 0, linuxerr.EINVAL } func (fd *vfioFD) iommuMapDma(ctx context.Context, t *kernel.Task, arg hostarch.Addr) (uintptr, error) { var dmaMap linux.VFIOIommuType1DmaMap if _, err := dmaMap.CopyIn(t, arg); err != nil { return 0, err } tmm := t.MemoryManager() ar, ok := tmm.CheckIORange(hostarch.Addr(dmaMap.Vaddr), int64(dmaMap.Size)) if !ok { return 0, linuxerr.EFAULT } if !ar.IsPageAligned() || (dmaMap.Size/hostarch.PageSize) == 0 { return 0, linuxerr.EINVAL } // See comments at pkg/sentry/devices/accel/gasket.go, line 57-60. devAddr := dmaMap.IOVa devAddr &^= (hostarch.PageSize - 1) devar := DevAddrRange{ devAddr, devAddr + dmaMap.Size, } if !devar.WellFormed() { return 0, linuxerr.EINVAL } // Reserve a range in the address space. m, _, errno := unix.RawSyscall6(unix.SYS_MMAP, 0 /* addr */, uintptr(ar.Length()), unix.PROT_NONE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS, ^uintptr(0), 0) if errno != 0 { return 0, errno } cu := cleanup.Make(func() { unix.RawSyscall(unix.SYS_MUNMAP, m, uintptr(ar.Length()), 0) }) defer cu.Clean() // Mirror application mappings into the reserved range. 
prs, err := t.MemoryManager().Pin(ctx, ar, hostarch.ReadWrite, false) cu.Add(func() { mm.Unpin(prs) }) if err != nil { return 0, err } sentryAddr := uintptr(m) for _, pr := range prs { ims, err := pr.File.MapInternal(memmap.FileRange{Start: pr.Offset, End: pr.Offset + uint64(pr.Source.Length())}, hostarch.ReadWrite) if err != nil { return 0, err } for !ims.IsEmpty() { im := ims.Head() if _, _, errno := unix.RawSyscall6(unix.SYS_MREMAP, im.Addr(), 0, uintptr(im.Len()), linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED, sentryAddr, 0); errno != 0 { return 0, errno } sentryAddr += uintptr(im.Len()) ims = ims.Tail() } } // Replace Vaddr with the host's virtual address. dmaMap.Vaddr = uint64(m) n, err := IOCTLInvokePtrArg[uint32](fd.hostFD, linux.VFIO_IOMMU_MAP_DMA, &dmaMap) if err != nil { return n, err } cu.Release() // Unmap the reserved range, which is no longer required. unix.RawSyscall(unix.SYS_MUNMAP, m, uintptr(ar.Length()), 0) fd.mu.Lock() defer fd.mu.Unlock() dar := devAddr for _, pr := range prs { r := uint64(pr.Source.Length()) fd.devAddrSet.InsertRange(DevAddrRange{ dar, dar + r, }, pr) dar += r } return n, nil } func (fd *vfioFD) iommuUnmapDma(ctx context.Context, t *kernel.Task, arg hostarch.Addr) (uintptr, error) { var dmaUnmap linux.VFIOIommuType1DmaUnmap if _, err := dmaUnmap.CopyIn(t, arg); err != nil { return 0, err } if dmaUnmap.Flags&linux.VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP != 0 { // VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is not used by libtpu for // TPU workloads running under gVisor. return 0, linuxerr.ENOSYS } n, err := IOCTLInvokePtrArg[uint32](fd.hostFD, linux.VFIO_IOMMU_UNMAP_DMA, &dmaUnmap) if err != nil { return 0, err } if _, err := dmaUnmap.CopyOut(t, arg); err != nil { return 0, err } r := DevAddrRange{Start: dmaUnmap.IOVa, End: dmaUnmap.IOVa + dmaUnmap.Size} fd.unpinRange(r) return n, nil } func (fd *vfioFD) unpinRange(r DevAddrRange) { fd.mu.Lock() defer fd.mu.Unlock() seg := fd.devAddrSet.LowerBoundSegment(r.Start) for seg.Ok() && seg.Start() < r.End { seg = fd.devAddrSet.Isolate(seg, r) mm.Unpin([]mm.PinnedRange{seg.Value()}) gap := fd.devAddrSet.Remove(seg) seg = gap.NextSegment() } } // VFIO extension. type extension int32 // String implements fmt.Stringer for VFIO extension string representation. func (e extension) String() string { switch e { case linux.VFIO_TYPE1_IOMMU: return "VFIO_TYPE1_IOMMU" case linux.VFIO_SPAPR_TCE_IOMMU: return "VFIO_SPAPR_TCE_IOMMU" case linux.VFIO_TYPE1v2_IOMMU: return "VFIO_TYPE1v2_IOMMU" } return "" } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tpuproxy/vfio_mmap.go000066400000000000000000000054361465435605700266170ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
package tpuproxy import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *vfioFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) } // AddMapping implements memmap.Mappable.AddMapping. func (fd *vfioFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (fd *vfioFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. func (fd *vfioFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return nil } // Translate implements memmap.Mappable.Translate. func (fd *vfioFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { return []memmap.Translation{ { Source: optional, File: &fd.memmapFile, Offset: optional.Start, Perms: at, }, }, nil } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (fd *vfioFD) InvalidateUnsavable(ctx context.Context) error { return nil } type vfioFDMemmapFile struct { memmap.NoBufferedIOFallback fd *vfioFD } // IncRef implements memmap.File.IncRef. func (mf *vfioFDMemmapFile) IncRef(memmap.FileRange, uint32) { } // DecRef implements memmap.File.DecRef. func (mf *vfioFDMemmapFile) DecRef(fr memmap.FileRange) { } // MapInternal implements memmap.File.MapInternal. func (mf *vfioFDMemmapFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { log.Traceback("tpuproxy: rejecting vfioFdMemmapFile.MapInternal") return safemem.BlockSeq{}, linuxerr.EINVAL } // FD implements memmap.File.FD. func (mf *vfioFDMemmapFile) FD() int { return int(mf.fd.hostFD) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/ttydev/000077500000000000000000000000001465435605700237115ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/ttydev/ttydev.go000066400000000000000000000030011465435605700255510ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package ttydev implements an unopenable vfs.Device for /dev/tty. package ttydev import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/vfs" ) const ( // See drivers/tty/tty_io.c:tty_init(). ttyDevMinor = 0 consoleDevMinor = 1 ) // ttyDevice implements vfs.Device for /dev/tty. // // +stateify savable type ttyDevice struct{} // Open implements vfs.Device.Open. 
func (ttyDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, linuxerr.EIO } // Register registers all devices implemented by this package in vfsObj. func Register(vfsObj *vfs.VirtualFilesystem) error { return vfsObj.RegisterDevice(vfs.CharDevice, linux.TTYAUX_MAJOR, ttyDevMinor, ttyDevice{}, &vfs.RegisterDeviceOptions{ GroupName: "tty", Pathname: "tty", FilePerms: 0666, }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/ttydev/ttydev_state_autogen.go000066400000000000000000000011551465435605700305030ustar00rootroot00000000000000// automatically generated by stateify. package ttydev import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (t *ttyDevice) StateTypeName() string { return "pkg/sentry/devices/ttydev.ttyDevice" } func (t *ttyDevice) StateFields() []string { return []string{} } func (t *ttyDevice) beforeSave() {} // +checklocksignore func (t *ttyDevice) StateSave(stateSinkObject state.Sink) { t.beforeSave() } func (t *ttyDevice) afterLoad(context.Context) {} // +checklocksignore func (t *ttyDevice) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func init() { state.Register((*ttyDevice)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tundev/000077500000000000000000000000001465435605700236775ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tundev/tundev.go000066400000000000000000000127651465435605700255460ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tundev implements the /dev/net/tun device. package tundev import ( "io" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/tcpip/link/tun" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const ( netTunDevMajor = 10 netTunDevMinor = 200 ) // tunDevice implements vfs.Device for /dev/net/tun. // // +stateify savable type tunDevice struct{} // Open implements vfs.Device.Open. func (tunDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &tunFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } // tunFD implements vfs.FileDescriptionImpl for /dev/net/tun. // // +stateify savable type tunFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD device tun.Device } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. 
func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { request := args[1].Uint() data := args[2].Pointer() t := kernel.TaskFromContext(ctx) if t == nil { panic("Ioctl should be called from a task context") } switch request { case linux.TUNSETIFF: if !t.HasCapability(linux.CAP_NET_ADMIN) { return 0, linuxerr.EPERM } stack, ok := t.NetworkContext().(*netstack.Stack) if !ok { return 0, linuxerr.EINVAL } var req linux.IFReq if _, err := req.CopyIn(t, data); err != nil { return 0, err } // Validate flags. flags, err := netstack.LinuxToTUNFlags(hostarch.ByteOrder.Uint16(req.Data[:])) if err != nil { return 0, err } return 0, fd.device.SetIff(stack.Stack, req.Name(), flags) case linux.TUNGETIFF: var req linux.IFReq copy(req.IFName[:], fd.device.Name()) hostarch.ByteOrder.PutUint16(req.Data[:], netstack.TUNFlagsToLinux(fd.device.Flags())) _, err := req.CopyOut(t, data) return 0, err default: return 0, linuxerr.ENOTTY } } // Release implements vfs.FileDescriptionImpl.Release. func (fd *tunFD) Release(ctx context.Context) { fd.device.Release(ctx) } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *tunFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return fd.Read(ctx, dst, opts) } // Read implements vfs.FileDescriptionImpl.Read. func (fd *tunFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { data, err := fd.device.Read() if err != nil { return 0, err } defer data.Release() size := data.Size() n, err := io.CopyN(dst.Writer(ctx), data, dst.NumBytes()) if n > 0 && n < int64(size) { // Not an error for partial copying. Packet truncated. err = nil } return int64(n), err } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *tunFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return fd.Write(ctx, src, opts) } // Write implements vfs.FileDescriptionImpl.Write. func (fd *tunFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { if src.NumBytes() == 0 { return 0, unix.EINVAL } mtu, err := fd.device.MTU() if err != nil { return 0, err } if int64(mtu) < src.NumBytes() { return 0, unix.EMSGSIZE } data := buffer.NewView(int(src.NumBytes())) defer data.Release() if _, err := io.CopyN(data, src.Reader(ctx), src.NumBytes()); err != nil { return 0, err } return fd.device.Write(data) } // Readiness implements watier.Waitable.Readiness. func (fd *tunFD) Readiness(mask waiter.EventMask) waiter.EventMask { return fd.device.Readiness(mask) } // EventRegister implements watier.Waitable.EventRegister. func (fd *tunFD) EventRegister(e *waiter.Entry) error { fd.device.EventRegister(e) return nil } // EventUnregister implements watier.Waitable.EventUnregister. func (fd *tunFD) EventUnregister(e *waiter.Entry) { fd.device.EventUnregister(e) } // Epollable implements FileDescriptionImpl.Epollable. func (fd *tunFD) Epollable() bool { return true } // IsNetTunSupported returns whether /dev/net/tun device is supported for s. func IsNetTunSupported(s inet.Stack) bool { _, ok := s.(*netstack.Stack) return ok } // Register registers all devices implemented by this package in vfsObj. 
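//
// Registration normally happens once while the VFS is being constructed. A
// hedged sketch (vfsObj setup and node creation are assumed, not shown here):
//
//	if err := tundev.Register(vfsObj); err != nil {
//		return fmt.Errorf("registering /dev/net/tun: %w", err)
//	}
//	// The "net/tun" pathname and 0666 mode below are what a devtmpfs-style
//	// mount would use when materializing the device node.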
func Register(vfsObj *vfs.VirtualFilesystem) error { return vfsObj.RegisterDevice(vfs.CharDevice, netTunDevMajor, netTunDevMinor, tunDevice{}, &vfs.RegisterDeviceOptions{ Pathname: "net/tun", FilePerms: 0666, }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/devices/tundev/tundev_state_autogen.go000066400000000000000000000032131465435605700304540ustar00rootroot00000000000000// automatically generated by stateify. package tundev import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (t *tunDevice) StateTypeName() string { return "pkg/sentry/devices/tundev.tunDevice" } func (t *tunDevice) StateFields() []string { return []string{} } func (t *tunDevice) beforeSave() {} // +checklocksignore func (t *tunDevice) StateSave(stateSinkObject state.Sink) { t.beforeSave() } func (t *tunDevice) afterLoad(context.Context) {} // +checklocksignore func (t *tunDevice) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fd *tunFD) StateTypeName() string { return "pkg/sentry/devices/tundev.tunFD" } func (fd *tunFD) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", "device", } } func (fd *tunFD) beforeSave() {} // +checklocksignore func (fd *tunFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &fd.NoLockFD) stateSinkObject.Save(4, &fd.device) } func (fd *tunFD) afterLoad(context.Context) {} // +checklocksignore func (fd *tunFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &fd.NoLockFD) stateSourceObject.Load(4, &fd.device) } func init() { state.Register((*tunDevice)(nil)) state.Register((*tunFD)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fdimport/000077500000000000000000000000001465435605700225745ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fdimport/fdimport.go000066400000000000000000000072001465435605700247460ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package fdimport provides the Import function. package fdimport import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Import imports a map of FDs into the given FDTable. If console is true, // sets up TTY for sentry stdin, stdout, and stderr FDs. Used FDs are either // closed or released. It's safe for the caller to close any remaining files // upon return. 
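//
// A hedged sketch of a typical stdio import for a container (variable names
// and values are assumptions for illustration):
//
//	stdio := map[int]*fd.FD{0: stdin, 1: stdout, 2: stderr}
//	tty, err := Import(ctx, fdTable, true /* console */, auth.NoID, auth.NoID, stdio, "ctr1")
//	// With console set, FDs 0-2 share a single TTYFileDescription so they
//	// agree on terminal state such as the foreground process group.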
func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, uid auth.KUID, gid auth.KGID, fds map[int]*fd.FD, containerName string) (*host.TTYFileDescription, error) { k := kernel.KernelFromContext(ctx) if k == nil { return nil, fmt.Errorf("cannot find kernel from context") } mnt := k.HostMount() // Collect host fds and flags. Do this before importing any fds because // multiple fds may refer to the same file, and importing fds may // change flags. fdFlags := make(map[*fd.FD]kernel.FDFlags) for _, hostFD := range fds { var err error fdFlags[hostFD], err = getFDFlags(hostFD) if err != nil { return nil, err } } var ttyFile *vfs.FileDescription for appFD, hostFD := range fds { fdOpts := host.NewFDOptions{ Savable: true, } if uid != auth.NoID || gid != auth.NoID { fdOpts.VirtualOwner = true fdOpts.UID = uid fdOpts.GID = gid } var appFile *vfs.FileDescription fdOpts.RestoreKey = host.MakeRestoreID(containerName, appFD) if console && appFD < 3 { // Import the file as a host TTY file. if ttyFile == nil { fdOpts.IsTTY = true var err error appFile, err = host.NewFD(ctx, mnt, hostFD.FD(), &fdOpts) if err != nil { return nil, err } defer appFile.DecRef(ctx) hostFD.Release() // FD is transferred to host FD. // Remember this in the TTY file, as we will use it for the other stdio // FDs. ttyFile = appFile } else { // Re-use the existing TTY file, as all three stdio FDs must point to // the same fs.File in order to share TTY state, specifically the // foreground process group id. appFile = ttyFile } } else { var err error appFile, err = host.NewFD(ctx, mnt, hostFD.FD(), &fdOpts) if err != nil { return nil, err } defer appFile.DecRef(ctx) hostFD.Release() // FD is transferred to host FD. } df, err := fdTable.NewFDAt(ctx, int32(appFD), appFile, fdFlags[hostFD]) if err != nil { return nil, err } if df != nil { df.DecRef(ctx) return nil, fmt.Errorf("app FD %d displaced while importing FDs", appFD) } } if ttyFile == nil { return nil, nil } return ttyFile.Impl().(*host.TTYFileDescription), nil } func getFDFlags(f *fd.FD) (kernel.FDFlags, error) { fdflags, _, errno := unix.Syscall(unix.SYS_FCNTL, uintptr(f.FD()), unix.F_GETFD, 0) if errno != 0 { return kernel.FDFlags{}, fmt.Errorf("failed to get fd flags for fd %d (errno=%d)", f, errno) } return kernel.FDFlags{ CloseOnExec: fdflags&unix.O_CLOEXEC != 0, }, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fdimport/fdimport_state_autogen.go000066400000000000000000000000721465435605700276700ustar00rootroot00000000000000// automatically generated by stateify. package fdimport golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/000077500000000000000000000000001465435605700222425ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/000077500000000000000000000000001465435605700240725ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/base.go000066400000000000000000000425161465435605700253430ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package cgroupfs import ( "bytes" "fmt" "sort" "strconv" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) // controllerCommon implements kernel.CgroupController. // // Must call init before use. // // +stateify savable type controllerCommon struct { ty kernel.CgroupControllerType fs *filesystem // parent is the parent controller if any. Immutable. // // Note that we don't have to update this on renames, since cgroup // directories can't be moved to a different parent directory. parent controller } func (c *controllerCommon) init(ty kernel.CgroupControllerType, fs *filesystem) { c.ty = ty c.fs = fs } func (c *controllerCommon) cloneFromParent(parent controller) { c.ty = parent.Type() c.fs = parent.Filesystem() c.parent = parent } // Filesystem implements controller.Filesystem. func (c *controllerCommon) Filesystem() *filesystem { return c.fs } // Type implements kernel.CgroupController.Type. func (c *controllerCommon) Type() kernel.CgroupControllerType { return kernel.CgroupControllerType(c.ty) } // HierarchyID implements kernel.CgroupController.HierarchyID. func (c *controllerCommon) HierarchyID() uint32 { return c.fs.hierarchyID } // NumCgroups implements kernel.CgroupController.NumCgroups. func (c *controllerCommon) NumCgroups() uint64 { return c.fs.numCgroups.Load() } // Enabled implements kernel.CgroupController.Enabled. // // Controllers are currently always enabled. func (c *controllerCommon) Enabled() bool { return true } // EffectiveRootCgroup implements kernel.CgroupController.EffectiveRootCgroup. func (c *controllerCommon) EffectiveRootCgroup() kernel.Cgroup { return c.fs.effectiveRootCgroup() } // controller is an interface for common functionality related to all cgroups. // It is an extension of the public cgroup interface, containing cgroup // functionality private to cgroupfs. type controller interface { kernel.CgroupController // Filesystem returns the cgroupfs filesystem backing this controller. Filesystem() *filesystem // Clone creates a new controller based on the internal state of this // controller. This is used to initialize a sub-cgroup based on the state of // the parent. Clone() controller // AddControlFiles should extend the contents map with inodes representing // control files defined by this controller. AddControlFiles(ctx context.Context, creds *auth.Credentials, c *cgroupInode, contents map[string]kernfs.Inode) // Enter is called when a task initially moves into a cgroup. This is // distinct from migration because the task isn't migrating away from a // cgroup. Enter is called when a task is created and joins its initial // cgroup, or when cgroupfs is mounted and existing tasks are moved into // cgroups. Enter(t *kernel.Task) // Leave is called when a task leaves a cgroup. This is distinct from // migration because the task isn't migrating to another cgroup. Leave is // called when a task exits. Leave(t *kernel.Task) // PrepareMigrate signals the controller that a migration is about to // happen. The controller should check for any conditions that would prevent // the migration. 
If PrepareMigrate succeeds, the controller must // unconditionally either accept the migration via CommitMigrate, or roll it // back via AbortMigrate. // // Postcondition: If PrepareMigrate returns nil, caller must resolve the // migration by calling either CommitMigrate or AbortMigrate. PrepareMigrate(t *kernel.Task, src controller) error // CommitMigrate completes an in-flight migration. // // Precondition: Caller must call a corresponding PrepareMigrate. CommitMigrate(t *kernel.Task, src controller) // AbortMigrate cancels an in-flight migration. // // Precondition: Caller must call a corresponding PrepareMigrate. AbortMigrate(t *kernel.Task, src controller) // Charge charges a controller for a particular resource. The implementation // should panic if passed a resource type they do not control. Charge(t *kernel.Task, d *kernfs.Dentry, res kernel.CgroupResourceType, value int64) error } // cgroupInode implements kernel.CgroupImpl and kernfs.Inode. // // +stateify savable type cgroupInode struct { dir // id is the id of this cgroup. id uint32 // controllers is the set of controllers for this cgroup. This is used to // store controller-specific state per cgroup. The set of controllers should // match the controllers for this hierarchy as tracked by the filesystem // object. Immutable. controllers map[kernel.CgroupControllerType]controller // ts is the list of tasks in this cgroup. The kernel is responsible for // removing tasks from this list before they're destroyed, so any tasks on // this list are always valid. // // ts, and cgroup membership in general is protected by fs.tasksMu. ts map[*kernel.Task]struct{} } var _ kernel.CgroupImpl = (*cgroupInode)(nil) func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credentials, parent *cgroupInode, mode linux.FileMode) kernfs.Inode { c := &cgroupInode{ dir: dir{fs: fs}, ts: make(map[*kernel.Task]struct{}), controllers: make(map[kernel.CgroupControllerType]controller), } c.dir.cgi = c k := kernel.KernelFromContext(ctx) r := k.CgroupRegistry() // Assign id for the cgroup. cid, err := r.NextCgroupID() if err != nil { log.Warningf("cgroupfs newCgroupInode: Failed to assign id to the cgroup: %v", err) } c.id = cid r.AddCgroup(c) contents := make(map[string]kernfs.Inode) contents["cgroup.procs"] = fs.newControllerWritableFile(ctx, creds, &cgroupProcsData{c}, false) contents["tasks"] = fs.newControllerWritableFile(ctx, creds, &tasksData{c}, false) if parent != nil { for ty, ctl := range parent.controllers { new := ctl.Clone() c.controllers[ty] = new new.AddControlFiles(ctx, creds, c, contents) } } else { for _, ctl := range fs.controllers { // Uniqueness of controllers enforced by the filesystem on // creation. The root cgroup uses the controllers directly from the // filesystem. c.controllers[ctl.Type()] = ctl ctl.AddControlFiles(ctx, creds, c, contents) } } c.dir.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), mode) c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) c.dir.IncLinks(c.dir.OrderedChildren.Populate(contents)) fs.numCgroups.Add(1) return c } // HierarchyID implements kernel.CgroupImpl.HierarchyID. func (c *cgroupInode) HierarchyID() uint32 { return c.fs.hierarchyID } // Name implements kernel.CgroupImpl.Name. func (c *cgroupInode) Name() string { return c.fs.hierarchyName } // Controllers implements kernel.CgroupImpl.Controllers. 
func (c *cgroupInode) Controllers() []kernel.CgroupController { return c.fs.kcontrollers } // tasks returns a snapshot of the tasks inside the cgroup. func (c *cgroupInode) tasks() []*kernel.Task { c.fs.tasksMu.RLock() defer c.fs.tasksMu.RUnlock() ts := make([]*kernel.Task, 0, len(c.ts)) for t := range c.ts { ts = append(ts, t) } return ts } // Enter implements kernel.CgroupImpl.Enter. func (c *cgroupInode) Enter(t *kernel.Task) { c.fs.tasksMu.Lock() defer c.fs.tasksMu.Unlock() c.ts[t] = struct{}{} for _, ctl := range c.controllers { ctl.Enter(t) } } // Leave implements kernel.CgroupImpl.Leave. func (c *cgroupInode) Leave(t *kernel.Task) { c.fs.tasksMu.Lock() defer c.fs.tasksMu.Unlock() for _, ctl := range c.controllers { ctl.Leave(t) } delete(c.ts, t) } // PrepareMigrate implements kernel.CgroupImpl.PrepareMigrate. func (c *cgroupInode) PrepareMigrate(t *kernel.Task, src *kernel.Cgroup) error { prepared := make([]controller, 0, len(c.controllers)) rollback := func() { for _, p := range prepared { c.controllers[p.Type()].AbortMigrate(t, p) } } for srcType, srcCtl := range src.CgroupImpl.(*cgroupInode).controllers { ctl := c.controllers[srcType] if err := ctl.PrepareMigrate(t, srcCtl); err != nil { rollback() return err } prepared = append(prepared, srcCtl) } return nil } // CommitMigrate implements kernel.CgroupImpl.CommitMigrate. func (c *cgroupInode) CommitMigrate(t *kernel.Task, src *kernel.Cgroup) { c.fs.tasksMu.Lock() defer c.fs.tasksMu.Unlock() for srcType, srcCtl := range src.CgroupImpl.(*cgroupInode).controllers { c.controllers[srcType].CommitMigrate(t, srcCtl) } srcI := src.CgroupImpl.(*cgroupInode) delete(srcI.ts, t) c.ts[t] = struct{}{} } // AbortMigrate implements kernel.CgroupImpl.AbortMigrate. func (c *cgroupInode) AbortMigrate(t *kernel.Task, src *kernel.Cgroup) { for srcType, srcCtl := range src.CgroupImpl.(*cgroupInode).controllers { c.controllers[srcType].AbortMigrate(t, srcCtl) } } // CgroupFromControlFileFD returns a cgroup object given a control file FD for the cgroup. func (c *cgroupInode) CgroupFromControlFileFD(fd *vfs.FileDescription) kernel.Cgroup { controlFileDentry := fd.Dentry().Impl().(*kernfs.Dentry) // The returned parent dentry remains valid without holding locks because in // cgroupfs, the parent directory relationship of a control file is // effectively immutable. Control files cannot be unlinked, renamed or // destroyed independently from their parent directory. parentD := controlFileDentry.Parent() return kernel.Cgroup{ Dentry: parentD, CgroupImpl: c, } } // Charge implements kernel.CgroupImpl.Charge. // // Charge notifies a matching controller of a change in resource usage. Due to // the uniqueness of controllers, at most one controller will match. If no // matching controller is present in this directory, the call silently // succeeds. The caller should call Charge on all hierarchies to ensure any // matching controller across the entire system is charged. func (c *cgroupInode) Charge(t *kernel.Task, d *kernfs.Dentry, ctlType kernel.CgroupControllerType, res kernel.CgroupResourceType, value int64) error { c.fs.tasksMu.RLock() defer c.fs.tasksMu.RUnlock() if ctl, ok := c.controllers[ctlType]; ok { return ctl.Charge(t, d, res, value) } return nil } // ReadControl implements kernel.CgroupImpl.ReadControl. 
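//
// ReadControl lets non-task (background) sentry code read a control file by
// name, subject to AllowBackgroundAccess. A hedged sketch, where cgi is a
// *cgroupInode obtained elsewhere and the control-file name is illustrative:
//
//	val, err := cgi.ReadControl(ctx, "pids.current")
//	// On success val holds the file contents as a string, e.g. "42\n".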
func (c *cgroupInode) ReadControl(ctx context.Context, name string) (string, error) { cfi, err := c.Lookup(ctx, name) if err != nil { return "", fmt.Errorf("no such control file") } cbf, ok := cfi.(controllerFileImpl) if !ok { return "", fmt.Errorf("no such control file") } if !cbf.AllowBackgroundAccess() { return "", fmt.Errorf("this control may not be accessed from a background context") } var buf bytes.Buffer err = cbf.Source().Data().Generate(ctx, &buf) return buf.String(), err } // WriteControl implements kernel.CgroupImpl.WriteControl. func (c *cgroupInode) WriteControl(ctx context.Context, name string, value string) error { cfi, err := c.Lookup(ctx, name) if err != nil { return fmt.Errorf("no such control file") } // Do the more general cast first so we can give a meaningful error message when // the control file exists, but isn't accessible (either due to being // unwritable, or not being available from a background context). cbf, ok := cfi.(controllerFileImpl) if !ok { return fmt.Errorf("no such control file") } if !cbf.AllowBackgroundAccess() { return fmt.Errorf("this control may not be accessed from a background context") } wcbf, ok := cfi.(writableControllerFileImpl) if !ok { return fmt.Errorf("control file not writable") } ioSeq := usermem.BytesIOSequence([]byte(value)) n, err := wcbf.WriteBackground(ctx, ioSeq) if err != nil { return err } if n != int64(len(value)) { return fmt.Errorf("short write") } return nil } // ID implements kernel.CgroupImpl.ID. func (c *cgroupInode) ID() uint32 { return c.id } func sortTIDs(tids []kernel.ThreadID) { sort.Slice(tids, func(i, j int) bool { return tids[i] < tids[j] }) } // +stateify savable type cgroupProcsData struct { *cgroupInode } // Generate implements vfs.DynamicBytesSource.Generate. func (d *cgroupProcsData) Generate(ctx context.Context, buf *bytes.Buffer) error { t := kernel.TaskFromContext(ctx) currPidns := t.ThreadGroup().PIDNamespace() pgids := make(map[kernel.ThreadID]struct{}) for _, task := range d.tasks() { // Map dedups pgid, since iterating over all tasks produces multiple // entries for the group leaders. if pgid := currPidns.IDOfThreadGroup(task.ThreadGroup()); pgid != 0 { pgids[pgid] = struct{}{} } } pgidList := make([]kernel.ThreadID, 0, len(pgids)) for pgid := range pgids { pgidList = append(pgidList, pgid) } sortTIDs(pgidList) for _, pgid := range pgidList { fmt.Fprintf(buf, "%d\n", pgid) } return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *cgroupProcsData) Write(ctx context.Context, fd *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { tgid, n, err := parseInt64FromString(ctx, src) if err != nil { return n, err } t := kernel.TaskFromContext(ctx) currPidns := t.ThreadGroup().PIDNamespace() var targetTG *kernel.ThreadGroup if tgid != 0 { targetTG = currPidns.ThreadGroupWithID(kernel.ThreadID(tgid)) } else { targetTG = t.ThreadGroup() } if targetTG == nil { return 0, linuxerr.EINVAL } return n, targetTG.MigrateCgroup(d.CgroupFromControlFileFD(fd)) } // +stateify savable type tasksData struct { *cgroupInode } // Generate implements vfs.DynamicBytesSource.Generate. 
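//
// The generated "tasks" contents are one thread ID per line, resolved in the
// reading task's PID namespace and sorted ascending, for example
// (values illustrative):
//
//	1
//	7
//	23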
func (d *tasksData) Generate(ctx context.Context, buf *bytes.Buffer) error { t := kernel.TaskFromContext(ctx) currPidns := t.ThreadGroup().PIDNamespace() var pids []kernel.ThreadID for _, task := range d.tasks() { if pid := currPidns.IDOfTask(task); pid != 0 { pids = append(pids, pid) } } sortTIDs(pids) for _, pid := range pids { fmt.Fprintf(buf, "%d\n", pid) } return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *tasksData) Write(ctx context.Context, fd *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { tid, n, err := parseInt64FromString(ctx, src) if err != nil { return n, err } t := kernel.TaskFromContext(ctx) currPidns := t.ThreadGroup().PIDNamespace() var targetTask *kernel.Task if tid != 0 { targetTask = currPidns.TaskWithID(kernel.ThreadID(tid)) } else { targetTask = t } if targetTask == nil { return 0, linuxerr.EINVAL } return n, targetTask.MigrateCgroup(d.CgroupFromControlFileFD(fd)) } // parseInt64FromString interprets src as string encoding a int64 value, and // returns the parsed value. func parseInt64FromString(ctx context.Context, src usermem.IOSequence) (val, len int64, err error) { const maxInt64StrLen = 20 // i.e. len(fmt.Sprintf("%d", math.MinInt64)) == 20 buf := copyScratchBufferFromContext(ctx, maxInt64StrLen) n, err := src.CopyIn(ctx, buf) if err != nil { return 0, int64(n), err } str := strings.TrimSpace(string(buf[:n])) val, err = strconv.ParseInt(str, 10, 64) if err != nil { // Note: This also handles zero-len writes if offset is beyond the end // of src, or src is empty. ctx.Debugf("cgroupfs.parseInt64FromString: failed to parse %q: %v", str, err) return 0, int64(n), linuxerr.EINVAL } return val, int64(n), nil } // copyScratchBufferFromContext returns a scratch buffer of the given size. It // tries to use the task's copy scratch buffer if we're on a task context, // otherwise it allocates a new buffer. func copyScratchBufferFromContext(ctx context.Context, size int) []byte { t := kernel.TaskFromContext(ctx) if t != nil { return t.CopyScratchBuffer(hostarch.PageSize) } // Not on task context. return make([]byte, hostarch.PageSize) } // controllerStateless partially implements controller. It stubs the migration // methods with noops for a stateless controller. type controllerStateless struct{} // Enter implements controller.Enter. func (*controllerStateless) Enter(t *kernel.Task) {} // Leave implements controller.Leave. func (*controllerStateless) Leave(t *kernel.Task) {} // PrepareMigrate implements controller.PrepareMigrate. func (*controllerStateless) PrepareMigrate(t *kernel.Task, src controller) error { return nil } // CommitMigrate implements controller.CommitMigrate. func (*controllerStateless) CommitMigrate(t *kernel.Task, src controller) {} // AbortMigrate implements controller.AbortMigrate. func (*controllerStateless) AbortMigrate(t *kernel.Task, src controller) {} // controllerNoResource partially implements controller. It stubs out the Charge // method for controllers that don't track resource usage through the charge // mechanism. type controllerNoResource struct{} // Charge implements controller.Charge. 
func (*controllerNoResource) Charge(t *kernel.Task, d *kernfs.Dentry, res kernel.CgroupResourceType, value int64) error { panic(fmt.Sprintf("cgroupfs: Attempted to charge a controller with unknown resource %v for value %v", res, value)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/bitmap.go000066400000000000000000000067121465435605700257030ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroupfs import ( "fmt" "strconv" "strings" "gvisor.dev/gvisor/pkg/bitmap" ) // formatBitmap produces a string representation of b, which lists the indices // of set bits in the bitmap. Indices are separated by commas and ranges of // set bits are abbreviated. Example outputs: "0,2,4", "0,3-7,10", "0-10". // // Inverse of parseBitmap. func formatBitmap(b *bitmap.Bitmap) string { ones := b.ToSlice() if len(ones) == 0 { return "" } elems := make([]string, 0, len(ones)) runStart := ones[0] lastVal := ones[0] inRun := false for _, v := range ones[1:] { last := lastVal lastVal = v if last+1 == v { // In a contiguous block of ones. if !inRun { runStart = last inRun = true } continue } // Non-contiguous bit. if inRun { // Render a run elems = append(elems, fmt.Sprintf("%d-%d", runStart, last)) inRun = false continue } // Lone non-contiguous bit. elems = append(elems, fmt.Sprintf("%d", last)) } // Process potential final run if inRun { elems = append(elems, fmt.Sprintf("%d-%d", runStart, lastVal)) } else { elems = append(elems, fmt.Sprintf("%d", lastVal)) } return strings.Join(elems, ",") } func parseToken(token string) (start, end uint32, err error) { ts := strings.SplitN(token, "-", 2) switch len(ts) { case 0: return 0, 0, fmt.Errorf("invalid token %q", token) case 1: val, err := strconv.ParseUint(ts[0], 10, 32) if err != nil { return 0, 0, err } return uint32(val), uint32(val), nil case 2: val1, err := strconv.ParseUint(ts[0], 10, 32) if err != nil { return 0, 0, err } val2, err := strconv.ParseUint(ts[1], 10, 32) if err != nil { return 0, 0, err } if val1 >= val2 { return 0, 0, fmt.Errorf("start (%v) must be less than end (%v)", val1, val2) } return uint32(val1), uint32(val2), nil default: panic(fmt.Sprintf("Unreachable: got %d substrs", len(ts))) } } // parseBitmap parses input as a bitmap. input should be a comma separated list // of indices, and ranges of set bits may be abbreviated. Examples: "0,2,4", // "0,3-7,10", "0-10". Input after the first newline or null byte is discarded. // // sizeHint sets the initial size of the bitmap, which may prevent reallocation // when growing the bitmap during parsing. Ideally sizeHint should be at least // as large as the bitmap represented by input, but this is not required. // // Inverse of formatBitmap. 
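//
// A hedged usage sketch (sizeHint chosen arbitrarily):
//
//	b, err := parseBitmap("0,3-7,10", 16)
//	if err != nil {
//		// handle malformed input
//	}
//	s := formatBitmap(b) // s == "0,3-7,10"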
func parseBitmap(input string, sizeHint uint32) (*bitmap.Bitmap, error) { b := bitmap.New(sizeHint) if termIdx := strings.IndexAny(input, "\n\000"); termIdx != -1 { input = input[:termIdx] } input = strings.TrimSpace(input) if len(input) == 0 { return &b, nil } tokens := strings.Split(input, ",") for _, t := range tokens { start, end, err := parseToken(strings.TrimSpace(t)) if err != nil { return nil, err } for i := start; i <= end; i++ { b.Add(i) } } return &b, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go000066400000000000000000000666671465435605700262760ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package cgroupfs implements cgroupfs. // // A cgroup is a collection of tasks on the system, organized into a tree-like // structure similar to a filesystem directory tree. In fact, each cgroup is // represented by a directory on cgroupfs, and is manipulated through control // files in the directory. // // All cgroups on a system are organized into hierarchies. Hierarchies are a // distinct tree of cgroups, with a common set of controllers. One or more // cgroupfs mounts may point to each hierarchy. These mounts provide a common // view into the same tree of cgroups. // // A controller (also known as a "resource controller", or a cgroup "subsystem") // determines the behaviour of each cgroup. // // In addition to cgroupfs, the kernel has a cgroup registry that tracks // system-wide state related to cgroups such as active hierarchies and the // controllers associated with them. // // Since cgroupfs doesn't allow hardlinks, there is a unique mapping between // cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref // counted and exist until they're unlinked once or the FS is destroyed. // // # Synchronization // // Cgroup hierarchy creation and destruction is protected by the // kernel.CgroupRegistry.mu. Once created, a hierarchy's set of controllers, the // filesystem associated with it, and the root cgroup for the hierarchy are // immutable. // // Membership of tasks within cgroups is protected by // cgroupfs.filesystem.tasksMu. Tasks also maintain a set of all cgroups they're // in, and this list is protected by Task.mu. // // Lock order: // // kernel.CgroupRegistry.mu // kernfs.filesystem.mu // kernel.TaskSet.mu // kernel.Task.mu // cgroupfs.filesystem.tasksMu. // cgroupfs.dir.OrderedChildren.mu package cgroupfs import ( "bytes" "fmt" "sort" "strconv" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) const ( // Name is the default filesystem name. 
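//
// This is the filesystem type passed to mount(2). A hedged sketch of a guest
// mount using two supported controller options (target path illustrative):
//
//	unix.Mount("cgroup", "/sys/fs/cgroup/cpu,cpuacct", "cgroup", 0, "cpu,cpuacct")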
Name = "cgroup" readonlyFileMode = linux.FileMode(0444) writableFileMode = linux.FileMode(0644) defaultDirMode = linux.FileMode(0555) | linux.ModeDirectory defaultMaxCachedDentries = uint64(1000) ) var allControllers = []kernel.CgroupControllerType{ kernel.CgroupControllerCPU, kernel.CgroupControllerCPUAcct, kernel.CgroupControllerCPUSet, kernel.CgroupControllerDevices, kernel.CgroupControllerJob, kernel.CgroupControllerMemory, kernel.CgroupControllerPIDs, } // SupportedMountOptions is the set of supported mount options for cgroupfs. var SupportedMountOptions = []string{"all", "cpu", "cpuacct", "cpuset", "devices", "job", "memory", "pids"} // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // InitialCgroup specifies properties of the cgroup for the init task. // // +stateify savable type InitialCgroup struct { // Path is an absolute path relative to the root of a cgroupfs filesystem // that indicates where to place the init task. An empty string indicates // the root of the filesystem. Path string // SetOwner indicates the UID and GID fields contain valid values. If true, // Both UID and GID must be provided. SetOwner bool // UID of the initial cgroup path components, excluding the root cgroup. UID auth.KUID // GID of the initial cgroup path components, excluding the root cgroup. GID auth.KGID // SetMode indicates the Mode field contains a valid value. SetMode bool // Mode of the initial cgroup path components, excluding the root cgroup. Mode linux.FileMode } // InternalData contains internal data passed in to the cgroupfs mount via // vfs.GetFilesystemOptions.InternalData. // // +stateify savable type InternalData struct { DefaultControlValues map[string]int64 InitialCgroup InitialCgroup } // filesystem implements vfs.FilesystemImpl and kernel.cgroupFS. // // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 // hierarchyID is the id the cgroup registry assigns to this hierarchy. Has // the value kernel.InvalidCgroupHierarchyID until the FS is fully // initialized. // // hierarchyID is immutable after initialization. hierarchyID uint32 // hierarchyName is the name for a named hierarchy. May be empty if the // 'name=' mount option was not used when the hierarchy was created. // // Immutable after initialization. hierarchyName string // controllers and kcontrollers are both the list of controllers attached to // this cgroupfs. Both lists are the same set of controllers, but typecast // to different interfaces for convenience. Both must stay in sync, and are // immutable. controllers []controller kcontrollers []kernel.CgroupController numCgroups atomicbitops.Uint64 // Protected by atomic ops. root *kernfs.Dentry // effectiveRoot is the initial cgroup new tasks are created in. Unless // overwritten by internal mount options, root == effectiveRoot. If // effectiveRoot != root, an extra reference is held on effectiveRoot for // the lifetime of the filesystem. effectiveRoot *kernfs.Dentry // tasksMu serializes task membership changes across all cgroups within a // filesystem. tasksMu taskRWMutex `state:"nosave"` } // InitializeHierarchyID implements kernel.cgroupFS.InitializeHierarchyID. func (fs *filesystem) InitializeHierarchyID(hid uint32) { fs.hierarchyID = hid } // RootCgroup implements kernel.cgroupFS.RootCgroup. func (fs *filesystem) RootCgroup() kernel.Cgroup { return kernel.Cgroup{ Dentry: fs.root, CgroupImpl: fs.root.Inode().(kernel.CgroupImpl), } } // Name implements vfs.FilesystemType.Name. 
func (FilesystemType) Name() string { return Name } // Release implements vfs.FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } mopts := vfs.GenericParseMountOptions(opts.Data) maxCachedDentries := defaultMaxCachedDentries if str, ok := mopts["dentry_cache_limit"]; ok { delete(mopts, "dentry_cache_limit") maxCachedDentries, err = strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) return nil, nil, linuxerr.EINVAL } } var wantControllers []kernel.CgroupControllerType if _, ok := mopts["cpu"]; ok { delete(mopts, "cpu") wantControllers = append(wantControllers, kernel.CgroupControllerCPU) } if _, ok := mopts["cpuacct"]; ok { delete(mopts, "cpuacct") wantControllers = append(wantControllers, kernel.CgroupControllerCPUAcct) } if _, ok := mopts["cpuset"]; ok { delete(mopts, "cpuset") wantControllers = append(wantControllers, kernel.CgroupControllerCPUSet) } if _, ok := mopts["devices"]; ok { delete(mopts, "devices") wantControllers = append(wantControllers, kernel.CgroupControllerDevices) } if _, ok := mopts["job"]; ok { delete(mopts, "job") wantControllers = append(wantControllers, kernel.CgroupControllerJob) } if _, ok := mopts["memory"]; ok { delete(mopts, "memory") wantControllers = append(wantControllers, kernel.CgroupControllerMemory) } if _, ok := mopts["pids"]; ok { delete(mopts, "pids") wantControllers = append(wantControllers, kernel.CgroupControllerPIDs) } if _, ok := mopts["all"]; ok { if len(wantControllers) > 0 { ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: other controllers specified with all: %v", wantControllers) return nil, nil, linuxerr.EINVAL } delete(mopts, "all") wantControllers = allControllers } var name string var ok bool if name, ok = mopts["name"]; ok { delete(mopts, "name") } var none bool if _, ok = mopts["none"]; ok { none = true delete(mopts, "none") } if !none && len(wantControllers) == 0 { // Specifying no controllers implies all controllers, unless "none" was // explicitly requested. wantControllers = allControllers } // Some combinations of "none", "all", "name=" and explicit controllers are // not allowed. See Linux, kernel/cgroup.c:parse_cgroupfs_options(). // All empty hierarchies must have a name. if len(wantControllers) == 0 && name == "" { ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: empty hierarchy with no name") return nil, nil, linuxerr.EINVAL } // Can't have "none" and some controllers. if none && len(wantControllers) != 0 { ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: 'none' specified with controllers: %v", wantControllers) return nil, nil, linuxerr.EINVAL } if len(mopts) != 0 { ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) return nil, nil, linuxerr.EINVAL } k := kernel.KernelFromContext(ctx) r := k.CgroupRegistry() // "It is not possible to mount the same controller against multiple // cgroup hierarchies. For example, it is not possible to mount both // the cpu and cpuacct controllers against one hierarchy, and to mount // the cpu controller alone against another hierarchy." 
- man cgroups(7) // // Is there a hierarchy available with all the controllers we want? If so, // this mount is a view into the same hierarchy. // // Note: we're guaranteed to have at least one requested controller, since // no explicit controller name implies all controllers. vfsfs, err := r.FindHierarchy(name, wantControllers) if err != nil { return nil, nil, err } if vfsfs != nil { fs := vfsfs.Impl().(*filesystem) ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID) fs.root.IncRef() if fs.effectiveRoot != fs.root { fs.effectiveRoot.IncRef() } return vfsfs, fs.root.VFSDentry(), nil } // No existing hierarchy with the exactly controllers found. Make a new // one. Note that it's possible this mount creation is unsatisfiable, if one // or more of the requested controllers are already on existing // hierarchies. We'll find out about such collisions when we try to register // the new hierarchy later. fs := &filesystem{ devMinor: devMinor, hierarchyName: name, } fs.MaxCachedDentries = maxCachedDentries fs.VFSFilesystem().Init(vfsObj, &fsType, fs) var defaults map[string]int64 if opts.InternalData != nil { defaults = opts.InternalData.(*InternalData).DefaultControlValues ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) } for _, ty := range wantControllers { var c controller switch ty { case kernel.CgroupControllerCPU: c = newCPUController(fs, defaults) case kernel.CgroupControllerCPUAcct: c = newCPUAcctController(fs) case kernel.CgroupControllerCPUSet: c = newCPUSetController(k, fs) case kernel.CgroupControllerDevices: c = newDevicesController(fs) case kernel.CgroupControllerJob: c = newJobController(fs) case kernel.CgroupControllerMemory: c = newMemoryController(fs, defaults) case kernel.CgroupControllerPIDs: c = newRootPIDsController(fs) default: panic(fmt.Sprintf("Unreachable: unknown cgroup controller %q", ty)) } fs.controllers = append(fs.controllers, c) } if len(defaults) != 0 { // Internal data is always provided at sentry startup and unused values // indicate a problem with the sandbox config. Fail fast. panic(fmt.Sprintf("cgroupfs.FilesystemType.GetFilesystem: unknown internal mount data: %v", defaults)) } // Controllers usually appear in alphabetical order when displayed. Sort it // here now, so it never needs to be sorted elsewhere. sort.Slice(fs.controllers, func(i, j int) bool { return fs.controllers[i].Type() < fs.controllers[j].Type() }) fs.kcontrollers = make([]kernel.CgroupController, 0, len(fs.controllers)) for _, c := range fs.controllers { fs.kcontrollers = append(fs.kcontrollers, c) } root := fs.newCgroupInode(ctx, creds, nil, defaultDirMode) var rootD kernfs.Dentry rootD.InitRoot(&fs.Filesystem, root) fs.root = &rootD fs.effectiveRoot = fs.root if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil { ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err) rootD.DecRef(ctx) fs.VFSFilesystem().DecRef(ctx) return nil, nil, err } // Register controllers. The registry may be modified concurrently, so if we // get an error, we raced with someone else who registered the same // controllers first. if err := r.Register(name, fs.kcontrollers, fs); err != nil { ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err) rootD.DecRef(ctx) fs.VFSFilesystem().DecRef(ctx) return nil, nil, linuxerr.EBUSY } // Move all existing tasks to the root of the new hierarchy. 
k.PopulateNewCgroupHierarchy(fs.effectiveRootCgroup()) return fs.VFSFilesystem(), rootD.VFSDentry(), nil } // prepareInitialCgroup creates the initial cgroup according to opts. An initial // cgroup is optional, and if not specified, this function is a no-op. func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error { if opts.InternalData == nil { return nil } idata := opts.InternalData.(*InternalData) initPathStr := idata.InitialCgroup.Path if initPathStr == "" { return nil } ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr) initPath := fspath.Parse(initPathStr) if !initPath.Absolute { ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath) return linuxerr.EINVAL } if !initPath.HasComponents() { // Explicit "/" as initial cgroup, nothing to do. return nil } ownerCreds := auth.CredentialsFromContext(ctx).Fork() if idata.InitialCgroup.SetOwner { ownerCreds.EffectiveKUID = idata.InitialCgroup.UID ownerCreds.EffectiveKGID = idata.InitialCgroup.GID } mode := defaultDirMode if idata.InitialCgroup.SetMode { mode = idata.InitialCgroup.Mode } // Have initial cgroup target, create the tree. cgDir := fs.root.Inode().(*cgroupInode) for pit := initPath.Begin; pit.Ok(); pit = pit.Next() { cgDirI, err := cgDir.newDirWithOwner(ctx, ownerCreds, pit.String(), vfs.MkdirOptions{Mode: mode}) if err != nil { return err } cgDir = cgDirI.(*cgroupInode) } // Walk to target dentry. initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath) if err != nil { ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err) return linuxerr.ENOENT } fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here. return nil } func (fs *filesystem) effectiveRootCgroup() kernel.Cgroup { return kernel.Cgroup{ Dentry: fs.effectiveRoot, CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl), } } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { k := kernel.KernelFromContext(ctx) r := k.CgroupRegistry() if fs.hierarchyID != kernel.InvalidCgroupHierarchyID { k.ReleaseCgroupHierarchy(fs.hierarchyID) r.Unregister(fs.hierarchyID) } if fs.root != fs.effectiveRoot { fs.effectiveRoot.DecRef(ctx) } fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { var cnames []string for _, c := range fs.controllers { cnames = append(cnames, string(c.Type())) } return strings.Join(cnames, ",") } // +stateify savable type implStatFS struct{} // StatFS implements kernfs.Inode.StatFS. func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil } // dir implements kernfs.Inode for a generic cgroup resource controller // directory. Specific controllers extend this to add their own functionality. // // +stateify savable type dir struct { kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeNotSymlink kernfs.InodeWatches kernfs.OrderedChildren implStatFS locks vfs.FileLocks fs *filesystem // Immutable. cgi *cgroupInode // Immutable. } // Keep implements kernfs.Inode.Keep. func (*dir) Keep() bool { return true } // SetStat implements kernfs.Inode.SetStat. 
func (d *dir) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { return d.InodeAttrs.SetStat(ctx, fs, creds, opts) } // Open implements kernfs.Inode.Open. func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndStaticEntries, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // NewDir implements kernfs.Inode.NewDir. func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { return d.newDirWithOwner(ctx, auth.CredentialsFromContext(ctx), name, opts) } func (d *dir) newDirWithOwner(ctx context.Context, ownerCreds *auth.Credentials, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { // "Do not accept '\n' to prevent making /proc//cgroup unparsable." // -- Linux, kernel/cgroup.c:cgroup_mkdir(). if strings.Contains(name, "\n") { return nil, linuxerr.EINVAL } mode := opts.Mode.Permissions() | linux.ModeDirectory return d.OrderedChildren.Inserter(name, func() kernfs.Inode { d.IncLinks(1) return d.fs.newCgroupInode(ctx, ownerCreds, d.cgi, mode) }) } // Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of // cgroup directories, and the rename may only change the name within the same // parent. See linux, kernel/cgroup.c:cgroup_rename(). func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error { if _, ok := child.(*cgroupInode); !ok { // Not a cgroup directory. Control files are backed by different types. return linuxerr.ENOTDIR } dstCGInode, ok := dst.(*cgroupInode) if !ok { // Not a cgroup inode, so definitely can't be *this* inode. return linuxerr.EIO } // Note: We're intentionally comparing addresses, since two different dirs // could plausibly be identical in memory, but would occupy different // locations in memory. if d != &dstCGInode.dir { // Destination dir is a different cgroup inode. Cross directory renames // aren't allowed. return linuxerr.EIO } // Rename moves oldname to newname within d. Proceed. return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst) } // Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only // files in the filesystem are control files, which can't be deleted. func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error { return linuxerr.EPERM } // hasChildrenLocked returns whether the cgroup dir contains any objects that // prevent it from being deleted. func (d *dir) hasChildrenLocked() bool { // Subdirs take a link on the parent, so checks if there are any direct // children cgroups. Exclude the dir's self link and the link from ".". if d.InodeAttrs.Links()-2 > 0 { return true } return len(d.cgi.ts) > 0 } // HasChildren implements kernfs.Inode.HasChildren. // // The empty check for a cgroupfs directory is unlike a regular directory since // a cgroupfs directory will always have control files. A cgroupfs directory can // be deleted if cgroup contains no tasks and has no sub-cgroups. func (d *dir) HasChildren() bool { d.fs.tasksMu.RLock() defer d.fs.tasksMu.RUnlock() return d.hasChildrenLocked() } // RmDir implements kernfs.Inode.RmDir. 
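//
// Removal is refused while the cgroup still has member tasks or child cgroups
// (ENOTEMPTY), and the initial cgroup for new tasks can never be removed
// (EBUSY). A hedged guest-visible sketch (path illustrative):
//
//	err := unix.Rmdir("/sys/fs/cgroup/memory/app") // ENOTEMPTY while tasks remain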
func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error { // Unlike a normal directory, we need to recheck if d is empty again, since // vfs/kernfs can't stop tasks from entering or leaving the cgroup. d.fs.tasksMu.RLock() defer d.fs.tasksMu.RUnlock() cgi, ok := child.(*cgroupInode) if !ok { return linuxerr.ENOTDIR } if cgi.dir.hasChildrenLocked() { return linuxerr.ENOTEMPTY } // Disallow deletion of the effective root cgroup. if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) { ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath()) return linuxerr.EBUSY } err := d.OrderedChildren.RmDir(ctx, name, child) if err == nil { d.InodeAttrs.DecLinks() } return err } func (d *dir) forEachChildDir(fn func(*dir)) { d.OrderedChildren.ForEachChild(func(_ string, i kernfs.Inode) { if childI, ok := i.(*cgroupInode); ok { fn(&childI.dir) } }) } // controllerFileImpl represents common cgroupfs-specific operations for control // files. type controllerFileImpl interface { // Source extracts the underlying DynamicBytesFile for a control file. Source() *kernfs.DynamicBytesFile // AllowBackgroundAccess indicates whether a control file can be accessed // from a background (i.e. non-task) context. Some control files cannot be // meaningfully accessed from a non-task context because accessing them // either have side effects on the calling context (ex: task migration // across cgroups), or they refer to data which must be interpreted within // the calling context (ex: when referring to a pid, in which pid // namespace?). // // Currently, all writable control files that allow access from a background // process can handle a nil FD, since a background write doesn't explicitly // open the control file. This is enforced through the // writableControllerFileImpl. AllowBackgroundAccess() bool } // writableControllerFileImpl represents common cgroupfs-specific operations for // a writable control file. type writableControllerFileImpl interface { controllerFileImpl // WriteBackground writes data to a control file from a background // context. This means the write isn't performed through and FD may be // performed from a background context. // // Control files that support this should also return true for // controllerFileImpl.AllowBackgroundAccess(). WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) } // controllerFile represents a generic control file that appears within a cgroup // directory. // // +stateify savable type controllerFile struct { kernfs.DynamicBytesFile implStatFS allowBackgroundAccess bool } var _ controllerFileImpl = (*controllerFile)(nil) // Source implements controllerFileImpl.Source. func (f *controllerFile) Source() *kernfs.DynamicBytesFile { return &f.DynamicBytesFile } // AllowBackgroundAccess implements controllerFileImpl.AllowBackgroundAccess. func (f *controllerFile) AllowBackgroundAccess() bool { return f.allowBackgroundAccess } // SetStat implements kernfs.Inode.SetStat. 
func (f *controllerFile) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { return f.InodeAttrs.SetStat(ctx, fs, creds, opts) } func (fs *filesystem) newControllerFile(ctx context.Context, creds *auth.Credentials, data vfs.DynamicBytesSource, allowBackgroundAccess bool) kernfs.Inode { f := &controllerFile{ allowBackgroundAccess: allowBackgroundAccess, } f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, readonlyFileMode) return f } func (fs *filesystem) newControllerWritableFile(ctx context.Context, creds *auth.Credentials, data vfs.WritableDynamicBytesSource, allowBackgroundAccess bool) kernfs.Inode { f := &controllerFile{ allowBackgroundAccess: allowBackgroundAccess, } f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, writableFileMode) return f } // staticControllerFile represents a generic control file that appears within a // cgroup directory which always returns the same data when read. // staticControllerFiles are not writable. // // +stateify savable type staticControllerFile struct { kernfs.DynamicBytesFile vfs.StaticData } var _ controllerFileImpl = (*staticControllerFile)(nil) // Source implements controllerFileImpl.Source. func (f *staticControllerFile) Source() *kernfs.DynamicBytesFile { return &f.DynamicBytesFile } // AllowBackgroundAccess implements controllerFileImpl.AllowBackgroundAccess. func (f *staticControllerFile) AllowBackgroundAccess() bool { return true } // SetStat implements kernfs.Inode.SetStat. func (f *staticControllerFile) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { return f.InodeAttrs.SetStat(ctx, fs, creds, opts) } // Note: We let the caller provide the mode so that static files may be used to // fake both readable and writable control files. However, static files are // effectively readonly, as attempting to write to them will return EIO // regardless of the mode. func (fs *filesystem) newStaticControllerFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode { f := &staticControllerFile{StaticData: vfs.StaticData{Data: data}} f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, mode) return f } // stubControllerFile is a writable control file that remembers the control // value written to it. // // +stateify savable type stubControllerFile struct { controllerFile // data is accessed through atomic ops. data *atomicbitops.Int64 } var _ controllerFileImpl = (*stubControllerFile)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (f *stubControllerFile) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%d\n", f.data.Load()) return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (f *stubControllerFile) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { return f.WriteBackground(ctx, src) } // WriteBackground implements writableControllerFileImpl.WriteBackground. func (f *stubControllerFile) WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) { val, n, err := parseInt64FromString(ctx, src) if err != nil { return 0, err } f.data.Store(val) return n, nil } // newStubControllerFile creates a new stub controller file that loads and // stores a control value from data. 
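//
// A hedged sketch of how a controller's AddControlFiles might expose such a
// knob (the file name and the limit field are hypothetical):
//
//	contents["example.limit"] = c.fs.newStubControllerFile(ctx, creds, &c.limit, true /* allowBackgroundAccess */)
//	// Reads print the stored int64 followed by '\n'; writes parse an int64
//	// and store it, with no further side effects.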
func (fs *filesystem) newStubControllerFile(ctx context.Context, creds *auth.Credentials, data *atomicbitops.Int64, allowBackgroundAccess bool) kernfs.Inode { f := &stubControllerFile{ controllerFile: controllerFile{ allowBackgroundAccess: allowBackgroundAccess, }, data: data, } f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, writableFileMode) return f } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/cgroupfs_state_autogen.go000066400000000000000000000717221465435605700312040ustar00rootroot00000000000000// automatically generated by stateify. package cgroupfs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (c *controllerCommon) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.controllerCommon" } func (c *controllerCommon) StateFields() []string { return []string{ "ty", "fs", "parent", } } func (c *controllerCommon) beforeSave() {} // +checklocksignore func (c *controllerCommon) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.ty) stateSinkObject.Save(1, &c.fs) stateSinkObject.Save(2, &c.parent) } func (c *controllerCommon) afterLoad(context.Context) {} // +checklocksignore func (c *controllerCommon) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.ty) stateSourceObject.Load(1, &c.fs) stateSourceObject.Load(2, &c.parent) } func (c *cgroupInode) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.cgroupInode" } func (c *cgroupInode) StateFields() []string { return []string{ "dir", "id", "controllers", "ts", } } func (c *cgroupInode) beforeSave() {} // +checklocksignore func (c *cgroupInode) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.dir) stateSinkObject.Save(1, &c.id) stateSinkObject.Save(2, &c.controllers) stateSinkObject.Save(3, &c.ts) } func (c *cgroupInode) afterLoad(context.Context) {} // +checklocksignore func (c *cgroupInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.dir) stateSourceObject.Load(1, &c.id) stateSourceObject.Load(2, &c.controllers) stateSourceObject.Load(3, &c.ts) } func (d *cgroupProcsData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.cgroupProcsData" } func (d *cgroupProcsData) StateFields() []string { return []string{ "cgroupInode", } } func (d *cgroupProcsData) beforeSave() {} // +checklocksignore func (d *cgroupProcsData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.cgroupInode) } func (d *cgroupProcsData) afterLoad(context.Context) {} // +checklocksignore func (d *cgroupProcsData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.cgroupInode) } func (d *tasksData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.tasksData" } func (d *tasksData) StateFields() []string { return []string{ "cgroupInode", } } func (d *tasksData) beforeSave() {} // +checklocksignore func (d *tasksData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.cgroupInode) } func (d *tasksData) afterLoad(context.Context) {} // +checklocksignore func (d *tasksData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.cgroupInode) } func (fsType *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.FilesystemType" } func (fsType *FilesystemType) StateFields() []string { return []string{} } func (fsType *FilesystemType) beforeSave() {} // +checklocksignore func (fsType 
*FilesystemType) StateSave(stateSinkObject state.Sink) { fsType.beforeSave() } func (fsType *FilesystemType) afterLoad(context.Context) {} // +checklocksignore func (fsType *FilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (i *InitialCgroup) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.InitialCgroup" } func (i *InitialCgroup) StateFields() []string { return []string{ "Path", "SetOwner", "UID", "GID", "SetMode", "Mode", } } func (i *InitialCgroup) beforeSave() {} // +checklocksignore func (i *InitialCgroup) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Path) stateSinkObject.Save(1, &i.SetOwner) stateSinkObject.Save(2, &i.UID) stateSinkObject.Save(3, &i.GID) stateSinkObject.Save(4, &i.SetMode) stateSinkObject.Save(5, &i.Mode) } func (i *InitialCgroup) afterLoad(context.Context) {} // +checklocksignore func (i *InitialCgroup) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Path) stateSourceObject.Load(1, &i.SetOwner) stateSourceObject.Load(2, &i.UID) stateSourceObject.Load(3, &i.GID) stateSourceObject.Load(4, &i.SetMode) stateSourceObject.Load(5, &i.Mode) } func (i *InternalData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.InternalData" } func (i *InternalData) StateFields() []string { return []string{ "DefaultControlValues", "InitialCgroup", } } func (i *InternalData) beforeSave() {} // +checklocksignore func (i *InternalData) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.DefaultControlValues) stateSinkObject.Save(1, &i.InitialCgroup) } func (i *InternalData) afterLoad(context.Context) {} // +checklocksignore func (i *InternalData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.DefaultControlValues) stateSourceObject.Load(1, &i.InitialCgroup) } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "Filesystem", "devMinor", "hierarchyID", "hierarchyName", "controllers", "kcontrollers", "numCgroups", "root", "effectiveRoot", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.Filesystem) stateSinkObject.Save(1, &fs.devMinor) stateSinkObject.Save(2, &fs.hierarchyID) stateSinkObject.Save(3, &fs.hierarchyName) stateSinkObject.Save(4, &fs.controllers) stateSinkObject.Save(5, &fs.kcontrollers) stateSinkObject.Save(6, &fs.numCgroups) stateSinkObject.Save(7, &fs.root) stateSinkObject.Save(8, &fs.effectiveRoot) } func (fs *filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.Filesystem) stateSourceObject.Load(1, &fs.devMinor) stateSourceObject.Load(2, &fs.hierarchyID) stateSourceObject.Load(3, &fs.hierarchyName) stateSourceObject.Load(4, &fs.controllers) stateSourceObject.Load(5, &fs.kcontrollers) stateSourceObject.Load(6, &fs.numCgroups) stateSourceObject.Load(7, &fs.root) stateSourceObject.Load(8, &fs.effectiveRoot) } func (i *implStatFS) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.implStatFS" } func (i *implStatFS) StateFields() []string { return []string{} } func (i *implStatFS) beforeSave() {} // +checklocksignore func (i *implStatFS) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i 
*implStatFS) afterLoad(context.Context) {} // +checklocksignore func (i *implStatFS) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (d *dir) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.dir" } func (d *dir) StateFields() []string { return []string{ "InodeAlwaysValid", "InodeAttrs", "InodeDirectoryNoNewChildren", "InodeNoopRefCount", "InodeNotAnonymous", "InodeNotSymlink", "InodeWatches", "OrderedChildren", "implStatFS", "locks", "fs", "cgi", } } func (d *dir) beforeSave() {} // +checklocksignore func (d *dir) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.InodeAlwaysValid) stateSinkObject.Save(1, &d.InodeAttrs) stateSinkObject.Save(2, &d.InodeDirectoryNoNewChildren) stateSinkObject.Save(3, &d.InodeNoopRefCount) stateSinkObject.Save(4, &d.InodeNotAnonymous) stateSinkObject.Save(5, &d.InodeNotSymlink) stateSinkObject.Save(6, &d.InodeWatches) stateSinkObject.Save(7, &d.OrderedChildren) stateSinkObject.Save(8, &d.implStatFS) stateSinkObject.Save(9, &d.locks) stateSinkObject.Save(10, &d.fs) stateSinkObject.Save(11, &d.cgi) } func (d *dir) afterLoad(context.Context) {} // +checklocksignore func (d *dir) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.InodeAlwaysValid) stateSourceObject.Load(1, &d.InodeAttrs) stateSourceObject.Load(2, &d.InodeDirectoryNoNewChildren) stateSourceObject.Load(3, &d.InodeNoopRefCount) stateSourceObject.Load(4, &d.InodeNotAnonymous) stateSourceObject.Load(5, &d.InodeNotSymlink) stateSourceObject.Load(6, &d.InodeWatches) stateSourceObject.Load(7, &d.OrderedChildren) stateSourceObject.Load(8, &d.implStatFS) stateSourceObject.Load(9, &d.locks) stateSourceObject.Load(10, &d.fs) stateSourceObject.Load(11, &d.cgi) } func (f *controllerFile) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.controllerFile" } func (f *controllerFile) StateFields() []string { return []string{ "DynamicBytesFile", "implStatFS", "allowBackgroundAccess", } } func (f *controllerFile) beforeSave() {} // +checklocksignore func (f *controllerFile) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.DynamicBytesFile) stateSinkObject.Save(1, &f.implStatFS) stateSinkObject.Save(2, &f.allowBackgroundAccess) } func (f *controllerFile) afterLoad(context.Context) {} // +checklocksignore func (f *controllerFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.DynamicBytesFile) stateSourceObject.Load(1, &f.implStatFS) stateSourceObject.Load(2, &f.allowBackgroundAccess) } func (f *staticControllerFile) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.staticControllerFile" } func (f *staticControllerFile) StateFields() []string { return []string{ "DynamicBytesFile", "StaticData", } } func (f *staticControllerFile) beforeSave() {} // +checklocksignore func (f *staticControllerFile) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.DynamicBytesFile) stateSinkObject.Save(1, &f.StaticData) } func (f *staticControllerFile) afterLoad(context.Context) {} // +checklocksignore func (f *staticControllerFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.DynamicBytesFile) stateSourceObject.Load(1, &f.StaticData) } func (f *stubControllerFile) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.stubControllerFile" } func (f *stubControllerFile) StateFields() []string { return []string{ "controllerFile", "data", } } func (f 
*stubControllerFile) beforeSave() {} // +checklocksignore func (f *stubControllerFile) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.controllerFile) stateSinkObject.Save(1, &f.data) } func (f *stubControllerFile) afterLoad(context.Context) {} // +checklocksignore func (f *stubControllerFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.controllerFile) stateSourceObject.Load(1, &f.data) } func (c *cpuController) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.cpuController" } func (c *cpuController) StateFields() []string { return []string{ "controllerCommon", "controllerStateless", "controllerNoResource", "cfsPeriod", "cfsQuota", "shares", } } func (c *cpuController) beforeSave() {} // +checklocksignore func (c *cpuController) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.controllerCommon) stateSinkObject.Save(1, &c.controllerStateless) stateSinkObject.Save(2, &c.controllerNoResource) stateSinkObject.Save(3, &c.cfsPeriod) stateSinkObject.Save(4, &c.cfsQuota) stateSinkObject.Save(5, &c.shares) } func (c *cpuController) afterLoad(context.Context) {} // +checklocksignore func (c *cpuController) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.controllerCommon) stateSourceObject.Load(1, &c.controllerStateless) stateSourceObject.Load(2, &c.controllerNoResource) stateSourceObject.Load(3, &c.cfsPeriod) stateSourceObject.Load(4, &c.cfsQuota) stateSourceObject.Load(5, &c.shares) } func (c *cpuacctController) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.cpuacctController" } func (c *cpuacctController) StateFields() []string { return []string{ "controllerCommon", "controllerNoResource", "taskCommittedCharges", "usage", } } func (c *cpuacctController) beforeSave() {} // +checklocksignore func (c *cpuacctController) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.controllerCommon) stateSinkObject.Save(1, &c.controllerNoResource) stateSinkObject.Save(2, &c.taskCommittedCharges) stateSinkObject.Save(3, &c.usage) } func (c *cpuacctController) afterLoad(context.Context) {} // +checklocksignore func (c *cpuacctController) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.controllerCommon) stateSourceObject.Load(1, &c.controllerNoResource) stateSourceObject.Load(2, &c.taskCommittedCharges) stateSourceObject.Load(3, &c.usage) } func (c *cpuacctCgroup) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.cpuacctCgroup" } func (c *cpuacctCgroup) StateFields() []string { return []string{ "cgroupInode", } } func (c *cpuacctCgroup) beforeSave() {} // +checklocksignore func (c *cpuacctCgroup) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.cgroupInode) } func (c *cpuacctCgroup) afterLoad(context.Context) {} // +checklocksignore func (c *cpuacctCgroup) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.cgroupInode) } func (d *cpuacctStatData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.cpuacctStatData" } func (d *cpuacctStatData) StateFields() []string { return []string{ "cpuacctCgroup", } } func (d *cpuacctStatData) beforeSave() {} // +checklocksignore func (d *cpuacctStatData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.cpuacctCgroup) } func (d *cpuacctStatData) afterLoad(context.Context) {} // +checklocksignore 
func (d *cpuacctStatData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.cpuacctCgroup) } func (d *cpuacctUsageData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.cpuacctUsageData" } func (d *cpuacctUsageData) StateFields() []string { return []string{ "cpuacctCgroup", } } func (d *cpuacctUsageData) beforeSave() {} // +checklocksignore func (d *cpuacctUsageData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.cpuacctCgroup) } func (d *cpuacctUsageData) afterLoad(context.Context) {} // +checklocksignore func (d *cpuacctUsageData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.cpuacctCgroup) } func (d *cpuacctUsageUserData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.cpuacctUsageUserData" } func (d *cpuacctUsageUserData) StateFields() []string { return []string{ "cpuacctCgroup", } } func (d *cpuacctUsageUserData) beforeSave() {} // +checklocksignore func (d *cpuacctUsageUserData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.cpuacctCgroup) } func (d *cpuacctUsageUserData) afterLoad(context.Context) {} // +checklocksignore func (d *cpuacctUsageUserData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.cpuacctCgroup) } func (d *cpuacctUsageSysData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.cpuacctUsageSysData" } func (d *cpuacctUsageSysData) StateFields() []string { return []string{ "cpuacctCgroup", } } func (d *cpuacctUsageSysData) beforeSave() {} // +checklocksignore func (d *cpuacctUsageSysData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.cpuacctCgroup) } func (d *cpuacctUsageSysData) afterLoad(context.Context) {} // +checklocksignore func (d *cpuacctUsageSysData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.cpuacctCgroup) } func (c *cpusetController) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.cpusetController" } func (c *cpusetController) StateFields() []string { return []string{ "controllerCommon", "controllerStateless", "controllerNoResource", "maxCpus", "maxMems", "cpus", "mems", } } func (c *cpusetController) beforeSave() {} // +checklocksignore func (c *cpusetController) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.controllerCommon) stateSinkObject.Save(1, &c.controllerStateless) stateSinkObject.Save(2, &c.controllerNoResource) stateSinkObject.Save(3, &c.maxCpus) stateSinkObject.Save(4, &c.maxMems) stateSinkObject.Save(5, &c.cpus) stateSinkObject.Save(6, &c.mems) } func (c *cpusetController) afterLoad(context.Context) {} // +checklocksignore func (c *cpusetController) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.controllerCommon) stateSourceObject.Load(1, &c.controllerStateless) stateSourceObject.Load(2, &c.controllerNoResource) stateSourceObject.Load(3, &c.maxCpus) stateSourceObject.Load(4, &c.maxMems) stateSourceObject.Load(5, &c.cpus) stateSourceObject.Load(6, &c.mems) } func (d *cpusData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.cpusData" } func (d *cpusData) StateFields() []string { return []string{ "c", } } func (d *cpusData) beforeSave() {} // +checklocksignore func (d *cpusData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.c) } func (d *cpusData) afterLoad(context.Context) {} // 
+checklocksignore func (d *cpusData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.c) } func (d *memsData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.memsData" } func (d *memsData) StateFields() []string { return []string{ "c", } } func (d *memsData) beforeSave() {} // +checklocksignore func (d *memsData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.c) } func (d *memsData) afterLoad(context.Context) {} // +checklocksignore func (d *memsData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.c) } func (d *deviceID) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.deviceID" } func (d *deviceID) StateFields() []string { return []string{ "controllerType", "major", "minor", } } func (d *deviceID) beforeSave() {} // +checklocksignore func (d *deviceID) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.controllerType) stateSinkObject.Save(1, &d.major) stateSinkObject.Save(2, &d.minor) } func (d *deviceID) afterLoad(context.Context) {} // +checklocksignore func (d *deviceID) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.controllerType) stateSourceObject.Load(1, &d.major) stateSourceObject.Load(2, &d.minor) } func (c *devicesController) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.devicesController" } func (c *devicesController) StateFields() []string { return []string{ "controllerCommon", "controllerStateless", "controllerNoResource", "defaultAllow", "deviceRules", } } func (c *devicesController) beforeSave() {} // +checklocksignore func (c *devicesController) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.controllerCommon) stateSinkObject.Save(1, &c.controllerStateless) stateSinkObject.Save(2, &c.controllerNoResource) stateSinkObject.Save(3, &c.defaultAllow) stateSinkObject.Save(4, &c.deviceRules) } func (c *devicesController) afterLoad(context.Context) {} // +checklocksignore func (c *devicesController) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.controllerCommon) stateSourceObject.Load(1, &c.controllerStateless) stateSourceObject.Load(2, &c.controllerNoResource) stateSourceObject.Load(3, &c.defaultAllow) stateSourceObject.Load(4, &c.deviceRules) } func (d *allowedDevicesData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.allowedDevicesData" } func (d *allowedDevicesData) StateFields() []string { return []string{ "c", } } func (d *allowedDevicesData) beforeSave() {} // +checklocksignore func (d *allowedDevicesData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.c) } func (d *allowedDevicesData) afterLoad(context.Context) {} // +checklocksignore func (d *allowedDevicesData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.c) } func (d *deniedDevicesData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.deniedDevicesData" } func (d *deniedDevicesData) StateFields() []string { return []string{ "c", } } func (d *deniedDevicesData) beforeSave() {} // +checklocksignore func (d *deniedDevicesData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.c) } func (d *deniedDevicesData) afterLoad(context.Context) {} // +checklocksignore func (d *deniedDevicesData) StateLoad(ctx context.Context, stateSourceObject state.Source) { 
stateSourceObject.Load(0, &d.c) } func (d *controlledDevicesData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.controlledDevicesData" } func (d *controlledDevicesData) StateFields() []string { return []string{ "c", } } func (d *controlledDevicesData) beforeSave() {} // +checklocksignore func (d *controlledDevicesData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.c) } func (d *controlledDevicesData) afterLoad(context.Context) {} // +checklocksignore func (d *controlledDevicesData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.c) } func (r *dirRefs) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.dirRefs" } func (r *dirRefs) StateFields() []string { return []string{ "refCount", } } func (r *dirRefs) beforeSave() {} // +checklocksignore func (r *dirRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *dirRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (c *jobController) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.jobController" } func (c *jobController) StateFields() []string { return []string{ "controllerCommon", "controllerStateless", "controllerNoResource", "id", } } func (c *jobController) beforeSave() {} // +checklocksignore func (c *jobController) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.controllerCommon) stateSinkObject.Save(1, &c.controllerStateless) stateSinkObject.Save(2, &c.controllerNoResource) stateSinkObject.Save(3, &c.id) } func (c *jobController) afterLoad(context.Context) {} // +checklocksignore func (c *jobController) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.controllerCommon) stateSourceObject.Load(1, &c.controllerStateless) stateSourceObject.Load(2, &c.controllerNoResource) stateSourceObject.Load(3, &c.id) } func (c *memoryController) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.memoryController" } func (c *memoryController) StateFields() []string { return []string{ "controllerCommon", "controllerNoResource", "limitBytes", "softLimitBytes", "moveChargeAtImmigrate", "pressureLevel", "memCg", } } func (c *memoryController) beforeSave() {} // +checklocksignore func (c *memoryController) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.controllerCommon) stateSinkObject.Save(1, &c.controllerNoResource) stateSinkObject.Save(2, &c.limitBytes) stateSinkObject.Save(3, &c.softLimitBytes) stateSinkObject.Save(4, &c.moveChargeAtImmigrate) stateSinkObject.Save(5, &c.pressureLevel) stateSinkObject.Save(6, &c.memCg) } func (c *memoryController) afterLoad(context.Context) {} // +checklocksignore func (c *memoryController) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.controllerCommon) stateSourceObject.Load(1, &c.controllerNoResource) stateSourceObject.Load(2, &c.limitBytes) stateSourceObject.Load(3, &c.softLimitBytes) stateSourceObject.Load(4, &c.moveChargeAtImmigrate) stateSourceObject.Load(5, &c.pressureLevel) stateSourceObject.Load(6, &c.memCg) } func (memCg *memoryCgroup) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.memoryCgroup" } func (memCg *memoryCgroup) StateFields() []string { return []string{ "cgroupInode", } } func (memCg *memoryCgroup) beforeSave() {} 
// +checklocksignore func (memCg *memoryCgroup) StateSave(stateSinkObject state.Sink) { memCg.beforeSave() stateSinkObject.Save(0, &memCg.cgroupInode) } func (memCg *memoryCgroup) afterLoad(context.Context) {} // +checklocksignore func (memCg *memoryCgroup) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &memCg.cgroupInode) } func (d *memoryUsageInBytesData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.memoryUsageInBytesData" } func (d *memoryUsageInBytesData) StateFields() []string { return []string{ "memCg", } } func (d *memoryUsageInBytesData) beforeSave() {} // +checklocksignore func (d *memoryUsageInBytesData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.memCg) } func (d *memoryUsageInBytesData) afterLoad(context.Context) {} // +checklocksignore func (d *memoryUsageInBytesData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.memCg) } func (c *pidsController) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.pidsController" } func (c *pidsController) StateFields() []string { return []string{ "controllerCommon", "isRoot", "pendingTotal", "pendingPool", "committed", "max", } } func (c *pidsController) beforeSave() {} // +checklocksignore func (c *pidsController) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.controllerCommon) stateSinkObject.Save(1, &c.isRoot) stateSinkObject.Save(2, &c.pendingTotal) stateSinkObject.Save(3, &c.pendingPool) stateSinkObject.Save(4, &c.committed) stateSinkObject.Save(5, &c.max) } func (c *pidsController) afterLoad(context.Context) {} // +checklocksignore func (c *pidsController) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.controllerCommon) stateSourceObject.Load(1, &c.isRoot) stateSourceObject.Load(2, &c.pendingTotal) stateSourceObject.Load(3, &c.pendingPool) stateSourceObject.Load(4, &c.committed) stateSourceObject.Load(5, &c.max) } func (d *pidsCurrentData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.pidsCurrentData" } func (d *pidsCurrentData) StateFields() []string { return []string{ "c", } } func (d *pidsCurrentData) beforeSave() {} // +checklocksignore func (d *pidsCurrentData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.c) } func (d *pidsCurrentData) afterLoad(context.Context) {} // +checklocksignore func (d *pidsCurrentData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.c) } func (d *pidsMaxData) StateTypeName() string { return "pkg/sentry/fsimpl/cgroupfs.pidsMaxData" } func (d *pidsMaxData) StateFields() []string { return []string{ "c", } } func (d *pidsMaxData) beforeSave() {} // +checklocksignore func (d *pidsMaxData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.c) } func (d *pidsMaxData) afterLoad(context.Context) {} // +checklocksignore func (d *pidsMaxData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.c) } func init() { state.Register((*controllerCommon)(nil)) state.Register((*cgroupInode)(nil)) state.Register((*cgroupProcsData)(nil)) state.Register((*tasksData)(nil)) state.Register((*FilesystemType)(nil)) state.Register((*InitialCgroup)(nil)) state.Register((*InternalData)(nil)) state.Register((*filesystem)(nil)) state.Register((*implStatFS)(nil)) state.Register((*dir)(nil)) state.Register((*controllerFile)(nil)) 
state.Register((*staticControllerFile)(nil)) state.Register((*stubControllerFile)(nil)) state.Register((*cpuController)(nil)) state.Register((*cpuacctController)(nil)) state.Register((*cpuacctCgroup)(nil)) state.Register((*cpuacctStatData)(nil)) state.Register((*cpuacctUsageData)(nil)) state.Register((*cpuacctUsageUserData)(nil)) state.Register((*cpuacctUsageSysData)(nil)) state.Register((*cpusetController)(nil)) state.Register((*cpusData)(nil)) state.Register((*memsData)(nil)) state.Register((*deviceID)(nil)) state.Register((*devicesController)(nil)) state.Register((*allowedDevicesData)(nil)) state.Register((*deniedDevicesData)(nil)) state.Register((*controlledDevicesData)(nil)) state.Register((*dirRefs)(nil)) state.Register((*jobController)(nil)) state.Register((*memoryController)(nil)) state.Register((*memoryCgroup)(nil)) state.Register((*memoryUsageInBytesData)(nil)) state.Register((*pidsController)(nil)) state.Register((*pidsCurrentData)(nil)) state.Register((*pidsMaxData)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/cpu.go000066400000000000000000000052631465435605700252160ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroupfs import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // +stateify savable type cpuController struct { controllerCommon controllerStateless controllerNoResource // CFS bandwidth control parameters, values in microseconds. cfsPeriod atomicbitops.Int64 cfsQuota atomicbitops.Int64 // CPU shares, values should be (num core * 1024). shares atomicbitops.Int64 } var _ controller = (*cpuController)(nil) func newCPUController(fs *filesystem, defaults map[string]int64) *cpuController { // Default values for controller parameters from Linux. c := &cpuController{ cfsPeriod: atomicbitops.FromInt64(100000), cfsQuota: atomicbitops.FromInt64(-1), shares: atomicbitops.FromInt64(1024), } if val, ok := defaults["cpu.cfs_period_us"]; ok { c.cfsPeriod = atomicbitops.FromInt64(val) delete(defaults, "cpu.cfs_period_us") } if val, ok := defaults["cpu.cfs_quota_us"]; ok { c.cfsQuota = atomicbitops.FromInt64(val) delete(defaults, "cpu.cfs_quota_us") } if val, ok := defaults["cpu.shares"]; ok { c.shares = atomicbitops.FromInt64(val) delete(defaults, "cpu.shares") } c.controllerCommon.init(kernel.CgroupControllerCPU, fs) return c } // Clone implements controller.Clone. func (c *cpuController) Clone() controller { new := &cpuController{ cfsPeriod: atomicbitops.FromInt64(c.cfsPeriod.Load()), cfsQuota: atomicbitops.FromInt64(c.cfsQuota.Load()), shares: atomicbitops.FromInt64(c.shares.Load()), } new.controllerCommon.cloneFromParent(c) return new } // AddControlFiles implements controller.AddControlFiles. 
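//
// The cpu controller only exposes stub files (cpu.cfs_period_us,
// cpu.cfs_quota_us and cpu.shares): written values are remembered and read
// back, but the stub files themselves do not enforce any scheduling policy.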
func (c *cpuController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) { contents["cpu.cfs_period_us"] = c.fs.newStubControllerFile(ctx, creds, &c.cfsPeriod, true) contents["cpu.cfs_quota_us"] = c.fs.newStubControllerFile(ctx, creds, &c.cfsQuota, true) contents["cpu.shares"] = c.fs.newStubControllerFile(ctx, creds, &c.shares, true) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/cpuacct.go000066400000000000000000000153151465435605700260500ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroupfs import ( "bytes" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" ) // cpuacctController tracks CPU usage for tasks managed by the controller. The // sentry already tracks CPU usage per task; the controller tries to avoid // duplicate bookkeeping. When a task moves into a cpuacct cgroup, for currently // running tasks we simple refer to the tasks themselves when asked to report // usage. Things get more interesting when tasks leave the cgroup, since we need // to attribute the usage across multiple cgroups. // // On migration, we attribute the task's usage up to the point of migration to // the src cgroup, and keep track of how much of the overall usage to discount // at the dst cgroup. // // On task exit, we attribute all unaccounted usage to the current cgroup and // stop tracking the task. // // +stateify savable type cpuacctController struct { controllerCommon controllerNoResource mu sync.Mutex `state:"nosave"` // taskCommittedCharges tracks charges for a task already attributed to this // cgroup. This is used to avoid double counting usage for live // tasks. Protected by mu. taskCommittedCharges map[*kernel.Task]usage.CPUStats // usage is the cumulative CPU time used by past tasks in this cgroup. Note // that this doesn't include usage by live tasks currently in the // cgroup. Protected by mu. usage usage.CPUStats } var _ controller = (*cpuacctController)(nil) func newCPUAcctController(fs *filesystem) *cpuacctController { c := &cpuacctController{ taskCommittedCharges: make(map[*kernel.Task]usage.CPUStats), } c.controllerCommon.init(kernel.CgroupControllerCPUAcct, fs) return c } // Clone implements controller.Clone. func (c *cpuacctController) Clone() controller { new := &cpuacctController{ taskCommittedCharges: make(map[*kernel.Task]usage.CPUStats), } new.controllerCommon.cloneFromParent(c) return new } // AddControlFiles implements controller.AddControlFiles. 
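//
// The files created here (cpuacct.stat, cpuacct.usage, cpuacct.usage_user and
// cpuacct.usage_sys) are read-only and are all backed by collectCPUStats, so
// they report the cgroup's accumulated usage plus the live usage of tasks
// currently in this cgroup and its descendants.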
func (c *cpuacctController) AddControlFiles(ctx context.Context, creds *auth.Credentials, cg *cgroupInode, contents map[string]kernfs.Inode) { cpuacctCG := &cpuacctCgroup{cg} contents["cpuacct.stat"] = c.fs.newControllerFile(ctx, creds, &cpuacctStatData{cpuacctCG}, true) contents["cpuacct.usage"] = c.fs.newControllerFile(ctx, creds, &cpuacctUsageData{cpuacctCG}, true) contents["cpuacct.usage_user"] = c.fs.newControllerFile(ctx, creds, &cpuacctUsageUserData{cpuacctCG}, true) contents["cpuacct.usage_sys"] = c.fs.newControllerFile(ctx, creds, &cpuacctUsageSysData{cpuacctCG}, true) } // Enter implements controller.Enter. func (c *cpuacctController) Enter(t *kernel.Task) {} // Leave implements controller.Leave. func (c *cpuacctController) Leave(t *kernel.Task) { charge := t.CPUStats() c.mu.Lock() outstandingCharge := charge.DifferenceSince(c.taskCommittedCharges[t]) c.usage.Accumulate(outstandingCharge) delete(c.taskCommittedCharges, t) c.mu.Unlock() } // PrepareMigrate implements controller.PrepareMigrate. func (c *cpuacctController) PrepareMigrate(t *kernel.Task, src controller) error { return nil } // CommitMigrate implements controller.CommitMigrate. func (c *cpuacctController) CommitMigrate(t *kernel.Task, src controller) { charge := t.CPUStats() // Commit current charge to src and stop tracking t at src. srcCtl := src.(*cpuacctController) srcCtl.mu.Lock() srcTaskCharge := srcCtl.taskCommittedCharges[t] outstandingCharge := charge.DifferenceSince(srcTaskCharge) srcCtl.usage.Accumulate(outstandingCharge) delete(srcCtl.taskCommittedCharges, t) srcCtl.mu.Unlock() // Start tracking charge at dst, excluding the charge at src. c.mu.Lock() c.taskCommittedCharges[t] = charge c.mu.Unlock() } // AbortMigrate implements controller.AbortMigrate. func (c *cpuacctController) AbortMigrate(t *kernel.Task, src controller) {} // +stateify savable type cpuacctCgroup struct { *cgroupInode } func (c *cpuacctCgroup) cpuacctController() *cpuacctController { return c.controllers[kernel.CgroupControllerCPUAcct].(*cpuacctController) } // checklocks:c.fs.tasksMu func (c *cpuacctCgroup) collectCPUStatsLocked(acc *usage.CPUStats) { ctl := c.cpuacctController() for t := range c.ts { charge := t.CPUStats() ctl.mu.Lock() outstandingCharge := charge.DifferenceSince(ctl.taskCommittedCharges[t]) ctl.mu.Unlock() acc.Accumulate(outstandingCharge) } ctl.mu.Lock() acc.Accumulate(ctl.usage) ctl.mu.Unlock() c.forEachChildDir(func(d *dir) { cg := cpuacctCgroup{d.cgi} cg.collectCPUStatsLocked(acc) }) } func (c *cpuacctCgroup) collectCPUStats() usage.CPUStats { c.fs.tasksMu.RLock() defer c.fs.tasksMu.RUnlock() var cs usage.CPUStats c.collectCPUStatsLocked(&cs) return cs } // +stateify savable type cpuacctStatData struct { *cpuacctCgroup } // Generate implements vfs.DynamicBytesSource.Generate. func (d *cpuacctStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { cs := d.collectCPUStats() fmt.Fprintf(buf, "user %d\n", linux.ClockTFromDuration(cs.UserTime)) fmt.Fprintf(buf, "system %d\n", linux.ClockTFromDuration(cs.SysTime)) return nil } // +stateify savable type cpuacctUsageData struct { *cpuacctCgroup } // Generate implements vfs.DynamicBytesSource.Generate. func (d *cpuacctUsageData) Generate(ctx context.Context, buf *bytes.Buffer) error { cs := d.collectCPUStats() fmt.Fprintf(buf, "%d\n", cs.UserTime.Nanoseconds()+cs.SysTime.Nanoseconds()) return nil } // +stateify savable type cpuacctUsageUserData struct { *cpuacctCgroup } // Generate implements vfs.DynamicBytesSource.Generate. 
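//
// The reported value is user-mode CPU time in nanoseconds, matching Linux's
// cpuacct.usage_user.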
func (d *cpuacctUsageUserData) Generate(ctx context.Context, buf *bytes.Buffer) error { cs := d.collectCPUStats() fmt.Fprintf(buf, "%d\n", cs.UserTime.Nanoseconds()) return nil } // +stateify savable type cpuacctUsageSysData struct { *cpuacctCgroup } // Generate implements vfs.DynamicBytesSource.Generate. func (d *cpuacctUsageSysData) Generate(ctx context.Context, buf *bytes.Buffer) error { cs := d.collectCPUStats() fmt.Fprintf(buf, "%d\n", cs.SysTime.Nanoseconds()) return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/cpuset.go000066400000000000000000000122171465435605700257270ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroupfs import ( "bytes" "fmt" "gvisor.dev/gvisor/pkg/bitmap" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // +stateify savable type cpusetController struct { controllerCommon controllerStateless controllerNoResource maxCpus uint32 maxMems uint32 mu sync.Mutex `state:"nosave"` cpus *bitmap.Bitmap mems *bitmap.Bitmap } var _ controller = (*cpusetController)(nil) func newCPUSetController(k *kernel.Kernel, fs *filesystem) *cpusetController { cores := uint32(k.ApplicationCores()) cpus := bitmap.New(cores) cpus.FlipRange(0, cores) mems := bitmap.New(1) mems.FlipRange(0, 1) c := &cpusetController{ cpus: &cpus, mems: &mems, maxCpus: uint32(k.ApplicationCores()), maxMems: 1, // We always report a single NUMA node. } c.controllerCommon.init(kernel.CgroupControllerCPUSet, fs) return c } // Clone implements controller.Clone. func (c *cpusetController) Clone() controller { c.mu.Lock() defer c.mu.Unlock() cpus := c.cpus.Clone() mems := c.mems.Clone() new := &cpusetController{ maxCpus: c.maxCpus, maxMems: c.maxMems, cpus: &cpus, mems: &mems, } new.controllerCommon.cloneFromParent(c) return new } // AddControlFiles implements controller.AddControlFiles. func (c *cpusetController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) { contents["cpuset.cpus"] = c.fs.newControllerWritableFile(ctx, creds, &cpusData{c: c}, true) contents["cpuset.mems"] = c.fs.newControllerWritableFile(ctx, creds, &memsData{c: c}, true) } // +stateify savable type cpusData struct { c *cpusetController } // Generate implements vfs.DynamicBytesSource.Generate. func (d *cpusData) Generate(ctx context.Context, buf *bytes.Buffer) error { d.c.mu.Lock() defer d.c.mu.Unlock() fmt.Fprintf(buf, "%s\n", formatBitmap(d.c.cpus)) return nil } // Write implements vfs.WritableDynamicBytesSource.Write. 
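//
// Input is limited to one page and is parsed by parseBitmap in WriteBackground
// below (typically written in the usual cpuset list syntax, e.g. "0-3,5");
// malformed input or CPUs beyond maxCpus are rejected with EINVAL.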
func (d *cpusData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { return d.WriteBackground(ctx, src) } // WriteBackground implements writableControllerFileImpl.WriteBackground. func (d *cpusData) WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) { if src.NumBytes() > hostarch.PageSize { return 0, linuxerr.EINVAL } buf := copyScratchBufferFromContext(ctx, hostarch.PageSize) n, err := src.CopyIn(ctx, buf) if err != nil { return 0, err } buf = buf[:n] b, err := parseBitmap(string(buf), d.c.maxCpus) if err != nil { log.Warningf("cgroupfs cpuset controller: Failed to parse bitmap: %v", err) return 0, linuxerr.EINVAL } if got, want := b.Maximum(), d.c.maxCpus; got > want { log.Warningf("cgroupfs cpuset controller: Attempted to specify cpuset.cpus beyond highest available cpu: got %d, want %d", got, want) return 0, linuxerr.EINVAL } d.c.mu.Lock() defer d.c.mu.Unlock() d.c.cpus = b return int64(n), nil } // +stateify savable type memsData struct { c *cpusetController } // Generate implements vfs.DynamicBytesSource.Generate. func (d *memsData) Generate(ctx context.Context, buf *bytes.Buffer) error { d.c.mu.Lock() defer d.c.mu.Unlock() fmt.Fprintf(buf, "%s\n", formatBitmap(d.c.mems)) return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *memsData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { return d.WriteBackground(ctx, src) } // WriteBackground implements writableControllerFileImpl.WriteBackground. func (d *memsData) WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) { if src.NumBytes() > hostarch.PageSize { return 0, linuxerr.EINVAL } buf := copyScratchBufferFromContext(ctx, hostarch.PageSize) n, err := src.CopyIn(ctx, buf) if err != nil { return 0, err } buf = buf[:n] b, err := parseBitmap(string(buf), d.c.maxMems) if err != nil { log.Warningf("cgroupfs cpuset controller: Failed to parse bitmap: %v", err) return 0, linuxerr.EINVAL } if got, want := b.Maximum(), d.c.maxMems; got > want { log.Warningf("cgroupfs cpuset controller: Attempted to specify cpuset.mems beyond highest available node: got %d, want %d", got, want) return 0, linuxerr.EINVAL } d.c.mu.Lock() defer d.c.mu.Unlock() d.c.mems = b return int64(n), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/devices.go000066400000000000000000000241671465435605700260550ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package cgroupfs import ( "bytes" "fmt" "strconv" "strings" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) const ( canRead = 1 << iota canWrite canMknod ) const ( allowedDevices = "devices.allow" controlledDevices = "devices.list" deniedDevices = "devices.deny" wildcardDeviceNumber = -1 ) const ( blockDevice deviceType = "b" charDevice deviceType = "c" wildcardDevice deviceType = "a" ) // type denotes a device's type. type deviceType string func (d deviceType) valid() bool { switch d { case wildcardDevice, charDevice, blockDevice: return true default: return false } } // permission represents a device access, read, write, and mknod. type permission string func (p permission) valid() bool { for _, c := range p { switch c { case 'r', 'w', 'm': continue default: return false } } return true } // toBinary converts permission to its binary representation. func (p permission) toBinary() int { var perm int for _, c := range p { switch c { case 'r': perm |= canRead case 'w': perm |= canWrite case 'm': perm |= canMknod } } return perm } // union returns a permission which unions p and perm. func (p permission) union(perm permission) permission { return fromBinary(p.toBinary() | perm.toBinary()) } // difference returns a permission which consists of accesses in p and not in perm. func (p permission) difference(perm permission) permission { return fromBinary(p.toBinary() & ^perm.toBinary()) } // fromBinary converts permission to its string representation. func fromBinary(i int) permission { var perm permission if i&canRead == canRead { perm += "r" } if i&canWrite == canWrite { perm += "w" } if i&canMknod == canMknod { perm += "m" } return perm } // +stateify savable type deviceID struct { // Device type, when the type is all, the following fields are ignored. controllerType deviceType // The device's major number. major int64 // The device's minor number. minor int64 } // +stateify savable type devicesController struct { controllerCommon controllerStateless controllerNoResource // mu protects the fields below. mu sync.Mutex `state:"nosave"` // Allow or deny the device rules below. defaultAllow bool deviceRules map[deviceID]permission } // +stateify savable type allowedDevicesData struct { c *devicesController } // Generate implements vfs.DynamicBytesSource.Generate. The devices.allow shows nothing. func (d *allowedDevicesData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *allowedDevicesData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { return d.c.write(ctx, src, offset, true) } // +stateify savable type deniedDevicesData struct { c *devicesController } // Generate implements vfs.DynamicBytesSource.Generate. The devices.deny shows nothing. func (d *deniedDevicesData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } // Write implements vfs.WritableDynamicBytesSource.Write. 
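//
// Accepted input is the cgroup v1 devices syntax handled by write() below:
// either a single "a" (reset all rules), or "<type> <major>:<minor> <perms>"
// such as "c 1:3 rwm", where "*" may stand for the major or minor number.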
func (d *deniedDevicesData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { return d.c.write(ctx, src, offset, false) } // +stateify savable type controlledDevicesData struct { c *devicesController } // Generate implements vfs.DynamicBytesSource.Generate. // // The corresponding devices.list shows devices for which access control is set. func (d *controlledDevicesData) Generate(ctx context.Context, buf *bytes.Buffer) error { return d.c.generate(ctx, buf) } func (c *devicesController) addRule(id deviceID, newPermission permission) error { existingPermission := c.deviceRules[id] c.deviceRules[id] = existingPermission.union(newPermission) return nil } func (c *devicesController) removeRule(id deviceID, p permission) error { // cgroupv1 silently ignores requests to remove a partially-matching wildcard rule, // which are {majorDevice:wildcardDevice}, {wildcardDevice:minorDevice}, and {wildcardDevice:wildcardDevice} for _, wildcardDeviceID := range []deviceID{ {controllerType: id.controllerType, major: id.major, minor: wildcardDeviceNumber}, {controllerType: id.controllerType, major: wildcardDeviceNumber, minor: id.minor}, {controllerType: id.controllerType, major: wildcardDeviceNumber, minor: wildcardDeviceNumber}, } { // If there is an exact match, the permission needs to be updated. if id == wildcardDeviceID { continue } if _, exist := c.deviceRules[wildcardDeviceID]; exist { return nil } } if existingPermission, exist := c.deviceRules[id]; exist { if newPermission := existingPermission.difference(p); len(newPermission) == 0 { delete(c.deviceRules, id) } else { c.deviceRules[id] = newPermission } } return nil } func (c *devicesController) applyRule(id deviceID, p permission, allow bool) error { if !id.controllerType.valid() { return linuxerr.EINVAL } // If the device type is all, it will reset the rules for all. if id.controllerType == wildcardDevice { c.defaultAllow = allow clear(c.deviceRules) return nil } if !p.valid() { return linuxerr.EINVAL } if len(c.deviceRules) == 0 { c.defaultAllow = allow clear(c.deviceRules) } if allow == c.defaultAllow { return c.addRule(id, p) } return c.removeRule(id, p) } func (c *devicesController) generate(ctx context.Context, buf *bytes.Buffer) error { c.mu.Lock() defer c.mu.Unlock() switch { case c.defaultAllow && len(c.deviceRules) > 0: for id, p := range c.deviceRules { buf.WriteString(deviceRuleString(id, p)) // It lists one rule per line. buf.WriteRune('\n') } case c.defaultAllow && len(c.deviceRules) == 0: buf.WriteString(deviceRuleString(deviceID{controllerType: wildcardDevice, major: wildcardDeviceNumber, minor: wildcardDeviceNumber}, "rwm")) case !c.defaultAllow && len(c.deviceRules) == 0: buf.WriteString("") default: // When an allow-all rule is present in devices.list, it actually indicates that // the cgroup is in black-list mode.
buf.WriteString(deviceRuleString(deviceID{controllerType: wildcardDevice, major: wildcardDeviceNumber, minor: wildcardDeviceNumber}, "rwm")) } return nil } func (c *devicesController) write(ctx context.Context, src usermem.IOSequence, offset int64, allow bool) (int64, error) { c.mu.Lock() defer c.mu.Unlock() if src.NumBytes() > hostarch.PageSize { return 0, linuxerr.EINVAL } buf := copyScratchBufferFromContext(ctx, hostarch.PageSize) n, err := src.CopyIn(ctx, buf) if err != nil { return 0, err } rule := string(buf[:n]) fields := strings.FieldsFunc(rule, func(r rune) bool { return r == ' ' || r == ':' }) switch { case len(fields) != 1 && len(fields) != 4: return 0, linuxerr.EINVAL case len(fields) == 4: controllerType := deviceType(fields[0]) perm := permission(fields[3]) if i := strings.IndexFunc(fields[3], func(r rune) bool { return r == '\n' }); i != -1 { perm = perm[:i] } if len(perm) > 3 { perm = perm[:3] } majorDevice, err := toDeviceNumber(fields[1]) if err != nil { return 0, err } minorDevice, err := toDeviceNumber(fields[2]) if err != nil { return 0, err } id := deviceID{ controllerType: controllerType, major: majorDevice, minor: minorDevice, } if err := c.applyRule(id, perm, allow); err != nil { return 0, err } case len(fields) == 1: if deviceType(fields[0]) != wildcardDevice { return 0, linuxerr.EINVAL } if err := c.applyRule(deviceID{controllerType: wildcardDevice}, permission(""), allow); err != nil { return 0, err } } return int64(n), nil } var _ controller = (*devicesController)(nil) func newDevicesController(fs *filesystem) *devicesController { // The root device cgroup starts with rwm to all. c := &devicesController{ defaultAllow: true, deviceRules: make(map[deviceID]permission), } c.controllerCommon.init(kernel.CgroupControllerDevices, fs) return c } // Clone implements controller.Clone. func (c *devicesController) Clone() controller { c.mu.Lock() defer c.mu.Unlock() newRules := make(map[deviceID]permission) for id, p := range c.deviceRules { newRules[id] = p } new := &devicesController{ defaultAllow: c.defaultAllow, deviceRules: newRules, } new.controllerCommon.cloneFromParent(c) return new } // AddControlFiles implements controller.AddControlFiles. func (c *devicesController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) { contents[allowedDevices] = c.fs.newControllerWritableFile(ctx, creds, &allowedDevicesData{c: c}, true) contents[deniedDevices] = c.fs.newControllerWritableFile(ctx, creds, &deniedDevicesData{c: c}, true) contents[controlledDevices] = c.fs.newControllerFile(ctx, creds, &controlledDevicesData{c: c}, true) } func deviceRuleString(id deviceID, p permission) string { return fmt.Sprintf("%s %s:%s %s", id.controllerType, deviceNumber(id.major), deviceNumber(id.minor), p) } // deviceNumber converts a device number to string. func deviceNumber(number int64) string { if number == wildcardDeviceNumber { return "*" } return fmt.Sprint(number) } func toDeviceNumber(s string) (int64, error) { if s == "*" { return wildcardDeviceNumber, nil } val, err := strconv.ParseInt(s, 10, 64) if err != nil { return 0, linuxerr.EINVAL } return val, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/dir_refs.go000066400000000000000000000100251465435605700262140ustar00rootroot00000000000000package cgroupfs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). 
This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const direnableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var dirobj *dir // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type dirRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *dirRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *dirRefs) RefType() string { return fmt.Sprintf("%T", dirobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *dirRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *dirRefs) LogRefs() bool { return direnableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *dirRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *dirRefs) IncRef() { v := r.refCount.Add(1) if direnableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *dirRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if direnableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. 
In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *dirRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if direnableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *dirRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/job.go000066400000000000000000000030511465435605700251720ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroupfs import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // +stateify savable type jobController struct { controllerCommon controllerStateless controllerNoResource id atomicbitops.Int64 } var _ controller = (*jobController)(nil) func newJobController(fs *filesystem) *jobController { c := &jobController{} c.controllerCommon.init(kernel.CgroupControllerJob, fs) return c } // Clone implements controller.Clone. func (c *jobController) Clone() controller { new := &jobController{ id: atomicbitops.FromInt64(c.id.Load()), } new.controllerCommon.cloneFromParent(c) return new } func (c *jobController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) { contents["job.id"] = c.fs.newStubControllerFile(ctx, creds, &c.id, true) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/memory.go000066400000000000000000000122561465435605700257370ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package cgroupfs import ( "bytes" "fmt" "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/usage" ) // +stateify savable type memoryController struct { controllerCommon controllerNoResource limitBytes atomicbitops.Int64 softLimitBytes atomicbitops.Int64 moveChargeAtImmigrate atomicbitops.Int64 pressureLevel int64 // memCg is the memory cgroup for this controller. memCg *memoryCgroup } var _ controller = (*memoryController)(nil) func newMemoryController(fs *filesystem, defaults map[string]int64) *memoryController { c := &memoryController{ // Linux sets these limits to (PAGE_COUNTER_MAX * PAGE_SIZE) by default, // which is ~ 2**63 on a 64-bit system. So essentially, infinity. The // exact value isn't very important. limitBytes: atomicbitops.FromInt64(math.MaxInt64), softLimitBytes: atomicbitops.FromInt64(math.MaxInt64), } consumeDefault := func(name string, valPtr *atomicbitops.Int64) { if val, ok := defaults[name]; ok { valPtr.Store(val) delete(defaults, name) } } consumeDefault("memory.limit_in_bytes", &c.limitBytes) consumeDefault("memory.soft_limit_in_bytes", &c.softLimitBytes) consumeDefault("memory.move_charge_at_immigrate", &c.moveChargeAtImmigrate) c.controllerCommon.init(kernel.CgroupControllerMemory, fs) return c } // Clone implements controller.Clone. func (c *memoryController) Clone() controller { new := &memoryController{ limitBytes: atomicbitops.FromInt64(c.limitBytes.Load()), softLimitBytes: atomicbitops.FromInt64(c.softLimitBytes.Load()), moveChargeAtImmigrate: atomicbitops.FromInt64(c.moveChargeAtImmigrate.Load()), } new.controllerCommon.cloneFromParent(c) return new } // AddControlFiles implements controller.AddControlFiles. func (c *memoryController) AddControlFiles(ctx context.Context, creds *auth.Credentials, cg *cgroupInode, contents map[string]kernfs.Inode) { c.memCg = &memoryCgroup{cg} contents["memory.usage_in_bytes"] = c.fs.newControllerFile(ctx, creds, &memoryUsageInBytesData{memCg: &memoryCgroup{cg}}, true) contents["memory.limit_in_bytes"] = c.fs.newStubControllerFile(ctx, creds, &c.limitBytes, true) contents["memory.soft_limit_in_bytes"] = c.fs.newStubControllerFile(ctx, creds, &c.softLimitBytes, true) contents["memory.move_charge_at_immigrate"] = c.fs.newStubControllerFile(ctx, creds, &c.moveChargeAtImmigrate, true) contents["memory.pressure_level"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.pressureLevel)) } // Enter implements controller.Enter. func (c *memoryController) Enter(t *kernel.Task) { // Update the new cgroup id for the task. t.SetMemCgID(c.memCg.ID()) } // Leave implements controller.Leave. func (c *memoryController) Leave(t *kernel.Task) { // Update the cgroup id for the task to zero. t.SetMemCgID(0) } // PrepareMigrate implements controller.PrepareMigrate. func (c *memoryController) PrepareMigrate(t *kernel.Task, src controller) error { return nil } // CommitMigrate implements controller.CommitMigrate. func (c *memoryController) CommitMigrate(t *kernel.Task, src controller) { // Start tracking t at dst by updating the memCgID. t.SetMemCgID(c.memCg.ID()) } // AbortMigrate implements controller.AbortMigrate. 
func (c *memoryController) AbortMigrate(t *kernel.Task, src controller) {} // +stateify savable type memoryCgroup struct { *cgroupInode } // Collects all the memory cgroup ids for the cgroup. func (memCg *memoryCgroup) collectMemCgIDs(memCgIDs map[uint32]struct{}) { // Add ourselves. memCgIDs[memCg.ID()] = struct{}{} // Add our children. memCg.forEachChildDir(func(d *dir) { cg := memoryCgroup{d.cgi} cg.collectMemCgIDs(memCgIDs) }) } // Returns the memory usage for all cgroup ids in memCgIDs. func getUsage(k *kernel.Kernel, memCgIDs map[uint32]struct{}) uint64 { k.MemoryFile().UpdateUsage(memCgIDs) var totalBytes uint64 for id := range memCgIDs { _, bytes := usage.MemoryAccounting.CopyPerCg(id) totalBytes += bytes } return totalBytes } // +stateify savable type memoryUsageInBytesData struct { memCg *memoryCgroup } // Generate implements vfs.DynamicBytesSource.Generate. func (d *memoryUsageInBytesData) Generate(ctx context.Context, buf *bytes.Buffer) error { k := kernel.KernelFromContext(ctx) memCgIDs := make(map[uint32]struct{}) d.memCg.collectMemCgIDs(memCgIDs) totalBytes := getUsage(k, memCgIDs) fmt.Fprintf(buf, "%d\n", totalBytes) return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/pids.go000066400000000000000000000240611465435605700253630ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroupfs import ( "bytes" "fmt" "strings" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) // pidMaxLimit is the maximum number of pids allowed on a 64-bit system. The // practical limit is much lower. See Linux, include/linux/threads.h. const pidMaxLimit = 4 * 1024 * 1024 const pidLimitUnlimited = pidMaxLimit + 1 // pidsController tracks how many pids are used by tasks in a cgroup. This is // used to limit the number of tasks per cgroup. The limit is enforced only when // new tasks are created via Fork/Clone. Task migrations and limit changes can // cause the current number of pids to exceed the limit. // // A task can charge a PIDs cgroup in two ways: // // 1. A task created prior to the PIDs controller being enabled, or created // through kernel.CreateProcess (i.e. not from userspace) directly add // committed charges via the Enter method. // // 2. A task created through Task.Clone (i.e. userspace fork/clone) first add a // pending charge through the Charge method. This is a temporary reservation // which ensures the cgroup has enough space to allow the task to start. Once // the task startup succeeds, it calls Enter and consumes the reservation. // // +stateify savable type pidsController struct { controllerCommon // isRoot indicates if this is the root cgroup in its hierarchy. 
Immutable // since cgroupfs doesn't allow cross directory renames. isRoot bool // mu protects the fields below. mu pidsControllerMutex `state:"nosave"` // pendingTotal and pendingPool tracks the charge for processes starting // up. During startup, we check if PIDs are available by charging the // cgroup. However, the process actually joins the cgroup as a later point // via Enter. We keep a count of the charges we allocated via Charge, and // use this pool to account for already accounted charges from Enter. // // We also track which task owns the pending charge so we can cancel the // charge if a task creation fails after the Charge call. // // pendingTotal and pendingPool are both protected by mu. pendingTotal int64 pendingPool map[*kernel.Task]int64 // committed represent charges for tasks that have already started and // called Enter. Protected by mu. committed int64 // max is the PID limit for this cgroup. Protected by mu. max int64 } var _ controller = (*pidsController)(nil) // newRootPIDsController creates the root node for a PIDs cgroup. Child // directories should be created through Clone. func newRootPIDsController(fs *filesystem) *pidsController { c := &pidsController{ isRoot: true, max: pidLimitUnlimited, pendingPool: make(map[*kernel.Task]int64), } c.controllerCommon.init(kernel.CgroupControllerPIDs, fs) return c } // Clone implements controller.Clone. func (c *pidsController) Clone() controller { c.mu.Lock() defer c.mu.Unlock() new := &pidsController{ isRoot: false, max: pidLimitUnlimited, pendingPool: make(map[*kernel.Task]int64), } new.controllerCommon.cloneFromParent(c) return new } // AddControlFiles implements controller.AddControlFiles. func (c *pidsController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) { contents["pids.current"] = c.fs.newControllerFile(ctx, creds, &pidsCurrentData{c: c}, true) if !c.isRoot { // "This is not available in the root cgroup for obvious reasons" -- // Linux, Documentation/cgroup-v1/pids.txt. contents["pids.max"] = c.fs.newControllerWritableFile(ctx, creds, &pidsMaxData{c: c}, true) } } // Enter implements controller.Enter. // // Enter attempts to commit a charge from the pending pool. If at least one // charge is pending for t, one pending charge is converted to a committed // charge, and the net change in total charges is zero. If no charge is pending, // a new charge is added directly to the committed pool. func (c *pidsController) Enter(t *kernel.Task) { c.mu.Lock() defer c.mu.Unlock() if pending, ok := c.pendingPool[t]; ok { if pending == 1 { delete(c.pendingPool, t) } else { c.pendingPool[t] = pending - 1 } c.pendingTotal-- if c.pendingTotal < 0 { panic(fmt.Sprintf("cgroupfs: pids controller has negative pending charge: %v\n", c.committed)) } } // Either we're converting a pending charge from above, or generating a new // committed charge directly here. Either way, we don't enforce the limit on // Enter. c.committed++ } // Leave implements controller.Leave. func (c *pidsController) Leave(t *kernel.Task) { c.mu.Lock() defer c.mu.Unlock() if c.committed <= 0 { panic(fmt.Sprintf("cgroupfs: pids controller committed charge underflow on Leave for task %+v", t)) } c.committed-- } // PrepareMigrate implements controller.PrepareMigrate. 
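// Editor's note (illustrative sketch, not upstream documentation): a
// userspace fork first reserves a pid with a positive charge and only later
// joins the cgroup, e.g.
//
//	if err := c.Charge(t, d, kernel.CgroupResourcePID, 1); err != nil {
//		return err // limit reached, fork fails with EAGAIN
//	}
//	// ... task creation ...
//	c.Enter(t) // converts the pending charge into a committed one
//
// If task creation fails after the reservation, the caller cancels it with a
// -1 charge. PrepareMigrate below refuses to migrate a task that still holds
// a pending charge, returning EAGAIN until the task has fully started.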
func (c *pidsController) PrepareMigrate(t *kernel.Task, src controller) error { srcC := src.(*pidsController) srcC.mu.Lock() defer srcC.mu.Unlock() if _, ok := srcC.pendingPool[t]; ok { // Migrating task isn't fully initialized, return transient failure. return linuxerr.EAGAIN } return nil } // CommitMigrate implements controller.CommitMigrate. // // Migrations can cause a cgroup to exceed its limit. CommitMigrate can only be // called for tasks with committed charges, PrepareMigrate will deny migrations // prior to Enter. func (c *pidsController) CommitMigrate(t *kernel.Task, src controller) { // Note: The charge is allowed to exceed max on migration. The charge may // not exceed max when incurred due to a fork/clone, which will call // pidsController.Charge(). c.mu.Lock() c.committed++ c.mu.Unlock() srcC := src.(*pidsController) srcC.mu.Lock() if srcC.committed <= 0 { panic(fmt.Sprintf("cgroupfs: pids controller committed charge underflow on CommitMigrate for task %+v on the source cgroup", t)) } srcC.committed-- srcC.mu.Unlock() } // AbortMigrate implements controller.AbortMigrate. func (c *pidsController) AbortMigrate(t *kernel.Task, src controller) {} // Charge implements controller.Charge. This manipulates the pending // pool. Charge are committed from the pending pool by Enter. The caller is // responsible for ensuring negative charges correspond to previous positive // charges. Negative charges that cause an underflow result in a panic. func (c *pidsController) Charge(t *kernel.Task, d *kernfs.Dentry, res kernel.CgroupResourceType, value int64) error { if res != kernel.CgroupResourcePID { panic(fmt.Sprintf("cgroupfs: pids controller invalid resource type %v", res)) } c.mu.Lock() defer c.mu.Unlock() // Negative charge. if value < 0 { if c.pendingTotal+value < 0 { panic(fmt.Sprintf("cgroupfs: pids controller pending pool would be negative if charge was allowed: current pool: %d, proposed charge: %d, path: %q, task: %p", c.pendingTotal, value, d.FSLocalPath(), t)) } pending, ok := c.pendingPool[t] if !ok { panic(fmt.Sprintf("cgroupfs: pids controller attempted to remove pending charge for Task %p, but task didn't have pending charges, path: %q", t, d.FSLocalPath())) } if pending+value < 0 { panic(fmt.Sprintf("cgroupfs: pids controller attempted to remove pending charge for Task %p, but task didn't have enough pending charges; current charges: %d, proposed charge: %d, path: %q", t, pending, value, d.FSLocalPath())) } c.pendingPool[t] += value c.pendingTotal += value return nil } // Positive charge. new := c.committed + c.pendingTotal + value if new > c.max { log.Debugf("cgroupfs: pids controller charge denied due to limit: path: %q, requested: %d, current: %d (pending: %v, committed: %v), max: %v", d.FSLocalPath(), value, c.committed+c.pendingTotal, c.pendingTotal, c.committed, c.max) return linuxerr.EAGAIN } c.pendingPool[t] += value c.pendingTotal += value return nil } // +stateify savable type pidsCurrentData struct { c *pidsController } // Generate implements vfs.DynamicBytesSource.Generate. func (d *pidsCurrentData) Generate(ctx context.Context, buf *bytes.Buffer) error { d.c.mu.Lock() defer d.c.mu.Unlock() fmt.Fprintf(buf, "%d\n", d.c.committed+d.c.pendingTotal) return nil } // +stateify savable type pidsMaxData struct { c *pidsController } // Generate implements vfs.DynamicBytesSource.Generate. 
func (d *pidsMaxData) Generate(ctx context.Context, buf *bytes.Buffer) error { d.c.mu.Lock() defer d.c.mu.Unlock() if d.c.max > pidMaxLimit { fmt.Fprintf(buf, "max\n") } else { fmt.Fprintf(buf, "%d\n", d.c.max) } return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *pidsMaxData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { return d.WriteBackground(ctx, src) } // WriteBackground implements writableControllerFileImpl.WriteBackground. func (d *pidsMaxData) WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) { buf := copyScratchBufferFromContext(ctx, hostarch.PageSize) ncpy, err := src.CopyIn(ctx, buf) if err != nil { return 0, err } if strings.TrimSpace(string(buf)) == "max" { d.c.mu.Lock() defer d.c.mu.Unlock() d.c.max = pidLimitUnlimited return int64(ncpy), nil } val, n, err := parseInt64FromString(ctx, src) if err != nil { return 0, linuxerr.EINVAL } if val < 0 || val > pidMaxLimit { return 0, linuxerr.EINVAL } d.c.mu.Lock() defer d.c.mu.Unlock() d.c.max = val return int64(n), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/pids_controller_mutex.go000066400000000000000000000033721465435605700310520ustar00rootroot00000000000000package cgroupfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type pidsControllerMutex struct { mu sync.Mutex } var pidsControllerprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var pidsControllerlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type pidsControllerlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *pidsControllerMutex) Lock() { locking.AddGLock(pidsControllerprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *pidsControllerMutex) NestedLock(i pidsControllerlockNameIndex) { locking.AddGLock(pidsControllerprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *pidsControllerMutex) Unlock() { locking.DelGLock(pidsControllerprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *pidsControllerMutex) NestedUnlock(i pidsControllerlockNameIndex) { locking.DelGLock(pidsControllerprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func pidsControllerinitLockNames() {} func init() { pidsControllerinitLockNames() pidsControllerprefixIndex = locking.NewMutexClass(reflect.TypeOf(pidsControllerMutex{}), pidsControllerlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/cgroupfs/task_mutex.go000066400000000000000000000044471465435605700266160ustar00rootroot00000000000000package cgroupfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type taskRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. 
var tasklockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type tasklockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *taskRWMutex) Lock() { locking.AddGLock(taskprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *taskRWMutex) NestedLock(i tasklockNameIndex) { locking.AddGLock(taskprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *taskRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(taskprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *taskRWMutex) NestedUnlock(i tasklockNameIndex) { m.mu.Unlock() locking.DelGLock(taskprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *taskRWMutex) RLock() { locking.AddGLock(taskprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *taskRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(taskprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *taskRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *taskRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *taskRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var taskprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func taskinitLockNames() {} func init() { taskinitLockNames() taskprefixIndex = locking.NewMutexClass(reflect.TypeOf(taskRWMutex{}), tasklockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/dev/000077500000000000000000000000001465435605700230205ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/dev/dev.go000066400000000000000000000146721465435605700241370ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package dev provides a filesystem implementation for /dev. package dev import ( "fmt" "path" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Name is the dev filesystem name. const Name = "dev" // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // Name implements vfs.FilesystemType.Name. 
func (FilesystemType) Name() string { return Name } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fst FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mntns, err := vfsObj.NewMountNamespace(ctx, creds, source /* source */, tmpfs.Name, &vfs.MountOptions{GetFilesystemOptions: vfs.GetFilesystemOptions{ Data: "mode=0755", // opts from drivers/base/devtmpfs.c:devtmpfs_init() }}, nil) if err != nil { return nil, nil, err } defer mntns.DecRef(ctx) root := mntns.Root(ctx) defer root.DecRef(ctx) iopts, _ := opts.InternalData.(InternalData) // If not provided, zero value is OK. // Initialize contents. if err := userspaceInit(ctx, vfsObj, creds, root, iopts.ShmMode); err != nil { return nil, nil, err } if err := vfsObj.ForEachDevice(func(pathname string, kind vfs.DeviceKind, major, minor uint32, perms uint16) error { if pathname == "" { return nil } mode := linux.FileMode(perms) switch kind { case vfs.CharDevice: mode |= linux.S_IFCHR case vfs.BlockDevice: mode |= linux.S_IFBLK default: panic(fmt.Sprintf("invalid DeviceKind: %v", kind)) } return CreateDeviceFile(ctx, vfsObj, creds, root, pathname, major, minor, mode, nil /* uid */, nil /* gid */) }); err != nil { return nil, nil, err } root.Mount().Filesystem().IncRef() root.Dentry().IncRef() return root.Mount().Filesystem(), root.Dentry(), nil } // Release implements vfs.FilesystemType.Release. func (fst *FilesystemType) Release(ctx context.Context) {} // InternalData contains internal data passed in via vfs.GetFilesystemOptions. type InternalData struct { // ShmMode indicates the mode to create the /dev/shm dir with. ShmMode *uint16 } func pathOperationAt(root vfs.VirtualDentry, pathname string) *vfs.PathOperation { return &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(pathname), } } // CreateDeviceFile creates a device special file at the given pathname from root. func CreateDeviceFile(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, pathname string, major, minor uint32, mode linux.FileMode, uid, gid *uint32) error { // Create any parent directories. See // devtmpfs.c:handle_create()=>create_path(). parent := path.Dir(pathname) if err := vfsObj.MkdirAllAt(ctx, parent, root, creds, &vfs.MkdirOptions{ Mode: 0755, }, true /* mustBeDir */); err != nil { return fmt.Errorf("failed to create device parent directory %q: %v", parent, err) } created := true pop := pathOperationAt(root, pathname) if err := vfsObj.MknodAt(ctx, creds, pop, &vfs.MknodOptions{Mode: mode, DevMajor: major, DevMinor: minor}); err != nil { if linuxerr.Equals(linuxerr.EEXIST, err) { // EEXIST is silently ignored; compare // opencontainers/runc:libcontainer/rootfs_linux.go:createDeviceNode(). created = false } else { return fmt.Errorf("failed to create device file at %q: %w", pathname, err) } } if created && (uid != nil || gid != nil) { var opts vfs.SetStatOptions if uid != nil { opts.Stat.Mask |= linux.STATX_UID opts.Stat.UID = *uid } if gid != nil { opts.Stat.Mask |= linux.STATX_GID opts.Stat.GID = *gid } if err := vfsObj.SetStatAt(ctx, creds, pop, &opts); err != nil { return fmt.Errorf("failed to set UID/GID for device file %q: %w", pathname, err) } } return nil } // userspaceInit creates symbolic links and mount points in the devtmpfs // instance that are created by userspace in Linux. It does not create mounts. 
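// Editor's illustration for CreateDeviceFile above (a hypothetical call, not
// part of the original source): creating /dev/null, the 1:3 character
// device, under the devtmpfs root with mode 0666 and default ownership.
// Missing parent directories are created and EEXIST is silently ignored.
//
//	err := CreateDeviceFile(ctx, vfsObj, creds, root, "null", 1, 3,
//		linux.FileMode(linux.S_IFCHR|0666), nil /* uid */, nil /* gid */)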
func userspaceInit(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, shmMode *uint16) error { // Initialize symlinks. for _, symlink := range []struct { source string target string }{ // systemd: src/shared/dev-setup.c:dev_setup() {source: "fd", target: "/proc/self/fd"}, {source: "stdin", target: "/proc/self/fd/0"}, {source: "stdout", target: "/proc/self/fd/1"}, {source: "stderr", target: "/proc/self/fd/2"}, // /proc/kcore is not implemented. // Linux implements /dev/ptmx as a device node, but advises // container implementations to create /dev/ptmx as a symlink // to pts/ptmx (Documentation/filesystems/devpts.txt). Systemd // follows this advice (src/nspawn/nspawn.c:setup_pts()), while // LXC tries to create a bind mount and falls back to a symlink // (src/lxc/conf.c:lxc_setup_devpts()). {source: "ptmx", target: "pts/ptmx"}, } { if err := vfsObj.SymlinkAt(ctx, creds, pathOperationAt(root, symlink.source), symlink.target); err != nil { return fmt.Errorf("failed to create symlink %q => %q: %v", symlink.source, symlink.target, err) } } // systemd: src/core/mount-setup.c:mount_table for _, dir := range []string{ "shm", "pts", } { // "The access mode here doesn't really matter too much, since the // mounted file system will take precedence anyway" // - systemd: src/core/mount-setup.c:mount_one() accessMode := linux.FileMode(0755) if shmMode != nil && dir == "shm" { accessMode = linux.FileMode(*shmMode) } if err := vfsObj.MkdirAt(ctx, creds, pathOperationAt(root, dir), &vfs.MkdirOptions{ Mode: accessMode, }); err != nil { return fmt.Errorf("failed to create directory %q: %v", dir, err) } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/dev/dev_state_autogen.go000066400000000000000000000012341465435605700270470ustar00rootroot00000000000000// automatically generated by stateify. package dev import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (fst *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/dev.FilesystemType" } func (fst *FilesystemType) StateFields() []string { return []string{} } func (fst *FilesystemType) beforeSave() {} // +checklocksignore func (fst *FilesystemType) StateSave(stateSinkObject state.Sink) { fst.beforeSave() } func (fst *FilesystemType) afterLoad(context.Context) {} // +checklocksignore func (fst *FilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func init() { state.Register((*FilesystemType)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devpts/000077500000000000000000000000001465435605700235475ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devpts/devpts.go000066400000000000000000000251571465435605700254150ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package devpts provides a filesystem implementation that behaves like // devpts. 
package devpts import ( "fmt" "math" "sort" "strconv" "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Name is the filesystem name. const Name = "devpts" // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct { initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. initErr error // fs backs all mounts of this FilesystemType. root is fs' root. fs and root // are immutable. fs *vfs.Filesystem root *vfs.Dentry } type fileSystemOpts struct { mode linux.FileMode ptmxMode linux.FileMode uid auth.KUID gid auth.KGID } // Name implements vfs.FilesystemType.Name. func (*FilesystemType) Name() string { return Name } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mopts := vfs.GenericParseMountOptions(opts.Data) fsOpts := fileSystemOpts{ mode: 0555, ptmxMode: 0666, uid: creds.EffectiveKUID, gid: creds.EffectiveKGID, } if modeStr, ok := mopts["mode"]; ok { delete(mopts, "mode") mode, err := strconv.ParseUint(modeStr, 8, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr) return nil, nil, linuxerr.EINVAL } fsOpts.mode = linux.FileMode(mode & 0777) } if modeStr, ok := mopts["ptmxmode"]; ok { delete(mopts, "ptmxmode") mode, err := strconv.ParseUint(modeStr, 8, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid ptmxmode: %q", modeStr) return nil, nil, linuxerr.EINVAL } fsOpts.ptmxMode = linux.FileMode(mode & 0777) } if uidStr, ok := mopts["uid"]; ok { delete(mopts, "uid") uid, err := strconv.ParseUint(uidStr, 10, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr) return nil, nil, linuxerr.EINVAL } kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) if !kuid.Ok() { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) return nil, nil, linuxerr.EINVAL } fsOpts.uid = kuid } if gidStr, ok := mopts["gid"]; ok { delete(mopts, "gid") gid, err := strconv.ParseUint(gidStr, 10, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr) return nil, nil, linuxerr.EINVAL } kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) if !kgid.Ok() { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) return nil, nil, linuxerr.EINVAL } fsOpts.gid = kgid } newinstance := false if _, ok := mopts["newinstance"]; ok { newinstance = true delete(mopts, "newinstance") } if len(mopts) != 0 { ctx.Warningf("devpts.FilesystemType.GetFilesystem: unknown options: %v", mopts) return nil, nil, linuxerr.EINVAL } if newinstance { fs, root, err := fstype.newFilesystem(ctx, vfsObj, creds, fsOpts) if err != nil { return nil, nil, err } return fs.VFSFilesystem(), root.VFSDentry(), nil } fstype.initOnce.Do(func() { fs, root, err := fstype.newFilesystem(ctx, vfsObj, creds, fsOpts) if err != nil { fstype.initErr = err return } fstype.fs = fs.VFSFilesystem() fstype.root = root.VFSDentry() }) if fstype.initErr != nil { return nil, nil, fstype.initErr } fstype.fs.IncRef() fstype.root.IncRef() return fstype.fs, fstype.root, nil } // Release implements vfs.FilesystemType.Release. 
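// Editor's illustration for GetFilesystem above (assumed mount data, not
// upstream text): mode and ptmxmode are parsed as octal, uid/gid as decimal
// and mapped through the caller's user namespace, and "newinstance" requests
// a private devpts instance, so a typical mount might pass data such as
//
//	"newinstance,mode=0620,ptmxmode=0666,gid=5"
//
// Unrecognized options cause GetFilesystem to fail with EINVAL.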
func (fstype *FilesystemType) Release(ctx context.Context) { if fstype.fs != nil { fstype.root.DecRef(ctx) fstype.fs.DecRef(ctx) } } // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 } // newFilesystem creates a new devpts filesystem with root directory and ptmx // master inode. It returns the filesystem and root Dentry. func (fstype *FilesystemType) newFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, opts fileSystemOpts) (*filesystem, *kernfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } fs := &filesystem{ devMinor: devMinor, } fs.Filesystem.VFSFilesystem().Init(vfsObj, fstype, fs) // Construct the root directory. This is always inode id 1. root := &rootInode{ replicas: make(map[uint32]*replicaInode), } root.InodeAttrs.InitWithIDs(ctx, opts.uid, opts.gid, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|opts.mode) root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) root.InitRefs() var rootD kernfs.Dentry rootD.InitRoot(&fs.Filesystem, root) // Construct the pts master inode and dentry. Linux always uses inode // id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx. master := &masterInode{ root: root, } master.InodeAttrs.InitWithIDs(ctx, opts.uid, opts.gid, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|opts.ptmxMode) // Add the master as a child of the root. links := root.OrderedChildren.Populate(map[string]kernfs.Inode{ "ptmx": master, }) root.IncLinks(links) return fs, &rootD, nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return "" } // rootInode is the root directory inode for the devpts mounts. // // +stateify savable type rootInode struct { implStatFS kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotAnonymous kernfs.InodeNotSymlink kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. kernfs.InodeWatches kernfs.OrderedChildren rootInodeRefs locks vfs.FileLocks // master is the master pty inode. Immutable. master *masterInode // mu protects the fields below. mu sync.Mutex `state:"nosave"` // replicas maps pty ids to replica inodes. replicas map[uint32]*replicaInode // nextIdx is the next pty index to use. Must be accessed atomically. // // TODO(b/29356795): reuse indices when ptys are closed. nextIdx uint32 } var _ kernfs.Inode = (*rootInode)(nil) // allocateTerminal creates a new Terminal and installs a pts node for it. func (i *rootInode) allocateTerminal(ctx context.Context, creds *auth.Credentials) (*Terminal, error) { i.mu.Lock() defer i.mu.Unlock() if i.nextIdx == math.MaxUint32 { return nil, linuxerr.ENOMEM } idx := i.nextIdx i.nextIdx++ // Sanity check that replica with idx does not exist. if _, ok := i.replicas[idx]; ok { panic(fmt.Sprintf("pty index collision; index %d already exists", idx)) } // Create the new terminal and replica. t := newTerminal(idx) replica := &replicaInode{ root: i, t: t, } // Linux always uses pty index + 3 as the inode id. See // fs/devpts/inode.c:devpts_pty_new(). 
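// For example (editor's note, not upstream text): the first Terminal
// allocated here gets idx 0, so its replica appears as /dev/pts/0 with inode
// id 3, while the root directory and ptmx created in newFilesystem keep
// inode ids 1 and 2 respectively.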
replica.InodeAttrs.Init(ctx, creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) i.replicas[idx] = replica return t, nil } // masterClose is called when the master end of t is closed. func (i *rootInode) masterClose(ctx context.Context, t *Terminal) { i.mu.Lock() defer i.mu.Unlock() // Sanity check that replica with idx exists. ri, ok := i.replicas[t.n] if !ok { panic(fmt.Sprintf("pty with index %d does not exist", t.n)) } // Drop the ref on replica inode taken during rootInode.allocateTerminal. ri.DecRef(ctx) delete(i.replicas, t.n) } // Open implements kernfs.Inode.Open. func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndStaticEntries, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // Lookup implements kernfs.Inode.Lookup. func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { // Check if a static entry was looked up. if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil { return d, nil } // Not a static entry. idx, err := strconv.ParseUint(name, 10, 32) if err != nil { return nil, linuxerr.ENOENT } i.mu.Lock() defer i.mu.Unlock() if ri, ok := i.replicas[uint32(idx)]; ok { ri.IncRef() // This ref is passed to the dentry upon creation via Init. return ri, nil } return nil, linuxerr.ENOENT } // IterDirents implements kernfs.Inode.IterDirents. func (i *rootInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { i.mu.Lock() defer i.mu.Unlock() i.InodeAttrs.TouchAtime(ctx, mnt) if relOffset >= int64(len(i.replicas)) { return offset, nil } ids := make([]int, 0, len(i.replicas)) for id := range i.replicas { ids = append(ids, int(id)) } sort.Ints(ids) for _, id := range ids[relOffset:] { dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(id), 10), Type: linux.DT_CHR, Ino: i.replicas[uint32(id)].InodeAttrs.Ino(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { return offset, err } offset++ } return offset, nil } // DecRef implements kernfs.Inode.DecRef. func (i *rootInode) DecRef(ctx context.Context) { i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // +stateify savable type implStatFS struct{} // StatFS implements kernfs.Inode.StatFS. func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.DEVPTS_SUPER_MAGIC), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devpts/devpts_state_autogen.go000066400000000000000000000336241465435605700303350ustar00rootroot00000000000000// automatically generated by stateify. 
package devpts import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (fstype *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.FilesystemType" } func (fstype *FilesystemType) StateFields() []string { return []string{ "initErr", "fs", "root", } } func (fstype *FilesystemType) beforeSave() {} // +checklocksignore func (fstype *FilesystemType) StateSave(stateSinkObject state.Sink) { fstype.beforeSave() stateSinkObject.Save(0, &fstype.initErr) stateSinkObject.Save(1, &fstype.fs) stateSinkObject.Save(2, &fstype.root) } func (fstype *FilesystemType) afterLoad(context.Context) {} // +checklocksignore func (fstype *FilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fstype.initErr) stateSourceObject.Load(1, &fstype.fs) stateSourceObject.Load(2, &fstype.root) } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "Filesystem", "devMinor", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.Filesystem) stateSinkObject.Save(1, &fs.devMinor) } func (fs *filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.Filesystem) stateSourceObject.Load(1, &fs.devMinor) } func (i *rootInode) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.rootInode" } func (i *rootInode) StateFields() []string { return []string{ "implStatFS", "InodeAlwaysValid", "InodeAttrs", "InodeDirectoryNoNewChildren", "InodeNotAnonymous", "InodeNotSymlink", "InodeTemporary", "InodeWatches", "OrderedChildren", "rootInodeRefs", "locks", "master", "replicas", "nextIdx", } } func (i *rootInode) beforeSave() {} // +checklocksignore func (i *rootInode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.implStatFS) stateSinkObject.Save(1, &i.InodeAlwaysValid) stateSinkObject.Save(2, &i.InodeAttrs) stateSinkObject.Save(3, &i.InodeDirectoryNoNewChildren) stateSinkObject.Save(4, &i.InodeNotAnonymous) stateSinkObject.Save(5, &i.InodeNotSymlink) stateSinkObject.Save(6, &i.InodeTemporary) stateSinkObject.Save(7, &i.InodeWatches) stateSinkObject.Save(8, &i.OrderedChildren) stateSinkObject.Save(9, &i.rootInodeRefs) stateSinkObject.Save(10, &i.locks) stateSinkObject.Save(11, &i.master) stateSinkObject.Save(12, &i.replicas) stateSinkObject.Save(13, &i.nextIdx) } func (i *rootInode) afterLoad(context.Context) {} // +checklocksignore func (i *rootInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.implStatFS) stateSourceObject.Load(1, &i.InodeAlwaysValid) stateSourceObject.Load(2, &i.InodeAttrs) stateSourceObject.Load(3, &i.InodeDirectoryNoNewChildren) stateSourceObject.Load(4, &i.InodeNotAnonymous) stateSourceObject.Load(5, &i.InodeNotSymlink) stateSourceObject.Load(6, &i.InodeTemporary) stateSourceObject.Load(7, &i.InodeWatches) stateSourceObject.Load(8, &i.OrderedChildren) stateSourceObject.Load(9, &i.rootInodeRefs) stateSourceObject.Load(10, &i.locks) stateSourceObject.Load(11, &i.master) stateSourceObject.Load(12, &i.replicas) stateSourceObject.Load(13, &i.nextIdx) } func (i *implStatFS) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.implStatFS" } func (i *implStatFS) StateFields() []string { return []string{} } func (i *implStatFS) 
beforeSave() {} // +checklocksignore func (i *implStatFS) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i *implStatFS) afterLoad(context.Context) {} // +checklocksignore func (i *implStatFS) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (l *lineDiscipline) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.lineDiscipline" } func (l *lineDiscipline) StateFields() []string { return []string{ "size", "inQueue", "outQueue", "termios", "column", "numReplicas", "masterWaiter", "replicaWaiter", "terminal", } } func (l *lineDiscipline) beforeSave() {} // +checklocksignore func (l *lineDiscipline) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.size) stateSinkObject.Save(1, &l.inQueue) stateSinkObject.Save(2, &l.outQueue) stateSinkObject.Save(3, &l.termios) stateSinkObject.Save(4, &l.column) stateSinkObject.Save(5, &l.numReplicas) stateSinkObject.Save(6, &l.masterWaiter) stateSinkObject.Save(7, &l.replicaWaiter) stateSinkObject.Save(8, &l.terminal) } func (l *lineDiscipline) afterLoad(context.Context) {} // +checklocksignore func (l *lineDiscipline) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.size) stateSourceObject.Load(1, &l.inQueue) stateSourceObject.Load(2, &l.outQueue) stateSourceObject.Load(3, &l.termios) stateSourceObject.Load(4, &l.column) stateSourceObject.Load(5, &l.numReplicas) stateSourceObject.Load(6, &l.masterWaiter) stateSourceObject.Load(7, &l.replicaWaiter) stateSourceObject.Load(8, &l.terminal) } func (o *outputQueueTransformer) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.outputQueueTransformer" } func (o *outputQueueTransformer) StateFields() []string { return []string{} } func (o *outputQueueTransformer) beforeSave() {} // +checklocksignore func (o *outputQueueTransformer) StateSave(stateSinkObject state.Sink) { o.beforeSave() } func (o *outputQueueTransformer) afterLoad(context.Context) {} // +checklocksignore func (o *outputQueueTransformer) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (i *inputQueueTransformer) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.inputQueueTransformer" } func (i *inputQueueTransformer) StateFields() []string { return []string{} } func (i *inputQueueTransformer) beforeSave() {} // +checklocksignore func (i *inputQueueTransformer) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i *inputQueueTransformer) afterLoad(context.Context) {} // +checklocksignore func (i *inputQueueTransformer) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (mi *masterInode) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.masterInode" } func (mi *masterInode) StateFields() []string { return []string{ "implStatFS", "InodeAttrs", "InodeNoopRefCount", "InodeNotAnonymous", "InodeNotDirectory", "InodeNotSymlink", "InodeWatches", "locks", "root", } } func (mi *masterInode) beforeSave() {} // +checklocksignore func (mi *masterInode) StateSave(stateSinkObject state.Sink) { mi.beforeSave() stateSinkObject.Save(0, &mi.implStatFS) stateSinkObject.Save(1, &mi.InodeAttrs) stateSinkObject.Save(2, &mi.InodeNoopRefCount) stateSinkObject.Save(3, &mi.InodeNotAnonymous) stateSinkObject.Save(4, &mi.InodeNotDirectory) stateSinkObject.Save(5, &mi.InodeNotSymlink) stateSinkObject.Save(6, &mi.InodeWatches) stateSinkObject.Save(7, &mi.locks) stateSinkObject.Save(8, &mi.root) } func (mi *masterInode) afterLoad(context.Context) {} // +checklocksignore func (mi 
*masterInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mi.implStatFS) stateSourceObject.Load(1, &mi.InodeAttrs) stateSourceObject.Load(2, &mi.InodeNoopRefCount) stateSourceObject.Load(3, &mi.InodeNotAnonymous) stateSourceObject.Load(4, &mi.InodeNotDirectory) stateSourceObject.Load(5, &mi.InodeNotSymlink) stateSourceObject.Load(6, &mi.InodeWatches) stateSourceObject.Load(7, &mi.locks) stateSourceObject.Load(8, &mi.root) } func (mfd *masterFileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.masterFileDescription" } func (mfd *masterFileDescription) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "LockFD", "inode", "t", } } func (mfd *masterFileDescription) beforeSave() {} // +checklocksignore func (mfd *masterFileDescription) StateSave(stateSinkObject state.Sink) { mfd.beforeSave() stateSinkObject.Save(0, &mfd.vfsfd) stateSinkObject.Save(1, &mfd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &mfd.LockFD) stateSinkObject.Save(3, &mfd.inode) stateSinkObject.Save(4, &mfd.t) } func (mfd *masterFileDescription) afterLoad(context.Context) {} // +checklocksignore func (mfd *masterFileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mfd.vfsfd) stateSourceObject.Load(1, &mfd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &mfd.LockFD) stateSourceObject.Load(3, &mfd.inode) stateSourceObject.Load(4, &mfd.t) } func (q *queue) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.queue" } func (q *queue) StateFields() []string { return []string{ "readBuf", "waitBuf", "waitBufLen", "readable", "transformer", } } func (q *queue) beforeSave() {} // +checklocksignore func (q *queue) StateSave(stateSinkObject state.Sink) { q.beforeSave() stateSinkObject.Save(0, &q.readBuf) stateSinkObject.Save(1, &q.waitBuf) stateSinkObject.Save(2, &q.waitBufLen) stateSinkObject.Save(3, &q.readable) stateSinkObject.Save(4, &q.transformer) } func (q *queue) afterLoad(context.Context) {} // +checklocksignore func (q *queue) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &q.readBuf) stateSourceObject.Load(1, &q.waitBuf) stateSourceObject.Load(2, &q.waitBufLen) stateSourceObject.Load(3, &q.readable) stateSourceObject.Load(4, &q.transformer) } func (ri *replicaInode) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.replicaInode" } func (ri *replicaInode) StateFields() []string { return []string{ "implStatFS", "InodeAttrs", "InodeNoopRefCount", "InodeNotAnonymous", "InodeNotDirectory", "InodeNotSymlink", "InodeWatches", "locks", "root", "t", } } func (ri *replicaInode) beforeSave() {} // +checklocksignore func (ri *replicaInode) StateSave(stateSinkObject state.Sink) { ri.beforeSave() stateSinkObject.Save(0, &ri.implStatFS) stateSinkObject.Save(1, &ri.InodeAttrs) stateSinkObject.Save(2, &ri.InodeNoopRefCount) stateSinkObject.Save(3, &ri.InodeNotAnonymous) stateSinkObject.Save(4, &ri.InodeNotDirectory) stateSinkObject.Save(5, &ri.InodeNotSymlink) stateSinkObject.Save(6, &ri.InodeWatches) stateSinkObject.Save(7, &ri.locks) stateSinkObject.Save(8, &ri.root) stateSinkObject.Save(9, &ri.t) } func (ri *replicaInode) afterLoad(context.Context) {} // +checklocksignore func (ri *replicaInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ri.implStatFS) stateSourceObject.Load(1, &ri.InodeAttrs) stateSourceObject.Load(2, &ri.InodeNoopRefCount) stateSourceObject.Load(3, 
&ri.InodeNotAnonymous) stateSourceObject.Load(4, &ri.InodeNotDirectory) stateSourceObject.Load(5, &ri.InodeNotSymlink) stateSourceObject.Load(6, &ri.InodeWatches) stateSourceObject.Load(7, &ri.locks) stateSourceObject.Load(8, &ri.root) stateSourceObject.Load(9, &ri.t) } func (rfd *replicaFileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.replicaFileDescription" } func (rfd *replicaFileDescription) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "LockFD", "inode", } } func (rfd *replicaFileDescription) beforeSave() {} // +checklocksignore func (rfd *replicaFileDescription) StateSave(stateSinkObject state.Sink) { rfd.beforeSave() stateSinkObject.Save(0, &rfd.vfsfd) stateSinkObject.Save(1, &rfd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &rfd.LockFD) stateSinkObject.Save(3, &rfd.inode) } func (rfd *replicaFileDescription) afterLoad(context.Context) {} // +checklocksignore func (rfd *replicaFileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rfd.vfsfd) stateSourceObject.Load(1, &rfd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &rfd.LockFD) stateSourceObject.Load(3, &rfd.inode) } func (r *rootInodeRefs) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.rootInodeRefs" } func (r *rootInodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *rootInodeRefs) beforeSave() {} // +checklocksignore func (r *rootInodeRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *rootInodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (t *Terminal) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.Terminal" } func (t *Terminal) StateFields() []string { return []string{ "n", "ld", "masterKTTY", "replicaKTTY", } } func (t *Terminal) beforeSave() {} // +checklocksignore func (t *Terminal) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.n) stateSinkObject.Save(1, &t.ld) stateSinkObject.Save(2, &t.masterKTTY) stateSinkObject.Save(3, &t.replicaKTTY) } func (t *Terminal) afterLoad(context.Context) {} // +checklocksignore func (t *Terminal) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.n) stateSourceObject.Load(1, &t.ld) stateSourceObject.Load(2, &t.masterKTTY) stateSourceObject.Load(3, &t.replicaKTTY) } func init() { state.Register((*FilesystemType)(nil)) state.Register((*filesystem)(nil)) state.Register((*rootInode)(nil)) state.Register((*implStatFS)(nil)) state.Register((*lineDiscipline)(nil)) state.Register((*outputQueueTransformer)(nil)) state.Register((*inputQueueTransformer)(nil)) state.Register((*masterInode)(nil)) state.Register((*masterFileDescription)(nil)) state.Register((*queue)(nil)) state.Register((*replicaInode)(nil)) state.Register((*replicaFileDescription)(nil)) state.Register((*rootInodeRefs)(nil)) state.Register((*Terminal)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devpts/line_discipline.go000066400000000000000000000453011465435605700272330ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package devpts import ( "bytes" "unicode" "unicode/utf8" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const ( // canonMaxBytes is the number of bytes that fit into a single line of // terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE // in include/linux/tty.h. canonMaxBytes = 4096 // nonCanonMaxBytes is the maximum number of bytes that can be read at // a time in noncanonical mode. nonCanonMaxBytes = canonMaxBytes - 1 spacesPerTab = 8 ) // lineDiscipline dictates how input and output are handled between the // pseudoterminal (pty) master and replica. It can be configured to alter I/O, // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man // pages are good resources for how to affect the line discipline: // // - termios(3) // - tty_ioctl(4) // // This file corresponds most closely to drivers/tty/n_tty.c. // // lineDiscipline has a simple structure but supports a multitude of options // (see the above man pages). It consists of two queues of bytes: one from the // terminal master to replica (the input queue) and one from replica to master // (the output queue). When bytes are written to one end of the pty, the line // discipline reads the bytes, modifies them or takes special action if // required, and enqueues them to be read by the other end of the pty: // // input from terminal +-------------+ input to process (e.g. bash) // +------------------------>| input queue |---------------------------+ // | (inputQueueWrite) +-------------+ (inputQueueRead) | // | | // | v // // masterFD replicaFD // // ^ | // | | // | output to terminal +--------------+ output from process | // +------------------------| output queue |<--------------------------+ // (outputQueueRead) +--------------+ (outputQueueWrite) // // There is special handling for the ECHO option, where bytes written to the // input queue are also output back to the terminal by being written to // l.outQueue by the input queue transformer. // // Lock order: // // termiosMu // inQueue.mu // outQueue.mu // // +stateify savable type lineDiscipline struct { // sizeMu protects size. sizeMu sync.Mutex `state:"nosave"` // size is the terminal size (width and height). size linux.WindowSize // inQueue is the input queue of the terminal. inQueue queue // outQueue is the output queue of the terminal. outQueue queue // termiosMu protects termios. termiosMu sync.RWMutex `state:"nosave"` // termios is the terminal configuration used by the lineDiscipline. termios linux.KernelTermios // column is the location in a row of the cursor. This is important for // handling certain special characters like backspace. column int // numReplicas is the number of replica file descriptors. numReplicas int // masterWaiter is used to wait on the master end of the TTY. masterWaiter waiter.Queue // replicaWaiter is used to wait on the replica end of the TTY. 
masterWaiter waiter.Queue // replicaWaiter is used to wait on the replica end of the TTY.
replicaWaiter waiter.Queue // terminal is the terminal linked to this lineDiscipline. terminal *Terminal } func newLineDiscipline(termios linux.KernelTermios, terminal *Terminal) *lineDiscipline { ld := lineDiscipline{ termios: termios, terminal: terminal, } ld.inQueue.transformer = &inputQueueTransformer{} ld.outQueue.transformer = &outputQueueTransformer{} return &ld } // getTermios gets the linux.Termios for the tty. func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() // We must copy a Termios struct, not KernelTermios. t := l.termios.ToTermios() _, err := t.CopyOut(task, args[2].Pointer()) return 0, err } // setTermios sets a linux.Termios for the tty. func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.Lock() oldCanonEnabled := l.termios.LEnabled(linux.ICANON) // We must copy a Termios struct, not KernelTermios. var t linux.Termios _, err := t.CopyIn(task, args[2].Pointer()) l.termios.FromTermios(t) // If canonical mode is turned off, move bytes from inQueue's wait // buffer to its read buffer. Anything already in the read buffer is // now readable. if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) { l.inQueue.mu.Lock() l.inQueue.pushWaitBufLocked(l) l.inQueue.readable = len(l.inQueue.readBuf) > 0 l.inQueue.mu.Unlock() l.termiosMu.Unlock() l.replicaWaiter.Notify(waiter.ReadableEvents) } else { l.termiosMu.Unlock() } return 0, err } func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() _, err := l.size.CopyOut(t, args[2].Pointer()) return err } func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() _, err := l.size.CopyIn(t, args[2].Pointer()) return err } func (l *lineDiscipline) masterReadiness() waiter.EventMask { // The master termios is immutable so termiosMu is not needed. 
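	// In poll(2) terms: the master is readable once the output queue holds
	// processed data from the replica side, writable while the input queue's
	// wait buffer still has room, and reports EventHUp once every replica
	// file descriptor has been closed.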
res := l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios) l.termiosMu.RLock() if l.numReplicas == 0 { res |= waiter.EventHUp } l.termiosMu.RUnlock() return res } func (l *lineDiscipline) replicaReadiness() waiter.EventMask { l.termiosMu.RLock() defer l.termiosMu.RUnlock() return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) } func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { return l.inQueue.readableSize(t, io, args) } func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.termiosMu.RLock() n, pushed, notifyEcho, err := l.inQueue.read(ctx, dst, l) isCanon := l.termios.LEnabled(linux.ICANON) l.termiosMu.RUnlock() if err != nil { return 0, err } if n > 0 { if notifyEcho { l.masterWaiter.Notify(waiter.ReadableEvents | waiter.WritableEvents) } else { l.masterWaiter.Notify(waiter.WritableEvents) } if pushed { l.replicaWaiter.Notify(waiter.ReadableEvents) } return n, nil } if notifyEcho { l.masterWaiter.Notify(waiter.ReadableEvents) } if !pushed && isCanon { return 0, nil // EOF } return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.termiosMu.RLock() n, notifyEcho, err := l.inQueue.write(ctx, src, l) l.termiosMu.RUnlock() if err != nil { return 0, err } if notifyEcho { l.masterWaiter.Notify(waiter.ReadableEvents) } if n > 0 { l.replicaWaiter.Notify(waiter.ReadableEvents) return n, nil } return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { return l.outQueue.readableSize(t, io, args) } func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.termiosMu.RLock() // Ignore notifyEcho, as it cannot happen when reading from the output queue. n, pushed, _, err := l.outQueue.read(ctx, dst, l) l.termiosMu.RUnlock() if err != nil { return 0, err } if n > 0 { l.replicaWaiter.Notify(waiter.WritableEvents) if pushed { l.masterWaiter.Notify(waiter.ReadableEvents) } return n, nil } return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.termiosMu.RLock() // Ignore notifyEcho, as it cannot happen when writing to the output queue. n, _, err := l.outQueue.write(ctx, src, l) l.termiosMu.RUnlock() if err != nil { return 0, err } l.masterWaiter.Notify(waiter.ReadableEvents) return n, nil } // replicaOpen is called when a replica file descriptor is opened. func (l *lineDiscipline) replicaOpen() { l.termiosMu.Lock() defer l.termiosMu.Unlock() l.numReplicas++ } // replicaClose is called when a replica file descriptor is closed. func (l *lineDiscipline) replicaClose() { l.termiosMu.Lock() l.numReplicas-- notify := l.numReplicas == 0 l.termiosMu.Unlock() if notify { l.masterWaiter.Notify(waiter.EventHUp) } } // transformer is a helper interface to make it easier to stateify queue. type transformer interface { // transform functions require queue's mutex to be held. // The boolean indicates whether there was any echoed bytes. transform(*lineDiscipline, *queue, []byte) (int, bool) } // outputQueueTransformer implements transformer. It performs line discipline // transformations on the output queue. // // +stateify savable type outputQueueTransformer struct{} // transform does output processing for one end of the pty. 
See // drivers/tty/n_tty.c:do_output_char for an analogous kernel function. // // Preconditions: // - l.termiosMu must be held for reading. // - q.mu must be held. func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) (int, bool) { // transformOutput is effectively always in noncanonical mode, as the // master termios never has ICANON set. sizeBudget := nonCanonMaxBytes - len(q.readBuf) if sizeBudget <= 0 { return 0, false } if !l.termios.OEnabled(linux.OPOST) { copySize := min(len(buf), sizeBudget) q.readBuf = append(q.readBuf, buf[:copySize]...) if len(q.readBuf) > 0 { q.readable = true } return copySize, false } var ret int Outer: for ; len(buf) > 0 && sizeBudget > 0; sizeBudget = nonCanonMaxBytes - len(q.readBuf) { size := l.peek(buf) if size > sizeBudget { break Outer } cBytes := append([]byte{}, buf[:size]...) buf = buf[size:] // We're guaranteed that cBytes has at least one element. cByteSwitch: switch cBytes[0] { case '\n': if l.termios.OEnabled(linux.ONLRET) { l.column = 0 } if l.termios.OEnabled(linux.ONLCR) { if sizeBudget < 2 { break Outer } ret += size q.readBuf = append(q.readBuf, '\r', '\n') continue Outer } case '\r': if l.termios.OEnabled(linux.ONOCR) && l.column == 0 { // Treat the carriage return as processed, since it's a no-op. ret += size continue Outer } if l.termios.OEnabled(linux.OCRNL) { cBytes[0] = '\n' if l.termios.OEnabled(linux.ONLRET) { l.column = 0 } break cByteSwitch } l.column = 0 case '\t': spaces := spacesPerTab - l.column%spacesPerTab if l.termios.OutputFlags&linux.TABDLY == linux.XTABS { if sizeBudget < spacesPerTab { break Outer } ret += size l.column += spaces q.readBuf = append(q.readBuf, bytes.Repeat([]byte{' '}, spacesPerTab)...) continue Outer } l.column += spaces case '\b': if l.column > 0 { l.column-- } default: l.column++ } ret += size q.readBuf = append(q.readBuf, cBytes...) } if len(q.readBuf) > 0 { q.readable = true } return ret, false } // inputQueueTransformer implements transformer. It performs line discipline // transformations on the input queue. // // +stateify savable type inputQueueTransformer struct{} // transform does input processing for one end of the pty. Characters read are // transformed according to flags set in the termios struct. See // drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel // function. // It returns an extra boolean indicating whether any characters need to be // echoed, in which case we need to notify readers. // // Preconditions: // - l.termiosMu must be held for reading. // - q.mu must be held. func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) (int, bool) { // If there's a line waiting to be read in canonical mode, don't write // anything else to the read buffer. if l.termios.LEnabled(linux.ICANON) && q.readable { return 0, false } maxBytes := nonCanonMaxBytes if l.termios.LEnabled(linux.ICANON) { maxBytes = canonMaxBytes } var ret int var notifyEcho bool for len(buf) > 0 && len(q.readBuf) < canonMaxBytes { size := l.peek(buf) cBytes := append([]byte{}, buf[:size]...) // We're guaranteed that cBytes has at least one element. 
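		// Illustrative examples of the per-character handling below: with
		// ICRNL set, a '\r' arriving from the master is rewritten to '\n';
		// the VINTR control character (ctrl-C by default) causes SIGINT to
		// be delivered to the foreground process group of the replica end;
		// and in canonical mode VERASE (backspace) removes the last byte
		// (or, with IUTF8, the last rune) from the read buffer.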
switch cBytes[0] { case '\r': if l.termios.IEnabled(linux.IGNCR) { buf = buf[size:] ret += size continue } if l.termios.IEnabled(linux.ICRNL) { cBytes[0] = '\n' } case '\n': if l.termios.IEnabled(linux.INLCR) { cBytes[0] = '\r' } case l.termios.ControlCharacters[linux.VINTR]: // ctrl-c // The input queue is reading from the master TTY and // writing to the replica TTY which is connected to the // interactive program (like bash). We want to send the // signal the process connected to the replica TTY. l.terminal.replicaKTTY.SignalForegroundProcessGroup(kernel.SignalInfoPriv(linux.SIGINT)) case l.termios.ControlCharacters[linux.VSUSP]: // ctrl-z l.terminal.replicaKTTY.SignalForegroundProcessGroup(kernel.SignalInfoPriv(linux.SIGTSTP)) case l.termios.ControlCharacters[linux.VQUIT]: // ctrl-\ l.terminal.replicaKTTY.SignalForegroundProcessGroup(kernel.SignalInfoPriv(linux.SIGQUIT)) // In canonical mode, some characters need to be handled specially; for example, backspace. // This roughly aligns with n_tty.c:n_tty_receive_char_canon and n_tty.c:eraser // cBytes[0] == ControlCharacters[linux.VKILL] is also handled by n_tty.c:eraser, but this isn't implemented case l.termios.ControlCharacters[linux.VWERASE]: if !l.termios.LEnabled(linux.IEXTEN) { break } fallthrough case l.termios.ControlCharacters[linux.VERASE]: if !l.termios.LEnabled(linux.ICANON) { break } c := cBytes[0] killType := linux.VERASE if c == l.termios.ControlCharacters[linux.VWERASE] { killType = linux.VWERASE } seenAlphanumeric := false for len(q.readBuf) > 0 { // Erase a character. If IUTF8 is enabled, erase an entire multibyte unicode character. var toErase byte cnt := 0 isContinuationByte := true for ; cnt < len(q.readBuf) && isContinuationByte; cnt++ { toErase = q.readBuf[len(q.readBuf)-cnt-1] isContinuationByte = l.termios.IEnabled(linux.IUTF8) && (toErase&0xc0) == 0x80 } if isContinuationByte { // Do not partially erase a multibyte unicode character. break } // VWERASE will continue erasing characters until we encounter the first non-alphanumeric character // that follows some alphanumeric character. We consider "_" to be alphanumeric. if killType == linux.VWERASE { if unicode.IsLetter(rune(toErase)) || unicode.IsDigit(rune(toErase)) || toErase == '_' { seenAlphanumeric = true } else if seenAlphanumeric { break } } q.readBuf = q.readBuf[:len(q.readBuf)-cnt] if l.termios.LEnabled(linux.ECHO) { if l.termios.LEnabled(linux.ECHOPRT) { // Not implemented } else if killType == linux.VERASE && !l.termios.LEnabled(linux.ECHOE) { // Not implemented } else if toErase == '\t' { // Not implemented } else { const unicodeDelete byte = 0x7f isCtrl := toErase < 0x20 || toErase == unicodeDelete echoctl := l.termios.LEnabled(linux.ECHOCTL) charsToDelete := 1 if isCtrl { // echoctl controls how we echo control characters, which also determines how we delete them. if echoctl { // echoctl echoes control characters as ^X, so we need to erase two characters. charsToDelete = 2 } else { // if echoctl is disabled, we don't echo control characters so we don't have to erase anything. charsToDelete = 0 } } for i := 0; i < charsToDelete; i++ { // Linux's kernel does character deletion with this sequence // of bytes, presumably because some older terminals don't erase // characters with \b, so we need to "erase" the old character // by writing a space over it. 
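						// For example, erasing an echoed ctrl-C with ECHOCTL enabled
						// (it was echoed as the two characters "^C") runs this loop
						// twice, sending "\b \b\b \b" to the master so that both
						// on-screen characters are wiped out.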
l.outQueue.writeBytes([]byte{'\b', ' ', '\b'}, l) } } } // VERASE only erases a single character if killType == linux.VERASE { break } } buf = buf[1:] ret += 1 notifyEcho = true continue } // In canonical mode, we discard non-terminating characters // after the first 4095. if l.shouldDiscard(q, cBytes) { buf = buf[size:] ret += size continue } // Stop if the buffer would be overfilled. if len(q.readBuf)+size > maxBytes { break } buf = buf[size:] ret += size // If we get EOF, make the buffer available for reading. if l.termios.LEnabled(linux.ICANON) && l.termios.IsEOF(cBytes[0]) { q.readable = true break } q.readBuf = append(q.readBuf, cBytes...) // Anything written to the readBuf will have to be echoed. if l.termios.LEnabled(linux.ECHO) { l.outQueue.writeBytes(cBytes, l) notifyEcho = true } // If we finish a line, make it available for reading. if l.termios.LEnabled(linux.ICANON) && l.termios.IsTerminating(cBytes) { q.readable = true break } } // In noncanonical mode, everything is readable. if !l.termios.LEnabled(linux.ICANON) && len(q.readBuf) > 0 { q.readable = true } return ret, notifyEcho } // shouldDiscard returns whether c should be discarded. In canonical mode, if // too many bytes are enqueued, we keep reading input and discarding it until // we find a terminating character. Signal/echo processing still occurs. // // Precondition: // - l.termiosMu must be held for reading. // - q.mu must be held. func (l *lineDiscipline) shouldDiscard(q *queue, cBytes []byte) bool { return l.termios.LEnabled(linux.ICANON) && len(q.readBuf)+len(cBytes) >= canonMaxBytes && !l.termios.IsTerminating(cBytes) } // peek returns the size in bytes of the next character to process. As long as // b isn't empty, peek returns a value of at least 1. func (l *lineDiscipline) peek(b []byte) int { size := 1 // If UTF-8 support is enabled, runes might be multiple bytes. if l.termios.IEnabled(linux.IUTF8) { _, size = utf8.DecodeRune(b) } return size } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devpts/master.go000066400000000000000000000174231465435605700254000ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // masterInode is the inode for the master end of the Terminal. // // +stateify savable type masterInode struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches locks vfs.FileLocks // root is the devpts root inode. 
root *rootInode } var _ kernfs.Inode = (*masterInode)(nil) // Open implements kernfs.Inode.Open. func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { t, err := mi.root.allocateTerminal(ctx, rp.Credentials()) if err != nil { return nil, err } fd := &masterFileDescription{ inode: mi, t: t, } fd.LockFD.Init(&mi.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil } // Stat implements kernfs.Inode.Stat. func (mi *masterInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { statx, err := mi.InodeAttrs.Stat(ctx, vfsfs, opts) if err != nil { return linux.Statx{}, err } statx.Blksize = 1024 statx.RdevMajor = linux.TTYAUX_MAJOR statx.RdevMinor = linux.PTMX_MINOR return statx, nil } // SetStat implements kernfs.Inode.SetStat func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask&linux.STATX_SIZE != 0 { return linuxerr.EINVAL } return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) } // +stateify savable type masterFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD inode *masterInode t *Terminal } var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil) // Release implements vfs.FileDescriptionImpl.Release. func (mfd *masterFileDescription) Release(ctx context.Context) { mfd.inode.root.masterClose(ctx, mfd.t) } // EventRegister implements waiter.Waitable.EventRegister. func (mfd *masterFileDescription) EventRegister(e *waiter.Entry) error { mfd.t.ld.masterWaiter.EventRegister(e) return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (mfd *masterFileDescription) EventUnregister(e *waiter.Entry) { mfd.t.ld.masterWaiter.EventUnregister(e) } // Readiness implements waiter.Waitable.Readiness. func (mfd *masterFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return mfd.t.ld.masterReadiness() } // Epollable implements FileDescriptionImpl.Epollable. func (mfd *masterFileDescription) Epollable() bool { return true } // Read implements vfs.FileDescriptionImpl.Read. func (mfd *masterFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { return mfd.t.ld.outputQueueRead(ctx, dst) } // Write implements vfs.FileDescriptionImpl.Write. func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { return mfd.t.ld.inputQueueWrite(ctx, src) } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { t := kernel.TaskFromContext(ctx) if t == nil { // ioctl(2) may only be called from a task goroutine. return 0, linuxerr.ENOTTY } switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the output queue read buffer. return 0, mfd.t.ld.outputQueueReadSize(t, io, args) case linux.TCGETS: // N.B. TCGETS on the master actually returns the configuration // of the replica end. return mfd.t.ld.getTermios(t, args) case linux.TCSETS: // N.B. TCSETS on the master actually affects the configuration // of the replica end. 
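		// Illustrative example: a terminal emulator holding the master FD
		// can switch the session out of canonical mode with
		// ioctl(masterFD, TCSETS, &termios) where ICANON has been cleared;
		// setTermios stores the new flags in the shared line discipline and,
		// on that transition, moves any bytes waiting in the input queue's
		// wait buffer into the read buffer and wakes replica readers.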
return mfd.t.ld.setTermios(t, args) case linux.TCSETSW: // TODO(b/29356795): This should drain the output queue first. return mfd.t.ld.setTermios(t, args) case linux.TCSETSF: // TODO(b/29356795): This should drain the output queue and // clear the input queue first. return mfd.t.ld.setTermios(t, args) case linux.TIOCGPTN: nP := primitive.Uint32(mfd.t.n) _, err := nP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCSPTLCK: // TODO(b/29356795): Implement pty locking. For now just pretend we do. return 0, nil case linux.TIOCGWINSZ: return 0, mfd.t.ld.windowSize(t, args) case linux.TIOCSWINSZ: return 0, mfd.t.ld.setWindowSize(t, args) case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. steal := args[2].Int() == 1 return 0, t.ThreadGroup().SetControllingTTY(mfd.t.masterKTTY, steal, mfd.vfsfd.IsReadable()) case linux.TIOCNOTTY: // Release this process's controlling terminal. return 0, t.ThreadGroup().ReleaseControllingTTY(mfd.t.masterKTTY) case linux.TIOCGPGRP: // Get the foreground process group id. pgid, err := t.ThreadGroup().ForegroundProcessGroupID(mfd.t.masterKTTY) if err != nil { return 0, err } ret := primitive.Int32(pgid) _, err = ret.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCSPGRP: // Set the foreground process group id. var pgid primitive.Int32 if _, err := pgid.CopyIn(t, args[2].Pointer()); err != nil { return 0, err } return 0, t.ThreadGroup().SetForegroundProcessGroupID(mfd.t.masterKTTY, kernel.ProcessGroupID(pgid)) default: maybeEmitUnimplementedEvent(ctx, sysno, cmd) return 0, linuxerr.ENOTTY } } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem() return mfd.inode.SetStat(ctx, fs, creds, opts) } // Stat implements vfs.FileDescriptionImpl.Stat. func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem() return mfd.inode.Stat(ctx, fs, opts) } // maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid. func maybeEmitUnimplementedEvent(ctx context.Context, sysno uintptr, cmd uint32) { switch cmd { case linux.TCGETS, linux.TCSETS, linux.TCSETSW, linux.TCSETSF, linux.TIOCGWINSZ, linux.TIOCSWINSZ, linux.TIOCSETD, linux.TIOCSBRK, linux.TIOCCBRK, linux.TCSBRK, linux.TCSBRKP, linux.TIOCSTI, linux.TIOCCONS, linux.FIONBIO, linux.TIOCEXCL, linux.TIOCNXCL, linux.TIOCGEXCL, linux.TIOCGSID, linux.TIOCGETD, linux.TIOCVHANGUP, linux.TIOCGDEV, linux.TIOCMGET, linux.TIOCMSET, linux.TIOCMBIC, linux.TIOCMBIS, linux.TIOCGICOUNT, linux.TCFLSH, linux.TIOCSSERIAL, linux.TIOCGPTPEER: unimpl.EmitUnimplementedEvent(ctx, sysno) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devpts/queue.go000066400000000000000000000157721465435605700252360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // waitBufMaxBytes is the maximum size of a wait buffer. It is based on // TTYB_DEFAULT_MEM_LIMIT. const waitBufMaxBytes = 131072 // queue represents one of the input or output queues between a pty master and // replica. Bytes written to a queue are added to the read buffer until it is // full, at which point they are written to the wait buffer. Bytes are // processed (i.e. undergo termios transformations) as they are added to the // read buffer. The read buffer is readable when its length is nonzero and // readable is true, or when its length is zero and readable is true (EOF). // // +stateify savable type queue struct { // mu protects everything in queue. mu sync.Mutex `state:"nosave"` // readBuf is buffer of data ready to be read when readable is true. // This data has been processed. readBuf []byte // waitBuf contains data that can't fit into readBuf. It is put here // until it can be loaded into the read buffer. waitBuf contains data // that hasn't been processed. waitBuf [][]byte waitBufLen uint64 // readable indicates whether the read buffer can be read from. In // canonical mode, there can be an unterminated line in the read buffer, // so readable must be checked. readable bool // transform is the queue's function for transforming bytes // entering the queue. For example, transform might convert all '\r's // entering the queue to '\n's. transformer } // readReadiness returns whether q is ready to be read from. func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask { q.mu.Lock() defer q.mu.Unlock() if len(q.readBuf) > 0 && q.readable { return waiter.ReadableEvents } return waiter.EventMask(0) } // writeReadiness returns whether q is ready to be written to. func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { q.mu.Lock() defer q.mu.Unlock() if q.waitBufLen < waitBufMaxBytes { return waiter.WritableEvents } return waiter.EventMask(0) } // readableSize writes the number of readable bytes to userspace. func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { q.mu.Lock() defer q.mu.Unlock() size := primitive.Int32(0) if q.readable { size = primitive.Int32(len(q.readBuf)) } _, err := size.CopyOut(t, args[2].Pointer()) return err } // read reads from q to userspace. It returns: // - The number of bytes read // - Whether the read caused more readable data to become available (whether // data was pushed from the wait buffer to the read buffer). // - Whether any data was echoed back (need to notify readers). // // Preconditions: l.termiosMu must be held for reading. 
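// A simplified caller sketch (modeled on lineDiscipline.inputQueueRead above;
// real callers also distinguish readable/writable notifications on the master):
//
//	l.termiosMu.RLock()
//	n, pushed, notifyEcho, err := q.read(ctx, dst, l)
//	l.termiosMu.RUnlock()
//	if notifyEcho {
//		l.masterWaiter.Notify(waiter.ReadableEvents)
//	}
//	if pushed {
//		l.replicaWaiter.Notify(waiter.ReadableEvents)
//	}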
func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, bool, error) { q.mu.Lock() defer q.mu.Unlock() if !q.readable { if l.numReplicas == 0 { return 0, false, false, linuxerr.EIO } return 0, false, false, linuxerr.ErrWouldBlock } if dst.NumBytes() > canonMaxBytes { dst = dst.TakeFirst(canonMaxBytes) } n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dst safemem.BlockSeq) (uint64, error) { src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(q.readBuf)) n, err := safemem.CopySeq(dst, src) if err != nil { return 0, err } q.readBuf = q.readBuf[n:] // If we read everything, this queue is no longer readable. if len(q.readBuf) == 0 { q.readable = false } return n, nil })) if err != nil { return 0, false, false, err } // Move data from the queue's wait buffer to its read buffer. nPushed, notifyEcho := q.pushWaitBufLocked(l) return int64(n), nPushed > 0, notifyEcho, nil } // write writes to q from userspace. // The returned boolean indicates whether any data was echoed back. // // Preconditions: l.termiosMu must be held for reading. func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { q.mu.Lock() defer q.mu.Unlock() // Copy data into the wait buffer. n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(src safemem.BlockSeq) (uint64, error) { copyLen := src.NumBytes() room := waitBufMaxBytes - q.waitBufLen // If out of room, return EAGAIN. if room == 0 && copyLen > 0 { return 0, linuxerr.ErrWouldBlock } // Cap the size of the wait buffer. if copyLen > room { copyLen = room src = src.TakeFirst64(room) } buf := make([]byte, copyLen) // Copy the data into the wait buffer. dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)) n, err := safemem.CopySeq(dst, src) if err != nil { return 0, err } q.waitBufAppend(buf) return n, nil })) if err != nil { return 0, false, err } // Push data from the wait to the read buffer. _, notifyEcho := q.pushWaitBufLocked(l) return n, notifyEcho, nil } // writeBytes writes to q from b. // The returned boolean indicates whether any data was echoed back. // // Preconditions: l.termiosMu must be held for reading. func (q *queue) writeBytes(b []byte, l *lineDiscipline) bool { q.mu.Lock() defer q.mu.Unlock() // Write to the wait buffer. q.waitBufAppend(b) _, notifyEcho := q.pushWaitBufLocked(l) return notifyEcho } // pushWaitBufLocked fills the queue's read buffer with data from the wait // buffer. // The returned boolean indicates whether any data was echoed back. // // Preconditions: // - l.termiosMu must be held for reading. // - q.mu must be locked. func (q *queue) pushWaitBufLocked(l *lineDiscipline) (int, bool) { if q.waitBufLen == 0 { return 0, false } // Move data from the wait to the read buffer. var total int var i int var notifyEcho bool for i = 0; i < len(q.waitBuf); i++ { n, echo := q.transform(l, q, q.waitBuf[i]) total += n notifyEcho = notifyEcho || echo if n != len(q.waitBuf[i]) { // The read buffer filled up without consuming the // entire buffer. q.waitBuf[i] = q.waitBuf[i][n:] break } } // Update wait buffer based on consumed data. q.waitBuf = q.waitBuf[i:] q.waitBufLen -= uint64(total) return total, notifyEcho } // Precondition: q.mu must be locked. func (q *queue) waitBufAppend(b []byte) { q.waitBuf = append(q.waitBuf, b) q.waitBufLen += uint64(len(b)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devpts/replica.go000066400000000000000000000166351465435605700255300ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // replicaInode is the inode for the replica end of the Terminal. // // +stateify savable type replicaInode struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches locks vfs.FileLocks // root is the devpts root inode. root *rootInode // t is the connected Terminal. t *Terminal } var _ kernfs.Inode = (*replicaInode)(nil) // Open implements kernfs.Inode.Open. func (ri *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { t := kernel.TaskFromContext(ctx) if t == nil { panic("open must be called from a task goroutine") } fd := &replicaFileDescription{ inode: ri, } fd.LockFD.Init(&ri.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } if opts.Flags&linux.O_NOCTTY == 0 { // Opening a replica sets the process' controlling TTY when // possible. An error indicates it cannot be set, and is // ignored silently. _ = t.ThreadGroup().SetControllingTTY(fd.inode.t.replicaKTTY, false /* steal */, fd.vfsfd.IsReadable()) } ri.t.ld.replicaOpen() return &fd.vfsfd, nil } // Valid implements kernfs.Inode.Valid. func (ri *replicaInode) Valid(context.Context, *kernfs.Dentry, string) bool { // Return valid if the replica still exists. ri.root.mu.Lock() defer ri.root.mu.Unlock() _, ok := ri.root.replicas[ri.t.n] return ok } // Stat implements kernfs.Inode.Stat. func (ri *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { statx, err := ri.InodeAttrs.Stat(ctx, vfsfs, opts) if err != nil { return linux.Statx{}, err } statx.Blksize = 1024 statx.RdevMajor = linux.UNIX98_PTY_REPLICA_MAJOR statx.RdevMinor = ri.t.n return statx, nil } // SetStat implements kernfs.Inode.SetStat func (ri *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask&linux.STATX_SIZE != 0 { return linuxerr.EINVAL } return ri.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) } // +stateify savable type replicaFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD inode *replicaInode } var _ vfs.FileDescriptionImpl = (*replicaFileDescription)(nil) // Release implements fs.FileOperations.Release. func (rfd *replicaFileDescription) Release(ctx context.Context) { rfd.inode.t.ld.replicaClose() } // EventRegister implements waiter.Waitable.EventRegister. 
func (rfd *replicaFileDescription) EventRegister(e *waiter.Entry) error { rfd.inode.t.ld.replicaWaiter.EventRegister(e) return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (rfd *replicaFileDescription) EventUnregister(e *waiter.Entry) { rfd.inode.t.ld.replicaWaiter.EventUnregister(e) } // Readiness implements waiter.Waitable.Readiness. func (rfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return rfd.inode.t.ld.replicaReadiness() } // Epollable implements FileDescriptionImpl.Epollable. func (rfd *replicaFileDescription) Epollable() bool { return true } // Read implements vfs.FileDescriptionImpl.Read. func (rfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { return rfd.inode.t.ld.inputQueueRead(ctx, dst) } // Write implements vfs.FileDescriptionImpl.Write. func (rfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { return rfd.inode.t.ld.outputQueueWrite(ctx, src) } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { t := kernel.TaskFromContext(ctx) if t == nil { // ioctl(2) may only be called from a task goroutine. return 0, linuxerr.ENOTTY } switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the input queue read buffer. return 0, rfd.inode.t.ld.inputQueueReadSize(t, io, args) case linux.TCGETS: return rfd.inode.t.ld.getTermios(t, args) case linux.TCSETS: return rfd.inode.t.ld.setTermios(t, args) case linux.TCSETSW: // TODO(b/29356795): This should drain the output queue first. return rfd.inode.t.ld.setTermios(t, args) case linux.TCSETSF: // TODO(b/29356795): This should drain the output queue and // clear the input queue first. return rfd.inode.t.ld.setTermios(t, args) case linux.TIOCGPTN: nP := primitive.Uint32(rfd.inode.t.n) _, err := nP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCGWINSZ: return 0, rfd.inode.t.ld.windowSize(t, args) case linux.TIOCSWINSZ: return 0, rfd.inode.t.ld.setWindowSize(t, args) case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. steal := args[2].Int() == 1 return 0, t.ThreadGroup().SetControllingTTY(rfd.inode.t.replicaKTTY, steal, rfd.vfsfd.IsReadable()) case linux.TIOCNOTTY: // Release this process's controlling terminal. return 0, t.ThreadGroup().ReleaseControllingTTY(rfd.inode.t.replicaKTTY) case linux.TIOCGPGRP: // Get the foreground process group id. pgid, err := t.ThreadGroup().ForegroundProcessGroupID(rfd.inode.t.replicaKTTY) if err != nil { return 0, err } ret := primitive.Int32(pgid) _, err = ret.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCSPGRP: // Set the foreground process group id. var pgid primitive.Int32 if _, err := pgid.CopyIn(t, args[2].Pointer()); err != nil { return 0, err } return 0, t.ThreadGroup().SetForegroundProcessGroupID(rfd.inode.t.replicaKTTY, kernel.ProcessGroupID(pgid)) default: maybeEmitUnimplementedEvent(ctx, sysno, cmd) return 0, linuxerr.ENOTTY } } // SetStat implements vfs.FileDescriptionImpl.SetStat. 
func (rfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem() return rfd.inode.SetStat(ctx, fs, creds, opts) } // Stat implements vfs.FileDescriptionImpl.Stat. func (rfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem() return rfd.inode.Stat(ctx, fs, opts) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devpts/root_inode_refs.go000066400000000000000000000101771465435605700272640ustar00rootroot00000000000000package devpts import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const rootInodeenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var rootInodeobj *rootInode // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type rootInodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *rootInodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *rootInodeRefs) RefType() string { return fmt.Sprintf("%T", rootInodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *rootInodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *rootInodeRefs) LogRefs() bool { return rootInodeenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *rootInodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *rootInodeRefs) IncRef() { v := r.refCount.Add(1) if rootInodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. 
This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *rootInodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if rootInodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *rootInodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if rootInodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *rootInodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devpts/terminal.go000066400000000000000000000026221465435605700257130ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // Terminal is a pseudoterminal. // // +stateify savable type Terminal struct { // n is the terminal index. It is immutable. n uint32 // ld is the line discipline of the terminal. It is immutable. ld *lineDiscipline // masterKTTY contains the controlling process of the master end of // this terminal. This field is immutable. masterKTTY *kernel.TTY // replicaKTTY contains the controlling process of the replica end of this // terminal. This field is immutable. replicaKTTY *kernel.TTY } func newTerminal(n uint32) *Terminal { t := &Terminal{ n: n, masterKTTY: &kernel.TTY{Index: n}, replicaKTTY: &kernel.TTY{Index: n}, } t.ld = newLineDiscipline(linux.DefaultReplicaTermios, t) return t } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devtmpfs/000077500000000000000000000000001465435605700240725ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go000066400000000000000000000043221465435605700262520ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package devtmpfs provides a singleton fsimpl/dev filesystem instance, // analogous to Linux's devtmpfs. package devtmpfs import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/dev" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // Name is the default filesystem name. const Name = "devtmpfs" // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct { initOnce sync.Once `state:"nosave"` initErr error // fs is the tmpfs filesystem that backs all mounts of this FilesystemType. // root is fs' root. fs and root are immutable. fs *vfs.Filesystem root *vfs.Dentry } // Name implements vfs.FilesystemType.Name. func (*FilesystemType) Name() string { return Name } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fst.initOnce.Do(func() { fs, root, err := dev.FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, source, opts) if err != nil { fst.initErr = err return } fst.fs = fs fst.root = root }) if fst.initErr != nil { return nil, nil, fst.initErr } fst.fs.IncRef() fst.root.IncRef() return fst.fs, fst.root, nil } // Release implements vfs.FilesystemType.Release. func (fst *FilesystemType) Release(ctx context.Context) { if fst.fs != nil { // Release the original reference obtained when creating the filesystem. fst.root.DecRef(ctx) fst.fs.DecRef(ctx) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devtmpfs/devtmpfs_state_autogen.go000066400000000000000000000016511465435605700311760ustar00rootroot00000000000000// automatically generated by stateify. package devtmpfs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (fst *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/devtmpfs.FilesystemType" } func (fst *FilesystemType) StateFields() []string { return []string{ "initErr", "fs", "root", } } func (fst *FilesystemType) beforeSave() {} // +checklocksignore func (fst *FilesystemType) StateSave(stateSinkObject state.Sink) { fst.beforeSave() stateSinkObject.Save(0, &fst.initErr) stateSinkObject.Save(1, &fst.fs) stateSinkObject.Save(2, &fst.root) } // +checklocksignore func (fst *FilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fst.initErr) stateSourceObject.Load(1, &fst.fs) stateSourceObject.Load(2, &fst.root) stateSourceObject.AfterLoad(func() { fst.afterLoad(ctx) }) } func init() { state.Register((*FilesystemType)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/devtmpfs/save_restore.go000066400000000000000000000015011465435605700271170ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package devtmpfs import "context" // afterLoad is invoked by stateify. func (fst *FilesystemType) afterLoad(context.Context) { if fst.fs != nil { // Ensure that we don't create another filesystem. fst.initOnce.Do(func() {}) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/erofs/000077500000000000000000000000001465435605700233605ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/erofs/dentry_refs.go000066400000000000000000000101101465435605700262240ustar00rootroot00000000000000package erofs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const dentryenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var dentryobj *dentry // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type dentryRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *dentryRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *dentryRefs) RefType() string { return fmt.Sprintf("%T", dentryobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *dentryRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *dentryRefs) LogRefs() bool { return dentryenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *dentryRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. 
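// (Recall the layout documented on refCount above: a raw value of (1<<32)+2,
// for instance, encodes one in-flight speculative reference from a concurrent
// TryIncRef plus two real references in the low 32 bits.)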
// //go:nosplit func (r *dentryRefs) IncRef() { v := r.refCount.Add(1) if dentryenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *dentryRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if dentryenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *dentryRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if dentryenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *dentryRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/erofs/directory.go000066400000000000000000000075021465435605700257170ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package erofs import ( "sort" "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/vfs" ) func (i *inode) getDirents() ([]vfs.Dirent, error) { // Fast path. i.dirMu.RLock() dirents := i.dirents i.dirMu.RUnlock() if dirents != nil { return dirents, nil } // Slow path. i.dirMu.Lock() defer i.dirMu.Unlock() off := int64(1) if err := i.IterDirents(func(name string, typ uint8, nid uint64) error { dirents = append(dirents, vfs.Dirent{ Name: name, Type: linux.FileTypeToDirentType(typ), Ino: nid, NextOff: off, }) off++ return nil }); err != nil { return nil, err } // "." and ".." should always be present. if len(dirents) < 2 { return nil, linuxerr.EUCLEAN } i.dirents = dirents return dirents, nil } func (i *inode) lookup(name string) (uint64, error) { var dirents []vfs.Dirent // Lazily fetch dirents. if i.dirMu.TryRLock() { dirents = i.dirents // +checklocksforce: TryRLock. i.dirMu.RUnlock() // +checklocksforce: TryRLock. } if dirents == nil { // The dirents cache is not available immediately, let's do // binary search on disk data directly. 
return i.Lookup(name) } // The dirents are sorted in alphabetical order. We do binary search // to find the target. idx := sort.Search(len(dirents), func(i int) bool { return dirents[i].Name >= name }) if idx >= len(dirents) || dirents[idx].Name != name { return 0, linuxerr.ENOENT } return dirents[idx].Ino, nil } func (d *dentry) lookup(ctx context.Context, name string) (*dentry, error) { // Fast path, dentry already exists. d.dirMu.RLock() child, ok := d.childMap[name] d.dirMu.RUnlock() if ok { return child, nil } // Slow path, create a new dentry. d.dirMu.Lock() defer d.dirMu.Unlock() if child, ok := d.childMap[name]; ok { return child, nil } nid, err := d.inode.lookup(name) if err != nil { return nil, err } if d.childMap == nil { d.childMap = make(map[string]*dentry) } child, err = d.inode.fs.newDentry(nid) if err != nil { return nil, err } child.parent.Store(d) child.name = name d.childMap[name] = child return child, nil } // +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl // mu protects off. mu sync.Mutex `state:"nosave"` // +checklocks:mu off int64 } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { d := fd.dentry() dirents, err := d.inode.getDirents() if err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) fd.mu.Lock() defer fd.mu.Unlock() for fd.off < int64(len(dirents)) { if err := cb.Handle(dirents[fd.off]); err != nil { return err } fd.off++ } return nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() switch whence { case linux.SEEK_SET: // use offset as specified case linux.SEEK_CUR: offset += fd.off default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } fd.off = offset return offset, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/erofs/erofs.go000066400000000000000000000363201465435605700250310ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package erofs implements erofs. package erofs import ( "os" "runtime" "strconv" "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/erofs" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Name is the filesystem name. It is part of the interface used by users, // e.g. via annotations, and shouldn't change. const Name = "erofs" // Mount option names for EROFS. const ( moptImageFD = "ifd" ) // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. // // +stateify savable type filesystem struct { vfsfs vfs.Filesystem // Immutable options. 
mopts string iopts InternalFilesystemOptions // devMinor is the filesystem's minor device number. devMinor is immutable. devMinor uint32 // root is the root dentry. root is immutable. root *dentry // image is the EROFS image. image is immutable. image *erofs.Image // mf implements memmap.File for this image. mf imageMemmapFile // inodeBuckets contains the inodes in use. Multiple buckets are used to // reduce the lock contention. Bucket is chosen based on the hash calculation // on nid in filesystem.inodeBucket. inodeBuckets []inodeBucket } // InternalFilesystemOptions may be passed as // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. // // +stateify savable type InternalFilesystemOptions struct { // If UniqueID is non-empty, it is an opaque string used to reassociate the // filesystem with a new image FD during restoration from checkpoint. UniqueID vfs.RestoreID } // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // Release implements vfs.FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mopts := vfs.GenericParseMountOptions(opts.Data) var cu cleanup.Cleanup defer cu.Clean() fd, err := getFDFromMountOptionsMap(ctx, mopts) if err != nil { return nil, nil, err } f := os.NewFile(uintptr(fd), "EROFS image file") image, err := erofs.OpenImage(f) if err != nil { f.Close() return nil, nil, err } cu.Add(func() { image.Close() }) iopts, ok := opts.InternalData.(InternalFilesystemOptions) if opts.InternalData != nil && !ok { ctx.Warningf("erofs.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted erofs.InternalFilesystemOptions", opts.InternalData) return nil, nil, linuxerr.EINVAL } devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } fs := &filesystem{ mopts: opts.Data, iopts: iopts, image: image, devMinor: devMinor, mf: imageMemmapFile{image: image}, } fs.vfsfs.Init(vfsObj, &fstype, fs) cu.Add(func() { fs.vfsfs.DecRef(ctx) }) fs.inodeBuckets = make([]inodeBucket, runtime.GOMAXPROCS(0)) for i := range fs.inodeBuckets { fs.inodeBuckets[i].init() } root, err := fs.newDentry(image.RootNid()) if err != nil { return nil, nil, err } // Increase the root's reference count to 2. One reference is returned to // the caller, and the other is held by fs. root.IncRef() fs.root = root cu.Release() return &fs.vfsfs, &root.vfsd, nil } func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) { ifdstr, ok := mopts[moptImageFD] if !ok { ctx.Warningf("erofs.getFDFromMountOptionsMap: image FD must be specified as '%s='", moptImageFD) return -1, linuxerr.EINVAL } delete(mopts, moptImageFD) ifd, err := strconv.Atoi(ifdstr) if err != nil { ctx.Warningf("erofs.getFDFromMountOptionsMap: invalid image FD: %s=%s", moptImageFD, ifdstr) return -1, linuxerr.EINVAL } return ifd, nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { // An extra reference was held by the filesystem on the root. 
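	// (This is the second reference taken on the root in GetFilesystem and
	// held by fs for the filesystem's lifetime. fs.root can still be nil here
	// if GetFilesystem failed before the root dentry was created, since its
	// cleanup path releases the filesystem reference and ends up in Release.)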
if fs.root != nil { fs.root.DecRef(ctx) } fs.image.Close() fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } func (fs *filesystem) statFS() linux.Statfs { blockSize := int64(fs.image.BlockSize()) return linux.Statfs{ Type: erofs.SuperBlockMagicV1, NameLength: erofs.MaxNameLen, BlockSize: blockSize, FragmentSize: blockSize, Blocks: uint64(fs.image.Blocks()), } } // +stateify savable type inodeBucket struct { // mu protects inodeMap. mu sync.RWMutex `state:"nosave"` // inodeMap contains the inodes indexed by nid. // +checklocks:mu inodeMap map[uint64]*inode } func (ib *inodeBucket) init() { ib.inodeMap = make(map[uint64]*inode) // +checklocksignore } // getInode returns the inode identified by nid. A reference on inode is also // returned to caller. func (ib *inodeBucket) getInode(nid uint64) *inode { ib.mu.RLock() defer ib.mu.RUnlock() i := ib.inodeMap[nid] if i != nil { i.IncRef() } return i } // addInode adds the inode identified by nid into the bucket. It will first check // whether the old inode exists. If not, it will call newInode() to get the new inode. // The inode eventually saved in the bucket will be returned with a reference for caller. func (ib *inodeBucket) addInode(nid uint64, newInode func() *inode) *inode { ib.mu.Lock() defer ib.mu.Unlock() if i, ok := ib.inodeMap[nid]; ok { i.IncRef() return i } i := newInode() ib.inodeMap[nid] = i return i } // removeInode removes the inode identified by nid. func (ib *inodeBucket) removeInode(nid uint64) { ib.mu.Lock() delete(ib.inodeMap, nid) ib.mu.Unlock() } func (fs *filesystem) inodeBucket(nid uint64) *inodeBucket { bucket := nid % uint64(len(fs.inodeBuckets)) return &fs.inodeBuckets[bucket] } // inode represents a filesystem object. // // Each dentry holds a reference on the inode it represents. An inode will // be dropped once its reference count reaches zero. We do not cache inodes // directly. The caching policy is implemented on top of dentries. // // +stateify savable type inode struct { erofs.Inode // inodeRefs is the reference count. inodeRefs // fs is the owning filesystem. fs *filesystem // dirMu protects dirents. dirents is immutable after creation. dirMu sync.RWMutex `state:"nosave"` // +checklocks:dirMu dirents []vfs.Dirent `state:"nosave"` // mapsMu protects mappings. mapsMu sync.Mutex `state:"nosave"` // mappings tracks the mappings of the file into memmap.MappingSpaces // if this inode represents a regular file. // +checklocks:mapsMu mappings memmap.MappingSet // locks supports POSIX and BSD style locks. locks vfs.FileLocks // Inotify watches for this inode. watches vfs.Watches } // getInode returns the inode identified by nid. A reference on inode is also // returned to caller. func (fs *filesystem) getInode(nid uint64) (*inode, error) { bucket := fs.inodeBucket(nid) // Fast path, inode already exists. if i := bucket.getInode(nid); i != nil { return i, nil } // Slow path, create a new inode. // // Construct the underlying inode object from the image without taking // the bucket lock first to reduce the contention. ino, err := fs.image.Inode(nid) if err != nil { return nil, err } return bucket.addInode(nid, func() *inode { i := &inode{ Inode: ino, fs: fs, } i.InitRefs() return i }), nil } // DecRef should be called when you're finished with an inode. 
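//
// The sketch below is editorial and hedged: it is not part of the upstream
// API, it merely illustrates the getInode/DecRef pairing described above,
// using only functions that already exist in this package (fs.getInode,
// inode.statTo, inode.DecRef). The helper name statByNid is hypothetical.
//
//	func statByNid(ctx context.Context, fs *filesystem, nid uint64) (linux.Statx, error) {
//		i, err := fs.getInode(nid) // takes a reference on the inode
//		if err != nil {
//			return linux.Statx{}, err
//		}
//		defer i.DecRef(ctx) // drop the reference; may remove the inode from its bucket
//		var stat linux.Statx
//		i.statTo(&stat)
//		return stat, nil
//	}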
func (i *inode) DecRef(ctx context.Context) { i.inodeRefs.DecRef(func() { nid := i.Nid() i.fs.inodeBucket(nid).removeInode(nid) }) } func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(i.Mode()), auth.KUID(i.UID()), auth.KGID(i.GID())) } func (i *inode) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME stat.Blksize = i.fs.image.BlockSize() stat.Nlink = i.Nlink() stat.UID = i.UID() stat.GID = i.GID() stat.Mode = i.Mode() stat.Ino = i.Nid() stat.Size = i.Size() stat.Blocks = (stat.Size + 511) / 512 stat.Mtime = linux.StatxTimestamp{ Sec: int64(i.Mtime()), Nsec: i.MtimeNsec(), } stat.Atime = stat.Mtime stat.Ctime = stat.Mtime stat.DevMajor = linux.UNNAMED_MAJOR stat.DevMinor = i.fs.devMinor } func (i *inode) fileType() uint16 { return i.Mode() & linux.S_IFMT } // dentry implements vfs.DentryImpl. // // The filesystem is read-only and currently we never drop the cached dentries // until the filesystem is unmounted. The reference model works like this: // // - The initial reference count of each dentry is one, which is the reference // held by the parent (so when the reference count is one, it also means that // this is a cached dentry, i.e. not in use). // // - When a dentry is used (e.g. opened by someone), its reference count will // be increased and the new reference is held by caller. // // - The reference count of root dentry is two. One reference is returned to // the caller of `GetFilesystem()`, and the other is held by `fs`. // // TODO: This can lead to unbounded memory growth in sentry due to the ever-growing // dentry tree. We should have a dentry LRU cache, similar to what fsimpl/gofer does. // // +stateify savable type dentry struct { vfsd vfs.Dentry // dentryRefs is the reference count. dentryRefs // parent is this dentry's parent directory. If this dentry is // a file system root, parent is nil. parent atomic.Pointer[dentry] `state:".(*dentry)"` // name is this dentry's name in its parent. If this dentry is // a file system root, name is the empty string. name string // inode is the inode represented by this dentry. inode *inode // dirMu serializes changes to the dentry tree. dirMu sync.RWMutex `state:"nosave"` // childMap contains the mappings of child names to dentries if this // dentry represents a directory. // +checklocks:dirMu childMap map[string]*dentry } // The caller is expected to handle dentry insertion into dentry tree. func (fs *filesystem) newDentry(nid uint64) (*dentry, error) { i, err := fs.getInode(nid) if err != nil { return nil, err } d := &dentry{ inode: i, } d.InitRefs() d.vfsd.Init(d) return d, nil } // DecRef implements vfs.DentryImpl.DecRef. func (d *dentry) DecRef(ctx context.Context) { d.dentryRefs.DecRef(func() { d.dirMu.Lock() for _, c := range d.childMap { c.DecRef(ctx) } d.childMap = nil d.dirMu.Unlock() d.inode.DecRef(ctx) }) } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { if d.inode.IsDir() { events |= linux.IN_ISDIR } // The ordering below is important, Linux always notifies the parent first. 
if parent := d.parent.Load(); parent != nil { parent.inode.watches.Notify(ctx, d.name, events, cookie, et, false) } d.inode.watches.Notify(ctx, "", events, cookie, et, false) } // Watches implements vfs.DentryImpl.Watches. func (d *dentry) Watches() *vfs.Watches { return &d.inode.watches } // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. func (d *dentry) OnZeroWatches(ctx context.Context) {} func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } switch d.inode.fileType() { case linux.S_IFREG: if ats&vfs.MayWrite != 0 { return nil, linuxerr.EROFS } var fd regularFileFD fd.LockFD.Init(&d.inode.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { return nil, err } return &fd.vfsfd, nil case linux.S_IFDIR: // Can't open directories with O_CREAT. if opts.Flags&linux.O_CREAT != 0 { return nil, linuxerr.EISDIR } // Can't open directories writably. if ats&vfs.MayWrite != 0 { return nil, linuxerr.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL } var fd directoryFD fd.LockFD.Init(&d.inode.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { return nil, err } return &fd.vfsfd, nil case linux.S_IFLNK: // Can't open symlinks without O_PATH, which is handled at the VFS layer. return nil, linuxerr.ELOOP default: return nil, linuxerr.ENXIO } } // +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD lockLogging sync.Once `state:"nosave"` } func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } func (fd *fileDescription) dentry() *dentry { return fd.vfsfd.Dentry().Impl().(*dentry) } func (fd *fileDescription) inode() *inode { return fd.dentry().inode } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx fd.inode().statTo(&stat) return stat, nil } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { return linuxerr.EROFS } // StatFS implements vfs.FileDescriptionImpl.StatFS. func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { return fd.filesystem().statFS(), nil } // ListXattr implements vfs.FileDescriptionImpl.ListXattr. func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { return nil, linuxerr.ENOTSUP } // GetXattr implements vfs.FileDescriptionImpl.GetXattr. func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { return "", linuxerr.ENOTSUP } // SetXattr implements vfs.FileDescriptionImpl.SetXattr. func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { return linuxerr.EROFS } // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { return linuxerr.EROFS } // Sync implements vfs.FileDescriptionImpl.Sync. func (*fileDescription) Sync(context.Context) error { return nil } // Release implements vfs.FileDescriptionImpl.Release. 
func (*fileDescription) Release(ctx context.Context) {} golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/erofs/erofs_state_autogen.go000066400000000000000000000230211465435605700277450ustar00rootroot00000000000000// automatically generated by stateify. package erofs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *dentryRefs) StateTypeName() string { return "pkg/sentry/fsimpl/erofs.dentryRefs" } func (r *dentryRefs) StateFields() []string { return []string{ "refCount", } } func (r *dentryRefs) beforeSave() {} // +checklocksignore func (r *dentryRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *dentryRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (fd *directoryFD) StateTypeName() string { return "pkg/sentry/fsimpl/erofs.directoryFD" } func (fd *directoryFD) StateFields() []string { return []string{ "fileDescription", "DirectoryFileDescriptionDefaultImpl", "off", } } func (fd *directoryFD) beforeSave() {} // +checklocksignore func (fd *directoryFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.fileDescription) stateSinkObject.Save(1, &fd.DirectoryFileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.off) } func (fd *directoryFD) afterLoad(context.Context) {} // +checklocksignore func (fd *directoryFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.fileDescription) stateSourceObject.Load(1, &fd.DirectoryFileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.off) } func (fstype *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/erofs.FilesystemType" } func (fstype *FilesystemType) StateFields() []string { return []string{} } func (fstype *FilesystemType) beforeSave() {} // +checklocksignore func (fstype *FilesystemType) StateSave(stateSinkObject state.Sink) { fstype.beforeSave() } func (fstype *FilesystemType) afterLoad(context.Context) {} // +checklocksignore func (fstype *FilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/erofs.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "vfsfs", "mopts", "iopts", "devMinor", "root", "image", "mf", "inodeBuckets", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.vfsfs) stateSinkObject.Save(1, &fs.mopts) stateSinkObject.Save(2, &fs.iopts) stateSinkObject.Save(3, &fs.devMinor) stateSinkObject.Save(4, &fs.root) stateSinkObject.Save(5, &fs.image) stateSinkObject.Save(6, &fs.mf) stateSinkObject.Save(7, &fs.inodeBuckets) } // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.vfsfs) stateSourceObject.Load(1, &fs.mopts) stateSourceObject.Load(2, &fs.iopts) stateSourceObject.Load(3, &fs.devMinor) stateSourceObject.Load(4, &fs.root) stateSourceObject.Load(5, &fs.image) stateSourceObject.Load(6, &fs.mf) stateSourceObject.Load(7, &fs.inodeBuckets) stateSourceObject.AfterLoad(func() { fs.afterLoad(ctx) }) } func (i *InternalFilesystemOptions) StateTypeName() string { return "pkg/sentry/fsimpl/erofs.InternalFilesystemOptions" } func (i *InternalFilesystemOptions) StateFields() []string { return []string{ 
"UniqueID", } } func (i *InternalFilesystemOptions) beforeSave() {} // +checklocksignore func (i *InternalFilesystemOptions) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.UniqueID) } func (i *InternalFilesystemOptions) afterLoad(context.Context) {} // +checklocksignore func (i *InternalFilesystemOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.UniqueID) } func (ib *inodeBucket) StateTypeName() string { return "pkg/sentry/fsimpl/erofs.inodeBucket" } func (ib *inodeBucket) StateFields() []string { return []string{ "inodeMap", } } func (ib *inodeBucket) beforeSave() {} // +checklocksignore func (ib *inodeBucket) StateSave(stateSinkObject state.Sink) { ib.beforeSave() stateSinkObject.Save(0, &ib.inodeMap) } func (ib *inodeBucket) afterLoad(context.Context) {} // +checklocksignore func (ib *inodeBucket) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ib.inodeMap) } func (i *inode) StateTypeName() string { return "pkg/sentry/fsimpl/erofs.inode" } func (i *inode) StateFields() []string { return []string{ "Inode", "inodeRefs", "fs", "mappings", "locks", "watches", } } func (i *inode) beforeSave() {} // +checklocksignore func (i *inode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Inode) stateSinkObject.Save(1, &i.inodeRefs) stateSinkObject.Save(2, &i.fs) stateSinkObject.Save(3, &i.mappings) stateSinkObject.Save(4, &i.locks) stateSinkObject.Save(5, &i.watches) } func (i *inode) afterLoad(context.Context) {} // +checklocksignore func (i *inode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Inode) stateSourceObject.Load(1, &i.inodeRefs) stateSourceObject.Load(2, &i.fs) stateSourceObject.Load(3, &i.mappings) stateSourceObject.Load(4, &i.locks) stateSourceObject.Load(5, &i.watches) } func (d *dentry) StateTypeName() string { return "pkg/sentry/fsimpl/erofs.dentry" } func (d *dentry) StateFields() []string { return []string{ "vfsd", "dentryRefs", "parent", "name", "inode", "childMap", } } func (d *dentry) beforeSave() {} // +checklocksignore func (d *dentry) StateSave(stateSinkObject state.Sink) { d.beforeSave() var parentValue *dentry parentValue = d.saveParent() stateSinkObject.SaveValue(2, parentValue) stateSinkObject.Save(0, &d.vfsd) stateSinkObject.Save(1, &d.dentryRefs) stateSinkObject.Save(3, &d.name) stateSinkObject.Save(4, &d.inode) stateSinkObject.Save(5, &d.childMap) } func (d *dentry) afterLoad(context.Context) {} // +checklocksignore func (d *dentry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.vfsd) stateSourceObject.Load(1, &d.dentryRefs) stateSourceObject.Load(3, &d.name) stateSourceObject.Load(4, &d.inode) stateSourceObject.Load(5, &d.childMap) stateSourceObject.LoadValue(2, new(*dentry), func(y any) { d.loadParent(ctx, y.(*dentry)) }) } func (fd *fileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/erofs.fileDescription" } func (fd *fileDescription) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "LockFD", } } func (fd *fileDescription) beforeSave() {} // +checklocksignore func (fd *fileDescription) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.LockFD) } func (fd *fileDescription) afterLoad(context.Context) {} // +checklocksignore func (fd 
*fileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.LockFD) } func (r *inodeRefs) StateTypeName() string { return "pkg/sentry/fsimpl/erofs.inodeRefs" } func (r *inodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *inodeRefs) beforeSave() {} // +checklocksignore func (r *inodeRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *inodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (fd *regularFileFD) StateTypeName() string { return "pkg/sentry/fsimpl/erofs.regularFileFD" } func (fd *regularFileFD) StateFields() []string { return []string{ "fileDescription", "off", } } func (fd *regularFileFD) beforeSave() {} // +checklocksignore func (fd *regularFileFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.fileDescription) stateSinkObject.Save(1, &fd.off) } func (fd *regularFileFD) afterLoad(context.Context) {} // +checklocksignore func (fd *regularFileFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.fileDescription) stateSourceObject.Load(1, &fd.off) } func (mf *imageMemmapFile) StateTypeName() string { return "pkg/sentry/fsimpl/erofs.imageMemmapFile" } func (mf *imageMemmapFile) StateFields() []string { return []string{ "NoBufferedIOFallback", "image", } } func (mf *imageMemmapFile) beforeSave() {} // +checklocksignore func (mf *imageMemmapFile) StateSave(stateSinkObject state.Sink) { mf.beforeSave() stateSinkObject.Save(0, &mf.NoBufferedIOFallback) stateSinkObject.Save(1, &mf.image) } func (mf *imageMemmapFile) afterLoad(context.Context) {} // +checklocksignore func (mf *imageMemmapFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mf.NoBufferedIOFallback) stateSourceObject.Load(1, &mf.image) } func init() { state.Register((*dentryRefs)(nil)) state.Register((*directoryFD)(nil)) state.Register((*FilesystemType)(nil)) state.Register((*filesystem)(nil)) state.Register((*InternalFilesystemOptions)(nil)) state.Register((*inodeBucket)(nil)) state.Register((*inode)(nil)) state.Register((*dentry)(nil)) state.Register((*fileDescription)(nil)) state.Register((*inodeRefs)(nil)) state.Register((*regularFileFD)(nil)) state.Register((*imageMemmapFile)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/erofs/filesystem.go000066400000000000000000000320021465435605700260700ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package erofs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/erofs" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // step resolves rp.Component() to an existing file, starting from the given directory. // // step is loosely analogous to fs/namei.c:walk_component(). // // Preconditions: // - !rp.Done(). func step(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, bool, error) { if !d.inode.IsDir() { return nil, false, linuxerr.ENOTDIR } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, false, err } name := rp.Component() if name == "." { rp.Advance() return d, false, nil } if name == ".." { parent := d.parent.Load() if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, false, err } else if isRoot || parent == nil { rp.Advance() return d, false, nil } if err := rp.CheckMount(ctx, &parent.vfsd); err != nil { return nil, false, err } rp.Advance() return parent, false, nil } if len(name) > erofs.MaxNameLen { return nil, false, linuxerr.ENAMETOOLONG } child, err := d.lookup(ctx, name) if err != nil { return nil, false, err } if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, false, err } if child.inode.IsSymlink() && rp.ShouldFollowSymlink() { target, err := child.inode.Readlink() if err != nil { return nil, false, err } followedSymlink, err := rp.HandleSymlink(target) return d, followedSymlink, err } rp.Advance() return child, false, nil } // walkParentDir resolves all but the last path component of rp to an existing // directory, starting from the given directory. It does not check that the // returned directory is searchable by the provider of rp. // // walkParentDir is loosely analogous to Linux's fs/namei.c:path_parentat(). // // Preconditions: // - !rp.Done(). func walkParentDir(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { for !rp.Final() { next, _, err := step(ctx, rp, d) if err != nil { return nil, err } d = next } if !d.inode.IsDir() { return nil, linuxerr.ENOTDIR } return d, nil } // resolve resolves rp to an existing file. // // resolve is loosely analogous to Linux's fs/namei.c:path_lookupat(). func resolve(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) { d := rp.Start().Impl().(*dentry) for !rp.Done() { next, _, err := step(ctx, rp, d) if err != nil { return nil, err } d = next } if rp.MustBeDir() && !d.inode.IsDir() { return nil, linuxerr.ENOTDIR } return d, nil } // doCreateAt checks that creating a file at rp is permitted. // // doCreateAt is loosely analogous to a conjunction of Linux's // fs/namei.c:filename_create() and done_path_create(). // // Preconditions: // - !rp.Done(). // - For the final path component in rp, !rp.ShouldFollowSymlink(). func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error { parentDir, err := walkParentDir(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } // Order of checks is important. First check if parent directory can be // executed, then check for existence, and lastly check if mount is writable. if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".."
{ return linuxerr.EEXIST } if len(name) > erofs.MaxNameLen { return linuxerr.ENAMETOOLONG } if _, err := parentDir.lookup(ctx, name); err == nil { return linuxerr.EEXIST } else if !linuxerr.Equals(linuxerr.ENOENT, err) { return err } if !dir && rp.MustBeDir() { return linuxerr.ENOENT } return linuxerr.EROFS } // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { return nil } // AccessAt implements vfs.FilesystemImpl.AccessAt. func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { d, err := resolve(ctx, rp) if err != nil { return err } if ats.MayWrite() { return linuxerr.EROFS } return d.inode.checkPermissions(creds, ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { d, err := resolve(ctx, rp) if err != nil { return nil, err } if opts.CheckSearchable { if !d.inode.IsDir() { return nil, linuxerr.ENOTDIR } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } d.IncRef() return &d.vfsd, nil } // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { dir, err := walkParentDir(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return nil, err } dir.IncRef() return &dir.vfsd, nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { return fs.doCreateAt(ctx, rp, false /* dir */) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { return fs.doCreateAt(ctx, rp, true /* dir */) } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { return fs.doCreateAt(ctx, rp, false /* dir */) } // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { if opts.Flags&linux.O_TMPFILE != 0 { return nil, linuxerr.EOPNOTSUPP } if opts.Flags&linux.O_CREAT == 0 { d, err := resolve(ctx, rp) if err != nil { return nil, err } return d.open(ctx, rp, &opts) } mustCreate := opts.Flags&linux.O_EXCL != 0 start := rp.Start().Impl().(*dentry) if rp.Done() { // Reject attempts to open mount root directory with O_CREAT. if rp.MustBeDir() { return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST } return start.open(ctx, rp, &opts) } afterTrailingSymlink: parentDir, err := walkParentDir(ctx, rp, start) if err != nil { return nil, err } // Check for search permission in the parent directory. if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { return nil, linuxerr.EISDIR } child, followedSymlink, err := step(ctx, rp, parentDir) if followedSymlink { if mustCreate { // EEXIST must be returned if an existing symlink is opened with O_EXCL. return nil, linuxerr.EEXIST } if err != nil { // If followedSymlink && err != nil, then this symlink resolution error // must be handled by the VFS layer. 
return nil, err } start = parentDir goto afterTrailingSymlink } if linuxerr.Equals(linuxerr.ENOENT, err) { return nil, linuxerr.EROFS } if err != nil { return nil, err } if mustCreate { return nil, linuxerr.EEXIST } if rp.MustBeDir() && !child.inode.IsDir() { return nil, linuxerr.ENOTDIR } return child.open(ctx, rp, &opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { d, err := resolve(ctx, rp) if err != nil { return "", err } return d.inode.Readlink() } // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { // Resolve newParent first to verify that it's on this Mount. newParentDir, err := walkParentDir(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } newName := rp.Component() if len(newName) > erofs.MaxNameLen { return linuxerr.ENAMETOOLONG } mnt := rp.Mount() if mnt != oldParentVD.Mount() { return linuxerr.EXDEV } if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } oldParentDir := oldParentVD.Dentry().Impl().(*dentry) if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } return linuxerr.EROFS } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { parentDir, err := walkParentDir(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." { return linuxerr.EINVAL } if name == ".." { return linuxerr.ENOTEMPTY } return linuxerr.EROFS } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { if _, err := resolve(ctx, rp); err != nil { return err } return linuxerr.EROFS } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { d, err := resolve(ctx, rp) if err != nil { return linux.Statx{}, err } var stat linux.Statx d.inode.statTo(&stat) return stat, nil } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { if _, err := resolve(ctx, rp); err != nil { return linux.Statfs{}, err } return fs.statFS(), nil } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { return fs.doCreateAt(ctx, rp, false /* dir */) } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { parentDir, err := walkParentDir(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".." { return linuxerr.EISDIR } return linuxerr.EROFS } // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. 
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { d, err := resolve(ctx, rp) if err != nil { return nil, err } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { if _, err := resolve(ctx, rp); err != nil { return nil, err } return nil, linuxerr.ENOTSUP } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { if _, err := resolve(ctx, rp); err != nil { return "", err } return "", linuxerr.ENOTSUP } // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { if _, err := resolve(ctx, rp); err != nil { return err } return linuxerr.EROFS } // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { if _, err := resolve(ctx, rp); err != nil { return err } return linuxerr.EROFS } // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return fs.mopts } // IsDescendant implements vfs.FilesystemImpl.IsDescendant. func (fs *filesystem) IsDescendant(vfsroot, vd vfs.VirtualDentry) bool { return genericIsDescendant(vfsroot.Dentry(), vd.Dentry().Impl().(*dentry)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/erofs/fstree.go000066400000000000000000000036701465435605700252050ustar00rootroot00000000000000package erofs import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // We need to define an interface instead of using atomic.Pointer because // the Dentry type gets removed during code generation and the compiler // complains about the unused sync/atomic type. type genericatomicptr interface { Load() *dentry } // IsAncestorDentry returns true if d is an ancestor of d2; that is, d is // either d2's parent or an ancestor of d2's parent. func genericIsAncestorDentry(d, d2 *dentry) bool { for d2 != nil { parent := d2.parent.Load() if parent == d { return true } if parent == d2 { return false } d2 = parent } return false } // IsDescendant returns true if vd is a descendant of vfsroot or if vd and // vfsroot are the same dentry. func genericIsDescendant(vfsroot *vfs.Dentry, d *dentry) bool { for d != nil && &d.vfsd != vfsroot { d = d.parent.Load() } return d != nil } // ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d. func genericParentOrSelf(d *dentry) *dentry { if parent := d.parent.Load(); parent != nil { return parent } return d } // PrependPath is a generic implementation of FilesystemImpl.PrependPath(). 
func genericPrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *dentry, b *fspath.Builder) error { for { if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { return vfs.PrependPathAtVFSRootError{} } if mnt != nil && &d.vfsd == mnt.Root() { return nil } parent := d.parent.Load() if parent == nil { return vfs.PrependPathAtNonMountRootError{} } b.PrependComponent(d.name) d = parent } } // DebugPathname returns a pathname to d relative to its filesystem root. // DebugPathname does not correspond to any Linux function; it's used to // generate dentry pathnames for debugging. func genericDebugPathname(d *dentry) string { var b fspath.Builder _ = genericPrependPath(vfs.VirtualDentry{}, nil, d, &b) return b.String() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/erofs/inode_refs.go000066400000000000000000000100661465435605700260270ustar00rootroot00000000000000package erofs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const inodeenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var inodeobj *inode // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type inodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *inodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *inodeRefs) RefType() string { return fmt.Sprintf("%T", inodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *inodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *inodeRefs) LogRefs() bool { return inodeenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *inodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. 
// //go:nosplit func (r *inodeRefs) IncRef() { v := r.refCount.Add(1) if inodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *inodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if inodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *inodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if inodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *inodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/erofs/regular_file.go000066400000000000000000000140151465435605700263500ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package erofs import ( "io" "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/erofs" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) // +stateify savable type regularFileFD struct { fileDescription // offMu protects off. offMu sync.Mutex `state:"nosave"` // off is the file offset. // +checklocks:offMu off int64 } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if offset < 0 { return 0, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. 
if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { return 0, nil } data, err := fd.inode().Data() if err != nil { return 0, err } r := ®ularFileReader{ data: data, off: uint64(offset), } return dst.CopyOutFrom(ctx, r) } type regularFileReader struct { data safemem.BlockSeq off uint64 } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (r *regularFileReader) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { if r.off >= r.data.NumBytes() { return 0, io.EOF } cp, err := safemem.CopySeq(dsts, r.data.DropFirst(int(r.off))) r.off += cp return cp, err } // Read implements vfs.FileDescriptionImpl.Read. func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { fd.offMu.Lock() n, err := fd.PRead(ctx, dst, fd.off, opts) fd.off += n fd.offMu.Unlock() return n, err } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return 0, linuxerr.EROFS } // Write implements vfs.FileDescriptionImpl.Write. func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { return 0, linuxerr.EROFS } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.offMu.Lock() defer fd.offMu.Unlock() switch whence { case linux.SEEK_SET: // use offset as specified case linux.SEEK_CUR: offset += fd.off case linux.SEEK_END: offset += int64(fd.inode().Size()) default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } fd.off = offset return offset, nil } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return vfs.GenericConfigureMMap(&fd.vfsfd, fd.inode(), opts) } // AddMapping implements memmap.Mappable.AddMapping. func (i *inode) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { i.mapsMu.Lock() i.mappings.AddMapping(ms, ar, offset, writable) i.mapsMu.Unlock() return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (i *inode) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { i.mapsMu.Lock() i.mappings.RemoveMapping(ms, ar, offset, writable) i.mapsMu.Unlock() } // CopyMapping implements memmap.Mappable.CopyMapping. func (i *inode) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { i.AddMapping(ctx, ms, dstAR, offset, writable) return nil } // Translate implements memmap.Mappable.Translate. func (i *inode) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { pgend, _ := hostarch.PageRoundUp(i.Size()) if required.End > pgend { if required.Start >= pgend { return nil, &memmap.BusError{io.EOF} } required.End = pgend } if optional.End > pgend { optional.End = pgend } if at.Write { return nil, &memmap.BusError{linuxerr.EROFS} } offset, err := i.DataOffset() if err != nil { return nil, &memmap.BusError{err} } mr := optional return []memmap.Translation{ { Source: mr, File: &i.fs.mf, Offset: mr.Start + offset, Perms: at, }, }, nil } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. 
func (i *inode) InvalidateUnsavable(ctx context.Context) error { i.mapsMu.Lock() i.mappings.InvalidateAll(memmap.InvalidateOpts{}) i.mapsMu.Unlock() return nil } // +stateify savable type imageMemmapFile struct { memmap.NoBufferedIOFallback image *erofs.Image } // IncRef implements memmap.File.IncRef. func (mf *imageMemmapFile) IncRef(fr memmap.FileRange, memCgID uint32) {} // DecRef implements memmap.File.DecRef. func (mf *imageMemmapFile) DecRef(fr memmap.FileRange) {} // MapInternal implements memmap.File.MapInternal. func (mf *imageMemmapFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { if at.Write { return safemem.BlockSeq{}, &memmap.BusError{linuxerr.EROFS} } bytes, err := mf.image.BytesAt(fr.Start, fr.Length()) if err != nil { return safemem.BlockSeq{}, &memmap.BusError{err} } return safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bytes)), nil } // FD implements memmap.File.FD. func (mf *imageMemmapFile) FD() int { return mf.image.FD() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/erofs/save_restore.go000066400000000000000000000032501465435605700264100ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package erofs import ( "context" "fmt" "os" "gvisor.dev/gvisor/pkg/erofs" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // afterLoad is called by stateify. func (fs *filesystem) afterLoad(ctx context.Context) { fdmap := vfs.RestoreFilesystemFDMapFromContext(ctx) fd, ok := fdmap[fs.iopts.UniqueID] if !ok { panic(fmt.Sprintf("no image FD available for filesystem with unique ID %q", fs.iopts.UniqueID)) } newImage, err := erofs.OpenImage(os.NewFile(uintptr(fd), "EROFS image file")) if err != nil { panic(fmt.Sprintf("erofs.OpenImage failed: %v", err)) } if got, want := newImage.SuperBlock(), fs.image.SuperBlock(); got != want { panic(fmt.Sprintf("superblock mismatch detected on restore, got %+v, expected %+v", got, want)) } // We need to update the image in place, as there are other pointers // pointing to this image as well. *fs.image = *newImage } // saveParent is called by stateify. func (d *dentry) saveParent() *dentry { return d.parent.Load() } // loadParent is called by stateify. func (d *dentry) loadParent(_ context.Context, parent *dentry) { d.parent.Store(parent) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/eventfd/000077500000000000000000000000001465435605700236755ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/eventfd/eventfd.go000066400000000000000000000172271465435605700256700ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package eventfd implements event fds. package eventfd import ( "fmt" "math" "sync" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // EventFileDescription implements vfs.FileDescriptionImpl for file-based event // notification (eventfd). Eventfds are usually internal to the Sentry but in // certain situations they may be converted into a host-backed eventfd. // // +stateify savable type EventFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD // queue is used to notify interested parties when the event object // becomes readable or writable. queue waiter.Queue // mu protects the fields below. mu sync.Mutex `state:"nosave"` // val is the current value of the event counter. val uint64 // semMode specifies whether the event is in "semaphore" mode. semMode bool // hostfd indicates whether this eventfd is passed through to the host. hostfd int } var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil) // New creates a new event fd. func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[eventfd]") defer vd.DecRef(ctx) efd := &EventFileDescription{ val: initVal, semMode: semMode, hostfd: -1, } if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ UseDentryMetadata: true, DenyPRead: true, DenyPWrite: true, DenySpliceIn: true, }); err != nil { return nil, err } return &efd.vfsfd, nil } // HostFD returns the host eventfd associated with this event. func (efd *EventFileDescription) HostFD() (int, error) { efd.mu.Lock() defer efd.mu.Unlock() if efd.hostfd >= 0 { return efd.hostfd, nil } flags := linux.EFD_NONBLOCK if efd.semMode { flags |= linux.EFD_SEMAPHORE } fd, _, errno := unix.Syscall(unix.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0) if errno != 0 { return -1, errno } if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil { if closeErr := unix.Close(int(fd)); closeErr != nil { log.Warningf("close(%d) eventfd failed: %v", fd, closeErr) } return -1, err } efd.hostfd = int(fd) return efd.hostfd, nil } // Release implements vfs.FileDescriptionImpl.Release. func (efd *EventFileDescription) Release(context.Context) { efd.mu.Lock() defer efd.mu.Unlock() if efd.hostfd >= 0 { fdnotifier.RemoveFD(int32(efd.hostfd)) if closeErr := unix.Close(int(efd.hostfd)); closeErr != nil { log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr) } efd.hostfd = -1 } } // Read implements vfs.FileDescriptionImpl.Read. 
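//
// Reads always transfer exactly 8 bytes of the counter in host byte order;
// shorter destination buffers fail with EINVAL. The snippet below is a
// hedged, illustrative sketch (the surrounding ctx is assumed to be
// available); it shows how a caller might decode the value and is not an
// upstream usage requirement:
//
//	var buf [8]byte
//	dst := usermem.BytesIOSequence(buf[:])
//	if _, err := efd.Read(ctx, dst, vfs.ReadOptions{}); err == nil {
//		// In semaphore mode the returned value is always 1; otherwise it is
//		// the whole counter, which is reset to zero by the read.
//		val := hostarch.ByteOrder.Uint64(buf[:])
//		_ = val
//	}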
func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { if dst.NumBytes() < 8 { return 0, unix.EINVAL } if err := efd.read(ctx, dst); err != nil { return 0, err } return 8, nil } // Write implements vfs.FileDescriptionImpl.Write. func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { if src.NumBytes() < 8 { return 0, unix.EINVAL } if err := efd.write(ctx, src); err != nil { return 0, err } return 8, nil } // Preconditions: Must be called with efd.mu locked. func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error { var buf [8]byte if _, err := unix.Read(efd.hostfd, buf[:]); err != nil { if err == unix.EWOULDBLOCK { return linuxerr.ErrWouldBlock } return err } _, err := dst.CopyOut(ctx, buf[:]) return err } func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error { efd.mu.Lock() if efd.hostfd >= 0 { defer efd.mu.Unlock() return efd.hostReadLocked(ctx, dst) } // We can't complete the read if the value is currently zero. if efd.val == 0 { efd.mu.Unlock() return linuxerr.ErrWouldBlock } // Update the value based on the mode the event is operating in. var val uint64 if efd.semMode { val = 1 // Consistent with Linux, this is done even if writing to memory fails. efd.val-- } else { val = efd.val efd.val = 0 } efd.mu.Unlock() // Notify writers. We do this even if we were already writable because // it is possible that a writer is waiting to write the maximum value // to the event. efd.queue.Notify(waiter.WritableEvents) var buf [8]byte hostarch.ByteOrder.PutUint64(buf[:], val) _, err := dst.CopyOut(ctx, buf[:]) return err } // Preconditions: Must be called with efd.mu locked. func (efd *EventFileDescription) hostWriteLocked(val uint64) error { var buf [8]byte hostarch.ByteOrder.PutUint64(buf[:], val) _, err := unix.Write(efd.hostfd, buf[:]) if err == unix.EWOULDBLOCK { return linuxerr.ErrWouldBlock } return err } func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error { var buf [8]byte if _, err := src.CopyIn(ctx, buf[:]); err != nil { return err } val := hostarch.ByteOrder.Uint64(buf[:]) return efd.Signal(val) } // Signal is an internal function to signal the event fd. func (efd *EventFileDescription) Signal(val uint64) error { if val == math.MaxUint64 { return unix.EINVAL } efd.mu.Lock() if efd.hostfd >= 0 { defer efd.mu.Unlock() return efd.hostWriteLocked(val) } // We only allow writes that won't cause the value to go over the max // uint64 minus 1. if val > math.MaxUint64-1-efd.val { efd.mu.Unlock() return linuxerr.ErrWouldBlock } efd.val += val efd.mu.Unlock() // Always trigger a notification. efd.queue.Notify(waiter.ReadableEvents) return nil } // Readiness implements waiter.Waitable.Readiness. func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { efd.mu.Lock() defer efd.mu.Unlock() if efd.hostfd >= 0 { return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask) } ready := waiter.EventMask(0) if efd.val > 0 { ready |= waiter.ReadableEvents } if efd.val < math.MaxUint64-1 { ready |= waiter.WritableEvents } return mask & ready } // EventRegister implements waiter.Waitable.EventRegister. 
func (efd *EventFileDescription) EventRegister(entry *waiter.Entry) error { efd.queue.EventRegister(entry) efd.mu.Lock() defer efd.mu.Unlock() if efd.hostfd >= 0 { if err := fdnotifier.UpdateFD(int32(efd.hostfd)); err != nil { efd.queue.EventUnregister(entry) return err } } return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) { efd.queue.EventUnregister(entry) efd.mu.Lock() defer efd.mu.Unlock() if efd.hostfd >= 0 { if err := fdnotifier.UpdateFD(int32(efd.hostfd)); err != nil { panic(fmt.Sprint("UpdateFD:", err)) } } } // Epollable implements FileDescriptionImpl.Epollable. func (efd *EventFileDescription) Epollable() bool { return true } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/eventfd/eventfd_state_autogen.go000066400000000000000000000030551465435605700306040ustar00rootroot00000000000000// automatically generated by stateify. package eventfd import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (efd *EventFileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/eventfd.EventFileDescription" } func (efd *EventFileDescription) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", "queue", "val", "semMode", "hostfd", } } func (efd *EventFileDescription) beforeSave() {} // +checklocksignore func (efd *EventFileDescription) StateSave(stateSinkObject state.Sink) { efd.beforeSave() stateSinkObject.Save(0, &efd.vfsfd) stateSinkObject.Save(1, &efd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &efd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &efd.NoLockFD) stateSinkObject.Save(4, &efd.queue) stateSinkObject.Save(5, &efd.val) stateSinkObject.Save(6, &efd.semMode) stateSinkObject.Save(7, &efd.hostfd) } func (efd *EventFileDescription) afterLoad(context.Context) {} // +checklocksignore func (efd *EventFileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &efd.vfsfd) stateSourceObject.Load(1, &efd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &efd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &efd.NoLockFD) stateSourceObject.Load(4, &efd.queue) stateSourceObject.Load(5, &efd.val) stateSourceObject.Load(6, &efd.semMode) stateSourceObject.Load(7, &efd.hostfd) } func init() { state.Register((*EventFileDescription)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/000077500000000000000000000000001465435605700232045ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/connection.go000066400000000000000000000252251465435605700257000ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
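//
// Editorial note (hedged): the connection type defined in this file is the
// sentry-side handle used to exchange requests with the FUSE server. As a
// rough, illustrative sketch of the calling pattern (request construction and
// error handling are elided and assumed, not upstream guarantees):
//
//	// Background request: queue it and return without waiting for a reply.
//	err := conn.CallAsync(ctx, req)
//	// Synchronous request: block the calling task until the server replies.
//	res, err := conn.Call(ctx, req)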
package fuse import ( goContext "context" "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/waiter" ) const ( // fuseDefaultMaxBackground is the default value for MaxBackground. fuseDefaultMaxBackground = 12 // fuseDefaultCongestionThreshold is the default value for CongestionThreshold, // and is 75% of the default maximum of MaxGround. fuseDefaultCongestionThreshold = (fuseDefaultMaxBackground * 3 / 4) // fuseDefaultMaxPagesPerReq is the default value for MaxPagesPerReq. fuseDefaultMaxPagesPerReq = 32 ) // connection is the struct by which the sentry communicates with the FUSE server daemon. // // Lock order: // - conn.fd.mu // - conn.mu // - conn.asyncMu // // +stateify savable type connection struct { fd *DeviceFD // mu protects access to struct members. mu sync.Mutex `state:"nosave"` // attributeVersion is the version of connection's attributes. attributeVersion atomicbitops.Uint64 // We target FUSE 7.23. // The following FUSE_INIT flags are currently unsupported by this implementation: // - FUSE_EXPORT_SUPPORT // - FUSE_POSIX_LOCKS: requires POSIX locks // - FUSE_FLOCK_LOCKS: requires POSIX locks // - FUSE_AUTO_INVAL_DATA: requires page caching eviction // - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation // - FUSE_ASYNC_DIO // - FUSE_PARALLEL_DIROPS (7.25) // - FUSE_HANDLE_KILLPRIV (7.26) // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler (7.26) // - FUSE_ABORT_ERROR (7.27) // - FUSE_CACHE_SYMLINKS (7.28) // - FUSE_NO_OPENDIR_SUPPORT (7.29) // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction (7.30) // - FUSE_MAP_ALIGNMENT (7.31) // initialized after receiving FUSE_INIT reply. // Until it's set, suspend sending FUSE requests. // Use SetInitialized() and IsInitialized() for atomic access. initialized atomicbitops.Int32 // initializedChan is used to block requests before initialization. initializedChan chan struct{} `state:".(bool)"` // connected (connection established) when a new FUSE file system is created. // Set to false when: // umount, // connection abort, // device release. // +checklocks:mu connected bool // connInitError if FUSE_INIT encountered error (major version mismatch). // Only set in INIT. // +checklocks:mu connInitError bool // connInitSuccess if FUSE_INIT is successful. // Only set in INIT. // Used for destroy (not yet implemented). // +checklocks:mu connInitSuccess bool // aborted via sysfs, and will send ECONNABORTED to read after disconnection (instead of ENODEV). // Set only if abortErr is true and via fuse control fs (not yet implemented). // TODO(gvisor.dev/issue/3525): set this to true when user aborts. aborted bool // numWaiting is the number of requests waiting to be // sent to FUSE device or being processed by FUSE daemon. numWaiting uint32 // Terminology note: // // - `asyncNumMax` is the `MaxBackground` in the FUSE_INIT_IN struct. // // - `asyncCongestionThreshold` is the `CongestionThreshold` in the FUSE_INIT_IN struct. // // We call the "background" requests in unix term as async requests. // The "async requests" in unix term is our async requests that expect a reply, // i.e. `!request.noReply` // asyncMu protects the async request fields. asyncMu sync.Mutex `state:"nosave"` // asyncNum is the number of async requests. // +checklocks:asyncMu asyncNum uint16 // asyncCongestionThreshold the number of async requests. 
// Negotiated in FUSE_INIT as "CongestionThreshold". // TODO(gvisor.dev/issue/3529): add congestion control. // +checklocks:asyncMu asyncCongestionThreshold uint16 // asyncNumMax is the maximum number of asyncNum. // Connection blocks the async requests when it is reached. // Negotiated in FUSE_INIT as "MaxBackground". // +checklocks:asyncMu asyncNumMax uint16 // maxRead is the maximum size of a read buffer in in bytes. // Initialized from a fuse fs parameter. maxRead uint32 // maxWrite is the maximum size of a write buffer in bytes. // Negotiated in FUSE_INIT. maxWrite uint32 // maxPages is the maximum number of pages for a single request to use. // Negotiated in FUSE_INIT. maxPages uint16 // maxActiveRequests specifies the maximum number of active requests that can // exist at any time. Any further requests will block when trying to CAll // the server. maxActiveRequests uint64 // minor version of the FUSE protocol. // Negotiated and only set in INIT. minor uint32 // atomicOTrunc is true when FUSE does not send a separate SETATTR request // before open with O_TRUNC flag. // Negotiated and only set in INIT. atomicOTrunc bool // asyncRead if read pages asynchronously. // Negotiated and only set in INIT. asyncRead bool // writebackCache is true for write-back cache policy, // false for write-through policy. // Negotiated and only set in INIT. writebackCache bool // bigWrites if doing multi-page cached writes. // Negotiated and only set in INIT. bigWrites bool // dontMask if filesystem does not apply umask to creation modes. // Negotiated in INIT. dontMask bool // noOpen if FUSE server doesn't support open operation. // This flag only influences performance, not correctness of the program. noOpen bool } func (conn *connection) saveInitializedChan() bool { select { case <-conn.initializedChan: return true // Closed. default: return false // Not closed. } } func (conn *connection) loadInitializedChan(_ goContext.Context, closed bool) { conn.initializedChan = make(chan struct{}, 1) if closed { close(conn.initializedChan) } } // newFUSEConnection creates a FUSE connection to fuseFD. // +checklocks:fuseFD.mu func newFUSEConnection(_ context.Context, fuseFD *DeviceFD, opts *filesystemOptions) (*connection, error) { // Mark the device as ready so it can be used. // FIXME(gvisor.dev/issue/4813): fuseFD's fields are accessed without // synchronization and without checking if fuseFD has already been used to // mount another filesystem. // Create the writeBuf for the header to be stored in. fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse) fuseFD.fullQueueCh = make(chan struct{}, opts.maxActiveRequests) return &connection{ fd: fuseFD, asyncNumMax: fuseDefaultMaxBackground, asyncCongestionThreshold: fuseDefaultCongestionThreshold, maxRead: opts.maxRead, maxPages: fuseDefaultMaxPagesPerReq, maxActiveRequests: opts.maxActiveRequests, initializedChan: make(chan struct{}), connected: true, }, nil } // CallAsync makes an async (aka background) request. // It's a simple wrapper around Call(). func (conn *connection) CallAsync(ctx context.Context, r *Request) error { r.async = true _, err := conn.Call(ctx, r) return err } // Call makes a request to the server. // Block before the connection is initialized. // When the Request is FUSE_INIT, it will not be blocked before initialization. // Task should never be nil. // // For a sync request, it blocks the invoking task until // a server responds with a response. 
// // For an async request (that do not expect a response immediately), // it returns directly unless being blocked either before initialization // or when there are too many async requests ongoing. // // Example for async request: // init, readahead, write, async read/write, fuse_notify_reply, // non-sync release, interrupt, forget. // // The forget request does not have a reply, // as documented in include/uapi/linux/fuse.h:FUSE_FORGET. func (conn *connection) Call(ctx context.Context, r *Request) (*Response, error) { b := blockerFromContext(ctx) // Block requests sent before connection is initialized. if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT { if err := b.Block(conn.initializedChan); err != nil { return nil, err } } conn.fd.mu.Lock() conn.mu.Lock() connected := conn.connected connInitError := conn.connInitError conn.mu.Unlock() if !connected { conn.fd.mu.Unlock() return nil, linuxerr.ENOTCONN } if connInitError { conn.fd.mu.Unlock() return nil, linuxerr.ECONNREFUSED } fut, err := conn.callFuture(b, r) conn.fd.mu.Unlock() if err != nil { return nil, err } return fut.resolve(b) } // callFuture makes a request to the server and returns a future response. // Call resolve() when the response needs to be fulfilled. // +checklocks:conn.fd.mu func (conn *connection) callFuture(b context.Blocker, r *Request) (*futureResponse, error) { // Is the queue full? // // We must busy wait here until the request can be queued. We don't // block on the fd.fullQueueCh with a lock - so after being signalled, // before we acquire the lock, it is possible that a barging task enters // and queues a request. As a result, upon acquiring the lock we must // again check if the room is available. // // This can potentially starve a request forever but this can only happen // if there are always too many ongoing requests all the time. The // supported maxActiveRequests setting should be really high to avoid this. for conn.fd.numActiveRequests == conn.maxActiveRequests { log.Infof("Blocking request %v from being queued. Too many active requests: %v", r.id, conn.fd.numActiveRequests) conn.fd.mu.Unlock() err := b.Block(conn.fd.fullQueueCh) conn.fd.mu.Lock() if err != nil { return nil, err } } return conn.callFutureLocked(r) } // callFutureLocked makes a request to the server and returns a future response. // +checklocks:conn.fd.mu func (conn *connection) callFutureLocked(r *Request) (*futureResponse, error) { // Check connected again holding conn.mu. conn.mu.Lock() if !conn.connected { conn.mu.Unlock() // we checked connected before, // this must be due to aborted connection. return nil, linuxerr.ECONNABORTED } conn.mu.Unlock() conn.fd.queue.PushBack(r) conn.fd.numActiveRequests++ fut := newFutureResponse(r) conn.fd.completions[r.id] = fut // Signal the readers that there is something to read. conn.fd.waitQueue.Notify(waiter.ReadableEvents) return fut, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/connection_control.go000066400000000000000000000165251465435605700274430ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package fuse import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // consts used by FUSE_INIT negotiation. const ( // fuseMaxMaxPages is the maximum value for MaxPages received in InitOut. // Follow the same behavior as unix fuse implementation. fuseMaxMaxPages = 256 // Maximum value for the time granularity for file time stamps, 1s. // Follow the same behavior as unix fuse implementation. fuseMaxTimeGranNs = 1000000000 // Minimum value for MaxWrite and MaxRead. // Follow the same behavior as unix fuse implementation. fuseMinMaxWrite = 4096 fuseMinMaxRead = 4096 // Temporary default value for max readahead, 128kb. fuseDefaultMaxReadahead = 131072 // The FUSE_INIT_IN flags sent to the daemon. // TODO(gvisor.dev/issue/3199): complete the flags. fuseDefaultInitFlags = linux.FUSE_MAX_PAGES // An INIT response needs to be at least this long. minInitSize = 24 ) // Adjustable maximums for Connection's cogestion control parameters. // Used as the upperbound of the config values. // Currently we do not support adjustment to them. var ( MaxUserBackgroundRequest uint16 = fuseDefaultMaxBackground MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold ) // SetInitialized atomically sets the connection as initialized. func (conn *connection) SetInitialized() { // Unblock the requests sent before INIT. close(conn.initializedChan) // Close the channel first to avoid the non-atomic situation // where conn.initialized is true but there are // tasks being blocked on the channel. // And it prevents the newer tasks from gaining // unnecessary higher chance to be issued before the blocked one. conn.initialized.Store(1) } // Initialized atomically check if the connection is initialized. pairs with // SetInitialized(). func (conn *connection) Initialized() bool { return conn.initialized.Load() != 0 } // InitSend sends a FUSE_INIT request. func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error { in := linux.FUSEInitIn{ Major: linux.FUSE_KERNEL_VERSION, Minor: linux.FUSE_KERNEL_MINOR_VERSION, // TODO(gvisor.dev/issue/3196): find appropriate way to calculate this MaxReadahead: fuseDefaultMaxReadahead, Flags: fuseDefaultInitFlags, } req := conn.NewRequest(creds, pid, 0, linux.FUSE_INIT, &in) // Since there is no task to block on and FUSE_INIT is the request // to unblock other requests, use context.Background(). return conn.CallAsync(context.Background(), req) } // InitRecv receives a FUSE_INIT reply and process it. // // Preconditions: conn.asyncMu must not be held if minor version is newer than 13. func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error { if err := res.Error(); err != nil { return err } if res.DataLen() < minInitSize { return linuxerr.EINVAL } initRes := fuseInitRes{initLen: res.DataLen()} if err := res.UnmarshalPayload(&initRes); err != nil { return err } return conn.initProcessReply(&initRes.initOut, hasSysAdminCap) } // Process the FUSE_INIT reply from the FUSE server. // It tries to acquire the conn.asyncMu lock if minor version is newer than 13. func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error { conn.mu.Lock() // No matter error or not, always set initialized. // to unblock the blocked requests. 
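// The negotiated fields below are gated on the server's minor protocol
// version: MaxWrite is honored for minor >= 5, the feature flags (async read,
// big writes, writeback cache, atomic O_TRUNC, max pages, ...) for minor >= 6,
// and the MaxBackground/CongestionThreshold limits for minor >= 13. Older
// servers are left with the package defaults (fuseMinMaxWrite for writes and
// the fuseDefault* limits set in newFUSEConnection).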
defer func() { conn.SetInitialized() conn.mu.Unlock() }() // No support for old major fuse versions. if out.Major != linux.FUSE_KERNEL_VERSION { conn.connInitError = true return nil } // Start processing the reply. conn.connInitSuccess = true conn.minor = out.Minor // No support for negotiating MaxWrite before minor version 5. if out.Minor >= 5 { conn.maxWrite = out.MaxWrite } else { conn.maxWrite = fuseMinMaxWrite } if conn.maxWrite < fuseMinMaxWrite { conn.maxWrite = fuseMinMaxWrite } // No support for the following flags before minor version 6. if out.Minor >= 6 { conn.asyncRead = out.Flags&linux.FUSE_ASYNC_READ != 0 conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0 conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0 conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0 conn.atomicOTrunc = out.Flags&linux.FUSE_ATOMIC_O_TRUNC != 0 // TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs). if out.Flags&linux.FUSE_MAX_PAGES != 0 { maxPages := out.MaxPages if maxPages < 1 { maxPages = 1 } if maxPages > fuseMaxMaxPages { maxPages = fuseMaxMaxPages } conn.maxPages = maxPages } } // No support for limits before minor version 13. if out.Minor >= 13 { conn.asyncMu.Lock() if out.MaxBackground > 0 { conn.asyncNumMax = out.MaxBackground if !hasSysAdminCap && conn.asyncNumMax > MaxUserBackgroundRequest { conn.asyncNumMax = MaxUserBackgroundRequest } } if out.CongestionThreshold > 0 { conn.asyncCongestionThreshold = out.CongestionThreshold if !hasSysAdminCap && conn.asyncCongestionThreshold > MaxUserCongestionThreshold { conn.asyncCongestionThreshold = MaxUserCongestionThreshold } } conn.asyncMu.Unlock() } return nil } // Abort this FUSE connection. // It tries to acquire conn.fd.mu, conn.lock, conn.bgLock in order. // All possible requests waiting or blocking will be aborted. // // +checklocks:conn.fd.mu func (conn *connection) Abort(ctx context.Context) { conn.mu.Lock() conn.asyncMu.Lock() if !conn.connected { conn.asyncMu.Unlock() conn.mu.Unlock() return } conn.connected = false // Empty the `fd.queue` that holds the requests // not yet read by the FUSE daemon yet. // These are a subset of the requests in `fuse.completion` map. for !conn.fd.queue.Empty() { req := conn.fd.queue.Front() conn.fd.queue.Remove(req) } var terminate []linux.FUSEOpID // 2. Collect the requests have not been sent to FUSE daemon, // or have not received a reply. for unique := range conn.fd.completions { terminate = append(terminate, unique) } // Release locks to avoid deadlock. conn.asyncMu.Unlock() conn.mu.Unlock() // 1. The request blocked before initialization. // Will reach call() `connected` check and return. if !conn.Initialized() { conn.SetInitialized() } // 2. Terminate the requests collected above. // Set ECONNABORTED error. // sendError() will remove them from `fd.completion` map. // Will enter the path of a normally received error. for _, toTerminate := range terminate { conn.fd.sendError(ctx, -int32(unix.ECONNABORTED), toTerminate) } // 3. The requests not yet written to FUSE device. // Early terminate. // Will reach callFutureLocked() `connected` check and return. close(conn.fd.fullQueueCh) // TODO(gvisor.dev/issue/3528): Forget all pending forget reqs. } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/dev.go000066400000000000000000000251461465435605700243210ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fuse import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const fuseDevMinor = 229 // This is equivalent to linux.SizeOfFUSEHeaderIn const fuseHeaderOutSize = 16 // fuseDevice implements vfs.Device for /dev/fuse. // // +stateify savable type fuseDevice struct{} // Open implements vfs.Device.Open. func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { var fd DeviceFD if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } // DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse. // // +stateify savable type DeviceFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD // waitQueue is used to notify interested parties when the device becomes // readable or writable. waitQueue waiter.Queue // fullQueueCh is a channel used to synchronize the readers with the writers. // Writers (inbound requests to the filesystem) block if there are too many // unprocessed in-flight requests. fullQueueCh chan struct{} `state:".(int)"` // mu protects all the queues, maps, buffers and cursors and nextOpID. mu sync.Mutex `state:"nosave"` // nextOpID is used to create new requests. // +checklocks:mu nextOpID linux.FUSEOpID // queue is the list of requests that need to be processed by the FUSE server. // +checklocks:mu queue requestList // numActiveRequests is the number of requests made by the Sentry that has // yet to be responded to. // +checklocks:mu numActiveRequests uint64 // completions is used to map a request to its response. A Writer will use this // to notify the caller of a completed response. // +checklocks:mu completions map[linux.FUSEOpID]*futureResponse // writeBuf is the memory buffer used to copy in the FUSE out header from // userspace. // +checklocks:mu writeBuf [fuseHeaderOutSize]byte // conn is the FUSE connection that this FD is being used for. // +checklocks:mu conn *connection } // Release implements vfs.FileDescriptionImpl.Release. func (fd *DeviceFD) Release(ctx context.Context) { fd.mu.Lock() defer fd.mu.Unlock() if fd.conn != nil { fd.conn.mu.Lock() fd.conn.connected = false fd.conn.mu.Unlock() fd.conn.Abort(ctx) // +checklocksforce: fd.conn.fd.mu=fd.mu fd.waitQueue.Notify(waiter.ReadableEvents) fd.conn = nil } } // connected returns true if fd.conn is set and the connection has not been // aborted. 
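// Reads, writes, seeks and polls on /dev/fuse check this first, since those
// operations are meaningless until a FUSE filesystem has been mounted on top
// of this device FD.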
// +checklocks:fd.mu func (fd *DeviceFD) connected() bool { if fd.conn != nil { fd.conn.mu.Lock() defer fd.conn.mu.Unlock() return fd.conn.connected } return false } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is // mounted. If there is an active connection we know there is at least one // filesystem mounted. fd.mu.Lock() defer fd.mu.Unlock() if !fd.connected() { return 0, linuxerr.EPERM } return 0, linuxerr.ENOSYS } // Read implements vfs.FileDescriptionImpl.Read. func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() if !fd.connected() { return 0, linuxerr.EPERM } // We require that any Read done on this filesystem have a sane minimum // read buffer. It must have the capacity for the fixed parts of any request // header (Linux uses the request header and the FUSEWriteIn header for this // calculation) + the negotiated MaxWrite room for the data. minBuffSize := linux.FUSE_MIN_READ_BUFFER fd.conn.mu.Lock() negotiatedMinBuffSize := linux.SizeOfFUSEHeaderIn + linux.SizeOfFUSEHeaderOut + fd.conn.maxWrite fd.conn.mu.Unlock() if minBuffSize < negotiatedMinBuffSize { minBuffSize = negotiatedMinBuffSize } // If the read buffer is too small, error out. if dst.NumBytes() < int64(minBuffSize) { return 0, linuxerr.EINVAL } // Find the first valid request. For the normal case this loop only executes // once. var req *Request for req = fd.queue.Front(); !fd.queue.Empty(); req = fd.queue.Front() { if int64(req.hdr.Len) <= dst.NumBytes() { break } // The request is too large so we cannot process it. All requests must be // smaller than the negotiated size as specified by Connection.MaxWrite set // as part of the FUSE_INIT handshake. errno := -int32(unix.EIO) if req.hdr.Opcode == linux.FUSE_SETXATTR { errno = -int32(unix.E2BIG) } if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil { return 0, err } fd.queue.Remove(req) req = nil } if req == nil { return 0, linuxerr.ErrWouldBlock } // We already checked the size: dst must be able to fit the whole request. n, err := dst.CopyOut(ctx, req.data) if err != nil { return 0, err } if n != len(req.data) { return 0, linuxerr.EIO } fd.queue.Remove(req) // Remove noReply ones from the map of requests expecting a reply. if req.noReply { fd.numActiveRequests-- delete(fd.completions, req.hdr.Unique) } return int64(n), nil } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is // mounted. If there is an active connection we know there is at least one // filesystem mounted. fd.mu.Lock() defer fd.mu.Unlock() if !fd.connected() { return 0, linuxerr.EPERM } return 0, linuxerr.ENOSYS } // Write implements vfs.FileDescriptionImpl.Write. 
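// The FUSE server replies by writing a linux.FUSEHeaderOut followed by the
// opcode-specific payload; hdr.Unique must identify an outstanding request in
// fd.completions, otherwise the write is rejected with EINVAL.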
func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() if !fd.connected() { return 0, linuxerr.EPERM } n, err := src.CopyIn(ctx, fd.writeBuf[:]) if err != nil { return 0, err } var hdr linux.FUSEHeaderOut hdr.UnmarshalBytes(fd.writeBuf[:]) fut, ok := fd.completions[hdr.Unique] if !ok { // Server sent us a response for a request we never sent, or for which we // already received a reply (e.g. aborted), an unlikely event. return 0, linuxerr.EINVAL } delete(fd.completions, hdr.Unique) // Copy over the header into the future response. The rest of the payload // will be copied over to the FR's data in the next iteration. fut.hdr = &hdr fut.data = make([]byte, fut.hdr.Len) copy(fut.data, fd.writeBuf[:]) if fut.hdr.Len > uint32(len(fd.writeBuf)) { src = src.DropFirst(len(fd.writeBuf)) n2, err := src.CopyIn(ctx, fut.data[len(fd.writeBuf):]) if err != nil { return 0, err } n += n2 } if err := fd.sendResponse(ctx, fut); err != nil { return 0, err } return int64(n), nil } // Readiness implements vfs.FileDescriptionImpl.Readiness. func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { fd.mu.Lock() defer fd.mu.Unlock() var ready waiter.EventMask if !fd.connected() { ready |= waiter.EventErr return ready & mask } // FD is always writable. ready |= waiter.WritableEvents if !fd.queue.Empty() { // Have reqs available, FD is readable. ready |= waiter.ReadableEvents } return ready & mask } // EventRegister implements waiter.Waitable.EventRegister. func (fd *DeviceFD) EventRegister(e *waiter.Entry) error { fd.mu.Lock() defer fd.mu.Unlock() fd.waitQueue.EventRegister(e) return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { fd.mu.Lock() defer fd.mu.Unlock() fd.waitQueue.EventUnregister(e) } // Epollable implements FileDescriptionImpl.Epollable. func (fd *DeviceFD) Epollable() bool { return true } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is // mounted. If there is an active connection we know there is at least one // filesystem mounted. fd.mu.Lock() defer fd.mu.Unlock() if !fd.connected() { return 0, linuxerr.EPERM } return 0, linuxerr.ENOSYS } // sendResponse sends a response to the waiting task (if any). // // +checklocks:fd.mu func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error { // Signal the task waiting on a response if any. defer close(fut.ch) // Signal that the queue is no longer full. select { case fd.fullQueueCh <- struct{}{}: default: } fd.numActiveRequests-- if fut.async { return fd.asyncCallBack(ctx, fut.getResponse()) } return nil } // sendError sends an error response to the waiting task (if any) by calling sendResponse(). // // +checklocks:fd.mu func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error { // Return the error to the calling task. respHdr := linux.FUSEHeaderOut{ Len: linux.SizeOfFUSEHeaderOut, Error: errno, Unique: unique, } fut, ok := fd.completions[respHdr.Unique] if !ok { // A response for a request we never sent, // or for which we already received a reply (e.g. aborted). return linuxerr.EINVAL } delete(fd.completions, respHdr.Unique) fut.hdr = &respHdr return fd.sendResponse(ctx, fut) } // asyncCallBack executes pre-defined callback function for async requests. 
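// It is invoked from sendResponse with fd.mu already held, so a callback must
// not attempt to reacquire fd.mu.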
// Currently used by: FUSE_INIT. // +checklocks:fd.mu func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error { switch r.opcode { case linux.FUSE_INIT: creds := auth.CredentialsFromContext(ctx) rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace() return fd.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)) // TODO(gvisor.dev/issue/3247): support async read: correctly process the response. } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/dev_state.go000066400000000000000000000014761465435605700255210ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fuse import ( "context" ) func (fd *DeviceFD) saveFullQueueCh() int { return cap(fd.fullQueueCh) } func (fd *DeviceFD) loadFullQueueCh(_ context.Context, capacity int) { fd.fullQueueCh = make(chan struct{}, capacity) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/directory.go000066400000000000000000000056131465435605700255440ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fuse import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) // +stateify savable type directoryFD struct { fileDescription } // Allocate implements directoryFD.Allocate. func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { return linuxerr.EISDIR } // PRead implements vfs.FileDescriptionImpl.PRead. func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, linuxerr.EISDIR } // Read implements vfs.FileDescriptionImpl.Read. func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { return 0, linuxerr.EISDIR } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return 0, linuxerr.EISDIR } // Write implements vfs.FileDescriptionImpl.Write. func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { return 0, linuxerr.EISDIR } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. 
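// Each call issues a single FUSE_READDIR request of at most
// linux.FUSE_PAGE_SIZE bytes at the current offset and advances dir.off past
// every dirent handed to the callback, so a subsequent call resumes where the
// previous one stopped.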
func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error { fusefs := dir.inode().fs in := linux.FUSEReadIn{ Fh: dir.Fh, Offset: uint64(dir.off.Load()), Size: linux.FUSE_PAGE_SIZE, Flags: dir.statusFlags(), } // TODO(gVisor.dev/issue/3404): Support FUSE_READDIRPLUS. req := fusefs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), dir.inode().nodeID, linux.FUSE_READDIR, &in) res, err := fusefs.conn.Call(ctx, req) if err != nil { return err } if err := res.Error(); err != nil { return err } var out linux.FUSEDirents if err := res.UnmarshalPayload(&out); err != nil { return err } for _, fuseDirent := range out.Dirents { nextOff := int64(fuseDirent.Meta.Off) dirent := vfs.Dirent{ Name: fuseDirent.Name, Type: uint8(fuseDirent.Meta.Type), Ino: fuseDirent.Meta.Ino, NextOff: nextOff, } if err := callback.Handle(dirent); err != nil { return err } dir.off.Store(nextOff) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/file.go000066400000000000000000000127571465435605700244660ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fuse import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) // fileDescription implements vfs.FileDescriptionImpl for fuse. // // +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.LockFD // the file handle used in userspace. Fh uint64 // Nonseekable indicates we cannot perform seek on a file. Nonseekable bool // DirectIO suggests that fuse use direct IO operations. DirectIO bool // OpenFlag is the flag returned by open. OpenFlag uint32 // off is the file offset. off atomicbitops.Int64 } func (fd *fileDescription) dentry() *kernfs.Dentry { return fd.vfsfd.Dentry().Impl().(*kernfs.Dentry) } func (fd *fileDescription) inode() *inode { return fd.dentry().Inode().(*inode) } func (fd *fileDescription) filesystem() *vfs.Filesystem { return fd.vfsfd.VirtualDentry().Mount().Filesystem() } func (fd *fileDescription) statusFlags() uint32 { return fd.vfsfd.StatusFlags() } // Release implements vfs.FileDescriptionImpl.Release. func (fd *fileDescription) Release(ctx context.Context) { // no need to release if FUSE server doesn't implement Open. conn := fd.inode().fs.conn if conn.noOpen { return } in := linux.FUSEReleaseIn{ Fh: fd.Fh, Flags: fd.statusFlags(), } // TODO(gvisor.dev/issue/3245): add logic when we support file lock owners. 
inode := fd.inode() inode.attrMu.Lock() defer inode.attrMu.Unlock() var opcode linux.FUSEOpcode if inode.filemode().IsDir() { opcode = linux.FUSE_RELEASEDIR } else { opcode = linux.FUSE_RELEASE } // Ignoring errors and FUSE server replies is analogous to Linux's behavior. req := conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), inode.nodeID, opcode, &in) // The reply will be ignored since no callback is defined in asyncCallBack(). conn.CallAsync(ctx, req) } // OnClose implements vfs.FileDescriptionImpl.OnClose. func (fd *fileDescription) OnClose(ctx context.Context) error { inode := fd.inode() conn := inode.fs.conn inode.attrMu.Lock() defer inode.attrMu.Unlock() in := linux.FUSEFlushIn{ Fh: fd.Fh, LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock } req := conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), inode.nodeID, linux.FUSE_FLUSH, &in) return conn.CallAsync(ctx, req) } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, nil } // Read implements vfs.FileDescriptionImpl.Read. func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { return 0, nil } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return 0, nil } // Write implements vfs.FileDescriptionImpl.Write. func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { return 0, nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return 0, nil } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.filesystem() inode := fd.inode() return inode.Stat(ctx, fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { fs := fd.filesystem() creds := auth.CredentialsFromContext(ctx) inode := fd.inode() inode.attrMu.Lock() defer inode.attrMu.Unlock() if err := vfs.CheckSetStat(ctx, creds, &opts, inode.filemode(), auth.KUID(inode.uid.Load()), auth.KGID(inode.gid.Load())); err != nil { return err } return inode.setAttr(ctx, fs, creds, opts, fhOptions{useFh: true, fh: fd.Fh}) } // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *fileDescription) Sync(ctx context.Context) error { inode := fd.inode() inode.attrMu.Lock() defer inode.attrMu.Unlock() conn := inode.fs.conn // no need to proceed if FUSE server doesn't implement Open. if conn.noOpen { return linuxerr.EINVAL } in := linux.FUSEFsyncIn{ Fh: fd.Fh, FsyncFlags: fd.statusFlags(), } // Ignoring errors and FUSE server replies is analogous to Linux's behavior. req := conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), inode.nodeID, linux.FUSE_FSYNC, &in) // The reply will be ignored since no callback is defined in asyncCallBack(). conn.CallAsync(ctx, req) return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/fuse_abi_autogen_unsafe.go000066400000000000000000000052351465435605700304000ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. 
package fuse import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*fuseInitRes)(nil) // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *fuseInitRes) Packed() bool { // Type fuseInitRes is dynamic so it might have slice/string headers. Hence, it is not packed. return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *fuseInitRes) MarshalUnsafe(dst []byte) []byte { // Type fuseInitRes doesn't have a packed layout in memory, fallback to MarshalBytes. return r.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (r *fuseInitRes) UnmarshalUnsafe(src []byte) []byte { // Type fuseInitRes doesn't have a packed layout in memory, fallback to UnmarshalBytes. return r.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. //go:nosplit func (r *fuseInitRes) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type fuseInitRes doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. r.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // CopyOut implements marshal.Marshallable.CopyOut. func (r *fuseInitRes) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. //go:nosplit func (r *fuseInitRes) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Type fuseInitRes doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(r.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. r.UnmarshalBytes(buf) // escapes: fallback. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *fuseInitRes) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *fuseInitRes) WriteTo(writer io.Writer) (int64, error) { // Type fuseInitRes doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, r.SizeBytes()) r.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go000066400000000000000000000462371465435605700274330ustar00rootroot00000000000000// automatically generated by stateify. 
package fuse import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (conn *connection) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.connection" } func (conn *connection) StateFields() []string { return []string{ "fd", "attributeVersion", "initialized", "initializedChan", "connected", "connInitError", "connInitSuccess", "aborted", "numWaiting", "asyncNum", "asyncCongestionThreshold", "asyncNumMax", "maxRead", "maxWrite", "maxPages", "maxActiveRequests", "minor", "atomicOTrunc", "asyncRead", "writebackCache", "bigWrites", "dontMask", "noOpen", } } func (conn *connection) beforeSave() {} // +checklocksignore func (conn *connection) StateSave(stateSinkObject state.Sink) { conn.beforeSave() var initializedChanValue bool initializedChanValue = conn.saveInitializedChan() stateSinkObject.SaveValue(3, initializedChanValue) stateSinkObject.Save(0, &conn.fd) stateSinkObject.Save(1, &conn.attributeVersion) stateSinkObject.Save(2, &conn.initialized) stateSinkObject.Save(4, &conn.connected) stateSinkObject.Save(5, &conn.connInitError) stateSinkObject.Save(6, &conn.connInitSuccess) stateSinkObject.Save(7, &conn.aborted) stateSinkObject.Save(8, &conn.numWaiting) stateSinkObject.Save(9, &conn.asyncNum) stateSinkObject.Save(10, &conn.asyncCongestionThreshold) stateSinkObject.Save(11, &conn.asyncNumMax) stateSinkObject.Save(12, &conn.maxRead) stateSinkObject.Save(13, &conn.maxWrite) stateSinkObject.Save(14, &conn.maxPages) stateSinkObject.Save(15, &conn.maxActiveRequests) stateSinkObject.Save(16, &conn.minor) stateSinkObject.Save(17, &conn.atomicOTrunc) stateSinkObject.Save(18, &conn.asyncRead) stateSinkObject.Save(19, &conn.writebackCache) stateSinkObject.Save(20, &conn.bigWrites) stateSinkObject.Save(21, &conn.dontMask) stateSinkObject.Save(22, &conn.noOpen) } func (conn *connection) afterLoad(context.Context) {} // +checklocksignore func (conn *connection) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &conn.fd) stateSourceObject.Load(1, &conn.attributeVersion) stateSourceObject.Load(2, &conn.initialized) stateSourceObject.Load(4, &conn.connected) stateSourceObject.Load(5, &conn.connInitError) stateSourceObject.Load(6, &conn.connInitSuccess) stateSourceObject.Load(7, &conn.aborted) stateSourceObject.Load(8, &conn.numWaiting) stateSourceObject.Load(9, &conn.asyncNum) stateSourceObject.Load(10, &conn.asyncCongestionThreshold) stateSourceObject.Load(11, &conn.asyncNumMax) stateSourceObject.Load(12, &conn.maxRead) stateSourceObject.Load(13, &conn.maxWrite) stateSourceObject.Load(14, &conn.maxPages) stateSourceObject.Load(15, &conn.maxActiveRequests) stateSourceObject.Load(16, &conn.minor) stateSourceObject.Load(17, &conn.atomicOTrunc) stateSourceObject.Load(18, &conn.asyncRead) stateSourceObject.Load(19, &conn.writebackCache) stateSourceObject.Load(20, &conn.bigWrites) stateSourceObject.Load(21, &conn.dontMask) stateSourceObject.Load(22, &conn.noOpen) stateSourceObject.LoadValue(3, new(bool), func(y any) { conn.loadInitializedChan(ctx, y.(bool)) }) } func (f *fuseDevice) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.fuseDevice" } func (f *fuseDevice) StateFields() []string { return []string{} } func (f *fuseDevice) beforeSave() {} // +checklocksignore func (f *fuseDevice) StateSave(stateSinkObject state.Sink) { f.beforeSave() } func (f *fuseDevice) afterLoad(context.Context) {} // +checklocksignore func (f *fuseDevice) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fd *DeviceFD) StateTypeName() string { return 
"pkg/sentry/fsimpl/fuse.DeviceFD" } func (fd *DeviceFD) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", "waitQueue", "fullQueueCh", "nextOpID", "queue", "numActiveRequests", "completions", "writeBuf", "conn", } } func (fd *DeviceFD) beforeSave() {} // +checklocksignore func (fd *DeviceFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() var fullQueueChValue int fullQueueChValue = fd.saveFullQueueCh() stateSinkObject.SaveValue(5, fullQueueChValue) stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &fd.NoLockFD) stateSinkObject.Save(4, &fd.waitQueue) stateSinkObject.Save(6, &fd.nextOpID) stateSinkObject.Save(7, &fd.queue) stateSinkObject.Save(8, &fd.numActiveRequests) stateSinkObject.Save(9, &fd.completions) stateSinkObject.Save(10, &fd.writeBuf) stateSinkObject.Save(11, &fd.conn) } func (fd *DeviceFD) afterLoad(context.Context) {} // +checklocksignore func (fd *DeviceFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &fd.NoLockFD) stateSourceObject.Load(4, &fd.waitQueue) stateSourceObject.Load(6, &fd.nextOpID) stateSourceObject.Load(7, &fd.queue) stateSourceObject.Load(8, &fd.numActiveRequests) stateSourceObject.Load(9, &fd.completions) stateSourceObject.Load(10, &fd.writeBuf) stateSourceObject.Load(11, &fd.conn) stateSourceObject.LoadValue(5, new(int), func(y any) { fd.loadFullQueueCh(ctx, y.(int)) }) } func (dir *directoryFD) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.directoryFD" } func (dir *directoryFD) StateFields() []string { return []string{ "fileDescription", } } func (dir *directoryFD) beforeSave() {} // +checklocksignore func (dir *directoryFD) StateSave(stateSinkObject state.Sink) { dir.beforeSave() stateSinkObject.Save(0, &dir.fileDescription) } func (dir *directoryFD) afterLoad(context.Context) {} // +checklocksignore func (dir *directoryFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &dir.fileDescription) } func (fd *fileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.fileDescription" } func (fd *fileDescription) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "LockFD", "Fh", "Nonseekable", "DirectIO", "OpenFlag", "off", } } func (fd *fileDescription) beforeSave() {} // +checklocksignore func (fd *fileDescription) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &fd.LockFD) stateSinkObject.Save(4, &fd.Fh) stateSinkObject.Save(5, &fd.Nonseekable) stateSinkObject.Save(6, &fd.DirectIO) stateSinkObject.Save(7, &fd.OpenFlag) stateSinkObject.Save(8, &fd.off) } func (fd *fileDescription) afterLoad(context.Context) {} // +checklocksignore func (fd *fileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &fd.LockFD) 
stateSourceObject.Load(4, &fd.Fh) stateSourceObject.Load(5, &fd.Nonseekable) stateSourceObject.Load(6, &fd.DirectIO) stateSourceObject.Load(7, &fd.OpenFlag) stateSourceObject.Load(8, &fd.off) } func (fsType *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.FilesystemType" } func (fsType *FilesystemType) StateFields() []string { return []string{} } func (fsType *FilesystemType) beforeSave() {} // +checklocksignore func (fsType *FilesystemType) StateSave(stateSinkObject state.Sink) { fsType.beforeSave() } func (fsType *FilesystemType) afterLoad(context.Context) {} // +checklocksignore func (fsType *FilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (f *filesystemOptions) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.filesystemOptions" } func (f *filesystemOptions) StateFields() []string { return []string{ "mopts", "uid", "gid", "rootMode", "maxActiveRequests", "maxRead", "defaultPermissions", "allowOther", } } func (f *filesystemOptions) beforeSave() {} // +checklocksignore func (f *filesystemOptions) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.mopts) stateSinkObject.Save(1, &f.uid) stateSinkObject.Save(2, &f.gid) stateSinkObject.Save(3, &f.rootMode) stateSinkObject.Save(4, &f.maxActiveRequests) stateSinkObject.Save(5, &f.maxRead) stateSinkObject.Save(6, &f.defaultPermissions) stateSinkObject.Save(7, &f.allowOther) } func (f *filesystemOptions) afterLoad(context.Context) {} // +checklocksignore func (f *filesystemOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.mopts) stateSourceObject.Load(1, &f.uid) stateSourceObject.Load(2, &f.gid) stateSourceObject.Load(3, &f.rootMode) stateSourceObject.Load(4, &f.maxActiveRequests) stateSourceObject.Load(5, &f.maxRead) stateSourceObject.Load(6, &f.defaultPermissions) stateSourceObject.Load(7, &f.allowOther) } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "Filesystem", "devMinor", "conn", "opts", "clock", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.Filesystem) stateSinkObject.Save(1, &fs.devMinor) stateSinkObject.Save(2, &fs.conn) stateSinkObject.Save(3, &fs.opts) stateSinkObject.Save(4, &fs.clock) } func (fs *filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.Filesystem) stateSourceObject.Load(1, &fs.devMinor) stateSourceObject.Load(2, &fs.conn) stateSourceObject.Load(3, &fs.opts) stateSourceObject.Load(4, &fs.clock) } func (f *fileHandle) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.fileHandle" } func (f *fileHandle) StateFields() []string { return []string{ "new", "handle", "flags", } } func (f *fileHandle) beforeSave() {} // +checklocksignore func (f *fileHandle) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.new) stateSinkObject.Save(1, &f.handle) stateSinkObject.Save(2, &f.flags) } func (f *fileHandle) afterLoad(context.Context) {} // +checklocksignore func (f *fileHandle) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.new) stateSourceObject.Load(1, &f.handle) stateSourceObject.Load(2, &f.flags) } func (i *inode) StateTypeName() string { 
return "pkg/sentry/fsimpl/fuse.inode" } func (i *inode) StateFields() []string { return []string{ "inodeRefs", "InodeNotAnonymous", "InodeNotSymlink", "InodeWatches", "OrderedChildren", "CachedMappable", "fs", "nodeID", "generation", "entryTime", "attrVersion", "attrTime", "link", "fh", "locks", "watches", "ino", "uid", "gid", "mode", "atime", "mtime", "ctime", "size", "nlink", "blockSize", } } func (i *inode) beforeSave() {} // +checklocksignore func (i *inode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.inodeRefs) stateSinkObject.Save(1, &i.InodeNotAnonymous) stateSinkObject.Save(2, &i.InodeNotSymlink) stateSinkObject.Save(3, &i.InodeWatches) stateSinkObject.Save(4, &i.OrderedChildren) stateSinkObject.Save(5, &i.CachedMappable) stateSinkObject.Save(6, &i.fs) stateSinkObject.Save(7, &i.nodeID) stateSinkObject.Save(8, &i.generation) stateSinkObject.Save(9, &i.entryTime) stateSinkObject.Save(10, &i.attrVersion) stateSinkObject.Save(11, &i.attrTime) stateSinkObject.Save(12, &i.link) stateSinkObject.Save(13, &i.fh) stateSinkObject.Save(14, &i.locks) stateSinkObject.Save(15, &i.watches) stateSinkObject.Save(16, &i.ino) stateSinkObject.Save(17, &i.uid) stateSinkObject.Save(18, &i.gid) stateSinkObject.Save(19, &i.mode) stateSinkObject.Save(20, &i.atime) stateSinkObject.Save(21, &i.mtime) stateSinkObject.Save(22, &i.ctime) stateSinkObject.Save(23, &i.size) stateSinkObject.Save(24, &i.nlink) stateSinkObject.Save(25, &i.blockSize) } func (i *inode) afterLoad(context.Context) {} // +checklocksignore func (i *inode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.inodeRefs) stateSourceObject.Load(1, &i.InodeNotAnonymous) stateSourceObject.Load(2, &i.InodeNotSymlink) stateSourceObject.Load(3, &i.InodeWatches) stateSourceObject.Load(4, &i.OrderedChildren) stateSourceObject.Load(5, &i.CachedMappable) stateSourceObject.Load(6, &i.fs) stateSourceObject.Load(7, &i.nodeID) stateSourceObject.Load(8, &i.generation) stateSourceObject.Load(9, &i.entryTime) stateSourceObject.Load(10, &i.attrVersion) stateSourceObject.Load(11, &i.attrTime) stateSourceObject.Load(12, &i.link) stateSourceObject.Load(13, &i.fh) stateSourceObject.Load(14, &i.locks) stateSourceObject.Load(15, &i.watches) stateSourceObject.Load(16, &i.ino) stateSourceObject.Load(17, &i.uid) stateSourceObject.Load(18, &i.gid) stateSourceObject.Load(19, &i.mode) stateSourceObject.Load(20, &i.atime) stateSourceObject.Load(21, &i.mtime) stateSourceObject.Load(22, &i.ctime) stateSourceObject.Load(23, &i.size) stateSourceObject.Load(24, &i.nlink) stateSourceObject.Load(25, &i.blockSize) } func (r *inodeRefs) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.inodeRefs" } func (r *inodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *inodeRefs) beforeSave() {} // +checklocksignore func (r *inodeRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *inodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (fd *regularFileFD) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.regularFileFD" } func (fd *regularFileFD) StateFields() []string { return []string{ "fileDescription", "off", "mappings", "data", } } func (fd *regularFileFD) beforeSave() {} // +checklocksignore func (fd *regularFileFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() 
stateSinkObject.Save(0, &fd.fileDescription) stateSinkObject.Save(1, &fd.off) stateSinkObject.Save(2, &fd.mappings) stateSinkObject.Save(3, &fd.data) } func (fd *regularFileFD) afterLoad(context.Context) {} // +checklocksignore func (fd *regularFileFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.fileDescription) stateSourceObject.Load(1, &fd.off) stateSourceObject.Load(2, &fd.mappings) stateSourceObject.Load(3, &fd.data) } func (l *requestList) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.requestList" } func (l *requestList) StateFields() []string { return []string{ "head", "tail", } } func (l *requestList) beforeSave() {} // +checklocksignore func (l *requestList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *requestList) afterLoad(context.Context) {} // +checklocksignore func (l *requestList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *requestEntry) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.requestEntry" } func (e *requestEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *requestEntry) beforeSave() {} // +checklocksignore func (e *requestEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *requestEntry) afterLoad(context.Context) {} // +checklocksignore func (e *requestEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (r *Request) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.Request" } func (r *Request) StateFields() []string { return []string{ "requestEntry", "id", "hdr", "data", "async", "noReply", } } func (r *Request) beforeSave() {} // +checklocksignore func (r *Request) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.requestEntry) stateSinkObject.Save(1, &r.id) stateSinkObject.Save(2, &r.hdr) stateSinkObject.Save(3, &r.data) stateSinkObject.Save(4, &r.async) stateSinkObject.Save(5, &r.noReply) } func (r *Request) afterLoad(context.Context) {} // +checklocksignore func (r *Request) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.requestEntry) stateSourceObject.Load(1, &r.id) stateSourceObject.Load(2, &r.hdr) stateSourceObject.Load(3, &r.data) stateSourceObject.Load(4, &r.async) stateSourceObject.Load(5, &r.noReply) } func (fRes *futureResponse) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.futureResponse" } func (fRes *futureResponse) StateFields() []string { return []string{ "opcode", "hdr", "data", "async", } } func (fRes *futureResponse) beforeSave() {} // +checklocksignore func (fRes *futureResponse) StateSave(stateSinkObject state.Sink) { fRes.beforeSave() stateSinkObject.Save(0, &fRes.opcode) stateSinkObject.Save(1, &fRes.hdr) stateSinkObject.Save(2, &fRes.data) stateSinkObject.Save(3, &fRes.async) } // +checklocksignore func (fRes *futureResponse) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fRes.opcode) stateSourceObject.Load(1, &fRes.hdr) stateSourceObject.Load(2, &fRes.data) stateSourceObject.Load(3, &fRes.async) stateSourceObject.AfterLoad(func() { fRes.afterLoad(ctx) }) } func (r *Response) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.Response" } 
func (r *Response) StateFields() []string { return []string{ "opcode", "hdr", "data", } } func (r *Response) beforeSave() {} // +checklocksignore func (r *Response) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.opcode) stateSinkObject.Save(1, &r.hdr) stateSinkObject.Save(2, &r.data) } func (r *Response) afterLoad(context.Context) {} // +checklocksignore func (r *Response) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.opcode) stateSourceObject.Load(1, &r.hdr) stateSourceObject.Load(2, &r.data) } func init() { state.Register((*connection)(nil)) state.Register((*fuseDevice)(nil)) state.Register((*DeviceFD)(nil)) state.Register((*directoryFD)(nil)) state.Register((*fileDescription)(nil)) state.Register((*FilesystemType)(nil)) state.Register((*filesystemOptions)(nil)) state.Register((*filesystem)(nil)) state.Register((*fileHandle)(nil)) state.Register((*inode)(nil)) state.Register((*inodeRefs)(nil)) state.Register((*regularFileFD)(nil)) state.Register((*requestList)(nil)) state.Register((*requestEntry)(nil)) state.Register((*Request)(nil)) state.Register((*futureResponse)(nil)) state.Register((*Response)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/fuse_unsafe_abi_autogen_unsafe.go000066400000000000000000000001441465435605700317330ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package fuse import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/fuse_unsafe_state_autogen.go000066400000000000000000000000661465435605700307620ustar00rootroot00000000000000// automatically generated by stateify. package fuse golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/fusefs.go000066400000000000000000000237461465435605700250420ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package fuse implements fusefs. package fuse import ( "math" "strconv" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Name is the default filesystem name. const Name = "fuse" // maxActiveRequestsDefault is the default setting controlling the upper bound // on the number of active requests at any given time. const maxActiveRequestsDefault = 10000 // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // +stateify savable type filesystemOptions struct { // mopts contains the raw, unparsed mount options passed to this filesystem. mopts string // uid of the mount owner. uid auth.KUID // gid of the mount owner. gid auth.KGID // rootMode specifies the file mode of the filesystem's root. 
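// rootmode is passed by the mounting process in octal; for example, a
// typical mount of a directory supplies rootmode=40000, i.e. 0o40000 ==
// S_IFDIR (see the base-8 strconv.ParseUint in GetFilesystem below). The
// example value here is illustrative, not taken from the source.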
rootMode linux.FileMode // maxActiveRequests specifies the maximum number of active requests that can // exist at any time. Any further requests will block when trying to // Call the server. maxActiveRequests uint64 // maxRead is the max number of bytes to read, // specified as "max_read" in fs parameters. // If not specified by user, use math.MaxUint32 as default value. maxRead uint32 // defaultPermissions is the default_permissions mount option. It instructs // the kernel to perform a standard unix permission checks based on // ownership and mode bits, instead of deferring the check to the server. // // Immutable after mount. defaultPermissions bool // allowOther is the allow_other mount option. It allows processes that // don't own the FUSE mount to call into it. // // Immutable after mount. allowOther bool } // filesystem implements vfs.FilesystemImpl. // // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 // conn is used for communication between the FUSE server // daemon and the sentry fusefs. conn *connection // opts is the options the fusefs is initialized with. opts *filesystemOptions // clock is a real-time clock used to set timestamps in file operations. clock time.Clock } // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // Release implements vfs.FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } fsopts := filesystemOptions{mopts: opts.Data} mopts := vfs.GenericParseMountOptions(opts.Data) deviceDescriptorStr, ok := mopts["fd"] if !ok { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option fd missing") return nil, nil, linuxerr.EINVAL } delete(mopts, "fd") deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */) if err != nil { ctx.Debugf("fusefs.FilesystemType.GetFilesystem: invalid fd: %q (%v)", deviceDescriptorStr, err) return nil, nil, linuxerr.EINVAL } kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name()) return nil, nil, linuxerr.EINVAL } fuseFDGeneric := kernelTask.GetFile(int32(deviceDescriptor)) if fuseFDGeneric == nil { return nil, nil, linuxerr.EINVAL } defer fuseFDGeneric.DecRef(ctx) fuseFD, ok := fuseFDGeneric.Impl().(*DeviceFD) if !ok { log.Warningf("%s.GetFilesystem: device FD is %T, not a FUSE device", fsType.Name, fuseFDGeneric) return nil, nil, linuxerr.EINVAL } // Parse and set all the other supported FUSE mount options. // TODO(gVisor.dev/issue/3229): Expand the supported mount options. 
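// For illustration (the values are hypothetical, not from the source): a
// typical mount request forwarded by a libfuse-based daemon carries opts.Data
// such as
//
//	"fd=7,rootmode=40000,user_id=1000,group_id=1000,default_permissions,max_read=131072"
//
// GenericParseMountOptions above splits this string into the mopts map, and
// each recognized key is consumed (and deleted from the map) by the checks
// that follow; anything left over causes the mount to fail with EINVAL.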
if uidStr, ok := mopts["user_id"]; ok { delete(mopts, "user_id") uid, err := strconv.ParseUint(uidStr, 10, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid user_id: user_id=%s", fsType.Name(), uidStr) return nil, nil, linuxerr.EINVAL } kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) if !kuid.Ok() { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) return nil, nil, linuxerr.EINVAL } fsopts.uid = kuid } else { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option user_id missing") return nil, nil, linuxerr.EINVAL } if gidStr, ok := mopts["group_id"]; ok { delete(mopts, "group_id") gid, err := strconv.ParseUint(gidStr, 10, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid group_id: group_id=%s", fsType.Name(), gidStr) return nil, nil, linuxerr.EINVAL } kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) if !kgid.Ok() { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) return nil, nil, linuxerr.EINVAL } fsopts.gid = kgid } else { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option group_id missing") return nil, nil, linuxerr.EINVAL } if modeStr, ok := mopts["rootmode"]; ok { delete(mopts, "rootmode") mode, err := strconv.ParseUint(modeStr, 8, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr) return nil, nil, linuxerr.EINVAL } fsopts.rootMode = linux.FileMode(mode) } else { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option rootmode missing") return nil, nil, linuxerr.EINVAL } // Set the maxInFlightRequests option. fsopts.maxActiveRequests = maxActiveRequestsDefault if maxReadStr, ok := mopts["max_read"]; ok { delete(mopts, "max_read") maxRead, err := strconv.ParseUint(maxReadStr, 10, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr) return nil, nil, linuxerr.EINVAL } if maxRead < fuseMinMaxRead { maxRead = fuseMinMaxRead } fsopts.maxRead = uint32(maxRead) } else { fsopts.maxRead = math.MaxUint32 } if _, ok := mopts["default_permissions"]; ok { delete(mopts, "default_permissions") fsopts.defaultPermissions = true } if _, ok := mopts["allow_other"]; ok { delete(mopts, "allow_other") fsopts.allowOther = true } // Check for unparsed options. if len(mopts) != 0 { log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts) return nil, nil, linuxerr.EINVAL } fuseFD.mu.Lock() connected := fuseFD.connected() // Create a new FUSE filesystem. fs, err := newFUSEFilesystem(ctx, vfsObj, &fsType, fuseFD, devMinor, &fsopts) if err != nil { log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err) fuseFD.mu.Unlock() return nil, nil, err } fuseFD.mu.Unlock() // Send a FUSE_INIT request to the FUSE daemon server before returning. // This call is not blocking. if !connected { if err := fs.conn.InitSend(creds, uint32(kernelTask.ThreadID())); err != nil { log.Warningf("%s.InitSend: failed with error: %v", fsType.Name(), err) return nil, nil, err } } // root is the fusefs root directory. root := fs.newRoot(ctx, creds, fsopts.rootMode) return fs.VFSFilesystem(), root.VFSDentry(), nil } // newFUSEFilesystem creates a new FUSE filesystem. 
// +checklocks:fuseFD.mu func newFUSEFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, fsType *FilesystemType, fuseFD *DeviceFD, devMinor uint32, opts *filesystemOptions) (*filesystem, error) { if !fuseFD.connected() { conn, err := newFUSEConnection(ctx, fuseFD, opts) if err != nil { log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) return nil, linuxerr.EINVAL } fuseFD.conn = conn } fs := &filesystem{ devMinor: devMinor, opts: opts, conn: fuseFD.conn, clock: time.RealtimeClockFromContext(ctx), } fs.VFSFilesystem().Init(vfsObj, fsType, fs) return fs, nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return fs.opts.mopts } func (fs *filesystem) newRoot(ctx context.Context, creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { i := &inode{fs: fs, nodeID: 1} i.attrMu.Lock() i.init(creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755, 2) i.attrMu.Unlock() i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) i.InitRefs() var d kernfs.Dentry d.InitRoot(&fs.Filesystem, i) return &d } func (fs *filesystem) newInode(ctx context.Context, out linux.FUSEEntryOut) kernfs.Inode { attr := out.Attr i := &inode{fs: fs, nodeID: out.NodeID, generation: out.Generation} i.attrMu.Lock() defer i.attrMu.Unlock() creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.UID), EffectiveKUID: auth.KUID(attr.UID)} i.init(&creds, linux.UNNAMED_MAJOR, fs.devMinor, out.NodeID, linux.FileMode(attr.Mode), attr.Nlink) i.updateAttrs(attr, int64(out.AttrValid), int64(out.AttrValidNSec)) i.updateEntryTime(int64(out.EntryValid), int64(out.EntryValidNSec)) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) i.InitRefs() return i } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/inode.go000066400000000000000000000674531465435605700246500ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fuse import ( "fmt" gotime "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // +stateify savable type fileHandle struct { new bool handle uint64 flags uint32 } // inode implements kernfs.Inode. // // +stateify savable type inode struct { inodeRefs kernfs.InodeNotAnonymous kernfs.InodeNotSymlink kernfs.InodeWatches kernfs.OrderedChildren kernfs.CachedMappable // the owning filesystem. 
fs is immutable. fs *filesystem // nodeID is a unique id which identifies the inode between userspace and // the sentry. generation is used to distinguish inodes in case of nodeID // reuse. Both are immutable. nodeID uint64 generation uint64 // entryTime is the time at which the entry must be revalidated. Reading // entryTime requires either using entryTimeSeq and SeqAtomicLoadTime, or // that attrMu is locked. Writing entryTime requires that attrMu is locked // and that entryTimeSeq is in a writer critical section. entryTimeSeq sync.SeqCount `state:"nosave"` entryTime time.Time // attrVersion is the version of the last attribute change. attrVersion atomicbitops.Uint64 // attrTime is the time at which the attributes become invalid. attrTime time.Time // link is result of following a symbolic link. link string // fh caches the file handle returned by the server from a FUSE_CREATE request // so we don't have to send a separate FUSE_OPEN request. fh fileHandle locks vfs.FileLocks watches vfs.Watches // attrMu protects the attributes of this inode. attrMu sync.Mutex `state:"nosave"` // +checklocks:attrMu ino atomicbitops.Uint64 // Stat data, not accessed for path walking. // +checklocks:attrMu uid atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic. // +checklocks:attrMu gid atomicbitops.Uint32 // auth.KGID, but... // +checklocks:attrMu mode atomicbitops.Uint32 // File type and mode. // Timestamps in nanoseconds from the unix epoch. // +checklocks:attrMu atime atomicbitops.Int64 // +checklocks:attrMu mtime atomicbitops.Int64 // +checklocks:attrMu ctime atomicbitops.Int64 // +checklocks:attrMu size atomicbitops.Uint64 // nlink counts the number of hard links to this inode. It's updated and // accessed used atomic operations but not protected by attrMu. nlink atomicbitops.Uint32 // +checklocks:attrMu blockSize atomicbitops.Uint32 // 0 if unknown. } func blockerFromContext(ctx context.Context) context.Blocker { kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { return ctx } return kernelTask } func pidFromContext(ctx context.Context) uint32 { kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { return 0 } return uint32(kernelTask.ThreadID()) } func umaskFromContext(ctx context.Context) uint32 { kernelTask := kernel.TaskFromContext(ctx) umask := uint32(0) if kernelTask != nil { umask = uint32(kernelTask.FSContext().Umask()) } return umask } func (i *inode) Mode() linux.FileMode { i.attrMu.Lock() defer i.attrMu.Unlock() return i.filemode() } func (i *inode) UID() auth.KUID { i.attrMu.Lock() defer i.attrMu.Unlock() return auth.KUID(i.uid.Load()) } func (i *inode) GID() auth.KGID { i.attrMu.Lock() defer i.attrMu.Unlock() return auth.KGID(i.gid.Load()) } // +checklocks:i.attrMu func (i *inode) filemode() linux.FileMode { return linux.FileMode(i.mode.Load()) } // touchCMTime updates the ctime and mtime attributes to be the current time. // // +checklocks:i.attrMu func (i *inode) touchCMtime() { now := i.fs.clock.Now().Nanoseconds() i.mtime.Store(now) i.ctime.Store(now) } // touchAtime updates the atime attribute to be the current time. 
// // +checklocks:i.attrMu func (i *inode) touchAtime() { i.atime.Store(i.fs.clock.Now().Nanoseconds()) } // +checklocks:i.attrMu func (i *inode) init(creds *auth.Credentials, devMajor, devMinor uint32, nodeid uint64, mode linux.FileMode, nlink uint32) { if mode.FileType() == 0 { panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode)) } i.nodeID = nodeid i.ino.Store(nodeid) i.mode.Store(uint32(mode)) i.uid.Store(uint32(creds.EffectiveKUID)) i.gid.Store(uint32(creds.EffectiveKGID)) i.nlink.Store(nlink) i.blockSize.Store(hostarch.PageSize) now := i.fs.clock.Now().Nanoseconds() i.atime.Store(now) i.mtime.Store(now) i.ctime.Store(now) } // +checklocks:i.attrMu func (i *inode) updateEntryTime(entrySec, entryNSec int64) { entryTime := time.FromTimespec(linux.Timespec{Sec: entrySec, Nsec: entryNSec}) SeqAtomicStoreTime(&i.entryTimeSeq, &i.entryTime, i.fs.clock.Now().AddTime(entryTime)) } // CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { // Since FUSE operations are ultimately backed by a userspace process (the // fuse daemon), allowing a process to call into fusefs grants the daemon // ptrace-like capabilities over the calling process. Because of this, by // default FUSE only allows the mount owner to interact with the // filesystem. This explicitly excludes setuid/setgid processes. // // This behaviour can be overridden with the 'allow_other' mount option. // // See fs/fuse/dir.c:fuse_allow_current_process() in Linux. if !i.fs.opts.allowOther { if creds.RealKUID != i.fs.opts.uid || creds.EffectiveKUID != i.fs.opts.uid || creds.SavedKUID != i.fs.opts.uid || creds.RealKGID != i.fs.opts.gid || creds.EffectiveKGID != i.fs.opts.gid || creds.SavedKGID != i.fs.opts.gid { return linuxerr.EACCES } } // By default, fusefs delegates all permission checks to the server. // However, standard unix permission checks can be enabled with the // default_permissions mount option. i.attrMu.Lock() defer i.attrMu.Unlock() refreshed := false opts := vfs.StatOptions{Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID} if i.fs.opts.defaultPermissions || (ats.MayExec() && i.filemode().FileType() == linux.S_IFREG) { if i.fs.clock.Now().After(i.attrTime) { refreshed = true if _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), opts, 0, 0); err != nil { return err } } } if i.fs.opts.defaultPermissions || (ats.MayExec() && i.filemode().FileType() == linux.S_IFREG) { err := vfs.GenericCheckPermissions(creds, ats, linux.FileMode(i.mode.Load()), auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())) if linuxerr.Equals(linuxerr.EACCES, err) && !refreshed { if _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), opts, 0, 0); err != nil { return err } return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(i.mode.Load()), auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())) } return err } else if ats.MayRead() || ats.MayWrite() || ats.MayExec() { in := linux.FUSEAccessIn{Mask: uint32(ats)} req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), i.nodeID, linux.FUSE_ACCESS, &in) res, err := i.fs.conn.Call(ctx, req) if err != nil { return err } return res.Error() } return nil } // Open implements kernfs.Inode.Open. 
func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY | linux.O_APPEND | linux.O_DIRECT i.attrMu.Lock() defer i.attrMu.Unlock() if opts.Flags&linux.O_LARGEFILE == 0 && i.size.Load() > linux.MAX_NON_LFS { return nil, linuxerr.EOVERFLOW } var ( fd *fileDescription fdImpl vfs.FileDescriptionImpl opcode linux.FUSEOpcode ) switch i.filemode().FileType() { case linux.S_IFREG: regularFD := ®ularFileFD{} fd = &(regularFD.fileDescription) fdImpl = regularFD opcode = linux.FUSE_OPEN case linux.S_IFDIR: if opts.Flags&linux.O_CREAT != 0 { return nil, linuxerr.EISDIR } if ats := vfs.AccessTypesForOpenFlags(&opts); ats.MayWrite() { return nil, linuxerr.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL } directoryFD := &directoryFD{} fd = &(directoryFD.fileDescription) fdImpl = directoryFD opcode = linux.FUSE_OPENDIR case linux.S_IFLNK: return nil, linuxerr.ELOOP } fd.LockFD.Init(&i.locks) // FOPEN_KEEP_CACHE is the default flag for noOpen. fd.OpenFlag = linux.FOPEN_KEEP_CACHE truncateRegFile := opts.Flags&linux.O_TRUNC != 0 && i.filemode().FileType() == linux.S_IFREG if truncateRegFile && (i.fh.new || !i.fs.conn.atomicOTrunc) { // If the regular file needs to be truncated, but the connection doesn't // support O_TRUNC or if we are optimizing away the Open RPC, then manually // truncate the file *before* Open. As per libfuse, "If [atomic O_TRUNC is] // disabled, and an application specifies O_TRUNC, fuse first calls // truncate() and then open() with O_TRUNC filtered out.". opts := vfs.SetStatOptions{Stat: linux.Statx{Size: 0, Mask: linux.STATX_SIZE}} if err := i.setAttr(ctx, i.fs.VFSFilesystem(), auth.CredentialsFromContext(ctx), opts, fhOptions{useFh: false}); err != nil { return nil, err } } if i.fh.new { fd.OpenFlag = i.fh.flags fd.Fh = i.fh.handle i.fh.new = false // Only send an open request when the FUSE server supports open or is // opening a directory. } else if !i.fs.conn.noOpen || i.filemode().IsDir() { in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)} // Clear O_TRUNC if the server doesn't support it. if !i.fs.conn.atomicOTrunc { in.Flags &= ^uint32(linux.O_TRUNC) } req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), i.nodeID, opcode, &in) res, err := i.fs.conn.Call(ctx, req) if err != nil { return nil, err } if err := res.Error(); err != nil { if linuxerr.Equals(linuxerr.ENOSYS, err) && !i.filemode().IsDir() { i.fs.conn.noOpen = true } else { return nil, err } } else { out := linux.FUSEOpenOut{} if err := res.UnmarshalPayload(&out); err != nil { return nil, err } fd.OpenFlag = out.OpenFlag fd.Fh = out.Fh // Open was successful. Update inode's size if atomicOTrunc && O_TRUNC. 
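// (When the connection supports atomic O_TRUNC, the O_TRUNC flag was passed
// through in FUSEOpenIn and the server truncated the file while handling the
// open itself; the block below only brings the sentry's cached size and
// ctime/mtime in line with that server-side truncation.)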
if truncateRegFile && i.fs.conn.atomicOTrunc { i.fs.conn.mu.Lock() i.attrVersion.Store(i.fs.conn.attributeVersion.Add(1)) i.fs.conn.mu.Unlock() i.size.Store(0) i.touchCMtime() } } } if i.filemode().IsDir() { fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO) } // TODO(gvisor.dev/issue/3234): invalidate mmap after implemented it for FUSE Inode fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0 fdOptions := &vfs.FileDescriptionOptions{} if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 { fdOptions.DenyPRead = true fdOptions.DenyPWrite = true fd.Nonseekable = true } if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), d.VFSDentry(), fdOptions); err != nil { return nil, err } return &fd.vfsfd, nil } func (i *inode) Valid(ctx context.Context, parent *kernfs.Dentry, name string) bool { now := i.fs.clock.Now() if entryTime := SeqAtomicLoadTime(&i.entryTimeSeq, &i.entryTime); entryTime.After(now) { return true } i.attrMu.Lock() defer i.attrMu.Unlock() if i.entryTime.After(now) { return true } in := linux.FUSELookupIn{Name: linux.CString(name)} req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), parent.Inode().(*inode).nodeID, linux.FUSE_LOOKUP, &in) res, err := i.fs.conn.Call(ctx, req) if err != nil { return false } if res.Error() != nil { return false } var out linux.FUSEEntryOut if res.UnmarshalPayload(&out) != nil { return false } if i.nodeID != out.NodeID { return false } // Don't enforce fuse_invalid_attr() => fuse_valid_type(), // fuse_valid_size() since inode.updateAttrs() and its callers // don't. But do enforce fuse_stale_inode(): if i.generation != out.Generation { return false } if (i.mode.RacyLoad()^out.Attr.Mode)&linux.S_IFMT != 0 { return false } i.updateEntryTime(int64(out.EntryValid), int64(out.EntryValidNSec)) return true } // Lookup implements kernfs.Inode.Lookup. func (i *inode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { in := linux.FUSELookupIn{Name: linux.CString(name)} return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in) } // Keep implements kernfs.Inode.Keep. func (i *inode) Keep() bool { // Return true so that kernfs keeps the new dentry pointing to this // inode in the dentry tree. This is needed because inodes created via // Lookup are not temporary. They might refer to existing files on server // that can be Unlink'd/Rmdir'd. return true } // IterDirents implements kernfs.Inode.IterDirents. func (*inode) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { return offset, nil } // NewFile implements kernfs.Inode.NewFile. func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) { opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY in := linux.FUSECreateIn{ CreateMeta: linux.FUSECreateMeta{ Flags: opts.Flags, Mode: uint32(opts.Mode) | linux.S_IFREG, Umask: umaskFromContext(ctx), }, Name: linux.CString(name), } return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in) } // NewNode implements kernfs.Inode.NewNode. 
func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (kernfs.Inode, error) { in := linux.FUSEMknodIn{ MknodMeta: linux.FUSEMknodMeta{ Mode: uint32(opts.Mode), Rdev: linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor), Umask: umaskFromContext(ctx), }, Name: linux.CString(name), } return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in) } // NewSymlink implements kernfs.Inode.NewSymlink. func (i *inode) NewSymlink(ctx context.Context, name, target string) (kernfs.Inode, error) { in := linux.FUSESymlinkIn{ Name: linux.CString(name), Target: linux.CString(target), } return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in) } // NewLink implements kernfs.Inode.NewLink. func (i *inode) NewLink(ctx context.Context, name string, target kernfs.Inode) (kernfs.Inode, error) { targetInode := target.(*inode) in := linux.FUSELinkIn{ OldNodeID: primitive.Uint64(targetInode.nodeID), Name: linux.CString(name), } return i.newEntry(ctx, name, targetInode.Mode().FileType(), linux.FUSE_LINK, &in) } // Unlink implements kernfs.Inode.Unlink. func (i *inode) Unlink(ctx context.Context, name string, child kernfs.Inode) error { in := linux.FUSEUnlinkIn{Name: linux.CString(name)} req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), i.nodeID, linux.FUSE_UNLINK, &in) res, err := i.fs.conn.Call(ctx, req) if err != nil { return err } // only return error, discard res. return res.Error() } // NewDir implements kernfs.Inode.NewDir. func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { in := linux.FUSEMkdirIn{ MkdirMeta: linux.FUSEMkdirMeta{ Mode: uint32(opts.Mode), Umask: umaskFromContext(ctx), }, Name: linux.CString(name), } return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in) } // RmDir implements kernfs.Inode.RmDir. func (i *inode) RmDir(ctx context.Context, name string, child kernfs.Inode) error { in := linux.FUSERmDirIn{Name: linux.CString(name)} req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), i.nodeID, linux.FUSE_RMDIR, &in) res, err := i.fs.conn.Call(ctx, req) if err != nil { return err } return res.Error() } // Rename implements kernfs.Inode.Rename. func (i *inode) Rename(ctx context.Context, oldname, newname string, child, dstDir kernfs.Inode) error { dstDirInode := dstDir.(*inode) in := linux.FUSERenameIn{ Newdir: primitive.Uint64(dstDirInode.nodeID), Oldname: linux.CString(oldname), Newname: linux.CString(newname), } req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), i.nodeID, linux.FUSE_RENAME, &in) res, err := i.fs.conn.Call(ctx, req) if err != nil { return err } return res.Error() } // newEntry calls FUSE server for entry creation and allocates corresponding // entry according to response. Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, // FUSE_LINK and FUSE_LOOKUP. 
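// For illustration (field values are hypothetical, not from the source), a
// FUSE_MKDIR exchange handled here looks roughly like:
//
//	sentry -> server: FUSEMkdirIn{MkdirMeta: {Mode: 0o755, Umask: 0o022}, Name: "subdir"}
//	server -> sentry: FUSEEntryOut{NodeID: 42, Generation: 1, Attr: {Mode: S_IFDIR | 0o755, ...}}
//
// newEntry unmarshals the FUSEEntryOut and hands it to fs.newInode to build
// the child inode, after checking that the returned file type matches the
// requested one and that the node ID is neither 0 nor FUSE_ROOT_ID. For
// FUSE_CREATE the response additionally carries a FUSEOpenOut whose file
// handle is cached on the child so a later Open can skip the FUSE_OPEN RPC.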
func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (kernfs.Inode, error) { req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), i.nodeID, opcode, payload) res, err := i.fs.conn.Call(ctx, req) if err != nil { return nil, err } if err := res.Error(); err != nil { return nil, err } out := linux.FUSECreateOut{} if opcode == linux.FUSE_CREATE { if err := res.UnmarshalPayload(&out); err != nil { return nil, err } } else { if err := res.UnmarshalPayload(&out.FUSEEntryOut); err != nil { return nil, err } } if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) { return nil, linuxerr.EIO } child := i.fs.newInode(ctx, out.FUSEEntryOut) if opcode == linux.FUSE_CREATE { // File handler is returned by fuse server at a time of file create. // Save it temporary in a created child, so Open could return it when invoked // to be sure after fh is consumed reset 'isNewFh' flag of inode childI, ok := child.(*inode) if ok { childI.fh.new = true childI.fh.handle = out.FUSEOpenOut.Fh childI.fh.flags = out.FUSEOpenOut.OpenFlag } } return child, nil } // Getlink implements kernfs.Inode.Getlink. func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { path, err := i.Readlink(ctx, mnt) return vfs.VirtualDentry{}, path, err } // Readlink implements kernfs.Inode.Readlink. func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { i.attrMu.Lock() defer i.attrMu.Unlock() if i.filemode().FileType()&linux.S_IFLNK == 0 { return "", linuxerr.EINVAL } if len(i.link) == 0 { req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{}) res, err := i.fs.conn.Call(ctx, req) if err != nil { return "", err } i.link = string(res.data[res.hdr.SizeBytes():]) if !mnt.Options().ReadOnly { i.attrTime = time.ZeroTime } } return i.link, nil } // getFUSEAttr returns a linux.FUSEAttr of this inode stored in local cache. // // +checklocks:i.attrMu func (i *inode) getFUSEAttr() linux.FUSEAttr { ns := gotime.Second.Nanoseconds() return linux.FUSEAttr{ Ino: i.nodeID, UID: i.uid.Load(), GID: i.gid.Load(), Size: i.size.Load(), Mode: uint32(i.filemode()), BlkSize: i.blockSize.Load(), Atime: uint64(i.atime.Load() / ns), Mtime: uint64(i.mtime.Load() / ns), Ctime: uint64(i.ctime.Load() / ns), AtimeNsec: uint32(i.atime.Load() % ns), MtimeNsec: uint32(i.mtime.Load() % ns), CtimeNsec: uint32(i.ctime.Load() % ns), Nlink: i.nlink.Load(), } } // statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The // opts.Sync attribute is ignored since the synchronization is handled by the // FUSE server. 
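// For example (illustrative): a caller passing mask = STATX_SIZE|STATX_MODE
// gets back a Statx with only Size and Mode copied from the FUSEAttr, while
// Blksize, DevMajor/DevMinor and RdevMajor/RdevMinor are populated
// unconditionally, as the code below shows.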
func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx { var stat linux.Statx stat.Blksize = attr.BlkSize stat.DevMajor, stat.DevMinor = linux.UNNAMED_MAJOR, devMinor rdevMajor, rdevMinor := linux.DecodeDeviceID(attr.Rdev) stat.RdevMajor, stat.RdevMinor = uint32(rdevMajor), rdevMinor if mask&linux.STATX_MODE != 0 { stat.Mode = uint16(attr.Mode) } if mask&linux.STATX_NLINK != 0 { stat.Nlink = attr.Nlink } if mask&linux.STATX_UID != 0 { stat.UID = attr.UID } if mask&linux.STATX_GID != 0 { stat.GID = attr.GID } if mask&linux.STATX_ATIME != 0 { stat.Atime = linux.StatxTimestamp{ Sec: int64(attr.Atime), Nsec: attr.AtimeNsec, } } if mask&linux.STATX_MTIME != 0 { stat.Mtime = linux.StatxTimestamp{ Sec: int64(attr.Mtime), Nsec: attr.MtimeNsec, } } if mask&linux.STATX_CTIME != 0 { stat.Ctime = linux.StatxTimestamp{ Sec: int64(attr.Ctime), Nsec: attr.CtimeNsec, } } if mask&linux.STATX_INO != 0 { stat.Ino = attr.Ino } if mask&linux.STATX_SIZE != 0 { stat.Size = attr.Size } if mask&linux.STATX_BLOCKS != 0 { stat.Blocks = attr.Blocks } return stat } // getAttr gets the attribute of this inode by issuing a FUSE_GETATTR request // or read from local cache. It updates the corresponding attributes if // necessary. // // +checklocks:i.attrMu func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) { // TODO(gvisor.dev/issue/3679): send the request only if // - invalid local cache for fields specified in the opts.Mask // - forced update // - i.attributeTime expired // If local cache is still valid, return local cache. // Currently we always send a request, // and we always set the metadata with the new result, // unless attributeVersion has changed. creds := auth.CredentialsFromContext(ctx) in := linux.FUSEGetAttrIn{ GetAttrFlags: flags, Fh: fh, } req := i.fs.conn.NewRequest(creds, pidFromContext(ctx), i.nodeID, linux.FUSE_GETATTR, &in) res, err := i.fs.conn.Call(ctx, req) if err != nil { return linux.FUSEAttr{}, err } if err := res.Error(); err != nil { return linux.FUSEAttr{}, err } var out linux.FUSEAttrOut if err := res.UnmarshalPayload(&out); err != nil { return linux.FUSEAttr{}, err } // Local version is newer, return the local one. i.fs.conn.mu.Lock() attributeVersion := i.fs.conn.attributeVersion.Load() if attributeVersion != 0 && i.attrVersion.Load() > attributeVersion { i.fs.conn.mu.Unlock() return i.getFUSEAttr(), nil } i.fs.conn.mu.Unlock() i.updateAttrs(out.Attr, int64(out.AttrValid), int64(out.AttrValidNsec)) return out.Attr, nil } // reviseAttr attempts to update the attributes for internal purposes // by calling getAttr with a pre-specified mask. // Used by read, write, lseek. // // +checklocks:i.attrMu func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error { // Never need atime for internal purposes. _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME, }, flags, fh) return err } // Stat implements kernfs.Inode.Stat. func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { i.attrMu.Lock() defer i.attrMu.Unlock() attr, err := i.getAttr(ctx, fs, opts, 0, 0) if err != nil { return linux.Statx{}, err } return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil } // DecRef implements kernfs.Inode.DecRef. func (i *inode) DecRef(ctx context.Context) { i.inodeRefs.DecRef(func() { i.Destroy(ctx) }) } // StatFS implements kernfs.Inode.StatFS. 
func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), i.nodeID, linux.FUSE_STATFS, &linux.FUSEEmptyIn{}, ) res, err := i.fs.conn.Call(ctx, req) if err != nil { return linux.Statfs{}, err } if err := res.Error(); err != nil { return linux.Statfs{}, err } var out linux.FUSEStatfsOut if err := res.UnmarshalPayload(&out); err != nil { return linux.Statfs{}, err } return linux.Statfs{ Type: linux.FUSE_SUPER_MAGIC, Blocks: uint64(out.Blocks), BlocksFree: out.BlocksFree, BlocksAvailable: out.BlocksAvailable, Files: out.Files, FilesFree: out.FilesFree, BlockSize: int64(out.BlockSize), NameLength: uint64(out.NameLength), FragmentSize: int64(out.FragmentSize), }, nil } // fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to linux stats mask // aligned with the attribute mask defined in include/linux/fs.h. func fattrMaskFromStats(mask uint32) uint32 { var fuseAttrMask uint32 maskMap := map[uint32]uint32{ linux.STATX_MODE: linux.FATTR_MODE, linux.STATX_UID: linux.FATTR_UID, linux.STATX_GID: linux.FATTR_GID, linux.STATX_SIZE: linux.FATTR_SIZE, linux.STATX_ATIME: linux.FATTR_ATIME, linux.STATX_MTIME: linux.FATTR_MTIME, linux.STATX_CTIME: linux.FATTR_CTIME, } for statxMask, fattrMask := range maskMap { if mask&statxMask != 0 { fuseAttrMask |= fattrMask } } return fuseAttrMask } // SetStat implements kernfs.Inode.SetStat. func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { i.attrMu.Lock() defer i.attrMu.Unlock() if err := vfs.CheckSetStat(ctx, creds, &opts, i.filemode(), auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil { return err } if opts.Stat.Mask == 0 { return nil } return i.setAttr(ctx, fs, creds, opts, fhOptions{useFh: false}) } type fhOptions struct { useFh bool fh uint64 } // +checklocks:i.attrMu func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, fhOpts fhOptions) error { // We should retain the original file type when assigning a new mode. 
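// For example (illustrative): a chmod(0644) combined with a truncate carries
// Stat.Mask = STATX_MODE|STATX_SIZE, which fattrMaskFromStats turns into
// FATTR_MODE|FATTR_SIZE; if the caller supplied an open file handle,
// FATTR_FH is OR'd in as well so the server knows the change is associated
// with that handle.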
fattrMask := fattrMaskFromStats(opts.Stat.Mask) if fhOpts.useFh { fattrMask |= linux.FATTR_FH } if opts.Stat.Mask&linux.STATX_ATIME != 0 && opts.Stat.Atime.Nsec == linux.UTIME_NOW { fattrMask |= linux.FATTR_ATIME_NOW } if opts.Stat.Mask&linux.STATX_MTIME != 0 && opts.Stat.Mtime.Nsec == linux.UTIME_NOW { fattrMask |= linux.FATTR_ATIME_NOW } in := linux.FUSESetAttrIn{ Valid: fattrMask, Fh: fhOpts.fh, Size: opts.Stat.Size, Atime: uint64(opts.Stat.Atime.Sec), Mtime: uint64(opts.Stat.Mtime.Sec), Ctime: uint64(opts.Stat.Ctime.Sec), AtimeNsec: opts.Stat.Atime.Nsec, MtimeNsec: opts.Stat.Mtime.Nsec, CtimeNsec: opts.Stat.Ctime.Nsec, Mode: uint32(uint16(i.filemode().FileType()) | opts.Stat.Mode), UID: opts.Stat.UID, GID: opts.Stat.GID, } req := i.fs.conn.NewRequest(creds, pidFromContext(ctx), i.nodeID, linux.FUSE_SETATTR, &in) res, err := i.fs.conn.Call(ctx, req) if err != nil { return err } if err := res.Error(); err != nil { return err } out := linux.FUSEAttrOut{} if err := res.UnmarshalPayload(&out); err != nil { return err } i.updateAttrs(out.Attr, int64(out.AttrValid), int64(out.AttrValidNsec)) return nil } // +checklocks:i.attrMu func (i *inode) updateAttrs(attr linux.FUSEAttr, validSec, validNSec int64) { i.fs.conn.mu.Lock() i.attrVersion.Store(i.fs.conn.attributeVersion.Add(1)) i.fs.conn.mu.Unlock() i.attrTime = i.fs.clock.Now().AddTime(time.FromTimespec(linux.Timespec{Sec: validSec, Nsec: validNSec})) i.ino.Store(attr.Ino) i.mode.Store((attr.Mode & 07777) | (i.mode.Load() & linux.S_IFMT)) i.uid.Store(attr.UID) i.gid.Store(attr.GID) i.atime.Store(attr.ATimeNsec()) i.mtime.Store(attr.MTimeNsec()) i.ctime.Store(attr.CTimeNsec()) i.size.Store(attr.Size) i.nlink.Store(attr.Nlink) if !i.fs.opts.defaultPermissions { i.mode.Store(i.mode.Load() & ^uint32(linux.S_ISVTX)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/inode_refs.go000066400000000000000000000100651465435605700256520ustar00rootroot00000000000000package fuse import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const inodeenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var inodeobj *inode // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type inodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. 
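	// Worked example (illustrative): with one real reference, refCount == 1.
	// TryIncRef first adds 1<<32, making the value 0x1_0000_0001; since the
	// low 32 bits (int32) are non-zero the object is still live, so it then
	// adds -(1<<32)+1 to convert the speculative reference into a real one,
	// leaving refCount == 2. If the count had already dropped to 0,
	// int32(refCount) would be 0 after the speculative add and TryIncRef
	// would back the speculative reference out and return false.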
refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *inodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *inodeRefs) RefType() string { return fmt.Sprintf("%T", inodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *inodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *inodeRefs) LogRefs() bool { return inodeenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *inodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *inodeRefs) IncRef() { v := r.refCount.Add(1) if inodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *inodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if inodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *inodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if inodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *inodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/read_write.go000066400000000000000000000162521465435605700256660ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
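// Illustrative note (not part of the source): ReadInPages below issues
// FUSE_READ requests in page-aligned chunks, capped by both max_read and the
// connection's max_pages. Worked example, with hypothetical limits of
// maxRead = 128 KiB and maxPages = 32: a 384 KiB read (96 pages) is sent as
// three 32-page requests, and the three payloads come back as separate byte
// slices that the caller copies out in order.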
package fuse import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/usermem" ) // ReadInPages sends FUSE_READ requests for the size after round it up to // a multiple of page size, blocks on it for reply, processes the reply // and returns the payload (or joined payloads) as a byte slice. // This is used for the general purpose reading. // We do not support direct IO (which read the exact number of bytes) // at this moment. func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off uint64, size uint32) ([][]byte, uint32, error) { attributeVersion := fs.conn.attributeVersion.Load() // Round up to a multiple of page size. readSize, _ := hostarch.PageRoundUp(uint64(size)) // One request cannot exceed either maxRead or maxPages. maxPages := fs.conn.maxRead >> hostarch.PageShift if maxPages > uint32(fs.conn.maxPages) { maxPages = uint32(fs.conn.maxPages) } var outs [][]byte var sizeRead uint32 // readSize is a multiple of hostarch.PageSize. // Always request bytes as a multiple of pages. pagesRead, pagesToRead := uint32(0), uint32(readSize>>hostarch.PageShift) // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation. in := linux.FUSEReadIn{ Fh: fd.Fh, LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock ReadFlags: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER Flags: fd.statusFlags(), } // This loop is intended for fragmented read where the bytes to read is // larger than either the maxPages or maxRead. // For the majority of reads with normal size, this loop should only // execute once. for pagesRead < pagesToRead { pagesCanRead := pagesToRead - pagesRead if pagesCanRead > maxPages { pagesCanRead = maxPages } in.Offset = off + (uint64(pagesRead) << hostarch.PageShift) in.Size = pagesCanRead << hostarch.PageShift // TODO(gvisor.dev/issue/3247): support async read. req := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), fd.inode().nodeID, linux.FUSE_READ, &in) res, err := fs.conn.Call(ctx, req) if err != nil { return nil, 0, err } if err := res.Error(); err != nil { return nil, 0, err } // Not enough bytes in response, // either we reached EOF, // or the FUSE server sends back a response // that cannot even fit the hdr. if len(res.data) <= res.hdr.SizeBytes() { // We treat both case as EOF here for now // since there is no reliable way to detect // the over-short hdr case. break } // Directly using the slice to avoid extra copy. out := res.data[res.hdr.SizeBytes():] outs = append(outs, out) sizeRead += uint32(len(out)) pagesRead += pagesCanRead } defer fs.ReadCallback(ctx, fd.inode(), off, size, sizeRead, attributeVersion) // +checklocksforce: fd.inode() locks are held during fd operations. // No bytes returned: offset >= EOF. if len(outs) == 0 { return nil, 0, io.EOF } return outs, sizeRead, nil } // ReadCallback updates several information after receiving a read response. // Due to readahead, sizeRead can be larger than size. // // +checklocks:i.attrMu func (fs *filesystem) ReadCallback(ctx context.Context, i *inode, off uint64, size uint32, sizeRead uint32, attributeVersion uint64) { // TODO(gvisor.dev/issue/3247): support async read. // If this is called by an async read, correctly process it. // May need to update the signature. i.touchAtime() // Reached EOF. 
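// A short read (sizeRead < size) means the file ended before the requested
// range; the cached size is then clamped to off+sizeRead below, provided no
// newer attribute update (a changed attrVersion) arrived in the meantime.
// Worked example (illustrative): off=8192, size=16384, the server returns
// 4096 bytes => newSize = 12288, stored only if it is smaller than the
// currently cached size.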
if sizeRead < size { // TODO(gvisor.dev/issue/3630): If we have writeback cache, then we need to fill this hole. // Might need to update the buf to be returned from the Read(). // Update existing size. newSize := off + uint64(sizeRead) fs.conn.mu.Lock() if attributeVersion == i.attrVersion.Load() && newSize < i.size.Load() { i.attrVersion.Store(i.fs.conn.attributeVersion.Add(1)) i.size.Store(newSize) } fs.conn.mu.Unlock() } } // Write sends FUSE_WRITE requests and return the bytes written according to the // response. func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, offset int64, src usermem.IOSequence) (int64, int64, error) { // One request cannot exceed either maxWrite or maxPages. maxWrite := uint32(fs.conn.maxPages) << hostarch.PageShift if maxWrite > fs.conn.maxWrite { maxWrite = fs.conn.maxWrite } // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation. in := linux.FUSEWritePayloadIn{ Header: linux.FUSEWriteIn{ Fh: fd.Fh, // TODO(gvisor.dev/issue/3245): file lock LockOwner: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER // TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet) WriteFlags: 0, Flags: fd.statusFlags(), }, } // This loop is intended for fragmented write where the bytes to write is // larger than either the maxWrite or maxPages or when bigWrites is false. // Unless a small value for max_write is explicitly used, this loop // is expected to execute only once for the majority of the writes. n := int64(0) end := offset + src.NumBytes() for n < end { writeSize := uint32(end - n) // Limit the write size to one page. // Note that the bigWrites flag is obsolete, // latest libfuse always sets it on. if !fs.conn.bigWrites && writeSize > hostarch.PageSize { writeSize = hostarch.PageSize } // Limit the write size to maxWrite. if writeSize > maxWrite { writeSize = maxWrite } // TODO(gvisor.dev/issue/3237): Add cache support: // buffer cache. Ideally we write from src to our buffer cache first. // The slice passed to fs.Write() should be a slice from buffer cache. data := make([]byte, writeSize) cp, _ := src.CopyIn(ctx, data) data = data[:cp] in.Header.Offset = uint64(offset) in.Header.Size = uint32(cp) in.Payload = data req := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), fd.inode().nodeID, linux.FUSE_WRITE, &in) // TODO(gvisor.dev/issue/3247): support async write. res, err := fs.conn.Call(ctx, req) if err != nil { return n, offset, err } out := linux.FUSEWriteOut{} if err := res.UnmarshalPayload(&out); err != nil { return n, offset, err } n += int64(out.Size) offset += int64(out.Size) src = src.DropFirst64(int64(out.Size)) if err := res.Error(); err != nil { return n, offset, err } // Write more than requested? EIO. if out.Size > writeSize { return n, offset, linuxerr.EIO } // Break if short write. Not necessarily an error. if out.Size != writeSize { break } } return n, offset, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/register.go000066400000000000000000000017311465435605700253610ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fuse import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Register registers the FUSE device with vfsObj. func Register(vfsObj *vfs.VirtualFilesystem) error { return vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{ GroupName: "misc", Pathname: "fuse", FilePerms: 0666, }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/regular_file.go000066400000000000000000000215541465435605700262020ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fuse import ( "io" "math" "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) // +stateify savable type regularFileFD struct { fileDescription // offMu protects off. offMu sync.Mutex `state:"nosave"` // off is the file offset. // +checklocks:offMu off int64 // mapsMu protects mappings. mapsMu sync.Mutex `state:"nosave"` // mappings tracks mappings of the file into memmap.MappingSpaces. // // Protected by mapsMu. mappings memmap.MappingSet // dataMu protects the fields below. dataMu sync.RWMutex `state:"nosave"` // data maps offsets into the file to offsets into memFile that store // the file's data. // // Protected by dataMu. data fsutil.FileRangeSet } // Seek implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { if mode & ^uint64(linux.FALLOC_FL_KEEP_SIZE|linux.FALLOC_FL_PUNCH_HOLE|linux.FALLOC_FL_ZERO_RANGE) != 0 { return linuxerr.EOPNOTSUPP } in := linux.FUSEFallocateIn{ Fh: fd.Fh, Offset: uint64(offset), Length: uint64(length), Mode: uint32(mode), } i := fd.inode() req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), pidFromContext(ctx), i.nodeID, linux.FUSE_FALLOCATE, &in) res, err := i.fs.conn.Call(ctx, req) if err != nil { return err } if err := res.Error(); err != nil { return err } i.attrMu.Lock() defer i.attrMu.Unlock() if uint64(offset+length) > i.size.Load() { if err := i.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil { return err } // If the offset after update is still too large, return error. if uint64(offset) >= i.size.Load() { return io.EOF } } return nil } // Seek implements vfs.FileDescriptionImpl.Seek. 
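// Worked example (illustrative): for a file whose cached size is 100 bytes
// and an fd whose current offset is 10, Seek(ctx, -4, SEEK_END) yields 96 and
// Seek(ctx, 5, SEEK_CUR) yields 15; any whence/offset combination that would
// produce a negative offset fails with EINVAL, as the code below shows.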
func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.offMu.Lock() defer fd.offMu.Unlock() inode := fd.inode() inode.attrMu.Lock() defer inode.attrMu.Unlock() switch whence { case linux.SEEK_SET: // use offset as specified case linux.SEEK_CUR: offset += fd.off case linux.SEEK_END: offset += int64(inode.size.Load()) default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } fd.off = offset return offset, nil } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if offset < 0 { return 0, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, linuxerr.EOPNOTSUPP } size := dst.NumBytes() if size == 0 { // Early return if count is 0. return 0, nil } else if size > math.MaxUint32 { // FUSE only supports uint32 for size. // Overflow. return 0, linuxerr.EINVAL } // TODO(gvisor.dev/issue/3678): Add direct IO support. inode := fd.inode() inode.attrMu.Lock() defer inode.attrMu.Unlock() // Reading beyond EOF, update file size if outdated. if uint64(offset+size) > inode.size.Load() { if err := inode.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil { return 0, err } // If the offset after update is still too large, return error. if uint64(offset) >= inode.size.Load() { return 0, io.EOF } } // Truncate the read with updated file size. fileSize := inode.size.Load() if uint64(offset+size) > fileSize { size = int64(fileSize) - offset } buffers, n, err := inode.fs.ReadInPages(ctx, fd, uint64(offset), uint32(size)) if err != nil { return 0, err } // TODO(gvisor.dev/issue/3237): support indirect IO (e.g. caching), // store the bytes that were read ahead. // Update the number of bytes to copy for short read. if n < uint32(size) { size = int64(n) } // Copy the bytes read to the dst. // This loop is intended for fragmented reads. // For the majority of reads, this loop only execute once. var copied int64 for _, buffer := range buffers { toCopy := int64(len(buffer)) if copied+toCopy > size { toCopy = size - copied } cp, err := dst.DropFirst64(copied).CopyOut(ctx, buffer[:toCopy]) if err != nil { return 0, err } if int64(cp) != toCopy { return 0, linuxerr.EIO } copied += toCopy } return copied, nil } // Read implements vfs.FileDescriptionImpl.Read. func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { fd.offMu.Lock() n, err := fd.PRead(ctx, dst, fd.off, opts) fd.off += n fd.offMu.Unlock() return n, err } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, _, err := fd.pwrite(ctx, src, offset, opts) return n, err } // Write implements vfs.FileDescriptionImpl.Write. func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.offMu.Lock() n, off, err := fd.pwrite(ctx, src, fd.off, opts) fd.off = off fd.offMu.Unlock() return n, err } // pwrite returns the number of bytes written, final offset and error. The // final offset should be ignored by PWrite. 
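// (PWrite discards the returned offset because positional writes must not
// move the fd offset, while Write stores it back into fd.off. With O_APPEND
// the supplied offset is ignored entirely and the write starts at the current
// cached file size, as pwrite below shows.)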
func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, int64, error) { if offset < 0 { return 0, offset, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, offset, linuxerr.EOPNOTSUPP } inode := fd.inode() inode.attrMu.Lock() defer inode.attrMu.Unlock() // If the file is opened with O_APPEND, update offset to file size. // Note: since our Open() implements the interface of kernfs, // and kernfs currently does not support O_APPEND, this will never // be true before we switch out from kernfs. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { // Locking inode.metadataMu is sufficient for reading size offset = int64(inode.size.Load()) } srclen := src.NumBytes() if srclen > math.MaxUint32 { // FUSE only supports uint32 for size. // Overflow. return 0, offset, linuxerr.EINVAL } if end := offset + srclen; end < offset { // Overflow. return 0, offset, linuxerr.EINVAL } limit, err := vfs.CheckLimit(ctx, offset, srclen) if err != nil { return 0, offset, err } if limit == 0 { // Return before causing any side effects. return 0, offset, nil } src = src.TakeFirst64(limit) n, offset, err := inode.fs.Write(ctx, fd, offset, src) if n == 0 { // We have checked srclen != 0 previously. // If err == nil, then it's a short write and we return EIO. return 0, offset, linuxerr.EIO } if offset > int64(inode.size.Load()) { inode.size.Store(uint64(offset)) inode.fs.conn.attributeVersion.Add(1) } inode.touchCMtime() return n, offset, err } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return linuxerr.ENOSYS } // AddMapping implements memmap.Mappable.AddMapping. func (fd *regularFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { return linuxerr.ENOSYS } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (fd *regularFileFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. func (fd *regularFileFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return linuxerr.ENOSYS } // Translate implements memmap.Mappable.Translate. func (fd *regularFileFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { return nil, linuxerr.ENOSYS } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (fd *regularFileFD) InvalidateUnsavable(ctx context.Context) error { return linuxerr.ENOSYS } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/request_list.go000066400000000000000000000120731465435605700262610ustar00rootroot00000000000000package fuse // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type requestElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. 
// //go:nosplit func (requestElementMapper) linkerFor(elem *Request) *Request { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type requestList struct { head *Request tail *Request } // Reset resets list l to the empty state. func (l *requestList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *requestList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *requestList) Front() *Request { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *requestList) Back() *Request { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *requestList) Len() (count int) { for e := l.Front(); e != nil; e = (requestElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *requestList) PushFront(e *Request) { linker := requestElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { requestElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *requestList) PushFrontList(m *requestList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { requestElementMapper{}.linkerFor(l.head).SetPrev(m.tail) requestElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *requestList) PushBack(e *Request) { linker := requestElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { requestElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *requestList) PushBackList(m *requestList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { requestElementMapper{}.linkerFor(l.tail).SetNext(m.head) requestElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *requestList) InsertAfter(b, e *Request) { bLinker := requestElementMapper{}.linkerFor(b) eLinker := requestElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { requestElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *requestList) InsertBefore(a, e *Request) { aLinker := requestElementMapper{}.linkerFor(a) eLinker := requestElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { requestElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. 
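// Usage sketch (illustrative only; queue and req are placeholders): a FIFO of
// pending requests can be maintained as
//
//	var queue requestList
//	queue.PushBack(req) // enqueue in O(1)
//	if r := queue.Front(); r != nil {
//		queue.Remove(r) // dequeue in O(1)
//	}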
// //go:nosplit func (l *requestList) Remove(e *Request) { linker := requestElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { requestElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { requestElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type requestEntry struct { next *Request prev *Request } // Next returns the entry that follows e in the list. // //go:nosplit func (e *requestEntry) Next() *Request { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *requestEntry) Prev() *Request { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *requestEntry) SetNext(elem *Request) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *requestEntry) SetPrev(elem *Request) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/request_response.go000066400000000000000000000141141465435605700271420ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fuse import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // fuseInitRes is a variable-length wrapper of linux.FUSEInitOut. The FUSE // server may implement an older version of FUSE protocol, which contains a // linux.FUSEInitOut with less attributes. // // +marshal dynamic type fuseInitRes struct { // initOut contains the response from the FUSE server. initOut linux.FUSEInitOut // initLen is the total length of bytes of the response. initLen uint32 } func (r *fuseInitRes) MarshalBytes(src []byte) []byte { panic("Unimplemented, fuseInitRes should never be marshalled") } // UnmarshalBytes deserializes src to the initOut attribute in a fuseInitRes. func (r *fuseInitRes) UnmarshalBytes(src []byte) []byte { out := &r.initOut // Introduced before FUSE kernel version 7.13. out.Major = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] out.Minor = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] out.MaxReadahead = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] out.Flags = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] out.MaxBackground = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] out.CongestionThreshold = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] out.MaxWrite = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Introduced in FUSE kernel version 7.23. 
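// A server that negotiated an older minor version sends a shorter
// FUSEInitOut, so each of the remaining fields is decoded only if enough
// bytes are left in src.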
if len(src) >= 4 { out.TimeGran = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] } // Introduced in FUSE kernel version 7.28. if len(src) >= 2 { out.MaxPages = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] } return src } // SizeBytes is the size of the payload of the FUSE_INIT response. func (r *fuseInitRes) SizeBytes() int { return int(r.initLen) } // Ordinary requests have even IDs, while interrupts IDs are odd. // Used to increment the unique ID for each FUSE request. var reqIDStep uint64 = 2 // Request represents a FUSE operation request that hasn't been sent to the // server yet. // // +stateify savable type Request struct { requestEntry id linux.FUSEOpID hdr *linux.FUSEHeaderIn data []byte // If this request is async. async bool // If we don't care its response. // Manually set by the caller. noReply bool } // NewRequest creates a new request that can be sent to the FUSE server. func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) *Request { conn.fd.mu.Lock() defer conn.fd.mu.Unlock() conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) hdr := linux.FUSEHeaderIn{ Len: linux.SizeOfFUSEHeaderIn + uint32(payload.SizeBytes()), Opcode: opcode, Unique: conn.fd.nextOpID, NodeID: ino, UID: uint32(creds.EffectiveKUID), GID: uint32(creds.EffectiveKGID), PID: pid, } buf := make([]byte, hdr.Len) hdr.MarshalUnsafe(buf[:linux.SizeOfFUSEHeaderIn]) payload.MarshalUnsafe(buf[linux.SizeOfFUSEHeaderIn:]) return &Request{ id: hdr.Unique, hdr: &hdr, data: buf, } } // futureResponse represents an in-flight request, that may or may not have // completed yet. Convert it to a resolved Response by calling Resolve, but note // that this may block. // // +stateify savable type futureResponse struct { opcode linux.FUSEOpcode ch chan struct{} `state:"nosave"` hdr *linux.FUSEHeaderOut data []byte // If this request is async. async bool } // newFutureResponse creates a future response to a FUSE request. func newFutureResponse(req *Request) *futureResponse { return &futureResponse{ opcode: req.hdr.Opcode, ch: make(chan struct{}), async: req.async, } } // resolve blocks the task until the server responds to its corresponding request, // then returns a resolved response. func (f *futureResponse) resolve(b context.Blocker) (*Response, error) { // Return directly for async requests. if f.async { return nil, nil } if err := b.Block(f.ch); err != nil { return nil, err } return f.getResponse(), nil } // getResponse creates a Response from the data the futureResponse has. func (f *futureResponse) getResponse() *Response { return &Response{ opcode: f.opcode, hdr: *f.hdr, data: f.data, } } // Response represents an actual response from the server, including the // response payload. // // +stateify savable type Response struct { opcode linux.FUSEOpcode hdr linux.FUSEHeaderOut data []byte } // Error returns the error of the FUSE call. func (r *Response) Error() error { errno := r.hdr.Error if errno >= 0 { return nil } sysErrNo := unix.Errno(-errno) return error(sysErrNo) } // DataLen returns the size of the response without the header. func (r *Response) DataLen() uint32 { return r.hdr.Len - uint32(r.hdr.SizeBytes()) } // UnmarshalPayload unmarshals the response data into m. func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { hdrLen := r.hdr.SizeBytes() haveDataLen := r.hdr.Len - uint32(hdrLen) wantDataLen := uint32(m.SizeBytes()) if haveDataLen < wantDataLen { log.Warningf("fusefs: Payload too small. 
Minimum data length required: %d, but got data length %d", wantDataLen, haveDataLen) return linuxerr.EINVAL } // The response data is empty unless there is some payload. And so, doesn't // need to be unmarshalled. if r.data == nil { return nil } m.UnmarshalUnsafe(r.data[hdrLen:]) return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/save_restore.go000066400000000000000000000013171465435605700262360ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fuse import "context" func (fRes *futureResponse) afterLoad(context.Context) { fRes.ch = make(chan struct{}) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/fuse/seqatomic_time_unsafe.go000066400000000000000000000034741465435605700301070ustar00rootroot00000000000000package fuse import ( __generics_imported0 "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) import ( "unsafe" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sync" ) // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race // with any writer critical sections in seq. // //go:nosplit func SeqAtomicLoadTime(seq *sync.SeqCount, ptr *__generics_imported0.Time) __generics_imported0.Time { for { if val, ok := SeqAtomicTryLoadTime(seq, seq.BeginRead(), ptr); ok { return val } } } // SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section // in seq initiated by a call to seq.BeginRead() that returned epoch. If the // read would race with a writer critical section, SeqAtomicTryLoad returns // (unspecified, false). // //go:nosplit func SeqAtomicTryLoadTime(seq *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *__generics_imported0.Time) (val __generics_imported0.Time, ok bool) { if sync.RaceEnabled { gohacks.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) } else { val = *ptr } ok = seq.ReadOk(epoch) return } // SeqAtomicStore sets *ptr to a copy of val, ensuring that any racing reader // critical sections are forced to retry. // //go:nosplit func SeqAtomicStoreTime(seq *sync.SeqCount, ptr *__generics_imported0.Time, val __generics_imported0.Time) { seq.BeginWrite() SeqAtomicStoreSeqedTime(ptr, val) seq.EndWrite() } // SeqAtomicStoreSeqed sets *ptr to a copy of val. // // Preconditions: ptr is protected by a SeqCount that will be in a writer // critical section throughout the call to SeqAtomicStore. // //go:nosplit func SeqAtomicStoreSeqedTime(ptr *__generics_imported0.Time, val __generics_imported0.Time) { if sync.RaceEnabled { gohacks.Memmove(unsafe.Pointer(ptr), unsafe.Pointer(&val), unsafe.Sizeof(val)) } else { *ptr = val } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/000077500000000000000000000000001465435605700233445ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/dentry_impl.go000066400000000000000000000427661465435605700262400ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fsutil" "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // We do *not* define an interface for dentry.impl because making interface // method calls is almost 2.5x slower than calling the same method on a // concrete type. Instead, we use type assertions in switch statements. The // asserted type is a concrete dentry implementation and methods are called // directly on the concrete type. This helps in the following ways: // // 1. This is faster because concrete type assertion just needs to compare the // itab pointer in the interface value to a constant which is relatively // cheap. Benchmarking showed that such type switches don't add almost any // overhead. // 2. Passing any pointer to an interface method immediately causes the pointed // object to escape to heap. Making concrete method calls allows escape // analysis to proceed as usual and avoids heap allocations. // // Also note that the default case in these type switch statements panics. We // do not do panic(fmt.Sprintf("... %T", d.impl)) because somehow it adds a lot // of overhead to the type switch. So instead we panic with a constant string. // Precondition: d.handleMu must be locked. func (d *dentry) isReadHandleOk() bool { switch dt := d.impl.(type) { case *lisafsDentry: return dt.readFDLisa.Ok() case *directfsDentry: return d.readFD.RacyLoad() >= 0 case nil: // synthetic dentry return false default: panic("unknown dentry implementation") } } // Precondition: d.handleMu must be locked. func (d *dentry) isWriteHandleOk() bool { switch dt := d.impl.(type) { case *lisafsDentry: return dt.writeFDLisa.Ok() case *directfsDentry: return d.writeFD.RacyLoad() >= 0 case nil: // synthetic dentry return false default: panic("unknown dentry implementation") } } // Precondition: d.handleMu must be locked. func (d *dentry) readHandle() handle { switch dt := d.impl.(type) { case *lisafsDentry: return handle{ fdLisa: dt.readFDLisa, fd: d.readFD.RacyLoad(), } case *directfsDentry: return handle{fd: d.readFD.RacyLoad()} case nil: // synthetic dentry return noHandle default: panic("unknown dentry implementation") } } // Precondition: d.handleMu must be locked. func (d *dentry) writeHandle() handle { switch dt := d.impl.(type) { case *lisafsDentry: return handle{ fdLisa: dt.writeFDLisa, fd: d.writeFD.RacyLoad(), } case *directfsDentry: return handle{fd: d.writeFD.RacyLoad()} case nil: // synthetic dentry return noHandle default: panic("unknown dentry implementation") } } // Preconditions: // - !d.isSynthetic(). // - fs.renameMu is locked. 
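// For example (illustrative), openHandle(ctx, true, true, true) opens the
// file with O_RDWR|O_TRUNC, openHandle(ctx, false, true, false) opens it
// with O_WRONLY, and openHandle(ctx, true, false, false) with O_RDONLY.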
func (d *dentry) openHandle(ctx context.Context, read, write, trunc bool) (handle, error) { flags := uint32(unix.O_RDONLY) switch { case read && write: flags = unix.O_RDWR case read: flags = unix.O_RDONLY case write: flags = unix.O_WRONLY default: log.Debugf("openHandle called with read = write = false. Falling back to read only FD.") } if trunc { flags |= unix.O_TRUNC } switch dt := d.impl.(type) { case *lisafsDentry: return dt.openHandle(ctx, flags) case *directfsDentry: return dt.openHandle(ctx, flags) default: panic("unknown dentry implementation") } } // Preconditions: // - d.handleMu must be locked. // - !d.isSynthetic(). func (d *dentry) updateHandles(ctx context.Context, h handle, readable, writable bool) { switch dt := d.impl.(type) { case *lisafsDentry: dt.updateHandles(ctx, h, readable, writable) case *directfsDentry: // No update needed. default: panic("unknown dentry implementation") } } // Preconditions: // - d.handleMu must be locked. // - !d.isSynthetic(). func (d *dentry) closeHostFDs() { // We can use RacyLoad() because d.handleMu is locked. if d.readFD.RacyLoad() >= 0 { _ = unix.Close(int(d.readFD.RacyLoad())) } if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() { _ = unix.Close(int(d.writeFD.RacyLoad())) } d.readFD = atomicbitops.FromInt32(-1) d.writeFD = atomicbitops.FromInt32(-1) d.mmapFD = atomicbitops.FromInt32(-1) switch dt := d.impl.(type) { case *directfsDentry: if dt.controlFD >= 0 { _ = unix.Close(dt.controlFD) dt.controlFD = -1 } } } // updateMetadataLocked updates the dentry's metadata fields. The h parameter // is optional. If it is not provided, an appropriate FD should be chosen to // stat the remote file. // // Preconditions: // - !d.isSynthetic(). // - d.metadataMu is locked. // // +checklocks:d.metadataMu func (d *dentry) updateMetadataLocked(ctx context.Context, h handle) error { // Need checklocksforce below because checklocks has no way of knowing that // d.impl.(*dentryImpl).dentry == d. It can't know that the right metadataMu // is already locked. switch dt := d.impl.(type) { case *lisafsDentry: return dt.updateMetadataLocked(ctx, h) // +checklocksforce: acquired by precondition. case *directfsDentry: return dt.updateMetadataLocked(h) // +checklocksforce: acquired by precondition. default: panic("unknown dentry implementation") } } // Preconditions: // - !d.isSynthetic(). // - fs.renameMu is locked. func (d *dentry) prepareSetStat(ctx context.Context, stat *linux.Statx) error { switch dt := d.impl.(type) { case *lisafsDentry: // Nothing to be done. return nil case *directfsDentry: return dt.prepareSetStat(ctx, stat) default: panic("unknown dentry implementation") } } // Precondition: fs.renameMu is locked if d is a socket. func (d *dentry) chmod(ctx context.Context, mode uint16) error { switch dt := d.impl.(type) { case *lisafsDentry: return chmod(ctx, dt.controlFD, mode) case *directfsDentry: return dt.chmod(ctx, mode) default: panic("unknown dentry implementation") } } // Preconditions: // - !d.isSynthetic(). // - d.handleMu is locked. // - fs.renameMu is locked. func (d *dentry) setStatLocked(ctx context.Context, stat *linux.Statx) (uint32, error, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.controlFD.SetStat(ctx, stat) case *directfsDentry: failureMask, failureErr := dt.setStatLocked(ctx, stat) return failureMask, failureErr, nil default: panic("unknown dentry implementation") } } // Precondition: d.handleMu must be locked. 
func (d *dentry) destroyImpl(ctx context.Context) { switch dt := d.impl.(type) { case *lisafsDentry: dt.destroy(ctx) case *directfsDentry: dt.destroy(ctx) case nil: // synthetic dentry default: panic("unknown dentry implementation") } } // Postcondition: Caller must do dentry caching appropriately. // // +checklocksread:d.opMu func (d *dentry) getRemoteChild(ctx context.Context, name string) (*dentry, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.getRemoteChild(ctx, name) case *directfsDentry: return dt.getHostChild(name) default: panic("unknown dentry implementation") } } // Preconditions: // - fs.renameMu must be locked. // - parent.opMu must be locked for reading. // - parent.isDir(). // - !rp.Done() && rp.Component() is not "." or "..". // // Postcondition: The returned dentry is already cached appropriately. // // +checklocksread:d.opMu func (d *dentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp resolvingPath, ds **[]*dentry) (*dentry, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.getRemoteChildAndWalkPathLocked(ctx, rp, ds) case *directfsDentry: // We need to check for races because opMu is read locked which allows // concurrent walks to occur. return d.fs.getRemoteChildLocked(ctx, d, rp.Component(), true /* checkForRace */, ds) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) listXattrImpl(ctx context.Context, size uint64) ([]string, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.controlFD.ListXattr(ctx, size) case *directfsDentry: // Consistent with runsc/fsgofer. return nil, linuxerr.EOPNOTSUPP default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) getXattrImpl(ctx context.Context, opts *vfs.GetXattrOptions) (string, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.controlFD.GetXattr(ctx, opts.Name, opts.Size) case *directfsDentry: return dt.getXattr(opts.Name, opts.Size) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) setXattrImpl(ctx context.Context, opts *vfs.SetXattrOptions) error { switch dt := d.impl.(type) { case *lisafsDentry: return dt.controlFD.SetXattr(ctx, opts.Name, opts.Value, opts.Flags) case *directfsDentry: // Consistent with runsc/fsgofer. return linuxerr.EOPNOTSUPP default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) removeXattrImpl(ctx context.Context, name string) error { switch dt := d.impl.(type) { case *lisafsDentry: return dt.controlFD.RemoveXattr(ctx, name) case *directfsDentry: // Consistent with runsc/fsgofer. return linuxerr.EOPNOTSUPP default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) mknod(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.mknod(ctx, name, creds, opts) case *directfsDentry: return dt.mknod(ctx, name, creds, opts) default: panic("unknown dentry implementation") } } // Preconditions: // - !d.isSynthetic(). // - !target.isSynthetic(). // - d.fs.renameMu must be locked. 
func (d *dentry) link(ctx context.Context, target *dentry, name string) (*dentry, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.link(ctx, target.impl.(*lisafsDentry), name) case *directfsDentry: return dt.link(target.impl.(*directfsDentry), name) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) mkdir(ctx context.Context, name string, mode linux.FileMode, uid auth.KUID, gid auth.KGID) (*dentry, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.mkdir(ctx, name, mode, uid, gid) case *directfsDentry: return dt.mkdir(name, mode, uid, gid) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) symlink(ctx context.Context, name, target string, creds *auth.Credentials) (*dentry, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.symlink(ctx, name, target, creds) case *directfsDentry: return dt.symlink(name, target, creds) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) openCreate(ctx context.Context, name string, accessFlags uint32, mode linux.FileMode, uid auth.KUID, gid auth.KGID) (*dentry, handle, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.openCreate(ctx, name, accessFlags, mode, uid, gid) case *directfsDentry: return dt.openCreate(name, accessFlags, mode, uid, gid) default: panic("unknown dentry implementation") } } // Preconditions: // - d.isDir(). // - d.handleMu must be locked. // - !d.isSynthetic(). func (d *dentry) getDirentsLocked(ctx context.Context, recordDirent func(name string, key inoKey, dType uint8)) error { switch dt := d.impl.(type) { case *lisafsDentry: return dt.getDirentsLocked(ctx, recordDirent) case *directfsDentry: return dt.getDirentsLocked(recordDirent) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) flush(ctx context.Context) error { d.handleMu.RLock() defer d.handleMu.RUnlock() switch dt := d.impl.(type) { case *lisafsDentry: return flush(ctx, dt.writeFDLisa) case *directfsDentry: // Nothing to do here. return nil default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) allocate(ctx context.Context, mode, offset, length uint64) error { d.handleMu.RLock() defer d.handleMu.RUnlock() switch dt := d.impl.(type) { case *lisafsDentry: return dt.writeFDLisa.Allocate(ctx, mode, offset, length) case *directfsDentry: return unix.Fallocate(int(d.writeFD.RacyLoad()), uint32(mode), int64(offset), int64(length)) default: panic("unknown dentry implementation") } } // Preconditions: // - !d.isSynthetic(). // - fs.renameMu is locked. func (d *dentry) connect(ctx context.Context, sockType linux.SockType) (int, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.controlFD.Connect(ctx, sockType) case *directfsDentry: return dt.connect(ctx, sockType) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) readlinkImpl(ctx context.Context) (string, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.controlFD.ReadLinkAt(ctx) case *directfsDentry: return dt.readlink() default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). 
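// For example (illustrative), callers pass flags = 0 to remove a
// non-directory child and flags = unix.AT_REMOVEDIR to remove a directory,
// mirroring unlinkat(2).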
func (d *dentry) unlink(ctx context.Context, name string, flags uint32) error { switch dt := d.impl.(type) { case *lisafsDentry: return dt.controlFD.UnlinkAt(ctx, name, flags) case *directfsDentry: return unix.Unlinkat(dt.controlFD, name, int(flags)) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) rename(ctx context.Context, oldName string, newParent *dentry, newName string) error { switch dt := d.impl.(type) { case *lisafsDentry: return dt.controlFD.RenameAt(ctx, oldName, newParent.impl.(*lisafsDentry).controlFD.ID(), newName) case *directfsDentry: return fsutil.RenameAt(dt.controlFD, oldName, newParent.impl.(*directfsDentry).controlFD, newName) default: panic("unknown dentry implementation") } } // Precondition: !d.isSynthetic(). func (d *dentry) statfs(ctx context.Context) (linux.Statfs, error) { switch dt := d.impl.(type) { case *lisafsDentry: return dt.statfs(ctx) case *directfsDentry: return dt.statfs() default: panic("unknown dentry implementation") } } func (fs *filesystem) restoreRoot(ctx context.Context, opts *vfs.CompleteRestoreOptions) error { rootInode, rootHostFD, err := fs.initClientAndGetRoot(ctx) if err != nil { return err } // The root is always non-synthetic. switch dt := fs.root.impl.(type) { case *lisafsDentry: return dt.restoreFile(ctx, &rootInode, opts) case *directfsDentry: dt.controlFDLisa = fs.client.NewFD(rootInode.ControlFD) return dt.restoreFile(ctx, rootHostFD, opts) default: panic("unknown dentry implementation") } } // Preconditions: // - !d.isSynthetic(). // - d.parent != nil and has been restored. func (d *dentry) restoreFile(ctx context.Context, opts *vfs.CompleteRestoreOptions) error { switch dt := d.impl.(type) { case *lisafsDentry: controlFD := d.parent.Load().impl.(*lisafsDentry).controlFD inode, err := controlFD.Walk(ctx, d.name) if err != nil { if !dt.isDir() || !dt.forMountpoint { return err } // Recreate directories that were created during volume mounting, since // during restore we don't attempt to remount them. inode, err = controlFD.MkdirAt(ctx, d.name, linux.FileMode(d.mode.Load()), lisafs.UID(d.uid.Load()), lisafs.GID(d.gid.Load())) if err != nil { return err } } return dt.restoreFile(ctx, &inode, opts) case *directfsDentry: controlFD := d.parent.Load().impl.(*directfsDentry).controlFD childFD, err := tryOpen(func(flags int) (int, error) { n, err := unix.Openat(controlFD, d.name, flags, 0) return n, err }) if err != nil { if !dt.isDir() || !dt.forMountpoint { return err } // Recreate directories that were created during volume mounting, since // during restore we don't attempt to remount them. if err := unix.Mkdirat(controlFD, d.name, d.mode.Load()); err != nil { return err } // Try again... childFD, err = tryOpen(func(flags int) (int, error) { return unix.Openat(controlFD, d.name, flags, 0) }) if err != nil { return err } } return dt.restoreFile(ctx, childFD, opts) default: panic("unknown dentry implementation") } } // doRevalidation calls into r.start's dentry implementation to perform // revalidation on all the dentries contained in r. // // Preconditions: // - fs.renameMu must be locked. // - InteropModeShared is in effect. func (r *revalidateState) doRevalidation(ctx context.Context, vfsObj *vfs.VirtualFilesystem, ds **[]*dentry) error { // Skip synthetic dentries because there is no actual implementation that can // be used to walk the remote filesystem. A start dentry cannot be replaced. 
if r.start.isSynthetic() { return nil } switch r.start.impl.(type) { case *lisafsDentry: return doRevalidationLisafs(ctx, vfsObj, r, ds) case *directfsDentry: return doRevalidationDirectfs(ctx, vfsObj, r, ds) default: panic("unknown dentry implementation") } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/dentry_list.go000066400000000000000000000122131465435605700262320ustar00rootroot00000000000000package gofer // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type dentryElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (dentryElementMapper) linkerFor(elem *dentryListElem) *dentryListElem { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type dentryList struct { head *dentryListElem tail *dentryListElem } // Reset resets list l to the empty state. func (l *dentryList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *dentryList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *dentryList) Front() *dentryListElem { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *dentryList) Back() *dentryListElem { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *dentryList) Len() (count int) { for e := l.Front(); e != nil; e = (dentryElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *dentryList) PushFront(e *dentryListElem) { linker := dentryElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { dentryElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *dentryList) PushFrontList(m *dentryList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { dentryElementMapper{}.linkerFor(l.head).SetPrev(m.tail) dentryElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *dentryList) PushBack(e *dentryListElem) { linker := dentryElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { dentryElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *dentryList) PushBackList(m *dentryList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { dentryElementMapper{}.linkerFor(l.tail).SetNext(m.head) dentryElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. 
// //go:nosplit func (l *dentryList) InsertAfter(b, e *dentryListElem) { bLinker := dentryElementMapper{}.linkerFor(b) eLinker := dentryElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { dentryElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *dentryList) InsertBefore(a, e *dentryListElem) { aLinker := dentryElementMapper{}.linkerFor(a) eLinker := dentryElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { dentryElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *dentryList) Remove(e *dentryListElem) { linker := dentryElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { dentryElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { dentryElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type dentryEntry struct { next *dentryListElem prev *dentryListElem } // Next returns the entry that follows e in the list. // //go:nosplit func (e *dentryEntry) Next() *dentryListElem { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *dentryEntry) Prev() *dentryListElem { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *dentryEntry) SetNext(elem *dentryListElem) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *dentryEntry) SetPrev(elem *dentryListElem) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/directfs_dentry.go000066400000000000000000000611771465435605700270770ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "fmt" "math" "path" "path/filepath" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fsutil" "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // LINT.IfChange const ( hostOpenFlags = unix.O_NOFOLLOW | unix.O_CLOEXEC ) // tryOpen tries to open() with different modes in the following order: // 1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs. // Use non-blocking to prevent getting stuck inside open(2) for // FIFOs. This option has no effect on regular files. // 2. PATH: for symlinks, sockets. 
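// Usage sketch (illustrative; parentFD and name are placeholders): opening a
// child of an already-open directory FD, with tryOpen supplying the access
// flags:
//
//	childFD, err := tryOpen(func(flags int) (int, error) {
//		return unix.Openat(parentFD, name, flags, 0)
//	})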
func tryOpen(open func(int) (int, error)) (int, error) { flags := []int{ unix.O_RDONLY | unix.O_NONBLOCK, unix.O_PATH, } var ( hostFD int err error ) for _, flag := range flags { hostFD, err = open(flag | hostOpenFlags) if err == nil { return hostFD, nil } if err == unix.ENOENT { // File doesn't exist, no point in retrying. break } } return -1, err } // getDirectfsRootDentry creates a new dentry representing the root dentry for // this mountpoint. getDirectfsRootDentry takes ownership of rootHostFD and // rootControlFD. func (fs *filesystem) getDirectfsRootDentry(ctx context.Context, rootHostFD int, rootControlFD lisafs.ClientFD) (*dentry, error) { d, err := fs.newDirectfsDentry(rootHostFD) if err != nil { log.Warningf("newDirectfsDentry failed for mount point dentry: %v", err) rootControlFD.Close(ctx, false /* flush */) return nil, err } d.impl.(*directfsDentry).controlFDLisa = rootControlFD return d, nil } // directfsDentry is a host dentry implementation. It represents a dentry // backed by a host file descriptor. All operations are directly performed on // the host. A gofer is only involved for some operations on the mount point // dentry (when dentry.parent = nil). We are forced to fall back to the gofer // due to the lack of procfs in the sandbox process. // // +stateify savable type directfsDentry struct { dentry // controlFD is the host FD to this file. controlFD is immutable until // destruction, which is synchronized with dentry.handleMu. controlFD int // controlFDLisa is a lisafs control FD on this dentry. // This is used to fallback to using lisafs RPCs in the following cases: // * When parent dentry is required to perform operations but // dentry.parent = nil (root dentry). // * For path-based syscalls (like connect(2) and bind(2)) on sockets. // // For the root dentry, controlFDLisa is always set and is immutable. // For sockets, controlFDLisa is protected by dentry.handleMu and is // immutable after initialization. controlFDLisa lisafs.ClientFD `state:"nosave"` } // newDirectfsDentry creates a new dentry representing the given file. The dentry // initially has no references, but is not cached; it is the caller's // responsibility to set the dentry's reference count and/or call // dentry.checkCachingLocked() as appropriate. // newDirectDentry takes ownership of controlFD. func (fs *filesystem) newDirectfsDentry(controlFD int) (*dentry, error) { var stat unix.Stat_t if err := unix.Fstat(controlFD, &stat); err != nil { log.Warningf("failed to fstat(2) FD %d: %v", controlFD, err) _ = unix.Close(controlFD) return nil, err } inoKey := inoKeyFromStat(&stat) d := &directfsDentry{ dentry: dentry{ fs: fs, inoKey: inoKey, ino: fs.inoFromKey(inoKey), mode: atomicbitops.FromUint32(stat.Mode), uid: atomicbitops.FromUint32(stat.Uid), gid: atomicbitops.FromUint32(stat.Gid), blockSize: atomicbitops.FromUint32(uint32(stat.Blksize)), readFD: atomicbitops.FromInt32(-1), writeFD: atomicbitops.FromInt32(-1), mmapFD: atomicbitops.FromInt32(-1), size: atomicbitops.FromUint64(uint64(stat.Size)), atime: atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Atim)), mtime: atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Mtim)), ctime: atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Ctim)), nlink: atomicbitops.FromUint32(uint32(stat.Nlink)), }, controlFD: controlFD, } d.dentry.init(d) fs.syncMu.Lock() fs.syncableDentries.PushBack(&d.syncableListEntry) fs.syncMu.Unlock() return &d.dentry, nil } // Precondition: fs.renameMu is locked. 
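// openHandle opens a new host FD for d with the given flags. For the mount
// point dentry (which has no parent), it falls back to a lisafs OpenAt RPC
// and uses the donated host FD; for any other dentry it re-opens the file
// with openat(2) relative to the parent's control FD.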
func (d *directfsDentry) openHandle(ctx context.Context, flags uint32) (handle, error) { parent := d.parent.Load() if parent == nil { // This is a mount point. We don't have parent. Fallback to using lisafs. if !d.controlFDLisa.Ok() { panic("directfsDentry.controlFDLisa is not set for mount point dentry") } openFD, hostFD, err := d.controlFDLisa.OpenAt(ctx, flags) if err != nil { return noHandle, err } d.fs.client.CloseFD(ctx, openFD, true /* flush */) if hostFD < 0 { log.Warningf("gofer did not donate an FD for mount point") return noHandle, unix.EIO } return handle{fd: int32(hostFD)}, nil } // The only way to re-open an FD with different flags is via procfs or // openat(2) from the parent. Procfs does not exist here. So use parent. flags |= hostOpenFlags openFD, err := unix.Openat(parent.impl.(*directfsDentry).controlFD, d.name, int(flags), 0) if err != nil { return noHandle, err } return handle{fd: int32(openFD)}, nil } // Precondition: fs.renameMu is locked. func (d *directfsDentry) ensureLisafsControlFD(ctx context.Context) error { d.handleMu.Lock() defer d.handleMu.Unlock() if d.controlFDLisa.Ok() { return nil } var names []string root := d for root.parent.Load() != nil { names = append(names, root.name) root = root.parent.Load().impl.(*directfsDentry) } if !root.controlFDLisa.Ok() { panic("controlFDLisa is not set for mount point dentry") } if len(names) == 0 { return nil // d == root } // Reverse names. last := len(names) - 1 for i := 0; i < len(names)/2; i++ { names[i], names[last-i] = names[last-i], names[i] } status, inodes, err := root.controlFDLisa.WalkMultiple(ctx, names) if err != nil { return err } defer func() { // Close everything except for inodes[last] if it exists. for i := 0; i < len(inodes) && i < last; i++ { flush := i == last-1 || i == len(inodes)-1 d.fs.client.CloseFD(ctx, inodes[i].ControlFD, flush) } }() switch status { case lisafs.WalkComponentDoesNotExist: return unix.ENOENT case lisafs.WalkComponentSymlink: log.Warningf("intermediate path component was a symlink? names = %v, inodes = %+v", names, inodes) return unix.ELOOP case lisafs.WalkSuccess: d.controlFDLisa = d.fs.client.NewFD(inodes[last].ControlFD) return nil } panic("unreachable") } // Precondition: d.metadataMu must be locked. // // +checklocks:d.metadataMu func (d *directfsDentry) updateMetadataLocked(h handle) error { handleMuRLocked := false if h.fd < 0 { // Use open FDs in preferenece to the control FD. Control FDs may be opened // with O_PATH. This may be significantly more efficient in some // implementations. Prefer a writable FD over a readable one since some // filesystem implementations may update a writable FD's metadata after // writes, without making metadata updates immediately visible to read-only // FDs representing the same file. d.handleMu.RLock() switch { case d.writeFD.RacyLoad() >= 0: h.fd = d.writeFD.RacyLoad() handleMuRLocked = true case d.readFD.RacyLoad() >= 0: h.fd = d.readFD.RacyLoad() handleMuRLocked = true default: h.fd = int32(d.controlFD) d.handleMu.RUnlock() } } var stat unix.Stat_t err := unix.Fstat(int(h.fd), &stat) if handleMuRLocked { // handleMu must be released before updateMetadataFromStatLocked(). d.handleMu.RUnlock() // +checklocksforce: complex case. } if err != nil { return err } return d.updateMetadataFromStatLocked(&stat) } // Precondition: fs.renameMu is locked if d is a socket. 
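// chmod updates the file mode. Non-socket files are handled with fchmod(2)
// on the control FD; socket files created via bind(2) fall back to
// fchmodat(2) via the parent directory FD, or to a lisafs RPC for a mount
// point socket that has no parent.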
func (d *directfsDentry) chmod(ctx context.Context, mode uint16) error { if !d.isSocket() { return unix.Fchmod(d.controlFD, uint32(mode)) } // fchmod(2) on socket files created via bind(2) fails. We need to // fchmodat(2) it from its parent. if parent := d.parent.Load(); parent != nil { // We have parent FD, just use that. Note that AT_SYMLINK_NOFOLLOW flag is // currently not supported. So we don't use it. return unix.Fchmodat(parent.impl.(*directfsDentry).controlFD, d.name, uint32(mode), 0 /* flags */) } // This is a mount point socket. We don't have a parent FD. Fallback to using // lisafs. if !d.controlFDLisa.Ok() { panic("directfsDentry.controlFDLisa is not set for mount point socket") } return chmod(ctx, d.controlFDLisa, mode) } // Preconditions: // - d.handleMu is locked if d is a regular file. // - fs.renameMu is locked if d is a symlink. func (d *directfsDentry) utimensat(ctx context.Context, stat *linux.Statx) error { if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME) == 0 { return nil } utimes := [2]unix.Timespec{ {Sec: 0, Nsec: unix.UTIME_OMIT}, {Sec: 0, Nsec: unix.UTIME_OMIT}, } if stat.Mask&unix.STATX_ATIME != 0 { utimes[0].Sec = stat.Atime.Sec utimes[0].Nsec = int64(stat.Atime.Nsec) } if stat.Mask&unix.STATX_MTIME != 0 { utimes[1].Sec = stat.Mtime.Sec utimes[1].Nsec = int64(stat.Mtime.Nsec) } if !d.isSymlink() { hostFD := d.controlFD if d.isRegularFile() { // utimensat(2) requires a writable FD for regular files. See BUGS // section. dentry.prepareSetStat() should have acquired a writable FD. hostFD = int(d.writeFD.RacyLoad()) } // Non-symlinks can operate directly on the fd using an empty name. return fsutil.Utimensat(hostFD, "", utimes, 0) } // utimensat operates different that other syscalls. To operate on a // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty // name. if parent := d.parent.Load(); parent != nil { return fsutil.Utimensat(parent.impl.(*directfsDentry).controlFD, d.name, utimes, unix.AT_SYMLINK_NOFOLLOW) } // This is a mount point symlink. We don't have a parent FD. Fallback to // using lisafs. if !d.controlFDLisa.Ok() { panic("directfsDentry.controlFDLisa is not set for mount point symlink") } setStat := linux.Statx{ Mask: stat.Mask & (linux.STATX_ATIME | linux.STATX_MTIME), Atime: stat.Atime, Mtime: stat.Mtime, } _, failureErr, err := d.controlFDLisa.SetStat(ctx, &setStat) if err != nil { return err } return failureErr } // Precondition: fs.renameMu is locked. func (d *directfsDentry) prepareSetStat(ctx context.Context, stat *linux.Statx) error { if stat.Mask&unix.STATX_SIZE != 0 || (stat.Mask&(unix.STATX_ATIME|unix.STATX_MTIME) != 0 && d.isRegularFile()) { // Need to ensure a writable FD is available. See setStatLocked() to // understand why. return d.ensureSharedHandle(ctx, false /* read */, true /* write */, false /* trunc */) } return nil } // Preconditions: // - d.handleMu is locked. // - fs.renameMu is locked. func (d *directfsDentry) setStatLocked(ctx context.Context, stat *linux.Statx) (failureMask uint32, failureErr error) { if stat.Mask&unix.STATX_MODE != 0 { if err := d.chmod(ctx, stat.Mode&^unix.S_IFMT); err != nil { failureMask |= unix.STATX_MODE failureErr = err } } if stat.Mask&unix.STATX_SIZE != 0 { // ftruncate(2) requires a writable FD. 
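// prepareSetStat() has already acquired a writable FD when STATX_SIZE is
// set, so d.writeFD is expected to be valid here.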
if err := unix.Ftruncate(int(d.writeFD.RacyLoad()), int64(stat.Size)); err != nil { failureMask |= unix.STATX_SIZE failureErr = err } } if err := d.utimensat(ctx, stat); err != nil { failureMask |= (stat.Mask & (unix.STATX_ATIME | unix.STATX_MTIME)) failureErr = err } if stat.Mask&(unix.STATX_UID|unix.STATX_GID) != 0 { // "If the owner or group is specified as -1, then that ID is not changed" // - chown(2) uid := -1 if stat.Mask&unix.STATX_UID != 0 { uid = int(stat.UID) } gid := -1 if stat.Mask&unix.STATX_GID != 0 { gid = int(stat.GID) } if err := fchown(d.controlFD, uid, gid); err != nil { failureMask |= stat.Mask & (unix.STATX_UID | unix.STATX_GID) failureErr = err } } return } func fchown(fd, uid, gid int) error { return unix.Fchownat(fd, "", uid, gid, unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) } // Precondition: d.handleMu must be locked. func (d *directfsDentry) destroy(ctx context.Context) { if d.controlFD >= 0 { _ = unix.Close(d.controlFD) d.controlFD = -1 } if d.controlFDLisa.Ok() { d.controlFDLisa.Close(ctx, true /* flush */) } } func (d *directfsDentry) getHostChild(name string) (*dentry, error) { childFD, err := tryOpen(func(flags int) (int, error) { return unix.Openat(d.controlFD, name, flags, 0) }) if err != nil { return nil, err } return d.fs.newDirectfsDentry(childFD) } func (d *directfsDentry) getXattr(name string, size uint64) (string, error) { data := make([]byte, size) if _, err := unix.Fgetxattr(d.controlFD, name, data); err != nil { return "", err } return string(data), nil } // getCreatedChild opens the newly created child, sets its uid/gid, constructs // a disconnected dentry and returns it. func (d *directfsDentry) getCreatedChild(name string, uid, gid int, isDir bool) (*dentry, error) { unlinkFlags := 0 extraOpenFlags := 0 if isDir { extraOpenFlags |= unix.O_DIRECTORY unlinkFlags |= unix.AT_REMOVEDIR } deleteChild := func() { // Best effort attempt to remove the newly created child on failure. if err := unix.Unlinkat(d.controlFD, name, unlinkFlags); err != nil { log.Warningf("error unlinking newly created child %q after failure: %v", filepath.Join(genericDebugPathname(&d.dentry), name), err) } } childFD, err := tryOpen(func(flags int) (int, error) { return unix.Openat(d.controlFD, name, flags|extraOpenFlags, 0) }) if err != nil { deleteChild() return nil, err } // "If the owner or group is specified as -1, then that ID is not changed" // - chown(2). Only bother making the syscall if the owner is changing. if uid != -1 || gid != -1 { if err := fchown(childFD, uid, gid); err != nil { deleteChild() _ = unix.Close(childFD) return nil, err } } child, err := d.fs.newDirectfsDentry(childFD) if err != nil { // Ownership of childFD was passed to newDirectDentry(), so no need to // clean that up. deleteChild() return nil, err } return child, nil } func (d *directfsDentry) mknod(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) { if _, ok := opts.Endpoint.(transport.HostBoundEndpoint); ok { return d.bindAt(ctx, name, creds, opts) } // From mknod(2) man page: // "EPERM: [...] if the filesystem containing pathname does not support // the type of node requested." if opts.Mode.FileType() != linux.ModeRegular { return nil, unix.EPERM } if err := unix.Mknodat(d.controlFD, name, uint32(opts.Mode), 0); err != nil { return nil, err } return d.getCreatedChild(name, int(creds.EffectiveKUID), int(creds.EffectiveKGID), false /* isDir */) } // Precondition: opts.Endpoint != nil and is transport.HostBoundEndpoint type. 
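// bindAt creates a bound socket file named name. Since the sandbox process
// cannot walk host paths itself, it falls back to the gofer's lisafs BindAt
// RPC, hands the donated bound-socket FD to the endpoint, and then opens a
// dentry for the newly created file.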
func (d *directfsDentry) bindAt(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) { // There are no filesystems mounted in the sandbox process's mount namespace. // So we can't perform absolute path traversals. So fallback to using lisafs. if err := d.ensureLisafsControlFD(ctx); err != nil { return nil, err } sockType := opts.Endpoint.(transport.Endpoint).Type() childInode, boundSocketFD, err := d.controlFDLisa.BindAt(ctx, sockType, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID)) if err != nil { return nil, err } d.fs.client.CloseFD(ctx, childInode.ControlFD, true /* flush */) // Update opts.Endpoint that it is bound. hbep := opts.Endpoint.(transport.HostBoundEndpoint) if err := hbep.SetBoundSocketFD(ctx, boundSocketFD); err != nil { if err := unix.Unlinkat(d.controlFD, name, 0); err != nil { log.Warningf("error unlinking newly created socket %q after failure: %v", filepath.Join(genericDebugPathname(&d.dentry), name), err) } return nil, err } // Socket already has the right UID/GID set, so use uid = gid = -1. child, err := d.getCreatedChild(name, -1 /* uid */, -1 /* gid */, false /* isDir */) if err != nil { hbep.ResetBoundSocketFD(ctx) return nil, err } // Set the endpoint on the newly created child dentry. child.endpoint = opts.Endpoint return child, nil } // Precondition: d.fs.renameMu must be locked. func (d *directfsDentry) link(target *directfsDentry, name string) (*dentry, error) { // Using linkat(targetFD, "", newdirfd, name, AT_EMPTY_PATH) requires // CAP_DAC_READ_SEARCH in the *root* userns. With directfs, the sandbox // process has CAP_DAC_READ_SEARCH in its own userns. But the sandbox is // running in a different userns. So we can't use AT_EMPTY_PATH. Fallback to // using olddirfd to call linkat(2). // Also note that d and target are from the same mount. Given target is a // non-directory and d is a directory, target.parent must exist. if err := unix.Linkat(target.parent.Load().impl.(*directfsDentry).controlFD, target.name, d.controlFD, name, 0); err != nil { return nil, err } // Note that we don't need to set uid/gid for the new child. This is a hard // link. The original file already has the right owner. // TODO(gvisor.dev/issue/6739): Hard linked dentries should share the same // inode fields. 
return d.getCreatedChild(name, -1 /* uid */, -1 /* gid */, false /* isDir */) } func (d *directfsDentry) mkdir(name string, mode linux.FileMode, uid auth.KUID, gid auth.KGID) (*dentry, error) { if err := unix.Mkdirat(d.controlFD, name, uint32(mode)); err != nil { return nil, err } return d.getCreatedChild(name, int(uid), int(gid), true /* isDir */) } func (d *directfsDentry) symlink(name, target string, creds *auth.Credentials) (*dentry, error) { if err := unix.Symlinkat(target, d.controlFD, name); err != nil { return nil, err } return d.getCreatedChild(name, int(creds.EffectiveKUID), int(creds.EffectiveKGID), false /* isDir */) } func (d *directfsDentry) openCreate(name string, accessFlags uint32, mode linux.FileMode, uid auth.KUID, gid auth.KGID) (*dentry, handle, error) { createFlags := unix.O_CREAT | unix.O_EXCL | int(accessFlags) | hostOpenFlags childHandleFD, err := unix.Openat(d.controlFD, name, createFlags, uint32(mode&^linux.FileTypeMask)) if err != nil { return nil, noHandle, err } child, err := d.getCreatedChild(name, int(uid), int(gid), false /* isDir */) if err != nil { _ = unix.Close(childHandleFD) return nil, noHandle, err } return child, handle{fd: int32(childHandleFD)}, nil } func (d *directfsDentry) getDirentsLocked(recordDirent func(name string, key inoKey, dType uint8)) error { readFD := int(d.readFD.RacyLoad()) if _, err := unix.Seek(readFD, 0, 0); err != nil { return err } return fsutil.ForEachDirent(readFD, func(ino uint64, off int64, ftype uint8, name string, reclen uint16) { // We also want the device ID, which annoyingly incurs an additional // syscall per dirent. // TODO(gvisor.dev/issue/6665): Get rid of per-dirent stat. stat, err := fsutil.StatAt(d.controlFD, name) if err != nil { log.Warningf("Getdent64: skipping file %q with failed stat, err: %v", path.Join(genericDebugPathname(&d.dentry), name), err) return } recordDirent(name, inoKeyFromStat(&stat), ftype) }) } // Precondition: fs.renameMu is locked. func (d *directfsDentry) connect(ctx context.Context, sockType linux.SockType) (int, error) { // There are no filesystems mounted in the sandbox process's mount namespace. // So we can't perform absolute path traversals. So fallback to using lisafs. if err := d.ensureLisafsControlFD(ctx); err != nil { return -1, err } return d.controlFDLisa.Connect(ctx, sockType) } func (d *directfsDentry) readlink() (string, error) { // This is similar to what os.Readlink does. 
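// Aside (illustrative sketch, not part of the original file): the loop below
// grows the buffer until the link target fits, mirroring os.Readlink. The same
// pattern against a path (the path here is hypothetical) rather than an FD:
//
//	for size := 128; size < math.MaxUint16; size *= 2 {
//		buf := make([]byte, size)
//		n, err := unix.Readlink("/tmp/example-symlink", buf)
//		if err != nil {
//			return "", err
//		}
//		if n < size {
//			return string(buf[:n]), nil
//		}
//	}
//	return "", unix.ENOMEM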
for linkLen := 128; linkLen < math.MaxUint16; linkLen *= 2 { b := make([]byte, linkLen) n, err := unix.Readlinkat(d.controlFD, "", b) if err != nil { return "", err } if n < int(linkLen) { return string(b[:n]), nil } } return "", unix.ENOMEM } func (d *directfsDentry) statfs() (linux.Statfs, error) { var statFS unix.Statfs_t if err := unix.Fstatfs(d.controlFD, &statFS); err != nil { return linux.Statfs{}, err } return linux.Statfs{ BlockSize: statFS.Bsize, FragmentSize: statFS.Bsize, Blocks: statFS.Blocks, BlocksFree: statFS.Bfree, BlocksAvailable: statFS.Bavail, Files: statFS.Files, FilesFree: statFS.Ffree, NameLength: uint64(statFS.Namelen), }, nil } func (d *directfsDentry) restoreFile(ctx context.Context, controlFD int, opts *vfs.CompleteRestoreOptions) error { if controlFD < 0 { log.Warningf("directfsDentry.restoreFile called with invalid controlFD") return unix.EINVAL } var stat unix.Stat_t if err := unix.Fstat(controlFD, &stat); err != nil { _ = unix.Close(controlFD) return err } d.controlFD = controlFD // We do not preserve inoKey across checkpoint/restore, so: // // - We must assume that the host filesystem did not change in a way that // would invalidate dentries, since we can't revalidate dentries by // checking inoKey. // // - We need to associate the new inoKey with the existing d.ino. d.inoKey = inoKeyFromStat(&stat) d.fs.inoMu.Lock() d.fs.inoByKey[d.inoKey] = d.ino d.fs.inoMu.Unlock() // Check metadata stability before updating metadata. d.metadataMu.Lock() defer d.metadataMu.Unlock() if d.isRegularFile() { if opts.ValidateFileSizes { if d.size.RacyLoad() != uint64(stat.Size) { return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(&d.dentry), d.size.Load(), stat.Size)} } } if opts.ValidateFileModificationTimestamps { if want := dentryTimestampFromUnix(stat.Mtim); d.mtime.RacyLoad() != want { return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(&d.dentry), linux.NsecToStatxTimestamp(d.mtime.RacyLoad()), linux.NsecToStatxTimestamp(want))} } } } if !d.cachedMetadataAuthoritative() { d.updateMetadataFromStatLocked(&stat) } if rw, ok := d.fs.savedDentryRW[&d.dentry]; ok { if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil { return err } } return nil } // doRevalidationDirectfs stats all dentries in `state`. It will update or // invalidate dentries in the cache based on the result. // // Preconditions: // - fs.renameMu must be locked. // - InteropModeShared is in effect. func doRevalidationDirectfs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, state *revalidateState, ds **[]*dentry) error { // Explicitly declare start dentry, instead of using the function receiver. // The function receiver has to be named `d` (to be consistent with other // receivers). But `d` variable is also used below in various places. This // helps with readability and makes code less error prone. start := state.start.impl.(*directfsDentry) if state.refreshStart { start.updateMetadata(ctx) } parent := start for _, d := range state.dentries { childFD, err := unix.Openat(parent.controlFD, d.name, unix.O_PATH|hostOpenFlags, 0) if err != nil && err != unix.ENOENT { return err } var stat unix.Stat_t // Lock metadata *before* getting attributes for d. 
d.metadataMu.Lock() found := err == nil if found { err = unix.Fstat(childFD, &stat) _ = unix.Close(childFD) if err != nil { d.metadataMu.Unlock() return err } } // Note that synthetic dentries will always fail this comparison check. if !found || d.inoKey != inoKeyFromStat(&stat) { d.metadataMu.Unlock() if !found && d.isSynthetic() { // We have a synthetic file, and no remote file has arisen to replace // it. return nil } // The file at this path has changed or no longer exists. Mark the // dentry invalidated. d.invalidate(ctx, vfsObj, ds) return nil } // The file at this path hasn't changed. Just update cached metadata. d.impl.(*directfsDentry).updateMetadataFromStatLocked(&stat) // +checklocksforce: d.metadataMu is locked above. d.metadataMu.Unlock() // Advance parent. parent = d.impl.(*directfsDentry) } return nil } // LINT.ThenChange(../../../../runsc/fsgofer/lisafs.go) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/directory.go000066400000000000000000000250201465435605700256760ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) func (d *dentry) isDir() bool { return d.fileType() == linux.S_IFDIR } // cacheNewChildLocked will cache the new child dentry, and will panic if a // non-negative child is already cached. It is the caller's responsibility to // check that the child does not exist before calling this method. // // Preconditions: // - filesystem.renameMu must be locked. // - If the addition to the dentry tree is due to a read-only operation (like // Walk), then d.opMu must be held for reading. Otherwise d.opMu must be // held for writing. // - d.childrenMu must be locked. // - d.isDir(). // - child must be a newly-created dentry that has never had a parent. // - d.children[name] must be unset or nil (a "negative child") // // +checklocksread:d.opMu // +checklocks:d.childrenMu func (d *dentry) cacheNewChildLocked(child *dentry, name string) { d.IncRef() // reference held by child on its parent child.parent.Store(d) child.name = name if d.children == nil { d.children = make(map[string]*dentry) } else if c, ok := d.children[name]; ok { if c != nil { panic(fmt.Sprintf("cacheNewChildLocked collision; child with name=%q already cached", name)) } // Cached child is negative. OK to cache over, but we must // update the count of negative children. d.negativeChildren-- } d.children[name] = child } // Preconditions: // - d.childrenMu must be locked. // - d.isDir(). // - name is not already a negative entry. 
// // +checklocks:d.childrenMu func (d *dentry) cacheNegativeLookupLocked(name string) { // Don't cache negative lookups if InteropModeShared is in effect (since // this makes remote lookup unavoidable), or if d.isSynthetic() (in which // case the only files in the directory are those for which a dentry exists // in d.children). Instead, just delete any previously-cached dentry. if d.fs.opts.interop == InteropModeShared || d.isSynthetic() { delete(d.children, name) return } if d.children == nil { d.children = make(map[string]*dentry) } d.children[name] = nil d.negativeChildren++ if !d.negativeChildrenCache.isInited() { // Initializing cache with all negative children name at the first time // that negativeChildren increase upto max. if d.negativeChildren >= maxCachedNegativeChildren { d.negativeChildrenCache.init(maxCachedNegativeChildren) for childName, child := range d.children { if child == nil { d.negativeChildrenCache.add(childName) } } } } else if victim := d.negativeChildrenCache.add(name); victim != "" { // If victim is a negative entry in d.children, delete it. if child, ok := d.children[victim]; ok && child == nil { delete(d.children, victim) d.negativeChildren-- } } } type createSyntheticOpts struct { name string mode linux.FileMode kuid auth.KUID kgid auth.KGID // The endpoint for a synthetic socket. endpoint should be nil if the file // being created is not a socket. endpoint transport.BoundEndpoint // pipe should be nil if the file being created is not a pipe. pipe *pipe.VFSPipe } // newSyntheticDentry creates a synthetic file with the given name. func (fs *filesystem) newSyntheticDentry(opts *createSyntheticOpts) *dentry { now := fs.clock.Now().Nanoseconds() child := &dentry{ refs: atomicbitops.FromInt64(1), // held by parent. fs: fs, ino: fs.nextIno(), mode: atomicbitops.FromUint32(uint32(opts.mode)), uid: atomicbitops.FromUint32(uint32(opts.kuid)), gid: atomicbitops.FromUint32(uint32(opts.kgid)), blockSize: atomicbitops.FromUint32(hostarch.PageSize), // arbitrary atime: atomicbitops.FromInt64(now), mtime: atomicbitops.FromInt64(now), ctime: atomicbitops.FromInt64(now), btime: atomicbitops.FromInt64(now), readFD: atomicbitops.FromInt32(-1), writeFD: atomicbitops.FromInt32(-1), mmapFD: atomicbitops.FromInt32(-1), nlink: atomicbitops.FromUint32(2), } switch opts.mode.FileType() { case linux.S_IFDIR: // Nothing else needs to be done. case linux.S_IFSOCK: child.endpoint = opts.endpoint case linux.S_IFIFO: child.pipe = opts.pipe default: panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType())) } child.init(nil /* impl */) return child } // Preconditions: // - d.childrenMu must be locked. // // +checklocks:d.childrenMu func (d *dentry) clearDirentsLocked() { d.dirents = nil d.childrenSet = nil } // +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl mu sync.Mutex `state:"nosave"` off int64 dirents []vfs.Dirent } // Release implements vfs.FileDescriptionImpl.Release. func (fd *directoryFD) Release(context.Context) { } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. 
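// Aside (illustrative sketch, not part of the original file): caching negative
// lookups, as done above, trades a little memory for avoiding repeated remote
// ENOENT round trips. A hypothetical bounded cache (FIFO eviction for brevity;
// the real negativeChildrenCache may evict differently; set must be initialized
// with make before use):
//
//	type negCache struct {
//		max   int
//		order []string // oldest first
//		set   map[string]struct{}
//	}
//
//	// add records name and returns the evicted entry, if any.
//	func (c *negCache) add(name string) (evicted string) {
//		if _, ok := c.set[name]; ok {
//			return ""
//		}
//		if len(c.order) >= c.max {
//			evicted, c.order = c.order[0], c.order[1:]
//			delete(c.set, evicted)
//		}
//		c.set[name] = struct{}{}
//		c.order = append(c.order, name)
//		return evicted
//	}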
func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { fd.mu.Lock() defer fd.mu.Unlock() d := fd.dentry() if fd.dirents == nil { ds, err := d.getDirents(ctx) if err != nil { return err } fd.dirents = ds } if d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } for fd.off < int64(len(fd.dirents)) { if err := cb.Handle(fd.dirents[fd.off]); err != nil { return err } fd.off++ } return nil } // Preconditions: // - d.isDir(). // - There exists at least one directoryFD representing d. func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the // presence of concurrent mutation of an iterated directory, so // implementations may duplicate or omit entries in this case, which // violates POSIX semantics. Thus we read all directory entries while // holding d.opMu to exclude directory mutations. (Note that it is // impossible for the client to exclude concurrent mutation from other // remote filesystem users. Since there is no way to detect if the server // has incorrectly omitted directory entries, we simply assume that the // server is well-behaved under InteropModeShared.) This is inconsistent // with Linux (which appears to assume that directory fids have the correct // semantics, and translates struct file_operations::readdir calls directly // to readdir RPCs), but is consistent with VFS1. // filesystem.renameMu is needed for d.parent, and must be locked before // d.opMu. d.fs.renameMu.RLock() defer d.fs.renameMu.RUnlock() d.opMu.RLock() defer d.opMu.RUnlock() // d.childrenMu must be locked after d.opMu and held for the entire // function. This synchronizes concurrent getDirents() attempts. // getdents(2) advances the file offset. To get complete results from // multiple getdents(2) calls, the directory FD's offset needs to be // protected. d.childrenMu.Lock() defer d.childrenMu.Unlock() if d.dirents != nil { return d.dirents, nil } // It's not clear if 9P2000.L's readdir is expected to return "." and "..", // so we generate them here. parent := genericParentOrSelf(d) dirents := []vfs.Dirent{ { Name: ".", Type: linux.DT_DIR, Ino: uint64(d.ino), NextOff: 1, }, { Name: "..", Type: uint8(parent.mode.Load() >> 12), Ino: uint64(parent.ino), NextOff: 2, }, } var realChildren map[string]struct{} if !d.isSynthetic() { if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared { // Record the set of children d actually has so that we don't emit // duplicate entries for synthetic children. realChildren = make(map[string]struct{}) } d.handleMu.RLock() if !d.isReadHandleOk() { // This should not be possible because a readable handle should // have been opened when the calling directoryFD was opened. panic("gofer.dentry.getDirents called without a readable handle") } err := d.getDirentsLocked(ctx, func(name string, key inoKey, dType uint8) { dirent := vfs.Dirent{ Name: name, Ino: d.fs.inoFromKey(key), NextOff: int64(len(dirents) + 1), Type: dType, } dirents = append(dirents, dirent) if realChildren != nil { realChildren[name] = struct{}{} } }) d.handleMu.RUnlock() if err != nil { return nil, err } } // Emit entries for synthetic children. 
if d.syntheticChildren != 0 { for _, child := range d.children { if child == nil || !child.isSynthetic() { continue } if _, ok := realChildren[child.name]; ok { continue } dirents = append(dirents, vfs.Dirent{ Name: child.name, Type: uint8(child.mode.Load() >> 12), Ino: uint64(child.ino), NextOff: int64(len(dirents) + 1), }) } } // Cache dirents for future directoryFDs if permitted. if d.cachedMetadataAuthoritative() { d.dirents = dirents d.childrenSet = make(map[string]struct{}, len(dirents)) for _, dirent := range d.dirents { d.childrenSet[dirent.Name] = struct{}{} } } return dirents, nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() switch whence { case linux.SEEK_SET: if offset < 0 { return 0, linuxerr.EINVAL } if offset == 0 { // Ensure that the next call to fd.IterDirents() calls // fd.dentry().getDirents(). fd.dirents = nil } fd.off = offset return fd.off, nil case linux.SEEK_CUR: offset += fd.off if offset < 0 { return 0, linuxerr.EINVAL } // Don't clear fd.dirents in this case, even if offset == 0. fd.off = offset return fd.off, nil default: return 0, linuxerr.EINVAL } } // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *directoryFD) Sync(ctx context.Context) error { return fd.dentry().syncRemoteFile(ctx) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/filesystem.go000066400000000000000000001566461465435605700261010ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "fmt" "math" "strings" "sync" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { // Snapshot current syncable dentries and special file FDs. fs.syncMu.Lock() ds := make([]*dentry, 0, fs.syncableDentries.Len()) for elem := fs.syncableDentries.Front(); elem != nil; elem = elem.Next() { ds = append(ds, elem.d) } sffds := make([]*specialFileFD, 0, fs.specialFileFDs.Len()) for sffd := fs.specialFileFDs.Front(); sffd != nil; sffd = sffd.Next() { sffds = append(sffds, sffd) } fs.syncMu.Unlock() // Return the first error we encounter, but sync everything we can // regardless. var retErr error // Note that lisafs is capable of batching FSync RPCs. However, we can not // batch all the FDIDs to be synced from ds and sffds. Because the error // handling varies based on file type. 
FSync errors are only considered for // regular file FDIDs that were opened for writing. We could do individual // RPCs for such FDIDs and batch the rest, but it increases code complexity // substantially. We could implement it in the future if need be. // Sync syncable dentries. for _, d := range ds { if err := d.syncCachedFile(ctx, true /* forFilesystemSync */); err != nil { ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err) if retErr == nil { retErr = err } } } // Sync special files, which may be writable but do not use dentry shared // handles (so they won't be synced by the above). for _, sffd := range sffds { if err := sffd.sync(ctx, true /* forFilesystemSync */); err != nil { ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err) if retErr == nil { retErr = err } } } return retErr } // MaxFilenameLen is the maximum length of a filename. This is dictated by 9P's // encoding of strings, which uses 2 bytes for the length prefix. const MaxFilenameLen = (1 << 16) - 1 // dentrySlicePool is a pool of *[]*dentry used to store dentries for which // dentry.checkCachingLocked() must be called. The pool holds pointers to // slices because Go lacks generics, so sync.Pool operates on any, so // every call to (what should be) sync.Pool<[]*dentry>.Put() allocates a copy // of the slice header on the heap. var dentrySlicePool = sync.Pool{ New: func() any { ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity return &ds }, } func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { if ds == nil { ds = dentrySlicePool.Get().(*[]*dentry) } *ds = append(*ds, d) return ds } // Precondition: !parent.isSynthetic() && !child.isSynthetic(). func appendNewChildDentry(ds **[]*dentry, parent *dentry, child *dentry) { // The new child was added to parent and took a ref on the parent (hence // parent can be removed from cache). A new child has 0 refs for now. So // checkCachingLocked() should be called on both. Call it first on the parent // as it may create space in the cache for child to be inserted - hence // avoiding a cache eviction. *ds = appendDentry(*ds, parent) *ds = appendDentry(*ds, child) } // Preconditions: ds != nil. func putDentrySlice(ds *[]*dentry) { // Allow dentries to be GC'd. for i := range *ds { (*ds)[i] = nil } *ds = (*ds)[:0] dentrySlicePool.Put(ds) } // renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls // dentry.checkCachingLocked on all dentries in *dsp with fs.renameMu locked // for writing. // // dsp is a pointer-to-pointer since defer evaluates its arguments immediately, // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. // +checklocksreleaseread:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, dsp **[]*dentry) { fs.renameMu.RUnlock() if *dsp == nil { return } ds := **dsp for _, d := range ds { d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) } putDentrySlice(*dsp) } // +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() return } for _, d := range **ds { d.checkCachingLocked(ctx, true /* renameMuWriteLocked */) } fs.renameMu.Unlock() putDentrySlice(*ds) } // stepLocked resolves rp.Component() to an existing file, starting from the // given directory. 
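// Aside (illustrative, not part of the original file): the pointer-to-pointer
// dsp parameter above exists because defer evaluates its arguments when the
// defer statement executes, not when the deferred call runs:
//
//	var ds *[]*dentry
//	fs.renameMu.RLock()
//	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) // &ds is captured now...
//	ds = appendDentry(ds, d)                          // ...but this later value is still seen via the pointer
//
// Passing ds by value instead would hand the deferred call the nil slice
// pointer that ds held at defer time.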
// // Dentries which may become cached as a result of the traversal are appended // to *ds. // // Preconditions: // - fs.renameMu must be locked. // - d.opMu must be locked for reading. // - !rp.Done(). // - If !d.cachedMetadataAuthoritative(), then d and all children that are // part of rp must have been revalidated. // // +checklocksread:d.opMu func (fs *filesystem) stepLocked(ctx context.Context, rp resolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, bool, error) { if !d.isDir() { return nil, false, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, false, err } name := rp.Component() if name == "." { rp.Advance() return d, false, nil } if name == ".." { if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, false, err } else if isRoot || d.parent.Load() == nil { rp.Advance() return d, false, nil } if err := rp.CheckMount(ctx, &d.parent.Load().vfsd); err != nil { return nil, false, err } rp.Advance() return d.parent.Load(), false, nil } child, err := fs.getChildAndWalkPathLocked(ctx, d, rp, ds) if err != nil { return nil, false, err } if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, false, err } if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx, rp.Mount()) if err != nil { return nil, false, err } followedSymlink, err := rp.HandleSymlink(target) return d, followedSymlink, err } rp.Advance() return child, false, nil } // getChildLocked returns a dentry representing the child of parent with the // given name. Returns ENOENT if the child doesn't exist. // // Preconditions: // - fs.renameMu must be locked. // - parent.opMu must be locked. // - parent.isDir(). // - name is not "." or "..". // - parent and the dentry at name have been revalidated. // // +checklocks:parent.opMu func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if child, err := parent.getCachedChildLocked(name); child != nil || err != nil { return child, err } // We don't need to check for race here because parent.opMu is held for // writing. return fs.getRemoteChildLocked(ctx, parent, name, false /* checkForRace */, ds) } // getRemoteChildLocked is similar to getChildLocked, with the additional // precondition that the child identified by name does not exist in cache. // // If checkForRace argument is true, then this method will check to see if the // call has raced with another getRemoteChild call, and will handle the race if // so. // // Preconditions: // - If checkForRace is false, then parent.opMu must be held for writing. // - Otherwise, parent.opMu must be held for reading. // // Postcondition: The returned dentry is already cached appropriately. // // +checklocksread:parent.opMu func (fs *filesystem) getRemoteChildLocked(ctx context.Context, parent *dentry, name string, checkForRace bool, ds **[]*dentry) (*dentry, error) { child, err := parent.getRemoteChild(ctx, name) // Cache the result appropriately in the dentry tree. if err != nil { if linuxerr.Equals(linuxerr.ENOENT, err) { parent.childrenMu.Lock() defer parent.childrenMu.Unlock() parent.cacheNegativeLookupLocked(name) } return nil, err } parent.childrenMu.Lock() defer parent.childrenMu.Unlock() if checkForRace { // See if we raced with another getRemoteChild call that added // to the cache. if cachedChild, ok := parent.children[name]; ok && cachedChild != nil { // We raced. Destroy our child and return the cached // one. 
This child has no handles, no data, and has not // been cached, so destruction is quick and painless. child.destroyDisconnected(ctx) // All good. Return the cached child. return cachedChild, nil } // No race, continue with the child we got. } parent.cacheNewChildLocked(child, name) appendNewChildDentry(ds, parent, child) return child, nil } // getChildAndWalkPathLocked is the same as getChildLocked, except that it // may prefetch the entire path represented by rp. // // +checklocksread:parent.opMu func (fs *filesystem) getChildAndWalkPathLocked(ctx context.Context, parent *dentry, rp resolvingPath, ds **[]*dentry) (*dentry, error) { if child, err := parent.getCachedChildLocked(rp.Component()); child != nil || err != nil { return child, err } // dentry.getRemoteChildAndWalkPathLocked already handles dentry caching. return parent.getRemoteChildAndWalkPathLocked(ctx, rp, ds) } // getCachedChildLocked returns a child dentry if it was cached earlier. If no // cached child dentry exists, (nil, nil) is returned. // // Preconditions: // - fs.renameMu must be locked. // - d.opMu must be locked for reading. // - d.isDir(). // - name is not "." or "..". // - d and the dentry at name have been revalidated. // // +checklocksread:d.opMu func (d *dentry) getCachedChildLocked(name string) (*dentry, error) { if len(name) > MaxFilenameLen { return nil, linuxerr.ENAMETOOLONG } d.childrenMu.Lock() defer d.childrenMu.Unlock() if child, ok := d.children[name]; ok || d.isSynthetic() { if child == nil { return nil, linuxerr.ENOENT } return child, nil } if d.childrenSet != nil { // Is the child even there? Don't make RPC if not. if _, ok := d.childrenSet[name]; !ok { return nil, linuxerr.ENOENT } } return nil, nil } // walkParentDirLocked resolves all but the last path component of rp to an // existing directory, starting from the given directory (which is usually // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // // Preconditions: // - fs.renameMu must be locked. // - !rp.Done(). // - If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up // to date. func (fs *filesystem) walkParentDirLocked(ctx context.Context, vfsRP *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { rp := resolvingPathParent(vfsRP) if err := fs.revalidatePath(ctx, rp, d, ds); err != nil { return nil, err } for !rp.done() { d.opMu.RLock() next, followedSymlink, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) d.opMu.RUnlock() if err != nil { return nil, err } d = next if followedSymlink { if err := fs.revalidatePath(ctx, rp, d, ds); err != nil { return nil, err } } } if !d.isDir() { return nil, linuxerr.ENOTDIR } return d, nil } // resolveLocked resolves rp to an existing file. // // Preconditions: fs.renameMu must be locked. 
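// Aside (illustrative sketch, not part of the original file): the raced-lookup
// handling in getRemoteChildLocked above follows the usual double-checked cache
// pattern, simplified here to a single lock (mu, cache, and fresh are
// assumptions):
//
//	mu.Lock()
//	if cached, ok := cache[name]; ok && cached != nil {
//		mu.Unlock()
//		fresh.destroy(ctx) // we raced; discard our duplicate
//		return cached, nil
//	}
//	cache[name] = fresh
//	mu.Unlock()
//	return fresh, nil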
func (fs *filesystem) resolveLocked(ctx context.Context, vfsRP *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { rp := resolvingPathFull(vfsRP) d := rp.Start().Impl().(*dentry) if err := fs.revalidatePath(ctx, rp, d, ds); err != nil { return nil, err } for !rp.done() { d.opMu.RLock() next, followedSymlink, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) d.opMu.RUnlock() if err != nil { return nil, err } d = next if followedSymlink { if err := fs.revalidatePath(ctx, rp, d, ds); err != nil { return nil, err } } } if rp.MustBeDir() && !d.isDir() { return nil, linuxerr.ENOTDIR } return d, nil } // doCreateAt checks that creating a file at rp is permitted, then invokes // createInRemoteDir (if the parent directory is a real remote directory) or // createInSyntheticDir (if the parent directory is synthetic) to do so. // // Preconditions: // - !rp.Done(). // - For the final path component in rp, !rp.ShouldFollowSymlink(). func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) (*dentry, error), createInSyntheticDir func(parent *dentry, name string) (*dentry, error)) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return err } // Order of checks is important. First check if parent directory can be // executed, then check for existence, and lastly check if mount is writable. if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".." { return linuxerr.EEXIST } if parent.isDeleted() { return linuxerr.ENOENT } if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, name, &ds); err != nil { return err } parent.opMu.Lock() defer parent.opMu.Unlock() if len(name) > MaxFilenameLen { return linuxerr.ENAMETOOLONG } // Check for existence only if caching information is available. Otherwise, // don't check for existence just yet. We will check for existence if the // checks for writability fail below. Existence check is done by the creation // RPCs themselves. parent.childrenMu.Lock() if child, ok := parent.children[name]; ok && child != nil { parent.childrenMu.Unlock() return linuxerr.EEXIST } if parent.childrenSet != nil { if _, ok := parent.childrenSet[name]; ok { parent.childrenMu.Unlock() return linuxerr.EEXIST } } parent.childrenMu.Unlock() checkExistence := func() error { if child, err := fs.getChildLocked(ctx, parent, name, &ds); err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { return err } else if child != nil { return linuxerr.EEXIST } return nil } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { // Existence check takes precedence. if existenceErr := checkExistence(); existenceErr != nil { return existenceErr } return err } defer mnt.EndWrite() if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { // Existence check takes precedence. 
if existenceErr := checkExistence(); existenceErr != nil { return existenceErr } return err } if !dir && rp.MustBeDir() { return linuxerr.ENOENT } if parent.isSynthetic() { if createInSyntheticDir == nil { return linuxerr.EPERM } child, err := createInSyntheticDir(parent, name) if err != nil { return err } parent.childrenMu.Lock() parent.cacheNewChildLocked(child, name) parent.syntheticChildren++ parent.clearDirentsLocked() parent.childrenMu.Unlock() parent.touchCMtime() ev := linux.IN_CREATE if dir { ev |= linux.IN_ISDIR } parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } // No cached dentry exists; however, in InteropModeShared there might still be // an existing file at name. Just attempt the file creation RPC anyways. If a // file does exist, the RPC will fail with EEXIST like we would have. child, err := createInRemoteDir(parent, name, &ds) if err != nil { return err } parent.childrenMu.Lock() parent.cacheNewChildLocked(child, name) if child.isSynthetic() { parent.syntheticChildren++ ds = appendDentry(ds, parent) } else { appendNewChildDentry(&ds, parent, child) } if fs.opts.interop != InteropModeShared { if child, ok := parent.children[name]; ok && child == nil { // Delete the now-stale negative dentry. delete(parent.children, name) parent.negativeChildren-- } parent.clearDirentsLocked() parent.touchCMtime() } parent.childrenMu.Unlock() ev := linux.IN_CREATE if dir { ev |= linux.IN_ISDIR } parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } // Preconditions: !rp.Done(). func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error { var ds *[]*dentry fs.renameMu.RLock() // We need to DecRef outside of fs.renameMu because forgetting a dead // mountpoint could result in this filesystem being released which acquires // fs.renameMu. var toDecRef []refs.RefCounter defer func() { for _, ref := range toDecRef { ref.DecRef(ctx) } }() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return err } if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() name := rp.Component() if dir { if name == "." { return linuxerr.EINVAL } if name == ".." { return linuxerr.ENOTEMPTY } } else { if name == "." || name == ".." { return linuxerr.EISDIR } } vfsObj := rp.VirtualFilesystem() if err := fs.revalidateOne(ctx, vfsObj, parent, rp.Component(), &ds); err != nil { return err } mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) parent.opMu.Lock() defer parent.opMu.Unlock() parent.childrenMu.Lock() if parent.childrenSet != nil { if _, ok := parent.childrenSet[name]; !ok { parent.childrenMu.Unlock() return linuxerr.ENOENT } } parent.childrenMu.Unlock() // Load child if sticky bit is set because we need to determine whether // deletion is allowed. var child *dentry if parent.mode.Load()&linux.ModeSticky == 0 { var ok bool parent.childrenMu.Lock() child, ok = parent.children[name] parent.childrenMu.Unlock() if ok && child == nil { // Hit a negative cached entry, child doesn't exist. 
return linuxerr.ENOENT } } else { child, _, err = fs.stepLocked(ctx, resolvingPathFull(rp), parent, false /* mayFollowSymlinks */, &ds) if err != nil { return err } if err := parent.mayDelete(rp.Credentials(), child); err != nil { return err } } // If a child dentry exists, prepare to delete it. This should fail if it is // a mount point. We detect mount points by speculatively calling // PrepareDeleteDentry, which fails if child is a mount point. // // Also note that if child is nil, then it can't be a mount point. if child != nil { // Hold child.childrenMu so we can check child.children and // child.syntheticChildren. We don't access these fields until a bit later, // but locking child.childrenMu after calling vfs.PrepareDeleteDentry() would // create an inconsistent lock ordering between dentry.childrenMu and // vfs.Dentry.mu (in the VFS lock order, it would make dentry.childrenMu both "a // FilesystemImpl lock" and "a lock acquired by a FilesystemImpl between // PrepareDeleteDentry and CommitDeleteDentry). To avoid this, lock // child.childrenMu before calling PrepareDeleteDentry. child.childrenMu.Lock() defer child.childrenMu.Unlock() if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } } flags := uint32(0) // If a dentry exists, use it for best-effort checks on its deletability. if dir { if child != nil { // child must be an empty directory. if child.syntheticChildren != 0 { // +checklocksforce: child.childrenMu is held if child != nil. // This is definitely not an empty directory, irrespective of // fs.opts.interop. vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: PrepareDeleteDentry called if child != nil. return linuxerr.ENOTEMPTY } // If InteropModeShared is in effect and the first call to // PrepareDeleteDentry above succeeded, then child wasn't // revalidated (so we can't expect its file type to be correct) and // individually revalidating its children (to confirm that they // still exist) would be a waste of time. if child.cachedMetadataAuthoritative() { if !child.isDir() { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return linuxerr.ENOTDIR } for _, grandchild := range child.children { // +checklocksforce: child.childrenMu is held if child != nil. if grandchild != nil { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return linuxerr.ENOTEMPTY } } } } flags = linux.AT_REMOVEDIR } else { // child must be a non-directory file. if child != nil && child.isDir() { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return linuxerr.EISDIR } if rp.MustBeDir() { if child != nil { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } return linuxerr.ENOTDIR } } if parent.isSynthetic() { if child == nil { return linuxerr.ENOENT } } else if child == nil || !child.isSynthetic() { if err := parent.unlink(ctx, name, flags); err != nil { if child != nil { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } return err } } // Generate inotify events for rmdir or unlink. if dir { parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) } else { var cw *vfs.Watches if child != nil { cw = &child.watches } vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name) } parent.childrenMu.Lock() defer parent.childrenMu.Unlock() if child != nil { toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd) // +checklocksforce: see above. 
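// Aside (illustrative, not part of the original file): the deletion above uses
// the VFS two-phase protocol; stripped of the surrounding checks it has this
// shape (error paths abbreviated):
//
//	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
//		return err // e.g. child is a mount point
//	}
//	if err := parent.unlink(ctx, name, flags); err != nil {
//		vfsObj.AbortDeleteDentry(&child.vfsd) // roll back on remote failure
//		return err
//	}
//	toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd) // refs are dropped after unlocking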
child.setDeleted() if child.isSynthetic() { parent.syntheticChildren-- child.decRefNoCaching() } ds = appendDentry(ds, child) } parent.cacheNegativeLookupLocked(name) if parent.cachedMetadataAuthoritative() { parent.clearDirentsLocked() parent.touchCMtime() if dir { parent.decLinks() } } return nil } // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err } if err := d.checkPermissions(creds, ats); err != nil { return err } if ats.MayWrite() && rp.Mount().ReadOnly() { return linuxerr.EROFS } return nil } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } if opts.CheckSearchable { if !d.isDir() { return nil, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } d.IncRef() // Call d.checkCachingLocked() so it can be removed from the cache if needed. ds = appendDentry(ds, d) return &d.vfsd, nil } // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return nil, err } d.IncRef() // Call d.checkCachingLocked() so it can be removed from the cache if needed. ds = appendDentry(ds, d) return &d.vfsd, nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { err := fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if rp.Mount() != vd.Mount() { return nil, linuxerr.EXDEV } d := vd.Dentry().Impl().(*dentry) if d.isDir() { return nil, linuxerr.EPERM } gid := auth.KGID(d.gid.Load()) uid := auth.KUID(d.uid.Load()) mode := linux.FileMode(d.mode.Load()) if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil { return nil, err } if d.nlink.Load() == 0 { return nil, linuxerr.ENOENT } if d.nlink.Load() == math.MaxUint32 { return nil, linuxerr.EMLINK } if d.isSynthetic() { // TODO(gvisor.dev/issue/6739): Add synthetic file hard link support. return nil, linuxerr.EOPNOTSUPP } return parent.link(ctx, d, name) }, nil) if err == nil { // Success! vd.Dentry().Impl().(*dentry).incLinks() } return err } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { creds := rp.Credentials() return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*dentry, error) { // If the parent is a setgid directory, use the parent's GID // rather than the caller's and enable setgid. 
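// Aside (illustrative sketch, not part of the original file): the setgid rule
// applied just below in MkdirAt, and again in createAndOpenChildLocked,
// restated as a hypothetical helper. A child created in a S_ISGID directory
// inherits the directory's group, and a child directory also inherits the
// S_ISGID bit:
//
//	func inheritSetgid(parentMode uint32, parentGID, callerGID auth.KGID, mode linux.FileMode, isDir bool) (linux.FileMode, auth.KGID) {
//		if parentMode&linux.S_ISGID == 0 {
//			return mode, callerGID
//		}
//		if isDir {
//			mode |= linux.S_ISGID
//		}
//		return mode, parentGID
//	}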
kgid := creds.EffectiveKGID mode := opts.Mode if parent.mode.Load()&linux.S_ISGID != 0 { kgid = auth.KGID(parent.gid.Load()) mode |= linux.S_ISGID } child, err := parent.mkdir(ctx, name, mode, creds.EffectiveKUID, kgid) if err == nil { if fs.opts.interop != InteropModeShared { parent.incLinks() } child.forMountpoint = opts.ForSyntheticMountpoint return child, nil } if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) { return nil, err } ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err) child = fs.newSyntheticDentry(&createSyntheticOpts{ name: name, mode: linux.S_IFDIR | opts.Mode, kuid: creds.EffectiveKUID, kgid: creds.EffectiveKGID, }) if fs.opts.interop != InteropModeShared { parent.incLinks() } return child, nil }, func(parent *dentry, name string) (*dentry, error) { if !opts.ForSyntheticMountpoint { // Can't create non-synthetic files in synthetic directories. return nil, linuxerr.EPERM } child := fs.newSyntheticDentry(&createSyntheticOpts{ name: name, mode: linux.S_IFDIR | opts.Mode, kuid: creds.EffectiveKUID, kgid: creds.EffectiveKGID, }) parent.incLinks() return child, nil }) } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*dentry, error) { creds := rp.Credentials() if child, err := parent.mknod(ctx, name, creds, &opts); err == nil { return child, nil } else if !linuxerr.Equals(linuxerr.EPERM, err) { return nil, err } // EPERM means that gofer does not allow creating a socket or pipe. Fallback // to creating a synthetic one, i.e. one that is kept entirely in memory. // Check that we're not overriding an existing file with a synthetic one. _, _, err := fs.stepLocked(ctx, resolvingPathFull(rp), parent, false /* mayFollowSymlinks */, ds) // +checklocksforce: parent.opMu taken by doCreateAt. switch { case err == nil: // Step succeeded, another file exists. return nil, linuxerr.EEXIST case !linuxerr.Equals(linuxerr.ENOENT, err): // Schrödinger. File/Cat may or may not exist. return nil, err } switch opts.Mode.FileType() { case linux.S_IFSOCK: return fs.newSyntheticDentry(&createSyntheticOpts{ name: name, mode: opts.Mode, kuid: creds.EffectiveKUID, kgid: creds.EffectiveKGID, endpoint: opts.Endpoint, }), nil case linux.S_IFIFO: return fs.newSyntheticDentry(&createSyntheticOpts{ name: name, mode: opts.Mode, kuid: creds.EffectiveKUID, kgid: creds.EffectiveKGID, pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize), }), nil } // Retain error from gofer if synthetic file cannot be created internally. return nil, linuxerr.EPERM }, nil) } // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Reject O_TMPFILE, which is not supported; supporting it correctly in the // presence of other remote filesystem users requires remote filesystem // support, and it isn't clear that there's any way to implement this in // 9P. 
if opts.Flags&linux.O_TMPFILE != 0 { return nil, linuxerr.EOPNOTSUPP } mayCreate := opts.Flags&linux.O_CREAT != 0 mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) var ds *[]*dentry fs.renameMu.RLock() unlocked := false unlock := func() { if !unlocked { fs.renameMuRUnlockAndCheckCaching(ctx, &ds) unlocked = true } } defer unlock() start := rp.Start().Impl().(*dentry) if rp.Done() { // Reject attempts to open mount root directory with O_CREAT. if mayCreate && rp.MustBeDir() { return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST } if !start.cachedMetadataAuthoritative() { // Refresh dentry's attributes before opening. if err := start.updateMetadata(ctx); err != nil { return nil, err } } start.IncRef() defer start.DecRef(ctx) unlock() // start is intentionally not added to ds (which would remove it from the // cache) because doing so regresses performance in practice. return start.open(ctx, rp, &opts) } afterTrailingSymlink: parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return nil, err } // Check for search permission in the parent directory. if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. if mayCreate && rp.MustBeDir() { return nil, linuxerr.EISDIR } if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, rp.Component(), &ds); err != nil { return nil, err } // Determine whether or not we need to create a file. // NOTE(b/263297063): Don't hold opMu for writing here, to avoid // serializing OpenAt calls in the same directory in the common case // that the file exists. parent.opMu.RLock() child, followedSymlink, err := fs.stepLocked(ctx, resolvingPathFull(rp), parent, true /* mayFollowSymlinks */, &ds) parent.opMu.RUnlock() if followedSymlink { if mustCreate { // EEXIST must be returned if an existing symlink is opened with O_EXCL. return nil, linuxerr.EEXIST } if err != nil { // If followedSymlink && err != nil, then this symlink resolution error // must be handled by the VFS layer. return nil, err } start = parent goto afterTrailingSymlink } if linuxerr.Equals(linuxerr.ENOENT, err) && mayCreate { if parent.isSynthetic() { return nil, linuxerr.EPERM } // Take opMu for writing, but note that the file may have been // created by another goroutine since we checked for existence // a few lines ago. We must handle that case. parent.opMu.Lock() fd, createErr := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds) if !linuxerr.Equals(linuxerr.EEXIST, createErr) { // Either the creation was a success, or we got an // unexpected error. Either way we can return here. parent.opMu.Unlock() return fd, createErr } // We raced, and now the file exists. if mustCreate { parent.opMu.Unlock() return nil, linuxerr.EEXIST } // Step to the file again. Since we still hold opMu for // writing, there can't be a race here. child, _, err = fs.stepLocked(ctx, resolvingPathFull(rp), parent, false /* mayFollowSymlinks */, &ds) parent.opMu.Unlock() } if err != nil { return nil, err } if mustCreate { return nil, linuxerr.EEXIST } if rp.MustBeDir() && !child.isDir() { return nil, linuxerr.ENOTDIR } child.IncRef() defer child.DecRef(ctx) unlock() // child is intentionally not added to ds (which would remove it from the // cache) because doing so regresses performance in practice. return child.open(ctx, rp, &opts) } // Used to log a rejected fifo open, once. 
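// Aside (illustrative, not part of the original file): logRejectedFifoOpenOnce
// below uses sync.Once so the warning fires at most once per sandbox lifetime
// regardless of how many fifo opens are rejected. A standalone sketch of the
// same pattern (warnOnce and warnFifoRejected are hypothetical names):
//
//	var warnOnce sync.Once
//
//	func warnFifoRejected(name string) {
//		warnOnce.Do(func() {
//			log.Warningf("Rejecting attempt to open fifo/pipe %q; set --host-fifo=open to allow", name)
//		})
//	}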
var logRejectedFifoOpenOnce sync.Once // Preconditions: The caller must hold no locks (since opening pipes may block // indefinitely). func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if err := d.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } if !d.isSynthetic() { // renameMu is locked here because it is required by d.openHandle(), which // is called by d.ensureSharedHandle() and d.openSpecialFile() below. It is // also required by d.connect() which is called by // d.openSocketByConnecting(). Note that opening non-synthetic pipes may // block, renameMu is unlocked separately in d.openSpecialFile() for pipes. d.fs.renameMu.RLock() defer d.fs.renameMu.RUnlock() } trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG if trunc { // Lock metadataMu *while* we open a regular file with O_TRUNC because // open(2) will change the file size on server. d.metadataMu.Lock() defer d.metadataMu.Unlock() } var vfd *vfs.FileDescription var err error mnt := rp.Mount() switch d.fileType() { case linux.S_IFREG: if !d.fs.opts.regularFilesUseSpecialFileFD { if err := d.ensureSharedHandle(ctx, ats.MayRead(), ats.MayWrite(), trunc); err != nil { return nil, err } fd, err := newRegularFileFD(mnt, d, opts.Flags) if err != nil { return nil, err } vfd = &fd.vfsfd } case linux.S_IFDIR: // Can't open directories with O_CREAT. if opts.Flags&linux.O_CREAT != 0 { return nil, linuxerr.EISDIR } // Can't open directories writably. if ats&vfs.MayWrite != 0 { return nil, linuxerr.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL } if !d.isSynthetic() { if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { return nil, err } } fd := &directoryFD{} fd.LockFD.Init(&d.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } if d.readFD.Load() >= 0 { fsmetric.GoferOpensHost.Increment() } else { fsmetric.GoferOpens9P.Increment() } return &fd.vfsfd, nil case linux.S_IFLNK: // Can't open symlinks without O_PATH, which is handled at the VFS layer. return nil, linuxerr.ELOOP case linux.S_IFSOCK: if d.isSynthetic() { return nil, linuxerr.ENXIO } if d.fs.iopts.OpenSocketsByConnecting { return d.openSocketByConnecting(ctx, opts) } case linux.S_IFIFO: if d.isSynthetic() { return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks) } if d.fs.opts.disableFifoOpen { logRejectedFifoOpenOnce.Do(func() { log.Warningf("Rejecting attempt to open fifo/pipe from host filesystem: %q. If you want to allow this, set flag --host-fifo=open", d.name) }) return nil, linuxerr.EPERM } } if vfd == nil { if vfd, err = d.openSpecialFile(ctx, mnt, opts); err != nil { return nil, err } } if trunc { // If no errors occurred so far then update file size in memory. This // step is required even if !d.cachedMetadataAuthoritative() because // d.mappings has to be updated. // d.metadataMu has already been acquired if trunc == true. d.updateSizeLocked(0) if d.cachedMetadataAuthoritative() { d.touchCMtimeLocked() } } return vfd, err } // Precondition: fs.renameMu is locked. func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL } // Note that special value of linux.SockType = 0 is interpreted by lisafs // as "do not care about the socket type". 
Analogous to p9.AnonymousSocket. sockFD, err := d.connect(ctx, 0 /* sockType */) if err != nil { return nil, err } fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), sockFD, &host.NewFDOptions{ HaveFlags: true, Flags: opts.Flags, }) if err != nil { unix.Close(sockFD) return nil, err } return fd, nil } // Preconditions: // - !d.isSynthetic(). // - fs.renameMu is locked. It may be released temporarily while pipe blocks. // - If d is a pipe, no other locks (other than fs.renameMu) should be held. func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if opts.Flags&linux.O_DIRECT != 0 && !d.isRegularFile() { return nil, linuxerr.EINVAL } // We assume that the server silently inserts O_NONBLOCK in the open flags // for all named pipes (because all existing gofers do this). // // NOTE(b/133875563): This makes named pipe opens racy, because the // mechanisms for translating nonblocking to blocking opens can only detect // the instantaneous presence of a peer holding the other end of the pipe // open, not whether the pipe was *previously* opened by a peer that has // since closed its end. isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0 retry: h, err := d.openHandle(ctx, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) if err != nil { if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && linuxerr.Equals(linuxerr.ENXIO, err) { // An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails // with ENXIO if opening the same named pipe with O_WRONLY would // block because there are no readers of the pipe. Release renameMu // while blocking. d.fs.renameMu.RUnlock() err := sleepBetweenNamedPipeOpenChecks(ctx) d.fs.renameMu.RLock() if err != nil { return nil, err } goto retry } return nil, err } if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 { // Release renameMu while blocking. d.fs.renameMu.RUnlock() err := blockUntilNonblockingPipeHasWriter(ctx, h.fd) d.fs.renameMu.RLock() if err != nil { h.close(ctx) return nil, err } } fd, err := newSpecialFileFD(h, mnt, d, opts.Flags) if err != nil { h.close(ctx) return nil, err } return &fd.vfsfd, nil } // Preconditions: // - d.fs.renameMu must be locked. // - d.opMu must be locked for writing. // - !d.isSynthetic(). // // +checklocks:d.opMu func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if d.isDeleted() { return nil, linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return nil, err } defer mnt.EndWrite() creds := rp.Credentials() name := rp.Component() // If the parent is a setgid directory, use the parent's GID rather // than the caller's. kgid := creds.EffectiveKGID if d.mode.Load()&linux.S_ISGID != 0 { kgid = auth.KGID(d.gid.Load()) } child, h, err := d.openCreate(ctx, name, opts.Flags&linux.O_ACCMODE, opts.Mode, creds.EffectiveKUID, kgid) if err != nil { return nil, err } // Incorporate the fid that was opened by lcreate. 
useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD if useRegularFileFD { var readable, writable bool child.handleMu.Lock() if vfs.MayReadFileWithOpenFlags(opts.Flags) { readable = true if h.fd != -1 { child.readFD = atomicbitops.FromInt32(h.fd) child.mmapFD = atomicbitops.FromInt32(h.fd) } } if vfs.MayWriteFileWithOpenFlags(opts.Flags) { writable = true child.writeFD = atomicbitops.FromInt32(h.fd) } child.updateHandles(ctx, h, readable, writable) child.handleMu.Unlock() } // Insert the dentry into the tree. d.childrenMu.Lock() // We have d.opMu for writing, so there can not be a cached child with // this name. We could not have raced. d.cacheNewChildLocked(child, name) appendNewChildDentry(ds, d, child) if d.cachedMetadataAuthoritative() { d.touchCMtime() d.clearDirentsLocked() } d.childrenMu.Unlock() // Finally, construct a file description representing the created file. var childVFSFD *vfs.FileDescription if useRegularFileFD { fd, err := newRegularFileFD(mnt, child, opts.Flags) if err != nil { return nil, err } childVFSFD = &fd.vfsfd } else { fd, err := newSpecialFileFD(h, mnt, child, opts.Flags) if err != nil { h.close(ctx) return nil, err } childVFSFD = &fd.vfsfd } d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) return childVFSFD, nil } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err } if !d.isSymlink() { return "", linuxerr.EINVAL } return d.readlink(ctx, rp.Mount()) } // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { // Resolve newParent first to verify that it's on this Mount. var ds *[]*dentry fs.renameMu.Lock() // We need to DecRef outside of fs.mu because forgetting a dead mountpoint // could result in this filesystem being released which acquires fs.mu. var toDecRef []refs.RefCounter defer func() { for _, ref := range toDecRef { ref.DecRef(ctx) } }() defer fs.renameMuUnlockAndCheckCaching(ctx, &ds) newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) if err != nil { return err } if opts.Flags&^linux.RENAME_NOREPLACE != 0 { return linuxerr.EINVAL } if fs.opts.interop == InteropModeShared && opts.Flags&linux.RENAME_NOREPLACE != 0 { // Requires 9P support to synchronize with other remote filesystem // users. return linuxerr.EINVAL } newName := rp.Component() if newName == "." || newName == ".." 
{ if opts.Flags&linux.RENAME_NOREPLACE != 0 { return linuxerr.EEXIST } return linuxerr.EBUSY } if len(newName) > MaxFilenameLen { return linuxerr.ENAMETOOLONG } mnt := rp.Mount() if mnt != oldParentVD.Mount() { return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() oldParent := oldParentVD.Dentry().Impl().(*dentry) if !oldParent.cachedMetadataAuthoritative() { if err := oldParent.updateMetadata(ctx); err != nil { return err } } creds := rp.Credentials() if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { return err } vfsObj := rp.VirtualFilesystem() if err := fs.revalidateOne(ctx, vfsObj, newParent, newName, &ds); err != nil { return err } if err := fs.revalidateOne(ctx, vfsObj, oldParent, oldName, &ds); err != nil { return err } // We need a dentry representing the renamed file since, if it's a // directory, we need to check for write permission on it. oldParent.opMu.Lock() defer oldParent.opMu.Unlock() renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds) if err != nil { return err } if err := oldParent.mayDelete(creds, renamed); err != nil { return err } if renamed.isDir() { if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { return linuxerr.EINVAL } if oldParent != newParent { if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { return err } } } else { if opts.MustBeDir || rp.MustBeDir() { return linuxerr.ENOTDIR } } if oldParent != newParent { if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { return err } newParent.opMu.Lock() defer newParent.opMu.Unlock() } if newParent.isDeleted() { return linuxerr.ENOENT } replaced, err := fs.getChildLocked(ctx, newParent, newName, &ds) // +checklocksforce: newParent.opMu taken if newParent != oldParent. if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { return err } var replacedVFSD *vfs.Dentry if replaced != nil { if opts.Flags&linux.RENAME_NOREPLACE != 0 { return linuxerr.EEXIST } replacedVFSD = &replaced.vfsd if replaced.isDir() { if !renamed.isDir() { return linuxerr.EISDIR } if genericIsAncestorDentry(replaced, renamed) { return linuxerr.ENOTEMPTY } } else { if rp.MustBeDir() || renamed.isDir() { return linuxerr.ENOTDIR } } } if oldParent == newParent && oldName == newName { return nil } mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } // Update the remote filesystem. if !renamed.isSynthetic() { if err := oldParent.rename(ctx, oldName, newParent, newName); err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) return err } } else if replaced != nil && !replaced.isSynthetic() { // We are replacing an existing real file with a synthetic one, so we // need to unlink the former. flags := uint32(0) if replaced.isDir() { flags = linux.AT_REMOVEDIR } if err := newParent.unlink(ctx, newName, flags); err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) return err } } // Update the dentry tree. newParent.childrenMu.Lock() defer newParent.childrenMu.Unlock() if oldParent != newParent { oldParent.childrenMu.Lock() defer oldParent.childrenMu.Unlock() } toDecRef = vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) if replaced != nil { replaced.setDeleted() if replaced.isSynthetic() { newParent.syntheticChildren-- replaced.decRefNoCaching() } ds = appendDentry(ds, replaced) // Remove the replaced entry from its parent's cache. 
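// Editorial note (not part of the upstream source): the lock acquisition
// order used by RenameAt above, consistent with the package-level lock
// ordering, is roughly:
//
//	fs.renameMu.Lock()          // serializes renames with path resolution
//	oldParent.opMu.Lock()       // tree operations on the old parent
//	newParent.opMu.Lock()       // only taken if newParent != oldParent
//	newParent.childrenMu.Lock() // cached-children state, taken last
//	oldParent.childrenMu.Lock() // only taken if oldParent != newParent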
delete(newParent.children, newName) } oldParent.cacheNegativeLookupLocked(oldName) // +checklocksforce: oldParent.childrenMu is held if oldParent != newParent. if renamed.isSynthetic() { oldParent.syntheticChildren-- newParent.syntheticChildren++ } // We have d.opMu for writing, so no need to check for existence of a // child with the given name. We could not have raced. newParent.cacheNewChildLocked(renamed, newName) oldParent.decRefNoCaching() if oldParent != newParent { ds = appendDentry(ds, newParent) ds = appendDentry(ds, oldParent) } // Update metadata. if renamed.cachedMetadataAuthoritative() { renamed.touchCtime() } if oldParent.cachedMetadataAuthoritative() { oldParent.clearDirentsLocked() oldParent.touchCMtime() if renamed.isDir() { oldParent.decLinks() } } if newParent.cachedMetadataAuthoritative() { newParent.clearDirentsLocked() newParent.touchCMtime() if renamed.isDir() && (replaced == nil || !replaced.isDir()) { // Increase the link count if we did not replace another directory. newParent.incLinks() } } vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir()) return nil } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { return fs.unlinkAt(ctx, rp, true /* dir */) } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } err = d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()) fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if err != nil { return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err } // Since walking updates metadata for all traversed dentries under // InteropModeShared, including the returned one, we can return cached // metadata here regardless of fs.opts.interop. var stat linux.Statx d.statTo(&stat) return stat, nil } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statfs{}, err } // If d is synthetic, invoke statfs on the first ancestor of d that isn't. for d.isSynthetic() { d = d.parent.Load() } statfs, err := d.statfs(ctx) if err != nil { return linux.Statfs{}, err } if statfs.NameLength == 0 || statfs.NameLength > MaxFilenameLen { statfs.NameLength = MaxFilenameLen } // This is primarily for distinguishing a gofer file system in // tests. Testing is important, so instead of defining // something completely random, use a standard value. statfs.Type = linux.V9FS_MAGIC return statfs, nil } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 
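// Editorial sketch (not part of the upstream source): when cached metadata is
// authoritative, the link-count bookkeeping above reflects that a renamed
// directory's ".." entry changes parents:
//
//	if renamed.isDir() {
//		oldParent.decLinks() // ".." in renamed no longer refers to oldParent
//		if replaced == nil || !replaced.isDir() {
//			newParent.incLinks() // unless a directory was replaced, newParent gains a link
//		}
//	}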
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*dentry, error) { child, err := parent.symlink(ctx, name, target, rp.Credentials()) if err != nil { return nil, err } if parent.fs.opts.interop != InteropModeShared { // Cache the symlink target on creation. In practice, this helps avoid a // lot of ReadLink RPCs. Note that when InteropModeShared is in effect, // we are forced to make Readlink RPCs. Because in this mode, we use host // timestamps, not timestamps based on our internal clock. And readlink // updates the atime on the host. child.haveTarget = true child.target = target } return child, nil }, nil) } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { return fs.unlinkAt(ctx, rp, false /* dir */) } // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if !d.isSocket() { return nil, linuxerr.ECONNREFUSED } if d.endpoint != nil { return d.endpoint, nil } if !d.isSynthetic() { d.IncRef() ds = appendDentry(ds, d) return &endpoint{ dentry: d, path: opts.Addr, }, nil } return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } return d.listXattr(ctx, size) } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err } return d.getXattr(ctx, rp.Credentials(), &opts) } // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } err = d.setXattr(ctx, rp.Credentials(), &opts) fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } err = d.removeXattr(ctx, rp.Credentials(), name) fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // PrependPath implements vfs.FilesystemImpl.PrependPath. 
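// Editorial sketch (not part of the upstream source): the read-only *At
// methods above (ReadlinkAt, StatAt, ListXattrAt, GetXattrAt, ...) share the
// same shape; resolution runs under renameMu so reachable dentries stay
// valid, and dentries collected in ds are considered for caching on release.
// zeroValue and operateOn below are illustrative placeholders:
//
//	var ds *[]*dentry
//	fs.renameMu.RLock()
//	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
//	d, err := fs.resolveLocked(ctx, rp, &ds)
//	if err != nil {
//		return zeroValue, err
//	}
//	return operateOn(ctx, d) // e.g. d.statTo, d.listXattr, d.getXattr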
func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.renameMu.RLock() defer fs.renameMu.RUnlock() return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } type mopt struct { key string value any } func (m mopt) String() string { if m.value == nil { return fmt.Sprintf("%s", m.key) } return fmt.Sprintf("%s=%v", m.key, m.value) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { optsKV := []mopt{ {moptTransport, transportModeFD}, // Only valid value, currently. {moptReadFD, fs.opts.fd}, // Currently, read and write FD are the same. {moptWriteFD, fs.opts.fd}, // Currently, read and write FD are the same. {moptAname, fs.opts.aname}, {moptDfltUID, fs.opts.dfltuid}, {moptDfltGID, fs.opts.dfltgid}, } if globalDentryCache != nil { optsKV = append(optsKV, mopt{moptDcache, fmt.Sprintf("%d-global", globalDentryCache.maxCachedDentries)}) } else { optsKV = append(optsKV, mopt{moptDcache, fs.opts.dcache}) } switch fs.opts.interop { case InteropModeExclusive: optsKV = append(optsKV, mopt{moptCache, cacheFSCache}) case InteropModeWritethrough: optsKV = append(optsKV, mopt{moptCache, cacheFSCacheWritethrough}) case InteropModeShared: optsKV = append(optsKV, mopt{moptCache, cacheRemoteRevalidating}) } if fs.opts.regularFilesUseSpecialFileFD { optsKV = append(optsKV, mopt{moptDisableFileHandleSharing, nil}) } if fs.opts.disableFifoOpen { optsKV = append(optsKV, mopt{moptDisableFifoOpen, nil}) } if fs.opts.forcePageCache { optsKV = append(optsKV, mopt{moptForcePageCache, nil}) } if fs.opts.limitHostFDTranslation { optsKV = append(optsKV, mopt{moptLimitHostFDTranslation, nil}) } if fs.opts.overlayfsStaleRead { optsKV = append(optsKV, mopt{moptOverlayfsStaleRead, nil}) } if fs.opts.directfs.enabled { optsKV = append(optsKV, mopt{moptDirectfs, nil}) } opts := make([]string, 0, len(optsKV)) for _, opt := range optsKV { opts = append(opts, opt.String()) } return strings.Join(opts, ",") } // IsDescendant implements vfs.FilesystemImpl.IsDescendant. func (fs *filesystem) IsDescendant(vfsroot, vd vfs.VirtualDentry) bool { return genericIsDescendant(vfsroot.Dentry(), vd.Dentry().Impl().(*dentry)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/fstree.go000066400000000000000000000036701465435605700251710ustar00rootroot00000000000000package gofer import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // We need to define an interface instead of using atomic.Pointer because // the Dentry type gets removed during code generation and the compiler // complains about the unused sync/atomic type. type genericatomicptr interface { Load() *dentry } // IsAncestorDentry returns true if d is an ancestor of d2; that is, d is // either d2's parent or an ancestor of d2's parent. func genericIsAncestorDentry(d, d2 *dentry) bool { for d2 != nil { parent := d2.parent.Load() if parent == d { return true } if parent == d2 { return false } d2 = parent } return false } // IsDescendant returns true if vd is a descendant of vfsroot or if vd and // vfsroot are the same dentry. func genericIsDescendant(vfsroot *vfs.Dentry, d *dentry) bool { for d != nil && &d.vfsd != vfsroot { d = d.parent.Load() } return d != nil } // ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d. 
func genericParentOrSelf(d *dentry) *dentry { if parent := d.parent.Load(); parent != nil { return parent } return d } // PrependPath is a generic implementation of FilesystemImpl.PrependPath(). func genericPrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *dentry, b *fspath.Builder) error { for { if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { return vfs.PrependPathAtVFSRootError{} } if mnt != nil && &d.vfsd == mnt.Root() { return nil } parent := d.parent.Load() if parent == nil { return vfs.PrependPathAtNonMountRootError{} } b.PrependComponent(d.name) d = parent } } // DebugPathname returns a pathname to d relative to its filesystem root. // DebugPathname does not correspond to any Linux function; it's used to // generate dentry pathnames for debugging. func genericDebugPathname(d *dentry) string { var b fspath.Builder _ = genericPrependPath(vfs.VirtualDentry{}, nil, d, &b) return b.String() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/gofer.go000066400000000000000000002273201465435605700250030ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package gofer provides a filesystem implementation that is backed by a 9p // server, interchangeably referred to as "gofers" throughout this package. // // Lock order: // // regularFileFD/directoryFD.mu // filesystem.renameMu // dentry.cachingMu // dentryCache.mu // dentry.opMu // dentry.childrenMu // filesystem.syncMu // dentry.metadataMu // *** "memmap.Mappable locks" below this point // dentry.mapsMu // *** "memmap.Mappable locks taken by Translate" below this point // dentry.handleMu // dentry.dataMu // filesystem.inoMu // specialFileFD.mu // specialFileFD.bufMu // // Locking dentry.opMu and dentry.metadataMu in multiple dentries requires that // either ancestor dentries are locked before descendant dentries, or that // filesystem.renameMu is locked for writing. package gofer import ( "fmt" "path" "strconv" "strings" "sync/atomic" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" fslock "gvisor.dev/gvisor/pkg/sentry/fsimpl/lock" "gvisor.dev/gvisor/pkg/sentry/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) // Name is the default filesystem name. const Name = "9p" // Mount option names for goferfs. 
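// Editorial example (not part of the upstream source): these option names are
// supplied to the mount as a comma-separated string. The FD number below is
// hypothetical and must name an already-connected lisafs socket, with rfdno
// and wfdno equal:
//
//	trans=fd,rfdno=7,wfdno=7,aname=/,cache=remote_revalidating,dcache=1000
//
// Presence-only options such as force_page_cache are appended without values.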
const ( moptTransport = "trans" moptReadFD = "rfdno" moptWriteFD = "wfdno" moptAname = "aname" moptDfltUID = "dfltuid" moptDfltGID = "dfltgid" moptCache = "cache" moptDcache = "dcache" moptForcePageCache = "force_page_cache" moptLimitHostFDTranslation = "limit_host_fd_translation" moptOverlayfsStaleRead = "overlayfs_stale_read" moptDisableFileHandleSharing = "disable_file_handle_sharing" moptDisableFifoOpen = "disable_fifo_open" // Directfs options. moptDirectfs = "directfs" ) // Valid values for the "cache" mount option. const ( cacheFSCache = "fscache" cacheFSCacheWritethrough = "fscache_writethrough" cacheRemoteRevalidating = "remote_revalidating" ) // SupportedMountOptions is the set of mount options that can be set externally. var SupportedMountOptions = []string{moptOverlayfsStaleRead, moptDisableFileHandleSharing, moptDcache} const ( defaultMaxCachedDentries = 1000 maxCachedNegativeChildren = 1000 ) // stringFixedCache is a fixed sized cache, once initialized, // its size never changes. // // +stateify savable type stringFixedCache struct { // namesList stores negative names with fifo list. // name stored in namesList only means it used to be negative // at the moment you pushed it to the list. namesList stringList size uint64 } func (cache *stringFixedCache) isInited() bool { return cache.size != 0 } func (cache *stringFixedCache) init(size uint64) { elements := make([]stringListElem, size) for i := uint64(0); i < size; i++ { cache.namesList.PushFront(&elements[i]) } cache.size = size } // Update will push name to the front of the list, // and pop the tail value. func (cache *stringFixedCache) add(name string) string { tail := cache.namesList.Back() victimName := tail.str tail.str = name cache.namesList.Remove(tail) cache.namesList.PushFront(tail) return victimName } // +stateify savable type dentryCache struct { // maxCachedDentries is the maximum number of cacheable dentries. // maxCachedDentries is immutable. maxCachedDentries uint64 // mu protects the below fields. mu sync.Mutex `state:"nosave"` // dentries contains all dentries with 0 references. Due to race conditions, // it may also contain dentries with non-zero references. dentries dentryList // dentriesLen is the number of dentries in dentries. dentriesLen uint64 } // SetDentryCacheSize sets the size of the global gofer dentry cache. func SetDentryCacheSize(size int) { if size < 0 { return } if globalDentryCache != nil { log.Warningf("Global dentry cache has already been initialized. Ignoring subsequent attempt.") return } globalDentryCache = &dentryCache{maxCachedDentries: uint64(size)} } // globalDentryCache is a global cache of dentries across all gofer clients. var globalDentryCache *dentryCache // Valid values for "trans" mount option. const transportModeFD = "fd" // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. // // +stateify savable type filesystem struct { vfsfs vfs.Filesystem // mf is used to allocate memory that caches regular file contents. mf is // immutable. mf *pgalloc.MemoryFile `state:"nosave"` // Immutable options. opts filesystemOptions iopts InternalFilesystemOptions // client is the LISAFS client used for communicating with the server. client // is immutable. client *lisafs.Client `state:"nosave"` // clock is a realtime clock used to set timestamps in file operations. clock ktime.Clock // devMinor is the filesystem's minor device number. devMinor is immutable. devMinor uint32 // root is the root dentry. 
root is immutable. root *dentry // renameMu serves two purposes: // // - It synchronizes path resolution with renaming initiated by this // client. // // - It is held by path resolution to ensure that reachable dentries remain // valid. A dentry is reachable by path resolution if it has a non-zero // reference count (such that it is usable as vfs.ResolvingPath.Start() or // is reachable from its children), or if it is a child dentry (such that // it is reachable from its parent). renameMu sync.RWMutex `state:"nosave"` dentryCache *dentryCache // syncableDentries contains all non-synthetic dentries. specialFileFDs // contains all open specialFileFDs. These fields are protected by syncMu. syncMu sync.Mutex `state:"nosave"` syncableDentries dentryList specialFileFDs specialFDList // inoByKey maps previously-observed device ID and host inode numbers to // internal inode numbers assigned to those files. inoByKey is not preserved // across checkpoint/restore because inode numbers may be reused between // different gofer processes, so inode numbers may be repeated for different // files across checkpoint/restore. inoByKey is protected by inoMu. inoMu sync.Mutex `state:"nosave"` inoByKey map[inoKey]uint64 `state:"nosave"` // lastIno is the last inode number assigned to a file. lastIno is accessed // using atomic memory operations. lastIno atomicbitops.Uint64 // savedDentryRW records open read/write handles during save/restore. savedDentryRW map[*dentry]savedDentryRW // released is nonzero once filesystem.Release has been called. released atomicbitops.Int32 } // +stateify savable type filesystemOptions struct { fd int aname string interop InteropMode // derived from the "cache" mount option dfltuid auth.KUID dfltgid auth.KGID // dcache is the maximum number of dentries that can be cached. This is // effective only if globalDentryCache is not being used. dcache uint64 // If forcePageCache is true, host FDs may not be used for application // memory mappings even if available; instead, the client must perform its // own caching of regular file pages. This is primarily useful for testing. forcePageCache bool // If limitHostFDTranslation is true, apply maxFillRange() constraints to // host FD mappings returned by dentry.(memmap.Mappable).Translate(). This // makes memory accounting behavior more consistent between cases where // host FDs are / are not available, but may increase the frequency of // sentry-handled page faults on files for which a host FD is available. limitHostFDTranslation bool // If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote // filesystem may not be coherent with writable host FDs opened later, so // all uses of the former must be replaced by uses of the latter. This is // usually only the case when the remote filesystem is a Linux overlayfs // mount. (Prior to Linux 4.18, patch series centered on commit // d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were // incoherent between pre-copy-up and post-copy-up FDs; after that patch // series, only memory mappings are incoherent.) overlayfsStaleRead bool // If regularFilesUseSpecialFileFD is true, application FDs representing // regular files will use distinct file handles for each FD, in the same // way that application FDs representing "special files" such as sockets // do. Note that this disables client caching for regular files. This option // may regress performance due to excessive Open RPCs. This option is not // supported with overlayfsStaleRead for now. 
regularFilesUseSpecialFileFD bool // If disableFifoOpen is true, application attempts to open(2) a host FIFO // are disallowed. disableFifoOpen bool // directfs holds options for directfs mode. directfs directfsOpts } // +stateify savable type directfsOpts struct { // If directfs is enabled, the gofer client does not make RPCs to the gofer // process. Instead, it makes host syscalls to perform file operations. enabled bool } // InteropMode controls the client's interaction with other remote filesystem // users. // // +stateify savable type InteropMode uint32 const ( // InteropModeExclusive is appropriate when the filesystem client is the // only user of the remote filesystem. // // - The client may cache arbitrary filesystem state (file data, metadata, // filesystem structure, etc.). // // - Client changes to filesystem state may be sent to the remote // filesystem asynchronously, except when server permission checks are // necessary. // // - File timestamps are based on client clocks. This ensures that users of // the client observe timestamps that are coherent with their own clocks // and consistent with Linux's semantics (in particular, it is not always // possible for clients to set arbitrary atimes and mtimes depending on the // remote filesystem implementation, and never possible for clients to set // arbitrary ctimes.) InteropModeExclusive InteropMode = iota // InteropModeWritethrough is appropriate when there are read-only users of // the remote filesystem that expect to observe changes made by the // filesystem client. // // - The client may cache arbitrary filesystem state. // // - Client changes to filesystem state must be sent to the remote // filesystem synchronously. // // - File timestamps are based on client clocks. As a corollary, access // timestamp changes from other remote filesystem users will not be visible // to the client. InteropModeWritethrough // InteropModeShared is appropriate when there are users of the remote // filesystem that may mutate its state other than the client. // // - The client must verify ("revalidate") cached filesystem state before // using it. // // - Client changes to filesystem state must be sent to the remote // filesystem synchronously. // // - File timestamps are based on server clocks. This is necessary to // ensure that timestamp changes are synchronized between remote filesystem // users. // // Note that the correctness of InteropModeShared depends on the server // correctly implementing 9P fids (i.e. each fid immutably represents a // single filesystem object), even in the presence of remote filesystem // mutations from other users. If this is violated, the behavior of the // client is undefined. InteropModeShared ) // InternalFilesystemOptions may be passed as // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. // // +stateify savable type InternalFilesystemOptions struct { // If UniqueID is non-empty, it is an opaque string used to reassociate the // filesystem with a new server FD during restoration from checkpoint. UniqueID vfs.RestoreID // If LeakConnection is true, do not close the connection to the server // when the Filesystem is released. This is necessary for deployments in // which servers can handle only a single client and report failure if that // client disconnects. LeakConnection bool // If OpenSocketsByConnecting is true, silently translate attempts to open // files identifying as sockets to connect RPCs. 
OpenSocketsByConnecting bool } // _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default // UIDs and GIDs used for files that do not provide a specific owner or group // respectively. const ( // uint32(-2) doesn't work in Go. _V9FS_DEFUID = auth.KUID(4294967294) _V9FS_DEFGID = auth.KGID(4294967294) ) // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // Release implements vfs.FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mf := pgalloc.MemoryFileFromContext(ctx) if mf == nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: CtxMemoryFile is nil") return nil, nil, linuxerr.EINVAL } mopts := vfs.GenericParseMountOptions(opts.Data) var fsopts filesystemOptions fd, err := getFDFromMountOptionsMap(ctx, mopts) if err != nil { return nil, nil, err } fsopts.fd = fd // Get the attach name. fsopts.aname = "/" if aname, ok := mopts[moptAname]; ok { delete(mopts, moptAname) if !path.IsAbs(aname) { ctx.Warningf("gofer.FilesystemType.GetFilesystem: aname is not absolute: %s=%s", moptAname, aname) return nil, nil, linuxerr.EINVAL } fsopts.aname = path.Clean(aname) } // Parse the cache policy. For historical reasons, this defaults to the // least generally-applicable option, InteropModeExclusive. fsopts.interop = InteropModeExclusive if cache, ok := mopts[moptCache]; ok { delete(mopts, moptCache) switch cache { case cacheFSCache: fsopts.interop = InteropModeExclusive case cacheFSCacheWritethrough: fsopts.interop = InteropModeWritethrough case cacheRemoteRevalidating: fsopts.interop = InteropModeShared default: ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: %s=%s", moptCache, cache) return nil, nil, linuxerr.EINVAL } } // Parse the dentry cache size. fsopts.dcache = defaultMaxCachedDentries if dcacheStr, ok := mopts[moptDcache]; ok { delete(mopts, moptDcache) dcache, err := strconv.ParseInt(dcacheStr, 10, 64) if err != nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dcache: %s=%s", moptDcache, dcacheStr) return nil, nil, linuxerr.EINVAL } if dcache >= 0 { fsopts.dcache = uint64(dcache) } } // Parse the default UID and GID. fsopts.dfltuid = _V9FS_DEFUID if dfltuidstr, ok := mopts[moptDfltUID]; ok { delete(mopts, moptDfltUID) dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32) if err != nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltUID, dfltuidstr) return nil, nil, linuxerr.EINVAL } // In Linux, dfltuid is interpreted as a UID and is converted to a KUID // in the caller's user namespace, but goferfs isn't // application-mountable. fsopts.dfltuid = auth.KUID(dfltuid) } fsopts.dfltgid = _V9FS_DEFGID if dfltgidstr, ok := mopts[moptDfltGID]; ok { delete(mopts, moptDfltGID) dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32) if err != nil { ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltGID, dfltgidstr) return nil, nil, linuxerr.EINVAL } fsopts.dfltgid = auth.KGID(dfltgid) } // Handle simple flags. 
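// Editorial note (not part of the upstream source): the "simple flags" below
// are presence-only options. vfs.GenericParseMountOptions maps valueless
// options to empty strings, so an input such as
//
//	"trans=fd,rfdno=7,wfdno=7,overlayfs_stale_read"
//
// roughly yields
//
//	map[string]string{"trans": "fd", "rfdno": "7", "wfdno": "7", "overlayfs_stale_read": ""}
//
// and each branch only checks for the key's presence before deleting it.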
if _, ok := mopts[moptDisableFileHandleSharing]; ok { delete(mopts, moptDisableFileHandleSharing) fsopts.regularFilesUseSpecialFileFD = true } if _, ok := mopts[moptDisableFifoOpen]; ok { delete(mopts, moptDisableFifoOpen) fsopts.disableFifoOpen = true } if _, ok := mopts[moptForcePageCache]; ok { delete(mopts, moptForcePageCache) fsopts.forcePageCache = true } if _, ok := mopts[moptLimitHostFDTranslation]; ok { delete(mopts, moptLimitHostFDTranslation) fsopts.limitHostFDTranslation = true } if _, ok := mopts[moptOverlayfsStaleRead]; ok { delete(mopts, moptOverlayfsStaleRead) fsopts.overlayfsStaleRead = true } if _, ok := mopts[moptDirectfs]; ok { delete(mopts, moptDirectfs) fsopts.directfs.enabled = true } // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying // "cache=none". // Check for unparsed options. if len(mopts) != 0 { ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) return nil, nil, linuxerr.EINVAL } // Validation. if fsopts.regularFilesUseSpecialFileFD && fsopts.overlayfsStaleRead { // These options are not supported together. To support this, when a dentry // is opened writably for the first time, we need to iterate over all the // specialFileFDs of that dentry that represent a regular file and call // fd.hostFileMapper.RegenerateMappings(writable_fd). ctx.Warningf("gofer.FilesystemType.GetFilesystem: regularFilesUseSpecialFileFD and overlayfsStaleRead options are not supported together.") return nil, nil, linuxerr.EINVAL } // Handle internal options. iopts, ok := opts.InternalData.(InternalFilesystemOptions) if opts.InternalData != nil && !ok { ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData) return nil, nil, linuxerr.EINVAL } // If !ok, iopts being the zero value is correct. // Construct the filesystem object. devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } fs := &filesystem{ mf: mf, opts: fsopts, iopts: iopts, clock: ktime.RealtimeClockFromContext(ctx), devMinor: devMinor, inoByKey: make(map[inoKey]uint64), } // Did the user configure a global dentry cache? if globalDentryCache != nil { fs.dentryCache = globalDentryCache } else { fs.dentryCache = &dentryCache{maxCachedDentries: fsopts.dcache} } fs.vfsfs.Init(vfsObj, &fstype, fs) rootInode, rootHostFD, err := fs.initClientAndGetRoot(ctx) if err != nil { fs.vfsfs.DecRef(ctx) return nil, nil, err } if fs.opts.directfs.enabled { fs.root, err = fs.getDirectfsRootDentry(ctx, rootHostFD, fs.client.NewFD(rootInode.ControlFD)) } else { fs.root, err = fs.newLisafsDentry(ctx, &rootInode) } if err != nil { fs.vfsfs.DecRef(ctx) return nil, nil, err } // Set the root's reference count to 2. One reference is returned to the // caller, and the other is held by fs to prevent the root from being "cached" // and subsequently evicted. fs.root.refs = atomicbitops.FromInt64(2) return &fs.vfsfs, &fs.root.vfsd, nil } // initClientAndGetRoot initializes fs.client and returns the root inode for // this mount point. It handles the attach point (fs.opts.aname) resolution. 
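// Editorial note (not part of the upstream source): GetFilesystem above uses
// the process-wide dentry cache whenever one has been configured. A runtime
// would typically size it once, before the first gofer mount exists; the
// value below is purely illustrative:
//
//	gofer.SetDentryCacheSize(10000) // negative sizes and repeated calls are ignored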
func (fs *filesystem) initClientAndGetRoot(ctx context.Context) (lisafs.Inode, int, error) { sock, err := unet.NewSocket(fs.opts.fd) if err != nil { return lisafs.Inode{}, -1, err } ctx.UninterruptibleSleepStart(false) defer ctx.UninterruptibleSleepFinish(false) var ( rootInode lisafs.Inode rootHostFD int ) fs.client, rootInode, rootHostFD, err = lisafs.NewClient(sock) if err != nil { return lisafs.Inode{}, -1, err } cu := cleanup.Make(func() { if rootHostFD >= 0 { _ = unix.Close(rootHostFD) } rootControlFD := fs.client.NewFD(rootInode.ControlFD) rootControlFD.Close(ctx, false /* flush */) }) defer cu.Clean() if fs.opts.directfs.enabled { if fs.opts.aname != "/" { log.Warningf("directfs does not support aname filesystem option: aname=%q", fs.opts.aname) return lisafs.Inode{}, -1, unix.EINVAL } if rootHostFD < 0 { log.Warningf("Mount RPC did not return host FD to mount point with directfs enabled") return lisafs.Inode{}, -1, unix.EINVAL } } else { if rootHostFD >= 0 { log.Warningf("Mount RPC returned a host FD to mount point without directfs, we didn't ask for it") _ = unix.Close(rootHostFD) rootHostFD = -1 } // Use flipcall channels with lisafs because it makes a lot of RPCs. if err := fs.client.StartChannels(); err != nil { return lisafs.Inode{}, -1, err } rootInode, err = fs.handleAnameLisafs(ctx, rootInode) if err != nil { return lisafs.Inode{}, -1, err } } cu.Release() return rootInode, rootHostFD, nil } func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) { // Check that the transport is "fd". trans, ok := mopts[moptTransport] if !ok || trans != transportModeFD { ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as '%s=%s'", moptTransport, transportModeFD) return -1, linuxerr.EINVAL } delete(mopts, moptTransport) // Check that read and write FDs are provided and identical. rfdstr, ok := mopts[moptReadFD] if !ok { ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as '%s='", moptReadFD) return -1, linuxerr.EINVAL } delete(mopts, moptReadFD) rfd, err := strconv.Atoi(rfdstr) if err != nil { ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: %s=%s", moptReadFD, rfdstr) return -1, linuxerr.EINVAL } wfdstr, ok := mopts[moptWriteFD] if !ok { ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as '%s='", moptWriteFD) return -1, linuxerr.EINVAL } delete(mopts, moptWriteFD) wfd, err := strconv.Atoi(wfdstr) if err != nil { ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: %s=%s", moptWriteFD, wfdstr) return -1, linuxerr.EINVAL } if rfd != wfd { ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd) return -1, linuxerr.EINVAL } return rfd, nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.released.Store(1) mf := fs.mf fs.syncMu.Lock() for elem := fs.syncableDentries.Front(); elem != nil; elem = elem.Next() { d := elem.d d.handleMu.Lock() d.dataMu.Lock() if d.isWriteHandleOk() { // Write dirty cached data to the remote file. h := d.writeHandle() if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) } // TODO(jamieliu): Do we need to flushf/fsync d? } // Discard cached pages. d.cache.DropAll(mf) d.dirty.RemoveAll() d.dataMu.Unlock() // Close host FDs if they exist. 
d.closeHostFDs() d.handleMu.Unlock() } // There can't be any specialFileFDs still using fs, since each such // FileDescription would hold a reference on a Mount holding a reference on // fs. fs.syncMu.Unlock() // If leak checking is enabled, release all outstanding references in the // filesystem. We deliberately avoid doing this outside of leak checking; we // have released all external resources above rather than relying on dentry // destructors. fs.root may be nil if creating the client or initializing the // root dentry failed in GetFilesystem. if refs.GetLeakMode() != refs.NoLeakChecking && fs.root != nil { fs.renameMu.Lock() fs.root.releaseSyntheticRecursiveLocked(ctx) fs.evictAllCachedDentriesLocked(ctx) fs.renameMu.Unlock() // An extra reference was held by the filesystem on the root to prevent it from // being cached/evicted. fs.root.DecRef(ctx) } if !fs.iopts.LeakConnection { // Close the connection to the server. This implicitly closes all FDs. if fs.client != nil { fs.client.Close() } } fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } // releaseSyntheticRecursiveLocked traverses the tree with root d and decrements // the reference count on every synthetic dentry. Synthetic dentries have one // reference for existence that should be dropped during filesystem.Release. // // Precondition: d.fs.renameMu is locked for writing. func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) { if d.isSynthetic() { d.decRefNoCaching() d.checkCachingLocked(ctx, true /* renameMuWriteLocked */) } if d.isDir() { var children []*dentry d.childrenMu.Lock() for _, child := range d.children { children = append(children, child) } d.childrenMu.Unlock() for _, child := range children { if child != nil { child.releaseSyntheticRecursiveLocked(ctx) } } } } // inoKey is the key used to identify the inode backed by this dentry. // // +stateify savable type inoKey struct { ino uint64 devMinor uint32 devMajor uint32 } func inoKeyFromStatx(stat *linux.Statx) inoKey { return inoKey{ ino: stat.Ino, devMinor: stat.DevMinor, devMajor: stat.DevMajor, } } func inoKeyFromStat(stat *unix.Stat_t) inoKey { return inoKey{ ino: stat.Ino, devMinor: unix.Minor(stat.Dev), devMajor: unix.Major(stat.Dev), } } // dentry implements vfs.DentryImpl. // // +stateify savable type dentry struct { vfsd vfs.Dentry // refs is the reference count. Each dentry holds a reference on its // parent, even if disowned. An additional reference is held on all // synthetic dentries until they are unlinked or invalidated. When refs // reaches 0, the dentry may be added to the cache or destroyed. If refs == // -1, the dentry has already been destroyed. refs is accessed using atomic // memory operations. refs atomicbitops.Int64 // fs is the owning filesystem. fs is immutable. fs *filesystem // parent is this dentry's parent directory. Each dentry holds a reference // on its parent. If this dentry is a filesystem root, parent is nil. // parent is protected by filesystem.renameMu. parent atomic.Pointer[dentry] `state:".(*dentry)"` // name is the name of this dentry in its parent. If this dentry is a // filesystem root, name is the empty string. name is protected by // filesystem.renameMu. name string // inoKey is used to identify this dentry's inode. inoKey inoKey // If deleted is non-zero, the file represented by this dentry has been // deleted is accessed using atomic memory operations. deleted atomicbitops.Uint32 // cachingMu is used to synchronize concurrent dentry caching attempts on // this dentry. 
cachingMu sync.Mutex `state:"nosave"` // If cached is true, this dentry is part of filesystem.dentryCache. cached // is protected by cachingMu. cached bool // cacheEntry links dentry into filesystem.dentryCache.dentries. It is // protected by filesystem.dentryCache.mu. cacheEntry dentryListElem // syncableListEntry links dentry into filesystem.syncableDentries. It is // protected by filesystem.syncMu. syncableListEntry dentryListElem // opMu synchronizes operations on this dentry. Operations that mutate // the dentry tree must hold this lock for writing. Operations that // only read the tree must hold for reading. opMu sync.RWMutex `state:"nosave"` // childrenMu protects the cached children data for this dentry. childrenMu sync.Mutex `state:"nosave"` // If this dentry represents a directory, children contains: // // - Mappings of child filenames to dentries representing those children. // // - Mappings of child filenames that are known not to exist to nil // dentries (only if InteropModeShared is not in effect and the directory // is not synthetic). // // +checklocks:childrenMu children map[string]*dentry // If this dentry represents a directory, negativeChildrenCache cache // names of negative children. negativeChildrenCache is not saved since // dentry.prepareSaveRecursive() drops all negative children. // // +checklocks:childrenMu negativeChildrenCache stringFixedCache `state:"nosave"` // If this dentry represents a directory, negativeChildren is the number of // negative children cached in dentry.children. negativeChildren is not // saved since dentry.prepareSaveRecursive() drops all negative children. // // +checklocks:childrenMu negativeChildren int `state:"nosave"` // If this dentry represents a directory, syntheticChildren is the number // of child dentries for which dentry.isSynthetic() == true. // // +checklocks:childrenMu syntheticChildren int // If this dentry represents a directory, // dentry.cachedMetadataAuthoritative() == true, and dirents is not // nil, then dirents is a cache of all entries in the directory, in the // order they were returned by the server. childrenSet just stores the // `Name` field of all dirents in a set for fast query. dirents and // childrenSet share the same lifecycle. // // +checklocks:childrenMu dirents []vfs.Dirent `state:"nosave"` // +checklocks:childrenMu childrenSet map[string]struct{} `state:"nosave"` // Cached metadata; protected by metadataMu. // To access: // - In situations where consistency is not required (like stat), these // can be accessed using atomic operations only (without locking). // - Lock metadataMu and can access without atomic operations. // To mutate: // - Lock metadataMu and use atomic operations to update because we might // have atomic readers that don't hold the lock. metadataMu sync.Mutex `state:"nosave"` ino uint64 // immutable mode atomicbitops.Uint32 // type is immutable, perms are mutable uid atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic gid atomicbitops.Uint32 // auth.KGID, but ... blockSize atomicbitops.Uint32 // 0 if unknown // Timestamps, all nsecs from the Unix epoch. atime atomicbitops.Int64 mtime atomicbitops.Int64 ctime atomicbitops.Int64 btime atomicbitops.Int64 // File size, which differs from other metadata in two ways: // // - We make a best-effort attempt to keep it up to date even if // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes. // // - size is protected by both metadataMu and dataMu (i.e. 
both must be // locked to mutate it; locking either is sufficient to access it). size atomicbitops.Uint64 // If this dentry does not represent a synthetic file, deleted is 0, and // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the // remote file's timestamps, which should be updated when this dentry is // evicted. atimeDirty atomicbitops.Uint32 mtimeDirty atomicbitops.Uint32 // nlink counts the number of hard links to this dentry. It's updated and // accessed using atomic operations. It's not protected by metadataMu like the // other metadata fields. nlink atomicbitops.Uint32 mapsMu sync.Mutex `state:"nosave"` // If this dentry represents a regular file, mappings tracks mappings of // the file into memmap.MappingSpaces. mappings is protected by mapsMu. mappings memmap.MappingSet // - If this dentry represents a regular file or directory, readFD (if not // -1) is a host FD used for reads by all regularFileFDs/directoryFDs // representing this dentry. // // - If this dentry represents a regular file, writeFD (if not -1) is a host // FD used for writes by all regularFileFDs representing this dentry. // // - If this dentry represents a regular file, mmapFD is the host FD used // for memory mappings. If mmapFD is -1, no such FD is available, and the // internal page cache implementation is used for memory mappings instead. // // These fields are protected by handleMu. readFD, writeFD, and mmapFD are // additionally written using atomic memory operations, allowing them to be // read (albeit racily) with atomic.LoadInt32() without locking handleMu. // // readFD and writeFD may or may not be the same file descriptor. Once either // transitions from closed (-1) to open, it may be mutated with handleMu // locked, but cannot be closed until the dentry is destroyed. // // readFD and writeFD may or may not be the same file descriptor. mmapFD is // always either -1 or equal to readFD; if the file has been opened for // writing, it is additionally either -1 or equal to writeFD. handleMu sync.RWMutex `state:"nosave"` readFD atomicbitops.Int32 `state:"nosave"` writeFD atomicbitops.Int32 `state:"nosave"` mmapFD atomicbitops.Int32 `state:"nosave"` dataMu sync.RWMutex `state:"nosave"` // If this dentry represents a regular file that is client-cached, cache // maps offsets into the cached file to offsets into // filesystem.mfp.MemoryFile() that store the file's data. cache is // protected by dataMu. cache fsutil.FileRangeSet // If this dentry represents a regular file that is client-cached, dirty // tracks dirty segments in cache. dirty is protected by dataMu. dirty fsutil.DirtySet // pf implements memmap.File for mappings of hostFD. pf dentryPlatformFile // If this dentry represents a symbolic link, InteropModeShared is not in // effect, and haveTarget is true, target is the symlink target. haveTarget // and target are protected by dataMu. haveTarget bool target string // If this dentry represents a synthetic socket file, endpoint is the // transport endpoint bound to this file. endpoint transport.BoundEndpoint // If this dentry represents a synthetic named pipe, pipe is the pipe // endpoint bound to this file. pipe *pipe.VFSPipe locks vfs.FileLocks // Inotify watches for this dentry. // // Note that inotify may behave unexpectedly in the presence of hard links, // because dentries corresponding to the same file have separate inotify // watches when they should share the same set. 
This is the case because it is // impossible for us to know for sure whether two dentries correspond to the // same underlying file (see the gofer filesystem section fo vfs/inotify.md for // a more in-depth discussion on this matter). watches vfs.Watches // forMountpoint marks directories that were created for mount points during // container startup. This is used during restore, in case these mount points // need to be recreated. forMountpoint bool // impl is the specific dentry implementation for non-synthetic dentries. // impl is immutable. // // If impl is nil, this dentry represents a synthetic file, i.e. a // file that does not exist on the host filesystem. As of this writing, the // only files that can be synthetic are sockets, pipes, and directories. impl any } // +stateify savable type stringListElem struct { // str is the string that this elem represents. str string stringEntry } // +stateify savable type dentryListElem struct { // d is the dentry that this elem represents. d *dentry dentryEntry } func (fs *filesystem) inoFromKey(key inoKey) uint64 { fs.inoMu.Lock() defer fs.inoMu.Unlock() if ino, ok := fs.inoByKey[key]; ok { return ino } ino := fs.nextIno() fs.inoByKey[key] = ino return ino } func (fs *filesystem) nextIno() uint64 { return fs.lastIno.Add(1) } // init must be called before first use of d. func (d *dentry) init(impl any) { d.pf.dentry = d d.cacheEntry.d = d d.syncableListEntry.d = d // Nested impl-inheritance pattern. In memory it looks like: // [[[ vfs.Dentry ] dentry ] dentryImpl ] // All 3 abstractions are allocated in one allocation. We achieve this by // making each outer dentry implementation hold the inner dentry by value. // Then the outer most dentry is allocated and we initialize fields inward. // Each inner dentry has a pointer to the next level of implementation. d.impl = impl d.vfsd.Init(d) refs.Register(d) } func (d *dentry) isSynthetic() bool { return d.impl == nil } func (d *dentry) cachedMetadataAuthoritative() bool { return d.fs.opts.interop != InteropModeShared || d.isSynthetic() } // updateMetadataFromStatxLocked is called to update d's metadata after an update // from the remote filesystem. // Precondition: d.metadataMu must be locked. // +checklocks:d.metadataMu func (d *lisafsDentry) updateMetadataFromStatxLocked(stat *linux.Statx) { if stat.Mask&linux.STATX_TYPE != 0 { if got, want := stat.Mode&linux.FileTypeMask, d.fileType(); uint32(got) != want { panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) } } if stat.Mask&linux.STATX_MODE != 0 { d.mode.Store(uint32(stat.Mode)) } if stat.Mask&linux.STATX_UID != 0 { d.uid.Store(dentryUID(lisafs.UID(stat.UID))) } if stat.Mask&linux.STATX_GID != 0 { d.gid.Store(dentryGID(lisafs.GID(stat.GID))) } if stat.Blksize != 0 { d.blockSize.Store(stat.Blksize) } // Don't override newer client-defined timestamps with old server-defined // ones. if stat.Mask&linux.STATX_ATIME != 0 && d.atimeDirty.Load() == 0 { d.atime.Store(dentryTimestamp(stat.Atime)) } if stat.Mask&linux.STATX_MTIME != 0 && d.mtimeDirty.Load() == 0 { d.mtime.Store(dentryTimestamp(stat.Mtime)) } if stat.Mask&linux.STATX_CTIME != 0 { d.ctime.Store(dentryTimestamp(stat.Ctime)) } if stat.Mask&linux.STATX_BTIME != 0 { d.btime.Store(dentryTimestamp(stat.Btime)) } if stat.Mask&linux.STATX_NLINK != 0 { d.nlink.Store(stat.Nlink) } if stat.Mask&linux.STATX_SIZE != 0 { d.updateSizeLocked(stat.Size) } } // updateMetadataFromStatLocked is similar to updateMetadataFromStatxLocked, // except that it takes a unix.Stat_t argument. 
// Precondition: d.metadataMu must be locked. // +checklocks:d.metadataMu func (d *directfsDentry) updateMetadataFromStatLocked(stat *unix.Stat_t) error { if got, want := stat.Mode&unix.S_IFMT, d.fileType(); got != want { panic(fmt.Sprintf("direct.dentry file type changed from %#o to %#o", want, got)) } d.mode.Store(stat.Mode) d.uid.Store(stat.Uid) d.gid.Store(stat.Gid) d.blockSize.Store(uint32(stat.Blksize)) // Don't override newer client-defined timestamps with old host-defined // ones. if d.atimeDirty.Load() == 0 { d.atime.Store(dentryTimestampFromUnix(stat.Atim)) } if d.mtimeDirty.Load() == 0 { d.mtime.Store(dentryTimestampFromUnix(stat.Mtim)) } d.ctime.Store(dentryTimestampFromUnix(stat.Ctim)) d.nlink.Store(uint32(stat.Nlink)) d.updateSizeLocked(uint64(stat.Size)) return nil } // Preconditions: !d.isSynthetic(). // Preconditions: d.metadataMu is locked. // +checklocks:d.metadataMu func (d *dentry) refreshSizeLocked(ctx context.Context) error { d.handleMu.RLock() // Can use RacyLoad() because handleMu is locked. if d.writeFD.RacyLoad() < 0 { d.handleMu.RUnlock() // Use a suitable FD if we don't have a writable host FD. return d.updateMetadataLocked(ctx, noHandle) } // Using statx(2) with a minimal mask is faster than fstat(2). var stat unix.Statx_t // Can use RacyLoad() because handleMu is locked. err := unix.Statx(int(d.writeFD.RacyLoad()), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat) d.handleMu.RUnlock() // must be released before updateSizeLocked() if err != nil { return err } d.updateSizeLocked(stat.Size) return nil } // Preconditions: !d.isSynthetic(). func (d *dentry) updateMetadata(ctx context.Context) error { // d.metadataMu must be locked *before* we stat so that we do not end up // updating stale attributes in d.updateMetadataFromStatLocked(). d.metadataMu.Lock() defer d.metadataMu.Unlock() return d.updateMetadataLocked(ctx, noHandle) } func (d *dentry) fileType() uint32 { return d.mode.Load() & linux.S_IFMT } func (d *dentry) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME stat.Blksize = d.blockSize.Load() stat.Nlink = d.nlink.Load() if stat.Nlink == 0 { // The remote filesystem doesn't support link count; just make // something up. This is consistent with Linux, where // fs/inode.c:inode_init_always() initializes link count to 1, and // fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if // it's not provided by the remote filesystem. stat.Nlink = 1 } stat.UID = d.uid.Load() stat.GID = d.gid.Load() stat.Mode = uint16(d.mode.Load()) stat.Ino = uint64(d.ino) stat.Size = d.size.Load() // This is consistent with regularFileFD.Seek(), which treats regular files // as having no holes. stat.Blocks = (stat.Size + 511) / 512 stat.Atime = linux.NsecToStatxTimestamp(d.atime.Load()) stat.Btime = linux.NsecToStatxTimestamp(d.btime.Load()) stat.Ctime = linux.NsecToStatxTimestamp(d.ctime.Load()) stat.Mtime = linux.NsecToStatxTimestamp(d.mtime.Load()) stat.DevMajor = linux.UNNAMED_MAJOR stat.DevMinor = d.fs.devMinor } // Precondition: fs.renameMu is locked. 
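// Editorial worked example (not part of the upstream source): statTo above
// reports Blocks in 512-byte units, rounded up from the cached size, and
// substitutes a link count of 1 when the remote filesystem reports 0:
//
//	Size = 0    -> Blocks = 0
//	Size = 1    -> Blocks = 1
//	Size = 1024 -> Blocks = 2
//
// This matches Linux's fallback for filesystems that do not supply a link
// count, and regularFileFD.Seek's treatment of regular files as having no holes.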
func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error { stat := &opts.Stat if stat.Mask == 0 { return nil } if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { return linuxerr.EPERM } mode := linux.FileMode(d.mode.Load()) if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil { return err } if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() if stat.Mask&linux.STATX_SIZE != 0 { // Reject attempts to truncate files other than regular files, since // filesystem implementations may return the wrong errno. switch mode.FileType() { case linux.S_IFREG: // ok case linux.S_IFDIR: return linuxerr.EISDIR default: return linuxerr.EINVAL } } var now int64 if d.cachedMetadataAuthoritative() { // Truncate updates mtime. if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE { stat.Mask |= linux.STATX_MTIME stat.Mtime = linux.StatxTimestamp{ Nsec: linux.UTIME_NOW, } } // Use client clocks for timestamps. now = d.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW { stat.Atime = linux.NsecToStatxTimestamp(now) } if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW { stat.Mtime = linux.NsecToStatxTimestamp(now) } } d.metadataMu.Lock() defer d.metadataMu.Unlock() // As with Linux, if the UID, GID, or file size is changing, we have to // clear permission bits. Note that when set, clearSGID may cause // permissions to be updated. clearSGID := (stat.Mask&linux.STATX_UID != 0 && stat.UID != d.uid.Load()) || (stat.Mask&linux.STATX_GID != 0 && stat.GID != d.gid.Load()) || stat.Mask&linux.STATX_SIZE != 0 if clearSGID { if stat.Mask&linux.STATX_MODE != 0 { stat.Mode = uint16(vfs.ClearSUIDAndSGID(uint32(stat.Mode))) } else { oldMode := d.mode.Load() if updatedMode := vfs.ClearSUIDAndSGID(oldMode); updatedMode != oldMode { stat.Mode = uint16(updatedMode) stat.Mask |= linux.STATX_MODE } } } // failureMask indicates which attributes could not be set on the remote // filesystem. p9 returns an error if any of the attributes could not be set // but that leads to inconsistency as the server could have set a few // attributes successfully but a later failure will cause the successful ones // to not be updated in the dentry cache. var failureMask uint32 var failureErr error if !d.isSynthetic() { if stat.Mask != 0 { if err := d.prepareSetStat(ctx, stat); err != nil { return err } d.handleMu.RLock() if stat.Mask&linux.STATX_SIZE != 0 { // d.dataMu must be held around the update to both the remote // file's size and d.size to serialize with writeback (which // might otherwise write data back up to the old d.size after // the remote file has been truncated). d.dataMu.Lock() } var err error failureMask, failureErr, err = d.setStatLocked(ctx, stat) d.handleMu.RUnlock() if err != nil { if stat.Mask&linux.STATX_SIZE != 0 { d.dataMu.Unlock() // +checklocksforce: locked conditionally above } return err } if stat.Mask&linux.STATX_SIZE != 0 { if failureMask&linux.STATX_SIZE == 0 { // d.size should be kept up to date, and privatized // copy-on-write mappings of truncated pages need to be // invalidated, even if InteropModeShared is in effect. 
d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above } else { d.dataMu.Unlock() // +checklocksforce: locked conditionally above } } } if d.fs.opts.interop == InteropModeShared { // There's no point to updating d's metadata in this case since // it'll be overwritten by revalidation before the next time it's // used anyway. (InteropModeShared inhibits client caching of // regular file data, so there's no cache to truncate either.) return nil } } if stat.Mask&linux.STATX_MODE != 0 && failureMask&linux.STATX_MODE == 0 { d.mode.Store(d.fileType() | uint32(stat.Mode)) } if stat.Mask&linux.STATX_UID != 0 && failureMask&linux.STATX_UID == 0 { d.uid.Store(stat.UID) } if stat.Mask&linux.STATX_GID != 0 && failureMask&linux.STATX_GID == 0 { d.gid.Store(stat.GID) } // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because // if d.cachedMetadataAuthoritative() then we converted stat.Atime and // stat.Mtime to client-local timestamps above, and if // !d.cachedMetadataAuthoritative() then we returned after calling // d.file.setAttr(). For the same reason, now must have been initialized. if stat.Mask&linux.STATX_ATIME != 0 && failureMask&linux.STATX_ATIME == 0 { d.atime.Store(stat.Atime.ToNsec()) d.atimeDirty.Store(0) } if stat.Mask&linux.STATX_MTIME != 0 && failureMask&linux.STATX_MTIME == 0 { d.mtime.Store(stat.Mtime.ToNsec()) d.mtimeDirty.Store(0) } d.ctime.Store(now) if failureMask != 0 { // Setting some attribute failed on the remote filesystem. return failureErr } return nil } // doAllocate performs an allocate operation on d. Note that d.metadataMu will // be held when allocate is called. func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error { d.metadataMu.Lock() defer d.metadataMu.Unlock() // Allocating a smaller size is a noop. size := offset + length if d.cachedMetadataAuthoritative() && size <= d.size.RacyLoad() { return nil } err := allocate() if err != nil { return err } d.updateSizeLocked(size) if d.cachedMetadataAuthoritative() { d.touchCMtimeLocked() } return nil } // Preconditions: d.metadataMu must be locked. func (d *dentry) updateSizeLocked(newSize uint64) { d.dataMu.Lock() d.updateSizeAndUnlockDataMuLocked(newSize) } // Preconditions: d.metadataMu and d.dataMu must be locked. // // Postconditions: d.dataMu is unlocked. // +checklocksrelease:d.dataMu func (d *dentry) updateSizeAndUnlockDataMuLocked(newSize uint64) { oldSize := d.size.RacyLoad() d.size.Store(newSize) // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings // below. This allows concurrent calls to Read/Translate/etc. These // functions synchronize with truncation by refusing to use cache // contents beyond the new d.size. (We are still holding d.metadataMu, // so we can't race with Write or another truncate.) d.dataMu.Unlock() if newSize < oldSize { oldpgend, _ := hostarch.PageRoundUp(oldSize) newpgend, _ := hostarch.PageRoundUp(newSize) if oldpgend != newpgend { d.mapsMu.Lock() d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ // Compare Linux's mm/truncate.c:truncate_setsize() => // truncate_pagecache() => // mm/memory.c:unmap_mapping_range(evencows=1). InvalidatePrivate: true, }) d.mapsMu.Unlock() } // We are now guaranteed that there are no translations of // truncated pages, and can remove them from the cache. Since // truncated pages have been removed from the remote file, they // should be dropped without being written back. 
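// Truncate drops the cached pages beyond the new size, and KeepClean marks
// the truncated range as clean so that writeback never touches it.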
d.dataMu.Lock() d.cache.Truncate(newSize, d.fs.mf) d.dirty.KeepClean(memmap.MappableRange{newSize, oldpgend}) d.dataMu.Unlock() } } func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())) } func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { // Deny access to the "system" namespaces since applications // may expect these to affect kernel behavior in unimplemented ways // (b/148380782). Allow all other extended attributes to be passed through // to the remote filesystem. This is inconsistent with Linux's 9p client, // but consistent with other filesystems (e.g. FUSE). // // NOTE(b/202533394): Also disallow "trusted" namespace for now. This is // consistent with the VFS1 gofer client. if strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) || strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { return linuxerr.EOPNOTSUPP } mode := linux.FileMode(d.mode.Load()) kuid := auth.KUID(d.uid.Load()) kgid := auth.KGID(d.gid.Load()) if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { return err } return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) } func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { return vfs.CheckDeleteSticky( creds, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KUID(child.uid.Load()), auth.KGID(child.gid.Load()), ) } func dentryUID(uid lisafs.UID) uint32 { if !uid.Ok() { return uint32(auth.OverflowUID) } return uint32(uid) } func dentryGID(gid lisafs.GID) uint32 { if !gid.Ok() { return uint32(auth.OverflowGID) } return uint32(gid) } // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { // d.refs may be 0 if d.fs.renameMu is locked, which serializes against // d.checkCachingLocked(). r := d.refs.Add(1) if d.LogRefs() { refs.LogIncRef(d, r) } } // TryIncRef implements vfs.DentryImpl.TryIncRef. func (d *dentry) TryIncRef() bool { for { r := d.refs.Load() if r <= 0 { return false } if d.refs.CompareAndSwap(r, r+1) { if d.LogRefs() { refs.LogTryIncRef(d, r+1) } return true } } } // DecRef implements vfs.DentryImpl.DecRef. func (d *dentry) DecRef(ctx context.Context) { if d.decRefNoCaching() == 0 { d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) } } // decRefNoCaching decrements d's reference count without calling // d.checkCachingLocked, even if d's reference count reaches 0; callers are // responsible for ensuring that d.checkCachingLocked will be called later. func (d *dentry) decRefNoCaching() int64 { r := d.refs.Add(-1) if d.LogRefs() { refs.LogDecRef(d, r) } if r < 0 { panic("gofer.dentry.decRefNoCaching() called without holding a reference") } return r } // RefType implements refs.CheckedObject.Type. func (d *dentry) RefType() string { return "gofer.dentry" } // LeakMessage implements refs.CheckedObject.LeakMessage. func (d *dentry) LeakMessage() string { return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, d.refs.Load()) } // LogRefs implements refs.CheckedObject.LogRefs. // // This should only be set to true for debugging purposes, as it can generate an // extremely large amount of output and drastically degrade performance. func (d *dentry) LogRefs() bool { return false } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 
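// Events are queued to the parent's watch set before d's own watches;
// fs.renameMu is held for reading so that d.parent and d.name remain stable
// while the notification is generated.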
func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { if d.isDir() { events |= linux.IN_ISDIR } d.fs.renameMu.RLock() // The ordering below is important, Linux always notifies the parent first. if parent := d.parent.Load(); parent != nil { parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) } d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) d.fs.renameMu.RUnlock() } // Watches implements vfs.DentryImpl.Watches. func (d *dentry) Watches() *vfs.Watches { return &d.watches } // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. // // If no watches are left on this dentry and it has no references, cache it. func (d *dentry) OnZeroWatches(ctx context.Context) { d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) } // checkCachingLocked should be called after d's reference count becomes 0 or // it becomes disowned. // // For performance, checkCachingLocked can also be called after d's reference // count becomes non-zero, so that d can be removed from the LRU cache. This // may help in reducing the size of the cache and hence reduce evictions. Note // that this is not necessary for correctness. // // It may be called on a destroyed dentry. For example, // renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times // for the same dentry when the dentry is visited more than once in the same // operation. One of the calls may destroy the dentry, so subsequent calls will // do nothing. // // Preconditions: d.fs.renameMu must be locked for writing if // renameMuWriteLocked is true; it may be temporarily unlocked. func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) { d.cachingMu.Lock() refs := d.refs.Load() if refs == -1 { // Dentry has already been destroyed. d.cachingMu.Unlock() return } if refs > 0 { // fs.dentryCache.dentries is permitted to contain dentries with non-zero // refs, which are skipped by fs.evictCachedDentryLocked() upon reaching // the end of the LRU. But it is still beneficial to remove d from the // cache as we are already holding d.cachingMu. Keeping a cleaner cache // also reduces the number of evictions (which is expensive as it acquires // fs.renameMu). d.removeFromCacheLocked() d.cachingMu.Unlock() return } // Deleted and invalidated dentries with zero references are no longer // reachable by path resolution and should be dropped immediately. if d.vfsd.IsDead() { d.removeFromCacheLocked() d.cachingMu.Unlock() if !renameMuWriteLocked { // Need to lock d.fs.renameMu for writing as needed by d.destroyLocked(). d.fs.renameMu.Lock() defer d.fs.renameMu.Unlock() // Now that renameMu is locked for writing, no more refs can be taken on // d because path resolution requires renameMu for reading at least. if d.refs.Load() != 0 { // Destroy d only if its ref is still 0. If not, either someone took a // ref on it or it got destroyed before fs.renameMu could be acquired. return } } if d.isDeleted() { d.watches.HandleDeletion(ctx) } d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point. return } if d.vfsd.IsEvictable() { d.cachingMu.Unlock() // Attempt to evict. if renameMuWriteLocked { d.evictLocked(ctx) // +checklocksforce: renameMu is locked in this case. return } d.evict(ctx) return } // If d still has inotify watches and it is not deleted or invalidated, it // can't be evicted. Otherwise, we will lose its watches, even if a new // dentry is created for the same file in the future. 
Note that the size of // d.watches cannot concurrently transition from zero to non-zero, because // adding a watch requires holding a reference on d. if d.watches.Size() > 0 { // As in the refs > 0 case, removing d is beneficial. d.removeFromCacheLocked() d.cachingMu.Unlock() return } if d.fs.released.Load() != 0 { d.cachingMu.Unlock() if !renameMuWriteLocked { // Need to lock d.fs.renameMu to access d.parent. Lock it for writing as // needed by d.destroyLocked() later. d.fs.renameMu.Lock() defer d.fs.renameMu.Unlock() } if parent := d.parent.Load(); parent != nil { parent.childrenMu.Lock() delete(parent.children, d.name) parent.childrenMu.Unlock() } d.destroyLocked(ctx) // +checklocksforce: see above. return } d.fs.dentryCache.mu.Lock() // If d is already cached, just move it to the front of the LRU. if d.cached { d.fs.dentryCache.dentries.Remove(&d.cacheEntry) d.fs.dentryCache.dentries.PushFront(&d.cacheEntry) d.fs.dentryCache.mu.Unlock() d.cachingMu.Unlock() return } // Cache the dentry, then evict the least recently used cached dentry if // the cache becomes over-full. d.fs.dentryCache.dentries.PushFront(&d.cacheEntry) d.fs.dentryCache.dentriesLen++ d.cached = true shouldEvict := d.fs.dentryCache.dentriesLen > d.fs.dentryCache.maxCachedDentries d.fs.dentryCache.mu.Unlock() d.cachingMu.Unlock() if shouldEvict { if !renameMuWriteLocked { // Need to lock d.fs.renameMu for writing as needed by // d.evictCachedDentryLocked(). d.fs.renameMu.Lock() defer d.fs.renameMu.Unlock() } d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. } } // Preconditions: d.cachingMu must be locked. func (d *dentry) removeFromCacheLocked() { if d.cached { d.fs.dentryCache.mu.Lock() d.fs.dentryCache.dentries.Remove(&d.cacheEntry) d.fs.dentryCache.dentriesLen-- d.fs.dentryCache.mu.Unlock() d.cached = false } } // Precondition: fs.renameMu must be locked for writing; it may be temporarily // unlocked. // +checklocks:fs.renameMu func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { for fs.dentryCache.dentriesLen != 0 { fs.evictCachedDentryLocked(ctx) } } // Preconditions: // - fs.renameMu must be locked for writing; it may be temporarily unlocked. // // +checklocks:fs.renameMu func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { fs.dentryCache.mu.Lock() victim := fs.dentryCache.dentries.Back() fs.dentryCache.mu.Unlock() if victim == nil { // fs.dentryCache.dentries may have become empty between when it was // checked and when we locked fs.dentryCache.mu. return } if victim.d.fs == fs { victim.d.evictLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs return } // The dentry cache is shared between all gofer filesystems and the victim is // from another filesystem. Have that filesystem do the work. We unlock // fs.renameMu to prevent deadlock: two filesystems could otherwise wait on // each others' renameMu. fs.renameMu.Unlock() defer fs.renameMu.Lock() victim.d.evict(ctx) } // Preconditions: // - d.fs.renameMu must not be locked for writing. func (d *dentry) evict(ctx context.Context) { d.fs.renameMu.Lock() defer d.fs.renameMu.Unlock() d.evictLocked(ctx) } // Preconditions: // - d.fs.renameMu must be locked for writing; it may be temporarily unlocked. // // +checklocks:d.fs.renameMu func (d *dentry) evictLocked(ctx context.Context) { d.cachingMu.Lock() d.removeFromCacheLocked() // d.refs or d.watches.Size() may have become non-zero from an earlier path // resolution since it was inserted into fs.dentryCache.dentries. 
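// If so, the dentry is in use again and must not be evicted;
// removeFromCacheLocked above has already taken it off the LRU, so just bail
// out.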
if d.refs.Load() != 0 || d.watches.Size() != 0 { d.cachingMu.Unlock() return } if parent := d.parent.Load(); parent != nil { parent.opMu.Lock() if !d.vfsd.IsDead() { // Note that d can't be a mount point (in any mount namespace), since VFS // holds references on mount points. rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &d.vfsd) for _, rc := range rcs { rc.DecRef(ctx) } parent.childrenMu.Lock() delete(parent.children, d.name) parent.childrenMu.Unlock() // We're only deleting the dentry, not the file it // represents, so we don't need to update // victim parent.dirents etc. } parent.opMu.Unlock() } // Safe to unlock cachingMu now that d.vfsd.IsDead(). Henceforth any // concurrent caching attempts on d will attempt to destroy it and so will // try to acquire fs.renameMu (which we have already acquired). Hence, // fs.renameMu will synchronize the destroy attempts. d.cachingMu.Unlock() d.destroyLocked(ctx) // +checklocksforce: owned as precondition. } // destroyDisconnected destroys an uncached, unparented dentry. There are no // locking preconditions. func (d *dentry) destroyDisconnected(ctx context.Context) { mf := d.fs.mf d.handleMu.Lock() d.dataMu.Lock() if d.isWriteHandleOk() { // Write dirty pages back to the remote filesystem. h := d.writeHandle() if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err) } } // Discard cached data. if !d.cache.IsEmpty() { mf.MarkAllUnevictable(d) d.cache.DropAll(mf) d.dirty.RemoveAll() } d.dataMu.Unlock() // Close any resources held by the implementation. d.destroyImpl(ctx) // Can use RacyLoad() because handleMu is locked. if d.readFD.RacyLoad() >= 0 { _ = unix.Close(int(d.readFD.RacyLoad())) } if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() { _ = unix.Close(int(d.writeFD.RacyLoad())) } d.readFD = atomicbitops.FromInt32(-1) d.writeFD = atomicbitops.FromInt32(-1) d.mmapFD = atomicbitops.FromInt32(-1) d.handleMu.Unlock() if !d.isSynthetic() { // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, // i.e. client and server timestamps may differ (because e.g. a client // write was serviced by the page cache, and only written back to the // remote file later). Ideally, we'd write client timestamps back to // the remote filesystem so that timestamps for a new dentry // instantiated for the same file would remain coherent. Unfortunately, // this turns out to be too expensive in many cases, so for now we // don't do this. // Remove d from the set of syncable dentries. d.fs.syncMu.Lock() d.fs.syncableDentries.Remove(&d.syncableListEntry) d.fs.syncMu.Unlock() } // Drop references and stop tracking this child. d.refs.Store(-1) refs.Unregister(d) } // destroyLocked destroys the dentry. // // Preconditions: // - d.fs.renameMu must be locked for writing; it may be temporarily unlocked. // - d.refs == 0. // - d.parent.children[d.name] != d, i.e. d is not reachable by path traversal // from its former parent dentry. // // +checklocks:d.fs.renameMu func (d *dentry) destroyLocked(ctx context.Context) { switch d.refs.Load() { case 0: // Mark the dentry destroyed. d.refs.Store(-1) case -1: panic("dentry.destroyLocked() called on already destroyed dentry") default: panic("dentry.destroyLocked() called with references on the dentry") } // Allow the following to proceed without renameMu locked to improve // scalability.
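// destroyDisconnected performs the expensive part of teardown (writing back
// dirty pages, dropping cached data, closing host FDs), none of which needs
// renameMu, so the lock is dropped around the call and re-acquired afterwards.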
d.fs.renameMu.Unlock() // No locks need to be held during destroyDisconnected. d.destroyDisconnected(ctx) d.fs.renameMu.Lock() // Drop the reference held by d on its parent without recursively locking // d.fs.renameMu. if parent := d.parent.Load(); parent != nil && parent.decRefNoCaching() == 0 { parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */) } } func (d *dentry) isDeleted() bool { return d.deleted.Load() != 0 } func (d *dentry) setDeleted() { d.deleted.Store(1) } func (d *dentry) listXattr(ctx context.Context, size uint64) ([]string, error) { if d.isSynthetic() { return nil, nil } return d.listXattrImpl(ctx, size) } func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { if d.isSynthetic() { return "", linuxerr.ENODATA } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err } return d.getXattrImpl(ctx, opts) } func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { if d.isSynthetic() { return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err } return d.setXattrImpl(ctx, opts) } func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error { if d.isSynthetic() { return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } return d.removeXattrImpl(ctx, name) } // Preconditions: // - !d.isSynthetic(). // - d.isRegularFile() || d.isDir(). // - fs.renameMu is locked. func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). if !trunc { d.handleMu.RLock() canReuseCurHandle := (!read || d.isReadHandleOk()) && (!write || d.isWriteHandleOk()) d.handleMu.RUnlock() if canReuseCurHandle { // Current handles are sufficient. return nil } } d.handleMu.Lock() needNewHandle := (read && !d.isReadHandleOk()) || (write && !d.isWriteHandleOk()) || trunc if !needNewHandle { d.handleMu.Unlock() return nil } var fdsToCloseArr [2]int32 fdsToClose := fdsToCloseArr[:0] invalidateTranslations := false // Get a new handle. If this file has been opened for both reading and // writing, try to get a single handle that is usable for both: // // - Writable memory mappings of a host FD require that the host FD is // opened for both reading and writing. // // - NOTE(b/141991141): Some filesystems may not ensure coherence // between multiple handles for the same file. openReadable := d.isReadHandleOk() || read openWritable := d.isWriteHandleOk() || write h, err := d.openHandle(ctx, openReadable, openWritable, trunc) if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) { // It may not be possible to use a single handle for both // reading and writing, since permissions on the file may have // changed to e.g. disallow reading after previously being // opened for reading. In this case, we have no choice but to // use separate handles for reading and writing.
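// Retry the open with exactly the requested access, rather than also trying
// to cover the access modes of any existing handles.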
ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d) openReadable = read openWritable = write h, err = d.openHandle(ctx, openReadable, openWritable, trunc) } if err != nil { d.handleMu.Unlock() return err } // Update d.readFD and d.writeFD if h.fd >= 0 { if openReadable && openWritable && (d.readFD.RacyLoad() < 0 || d.writeFD.RacyLoad() < 0 || d.readFD.RacyLoad() != d.writeFD.RacyLoad()) { // Replace existing FDs with this one. if d.readFD.RacyLoad() >= 0 { // We already have a readable FD that may be in use by // concurrent callers of d.pf.FD(). if d.fs.opts.overlayfsStaleRead { // If overlayfsStaleRead is in effect, then the new FD // may not be coherent with the existing one, so we // have no choice but to switch to mappings of the new // FD in both the application and sentry. if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { d.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) h.close(ctx) return err } fdsToClose = append(fdsToClose, d.readFD.RacyLoad()) invalidateTranslations = true d.readFD.Store(h.fd) } else { // Otherwise, we want to avoid invalidating existing // memmap.Translations (which is expensive); instead, use // dup3 to make the old file descriptor refer to the new // file description, then close the new file descriptor // (which is no longer needed). Racing callers of d.pf.FD() // may use the old or new file description, but this // doesn't matter since they refer to the same file, and // any racing mappings must be read-only. if err := unix.Dup3(int(h.fd), int(d.readFD.RacyLoad()), unix.O_CLOEXEC); err != nil { oldFD := d.readFD.RacyLoad() d.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldFD, err) h.close(ctx) return err } fdsToClose = append(fdsToClose, h.fd) h.fd = d.readFD.RacyLoad() } } else { d.readFD.Store(h.fd) } if d.writeFD.RacyLoad() != h.fd && d.writeFD.RacyLoad() >= 0 { fdsToClose = append(fdsToClose, d.writeFD.RacyLoad()) } d.writeFD.Store(h.fd) d.mmapFD.Store(h.fd) } else if openReadable && d.readFD.RacyLoad() < 0 { readHandleWasOk := d.isReadHandleOk() d.readFD.Store(h.fd) // If the file has not been opened for writing, the new FD may // be used for read-only memory mappings. If the file was // previously opened for reading (without an FD), then existing // translations of the file may use the internal page cache; // invalidate those mappings. if !d.isWriteHandleOk() { invalidateTranslations = readHandleWasOk d.mmapFD.Store(h.fd) } } else if openWritable && d.writeFD.RacyLoad() < 0 { d.writeFD.Store(h.fd) if d.readFD.RacyLoad() >= 0 { // We have an existing read-only FD, but the file has just // been opened for writing, so we need to start supporting // writable memory mappings. However, the new FD is not // readable, so we have no FD that can be used to create // writable memory mappings. Switch to using the internal // page cache. invalidateTranslations = true d.mmapFD.Store(-1) } } else { // The new FD is not useful. fdsToClose = append(fdsToClose, h.fd) } } else if openWritable && d.writeFD.RacyLoad() < 0 && d.mmapFD.RacyLoad() >= 0 { // We have an existing read-only FD, but the file has just been // opened for writing, so we need to start supporting writable // memory mappings. However, we have no writable host FD. Switch to // using the internal page cache. 
invalidateTranslations = true d.mmapFD.Store(-1) } d.updateHandles(ctx, h, openReadable, openWritable) d.handleMu.Unlock() if invalidateTranslations { // Invalidate application mappings that may be using an old FD; they // will be replaced with mappings using the new FD after future calls // to d.Translate(). This requires holding d.mapsMu, which precedes // d.handleMu in the lock order. d.mapsMu.Lock() d.mappings.InvalidateAll(memmap.InvalidateOpts{}) d.mapsMu.Unlock() } for _, fd := range fdsToClose { unix.Close(int(fd)) } return nil } func (d *dentry) syncRemoteFile(ctx context.Context) error { d.handleMu.RLock() defer d.handleMu.RUnlock() return d.syncRemoteFileLocked(ctx) } // Preconditions: d.handleMu must be locked. func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { // Prefer syncing write handles over read handles, since some remote // filesystem implementations may not sync changes made through write // handles otherwise. wh := d.writeHandle() wh.sync(ctx) rh := d.readHandle() rh.sync(ctx) return nil } func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error { d.handleMu.RLock() defer d.handleMu.RUnlock() if d.isWriteHandleOk() { // Write back dirty pages to the remote file. d.dataMu.Lock() h := d.writeHandle() err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), d.fs.mf, h.writeFromBlocksAt) d.dataMu.Unlock() if err != nil { return err } } if err := d.syncRemoteFileLocked(ctx); err != nil { if !forFilesystemSync { return err } // Only return err if we can reasonably have expected sync to succeed // (d is a regular file and was opened for writing). if d.isRegularFile() && d.isWriteHandleOk() { return err } ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err) } return nil } // incLinks increments link count. func (d *dentry) incLinks() { if d.nlink.Load() == 0 { // The remote filesystem doesn't support link count. return } d.nlink.Add(1) } // decLinks decrements link count. func (d *dentry) decLinks() { if d.nlink.Load() == 0 { // The remote filesystem doesn't support link count. return } d.nlink.Add(^uint32(0)) } // fileDescription is embedded by gofer implementations of // vfs.FileDescriptionImpl. // // +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD lockLogging sync.Once `state:"nosave"` } func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } func (fd *fileDescription) dentry() *dentry { return fd.vfsfd.Dentry().Impl().(*dentry) } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { d := fd.dentry() const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { // Use specialFileFD.handle.fileLisa for the Stat if available, for the // same reason that we try to use open FD in updateMetadataLocked(). var err error if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok { err = sffd.updateMetadata(ctx) } else { err = d.updateMetadata(ctx) } if err != nil { return linux.Statx{}, err } } var stat linux.Statx d.statTo(&stat) return stat, nil } // SetStat implements vfs.FileDescriptionImpl.SetStat. 
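// It takes fs.renameMu for reading, as required by dentry.setStat's
// precondition.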
func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { fs := fd.filesystem() fs.renameMu.RLock() defer fs.renameMu.RUnlock() return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()) } // ListXattr implements vfs.FileDescriptionImpl.ListXattr. func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { return fd.dentry().listXattr(ctx, size) } // GetXattr implements vfs.FileDescriptionImpl.GetXattr. func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts) } // SetXattr implements vfs.FileDescriptionImpl.SetXattr. func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { return fd.dentry().setXattr(ctx, auth.CredentialsFromContext(ctx), &opts) } // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { return fd.dentry().removeXattr(ctx, auth.CredentialsFromContext(ctx), name) } // LockBSD implements vfs.FileDescriptionImpl.LockBSD. func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block bool) error { fd.lockLogging.Do(func() { log.Infof("File lock using gofer file handled internally.") }) return fd.LockFD.LockBSD(ctx, uid, ownerPID, t, block) } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block bool) error { fd.lockLogging.Do(func() { log.Infof("Range lock using gofer file handled internally.") }) return fd.Locks().LockPOSIX(ctx, uid, ownerPID, t, r, block) } // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { return fd.Locks().UnlockPOSIX(ctx, uid, r) } // resolvingPath is just a wrapper around *vfs.ResolvingPath. It additionally // holds some information around the intent behind resolving the path. type resolvingPath struct { *vfs.ResolvingPath // excludeLast indicates whether the intent is to resolve until the last path // component. If true, the last path component should remain unresolved. excludeLast bool } func resolvingPathFull(rp *vfs.ResolvingPath) resolvingPath { return resolvingPath{ResolvingPath: rp, excludeLast: false} } func resolvingPathParent(rp *vfs.ResolvingPath) resolvingPath { return resolvingPath{ResolvingPath: rp, excludeLast: true} } func (rp *resolvingPath) done() bool { if rp.excludeLast { return rp.Final() } return rp.Done() } func (rp *resolvingPath) copy() resolvingPath { return resolvingPath{ ResolvingPath: rp.ResolvingPath.Copy(), excludeLast: rp.excludeLast, } } // Precondition: !rp.done() && rp.Component() is not "." or "..". func (rp *resolvingPath) getComponents(emit func(string) bool) { rp.GetComponents(rp.excludeLast, emit) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/gofer_impl_state_autogen.go000066400000000000000000000000671465435605700307430ustar00rootroot00000000000000// automatically generated by stateify. package gofer golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/gofer_state_autogen.go000066400000000000000000000614301465435605700277230ustar00rootroot00000000000000// automatically generated by stateify. 
package gofer import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (l *dentryList) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.dentryList" } func (l *dentryList) StateFields() []string { return []string{ "head", "tail", } } func (l *dentryList) beforeSave() {} // +checklocksignore func (l *dentryList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *dentryList) afterLoad(context.Context) {} // +checklocksignore func (l *dentryList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *dentryEntry) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.dentryEntry" } func (e *dentryEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *dentryEntry) beforeSave() {} // +checklocksignore func (e *dentryEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *dentryEntry) afterLoad(context.Context) {} // +checklocksignore func (e *dentryEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (d *directfsDentry) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.directfsDentry" } func (d *directfsDentry) StateFields() []string { return []string{ "dentry", "controlFD", } } func (d *directfsDentry) beforeSave() {} // +checklocksignore func (d *directfsDentry) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.dentry) stateSinkObject.Save(1, &d.controlFD) } // +checklocksignore func (d *directfsDentry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.dentry) stateSourceObject.Load(1, &d.controlFD) stateSourceObject.AfterLoad(func() { d.afterLoad(ctx) }) } func (fd *directoryFD) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.directoryFD" } func (fd *directoryFD) StateFields() []string { return []string{ "fileDescription", "DirectoryFileDescriptionDefaultImpl", "off", "dirents", } } func (fd *directoryFD) beforeSave() {} // +checklocksignore func (fd *directoryFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.fileDescription) stateSinkObject.Save(1, &fd.DirectoryFileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.off) stateSinkObject.Save(3, &fd.dirents) } func (fd *directoryFD) afterLoad(context.Context) {} // +checklocksignore func (fd *directoryFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.fileDescription) stateSourceObject.Load(1, &fd.DirectoryFileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.off) stateSourceObject.Load(3, &fd.dirents) } func (cache *stringFixedCache) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.stringFixedCache" } func (cache *stringFixedCache) StateFields() []string { return []string{ "namesList", "size", } } func (cache *stringFixedCache) beforeSave() {} // +checklocksignore func (cache *stringFixedCache) StateSave(stateSinkObject state.Sink) { cache.beforeSave() stateSinkObject.Save(0, &cache.namesList) stateSinkObject.Save(1, &cache.size) } func (cache *stringFixedCache) afterLoad(context.Context) {} // +checklocksignore func (cache *stringFixedCache) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &cache.namesList) stateSourceObject.Load(1, 
&cache.size) } func (d *dentryCache) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.dentryCache" } func (d *dentryCache) StateFields() []string { return []string{ "maxCachedDentries", "dentries", "dentriesLen", } } func (d *dentryCache) beforeSave() {} // +checklocksignore func (d *dentryCache) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.maxCachedDentries) stateSinkObject.Save(1, &d.dentries) stateSinkObject.Save(2, &d.dentriesLen) } func (d *dentryCache) afterLoad(context.Context) {} // +checklocksignore func (d *dentryCache) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.maxCachedDentries) stateSourceObject.Load(1, &d.dentries) stateSourceObject.Load(2, &d.dentriesLen) } func (fstype *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.FilesystemType" } func (fstype *FilesystemType) StateFields() []string { return []string{} } func (fstype *FilesystemType) beforeSave() {} // +checklocksignore func (fstype *FilesystemType) StateSave(stateSinkObject state.Sink) { fstype.beforeSave() } func (fstype *FilesystemType) afterLoad(context.Context) {} // +checklocksignore func (fstype *FilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "vfsfs", "opts", "iopts", "clock", "devMinor", "root", "dentryCache", "syncableDentries", "specialFileFDs", "lastIno", "savedDentryRW", "released", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.vfsfs) stateSinkObject.Save(1, &fs.opts) stateSinkObject.Save(2, &fs.iopts) stateSinkObject.Save(3, &fs.clock) stateSinkObject.Save(4, &fs.devMinor) stateSinkObject.Save(5, &fs.root) stateSinkObject.Save(6, &fs.dentryCache) stateSinkObject.Save(7, &fs.syncableDentries) stateSinkObject.Save(8, &fs.specialFileFDs) stateSinkObject.Save(9, &fs.lastIno) stateSinkObject.Save(10, &fs.savedDentryRW) stateSinkObject.Save(11, &fs.released) } // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.vfsfs) stateSourceObject.Load(1, &fs.opts) stateSourceObject.Load(2, &fs.iopts) stateSourceObject.Load(3, &fs.clock) stateSourceObject.Load(4, &fs.devMinor) stateSourceObject.Load(5, &fs.root) stateSourceObject.Load(6, &fs.dentryCache) stateSourceObject.Load(7, &fs.syncableDentries) stateSourceObject.Load(8, &fs.specialFileFDs) stateSourceObject.Load(9, &fs.lastIno) stateSourceObject.Load(10, &fs.savedDentryRW) stateSourceObject.Load(11, &fs.released) stateSourceObject.AfterLoad(func() { fs.afterLoad(ctx) }) } func (f *filesystemOptions) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.filesystemOptions" } func (f *filesystemOptions) StateFields() []string { return []string{ "fd", "aname", "interop", "dfltuid", "dfltgid", "dcache", "forcePageCache", "limitHostFDTranslation", "overlayfsStaleRead", "regularFilesUseSpecialFileFD", "disableFifoOpen", "directfs", } } func (f *filesystemOptions) beforeSave() {} // +checklocksignore func (f *filesystemOptions) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.fd) stateSinkObject.Save(1, &f.aname) stateSinkObject.Save(2, &f.interop) stateSinkObject.Save(3, &f.dfltuid) stateSinkObject.Save(4, &f.dfltgid) 
stateSinkObject.Save(5, &f.dcache) stateSinkObject.Save(6, &f.forcePageCache) stateSinkObject.Save(7, &f.limitHostFDTranslation) stateSinkObject.Save(8, &f.overlayfsStaleRead) stateSinkObject.Save(9, &f.regularFilesUseSpecialFileFD) stateSinkObject.Save(10, &f.disableFifoOpen) stateSinkObject.Save(11, &f.directfs) } func (f *filesystemOptions) afterLoad(context.Context) {} // +checklocksignore func (f *filesystemOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.fd) stateSourceObject.Load(1, &f.aname) stateSourceObject.Load(2, &f.interop) stateSourceObject.Load(3, &f.dfltuid) stateSourceObject.Load(4, &f.dfltgid) stateSourceObject.Load(5, &f.dcache) stateSourceObject.Load(6, &f.forcePageCache) stateSourceObject.Load(7, &f.limitHostFDTranslation) stateSourceObject.Load(8, &f.overlayfsStaleRead) stateSourceObject.Load(9, &f.regularFilesUseSpecialFileFD) stateSourceObject.Load(10, &f.disableFifoOpen) stateSourceObject.Load(11, &f.directfs) } func (d *directfsOpts) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.directfsOpts" } func (d *directfsOpts) StateFields() []string { return []string{ "enabled", } } func (d *directfsOpts) beforeSave() {} // +checklocksignore func (d *directfsOpts) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.enabled) } func (d *directfsOpts) afterLoad(context.Context) {} // +checklocksignore func (d *directfsOpts) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.enabled) } func (i *InteropMode) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.InteropMode" } func (i *InteropMode) StateFields() []string { return nil } func (i *InternalFilesystemOptions) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.InternalFilesystemOptions" } func (i *InternalFilesystemOptions) StateFields() []string { return []string{ "UniqueID", "LeakConnection", "OpenSocketsByConnecting", } } func (i *InternalFilesystemOptions) beforeSave() {} // +checklocksignore func (i *InternalFilesystemOptions) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.UniqueID) stateSinkObject.Save(1, &i.LeakConnection) stateSinkObject.Save(2, &i.OpenSocketsByConnecting) } func (i *InternalFilesystemOptions) afterLoad(context.Context) {} // +checklocksignore func (i *InternalFilesystemOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.UniqueID) stateSourceObject.Load(1, &i.LeakConnection) stateSourceObject.Load(2, &i.OpenSocketsByConnecting) } func (i *inoKey) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.inoKey" } func (i *inoKey) StateFields() []string { return []string{ "ino", "devMinor", "devMajor", } } func (i *inoKey) beforeSave() {} // +checklocksignore func (i *inoKey) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.ino) stateSinkObject.Save(1, &i.devMinor) stateSinkObject.Save(2, &i.devMajor) } func (i *inoKey) afterLoad(context.Context) {} // +checklocksignore func (i *inoKey) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.ino) stateSourceObject.Load(1, &i.devMinor) stateSourceObject.Load(2, &i.devMajor) } func (d *dentry) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.dentry" } func (d *dentry) StateFields() []string { return []string{ "vfsd", "refs", "fs", "parent", "name", "inoKey", "deleted", "cached", "cacheEntry", "syncableListEntry", "children", 
"syntheticChildren", "ino", "mode", "uid", "gid", "blockSize", "atime", "mtime", "ctime", "btime", "size", "atimeDirty", "mtimeDirty", "nlink", "mappings", "cache", "dirty", "pf", "haveTarget", "target", "endpoint", "pipe", "locks", "watches", "forMountpoint", "impl", } } // +checklocksignore func (d *dentry) StateSave(stateSinkObject state.Sink) { d.beforeSave() var parentValue *dentry parentValue = d.saveParent() stateSinkObject.SaveValue(3, parentValue) stateSinkObject.Save(0, &d.vfsd) stateSinkObject.Save(1, &d.refs) stateSinkObject.Save(2, &d.fs) stateSinkObject.Save(4, &d.name) stateSinkObject.Save(5, &d.inoKey) stateSinkObject.Save(6, &d.deleted) stateSinkObject.Save(7, &d.cached) stateSinkObject.Save(8, &d.cacheEntry) stateSinkObject.Save(9, &d.syncableListEntry) stateSinkObject.Save(10, &d.children) stateSinkObject.Save(11, &d.syntheticChildren) stateSinkObject.Save(12, &d.ino) stateSinkObject.Save(13, &d.mode) stateSinkObject.Save(14, &d.uid) stateSinkObject.Save(15, &d.gid) stateSinkObject.Save(16, &d.blockSize) stateSinkObject.Save(17, &d.atime) stateSinkObject.Save(18, &d.mtime) stateSinkObject.Save(19, &d.ctime) stateSinkObject.Save(20, &d.btime) stateSinkObject.Save(21, &d.size) stateSinkObject.Save(22, &d.atimeDirty) stateSinkObject.Save(23, &d.mtimeDirty) stateSinkObject.Save(24, &d.nlink) stateSinkObject.Save(25, &d.mappings) stateSinkObject.Save(26, &d.cache) stateSinkObject.Save(27, &d.dirty) stateSinkObject.Save(28, &d.pf) stateSinkObject.Save(29, &d.haveTarget) stateSinkObject.Save(30, &d.target) stateSinkObject.Save(31, &d.endpoint) stateSinkObject.Save(32, &d.pipe) stateSinkObject.Save(33, &d.locks) stateSinkObject.Save(34, &d.watches) stateSinkObject.Save(35, &d.forMountpoint) stateSinkObject.Save(36, &d.impl) } // +checklocksignore func (d *dentry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.vfsd) stateSourceObject.Load(1, &d.refs) stateSourceObject.Load(2, &d.fs) stateSourceObject.Load(4, &d.name) stateSourceObject.Load(5, &d.inoKey) stateSourceObject.Load(6, &d.deleted) stateSourceObject.Load(7, &d.cached) stateSourceObject.Load(8, &d.cacheEntry) stateSourceObject.Load(9, &d.syncableListEntry) stateSourceObject.Load(10, &d.children) stateSourceObject.Load(11, &d.syntheticChildren) stateSourceObject.Load(12, &d.ino) stateSourceObject.Load(13, &d.mode) stateSourceObject.Load(14, &d.uid) stateSourceObject.Load(15, &d.gid) stateSourceObject.Load(16, &d.blockSize) stateSourceObject.Load(17, &d.atime) stateSourceObject.Load(18, &d.mtime) stateSourceObject.Load(19, &d.ctime) stateSourceObject.Load(20, &d.btime) stateSourceObject.Load(21, &d.size) stateSourceObject.Load(22, &d.atimeDirty) stateSourceObject.Load(23, &d.mtimeDirty) stateSourceObject.Load(24, &d.nlink) stateSourceObject.Load(25, &d.mappings) stateSourceObject.Load(26, &d.cache) stateSourceObject.Load(27, &d.dirty) stateSourceObject.Load(28, &d.pf) stateSourceObject.Load(29, &d.haveTarget) stateSourceObject.Load(30, &d.target) stateSourceObject.Load(31, &d.endpoint) stateSourceObject.Load(32, &d.pipe) stateSourceObject.Load(33, &d.locks) stateSourceObject.Load(34, &d.watches) stateSourceObject.Load(35, &d.forMountpoint) stateSourceObject.Load(36, &d.impl) stateSourceObject.LoadValue(3, new(*dentry), func(y any) { d.loadParent(ctx, y.(*dentry)) }) stateSourceObject.AfterLoad(func() { d.afterLoad(ctx) }) } func (s *stringListElem) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.stringListElem" } func (s *stringListElem) StateFields() []string { return 
[]string{ "str", "stringEntry", } } func (s *stringListElem) beforeSave() {} // +checklocksignore func (s *stringListElem) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.str) stateSinkObject.Save(1, &s.stringEntry) } func (s *stringListElem) afterLoad(context.Context) {} // +checklocksignore func (s *stringListElem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.str) stateSourceObject.Load(1, &s.stringEntry) } func (d *dentryListElem) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.dentryListElem" } func (d *dentryListElem) StateFields() []string { return []string{ "d", "dentryEntry", } } func (d *dentryListElem) beforeSave() {} // +checklocksignore func (d *dentryListElem) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.d) stateSinkObject.Save(1, &d.dentryEntry) } func (d *dentryListElem) afterLoad(context.Context) {} // +checklocksignore func (d *dentryListElem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.d) stateSourceObject.Load(1, &d.dentryEntry) } func (fd *fileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.fileDescription" } func (fd *fileDescription) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "LockFD", } } func (fd *fileDescription) beforeSave() {} // +checklocksignore func (fd *fileDescription) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.LockFD) } func (fd *fileDescription) afterLoad(context.Context) {} // +checklocksignore func (fd *fileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.LockFD) } func (d *lisafsDentry) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.lisafsDentry" } func (d *lisafsDentry) StateFields() []string { return []string{ "dentry", } } func (d *lisafsDentry) beforeSave() {} // +checklocksignore func (d *lisafsDentry) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.dentry) } func (d *lisafsDentry) afterLoad(context.Context) {} // +checklocksignore func (d *lisafsDentry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.dentry) } func (fd *regularFileFD) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.regularFileFD" } func (fd *regularFileFD) StateFields() []string { return []string{ "fileDescription", "off", } } func (fd *regularFileFD) beforeSave() {} // +checklocksignore func (fd *regularFileFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.fileDescription) stateSinkObject.Save(1, &fd.off) } func (fd *regularFileFD) afterLoad(context.Context) {} // +checklocksignore func (fd *regularFileFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.fileDescription) stateSourceObject.Load(1, &fd.off) } func (d *dentryPlatformFile) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.dentryPlatformFile" } func (d *dentryPlatformFile) StateFields() []string { return []string{ "NoBufferedIOFallback", "dentry", "fdRefs", "hostFileMapper", } } func (d *dentryPlatformFile) beforeSave() {} // +checklocksignore func (d *dentryPlatformFile) StateSave(stateSinkObject state.Sink) { 
d.beforeSave() stateSinkObject.Save(0, &d.NoBufferedIOFallback) stateSinkObject.Save(1, &d.dentry) stateSinkObject.Save(2, &d.fdRefs) stateSinkObject.Save(3, &d.hostFileMapper) } // +checklocksignore func (d *dentryPlatformFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.NoBufferedIOFallback) stateSourceObject.Load(1, &d.dentry) stateSourceObject.Load(2, &d.fdRefs) stateSourceObject.Load(3, &d.hostFileMapper) stateSourceObject.AfterLoad(func() { d.afterLoad(ctx) }) } func (s *savedDentryRW) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.savedDentryRW" } func (s *savedDentryRW) StateFields() []string { return []string{ "read", "write", } } func (s *savedDentryRW) beforeSave() {} // +checklocksignore func (s *savedDentryRW) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.read) stateSinkObject.Save(1, &s.write) } func (s *savedDentryRW) afterLoad(context.Context) {} // +checklocksignore func (s *savedDentryRW) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.read) stateSourceObject.Load(1, &s.write) } func (e *endpoint) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.endpoint" } func (e *endpoint) StateFields() []string { return []string{ "dentry", "path", } } func (e *endpoint) beforeSave() {} // +checklocksignore func (e *endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.dentry) stateSinkObject.Save(1, &e.path) } func (e *endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.dentry) stateSourceObject.Load(1, &e.path) } func (l *specialFDList) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.specialFDList" } func (l *specialFDList) StateFields() []string { return []string{ "head", "tail", } } func (l *specialFDList) beforeSave() {} // +checklocksignore func (l *specialFDList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *specialFDList) afterLoad(context.Context) {} // +checklocksignore func (l *specialFDList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *specialFDEntry) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.specialFDEntry" } func (e *specialFDEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *specialFDEntry) beforeSave() {} // +checklocksignore func (e *specialFDEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *specialFDEntry) afterLoad(context.Context) {} // +checklocksignore func (e *specialFDEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (fd *specialFileFD) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.specialFileFD" } func (fd *specialFileFD) StateFields() []string { return []string{ "fileDescription", "specialFDEntry", "NoBufferedIOFallback", "isRegularFile", "seekable", "queue", "off", "haveBuf", "buf", "hostFileMapper", "fileRefs", } } func (fd *specialFileFD) beforeSave() {} // +checklocksignore func (fd *specialFileFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.fileDescription) stateSinkObject.Save(1, 
&fd.specialFDEntry) stateSinkObject.Save(2, &fd.NoBufferedIOFallback) stateSinkObject.Save(3, &fd.isRegularFile) stateSinkObject.Save(4, &fd.seekable) stateSinkObject.Save(5, &fd.queue) stateSinkObject.Save(6, &fd.off) stateSinkObject.Save(7, &fd.haveBuf) stateSinkObject.Save(8, &fd.buf) stateSinkObject.Save(9, &fd.hostFileMapper) stateSinkObject.Save(10, &fd.fileRefs) } // +checklocksignore func (fd *specialFileFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.fileDescription) stateSourceObject.Load(1, &fd.specialFDEntry) stateSourceObject.Load(2, &fd.NoBufferedIOFallback) stateSourceObject.Load(3, &fd.isRegularFile) stateSourceObject.Load(4, &fd.seekable) stateSourceObject.Load(5, &fd.queue) stateSourceObject.Load(6, &fd.off) stateSourceObject.Load(7, &fd.haveBuf) stateSourceObject.Load(8, &fd.buf) stateSourceObject.Load(9, &fd.hostFileMapper) stateSourceObject.Load(10, &fd.fileRefs) stateSourceObject.AfterLoad(func() { fd.afterLoad(ctx) }) } func (l *stringList) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.stringList" } func (l *stringList) StateFields() []string { return []string{ "head", "tail", } } func (l *stringList) beforeSave() {} // +checklocksignore func (l *stringList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *stringList) afterLoad(context.Context) {} // +checklocksignore func (l *stringList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *stringEntry) StateTypeName() string { return "pkg/sentry/fsimpl/gofer.stringEntry" } func (e *stringEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *stringEntry) beforeSave() {} // +checklocksignore func (e *stringEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *stringEntry) afterLoad(context.Context) {} // +checklocksignore func (e *stringEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func init() { state.Register((*dentryList)(nil)) state.Register((*dentryEntry)(nil)) state.Register((*directfsDentry)(nil)) state.Register((*directoryFD)(nil)) state.Register((*stringFixedCache)(nil)) state.Register((*dentryCache)(nil)) state.Register((*FilesystemType)(nil)) state.Register((*filesystem)(nil)) state.Register((*filesystemOptions)(nil)) state.Register((*directfsOpts)(nil)) state.Register((*InteropMode)(nil)) state.Register((*InternalFilesystemOptions)(nil)) state.Register((*inoKey)(nil)) state.Register((*dentry)(nil)) state.Register((*stringListElem)(nil)) state.Register((*dentryListElem)(nil)) state.Register((*fileDescription)(nil)) state.Register((*lisafsDentry)(nil)) state.Register((*regularFileFD)(nil)) state.Register((*dentryPlatformFile)(nil)) state.Register((*savedDentryRW)(nil)) state.Register((*endpoint)(nil)) state.Register((*specialFDList)(nil)) state.Register((*specialFDEntry)(nil)) state.Register((*specialFileFD)(nil)) state.Register((*stringList)(nil)) state.Register((*stringEntry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/handle.go000066400000000000000000000101471465435605700251310ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sync" ) var noHandle = handle{ fdLisa: lisafs.ClientFD{}, // zero value is fine. fd: -1, } // handle represents a remote "open file descriptor", consisting of an opened // lisafs FD and optionally a host file descriptor. // // These are explicitly not savable. type handle struct { fdLisa lisafs.ClientFD fd int32 // -1 if unavailable } func (h *handle) close(ctx context.Context) { if h.fdLisa.Ok() { h.fdLisa.Close(ctx, true /* flush */) } if h.fd >= 0 { unix.Close(int(h.fd)) h.fd = -1 } } func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { if dsts.IsEmpty() { return 0, nil } if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) n, err := hostfd.Preadv2(h.fd, dsts, int64(offset), 0 /* flags */) ctx.UninterruptibleSleepFinish(false) return n, err } rw := getHandleReadWriter(ctx, h, int64(offset)) defer putHandleReadWriter(rw) return safemem.FromIOReader{rw}.ReadToBlocks(dsts) } func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { if srcs.IsEmpty() { return 0, nil } if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) n, err := hostfd.Pwritev2(h.fd, srcs, int64(offset), 0 /* flags */) ctx.UninterruptibleSleepFinish(false) return n, err } rw := getHandleReadWriter(ctx, h, int64(offset)) defer putHandleReadWriter(rw) return safemem.FromIOWriter{rw}.WriteFromBlocks(srcs) } func (h *handle) allocate(ctx context.Context, mode, offset, length uint64) error { if h.fdLisa.Ok() { return h.fdLisa.Allocate(ctx, mode, offset, length) } if h.fd >= 0 { return unix.Fallocate(int(h.fd), uint32(mode), int64(offset), int64(length)) } return nil } func (h *handle) sync(ctx context.Context) error { // If we have a host FD, fsyncing it is likely to be faster than an fsync // RPC. if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) err := unix.Fsync(int(h.fd)) ctx.UninterruptibleSleepFinish(false) return err } if h.fdLisa.Ok() { return h.fdLisa.Sync(ctx) } return nil } type handleReadWriter struct { ctx context.Context h handle off uint64 } var handleReadWriterPool = sync.Pool{ New: func() any { return &handleReadWriter{} }, } func getHandleReadWriter(ctx context.Context, h *handle, offset int64) *handleReadWriter { rw := handleReadWriterPool.Get().(*handleReadWriter) rw.ctx = ctx rw.h = *h rw.off = uint64(offset) return rw } func putHandleReadWriter(rw *handleReadWriter) { rw.ctx = nil rw.h = noHandle handleReadWriterPool.Put(rw) } // Read implements io.Reader.Read. func (rw *handleReadWriter) Read(dst []byte) (int, error) { n, err := rw.h.fdLisa.Read(rw.ctx, dst, rw.off) rw.off += n return int(n), err } // Write implements io.Writer.Write. 
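// Like Read above, Write issues a lisafs RPC at rw.off and advances the
// offset by the number of bytes written.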
func (rw *handleReadWriter) Write(src []byte) (int, error) { n, err := rw.h.fdLisa.Write(rw.ctx, src, rw.off) rw.off += n return int(n), err } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *handleReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { n, err := rw.h.readToBlocksAt(rw.ctx, dsts, rw.off) rw.off += n return n, err } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. func (rw *handleReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { n, err := rw.h.writeFromBlocksAt(rw.ctx, srcs, rw.off) rw.off += n return n, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/host_named_pipe.go000066400000000000000000000065231465435605700270370ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "fmt" "sync" "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/waiter" ) // Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create // pipes after sentry initialization due to syscall filters. var ( tempPipeMu sync.Mutex tempPipeReadFD int tempPipeWriteFD int tempPipeBuf [1]byte ) func init() { var pipeFDs [2]int if err := unix.Pipe(pipeFDs[:]); err != nil { panic(fmt.Sprintf("failed to create pipe for gofer.blockUntilNonblockingPipeHasWriter: %v", err)) } tempPipeReadFD = pipeFDs[0] tempPipeWriteFD = pipeFDs[1] } func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error { for { ok, err := nonblockingPipeHasWriter(fd) if err != nil { return err } if ok { return nil } // Delay before trying again. if sleepErr := sleepBetweenNamedPipeOpenChecks(ctx); sleepErr != nil { // Another application thread may have opened this pipe for // writing, succeeded because we previously opened the pipe for // reading, and subsequently interrupted us for checkpointing (e.g. // this occurs in mknod tests under cooperative save/restore). In // this case, our open has to succeed for the checkpoint to include // a readable FD for the pipe, which is in turn necessary to // restore the other thread's writable FD for the same pipe // (otherwise it will get ENXIO). So we have to check // nonblockingPipeHasWriter() once last time. ok, err := nonblockingPipeHasWriter(fd) if err != nil { return err } if ok { return nil } return sleepErr } } } func nonblockingPipeHasWriter(fd int32) (bool, error) { tempPipeMu.Lock() defer tempPipeMu.Unlock() // Copy 1 byte from fd into the temporary pipe. n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK) if linuxerr.Equals(linuxerr.EAGAIN, err) { // The pipe represented by fd is empty, but has a writer. return true, nil } if err != nil { return false, err } if n == 0 { // The pipe represented by fd is empty and has no writer. return false, nil } // The pipe represented by fd is non-empty, so it either has, or has // previously had, a writer. Remove the byte copied to the temporary pipe // before returning. 
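// Note that tee(2) duplicates the byte without consuming it from fd, so the
// contents of the application's pipe are left intact; only our temporary copy
// needs to be drained here.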
if n, err := unix.Read(tempPipeReadFD, tempPipeBuf[:]); err != nil || n != 1 { panic(fmt.Sprintf("failed to drain pipe for gofer.blockUntilNonblockingPipeHasWriter: got (%d, %v), wanted (1, nil)", n, err)) } return true, nil } func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error { var q waiter.NeverReady left, ok := ctx.BlockWithTimeoutOn(&q, waiter.EventIn, 100*time.Millisecond) if !ok && left != 0 { return linuxerr.ErrInterrupted } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/lisafs_dentry.go000066400000000000000000000533171465435605700265520ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "fmt" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" ) func (fs *filesystem) handleAnameLisafs(ctx context.Context, rootInode lisafs.Inode) (lisafs.Inode, error) { if fs.opts.aname == "/" { return rootInode, nil } // Walk to the attach point from root inode. aname is always absolute. rootFD := fs.client.NewFD(rootInode.ControlFD) status, inodes, err := rootFD.WalkMultiple(ctx, strings.Split(fs.opts.aname, "/")[1:]) if err != nil { return lisafs.Inode{}, err } // Close all intermediate FDs to the attach point. rootFD.Close(ctx, false /* flush */) numInodes := len(inodes) for i := 0; i < numInodes-1; i++ { curFD := fs.client.NewFD(inodes[i].ControlFD) curFD.Close(ctx, false /* flush */) } switch status { case lisafs.WalkSuccess: return inodes[numInodes-1], nil default: if numInodes > 0 { last := fs.client.NewFD(inodes[numInodes-1].ControlFD) last.Close(ctx, false /* flush */) } log.Warningf("initClient failed because walk to attach point %q failed: lisafs.WalkStatus = %v", fs.opts.aname, status) return lisafs.Inode{}, linuxerr.ENOENT } } // lisafsDentry is a gofer dentry implementation. It represents a dentry backed // by a lisafs connection. // // +stateify savable type lisafsDentry struct { dentry // controlFD is used by lisafs to perform path based operations on this // dentry. controlFD is immutable. // // if !controlFD.Ok(), this dentry represents a synthetic file, i.e. a // file that does not exist on the remote filesystem. As of this writing, the // only files that can be synthetic are sockets, pipes, and directories. controlFD lisafs.ClientFD `state:"nosave"` // If this dentry represents a regular file or directory, readFDLisa is a // LISAFS FD used for reads by all regularFileFDs/directoryFDs representing // this dentry. readFDLisa is protected by dentry.handleMu. readFDLisa lisafs.ClientFD `state:"nosave"` // If this dentry represents a regular file, writeFDLisa is the LISAFS FD // used for writes by all regularFileFDs representing this dentry. 
// readFDLisa and writeFDLisa may or may not represent the same LISAFS FD. // Once either transitions from closed (Ok() == false) to open // (Ok() == true), it may be mutated with dentry.handleMu locked, but cannot // be closed until the dentry is destroyed. writeFDLisa is protected by // dentry.handleMu. writeFDLisa lisafs.ClientFD `state:"nosave"` } // newLisafsDentry creates a new dentry representing the given file. The dentry // initially has no references, but is not cached; it is the caller's // responsibility to set the dentry's reference count and/or call // dentry.checkCachingLocked() as appropriate. // newLisafsDentry takes ownership of ino. func (fs *filesystem) newLisafsDentry(ctx context.Context, ino *lisafs.Inode) (*dentry, error) { if ino.Stat.Mask&linux.STATX_TYPE == 0 { ctx.Warningf("can't create gofer.dentry without file type") fs.client.CloseFD(ctx, ino.ControlFD, false /* flush */) return nil, linuxerr.EIO } if ino.Stat.Mode&linux.FileTypeMask == linux.ModeRegular && ino.Stat.Mask&linux.STATX_SIZE == 0 { ctx.Warningf("can't create regular file gofer.dentry without file size") fs.client.CloseFD(ctx, ino.ControlFD, false /* flush */) return nil, linuxerr.EIO } inoKey := inoKeyFromStatx(&ino.Stat) d := &lisafsDentry{ dentry: dentry{ fs: fs, inoKey: inoKey, ino: fs.inoFromKey(inoKey), mode: atomicbitops.FromUint32(uint32(ino.Stat.Mode)), uid: atomicbitops.FromUint32(uint32(fs.opts.dfltuid)), gid: atomicbitops.FromUint32(uint32(fs.opts.dfltgid)), blockSize: atomicbitops.FromUint32(hostarch.PageSize), readFD: atomicbitops.FromInt32(-1), writeFD: atomicbitops.FromInt32(-1), mmapFD: atomicbitops.FromInt32(-1), }, controlFD: fs.client.NewFD(ino.ControlFD), } if ino.Stat.Mask&linux.STATX_UID != 0 { d.uid = atomicbitops.FromUint32(dentryUID(lisafs.UID(ino.Stat.UID))) } if ino.Stat.Mask&linux.STATX_GID != 0 { d.gid = atomicbitops.FromUint32(dentryGID(lisafs.GID(ino.Stat.GID))) } if ino.Stat.Mask&linux.STATX_SIZE != 0 { d.size = atomicbitops.FromUint64(ino.Stat.Size) } if ino.Stat.Blksize != 0 { d.blockSize = atomicbitops.FromUint32(ino.Stat.Blksize) } if ino.Stat.Mask&linux.STATX_ATIME != 0 { d.atime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Atime)) } else { d.atime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds()) } if ino.Stat.Mask&linux.STATX_MTIME != 0 { d.mtime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Mtime)) } else { d.mtime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds()) } if ino.Stat.Mask&linux.STATX_CTIME != 0 { d.ctime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Ctime)) } else { // Approximate ctime with mtime if ctime isn't available. 
d.ctime = atomicbitops.FromInt64(d.mtime.Load()) } if ino.Stat.Mask&linux.STATX_BTIME != 0 { d.btime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Btime)) } if ino.Stat.Mask&linux.STATX_NLINK != 0 { d.nlink = atomicbitops.FromUint32(ino.Stat.Nlink) } else { if ino.Stat.Mode&linux.FileTypeMask == linux.ModeDirectory { d.nlink = atomicbitops.FromUint32(2) } else { d.nlink = atomicbitops.FromUint32(1) } } d.dentry.init(d) fs.syncMu.Lock() fs.syncableDentries.PushBack(&d.syncableListEntry) fs.syncMu.Unlock() return &d.dentry, nil } func (d *lisafsDentry) openHandle(ctx context.Context, flags uint32) (handle, error) { openFD, hostFD, err := d.controlFD.OpenAt(ctx, flags) if err != nil { return noHandle, err } return handle{ fdLisa: d.controlFD.Client().NewFD(openFD), fd: int32(hostFD), }, nil } func (d *lisafsDentry) updateHandles(ctx context.Context, h handle, readable, writable bool) { // Switch to new LISAFS FDs. Note that the read, write and mmap host FDs are // updated separately. oldReadFD := lisafs.InvalidFDID if readable { oldReadFD = d.readFDLisa.ID() d.readFDLisa = h.fdLisa } oldWriteFD := lisafs.InvalidFDID if writable { oldWriteFD = d.writeFDLisa.ID() d.writeFDLisa = h.fdLisa } // NOTE(b/141991141): Close old FDs before making new fids visible (by // unlocking d.handleMu). if oldReadFD.Ok() { d.fs.client.CloseFD(ctx, oldReadFD, false /* flush */) } if oldWriteFD.Ok() && oldReadFD != oldWriteFD { d.fs.client.CloseFD(ctx, oldWriteFD, false /* flush */) } } // Precondition: d.metadataMu must be locked. // // +checklocks:d.metadataMu func (d *lisafsDentry) updateMetadataLocked(ctx context.Context, h handle) error { handleMuRLocked := false if !h.fdLisa.Ok() { // Use open FDs in preferenece to the control FD. This may be significantly // more efficient in some implementations. Prefer a writable FD over a // readable one since some filesystem implementations may update a writable // FD's metadata after writes, without making metadata updates immediately // visible to read-only FDs representing the same file. d.handleMu.RLock() switch { case d.writeFDLisa.Ok(): h.fdLisa = d.writeFDLisa handleMuRLocked = true case d.readFDLisa.Ok(): h.fdLisa = d.readFDLisa handleMuRLocked = true default: h.fdLisa = d.controlFD d.handleMu.RUnlock() } } var stat linux.Statx err := h.fdLisa.StatTo(ctx, &stat) if handleMuRLocked { // handleMu must be released before updateMetadataFromStatLocked(). d.handleMu.RUnlock() // +checklocksforce: complex case. } if err != nil { return err } d.updateMetadataFromStatxLocked(&stat) return nil } func chmod(ctx context.Context, controlFD lisafs.ClientFD, mode uint16) error { setStat := linux.Statx{ Mask: linux.STATX_MODE, Mode: mode, } _, failureErr, err := controlFD.SetStat(ctx, &setStat) if err != nil { return err } return failureErr } func (d *lisafsDentry) destroy(ctx context.Context) { if d.readFDLisa.Ok() && d.readFDLisa.ID() != d.writeFDLisa.ID() { d.readFDLisa.Close(ctx, false /* flush */) } if d.writeFDLisa.Ok() { d.writeFDLisa.Close(ctx, false /* flush */) } if d.controlFD.Ok() { // Close the control FD. Propagate the Close RPCs immediately to the server // if the dentry being destroyed is a deleted regular file. This is to // release the disk space on remote immediately. This will flush the above // read/write lisa FDs as well. 
flushClose := d.isDeleted() && d.isRegularFile() d.controlFD.Close(ctx, flushClose) } } func (d *lisafsDentry) getRemoteChild(ctx context.Context, name string) (*dentry, error) { childInode, err := d.controlFD.Walk(ctx, name) if err != nil { return nil, err } return d.fs.newLisafsDentry(ctx, &childInode) } // Preconditions: // - fs.renameMu must be locked. // - d.opMu must be locked. // - d.isDir(). // - !rp.done() && rp.Component() is not "." or "..". // // Postcondition: The returned dentry is already cached appropriately. func (d *lisafsDentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp resolvingPath, ds **[]*dentry) (*dentry, error) { // Collect as many path components as possible to walk. var namesArr [16]string // arbitrarily sized array to help avoid slice allocation. names := namesArr[:0] rp.getComponents(func(name string) bool { if name == "." { return true } if name == ".." { return false } names = append(names, name) return true }) // Walk as much of the path as possible in 1 RPC. _, inodes, err := d.controlFD.WalkMultiple(ctx, names) if err != nil { return nil, err } if len(inodes) == 0 { // d.opMu is locked. So a new child could not have appeared concurrently. // It should be safe to mark this as a negative entry. d.childrenMu.Lock() defer d.childrenMu.Unlock() d.cacheNegativeLookupLocked(names[0]) return nil, linuxerr.ENOENT } // Add the walked inodes into the dentry tree. startParent := &d.dentry curParent := startParent curParentLock := func() { if curParent != startParent { curParent.opMu.RLock() } curParent.childrenMu.Lock() } curParentUnlock := func() { curParent.childrenMu.Unlock() if curParent != startParent { curParent.opMu.RUnlock() // +checklocksforce: locked via curParentLock(). } } var ret *dentry var dentryCreationErr error for i := range inodes { if dentryCreationErr != nil { d.fs.client.CloseFD(ctx, inodes[i].ControlFD, false /* flush */) continue } curParentLock() // Did we race with another walk + cache operation? child, ok := curParent.children[names[i]] // +checklocksforce: locked via curParentLock() if ok && child != nil { // We raced. Clean up the new inode and proceed with // the cached child. d.fs.client.CloseFD(ctx, inodes[i].ControlFD, false /* flush */) } else { // Create and cache the new dentry. var err error child, err = d.fs.newLisafsDentry(ctx, &inodes[i]) if err != nil { dentryCreationErr = err curParentUnlock() continue } curParent.cacheNewChildLocked(child, names[i]) // +checklocksforce: locked via curParentLock(). } curParentUnlock() // For now, child has 0 references, so our caller should call // child.checkCachingLocked(). curParent gained a ref so we should also // call curParent.checkCachingLocked() so it can be removed from the cache // if needed. We only do that for the first iteration because all // subsequent parents would have already been added to ds. 
if i == 0 { *ds = appendDentry(*ds, curParent) } *ds = appendDentry(*ds, child) curParent = child if i == 0 { ret = child } } return ret, dentryCreationErr } func (d *lisafsDentry) newChildDentry(ctx context.Context, childIno *lisafs.Inode, childName string) (*dentry, error) { child, err := d.fs.newLisafsDentry(ctx, childIno) if err != nil { if err := d.controlFD.UnlinkAt(ctx, childName, 0 /* flags */); err != nil { log.Warningf("failed to clean up created child %s after newLisafsDentry() failed: %v", childName, err) } } return child, err } func (d *lisafsDentry) mknod(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) { if _, ok := opts.Endpoint.(transport.HostBoundEndpoint); !ok { childInode, err := d.controlFD.MknodAt(ctx, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID), opts.DevMinor, opts.DevMajor) if err != nil { return nil, err } return d.newChildDentry(ctx, &childInode, name) } // This mknod(2) is coming from unix bind(2), as opts.Endpoint is set. sockType := opts.Endpoint.(transport.Endpoint).Type() childInode, boundSocketFD, err := d.controlFD.BindAt(ctx, sockType, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID)) if err != nil { return nil, err } hbep := opts.Endpoint.(transport.HostBoundEndpoint) if err := hbep.SetBoundSocketFD(ctx, boundSocketFD); err != nil { if err := d.controlFD.UnlinkAt(ctx, name, 0 /* flags */); err != nil { log.Warningf("failed to clean up socket which was created by BindAt RPC: %v", err) } d.fs.client.CloseFD(ctx, childInode.ControlFD, false /* flush */) return nil, err } child, err := d.newChildDentry(ctx, &childInode, name) if err != nil { hbep.ResetBoundSocketFD(ctx) return nil, err } // Set the endpoint on the newly created child dentry. child.endpoint = opts.Endpoint return child, nil } func (d *lisafsDentry) link(ctx context.Context, target *lisafsDentry, name string) (*dentry, error) { linkInode, err := d.controlFD.LinkAt(ctx, target.controlFD.ID(), name) if err != nil { return nil, err } // TODO(gvisor.dev/issue/6739): Hard linked dentries should share the same // inode fields. return d.newChildDentry(ctx, &linkInode, name) } func (d *lisafsDentry) mkdir(ctx context.Context, name string, mode linux.FileMode, uid auth.KUID, gid auth.KGID) (*dentry, error) { childDirInode, err := d.controlFD.MkdirAt(ctx, name, mode, lisafs.UID(uid), lisafs.GID(gid)) if err != nil { return nil, err } return d.newChildDentry(ctx, &childDirInode, name) } func (d *lisafsDentry) symlink(ctx context.Context, name, target string, creds *auth.Credentials) (*dentry, error) { symlinkInode, err := d.controlFD.SymlinkAt(ctx, name, target, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID)) if err != nil { return nil, err } return d.newChildDentry(ctx, &symlinkInode, name) } func (d *lisafsDentry) openCreate(ctx context.Context, name string, flags uint32, mode linux.FileMode, uid auth.KUID, gid auth.KGID) (*dentry, handle, error) { ino, openFD, hostFD, err := d.controlFD.OpenCreateAt(ctx, name, flags, mode, lisafs.UID(uid), lisafs.GID(gid)) if err != nil { return nil, noHandle, err } h := handle{ fdLisa: d.fs.client.NewFD(openFD), fd: int32(hostFD), } child, err := d.fs.newLisafsDentry(ctx, &ino) if err != nil { h.close(ctx) return nil, noHandle, err } return child, h, nil } // lisafsGetdentsCount is the number of bytes of dirents to read from the // server in each Getdents RPC. This value is consistent with vfs1 client. 
const lisafsGetdentsCount = int32(64 * 1024) // Preconditions: // - getDirents may not be called concurrently with another getDirents call. func (d *lisafsDentry) getDirentsLocked(ctx context.Context, recordDirent func(name string, key inoKey, dType uint8)) error { // shouldSeek0 indicates whether the server should SEEK to 0 before reading // directory entries. shouldSeek0 := true for { count := lisafsGetdentsCount if shouldSeek0 { // See lisafs.Getdents64Req.Count. count = -count shouldSeek0 = false } dirents, err := d.readFDLisa.Getdents64(ctx, count) if err != nil { return err } if len(dirents) == 0 { return nil } for i := range dirents { name := string(dirents[i].Name) if name == "." || name == ".." { continue } recordDirent(name, inoKey{ ino: uint64(dirents[i].Ino), devMinor: uint32(dirents[i].DevMinor), devMajor: uint32(dirents[i].DevMajor), }, uint8(dirents[i].Type)) } } } func flush(ctx context.Context, fd lisafs.ClientFD) error { if fd.Ok() { return fd.Flush(ctx) } return nil } func (d *lisafsDentry) statfs(ctx context.Context) (linux.Statfs, error) { var statFS lisafs.StatFS if err := d.controlFD.StatFSTo(ctx, &statFS); err != nil { return linux.Statfs{}, err } return linux.Statfs{ BlockSize: statFS.BlockSize, FragmentSize: statFS.BlockSize, Blocks: statFS.Blocks, BlocksFree: statFS.BlocksFree, BlocksAvailable: statFS.BlocksAvailable, Files: statFS.Files, FilesFree: statFS.FilesFree, NameLength: statFS.NameLength, }, nil } func (d *lisafsDentry) restoreFile(ctx context.Context, inode *lisafs.Inode, opts *vfs.CompleteRestoreOptions) error { d.controlFD = d.fs.client.NewFD(inode.ControlFD) // Gofers do not preserve inoKey across checkpoint/restore, so: // // - We must assume that the remote filesystem did not change in a way that // would invalidate dentries, since we can't revalidate dentries by // checking inoKey. // // - We need to associate the new inoKey with the existing d.ino. d.inoKey = inoKeyFromStatx(&inode.Stat) d.fs.inoMu.Lock() d.fs.inoByKey[d.inoKey] = d.ino d.fs.inoMu.Unlock() // Check metadata stability before updating metadata. d.metadataMu.Lock() defer d.metadataMu.Unlock() if d.isRegularFile() { if opts.ValidateFileSizes { if inode.Stat.Mask&linux.STATX_SIZE == 0 { return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: file size not available", genericDebugPathname(&d.dentry))} } if d.size.RacyLoad() != inode.Stat.Size { return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(&d.dentry), d.size.Load(), inode.Stat.Size)} } } if opts.ValidateFileModificationTimestamps { if inode.Stat.Mask&linux.STATX_MTIME == 0 { return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime not available", genericDebugPathname(&d.dentry))} } if want := dentryTimestamp(inode.Stat.Mtime); d.mtime.RacyLoad() != want { return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(&d.dentry), linux.NsecToStatxTimestamp(d.mtime.RacyLoad()), linux.NsecToStatxTimestamp(want))} } } } if !d.cachedMetadataAuthoritative() { d.updateMetadataFromStatxLocked(&inode.Stat) } if rw, ok := d.fs.savedDentryRW[&d.dentry]; ok { if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil { return err } } return nil } // doRevalidationLisafs stats all dentries in `state`. 
It will update or // invalidate dentries in the cache based on the result. // // Preconditions: // - fs.renameMu must be locked. // - InteropModeShared is in effect. func doRevalidationLisafs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, state *revalidateState, ds **[]*dentry) error { start := state.start.impl.(*lisafsDentry) // Populate state.names. state.names = state.names[:0] // For sanity. if state.refreshStart { state.names = append(state.names, "") } for _, d := range state.dentries { state.names = append(state.names, d.name) } // Lock metadata on all dentries *before* getting attributes for them. if state.refreshStart { start.metadataMu.Lock() defer start.metadataMu.Unlock() } for _, d := range state.dentries { d.metadataMu.Lock() } // lastUnlockedDentry keeps track of the dentries in state.dentries that have // already had their metadataMu unlocked. Avoid defer unlock in the loop // above to avoid heap allocation. lastUnlockedDentry := -1 defer func() { // Advance to the first unevaluated dentry and unlock the remaining // dentries. for lastUnlockedDentry++; lastUnlockedDentry < len(state.dentries); lastUnlockedDentry++ { state.dentries[lastUnlockedDentry].metadataMu.Unlock() } }() // Make WalkStat RPC. stats, err := start.controlFD.WalkStat(ctx, state.names) if err != nil { return err } if state.refreshStart { if len(stats) > 0 { // First dentry is where the search is starting, just update attributes // since it cannot be replaced. start.updateMetadataFromStatxLocked(&stats[0]) // +checklocksforce: see above. stats = stats[1:] } } for i := 0; i < len(state.dentries); i++ { d := state.dentries[i] found := i < len(stats) // Advance lastUnlockedDentry. It is the responsibility of this for loop // block to unlock d.metadataMu. lastUnlockedDentry = i // Note that synthetic dentries will always fail this comparison check. if !found || d.inoKey != inoKeyFromStatx(&stats[i]) { d.metadataMu.Unlock() if !found && d.isSynthetic() { // We have a synthetic file, and no remote file has arisen to replace // it. return nil } // The file at this path has changed or no longer exists. Mark the // dentry invalidated. d.invalidate(ctx, vfsObj, ds) return nil } // The file at this path hasn't changed. Just update cached metadata. d.impl.(*lisafsDentry).updateMetadataFromStatxLocked(&stats[i]) // +checklocksforce: see above. d.metadataMu.Unlock() } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/regular_file.go000066400000000000000000000706631465435605700263470ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package gofer import ( "fmt" "io" "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) func (d *dentry) isRegularFile() bool { return d.fileType() == linux.S_IFREG } // +stateify savable type regularFileFD struct { fileDescription // off is the file offset. off is protected by mu. mu sync.Mutex `state:"nosave"` off int64 } func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD, error) { fd := ®ularFileFD{} fd.LockFD.Init(&d.locks) if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { return nil, err } if fd.vfsfd.IsWritable() && (d.mode.Load()&0111 != 0) { metric.SuspiciousOperationsMetric.Increment(&metric.SuspiciousOperationsTypeOpenedWriteExecuteFile) } if d.mmapFD.Load() >= 0 { fsmetric.GoferOpensHost.Increment() } else { fsmetric.GoferOpens9P.Increment() } return fd, nil } // Release implements vfs.FileDescriptionImpl.Release. func (fd *regularFileFD) Release(context.Context) { } // OnClose implements vfs.FileDescriptionImpl.OnClose. func (fd *regularFileFD) OnClose(ctx context.Context) error { if !fd.vfsfd.IsWritable() { return nil } d := fd.dentry() if d.fs.opts.interop == InteropModeExclusive { // d may have dirty pages that we won't write back now (and wouldn't // have in VFS1), making a flushf RPC ineffective. If this is the case, // skip the flushf. // // Note that it's also possible to have dirty pages under other interop // modes if forcePageCache is in effect; we conservatively assume that // applications have some way of tolerating this and still want the // flushf. d.dataMu.RLock() haveDirtyPages := !d.dirty.IsEmpty() d.dataMu.RUnlock() if haveDirtyPages { return nil } } return d.flush(ctx) } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { d := fd.dentry() return d.doAllocate(ctx, offset, length, func() error { return d.allocate(ctx, mode, offset, length) }) } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { start := fsmetric.StartReadWait() d := fd.dentry() defer func() { if d.readFD.Load() >= 0 { fsmetric.GoferReadsHost.Increment() fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start) } else { fsmetric.GoferReads9P.Increment() fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start) } }() if offset < 0 { return 0, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, linuxerr.EOPNOTSUPP } // Check for reading at EOF before calling into MM (but not under // InteropModeShared, which makes d.size unreliable). if d.cachedMetadataAuthoritative() && uint64(offset) >= d.size.Load() { return 0, io.EOF } var ( n int64 readErr error ) if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { // Write dirty cached pages that will be touched by the read back to // the remote file. 
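// Otherwise the direct read below could observe remote data that is older
// than writes still buffered in the sentry page cache.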
if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil { return 0, err } rw := getDentryReadWriter(ctx, d, offset) // Require the read to go to the remote file. rw.direct = true n, readErr = dst.CopyOutFrom(ctx, rw) putDentryReadWriter(rw) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). d.touchAtimeLocked(fd.vfsfd.Mount()) } } else { rw := getDentryReadWriter(ctx, d, offset) n, readErr = dst.CopyOutFrom(ctx, rw) putDentryReadWriter(rw) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). d.touchAtime(fd.vfsfd.Mount()) } } return n, readErr } // Read implements vfs.FileDescriptionImpl.Read. func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { fd.mu.Lock() n, err := fd.PRead(ctx, dst, fd.off, opts) fd.off += n fd.mu.Unlock() return n, err } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, _, err := fd.pwrite(ctx, src, offset, opts) return n, err } // pwrite returns the number of bytes written, final offset, error. The final // offset should be ignored by PWrite. func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { return 0, offset, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, offset, linuxerr.EOPNOTSUPP } d := fd.dentry() d.metadataMu.Lock() defer d.metadataMu.Unlock() // If the fd was opened with O_APPEND, make sure the file size is updated. // There is a possible race here if size is modified externally after // metadata cache is updated. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { if err := d.refreshSizeLocked(ctx); err != nil { return 0, offset, err } } // Set offset to file size if the fd was opened with O_APPEND. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { // Holding d.metadataMu is sufficient for reading d.size. offset = int64(d.size.RacyLoad()) } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { return 0, offset, err } src = src.TakeFirst64(limit) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking // d.metadataMu (recursively). d.touchCMtimeLocked() } rw := getDentryReadWriter(ctx, d, offset) defer putDentryReadWriter(rw) if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { if err := fd.writeCache(ctx, d, offset, src); err != nil { return 0, offset, err } // Require the write to go to the remote file. rw.direct = true } n, err := src.CopyInTo(ctx, rw) if err != nil { return n, offset + n, err } if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { // Note that if any of the following fail, then we can't guarantee that // any data was actually written with the semantics of O_DSYNC or // O_SYNC, so we return zero bytes written. Compare Linux's // mm/filemap.c:generic_file_write_iter() => // include/linux/fs.h:generic_write_sync(). // // Write dirty cached pages touched by the write back to the remote // file. 
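// An explicit sync of the remote file follows below; both steps must succeed
// before any bytes are reported as written.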
if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { return 0, offset, err } // Request the remote filesystem to sync the remote file. if err := d.syncRemoteFile(ctx); err != nil { return 0, offset, err } } // As with Linux, writing clears the setuid and setgid bits. if n > 0 { oldMode := d.mode.Load() // If setuid or setgid were set, update d.mode and propagate // changes to the host. if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode { if err := d.chmod(ctx, uint16(newMode)); err != nil { return 0, offset, err } d.mode.Store(newMode) } } return n, offset + n, nil } func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error { // Write dirty cached pages that will be touched by the write back to // the remote file. if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { return err } // Remove touched pages from the cache. pgstart := hostarch.PageRoundDown(uint64(offset)) pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes())) if !ok { return linuxerr.EINVAL } mr := memmap.MappableRange{pgstart, pgend} var freed []memmap.FileRange d.dataMu.Lock() cseg := d.cache.LowerBoundSegment(mr.Start) for cseg.Ok() && cseg.Start() < mr.End { cseg = d.cache.Isolate(cseg, mr) freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) cseg = d.cache.Remove(cseg).NextSegment() } d.dataMu.Unlock() // Invalidate mappings of removed pages. d.mapsMu.Lock() d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) d.mapsMu.Unlock() // Finally free pages removed from the cache. mf := d.fs.mf for _, freedFR := range freed { mf.DecRef(freedFR) } return nil } // Write implements vfs.FileDescriptionImpl.Write. func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.mu.Lock() n, off, err := fd.pwrite(ctx, src, fd.off, opts) fd.off = off fd.mu.Unlock() return n, err } type dentryReadWriter struct { ctx context.Context d *dentry off uint64 direct bool } var dentryReadWriterPool = sync.Pool{ New: func() any { return &dentryReadWriter{} }, } func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter { rw := dentryReadWriterPool.Get().(*dentryReadWriter) rw.ctx = ctx rw.d = d rw.off = uint64(offset) rw.direct = false return rw } func putDentryReadWriter(rw *dentryReadWriter) { rw.ctx = nil rw.d = nil dentryReadWriterPool.Put(rw) } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { if dsts.IsEmpty() { return 0, nil } // If we have a mmappable host FD (which must be used here to ensure // coherence with memory-mapped I/O), or if InteropModeShared is in effect // (which prevents us from caching file contents and makes dentry.size // unreliable), or if the file was opened O_DIRECT, read directly from // readHandle() without locking dentry.dataMu. rw.d.handleMu.RLock() h := rw.d.readHandle() if (rw.d.mmapFD.RacyLoad() >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off) rw.d.handleMu.RUnlock() rw.off += n return n, err } // Otherwise read from/through the cache. 
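// The loop below walks the cache segment by segment: cache hits are copied
// out through internal mappings, and gaps are either filled from the remote
// file into the cache (when the MemoryFile considers caching evictable pages
// worthwhile) or read through directly without being cached.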
memCgID := pgalloc.MemoryCgroupIDFromContext(rw.ctx) mf := rw.d.fs.mf fillCache := mf.ShouldCacheEvictable() var dataMuUnlock func() if fillCache { rw.d.dataMu.Lock() dataMuUnlock = rw.d.dataMu.Unlock } else { rw.d.dataMu.RLock() dataMuUnlock = rw.d.dataMu.RUnlock } // Compute the range to read (limited by file size and overflow-checked). end := rw.d.size.Load() if rw.off >= end { dataMuUnlock() rw.d.handleMu.RUnlock() return 0, io.EOF } if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { end = rend } var done uint64 seg, gap := rw.d.cache.Find(rw.off) for rw.off < end { mr := memmap.MappableRange{rw.off, end} switch { case seg.Ok(): // Get internal mappings from the cache. ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read) if err != nil { dataMuUnlock() rw.d.handleMu.RUnlock() return done, err } // Copy from internal mappings. n, err := safemem.CopySeq(dsts, ims) done += n rw.off += n dsts = dsts.DropFirst64(n) if err != nil { dataMuUnlock() rw.d.handleMu.RUnlock() return done, err } // Continue. seg, gap = seg.NextNonEmpty() case gap.Ok(): gapMR := gap.Range().Intersect(mr) if fillCache { // Read into the cache, then re-enter the loop to read from the // cache. gapEnd, _ := hostarch.PageRoundUp(gapMR.End) reqMR := memmap.MappableRange{ Start: hostarch.PageRoundDown(gapMR.Start), End: gapEnd, } optMR := gap.Range() _, err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size.Load(), mf, pgalloc.AllocOpts{ Kind: usage.PageCache, MemCgID: memCgID, Mode: pgalloc.AllocateAndWritePopulate, }, h.readToBlocksAt) mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) seg, gap = rw.d.cache.Find(rw.off) if !seg.Ok() { dataMuUnlock() rw.d.handleMu.RUnlock() return done, err } // err might have occurred in part of gap.Range() outside gapMR // (in particular, gap.End() might be beyond EOF). Forget about // it for now; if the error matters and persists, we'll run // into it again in a later iteration of this loop. } else { // Read directly from the file. gapDsts := dsts.TakeFirst64(gapMR.Length()) n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) done += n rw.off += n dsts = dsts.DropFirst64(n) // Partial reads are fine. But we must stop reading. if n != gapDsts.NumBytes() || err != nil { dataMuUnlock() rw.d.handleMu.RUnlock() return done, err } // Continue. seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} } } } dataMuUnlock() rw.d.handleMu.RUnlock() return done, nil } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. // // Preconditions: rw.d.metadataMu must be locked. func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { if srcs.IsEmpty() { return 0, nil } // If we have a mmappable host FD (which must be used here to ensure // coherence with memory-mapped I/O), or if InteropModeShared is in effect // (which prevents us from caching file contents), or if the file was // opened with O_DIRECT, write directly to dentry.writeHandle() // without locking dentry.dataMu. rw.d.handleMu.RLock() h := rw.d.writeHandle() if (rw.d.mmapFD.RacyLoad() >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off) rw.off += n rw.d.dataMu.Lock() if rw.off > rw.d.size.Load() { rw.d.size.Store(rw.off) // The remote file's size will implicitly be extended to the correct // value when we write back to it. 
} rw.d.dataMu.Unlock() rw.d.handleMu.RUnlock() return n, err } // Otherwise write to/through the cache. mf := rw.d.fs.mf rw.d.dataMu.Lock() // Compute the range to write (overflow-checked). start := rw.off end := rw.off + srcs.NumBytes() if end <= rw.off { end = math.MaxInt64 } var ( done uint64 retErr error ) seg, gap := rw.d.cache.Find(rw.off) for rw.off < end { mr := memmap.MappableRange{rw.off, end} switch { case seg.Ok(): // Get internal mappings from the cache. segMR := seg.Range().Intersect(mr) ims, err := mf.MapInternal(seg.FileRangeOf(segMR), hostarch.Write) if err != nil { retErr = err goto exitLoop } // Copy to internal mappings. n, err := safemem.CopySeq(ims, srcs) done += n rw.off += n srcs = srcs.DropFirst64(n) rw.d.dirty.MarkDirty(segMR) if err != nil { retErr = err goto exitLoop } // Continue. seg, gap = seg.NextNonEmpty() case gap.Ok(): // Write directly to the file. At present, we never fill the cache // when writing, since doing so can convert small writes into // inefficient read-modify-write cycles, and we have no mechanism // for detecting or avoiding this. gapMR := gap.Range().Intersect(mr) gapSrcs := srcs.TakeFirst64(gapMR.Length()) n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) done += n rw.off += n srcs = srcs.DropFirst64(n) // Partial writes are fine. But we must stop writing. if n != gapSrcs.NumBytes() || err != nil { retErr = err goto exitLoop } // Continue. seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} } } exitLoop: if rw.off > rw.d.size.Load() { rw.d.size.Store(rw.off) // The remote file's size will implicitly be extended to the correct // value when we write back to it. } // If InteropModeWritethrough is in effect, flush written data back to the // remote filesystem. if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 { if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{ Start: start, End: rw.off, }, &rw.d.cache, &rw.d.dirty, rw.d.size.Load(), mf, h.writeFromBlocksAt); err != nil { // We have no idea how many bytes were actually flushed. rw.off = start done = 0 retErr = err } } rw.d.dataMu.Unlock() rw.d.handleMu.RUnlock() return done, retErr } func (d *dentry) writeback(ctx context.Context, offset, size int64) error { if size == 0 { return nil } d.handleMu.RLock() defer d.handleMu.RUnlock() h := d.writeHandle() d.dataMu.Lock() defer d.dataMu.Unlock() // Compute the range of valid bytes (overflow-checked). dentrySize := d.size.Load() if uint64(offset) >= dentrySize { return nil } end := int64(dentrySize) if rend := offset + size; rend > offset && rend < end { end = rend } return fsutil.SyncDirty(ctx, memmap.MappableRange{ Start: uint64(offset), End: uint64(end), }, &d.cache, &d.dirty, dentrySize, d.fs.mf, h.writeFromBlocksAt) } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence) if err != nil { return 0, err } fd.off = newOffset return newOffset, nil } // Calculate the new offset for a seek operation on a regular file. func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) { switch whence { case linux.SEEK_SET: // Use offset as specified. case linux.SEEK_CUR: offset += fdOffset case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE: // Ensure file size is up to date. 
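// All three cases need an up-to-date d.size: SEEK_END offsets are relative
// to it, and SEEK_DATA/SEEK_HOLE below treat [0, size) as a single
// contiguous extent of data with the only hole at EOF.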
if !d.cachedMetadataAuthoritative() { if err := d.updateMetadata(ctx); err != nil { return 0, err } } size := int64(d.size.Load()) // For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous // block of data. switch whence { case linux.SEEK_END: offset += size case linux.SEEK_DATA: if offset >= size { return 0, linuxerr.ENXIO } // Use offset as specified. case linux.SEEK_HOLE: if offset >= size { return 0, linuxerr.ENXIO } offset = size } default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } return offset, nil } // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *regularFileFD) Sync(ctx context.Context) error { return fd.dentry().syncCachedFile(ctx, false /* forFilesystemSync */) } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { d := fd.dentry() // Force sentry page caching at your own risk. if !d.fs.opts.forcePageCache { switch d.fs.opts.interop { case InteropModeExclusive: // Any mapping is fine. case InteropModeWritethrough: // Shared writable mappings require a host FD, since otherwise we // can't synchronously flush memory-mapped writes to the remote // file. if opts.Private || !opts.MaxPerms.Write { break } fallthrough case InteropModeShared: // All mappings require a host FD to be coherent with other // filesystem users. if d.mmapFD.Load() < 0 { return linuxerr.ENODEV } default: panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) } } // After this point, d may be used as a memmap.Mappable. d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init) opts.SentryOwnedContent = d.fs.opts.forcePageCache return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) } func (fs *filesystem) mayCachePagesInMemoryFile() bool { return fs.opts.forcePageCache || fs.opts.interop != InteropModeShared } // AddMapping implements memmap.Mappable.AddMapping. func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { d.mapsMu.Lock() mapped := d.mappings.AddMapping(ms, ar, offset, writable) // Do this unconditionally since whether we have a host FD can change // across save/restore. for _, r := range mapped { d.pf.hostFileMapper.IncRefOn(r) } if d.fs.mayCachePagesInMemoryFile() { // d.Evict() will refuse to evict memory-mapped pages, so tell the // MemoryFile to not bother trying. mf := d.fs.mf for _, r := range mapped { mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End}) } } d.mapsMu.Unlock() return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { d.mapsMu.Lock() unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable) for _, r := range unmapped { d.pf.hostFileMapper.DecRefOn(r) } if d.fs.mayCachePagesInMemoryFile() { // Pages that are no longer referenced by any application memory // mappings are now considered unused; allow MemoryFile to evict them // when necessary. mf := d.fs.mf d.dataMu.Lock() for _, r := range unmapped { // Since these pages are no longer mapped, they are no longer // concurrently dirtyable by a writable memory mapping. d.dirty.AllowClean(r) mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End}) } d.dataMu.Unlock() } d.mapsMu.Unlock() } // CopyMapping implements memmap.Mappable.CopyMapping. 
func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return d.AddMapping(ctx, ms, dstAR, offset, writable) } // Translate implements memmap.Mappable.Translate. func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { d.handleMu.RLock() if d.mmapFD.RacyLoad() >= 0 && !d.fs.opts.forcePageCache { d.handleMu.RUnlock() mr := optional if d.fs.opts.limitHostFDTranslation { mr = maxFillRange(required, optional) } return []memmap.Translation{ { Source: mr, File: &d.pf, Offset: mr.Start, Perms: hostarch.AnyAccess, }, }, nil } memCgID := pgalloc.MemoryCgroupIDFromContext(ctx) d.dataMu.Lock() // Constrain translations to d.size (rounded up) to prevent translation to // pages that may be concurrently truncated. pgend, _ := hostarch.PageRoundUp(d.size.Load()) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { d.dataMu.Unlock() d.handleMu.RUnlock() return nil, &memmap.BusError{io.EOF} } beyondEOF = true required.End = pgend } if optional.End > pgend { optional.End = pgend } mf := d.fs.mf h := d.readHandle() _, cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size.Load(), mf, pgalloc.AllocOpts{ Kind: usage.PageCache, MemCgID: memCgID, Mode: pgalloc.AllocateAndWritePopulate, }, h.readToBlocksAt) var ts []memmap.Translation var translatedEnd uint64 for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { segMR := seg.Range().Intersect(optional) // TODO(jamieliu): Make Translations writable even if writability is // not required if already kept-dirty by another writable translation. perms := hostarch.AccessType{ Read: true, Execute: true, } if at.Write { // From this point forward, this memory can be dirtied through the // mapping at any time. d.dirty.KeepDirty(segMR) perms.Write = true } ts = append(ts, memmap.Translation{ Source: segMR, File: mf, Offset: seg.FileRangeOf(segMR).Start, Perms: perms, }) translatedEnd = segMR.End } d.dataMu.Unlock() d.handleMu.RUnlock() // Don't return the error returned by c.cache.Fill if it occurred outside // of required. if translatedEnd < required.End && cerr != nil { return ts, &memmap.BusError{cerr} } if beyondEOF { return ts, &memmap.BusError{io.EOF} } return ts, nil } func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange { const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily if required.Length() >= maxReadahead { return required } if optional.Length() <= maxReadahead { return optional } optional.Start = required.Start if optional.Length() <= maxReadahead { return optional } optional.End = optional.Start + maxReadahead return optional } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (d *dentry) InvalidateUnsavable(ctx context.Context) error { // Whether we have a host fd (and consequently what memmap.File is // mapped) can change across save/restore, so invalidate all translations // unconditionally. d.mapsMu.Lock() defer d.mapsMu.Unlock() d.mappings.InvalidateAll(memmap.InvalidateOpts{}) // Write the cache's contents back to the remote file so that if we have a // host fd after restore, the remote file's contents are coherent. 
mf := d.fs.mf d.handleMu.RLock() defer d.handleMu.RUnlock() h := d.writeHandle() d.dataMu.Lock() defer d.dataMu.Unlock() if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { return err } // Discard the cache so that it's not stored in saved state. This is safe // because per InvalidateUnsavable invariants, no new translations can have // been returned after we invalidated all existing translations above. d.cache.DropAll(mf) d.dirty.RemoveAll() return nil } // Evict implements pgalloc.EvictableMemoryUser.Evict. func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { mr := memmap.MappableRange{er.Start, er.End} mf := d.fs.mf d.mapsMu.Lock() defer d.mapsMu.Unlock() d.handleMu.RLock() defer d.handleMu.RUnlock() h := d.writeHandle() d.dataMu.Lock() defer d.dataMu.Unlock() // Only allow pages that are no longer memory-mapped to be evicted. for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { mgapMR := mgap.Range().Intersect(mr) if mgapMR.Length() == 0 { continue } if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) } d.cache.Drop(mgapMR, mf) d.dirty.KeepClean(mgapMR) } } // dentryPlatformFile implements memmap.File. It exists solely because dentry // cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef. // // dentryPlatformFile is only used when a host FD representing the remote file // is available (i.e. dentry.mmapFD >= 0), and that FD is used for application // memory mappings (i.e. !filesystem.opts.forcePageCache). // // +stateify savable type dentryPlatformFile struct { memmap.NoBufferedIOFallback *dentry // fdRefs counts references on memmap.File offsets. fdRefs is protected // by dentry.dataMu. fdRefs fsutil.FrameRefSet // If this dentry represents a regular file, and dentry.mmapFD >= 0, // hostFileMapper caches mappings of dentry.mmapFD. hostFileMapper fsutil.HostFileMapper // hostFileMapperInitOnce is used to lazily initialize hostFileMapper. hostFileMapperInitOnce sync.Once `state:"nosave"` } // IncRef implements memmap.File.IncRef. func (d *dentryPlatformFile) IncRef(fr memmap.FileRange, memCgID uint32) { d.dataMu.Lock() d.fdRefs.IncRefAndAccount(fr, memCgID) d.dataMu.Unlock() } // DecRef implements memmap.File.DecRef. func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.DecRefAndAccount(fr) d.dataMu.Unlock() } // MapInternal implements memmap.File.MapInternal. func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { d.handleMu.RLock() defer d.handleMu.RUnlock() return d.hostFileMapper.MapInternal(fr, int(d.mmapFD.RacyLoad()), at.Write) } // FD implements memmap.File.FD. func (d *dentryPlatformFile) FD() int { d.handleMu.RLock() defer d.handleMu.RUnlock() return int(d.mmapFD.RacyLoad()) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/revalidate.go000066400000000000000000000263451465435605700260250ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) type errPartialRevalidation struct{} // Error implements error.Error. func (errPartialRevalidation) Error() string { return "partial revalidation" } type errRevalidationStepDone struct{} // Error implements error.Error. func (errRevalidationStepDone) Error() string { return "stop revalidation" } // revalidatePath checks cached dentries for external modification. File // attributes are refreshed and cache is invalidated in case the dentry has been // deleted, or a new file/directory created in its place. // // Revalidation stops at symlinks and mount points. The caller is responsible // for revalidating again after symlinks are resolved and after changing to // different mounts. // // Preconditions: // - fs.renameMu must be locked. func (fs *filesystem) revalidatePath(ctx context.Context, rpOrig resolvingPath, start *dentry, ds **[]*dentry) error { // Revalidation is done even if start is synthetic in case the path is // something like: ../non_synthetic_file. if fs.opts.interop != InteropModeShared { return nil } // Copy resolving path to walk the path for revalidation. rp := rpOrig.copy() err := fs.revalidate(ctx, rp, start, ds) rp.Release(ctx) return err } // revalidateOne does the same as revalidatePath, but checks a single dentry. // // Preconditions: // - fs.renameMu must be locked. // - parent must have up to date metadata. func (fs *filesystem) revalidateOne(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) error { // Skip revalidation for interop mode different than InteropModeShared or // if the parent is synthetic (child must be synthetic too, but it cannot be // replaced without first replacing the parent). if parent.cachedMetadataAuthoritative() { return nil } parent.childrenMu.Lock() child, ok := parent.children[name] parent.childrenMu.Unlock() if !ok { return nil } state := makeRevalidateState(parent, false /* refreshStart */) defer state.release() // Note that child can not be nil, because we don't cache negative entries // when InteropModeShared is in effect. state.add(child) return state.doRevalidation(ctx, vfsObj, ds) } // revalidate revalidates path components in rp until done returns true, or // until a mount point or symlink is reached. It may send multiple MultiGetAttr // calls to the gofer to handle ".." in the path. // // Preconditions: // - fs.renameMu must be locked. // - InteropModeShared is in effect. func (fs *filesystem) revalidate(ctx context.Context, rp resolvingPath, start *dentry, ds **[]*dentry) error { state := makeRevalidateState(start, true /* refreshStart */) defer state.release() done: for cur := start; !rp.done(); { var err error cur, err = fs.revalidateStep(ctx, rp, cur, state) if err != nil { switch err.(type) { case errPartialRevalidation: if err := state.doRevalidation(ctx, rp.VirtualFilesystem(), ds); err != nil { return err } // Reset state to release any remaining locks and restart from where // stepping stopped. 
state.reset(cur /* start */, true /* refreshStart */) case errRevalidationStepDone: break done default: return err } } } return state.doRevalidation(ctx, rp.VirtualFilesystem(), ds) } // revalidateStep walks one element of the path and updates revalidationState // with the dentry if needed. It may also stop the stepping or ask for a // partial revalidation. Partial revalidation requires the caller to revalidate // the current revalidationState, release all locks, and resume stepping. // In case a symlink is hit, revalidation stops and the caller is responsible // for calling revalidate again after the symlink is resolved. Revalidation may // also stop for other reasons, like hitting a child not in the cache. // // Returns: // - (dentry, nil): step worked, continue stepping.` // - (dentry, errPartialRevalidation): revalidation should be done with the // state gathered so far. Then continue stepping with the remainder of the // path, starting at `dentry`. // - (nil, errRevalidationStepDone): revalidation doesn't need to step any // further. It hit a symlink, a mount point, or an uncached dentry. // // Preconditions: // - fs.renameMu must be locked. // - !rp.Done(). // - InteropModeShared is in effect (assumes no negative dentries). func (fs *filesystem) revalidateStep(ctx context.Context, rp resolvingPath, d *dentry, state *revalidateState) (*dentry, error) { switch name := rp.Component(); name { case ".": // Do nothing. case "..": // Partial revalidation is required when ".." is hit because metadata locks // can only be acquired from parent to child to avoid deadlocks. if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, errRevalidationStepDone{} } else if isRoot || d.parent.Load() == nil { rp.Advance() return d, errPartialRevalidation{} } // We must assume that d.parent is correct, because if d has been moved // elsewhere in the remote filesystem so that its parent has changed, // we have no way of determining its new parent's location in the // filesystem. // // Call rp.CheckMount() before updating d.parent's metadata, since if // we traverse to another mount then d.parent's metadata is irrelevant. if err := rp.CheckMount(ctx, &d.parent.Load().vfsd); err != nil { return nil, errRevalidationStepDone{} } rp.Advance() return d.parent.Load(), errPartialRevalidation{} default: d.childrenMu.Lock() child, ok := d.children[name] d.childrenMu.Unlock() if !ok { // child is not cached, no need to validate any further. return nil, errRevalidationStepDone{} } // Note that child can not be nil, because we don't cache negative entries // when InteropModeShared is in effect. state.add(child) // Symlink must be resolved before continuing with revalidation. if child.isSymlink() { return nil, errRevalidationStepDone{} } d = child } rp.Advance() return d, nil } // Precondition: fs.renameMu must be locked. func (d *dentry) invalidate(ctx context.Context, vfsObj *vfs.VirtualFilesystem, ds **[]*dentry) { // Remove d from its parent. func() { parent := d.parent.Load() parent.opMu.RLock() defer parent.opMu.RUnlock() parent.childrenMu.Lock() defer parent.childrenMu.Unlock() if d.isSynthetic() { // Normally we don't mark invalidated dentries as deleted since // they may still exist (but at a different path), and also for // consistency with Linux. However, synthetic files are guaranteed // to become unreachable if their dentries are invalidated, so // treat their invalidation as deletion. 
d.deleteSynthetic(parent, ds) } // Since the opMu was just reacquired above, re-check that the // parent's child with this name is still the same. Do not touch it if // it has been replaced with a different one. if child := parent.children[d.name]; child == d { // Invalidate dentry so it gets reloaded next time it's accessed. delete(parent.children, d.name) } }() // Invalidate d and its descendants. toInvalidate := []*dentry{d} for len(toInvalidate) != 0 { d := toInvalidate[len(toInvalidate)-1] toInvalidate = toInvalidate[:len(toInvalidate)-1] // If the dentry is a mountpoint, InvalidateDentry may drop the // last reference on it, resulting in lock recursion. To avoid // this, take a dentry reference first, then drop it while // deferring the call to dentry.checkCachingLocked(). d.IncRef() rcs := vfsObj.InvalidateDentry(ctx, &d.vfsd) for _, rc := range rcs { rc.DecRef(ctx) } d.decRefNoCaching() // Re-evaluate its caching status (i.e. if it has 0 references, drop it). // The dentry will be reloaded next time it's accessed. *ds = appendDentry(*ds, d) if d.isDir() { toInvalidate = d.disownAllChildrenForInvalidation(ctx, vfsObj, toInvalidate, ds) } } } // +checklocks:parent.childrenMu func (d *dentry) deleteSynthetic(parent *dentry, ds **[]*dentry) { d.setDeleted() d.decRefNoCaching() *ds = appendDentry(*ds, d) parent.syntheticChildren-- parent.clearDirentsLocked() } // disownAllChildrenForInvalidation removes all child dentries from d, appends // them to children, and returns an updated slice. Consistent with // dentry.invalidate(), removed synthetic dentries are marked deleted. // // Precondition: fs.renameMu must be locked. func (d *dentry) disownAllChildrenForInvalidation(ctx context.Context, vfsObj *vfs.VirtualFilesystem, children []*dentry, ds **[]*dentry) []*dentry { d.opMu.RLock() defer d.opMu.RUnlock() d.childrenMu.Lock() defer d.childrenMu.Unlock() for name, child := range d.children { children = append(children, child) delete(d.children, name) if child.isSynthetic() { child.deleteSynthetic(d, ds) } } return children } // revalidateStatePool caches revalidateState instances to save array // allocations for dentries and names. var revalidateStatePool = sync.Pool{ New: func() any { return &revalidateState{} }, } // revalidateState keeps state related to a revalidation request. It keeps track // of {name, dentry} list being revalidated, as well as metadata locks on the // dentries. The list must be in ancestry order, in other words `n` must be // `n-1` child. type revalidateState struct { // start is the dentry where to start the revalidation of dentries. start *dentry // refreshStart indicates whether the attributes of the start dentry should // be refreshed. refreshStart bool // names is just a slice of names which can be used while making LISAFS RPCs. // This exists to avoid the cost of repeated string slice allocation to make // RPCs. names []string // dentries is the list of dentries that need to be revalidated. The first // dentry is a child of start and each successive dentry is a child of the // previous. dentries []*dentry } func makeRevalidateState(start *dentry, refreshStart bool) *revalidateState { r := revalidateStatePool.Get().(*revalidateState) r.start = start r.refreshStart = refreshStart return r } // release must be called after the caller is done with this object. It releases // all metadata locks and resources. func (r *revalidateState) release() { r.reset(nil /* start */, false /* refreshStart */) revalidateStatePool.Put(r) } // Preconditions: // - d != nil. 
// - d is a descendant of all dentries in r.dentries. func (r *revalidateState) add(d *dentry) { r.dentries = append(r.dentries, d) } // reset releases all metadata locks and resets all fields to allow this // instance to be reused. // +checklocksignore func (r *revalidateState) reset(start *dentry, refreshStart bool) { r.start = start r.refreshStart = refreshStart r.names = r.names[:0] r.dentries = r.dentries[:0] } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/save_restore.go000066400000000000000000000170021465435605700263740ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( goContext "context" "fmt" "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // +stateify savable type savedDentryRW struct { read bool write bool } // PrepareSave implements vfs.FilesystemImplSaveRestoreExtension.PrepareSave. func (fs *filesystem) PrepareSave(ctx context.Context) error { if len(fs.iopts.UniqueID.Path) == 0 { return fmt.Errorf("gofer.filesystem with no UniqueID cannot be saved") } // Purge cached dentries, which may not be reopenable after restore due to // permission changes. fs.renameMu.Lock() fs.evictAllCachedDentriesLocked(ctx) fs.renameMu.Unlock() // Buffer pipe data so that it's available for reading after restore. (This // is a legacy VFS1 feature.) fs.syncMu.Lock() for sffd := fs.specialFileFDs.Front(); sffd != nil; sffd = sffd.Next() { if sffd.dentry().fileType() == linux.S_IFIFO && sffd.vfsfd.IsReadable() { if err := sffd.savePipeData(ctx); err != nil { fs.syncMu.Unlock() return err } } } fs.syncMu.Unlock() // Flush local state to the remote filesystem. if err := fs.Sync(ctx); err != nil { return err } fs.savedDentryRW = make(map[*dentry]savedDentryRW) return fs.root.prepareSaveRecursive(ctx) } // Preconditions: // - fd represents a pipe. // - fd is readable. func (fd *specialFileFD) savePipeData(ctx context.Context) error { fd.bufMu.Lock() defer fd.bufMu.Unlock() var buf [hostarch.PageSize]byte for { n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:])), ^uint64(0)) if n != 0 { fd.buf = append(fd.buf, buf[:n]...) } if err != nil { if err == io.EOF || linuxerr.Equals(linuxerr.EAGAIN, err) { break } return err } } if len(fd.buf) != 0 { fd.haveBuf.Store(1) } return nil } func (d *dentry) prepareSaveRecursive(ctx context.Context) error { if d.isRegularFile() && !d.cachedMetadataAuthoritative() { // Get updated metadata for d in case we need to perform metadata // validation during restore. 
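// (Restore-time validation compares the saved attributes against the remote
// file, so the cache is refreshed now to avoid saving stale values.)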
if err := d.updateMetadata(ctx); err != nil { return err } } if d.isReadHandleOk() || d.isWriteHandleOk() { d.fs.savedDentryRW[d] = savedDentryRW{ read: d.isReadHandleOk(), write: d.isWriteHandleOk(), } } d.childrenMu.Lock() defer d.childrenMu.Unlock() for childName, child := range d.children { if child == nil { // Unsaved filesystem state may change across save/restore. Remove // negative entries from d.children to ensure that files created // after save are visible after restore. delete(d.children, childName) continue } if err := child.prepareSaveRecursive(ctx); err != nil { return err } } return nil } // beforeSave is invoked by stateify. func (d *dentry) beforeSave() { if d.vfsd.IsDead() { panic(fmt.Sprintf("gofer.dentry(%q).beforeSave: deleted and invalidated dentries can't be restored", genericDebugPathname(d))) } } // afterLoad is invoked by stateify. func (fs *filesystem) afterLoad(ctx goContext.Context) { fs.mf = pgalloc.MemoryFileFromContext(ctx) } // afterLoad is invoked by stateify. func (d *dentry) afterLoad(goContext.Context) { d.readFD = atomicbitops.FromInt32(-1) d.writeFD = atomicbitops.FromInt32(-1) d.mmapFD = atomicbitops.FromInt32(-1) if d.refs.Load() != -1 { refs.Register(d) } } // afterLoad is invoked by stateify. func (d *directfsDentry) afterLoad(goContext.Context) { d.controlFD = -1 } // afterLoad is invoked by stateify. func (d *dentryPlatformFile) afterLoad(goContext.Context) { if d.hostFileMapper.IsInited() { // Ensure that we don't call d.hostFileMapper.Init() again. d.hostFileMapperInitOnce.Do(func() {}) } } // afterLoad is invoked by stateify. func (fd *specialFileFD) afterLoad(goContext.Context) { fd.handle.fd = -1 if fd.hostFileMapper.IsInited() { // Ensure that we don't call fd.hostFileMapper.Init() again. fd.hostFileMapperInitOnce.Do(func() {}) } } // saveParent is called by stateify. func (d *dentry) saveParent() *dentry { return d.parent.Load() } // loadParent is called by stateify. func (d *dentry) loadParent(_ goContext.Context, parent *dentry) { d.parent.Store(parent) } // CompleteRestore implements // vfs.FilesystemImplSaveRestoreExtension.CompleteRestore. func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRestoreOptions) error { fdmap := vfs.RestoreFilesystemFDMapFromContext(ctx) if fdmap == nil { return fmt.Errorf("no server FD map available") } fd, ok := fdmap[fs.iopts.UniqueID] if !ok { return fmt.Errorf("no server FD available for filesystem with unique ID %+v, map: %v", fs.iopts.UniqueID, fdmap) } fs.opts.fd = fd fs.inoByKey = make(map[inoKey]uint64) if err := fs.restoreRoot(ctx, &opts); err != nil { return err } // Restore remaining dentries. if err := fs.root.restoreDescendantsRecursive(ctx, &opts); err != nil { return err } // Re-open handles for specialFileFDs. Unlike the initial open // (dentry.openSpecialFile()), pipes are always opened without blocking; // non-readable pipe FDs are opened last to ensure that they don't get // ENXIO if another specialFileFD represents the read end of the same pipe. // This is consistent with VFS1. 
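// The loop below therefore makes two passes: the first re-opens everything
// except write-only pipe FDs, and a second pass re-opens those only if any
// were seen.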
haveWriteOnlyPipes := false for fd := fs.specialFileFDs.Front(); fd != nil; fd = fd.Next() { if fd.dentry().fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() { haveWriteOnlyPipes = true continue } if err := fd.completeRestore(ctx); err != nil { return err } } if haveWriteOnlyPipes { for fd := fs.specialFileFDs.Front(); fd != nil; fd = fd.Next() { if fd.dentry().fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() { if err := fd.completeRestore(ctx); err != nil { return err } } } } // Discard state only required during restore. fs.savedDentryRW = nil return nil } // Preconditions: d is not synthetic. func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error { d.childrenMu.Lock() defer d.childrenMu.Unlock() for _, child := range d.children { if child == nil { continue } if child.isSynthetic() { continue } if err := child.restoreFile(ctx, opts); err != nil { return err } if err := child.restoreDescendantsRecursive(ctx, opts); err != nil { return err } } return nil } func (fd *specialFileFD) completeRestore(ctx context.Context) error { d := fd.dentry() h, err := d.openHandle(ctx, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */) if err != nil { return err } fd.handle = h ftype := d.fileType() fd.haveQueue = (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && fd.handle.fd >= 0 if fd.haveQueue { if err := fdnotifier.AddFD(fd.handle.fd, &fd.queue); err != nil { return err } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/socket.go000066400000000000000000000070471465435605700251730ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" ) func (d *dentry) isSocket() bool { return d.fileType() == linux.S_IFSOCK } func isSocketTypeSupported(sockType linux.SockType) bool { switch sockType { case unix.SOCK_STREAM, unix.SOCK_DGRAM, unix.SOCK_SEQPACKET: return true default: return false } } // endpoint is a Gofer-backed transport.BoundEndpoint. // // An endpoint's lifetime is the time between when filesystem.BoundEndpointAt() // is called and either BoundEndpoint.BidirectionalConnect or // BoundEndpoint.UnidirectionalConnect is called. // // +stateify savable type endpoint struct { // dentry is the filesystem dentry which produced this endpoint. dentry is // not synthetic. dentry *dentry // path is the sentry path where this endpoint is bound. path string } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error { // No lock ordering required as only the ConnectingEndpoint has a mutex. 
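// ce is held locked across the state checks and the creation of the
// connected endpoint below, and is unlocked before c.Init() is called.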
ce.Lock() // Check connecting state. if ce.Connected() { ce.Unlock() return syserr.ErrAlreadyConnected } if ce.ListeningLocked() { ce.Unlock() return syserr.ErrInvalidEndpointState } c, err := e.newConnectedEndpoint(ctx, ce.Type(), ce.WaiterQueue()) if err != nil { ce.Unlock() return err } returnConnect(c, c) ce.Unlock() if err := c.Init(); err != nil { return syserr.FromError(err) } return nil } // UnidirectionalConnect implements // transport.BoundEndpoint.UnidirectionalConnect. func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.ConnectedEndpoint, *syserr.Error) { c, err := e.newConnectedEndpoint(ctx, linux.SOCK_DGRAM, &waiter.Queue{}) if err != nil { return nil, err } if err := c.Init(); err != nil { return nil, syserr.FromError(err) } // We don't need the receiver. c.CloseRecv() c.Release(ctx) return c, nil } func (e *endpoint) newConnectedEndpoint(ctx context.Context, sockType linux.SockType, queue *waiter.Queue) (*transport.SCMConnectedEndpoint, *syserr.Error) { e.dentry.fs.renameMu.RLock() hostSockFD, err := e.dentry.connect(ctx, sockType) e.dentry.fs.renameMu.RUnlock() if err != nil { return nil, syserr.ErrConnectionRefused } c, serr := transport.NewSCMEndpoint(hostSockFD, queue, e.path) if serr != nil { unix.Close(hostSockFD) log.Warningf("NewSCMEndpoint failed: path=%q, err=%v", e.path, serr) return nil, serr } return c, nil } // Release implements transport.BoundEndpoint.Release. func (e *endpoint) Release(ctx context.Context) { e.dentry.DecRef(ctx) } // Passcred implements transport.BoundEndpoint.Passcred. func (e *endpoint) Passcred() bool { return false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/special_fd_list.go000066400000000000000000000123621465435605700270230ustar00rootroot00000000000000package gofer // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type specialFDElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (specialFDElementMapper) linkerFor(elem *specialFileFD) *specialFileFD { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type specialFDList struct { head *specialFileFD tail *specialFileFD } // Reset resets list l to the empty state. func (l *specialFDList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *specialFDList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *specialFDList) Front() *specialFileFD { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *specialFDList) Back() *specialFileFD { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. 
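//
// A usage sketch based on this package (see special_file.go and
// save_restore.go): the filesystem keeps every open specialFileFD on such a
// list, roughly
//
//	fs.specialFileFDs.PushBack(fd)  // newSpecialFileFD
//	fs.specialFileFDs.Remove(fd)    // specialFileFD.Release
//	for fd := fs.specialFileFDs.Front(); fd != nil; fd = fd.Next() {
//		// e.g. buffer pipe data before save, or re-open handles after restore
//	}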
// //go:nosplit func (l *specialFDList) Len() (count int) { for e := l.Front(); e != nil; e = (specialFDElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *specialFDList) PushFront(e *specialFileFD) { linker := specialFDElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { specialFDElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *specialFDList) PushFrontList(m *specialFDList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { specialFDElementMapper{}.linkerFor(l.head).SetPrev(m.tail) specialFDElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *specialFDList) PushBack(e *specialFileFD) { linker := specialFDElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { specialFDElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *specialFDList) PushBackList(m *specialFDList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { specialFDElementMapper{}.linkerFor(l.tail).SetNext(m.head) specialFDElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *specialFDList) InsertAfter(b, e *specialFileFD) { bLinker := specialFDElementMapper{}.linkerFor(b) eLinker := specialFDElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { specialFDElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *specialFDList) InsertBefore(a, e *specialFileFD) { aLinker := specialFDElementMapper{}.linkerFor(a) eLinker := specialFDElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { specialFDElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *specialFDList) Remove(e *specialFileFD) { linker := specialFDElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { specialFDElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { specialFDElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type specialFDEntry struct { next *specialFileFD prev *specialFileFD } // Next returns the entry that follows e in the list. // //go:nosplit func (e *specialFDEntry) Next() *specialFileFD { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *specialFDEntry) Prev() *specialFileFD { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *specialFDEntry) SetNext(elem *specialFileFD) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. 
// //go:nosplit func (e *specialFDEntry) SetPrev(elem *specialFileFD) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/special_file.go000066400000000000000000000402321465435605700263130ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device // special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is // in effect) regular files. specialFileFD differs from regularFileFD by using // per-FD handles instead of shared per-dentry handles, and never buffering I/O. // // +stateify savable type specialFileFD struct { fileDescription specialFDEntry memmap.NoBufferedIOFallback // releaseMu synchronizes the closing of fd.handle with fd.sync(). It's safe // to access fd.handle without locking for operations that require a ref to // be held by the caller, e.g. vfs.FileDescriptionImpl implementations. releaseMu sync.RWMutex `state:"nosave"` // handle is used for file I/O. handle is immutable. handle handle `state:"nosave"` // isRegularFile is true if this FD represents a regular file which is only // possible when filesystemOptions.regularFilesUseSpecialFileFD is in // effect. isRegularFile is immutable. isRegularFile bool // seekable is true if this file description represents a file for which // file offset is significant, i.e. a regular file, character device or // block device. seekable is immutable. seekable bool // haveQueue is true if this file description represents a file for which // queue may send I/O readiness events. haveQueue is immutable. haveQueue bool `state:"nosave"` queue waiter.Queue // If seekable is true, off is the file offset. off is protected by mu. mu sync.Mutex `state:"nosave"` off int64 // If haveBuf is non-zero, this FD represents a pipe, and buf contains data // read from the pipe from previous calls to specialFileFD.savePipeData(). // haveBuf and buf are protected by bufMu. bufMu sync.Mutex `state:"nosave"` haveBuf atomicbitops.Uint32 buf []byte // If handle.fd >= 0, hostFileMapper caches mappings of handle.fd, and // hostFileMapperInitOnce is used to initialize it on first use. hostFileMapperInitOnce sync.Once `state:"nosave"` hostFileMapper fsutil.HostFileMapper // If handle.fd >= 0, fileRefs counts references on memmap.File offsets. // fileRefs is protected by fileRefsMu. 
fileRefsMu sync.Mutex `state:"nosave"` fileRefs fsutil.FrameRefSet } func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { ftype := d.fileType() seekable := ftype == linux.S_IFREG || ftype == linux.S_IFCHR || ftype == linux.S_IFBLK haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK || ftype == linux.S_IFCHR) && h.fd >= 0 fd := &specialFileFD{ handle: h, isRegularFile: ftype == linux.S_IFREG, seekable: seekable, haveQueue: haveQueue, } fd.LockFD.Init(&d.locks) if haveQueue { if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil { return nil, err } } if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, DenyPRead: !seekable, DenyPWrite: !seekable, }); err != nil { if haveQueue { fdnotifier.RemoveFD(h.fd) } return nil, err } d.fs.syncMu.Lock() d.fs.specialFileFDs.PushBack(fd) d.fs.syncMu.Unlock() if fd.vfsfd.IsWritable() && (d.mode.Load()&0111 != 0) { metric.SuspiciousOperationsMetric.Increment(&metric.SuspiciousOperationsTypeOpenedWriteExecuteFile) } if h.fd >= 0 { fsmetric.GoferOpensHost.Increment() } else { fsmetric.GoferOpens9P.Increment() } return fd, nil } // Release implements vfs.FileDescriptionImpl.Release. func (fd *specialFileFD) Release(ctx context.Context) { if fd.haveQueue { fdnotifier.RemoveFD(fd.handle.fd) } fd.releaseMu.Lock() fd.handle.close(ctx) fd.releaseMu.Unlock() fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) fs.syncMu.Lock() fs.specialFileFDs.Remove(fd) fs.syncMu.Unlock() } // OnClose implements vfs.FileDescriptionImpl.OnClose. func (fd *specialFileFD) OnClose(ctx context.Context) error { if !fd.vfsfd.IsWritable() { return nil } return flush(ctx, fd.handle.fdLisa) } // Readiness implements waiter.Waitable.Readiness. func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { if fd.haveQueue { return fdnotifier.NonBlockingPoll(fd.handle.fd, mask) } return fd.fileDescription.Readiness(mask) } // EventRegister implements waiter.Waitable.EventRegister. func (fd *specialFileFD) EventRegister(e *waiter.Entry) error { if fd.haveQueue { fd.queue.EventRegister(e) if err := fdnotifier.UpdateFD(fd.handle.fd); err != nil { fd.queue.EventUnregister(e) return err } return nil } return fd.fileDescription.EventRegister(e) } // EventUnregister implements waiter.Waitable.EventUnregister. func (fd *specialFileFD) EventUnregister(e *waiter.Entry) { if fd.haveQueue { fd.queue.EventUnregister(e) if err := fdnotifier.UpdateFD(fd.handle.fd); err != nil { panic(fmt.Sprint("UpdateFD:", err)) } return } fd.fileDescription.EventUnregister(e) } // Epollable implements FileDescriptionImpl.Epollable. func (fd *specialFileFD) Epollable() bool { if fd.haveQueue { return true } return fd.fileDescription.Epollable() } func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { if fd.isRegularFile { d := fd.dentry() return d.doAllocate(ctx, offset, length, func() error { return fd.handle.allocate(ctx, mode, offset, length) }) } return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) } // PRead implements vfs.FileDescriptionImpl.PRead. 
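//
// If pipe data was buffered before a checkpoint (fd.buf), PRead drains that
// buffer first and then reads the remainder through the per-FD handle. A
// negative offset is used by Read for non-seekable files.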
func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { start := fsmetric.StartReadWait() defer func() { if fd.handle.fd >= 0 { fsmetric.GoferReadsHost.Increment() fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start) } else { fsmetric.GoferReads9P.Increment() fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start) } }() if fd.seekable && offset < 0 { return 0, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, linuxerr.EOPNOTSUPP } if d := fd.dentry(); d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } bufN := int64(0) if fd.haveBuf.Load() != 0 { var err error fd.bufMu.Lock() if len(fd.buf) != 0 { var n int n, err = dst.CopyOut(ctx, fd.buf) dst = dst.DropFirst(n) fd.buf = fd.buf[n:] if len(fd.buf) == 0 { fd.haveBuf.Store(0) fd.buf = nil } bufN = int64(n) if offset >= 0 { offset += bufN } } fd.bufMu.Unlock() if err != nil { return bufN, err } } rw := getHandleReadWriter(ctx, &fd.handle, offset) n, err := dst.CopyOutFrom(ctx, rw) putHandleReadWriter(rw) if linuxerr.Equals(linuxerr.EAGAIN, err) { err = linuxerr.ErrWouldBlock } return bufN + n, err } // Read implements vfs.FileDescriptionImpl.Read. func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { if !fd.seekable { return fd.PRead(ctx, dst, -1, opts) } fd.mu.Lock() n, err := fd.PRead(ctx, dst, fd.off, opts) fd.off += n fd.mu.Unlock() return n, err } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, _, err := fd.pwrite(ctx, src, offset, opts) return n, err } // pwrite returns the number of bytes written, final offset, error. The final // offset should be ignored by PWrite. func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if fd.seekable && offset < 0 { return 0, offset, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, offset, linuxerr.EOPNOTSUPP } d := fd.dentry() if fd.isRegularFile { // If the regular file fd was opened with O_APPEND, make sure the file // size is updated. There is a possible race here if size is modified // externally after metadata cache is updated. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { if err := d.updateMetadata(ctx); err != nil { return 0, offset, err } } // We need to hold the metadataMu *while* writing to a regular file. d.metadataMu.Lock() defer d.metadataMu.Unlock() // Set offset to file size if the regular file was opened with O_APPEND. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { // Holding d.metadataMu is sufficient for reading d.size. offset = int64(d.size.RacyLoad()) } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { return 0, offset, err } src = src.TakeFirst64(limit) } if d.cachedMetadataAuthoritative() { if fd.isRegularFile { d.touchCMtimeLocked() } else { d.touchCMtime() } } // handleReadWriter always writes to the remote file. So O_DIRECT is // effectively always set. Invalidate pages in d.mappings that have been // written to. 
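// For example, with a 4 KiB page size, a 50-byte write at offset 100
// invalidates the mappable range [0, 4096): pgstart rounds 100 down to 0 and
// pgend rounds 150 up to 4096.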
pgstart := hostarch.PageRoundDown(uint64(offset)) pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes())) if !ok { return 0, offset, linuxerr.EINVAL } mr := memmap.MappableRange{pgstart, pgend} d.mapsMu.Lock() d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) d.mapsMu.Unlock() rw := getHandleReadWriter(ctx, &fd.handle, offset) n, err := src.CopyInTo(ctx, rw) putHandleReadWriter(rw) if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { // Note that if syncing the remote file fails, then we can't guarantee that // any data was actually written with the semantics of O_DSYNC or // O_SYNC, so we return zero bytes written. Compare Linux's // mm/filemap.c:generic_file_write_iter() => // include/linux/fs.h:generic_write_sync(). if err := fd.sync(ctx, false /* forFilesystemSync */); err != nil { return 0, offset, err } } if linuxerr.Equals(linuxerr.EAGAIN, err) { err = linuxerr.ErrWouldBlock } // Update offset if the offset is valid. if offset >= 0 { offset += n } // Update file size for regular files. if fd.isRegularFile { // d.metadataMu is already locked at this point. if uint64(offset) > d.size.RacyLoad() { d.dataMu.Lock() defer d.dataMu.Unlock() d.size.Store(uint64(offset)) } } return int64(n), offset, err } // Write implements vfs.FileDescriptionImpl.Write. func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { if !fd.seekable { return fd.PWrite(ctx, src, -1, opts) } fd.mu.Lock() n, off, err := fd.pwrite(ctx, src, fd.off, opts) fd.off = off fd.mu.Unlock() return n, err } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { if !fd.seekable { return 0, linuxerr.ESPIPE } fd.mu.Lock() defer fd.mu.Unlock() newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence) if err != nil { return 0, err } fd.off = newOffset return newOffset, nil } // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *specialFileFD) Sync(ctx context.Context) error { return fd.sync(ctx, false /* forFilesystemSync */) } func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error { // Locks to ensure it didn't race with fd.Release(). fd.releaseMu.RLock() defer fd.releaseMu.RUnlock() if err := fd.handle.sync(ctx); err != nil { if !forFilesystemSync { return err } // Only return err if we can reasonably have expected sync to succeed // (fd represents a regular file that was opened for writing). if fd.isRegularFile && fd.vfsfd.IsWritable() { return err } ctx.Debugf("gofer.specialFileFD.sync: syncing non-writable or non-regular-file FD failed: %v", err) } return nil } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *specialFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { if fd.handle.fd < 0 || fd.filesystem().opts.forcePageCache { return linuxerr.ENODEV } // After this point, fd may be used as a memmap.Mappable and memmap.File. fd.hostFileMapperInitOnce.Do(fd.hostFileMapper.Init) return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) } // AddMapping implements memmap.Mappable.AddMapping. 
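//
// The mapping is recorded in the dentry's mapping set, and a reference is
// taken on the mapped range of the host FD via hostFileMapper.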
func (fd *specialFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { d := fd.dentry() d.mapsMu.Lock() defer d.mapsMu.Unlock() d.mappings.AddMapping(ms, ar, offset, writable) fd.hostFileMapper.IncRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (fd *specialFileFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { d := fd.dentry() d.mapsMu.Lock() defer d.mapsMu.Unlock() d.mappings.RemoveMapping(ms, ar, offset, writable) fd.hostFileMapper.DecRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) } // CopyMapping implements memmap.Mappable.CopyMapping. func (fd *specialFileFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return fd.AddMapping(ctx, ms, dstAR, offset, writable) } // Translate implements memmap.Mappable.Translate. func (fd *specialFileFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { mr := optional if fd.filesystem().opts.limitHostFDTranslation { mr = maxFillRange(required, optional) } return []memmap.Translation{ { Source: mr, File: fd, Offset: mr.Start, Perms: hostarch.AnyAccess, }, }, nil } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (fd *specialFileFD) InvalidateUnsavable(ctx context.Context) error { return nil } // IncRef implements memmap.File.IncRef. func (fd *specialFileFD) IncRef(fr memmap.FileRange, memCgID uint32) { fd.fileRefsMu.Lock() defer fd.fileRefsMu.Unlock() fd.fileRefs.IncRefAndAccount(fr, memCgID) } // DecRef implements memmap.File.DecRef. func (fd *specialFileFD) DecRef(fr memmap.FileRange) { fd.fileRefsMu.Lock() defer fd.fileRefsMu.Unlock() fd.fileRefs.DecRefAndAccount(fr) } // MapInternal implements memmap.File.MapInternal. func (fd *specialFileFD) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { fd.requireHostFD() return fd.hostFileMapper.MapInternal(fr, int(fd.handle.fd), at.Write) } // FD implements memmap.File.FD. func (fd *specialFileFD) FD() int { fd.requireHostFD() return int(fd.handle.fd) } func (fd *specialFileFD) requireHostFD() { if fd.handle.fd < 0 { // This is possible if fd was successfully mmapped before saving, then // was restored without a host FD. This is unrecoverable: without a // host FD, we can't mmap this file post-restore. panic("gofer.specialFileFD can no longer be memory-mapped without a host FD") } } func (fd *specialFileFD) updateMetadata(ctx context.Context) error { d := fd.dentry() d.metadataMu.Lock() defer d.metadataMu.Unlock() return d.updateMetadataLocked(ctx, fd.handle) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/string_list.go000066400000000000000000000122131465435605700262330ustar00rootroot00000000000000package gofer // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type stringElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. 
// //go:nosplit func (stringElementMapper) linkerFor(elem *stringListElem) *stringListElem { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type stringList struct { head *stringListElem tail *stringListElem } // Reset resets list l to the empty state. func (l *stringList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *stringList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *stringList) Front() *stringListElem { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *stringList) Back() *stringListElem { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *stringList) Len() (count int) { for e := l.Front(); e != nil; e = (stringElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *stringList) PushFront(e *stringListElem) { linker := stringElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { stringElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *stringList) PushFrontList(m *stringList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { stringElementMapper{}.linkerFor(l.head).SetPrev(m.tail) stringElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *stringList) PushBack(e *stringListElem) { linker := stringElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { stringElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *stringList) PushBackList(m *stringList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { stringElementMapper{}.linkerFor(l.tail).SetNext(m.head) stringElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *stringList) InsertAfter(b, e *stringListElem) { bLinker := stringElementMapper{}.linkerFor(b) eLinker := stringElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { stringElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *stringList) InsertBefore(a, e *stringListElem) { aLinker := stringElementMapper{}.linkerFor(a) eLinker := stringElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { stringElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. 
// //go:nosplit func (l *stringList) Remove(e *stringListElem) { linker := stringElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { stringElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { stringElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type stringEntry struct { next *stringListElem prev *stringListElem } // Next returns the entry that follows e in the list. // //go:nosplit func (e *stringEntry) Next() *stringListElem { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *stringEntry) Prev() *stringListElem { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *stringEntry) SetNext(elem *stringListElem) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *stringEntry) SetPrev(elem *stringListElem) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/symlink.go000066400000000000000000000024701465435605700253640ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/vfs" ) func (d *dentry) isSymlink() bool { return d.fileType() == linux.S_IFLNK } // Precondition: d.isSymlink(). func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if d.fs.opts.interop != InteropModeShared { d.touchAtime(mnt) d.dataMu.Lock() if d.haveTarget { target := d.target d.dataMu.Unlock() return target, nil } } target, err := d.readlinkImpl(ctx) if d.fs.opts.interop != InteropModeShared { if err == nil { d.haveTarget = true d.target = target } d.dataMu.Unlock() // +checklocksforce: guaranteed locked from above. } return target, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/gofer/time.go000066400000000000000000000047501465435605700246370ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package gofer import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" ) func dentryTimestamp(t linux.StatxTimestamp) int64 { return t.ToNsec() } func dentryTimestampFromUnix(t unix.Timespec) int64 { return dentryTimestamp(linux.StatxTimestamp{Sec: t.Sec, Nsec: uint32(t.Nsec)}) } // Preconditions: d.cachedMetadataAuthoritative() == true. func (d *dentry) touchAtime(mnt *vfs.Mount) { if opts := mnt.Options(); opts.Flags.NoATime || opts.ReadOnly { return } if err := mnt.CheckBeginWrite(); err != nil { return } now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() d.atime.Store(now) d.atimeDirty.Store(1) d.metadataMu.Unlock() mnt.EndWrite() } // Preconditions: d.metadataMu is locked. d.cachedMetadataAuthoritative() == true. func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) { if opts := mnt.Options(); opts.Flags.NoATime || opts.ReadOnly { return } if err := mnt.CheckBeginWrite(); err != nil { return } now := d.fs.clock.Now().Nanoseconds() d.atime.Store(now) d.atimeDirty.Store(1) mnt.EndWrite() } // Preconditions: // - d.cachedMetadataAuthoritative() == true. // - The caller has successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCtime() { now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() d.ctime.Store(now) d.metadataMu.Unlock() } // Preconditions: // - d.cachedMetadataAuthoritative() == true. // - The caller has successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCMtime() { now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() d.mtime.Store(now) d.ctime.Store(now) d.mtimeDirty.Store(1) d.metadataMu.Unlock() } // Preconditions: // - d.cachedMetadataAuthoritative() == true. // - The caller has locked d.metadataMu. func (d *dentry) touchCMtimeLocked() { now := d.fs.clock.Now().Nanoseconds() d.mtime.Store(now) d.ctime.Store(now) d.mtimeDirty.Store(1) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/host/000077500000000000000000000000001465435605700232175ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/host/host.go000066400000000000000000000754541465435605700245420ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package host provides a filesystem implementation for host files imported as // file descriptors. 
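//
// Each imported FD (for example stdio or a TTY donated to the sentry) is
// wrapped in an inode that forwards I/O, metadata, and readiness polling to
// the host FD.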
package host import ( "fmt" "math" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // These are the modes that are stored with virtualOwner. const virtualOwnerModes = linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID // +stateify savable type virtualOwner struct { // This field is initialized at creation time and is immutable. enabled bool // mu protects the fields below and they can be accessed using atomic memory // operations. mu sync.Mutex `state:"nosave"` uid atomicbitops.Uint32 gid atomicbitops.Uint32 // mode is also stored, otherwise setting the host file to `0000` could remove // access to the file. mode atomicbitops.Uint32 } func (v *virtualOwner) atomicUID() uint32 { return v.uid.Load() } func (v *virtualOwner) atomicGID() uint32 { return v.gid.Load() } func (v *virtualOwner) atomicMode() uint32 { return v.mode.Load() } func isEpollable(fd int) bool { epollfd, err := unix.EpollCreate1(0) if err != nil { // This shouldn't happen. If it does, just say file doesn't support epoll. return false } defer unix.Close(epollfd) event := unix.EpollEvent{ Fd: int32(fd), Events: unix.EPOLLIN, } err = unix.EpollCtl(epollfd, unix.EPOLL_CTL_ADD, fd, &event) return err == nil } // inode implements kernfs.Inode. // // +stateify savable type inode struct { kernfs.CachedMappable kernfs.InodeNoStatFS kernfs.InodeAnonymous // inode is effectively anonymous because it represents a donated FD. kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. kernfs.InodeWatches locks vfs.FileLocks // When the reference count reaches zero, the host fd is closed. inodeRefs // hostFD contains the host fd that this file was originally created from. // Upon restore, it must be remapped using restoreKey and vfs.CtxRestoreFilesystemFDMap // from the restore context. // // This field is initialized at creation time and is immutable. hostFD int `state:"nosave"` // restoreKey is used to identify the `hostFD` after a restore is performed. restoreKey vfs.RestoreID // ino is an inode number unique within this filesystem. // // This field is initialized at creation time and is immutable. ino uint64 // ftype is the file's type (a linux.S_IFMT mask). // // This field is initialized at creation time and is immutable. ftype uint16 // epollable indicates whether the hostFD can be used with epoll_ctl(2). This // also indicates that hostFD has been set to non-blocking. // // This field is initialized at creation time and is immutable. epollable bool // seekable is false if lseek(hostFD) returns ESPIPE. We assume that file // offsets are meaningful iff seekable is true. // // This field is initialized at creation time and is immutable. seekable bool // isTTY is true if this file represents a TTY. 
// // This field is initialized at creation time and is immutable. isTTY bool // savable is true if hostFD may be saved/restored by its numeric value. // // This field is initialized at creation time and is immutable. savable bool // readonly is true if operations that can potentially change the host file // are blocked. // // This field is initialized at creation time and is immutable. readonly bool // Event queue for blocking operations. queue waiter.Queue // virtualOwner caches ownership and permission information to override the // underlying file owner and permission. This is used to allow the unstrusted // application to change these fields without affecting the host. virtualOwner virtualOwner // If haveBuf is non-zero, hostFD represents a pipe, and buf contains data // read from the pipe from previous calls to inode.beforeSave(). haveBuf // and buf are protected by bufMu. bufMu sync.Mutex `state:"nosave"` haveBuf atomicbitops.Uint32 buf []byte } func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, restoreKey vfs.RestoreID, fileType linux.FileMode, isTTY bool, readonly bool) (*inode, error) { // Determine if hostFD is seekable. _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) seekable := !linuxerr.Equals(linuxerr.ESPIPE, err) // We expect regular files to be seekable, as this is required for them to // be memory-mappable. if !seekable && fileType == unix.S_IFREG { ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD) return nil, linuxerr.ESPIPE } i := &inode{ hostFD: hostFD, ino: fs.NextIno(), ftype: uint16(fileType), epollable: isEpollable(hostFD), seekable: seekable, isTTY: isTTY, savable: savable, restoreKey: restoreKey, readonly: readonly, } i.InitRefs() i.CachedMappable.Init(hostFD) // If the hostFD can return EWOULDBLOCK when set to non-blocking, do so and // handle blocking behavior in the sentry. if i.epollable { if err := unix.SetNonblock(i.hostFD, true); err != nil { return nil, err } if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { return nil, err } } return i, nil } // NewFDOptions contains options to NewFD. type NewFDOptions struct { // If Savable is true, the host file descriptor may be saved/restored by // numeric value. RestoreKey is used to map the FD after restore. Savable bool // RestoreKey is only used when Savable==true. It uniquely identifies the // host FD so that a mapping to the corresponding FD can be provided during // restore. RestoreKey vfs.RestoreID // If IsTTY is true, the file descriptor is a TTY. IsTTY bool // If HaveFlags is true, use Flags for the new file description. Otherwise, // the new file description will inherit flags from hostFD. HaveFlags bool Flags uint32 // VirtualOwner allow the host file to have owner and permissions different // than the underlying host file. VirtualOwner bool UID auth.KUID GID auth.KGID // If Readonly is true, we disallow operations that can potentially change // the host file associated with the file descriptor. Readonly bool } // NewFD returns a vfs.FileDescription representing the given host file // descriptor. mnt must be Kernel.HostMount(). func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) { fs, ok := mnt.Filesystem().Impl().(*filesystem) if !ok { return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) } if opts.Readonly { if opts.IsTTY { // This is not a technical limitation, but access checks for TTYs // have not been implemented yet. 
return nil, fmt.Errorf("readonly file descriptor may currently not be a TTY") } flagsInt, err := unix.FcntlInt(uintptr(hostFD), unix.F_GETFL, 0) if err != nil { return nil, err } accessMode := uint32(flagsInt) & unix.O_ACCMODE if accessMode != unix.O_RDONLY { return nil, fmt.Errorf("readonly file descriptor may only be opened as O_RDONLY on the host") } } // Retrieve metadata. var stat unix.Stat_t if err := unix.Fstat(hostFD, &stat); err != nil { return nil, err } flags := opts.Flags if !opts.HaveFlags { // Get flags for the imported FD. flagsInt, err := unix.FcntlInt(uintptr(hostFD), unix.F_GETFL, 0) if err != nil { return nil, err } flags = uint32(flagsInt) } fileType := linux.FileMode(stat.Mode).FileType() i, err := newInode(ctx, fs, hostFD, opts.Savable, opts.RestoreKey, fileType, opts.IsTTY, opts.Readonly) if err != nil { return nil, err } if opts.VirtualOwner { i.virtualOwner.enabled = true i.virtualOwner.uid = atomicbitops.FromUint32(uint32(opts.UID)) i.virtualOwner.gid = atomicbitops.FromUint32(uint32(opts.GID)) i.virtualOwner.mode = atomicbitops.FromUint32(stat.Mode) } d := &kernfs.Dentry{} d.Init(&fs.Filesystem, i) // i.open will take a reference on d. defer d.DecRef(ctx) // For simplicity, fileDescription.offset is set to 0. Technically, we // should only set to 0 on files that are not seekable (sockets, pipes, // etc.), and use the offset from the host fd otherwise when importing. return i.open(ctx, d, mnt, fileType, flags) } // filesystemType implements vfs.FilesystemType. // // +stateify savable type filesystemType struct{} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("host.filesystemType.GetFilesystem should never be called") } // Name implements vfs.FilesystemType.Name. func (filesystemType) Name() string { return "none" } // Release implements vfs.FilesystemType.Release. func (filesystemType) Release(ctx context.Context) {} // NewFilesystem sets up and returns a new hostfs filesystem. // // Note that there should only ever be one instance of host.filesystem, // a global mount for host fds. func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, err } fs := &filesystem{ devMinor: devMinor, } fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) return fs.VFSFilesystem(), nil } // filesystem implements vfs.FilesystemImpl. // // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 } func (fs *filesystem) Release(ctx context.Context) { fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { d := vd.Dentry().Impl().(*kernfs.Dentry) inode := d.Inode().(*inode) b.PrependComponent(fmt.Sprintf("host:[%d]", inode.ino)) return vfs.PrependPathSyntheticError{} } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return "" } // CheckPermissions implements kernfs.Inode.CheckPermissions. 
func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { var s unix.Stat_t if err := i.stat(&s); err != nil { return err } return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) } // Mode implements kernfs.Inode.Mode. func (i *inode) Mode() linux.FileMode { var s unix.Stat_t if err := i.stat(&s); err != nil { // Retrieving the mode from the host fd using fstat(2) should not fail. // If the syscall does not succeed, something is fundamentally wrong. panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err)) } return linux.FileMode(s.Mode) } // Mode implements kernfs.Inode.UID func (i *inode) UID() auth.KUID { return auth.KUID(i.virtualOwner.uid.Load()) } // Mode implements kernfs.Inode.GID func (i *inode) GID() auth.KGID { return auth.KGID(i.virtualOwner.gid.Load()) } // Stat implements kernfs.Inode.Stat. func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { if opts.Mask&linux.STATX__RESERVED != 0 { return linux.Statx{}, linuxerr.EINVAL } if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { return linux.Statx{}, linuxerr.EINVAL } fs := vfsfs.Impl().(*filesystem) // Limit our host call only to known flags. mask := opts.Mask & linux.STATX_ALL var s unix.Statx_t err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) if linuxerr.Equals(linuxerr.ENOSYS, err) { // Fallback to fstat(2), if statx(2) is not supported on the host. // // TODO(b/151263641): Remove fallback. return i.statxFromStat(fs) } if err != nil { return linux.Statx{}, err } // Unconditionally fill blksize, attributes, and device numbers, as // indicated by /include/uapi/linux/stat.h. Inode number is always // available, since we use our own rather than the host's. ls := linux.Statx{ Mask: linux.STATX_INO, Blksize: s.Blksize, Attributes: s.Attributes, Ino: i.ino, AttributesMask: s.Attributes_mask, DevMajor: linux.UNNAMED_MAJOR, DevMinor: fs.devMinor, } // Copy other fields that were returned by the host. RdevMajor/RdevMinor // are never copied (and therefore left as zero), so as not to expose host // device numbers. ls.Mask |= s.Mask & linux.STATX_ALL if s.Mask&linux.STATX_TYPE != 0 { if i.virtualOwner.enabled { ls.Mode |= uint16(i.virtualOwner.atomicMode()) & linux.S_IFMT } else { ls.Mode |= s.Mode & linux.S_IFMT } } if s.Mask&linux.STATX_MODE != 0 { if i.virtualOwner.enabled { ls.Mode |= uint16(i.virtualOwner.atomicMode()) &^ linux.S_IFMT } else { ls.Mode |= s.Mode &^ linux.S_IFMT } } if s.Mask&linux.STATX_NLINK != 0 { ls.Nlink = s.Nlink } if s.Mask&linux.STATX_UID != 0 { if i.virtualOwner.enabled { ls.UID = i.virtualOwner.atomicUID() } else { ls.UID = s.Uid } } if s.Mask&linux.STATX_GID != 0 { if i.virtualOwner.enabled { ls.GID = i.virtualOwner.atomicGID() } else { ls.GID = s.Gid } } if s.Mask&linux.STATX_ATIME != 0 { ls.Atime = unixToLinuxStatxTimestamp(s.Atime) } if s.Mask&linux.STATX_BTIME != 0 { ls.Btime = unixToLinuxStatxTimestamp(s.Btime) } if s.Mask&linux.STATX_CTIME != 0 { ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) } if s.Mask&linux.STATX_MTIME != 0 { ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) } if s.Mask&linux.STATX_SIZE != 0 { ls.Size = s.Size } if s.Mask&linux.STATX_BLOCKS != 0 { ls.Blocks = s.Blocks } return ls, nil } // statxFromStat is a best-effort fallback for inode.Stat() if the host does not // support statx(2). 
// // We ignore the mask and sync flags in opts and simply supply // STATX_BASIC_STATS, as fstat(2) itself does not allow the specification // of a mask or sync flags. fstat(2) does not provide any metadata // equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so // those fields remain empty. func (i *inode) statxFromStat(fs *filesystem) (linux.Statx, error) { var s unix.Stat_t if err := i.stat(&s); err != nil { return linux.Statx{}, err } // As with inode.Stat(), we always use internal device and inode numbers, // and never expose the host's represented device numbers. return linux.Statx{ Mask: linux.STATX_BASIC_STATS, Blksize: uint32(s.Blksize), Nlink: uint32(s.Nlink), UID: s.Uid, GID: s.Gid, Mode: uint16(s.Mode), Ino: i.ino, Size: uint64(s.Size), Blocks: uint64(s.Blocks), Atime: timespecToStatxTimestamp(s.Atim), Ctime: timespecToStatxTimestamp(s.Ctim), Mtime: timespecToStatxTimestamp(s.Mtim), DevMajor: linux.UNNAMED_MAJOR, DevMinor: fs.devMinor, }, nil } func (i *inode) stat(stat *unix.Stat_t) error { if err := unix.Fstat(i.hostFD, stat); err != nil { return err } if i.virtualOwner.enabled { stat.Uid = i.virtualOwner.atomicUID() stat.Gid = i.virtualOwner.atomicGID() stat.Mode = i.virtualOwner.atomicMode() } return nil } // SetStat implements kernfs.Inode.SetStat. // // +checklocksignore func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if i.readonly { return linuxerr.EPERM } s := &opts.Stat m := s.Mask if m == 0 { return nil } supportedModes := uint32(linux.STATX_MODE | linux.STATX_SIZE | linux.STATX_ATIME | linux.STATX_MTIME) if i.virtualOwner.enabled { if m&virtualOwnerModes != 0 { // Take lock if any of the virtual owner fields will be updated. i.virtualOwner.mu.Lock() defer i.virtualOwner.mu.Unlock() } supportedModes |= virtualOwnerModes } if m&^supportedModes != 0 { return linuxerr.EPERM } var hostStat unix.Stat_t if err := i.stat(&hostStat); err != nil { return err } if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { return err } if m&linux.STATX_MODE != 0 { if i.virtualOwner.enabled { // We hold i.virtualOwner.mu. i.virtualOwner.mode = atomicbitops.FromUint32(uint32(opts.Stat.Mode)) } else { log.Warningf("sentry seccomp filters don't allow making fchmod(2) syscall") return unix.EPERM } } if m&linux.STATX_SIZE != 0 { if hostStat.Mode&linux.S_IFMT != linux.S_IFREG { return linuxerr.EINVAL } if err := unix.Ftruncate(i.hostFD, int64(s.Size)); err != nil { return err } oldSize := uint64(hostStat.Size) if s.Size < oldSize { oldpgend, _ := hostarch.PageRoundUp(oldSize) newpgend, _ := hostarch.PageRoundUp(s.Size) if oldpgend != newpgend { i.CachedMappable.InvalidateRange(memmap.MappableRange{newpgend, oldpgend}) } } } if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { ts := [2]unix.Timespec{ toTimespec(s.Atime, m&linux.STATX_ATIME == 0), toTimespec(s.Mtime, m&linux.STATX_MTIME == 0), } if err := setTimestamps(i.hostFD, &ts); err != nil { return err } } if i.virtualOwner.enabled { if m&linux.STATX_UID != 0 { // We hold i.virtualOwner.mu. i.virtualOwner.uid = atomicbitops.FromUint32(opts.Stat.UID) } if m&linux.STATX_GID != 0 { // We hold i.virtualOwner.mu. i.virtualOwner.gid = atomicbitops.FromUint32(opts.Stat.GID) } } return nil } // DecRef implements kernfs.Inode.DecRef. 
func (i *inode) DecRef(ctx context.Context) { i.inodeRefs.DecRef(func() { if i.epollable { fdnotifier.RemoveFD(int32(i.hostFD)) } if err := unix.Close(i.hostFD); err != nil { log.Warningf("failed to close host fd %d: %v", i.hostFD, err) } // We can't rely on fdnotifier when closing the fd, because the event may race // with fdnotifier.RemoveFD. Instead, notify the queue explicitly. i.queue.Notify(waiter.EventHUp | waiter.ReadableEvents | waiter.WritableEvents) }) } // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Once created, we cannot re-open a socket fd through /proc/[pid]/fd/. if i.Mode().FileType() == linux.S_IFSOCK { return nil, linuxerr.ENXIO } var stat unix.Stat_t if err := i.stat(&stat); err != nil { return nil, err } fileType := linux.FileMode(stat.Mode).FileType() return i.open(ctx, d, rp.Mount(), fileType, opts.Flags) } func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, fileType linux.FileMode, flags uint32) (*vfs.FileDescription, error) { // Constrain flags to a subset we can handle. // // TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls. flags &= unix.O_ACCMODE | unix.O_NONBLOCK | unix.O_DSYNC | unix.O_SYNC | unix.O_APPEND switch fileType { case unix.S_IFSOCK: if i.isTTY { log.Warningf("cannot use host socket fd %d as TTY", i.hostFD) return nil, linuxerr.ENOTTY } ep, err := newEndpoint(ctx, i.hostFD, &i.queue) if err != nil { return nil, err } // Currently, we only allow Unix sockets to be imported. return unixsocket.NewFileDescription(ep, ep.Type(), flags, nil, mnt, d.VFSDentry(), &i.locks) case unix.S_IFREG, unix.S_IFIFO, unix.S_IFCHR: if i.isTTY { fd := &TTYFileDescription{ fileDescription: fileDescription{inode: i}, termios: linux.DefaultReplicaTermios, } if task := kernel.TaskFromContext(ctx); task != nil { fd.fgProcessGroup = task.ThreadGroup().ProcessGroup() fd.session = fd.fgProcessGroup.Session() } fd.LockFD.Init(&i.locks) vfsfd := &fd.vfsfd if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return vfsfd, nil } fd := &fileDescription{inode: i} fd.LockFD.Init(&i.locks) vfsfd := &fd.vfsfd if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return vfsfd, nil default: log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType) return nil, linuxerr.EPERM } } // Create a new host-backed endpoint from the given fd and its corresponding // notification queue. func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transport.Endpoint, error) { // Set up an external transport.Endpoint using the host fd. addr := fmt.Sprintf("hostfd:[%d]", hostFD) e, err := transport.NewHostConnectedEndpoint(hostFD, addr) if err != nil { return nil, err.ToError() } ep := transport.NewExternal(e.SockType(), uniqueid.GlobalProviderFromContext(ctx), queue, e, e) return ep, nil } // fileDescription is embedded by host fd implementations of FileDescriptionImpl. // // +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD // inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but // cached to reduce indirections and casting. fileDescription does not hold // a reference on the inode through the inode field (since one is already // held via the Dentry). 
// // inode is immutable after fileDescription creation. inode *inode // offsetMu protects offset. offsetMu sync.Mutex `state:"nosave"` // offset specifies the current file offset. It is only meaningful when // inode.seekable is true. offset int64 } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts) } // Stat implements vfs.FileDescriptionImpl.Stat. func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts) } // Release implements vfs.FileDescriptionImpl.Release. func (f *fileDescription) Release(context.Context) { // noop } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { if f.inode.readonly { return linuxerr.EPERM } return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length)) } // PRead implements vfs.FileDescriptionImpl.PRead. func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, linuxerr.EOPNOTSUPP } i := f.inode if !i.seekable { return 0, linuxerr.ESPIPE } return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) } // Read implements vfs.FileDescriptionImpl.Read. func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, linuxerr.EOPNOTSUPP } i := f.inode if !i.seekable { bufN, err := i.readFromBuf(ctx, &dst) if err != nil { return bufN, err } n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags) total := bufN + n if isBlockError(err) { // If we got any data at all, return it as a "completed" partial read // rather than retrying until complete. if total != 0 { err = nil } else { err = linuxerr.ErrWouldBlock } } return total, err } f.offsetMu.Lock() n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags) f.offset += n f.offsetMu.Unlock() return n, err } func (i *inode) readFromBuf(ctx context.Context, dst *usermem.IOSequence) (int64, error) { if i.haveBuf.Load() == 0 { return 0, nil } i.bufMu.Lock() defer i.bufMu.Unlock() if len(i.buf) == 0 { return 0, nil } n, err := dst.CopyOut(ctx, i.buf) *dst = dst.DropFirst(n) i.buf = i.buf[n:] if len(i.buf) == 0 { i.haveBuf.Store(0) i.buf = nil } return int64(n), err } func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) n, err := dst.CopyOutFrom(ctx, reader) hostfd.PutReadWriterAt(reader) return int64(n), err } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { if !f.inode.seekable { return 0, linuxerr.ESPIPE } return f.writeToHostFD(ctx, src, offset, opts.Flags) } // Write implements vfs.FileDescriptionImpl.Write. 
func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { i := f.inode if !i.seekable { n, err := f.writeToHostFD(ctx, src, -1, opts.Flags) if isBlockError(err) { err = linuxerr.ErrWouldBlock } return n, err } f.offsetMu.Lock() // NOTE(gvisor.dev/issue/2983): O_APPEND may cause memory corruption if // another process modifies the host file between retrieving the file size // and writing to the host fd. This is an unavoidable race condition because // we cannot enforce synchronization on the host. if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 { var s unix.Stat_t if err := unix.Fstat(i.hostFD, &s); err != nil { f.offsetMu.Unlock() return 0, err } f.offset = s.Size } n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags) f.offset += n f.offsetMu.Unlock() return n, err } func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) { if f.inode.readonly { return 0, linuxerr.EPERM } hostFD := f.inode.hostFD // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if flags != 0 { return 0, linuxerr.EOPNOTSUPP } writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) n, err := src.CopyInTo(ctx, writer) hostfd.PutReadWriterAt(writer) // NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC. if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { if syncErr := unix.Fsync(hostFD); syncErr != nil { return int64(n), syncErr } } return int64(n), err } // Seek implements vfs.FileDescriptionImpl.Seek. // // Note that we do not support seeking on directories, since we do not even // allow directory fds to be imported at all. func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) { i := f.inode if !i.seekable { return 0, linuxerr.ESPIPE } f.offsetMu.Lock() defer f.offsetMu.Unlock() switch whence { case linux.SEEK_SET: if offset < 0 { return f.offset, linuxerr.EINVAL } f.offset = offset case linux.SEEK_CUR: // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. if offset > math.MaxInt64-f.offset { return f.offset, linuxerr.EOVERFLOW } if f.offset+offset < 0 { return f.offset, linuxerr.EINVAL } f.offset += offset case linux.SEEK_END: var s unix.Stat_t if err := unix.Fstat(i.hostFD, &s); err != nil { return f.offset, err } size := s.Size // Check for overflow. Note that underflow cannot occur, since size >= 0. if offset > math.MaxInt64-size { return f.offset, linuxerr.EOVERFLOW } if size+offset < 0 { return f.offset, linuxerr.EINVAL } f.offset = size + offset case linux.SEEK_DATA, linux.SEEK_HOLE: // Modifying the offset in the host file table should not matter, since // this is the only place where we use it. // // For reading and writing, we always rely on our internal offset. n, err := unix.Seek(i.hostFD, offset, int(whence)) if err != nil { return f.offset, err } f.offset = n default: // Invalid whence. return f.offset, linuxerr.EINVAL } return f.offset, nil } // Sync implements vfs.FileDescriptionImpl.Sync. func (f *fileDescription) Sync(ctx context.Context) error { if f.inode.readonly { return linuxerr.EPERM } // TODO(gvisor.dev/issue/1897): Currently, we always sync everything. return unix.Fsync(f.inode.hostFD) } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. 
func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { // NOTE(b/38213152): Technically, some obscure char devices can be memory // mapped, but we only allow regular files. if f.inode.ftype != unix.S_IFREG { return linuxerr.ENODEV } i := f.inode i.CachedMappable.InitFileMapperOnce() return vfs.GenericConfigureMMap(&f.vfsfd, i, opts) } // EventRegister implements waiter.Waitable.EventRegister. func (f *fileDescription) EventRegister(e *waiter.Entry) error { f.inode.queue.EventRegister(e) if f.inode.epollable { if err := fdnotifier.UpdateFD(int32(f.inode.hostFD)); err != nil { f.inode.queue.EventUnregister(e) return err } } return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (f *fileDescription) EventUnregister(e *waiter.Entry) { f.inode.queue.EventUnregister(e) if f.inode.epollable { if err := fdnotifier.UpdateFD(int32(f.inode.hostFD)); err != nil { panic(fmt.Sprint("UpdateFD:", err)) } } } // Readiness uses the poll() syscall to check the status of the underlying FD. func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask) } // Epollable implements FileDescriptionImpl.Epollable. func (f *fileDescription) Epollable() bool { return f.inode.epollable } // Ioctl queries the underlying FD for allowed ioctl commands. func (f *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { switch cmd := args[1].Int(); cmd { case linux.FIONREAD: v, err := ioctlFionread(f.inode.hostFD) if err != nil { return 0, err } var buf [4]byte hostarch.ByteOrder.PutUint32(buf[:], v) _, err = uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) return 0, err } return f.FileDescriptionDefaultImpl.Ioctl(ctx, uio, sysno, args) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/host/host_state_autogen.go000066400000000000000000000161211465435605700274460ustar00rootroot00000000000000// automatically generated by stateify. 
package host import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (v *virtualOwner) StateTypeName() string { return "pkg/sentry/fsimpl/host.virtualOwner" } func (v *virtualOwner) StateFields() []string { return []string{ "enabled", "uid", "gid", "mode", } } func (v *virtualOwner) beforeSave() {} // +checklocksignore func (v *virtualOwner) StateSave(stateSinkObject state.Sink) { v.beforeSave() stateSinkObject.Save(0, &v.enabled) stateSinkObject.Save(1, &v.uid) stateSinkObject.Save(2, &v.gid) stateSinkObject.Save(3, &v.mode) } func (v *virtualOwner) afterLoad(context.Context) {} // +checklocksignore func (v *virtualOwner) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &v.enabled) stateSourceObject.Load(1, &v.uid) stateSourceObject.Load(2, &v.gid) stateSourceObject.Load(3, &v.mode) } func (i *inode) StateTypeName() string { return "pkg/sentry/fsimpl/host.inode" } func (i *inode) StateFields() []string { return []string{ "CachedMappable", "InodeNoStatFS", "InodeAnonymous", "InodeNotDirectory", "InodeNotSymlink", "InodeTemporary", "InodeWatches", "locks", "inodeRefs", "restoreKey", "ino", "ftype", "epollable", "seekable", "isTTY", "savable", "readonly", "queue", "virtualOwner", "haveBuf", "buf", } } // +checklocksignore func (i *inode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.CachedMappable) stateSinkObject.Save(1, &i.InodeNoStatFS) stateSinkObject.Save(2, &i.InodeAnonymous) stateSinkObject.Save(3, &i.InodeNotDirectory) stateSinkObject.Save(4, &i.InodeNotSymlink) stateSinkObject.Save(5, &i.InodeTemporary) stateSinkObject.Save(6, &i.InodeWatches) stateSinkObject.Save(7, &i.locks) stateSinkObject.Save(8, &i.inodeRefs) stateSinkObject.Save(9, &i.restoreKey) stateSinkObject.Save(10, &i.ino) stateSinkObject.Save(11, &i.ftype) stateSinkObject.Save(12, &i.epollable) stateSinkObject.Save(13, &i.seekable) stateSinkObject.Save(14, &i.isTTY) stateSinkObject.Save(15, &i.savable) stateSinkObject.Save(16, &i.readonly) stateSinkObject.Save(17, &i.queue) stateSinkObject.Save(18, &i.virtualOwner) stateSinkObject.Save(19, &i.haveBuf) stateSinkObject.Save(20, &i.buf) } // +checklocksignore func (i *inode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.CachedMappable) stateSourceObject.Load(1, &i.InodeNoStatFS) stateSourceObject.Load(2, &i.InodeAnonymous) stateSourceObject.Load(3, &i.InodeNotDirectory) stateSourceObject.Load(4, &i.InodeNotSymlink) stateSourceObject.Load(5, &i.InodeTemporary) stateSourceObject.Load(6, &i.InodeWatches) stateSourceObject.Load(7, &i.locks) stateSourceObject.Load(8, &i.inodeRefs) stateSourceObject.Load(9, &i.restoreKey) stateSourceObject.Load(10, &i.ino) stateSourceObject.Load(11, &i.ftype) stateSourceObject.Load(12, &i.epollable) stateSourceObject.Load(13, &i.seekable) stateSourceObject.Load(14, &i.isTTY) stateSourceObject.Load(15, &i.savable) stateSourceObject.Load(16, &i.readonly) stateSourceObject.Load(17, &i.queue) stateSourceObject.Load(18, &i.virtualOwner) stateSourceObject.Load(19, &i.haveBuf) stateSourceObject.Load(20, &i.buf) stateSourceObject.AfterLoad(func() { i.afterLoad(ctx) }) } func (f *filesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/host.filesystemType" } func (f *filesystemType) StateFields() []string { return []string{} } func (f *filesystemType) beforeSave() {} // +checklocksignore func (f *filesystemType) StateSave(stateSinkObject state.Sink) { f.beforeSave() } func (f *filesystemType) afterLoad(context.Context) 
{} // +checklocksignore func (f *filesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/host.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "Filesystem", "devMinor", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.Filesystem) stateSinkObject.Save(1, &fs.devMinor) } func (fs *filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.Filesystem) stateSourceObject.Load(1, &fs.devMinor) } func (f *fileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/host.fileDescription" } func (f *fileDescription) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "LockFD", "inode", "offset", } } func (f *fileDescription) beforeSave() {} // +checklocksignore func (f *fileDescription) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.vfsfd) stateSinkObject.Save(1, &f.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &f.LockFD) stateSinkObject.Save(3, &f.inode) stateSinkObject.Save(4, &f.offset) } func (f *fileDescription) afterLoad(context.Context) {} // +checklocksignore func (f *fileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.vfsfd) stateSourceObject.Load(1, &f.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &f.LockFD) stateSourceObject.Load(3, &f.inode) stateSourceObject.Load(4, &f.offset) } func (r *inodeRefs) StateTypeName() string { return "pkg/sentry/fsimpl/host.inodeRefs" } func (r *inodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *inodeRefs) beforeSave() {} // +checklocksignore func (r *inodeRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *inodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (t *TTYFileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/host.TTYFileDescription" } func (t *TTYFileDescription) StateFields() []string { return []string{ "fileDescription", "session", "fgProcessGroup", "termios", } } func (t *TTYFileDescription) beforeSave() {} // +checklocksignore func (t *TTYFileDescription) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.fileDescription) stateSinkObject.Save(1, &t.session) stateSinkObject.Save(2, &t.fgProcessGroup) stateSinkObject.Save(3, &t.termios) } func (t *TTYFileDescription) afterLoad(context.Context) {} // +checklocksignore func (t *TTYFileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.fileDescription) stateSourceObject.Load(1, &t.session) stateSourceObject.Load(2, &t.fgProcessGroup) stateSourceObject.Load(3, &t.termios) } func init() { state.Register((*virtualOwner)(nil)) state.Register((*inode)(nil)) state.Register((*filesystemType)(nil)) state.Register((*filesystem)(nil)) state.Register((*fileDescription)(nil)) state.Register((*inodeRefs)(nil)) state.Register((*TTYFileDescription)(nil)) } 
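// Illustrative note on how the generated methods above are driven during
// checkpoint/restore (a sketch, not additional generated code):
//
/*
	// Checkpoint: StateSave first runs i.beforeSave() (which, for pipe FDs,
	// drains readable data into i.buf; see save_restore.go), then writes each
	// field listed in StateFields() by index:
	//   sink.Save(0, &i.CachedMappable) ... sink.Save(20, &i.buf)
	// hostFD is tagged `state:"nosave"` and is deliberately not serialized.

	// Restore: StateLoad reads the same indices back and schedules
	// i.afterLoad(ctx), which maps i.restoreKey to a new host FD via
	// vfs.RestoreFilesystemFDMapFromContext(ctx) and, for epollable FDs,
	// re-registers the FD with fdnotifier.
*/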
golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/host/host_unsafe.go000066400000000000000000000016171465435605700260710ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package host import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) func ioctlFionread(fd int) (uint32, error) { var v uint32 if _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), linux.FIONREAD, uintptr(unsafe.Pointer(&v))); errno != 0 { return 0, errno } return v, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/host/host_unsafe_state_autogen.go000066400000000000000000000000661465435605700310100ustar00rootroot00000000000000// automatically generated by stateify. package host golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/host/inode_refs.go000066400000000000000000000100651465435605700256650ustar00rootroot00000000000000package host import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const inodeenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var inodeobj *inode // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type inodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *inodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *inodeRefs) RefType() string { return fmt.Sprintf("%T", inodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *inodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. 
func (r *inodeRefs) LogRefs() bool { return inodeenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *inodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *inodeRefs) IncRef() { v := r.refCount.Add(1) if inodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *inodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if inodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *inodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if inodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *inodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/host/ioctl_unsafe.go000066400000000000000000000030731465435605700262240ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
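// The helpers in this file wrap raw ioctl(2) calls on the donated host FD.
// TTYFileDescription.Ioctl (see tty.go) uses the termios pair roughly as in
// the sketch below; argPtr and req are placeholders for args[2].Pointer() and
// the ioctl request number supplied by the application.
/*
	// TCGETS: read the host termios and copy it out to the application.
	termios, err := ioctlGetTermios(fd)
	if err != nil {
		return 0, err
	}
	_, err = termios.CopyOut(task, argPtr)

	// TCSETS/TCSETSW/TCSETSF: copy the termios in from the application,
	// apply it to the host FD, and mirror it into t.termios on success.
	var newTermios linux.Termios
	if _, err := newTermios.CopyIn(task, argPtr); err != nil {
		return 0, err
	}
	if err := ioctlSetTermios(fd, req, &newTermios); err == nil {
		t.termios.FromTermios(newTermios)
	}
*/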
package host import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) func ioctlGetTermios(fd int) (*linux.Termios, error) { var t linux.Termios _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t))) if errno != 0 { return nil, errno } return &t, nil } func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error { _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t))) if errno != 0 { return errno } return nil } func ioctlGetWinsize(fd int) (*linux.Winsize, error) { var w linux.Winsize _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w))) if errno != 0 { return nil, errno } return &w, nil } func ioctlSetWinsize(fd int, w *linux.Winsize) error { _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w))) if errno != 0 { return errno } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/host/save_restore.go000066400000000000000000000055231465435605700262540ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package host import ( "context" "fmt" "io" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // MakeRestoreID creates a RestoreID for a given application FD. The application // FD remains the same between restores, e.g. stdout=2 before and after restore, // but the host FD that is maps to can change between restores. This ID is used // to map application FDs to their respective FD after a restore happens. func MakeRestoreID(containerName string, fd int) vfs.RestoreID { return vfs.RestoreID{ ContainerName: containerName, Path: fmt.Sprintf("host:%d", fd), } } // beforeSave is invoked by stateify. func (i *inode) beforeSave() { if !i.savable { panic("host.inode is not savable") } if i.ftype == unix.S_IFIFO { // If this pipe FD is readable, drain it so that bytes in the pipe can // be read after restore. (This is a legacy VFS1 feature.) We don't // know if the pipe FD is readable, so just try reading and tolerate // EBADF from the read. i.bufMu.Lock() defer i.bufMu.Unlock() var buf [hostarch.PageSize]byte for { n, err := hostfd.Preadv2(int32(i.hostFD), safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:])), -1 /* offset */, 0 /* flags */) if n != 0 { i.buf = append(i.buf, buf[:n]...) } if err != nil { if err == io.EOF || err == unix.EAGAIN || err == unix.EBADF { break } panic(fmt.Errorf("host.inode.beforeSave: buffering from pipe failed: %v", err)) } } if len(i.buf) != 0 { i.haveBuf.Store(1) } } } // afterLoad is invoked by stateify. 
func (i *inode) afterLoad(ctx context.Context) { fdmap := vfs.RestoreFilesystemFDMapFromContext(ctx) fd, ok := fdmap[i.restoreKey] if !ok { panic(fmt.Sprintf("no host FD available for %+v, map: %v", i.restoreKey, fdmap)) } i.hostFD = fd if i.epollable { if err := unix.SetNonblock(i.hostFD, true); err != nil { panic(fmt.Sprintf("host.inode.afterLoad: failed to set host FD %d non-blocking: %v", i.hostFD, err)) } if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { panic(fmt.Sprintf("host.inode.afterLoad: fdnotifier.AddFD(%d) failed: %v", i.hostFD, err)) } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/host/tty.go000066400000000000000000000253211465435605700243710ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package host import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // TTYFileDescription implements vfs.FileDescriptionImpl for a host file // descriptor that wraps a TTY FD. // // +stateify savable type TTYFileDescription struct { fileDescription // mu protects the fields below. mu sync.Mutex `state:"nosave"` // session is the session attached to this TTYFileDescription. session *kernel.Session // fgProcessGroup is the foreground process group that is currently // connected to this TTY. fgProcessGroup *kernel.ProcessGroup // termios contains the terminal attributes for this TTY. termios linux.KernelTermios } // InitForegroundProcessGroup sets the foreground process group and session for // the TTY. This should only be called once, after the foreground process group // has been created, but before it has started running. func (t *TTYFileDescription) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { t.mu.Lock() defer t.mu.Unlock() if t.fgProcessGroup != nil { panic("foreground process group is already set") } t.fgProcessGroup = pg t.session = pg.Session() } // ForegroundProcessGroup returns the foreground process for the TTY. func (t *TTYFileDescription) ForegroundProcessGroup() *kernel.ProcessGroup { t.mu.Lock() defer t.mu.Unlock() return t.fgProcessGroup } // Release implements fs.FileOperations.Release. func (t *TTYFileDescription) Release(ctx context.Context) { t.mu.Lock() t.fgProcessGroup = nil t.mu.Unlock() t.fileDescription.Release(ctx) } // PRead implements vfs.FileDescriptionImpl.PRead. // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() // Are we allowed to do the read? 
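	// checkChange requires t.mu, which we hold here. For background process
	// groups it either returns EIO (when SIGTTIN is blocked or ignored, or the
	// group is an orphan) or sends SIGTTIN and returns ERESTARTSYS.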
// drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { return 0, err } // Do the read. return t.fileDescription.PRead(ctx, dst, offset, opts) } // Read implements vfs.FileDescriptionImpl.Read. // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() // Are we allowed to do the read? // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { return 0, err } // Do the read. return t.fileDescription.Read(ctx, dst, opts) } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() // Check whether TOSTOP is enabled. This corresponds to the check in // drivers/tty/n_tty.c:n_tty_write(). if t.termios.LEnabled(linux.TOSTOP) { if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { return 0, err } } return t.fileDescription.PWrite(ctx, src, offset, opts) } // Write implements vfs.FileDescriptionImpl.Write. func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() // Check whether TOSTOP is enabled. This corresponds to the check in // drivers/tty/n_tty.c:n_tty_write(). if t.termios.LEnabled(linux.TOSTOP) { if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { return 0, err } } return t.fileDescription.Write(ctx, src, opts) } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { return 0, linuxerr.ENOTTY } // Ignore arg[0]. This is the real FD: fd := t.inode.hostFD ioctl := args[1].Uint64() switch ioctl { case linux.FIONREAD: v, err := ioctlFionread(fd) if err != nil { return 0, err } var buf [4]byte hostarch.ByteOrder.PutUint32(buf[:], v) _, err = io.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) return 0, err case linux.TCGETS: termios, err := ioctlGetTermios(fd) if err != nil { return 0, err } _, err = termios.CopyOut(task, args[2].Pointer()) return 0, err case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: t.mu.Lock() defer t.mu.Unlock() if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { return 0, err } var termios linux.Termios if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } err := ioctlSetTermios(fd, ioctl, &termios) if err == nil { t.termios.FromTermios(termios) } return 0, err case linux.TIOCGPGRP: // Args: pid_t *argp // When successful, equivalent to *argp = tcgetpgrp(fd). // Get the process group ID of the foreground process group on this // terminal. pidns := kernel.PIDNamespaceFromContext(ctx) if pidns == nil { return 0, linuxerr.ENOTTY } t.mu.Lock() defer t.mu.Unlock() // Map the ProcessGroup into a ProcessGroupID in the task's PID namespace. pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup)) _, err := pgID.CopyOut(task, args[2].Pointer()) return 0, err case linux.TIOCSPGRP: // Args: const pid_t *argp // Equivalent to tcsetpgrp(fd, *argp). // Set the foreground process group ID of this terminal. 
t.mu.Lock() defer t.mu.Unlock() // Check that we are allowed to set the process group. if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from tty_check_change() // to -ENOTTY. if linuxerr.Equals(linuxerr.EIO, err) { return 0, linuxerr.ENOTTY } return 0, err } // Check that calling task's process group is in the TTY session. if task.ThreadGroup().Session() != t.session { return 0, linuxerr.ENOTTY } var pgIDP primitive.Int32 if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } pgID := kernel.ProcessGroupID(pgIDP) // pgID must be non-negative. if pgID < 0 { return 0, linuxerr.EINVAL } // Process group with pgID must exist in this PID namespace. pidns := task.PIDNamespace() pg := pidns.ProcessGroupWithID(pgID) if pg == nil { return 0, linuxerr.ESRCH } // Check that new process group is in the TTY session. if pg.Session() != t.session { return 0, linuxerr.EPERM } t.fgProcessGroup = pg return 0, nil case linux.TIOCGWINSZ: // Args: struct winsize *argp // Get window size. winsize, err := ioctlGetWinsize(fd) if err != nil { return 0, err } _, err = winsize.CopyOut(task, args[2].Pointer()) return 0, err case linux.TIOCSWINSZ: // Args: const struct winsize *argp // Set window size. // Unlike setting the termios, any process group (even background ones) can // set the winsize. var winsize linux.Winsize if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } err := ioctlSetWinsize(fd, &winsize) return 0, err // Unimplemented commands. case linux.TIOCSETD, linux.TIOCSBRK, linux.TIOCCBRK, linux.TCSBRK, linux.TCSBRKP, linux.TIOCSTI, linux.TIOCCONS, linux.FIONBIO, linux.TIOCEXCL, linux.TIOCNXCL, linux.TIOCGEXCL, linux.TIOCNOTTY, linux.TIOCSCTTY, linux.TIOCGSID, linux.TIOCGETD, linux.TIOCVHANGUP, linux.TIOCGDEV, linux.TIOCMGET, linux.TIOCMSET, linux.TIOCMBIC, linux.TIOCMBIS, linux.TIOCGICOUNT, linux.TCFLSH, linux.TIOCSSERIAL, linux.TIOCGPTPEER: unimpl.EmitUnimplementedEvent(ctx, sysno) fallthrough default: return 0, linuxerr.ENOTTY } } // checkChange checks that the process group is allowed to read, write, or // change the state of the TTY. // // This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic // is a bit convoluted, but documented inline. // // Preconditions: t.mu must be held. func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) error { task := kernel.TaskFromContext(ctx) if task == nil { // No task? Linux does not have an analog for this case, but // tty_check_change only blocks specific cases and is // surprisingly permissive. Allowing the change seems // appropriate. return nil } tg := task.ThreadGroup() pg := tg.ProcessGroup() // If the session for the task is different than the session for the // controlling TTY, then the change is allowed. Seems like a bad idea, // but that's exactly what linux does. if tg.Session() != t.fgProcessGroup.Session() { return nil } // If we are the foreground process group, then the change is allowed. if pg == t.fgProcessGroup { return nil } // We are not the foreground process group. // Is the provided signal blocked or ignored? if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) { // If the signal is SIGTTIN, then we are attempting to read // from the TTY. Don't send the signal and return EIO. if sig == linux.SIGTTIN { return linuxerr.EIO } // Otherwise, we are writing or changing terminal state. This is allowed. 
return nil } // If the process group is an orphan, return EIO. if pg.IsOrphan() { return linuxerr.EIO } // Otherwise, send the signal to the process group and return ERESTARTSYS. // // Note that Linux also unconditionally sets TIF_SIGPENDING on current, // but this isn't necessary in gVisor because the rationale given in // 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't // apply: the sentry will handle -ERESTARTSYS in // kernel.runApp.execute() even if the kernel.Task isn't interrupted. // // Linux ignores the result of kill_pgrp(). _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) return linuxerr.ERESTARTSYS } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/host/util.go000066400000000000000000000027071465435605700245310ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package host import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" ) func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec { if omit { return unix.Timespec{ Sec: 0, Nsec: unix.UTIME_OMIT, } } return unix.Timespec{ Sec: ts.Sec, Nsec: int64(ts.Nsec), } } func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp { return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec} } func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp { return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)} } // isBlockError checks if an error is EAGAIN or EWOULDBLOCK. // If so, they can be transformed into linuxerr.ErrWouldBlock. func isBlockError(err error) bool { return linuxerr.Equals(linuxerr.EAGAIN, err) || linuxerr.Equals(linuxerr.EWOULDBLOCK, err) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/host/util_unsafe.go000066400000000000000000000016111465435605700260630ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
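// setTimestamps below issues utimensat(2) directly on the host FD. A sketch of
// how inode.SetStat drives it, using toTimespec from util.go to pass
// UTIME_OMIT for any timestamp the caller did not ask to change (mask and s
// stand for the requested statx mask and payload):
/*
	ts := [2]unix.Timespec{
		toTimespec(s.Atime, mask&linux.STATX_ATIME == 0), // omitted if unrequested
		toTimespec(s.Mtime, mask&linux.STATX_MTIME == 0),
	}
	if err := setTimestamps(hostFD, &ts); err != nil {
		return err
	}
*/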
package host import ( "unsafe" "golang.org/x/sys/unix" ) func setTimestamps(fd int, ts *[2]unix.Timespec) error { _, _, errno := unix.Syscall6( unix.SYS_UTIMENSAT, uintptr(fd), 0, /* path */ uintptr(unsafe.Pointer(ts)), 0, /* flags */ 0, 0) if errno != 0 { return errno } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/iouringfs/000077500000000000000000000000001465435605700242475ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/iouringfs/buffer.go000066400000000000000000000131751465435605700260560ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package iouringfs import ( "fmt" "gvisor.dev/gvisor/pkg/safemem" ) // sharedBuffer represents a memory buffer shared between the sentry and // userspace. In many cases, this is simply an internal mmap on the underlying // memory (aka fast mode). However in some cases the mapped region may lie // across multiple blocks and we need to copy the region into a contiguous // buffer (aka slow mode). The goal in either case is to present a contiguous // slice for easy access. // // sharedBuffer must be initialized with init before first use. // // Example // ======= /* var sb sharedBuffer bs := MapInternal(...) sb.init(bs) fetch := true for !done { var err error // (Re-)Fetch the view. var view []byte if fetch { view, err = sb.view(128) } // Use the view slice to access the region, both for read or write. someState := dosomething(view[10]) view[20] = someState & mask // Write back the changes. fetch, err = sb.writeback(128) } */ // In the above example, in fast mode view returns a slice that points directly // to the underlying memory and requires no copying. Writeback is a no-op, and // the view can be reused on subsequent loop iterations (writeback will return // refetch == false). // // In slow mode, view will copy disjoint parts of the region from different // blocks to a single contiguous slice. Writeback will also required a copy, and // a new view will have to be fetched on every loop iteration (writeback will // return refetch == true). // // sharedBuffer is *not* thread safe. type sharedBuffer struct { bs safemem.BlockSeq // copy is allocated once and reused on subsequent calls to view. We don't // use the Task's copy scratch buffer because these buffers may be accessed // from a background context. copy []byte // needsWriteback indicates whether we need to copy out back data from the // slice returned by the last view() call. needsWriteback bool } // init initializes the sharedBuffer, and must be called before first use. func (b *sharedBuffer) init(bs safemem.BlockSeq) { b.bs = bs } func (b *sharedBuffer) valid() bool { return !b.bs.IsEmpty() } // view returns a slice representing the shared buffer. When done, view must be // released with either writeback{,Window} or drop. func (b *sharedBuffer) view(n int) ([]byte, error) { if uint64(n) > b.bs.NumBytes() { // Mapping too short? This is a bug. 
panic(fmt.Sprintf("iouringfs: mapping too short for requested len: mapping length %v, requested %d", b.bs.NumBytes(), n)) } // Fast path: use mapping directly, no copies required. h := b.bs.Head() if h.Len() <= n && !h.NeedSafecopy() { b.needsWriteback = false return h.ToSlice()[:n], nil } // Buffer mapped across multiple blocks, or requires safe copy. if len(b.copy) < n { b.copy = make([]byte, n) } dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.copy[:n])) copyN, err := safemem.CopySeq(dst, b.bs) if err != nil { return nil, err } if copyN != uint64(n) { // Short copy risks exposing stale data from view buffer. This should never happen. panic(fmt.Sprintf("iouringfs: short copy for shared buffer view: want %d, got %d", n, copyN)) } b.needsWriteback = true return b.copy, nil } // writeback writes back the changes to the slice returned by the previous view // call. On return, writeback indicates if the previous view may be reused, or // needs to be refetched with a new call to view. // // Precondition: Must follow a call to view. n must match the value passed to // view. // // Postcondition: Previous view is invalidated whether writeback is successful // or not. To attempt another modification, a new view may need to be obtained, // according to refetch. func (b *sharedBuffer) writeback(n int) (refetch bool, err error) { return b.writebackWindow(0, n) } // writebackWindow is like writeback, but only writes back a subregion. Useful // if the caller knows only a small region has been updated, as it reduces how // much data need to be copied. writebackWindow still potentially invalidates // the entire view, caller must check refetch to determine if the view needs to // be refreshed. func (b *sharedBuffer) writebackWindow(off, len int) (refetch bool, err error) { if uint64(off+len) > b.bs.NumBytes() { panic(fmt.Sprintf("iouringfs: requested writeback to shared buffer from offset %d for %d bytes would overflow underlying region of size %d", off, len, b.bs.NumBytes())) } if !b.needsWriteback { return false, nil } // Existing view invalid after this point. b.needsWriteback = false src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.copy[off : off+len])) dst := b.bs.DropFirst(off) copyN, err := safemem.CopySeq(dst, src) if err != nil { return true, err } if copyN != uint64(len) { panic(fmt.Sprintf("iouringfs: short copy for shared buffer writeback: want %d, got %d", len, copyN)) } return true, nil } // drop releases a view without writeback. Returns whether any existing views // need to be refetched. Useful when caller is done with a view that doesn't // need to be modified. func (b *sharedBuffer) drop() bool { wb := b.needsWriteback b.needsWriteback = false return wb } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/iouringfs/iouringfs.go000066400000000000000000000471721465435605700266160ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// Package iouringfs provides a filesystem implementation for IO_URING basing // it on anonfs. Currently, we don't support neither IOPOLL nor SQPOLL modes. // Thus, user needs to set up IO_URING first with io_uring_setup(2) syscall and // then issue submission request using io_uring_enter(2). // // Another important note, as of now, we don't support deferred CQE. In other // words, the size of the backlogged set of CQE is zero. Whenever, completion // queue ring buffer is full, we drop the subsequent completion queue entries. package iouringfs import ( "fmt" "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) // FileDescription implements vfs.FileDescriptionImpl for file-based IO_URING. // It is based on io_rings struct. See io_uring/io_uring.c. // // +stateify savable type FileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD mf *pgalloc.MemoryFile `state:"nosave"` rbmf ringsBufferFile sqemf sqEntriesFile // running indicates whether the submission queue is currently being // processed. This is either 0 for not running, or 1 for running. running atomicbitops.Uint32 // runC is used to wake up serialized task goroutines waiting for any // concurrent processors of the submission queue. runC chan struct{} `state:"nosave"` ioRings linux.IORings ioRingsBuf sharedBuffer `state:"nosave"` sqesBuf sharedBuffer `state:"nosave"` cqesBuf sharedBuffer `state:"nosave"` // remap indicates whether the shared buffers need to be remapped // due to a S/R. Protected by ProcessSubmissions critical section. remap bool } var _ vfs.FileDescriptionImpl = (*FileDescription)(nil) func roundUpPowerOfTwo(n uint32) (uint32, bool) { if n > (1 << 31) { return 0, false } result := uint32(1) for result < n { result = result << 1 } return result, true } // New creates a new iouring fd. func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, entries uint32, params *linux.IOUringParams) (*vfs.FileDescription, error) { if entries > linux.IORING_MAX_ENTRIES { return nil, linuxerr.EINVAL } vd := vfsObj.NewAnonVirtualDentry("[io_uring]") defer vd.DecRef(ctx) mf := pgalloc.MemoryFileFromContext(ctx) if mf == nil { panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFile)) } numSqEntries, ok := roundUpPowerOfTwo(entries) if !ok { return nil, linuxerr.EOVERFLOW } var numCqEntries uint32 if params.Flags&linux.IORING_SETUP_CQSIZE != 0 { var ok bool numCqEntries, ok = roundUpPowerOfTwo(params.CqEntries) if !ok || numCqEntries < numSqEntries || numCqEntries > linux.IORING_MAX_CQ_ENTRIES { return nil, linuxerr.EINVAL } } else { numCqEntries = 2 * numSqEntries } // Allocate enough space to store the `struct io_rings` plus a given number of indexes // corresponding to the number of SQEs. 
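// The rings buffer allocated below is a single region laid out roughly as:
//
//	[ IORings header | CQE array | SQ index array ]
//
// with the CQE array and SQ index array offsets rounded up to a cache line
// (see the CqOff/SqOff computations further down). The SQE array lives in a
// separate allocation (sqemf); this split is what lets userspace map the
// SQ/CQ rings with a single mmap (IORING_FEAT_SINGLE_MMAP) and the SQEs
// with another.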
ioRingsWithCqesSize := uint32((*linux.IORings)(nil).SizeBytes()) + numCqEntries*uint32((*linux.IOUringCqe)(nil).SizeBytes()) ringsBufferSize := uint64(ioRingsWithCqesSize + numSqEntries*uint32((*linux.IORingIndex)(nil).SizeBytes())) ringsBufferSize = uint64(hostarch.Addr(ringsBufferSize).MustRoundUp()) memCgID := pgalloc.MemoryCgroupIDFromContext(ctx) rbfr, err := mf.Allocate(ringsBufferSize, pgalloc.AllocOpts{Kind: usage.Anonymous, MemCgID: memCgID}) if err != nil { return nil, linuxerr.ENOMEM } // Allocate enough space to store the given number of submission queue entries. sqEntriesSize := uint64(numSqEntries * uint32((*linux.IOUringSqe)(nil).SizeBytes())) sqEntriesSize = uint64(hostarch.Addr(sqEntriesSize).MustRoundUp()) sqefr, err := mf.Allocate(sqEntriesSize, pgalloc.AllocOpts{Kind: usage.Anonymous, MemCgID: memCgID}) if err != nil { return nil, linuxerr.ENOMEM } iouringfd := &FileDescription{ mf: mf, rbmf: ringsBufferFile{ fr: rbfr, }, sqemf: sqEntriesFile{ fr: sqefr, }, // See ProcessSubmissions for why the capacity is 1. runC: make(chan struct{}, 1), } // iouringfd is always set up with read/write mode. // See io_uring/io_uring.c:io_uring_install_fd(). if err := iouringfd.vfsfd.Init(iouringfd, uint32(linux.O_RDWR), vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ UseDentryMetadata: true, DenyPRead: true, DenyPWrite: true, DenySpliceIn: true, }); err != nil { return nil, err } params.SqEntries = numSqEntries params.CqEntries = numCqEntries arrayOffset := uint64(hostarch.Addr(ioRingsWithCqesSize)) arrayOffset, ok = hostarch.CacheLineRoundUp(arrayOffset) if !ok { return nil, linuxerr.EOVERFLOW } params.SqOff = linux.PreComputedIOSqRingOffsets() params.SqOff.Array = uint32(arrayOffset) cqesOffset := uint64(hostarch.Addr((*linux.IORings)(nil).SizeBytes())) cqesOffset, ok = hostarch.CacheLineRoundUp(cqesOffset) if !ok { return nil, linuxerr.EOVERFLOW } params.CqOff = linux.PreComputedIOCqRingOffsets() params.CqOff.Cqes = uint32(cqesOffset) // Set features supported by the current IO_URING implementation. params.Features = linux.IORING_FEAT_SINGLE_MMAP // Map all shared buffers. if err := iouringfd.mapSharedBuffers(); err != nil { return nil, err } // Initialize IORings struct from params. iouringfd.ioRings.SqRingMask = params.SqEntries - 1 iouringfd.ioRings.CqRingMask = params.CqEntries - 1 iouringfd.ioRings.SqRingEntries = params.SqEntries iouringfd.ioRings.CqRingEntries = params.CqEntries // Write IORings out to shared buffer. view, err := iouringfd.ioRingsBuf.view(iouringfd.ioRings.SizeBytes()) if err != nil { return nil, err } iouringfd.ioRings.MarshalUnsafe(view) buf := make([]byte, iouringfd.ioRings.SizeBytes()) iouringfd.ioRings.MarshalUnsafe(buf) if _, err := iouringfd.ioRingsBuf.writeback(iouringfd.ioRings.SizeBytes()); err != nil { return nil, err } return &iouringfd.vfsfd, nil } // Release implements vfs.FileDescriptionImpl.Release. func (fd *FileDescription) Release(ctx context.Context) { fd.mf.DecRef(fd.rbmf.fr) fd.mf.DecRef(fd.sqemf.fr) } // mapSharedBuffers caches internal mappings for the ring's shared memory // regions. func (fd *FileDescription) mapSharedBuffers() error { // Mapping for the IORings header struct. rb, err := fd.mf.MapInternal(fd.rbmf.fr, hostarch.ReadWrite) if err != nil { return err } fd.ioRingsBuf.init(rb) // Mapping for the CQEs array. This is contiguous to the header struct. 
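// The CQE view below reuses the rings buffer mapping obtained above: it
// simply skips the cache-line-rounded IORings header, so no second
// MapInternal call is needed for the completion queue entries.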
cqesOffset := uint64(fd.ioRings.SizeBytes()) cqesOffset, ok := hostarch.CacheLineRoundUp(cqesOffset) if !ok { return linuxerr.EOVERFLOW } cqes := rb.DropFirst(int(cqesOffset)) fd.cqesBuf.init(cqes) // Mapping for the SQEs array. sqes, err := fd.mf.MapInternal(fd.sqemf.fr, hostarch.ReadWrite) if err != nil { return err } fd.sqesBuf.init(sqes) return nil } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { var mf memmap.Mappable switch opts.Offset { case linux.IORING_OFF_SQ_RING, linux.IORING_OFF_CQ_RING: mf = &fd.rbmf case linux.IORING_OFF_SQES: mf = &fd.sqemf default: return linuxerr.EINVAL } opts.Offset = 0 return vfs.GenericConfigureMMap(&fd.vfsfd, mf, opts) } // ProcessSubmissions processes the submission queue. Concurrent calls to // ProcessSubmissions serialize, yielding task goroutines with Task.Block since // processing can take a long time. func (fd *FileDescription) ProcessSubmissions(t *kernel.Task, toSubmit uint32, minComplete uint32, flags uint32) (int, error) { // We use a combination of fd.running and fd.runC to serialize concurrent // callers to ProcessSubmissions. runC has a capacity of 1. The protocol // works as follows: // // * Becoming the active task // // On entry to ProcessSubmissions, we try to transition running from 0 to // 1. If there is already an active task, this will fail and we'll go to // sleep with Task.Block(). If we succeed, we're the active task. // // * Sleep, Wakeup // // If we had to sleep, on wakeup we try to transition running to 1 again as // we could still be racing with other tasks. Note that if multiple tasks // are sleeping, only one will wake up since only one will successfully // receive from runC. However we could still race with a new caller of // ProcessSubmissions that hasn't gone to sleep yet. Only one waiting task // will succeed and become the active task, the rest will go to sleep. // // runC needs to be buffered to avoid a race between checking running and // going back to sleep. With an unbuffered channel, we could miss a wakeup // like this: // // Task B (entering, sleeping) | Task A (active, releasing) // ---------------------------------------------------+------------------------- // | fd.running.Store(0) // for !fd.running.CompareAndSwap(0, 1) { // Success | // | nonblockingSend(runC) // Missed! // t.Block(fd.runC) // Will block forever | // } // // Task A's send would have to be non-blocking, as there may not be a // concurrent Task B. // // A side-effect of using a buffered channel is the first task that needs to // sleep may wake up once immediately due to a previously queued // wakeup. This isn't a problem, as it'll immediately try to transition // running to 1, likely fail again and go back to sleep. Task.Block has a // fast path if runC already has a queued message so this won't result in a // task state change. // // * Release // // When the active task is done, it releases the critical section by setting // running = 0, then doing a non-blocking send on runC. The send needs to be // non-blocking, as there may not be a concurrent sleeper. for !fd.running.CompareAndSwap(0, 1) { t.Block(fd.runC) } // We successfully set fd.running, so we're the active task now. defer func() { // Unblock any potentially waiting tasks. 
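// This is the release step of the protocol described above: clear
// fd.running first, then do a non-blocking send on runC so that at most
// one sleeping task wakes up and retries the CAS.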
if !fd.running.CompareAndSwap(1, 0) { panic(fmt.Sprintf("iouringfs.FileDescription.ProcessSubmissions: active task encountered invalid fd.running state %v", fd.running.Load())) } select { case fd.runC <- struct{}{}: default: } }() // The rest of this function is a critical section with respect to // concurrent callers. if fd.remap { fd.mapSharedBuffers() fd.remap = false } var err error var sqe linux.IOUringSqe sqOff := linux.PreComputedIOSqRingOffsets() cqOff := linux.PreComputedIOCqRingOffsets() sqArraySize := sqe.SizeBytes() * int(fd.ioRings.SqRingEntries) cqArraySize := (*linux.IOUringCqe)(nil).SizeBytes() * int(fd.ioRings.CqRingEntries) // Fetch all buffers initially. fetchRB := true fetchSQA := true fetchCQA := true var view, sqaView, cqaView []byte submitted := uint32(0) for toSubmit > submitted { // This loop can take a long time to process, so periodically check for // interrupts. This also pets the watchdog. if t.Interrupted() { return -1, linuxerr.EINTR } if fetchRB { view, err = fd.ioRingsBuf.view(fd.ioRings.SizeBytes()) if err != nil { return -1, err } } // Note: The kernel uses sqHead as a cursor and writes cqTail. Userspace // uses cqHead as a cursor and writes sqTail. sqHeadPtr := atomicUint32AtOffset(view, int(sqOff.Head)) sqTailPtr := atomicUint32AtOffset(view, int(sqOff.Tail)) cqHeadPtr := atomicUint32AtOffset(view, int(cqOff.Head)) cqTailPtr := atomicUint32AtOffset(view, int(cqOff.Tail)) overflowPtr := atomicUint32AtOffset(view, int(cqOff.Overflow)) // Load the pointers once, so we work with a stable value. Particularly, // userspace can update the SQ tail at any time. sqHead := sqHeadPtr.Load() sqTail := sqTailPtr.Load() // Is the submission queue is empty? if sqHead == sqTail { return int(submitted), nil } // We have at least one pending sqe, unmarshal the first from the // submission queue. if fetchSQA { sqaView, err = fd.sqesBuf.view(sqArraySize) if err != nil { return -1, err } } sqaOff := int(sqHead&fd.ioRings.SqRingMask) * sqe.SizeBytes() sqe.UnmarshalUnsafe(sqaView[sqaOff : sqaOff+sqe.SizeBytes()]) fetchSQA = fd.sqesBuf.drop() // Dispatch request from unmarshalled entry. cqe := fd.ProcessSubmission(t, &sqe, flags) // Advance sq head. sqHeadPtr.Add(1) // Load once so we have stable values. Particularly, userspace can // update the CQ head at any time. cqHead := cqHeadPtr.Load() cqTail := cqTailPtr.Load() // Marshal response to completion queue. if (cqTail - cqHead) >= fd.ioRings.CqRingEntries { // CQ ring full. fd.ioRings.CqOverflow++ overflowPtr.Store(fd.ioRings.CqOverflow) } else { // Have room in CQ, marshal CQE. if fetchCQA { cqaView, err = fd.cqesBuf.view(cqArraySize) if err != nil { return -1, err } } cqaOff := int(cqTail&fd.ioRings.CqRingMask) * cqe.SizeBytes() cqe.MarshalUnsafe(cqaView[cqaOff : cqaOff+cqe.SizeBytes()]) fetchCQA, err = fd.cqesBuf.writebackWindow(cqaOff, cqe.SizeBytes()) if err != nil { return -1, err } // Advance cq tail. cqTailPtr.Add(1) } fetchRB, err = fd.ioRingsBuf.writeback(fd.ioRings.SizeBytes()) if err != nil { return -1, err } submitted++ } return int(submitted), nil } // ProcessSubmission processes a single submission request. func (fd *FileDescription) ProcessSubmission(t *kernel.Task, sqe *linux.IOUringSqe, flags uint32) *linux.IOUringCqe { var ( cqeErr error cqeFlags uint32 retValue int32 ) switch op := sqe.Opcode; op { case linux.IORING_OP_NOP: // For the NOP operation, we don't do anything special. 
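// The CQE returned below still carries Res == 0 and the submitter's
// UserData; ProcessSubmissions then writes it to the completion queue as
// usual.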
case linux.IORING_OP_READV: retValue, cqeErr = fd.handleReadv(t, sqe, flags) if cqeErr == io.EOF { // Don't raise EOF as errno, error translation will fail. Short // reads aren't failures. cqeErr = nil } default: // Unsupported operation retValue = -int32(linuxerr.EINVAL.Errno()) } if cqeErr != nil { retValue = -int32(kernel.ExtractErrno(cqeErr, -1)) } return &linux.IOUringCqe{ UserData: sqe.UserData, Res: retValue, Flags: cqeFlags, } } // handleReadv handles IORING_OP_READV. func (fd *FileDescription) handleReadv(t *kernel.Task, sqe *linux.IOUringSqe, flags uint32) (int32, error) { // Check that a file descriptor is valid. if sqe.Fd < 0 { return 0, linuxerr.EBADF } // Currently we don't support any flags for the SQEs. if sqe.Flags != 0 { return 0, linuxerr.EINVAL } // If the file is not seekable then offset must be zero. And currently, we don't support them. if sqe.OffOrAddrOrCmdOp != 0 { return 0, linuxerr.EINVAL } // ioprio should not be set for the READV operation. if sqe.IoPrio != 0 { return 0, linuxerr.EINVAL } // AddressSpaceActive is set to true as we are doing this from the task goroutine.And this is a // case as we currently don't support neither IOPOLL nor SQPOLL modes. dst, err := t.IovecsIOSequence(hostarch.Addr(sqe.AddrOrSpliceOff), int(sqe.Len), usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, err } file := t.GetFile(sqe.Fd) if file == nil { return 0, linuxerr.EBADF } defer file.DecRef(t) n, err := file.PRead(t, dst, 0, vfs.ReadOptions{}) if err != nil { return 0, err } return int32(n), nil } // updateCq updates a completion queue by adding a given completion queue entry. func (fd *FileDescription) updateCq(cqes *safemem.BlockSeq, cqe *linux.IOUringCqe, cqTail uint32) error { cqeSize := uint32((*linux.IOUringCqe)(nil).SizeBytes()) if cqes.NumBlocks() == 1 && !cqes.Head().NeedSafecopy() { cqe.MarshalBytes(cqes.Head().ToSlice()[cqTail*cqeSize : (cqTail+1)*cqeSize]) return nil } buf := make([]byte, cqes.NumBytes()) cqe.MarshalBytes(buf) cp, cperr := safemem.CopySeq(cqes.DropFirst64(uint64(cqTail*cqeSize)), safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))) if cp == 0 { return cperr } return nil } // sqEntriesFile implements memmap.Mappable for SQ entries. // // +stateify savable type sqEntriesFile struct { fr memmap.FileRange } // AddMapping implements memmap.Mappable.AddMapping. func (sqemf *sqEntriesFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (sqemf *sqEntriesFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. func (sqemf *sqEntriesFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return nil } // Translate implements memmap.Mappable.Translate. 
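//
// Translations map the requested range directly onto the MemoryFile range
// reserved for the SQEs at setup time, so userspace mappings and the
// sentry's internal mapping observe the same pages.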
func (sqemf *sqEntriesFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { if required.End > sqemf.fr.Length() { return nil, &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, sqemf.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ { Source: source, File: pgalloc.MemoryFileFromContext(ctx), Offset: sqemf.fr.Start + source.Start, Perms: at, }, }, nil } return nil, linuxerr.EFAULT } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (sqemf *sqEntriesFile) InvalidateUnsavable(ctx context.Context) error { return nil } // ringBuffersFile implements memmap.Mappable for SQ and CQ ring buffers. // // +stateify savable type ringsBufferFile struct { fr memmap.FileRange } // AddMapping implements memmap.Mappable.AddMapping. func (rbmf *ringsBufferFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (rbmf *ringsBufferFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. func (rbmf *ringsBufferFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return nil } // Translate implements memmap.Mappable.Translate. func (rbmf *ringsBufferFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { if required.End > rbmf.fr.Length() { return nil, &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, rbmf.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ { Source: source, File: pgalloc.MemoryFileFromContext(ctx), Offset: rbmf.fr.Start + source.Start, Perms: at, }, }, nil } return nil, linuxerr.EFAULT } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (rbmf *ringsBufferFile) InvalidateUnsavable(ctx context.Context) error { return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/iouringfs/iouringfs_state.go000066400000000000000000000021461465435605700300060ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package iouringfs import ( "context" "gvisor.dev/gvisor/pkg/sentry/pgalloc" ) // beforeSave is invoked by stateify. func (fd *FileDescription) beforeSave() { if fd.running.Load() != 0 { panic("Task goroutine in fd.ProcessSubmissions during Save! This shouldn't be possible due to Kernel.Pause") } } // afterLoad is invoked by stateify. func (fd *FileDescription) afterLoad(ctx context.Context) { fd.mf = pgalloc.MemoryFileFromContext(ctx) // Remap shared buffers. 
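// Internal mappings are not preserved across save/restore, so just flag
// the FD here; the next ProcessSubmissions call re-establishes the shared
// buffer mappings inside its critical section.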
fd.remap = true fd.runC = make(chan struct{}, 1) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/iouringfs/iouringfs_state_autogen.go000066400000000000000000000054471465435605700315370ustar00rootroot00000000000000// automatically generated by stateify. package iouringfs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (fd *FileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/iouringfs.FileDescription" } func (fd *FileDescription) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", "rbmf", "sqemf", "running", "ioRings", "remap", } } // +checklocksignore func (fd *FileDescription) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &fd.NoLockFD) stateSinkObject.Save(4, &fd.rbmf) stateSinkObject.Save(5, &fd.sqemf) stateSinkObject.Save(6, &fd.running) stateSinkObject.Save(7, &fd.ioRings) stateSinkObject.Save(8, &fd.remap) } // +checklocksignore func (fd *FileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &fd.NoLockFD) stateSourceObject.Load(4, &fd.rbmf) stateSourceObject.Load(5, &fd.sqemf) stateSourceObject.Load(6, &fd.running) stateSourceObject.Load(7, &fd.ioRings) stateSourceObject.Load(8, &fd.remap) stateSourceObject.AfterLoad(func() { fd.afterLoad(ctx) }) } func (sqemf *sqEntriesFile) StateTypeName() string { return "pkg/sentry/fsimpl/iouringfs.sqEntriesFile" } func (sqemf *sqEntriesFile) StateFields() []string { return []string{ "fr", } } func (sqemf *sqEntriesFile) beforeSave() {} // +checklocksignore func (sqemf *sqEntriesFile) StateSave(stateSinkObject state.Sink) { sqemf.beforeSave() stateSinkObject.Save(0, &sqemf.fr) } func (sqemf *sqEntriesFile) afterLoad(context.Context) {} // +checklocksignore func (sqemf *sqEntriesFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &sqemf.fr) } func (rbmf *ringsBufferFile) StateTypeName() string { return "pkg/sentry/fsimpl/iouringfs.ringsBufferFile" } func (rbmf *ringsBufferFile) StateFields() []string { return []string{ "fr", } } func (rbmf *ringsBufferFile) beforeSave() {} // +checklocksignore func (rbmf *ringsBufferFile) StateSave(stateSinkObject state.Sink) { rbmf.beforeSave() stateSinkObject.Save(0, &rbmf.fr) } func (rbmf *ringsBufferFile) afterLoad(context.Context) {} // +checklocksignore func (rbmf *ringsBufferFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rbmf.fr) } func init() { state.Register((*FileDescription)(nil)) state.Register((*sqEntriesFile)(nil)) state.Register((*ringsBufferFile)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/iouringfs/iouringfs_unsafe.go000066400000000000000000000021421465435605700301430ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package iouringfs import ( "fmt" "unsafe" "gvisor.dev/gvisor/pkg/atomicbitops" ) func atomicUint32AtOffset(buf []byte, offset int) *atomicbitops.Uint32 { const sizeOfUint32 int = 4 if offset+sizeOfUint32 > len(buf) || offset < 0 { panic(fmt.Sprintf("cast at offset %d for slice of len %d would result in overrun", offset, len(buf))) } if offset%sizeOfUint32 != 0 { panic(fmt.Sprintf("cast at offset %d would produce unaligned pointer", offset)) } return (*atomicbitops.Uint32)(unsafe.Pointer(&buf[offset])) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/iouringfs/iouringfs_unsafe_state_autogen.go000066400000000000000000000000731465435605700330660ustar00rootroot00000000000000// automatically generated by stateify. package iouringfs golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/000077500000000000000000000000001465435605700235325ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/deferred_dec_refs_mutex.go000066400000000000000000000034131465435605700307160ustar00rootroot00000000000000package kernfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type deferredDecRefsMutex struct { mu sync.Mutex } var deferredDecRefsprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var deferredDecRefslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type deferredDecRefslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *deferredDecRefsMutex) Lock() { locking.AddGLock(deferredDecRefsprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *deferredDecRefsMutex) NestedLock(i deferredDecRefslockNameIndex) { locking.AddGLock(deferredDecRefsprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *deferredDecRefsMutex) Unlock() { locking.DelGLock(deferredDecRefsprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *deferredDecRefsMutex) NestedUnlock(i deferredDecRefslockNameIndex) { locking.DelGLock(deferredDecRefsprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func deferredDecRefsinitLockNames() {} func init() { deferredDecRefsinitLockNames() deferredDecRefsprefixIndex = locking.NewMutexClass(reflect.TypeOf(deferredDecRefsMutex{}), deferredDecRefslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/dentry_list.go000066400000000000000000000120041465435605700264160ustar00rootroot00000000000000package kernfs // ElementMapper provides an identity mapping by default. 
// // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type dentryElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (dentryElementMapper) linkerFor(elem *Dentry) *Dentry { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type dentryList struct { head *Dentry tail *Dentry } // Reset resets list l to the empty state. func (l *dentryList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *dentryList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *dentryList) Front() *Dentry { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *dentryList) Back() *Dentry { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *dentryList) Len() (count int) { for e := l.Front(); e != nil; e = (dentryElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *dentryList) PushFront(e *Dentry) { linker := dentryElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { dentryElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *dentryList) PushFrontList(m *dentryList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { dentryElementMapper{}.linkerFor(l.head).SetPrev(m.tail) dentryElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *dentryList) PushBack(e *Dentry) { linker := dentryElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { dentryElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *dentryList) PushBackList(m *dentryList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { dentryElementMapper{}.linkerFor(l.tail).SetNext(m.head) dentryElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *dentryList) InsertAfter(b, e *Dentry) { bLinker := dentryElementMapper{}.linkerFor(b) eLinker := dentryElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { dentryElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. 
// //go:nosplit func (l *dentryList) InsertBefore(a, e *Dentry) { aLinker := dentryElementMapper{}.linkerFor(a) eLinker := dentryElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { dentryElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *dentryList) Remove(e *Dentry) { linker := dentryElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { dentryElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { dentryElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type dentryEntry struct { next *Dentry prev *Dentry } // Next returns the entry that follows e in the list. // //go:nosplit func (e *dentryEntry) Next() *Dentry { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *dentryEntry) Prev() *Dentry { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *dentryEntry) SetNext(elem *Dentry) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *dentryEntry) SetPrev(elem *Dentry) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go000066400000000000000000000126231465435605700277160ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernfs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) // DynamicBytesFile implements kernfs.Inode and represents a read-only file // whose contents are backed by a vfs.DynamicBytesSource. If data additionally // implements vfs.WritableDynamicBytesSource, the file also supports dispatching // writes to the implementer, but note that this will not update the source data. // // Must be instantiated with NewDynamicBytesFile or initialized with Init // before first use. // // +stateify savable type DynamicBytesFile struct { InodeAttrs InodeNoStatFS InodeNoopRefCount InodeNotAnonymous InodeNotDirectory InodeNotSymlink InodeWatches locks vfs.FileLocks // data can additionally implement vfs.WritableDynamicBytesSource to support // writes. This field cannot be changed to a different bytes source after // Init. data vfs.DynamicBytesSource } var _ Inode = (*DynamicBytesFile)(nil) // Init initializes a dynamic bytes file. 
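//
// A rough usage sketch (versionData is a hypothetical source; ctx, creds,
// devMajor, devMinor and ino are assumed to be in scope):
//
//	// versionData implements vfs.DynamicBytesSource.
//	type versionData struct{}
//
//	func (*versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
//		_, err := buf.WriteString("4.4.0\n")
//		return err
//	}
//
//	// Elsewhere, while building the filesystem tree:
//	f := &DynamicBytesFile{}
//	f.Init(ctx, creds, devMajor, devMinor, ino, &versionData{}, 0444)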
func (f *DynamicBytesFile) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) f.data = data } // Open implements Inode.Open. func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} if err := fd.Init(rp.Mount(), d, f.data, &f.locks, opts.Flags); err != nil { return nil, err } return &fd.vfsfd, nil } // SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow // inode attributes to be changed. Override SetStat() making it call // f.InodeAttrs to allow it. func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // Locks returns the file locks for this file. func (f *DynamicBytesFile) Locks() *vfs.FileLocks { return &f.locks } // Data returns the underlying data source. func (f *DynamicBytesFile) Data() vfs.DynamicBytesSource { return f.data } // DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a // DynamicBytesFile. // // Must be initialized with Init before first use. // // +stateify savable type DynamicBytesFD struct { vfs.FileDescriptionDefaultImpl vfs.DynamicBytesFileDescriptionImpl vfs.LockFD vfsfd vfs.FileDescription inode Inode } // Init initializes a DynamicBytesFD. func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { fd.LockFD.Init(locks) if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{ DenySpliceIn: true, }, ); err != nil { return err } fd.inode = d.inode fd.DynamicBytesFileDescriptionImpl.Init(&fd.vfsfd, data) return nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence) } // Read implements vfs.FileDescriptionImpl.Read. func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts) } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts) } // Write implements vfs.FileDescriptionImpl.Write. func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.Write(ctx, src, opts) } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.PWrite(ctx, src, offset, opts) } // Release implements vfs.FileDescriptionImpl.Release. func (fd *DynamicBytesFD) Release(context.Context) {} // Stat implements vfs.FileDescriptionImpl.Stat. 
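//
// The request is forwarded to the backing kernfs inode, so the result
// reflects the inode's current attributes rather than any per-FD state.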
func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() return fd.inode.Stat(ctx, fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { // DynamicBytesFiles are immutable. return linuxerr.EPERM } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/fd_impl_util.go000066400000000000000000000204361465435605700265350ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernfs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // SeekEndConfig describes the SEEK_END behaviour for FDs. // // +stateify savable type SeekEndConfig int // Constants related to SEEK_END behaviour for FDs. const ( // Consider the end of the file to be after the final static entry. This is // the default option. SeekEndStaticEntries = iota // Consider the end of the file to be at offset 0. SeekEndZero ) // GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD. // // +stateify savable type GenericDirectoryFDOptions struct { SeekEnd SeekEndConfig } // GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory // inode that uses OrderChildren to track child nodes. // // Note that GenericDirectoryFD holds a lock over OrderedChildren while calling // IterDirents callback. The IterDirents callback therefore cannot hash or // unhash children, or recursively call IterDirents on the same underlying // inode. // // Must be initialize with Init before first use. // // Lock ordering: mu => children.mu. // // +stateify savable type GenericDirectoryFD struct { vfs.FileDescriptionDefaultImpl vfs.DirectoryFileDescriptionDefaultImpl vfs.LockFD // Immutable. seekEnd SeekEndConfig vfsfd vfs.FileDescription children *OrderedChildren // mu protects the fields below. mu sync.Mutex `state:"nosave"` // off is the current directory offset. Protected by "mu". off int64 } // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its // dentry. func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) { fd := &GenericDirectoryFD{} if err := fd.Init(children, locks, opts, fdOpts); err != nil { return nil, err } if err := fd.vfsfd.Init(fd, opts.Flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return fd, nil } // Init initializes a GenericDirectoryFD. Use it when overriding // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the // correct implementation. 
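//
// In other words, a wrapper FD type embeds GenericDirectoryFD, calls this
// Init with its children, locks and options, and then calls
// VFSFileDescription().Init with the outer type as the implementation so
// that VFS dispatches to the overriding methods.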
func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return linuxerr.EISDIR } fd.LockFD.Init(locks) fd.seekEnd = fdOpts.SeekEnd fd.children = children return nil } // VFSFileDescription returns a pointer to the vfs.FileDescription representing // this object. func (fd *GenericDirectoryFD) VFSFileDescription() *vfs.FileDescription { return &fd.vfsfd } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *GenericDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return fd.FileDescriptionDefaultImpl.ConfigureMMap(ctx, opts) } // Read implements vfs.FileDescriptionImpl.Read. func (fd *GenericDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { return fd.DirectoryFileDescriptionDefaultImpl.Read(ctx, dst, opts) } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *GenericDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return fd.DirectoryFileDescriptionDefaultImpl.PRead(ctx, dst, offset, opts) } // Write implements vfs.FileDescriptionImpl.Write. func (fd *GenericDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { return fd.DirectoryFileDescriptionDefaultImpl.Write(ctx, src, opts) } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts) } // Release implements vfs.FileDescriptionImpl.Release. func (fd *GenericDirectoryFD) Release(context.Context) {} func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem { return fd.vfsfd.VirtualDentry().Mount().Filesystem() } func (fd *GenericDirectoryFD) dentry() *Dentry { return fd.vfsfd.Dentry().Impl().(*Dentry) } func (fd *GenericDirectoryFD) inode() Inode { return fd.dentry().inode } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents holds // o.mu when calling cb. func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { fd.mu.Lock() defer fd.mu.Unlock() opts := vfs.StatOptions{Mask: linux.STATX_INO} // Handle ".". if fd.off == 0 { stat, err := fd.inode().Stat(ctx, fd.filesystem(), opts) if err != nil { return err } dirent := vfs.Dirent{ Name: ".", Type: linux.DT_DIR, Ino: stat.Ino, NextOff: 1, } if err := cb.Handle(dirent); err != nil { return err } fd.off++ } // Handle "..". if fd.off == 1 { parentInode := genericParentOrSelf(fd.dentry()).inode stat, err := parentInode.Stat(ctx, fd.filesystem(), opts) if err != nil { return err } dirent := vfs.Dirent{ Name: "..", Type: linux.FileMode(stat.Mode).DirentType(), Ino: stat.Ino, NextOff: 2, } if err := cb.Handle(dirent); err != nil { return err } fd.off++ } // Handle static children. fd.children.mu.RLock() defer fd.children.mu.RUnlock() // fd.off accounts for "." and "..", but fd.children do not track // these. 
childIdx := fd.off - 2 for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { stat, err := it.inode.Stat(ctx, fd.filesystem(), opts) if err != nil { return err } dirent := vfs.Dirent{ Name: it.name, Type: linux.FileMode(stat.Mode).DirentType(), Ino: stat.Ino, NextOff: fd.off + 1, } if err := cb.Handle(dirent); err != nil { return err } fd.off++ } var err error relOffset := fd.off - int64(len(fd.children.set)) - 2 fd.off, err = fd.inode().IterDirents(ctx, fd.vfsfd.Mount(), cb, fd.off, relOffset) return err } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() switch whence { case linux.SEEK_SET: // Use offset as given. case linux.SEEK_CUR: offset += fd.off case linux.SEEK_END: switch fd.seekEnd { case SeekEndStaticEntries: fd.children.mu.RLock() offset += int64(len(fd.children.set)) offset += 2 // '.' and '..' aren't tracked in children. fd.children.mu.RUnlock() case SeekEndZero: // No-op: offset += 0. default: panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd)) } default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } fd.off = offset return offset, nil } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.filesystem() inode := fd.inode() return inode.Stat(ctx, fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) return fd.inode().SetStat(ctx, fd.filesystem(), creds, opts) } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/filesystem.go000066400000000000000000000775401465435605700262620ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernfs // This file implements vfs.FilesystemImpl for kernfs. import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // stepExistingLocked resolves rp.Component() in parent directory vfsd. // // stepExistingLocked is loosely analogous to fs/namei.c:walk_component(). // // Preconditions: // - Filesystem.mu must be locked for at least reading. // - !rp.Done(). // // Postcondition: Caller must call fs.processDeferredDecRefs*. 
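//
// The boolean result reports whether a symlink (or a Getlink jump) was
// followed at this component; in that case the returned dentry is still
// the current directory and rp has been updated so that resolution
// continues at the link target.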
func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) (*Dentry, bool, error) { if !d.isDir() { return nil, false, linuxerr.ENOTDIR } // Directory searchable? if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, false, err } name := rp.Component() // Revalidation must be skipped if name is "." or ".."; d or its parent // respectively can't be expected to transition from invalidated back to // valid, so detecting invalidation and retrying would loop forever. This // is consistent with Linux: fs/namei.c:walk_component() => lookup_fast() // calls d_revalidate(), but walk_component() => handle_dots() does not. if name == "." { rp.Advance() return d, false, nil } if name == ".." { if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil { return nil, false, err } else if isRoot || d.parent.Load() == nil { rp.Advance() return d, false, nil } if err := rp.CheckMount(ctx, d.Parent().VFSDentry()); err != nil { return nil, false, err } rp.Advance() return d.parent.Load(), false, nil } if len(name) > linux.NAME_MAX { return nil, false, linuxerr.ENAMETOOLONG } next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name) if err != nil { return nil, false, err } if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil { return nil, false, err } // Resolve any symlink at current path component. if rp.ShouldFollowSymlink() && next.isSymlink() { targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount()) if err != nil { return nil, false, err } if targetVD.Ok() { followedTarget, err := rp.HandleJump(targetVD) fs.deferDecRefVD(ctx, targetVD) return d, followedTarget, err } followedSymlink, err := rp.HandleSymlink(targetPathname) return d, followedSymlink, err } rp.Advance() return next, false, nil } // revalidateChildLocked is called to look up the child of parent named name, // while verifying that any cached lookups are still correct. // // Preconditions: // - Filesystem.mu must be locked for at least reading. // - parent.isDir(). // - name is not "." or "..". // // Postconditions: Caller must call fs.processDeferredDecRefs*. func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string) (*Dentry, error) { parent.dirMu.Lock() defer parent.dirMu.Unlock() // may be temporarily unlocked and re-locked below child := parent.children[name] for child != nil { // Cached dentry exists, revalidate. if child.inode.Valid(ctx, parent, name) { break } delete(parent.children, child.name) parent.dirMu.Unlock() fs.invalidateRemovedChildLocked(ctx, vfsObj, child) parent.dirMu.Lock() // Check for concurrent insertion of a new cached dentry. child = parent.children[name] } if child == nil { // Dentry isn't cached; it either doesn't exist or failed revalidation. // Attempt to resolve it via Lookup. childInode, err := parent.inode.Lookup(ctx, name) if err != nil { return nil, err } var newChild Dentry newChild.Init(fs, childInode) // childInode's ref is transferred to newChild. parent.insertChildLocked(name, &newChild) child = &newChild // Drop the ref on newChild. This will cause the dentry to get pruned // from the dentry tree by the end of current filesystem operation // (before returning to the VFS layer) if another ref is not picked on // this dentry. if !childInode.Keep() { fs.deferDecRef(&newChild) } } return child, nil } // Preconditions: // - Filesystem.mu must be locked for at least reading. // - d has been removed from its parent.children. 
// // Postconditions: Caller must call fs.processDeferredDecRefs*. func (fs *Filesystem) invalidateRemovedChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, d *Dentry) { toInvalidate := []*Dentry{d} for len(toInvalidate) != 0 { d := toInvalidate[len(toInvalidate)-1] toInvalidate = toInvalidate[:len(toInvalidate)-1] if d.inode.Keep() { fs.deferDecRef(d) } rcs := vfsObj.InvalidateDentry(ctx, d.VFSDentry()) for _, rc := range rcs { fs.deferDecRef(rc) } if d.isDir() { d.dirMu.Lock() for name, child := range d.children { toInvalidate = append(toInvalidate, child) delete(d.children, name) } d.dirMu.Unlock() } } } // walkExistingLocked resolves rp to an existing file. // // walkExistingLocked is loosely analogous to Linux's // fs/namei.c:path_lookupat(). // // Preconditions: Filesystem.mu must be locked for at least reading. // // Postconditions: Caller must call fs.processDeferredDecRefs*. func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) { d := rp.Start().Impl().(*Dentry) for !rp.Done() { var err error d, _, err = fs.stepExistingLocked(ctx, rp, d) if err != nil { return nil, err } } if rp.MustBeDir() && !d.isDir() { return nil, linuxerr.ENOTDIR } return d, nil } // walkParentDirLocked resolves all but the last path component of rp to an // existing directory. It does not check that the returned directory is // searchable by the provider of rp. // // walkParentDirLocked is loosely analogous to Linux's // fs/namei.c:path_parentat(). // // Preconditions: // - Filesystem.mu must be locked for at least reading. // - !rp.Done(). // // Postconditions: Caller must call fs.processDeferredDecRefs*. func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) (*Dentry, error) { for !rp.Final() { var err error d, _, err = fs.stepExistingLocked(ctx, rp, d) if err != nil { return nil, err } } if !d.isDir() { return nil, linuxerr.ENOTDIR } return d, nil } // checkCreateLocked checks that a file named rp.Component() may be created in // directory parent, then returns rp.Component(). // // Preconditions: // - Filesystem.mu must be locked for at least reading. // - isDir(parentInode) == true. func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string, parent *Dentry) error { // Order of checks is important. First check if parent directory can be // executed, then check for existence, and lastly check if mount is writable. if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayExec); err != nil { return err } if name == "." || name == ".." { return linuxerr.EEXIST } if len(name) > linux.NAME_MAX { return linuxerr.ENAMETOOLONG } if _, ok := parent.children[name]; ok { return linuxerr.EEXIST } if parent.VFSDentry().IsDead() { return linuxerr.ENOENT } if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite); err != nil { return err } return nil } // checkDeleteLocked checks that the file represented by vfsd may be deleted. // // Preconditions: Filesystem.mu must be locked for at least reading. func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error { parent := d.parent.Load() if parent == nil { return linuxerr.EBUSY } if parent.vfsd.IsDead() { return linuxerr.ENOENT } if d.vfsd.IsDead() { // This implies a duplicate unlink on an orphaned dentry, where the path // resolution was successful. 
This is possible when the orphan is // replaced by a new node of the same name (so the path resolution // succeeds), and the orphan is unlinked again through a dirfd using // unlinkat(2) (so the unlink refers to the orphan and not the new // node). See Linux, fs/namei.c:do_rmdir(). return linuxerr.EINVAL } if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } return nil } // Release implements vfs.FilesystemImpl.Release. func (fs *Filesystem) Release(ctx context.Context) { root := fs.root if root == nil { return } fs.mu.Lock() root.releaseKeptDentriesLocked(ctx) for fs.cachedDentriesLen != 0 { fs.evictCachedDentryLocked(ctx) } fs.mu.Unlock() // Drop ref acquired in Dentry.InitRoot(). root.DecRef(ctx) } // releaseKeptDentriesLocked recursively drops all dentry references created by // Lookup when Dentry.inode.Keep() is true. // // Precondition: Filesystem.mu is held. func (d *Dentry) releaseKeptDentriesLocked(ctx context.Context) { if d.inode.Keep() && d != d.fs.root { d.decRefLocked(ctx) } if d.isDir() { var children []*Dentry d.dirMu.Lock() for _, child := range d.children { children = append(children, child) } d.dirMu.Unlock() for _, child := range children { child.releaseKeptDentriesLocked(ctx) } } } // Sync implements vfs.FilesystemImpl.Sync. func (fs *Filesystem) Sync(ctx context.Context) error { // All filesystem state is in-memory. return nil } // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } if err := d.inode.CheckPermissions(ctx, creds, ats); err != nil { return err } if ats.MayWrite() && rp.Mount().ReadOnly() { return linuxerr.EROFS } return nil } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } if opts.CheckSearchable { if !d.isDir() { return nil, linuxerr.ENOTDIR } if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } vfsd := d.VFSDentry() vfsd.IncRef() // Ownership transferred to caller. return vfsd, nil } // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() d, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) if err != nil { return nil, err } d.IncRef() // Ownership transferred to caller. return d.VFSDentry(), nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. 
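//
// As on Linux, hard links to directories are refused with EPERM and links
// across mounts with EXDEV.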
func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { if rp.Done() { return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) if err != nil { return err } if rp.Mount() != vd.Mount() { return linuxerr.EXDEV } inode := vd.Dentry().Impl().(*Dentry).Inode() if inode.Mode().IsDir() { return linuxerr.EPERM } if err := vfs.MayLink(rp.Credentials(), inode.Mode(), inode.UID(), inode.GID()); err != nil { return err } parent.dirMu.Lock() defer parent.dirMu.Unlock() pc := rp.Component() if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil { return err } if rp.MustBeDir() { return linuxerr.ENOENT } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() childI, err := parent.inode.NewLink(ctx, pc, inode) if err != nil { return err } parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.InodeEvent, false /* unlinked */) inode.Watches().Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) var child Dentry child.Init(fs, childI) parent.insertChildLocked(pc, &child) return nil } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { if rp.Done() { return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) if err != nil { return err } parent.dirMu.Lock() defer parent.dirMu.Unlock() pc := rp.Component() if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() childI, err := parent.inode.NewDir(ctx, pc, opts) if err != nil { if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) { return err } childI = newSyntheticDirectory(ctx, rp.Credentials(), opts.Mode) } var child Dentry child.Init(fs, childI) parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE|linux.IN_ISDIR, 0, vfs.InodeEvent, false /* unlinked */) parent.insertChildLocked(pc, &child) return nil } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { if rp.Done() { return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) if err != nil { return err } parent.dirMu.Lock() defer parent.dirMu.Unlock() pc := rp.Component() if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil { return err } if rp.MustBeDir() { return linuxerr.ENOENT } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() newI, err := parent.inode.NewNode(ctx, pc, opts) if err != nil { return err } parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.InodeEvent, false /* unlinked */) var newD Dentry newD.Init(fs, newI) parent.insertChildLocked(pc, &newD) return nil } // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(&opts) // Do not create new file. 
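// Without O_CREAT we only need fs.mu for reading; the dentry is pinned
// with IncRef and fs.mu released before calling the inode's Open, since
// Open may block. The O_CREAT path below takes fs.mu for writing instead.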
if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) d, err := fs.walkExistingLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return nil, err } if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { fs.mu.RUnlock() return nil, err } // Open may block so we need to unlock fs.mu. IncRef d to prevent // its destruction while fs.mu is unlocked. d.IncRef() fs.mu.RUnlock() fd, err := d.inode.Open(ctx, rp, d, opts) d.DecRef(ctx) return fd, err } // May create new file. mustCreate := opts.Flags&linux.O_EXCL != 0 start := rp.Start().Impl().(*Dentry) fs.mu.Lock() unlocked := false unlock := func() { if !unlocked { fs.mu.Unlock() unlocked = true } } // Process all to-be-decref'd dentries at the end at once. // Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked // when this is executed. defer fs.processDeferredDecRefs(ctx) defer unlock() if rp.Done() { if rp.MustBeDir() { return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST } if err := start.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } // Open may block so we need to unlock fs.mu. IncRef d to prevent // its destruction while fs.mu is unlocked. start.IncRef() unlock() fd, err := start.inode.Open(ctx, rp, start, opts) start.DecRef(ctx) return fd, err } afterTrailingSymlink: parent, err := fs.walkParentDirLocked(ctx, rp, start) if err != nil { return nil, err } // Check for search permission in the parent directory. if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { return nil, linuxerr.EISDIR } pc := rp.Component() if pc == "." || pc == ".." { return nil, linuxerr.EISDIR } if len(pc) > linux.NAME_MAX { return nil, linuxerr.ENAMETOOLONG } if parent.VFSDentry().IsDead() { return nil, linuxerr.ENOENT } // Determine whether or not we need to create a file. child, followedSymlink, err := fs.stepExistingLocked(ctx, rp, parent) if followedSymlink { if mustCreate { // EEXIST must be returned if an existing symlink is opened with O_EXCL. return nil, linuxerr.EEXIST } if err != nil { // If followedSymlink && err != nil, then this symlink resolution error // must be handled by the VFS layer. return nil, err } start = parent goto afterTrailingSymlink } if linuxerr.Equals(linuxerr.ENOENT, err) { // Already checked for searchability above; now check for writability. if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { return nil, err } defer rp.Mount().EndWrite() // Create and open the child. childI, err := parent.inode.NewFile(ctx, pc, opts) if err != nil { return nil, err } var child Dentry child.Init(fs, childI) parent.insertChild(pc, &child) // Open may block so we need to unlock fs.mu. IncRef child to prevent // its destruction while fs.mu is unlocked. child.IncRef() unlock() parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) fd, err := child.inode.Open(ctx, rp, &child, opts) child.DecRef(ctx) return fd, err } if err != nil { return nil, err } // Open existing file or follow symlink. if mustCreate { return nil, linuxerr.EEXIST } if rp.MustBeDir() && !child.isDir() { return nil, linuxerr.ENOTDIR } if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } if child.isDir() { // Can't open directories with O_CREAT. 
if opts.Flags&linux.O_CREAT != 0 { return nil, linuxerr.EISDIR } // Can't open directories writably. if ats&vfs.MayWrite != 0 { return nil, linuxerr.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL } } // Open may block so we need to unlock fs.mu. IncRef child to prevent // its destruction while fs.mu is unlocked. child.IncRef() unlock() fd, err := child.inode.Open(ctx, rp, child, opts) child.DecRef(ctx) return fd, err } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { defer fs.processDeferredDecRefs(ctx) fs.mu.RLock() d, err := fs.walkExistingLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return "", err } if !d.isSymlink() { fs.mu.RUnlock() return "", linuxerr.EINVAL } // Inode.Readlink() cannot be called holding fs locks. d.IncRef() defer d.DecRef(ctx) fs.mu.RUnlock() return d.inode.Readlink(ctx, rp.Mount()) } // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() // Resolve the destination directory first to verify that it's on this // Mount. dstDir, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) if err != nil { return err } // Only RENAME_NOREPLACE is supported. if opts.Flags&^linux.RENAME_NOREPLACE != 0 { return linuxerr.EINVAL } noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 mnt := rp.Mount() if mnt != oldParentVD.Mount() { return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() oldParentDir := oldParentVD.Dentry().Impl().(*Dentry).Inode() if err := oldParentDir.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } if err := dstDir.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } srcDirVFSD := oldParentVD.Dentry() srcDir := srcDirVFSD.Impl().(*Dentry) src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName) if err != nil { return err } // Can we remove the src dentry? if err := checkDeleteLocked(ctx, rp, src); err != nil { return err } // Can we create the dst dentry? var dst *Dentry newName := rp.Component() if newName == "." || newName == ".." { if noReplace { return linuxerr.EEXIST } return linuxerr.EBUSY } if len(newName) > linux.NAME_MAX { return linuxerr.ENAMETOOLONG } err = checkCreateLocked(ctx, rp.Credentials(), newName, dstDir) switch { case err == nil: // Ok, continue with rename as replacement. case linuxerr.Equals(linuxerr.EEXIST, err): if noReplace { // Won't overwrite existing node since RENAME_NOREPLACE was requested. return linuxerr.EEXIST } dst = dstDir.children[newName] if dst == nil { panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", newName, dstDir)) } default: return err } if srcDir == dstDir && oldName == newName { return nil } var dstVFSD *vfs.Dentry if dst != nil { dstVFSD = dst.VFSDentry() } mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) virtfs := rp.VirtualFilesystem() // We can't deadlock here due to lock ordering because we're protected from // concurrent renames by fs.mu held for writing. 
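// Concretely, the order below is srcDir.dirMu first, then dstDir.dirMu when the directories differ. Two renames moving entries between the same pair of directories in opposite directions would acquire these in opposite orders; that is safe only because each rename also holds fs.mu exclusively, so they can never run concurrently.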
srcDir.dirMu.Lock() defer srcDir.dirMu.Unlock() if srcDir != dstDir { dstDir.dirMu.Lock() defer dstDir.dirMu.Unlock() } srcVFSD := src.VFSDentry() if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { return err } err = srcDir.inode.Rename(ctx, src.name, newName, src.inode, dstDir.inode) if err != nil { virtfs.AbortRenameDentry(srcVFSD, dstVFSD) return err } delete(srcDir.children, src.name) if srcDir != dstDir { fs.deferDecRef(srcDir) // child (src) drops ref on old parent. dstDir.IncRef() // child (src) takes a ref on the new parent. } src.parent.Store(dstDir) src.name = newName if dstDir.children == nil { dstDir.children = make(map[string]*Dentry) } replaced := dstDir.children[newName] dstDir.children[newName] = src var replaceVFSD *vfs.Dentry if replaced != nil { // deferDecRef so that fs.mu and dstDir.mu are unlocked by then. fs.deferDecRef(replaced) replaceVFSD = replaced.VFSDentry() replaced.setDeleted() } vfs.InotifyRename(ctx, src.inode.Watches(), srcDir.inode.Watches(), dstDir.inode.Watches(), oldName, newName, src.isDir()) for _, rc := range virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) { // +checklocksforce: to may be nil, that's okay. fs.deferDecRef(rc) } return nil } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) if err != nil { return err } if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() name := rp.Component() if name == "." { return linuxerr.EINVAL } if name == ".." { return linuxerr.ENOTEMPTY } child, ok := parent.children[name] if !ok { return linuxerr.ENOENT } if err := checkDeleteLocked(ctx, rp, child); err != nil { return err } if err := vfs.CheckDeleteSticky( rp.Credentials(), linux.FileMode(parent.inode.Mode()), auth.KUID(parent.inode.UID()), auth.KUID(child.inode.UID()), auth.KGID(child.inode.GID()), ); err != nil { return err } if !child.isDir() { return linuxerr.ENOTDIR } if child.inode.HasChildren() { return linuxerr.ENOTEMPTY } virtfs := rp.VirtualFilesystem() parent.dirMu.Lock() defer parent.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) vfsd := child.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err // +checklocksforce: vfsd is not locked. } if err := parent.inode.RmDir(ctx, child.name, child.inode); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } delete(parent.children, child.name) parent.inode.Watches().Notify(ctx, child.name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. fs.deferDecRef(child) rcs := virtfs.CommitDeleteDentry(ctx, vfsd) for _, rc := range rcs { fs.deferDecRef(rc) } child.setDeleted() return nil } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. 
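// // After the inode update succeeds, an inotify event derived from the stat mask is generated via InotifyEventFromStatMask and delivered with InotifyWithParent (mode or ownership changes, for example, are reported as attribute events); note that the notification is sent only after fs.mu has been released.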
func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) d, err := fs.walkExistingLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err } if opts.Stat.Mask == 0 { fs.mu.RUnlock() return nil } err = d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) fs.mu.RUnlock() if err != nil { return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return linux.Statx{}, err } return d.inode.Stat(ctx, fs.VFSFilesystem(), opts) } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return linux.Statfs{}, err } return d.inode.StatFS(ctx, fs.VFSFilesystem()) } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { if rp.Done() { return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) if err != nil { return err } parent.dirMu.Lock() defer parent.dirMu.Unlock() pc := rp.Component() if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil { return err } if rp.MustBeDir() { return linuxerr.ENOENT } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() childI, err := parent.inode.NewSymlink(ctx, pc, target) if err != nil { return err } parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.InodeEvent, false /* unlinked */) var child Dentry child.Init(fs, childI) parent.insertChildLocked(pc, &child) return nil } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() if err := checkDeleteLocked(ctx, rp, d); err != nil { return err } if d.isDir() { return linuxerr.EISDIR } virtfs := rp.VirtualFilesystem() parentDentry := d.parent.Load() parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } delete(parentDentry.children, d.name) vfs.InotifyRemoveChild(ctx, d.inode.Watches(), parentDentry.inode.Watches(), d.name) // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. fs.deferDecRef(d) rcs := virtfs.CommitDeleteDentry(ctx, vfsd) for _, rc := range rcs { fs.deferDecRef(rc) } d.setDeleted() return nil } // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. 
func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } // kernfs currently does not support extended attributes. return nil, linuxerr.ENOTSUP } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return "", err } // kernfs currently does not support extended attributes. return "", linuxerr.ENOTSUP } // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } // kernfs currently does not support extended attributes. return linuxerr.ENOTSUP } // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } // kernfs currently does not support extended attributes. return linuxerr.ENOTSUP } // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b) } func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) { if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs { // The following is equivalent to vd.DecRef(ctx). This is needed // because if d belongs to this filesystem, we can not DecRef it right // away as we may be holding fs.mu. d.DecRef may acquire fs.mu. So we // defer the DecRef to when locks are dropped. vd.Mount().DecRef(ctx) fs.deferDecRef(d) } else { vd.DecRef(ctx) } } // IsDescendant implements vfs.FilesystemImpl.IsDescendant. func (fs *Filesystem) IsDescendant(vfsroot, vd vfs.VirtualDentry) bool { return genericIsDescendant(vfsroot.Dentry(), vd.Dentry().Impl().(*Dentry)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/filesystem_mutex.go000066400000000000000000000047011465435605700274710ustar00rootroot00000000000000package kernfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type filesystemRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. 
var filesystemlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type filesystemlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *filesystemRWMutex) Lock() { locking.AddGLock(filesystemprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *filesystemRWMutex) NestedLock(i filesystemlockNameIndex) { locking.AddGLock(filesystemprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *filesystemRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(filesystemprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *filesystemRWMutex) NestedUnlock(i filesystemlockNameIndex) { m.mu.Unlock() locking.DelGLock(filesystemprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *filesystemRWMutex) RLock() { locking.AddGLock(filesystemprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *filesystemRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(filesystemprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *filesystemRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *filesystemRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *filesystemRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var filesystemprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func filesysteminitLockNames() {} func init() { filesysteminitLockNames() filesystemprefixIndex = locking.NewMutexClass(reflect.TypeOf(filesystemRWMutex{}), filesystemlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/fstree.go000066400000000000000000000036711465435605700253600ustar00rootroot00000000000000package kernfs import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // We need to define an interface instead of using atomic.Pointer because // the Dentry type gets removed during code generation and the compiler // complains about the unused sync/atomic type. type genericatomicptr interface { Load() *Dentry } // IsAncestorDentry returns true if d is an ancestor of d2; that is, d is // either d2's parent or an ancestor of d2's parent. func genericIsAncestorDentry(d, d2 *Dentry) bool { for d2 != nil { parent := d2.parent.Load() if parent == d { return true } if parent == d2 { return false } d2 = parent } return false } // IsDescendant returns true if vd is a descendant of vfsroot or if vd and // vfsroot are the same dentry. func genericIsDescendant(vfsroot *vfs.Dentry, d *Dentry) bool { for d != nil && &d.vfsd != vfsroot { d = d.parent.Load() } return d != nil } // ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d. func genericParentOrSelf(d *Dentry) *Dentry { if parent := d.parent.Load(); parent != nil { return parent } return d } // PrependPath is a generic implementation of FilesystemImpl.PrependPath(). 
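// It walks d's parent chain, prepending each dentry's name to b, and stops at the VFS root, at the mount root, or at a parentless dentry. As an illustrative sketch with hypothetical names: for a chain root -> "sys" -> "kernel", the components are prepended as "kernel" then "sys", so the builder renders "sys/kernel"; callers such as Dentry.FSLocalPath add the leading separator themselves.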
func genericPrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath.Builder) error { for { if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { return vfs.PrependPathAtVFSRootError{} } if mnt != nil && &d.vfsd == mnt.Root() { return nil } parent := d.parent.Load() if parent == nil { return vfs.PrependPathAtNonMountRootError{} } b.PrependComponent(d.name) d = parent } } // DebugPathname returns a pathname to d relative to its filesystem root. // DebugPathname does not correspond to any Linux function; it's used to // generate dentry pathnames for debugging. func genericDebugPathname(d *Dentry) string { var b fspath.Builder _ = genericPrependPath(vfs.VirtualDentry{}, nil, d, &b) return b.String() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/inode_impl_util.go000066400000000000000000000616331465435605700272460ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernfs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // InodeNoopRefCount partially implements the Inode interface, specifically the // inodeRefs sub interface. InodeNoopRefCount implements a simple reference // count for inodes, performing no extra actions when references are obtained or // released. This is suitable for simple file inodes that don't reference any // resources. // // +stateify savable type InodeNoopRefCount struct { InodeTemporary } // IncRef implements Inode.IncRef. func (InodeNoopRefCount) IncRef() { } // DecRef implements Inode.DecRef. func (InodeNoopRefCount) DecRef(context.Context) { } // TryIncRef implements Inode.TryIncRef. func (InodeNoopRefCount) TryIncRef() bool { return true } // InodeDirectoryNoNewChildren partially implements the Inode interface. // InodeDirectoryNoNewChildren represents a directory inode which does not // support creation of new children. // // +stateify savable type InodeDirectoryNoNewChildren struct{} // NewFile implements Inode.NewFile. func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { return nil, linuxerr.EPERM } // NewDir implements Inode.NewDir. func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { return nil, linuxerr.EPERM } // NewLink implements Inode.NewLink. func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) { return nil, linuxerr.EPERM } // NewSymlink implements Inode.NewSymlink. func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) { return nil, linuxerr.EPERM } // NewNode implements Inode.NewNode. 
func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { return nil, linuxerr.EPERM } // InodeNotDirectory partially implements the Inode interface, specifically the // inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not // represent directories can embed this to provide no-op implementations for // directory-related functions. // // +stateify savable type InodeNotDirectory struct { InodeAlwaysValid } // HasChildren implements Inode.HasChildren. func (InodeNotDirectory) HasChildren() bool { return false } // NewFile implements Inode.NewFile. func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { panic("NewFile called on non-directory inode") } // NewDir implements Inode.NewDir. func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { panic("NewDir called on non-directory inode") } // NewLink implements Inode.NewLink. func (InodeNotDirectory) NewLink(context.Context, string, Inode) (Inode, error) { panic("NewLink called on non-directory inode") } // NewSymlink implements Inode.NewSymlink. func (InodeNotDirectory) NewSymlink(context.Context, string, string) (Inode, error) { panic("NewSymlink called on non-directory inode") } // NewNode implements Inode.NewNode. func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { panic("NewNode called on non-directory inode") } // Unlink implements Inode.Unlink. func (InodeNotDirectory) Unlink(context.Context, string, Inode) error { panic("Unlink called on non-directory inode") } // RmDir implements Inode.RmDir. func (InodeNotDirectory) RmDir(context.Context, string, Inode) error { panic("RmDir called on non-directory inode") } // Rename implements Inode.Rename. func (InodeNotDirectory) Rename(context.Context, string, string, Inode, Inode) error { panic("Rename called on non-directory inode") } // Lookup implements Inode.Lookup. func (InodeNotDirectory) Lookup(ctx context.Context, name string) (Inode, error) { panic("Lookup called on non-directory inode") } // IterDirents implements Inode.IterDirents. func (InodeNotDirectory) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { panic("IterDirents called on non-directory inode") } // InodeNotSymlink partially implements the Inode interface, specifically the // inodeSymlink sub interface. All inodes that are not symlinks may embed this // to return the appropriate errors from symlink-related functions. // // +stateify savable type InodeNotSymlink struct{} // Readlink implements Inode.Readlink. func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) { return "", linuxerr.EINVAL } // Getlink implements Inode.Getlink. func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) { return vfs.VirtualDentry{}, "", linuxerr.EINVAL } // InodeAttrs partially implements the Inode interface, specifically the // inodeMetadata sub interface. InodeAttrs provides functionality related to // inode attributes. // // Must be initialized by Init prior to first use. // // +stateify savable type InodeAttrs struct { devMajor uint32 devMinor uint32 ino atomicbitops.Uint64 mode atomicbitops.Uint32 uid atomicbitops.Uint32 gid atomicbitops.Uint32 nlink atomicbitops.Uint32 blockSize atomicbitops.Uint32 // Timestamps, all nsecs from the Unix epoch.
atime atomicbitops.Int64 mtime atomicbitops.Int64 ctime atomicbitops.Int64 } // Init initializes this InodeAttrs. func (a *InodeAttrs) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) { a.InitWithIDs(ctx, creds.EffectiveKUID, creds.EffectiveKGID, devMajor, devMinor, ino, mode) } // InitWithIDs initializes this InodeAttrs. func (a *InodeAttrs) InitWithIDs(ctx context.Context, uid auth.KUID, gid auth.KGID, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) { if mode.FileType() == 0 { panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode)) } nlink := uint32(1) if mode.FileType() == linux.ModeDirectory { nlink = 2 } a.devMajor = devMajor a.devMinor = devMinor a.ino.Store(ino) a.mode.Store(uint32(mode)) a.uid.Store(uint32(uid)) a.gid.Store(uint32(gid)) a.nlink.Store(nlink) a.blockSize.Store(hostarch.PageSize) now := ktime.NowFromContext(ctx).Nanoseconds() a.atime.Store(now) a.mtime.Store(now) a.ctime.Store(now) } // DevMajor returns the device major number. func (a *InodeAttrs) DevMajor() uint32 { return a.devMajor } // DevMinor returns the device minor number. func (a *InodeAttrs) DevMinor() uint32 { return a.devMinor } // Ino returns the inode id. func (a *InodeAttrs) Ino() uint64 { return a.ino.Load() } // UID implements Inode.UID. func (a *InodeAttrs) UID() auth.KUID { return auth.KUID(a.uid.Load()) } // GID implements Inode.GID. func (a *InodeAttrs) GID() auth.KGID { return auth.KGID(a.gid.Load()) } // Mode implements Inode.Mode. func (a *InodeAttrs) Mode() linux.FileMode { return linux.FileMode(a.mode.Load()) } // Links returns the link count. func (a *InodeAttrs) Links() uint32 { return a.nlink.Load() } // TouchAtime updates a.atime to the current time. func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) { if opts := mnt.Options(); opts.Flags.NoATime || opts.ReadOnly { return } if err := mnt.CheckBeginWrite(); err != nil { return } a.atime.Store(ktime.NowFromContext(ctx).Nanoseconds()) mnt.EndWrite() } // TouchCMtime updates a.{c/m}time to the current time. The caller should // synchronize calls to this so that ctime and mtime are updated to the same // value. func (a *InodeAttrs) TouchCMtime(ctx context.Context) { now := ktime.NowFromContext(ctx).Nanoseconds() a.mtime.Store(now) a.ctime.Store(now) } // Stat partially implements Inode.Stat. Note that this function doesn't provide // all the stat fields, and the embedder should consider extending the result // with filesystem-specific fields. func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME stat.DevMajor = a.devMajor stat.DevMinor = a.devMinor stat.Ino = a.ino.Load() stat.Mode = uint16(a.Mode()) stat.UID = a.uid.Load() stat.GID = a.gid.Load() stat.Nlink = a.nlink.Load() stat.Blksize = a.blockSize.Load() stat.Atime = linux.NsecToStatxTimestamp(a.atime.Load()) stat.Mtime = linux.NsecToStatxTimestamp(a.mtime.Load()) stat.Ctime = linux.NsecToStatxTimestamp(a.ctime.Load()) return stat, nil } // SetStat implements Inode.SetStat. func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask == 0 { return nil } // Note that not all fields are modifiable. 
For example, the file type and // inode numbers are immutable after node creation. Setting the size is often // allowed by kernfs files but does not do anything. If some other behavior is // needed, the embedder should consider extending SetStat. if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { return linuxerr.EPERM } if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() { return linuxerr.EISDIR } if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(a.uid.Load()), auth.KGID(a.gid.Load())); err != nil { return err } clearSID := false stat := opts.Stat if stat.Mask&linux.STATX_UID != 0 { a.uid.Store(stat.UID) clearSID = true } if stat.Mask&linux.STATX_GID != 0 { a.gid.Store(stat.GID) clearSID = true } if stat.Mask&linux.STATX_MODE != 0 { for { old := a.mode.Load() ft := old & linux.S_IFMT newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT)) if clearSID { newMode = vfs.ClearSUIDAndSGID(newMode) } if swapped := a.mode.CompareAndSwap(old, newMode); swapped { clearSID = false break } } } // We may have to clear the SUID/SGID bits, but didn't do so as part of // STATX_MODE. if clearSID { for { old := a.mode.Load() newMode := vfs.ClearSUIDAndSGID(old) if swapped := a.mode.CompareAndSwap(old, newMode); swapped { break } } } now := ktime.NowFromContext(ctx).Nanoseconds() if stat.Mask&linux.STATX_ATIME != 0 { if stat.Atime.Nsec == linux.UTIME_NOW { stat.Atime = linux.NsecToStatxTimestamp(now) } a.atime.Store(stat.Atime.ToNsec()) } if stat.Mask&linux.STATX_MTIME != 0 { if stat.Mtime.Nsec == linux.UTIME_NOW { stat.Mtime = linux.NsecToStatxTimestamp(now) } a.mtime.Store(stat.Mtime.ToNsec()) } return nil } // CheckPermissions implements Inode.CheckPermissions. func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { return vfs.GenericCheckPermissions( creds, ats, a.Mode(), auth.KUID(a.uid.Load()), auth.KGID(a.gid.Load()), ) } // IncLinks implements Inode.IncLinks. func (a *InodeAttrs) IncLinks(n uint32) { if a.nlink.Add(n) <= n { panic("InodeLink.IncLinks called with no existing links") } } // DecLinks implements Inode.DecLinks. func (a *InodeAttrs) DecLinks() { if nlink := a.nlink.Add(^uint32(0)); nlink == ^uint32(0) { // Negative overflow panic("Inode.DecLinks called at 0 links") } } // +stateify savable type slot struct { name string inode Inode static bool slotEntry } // OrderedChildrenOptions contains initialization options for OrderedChildren. // // +stateify savable type OrderedChildrenOptions struct { // Writable indicates whether vfs.FilesystemImpl methods implemented by // OrderedChildren may modify the tracked children. This applies to // operations related to rename, unlink and rmdir. If an OrderedChildren is // not writable, these operations all fail with EPERM. // // Note that writable users must implement the sticky bit (I_SVTX). Writable bool } // inodeWithOrderedChildren allows extraction of an OrderedChildren from an // Inode implementation. A concrete type that both implements the Inode // interface and embeds OrderedChildren will be castable to this interface, and // we can get to the embedded OrderedChildren through the orderedChildren // method. type inodeWithOrderedChildren interface { Inode orderedChildren() *OrderedChildren } // OrderedChildren partially implements the Inode interface. 
OrderedChildren can // be embedded in directory inodes to keep track of children in the // directory, and can then be used to implement a generic directory FD -- see // GenericDirectoryFD. // // OrderedChildren can represent a node in an Inode tree. The children inodes // might be directories themselves using OrderedChildren; hence extending the // tree. The parent inode (OrderedChildren user) holds a ref on all its static // children. This lets the static inodes outlive their associated dentry. // While the dentry might have to be regenerated via a Lookup() call, we can // keep reusing the same static inode. These static children inodes are finally // DecRef'd when this directory inode is being destroyed. This makes // OrderedChildren suitable for static directory entries as well. // // Must be initialized with Init before first use. // // +stateify savable type OrderedChildren struct { // Can children be modified by user syscalls? If set to false, interface // methods that would modify the children return EPERM. Immutable. writable bool mu sync.RWMutex `state:"nosave"` order slotList set map[string]*slot } // orderedChildren implements inodeWithOrderedChildren.orderedChildren. func (o *OrderedChildren) orderedChildren() *OrderedChildren { return o } // Init initializes an OrderedChildren. func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { o.writable = opts.Writable o.set = make(map[string]*slot) } // Destroy clears the children stored in o. It should be called by structs // embedding OrderedChildren upon destruction, i.e. when their reference count // reaches zero. func (o *OrderedChildren) Destroy(ctx context.Context) { o.mu.Lock() defer o.mu.Unlock() // Drop the ref that o owns on the static inodes it holds. for _, s := range o.set { if s.static { s.inode.DecRef(ctx) } } o.order.Reset() o.set = nil } // Populate inserts static children into this OrderedChildren. // Populate returns the number of directories inserted, which the caller // may use to update the link count for the parent directory. // // Precondition: // - d must represent a directory inode. // - children must not contain any conflicting entries already in o. // - Caller must hold a reference on all inodes passed. // // Postcondition: Caller's references on inodes are transferred to o. func (o *OrderedChildren) Populate(children map[string]Inode) uint32 { var links uint32 for name, child := range children { if child.Mode().IsDir() { links++ } if err := o.insert(name, child, true); err != nil { panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v)", name, child)) } } return links } // Lookup implements Inode.Lookup. func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error) { o.mu.RLock() defer o.mu.RUnlock() s, ok := o.set[name] if !ok { return nil, linuxerr.ENOENT } s.inode.IncRef() // This ref is passed to the dentry upon creation via Init. return s.inode, nil } // ForEachChild calls fn on all children tracked by this ordered children. func (o *OrderedChildren) ForEachChild(fn func(string, Inode)) { o.mu.RLock() defer o.mu.RUnlock() for name, slot := range o.set { fn(name, slot.inode) } } // IterDirents implements Inode.IterDirents. func (o *OrderedChildren) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { // All entries from OrderedChildren have already been handled in // GenericDirectoryFD.IterDirents. return offset, nil } // HasChildren implements Inode.HasChildren.
func (o *OrderedChildren) HasChildren() bool { o.mu.RLock() defer o.mu.RUnlock() return len(o.set) > 0 } // Insert inserts a dynamic child into o. This ignores the writability of o, as // this is not part of the vfs.FilesystemImpl interface, and is a lower-level operation. func (o *OrderedChildren) Insert(name string, child Inode) error { return o.insert(name, child, false) } // Inserter is like Insert, but obtains the child to insert by calling // makeChild. makeChild is only called if the insert will succeed. This allows // the caller to atomically check and insert a child without having to // clean up the child on failure. func (o *OrderedChildren) Inserter(name string, makeChild func() Inode) (Inode, error) { o.mu.Lock() defer o.mu.Unlock() if _, ok := o.set[name]; ok { return nil, linuxerr.EEXIST } // Note: We must not fail after we call makeChild(). child := makeChild() s := &slot{ name: name, inode: child, static: false, } o.order.PushBack(s) o.set[name] = s return child, nil } // insert inserts child into o. // // Precondition: Caller must be holding a ref on child if static is true. // // Postcondition: Caller's ref on child is transferred to o if static is true. func (o *OrderedChildren) insert(name string, child Inode, static bool) error { o.mu.Lock() defer o.mu.Unlock() if _, ok := o.set[name]; ok { return linuxerr.EEXIST } s := &slot{ name: name, inode: child, static: static, } o.order.PushBack(s) o.set[name] = s return nil } // Precondition: caller must hold o.mu for writing. func (o *OrderedChildren) removeLocked(name string) { if s, ok := o.set[name]; ok { if s.static { panic(fmt.Sprintf("removeLocked called on a static inode: %v", s.inode)) } delete(o.set, name) o.order.Remove(s) } } // Precondition: caller must hold o.mu for reading or writing. func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error { s, ok := o.set[name] if !ok { return linuxerr.ENOENT } if s.inode != child { panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! Name: %q, OrderedChild: %p, kernfs: %p", name, s.inode, child)) } return nil } // Unlink implements Inode.Unlink. func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error { if !o.writable { return linuxerr.EPERM } o.mu.Lock() defer o.mu.Unlock() if err := o.checkExistingLocked(name, child); err != nil { return err } o.removeLocked(name) return nil } // RmDir implements Inode.RmDir. func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) error { // We're not responsible for checking that child is a directory, that it's // empty, or updating any link counts; so this is the same as unlink. return o.Unlink(ctx, name, child) } // Rename implements Inode.Rename. // // Precondition: Rename may only be called across two directory inodes with // identical implementations of Rename. Practically, this means filesystems that // implement Rename by embedding OrderedChildren for any directory // implementation must use OrderedChildren for all directory implementations // that will support Rename. // // Postcondition: reference on any replaced dentry transferred to caller. 
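// // As implemented below, the destination directory must also expose an OrderedChildren (via inodeWithOrderedChildren) and be writable; otherwise the rename fails with EXDEV or EPERM respectively. Both child maps are then locked, source first, which is free of deadlock only because the caller serializes concurrent Rename calls as required above.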
func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error { if !o.writable { return linuxerr.EPERM } dstIOC, ok := dstDir.(inodeWithOrderedChildren) if !ok { return linuxerr.EXDEV } dst := dstIOC.orderedChildren() if !dst.writable { return linuxerr.EPERM } // Note: There's a potential deadlock below if concurrent calls to Rename // refer to the same src and dst directories in reverse. We avoid any // ordering issues because the caller is required to serialize concurrent // calls to Rename in accordance with the interface declaration. o.mu.Lock() defer o.mu.Unlock() if dst != o { dst.mu.Lock() defer dst.mu.Unlock() } // Ensure target inode exists in src. if err := o.checkExistingLocked(oldname, child); err != nil { return err } // Ensure no name collision in dst. if _, ok := dst.set[newname]; ok { return linuxerr.EEXIST } // Remove from src. o.removeLocked(oldname) // Add to dst. s := &slot{ name: newname, inode: child, } dst.order.PushBack(s) dst.set[newname] = s return nil } // nthLocked returns an iterator to the nth child tracked by this object. The // iterator is valid until the caller releases o.mu. Returns nil if the // requested index falls out of bounds. // // Preconditon: Caller must hold o.mu for reading. func (o *OrderedChildren) nthLocked(i int64) *slot { for it := o.order.Front(); it != nil && i >= 0; it = it.Next() { if i == 0 { return it } i-- } return nil } // InodeSymlink partially implements Inode interface for symlinks. // // +stateify savable type InodeSymlink struct { InodeNotDirectory } // Open implements Inode.Open. func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, linuxerr.ELOOP } // StaticDirectory is a standard implementation of a directory with static // contents. // // +stateify savable type StaticDirectory struct { InodeAlwaysValid InodeAttrs InodeDirectoryNoNewChildren InodeNoStatFS InodeNotAnonymous InodeNotSymlink InodeTemporary InodeWatches OrderedChildren StaticDirectoryRefs locks vfs.FileLocks fdOpts GenericDirectoryFDOptions } var _ Inode = (*StaticDirectory)(nil) // NewStaticDir creates a new static directory and returns its dentry. func NewStaticDir(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]Inode, fdOpts GenericDirectoryFDOptions) Inode { inode := &StaticDirectory{} inode.Init(ctx, creds, devMajor, devMinor, ino, perm, fdOpts) inode.InitRefs() inode.OrderedChildren.Init(OrderedChildrenOptions{}) links := inode.OrderedChildren.Populate(children) inode.IncLinks(links) return inode } // Init initializes StaticDirectory. func (s *StaticDirectory) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, fdOpts GenericDirectoryFDOptions) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } s.fdOpts = fdOpts s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) } // Open implements Inode.Open. func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. 
func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // DecRef implements Inode.DecRef. func (s *StaticDirectory) DecRef(ctx context.Context) { s.StaticDirectoryRefs.DecRef(func() { s.Destroy(ctx) }) } // InodeAlwaysValid partially implements Inode. // // +stateify savable type InodeAlwaysValid struct{} // Valid implements Inode.Valid. func (*InodeAlwaysValid) Valid(context.Context, *Dentry, string) bool { return true } // InodeTemporary partially implements Inode. // // +stateify savable type InodeTemporary struct{} // Keep implements Inode.Keep. func (*InodeTemporary) Keep() bool { return false } // InodeNoStatFS partially implements the Inode interface, where the client // filesystem doesn't support statfs(2). // // +stateify savable type InodeNoStatFS struct{} // StatFS implements Inode.StatFS. func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { return linux.Statfs{}, linuxerr.ENOSYS } // InodeWatches partially implements Inode. // // +stateify savable type InodeWatches struct { watches vfs.Watches } // Watches implements Inode.Watches. func (i *InodeWatches) Watches() *vfs.Watches { return &i.watches } // InodeAnonymous partially implements Inode. // // +stateify savable type InodeAnonymous struct{} // Anonymous implements Inode.Anonymous func (*InodeAnonymous) Anonymous() bool { return true } // InodeNotAnonymous partially implements Inode. // // +stateify savable type InodeNotAnonymous struct{} // Anonymous implements Inode.Anonymous func (*InodeNotAnonymous) Anonymous() bool { return false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/kernfs.go000066400000000000000000000724621465435605700253640ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package kernfs provides the tools to implement inode-based filesystems. // Kernfs has two main features: // // 1. The Inode interface, which maps VFS's path-based filesystem operations to // specific filesystem nodes. Kernfs uses the Inode interface to provide a // blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as // the synchronization mechanism for all filesystem operations by holding a // filesystem-wide lock across all operations. // // 2. Various utility types which provide generic implementations for various // parts of the Inode and vfs.FileDescription interfaces. Client filesystems // based on kernfs can embed the appropriate set of these to avoid having to // reimplement common filesystem operations. See inode_impl_util.go and // fd_impl_util.go. // // Reference Model: // // Kernfs dentries represents named pointers to inodes. Kernfs is solely // responsible for maintaining and modifying its dentry tree; inode // implementations can not access the tree. Dentries and inodes have // independent lifetimes and reference counts. A child dentry unconditionally // holds a reference on its parent directory's dentry. 
A dentry also holds a // reference on the inode it points to (although that might not be the only // reference on the inode). Due to this inodes can outlive the dentries that // point to them. Multiple dentries can point to the same inode (for example, // in the case of hardlinks). File descriptors hold a reference to the dentry // they're opened on. // // Dentries are guaranteed to exist while holding Filesystem.mu for // reading. Dropping dentries require holding Filesystem.mu for writing. To // queue dentries for destruction from a read critical section, see // Filesystem.deferDecRef. // // Lock ordering: // // kernfs.Filesystem.mu // kernel.TaskSet.mu // kernel.Task.mu // kernfs.Dentry.dirMu // vfs.VirtualFilesystem.mountMu // vfs.Dentry.mu // (inode implementation locks, if any) // // kernfs.Filesystem.deferredDecRefsMu package kernfs import ( "fmt" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory // filesystem. Concrete implementations are expected to embed this in their own // Filesystem type. // // +stateify savable type Filesystem struct { vfsfs vfs.Filesystem deferredDecRefsMu deferredDecRefsMutex `state:"nosave"` // deferredDecRefs is a list of dentries waiting to be DecRef()ed. This is // used to defer dentry destruction until mu can be acquired for // writing. Protected by deferredDecRefsMu. deferredDecRefs []refs.RefCounter // mu synchronizes the lifetime of Dentries on this filesystem. Holding it // for reading guarantees continued existence of any resolved dentries, but // the dentry tree may be modified. // // Kernfs dentries can only be DecRef()ed while holding mu for writing. For // example: // // fs.mu.Lock() // defer fs.mu.Unlock() // ... // dentry1.DecRef() // defer dentry2.DecRef() // Ok, will run before Unlock. // // If discarding dentries in a read context, use Filesystem.deferDecRef. For // example: // // fs.mu.RLock() // defer fs.processDeferredDecRefs() // defer fs.mu.RUnlock() // ... // fs.deferDecRef(dentry) mu filesystemRWMutex `state:"nosave"` // nextInoMinusOne is used to to allocate inode numbers on this // filesystem. Must be accessed by atomic operations. nextInoMinusOne atomicbitops.Uint64 // cachedDentries contains all dentries with 0 references. (Due to race // conditions, it may also contain dentries with non-zero references.) // cachedDentriesLen is the number of dentries in cachedDentries. These // fields are protected by mu. cachedDentries dentryList cachedDentriesLen uint64 // MaxCachedDentries is the maximum size of cachedDentries. If not set, // defaults to 0 and kernfs does not cache any dentries. This is immutable. MaxCachedDentries uint64 // root is the root dentry of this filesystem. Note that root may be nil for // filesystems on a disconnected mount without a root (e.g. pipefs, sockfs, // hostfs). Filesystem holds an extra reference on root to prevent it from // being destroyed prematurely. This is immutable. root *Dentry } // deferDecRef defers dropping a dentry ref until the next call to // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. // This may be called while Filesystem.mu or Dentry.dirMu is locked. 
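// // A typical read-side caller therefore follows the pattern already shown in the Filesystem.mu documentation above, sketched here for convenience: // // fs.mu.RLock() // defer fs.processDeferredDecRefs(ctx) // defer fs.mu.RUnlock() // ... // fs.deferDecRef(d) // the ref is dropped only after fs.mu has been released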
func (fs *Filesystem) deferDecRef(d refs.RefCounter) { fs.deferredDecRefsMu.Lock() fs.deferredDecRefs = append(fs.deferredDecRefs, d) fs.deferredDecRefsMu.Unlock() } // SafeDecRefFD safely DecRef the FileDescription making sure DecRef is deferred // in case Filesystem.mu is held. See comment on Filesystem.mu. func (fs *Filesystem) SafeDecRefFD(ctx context.Context, fd *vfs.FileDescription) { if d, ok := fd.Dentry().Impl().(*Dentry); ok && d.fs == fs { // Only defer if dentry belongs to this filesystem, since locks cannot cross // filesystems. fs.deferDecRef(fd) return } fd.DecRef(ctx) } // SafeDecRef safely DecRef the virtual dentry making sure DecRef is deferred // in case Filesystem.mu is held. See comment on Filesystem.mu. func (fs *Filesystem) SafeDecRef(ctx context.Context, vd vfs.VirtualDentry) { if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs { // Only defer if dentry belongs to this filesystem, since locks cannot cross // filesystems. fs.deferDecRef(&vd) return } vd.DecRef(ctx) } // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the // deferredDecRefs list. See comment on Filesystem.mu. // // Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked. func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) { fs.deferredDecRefsMu.Lock() for _, d := range fs.deferredDecRefs { // Defer the DecRef call so that we are not holding deferredDecRefsMu // when DecRef is called. defer d.DecRef(ctx) } fs.deferredDecRefs = fs.deferredDecRefs[:0] // Keep slice memory for reuse. fs.deferredDecRefsMu.Unlock() } // VFSFilesystem returns the generic vfs filesystem object. func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem { return &fs.vfsfs } // NextIno allocates a new inode number on this filesystem. func (fs *Filesystem) NextIno() uint64 { return fs.nextInoMinusOne.Add(1) } // These consts are used in the Dentry.flags field. const ( // Dentry points to a directory inode. dflagsIsDir = 1 << iota // Dentry points to a symlink inode. dflagsIsSymlink ) // Dentry implements vfs.DentryImpl. // // A kernfs dentry is similar to a dentry in a traditional filesystem: it's a // named reference to an inode. A dentry generally lives as long as it's part of // a mounted filesystem tree. Kernfs drops dentries once all references to them // are dropped. Dentries hold a single reference to the inode they point // to, and child dentries hold a reference on their parent. // // Must be initialized by Init prior to first use. // // +stateify savable type Dentry struct { vfsd vfs.Dentry // refs is the reference count. When refs reaches 0, the dentry may be // added to the cache or destroyed. If refs == -1, the dentry has already // been destroyed. refs are allowed to go to 0 and increase again. refs is // accessed using atomic memory operations. refs atomicbitops.Int64 // fs is the owning filesystem. fs is immutable. fs *Filesystem // flags caches useful information about the dentry from the inode. See the // dflags* consts above. flags atomicbitops.Uint32 parent atomic.Pointer[Dentry] `state:".(*Dentry)"` name string // If cached is true, dentryEntry links dentry into // Filesystem.cachedDentries. cached and dentryEntry are protected by // Filesystem.mu. cached bool dentryEntry // dirMu protects children and the names of child Dentries. // // Note that holding fs.mu for writing is not sufficient; // revalidateChildLocked(), which is a very hot path, may modify children with // fs.mu acquired for reading only. 
dirMu sync.Mutex `state:"nosave"` children map[string]*Dentry inode Inode // If deleted is non-zero, the file represented by this dentry has been // deleted. deleted is accessed using atomic memory operations. deleted atomicbitops.Uint32 } // IncRef implements vfs.DentryImpl.IncRef. func (d *Dentry) IncRef() { // d.refs may be 0 if d.fs.mu is locked, which serializes against // d.cacheLocked(). r := d.refs.Add(1) if d.LogRefs() { refs.LogIncRef(d, r) } } // TryIncRef implements vfs.DentryImpl.TryIncRef. func (d *Dentry) TryIncRef() bool { for { r := d.refs.Load() if r <= 0 { return false } if d.refs.CompareAndSwap(r, r+1) { if d.LogRefs() { refs.LogTryIncRef(d, r+1) } return true } } } // DecRef implements vfs.DentryImpl.DecRef. func (d *Dentry) DecRef(ctx context.Context) { r := d.refs.Add(-1) if d.LogRefs() { refs.LogDecRef(d, r) } if r == 0 { if d.inode.Anonymous() { // Nothing to cache. Skip right to destroy. This avoids // taking fs.mu in the DecRef() path for anonymous // inodes. d.destroy(ctx) return } d.fs.mu.Lock() defer d.fs.mu.Unlock() d.cacheLocked(ctx) } else if r < 0 { panic("kernfs.Dentry.DecRef() called without holding a reference") } } func (d *Dentry) decRefLocked(ctx context.Context) { r := d.refs.Add(-1) if d.LogRefs() { refs.LogDecRef(d, r) } if r == 0 { d.cacheLocked(ctx) } else if r < 0 { panic("kernfs.Dentry.DecRef() called without holding a reference") } } // cacheLocked should be called after d's reference count becomes 0. The ref // count check may happen before acquiring d.fs.mu so there might be a race // condition where the ref count is increased again by the time the caller // acquires d.fs.mu. This race is handled. // Only reachable dentries are added to the cache. However, a dentry might // become unreachable *while* it is in the cache due to invalidation. // // Preconditions: d.fs.mu must be locked for writing. func (d *Dentry) cacheLocked(ctx context.Context) { // Dentries with a non-zero reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires d.fs.mu, so if d.refs is zero then it will // remain zero while we hold d.fs.mu for writing.) refs := d.refs.Load() if refs == -1 { // Dentry has already been destroyed. return } if refs > 0 { if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- d.cached = false } return } // If the dentry is deleted and invalidated or has no parent, then it is no // longer reachable by path resolution and should be dropped immediately // because it has zero references. // Note that a dentry may not always have a parent; for example magic links // as described in Inode.Getlink. if isDead, parent := d.VFSDentry().IsDead(), d.parent.Load(); isDead || parent == nil { if !isDead { rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry()) for _, rc := range rcs { d.fs.deferDecRef(rc) } } if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- d.cached = false } if d.isDeleted() { d.inode.Watches().HandleDeletion(ctx) } d.destroy(ctx) if parent != nil { parent.decRefLocked(ctx) } return } if d.VFSDentry().IsEvictable() { d.evictLocked(ctx) return } // If d is already cached, just move it to the front of the LRU. if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentries.PushFront(d) return } // Cache the dentry, then evict the least recently used cached dentry if // the cache becomes over-full. 
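// The cache is a plain LRU: fresh entries are pushed to the front of fs.cachedDentries and eviction always takes the back. A single eviction below is enough because the push can only move cachedDentriesLen one past fs.MaxCachedDentries.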
d.fs.cachedDentries.PushFront(d) d.fs.cachedDentriesLen++ d.cached = true if d.fs.cachedDentriesLen <= d.fs.MaxCachedDentries { return } d.fs.evictCachedDentryLocked(ctx) // Whether or not victim was destroyed, we brought fs.cachedDentriesLen // back down to fs.opts.maxCachedDentries, so we don't loop. } // Preconditions: // - fs.mu must be locked for writing. func (fs *Filesystem) evictCachedDentryLocked(ctx context.Context) { // Evict the least recently used dentry because cache size is greater than // max cache size (configured on mount). fs.cachedDentries.Back().evictLocked(ctx) } // Preconditions: // - d.fs.mu must be locked for writing. func (d *Dentry) evictLocked(ctx context.Context) { if d == nil { return } if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- d.cached = false } // victim.refs may have become non-zero from an earlier path resolution // after it was inserted into fs.cachedDentries. if d.refs.Load() == 0 { if !d.vfsd.IsDead() { parent := d.parent.Load() parent.dirMu.Lock() // Note that victim can't be a mount point (in any mount // namespace), since VFS holds references on mount points. rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry()) for _, rc := range rcs { d.fs.deferDecRef(rc) } delete(parent.children, d.name) parent.dirMu.Unlock() } d.destroy(ctx) if parent := d.parent.Load(); parent != nil { parent.decRefLocked(ctx) } } } // destroy destroys the dentry. // // Preconditions: // - d.refs == 0. // - d should have been removed from d.parent.children, i.e. d is not reachable // by path traversal. // - d.vfsd.IsDead() is true. func (d *Dentry) destroy(ctx context.Context) { switch refs := d.refs.Load(); refs { case 0: // Mark the dentry destroyed. d.refs.Store(-1) case -1: panic("dentry.destroy() called on already destroyed dentry") default: panic("dentry.destroy() called with references on the dentry") } d.inode.DecRef(ctx) // IncRef from Init. refs.Unregister(d) } // RefType implements refs.CheckedObject.Type. func (d *Dentry) RefType() string { return "kernfs.Dentry" } // LeakMessage implements refs.CheckedObject.LeakMessage. func (d *Dentry) LeakMessage() string { return fmt.Sprintf("[kernfs.Dentry %p] reference count of %d instead of -1", d, d.refs.Load()) } // LogRefs implements refs.CheckedObject.LogRefs. // // This should only be set to true for debugging purposes, as it can generate an // extremely large amount of output and drastically degrade performance. func (d *Dentry) LogRefs() bool { return false } // InitRoot initializes this dentry as the root of the filesystem. // // Precondition: Caller must hold a reference on inode. // // Postcondition: Caller's reference on inode is transferred to the dentry. func (d *Dentry) InitRoot(fs *Filesystem, inode Inode) { d.Init(fs, inode) fs.root = d // Hold an extra reference on the root dentry. It is held by fs to prevent the // root from being "cached" and subsequently evicted. d.IncRef() } // Init initializes this dentry. // // Precondition: Caller must hold a reference on inode. // // Postcondition: Caller's reference on inode is transferred to the dentry. 
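// A construction sketch for a filesystem root, assuming a hypothetical
// directory-inode constructor newRootInode and eliding the vfs.Filesystem
// initialization:
//
//	fs := &Filesystem{}
//	// ... initialize fs.VFSFilesystem() with the concrete vfs.FilesystemImpl ...
//	root := &Dentry{}
//	root.InitRoot(fs, newRootInode(ctx, creds, fs.NextIno()))
//	// InitRoot stores root in fs.root and holds an extra reference so the
//	// root dentry is never evicted.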
func (d *Dentry) Init(fs *Filesystem, inode Inode) { d.vfsd.Init(d) d.fs = fs d.inode = inode d.refs.Store(1) ftype := inode.Mode().FileType() if ftype == linux.ModeDirectory { d.flags = atomicbitops.FromUint32(d.flags.RacyLoad() | dflagsIsDir) } if ftype == linux.ModeSymlink { d.flags = atomicbitops.FromUint32(d.flags.RacyLoad() | dflagsIsSymlink) } refs.Register(d) } // VFSDentry returns the generic vfs dentry for this kernfs dentry. func (d *Dentry) VFSDentry() *vfs.Dentry { return &d.vfsd } func (d *Dentry) isDeleted() bool { return d.deleted.Load() != 0 } func (d *Dentry) setDeleted() { d.deleted.Store(1) } // isDir checks whether the dentry points to a directory inode. func (d *Dentry) isDir() bool { return d.flags.Load()&dflagsIsDir != 0 } // isSymlink checks whether the dentry points to a symlink inode. func (d *Dentry) isSymlink() bool { return d.flags.Load()&dflagsIsSymlink != 0 } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { if d.isDir() { events |= linux.IN_ISDIR } // Linux always notifies the parent first. // Don't bother looking for a parent if the inode is anonymous. It // won't have one. if !d.inode.Anonymous() { d.fs.mu.RLock() if parent := d.parent.Load(); parent != nil { parent.inode.Watches().Notify(ctx, d.name, events, cookie, et, d.isDeleted()) } d.fs.mu.RUnlock() } d.inode.Watches().Notify(ctx, "", events, cookie, et, d.isDeleted()) } // Watches implements vfs.DentryImpl.Watches. func (d *Dentry) Watches() *vfs.Watches { return d.inode.Watches() } // OnZeroWatches implements vfs.Dentry.OnZeroWatches. func (d *Dentry) OnZeroWatches(context.Context) {} // insertChild inserts child into the vfs dentry cache with the given name under // this dentry. This does not update the directory inode, so calling this on its // own isn't sufficient to insert a child into a directory. // // Preconditions: // - d must represent a directory inode. // - d.fs.mu must be locked for at least reading. func (d *Dentry) insertChild(name string, child *Dentry) { d.dirMu.Lock() d.insertChildLocked(name, child) d.dirMu.Unlock() } // insertChildLocked is equivalent to insertChild, with additional // preconditions. // // Preconditions: // - d must represent a directory inode. // - d.dirMu must be locked. // - d.fs.mu must be locked for at least reading. func (d *Dentry) insertChildLocked(name string, child *Dentry) { if !d.isDir() { panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d)) } d.IncRef() // DecRef in child's Dentry.destroy. child.parent.Store(d) child.name = name if d.children == nil { d.children = make(map[string]*Dentry) } d.children[name] = child } // Inode returns the dentry's inode. func (d *Dentry) Inode() Inode { return d.inode } // FSLocalPath returns an absolute path to d, relative to the root of its // filesystem. func (d *Dentry) FSLocalPath() string { var b fspath.Builder _ = genericPrependPath(vfs.VirtualDentry{}, nil, d, &b) b.PrependByte('/') return b.String() } // WalkDentryTree traverses p in the dentry tree for this filesystem. Note that // this only traverses the dentry tree and is not a general path traversal. No // symlinks and dynamic children are resolved, and no permission checks are // performed. The caller is responsible for ensuring the returned Dentry exists // for an appropriate lifetime. 
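// For example, to locate a grandchild of d by path (the path literal is
// illustrative; vfsObj and ctx are assumed to be available):
//
//	child, err := d.WalkDentryTree(ctx, vfsObj, fspath.Parse("foo/bar"))
//	if err != nil {
//		return err
//	}
//	defer child.DecRef(ctx) // WalkDentryTree returns the target with a reference held.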
// // p is interpreted starting at d, and may be absolute or relative (absolute vs // relative paths both refer to the same target here, since p is absolute from // d). p may contain "." and "..", but will not allow traversal above d (similar // to ".." at the root dentry). // // This is useful for filesystem internals, where the filesystem may not be // mounted yet. For a mounted filesystem, use GetDentryAt. func (d *Dentry) WalkDentryTree(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (*Dentry, error) { d.fs.mu.RLock() defer d.fs.processDeferredDecRefs(ctx) defer d.fs.mu.RUnlock() target := d for pit := p.Begin; pit.Ok(); pit = pit.Next() { pc := pit.String() switch { case target == nil: return nil, linuxerr.ENOENT case pc == ".": // No-op, consume component and continue. case pc == "..": if target == d { // Don't let .. traverse above the start point of the walk. continue } target = target.parent.Load() // Parent doesn't need revalidation since we revalidated it on the // way to the child, and we're still holding fs.mu. default: var err error target, err = d.fs.revalidateChildLocked(ctx, vfsObj, target, pc) if err != nil { return nil, err } } } if target == nil { return nil, linuxerr.ENOENT } target.IncRef() return target, nil } // Parent returns the parent of this Dentry. This is not safe in general, the // filesystem may concurrently move d elsewhere. The caller is responsible for // ensuring the returned result remains valid while it is used. func (d *Dentry) Parent() *Dentry { return d.parent.Load() } // The Inode interface maps filesystem-level operations that operate on paths to // equivalent operations on specific filesystem nodes. // // The interface methods are groups into logical categories as sub interfaces // below. Generally, an implementation for each sub interface can be provided by // embedding an appropriate type from inode_impl_utils.go. The sub interfaces // are purely organizational. Methods declared directly in the main interface // have no generic implementations, and should be explicitly provided by the // client filesystem. // // Generally, implementations are not responsible for tasks that are common to // all filesystems. These include: // // - Checking that dentries passed to methods are of the appropriate file type. // - Checking permissions. // // Inode functions may be called holding filesystem wide locks and are not // allowed to call vfs functions that may reenter, unless otherwise noted. // // Specific responsibilities of implementations are documented below. type Inode interface { // Methods related to reference counting. A generic implementation is // provided by InodeNoopRefCount. These methods are generally called by the // equivalent Dentry methods. inodeRefs // Methods related to node metadata. A generic implementation is provided by // InodeAttrs. Note that a concrete filesystem using kernfs is responsible for // managing link counts. inodeMetadata // Method for inodes that represent symlink. InodeNotSymlink provides a // blanket implementation for all non-symlink inodes. inodeSymlink // Method for inodes that represent directories. InodeNotDirectory provides // a blanket implementation for all non-directory inodes. inodeDirectory // Open creates a file description for the filesystem object represented by // this inode. The returned file description should hold a reference on the // dentry for its lifetime. // // Precondition: rp.Done(). 
vfsd.Impl() must be the kernfs Dentry containing // the inode on which Open() is being called. Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) // StatFS returns filesystem statistics for the client filesystem. This // corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem // doesn't support statfs(2), this should return ENOSYS. StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) // Keep indicates whether the dentry created after Inode.Lookup should be // kept in the kernfs dentry tree. Keep() bool // Valid should return true if this inode is still valid, or needs to // be resolved again by a call to Lookup. Valid(ctx context.Context, parent *Dentry, name string) bool // Watches returns the set of inotify watches associated with this inode. Watches() *vfs.Watches // Anonymous indicates that the Inode is anonymous. It will never have // a name or parent. Anonymous() bool } type inodeRefs interface { IncRef() DecRef(ctx context.Context) TryIncRef() bool } type inodeMetadata interface { // CheckPermissions checks that creds may access this inode for the // requested access type, per the rules of // fs/namei.c:generic_permission(). CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error // Mode returns the (struct stat)::st_mode value for this inode. This is // separated from Stat for performance. Mode() linux.FileMode // UID returns the (struct stat)::st_uid value for this inode. This is // separated from Stat for performance. UID() auth.KUID // GID returns the (struct stat)::st_gid value for this inode. This is // separated from Stat for performance. GID() auth.KGID // Stat returns the metadata for this inode. This corresponds to // vfs.FilesystemImpl.StatAt. Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) // SetStat updates the metadata for this inode. This corresponds to // vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking // if the operation can be performed (see vfs.CheckSetStat() for common // checks). SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error } // Precondition: All methods in this interface may only be called on directory // inodes. type inodeDirectory interface { // The New{File,Dir,Node,Link,Symlink} methods below should return a new inode // that will be hashed into the dentry tree. // // These inode constructors are inode-level operations rather than // filesystem-level operations to allow client filesystems to mix different // implementations based on the new node's location in the // filesystem. // HasChildren returns true if the directory inode has any children. HasChildren() bool // NewFile creates a new regular file inode. NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) // NewDir creates a new directory inode. NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) // NewLink creates a new hardlink to a specified inode in this // directory. Implementations should create a new kernfs Dentry pointing to // target, and update target's link count. NewLink(ctx context.Context, name string, target Inode) (Inode, error) // NewSymlink creates a new symbolic link inode. NewSymlink(ctx context.Context, name, target string) (Inode, error) // NewNode creates a new filesystem node for a mknod syscall. 
NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) // Unlink removes a child dentry from this directory inode. Unlink(ctx context.Context, name string, child Inode) error // RmDir removes an empty child directory from this directory // inode. Implementations must update the parent directory's link count, // if required. Implementations are not responsible for checking that child // is a directory, or checking for an empty directory. RmDir(ctx context.Context, name string, child Inode) error // Rename is called on the source directory containing an inode being // renamed. child points to the resolved child in the source directory. // dstDir is guaranteed to be a directory inode. // // On a successful call to Rename, the caller updates the dentry tree to // reflect the name change. // // Precondition: Caller must serialize concurrent calls to Rename. Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error // Lookup should return an appropriate inode if name should resolve to a // child of this directory inode. This gives the directory an opportunity // on every lookup to resolve additional entries. This is only called when // the inode is a directory. // // The child returned by Lookup will be hashed into the VFS dentry tree, // at least for the duration of the current FS operation. // // Lookup must return the child with an extra reference whose ownership is // transferred to the dentry that is created to point to that inode. If // Inode.Keep returns false, that new dentry will be dropped at the end of // the current filesystem operation (before returning back to the VFS // layer) if no other ref is picked on that dentry. If Inode.Keep returns // true, then the dentry will be cached into the dentry tree until it is // Unlink'd or RmDir'd. Lookup(ctx context.Context, name string) (Inode, error) // IterDirents is used to iterate over dynamically created entries. It invokes // cb on each entry in the directory represented by the Inode. // 'offset' is the offset for the entire IterDirents call, which may include // results from the caller (e.g. "." and ".."). 'relOffset' is the offset // inside the entries returned by this IterDirents invocation. In other words, // 'offset' should be used to calculate each vfs.Dirent.NextOff as well as // the return value, while 'relOffset' is the place to start iteration. IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) } type inodeSymlink interface { // Readlink returns the target of a symbolic link. If an inode is not a // symlink, the implementation should return EINVAL. // // Readlink is called with no kernfs locks held, so it may reenter if needed // to resolve symlink targets. Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) // Getlink returns the target of a symbolic link, as used by path // resolution: // // - If the inode is a "magic link" (a link whose target is most accurately // represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "", // nil). A reference is taken on the returned VirtualDentry. // // - If the inode is an ordinary symlink, Getlink returns (zero-value // VirtualDentry, symlink target, nil). // // - If the inode is not a symlink, Getlink returns (zero-value // VirtualDentry, "", EINVAL). 
Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/kernfs_state_autogen.go000066400000000000000000000672171465435605700303100ustar00rootroot00000000000000// automatically generated by stateify. package kernfs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (l *dentryList) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.dentryList" } func (l *dentryList) StateFields() []string { return []string{ "head", "tail", } } func (l *dentryList) beforeSave() {} // +checklocksignore func (l *dentryList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *dentryList) afterLoad(context.Context) {} // +checklocksignore func (l *dentryList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *dentryEntry) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.dentryEntry" } func (e *dentryEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *dentryEntry) beforeSave() {} // +checklocksignore func (e *dentryEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *dentryEntry) afterLoad(context.Context) {} // +checklocksignore func (e *dentryEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (f *DynamicBytesFile) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.DynamicBytesFile" } func (f *DynamicBytesFile) StateFields() []string { return []string{ "InodeAttrs", "InodeNoStatFS", "InodeNoopRefCount", "InodeNotAnonymous", "InodeNotDirectory", "InodeNotSymlink", "InodeWatches", "locks", "data", } } func (f *DynamicBytesFile) beforeSave() {} // +checklocksignore func (f *DynamicBytesFile) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.InodeAttrs) stateSinkObject.Save(1, &f.InodeNoStatFS) stateSinkObject.Save(2, &f.InodeNoopRefCount) stateSinkObject.Save(3, &f.InodeNotAnonymous) stateSinkObject.Save(4, &f.InodeNotDirectory) stateSinkObject.Save(5, &f.InodeNotSymlink) stateSinkObject.Save(6, &f.InodeWatches) stateSinkObject.Save(7, &f.locks) stateSinkObject.Save(8, &f.data) } func (f *DynamicBytesFile) afterLoad(context.Context) {} // +checklocksignore func (f *DynamicBytesFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.InodeAttrs) stateSourceObject.Load(1, &f.InodeNoStatFS) stateSourceObject.Load(2, &f.InodeNoopRefCount) stateSourceObject.Load(3, &f.InodeNotAnonymous) stateSourceObject.Load(4, &f.InodeNotDirectory) stateSourceObject.Load(5, &f.InodeNotSymlink) stateSourceObject.Load(6, &f.InodeWatches) stateSourceObject.Load(7, &f.locks) stateSourceObject.Load(8, &f.data) } func (fd *DynamicBytesFD) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.DynamicBytesFD" } func (fd *DynamicBytesFD) StateFields() []string { return []string{ "FileDescriptionDefaultImpl", "DynamicBytesFileDescriptionImpl", "LockFD", "vfsfd", "inode", } } func (fd *DynamicBytesFD) beforeSave() {} // +checklocksignore func (fd *DynamicBytesFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(1, &fd.DynamicBytesFileDescriptionImpl) stateSinkObject.Save(2, &fd.LockFD) 
stateSinkObject.Save(3, &fd.vfsfd) stateSinkObject.Save(4, &fd.inode) } func (fd *DynamicBytesFD) afterLoad(context.Context) {} // +checklocksignore func (fd *DynamicBytesFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(1, &fd.DynamicBytesFileDescriptionImpl) stateSourceObject.Load(2, &fd.LockFD) stateSourceObject.Load(3, &fd.vfsfd) stateSourceObject.Load(4, &fd.inode) } func (s *SeekEndConfig) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.SeekEndConfig" } func (s *SeekEndConfig) StateFields() []string { return nil } func (g *GenericDirectoryFDOptions) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.GenericDirectoryFDOptions" } func (g *GenericDirectoryFDOptions) StateFields() []string { return []string{ "SeekEnd", } } func (g *GenericDirectoryFDOptions) beforeSave() {} // +checklocksignore func (g *GenericDirectoryFDOptions) StateSave(stateSinkObject state.Sink) { g.beforeSave() stateSinkObject.Save(0, &g.SeekEnd) } func (g *GenericDirectoryFDOptions) afterLoad(context.Context) {} // +checklocksignore func (g *GenericDirectoryFDOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &g.SeekEnd) } func (fd *GenericDirectoryFD) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.GenericDirectoryFD" } func (fd *GenericDirectoryFD) StateFields() []string { return []string{ "FileDescriptionDefaultImpl", "DirectoryFileDescriptionDefaultImpl", "LockFD", "seekEnd", "vfsfd", "children", "off", } } func (fd *GenericDirectoryFD) beforeSave() {} // +checklocksignore func (fd *GenericDirectoryFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(1, &fd.DirectoryFileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.LockFD) stateSinkObject.Save(3, &fd.seekEnd) stateSinkObject.Save(4, &fd.vfsfd) stateSinkObject.Save(5, &fd.children) stateSinkObject.Save(6, &fd.off) } func (fd *GenericDirectoryFD) afterLoad(context.Context) {} // +checklocksignore func (fd *GenericDirectoryFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(1, &fd.DirectoryFileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.LockFD) stateSourceObject.Load(3, &fd.seekEnd) stateSourceObject.Load(4, &fd.vfsfd) stateSourceObject.Load(5, &fd.children) stateSourceObject.Load(6, &fd.off) } func (i *InodeNoopRefCount) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.InodeNoopRefCount" } func (i *InodeNoopRefCount) StateFields() []string { return []string{ "InodeTemporary", } } func (i *InodeNoopRefCount) beforeSave() {} // +checklocksignore func (i *InodeNoopRefCount) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.InodeTemporary) } func (i *InodeNoopRefCount) afterLoad(context.Context) {} // +checklocksignore func (i *InodeNoopRefCount) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.InodeTemporary) } func (i *InodeDirectoryNoNewChildren) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.InodeDirectoryNoNewChildren" } func (i *InodeDirectoryNoNewChildren) StateFields() []string { return []string{} } func (i *InodeDirectoryNoNewChildren) beforeSave() {} // +checklocksignore func (i *InodeDirectoryNoNewChildren) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i 
*InodeDirectoryNoNewChildren) afterLoad(context.Context) {} // +checklocksignore func (i *InodeDirectoryNoNewChildren) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (i *InodeNotDirectory) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.InodeNotDirectory" } func (i *InodeNotDirectory) StateFields() []string { return []string{ "InodeAlwaysValid", } } func (i *InodeNotDirectory) beforeSave() {} // +checklocksignore func (i *InodeNotDirectory) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.InodeAlwaysValid) } func (i *InodeNotDirectory) afterLoad(context.Context) {} // +checklocksignore func (i *InodeNotDirectory) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.InodeAlwaysValid) } func (i *InodeNotSymlink) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.InodeNotSymlink" } func (i *InodeNotSymlink) StateFields() []string { return []string{} } func (i *InodeNotSymlink) beforeSave() {} // +checklocksignore func (i *InodeNotSymlink) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i *InodeNotSymlink) afterLoad(context.Context) {} // +checklocksignore func (i *InodeNotSymlink) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (a *InodeAttrs) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.InodeAttrs" } func (a *InodeAttrs) StateFields() []string { return []string{ "devMajor", "devMinor", "ino", "mode", "uid", "gid", "nlink", "blockSize", "atime", "mtime", "ctime", } } func (a *InodeAttrs) beforeSave() {} // +checklocksignore func (a *InodeAttrs) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.devMajor) stateSinkObject.Save(1, &a.devMinor) stateSinkObject.Save(2, &a.ino) stateSinkObject.Save(3, &a.mode) stateSinkObject.Save(4, &a.uid) stateSinkObject.Save(5, &a.gid) stateSinkObject.Save(6, &a.nlink) stateSinkObject.Save(7, &a.blockSize) stateSinkObject.Save(8, &a.atime) stateSinkObject.Save(9, &a.mtime) stateSinkObject.Save(10, &a.ctime) } func (a *InodeAttrs) afterLoad(context.Context) {} // +checklocksignore func (a *InodeAttrs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.devMajor) stateSourceObject.Load(1, &a.devMinor) stateSourceObject.Load(2, &a.ino) stateSourceObject.Load(3, &a.mode) stateSourceObject.Load(4, &a.uid) stateSourceObject.Load(5, &a.gid) stateSourceObject.Load(6, &a.nlink) stateSourceObject.Load(7, &a.blockSize) stateSourceObject.Load(8, &a.atime) stateSourceObject.Load(9, &a.mtime) stateSourceObject.Load(10, &a.ctime) } func (s *slot) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.slot" } func (s *slot) StateFields() []string { return []string{ "name", "inode", "static", "slotEntry", } } func (s *slot) beforeSave() {} // +checklocksignore func (s *slot) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.name) stateSinkObject.Save(1, &s.inode) stateSinkObject.Save(2, &s.static) stateSinkObject.Save(3, &s.slotEntry) } func (s *slot) afterLoad(context.Context) {} // +checklocksignore func (s *slot) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.name) stateSourceObject.Load(1, &s.inode) stateSourceObject.Load(2, &s.static) stateSourceObject.Load(3, &s.slotEntry) } func (o *OrderedChildrenOptions) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.OrderedChildrenOptions" } func (o *OrderedChildrenOptions) StateFields() []string { return 
[]string{ "Writable", } } func (o *OrderedChildrenOptions) beforeSave() {} // +checklocksignore func (o *OrderedChildrenOptions) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.Writable) } func (o *OrderedChildrenOptions) afterLoad(context.Context) {} // +checklocksignore func (o *OrderedChildrenOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.Writable) } func (o *OrderedChildren) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.OrderedChildren" } func (o *OrderedChildren) StateFields() []string { return []string{ "writable", "order", "set", } } func (o *OrderedChildren) beforeSave() {} // +checklocksignore func (o *OrderedChildren) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.writable) stateSinkObject.Save(1, &o.order) stateSinkObject.Save(2, &o.set) } func (o *OrderedChildren) afterLoad(context.Context) {} // +checklocksignore func (o *OrderedChildren) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.writable) stateSourceObject.Load(1, &o.order) stateSourceObject.Load(2, &o.set) } func (i *InodeSymlink) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.InodeSymlink" } func (i *InodeSymlink) StateFields() []string { return []string{ "InodeNotDirectory", } } func (i *InodeSymlink) beforeSave() {} // +checklocksignore func (i *InodeSymlink) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.InodeNotDirectory) } func (i *InodeSymlink) afterLoad(context.Context) {} // +checklocksignore func (i *InodeSymlink) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.InodeNotDirectory) } func (s *StaticDirectory) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.StaticDirectory" } func (s *StaticDirectory) StateFields() []string { return []string{ "InodeAlwaysValid", "InodeAttrs", "InodeDirectoryNoNewChildren", "InodeNoStatFS", "InodeNotAnonymous", "InodeNotSymlink", "InodeTemporary", "InodeWatches", "OrderedChildren", "StaticDirectoryRefs", "locks", "fdOpts", } } func (s *StaticDirectory) beforeSave() {} // +checklocksignore func (s *StaticDirectory) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.InodeAlwaysValid) stateSinkObject.Save(1, &s.InodeAttrs) stateSinkObject.Save(2, &s.InodeDirectoryNoNewChildren) stateSinkObject.Save(3, &s.InodeNoStatFS) stateSinkObject.Save(4, &s.InodeNotAnonymous) stateSinkObject.Save(5, &s.InodeNotSymlink) stateSinkObject.Save(6, &s.InodeTemporary) stateSinkObject.Save(7, &s.InodeWatches) stateSinkObject.Save(8, &s.OrderedChildren) stateSinkObject.Save(9, &s.StaticDirectoryRefs) stateSinkObject.Save(10, &s.locks) stateSinkObject.Save(11, &s.fdOpts) } func (s *StaticDirectory) afterLoad(context.Context) {} // +checklocksignore func (s *StaticDirectory) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.InodeAlwaysValid) stateSourceObject.Load(1, &s.InodeAttrs) stateSourceObject.Load(2, &s.InodeDirectoryNoNewChildren) stateSourceObject.Load(3, &s.InodeNoStatFS) stateSourceObject.Load(4, &s.InodeNotAnonymous) stateSourceObject.Load(5, &s.InodeNotSymlink) stateSourceObject.Load(6, &s.InodeTemporary) stateSourceObject.Load(7, &s.InodeWatches) stateSourceObject.Load(8, &s.OrderedChildren) stateSourceObject.Load(9, &s.StaticDirectoryRefs) stateSourceObject.Load(10, &s.locks) stateSourceObject.Load(11, &s.fdOpts) } func (i 
*InodeAlwaysValid) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.InodeAlwaysValid" } func (i *InodeAlwaysValid) StateFields() []string { return []string{} } func (i *InodeAlwaysValid) beforeSave() {} // +checklocksignore func (i *InodeAlwaysValid) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i *InodeAlwaysValid) afterLoad(context.Context) {} // +checklocksignore func (i *InodeAlwaysValid) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (i *InodeTemporary) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.InodeTemporary" } func (i *InodeTemporary) StateFields() []string { return []string{} } func (i *InodeTemporary) beforeSave() {} // +checklocksignore func (i *InodeTemporary) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i *InodeTemporary) afterLoad(context.Context) {} // +checklocksignore func (i *InodeTemporary) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (i *InodeNoStatFS) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.InodeNoStatFS" } func (i *InodeNoStatFS) StateFields() []string { return []string{} } func (i *InodeNoStatFS) beforeSave() {} // +checklocksignore func (i *InodeNoStatFS) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i *InodeNoStatFS) afterLoad(context.Context) {} // +checklocksignore func (i *InodeNoStatFS) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (i *InodeWatches) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.InodeWatches" } func (i *InodeWatches) StateFields() []string { return []string{ "watches", } } func (i *InodeWatches) beforeSave() {} // +checklocksignore func (i *InodeWatches) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.watches) } func (i *InodeWatches) afterLoad(context.Context) {} // +checklocksignore func (i *InodeWatches) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.watches) } func (i *InodeAnonymous) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.InodeAnonymous" } func (i *InodeAnonymous) StateFields() []string { return []string{} } func (i *InodeAnonymous) beforeSave() {} // +checklocksignore func (i *InodeAnonymous) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i *InodeAnonymous) afterLoad(context.Context) {} // +checklocksignore func (i *InodeAnonymous) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (i *InodeNotAnonymous) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.InodeNotAnonymous" } func (i *InodeNotAnonymous) StateFields() []string { return []string{} } func (i *InodeNotAnonymous) beforeSave() {} // +checklocksignore func (i *InodeNotAnonymous) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i *InodeNotAnonymous) afterLoad(context.Context) {} // +checklocksignore func (i *InodeNotAnonymous) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fs *Filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.Filesystem" } func (fs *Filesystem) StateFields() []string { return []string{ "vfsfs", "deferredDecRefs", "nextInoMinusOne", "cachedDentries", "cachedDentriesLen", "MaxCachedDentries", "root", } } func (fs *Filesystem) beforeSave() {} // +checklocksignore func (fs *Filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.vfsfs) stateSinkObject.Save(1, &fs.deferredDecRefs) stateSinkObject.Save(2, &fs.nextInoMinusOne) stateSinkObject.Save(3, 
&fs.cachedDentries) stateSinkObject.Save(4, &fs.cachedDentriesLen) stateSinkObject.Save(5, &fs.MaxCachedDentries) stateSinkObject.Save(6, &fs.root) } func (fs *Filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *Filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.vfsfs) stateSourceObject.Load(1, &fs.deferredDecRefs) stateSourceObject.Load(2, &fs.nextInoMinusOne) stateSourceObject.Load(3, &fs.cachedDentries) stateSourceObject.Load(4, &fs.cachedDentriesLen) stateSourceObject.Load(5, &fs.MaxCachedDentries) stateSourceObject.Load(6, &fs.root) } func (d *Dentry) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.Dentry" } func (d *Dentry) StateFields() []string { return []string{ "vfsd", "refs", "fs", "flags", "parent", "name", "cached", "dentryEntry", "children", "inode", "deleted", } } func (d *Dentry) beforeSave() {} // +checklocksignore func (d *Dentry) StateSave(stateSinkObject state.Sink) { d.beforeSave() var parentValue *Dentry parentValue = d.saveParent() stateSinkObject.SaveValue(4, parentValue) stateSinkObject.Save(0, &d.vfsd) stateSinkObject.Save(1, &d.refs) stateSinkObject.Save(2, &d.fs) stateSinkObject.Save(3, &d.flags) stateSinkObject.Save(5, &d.name) stateSinkObject.Save(6, &d.cached) stateSinkObject.Save(7, &d.dentryEntry) stateSinkObject.Save(8, &d.children) stateSinkObject.Save(9, &d.inode) stateSinkObject.Save(10, &d.deleted) } // +checklocksignore func (d *Dentry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.vfsd) stateSourceObject.Load(1, &d.refs) stateSourceObject.Load(2, &d.fs) stateSourceObject.Load(3, &d.flags) stateSourceObject.Load(5, &d.name) stateSourceObject.Load(6, &d.cached) stateSourceObject.Load(7, &d.dentryEntry) stateSourceObject.Load(8, &d.children) stateSourceObject.Load(9, &d.inode) stateSourceObject.Load(10, &d.deleted) stateSourceObject.LoadValue(4, new(*Dentry), func(y any) { d.loadParent(ctx, y.(*Dentry)) }) stateSourceObject.AfterLoad(func() { d.afterLoad(ctx) }) } func (i *inodePlatformFile) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.inodePlatformFile" } func (i *inodePlatformFile) StateFields() []string { return []string{ "NoBufferedIOFallback", "hostFD", "fdRefs", "fileMapper", } } func (i *inodePlatformFile) beforeSave() {} // +checklocksignore func (i *inodePlatformFile) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.NoBufferedIOFallback) stateSinkObject.Save(1, &i.hostFD) stateSinkObject.Save(2, &i.fdRefs) stateSinkObject.Save(3, &i.fileMapper) } // +checklocksignore func (i *inodePlatformFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.NoBufferedIOFallback) stateSourceObject.Load(1, &i.hostFD) stateSourceObject.Load(2, &i.fdRefs) stateSourceObject.Load(3, &i.fileMapper) stateSourceObject.AfterLoad(func() { i.afterLoad(ctx) }) } func (i *CachedMappable) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.CachedMappable" } func (i *CachedMappable) StateFields() []string { return []string{ "mappings", "pf", } } func (i *CachedMappable) beforeSave() {} // +checklocksignore func (i *CachedMappable) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.mappings) stateSinkObject.Save(1, &i.pf) } func (i *CachedMappable) afterLoad(context.Context) {} // +checklocksignore func (i *CachedMappable) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, 
&i.mappings) stateSourceObject.Load(1, &i.pf) } func (l *slotList) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.slotList" } func (l *slotList) StateFields() []string { return []string{ "head", "tail", } } func (l *slotList) beforeSave() {} // +checklocksignore func (l *slotList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *slotList) afterLoad(context.Context) {} // +checklocksignore func (l *slotList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *slotEntry) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.slotEntry" } func (e *slotEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *slotEntry) beforeSave() {} // +checklocksignore func (e *slotEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *slotEntry) afterLoad(context.Context) {} // +checklocksignore func (e *slotEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (r *StaticDirectoryRefs) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.StaticDirectoryRefs" } func (r *StaticDirectoryRefs) StateFields() []string { return []string{ "refCount", } } func (r *StaticDirectoryRefs) beforeSave() {} // +checklocksignore func (r *StaticDirectoryRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *StaticDirectoryRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (s *StaticSymlink) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.StaticSymlink" } func (s *StaticSymlink) StateFields() []string { return []string{ "InodeAttrs", "InodeNoopRefCount", "InodeNotAnonymous", "InodeSymlink", "InodeNoStatFS", "InodeWatches", "target", } } func (s *StaticSymlink) beforeSave() {} // +checklocksignore func (s *StaticSymlink) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.InodeAttrs) stateSinkObject.Save(1, &s.InodeNoopRefCount) stateSinkObject.Save(2, &s.InodeNotAnonymous) stateSinkObject.Save(3, &s.InodeSymlink) stateSinkObject.Save(4, &s.InodeNoStatFS) stateSinkObject.Save(5, &s.InodeWatches) stateSinkObject.Save(6, &s.target) } func (s *StaticSymlink) afterLoad(context.Context) {} // +checklocksignore func (s *StaticSymlink) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.InodeAttrs) stateSourceObject.Load(1, &s.InodeNoopRefCount) stateSourceObject.Load(2, &s.InodeNotAnonymous) stateSourceObject.Load(3, &s.InodeSymlink) stateSourceObject.Load(4, &s.InodeNoStatFS) stateSourceObject.Load(5, &s.InodeWatches) stateSourceObject.Load(6, &s.target) } func (dir *syntheticDirectory) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.syntheticDirectory" } func (dir *syntheticDirectory) StateFields() []string { return []string{ "InodeAlwaysValid", "InodeAttrs", "InodeNoStatFS", "InodeNotAnonymous", "InodeNotSymlink", "InodeWatches", "OrderedChildren", "syntheticDirectoryRefs", "locks", } } func (dir *syntheticDirectory) beforeSave() {} // +checklocksignore func (dir *syntheticDirectory) StateSave(stateSinkObject state.Sink) { dir.beforeSave() 
stateSinkObject.Save(0, &dir.InodeAlwaysValid) stateSinkObject.Save(1, &dir.InodeAttrs) stateSinkObject.Save(2, &dir.InodeNoStatFS) stateSinkObject.Save(3, &dir.InodeNotAnonymous) stateSinkObject.Save(4, &dir.InodeNotSymlink) stateSinkObject.Save(5, &dir.InodeWatches) stateSinkObject.Save(6, &dir.OrderedChildren) stateSinkObject.Save(7, &dir.syntheticDirectoryRefs) stateSinkObject.Save(8, &dir.locks) } func (dir *syntheticDirectory) afterLoad(context.Context) {} // +checklocksignore func (dir *syntheticDirectory) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &dir.InodeAlwaysValid) stateSourceObject.Load(1, &dir.InodeAttrs) stateSourceObject.Load(2, &dir.InodeNoStatFS) stateSourceObject.Load(3, &dir.InodeNotAnonymous) stateSourceObject.Load(4, &dir.InodeNotSymlink) stateSourceObject.Load(5, &dir.InodeWatches) stateSourceObject.Load(6, &dir.OrderedChildren) stateSourceObject.Load(7, &dir.syntheticDirectoryRefs) stateSourceObject.Load(8, &dir.locks) } func (r *syntheticDirectoryRefs) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.syntheticDirectoryRefs" } func (r *syntheticDirectoryRefs) StateFields() []string { return []string{ "refCount", } } func (r *syntheticDirectoryRefs) beforeSave() {} // +checklocksignore func (r *syntheticDirectoryRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *syntheticDirectoryRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func init() { state.Register((*dentryList)(nil)) state.Register((*dentryEntry)(nil)) state.Register((*DynamicBytesFile)(nil)) state.Register((*DynamicBytesFD)(nil)) state.Register((*SeekEndConfig)(nil)) state.Register((*GenericDirectoryFDOptions)(nil)) state.Register((*GenericDirectoryFD)(nil)) state.Register((*InodeNoopRefCount)(nil)) state.Register((*InodeDirectoryNoNewChildren)(nil)) state.Register((*InodeNotDirectory)(nil)) state.Register((*InodeNotSymlink)(nil)) state.Register((*InodeAttrs)(nil)) state.Register((*slot)(nil)) state.Register((*OrderedChildrenOptions)(nil)) state.Register((*OrderedChildren)(nil)) state.Register((*InodeSymlink)(nil)) state.Register((*StaticDirectory)(nil)) state.Register((*InodeAlwaysValid)(nil)) state.Register((*InodeTemporary)(nil)) state.Register((*InodeNoStatFS)(nil)) state.Register((*InodeWatches)(nil)) state.Register((*InodeAnonymous)(nil)) state.Register((*InodeNotAnonymous)(nil)) state.Register((*Filesystem)(nil)) state.Register((*Dentry)(nil)) state.Register((*inodePlatformFile)(nil)) state.Register((*CachedMappable)(nil)) state.Register((*slotList)(nil)) state.Register((*slotEntry)(nil)) state.Register((*StaticDirectoryRefs)(nil)) state.Register((*StaticSymlink)(nil)) state.Register((*syntheticDirectory)(nil)) state.Register((*syntheticDirectoryRefs)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/mmap_util.go000066400000000000000000000125521465435605700260550ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernfs import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" ) // inodePlatformFile implements memmap.File. It exists solely because inode // cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef. // // +stateify savable type inodePlatformFile struct { memmap.NoBufferedIOFallback // hostFD contains the host fd that this file was originally created from, // which must be available at time of restore. // // This field is initialized at creation time and is immutable. // inodePlatformFile does not own hostFD and hence should not close it. hostFD int // fdRefsMu protects fdRefs. fdRefsMu sync.Mutex `state:"nosave"` // fdRefs counts references on memmap.File offsets. It is used solely for // memory accounting. fdRefs fsutil.FrameRefSet // fileMapper caches mappings of the host file represented by this inode. fileMapper fsutil.HostFileMapper // fileMapperInitOnce is used to lazily initialize fileMapper. fileMapperInitOnce sync.Once `state:"nosave"` } var _ memmap.File = (*inodePlatformFile)(nil) // IncRef implements memmap.File.IncRef. func (i *inodePlatformFile) IncRef(fr memmap.FileRange, memCgID uint32) { i.fdRefsMu.Lock() i.fdRefs.IncRefAndAccount(fr, memCgID) i.fdRefsMu.Unlock() } // DecRef implements memmap.File.DecRef. func (i *inodePlatformFile) DecRef(fr memmap.FileRange) { i.fdRefsMu.Lock() i.fdRefs.DecRefAndAccount(fr) i.fdRefsMu.Unlock() } // MapInternal implements memmap.File.MapInternal. func (i *inodePlatformFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { return i.fileMapper.MapInternal(fr, i.hostFD, at.Write) } // FD implements memmap.File.FD. func (i *inodePlatformFile) FD() int { return i.hostFD } // CachedMappable implements memmap.Mappable. This utility can be embedded in a // kernfs.Inode that represents a host file to make the inode mappable. // CachedMappable caches the mappings of the host file. CachedMappable must be // initialized (via Init) with a hostFD before use. // // +stateify savable type CachedMappable struct { // mapsMu protects mappings. mapsMu sync.Mutex `state:"nosave"` // mappings tracks mappings of hostFD into memmap.MappingSpaces. mappings memmap.MappingSet // pf implements memmap.File for mappings backed by a host fd. pf inodePlatformFile } var _ memmap.Mappable = (*CachedMappable)(nil) // Init initializes i.pf. This must be called before using CachedMappable. func (i *CachedMappable) Init(hostFD int) { i.pf.hostFD = hostFD } // AddMapping implements memmap.Mappable.AddMapping. func (i *CachedMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { i.mapsMu.Lock() mapped := i.mappings.AddMapping(ms, ar, offset, writable) for _, r := range mapped { i.pf.fileMapper.IncRefOn(r) } i.mapsMu.Unlock() return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. 
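// A sketch of how CachedMappable is typically embedded; the inode type and
// constructor below are hypothetical, and only the Init and
// InitFileMapperOnce calls are mandated by this file:
//
//	type hostFileInode struct {
//		InodeAttrs
//		CachedMappable
//		// ... other kernfs.Inode embeddings ...
//	}
//
//	func newHostFileInode(hostFD int) *hostFileInode {
//		i := &hostFileInode{}
//		i.CachedMappable.Init(hostFD)
//		i.CachedMappable.InitFileMapperOnce() // Before the first mapping or translation.
//		return i
//	}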
func (i *CachedMappable) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { i.mapsMu.Lock() unmapped := i.mappings.RemoveMapping(ms, ar, offset, writable) for _, r := range unmapped { i.pf.fileMapper.DecRefOn(r) } i.mapsMu.Unlock() } // CopyMapping implements memmap.Mappable.CopyMapping. func (i *CachedMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return i.AddMapping(ctx, ms, dstAR, offset, writable) } // Translate implements memmap.Mappable.Translate. func (i *CachedMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { mr := optional return []memmap.Translation{ { Source: mr, File: &i.pf, Offset: mr.Start, Perms: hostarch.AnyAccess, }, }, nil } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (i *CachedMappable) InvalidateUnsavable(ctx context.Context) error { // We expect the same host fd across save/restore, so all translations // should be valid. return nil } // InvalidateRange invalidates the passed range on i.mappings. func (i *CachedMappable) InvalidateRange(r memmap.MappableRange) { i.mapsMu.Lock() i.mappings.Invalidate(r, memmap.InvalidateOpts{ // Compare Linux's mm/truncate.c:truncate_setsize() => // truncate_pagecache() => // mm/memory.c:unmap_mapping_range(evencows=1). InvalidatePrivate: true, }) i.mapsMu.Unlock() } // InitFileMapperOnce initializes the host file mapper. It ensures that the // file mapper is initialized just once. func (i *CachedMappable) InitFileMapperOnce() { i.pf.fileMapperInitOnce.Do(i.pf.fileMapper.Init) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/save_restore.go000066400000000000000000000023511465435605700265630ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernfs import ( "context" "gvisor.dev/gvisor/pkg/refs" ) // afterLoad is invoked by stateify. func (d *Dentry) afterLoad(context.Context) { if d.refs.Load() >= 0 { refs.Register(d) } } // afterLoad is invoked by stateify. func (i *inodePlatformFile) afterLoad(context.Context) { if i.fileMapper.IsInited() { // Ensure that we don't call i.fileMapper.Init() again. i.fileMapperInitOnce.Do(func() {}) } } // saveParent is called by stateify. func (d *Dentry) saveParent() *Dentry { return d.parent.Load() } // loadParent is called by stateify. func (d *Dentry) loadParent(_ context.Context, parent *Dentry) { d.parent.Store(parent) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/slot_list.go000066400000000000000000000116221465435605700260770ustar00rootroot00000000000000package kernfs // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. 
An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type slotElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (slotElementMapper) linkerFor(elem *slot) *slot { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type slotList struct { head *slot tail *slot } // Reset resets list l to the empty state. func (l *slotList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *slotList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *slotList) Front() *slot { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *slotList) Back() *slot { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *slotList) Len() (count int) { for e := l.Front(); e != nil; e = (slotElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *slotList) PushFront(e *slot) { linker := slotElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { slotElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *slotList) PushFrontList(m *slotList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { slotElementMapper{}.linkerFor(l.head).SetPrev(m.tail) slotElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *slotList) PushBack(e *slot) { linker := slotElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { slotElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *slotList) PushBackList(m *slotList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { slotElementMapper{}.linkerFor(l.tail).SetNext(m.head) slotElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *slotList) InsertAfter(b, e *slot) { bLinker := slotElementMapper{}.linkerFor(b) eLinker := slotElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { slotElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *slotList) InsertBefore(a, e *slot) { aLinker := slotElementMapper{}.linkerFor(a) eLinker := slotElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { slotElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. 
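// A short usage sketch (the slot values a, b and c are placeholders):
//
//	var l slotList
//	l.PushBack(a)       // list: a
//	l.PushFront(b)      // list: b, a
//	l.InsertAfter(b, c) // list: b, c, a
//	l.Remove(c)         // list: b, a
//	for e := l.Front(); e != nil; e = e.Next() {
//		// visit e
//	}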
// //go:nosplit func (l *slotList) Remove(e *slot) { linker := slotElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { slotElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { slotElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type slotEntry struct { next *slot prev *slot } // Next returns the entry that follows e in the list. // //go:nosplit func (e *slotEntry) Next() *slot { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *slotEntry) Prev() *slot { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *slotEntry) SetNext(elem *slot) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *slotEntry) SetPrev(elem *slot) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/static_directory_refs.go000066400000000000000000000103531465435605700304550ustar00rootroot00000000000000package kernfs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const StaticDirectoryenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var StaticDirectoryobj *StaticDirectory // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type StaticDirectoryRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *StaticDirectoryRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *StaticDirectoryRefs) RefType() string { return fmt.Sprintf("%T", StaticDirectoryobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *StaticDirectoryRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. 
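// A usage sketch for this generated refs type, assuming an object s that
// embeds StaticDirectoryRefs (the destroy body is a placeholder):
//
//	s.InitRefs()  // Starts with one reference.
//	s.IncRef()    // Two references.
//	s.DecRef(nil) // Back to one.
//	s.DecRef(func() {
//		// Last reference dropped: release the object's resources here.
//	})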
func (r *StaticDirectoryRefs) LogRefs() bool { return StaticDirectoryenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *StaticDirectoryRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *StaticDirectoryRefs) IncRef() { v := r.refCount.Add(1) if StaticDirectoryenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *StaticDirectoryRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if StaticDirectoryenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *StaticDirectoryRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if StaticDirectoryenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *StaticDirectoryRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/symlink.go000066400000000000000000000042731465435605700255550ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // StaticSymlink provides an Inode implementation for symlinks that point to // a immutable target. // // +stateify savable type StaticSymlink struct { InodeAttrs InodeNoopRefCount InodeNotAnonymous InodeSymlink InodeNoStatFS InodeWatches target string } var _ Inode = (*StaticSymlink)(nil) // NewStaticSymlink creates a new symlink file pointing to 'target'. 
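// An illustrative sketch (ino, devMajor and devMinor are placeholders chosen
// by the owning filesystem):
//
//	creds := auth.CredentialsFromContext(ctx)
//	link := NewStaticSymlink(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, ino, "/foo/bar")
//
// Readlink and Getlink on the resulting inode simply return "/foo/bar"; the
// target is never reinterpreted or rewritten.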
func NewStaticSymlink(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) Inode { inode := &StaticSymlink{} inode.Init(ctx, creds, devMajor, devMinor, ino, target) return inode } // Init initializes the instance. func (s *StaticSymlink) Init(ctx context.Context, creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) { s.target = target s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeSymlink|0777) } // Readlink implements Inode.Readlink. func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) { return s.target, nil } // Getlink implements Inode.Getlink. func (s *StaticSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) { return vfs.VirtualDentry{}, s.target, nil } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/synthetic_directory.go000066400000000000000000000070751465435605700301700ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernfs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // syntheticDirectory implements kernfs.Inode for a directory created by // MkdirAt(ForSyntheticMountpoint=true). // // +stateify savable type syntheticDirectory struct { InodeAlwaysValid InodeAttrs InodeNoStatFS InodeNotAnonymous InodeNotSymlink InodeWatches OrderedChildren syntheticDirectoryRefs locks vfs.FileLocks } var _ Inode = (*syntheticDirectory)(nil) func newSyntheticDirectory(ctx context.Context, creds *auth.Credentials, perm linux.FileMode) Inode { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("perm contains non-permission bits: %#o", perm)) } dir := &syntheticDirectory{} dir.InitRefs() dir.InodeAttrs.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, linux.S_IFDIR|perm) dir.OrderedChildren.Init(OrderedChildrenOptions{ Writable: true, }) return dir } // Open implements Inode.Open. func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := NewGenericDirectoryFD(rp.Mount(), d, &dir.OrderedChildren, &dir.locks, &opts, GenericDirectoryFDOptions{}) if err != nil { return nil, err } return &fd.vfsfd, nil } // NewFile implements Inode.NewFile. func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) { return nil, linuxerr.EPERM } // NewDir implements Inode.NewDir. 
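// For example (illustrative; dir is a *syntheticDirectory):
//
//	_, err := dir.NewDir(ctx, "child", vfs.MkdirOptions{Mode: 0755})
//	// err == linuxerr.EPERM: an ordinary mkdir is rejected.
//	_, err = dir.NewDir(ctx, "child", vfs.MkdirOptions{Mode: 0755, ForSyntheticMountpoint: true})
//	// err == nil: a nested synthetic directory is created and inserted.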
func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) { if !opts.ForSyntheticMountpoint { return nil, linuxerr.EPERM } subdirI := newSyntheticDirectory(ctx, auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask) if err := dir.OrderedChildren.Insert(name, subdirI); err != nil { subdirI.DecRef(ctx) return nil, err } dir.TouchCMtime(ctx) return subdirI, nil } // NewLink implements Inode.NewLink. func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (Inode, error) { return nil, linuxerr.EPERM } // NewSymlink implements Inode.NewSymlink. func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (Inode, error) { return nil, linuxerr.EPERM } // NewNode implements Inode.NewNode. func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) { return nil, linuxerr.EPERM } // DecRef implements Inode.DecRef. func (dir *syntheticDirectory) DecRef(ctx context.Context) { dir.syntheticDirectoryRefs.DecRef(func() { dir.Destroy(ctx) }) } // Keep implements Inode.Keep. This is redundant because inodes will never be // created via Lookup and inodes are always valid. Makes sense to return true // because these inodes are not temporary and should only be removed on RmDir. func (dir *syntheticDirectory) Keep() bool { return true } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/kernfs/synthetic_directory_refs.go000066400000000000000000000104411465435605700311760ustar00rootroot00000000000000package kernfs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const syntheticDirectoryenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var syntheticDirectoryobj *syntheticDirectory // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type syntheticDirectoryRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *syntheticDirectoryRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. 
func (r *syntheticDirectoryRefs) RefType() string { return fmt.Sprintf("%T", syntheticDirectoryobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *syntheticDirectoryRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *syntheticDirectoryRefs) LogRefs() bool { return syntheticDirectoryenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *syntheticDirectoryRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *syntheticDirectoryRefs) IncRef() { v := r.refCount.Add(1) if syntheticDirectoryenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *syntheticDirectoryRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if syntheticDirectoryenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *syntheticDirectoryRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if syntheticDirectoryenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *syntheticDirectoryRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/lock/000077500000000000000000000000001465435605700231725ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/lock/lock.go000066400000000000000000000431641465435605700244610ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package lock is the API for POSIX-style advisory regional file locks and // BSD-style full file locks. 
// // Callers needing to enforce these types of locks, like sys_fcntl, can call // LockRegion and UnlockRegion on a thread-safe set of Locks. Locks are // specific to a unique file (unique device/inode pair) and for this reason // should not be shared between files. // // A Lock has a set of holders identified by UniqueID. Normally this is the // pid of the thread attempting to acquire the lock. // // Since these are advisory locks, they do not need to be integrated into // Reads/Writes and for this reason there is no way to *check* if a lock is // held. One can only attempt to take a lock or unlock an existing lock. // // A Lock in a set of Locks is typed: it is either a read lock with any number // of readers and no writer, or a write lock with no readers. // // As expected from POSIX, any attempt to acquire a write lock on a file region // when there already exits a write lock held by a different uid will fail. Any // attempt to acquire a write lock on a file region when there is more than one // reader will fail. Any attempt to acquire a read lock on a file region when // there is already a writer will fail. // // In special cases, a read lock may be upgraded to a write lock and a write lock // can be downgraded to a read lock. This can only happen if: // // - read lock upgrade to write lock: There can be only one reader and the reader // must be the same as the requested write lock holder. // // - write lock downgrade to read lock: The writer must be the same as the requested // read lock holder. // // UnlockRegion always succeeds. If LockRegion fails the caller should normally // interpret this as "try again later". package lock import ( "fmt" "math" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // LockType is a type of regional file lock. type LockType int // UniqueID is a unique identifier of the holder of a regional file lock. type UniqueID any const ( // ReadLock describes a POSIX regional file lock to be taken // read only. There may be multiple of these locks on a single // file region as long as there is no writer lock on the same // region. ReadLock LockType = iota // WriteLock describes a POSIX regional file lock to be taken // write only. There may be only a single holder of this lock // and no read locks. WriteLock ) // LockEOF is the maximal possible end of a regional file lock. // // A BSD-style full file lock can be represented as a regional file lock from // offset 0 to LockEOF. const LockEOF = math.MaxInt64 // OwnerInfo describes the owner of a lock. // // +stateify savable type OwnerInfo struct { // PID is the process ID of the lock owner. PID int32 // OFD is whether this is an open file descriptor lock. OFD bool } // Lock is a regional file lock. It consists of either a single writer // or a set of readers. // // A Lock may be upgraded from a read lock to a write lock only if there // is a single reader and that reader has the same uid as the write lock. // // A Lock may be downgraded from a write lock to a read lock only if // the write lock's uid is the same as the read lock. // // Accesses to Lock are synchronized through the Locks object to which it // belongs. // // +stateify savable type Lock struct { // Readers are the set of read lock holders identified by UniqueID. // If len(Readers) > 0 then Writer must be nil. Readers map[UniqueID]OwnerInfo // Writer holds the writer unique ID. It's nil if there are no writers. 
Writer UniqueID // WriterInfo describes the writer. It is only meaningful if Writer != nil. WriterInfo OwnerInfo } // Locks is a thread-safe wrapper around a LockSet. // // +stateify savable type Locks struct { // mu protects locks below. mu sync.Mutex `state:"nosave"` // locks is the set of region locks currently held on an Inode. locks LockSet // blockedQueue is the queue of waiters that are waiting on a lock. blockedQueue waiter.Queue } // LockRegion attempts to acquire a typed lock for the uid on a region of a // file. Returns nil if successful in locking the region, otherwise an // appropriate error is returned. func (l *Locks) LockRegion(ctx context.Context, uid UniqueID, ownerPID int32, t LockType, r LockRange, ofd bool, block bool) error { l.mu.Lock() defer l.mu.Unlock() for { // Blocking locks must run in a loop because we'll be woken up whenever an unlock event // happens for this lock. We will then attempt to take the lock again and if it fails // continue blocking. err := l.locks.lock(uid, ownerPID, t, r, ofd) if err == linuxerr.ErrWouldBlock && block { // Note: we release the lock in EventRegister below, in // order to avoid a possible race. ok := ctx.BlockOn(l, waiter.EventIn) l.mu.Lock() // +checklocksforce: see above. if ok { continue // Try again now that someone has unlocked. } // Must be interrupted. return linuxerr.ErrInterrupted } return err } } // Readiness always returns zero. func (l *Locks) Readiness(waiter.EventMask) waiter.EventMask { return 0 } // EventRegister implements waiter.Waitable.EventRegister. func (l *Locks) EventRegister(e *waiter.Entry) error { defer l.mu.Unlock() // +checklocksforce: see above. l.blockedQueue.EventRegister(e) return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (l *Locks) EventUnregister(e *waiter.Entry) { l.blockedQueue.EventUnregister(e) } // UnlockRegion attempts to release a lock for the uid on a region of a file. // This operation is always successful, even if there did not exist a lock on // the requested region held by uid in the first place. func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) { l.mu.Lock() defer l.mu.Unlock() l.locks.unlock(uid, r) // Now that we've released the lock, we need to wake up any waiters. // We track how many notifications have happened since the last attempt // to acquire the lock, in order to ensure that we avoid races. l.blockedQueue.Notify(waiter.EventIn) } // makeLock returns a new typed Lock that has either uid as its only reader // or uid as its only writer. func makeLock(uid UniqueID, ownerPID int32, t LockType, ofd bool) Lock { value := Lock{Readers: make(map[UniqueID]OwnerInfo)} switch t { case ReadLock: value.Readers[uid] = OwnerInfo{PID: ownerPID, OFD: ofd} case WriteLock: value.Writer = uid value.WriterInfo = OwnerInfo{PID: ownerPID, OFD: ofd} default: panic(fmt.Sprintf("makeLock: invalid lock type %d", t)) } return value } // isHeld returns true if uid is a holder of Lock. func (l Lock) isHeld(uid UniqueID) bool { if _, ok := l.Readers[uid]; ok { return true } return l.Writer == uid } // lock sets uid as a holder of a typed lock on Lock. // // Preconditions: canLock is true for the range containing this Lock. func (l *Lock) lock(uid UniqueID, ownerPID int32, t LockType, ofd bool) { switch t { case ReadLock: // If we are already a reader, then this is a no-op. if _, ok := l.Readers[uid]; ok { return } // We cannot downgrade a write lock to a read lock unless the // uid is the same. 
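// (canLock is a precondition of this method, so a conflicting writer here
// indicates a caller bug; hence the panic below rather than an error return.)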
if l.Writer != nil { if l.Writer != uid { panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer)) } // Ensure that there is only one reader if upgrading. clear(l.Readers) // Ensure that there is no longer a writer. l.Writer = nil } l.Readers[uid] = OwnerInfo{PID: ownerPID, OFD: ofd} return case WriteLock: // If we are already the writer, then this is a no-op. if l.Writer == uid { return } // We can only upgrade a read lock to a write lock if there // is only one reader and that reader has the same uid as // the write lock. if readers := len(l.Readers); readers > 0 { if readers != 1 { panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers)) } if _, ok := l.Readers[uid]; !ok { panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers)) } } // Ensure that there is only a writer. clear(l.Readers) l.Writer = uid l.WriterInfo = OwnerInfo{PID: ownerPID, OFD: ofd} default: panic(fmt.Sprintf("lock: invalid lock type %d", t)) } } // lockable returns true if check returns true for every Lock in LockRange. // Further, check should return true if Lock meets the callers requirements // for locking Lock. func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool { // Get our starting point. seg := l.LowerBoundSegment(r.Start) for seg.Ok() && seg.Start() < r.End { // Note that we don't care about overrunning the end of the // last segment because if everything checks out we'll just // split the last segment. if !check(seg.Value()) { return false } // Jump to the next segment, ignoring gaps, for the same // reason we ignored the first gap. seg = seg.NextSegment() } // No conflict, we can get a lock for uid over the entire range. return true } // canLock returns true if uid will be able to take a Lock of type t on the // entire range specified by LockRange. func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { switch t { case ReadLock: return l.lockable(r, func(value Lock) bool { // If there is no writer, there's no problem adding another reader. if value.Writer == nil { return true } // If there is a writer, then it must be the same uid // in order to downgrade the lock to a read lock. return value.Writer == uid }) case WriteLock: return l.lockable(r, func(value Lock) bool { // If there are only readers. if value.Writer == nil { // Then this uid can only take a write lock if this is a private // upgrade, meaning that the only reader is uid. return value.isOnlyReader(uid) } // If the uid is already a writer on this region, then // adding a write lock would be a no-op. return value.Writer == uid }) default: panic(fmt.Sprintf("canLock: invalid lock type %d", t)) } } func (l *Lock) isOnlyReader(uid UniqueID) bool { if len(l.Readers) != 1 { return false } _, ok := l.Readers[uid] return ok } // lock returns nil if uid took a lock of type t on the entire range of // LockRange. Otherwise, linuxerr.ErrWouldBlock is returned. // // Preconditions: r.Start <= r.End (will panic otherwise). func (l *LockSet) lock(uid UniqueID, ownerPID int32, t LockType, r LockRange, ofd bool) error { if r.Start > r.End { panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End)) } // Don't attempt to insert anything with a range of 0 and treat this // as a successful no-op. if r.Length() == 0 { return nil } // Do a first-pass check. 
We *could* hold onto the segments we checked // if canLock would return true, but traversing the segment set should // be fast and this keeps things simple. if !l.canLock(uid, t, r) { return linuxerr.ErrWouldBlock } // Get our starting point. seg, gap := l.Find(r.Start) if gap.Ok() { // Fill in the gap and get the next segment to modify. seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, ownerPID, t, ofd)).NextSegment() } else if seg.Start() < r.Start { // Get our first segment to modify. _, seg = l.Split(seg, r.Start) } for seg.Ok() && seg.Start() < r.End { // Split the last one if necessary. if seg.End() > r.End { seg, _ = l.SplitUnchecked(seg, r.End) } // Set the lock on the segment. This is guaranteed to // always be safe, given canLock above. value := seg.ValuePtr() value.lock(uid, ownerPID, t, ofd) // Fill subsequent gaps. gap = seg.NextGap() if gr := gap.Range().Intersect(r); gr.Length() > 0 { seg = l.Insert(gap, gr, makeLock(uid, ownerPID, t, ofd)).NextSegment() } else { seg = gap.NextSegment() } } return nil } // unlock is always successful. If uid has no locks held for the range LockRange, // unlock is a no-op. // // Preconditions: same as lock. func (l *LockSet) unlock(uid UniqueID, r LockRange) { if r.Start > r.End { panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End)) } // Same as setlock. if r.Length() == 0 { return } // Get our starting point. seg := l.LowerBoundSegment(r.Start) for seg.Ok() && seg.Start() < r.End { // If this segment doesn't have a lock from uid then // there is no need to fragment the set with Isolate (below). // In this case just move on to the next segment. if !seg.Value().isHeld(uid) { seg = seg.NextSegment() continue } // Ensure that if we need to unlock a sub-segment that // we don't unlock/remove that entire segment. seg = l.Isolate(seg, r) value := seg.Value() var remove bool if value.Writer == uid { // If we are unlocking a writer, then since there can // only ever be one writer and no readers, then this // lock should always be removed from the set. remove = true } else if _, ok := value.Readers[uid]; ok { // If uid is the last reader, then just remove the entire // segment. if len(value.Readers) == 1 { remove = true } else { // Otherwise we need to remove this reader without // affecting any other segment's readers. To do // this, we need to make a copy of the Readers map // and not add this uid. newValue := Lock{Readers: make(map[UniqueID]OwnerInfo)} for k, v := range value.Readers { if k != uid { newValue.Readers[k] = v } } seg.SetValue(newValue) } } if remove { seg = l.Remove(seg).NextSegment() } else { seg = seg.NextSegment() } } } // ComputeRange takes a positive file offset and computes the start of a LockRange // using start (relative to offset) and the end of the LockRange using length. The // values of start and length may be negative but the resulting LockRange must // preserve that LockRange.Start < LockRange.End and LockRange.Start > 0. func ComputeRange(start, length, offset int64) (LockRange, error) { offset += start // fcntl(2): "l_start can be a negative number provided the offset // does not lie before the start of the file" if offset < 0 { return LockRange{}, unix.EINVAL } // fcntl(2): Specifying 0 for l_len has the special meaning: lock all // bytes starting at the location specified by l_whence and l_start // through to the end of file, no matter how large the file grows. 
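// For example, start=0, length=0 and offset=0 yield LockRange{0, LockEOF},
// i.e. the whole-file range used to represent BSD-style full file locks.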
end := uint64(LockEOF) if length > 0 { // fcntl(2): If l_len is positive, then the range to be locked // covers bytes l_start up to and including l_start+l_len-1. // // Since LockRange.End is exclusive we need not -1 from length.. end = uint64(offset + length) } else if length < 0 { // fcntl(2): If l_len is negative, the interval described by // lock covers bytes l_start+l_len up to and including l_start-1. // // Since LockRange.End is exclusive we need not -1 from offset. signedEnd := offset // Add to offset using a negative length (subtract). offset += length if offset < 0 { return LockRange{}, unix.EINVAL } if signedEnd < offset { return LockRange{}, unix.EOVERFLOW } // At this point signedEnd cannot be negative, // since we asserted that offset is not negative // and it is not less than offset. end = uint64(signedEnd) } // Offset is guaranteed to be positive at this point. return LockRange{Start: uint64(offset), End: end}, nil } // TestRegion checks whether the lock holder identified by uid can hold a lock // of type t on range r. It returns a Flock struct representing this // information as the F_GETLK fcntl does. // // Note that the PID returned in the flock structure is relative to the root PID // namespace. It needs to be converted to the caller's PID namespace before // returning to userspace. func (l *Locks) TestRegion(ctx context.Context, uid UniqueID, t LockType, r LockRange, ofd bool) linux.Flock { f := linux.Flock{Type: linux.F_UNLCK} switch t { case ReadLock: l.testRegion(r, func(lock Lock, start, length uint64) bool { if lock.Writer == nil || lock.Writer == uid { return true } f.Type = linux.F_WRLCK f.PID = lock.WriterInfo.PID f.Start = int64(start) f.Len = int64(length) return false }) case WriteLock: l.testRegion(r, func(lock Lock, start, length uint64) bool { if lock.Writer == nil { for k, v := range lock.Readers { if k != uid && v.OFD == ofd { // Stop at the first conflict detected. f.Type = linux.F_RDLCK f.PID = v.PID f.Start = int64(start) f.Len = int64(length) return false } } return true } if lock.Writer == uid { return true } f.Type = linux.F_WRLCK f.PID = lock.WriterInfo.PID f.Start = int64(start) f.Len = int64(length) return false }) default: panic(fmt.Sprintf("TestRegion: invalid lock type %d", t)) } return f } func (l *Locks) testRegion(r LockRange, check func(lock Lock, start, length uint64) bool) { l.mu.Lock() defer l.mu.Unlock() seg := l.locks.LowerBoundSegment(r.Start) for seg.Ok() && seg.Start() < r.End { lock := seg.Value() if !check(lock, seg.Start(), seg.End()-seg.Start()) { // Stop at the first conflict detected. return } seg = seg.NextSegment() } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/lock/lock_range.go000066400000000000000000000033461465435605700256330ustar00rootroot00000000000000package lock // A Range represents a contiguous range of T. // // +stateify savable type LockRange struct { // Start is the inclusive start of the range. Start uint64 // End is the exclusive end of the range. End uint64 } // WellFormed returns true if r.Start <= r.End. All other methods on a Range // require that the Range is well-formed. // //go:nosplit func (r LockRange) WellFormed() bool { return r.Start <= r.End } // Length returns the length of the range. // //go:nosplit func (r LockRange) Length() uint64 { return r.End - r.Start } // Contains returns true if r contains x. // //go:nosplit func (r LockRange) Contains(x uint64) bool { return r.Start <= x && x < r.End } // Overlaps returns true if r and r2 overlap. 
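// For example, [0, 10) and [5, 15) overlap, while [0, 10) and [10, 20) do
// not, since End is exclusive.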
// //go:nosplit func (r LockRange) Overlaps(r2 LockRange) bool { return r.Start < r2.End && r2.Start < r.End } // IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is // contained within r. // //go:nosplit func (r LockRange) IsSupersetOf(r2 LockRange) bool { return r.Start <= r2.Start && r.End >= r2.End } // Intersect returns a range consisting of the intersection between r and r2. // If r and r2 do not overlap, Intersect returns a range with unspecified // bounds, but for which Length() == 0. // //go:nosplit func (r LockRange) Intersect(r2 LockRange) LockRange { if r.Start < r2.Start { r.Start = r2.Start } if r.End > r2.End { r.End = r2.End } if r.End < r.Start { r.End = r.Start } return r } // CanSplitAt returns true if it is legal to split a segment spanning the range // r at x; that is, splitting at x would produce two ranges, both of which have // non-zero length. // //go:nosplit func (r LockRange) CanSplitAt(x uint64) bool { return r.Contains(x) && r.Start < x } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/lock/lock_set.go000066400000000000000000002002421465435605700253240ustar00rootroot00000000000000package lock import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const LocktrackGaps = 0 var _ = uint8(LocktrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type LockdynamicGap [LocktrackGaps]uint64 // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *LockdynamicGap) Get() uint64 { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *LockdynamicGap) Set(v uint64) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. LockminDegree = 3 LockmaxDegree = 2 * LockminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type LockSet struct { root Locknode `state:".([]LockFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *LockSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *LockSet) IsEmptyRange(r LockRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. 
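// For example, a set holding the segments [0, 5) and [10, 20) has
// Span() == 15.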
func (s *LockSet) Span() uint64 { var sz uint64 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *LockSet) SpanRange(r LockRange) uint64 { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uint64 for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *LockSet) FirstSegment() LockIterator { if s.root.nrSegments == 0 { return LockIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *LockSet) LastSegment() LockIterator { if s.root.nrSegments == 0 { return LockIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *LockSet) FirstGap() LockGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return LockGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *LockSet) LastGap() LockGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return LockGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *LockSet) Find(key uint64) (LockIterator, LockGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return LockIterator{n, i}, LockGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return LockIterator{}, LockGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *LockSet) FindSegment(key uint64) LockIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *LockSet) LowerBoundSegment(min uint64) LockIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *LockSet) UpperBoundSegment(max uint64) LockIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *LockSet) FindGap(key uint64) LockGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. 
func (s *LockSet) LowerBoundGap(min uint64) LockGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *LockSet) UpperBoundGap(max uint64) LockGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *LockSet) FirstLargeEnoughGap(minSize uint64) LockGapIterator { if LocktrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *LockSet) LastLargeEnoughGap(minSize uint64) LockGapIterator { if LocktrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *LockSet) LowerBoundLargeEnoughGap(min, minSize uint64) LockGapIterator { if LocktrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *LockSet) UpperBoundLargeEnoughGap(max, minSize uint64) LockGapIterator { if LocktrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. 
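// An illustrative sketch (r and val are placeholders for the caller's range
// and lock value):
//
//	if seg, gap := s.Find(r.Start); !seg.Ok() && gap.Range().IsSupersetOf(r) {
//		seg = s.Insert(gap, r, val) // val may be merged with its neighbors.
//	}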
func (s *LockSet) Insert(gap LockGapIterator, r LockRange, val Lock) LockIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (lockSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := LocktrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (lockSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (lockSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := LocktrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *LockSet) InsertWithoutMerging(gap LockGapIterator, r LockRange, val Lock) LockIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *LockSet) InsertWithoutMergingUnchecked(gap LockGapIterator, r LockRange, val Lock) LockIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := LocktrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return LockIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. // // InsertRange searches the set to find the gap to insert into. 
If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *LockSet) InsertRange(r LockRange, val Lock) LockIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *LockSet) InsertWithoutMergingRange(r LockRange, val Lock) LockIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *LockSet) TryInsertRange(r LockRange, val Lock) LockIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return LockIterator{} } if gap.End() < r.End { return LockIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. 
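// An illustrative sketch:
//
//	if seg := s.TryInsertWithoutMergingRange(r, val); !seg.Ok() {
//		// r overlapped an existing segment; nothing was inserted.
//	}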
func (s *LockSet) TryInsertWithoutMergingRange(r LockRange, val Lock) LockIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return LockIterator{} } if gap.End() < r.End { return LockIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *LockSet) Remove(seg LockIterator) LockGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if LocktrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) lockSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if LocktrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(LockGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *LockSet) RemoveAll() { s.root = Locknode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *LockSet) RemoveRange(r LockRange) LockGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *LockSet) RemoveFullRange(r LockRange) LockGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *LockSet) Merge(first, second LockIterator) LockIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. Otherwise, MergeUnchecked returns a terminal // iterator. 
// // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *LockSet) MergeUnchecked(first, second LockIterator) LockIterator { if first.End() == second.Start() { if mval, ok := (lockSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return LockIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *LockSet) MergePrev(seg LockIterator) LockIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *LockSet) MergeNext(seg LockIterator) LockIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *LockSet) Unisolate(seg LockIterator) LockIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *LockSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. 
// // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *LockSet) MergeInsideRange(r LockRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *LockSet) MergeOutsideRange(r LockRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *LockSet) Split(seg LockIterator, split uint64) (LockIterator, LockIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *LockSet) SplitUnchecked(seg LockIterator, split uint64) (LockIterator, LockIterator) { val1, val2 := (lockSetFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), LockRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter; i.e. SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first. // // Preconditions: start < seg.End(). 
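// An illustrative sketch of the increasing-key mutation pattern described
// above (r is the range being mutated):
//
//	for seg := s.LowerBoundSegmentSplitBefore(r.Start); seg.Ok() && seg.Start() < r.End; {
//		seg = s.SplitAfter(seg, r.End)
//		// ... mutate seg.ValuePtr() ...
//		seg = s.MergePrev(seg).NextSegment()
//	}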
func (s *LockSet) SplitBefore(seg LockIterator, start uint64) LockIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *LockSet) SplitAfter(seg LockIterator, end uint64) LockIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *LockSet) Isolate(seg LockIterator, r LockRange) LockIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *LockSet) LowerBoundSegmentSplitBefore(min uint64) LockIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *LockSet) UpperBoundSegmentSplitAfter(max uint64) LockIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. 
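// Illustrative sketch (not part of the generated set): the increasing-key
// mutation pattern that the comments above describe, and that MutateRange
// (defined below) wraps. The function and its mutate callback are hypothetical.
func exampleMutateLockRange(s *LockSet, r LockRange, mutate func(*Lock)) {
	// Bound the first overlapping segment at r.Start so that only keys inside
	// r are touched.
	seg := s.LowerBoundSegmentSplitBefore(r.Start)
	for seg.Ok() && seg.Start() < r.End {
		// Bound the segment at r.End before mutating it.
		seg = s.SplitAfter(seg, r.End)
		mutate(seg.ValuePtr())
		// Merge only with the previously-mutated predecessor; merging with the
		// unmutated successor here would cause it to be skipped on the next
		// iteration.
		seg = s.MergePrev(seg)
		seg = seg.NextSegment()
	}
	// The first segment at or beyond r.End, if any, may now merge with its
	// mutated predecessor.
	if seg.Ok() {
		s.MergePrev(seg)
	}
}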
func (s *LockSet) VisitRange(r LockRange, f func(seg LockIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *LockSet) VisitFullRange(r LockRange, f func(seg LockIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *LockSet) MutateRange(r LockRange, f func(seg LockIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *LockSet) MutateFullRange(r LockRange, f func(seg LockIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type Locknode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *Locknode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. maxGap LockdynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). 
keys [LockmaxDegree - 1]LockRange values [LockmaxDegree - 1]Lock children [LockmaxDegree]*Locknode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *Locknode) firstSegment() LockIterator { for n.hasChildren { n = n.children[0] } return LockIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *Locknode) lastSegment() LockIterator { for n.hasChildren { n = n.children[n.nrSegments] } return LockIterator{n, n.nrSegments - 1} } func (n *Locknode) prevSibling() *Locknode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *Locknode) nextSibling() *Locknode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *Locknode) rebalanceBeforeInsert(gap LockGapIterator) LockGapIterator { if n.nrSegments < LockmaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &Locknode{ nrSegments: LockminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &Locknode{ nrSegments: LockminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:LockminDegree-1], n.keys[:LockminDegree-1]) copy(left.values[:LockminDegree-1], n.values[:LockminDegree-1]) copy(right.keys[:LockminDegree-1], n.keys[LockminDegree:]) copy(right.values[:LockminDegree-1], n.values[LockminDegree:]) n.keys[0], n.values[0] = n.keys[LockminDegree-1], n.values[LockminDegree-1] LockzeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:LockminDegree], n.children[:LockminDegree]) copy(right.children[:LockminDegree], n.children[LockminDegree:]) LockzeroNodeSlice(n.children[2:]) for i := 0; i < LockminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if LocktrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < LockminDegree { return LockGapIterator{left, gap.index} } return LockGapIterator{right, gap.index - LockminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[LockminDegree-1], n.values[LockminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &Locknode{ nrSegments: LockminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ copy(sibling.keys[:LockminDegree-1], n.keys[LockminDegree:]) copy(sibling.values[:LockminDegree-1], n.values[LockminDegree:]) LockzeroValueSlice(n.values[LockminDegree-1:]) if n.hasChildren { copy(sibling.children[:LockminDegree], n.children[LockminDegree:]) LockzeroNodeSlice(n.children[LockminDegree:]) for i := 0; i < LockminDegree; i++ { 
sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = LockminDegree - 1 if LocktrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < LockminDegree { return gap } return LockGapIterator{sibling, gap.index - LockminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *Locknode) rebalanceAfterRemove(gap LockGapIterator) LockGapIterator { for { if n.nrSegments >= LockminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= LockminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] lockSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if LocktrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return LockGapIterator{n, 0} } if gap.node == n { return LockGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= LockminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) lockSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if LocktrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return LockGapIterator{n, n.nrSegments} } return LockGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], 
left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return LockGapIterator{p, gap.index} } if gap.node == right { return LockGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *Locknode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = LockGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) lockSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if LocktrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *Locknode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *Locknode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. func (n *Locknode) calculateMaxGapLeaf() uint64 { max := LockGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (LockGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. 
func (n *Locknode) calculateMaxGapInternal() uint64 { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *Locknode) searchFirstLargeEnoughGap(minSize uint64) LockGapIterator { if n.maxGap.Get() < minSize { return LockGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := LockGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *Locknode) searchLastLargeEnoughGap(minSize uint64) LockGapIterator { if n.maxGap.Get() < minSize { return LockGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := LockGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type LockIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *Locknode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg LockIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg LockIterator) Range() LockRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg LockIterator) Start() uint64 { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg LockIterator) End() uint64 { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. // - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg LockIterator) SetRangeUnchecked(r LockRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. 
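// Illustrative sketch (not part of the generated set): basic read-only use of
// iterators. FindSegment returns a terminal iterator when no segment contains
// the key, so Ok must be checked before any other method. The helper name is
// hypothetical.
func exampleDescribeKey(s *LockSet, key uint64) (LockRange, bool) {
	seg := s.FindSegment(key)
	if !seg.Ok() {
		return LockRange{}, false
	}
	// Start()/End() are cheaper than Range() when only one bound is needed.
	return LockRange{seg.Start(), seg.End()}, true
}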
func (seg LockIterator) SetRange(r LockRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg LockIterator) SetStartUnchecked(start uint64) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg LockIterator) SetStart(start uint64) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg LockIterator) SetEndUnchecked(end uint64) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg LockIterator) SetEnd(end uint64) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg LockIterator) Value() Lock { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg LockIterator) ValuePtr() *Lock { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. func (seg LockIterator) SetValue(val Lock) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. func (seg LockIterator) PrevSegment() LockIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return LockIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return LockIterator{} } return LocksegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. 
If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg LockIterator) NextSegment() LockIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return LockIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return LockIterator{} } return LocksegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg LockIterator) PrevGap() LockGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return LockGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg LockIterator) NextGap() LockGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return LockGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg LockIterator) PrevNonEmpty() (LockIterator, LockGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, LockGapIterator{} } return LockIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg LockIterator) NextNonEmpty() (LockIterator, LockGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, LockGapIterator{} } return LockIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type LockGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *Locknode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (gap LockGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap LockGapIterator) Range() LockRange { return LockRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. 
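// Illustrative sketch (not part of the generated set): walking gaps to find the
// first one of at least minSize without relying on gap tracking
// (NextLargeEnoughGap, defined below, requires the set to be generated with
// trackGaps=1 and panics otherwise). The helper name is hypothetical.
func exampleFirstFreeRange(s *LockSet, minSize uint64) (LockRange, bool) {
	for gap := s.FirstGap(); gap.Ok(); gap = gap.NextGap() {
		// Gaps between adjacent segments exist but have zero length; they are
		// skipped by the length check.
		if gap.Range().Length() >= minSize {
			return LockRange{gap.Start(), gap.Start() + minSize}, true
		}
	}
	return LockRange{}, false
}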
func (gap LockGapIterator) Start() uint64 { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return lockSetFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap LockGapIterator) End() uint64 { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return lockSetFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap LockGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap LockGapIterator) PrevSegment() LockIterator { return LocksegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap LockGapIterator) NextSegment() LockIterator { return LocksegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap LockGapIterator) PrevGap() LockGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return LockGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap LockGapIterator) NextGap() LockGapIterator { seg := gap.NextSegment() if !seg.Ok() { return LockGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap LockGapIterator) NextLargeEnoughGap(minSize uint64) LockGapIterator { if LocktrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. func (gap LockGapIterator) nextLargeEnoughGapHelper(minSize uint64) LockGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return LockGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. 
func (gap LockGapIterator) PrevLargeEnoughGap(minSize uint64) LockGapIterator { if LocktrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap LockGapIterator) prevLargeEnoughGapHelper(minSize uint64) LockGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return LockGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func LocksegmentBeforePosition(n *Locknode, i int) LockIterator { for i == 0 { if n.parent == nil { return LockIterator{} } n, i = n.parent, n.parentIndex } return LockIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func LocksegmentAfterPosition(n *Locknode, i int) LockIterator { for i == n.nrSegments { if n.parent == nil { return LockIterator{} } n, i = n.parent, n.parentIndex } return LockIterator{n, i} } func LockzeroValueSlice(slice []Lock) { for i := range slice { lockSetFunctions{}.ClearValue(&slice[i]) } } func LockzeroNodeSlice(slice []*Locknode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. func (s *LockSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. 
func (n *Locknode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *Locknode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if LocktrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type LockFlatSegment struct { Start uint64 End uint64 Value Lock } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *LockSet) ExportSlice() []LockFlatSegment { var fs []LockFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, LockFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *LockSet) ImportSlice(fs []LockFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := LockRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
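// Illustrative sketch (not part of the generated set): copying one set into
// another via the flat representation used for save/restore. ImportSlice
// requires the destination to be empty and the slice to be sorted and
// non-overlapping, which ExportSlice guarantees. The helper name is
// hypothetical; it assumes the zero value of LockSet is a valid empty set.
func exampleCopySet(src *LockSet) (*LockSet, error) {
	var dst LockSet
	if err := dst.ImportSlice(src.ExportSlice()); err != nil {
		return nil, err
	}
	return &dst, nil
}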
func (s *LockSet) segmentTestCheck(expectedSegments int, segFunc func(int, LockRange, Lock) error) error { havePrev := false prev := uint64(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *LockSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *LockSet) saveRoot() []LockFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *LockSet) loadRoot(_ context.Context, fs []LockFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/lock/lock_set_functions.go000066400000000000000000000032331465435605700274150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package lock import ( "math" ) // LockSet maps a set of Locks into a file. The key is the file offset. type lockSetFunctions struct{} func (lockSetFunctions) MinKey() uint64 { return 0 } func (lockSetFunctions) MaxKey() uint64 { return math.MaxUint64 } func (lockSetFunctions) ClearValue(l *Lock) { *l = Lock{} } func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock) (Lock, bool) { // Merge only if the Readers/Writers are identical. if len(val1.Readers) != len(val2.Readers) { return Lock{}, false } for k := range val1.Readers { if _, ok := val2.Readers[k]; !ok { return Lock{}, false } } if val1.Writer != val2.Writer { return Lock{}, false } return val1, true } func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock) { // Copy the segment so that split segments don't contain map references // to other segments. val0 := Lock{Readers: make(map[UniqueID]OwnerInfo)} for k, v := range val.Readers { val0.Readers[k] = v } val0.Writer = val.Writer val0.WriterInfo = val.WriterInfo return val, val0 } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/lock/lock_state_autogen.go000066400000000000000000000125631465435605700274020ustar00rootroot00000000000000// automatically generated by stateify. 
package lock import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (o *OwnerInfo) StateTypeName() string { return "pkg/sentry/fsimpl/lock.OwnerInfo" } func (o *OwnerInfo) StateFields() []string { return []string{ "PID", "OFD", } } func (o *OwnerInfo) beforeSave() {} // +checklocksignore func (o *OwnerInfo) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.PID) stateSinkObject.Save(1, &o.OFD) } func (o *OwnerInfo) afterLoad(context.Context) {} // +checklocksignore func (o *OwnerInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.PID) stateSourceObject.Load(1, &o.OFD) } func (l *Lock) StateTypeName() string { return "pkg/sentry/fsimpl/lock.Lock" } func (l *Lock) StateFields() []string { return []string{ "Readers", "Writer", "WriterInfo", } } func (l *Lock) beforeSave() {} // +checklocksignore func (l *Lock) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.Readers) stateSinkObject.Save(1, &l.Writer) stateSinkObject.Save(2, &l.WriterInfo) } func (l *Lock) afterLoad(context.Context) {} // +checklocksignore func (l *Lock) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.Readers) stateSourceObject.Load(1, &l.Writer) stateSourceObject.Load(2, &l.WriterInfo) } func (l *Locks) StateTypeName() string { return "pkg/sentry/fsimpl/lock.Locks" } func (l *Locks) StateFields() []string { return []string{ "locks", "blockedQueue", } } func (l *Locks) beforeSave() {} // +checklocksignore func (l *Locks) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.locks) stateSinkObject.Save(1, &l.blockedQueue) } func (l *Locks) afterLoad(context.Context) {} // +checklocksignore func (l *Locks) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.locks) stateSourceObject.Load(1, &l.blockedQueue) } func (r *LockRange) StateTypeName() string { return "pkg/sentry/fsimpl/lock.LockRange" } func (r *LockRange) StateFields() []string { return []string{ "Start", "End", } } func (r *LockRange) beforeSave() {} // +checklocksignore func (r *LockRange) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Start) stateSinkObject.Save(1, &r.End) } func (r *LockRange) afterLoad(context.Context) {} // +checklocksignore func (r *LockRange) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Start) stateSourceObject.Load(1, &r.End) } func (s *LockSet) StateTypeName() string { return "pkg/sentry/fsimpl/lock.LockSet" } func (s *LockSet) StateFields() []string { return []string{ "root", } } func (s *LockSet) beforeSave() {} // +checklocksignore func (s *LockSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []LockFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *LockSet) afterLoad(context.Context) {} // +checklocksignore func (s *LockSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]LockFlatSegment), func(y any) { s.loadRoot(ctx, y.([]LockFlatSegment)) }) } func (n *Locknode) StateTypeName() string { return "pkg/sentry/fsimpl/lock.Locknode" } func (n *Locknode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *Locknode) beforeSave() {} // +checklocksignore func (n *Locknode) StateSave(stateSinkObject state.Sink) { n.beforeSave() 
stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *Locknode) afterLoad(context.Context) {} // +checklocksignore func (n *Locknode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (l *LockFlatSegment) StateTypeName() string { return "pkg/sentry/fsimpl/lock.LockFlatSegment" } func (l *LockFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (l *LockFlatSegment) beforeSave() {} // +checklocksignore func (l *LockFlatSegment) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.Start) stateSinkObject.Save(1, &l.End) stateSinkObject.Save(2, &l.Value) } func (l *LockFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (l *LockFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.Start) stateSourceObject.Load(1, &l.End) stateSourceObject.Load(2, &l.Value) } func init() { state.Register((*OwnerInfo)(nil)) state.Register((*Lock)(nil)) state.Register((*Locks)(nil)) state.Register((*LockRange)(nil)) state.Register((*LockSet)(nil)) state.Register((*Locknode)(nil)) state.Register((*LockFlatSegment)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/mqfs/000077500000000000000000000000001465435605700232105ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/mqfs/mqfs.go000066400000000000000000000070321465435605700245070ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package mqfs provides a filesystem implementation to back POSIX message // queues. package mqfs import ( "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" "gvisor.dev/gvisor/pkg/sentry/kernel/mq" "gvisor.dev/gvisor/pkg/sentry/vfs" ) const ( // Name is the user-visible filesystem name. Name = "mqueue" ) // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // Release implements vfs.FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // mqfs is initialized only once per ipc namespace. Each ipc namespace has // a POSIX message registry with a root dentry, filesystem, and a // disconnected mount. We want the fs to be consistent for all processes in // the same ipc namespace, so instead of creating a new fs and root dentry, // we retrieve them using IPCNamespace.PosixQueues and use them. i := ipcNamespaceFromContext(ctx) if i == nil { return nil, nil, fmt.Errorf("mqfs.FilesystemType.GetFilesystem: ipc namespace doesn't exist") } defer i.DecRef(ctx) registry := i.PosixQueues() if registry == nil { return nil, nil, fmt.Errorf("mqfs.FilesystemType.GetFilesystem: ipc namespace doesn't have a POSIX registry") } impl := registry.Impl().(*RegistryImpl) impl.fs.VFSFilesystem().IncRef() impl.root.IncRef() return impl.fs.VFSFilesystem(), impl.root.VFSDentry(), nil } // filesystem implements kernfs.Filesystem. // // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return "" } // ipcNamespace defines functions we need from kernel.IPCNamespace. We redefine // ipcNamespace along with ipcNamespaceFromContext to avoid circular dependency // with package sentry/kernel. type ipcNamespace interface { // PosixQueues returns a POSIX message queue registry. PosixQueues() *mq.Registry // DecRef decrements ipcNamespace's number of references. DecRef(ctx context.Context) } // ipcNamespaceFromContext returns the IPC namespace in which ctx is executing. // Copied from package sentry/kernel. func ipcNamespaceFromContext(ctx context.Context) ipcNamespace { if v := ctx.Value(ipc.CtxIPCNamespace); v != nil { return v.(ipcNamespace) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/mqfs/mqfs_state_autogen.go000066400000000000000000000140271465435605700274330ustar00rootroot00000000000000// automatically generated by stateify. 
package mqfs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (ft *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/mqfs.FilesystemType" } func (ft *FilesystemType) StateFields() []string { return []string{} } func (ft *FilesystemType) beforeSave() {} // +checklocksignore func (ft *FilesystemType) StateSave(stateSinkObject state.Sink) { ft.beforeSave() } func (ft *FilesystemType) afterLoad(context.Context) {} // +checklocksignore func (ft *FilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/mqfs.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "Filesystem", "devMinor", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.Filesystem) stateSinkObject.Save(1, &fs.devMinor) } func (fs *filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.Filesystem) stateSourceObject.Load(1, &fs.devMinor) } func (q *queueInode) StateTypeName() string { return "pkg/sentry/fsimpl/mqfs.queueInode" } func (q *queueInode) StateFields() []string { return []string{ "DynamicBytesFile", "queue", } } func (q *queueInode) beforeSave() {} // +checklocksignore func (q *queueInode) StateSave(stateSinkObject state.Sink) { q.beforeSave() stateSinkObject.Save(0, &q.DynamicBytesFile) stateSinkObject.Save(1, &q.queue) } func (q *queueInode) afterLoad(context.Context) {} // +checklocksignore func (q *queueInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &q.DynamicBytesFile) stateSourceObject.Load(1, &q.queue) } func (fd *queueFD) StateTypeName() string { return "pkg/sentry/fsimpl/mqfs.queueFD" } func (fd *queueFD) StateFields() []string { return []string{ "FileDescriptionDefaultImpl", "DynamicBytesFileDescriptionImpl", "LockFD", "vfsfd", "inode", "queue", } } func (fd *queueFD) beforeSave() {} // +checklocksignore func (fd *queueFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(1, &fd.DynamicBytesFileDescriptionImpl) stateSinkObject.Save(2, &fd.LockFD) stateSinkObject.Save(3, &fd.vfsfd) stateSinkObject.Save(4, &fd.inode) stateSinkObject.Save(5, &fd.queue) } func (fd *queueFD) afterLoad(context.Context) {} // +checklocksignore func (fd *queueFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(1, &fd.DynamicBytesFileDescriptionImpl) stateSourceObject.Load(2, &fd.LockFD) stateSourceObject.Load(3, &fd.vfsfd) stateSourceObject.Load(4, &fd.inode) stateSourceObject.Load(5, &fd.queue) } func (r *RegistryImpl) StateTypeName() string { return "pkg/sentry/fsimpl/mqfs.RegistryImpl" } func (r *RegistryImpl) StateFields() []string { return []string{ "root", "fs", "mount", } } func (r *RegistryImpl) beforeSave() {} // +checklocksignore func (r *RegistryImpl) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.root) stateSinkObject.Save(1, &r.fs) stateSinkObject.Save(2, &r.mount) } func (r *RegistryImpl) afterLoad(context.Context) {} // +checklocksignore func (r *RegistryImpl) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.root) 
stateSourceObject.Load(1, &r.fs) stateSourceObject.Load(2, &r.mount) } func (i *rootInode) StateTypeName() string { return "pkg/sentry/fsimpl/mqfs.rootInode" } func (i *rootInode) StateFields() []string { return []string{ "rootInodeRefs", "InodeAlwaysValid", "InodeAnonymous", "InodeAttrs", "InodeDirectoryNoNewChildren", "InodeNotSymlink", "InodeTemporary", "InodeWatches", "OrderedChildren", "locks", } } func (i *rootInode) beforeSave() {} // +checklocksignore func (i *rootInode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.rootInodeRefs) stateSinkObject.Save(1, &i.InodeAlwaysValid) stateSinkObject.Save(2, &i.InodeAnonymous) stateSinkObject.Save(3, &i.InodeAttrs) stateSinkObject.Save(4, &i.InodeDirectoryNoNewChildren) stateSinkObject.Save(5, &i.InodeNotSymlink) stateSinkObject.Save(6, &i.InodeTemporary) stateSinkObject.Save(7, &i.InodeWatches) stateSinkObject.Save(8, &i.OrderedChildren) stateSinkObject.Save(9, &i.locks) } func (i *rootInode) afterLoad(context.Context) {} // +checklocksignore func (i *rootInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.rootInodeRefs) stateSourceObject.Load(1, &i.InodeAlwaysValid) stateSourceObject.Load(2, &i.InodeAnonymous) stateSourceObject.Load(3, &i.InodeAttrs) stateSourceObject.Load(4, &i.InodeDirectoryNoNewChildren) stateSourceObject.Load(5, &i.InodeNotSymlink) stateSourceObject.Load(6, &i.InodeTemporary) stateSourceObject.Load(7, &i.InodeWatches) stateSourceObject.Load(8, &i.OrderedChildren) stateSourceObject.Load(9, &i.locks) } func (r *rootInodeRefs) StateTypeName() string { return "pkg/sentry/fsimpl/mqfs.rootInodeRefs" } func (r *rootInodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *rootInodeRefs) beforeSave() {} // +checklocksignore func (r *rootInodeRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *rootInodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func init() { state.Register((*FilesystemType)(nil)) state.Register((*filesystem)(nil)) state.Register((*queueInode)(nil)) state.Register((*queueFD)(nil)) state.Register((*RegistryImpl)(nil)) state.Register((*rootInode)(nil)) state.Register((*rootInodeRefs)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/mqfs/queue.go000066400000000000000000000121131465435605700246610ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package mqfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/mq" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // queueInode represents an inode for a message queue (/dev/mqueue/[name]). // // +stateify savable type queueInode struct { kernfs.DynamicBytesFile // queue is the message queue backing this inode. queue *mq.Queue } var _ kernfs.Inode = (*queueInode)(nil) // newQueueInode returns a new, initialized queueInode. func (fs *filesystem) newQueueInode(ctx context.Context, creds *auth.Credentials, q *mq.Queue, perm linux.FileMode) kernfs.Inode { inode := &queueInode{queue: q} inode.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), q, perm) return inode } // Keep implements kernfs.Inode.Keep. func (q *queueInode) Keep() bool { // Return true so that the fs keeps newly created dentries. This is done // because inodes returned by root.Lookup are not temporary, they exist // in the fs, and refer to message queues. return true } // queueFD implements vfs.FileDescriptionImpl for FD backed by a POSIX message // queue. It's mostly similar to DynamicBytesFD, but implements more operations. // // +stateify savable type queueFD struct { vfs.FileDescriptionDefaultImpl vfs.DynamicBytesFileDescriptionImpl vfs.LockFD vfsfd vfs.FileDescription inode kernfs.Inode // queue is a view into the queue backing this fd. queue mq.View } // Init initializes a queueFD. Mostly copied from DynamicBytesFD.Init, but uses // the queueFD as FileDescriptionImpl. func (fd *queueFD) Init(m *vfs.Mount, d *kernfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { fd.LockFD.Init(locks) if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return err } fd.inode = d.Inode() fd.DynamicBytesFileDescriptionImpl.Init(&fd.vfsfd, data) return nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *queueFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence) } // Read implements vfs.FileDescriptionImpl.Read. func (fd *queueFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts) } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *queueFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts) } // Write implements vfs.FileDescriptionImpl.Write. func (fd *queueFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.Write(ctx, src, opts) } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *queueFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.PWrite(ctx, src, offset, opts) } // Release implements vfs.FileDescriptionImpl.Release. func (fd *queueFD) Release(context.Context) {} // Stat implements vfs.FileDescriptionImpl.Stat. 
func (fd *queueFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() return fd.inode.Stat(ctx, fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *queueFD) SetStat(context.Context, vfs.SetStatOptions) error { // DynamicBytesFiles are immutable. return linuxerr.EPERM } // OnClose implements FileDescriptionImpl.OnClose similar to // ipc/mqueue.c::mqueue_flush_file. func (fd *queueFD) OnClose(ctx context.Context) error { fd.queue.Flush(ctx) return nil } // Readiness implements waiter.Waitable.Readiness similar to // ipc/mqueue.c::mqueue_poll_file. func (fd *queueFD) Readiness(mask waiter.EventMask) waiter.EventMask { return fd.queue.Readiness(mask) } // EventRegister implements Waitable.EventRegister. func (fd *queueFD) EventRegister(e *waiter.Entry) error { return fd.queue.EventRegister(e) } // EventUnregister implements Waitable.EventUnregister. func (fd *queueFD) EventUnregister(e *waiter.Entry) { fd.queue.EventUnregister(e) } // Epollable implements FileDescriptionImpl.Epollable. func (fd *queueFD) Epollable() bool { return true } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/mqfs/registry.go000066400000000000000000000124451465435605700254150ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mqfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/mq" "gvisor.dev/gvisor/pkg/sentry/vfs" ) const ( maxCachedDentries = 1000 ) // RegistryImpl implements mq.RegistryImpl. It implements the interface using // the message queue filesystem, and is provided to mq.Registry at // initialization. // // RegistryImpl is not thread-safe, so it is the responsibility of the user // (the containing mq.Registry) to protect using a lock. // // +stateify savable type RegistryImpl struct { // root is the root dentry of the mq filesystem. Its main usage is to // retrieve the root inode, which we use to add, remove, and lookup message // queues. // // We hold a reference on root and release when the registry is destroyed. root *kernfs.Dentry // fs is the filesystem backing this registry, used mainly to initialize // new inodes. fs *filesystem // mount is the mount point used for this filesystem. mount *vfs.Mount } // NewRegistryImpl returns a new, initialized RegistryImpl, and takes a // reference on root. 
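// Illustrative sketch (hypothetical caller; the names reg, "queue0" and the
// literal flag values are assumptions, not part of this package): how the
// (*vfs.FileDescription, bool, error) triple returned by RegistryImpl.Get
// below is typically interpreted:
//
//	fd, ok, err := reg.Get(ctx, "queue0", mq.ReadWrite, true /* block */, 0 /* flags */)
//	switch {
//	case err != nil:
//		// The queue exists but cannot be opened (for example EACCES).
//	case !ok:
//		// No queue with this name exists; the caller may create one via New.
//	default:
//		defer fd.DecRef(ctx)
//		// Use fd.
//	}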
func NewRegistryImpl(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*RegistryImpl, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, err } fs := &filesystem{ devMinor: devMinor, Filesystem: kernfs.Filesystem{MaxCachedDentries: maxCachedDentries}, } fs.VFSFilesystem().Init(vfsObj, &FilesystemType{}, fs) vfsfs := fs.VFSFilesystem() // NewDisconnectedMount will obtain a ref on dentry and vfsfs which is // transferred to mount. vfsfs was initiated with 1 ref already. So get rid // of the extra ref. defer vfsfs.DecRef(ctx) // dentry is initialized with 1 ref which is transferred to fs. var dentry kernfs.Dentry dentry.InitRoot(&fs.Filesystem, fs.newRootInode(ctx, creds)) mount := vfsObj.NewDisconnectedMount(vfsfs, dentry.VFSDentry(), &vfs.MountOptions{}) return &RegistryImpl{ root: &dentry, fs: fs, mount: mount, }, nil } // Get implements mq.RegistryImpl.Get. func (r *RegistryImpl) Get(ctx context.Context, name string, access mq.AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error) { inode, err := r.root.Inode().(*rootInode).Lookup(ctx, name) if err != nil { return nil, false, nil } qInode := inode.(*queueInode) if !qInode.queue.HasPermissions(auth.CredentialsFromContext(ctx), perm(access)) { // "The queue exists, but the caller does not have permission to // open it in the specified mode." return nil, false, linuxerr.EACCES } fd, err := r.newFD(ctx, qInode.queue, qInode, access, block, flags) if err != nil { return nil, false, err } return fd, true, nil } // New implements mq.RegistryImpl.New. func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, access mq.AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) { root := r.root.Inode().(*rootInode) qInode := r.fs.newQueueInode(ctx, auth.CredentialsFromContext(ctx), q, perm).(*queueInode) err := root.Insert(name, qInode) if err != nil { return nil, err } return r.newFD(ctx, q, qInode, access, block, flags) } // Unlink implements mq.RegistryImpl.Unlink. func (r *RegistryImpl) Unlink(ctx context.Context, name string) error { creds := auth.CredentialsFromContext(ctx) if err := r.root.Inode().CheckPermissions(ctx, creds, vfs.MayWrite|vfs.MayExec); err != nil { return err } root := r.root.Inode().(*rootInode) inode, err := root.Lookup(ctx, name) if err != nil { return err } defer inode.DecRef(ctx) return root.Unlink(ctx, name, inode) } // Destroy implements mq.RegistryImpl.Destroy. func (r *RegistryImpl) Destroy(ctx context.Context) { r.root.DecRef(ctx) r.mount.DecRef(ctx) } // newFD returns a new file description created using the given queue and inode. func (r *RegistryImpl) newFD(ctx context.Context, q *mq.Queue, inode *queueInode, access mq.AccessType, block bool, flags uint32) (*vfs.FileDescription, error) { view, err := mq.NewView(q, access, block) if err != nil { return nil, err } var dentry kernfs.Dentry dentry.Init(&r.fs.Filesystem, inode) defer dentry.DecRef(ctx) fd := &queueFD{queue: view} err = fd.Init(r.mount, &dentry, inode.queue, inode.Locks(), flags) if err != nil { return nil, err } return &fd.vfsfd, nil } // perm returns a permission mask created using given flags. func perm(access mq.AccessType) vfs.AccessTypes { switch access { case mq.ReadWrite: return vfs.MayRead | vfs.MayWrite case mq.WriteOnly: return vfs.MayWrite case mq.ReadOnly: return vfs.MayRead default: return 0 // Can't happen, see NewView. 
} } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/mqfs/root.go000066400000000000000000000055301465435605700245250ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mqfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // rootInode represents inode for filesystem's root directory (/dev/mqueue). // // +stateify savable type rootInode struct { rootInodeRefs kernfs.InodeAlwaysValid kernfs.InodeAnonymous kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink kernfs.InodeTemporary kernfs.InodeWatches kernfs.OrderedChildren locks vfs.FileLocks } var _ kernfs.Inode = (*rootInode)(nil) // newRootInode returns a new, initialized rootInode. func (fs *filesystem) newRootInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode { inode := &rootInode{} inode.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555)) inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) inode.InitRefs() return inode } // Open implements kernfs.Inode.Open. func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // DecRef implements kernfs.Inode.DecRef. func (i *rootInode) DecRef(ctx context.Context) { i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // Rename implements Inode.Rename and overrides OrderedChildren.Rename. mqueue // filesystem allows files to be unlinked, but not renamed. func (i *rootInode) Rename(ctx context.Context, oldname, newname string, child, dstDir kernfs.Inode) error { return linuxerr.EPERM } // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*rootInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // StatFS implements kernfs.Inode.StatFS. func (*rootInode) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.MQUEUE_MAGIC), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/mqfs/root_inode_refs.go000066400000000000000000000101751465435605700267230ustar00rootroot00000000000000package mqfs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. 
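// Illustrative sketch (standalone arithmetic, for exposition only): the
// reference-count template below packs two counters into a single int64 as
// [32-bit speculative references]:[32-bit real references]. TryIncRef relies
// on the low 32 bits to detect whether any real references remain:
//
//	const speculativeRef = int64(1) << 32
//
//	// With no real references (low 32 bits == 0), a speculative increment
//	// leaves int32(v) == 0, so TryIncRef backs the increment out and fails:
//	v := int64(0) + speculativeRef   // int32(v) == 0 -> fail
//
//	// With one real reference, the speculative reference is converted into a
//	// real one by adding (-speculativeRef + 1):
//	v = int64(1) + speculativeRef    // int32(v) == 1 -> succeed
//	v += -speculativeRef + 1         // refCount now holds 2 real references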
const rootInodeenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var rootInodeobj *rootInode // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type rootInodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *rootInodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *rootInodeRefs) RefType() string { return fmt.Sprintf("%T", rootInodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *rootInodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *rootInodeRefs) LogRefs() bool { return rootInodeenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *rootInodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *rootInodeRefs) IncRef() { v := r.refCount.Add(1) if rootInodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *rootInodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if rootInodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. 
In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *rootInodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if rootInodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *rootInodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/nsfs/000077500000000000000000000000001465435605700232135ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/nsfs/inode_refs.go000066400000000000000000000100651465435605700256610ustar00rootroot00000000000000package nsfs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const inodeenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var inodeobj *Inode // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type inodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *inodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *inodeRefs) RefType() string { return fmt.Sprintf("%T", inodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *inodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *inodeRefs) LogRefs() bool { return inodeenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *inodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. 
// //go:nosplit func (r *inodeRefs) IncRef() { v := r.refCount.Add(1) if inodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *inodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if inodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *inodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if inodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *inodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/nsfs/nsfs.go000066400000000000000000000133401465435605700245140ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package nsfs provides the filesystem implementation backing // Kernel.NsfsMount. package nsfs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // +stateify savable type filesystemType struct{} // Name implements vfs.FilesystemType.Name. func (filesystemType) Name() string { return "nsfs" } // Release implements vfs.FilesystemType.Release. func (filesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("nsfs.filesystemType.GetFilesystem should never be called") } // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 } // NewFilesystem sets up and returns a new vfs.Filesystem implemented by nsfs. 
func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, err } fs := &filesystem{ devMinor: devMinor, } fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) return fs.Filesystem.VFSFilesystem(), nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return "" } // Inode implements kernfs.Inode. // // +stateify savable type Inode struct { kernfs.InodeAttrs kernfs.InodeAnonymous kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches inodeRefs locks vfs.FileLocks namespace vfs.Namespace mnt *vfs.Mount } // DecRef implements kernfs.Inode.DecRef. func (i *Inode) DecRef(ctx context.Context) { i.inodeRefs.DecRef(func() { i.namespace.Destroy(ctx) }) } // Keep implements kernfs.Inode.Keep. func (i *Inode) Keep() bool { return false } // NewInode creates a new nsfs inode. func NewInode(ctx context.Context, mnt *vfs.Mount, namespace vfs.Namespace) *Inode { fs := mnt.Filesystem().Impl().(*filesystem) creds := auth.CredentialsFromContext(ctx) i := &Inode{ namespace: namespace, mnt: mnt, } i.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), nsfsMode) i.InitRefs() return i } const nsfsMode = linux.S_IFREG | linux.ModeUserRead | linux.ModeGroupRead | linux.ModeOtherRead // Namespace returns the namespace associated with the inode. func (i *Inode) Namespace() vfs.Namespace { return i.namespace } // Name returns the inode name that is used to implement readlink() of // /proc/pid/ns/ files. func (i *Inode) Name() string { return fmt.Sprintf("%s:[%d]", i.namespace.Type(), i.Ino()) } // VirtualDentry returns VirtualDentry for the inode. func (i *Inode) VirtualDentry() vfs.VirtualDentry { dentry := &kernfs.Dentry{} mnt := i.mnt fs := mnt.Filesystem().Impl().(*filesystem) i.IncRef() mnt.IncRef() dentry.Init(&fs.Filesystem, i) vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) return vd } // Mode implements kernfs.Inode.Mode. func (i *Inode) Mode() linux.FileMode { return nsfsMode } // SetStat implements kernfs.Inode.SetStat. // // Linux sets S_IMMUTABLE to nsfs inodes that prevents any attribute changes on // them. func (i *Inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { return linuxerr.EPERM } // namespace FD is a synthetic file that represents a namespace in // /proc/[pid]/ns/*. // // +stateify savable type namespaceFD struct { vfs.FileDescriptionDefaultImpl vfs.LockFD vfsfd vfs.FileDescription inode *Inode } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() return fd.inode.Stat(ctx, vfs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() creds := auth.CredentialsFromContext(ctx) return fd.inode.SetStat(ctx, vfs, creds, opts) } // Release implements vfs.FileDescriptionImpl.Release. func (fd *namespaceFD) Release(ctx context.Context) { fd.inode.DecRef(ctx) } // Open implements kernfs.Inode.Open. 
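// Illustrative note: Inode.Name above produces the same string that Linux
// exposes via readlink on /proc/[pid]/ns/* entries, for example
// "mnt:[4026531840]" (namespace type, then the inode number in brackets; the
// number shown is only an example). VirtualDentry above takes references on
// both the inode and its mount, so a hypothetical caller is expected to drop
// them when done:
//
//	vd := inode.VirtualDentry()
//	defer vd.DecRef(ctx) // releases the dentry and mount references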
func (i *Inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &namespaceFD{inode: i} i.IncRef() fd.LockFD.Init(&i.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil } // StatFS implements kernfs.Inode.StatFS. func (i *Inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.NSFS_MAGIC), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/nsfs/nsfs_state_autogen.go000066400000000000000000000101771465435605700274430ustar00rootroot00000000000000// automatically generated by stateify. package nsfs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *inodeRefs) StateTypeName() string { return "pkg/sentry/fsimpl/nsfs.inodeRefs" } func (r *inodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *inodeRefs) beforeSave() {} // +checklocksignore func (r *inodeRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *inodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (f *filesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/nsfs.filesystemType" } func (f *filesystemType) StateFields() []string { return []string{} } func (f *filesystemType) beforeSave() {} // +checklocksignore func (f *filesystemType) StateSave(stateSinkObject state.Sink) { f.beforeSave() } func (f *filesystemType) afterLoad(context.Context) {} // +checklocksignore func (f *filesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/nsfs.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "Filesystem", "devMinor", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.Filesystem) stateSinkObject.Save(1, &fs.devMinor) } func (fs *filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.Filesystem) stateSourceObject.Load(1, &fs.devMinor) } func (i *Inode) StateTypeName() string { return "pkg/sentry/fsimpl/nsfs.Inode" } func (i *Inode) StateFields() []string { return []string{ "InodeAttrs", "InodeAnonymous", "InodeNotDirectory", "InodeNotSymlink", "InodeWatches", "inodeRefs", "locks", "namespace", "mnt", } } func (i *Inode) beforeSave() {} // +checklocksignore func (i *Inode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.InodeAttrs) stateSinkObject.Save(1, &i.InodeAnonymous) stateSinkObject.Save(2, &i.InodeNotDirectory) stateSinkObject.Save(3, &i.InodeNotSymlink) stateSinkObject.Save(4, &i.InodeWatches) stateSinkObject.Save(5, &i.inodeRefs) stateSinkObject.Save(6, &i.locks) stateSinkObject.Save(7, &i.namespace) stateSinkObject.Save(8, &i.mnt) } func (i *Inode) afterLoad(context.Context) {} // +checklocksignore func (i *Inode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.InodeAttrs) stateSourceObject.Load(1, &i.InodeAnonymous) stateSourceObject.Load(2, &i.InodeNotDirectory) stateSourceObject.Load(3, &i.InodeNotSymlink) 

stateSourceObject.Load(4, &i.InodeWatches) stateSourceObject.Load(5, &i.inodeRefs) stateSourceObject.Load(6, &i.locks) stateSourceObject.Load(7, &i.namespace) stateSourceObject.Load(8, &i.mnt) } func (fd *namespaceFD) StateTypeName() string { return "pkg/sentry/fsimpl/nsfs.namespaceFD" } func (fd *namespaceFD) StateFields() []string { return []string{ "FileDescriptionDefaultImpl", "LockFD", "vfsfd", "inode", } } func (fd *namespaceFD) beforeSave() {} // +checklocksignore func (fd *namespaceFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(1, &fd.LockFD) stateSinkObject.Save(2, &fd.vfsfd) stateSinkObject.Save(3, &fd.inode) } func (fd *namespaceFD) afterLoad(context.Context) {} // +checklocksignore func (fd *namespaceFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(1, &fd.LockFD) stateSourceObject.Load(2, &fd.vfsfd) stateSourceObject.Load(3, &fd.inode) } func init() { state.Register((*inodeRefs)(nil)) state.Register((*filesystemType)(nil)) state.Register((*filesystem)(nil)) state.Register((*Inode)(nil)) state.Register((*namespaceFD)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/000077500000000000000000000000001465435605700237235ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/copy_up.go000066400000000000000000000314611465435605700257350ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package overlay import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" ) func (d *dentry) isCopiedUp() bool { return d.copiedUp.Load() != 0 } func (d *dentry) canBeCopiedUp() bool { ftype := d.mode.Load() & linux.S_IFMT switch ftype { case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR: // Can be copied-up. return true default: // Can't be copied-up. return false } } // copyUpLocked ensures that d exists on the upper layer, i.e. d.upperVD.Ok(). // // Preconditions: filesystem.renameMu must be locked. func (d *dentry) copyUpLocked(ctx context.Context) error { return d.copyUpMaybeSyntheticMountpointLocked(ctx, false /* forSyntheticMountpoint */) } func (d *dentry) copyUpMaybeSyntheticMountpointLocked(ctx context.Context, forSyntheticMountpoint bool) error { // Fast path. if d.isCopiedUp() { return nil } // Attach our credentials to the context, as some VFS operations use // credentials from context rather an take an explicit creds parameter. ctx = auth.ContextWithCredentials(ctx, d.fs.creds) if !d.canBeCopiedUp() { return linuxerr.EPERM } // Ensure that our parent directory is copied-up. 
parent := d.parent.Load() if parent == nil { // d is a filesystem root with no upper layer. return linuxerr.EROFS } if err := parent.copyUpMaybeSyntheticMountpointLocked(ctx, forSyntheticMountpoint); err != nil { return err } d.copyMu.Lock() defer d.copyMu.Unlock() if d.upperVD.Ok() { // Raced with another call to d.copyUpLocked(). return nil } if d.vfsd.IsDead() { // Raced with deletion of d. return linuxerr.ENOENT } // Obtain settable timestamps from the lower layer. vfsObj := d.fs.vfsfs.VirtualFilesystem() oldpop := vfs.PathOperation{ Root: d.lowerVDs[0], Start: d.lowerVDs[0], } const timestampsMask = linux.STATX_ATIME | linux.STATX_MTIME oldStat, err := vfsObj.StatAt(ctx, d.fs.creds, &oldpop, &vfs.StatOptions{ Mask: timestampsMask, }) if err != nil { return err } // Perform copy-up. ftype := d.mode.Load() & linux.S_IFMT newpop := vfs.PathOperation{ Root: parent.upperVD, Start: parent.upperVD, Path: fspath.Parse(d.name), } // Used during copy-up of memory-mapped regular files. var mmapOpts *memmap.MMapOpts cleanupUndoCopyUp := func() { var err error if ftype == linux.S_IFDIR { err = vfsObj.RmdirAt(ctx, d.fs.creds, &newpop) } else { err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop) } if err != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)) } if d.upperVD.Ok() { d.upperVD.DecRef(ctx) d.upperVD = vfs.VirtualDentry{} } } switch ftype { case linux.S_IFREG: oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &oldpop, &vfs.OpenOptions{ Flags: linux.O_RDONLY, }) if err != nil { return err } defer oldFD.DecRef(ctx) newFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &newpop, &vfs.OpenOptions{ Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_EXCL, // d.mode can be read because d.copyMu is locked. Mode: linux.FileMode(d.mode.RacyLoad() &^ linux.S_IFMT), }) if err != nil { return err } defer newFD.DecRef(ctx) if _, err := vfs.CopyRegularFileData(ctx, newFD, oldFD); err != nil { cleanupUndoCopyUp() return err } if d.wrappedMappable != nil { // We may have memory mappings of the file on the lower layer. // Switch to mapping the file on the upper layer instead. mmapOpts = &memmap.MMapOpts{ Perms: hostarch.ReadWrite, MaxPerms: hostarch.ReadWrite, } if err := newFD.ConfigureMMap(ctx, mmapOpts); err != nil { cleanupUndoCopyUp() return err } if mmapOpts.MappingIdentity != nil { mmapOpts.MappingIdentity.DecRef(ctx) } // Don't actually switch Mappables until the end of copy-up; see // below for why. } if err := newFD.SetStat(ctx, vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask, // d.uid and d.gid can be read because d.copyMu is locked. UID: d.uid.RacyLoad(), GID: d.gid.RacyLoad(), Atime: oldStat.Atime, Mtime: oldStat.Mtime, }, }); err != nil { cleanupUndoCopyUp() return err } d.upperVD = newFD.VirtualDentry() d.upperVD.IncRef() case linux.S_IFDIR: if err := vfsObj.MkdirAt(ctx, d.fs.creds, &newpop, &vfs.MkdirOptions{ // d.mode can be read because d.copyMu is locked. Mode: linux.FileMode(d.mode.RacyLoad() &^ linux.S_IFMT), ForSyntheticMountpoint: forSyntheticMountpoint, }); err != nil { return err } if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask, // d.uid and d.gid can be read because d.copyMu is locked.
UID: d.uid.RacyLoad(), GID: d.gid.RacyLoad(), Atime: oldStat.Atime, Mtime: oldStat.Mtime, }, }); err != nil { cleanupUndoCopyUp() return err } upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{}) if err != nil { cleanupUndoCopyUp() return err } d.upperVD = upperVD case linux.S_IFLNK: target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &oldpop) if err != nil { return err } if err := vfsObj.SymlinkAt(ctx, d.fs.creds, &newpop, target); err != nil { return err } if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask, // d.{uid,gid,mode} can be read because d.copyMu is locked. Mode: uint16(d.mode.RacyLoad()), UID: d.uid.RacyLoad(), GID: d.gid.RacyLoad(), Atime: oldStat.Atime, Mtime: oldStat.Mtime, }, }); err != nil { cleanupUndoCopyUp() return err } upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{}) if err != nil { cleanupUndoCopyUp() return err } d.upperVD = upperVD case linux.S_IFBLK, linux.S_IFCHR: if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{ // d.mode can be read because d.copyMu is locked. Mode: linux.FileMode(d.mode.RacyLoad()), DevMajor: oldStat.RdevMajor, DevMinor: oldStat.RdevMinor, }); err != nil { return err } if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask, // d.uid and d.gid can be read because d.copyMu is locked. UID: d.uid.RacyLoad(), GID: d.gid.RacyLoad(), Atime: oldStat.Atime, Mtime: oldStat.Mtime, }, }); err != nil { cleanupUndoCopyUp() return err } upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{}) if err != nil { cleanupUndoCopyUp() return err } d.upperVD = upperVD default: // Should have rejected this at the beginning of this function? panic(fmt.Sprintf("unexpected file type %o", ftype)) } if err := d.copyXattrsLocked(ctx); err != nil { cleanupUndoCopyUp() return err } // Update the dentry's device and inode numbers (except for directories, // for which these remain overlay-assigned). if ftype != linux.S_IFDIR { upperStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: d.upperVD, Start: d.upperVD, }, &vfs.StatOptions{ Mask: linux.STATX_INO, }) if err != nil { cleanupUndoCopyUp() return err } if upperStat.Mask&linux.STATX_INO == 0 { cleanupUndoCopyUp() return linuxerr.EREMOTE } d.devMajor.Store(upperStat.DevMajor) d.devMinor.Store(upperStat.DevMinor) d.ino.Store(upperStat.Ino) // Lower level dentries for non-directories are no longer accessible from // the overlayfs after copy-up. Ask filesystems to release their // resources whenever possible. for _, lowerDentry := range d.lowerVDs { lowerDentry.Dentry().MarkEvictable() } } if mmapOpts != nil && mmapOpts.Mappable != nil { d.mapsMu.Lock() defer d.mapsMu.Unlock() // Propagate mappings of d to the new Mappable. Remember which mappings // we added so we can remove them on failure.
upperMappable := mmapOpts.Mappable allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange) for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { added := make(memmap.MappingsOfRange) for m := range seg.Value() { if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil { for m := range added { upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) } for mr, mappings := range allAdded { for m := range mappings { upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable) } } return err } added[m] = struct{}{} } allAdded[seg.Range()] = added } // Switch to the new Mappable. We do this at the end of copy-up // because: // // - We need to switch Mappables (by changing d.wrappedMappable) before // invalidating Translations from the old Mappable (to pick up // Translations from the new one). // // - We need to lock d.dataMu while changing d.wrappedMappable, but // must invalidate Translations with d.dataMu unlocked (due to lock // ordering). // // - Consequently, once we unlock d.dataMu, other threads may // immediately observe the new (copied-up) Mappable, which we want to // delay until copy-up is guaranteed to succeed. d.dataMu.Lock() lowerMappable := d.wrappedMappable d.wrappedMappable = upperMappable d.dataMu.Unlock() d.lowerMappings.InvalidateAll(memmap.InvalidateOpts{}) // Remove mappings from the old Mappable. for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { for m := range seg.Value() { lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) } } d.lowerMappings.RemoveAll() } d.copiedUp.Store(1) return nil } // copyXattrsLocked copies a subset of lower's extended attributes to upper. // Attributes that configure an overlay in the lower are not copied up. // // Preconditions: d.copyMu must be locked for writing. func (d *dentry) copyXattrsLocked(ctx context.Context) error { vfsObj := d.fs.vfsfs.VirtualFilesystem() lowerPop := &vfs.PathOperation{Root: d.lowerVDs[0], Start: d.lowerVDs[0]} upperPop := &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD} lowerXattrs, err := vfsObj.ListXattrAt(ctx, d.fs.creds, lowerPop, 0) if err != nil { if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { // There are no guarantees as to the contents of lowerXattrs. return nil } ctx.Infof("failed to copy up xattrs because ListXattrAt failed: %v", err) return err } for _, name := range lowerXattrs { // Do not copy up overlay attributes. if isOverlayXattr(name) { continue } value, err := vfsObj.GetXattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetXattrOptions{Name: name, Size: 0}) if err != nil { ctx.Infof("failed to copy up xattrs because GetXattrAt failed: %v", err) return err } if err := vfsObj.SetXattrAt(ctx, d.fs.creds, upperPop, &vfs.SetXattrOptions{Name: name, Value: value}); err != nil { ctx.Infof("failed to copy up xattrs because SetXattrAt failed: %v", err) return err } } return nil } // copyUpDescendantsLocked ensures that all descendants of d are copied up. // // Preconditions: // - filesystem.renameMu must be locked. // - d.dirMu must be locked. // - d.isDir(). func (d *dentry) copyUpDescendantsLocked(ctx context.Context, ds **[]*dentry) error { dirents, err := d.getDirentsLocked(ctx) if err != nil { return err } for _, dirent := range dirents { if dirent.Name == "." || dirent.Name == ".." 
{ continue } child, _, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds) if err != nil { return err } if err := child.copyUpLocked(ctx); err != nil { return err } if child.isDir() { child.dirMu.Lock() err := child.copyUpDescendantsLocked(ctx, ds) child.dirMu.Unlock() if err != nil { return err } } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/data_rwmutex.go000066400000000000000000000044461465435605700267660ustar00rootroot00000000000000package overlay import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type dataRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var datalockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type datalockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *dataRWMutex) Lock() { locking.AddGLock(dataprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *dataRWMutex) NestedLock(i datalockNameIndex) { locking.AddGLock(dataprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *dataRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(dataprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *dataRWMutex) NestedUnlock(i datalockNameIndex) { m.mu.Unlock() locking.DelGLock(dataprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *dataRWMutex) RLock() { locking.AddGLock(dataprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *dataRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(dataprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *dataRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *dataRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *dataRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var dataprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func datainitLockNames() {} func init() { datainitLockNames() dataprefixIndex = locking.NewMutexClass(reflect.TypeOf(dataRWMutex{}), datalockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/dev_mutex.go000066400000000000000000000030501465435605700262500ustar00rootroot00000000000000package overlay import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type devMutex struct { mu sync.Mutex } var devprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var devlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. 
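// Illustrative sketch: the generated wrappers in these *_mutex.go files layer
// the lock-order validator from pkg/sync/locking over a plain sync.Mutex or
// sync.RWMutex. A hypothetical caller uses them exactly like the standard
// types; the extra AddGLock/DelGLock calls only record which lock classes are
// currently held so that ordering violations can be reported when the
// validator is enabled:
//
//	var mu devMutex
//	mu.Lock()         // records the devMutex lock class as held, then locks
//	defer mu.Unlock() // releases the lock and clears the record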
type devlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *devMutex) Lock() { locking.AddGLock(devprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *devMutex) NestedLock(i devlockNameIndex) { locking.AddGLock(devprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *devMutex) Unlock() { locking.DelGLock(devprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *devMutex) NestedUnlock(i devlockNameIndex) { locking.DelGLock(devprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func devinitLockNames() {} func init() { devinitLockNames() devprefixIndex = locking.NewMutexClass(reflect.TypeOf(devMutex{}), devlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/dir_cache_mutex.go000066400000000000000000000033001465435605700273710ustar00rootroot00000000000000package overlay import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type dirInoCacheMutex struct { mu sync.Mutex } var dirInoCacheprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var dirInoCachelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type dirInoCachelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *dirInoCacheMutex) Lock() { locking.AddGLock(dirInoCacheprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *dirInoCacheMutex) NestedLock(i dirInoCachelockNameIndex) { locking.AddGLock(dirInoCacheprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *dirInoCacheMutex) Unlock() { locking.DelGLock(dirInoCacheprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *dirInoCacheMutex) NestedUnlock(i dirInoCachelockNameIndex) { locking.DelGLock(dirInoCacheprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func dirInoCacheinitLockNames() {} func init() { dirInoCacheinitLockNames() dirInoCacheprefixIndex = locking.NewMutexClass(reflect.TypeOf(dirInoCacheMutex{}), dirInoCachelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/dir_fd_mutex.go000066400000000000000000000033001465435605700267170ustar00rootroot00000000000000package overlay import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type directoryFDMutex struct { mu sync.Mutex } var directoryFDprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var directoryFDlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. 
// Values are specified using the "consts" field of go_template_instance. type directoryFDlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *directoryFDMutex) Lock() { locking.AddGLock(directoryFDprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *directoryFDMutex) NestedLock(i directoryFDlockNameIndex) { locking.AddGLock(directoryFDprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *directoryFDMutex) Unlock() { locking.DelGLock(directoryFDprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *directoryFDMutex) NestedUnlock(i directoryFDlockNameIndex) { locking.DelGLock(directoryFDprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func directoryFDinitLockNames() {} func init() { directoryFDinitLockNames() directoryFDprefixIndex = locking.NewMutexClass(reflect.TypeOf(directoryFDMutex{}), directoryFDlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/dir_mutex.go000066400000000000000000000032771465435605700262630ustar00rootroot00000000000000package overlay import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type dirMutex struct { mu sync.Mutex } var dirprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var dirlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type dirlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. const ( dirLockNew = dirlockNameIndex(0) dirLockReplaced = dirlockNameIndex(1) dirLockChild = dirlockNameIndex(2) ) const () // Lock locks m. // +checklocksignore func (m *dirMutex) Lock() { locking.AddGLock(dirprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *dirMutex) NestedLock(i dirlockNameIndex) { locking.AddGLock(dirprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *dirMutex) Unlock() { locking.DelGLock(dirprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *dirMutex) NestedUnlock(i dirlockNameIndex) { locking.DelGLock(dirprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func dirinitLockNames() { dirlockNames = []string{"new", "replaced", "child"} } func init() { dirinitLockNames() dirprefixIndex = locking.NewMutexClass(reflect.TypeOf(dirMutex{}), dirlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/directory.go000066400000000000000000000174261465435605700262700ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package overlay import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/vfs" ) func (d *dentry) isDir() bool { return d.mode.Load()&linux.S_IFMT == linux.S_IFDIR } // Preconditions: // - d.dirMu must be locked. // - d.isDir(). func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) { vfsObj := d.fs.vfsfs.VirtualFilesystem() var readdirErr error whiteouts := make(map[string]bool) var maybeWhiteouts []string d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool { layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: layerVD, Start: layerVD, }, &vfs.OpenOptions{ Flags: linux.O_RDONLY | linux.O_DIRECTORY, }) if err != nil { readdirErr = err return false } defer layerFD.DecRef(ctx) // Reuse slice allocated for maybeWhiteouts from a previous layer to // reduce allocations. maybeWhiteouts = maybeWhiteouts[:0] err = layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { if dirent.Name == "." || dirent.Name == ".." { return nil } if _, ok := whiteouts[dirent.Name]; ok { // This file has been whited-out in a previous layer. return nil } if dirent.Type == linux.DT_CHR { // We have to determine if this is a whiteout, which doesn't // count against the directory's emptiness. However, we can't // do so while holding locks held by layerFD.IterDirents(). maybeWhiteouts = append(maybeWhiteouts, dirent.Name) return nil } // Non-whiteout file in the directory prevents rmdir. return linuxerr.ENOTEMPTY })) if err != nil { readdirErr = err return false } for _, maybeWhiteoutName := range maybeWhiteouts { stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: layerVD, Start: layerVD, Path: fspath.Parse(maybeWhiteoutName), }, &vfs.StatOptions{}) if err != nil { readdirErr = err return false } if stat.RdevMajor != 0 || stat.RdevMinor != 0 { // This file is a real character device, not a whiteout. readdirErr = linuxerr.ENOTEMPTY return false } whiteouts[maybeWhiteoutName] = isUpper } // Continue iteration since we haven't found any non-whiteout files in // this directory yet. return true }) return whiteouts, readdirErr } // +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl mu directoryFDMutex `state:"nosave"` off int64 dirents []vfs.Dirent } // Release implements vfs.FileDescriptionImpl.Release. func (fd *directoryFD) Release(ctx context.Context) { } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { d := fd.dentry() fd.mu.Lock() defer fd.mu.Unlock() if fd.dirents == nil { ds, err := d.getDirents(ctx) if err != nil { return err } fd.dirents = ds } for fd.off < int64(len(fd.dirents)) { if err := cb.Handle(fd.dirents[fd.off]); err != nil { return err } fd.off++ } return nil } // Preconditions: d.isDir(). 
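// Illustrative note: this overlay represents a whiteout the same way Linux
// overlayfs does, as a character device whose device number is 0/0, which is
// why the directory iteration above defers DT_CHR entries and stats them
// afterwards. A minimal standalone check (mirroring isWhiteout in
// filesystem.go) looks like:
//
//	func isWhiteoutStat(stat *linux.Statx) bool {
//		return stat.Mode&linux.S_IFMT == linux.S_IFCHR &&
//			stat.RdevMajor == 0 && stat.RdevMinor == 0
//	}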
func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { d.fs.renameMu.RLock() defer d.fs.renameMu.RUnlock() d.dirMu.Lock() defer d.dirMu.Unlock() return d.getDirentsLocked(ctx) } // Preconditions: // - filesystem.renameMu must be locked. // - d.dirMu must be locked. // - d.isDir(). func (d *dentry) getDirentsLocked(ctx context.Context) ([]vfs.Dirent, error) { if d.dirents != nil { return d.dirents, nil } parent := genericParentOrSelf(d) dirents := []vfs.Dirent{ { Name: ".", Type: linux.DT_DIR, Ino: d.ino.Load(), NextOff: 1, }, { Name: "..", Type: uint8(parent.mode.Load() >> 12), Ino: parent.ino.Load(), NextOff: 2, }, } // Merge dirents from all layers comprising this directory. vfsObj := d.fs.vfsfs.VirtualFilesystem() var readdirErr error prevDirents := make(map[string]struct{}) var maybeWhiteouts []vfs.Dirent d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool { layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: layerVD, Start: layerVD, }, &vfs.OpenOptions{ Flags: linux.O_RDONLY | linux.O_DIRECTORY, }) if err != nil { readdirErr = err return false } defer layerFD.DecRef(ctx) // Reuse slice allocated for maybeWhiteouts from a previous layer to // reduce allocations. maybeWhiteouts = maybeWhiteouts[:0] err = layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { if dirent.Name == "." || dirent.Name == ".." { return nil } if _, ok := prevDirents[dirent.Name]; ok { // This file is hidden by, or merged with, another file with // the same name in a previous layer. return nil } prevDirents[dirent.Name] = struct{}{} if dirent.Type == linux.DT_CHR { // We can't determine if this file is a whiteout while holding // locks held by layerFD.IterDirents(). maybeWhiteouts = append(maybeWhiteouts, dirent) return nil } dirent.NextOff = int64(len(dirents) + 1) dirents = append(dirents, dirent) return nil })) if err != nil { readdirErr = err return false } for _, dirent := range maybeWhiteouts { stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: layerVD, Start: layerVD, Path: fspath.Parse(dirent.Name), }, &vfs.StatOptions{}) if err != nil { readdirErr = err return false } if stat.RdevMajor == 0 && stat.RdevMinor == 0 { // This file is a whiteout; don't emit a dirent for it. continue } dirent.NextOff = int64(len(dirents) + 1) dirents = append(dirents, dirent) } return true }) if readdirErr != nil { return nil, readdirErr } // Cache dirents for future directoryFDs. d.dirents = dirents return dirents, nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() switch whence { case linux.SEEK_SET: if offset < 0 { return 0, linuxerr.EINVAL } if offset == 0 { // Ensure that the next call to fd.IterDirents() calls // fd.dentry().getDirents(). fd.dirents = nil } fd.off = offset return fd.off, nil case linux.SEEK_CUR: offset += fd.off if offset < 0 { return 0, linuxerr.EINVAL } // Don't clear fd.dirents in this case, even if offset == 0. fd.off = offset return fd.off, nil default: return 0, linuxerr.EINVAL } } // Sync implements vfs.FileDescriptionImpl.Sync. Forwards sync to the upper // layer, if there is one. The lower layer doesn't need to sync because it // never changes. 
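// Illustrative note (hypothetical caller): the Seek implementation above only
// invalidates the cached dirents when rewinding to offset 0 via SEEK_SET:
//
//	fd.Seek(ctx, 0, linux.SEEK_SET) // drops fd.dirents; the next IterDirents re-merges the layers
//	fd.Seek(ctx, 0, linux.SEEK_CUR) // keeps the cache; the offset is unchanged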
func (fd *directoryFD) Sync(ctx context.Context) error { d := fd.dentry() if !d.isCopiedUp() { return nil } vfsObj := d.fs.vfsfs.VirtualFilesystem() pop := vfs.PathOperation{ Root: d.upperVD, Start: d.upperVD, } upperFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY}) if err != nil { return err } err = upperFD.Sync(ctx) upperFD.DecRef(ctx) return err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/filesystem.go000066400000000000000000001637221465435605700264510ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package overlay import ( "fmt" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs // attributes. // Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_PREFIX const _OVL_XATTR_PREFIX = linux.XATTR_TRUSTED_PREFIX + "overlay." // _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for // opaque directories. // Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE const _OVL_XATTR_OPAQUE = _OVL_XATTR_PREFIX + "opaque" func isWhiteout(stat *linux.Statx) bool { return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0 } // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { if fs.opts.UpperRoot.Ok() { return fs.opts.UpperRoot.Mount().Filesystem().Impl().Sync(ctx) } return nil } var dentrySlicePool = sync.Pool{ New: func() any { ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity return &ds }, } func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { if ds == nil { ds = dentrySlicePool.Get().(*[]*dentry) } *ds = append(*ds, d) return ds } // Preconditions: ds != nil. func putDentrySlice(ds *[]*dentry) { // Allow dentries to be GC'd. for i := range *ds { (*ds)[i] = nil } *ds = (*ds)[:0] dentrySlicePool.Put(ds) } // renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls // dentry.checkDropLocked on all dentries in *dsp with fs.renameMu locked for // writing. // // dsp is a pointer-to-pointer since defer evaluates its arguments immediately, // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. 
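//
// The resulting caller pattern, used throughout filesystem.go (sketch):
//
//	var ds *[]*dentry
//	fs.renameMu.RLock()
//	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
//	d, err := fs.resolveLocked(ctx, rp, &ds)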
// // +checklocksreleaseread:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, dsp **[]*dentry) { fs.renameMu.RUnlock() if *dsp == nil { return } ds := **dsp // Only go through calling dentry.checkDropLocked() (which requires // re-locking renameMu) if we actually have any dentries with zero refs. checkAny := false for i := range ds { if ds[i].refs.Load() == 0 { checkAny = true break } } if checkAny { fs.renameMu.Lock() for _, d := range ds { d.checkDropLocked(ctx) } fs.renameMu.Unlock() } putDentrySlice(*dsp) } // +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() return } for _, d := range **ds { d.checkDropLocked(ctx) } fs.renameMu.Unlock() putDentrySlice(*ds) } // stepLocked resolves rp.Component() to an existing file, starting from the // given directory. // // Dentries which may have a reference count of zero, and which therefore // should be dropped once traversal is complete, are appended to ds. // // Preconditions: // - fs.renameMu must be locked. // - d.dirMu must be locked. // - !rp.Done(). func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, lookupLayer, bool, error) { if !d.isDir() { return nil, lookupLayerNone, false, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, lookupLayerNone, false, err } name := rp.Component() if name == "." { rp.Advance() return d, d.topLookupLayer(), false, nil } if name == ".." { if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, lookupLayerNone, false, err } else if isRoot || d.parent.Load() == nil { rp.Advance() return d, d.topLookupLayer(), false, nil } if err := rp.CheckMount(ctx, &d.parent.Load().vfsd); err != nil { return nil, lookupLayerNone, false, err } rp.Advance() parent := d.parent.Load() return parent, parent.topLookupLayer(), false, nil } if uint64(len(name)) > fs.maxFilenameLen { return nil, lookupLayerNone, false, linuxerr.ENAMETOOLONG } child, topLookupLayer, err := fs.getChildLocked(ctx, d, name, ds) if err != nil { return nil, topLookupLayer, false, err } if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, lookupLayerNone, false, err } if child.isSymlink() && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx) if err != nil { return nil, lookupLayerNone, false, err } followedSymlink, err := rp.HandleSymlink(target) return d, topLookupLayer, followedSymlink, err } rp.Advance() return child, topLookupLayer, false, nil } // Preconditions: // - fs.renameMu must be locked. // - d.dirMu must be locked. func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, lookupLayer, error) { if child, ok := parent.children[name]; ok { return child, child.topLookupLayer(), nil } child, topLookupLayer, err := fs.lookupLocked(ctx, parent, name) if err != nil { return nil, topLookupLayer, err } if parent.children == nil { parent.children = make(map[string]*dentry) } parent.children[name] = child // child's refcount is initially 0, so it may be dropped after traversal. *ds = appendDentry(*ds, child) return child, topLookupLayer, nil } // Preconditions: // - fs.renameMu must be locked. // - parent.dirMu must be locked. 
func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, lookupLayer, error) { childPath := fspath.Parse(name) child := fs.newDentry() topLookupLayer := lookupLayerNone var lookupErr error vfsObj := fs.vfsfs.VirtualFilesystem() parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool { childVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{ Root: parentVD, Start: parentVD, Path: childPath, }, &vfs.GetDentryOptions{}) if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENAMETOOLONG, err) { // The file doesn't exist on this layer. Proceed to the next one. return true } if err != nil { lookupErr = err return false } defer childVD.DecRef(ctx) mask := uint32(linux.STATX_TYPE) if topLookupLayer == lookupLayerNone { // Mode, UID, GID, and (for non-directories) inode number come from // the topmost layer on which the file exists. mask |= linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO } stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ Root: childVD, Start: childVD, }, &vfs.StatOptions{ Mask: mask, }) if err != nil { lookupErr = err return false } if stat.Mask&mask != mask { lookupErr = linuxerr.EREMOTE return false } if isWhiteout(&stat) { // This is a whiteout, so it "doesn't exist" on this layer, and // layers below this one are ignored. if isUpper { topLookupLayer = lookupLayerUpperWhiteout } return false } isDir := stat.Mode&linux.S_IFMT == linux.S_IFDIR if topLookupLayer != lookupLayerNone && !isDir { // Directories are not merged with non-directory files from lower // layers; instead, layers including and below the first // non-directory file are ignored. (This file must be a directory // on previous layers, since lower layers aren't searched for // non-directory files.) return false } // Update child to include this layer. childVD.IncRef() if isUpper { child.upperVD = childVD child.copiedUp = atomicbitops.FromUint32(1) } else { child.lowerVDs = append(child.lowerVDs, childVD) } if topLookupLayer == lookupLayerNone { if isUpper { topLookupLayer = lookupLayerUpper } else { topLookupLayer = lookupLayerLower } child.mode = atomicbitops.FromUint32(uint32(stat.Mode)) child.uid = atomicbitops.FromUint32(stat.UID) child.gid = atomicbitops.FromUint32(stat.GID) child.devMajor = atomicbitops.FromUint32(stat.DevMajor) child.devMinor = atomicbitops.FromUint32(stat.DevMinor) child.ino = atomicbitops.FromUint64(stat.Ino) } // For non-directory files, only the topmost layer that contains a file // matters. if !isDir { return false } // Directories use the lowest layer inode and device numbers to generate a // filesystem local inode number. This way the inode number does not change // after copy ups. child.devMajor = atomicbitops.FromUint32(stat.DevMajor) child.devMinor = atomicbitops.FromUint32(stat.DevMinor) child.ino = atomicbitops.FromUint64(stat.Ino) // Directories are merged with directories from lower layers if they // are not explicitly opaque. opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{ Root: childVD, Start: childVD, }, &vfs.GetXattrOptions{ Name: _OVL_XATTR_OPAQUE, Size: 1, }) return !(err == nil && opaqueVal == "y") }) if lookupErr != nil { child.destroyLocked(ctx) return nil, topLookupLayer, lookupErr } if !topLookupLayer.existsInOverlay() { child.destroyLocked(ctx) return nil, topLookupLayer, linuxerr.ENOENT } // Device and inode numbers were copied from the topmost layer above for // non-directories. 
They were copied from the bottommost layer for // directories. Override them if necessary. We can use RacyLoad() because // child is still being initialized. if child.isDir() { child.ino.Store(fs.newDirIno(child.devMajor.RacyLoad(), child.devMinor.RacyLoad(), child.ino.RacyLoad())) child.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR) child.devMinor = atomicbitops.FromUint32(fs.dirDevMinor) } else if !child.upperVD.Ok() { childDevMinor, err := fs.getLowerDevMinor(child.devMajor.RacyLoad(), child.devMinor.RacyLoad()) if err != nil { ctx.Infof("overlay.filesystem.lookupLocked: failed to map lower layer device number (%d, %d) to an overlay-specific device number: %v", child.devMajor.RacyLoad(), child.devMinor.RacyLoad(), err) child.destroyLocked(ctx) return nil, topLookupLayer, err } child.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR) child.devMinor = atomicbitops.FromUint32(childDevMinor) } parent.IncRef() child.parent.Store(parent) child.name = name return child, topLookupLayer, nil } // lookupLayerLocked is similar to lookupLocked, but only returns information // about the file rather than a dentry. // // Preconditions: // - fs.renameMu must be locked. // - parent.dirMu must be locked. func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) { childPath := fspath.Parse(name) lookupLayer := lookupLayerNone var lookupErr error parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool { stat, err := fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{ Root: parentVD, Start: parentVD, Path: childPath, }, &vfs.StatOptions{ Mask: linux.STATX_TYPE, }) if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENAMETOOLONG, err) { // The file doesn't exist on this layer. Proceed to the next // one. return true } if err != nil { lookupErr = err return false } if stat.Mask&linux.STATX_TYPE == 0 { // Linux's overlayfs tends to return EREMOTE in cases where a file // is unusable for reasons that are not better captured by another // errno. lookupErr = linuxerr.EREMOTE return false } if isWhiteout(&stat) { // This is a whiteout, so it "doesn't exist" on this layer, and // layers below this one are ignored. if isUpper { lookupLayer = lookupLayerUpperWhiteout } return false } // The file exists; we can stop searching. if isUpper { lookupLayer = lookupLayerUpper } else { lookupLayer = lookupLayerLower } return false }) return lookupLayer, lookupErr } type lookupLayer int const ( // lookupLayerNone indicates that no file exists at the given path on the // upper layer, and is either whited out or does not exist on lower layers. // Therefore, the file does not exist in the overlay filesystem, and file // creation may proceed normally (if an upper layer exists). lookupLayerNone lookupLayer = iota // lookupLayerLower indicates that no file exists at the given path on the // upper layer, but exists on a lower layer. Therefore, the file exists in // the overlay filesystem, but must be copied-up before mutation. lookupLayerLower // lookupLayerUpper indicates that a non-whiteout file exists at the given // path on the upper layer. Therefore, the file exists in the overlay // filesystem, and is already copied-up. lookupLayerUpper // lookupLayerUpperWhiteout indicates that a whiteout exists at the given // path on the upper layer. Therefore, the file does not exist in the // overlay filesystem, and file creation must remove the whiteout before // proceeding. 
lookupLayerUpperWhiteout ) func (ll lookupLayer) existsInOverlay() bool { return ll == lookupLayerLower || ll == lookupLayerUpper } // walkParentDirLocked resolves all but the last path component of rp to an // existing directory, starting from the given directory (which is usually // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // // Preconditions: // - fs.renameMu must be locked. // - !rp.Done(). func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { for !rp.Final() { d.dirMu.Lock() next, _, _, err := fs.stepLocked(ctx, rp, d, ds) d.dirMu.Unlock() if err != nil { return nil, err } d = next } if !d.isDir() { return nil, linuxerr.ENOTDIR } return d, nil } // resolveLocked resolves rp to an existing file. // // Preconditions: fs.renameMu must be locked. func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { d := rp.Start().Impl().(*dentry) for !rp.Done() { d.dirMu.Lock() next, _, _, err := fs.stepLocked(ctx, rp, d, ds) d.dirMu.Unlock() if err != nil { return nil, err } d = next } if rp.MustBeDir() && !d.isDir() { return nil, linuxerr.ENOTDIR } return d, nil } type createType int const ( createNonDirectory createType = iota createDirectory createSyntheticMountpoint ) // doCreateAt checks that creating a file at rp is permitted, then invokes // create to do so. // // Preconditions: // - !rp.Done(). // - For the final path component in rp, !rp.ShouldFollowSymlink(). func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, ct createType, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return err } name := rp.Component() if name == "." || name == ".." { return linuxerr.EEXIST } if uint64(len(name)) > fs.maxFilenameLen { return linuxerr.ENAMETOOLONG } if parent.vfsd.IsDead() { return linuxerr.ENOENT } if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } parent.dirMu.Lock() defer parent.dirMu.Unlock() // Determine if a file already exists at name. if _, ok := parent.children[name]; ok { return linuxerr.EEXIST } childLayer, err := fs.lookupLayerLocked(ctx, parent, name) if err != nil { return err } if childLayer.existsInOverlay() { return linuxerr.EEXIST } if ct == createNonDirectory && rp.MustBeDir() { return linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } // Ensure that the parent directory is copied-up so that we can create the // new file in the upper layer. if err := parent.copyUpMaybeSyntheticMountpointLocked(ctx, ct == createSyntheticMountpoint); err != nil { return err } // Finally create the new file. if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil { return err } parent.dirents = nil ev := linux.IN_CREATE if ct != createNonDirectory { ev |= linux.IN_ISDIR } parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */) return nil } // CreateWhiteout creates a whiteout at pop. Whiteouts are created with // character devices with device ID = 0. 
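//
// To the layers beneath the overlay, a whiteout is simply a character device
// with rdev 0:0; isWhiteout() above applies the matching check on a Statx
// whose mask includes STATX_TYPE (sketch):
//
//	isWhiteout := stat.Mode&linux.S_IFMT == linux.S_IFCHR &&
//		stat.RdevMajor == 0 && stat.RdevMinor == 0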
// // Preconditions: pop's parent directory has been copied up. func CreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, pop *vfs.PathOperation) error { return vfsObj.MknodAt(ctx, creds, pop, &vfs.MknodOptions{ Mode: linux.S_IFCHR, // permissions == include/linux/fs.h:WHITEOUT_MODE == 0 // DevMajor == DevMinor == 0, from include/linux/fs.h:WHITEOUT_DEV }) } func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) { if err := CreateWhiteout(ctx, vfsObj, fs.creds, pop); err != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)) } } // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err } if err := d.checkPermissions(creds, ats); err != nil { return err } if !ats.MayWrite() { // Not requesting write permission. Allow it. return nil } if rp.Mount().ReadOnly() { return linuxerr.EROFS } if !d.upperVD.Ok() && !d.canBeCopiedUp() { // A lower layer file that can not be copied up, can not be written to. // Error out here. Don't give the application false hopes. return linuxerr.EACCES } return nil } // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } layerVD := d.topLayer() return fs.vfsfs.VirtualFilesystem().BoundEndpointAt(ctx, fs.creds, &vfs.PathOperation{ Root: layerVD, Start: layerVD, }, &opts) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } if opts.CheckSearchable { if !d.isDir() { return nil, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } d.IncRef() return &d.vfsd, nil } // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return nil, err } d.IncRef() return &d.vfsd, nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. 
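// Like MkdirAt, MknodAt, and SymlinkAt below, LinkAt may be creating over an
// existing upper-layer whiteout. The shared pattern is to unlink the whiteout
// first and recreate it if the layer-specific creation fails (sketch; create
// stands for the operation-specific VFS call):
//
//	if haveUpperWhiteout {
//		if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
//			return err
//		}
//	}
//	if err := create(); err != nil {
//		if haveUpperWhiteout {
//			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
//		}
//		return err
//	}
//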
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error { if rp.Mount() != vd.Mount() { return linuxerr.EXDEV } old := vd.Dentry().Impl().(*dentry) if old.isDir() { return linuxerr.EPERM } if err := old.copyUpLocked(ctx); err != nil { return err } vfsObj := fs.vfsfs.VirtualFilesystem() newpop := vfs.PathOperation{ Root: parent.upperVD, Start: parent.upperVD, Path: fspath.Parse(childName), } if haveUpperWhiteout { if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil { return err } } if err := vfsObj.LinkAt(ctx, fs.creds, &vfs.PathOperation{ Root: old.upperVD, Start: old.upperVD, }, &newpop); err != nil { if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop) } return err } creds := rp.Credentials() if err := vfsObj.SetStatAt(ctx, fs.creds, &newpop, &vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_UID | linux.STATX_GID, UID: uint32(creds.EffectiveKUID), GID: uint32(creds.EffectiveKGID), }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop) } return err } old.watches.Notify(ctx, "", linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent, false /* unlinked */) return nil }) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { ct := createDirectory if opts.ForSyntheticMountpoint { ct = createSyntheticMountpoint } return fs.doCreateAt(ctx, rp, ct, func(parent *dentry, childName string, haveUpperWhiteout bool) error { vfsObj := fs.vfsfs.VirtualFilesystem() pop := vfs.PathOperation{ Root: parent.upperVD, Start: parent.upperVD, Path: fspath.Parse(childName), } if haveUpperWhiteout { if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { return err } } if err := vfsObj.MkdirAt(ctx, fs.creds, &pop, &opts); err != nil { if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } return err } if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ Stat: parent.newChildOwnerStat(opts.Mode, rp.Credentials()), }); err != nil { if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } return err } if haveUpperWhiteout { // A whiteout is being replaced with this new directory. There may be // directories on lower layers (previously hidden by the whiteout) that // the new directory should not be merged with, so mark as opaque. // See fs/overlayfs/dir.c:ovl_create_over_whiteout() -> ovl_set_opaque(). 
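// The attribute set below is _OVL_XATTR_OPAQUE ("trusted.overlay.opaque")
// with value "y"; fs.lookupLocked() reads it back and stops merging lower
// layers when it is present (sketch of that check):
//
//	opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &pop,
//		&vfs.GetXattrOptions{Name: _OVL_XATTR_OPAQUE, Size: 1})
//	stopMerging := err == nil && opaqueVal == "y"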
if err := vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{ Name: _OVL_XATTR_OPAQUE, Value: "y", }); err != nil { if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)) } else { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } return err } } else if len(parent.lowerVDs) > 0 { // If haveUpperWhiteout is false and the parent is merged, then we should // apply an optimization. We know that nothing exists on the parent's // lower layers. Otherwise doCreateAt() would have failed with EEXIST. // Mark the new directory opaque to avoid unnecessary lower lookups in // fs.lookupLocked(). Allow it to fail since this is an optimization. // See fs/overlayfs/dir.c:ovl_create_upper() -> ovl_set_opaque(). _ = vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{ Name: _OVL_XATTR_OPAQUE, Value: "y", }) } return nil }) } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error { // Disallow attempts to create whiteouts. if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 { return linuxerr.EPERM } vfsObj := fs.vfsfs.VirtualFilesystem() pop := vfs.PathOperation{ Root: parent.upperVD, Start: parent.upperVD, Path: fspath.Parse(childName), } if haveUpperWhiteout { if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { return err } } if err := vfsObj.MknodAt(ctx, fs.creds, &pop, &opts); err != nil { if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } return err } creds := rp.Credentials() if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ Stat: parent.newChildOwnerStat(opts.Mode, creds), }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } return err } return nil }) } // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { mayCreate := opts.Flags&linux.O_CREAT != 0 mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) var ds *[]*dentry fs.renameMu.RLock() unlocked := false unlock := func() { if !unlocked { fs.renameMuRUnlockAndCheckDrop(ctx, &ds) unlocked = true } } defer unlock() start := rp.Start().Impl().(*dentry) if rp.Done() { if mayCreate && rp.MustBeDir() { return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST } if err := start.ensureOpenableLocked(ctx, rp, &opts); err != nil { return nil, err } start.IncRef() defer start.DecRef(ctx) unlock() return start.openCopiedUp(ctx, rp, &opts) } afterTrailingSymlink: parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return nil, err } // Check for search permission in the parent directory. if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. 
if mayCreate && rp.MustBeDir() { return nil, linuxerr.EISDIR } // Determine whether or not we need to create a file. parent.dirMu.Lock() child, topLookupLayer, followedSymlink, err := fs.stepLocked(ctx, rp, parent, &ds) if followedSymlink { parent.dirMu.Unlock() if mustCreate { // EEXIST must be returned if an existing symlink is opened with O_EXCL. return nil, linuxerr.EEXIST } if err != nil { // If followedSymlink && err != nil, then this symlink resolution error // must be handled by the VFS layer. return nil, err } start = parent goto afterTrailingSymlink } if linuxerr.Equals(linuxerr.ENOENT, err) && mayCreate { fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds, topLookupLayer == lookupLayerUpperWhiteout) parent.dirMu.Unlock() return fd, err } parent.dirMu.Unlock() if err != nil { return nil, err } if mustCreate { return nil, linuxerr.EEXIST } if rp.MustBeDir() && !child.isDir() { return nil, linuxerr.ENOTDIR } if err := child.ensureOpenableLocked(ctx, rp, &opts); err != nil { return nil, err } child.IncRef() defer child.DecRef(ctx) unlock() return child.openCopiedUp(ctx, rp, &opts) } // Preconditions: filesystem.renameMu must be locked. func (d *dentry) ensureOpenableLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) error { ats := vfs.AccessTypesForOpenFlags(opts) if err := d.checkPermissions(rp.Credentials(), ats); err != nil { return err } if d.isDir() { if ats.MayWrite() { return linuxerr.EISDIR } if opts.Flags&linux.O_CREAT != 0 { return linuxerr.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { return linuxerr.EINVAL } return nil } if !ats.MayWrite() { return nil } // Copy up! if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() return d.copyUpLocked(ctx) } // Preconditions: If vfs.AccessTypesForOpenFlags(opts).MayWrite(), then d has // been copied up. func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { mnt := rp.Mount() // Directory FDs open FDs from each layer when directory entries are read, // so they don't require opening an FD from d.topLayer() up front. ftype := d.mode.Load() & linux.S_IFMT if ftype == linux.S_IFDIR { fd := &directoryFD{} fd.LockFD.Init(&d.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } layerVD, isUpper := d.topLayerInfo() layerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: layerVD, Start: layerVD, }, opts) if err != nil { return nil, err } if ftype != linux.S_IFREG { return layerFD, nil } layerFlags := layerFD.StatusFlags() fd := ®ularFileFD{ copiedUp: isUpper, cachedFD: layerFD, cachedFlags: layerFlags, } fd.LockFD.Init(&d.locks) layerFDOpts := layerFD.Options() if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil { layerFD.DecRef(ctx) return nil, err } return &fd.vfsfd, nil } // Preconditions: // - parent.dirMu must be locked. // - parent does not already contain a child named rp.Component(). 
func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry, haveUpperWhiteout bool) (*vfs.FileDescription, error) { creds := rp.Credentials() if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil { return nil, err } if parent.vfsd.IsDead() { return nil, linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return nil, err } defer mnt.EndWrite() if err := parent.copyUpLocked(ctx); err != nil { return nil, err } vfsObj := fs.vfsfs.VirtualFilesystem() childName := rp.Component() pop := vfs.PathOperation{ Root: parent.upperVD, Start: parent.upperVD, Path: fspath.Parse(childName), } // Unlink the whiteout if it exists. if haveUpperWhiteout { if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { log.Warningf("overlay.filesystem.createAndOpenLocked: failed to unlink whiteout: %v", err) return nil, err } } // Create the file on the upper layer, and get an FD representing it. upperFD, err := vfsObj.OpenAt(ctx, fs.creds, &pop, &vfs.OpenOptions{ Flags: opts.Flags&^vfs.FileCreationFlags | linux.O_CREAT | linux.O_EXCL, Mode: opts.Mode, }) if err != nil { if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } return nil, err } // Change the file's owner to the caller. We can't use upperFD.SetStat() // because it will pick up creds from ctx. if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ Stat: parent.newChildOwnerStat(opts.Mode, creds), }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } return nil, err } // Re-lookup to get a dentry representing the new file, which is needed for // the returned FD. child, _, err := fs.getChildLocked(ctx, parent, childName, ds) if err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } return nil, err } // Finally construct the overlay FD. Below this point, we don't perform // cleanup (the file was created successfully even if we can no longer open // it for some reason). parent.dirents = nil upperFlags := upperFD.StatusFlags() fd := ®ularFileFD{ copiedUp: true, cachedFD: upperFD, cachedFlags: upperFlags, } fd.LockFD.Init(&child.locks) upperFDOpts := upperFD.Options() if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil { upperFD.DecRef(ctx) return nil, err } parent.watches.Notify(ctx, childName, linux.IN_CREATE, 0 /* cookie */, vfs.PathEvent, false /* unlinked */) return &fd.vfsfd, nil } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err } layerVD := d.topLayer() return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: layerVD, Start: layerVD, }) } // RenameAt implements vfs.FilesystemImpl.RenameAt. 
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { // Resolve newParent first to verify that it's on this Mount. var ds *[]*dentry fs.renameMu.Lock() // We need to DecRef outside of fs.mu because forgetting a dead mountpoint // could result in this filesystem being released which acquires fs.mu. var toDecRef []refs.RefCounter defer func() { for _, ref := range toDecRef { ref.DecRef(ctx) } }() defer fs.renameMuUnlockAndCheckDrop(ctx, &ds) newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) if err != nil { return err } if opts.Flags&^linux.RENAME_NOREPLACE != 0 { return linuxerr.EINVAL } newName := rp.Component() if newName == "." || newName == ".." { if opts.Flags&linux.RENAME_NOREPLACE != 0 { return linuxerr.EEXIST } return linuxerr.EBUSY } if uint64(len(newName)) > fs.maxFilenameLen { return linuxerr.ENAMETOOLONG } // Do not check for newName length, since different filesystem // implementations impose different name limits. upperfs.RenameAt() will fail // appropriately if it has to. mnt := rp.Mount() if mnt != oldParentVD.Mount() { return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() oldParent := oldParentVD.Dentry().Impl().(*dentry) creds := rp.Credentials() if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { return err } // We need a dentry representing the renamed file since, if it's a // directory, we need to check for write permission on it. oldParent.dirMu.Lock() defer oldParent.dirMu.Unlock() renamed, _, err := fs.getChildLocked(ctx, oldParent, oldName, &ds) if err != nil { return err } if err := oldParent.mayDelete(creds, renamed); err != nil { return err } if renamed.isDir() { if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { return linuxerr.EINVAL } if oldParent != newParent { if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { return err } } } else { if opts.MustBeDir || rp.MustBeDir() { return linuxerr.ENOTDIR } } if oldParent != newParent { if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { return err } newParent.dirMu.NestedLock(dirLockNew) defer newParent.dirMu.NestedUnlock(dirLockNew) } if newParent.vfsd.IsDead() { return linuxerr.ENOENT } var ( replaced *dentry replacedVFSD *vfs.Dentry replacedLayer lookupLayer whiteouts map[string]bool ) replaced, replacedLayer, err = fs.getChildLocked(ctx, newParent, newName, &ds) if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { return err } if replaced != nil { if opts.Flags&linux.RENAME_NOREPLACE != 0 { return linuxerr.EEXIST } replacedVFSD = &replaced.vfsd if replaced.isDir() { if !renamed.isDir() { return linuxerr.EISDIR } if genericIsAncestorDentry(replaced, renamed) { return linuxerr.ENOTEMPTY } replaced.dirMu.NestedLock(dirLockReplaced) defer replaced.dirMu.NestedUnlock(dirLockReplaced) whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx) if err != nil { return err } } else { if rp.MustBeDir() || renamed.isDir() { return linuxerr.ENOTDIR } } } if oldParent == newParent && oldName == newName { return nil } // renamed and oldParent need to be copied-up before they're renamed on the // upper layer. if err := renamed.copyUpLocked(ctx); err != nil { return err } // If renamed is a directory, all of its descendants need to be copied-up // before they're renamed on the upper layer. 
if renamed.isDir() { if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil { return err } } // newParent must be copied-up before it can contain renamed on the upper // layer. if err := newParent.copyUpLocked(ctx); err != nil { return err } // If replaced exists, it doesn't need to be copied-up, but we do need to // serialize with copy-up. Holding renameMu for writing should be // sufficient, but out of an abundance of caution... if replaced != nil { replaced.copyMu.RLock() defer replaced.copyMu.RUnlock() } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } newpop := vfs.PathOperation{ Root: newParent.upperVD, Start: newParent.upperVD, Path: fspath.Parse(newName), } needRecreateWhiteouts := false cleanupRecreateWhiteouts := func() { if !needRecreateWhiteouts { return } for whiteoutName, whiteoutUpper := range whiteouts { if !whiteoutUpper { continue } if err := CreateWhiteout(ctx, vfsObj, fs.creds, &vfs.PathOperation{ Root: replaced.upperVD, Start: replaced.upperVD, Path: fspath.Parse(whiteoutName), }); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err)) } } } if renamed.isDir() { if replacedLayer == lookupLayerUpper { // Remove whiteouts from the directory being replaced. needRecreateWhiteouts = true for whiteoutName, whiteoutUpper := range whiteouts { if !whiteoutUpper { continue } if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{ Root: replaced.upperVD, Start: replaced.upperVD, Path: fspath.Parse(whiteoutName), }); err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) cleanupRecreateWhiteouts() return err } } } else if replacedLayer == lookupLayerUpperWhiteout { // We need to explicitly remove the whiteout since otherwise rename // on the upper layer will fail with ENOTDIR. if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) return err } } } // Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a // regular rename and create the whiteout at the origin manually. Unlike // RENAME_WHITEOUT, this isn't atomic with respect to other users of the // upper filesystem, but this is already the case for virtually all other // overlay filesystem operations too. oldpop := vfs.PathOperation{ Root: oldParent.upperVD, Start: oldParent.upperVD, Path: fspath.Parse(oldName), } if err := vfsObj.RenameAt(ctx, creds, &oldpop, &newpop, &opts); err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) cleanupRecreateWhiteouts() return err } // Below this point, the renamed dentry is now at newpop, and anything we // replaced is gone forever. Commit the rename, update the overlay // filesystem tree, and abandon attempts to recover from errors. toDecRef = vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) delete(oldParent.children, oldName) if replaced != nil { // Lower dentries of replaced are not reachable from the overlay anymore. // NOTE(b/237573779): Ask lower filesystem to release resources for this // dentry whenever possible to reduce resource usage. 
for _, replaceLower := range replaced.lowerVDs { replaceLower.Dentry().MarkEvictable() } ds = appendDentry(ds, replaced) } if oldParent != newParent { newParent.dirents = nil // This can't drop the last reference on oldParent because one is held // by oldParentVD, so lock recursion is impossible. oldParent.DecRef(ctx) ds = appendDentry(ds, oldParent) newParent.IncRef() renamed.parent.Store(newParent) } renamed.name = newName if newParent.children == nil { newParent.children = make(map[string]*dentry) } newParent.children[newName] = renamed oldParent.dirents = nil if err := CreateWhiteout(ctx, vfsObj, fs.creds, &oldpop); err != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err)) } if renamed.isDir() { if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{ Name: _OVL_XATTR_OPAQUE, Value: "y", }); err != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err)) } } vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir()) return nil } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { var ds *[]*dentry fs.renameMu.RLock() // We need to DecRef outside of fs.mu because forgetting a dead mountpoint // could result in this filesystem being released which acquires fs.mu. var toDecRef []refs.RefCounter defer func() { for _, ref := range toDecRef { ref.DecRef(ctx) } }() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return err } if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() name := rp.Component() if name == "." { return linuxerr.EINVAL } if name == ".." { return linuxerr.ENOTEMPTY } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() // Ensure that parent is copied-up before potentially holding child.copyMu // below. if err := parent.copyUpLocked(ctx); err != nil { return err } // We need a dentry representing the child directory being removed in order // to verify that it's empty. 
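// "Empty" here means "contains only whiteouts":
// collectWhiteoutsForRmdirLocked() (directory.go) fails with ENOTEMPTY as
// soon as it sees a real entry, and otherwise returns the surviving whiteout
// names so they can be unlinked from the upper layer below (sketch of this
// caller):
//
//	whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx)
//	if err != nil {
//		return err // e.g. linuxerr.ENOTEMPTY
//	}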
child, _, err := fs.getChildLocked(ctx, parent, name, &ds) if err != nil { return err } if !child.isDir() { return linuxerr.ENOTDIR } if err := parent.mayDelete(rp.Credentials(), child); err != nil { return err } child.dirMu.NestedLock(dirLockChild) defer child.dirMu.NestedUnlock(dirLockChild) whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx) if err != nil { return err } child.copyMu.RLock() defer child.copyMu.RUnlock() if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } pop := vfs.PathOperation{ Root: parent.upperVD, Start: parent.upperVD, Path: fspath.Parse(name), } if child.upperVD.Ok() { cleanupRecreateWhiteouts := func() { if !child.upperVD.Ok() { return } for whiteoutName, whiteoutUpper := range whiteouts { if !whiteoutUpper { continue } if err := CreateWhiteout(ctx, vfsObj, fs.creds, &vfs.PathOperation{ Root: child.upperVD, Start: child.upperVD, Path: fspath.Parse(whiteoutName), }); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)) } } } // Remove existing whiteouts on the upper layer. for whiteoutName, whiteoutUpper := range whiteouts { if !whiteoutUpper { continue } if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{ Root: child.upperVD, Start: child.upperVD, Path: fspath.Parse(whiteoutName), }); err != nil { vfsObj.AbortDeleteDentry(&child.vfsd) cleanupRecreateWhiteouts() return err } } // Remove the existing directory on the upper layer. if err := vfsObj.RmdirAt(ctx, fs.creds, &pop); err != nil { vfsObj.AbortDeleteDentry(&child.vfsd) cleanupRecreateWhiteouts() return err } } if err := CreateWhiteout(ctx, vfsObj, fs.creds, &pop); err != nil { vfsObj.AbortDeleteDentry(&child.vfsd) if child.upperVD.Ok() { // Don't attempt to recover from this: the original directory is // already gone, so any dentries representing it are invalid, and // creating a new directory won't undo that. panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout after removing upper layer directory during RmdirAt: %v", err)) } return err } toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd) delete(parent.children, name) ds = appendDentry(ds, child) parent.dirents = nil parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0 /* cookie */, vfs.InodeEvent, true /* unlinked */) return nil } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { fs.renameMuRUnlockAndCheckDrop(ctx, &ds) return err } err = d.setStatLocked(ctx, rp, opts) fs.renameMuRUnlockAndCheckDrop(ctx, &ds) if err != nil { return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ctx, ev, 0 /* cookie */, vfs.InodeEvent) } return nil } // Precondition: d.fs.renameMu must be held for reading. func (d *dentry) setStatLocked(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { mode := linux.FileMode(d.mode.Load()) if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil { return err } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() if err := d.copyUpLocked(ctx); err != nil { return err } // Changes to d's attributes are serialized by d.copyMu. 
d.copyMu.Lock() defer d.copyMu.Unlock() if err := d.fs.vfsfs.VirtualFilesystem().SetStatAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: d.upperVD, Start: d.upperVD, }, &opts); err != nil { return err } d.updateAfterSetStatLocked(&opts) return nil } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err } var stat linux.Statx if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 { layerVD := d.topLayer() stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{ Root: layerVD, Start: layerVD, }, &vfs.StatOptions{ Mask: layerMask, Sync: opts.Sync, }) if err != nil { return linux.Statx{}, err } } d.statInternalTo(ctx, &opts, &stat) return stat, nil } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statfs{}, err } return fs.statFS(ctx) } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error { vfsObj := fs.vfsfs.VirtualFilesystem() pop := vfs.PathOperation{ Root: parent.upperVD, Start: parent.upperVD, Path: fspath.Parse(childName), } if haveUpperWhiteout { if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { return err } } if err := vfsObj.SymlinkAt(ctx, fs.creds, &pop, target); err != nil { if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } return err } creds := rp.Credentials() if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_UID | linux.STATX_GID, UID: uint32(creds.EffectiveKUID), GID: uint32(creds.EffectiveKGID), }, }); err != nil { if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)) } else if haveUpperWhiteout { fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) } return err } return nil }) } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { var ds *[]*dentry fs.renameMu.RLock() // We need to DecRef outside of fs.renameMu because forgetting a dead // mountpoint could result in this filesystem being released which acquires // fs.renameMu. var toDecRef []refs.RefCounter defer func() { for _, ref := range toDecRef { ref.DecRef(ctx) } }() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return err } if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() name := rp.Component() if name == "." || name == ".." 
{ return linuxerr.EISDIR } if rp.MustBeDir() { return linuxerr.ENOTDIR } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() // Ensure that parent is copied-up before potentially holding child.copyMu // below. if err := parent.copyUpLocked(ctx); err != nil { return err } // We need a dentry representing the child being removed in order to verify // that it's not a directory. child, childLayer, err := fs.getChildLocked(ctx, parent, name, &ds) if err != nil { return err } if child.isDir() { return linuxerr.EISDIR } if err := parent.mayDelete(rp.Credentials(), child); err != nil { return err } // Hold child.copyMu to prevent it from being copied-up during // deletion. child.copyMu.RLock() defer child.copyMu.RUnlock() if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } pop := vfs.PathOperation{ Root: parent.upperVD, Start: parent.upperVD, Path: fspath.Parse(name), } if childLayer == lookupLayerUpper { // Remove the existing file on the upper layer. if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { vfsObj.AbortDeleteDentry(&child.vfsd) return err } } if err := CreateWhiteout(ctx, vfsObj, fs.creds, &pop); err != nil { vfsObj.AbortDeleteDentry(&child.vfsd) if childLayer == lookupLayerUpper { panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout after unlinking upper layer file during UnlinkAt: %v", err)) } return err } toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd) delete(parent.children, name) if !child.isDir() { // Once a whiteout is created, non-directory dentries on the lower layers // are no longer reachable from the overlayfs. Ask filesystems to release // their resources whenever possible. for _, lowerDentry := range child.lowerVDs { lowerDentry.Dentry().MarkEvictable() } } ds = appendDentry(ds, child) vfs.InotifyRemoveChild(ctx, &child.watches, &parent.watches, name) parent.dirents = nil return nil } // isOverlayXattr returns whether the given extended attribute configures the // overlay. func isOverlayXattr(name string) bool { return strings.HasPrefix(name, _OVL_XATTR_PREFIX) } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } return fs.listXattr(ctx, d, size) } func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) { vfsObj := d.fs.vfsfs.VirtualFilesystem() top := d.topLayer() names, err := vfsObj.ListXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size) if err != nil { return nil, err } // Filter out all overlay attributes. n := 0 for _, name := range names { if !isOverlayXattr(name) { names[n] = name n++ } } return names[:n], err } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. 
func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err } return fs.getXattr(ctx, d, rp.Credentials(), &opts) } func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err } // Return EOPNOTSUPP when fetching an overlay attribute. // See fs/overlayfs/super.c:ovl_own_xattr_get(). if isOverlayXattr(opts.Name) { return "", linuxerr.EOPNOTSUPP } // Analogous to fs/overlayfs/super.c:ovl_other_xattr_get(). vfsObj := d.fs.vfsfs.VirtualFilesystem() top := d.topLayer() return vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts) } // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { fs.renameMuRUnlockAndCheckDrop(ctx, &ds) return err } err = fs.setXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), &opts) fs.renameMuRUnlockAndCheckDrop(ctx, &ds) if err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent) return nil } // Precondition: fs.renameMu must be locked. func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err } // Return EOPNOTSUPP when setting an overlay attribute. // See fs/overlayfs/super.c:ovl_own_xattr_set(). if isOverlayXattr(opts.Name) { return linuxerr.EOPNOTSUPP } // Analogous to fs/overlayfs/super.c:ovl_other_xattr_set(). if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() if err := d.copyUpLocked(ctx); err != nil { return err } vfsObj := d.fs.vfsfs.VirtualFilesystem() return vfsObj.SetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts) } // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { fs.renameMuRUnlockAndCheckDrop(ctx, &ds) return err } err = fs.removeXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), name) fs.renameMuRUnlockAndCheckDrop(ctx, &ds) if err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent) return nil } // Precondition: fs.renameMu must be locked. func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, name string) error { if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } // Like SetXattrAt, return EOPNOTSUPP when removing an overlay attribute. // Linux passes the remove request to xattr_handler->set. // See fs/xattr.c:vfs_removexattr(). 
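// The overlay-attribute test is a simple prefix match against
// _OVL_XATTR_PREFIX ("trusted.overlay."), as implemented by isOverlayXattr()
// (sketch):
//
//	if strings.HasPrefix(name, _OVL_XATTR_PREFIX) {
//		return linuxerr.EOPNOTSUPP
//	}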
if isOverlayXattr(name) { return linuxerr.EOPNOTSUPP } if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() if err := d.copyUpLocked(ctx); err != nil { return err } vfsObj := d.fs.vfsfs.VirtualFilesystem() return vfsObj.RemoveXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name) } // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.renameMu.RLock() defer fs.renameMu.RUnlock() return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { // Return the mount options from the topmost layer. var vd vfs.VirtualDentry if fs.opts.UpperRoot.Ok() { vd = fs.opts.UpperRoot } else { vd = fs.opts.LowerRoots[0] } return vd.Mount().Filesystem().Impl().MountOptions() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/fstree.go000066400000000000000000000036721465435605700255520ustar00rootroot00000000000000package overlay import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // We need to define an interface instead of using atomic.Pointer because // the Dentry type gets removed during code generation and the compiler // complains about the unused sync/atomic type. type genericatomicptr interface { Load() *dentry } // IsAncestorDentry returns true if d is an ancestor of d2; that is, d is // either d2's parent or an ancestor of d2's parent. func genericIsAncestorDentry(d, d2 *dentry) bool { for d2 != nil { parent := d2.parent.Load() if parent == d { return true } if parent == d2 { return false } d2 = parent } return false } // IsDescendant returns true if vd is a descendant of vfsroot or if vd and // vfsroot are the same dentry. func genericIsDescendant(vfsroot *vfs.Dentry, d *dentry) bool { for d != nil && &d.vfsd != vfsroot { d = d.parent.Load() } return d != nil } // ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d. func genericParentOrSelf(d *dentry) *dentry { if parent := d.parent.Load(); parent != nil { return parent } return d } // PrependPath is a generic implementation of FilesystemImpl.PrependPath(). func genericPrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *dentry, b *fspath.Builder) error { for { if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { return vfs.PrependPathAtVFSRootError{} } if mnt != nil && &d.vfsd == mnt.Root() { return nil } parent := d.parent.Load() if parent == nil { return vfs.PrependPathAtNonMountRootError{} } b.PrependComponent(d.name) d = parent } } // DebugPathname returns a pathname to d relative to its filesystem root. // DebugPathname does not correspond to any Linux function; it's used to // generate dentry pathnames for debugging. func genericDebugPathname(d *dentry) string { var b fspath.Builder _ = genericPrependPath(vfs.VirtualDentry{}, nil, d, &b) return b.String() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/maps_mutex.go000066400000000000000000000030731465435605700264370ustar00rootroot00000000000000package overlay import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type mapsMutex struct { mu sync.Mutex } var mapsprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. 
var mapslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type mapslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *mapsMutex) Lock() { locking.AddGLock(mapsprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *mapsMutex) NestedLock(i mapslockNameIndex) { locking.AddGLock(mapsprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *mapsMutex) Unlock() { locking.DelGLock(mapsprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *mapsMutex) NestedUnlock(i mapslockNameIndex) { locking.DelGLock(mapsprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func mapsinitLockNames() {} func init() { mapsinitLockNames() mapsprefixIndex = locking.NewMutexClass(reflect.TypeOf(mapsMutex{}), mapslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/overlay.go000066400000000000000000000747021465435605700257450ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package overlay provides an overlay filesystem implementation, which // synthesizes a filesystem by composing one or more immutable filesystems // ("lower layers") with an optional mutable filesystem ("upper layer"). // // Lock order: // // directoryFD.mu / regularFileFD.mu // filesystem.renameMu // dentry.dirMu // dentry.copyMu // filesystem.devMu // *** "memmap.Mappable locks" below this point // dentry.mapsMu // *** "memmap.Mappable locks taken by Translate" below this point // dentry.dataMu // // Locking dentry.dirMu in multiple dentries requires that parent dentries are // locked before child dentries, and that filesystem.renameMu is locked to // stabilize this relationship. package overlay import ( "fmt" "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // Name is the default filesystem name. const Name = "overlay" // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // Release implements FilesystemType.Release. 
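// Illustrative note on the lock order documented at the top of this file
// (a sketch, not from the original source): a caller that needs several of
// these locks must take them outermost-first, e.g.
//
//	fs.renameMu.RLock()
//	d.dirMu.Lock()   // dirMu nests inside renameMu
//	d.copyMu.RLock() // copyMu nests inside dirMu
//
// and release them in the reverse order.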
func (FilesystemType) Release(ctx context.Context) {} // FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to // FilesystemType.GetFilesystem. // // +stateify savable type FilesystemOptions struct { // Callers passing FilesystemOptions to // overlay.FilesystemType.GetFilesystem() are responsible for ensuring that // the vfs.Mounts comprising the layers of the overlay filesystem do not // contain submounts. // If UpperRoot.Ok(), it is the root of the writable upper layer of the // overlay. UpperRoot vfs.VirtualDentry // LowerRoots contains the roots of the immutable lower layers of the // overlay. LowerRoots is immutable. LowerRoots []vfs.VirtualDentry } // filesystem implements vfs.FilesystemImpl. // // +stateify savable type filesystem struct { vfsfs vfs.Filesystem // Immutable options. opts FilesystemOptions // creds is a copy of the filesystem's creator's credentials, which are // used for accesses to the filesystem's layers. creds is immutable. creds *auth.Credentials // dirDevMinor is the device minor number used for directories. dirDevMinor // is immutable. dirDevMinor uint32 // lowerDevMinors maps device numbers from lower layer filesystems to // device minor numbers assigned to non-directory files originating from // that filesystem. (This remapping is necessary for lower layers because a // file on a lower layer, and that same file on an overlay, are // distinguishable because they will diverge after copy-up; this isn't true // for non-directory files already on the upper layer.) lowerDevMinors is // protected by devMu. devMu devMutex `state:"nosave"` lowerDevMinors map[layerDevNumber]uint32 // renameMu synchronizes renaming with non-renaming operations in order to // ensure consistent lock ordering between dentry.dirMu in different // dentries. renameMu renameRWMutex `state:"nosave"` // dirInoCache caches overlay-private directory inode numbers by mapped // bottommost device numbers and inode number. dirInoCache is protected by // dirInoCacheMu. dirInoCacheMu dirInoCacheMutex `state:"nosave"` dirInoCache map[layerDevNoAndIno]uint64 // lastDirIno is the last inode number assigned to a directory. lastDirIno // is protected by dirInoCacheMu. lastDirIno uint64 // MaxFilenameLen is the maximum filename length allowed by the overlayfs. maxFilenameLen uint64 } // +stateify savable type layerDevNumber struct { major uint32 minor uint32 } // +stateify savable type layerDevNoAndIno struct { layerDevNumber ino uint64 } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mopts := vfs.GenericParseMountOptions(opts.Data) fsoptsRaw := opts.InternalData fsopts, ok := fsoptsRaw.(FilesystemOptions) if fsoptsRaw != nil && !ok { ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) return nil, nil, linuxerr.EINVAL } vfsroot := vfs.RootFromContext(ctx) if vfsroot.Ok() { defer vfsroot.DecRef(ctx) } if upperPathname, ok := mopts["upperdir"]; ok { if fsopts.UpperRoot.Ok() { ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified") return nil, nil, linuxerr.EINVAL } delete(mopts, "upperdir") // Linux overlayfs also requires a workdir when upperdir is // specified; we don't, so silently ignore this option. 
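	// Sketch of the mount data this parser expects (paths are hypothetical):
	//
	//	"upperdir=/upper,lowerdir=/lower2:/lower1,workdir=/work"
	//
	// GenericParseMountOptions has already split this into mopts; the
	// colon-separated lower layers are listed topmost first. upperdir was
	// consumed above, workdir and lowerdir are consumed below, and anything
	// left over causes the mount to fail with EINVAL.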
if workdir, ok := mopts["workdir"]; ok { // Linux creates the "work" directory in `workdir`. // Docker calls chown on it and fails if it doesn't // exist. workdirPath := fspath.Parse(workdir + "/work") if !workdirPath.Absolute { ctx.Infof("overlay.FilesystemType.GetFilesystem: workdir %q must be absolute", workdir) return nil, nil, linuxerr.EINVAL } pop := vfs.PathOperation{ Root: vfsroot, Start: vfsroot, Path: workdirPath, FollowFinalSymlink: false, } mode := vfs.MkdirOptions{ Mode: linux.ModeUserAll, } if err := vfsObj.MkdirAt(ctx, creds, &pop, &mode); err != nil && err != linuxerr.EEXIST { ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to create %s/work: %v", workdir, err) } delete(mopts, "workdir") } upperPath := fspath.Parse(upperPathname) if !upperPath.Absolute { ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) return nil, nil, linuxerr.EINVAL } upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ Root: vfsroot, Start: vfsroot, Path: upperPath, FollowFinalSymlink: true, }, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) return nil, nil, err } // TODO(b/286942303): Only tmpfs supports whiteouts and // trusted.overlay attributes. Don't allow to use non-tmpfs // mounts on upper levels for mounts created through the mount // syscall. In gVisor configs, users can specify any // configurations on their own risk. if !opts.InternalMount && upperRoot.Mount().Filesystem().FilesystemType().Name() != "tmpfs" { return nil, nil, linuxerr.EINVAL } privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) upperRoot.DecRef(ctx) if err != nil { ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) return nil, nil, err } defer privateUpperRoot.DecRef(ctx) fsopts.UpperRoot = privateUpperRoot } if lowerPathnamesStr, ok := mopts["lowerdir"]; ok { if len(fsopts.LowerRoots) != 0 { ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified") return nil, nil, linuxerr.EINVAL } delete(mopts, "lowerdir") lowerPathnames := strings.Split(lowerPathnamesStr, ":") for _, lowerPathname := range lowerPathnames { lowerPath := fspath.Parse(lowerPathname) if !lowerPath.Absolute { ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) return nil, nil, linuxerr.EINVAL } lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ Root: vfsroot, Start: vfsroot, Path: lowerPath, FollowFinalSymlink: true, }, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) return nil, nil, err } privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) lowerRoot.DecRef(ctx) if err != nil { ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) return nil, nil, err } defer privateLowerRoot.DecRef(ctx) fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot) } } if len(mopts) != 0 { ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) return nil, nil, linuxerr.EINVAL } if len(fsopts.LowerRoots) == 0 { ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required") return nil, nil, 
linuxerr.EINVAL } if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present") return nil, nil, linuxerr.EINVAL } const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK if len(fsopts.LowerRoots) > maxLowerLayers { ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers) return nil, nil, linuxerr.EINVAL } // Allocate dirDevMinor. lowerDevMinors are allocated dynamically. dirDevMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } // Take extra references held by the filesystem. if fsopts.UpperRoot.Ok() { fsopts.UpperRoot.IncRef() } for _, lowerRoot := range fsopts.LowerRoots { lowerRoot.IncRef() } fs := &filesystem{ opts: fsopts, creds: creds.Fork(), dirDevMinor: dirDevMinor, lowerDevMinors: make(map[layerDevNumber]uint32), dirInoCache: make(map[layerDevNoAndIno]uint64), maxFilenameLen: linux.NAME_MAX, } fs.vfsfs.Init(vfsObj, &fstype, fs) // Configure max filename length. Similar to what Linux does in // fs/overlayfs/super.c:ovl_fill_super() -> ... -> ovl_check_namelen(). if fsopts.UpperRoot.Ok() { if err := fs.updateMaxNameLen(ctx, creds, vfsObj, fs.opts.UpperRoot); err != nil { ctx.Debugf("overlay.FilesystemType.GetFilesystem: failed to StatFSAt on upper layer root: %v", err) } } for _, lowerRoot := range fsopts.LowerRoots { if err := fs.updateMaxNameLen(ctx, creds, vfsObj, lowerRoot); err != nil { ctx.Debugf("overlay.FilesystemType.GetFilesystem: failed to StatFSAt on lower layer root: %v", err) } } // Construct the root dentry. root := fs.newDentry() root.refs = atomicbitops.FromInt64(1) if fs.opts.UpperRoot.Ok() { fs.opts.UpperRoot.IncRef() root.copiedUp = atomicbitops.FromUint32(1) root.upperVD = fs.opts.UpperRoot } for _, lowerRoot := range fs.opts.LowerRoots { lowerRoot.IncRef() root.lowerVDs = append(root.lowerVDs, lowerRoot) } rootTopVD := root.topLayer() // Get metadata from the topmost layer. See fs.lookupLocked(). const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ Root: rootTopVD, Start: rootTopVD, }, &vfs.StatOptions{ Mask: rootStatMask, }) if err != nil { root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, err } if rootStat.Mask&rootStatMask != rootStatMask { root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, linuxerr.EREMOTE } if isWhiteout(&rootStat) { ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, linuxerr.EINVAL } root.mode = atomicbitops.FromUint32(uint32(rootStat.Mode)) root.uid = atomicbitops.FromUint32(rootStat.UID) root.gid = atomicbitops.FromUint32(rootStat.GID) if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR { root.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR) root.devMinor = atomicbitops.FromUint32(fs.dirDevMinor) // For root dir, it is okay to use top most level's stat to compute inode // number because we don't allow copy ups on root dentries. 
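	// Sketch with hypothetical numbers: if the topmost layer reports the root
	// directory as dev 0:34, ino 99, then fs.newDirIno(0, 34, 99) below
	// assigns the next unused overlay-private inode number (1 for a fresh
	// filesystem) and caches the mapping, so later lookups that reach the
	// same layer inode reuse the same overlay inode number.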
root.ino.Store(fs.newDirIno(rootStat.DevMajor, rootStat.DevMinor, rootStat.Ino)) } else if !root.upperVD.Ok() { root.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR) rootDevMinor, err := fs.getLowerDevMinor(rootStat.DevMajor, rootStat.DevMinor) if err != nil { ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err) root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, err } root.devMinor = atomicbitops.FromUint32(rootDevMinor) root.ino.Store(rootStat.Ino) } else { root.devMajor = atomicbitops.FromUint32(rootStat.DevMajor) root.devMinor = atomicbitops.FromUint32(rootStat.DevMinor) root.ino.Store(rootStat.Ino) } return &fs.vfsfs, &root.vfsd, nil } // clonePrivateMount creates a non-recursive bind mount rooted at vd, not // associated with any MountNamespace, and returns the root of the new mount. // (This is required to ensure that each layer of an overlay comprises only a // single mount, and therefore can't cross into e.g. the overlay filesystem // itself, risking lock recursion.) A reference is held on the returned // VirtualDentry. func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) { oldmnt := vd.Mount() opts := oldmnt.Options() if forceReadOnly { opts.ReadOnly = true } newmnt := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts) // Take a reference on the dentry which will be owned by the returned // VirtualDentry. d := vd.Dentry() d.IncRef() return vfs.MakeVirtualDentry(newmnt, d), nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { vfsObj := fs.vfsfs.VirtualFilesystem() vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor) for _, lowerDevMinor := range fs.lowerDevMinors { vfsObj.PutAnonBlockDevMinor(lowerDevMinor) } if fs.opts.UpperRoot.Ok() { fs.opts.UpperRoot.DecRef(ctx) } for _, lowerRoot := range fs.opts.LowerRoots { lowerRoot.DecRef(ctx) } } // updateMaxNameLen is analogous to fs/overlayfs/super.c:ovl_check_namelen(). func (fs *filesystem) updateMaxNameLen(ctx context.Context, creds *auth.Credentials, vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry) error { statfs, err := vfsObj.StatFSAt(ctx, creds, &vfs.PathOperation{ Root: vd, Start: vd, }) if err != nil { return err } if statfs.NameLength > fs.maxFilenameLen { fs.maxFilenameLen = statfs.NameLength } return nil } func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) { // Always statfs the root of the topmost layer. Compare Linux's // fs/overlayfs/super.c:ovl_statfs(). 
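	// Sketch: with an upper layer present, statfs(2) on the overlay reports
	// the upper filesystem's figures (block counts, NameLength, ...), except
	// that Type is overwritten with OVERLAYFS_SUPER_MAGIC below; otherwise
	// the first (topmost) lower layer's figures are reported.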
var rootVD vfs.VirtualDentry if fs.opts.UpperRoot.Ok() { rootVD = fs.opts.UpperRoot } else { rootVD = fs.opts.LowerRoots[0] } fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{ Root: rootVD, Start: rootVD, }) if err != nil { return linux.Statfs{}, err } fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC return fsstat, nil } func (fs *filesystem) newDirIno(layerMajor, layerMinor uint32, layerIno uint64) uint64 { fs.dirInoCacheMu.Lock() defer fs.dirInoCacheMu.Unlock() orig := layerDevNoAndIno{ layerDevNumber: layerDevNumber{layerMajor, layerMinor}, ino: layerIno, } if ino, ok := fs.dirInoCache[orig]; ok { return ino } fs.lastDirIno++ newIno := fs.lastDirIno fs.dirInoCache[orig] = newIno return newIno } func (fs *filesystem) getLowerDevMinor(layerMajor, layerMinor uint32) (uint32, error) { fs.devMu.Lock() defer fs.devMu.Unlock() orig := layerDevNumber{layerMajor, layerMinor} if minor, ok := fs.lowerDevMinors[orig]; ok { return minor, nil } minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor() if err != nil { return 0, err } fs.lowerDevMinors[orig] = minor return minor, nil } // IsDescendant implements vfs.FilesystemImpl.IsDescendant. func (fs *filesystem) IsDescendant(vfsroot, vd vfs.VirtualDentry) bool { return genericIsDescendant(vfsroot.Dentry(), vd.Dentry().Impl().(*dentry)) } // dentry implements vfs.DentryImpl. // // +stateify savable type dentry struct { vfsd vfs.Dentry refs atomicbitops.Int64 // fs is the owning filesystem. fs is immutable. fs *filesystem // mode, uid, and gid are the file mode, owner, and group of the file in // the topmost layer (and therefore the overlay file as well), and are used // for permission checks on this dentry. These fields are protected by // copyMu. mode atomicbitops.Uint32 uid atomicbitops.Uint32 gid atomicbitops.Uint32 // copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and // 0 otherwise. copiedUp atomicbitops.Uint32 // parent is the dentry corresponding to this dentry's parent directory. // name is this dentry's name in parent. If this dentry is a filesystem // root, parent is nil and name is the empty string. parent and name are // protected by fs.renameMu. parent atomic.Pointer[dentry] `state:".(*dentry)"` name string // If this dentry represents a directory, children maps the names of // children for which dentries have been instantiated to those dentries, // and dirents (if not nil) is a cache of dirents as returned by // directoryFDs representing this directory. children is protected by // dirMu. dirMu dirMutex `state:"nosave"` children map[string]*dentry dirents []vfs.Dirent // upperVD and lowerVDs are the files from the overlay filesystem's layers // that comprise the file on the overlay filesystem. // // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e. // be copied up) with copyMu locked for writing; otherwise, it is // immutable. lowerVDs is always immutable. copyMu sync.RWMutex `state:"nosave"` upperVD vfs.VirtualDentry lowerVDs []vfs.VirtualDentry // inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <= // len(inlineLowerVDs). inlineLowerVDs [1]vfs.VirtualDentry // devMajor, devMinor, and ino are the device major/minor and inode numbers // used by this dentry. These fields are protected by copyMu. 
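	// Illustrative summary of the numbering scheme (a sketch; compare the
	// root dentry setup in GetFilesystem above):
	//
	//	directory                 -> (UNNAMED_MAJOR, fs.dirDevMinor), ino from fs.newDirIno()
	//	non-directory, lower only -> (UNNAMED_MAJOR, fs.getLowerDevMinor(layer dev)), layer ino
	//	non-directory, copied up  -> the upper layer's dev and ino, unchanged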
devMajor atomicbitops.Uint32 devMinor atomicbitops.Uint32 ino atomicbitops.Uint64 // If this dentry represents a regular file, then: // // - mapsMu is used to synchronize between copy-up and memmap.Mappable // methods on dentry preceding mm.MemoryManager.activeMu in the lock order. // // - dataMu is used to synchronize between copy-up and // dentry.(memmap.Mappable).Translate. // // - lowerMappings tracks memory mappings of the file. lowerMappings is // used to invalidate mappings of the lower layer when the file is copied // up to ensure that they remain coherent with subsequent writes to the // file. (Note that, as of this writing, Linux overlayfs does not do this; // this feature is a gVisor extension.) lowerMappings is protected by // mapsMu. // // - If this dentry is copied-up, then wrappedMappable is the Mappable // obtained from a call to the current top layer's // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil // (from a call to regularFileFD.ensureMappable()), it cannot become nil. // wrappedMappable is protected by mapsMu and dataMu. // // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is // accessed using atomic memory operations. // // - wrappedMappable is protected by mapsMu and dataMu. In addition, // it has to be immutable if copyMu is taken for write. // copyUpMaybeSyntheticMountpointLocked relies on this behavior. mapsMu mapsMutex `state:"nosave"` lowerMappings memmap.MappingSet dataMu dataRWMutex `state:"nosave"` wrappedMappable memmap.Mappable isMappable atomicbitops.Uint32 locks vfs.FileLocks // watches is the set of inotify watches on the file represented by this dentry. // // Note that hard links to the same file will not share the same set of // watches, due to the fact that we do not have inode structures in this // overlay implementation. watches vfs.Watches } // newDentry creates a new dentry. The dentry initially has no references; it // is the caller's responsibility to set the dentry's reference count and/or // call dentry.destroy() as appropriate. The dentry is initially invalid in // that it contains no layers; the caller is responsible for setting them. func (fs *filesystem) newDentry() *dentry { d := &dentry{ fs: fs, } d.lowerVDs = d.inlineLowerVDs[:0] d.vfsd.Init(d) refs.Register(d) return d } // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { // d.refs may be 0 if d.fs.renameMu is locked, which serializes against // d.checkDropLocked(). r := d.refs.Add(1) if d.LogRefs() { refs.LogIncRef(d, r) } } // TryIncRef implements vfs.DentryImpl.TryIncRef. func (d *dentry) TryIncRef() bool { for { r := d.refs.Load() if r <= 0 { return false } if d.refs.CompareAndSwap(r, r+1) { if d.LogRefs() { refs.LogTryIncRef(d, r+1) } return true } } } // DecRef implements vfs.DentryImpl.DecRef. func (d *dentry) DecRef(ctx context.Context) { r := d.refs.Add(-1) if d.LogRefs() { refs.LogDecRef(d, r) } if r == 0 { d.fs.renameMu.Lock() d.checkDropLocked(ctx) d.fs.renameMu.Unlock() } else if r < 0 { panic("overlay.dentry.DecRef() called without holding a reference") } } func (d *dentry) decRefLocked(ctx context.Context) { r := d.refs.Add(-1) if d.LogRefs() { refs.LogDecRef(d, r) } if r == 0 { d.checkDropLocked(ctx) } else if r < 0 { panic("overlay.dentry.decRefLocked() called without holding a reference") } } // checkDropLocked should be called after d's reference count becomes 0 or it // becomes deleted. // // Preconditions: d.fs.renameMu must be locked for writing. 
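// Illustrative summary of the reference-count protocol used below (sketch):
//
//	refs > 0  : dentry is in active use and must be retained
//	refs == 0 : dentry is a candidate for dropping; only checkDropLocked,
//	            called with renameMu locked for writing, may decide
//	refs == -1: dentry has been destroyed (destroyLocked stores -1)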
func (d *dentry) checkDropLocked(ctx context.Context) { // Dentries with a positive reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will // remain zero while we hold renameMu for writing.) Dentries with a // negative reference count have already been destroyed. if d.refs.Load() != 0 { return } // Make sure that we do not lose watches on dentries that have not been // deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so // d.vfsd.IsDead() indicates that d was deleted. if !d.vfsd.IsDead() && d.watches.Size() > 0 { return } // Refs is still zero; destroy it. d.destroyLocked(ctx) return } // destroyLocked destroys the dentry. // // Preconditions: // - d.fs.renameMu must be locked for writing. // - d.refs == 0. func (d *dentry) destroyLocked(ctx context.Context) { switch d.refs.Load() { case 0: // Mark the dentry destroyed. d.refs.Store(-1) case -1: panic("overlay.dentry.destroyLocked() called on already destroyed dentry") default: panic("overlay.dentry.destroyLocked() called with references on the dentry") } if d.upperVD.Ok() { d.upperVD.DecRef(ctx) } for _, lowerVD := range d.lowerVDs { lowerVD.DecRef(ctx) } d.watches.HandleDeletion(ctx) if parent := d.parent.Load(); parent != nil { parent.dirMu.Lock() if !d.vfsd.IsDead() { delete(parent.children, d.name) } parent.dirMu.Unlock() // Drop the reference held by d on its parent without recursively // locking d.fs.renameMu. parent.decRefLocked(ctx) } refs.Unregister(d) } // RefType implements refs.CheckedObject.Type. func (d *dentry) RefType() string { return "overlay.dentry" } // LeakMessage implements refs.CheckedObject.LeakMessage. func (d *dentry) LeakMessage() string { return fmt.Sprintf("[overlay.dentry %p] reference count of %d instead of -1", d, d.refs.Load()) } // LogRefs implements refs.CheckedObject.LogRefs. // // This should only be set to true for debugging purposes, as it can generate an // extremely large amount of output and drastically degrade performance. func (d *dentry) LogRefs() bool { return false } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) { if d.isDir() { events |= linux.IN_ISDIR } // overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates // that d was deleted. deleted := d.vfsd.IsDead() d.fs.renameMu.RLock() // The ordering below is important, Linux always notifies the parent first. if parent := d.parent.Load(); parent != nil { parent.watches.Notify(ctx, d.name, events, cookie, et, deleted) } d.watches.Notify(ctx, "", events, cookie, et, deleted) d.fs.renameMu.RUnlock() } // Watches implements vfs.DentryImpl.Watches. func (d *dentry) Watches() *vfs.Watches { return &d.watches } // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. func (d *dentry) OnZeroWatches(ctx context.Context) { if d.refs.Load() == 0 { d.fs.renameMu.Lock() d.checkDropLocked(ctx) d.fs.renameMu.Unlock() } } // iterLayers invokes yield on each layer comprising d, from top to bottom. If // any call to yield returns false, iterLayer stops iteration. 
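// Illustrative use (a sketch, not from the original source): collect every
// layer of d from top to bottom.
//
//	var layers []vfs.VirtualDentry
//	d.iterLayers(func(vd vfs.VirtualDentry, isUpper bool) bool {
//		layers = append(layers, vd)
//		return true // keep iterating
//	})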
func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) { if d.isCopiedUp() { if !yield(d.upperVD, true) { return } } for _, lowerVD := range d.lowerVDs { if !yield(lowerVD, false) { return } } } func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) { if d.isCopiedUp() { return d.upperVD, true } return d.lowerVDs[0], false } func (d *dentry) topLayer() vfs.VirtualDentry { vd, _ := d.topLayerInfo() return vd } func (d *dentry) topLookupLayer() lookupLayer { if d.upperVD.Ok() { return lookupLayerUpper } return lookupLayerLower } func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())) } func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { mode := linux.FileMode(d.mode.Load()) kuid := auth.KUID(d.uid.Load()) kgid := auth.KGID(d.gid.Load()) if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { return err } return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) } // statInternalMask is the set of stat fields that is set by // dentry.statInternalTo(). const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO // statInternalTo writes fields to stat that are stored in d, and therefore do // not requiring invoking StatAt on the overlay's layers. func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) { stat.Mask |= statInternalMask if d.isDir() { // Linux sets nlink to 1 for merged directories // (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is // correct more often ("." and the directory's entry in its parent), // and some of our tests expect this. stat.Nlink = 2 } stat.UID = d.uid.Load() stat.GID = d.gid.Load() stat.Mode = uint16(d.mode.Load()) stat.Ino = d.ino.Load() stat.DevMajor = d.devMajor.Load() stat.DevMinor = d.devMinor.Load() } // Preconditions: d.copyMu must be locked for writing. func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) { if opts.Stat.Mask&linux.STATX_MODE != 0 { d.mode.Store((d.mode.RacyLoad() & linux.S_IFMT) | uint32(opts.Stat.Mode&^linux.S_IFMT)) } if opts.Stat.Mask&linux.STATX_UID != 0 { d.uid.Store(opts.Stat.UID) } if opts.Stat.Mask&linux.STATX_GID != 0 { d.gid.Store(opts.Stat.GID) } } func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { return vfs.CheckDeleteSticky( creds, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KUID(child.uid.Load()), auth.KGID(child.gid.Load()), ) } // newChildOwnerStat returns a Statx for configuring the UID, GID, and mode of // children. func (d *dentry) newChildOwnerStat(mode linux.FileMode, creds *auth.Credentials) linux.Statx { stat := linux.Statx{ Mask: uint32(linux.STATX_UID | linux.STATX_GID), UID: uint32(creds.EffectiveKUID), GID: uint32(creds.EffectiveKGID), } // Set GID and possibly the SGID bit if the parent is an SGID directory. d.copyMu.RLock() defer d.copyMu.RUnlock() if d.mode.Load()&linux.ModeSetGID == linux.ModeSetGID { stat.GID = d.gid.Load() if stat.Mode&linux.ModeDirectory == linux.ModeDirectory { stat.Mode = uint16(mode) | linux.ModeSetGID stat.Mask |= linux.STATX_MODE } } return stat } // fileDescription is embedded by overlay implementations of // vfs.FileDescriptionImpl. 
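// Each open overlay file embeds it (directoryFD and regularFileFD in this
// package); it provides the filesystem()/dentry() accessors and routes the
// per-file-description xattr operations through the same fs helpers used by
// the path-based operations above, so e.g. fgetxattr(2) and getxattr(2)
// behave identically (illustrative summary; see the methods below).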
// // +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD } func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } func (fd *fileDescription) dentry() *dentry { return fd.vfsfd.Dentry().Impl().(*dentry) } // ListXattr implements vfs.FileDescriptionImpl.ListXattr. func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { return fd.filesystem().listXattr(ctx, fd.dentry(), size) } // GetXattr implements vfs.FileDescriptionImpl.GetXattr. func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts) } // SetXattr implements vfs.FileDescriptionImpl.SetXattr. func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { fs := fd.filesystem() fs.renameMu.RLock() defer fs.renameMu.RUnlock() return fs.setXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts) } // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { fs := fd.filesystem() fs.renameMu.RLock() defer fs.renameMu.RUnlock() return fs.removeXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/overlay_state_autogen.go000066400000000000000000000227561465435605700306710ustar00rootroot00000000000000// automatically generated by stateify. package overlay import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (fd *directoryFD) StateTypeName() string { return "pkg/sentry/fsimpl/overlay.directoryFD" } func (fd *directoryFD) StateFields() []string { return []string{ "fileDescription", "DirectoryFileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "off", "dirents", } } func (fd *directoryFD) beforeSave() {} // +checklocksignore func (fd *directoryFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.fileDescription) stateSinkObject.Save(1, &fd.DirectoryFileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &fd.off) stateSinkObject.Save(4, &fd.dirents) } func (fd *directoryFD) afterLoad(context.Context) {} // +checklocksignore func (fd *directoryFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.fileDescription) stateSourceObject.Load(1, &fd.DirectoryFileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &fd.off) stateSourceObject.Load(4, &fd.dirents) } func (fstype *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/overlay.FilesystemType" } func (fstype *FilesystemType) StateFields() []string { return []string{} } func (fstype *FilesystemType) beforeSave() {} // +checklocksignore func (fstype *FilesystemType) StateSave(stateSinkObject state.Sink) { fstype.beforeSave() } func (fstype *FilesystemType) afterLoad(context.Context) {} // +checklocksignore func (fstype *FilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (f *FilesystemOptions) StateTypeName() string { return "pkg/sentry/fsimpl/overlay.FilesystemOptions" } func (f *FilesystemOptions) StateFields() []string { return []string{ "UpperRoot", "LowerRoots", } } func (f *FilesystemOptions) beforeSave() {} 
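// Illustrative note on the generated pattern in this file (sketch): every
// type marked +stateify savable gets StateTypeName/StateFields/StateSave/
// StateLoad. Fields tagged `state:"nosave"` (the mutexes) are omitted from
// StateFields, and fields with a custom tag such as dentry.parent
// (`state:".(*dentry)"`) are saved and restored through the saveParent/
// loadParent hooks defined in save_restore.go.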
// +checklocksignore func (f *FilesystemOptions) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.UpperRoot) stateSinkObject.Save(1, &f.LowerRoots) } func (f *FilesystemOptions) afterLoad(context.Context) {} // +checklocksignore func (f *FilesystemOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.UpperRoot) stateSourceObject.Load(1, &f.LowerRoots) } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/overlay.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "vfsfs", "opts", "creds", "dirDevMinor", "lowerDevMinors", "dirInoCache", "lastDirIno", "maxFilenameLen", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.vfsfs) stateSinkObject.Save(1, &fs.opts) stateSinkObject.Save(2, &fs.creds) stateSinkObject.Save(3, &fs.dirDevMinor) stateSinkObject.Save(4, &fs.lowerDevMinors) stateSinkObject.Save(5, &fs.dirInoCache) stateSinkObject.Save(6, &fs.lastDirIno) stateSinkObject.Save(7, &fs.maxFilenameLen) } func (fs *filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.vfsfs) stateSourceObject.Load(1, &fs.opts) stateSourceObject.Load(2, &fs.creds) stateSourceObject.Load(3, &fs.dirDevMinor) stateSourceObject.Load(4, &fs.lowerDevMinors) stateSourceObject.Load(5, &fs.dirInoCache) stateSourceObject.Load(6, &fs.lastDirIno) stateSourceObject.Load(7, &fs.maxFilenameLen) } func (l *layerDevNumber) StateTypeName() string { return "pkg/sentry/fsimpl/overlay.layerDevNumber" } func (l *layerDevNumber) StateFields() []string { return []string{ "major", "minor", } } func (l *layerDevNumber) beforeSave() {} // +checklocksignore func (l *layerDevNumber) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.major) stateSinkObject.Save(1, &l.minor) } func (l *layerDevNumber) afterLoad(context.Context) {} // +checklocksignore func (l *layerDevNumber) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.major) stateSourceObject.Load(1, &l.minor) } func (l *layerDevNoAndIno) StateTypeName() string { return "pkg/sentry/fsimpl/overlay.layerDevNoAndIno" } func (l *layerDevNoAndIno) StateFields() []string { return []string{ "layerDevNumber", "ino", } } func (l *layerDevNoAndIno) beforeSave() {} // +checklocksignore func (l *layerDevNoAndIno) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.layerDevNumber) stateSinkObject.Save(1, &l.ino) } func (l *layerDevNoAndIno) afterLoad(context.Context) {} // +checklocksignore func (l *layerDevNoAndIno) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.layerDevNumber) stateSourceObject.Load(1, &l.ino) } func (d *dentry) StateTypeName() string { return "pkg/sentry/fsimpl/overlay.dentry" } func (d *dentry) StateFields() []string { return []string{ "vfsd", "refs", "fs", "mode", "uid", "gid", "copiedUp", "parent", "name", "children", "dirents", "upperVD", "lowerVDs", "inlineLowerVDs", "devMajor", "devMinor", "ino", "lowerMappings", "wrappedMappable", "isMappable", "locks", "watches", } } func (d *dentry) beforeSave() {} // +checklocksignore func (d *dentry) StateSave(stateSinkObject state.Sink) { d.beforeSave() var parentValue *dentry parentValue = d.saveParent() 
stateSinkObject.SaveValue(7, parentValue) stateSinkObject.Save(0, &d.vfsd) stateSinkObject.Save(1, &d.refs) stateSinkObject.Save(2, &d.fs) stateSinkObject.Save(3, &d.mode) stateSinkObject.Save(4, &d.uid) stateSinkObject.Save(5, &d.gid) stateSinkObject.Save(6, &d.copiedUp) stateSinkObject.Save(8, &d.name) stateSinkObject.Save(9, &d.children) stateSinkObject.Save(10, &d.dirents) stateSinkObject.Save(11, &d.upperVD) stateSinkObject.Save(12, &d.lowerVDs) stateSinkObject.Save(13, &d.inlineLowerVDs) stateSinkObject.Save(14, &d.devMajor) stateSinkObject.Save(15, &d.devMinor) stateSinkObject.Save(16, &d.ino) stateSinkObject.Save(17, &d.lowerMappings) stateSinkObject.Save(18, &d.wrappedMappable) stateSinkObject.Save(19, &d.isMappable) stateSinkObject.Save(20, &d.locks) stateSinkObject.Save(21, &d.watches) } // +checklocksignore func (d *dentry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.vfsd) stateSourceObject.Load(1, &d.refs) stateSourceObject.Load(2, &d.fs) stateSourceObject.Load(3, &d.mode) stateSourceObject.Load(4, &d.uid) stateSourceObject.Load(5, &d.gid) stateSourceObject.Load(6, &d.copiedUp) stateSourceObject.Load(8, &d.name) stateSourceObject.Load(9, &d.children) stateSourceObject.Load(10, &d.dirents) stateSourceObject.Load(11, &d.upperVD) stateSourceObject.Load(12, &d.lowerVDs) stateSourceObject.Load(13, &d.inlineLowerVDs) stateSourceObject.Load(14, &d.devMajor) stateSourceObject.Load(15, &d.devMinor) stateSourceObject.Load(16, &d.ino) stateSourceObject.Load(17, &d.lowerMappings) stateSourceObject.Load(18, &d.wrappedMappable) stateSourceObject.Load(19, &d.isMappable) stateSourceObject.Load(20, &d.locks) stateSourceObject.Load(21, &d.watches) stateSourceObject.LoadValue(7, new(*dentry), func(y any) { d.loadParent(ctx, y.(*dentry)) }) stateSourceObject.AfterLoad(func() { d.afterLoad(ctx) }) } func (fd *fileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/overlay.fileDescription" } func (fd *fileDescription) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "LockFD", } } func (fd *fileDescription) beforeSave() {} // +checklocksignore func (fd *fileDescription) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.LockFD) } func (fd *fileDescription) afterLoad(context.Context) {} // +checklocksignore func (fd *fileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.LockFD) } func (fd *regularFileFD) StateTypeName() string { return "pkg/sentry/fsimpl/overlay.regularFileFD" } func (fd *regularFileFD) StateFields() []string { return []string{ "fileDescription", "copiedUp", "cachedFD", "cachedFlags", } } func (fd *regularFileFD) beforeSave() {} // +checklocksignore func (fd *regularFileFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.fileDescription) stateSinkObject.Save(1, &fd.copiedUp) stateSinkObject.Save(2, &fd.cachedFD) stateSinkObject.Save(3, &fd.cachedFlags) } func (fd *regularFileFD) afterLoad(context.Context) {} // +checklocksignore func (fd *regularFileFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.fileDescription) stateSourceObject.Load(1, &fd.copiedUp) stateSourceObject.Load(2, &fd.cachedFD) stateSourceObject.Load(3, &fd.cachedFlags) 
} func init() { state.Register((*directoryFD)(nil)) state.Register((*FilesystemType)(nil)) state.Register((*FilesystemOptions)(nil)) state.Register((*filesystem)(nil)) state.Register((*layerDevNumber)(nil)) state.Register((*layerDevNoAndIno)(nil)) state.Register((*dentry)(nil)) state.Register((*fileDescription)(nil)) state.Register((*regularFileFD)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/regular_file.go000066400000000000000000000350361465435605700267210ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package overlay import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) func (d *dentry) isRegularFile() bool { return d.mode.Load()&linux.S_IFMT == linux.S_IFREG } func (d *dentry) isSymlink() bool { return d.mode.Load()&linux.S_IFMT == linux.S_IFLNK } func (d *dentry) readlink(ctx context.Context) (string, error) { layerVD := d.topLayer() return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: layerVD, Start: layerVD, }) } // +stateify savable type regularFileFD struct { fileDescription // If copiedUp is false, cachedFD represents // fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents // fileDescription.dentry().upperVD. cachedFlags is the last known value of // cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are // protected by mu. mu regularFileFDMutex `state:"nosave"` copiedUp bool cachedFD *vfs.FileDescription cachedFlags uint32 } func (fd *regularFileFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) { fd.mu.Lock() defer fd.mu.Unlock() wrappedFD, err := fd.currentFDLocked(ctx) if err != nil { return nil, err } wrappedFD.IncRef() return wrappedFD, nil } func (fd *regularFileFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) { d := fd.dentry() statusFlags := fd.vfsfd.StatusFlags() if !fd.copiedUp && d.isCopiedUp() { // Switch to the copied-up file. 
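		// Sketch of the switch-over below: this FD was opened before the
		// dentry was copied up, so cachedFD still refers to the lower layer.
		// We open the same file on the upper layer with the current status
		// flags, carry the file offset over with Seek where possible, and
		// replace cachedFD so that subsequent I/O goes to the upper layer.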
upperVD := d.topLayer() upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: upperVD, Start: upperVD, }, &vfs.OpenOptions{ Flags: statusFlags, }) if err != nil { return nil, err } oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR) if oldOffErr == nil { if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil { upperFD.DecRef(ctx) return nil, err } } fd.cachedFD.DecRef(ctx) fd.copiedUp = true fd.cachedFD = upperFD fd.cachedFlags = statusFlags } else if fd.cachedFlags != statusFlags { if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil { return nil, err } fd.cachedFlags = statusFlags } return fd.cachedFD, nil } // Release implements vfs.FileDescriptionImpl.Release. func (fd *regularFileFD) Release(ctx context.Context) { fd.cachedFD.DecRef(ctx) fd.cachedFD = nil } // OnClose implements vfs.FileDescriptionImpl.OnClose. func (fd *regularFileFD) OnClose(ctx context.Context) error { // Linux doesn't define ovl_file_operations.flush at all (i.e. its // equivalent to OnClose is a no-op). We pass through to // fd.cachedFD.OnClose() without upgrading if fd.dentry() has been // copied-up, since OnClose is mostly used to define post-close writeback, // and if fd.cachedFD hasn't been updated then it can't have been used to // mutate fd.dentry() anyway. fd.mu.Lock() if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags { if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil { fd.mu.Unlock() return err } fd.cachedFlags = statusFlags } wrappedFD := fd.cachedFD fd.mu.Unlock() return wrappedFD.OnClose(ctx) } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *regularFileFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 { wrappedFD, err := fd.getCurrentFD(ctx) if err != nil { return linux.Statx{}, err } stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{ Mask: layerMask, Sync: opts.Sync, }) wrappedFD.DecRef(ctx) if err != nil { return linux.Statx{}, err } } fd.dentry().statInternalTo(ctx, &opts, &stat) return stat, nil } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { wrappedFD, err := fd.getCurrentFD(ctx) if err != nil { return err } defer wrappedFD.DecRef(ctx) return wrappedFD.Allocate(ctx, mode, offset, length) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *regularFileFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { d := fd.dentry() mode := linux.FileMode(d.mode.Load()) if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil { return err } mnt := fd.vfsfd.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() if err := d.copyUpLocked(ctx); err != nil { return err } // Changes to d's attributes are serialized by d.copyMu. d.copyMu.Lock() defer d.copyMu.Unlock() wrappedFD, err := fd.currentFDLocked(ctx) if err != nil { return err } if err := wrappedFD.SetStat(ctx, opts); err != nil { return err } // Changing owners or truncating may clear one or both of the setuid and // setgid bits, so we may have to update opts before setting d.mode. 
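	// e.g. (sketch, hypothetical mode values): fchown(2) on a 04755 setuid
	// file may cause the upper layer to drop the setuid bit; the Stat below
	// then observes mode 0755 and folds it into opts so that
	// updateAfterSetStatLocked stores the layer's real mode into d.mode.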
if opts.Stat.Mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_SIZE) != 0 { stat, err := wrappedFD.Stat(ctx, vfs.StatOptions{ Mask: linux.STATX_MODE, }) if err != nil { return err } opts.Stat.Mode = stat.Mode opts.Stat.Mask |= linux.STATX_MODE } d.updateAfterSetStatLocked(&opts) return nil } // StatFS implements vfs.FileDescriptionImpl.StatFS. func (fd *regularFileFD) StatFS(ctx context.Context) (linux.Statfs, error) { return fd.filesystem().statFS(ctx) } // Readiness implements waiter.Waitable.Readiness. func (fd *regularFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { ctx := context.Background() wrappedFD, err := fd.getCurrentFD(ctx) if err != nil { // TODO(b/171089913): Just use fd.cachedFD since Readiness can't return // an error. This is obviously wrong, but at least consistent with // VFS1. log.Warningf("overlay.regularFileFD.Readiness: currentFDLocked failed: %v", err) fd.mu.Lock() wrappedFD = fd.cachedFD wrappedFD.IncRef() fd.mu.Unlock() } defer wrappedFD.DecRef(ctx) return wrappedFD.Readiness(mask) } // EventRegister implements waiter.Waitable.EventRegister. func (fd *regularFileFD) EventRegister(e *waiter.Entry) error { fd.mu.Lock() defer fd.mu.Unlock() wrappedFD, err := fd.currentFDLocked(context.Background()) if err != nil { // TODO(b/171089913): Just use fd.cachedFD for backward compatibility // with VFS1. log.Warningf("overlay.regularFileFD.EventRegister: currentFDLocked failed: %v", err) wrappedFD = fd.cachedFD } return wrappedFD.EventRegister(e) } // EventUnregister implements waiter.Waitable.EventUnregister. func (fd *regularFileFD) EventUnregister(e *waiter.Entry) { fd.mu.Lock() defer fd.mu.Unlock() fd.cachedFD.EventUnregister(e) } // Epollable implements FileDescriptionImpl.Epollable. func (fd *regularFileFD) Epollable() bool { fd.mu.Lock() defer fd.mu.Unlock() wrappedFD, err := fd.currentFDLocked(context.Background()) if err != nil { // TODO(b/171089913): Just use fd.cachedFD since EventRegister can't // return an error. This is obviously wrong, but at least consistent // with VFS1. log.Warningf("overlay.regularFileFD.Epollable: currentFDLocked failed: %v", err) wrappedFD = fd.cachedFD } return wrappedFD.Epollable() } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { wrappedFD, err := fd.getCurrentFD(ctx) if err != nil { return 0, err } defer wrappedFD.DecRef(ctx) return wrappedFD.PRead(ctx, dst, offset, opts) } // Read implements vfs.FileDescriptionImpl.Read. func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // Hold fd.mu during the read to serialize the file offset. fd.mu.Lock() defer fd.mu.Unlock() wrappedFD, err := fd.currentFDLocked(ctx) if err != nil { return 0, err } return wrappedFD.Read(ctx, dst, opts) } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { wrappedFD, err := fd.getCurrentFD(ctx) if err != nil { return 0, err } defer wrappedFD.DecRef(ctx) n, err := wrappedFD.PWrite(ctx, src, offset, opts) if err != nil { return n, err } return fd.updateSetUserGroupIDs(ctx, wrappedFD, n) } // Write implements vfs.FileDescriptionImpl.Write. func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // Hold fd.mu during the write to serialize the file offset. 
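	// Sketch: as with PWrite above, a successful write by an unprivileged
	// caller may cause the layer to clear the setuid/setgid bits, so
	// updateSetUserGroupIDs below re-reads the mode from the layer and
	// stores it into d.mode to keep the overlay's permission checks coherent.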
fd.mu.Lock() defer fd.mu.Unlock() wrappedFD, err := fd.currentFDLocked(ctx) if err != nil { return 0, err } n, err := wrappedFD.Write(ctx, src, opts) if err != nil { return n, err } return fd.updateSetUserGroupIDs(ctx, wrappedFD, n) } func (fd *regularFileFD) updateSetUserGroupIDs(ctx context.Context, wrappedFD *vfs.FileDescription, written int64) (int64, error) { // Writing can clear the setuid and/or setgid bits. We only have to // check this if something was written and one of those bits was set. dentry := fd.dentry() if written == 0 || dentry.mode.Load()&(linux.S_ISUID|linux.S_ISGID) == 0 { return written, nil } stat, err := wrappedFD.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_MODE}) if err != nil { return written, err } dentry.copyMu.Lock() defer dentry.copyMu.Unlock() dentry.mode.Store(uint32(stat.Mode)) return written, nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Hold fd.mu during the seek to serialize the file offset. fd.mu.Lock() defer fd.mu.Unlock() wrappedFD, err := fd.currentFDLocked(ctx) if err != nil { return 0, err } return wrappedFD.Seek(ctx, offset, whence) } // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *regularFileFD) Sync(ctx context.Context) error { fd.mu.Lock() if !fd.dentry().isCopiedUp() { fd.mu.Unlock() return nil } wrappedFD, err := fd.currentFDLocked(ctx) if err != nil { fd.mu.Unlock() return err } wrappedFD.IncRef() defer wrappedFD.DecRef(ctx) fd.mu.Unlock() return wrappedFD.Sync(ctx) } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (fd *regularFileFD) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { wrappedFD, err := fd.getCurrentFD(ctx) if err != nil { return 0, err } defer wrappedFD.DecRef(ctx) return wrappedFD.Ioctl(ctx, uio, sysno, args) } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { if err := fd.ensureMappable(ctx, opts); err != nil { return err } return vfs.GenericConfigureMMap(&fd.vfsfd, fd.dentry(), opts) } // ensureMappable ensures that fd.dentry().wrappedMappable is not nil. func (fd *regularFileFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error { d := fd.dentry() // Fast path if we already have a Mappable for the current top layer. if d.isMappable.Load() != 0 { return nil } // Only permit mmap of regular files, since other file types may have // unpredictable behavior when mmapped (e.g. /dev/zero). if d.mode.Load()&linux.S_IFMT != linux.S_IFREG { return linuxerr.ENODEV } // Get a Mappable for the current top layer. fd.mu.Lock() defer fd.mu.Unlock() d.copyMu.RLock() defer d.copyMu.RUnlock() if d.isMappable.Load() != 0 { return nil } wrappedFD, err := fd.currentFDLocked(ctx) if err != nil { return err } if err := wrappedFD.ConfigureMMap(ctx, opts); err != nil { return err } if opts.MappingIdentity != nil { opts.MappingIdentity.DecRef(ctx) opts.MappingIdentity = nil } // Use this Mappable for all mappings of this layer (unless we raced with // another call to ensureMappable). d.mapsMu.Lock() defer d.mapsMu.Unlock() d.dataMu.Lock() defer d.dataMu.Unlock() if d.wrappedMappable == nil { d.wrappedMappable = opts.Mappable d.isMappable.Store(1) } return nil } // AddMapping implements memmap.Mappable.AddMapping. 
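// Illustrative note (sketch): a mapping established while the file is still
// lower-only is recorded in d.lowerMappings as well as in the wrapped
// Mappable; copy-up can then invalidate those lower-layer mappings so that
// they are re-faulted against the upper file and stay coherent with later
// writes (a gVisor extension relative to Linux overlayfs, per the dentry
// field comments above).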
func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { d.mapsMu.Lock() defer d.mapsMu.Unlock() if err := d.wrappedMappable.AddMapping(ctx, ms, ar, offset, writable); err != nil { return err } if !d.isCopiedUp() { d.lowerMappings.AddMapping(ms, ar, offset, writable) } return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { d.mapsMu.Lock() defer d.mapsMu.Unlock() d.wrappedMappable.RemoveMapping(ctx, ms, ar, offset, writable) if !d.isCopiedUp() { d.lowerMappings.RemoveMapping(ms, ar, offset, writable) } } // CopyMapping implements memmap.Mappable.CopyMapping. func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { d.mapsMu.Lock() defer d.mapsMu.Unlock() if err := d.wrappedMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil { return err } if !d.isCopiedUp() { d.lowerMappings.AddMapping(ms, dstAR, offset, writable) } return nil } // Translate implements memmap.Mappable.Translate. func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { d.dataMu.RLock() defer d.dataMu.RUnlock() return d.wrappedMappable.Translate(ctx, required, optional, at) } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (d *dentry) InvalidateUnsavable(ctx context.Context) error { d.mapsMu.Lock() defer d.mapsMu.Unlock() return d.wrappedMappable.InvalidateUnsavable(ctx) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/rename_rwmutex.go000066400000000000000000000045321465435605700273200ustar00rootroot00000000000000package overlay import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type renameRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var renamelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type renamelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *renameRWMutex) Lock() { locking.AddGLock(renameprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *renameRWMutex) NestedLock(i renamelockNameIndex) { locking.AddGLock(renameprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *renameRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(renameprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *renameRWMutex) NestedUnlock(i renamelockNameIndex) { m.mu.Unlock() locking.DelGLock(renameprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *renameRWMutex) RLock() { locking.AddGLock(renameprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. 
// +checklocksignore func (m *renameRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(renameprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *renameRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *renameRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *renameRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var renameprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func renameinitLockNames() {} func init() { renameinitLockNames() renameprefixIndex = locking.NewMutexClass(reflect.TypeOf(renameRWMutex{}), renamelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/req_file_fd_mutex.go000066400000000000000000000033461465435605700277410ustar00rootroot00000000000000package overlay import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type regularFileFDMutex struct { mu sync.Mutex } var regularFileFDprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var regularFileFDlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type regularFileFDlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *regularFileFDMutex) Lock() { locking.AddGLock(regularFileFDprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *regularFileFDMutex) NestedLock(i regularFileFDlockNameIndex) { locking.AddGLock(regularFileFDprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *regularFileFDMutex) Unlock() { locking.DelGLock(regularFileFDprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *regularFileFDMutex) NestedUnlock(i regularFileFDlockNameIndex) { locking.DelGLock(regularFileFDprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func regularFileFDinitLockNames() {} func init() { regularFileFDinitLockNames() regularFileFDprefixIndex = locking.NewMutexClass(reflect.TypeOf(regularFileFDMutex{}), regularFileFDlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/overlay/save_restore.go000066400000000000000000000017451465435605700267620ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package overlay import ( "context" "gvisor.dev/gvisor/pkg/refs" ) func (d *dentry) afterLoad(context.Context) { if d.refs.Load() != -1 { refs.Register(d) } } // saveParent is called by stateify. func (d *dentry) saveParent() *dentry { return d.parent.Load() } // loadParent is called by stateify. func (d *dentry) loadParent(_ context.Context, parent *dentry) { d.parent.Store(parent) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/pipefs/000077500000000000000000000000001465435605700235305ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/pipefs/pipefs.go000066400000000000000000000153721465435605700253550ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package pipefs provides the filesystem implementation backing // Kernel.PipeMount. package pipefs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // +stateify savable type filesystemType struct{} // Name implements vfs.FilesystemType.Name. func (filesystemType) Name() string { return "pipefs" } // Release implements vfs.FilesystemType.Release. func (filesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("pipefs.filesystemType.GetFilesystem should never be called") } // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 } // NewFilesystem sets up and returns a new vfs.Filesystem implemented by pipefs. func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, err } fs := &filesystem{ devMinor: devMinor, } fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) return fs.Filesystem.VFSFilesystem(), nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode) b.PrependComponent(fmt.Sprintf("pipe:[%d]", inode.ino)) return vfs.PrependPathSyntheticError{} } // MountOptions implements vfs.FilesystemImpl.MountOptions. 
func (fs *filesystem) MountOptions() string { return "" } // inode implements kernfs.Inode. // // +stateify savable type inode struct { kernfs.InodeAnonymous kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeNoopRefCount kernfs.InodeWatches locks vfs.FileLocks pipe *pipe.VFSPipe attrMu sync.Mutex `state:"nosave"` ino uint64 uid auth.KUID gid auth.KGID // We use the creation timestamp for all of atime, mtime, and ctime. ctime ktime.Time } func newInode(ctx context.Context, fs *filesystem) *inode { creds := auth.CredentialsFromContext(ctx) return &inode{ pipe: pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize), ino: fs.Filesystem.NextIno(), uid: creds.EffectiveKUID, gid: creds.EffectiveKGID, ctime: ktime.NowFromContext(ctx), } } const pipeMode = 0600 | linux.S_IFIFO // CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { i.attrMu.Lock() defer i.attrMu.Unlock() return vfs.GenericCheckPermissions(creds, ats, pipeMode, i.uid, i.gid) } // Mode implements kernfs.Inode.Mode. func (i *inode) Mode() linux.FileMode { return pipeMode } // UID implements kernfs.Inode.UID. func (i *inode) UID() auth.KUID { i.attrMu.Lock() defer i.attrMu.Unlock() return auth.KUID(i.uid) } // GID implements kernfs.Inode.GID. func (i *inode) GID() auth.KGID { i.attrMu.Lock() defer i.attrMu.Unlock() return auth.KGID(i.gid) } // Stat implements kernfs.Inode.Stat. func (i *inode) Stat(_ context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds()) i.attrMu.Lock() defer i.attrMu.Unlock() return linux.Statx{ Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, Blksize: hostarch.PageSize, Nlink: 1, UID: uint32(i.uid), GID: uint32(i.gid), Mode: pipeMode, Ino: i.ino, Size: 0, Blocks: 0, Atime: ts, Ctime: ts, Mtime: ts, DevMajor: linux.UNNAMED_MAJOR, DevMinor: vfsfs.Impl().(*filesystem).devMinor, }, nil } // SetStat implements kernfs.Inode.SetStat. func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask&^(linux.STATX_UID|linux.STATX_GID) != 0 { return linuxerr.EPERM } i.attrMu.Lock() defer i.attrMu.Unlock() if err := vfs.CheckSetStat(ctx, creds, &opts, pipeMode, auth.KUID(i.uid), auth.KGID(i.gid)); err != nil { return err } if opts.Stat.Mask&linux.STATX_UID != 0 { i.uid = auth.KUID(opts.Stat.UID) } if opts.Stat.Mask&linux.STATX_GID != 0 { i.gid = auth.KGID(opts.Stat.GID) } return nil } // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY return i.pipe.Open(ctx, rp.Mount(), d.VFSDentry(), opts.Flags, &i.locks) } // StatFS implements kernfs.Inode.StatFS. func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.PIPEFS_MAGIC), nil } // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read // and write ends of a newly-created pipe, as for pipe(2) and pipe2(2). 
// // Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription, error) { fs := mnt.Filesystem().Impl().(*filesystem) inode := newInode(ctx, fs) var d kernfs.Dentry d.Init(&fs.Filesystem, inode) defer d.DecRef(ctx) return inode.pipe.ReaderWriterPair(ctx, mnt, d.VFSDentry(), flags) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/pipefs/pipefs_state_autogen.go000066400000000000000000000054651465435605700303010ustar00rootroot00000000000000// automatically generated by stateify. package pipefs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (f *filesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/pipefs.filesystemType" } func (f *filesystemType) StateFields() []string { return []string{} } func (f *filesystemType) beforeSave() {} // +checklocksignore func (f *filesystemType) StateSave(stateSinkObject state.Sink) { f.beforeSave() } func (f *filesystemType) afterLoad(context.Context) {} // +checklocksignore func (f *filesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/pipefs.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "Filesystem", "devMinor", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.Filesystem) stateSinkObject.Save(1, &fs.devMinor) } func (fs *filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.Filesystem) stateSourceObject.Load(1, &fs.devMinor) } func (i *inode) StateTypeName() string { return "pkg/sentry/fsimpl/pipefs.inode" } func (i *inode) StateFields() []string { return []string{ "InodeAnonymous", "InodeNotDirectory", "InodeNotSymlink", "InodeNoopRefCount", "InodeWatches", "locks", "pipe", "ino", "uid", "gid", "ctime", } } func (i *inode) beforeSave() {} // +checklocksignore func (i *inode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.InodeAnonymous) stateSinkObject.Save(1, &i.InodeNotDirectory) stateSinkObject.Save(2, &i.InodeNotSymlink) stateSinkObject.Save(3, &i.InodeNoopRefCount) stateSinkObject.Save(4, &i.InodeWatches) stateSinkObject.Save(5, &i.locks) stateSinkObject.Save(6, &i.pipe) stateSinkObject.Save(7, &i.ino) stateSinkObject.Save(8, &i.uid) stateSinkObject.Save(9, &i.gid) stateSinkObject.Save(10, &i.ctime) } func (i *inode) afterLoad(context.Context) {} // +checklocksignore func (i *inode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.InodeAnonymous) stateSourceObject.Load(1, &i.InodeNotDirectory) stateSourceObject.Load(2, &i.InodeNotSymlink) stateSourceObject.Load(3, &i.InodeNoopRefCount) stateSourceObject.Load(4, &i.InodeWatches) stateSourceObject.Load(5, &i.locks) stateSourceObject.Load(6, &i.pipe) stateSourceObject.Load(7, &i.ino) stateSourceObject.Load(8, &i.uid) stateSourceObject.Load(9, &i.gid) stateSourceObject.Load(10, &i.ctime) } func init() { state.Register((*filesystemType)(nil)) state.Register((*filesystem)(nil)) state.Register((*inode)(nil)) } 
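A minimal sketch of how NewConnectedPipeFDs above is meant to be driven by a pipe2(2)-style caller, under stated assumptions: the caller already holds a kernel.Task t and the pipefs mount produced by NewFilesystem (the package comment refers to it as Kernel.PipeMount), and installFDs is a purely hypothetical stand-in for the caller's FD-table plumbing, not an API from this tree.

package example

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// newPipeFDs creates the read and write ends of a pipe on pipeMount and hands
// them to installFDs. pipefs itself only honors O_NONBLOCK here; O_CLOEXEC
// must be applied when the descriptors are installed into the FD table.
// installFDs is an illustrative callback, not part of the surrounding tree.
func newPipeFDs(t *kernel.Task, pipeMount *vfs.Mount, flags int32, installFDs func(r, w *vfs.FileDescription, cloexec bool) error) error {
	if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
		return linuxerr.EINVAL
	}
	// pipeMount must have been created over the filesystem returned by
	// pipefs.NewFilesystem, per the precondition on NewConnectedPipeFDs.
	r, w, err := pipefs.NewConnectedPipeFDs(t, pipeMount, uint32(flags&linux.O_NONBLOCK))
	if err != nil {
		return err
	}
	// Both ends are returned with one reference; drop ours once they have
	// been installed (or if installation fails).
	defer r.DecRef(t)
	defer w.DecRef(t)
	return installFDs(r, w, flags&linux.O_CLOEXEC != 0)
}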
golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/000077500000000000000000000000001465435605700232055ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go000066400000000000000000000102171465435605700271610ustar00rootroot00000000000000package proc import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const fdDirInodeenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var fdDirInodeobj *fdDirInode // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type fdDirInodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *fdDirInodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *fdDirInodeRefs) RefType() string { return fmt.Sprintf("%T", fdDirInodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *fdDirInodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *fdDirInodeRefs) LogRefs() bool { return fdDirInodeenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *fdDirInodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *fdDirInodeRefs) IncRef() { v := r.refCount.Add(1) if fdDirInodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. 
// //go:nosplit func (r *fdDirInodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if fdDirInodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *fdDirInodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if fdDirInodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *fdDirInodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go000066400000000000000000000103271465435605700301760ustar00rootroot00000000000000package proc import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const fdInfoDirInodeenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var fdInfoDirInodeobj *fdInfoDirInode // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type fdInfoDirInodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *fdInfoDirInodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *fdInfoDirInodeRefs) RefType() string { return fmt.Sprintf("%T", fdInfoDirInodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *fdInfoDirInodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. 
func (r *fdInfoDirInodeRefs) LogRefs() bool { return fdInfoDirInodeenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *fdInfoDirInodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *fdInfoDirInodeRefs) IncRef() { v := r.refCount.Add(1) if fdInfoDirInodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *fdInfoDirInodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if fdInfoDirInodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *fdInfoDirInodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if fdInfoDirInodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *fdInfoDirInodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/filesystem.go000066400000000000000000000122221465435605700257170ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package proc implements a partial in-memory file system for procfs. package proc import ( "fmt" "strconv" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) const ( // Name is the default filesystem name. Name = "proc" defaultMaxCachedDentries = uint64(1000) ) // FilesystemType is the factory class for procfs. // // +stateify savable type FilesystemType struct{} // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // Release implements vfs.FilesystemType.Release. 
func (FilesystemType) Release(ctx context.Context) {} // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 } func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { d, err := fs.GetDentryAt(ctx, rp, vfs.GetDentryOptions{}) if err != nil { return linux.Statfs{}, err } d.DecRef(ctx) return vfs.GenericStatFS(linux.PROC_SUPER_MAGIC), nil } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { k := kernel.KernelFromContext(ctx) if k == nil { return nil, nil, fmt.Errorf("procfs requires a kernel") } pidns := kernel.PIDNamespaceFromContext(ctx) if pidns == nil { return nil, nil, fmt.Errorf("procfs requires a PID namespace") } devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } mopts := vfs.GenericParseMountOptions(opts.Data) maxCachedDentries := defaultMaxCachedDentries if str, ok := mopts["dentry_cache_limit"]; ok { delete(mopts, "dentry_cache_limit") maxCachedDentries, err = strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("proc.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) return nil, nil, linuxerr.EINVAL } } procfs := &filesystem{ devMinor: devMinor, } procfs.MaxCachedDentries = maxCachedDentries procfs.VFSFilesystem().Init(vfsObj, &ft, procfs) var internalData *InternalData if opts.InternalData == nil { internalData = &InternalData{} } else { internalData = opts.InternalData.(*InternalData) } inode := procfs.newTasksInode(ctx, k, pidns, internalData) var dentry kernfs.Dentry dentry.InitRoot(&procfs.Filesystem, inode) return procfs.VFSFilesystem(), dentry.VFSDentry(), nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return fmt.Sprintf("dentry_cache_limit=%d", fs.MaxCachedDentries) } // dynamicInode is an overfitted interface for common Inodes with // dynamicByteSource types used in procfs. // // +stateify savable type dynamicInode interface { kernfs.Inode vfs.DynamicBytesSource Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) } func (fs *filesystem) newInode(ctx context.Context, creds *auth.Credentials, perm linux.FileMode, inode dynamicInode) dynamicInode { inode.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), inode, perm) return inode } // +stateify savable type staticFile struct { kernfs.DynamicBytesFile vfs.StaticData } var _ dynamicInode = (*staticFile)(nil) func newStaticFile(data string) *staticFile { return &staticFile{StaticData: vfs.StaticData{Data: data}} } func (fs *filesystem) newStaticDir(ctx context.Context, creds *auth.Credentials, children map[string]kernfs.Inode) kernfs.Inode { return kernfs.NewStaticDir(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, children, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) } // InternalData contains internal data passed in to the procfs mount via // vfs.GetFilesystemOptions.InternalData. 
// // +stateify savable type InternalData struct { ExtraInternalData Cgroups map[string]string } // +stateify savable type implStatFS struct{} // StatFS implements kernfs.Inode.StatFS. func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.PROC_SUPER_MAGIC), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/proc_impl.go000066400000000000000000000021151465435605700255170ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package proc import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // ExtraInternalData is an empty struct that could contain extra data for the procfs. // // +stateify savable type ExtraInternalData struct{} func (fs *filesystem) newTasksInodeExtra(context.Context, *auth.Credentials, *InternalData, *kernel.Kernel, map[string]kernfs.Inode) { } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/proc_impl_state_autogen.go000066400000000000000000000013141465435605700304410ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package proc import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *ExtraInternalData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.ExtraInternalData" } func (e *ExtraInternalData) StateFields() []string { return []string{} } func (e *ExtraInternalData) beforeSave() {} // +checklocksignore func (e *ExtraInternalData) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ExtraInternalData) afterLoad(context.Context) {} // +checklocksignore func (e *ExtraInternalData) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func init() { state.Register((*ExtraInternalData)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/proc_state_autogen.go000066400000000000000000002110331465435605700274210ustar00rootroot00000000000000// automatically generated by stateify. 
package proc import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *fdDirInodeRefs) StateTypeName() string { return "pkg/sentry/fsimpl/proc.fdDirInodeRefs" } func (r *fdDirInodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *fdDirInodeRefs) beforeSave() {} // +checklocksignore func (r *fdDirInodeRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *fdDirInodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (r *fdInfoDirInodeRefs) StateTypeName() string { return "pkg/sentry/fsimpl/proc.fdInfoDirInodeRefs" } func (r *fdInfoDirInodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *fdInfoDirInodeRefs) beforeSave() {} // +checklocksignore func (r *fdInfoDirInodeRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *fdInfoDirInodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (ft *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/proc.FilesystemType" } func (ft *FilesystemType) StateFields() []string { return []string{} } func (ft *FilesystemType) beforeSave() {} // +checklocksignore func (ft *FilesystemType) StateSave(stateSinkObject state.Sink) { ft.beforeSave() } func (ft *FilesystemType) afterLoad(context.Context) {} // +checklocksignore func (ft *FilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/proc.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "Filesystem", "devMinor", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.Filesystem) stateSinkObject.Save(1, &fs.devMinor) } func (fs *filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.Filesystem) stateSourceObject.Load(1, &fs.devMinor) } func (s *staticFile) StateTypeName() string { return "pkg/sentry/fsimpl/proc.staticFile" } func (s *staticFile) StateFields() []string { return []string{ "DynamicBytesFile", "StaticData", } } func (s *staticFile) beforeSave() {} // +checklocksignore func (s *staticFile) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.DynamicBytesFile) stateSinkObject.Save(1, &s.StaticData) } func (s *staticFile) afterLoad(context.Context) {} // +checklocksignore func (s *staticFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.DynamicBytesFile) stateSourceObject.Load(1, &s.StaticData) } func (i *InternalData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.InternalData" } func (i *InternalData) StateFields() []string { return []string{ "ExtraInternalData", "Cgroups", } } func (i *InternalData) beforeSave() {} // +checklocksignore func (i *InternalData) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.ExtraInternalData) stateSinkObject.Save(1, &i.Cgroups) } func (i *InternalData) afterLoad(context.Context) {} // +checklocksignore func (i *InternalData) 
StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.ExtraInternalData) stateSourceObject.Load(1, &i.Cgroups) } func (i *implStatFS) StateTypeName() string { return "pkg/sentry/fsimpl/proc.implStatFS" } func (i *implStatFS) StateFields() []string { return []string{} } func (i *implStatFS) beforeSave() {} // +checklocksignore func (i *implStatFS) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i *implStatFS) afterLoad(context.Context) {} // +checklocksignore func (i *implStatFS) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (i *subtasksInode) StateTypeName() string { return "pkg/sentry/fsimpl/proc.subtasksInode" } func (i *subtasksInode) StateFields() []string { return []string{ "implStatFS", "InodeAlwaysValid", "InodeAttrs", "InodeDirectoryNoNewChildren", "InodeNotAnonymous", "InodeNotSymlink", "InodeTemporary", "InodeWatches", "OrderedChildren", "subtasksInodeRefs", "locks", "fs", "task", "pidns", "cgroupControllers", } } func (i *subtasksInode) beforeSave() {} // +checklocksignore func (i *subtasksInode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.implStatFS) stateSinkObject.Save(1, &i.InodeAlwaysValid) stateSinkObject.Save(2, &i.InodeAttrs) stateSinkObject.Save(3, &i.InodeDirectoryNoNewChildren) stateSinkObject.Save(4, &i.InodeNotAnonymous) stateSinkObject.Save(5, &i.InodeNotSymlink) stateSinkObject.Save(6, &i.InodeTemporary) stateSinkObject.Save(7, &i.InodeWatches) stateSinkObject.Save(8, &i.OrderedChildren) stateSinkObject.Save(9, &i.subtasksInodeRefs) stateSinkObject.Save(10, &i.locks) stateSinkObject.Save(11, &i.fs) stateSinkObject.Save(12, &i.task) stateSinkObject.Save(13, &i.pidns) stateSinkObject.Save(14, &i.cgroupControllers) } func (i *subtasksInode) afterLoad(context.Context) {} // +checklocksignore func (i *subtasksInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.implStatFS) stateSourceObject.Load(1, &i.InodeAlwaysValid) stateSourceObject.Load(2, &i.InodeAttrs) stateSourceObject.Load(3, &i.InodeDirectoryNoNewChildren) stateSourceObject.Load(4, &i.InodeNotAnonymous) stateSourceObject.Load(5, &i.InodeNotSymlink) stateSourceObject.Load(6, &i.InodeTemporary) stateSourceObject.Load(7, &i.InodeWatches) stateSourceObject.Load(8, &i.OrderedChildren) stateSourceObject.Load(9, &i.subtasksInodeRefs) stateSourceObject.Load(10, &i.locks) stateSourceObject.Load(11, &i.fs) stateSourceObject.Load(12, &i.task) stateSourceObject.Load(13, &i.pidns) stateSourceObject.Load(14, &i.cgroupControllers) } func (fd *subtasksFD) StateTypeName() string { return "pkg/sentry/fsimpl/proc.subtasksFD" } func (fd *subtasksFD) StateFields() []string { return []string{ "GenericDirectoryFD", "task", } } func (fd *subtasksFD) beforeSave() {} // +checklocksignore func (fd *subtasksFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.GenericDirectoryFD) stateSinkObject.Save(1, &fd.task) } func (fd *subtasksFD) afterLoad(context.Context) {} // +checklocksignore func (fd *subtasksFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.GenericDirectoryFD) stateSourceObject.Load(1, &fd.task) } func (r *subtasksInodeRefs) StateTypeName() string { return "pkg/sentry/fsimpl/proc.subtasksInodeRefs" } func (r *subtasksInodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *subtasksInodeRefs) beforeSave() {} // +checklocksignore func (r *subtasksInodeRefs) 
StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *subtasksInodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (i *taskInode) StateTypeName() string { return "pkg/sentry/fsimpl/proc.taskInode" } func (i *taskInode) StateFields() []string { return []string{ "implStatFS", "InodeAttrs", "InodeDirectoryNoNewChildren", "InodeNotAnonymous", "InodeNotSymlink", "InodeTemporary", "InodeWatches", "OrderedChildren", "taskInodeRefs", "locks", "task", } } func (i *taskInode) beforeSave() {} // +checklocksignore func (i *taskInode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.implStatFS) stateSinkObject.Save(1, &i.InodeAttrs) stateSinkObject.Save(2, &i.InodeDirectoryNoNewChildren) stateSinkObject.Save(3, &i.InodeNotAnonymous) stateSinkObject.Save(4, &i.InodeNotSymlink) stateSinkObject.Save(5, &i.InodeTemporary) stateSinkObject.Save(6, &i.InodeWatches) stateSinkObject.Save(7, &i.OrderedChildren) stateSinkObject.Save(8, &i.taskInodeRefs) stateSinkObject.Save(9, &i.locks) stateSinkObject.Save(10, &i.task) } func (i *taskInode) afterLoad(context.Context) {} // +checklocksignore func (i *taskInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.implStatFS) stateSourceObject.Load(1, &i.InodeAttrs) stateSourceObject.Load(2, &i.InodeDirectoryNoNewChildren) stateSourceObject.Load(3, &i.InodeNotAnonymous) stateSourceObject.Load(4, &i.InodeNotSymlink) stateSourceObject.Load(5, &i.InodeTemporary) stateSourceObject.Load(6, &i.InodeWatches) stateSourceObject.Load(7, &i.OrderedChildren) stateSourceObject.Load(8, &i.taskInodeRefs) stateSourceObject.Load(9, &i.locks) stateSourceObject.Load(10, &i.task) } func (i *taskOwnedInode) StateTypeName() string { return "pkg/sentry/fsimpl/proc.taskOwnedInode" } func (i *taskOwnedInode) StateFields() []string { return []string{ "Inode", "owner", } } func (i *taskOwnedInode) beforeSave() {} // +checklocksignore func (i *taskOwnedInode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Inode) stateSinkObject.Save(1, &i.owner) } func (i *taskOwnedInode) afterLoad(context.Context) {} // +checklocksignore func (i *taskOwnedInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Inode) stateSourceObject.Load(1, &i.owner) } func (i *fdDir) StateTypeName() string { return "pkg/sentry/fsimpl/proc.fdDir" } func (i *fdDir) StateFields() []string { return []string{ "locks", "fs", "task", "produceSymlink", } } func (i *fdDir) beforeSave() {} // +checklocksignore func (i *fdDir) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.locks) stateSinkObject.Save(1, &i.fs) stateSinkObject.Save(2, &i.task) stateSinkObject.Save(3, &i.produceSymlink) } func (i *fdDir) afterLoad(context.Context) {} // +checklocksignore func (i *fdDir) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.locks) stateSourceObject.Load(1, &i.fs) stateSourceObject.Load(2, &i.task) stateSourceObject.Load(3, &i.produceSymlink) } func (i *fdDirInode) StateTypeName() string { return "pkg/sentry/fsimpl/proc.fdDirInode" } func (i *fdDirInode) StateFields() []string { return []string{ "fdDir", "fdDirInodeRefs", "implStatFS", "InodeAlwaysValid", "InodeAttrs", "InodeDirectoryNoNewChildren", 
"InodeNotAnonymous", "InodeNotSymlink", "InodeTemporary", "InodeWatches", "OrderedChildren", } } func (i *fdDirInode) beforeSave() {} // +checklocksignore func (i *fdDirInode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.fdDir) stateSinkObject.Save(1, &i.fdDirInodeRefs) stateSinkObject.Save(2, &i.implStatFS) stateSinkObject.Save(3, &i.InodeAlwaysValid) stateSinkObject.Save(4, &i.InodeAttrs) stateSinkObject.Save(5, &i.InodeDirectoryNoNewChildren) stateSinkObject.Save(6, &i.InodeNotAnonymous) stateSinkObject.Save(7, &i.InodeNotSymlink) stateSinkObject.Save(8, &i.InodeTemporary) stateSinkObject.Save(9, &i.InodeWatches) stateSinkObject.Save(10, &i.OrderedChildren) } func (i *fdDirInode) afterLoad(context.Context) {} // +checklocksignore func (i *fdDirInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.fdDir) stateSourceObject.Load(1, &i.fdDirInodeRefs) stateSourceObject.Load(2, &i.implStatFS) stateSourceObject.Load(3, &i.InodeAlwaysValid) stateSourceObject.Load(4, &i.InodeAttrs) stateSourceObject.Load(5, &i.InodeDirectoryNoNewChildren) stateSourceObject.Load(6, &i.InodeNotAnonymous) stateSourceObject.Load(7, &i.InodeNotSymlink) stateSourceObject.Load(8, &i.InodeTemporary) stateSourceObject.Load(9, &i.InodeWatches) stateSourceObject.Load(10, &i.OrderedChildren) } func (s *fdSymlink) StateTypeName() string { return "pkg/sentry/fsimpl/proc.fdSymlink" } func (s *fdSymlink) StateFields() []string { return []string{ "implStatFS", "InodeAttrs", "InodeNoopRefCount", "InodeNotAnonymous", "InodeSymlink", "InodeWatches", "fs", "task", "fd", } } func (s *fdSymlink) beforeSave() {} // +checklocksignore func (s *fdSymlink) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.implStatFS) stateSinkObject.Save(1, &s.InodeAttrs) stateSinkObject.Save(2, &s.InodeNoopRefCount) stateSinkObject.Save(3, &s.InodeNotAnonymous) stateSinkObject.Save(4, &s.InodeSymlink) stateSinkObject.Save(5, &s.InodeWatches) stateSinkObject.Save(6, &s.fs) stateSinkObject.Save(7, &s.task) stateSinkObject.Save(8, &s.fd) } func (s *fdSymlink) afterLoad(context.Context) {} // +checklocksignore func (s *fdSymlink) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.implStatFS) stateSourceObject.Load(1, &s.InodeAttrs) stateSourceObject.Load(2, &s.InodeNoopRefCount) stateSourceObject.Load(3, &s.InodeNotAnonymous) stateSourceObject.Load(4, &s.InodeSymlink) stateSourceObject.Load(5, &s.InodeWatches) stateSourceObject.Load(6, &s.fs) stateSourceObject.Load(7, &s.task) stateSourceObject.Load(8, &s.fd) } func (i *fdInfoDirInode) StateTypeName() string { return "pkg/sentry/fsimpl/proc.fdInfoDirInode" } func (i *fdInfoDirInode) StateFields() []string { return []string{ "fdDir", "fdInfoDirInodeRefs", "implStatFS", "InodeAlwaysValid", "InodeAttrs", "InodeDirectoryNoNewChildren", "InodeNotAnonymous", "InodeNotSymlink", "InodeTemporary", "InodeWatches", "OrderedChildren", } } func (i *fdInfoDirInode) beforeSave() {} // +checklocksignore func (i *fdInfoDirInode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.fdDir) stateSinkObject.Save(1, &i.fdInfoDirInodeRefs) stateSinkObject.Save(2, &i.implStatFS) stateSinkObject.Save(3, &i.InodeAlwaysValid) stateSinkObject.Save(4, &i.InodeAttrs) stateSinkObject.Save(5, &i.InodeDirectoryNoNewChildren) stateSinkObject.Save(6, &i.InodeNotAnonymous) stateSinkObject.Save(7, &i.InodeNotSymlink) stateSinkObject.Save(8, &i.InodeTemporary) 
stateSinkObject.Save(9, &i.InodeWatches) stateSinkObject.Save(10, &i.OrderedChildren) } func (i *fdInfoDirInode) afterLoad(context.Context) {} // +checklocksignore func (i *fdInfoDirInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.fdDir) stateSourceObject.Load(1, &i.fdInfoDirInodeRefs) stateSourceObject.Load(2, &i.implStatFS) stateSourceObject.Load(3, &i.InodeAlwaysValid) stateSourceObject.Load(4, &i.InodeAttrs) stateSourceObject.Load(5, &i.InodeDirectoryNoNewChildren) stateSourceObject.Load(6, &i.InodeNotAnonymous) stateSourceObject.Load(7, &i.InodeNotSymlink) stateSourceObject.Load(8, &i.InodeTemporary) stateSourceObject.Load(9, &i.InodeWatches) stateSourceObject.Load(10, &i.OrderedChildren) } func (d *fdInfoData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.fdInfoData" } func (d *fdInfoData) StateFields() []string { return []string{ "DynamicBytesFile", "fs", "task", "fd", } } func (d *fdInfoData) beforeSave() {} // +checklocksignore func (d *fdInfoData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.fs) stateSinkObject.Save(2, &d.task) stateSinkObject.Save(3, &d.fd) } func (d *fdInfoData) afterLoad(context.Context) {} // +checklocksignore func (d *fdInfoData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.fs) stateSourceObject.Load(2, &d.task) stateSourceObject.Load(3, &d.fd) } func (d *auxvData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.auxvData" } func (d *auxvData) StateFields() []string { return []string{ "DynamicBytesFile", "task", } } func (d *auxvData) beforeSave() {} // +checklocksignore func (d *auxvData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.task) } func (d *auxvData) afterLoad(context.Context) {} // +checklocksignore func (d *auxvData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.task) } func (d *metadataData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.metadataData" } func (d *metadataData) StateFields() []string { return []string{ "DynamicBytesFile", "task", "metaType", } } func (d *metadataData) beforeSave() {} // +checklocksignore func (d *metadataData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.task) stateSinkObject.Save(2, &d.metaType) } func (d *metadataData) afterLoad(context.Context) {} // +checklocksignore func (d *metadataData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.task) stateSourceObject.Load(2, &d.metaType) } func (i *commInode) StateTypeName() string { return "pkg/sentry/fsimpl/proc.commInode" } func (i *commInode) StateFields() []string { return []string{ "DynamicBytesFile", "task", } } func (i *commInode) beforeSave() {} // +checklocksignore func (i *commInode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.DynamicBytesFile) stateSinkObject.Save(1, &i.task) } func (i *commInode) afterLoad(context.Context) {} // +checklocksignore func (i *commInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.DynamicBytesFile) stateSourceObject.Load(1, &i.task) } func (d 
*commData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.commData" } func (d *commData) StateFields() []string { return []string{ "DynamicBytesFile", "task", } } func (d *commData) beforeSave() {} // +checklocksignore func (d *commData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.task) } func (d *commData) afterLoad(context.Context) {} // +checklocksignore func (d *commData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.task) } func (d *idMapData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.idMapData" } func (d *idMapData) StateFields() []string { return []string{ "DynamicBytesFile", "task", "gids", } } func (d *idMapData) beforeSave() {} // +checklocksignore func (d *idMapData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.task) stateSinkObject.Save(2, &d.gids) } func (d *idMapData) afterLoad(context.Context) {} // +checklocksignore func (d *idMapData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.task) stateSourceObject.Load(2, &d.gids) } func (f *memInode) StateTypeName() string { return "pkg/sentry/fsimpl/proc.memInode" } func (f *memInode) StateFields() []string { return []string{ "InodeAttrs", "InodeNoStatFS", "InodeNoopRefCount", "InodeNotAnonymous", "InodeNotDirectory", "InodeNotSymlink", "InodeWatches", "task", "locks", } } func (f *memInode) beforeSave() {} // +checklocksignore func (f *memInode) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.InodeAttrs) stateSinkObject.Save(1, &f.InodeNoStatFS) stateSinkObject.Save(2, &f.InodeNoopRefCount) stateSinkObject.Save(3, &f.InodeNotAnonymous) stateSinkObject.Save(4, &f.InodeNotDirectory) stateSinkObject.Save(5, &f.InodeNotSymlink) stateSinkObject.Save(6, &f.InodeWatches) stateSinkObject.Save(7, &f.task) stateSinkObject.Save(8, &f.locks) } func (f *memInode) afterLoad(context.Context) {} // +checklocksignore func (f *memInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.InodeAttrs) stateSourceObject.Load(1, &f.InodeNoStatFS) stateSourceObject.Load(2, &f.InodeNoopRefCount) stateSourceObject.Load(3, &f.InodeNotAnonymous) stateSourceObject.Load(4, &f.InodeNotDirectory) stateSourceObject.Load(5, &f.InodeNotSymlink) stateSourceObject.Load(6, &f.InodeWatches) stateSourceObject.Load(7, &f.task) stateSourceObject.Load(8, &f.locks) } func (fd *memFD) StateTypeName() string { return "pkg/sentry/fsimpl/proc.memFD" } func (fd *memFD) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "LockFD", "inode", "offset", } } func (fd *memFD) beforeSave() {} // +checklocksignore func (fd *memFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.LockFD) stateSinkObject.Save(3, &fd.inode) stateSinkObject.Save(4, &fd.offset) } func (fd *memFD) afterLoad(context.Context) {} // +checklocksignore func (fd *memFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.LockFD) stateSourceObject.Load(3, &fd.inode) 
stateSourceObject.Load(4, &fd.offset) } func (d *limitsData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.limitsData" } func (d *limitsData) StateFields() []string { return []string{ "DynamicBytesFile", "task", } } func (d *limitsData) beforeSave() {} // +checklocksignore func (d *limitsData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.task) } func (d *limitsData) afterLoad(context.Context) {} // +checklocksignore func (d *limitsData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.task) } func (d *mapsData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.mapsData" } func (d *mapsData) StateFields() []string { return []string{ "DynamicBytesFile", "task", } } func (d *mapsData) beforeSave() {} // +checklocksignore func (d *mapsData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.task) } func (d *mapsData) afterLoad(context.Context) {} // +checklocksignore func (d *mapsData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.task) } func (d *smapsData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.smapsData" } func (d *smapsData) StateFields() []string { return []string{ "DynamicBytesFile", "task", } } func (d *smapsData) beforeSave() {} // +checklocksignore func (d *smapsData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.task) } func (d *smapsData) afterLoad(context.Context) {} // +checklocksignore func (d *smapsData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.task) } func (s *taskStatData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.taskStatData" } func (s *taskStatData) StateFields() []string { return []string{ "DynamicBytesFile", "task", "tgstats", "pidns", } } func (s *taskStatData) beforeSave() {} // +checklocksignore func (s *taskStatData) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.DynamicBytesFile) stateSinkObject.Save(1, &s.task) stateSinkObject.Save(2, &s.tgstats) stateSinkObject.Save(3, &s.pidns) } func (s *taskStatData) afterLoad(context.Context) {} // +checklocksignore func (s *taskStatData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.DynamicBytesFile) stateSourceObject.Load(1, &s.task) stateSourceObject.Load(2, &s.tgstats) stateSourceObject.Load(3, &s.pidns) } func (s *statmData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.statmData" } func (s *statmData) StateFields() []string { return []string{ "DynamicBytesFile", "task", } } func (s *statmData) beforeSave() {} // +checklocksignore func (s *statmData) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.DynamicBytesFile) stateSinkObject.Save(1, &s.task) } func (s *statmData) afterLoad(context.Context) {} // +checklocksignore func (s *statmData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.DynamicBytesFile) stateSourceObject.Load(1, &s.task) } func (s *statusInode) StateTypeName() string { return "pkg/sentry/fsimpl/proc.statusInode" } func (s *statusInode) StateFields() []string { return 
[]string{ "InodeAttrs", "InodeNoStatFS", "InodeNoopRefCount", "InodeNotAnonymous", "InodeNotDirectory", "InodeNotSymlink", "InodeWatches", "task", "pidns", "locks", } } func (s *statusInode) beforeSave() {} // +checklocksignore func (s *statusInode) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.InodeAttrs) stateSinkObject.Save(1, &s.InodeNoStatFS) stateSinkObject.Save(2, &s.InodeNoopRefCount) stateSinkObject.Save(3, &s.InodeNotAnonymous) stateSinkObject.Save(4, &s.InodeNotDirectory) stateSinkObject.Save(5, &s.InodeNotSymlink) stateSinkObject.Save(6, &s.InodeWatches) stateSinkObject.Save(7, &s.task) stateSinkObject.Save(8, &s.pidns) stateSinkObject.Save(9, &s.locks) } func (s *statusInode) afterLoad(context.Context) {} // +checklocksignore func (s *statusInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.InodeAttrs) stateSourceObject.Load(1, &s.InodeNoStatFS) stateSourceObject.Load(2, &s.InodeNoopRefCount) stateSourceObject.Load(3, &s.InodeNotAnonymous) stateSourceObject.Load(4, &s.InodeNotDirectory) stateSourceObject.Load(5, &s.InodeNotSymlink) stateSourceObject.Load(6, &s.InodeWatches) stateSourceObject.Load(7, &s.task) stateSourceObject.Load(8, &s.pidns) stateSourceObject.Load(9, &s.locks) } func (s *statusFD) StateTypeName() string { return "pkg/sentry/fsimpl/proc.statusFD" } func (s *statusFD) StateFields() []string { return []string{ "statusFDLowerBase", "DynamicBytesFileDescriptionImpl", "LockFD", "vfsfd", "inode", "task", "pidns", "userns", } } func (s *statusFD) beforeSave() {} // +checklocksignore func (s *statusFD) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.statusFDLowerBase) stateSinkObject.Save(1, &s.DynamicBytesFileDescriptionImpl) stateSinkObject.Save(2, &s.LockFD) stateSinkObject.Save(3, &s.vfsfd) stateSinkObject.Save(4, &s.inode) stateSinkObject.Save(5, &s.task) stateSinkObject.Save(6, &s.pidns) stateSinkObject.Save(7, &s.userns) } func (s *statusFD) afterLoad(context.Context) {} // +checklocksignore func (s *statusFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.statusFDLowerBase) stateSourceObject.Load(1, &s.DynamicBytesFileDescriptionImpl) stateSourceObject.Load(2, &s.LockFD) stateSourceObject.Load(3, &s.vfsfd) stateSourceObject.Load(4, &s.inode) stateSourceObject.Load(5, &s.task) stateSourceObject.Load(6, &s.pidns) stateSourceObject.Load(7, &s.userns) } func (s *statusFDLowerBase) StateTypeName() string { return "pkg/sentry/fsimpl/proc.statusFDLowerBase" } func (s *statusFDLowerBase) StateFields() []string { return []string{ "FileDescriptionDefaultImpl", } } func (s *statusFDLowerBase) beforeSave() {} // +checklocksignore func (s *statusFDLowerBase) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.FileDescriptionDefaultImpl) } func (s *statusFDLowerBase) afterLoad(context.Context) {} // +checklocksignore func (s *statusFDLowerBase) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.FileDescriptionDefaultImpl) } func (i *ioData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.ioData" } func (i *ioData) StateFields() []string { return []string{ "DynamicBytesFile", "ioUsage", } } func (i *ioData) beforeSave() {} // +checklocksignore func (i *ioData) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.DynamicBytesFile) stateSinkObject.Save(1, &i.ioUsage) } func (i *ioData) 
afterLoad(context.Context) {} // +checklocksignore func (i *ioData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.DynamicBytesFile) stateSourceObject.Load(1, &i.ioUsage) } func (o *oomScoreAdj) StateTypeName() string { return "pkg/sentry/fsimpl/proc.oomScoreAdj" } func (o *oomScoreAdj) StateFields() []string { return []string{ "DynamicBytesFile", "task", } } func (o *oomScoreAdj) beforeSave() {} // +checklocksignore func (o *oomScoreAdj) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.DynamicBytesFile) stateSinkObject.Save(1, &o.task) } func (o *oomScoreAdj) afterLoad(context.Context) {} // +checklocksignore func (o *oomScoreAdj) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.DynamicBytesFile) stateSourceObject.Load(1, &o.task) } func (s *exeSymlink) StateTypeName() string { return "pkg/sentry/fsimpl/proc.exeSymlink" } func (s *exeSymlink) StateFields() []string { return []string{ "implStatFS", "InodeAttrs", "InodeNoopRefCount", "InodeNotAnonymous", "InodeSymlink", "InodeWatches", "fs", "task", } } func (s *exeSymlink) beforeSave() {} // +checklocksignore func (s *exeSymlink) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.implStatFS) stateSinkObject.Save(1, &s.InodeAttrs) stateSinkObject.Save(2, &s.InodeNoopRefCount) stateSinkObject.Save(3, &s.InodeNotAnonymous) stateSinkObject.Save(4, &s.InodeSymlink) stateSinkObject.Save(5, &s.InodeWatches) stateSinkObject.Save(6, &s.fs) stateSinkObject.Save(7, &s.task) } func (s *exeSymlink) afterLoad(context.Context) {} // +checklocksignore func (s *exeSymlink) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.implStatFS) stateSourceObject.Load(1, &s.InodeAttrs) stateSourceObject.Load(2, &s.InodeNoopRefCount) stateSourceObject.Load(3, &s.InodeNotAnonymous) stateSourceObject.Load(4, &s.InodeSymlink) stateSourceObject.Load(5, &s.InodeWatches) stateSourceObject.Load(6, &s.fs) stateSourceObject.Load(7, &s.task) } func (s *cwdSymlink) StateTypeName() string { return "pkg/sentry/fsimpl/proc.cwdSymlink" } func (s *cwdSymlink) StateFields() []string { return []string{ "implStatFS", "InodeAttrs", "InodeNoopRefCount", "InodeNotAnonymous", "InodeSymlink", "InodeWatches", "fs", "task", } } func (s *cwdSymlink) beforeSave() {} // +checklocksignore func (s *cwdSymlink) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.implStatFS) stateSinkObject.Save(1, &s.InodeAttrs) stateSinkObject.Save(2, &s.InodeNoopRefCount) stateSinkObject.Save(3, &s.InodeNotAnonymous) stateSinkObject.Save(4, &s.InodeSymlink) stateSinkObject.Save(5, &s.InodeWatches) stateSinkObject.Save(6, &s.fs) stateSinkObject.Save(7, &s.task) } func (s *cwdSymlink) afterLoad(context.Context) {} // +checklocksignore func (s *cwdSymlink) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.implStatFS) stateSourceObject.Load(1, &s.InodeAttrs) stateSourceObject.Load(2, &s.InodeNoopRefCount) stateSourceObject.Load(3, &s.InodeNotAnonymous) stateSourceObject.Load(4, &s.InodeSymlink) stateSourceObject.Load(5, &s.InodeWatches) stateSourceObject.Load(6, &s.fs) stateSourceObject.Load(7, &s.task) } func (s *rootSymlink) StateTypeName() string { return "pkg/sentry/fsimpl/proc.rootSymlink" } func (s *rootSymlink) StateFields() []string { return []string{ "implStatFS", "InodeAttrs", "InodeNoopRefCount", "InodeNotAnonymous", "InodeSymlink", 
"InodeWatches", "fs", "task", } } func (s *rootSymlink) beforeSave() {} // +checklocksignore func (s *rootSymlink) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.implStatFS) stateSinkObject.Save(1, &s.InodeAttrs) stateSinkObject.Save(2, &s.InodeNoopRefCount) stateSinkObject.Save(3, &s.InodeNotAnonymous) stateSinkObject.Save(4, &s.InodeSymlink) stateSinkObject.Save(5, &s.InodeWatches) stateSinkObject.Save(6, &s.fs) stateSinkObject.Save(7, &s.task) } func (s *rootSymlink) afterLoad(context.Context) {} // +checklocksignore func (s *rootSymlink) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.implStatFS) stateSourceObject.Load(1, &s.InodeAttrs) stateSourceObject.Load(2, &s.InodeNoopRefCount) stateSourceObject.Load(3, &s.InodeNotAnonymous) stateSourceObject.Load(4, &s.InodeSymlink) stateSourceObject.Load(5, &s.InodeWatches) stateSourceObject.Load(6, &s.fs) stateSourceObject.Load(7, &s.task) } func (i *mountInfoData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.mountInfoData" } func (i *mountInfoData) StateFields() []string { return []string{ "DynamicBytesFile", "fs", "task", } } func (i *mountInfoData) beforeSave() {} // +checklocksignore func (i *mountInfoData) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.DynamicBytesFile) stateSinkObject.Save(1, &i.fs) stateSinkObject.Save(2, &i.task) } func (i *mountInfoData) afterLoad(context.Context) {} // +checklocksignore func (i *mountInfoData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.DynamicBytesFile) stateSourceObject.Load(1, &i.fs) stateSourceObject.Load(2, &i.task) } func (i *mountsData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.mountsData" } func (i *mountsData) StateFields() []string { return []string{ "DynamicBytesFile", "fs", "task", } } func (i *mountsData) beforeSave() {} // +checklocksignore func (i *mountsData) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.DynamicBytesFile) stateSinkObject.Save(1, &i.fs) stateSinkObject.Save(2, &i.task) } func (i *mountsData) afterLoad(context.Context) {} // +checklocksignore func (i *mountsData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.DynamicBytesFile) stateSourceObject.Load(1, &i.fs) stateSourceObject.Load(2, &i.task) } func (s *namespaceSymlink) StateTypeName() string { return "pkg/sentry/fsimpl/proc.namespaceSymlink" } func (s *namespaceSymlink) StateFields() []string { return []string{ "StaticSymlink", "task", "nsType", } } func (s *namespaceSymlink) beforeSave() {} // +checklocksignore func (s *namespaceSymlink) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.StaticSymlink) stateSinkObject.Save(1, &s.task) stateSinkObject.Save(2, &s.nsType) } func (s *namespaceSymlink) afterLoad(context.Context) {} // +checklocksignore func (s *namespaceSymlink) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.StaticSymlink) stateSourceObject.Load(1, &s.task) stateSourceObject.Load(2, &s.nsType) } func (i *namespaceInode) StateTypeName() string { return "pkg/sentry/fsimpl/proc.namespaceInode" } func (i *namespaceInode) StateFields() []string { return []string{ "implStatFS", "InodeAttrs", "InodeNoopRefCount", "InodeNotAnonymous", "InodeNotDirectory", "InodeNotSymlink", "InodeWatches", "locks", } } func (i *namespaceInode) beforeSave() {} // +checklocksignore 
func (i *namespaceInode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.implStatFS) stateSinkObject.Save(1, &i.InodeAttrs) stateSinkObject.Save(2, &i.InodeNoopRefCount) stateSinkObject.Save(3, &i.InodeNotAnonymous) stateSinkObject.Save(4, &i.InodeNotDirectory) stateSinkObject.Save(5, &i.InodeNotSymlink) stateSinkObject.Save(6, &i.InodeWatches) stateSinkObject.Save(7, &i.locks) } func (i *namespaceInode) afterLoad(context.Context) {} // +checklocksignore func (i *namespaceInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.implStatFS) stateSourceObject.Load(1, &i.InodeAttrs) stateSourceObject.Load(2, &i.InodeNoopRefCount) stateSourceObject.Load(3, &i.InodeNotAnonymous) stateSourceObject.Load(4, &i.InodeNotDirectory) stateSourceObject.Load(5, &i.InodeNotSymlink) stateSourceObject.Load(6, &i.InodeWatches) stateSourceObject.Load(7, &i.locks) } func (fd *namespaceFD) StateTypeName() string { return "pkg/sentry/fsimpl/proc.namespaceFD" } func (fd *namespaceFD) StateFields() []string { return []string{ "FileDescriptionDefaultImpl", "LockFD", "vfsfd", "inode", } } func (fd *namespaceFD) beforeSave() {} // +checklocksignore func (fd *namespaceFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(1, &fd.LockFD) stateSinkObject.Save(2, &fd.vfsfd) stateSinkObject.Save(3, &fd.inode) } func (fd *namespaceFD) afterLoad(context.Context) {} // +checklocksignore func (fd *namespaceFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(1, &fd.LockFD) stateSourceObject.Load(2, &fd.vfsfd) stateSourceObject.Load(3, &fd.inode) } func (d *taskCgroupData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.taskCgroupData" } func (d *taskCgroupData) StateFields() []string { return []string{ "dynamicBytesFileSetAttr", "task", } } func (d *taskCgroupData) beforeSave() {} // +checklocksignore func (d *taskCgroupData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.dynamicBytesFileSetAttr) stateSinkObject.Save(1, &d.task) } func (d *taskCgroupData) afterLoad(context.Context) {} // +checklocksignore func (d *taskCgroupData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.dynamicBytesFileSetAttr) stateSourceObject.Load(1, &d.task) } func (d *childrenData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.childrenData" } func (d *childrenData) StateFields() []string { return []string{ "DynamicBytesFile", "task", "pidns", } } func (d *childrenData) beforeSave() {} // +checklocksignore func (d *childrenData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.task) stateSinkObject.Save(2, &d.pidns) } func (d *childrenData) afterLoad(context.Context) {} // +checklocksignore func (d *childrenData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.task) stateSourceObject.Load(2, &d.pidns) } func (r *taskInodeRefs) StateTypeName() string { return "pkg/sentry/fsimpl/proc.taskInodeRefs" } func (r *taskInodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *taskInodeRefs) beforeSave() {} // +checklocksignore func (r *taskInodeRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() 
stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *taskInodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (n *ifinet6) StateTypeName() string { return "pkg/sentry/fsimpl/proc.ifinet6" } func (n *ifinet6) StateFields() []string { return []string{ "DynamicBytesFile", "stack", } } func (n *ifinet6) beforeSave() {} // +checklocksignore func (n *ifinet6) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.DynamicBytesFile) stateSinkObject.Save(1, &n.stack) } func (n *ifinet6) afterLoad(context.Context) {} // +checklocksignore func (n *ifinet6) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.DynamicBytesFile) stateSourceObject.Load(1, &n.stack) } func (n *netDevData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.netDevData" } func (n *netDevData) StateFields() []string { return []string{ "DynamicBytesFile", "stack", } } func (n *netDevData) beforeSave() {} // +checklocksignore func (n *netDevData) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.DynamicBytesFile) stateSinkObject.Save(1, &n.stack) } func (n *netDevData) afterLoad(context.Context) {} // +checklocksignore func (n *netDevData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.DynamicBytesFile) stateSourceObject.Load(1, &n.stack) } func (n *netUnixData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.netUnixData" } func (n *netUnixData) StateFields() []string { return []string{ "DynamicBytesFile", "kernel", } } func (n *netUnixData) beforeSave() {} // +checklocksignore func (n *netUnixData) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.DynamicBytesFile) stateSinkObject.Save(1, &n.kernel) } func (n *netUnixData) afterLoad(context.Context) {} // +checklocksignore func (n *netUnixData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.DynamicBytesFile) stateSourceObject.Load(1, &n.kernel) } func (d *netTCPData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.netTCPData" } func (d *netTCPData) StateFields() []string { return []string{ "DynamicBytesFile", "kernel", } } func (d *netTCPData) beforeSave() {} // +checklocksignore func (d *netTCPData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.kernel) } func (d *netTCPData) afterLoad(context.Context) {} // +checklocksignore func (d *netTCPData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.kernel) } func (d *netTCP6Data) StateTypeName() string { return "pkg/sentry/fsimpl/proc.netTCP6Data" } func (d *netTCP6Data) StateFields() []string { return []string{ "DynamicBytesFile", "kernel", } } func (d *netTCP6Data) beforeSave() {} // +checklocksignore func (d *netTCP6Data) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.kernel) } func (d *netTCP6Data) afterLoad(context.Context) {} // +checklocksignore func (d *netTCP6Data) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.kernel) } func (d *netUDPData) StateTypeName() string { return 
"pkg/sentry/fsimpl/proc.netUDPData" } func (d *netUDPData) StateFields() []string { return []string{ "DynamicBytesFile", "kernel", } } func (d *netUDPData) beforeSave() {} // +checklocksignore func (d *netUDPData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.kernel) } func (d *netUDPData) afterLoad(context.Context) {} // +checklocksignore func (d *netUDPData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.kernel) } func (d *netSnmpData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.netSnmpData" } func (d *netSnmpData) StateFields() []string { return []string{ "DynamicBytesFile", "stack", } } func (d *netSnmpData) beforeSave() {} // +checklocksignore func (d *netSnmpData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.stack) } func (d *netSnmpData) afterLoad(context.Context) {} // +checklocksignore func (d *netSnmpData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.stack) } func (s *snmpLine) StateTypeName() string { return "pkg/sentry/fsimpl/proc.snmpLine" } func (s *snmpLine) StateFields() []string { return []string{ "prefix", "header", } } func (s *snmpLine) beforeSave() {} // +checklocksignore func (s *snmpLine) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.prefix) stateSinkObject.Save(1, &s.header) } func (s *snmpLine) afterLoad(context.Context) {} // +checklocksignore func (s *snmpLine) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.prefix) stateSourceObject.Load(1, &s.header) } func (d *netRouteData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.netRouteData" } func (d *netRouteData) StateFields() []string { return []string{ "DynamicBytesFile", "stack", } } func (d *netRouteData) beforeSave() {} // +checklocksignore func (d *netRouteData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.stack) } func (d *netRouteData) afterLoad(context.Context) {} // +checklocksignore func (d *netRouteData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.stack) } func (d *netStatData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.netStatData" } func (d *netStatData) StateFields() []string { return []string{ "DynamicBytesFile", "stack", } } func (d *netStatData) beforeSave() {} // +checklocksignore func (d *netStatData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.stack) } func (d *netStatData) afterLoad(context.Context) {} // +checklocksignore func (d *netStatData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.stack) } func (i *tasksInode) StateTypeName() string { return "pkg/sentry/fsimpl/proc.tasksInode" } func (i *tasksInode) StateFields() []string { return []string{ "implStatFS", "InodeAlwaysValid", "InodeAttrs", "InodeDirectoryNoNewChildren", "InodeNotAnonymous", "InodeNotSymlink", "InodeTemporary", "InodeWatches", "OrderedChildren", "tasksInodeRefs", "locks", "fs", "pidns", "fakeCgroupControllers", 
} } func (i *tasksInode) beforeSave() {} // +checklocksignore func (i *tasksInode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.implStatFS) stateSinkObject.Save(1, &i.InodeAlwaysValid) stateSinkObject.Save(2, &i.InodeAttrs) stateSinkObject.Save(3, &i.InodeDirectoryNoNewChildren) stateSinkObject.Save(4, &i.InodeNotAnonymous) stateSinkObject.Save(5, &i.InodeNotSymlink) stateSinkObject.Save(6, &i.InodeTemporary) stateSinkObject.Save(7, &i.InodeWatches) stateSinkObject.Save(8, &i.OrderedChildren) stateSinkObject.Save(9, &i.tasksInodeRefs) stateSinkObject.Save(10, &i.locks) stateSinkObject.Save(11, &i.fs) stateSinkObject.Save(12, &i.pidns) stateSinkObject.Save(13, &i.fakeCgroupControllers) } func (i *tasksInode) afterLoad(context.Context) {} // +checklocksignore func (i *tasksInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.implStatFS) stateSourceObject.Load(1, &i.InodeAlwaysValid) stateSourceObject.Load(2, &i.InodeAttrs) stateSourceObject.Load(3, &i.InodeDirectoryNoNewChildren) stateSourceObject.Load(4, &i.InodeNotAnonymous) stateSourceObject.Load(5, &i.InodeNotSymlink) stateSourceObject.Load(6, &i.InodeTemporary) stateSourceObject.Load(7, &i.InodeWatches) stateSourceObject.Load(8, &i.OrderedChildren) stateSourceObject.Load(9, &i.tasksInodeRefs) stateSourceObject.Load(10, &i.locks) stateSourceObject.Load(11, &i.fs) stateSourceObject.Load(12, &i.pidns) stateSourceObject.Load(13, &i.fakeCgroupControllers) } func (s *staticFileSetStat) StateTypeName() string { return "pkg/sentry/fsimpl/proc.staticFileSetStat" } func (s *staticFileSetStat) StateFields() []string { return []string{ "dynamicBytesFileSetAttr", "StaticData", } } func (s *staticFileSetStat) beforeSave() {} // +checklocksignore func (s *staticFileSetStat) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.dynamicBytesFileSetAttr) stateSinkObject.Save(1, &s.StaticData) } func (s *staticFileSetStat) afterLoad(context.Context) {} // +checklocksignore func (s *staticFileSetStat) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.dynamicBytesFileSetAttr) stateSourceObject.Load(1, &s.StaticData) } func (s *selfSymlink) StateTypeName() string { return "pkg/sentry/fsimpl/proc.selfSymlink" } func (s *selfSymlink) StateFields() []string { return []string{ "implStatFS", "InodeAttrs", "InodeNoopRefCount", "InodeNotAnonymous", "InodeSymlink", "InodeWatches", "pidns", } } func (s *selfSymlink) beforeSave() {} // +checklocksignore func (s *selfSymlink) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.implStatFS) stateSinkObject.Save(1, &s.InodeAttrs) stateSinkObject.Save(2, &s.InodeNoopRefCount) stateSinkObject.Save(3, &s.InodeNotAnonymous) stateSinkObject.Save(4, &s.InodeSymlink) stateSinkObject.Save(5, &s.InodeWatches) stateSinkObject.Save(6, &s.pidns) } func (s *selfSymlink) afterLoad(context.Context) {} // +checklocksignore func (s *selfSymlink) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.implStatFS) stateSourceObject.Load(1, &s.InodeAttrs) stateSourceObject.Load(2, &s.InodeNoopRefCount) stateSourceObject.Load(3, &s.InodeNotAnonymous) stateSourceObject.Load(4, &s.InodeSymlink) stateSourceObject.Load(5, &s.InodeWatches) stateSourceObject.Load(6, &s.pidns) } func (s *threadSelfSymlink) StateTypeName() string { return "pkg/sentry/fsimpl/proc.threadSelfSymlink" } func (s *threadSelfSymlink) StateFields() 
[]string { return []string{ "implStatFS", "InodeAttrs", "InodeNoopRefCount", "InodeNotAnonymous", "InodeSymlink", "InodeWatches", "pidns", } } func (s *threadSelfSymlink) beforeSave() {} // +checklocksignore func (s *threadSelfSymlink) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.implStatFS) stateSinkObject.Save(1, &s.InodeAttrs) stateSinkObject.Save(2, &s.InodeNoopRefCount) stateSinkObject.Save(3, &s.InodeNotAnonymous) stateSinkObject.Save(4, &s.InodeSymlink) stateSinkObject.Save(5, &s.InodeWatches) stateSinkObject.Save(6, &s.pidns) } func (s *threadSelfSymlink) afterLoad(context.Context) {} // +checklocksignore func (s *threadSelfSymlink) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.implStatFS) stateSourceObject.Load(1, &s.InodeAttrs) stateSourceObject.Load(2, &s.InodeNoopRefCount) stateSourceObject.Load(3, &s.InodeNotAnonymous) stateSourceObject.Load(4, &s.InodeSymlink) stateSourceObject.Load(5, &s.InodeWatches) stateSourceObject.Load(6, &s.pidns) } func (d *dynamicBytesFileSetAttr) StateTypeName() string { return "pkg/sentry/fsimpl/proc.dynamicBytesFileSetAttr" } func (d *dynamicBytesFileSetAttr) StateFields() []string { return []string{ "DynamicBytesFile", } } func (d *dynamicBytesFileSetAttr) beforeSave() {} // +checklocksignore func (d *dynamicBytesFileSetAttr) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) } func (d *dynamicBytesFileSetAttr) afterLoad(context.Context) {} // +checklocksignore func (d *dynamicBytesFileSetAttr) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) } func (c *cpuStats) StateTypeName() string { return "pkg/sentry/fsimpl/proc.cpuStats" } func (c *cpuStats) StateFields() []string { return []string{ "user", "nice", "system", "idle", "ioWait", "irq", "softirq", "steal", "guest", "guestNice", } } func (c *cpuStats) beforeSave() {} // +checklocksignore func (c *cpuStats) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.user) stateSinkObject.Save(1, &c.nice) stateSinkObject.Save(2, &c.system) stateSinkObject.Save(3, &c.idle) stateSinkObject.Save(4, &c.ioWait) stateSinkObject.Save(5, &c.irq) stateSinkObject.Save(6, &c.softirq) stateSinkObject.Save(7, &c.steal) stateSinkObject.Save(8, &c.guest) stateSinkObject.Save(9, &c.guestNice) } func (c *cpuStats) afterLoad(context.Context) {} // +checklocksignore func (c *cpuStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.user) stateSourceObject.Load(1, &c.nice) stateSourceObject.Load(2, &c.system) stateSourceObject.Load(3, &c.idle) stateSourceObject.Load(4, &c.ioWait) stateSourceObject.Load(5, &c.irq) stateSourceObject.Load(6, &c.softirq) stateSourceObject.Load(7, &c.steal) stateSourceObject.Load(8, &c.guest) stateSourceObject.Load(9, &c.guestNice) } func (s *statData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.statData" } func (s *statData) StateFields() []string { return []string{ "dynamicBytesFileSetAttr", } } func (s *statData) beforeSave() {} // +checklocksignore func (s *statData) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.dynamicBytesFileSetAttr) } func (s *statData) afterLoad(context.Context) {} // +checklocksignore func (s *statData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.dynamicBytesFileSetAttr) } func (l *loadavgData) 
StateTypeName() string { return "pkg/sentry/fsimpl/proc.loadavgData" } func (l *loadavgData) StateFields() []string { return []string{ "dynamicBytesFileSetAttr", } } func (l *loadavgData) beforeSave() {} // +checklocksignore func (l *loadavgData) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.dynamicBytesFileSetAttr) } func (l *loadavgData) afterLoad(context.Context) {} // +checklocksignore func (l *loadavgData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.dynamicBytesFileSetAttr) } func (m *meminfoData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.meminfoData" } func (m *meminfoData) StateFields() []string { return []string{ "dynamicBytesFileSetAttr", } } func (m *meminfoData) beforeSave() {} // +checklocksignore func (m *meminfoData) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.dynamicBytesFileSetAttr) } func (m *meminfoData) afterLoad(context.Context) {} // +checklocksignore func (m *meminfoData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.dynamicBytesFileSetAttr) } func (u *uptimeData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.uptimeData" } func (u *uptimeData) StateFields() []string { return []string{ "dynamicBytesFileSetAttr", } } func (u *uptimeData) beforeSave() {} // +checklocksignore func (u *uptimeData) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.dynamicBytesFileSetAttr) } func (u *uptimeData) afterLoad(context.Context) {} // +checklocksignore func (u *uptimeData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.dynamicBytesFileSetAttr) } func (v *versionData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.versionData" } func (v *versionData) StateFields() []string { return []string{ "dynamicBytesFileSetAttr", } } func (v *versionData) beforeSave() {} // +checklocksignore func (v *versionData) StateSave(stateSinkObject state.Sink) { v.beforeSave() stateSinkObject.Save(0, &v.dynamicBytesFileSetAttr) } func (v *versionData) afterLoad(context.Context) {} // +checklocksignore func (v *versionData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &v.dynamicBytesFileSetAttr) } func (d *filesystemsData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.filesystemsData" } func (d *filesystemsData) StateFields() []string { return []string{ "DynamicBytesFile", } } func (d *filesystemsData) beforeSave() {} // +checklocksignore func (d *filesystemsData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) } func (d *filesystemsData) afterLoad(context.Context) {} // +checklocksignore func (d *filesystemsData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) } func (c *cgroupsData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.cgroupsData" } func (c *cgroupsData) StateFields() []string { return []string{ "dynamicBytesFileSetAttr", } } func (c *cgroupsData) beforeSave() {} // +checklocksignore func (c *cgroupsData) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.dynamicBytesFileSetAttr) } func (c *cgroupsData) afterLoad(context.Context) {} // +checklocksignore func (c *cgroupsData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.dynamicBytesFileSetAttr) } func (c 
*cmdLineData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.cmdLineData" } func (c *cmdLineData) StateFields() []string { return []string{ "dynamicBytesFileSetAttr", } } func (c *cmdLineData) beforeSave() {} // +checklocksignore func (c *cmdLineData) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.dynamicBytesFileSetAttr) } func (c *cmdLineData) afterLoad(context.Context) {} // +checklocksignore func (c *cmdLineData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.dynamicBytesFileSetAttr) } func (s *sentryMeminfoData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.sentryMeminfoData" } func (s *sentryMeminfoData) StateFields() []string { return []string{ "dynamicBytesFileSetAttr", } } func (s *sentryMeminfoData) beforeSave() {} // +checklocksignore func (s *sentryMeminfoData) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.dynamicBytesFileSetAttr) } func (s *sentryMeminfoData) afterLoad(context.Context) {} // +checklocksignore func (s *sentryMeminfoData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.dynamicBytesFileSetAttr) } func (r *tasksInodeRefs) StateTypeName() string { return "pkg/sentry/fsimpl/proc.tasksInodeRefs" } func (r *tasksInodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *tasksInodeRefs) beforeSave() {} // +checklocksignore func (r *tasksInodeRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *tasksInodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (t *tcpMemDir) StateTypeName() string { return "pkg/sentry/fsimpl/proc.tcpMemDir" } func (t *tcpMemDir) StateFields() []string { return nil } func (d *mmapMinAddrData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.mmapMinAddrData" } func (d *mmapMinAddrData) StateFields() []string { return []string{ "DynamicBytesFile", "k", } } func (d *mmapMinAddrData) beforeSave() {} // +checklocksignore func (d *mmapMinAddrData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.k) } func (d *mmapMinAddrData) afterLoad(context.Context) {} // +checklocksignore func (d *mmapMinAddrData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.k) } func (h *hostnameData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.hostnameData" } func (h *hostnameData) StateFields() []string { return []string{ "DynamicBytesFile", } } func (h *hostnameData) beforeSave() {} // +checklocksignore func (h *hostnameData) StateSave(stateSinkObject state.Sink) { h.beforeSave() stateSinkObject.Save(0, &h.DynamicBytesFile) } func (h *hostnameData) afterLoad(context.Context) {} // +checklocksignore func (h *hostnameData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &h.DynamicBytesFile) } func (d *tcpSackData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.tcpSackData" } func (d *tcpSackData) StateFields() []string { return []string{ "DynamicBytesFile", "stack", "enabled", } } func (d *tcpSackData) beforeSave() {} // +checklocksignore func (d *tcpSackData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, 
&d.DynamicBytesFile) stateSinkObject.Save(1, &d.stack) stateSinkObject.Save(2, &d.enabled) } func (d *tcpSackData) afterLoad(context.Context) {} // +checklocksignore func (d *tcpSackData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.LoadWait(1, &d.stack) stateSourceObject.Load(2, &d.enabled) } func (d *tcpRecoveryData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.tcpRecoveryData" } func (d *tcpRecoveryData) StateFields() []string { return []string{ "DynamicBytesFile", "stack", } } func (d *tcpRecoveryData) beforeSave() {} // +checklocksignore func (d *tcpRecoveryData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.stack) } func (d *tcpRecoveryData) afterLoad(context.Context) {} // +checklocksignore func (d *tcpRecoveryData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.LoadWait(1, &d.stack) } func (d *tcpMemData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.tcpMemData" } func (d *tcpMemData) StateFields() []string { return []string{ "DynamicBytesFile", "dir", "stack", } } func (d *tcpMemData) beforeSave() {} // +checklocksignore func (d *tcpMemData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.DynamicBytesFile) stateSinkObject.Save(1, &d.dir) stateSinkObject.Save(2, &d.stack) } func (d *tcpMemData) afterLoad(context.Context) {} // +checklocksignore func (d *tcpMemData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.DynamicBytesFile) stateSourceObject.Load(1, &d.dir) stateSourceObject.LoadWait(2, &d.stack) } func (ipf *ipForwarding) StateTypeName() string { return "pkg/sentry/fsimpl/proc.ipForwarding" } func (ipf *ipForwarding) StateFields() []string { return []string{ "DynamicBytesFile", "stack", "enabled", } } func (ipf *ipForwarding) beforeSave() {} // +checklocksignore func (ipf *ipForwarding) StateSave(stateSinkObject state.Sink) { ipf.beforeSave() stateSinkObject.Save(0, &ipf.DynamicBytesFile) stateSinkObject.Save(1, &ipf.stack) stateSinkObject.Save(2, &ipf.enabled) } func (ipf *ipForwarding) afterLoad(context.Context) {} // +checklocksignore func (ipf *ipForwarding) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ipf.DynamicBytesFile) stateSourceObject.LoadWait(1, &ipf.stack) stateSourceObject.Load(2, &ipf.enabled) } func (pr *portRange) StateTypeName() string { return "pkg/sentry/fsimpl/proc.portRange" } func (pr *portRange) StateFields() []string { return []string{ "DynamicBytesFile", "stack", "start", "end", } } func (pr *portRange) beforeSave() {} // +checklocksignore func (pr *portRange) StateSave(stateSinkObject state.Sink) { pr.beforeSave() stateSinkObject.Save(0, &pr.DynamicBytesFile) stateSinkObject.Save(1, &pr.stack) stateSinkObject.Save(2, &pr.start) stateSinkObject.Save(3, &pr.end) } func (pr *portRange) afterLoad(context.Context) {} // +checklocksignore func (pr *portRange) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &pr.DynamicBytesFile) stateSourceObject.LoadWait(1, &pr.stack) stateSourceObject.Load(2, &pr.start) stateSourceObject.Load(3, &pr.end) } func (f *atomicInt32File) StateTypeName() string { return "pkg/sentry/fsimpl/proc.atomicInt32File" } func (f *atomicInt32File) StateFields() []string { return []string{ 
"DynamicBytesFile", "val", "min", "max", } } func (f *atomicInt32File) beforeSave() {} // +checklocksignore func (f *atomicInt32File) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.DynamicBytesFile) stateSinkObject.Save(1, &f.val) stateSinkObject.Save(2, &f.min) stateSinkObject.Save(3, &f.max) } func (f *atomicInt32File) afterLoad(context.Context) {} // +checklocksignore func (f *atomicInt32File) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.DynamicBytesFile) stateSourceObject.Load(1, &f.val) stateSourceObject.Load(2, &f.min) stateSourceObject.Load(3, &f.max) } func (s *yamaPtraceScope) StateTypeName() string { return "pkg/sentry/fsimpl/proc.yamaPtraceScope" } func (s *yamaPtraceScope) StateFields() []string { return []string{ "DynamicBytesFile", "level", } } func (s *yamaPtraceScope) beforeSave() {} // +checklocksignore func (s *yamaPtraceScope) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.DynamicBytesFile) stateSinkObject.Save(1, &s.level) } func (s *yamaPtraceScope) afterLoad(context.Context) {} // +checklocksignore func (s *yamaPtraceScope) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.DynamicBytesFile) stateSourceObject.Load(1, &s.level) } func init() { state.Register((*fdDirInodeRefs)(nil)) state.Register((*fdInfoDirInodeRefs)(nil)) state.Register((*FilesystemType)(nil)) state.Register((*filesystem)(nil)) state.Register((*staticFile)(nil)) state.Register((*InternalData)(nil)) state.Register((*implStatFS)(nil)) state.Register((*subtasksInode)(nil)) state.Register((*subtasksFD)(nil)) state.Register((*subtasksInodeRefs)(nil)) state.Register((*taskInode)(nil)) state.Register((*taskOwnedInode)(nil)) state.Register((*fdDir)(nil)) state.Register((*fdDirInode)(nil)) state.Register((*fdSymlink)(nil)) state.Register((*fdInfoDirInode)(nil)) state.Register((*fdInfoData)(nil)) state.Register((*auxvData)(nil)) state.Register((*metadataData)(nil)) state.Register((*commInode)(nil)) state.Register((*commData)(nil)) state.Register((*idMapData)(nil)) state.Register((*memInode)(nil)) state.Register((*memFD)(nil)) state.Register((*limitsData)(nil)) state.Register((*mapsData)(nil)) state.Register((*smapsData)(nil)) state.Register((*taskStatData)(nil)) state.Register((*statmData)(nil)) state.Register((*statusInode)(nil)) state.Register((*statusFD)(nil)) state.Register((*statusFDLowerBase)(nil)) state.Register((*ioData)(nil)) state.Register((*oomScoreAdj)(nil)) state.Register((*exeSymlink)(nil)) state.Register((*cwdSymlink)(nil)) state.Register((*rootSymlink)(nil)) state.Register((*mountInfoData)(nil)) state.Register((*mountsData)(nil)) state.Register((*namespaceSymlink)(nil)) state.Register((*namespaceInode)(nil)) state.Register((*namespaceFD)(nil)) state.Register((*taskCgroupData)(nil)) state.Register((*childrenData)(nil)) state.Register((*taskInodeRefs)(nil)) state.Register((*ifinet6)(nil)) state.Register((*netDevData)(nil)) state.Register((*netUnixData)(nil)) state.Register((*netTCPData)(nil)) state.Register((*netTCP6Data)(nil)) state.Register((*netUDPData)(nil)) state.Register((*netSnmpData)(nil)) state.Register((*snmpLine)(nil)) state.Register((*netRouteData)(nil)) state.Register((*netStatData)(nil)) state.Register((*tasksInode)(nil)) state.Register((*staticFileSetStat)(nil)) state.Register((*selfSymlink)(nil)) state.Register((*threadSelfSymlink)(nil)) state.Register((*dynamicBytesFileSetAttr)(nil)) state.Register((*cpuStats)(nil)) 
state.Register((*statData)(nil)) state.Register((*loadavgData)(nil)) state.Register((*meminfoData)(nil)) state.Register((*uptimeData)(nil)) state.Register((*versionData)(nil)) state.Register((*filesystemsData)(nil)) state.Register((*cgroupsData)(nil)) state.Register((*cmdLineData)(nil)) state.Register((*sentryMeminfoData)(nil)) state.Register((*tasksInodeRefs)(nil)) state.Register((*tcpMemDir)(nil)) state.Register((*mmapMinAddrData)(nil)) state.Register((*hostnameData)(nil)) state.Register((*tcpSackData)(nil)) state.Register((*tcpRecoveryData)(nil)) state.Register((*tcpMemData)(nil)) state.Register((*ipForwarding)(nil)) state.Register((*portRange)(nil)) state.Register((*atomicInt32File)(nil)) state.Register((*yamaPtraceScope)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/subtasks.go000066400000000000000000000134101465435605700253720ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "sort" "strconv" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // subtasksInode represents the inode for /proc/[pid]/task/ directory. // // +stateify savable type subtasksInode struct { implStatFS kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotAnonymous kernfs.InodeNotSymlink kernfs.InodeTemporary kernfs.InodeWatches kernfs.OrderedChildren subtasksInodeRefs locks vfs.FileLocks fs *filesystem task *kernel.Task pidns *kernel.PIDNamespace cgroupControllers map[string]string } var _ kernfs.Inode = (*subtasksInode)(nil) func (fs *filesystem) newSubtasks(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) kernfs.Inode { subInode := &subtasksInode{ fs: fs, task: task, pidns: pidns, cgroupControllers: cgroupControllers, } // Note: credentials are overridden by taskOwnedInode. subInode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) subInode.InitRefs() inode := &taskOwnedInode{Inode: subInode, owner: task} return inode } // Lookup implements kernfs.inodeDirectory.Lookup. func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { tid, err := strconv.ParseUint(name, 10, 32) if err != nil { return nil, linuxerr.ENOENT } subTask := i.pidns.TaskWithID(kernel.ThreadID(tid)) if subTask == nil { return nil, linuxerr.ENOENT } if subTask.ThreadGroup() != i.task.ThreadGroup() { return nil, linuxerr.ENOENT } return i.fs.newTaskInode(ctx, subTask, i.pidns, false, i.cgroupControllers) } // IterDirents implements kernfs.inodeDirectory.IterDirents. 
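// The offset argument is the absolute getdents offset, while relOffset is the
// offset into this directory's dynamic entries; kernfs itself serializes the
// leading "." and ".." entries (see the fdDir.IterDirents comment in
// task_fds.go). Each dirent emitted here sets NextOff to offset+1, and the
// returned offset advances by one per serialized entry.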
func (i *subtasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { tasks := i.task.ThreadGroup().MemberIDs(i.pidns) if len(tasks) == 0 { return offset, linuxerr.ENOENT } if relOffset >= int64(len(tasks)) { return offset, nil } tids := make([]int, 0, len(tasks)) for _, tid := range tasks { tids = append(tids, int(tid)) } sort.Ints(tids) for _, tid := range tids[relOffset:] { dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(tid), 10), Type: linux.DT_DIR, Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { return offset, err } offset++ } return offset, nil } // +stateify savable type subtasksFD struct { kernfs.GenericDirectoryFD task *kernel.Task } func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { if fd.task.ExitState() >= kernel.TaskExitZombie { return linuxerr.ENOENT } return fd.GenericDirectoryFD.IterDirents(ctx, cb) } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { return 0, linuxerr.ENOENT } return fd.GenericDirectoryFD.Seek(ctx, offset, whence) } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { return linux.Statx{}, linuxerr.ENOENT } return fd.GenericDirectoryFD.Stat(ctx, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { if fd.task.ExitState() >= kernel.TaskExitZombie { return linuxerr.ENOENT } return fd.GenericDirectoryFD.SetStat(ctx, opts) } // Open implements kernfs.Inode.Open. func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &subtasksFD{task: i.task} if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }); err != nil { return nil, err } if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return fd.VFSFileDescription(), nil } // Stat implements kernfs.Inode.Stat. func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts) if err != nil { return linux.Statx{}, err } if opts.Mask&linux.STATX_NLINK != 0 { stat.Nlink += uint32(i.task.ThreadGroup().Count()) } return stat, nil } // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // DecRef implements kernfs.Inode.DecRef. func (i *subtasksInode) DecRef(ctx context.Context) { i.subtasksInodeRefs.DecRef(func() { i.Destroy(ctx) }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go000066400000000000000000000103051465435605700275670ustar00rootroot00000000000000package proc import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). 
This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const subtasksInodeenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var subtasksInodeobj *subtasksInode // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type subtasksInodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *subtasksInodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *subtasksInodeRefs) RefType() string { return fmt.Sprintf("%T", subtasksInodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *subtasksInodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *subtasksInodeRefs) LogRefs() bool { return subtasksInodeenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *subtasksInodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *subtasksInodeRefs) IncRef() { v := r.refCount.Add(1) if subtasksInodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *subtasksInodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if subtasksInodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. 
In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *subtasksInodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if subtasksInodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *subtasksInodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/task.go000066400000000000000000000242621465435605700245040ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "bytes" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // taskInode represents the inode for /proc/PID/ directory. 
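// Its static entries are created by newTaskInode below: writable control files
// such as comm, oom_score_adj, uid_map and gid_map use mode 0644, while
// read-only views such as maps, stat and status use 0444 (0400 for io and mem).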
// // +stateify savable type taskInode struct { implStatFS kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotAnonymous kernfs.InodeNotSymlink kernfs.InodeTemporary kernfs.InodeWatches kernfs.OrderedChildren taskInodeRefs locks vfs.FileLocks task *kernel.Task } var _ kernfs.Inode = (*taskInode)(nil) func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, fakeCgroupControllers map[string]string) (kernfs.Inode, error) { if task.ExitState() == kernel.TaskExitDead { return nil, linuxerr.ESRCH } contents := map[string]kernfs.Inode{ "auxv": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &auxvData{task: task}), "cmdline": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &metadataData{task: task, metaType: Cmdline}), "comm": fs.newComm(ctx, task, fs.NextIno(), 0644), "cwd": fs.newCwdSymlink(ctx, task, fs.NextIno()), "environ": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &metadataData{task: task, metaType: Environ}), "exe": fs.newExeSymlink(ctx, task, fs.NextIno()), "fd": fs.newFDDirInode(ctx, task), "fdinfo": fs.newFDInfoDirInode(ctx, task), "gid_map": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), "io": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), "limits": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &limitsData{task: task}), "maps": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mapsData{task: task}), "mem": fs.newMemInode(ctx, task, fs.NextIno(), 0400), "mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{fs: fs, task: task}), "mounts": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{fs: fs, task: task}), "net": fs.newTaskNetDir(ctx, task), "ns": fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0511, map[string]kernfs.Inode{ "net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNET), "mnt": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNS), "pid": fs.newPIDNamespaceSymlink(ctx, task, fs.NextIno()), "user": fs.newFakeNamespaceSymlink(ctx, task, fs.NextIno(), "user"), "ipc": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWIPC), "uts": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUTS), }), "oom_score": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, newStaticFile("0\n")), "oom_score_adj": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), "root": fs.newRootSymlink(ctx, task, fs.NextIno()), "smaps": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &smapsData{task: task}), "stat": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), "statm": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &statmData{task: task}), "status": fs.newStatusInode(ctx, task, pidns, fs.NextIno(), 0444), "uid_map": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { contents["task"] = fs.newSubtasks(ctx, task, pidns, fakeCgroupControllers) } else { contents["children"] = fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &childrenData{task: task, pidns: pidns}) } if len(fakeCgroupControllers) > 0 { contents["cgroup"] = fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, newFakeCgroupData(fakeCgroupControllers)) } else { contents["cgroup"] = fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &taskCgroupData{task: task}) } taskInode := &taskInode{task: task} // Note: 
credentials are overridden by taskOwnedInode. taskInode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) taskInode.InitRefs() inode := &taskOwnedInode{Inode: taskInode, owner: task} taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) links := taskInode.OrderedChildren.Populate(contents) taskInode.IncLinks(links) return inode, nil } // Valid implements kernfs.Inode.Valid. This inode remains valid as long // as the task is still running. When it's dead, another task with the same // PID could replace it. func (i *taskInode) Valid(ctx context.Context, parent *kernfs.Dentry, name string) bool { return i.task.ExitState() != kernel.TaskExitDead } // Open implements kernfs.Inode.Open. func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // DecRef implements kernfs.Inode.DecRef. func (i *taskInode) DecRef(ctx context.Context) { i.taskInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // taskOwnedInode implements kernfs.Inode and overrides inode owner with task // effective user and group. // // +stateify savable type taskOwnedInode struct { kernfs.Inode // owner is the task that owns this inode. owner *kernel.Task } var _ kernfs.Inode = (*taskOwnedInode)(nil) func (fs *filesystem) newTaskOwnedInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) return &taskOwnedInode{Inode: inode, owner: task} } func (fs *filesystem) newTaskOwnedDir(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero} dir := kernfs.NewStaticDir(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts) return &taskOwnedInode{Inode: dir, owner: task} } func (i *taskOwnedInode) Valid(ctx context.Context, parent *kernfs.Dentry, name string) bool { return i.owner.ExitState() != kernel.TaskExitDead && i.Inode.Valid(ctx, parent, name) } // Stat implements kernfs.Inode.Stat. func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { stat, err := i.Inode.Stat(ctx, fs, opts) if err != nil { return linux.Statx{}, err } if opts.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 { uid, gid := i.getOwner(linux.FileMode(stat.Mode)) if opts.Mask&linux.STATX_UID != 0 { stat.UID = uint32(uid) } if opts.Mask&linux.STATX_GID != 0 { stat.GID = uint32(gid) } } return stat, nil } // CheckPermissions implements kernfs.Inode.CheckPermissions.
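// Access is checked against the owner computed by getOwner below (the task's
// effective credentials, downgraded to root when the task is not dumpable)
// rather than against the UID/GID stored in the inode's attributes.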
func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { mode := i.Mode() uid, gid := i.getOwner(mode) return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid) } func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) { // By default, set the task owner as the file owner. creds := i.owner.Credentials() uid := creds.EffectiveKUID gid := creds.EffectiveKGID // Linux doesn't apply dumpability adjustments to world readable/executable // directories so that applications can stat /proc/PID to determine the // effective UID of a process. See fs/proc/base.c:task_dump_owner. if mode.FileType() == linux.ModeDirectory && mode.Permissions() == 0555 { return uid, gid } // If the task is not dumpable, then root (in the namespace preferred) // owns the file. m := getMM(i.owner) if m == nil { return auth.RootKUID, auth.RootKGID } if m.Dumpability() != mm.UserDumpable { uid = auth.RootKUID if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() { uid = kuid } gid = auth.RootKGID if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() { gid = kgid } } return uid, gid } func newIO(t *kernel.Task, isThreadGroup bool) *ioData { if isThreadGroup { return &ioData{ioUsage: t.ThreadGroup()} } return &ioData{ioUsage: t} } // newFakeCgroupData creates an inode that shows fake cgroup // information passed in as mount options. From man 7 cgroups: "For // each cgroup hierarchy of which the process is a member, there is // one entry containing three colon-separated fields: // hierarchy-ID:controller-list:cgroup-path" // // TODO(b/182488796): Remove once all users adopt cgroupfs. func newFakeCgroupData(controllers map[string]string) dynamicInode { var buf bytes.Buffer // The hierarchy ids must be positive integers (for cgroup v1), but the // exact number does not matter, so long as they are unique. We can // just use a counter, but since linux sorts this file in descending // order, we must count down to preserve this behavior. i := len(controllers) for name, dir := range controllers { fmt.Fprintf(&buf, "%d:%s:%s\n", i, name, dir) i-- } return newStaticFile(buf.String()) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/task_fds.go000066400000000000000000000242211465435605700253330ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package proc import ( "bytes" "fmt" "sort" "strconv" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) { var ( file *vfs.FileDescription flags kernel.FDFlags ) t.WithMuLocked(func(t *kernel.Task) { if fdt := t.FDTable(); fdt != nil { file, flags = fdt.Get(fd) } }) return file, flags } func taskFDExists(ctx context.Context, fs *filesystem, t *kernel.Task, fd int32) bool { var exists bool t.WithMuLocked(func(task *kernel.Task) { if fdt := t.FDTable(); fdt != nil { exists = fdt.Exists(fd) } }) return exists } // +stateify savable type fdDir struct { locks vfs.FileLocks fs *filesystem task *kernel.Task // When produceSymlinks is set, dirents produces for the FDs are reported // as symlink. Otherwise, they are reported as regular files. produceSymlink bool } // IterDirents implements kernfs.inodeDirectory.IterDirents. func (i *fdDir) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { var fds []int32 i.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { fds = fdTable.GetFDs(ctx) } }) typ := uint8(linux.DT_REG) if i.produceSymlink { typ = linux.DT_LNK } // Find the appropriate starting point. idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(relOffset) }) if idx >= len(fds) { return offset, nil } for _, fd := range fds[idx:] { dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(fd), 10), Type: typ, Ino: i.fs.NextIno(), NextOff: int64(fd) + 3, } if err := cb.Handle(dirent); err != nil { // Getdents should iterate correctly despite mutation // of fds, so we return the next fd to serialize plus // 2 (which accounts for the "." and ".." tracked by // kernfs) as the offset. return int64(fd) + 2, err } } // We serialized them all. Next offset should be higher than last // serialized fd. return int64(fds[len(fds)-1]) + 3, nil } // fdDirInode represents the inode for /proc/[pid]/fd directory. // // +stateify savable type fdDirInode struct { fdDir fdDirInodeRefs implStatFS kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotAnonymous kernfs.InodeNotSymlink kernfs.InodeTemporary kernfs.InodeWatches kernfs.OrderedChildren } var _ kernfs.Inode = (*fdDirInode)(nil) func (fs *filesystem) newFDDirInode(ctx context.Context, task *kernel.Task) kernfs.Inode { inode := &fdDirInode{ fdDir: fdDir{ fs: fs, task: task, produceSymlink: true, }, } inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) inode.InitRefs() inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) return inode } // IterDirents implements kernfs.inodeDirectory.IterDirents. func (i *fdDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { return i.fdDir.IterDirents(ctx, mnt, cb, offset, relOffset) } // Lookup implements kernfs.inodeDirectory.Lookup. 
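// The offset scheme used by fdDir.IterDirents above reserves offsets 0 and 1
// for "." and ".." (tracked by kernfs), so file descriptor n is exposed at
// directory offset n+2 and the next offset after serializing it is n+3. A
// minimal illustrative sketch of that mapping (the function name is made up
// for this example and is not part of the package):
func exampleFDToOffset(fd int32) (entryOffset, nextOffset int64) {
	return int64(fd) + 2, int64(fd) + 3
}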
func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { return nil, linuxerr.ENOENT } fd := int32(fdInt) if !taskFDExists(ctx, i.fs, i.task, fd) { return nil, linuxerr.ENOENT } return i.fs.newFDSymlink(ctx, i.task, fd, i.fs.NextIno()), nil } // Open implements kernfs.Inode.Open. func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // CheckPermissions implements kernfs.Inode.CheckPermissions. // // This is to match Linux, which uses a special permission handler to guarantee // that a process can still access /proc/self/fd after it has executed // setuid. See fs/proc/fd.c:proc_fd_permission. func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { err := i.InodeAttrs.CheckPermissions(ctx, creds, ats) if err == nil { // Access granted, no extra check needed. return nil } if t := kernel.TaskFromContext(ctx); t != nil { // Allow access if the task trying to access it is in the thread group // corresponding to this directory. if i.task.ThreadGroup() == t.ThreadGroup() { // Access granted (overridden). return nil } } return err } // DecRef implements kernfs.Inode.DecRef. func (i *fdDirInode) DecRef(ctx context.Context) { i.fdDirInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file. // // +stateify savable type fdSymlink struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeSymlink kernfs.InodeWatches fs *filesystem task *kernel.Task fd int32 } var _ kernfs.Inode = (*fdSymlink)(nil) func (fs *filesystem) newFDSymlink(ctx context.Context, task *kernel.Task, fd int32, ino uint64) kernfs.Inode { inode := &fdSymlink{ fs: fs, task: task, fd: fd, } inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { return "", linuxerr.ENOENT } defer s.fs.SafeDecRefFD(ctx, file) root := vfs.RootFromContext(ctx) defer s.fs.SafeDecRef(ctx, root) // Note: it's safe to reenter kernfs from Readlink if needed to resolve path. return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) } func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { return vfs.VirtualDentry{}, "", linuxerr.ENOENT } defer s.fs.SafeDecRefFD(ctx, file) vd := file.VirtualDentry() vd.IncRef() return vd, "", nil } // Valid implements kernfs.Inode.Valid. func (s *fdSymlink) Valid(ctx context.Context, parent *kernfs.Dentry, name string) bool { return taskFDExists(ctx, s.fs, s.task, s.fd) } // fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory. 
// // +stateify savable type fdInfoDirInode struct { fdDir fdInfoDirInodeRefs implStatFS kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotAnonymous kernfs.InodeNotSymlink kernfs.InodeTemporary kernfs.InodeWatches kernfs.OrderedChildren } var _ kernfs.Inode = (*fdInfoDirInode)(nil) func (fs *filesystem) newFDInfoDirInode(ctx context.Context, task *kernel.Task) kernfs.Inode { inode := &fdInfoDirInode{ fdDir: fdDir{ fs: fs, task: task, }, } inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) inode.InitRefs() inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) return inode } // Lookup implements kernfs.inodeDirectory.Lookup. func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { return nil, linuxerr.ENOENT } fd := int32(fdInt) if !taskFDExists(ctx, i.fs, i.task, fd) { return nil, linuxerr.ENOENT } data := &fdInfoData{ fs: i.fs, task: i.task, fd: fd, } return i.fs.newTaskOwnedInode(ctx, i.task, i.fs.NextIno(), 0444, data), nil } // IterDirents implements Inode.IterDirents. func (i *fdInfoDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { return i.fdDir.IterDirents(ctx, mnt, cb, offset, relOffset) } // Open implements kernfs.Inode.Open. func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // DecRef implements kernfs.Inode.DecRef. func (i *fdInfoDirInode) DecRef(ctx context.Context) { i.fdInfoDirInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. // // +stateify savable type fdInfoData struct { kernfs.DynamicBytesFile fs *filesystem task *kernel.Task fd int32 } var _ dynamicInode = (*fdInfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { file, descriptorFlags := getTaskFD(d.task, d.fd) if file == nil { return linuxerr.ENOENT } defer d.fs.SafeDecRefFD(ctx, file) // TODO(b/121266871): Include pos, locks, and other data. For now we only // have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt flags := uint(file.StatusFlags()) | descriptorFlags.ToLinuxFileFlags() fmt.Fprintf(buf, "flags:\t0%o\n", flags) return nil } // Valid implements kernfs.Inode.Valid. func (d *fdInfoData) Valid(ctx context.Context, parent *kernfs.Dentry, name string) bool { return taskFDExists(ctx, d.fs, d.task, d.fd) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/task_files.go000066400000000000000000001250321465435605700256630ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "bytes" "fmt" "io" "sort" "strconv" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // "There is an (arbitrary) limit on the number of lines in the file. As at // Linux 3.18, the limit is five lines." - user_namespaces(7) const maxIDMapLines = 5 // getMM gets the kernel task's MemoryManager. No additional reference is taken on // mm here. This is safe because MemoryManager.destroy is required to leave the // MemoryManager in a state where it's still usable as a DynamicBytesSource. func getMM(task *kernel.Task) *mm.MemoryManager { var tmm *mm.MemoryManager task.WithMuLocked(func(t *kernel.Task) { if mm := t.MemoryManager(); mm != nil { tmm = mm } }) return tmm } // getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the // MemoryManager's users count is incremented, and must be decremented by the // caller when it is no longer in use. func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { var m *mm.MemoryManager task.WithMuLocked(func(t *kernel.Task) { m = t.MemoryManager() }) if m == nil || !m.IncUsers() { return nil, io.EOF } return m, nil } func checkTaskState(t *kernel.Task) error { switch t.ExitState() { case kernel.TaskExitZombie: return linuxerr.EACCES case kernel.TaskExitDead: return linuxerr.ESRCH } return nil } type bufferWriter struct { buf *bytes.Buffer } // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns // the number of bytes written. It may return a partial write without an // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not // return a full write with an error (i.e. srcs.NumBytes(), err) where err // != nil). func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { written := srcs.NumBytes() for !srcs.IsEmpty() { w.buf.Write(srcs.Head().ToSlice()) srcs = srcs.Tail() } return written, nil } // auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv. // // +stateify savable type auxvData struct { kernfs.DynamicBytesFile task *kernel.Task } var _ dynamicInode = (*auxvData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error { if d.task.ExitState() == kernel.TaskExitDead { return linuxerr.ESRCH } m, err := getMMIncRef(d.task) if err != nil { // Return empty file. return nil } defer m.DecUsers(ctx) auxv := m.Auxv() // Space for buffer with AT_NULL (0) terminator at the end. 
buf.Grow((len(auxv) + 1) * 16) for _, e := range auxv { var tmp [16]byte hostarch.ByteOrder.PutUint64(tmp[:8], e.Key) hostarch.ByteOrder.PutUint64(tmp[8:], uint64(e.Value)) buf.Write(tmp[:]) } var atNull [16]byte buf.Write(atNull[:]) return nil } // MetadataType enumerates the types of metadata that is exposed through proc. type MetadataType int const ( // Cmdline represents /proc/[pid]/cmdline. Cmdline MetadataType = iota // Environ represents /proc/[pid]/environ. Environ ) // GetMetadata fetches the process's metadata of type t and writes it into // buf. The process is identified by mm. func GetMetadata(ctx context.Context, mm *mm.MemoryManager, buf *bytes.Buffer, t MetadataType) error { // Figure out the bounds of the exec arg we are trying to read. var ar hostarch.AddrRange switch t { case Cmdline: ar = hostarch.AddrRange{ Start: mm.ArgvStart(), End: mm.ArgvEnd(), } case Environ: ar = hostarch.AddrRange{ Start: mm.EnvvStart(), End: mm.EnvvEnd(), } default: panic(fmt.Sprintf("unknown exec arg type %v", t)) } if ar.Start == 0 || ar.End == 0 { // Don't attempt to read before the start/end are set up. return io.EOF } // N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true // until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading // cmdline and environment"). writer := &bufferWriter{buf: buf} if n, err := mm.CopyInTo(ctx, hostarch.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil { // Nothing to copy or something went wrong. return err } // On Linux, if the NULL byte at the end of the argument vector has been // overwritten, it continues reading the environment vector as part of // the argument vector. if t == Cmdline && buf.Bytes()[buf.Len()-1] != 0 { if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 { // If we found a NULL character somewhere else in argv, truncate the // return up to the NULL terminator (including it). buf.Truncate(end) return nil } // There is no NULL terminator in the string, return into envp. arEnvv := hostarch.AddrRange{ Start: mm.EnvvStart(), End: mm.EnvvEnd(), } // Upstream limits the returned amount to one page of slop. // https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208 // we'll return one page total between argv and envp because of the // above page restrictions. if buf.Len() >= hostarch.PageSize { // Returned at least one page already, nothing else to add. return nil } remaining := hostarch.PageSize - buf.Len() if int(arEnvv.Length()) > remaining { end, ok := arEnvv.Start.AddLength(uint64(remaining)) if !ok { return linuxerr.EFAULT } arEnvv.End = end } if _, err := mm.CopyInTo(ctx, hostarch.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil { return err } // Linux will return envp up to and including the first NULL character, // so find it. envStart := int(ar.Length()) if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 { buf.Truncate(envStart + nullIdx) } } return nil } // metadataData implements vfs.DynamicBytesSource for proc metadata fields like: // // - /proc/[pid]/cmdline // - /proc/[pid]/environ // // +stateify savable type metadataData struct { kernfs.DynamicBytesFile task *kernel.Task // arg is the type of exec argument this file contains. metaType MetadataType } var _ dynamicInode = (*metadataData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. 
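// GetMetadata's cmdline handling above follows a few rules worth spelling out:
// if the argv region still ends in a NUL byte it is returned verbatim; if the
// trailing NUL was overwritten, the result is cut at the first NUL found in
// argv, or, failing that, the read continues into the environment, clamped to
// one page in total and cut at the first NUL after argv. A simplified,
// self-contained sketch over plain byte slices (exampleClampCmdline and
// pageSize are illustrative stand-ins, with pageSize playing the role of
// hostarch.PageSize):
func exampleClampCmdline(argv, envv []byte) []byte {
	const pageSize = 4096 // assumption for the sketch
	if n := len(argv); n > 0 && argv[n-1] == 0 {
		// argv is still NUL-terminated: return it as-is.
		return argv
	}
	if i := bytes.IndexByte(argv, 0); i != -1 {
		// A NUL exists elsewhere in argv: cut there.
		return argv[:i]
	}
	out := append([]byte(nil), argv...)
	if len(out) >= pageSize {
		// Already produced at least a page; don't read the environment.
		return out
	}
	if remaining := pageSize - len(out); len(envv) > remaining {
		envv = envv[:remaining]
	}
	out = append(out, envv...)
	// Cut at the first NUL that appears in the appended environment.
	if i := bytes.IndexByte(out[len(argv):], 0); i != -1 {
		out = out[:len(argv)+i]
	}
	return out
}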
func (d *metadataData) Generate(ctx context.Context, buf *bytes.Buffer) error { if d.task.ExitState() == kernel.TaskExitDead { return linuxerr.ESRCH } m, err := getMMIncRef(d.task) if err != nil { // Return empty file. return nil } defer m.DecUsers(ctx) return GetMetadata(ctx, m, buf, d.metaType) } // +stateify savable type commInode struct { kernfs.DynamicBytesFile task *kernel.Task } func (fs *filesystem) newComm(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { inode := &commInode{task: task} inode.DynamicBytesFile.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) return inode } func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { // This file can always be read or written by members of the same thread // group. See fs/proc/base.c:proc_tid_comm_permission. t := kernel.TaskFromContext(ctx) if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() { return nil } return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats) } // commData implements vfs.WritableDynamicBytesSource for /proc/[pid]/comm. // // +stateify savable type commData struct { kernfs.DynamicBytesFile task *kernel.Task } var _ dynamicInode = (*commData)(nil) var _ vfs.WritableDynamicBytesSource = (*commData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString(d.task.Name()) buf.WriteString("\n") return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *commData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { srclen := src.NumBytes() nameLen := int64(linux.TASK_COMM_LEN - 1) if srclen < nameLen { nameLen = srclen } name := make([]byte, nameLen) if _, err := src.CopyIn(ctx, name); err != nil { return 0, err } // Only allow writes from the same thread group, otherwise return // EINVAL. See fs/proc/base.c:comm_write. // // Note that this check exists in addition to the same-thread-group // check in CheckPermissions. t := kernel.TaskFromContext(ctx) if t == nil || t.ThreadGroup() != d.task.ThreadGroup() { return 0, linuxerr.EINVAL } d.task.SetName(string(name)) return int64(srclen), nil } // idMapData implements vfs.WritableDynamicBytesSource for // /proc/[pid]/{gid_map|uid_map}. // // +stateify savable type idMapData struct { kernfs.DynamicBytesFile task *kernel.Task gids bool } var _ dynamicInode = (*idMapData)(nil) var _ vfs.WritableDynamicBytesSource = (*idMapData)(nil) // Generate implements vfs.WritableDynamicBytesSource.Generate. func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { var entries []auth.IDMapEntry if d.gids { entries = d.task.UserNamespace().GIDMap() } else { entries = d.task.UserNamespace().UIDMap() } for _, e := range entries { fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length) } return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *idMapData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { // "In addition, the number of bytes written to the file must be less than // the system page size, and the write must be performed at the start of // the file ..." 
- user_namespaces(7) srclen := src.NumBytes() if srclen >= hostarch.PageSize || offset != 0 { return 0, linuxerr.EINVAL } b := make([]byte, srclen) if _, err := src.CopyIn(ctx, b); err != nil { return 0, err } // Truncate from the first NULL byte. var nul int64 nul = int64(bytes.IndexByte(b, 0)) if nul == -1 { nul = srclen } b = b[:nul] // Remove the last \n. if nul >= 1 && b[nul-1] == '\n' { b = b[:nul-1] } lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) if len(lines) > maxIDMapLines { return 0, linuxerr.EINVAL } entries := make([]auth.IDMapEntry, len(lines)) for i, l := range lines { var e auth.IDMapEntry _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) if err != nil { return 0, linuxerr.EINVAL } entries[i] = e } var err error if d.gids { err = d.task.UserNamespace().SetGIDMap(ctx, entries) } else { err = d.task.UserNamespace().SetUIDMap(ctx, entries) } if err != nil { return 0, err } // On success, Linux's kernel/user_namespace.c:map_write() always returns // count, even if fewer bytes were used. return int64(srclen), nil } var _ kernfs.Inode = (*memInode)(nil) // memInode implements kernfs.Inode for /proc/[pid]/mem. // // +stateify savable type memInode struct { kernfs.InodeAttrs kernfs.InodeNoStatFS kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches task *kernel.Task locks vfs.FileLocks } func (fs *filesystem) newMemInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. inode := &memInode{task: task} inode.init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) return &taskOwnedInode{Inode: inode, owner: task} } func (f *memInode) init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) } // Open implements kernfs.Inode.Open. func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // TODO(gvisor.dev/issue/260): Add check for PTRACE_MODE_ATTACH_FSCREDS // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS // Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH if !kernel.ContextCanTrace(ctx, f.task, true) { return nil, linuxerr.EACCES } if err := checkTaskState(f.task); err != nil { return nil, err } fd := &memFD{} if err := fd.Init(rp.Mount(), d, f, opts.Flags); err != nil { return nil, err } return &fd.vfsfd, nil } // SetStat implements kernfs.Inode.SetStat. func (*memInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } var _ vfs.FileDescriptionImpl = (*memFD)(nil) // memFD implements vfs.FileDescriptionImpl for /proc/[pid]/mem. // // +stateify savable type memFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD inode *memInode // mu guards the fields below. mu sync.Mutex `state:"nosave"` offset int64 } // Init initializes memFD. 
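// The uid_map/gid_map write format accepted by idMapData.Write above is at
// most maxIDMapLines newline-separated lines, each holding three decimal
// fields: "<first ID inside the namespace> <first ID outside> <length>". A
// compact illustrative sketch of just the line parsing (exampleIDMapEntry and
// exampleParseIDMap are stand-in names; the real code fills auth.IDMapEntry
// and returns EINVAL on malformed input):
type exampleIDMapEntry struct {
	FirstID, FirstParentID, Length uint32
}

func exampleParseIDMap(b []byte) ([]exampleIDMapEntry, error) {
	// Stop at the first NUL byte and drop a single trailing newline.
	if i := bytes.IndexByte(b, 0); i != -1 {
		b = b[:i]
	}
	if n := len(b); n >= 1 && b[n-1] == '\n' {
		b = b[:n-1]
	}
	lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1)
	if len(lines) > maxIDMapLines {
		return nil, fmt.Errorf("too many mapping lines")
	}
	entries := make([]exampleIDMapEntry, len(lines))
	for i, l := range lines {
		if _, err := fmt.Sscan(string(l), &entries[i].FirstID, &entries[i].FirstParentID, &entries[i].Length); err != nil {
			return nil, err
		}
	}
	return entries, nil
}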
func (fd *memFD) Init(m *vfs.Mount, d *kernfs.Dentry, inode *memInode, flags uint32) error { fd.LockFD.Init(&inode.locks) if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return err } fd.inode = inode return nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *memFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() switch whence { case linux.SEEK_SET: case linux.SEEK_CUR: offset += fd.offset default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } fd.offset = offset return offset, nil } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if dst.NumBytes() == 0 { return 0, nil } m, err := getMMIncRef(fd.inode.task) if err != nil { return 0, err } defer m.DecUsers(ctx) // Buffer the read data because of MM locks buf := make([]byte, dst.NumBytes()) n, readErr := m.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true}) if n > 0 { if _, err := dst.CopyOut(ctx, buf[:n]); err != nil { return 0, linuxerr.EFAULT } return int64(n), nil } if readErr != nil { return 0, linuxerr.EIO } return 0, nil } // Read implements vfs.FileDescriptionImpl.Read. func (fd *memFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { fd.mu.Lock() n, err := fd.PRead(ctx, dst, fd.offset, opts) fd.offset += n fd.mu.Unlock() return n, err } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *memFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() return fd.inode.Stat(ctx, fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error { return linuxerr.EPERM } // Release implements vfs.FileDescriptionImpl.Release. func (fd *memFD) Release(context.Context) {} // limitsData implements vfs.DynamicBytesSource for /proc/[pid]/limits. // // +stateify savable type limitsData struct { kernfs.DynamicBytesFile task *kernel.Task } func (d *limitsData) Generate(ctx context.Context, buf *bytes.Buffer) error { taskLimits := d.task.Limits() // formatting matches the kernel output from linux/fs/proc/base.c:proc_pid_limits() fmt.Fprintf(buf, "Limit Soft Limit Hard Limit Units \n") for _, lt := range limits.AllLimitTypes { fmt.Fprintf(buf, "%-25s ", lt.Name()) l := taskLimits.Get(lt) if l.Cur == limits.Infinity { fmt.Fprintf(buf, "%-20s ", "unlimited") } else { fmt.Fprintf(buf, "%-20d ", l.Cur) } if l.Max == limits.Infinity { fmt.Fprintf(buf, "%-20s ", "unlimited") } else { fmt.Fprintf(buf, "%-20d ", l.Max) } if u := lt.Unit(); u != "" { fmt.Fprintf(buf, "%-10s", u) } buf.WriteByte('\n') } return nil } // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. // // +stateify savable type mapsData struct { kernfs.DynamicBytesFile task *kernel.Task } var _ dynamicInode = (*mapsData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { if mm := getMM(d.task); mm != nil { mm.ReadMapsDataInto(ctx, mm.MapsCallbackFuncForBuffer(buf)) } return nil } // smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. 
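// limitsData.Generate above emits the same fixed-width table as Linux's
// proc_pid_limits(): a 25-character limit-name column, two 20-character value
// columns where infinity prints as "unlimited", and a 10-character units
// column. An illustrative sketch of a single row (exampleLimitsRow and its
// isInfinite-style parameters are made up for the example):
func exampleLimitsRow(buf *bytes.Buffer, name string, cur, max uint64, curInf, maxInf bool, unit string) {
	fmt.Fprintf(buf, "%-25s ", name)
	writeVal := func(v uint64, inf bool) {
		if inf {
			fmt.Fprintf(buf, "%-20s ", "unlimited")
		} else {
			fmt.Fprintf(buf, "%-20d ", v)
		}
	}
	writeVal(cur, curInf)
	writeVal(max, maxInf)
	if unit != "" {
		fmt.Fprintf(buf, "%-10s", unit)
	}
	buf.WriteByte('\n')
}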
// // +stateify savable type smapsData struct { kernfs.DynamicBytesFile task *kernel.Task } var _ dynamicInode = (*smapsData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { if mm := getMM(d.task); mm != nil { mm.ReadSmapsDataInto(ctx, buf) } return nil } // +stateify savable type taskStatData struct { kernfs.DynamicBytesFile task *kernel.Task // If tgstats is true, accumulate fault stats (not implemented) and CPU // time across all tasks in t's thread group. tgstats bool // pidns is the PID namespace associated with the proc filesystem that // includes the file using this statData. pidns *kernel.PIDNamespace } var _ dynamicInode = (*taskStatData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task)) fmt.Fprintf(buf, "(%s) ", s.task.Name()) fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0]) ppid := kernel.ThreadID(0) if parent := s.task.Parent(); parent != nil { ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) } fmt.Fprintf(buf, "%d ", ppid) fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup())) fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session())) fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) fmt.Fprintf(buf, "0 " /* flags */) fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) var cputime usage.CPUStats if s.tgstats { cputime = s.task.ThreadGroup().CPUStats() } else { cputime = s.task.CPUStats() } fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) cputime = s.task.ThreadGroup().JoinedChildCPUStats() fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness()) fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count()) // itrealvalue. Since kernel 2.6.17, this field is no longer // maintained, and is hard coded as 0. fmt.Fprintf(buf, "0 ") // Start time is relative to boot time, expressed in clock ticks. fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime()))) var vss, rss uint64 if mm := getMM(s.task); mm != nil { vss = mm.VirtualMemorySize() rss = mm.ResidentSetSize() } fmt.Fprintf(buf, "%d %d ", vss, rss/hostarch.PageSize) // rsslim. fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur) fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) terminationSignal := linux.Signal(0) if s.task == s.task.ThreadGroup().Leader() { terminationSignal = s.task.ThreadGroup().TerminationSignal() } fmt.Fprintf(buf, "%d ", terminationSignal) fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) fmt.Fprintf(buf, "0\n" /* exit_code */) return nil } // statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. // // +stateify savable type statmData struct { kernfs.DynamicBytesFile task *kernel.Task } var _ dynamicInode = (*statmData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. 
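// taskStatData.Generate above writes the /proc/[pid]/stat line in Linux's
// field order: pid, the command name wrapped in parentheses, a single state
// character, ppid, pgrp, session, and so on, with memory reported as VSS in
// bytes and RSS in pages. A sketch of just the line prefix, useful when
// reasoning about parsers (exampleStatPrefix is an illustrative name):
func exampleStatPrefix(pid int32, comm string, state byte, ppid int32) string {
	// Note the parentheses: comm may itself contain spaces, so parsers
	// typically scan for the last ')' before splitting the remaining fields.
	return fmt.Sprintf("%d (%s) %c %d ", pid, comm, state, ppid)
}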
func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { var vss, rss uint64 if mm := getMM(s.task); mm != nil { vss = mm.VirtualMemorySize() rss = mm.ResidentSetSize() } fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize) return nil } // statusInode implements kernfs.Inode for /proc/[pid]/status. // // +stateify savable type statusInode struct { kernfs.InodeAttrs kernfs.InodeNoStatFS kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches task *kernel.Task pidns *kernel.PIDNamespace locks vfs.FileLocks } // statusFD implements vfs.FileDescriptionImpl and vfs.DynamicByteSource for // /proc/[pid]/status. // // +stateify savable type statusFD struct { statusFDLowerBase vfs.DynamicBytesFileDescriptionImpl vfs.LockFD vfsfd vfs.FileDescription inode *statusInode task *kernel.Task pidns *kernel.PIDNamespace userns *auth.UserNamespace // equivalent to struct file::f_cred::user_ns } // statusFDLowerBase is a dumb hack to ensure that statusFD prefers // vfs.DynamicBytesFileDescriptionImpl methods to vfs.FileDescriptinDefaultImpl // methods. // // +stateify savable type statusFDLowerBase struct { vfs.FileDescriptionDefaultImpl } func (fs *filesystem) newStatusInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, ino uint64, perm linux.FileMode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. inode := &statusInode{ task: task, pidns: pidns, } inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeRegular|perm) return &taskOwnedInode{Inode: inode, owner: task} } // Open implements kernfs.Inode.Open. func (s *statusInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &statusFD{ inode: s, task: s.task, pidns: s.pidns, userns: rp.Credentials().UserNamespace, } fd.LockFD.Init(&s.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } fd.DynamicBytesFileDescriptionImpl.Init(&fd.vfsfd, fd) return &fd.vfsfd, nil } // SetStat implements kernfs.Inode.SetStat. func (*statusInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { return linuxerr.EPERM } // Release implements vfs.FileDescriptionImpl.Release. func (s *statusFD) Release(ctx context.Context) { } // Stat implements vfs.FileDescriptionImpl.Stat. func (s *statusFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := s.vfsfd.VirtualDentry().Mount().Filesystem() return s.inode.Stat(ctx, fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (s *statusFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { return linuxerr.EPERM } // Generate implements vfs.DynamicBytesSource.Generate. 
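// statusFDLowerBase above exploits Go's method-promotion rules: when two
// embedded types would supply the same method at the same depth the selector
// is ambiguous, but a method promoted from a shallower embedding level wins
// outright. Wrapping vfs.FileDescriptionDefaultImpl in an extra struct pushes
// its methods one level deeper, so the DynamicBytesFileDescriptionImpl
// versions are preferred. A self-contained sketch of the same trick (all
// names below are illustrative, not part of the vfs package):
type exampleDefaultImpl struct{}

func (exampleDefaultImpl) Kind() string { return "default" }

type exampleDynamicImpl struct{}

func (exampleDynamicImpl) Kind() string { return "dynamic" }

// exampleLowerBase buries exampleDefaultImpl one embedding level deeper.
type exampleLowerBase struct {
	exampleDefaultImpl
}

type exampleFDImpl struct {
	exampleLowerBase   // Kind is at depth 2 here.
	exampleDynamicImpl // Kind is at depth 1 here and wins.
}

// exampleFDImpl{}.Kind() returns "dynamic": the shallower method is promoted.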
func (s *statusFD) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name()) fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus()) fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup())) fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task)) ppid := kernel.ThreadID(0) if parent := s.task.Parent(); parent != nil { ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) } fmt.Fprintf(buf, "PPid:\t%d\n", ppid) tpid := kernel.ThreadID(0) if tracer := s.task.Tracer(); tracer != nil { tpid = s.pidns.IDOfTask(tracer) } fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) creds := s.task.Credentials() ruid := creds.RealKUID.In(s.userns).OrOverflow() euid := creds.EffectiveKUID.In(s.userns).OrOverflow() suid := creds.SavedKUID.In(s.userns).OrOverflow() rgid := creds.RealKGID.In(s.userns).OrOverflow() egid := creds.EffectiveKGID.In(s.userns).OrOverflow() sgid := creds.SavedKGID.In(s.userns).OrOverflow() var fds int var vss, rss, data uint64 s.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { fds = fdTable.CurrentMaxFDs() } }) if mm := getMM(s.task); mm != nil { vss = mm.VirtualMemorySize() rss = mm.ResidentSetSize() data = mm.VirtualDataSize() } // Filesystem user/group IDs aren't implemented; effective UID/GID are used // instead. fmt.Fprintf(buf, "Uid:\t%d\t%d\t%d\t%d\n", ruid, euid, suid, euid) fmt.Fprintf(buf, "Gid:\t%d\t%d\t%d\t%d\n", rgid, egid, sgid, egid) fmt.Fprintf(buf, "FDSize:\t%d\n", fds) buf.WriteString("Groups:\t") // There is a space between each pair of supplemental GIDs, as well as an // unconditional trailing space that some applications actually depend on. var sep string for _, kgid := range creds.ExtraKGIDs { fmt.Fprintf(buf, "%s%d", sep, kgid.In(s.userns).OrOverflow()) sep = " " } buf.WriteString(" \n") fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count()) fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode()) // We unconditionally report a single NUMA node. See // pkg/sentry/syscalls/linux/sys_mempolicy.go. fmt.Fprintf(buf, "Mems_allowed:\t1\n") fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") return nil } // ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider. type ioUsage interface { // IOUsage returns the io usage data. IOUsage() *usage.IO } // +stateify savable type ioData struct { kernfs.DynamicBytesFile ioUsage } var _ dynamicInode = (*ioData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { io := usage.IO{} io.Accumulate(i.IOUsage()) fmt.Fprintf(buf, "char: %d\n", io.CharsRead.RacyLoad()) fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten.RacyLoad()) fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls.RacyLoad()) fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls.RacyLoad()) fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead.RacyLoad()) fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten.RacyLoad()) fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled.RacyLoad()) return nil } // oomScoreAdj is a stub of the /proc//oom_score_adj file. 
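// statusFD.Generate above reports the Uid:/Gid: rows as four tab-separated
// values: real, effective, saved, and filesystem IDs, with the filesystem ID
// repeated from the effective one because fsuid/fsgid are not implemented,
// and memory sizes converted to kB with a right shift by 10. A small
// illustrative sketch of those two rows (exampleStatusIDs is a made-up name):
func exampleStatusIDs(buf *bytes.Buffer, ruid, euid, suid, rgid, egid, sgid uint32) {
	fmt.Fprintf(buf, "Uid:\t%d\t%d\t%d\t%d\n", ruid, euid, suid, euid)
	fmt.Fprintf(buf, "Gid:\t%d\t%d\t%d\t%d\n", rgid, egid, sgid, egid)
}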
// // +stateify savable type oomScoreAdj struct { kernfs.DynamicBytesFile task *kernel.Task } var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { if o.task.ExitState() == kernel.TaskExitDead { return linuxerr.ESRCH } fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj()) return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (o *oomScoreAdj) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { if src.NumBytes() == 0 { return 0, nil } // Limit input size so as not to impact performance if input size is large. src = src.TakeFirst(hostarch.PageSize - 1) str, err := usermem.CopyStringIn(ctx, src.IO, src.Addrs.Head().Start, int(src.Addrs.Head().Length()), src.Opts) if err != nil && err != linuxerr.ENAMETOOLONG { return 0, err } str = strings.TrimSpace(str) v, err := strconv.ParseInt(str, 0, 32) if err != nil { return 0, linuxerr.EINVAL } if o.task.ExitState() == kernel.TaskExitDead { return 0, linuxerr.ESRCH } if err := o.task.SetOOMScoreAdj(int32(v)); err != nil { return 0, err } return src.NumBytes(), nil } // exeSymlink is an symlink for the /proc/[pid]/exe file. // // +stateify savable type exeSymlink struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeSymlink kernfs.InodeWatches fs *filesystem task *kernel.Task } var _ kernfs.Inode = (*exeSymlink)(nil) func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { inode := &exeSymlink{ fs: fs, task: task, } inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } // Readlink implements kernfs.Inode.Readlink. func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { exec, _, err := s.Getlink(ctx, nil) if err != nil { return "", err } defer s.fs.SafeDecRef(ctx, exec) root := vfs.RootFromContext(ctx) if !root.Ok() { panic("procfs Readlink requires context with root value") } defer s.fs.SafeDecRef(ctx, root) vfsObj := exec.Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec) return name, nil } // Getlink implements kernfs.Inode.Getlink. func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { if !kernel.ContextCanTrace(ctx, s.task, false) { return vfs.VirtualDentry{}, "", linuxerr.EACCES } if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err } mm := getMM(s.task) if mm == nil { return vfs.VirtualDentry{}, "", linuxerr.EACCES } // The MemoryManager may be destroyed, in which case // MemoryManager.destroy will simply set the executable to nil // (with locks held). exec := mm.Executable() if exec == nil { return vfs.VirtualDentry{}, "", linuxerr.ESRCH } defer exec.DecRef(ctx) vd := exec.VirtualDentry() vd.IncRef() return vd, "", nil } // cwdSymlink is an symlink for the /proc/[pid]/cwd file. 
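// oomScoreAdj.Write above accepts the value as text: surrounding whitespace
// (including the usual trailing newline) is trimmed and the remainder is
// parsed as a 32-bit integer with base 0, so plain decimal as well as 0x- and
// 0-prefixed forms are accepted. A minimal illustrative sketch of that parsing
// step (exampleParseOOMScoreAdj is a made-up name; the real handler also
// clamps the input to just under a page and returns EINVAL on parse errors):
func exampleParseOOMScoreAdj(s string) (int32, error) {
	v, err := strconv.ParseInt(strings.TrimSpace(s), 0, 32)
	if err != nil {
		return 0, err
	}
	return int32(v), nil
}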
// // +stateify savable type cwdSymlink struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeSymlink kernfs.InodeWatches fs *filesystem task *kernel.Task } var _ kernfs.Inode = (*cwdSymlink)(nil) func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { inode := &cwdSymlink{ fs: fs, task: task, } inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } // Readlink implements kernfs.Inode.Readlink. func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { cwd, _, err := s.Getlink(ctx, nil) if err != nil { return "", err } defer s.fs.SafeDecRef(ctx, cwd) root := vfs.RootFromContext(ctx) if !root.Ok() { panic("procfs Readlink requires context with root value") } defer s.fs.SafeDecRef(ctx, root) vfsObj := cwd.Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd) return name, nil } // Getlink implements kernfs.Inode.Getlink. func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { if !kernel.ContextCanTrace(ctx, s.task, false) { return vfs.VirtualDentry{}, "", linuxerr.EACCES } if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err } cwd := s.task.FSContext().WorkingDirectory() if !cwd.Ok() { // It could have raced with process deletion. return vfs.VirtualDentry{}, "", linuxerr.ESRCH } // The reference is transferred to the caller. return cwd, "", nil } // rootSymlink is an symlink for the /proc/[pid]/root file. // // +stateify savable type rootSymlink struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeSymlink kernfs.InodeWatches fs *filesystem task *kernel.Task } var _ kernfs.Inode = (*rootSymlink)(nil) func (fs *filesystem) newRootSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { inode := &rootSymlink{ fs: fs, task: task, } inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } // Readlink implements kernfs.Inode.Readlink. func (s *rootSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { root, _, err := s.Getlink(ctx, nil) if err != nil { return "", err } defer s.fs.SafeDecRef(ctx, root) vfsRoot := vfs.RootFromContext(ctx) if !vfsRoot.Ok() { panic("procfs Readlink requires context with root value") } defer s.fs.SafeDecRef(ctx, vfsRoot) vfsObj := root.Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, vfsRoot, root) return name, nil } // Getlink implements kernfs.Inode.Getlink. func (s *rootSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { if !kernel.ContextCanTrace(ctx, s.task, false) { return vfs.VirtualDentry{}, "", linuxerr.EACCES } if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err } root := s.task.FSContext().RootDirectory() if !root.Ok() { // It could have raced with process deletion. return vfs.VirtualDentry{}, "", linuxerr.ESRCH } // The reference is transferred to the caller. return root, "", nil } // mountInfoData is used to implement /proc/[pid]/mountinfo. // // +stateify savable type mountInfoData struct { kernfs.DynamicBytesFile fs *filesystem task *kernel.Task } var _ dynamicInode = (*mountInfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. 
func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { var fsctx *kernel.FSContext i.task.WithMuLocked(func(t *kernel.Task) { fsctx = t.FSContext() }) if fsctx == nil { // The task has been destroyed. Nothing to show here. return nil } rootDir := fsctx.RootDirectory() if !rootDir.Ok() { // Root has been destroyed. Don't try to read mounts. return nil } defer i.fs.SafeDecRef(ctx, rootDir) i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) return nil } // mountsData is used to implement /proc/[pid]/mounts. // // +stateify savable type mountsData struct { kernfs.DynamicBytesFile fs *filesystem task *kernel.Task } var _ dynamicInode = (*mountsData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { var fsctx *kernel.FSContext i.task.WithMuLocked(func(t *kernel.Task) { fsctx = t.FSContext() }) if fsctx == nil { // The task has been destroyed. Nothing to show here. return nil } rootDir := fsctx.RootDirectory() if !rootDir.Ok() { // Root has been destroyed. Don't try to read mounts. return nil } defer i.fs.SafeDecRef(ctx, rootDir) i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) return nil } // +stateify savable type namespaceSymlink struct { kernfs.StaticSymlink task *kernel.Task nsType int } func (fs *filesystem) newNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, nsType int) kernfs.Inode { inode := &namespaceSymlink{task: task, nsType: nsType} // Note: credentials are overridden by taskOwnedInode. inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, "") taskInode := &taskOwnedInode{Inode: inode, owner: task} return taskInode } func (fs *filesystem) newPIDNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { target := fmt.Sprintf("pid:[%d]", task.PIDNamespace().ID()) inode := &namespaceSymlink{task: task} // Note: credentials are overridden by taskOwnedInode. inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) taskInode := &taskOwnedInode{Inode: inode, owner: task} return taskInode } func (fs *filesystem) newFakeNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, ns string) kernfs.Inode { // Namespace symlinks should contain the namespace name and the inode number // for the namespace instance, so for example user:[123456]. We currently fake // the inode number by sticking the symlink inode in its place. target := fmt.Sprintf("%s:[%d]", ns, ino) inode := &namespaceSymlink{task: task} // Note: credentials are overridden by taskOwnedInode. inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) taskInode := &taskOwnedInode{Inode: inode, owner: task} return taskInode } func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode { switch s.nsType { case linux.CLONE_NEWNET: netns := t.GetNetworkNamespace() if netns == nil { return nil } return netns.GetInode() case linux.CLONE_NEWIPC: if ipcns := t.GetIPCNamespace(); ipcns != nil { return ipcns.GetInode() } return nil case linux.CLONE_NEWUTS: if utsns := t.GetUTSNamespace(); utsns != nil { return utsns.GetInode() } return nil case linux.CLONE_NEWNS: mntns := t.GetMountNamespace() if mntns == nil { return nil } inode, _ := mntns.Refs.(*nsfs.Inode) return inode default: panic("unknown namespace") } } // Readlink implements kernfs.Inode.Readlink. 
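// The symlinks under /proc/[pid]/ns/ resolve to targets of the form
// "<namespace type>:[<inode number>]", e.g. "pid:[4026531836]" or, for the
// faked user namespace entry above, "user:[<symlink inode>]". A one-line
// illustrative sketch of the target format (exampleNSLinkTarget is a made-up
// name):
func exampleNSLinkTarget(nsType string, inode uint64) string {
	return fmt.Sprintf("%s:[%d]", nsType, inode)
}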
func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if err := checkTaskState(s.task); err != nil { return "", err } if s.nsType != 0 { inode := s.getInode(s.task) if inode == nil { return "", linuxerr.ENOENT } target := inode.Name() inode.DecRef(ctx) return target, nil } return s.StaticSymlink.Readlink(ctx, mnt) } // Getlink implements kernfs.Inode.Getlink. func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err } if s.nsType != 0 { inode := s.getInode(s.task) if inode == nil { return vfs.VirtualDentry{}, "", linuxerr.ENOENT } defer inode.DecRef(ctx) return inode.VirtualDentry(), "", nil } // Create a synthetic inode to represent the namespace. fs := mnt.Filesystem().Impl().(*filesystem) nsInode := &namespaceInode{} nsInode.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0444) dentry := &kernfs.Dentry{} dentry.Init(&fs.Filesystem, nsInode) vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) // Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1. mnt.IncRef() return vd, "", nil } // namespaceInode is a synthetic inode created to represent a namespace in // /proc/[pid]/ns/*. // // +stateify savable type namespaceInode struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches locks vfs.FileLocks } var _ kernfs.Inode = (*namespaceInode)(nil) // Init initializes a namespace inode. func (i *namespaceInode) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } i.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) } // Open implements kernfs.Inode.Open. func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &namespaceFD{inode: i} i.IncRef() fd.LockFD.Init(&i.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil } // namespace FD is a synthetic file that represents a namespace in // /proc/[pid]/ns/*. // // +stateify savable type namespaceFD struct { vfs.FileDescriptionDefaultImpl vfs.LockFD vfsfd vfs.FileDescription inode *namespaceInode } var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil) // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() return fd.inode.Stat(ctx, vfs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() creds := auth.CredentialsFromContext(ctx) return fd.inode.SetStat(ctx, vfs, creds, opts) } // Release implements vfs.FileDescriptionImpl.Release. func (fd *namespaceFD) Release(ctx context.Context) { fd.inode.DecRef(ctx) } // taskCgroupData generates data for /proc/[pid]/cgroup. 
// // +stateify savable type taskCgroupData struct { dynamicBytesFileSetAttr task *kernel.Task } var _ dynamicInode = (*taskCgroupData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *taskCgroupData) Generate(ctx context.Context, buf *bytes.Buffer) error { // When a task is existing on Linux, a task's cgroup set is cleared and // reset to the initial cgroup set, which is essentially the set of root // cgroups. Because of this, the /proc//cgroup file is always readable // on Linux throughout a task's lifetime. // // The sentry removes tasks from cgroups during the exit process, but // doesn't move them into an initial cgroup set, so partway through task // exit this file show a task is in no cgroups, which is incorrect. Instead, // once a task has left its cgroups, we return an error. if d.task.ExitState() >= kernel.TaskExitInitiated { return linuxerr.ESRCH } d.task.GenerateProcTaskCgroup(buf) return nil } // childrenData implements vfs.DynamicBytesSource for /proc/[pid]/task/[tid]/children. // // +stateify savable type childrenData struct { kernfs.DynamicBytesFile task *kernel.Task // pidns is the PID namespace associated with the proc filesystem that // includes the file using this childrenData. pidns *kernel.PIDNamespace } // Generate implements vfs.DynamicBytesSource.Generate. func (d *childrenData) Generate(ctx context.Context, buf *bytes.Buffer) error { children := d.task.Children() var childrenTIDs []int for childTask := range children { childrenTIDs = append(childrenTIDs, int(d.pidns.IDOfTask(childTask))) } // The TIDs need to be in sorted order in accordance with the Linux implementation. sort.Ints(childrenTIDs) for _, childrenTID := range childrenTIDs { // It contains a space-separated list of child tasks of the `task`. // Each task is represented by its TID. fmt.Fprintf(buf, "%d ", childrenTID) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/task_inode_refs.go000066400000000000000000000101751465435605700266770ustar00rootroot00000000000000package proc import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const taskInodeenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var taskInodeobj *taskInode // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type taskInodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. 
See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *taskInodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *taskInodeRefs) RefType() string { return fmt.Sprintf("%T", taskInodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *taskInodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *taskInodeRefs) LogRefs() bool { return taskInodeenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *taskInodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *taskInodeRefs) IncRef() { v := r.refCount.Add(1) if taskInodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *taskInodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if taskInodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *taskInodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if taskInodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *taskInodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/task_net.go000066400000000000000000000654751465435605700253650ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
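// The generated refs template above packs two 32-bit counters into a single
// int64: the upper half counts speculative references taken by TryIncRef and
// the lower half counts real references. A standalone illustrative sketch of
// the same arithmetic using the standard sync/atomic package (the generated
// code uses atomicbitops.Int64; exampleRefs is a made-up name and the sketch
// assumes "sync/atomic" is imported):
type exampleRefs struct {
	refCount int64
}

const exampleSpeculativeRef = int64(1) << 32

func (r *exampleRefs) TryIncRef() bool {
	// Take a speculative reference first. If the real count (low 32 bits) is
	// zero, the object is already dead: roll the speculation back and fail.
	if v := atomic.AddInt64(&r.refCount, exampleSpeculativeRef); int32(v) == 0 {
		atomic.AddInt64(&r.refCount, -exampleSpeculativeRef)
		return false
	}
	// Otherwise convert the speculative reference into a real one.
	atomic.AddInt64(&r.refCount, -exampleSpeculativeRef+1)
	return true
}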
package proc import ( "bytes" "fmt" "io" "reflect" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/tcpip/header" ) func (fs *filesystem) newTaskNetDir(ctx context.Context, task *kernel.Task) kernfs.Inode { k := task.Kernel() pidns := task.PIDNamespace() root := auth.NewRootCredentials(pidns.UserNamespace()) var contents map[string]kernfs.Inode var stack inet.Stack if netns := task.GetNetworkNamespace(); netns != nil { netns.DecRef(ctx) stack = netns.Stack() } if stack != nil { const ( arp = "IP address HW type Flags HW address Mask Device\n" netlink = "sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n" packet = "sk RefCnt Type Proto Iface R Rmem User Inode\n" protocols = "protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n" ptype = "Type Device Function\n" upd6 = " sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n" ) psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)) // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task // network namespace. contents = map[string]kernfs.Inode{ "dev": fs.newInode(ctx, root, 0444, &netDevData{stack: stack}), "snmp": fs.newInode(ctx, root, 0444, &netSnmpData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, if the file contains a header the stub is just the header // otherwise it is an empty file. "arp": fs.newInode(ctx, root, 0444, newStaticFile(arp)), "netlink": fs.newInode(ctx, root, 0444, newStaticFile(netlink)), "netstat": fs.newInode(ctx, root, 0444, &netStatData{}), "packet": fs.newInode(ctx, root, 0444, newStaticFile(packet)), "protocols": fs.newInode(ctx, root, 0444, newStaticFile(protocols)), // Linux sets psched values to: nsec per usec, psched tick in ns, 1000000, // high res timer ticks per sec (ClockGetres returns 1ns resolution). "psched": fs.newInode(ctx, root, 0444, newStaticFile(psched)), "ptype": fs.newInode(ctx, root, 0444, newStaticFile(ptype)), "route": fs.newInode(ctx, root, 0444, &netRouteData{stack: stack}), "tcp": fs.newInode(ctx, root, 0444, &netTCPData{kernel: k}), "udp": fs.newInode(ctx, root, 0444, &netUDPData{kernel: k}), "unix": fs.newInode(ctx, root, 0444, &netUnixData{kernel: k}), } if stack.SupportsIPv6() { contents["if_inet6"] = fs.newInode(ctx, root, 0444, &ifinet6{stack: stack}) contents["ipv6_route"] = fs.newInode(ctx, root, 0444, newStaticFile("")) contents["tcp6"] = fs.newInode(ctx, root, 0444, &netTCP6Data{kernel: k}) contents["udp6"] = fs.newInode(ctx, root, 0444, newStaticFile(upd6)) } } return fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0555, contents) } // ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. 
// // +stateify savable type ifinet6 struct { kernfs.DynamicBytesFile stack inet.Stack } var _ dynamicInode = (*ifinet6)(nil) func (n *ifinet6) contents() []string { var lines []string nics := n.stack.Interfaces() for id, naddrs := range n.stack.InterfaceAddrs() { nic, ok := nics[id] if !ok { // NIC was added after NICNames was called. We'll just ignore it. continue } for _, a := range naddrs { // IPv6 only. if a.Family != linux.AF_INET6 { continue } // Fields: // IPv6 address displayed in 32 hexadecimal chars without colons // Netlink device number (interface index) in hexadecimal (use nic id) // Prefix length in hexadecimal // Scope value (use 0) // Interface flags // Device name lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name)) } } return lines } // Generate implements vfs.DynamicBytesSource.Generate. func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error { for _, l := range n.contents() { buf.WriteString(l) } return nil } // netDevData implements vfs.DynamicBytesSource for /proc/net/dev. // // +stateify savable type netDevData struct { kernfs.DynamicBytesFile stack inet.Stack } var _ dynamicInode = (*netDevData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (n *netDevData) Generate(ctx context.Context, buf *bytes.Buffer) error { interfaces := n.stack.Interfaces() buf.WriteString("Inter-| Receive | Transmit\n") buf.WriteString(" face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n") for _, i := range interfaces { // Implements the same format as // net/core/net-procfs.c:dev_seq_printf_stats. var stats inet.StatDev if err := n.stack.Statistics(&stats, i.Name); err != nil { log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) continue } fmt.Fprintf( buf, "%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", i.Name, // Received stats[0], // bytes stats[1], // packets stats[2], // errors stats[3], // dropped stats[4], // fifo stats[5], // frame stats[6], // compressed stats[7], // multicast // Transmitted stats[8], // bytes stats[9], // packets stats[10], // errors stats[11], // dropped stats[12], // fifo stats[13], // frame stats[14], // compressed stats[15], // multicast ) } return nil } // netUnixData implements vfs.DynamicBytesSource for /proc/net/unix. // // +stateify savable type netUnixData struct { kernfs.DynamicBytesFile kernel *kernel.Kernel } var _ dynamicInode = (*netUnixData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") for _, se := range n.kernel.ListSockets() { s := se.Sock if !s.TryIncRef() { // Racing with socket destruction, this is ok. continue } if family, _, _ := s.Impl().(socket.Socket).Type(); family != linux.AF_UNIX { s.DecRef(ctx) // Not a unix socket. continue } sops := s.Impl().(*unix.Socket) addr, err := sops.Endpoint().GetLocalAddress() if err != nil { log.Warningf("Failed to retrieve socket name from %+v: %v", s, err) addr.Addr = "" } sockFlags := 0 if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok { ce.Lock() if ce.ListeningLocked() { // For unix domain sockets, linux reports a single flag // value if the socket is listening, of __SO_ACCEPTCON. sockFlags = linux.SO_ACCEPTCON } ce.Unlock() } // Get inode number. 
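// The inode number below is obtained with a masked Stat request (only
// STATX_INO). If the Stat fails, or the returned mask does not include
// STATX_INO, the code logs a warning and falls back to printing 0, the same
// best-effort approach used for the TCP and UDP tables later in this file.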
var ino uint64 stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_INO}) if statErr != nil || stat.Mask&linux.STATX_INO == 0 { log.Warningf("Failed to retrieve ino for socket file: %v", statErr) } else { ino = stat.Ino } // In the socket entry below, the value for the 'Num' field requires // some consideration. Linux prints the address to the struct // unix_sock representing a socket in the kernel, but may redact the // value for unprivileged users depending on the kptr_restrict // sysctl. // // One use for this field is to allow a privileged user to // introspect into the kernel memory to determine information about // a socket not available through procfs, such as the socket's peer. // // In gvisor, returning a pointer to our internal structures would // be pointless, as it wouldn't match the memory layout for struct // unix_sock, making introspection difficult. We could populate a // struct unix_sock with the appropriate data, but even that // requires consideration for which kernel version to emulate, as // the definition of this struct changes over time. // // For now, we always redact this pointer. fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", (*unix.Socket)(nil), // Num, pointer to kernel socket struct. s.ReadRefs()-1, // RefCount, don't count our own ref. 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. sops.State(), // State. ino, // Inode. ) // Path if len(addr.Addr) != 0 { if addr.Addr[0] == 0 { // Abstract path. fmt.Fprintf(buf, " @%s", string(addr.Addr[1:])) } else { fmt.Fprintf(buf, " %s", string(addr.Addr)) } } fmt.Fprintf(buf, "\n") s.DecRef(ctx) } return nil } func networkToHost16(n uint16) uint16 { // n is in network byte order, so is big-endian. The most-significant byte // should be stored in the lower address. // // We manually inline binary.BigEndian.Uint16() because Go does not support // non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to // binary.BigEndian.Uint16() require a read of binary.BigEndian and an // interface method call, defeating inlining. buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)} return hostarch.ByteOrder.Uint16(buf[:]) } func writeInetAddr(w io.Writer, family int, i linux.SockAddr) { switch family { case linux.AF_INET: var a linux.SockAddrInet if i != nil { a = *i.(*linux.SockAddrInet) } // linux.SockAddrInet.Port is stored in the network byte order and is // printed like a number in host byte order. Note that all numbers in host // byte order are printed with the most-significant byte first when // formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux. port := networkToHost16(a.Port) // linux.SockAddrInet.Addr is stored as a byte slice in big-endian order // (i.e. most-significant byte in index 0). Linux represents this as a // __be32 which is a typedef for an unsigned int, and is printed with // %X. This means that for a little-endian machine, Linux prints the // least-significant byte of the address first. To emulate this, we first // invert the byte order for the address using hostarch.ByteOrder.Uint32, // which makes it have the equivalent encoding to a __be32 on a little // endian machine. Note that this operation is a no-op on a big endian // machine. Then similar to Linux, we format it with %X, which will print // the most-significant byte of the __be32 address first, which is now // actually the least-significant byte of the original address in // linux.SockAddrInet.Addr on little endian machines, due to the conversion. 
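// Worked example (little-endian host): the loopback address 127.0.0.1 is
// stored in a.Addr as the network-order bytes [0x7F 0x00 0x00 0x01], so
// hostarch.ByteOrder.Uint32 below yields 0x0100007F. With port 80 this entry
// prints as "0100007F:0050", which matches the familiar Linux /proc/net/tcp
// formatting for 127.0.0.1:80.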
addr := hostarch.ByteOrder.Uint32(a.Addr[:]) fmt.Fprintf(w, "%08X:%04X ", addr, port) case linux.AF_INET6: var a linux.SockAddrInet6 if i != nil { a = *i.(*linux.SockAddrInet6) } port := networkToHost16(a.Port) addr0 := hostarch.ByteOrder.Uint32(a.Addr[0:4]) addr1 := hostarch.ByteOrder.Uint32(a.Addr[4:8]) addr2 := hostarch.ByteOrder.Uint32(a.Addr[8:12]) addr3 := hostarch.ByteOrder.Uint32(a.Addr[12:16]) fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port) } } func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error { // t may be nil here if our caller is not part of a task goroutine. This can // happen for example if we're here for "sentryctl cat". When t is nil, // degrade gracefully and retrieve what we can. t := kernel.TaskFromContext(ctx) for _, se := range k.ListSockets() { s := se.Sock if !s.TryIncRef() { // Racing with socket destruction, this is ok. continue } sops, ok := s.Impl().(socket.Socket) if !ok { panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { s.DecRef(ctx) // Not tcp4 sockets. continue } // Linux's documentation for the fields below can be found at // https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt. // For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock(). // Note that the header doesn't contain labels for all the fields. // Field: sl; entry number. fmt.Fprintf(buf, "%4d: ", se.ID) // Field: local_adddress. var localAddr linux.SockAddr if t != nil { if local, _, err := sops.GetSockName(t); err == nil { localAddr = local } } writeInetAddr(buf, family, localAddr) // Field: rem_address. var remoteAddr linux.SockAddr if t != nil { if remote, _, err := sops.GetPeerName(t); err == nil { remoteAddr = remote } } writeInetAddr(buf, family, remoteAddr) // Field: state; socket state. fmt.Fprintf(buf, "%02X ", sops.State()) // Field: tx_queue, rx_queue; number of packets in the transmit and // receive queue. Unimplemented. fmt.Fprintf(buf, "%08X:%08X ", 0, 0) // Field: tr, tm->when; timer active state and number of jiffies // until timer expires. Unimplemented. fmt.Fprintf(buf, "%02X:%08X ", 0, 0) // Field: retrnsmt; number of unrecovered RTO timeouts. // Unimplemented. fmt.Fprintf(buf, "%08X ", 0) stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) // Field: uid. if statErr != nil || stat.Mask&linux.STATX_UID == 0 { log.Warningf("Failed to retrieve uid for socket file: %v", statErr) fmt.Fprintf(buf, "%5d ", 0) } else { creds := auth.CredentialsFromContext(ctx) fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) } // Field: timeout; number of unanswered 0-window probes. // Unimplemented. fmt.Fprintf(buf, "%8d ", 0) // Field: inode. if statErr != nil || stat.Mask&linux.STATX_INO == 0 { log.Warningf("Failed to retrieve inode for socket file: %v", statErr) fmt.Fprintf(buf, "%8d ", 0) } else { fmt.Fprintf(buf, "%8d ", stat.Ino) } // Field: refcount. Don't count the ref we obtain while dereferencing // the weakref to this socket. fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) // Field: retransmit timeout. Unimplemented. fmt.Fprintf(buf, "%d ", 0) // Field: predicted tick of soft clock (delayed ACK control data). // Unimplemented. 
fmt.Fprintf(buf, "%d ", 0) // Field: (ack.quick<<1)|ack.pingpong, Unimplemented. fmt.Fprintf(buf, "%d ", 0) // Field: sending congestion window, Unimplemented. fmt.Fprintf(buf, "%d ", 0) // Field: Slow start size threshold, -1 if threshold >= 0xFFFF. // Unimplemented, report as large threshold. fmt.Fprintf(buf, "%d", -1) fmt.Fprintf(buf, "\n") s.DecRef(ctx) } return nil } // netTCPData implements vfs.DynamicBytesSource for /proc/net/tcp. // // +stateify savable type netTCPData struct { kernfs.DynamicBytesFile kernel *kernel.Kernel } var _ dynamicInode = (*netTCPData)(nil) func (d *netTCPData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET) } // netTCP6Data implements vfs.DynamicBytesSource for /proc/net/tcp6. // // +stateify savable type netTCP6Data struct { kernfs.DynamicBytesFile kernel *kernel.Kernel } var _ dynamicInode = (*netTCP6Data)(nil) func (d *netTCP6Data) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n") return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET6) } // netUDPData implements vfs.DynamicBytesSource for /proc/net/udp. // // +stateify savable type netUDPData struct { kernfs.DynamicBytesFile kernel *kernel.Kernel } var _ dynamicInode = (*netUDPData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // t may be nil here if our caller is not part of a task goroutine. This can // happen for example if we're here for "sentryctl cat". When t is nil, // degrade gracefully and retrieve what we can. t := kernel.TaskFromContext(ctx) buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops \n") for _, se := range d.kernel.ListSockets() { s := se.Sock if !s.TryIncRef() { // Racing with socket destruction, this is ok. continue } sops, ok := s.Impl().(socket.Socket) if !ok { panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { s.DecRef(ctx) // Not udp4 socket. continue } // For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock(). // Field: sl; entry number. fmt.Fprintf(buf, "%5d: ", se.ID) // Field: local_adddress. var localAddr linux.SockAddrInet if t != nil { if local, _, err := sops.GetSockName(t); err == nil { localAddr = *local.(*linux.SockAddrInet) } } writeInetAddr(buf, linux.AF_INET, &localAddr) // Field: rem_address. var remoteAddr linux.SockAddrInet if t != nil { if remote, _, err := sops.GetPeerName(t); err == nil { remoteAddr = *remote.(*linux.SockAddrInet) } } writeInetAddr(buf, linux.AF_INET, &remoteAddr) // Field: state; socket state. fmt.Fprintf(buf, "%02X ", sops.State()) // Field: tx_queue, rx_queue; number of packets in the transmit and // receive queue. Unimplemented. fmt.Fprintf(buf, "%08X:%08X ", 0, 0) // Field: tr, tm->when. Always 0 for UDP. fmt.Fprintf(buf, "%02X:%08X ", 0, 0) // Field: retrnsmt. Always 0 for UDP. fmt.Fprintf(buf, "%08X ", 0) stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) // Field: uid. 
if statErr != nil || stat.Mask&linux.STATX_UID == 0 { log.Warningf("Failed to retrieve uid for socket file: %v", statErr) fmt.Fprintf(buf, "%5d ", 0) } else { creds := auth.CredentialsFromContext(ctx) fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) } // Field: timeout. Always 0 for UDP. fmt.Fprintf(buf, "%8d ", 0) // Field: inode. if statErr != nil || stat.Mask&linux.STATX_INO == 0 { log.Warningf("Failed to retrieve inode for socket file: %v", statErr) fmt.Fprintf(buf, "%8d ", 0) } else { fmt.Fprintf(buf, "%8d ", stat.Ino) } // Field: ref; reference count on the socket inode. Don't count the ref // we obtain while dereferencing the weakref to this socket. fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) // Field: drops; number of dropped packets. Unimplemented. fmt.Fprintf(buf, "%d", 0) fmt.Fprintf(buf, "\n") s.DecRef(ctx) } return nil } // netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp. // // +stateify savable type netSnmpData struct { kernfs.DynamicBytesFile stack inet.Stack } var _ dynamicInode = (*netSnmpData)(nil) // +stateify savable type snmpLine struct { prefix string header string } var snmp = []snmpLine{ { prefix: "Ip", header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates", }, { prefix: "Icmp", header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps", }, { prefix: "IcmpMsg", }, { prefix: "Tcp", header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors", }, { prefix: "Udp", header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", }, { prefix: "UdpLite", header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", }, } func toSlice(a any) []uint64 { v := reflect.Indirect(reflect.ValueOf(a)) return v.Slice(0, v.Len()).Interface().([]uint64) } func sprintSlice(s []uint64) string { if len(s) == 0 { return "" } r := fmt.Sprint(s) return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice. } // Generate implements vfs.DynamicBytesSource.Generate. func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { types := []any{ &inet.StatSNMPIP{}, &inet.StatSNMPICMP{}, nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats. 
&inet.StatSNMPTCP{}, &inet.StatSNMPUDP{}, &inet.StatSNMPUDPLite{}, } for i, stat := range types { line := snmp[i] if stat == nil { fmt.Fprintf(buf, "%s:\n", line.prefix) fmt.Fprintf(buf, "%s:\n", line.prefix) continue } if err := d.stack.Statistics(stat, line.prefix); err != nil { if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) } else { log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) } } fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header) if line.prefix == "Tcp" { tcp := stat.(*inet.StatSNMPTCP) // "Tcp" needs special processing because MaxConn is signed. RFC 2012. fmt.Fprintf(buf, "%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) } else { fmt.Fprintf(buf, "%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) } } return nil } // netRouteData implements vfs.DynamicBytesSource for /proc/net/route. // // +stateify savable type netRouteData struct { kernfs.DynamicBytesFile stack inet.Stack } var _ dynamicInode = (*netRouteData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT") interfaces := d.stack.Interfaces() for _, rt := range d.stack.RouteTable() { // /proc/net/route only includes ipv4 routes. if rt.Family != linux.AF_INET { continue } // /proc/net/route does not include broadcast or multicast routes. if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST { continue } iface, ok := interfaces[rt.OutputInterface] if !ok || iface.Name == "lo" { continue } var ( gw uint32 prefix uint32 flags = linux.RTF_UP ) if len(rt.GatewayAddr) == header.IPv4AddressSize { flags |= linux.RTF_GATEWAY gw = hostarch.ByteOrder.Uint32(rt.GatewayAddr) } if len(rt.DstAddr) == header.IPv4AddressSize { prefix = hostarch.ByteOrder.Uint32(rt.DstAddr) } l := fmt.Sprintf( "%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d", iface.Name, prefix, gw, flags, 0, // RefCnt. 0, // Use. 0, // Metric. (uint32(1)<= maxTaskID { return offset, nil } // According to Linux (fs/proc/base.c:proc_pid_readdir()), process directories // start at offset FIRST_PROCESS_ENTRY with '/proc/self', followed by // '/proc/thread-self' and then '/proc/[pid]'. if offset < FIRST_PROCESS_ENTRY { offset = FIRST_PROCESS_ENTRY } if offset == FIRST_PROCESS_ENTRY { dirent := vfs.Dirent{ Name: selfName, Type: linux.DT_LNK, Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { return offset, err } offset++ } if offset == FIRST_PROCESS_ENTRY+1 { dirent := vfs.Dirent{ Name: threadSelfName, Type: linux.DT_LNK, Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { return offset, err } offset++ } // Collect all tasks that TGIDs are greater than the offset specified. Per // Linux we only include in directory listings if it's the leader. But for // whatever crazy reason, you can still walk to the given node. 
var tids []int startTid := offset - FIRST_PROCESS_ENTRY - 2 for _, tg := range i.pidns.ThreadGroups() { tid := i.pidns.IDOfThreadGroup(tg) if int64(tid) < startTid { continue } if leader := tg.Leader(); leader != nil { tids = append(tids, int(tid)) } } if len(tids) == 0 { return offset, nil } sort.Ints(tids) for _, tid := range tids { dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(tid), 10), Type: linux.DT_DIR, Ino: i.fs.NextIno(), NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1, } if err := cb.Handle(dirent); err != nil { return offset, err } offset++ } return maxTaskID, nil } // Open implements kernfs.Inode.Open. func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts) if err != nil { return linux.Statx{}, err } if opts.Mask&linux.STATX_NLINK != 0 { // Add dynamic children to link count. for _, tg := range i.pidns.ThreadGroups() { if leader := tg.Leader(); leader != nil { stat.Nlink++ } } } return stat, nil } // DecRef implements kernfs.Inode.DecRef. func (i *tasksInode) DecRef(ctx context.Context) { i.tasksInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // staticFileSetStat implements a special static file that allows inode // attributes to be set. This is to support /proc files that are readonly, but // allow attributes to be set. // // +stateify savable type staticFileSetStat struct { dynamicBytesFileSetAttr vfs.StaticData } var _ dynamicInode = (*staticFileSetStat)(nil) func newStaticFileSetStat(data string) *staticFileSetStat { return &staticFileSetStat{StaticData: vfs.StaticData{Data: data}} } func cpuInfoData(k *kernel.Kernel) string { features := k.FeatureSet() var buf bytes.Buffer for i, max := uint(0), k.ApplicationCores(); i < max; i++ { features.WriteCPUInfoTo(i, max, &buf) } return buf.String() } func ipcData(v uint64) dynamicInode { return newStaticFile(strconv.FormatUint(v, 10)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/tasks_files.go000066400000000000000000000342631465435605700260530ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package proc import ( "bytes" "fmt" "runtime" "strconv" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // +stateify savable type selfSymlink struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeSymlink kernfs.InodeWatches pidns *kernel.PIDNamespace } var _ kernfs.Inode = (*selfSymlink)(nil) func (i *tasksInode) newSelfSymlink(ctx context.Context, creds *auth.Credentials) kernfs.Inode { inode := &selfSymlink{pidns: i.pidns} inode.Init(ctx, creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777) return inode } func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? return "", linuxerr.EINVAL } tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) if tgid == 0 { return "", linuxerr.ENOENT } return strconv.FormatUint(uint64(tgid), 10), nil } func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { target, err := s.Readlink(ctx, mnt) return vfs.VirtualDentry{}, target, err } // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // +stateify savable type threadSelfSymlink struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeSymlink kernfs.InodeWatches pidns *kernel.PIDNamespace } var _ kernfs.Inode = (*threadSelfSymlink)(nil) func (i *tasksInode) newThreadSelfSymlink(ctx context.Context, creds *auth.Credentials) kernfs.Inode { inode := &threadSelfSymlink{pidns: i.pidns} inode.Init(ctx, creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777) return inode } func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? return "", linuxerr.EINVAL } tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) tid := s.pidns.IDOfTask(t) if tid == 0 || tgid == 0 { return "", linuxerr.ENOENT } return fmt.Sprintf("%d/task/%d", tgid, tid), nil } func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { target, err := s.Readlink(ctx, mnt) return vfs.VirtualDentry{}, target, err } // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // dynamicBytesFileSetAttr implements a special file that allows inode // attributes to be set. This is to support /proc files that are readonly, but // allow attributes to be set. // // +stateify savable type dynamicBytesFileSetAttr struct { kernfs.DynamicBytesFile } // SetStat implements kernfs.Inode.SetStat. func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts) } // cpuStats contains the breakdown of CPU time for /proc/stat. 
// // +stateify savable type cpuStats struct { // user is time spent in userspace tasks with non-positive niceness. user uint64 // nice is time spent in userspace tasks with positive niceness. nice uint64 // system is time spent in non-interrupt kernel context. system uint64 // idle is time spent idle. idle uint64 // ioWait is time spent waiting for IO. ioWait uint64 // irq is time spent in interrupt context. irq uint64 // softirq is time spent in software interrupt context. softirq uint64 // steal is involuntary wait time. steal uint64 // guest is time spent in guests with non-positive niceness. guest uint64 // guestNice is time spent in guests with positive niceness. guestNice uint64 } // String implements fmt.Stringer. func (c cpuStats) String() string { return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice) } // statData implements vfs.DynamicBytesSource for /proc/stat. // // +stateify savable type statData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*statData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*statData) Generate(ctx context.Context, buf *bytes.Buffer) error { // TODO(b/37226836): We currently export only zero CPU stats. We could // at least provide some aggregate stats. var cpu cpuStats fmt.Fprintf(buf, "cpu %s\n", cpu) k := kernel.KernelFromContext(ctx) for c, max := uint(0), k.ApplicationCores(); c < max; c++ { fmt.Fprintf(buf, "cpu%d %s\n", c, cpu) } // The total number of interrupts is dependent on the CPUs and PCI // devices on the system. See arch_probe_nr_irqs. // // Since we don't report real interrupt stats, just choose an arbitrary // value from a representative VM. const numInterrupts = 256 // The Kernel doesn't handle real interrupts, so report all zeroes. // TODO(b/37226836): We could count page faults as #PF. fmt.Fprintf(buf, "intr 0") // total for i := 0; i < numInterrupts; i++ { fmt.Fprintf(buf, " 0") } fmt.Fprintf(buf, "\n") // Total number of context switches. // TODO(b/37226836): Count this. fmt.Fprintf(buf, "ctxt 0\n") // CLOCK_REALTIME timestamp from boot, in seconds. fmt.Fprintf(buf, "btime %d\n", k.Timekeeper().BootTime().Seconds()) // Total number of clones. // TODO(b/37226836): Count this. fmt.Fprintf(buf, "processes 0\n") // Number of runnable tasks. // TODO(b/37226836): Count this. fmt.Fprintf(buf, "procs_running 0\n") // Number of tasks waiting on IO. // TODO(b/37226836): Count this. fmt.Fprintf(buf, "procs_blocked 0\n") // Number of each softirq handled. fmt.Fprintf(buf, "softirq 0") // total for i := 0; i < linux.NumSoftIRQ; i++ { fmt.Fprintf(buf, " 0") } fmt.Fprintf(buf, "\n") return nil } // loadavgData backs /proc/loadavg. // // +stateify savable type loadavgData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*loadavgData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { // TODO(b/62345059): Include real data in fields. // Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods. // Column 4-5: currently running processes and the total number of processes. // Column 6: the last process ID used. fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0) return nil } // meminfoData implements vfs.DynamicBytesSource for /proc/meminfo. 
// // +stateify savable type meminfoData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*meminfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { mf := kernel.KernelFromContext(ctx).MemoryFile() _ = mf.UpdateUsage(nil) // Best effort snapshot, totalUsage := usage.MemoryAccounting.Copy() totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) anon := snapshot.Anonymous + snapshot.Tmpfs file := snapshot.PageCache + snapshot.Mapped // We don't actually have active/inactive LRUs, so just make up numbers. activeFile := (file / 2) &^ (hostarch.PageSize - 1) inactiveFile := file - activeFile fmt.Fprintf(buf, "MemTotal: %8d kB\n", totalSize/1024) memFree := totalSize - totalUsage if memFree > totalSize { // Underflow. memFree = 0 } // We use MemFree as MemAvailable because we don't swap. // TODO(rahat): When reclaim is implemented the value of MemAvailable // should change. fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree/1024) fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree/1024) fmt.Fprintf(buf, "Buffers: 0 kB\n") // memory usage by block devices fmt.Fprintf(buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) // Emulate a system with no swap, which disables inactivation of anon pages. fmt.Fprintf(buf, "SwapCache: 0 kB\n") fmt.Fprintf(buf, "Active: %8d kB\n", (anon+activeFile)/1024) fmt.Fprintf(buf, "Inactive: %8d kB\n", inactiveFile/1024) fmt.Fprintf(buf, "Active(anon): %8d kB\n", anon/1024) fmt.Fprintf(buf, "Inactive(anon): 0 kB\n") fmt.Fprintf(buf, "Active(file): %8d kB\n", activeFile/1024) fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024) fmt.Fprintf(buf, "Unevictable: 0 kB\n") // TODO(b/31823263) fmt.Fprintf(buf, "Mlocked: 0 kB\n") // TODO(b/31823263) fmt.Fprintf(buf, "SwapTotal: 0 kB\n") fmt.Fprintf(buf, "SwapFree: 0 kB\n") fmt.Fprintf(buf, "Dirty: 0 kB\n") fmt.Fprintf(buf, "Writeback: 0 kB\n") fmt.Fprintf(buf, "AnonPages: %8d kB\n", anon/1024) fmt.Fprintf(buf, "Mapped: %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know fmt.Fprintf(buf, "Shmem: %8d kB\n", snapshot.Tmpfs/1024) return nil } // uptimeData implements vfs.DynamicBytesSource for /proc/uptime. // // +stateify savable type uptimeData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*uptimeData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error { k := kernel.KernelFromContext(ctx) now := time.NowFromContext(ctx) // Pretend that we've spent zero time sleeping (second number). fmt.Fprintf(buf, "%.2f 0.00\n", now.Sub(k.Timekeeper().BootTime()).Seconds()) return nil } // versionData implements vfs.DynamicBytesSource for /proc/version. // // +stateify savable type versionData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*versionData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { // /proc/version takes the form: // // "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST) // (COMPILER_VERSION) VERSION" // // where: // - SYSNAME, RELEASE, and VERSION are the same as returned by // sys_utsname // - COMPILE_USER is the user that build the kernel // - COMPILE_HOST is the hostname of the machine on which the kernel // was built // - COMPILER_VERSION is the version reported by the building compiler // // Since we don't really want to expose build information to // applications, those fields are omitted. 
// // FIXME(mpratt): Using Version from the init task SyscallTable // disregards the different version a task may have (e.g., in a uts // namespace). ver := kernelVersion(ctx) fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version) return nil } // filesystemsData backs /proc/filesystems. // // +stateify savable type filesystemsData struct { kernfs.DynamicBytesFile } var _ dynamicInode = (*filesystemsData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *filesystemsData) Generate(ctx context.Context, buf *bytes.Buffer) error { k := kernel.KernelFromContext(ctx) k.VFS().GenerateProcFilesystems(buf) return nil } // cgroupsData backs /proc/cgroups. // // +stateify savable type cgroupsData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*cgroupsData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*cgroupsData) Generate(ctx context.Context, buf *bytes.Buffer) error { r := kernel.KernelFromContext(ctx).CgroupRegistry() r.GenerateProcCgroups(buf) return nil } // cmdLineData backs /proc/cmdline. // // +stateify savable type cmdLineData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*cmdLineData)(nil) // Generate implements vfs.DynamicByteSource.Generate. func (*cmdLineData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "BOOT_IMAGE=/vmlinuz-%s-gvisor quiet\n", kernelVersion(ctx).Release) return nil } // kernelVersion returns the kernel version. func kernelVersion(ctx context.Context) kernel.Version { k := kernel.KernelFromContext(ctx) init := k.GlobalInit() if init == nil { // Attempted to read before the init Task is created. This can // only occur during startup, which should never need to read // this file. panic("Attempted to read version before initial Task is available") } return init.Leader().SyscallTable().Version } // sentryMeminfoData implements vfs.DynamicBytesSource for /proc/sentry-meminfo. // // +stateify savable type sentryMeminfoData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*sentryMeminfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*sentryMeminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { var sentryMeminfo runtime.MemStats runtime.ReadMemStats(&sentryMeminfo) fmt.Fprintf(buf, "Alloc: %8d kB\n", sentryMeminfo.Alloc/1024) fmt.Fprintf(buf, "TotalAlloc: %8d kB\n", sentryMeminfo.TotalAlloc/1024) fmt.Fprintf(buf, "Sys: %8d kB\n", sentryMeminfo.Sys/1024) fmt.Fprintf(buf, "Mallocs: %8d\n", sentryMeminfo.Mallocs) fmt.Fprintf(buf, "Frees: %8d\n", sentryMeminfo.Frees) fmt.Fprintf(buf, "Live Objects: %8d\n", sentryMeminfo.Mallocs-sentryMeminfo.Frees) fmt.Fprintf(buf, "HeapAlloc: %8d kB\n", sentryMeminfo.HeapAlloc/1024) fmt.Fprintf(buf, "HeapSys: %8d kB\n", sentryMeminfo.HeapSys/1024) fmt.Fprintf(buf, "HeapObjects: %8d\n", sentryMeminfo.HeapObjects) return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/tasks_inode_refs.go000066400000000000000000000102171465435605700270570ustar00rootroot00000000000000package proc import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const tasksInodeenableLogging = false // obj is used to customize logging. 
Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var tasksInodeobj *tasksInode // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type tasksInodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *tasksInodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *tasksInodeRefs) RefType() string { return fmt.Sprintf("%T", tasksInodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *tasksInodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *tasksInodeRefs) LogRefs() bool { return tasksInodeenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *tasksInodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *tasksInodeRefs) IncRef() { v := r.refCount.Add(1) if tasksInodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *tasksInodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if tasksInodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. 
In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *tasksInodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if tasksInodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *tasksInodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/tasks_sys.go000066400000000000000000000437031465435605700255660ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "bytes" "fmt" "io" "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/usermem" ) // +stateify savable type tcpMemDir int const ( tcpRMem tcpMemDir = iota tcpWMem ) // newSysDir returns the dentry corresponding to /proc/sys directory. 
func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode { return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "cap_last_cap": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\n", linux.CAP_LAST_CAP))), "hostname": fs.newInode(ctx, root, 0444, &hostnameData{}), "overflowgid": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\n", auth.OverflowGID))), "overflowuid": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\n", auth.OverflowUID))), "random": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "boot_id": fs.newInode(ctx, root, 0444, newStaticFile(randUUID())), }), "sem": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))), "shmall": fs.newInode(ctx, root, 0444, ipcData(linux.SHMALL)), "shmmax": fs.newInode(ctx, root, 0444, ipcData(linux.SHMMAX)), "shmmni": fs.newInode(ctx, root, 0444, ipcData(linux.SHMMNI)), "msgmni": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNI)), "msgmax": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMAX)), "msgmnb": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNB)), "yama": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "ptrace_scope": fs.newYAMAPtraceScopeFile(ctx, k, root), }), }), "fs": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "nr_open": fs.newInode(ctx, root, 0644, &atomicInt32File{val: &k.MaxFDLimit, min: 8, max: kernel.MaxFdLimit}), }), "vm": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "max_map_count": fs.newInode(ctx, root, 0444, newStaticFile("2147483647\n")), "mmap_min_addr": fs.newInode(ctx, root, 0444, &mmapMinAddrData{k: k}), "overcommit_memory": fs.newInode(ctx, root, 0444, newStaticFile("0\n")), }), "net": fs.newSysNetDir(ctx, root, k), }) } // newSysNetDir returns the dentry corresponding to /proc/sys/net directory. func (fs *filesystem) newSysNetDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode { var contents map[string]kernfs.Inode // TODO(gvisor.dev/issue/1833): Support for using the network stack in the // network namespace of the calling process. if stack := k.RootNetworkNamespace().Stack(); stack != nil { contents = map[string]kernfs.Inode{ "ipv4": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "ip_forward": fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}), "ip_local_port_range": fs.newInode(ctx, root, 0644, &portRange{stack: stack}), "tcp_recovery": fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}), "tcp_rmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}), "tcp_sack": fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}), "tcp_wmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}), // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the // value closest to the actual netstack behavior or any empty file, all // of these files will have mode 0444 (read-only for all users). "ip_local_reserved_ports": fs.newInode(ctx, root, 0444, newStaticFile("")), "ipfrag_time": fs.newInode(ctx, root, 0444, newStaticFile("30")), "ip_nonlocal_bind": fs.newInode(ctx, root, 0444, newStaticFile("0")), "ip_no_pmtu_disc": fs.newInode(ctx, root, 0444, newStaticFile("1")), // tcp_allowed_congestion_control tell the user what they are able to // do as an unprivledged process so we leave it empty. 
"tcp_allowed_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("")), "tcp_available_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")), "tcp_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")), // Many of the following stub files are features netstack doesn't // support. The unsupported features return "0" to indicate they are // disabled. "tcp_base_mss": fs.newInode(ctx, root, 0444, newStaticFile("1280")), "tcp_dsack": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_early_retrans": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_fack": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_fastopen": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_fastopen_key": fs.newInode(ctx, root, 0444, newStaticFile("")), "tcp_invalid_ratelimit": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_keepalive_intvl": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_keepalive_probes": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_keepalive_time": fs.newInode(ctx, root, 0444, newStaticFile("7200")), "tcp_mtu_probing": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_no_metrics_save": fs.newInode(ctx, root, 0444, newStaticFile("1")), "tcp_probe_interval": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_probe_threshold": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_retries1": fs.newInode(ctx, root, 0444, newStaticFile("3")), "tcp_retries2": fs.newInode(ctx, root, 0444, newStaticFile("15")), "tcp_rfc1337": fs.newInode(ctx, root, 0444, newStaticFile("1")), "tcp_slow_start_after_idle": fs.newInode(ctx, root, 0444, newStaticFile("1")), "tcp_synack_retries": fs.newInode(ctx, root, 0444, newStaticFile("5")), "tcp_syn_retries": fs.newInode(ctx, root, 0444, newStaticFile("3")), "tcp_timestamps": fs.newInode(ctx, root, 0444, newStaticFile("1")), }), "core": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "default_qdisc": fs.newInode(ctx, root, 0444, newStaticFile("pfifo_fast")), "message_burst": fs.newInode(ctx, root, 0444, newStaticFile("10")), "message_cost": fs.newInode(ctx, root, 0444, newStaticFile("5")), "optmem_max": fs.newInode(ctx, root, 0444, newStaticFile("0")), "rmem_default": fs.newInode(ctx, root, 0444, newStaticFile("212992")), "rmem_max": fs.newInode(ctx, root, 0444, newStaticFile("212992")), "somaxconn": fs.newInode(ctx, root, 0444, newStaticFile("128")), "wmem_default": fs.newInode(ctx, root, 0444, newStaticFile("212992")), "wmem_max": fs.newInode(ctx, root, 0444, newStaticFile("212992")), }), } } return fs.newStaticDir(ctx, root, contents) } // mmapMinAddrData implements vfs.DynamicBytesSource for // /proc/sys/vm/mmap_min_addr. // // +stateify savable type mmapMinAddrData struct { kernfs.DynamicBytesFile k *kernel.Kernel } var _ dynamicInode = (*mmapMinAddrData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress()) return nil } // hostnameData implements vfs.DynamicBytesSource for /proc/sys/kernel/hostname. // // +stateify savable type hostnameData struct { kernfs.DynamicBytesFile } var _ dynamicInode = (*hostnameData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. 
func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error { utsns := kernel.UTSNamespaceFromContext(ctx) defer utsns.DecRef(ctx) buf.WriteString(utsns.HostName()) buf.WriteString("\n") return nil } // tcpSackData implements vfs.WritableDynamicBytesSource for // /proc/sys/net/tcp_sack. // // +stateify savable type tcpSackData struct { kernfs.DynamicBytesFile stack inet.Stack `state:"wait"` enabled *bool } var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { if d.enabled == nil { sack, err := d.stack.TCPSACKEnabled() if err != nil { return err } d.enabled = &sack } val := "0\n" if *d.enabled { // Technically, this is not quite compatible with Linux. Linux stores these // as an integer, so if you write "2" into tcp_sack, you should get 2 back. // Tough luck. val = "1\n" } _, err := buf.WriteString(val) return err } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *tcpSackData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. return 0, linuxerr.EINVAL } buf := make([]int32, 1) n, err := ParseInt32Vec(ctx, src, buf) if err != nil || n == 0 { return 0, err } if d.enabled == nil { d.enabled = new(bool) } *d.enabled = buf[0] != 0 return n, d.stack.SetTCPSACKEnabled(*d.enabled) } // tcpRecoveryData implements vfs.WritableDynamicBytesSource for // /proc/sys/net/ipv4/tcp_recovery. // // +stateify savable type tcpRecoveryData struct { kernfs.DynamicBytesFile stack inet.Stack `state:"wait"` } var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error { recovery, err := d.stack.TCPRecovery() if err != nil { return err } _, err = buf.WriteString(fmt.Sprintf("%d\n", recovery)) return err } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *tcpRecoveryData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. return 0, linuxerr.EINVAL } buf := make([]int32, 1) n, err := ParseInt32Vec(ctx, src, buf) if err != nil || n == 0 { return 0, err } if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(buf[0])); err != nil { return 0, err } return n, nil } // tcpMemData implements vfs.WritableDynamicBytesSource for // /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem. // // +stateify savable type tcpMemData struct { kernfs.DynamicBytesFile dir tcpMemDir stack inet.Stack `state:"wait"` // mu protects against concurrent reads/writes to FDs based on the dentry // backing this byte source. mu sync.Mutex `state:"nosave"` } var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error { d.mu.Lock() defer d.mu.Unlock() size, err := d.readSizeLocked() if err != nil { return err } _, err = buf.WriteString(fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max)) return err } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *tcpMemData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. 
return 0, linuxerr.EINVAL } d.mu.Lock() defer d.mu.Unlock() size, err := d.readSizeLocked() if err != nil { return 0, err } buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)} n, err := ParseInt32Vec(ctx, src, buf) if err != nil || n == 0 { return 0, err } newSize := inet.TCPBufferSize{ Min: int(buf[0]), Default: int(buf[1]), Max: int(buf[2]), } if err := d.writeSizeLocked(newSize); err != nil { return 0, err } return n, nil } // Precondition: d.mu must be locked. func (d *tcpMemData) readSizeLocked() (inet.TCPBufferSize, error) { switch d.dir { case tcpRMem: return d.stack.TCPReceiveBufferSize() case tcpWMem: return d.stack.TCPSendBufferSize() default: panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) } } // Precondition: d.mu must be locked. func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error { switch d.dir { case tcpRMem: return d.stack.SetTCPReceiveBufferSize(size) case tcpWMem: return d.stack.SetTCPSendBufferSize(size) default: panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) } } // ipForwarding implements vfs.WritableDynamicBytesSource for // /proc/sys/net/ipv4/ip_forward. // // +stateify savable type ipForwarding struct { kernfs.DynamicBytesFile stack inet.Stack `state:"wait"` enabled bool } var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error { val := "0\n" if ipf.enabled { // Technically, this is not quite compatible with Linux. Linux stores these // as an integer, so if you write "2" into tcp_sack, you should get 2 back. // Tough luck. val = "1\n" } buf.WriteString(val) return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (ipf *ipForwarding) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. return 0, linuxerr.EINVAL } buf := make([]int32, 1) n, err := ParseInt32Vec(ctx, src, buf) if err != nil || n == 0 { return 0, err } ipf.enabled = buf[0] != 0 if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, ipf.enabled); err != nil { return 0, err } return n, nil } // portRange implements vfs.WritableDynamicBytesSource for // /proc/sys/net/ipv4/ip_local_port_range. // // +stateify savable type portRange struct { kernfs.DynamicBytesFile stack inet.Stack `state:"wait"` // start and end store the port range. We must save/restore this here, // since a netstack instance is created on restore. start *uint16 end *uint16 } var _ vfs.WritableDynamicBytesSource = (*portRange)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (pr *portRange) Generate(ctx context.Context, buf *bytes.Buffer) error { if pr.start == nil { start, end := pr.stack.PortRange() pr.start = &start pr.end = &end } _, err := fmt.Fprintf(buf, "%d %d\n", *pr.start, *pr.end) return err } // Write implements vfs.WritableDynamicBytesSource.Write. func (pr *portRange) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. return 0, linuxerr.EINVAL } ports := make([]int32, 2) n, err := ParseInt32Vec(ctx, src, ports) if err != nil || n == 0 { return 0, err } // Port numbers must be uint16s. 
if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 { return 0, linuxerr.EINVAL } if err := pr.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil { return 0, err } if pr.start == nil { pr.start = new(uint16) pr.end = new(uint16) } *pr.start = uint16(ports[0]) *pr.end = uint16(ports[1]) return n, nil } // atomicInt32File implements vfs.WritableDynamicBytesSource sysctls // represented by int32 atomic objects. // // +stateify savable type atomicInt32File struct { kernfs.DynamicBytesFile val *atomicbitops.Int32 min, max int32 } var _ vfs.WritableDynamicBytesSource = (*atomicInt32File)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (f *atomicInt32File) Generate(ctx context.Context, buf *bytes.Buffer) error { _, err := fmt.Fprintf(buf, "%d\n", f.val.Load()) return err } // Write implements vfs.WritableDynamicBytesSource.Write. func (f *atomicInt32File) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // Ignore partial writes. return 0, linuxerr.EINVAL } buf := make([]int32, 1) n, err := ParseInt32Vec(ctx, src, buf) if err != nil || n == 0 { return 0, err } if buf[0] < f.min || buf[0] > f.max { return 0, linuxerr.EINVAL } f.val.Store(buf[0]) return n, nil } // randUUID returns a string containing a randomly-generated UUID followed by a // newline. func randUUID() string { var uuid [16]byte if _, err := io.ReadFull(rand.Reader, uuid[:]); err != nil { panic(fmt.Sprintf("failed to read random bytes for UUID: %v", err)) } uuid[8] = (uuid[8] & 0x3f) | 0x80 // RFC 4122 UUID uuid[6] = (uuid[6] & 0x0f) | 0x40 // Version 4 (random) return fmt.Sprintf("%x-%x-%x-%x-%x\n", uuid[:4], uuid[4:6], uuid[6:8], uuid[8:10], uuid[10:]) } // ParseInt32Vec interprets src as string encoding slice of int32, and // returns the parsed value and the number of bytes read. // // The numbers of int32 will be populated even if an error is returned eventually. func ParseInt32Vec(ctx context.Context, src usermem.IOSequence, buf []int32) (int64, error) { if src.NumBytes() == 0 { return 0, nil } // Limit input size so as not to impact performance if input size is large. src = src.TakeFirst(hostarch.PageSize - 1) return usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/proc/yama.go000066400000000000000000000044641465435605700244730ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package proc import ( "bytes" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) func (fs *filesystem) newYAMAPtraceScopeFile(ctx context.Context, k *kernel.Kernel, creds *auth.Credentials) kernfs.Inode { s := &yamaPtraceScope{level: &k.YAMAPtraceScope} s.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), s, 0644) return s } // yamaPtraceScope implements vfs.WritableDynamicBytesSource for // /sys/kernel/yama/ptrace_scope. // // +stateify savable type yamaPtraceScope struct { kernfs.DynamicBytesFile // level is the ptrace_scope level. level *atomicbitops.Int32 } var _ vfs.WritableDynamicBytesSource = (*yamaPtraceScope)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (s *yamaPtraceScope) Generate(ctx context.Context, buf *bytes.Buffer) error { _, err := fmt.Fprintf(buf, "%d\n", s.level.Load()) return err } // Write implements vfs.WritableDynamicBytesSource.Write. func (s *yamaPtraceScope) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // Ignore partial writes. return 0, linuxerr.EINVAL } buf := make([]int32, 1) n, err := ParseInt32Vec(ctx, src, buf) if err != nil || n == 0 { return 0, err } // We do not support YAMA levels > YAMA_SCOPE_RELATIONAL. if buf[0] < linux.YAMA_SCOPE_DISABLED || buf[0] > linux.YAMA_SCOPE_RELATIONAL { return 0, linuxerr.EINVAL } s.level.Store(buf[0]) return n, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/signalfd/000077500000000000000000000000001465435605700240315ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/signalfd/signalfd.go000066400000000000000000000125301465435605700261500ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package signalfd provides basic signalfd file implementations. package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // SignalFileDescription implements vfs.FileDescriptionImpl for signal fds. // // +stateify savable type SignalFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD vfs.NoAsyncEventFD // target is the original signal target task. // // The semantics here are a bit broken. Linux will always use current // for all reads, regardless of where the signalfd originated. We can't // do exactly that because we need to plumb the context through // EventRegister in order to support proper blocking behavior. 
This // will undoubtedly become very complicated quickly. target *kernel.Task // queue is the queue for listeners. queue waiter.Queue // mu protects entry. mu sync.Mutex `state:"nosave"` // entry is the entry in the task signal queue. entry waiter.Entry } var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil) // New creates a new signal fd. func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[signalfd]") defer vd.DecRef(target) sfd := &SignalFileDescription{ target: target, } sfd.entry.Init(sfd, waiter.EventMask(mask)) sfd.target.SignalRegister(&sfd.entry) if err := sfd.vfsfd.Init(sfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ UseDentryMetadata: true, DenyPRead: true, DenyPWrite: true, }); err != nil { sfd.target.SignalUnregister(&sfd.entry) return nil, err } return &sfd.vfsfd, nil } // Mask returns the signal mask. func (sfd *SignalFileDescription) Mask() linux.SignalSet { sfd.mu.Lock() defer sfd.mu.Unlock() return linux.SignalSet(sfd.entry.Mask()) } // SetMask sets the signal mask. func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) { sfd.mu.Lock() defer sfd.mu.Unlock() sfd.target.SignalUnregister(&sfd.entry) sfd.entry.Init(sfd, waiter.EventMask(mask)) sfd.target.SignalRegister(&sfd.entry) } // Read implements vfs.FileDescriptionImpl.Read. func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { // Attempt to dequeue relevant signals. info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0) if err != nil { // There must be no signal available. return 0, linuxerr.ErrWouldBlock } // Copy out the signal info using the specified format. infoNative := linux.SignalfdSiginfo{ Signo: uint32(info.Signo), Errno: info.Errno, Code: info.Code, PID: uint32(info.PID()), UID: uint32(info.UID()), Status: info.Status(), Overrun: uint32(info.Overrun()), Addr: info.Addr(), } n, err := infoNative.WriteTo(dst.Writer(ctx)) if err == usermem.ErrEndOfIOSequence { // Partial copy-out ok. err = nil } return n, err } // Readiness implements waiter.Waitable.Readiness. func (sfd *SignalFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { sfd.mu.Lock() defer sfd.mu.Unlock() if mask&waiter.ReadableEvents != 0 && sfd.target.PendingSignals()&linux.SignalSet(sfd.entry.Mask()) != 0 { return waiter.ReadableEvents // Pending signals. } return 0 } // EventRegister implements waiter.Waitable.EventRegister. func (sfd *SignalFileDescription) EventRegister(e *waiter.Entry) error { sfd.queue.EventRegister(e) return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (sfd *SignalFileDescription) EventUnregister(e *waiter.Entry) { sfd.queue.EventUnregister(e) } // NotifyEvent implements waiter.EventListener.NotifyEvent. func (sfd *SignalFileDescription) NotifyEvent(mask waiter.EventMask) { sfd.queue.Notify(waiter.EventIn) // Always notify data available. } // Epollable implements FileDescriptionImpl.Epollable. func (sfd *SignalFileDescription) Epollable() bool { return true } // Release implements vfs.FileDescriptionImpl.Release. func (sfd *SignalFileDescription) Release(context.Context) { sfd.target.SignalUnregister(&sfd.entry) } // RegisterFileAsyncHandler implements vfs.FileDescriptionImpl.RegisterFileAsyncHandler. 
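
// exampleSignalInMask is an illustrative sketch added by the editor and is
// not part of upstream gVisor: it shows the sigset encoding that Mask and
// SetMask above operate on, where bit (sig-1) of the linux.SignalSet bitmap
// selects signal number sig. The helper name and its plain int parameter are
// assumptions made for the example only.
func exampleSignalInMask(sfd *SignalFileDescription, sig int) bool {
	// A signalfd only dequeues signals whose bit is set in its current mask.
	return sfd.Mask()&(linux.SignalSet(1)<<uint(sig-1)) != 0
}

// RegisterFileAsyncHandler implements vfs.FileDescriptionImpl.RegisterFileAsyncHandler.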
func (sfd *SignalFileDescription) RegisterFileAsyncHandler(fd *vfs.FileDescription) error { return sfd.NoAsyncEventFD.RegisterFileAsyncHandler(fd) } // UnregisterFileAsyncHandler implements vfs.FileDescriptionImpl.UnregisterFileAsyncHandler. func (sfd *SignalFileDescription) UnregisterFileAsyncHandler(fd *vfs.FileDescription) { sfd.NoAsyncEventFD.UnregisterFileAsyncHandler(fd) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/signalfd/signalfd_state_autogen.go000066400000000000000000000031221465435605700310670ustar00rootroot00000000000000// automatically generated by stateify. package signalfd import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (sfd *SignalFileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/signalfd.SignalFileDescription" } func (sfd *SignalFileDescription) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", "NoAsyncEventFD", "target", "queue", "entry", } } func (sfd *SignalFileDescription) beforeSave() {} // +checklocksignore func (sfd *SignalFileDescription) StateSave(stateSinkObject state.Sink) { sfd.beforeSave() stateSinkObject.Save(0, &sfd.vfsfd) stateSinkObject.Save(1, &sfd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &sfd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &sfd.NoLockFD) stateSinkObject.Save(4, &sfd.NoAsyncEventFD) stateSinkObject.Save(5, &sfd.target) stateSinkObject.Save(6, &sfd.queue) stateSinkObject.Save(7, &sfd.entry) } func (sfd *SignalFileDescription) afterLoad(context.Context) {} // +checklocksignore func (sfd *SignalFileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &sfd.vfsfd) stateSourceObject.Load(1, &sfd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &sfd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &sfd.NoLockFD) stateSourceObject.Load(4, &sfd.NoAsyncEventFD) stateSourceObject.Load(5, &sfd.target) stateSourceObject.Load(6, &sfd.queue) stateSourceObject.Load(7, &sfd.entry) } func init() { state.Register((*SignalFileDescription)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/sockfs/000077500000000000000000000000001465435605700235325ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/sockfs/sockfs.go000066400000000000000000000102151465435605700253500ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package sockfs provides a filesystem implementation for anonymous sockets. package sockfs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // filesystemType implements vfs.FilesystemType. // // +stateify savable type filesystemType struct{} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
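
// exampleSocketDentry is an illustrative sketch added by the editor, not
// upstream gVisor code: socket implementations are expected to obtain their
// anonymous dentries from NewDentry (defined below) on the kernel's global
// sockfs mount, which is what makes them render as "socket:[<ino>]" via
// PrependPath. The helper name and its arguments are assumptions made for
// the example only.
func exampleSocketDentry(ctx context.Context, socketMount *vfs.Mount) *vfs.Dentry {
	// Precondition (same as NewDentry): socketMount must be backed by the
	// filesystem returned from NewFilesystem.
	return NewDentry(ctx, socketMount)
}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.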
func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("sockfs.filesystemType.GetFilesystem should never be called") } // Name implements vfs.FilesystemType.Name. // // Note that registering sockfs is unnecessary, except for the fact that it // will not show up under /proc/filesystems as a result. This is a very minor // discrepancy from Linux. func (filesystemType) Name() string { return "sockfs" } // Release implements vfs.FilesystemType.Release. func (filesystemType) Release(ctx context.Context) {} // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 } // NewFilesystem sets up and returns a new sockfs filesystem. // // Note that there should only ever be one instance of sockfs.Filesystem, // backing a global socket mount. func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, err } fs := &filesystem{ devMinor: devMinor, } fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) return fs.Filesystem.VFSFilesystem(), nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode) b.PrependComponent(fmt.Sprintf("socket:[%d]", inode.InodeAttrs.Ino())) return vfs.PrependPathSyntheticError{} } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return "" } // inode implements kernfs.Inode. // // +stateify savable type inode struct { kernfs.InodeAnonymous kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches } // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, linuxerr.ENXIO } // StatFS implements kernfs.Inode.StatFS. func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.SOCKFS_MAGIC), nil } // NewDentry constructs and returns a sockfs dentry. // // Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). func NewDentry(ctx context.Context, mnt *vfs.Mount) *vfs.Dentry { fs := mnt.Filesystem().Impl().(*filesystem) // File mode matches net/socket.c:sock_alloc. filemode := linux.FileMode(linux.S_IFSOCK | 0777) i := &inode{} i.InodeAttrs.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode) d := &kernfs.Dentry{} d.Init(&fs.Filesystem, i) return d.VFSDentry() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/sockfs/sockfs_state_autogen.go000066400000000000000000000047331465435605700303020ustar00rootroot00000000000000// automatically generated by stateify. 
package sockfs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (fsType *filesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/sockfs.filesystemType" } func (fsType *filesystemType) StateFields() []string { return []string{} } func (fsType *filesystemType) beforeSave() {} // +checklocksignore func (fsType *filesystemType) StateSave(stateSinkObject state.Sink) { fsType.beforeSave() } func (fsType *filesystemType) afterLoad(context.Context) {} // +checklocksignore func (fsType *filesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/sockfs.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "Filesystem", "devMinor", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.Filesystem) stateSinkObject.Save(1, &fs.devMinor) } func (fs *filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.Filesystem) stateSourceObject.Load(1, &fs.devMinor) } func (i *inode) StateTypeName() string { return "pkg/sentry/fsimpl/sockfs.inode" } func (i *inode) StateFields() []string { return []string{ "InodeAnonymous", "InodeAttrs", "InodeNoopRefCount", "InodeNotDirectory", "InodeNotSymlink", "InodeWatches", } } func (i *inode) beforeSave() {} // +checklocksignore func (i *inode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.InodeAnonymous) stateSinkObject.Save(1, &i.InodeAttrs) stateSinkObject.Save(2, &i.InodeNoopRefCount) stateSinkObject.Save(3, &i.InodeNotDirectory) stateSinkObject.Save(4, &i.InodeNotSymlink) stateSinkObject.Save(5, &i.InodeWatches) } func (i *inode) afterLoad(context.Context) {} // +checklocksignore func (i *inode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.InodeAnonymous) stateSourceObject.Load(1, &i.InodeAttrs) stateSourceObject.Load(2, &i.InodeNoopRefCount) stateSourceObject.Load(3, &i.InodeNotDirectory) stateSourceObject.Load(4, &i.InodeNotSymlink) stateSourceObject.Load(5, &i.InodeWatches) } func init() { state.Register((*filesystemType)(nil)) state.Register((*filesystem)(nil)) state.Register((*inode)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/sys/000077500000000000000000000000001465435605700230605ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/sys/dir_refs.go000066400000000000000000000100201465435605700251750ustar00rootroot00000000000000package sys import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const direnableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var dirobj *dir // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. 
It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type dirRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *dirRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *dirRefs) RefType() string { return fmt.Sprintf("%T", dirobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *dirRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *dirRefs) LogRefs() bool { return direnableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *dirRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *dirRefs) IncRef() { v := r.refCount.Add(1) if direnableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *dirRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if direnableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *dirRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if direnableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *dirRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/sys/kcov.go000066400000000000000000000070061465435605700243540ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sys import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) kernfs.Inode { k := &kcovInode{} k.InodeAttrs.Init(ctx, creds, 0, 0, fs.NextIno(), linux.S_IFREG|0600) return k } // kcovInode implements kernfs.Inode. // // +stateify savable type kcovInode struct { kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotAnonymous kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches implStatFS } func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { k := kernel.KernelFromContext(ctx) if k == nil { panic("KernelFromContext returned nil") } fd := &kcovFD{ inode: i, kcov: k.NewKcov(), } if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } // +stateify savable type kcovFD struct { vfs.FileDescriptionDefaultImpl vfs.NoLockFD vfsfd vfs.FileDescription inode *kcovInode kcov *kernel.Kcov } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (fd *kcovFD) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { cmd := uint32(args[1].Int()) arg := args[2].Uint64() switch uint32(cmd) { case linux.KCOV_INIT_TRACE: return 0, fd.kcov.InitTrace(arg) case linux.KCOV_ENABLE: return 0, fd.kcov.EnableTrace(ctx, uint8(arg)) case linux.KCOV_DISABLE: if arg != 0 { // This arg is unused; it should be 0. return 0, linuxerr.EINVAL } return 0, fd.kcov.DisableTrace(ctx) default: return 0, linuxerr.ENOTTY } } // ConfigureMmap implements vfs.FileDescriptionImpl.ConfigureMmap. func (fd *kcovFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return fd.kcov.ConfigureMMap(ctx, opts) } // Release implements vfs.FileDescriptionImpl.Release. func (fd *kcovFD) Release(ctx context.Context) { // kcov instances have reference counts in Linux, but this seems sufficient // for our purposes. fd.kcov.Clear(ctx) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *kcovFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() return fd.inode.SetStat(ctx, fs, creds, opts) } // Stat implements vfs.FileDescriptionImpl.Stat. 
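
// exampleKcovCommandName is an illustrative sketch added by the editor, not
// upstream gVisor code: it simply names the three KCOV ioctl commands that
// kcovFD.Ioctl above dispatches on; any other command is rejected with
// ENOTTY. The helper itself is an assumption made for the example only.
func exampleKcovCommandName(cmd uint32) string {
	switch cmd {
	case linux.KCOV_INIT_TRACE:
		return "KCOV_INIT_TRACE" // sizes the coverage buffer
	case linux.KCOV_ENABLE:
		return "KCOV_ENABLE" // starts collecting coverage for the calling task
	case linux.KCOV_DISABLE:
		return "KCOV_DISABLE" // stops collection
	default:
		return "unsupported (ENOTTY)"
	}
}

// Stat implements vfs.FileDescriptionImpl.Stat.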
func (fd *kcovFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { return fd.inode.Stat(ctx, fd.vfsfd.Mount().Filesystem(), opts) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/sys/pci.go000066400000000000000000000175721465435605700241760ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sys import ( "errors" "fmt" "path" regex "regexp" "strings" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fsutil" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) const ( accelDevice = "accel" vfioDevice = "vfio-dev" sysDevicesMainPath = "/sys/devices" ) var ( // pciBusRegex matches PCI bus addresses. pciBusRegex = regex.MustCompile(`pci0000:[[:xdigit:]]{2}`) // Matches PCI device addresses. pciDeviceRegex = regex.MustCompile(`0000:([[:xdigit:]]{2}|[[:xdigit:]]{4}):[[:xdigit:]]{2}\.[[:xdigit:]]{1,2}`) // Matches the directories for the main bus (i.e. pci000:00), // individual devices (e.g. 00:00:04.0), accel (TPU v4), and vfio (TPU v5) sysDevicesDirRegex = regex.MustCompile(`pci0000:[[:xdigit:]]{2}|accel|vfio|vfio-dev|(0000:([[:xdigit:]]{2}|[[:xdigit:]]{4}):[[:xdigit:]]{2}\.[[:xdigit:]]{1,2})`) // Files allowlisted for host passthrough. These files are read-only. sysDevicesFiles = map[string]any{ "vendor": nil, "device": nil, "subsystem_vendor": nil, "subsystem_device": nil, "revision": nil, "class": nil, "numa_node": nil, "resource": nil, "pci_address": nil, "dev": nil, "driver_version": nil, "reset_count": nil, "write_open_count": nil, "status": nil, "is_device_owned": nil, "device_owner": nil, "framework_version": nil, "user_mem_ranges": nil, "interrupt_counts": nil, "chip_model": nil, "bar_offsets": nil, "bar_sizes": nil, "resource0": nil, "resource1": nil, "resource2": nil, "resource3": nil, "resource4": nil, "resource5": nil, "enable": nil, } ) // sysDevicesPCIPaths returns the paths of all PCI devices on the host in a // /sys/devices directory. func sysDevicesPCIPaths(sysDevicesPath string) ([]string, error) { sysDevicesDents, err := hostDirEntries(sysDevicesPath) if err != nil { return nil, err } var pciPaths []string for _, dent := range sysDevicesDents { if pciBusRegex.MatchString(dent) { pciDents, err := hostDirEntries(path.Join(sysDevicesPath, dent)) if err != nil { return nil, err } for _, pciDent := range pciDents { pciPaths = append(pciPaths, path.Join(sysDevicesPath, dent, pciDent)) } } } return pciPaths, nil } // pciBusFromAddress returns the PCI bus address from a PCI address. // // Preconditions: pciAddr is a valid PCI address. func pciBusFromAddress(pciAddr string) string { return strings.Join(strings.Split(pciAddr, ":")[:2], ":") } // Creates TPU devices' symlinks under /sys/class/. TPU device types that are // not present on host will be ignored. // // TPU v4 symlinks are created at /sys/class/accel/accel#. // TPU v5 symlinks go to /sys/class/vfio-dev/vfio#. 
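
// exampleClassSymlinkTarget is an illustrative sketch added by the editor,
// not upstream gVisor code: it spells out the relative symlink target that
// the class-directory builder below generates. For pciAddr "0000:00:04.0",
// deviceType "accel" and deviceName "accel0" it returns
// "../../devices/pci0000:00/0000:00:04.0/accel/accel0", i.e. the symlink in
// /sys/class/accel points back into /sys/devices. The helper name and the
// sample values are assumptions made for the example only.
func exampleClassSymlinkTarget(pciAddr, deviceType, deviceName string) string {
	// pciBusFromAddress keeps the first two ':'-separated components, e.g.
	// "0000:00:04.0" -> "0000:00".
	return fmt.Sprintf("../../devices/pci%s/%s/%s/%s", pciBusFromAddress(pciAddr), pciAddr, deviceType, deviceName)
}

// newDeviceClassDir creates the TPU device class symlinks described above.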
func (fs *filesystem) newDeviceClassDir(ctx context.Context, creds *auth.Credentials, tpuDeviceTypes []string, sysDevicesPath string) (map[string]map[string]kernfs.Inode, error) { dirs := map[string]map[string]kernfs.Inode{} for _, tpuDeviceType := range tpuDeviceTypes { dirs[tpuDeviceType] = map[string]kernfs.Inode{} } pciPaths, err := sysDevicesPCIPaths(sysDevicesPath) if err != nil { return nil, err } for _, pciPath := range pciPaths { for _, tpuDeviceType := range tpuDeviceTypes { subPath := path.Join(pciPath, tpuDeviceType) deviceDents, err := hostDirEntries(subPath) if err != nil { // Skips the path that doesn't exist. if err == unix.ENOENT { continue } return nil, err } if numOfDeviceDents := len(deviceDents); numOfDeviceDents != 1 { return nil, fmt.Errorf("exactly one entry is expected at %v while there are %d", subPath, numOfDeviceDents) } pciAddr := path.Base(pciPath) pciBus := pciBusFromAddress(pciAddr) dirs[tpuDeviceType][deviceDents[0]] = kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), fmt.Sprintf("../../devices/pci%s/%s/%s/%s", pciBus, pciAddr, tpuDeviceType, deviceDents[0])) } } if len(dirs) == 0 { return nil, errors.New("no TPU device sysfile is found") } return dirs, nil } // Create /sys/bus/pci/devices symlinks. func (fs *filesystem) newBusPCIDevicesDir(ctx context.Context, creds *auth.Credentials, sysDevicesPath string) (map[string]kernfs.Inode, error) { pciDevicesDir := map[string]kernfs.Inode{} pciPaths, err := sysDevicesPCIPaths(sysDevicesPath) if err != nil { return nil, err } for _, pciPath := range pciPaths { pciAddr := path.Base(pciPath) pciBus := pciBusFromAddress(pciAddr) pciDevicesDir[pciAddr] = kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), fmt.Sprintf("../../../devices/pci%s/%s", pciBus, pciAddr)) } return pciDevicesDir, nil } // Recursively build out sysfs directories according to the allowlisted files, // directories, and symlinks defined in this package. func (fs *filesystem) mirrorSysDevicesDir(ctx context.Context, creds *auth.Credentials, dir string, iommuGroups map[string]string) (map[string]kernfs.Inode, error) { subs := map[string]kernfs.Inode{} dents, err := hostDirEntries(dir) if err != nil { return nil, err } for _, dent := range dents { dentPath := path.Join(dir, dent) dentMode, err := hostFileMode(dentPath) if err != nil { return nil, err } switch dentMode { case unix.S_IFDIR: if match := sysDevicesDirRegex.MatchString(dent); !match { continue } contents, err := fs.mirrorSysDevicesDir(ctx, creds, dentPath, iommuGroups) if err != nil { return nil, err } subs[dent] = fs.newDir(ctx, creds, defaultSysMode, contents) case unix.S_IFREG: if _, ok := sysDevicesFiles[dent]; ok { subs[dent] = fs.newHostFile(ctx, creds, defaultSysMode, dentPath) } case unix.S_IFLNK: linkContent := "" switch { case pciDeviceRegex.MatchString(dent) || dent == "device": pciDeviceName, err := pciDeviceName(dir) if err != nil { return nil, err } // Both the device and PCI address entries are links to the original PCI // device directory that's at the same place earlier in the dir tree. 
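// For example, the "device" link created under a mirrored
// .../pci0000:00/0000:00:04.0/accel/accel0 directory gets the target
// "../../../0000:00:04.0", which resolves back to that PCI device directory.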
linkContent = fmt.Sprintf("../../../%s", pciDeviceName) case dent == "iommu_group": pciDeviceName, err := pciDeviceName(dir) if err != nil { return nil, err } iommuGroupNum, exist := iommuGroups[pciDeviceName] if !exist { return nil, fmt.Errorf("no IOMMU group is found for device %v", pciDeviceName) } linkContent = fmt.Sprintf("../../../kernel/iommu_groups/%s", iommuGroupNum) default: continue } subs[dent] = kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linkContent) } } return subs, nil } // Infer a PCI device's name from its path. func pciDeviceName(pciDevicePath string) (string, error) { pciDeviceName := pciDeviceRegex.FindString(pciDevicePath) if pciDeviceName == "" { return "", fmt.Errorf("no valid device name for the device path at %v", pciDevicePath) } return pciDeviceName, nil } func hostFileMode(path string) (uint32, error) { fd, err := unix.Openat(-1, path, unix.O_RDONLY|unix.O_NOFOLLOW|unix.O_PATH, 0) if err != nil { return 0, err } stat := unix.Stat_t{} if err := unix.Fstat(fd, &stat); err != nil { return 0, err } return stat.Mode & unix.S_IFMT, nil } func hostDirEntries(path string) ([]string, error) { fd, err := unix.Openat(-1, path, unix.O_RDONLY|unix.O_NOFOLLOW, 0) if err != nil { return nil, err } defer unix.Close(fd) return fsutil.DirentNames(fd) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/sys/sys.go000066400000000000000000000357341465435605700242410ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package sys implements sysfs. package sys import ( "bytes" "fmt" "os" "path" "strconv" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) const ( // Name is the default filesystem name. Name = "sysfs" defaultSysMode = linux.FileMode(0444) defaultSysDirMode = linux.FileMode(0755) defaultMaxCachedDentries = uint64(1000) iommuGroupSysPath = "/sys/kernel/iommu_groups/" ) // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // InternalData contains internal data passed in via // vfs.GetFilesystemOptions.InternalData. // // +stateify savable type InternalData struct { // ProductName is the value to be set to devices/virtual/dmi/id/product_name. ProductName string // EnableTPUProxyPaths is whether to populate sysfs paths used by hardware // accelerators. EnableTPUProxyPaths bool // TestSysfsPathPrefix is a prefix for the sysfs paths. It is useful for // unit testing. TestSysfsPathPrefix string } // filesystem implements vfs.FilesystemImpl. // // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 } // Name implements vfs.FilesystemType.Name. 
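
// exampleSysfsMountOptions is an illustrative sketch added by the editor, not
// upstream gVisor code: callers that mount sysfs pass per-sandbox
// configuration through vfs.GetFilesystemOptions.InternalData as an
// *InternalData value, for instance to surface a DMI product name or to
// enable the TPU proxy paths. The helper and the sample values are
// assumptions made for the example only.
func exampleSysfsMountOptions() vfs.GetFilesystemOptions {
	return vfs.GetFilesystemOptions{
		InternalData: &InternalData{
			// Rendered at /sys/devices/virtual/dmi/id/product_name.
			ProductName: "Example Product",
			// Leave the hardware accelerator passthrough paths disabled.
			EnableTPUProxyPaths: false,
		},
	}
}

// Name implements vfs.FilesystemType.Name.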
func (FilesystemType) Name() string { return Name } // Release implements vfs.FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } mopts := vfs.GenericParseMountOptions(opts.Data) maxCachedDentries := defaultMaxCachedDentries if str, ok := mopts["dentry_cache_limit"]; ok { delete(mopts, "dentry_cache_limit") maxCachedDentries, err = strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) return nil, nil, linuxerr.EINVAL } } fs := &filesystem{ devMinor: devMinor, } fs.MaxCachedDentries = maxCachedDentries fs.VFSFilesystem().Init(vfsObj, &fsType, fs) k := kernel.KernelFromContext(ctx) fsDirChildren := make(map[string]kernfs.Inode) // Create an empty directory to serve as the mount point for cgroupfs when // cgroups are available. This emulates Linux behaviour, see // kernel/cgroup.c:cgroup_init(). Note that in Linux, userspace (typically // the init process) is ultimately responsible for actually mounting // cgroupfs, but the kernel creates the mountpoint. For the sentry, the // launcher mounts cgroupfs. if k.CgroupRegistry() != nil { fsDirChildren["cgroup"] = fs.newCgroupDir(ctx, creds, defaultSysDirMode, nil) } classSub := map[string]kernfs.Inode{ "power_supply": fs.newDir(ctx, creds, defaultSysDirMode, nil), } devicesSub := map[string]kernfs.Inode{ "system": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ "cpu": cpuDir(ctx, fs, creds), }), } productName := "" busSub := make(map[string]kernfs.Inode) kernelSub := kernelDir(ctx, fs, creds) if opts.InternalData != nil { idata := opts.InternalData.(*InternalData) productName = idata.ProductName if idata.EnableTPUProxyPaths { deviceToIOMMUGroup, err := pciDeviceIOMMUGroups(path.Join(idata.TestSysfsPathPrefix, iommuGroupSysPath)) if err != nil { return nil, nil, err } sysDevicesPath := path.Join(idata.TestSysfsPathPrefix, sysDevicesMainPath) sysDevicesSub, err := fs.mirrorSysDevicesDir(ctx, creds, sysDevicesPath, deviceToIOMMUGroup) if err != nil { return nil, nil, err } for dir, sub := range sysDevicesSub { devicesSub[dir] = sub } deviceDirs, err := fs.newDeviceClassDir(ctx, creds, []string{accelDevice, vfioDevice}, sysDevicesPath) if err != nil { return nil, nil, err } for tpuDeviceType, symlinkDir := range deviceDirs { classSub[tpuDeviceType] = fs.newDir(ctx, creds, defaultSysDirMode, symlinkDir) } pciDevicesSub, err := fs.newBusPCIDevicesDir(ctx, creds, sysDevicesPath) if err != nil { return nil, nil, err } busSub["pci"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ "devices": fs.newDir(ctx, creds, defaultSysDirMode, pciDevicesSub), }) iommuPath := path.Join(idata.TestSysfsPathPrefix, iommuGroupSysPath) iommuGroups, err := fs.mirrorIOMMUGroups(ctx, creds, iommuPath) if err != nil { return nil, nil, err } kernelSub["iommu_groups"] = fs.newDir(ctx, creds, defaultSysDirMode, iommuGroups) } } if len(productName) > 0 { log.Debugf("Setting product_name: %q", productName) classSub["dmi"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ "id": kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 
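// The symlink lives at /sys/class/dmi/id; relative to /sys/class/dmi its
// target resolves to /sys/devices/virtual/dmi/id, populated just below.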
"../../devices/virtual/dmi/id"), }) devicesSub["virtual"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ "dmi": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ "id": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ "product_name": fs.newStaticFile(ctx, creds, defaultSysMode, productName+"\n"), }), }), }) } root := fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ "block": fs.newDir(ctx, creds, defaultSysDirMode, nil), "bus": fs.newDir(ctx, creds, defaultSysDirMode, busSub), "class": fs.newDir(ctx, creds, defaultSysDirMode, classSub), "dev": fs.newDir(ctx, creds, defaultSysDirMode, nil), "devices": fs.newDir(ctx, creds, defaultSysDirMode, devicesSub), "firmware": fs.newDir(ctx, creds, defaultSysDirMode, nil), "fs": fs.newDir(ctx, creds, defaultSysDirMode, fsDirChildren), "kernel": fs.newDir(ctx, creds, defaultSysDirMode, kernelSub), "module": fs.newDir(ctx, creds, defaultSysDirMode, nil), "power": fs.newDir(ctx, creds, defaultSysDirMode, nil), }) var rootD kernfs.Dentry rootD.InitRoot(&fs.Filesystem, root) return fs.VFSFilesystem(), rootD.VFSDentry(), nil } func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode { k := kernel.KernelFromContext(ctx) maxCPUCores := k.ApplicationCores() children := map[string]kernfs.Inode{ "online": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), "possible": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), "present": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), } for i := uint(0); i < maxCPUCores; i++ { children[fmt.Sprintf("cpu%d", i)] = fs.newDir(ctx, creds, linux.FileMode(0555), nil) } return fs.newDir(ctx, creds, defaultSysDirMode, children) } // Returns a map from a PCI device name to its IOMMU group if available. func pciDeviceIOMMUGroups(iommuGroupsPath string) (map[string]string, error) { // IOMMU groups are organized as iommu_group_path/$GROUP, where $GROUP is // the IOMMU group number of which the device is a member. iommuGroupNums, err := hostDirEntries(iommuGroupsPath) if err != nil { // When IOMMU is not enabled, skip the rest of the process. if err == unix.ENOENT { return nil, nil } return nil, err } // The returned map from PCI device name to its IOMMU group. iommuGroups := map[string]string{} for _, iommuGroupNum := range iommuGroupNums { groupDevicesPath := path.Join(iommuGroupsPath, iommuGroupNum, "devices") pciDeviceNames, err := hostDirEntries(groupDevicesPath) if err != nil { return nil, err } // An IOMMU group may include multiple devices. for _, pciDeviceName := range pciDeviceNames { iommuGroups[pciDeviceName] = iommuGroupNum } } return iommuGroups, nil } func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) map[string]kernfs.Inode { // Set up /sys/kernel/debug/kcov. Technically, debugfs should be // mounted at debug/, but for our purposes, it is sufficient to keep it // in sys. children := make(map[string]kernfs.Inode) if coverage.KcovSupported() { log.Debugf("Set up /sys/kernel/debug/kcov") children["debug"] = fs.newDir(ctx, creds, linux.FileMode(0700), map[string]kernfs.Inode{ "kcov": fs.newKcovFile(ctx, creds), }) } return children } // Recursively build out IOMMU directories from the host. 
func (fs *filesystem) mirrorIOMMUGroups(ctx context.Context, creds *auth.Credentials, dir string) (map[string]kernfs.Inode, error) { subs := map[string]kernfs.Inode{} dents, err := hostDirEntries(dir) if err != nil { // TPU before v5 doesn't need IOMMU, skip the whole process for the backward compatibility when the directory can't be found. if err == unix.ENOENT { log.Debugf("Skip the path at %v which cannot be found.", dir) return nil, nil } return nil, err } for _, dent := range dents { absPath := path.Join(dir, dent) mode, err := hostFileMode(absPath) if err != nil { return nil, err } switch mode { case unix.S_IFDIR: contents, err := fs.mirrorIOMMUGroups(ctx, creds, absPath) if err != nil { return nil, err } subs[dent] = fs.newDir(ctx, creds, defaultSysMode, contents) case unix.S_IFREG: subs[dent] = fs.newHostFile(ctx, creds, defaultSysMode, absPath) case unix.S_IFLNK: if pciDeviceRegex.MatchString(dent) { pciBus := pciBusFromAddress(dent) subs[dent] = kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), fmt.Sprintf("../../../../devices/pci%s/%s", pciBus, dent)) } } } return subs, nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return fmt.Sprintf("dentry_cache_limit=%d", fs.MaxCachedDentries) } // dir implements kernfs.Inode. // // +stateify savable type dir struct { dirRefs kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotAnonymous kernfs.InodeNotSymlink kernfs.InodeTemporary kernfs.InodeWatches kernfs.OrderedChildren locks vfs.FileLocks } func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { d := &dir{} d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) d.InitRefs() d.IncLinks(d.OrderedChildren.Populate(contents)) return d } func (fs *filesystem) newCgroupDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { d := &cgroupDir{} d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) d.InitRefs() d.IncLinks(d.OrderedChildren.Populate(contents)) return d } // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // Open implements kernfs.Inode.Open. func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndStaticEntries, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // DecRef implements kernfs.Inode.DecRef. 
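
// exampleStaticSubtree is an illustrative sketch added by the editor, not
// upstream gVisor code: it shows how the newDir and newStaticFile helpers in
// this file compose into a nested sysfs subtree, mirroring the way
// GetFilesystem assembles the full tree. The method name and its contents are
// assumptions made for the example only.
func (fs *filesystem) exampleStaticSubtree(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
	return fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
		"example": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
			// A read-only leaf whose contents are served from memory.
			"value": fs.newStaticFile(ctx, creds, defaultSysMode, "42\n"),
		}),
	})
}

// DecRef implements kernfs.Inode.DecRef.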
func (d *dir) DecRef(ctx context.Context) { d.dirRefs.DecRef(func() { d.Destroy(ctx) }) } // StatFS implements kernfs.Inode.StatFS. func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil } // cgroupDir implements kernfs.Inode. // // +stateify savable type cgroupDir struct { dir } // StatFS implements kernfs.Inode.StatFS. func (d *cgroupDir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.TMPFS_MAGIC), nil } // cpuFile implements kernfs.Inode. // // +stateify savable type cpuFile struct { implStatFS kernfs.DynamicBytesFile maxCores uint } // Generate implements vfs.DynamicBytesSource.Generate. func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "0-%d\n", c.maxCores-1) return nil } func (fs *filesystem) newCPUFile(ctx context.Context, creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode { c := &cpuFile{maxCores: maxCores} c.DynamicBytesFile.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode) return c } // +stateify savable type implStatFS struct{} // StatFS implements kernfs.Inode.StatFS. func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil } // +stateify savable type staticFile struct { kernfs.DynamicBytesFile vfs.StaticData } func (fs *filesystem) newStaticFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode { s := &staticFile{StaticData: vfs.StaticData{Data: data}} s.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), s, mode) return s } // hostFile is an inode whose contents are generated by reading from the // host. // // +stateify savable type hostFile struct { kernfs.DynamicBytesFile hostPath string } func (hf *hostFile) Generate(ctx context.Context, buf *bytes.Buffer) error { fd, err := unix.Openat(-1, hf.hostPath, unix.O_RDONLY|unix.O_NOFOLLOW, 0) if err != nil { return err } file := os.NewFile(uintptr(fd), hf.hostPath) defer file.Close() _, err = buf.ReadFrom(file) return err } func (fs *filesystem) newHostFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, hostPath string) kernfs.Inode { hf := &hostFile{hostPath: hostPath} hf.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), hf, mode) return hf } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/sys/sys_state_autogen.go000066400000000000000000000232021465435605700271460ustar00rootroot00000000000000// automatically generated by stateify. 
package sys import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *dirRefs) StateTypeName() string { return "pkg/sentry/fsimpl/sys.dirRefs" } func (r *dirRefs) StateFields() []string { return []string{ "refCount", } } func (r *dirRefs) beforeSave() {} // +checklocksignore func (r *dirRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *dirRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (i *kcovInode) StateTypeName() string { return "pkg/sentry/fsimpl/sys.kcovInode" } func (i *kcovInode) StateFields() []string { return []string{ "InodeAttrs", "InodeNoopRefCount", "InodeNotAnonymous", "InodeNotDirectory", "InodeNotSymlink", "InodeWatches", "implStatFS", } } func (i *kcovInode) beforeSave() {} // +checklocksignore func (i *kcovInode) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.InodeAttrs) stateSinkObject.Save(1, &i.InodeNoopRefCount) stateSinkObject.Save(2, &i.InodeNotAnonymous) stateSinkObject.Save(3, &i.InodeNotDirectory) stateSinkObject.Save(4, &i.InodeNotSymlink) stateSinkObject.Save(5, &i.InodeWatches) stateSinkObject.Save(6, &i.implStatFS) } func (i *kcovInode) afterLoad(context.Context) {} // +checklocksignore func (i *kcovInode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.InodeAttrs) stateSourceObject.Load(1, &i.InodeNoopRefCount) stateSourceObject.Load(2, &i.InodeNotAnonymous) stateSourceObject.Load(3, &i.InodeNotDirectory) stateSourceObject.Load(4, &i.InodeNotSymlink) stateSourceObject.Load(5, &i.InodeWatches) stateSourceObject.Load(6, &i.implStatFS) } func (fd *kcovFD) StateTypeName() string { return "pkg/sentry/fsimpl/sys.kcovFD" } func (fd *kcovFD) StateFields() []string { return []string{ "FileDescriptionDefaultImpl", "NoLockFD", "vfsfd", "inode", "kcov", } } func (fd *kcovFD) beforeSave() {} // +checklocksignore func (fd *kcovFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(1, &fd.NoLockFD) stateSinkObject.Save(2, &fd.vfsfd) stateSinkObject.Save(3, &fd.inode) stateSinkObject.Save(4, &fd.kcov) } func (fd *kcovFD) afterLoad(context.Context) {} // +checklocksignore func (fd *kcovFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(1, &fd.NoLockFD) stateSourceObject.Load(2, &fd.vfsfd) stateSourceObject.Load(3, &fd.inode) stateSourceObject.Load(4, &fd.kcov) } func (fsType *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/sys.FilesystemType" } func (fsType *FilesystemType) StateFields() []string { return []string{} } func (fsType *FilesystemType) beforeSave() {} // +checklocksignore func (fsType *FilesystemType) StateSave(stateSinkObject state.Sink) { fsType.beforeSave() } func (fsType *FilesystemType) afterLoad(context.Context) {} // +checklocksignore func (fsType *FilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (i *InternalData) StateTypeName() string { return "pkg/sentry/fsimpl/sys.InternalData" } func (i *InternalData) StateFields() []string { return []string{ "ProductName", "EnableTPUProxyPaths", "TestSysfsPathPrefix", } } func (i *InternalData) beforeSave() {} // +checklocksignore func (i *InternalData) StateSave(stateSinkObject state.Sink) { 
i.beforeSave() stateSinkObject.Save(0, &i.ProductName) stateSinkObject.Save(1, &i.EnableTPUProxyPaths) stateSinkObject.Save(2, &i.TestSysfsPathPrefix) } func (i *InternalData) afterLoad(context.Context) {} // +checklocksignore func (i *InternalData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.ProductName) stateSourceObject.Load(1, &i.EnableTPUProxyPaths) stateSourceObject.Load(2, &i.TestSysfsPathPrefix) } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/sys.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "Filesystem", "devMinor", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.Filesystem) stateSinkObject.Save(1, &fs.devMinor) } func (fs *filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.Filesystem) stateSourceObject.Load(1, &fs.devMinor) } func (d *dir) StateTypeName() string { return "pkg/sentry/fsimpl/sys.dir" } func (d *dir) StateFields() []string { return []string{ "dirRefs", "InodeAlwaysValid", "InodeAttrs", "InodeDirectoryNoNewChildren", "InodeNotAnonymous", "InodeNotSymlink", "InodeTemporary", "InodeWatches", "OrderedChildren", "locks", } } func (d *dir) beforeSave() {} // +checklocksignore func (d *dir) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.dirRefs) stateSinkObject.Save(1, &d.InodeAlwaysValid) stateSinkObject.Save(2, &d.InodeAttrs) stateSinkObject.Save(3, &d.InodeDirectoryNoNewChildren) stateSinkObject.Save(4, &d.InodeNotAnonymous) stateSinkObject.Save(5, &d.InodeNotSymlink) stateSinkObject.Save(6, &d.InodeTemporary) stateSinkObject.Save(7, &d.InodeWatches) stateSinkObject.Save(8, &d.OrderedChildren) stateSinkObject.Save(9, &d.locks) } func (d *dir) afterLoad(context.Context) {} // +checklocksignore func (d *dir) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.dirRefs) stateSourceObject.Load(1, &d.InodeAlwaysValid) stateSourceObject.Load(2, &d.InodeAttrs) stateSourceObject.Load(3, &d.InodeDirectoryNoNewChildren) stateSourceObject.Load(4, &d.InodeNotAnonymous) stateSourceObject.Load(5, &d.InodeNotSymlink) stateSourceObject.Load(6, &d.InodeTemporary) stateSourceObject.Load(7, &d.InodeWatches) stateSourceObject.Load(8, &d.OrderedChildren) stateSourceObject.Load(9, &d.locks) } func (d *cgroupDir) StateTypeName() string { return "pkg/sentry/fsimpl/sys.cgroupDir" } func (d *cgroupDir) StateFields() []string { return []string{ "dir", } } func (d *cgroupDir) beforeSave() {} // +checklocksignore func (d *cgroupDir) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.dir) } func (d *cgroupDir) afterLoad(context.Context) {} // +checklocksignore func (d *cgroupDir) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.dir) } func (c *cpuFile) StateTypeName() string { return "pkg/sentry/fsimpl/sys.cpuFile" } func (c *cpuFile) StateFields() []string { return []string{ "implStatFS", "DynamicBytesFile", "maxCores", } } func (c *cpuFile) beforeSave() {} // +checklocksignore func (c *cpuFile) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.implStatFS) stateSinkObject.Save(1, &c.DynamicBytesFile) stateSinkObject.Save(2, &c.maxCores) } func (c *cpuFile) 
afterLoad(context.Context) {} // +checklocksignore func (c *cpuFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.implStatFS) stateSourceObject.Load(1, &c.DynamicBytesFile) stateSourceObject.Load(2, &c.maxCores) } func (i *implStatFS) StateTypeName() string { return "pkg/sentry/fsimpl/sys.implStatFS" } func (i *implStatFS) StateFields() []string { return []string{} } func (i *implStatFS) beforeSave() {} // +checklocksignore func (i *implStatFS) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i *implStatFS) afterLoad(context.Context) {} // +checklocksignore func (i *implStatFS) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (s *staticFile) StateTypeName() string { return "pkg/sentry/fsimpl/sys.staticFile" } func (s *staticFile) StateFields() []string { return []string{ "DynamicBytesFile", "StaticData", } } func (s *staticFile) beforeSave() {} // +checklocksignore func (s *staticFile) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.DynamicBytesFile) stateSinkObject.Save(1, &s.StaticData) } func (s *staticFile) afterLoad(context.Context) {} // +checklocksignore func (s *staticFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.DynamicBytesFile) stateSourceObject.Load(1, &s.StaticData) } func (hf *hostFile) StateTypeName() string { return "pkg/sentry/fsimpl/sys.hostFile" } func (hf *hostFile) StateFields() []string { return []string{ "DynamicBytesFile", "hostPath", } } func (hf *hostFile) beforeSave() {} // +checklocksignore func (hf *hostFile) StateSave(stateSinkObject state.Sink) { hf.beforeSave() stateSinkObject.Save(0, &hf.DynamicBytesFile) stateSinkObject.Save(1, &hf.hostPath) } func (hf *hostFile) afterLoad(context.Context) {} // +checklocksignore func (hf *hostFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &hf.DynamicBytesFile) stateSourceObject.Load(1, &hf.hostPath) } func init() { state.Register((*dirRefs)(nil)) state.Register((*kcovInode)(nil)) state.Register((*kcovFD)(nil)) state.Register((*FilesystemType)(nil)) state.Register((*InternalData)(nil)) state.Register((*filesystem)(nil)) state.Register((*dir)(nil)) state.Register((*cgroupDir)(nil)) state.Register((*cpuFile)(nil)) state.Register((*implStatFS)(nil)) state.Register((*staticFile)(nil)) state.Register((*hostFile)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/timerfd/000077500000000000000000000000001465435605700236745ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/timerfd/timerfd.go000066400000000000000000000111671465435605700256630ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package timerfd implements timer fds. 
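// Illustrative aside (not part of the original source): a timerfd behaves as an
// accumulating expiration counter. Each expiration increments an internal uint64;
// a successful read drains the counter and returns it as an 8-byte value in host
// byte order, while a read with no pending expirations fails with ErrWouldBlock.
// A hedged consumer-side sketch, assuming a *TimerFileDescription tfd obtained
// from New and assuming usermem.BytesIOSequence as the byte-slice IOSequence helper:
//
//	var buf [8]byte
//	if n, err := tfd.Read(ctx, usermem.BytesIOSequence(buf[:]), vfs.ReadOptions{}); err == nil && n == 8 {
//		count := hostarch.ByteOrder.Uint64(buf[:]) // expirations since the last read or SetTime
//		_ = count
//	}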
package timerfd import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also // implements ktime.TimerListener. // // +stateify savable type TimerFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD events waiter.Queue timer *ktime.Timer // val is the number of timer expirations since the last successful // call to PRead, or SetTime. val must be accessed using atomic memory // operations. val atomicbitops.Uint64 } var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil) var _ ktime.Listener = (*TimerFileDescription)(nil) // New returns a new timer fd. func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[timerfd]") defer vd.DecRef(ctx) tfd := &TimerFileDescription{} tfd.timer = ktime.NewTimer(clock, tfd) if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ UseDentryMetadata: true, DenyPRead: true, DenyPWrite: true, }); err != nil { return nil, err } return &tfd.vfsfd, nil } // Read implements vfs.FileDescriptionImpl.Read. func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { const sizeofUint64 = 8 if dst.NumBytes() < sizeofUint64 { return 0, linuxerr.EINVAL } if val := tfd.val.Swap(0); val != 0 { var buf [sizeofUint64]byte hostarch.ByteOrder.PutUint64(buf[:], val) if _, err := dst.CopyOut(ctx, buf[:]); err != nil { // Linux does not undo consuming the number of // expirations even if writing to userspace fails. return 0, err } return sizeofUint64, nil } return 0, linuxerr.ErrWouldBlock } // Clock returns the timer fd's Clock. func (tfd *TimerFileDescription) Clock() ktime.Clock { return tfd.timer.Clock() } // GetTime returns the associated Timer's setting and the time at which it was // observed. func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) { return tfd.timer.Get() } // SetTime atomically changes the associated Timer's setting, resets the number // of expirations to 0, and returns the previous setting and the time at which // it was observed. func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { return tfd.timer.SwapAnd(s, func() { tfd.val.Store(0) }) } // Readiness implements waiter.Waitable.Readiness. func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { var ready waiter.EventMask if tfd.val.Load() != 0 { ready |= waiter.ReadableEvents } return ready } // EventRegister implements waiter.Waitable.EventRegister. func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry) error { tfd.events.EventRegister(e) return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) { tfd.events.EventUnregister(e) } // Epollable implements FileDescriptionImpl.Epollable. func (tfd *TimerFileDescription) Epollable() bool { return true } // PauseTimer pauses the associated Timer. func (tfd *TimerFileDescription) PauseTimer() { tfd.timer.Pause() } // ResumeTimer resumes the associated Timer. 
func (tfd *TimerFileDescription) ResumeTimer() { tfd.timer.Resume() } // Release implements vfs.FileDescriptionImpl.Release. func (tfd *TimerFileDescription) Release(context.Context) { tfd.timer.Destroy() } // NotifyTimer implements ktime.TimerListener.NotifyTimer. func (tfd *TimerFileDescription) NotifyTimer(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { tfd.val.Add(exp) tfd.events.Notify(waiter.ReadableEvents) return ktime.Setting{}, false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/timerfd/timerfd_state_autogen.go000066400000000000000000000027201465435605700306000ustar00rootroot00000000000000// automatically generated by stateify. package timerfd import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (tfd *TimerFileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/timerfd.TimerFileDescription" } func (tfd *TimerFileDescription) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", "events", "timer", "val", } } func (tfd *TimerFileDescription) beforeSave() {} // +checklocksignore func (tfd *TimerFileDescription) StateSave(stateSinkObject state.Sink) { tfd.beforeSave() stateSinkObject.Save(0, &tfd.vfsfd) stateSinkObject.Save(1, &tfd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &tfd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &tfd.NoLockFD) stateSinkObject.Save(4, &tfd.events) stateSinkObject.Save(5, &tfd.timer) stateSinkObject.Save(6, &tfd.val) } func (tfd *TimerFileDescription) afterLoad(context.Context) {} // +checklocksignore func (tfd *TimerFileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &tfd.vfsfd) stateSourceObject.Load(1, &tfd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &tfd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &tfd.NoLockFD) stateSourceObject.Load(4, &tfd.events) stateSourceObject.Load(5, &tfd.timer) stateSourceObject.Load(6, &tfd.val) } func init() { state.Register((*TimerFileDescription)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/000077500000000000000000000000001465435605700233735ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/dentry_list.go000066400000000000000000000120031465435605700262560ustar00rootroot00000000000000package tmpfs // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type dentryElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (dentryElementMapper) linkerFor(elem *dentry) *dentry { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type dentryList struct { head *dentry tail *dentry } // Reset resets list l to the empty state. func (l *dentryList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. 
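// Illustrative aside (not part of the generated list code): dentryList is an
// intrusive doubly-linked list, so each element carries its own next/prev links
// (the dentry type is expected to embed dentryEntry, defined further down) and
// membership requires no extra allocation. A minimal usage sketch, assuming
// dentries a and b that are not currently linked into any other such list:
//
//	var l dentryList
//	l.PushBack(a)
//	l.PushBack(b)
//	for !l.Empty() {
//		l.Remove(l.Front()) // O(1) per removal: unlinking uses the element's own fields
//	}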
// //go:nosplit func (l *dentryList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *dentryList) Front() *dentry { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *dentryList) Back() *dentry { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *dentryList) Len() (count int) { for e := l.Front(); e != nil; e = (dentryElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *dentryList) PushFront(e *dentry) { linker := dentryElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { dentryElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *dentryList) PushFrontList(m *dentryList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { dentryElementMapper{}.linkerFor(l.head).SetPrev(m.tail) dentryElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *dentryList) PushBack(e *dentry) { linker := dentryElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { dentryElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *dentryList) PushBackList(m *dentryList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { dentryElementMapper{}.linkerFor(l.tail).SetNext(m.head) dentryElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *dentryList) InsertAfter(b, e *dentry) { bLinker := dentryElementMapper{}.linkerFor(b) eLinker := dentryElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { dentryElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *dentryList) InsertBefore(a, e *dentry) { aLinker := dentryElementMapper{}.linkerFor(a) eLinker := dentryElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { dentryElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *dentryList) Remove(e *dentry) { linker := dentryElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { dentryElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { dentryElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type dentryEntry struct { next *dentry prev *dentry } // Next returns the entry that follows e in the list. // //go:nosplit func (e *dentryEntry) Next() *dentry { return e.next } // Prev returns the entry that precedes e in the list. 
// //go:nosplit func (e *dentryEntry) Prev() *dentry { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *dentryEntry) SetNext(elem *dentry) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *dentryEntry) SetPrev(elem *dentry) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/device_file.go000066400000000000000000000026571465435605700261720ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tmpfs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // +stateify savable type deviceFile struct { inode inode kind vfs.DeviceKind major uint32 minor uint32 } func (fs *filesystem) newDeviceFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32, parentDir *directory) *inode { file := &deviceFile{ kind: kind, major: major, minor: minor, } switch kind { case vfs.BlockDevice: mode |= linux.S_IFBLK case vfs.CharDevice: mode |= linux.S_IFCHR default: panic(fmt.Sprintf("invalid DeviceKind: %v", kind)) } file.inode.init(file, fs, kuid, kgid, mode, parentDir) file.inode.nlink = atomicbitops.FromUint32(1) // from parent directory return &file.inode } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/directory.go000066400000000000000000000144461465435605700257370ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // +stateify savable type directory struct { // Since directories can't be hard-linked, each directory can only be // associated with a single dentry, which we can store in the directory // struct. dentry dentry inode inode // childMap maps the names of the directory's children to their dentries. // childMap is protected by filesystem.mu. childMap map[string]*dentry // numChildren is len(childMap), but accessed using atomic memory // operations to avoid locking in inode.statTo(). 
numChildren atomicbitops.Int64 // childList is a list containing (1) child dentries and (2) fake dentries // (with inode == nil) that represent the iteration position of // directoryFDs. childList is used to support directoryFD.IterDirents() // efficiently. childList is protected by iterMu. iterMu iterMutex `state:"nosave"` childList dentryList } func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *directory { dir := &directory{} dir.inode.init(dir, fs, kuid, kgid, linux.S_IFDIR|mode, parentDir) dir.inode.nlink = atomicbitops.FromUint32(2) // from "." and parent directory or ".." for root dir.dentry.inode = &dir.inode dir.dentry.vfsd.Init(&dir.dentry) return dir } // Preconditions: // - filesystem.mu must be locked for writing. // - dir must not already contain a child with the given name. func (dir *directory) insertChildLocked(child *dentry, name string) { child.parent.Store(&dir.dentry) child.name = name if dir.childMap == nil { dir.childMap = make(map[string]*dentry) } dir.childMap[name] = child dir.numChildren.Add(1) dir.iterMu.Lock() dir.childList.PushBack(child) dir.iterMu.Unlock() } // Preconditions: filesystem.mu must be locked for writing. func (dir *directory) removeChildLocked(child *dentry) { delete(dir.childMap, child.name) dir.numChildren.Add(-1) dir.iterMu.Lock() dir.childList.Remove(child) dir.iterMu.Unlock() } func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error { return vfs.CheckDeleteSticky( creds, linux.FileMode(dir.inode.mode.Load()), auth.KUID(dir.inode.uid.Load()), auth.KUID(child.inode.uid.Load()), auth.KGID(child.inode.gid.Load()), ) } // +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl // Protected by directory.iterMu. iter *dentry off int64 } // Release implements vfs.FileDescriptionImpl.Release. func (fd *directoryFD) Release(ctx context.Context) { if fd.iter != nil { dir := fd.inode().impl.(*directory) dir.iterMu.Lock() dir.childList.Remove(fd.iter) dir.iterMu.Unlock() fd.iter = nil } } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { fs := fd.filesystem() dir := fd.inode().impl.(*directory) // fs.mu is required to read d.parent and dentry.name. fs.mu.RLock() defer fs.mu.RUnlock() dir.iterMu.Lock() defer dir.iterMu.Unlock() fd.inode().touchAtime(fd.vfsfd.Mount()) if fd.off == 0 { if err := cb.Handle(vfs.Dirent{ Name: ".", Type: linux.DT_DIR, Ino: dir.inode.ino, NextOff: 1, }); err != nil { return err } fd.off++ } if fd.off == 1 { parentInode := genericParentOrSelf(&dir.dentry).inode if err := cb.Handle(vfs.Dirent{ Name: "..", Type: parentInode.direntType(), Ino: parentInode.ino, NextOff: 2, }); err != nil { return err } fd.off++ } var child *dentry if fd.iter == nil { // Start iteration at the beginning of dir. child = dir.childList.Front() fd.iter = &dentry{} } else { // Continue iteration from where we left off. child = fd.iter.Next() dir.childList.Remove(fd.iter) } for child != nil { // Skip other directoryFD iterators. if child.inode != nil { if err := cb.Handle(vfs.Dirent{ Name: child.name, Type: child.inode.direntType(), Ino: child.inode.ino, NextOff: fd.off + 1, }); err != nil { dir.childList.InsertBefore(child, fd.iter) return err } fd.off++ } child = child.Next() } dir.childList.PushBack(fd.iter) return nil } // Seek implements vfs.FileDescriptionImpl.Seek. 
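// Illustrative aside (not part of the original source): both IterDirents and
// Seek track their position in the directory with a sentinel *dentry (fd.iter)
// whose inode is nil. The sentinel is linked into dir.childList at the current
// position, so concurrent insertion or removal of real children does not
// invalidate the cursor; iteration simply skips list elements whose inode is
// nil, since those are other directoryFDs' cursors rather than real entries.
// Offsets 0 and 1 are reserved for "." and "..", so an offset N >= 2 refers to
// the (N-2)th real child, which is how Seek below re-anchors the sentinel.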
func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { dir := fd.inode().impl.(*directory) dir.iterMu.Lock() defer dir.iterMu.Unlock() switch whence { case linux.SEEK_SET: // Use offset as given. case linux.SEEK_CUR: offset += fd.off default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } // If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't // seek even if doing so might reposition the iterator due to concurrent // mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek(). if fd.off == offset { return offset, nil } fd.off = offset // Compensate for "." and "..". remChildren := int64(0) if offset >= 2 { remChildren = offset - 2 } // Ensure that fd.iter exists and is not linked into dir.childList. if fd.iter == nil { fd.iter = &dentry{} } else { dir.childList.Remove(fd.iter) } // Insert fd.iter before the remChildren'th child, or at the end of the // list if remChildren >= number of children. child := dir.childList.Front() for child != nil { // Skip other directoryFD iterators. if child.inode != nil { if remChildren == 0 { dir.childList.InsertBefore(child, fd.iter) return offset, nil } remChildren-- } child = child.Next() } dir.childList.PushBack(fd.iter) return offset, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/filesystem.go000066400000000000000000000765361465435605700261270ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tmpfs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" ) const ( // direntSize is the size of each directory entry // that Linux uses for computing directory size. // "20" is mm/shmem.c:BOGO_DIRENT_SIZE. direntSize = 20 // Linux implementation uses a SHORT_SYMLINK_LEN 128. // It accounts size for only SYMLINK with size >= 128. shortSymlinkLen = 128 ) // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { // All filesystem state is in-memory. return nil } // stepLocked resolves rp.Component() to an existing file, starting from the // given directory. // // stepLocked is loosely analogous to fs/namei.c:walk_component(). // // Preconditions: // - filesystem.mu must be locked. // - !rp.Done(). func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, bool, error) { dir, ok := d.inode.impl.(*directory) if !ok { return nil, false, linuxerr.ENOTDIR } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, false, err } name := rp.Component() if name == "." { rp.Advance() return d, false, nil } if name == ".." 
{ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, false, err } else if isRoot || d.parent.Load() == nil { rp.Advance() return d, false, nil } if err := rp.CheckMount(ctx, &d.parent.Load().vfsd); err != nil { return nil, false, err } rp.Advance() return d.parent.Load(), false, nil } if len(name) > d.inode.fs.maxFilenameLen { return nil, false, linuxerr.ENAMETOOLONG } child, ok := dir.childMap[name] if !ok { return nil, false, linuxerr.ENOENT } if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, false, err } if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { // Symlink traversal updates access time. child.inode.touchAtime(rp.Mount()) followedSymlink, err := rp.HandleSymlink(symlink.target) return d, followedSymlink, err } rp.Advance() return child, false, nil } // walkParentDirLocked resolves all but the last path component of rp to an // existing directory, starting from the given directory (which is usually // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // // walkParentDirLocked is loosely analogous to Linux's // fs/namei.c:path_parentat(). // // Preconditions: // - filesystem.mu must be locked. // - !rp.Done(). func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) { for !rp.Final() { next, _, err := stepLocked(ctx, rp, d) if err != nil { return nil, err } d = next } dir, ok := d.inode.impl.(*directory) if !ok { return nil, linuxerr.ENOTDIR } return dir, nil } // resolveLocked resolves rp to an existing file. // // resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). // // Preconditions: filesystem.mu must be locked. func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) { d := rp.Start().Impl().(*dentry) if symlink, ok := d.inode.impl.(*symlink); rp.Done() && ok && rp.ShouldFollowSymlink() { // Path with a single component. We don't need to step to the next // component, but still need to resolve any symlinks. // // Symlink traversal updates access time. d.inode.touchAtime(rp.Mount()) if _, err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } } else { // Path with multiple components, walk and resolve as required. for !rp.Done() { next, _, err := stepLocked(ctx, rp, d) if err != nil { return nil, err } d = next } } if rp.MustBeDir() && !d.inode.isDir() { return nil, linuxerr.ENOTDIR } return d, nil } // doCreateAt checks that creating a file at rp is permitted, then invokes // create to do so. // // doCreateAt is loosely analogous to a conjunction of Linux's // fs/namei.c:filename_create() and done_path_create(). // // Preconditions: // - !rp.Done(). // - For the final path component in rp, !rp.ShouldFollowSymlink(). func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { fs.mu.Lock() defer fs.mu.Unlock() parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } // Order of checks is important. First check if parent directory can be // executed, then check for existence, and lastly check if mount is writable. if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".." 
{ return linuxerr.EEXIST } if len(name) > fs.maxFilenameLen { return linuxerr.ENAMETOOLONG } if _, ok := parentDir.childMap[name]; ok { return linuxerr.EEXIST } if !dir && rp.MustBeDir() { return linuxerr.ENOENT } // tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only // be dead if it was deleted. if parentDir.dentry.vfsd.IsDead() { return linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return err } if err := create(parentDir, name); err != nil { return err } ev := linux.IN_CREATE if dir { ev |= linux.IN_ISDIR } parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) parentDir.inode.touchCMtime() return nil } // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return err } if err := d.inode.checkPermissions(creds, ats); err != nil { return err } if ats.MayWrite() && rp.Mount().ReadOnly() { return linuxerr.EROFS } return nil } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } if opts.CheckSearchable { if !d.inode.isDir() { return nil, linuxerr.ENOTDIR } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } d.IncRef() return &d.vfsd, nil } // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return nil, err } dir.dentry.IncRef() return &dir.dentry.vfsd, nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { if rp.Mount() != vd.Mount() { return linuxerr.EXDEV } d := vd.Dentry().Impl().(*dentry) i := d.inode if i.isDir() { return linuxerr.EPERM } if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(i.mode.Load()), auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil { return err } if i.nlink.Load() == 0 { return linuxerr.ENOENT } if i.nlink.Load() == maxLinks { return linuxerr.EMLINK } i.incLinksLocked() i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) parentDir.insertChildLocked(fs.newDentry(i), name) return nil }) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() if parentDir.inode.nlink.Load() == maxLinks { return linuxerr.EMLINK } parentDir.inode.incLinksLocked() // from child's ".." 
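// Added explanatory note: directory link counts follow the usual Unix
// convention. The new child starts at nlink == 2 (its own "." plus the entry
// in parentDir), which newDirectory sets below, and the parent gains one link
// here because the child's ".." will point back at it. This is also why
// maxLinks is checked against the parent before the increment above.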
childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir) parentDir.insertChildLocked(&childDir.dentry, name) return nil }) } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() var childInode *inode switch opts.Mode.FileType() { case linux.S_IFREG: childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir) case linux.S_IFIFO: childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir) case linux.S_IFBLK: childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor, parentDir) case linux.S_IFCHR: childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor, parentDir) case linux.S_IFSOCK: childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint, parentDir) default: return linuxerr.EINVAL } child := fs.newDentry(childInode) parentDir.insertChildLocked(child, name) return nil }) } // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { if opts.Flags&linux.O_TMPFILE != 0 { // Not yet supported. return nil, linuxerr.EOPNOTSUPP } // Handle O_CREAT and !O_CREAT separately, since in the latter case we // don't need fs.mu for writing. if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return nil, err } d.IncRef() defer d.DecRef(ctx) fs.mu.RUnlock() return d.open(ctx, rp, &opts, false /* afterCreate */) } mustCreate := opts.Flags&linux.O_EXCL != 0 start := rp.Start().Impl().(*dentry) fs.mu.Lock() unlocked := false unlock := func() { if !unlocked { fs.mu.Unlock() unlocked = true } } defer unlock() if rp.Done() { // Reject attempts to open mount root directory with O_CREAT. if rp.MustBeDir() { return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST } start.IncRef() defer start.DecRef(ctx) unlock() return start.open(ctx, rp, &opts, false /* afterCreate */) } afterTrailingSymlink: parentDir, err := walkParentDirLocked(ctx, rp, start) if err != nil { return nil, err } // Check for search permission in the parent directory. if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { return nil, linuxerr.EISDIR } name := rp.Component() child, followedSymlink, err := stepLocked(ctx, rp, &parentDir.dentry) if followedSymlink { if mustCreate { // EEXIST must be returned if an existing symlink is opened with O_EXCL. return nil, linuxerr.EEXIST } if err != nil { // If followedSymlink && err != nil, then this symlink resolution error // must be handled by the VFS layer. return nil, err } start = &parentDir.dentry goto afterTrailingSymlink } if linuxerr.Equals(linuxerr.ENOENT, err) { // Already checked for searchability above; now check for writability. if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { return nil, err } defer rp.Mount().EndWrite() // Create and open the child. 
creds := rp.Credentials() child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)) parentDir.insertChildLocked(child, name) child.IncRef() defer child.DecRef(ctx) unlock() fd, err := child.open(ctx, rp, &opts, true) if err != nil { return nil, err } parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) parentDir.inode.touchCMtime() return fd, nil } if err != nil { return nil, err } if mustCreate { return nil, linuxerr.EEXIST } if rp.MustBeDir() && !child.inode.isDir() { return nil, linuxerr.ENOTDIR } child.IncRef() defer child.DecRef(ctx) unlock() return child.open(ctx, rp, &opts, false) } // Preconditions: The caller must hold no locks (since opening pipes may block // indefinitely). func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if !afterCreate { if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } } switch impl := d.inode.impl.(type) { case *regularFile: var fd regularFileFD fd.LockFD.Init(&d.inode.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { return nil, err } if !afterCreate && opts.Flags&linux.O_TRUNC != 0 { if _, err := impl.truncate(0); err != nil { return nil, err } } if fd.vfsfd.IsWritable() { fsmetric.TmpfsOpensW.Increment() } else if fd.vfsfd.IsReadable() { fsmetric.TmpfsOpensRO.Increment() } return &fd.vfsfd, nil case *directory: // Can't open directories with O_CREAT. if opts.Flags&linux.O_CREAT != 0 { return nil, linuxerr.EISDIR } // Can't open directories writably. if ats&vfs.MayWrite != 0 { return nil, linuxerr.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL } var fd directoryFD fd.LockFD.Init(&d.inode.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { return nil, err } return &fd.vfsfd, nil case *symlink: // Can't open symlinks without O_PATH, which is handled at the VFS layer. return nil, linuxerr.ELOOP case *namedPipe: return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks) case *deviceFile: return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) case *socketFile: return nil, linuxerr.ENXIO default: panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) } } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return "", err } symlink, ok := d.inode.impl.(*symlink) if !ok { return "", linuxerr.EINVAL } symlink.inode.touchAtime(rp.Mount()) return symlink.target, nil } // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { // Resolve newParentDir first to verify that it's on this Mount. fs.mu.Lock() // We need to DecRef outside of fs.mu because forgetting a dead mountpoint // could result in this filesystem being released which acquires fs.mu. 
var toDecRef []refs.RefCounter defer func() { for _, ref := range toDecRef { ref.DecRef(ctx) } }() defer fs.mu.Unlock() newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } if opts.Flags&^linux.RENAME_NOREPLACE != 0 { // TODO(b/145974740): Support other renameat2 flags. return linuxerr.EINVAL } newName := rp.Component() if newName == "." || newName == ".." { if opts.Flags&linux.RENAME_NOREPLACE != 0 { return linuxerr.EEXIST } return linuxerr.EBUSY } if len(newName) > fs.maxFilenameLen { return linuxerr.ENAMETOOLONG } mnt := rp.Mount() if mnt != oldParentVD.Mount() { return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory) if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } renamed, ok := oldParentDir.childMap[oldName] if !ok { return linuxerr.ENOENT } if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil { return err } // Note that we don't need to call rp.CheckMount(), since if renamed is a // mount point then we want to rename the mount point, not anything in the // mounted filesystem. if renamed.inode.isDir() { if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) { return linuxerr.EINVAL } if oldParentDir != newParentDir { // Writability is needed to change renamed's "..". if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return err } } } else { if opts.MustBeDir || rp.MustBeDir() { return linuxerr.ENOTDIR } } if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } replaced, ok := newParentDir.childMap[newName] if ok { if opts.Flags&linux.RENAME_NOREPLACE != 0 { return linuxerr.EEXIST } replacedDir, ok := replaced.inode.impl.(*directory) if ok { if !renamed.inode.isDir() { return linuxerr.EISDIR } if len(replacedDir.childMap) != 0 { return linuxerr.ENOTEMPTY } } else { if rp.MustBeDir() { return linuxerr.ENOTDIR } if renamed.inode.isDir() { return linuxerr.ENOTDIR } } } else { if renamed.inode.isDir() && newParentDir.inode.nlink.Load() == maxLinks { return linuxerr.EMLINK } } // tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can // only be dead if it was deleted. if newParentDir.dentry.vfsd.IsDead() { return linuxerr.ENOENT } // Linux places this check before some of those above; we do it here for // simplicity, under the assumption that applications are not intentionally // doing noop renames expecting them to succeed where non-noop renames // would fail. if renamed == replaced { return nil } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) var replacedVFSD *vfs.Dentry if replaced != nil { replacedVFSD = &replaced.vfsd } if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } if replaced != nil { newParentDir.removeChildLocked(replaced) if replaced.inode.isDir() { // Remove links for replaced/. and replaced/.. 
replaced.inode.decLinksLocked(ctx) newParentDir.inode.decLinksLocked(ctx) } replaced.inode.decLinksLocked(ctx) } oldParentDir.removeChildLocked(renamed) newParentDir.insertChildLocked(renamed, newName) toDecRef = vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) oldParentDir.inode.touchCMtime() if oldParentDir != newParentDir { if renamed.inode.isDir() { oldParentDir.inode.decLinksLocked(ctx) newParentDir.inode.incLinksLocked() } newParentDir.inode.touchCMtime() } renamed.inode.touchCtime() vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir()) return nil } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() // We need to DecRef outside of fs.mu because forgetting a dead mountpoint // could result in this filesystem being released which acquires fs.mu. var toDecRef []refs.RefCounter defer func() { for _, ref := range toDecRef { ref.DecRef(ctx) } }() defer fs.mu.Unlock() parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." { return linuxerr.EINVAL } if name == ".." { return linuxerr.ENOTEMPTY } child, ok := parentDir.childMap[name] if !ok { return linuxerr.ENOENT } if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { return err } childDir, ok := child.inode.impl.(*directory) if !ok { return linuxerr.ENOTDIR } if len(childDir.childMap) != 0 { return linuxerr.ENOTEMPTY } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } parentDir.removeChildLocked(child) parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) // Remove links for child, child/., and child/.. child.inode.decLinksLocked(ctx) child.inode.decLinksLocked(ctx) parentDir.inode.decLinksLocked(ctx) toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd) parentDir.inode.touchCMtime() return nil } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err } err = d.inode.setStat(ctx, rp.Credentials(), &opts) fs.mu.RUnlock() if err != nil { return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return linux.Statx{}, err } var stat linux.Statx d.inode.statTo(&stat) return stat, nil } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. 
func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() defer fs.mu.RUnlock() if _, err := resolveLocked(ctx, rp); err != nil { return linux.Statfs{}, err } return fs.statFS(), nil } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { // Linux allocates a page to store symlink targets that have length larger // than shortSymlinkLen. Targets are just stored as string here, but simulate // the page accounting for it. See mm/shmem.c:shmem_symlink(). if len(target) >= shortSymlinkLen { if !fs.accountPages(1) { return linuxerr.ENOSPC } } creds := rp.Credentials() child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target, parentDir)) parentDir.insertChildLocked(child, name) return nil }) } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() // We need to DecRef outside of fs.mu because forgetting a dead mountpoint // could result in this filesystem being released which acquires fs.mu. var toDecRef []refs.RefCounter defer func() { for _, ref := range toDecRef { ref.DecRef(ctx) } }() defer fs.mu.Unlock() parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".." { return linuxerr.EISDIR } child, ok := parentDir.childMap[name] if !ok { return linuxerr.ENOENT } if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { return err } if child.inode.isDir() { return linuxerr.EISDIR } if rp.MustBeDir() { return linuxerr.ENOTDIR } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } // Generate inotify events. Note that this must take place before the link // count of the child is decremented, or else the watches may be dropped // before these events are added. vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name) parentDir.removeChildLocked(child) child.inode.decLinksLocked(ctx) toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd) parentDir.inode.touchCMtime() return nil } // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } switch impl := d.inode.impl.(type) { case *socketFile: if impl.ep == nil { return nil, linuxerr.ECONNREFUSED } return impl.ep, nil default: return nil, linuxerr.ECONNREFUSED } } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. 
func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } return d.inode.listXattr(rp.Credentials(), size) } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return "", err } return d.inode.getXattr(rp.Credentials(), &opts) } // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { fs.mu.RLock() d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err } err = d.inode.setXattr(rp.Credentials(), &opts) fs.mu.RUnlock() if err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err } err = d.inode.removeXattr(rp.Credentials(), name) fs.mu.RUnlock() if err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() mnt := vd.Mount() d := vd.Dentry().Impl().(*dentry) for { if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { return vfs.PrependPathAtVFSRootError{} } if mnt != nil && &d.vfsd == mnt.Root() { return nil } parent := d.parent.Load() if parent == nil { if d.name != "" { // This file must have been created by // newUnlinkedRegularFileDescription(). In Linux, // mm/shmem.c:__shmem_file_setup() => // fs/file_table.c:alloc_file_pseudo() sets the created // dentry's dentry_operations to anon_ops, for which d_dname == // simple_dname. fs/d_path.c:simple_dname() defines the // dentry's pathname to be its name, prefixed with "/" and // suffixed with " (deleted)". b.PrependComponent("/" + d.name) b.AppendString(" (deleted)") return vfs.PrependPathSyntheticError{} } return vfs.PrependPathAtNonMountRootError{} } b.PrependComponent(d.name) d = parent } } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return fs.mopts } // IsDescendant implements vfs.FilesystemImpl.IsDescendant. func (fs *filesystem) IsDescendant(vfsroot, vd vfs.VirtualDentry) bool { return genericIsDescendant(vfsroot.Dentry(), vd.Dentry().Impl().(*dentry)) } // adjustPageAcct adjusts the accounting done against filesystem size limit in // case there is any discrepancy between the number of pages reserved vs the // number of pages actually allocated. func (fs *filesystem) adjustPageAcct(reserved, alloced uint64) { if reserved < alloced { panic(fmt.Sprintf("More pages were allocated than the pages reserved: reserved=%d, alloced=%d", reserved, alloced)) } if pagesDiff := reserved - alloced; pagesDiff > 0 { fs.unaccountPages(pagesDiff) } } // accountPagesPartial increases the pagesUsed if tmpfs is mounted with size // option by as much as possible without going over the size mount option. It // returns the number of pages that we were able to account for. 
It returns false // when the maxSizeInPages has been exhausted and no more allocation can be done. // The returned value is guaranteed to be <= pagesInc. If the size mount option is // not set, then pagesInc will be returned. func (fs *filesystem) accountPagesPartial(pagesInc uint64) uint64 { if pagesInc == 0 { return pagesInc } for { pagesUsed := fs.pagesUsed.Load() if fs.maxSizeInPages <= pagesUsed { return 0 } pagesFree := fs.maxSizeInPages - pagesUsed toInc := pagesInc if pagesFree < pagesInc { toInc = pagesFree } if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed+toInc) { return toInc } } } // accountPages increases the pagesUsed in filesystem struct if tmpfs // is mounted with size option. We return a false when the maxSizeInPages // has been exhausted and no more allocation can be done. func (fs *filesystem) accountPages(pagesInc uint64) bool { if pagesInc == 0 { return true // No accounting needed. } for { pagesUsed := fs.pagesUsed.Load() if fs.maxSizeInPages <= pagesUsed { return false } pagesFree := fs.maxSizeInPages - pagesUsed if pagesFree < pagesInc { return false } if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed+pagesInc) { return true } } } // unaccountPages decreases the pagesUsed in filesystem struct if tmpfs // is mounted with size option. func (fs *filesystem) unaccountPages(pagesDec uint64) { if pagesDec == 0 { return } for { pagesUsed := fs.pagesUsed.Load() if pagesUsed < pagesDec { panic(fmt.Sprintf("Deallocating more pages than allocated: fs.pagesUsed = %d, pagesDec = %d", pagesUsed, pagesDec)) } if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed-pagesDec) { break } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/filesystem_mutex.go000066400000000000000000000047001465435605700273310ustar00rootroot00000000000000package tmpfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type filesystemRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var filesystemlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type filesystemlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *filesystemRWMutex) Lock() { locking.AddGLock(filesystemprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *filesystemRWMutex) NestedLock(i filesystemlockNameIndex) { locking.AddGLock(filesystemprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *filesystemRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(filesystemprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *filesystemRWMutex) NestedUnlock(i filesystemlockNameIndex) { m.mu.Unlock() locking.DelGLock(filesystemprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *filesystemRWMutex) RLock() { locking.AddGLock(filesystemprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. 
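// Illustrative aside (not part of the generated mutex code): these wrappers are
// drop-in replacements for sync.RWMutex that additionally report each acquire
// and release to the lock-order validator via locking.AddGLock/DelGLock, so
// lock-ordering violations between lock classes can be flagged in builds where
// the validator is enabled (the *Bypass variants skip this bookkeeping). A
// minimal sketch of caller usage, identical to sync.RWMutex:
//
//	var mu filesystemRWMutex
//	mu.RLock()
//	// ... read-only access to state guarded by mu ...
//	mu.RUnlock()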
// +checklocksignore func (m *filesystemRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(filesystemprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *filesystemRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *filesystemRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *filesystemRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var filesystemprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func filesysteminitLockNames() {} func init() { filesysteminitLockNames() filesystemprefixIndex = locking.NewMutexClass(reflect.TypeOf(filesystemRWMutex{}), filesystemlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/fstree.go000066400000000000000000000036701465435605700252200ustar00rootroot00000000000000package tmpfs import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // We need to define an interface instead of using atomic.Pointer because // the Dentry type gets removed during code generation and the compiler // complains about the unused sync/atomic type. type genericatomicptr interface { Load() *dentry } // IsAncestorDentry returns true if d is an ancestor of d2; that is, d is // either d2's parent or an ancestor of d2's parent. func genericIsAncestorDentry(d, d2 *dentry) bool { for d2 != nil { parent := d2.parent.Load() if parent == d { return true } if parent == d2 { return false } d2 = parent } return false } // IsDescendant returns true if vd is a descendant of vfsroot or if vd and // vfsroot are the same dentry. func genericIsDescendant(vfsroot *vfs.Dentry, d *dentry) bool { for d != nil && &d.vfsd != vfsroot { d = d.parent.Load() } return d != nil } // ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d. func genericParentOrSelf(d *dentry) *dentry { if parent := d.parent.Load(); parent != nil { return parent } return d } // PrependPath is a generic implementation of FilesystemImpl.PrependPath(). func genericPrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *dentry, b *fspath.Builder) error { for { if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { return vfs.PrependPathAtVFSRootError{} } if mnt != nil && &d.vfsd == mnt.Root() { return nil } parent := d.parent.Load() if parent == nil { return vfs.PrependPathAtNonMountRootError{} } b.PrependComponent(d.name) d = parent } } // DebugPathname returns a pathname to d relative to its filesystem root. // DebugPathname does not correspond to any Linux function; it's used to // generate dentry pathnames for debugging. func genericDebugPathname(d *dentry) string { var b fspath.Builder _ = genericPrependPath(vfs.VirtualDentry{}, nil, d, &b) return b.String() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/inode_mutex.go000066400000000000000000000031141465435605700262410ustar00rootroot00000000000000package tmpfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type inodeMutex struct { mu sync.Mutex } var inodeprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var inodelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. 
// Values are specified using the "consts" field of go_template_instance. type inodelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *inodeMutex) Lock() { locking.AddGLock(inodeprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *inodeMutex) NestedLock(i inodelockNameIndex) { locking.AddGLock(inodeprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *inodeMutex) Unlock() { locking.DelGLock(inodeprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *inodeMutex) NestedUnlock(i inodelockNameIndex) { locking.DelGLock(inodeprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func inodeinitLockNames() {} func init() { inodeinitLockNames() inodeprefixIndex = locking.NewMutexClass(reflect.TypeOf(inodeMutex{}), inodelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/inode_refs.go000066400000000000000000000100661465435605700260420ustar00rootroot00000000000000package tmpfs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const inodeenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var inodeobj *inode // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type inodeRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *inodeRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *inodeRefs) RefType() string { return fmt.Sprintf("%T", inodeobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *inodeRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *inodeRefs) LogRefs() bool { return inodeenableLogging } // ReadRefs returns the current number of references. 
The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *inodeRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *inodeRefs) IncRef() { v := r.refCount.Add(1) if inodeenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *inodeRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if inodeenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *inodeRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if inodeenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *inodeRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/iter_mutex.go000066400000000000000000000030711465435605700261100ustar00rootroot00000000000000package tmpfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type iterMutex struct { mu sync.Mutex } var iterprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var iterlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type iterlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *iterMutex) Lock() { locking.AddGLock(iterprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *iterMutex) NestedLock(i iterlockNameIndex) { locking.AddGLock(iterprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *iterMutex) Unlock() { locking.DelGLock(iterprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *iterMutex) NestedUnlock(i iterlockNameIndex) { locking.DelGLock(iterprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. 
func iterinitLockNames() {} func init() { iterinitLockNames() iterprefixIndex = locking.NewMutexClass(reflect.TypeOf(iterMutex{}), iterlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/named_pipe.go000066400000000000000000000025121465435605700260230ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ) // +stateify savable type namedPipe struct { inode inode pipe *pipe.VFSPipe } // Preconditions: // - fs.mu must be locked. // - rp.Mount().CheckBeginWrite() has been called successfully. func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *inode { file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize)} file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode, parentDir) file.inode.nlink = atomicbitops.FromUint32(1) // Only the parent has a link. return &file.inode } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/pages_used_mutex.go000066400000000000000000000032301465435605700272610ustar00rootroot00000000000000package tmpfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type pagesUsedMutex struct { mu sync.Mutex } var pagesUsedprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var pagesUsedlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type pagesUsedlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *pagesUsedMutex) Lock() { locking.AddGLock(pagesUsedprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *pagesUsedMutex) NestedLock(i pagesUsedlockNameIndex) { locking.AddGLock(pagesUsedprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *pagesUsedMutex) Unlock() { locking.DelGLock(pagesUsedprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *pagesUsedMutex) NestedUnlock(i pagesUsedlockNameIndex) { locking.DelGLock(pagesUsedprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. 
func pagesUsedinitLockNames() {} func init() { pagesUsedinitLockNames() pagesUsedprefixIndex = locking.NewMutexClass(reflect.TypeOf(pagesUsedMutex{}), pagesUsedlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/regular_file.go000066400000000000000000000664471465435605700264030ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tmpfs import ( "fmt" "io" "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/fsutil" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // regularFile is a regular (=S_IFREG) tmpfs file. // // +stateify savable type regularFile struct { inode inode // memoryUsageKind is the memory accounting category under which pages backing // this regularFile's contents are accounted. memoryUsageKind usage.MemoryKind // mapsMu protects mappings. mapsMu sync.Mutex `state:"nosave"` // mappings tracks mappings of the file into memmap.MappingSpaces. // // Protected by mapsMu. mappings memmap.MappingSet // writableMappingPages tracks how many pages of virtual memory are mapped // as potentially writable from this file. If a page has multiple mappings, // each mapping is counted separately. // // This counter is susceptible to overflow as we can potentially count // mappings from many VMAs. We count pages rather than bytes to slightly // mitigate this. // // Protected by mapsMu. writableMappingPages uint64 // dataMu protects the fields below. dataMu sync.RWMutex `state:"nosave"` // data maps offsets into the file to offsets into memFile that store // the file's data. // // Protected by dataMu. data fsutil.FileRangeSet // seals represents file seals on this inode. // // Protected by dataMu. seals uint32 // size is the size of data. // // Protected by both dataMu and inode.mu; reading it requires holding // either mutex, while writing requires holding both AND using atomics. // Readers that do not require consistency (like Stat) may read the // value atomically without holding either lock. size atomicbitops.Uint64 } func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *inode { file := ®ularFile{ memoryUsageKind: fs.usage, seals: linux.F_SEAL_SEAL, } file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode, parentDir) file.inode.nlink = atomicbitops.FromUint32(1) // from parent directory return &file.inode } // newUnlinkedRegularFileDescription creates a regular file on the tmpfs // filesystem represented by mount and returns an FD representing that file. 
// The new file is not reachable by path traversal from any other file. // // newUnlinkedRegularFileDescription is analogous to Linux's // mm/shmem.c:__shmem_file_setup(). // // Preconditions: mount must be a tmpfs mount. func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) { fs, ok := mount.Filesystem().Impl().(*filesystem) if !ok { panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount") } inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777, nil /* parentDir */) d := fs.newDentry(inode) defer d.DecRef(ctx) d.name = name fd := ®ularFileFD{} fd.Init(&inode.locks) flags := uint32(linux.O_RDWR) if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return fd, nil } // NewZeroFile creates a new regular file and file description as for // mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is // initially (implicitly) filled with zeroes. // // Preconditions: mount must be a tmpfs mount. func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) { // Compare mm/shmem.c:shmem_zero_setup(). fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero") if err != nil { return nil, err } rf := fd.inode().impl.(*regularFile) rf.memoryUsageKind = usage.Anonymous rf.size.Store(size) return &fd.vfsfd, err } // NewMemfd creates a new regular file and file description as for // memfd_create. // // Preconditions: mount must be a tmpfs mount. func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name) if err != nil { return nil, err } if allowSeals { fd.inode().impl.(*regularFile).seals = 0 } return &fd.vfsfd, nil } // truncate grows or shrinks the file to the given size. It returns true if the // file size was updated. func (rf *regularFile) truncate(newSize uint64) (bool, error) { rf.inode.mu.Lock() defer rf.inode.mu.Unlock() return rf.truncateLocked(newSize) } // Preconditions: // - rf.inode.mu must be held. // - rf.dataMu must be locked for writing. // - newSize > rf.size. func (rf *regularFile) growLocked(newSize uint64) error { // Can we grow the file? if rf.seals&linux.F_SEAL_GROW != 0 { return linuxerr.EPERM } rf.size.Store(newSize) return nil } // Preconditions: rf.inode.mu must be held. func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) { oldSize := rf.size.RacyLoad() if newSize == oldSize { // Nothing to do. return false, nil } // Need to hold inode.mu and dataMu while modifying size. rf.dataMu.Lock() if newSize > oldSize { err := rf.growLocked(newSize) rf.dataMu.Unlock() return err == nil, err } // We are shrinking the file. First check if this is allowed. if rf.seals&linux.F_SEAL_SHRINK != 0 { rf.dataMu.Unlock() return false, linuxerr.EPERM } rf.size.Store(newSize) rf.dataMu.Unlock() // Invalidate past translations of truncated pages. oldpgend := offsetPageEnd(int64(oldSize)) newpgend := offsetPageEnd(int64(newSize)) if newpgend < oldpgend { rf.mapsMu.Lock() rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ // Compare Linux's mm/shmem.c:shmem_setattr() => // mm/memory.c:unmap_mapping_range(evencows=1). 
InvalidatePrivate: true, }) rf.mapsMu.Unlock() } // We are now guaranteed that there are no translations of truncated pages, // and can remove them. rf.dataMu.Lock() decPages := rf.data.Truncate(newSize, rf.inode.fs.mf) rf.dataMu.Unlock() rf.inode.fs.unaccountPages(decPages) return true, nil } // AddMapping implements memmap.Mappable.AddMapping. func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { rf.mapsMu.Lock() defer rf.mapsMu.Unlock() rf.dataMu.RLock() defer rf.dataMu.RUnlock() // Reject writable mapping if F_SEAL_WRITE is set. if rf.seals&linux.F_SEAL_WRITE != 0 && writable { return linuxerr.EPERM } rf.mappings.AddMapping(ms, ar, offset, writable) if writable { pagesBefore := rf.writableMappingPages // ar is guaranteed to be page aligned per memmap.Mappable. rf.writableMappingPages += uint64(ar.Length() / hostarch.PageSize) if rf.writableMappingPages < pagesBefore { panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages)) } } return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { rf.mapsMu.Lock() defer rf.mapsMu.Unlock() rf.mappings.RemoveMapping(ms, ar, offset, writable) if writable { pagesBefore := rf.writableMappingPages // ar is guaranteed to be page aligned per memmap.Mappable. rf.writableMappingPages -= uint64(ar.Length() / hostarch.PageSize) if rf.writableMappingPages > pagesBefore { panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages)) } } } // CopyMapping implements memmap.Mappable.CopyMapping. func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return rf.AddMapping(ctx, ms, dstAR, offset, writable) } // Translate implements memmap.Mappable.Translate. func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { memCgID := pgalloc.MemoryCgroupIDFromContext(ctx) rf.dataMu.Lock() defer rf.dataMu.Unlock() // Constrain translations to f.attr.Size (rounded up) to prevent // translation to pages that may be concurrently truncated. pgend := offsetPageEnd(int64(rf.size.RacyLoad())) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { return nil, &memmap.BusError{io.EOF} } beyondEOF = true required.End = pgend } if optional.End > pgend { optional.End = pgend } pagesToFill := rf.data.PagesToFill(required, optional) if !rf.inode.fs.accountPages(pagesToFill) { // If we can not accommodate pagesToFill pages, then retry with just // the required range. Because optional may be larger than required. // Only error out if even the required range can not be allocated for. pagesToFill = rf.data.PagesToFill(required, required) if !rf.inode.fs.accountPages(pagesToFill) { return nil, &memmap.BusError{linuxerr.ENOSPC} } optional = required } pagesAlloced, cerr := rf.data.Fill(ctx, required, optional, rf.size.RacyLoad(), rf.inode.fs.mf, pgalloc.AllocOpts{ Kind: rf.memoryUsageKind, MemCgID: memCgID, }, nil) // rf.data.Fill() may fail mid-way. We still want to account any pages that // were allocated, irrespective of an error. 
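// Worked example of the required/optional fallback above, assuming 4 KiB
// pages: if optional spans 8 unallocated pages but only 3 pages remain under
// the size= limit, the first accountPages call fails; Translate then retries
// with just required (say 2 unallocated pages), accountPages(2) succeeds, and
// optional is shrunk to required. ENOSPC is returned only if even the
// required range cannot be accounted for.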
rf.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced) var ts []memmap.Translation var translatedEnd uint64 for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { segMR := seg.Range().Intersect(optional) ts = append(ts, memmap.Translation{ Source: segMR, File: rf.inode.fs.mf, Offset: seg.FileRangeOf(segMR).Start, Perms: hostarch.AnyAccess, }) translatedEnd = segMR.End } // Don't return the error returned by f.data.Fill if it occurred outside of // required. if translatedEnd < required.End && cerr != nil { return ts, &memmap.BusError{cerr} } if beyondEOF { return ts, &memmap.BusError{io.EOF} } return ts, nil } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (*regularFile) InvalidateUnsavable(context.Context) error { return nil } // +stateify savable type regularFileFD struct { fileDescription // off is the file offset. off is accessed using atomic memory operations. // offMu serializes operations that may mutate off. off int64 offMu sync.Mutex `state:"nosave"` } // Release implements vfs.FileDescriptionImpl.Release. func (fd *regularFileFD) Release(context.Context) { // noop } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { f := fd.inode().impl.(*regularFile) memCgID := pgalloc.MemoryCgroupIDFromContext(ctx) // To be consistent with Linux, inode.mu must be locked throughout. f.inode.mu.Lock() defer f.inode.mu.Unlock() end := offset + length pgEnd, ok := hostarch.PageRoundUp(end) if !ok { return linuxerr.EFBIG } // Allocate in chunks for the following reasons: // 1. Size limit may permit really large fallocate, which can take a long // time to execute on the host. This can cause watchdog to timeout and // crash the system. Watchdog needs petting. // 2. Linux allocates folios iteratively while checking for interrupts. In // gVisor, we need to manually check for interrupts between chunks. const chunkSize = 4 << 30 // 4 GiB for curPgStart := hostarch.PageRoundDown(offset); curPgStart < pgEnd; { curPgEnd := pgEnd newSize := end if curPgEnd-curPgStart > chunkSize { curPgEnd = curPgStart + chunkSize newSize = curPgEnd } required := memmap.MappableRange{Start: curPgStart, End: curPgEnd} if err := f.allocateLocked(ctx, mode, newSize, required, memCgID); err != nil { return err } // This loop can take a long time to process, so periodically check for // interrupts. This also pets the watchdog. if ctx.Interrupted() { return linuxerr.EINTR } // Advance curPgStart. curPgStart = curPgEnd } return nil } // Preconditions: // - rf.inode.mu is locked. // - required must be page-aligned. // - required.Start < newSize <= required.End. func (rf *regularFile) allocateLocked(ctx context.Context, mode, newSize uint64, required memmap.MappableRange, memCgID uint32) error { rf.dataMu.Lock() defer rf.dataMu.Unlock() // We must allocate pages in the range specified by offset and length. // Even if newSize <= oldSize, there might not be actual memory backing this // range, so any gaps must be filled by calling f.data.Fill(). // "After a successful call, subsequent writes into the range // specified by offset and len are guaranteed not to fail because of // lack of disk space." 
- fallocate(2) pagesToFill := rf.data.PagesToFill(required, required) if !rf.inode.fs.accountPages(pagesToFill) { return linuxerr.ENOSPC } // Given our definitions in pgalloc, fallocate(2) semantics imply that pages // in the MemoryFile must be committed, in addition to being allocated. allocMode := pgalloc.AllocateAndCommit if !rf.inode.fs.mf.IsDiskBacked() { // Upgrade to AllocateAndWritePopulate for memory(shmem)-backed files. We // take a more aggressive approach in populating pages for memory-backed // MemoryFiles. shmem pages are subject to swap rather than disk writeback. // They are not likely to be swapped before they are written to. Hence it // is beneficial to populate (in addition to commit) shmem pages to avoid // faulting page-by-page when these pages are written to in the future. allocMode = pgalloc.AllocateAndWritePopulate } pagesAlloced, err := rf.data.Fill(ctx, required, required, newSize, rf.inode.fs.mf, pgalloc.AllocOpts{ Kind: rf.memoryUsageKind, MemCgID: memCgID, Mode: allocMode, }, nil /* r */) // f.data.Fill() may fail mid-way. We still want to account any pages that // were allocated, irrespective of an error. rf.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced) if err != nil && err != io.EOF { return err } oldSize := rf.size.Load() if oldSize >= newSize { return nil } return rf.growLocked(newSize) } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { start := fsmetric.StartReadWait() defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start) fsmetric.TmpfsReads.Increment() if offset < 0 { return 0, linuxerr.EINVAL } // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since // all state is in-memory. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { return 0, nil } f := fd.inode().impl.(*regularFile) // memCgID can be 0 here because regularFileReadWriter.ReadToBlocks() never // allocates from pgalloc. rw := getRegularFileReadWriter(f, offset, 0) n, err := dst.CopyOutFrom(ctx, rw) putRegularFileReadWriter(rw) fd.inode().touchAtime(fd.vfsfd.Mount()) return n, err } // Read implements vfs.FileDescriptionImpl.Read. func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { fd.offMu.Lock() n, err := fd.PRead(ctx, dst, fd.off, opts) fd.off += n fd.offMu.Unlock() return n, err } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, _, err := fd.pwrite(ctx, src, offset, opts) return n, err } // pwrite returns the number of bytes written, final offset and error. The // final offset should be ignored by PWrite. func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { return 0, offset, linuxerr.EINVAL } // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since // all state is in-memory. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. 
if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { return 0, offset, linuxerr.EOPNOTSUPP } srclen := src.NumBytes() if srclen == 0 { return 0, offset, nil } f := fd.inode().impl.(*regularFile) f.inode.mu.Lock() defer f.inode.mu.Unlock() // If the file is opened with O_APPEND, update offset to file size. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { // Locking f.inode.mu is sufficient for reading f.size. offset = int64(f.size.RacyLoad()) } end := offset + srclen if end < offset { // Overflow. return 0, offset, linuxerr.EINVAL } srclen, err = vfs.CheckLimit(ctx, offset, srclen) if err != nil { return 0, offset, err } src = src.TakeFirst64(srclen) // Perform the write. rw := getRegularFileReadWriter(f, offset, pgalloc.MemoryCgroupIDFromContext(ctx)) n, err := src.CopyInTo(ctx, rw) f.inode.touchCMtimeLocked() for { old := f.inode.mode.Load() new := vfs.ClearSUIDAndSGID(old) if swapped := f.inode.mode.CompareAndSwap(old, new); swapped { break } } putRegularFileReadWriter(rw) return n, n + offset, err } // Write implements vfs.FileDescriptionImpl.Write. func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.offMu.Lock() n, off, err := fd.pwrite(ctx, src, fd.off, opts) fd.off = off fd.offMu.Unlock() return n, err } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.offMu.Lock() defer fd.offMu.Unlock() switch whence { case linux.SEEK_SET: // use offset as specified case linux.SEEK_CUR: offset += fd.off case linux.SEEK_END: offset += int64(fd.inode().impl.(*regularFile).size.Load()) default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } fd.off = offset return offset, nil } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { file := fd.inode().impl.(*regularFile) opts.SentryOwnedContent = true return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts) } // offsetPageEnd returns the file offset rounded up to the nearest // page boundary. offsetPageEnd panics if rounding up causes overflow, // which shouldn't be possible given that offset is an int64. func offsetPageEnd(offset int64) uint64 { end, ok := hostarch.Addr(offset).RoundUp() if !ok { panic("impossible overflow") } return uint64(end) } // regularFileReadWriter implements safemem.Reader and Safemem.Writer. type regularFileReadWriter struct { file *regularFile // Offset into the file to read/write at. Note that this may be // different from the FD offset if PRead/PWrite is used. off uint64 // memCgID is the memory cgroup ID used for accounting the allocated // pages. memCgID uint32 } var regularFileReadWriterPool = sync.Pool{ New: func() any { return ®ularFileReadWriter{} }, } func getRegularFileReadWriter(file *regularFile, offset int64, memCgID uint32) *regularFileReadWriter { rw := regularFileReadWriterPool.Get().(*regularFileReadWriter) rw.file = file rw.off = uint64(offset) rw.memCgID = memCgID return rw } func putRegularFileReadWriter(rw *regularFileReadWriter) { rw.file = nil regularFileReadWriterPool.Put(rw) } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { rw.file.dataMu.RLock() defer rw.file.dataMu.RUnlock() size := rw.file.size.RacyLoad() // Compute the range to read (limited by file size and overflow-checked). 
if rw.off >= size { return 0, io.EOF } end := size if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { end = rend } var done uint64 seg, gap := rw.file.data.Find(uint64(rw.off)) for rw.off < end { mr := memmap.MappableRange{uint64(rw.off), uint64(end)} switch { case seg.Ok(): // Get internal mappings. ims, err := rw.file.inode.fs.mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read) if err != nil { return done, err } // Copy from internal mappings. n, err := safemem.CopySeq(dsts, ims) done += n rw.off += uint64(n) dsts = dsts.DropFirst64(n) if err != nil { return done, err } // Continue. seg, gap = seg.NextNonEmpty() case gap.Ok(): // Tmpfs holes are zero-filled. gapmr := gap.Range().Intersect(mr) dst := dsts.TakeFirst64(gapmr.Length()) n, err := safemem.ZeroSeq(dst) done += n rw.off += uint64(n) dsts = dsts.DropFirst64(n) if err != nil { return done, err } // Continue. seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} } } return done, nil } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. // // Preconditions: rw.file.inode.mu must be held. func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { // Hold dataMu so we can modify size. rw.file.dataMu.Lock() defer rw.file.dataMu.Unlock() // Compute the range to write (overflow-checked). end := rw.off + srcs.NumBytes() if end <= rw.off { end = math.MaxInt64 } // Check if seals prevent either file growth or all writes. switch { case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed return 0, linuxerr.EPERM case end > rw.file.size.RacyLoad() && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed // When growth is sealed, Linux effectively allows writes which would // normally grow the file to partially succeed up to the current EOF, // rounded down to the page boundary before the EOF. // // This happens because writes (and thus the growth check) for tmpfs // files proceed page-by-page on Linux, and the final write to the page // containing EOF fails, resulting in a partial write up to the start of // that page. // // To emulate this behaviour, artificially truncate the write to the // start of the page containing the current EOF. // // See Linux, mm/filemap.c:generic_perform_write() and // mm/shmem.c:shmem_write_begin(). if pgstart := uint64(hostarch.Addr(rw.file.size.RacyLoad()).RoundDown()); end > pgstart { end = pgstart } if end <= rw.off { // Truncation would result in no data being written. return 0, linuxerr.EPERM } } // Page-aligned mr for when we need to allocate memory. RoundUp can't // overflow since end is an int64. pgstartaddr := hostarch.Addr(rw.off).RoundDown() pgendaddr, _ := hostarch.Addr(end).RoundUp() pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)} var ( done uint64 retErr error ) seg, gap := rw.file.data.Find(uint64(rw.off)) for rw.off < end { mr := memmap.MappableRange{uint64(rw.off), uint64(end)} switch { case seg.Ok(): n, err := rw.writeToMF(seg.FileRangeOf(seg.Range().Intersect(mr)), srcs) done += n rw.off += uint64(n) srcs = srcs.DropFirst64(n) if err != nil { retErr = err goto exitLoop } // Continue. seg, gap = seg.NextNonEmpty() case gap.Ok(): // Allocate memory for the write. 
gapMR := gap.Range().Intersect(pgMR) pagesToFill := gapMR.Length() / hostarch.PageSize pagesReserved := rw.file.inode.fs.accountPagesPartial(pagesToFill) if pagesReserved == 0 { if done == 0 { retErr = linuxerr.ENOSPC goto exitLoop } retErr = nil goto exitLoop } gapMR.End = gapMR.Start + (hostarch.PageSize * pagesReserved) allocMode := pgalloc.AllocateAndWritePopulate if rw.file.inode.fs.mf.IsDiskBacked() { // Don't populate pages for disk-backed files. Benchmarking showed that // disk-backed pages are likely to be written back to disk before we // can write to them. The pages fault again on write anyways. In total, // prepopulating disk-backed pages deteriorates performance as it fails // to eliminate future page faults and we also additionally incur // useless disk writebacks. allocMode = pgalloc.AllocateCallerIndirectCommit } fr, err := rw.file.inode.fs.mf.Allocate(gapMR.Length(), pgalloc.AllocOpts{ Kind: rw.file.memoryUsageKind, MemCgID: rw.memCgID, Mode: allocMode, }) if err != nil { retErr = err rw.file.inode.fs.unaccountPages(pagesReserved) goto exitLoop } // Write to that memory as usual. seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{} default: panic("unreachable") } } exitLoop: // If the write ends beyond the file's previous size, it causes the // file to grow. if rw.off > rw.file.size.RacyLoad() { rw.file.size.Store(rw.off) } return done, retErr } func (rw *regularFileReadWriter) writeToMF(fr memmap.FileRange, srcs safemem.BlockSeq) (uint64, error) { if rw.file.inode.fs.mf.IsDiskBacked() { // Disk-backed files are not prepopulated. The safemem.CopySeq() approach // used below incurs a lot of page faults without page prepopulation, which // causes a lot of context switching. Use write(2) host syscall instead, // which makes one context switch and faults all the pages that are touched // during the write. return hostfd.Pwritev2( int32(rw.file.inode.fs.mf.FD()), // fd srcs.TakeFirst64(fr.Length()), // srcs int64(fr.Start), // offset 0, // flags ) } // Get internal mappings. ims, err := rw.file.inode.fs.mf.MapInternal(fr, hostarch.Write) if err != nil { return 0, err } // Copy to internal mappings. return safemem.CopySeq(ims, srcs) } // GetSeals returns the current set of seals on a memfd inode. func GetSeals(fd *vfs.FileDescription) (uint32, error) { f, ok := fd.Impl().(*regularFileFD) if !ok { return 0, linuxerr.EINVAL } rf := f.inode().impl.(*regularFile) rf.dataMu.RLock() defer rf.dataMu.RUnlock() return rf.seals, nil } // AddSeals adds new file seals to a memfd inode. func AddSeals(fd *vfs.FileDescription, val uint32) error { f, ok := fd.Impl().(*regularFileFD) if !ok { return linuxerr.EINVAL } rf := f.inode().impl.(*regularFile) rf.mapsMu.Lock() defer rf.mapsMu.Unlock() rf.dataMu.Lock() defer rf.dataMu.Unlock() if rf.seals&linux.F_SEAL_SEAL != 0 { // Seal applied which prevents addition of any new seals. return linuxerr.EPERM } // F_SEAL_WRITE can only be added if there are no active writable maps. if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { if rf.writableMappingPages > 0 { return linuxerr.EBUSY } } // Seals can only be added, never removed. rf.seals |= val return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/save_restore.go000066400000000000000000000045101465435605700264230ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tmpfs import ( goContext "context" "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // saveMf is called by stateify. func (fs *filesystem) saveMf() string { if !fs.mf.IsSavable() { panic(fmt.Sprintf("Can't save tmpfs filesystem because its MemoryFile is not savable: %v", fs.mf)) } return fs.mf.RestoreID() } // loadMf is called by stateify. func (fs *filesystem) loadMf(ctx goContext.Context, restoreID string) { if restoreID == "" { fs.mf = pgalloc.MemoryFileFromContext(ctx) return } mfmap := pgalloc.MemoryFileMapFromContext(ctx) if mfmap == nil { panic("CtxMemoryFileMap was not provided") } mf, ok := mfmap[restoreID] if !ok { panic(fmt.Sprintf("Memory file for %q not found in CtxMemoryFileMap", restoreID)) } fs.mf = mf } // saveParent is called by stateify. func (d *dentry) saveParent() *dentry { return d.parent.Load() } // loadParent is called by stateify. func (d *dentry) loadParent(_ goContext.Context, parent *dentry) { d.parent.Store(parent) } // PrepareSave implements vfs.FilesystemImplSaveRestoreExtension.PrepareSave. func (fs *filesystem) PrepareSave(ctx context.Context) error { restoreID := fs.mf.RestoreID() if restoreID == "" { return nil } mfmap := pgalloc.MemoryFileMapFromContext(ctx) if mfmap == nil { return fmt.Errorf("CtxMemoryFileMap was not provided") } if _, ok := mfmap[restoreID]; ok { return fmt.Errorf("memory file for %q already exists in CtxMemoryFileMap", restoreID) } mfmap[restoreID] = fs.mf return nil } // CompleteRestore implements // vfs.FilesystemImplSaveRestoreExtension.CompleteRestore. func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRestoreOptions) error { return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/socket_file.go000066400000000000000000000023641465435605700262160ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) // socketFile is a socket (=S_IFSOCK) tmpfs file. 
// // +stateify savable type socketFile struct { inode inode ep transport.BoundEndpoint } func (fs *filesystem) newSocketFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, ep transport.BoundEndpoint, parentDir *directory) *inode { file := &socketFile{ep: ep} file.inode.init(file, fs, kuid, kgid, mode, parentDir) file.inode.nlink = atomicbitops.FromUint32(1) // from parent directory return &file.inode } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/symlink.go000066400000000000000000000023641465435605700254150ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // +stateify savable type symlink struct { inode inode target string // immutable } func (fs *filesystem) newSymlink(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, target string, parentDir *directory) *inode { link := &symlink{ target: target, } link.inode.init(link, fs, kuid, kgid, linux.S_IFLNK|mode, parentDir) link.inode.nlink = atomicbitops.FromUint32(1) // from parent directory return &link.inode } // O_PATH is unimplemented, so there's no way to get a FileDescription // representing a symlink yet. golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/tmpfs.go000066400000000000000000000723211465435605700250600ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tmpfs provides an in-memory filesystem whose contents are // application-mutable, consistent with Linux's tmpfs. // // Lock order: // // filesystem.mu // inode.mu // regularFileFD.offMu // *** "memmap.Mappable locks" below this point // regularFile.mapsMu // *** "memmap.Mappable locks taken by Translate" below this point // regularFile.dataMu // fs.pagesUsedMu // directory.iterMu package tmpfs import ( "fmt" "math" "strconv" "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" ) // Name is the default filesystem name. const Name = "tmpfs" // FilesystemType implements vfs.FilesystemType. 
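// A minimal sketch of constructing a tmpfs instance directly through
// GetFilesystem (defined below); ctx, vfsObj, and creds are assumed to come
// from the caller's environment, and the mount data string is just an
// illustrative combination of the options parsed there:
//
//	fsType := FilesystemType{}
//	vfsfs, rootDentry, err := fsType.GetFilesystem(ctx, vfsObj, creds, "",
//		vfs.GetFilesystemOptions{Data: "mode=01777,uid=0,gid=0,size=16777216"})
//
// In practice vfsfs and rootDentry are handed to the sentry's mount machinery
// rather than used directly.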
// // +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. // // +stateify savable type filesystem struct { vfsfs vfs.Filesystem // mf is used to allocate memory that stores regular file contents. mf is // immutable, except it is changed during restore. mf *pgalloc.MemoryFile `state:".(string)"` // clock is a realtime clock used to set timestamps in file operations. clock time.Clock // devMinor is the filesystem's minor device number. devMinor is immutable. devMinor uint32 // mopts contains the tmpfs-specific mount options passed to this // filesystem. Immutable. mopts string // usage is the memory accounting category under which pages backing // files in this filesystem are accounted. usage usage.MemoryKind // mu serializes changes to the Dentry tree. mu filesystemRWMutex `state:"nosave"` nextInoMinusOne atomicbitops.Uint64 // accessed using atomic memory operations root *dentry maxFilenameLen int // maxSizeInPages is the maximum permissible size for the tmpfs in terms of pages. // This field is immutable. maxSizeInPages uint64 // pagesUsed is the number of pages used by this filesystem. pagesUsed atomicbitops.Uint64 // allowXattrPrefix is a set of xattr namespace prefixes that this // tmpfs mount will allow. It is immutable. allowXattrPrefix map[string]struct{} } // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // Release implements vfs.FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} // FilesystemOpts is used to pass configuration data to tmpfs. // // +stateify savable type FilesystemOpts struct { // RootFileType is the FileType of the filesystem root. Valid values // are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR. RootFileType uint16 // RootSymlinkTarget is the target of the root symlink. Only valid if // RootFileType == S_IFLNK. RootSymlinkTarget string // FilesystemType allows setting a different FilesystemType for this // tmpfs filesystem. This allows tmpfs to "impersonate" other // filesystems, like ramdiskfs and cgroupfs. FilesystemType vfs.FilesystemType // Usage is the memory accounting category under which pages backing files in // the filesystem are accounted. Usage *usage.MemoryKind // MaxFilenameLen is the maximum filename length allowed by the tmpfs. MaxFilenameLen int // MemoryFile is the memory file that will be used to store file data. If // this is nil, then MemoryFileFromContext() is used. MemoryFile *pgalloc.MemoryFile // DisableDefaultSizeLimit disables setting a default size limit. In Linux, // SB_KERNMOUNT has this effect on tmpfs mounts; see mm/shmem.c:shmem_fill_super(). DisableDefaultSizeLimit bool // AllowXattrPrefix is a set of xattr namespace prefixes that this // tmpfs mount will allow. AllowXattrPrefix []string } // Default size limit mount option. It is immutable after initialization. var defaultSizeLimit uint64 // SetDefaultSizeLimit configures the size limit to be used for tmpfs mounts // that do not specify a size= mount option. This must be called only once, // before any tmpfs filesystems are created. func SetDefaultSizeLimit(sizeLimit uint64) { defaultSizeLimit = sizeLimit } func getDefaultSizeLimit(disable bool) uint64 { if disable || defaultSizeLimit == 0 { // The size limit is used to populate statfs(2) results. If Linux tmpfs is // mounted with no size option, then statfs(2) returns f_blocks == f_bfree // == f_bavail == 0. However, many applications treat this as having a size // limit of 0. 
To work around this, return a very large but non-zero size // limit, chosen to ensure that it does not overflow int64. return math.MaxInt64 } return defaultSizeLimit } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mf := pgalloc.MemoryFileFromContext(ctx) if mf == nil { panic("CtxMemoryFile returned nil") } rootFileType := uint16(linux.S_IFDIR) disableDefaultSizeLimit := false newFSType := vfs.FilesystemType(&fstype) // By default we support only "trusted" and "user" namespaces. Linux // also supports "security" and (if configured) POSIX ACL namespaces // "system.posix_acl_access" and "system.posix_acl_default". allowXattrPrefix := map[string]struct{}{ linux.XATTR_TRUSTED_PREFIX: struct{}{}, linux.XATTR_USER_PREFIX: struct{}{}, // The "security" namespace is allowed, but it always returns an error. linux.XATTR_SECURITY_PREFIX: struct{}{}, } tmpfsOpts, tmpfsOptsOk := opts.InternalData.(FilesystemOpts) if tmpfsOptsOk { if tmpfsOpts.RootFileType != 0 { rootFileType = tmpfsOpts.RootFileType } if tmpfsOpts.FilesystemType != nil { newFSType = tmpfsOpts.FilesystemType } disableDefaultSizeLimit = tmpfsOpts.DisableDefaultSizeLimit if tmpfsOpts.MemoryFile != nil { mf = tmpfsOpts.MemoryFile } for _, xattr := range tmpfsOpts.AllowXattrPrefix { allowXattrPrefix[xattr] = struct{}{} } } mopts := vfs.GenericParseMountOptions(opts.Data) rootMode := linux.FileMode(0777) if rootFileType == linux.S_IFDIR { rootMode = 01777 } modeStr, ok := mopts["mode"] if ok { delete(mopts, "mode") mode, err := strconv.ParseUint(modeStr, 8, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr) return nil, nil, linuxerr.EINVAL } rootMode = linux.FileMode(mode & 07777) } rootKUID := creds.EffectiveKUID uidStr, ok := mopts["uid"] if ok { delete(mopts, "uid") uid, err := strconv.ParseUint(uidStr, 10, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr) return nil, nil, linuxerr.EINVAL } kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) if !kuid.Ok() { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) return nil, nil, linuxerr.EINVAL } rootKUID = kuid } rootKGID := creds.EffectiveKGID gidStr, ok := mopts["gid"] if ok { delete(mopts, "gid") gid, err := strconv.ParseUint(gidStr, 10, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr) return nil, nil, linuxerr.EINVAL } kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) if !kgid.Ok() { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) return nil, nil, linuxerr.EINVAL } rootKGID = kgid } maxSizeInPages := getDefaultSizeLimit(disableDefaultSizeLimit) / hostarch.PageSize maxSizeStr, ok := mopts["size"] if ok { delete(mopts, "size") maxSizeInBytes, err := parseSize(maxSizeStr) if err != nil { ctx.Debugf("tmpfs.FilesystemType.GetFilesystem: parseSize() failed: %v", err) return nil, nil, linuxerr.EINVAL } // Convert size in bytes to nearest Page Size bytes // as Linux allocates memory in terms of Page size. 
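// For example, assuming 4 KiB pages: size=1048576 (1 MiB) yields
// maxSizeInPages = 256, while size=1048577 rounds up to 257 pages, since
// tmpfs usage is accounted in whole pages.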
maxSizeInPages, ok = hostarch.ToPagesRoundUp(maxSizeInBytes) if !ok { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: Pages RoundUp Overflow error: %q", ok) return nil, nil, linuxerr.EINVAL } } if len(mopts) != 0 { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) return nil, nil, linuxerr.EINVAL } devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } clock := time.RealtimeClockFromContext(ctx) memUsage := usage.Tmpfs if tmpfsOpts.Usage != nil { memUsage = *tmpfsOpts.Usage } fs := filesystem{ mf: mf, clock: clock, devMinor: devMinor, mopts: opts.Data, usage: memUsage, maxFilenameLen: linux.NAME_MAX, maxSizeInPages: maxSizeInPages, allowXattrPrefix: allowXattrPrefix, } fs.vfsfs.Init(vfsObj, newFSType, &fs) if tmpfsOptsOk && tmpfsOpts.MaxFilenameLen > 0 { fs.maxFilenameLen = tmpfsOpts.MaxFilenameLen } var root *dentry switch rootFileType { case linux.S_IFREG: root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode, nil /* parentDir */)) case linux.S_IFLNK: root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget, nil /* parentDir */)) case linux.S_IFDIR: root = &fs.newDirectory(rootKUID, rootKGID, rootMode, nil /* parentDir */).dentry default: fs.vfsfs.DecRef(ctx) return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } fs.root = root return &fs.vfsfs, &root.vfsd, nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.mu.Lock() if fs.root.inode.isDir() { fs.root.releaseChildrenLocked(ctx) } fs.mu.Unlock() if fs.mf.RestoreID() != "" { // If RestoreID is set, then this is a private MemoryFile which needs to be // destroyed since this tmpfs is the only user. fs.mf.Destroy() } } // releaseChildrenLocked is called on the mount point by filesystem.Release() to // destroy all objects in the mount. It performs a depth-first walk of the // filesystem and "unlinks" everything by decrementing link counts // appropriately. There should be no open file descriptors when this is called, // so each inode should only have one outstanding reference that is removed once // its link count hits zero. // // Note that we do not update filesystem state precisely while tearing down (for // instance, the child maps are ignored)--we only care to remove all remaining // references so that every filesystem object gets destroyed. Also note that we // do not need to trigger DecRef on the mount point itself or any child mount; // these are taken care of by the destructor of the enclosing MountNamespace. // // Precondition: filesystem.mu is held. func (d *dentry) releaseChildrenLocked(ctx context.Context) { dir := d.inode.impl.(*directory) for _, child := range dir.childMap { if child.inode.isDir() { child.releaseChildrenLocked(ctx) child.inode.decLinksLocked(ctx) // link for child/. dir.inode.decLinksLocked(ctx) // link for child/.. } child.inode.decLinksLocked(ctx) // link for child } } func (fs *filesystem) statFS() linux.Statfs { st := linux.Statfs{ Type: linux.TMPFS_MAGIC, BlockSize: hostarch.PageSize, FragmentSize: hostarch.PageSize, NameLength: linux.NAME_MAX, } // If size is set for tmpfs return set values. st.Blocks = fs.maxSizeInPages pagesUsed := fs.pagesUsed.Load() st.BlocksFree = fs.maxSizeInPages - pagesUsed st.BlocksAvailable = fs.maxSizeInPages - pagesUsed return st } // dentry implements vfs.DentryImpl. 
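// As a worked example of statFS above, assuming 4 KiB pages: with
// maxSizeInPages = 256 and pagesUsed = 100, the returned Statfs has
// Blocks = 256 and BlocksFree = BlocksAvailable = 156, which df(1) renders
// as a 1 MiB filesystem with 624 KiB free (BlockSize is the page size).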
// // +stateify savable type dentry struct { vfsd vfs.Dentry // parent is this dentry's parent directory. Each referenced dentry holds a // reference on parent.dentry. If this dentry is a filesystem root, parent // is nil. parent is protected by filesystem.mu. parent atomic.Pointer[dentry] `state:".(*dentry)"` // name is the name of this dentry in its parent. If this dentry is a // filesystem root, name is the empty string. name is protected by // filesystem.mu. name string // dentryEntry (ugh) links dentries into their parent directory.childList. dentryEntry // inode is the inode represented by this dentry. Multiple Dentries may // share a single non-directory inode (with hard links). inode is // immutable. // // tmpfs doesn't count references on dentries; because the dentry tree is // the sole source of truth, it is by definition always consistent with the // state of the filesystem. However, it does count references on inodes, // because inode resources are released when all references are dropped. // dentry therefore forwards reference counting directly to inode. inode *inode } func (fs *filesystem) newDentry(inode *inode) *dentry { d := &dentry{ inode: inode, } d.vfsd.Init(d) return d } // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { d.inode.incRef() } // TryIncRef implements vfs.DentryImpl.TryIncRef. func (d *dentry) TryIncRef() bool { return d.inode.tryIncRef() } // DecRef implements vfs.DentryImpl.DecRef. func (d *dentry) DecRef(ctx context.Context) { d.inode.decRef(ctx) } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { if d.inode.isDir() { events |= linux.IN_ISDIR } // tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates // that d was deleted. deleted := d.vfsd.IsDead() d.inode.fs.mu.RLock() // The ordering below is important, Linux always notifies the parent first. parent := d.parent.Load() if parent != nil { parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted) } d.inode.watches.Notify(ctx, "", events, cookie, et, deleted) d.inode.fs.mu.RUnlock() } // Watches implements vfs.DentryImpl.Watches. func (d *dentry) Watches() *vfs.Watches { return &d.inode.watches } // OnZeroWatches implements vfs.Dentry.OnZeroWatches. func (d *dentry) OnZeroWatches(context.Context) {} // inode represents a filesystem object. // // +stateify savable type inode struct { // fs is the owning filesystem. fs is immutable. fs *filesystem // A reference is held on all inodes as long as they are reachable in the // filesystem tree, i.e. nlink is nonzero. This reference is dropped when // nlink reaches 0. refs inodeRefs // xattrs implements extended attributes. // // TODO(b/148380782): Support xattrs other than user.* xattrs memxattr.SimpleExtendedAttributes // Inode metadata. Writing multiple fields atomically requires holding // mu, otherwise atomic operations can be used. mu inodeMutex `state:"nosave"` mode atomicbitops.Uint32 // file type and mode nlink atomicbitops.Uint32 // protected by filesystem.mu instead of inode.mu uid atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic gid atomicbitops.Uint32 // auth.KGID, but ... ino uint64 // immutable // Linux's tmpfs has no concept of btime. atime atomicbitops.Int64 // nanoseconds ctime atomicbitops.Int64 // nanoseconds mtime atomicbitops.Int64 // nanoseconds locks vfs.FileLocks // Inotify watches for this inode. 
watches vfs.Watches impl any // immutable } const maxLinks = math.MaxUint32 func (i *inode) init(impl any, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) { if mode.FileType() == 0 { panic("file type is required in FileMode") } // Inherit the group and setgid bit as in fs/inode.c:inode_init_owner(). if parentDir != nil && parentDir.inode.mode.Load()&linux.S_ISGID == linux.S_ISGID { kgid = auth.KGID(parentDir.inode.gid.Load()) if mode&linux.S_IFDIR == linux.S_IFDIR { mode |= linux.S_ISGID } } i.fs = fs i.mode = atomicbitops.FromUint32(uint32(mode)) i.uid = atomicbitops.FromUint32(uint32(kuid)) i.gid = atomicbitops.FromUint32(uint32(kgid)) i.ino = fs.nextInoMinusOne.Add(1) // Tmpfs creation sets atime, ctime, and mtime to current time. now := fs.clock.Now().Nanoseconds() i.atime = atomicbitops.FromInt64(now) i.ctime = atomicbitops.FromInt64(now) i.mtime = atomicbitops.FromInt64(now) // i.nlink initialized by caller i.impl = impl i.refs.InitRefs() } // incLinksLocked increments i's link count. // // Preconditions: // - filesystem.mu must be locked for writing. // - i.mu must be locked. // - i.nlink != 0. // - i.nlink < maxLinks. func (i *inode) incLinksLocked() { if i.nlink.RacyLoad() == 0 { panic("tmpfs.inode.incLinksLocked() called with no existing links") } if i.nlink.RacyLoad() == maxLinks { panic("tmpfs.inode.incLinksLocked() called with maximum link count") } i.nlink.Add(1) } // decLinksLocked decrements i's link count. If the link count reaches 0, we // remove a reference on i as well. // // Preconditions: // - filesystem.mu must be locked for writing. // - i.mu must be locked. // - i.nlink != 0. func (i *inode) decLinksLocked(ctx context.Context) { if i.nlink.RacyLoad() == 0 { panic("tmpfs.inode.decLinksLocked() called with no existing links") } if i.nlink.Add(^uint32(0)) == 0 { i.decRef(ctx) } } func (i *inode) incRef() { i.refs.IncRef() } func (i *inode) tryIncRef() bool { return i.refs.TryIncRef() } func (i *inode) decRef(ctx context.Context) { i.refs.DecRef(func() { i.watches.HandleDeletion(ctx) // Remove pages used if child being removed is a SymLink or Regular File. switch impl := i.impl.(type) { case *symlink: if len(impl.target) >= shortSymlinkLen { impl.inode.fs.unaccountPages(1) } case *regularFile: // Release memory used by regFile to store data. Since regFile is // no longer usable, we don't need to grab any locks or update any // metadata. pagesDec := impl.data.DropAll(i.fs.mf) impl.inode.fs.unaccountPages(pagesDec) } }) } func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { mode := linux.FileMode(i.mode.Load()) return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())) } // Go won't inline this function, and returning linux.Statx (which is quite // big) means spending a lot of time in runtime.duffcopy(), so instead it's an // output parameter. // // Note that Linux does not guarantee to return consistent data (in the case of // a concurrent modification), so we do not require holding inode.mu. 
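//
// A minimal call-site sketch (assumed here, mirroring how fileDescription.Stat
// later in this file uses it):
//
//	var stat linux.Statx
//	i.statTo(&stat) // fills stat in place rather than returning the large struct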
func (i *inode) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME stat.Blksize = hostarch.PageSize stat.Nlink = i.nlink.Load() stat.UID = i.uid.Load() stat.GID = i.gid.Load() stat.Mode = uint16(i.mode.Load()) stat.Ino = i.ino stat.Atime = linux.NsecToStatxTimestamp(i.atime.Load()) stat.Ctime = linux.NsecToStatxTimestamp(i.ctime.Load()) stat.Mtime = linux.NsecToStatxTimestamp(i.mtime.Load()) stat.DevMajor = linux.UNNAMED_MAJOR stat.DevMinor = i.fs.devMinor switch impl := i.impl.(type) { case *regularFile: stat.Size = uint64(impl.size.Load()) // TODO(jamieliu): This should be impl.data.Span() / 512, but this is // too expensive to compute here. Cache it in regularFile. stat.Blocks = allocatedBlocksForSize(stat.Size) case *directory: stat.Size = direntSize * (2 + uint64(impl.numChildren.Load())) // stat.Blocks is 0. case *symlink: stat.Size = uint64(len(impl.target)) // stat.Blocks is 0. case *namedPipe, *socketFile: // stat.Size and stat.Blocks are 0. case *deviceFile: // stat.Size and stat.Blocks are 0. stat.RdevMajor = impl.major stat.RdevMinor = impl.minor default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } } func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error { stat := &opts.Stat if stat.Mask == 0 { return nil } if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { return linuxerr.EPERM } mode := linux.FileMode(i.mode.Load()) if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil { return err } i.mu.Lock() defer i.mu.Unlock() var ( needsMtimeBump bool needsCtimeBump bool ) clearSID := false mask := stat.Mask if mask&linux.STATX_SIZE != 0 { switch impl := i.impl.(type) { case *regularFile: updated, err := impl.truncateLocked(stat.Size) if err != nil { return err } if updated { clearSID = true needsMtimeBump = true needsCtimeBump = true } case *directory: return linuxerr.EISDIR default: return linuxerr.EINVAL } } if mask&linux.STATX_UID != 0 { i.uid.Store(stat.UID) needsCtimeBump = true clearSID = true } if mask&linux.STATX_GID != 0 { i.gid.Store(stat.GID) needsCtimeBump = true clearSID = true } if mask&linux.STATX_MODE != 0 { for { old := i.mode.Load() ft := old & linux.S_IFMT newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT)) if clearSID { newMode = vfs.ClearSUIDAndSGID(newMode) } if swapped := i.mode.CompareAndSwap(old, newMode); swapped { clearSID = false break } } needsCtimeBump = true } now := i.fs.clock.Now().Nanoseconds() if mask&linux.STATX_ATIME != 0 { if stat.Atime.Nsec == linux.UTIME_NOW { i.atime.Store(now) } else { i.atime.Store(stat.Atime.ToNsecCapped()) } needsCtimeBump = true } if mask&linux.STATX_MTIME != 0 { if stat.Mtime.Nsec == linux.UTIME_NOW { i.mtime.Store(now) } else { i.mtime.Store(stat.Mtime.ToNsecCapped()) } needsCtimeBump = true // Ignore the mtime bump, since we just set it ourselves. needsMtimeBump = false } if mask&linux.STATX_CTIME != 0 { if stat.Ctime.Nsec == linux.UTIME_NOW { i.ctime.Store(now) } else { i.ctime.Store(stat.Ctime.ToNsecCapped()) } // Ignore the ctime bump, since we just set it ourselves. needsCtimeBump = false } // We may have to clear the SUID/SGID bits, but didn't do so as part of // STATX_MODE. 
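// Illustrative note (not from upstream): the compare-and-swap loop below
// retries until the mode word is swapped without a concurrent update
// interleaving; vfs.ClearSUIDAndSGID is expected to drop only the
// set-user-ID bit (and the set-group-ID bit where applicable), leaving the
// file type and permission bits untouched.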
if clearSID { for { old := i.mode.Load() newMode := vfs.ClearSUIDAndSGID(old) if swapped := i.mode.CompareAndSwap(old, newMode); swapped { break } } needsCtimeBump = true } if needsMtimeBump { i.mtime.Store(now) } if needsCtimeBump { i.ctime.Store(now) } return nil } // allocatedBlocksForSize returns the number of 512B blocks needed to // accommodate the given size in bytes, as appropriate for struct // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block // size is independent of the "preferred block size for I/O", struct // stat::st_blksize and struct statx::stx_blksize.) func allocatedBlocksForSize(size uint64) uint64 { return (size + 511) / 512 } func (i *inode) direntType() uint8 { switch impl := i.impl.(type) { case *regularFile: return linux.DT_REG case *directory: return linux.DT_DIR case *symlink: return linux.DT_LNK case *socketFile: return linux.DT_SOCK case *namedPipe: return linux.DT_FIFO case *deviceFile: switch impl.kind { case vfs.BlockDevice: return linux.DT_BLK case vfs.CharDevice: return linux.DT_CHR default: panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind)) } default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } } func (i *inode) isDir() bool { mode := linux.FileMode(i.mode.Load()) return mode.FileType() == linux.S_IFDIR } func (i *inode) touchAtime(mnt *vfs.Mount) { if mnt.Options().Flags.NoATime { return } if err := mnt.CheckBeginWrite(); err != nil { return } now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() i.atime.Store(now) i.mu.Unlock() mnt.EndWrite() } // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). func (i *inode) touchCtime() { now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() i.ctime.Store(now) i.mu.Unlock() } // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). func (i *inode) touchCMtime() { now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() i.mtime.Store(now) i.ctime.Store(now) i.mu.Unlock() } // Preconditions: // - The caller has called vfs.Mount.CheckBeginWrite(). // - inode.mu must be locked. 
func (i *inode) touchCMtimeLocked() { now := i.fs.clock.Now().Nanoseconds() i.mtime.Store(now) i.ctime.Store(now) } func (i *inode) checkXattrPrefix(name string) error { for prefix := range i.fs.allowXattrPrefix { if strings.HasPrefix(name, prefix) { return nil } } return linuxerr.EOPNOTSUPP } func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) { return i.xattrs.ListXattr(creds, size) } func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { if err := i.checkXattrPrefix(opts.Name); err != nil { return "", err } mode := linux.FileMode(i.mode.Load()) kuid := auth.KUID(i.uid.Load()) kgid := auth.KGID(i.gid.Load()) if err := vfs.GenericCheckPermissions(creds, vfs.MayRead, mode, kuid, kgid); err != nil { return "", err } return i.xattrs.GetXattr(creds, mode, kuid, opts) } func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error { if err := i.checkXattrPrefix(opts.Name); err != nil { return err } mode := linux.FileMode(i.mode.Load()) kuid := auth.KUID(i.uid.Load()) kgid := auth.KGID(i.gid.Load()) if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { return err } return i.xattrs.SetXattr(creds, mode, kuid, opts) } func (i *inode) removeXattr(creds *auth.Credentials, name string) error { if err := i.checkXattrPrefix(name); err != nil { return err } mode := linux.FileMode(i.mode.Load()) kuid := auth.KUID(i.uid.Load()) kgid := auth.KGID(i.gid.Load()) if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { return err } return i.xattrs.RemoveXattr(creds, mode, kuid, name) } // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. // // +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD } func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } func (fd *fileDescription) dentry() *dentry { return fd.vfsfd.Dentry().Impl().(*dentry) } func (fd *fileDescription) inode() *inode { return fd.dentry().inode } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx fd.inode().statTo(&stat) return stat, nil } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { return fd.dentry().inode.setStat(ctx, auth.CredentialsFromContext(ctx), &opts) } // StatFS implements vfs.FileDescriptionImpl.StatFS. func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { return fd.filesystem().statFS(), nil } // ListXattr implements vfs.FileDescriptionImpl.ListXattr. func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { return fd.inode().listXattr(auth.CredentialsFromContext(ctx), size) } // GetXattr implements vfs.FileDescriptionImpl.GetXattr. func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts) } // SetXattr implements vfs.FileDescriptionImpl.SetXattr. func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { return fd.dentry().inode.setXattr(auth.CredentialsFromContext(ctx), &opts) } // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 
func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { return fd.dentry().inode.removeXattr(auth.CredentialsFromContext(ctx), name) } // Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all // filesystem state is in-memory. func (*fileDescription) Sync(context.Context) error { return nil } // parseSize converts size in string to an integer bytes. // Supported suffixes in string are:K, M, G, T, P, E. func parseSize(s string) (uint64, error) { if len(s) == 0 { return 0, fmt.Errorf("size parameter empty") } suffix := s[len(s)-1] count := 1 switch suffix { case 'e', 'E': count = count << 10 fallthrough case 'p', 'P': count = count << 10 fallthrough case 't', 'T': count = count << 10 fallthrough case 'g', 'G': count = count << 10 fallthrough case 'm', 'M': count = count << 10 fallthrough case 'k', 'K': count = count << 10 s = s[:len(s)-1] } byteTmp, err := strconv.ParseUint(s, 10, 64) if err != nil { return 0, linuxerr.EINVAL } // Check for overflow. bytes := byteTmp * uint64(count) if byteTmp != 0 && bytes/byteTmp != uint64(count) { return 0, fmt.Errorf("size overflow") } return bytes, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/tmpfs/tmpfs_state_autogen.go000066400000000000000000000370741465435605700300100ustar00rootroot00000000000000// automatically generated by stateify. package tmpfs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (l *dentryList) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.dentryList" } func (l *dentryList) StateFields() []string { return []string{ "head", "tail", } } func (l *dentryList) beforeSave() {} // +checklocksignore func (l *dentryList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *dentryList) afterLoad(context.Context) {} // +checklocksignore func (l *dentryList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *dentryEntry) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.dentryEntry" } func (e *dentryEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *dentryEntry) beforeSave() {} // +checklocksignore func (e *dentryEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *dentryEntry) afterLoad(context.Context) {} // +checklocksignore func (e *dentryEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (d *deviceFile) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.deviceFile" } func (d *deviceFile) StateFields() []string { return []string{ "inode", "kind", "major", "minor", } } func (d *deviceFile) beforeSave() {} // +checklocksignore func (d *deviceFile) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.inode) stateSinkObject.Save(1, &d.kind) stateSinkObject.Save(2, &d.major) stateSinkObject.Save(3, &d.minor) } func (d *deviceFile) afterLoad(context.Context) {} // +checklocksignore func (d *deviceFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.inode) stateSourceObject.Load(1, &d.kind) stateSourceObject.Load(2, &d.major) stateSourceObject.Load(3, &d.minor) } func (dir *directory) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.directory" } func (dir *directory) StateFields() []string { return 
[]string{ "dentry", "inode", "childMap", "numChildren", "childList", } } func (dir *directory) beforeSave() {} // +checklocksignore func (dir *directory) StateSave(stateSinkObject state.Sink) { dir.beforeSave() stateSinkObject.Save(0, &dir.dentry) stateSinkObject.Save(1, &dir.inode) stateSinkObject.Save(2, &dir.childMap) stateSinkObject.Save(3, &dir.numChildren) stateSinkObject.Save(4, &dir.childList) } func (dir *directory) afterLoad(context.Context) {} // +checklocksignore func (dir *directory) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &dir.dentry) stateSourceObject.Load(1, &dir.inode) stateSourceObject.Load(2, &dir.childMap) stateSourceObject.Load(3, &dir.numChildren) stateSourceObject.Load(4, &dir.childList) } func (fd *directoryFD) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.directoryFD" } func (fd *directoryFD) StateFields() []string { return []string{ "fileDescription", "DirectoryFileDescriptionDefaultImpl", "iter", "off", } } func (fd *directoryFD) beforeSave() {} // +checklocksignore func (fd *directoryFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.fileDescription) stateSinkObject.Save(1, &fd.DirectoryFileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.iter) stateSinkObject.Save(3, &fd.off) } func (fd *directoryFD) afterLoad(context.Context) {} // +checklocksignore func (fd *directoryFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.fileDescription) stateSourceObject.Load(1, &fd.DirectoryFileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.iter) stateSourceObject.Load(3, &fd.off) } func (r *inodeRefs) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.inodeRefs" } func (r *inodeRefs) StateFields() []string { return []string{ "refCount", } } func (r *inodeRefs) beforeSave() {} // +checklocksignore func (r *inodeRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *inodeRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (n *namedPipe) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.namedPipe" } func (n *namedPipe) StateFields() []string { return []string{ "inode", "pipe", } } func (n *namedPipe) beforeSave() {} // +checklocksignore func (n *namedPipe) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.inode) stateSinkObject.Save(1, &n.pipe) } func (n *namedPipe) afterLoad(context.Context) {} // +checklocksignore func (n *namedPipe) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.inode) stateSourceObject.Load(1, &n.pipe) } func (rf *regularFile) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.regularFile" } func (rf *regularFile) StateFields() []string { return []string{ "inode", "memoryUsageKind", "mappings", "writableMappingPages", "data", "seals", "size", } } func (rf *regularFile) beforeSave() {} // +checklocksignore func (rf *regularFile) StateSave(stateSinkObject state.Sink) { rf.beforeSave() stateSinkObject.Save(0, &rf.inode) stateSinkObject.Save(1, &rf.memoryUsageKind) stateSinkObject.Save(2, &rf.mappings) stateSinkObject.Save(3, &rf.writableMappingPages) stateSinkObject.Save(4, &rf.data) stateSinkObject.Save(5, &rf.seals) stateSinkObject.Save(6, &rf.size) } func (rf *regularFile) afterLoad(context.Context) {} // 
+checklocksignore func (rf *regularFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rf.inode) stateSourceObject.Load(1, &rf.memoryUsageKind) stateSourceObject.Load(2, &rf.mappings) stateSourceObject.Load(3, &rf.writableMappingPages) stateSourceObject.Load(4, &rf.data) stateSourceObject.Load(5, &rf.seals) stateSourceObject.Load(6, &rf.size) } func (fd *regularFileFD) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.regularFileFD" } func (fd *regularFileFD) StateFields() []string { return []string{ "fileDescription", "off", } } func (fd *regularFileFD) beforeSave() {} // +checklocksignore func (fd *regularFileFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.fileDescription) stateSinkObject.Save(1, &fd.off) } func (fd *regularFileFD) afterLoad(context.Context) {} // +checklocksignore func (fd *regularFileFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.fileDescription) stateSourceObject.Load(1, &fd.off) } func (s *socketFile) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.socketFile" } func (s *socketFile) StateFields() []string { return []string{ "inode", "ep", } } func (s *socketFile) beforeSave() {} // +checklocksignore func (s *socketFile) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.inode) stateSinkObject.Save(1, &s.ep) } func (s *socketFile) afterLoad(context.Context) {} // +checklocksignore func (s *socketFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.inode) stateSourceObject.Load(1, &s.ep) } func (s *symlink) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.symlink" } func (s *symlink) StateFields() []string { return []string{ "inode", "target", } } func (s *symlink) beforeSave() {} // +checklocksignore func (s *symlink) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.inode) stateSinkObject.Save(1, &s.target) } func (s *symlink) afterLoad(context.Context) {} // +checklocksignore func (s *symlink) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.inode) stateSourceObject.Load(1, &s.target) } func (fstype *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.FilesystemType" } func (fstype *FilesystemType) StateFields() []string { return []string{} } func (fstype *FilesystemType) beforeSave() {} // +checklocksignore func (fstype *FilesystemType) StateSave(stateSinkObject state.Sink) { fstype.beforeSave() } func (fstype *FilesystemType) afterLoad(context.Context) {} // +checklocksignore func (fstype *FilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fs *filesystem) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.filesystem" } func (fs *filesystem) StateFields() []string { return []string{ "vfsfs", "mf", "clock", "devMinor", "mopts", "usage", "nextInoMinusOne", "root", "maxFilenameLen", "maxSizeInPages", "pagesUsed", "allowXattrPrefix", } } func (fs *filesystem) beforeSave() {} // +checklocksignore func (fs *filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() var mfValue string mfValue = fs.saveMf() stateSinkObject.SaveValue(1, mfValue) stateSinkObject.Save(0, &fs.vfsfs) stateSinkObject.Save(2, &fs.clock) stateSinkObject.Save(3, &fs.devMinor) stateSinkObject.Save(4, &fs.mopts) stateSinkObject.Save(5, &fs.usage) stateSinkObject.Save(6, &fs.nextInoMinusOne) stateSinkObject.Save(7, 
&fs.root) stateSinkObject.Save(8, &fs.maxFilenameLen) stateSinkObject.Save(9, &fs.maxSizeInPages) stateSinkObject.Save(10, &fs.pagesUsed) stateSinkObject.Save(11, &fs.allowXattrPrefix) } func (fs *filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.vfsfs) stateSourceObject.Load(2, &fs.clock) stateSourceObject.Load(3, &fs.devMinor) stateSourceObject.Load(4, &fs.mopts) stateSourceObject.Load(5, &fs.usage) stateSourceObject.Load(6, &fs.nextInoMinusOne) stateSourceObject.Load(7, &fs.root) stateSourceObject.Load(8, &fs.maxFilenameLen) stateSourceObject.Load(9, &fs.maxSizeInPages) stateSourceObject.Load(10, &fs.pagesUsed) stateSourceObject.Load(11, &fs.allowXattrPrefix) stateSourceObject.LoadValue(1, new(string), func(y any) { fs.loadMf(ctx, y.(string)) }) } func (f *FilesystemOpts) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.FilesystemOpts" } func (f *FilesystemOpts) StateFields() []string { return []string{ "RootFileType", "RootSymlinkTarget", "FilesystemType", "Usage", "MaxFilenameLen", "MemoryFile", "DisableDefaultSizeLimit", "AllowXattrPrefix", } } func (f *FilesystemOpts) beforeSave() {} // +checklocksignore func (f *FilesystemOpts) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.RootFileType) stateSinkObject.Save(1, &f.RootSymlinkTarget) stateSinkObject.Save(2, &f.FilesystemType) stateSinkObject.Save(3, &f.Usage) stateSinkObject.Save(4, &f.MaxFilenameLen) stateSinkObject.Save(5, &f.MemoryFile) stateSinkObject.Save(6, &f.DisableDefaultSizeLimit) stateSinkObject.Save(7, &f.AllowXattrPrefix) } func (f *FilesystemOpts) afterLoad(context.Context) {} // +checklocksignore func (f *FilesystemOpts) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.RootFileType) stateSourceObject.Load(1, &f.RootSymlinkTarget) stateSourceObject.Load(2, &f.FilesystemType) stateSourceObject.Load(3, &f.Usage) stateSourceObject.Load(4, &f.MaxFilenameLen) stateSourceObject.Load(5, &f.MemoryFile) stateSourceObject.Load(6, &f.DisableDefaultSizeLimit) stateSourceObject.Load(7, &f.AllowXattrPrefix) } func (d *dentry) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.dentry" } func (d *dentry) StateFields() []string { return []string{ "vfsd", "parent", "name", "dentryEntry", "inode", } } func (d *dentry) beforeSave() {} // +checklocksignore func (d *dentry) StateSave(stateSinkObject state.Sink) { d.beforeSave() var parentValue *dentry parentValue = d.saveParent() stateSinkObject.SaveValue(1, parentValue) stateSinkObject.Save(0, &d.vfsd) stateSinkObject.Save(2, &d.name) stateSinkObject.Save(3, &d.dentryEntry) stateSinkObject.Save(4, &d.inode) } func (d *dentry) afterLoad(context.Context) {} // +checklocksignore func (d *dentry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.vfsd) stateSourceObject.Load(2, &d.name) stateSourceObject.Load(3, &d.dentryEntry) stateSourceObject.Load(4, &d.inode) stateSourceObject.LoadValue(1, new(*dentry), func(y any) { d.loadParent(ctx, y.(*dentry)) }) } func (i *inode) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.inode" } func (i *inode) StateFields() []string { return []string{ "fs", "refs", "xattrs", "mode", "nlink", "uid", "gid", "ino", "atime", "ctime", "mtime", "locks", "watches", "impl", } } func (i *inode) beforeSave() {} // +checklocksignore func (i *inode) StateSave(stateSinkObject state.Sink) { 
i.beforeSave() stateSinkObject.Save(0, &i.fs) stateSinkObject.Save(1, &i.refs) stateSinkObject.Save(2, &i.xattrs) stateSinkObject.Save(3, &i.mode) stateSinkObject.Save(4, &i.nlink) stateSinkObject.Save(5, &i.uid) stateSinkObject.Save(6, &i.gid) stateSinkObject.Save(7, &i.ino) stateSinkObject.Save(8, &i.atime) stateSinkObject.Save(9, &i.ctime) stateSinkObject.Save(10, &i.mtime) stateSinkObject.Save(11, &i.locks) stateSinkObject.Save(12, &i.watches) stateSinkObject.Save(13, &i.impl) } func (i *inode) afterLoad(context.Context) {} // +checklocksignore func (i *inode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.fs) stateSourceObject.Load(1, &i.refs) stateSourceObject.Load(2, &i.xattrs) stateSourceObject.Load(3, &i.mode) stateSourceObject.Load(4, &i.nlink) stateSourceObject.Load(5, &i.uid) stateSourceObject.Load(6, &i.gid) stateSourceObject.Load(7, &i.ino) stateSourceObject.Load(8, &i.atime) stateSourceObject.Load(9, &i.ctime) stateSourceObject.Load(10, &i.mtime) stateSourceObject.Load(11, &i.locks) stateSourceObject.Load(12, &i.watches) stateSourceObject.Load(13, &i.impl) } func (fd *fileDescription) StateTypeName() string { return "pkg/sentry/fsimpl/tmpfs.fileDescription" } func (fd *fileDescription) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "LockFD", } } func (fd *fileDescription) beforeSave() {} // +checklocksignore func (fd *fileDescription) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.LockFD) } func (fd *fileDescription) afterLoad(context.Context) {} // +checklocksignore func (fd *fileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.LockFD) } func init() { state.Register((*dentryList)(nil)) state.Register((*dentryEntry)(nil)) state.Register((*deviceFile)(nil)) state.Register((*directory)(nil)) state.Register((*directoryFD)(nil)) state.Register((*inodeRefs)(nil)) state.Register((*namedPipe)(nil)) state.Register((*regularFile)(nil)) state.Register((*regularFileFD)(nil)) state.Register((*socketFile)(nil)) state.Register((*symlink)(nil)) state.Register((*FilesystemType)(nil)) state.Register((*filesystem)(nil)) state.Register((*FilesystemOpts)(nil)) state.Register((*dentry)(nil)) state.Register((*inode)(nil)) state.Register((*fileDescription)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/user/000077500000000000000000000000001465435605700232205ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/user/path.go000066400000000000000000000067631465435605700245170ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package user import ( "fmt" "path" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // ExecutableResolveError represents a failure to resolve the executable // in ResolveExecutablePath. type ExecutableResolveError struct{ error } // ResolveExecutablePath resolves the given executable name given the working // dir and environment. // Returns *ExecutableResolveError when the executable cannot be resolved. func ResolveExecutablePath(ctx context.Context, args *kernel.CreateProcessArgs) (string, error) { name := args.Filename if len(name) == 0 { if len(args.Argv) == 0 { return "", fmt.Errorf("no filename or command provided") } name = args.Argv[0] } // Absolute paths can be used directly. if path.IsAbs(name) { return name, nil } // Paths with '/' in them should be joined to the working directory, or // to the root if working directory is not set. if strings.IndexByte(name, '/') > 0 { wd := args.WorkingDirectory if wd == "" { wd = "/" } if !path.IsAbs(wd) { return "", fmt.Errorf("working directory %q must be absolute", wd) } return path.Join(wd, name), nil } // Otherwise, We must lookup the name in the paths. paths := getPath(args.Envv) f, err := resolve(ctx, args.Credentials, args.MountNamespace, paths, name) if err != nil { return "", &ExecutableResolveError{fmt.Errorf("error finding executable %q in PATH %v: %v", name, paths, err)} } return f, nil } func resolve(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) { root := mns.Root(ctx) defer root.DecRef(ctx) for _, p := range paths { if !path.IsAbs(p) { // Relative paths aren't safe, no one should be using them. log.Warningf("Skipping relative path %q in $PATH", p) continue } binPath := path.Join(p, name) pop := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(binPath), FollowFinalSymlink: true, } opts := &vfs.OpenOptions{ FileExec: true, Flags: linux.O_RDONLY, } dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts) if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.EACCES, err) { // Didn't find it here. continue } if err != nil { return "", err } dentry.DecRef(ctx) return binPath, nil } // Couldn't find it. return "", linuxerr.ENOENT } // getPath returns the PATH as a slice of strings given the environment // variables. func getPath(env []string) []string { const prefix = "PATH=" for _, e := range env { if strings.HasPrefix(e, prefix) { return strings.Split(strings.TrimPrefix(e, prefix), ":") } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/user/user.go000066400000000000000000000176771465435605700245470ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// Package user contains methods for resolving filesystem paths based on the // user and their environment. package user import ( "bufio" "fmt" "io" "strconv" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) type fileReader struct { ctx context.Context fd *vfs.FileDescription } func (r *fileReader) Read(buf []byte) (int, error) { n, err := r.fd.Read(r.ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) return int(n), err } func getExecUserHome(ctx context.Context, mns *vfs.MountNamespace, uid auth.KUID) (string, error) { const defaultHome = "/" root := mns.Root(ctx) defer root.DecRef(ctx) creds := auth.CredentialsFromContext(ctx) target := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse("/etc/passwd"), } stat, err := root.Mount().Filesystem().VirtualFilesystem().StatAt(ctx, creds, target, &vfs.StatOptions{Mask: linux.STATX_TYPE}) if err != nil { return defaultHome, nil } if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeRegular { return defaultHome, nil } opts := &vfs.OpenOptions{ Flags: linux.O_RDONLY, } fd, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, target, opts) if err != nil { return defaultHome, nil } defer fd.DecRef(ctx) r := &fileReader{ ctx: ctx, fd: fd, } homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome) if err != nil { return "", err } return homeDir, nil } // MaybeAddExecUserHome returns a new slice with the HOME environment // variable set if the slice does not already contain it, otherwise it returns // the original slice unmodified. func MaybeAddExecUserHome(ctx context.Context, vmns *vfs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) { // Check if the envv already contains HOME. for _, env := range envv { if strings.HasPrefix(env, "HOME=") { // We have it. Return the original slice unmodified. return envv, nil } } // Read /etc/passwd for the user's HOME directory and set the HOME // environment variable as required by POSIX if it is not overridden by // the user. homeDir, err := getExecUserHome(ctx, vmns, uid) if err != nil { return nil, fmt.Errorf("error reading exec user: %v", err) } return append(envv, "HOME="+homeDir), nil } // findHomeInPasswd parses a passwd file and returns the given user's home // directory. This function does its best to replicate runc's behavior. func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) { s := bufio.NewScanner(passwd) for s.Scan() { if err := s.Err(); err != nil { return "", err } line := strings.TrimSpace(s.Text()) if line == "" { continue } // Pull out part of passwd entry. Loosely parse the passwd entry as some // passwd files could be poorly written and for compatibility with runc. // // Per 'man 5 passwd' // /etc/passwd contains one line for each user account, with seven // fields delimited by colons (“:”). 
These fields are: // // - login name // - optional encrypted password // - numerical user ID // - numerical group ID // - user name or comment field // - user home directory // - optional user command interpreter parts := strings.Split(line, ":") found := false homeDir := "" for i, p := range parts { switch i { case 2: parsedUID, err := strconv.ParseUint(p, 10, 32) if err == nil && parsedUID == uint64(uid) { found = true } case 5: homeDir = p } } if found { // NOTE: If the uid is present but the home directory is not // present in the /etc/passwd entry we return an empty string. This // is, for better or worse, what runc does. return homeDir, nil } } return defaultHome, nil } func findUIDGIDInPasswd(passwd io.Reader, user string) (auth.KUID, auth.KGID, error) { defaultUID := auth.KUID(auth.OverflowUID) defaultGID := auth.KGID(auth.OverflowGID) uid := defaultUID gid := defaultGID // Per 'man 5 passwd' // /etc/passwd contains one line for each user account, with seven // fields delimited by colons (“:”). These fields are: // // - login name // - optional encrypted password // - numerical user ID // - numerical group ID // - Gecos field // - user home directory // - optional user command interpreter const ( numFields = 7 userIdx = 0 passwdIdx = 1 uidIdx = 2 gidIdx = 3 gecosIdx = 4 shellIdx = 6 ) usergroup := strings.SplitN(user, ":", 2) uStringOrID := usergroup[0] // Check if we have a uid or string for user. idxToMatch := uidIdx _, err := strconv.Atoi(uStringOrID) if err != nil { idxToMatch = userIdx } s := bufio.NewScanner(passwd) for s.Scan() { if err := s.Err(); err != nil { return defaultUID, defaultGID, err } line := strings.TrimSpace(s.Text()) if line == "" || strings.HasPrefix(line, "#") { continue } parts := strings.Split(line, ":") if len(parts) != numFields { // Return error if the format is invalid. return defaultUID, defaultGID, fmt.Errorf("invalid line found in /etc/passwd, there should be 7 fields but found %v", len(parts)) } for i := 0; i < numFields; i++ { // The password, GECOS and user command interpreter fields are // optional, no need to check if they are empty. if i == passwdIdx || i == shellIdx || i == gecosIdx { continue } if parts[i] == "" { // Return error if the format is invalid. 
return defaultUID, defaultGID, fmt.Errorf("invalid line found in /etc/passwd, field[%v] is empty", i) } } if parts[idxToMatch] == uStringOrID { parseUID, err := strconv.ParseUint(parts[uidIdx], 10, 32) if err != nil { return defaultUID, defaultGID, err } parseGID, err := strconv.ParseUint(parts[gidIdx], 10, 32) if err != nil { return defaultUID, defaultGID, err } if uid != defaultUID || gid != defaultGID { return defaultUID, defaultGID, fmt.Errorf("multiple matches for the user: %v", user) } uid = auth.KUID(parseUID) gid = auth.KGID(parseGID) } } if uid == defaultUID || gid == defaultGID { return defaultUID, defaultGID, fmt.Errorf("couldn't retrieve UID/GID from user: %v", user) } return uid, gid, nil } func getExecUIDGID(ctx context.Context, mns *vfs.MountNamespace, user string) (auth.KUID, auth.KGID, error) { root := mns.Root(ctx) defer root.DecRef(ctx) creds := auth.CredentialsFromContext(ctx) target := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse("/etc/passwd"), } fd, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, target, &vfs.OpenOptions{Flags: linux.O_RDONLY}) if err != nil { return auth.KUID(auth.OverflowUID), auth.KGID(auth.OverflowGID), fmt.Errorf("couldn't retrieve UID/GID from user: %v, err: %v", user, err) } defer fd.DecRef(ctx) r := &fileReader{ ctx: ctx, fd: fd, } return findUIDGIDInPasswd(r, user) } // GetExecUIDGIDFromUser retrieves the UID and GID from /etc/passwd file for // the given user. func GetExecUIDGIDFromUser(ctx context.Context, vmns *vfs.MountNamespace, user string) (auth.KUID, auth.KGID, error) { // Read /etc/passwd and retrieve the UID/GID based on the user string. uid, gid, err := getExecUIDGID(ctx, vmns, user) if err != nil { return uid, gid, fmt.Errorf("error reading /etc/passwd: %v", err) } return uid, gid, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsimpl/user/user_state_autogen.go000066400000000000000000000000661465435605700274510ustar00rootroot00000000000000// automatically generated by stateify. package user golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsmetric/000077500000000000000000000000001465435605700225645ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsmetric/fsmetric.go000066400000000000000000000111671465435605700247350ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package fsmetric defines filesystem metrics. package fsmetric import ( "time" "gvisor.dev/gvisor/pkg/metric" metricpb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" ) // RecordWaitTime enables the ReadWait, GoferReadWait9P, GoferReadWaitHost, and // TmpfsReadWait metrics. Enabling this comes at a CPU cost due to performing // three clock reads per read call. // // Note that this is only performed in the direct read path, and may not be // consistently applied for other forms of reads, such as splice. var RecordWaitTime = false // Metrics that apply to all filesystems. 
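//
// A minimal usage sketch (assumed call site, not upstream code) for the
// wait-time helpers defined at the end of this file:
//
//	start := fsmetric.StartReadWait()
//	defer fsmetric.FinishReadWait(fsmetric.ReadWait, start)
//	fsmetric.Reads.Increment()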
var ( Opens = metric.MustCreateNewUint64Metric("/fs/opens", metric.Uint64Metadata{ Cumulative: true, Description: "Number of file opens.", }) Reads = metric.MustCreateNewUint64Metric("/fs/reads", metric.Uint64Metadata{ Cumulative: true, Description: "Number of file reads.", }) ReadWait = metric.MustCreateNewUint64Metric("/fs/read_wait", metric.Uint64Metadata{ Cumulative: true, Description: "Time waiting on file reads, in nanoseconds.", Unit: metricpb.MetricMetadata_UNITS_NANOSECONDS, }) ) // Metrics that only apply to fs/gofer and fsimpl/gofer. var ( GoferOpens9P = metric.MustCreateNewUint64Metric("/gofer/opens_9p", metric.Uint64Metadata{ Cumulative: true, Description: "Number of times a file was opened from a gofer and did not have a host file descriptor.", }) GoferOpensHost = metric.MustCreateNewUint64Metric("/gofer/opens_host", metric.Uint64Metadata{ Cumulative: true, Description: "Number of times a file was opened from a gofer and did have a host file descriptor.", }) GoferReads9P = metric.MustCreateNewUint64Metric("/gofer/reads_9p", metric.Uint64Metadata{ Cumulative: true, Description: "Number of 9P file reads from a gofer.", }) GoferReadWait9P = metric.MustCreateNewUint64Metric("/gofer/read_wait_9p", metric.Uint64Metadata{ Cumulative: true, Description: "Time waiting on 9P file reads from a gofer, in nanoseconds.", Unit: metricpb.MetricMetadata_UNITS_NANOSECONDS, }) GoferReadsHost = metric.MustCreateNewUint64Metric("/gofer/reads_host", metric.Uint64Metadata{ Cumulative: true, Description: "Number of host file reads from a gofer.", }) GoferReadWaitHost = metric.MustCreateNewUint64Metric("/gofer/read_wait_host", metric.Uint64Metadata{ Cumulative: true, Description: "Time waiting on host file reads from a gofer, in nanoseconds.", Unit: metricpb.MetricMetadata_UNITS_NANOSECONDS, }) ) // Metrics that only apply to fs/tmpfs and fsimpl/tmpfs. var ( TmpfsOpensRO = metric.MustCreateNewUint64Metric("/in_memory_file/opens_ro", metric.Uint64Metadata{ Cumulative: true, Description: "Number of times an in-memory file was opened in read-only mode.", }) TmpfsOpensW = metric.MustCreateNewUint64Metric("/in_memory_file/opens_w", metric.Uint64Metadata{ Cumulative: true, Description: "Number of times an in-memory file was opened in write mode.", }) TmpfsReads = metric.MustCreateNewUint64Metric("/in_memory_file/reads", metric.Uint64Metadata{ Cumulative: true, Description: "Number of in-memory file reads.", }) TmpfsReadWait = metric.MustCreateNewUint64Metric("/in_memory_file/read_wait", metric.Uint64Metadata{ Cumulative: true, Description: "Time waiting on in-memory file reads, in nanoseconds.", Unit: metricpb.MetricMetadata_UNITS_NANOSECONDS, }) ) // StartReadWait indicates the beginning of a file read. func StartReadWait() time.Time { if !RecordWaitTime { return time.Time{} } return time.Now() } // FinishReadWait indicates the end of a file read whose time is accounted by // m. start must be the value returned by the corresponding call to // StartReadWait. // // FinishReadWait is marked nosplit for performance since it's often called // from defer statements, which prevents it from being inlined // (https://github.com/golang/go/issues/38471). // //go:nosplit func FinishReadWait(m *metric.Uint64Metric, start time.Time) { if !RecordWaitTime { return } m.IncrementBy(uint64(time.Since(start).Nanoseconds())) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsmetric/fsmetric_state_autogen.go000066400000000000000000000000721465435605700276500ustar00rootroot00000000000000// automatically generated by stateify. 
package fsmetric golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/000077500000000000000000000000001465435605700222565ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/dirty_set.go000066400000000000000000000163401465435605700246170ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fsutil import ( "math" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" ) // DirtySet maps offsets into a memmap.Mappable to DirtyInfo. It is used to // implement Mappables that cache data from another source. // // type DirtySet // DirtyInfo is the value type of DirtySet, and represents information about a // Mappable offset that is dirty (the cached data for that offset is newer than // its source). // // +stateify savable type DirtyInfo struct { // Keep is true if the represented offset is concurrently writable, such // that writing the data for that offset back to the source does not // guarantee that the offset is clean (since it may be concurrently // rewritten after the writeback). Keep bool } // dirtySetFunctions implements segment.Functions for DirtySet. type dirtySetFunctions struct{} // MinKey implements segment.Functions.MinKey. func (dirtySetFunctions) MinKey() uint64 { return 0 } // MaxKey implements segment.Functions.MaxKey. func (dirtySetFunctions) MaxKey() uint64 { return math.MaxUint64 } // ClearValue implements segment.Functions.ClearValue. func (dirtySetFunctions) ClearValue(val *DirtyInfo) { } // Merge implements segment.Functions.Merge. func (dirtySetFunctions) Merge(_ memmap.MappableRange, val1 DirtyInfo, _ memmap.MappableRange, val2 DirtyInfo) (DirtyInfo, bool) { if val1 != val2 { return DirtyInfo{}, false } return val1, true } // Split implements segment.Functions.Split. func (dirtySetFunctions) Split(_ memmap.MappableRange, val DirtyInfo, _ uint64) (DirtyInfo, DirtyInfo) { return val, val } // MarkClean marks all offsets in mr as not dirty, except for those to which // KeepDirty has been applied. func (s *DirtySet) MarkClean(mr memmap.MappableRange) { seg := s.LowerBoundSegment(mr.Start) for seg.Ok() && seg.Start() < mr.End { if seg.Value().Keep { seg = seg.NextSegment() continue } seg = s.Isolate(seg, mr) seg = s.Remove(seg).NextSegment() } } // KeepClean marks all offsets in mr as not dirty, even those that were // previously kept dirty by KeepDirty. func (s *DirtySet) KeepClean(mr memmap.MappableRange) { s.RemoveRange(mr) } // MarkDirty marks all offsets in mr as dirty. func (s *DirtySet) MarkDirty(mr memmap.MappableRange) { s.setDirty(mr, false) } // KeepDirty marks all offsets in mr as dirty and prevents them from being // marked as clean by MarkClean. 
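//
// Illustrative sequence (assumption, not upstream documentation): after
// KeepDirty(mr), a later MarkClean(mr) leaves mr dirty; only AllowClean(mr)
// followed by MarkClean(mr), or KeepClean(mr), removes it from the set.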
func (s *DirtySet) KeepDirty(mr memmap.MappableRange) { s.setDirty(mr, true) } func (s *DirtySet) setDirty(mr memmap.MappableRange, keep bool) { var changedAny bool defer func() { if changedAny { // Merge segments split by Isolate to reduce cost of iteration. s.MergeInsideRange(mr) } }() seg, gap := s.Find(mr.Start) for { switch { case seg.Ok() && seg.Start() < mr.End: if keep && !seg.Value().Keep { changedAny = true seg = s.Isolate(seg, mr) seg.ValuePtr().Keep = true } seg, gap = seg.NextNonEmpty() case gap.Ok() && gap.Start() < mr.End: changedAny = true seg = s.Insert(gap, gap.Range().Intersect(mr), DirtyInfo{keep}) seg, gap = seg.NextNonEmpty() default: return } } } // AllowClean allows MarkClean to mark offsets in mr as not dirty, ending the // effect of a previous call to KeepDirty. (It does not itself mark those // offsets as not dirty.) func (s *DirtySet) AllowClean(mr memmap.MappableRange) { var changedAny bool defer func() { if changedAny { // Merge segments split by Isolate to reduce cost of iteration. s.MergeInsideRange(mr) } }() for seg := s.LowerBoundSegment(mr.Start); seg.Ok() && seg.Start() < mr.End; seg = seg.NextSegment() { if seg.Value().Keep { changedAny = true seg = s.Isolate(seg, mr) seg.ValuePtr().Keep = false } } } // SyncDirty passes pages in the range mr that are stored in cache and // identified as dirty to writeAt, updating dirty to reflect successful writes. // If writeAt returns a successful partial write, SyncDirty will call it // repeatedly until all bytes have been written. max is the true size of the // cached object; offsets beyond max will not be passed to writeAt, even if // they are marked dirty. func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { var changedDirty bool defer func() { if changedDirty { // Merge segments split by Isolate to reduce cost of iteration. dirty.MergeInsideRange(mr) } }() dseg := dirty.LowerBoundSegment(mr.Start) for dseg.Ok() && dseg.Start() < mr.End { var dr memmap.MappableRange if dseg.Value().Keep { dr = dseg.Range().Intersect(mr) } else { changedDirty = true dseg = dirty.Isolate(dseg, mr) dr = dseg.Range() } if err := syncDirtyRange(ctx, dr, cache, max, mem, writeAt); err != nil { return err } if dseg.Value().Keep { dseg = dseg.NextSegment() } else { dseg = dirty.Remove(dseg).NextSegment() } } return nil } // SyncDirtyAll passes all pages stored in cache identified as dirty to // writeAt, updating dirty to reflect successful writes. If writeAt returns a // successful partial write, SyncDirtyAll will call it repeatedly until all // bytes have been written. max is the true size of the cached object; offsets // beyond max will not be passed to writeAt, even if they are marked dirty. func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { dseg := dirty.FirstSegment() for dseg.Ok() { if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil { return err } if dseg.Value().Keep { dseg = dseg.NextSegment() } else { dseg = dirty.Remove(dseg).NextSegment() } } return nil } // Preconditions: mr must be page-aligned. 
func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() { wbr := cseg.Range().Intersect(mr) if max < wbr.Start { break } ims, err := mem.MapInternal(cseg.FileRangeOf(wbr), hostarch.Read) if err != nil { return err } if max < wbr.End { ims = ims.TakeFirst64(max - wbr.Start) } offset := wbr.Start for !ims.IsEmpty() { n, err := writeAt(ctx, ims, offset) if err != nil { return err } offset += n ims = ims.DropFirst64(n) } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/dirty_set_impl.go000066400000000000000000002024711465435605700256420ustar00rootroot00000000000000package fsutil import ( __generics_imported0 "gvisor.dev/gvisor/pkg/sentry/memmap" ) import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const DirtytrackGaps = 0 var _ = uint8(DirtytrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type DirtydynamicGap [DirtytrackGaps]uint64 // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *DirtydynamicGap) Get() uint64 { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *DirtydynamicGap) Set(v uint64) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. DirtyminDegree = 3 DirtymaxDegree = 2 * DirtyminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type DirtySet struct { root Dirtynode `state:".([]DirtyFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *DirtySet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *DirtySet) IsEmptyRange(r __generics_imported0.MappableRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *DirtySet) Span() uint64 { var sz uint64 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. 
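//
// For example (illustrative): with segments spanning [0, 2) and [5, 10),
// SpanRange of [1, 7) returns 3: one unit from [1, 2) plus two from [5, 7).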
func (s *DirtySet) SpanRange(r __generics_imported0.MappableRange) uint64 { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uint64 for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *DirtySet) FirstSegment() DirtyIterator { if s.root.nrSegments == 0 { return DirtyIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *DirtySet) LastSegment() DirtyIterator { if s.root.nrSegments == 0 { return DirtyIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *DirtySet) FirstGap() DirtyGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return DirtyGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *DirtySet) LastGap() DirtyGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return DirtyGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *DirtySet) Find(key uint64) (DirtyIterator, DirtyGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return DirtyIterator{n, i}, DirtyGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return DirtyIterator{}, DirtyGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *DirtySet) FindSegment(key uint64) DirtyIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *DirtySet) LowerBoundSegment(min uint64) DirtyIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *DirtySet) UpperBoundSegment(max uint64) DirtyIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *DirtySet) FindGap(key uint64) DirtyGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *DirtySet) LowerBoundGap(min uint64) DirtyGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. 
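//
// For example (illustrative): with segments spanning [0, 2) and [5, 10),
// UpperBoundGap(7) returns the gap covering [2, 5): 7 falls inside the
// segment [5, 10), so the preceding gap is the highest gap containing a key
// at or below 7.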
func (s *DirtySet) UpperBoundGap(max uint64) DirtyGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *DirtySet) FirstLargeEnoughGap(minSize uint64) DirtyGapIterator { if DirtytrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *DirtySet) LastLargeEnoughGap(minSize uint64) DirtyGapIterator { if DirtytrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *DirtySet) LowerBoundLargeEnoughGap(min, minSize uint64) DirtyGapIterator { if DirtytrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *DirtySet) UpperBoundLargeEnoughGap(max, minSize uint64) DirtyGapIterator { if DirtytrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. 
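//
// A minimal sketch (assuming a hypothetical set s, range r, and value val):
//
//	if seg, gap := s.Find(r.Start); !seg.Ok() && gap.Range().IsSupersetOf(r) {
//		_ = s.Insert(gap, r, val) // may merge with adjacent segments
//	}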
func (s *DirtySet) Insert(gap DirtyGapIterator, r __generics_imported0.MappableRange, val DirtyInfo) DirtyIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (dirtySetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := DirtytrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (dirtySetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (dirtySetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := DirtytrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *DirtySet) InsertWithoutMerging(gap DirtyGapIterator, r __generics_imported0.MappableRange, val DirtyInfo) DirtyIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *DirtySet) InsertWithoutMergingUnchecked(gap DirtyGapIterator, r __generics_imported0.MappableRange, val DirtyInfo) DirtyIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := DirtytrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return DirtyIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. 
// // InsertRange searches the set to find the gap to insert into. If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *DirtySet) InsertRange(r __generics_imported0.MappableRange, val DirtyInfo) DirtyIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *DirtySet) InsertWithoutMergingRange(r __generics_imported0.MappableRange, val DirtyInfo) DirtyIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *DirtySet) TryInsertRange(r __generics_imported0.MappableRange, val DirtyInfo) DirtyIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return DirtyIterator{} } if gap.End() < r.End { return DirtyIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. 
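//
// A minimal sketch (assuming a hypothetical set s, range r, and value val):
//
//	if seg := s.TryInsertWithoutMergingRange(r, val); !seg.Ok() {
//		// r overlapped an existing segment; nothing was inserted.
//	}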
func (s *DirtySet) TryInsertWithoutMergingRange(r __generics_imported0.MappableRange, val DirtyInfo) DirtyIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return DirtyIterator{} } if gap.End() < r.End { return DirtyIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *DirtySet) Remove(seg DirtyIterator) DirtyGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if DirtytrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) dirtySetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if DirtytrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(DirtyGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *DirtySet) RemoveAll() { s.root = Dirtynode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *DirtySet) RemoveRange(r __generics_imported0.MappableRange) DirtyGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *DirtySet) RemoveFullRange(r __generics_imported0.MappableRange) DirtyGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *DirtySet) Merge(first, second DirtyIterator) DirtyIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. Otherwise, MergeUnchecked returns a terminal // iterator. 
// // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *DirtySet) MergeUnchecked(first, second DirtyIterator) DirtyIterator { if first.End() == second.Start() { if mval, ok := (dirtySetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return DirtyIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *DirtySet) MergePrev(seg DirtyIterator) DirtyIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *DirtySet) MergeNext(seg DirtyIterator) DirtyIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *DirtySet) Unisolate(seg DirtyIterator) DirtyIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *DirtySet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. 
// // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *DirtySet) MergeInsideRange(r __generics_imported0.MappableRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *DirtySet) MergeOutsideRange(r __generics_imported0.MappableRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *DirtySet) Split(seg DirtyIterator, split uint64) (DirtyIterator, DirtyIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *DirtySet) SplitUnchecked(seg DirtyIterator, split uint64) (DirtyIterator, DirtyIterator) { val1, val2 := (dirtySetFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.MappableRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter; i.e. SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first. 
// // Preconditions: start < seg.End(). func (s *DirtySet) SplitBefore(seg DirtyIterator, start uint64) DirtyIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *DirtySet) SplitAfter(seg DirtyIterator, end uint64) DirtyIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *DirtySet) Isolate(seg DirtyIterator, r __generics_imported0.MappableRange) DirtyIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *DirtySet) LowerBoundSegmentSplitBefore(min uint64) DirtyIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *DirtySet) UpperBoundSegmentSplitAfter(max uint64) DirtyIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. 
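//
// A minimal sketch (assuming a hypothetical set s and range r) that counts
// the segments intersecting r without mutating the set:
//
//	var n int
//	s.VisitRange(r, func(seg DirtyIterator) bool {
//		n++
//		return true // continue iterating
//	})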
func (s *DirtySet) VisitRange(r __generics_imported0.MappableRange, f func(seg DirtyIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *DirtySet) VisitFullRange(r __generics_imported0.MappableRange, f func(seg DirtyIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *DirtySet) MutateRange(r __generics_imported0.MappableRange, f func(seg DirtyIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *DirtySet) MutateFullRange(r __generics_imported0.MappableRange, f func(seg DirtyIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type Dirtynode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *Dirtynode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. 
maxGap DirtydynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [DirtymaxDegree - 1]__generics_imported0.MappableRange values [DirtymaxDegree - 1]DirtyInfo children [DirtymaxDegree]*Dirtynode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *Dirtynode) firstSegment() DirtyIterator { for n.hasChildren { n = n.children[0] } return DirtyIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *Dirtynode) lastSegment() DirtyIterator { for n.hasChildren { n = n.children[n.nrSegments] } return DirtyIterator{n, n.nrSegments - 1} } func (n *Dirtynode) prevSibling() *Dirtynode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *Dirtynode) nextSibling() *Dirtynode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *Dirtynode) rebalanceBeforeInsert(gap DirtyGapIterator) DirtyGapIterator { if n.nrSegments < DirtymaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &Dirtynode{ nrSegments: DirtyminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &Dirtynode{ nrSegments: DirtyminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:DirtyminDegree-1], n.keys[:DirtyminDegree-1]) copy(left.values[:DirtyminDegree-1], n.values[:DirtyminDegree-1]) copy(right.keys[:DirtyminDegree-1], n.keys[DirtyminDegree:]) copy(right.values[:DirtyminDegree-1], n.values[DirtyminDegree:]) n.keys[0], n.values[0] = n.keys[DirtyminDegree-1], n.values[DirtyminDegree-1] DirtyzeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:DirtyminDegree], n.children[:DirtyminDegree]) copy(right.children[:DirtyminDegree], n.children[DirtyminDegree:]) DirtyzeroNodeSlice(n.children[2:]) for i := 0; i < DirtyminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if DirtytrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < DirtyminDegree { return DirtyGapIterator{left, gap.index} } return DirtyGapIterator{right, gap.index - DirtyminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[DirtyminDegree-1], n.values[DirtyminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &Dirtynode{ nrSegments: DirtyminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ copy(sibling.keys[:DirtyminDegree-1], n.keys[DirtyminDegree:]) copy(sibling.values[:DirtyminDegree-1], 
n.values[DirtyminDegree:]) DirtyzeroValueSlice(n.values[DirtyminDegree-1:]) if n.hasChildren { copy(sibling.children[:DirtyminDegree], n.children[DirtyminDegree:]) DirtyzeroNodeSlice(n.children[DirtyminDegree:]) for i := 0; i < DirtyminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = DirtyminDegree - 1 if DirtytrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < DirtyminDegree { return gap } return DirtyGapIterator{sibling, gap.index - DirtyminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *Dirtynode) rebalanceAfterRemove(gap DirtyGapIterator) DirtyGapIterator { for { if n.nrSegments >= DirtyminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= DirtyminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] dirtySetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if DirtytrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return DirtyGapIterator{n, 0} } if gap.node == n { return DirtyGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= DirtyminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) dirtySetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if DirtytrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return DirtyGapIterator{n, n.nrSegments} } return DirtyGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) 
copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return DirtyGapIterator{p, gap.index} } if gap.node == right { return DirtyGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *Dirtynode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = DirtyGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) dirtySetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if DirtytrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *Dirtynode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *Dirtynode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. 
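//
// A small worked example: a leaf holding segments [a, b) and [c, d) within
// the key space [MinKey, MaxKey) has nrSegments+1 = 3 candidate gaps,
// [MinKey, a), [b, c), and [d, MaxKey); maxGap is the largest of their
// lengths.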
func (n *Dirtynode) calculateMaxGapLeaf() uint64 { max := DirtyGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (DirtyGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. func (n *Dirtynode) calculateMaxGapInternal() uint64 { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *Dirtynode) searchFirstLargeEnoughGap(minSize uint64) DirtyGapIterator { if n.maxGap.Get() < minSize { return DirtyGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := DirtyGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *Dirtynode) searchLastLargeEnoughGap(minSize uint64) DirtyGapIterator { if n.maxGap.Get() < minSize { return DirtyGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := DirtyGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type DirtyIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *Dirtynode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg DirtyIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg DirtyIterator) Range() __generics_imported0.MappableRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg DirtyIterator) Start() uint64 { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg DirtyIterator) End() uint64 { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. 
// - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg DirtyIterator) SetRangeUnchecked(r __generics_imported0.MappableRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. func (seg DirtyIterator) SetRange(r __generics_imported0.MappableRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg DirtyIterator) SetStartUnchecked(start uint64) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg DirtyIterator) SetStart(start uint64) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg DirtyIterator) SetEndUnchecked(end uint64) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg DirtyIterator) SetEnd(end uint64) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg DirtyIterator) Value() DirtyInfo { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg DirtyIterator) ValuePtr() *DirtyInfo { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. 
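//
// A minimal sketch (assuming a hypothetical non-terminal iterator seg):
//
//	v := seg.Value()   // copy of the stored value
//	seg.SetValue(v)    // write a (possibly updated) value back
//	_ = seg.ValuePtr() // or mutate the stored value in place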
func (seg DirtyIterator) SetValue(val DirtyInfo) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. func (seg DirtyIterator) PrevSegment() DirtyIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return DirtyIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return DirtyIterator{} } return DirtysegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg DirtyIterator) NextSegment() DirtyIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return DirtyIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return DirtyIterator{} } return DirtysegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg DirtyIterator) PrevGap() DirtyGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return DirtyGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg DirtyIterator) NextGap() DirtyGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return DirtyGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg DirtyIterator) PrevNonEmpty() (DirtyIterator, DirtyGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, DirtyGapIterator{} } return DirtyIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg DirtyIterator) NextNonEmpty() (DirtyIterator, DirtyGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, DirtyGapIterator{} } return DirtyIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. 
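//
// A minimal sketch (assuming a hypothetical set s) that visits every gap,
// skipping the zero-length gaps between adjacent segments:
//
//	for gap := s.FirstGap(); gap.Ok(); gap = gap.NextGap() {
//		if !gap.IsEmpty() {
//			_ = gap.Range() // a maximal range of keys not covered by any segment
//		}
//	}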
type DirtyGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *Dirtynode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (gap DirtyGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap DirtyGapIterator) Range() __generics_imported0.MappableRange { return __generics_imported0.MappableRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap DirtyGapIterator) Start() uint64 { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return dirtySetFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap DirtyGapIterator) End() uint64 { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return dirtySetFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap DirtyGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap DirtyGapIterator) PrevSegment() DirtyIterator { return DirtysegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap DirtyGapIterator) NextSegment() DirtyIterator { return DirtysegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap DirtyGapIterator) PrevGap() DirtyGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return DirtyGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap DirtyGapIterator) NextGap() DirtyGapIterator { seg := gap.NextSegment() if !seg.Ok() { return DirtyGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap DirtyGapIterator) NextLargeEnoughGap(minSize uint64) DirtyGapIterator { if DirtytrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. 
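//
// A minimal sketch of the public entry point NextLargeEnoughGap, which uses
// this helper (usable only in instantiations of this template with
// trackGaps == 1; DirtytrackGaps is 0, so these methods panic for DirtySet;
// s and minSize are hypothetical):
//
//	for gap := s.FirstLargeEnoughGap(minSize); gap.Ok(); gap = gap.NextLargeEnoughGap(minSize) {
//		_ = gap.Range() // each visited gap has length >= minSize
//	}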
func (gap DirtyGapIterator) nextLargeEnoughGapHelper(minSize uint64) DirtyGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return DirtyGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap DirtyGapIterator) PrevLargeEnoughGap(minSize uint64) DirtyGapIterator { if DirtytrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap DirtyGapIterator) prevLargeEnoughGapHelper(minSize uint64) DirtyGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return DirtyGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func DirtysegmentBeforePosition(n *Dirtynode, i int) DirtyIterator { for i == 0 { if n.parent == nil { return DirtyIterator{} } n, i = n.parent, n.parentIndex } return DirtyIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func DirtysegmentAfterPosition(n *Dirtynode, i int) DirtyIterator { for i == n.nrSegments { if n.parent == nil { return DirtyIterator{} } n, i = n.parent, n.parentIndex } return DirtyIterator{n, i} } func DirtyzeroValueSlice(slice []DirtyInfo) { for i := range slice { dirtySetFunctions{}.ClearValue(&slice[i]) } } func DirtyzeroNodeSlice(slice []*Dirtynode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. func (s *DirtySet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. 
func (n *Dirtynode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *Dirtynode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if DirtytrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type DirtyFlatSegment struct { Start uint64 End uint64 Value DirtyInfo } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *DirtySet) ExportSlice() []DirtyFlatSegment { var fs []DirtyFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, DirtyFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *DirtySet) ImportSlice(fs []DirtyFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := __generics_imported0.MappableRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
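//
// A minimal sketch (assuming a hypothetical set s expected to hold exactly
// two segments):
//
//	if err := s.segmentTestCheck(2, func(i int, r __generics_imported0.MappableRange, _ DirtyInfo) error {
//		if r.Length() == 0 {
//			return fmt.Errorf("segment %d has an empty range", i)
//		}
//		return nil
//	}); err != nil {
//		// the set's contents were not as expected
//	}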
func (s *DirtySet) segmentTestCheck(expectedSegments int, segFunc func(int, __generics_imported0.MappableRange, DirtyInfo) error) error { havePrev := false prev := uint64(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *DirtySet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *DirtySet) saveRoot() []DirtyFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *DirtySet) loadRoot(_ context.Context, fs []DirtyFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/file_range_set.go000066400000000000000000000201431465435605700255530ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fsutil import ( "fmt" "io" "math" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" ) // FileRangeSet maps offsets into a memmap.Mappable to offsets into a // memmap.File. It is used to implement Mappables that store data in // sparsely-allocated memory. // // type FileRangeSet // FileRangeSetFunctions implements segment.Functions for FileRangeSet. type FileRangeSetFunctions struct{} // MinKey implements segment.Functions.MinKey. func (FileRangeSetFunctions) MinKey() uint64 { return 0 } // MaxKey implements segment.Functions.MaxKey. func (FileRangeSetFunctions) MaxKey() uint64 { return math.MaxUint64 } // ClearValue implements segment.Functions.ClearValue. func (FileRangeSetFunctions) ClearValue(_ *uint64) { } // Merge implements segment.Functions.Merge. func (FileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) { if frstart1+mr1.Length() != frstart2 { return 0, false } return frstart1, true } // Split implements segment.Functions.Split. func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) { return frstart, frstart + (split - mr.Start) } // FileRange returns the FileRange mapped by seg. func (seg FileRangeIterator) FileRange() memmap.FileRange { return seg.FileRangeOf(seg.Range()) } // FileRangeOf returns the FileRange mapped by mr. // // Preconditions: // - seg.Range().IsSupersetOf(mr). 
// - mr.Length() != 0. func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange { frstart := seg.Value() + (mr.Start - seg.Start()) return memmap.FileRange{frstart, frstart + mr.Length()} } // PagesToFill returns the number of pages that that Fill() will allocate // for the given required and optional parameters. func (s *FileRangeSet) PagesToFill(required, optional memmap.MappableRange) uint64 { var numPages uint64 gap := s.LowerBoundGap(required.Start) for gap.Ok() && gap.Start() < required.End { gr := gap.Range().Intersect(optional) numPages += gr.Length() / hostarch.PageSize gap = gap.NextGap() } return numPages } // Fill attempts to ensure that all memmap.Mappable offsets in required are // mapped to a memmap.File offset, by allocating from mf with the given options // and invoking readAt to store data into memory. (If readAt is not nil, // opts.ReaderFunc will be overridden. If readAt returns a successful partial // read, Fill will call it repeatedly until all bytes have been read.) EOF is // handled consistently with the requirements of mmap(2): bytes after EOF on // the same page are zeroed; pages after EOF are invalid. fileSize is an upper // bound on the file's size; bytes after fileSize will be zeroed without // calling readAt. // // Fill may read offsets outside of required, but will never read offsets // outside of optional. It returns a non-nil error if any error occurs, even // if the error only affects offsets in optional, but not in required. // // Fill returns the number of pages that were allocated. // // Preconditions: // - required.Length() > 0. // - optional.IsSupersetOf(required). // - required and optional must be page-aligned. func (s *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, fileSize uint64, mf *pgalloc.MemoryFile, opts pgalloc.AllocOpts, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) (uint64, error) { gap := s.LowerBoundGap(required.Start) var pagesAlloced uint64 for gap.Ok() && gap.Start() < required.End { if gap.Range().Length() == 0 { gap = gap.NextGap() continue } gr := gap.Range().Intersect(optional) // Read data into the gap. if readAt != nil { opts.ReaderFunc = func(dsts safemem.BlockSeq) (uint64, error) { var done uint64 for !dsts.IsEmpty() { n, err := func() (uint64, error) { off := gr.Start + done if off >= fileSize { return 0, io.EOF } if off+dsts.NumBytes() > fileSize { rd := fileSize - off n, err := readAt(ctx, dsts.TakeFirst64(rd), off) if n == rd && err == nil { return n, io.EOF } return n, err } return readAt(ctx, dsts, off) }() done += n dsts = dsts.DropFirst64(n) if err != nil { if err == io.EOF { // MemoryFile.AllocateAndFill truncates down to a page // boundary, but FileRangeSet.Fill is supposed to // zero-fill to the end of the page in this case. donepgaddr, ok := hostarch.Addr(done).RoundUp() if donepg := uint64(donepgaddr); ok && donepg != done { dsts.DropFirst64(donepg - done) done = donepg if dsts.IsEmpty() { return done, nil } } } return done, err } } return done, nil } } fr, err := mf.Allocate(gr.Length(), opts) // Store anything we managed to read into the cache. if done := fr.Length(); done != 0 { gr.End = gr.Start + done pagesAlloced += gr.Length() / hostarch.PageSize gap = s.Insert(gap, gr, fr.Start).NextGap() } if err != nil { return pagesAlloced, err } } return pagesAlloced, nil } // Drop removes segments for memmap.Mappable offsets in mr, freeing the // corresponding memmap.FileRanges. 
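//
// Purely illustrative (not in the original source): assuming a caller holds
// the locks protecting s and mf, evicting the first cached page could look
// like:
//
//	pg := memmap.MappableRange{Start: 0, End: hostarch.PageSize}
//	s.Drop(pg, mf)
//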
// // Preconditions: mr must be page-aligned. func (s *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) { seg := s.LowerBoundSegment(mr.Start) for seg.Ok() && seg.Start() < mr.End { seg = s.Isolate(seg, mr) mf.DecRef(seg.FileRange()) seg = s.Remove(seg).NextSegment() } } // DropAll removes all segments in mr, freeing the corresponding // memmap.FileRanges. It returns the number of pages freed. func (s *FileRangeSet) DropAll(mf *pgalloc.MemoryFile) uint64 { var pagesFreed uint64 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { mf.DecRef(seg.FileRange()) pagesFreed += seg.Range().Length() / hostarch.PageSize } s.RemoveAll() return pagesFreed } // Truncate updates s to reflect Mappable truncation to the given length: // bytes after the new EOF on the same page are zeroed, and pages after the new // EOF are freed. It returns the number of pages freed. func (s *FileRangeSet) Truncate(end uint64, mf *pgalloc.MemoryFile) uint64 { var pagesFreed uint64 pgendaddr, ok := hostarch.Addr(end).RoundUp() if ok { pgend := uint64(pgendaddr) // Free truncated pages. seg := s.LowerBoundSegmentSplitBefore(pgend) for seg.Ok() { mf.DecRef(seg.FileRange()) pagesFreed += seg.Range().Length() / hostarch.PageSize seg = s.Remove(seg).NextSegment() } if end == pgend { return pagesFreed } } // Here we know end < end.RoundUp(). If the new EOF lands in the // middle of a page that we have, zero out its contents beyond the new // length. seg := s.FindSegment(end) if seg.Ok() { fr := seg.FileRange() fr.Start += end - seg.Start() ims, err := mf.MapInternal(fr, hostarch.Write) if err != nil { // There's no good recourse from here. This means // that we can't keep cached memory consistent with // the new end of file. The caller may have already // updated the file size on their backing file system. // // We don't want to risk blindly continuing onward, // so in the extremely rare cases this does happen, // we abandon ship. panic(fmt.Sprintf("Failed to map %v: %v", fr, err)) } if _, err := safemem.ZeroSeq(ims); err != nil { panic(fmt.Sprintf("Zeroing %v failed: %v", fr, err)) } } return pagesFreed } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/file_range_set_impl.go000066400000000000000000002052371465435605700266050ustar00rootroot00000000000000package fsutil import ( __generics_imported0 "gvisor.dev/gvisor/pkg/sentry/memmap" ) import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const FileRangetrackGaps = 0 var _ = uint8(FileRangetrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type FileRangedynamicGap [FileRangetrackGaps]uint64 // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *FileRangedynamicGap) Get() uint64 { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *FileRangedynamicGap) Set(v uint64) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. 
// // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. FileRangeminDegree = 3 FileRangemaxDegree = 2 * FileRangeminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type FileRangeSet struct { root FileRangenode `state:".([]FileRangeFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *FileRangeSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *FileRangeSet) IsEmptyRange(r __generics_imported0.MappableRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *FileRangeSet) Span() uint64 { var sz uint64 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *FileRangeSet) SpanRange(r __generics_imported0.MappableRange) uint64 { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uint64 for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *FileRangeSet) FirstSegment() FileRangeIterator { if s.root.nrSegments == 0 { return FileRangeIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *FileRangeSet) LastSegment() FileRangeIterator { if s.root.nrSegments == 0 { return FileRangeIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *FileRangeSet) FirstGap() FileRangeGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return FileRangeGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *FileRangeSet) LastGap() FileRangeGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return FileRangeGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *FileRangeSet) Find(key uint64) (FileRangeIterator, FileRangeGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return FileRangeIterator{n, i}, FileRangeGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return FileRangeIterator{}, FileRangeGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. 
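// Illustrative sketch (not part of the original source; off is a hypothetical
// offset): looking up the cached page backing an offset might look like:
//
//	if seg := s.FindSegment(off); seg.Ok() {
//		fr := seg.FileRange() // the backing memmap.FileRange
//		_ = fr
//	} // otherwise off lies in a gap and nothing is cached there.
//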
func (s *FileRangeSet) FindSegment(key uint64) FileRangeIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *FileRangeSet) LowerBoundSegment(min uint64) FileRangeIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *FileRangeSet) UpperBoundSegment(max uint64) FileRangeIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *FileRangeSet) FindGap(key uint64) FileRangeGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *FileRangeSet) LowerBoundGap(min uint64) FileRangeGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *FileRangeSet) UpperBoundGap(max uint64) FileRangeGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *FileRangeSet) FirstLargeEnoughGap(minSize uint64) FileRangeGapIterator { if FileRangetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *FileRangeSet) LastLargeEnoughGap(minSize uint64) FileRangeGapIterator { if FileRangetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *FileRangeSet) LowerBoundLargeEnoughGap(min, minSize uint64) FileRangeGapIterator { if FileRangetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. 
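// Purely illustrative (not in the original source): in an instantiation
// generated with trackGaps == 1, a caller could locate a free extent of at
// least minSize bytes containing a key at or below max with:
//
//	if gap := s.UpperBoundLargeEnoughGap(max, minSize); gap.Ok() {
//		// gap.Range().Length() >= minSize here.
//	}
//
// Note that this FileRangeSet instantiation has FileRangetrackGaps == 0, so
// these gap-tracking helpers panic if called on it.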
func (s *FileRangeSet) UpperBoundLargeEnoughGap(max, minSize uint64) FileRangeGapIterator { if FileRangetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. func (s *FileRangeSet) Insert(gap FileRangeGapIterator, r __generics_imported0.MappableRange, val uint64) FileRangeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (FileRangeSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := FileRangetrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (FileRangeSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (FileRangeSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := FileRangetrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *FileRangeSet) InsertWithoutMerging(gap FileRangeGapIterator, r __generics_imported0.MappableRange, val uint64) FileRangeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). 
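// For illustration (not part of the original source; frStart is a hypothetical
// file offset): the checked Insert defined earlier is typically used once a
// suitable gap has been found, and will merge with neighbors whose file
// offsets are contiguous per FileRangeSetFunctions.Merge:
//
//	mr := memmap.MappableRange{Start: 0, End: hostarch.PageSize}
//	if gap := s.FindGap(mr.Start); gap.Ok() && gap.Range().IsSupersetOf(mr) {
//		seg := s.Insert(gap, mr, frStart)
//		_ = seg
//	}
//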
func (s *FileRangeSet) InsertWithoutMergingUnchecked(gap FileRangeGapIterator, r __generics_imported0.MappableRange, val uint64) FileRangeIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := FileRangetrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return FileRangeIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. // // InsertRange searches the set to find the gap to insert into. If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *FileRangeSet) InsertRange(r __generics_imported0.MappableRange, val uint64) FileRangeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *FileRangeSet) InsertWithoutMergingRange(r __generics_imported0.MappableRange, val uint64) FileRangeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. 
If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *FileRangeSet) TryInsertRange(r __generics_imported0.MappableRange, val uint64) FileRangeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return FileRangeIterator{} } if gap.End() < r.End { return FileRangeIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. func (s *FileRangeSet) TryInsertWithoutMergingRange(r __generics_imported0.MappableRange, val uint64) FileRangeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return FileRangeIterator{} } if gap.End() < r.End { return FileRangeIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *FileRangeSet) Remove(seg FileRangeIterator) FileRangeGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if FileRangetrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) FileRangeSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if FileRangetrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(FileRangeGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *FileRangeSet) RemoveAll() { s.root = FileRangenode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *FileRangeSet) RemoveRange(r __generics_imported0.MappableRange) FileRangeGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. 
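// Illustrative only (not part of the original source; start and end are
// hypothetical page-aligned offsets): callers that do not require every key in
// the range to be populated typically use RemoveRange instead:
//
//	gap := s.RemoveRange(memmap.MappableRange{Start: start, End: end})
//	_ = gap
//
// RemoveFullRange below additionally panics if any key in the range was not
// covered by a segment.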
func (s *FileRangeSet) RemoveFullRange(r __generics_imported0.MappableRange) FileRangeGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *FileRangeSet) Merge(first, second FileRangeIterator) FileRangeIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. Otherwise, MergeUnchecked returns a terminal // iterator. // // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *FileRangeSet) MergeUnchecked(first, second FileRangeIterator) FileRangeIterator { if first.End() == second.Start() { if mval, ok := (FileRangeSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return FileRangeIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *FileRangeSet) MergePrev(seg FileRangeIterator) FileRangeIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *FileRangeSet) MergeNext(seg FileRangeIterator) FileRangeIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. 
// // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *FileRangeSet) Unisolate(seg FileRangeIterator) FileRangeIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *FileRangeSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. // // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *FileRangeSet) MergeInsideRange(r __generics_imported0.MappableRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *FileRangeSet) MergeOutsideRange(r __generics_imported0.MappableRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *FileRangeSet) Split(seg FileRangeIterator, split uint64) (FileRangeIterator, FileRangeIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). 
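// As a hedged illustration (not in the original source; seg is assumed to
// cover [0, 2*PageSize)): using the checked Split defined above, the resulting
// values follow FileRangeSetFunctions.Split, so the second half maps to the
// original file offset plus PageSize:
//
//	first, second := s.Split(seg, hostarch.PageSize)
//	// first.Range() == [0, PageSize), second.Range() == [PageSize, 2*PageSize)
//	// second.Value() == first.Value() + hostarch.PageSize
//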
func (s *FileRangeSet) SplitUnchecked(seg FileRangeIterator, split uint64) (FileRangeIterator, FileRangeIterator) { val1, val2 := (FileRangeSetFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.MappableRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter; i.e. SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first. // // Preconditions: start < seg.End(). func (s *FileRangeSet) SplitBefore(seg FileRangeIterator, start uint64) FileRangeIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *FileRangeSet) SplitAfter(seg FileRangeIterator, end uint64) FileRangeIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *FileRangeSet) Isolate(seg FileRangeIterator, r __generics_imported0.MappableRange) FileRangeIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. 
In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *FileRangeSet) LowerBoundSegmentSplitBefore(min uint64) FileRangeIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *FileRangeSet) UpperBoundSegmentSplitAfter(max uint64) FileRangeIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. func (s *FileRangeSet) VisitRange(r __generics_imported0.MappableRange, f func(seg FileRangeIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *FileRangeSet) VisitFullRange(r __generics_imported0.MappableRange, f func(seg FileRangeIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *FileRangeSet) MutateRange(r __generics_imported0.MappableRange, f func(seg FileRangeIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. 
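// A usage sketch (not part of the original source; r is a hypothetical
// MappableRange): the MutateRange variant above visits only data inside r,
// splitting boundary segments so the callback never sees bytes outside it:
//
//	s.MutateRange(r, func(seg FileRangeIterator) bool {
//		// seg.Range() is entirely contained in r here.
//		return true // keep iterating
//	})
//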
func (s *FileRangeSet) MutateFullRange(r __generics_imported0.MappableRange, f func(seg FileRangeIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type FileRangenode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *FileRangenode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. maxGap FileRangedynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [FileRangemaxDegree - 1]__generics_imported0.MappableRange values [FileRangemaxDegree - 1]uint64 children [FileRangemaxDegree]*FileRangenode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *FileRangenode) firstSegment() FileRangeIterator { for n.hasChildren { n = n.children[0] } return FileRangeIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *FileRangenode) lastSegment() FileRangeIterator { for n.hasChildren { n = n.children[n.nrSegments] } return FileRangeIterator{n, n.nrSegments - 1} } func (n *FileRangenode) prevSibling() *FileRangenode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *FileRangenode) nextSibling() *FileRangenode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. 
func (n *FileRangenode) rebalanceBeforeInsert(gap FileRangeGapIterator) FileRangeGapIterator { if n.nrSegments < FileRangemaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &FileRangenode{ nrSegments: FileRangeminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &FileRangenode{ nrSegments: FileRangeminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:FileRangeminDegree-1], n.keys[:FileRangeminDegree-1]) copy(left.values[:FileRangeminDegree-1], n.values[:FileRangeminDegree-1]) copy(right.keys[:FileRangeminDegree-1], n.keys[FileRangeminDegree:]) copy(right.values[:FileRangeminDegree-1], n.values[FileRangeminDegree:]) n.keys[0], n.values[0] = n.keys[FileRangeminDegree-1], n.values[FileRangeminDegree-1] FileRangezeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:FileRangeminDegree], n.children[:FileRangeminDegree]) copy(right.children[:FileRangeminDegree], n.children[FileRangeminDegree:]) FileRangezeroNodeSlice(n.children[2:]) for i := 0; i < FileRangeminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if FileRangetrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < FileRangeminDegree { return FileRangeGapIterator{left, gap.index} } return FileRangeGapIterator{right, gap.index - FileRangeminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[FileRangeminDegree-1], n.values[FileRangeminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &FileRangenode{ nrSegments: FileRangeminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ copy(sibling.keys[:FileRangeminDegree-1], n.keys[FileRangeminDegree:]) copy(sibling.values[:FileRangeminDegree-1], n.values[FileRangeminDegree:]) FileRangezeroValueSlice(n.values[FileRangeminDegree-1:]) if n.hasChildren { copy(sibling.children[:FileRangeminDegree], n.children[FileRangeminDegree:]) FileRangezeroNodeSlice(n.children[FileRangeminDegree:]) for i := 0; i < FileRangeminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = FileRangeminDegree - 1 if FileRangetrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < FileRangeminDegree { return gap } return FileRangeGapIterator{sibling, gap.index - FileRangeminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. 
func (n *FileRangenode) rebalanceAfterRemove(gap FileRangeGapIterator) FileRangeGapIterator { for { if n.nrSegments >= FileRangeminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= FileRangeminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] FileRangeSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if FileRangetrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return FileRangeGapIterator{n, 0} } if gap.node == n { return FileRangeGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= FileRangeminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) FileRangeSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if FileRangetrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return FileRangeGapIterator{n, n.nrSegments} } return FileRangeGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return FileRangeGapIterator{p, gap.index} } if gap.node == right { return FileRangeGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. 
This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *FileRangenode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = FileRangeGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) FileRangeSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if FileRangetrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *FileRangenode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *FileRangenode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. func (n *FileRangenode) calculateMaxGapLeaf() uint64 { max := FileRangeGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (FileRangeGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. func (n *FileRangenode) calculateMaxGapInternal() uint64 { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. 
func (n *FileRangenode) searchFirstLargeEnoughGap(minSize uint64) FileRangeGapIterator { if n.maxGap.Get() < minSize { return FileRangeGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := FileRangeGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *FileRangenode) searchLastLargeEnoughGap(minSize uint64) FileRangeGapIterator { if n.maxGap.Get() < minSize { return FileRangeGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := FileRangeGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type FileRangeIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *FileRangenode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg FileRangeIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg FileRangeIterator) Range() __generics_imported0.MappableRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg FileRangeIterator) Start() uint64 { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg FileRangeIterator) End() uint64 { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. // - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg FileRangeIterator) SetRangeUnchecked(r __generics_imported0.MappableRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. 
func (seg FileRangeIterator) SetRange(r __generics_imported0.MappableRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg FileRangeIterator) SetStartUnchecked(start uint64) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg FileRangeIterator) SetStart(start uint64) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg FileRangeIterator) SetEndUnchecked(end uint64) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg FileRangeIterator) SetEnd(end uint64) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg FileRangeIterator) Value() uint64 { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg FileRangeIterator) ValuePtr() *uint64 { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. func (seg FileRangeIterator) SetValue(val uint64) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. 
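//
// Illustrative sketch (set is an assumed *FileRangeSet): PrevSegment supports
// iteration in order of decreasing keys, mirroring the
// FirstSegment/NextSegment loops used elsewhere in this file:
//
//	for seg := set.LastSegment(); seg.Ok(); seg = seg.PrevSegment() {
//		// Visit seg.Range() and seg.Value() in reverse order.
//	}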
func (seg FileRangeIterator) PrevSegment() FileRangeIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return FileRangeIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return FileRangeIterator{} } return FileRangesegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg FileRangeIterator) NextSegment() FileRangeIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return FileRangeIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return FileRangeIterator{} } return FileRangesegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg FileRangeIterator) PrevGap() FileRangeGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return FileRangeGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg FileRangeIterator) NextGap() FileRangeGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return FileRangeGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg FileRangeIterator) PrevNonEmpty() (FileRangeIterator, FileRangeGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, FileRangeGapIterator{} } return FileRangeIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg FileRangeIterator) NextNonEmpty() (FileRangeIterator, FileRangeGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, FileRangeGapIterator{} } return FileRangeIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type FileRangeGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). 
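	//
	// For example (illustrative only): in a leaf node holding two segments,
	// index 0 is the gap before the first segment, index 1 the gap between
	// the two segments, and index 2 the gap after the last segment, so a node
	// with nrSegments segments has nrSegments+1 gap positions.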
node *FileRangenode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (gap FileRangeGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap FileRangeGapIterator) Range() __generics_imported0.MappableRange { return __generics_imported0.MappableRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap FileRangeGapIterator) Start() uint64 { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return FileRangeSetFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap FileRangeGapIterator) End() uint64 { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return FileRangeSetFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap FileRangeGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap FileRangeGapIterator) PrevSegment() FileRangeIterator { return FileRangesegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap FileRangeGapIterator) NextSegment() FileRangeIterator { return FileRangesegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap FileRangeGapIterator) PrevGap() FileRangeGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return FileRangeGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap FileRangeGapIterator) NextGap() FileRangeGapIterator { seg := gap.NextSegment() if !seg.Ok() { return FileRangeGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap FileRangeGapIterator) NextLargeEnoughGap(minSize uint64) FileRangeGapIterator { if FileRangetrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. 
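//
// Illustrative sketch (only meaningful for an instantiation of this template
// with trackGaps = 1; set and minSize are assumed): callers normally use the
// exported NextLargeEnoughGap to visit every sufficiently large gap in
// ascending key order:
//
//	for gap := set.FirstLargeEnoughGap(minSize); gap.Ok(); gap = gap.NextLargeEnoughGap(minSize) {
//		// Each visited gap satisfies gap.Range().Length() >= minSize.
//	}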
func (gap FileRangeGapIterator) nextLargeEnoughGapHelper(minSize uint64) FileRangeGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return FileRangeGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap FileRangeGapIterator) PrevLargeEnoughGap(minSize uint64) FileRangeGapIterator { if FileRangetrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap FileRangeGapIterator) prevLargeEnoughGapHelper(minSize uint64) FileRangeGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return FileRangeGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func FileRangesegmentBeforePosition(n *FileRangenode, i int) FileRangeIterator { for i == 0 { if n.parent == nil { return FileRangeIterator{} } n, i = n.parent, n.parentIndex } return FileRangeIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func FileRangesegmentAfterPosition(n *FileRangenode, i int) FileRangeIterator { for i == n.nrSegments { if n.parent == nil { return FileRangeIterator{} } n, i = n.parent, n.parentIndex } return FileRangeIterator{n, i} } func FileRangezeroValueSlice(slice []uint64) { for i := range slice { FileRangeSetFunctions{}.ClearValue(&slice[i]) } } func FileRangezeroNodeSlice(slice []*FileRangenode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. 
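//
// Illustrative usage (s is an assumed *FileRangeSet): the result is a
// multi-line dump of the underlying B-tree, one line per segment
// (key => value) plus WARNING lines for any detected structural
// inconsistencies, suitable for logging while debugging:
//
//	fmt.Printf("file range set:\n%s", s.String())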
func (s *FileRangeSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. func (n *FileRangenode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *FileRangenode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if FileRangetrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type FileRangeFlatSegment struct { Start uint64 End uint64 Value uint64 } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *FileRangeSet) ExportSlice() []FileRangeFlatSegment { var fs []FileRangeFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, FileRangeFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *FileRangeSet) ImportSlice(fs []FileRangeFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := __generics_imported0.MappableRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
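//
// Illustrative sketch (hypothetical test code; expected is an assumed
// []uint64 of per-segment values): checking both the segment count and each
// segment's value:
//
//	err := s.segmentTestCheck(len(expected), func(i int, r __generics_imported0.MappableRange, v uint64) error {
//		if v != expected[i] {
//			return fmt.Errorf("segment %d (%v): got value %d, want %d", i, r, v, expected[i])
//		}
//		return nil
//	})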
func (s *FileRangeSet) segmentTestCheck(expectedSegments int, segFunc func(int, __generics_imported0.MappableRange, uint64) error) error { havePrev := false prev := uint64(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *FileRangeSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *FileRangeSet) saveRoot() []FileRangeFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *FileRangeSet) loadRoot(_ context.Context, fs []FileRangeFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/frame_ref_set.go000066400000000000000000000064471465435605700254210ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fsutil import ( "math" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" ) // FrameRefSegInfo holds reference count and memory cgroup id of the segment. type FrameRefSegInfo struct { // refs indicates the reference count of the segment. refs uint64 // memCgID is the memory cgroup id of the first task which touches the // segment. This will not be changed over the lifetime of the segment. memCgID uint32 } // FrameRefSetFunctions implements segment.Functions for FrameRefSet. type FrameRefSetFunctions struct{} // MinKey implements segment.Functions.MinKey. func (FrameRefSetFunctions) MinKey() uint64 { return 0 } // MaxKey implements segment.Functions.MaxKey. func (FrameRefSetFunctions) MaxKey() uint64 { return math.MaxUint64 } // ClearValue implements segment.Functions.ClearValue. func (FrameRefSetFunctions) ClearValue(val *FrameRefSegInfo) { } // Merge implements segment.Functions.Merge. func (FrameRefSetFunctions) Merge(_ memmap.FileRange, val1 FrameRefSegInfo, _ memmap.FileRange, val2 FrameRefSegInfo) (FrameRefSegInfo, bool) { if val1 != val2 { return FrameRefSegInfo{}, false } return val1, true } // Split implements segment.Functions.Split. func (FrameRefSetFunctions) Split(_ memmap.FileRange, val FrameRefSegInfo, _ uint64) (FrameRefSegInfo, FrameRefSegInfo) { return val, val } // IncRefAndAccount adds a reference on the range fr. All newly inserted segments // are accounted as host page cache memory mappings. 
The new segments will be // associated with the memCgID, if the segment already exists then the memCgID // will not be changed. func (s *FrameRefSet) IncRefAndAccount(fr memmap.FileRange, memCgID uint32) { seg, gap := s.Find(fr.Start) for { switch { case seg.Ok() && seg.Start() < fr.End: seg = s.Isolate(seg, fr) seg.ValuePtr().refs++ seg, gap = seg.NextNonEmpty() case gap.Ok() && gap.Start() < fr.End: newRange := gap.Range().Intersect(fr) usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped, memCgID) frInfo := FrameRefSegInfo{refs: 1, memCgID: memCgID} seg, gap = s.InsertWithoutMerging(gap, newRange, frInfo).NextNonEmpty() default: s.MergeOutsideRange(fr) return } } } // DecRefAndAccount removes a reference on the range fr and untracks segments // that are removed from memory accounting. func (s *FrameRefSet) DecRefAndAccount(fr memmap.FileRange) { seg := s.FindSegment(fr.Start) for seg.Ok() && seg.Start() < fr.End { seg = s.Isolate(seg, fr) if old := seg.ValuePtr().refs; old == 1 { usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped, seg.ValuePtr().memCgID) seg = s.Remove(seg).NextSegment() } else { seg.ValuePtr().refs-- seg = seg.NextSegment() } } s.MergeOutsideRange(fr) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/frame_ref_set_impl.go000066400000000000000000002045151465435605700264360ustar00rootroot00000000000000package fsutil import ( __generics_imported0 "gvisor.dev/gvisor/pkg/sentry/memmap" ) import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const FrameReftrackGaps = 0 var _ = uint8(FrameReftrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type FrameRefdynamicGap [FrameReftrackGaps]uint64 // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *FrameRefdynamicGap) Get() uint64 { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *FrameRefdynamicGap) Set(v uint64) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. FrameRefminDegree = 3 FrameRefmaxDegree = 2 * FrameRefminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type FrameRefSet struct { root FrameRefnode `state:".([]FrameRefFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *FrameRefSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. 
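//
// Illustrative sketch (refs is an assumed *FrameRefSet with hypothetical
// offsets): checking that no frame in a file range still holds a reference:
//
//	fr := __generics_imported0.FileRange{Start: 0, End: 0x4000}
//	if refs.IsEmptyRange(fr) {
//		// No segment in refs overlaps fr, i.e. no outstanding references.
//	}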
func (s *FrameRefSet) IsEmptyRange(r __generics_imported0.FileRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *FrameRefSet) Span() uint64 { var sz uint64 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *FrameRefSet) SpanRange(r __generics_imported0.FileRange) uint64 { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uint64 for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *FrameRefSet) FirstSegment() FrameRefIterator { if s.root.nrSegments == 0 { return FrameRefIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *FrameRefSet) LastSegment() FrameRefIterator { if s.root.nrSegments == 0 { return FrameRefIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *FrameRefSet) FirstGap() FrameRefGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return FrameRefGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *FrameRefSet) LastGap() FrameRefGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return FrameRefGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *FrameRefSet) Find(key uint64) (FrameRefIterator, FrameRefGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return FrameRefIterator{n, i}, FrameRefGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return FrameRefIterator{}, FrameRefGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *FrameRefSet) FindSegment(key uint64) FrameRefIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *FrameRefSet) LowerBoundSegment(min uint64) FrameRefIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *FrameRefSet) UpperBoundSegment(max uint64) FrameRefIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. 
If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *FrameRefSet) FindGap(key uint64) FrameRefGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *FrameRefSet) LowerBoundGap(min uint64) FrameRefGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *FrameRefSet) UpperBoundGap(max uint64) FrameRefGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *FrameRefSet) FirstLargeEnoughGap(minSize uint64) FrameRefGapIterator { if FrameReftrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *FrameRefSet) LastLargeEnoughGap(minSize uint64) FrameRefGapIterator { if FrameReftrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *FrameRefSet) LowerBoundLargeEnoughGap(min, minSize uint64) FrameRefGapIterator { if FrameReftrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *FrameRefSet) UpperBoundLargeEnoughGap(max, minSize uint64) FrameRefGapIterator { if FrameReftrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. 
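//
// Illustrative sketch (refs, fr, and memCgID are assumed caller-provided
// values, mirroring IncRefAndAccount in frame_ref_set.go): inserting a first
// reference for a range known to lie entirely within a gap:
//
//	if seg, gap := refs.Find(fr.Start); !seg.Ok() && gap.Range().IsSupersetOf(fr) {
//		refs.Insert(gap, fr, FrameRefSegInfo{refs: 1, memCgID: memCgID})
//	}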
func (s *FrameRefSet) Insert(gap FrameRefGapIterator, r __generics_imported0.FileRange, val FrameRefSegInfo) FrameRefIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (FrameRefSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := FrameReftrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (FrameRefSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (FrameRefSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := FrameReftrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *FrameRefSet) InsertWithoutMerging(gap FrameRefGapIterator, r __generics_imported0.FileRange, val FrameRefSegInfo) FrameRefIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *FrameRefSet) InsertWithoutMergingUnchecked(gap FrameRefGapIterator, r __generics_imported0.FileRange, val FrameRefSegInfo) FrameRefIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := FrameReftrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return FrameRefIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. 
// // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. // // InsertRange searches the set to find the gap to insert into. If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *FrameRefSet) InsertRange(r __generics_imported0.FileRange, val FrameRefSegInfo) FrameRefIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *FrameRefSet) InsertWithoutMergingRange(r __generics_imported0.FileRange, val FrameRefSegInfo) FrameRefIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *FrameRefSet) TryInsertRange(r __generics_imported0.FileRange, val FrameRefSegInfo) FrameRefIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return FrameRefIterator{} } if gap.End() < r.End { return FrameRefIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. 
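//
// Illustrative sketch (refs, fr, and info are assumed caller-provided
// values): attempting an insertion and detecting a conflict without
// panicking:
//
//	if seg := refs.TryInsertRange(fr, info); !seg.Ok() {
//		// fr overlaps an existing segment; nothing was inserted.
//	}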
func (s *FrameRefSet) TryInsertWithoutMergingRange(r __generics_imported0.FileRange, val FrameRefSegInfo) FrameRefIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return FrameRefIterator{} } if gap.End() < r.End { return FrameRefIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *FrameRefSet) Remove(seg FrameRefIterator) FrameRefGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if FrameReftrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) FrameRefSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if FrameReftrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(FrameRefGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *FrameRefSet) RemoveAll() { s.root = FrameRefnode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *FrameRefSet) RemoveRange(r __generics_imported0.FileRange) FrameRefGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *FrameRefSet) RemoveFullRange(r __generics_imported0.FileRange) FrameRefGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *FrameRefSet) Merge(first, second FrameRefIterator) FrameRefIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. 
Otherwise, MergeUnchecked returns a terminal // iterator. // // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *FrameRefSet) MergeUnchecked(first, second FrameRefIterator) FrameRefIterator { if first.End() == second.Start() { if mval, ok := (FrameRefSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return FrameRefIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *FrameRefSet) MergePrev(seg FrameRefIterator) FrameRefIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *FrameRefSet) MergeNext(seg FrameRefIterator) FrameRefIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *FrameRefSet) Unisolate(seg FrameRefIterator) FrameRefIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *FrameRefSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. 
// // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *FrameRefSet) MergeInsideRange(r __generics_imported0.FileRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *FrameRefSet) MergeOutsideRange(r __generics_imported0.FileRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *FrameRefSet) Split(seg FrameRefIterator, split uint64) (FrameRefIterator, FrameRefIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *FrameRefSet) SplitUnchecked(seg FrameRefIterator, split uint64) (FrameRefIterator, FrameRefIterator) { val1, val2 := (FrameRefSetFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.FileRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter; i.e. 
SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first. // // Preconditions: start < seg.End(). func (s *FrameRefSet) SplitBefore(seg FrameRefIterator, start uint64) FrameRefIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *FrameRefSet) SplitAfter(seg FrameRefIterator, end uint64) FrameRefIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *FrameRefSet) Isolate(seg FrameRefIterator, r __generics_imported0.FileRange) FrameRefIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *FrameRefSet) LowerBoundSegmentSplitBefore(min uint64) FrameRefIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *FrameRefSet) UpperBoundSegmentSplitAfter(max uint64) FrameRefIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. 
f must not invalidate iterators into s. func (s *FrameRefSet) VisitRange(r __generics_imported0.FileRange, f func(seg FrameRefIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *FrameRefSet) VisitFullRange(r __generics_imported0.FileRange, f func(seg FrameRefIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *FrameRefSet) MutateRange(r __generics_imported0.FileRange, f func(seg FrameRefIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *FrameRefSet) MutateFullRange(r __generics_imported0.FileRange, f func(seg FrameRefIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type FrameRefnode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *FrameRefnode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. 
If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. maxGap FrameRefdynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [FrameRefmaxDegree - 1]__generics_imported0.FileRange values [FrameRefmaxDegree - 1]FrameRefSegInfo children [FrameRefmaxDegree]*FrameRefnode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *FrameRefnode) firstSegment() FrameRefIterator { for n.hasChildren { n = n.children[0] } return FrameRefIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *FrameRefnode) lastSegment() FrameRefIterator { for n.hasChildren { n = n.children[n.nrSegments] } return FrameRefIterator{n, n.nrSegments - 1} } func (n *FrameRefnode) prevSibling() *FrameRefnode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *FrameRefnode) nextSibling() *FrameRefnode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *FrameRefnode) rebalanceBeforeInsert(gap FrameRefGapIterator) FrameRefGapIterator { if n.nrSegments < FrameRefmaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &FrameRefnode{ nrSegments: FrameRefminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &FrameRefnode{ nrSegments: FrameRefminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:FrameRefminDegree-1], n.keys[:FrameRefminDegree-1]) copy(left.values[:FrameRefminDegree-1], n.values[:FrameRefminDegree-1]) copy(right.keys[:FrameRefminDegree-1], n.keys[FrameRefminDegree:]) copy(right.values[:FrameRefminDegree-1], n.values[FrameRefminDegree:]) n.keys[0], n.values[0] = n.keys[FrameRefminDegree-1], n.values[FrameRefminDegree-1] FrameRefzeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:FrameRefminDegree], n.children[:FrameRefminDegree]) copy(right.children[:FrameRefminDegree], n.children[FrameRefminDegree:]) FrameRefzeroNodeSlice(n.children[2:]) for i := 0; i < FrameRefminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if FrameReftrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < FrameRefminDegree { return FrameRefGapIterator{left, gap.index} } return FrameRefGapIterator{right, gap.index - FrameRefminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[FrameRefminDegree-1], n.values[FrameRefminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i 
:= n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &FrameRefnode{ nrSegments: FrameRefminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ copy(sibling.keys[:FrameRefminDegree-1], n.keys[FrameRefminDegree:]) copy(sibling.values[:FrameRefminDegree-1], n.values[FrameRefminDegree:]) FrameRefzeroValueSlice(n.values[FrameRefminDegree-1:]) if n.hasChildren { copy(sibling.children[:FrameRefminDegree], n.children[FrameRefminDegree:]) FrameRefzeroNodeSlice(n.children[FrameRefminDegree:]) for i := 0; i < FrameRefminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = FrameRefminDegree - 1 if FrameReftrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < FrameRefminDegree { return gap } return FrameRefGapIterator{sibling, gap.index - FrameRefminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *FrameRefnode) rebalanceAfterRemove(gap FrameRefGapIterator) FrameRefGapIterator { for { if n.nrSegments >= FrameRefminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= FrameRefminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] FrameRefSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if FrameReftrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return FrameRefGapIterator{n, 0} } if gap.node == n { return FrameRefGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= FrameRefminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) FrameRefSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if FrameReftrackGaps != 0 { n.updateMaxGapLocal() 
sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return FrameRefGapIterator{n, n.nrSegments} } return FrameRefGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return FrameRefGapIterator{p, gap.index} } if gap.node == right { return FrameRefGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *FrameRefnode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = FrameRefGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) FrameRefSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if FrameReftrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *FrameRefnode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. 
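//
// Note: the rebalancing paths above (rebalanceBeforeInsert and
// rebalanceAfterRemove) use updateMaxGapLocal rather than updateMaxGapLeaf
// because they only redistribute existing segments between nodes: the set of
// gaps is unchanged, so ancestors' maxGap values do not need to be
// recomputed.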
func (n *FrameRefnode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. func (n *FrameRefnode) calculateMaxGapLeaf() uint64 { max := FrameRefGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (FrameRefGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. func (n *FrameRefnode) calculateMaxGapInternal() uint64 { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *FrameRefnode) searchFirstLargeEnoughGap(minSize uint64) FrameRefGapIterator { if n.maxGap.Get() < minSize { return FrameRefGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := FrameRefGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *FrameRefnode) searchLastLargeEnoughGap(minSize uint64) FrameRefGapIterator { if n.maxGap.Get() < minSize { return FrameRefGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := FrameRefGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type FrameRefIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *FrameRefnode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg FrameRefIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg FrameRefIterator) Range() __generics_imported0.FileRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg FrameRefIterator) Start() uint64 { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. 
func (seg FrameRefIterator) End() uint64 { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. // - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg FrameRefIterator) SetRangeUnchecked(r __generics_imported0.FileRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. func (seg FrameRefIterator) SetRange(r __generics_imported0.FileRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg FrameRefIterator) SetStartUnchecked(start uint64) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg FrameRefIterator) SetStart(start uint64) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg FrameRefIterator) SetEndUnchecked(end uint64) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg FrameRefIterator) SetEnd(end uint64) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg FrameRefIterator) Value() FrameRefSegInfo { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. 
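//
// The returned pointer may be used to mutate the value in place; as with
// SetValue, doing so does not cause adjacent segments to be merged or split.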
func (seg FrameRefIterator) ValuePtr() *FrameRefSegInfo { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. func (seg FrameRefIterator) SetValue(val FrameRefSegInfo) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. func (seg FrameRefIterator) PrevSegment() FrameRefIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return FrameRefIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return FrameRefIterator{} } return FrameRefsegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg FrameRefIterator) NextSegment() FrameRefIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return FrameRefIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return FrameRefIterator{} } return FrameRefsegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg FrameRefIterator) PrevGap() FrameRefGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return FrameRefGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg FrameRefIterator) NextGap() FrameRefGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return FrameRefGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg FrameRefIterator) PrevNonEmpty() (FrameRefIterator, FrameRefGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, FrameRefGapIterator{} } return FrameRefIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg FrameRefIterator) NextNonEmpty() (FrameRefIterator, FrameRefGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, FrameRefGapIterator{} } return FrameRefIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. 
// The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type FrameRefGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *FrameRefnode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (gap FrameRefGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap FrameRefGapIterator) Range() __generics_imported0.FileRange { return __generics_imported0.FileRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap FrameRefGapIterator) Start() uint64 { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return FrameRefSetFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap FrameRefGapIterator) End() uint64 { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return FrameRefSetFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap FrameRefGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap FrameRefGapIterator) PrevSegment() FrameRefIterator { return FrameRefsegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap FrameRefGapIterator) NextSegment() FrameRefIterator { return FrameRefsegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap FrameRefGapIterator) PrevGap() FrameRefGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return FrameRefGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap FrameRefGapIterator) NextGap() FrameRefGapIterator { seg := gap.NextSegment() if !seg.Ok() { return FrameRefGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap FrameRefGapIterator) NextLargeEnoughGap(minSize uint64) FrameRefGapIterator { if FrameReftrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. 
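//
// The helper climbs out of any subtree that cannot contain a gap of at least
// minSize (as indicated by maxGap), or any leaf whose gaps to the right are
// exhausted, then scans the remaining positions in the current node,
// descending into children via searchFirstLargeEnoughGap.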
func (gap FrameRefGapIterator) nextLargeEnoughGapHelper(minSize uint64) FrameRefGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return FrameRefGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap FrameRefGapIterator) PrevLargeEnoughGap(minSize uint64) FrameRefGapIterator { if FrameReftrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap FrameRefGapIterator) prevLargeEnoughGapHelper(minSize uint64) FrameRefGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return FrameRefGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func FrameRefsegmentBeforePosition(n *FrameRefnode, i int) FrameRefIterator { for i == 0 { if n.parent == nil { return FrameRefIterator{} } n, i = n.parent, n.parentIndex } return FrameRefIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func FrameRefsegmentAfterPosition(n *FrameRefnode, i int) FrameRefIterator { for i == n.nrSegments { if n.parent == nil { return FrameRefIterator{} } n, i = n.parent, n.parentIndex } return FrameRefIterator{n, i} } func FrameRefzeroValueSlice(slice []FrameRefSegInfo) { for i := range slice { FrameRefSetFunctions{}.ClearValue(&slice[i]) } } func FrameRefzeroNodeSlice(slice []*FrameRefnode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. 
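//
// The result is a recursive, indented dump of the underlying B-tree produced
// by writeDebugString; it is intended for tests and debugging, not for stable
// machine parsing.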
func (s *FrameRefSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. func (n *FrameRefnode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *FrameRefnode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if FrameReftrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type FrameRefFlatSegment struct { Start uint64 End uint64 Value FrameRefSegInfo } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *FrameRefSet) ExportSlice() []FrameRefFlatSegment { var fs []FrameRefFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, FrameRefFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *FrameRefSet) ImportSlice(fs []FrameRefFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := __generics_imported0.FileRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
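//
// For example, a test using this template instance might verify that a set
// holds exactly two segments and inspect each of them:
//
//	err := s.segmentTestCheck(2, func(i int, r __generics_imported0.FileRange, val FrameRefSegInfo) error {
//		// Validate r and val for the i'th segment here.
//		return nil
//	})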
func (s *FrameRefSet) segmentTestCheck(expectedSegments int, segFunc func(int, __generics_imported0.FileRange, FrameRefSegInfo) error) error { havePrev := false prev := uint64(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *FrameRefSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *FrameRefSet) saveRoot() []FrameRefFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *FrameRefSet) loadRoot(_ context.Context, fs []FrameRefFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/fsutil.go000066400000000000000000000013151465435605700241130ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package fsutil provides utilities for implementing vfs.FileDescriptionImpl // and vfs.FilesystemImpl. package fsutil golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/fsutil_impl_state_autogen.go000066400000000000000000000203661465435605700300650ustar00rootroot00000000000000// automatically generated by stateify. 
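// This file contains the save/restore (checkpoint) hooks for the segment-set
// types instantiated in this package (DirtySet, FileRangeSet, FrameRefSet and
// their node and FlatSegment types). It is regenerated by the stateify tool,
// so it should not be edited by hand.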
package fsutil import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (s *DirtySet) StateTypeName() string { return "pkg/sentry/fsutil.DirtySet" } func (s *DirtySet) StateFields() []string { return []string{ "root", } } func (s *DirtySet) beforeSave() {} // +checklocksignore func (s *DirtySet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []DirtyFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *DirtySet) afterLoad(context.Context) {} // +checklocksignore func (s *DirtySet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]DirtyFlatSegment), func(y any) { s.loadRoot(ctx, y.([]DirtyFlatSegment)) }) } func (n *Dirtynode) StateTypeName() string { return "pkg/sentry/fsutil.Dirtynode" } func (n *Dirtynode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *Dirtynode) beforeSave() {} // +checklocksignore func (n *Dirtynode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *Dirtynode) afterLoad(context.Context) {} // +checklocksignore func (n *Dirtynode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (d *DirtyFlatSegment) StateTypeName() string { return "pkg/sentry/fsutil.DirtyFlatSegment" } func (d *DirtyFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (d *DirtyFlatSegment) beforeSave() {} // +checklocksignore func (d *DirtyFlatSegment) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.Start) stateSinkObject.Save(1, &d.End) stateSinkObject.Save(2, &d.Value) } func (d *DirtyFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (d *DirtyFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.Start) stateSourceObject.Load(1, &d.End) stateSourceObject.Load(2, &d.Value) } func (s *FileRangeSet) StateTypeName() string { return "pkg/sentry/fsutil.FileRangeSet" } func (s *FileRangeSet) StateFields() []string { return []string{ "root", } } func (s *FileRangeSet) beforeSave() {} // +checklocksignore func (s *FileRangeSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []FileRangeFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *FileRangeSet) afterLoad(context.Context) {} // +checklocksignore func (s *FileRangeSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]FileRangeFlatSegment), func(y any) { s.loadRoot(ctx, y.([]FileRangeFlatSegment)) }) } func (n *FileRangenode) StateTypeName() string { return "pkg/sentry/fsutil.FileRangenode" } func (n *FileRangenode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n 
*FileRangenode) beforeSave() {} // +checklocksignore func (n *FileRangenode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *FileRangenode) afterLoad(context.Context) {} // +checklocksignore func (n *FileRangenode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (f *FileRangeFlatSegment) StateTypeName() string { return "pkg/sentry/fsutil.FileRangeFlatSegment" } func (f *FileRangeFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (f *FileRangeFlatSegment) beforeSave() {} // +checklocksignore func (f *FileRangeFlatSegment) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.Start) stateSinkObject.Save(1, &f.End) stateSinkObject.Save(2, &f.Value) } func (f *FileRangeFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (f *FileRangeFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.Start) stateSourceObject.Load(1, &f.End) stateSourceObject.Load(2, &f.Value) } func (s *FrameRefSet) StateTypeName() string { return "pkg/sentry/fsutil.FrameRefSet" } func (s *FrameRefSet) StateFields() []string { return []string{ "root", } } func (s *FrameRefSet) beforeSave() {} // +checklocksignore func (s *FrameRefSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []FrameRefFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *FrameRefSet) afterLoad(context.Context) {} // +checklocksignore func (s *FrameRefSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]FrameRefFlatSegment), func(y any) { s.loadRoot(ctx, y.([]FrameRefFlatSegment)) }) } func (n *FrameRefnode) StateTypeName() string { return "pkg/sentry/fsutil.FrameRefnode" } func (n *FrameRefnode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *FrameRefnode) beforeSave() {} // +checklocksignore func (n *FrameRefnode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *FrameRefnode) afterLoad(context.Context) {} // +checklocksignore func (n *FrameRefnode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (f *FrameRefFlatSegment) StateTypeName() string { 
return "pkg/sentry/fsutil.FrameRefFlatSegment" } func (f *FrameRefFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (f *FrameRefFlatSegment) beforeSave() {} // +checklocksignore func (f *FrameRefFlatSegment) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.Start) stateSinkObject.Save(1, &f.End) stateSinkObject.Save(2, &f.Value) } func (f *FrameRefFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (f *FrameRefFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.Start) stateSourceObject.Load(1, &f.End) stateSourceObject.Load(2, &f.Value) } func init() { state.Register((*DirtySet)(nil)) state.Register((*Dirtynode)(nil)) state.Register((*DirtyFlatSegment)(nil)) state.Register((*FileRangeSet)(nil)) state.Register((*FileRangenode)(nil)) state.Register((*FileRangeFlatSegment)(nil)) state.Register((*FrameRefSet)(nil)) state.Register((*FrameRefnode)(nil)) state.Register((*FrameRefFlatSegment)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/fsutil_state_autogen.go000066400000000000000000000024301465435605700270340ustar00rootroot00000000000000// automatically generated by stateify. package fsutil import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (d *DirtyInfo) StateTypeName() string { return "pkg/sentry/fsutil.DirtyInfo" } func (d *DirtyInfo) StateFields() []string { return []string{ "Keep", } } func (d *DirtyInfo) beforeSave() {} // +checklocksignore func (d *DirtyInfo) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.Keep) } func (d *DirtyInfo) afterLoad(context.Context) {} // +checklocksignore func (d *DirtyInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.Keep) } func (f *HostFileMapper) StateTypeName() string { return "pkg/sentry/fsutil.HostFileMapper" } func (f *HostFileMapper) StateFields() []string { return []string{ "refs", } } func (f *HostFileMapper) beforeSave() {} // +checklocksignore func (f *HostFileMapper) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.refs) } // +checklocksignore func (f *HostFileMapper) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.refs) stateSourceObject.AfterLoad(func() { f.afterLoad(ctx) }) } func init() { state.Register((*DirtyInfo)(nil)) state.Register((*HostFileMapper)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/fsutil_unsafe_state_autogen.go000066400000000000000000000000701465435605700303730ustar00rootroot00000000000000// automatically generated by stateify. package fsutil golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/host_file_mapper.go000066400000000000000000000171421465435605700261320ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package fsutil import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" ) // HostFileMapper caches mappings of an arbitrary host file descriptor. It is // used by implementations of memmap.Mappable that represent a host file // descriptor. // // +stateify savable type HostFileMapper struct { // HostFile conceptually breaks the file into pieces called chunks, of // size and alignment chunkSize, and caches mappings of the file on a chunk // granularity. refsMu refsMutex `state:"nosave"` // refs maps chunk start offsets to the sum of reference counts for all // pages in that chunk. refs is protected by refsMu. refs map[uint64]int32 mapsMu mapsMutex `state:"nosave"` // mappings maps chunk start offsets to mappings of those chunks, // obtained by calling unix.Mmap. mappings is protected by // mapsMu. mappings map[uint64]mapping `state:"nosave"` } const ( chunkShift = hostarch.HugePageShift chunkSize = 1 << chunkShift chunkMask = chunkSize - 1 ) func pagesInChunk(mr memmap.MappableRange, chunkStart uint64) int32 { return int32(mr.Intersect(memmap.MappableRange{chunkStart, chunkStart + chunkSize}).Length() / hostarch.PageSize) } type mapping struct { addr uintptr writable bool } // Init must be called on zero-value HostFileMappers before first use. func (f *HostFileMapper) Init() { f.refs = make(map[uint64]int32) f.mappings = make(map[uint64]mapping) } // IsInited returns true if f.Init() has been called. This is used when // restoring a checkpoint that contains a HostFileMapper that may or may not // have been initialized. func (f *HostFileMapper) IsInited() bool { return f.refs != nil } // NewHostFileMapper returns an initialized HostFileMapper allocated on the // heap with no references or cached mappings. func NewHostFileMapper() *HostFileMapper { f := &HostFileMapper{} f.Init() return f } // IncRefOn increments the reference count on all offsets in mr. // // Preconditions: // - mr.Length() != 0. // - mr.Start and mr.End must be page-aligned. func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { f.refsMu.Lock() defer f.refsMu.Unlock() chunkStart := mr.Start &^ chunkMask for { refs := f.refs[chunkStart] pgs := pagesInChunk(mr, chunkStart) if refs+pgs < refs { // Would overflow. panic(fmt.Sprintf("HostFileMapper.IncRefOn(%v): adding %d page references to chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) } f.refs[chunkStart] = refs + pgs chunkStart += chunkSize if chunkStart >= mr.End || chunkStart == 0 { break } } } // DecRefOn decrements the reference count on all offsets in mr. // // Preconditions: // - mr.Length() != 0. // - mr.Start and mr.End must be page-aligned. func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { f.refsMu.Lock() defer f.refsMu.Unlock() chunkStart := mr.Start &^ chunkMask for { refs := f.refs[chunkStart] pgs := pagesInChunk(mr, chunkStart) switch { case refs > pgs: f.refs[chunkStart] = refs - pgs case refs == pgs: f.mapsMu.Lock() delete(f.refs, chunkStart) if m, ok := f.mappings[chunkStart]; ok { f.unmapAndRemoveLocked(chunkStart, m) } f.mapsMu.Unlock() case refs < pgs: panic(fmt.Sprintf("HostFileMapper.DecRefOn(%v): removing %d page references from chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) } chunkStart += chunkSize if chunkStart >= mr.End || chunkStart == 0 { break } } } // MapInternal returns a mapping of offsets in fr from fd. 
The returned // safemem.BlockSeq is valid as long as at least one reference is held on all // offsets in fr or until the next call to UnmapAll. // // Preconditions: The caller must hold a reference on all offsets in fr. func (f *HostFileMapper) MapInternal(fr memmap.FileRange, fd int, write bool) (safemem.BlockSeq, error) { chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) f.mapsMu.Lock() defer f.mapsMu.Unlock() if chunks == 1 { // Avoid an unnecessary slice allocation. var seq safemem.BlockSeq err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) { seq = safemem.BlockSeqOf(b) }) return seq, err } blocks := make([]safemem.Block, 0, chunks) err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) { blocks = append(blocks, b) }) return safemem.BlockSeqFromSlice(blocks), err } // Preconditions: f.mapsMu must be locked. func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, write bool, fn func(safemem.Block)) error { prot := unix.PROT_READ if write { prot |= unix.PROT_WRITE } chunkStart := fr.Start &^ chunkMask for { m, ok := f.mappings[chunkStart] if !ok { addr, _, errno := unix.Syscall6( unix.SYS_MMAP, 0, chunkSize, uintptr(prot), unix.MAP_SHARED, uintptr(fd), uintptr(chunkStart)) if errno != 0 { return errno } m = mapping{addr, write} f.mappings[chunkStart] = m } else if write && !m.writable { addr, _, errno := unix.Syscall6( unix.SYS_MMAP, m.addr, chunkSize, uintptr(prot), unix.MAP_SHARED|unix.MAP_FIXED, uintptr(fd), uintptr(chunkStart)) if errno != 0 { return errno } m = mapping{addr, write} f.mappings[chunkStart] = m } var startOff uint64 if chunkStart < fr.Start { startOff = fr.Start - chunkStart } endOff := uint64(chunkSize) if chunkStart+chunkSize > fr.End { endOff = fr.End - chunkStart } fn(f.unsafeBlockFromChunkMapping(m.addr).TakeFirst64(endOff).DropFirst64(startOff)) chunkStart += chunkSize if chunkStart >= fr.End || chunkStart == 0 { break } } return nil } // UnmapAll unmaps all cached mappings. Callers are responsible for // synchronization with mappings returned by previous calls to MapInternal. func (f *HostFileMapper) UnmapAll() { f.mapsMu.Lock() defer f.mapsMu.Unlock() for chunkStart, m := range f.mappings { f.unmapAndRemoveLocked(chunkStart, m) } } // Preconditions: // - f.mapsMu must be locked. // - f.mappings[chunkStart] == m. func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) { if _, _, errno := unix.Syscall(unix.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 { // This leaks address space and is unexpected, but is otherwise // harmless, so complain but don't panic. log.Warningf("HostFileMapper: failed to unmap mapping %#x for chunk %#x: %v", m.addr, chunkStart, errno) } delete(f.mappings, chunkStart) } // RegenerateMappings must be called when the file description mapped by f // changes, to replace existing mappings of the previous file description. func (f *HostFileMapper) RegenerateMappings(fd int) error { f.mapsMu.Lock() defer f.mapsMu.Unlock() for chunkStart, m := range f.mappings { prot := unix.PROT_READ if m.writable { prot |= unix.PROT_WRITE } _, _, errno := unix.Syscall6( unix.SYS_MMAP, m.addr, chunkSize, uintptr(prot), unix.MAP_SHARED|unix.MAP_FIXED, uintptr(fd), uintptr(chunkStart)) if errno != 0 { return errno } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/host_file_mapper_state.go000066400000000000000000000013731465435605700273310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fsutil import "context" // afterLoad is invoked by stateify. func (f *HostFileMapper) afterLoad(context.Context) { f.mappings = make(map[uint64]mapping) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/host_file_mapper_unsafe.go000066400000000000000000000016751465435605700274770ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fsutil import ( "unsafe" "gvisor.dev/gvisor/pkg/safemem" ) func (*HostFileMapper) unsafeBlockFromChunkMapping(addr uintptr) safemem.Block { // We don't control the host file's length, so touching its mappings may // raise SIGBUS. Thus accesses to it must use safecopy. return safemem.BlockFromUnsafePointer((unsafe.Pointer)(addr), chunkSize) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/maps_mutex.go000066400000000000000000000030721465435605700247710ustar00rootroot00000000000000package fsutil import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type mapsMutex struct { mu sync.Mutex } var mapsprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var mapslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type mapslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *mapsMutex) Lock() { locking.AddGLock(mapsprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *mapsMutex) NestedLock(i mapslockNameIndex) { locking.AddGLock(mapsprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *mapsMutex) Unlock() { locking.DelGLock(mapsprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *mapsMutex) NestedUnlock(i mapslockNameIndex) { locking.DelGLock(mapsprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. 
func mapsinitLockNames() {} func init() { mapsinitLockNames() mapsprefixIndex = locking.NewMutexClass(reflect.TypeOf(mapsMutex{}), mapslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/fsutil/refs_mutex.go000066400000000000000000000030721465435605700247700ustar00rootroot00000000000000package fsutil import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type refsMutex struct { mu sync.Mutex } var refsprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var refslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type refslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *refsMutex) Lock() { locking.AddGLock(refsprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *refsMutex) NestedLock(i refslockNameIndex) { locking.AddGLock(refsprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *refsMutex) Unlock() { locking.DelGLock(refsprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *refsMutex) NestedUnlock(i refslockNameIndex) { locking.DelGLock(refsprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func refsinitLockNames() {} func init() { refsinitLockNames() refsprefixIndex = locking.NewMutexClass(reflect.TypeOf(refsMutex{}), refslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostcpu/000077500000000000000000000000001465435605700224355ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostcpu/getcpu_amd64.s000066400000000000000000000016161465435605700251070ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" // func GetCPU() uint32 TEXT ·GetCPU(SB),NOSPLIT|NOFRAME,$0-4 BYTE $0x0f; BYTE $0x01; BYTE $0xf9; // RDTSCP // On Linux, the bottom 12 bits of IA32_TSC_AUX are CPU and the upper 20 // are node. See arch/x86/entry/vdso/vma.c:vgetcpu_cpu_init(). ANDL $0xfff, CX MOVL CX, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostcpu/getcpu_arm64.s000066400000000000000000000017031465435605700251220ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" // GetCPU makes the getcpu(unsigned *cpu, unsigned *node, NULL) syscall for // the lack of an optimized way of getting the current CPU number on arm64. // func GetCPU() uint32 TEXT ·GetCPU(SB), NOSPLIT, $0-4 MOVW ZR, ret+0(FP) MOVD $ret+0(FP), R0 MOVD $0x0, R1 // unused MOVD $0x0, R2 // unused MOVD $0xA8, R8 // SYS_GETCPU SVC RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostcpu/hostcpu.go000066400000000000000000000040761465435605700244600ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package hostcpu provides utilities for working with CPU information provided // by a host Linux kernel. package hostcpu import ( "fmt" "io/ioutil" "strconv" "strings" "unicode" ) // GetCPU returns the caller's current CPU number, without using the Linux VDSO // (which is not available to the sentry) or the getcpu(2) system call (which // is relatively slow). func GetCPU() uint32 // MaxPossibleCPU returns the highest possible CPU number, which is guaranteed // not to change for the lifetime of the host kernel. func MaxPossibleCPU() (uint32, error) { const path = "/sys/devices/system/cpu/possible" data, err := ioutil.ReadFile(path) if err != nil { return 0, err } str := string(data) // Linux: drivers/base/cpu.c:show_cpus_attr() => // include/linux/cpumask.h:cpumask_print_to_pagebuf() => // lib/bitmap.c:bitmap_print_to_pagebuf() i, err := maxValueInLinuxBitmap(str) if err != nil { return 0, fmt.Errorf("invalid %s (%q): %v", path, str, err) } return uint32(i), nil } // maxValueInLinuxBitmap returns the maximum value specified in str, which is a // string emitted by Linux's lib/bitmap.c:bitmap_print_to_pagebuf(list=true). func maxValueInLinuxBitmap(str string) (uint64, error) { str = strings.TrimSpace(str) // Find the last decimal number in str. idx := strings.LastIndexFunc(str, func(c rune) bool { return !unicode.IsDigit(c) }) if idx != -1 { str = str[idx+1:] } i, err := strconv.ParseUint(str, 10, 64) if err != nil { return 0, err } return i, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostcpu/hostcpu_state_autogen.go000066400000000000000000000000711465435605700273710ustar00rootroot00000000000000// automatically generated by stateify. package hostcpu golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostfd/000077500000000000000000000000001465435605700222375ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostfd/hostfd.go000066400000000000000000000110301465435605700240500ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package hostfd provides efficient I/O with host file descriptors. package hostfd import ( "io" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sync" ) // ReadWriterAt implements safemem.Reader and safemem.Writer by reading from // and writing to a host file descriptor respectively. ReadWriterAts should be // obtained by calling GetReadWriterAt. // // Clients should usually prefer to use Preadv2 and Pwritev2 directly. type ReadWriterAt struct { fd int32 offset int64 flags uint32 } var rwpool = sync.Pool{ New: func() any { return &ReadWriterAt{} }, } // GetReadWriterAt returns a ReadWriterAt that reads from / writes to the given // host file descriptor, starting at the given offset and using the given // preadv2(2)/pwritev2(2) flags. If offset is -1, the host file descriptor's // offset is used instead. Users are responsible for ensuring that fd remains // valid for the lifetime of the returned ReadWriterAt, and must call // PutReadWriterAt when it is no longer needed. func GetReadWriterAt(fd int32, offset int64, flags uint32) *ReadWriterAt { rw := rwpool.Get().(*ReadWriterAt) *rw = ReadWriterAt{ fd: fd, offset: offset, flags: flags, } return rw } // PutReadWriterAt releases a ReadWriterAt returned by a previous call to // GetReadWriterAt that is no longer in use. func PutReadWriterAt(rw *ReadWriterAt) { rwpool.Put(rw) } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *ReadWriterAt) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { if dsts.IsEmpty() { return 0, nil } n, err := Preadv2(rw.fd, dsts, rw.offset, rw.flags) if rw.offset >= 0 { rw.offset += int64(n) } return n, err } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. func (rw *ReadWriterAt) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { if srcs.IsEmpty() { return 0, nil } n, err := Pwritev2(rw.fd, srcs, rw.offset, rw.flags) if rw.offset >= 0 { rw.offset += int64(n) } return n, err } // Preadv2 reads up to dsts.NumBytes() bytes from host file descriptor fd into // dsts. offset and flags are interpreted as for preadv2(2). // // Preconditions: !dsts.IsEmpty(). func Preadv2(fd int32, dsts safemem.BlockSeq, offset int64, flags uint32) (uint64, error) { // No buffering is necessary regardless of safecopy; host syscalls will // return EFAULT if appropriate, instead of raising SIGBUS. var ( n uintptr e unix.Errno ) if flags == 0 && dsts.NumBlocks() == 1 { // Use read() or pread() to avoid iovec allocation and copying. 
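	// Illustrative note (not present upstream): a call such as
	// Preadv2(fd, seq, -1, 0), where seq is a hypothetical BlockSeq holding
	// exactly one block, takes this branch and issues a plain read(2) at the
	// host descriptor's current offset; a non-negative offset selects
	// pread64(2) instead.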
dst := dsts.Head() if offset == -1 { n, _, e = unix.Syscall(unix.SYS_READ, uintptr(fd), dst.Addr(), uintptr(dst.Len())) } else { n, _, e = unix.Syscall6(unix.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */) } } else { n, e = iovecsReadWrite(unix.SYS_PREADV2, fd, safemem.IovecsFromBlockSeq(dsts), offset, flags) } if e != 0 { return 0, e } if n == 0 { return 0, io.EOF } return uint64(n), nil } // Pwritev2 writes up to srcs.NumBytes() from srcs into host file descriptor // fd. offset and flags are interpreted as for pwritev2(2). // // Preconditions: !srcs.IsEmpty(). func Pwritev2(fd int32, srcs safemem.BlockSeq, offset int64, flags uint32) (uint64, error) { // No buffering is necessary regardless of safecopy; host syscalls will // return EFAULT if appropriate, instead of raising SIGBUS. var ( n uintptr e unix.Errno ) if flags == 0 && srcs.NumBlocks() == 1 { // Use write() or pwrite() to avoid iovec allocation and copying. src := srcs.Head() if offset == -1 { n, _, e = unix.Syscall(unix.SYS_WRITE, uintptr(fd), src.Addr(), uintptr(src.Len())) } else { n, _, e = unix.Syscall6(unix.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */) } } else { n, e = iovecsReadWrite(unix.SYS_PWRITEV2, fd, safemem.IovecsFromBlockSeq(srcs), offset, flags) } if e != 0 { return 0, e } return uint64(n), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostfd/hostfd_linux.go000066400000000000000000000017271465435605700253030ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package hostfd // MaxReadWriteIov is the maximum permitted size of a struct iovec array in a // readv, writev, preadv, or pwritev host syscall. const MaxReadWriteIov = 1024 // UIO_MAXIOV // MaxSendRecvMsgIov is the maximum permitted size of a struct iovec array in a // sendmsg or recvmsg host syscall. const MaxSendRecvMsgIov = 1024 // UIO_MAXIOV golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostfd/hostfd_linux_state_autogen.go000066400000000000000000000001341465435605700302140ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package hostfd golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostfd/hostfd_state_autogen.go000066400000000000000000000000701465435605700267740ustar00rootroot00000000000000// automatically generated by stateify. package hostfd golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostfd/hostfd_unsafe.go000066400000000000000000000031201465435605700254120ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hostfd import ( "unsafe" "golang.org/x/sys/unix" ) const ( sizeofIovec = unsafe.Sizeof(unix.Iovec{}) sizeofMsghdr = unsafe.Sizeof(unix.Msghdr{}) ) func iovecsReadWrite(sysno uintptr, fd int32, iovs []unix.Iovec, offset int64, flags uint32) (uintptr, unix.Errno) { var total uintptr for start := 0; start < len(iovs); start += MaxReadWriteIov { last := true size := len(iovs) - start if size > MaxReadWriteIov { last = false size = MaxReadWriteIov } curOff := offset if offset >= 0 { curOff = offset + int64(total) } cur, _, e := unix.Syscall6(sysno, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[start])), uintptr(size), uintptr(curOff), 0 /* pos_h */, uintptr(flags)) if cur > 0 { total += cur } if e != 0 { return total, e } if last { break } // If this was a short read/write, then break. var curTotal uint64 for i := range iovs[start : start+size] { curTotal += iovs[i].Len } if uint64(cur) < curTotal { break } } return total, 0 } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostfd/hostfd_unsafe_state_autogen.go000066400000000000000000000000701465435605700303350ustar00rootroot00000000000000// automatically generated by stateify. package hostfd golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostmm/000077500000000000000000000000001465435605700222575ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostmm/cgroup.go000066400000000000000000000067151465435605700241160ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hostmm import ( "bufio" "fmt" "os" "path" "strings" ) // currentCgroupDirectory returns the directory for the cgroup for the given // controller in which the calling process resides. func currentCgroupDirectory(ctrl string) (string, error) { root, err := cgroupRootDirectory(ctrl) if err != nil { return "", err } cg, err := currentCgroup(ctrl) if err != nil { return "", err } return path.Join(root, cg), nil } // cgroupRootDirectory returns the root directory for the cgroup hierarchy in // which the given cgroup controller is mounted in the calling process' mount // namespace. func cgroupRootDirectory(ctrl string) (string, error) { const path = "/proc/self/mounts" file, err := os.Open(path) if err != nil { return "", err } defer file.Close() // Per proc(5) -> fstab(5): // Each line of /proc/self/mounts describes a mount. scanner := bufio.NewScanner(file) for scanner.Scan() { // Each line consists of 6 space-separated fields. Find the line for // which the third field (fs_vfstype) is cgroup, and the fourth field // (fs_mntops, a comma-separated list of mount options) contains // ctrl. 
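		// Illustrative only (a hypothetical mount line; real contents depend
		// on the host): a cgroup v1 memory hierarchy typically appears as
		//
		//	cgroup /sys/fs/cgroup/memory cgroup rw,nosuid,nodev,noexec,relatime,memory 0 0
		//
		// in which case vfstype is "cgroup", mntopts contains "memory", and
		// "/sys/fs/cgroup/memory" is returned as the root directory.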
var spec, file, vfstype, mntopts, freq, passno string const nrfields = 6 line := scanner.Text() n, err := fmt.Sscan(line, &spec, &file, &vfstype, &mntopts, &freq, &passno) if err != nil { return "", fmt.Errorf("failed to parse %s: %v", path, err) } if n != nrfields { return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, n, nrfields) } if vfstype != "cgroup" { continue } for _, mntopt := range strings.Split(mntopts, ",") { if mntopt == ctrl { return file, nil } } } return "", fmt.Errorf("no cgroup hierarchy mounted for controller %s", ctrl) } // currentCgroup returns the cgroup for the given controller in which the // calling process resides. The returned string is a path that should be // interpreted as relative to cgroupRootDirectory(ctrl). func currentCgroup(ctrl string) (string, error) { const path = "/proc/self/cgroup" file, err := os.Open(path) if err != nil { return "", err } defer file.Close() // Per proc(5) -> cgroups(7): // Each line of /proc/self/cgroups describes a cgroup hierarchy. scanner := bufio.NewScanner(file) for scanner.Scan() { // Each line consists of 3 colon-separated fields. Find the line for // which the second field (controller-list, a comma-separated list of // cgroup controllers) contains ctrl. line := scanner.Text() const nrfields = 3 fields := strings.Split(line, ":") if len(fields) != nrfields { return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, len(fields), nrfields) } for _, controller := range strings.Split(fields[1], ",") { if controller == ctrl { return fields[2], nil } } } return "", fmt.Errorf("not a member of a cgroup hierarchy for controller %s", ctrl) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostmm/hostmm.go000066400000000000000000000075521465435605700241260ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package hostmm provides tools for interacting with the host Linux kernel's // virtual memory management subsystem. package hostmm import ( "fmt" "os" "path" "regexp" "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/log" ) // GetTransparentHugepageEnum returns the currently selected option for // whichever of // /sys/kernel/mm/transparent_hugepage/{enabled,shmem_enabled,defrag} is // specified by filename. (Only the basename is required, not the full path.) func GetTransparentHugepageEnum(filename string) (string, error) { pathname := path.Join("/sys/kernel/mm/transparent_hugepage/", filename) data, err := os.ReadFile(pathname) if err != nil { return "", err } // In these files, the selected option is highlighted by square brackets. m := regexp.MustCompile(`\[.*\]`).Find(data) if m == nil { return "", fmt.Errorf("failed to parse %s: %q", pathname, data) } // Remove the square brackets. 
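	// For example (hypothetical file contents, not read here): if the file
	// holds
	//
	//	always [madvise] never
	//
	// the match above is "[madvise]" and this function returns "madvise".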
return string(m[1 : len(m)-1]), nil } // NotifyCurrentMemcgPressureCallback requests that f is called whenever the // calling process' memory cgroup indicates memory pressure of the given level, // as specified by Linux's Documentation/cgroup-v1/memory.txt. // // If NotifyCurrentMemcgPressureCallback succeeds, it returns a function that // terminates the requested memory pressure notifications. This function may be // called at most once. func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) { cgdir, err := currentCgroupDirectory("memory") if err != nil { return nil, err } pressurePath := path.Join(cgdir, "memory.pressure_level") pressureFile, err := os.Open(pressurePath) if err != nil { return nil, err } defer pressureFile.Close() eventControlPath := path.Join(cgdir, "cgroup.event_control") eventControlFile, err := os.OpenFile(eventControlPath, os.O_WRONLY, 0) if err != nil { return nil, err } defer eventControlFile.Close() eventFD, err := eventfd.Create() if err != nil { return nil, err } // Don't use fmt.Fprintf since the whole string needs to be written in a // single unix. eventControlStr := fmt.Sprintf("%d %d %s", eventFD.FD(), pressureFile.Fd(), level) if n, err := eventControlFile.Write([]byte(eventControlStr)); n != len(eventControlStr) || err != nil { eventFD.Close() return nil, fmt.Errorf("error writing %q to %s: got (%d, %v), wanted (%d, nil)", eventControlStr, eventControlPath, n, err, len(eventControlStr)) } log.Debugf("Receiving memory pressure level notifications from %s at level %q", pressurePath, level) const sizeofUint64 = 8 // The most significant bit of the eventfd value is set by the stop // function, which is practically unambiguous since it's not plausible for // 2**63 pressure events to occur between eventfd reads. const stopVal = 1 << 63 stopCh := make(chan struct{}) go func() { // S/R-SAFE: f provides synchronization if necessary for { val, err := eventFD.Read() if err != nil { panic(fmt.Sprintf("failed to read from memory pressure level eventfd: %v", err)) } if val >= stopVal { // Assume this was due to the notifier's "destructor" (the // function returned by NotifyCurrentMemcgPressureCallback // below) being called. eventFD.Close() close(stopCh) return } f() } }() return func() { eventFD.Write(stopVal) <-stopCh }, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostmm/hostmm_state_autogen.go000066400000000000000000000000701465435605700270340ustar00rootroot00000000000000// automatically generated by stateify. package hostmm golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/hostmm/membarrier.go000066400000000000000000000063041465435605700247360ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package hostmm import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" ) var ( haveMembarrierGlobal = false haveMembarrierPrivateExpedited = false ) func init() { supported, _, e := unix.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_QUERY, 0 /* flags */, 0 /* unused */) if e != 0 { if e != unix.ENOSYS { log.Warningf("membarrier(MEMBARRIER_CMD_QUERY) failed: %s", e.Error()) } return } // We don't use MEMBARRIER_CMD_GLOBAL_EXPEDITED because this sends IPIs to // all CPUs running tasks that have previously invoked // MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, which presents a DOS risk. // (MEMBARRIER_CMD_GLOBAL is synchronize_rcu(), i.e. it waits for an RCU // grace period to elapse without bothering other CPUs. // MEMBARRIER_CMD_PRIVATE_EXPEDITED sends IPIs only to CPUs running tasks // sharing the caller's MM.) if supported&linux.MEMBARRIER_CMD_GLOBAL != 0 { haveMembarrierGlobal = true } if req := uintptr(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED | linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED); supported&req == req { if _, _, e := unix.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 { log.Warningf("membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) failed: %s", e.Error()) } else { haveMembarrierPrivateExpedited = true } } } // HaveGlobalMemoryBarrier returns true if GlobalMemoryBarrier is supported. func HaveGlobalMemoryBarrier() bool { return haveMembarrierGlobal } // GlobalMemoryBarrier blocks until "all running threads [in the host OS] have // passed through a state where all memory accesses to user-space addresses // match program order between entry to and return from [GlobalMemoryBarrier]", // as for membarrier(2). // // Preconditions: HaveGlobalMemoryBarrier() == true. func GlobalMemoryBarrier() error { if _, _, e := unix.Syscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_GLOBAL, 0 /* flags */, 0 /* unused */); e != 0 { return e } return nil } // HaveProcessMemoryBarrier returns true if ProcessMemoryBarrier is supported. func HaveProcessMemoryBarrier() bool { return haveMembarrierPrivateExpedited } // ProcessMemoryBarrier is equivalent to GlobalMemoryBarrier, but only // synchronizes with threads sharing a virtual address space (from the host OS' // perspective) with the calling thread. // // Preconditions: HaveProcessMemoryBarrier() == true. func ProcessMemoryBarrier() error { if _, _, e := unix.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 { return e } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/inet/000077500000000000000000000000001465435605700217075ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/inet/abstract_socket_namespace.go000066400000000000000000000105751465435605700274350ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package inet import ( "fmt" "math/rand" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/syserr" ) // +stateify savable type abstractEndpoint struct { ep transport.BoundEndpoint socket refs.TryRefCounter name string ns *AbstractSocketNamespace } // AbstractSocketNamespace is used to implement the Linux abstract socket functionality. // // +stateify savable type AbstractSocketNamespace struct { mu abstractSocketNamespaceMutex `state:"nosave"` // Keeps a mapping from name to endpoint. AbstractSocketNamespace does not hold // any references on any sockets that it contains; when retrieving a socket, // TryIncRef() must be called in case the socket is concurrently being // destroyed. It is the responsibility of the socket to remove itself from the // abstract socket namespace when it is destroyed. endpoints map[string]abstractEndpoint } // A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on // its backing socket. type boundEndpoint struct { transport.BoundEndpoint socket refs.TryRefCounter } // Release implements transport.BoundEndpoint.Release. func (e *boundEndpoint) Release(ctx context.Context) { e.socket.DecRef(ctx) e.BoundEndpoint.Release(ctx) } func (a *AbstractSocketNamespace) init() { a.endpoints = make(map[string]abstractEndpoint) } // BoundEndpoint retrieves the endpoint bound to the given name. The return // value is nil if no endpoint was bound. func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndpoint { a.mu.Lock() defer a.mu.Unlock() ep, ok := a.endpoints[name] if !ok { return nil } if !ep.socket.TryIncRef() { // The socket has reached zero references and is being destroyed. return nil } return &boundEndpoint{ep.ep, ep.socket} } // Bind binds the given socket. // // When the last reference managed by socket is dropped, ep may be removed from the // namespace. func (a *AbstractSocketNamespace) Bind(ctx context.Context, path string, ep transport.BoundEndpoint, socket refs.TryRefCounter) (string, *syserr.Error) { a.mu.Lock() defer a.mu.Unlock() name := "" if path == "" { // Autobind feature. mask := uint32(0xFFFFF) r := rand.Uint32() for i := uint32(0); i <= mask; i++ { p := fmt.Sprintf("X%05x", (r+i)&mask) if _, ok := a.endpoints[p[1:]]; ok { continue } b := ([]byte)(p) b[0] = 0 path = string(b) break } if path == "" { return "", syserr.ErrNoSpace } name = path[1:] } else { name = path[1:] // Check if there is already a socket (which has not yet been destroyed) bound at name. if _, ok := a.endpoints[name]; ok { return "", syserr.ErrPortInUse } } ae := abstractEndpoint{ep: ep, name: name, ns: a} ae.socket = socket a.endpoints[name] = ae return path, nil } // Remove removes the specified socket at name from the abstract socket // namespace, if it has not yet been replaced. func (a *AbstractSocketNamespace) Remove(name string, socket refs.TryRefCounter) { a.mu.Lock() defer a.mu.Unlock() ep, ok := a.endpoints[name] if !ok { // We never delete a map entry apart from a socket's destructor (although the // map entry may be overwritten). Therefore, a socket should exist, even if it // may not be the one we expect. panic(fmt.Sprintf("expected socket to exist at '%s' in abstract socket namespace", name)) } // A Bind() operation may race with callers of Remove(), e.g. 
in the // following case: // socket1 reaches zero references and begins destruction // a.Bind("foo", ep, socket2) replaces socket1 with socket2 // socket1's destructor calls a.Remove("foo", socket1) // // Therefore, we need to check that the socket at name is what we expect // before modifying the map. if ep.socket == socket { delete(a.endpoints, name) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/inet/abstract_socket_namespace_mutex.go000066400000000000000000000036411465435605700306530ustar00rootroot00000000000000package inet import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type abstractSocketNamespaceMutex struct { mu sync.Mutex } var abstractSocketNamespaceprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var abstractSocketNamespacelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type abstractSocketNamespacelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *abstractSocketNamespaceMutex) Lock() { locking.AddGLock(abstractSocketNamespaceprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *abstractSocketNamespaceMutex) NestedLock(i abstractSocketNamespacelockNameIndex) { locking.AddGLock(abstractSocketNamespaceprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *abstractSocketNamespaceMutex) Unlock() { locking.DelGLock(abstractSocketNamespaceprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *abstractSocketNamespaceMutex) NestedUnlock(i abstractSocketNamespacelockNameIndex) { locking.DelGLock(abstractSocketNamespaceprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func abstractSocketNamespaceinitLockNames() {} func init() { abstractSocketNamespaceinitLockNames() abstractSocketNamespaceprefixIndex = locking.NewMutexClass(reflect.TypeOf(abstractSocketNamespaceMutex{}), abstractSocketNamespacelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/inet/context.go000066400000000000000000000030361465435605700237240ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package inet import ( "gvisor.dev/gvisor/pkg/context" ) // contextID is the inet package's type for context.Context.Value keys. type contextID int const ( // CtxStack is a Context.Value key for a network stack. CtxStack contextID = iota // CtxNamespaceByFD is a Context.Value key for NamespaceByFD. 
CtxNamespaceByFD ) // StackFromContext returns the network stack associated with ctx. func StackFromContext(ctx context.Context) Stack { if v := ctx.Value(CtxStack); v != nil { return v.(Stack) } return nil } // NamespaceByFD returns the network namespace associated with the specified // file descriptor. type NamespaceByFD = func(fd int32) (*Namespace, error) // NamespaceByFDFromContext returns NamespaceByFD to lookup the network // namespace associated with the specified file descriptor. func NamespaceByFDFromContext(ctx context.Context) NamespaceByFD { if v := ctx.Value(CtxNamespaceByFD); v != nil { return v.(NamespaceByFD) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/inet/inet.go000066400000000000000000000176131465435605700232050ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package inet defines semantics for IP stacks. package inet import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/nlmsg" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // Stack represents a TCP/IP stack. type Stack interface { // Interfaces returns all network interfaces as a mapping from interface // indexes to interface properties. Interface indices are strictly positive // integers. Interfaces() map[int32]Interface // RemoveInterface removes the specified network interface. RemoveInterface(idx int32) error // InterfaceAddrs returns all network interface addresses as a mapping from // interface indexes to a slice of associated interface address properties. InterfaceAddrs() map[int32][]InterfaceAddr // AddInterfaceAddr adds an address to the network interface identified by // idx. AddInterfaceAddr(idx int32, addr InterfaceAddr) error // SetInterface modifies or adds a new interface. SetInterface(ctx context.Context, msg *nlmsg.Message) *syserr.Error // RemoveInterfaceAddr removes an address from the network interface // identified by idx. RemoveInterfaceAddr(idx int32, addr InterfaceAddr) error // SupportsIPv6 returns true if the stack supports IPv6 connectivity. SupportsIPv6() bool // TCPReceiveBufferSize returns TCP receive buffer size settings. TCPReceiveBufferSize() (TCPBufferSize, error) // SetTCPReceiveBufferSize attempts to change TCP receive buffer size // settings. SetTCPReceiveBufferSize(size TCPBufferSize) error // TCPSendBufferSize returns TCP send buffer size settings. TCPSendBufferSize() (TCPBufferSize, error) // SetTCPSendBufferSize attempts to change TCP send buffer size settings. SetTCPSendBufferSize(size TCPBufferSize) error // TCPSACKEnabled returns true if RFC 2018 TCP Selective Acknowledgements // are enabled. TCPSACKEnabled() (bool, error) // SetTCPSACKEnabled attempts to change TCP selective acknowledgement // settings. SetTCPSACKEnabled(enabled bool) error // TCPRecovery returns the TCP loss detection algorithm. 
TCPRecovery() (TCPLossRecovery, error) // SetTCPRecovery attempts to change TCP loss detection algorithm. SetTCPRecovery(recovery TCPLossRecovery) error // Statistics reports stack statistics. Statistics(stat any, arg string) error // RouteTable returns the network stack's route table. RouteTable() []Route // NewRoute adds the given route to the network stack's route table. NewRoute(ctx context.Context, msg *nlmsg.Message) *syserr.Error // Pause pauses the network stack before save. Pause() // Resume resumes the network stack after save. Resume() // Restore restarts the network stack after restore. Restore() // Destroy the network stack. Destroy() // RegisteredEndpoints returns all endpoints which are currently registered. RegisteredEndpoints() []stack.TransportEndpoint // CleanupEndpoints returns endpoints currently in the cleanup state. CleanupEndpoints() []stack.TransportEndpoint // RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful // for restoring a stack after a save. RestoreCleanupEndpoints([]stack.TransportEndpoint) // SetForwarding enables or disables packet forwarding between NICs. SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error // PortRange returns the UDP and TCP inclusive range of ephemeral ports // used in both IPv4 and IPv6. PortRange() (uint16, uint16) // SetPortRange sets the UDP and TCP IPv4 and IPv6 ephemeral port range // (inclusive). SetPortRange(start uint16, end uint16) error } // Interface contains information about a network interface. type Interface struct { // DeviceType is the device type, a Linux ARPHRD_* constant. DeviceType uint16 // Flags is the device flags; see netdevice(7), under "Ioctls", // "SIOCGIFFLAGS, SIOCSIFFLAGS". Flags uint32 // Name is the device name. Name string // Addr is the hardware device address. Addr []byte // MTU is the maximum transmission unit. MTU uint32 // Features are the device features queried from the host at // stack creation time. These are immutable after startup. Features []linux.EthtoolGetFeaturesBlock } // InterfaceAddr contains information about a network interface address. type InterfaceAddr struct { // Family is the address family, a Linux AF_* constant. Family uint8 // PrefixLen is the address prefix length. PrefixLen uint8 // Flags is the address flags. Flags uint8 // Addr is the actual address. Addr []byte } // TCPBufferSize contains settings controlling TCP buffer sizing. // // +stateify savable type TCPBufferSize struct { // Min is the minimum size. Min int // Default is the default size. Default int // Max is the maximum size. Max int } // StatDev describes one line of /proc/net/dev, i.e., stats for one network // interface. type StatDev [16]uint64 // Route contains information about a network route. type Route struct { // Family is the address family, a Linux AF_* constant. Family uint8 // DstLen is the length of the destination address. DstLen uint8 // SrcLen is the length of the source address. SrcLen uint8 // TOS is the Type of Service filter. TOS uint8 // Table is the routing table ID. Table uint8 // Protocol is the route origin, a Linux RTPROT_* constant. Protocol uint8 // Scope is the distance to destination, a Linux RT_SCOPE_* constant. Scope uint8 // Type is the route origin, a Linux RTN_* constant. Type uint8 // Flags are route flags. See rtnetlink(7) under "rtm_flags". Flags uint32 // DstAddr is the route destination address (RTA_DST). DstAddr []byte // SrcAddr is the route source address (RTA_SRC). 
SrcAddr []byte // OutputInterface is the output interface index (RTA_OIF). OutputInterface int32 // GatewayAddr is the route gateway address (RTA_GATEWAY). GatewayAddr []byte } // Below SNMP metrics are from Linux/usr/include/linux/snmp.h. // StatSNMPIP describes Ip line of /proc/net/snmp. type StatSNMPIP [19]uint64 // StatSNMPICMP describes Icmp line of /proc/net/snmp. type StatSNMPICMP [27]uint64 // StatSNMPICMPMSG describes IcmpMsg line of /proc/net/snmp. type StatSNMPICMPMSG [512]uint64 // StatSNMPTCP describes Tcp line of /proc/net/snmp. type StatSNMPTCP [15]uint64 // StatSNMPUDP describes Udp line of /proc/net/snmp. type StatSNMPUDP [8]uint64 // StatSNMPUDPLite describes UdpLite line of /proc/net/snmp. type StatSNMPUDPLite [8]uint64 // TCPLossRecovery indicates TCP loss detection and recovery methods to use. type TCPLossRecovery int32 // Loss recovery constants from include/net/tcp.h which are used to set // /proc/sys/net/ipv4/tcp_recovery. const ( TCP_RACK_LOSS_DETECTION TCPLossRecovery = 1 << iota TCP_RACK_STATIC_REO_WND TCP_RACK_NO_DUPTHRESH ) // InterfaceRequest contains information about an adding interface. type InterfaceRequest struct { // Kind is the link type. Kind string // Name is the interface name. Name string // Addr is the hardware device address. Addr []byte // MTU is the maximum transmission unit. MTU uint32 // Data is link type specific device properties. Data any } // VethPeerReq contains information about a second interface of a new veth pair. type VethPeerReq struct { // Req is information about the second end of the new veth pair. Req InterfaceRequest // Stack is the stack where the second end has to be added. Stack Stack } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/inet/inet_state_autogen.go000066400000000000000000000076361465435605700261330ustar00rootroot00000000000000// automatically generated by stateify. 
package inet import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (a *abstractEndpoint) StateTypeName() string { return "pkg/sentry/inet.abstractEndpoint" } func (a *abstractEndpoint) StateFields() []string { return []string{ "ep", "socket", "name", "ns", } } func (a *abstractEndpoint) beforeSave() {} // +checklocksignore func (a *abstractEndpoint) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.ep) stateSinkObject.Save(1, &a.socket) stateSinkObject.Save(2, &a.name) stateSinkObject.Save(3, &a.ns) } func (a *abstractEndpoint) afterLoad(context.Context) {} // +checklocksignore func (a *abstractEndpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.ep) stateSourceObject.Load(1, &a.socket) stateSourceObject.Load(2, &a.name) stateSourceObject.Load(3, &a.ns) } func (a *AbstractSocketNamespace) StateTypeName() string { return "pkg/sentry/inet.AbstractSocketNamespace" } func (a *AbstractSocketNamespace) StateFields() []string { return []string{ "endpoints", } } func (a *AbstractSocketNamespace) beforeSave() {} // +checklocksignore func (a *AbstractSocketNamespace) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.endpoints) } func (a *AbstractSocketNamespace) afterLoad(context.Context) {} // +checklocksignore func (a *AbstractSocketNamespace) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.endpoints) } func (t *TCPBufferSize) StateTypeName() string { return "pkg/sentry/inet.TCPBufferSize" } func (t *TCPBufferSize) StateFields() []string { return []string{ "Min", "Default", "Max", } } func (t *TCPBufferSize) beforeSave() {} // +checklocksignore func (t *TCPBufferSize) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.Min) stateSinkObject.Save(1, &t.Default) stateSinkObject.Save(2, &t.Max) } func (t *TCPBufferSize) afterLoad(context.Context) {} // +checklocksignore func (t *TCPBufferSize) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.Min) stateSourceObject.Load(1, &t.Default) stateSourceObject.Load(2, &t.Max) } func (n *Namespace) StateTypeName() string { return "pkg/sentry/inet.Namespace" } func (n *Namespace) StateFields() []string { return []string{ "inode", "creator", "isRoot", "userNS", "abstractSockets", } } func (n *Namespace) beforeSave() {} // +checklocksignore func (n *Namespace) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.inode) stateSinkObject.Save(1, &n.creator) stateSinkObject.Save(2, &n.isRoot) stateSinkObject.Save(3, &n.userNS) stateSinkObject.Save(4, &n.abstractSockets) } // +checklocksignore func (n *Namespace) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.inode) stateSourceObject.LoadWait(1, &n.creator) stateSourceObject.Load(2, &n.isRoot) stateSourceObject.Load(3, &n.userNS) stateSourceObject.Load(4, &n.abstractSockets) stateSourceObject.AfterLoad(func() { n.afterLoad(ctx) }) } func (r *namespaceRefs) StateTypeName() string { return "pkg/sentry/inet.namespaceRefs" } func (r *namespaceRefs) StateFields() []string { return []string{ "refCount", } } func (r *namespaceRefs) beforeSave() {} // +checklocksignore func (r *namespaceRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *namespaceRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, 
&r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func init() { state.Register((*abstractEndpoint)(nil)) state.Register((*AbstractSocketNamespace)(nil)) state.Register((*TCPBufferSize)(nil)) state.Register((*Namespace)(nil)) state.Register((*namespaceRefs)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/inet/namespace.go000066400000000000000000000111611465435605700241720ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package inet import ( goContext "context" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // Namespace represents a network namespace. See network_namespaces(7). // // +stateify savable type Namespace struct { inode *nsfs.Inode // stack is the network stack implementation of this network namespace. stack Stack `state:"nosave"` // creator allows kernel to create new network stack for network namespaces. // If nil, no networking will function if network is namespaced. // // At afterLoad(), creator will be used to create network stack. Stateify // needs to wait for this field to be loaded before calling afterLoad(). creator NetworkStackCreator `state:"wait"` // isRoot indicates whether this is the root network namespace. isRoot bool userNS *auth.UserNamespace // abstractSockets tracks abstract sockets that are in use. abstractSockets AbstractSocketNamespace } // NewRootNamespace creates the root network namespace, with creator // allowing new network namespaces to be created. If creator is nil, no // networking will function if the network is namespaced. func NewRootNamespace(stack Stack, creator NetworkStackCreator, userNS *auth.UserNamespace) *Namespace { n := &Namespace{ stack: stack, creator: creator, isRoot: true, userNS: userNS, } n.abstractSockets.init() return n } // UserNamespace returns the user namespace associated with this namespace. func (n *Namespace) UserNamespace() *auth.UserNamespace { return n.userNS } // SetInode sets the nsfs `inode` to the namespace. func (n *Namespace) SetInode(inode *nsfs.Inode) { n.inode = inode } // GetInode returns the nsfs inode associated with this namespace. func (n *Namespace) GetInode() *nsfs.Inode { return n.inode } // NewNamespace creates a new network namespace from the root. func NewNamespace(root *Namespace, userNS *auth.UserNamespace) *Namespace { n := &Namespace{ creator: root.creator, userNS: userNS, } n.init() return n } // Destroy implements nsfs.Namespace.Destroy. func (n *Namespace) Destroy(ctx context.Context) { if s := n.Stack(); s != nil { s.Destroy() } } // Type implements nsfs.Namespace.Type. func (n *Namespace) Type() string { return "net" } // IncRef increments the Namespace's refcount. func (n *Namespace) IncRef() { n.inode.IncRef() } // DecRef decrements the Namespace's refcount. func (n *Namespace) DecRef(ctx context.Context) { n.inode.DecRef(ctx) } // Stack returns the network stack of n. Stack may return nil if no network // stack is configured. 
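//
// Callers should check for nil before use. A minimal sketch (hypothetical
// caller-side code; netns is assumed to be a *Namespace obtained elsewhere):
//
//	if s := netns.Stack(); s != nil {
//		_ = s.SupportsIPv6()
//	}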
func (n *Namespace) Stack() Stack { return n.stack } // IsRoot returns whether n is the root network namespace. func (n *Namespace) IsRoot() bool { return n.isRoot } // RestoreRootStack restores the root network namespace with stack. This should // only be called when restoring kernel. func (n *Namespace) RestoreRootStack(stack Stack) { if !n.isRoot { panic("RestoreRootStack can only be called on root network namespace") } if n.stack != nil { panic("RestoreRootStack called after a stack has already been set") } n.stack = stack } // ResetStack resets the stack in the network namespace to nil. This should // only be called when restoring kernel. func (n *Namespace) ResetStack() { n.stack = nil } func (n *Namespace) init() { // Root network namespace will have stack assigned later. if n.isRoot { return } if n.creator != nil { var err error n.stack, err = n.creator.CreateStack() if err != nil { panic(err) } } n.abstractSockets.init() } // afterLoad is invoked by stateify. func (n *Namespace) afterLoad(goContext.Context) { n.init() } // AbstractSockets returns AbstractSocketNamespace. func (n *Namespace) AbstractSockets() *AbstractSocketNamespace { return &n.abstractSockets } // NetworkStackCreator allows new instances of a network stack to be created. It // is used by the kernel to create new network namespaces when requested. type NetworkStackCreator interface { // CreateStack creates a new network stack for a network namespace. CreateStack() (Stack, error) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/inet/namespace_refs.go000066400000000000000000000101751465435605700252150ustar00rootroot00000000000000package inet import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const namespaceenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var namespaceobj *Namespace // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type namespaceRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *namespaceRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. 
func (r *namespaceRefs) RefType() string { return fmt.Sprintf("%T", namespaceobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *namespaceRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *namespaceRefs) LogRefs() bool { return namespaceenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *namespaceRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *namespaceRefs) IncRef() { v := r.refCount.Add(1) if namespaceenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *namespaceRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if namespaceenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *namespaceRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if namespaceenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *namespaceRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/inet/test_stack.go000066400000000000000000000131011465435605700243760ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package inet import ( "bytes" "fmt" "time" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/nlmsg" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) var _ Stack = (*TestStack)(nil) // TestStack is a dummy implementation of Stack for tests. 
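//
// A minimal usage sketch (hypothetical test code; t is assumed to be a
// *testing.T):
//
//	s := NewTestStack()
//	s.SupportsIPv6Flag = true
//	if !s.SupportsIPv6() {
//		t.Fatal("expected SupportsIPv6 to report true")
//	}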
type TestStack struct { InterfacesMap map[int32]Interface InterfaceAddrsMap map[int32][]InterfaceAddr RouteList []Route SupportsIPv6Flag bool TCPRecvBufSize TCPBufferSize TCPSendBufSize TCPBufferSize TCPSACKFlag bool Recovery TCPLossRecovery IPForwarding bool } // NewTestStack returns a TestStack with no network interfaces. The value of // all other options is unspecified; tests that rely on specific values must // set them explicitly. func NewTestStack() *TestStack { return &TestStack{ InterfacesMap: make(map[int32]Interface), InterfaceAddrsMap: make(map[int32][]InterfaceAddr), } } // Interfaces implements Stack. func (s *TestStack) Interfaces() map[int32]Interface { return s.InterfacesMap } // Destroy implements Stack. func (s *TestStack) Destroy() { } // RemoveInterface implements Stack. func (s *TestStack) RemoveInterface(idx int32) error { delete(s.InterfacesMap, idx) return nil } // SetInterface implements Stack. func (s *TestStack) SetInterface(ctx context.Context, msg *nlmsg.Message) *syserr.Error { panic("unimplemented") } // InterfaceAddrs implements Stack. func (s *TestStack) InterfaceAddrs() map[int32][]InterfaceAddr { return s.InterfaceAddrsMap } // AddInterfaceAddr implements Stack. func (s *TestStack) AddInterfaceAddr(idx int32, addr InterfaceAddr) error { s.InterfaceAddrsMap[idx] = append(s.InterfaceAddrsMap[idx], addr) return nil } // RemoveInterfaceAddr implements Stack. func (s *TestStack) RemoveInterfaceAddr(idx int32, addr InterfaceAddr) error { interfaceAddrs, ok := s.InterfaceAddrsMap[idx] if !ok { return fmt.Errorf("unknown idx: %d", idx) } var filteredAddrs []InterfaceAddr for _, interfaceAddr := range interfaceAddrs { if !bytes.Equal(interfaceAddr.Addr, addr.Addr) { filteredAddrs = append(filteredAddrs, addr) } } s.InterfaceAddrsMap[idx] = filteredAddrs return nil } // SupportsIPv6 implements Stack. func (s *TestStack) SupportsIPv6() bool { return s.SupportsIPv6Flag } // TCPReceiveBufferSize implements Stack. func (s *TestStack) TCPReceiveBufferSize() (TCPBufferSize, error) { return s.TCPRecvBufSize, nil } // SetTCPReceiveBufferSize implements Stack. func (s *TestStack) SetTCPReceiveBufferSize(size TCPBufferSize) error { s.TCPRecvBufSize = size return nil } // TCPSendBufferSize implements Stack. func (s *TestStack) TCPSendBufferSize() (TCPBufferSize, error) { return s.TCPSendBufSize, nil } // SetTCPSendBufferSize implements Stack. func (s *TestStack) SetTCPSendBufferSize(size TCPBufferSize) error { s.TCPSendBufSize = size return nil } // TCPSACKEnabled implements Stack. func (s *TestStack) TCPSACKEnabled() (bool, error) { return s.TCPSACKFlag, nil } // SetTCPSACKEnabled implements Stack. func (s *TestStack) SetTCPSACKEnabled(enabled bool) error { s.TCPSACKFlag = enabled return nil } // TCPRecovery implements Stack. func (s *TestStack) TCPRecovery() (TCPLossRecovery, error) { return s.Recovery, nil } // SetTCPRecovery implements Stack. func (s *TestStack) SetTCPRecovery(recovery TCPLossRecovery) error { s.Recovery = recovery return nil } // Statistics implements Stack. func (s *TestStack) Statistics(stat any, arg string) error { return nil } // RouteTable implements Stack. func (s *TestStack) RouteTable() []Route { return s.RouteList } // NewRoute implements Stack. func (s *TestStack) NewRoute(ctx context.Context, msg *nlmsg.Message) *syserr.Error { return syserr.ErrNotPermitted } // Pause implements Stack. func (s *TestStack) Pause() {} // Restore implements Stack. func (s *TestStack) Restore() {} // Resume implements Stack. 
func (s *TestStack) Resume() {} // RegisteredEndpoints implements Stack. func (s *TestStack) RegisteredEndpoints() []stack.TransportEndpoint { return nil } // CleanupEndpoints implements Stack. func (s *TestStack) CleanupEndpoints() []stack.TransportEndpoint { return nil } // RestoreCleanupEndpoints implements Stack. func (s *TestStack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} // SetForwarding implements Stack. func (s *TestStack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error { s.IPForwarding = enable return nil } // PortRange implements Stack. func (*TestStack) PortRange() (uint16, uint16) { // Use the default Linux values per net/ipv4/af_inet.c:inet_init_net(). return 32768, 60999 } // SetPortRange implements Stack. func (*TestStack) SetPortRange(start uint16, end uint16) error { // No-op. return nil } // GROTimeout implements Stack. func (*TestStack) GROTimeout(NICID int32) (time.Duration, error) { // No-op. return 0, nil } // SetGROTimeout implements Stack. func (*TestStack) SetGROTimeout(NICID int32, timeout time.Duration) error { // No-op. return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/000077500000000000000000000000001465435605700222305ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/aio.go000066400000000000000000000017461465435605700233370ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/context" ) // AIOCallback is an function that does asynchronous I/O on behalf of a task. type AIOCallback func(context.Context) // QueueAIO queues an AIOCallback which will be run asynchronously. func (t *Task) QueueAIO(cb AIOCallback) { ctx := t.AsyncContext() wg := &t.TaskSet().aioGoroutines wg.Add(1) go func() { cb(ctx) wg.Done() }() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/atomicptr_bucket_slice_unsafe.go000066400000000000000000000026521465435605700306430ustar00rootroot00000000000000package kernel import ( "context" "sync/atomic" "unsafe" ) // An AtomicPtr is a pointer to a value of type Value that can be atomically // loaded and stored. The zero value of an AtomicPtr represents nil. // // Note that copying AtomicPtr by value performs a non-atomic read of the // stored pointer, which is unsafe if Store() can be called concurrently; in // this case, do `dst.Store(src.Load())` instead. // // +stateify savable type descriptorBucketSliceAtomicPtr struct { ptr unsafe.Pointer `state:".(*descriptorBucketSlice)"` } func (p *descriptorBucketSliceAtomicPtr) savePtr() *descriptorBucketSlice { return p.Load() } func (p *descriptorBucketSliceAtomicPtr) loadPtr(_ context.Context, v *descriptorBucketSlice) { p.Store(v) } // Load returns the value set by the most recent Store. It returns nil if there // has been no previous call to Store. 
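//
// A brief sketch (hypothetical use; descriptorBucketSlice is defined
// elsewhere in this package):
//
//	var p descriptorBucketSliceAtomicPtr
//	if p.Load() == nil {
//		p.Store(&descriptorBucketSlice{})
//	}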
// //go:nosplit func (p *descriptorBucketSliceAtomicPtr) Load() *descriptorBucketSlice { return (*descriptorBucketSlice)(atomic.LoadPointer(&p.ptr)) } // Store sets the value returned by Load to x. func (p *descriptorBucketSliceAtomicPtr) Store(x *descriptorBucketSlice) { atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) } // Swap atomically stores `x` into *p and returns the previous *p value. func (p *descriptorBucketSliceAtomicPtr) Swap(x *descriptorBucketSlice) *descriptorBucketSlice { return (*descriptorBucketSlice)(atomic.SwapPointer(&p.ptr, (unsafe.Pointer)(x))) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/atomicptr_bucket_unsafe.go000066400000000000000000000025371465435605700274660ustar00rootroot00000000000000package kernel import ( "context" "sync/atomic" "unsafe" ) // An AtomicPtr is a pointer to a value of type Value that can be atomically // loaded and stored. The zero value of an AtomicPtr represents nil. // // Note that copying AtomicPtr by value performs a non-atomic read of the // stored pointer, which is unsafe if Store() can be called concurrently; in // this case, do `dst.Store(src.Load())` instead. // // +stateify savable type descriptorBucketAtomicPtr struct { ptr unsafe.Pointer `state:".(*descriptorBucket)"` } func (p *descriptorBucketAtomicPtr) savePtr() *descriptorBucket { return p.Load() } func (p *descriptorBucketAtomicPtr) loadPtr(_ context.Context, v *descriptorBucket) { p.Store(v) } // Load returns the value set by the most recent Store. It returns nil if there // has been no previous call to Store. // //go:nosplit func (p *descriptorBucketAtomicPtr) Load() *descriptorBucket { return (*descriptorBucket)(atomic.LoadPointer(&p.ptr)) } // Store sets the value returned by Load to x. func (p *descriptorBucketAtomicPtr) Store(x *descriptorBucket) { atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) } // Swap atomically stores `x` into *p and returns the previous *p value. func (p *descriptorBucketAtomicPtr) Swap(x *descriptorBucket) *descriptorBucket { return (*descriptorBucket)(atomic.SwapPointer(&p.ptr, (unsafe.Pointer)(x))) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/atomicptr_descriptor_unsafe.go000066400000000000000000000024051465435605700303610ustar00rootroot00000000000000package kernel import ( "context" "sync/atomic" "unsafe" ) // An AtomicPtr is a pointer to a value of type Value that can be atomically // loaded and stored. The zero value of an AtomicPtr represents nil. // // Note that copying AtomicPtr by value performs a non-atomic read of the // stored pointer, which is unsafe if Store() can be called concurrently; in // this case, do `dst.Store(src.Load())` instead. // // +stateify savable type descriptorAtomicPtr struct { ptr unsafe.Pointer `state:".(*descriptor)"` } func (p *descriptorAtomicPtr) savePtr() *descriptor { return p.Load() } func (p *descriptorAtomicPtr) loadPtr(_ context.Context, v *descriptor) { p.Store(v) } // Load returns the value set by the most recent Store. It returns nil if there // has been no previous call to Store. // //go:nosplit func (p *descriptorAtomicPtr) Load() *descriptor { return (*descriptor)(atomic.LoadPointer(&p.ptr)) } // Store sets the value returned by Load to x. func (p *descriptorAtomicPtr) Store(x *descriptor) { atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) } // Swap atomically stores `x` into *p and returns the previous *p value. 
func (p *descriptorAtomicPtr) Swap(x *descriptor) *descriptor { return (*descriptor)(atomic.SwapPointer(&p.ptr, (unsafe.Pointer)(x))) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/000077500000000000000000000000001465435605700231715ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/atomicptr_credentials_unsafe.go000066400000000000000000000024221465435605700314400ustar00rootroot00000000000000package auth import ( "context" "sync/atomic" "unsafe" ) // An AtomicPtr is a pointer to a value of type Value that can be atomically // loaded and stored. The zero value of an AtomicPtr represents nil. // // Note that copying AtomicPtr by value performs a non-atomic read of the // stored pointer, which is unsafe if Store() can be called concurrently; in // this case, do `dst.Store(src.Load())` instead. // // +stateify savable type AtomicPtrCredentials struct { ptr unsafe.Pointer `state:".(*Credentials)"` } func (p *AtomicPtrCredentials) savePtr() *Credentials { return p.Load() } func (p *AtomicPtrCredentials) loadPtr(_ context.Context, v *Credentials) { p.Store(v) } // Load returns the value set by the most recent Store. It returns nil if there // has been no previous call to Store. // //go:nosplit func (p *AtomicPtrCredentials) Load() *Credentials { return (*Credentials)(atomic.LoadPointer(&p.ptr)) } // Store sets the value returned by Load to x. func (p *AtomicPtrCredentials) Store(x *Credentials) { atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) } // Swap atomically stores `x` into *p and returns the previous *p value. func (p *AtomicPtrCredentials) Swap(x *Credentials) *Credentials { return (*Credentials)(atomic.SwapPointer(&p.ptr, (unsafe.Pointer)(x))) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/auth.go000066400000000000000000000020061465435605700244570ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package auth implements an access control model that is a subset of Linux's. // // The auth package supports two kinds of access controls: user/group IDs and // capabilities. Each resource in the security model is associated with a user // namespace; "privileged" operations check that the operator's credentials // have the required user/group IDs or capabilities within the user namespace // of accessed resources. package auth golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/auth_abi_autogen_unsafe.go000066400000000000000000000232331465435605700303620ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package auth import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*GID)(nil) var _ marshal.Marshallable = (*UID)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. 
//go:nosplit func (gid *GID) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (gid *GID) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*gid)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (gid *GID) UnmarshalBytes(src []byte) []byte { *gid = GID(uint32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (gid *GID) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (gid *GID) MarshalUnsafe(dst []byte) []byte { size := gid.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(gid), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (gid *GID) UnmarshalUnsafe(src []byte) []byte { size := gid.SizeBytes() gohacks.Memmove(unsafe.Pointer(gid), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (gid *GID) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(gid))) hdr.Len = gid.SizeBytes() hdr.Cap = gid.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that gid // must live until the use above. runtime.KeepAlive(gid) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (gid *GID) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return gid.CopyOutN(cc, addr, gid.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (gid *GID) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(gid))) hdr.Len = gid.SizeBytes() hdr.Cap = gid.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that gid // must live until the use above. runtime.KeepAlive(gid) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (gid *GID) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return gid.CopyInN(cc, addr, gid.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (gid *GID) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(gid))) hdr.Len = gid.SizeBytes() hdr.Cap = gid.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that gid // must live until the use above. runtime.KeepAlive(gid) // escapes: replaced by intrinsic. return int64(length), err } // CopyGIDSliceIn copies in a slice of GID objects from the task's memory. 
func CopyGIDSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []GID) (int, error) { count := len(dst) if count == 0 { return 0, nil } size := (*GID)(nil).SizeBytes() ptr := unsafe.Pointer(&dst) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyInBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that dst // must live until the use above. runtime.KeepAlive(dst) // escapes: replaced by intrinsic. return length, err } // CopyGIDSliceOut copies a slice of GID objects to the task's memory. func CopyGIDSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []GID) (int, error) { count := len(src) if count == 0 { return 0, nil } size := (*GID)(nil).SizeBytes() ptr := unsafe.Pointer(&src) val := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data)) // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(val) hdr.Len = size * count hdr.Cap = size * count length, err := cc.CopyOutBytes(addr, buf) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that src // must live until the use above. runtime.KeepAlive(src) // escapes: replaced by intrinsic. return length, err } // MarshalUnsafeGIDSlice is like GID.MarshalUnsafe, but for a []GID. func MarshalUnsafeGIDSlice(src []GID, dst []byte) []byte { count := len(src) if count == 0 { return dst } size := (*GID)(nil).SizeBytes() buf := dst[:size*count] gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf))) return dst[size*count:] } // UnmarshalUnsafeGIDSlice is like GID.UnmarshalUnsafe, but for a []GID. func UnmarshalUnsafeGIDSlice(dst []GID, src []byte) []byte { count := len(dst) if count == 0 { return src } size := (*GID)(nil).SizeBytes() buf := src[:size*count] gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf))) return src[size*count:] } // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (uid *UID) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (uid *UID) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*uid)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (uid *UID) UnmarshalBytes(src []byte) []byte { *uid = UID(uint32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (uid *UID) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (uid *UID) MarshalUnsafe(dst []byte) []byte { size := uid.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(uid), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (uid *UID) UnmarshalUnsafe(src []byte) []byte { size := uid.SizeBytes() gohacks.Memmove(unsafe.Pointer(uid), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (uid *UID) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(uid))) hdr.Len = uid.SizeBytes() hdr.Cap = uid.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that uid // must live until the use above. runtime.KeepAlive(uid) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (uid *UID) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return uid.CopyOutN(cc, addr, uid.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (uid *UID) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(uid))) hdr.Len = uid.SizeBytes() hdr.Cap = uid.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that uid // must live until the use above. runtime.KeepAlive(uid) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (uid *UID) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return uid.CopyInN(cc, addr, uid.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (uid *UID) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(uid))) hdr.Len = uid.SizeBytes() hdr.Cap = uid.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that uid // must live until the use above. runtime.KeepAlive(uid) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/auth_state_autogen.go000066400000000000000000000211101465435605700273760ustar00rootroot00000000000000// automatically generated by stateify. 
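// Editor's note: illustrative sketch only; not part of the upstream gVisor
// sources in this archive. The GID/UID marshal methods defined above are
// fixed-size (4 bytes) little-endian encodings. The hypothetical helper
// below shows a round trip using only the methods whose signatures appear
// above; the helper's name is an assumption for illustration.
func exampleGIDRoundTrip() GID {
	var in GID = 1000
	buf := make([]byte, in.SizeBytes()) // SizeBytes is always 4 for GID.
	in.MarshalBytes(buf)                // writes uint32(in) in little-endian order.

	var out GID
	out.UnmarshalBytes(buf) // out now equals in (1000).
	return out
}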
package auth import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (c *Credentials) StateTypeName() string { return "pkg/sentry/kernel/auth.Credentials" } func (c *Credentials) StateFields() []string { return []string{ "RealKUID", "EffectiveKUID", "SavedKUID", "RealKGID", "EffectiveKGID", "SavedKGID", "ExtraKGIDs", "PermittedCaps", "InheritableCaps", "EffectiveCaps", "BoundingCaps", "KeepCaps", "UserNamespace", } } func (c *Credentials) beforeSave() {} // +checklocksignore func (c *Credentials) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.RealKUID) stateSinkObject.Save(1, &c.EffectiveKUID) stateSinkObject.Save(2, &c.SavedKUID) stateSinkObject.Save(3, &c.RealKGID) stateSinkObject.Save(4, &c.EffectiveKGID) stateSinkObject.Save(5, &c.SavedKGID) stateSinkObject.Save(6, &c.ExtraKGIDs) stateSinkObject.Save(7, &c.PermittedCaps) stateSinkObject.Save(8, &c.InheritableCaps) stateSinkObject.Save(9, &c.EffectiveCaps) stateSinkObject.Save(10, &c.BoundingCaps) stateSinkObject.Save(11, &c.KeepCaps) stateSinkObject.Save(12, &c.UserNamespace) } func (c *Credentials) afterLoad(context.Context) {} // +checklocksignore func (c *Credentials) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.RealKUID) stateSourceObject.Load(1, &c.EffectiveKUID) stateSourceObject.Load(2, &c.SavedKUID) stateSourceObject.Load(3, &c.RealKGID) stateSourceObject.Load(4, &c.EffectiveKGID) stateSourceObject.Load(5, &c.SavedKGID) stateSourceObject.Load(6, &c.ExtraKGIDs) stateSourceObject.Load(7, &c.PermittedCaps) stateSourceObject.Load(8, &c.InheritableCaps) stateSourceObject.Load(9, &c.EffectiveCaps) stateSourceObject.Load(10, &c.BoundingCaps) stateSourceObject.Load(11, &c.KeepCaps) stateSourceObject.Load(12, &c.UserNamespace) } func (i *IDMapEntry) StateTypeName() string { return "pkg/sentry/kernel/auth.IDMapEntry" } func (i *IDMapEntry) StateFields() []string { return []string{ "FirstID", "FirstParentID", "Length", } } func (i *IDMapEntry) beforeSave() {} // +checklocksignore func (i *IDMapEntry) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.FirstID) stateSinkObject.Save(1, &i.FirstParentID) stateSinkObject.Save(2, &i.Length) } func (i *IDMapEntry) afterLoad(context.Context) {} // +checklocksignore func (i *IDMapEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.FirstID) stateSourceObject.Load(1, &i.FirstParentID) stateSourceObject.Load(2, &i.Length) } func (r *idMapRange) StateTypeName() string { return "pkg/sentry/kernel/auth.idMapRange" } func (r *idMapRange) StateFields() []string { return []string{ "Start", "End", } } func (r *idMapRange) beforeSave() {} // +checklocksignore func (r *idMapRange) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Start) stateSinkObject.Save(1, &r.End) } func (r *idMapRange) afterLoad(context.Context) {} // +checklocksignore func (r *idMapRange) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Start) stateSourceObject.Load(1, &r.End) } func (s *idMapSet) StateTypeName() string { return "pkg/sentry/kernel/auth.idMapSet" } func (s *idMapSet) StateFields() []string { return []string{ "root", } } func (s *idMapSet) beforeSave() {} // +checklocksignore func (s *idMapSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []idMapFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *idMapSet) 
afterLoad(context.Context) {} // +checklocksignore func (s *idMapSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]idMapFlatSegment), func(y any) { s.loadRoot(ctx, y.([]idMapFlatSegment)) }) } func (n *idMapnode) StateTypeName() string { return "pkg/sentry/kernel/auth.idMapnode" } func (n *idMapnode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *idMapnode) beforeSave() {} // +checklocksignore func (n *idMapnode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *idMapnode) afterLoad(context.Context) {} // +checklocksignore func (n *idMapnode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (i *idMapFlatSegment) StateTypeName() string { return "pkg/sentry/kernel/auth.idMapFlatSegment" } func (i *idMapFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (i *idMapFlatSegment) beforeSave() {} // +checklocksignore func (i *idMapFlatSegment) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Start) stateSinkObject.Save(1, &i.End) stateSinkObject.Save(2, &i.Value) } func (i *idMapFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (i *idMapFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Start) stateSourceObject.Load(1, &i.End) stateSourceObject.Load(2, &i.Value) } func (k *Key) StateTypeName() string { return "pkg/sentry/kernel/auth.Key" } func (k *Key) StateFields() []string { return []string{ "ID", "Description", "kuid", "kgid", "perms", } } func (k *Key) beforeSave() {} // +checklocksignore func (k *Key) StateSave(stateSinkObject state.Sink) { k.beforeSave() stateSinkObject.Save(0, &k.ID) stateSinkObject.Save(1, &k.Description) stateSinkObject.Save(2, &k.kuid) stateSinkObject.Save(3, &k.kgid) stateSinkObject.Save(4, &k.perms) } func (k *Key) afterLoad(context.Context) {} // +checklocksignore func (k *Key) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &k.ID) stateSourceObject.Load(1, &k.Description) stateSourceObject.Load(2, &k.kuid) stateSourceObject.Load(3, &k.kgid) stateSourceObject.Load(4, &k.perms) } func (s *KeySet) StateTypeName() string { return "pkg/sentry/kernel/auth.KeySet" } func (s *KeySet) StateFields() []string { return []string{ "keys", } } func (s *KeySet) beforeSave() {} // +checklocksignore func (s *KeySet) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.keys) } func (s *KeySet) afterLoad(context.Context) {} // +checklocksignore func (s *KeySet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.keys) } func (ns *UserNamespace) StateTypeName() string { return "pkg/sentry/kernel/auth.UserNamespace" } func (ns *UserNamespace) 
StateFields() []string { return []string{ "parent", "owner", "Keys", "uidMapFromParent", "uidMapToParent", "gidMapFromParent", "gidMapToParent", } } func (ns *UserNamespace) beforeSave() {} // +checklocksignore func (ns *UserNamespace) StateSave(stateSinkObject state.Sink) { ns.beforeSave() stateSinkObject.Save(0, &ns.parent) stateSinkObject.Save(1, &ns.owner) stateSinkObject.Save(2, &ns.Keys) stateSinkObject.Save(3, &ns.uidMapFromParent) stateSinkObject.Save(4, &ns.uidMapToParent) stateSinkObject.Save(5, &ns.gidMapFromParent) stateSinkObject.Save(6, &ns.gidMapToParent) } func (ns *UserNamespace) afterLoad(context.Context) {} // +checklocksignore func (ns *UserNamespace) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ns.parent) stateSourceObject.Load(1, &ns.owner) stateSourceObject.Load(2, &ns.Keys) stateSourceObject.Load(3, &ns.uidMapFromParent) stateSourceObject.Load(4, &ns.uidMapToParent) stateSourceObject.Load(5, &ns.gidMapFromParent) stateSourceObject.Load(6, &ns.gidMapToParent) } func init() { state.Register((*Credentials)(nil)) state.Register((*IDMapEntry)(nil)) state.Register((*idMapRange)(nil)) state.Register((*idMapSet)(nil)) state.Register((*idMapnode)(nil)) state.Register((*idMapFlatSegment)(nil)) state.Register((*Key)(nil)) state.Register((*KeySet)(nil)) state.Register((*UserNamespace)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/auth_unsafe_abi_autogen_unsafe.go000066400000000000000000000001441465435605700317170ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package auth import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/auth_unsafe_state_autogen.go000066400000000000000000000016131465435605700307450ustar00rootroot00000000000000// automatically generated by stateify. package auth import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *AtomicPtrCredentials) StateTypeName() string { return "pkg/sentry/kernel/auth.AtomicPtrCredentials" } func (p *AtomicPtrCredentials) StateFields() []string { return []string{ "ptr", } } func (p *AtomicPtrCredentials) beforeSave() {} // +checklocksignore func (p *AtomicPtrCredentials) StateSave(stateSinkObject state.Sink) { p.beforeSave() var ptrValue *Credentials ptrValue = p.savePtr() stateSinkObject.SaveValue(0, ptrValue) } func (p *AtomicPtrCredentials) afterLoad(context.Context) {} // +checklocksignore func (p *AtomicPtrCredentials) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new(*Credentials), func(y any) { p.loadPtr(ctx, y.(*Credentials)) }) } func init() { state.Register((*AtomicPtrCredentials)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/capability_set.go000066400000000000000000000130451465435605700265170ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
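// Editor's note: illustrative sketch only; not part of the upstream gVisor
// sources in this archive. The generated stateify methods above all follow
// the same shape: StateFields lists field names, StateSave writes each field
// at the index matching its position in that list, and StateLoad reads them
// back at the same indices before state.Register makes the type known to the
// save/restore machinery. A hand-written equivalent for a hypothetical
// two-field type (names are assumptions; beforeSave/afterLoad hooks omitted)
// would look roughly like:
type exampleSaved struct {
	ID   uint32
	Name string
}

func (e *exampleSaved) StateTypeName() string { return "example.exampleSaved" }

func (e *exampleSaved) StateFields() []string { return []string{"ID", "Name"} }

func (e *exampleSaved) StateSave(stateSinkObject state.Sink) {
	stateSinkObject.Save(0, &e.ID)   // index 0 == position of "ID" in StateFields.
	stateSinkObject.Save(1, &e.Name) // index 1 == position of "Name".
}

func (e *exampleSaved) StateLoad(ctx context.Context, stateSourceObject state.Source) {
	stateSourceObject.Load(0, &e.ID)
	stateSourceObject.Load(1, &e.Name)
}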
package auth import ( "encoding/binary" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // A CapabilitySet is a set of capabilities implemented as a bitset. The zero // value of CapabilitySet is a set containing no capabilities. type CapabilitySet uint64 // VfsCapData is equivalent to Linux's cpu_vfs_cap_data, defined // in Linux's include/linux/capability.h. type VfsCapData struct { MagicEtc uint32 RootID uint32 Permitted CapabilitySet Inheritable CapabilitySet } // AllCapabilities is a CapabilitySet containing all valid capabilities. var AllCapabilities = CapabilitySetOf(linux.CAP_LAST_CAP+1) - 1 // CapabilitySetOf returns a CapabilitySet containing only the given // capability. func CapabilitySetOf(cp linux.Capability) CapabilitySet { return CapabilitySet(bits.MaskOf64(int(cp))) } // CapabilitySetOfMany returns a CapabilitySet containing the given capabilities. func CapabilitySetOfMany(cps []linux.Capability) CapabilitySet { var cs uint64 for _, cp := range cps { cs |= bits.MaskOf64(int(cp)) } return CapabilitySet(cs) } // VfsCapDataOf returns a VfsCapData containing the file capabilities for the given slice of bytes. // For each field of the cap data, which are in the structure of either vfs_cap_data or vfs_ns_cap_data, // the bytes are ordered in little endian. func VfsCapDataOf(data []byte) (VfsCapData, error) { var capData VfsCapData size := len(data) if size < linux.XATTR_CAPS_SZ_1 { return capData, fmt.Errorf("the size of security.capability is too small, actual size: %v", size) } capData.MagicEtc = binary.LittleEndian.Uint32(data[:4]) capData.Permitted = CapabilitySet(binary.LittleEndian.Uint32(data[4:8])) capData.Inheritable = CapabilitySet(binary.LittleEndian.Uint32(data[8:12])) // The version of the file capabilities takes first 4 bytes of the given // slice. version := capData.MagicEtc & linux.VFS_CAP_REVISION_MASK switch { case version == linux.VFS_CAP_REVISION_3 && size >= linux.XATTR_CAPS_SZ_3: // Like version 2 file capabilities, version 3 capability // masks are 64 bits in size. In addition, version 3 has // the root user ID of namespace, which is encoded in the // security.capability extended attribute. capData.RootID = binary.LittleEndian.Uint32(data[20:24]) fallthrough case version == linux.VFS_CAP_REVISION_2 && size >= linux.XATTR_CAPS_SZ_2: capData.Permitted += CapabilitySet(binary.LittleEndian.Uint32(data[12:16])) << 32 capData.Inheritable += CapabilitySet(binary.LittleEndian.Uint32(data[16:20])) << 32 default: return VfsCapData{}, fmt.Errorf("VFS_CAP_REVISION_%v with cap data size %v is not supported", version, size) } return capData, nil } // CapsFromVfsCaps returns a copy of the given creds with new capability sets // by applying the file capability that is specified by capData. func CapsFromVfsCaps(capData VfsCapData, creds *Credentials) (*Credentials, error) { // If the real or effective user ID of the process is root, // the file inheritable and permitted sets are ignored from // `Capabilities and execution of programs by root` at capabilities(7). if root := creds.UserNamespace.MapToKUID(RootUID); creds.EffectiveKUID == root || creds.RealKUID == root { return creds, nil } effective := (capData.MagicEtc & linux.VFS_CAP_FLAGS_EFFECTIVE) > 0 permittedCaps := (capData.Permitted & creds.BoundingCaps) | (capData.Inheritable & creds.InheritableCaps) // P'(effective) = effective ? P'(permitted) : P'(ambient). 
// The ambient capabilities has not supported yet in gVisor, // set effective capabilities to 0 when effective bit is false. effectiveCaps := CapabilitySet(0) if effective { effectiveCaps = permittedCaps } // Insufficient to execute correctly. if (capData.Permitted & ^permittedCaps) != 0 { return nil, linuxerr.EPERM } // If the capabilities don't change, it will return the creds' // original copy. if creds.PermittedCaps == permittedCaps && creds.EffectiveCaps == effectiveCaps { return creds, nil } // The credentials object is immutable. newCreds := creds.Fork() newCreds.PermittedCaps = permittedCaps newCreds.EffectiveCaps = effectiveCaps return newCreds, nil } // TaskCapabilities represents all the capability sets for a task. Each of these // sets is explained in greater detail in capabilities(7). type TaskCapabilities struct { // Permitted is a limiting superset for the effective capabilities that // the thread may assume. PermittedCaps CapabilitySet // Inheritable is a set of capabilities preserved across an execve(2). InheritableCaps CapabilitySet // Effective is the set of capabilities used by the kernel to perform // permission checks for the thread. EffectiveCaps CapabilitySet // Bounding is a limiting superset for the capabilities that a thread // can add to its inheritable set using capset(2). BoundingCaps CapabilitySet // Ambient is a set of capabilities that are preserved across an // execve(2) of a program that is not privileged. AmbientCaps CapabilitySet } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/context.go000066400000000000000000000040311465435605700252020ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package auth import ( "gvisor.dev/gvisor/pkg/context" ) // contextID is the auth package's type for context.Context.Value keys. type contextID int const ( // CtxCredentials is a Context.Value key for Credentials. CtxCredentials contextID = iota // CtxThreadGroupID is the current thread group ID when a context represents // a task context. The value is represented as an int32. CtxThreadGroupID contextID = iota ) // CredentialsFromContext returns a copy of the Credentials used by ctx, or a // set of Credentials with no capabilities if ctx does not have Credentials. func CredentialsFromContext(ctx context.Context) *Credentials { if v := ctx.Value(CtxCredentials); v != nil { return v.(*Credentials) } return NewAnonymousCredentials() } // ThreadGroupIDFromContext returns the current thread group ID when ctx // represents a task context. func ThreadGroupIDFromContext(ctx context.Context) (tgid int32, ok bool) { if tgid := ctx.Value(CtxThreadGroupID); tgid != nil { return tgid.(int32), true } return 0, false } // ContextWithCredentials returns a copy of ctx carrying creds. func ContextWithCredentials(ctx context.Context, creds *Credentials) context.Context { return &authContext{ctx, creds} } type authContext struct { context.Context creds *Credentials } // Value implements context.Context. 
func (ac *authContext) Value(key any) any { switch key { case CtxCredentials: return ac.creds default: return ac.Context.Value(key) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/credentials.go000066400000000000000000000216051465435605700260210ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package auth import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/seccheck" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" ) // Credentials contains information required to authorize privileged operations // in a user namespace. // // +stateify savable type Credentials struct { // Real/effective/saved user/group IDs in the root user namespace. None of // these should ever be NoID. RealKUID KUID EffectiveKUID KUID SavedKUID KUID RealKGID KGID EffectiveKGID KGID SavedKGID KGID // Filesystem user/group IDs are not implemented. "... setfsuid() is // nowadays unneeded and should be avoided in new applications (likewise // for setfsgid(2))." - setfsuid(2) // Supplementary groups used by set/getgroups. // // ExtraKGIDs slices are immutable, allowing multiple Credentials with the // same ExtraKGIDs to share the same slice. ExtraKGIDs []KGID // The capability sets applicable to this set of credentials. PermittedCaps CapabilitySet InheritableCaps CapabilitySet EffectiveCaps CapabilitySet BoundingCaps CapabilitySet // Ambient capabilities are not introduced until Linux 4.3. // KeepCaps is the flag for PR_SET_KEEPCAPS which allow capabilities to be // maintained after a switch from root user to non-root user via setuid(). KeepCaps bool // The user namespace associated with the owner of the credentials. UserNamespace *UserNamespace } // NewAnonymousCredentials returns a set of credentials with no capabilities in // any user namespace. func NewAnonymousCredentials() *Credentials { // Create a new root user namespace. Since the new namespace's owner is // KUID 0 and the returned credentials have non-zero KUID/KGID, the // returned credentials do not have any capabilities in the new namespace. // Since the new namespace is not part of any existing user namespace // hierarchy, the returned credentials do not have any capabilities in any // other namespace. return &Credentials{ RealKUID: NobodyKUID, EffectiveKUID: NobodyKUID, SavedKUID: NobodyKUID, RealKGID: NobodyKGID, EffectiveKGID: NobodyKGID, SavedKGID: NobodyKGID, UserNamespace: NewRootUserNamespace(), } } // NewRootCredentials returns a set of credentials with KUID and KGID 0 (i.e. // global root) in user namespace ns. func NewRootCredentials(ns *UserNamespace) *Credentials { // I can't find documentation for this anywhere, but it's correct for the // inheritable capability set to be initially empty (the capabilities test // checks for this property). 
return &Credentials{ RealKUID: RootKUID, EffectiveKUID: RootKUID, SavedKUID: RootKUID, RealKGID: RootKGID, EffectiveKGID: RootKGID, SavedKGID: RootKGID, PermittedCaps: AllCapabilities, EffectiveCaps: AllCapabilities, BoundingCaps: AllCapabilities, UserNamespace: ns, } } // NewUserCredentials returns a set of credentials based on the given UID, GIDs, // and capabilities in a given namespace. If all arguments are their zero // values, this returns the same credentials as NewRootCredentials. func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *TaskCapabilities, ns *UserNamespace) *Credentials { creds := NewRootCredentials(ns) // Set the UID. uid := kuid creds.RealKUID = uid creds.EffectiveKUID = uid creds.SavedKUID = uid // Set GID. gid := kgid creds.RealKGID = gid creds.EffectiveKGID = gid creds.SavedKGID = gid // Set additional GIDs. creds.ExtraKGIDs = append(creds.ExtraKGIDs, extraKGIDs...) // Set capabilities. if capabilities != nil { creds.PermittedCaps = capabilities.PermittedCaps creds.EffectiveCaps = capabilities.EffectiveCaps creds.BoundingCaps = capabilities.BoundingCaps creds.InheritableCaps = capabilities.InheritableCaps // TODO(gvisor.dev/issue/3166): Support ambient capabilities. } else { // If no capabilities are specified, grant capabilities consistent with // setresuid + setresgid from NewRootCredentials to the given uid and // gid. if kuid == RootKUID { creds.PermittedCaps = AllCapabilities creds.EffectiveCaps = AllCapabilities } else { creds.PermittedCaps = 0 creds.EffectiveCaps = 0 } creds.BoundingCaps = AllCapabilities } return creds } // Fork generates an identical copy of a set of credentials. func (c *Credentials) Fork() *Credentials { nc := new(Credentials) *nc = *c // Copy-by-value; this is legal for all fields. return nc } // InGroup returns true if c is in group kgid. Compare Linux's // kernel/groups.c:in_group_p(). func (c *Credentials) InGroup(kgid KGID) bool { if c.EffectiveKGID == kgid { return true } for _, extraKGID := range c.ExtraKGIDs { if extraKGID == kgid { return true } } return false } // HasCapabilityIn returns true if c has capability cp in ns. func (c *Credentials) HasCapabilityIn(cp linux.Capability, ns *UserNamespace) bool { for { // "1. A process has a capability inside a user namespace if it is a member // of that namespace and it has the capability in its effective capability // set." - user_namespaces(7) if c.UserNamespace == ns { return CapabilitySetOf(cp)&c.EffectiveCaps != 0 } // "3. ... A process that resides in the parent of the user namespace and // whose effective user ID matches the owner of the namespace has all // capabilities in the namespace." if c.UserNamespace == ns.parent && c.EffectiveKUID == ns.owner { return true } // "2. If a process has a capability in a user namespace, then it has that // capability in all child (and further removed descendant) namespaces as // well." if ns.parent == nil { return false } ns = ns.parent } } // HasCapability returns true if c has capability cp in its user namespace. func (c *Credentials) HasCapability(cp linux.Capability) bool { return c.HasCapabilityIn(cp, c.UserNamespace) } // UseUID checks that c can use uid in its user namespace, then translates it // to the root user namespace. // // The checks UseUID does are common, but you should verify that it's doing // exactly what you want. func (c *Credentials) UseUID(uid UID) (KUID, error) { // uid must be mapped. 
kuid := c.UserNamespace.MapToKUID(uid) if !kuid.Ok() { return NoID, linuxerr.EINVAL } // If c has CAP_SETUID, then it can use any UID in its user namespace. if c.HasCapability(linux.CAP_SETUID) { return kuid, nil } // Otherwise, c must already have the UID as its real, effective, or saved // set-user-ID. if kuid == c.RealKUID || kuid == c.EffectiveKUID || kuid == c.SavedKUID { return kuid, nil } return NoID, linuxerr.EPERM } // UseGID checks that c can use gid in its user namespace, then translates it // to the root user namespace. func (c *Credentials) UseGID(gid GID) (KGID, error) { kgid := c.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return NoID, linuxerr.EINVAL } if c.HasCapability(linux.CAP_SETGID) { return kgid, nil } if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID { return kgid, nil } return NoID, linuxerr.EPERM } // SetUID translates the provided uid to the root user namespace and updates c's // uids to it. This performs no permissions or capabilities checks, the caller // is responsible for ensuring the calling context is permitted to modify c. func (c *Credentials) SetUID(uid UID) error { kuid := c.UserNamespace.MapToKUID(uid) if !kuid.Ok() { return linuxerr.EINVAL } c.RealKUID = kuid c.EffectiveKUID = kuid c.SavedKUID = kuid return nil } // SetGID translates the provided gid to the root user namespace and updates c's // gids to it. This performs no permissions or capabilities checks, the caller // is responsible for ensuring the calling context is permitted to modify c. func (c *Credentials) SetGID(gid GID) error { kgid := c.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return linuxerr.EINVAL } c.RealKGID = kgid c.EffectiveKGID = kgid c.SavedKGID = kgid return nil } // LoadSeccheckData sets credential data based on mask. func (c *Credentials) LoadSeccheckData(mask seccheck.FieldMask, info *pb.ContextData) { if mask.Contains(seccheck.FieldCtxtCredentials) { info.Credentials = &pb.Credentials{ RealUid: uint32(c.RealKUID), EffectiveUid: uint32(c.EffectiveKUID), SavedUid: uint32(c.SavedKUID), RealGid: uint32(c.RealKGID), EffectiveGid: uint32(c.EffectiveKGID), SavedGid: uint32(c.SavedKGID), } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/id.go000066400000000000000000000076341465435605700241260ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package auth import ( "math" ) // UID is a user ID in an unspecified user namespace. // // +marshal type UID uint32 // GID is a group ID in an unspecified user namespace. // // +marshal slice:GIDSlice type GID uint32 // In the root user namespace, user/group IDs have a 1-to-1 relationship with // the users/groups they represent. In other user namespaces, this is not the // case; for example, two different unmapped users may both "have" the overflow // UID. This means that it is generally only valid to compare user and group // IDs in the root user namespace. We assign distinct types, KUID/KGID, to such // IDs to emphasize this distinction. 
("k" is for "key", as in "unique key". // Linux also uses the prefix "k", but I think they mean "kernel".) // KUID is a user ID in the root user namespace. type KUID uint32 // KGID is a group ID in the root user namespace. type KGID uint32 const ( // NoID is uint32(-1). -1 is consistently used as a special value, in Linux // and by extension in the auth package, to mean "no ID": // // - ID mapping returns -1 if the ID is not mapped. // // - Most set*id() syscalls accept -1 to mean "do not change this ID". NoID = math.MaxUint32 // OverflowUID is the default value of /proc/sys/kernel/overflowuid. The // "overflow UID" is usually [1] used when translating a user ID between // namespaces fails because the ID is not mapped. (We implement this // file as read-only, so the overflow UID is constant.) // // [1] "There is one notable case where unmapped user and group IDs are not // converted to the corresponding overflow ID value. When viewing a uid_map // or gid_map file in which there is no mapping for the second field, that // field is displayed as 4294967295 (-1 as an unsigned integer);" - // user_namespaces(7) OverflowUID = UID(65534) // OverflowGID is the group equivalent to OverflowUID. OverflowGID = GID(65534) // NobodyKUID is the user ID usually reserved for the least privileged user // "nobody". NobodyKUID = KUID(65534) // NobodyKGID is the group equivalent to NobodyKUID. NobodyKGID = KGID(65534) // RootKUID is the user ID usually used for the most privileged user "root". RootKUID = KUID(0) // RootKGID is the group equivalent to RootKUID. RootKGID = KGID(0) // RootUID is the root user. RootUID = UID(0) // RootGID is the root group. RootGID = GID(0) ) // Ok returns true if uid is not -1. func (uid UID) Ok() bool { return uid != NoID } // Ok returns true if gid is not -1. func (gid GID) Ok() bool { return gid != NoID } // Ok returns true if kuid is not -1. func (kuid KUID) Ok() bool { return kuid != NoID } // Ok returns true if kgid is not -1. func (kgid KGID) Ok() bool { return kgid != NoID } // OrOverflow returns uid if it is valid and the overflow UID otherwise. func (uid UID) OrOverflow() UID { if uid.Ok() { return uid } return OverflowUID } // OrOverflow returns gid if it is valid and the overflow GID otherwise. func (gid GID) OrOverflow() GID { if gid.Ok() { return gid } return OverflowGID } // In translates kuid into user namespace ns. If kuid is not mapped in ns, In // returns NoID. func (kuid KUID) In(ns *UserNamespace) UID { return ns.MapFromKUID(kuid) } // In translates kgid into user namespace ns. If kgid is not mapped in ns, In // returns NoID. func (kgid KGID) In(ns *UserNamespace) GID { return ns.MapFromKGID(kgid) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/id_map.go000066400000000000000000000221711465435605700247540ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
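// Editor's note: illustrative sketch only; not part of the upstream gVisor
// sources in this archive. The KUID/UID split described above means a
// root-namespace ID must be translated before it is shown inside a user
// namespace, and an unmapped ID comes back as NoID. The hypothetical helper
// below reports the UID that a task in ns would observe for kuid, falling
// back to the overflow UID when there is no mapping; it is equivalent to
// kuid.In(ns).OrOverflow(). The helper's name is an assumption for
// illustration.
func exampleUIDSeenIn(kuid KUID, ns *UserNamespace) UID {
	uid := kuid.In(ns) // NoID if kuid has no mapping in ns.
	if !uid.Ok() {
		return OverflowUID // value reported for unmapped IDs, per the comment above.
	}
	return uid
}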
package auth import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns. func (ns *UserNamespace) MapFromKUID(kuid KUID) UID { if ns.parent == nil { return UID(kuid) } return UID(ns.mapID(&ns.uidMapFromParent, uint32(ns.parent.MapFromKUID(kuid)))) } // MapFromKGID translates kgid, a GID in the root namespace, to a GID in ns. func (ns *UserNamespace) MapFromKGID(kgid KGID) GID { if ns.parent == nil { return GID(kgid) } return GID(ns.mapID(&ns.gidMapFromParent, uint32(ns.parent.MapFromKGID(kgid)))) } // MapToKUID translates uid, a UID in ns, to a UID in the root namespace. func (ns *UserNamespace) MapToKUID(uid UID) KUID { if ns.parent == nil { return KUID(uid) } return ns.parent.MapToKUID(UID(ns.mapID(&ns.uidMapToParent, uint32(uid)))) } // MapToKGID translates gid, a GID in ns, to a GID in the root namespace. func (ns *UserNamespace) MapToKGID(gid GID) KGID { if ns.parent == nil { return KGID(gid) } return ns.parent.MapToKGID(GID(ns.mapID(&ns.gidMapToParent, uint32(gid)))) } func (ns *UserNamespace) mapID(m *idMapSet, id uint32) uint32 { if id == NoID { return NoID } ns.mu.Lock() defer ns.mu.Unlock() if it := m.FindSegment(id); it.Ok() { return it.Value() + (id - it.Start()) } return NoID } // allIDsMapped returns true if all IDs in the range [start, end) are mapped in // m. // // Preconditions: end >= start. func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool { ns.mu.NestedLock(userNamespaceLockNs) defer ns.mu.NestedUnlock(userNamespaceLockNs) return m.SpanRange(idMapRange{start, end}) == end-start } // An IDMapEntry represents a mapping from a range of contiguous IDs in a user // namespace to an equally-sized range of contiguous IDs in the namespace's // parent. // // +stateify savable type IDMapEntry struct { // FirstID is the first ID in the range in the namespace. FirstID uint32 // FirstParentID is the first ID in the range in the parent namespace. FirstParentID uint32 // Length is the number of IDs in the range. Length uint32 } // SetUIDMap instructs ns to translate UIDs as specified by entries. // // Note: SetUIDMap does not place an upper bound on the number of entries, but // Linux does. This restriction is implemented in SetUIDMap's caller, the // implementation of /proc/[pid]/uid_map. func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) error { c := CredentialsFromContext(ctx) ns.mu.Lock() defer ns.mu.Unlock() // "After the creation of a new user namespace, the uid_map file of *one* // of the processes in the namespace may be written to *once* to define the // mapping of user IDs in the new user namespace. An attempt to write more // than once to a uid_map file in a user namespace fails with the error // EPERM. Similar rules apply for gid_map files." - user_namespaces(7) if !ns.uidMapFromParent.IsEmpty() { return linuxerr.EPERM } // "At least one line must be written to the file." if len(entries) == 0 { return linuxerr.EINVAL } // """ // In order for a process to write to the /proc/[pid]/uid_map // (/proc/[pid]/gid_map) file, all of the following requirements must be // met: // // 1. The writing process must have the CAP_SETUID (CAP_SETGID) capability // in the user namespace of the process pid. // """ if !c.HasCapabilityIn(linux.CAP_SETUID, ns) { return linuxerr.EPERM } // "2. 
The writing process must either be in the user namespace of the process // pid or be in the parent user namespace of the process pid." if c.UserNamespace != ns && c.UserNamespace != ns.parent { return linuxerr.EPERM } // """ // 3. (see trySetUIDMap) // // 4. One of the following two cases applies: // // * Either the writing process has the CAP_SETUID (CAP_SETGID) capability // in the parent user namespace. // """ if !c.HasCapabilityIn(linux.CAP_SETUID, ns.parent) { // """ // * Or otherwise all of the following restrictions apply: // // + The data written to uid_map (gid_map) must consist of a single line // that maps the writing process' effective user ID (group ID) in the // parent user namespace to a user ID (group ID) in the user namespace. // """ if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 { return linuxerr.EPERM } // """ // + The writing process must have the same effective user ID as the // process that created the user namespace. // """ if c.EffectiveKUID != ns.owner { return linuxerr.EPERM } } // trySetUIDMap leaves data in maps if it fails. if err := ns.trySetUIDMap(entries); err != nil { ns.uidMapFromParent.RemoveAll() ns.uidMapToParent.RemoveAll() return err } return nil } func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error { for _, e := range entries { // Determine upper bounds and check for overflow. This implicitly // checks for NoID. lastID := e.FirstID + e.Length if lastID <= e.FirstID { return linuxerr.EINVAL } lastParentID := e.FirstParentID + e.Length if lastParentID <= e.FirstParentID { return linuxerr.EINVAL } // "3. The mapped user IDs (group IDs) must in turn have a mapping in // the parent user namespace." // Only the root namespace has a nil parent, and root is assigned // mappings when it's created, so SetUIDMap would have returned EPERM // without reaching this point if ns is root. if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) { return linuxerr.EPERM } // If either of these Adds fail, we have an overlapping range. if !ns.uidMapFromParent.TryInsertRange(idMapRange{e.FirstParentID, lastParentID}, e.FirstID).Ok() { return linuxerr.EINVAL } if !ns.uidMapToParent.TryInsertRange(idMapRange{e.FirstID, lastID}, e.FirstParentID).Ok() { return linuxerr.EINVAL } } return nil } // SetGIDMap instructs ns to translate GIDs as specified by entries. func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) error { c := CredentialsFromContext(ctx) ns.mu.Lock() defer ns.mu.Unlock() if !ns.gidMapFromParent.IsEmpty() { return linuxerr.EPERM } if len(entries) == 0 { return linuxerr.EINVAL } if !c.HasCapabilityIn(linux.CAP_SETGID, ns) { return linuxerr.EPERM } if c.UserNamespace != ns && c.UserNamespace != ns.parent { return linuxerr.EPERM } if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) { if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 { return linuxerr.EPERM } // It's correct for this to still be UID. if c.EffectiveKUID != ns.owner { return linuxerr.EPERM } // "In the case of gid_map, use of the setgroups(2) system call must // first be denied by writing "deny" to the /proc/[pid]/setgroups file // (see below) before writing to gid_map." (This file isn't implemented // in the version of Linux we're emulating; see comment in // UserNamespace.) 
} if err := ns.trySetGIDMap(entries); err != nil { ns.gidMapFromParent.RemoveAll() ns.gidMapToParent.RemoveAll() return err } return nil } func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error { for _, e := range entries { lastID := e.FirstID + e.Length if lastID <= e.FirstID { return linuxerr.EINVAL } lastParentID := e.FirstParentID + e.Length if lastParentID <= e.FirstParentID { return linuxerr.EINVAL } if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) { return linuxerr.EPERM } if !ns.gidMapFromParent.TryInsertRange(idMapRange{e.FirstParentID, lastParentID}, e.FirstID).Ok() { return linuxerr.EINVAL } if !ns.gidMapToParent.TryInsertRange(idMapRange{e.FirstID, lastID}, e.FirstParentID).Ok() { return linuxerr.EINVAL } } return nil } // UIDMap returns the user ID mappings configured for ns. If no mappings // have been configured, UIDMap returns nil. func (ns *UserNamespace) UIDMap() []IDMapEntry { return ns.getIDMap(&ns.uidMapToParent) } // GIDMap returns the group ID mappings configured for ns. If no mappings // have been configured, GIDMap returns nil. func (ns *UserNamespace) GIDMap() []IDMapEntry { return ns.getIDMap(&ns.gidMapToParent) } func (ns *UserNamespace) getIDMap(m *idMapSet) []IDMapEntry { ns.mu.Lock() defer ns.mu.Unlock() var entries []IDMapEntry for it := m.FirstSegment(); it.Ok(); it = it.NextSegment() { entries = append(entries, IDMapEntry{ FirstID: it.Start(), FirstParentID: it.Value(), Length: it.Range().Length(), }) } return entries } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/id_map_functions.go000066400000000000000000000027771465435605700270560ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package auth // idMapFunctions "implements" generic interface segment.Functions for // idMapSet. An idMapSet maps non-overlapping ranges of contiguous IDs in one // user namespace to non-overlapping ranges of contiguous IDs in another user // namespace. Each such ID mapping is implemented as a range-to-value mapping // in the set such that [range.Start(), range.End()) => [value, value + // range.Length()). type idMapFunctions struct{} func (idMapFunctions) MinKey() uint32 { return 0 } func (idMapFunctions) MaxKey() uint32 { return NoID } func (idMapFunctions) ClearValue(*uint32) {} func (idMapFunctions) Merge(r1 idMapRange, val1 uint32, r2 idMapRange, val2 uint32) (uint32, bool) { // Mapped ranges have to be contiguous. if val1+r1.Length() != val2 { return 0, false } return val1, true } func (idMapFunctions) Split(r idMapRange, val uint32, split uint32) (uint32, uint32) { return val, val + (split - r.Start) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/id_map_range.go000066400000000000000000000033621465435605700261310ustar00rootroot00000000000000package auth // A Range represents a contiguous range of T. // // +stateify savable type idMapRange struct { // Start is the inclusive start of the range. 
Start uint32 // End is the exclusive end of the range. End uint32 } // WellFormed returns true if r.Start <= r.End. All other methods on a Range // require that the Range is well-formed. // //go:nosplit func (r idMapRange) WellFormed() bool { return r.Start <= r.End } // Length returns the length of the range. // //go:nosplit func (r idMapRange) Length() uint32 { return r.End - r.Start } // Contains returns true if r contains x. // //go:nosplit func (r idMapRange) Contains(x uint32) bool { return r.Start <= x && x < r.End } // Overlaps returns true if r and r2 overlap. // //go:nosplit func (r idMapRange) Overlaps(r2 idMapRange) bool { return r.Start < r2.End && r2.Start < r.End } // IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is // contained within r. // //go:nosplit func (r idMapRange) IsSupersetOf(r2 idMapRange) bool { return r.Start <= r2.Start && r.End >= r2.End } // Intersect returns a range consisting of the intersection between r and r2. // If r and r2 do not overlap, Intersect returns a range with unspecified // bounds, but for which Length() == 0. // //go:nosplit func (r idMapRange) Intersect(r2 idMapRange) idMapRange { if r.Start < r2.Start { r.Start = r2.Start } if r.End > r2.End { r.End = r2.End } if r.End < r.Start { r.End = r.Start } return r } // CanSplitAt returns true if it is legal to split a segment spanning the range // r at x; that is, splitting at x would produce two ranges, both of which have // non-zero length. // //go:nosplit func (r idMapRange) CanSplitAt(x uint32) bool { return r.Contains(x) && r.Start < x } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/id_map_set.go000066400000000000000000002010311465435605700256210ustar00rootroot00000000000000package auth import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const idMaptrackGaps = 0 var _ = uint8(idMaptrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type idMapdynamicGap [idMaptrackGaps]uint32 // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *idMapdynamicGap) Get() uint32 { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *idMapdynamicGap) Set(v uint32) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. idMapminDegree = 3 idMapmaxDegree = 2 * idMapminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type idMapSet struct { root idMapnode `state:".([]idMapFlatSegment)"` } // IsEmpty returns true if the set contains no segments. 
func (s *idMapSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *idMapSet) IsEmptyRange(r idMapRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *idMapSet) Span() uint32 { var sz uint32 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *idMapSet) SpanRange(r idMapRange) uint32 { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uint32 for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *idMapSet) FirstSegment() idMapIterator { if s.root.nrSegments == 0 { return idMapIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *idMapSet) LastSegment() idMapIterator { if s.root.nrSegments == 0 { return idMapIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *idMapSet) FirstGap() idMapGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return idMapGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *idMapSet) LastGap() idMapGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return idMapGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *idMapSet) Find(key uint32) (idMapIterator, idMapGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return idMapIterator{n, i}, idMapGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return idMapIterator{}, idMapGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *idMapSet) FindSegment(key uint32) idMapIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *idMapSet) LowerBoundSegment(min uint32) idMapIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. 
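//
// For example (illustrative), given a set s containing only [5, 10) => 100:
//
//	s.UpperBoundSegment(12).Ok() // true: returns the [5, 10) segment
//	s.UpperBoundSegment(7).Ok()  // true: 7 lies inside [5, 10)
//	s.UpperBoundSegment(4).Ok()  // false: no segment contains a key <= 4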
func (s *idMapSet) UpperBoundSegment(max uint32) idMapIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *idMapSet) FindGap(key uint32) idMapGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *idMapSet) LowerBoundGap(min uint32) idMapGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *idMapSet) UpperBoundGap(max uint32) idMapGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *idMapSet) FirstLargeEnoughGap(minSize uint32) idMapGapIterator { if idMaptrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *idMapSet) LastLargeEnoughGap(minSize uint32) idMapGapIterator { if idMaptrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *idMapSet) LowerBoundLargeEnoughGap(min, minSize uint32) idMapGapIterator { if idMaptrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *idMapSet) UpperBoundLargeEnoughGap(max, minSize uint32) idMapGapIterator { if idMaptrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. 
Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. func (s *idMapSet) Insert(gap idMapGapIterator, r idMapRange, val uint32) idMapIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (idMapFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := idMaptrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (idMapFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (idMapFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := idMaptrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *idMapSet) InsertWithoutMerging(gap idMapGapIterator, r idMapRange, val uint32) idMapIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *idMapSet) InsertWithoutMergingUnchecked(gap idMapGapIterator, r idMapRange, val uint32) idMapIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := idMaptrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return idMapIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. 
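//
// For example (illustrative, with s a hypothetical set), two inserts whose
// mappings are contiguous in both keys and values are coalesced into a single
// segment, because idMapFunctions.Merge only merges contiguous mappings:
//
//	s.InsertRange(idMapRange{0, 5}, 100) // set: [0, 5) => 100
//	s.InsertRange(idMapRange{5, 8}, 105) // set: [0, 8) => 100 (merged)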
// // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. // // InsertRange searches the set to find the gap to insert into. If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *idMapSet) InsertRange(r idMapRange, val uint32) idMapIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *idMapSet) InsertWithoutMergingRange(r idMapRange, val uint32) idMapIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *idMapSet) TryInsertRange(r idMapRange, val uint32) idMapIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return idMapIterator{} } if gap.End() < r.End { return idMapIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. 
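//
// For example (illustrative), given a set s already containing [0, 10) => 0:
//
//	s.TryInsertWithoutMergingRange(idMapRange{5, 15}, 100).Ok()  // false: overlaps [0, 10)
//	s.TryInsertWithoutMergingRange(idMapRange{10, 15}, 100).Ok() // true: inserted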
func (s *idMapSet) TryInsertWithoutMergingRange(r idMapRange, val uint32) idMapIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return idMapIterator{} } if gap.End() < r.End { return idMapIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *idMapSet) Remove(seg idMapIterator) idMapGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if idMaptrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) idMapFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if idMaptrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(idMapGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *idMapSet) RemoveAll() { s.root = idMapnode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *idMapSet) RemoveRange(r idMapRange) idMapGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *idMapSet) RemoveFullRange(r idMapRange) idMapGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *idMapSet) Merge(first, second idMapIterator) idMapIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. Otherwise, MergeUnchecked returns a terminal // iterator. 
// // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *idMapSet) MergeUnchecked(first, second idMapIterator) idMapIterator { if first.End() == second.Start() { if mval, ok := (idMapFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return idMapIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *idMapSet) MergePrev(seg idMapIterator) idMapIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *idMapSet) MergeNext(seg idMapIterator) idMapIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *idMapSet) Unisolate(seg idMapIterator) idMapIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *idMapSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. 
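//
// For example (illustrative): if r spans the adjacent segments [0, 5) => 100
// and [5, 8) => 105, MergeInsideRange(r) coalesces them into [0, 8) => 100,
// since the two mappings are contiguous in both keys and values.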
// // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *idMapSet) MergeInsideRange(r idMapRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *idMapSet) MergeOutsideRange(r idMapRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *idMapSet) Split(seg idMapIterator, split uint32) (idMapIterator, idMapIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *idMapSet) SplitUnchecked(seg idMapIterator, split uint32) (idMapIterator, idMapIterator) { val1, val2 := (idMapFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), idMapRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first. // // Preconditions: start < seg.End().
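//
// For example (illustrative), given seg pointing at the single segment
// [0, 10) in a set s:
//
//	seg = s.SplitBefore(seg, 4) // set: [0, 4), [4, 10); seg now refers to [4, 10)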
func (s *idMapSet) SplitBefore(seg idMapIterator, start uint32) idMapIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *idMapSet) SplitAfter(seg idMapIterator, end uint32) idMapIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *idMapSet) Isolate(seg idMapIterator, r idMapRange) idMapIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *idMapSet) LowerBoundSegmentSplitBefore(min uint32) idMapIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *idMapSet) UpperBoundSegmentSplitAfter(max uint32) idMapIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. 
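//
// For example (illustrative, with s and r hypothetical), counting how many
// IDs in r are currently mapped:
//
//	var mapped uint32
//	s.VisitRange(r, func(seg idMapIterator) bool {
//		mapped += seg.Range().Intersect(r).Length()
//		return true
//	})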
func (s *idMapSet) VisitRange(r idMapRange, f func(seg idMapIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *idMapSet) VisitFullRange(r idMapRange, f func(seg idMapIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *idMapSet) MutateRange(r idMapRange, f func(seg idMapIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *idMapSet) MutateFullRange(r idMapRange, f func(seg idMapIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type idMapnode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *idMapnode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. maxGap idMapdynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). 
keys [idMapmaxDegree - 1]idMapRange values [idMapmaxDegree - 1]uint32 children [idMapmaxDegree]*idMapnode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *idMapnode) firstSegment() idMapIterator { for n.hasChildren { n = n.children[0] } return idMapIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *idMapnode) lastSegment() idMapIterator { for n.hasChildren { n = n.children[n.nrSegments] } return idMapIterator{n, n.nrSegments - 1} } func (n *idMapnode) prevSibling() *idMapnode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *idMapnode) nextSibling() *idMapnode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *idMapnode) rebalanceBeforeInsert(gap idMapGapIterator) idMapGapIterator { if n.nrSegments < idMapmaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &idMapnode{ nrSegments: idMapminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &idMapnode{ nrSegments: idMapminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:idMapminDegree-1], n.keys[:idMapminDegree-1]) copy(left.values[:idMapminDegree-1], n.values[:idMapminDegree-1]) copy(right.keys[:idMapminDegree-1], n.keys[idMapminDegree:]) copy(right.values[:idMapminDegree-1], n.values[idMapminDegree:]) n.keys[0], n.values[0] = n.keys[idMapminDegree-1], n.values[idMapminDegree-1] idMapzeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:idMapminDegree], n.children[:idMapminDegree]) copy(right.children[:idMapminDegree], n.children[idMapminDegree:]) idMapzeroNodeSlice(n.children[2:]) for i := 0; i < idMapminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if idMaptrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < idMapminDegree { return idMapGapIterator{left, gap.index} } return idMapGapIterator{right, gap.index - idMapminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[idMapminDegree-1], n.values[idMapminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &idMapnode{ nrSegments: idMapminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ copy(sibling.keys[:idMapminDegree-1], n.keys[idMapminDegree:]) copy(sibling.values[:idMapminDegree-1], n.values[idMapminDegree:]) idMapzeroValueSlice(n.values[idMapminDegree-1:]) if n.hasChildren { copy(sibling.children[:idMapminDegree], n.children[idMapminDegree:]) 
idMapzeroNodeSlice(n.children[idMapminDegree:]) for i := 0; i < idMapminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = idMapminDegree - 1 if idMaptrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < idMapminDegree { return gap } return idMapGapIterator{sibling, gap.index - idMapminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *idMapnode) rebalanceAfterRemove(gap idMapGapIterator) idMapGapIterator { for { if n.nrSegments >= idMapminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= idMapminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] idMapFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if idMaptrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return idMapGapIterator{n, 0} } if gap.node == n { return idMapGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= idMapminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) idMapFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if idMaptrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return idMapGapIterator{n, n.nrSegments} } return idMapGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], 
right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return idMapGapIterator{p, gap.index} } if gap.node == right { return idMapGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *idMapnode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = idMapGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) idMapFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if idMaptrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *idMapnode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *idMapnode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. func (n *idMapnode) calculateMaxGapLeaf() uint32 { max := idMapGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (idMapGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. 
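//
// For example (illustrative): if n has three children whose maxGap values are
// 2, 7, and 3, calculateMaxGapInternal returns 7.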
func (n *idMapnode) calculateMaxGapInternal() uint32 { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *idMapnode) searchFirstLargeEnoughGap(minSize uint32) idMapGapIterator { if n.maxGap.Get() < minSize { return idMapGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := idMapGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *idMapnode) searchLastLargeEnoughGap(minSize uint32) idMapGapIterator { if n.maxGap.Get() < minSize { return idMapGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := idMapGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type idMapIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *idMapnode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg idMapIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg idMapIterator) Range() idMapRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg idMapIterator) Start() uint32 { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg idMapIterator) End() uint32 { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. // - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg idMapIterator) SetRangeUnchecked(r idMapRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. 
This operation does not invalidate any // iterators. func (seg idMapIterator) SetRange(r idMapRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg idMapIterator) SetStartUnchecked(start uint32) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg idMapIterator) SetStart(start uint32) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg idMapIterator) SetEndUnchecked(end uint32) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg idMapIterator) SetEnd(end uint32) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg idMapIterator) Value() uint32 { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg idMapIterator) ValuePtr() *uint32 { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. func (seg idMapIterator) SetValue(val uint32) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. 
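//
// For example (illustrative, with s a hypothetical set), iterating all
// segments in reverse order:
//
//	for seg := s.LastSegment(); seg.Ok(); seg = seg.PrevSegment() {
//		// visit seg
//	}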
func (seg idMapIterator) PrevSegment() idMapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return idMapIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return idMapIterator{} } return idMapsegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg idMapIterator) NextSegment() idMapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return idMapIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return idMapIterator{} } return idMapsegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg idMapIterator) PrevGap() idMapGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return idMapGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg idMapIterator) NextGap() idMapGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return idMapGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg idMapIterator) PrevNonEmpty() (idMapIterator, idMapGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, idMapGapIterator{} } return idMapIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg idMapIterator) NextNonEmpty() (idMapIterator, idMapGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, idMapGapIterator{} } return idMapIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type idMapGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *idMapnode index int } // Ok returns true if the iterator is not terminal. 
All other methods are only // valid for non-terminal iterators. func (gap idMapGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap idMapGapIterator) Range() idMapRange { return idMapRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap idMapGapIterator) Start() uint32 { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return idMapFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap idMapGapIterator) End() uint32 { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return idMapFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap idMapGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap idMapGapIterator) PrevSegment() idMapIterator { return idMapsegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap idMapGapIterator) NextSegment() idMapIterator { return idMapsegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap idMapGapIterator) PrevGap() idMapGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return idMapGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap idMapGapIterator) NextGap() idMapGapIterator { seg := gap.NextSegment() if !seg.Ok() { return idMapGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap idMapGapIterator) NextLargeEnoughGap(minSize uint32) idMapGapIterator { if idMaptrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. 
func (gap idMapGapIterator) nextLargeEnoughGapHelper(minSize uint32) idMapGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return idMapGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap idMapGapIterator) PrevLargeEnoughGap(minSize uint32) idMapGapIterator { if idMaptrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap idMapGapIterator) prevLargeEnoughGapHelper(minSize uint32) idMapGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return idMapGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func idMapsegmentBeforePosition(n *idMapnode, i int) idMapIterator { for i == 0 { if n.parent == nil { return idMapIterator{} } n, i = n.parent, n.parentIndex } return idMapIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func idMapsegmentAfterPosition(n *idMapnode, i int) idMapIterator { for i == n.nrSegments { if n.parent == nil { return idMapIterator{} } n, i = n.parent, n.parentIndex } return idMapIterator{n, i} } func idMapzeroValueSlice(slice []uint32) { for i := range slice { idMapFunctions{}.ClearValue(&slice[i]) } } func idMapzeroNodeSlice(slice []*idMapnode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. func (s *idMapSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. 
func (n *idMapnode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *idMapnode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if idMaptrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type idMapFlatSegment struct { Start uint32 End uint32 Value uint32 } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *idMapSet) ExportSlice() []idMapFlatSegment { var fs []idMapFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, idMapFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *idMapSet) ImportSlice(fs []idMapFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := idMapRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
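//
// As a hedged illustration of the FlatSegment helpers above (not part of the
// template itself; src is an assumed, already-populated set), a save/restore
// round trip amounts to:
//
//	flat := src.ExportSlice() // ascending, non-overlapping segments
//	var dst idMapSet
//	if err := dst.ImportSlice(flat); err != nil {
//		// Only possible if flat were unsorted or overlapping; ExportSlice
//		// output always satisfies ImportSlice's preconditions.
//		panic(err)
//	}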
func (s *idMapSet) segmentTestCheck(expectedSegments int, segFunc func(int, idMapRange, uint32) error) error { havePrev := false prev := uint32(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *idMapSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *idMapSet) saveRoot() []idMapFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *idMapSet) loadRoot(_ context.Context, fs []idMapFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/key.go000066400000000000000000000302021465435605700243050ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package auth import ( "encoding/binary" "fmt" "strings" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/rand" ) // KeySerial is a key ID type. // Only strictly positive IDs are valid key IDs. // The zero ID is meaningless but is specified when creating new keyrings. // Strictly negative IDs are used for special key IDs which are internally // translated to real key IDs (e.g. KEY_SPEC_SESSION_KEYRING is translated // to the caller process's session keyring). type KeySerial int32 // KeyType is the type of a key. // This is an enum, but is also exposed to userspace in KEYCTL_DESCRIBE. // For this reason, it must match Linux. type KeyType string // List of known key types. const ( KeyTypeKeyring KeyType = "keyring" // Other types are not yet supported. ) // KeyPermission represents a permission on a key. type KeyPermission int // List of known key permissions. const ( KeyView KeyPermission = iota KeyRead KeyWrite KeySearch KeyLink KeySetAttr ) // KeyPermissions is the full set of permissions on a single Key. type KeyPermissions uint64 const ( // MaxKeyDescSize is the maximum size of the "Description" field of keys. // Corresponds to `KEY_MAX_DESC_SIZE` in Linux. MaxKeyDescSize = 4096 // maxSetSize is the maximum number of a keys in a `Set`. // By default, Linux limits this number to 200 per non-root user. // Here, we limit it to 200 per Set, which is stricter. maxSetSize = 200 ) // Key represents a key in the keyrings subsystem. // // +stateify savable type Key struct { // ID is the ID of the key, also often referred to as "serial number". 
// Note that key IDs passed in syscalls may be negative when they refer to // "special keys", sometimes also referred to as "shortcut IDs". // Key IDs of real instantiated keys are always > 0. // The key ID never changes and is unique within a KeySet (i.e. a user // namespace). // It must be chosen with cryptographic randomness to make enumeration // attacks harder. ID KeySerial // Description is a description of the key. It is also often referred to the // "name" of the key. Keys are canonically identified by their ID, but the // syscall ABI also allows look up keys by their description. // It may not be larger than `KeyMaxDescSize`. // Confusingly, the information returned by the KEYCTL_DESCRIBE operation, // which you'd think means "get the key description", actually returns a // superset of this `Description`. Description string // kuid is the owner of the key in the root namespace. // kuid is only mutable in KeySet transactions. kuid KUID // kgid is the group of the key in the root namespace. // kgid is only mutable in KeySet transactions. kgid KGID // perms is a bitfield of key permissions. // perms is only mutable in KeySet transactions. perms KeyPermissions } // Type returns the type of this key. func (*Key) Type() KeyType { return KeyTypeKeyring } // KUID returns the KUID (owner ID) of the key. func (k *Key) KUID() KUID { return k.kuid } // KGID returns the KGID (group ID) of the key. func (k *Key) KGID() KGID { return k.kgid } // Permissions returns the permission bits of the key. func (k *Key) Permissions() KeyPermissions { return k.perms } // String is a human-friendly representation of the key. // Notably, this is *not* the string returned to userspace when requested // using `KEYCTL_DESCRIBE`. func (k *Key) String() string { return fmt.Sprintf("id=%d,perms=0x%x,desc=%q", k.ID, k.perms, k.Description) } // Bitmasks for permission checks. const ( keyPossessorPermissionsMask = 0x3f000000 keyPossessorPermissionsShift = 24 keyOwnerPermissionsMask = 0x003f0000 keyOwnerPermissionsShift = 16 keyGroupPermissionsMask = 0x00003f00 keyGroupPermissionsShift = 8 keyOtherPermissionsMask = 0x0000003f keyOtherPermissionsShift = 0 keyPermissionView = 0x00000001 keyPermissionRead = 0x00000002 keyPermissionWrite = 0x00000004 keyPermissionSearch = 0x00000008 keyPermissionLink = 0x00000010 keyPermissionSetAttr = 0x00000020 keyPermissionAll = (keyPermissionView | keyPermissionRead | keyPermissionWrite | keyPermissionSearch | keyPermissionLink | keyPermissionSetAttr) ) // String returns a human-readable version of the permission bits. func (p KeyPermissions) String() string { var perms strings.Builder for i, s := range [4]struct { kind string shift int }{ {kind: "possessor", shift: keyPossessorPermissionsShift}, {kind: "owner", shift: keyOwnerPermissionsShift}, {kind: "group", shift: keyGroupPermissionsShift}, {kind: "other", shift: keyOtherPermissionsShift}, } { if i != 0 { perms.WriteRune(',') } perms.WriteString(s.kind) perms.WriteRune('=') kindPerms := p >> s.shift for _, b := range [6]struct { mask int r rune }{ {mask: keyPermissionView, r: 'v'}, {mask: keyPermissionRead, r: 'r'}, {mask: keyPermissionWrite, r: 'w'}, {mask: keyPermissionSearch, r: 's'}, {mask: keyPermissionLink, r: 'l'}, {mask: keyPermissionSetAttr, r: 'a'}, } { if uint64(kindPerms)&uint64(b.mask) != 0 { perms.WriteRune(b.r) } else { perms.WriteRune('-') } } } return fmt.Sprintf("%08x[%s]", uint64(p), perms.String()) } // Default key settings. const ( // Default session keyring name. 
DefaultSessionKeyringName = "_ses" // Default permissions for unnamed session keyrings: // Possessors have full permissions. // Owners have view and read permissions. DefaultUnnamedSessionKeyringPermissions KeyPermissions = ((keyPermissionAll << keyPossessorPermissionsShift) | ((keyPermissionView | keyPermissionRead) << keyOwnerPermissionsShift)) // Default permissions for named session keyrings: // Possessors have full permissions. // Owners have view, read, and link permissions. DefaultNamedSessionKeyringPermissions KeyPermissions = ((keyPermissionAll << keyPossessorPermissionsShift) | ((keyPermissionView | keyPermissionRead | keyPermissionLink) << keyOwnerPermissionsShift)) ) // PossessedKeys is an opaque type used during key permission check. // When iterating over all keys, the possessed set of keys should only be // built once. Since key possession is a recursive property, it can be // expensive to determine. PossessedKeys holds all possessed keys at // the time it is computed. // PossessedKeys is short-lived; it should only live for so long as there // are no changes to the KeySet or to any key permissions. type PossessedKeys struct { // possessed is a list of possessed key IDs. possessed map[KeySerial]struct{} } // PossessedKeys returns a new fully-expanded set of PossessedKeys. // The keys passed in are the set of keys that a task directly possesses: // session keyring, process keyring, thread keyring. Each key may be nil. // PossessedKeys is short-lived; it should only live for so long as there // are no changes to the KeySet or to any key permissions. func (c *Credentials) PossessedKeys(sessionKeyring, processKeyring, threadKeyring *Key) *PossessedKeys { possessed := &PossessedKeys{possessed: make(map[KeySerial]struct{})} for _, k := range [3]*Key{sessionKeyring, processKeyring, threadKeyring} { if k == nil { continue } // The possessor still needs "search" permission in order to actually possess anything. if ((k.perms&keyPossessorPermissionsMask)>>keyPossessorPermissionsShift)&keyPermissionSearch != 0 { possessed.possessed[k.ID] = struct{}{} } } // If we implement keyrings that contain other keys, this is where the // recursion would happen. return possessed } // HasKeyPermission returns whether the credentials grant `permission` on `k`. // //go:nosplit func (c *Credentials) HasKeyPermission(k *Key, possessed *PossessedKeys, permission KeyPermission) bool { perms := k.perms & keyOtherPermissionsMask if _, ok := possessed.possessed[k.ID]; ok { perms |= (k.perms & keyPossessorPermissionsMask) >> keyPossessorPermissionsShift } if c.EffectiveKUID == k.kuid { perms |= (k.perms & keyOwnerPermissionsMask) >> keyOwnerPermissionsShift } if c.EffectiveKGID == k.kgid { perms |= (k.perms & keyGroupPermissionsMask) >> keyGroupPermissionsShift } switch permission { case KeyView: return perms&keyPermissionView != 0 case KeyRead: return perms&keyPermissionRead != 0 case KeyWrite: return perms&keyPermissionWrite != 0 case KeySearch: return perms&keyPermissionSearch != 0 case KeyLink: return perms&keyPermissionLink != 0 case KeySetAttr: return perms&keyPermissionSetAttr != 0 default: panic("unknown key permission") } } // KeySet is a set of keys. // // +stateify savable type KeySet struct { // txnMu is used for transactionality of key changes. // This blocks multiple tasks for concurrently changing the keyset or the // permissions of any keys. txnMu keysetTransactionMutex `state:"nosave"` // mu protects the fields below. // Within functions on `KeySet`, `mu` may only be locked for reading. 
// Locking `mu` for writing may only be done in `LockedKeySet` functions. mu keysetRWMutex `state:"nosave"` // keys maps key IDs to the underlying Key struct. // It is initially nil to save on heap space. // It is only initialized when doing mutable transactions on it using `Do`. keys map[KeySerial]*Key } // LockedKeySet is a KeySet in a transaction. // It exposes functions that can mutate the KeySet or its keys. type LockedKeySet struct { *KeySet } // Do executes the given function as a transaction on the KeySet. // It returns the error that `fn` returns. // This is the only function where functions that lock the KeySet.mu for // writing may be called. func (s *KeySet) Do(fn func(*LockedKeySet) error) error { s.txnMu.Lock() defer s.txnMu.Unlock() ls := &LockedKeySet{s} ls.mu.Lock() if s.keys == nil { // Initialize the map from its zero value, if it hasn't been done yet. s.keys = make(map[KeySerial]*Key) } ls.mu.Unlock() return fn(ls) } // Lookup looks up a key by ID. // Callers must exercise care to verify that the key can be accessed with // proper credentials. func (s *KeySet) Lookup(keyID KeySerial) (*Key, error) { s.mu.RLock() defer s.mu.RUnlock() key, found := s.keys[keyID] if !found { return nil, linuxerr.ENOKEY } return key, nil } // ForEach iterates over all keys. // If `fn` returns true, iteration stops immediately. // Callers must exercise care to only process keys to which they have access. func (s *KeySet) ForEach(fn func(*Key) bool) { s.mu.RLock() defer s.mu.RUnlock() for _, key := range s.keys { if fn(key) { return } } } // getNewID returns a new random key ID strictly larger than zero. // It uses cryptographic randomness in order to make enumeration attacks // harder. func getNewID() (KeySerial, error) { var newID int32 for newID == 0 { if err := binary.Read(rand.Reader, binary.LittleEndian, &newID); err != nil { return 0, err } } if newID < 0 { newID = -newID } return KeySerial(newID), nil } // Add adds a new Key to the KeySet. func (s *LockedKeySet) Add(description string, creds *Credentials, perms KeyPermissions) (*Key, error) { if len(description) >= MaxKeyDescSize { return nil, linuxerr.EINVAL } s.mu.Lock() defer s.mu.Unlock() if len(s.keys) >= maxSetSize { return nil, linuxerr.EDQUOT } newID, err := getNewID() if err != nil { return nil, err } for s.keys[newID] != nil { newID, err = getNewID() if err != nil { return nil, err } } k := &Key{ ID: newID, Description: description, kuid: creds.EffectiveKUID, kgid: creds.EffectiveKGID, perms: perms, } s.keys[newID] = k return k, nil } // SetPerms sets the permissions on a given key. // The caller must have SetAttr permission on the key. func (s *LockedKeySet) SetPerms(key *Key, newPerms KeyPermissions) { key.perms = newPerms } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/keyset_mutex.go000066400000000000000000000045271465435605700262560ustar00rootroot00000000000000package auth import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type keysetRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var keysetlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type keysetlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. 
// LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *keysetRWMutex) Lock() { locking.AddGLock(keysetprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *keysetRWMutex) NestedLock(i keysetlockNameIndex) { locking.AddGLock(keysetprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *keysetRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(keysetprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *keysetRWMutex) NestedUnlock(i keysetlockNameIndex) { m.mu.Unlock() locking.DelGLock(keysetprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *keysetRWMutex) RLock() { locking.AddGLock(keysetprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *keysetRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(keysetprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *keysetRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *keysetRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *keysetRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var keysetprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func keysetinitLockNames() {} func init() { keysetinitLockNames() keysetprefixIndex = locking.NewMutexClass(reflect.TypeOf(keysetRWMutex{}), keysetlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/keyset_transaction_mutex.go000066400000000000000000000034571465435605700306640ustar00rootroot00000000000000package auth import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type keysetTransactionMutex struct { mu sync.Mutex } var keysetTransactionprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var keysetTransactionlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type keysetTransactionlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *keysetTransactionMutex) Lock() { locking.AddGLock(keysetTransactionprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *keysetTransactionMutex) NestedLock(i keysetTransactionlockNameIndex) { locking.AddGLock(keysetTransactionprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *keysetTransactionMutex) Unlock() { locking.DelGLock(keysetTransactionprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *keysetTransactionMutex) NestedUnlock(i keysetTransactionlockNameIndex) { locking.DelGLock(keysetTransactionprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. 
func keysetTransactioninitLockNames() {} func init() { keysetTransactioninitLockNames() keysetTransactionprefixIndex = locking.NewMutexClass(reflect.TypeOf(keysetTransactionMutex{}), keysetTransactionlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/user_namespace.go000066400000000000000000000103721465435605700265150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package auth import ( "math" "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // A UserNamespace represents a user namespace. See user_namespaces(7) for // details. // // +stateify savable type UserNamespace struct { // parent is this namespace's parent. If this is the root namespace, parent // is nil. The parent pointer is immutable. parent *UserNamespace // owner is the effective UID of the namespace's creator in the root // namespace. owner is immutable. owner KUID // Keys is the set of keys in this namespace. Keys KeySet // mu protects the following fields. // // If mu will be locked in multiple UserNamespaces, it must be locked in // descendant namespaces before ancestors. mu userNamespaceMutex `state:"nosave"` // Mappings of user/group IDs between this namespace and its parent. // // All ID maps, once set, cannot be changed. This means that successful // UID/GID translations cannot be racy. uidMapFromParent idMapSet uidMapToParent idMapSet gidMapFromParent idMapSet gidMapToParent idMapSet // TODO(b/27454212): Support disabling setgroups(2). } // NewRootUserNamespace returns a UserNamespace that is appropriate for a // system's root user namespace. Note that namespaces returned by separate calls // to this function are *distinct* namespaces. Once a root namespace is created // by this function, the returned value must be reused to refer to the same // namespace. func NewRootUserNamespace() *UserNamespace { var ns UserNamespace // """ // The initial user namespace has no parent namespace, but, for // consistency, the kernel provides dummy user and group ID mapping files // for this namespace. Looking at the uid_map file (gid_map is the same) // from a shell in the initial namespace shows: // // $ cat /proc/$$/uid_map // 0 0 4294967295 // """ - user_namespaces(7) for _, m := range []*idMapSet{ &ns.uidMapFromParent, &ns.uidMapToParent, &ns.gidMapFromParent, &ns.gidMapToParent, } { // Insertion into an empty map shouldn't fail. m.InsertRange(idMapRange{0, math.MaxUint32}, 0) } return &ns } // Root returns the root of the user namespace tree containing ns. func (ns *UserNamespace) Root() *UserNamespace { for ns.parent != nil { ns = ns.parent } return ns } // "The kernel imposes (since version 3.11) a limit of 32 nested levels of user // namespaces." - user_namespaces(7) const maxUserNamespaceDepth = 32 func (ns *UserNamespace) depth() int { var i int for ns != nil { i++ ns = ns.parent } return i } // NewChildUserNamespace returns a new user namespace created by a caller with // credentials c. 
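//
// Sketch of the expected usage (creds is an assumed *Credentials; error
// values follow the Linux behavior cited in the function body):
//
//	child, err := creds.NewChildUserNamespace()
//	if err != nil {
//		// EUSERS: the 32-level nesting limit would be exceeded.
//		// EPERM:  the caller's effective UID or GID has no mapping in
//		//         its current user namespace.
//		return err
//	}
//	// child starts with no UID/GID mappings to its parent namespace.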
func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) { if c.UserNamespace.depth() >= maxUserNamespaceDepth { // "... Calls to unshare(2) or clone(2) that would cause this limit to // be exceeded fail with the error EUSERS." - user_namespaces(7) return nil, linuxerr.EUSERS } // "EPERM: CLONE_NEWUSER was specified in flags, but either the effective // user ID or the effective group ID of the caller does not have a mapping // in the parent namespace (see user_namespaces(7))." - clone(2) // "CLONE_NEWUSER requires that the user ID and group ID of the calling // process are mapped to user IDs and group IDs in the user namespace of // the calling process at the time of the call." - unshare(2) if !c.EffectiveKUID.In(c.UserNamespace).Ok() { return nil, linuxerr.EPERM } if !c.EffectiveKGID.In(c.UserNamespace).Ok() { return nil, linuxerr.EPERM } return &UserNamespace{ parent: c.UserNamespace, owner: c.EffectiveKUID, // "When a user namespace is created, it starts without a mapping of // user IDs (group IDs) to the parent user namespace." - // user_namespaces(7) }, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/auth/user_namespace_mutex.go000066400000000000000000000034561465435605700277440ustar00rootroot00000000000000package auth import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type userNamespaceMutex struct { mu sync.Mutex } var userNamespaceprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var userNamespacelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type userNamespacelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. const ( userNamespaceLockNs = userNamespacelockNameIndex(0) ) const () // Lock locks m. // +checklocksignore func (m *userNamespaceMutex) Lock() { locking.AddGLock(userNamespaceprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *userNamespaceMutex) NestedLock(i userNamespacelockNameIndex) { locking.AddGLock(userNamespaceprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *userNamespaceMutex) Unlock() { locking.DelGLock(userNamespaceprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *userNamespaceMutex) NestedUnlock(i userNamespacelockNameIndex) { locking.DelGLock(userNamespaceprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func userNamespaceinitLockNames() { userNamespacelockNames = []string{"ns"} } func init() { userNamespaceinitLockNames() userNamespaceprefixIndex = locking.NewMutexClass(reflect.TypeOf(userNamespaceMutex{}), userNamespacelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/cgroup.go000066400000000000000000000416011465435605700240600ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "bytes" "fmt" "sort" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // InvalidCgroupHierarchyID indicates an uninitialized hierarchy ID. const InvalidCgroupHierarchyID uint32 = 0 // InvalidCgroupID indicates an uninitialized cgroup ID. const InvalidCgroupID uint32 = 0 // CgroupControllerType is the name of a cgroup controller. type CgroupControllerType string // Available cgroup controllers. const ( CgroupControllerCPU = CgroupControllerType("cpu") CgroupControllerCPUAcct = CgroupControllerType("cpuacct") CgroupControllerCPUSet = CgroupControllerType("cpuset") CgroupControllerDevices = CgroupControllerType("devices") CgroupControllerJob = CgroupControllerType("job") CgroupControllerMemory = CgroupControllerType("memory") CgroupControllerPIDs = CgroupControllerType("pids") ) // CgroupCtrls is the list of cgroup controllers. var CgroupCtrls = []CgroupControllerType{"cpu", "cpuacct", "cpuset", "devices", "job", "memory", "pids"} // ParseCgroupController parses a string as a CgroupControllerType. func ParseCgroupController(val string) (CgroupControllerType, error) { switch val { case "cpu": return CgroupControllerCPU, nil case "cpuacct": return CgroupControllerCPUAcct, nil case "cpuset": return CgroupControllerCPUSet, nil case "devices": return CgroupControllerDevices, nil case "job": return CgroupControllerJob, nil case "memory": return CgroupControllerMemory, nil case "pids": return CgroupControllerPIDs, nil default: return "", fmt.Errorf("no such cgroup controller") } } // CgroupResourceType represents a resource type tracked by a particular // controller. type CgroupResourceType int // Resources for the cpuacct controller. const ( // CgroupResourcePID represents a charge for pids.current. CgroupResourcePID CgroupResourceType = iota ) // CgroupController is the common interface to cgroup controllers available to // the entire sentry. The controllers themselves are defined by cgroupfs. // // Callers of this interface are often unable access synchronization needed to // ensure returned values remain valid. Some of values returned from this // interface are thus snapshots in time, and may become stale. This is ok for // many callers like procfs. type CgroupController interface { // Returns the type of this cgroup controller (ex "memory", "cpu"). Returned // value is valid for the lifetime of the controller. Type() CgroupControllerType // Hierarchy returns the ID of the hierarchy this cgroup controller is // attached to. Returned value is valid for the lifetime of the controller. HierarchyID() uint32 // EffectiveRootCgroup returns the effective root cgroup for this // controller. This is either the actual root of the underlying cgroupfs // filesystem, or the override root configured at sandbox startup. Returned // value is valid for the lifetime of the controller. EffectiveRootCgroup() Cgroup // NumCgroups returns the number of cgroups managed by this controller. 
// Returned value is a snapshot in time. NumCgroups() uint64 // Enabled returns whether this controller is enabled. Returned value is a // snapshot in time. Enabled() bool } // Cgroup represents a named pointer to a cgroup in cgroupfs. When a task enters // a cgroup, it holds a reference on the underlying dentry pointing to the // cgroup. // // +stateify savable type Cgroup struct { *kernfs.Dentry CgroupImpl } // decRef drops a reference on the cgroup. This must happen outside a Task.mu // critical section. func (c *Cgroup) decRef() { c.Dentry.DecRef(context.Background()) } // Path returns the absolute path of c, relative to its hierarchy root. func (c *Cgroup) Path() string { return c.FSLocalPath() } // Walk returns the cgroup at p, starting from c. func (c *Cgroup) Walk(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (Cgroup, error) { d, err := c.Dentry.WalkDentryTree(ctx, vfsObj, p) if err != nil { return Cgroup{}, err } return Cgroup{ Dentry: d, CgroupImpl: d.Inode().(CgroupImpl), }, nil } // CgroupMigrationContext represents an in-flight cgroup migration for // a single task. type CgroupMigrationContext struct { src Cgroup dst Cgroup t *Task } // Abort cancels a migration. func (ctx *CgroupMigrationContext) Abort() { ctx.dst.AbortMigrate(ctx.t, &ctx.src) } // Commit completes a migration. func (ctx *CgroupMigrationContext) Commit() { ctx.dst.CommitMigrate(ctx.t, &ctx.src) ctx.t.mu.Lock() delete(ctx.t.cgroups, ctx.src) ctx.src.DecRef(ctx.t) ctx.dst.IncRef() ctx.t.cgroups[ctx.dst] = struct{}{} ctx.t.mu.Unlock() } // CgroupImpl is the common interface to cgroups. type CgroupImpl interface { // Controllers lists the controller associated with this cgroup. Controllers() []CgroupController // HierarchyID returns the id of the hierarchy that contains this cgroup. HierarchyID() uint32 // Name returns the name for this cgroup, if any. If no name was provided // when the hierarchy was created, returns "". Name() string // Enter moves t into this cgroup. Enter(t *Task) // Leave moves t out of this cgroup. Leave(t *Task) // PrepareMigrate initiates a migration of t from src to this cgroup. See // cgroupfs.controller.PrepareMigrate. PrepareMigrate(t *Task, src *Cgroup) error // CommitMigrate completes an in-flight migration. See // cgroupfs.controller.CommitMigrate. CommitMigrate(t *Task, src *Cgroup) // AbortMigrate cancels an in-flight migration. See // cgroupfs.controller.AbortMigrate. AbortMigrate(t *Task, src *Cgroup) // Charge charges a controller in this cgroup for a particular resource. key // must match a valid resource for the specified controller type. // // The implementer should silently succeed if no matching controllers are // found. // // The underlying implementation will panic if passed an incompatible // resource type for a given controller. // // See cgroupfs.controller.Charge. Charge(t *Task, d *kernfs.Dentry, ctl CgroupControllerType, res CgroupResourceType, value int64) error // ReadControlFromBackground allows a background context to read a cgroup's // control values. ReadControl(ctx context.Context, name string) (string, error) // WriteControl allows a background context to write a cgroup's control // values. WriteControl(ctx context.Context, name string, val string) error // ID returns the id of this cgroup. ID() uint32 } // hierarchy represents a cgroupfs filesystem instance, with a unique set of // controllers attached to it. Multiple cgroupfs mounts may reference the same // hierarchy. 
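//
// (Aside on the two-phase migration interface above — a hedged, in-package
// sketch, assuming the caller already holds references on both cgroups;
// migrationStillWanted is a hypothetical caller-side check:
//
//	if err := dst.PrepareMigrate(t, &src); err != nil {
//		return err // nothing to roll back
//	}
//	mig := &CgroupMigrationContext{src: src, dst: dst, t: t}
//	if migrationStillWanted() {
//		mig.Commit() // moves t's membership from src to dst
//	} else {
//		mig.Abort() // undoes the controller-side preparation
//	}
//
// In practice the context is created and driven by task-level helpers.)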
// // +stateify savable type hierarchy struct { id uint32 name string // These are a subset of the controllers in CgroupRegistry.controllers, // grouped here by hierarchy for convenient lookup. controllers map[CgroupControllerType]CgroupController // fs is not owned by hierarchy. The FS is responsible for unregistering the // hierarchy on destruction, which removes this association. fs *vfs.Filesystem } func (h *hierarchy) match(ctypes []CgroupControllerType) bool { if len(ctypes) != len(h.controllers) { return false } for _, ty := range ctypes { if _, ok := h.controllers[ty]; !ok { return false } } return true } // cgroupFS is the public interface to cgroupfs. This lets the kernel package // refer to cgroupfs.filesystem methods without directly depending on the // cgroupfs package, which would lead to a circular dependency. type cgroupFS interface { // Returns the vfs.Filesystem for the cgroupfs. VFSFilesystem() *vfs.Filesystem // InitializeHierarchyID sets the hierarchy ID for this filesystem during // filesystem creation. May only be called before the filesystem is visible // to the vfs layer. InitializeHierarchyID(hid uint32) // RootCgroup returns the root cgroup of this instance. This returns the // actual root, and ignores any overrides setting an effective root. RootCgroup() Cgroup } // CgroupRegistry tracks the active set of cgroup controllers on the system. // // +stateify savable type CgroupRegistry struct { // lastHierarchyID is the id of the last allocated cgroup hierarchy. Valid // ids are from 1 to math.MaxUint32. // lastHierarchyID atomicbitops.Uint32 // lastCgroupID is the id of the last allocated cgroup. Valid ids are // from 1 to math.MaxUint32. // lastCgroupID atomicbitops.Uint32 mu cgroupMutex `state:"nosave"` // controllers is the set of currently known cgroup controllers on the // system. // // +checklocks:mu controllers map[CgroupControllerType]CgroupController // hierarchies is the active set of cgroup hierarchies. This contains all // hierarchies on the system. // // +checklocks:mu hierarchies map[uint32]hierarchy // hierarchiesByName is a map of named hierarchies. Only named hierarchies // are tracked on this map. // // +checklocks:mu hierarchiesByName map[string]hierarchy // cgroups is the active set of cgroups. This contains all the cgroups // on the system. // // +checklocks:mu cgroups map[uint32]CgroupImpl } func newCgroupRegistry() *CgroupRegistry { return &CgroupRegistry{ controllers: make(map[CgroupControllerType]CgroupController), hierarchies: make(map[uint32]hierarchy), hierarchiesByName: make(map[string]hierarchy), cgroups: make(map[uint32]CgroupImpl), } } // nextHierarchyID returns a newly allocated, unique hierarchy ID. func (r *CgroupRegistry) nextHierarchyID() (uint32, error) { if hid := r.lastHierarchyID.Add(1); hid != 0 { return hid, nil } return InvalidCgroupHierarchyID, fmt.Errorf("cgroup hierarchy ID overflow") } // FindHierarchy returns a cgroup filesystem containing exactly the set of // controllers named in ctypes, and optionally the name specified in name if it // isn't empty. If no such FS is found, FindHierarchy return nil. FindHierarchy // takes a reference on the returned FS, which is transferred to the caller. func (r *CgroupRegistry) FindHierarchy(name string, ctypes []CgroupControllerType) (*vfs.Filesystem, error) { r.mu.Lock() defer r.mu.Unlock() // If we have a hierarchy name, lookup by name. if name != "" { h, ok := r.hierarchiesByName[name] if !ok { // Name not found. 
return nil, nil } if h.match(ctypes) { if !h.fs.TryIncRef() { // May be racing with filesystem destruction, see below. r.unregisterLocked(h.id) return nil, nil } return h.fs, nil } // Name matched, but controllers didn't. Fail per linux // kernel/cgroup.c:cgroup_mount(). log.Debugf("cgroupfs: Registry lookup for name=%s controllers=%v failed; named matched but controllers didn't (have controllers=%v)", name, ctypes, h.controllers) return nil, linuxerr.EBUSY } for _, h := range r.hierarchies { if h.match(ctypes) { if !h.fs.TryIncRef() { // Racing with filesystem destruction, namely h.fs.Release. // Since we hold r.mu, we know the hierarchy hasn't been // unregistered yet, but its associated filesystem is tearing // down. // // If we simply indicate the hierarchy wasn't found without // cleaning up the registry, the caller can race with the // unregister and find itself temporarily unable to create a new // hierarchy with a subset of the relevant controllers. // // To keep the result of FindHierarchy consistent with the // uniqueness of controllers enforced by Register, drop the // dying hierarchy now. The eventual unregister by the FS // teardown will become a no-op. r.unregisterLocked(h.id) return nil, nil } return h.fs, nil } } return nil, nil } // FindCgroup locates a cgroup with the given parameters. // // A cgroup is considered a match even if it contains other controllers on the // same hierarchy. func (r *CgroupRegistry) FindCgroup(ctx context.Context, ctype CgroupControllerType, path string) (Cgroup, error) { p := fspath.Parse(path) if !p.Absolute { return Cgroup{}, fmt.Errorf("path must be absolute") } k := KernelFromContext(ctx) vfsfs, err := r.FindHierarchy("", []CgroupControllerType{ctype}) if err != nil { return Cgroup{}, err } if vfsfs == nil { return Cgroup{}, fmt.Errorf("controller not active") } defer vfsfs.DecRef(ctx) rootCG := vfsfs.Impl().(cgroupFS).RootCgroup() if !p.HasComponents() { // Explicit root '/'. return rootCG, nil } return rootCG.Walk(ctx, k.VFS(), p) } // Register registers the provided set of controllers with the registry as a new // hierarchy. If any controller is already registered, the function returns an // error without modifying the registry. Register sets the hierarchy ID for the // filesystem on success. func (r *CgroupRegistry) Register(name string, cs []CgroupController, fs cgroupFS) error { r.mu.Lock() defer r.mu.Unlock() if name == "" && len(cs) == 0 { return fmt.Errorf("can't register hierarchy with both no controllers and no name") } for _, c := range cs { if _, ok := r.controllers[c.Type()]; ok { return fmt.Errorf("controllers may only be mounted on a single hierarchy") } } if _, ok := r.hierarchiesByName[name]; name != "" && ok { return fmt.Errorf("hierarchy named %q already exists", name) } hid, err := r.nextHierarchyID() if err != nil { return err } // Must not fail below here, once we publish the hierarchy ID. fs.InitializeHierarchyID(hid) h := hierarchy{ id: hid, name: name, controllers: make(map[CgroupControllerType]CgroupController), fs: fs.VFSFilesystem(), } for _, c := range cs { n := c.Type() r.controllers[n] = c h.controllers[n] = c } r.hierarchies[hid] = h if name != "" { r.hierarchiesByName[name] = h } return nil } // Unregister removes a previously registered hierarchy from the registry. If no // such hierarchy is registered, Unregister is a no-op. func (r *CgroupRegistry) Unregister(hid uint32) { r.mu.Lock() r.unregisterLocked(hid) r.mu.Unlock() } // Precondition: Caller must hold r.mu. 
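//
// (Aside on FindCgroup above — a minimal usage sketch; the controller and
// path are chosen purely for illustration:
//
//	cg, err := registry.FindCgroup(ctx, CgroupControllerMemory, "/user.slice")
//	if err != nil {
//		return err // controller inactive, relative path, or lookup failure
//	}
//	// cg names the memory-controller cgroup at /user.slice
//	// (reference handling elided in this sketch).
//
// End of aside.)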
// +checklocks:r.mu func (r *CgroupRegistry) unregisterLocked(hid uint32) { if h, ok := r.hierarchies[hid]; ok { for name := range h.controllers { delete(r.controllers, name) } delete(r.hierarchies, hid) } } // computeInitialGroups takes a reference on each of the returned cgroups. The // caller takes ownership of this returned reference. func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[Cgroup]struct{} { r.mu.Lock() defer r.mu.Unlock() ctlSet := make(map[CgroupControllerType]CgroupController) cgset := make(map[Cgroup]struct{}) // Remember controllers from the inherited cgroups set... for cg := range inherit { cg.IncRef() // Ref transferred to caller. for _, ctl := range cg.Controllers() { ctlSet[ctl.Type()] = ctl cgset[cg] = struct{}{} } } // ... and add the root cgroups of all the missing controllers. for name, ctl := range r.controllers { if _, ok := ctlSet[name]; !ok { cg := ctl.EffectiveRootCgroup() // Multiple controllers may share the same hierarchy, so may have // the same root cgroup. Grab a single ref per hierarchy root. if _, ok := cgset[cg]; ok { continue } cg.IncRef() // Ref transferred to caller. cgset[cg] = struct{}{} } } return cgset } // GenerateProcCgroups writes the contents of /proc/cgroups to buf. func (r *CgroupRegistry) GenerateProcCgroups(buf *bytes.Buffer) { r.mu.Lock() entries := make([]string, 0, len(r.controllers)) for _, c := range r.controllers { en := 0 if c.Enabled() { en = 1 } entries = append(entries, fmt.Sprintf("%s\t%d\t%d\t%d\n", c.Type(), c.HierarchyID(), c.NumCgroups(), en)) } r.mu.Unlock() sort.Strings(entries) fmt.Fprint(buf, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n") for _, e := range entries { fmt.Fprint(buf, e) } } // NextCgroupID returns a newly allocated, unique cgroup ID. func (r *CgroupRegistry) NextCgroupID() (uint32, error) { if cid := r.lastCgroupID.Add(1); cid != 0 { return cid, nil } return InvalidCgroupID, fmt.Errorf("cgroup ID overflow") } // AddCgroup adds the ID and cgroup in the map. func (r *CgroupRegistry) AddCgroup(cg CgroupImpl) { r.mu.Lock() r.cgroups[cg.ID()] = cg r.mu.Unlock() } // GetCgroup returns the cgroup associated with the cgroup ID. func (r *CgroupRegistry) GetCgroup(cid uint32) (CgroupImpl, error) { r.mu.Lock() defer r.mu.Unlock() cg, ok := r.cgroups[cid] if !ok { return nil, fmt.Errorf("cgroup with ID %d does not exist", cid) } return cg, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/cgroup_mounts_mutex.go000066400000000000000000000033221465435605700267050ustar00rootroot00000000000000package kernel import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type cgroupMountsMutex struct { mu sync.Mutex } var cgroupMountsprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var cgroupMountslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type cgroupMountslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *cgroupMountsMutex) Lock() { locking.AddGLock(cgroupMountsprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. 
// +checklocksignore func (m *cgroupMountsMutex) NestedLock(i cgroupMountslockNameIndex) { locking.AddGLock(cgroupMountsprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *cgroupMountsMutex) Unlock() { locking.DelGLock(cgroupMountsprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *cgroupMountsMutex) NestedUnlock(i cgroupMountslockNameIndex) { locking.DelGLock(cgroupMountsprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func cgroupMountsinitLockNames() {} func init() { cgroupMountsinitLockNames() cgroupMountsprefixIndex = locking.NewMutexClass(reflect.TypeOf(cgroupMountsMutex{}), cgroupMountslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/cgroup_mutex.go000066400000000000000000000031401465435605700252760ustar00rootroot00000000000000package kernel import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type cgroupMutex struct { mu sync.Mutex } var cgroupprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var cgrouplockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type cgrouplockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *cgroupMutex) Lock() { locking.AddGLock(cgroupprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *cgroupMutex) NestedLock(i cgrouplockNameIndex) { locking.AddGLock(cgroupprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *cgroupMutex) Unlock() { locking.DelGLock(cgroupprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *cgroupMutex) NestedUnlock(i cgrouplockNameIndex) { locking.DelGLock(cgroupprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func cgroupinitLockNames() {} func init() { cgroupinitLockNames() cgroupprefixIndex = locking.NewMutexClass(reflect.TypeOf(cgroupMutex{}), cgrouplockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/context.go000066400000000000000000000055101465435605700242440ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ) // contextID is the kernel package's type for context.Context.Value keys. 
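//
// A brief sketch of how these keys are consumed (mirroring the typed
// accessors below): code holding a context never touches the keys directly,
// it calls the helper and checks for nil.
//
//	if k := KernelFromContext(ctx); k != nil {
//		// ctx was created by, or descends from, a task or the kernel itself.
//	}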
type contextID int const ( // CtxCanTrace is a Context.Value key for a function with the same // signature and semantics as kernel.Task.CanTrace. CtxCanTrace contextID = iota // CtxKernel is a Context.Value key for a Kernel. CtxKernel // CtxPIDNamespace is a Context.Value key for a PIDNamespace. CtxPIDNamespace // CtxTask is a Context.Value key for a Task. CtxTask // CtxUTSNamespace is a Context.Value key for a UTSNamespace. CtxUTSNamespace ) // ContextCanTrace returns true if ctx is permitted to trace t, in the same sense // as kernel.Task.CanTrace. func ContextCanTrace(ctx context.Context, t *Task, attach bool) bool { if v := ctx.Value(CtxCanTrace); v != nil { return v.(func(*Task, bool) bool)(t, attach) } return false } // KernelFromContext returns the Kernel in which ctx is executing, or nil if // there is no such Kernel. func KernelFromContext(ctx context.Context) *Kernel { if v := ctx.Value(CtxKernel); v != nil { return v.(*Kernel) } return nil } // PIDNamespaceFromContext returns the PID namespace in which ctx is executing, // or nil if there is no such PID namespace. func PIDNamespaceFromContext(ctx context.Context) *PIDNamespace { if v := ctx.Value(CtxPIDNamespace); v != nil { return v.(*PIDNamespace) } return nil } // UTSNamespaceFromContext returns the UTS namespace in which ctx is executing, // or nil if there is no such UTS namespace. func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { if v := ctx.Value(CtxUTSNamespace); v != nil { return v.(*UTSNamespace) } return nil } // IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, // or nil if there is no such IPC namespace. It takes a reference on the // namespace. func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { if v := ctx.Value(ipc.CtxIPCNamespace); v != nil { return v.(*IPCNamespace) } return nil } // TaskFromContext returns the Task associated with ctx, or nil if there is no // such Task. func TaskFromContext(ctx context.Context) *Task { if v := ctx.Value(CtxTask); v != nil { return v.(*Task) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/cpu_clock_mutex.go000066400000000000000000000032061465435605700257440ustar00rootroot00000000000000package kernel import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type cpuClockMutex struct { mu sync.Mutex } var cpuClockprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var cpuClocklockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type cpuClocklockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *cpuClockMutex) Lock() { locking.AddGLock(cpuClockprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *cpuClockMutex) NestedLock(i cpuClocklockNameIndex) { locking.AddGLock(cpuClockprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *cpuClockMutex) Unlock() { locking.DelGLock(cpuClockprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. 
// +checklocksignore func (m *cpuClockMutex) NestedUnlock(i cpuClocklockNameIndex) { locking.DelGLock(cpuClockprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func cpuClockinitLockNames() {} func init() { cpuClockinitLockNames() cpuClockprefixIndex = locking.NewMutexClass(reflect.TypeOf(cpuClockMutex{}), cpuClocklockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/fasync/000077500000000000000000000000001465435605700235135ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/fasync/fasync.go000066400000000000000000000156241465435605700253350ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package fasync provides FIOASYNC related functionality. package fasync import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/waiter" ) // Table to convert waiter event masks into si_band siginfo codes. // Taken from fs/fcntl.c:band_table. var bandTable = map[waiter.EventMask]int64{ // POLL_IN waiter.EventIn: linux.EPOLLIN | linux.EPOLLRDNORM, // POLL_OUT waiter.EventOut: linux.EPOLLOUT | linux.EPOLLWRNORM | linux.EPOLLWRBAND, // POLL_ERR waiter.EventErr: linux.EPOLLERR, // POLL_PRI waiter.EventPri: linux.EPOLLPRI | linux.EPOLLRDBAND, // POLL_HUP waiter.EventHUp: linux.EPOLLHUP | linux.EPOLLERR, } // New returns a function that creates a new vfs.FileAsync with the given // file descriptor. func New(fd int) func() vfs.FileAsync { return func() vfs.FileAsync { return &FileAsync{fd: fd} } } // FileAsync sends signals when the registered file is ready for IO. // // +stateify savable type FileAsync struct { // e is immutable after first use (which is protected by mu below). e waiter.Entry // fd is the file descriptor to notify about. // It is immutable, set at allocation time. This matches Linux semantics in // fs/fcntl.c:fasync_helper. // The fd value is passed to the signal recipient in siginfo.si_fd. fd int // regMu protects registration and unregistration actions on e. // // regMu must be held while registration decisions are being made // through the registration action itself. // // Lock ordering: regMu, mu. regMu regMutex `state:"nosave"` // mu protects all following fields. // // Lock ordering: e.mu, mu. mu fileMutex `state:"nosave"` requester *auth.Credentials registered bool // signal is the signal to deliver upon I/O being available. // The default value ("zero signal") means the default SIGIO signal will be // delivered. signal linux.Signal // Only one of the following is allowed to be non-nil. recipientPG *kernel.ProcessGroup recipientTG *kernel.ThreadGroup recipientT *kernel.Task } // NotifyEvent implements waiter.EventListener.NotifyEvent. 
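//
// Worked example of the band computation: if the registered file becomes
// readable and then hangs up, mask contains waiter.EventIn|waiter.EventHUp,
// so per bandTable above the si_band reported alongside a customized signal
// is
//
//	EPOLLIN|EPOLLRDNORM | EPOLLHUP|EPOLLERR
//
// With the default SIGIO (signal not customized), no band or fd payload is
// attached to the siginfo.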
func (a *FileAsync) NotifyEvent(mask waiter.EventMask) { a.mu.Lock() if !a.registered { a.mu.Unlock() return } // Read all the required fields which are lock protected from FileAsync // and release the lock. t := a.recipientT tg := a.recipientTG creds := a.requester sig := a.signal if a.recipientPG != nil { tg = a.recipientPG.Originator() } a.mu.Unlock() if tg != nil { t = tg.Leader() } if t == nil { // No recipient has been registered. return } tCreds := t.Credentials() // Logic from sigio_perm in fs/fcntl.c. permCheck := (creds.EffectiveKUID == 0 || creds.EffectiveKUID == tCreds.SavedKUID || creds.EffectiveKUID == tCreds.RealKUID || creds.RealKUID == tCreds.SavedKUID || creds.RealKUID == tCreds.RealKUID) if !permCheck { return } signalInfo := &linux.SignalInfo{ Signo: int32(linux.SIGIO), Code: linux.SI_KERNEL, } if sig != 0 { signalInfo.Signo = int32(sig) signalInfo.SetFD(uint32(a.fd)) var band int64 for m, bandCode := range bandTable { if m&mask != 0 { band |= bandCode } } signalInfo.SetBand(band) } if tg != nil { t.SendGroupSignal(signalInfo) } else { t.SendSignal(signalInfo) } } // Register sets the file which will be monitored for IO events. // // The file must not be currently registered. func (a *FileAsync) Register(w waiter.Waitable) error { a.regMu.Lock() defer a.regMu.Unlock() a.mu.Lock() if a.registered { a.mu.Unlock() panic("registering already registered file") } a.e.Init(a, waiter.ReadableEvents|waiter.WritableEvents|waiter.EventErr|waiter.EventHUp) a.registered = true a.mu.Unlock() return w.EventRegister(&a.e) } // Unregister stops monitoring a file. // // The file must be currently registered. func (a *FileAsync) Unregister(w waiter.Waitable) { a.regMu.Lock() defer a.regMu.Unlock() a.mu.Lock() if !a.registered { a.mu.Unlock() panic("unregistering unregistered file") } a.registered = false a.mu.Unlock() w.EventUnregister(&a.e) } // Owner returns who is currently getting signals. All return values will be // nil if no one is set to receive signals. func (a *FileAsync) Owner() (*kernel.Task, *kernel.ThreadGroup, *kernel.ProcessGroup) { a.mu.Lock() defer a.mu.Unlock() return a.recipientT, a.recipientTG, a.recipientPG } // SetOwnerTask sets the owner (who will receive signals) to a specified task. // Only this owner will receive signals. func (a *FileAsync) SetOwnerTask(requester *kernel.Task, recipient *kernel.Task) { a.mu.Lock() defer a.mu.Unlock() a.requester = requester.Credentials() a.recipientT = recipient a.recipientTG = nil a.recipientPG = nil } // SetOwnerThreadGroup sets the owner (who will receive signals) to a specified // thread group. Only this owner will receive signals. func (a *FileAsync) SetOwnerThreadGroup(requester *kernel.Task, recipient *kernel.ThreadGroup) { a.mu.Lock() defer a.mu.Unlock() a.requester = requester.Credentials() a.recipientT = nil a.recipientTG = recipient a.recipientPG = nil } // SetOwnerProcessGroup sets the owner (who will receive signals) to a // specified process group. Only this owner will receive signals. func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kernel.ProcessGroup) { a.mu.Lock() defer a.mu.Unlock() a.requester = requester.Credentials() a.recipientT = nil a.recipientTG = nil a.recipientPG = recipient } // ClearOwner unsets the current signal recipient. func (a *FileAsync) ClearOwner() { a.mu.Lock() defer a.mu.Unlock() a.requester = nil a.recipientT = nil a.recipientTG = nil a.recipientPG = nil } // Signal returns which signal will be sent to the signal recipient. 
// A value of zero means the signal to deliver wasn't customized, which means // the default signal (SIGIO) will be delivered. func (a *FileAsync) Signal() linux.Signal { a.mu.Lock() defer a.mu.Unlock() return a.signal } // SetSignal overrides which signal to send when I/O is available. // The default behavior can be reset by specifying signal zero, which means // to send SIGIO. func (a *FileAsync) SetSignal(signal linux.Signal) error { if signal != 0 && !signal.IsValid() { return linuxerr.EINVAL } a.mu.Lock() defer a.mu.Unlock() a.signal = signal return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/fasync/fasync_state_autogen.go000066400000000000000000000025021465435605700302460ustar00rootroot00000000000000// automatically generated by stateify. package fasync import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (a *FileAsync) StateTypeName() string { return "pkg/sentry/kernel/fasync.FileAsync" } func (a *FileAsync) StateFields() []string { return []string{ "e", "fd", "requester", "registered", "signal", "recipientPG", "recipientTG", "recipientT", } } func (a *FileAsync) beforeSave() {} // +checklocksignore func (a *FileAsync) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.e) stateSinkObject.Save(1, &a.fd) stateSinkObject.Save(2, &a.requester) stateSinkObject.Save(3, &a.registered) stateSinkObject.Save(4, &a.signal) stateSinkObject.Save(5, &a.recipientPG) stateSinkObject.Save(6, &a.recipientTG) stateSinkObject.Save(7, &a.recipientT) } func (a *FileAsync) afterLoad(context.Context) {} // +checklocksignore func (a *FileAsync) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.e) stateSourceObject.Load(1, &a.fd) stateSourceObject.Load(2, &a.requester) stateSourceObject.Load(3, &a.registered) stateSourceObject.Load(4, &a.signal) stateSourceObject.Load(5, &a.recipientPG) stateSourceObject.Load(6, &a.recipientTG) stateSourceObject.Load(7, &a.recipientT) } func init() { state.Register((*FileAsync)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/fasync/file_mutex.go000066400000000000000000000030721465435605700262050ustar00rootroot00000000000000package fasync import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type fileMutex struct { mu sync.Mutex } var fileprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var filelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type filelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *fileMutex) Lock() { locking.AddGLock(fileprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *fileMutex) NestedLock(i filelockNameIndex) { locking.AddGLock(fileprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *fileMutex) Unlock() { locking.DelGLock(fileprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. 
// +checklocksignore func (m *fileMutex) NestedUnlock(i filelockNameIndex) { locking.DelGLock(fileprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func fileinitLockNames() {} func init() { fileinitLockNames() fileprefixIndex = locking.NewMutexClass(reflect.TypeOf(fileMutex{}), filelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/fasync/reg_mutex.go000066400000000000000000000030471465435605700260450ustar00rootroot00000000000000package fasync import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type regMutex struct { mu sync.Mutex } var regprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var reglockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type reglockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *regMutex) Lock() { locking.AddGLock(regprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *regMutex) NestedLock(i reglockNameIndex) { locking.AddGLock(regprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *regMutex) Unlock() { locking.DelGLock(regprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *regMutex) NestedUnlock(i reglockNameIndex) { locking.DelGLock(regprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func reginitLockNames() {} func init() { reginitLockNames() regprefixIndex = locking.NewMutexClass(reflect.TypeOf(regMutex{}), reglockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/fd_table.go000066400000000000000000000344141465435605700243250ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( goContext "context" "fmt" "math" "strings" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bitmap" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/lock" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // FDFlags define flags for an individual descriptor. // // +stateify savable type FDFlags struct { // CloseOnExec indicates the descriptor should be closed on exec. CloseOnExec bool } // ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags // representation. 
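// For example (illustrative): FDFlags{CloseOnExec: true}.ToLinuxFileFlags() yields linux.O_CLOEXEC, while the zero value yields 0.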
func (f FDFlags) ToLinuxFileFlags() (mask uint) { if f.CloseOnExec { mask |= linux.O_CLOEXEC } return } // ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags // representation. func (f FDFlags) ToLinuxFDFlags() (mask uint) { if f.CloseOnExec { mask |= linux.FD_CLOEXEC } return } // descriptor holds the details about a file descriptor, namely a pointer to // the file itself and the descriptor flags. // // Note that this is immutable and can only be changed via operations on the // descriptorTable. // // +stateify savable type descriptor struct { file *vfs.FileDescription flags FDFlags } // MaxFdLimit defines the upper limit on the integer value of file descriptors. const MaxFdLimit int32 = int32(bitmap.MaxBitEntryLimit) // FDTable is used to manage File references and flags. // // +stateify savable type FDTable struct { FDTableRefs k *Kernel // mu protects below. mu fdTableMutex `state:"nosave"` // fdBitmap shows which fds are already in use. fdBitmap bitmap.Bitmap `state:"nosave"` // descriptorTable holds descriptors. descriptorTable `state:".(map[int32]descriptor)"` } func (f *FDTable) saveDescriptorTable() map[int32]descriptor { m := make(map[int32]descriptor) f.mu.Lock() defer f.mu.Unlock() f.ForEach(context.Background(), func(fd int32, file *vfs.FileDescription, flags FDFlags) bool { m[fd] = descriptor{ file: file, flags: flags, } return true }) return m } func (f *FDTable) loadDescriptorTable(_ goContext.Context, m map[int32]descriptor) { ctx := context.Background() f.initNoLeakCheck() // Initialize table. f.fdBitmap = bitmap.New(uint32(math.MaxUint16)) for fd, d := range m { if fd < 0 { panic(fmt.Sprintf("FD is not supposed to be negative. FD: %d", fd)) } if df := f.set(fd, d.file, d.flags); df != nil { panic("file set") } f.fdBitmap.Add(uint32(fd)) // Note that we do _not_ need to acquire a extra table reference here. The // table reference will already be accounted for in the file, so we drop the // reference taken by set above. if d.file != nil { d.file.DecRef(ctx) } } } // Release any POSIX lock possibly held by the FDTable. func (f *FDTable) fileUnlock(ctx context.Context, file *vfs.FileDescription) { if file.SupportsLocks() { err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF}) if err != nil && !linuxerr.Equals(linuxerr.ENOLCK, err) { panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) } } } // NewFDTable allocates a new FDTable that may be used by tasks in k. func (k *Kernel) NewFDTable() *FDTable { f := &FDTable{k: k} f.init() return f } // DecRef implements RefCounter.DecRef. // // If f reaches zero references, all of its file descriptors are removed. func (f *FDTable) DecRef(ctx context.Context) { f.FDTableRefs.DecRef(func() { f.RemoveIf(ctx, func(*vfs.FileDescription, FDFlags) bool { return true }) }) } // forEachUpTo iterates over all non-nil files upto maxFds (non-inclusive) in sorted order. // // It is the caller's responsibility to acquire an appropriate lock. func (f *FDTable) forEachUpTo(ctx context.Context, maxFd int32, fn func(fd int32, file *vfs.FileDescription, flags FDFlags) bool) { // Iterate through the fdBitmap. f.fdBitmap.ForEach(0, uint32(maxFd), func(ufd uint32) bool { fd := int32(ufd) file, flags, ok := f.get(fd) if !ok || file == nil { return true } if !file.TryIncRef() { return true } defer file.DecRef(ctx) return fn(fd, file, flags) }) } // ForEach iterates over all non-nil files upto maxFd in sorted order. // // It is the caller's responsibility to acquire an appropriate lock. 
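// Illustrative use: GetFDs (below) builds its sorted list of fds by calling ForEach and appending every fd it visits.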
func (f *FDTable) ForEach(ctx context.Context, fn func(fd int32, file *vfs.FileDescription, flags FDFlags) bool) { f.forEachUpTo(ctx, MaxFdLimit, fn) } // String is a stringer for FDTable. func (f *FDTable) String() string { var buf strings.Builder ctx := context.Background() files := make(map[int32]*vfs.FileDescription) f.mu.Lock() // Can't release f.mu from defer, because vfsObj.PathnameWithDeleted // should not be called under the fdtable mutex. f.ForEach(ctx, func(fd int32, file *vfs.FileDescription, flags FDFlags) bool { if file != nil { file.IncRef() files[fd] = file } return true }) f.mu.Unlock() defer func() { for _, f := range files { f.DecRef(ctx) } }() for fd, file := range files { vfsObj := file.Mount().Filesystem().VirtualFilesystem() vd := file.VirtualDentry() if vd.Dentry() == nil { panic(fmt.Sprintf("fd %d (type %T) has nil dentry: %#v", fd, file.Impl(), file)) } name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, file.VirtualDentry()) if err != nil { fmt.Fprintf(&buf, "\n", err) continue } fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, name) } return buf.String() } // NewFDs allocates new FDs guaranteed to be the lowest number available // greater than or equal to the minFD parameter. All files will share the set // flags. Success is guaranteed to be all or none. func (f *FDTable) NewFDs(ctx context.Context, minFD int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) { if minFD < 0 { // Don't accept negative FDs. return nil, unix.EINVAL } // Default limit. end := f.k.MaxFDLimit.Load() // Ensure we don't get past the provided limit. if limitSet := limits.FromContext(ctx); limitSet != nil { lim := limitSet.Get(limits.NumberOfFiles) // Only set if the limit is smaller than the max to avoid overflow. if lim.Cur != limits.Infinity && lim.Cur < uint64(end) { end = int32(lim.Cur) } } if minFD+int32(len(files)) > end { return nil, unix.EMFILE } f.mu.Lock() // max is used as the largest number in fdBitmap + 1. max := int32(0) if !f.fdBitmap.IsEmpty() { max = int32(f.fdBitmap.Maximum()) max++ } // Adjust max in case it is less than minFD. if max < minFD { max = minFD } // Install all entries. for len(fds) < len(files) { // Try to use free bit in fdBitmap. // If all bits in fdBitmap are used, expand fd to the max. fd, err := f.fdBitmap.FirstZero(uint32(minFD)) if err != nil { fd = uint32(max) max++ } if fd >= uint32(end) { break } f.fdBitmap.Add(fd) if df := f.set(int32(fd), files[len(fds)], flags); df != nil { panic("file set") } fds = append(fds, int32(fd)) minFD = int32(fd) } // Failure? Unwind existing FDs. if len(fds) < len(files) { for _, i := range fds { _ = f.set(i, nil, FDFlags{}) f.fdBitmap.Remove(uint32(i)) } f.mu.Unlock() // Drop the reference taken by the call to f.set() that // originally installed the file. Don't call f.drop() // (generating inotify events, etc.) since the file should // appear to have never been inserted into f. for _, file := range files[:len(fds)] { file.DecRef(ctx) } return nil, unix.EMFILE } f.mu.Unlock() return fds, nil } // NewFD allocates a file descriptor greater than or equal to minFD for // the given file description. If it succeeds, it takes a reference on file. func (f *FDTable) NewFD(ctx context.Context, minFD int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { files := []*vfs.FileDescription{file} fileSlice, error := f.NewFDs(ctx, minFD, files, flags) if error != nil { return -1, error } return fileSlice[0], nil } // NewFDAt sets the file reference for the given FD. 
If there is an existing // file description for that FD, it is returned. // // N.B. Callers are required to use DecRef on the returned file when they are done. // // Precondition: file != nil. func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) (*vfs.FileDescription, error) { if fd < 0 { // Don't accept negative FDs. return nil, unix.EBADF } if fd >= f.k.MaxFDLimit.Load() { return nil, unix.EMFILE } // Check the limit for the provided file. if limitSet := limits.FromContext(ctx); limitSet != nil { if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { return nil, unix.EMFILE } } // Install the entry. f.mu.Lock() df := f.set(fd, file, flags) // Add fd to fdBitmap. if df == nil { f.fdBitmap.Add(uint32(fd)) } f.mu.Unlock() if df != nil { f.fileUnlock(ctx, df) // Table's reference on df is transferred to caller, so don't DecRef. } return df, nil } // SetFlags sets the flags for the given file descriptor. // // True is returned iff flags were changed. func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error { if fd < 0 { // Don't accept negative FDs. return unix.EBADF } f.mu.Lock() defer f.mu.Unlock() file, _, _ := f.get(fd) if file == nil { // No file found. return unix.EBADF } // Update the flags. if df := f.set(fd, file, flags); df != nil { panic("file changed") } return nil } // SetFlagsForRange sets the flags for the given range of file descriptors // (inclusive: [startFd, endFd]). func (f *FDTable) SetFlagsForRange(ctx context.Context, startFd int32, endFd int32, flags FDFlags) error { if startFd < 0 || startFd > endFd { return unix.EBADF } f.mu.Lock() defer f.mu.Unlock() for fd, err := f.fdBitmap.FirstOne(uint32(startFd)); err == nil && fd <= uint32(endFd); fd, err = f.fdBitmap.FirstOne(fd + 1) { fdI32 := int32(fd) file, _, _ := f.get(fdI32) if df := f.set(fdI32, file, flags); df != nil { panic("file changed") } } return nil } // Get returns a reference to the file and the flags for the FD or nil if no // file is defined for the given fd. // // N.B. Callers are required to use DecRef when they are done. // //go:nosplit func (f *FDTable) Get(fd int32) (*vfs.FileDescription, FDFlags) { if fd < 0 { return nil, FDFlags{} } for { file, flags, _ := f.get(fd) if file != nil { if !file.TryIncRef() { continue // Race caught. } // Reference acquired. return file, flags } // No file available. return nil, FDFlags{} } } // GetFDs returns a sorted list of valid fds. // // Precondition: The caller must be running on the task goroutine, or Task.mu // must be locked. func (f *FDTable) GetFDs(ctx context.Context) []int32 { f.mu.Lock() defer f.mu.Unlock() fds := make([]int32, 0, int(f.fdBitmap.GetNumOnes())) f.ForEach(ctx, func(fd int32, _ *vfs.FileDescription, _ FDFlags) bool { fds = append(fds, fd) return true }) return fds } // Exists returns whether fd is defined in the table. It is inherently racy. // //go:nosplit func (f *FDTable) Exists(fd int32) bool { if fd < 0 { return false } file, _, _ := f.get(fd) return file != nil } // Fork returns an independent FDTable, cloning all FDs up to maxFds (non-inclusive). func (f *FDTable) Fork(ctx context.Context, maxFd int32) *FDTable { clone := f.k.NewFDTable() f.mu.Lock() defer f.mu.Unlock() f.forEachUpTo(ctx, maxFd, func(fd int32, file *vfs.FileDescription, flags FDFlags) bool { // The set function here will acquire an appropriate table // reference for the clone. We don't need anything else. 
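// Note: set returns whatever file description it displaces; a freshly allocated clone has no prior entry at fd, so a non-nil return below would indicate a bug, hence the panic.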
if df := clone.set(fd, file, flags); df != nil { panic("file set") } clone.fdBitmap.Add(uint32(fd)) return true }) return clone } // Remove removes an FD from f. It returns the removed file description. // // N.B. Callers are required to use DecRef on the returned file when they are done. func (f *FDTable) Remove(ctx context.Context, fd int32) *vfs.FileDescription { if fd < 0 { return nil } f.mu.Lock() df := f.set(fd, nil, FDFlags{}) // Zap entry. if df != nil { f.fdBitmap.Remove(uint32(fd)) } f.mu.Unlock() if df != nil { f.fileUnlock(ctx, df) // Table's reference on df is transferred to caller, so don't DecRef. } return df } // RemoveIf removes all FDs where cond is true. func (f *FDTable) RemoveIf(ctx context.Context, cond func(*vfs.FileDescription, FDFlags) bool) { var files []*vfs.FileDescription f.mu.Lock() f.ForEach(ctx, func(fd int32, file *vfs.FileDescription, flags FDFlags) bool { if cond(file, flags) { // Clear from table. if df := f.set(fd, nil, FDFlags{}); df != nil { f.fdBitmap.Remove(uint32(fd)) files = append(files, df) } } return true }) f.mu.Unlock() for _, file := range files { f.fileUnlock(ctx, file) file.DecRef(ctx) // Drop the table's reference. } } // RemoveNextInRange removes the next FD that falls within the given range, // and returns the FD number and FileDescription of the removed FD. // // N.B. Callers are required to use DecRef on the returned file when they are done. func (f *FDTable) RemoveNextInRange(ctx context.Context, startFd int32, endFd int32) (int32, *vfs.FileDescription) { if startFd < 0 || startFd > endFd { return MaxFdLimit, nil } f.mu.Lock() fdUint, err := f.fdBitmap.FirstOne(uint32(startFd)) fd := int32(fdUint) if err != nil || fd > endFd { f.mu.Unlock() return MaxFdLimit, nil } df := f.set(fd, nil, FDFlags{}) // Zap entry. if df != nil { f.fdBitmap.Remove(uint32(fd)) } f.mu.Unlock() if df != nil { f.fileUnlock(ctx, df) // Table's reference on df is transferred to caller, so don't DecRef. } return fd, df } // GetLastFd returns the last set FD in the FDTable bitmap. func (f *FDTable) GetLastFd() int32 { f.mu.Lock() defer f.mu.Unlock() last := f.fdBitmap.Maximum() if last > bitmap.MaxBitEntryLimit { return MaxFdLimit } return int32(last) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/fd_table_mutex.go000066400000000000000000000031631465435605700255440ustar00rootroot00000000000000package kernel import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type fdTableMutex struct { mu sync.Mutex } var fdTableprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var fdTablelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type fdTablelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *fdTableMutex) Lock() { locking.AddGLock(fdTableprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *fdTableMutex) NestedLock(i fdTablelockNameIndex) { locking.AddGLock(fdTableprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. 
// +checklocksignore func (m *fdTableMutex) Unlock() { locking.DelGLock(fdTableprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *fdTableMutex) NestedUnlock(i fdTablelockNameIndex) { locking.DelGLock(fdTableprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func fdTableinitLockNames() {} func init() { fdTableinitLockNames() fdTableprefixIndex = locking.NewMutexClass(reflect.TypeOf(fdTableMutex{}), fdTablelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/fd_table_refs.go000066400000000000000000000101331465435605700253340ustar00rootroot00000000000000package kernel import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const FDTableenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var FDTableobj *FDTable // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type FDTableRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *FDTableRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *FDTableRefs) RefType() string { return fmt.Sprintf("%T", FDTableobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *FDTableRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *FDTableRefs) LogRefs() bool { return FDTableenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *FDTableRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *FDTableRefs) IncRef() { v := r.refCount.Add(1) if FDTableenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. 
This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *FDTableRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if FDTableenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *FDTableRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if FDTableenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *FDTableRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/fd_table_unsafe.go000066400000000000000000000101431465435605700256570ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "math" "gvisor.dev/gvisor/pkg/bitmap" "gvisor.dev/gvisor/pkg/sentry/vfs" ) type descriptorBucket [fdsPerBucket]descriptorAtomicPtr type descriptorBucketSlice []descriptorBucketAtomicPtr // descriptorTable is a two level table. The first level is a slice of // *descriptorBucket where each bucket is a slice of *descriptor. // // All objects are updated atomically. type descriptorTable struct { // Changes to the slice itself requiring holding FDTable.mu. slice descriptorBucketSliceAtomicPtr `state:".(map[int32]*descriptor)"` } // initNoLeakCheck initializes the table without enabling leak checking. // // This is used when loading an FDTable after S/R, during which the ref count // object itself will enable leak checking if necessary. func (f *FDTable) initNoLeakCheck() { var slice descriptorBucketSlice // Empty slice. f.slice.Store(&slice) } // init initializes the table with leak checking. func (f *FDTable) init() { f.initNoLeakCheck() f.InitRefs() f.fdBitmap = bitmap.New(uint32(math.MaxUint16)) } const ( // fdsPerBucketShift is chosen in such a way that the size of bucket is // equal to one page. fdsPerBucketShift = 9 fdsPerBucket = 1 << fdsPerBucketShift fdsPerBucketMask = fdsPerBucket - 1 ) // get gets a file entry. // // The boolean indicates whether this was in range. 
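// Illustrative arithmetic (using the constants above): with fdsPerBucketShift == 9, fd 1000 resolves to bucket 1000 >> 9 == 1 and to slot 1000 & fdsPerBucketMask == 488 within that bucket.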
// //go:nosplit func (f *FDTable) get(fd int32) (*vfs.FileDescription, FDFlags, bool) { slice := *f.slice.Load() bucketN := fd >> fdsPerBucketShift if bucketN >= int32(len(slice)) { return nil, FDFlags{}, false } bucket := slice[bucketN].Load() if bucket == nil { return nil, FDFlags{}, false } d := bucket[fd&fdsPerBucketMask].Load() if d == nil { return nil, FDFlags{}, true } return d.file, d.flags, true } // CurrentMaxFDs returns the number of file descriptors that may be stored in f // without reallocation. func (f *FDTable) CurrentMaxFDs() int { slice := *f.slice.Load() return len(slice) * fdsPerBucket } // set sets the file description referred to by fd to file. If file is non-nil, // f takes a reference on it. If file is nil, the file entry at fd is cleared. // If set replaces an existing file description that is different from `file`, // it returns it with the FDTable's reference transferred to the caller, which // must call f.drop on the returned file after unlocking f.mu. // // Precondition: mu must be held. func (f *FDTable) set(fd int32, file *vfs.FileDescription, flags FDFlags) *vfs.FileDescription { slicePtr := f.slice.Load() bucketN := fd >> fdsPerBucketShift // Grow the table as required. if length := len(*slicePtr); int(bucketN) >= length { newLen := int(bucketN) + 1 if newLen < 2*length { // Ensure the table at least doubles in size without going over the limit. newLen = 2 * length if newLen > int(MaxFdLimit) { newLen = int(MaxFdLimit) } } newSlice := append(*slicePtr, make([]descriptorBucketAtomicPtr, newLen-length)...) slicePtr = &newSlice f.slice.Store(slicePtr) } slice := *slicePtr bucket := slice[bucketN].Load() if bucket == nil { bucket = &descriptorBucket{} slice[bucketN].Store(bucket) } var desc *descriptor if file != nil { desc = &descriptor{ file: file, flags: flags, } } // Update the single element. orig := bucket[fd%fdsPerBucket].Swap(desc) // Acquire a table reference. if desc != nil && desc.file != nil { if orig == nil || desc.file != orig.file { desc.file.IncRef() } } if orig != nil && orig.file != nil { if desc == nil || desc.file != orig.file { return orig.file } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/fs_context.go000066400000000000000000000106661465435605700247440ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // FSContext contains filesystem context. // // This includes umask and working directory. // // +stateify savable type FSContext struct { FSContextRefs // mu protects below. mu sync.Mutex `state:"nosave"` // root is the filesystem root. root vfs.VirtualDentry // cwd is the current working directory. cwd vfs.VirtualDentry // umask is the current file mode creation mask. When a thread using this // context invokes a syscall that creates a file, bits set in umask are // removed from the permissions that the file is created with. 
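// For example: with a umask of 0022, a request to create a file with mode 0666 results in mode 0666 &^ 0022 == 0644.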
umask uint } // NewFSContext returns a new filesystem context. func NewFSContext(root, cwd vfs.VirtualDentry, umask uint) *FSContext { root.IncRef() cwd.IncRef() f := FSContext{ root: root, cwd: cwd, umask: umask, } f.InitRefs() return &f } // DecRef implements RefCounter.DecRef. // // When f reaches zero references, DecRef will be called on both root and cwd // Dirents. // // Note that there may still be calls to WorkingDirectory() or RootDirectory() // (that return nil). This is because valid references may still be held via // proc files or other mechanisms. func (f *FSContext) DecRef(ctx context.Context) { f.FSContextRefs.DecRef(func() { // Hold f.mu so that we don't race with RootDirectory() and // WorkingDirectory(). f.mu.Lock() defer f.mu.Unlock() f.root.DecRef(ctx) f.root = vfs.VirtualDentry{} f.cwd.DecRef(ctx) f.cwd = vfs.VirtualDentry{} }) } // Fork forks this FSContext. // // This is not a valid call after f is destroyed. func (f *FSContext) Fork() *FSContext { f.mu.Lock() defer f.mu.Unlock() if !f.cwd.Ok() { panic("FSContext.Fork() called after destroy") } f.cwd.IncRef() f.root.IncRef() ctx := &FSContext{ cwd: f.cwd, root: f.root, umask: f.umask, } ctx.InitRefs() return ctx } // WorkingDirectory returns the current working directory. // // This will return an empty vfs.VirtualDentry if called after f is // destroyed, otherwise it will return a Dirent with a reference taken. func (f *FSContext) WorkingDirectory() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() if f.cwd.Ok() { f.cwd.IncRef() } return f.cwd } // SetWorkingDirectory sets the current working directory. // This will take an extra reference on the VirtualDentry. // // This is not a valid call after f is destroyed. func (f *FSContext) SetWorkingDirectory(ctx context.Context, d vfs.VirtualDentry) { f.mu.Lock() defer f.mu.Unlock() if !f.cwd.Ok() { panic(fmt.Sprintf("FSContext.SetWorkingDirectory(%v)) called after destroy", d)) } old := f.cwd f.cwd = d d.IncRef() old.DecRef(ctx) } // RootDirectory returns the current filesystem root. // // This will return an empty vfs.VirtualDentry if called after f is // destroyed, otherwise it will return a Dirent with a reference taken. func (f *FSContext) RootDirectory() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() if f.root.Ok() { f.root.IncRef() } return f.root } // SetRootDirectory sets the root directory. It takes a reference on vd. // // This is not a valid call after f is destroyed. func (f *FSContext) SetRootDirectory(ctx context.Context, vd vfs.VirtualDentry) { if !vd.Ok() { panic("FSContext.SetRootDirectory called with zero-value VirtualDentry") } f.mu.Lock() if !f.root.Ok() { f.mu.Unlock() panic(fmt.Sprintf("FSContext.SetRootDirectory(%v)) called after destroy", vd)) } old := f.root vd.IncRef() f.root = vd f.mu.Unlock() old.DecRef(ctx) } // Umask returns the current umask. func (f *FSContext) Umask() uint { f.mu.Lock() defer f.mu.Unlock() return f.umask } // SwapUmask atomically sets the current umask and returns the old umask. func (f *FSContext) SwapUmask(mask uint) uint { f.mu.Lock() defer f.mu.Unlock() old := f.umask f.umask = mask return old } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/fs_context_refs.go000066400000000000000000000101771465435605700257600ustar00rootroot00000000000000package kernel import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). 
This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const FSContextenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var FSContextobj *FSContext // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type FSContextRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *FSContextRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *FSContextRefs) RefType() string { return fmt.Sprintf("%T", FSContextobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *FSContextRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *FSContextRefs) LogRefs() bool { return FSContextenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *FSContextRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *FSContextRefs) IncRef() { v := r.refCount.Add(1) if FSContextenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *FSContextRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if FSContextenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. 
In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *FSContextRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if FSContextenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *FSContextRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/futex/000077500000000000000000000000001465435605700233635ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/futex/atomicptr_bucket_unsafe.go000066400000000000000000000023101465435605700306060ustar00rootroot00000000000000package futex import ( "context" "sync/atomic" "unsafe" ) // An AtomicPtr is a pointer to a value of type Value that can be atomically // loaded and stored. The zero value of an AtomicPtr represents nil. // // Note that copying AtomicPtr by value performs a non-atomic read of the // stored pointer, which is unsafe if Store() can be called concurrently; in // this case, do `dst.Store(src.Load())` instead. // // +stateify savable type AtomicPtrBucket struct { ptr unsafe.Pointer `state:".(*bucket)"` } func (p *AtomicPtrBucket) savePtr() *bucket { return p.Load() } func (p *AtomicPtrBucket) loadPtr(_ context.Context, v *bucket) { p.Store(v) } // Load returns the value set by the most recent Store. It returns nil if there // has been no previous call to Store. // //go:nosplit func (p *AtomicPtrBucket) Load() *bucket { return (*bucket)(atomic.LoadPointer(&p.ptr)) } // Store sets the value returned by Load to x. func (p *AtomicPtrBucket) Store(x *bucket) { atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) } // Swap atomically stores `x` into *p and returns the previous *p value. func (p *AtomicPtrBucket) Swap(x *bucket) *bucket { return (*bucket)(atomic.SwapPointer(&p.ptr, (unsafe.Pointer)(x))) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/futex/futex.go000066400000000000000000000557531465435605700250640ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package futex provides an implementation of the futex interface as found in // the Linux kernel. It allows one to easily transform Wait() calls into waits // on a channel, which is useful in a Go-based kernel, for example. package futex import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" ) // KeyKind indicates the type of a Key. type KeyKind int const ( // KindPrivate indicates a private futex (a futex syscall with the // FUTEX_PRIVATE_FLAG set). 
KindPrivate KeyKind = iota // KindSharedPrivate indicates a shared futex on a private memory mapping. // Although KindPrivate and KindSharedPrivate futexes both use memory // addresses to identify futexes, they do not interoperate (in Linux, the // two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key // comparison). KindSharedPrivate // KindSharedMappable indicates a shared futex on a memory mapping other // than a private anonymous memory mapping. KindSharedMappable ) // Key represents something that a futex waiter may wait on. type Key struct { // Kind is the type of the Key. Kind KeyKind // Mappable is the memory-mapped object that is represented by the Key. // Mappable is always nil if Kind is not KindSharedMappable, and may be nil // even if it is. Mappable memmap.Mappable // MappingIdentity is the MappingIdentity associated with Mappable. // MappingIdentity is always nil is Mappable is nil, and may be nil even if // it isn't. MappingIdentity memmap.MappingIdentity // If Kind is KindPrivate or KindSharedPrivate, Offset is the represented // memory address. Otherwise, Offset is the represented offset into // Mappable. Offset uint64 } func (k *Key) release(t Target) { if k.MappingIdentity != nil { k.MappingIdentity.DecRef(t) } k.Mappable = nil k.MappingIdentity = nil } func (k *Key) clone() Key { if k.MappingIdentity != nil { k.MappingIdentity.IncRef() } return *k } // Preconditions: k.Kind == KindPrivate or KindSharedPrivate. func (k *Key) addr() hostarch.Addr { return hostarch.Addr(k.Offset) } // matches returns true if a wakeup on k2 should wake a waiter waiting on k. func (k *Key) matches(k2 *Key) bool { // k.MappingIdentity is ignored; it's only used for reference counting. return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset } // Target abstracts memory accesses and keys. type Target interface { context.Context // SwapUint32 gives access to hostarch.IO.SwapUint32. SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) // CompareAndSwap gives access to hostarch.IO.CompareAndSwapUint32. CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) // LoadUint32 gives access to hostarch.IO.LoadUint32. LoadUint32(addr hostarch.Addr) (uint32, error) // GetSharedKey returns a Key with kind KindSharedPrivate or // KindSharedMappable corresponding to the memory mapped at address addr. // // If GetSharedKey returns a Key with a non-nil MappingIdentity, a // reference is held on the MappingIdentity, which must be dropped by the // caller when the Key is no longer in use. GetSharedKey(addr hostarch.Addr) (Key, error) } // check performs a basic equality check on the given address. func check(t Target, addr hostarch.Addr, val uint32) error { cur, err := t.LoadUint32(addr) if err != nil { return err } if cur != val { return linuxerr.EAGAIN } return nil } // atomicOp performs a complex operation on the given address. func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) { opType := (opIn >> 28) & 0xf cmp := (opIn >> 24) & 0xf opArg := (opIn >> 12) & 0xfff cmpArg := opIn & 0xfff if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 { opArg = 1 << opArg opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag. 
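// For example, an operand encoded with FUTEX_OP_OPARG_SHIFT and opArg == 4 yields an effective argument of 1 << 4 == 16 once the flag is cleared.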
} var ( oldVal uint32 err error ) if opType == linux.FUTEX_OP_SET { oldVal, err = t.SwapUint32(addr, opArg) if err != nil { return false, err } } else { for { oldVal, err = t.LoadUint32(addr) if err != nil { return false, err } var newVal uint32 switch opType { case linux.FUTEX_OP_ADD: newVal = oldVal + opArg case linux.FUTEX_OP_OR: newVal = oldVal | opArg case linux.FUTEX_OP_ANDN: newVal = oldVal &^ opArg case linux.FUTEX_OP_XOR: newVal = oldVal ^ opArg default: return false, linuxerr.ENOSYS } prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal) if err != nil { return false, err } if prev == oldVal { break // Success. } } } switch cmp { case linux.FUTEX_OP_CMP_EQ: return oldVal == cmpArg, nil case linux.FUTEX_OP_CMP_NE: return oldVal != cmpArg, nil case linux.FUTEX_OP_CMP_LT: return oldVal < cmpArg, nil case linux.FUTEX_OP_CMP_LE: return oldVal <= cmpArg, nil case linux.FUTEX_OP_CMP_GT: return oldVal > cmpArg, nil case linux.FUTEX_OP_CMP_GE: return oldVal >= cmpArg, nil default: return false, linuxerr.ENOSYS } } // Waiter is the struct which gets enqueued into buckets for wake up routines // and requeue routines to scan and notify. Once a Waiter has been enqueued by // WaitPrepare(), callers may listen on C for wake up events. type Waiter struct { // Synchronization: // // - A Waiter that is not enqueued in a bucket is exclusively owned (no // synchronization applies). // // - A Waiter is enqueued in a bucket by calling WaitPrepare(). After this, // waiterEntry, bucket, and key are protected by the bucket.mu ("bucket // lock") of the containing bucket, and bitmask is immutable. Note that // since bucket is mutated using atomic memory operations, bucket.Load() // may be called without holding the bucket lock, although it may change // racily. See WaitComplete(). // // - A Waiter is only guaranteed to be no longer queued after calling // WaitComplete(). // waiterEntry links Waiter into bucket.waiters. waiterEntry // bucket is the bucket this waiter is queued in. If bucket is nil, the // waiter is not waiting and is not in any bucket. bucket AtomicPtrBucket // C is sent to when the Waiter is woken. C chan struct{} // key is what this waiter is waiting on. key Key // The bitmask we're waiting on. // This is used the case of a FUTEX_WAKE_BITSET. bitmask uint32 // tid is the thread ID for the waiter in case this is a PI mutex. tid uint32 } // NewWaiter returns a new unqueued Waiter. func NewWaiter() *Waiter { return &Waiter{ C: make(chan struct{}, 1), } } // woken returns true if w has been woken since the last call to WaitPrepare. func (w *Waiter) woken() bool { return len(w.C) != 0 } // bucket holds a list of waiters for a given address hash. // // +stateify savable type bucket struct { // mu protects waiters and contained Waiter state. See comment in Waiter. mu futexBucketMutex `state:"nosave"` waiters waiterList `state:"zerovalue"` } // wakeLocked wakes up to n waiters matching the bitmask at the addr for this // bucket and returns the number of waiters woken. // // Preconditions: b.mu must be locked. func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int { done := 0 for w := b.waiters.Front(); done < n && w != nil; { if !w.key.matches(key) || w.bitmask&bitmask == 0 { // Not matching. w = w.Next() continue } // Remove from the bucket and wake the waiter. woke := w w = w.Next() // Next iteration. b.wakeWaiterLocked(woke) done++ } return done } func (b *bucket) wakeWaiterLocked(w *Waiter) { // Remove from the bucket and wake the waiter. 
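// The send below cannot block: w.C is created with capacity 1 in NewWaiter, a woken waiter is removed from the bucket before the send, and WaitPrepare drains the channel before the waiter is re-queued.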
b.waiters.Remove(w) w.C <- struct{}{} // NOTE: The above channel write establishes a write barrier according // to the memory model, so nothing may be ordered around it. Since // we've dequeued w and will never touch it again, we can safely // store nil to w.bucket here and allow the WaitComplete() to // short-circuit grabbing the bucket lock. If they somehow miss the // store, we are still holding the lock, so we can know that they won't // dequeue w, assume it's free and have the below operation // afterwards. w.bucket.Store(nil) } // requeueLocked takes n waiters from the bucket and moves them to naddr on the // bucket "to". // // Preconditions: b and to must be locked. func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int { done := 0 for w := b.waiters.Front(); done < n && w != nil; { if !w.key.matches(key) { // Not matching. w = w.Next() continue } requeued := w w = w.Next() // Next iteration. b.waiters.Remove(requeued) requeued.key.release(t) requeued.key = nkey.clone() to.waiters.PushBack(requeued) requeued.bucket.Store(to) done++ } return done } const ( // bucketCount is the number of buckets per Manager. By having many of // these we reduce contention when concurrent yet unrelated calls are made. bucketCount = 1 << bucketCountBits bucketCountBits = 10 ) // getKey returns a Key representing address addr in c. func getKey(t Target, addr hostarch.Addr, private bool) (Key, error) { // Ensure the address is aligned. // It must be a DWORD boundary. if addr&0x3 != 0 { return Key{}, linuxerr.EINVAL } if private { return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil } return t.GetSharedKey(addr) } // bucketIndexForAddr returns the index into Manager.buckets for addr. func bucketIndexForAddr(addr hostarch.Addr) uintptr { // - The bottom 2 bits of addr must be 0, per getKey. // // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47 // for a canonical address, and (on all existing platforms) bit 47 must be // 0 for an application address. // // Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful" // bits. We choose one of the simplest possible hash functions that at // least uses all 45 useful bits in the output, given that bucketCountBits // == 10. This hash function also has the property that it will usually map // adjacent addresses to adjacent buckets, slightly improving memory // locality when an application synchronization structure uses multiple // nearby futexes. // // Note that despite the large number of arithmetic operations in the // function, many components can be computed in parallel, such that the // critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). This // is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... + // (addr >> 42)" without any additional grouping, the compiler puts all 4 // additions in the critical path. h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22) h2 := uintptr(addr>>32) + uintptr(addr>>42) return (h1 + h2) % bucketCount } // Manager holds futex state for a single virtual address space. // // +stateify savable type Manager struct { // privateBuckets holds buckets for KindPrivate and KindSharedPrivate // futexes. privateBuckets [bucketCount]bucket `state:"zerovalue"` // sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket // may be shared by multiple Managers. The sharedBucket pointer is // immutable. sharedBucket *bucket } // NewManager returns an initialized futex manager. 
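// A Manager covers a single virtual address space; Fork (below) creates a Manager for a new address space whose shared futexes still interoperate with this one through the common sharedBucket.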
func NewManager() *Manager { return &Manager{ sharedBucket: &bucket{}, } } // Fork returns a new Manager. Shared futex clients using the returned Manager // may interoperate with those using m. func (m *Manager) Fork() *Manager { return &Manager{ sharedBucket: m.sharedBucket, } } // lockBucket returns a locked bucket for the given key. // +checklocksacquire:b.mu func (m *Manager) lockBucket(k *Key) (b *bucket) { if k.Kind == KindSharedMappable { b = m.sharedBucket } else { b = &m.privateBuckets[bucketIndexForAddr(k.addr())] } b.mu.Lock() return b } // lockBuckets returns locked buckets for the given keys. // It returns which bucket was locked first and second. They may be nil in case the buckets are // identical or they did not need locking. // // +checklocksacquire:lockedFirst.mu // +checklocksacquire:lockedSecond.mu func (m *Manager) lockBuckets(k1, k2 *Key) (b1, b2, lockedFirst, lockedSecond *bucket) { // Buckets must be consistently ordered to avoid circular lock // dependencies. We order buckets in m.privateBuckets by index (lowest // index first), and all buckets in m.privateBuckets precede // m.sharedBucket. // Handle the common case first: if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable { i1 := bucketIndexForAddr(k1.addr()) i2 := bucketIndexForAddr(k2.addr()) b1 = &m.privateBuckets[i1] b2 = &m.privateBuckets[i2] switch { case i1 < i2: b1.mu.Lock() b2.mu.NestedLock(futexBucketLockB) return b1, b2, b1, b2 case i2 < i1: b2.mu.Lock() b1.mu.NestedLock(futexBucketLockB) return b1, b2, b2, b1 default: b1.mu.Lock() return b1, b2, b1, nil // +checklocksforce } } // At least one of b1 or b2 should be m.sharedBucket. b1 = m.sharedBucket b2 = m.sharedBucket if k1.Kind != KindSharedMappable { b1 = m.lockBucket(k1) b2.mu.NestedLock(futexBucketLockB) return b1, b2, b1, b2 } if k2.Kind != KindSharedMappable { b2 = m.lockBucket(k2) b1.mu.NestedLock(futexBucketLockB) return b1, b2, b2, b1 } b1.mu.Lock() return b1, b2, b1, nil // +checklocksforce } // unlockBuckets unlocks two buckets. // +checklocksrelease:lockedFirst.mu // +checklocksrelease:lockedSecond.mu func (m *Manager) unlockBuckets(lockedFirst, lockedSecond *bucket) { if lockedSecond != nil { lockedSecond.mu.NestedUnlock(futexBucketLockB) } lockedFirst.mu.Unlock() } // Wake wakes up to n waiters matching the bitmask on the given addr. // The number of waiters woken is returned. func (m *Manager) Wake(t Target, addr hostarch.Addr, private bool, bitmask uint32, n int) (int, error) { // This function is very hot; avoid defer. k, err := getKey(t, addr, private) if err != nil { return 0, err } b := m.lockBucket(&k) r := b.wakeLocked(&k, bitmask, n) b.mu.Unlock() k.release(t) return r, nil } func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) { k1, err := getKey(t, addr, private) if err != nil { return 0, err } defer k1.release(t) k2, err := getKey(t, naddr, private) if err != nil { return 0, err } defer k2.release(t) b1, b2, lockedFirst, lockedSecond := m.lockBuckets(&k1, &k2) defer m.unlockBuckets(lockedFirst, lockedSecond) if checkval { if err := check(t, addr, val); err != nil { return 0, err } } // Wake the number required. done := b1.wakeLocked(&k1, ^uint32(0), nwake) // Requeue the number required. b1.requeueLocked(t, b2, &k1, &k2, nreq) return done, nil } // Requeue wakes up to nwake waiters on the given addr, and unconditionally // requeues up to nreq waiters on naddr. 
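// Illustrative call (hypothetical values): Requeue(t, a, b, true, 1, 10) wakes at most one waiter on address a and moves up to 10 of the remaining waiters to address b, without checking the futex value first.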
func (m *Manager) Requeue(t Target, addr, naddr hostarch.Addr, private bool, nwake int, nreq int) (int, error) { return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq) } // RequeueCmp atomically checks that the addr contains val (via the Target), // wakes up to nwake waiters on addr and then unconditionally requeues nreq // waiters on naddr. func (m *Manager) RequeueCmp(t Target, addr, naddr hostarch.Addr, private bool, val uint32, nwake int, nreq int) (int, error) { return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq) } // WakeOp atomically applies op to the memory address addr2, wakes up to nwake1 // waiters unconditionally from addr1, and, based on the original value at addr2 // and a comparison encoded in op, wakes up to nwake2 waiters from addr2. // It returns the total number of waiters woken. func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) { k1, err := getKey(t, addr1, private) if err != nil { return 0, err } defer k1.release(t) k2, err := getKey(t, addr2, private) if err != nil { return 0, err } defer k2.release(t) b1, b2, lockedFirst, lockedSecond := m.lockBuckets(&k1, &k2) defer m.unlockBuckets(lockedFirst, lockedSecond) done := 0 cond, err := atomicOp(t, addr2, op) if err != nil { return 0, err } // Wake up up to nwake1 entries from the first bucket. done = b1.wakeLocked(&k1, ^uint32(0), nwake1) // Wake up up to nwake2 entries from the second bucket if the // operation yielded true. if cond { done += b2.wakeLocked(&k2, ^uint32(0), nwake2) } return done, nil } // WaitPrepare atomically checks that addr contains val (via the Checker), then // enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the // Waiter must be subsequently removed by calling WaitComplete, whether or not // a wakeup is received on w.C. func (m *Manager) WaitPrepare(w *Waiter, t Target, addr hostarch.Addr, private bool, val uint32, bitmask uint32) error { k, err := getKey(t, addr, private) if err != nil { return err } // Ownership of k is transferred to w below. // Prepare the Waiter before taking the bucket lock. select { case <-w.C: default: } w.key = k w.bitmask = bitmask b := m.lockBucket(&k) // This function is very hot; avoid defer. // Perform our atomic check. if err := check(t, addr, val); err != nil { b.mu.Unlock() w.key.release(t) return err } // Add the waiter to the bucket. b.waiters.PushBack(w) w.bucket.Store(b) b.mu.Unlock() return nil } // WaitComplete must be called when a Waiter previously added by WaitPrepare is // no longer eligible to be woken. func (m *Manager) WaitComplete(w *Waiter, t Target) { // Remove w from the bucket it's in. for { b := w.bucket.Load() // If b is nil, the waiter isn't in any bucket anymore. This can't be // racy because the waiter can't be concurrently re-queued in another // bucket. if b == nil { break } // Take the bucket lock. Note that without holding the bucket lock, the // waiter is not guaranteed to stay in that bucket, so after we take // the bucket lock, we must ensure that the bucket hasn't changed: if // it happens to have changed, we release the old bucket lock and try // again with the new bucket; if it hasn't changed, we know it won't // change now because we hold the lock. b.mu.Lock() if b != w.bucket.Load() { b.mu.Unlock() continue } // Remove waiter from bucket. b.waiters.Remove(w) w.bucket.Store(nil) b.mu.Unlock() break } // Release references held by the waiter. 
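// For a shared futex this also drops the reference on the Key's MappingIdentity that GetSharedKey took when the key was created.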
w.key.release(t) } // LockPI attempts to lock the futex following the Priority-inheritance futex // rules. The lock is acquired only when 'addr' points to 0. The TID of the // calling task is set to 'addr' to indicate the futex is owned. It returns true // if the futex was successfully acquired. // // FUTEX_OWNER_DIED is only set by the Linux when robust lists are in use (see // exit_robust_list()). Given we don't support robust lists, although handled // below, it's never set. func (m *Manager) LockPI(w *Waiter, t Target, addr hostarch.Addr, tid uint32, private, try bool) (bool, error) { k, err := getKey(t, addr, private) if err != nil { return false, err } // Ownership of k is transferred to w below. // Prepare the Waiter before taking the bucket lock. select { case <-w.C: default: } w.key = k w.tid = tid b := m.lockBucket(&k) // Hot function: avoid defers. success, err := m.lockPILocked(w, t, addr, tid, b, try) if err != nil { w.key.release(t) b.mu.Unlock() return false, err } if success || try { // Release waiter if it's not going to be a wait. w.key.release(t) } b.mu.Unlock() return success, nil } func (m *Manager) lockPILocked(w *Waiter, t Target, addr hostarch.Addr, tid uint32, b *bucket, try bool) (bool, error) { for { cur, err := t.LoadUint32(addr) if err != nil { return false, err } if (cur & linux.FUTEX_TID_MASK) == tid { return false, linuxerr.EDEADLK } if (cur & linux.FUTEX_TID_MASK) == 0 { // No owner and no waiters, try to acquire the futex. // Set TID and preserve owner died status. val := tid val |= cur & linux.FUTEX_OWNER_DIED prev, err := t.CompareAndSwapUint32(addr, cur, val) if err != nil { return false, err } if prev != cur { // CAS failed, retry... // Linux reacquires the bucket lock on retries, which will re-lookup the // mapping at the futex address. However, retrying while holding the // lock is more efficient and reduces the chance of another conflict. continue } // Futex acquired. return true, nil } // Futex is already owned, prepare to wait. if try { // Caller doesn't want to wait. return false, nil } // Set waiters bit if not set yet. if cur&linux.FUTEX_WAITERS == 0 { prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS) if err != nil { return false, err } if prev != cur { // CAS failed, retry... continue } } // Add the waiter to the bucket. b.waiters.PushBack(w) w.bucket.Store(b) return false, nil } } // UnlockPI unlocks the futex following the Priority-inheritance futex rules. // The address provided must contain the caller's TID. If there are waiters, // TID of the next waiter (FIFO) is set to the given address, and the waiter // woken up. If there are no waiters, 0 is set to the address. func (m *Manager) UnlockPI(t Target, addr hostarch.Addr, tid uint32, private bool) error { k, err := getKey(t, addr, private) if err != nil { return err } b := m.lockBucket(&k) err = m.unlockPILocked(t, addr, tid, b, &k) k.release(t) b.mu.Unlock() return err } func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bucket, key *Key) error { cur, err := t.LoadUint32(addr) if err != nil { return err } if (cur & linux.FUTEX_TID_MASK) != tid { return linuxerr.EPERM } var next *Waiter // Who's the next owner? var next2 *Waiter // Who's the one after that? 
for w := b.waiters.Front(); w != nil; w = w.Next() { if !w.key.matches(key) { continue } if next == nil { next = w } else { next2 = w break } } if next == nil { // It's safe to set 0 because there are no waiters, no new owner, and the // executing task is the current owner (no owner died bit). prev, err := t.CompareAndSwapUint32(addr, cur, 0) if err != nil { return err } if prev != cur { // Let user mode handle CAS races. This is different than lock, which // retries when CAS fails. return linuxerr.EAGAIN } return nil } // Set next owner's TID, waiters if there are any. Resets owner died bit, if // set, because the executing task takes over as the owner. val := next.tid if next2 != nil { val |= linux.FUTEX_WAITERS } prev, err := t.CompareAndSwapUint32(addr, cur, val) if err != nil { return err } if prev != cur { return linuxerr.EINVAL } b.wakeWaiterLocked(next) return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/futex/futex_mutex.go000066400000000000000000000034011465435605700262650ustar00rootroot00000000000000package futex import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type futexBucketMutex struct { mu sync.Mutex } var futexBucketprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var futexBucketlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type futexBucketlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. const ( futexBucketLockB = futexBucketlockNameIndex(0) ) const () // Lock locks m. // +checklocksignore func (m *futexBucketMutex) Lock() { locking.AddGLock(futexBucketprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *futexBucketMutex) NestedLock(i futexBucketlockNameIndex) { locking.AddGLock(futexBucketprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *futexBucketMutex) Unlock() { locking.DelGLock(futexBucketprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *futexBucketMutex) NestedUnlock(i futexBucketlockNameIndex) { locking.DelGLock(futexBucketprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func futexBucketinitLockNames() { futexBucketlockNames = []string{"b"} } func init() { futexBucketinitLockNames() futexBucketprefixIndex = locking.NewMutexClass(reflect.TypeOf(futexBucketMutex{}), futexBucketlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/futex/futex_state_autogen.go000066400000000000000000000052441465435605700277740ustar00rootroot00000000000000// automatically generated by stateify. 
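// lockPILocked and unlockPILocked above manipulate the 32-bit futex word
// using the Linux PI convention: the low 30 bits hold the owner's TID, bit 30
// is FUTEX_OWNER_DIED, and bit 31 is FUTEX_WAITERS. The sketch below shows a
// userspace-style fast-path acquire under that convention; it is illustrative
// only (tryLockPI is an invented name, and the fallback to the FUTEX_LOCK_PI
// syscall on contention is omitted).

package main

import (
	"fmt"
	"sync/atomic"
)

// Bit layout of a PI futex word, mirroring the Linux uapi constants used by
// the kernel-side code above.
const (
	futexTIDMask   = 0x3fffffff // TID of the current owner
	futexOwnerDied = 0x40000000 // previous owner exited without unlocking
	futexWaiters   = 0x80000000 // at least one waiter is queued in the kernel
)

// tryLockPI attempts the userspace fast path: if no owner is recorded, CAS
// our TID into the word while preserving the owner-died bit. This is a sketch
// of the word format only, not the gVisor or Linux implementation.
func tryLockPI(word *uint32, tid uint32) bool {
	cur := atomic.LoadUint32(word)
	if cur&futexTIDMask != 0 {
		return false // already owned; a real caller would fall back to the syscall
	}
	return atomic.CompareAndSwapUint32(word, cur, tid|(cur&futexOwnerDied))
}

func main() {
	var word uint32
	fmt.Println(tryLockPI(&word, 42)) // true
	fmt.Println(tryLockPI(&word, 43)) // false: owned by TID 42
}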
package futex import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (b *bucket) StateTypeName() string { return "pkg/sentry/kernel/futex.bucket" } func (b *bucket) StateFields() []string { return []string{} } func (b *bucket) beforeSave() {} // +checklocksignore func (b *bucket) StateSave(stateSinkObject state.Sink) { b.beforeSave() if !state.IsZeroValue(&b.waiters) { state.Failf("waiters is %#v, expected zero", &b.waiters) } } func (b *bucket) afterLoad(context.Context) {} // +checklocksignore func (b *bucket) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (m *Manager) StateTypeName() string { return "pkg/sentry/kernel/futex.Manager" } func (m *Manager) StateFields() []string { return []string{ "sharedBucket", } } func (m *Manager) beforeSave() {} // +checklocksignore func (m *Manager) StateSave(stateSinkObject state.Sink) { m.beforeSave() if !state.IsZeroValue(&m.privateBuckets) { state.Failf("privateBuckets is %#v, expected zero", &m.privateBuckets) } stateSinkObject.Save(0, &m.sharedBucket) } func (m *Manager) afterLoad(context.Context) {} // +checklocksignore func (m *Manager) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.sharedBucket) } func (l *waiterList) StateTypeName() string { return "pkg/sentry/kernel/futex.waiterList" } func (l *waiterList) StateFields() []string { return []string{ "head", "tail", } } func (l *waiterList) beforeSave() {} // +checklocksignore func (l *waiterList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *waiterList) afterLoad(context.Context) {} // +checklocksignore func (l *waiterList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *waiterEntry) StateTypeName() string { return "pkg/sentry/kernel/futex.waiterEntry" } func (e *waiterEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *waiterEntry) beforeSave() {} // +checklocksignore func (e *waiterEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *waiterEntry) afterLoad(context.Context) {} // +checklocksignore func (e *waiterEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func init() { state.Register((*bucket)(nil)) state.Register((*Manager)(nil)) state.Register((*waiterList)(nil)) state.Register((*waiterEntry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/futex/futex_unsafe_state_autogen.go000066400000000000000000000015261465435605700313340ustar00rootroot00000000000000// automatically generated by stateify. 
package futex import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *AtomicPtrBucket) StateTypeName() string { return "pkg/sentry/kernel/futex.AtomicPtrBucket" } func (p *AtomicPtrBucket) StateFields() []string { return []string{ "ptr", } } func (p *AtomicPtrBucket) beforeSave() {} // +checklocksignore func (p *AtomicPtrBucket) StateSave(stateSinkObject state.Sink) { p.beforeSave() var ptrValue *bucket ptrValue = p.savePtr() stateSinkObject.SaveValue(0, ptrValue) } func (p *AtomicPtrBucket) afterLoad(context.Context) {} // +checklocksignore func (p *AtomicPtrBucket) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new(*bucket), func(y any) { p.loadPtr(ctx, y.(*bucket)) }) } func init() { state.Register((*AtomicPtrBucket)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/futex/waiter_list.go000066400000000000000000000120031465435605700262340ustar00rootroot00000000000000package futex // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type waiterElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (waiterElementMapper) linkerFor(elem *Waiter) *Waiter { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type waiterList struct { head *Waiter tail *Waiter } // Reset resets list l to the empty state. func (l *waiterList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *waiterList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *waiterList) Front() *Waiter { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *waiterList) Back() *Waiter { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *waiterList) Len() (count int) { for e := l.Front(); e != nil; e = (waiterElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *waiterList) PushFront(e *Waiter) { linker := waiterElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { waiterElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *waiterList) PushFrontList(m *waiterList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { waiterElementMapper{}.linkerFor(l.head).SetPrev(m.tail) waiterElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. 
// //go:nosplit func (l *waiterList) PushBack(e *Waiter) { linker := waiterElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { waiterElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *waiterList) PushBackList(m *waiterList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { waiterElementMapper{}.linkerFor(l.tail).SetNext(m.head) waiterElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *waiterList) InsertAfter(b, e *Waiter) { bLinker := waiterElementMapper{}.linkerFor(b) eLinker := waiterElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { waiterElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *waiterList) InsertBefore(a, e *Waiter) { aLinker := waiterElementMapper{}.linkerFor(a) eLinker := waiterElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { waiterElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *waiterList) Remove(e *Waiter) { linker := waiterElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { waiterElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { waiterElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type waiterEntry struct { next *Waiter prev *Waiter } // Next returns the entry that follows e in the list. // //go:nosplit func (e *waiterEntry) Next() *Waiter { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *waiterEntry) Prev() *Waiter { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *waiterEntry) SetNext(elem *Waiter) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *waiterEntry) SetPrev(elem *Waiter) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/ipc/000077500000000000000000000000001465435605700230035ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/ipc/ipc_state_autogen.go000066400000000000000000000041341465435605700270310ustar00rootroot00000000000000// automatically generated by stateify. 
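// waiterList above is an intrusive list: the prev/next links are embedded in
// the Waiter element itself, so queueing a waiter never allocates and removal
// is O(1) given only the element. The reduced sketch below shows the same
// structure with an invented node type; it is not the generated gVisor list.

package main

import "fmt"

// node plays the role of futex.Waiter: the links live inside the element.
type node struct {
	value      int
	next, prev *node
}

// list mirrors waiterList: it only stores head and tail pointers.
type list struct{ head, tail *node }

func (l *list) pushBack(n *node) {
	n.prev, n.next = l.tail, nil
	if l.tail != nil {
		l.tail.next = n
	} else {
		l.head = n
	}
	l.tail = n
}

// remove assumes n is currently linked into l.
func (l *list) remove(n *node) {
	if n.prev != nil {
		n.prev.next = n.next
	} else {
		l.head = n.next
	}
	if n.next != nil {
		n.next.prev = n.prev
	} else {
		l.tail = n.prev
	}
	n.next, n.prev = nil, nil
}

func main() {
	var l list
	a, b := &node{value: 1}, &node{value: 2}
	l.pushBack(a)
	l.pushBack(b)
	l.remove(a)
	for n := l.head; n != nil; n = n.next {
		fmt.Println(n.value) // prints 2
	}
}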
package ipc import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (o *Object) StateTypeName() string { return "pkg/sentry/kernel/ipc.Object" } func (o *Object) StateFields() []string { return []string{ "UserNS", "ID", "Key", "CreatorUID", "CreatorGID", "OwnerUID", "OwnerGID", "Mode", } } func (o *Object) beforeSave() {} // +checklocksignore func (o *Object) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.UserNS) stateSinkObject.Save(1, &o.ID) stateSinkObject.Save(2, &o.Key) stateSinkObject.Save(3, &o.CreatorUID) stateSinkObject.Save(4, &o.CreatorGID) stateSinkObject.Save(5, &o.OwnerUID) stateSinkObject.Save(6, &o.OwnerGID) stateSinkObject.Save(7, &o.Mode) } func (o *Object) afterLoad(context.Context) {} // +checklocksignore func (o *Object) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.UserNS) stateSourceObject.Load(1, &o.ID) stateSourceObject.Load(2, &o.Key) stateSourceObject.Load(3, &o.CreatorUID) stateSourceObject.Load(4, &o.CreatorGID) stateSourceObject.Load(5, &o.OwnerUID) stateSourceObject.Load(6, &o.OwnerGID) stateSourceObject.Load(7, &o.Mode) } func (r *Registry) StateTypeName() string { return "pkg/sentry/kernel/ipc.Registry" } func (r *Registry) StateFields() []string { return []string{ "UserNS", "objects", "keysToIDs", "lastIDUsed", } } func (r *Registry) beforeSave() {} // +checklocksignore func (r *Registry) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.UserNS) stateSinkObject.Save(1, &r.objects) stateSinkObject.Save(2, &r.keysToIDs) stateSinkObject.Save(3, &r.lastIDUsed) } func (r *Registry) afterLoad(context.Context) {} // +checklocksignore func (r *Registry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.UserNS) stateSourceObject.Load(1, &r.objects) stateSourceObject.Load(2, &r.keysToIDs) stateSourceObject.Load(3, &r.lastIDUsed) } func init() { state.Register((*Object)(nil)) state.Register((*Registry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/ipc/ns.go000066400000000000000000000015711465435605700237560ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ipc type contextID int // CtxIPCNamespace is the context.Value key used to retrieve an IPC namespace. // We define it here because it's needed in several packages, and is not // possible to use otherwise without causing a circular dependency. const CtxIPCNamespace contextID = iota golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/ipc/object.go000066400000000000000000000121651465435605700246050ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package ipc defines functionality and utilities common to sysvipc mechanisms. // // Lock ordering: [shm/semaphore/msgqueue].Registry.mu -> Mechanism package ipc import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Key is a user-provided identifier for IPC objects. type Key int32 // ID is a kernel identifier for IPC objects. type ID int32 // Object represents an abstract IPC object with fields common to all IPC // mechanisms. // // +stateify savable type Object struct { // User namespace which owns the IPC namespace which owns the IPC object. // Immutable. UserNS *auth.UserNamespace // ID is a kernel identifier for the IPC object. Immutable. ID ID // Key is a user-provided identifier for the IPC object. Immutable. Key Key // CreatorUID is the UID of user who created the IPC object. Immutable. CreatorUID auth.KUID // CreatorGID is the GID of user who created the IPC object. Immutable. CreatorGID auth.KGID // OwnerUID is the UID of the current owner of the IPC object. Immutable. OwnerUID auth.KUID // OwnerGID is the GID of the current owner of the IPC object. Immutable. OwnerGID auth.KGID // Mode is the access permissions the IPC object. Mode linux.FileMode } // Mechanism represents a SysV mechanism that holds an IPC object. It can also // be looked at as a container for an ipc.Object, which is by definition a fully // functional SysV object. type Mechanism interface { // Lock behaves the same as Mutex.Lock on the mechanism. Lock() // Unlock behaves the same as Mutex.Unlock on the mechanism. Unlock() // Object returns a pointer to the mechanism's ipc.Object. Mechanism.Lock, // and Mechanism.Unlock should be used when the object is used. Object() *Object // Destroy destroys the mechanism. Destroy() } // NewObject returns a new, initialized ipc.Object. The newly returned object // doesn't have a valid ID. When the object is registered, the registry assigns // it a new unique ID. func NewObject(un *auth.UserNamespace, key Key, creator, owner *auth.Credentials, mode linux.FileMode) *Object { return &Object{ UserNS: un, Key: key, CreatorUID: creator.EffectiveKUID, CreatorGID: creator.EffectiveKGID, OwnerUID: owner.EffectiveKUID, OwnerGID: owner.EffectiveKGID, Mode: mode, } } // CheckOwnership verifies whether an IPC object may be accessed using creds as // an owner. See ipc/util.c:ipcctl_obtain_check() in Linux. func (o *Object) CheckOwnership(creds *auth.Credentials) bool { if o.OwnerUID == creds.EffectiveKUID || o.CreatorUID == creds.EffectiveKUID { return true } // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented // for use to "override IPC ownership checks". return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, o.UserNS) } // CheckPermissions verifies whether an IPC object is accessible using creds for // access described by req. See ipc/util.c:ipcperms() in Linux. 
func (o *Object) CheckPermissions(creds *auth.Credentials, req vfs.AccessTypes) bool { perms := uint16(o.Mode.Permissions()) if o.OwnerUID == creds.EffectiveKUID { perms >>= 6 } else if creds.InGroup(o.OwnerGID) { perms >>= 3 } if uint16(req)&perms == uint16(req) { return true } return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, o.UserNS) } // Set modifies attributes for an IPC object. See *ctl(IPC_SET). // // Precondition: Mechanism.mu must be held. func (o *Object) Set(ctx context.Context, perm *linux.IPCPerm) error { creds := auth.CredentialsFromContext(ctx) uid := creds.UserNamespace.MapToKUID(auth.UID(perm.UID)) gid := creds.UserNamespace.MapToKGID(auth.GID(perm.GID)) if !uid.Ok() || !gid.Ok() { // The man pages don't specify an errno for invalid uid/gid, but EINVAL // is generally used for invalid arguments. return linuxerr.EINVAL } if !o.CheckOwnership(creds) { // "The argument cmd has the value IPC_SET or IPC_RMID, but the // effective user ID of the calling process is not the creator (as // found in msg_perm.cuid) or the owner (as found in msg_perm.uid) // of the message queue, and the caller is not privileged (Linux: // does not have the CAP_SYS_ADMIN capability)." return linuxerr.EPERM } // User may only modify the lower 9 bits of the mode. All the other bits are // always 0 for the underlying inode. mode := linux.FileMode(perm.Mode & 0x1ff) o.Mode = mode o.OwnerUID = uid o.OwnerGID = gid return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/ipc/registry.go000066400000000000000000000132451465435605700252070ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ipc import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Registry is similar to Object, but for registries. It represent an abstract // SysV IPC registry with fields common to all SysV registries. Registry is not // thread-safe, and should be protected using a mutex. // // +stateify savable type Registry struct { // UserNS owning the IPC namespace this registry belongs to. Immutable. UserNS *auth.UserNamespace // objects is a map of IDs to IPC mechanisms. objects map[ID]Mechanism // KeysToIDs maps a lookup key to an ID. keysToIDs map[Key]ID // lastIDUsed is used to find the next available ID for object creation. lastIDUsed ID } // NewRegistry return a new, initialized ipc.Registry. func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ UserNS: userNS, objects: make(map[ID]Mechanism), keysToIDs: make(map[Key]ID), } } // Find uses key to search for and return a SysV mechanism. Find returns an // error if an object is found by shouldn't be, or if the user doesn't have // permission to use the object. If no object is found, Find checks create // flag, and returns an error only if it's false. 
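// CheckPermissions above reuses the familiar rwx layout of a file mode: the
// owner triplet sits in bits 6-8, the group triplet in bits 3-5, the "other"
// triplet in bits 0-2, and every requested access bit (read=4, write=2,
// exec=1) must be present in whichever triplet applies. The toy model below
// illustrates that shift-and-mask check; allowed is an invented helper, not
// the gVisor function.

package main

import "fmt"

const (
	mayExec  = 1
	mayWrite = 2
	mayRead  = 4
)

// allowed picks the relevant permission triplet for the caller, then requires
// every requested bit to be present. Illustrative model only.
func allowed(mode uint16, isOwner, inGroup bool, req uint16) bool {
	perms := mode & 0o777
	switch {
	case isOwner:
		perms >>= 6
	case inGroup:
		perms >>= 3
	}
	return req&perms == req
}

func main() {
	const mode = 0o640                                        // rw- r-- ---
	fmt.Println(allowed(mode, true, false, mayRead|mayWrite)) // true: owner has rw
	fmt.Println(allowed(mode, false, true, mayWrite))         // false: group is read-only
	fmt.Println(allowed(mode, false, false, mayRead))         // false: others have nothing
}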
func (r *Registry) Find(ctx context.Context, key Key, mode linux.FileMode, create, exclusive bool) (Mechanism, error) { if id, ok := r.keysToIDs[key]; ok { mech := r.objects[id] mech.Lock() defer mech.Unlock() obj := mech.Object() creds := auth.CredentialsFromContext(ctx) if !obj.CheckPermissions(creds, vfs.AccessTypes(mode&linux.ModeOtherAll)) { // The [calling process / user] does not have permission to access // the set, and does not have the CAP_IPC_OWNER capability in the // user namespace that governs its IPC namespace. return nil, linuxerr.EACCES } if create && exclusive { // IPC_CREAT and IPC_EXCL were specified, but an object already // exists for key. return nil, linuxerr.EEXIST } return mech, nil } if !create { // No object exists for key and msgflg did not specify IPC_CREAT. return nil, linuxerr.ENOENT } return nil, nil } // Register adds the given object into r.objects, and assigns it a new // ID. It returns an error if all IDs are exhausted. func (r *Registry) Register(m Mechanism) error { id, err := r.newID() if err != nil { return err } obj := m.Object() obj.ID = id r.objects[id] = m r.keysToIDs[obj.Key] = id return nil } // newID finds the first unused ID in the registry, and returns an error if // none is found. func (r *Registry) newID() (ID, error) { // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { // Handle wrap around. if id < 0 { id = 0 continue } if r.objects[id] == nil { r.lastIDUsed = id return id, nil } } log.Warningf("ids exhausted, they may be leaking") // The man pages for shmget(2) mention that ENOSPC should be used if "All // possible shared memory IDs have been taken (SHMMNI)". Other SysV // mechanisms don't have a specific errno for running out of IDs, but they // return ENOSPC if the max number of objects is exceeded, so we assume that // it's the same case. return 0, linuxerr.ENOSPC } // Remove removes the mechanism with the given id from the registry, and calls // mechanism.Destroy to perform mechanism-specific removal. func (r *Registry) Remove(id ID, creds *auth.Credentials) error { mech := r.objects[id] if mech == nil { return linuxerr.EINVAL } mech.Lock() defer mech.Unlock() obj := mech.Object() // The effective user ID of the calling process must match the creator or // owner of the [mechanism], or the caller must be privileged. if !obj.CheckOwnership(creds) { return linuxerr.EPERM } delete(r.objects, obj.ID) delete(r.keysToIDs, obj.Key) mech.Destroy() return nil } // ForAllObjects executes a given function for all registered objects. func (r *Registry) ForAllObjects(f func(o Mechanism)) { for _, o := range r.objects { f(o) } } // FindByID returns the mechanism with the given ID, or nil if none exists. func (r *Registry) FindByID(id ID) Mechanism { return r.objects[id] } // DissociateKey removes the association between a mechanism and its key // (deletes it from r.keysToIDs), preventing it from being discovered by any new // process, but not necessarily destroying it. If the given key doesn't exist, // nothing is changed. func (r *Registry) DissociateKey(key Key) { delete(r.keysToIDs, key) } // DissociateID removes the association between a mechanism and its ID (deletes // it from r.objects). An ID can't be removed unless the associated key has been // removed already; this is done to prevent users from acquiring a nil // Mechanism. // // Precondition: must be preceded by a call to r.DissociateKey. 
func (r *Registry) DissociateID(id ID) { delete(r.objects, id) } // ObjectCount returns the number of registered objects. func (r *Registry) ObjectCount() int { return len(r.objects) } // LastIDUsed returns the last used ID. func (r *Registry) LastIDUsed() ID { return r.lastIDUsed } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/ipc_namespace.go000066400000000000000000000105021465435605700253440ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/mqfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/mq" "gvisor.dev/gvisor/pkg/sentry/kernel/msgqueue" "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // IPCNamespace represents an IPC namespace. // // +stateify savable type IPCNamespace struct { inode *nsfs.Inode // User namespace which owns this IPC namespace. Immutable. userNS *auth.UserNamespace // System V utilities. queues *msgqueue.Registry semaphores *semaphore.Registry shms *shm.Registry // posixQueues is a POSIX message queue registry. // // posixQueues is somewhat equivalent to Linux's ipc_namespace.mq_mnt. // Unlike SysV utilities, mq.Registry is not map-based, but is backed by // a virtual filesystem. posixQueues *mq.Registry } // NewIPCNamespace creates a new IPC namespace. func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { ns := &IPCNamespace{ userNS: userNS, queues: msgqueue.NewRegistry(userNS), semaphores: semaphore.NewRegistry(userNS), shms: shm.NewRegistry(userNS), } return ns } // Type implements nsfs.Namespace.Type. func (i *IPCNamespace) Type() string { return "ipc" } // Destroy implements nsfs.Namespace.Destroy. func (i *IPCNamespace) Destroy(ctx context.Context) { i.shms.Release(ctx) if i.posixQueues != nil { i.posixQueues.Destroy(ctx) } } // SetInode sets the nsfs `inode` to the IPC namespace. func (i *IPCNamespace) SetInode(inode *nsfs.Inode) { i.inode = inode } // GetInode returns the nsfs inode associated with the IPC namespace. func (i *IPCNamespace) GetInode() *nsfs.Inode { return i.inode } // UserNamespace returns the user namespace associated with the namespace. func (i *IPCNamespace) UserNamespace() *auth.UserNamespace { return i.userNS } // MsgqueueRegistry returns the message queue registry for this namespace. func (i *IPCNamespace) MsgqueueRegistry() *msgqueue.Registry { return i.queues } // SemaphoreRegistry returns the semaphore set registry for this namespace. func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry { return i.semaphores } // ShmRegistry returns the shm segment registry for this namespace. func (i *IPCNamespace) ShmRegistry() *shm.Registry { return i.shms } // InitPosixQueues creates a new POSIX queue registry, and returns an error if // the registry was previously initialized. 
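// The subtle part of ipc.Registry above is ID allocation: newID scans forward
// from the last ID handed out, wraps back to zero on overflow, and reports
// exhaustion only after a full cycle, which avoids immediately reusing freed
// IDs. The self-contained sketch below models just that loop with an invented
// toyRegistry type; it is not the gVisor registry itself.

package main

import (
	"errors"
	"fmt"
)

type toyRegistry struct {
	objects    map[int32]string
	lastIDUsed int32
}

// newID mirrors the scan-forward-and-wrap allocation strategy.
func (r *toyRegistry) newID() (int32, error) {
	for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
		if id < 0 { // wrapped past the maximum positive ID
			id = 0
			continue
		}
		if _, used := r.objects[id]; !used {
			r.lastIDUsed = id
			return id, nil
		}
	}
	return 0, errors.New("ids exhausted")
}

func main() {
	r := &toyRegistry{objects: map[int32]string{}}
	for _, name := range []string{"shm", "sem", "msg"} {
		id, err := r.newID()
		if err != nil {
			panic(err)
		}
		r.objects[id] = name
		fmt.Println(id, name) // 1 shm, 2 sem, 3 msg
	}
}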
func (i *IPCNamespace) InitPosixQueues(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { if i.posixQueues != nil { return fmt.Errorf("IPCNamespace.InitPosixQueues: already initialized") } impl, err := mqfs.NewRegistryImpl(ctx, vfsObj, creds) if err != nil { return err } i.posixQueues = mq.NewRegistry(i.userNS, impl) return nil } // PosixQueues returns the posix message queue registry for this namespace. // // Precondition: i.InitPosixQueues must have been called. func (i *IPCNamespace) PosixQueues() *mq.Registry { return i.posixQueues } // IncRef increments the Namespace's refcount. func (i *IPCNamespace) IncRef() { i.inode.IncRef() } // DecRef decrements the namespace's refcount. func (i *IPCNamespace) DecRef(ctx context.Context) { i.inode.DecRef(ctx) } // IPCNamespace returns the task's IPC namespace. func (t *Task) IPCNamespace() *IPCNamespace { t.mu.Lock() defer t.mu.Unlock() return t.ipcns } // GetIPCNamespace takes a reference on the task IPC namespace and // returns it. It will return nil if the task isn't alive. func (t *Task) GetIPCNamespace() *IPCNamespace { t.mu.Lock() defer t.mu.Unlock() if t.ipcns != nil { t.ipcns.IncRef() } return t.ipcns } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kcov.go000066400000000000000000000224251465435605700235260ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "io" "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" ) // kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov // area. On Linux, the maximum is INT_MAX / 8. const kcovAreaSizeMax = 10 * 1024 * 1024 // Kcov provides kernel coverage data to userspace through a memory-mapped // region, as kcov does in Linux. // // To give the illusion that the data is always up to date, we update the shared // memory every time before we return to userspace. type Kcov struct { // mf stores application memory. It is immutable after creation. mf *pgalloc.MemoryFile // mu protects all of the fields below. mu sync.RWMutex // mode is the current kcov mode. mode uint8 // size is the size of the mapping through which the kernel conveys coverage // information to userspace. size uint64 // owningTask is the task that currently owns coverage data on the system. The // interface for kcov essentially requires that coverage is only going to a // single task. Note that kcov should only generate coverage data for the // owning task, but we currently generate global coverage. owningTask *Task // count is a locally cached version of the first uint64 in the kcov data, // which is the number of subsequent entries representing PCs. 
// // It is used with kcovInode.countBlock(), to copy in/out the first element of // the actual data in an efficient manner, avoid boilerplate, and prevent // accidental garbage escapes by the temporary counts. count uint64 mappable *mm.SpecialMappable } // NewKcov creates and returns a Kcov instance. func (k *Kernel) NewKcov() *Kcov { return &Kcov{ mf: k.mf, } } var coveragePool = sync.Pool{ New: func() any { return make([]byte, 0) }, } // TaskWork implements TaskWorker.TaskWork. func (kcov *Kcov) TaskWork(t *Task) { kcov.mu.Lock() defer kcov.mu.Unlock() if kcov.mode != linux.KCOV_MODE_TRACE_PC { return } rw := &kcovReadWriter{ mf: kcov.mf, fr: kcov.mappable.FileRange(), } // Read in the PC count. if _, err := safemem.ReadFullToBlocks(rw.ReadToBlocks, kcov.countBlock()); err != nil { panic(fmt.Sprintf("Internal error reading count from kcov area: %v", err)) } rw.off = 8 * (1 + kcov.count) n := coverage.ConsumeCoverageData(&kcovIOWriter{rw}) // Update the pc count, based on the number of entries written. Note that if // we reached the end of the kcov area, we may not have written everything in // output. kcov.count += uint64(n / 8) rw.off = 0 if _, err := safemem.WriteFullFromBlocks(rw.WriteFromBlocks, kcov.countBlock()); err != nil { panic(fmt.Sprintf("Internal error writing count to kcov area: %v", err)) } // Re-register for future work. t.RegisterWork(kcov) } // InitTrace performs the KCOV_INIT_TRACE ioctl. func (kcov *Kcov) InitTrace(size uint64) error { kcov.mu.Lock() defer kcov.mu.Unlock() if kcov.mode != linux.KCOV_MODE_DISABLED { return linuxerr.EBUSY } // To simplify all the logic around mapping, we require that the length of the // shared region is a multiple of the system page size. if (8*size)&(hostarch.PageSize-1) != 0 { return linuxerr.EINVAL } // We need space for at least two uint64s to hold current position and a // single PC. if size < 2 || size > kcovAreaSizeMax { return linuxerr.EINVAL } kcov.size = size kcov.mode = linux.KCOV_MODE_INIT return nil } // EnableTrace performs the KCOV_ENABLE_TRACE ioctl. func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error { t := TaskFromContext(ctx) if t == nil { panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine") } kcov.mu.Lock() defer kcov.mu.Unlock() // KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call. if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil { return linuxerr.EINVAL } switch traceKind { case linux.KCOV_TRACE_PC: kcov.mode = linux.KCOV_MODE_TRACE_PC case linux.KCOV_TRACE_CMP: // We do not support KCOV_MODE_TRACE_CMP. return linuxerr.ENOTSUP default: return linuxerr.EINVAL } if kcov.owningTask != nil && kcov.owningTask != t { return linuxerr.EBUSY } kcov.owningTask = t t.SetKcov(kcov) t.RegisterWork(kcov) // Clear existing coverage data; the task expects to read only coverage data // from the time it is activated. coverage.ClearCoverageData() return nil } // DisableTrace performs the KCOV_DISABLE_TRACE ioctl. func (kcov *Kcov) DisableTrace(ctx context.Context) error { kcov.mu.Lock() defer kcov.mu.Unlock() t := TaskFromContext(ctx) if t == nil { panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine") } if t != kcov.owningTask { return linuxerr.EINVAL } kcov.mode = linux.KCOV_MODE_INIT kcov.owningTask = nil if kcov.mappable != nil { kcov.mappable.DecRef(ctx) kcov.mappable = nil } return nil } // Clear resets the mode and clears the owning task and memory mapping for kcov. // It is called when the fd corresponding to kcov is closed. 
Note that the mode // needs to be set so that the next call to kcov.TaskWork() will exit early. func (kcov *Kcov) Clear(ctx context.Context) { kcov.mu.Lock() kcov.mode = linux.KCOV_MODE_INIT kcov.owningTask = nil if kcov.mappable != nil { kcov.mappable.DecRef(ctx) kcov.mappable = nil } kcov.mu.Unlock() } // OnTaskExit is called when the owning task exits. It is similar to // kcov.Clear(), except the memory mapping is not cleared, so that the same // mapping can be used in the future if kcov is enabled again by another task. func (kcov *Kcov) OnTaskExit() { kcov.mu.Lock() kcov.mode = linux.KCOV_MODE_INIT kcov.owningTask = nil kcov.mu.Unlock() } // ConfigureMMap is called by the vfs.FileDescription for this kcov instance to // implement vfs.FileDescription.ConfigureMMap. func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { kcov.mu.Lock() defer kcov.mu.Unlock() if kcov.mode != linux.KCOV_MODE_INIT { return linuxerr.EINVAL } if kcov.mappable == nil { // Set up the kcov area. opts := pgalloc.AllocOpts{ Kind: usage.Anonymous, MemCgID: pgalloc.MemoryCgroupIDFromContext(ctx), } fr, err := kcov.mf.Allocate(kcov.size*8, opts) if err != nil { return err } // Get the thread id for the mmap name. t := TaskFromContext(ctx) if t == nil { panic("ThreadFromContext returned nil") } // For convenience, a special mappable is used here. Note that these mappings // will look different under /proc/[pid]/maps than they do on Linux. kcov.mappable = mm.NewSpecialMappable(fmt.Sprintf("[kcov:%d]", t.ThreadID()), kcov.mf, fr) } kcov.mappable.IncRef() opts.Mappable = kcov.mappable opts.MappingIdentity = kcov.mappable return nil } // kcovReadWriter implements safemem.Reader and safemem.Writer. type kcovReadWriter struct { off uint64 mf *pgalloc.MemoryFile fr memmap.FileRange } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { if dsts.IsEmpty() { return 0, nil } // Limit the read to the kcov range and check for overflow. if rw.fr.Length() <= rw.off { return 0, io.EOF } start := rw.fr.Start + rw.off end := rw.fr.Start + rw.fr.Length() if rend := start + dsts.NumBytes(); rend < end { end = rend } // Get internal mappings. bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, hostarch.Read) if err != nil { return 0, err } // Copy from internal mappings. n, err := safemem.CopySeq(dsts, bs) rw.off += n return n, err } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { if srcs.IsEmpty() { return 0, nil } // Limit the write to the kcov area and check for overflow. if rw.fr.Length() <= rw.off { return 0, io.EOF } start := rw.fr.Start + rw.off end := rw.fr.Start + rw.fr.Length() if wend := start + srcs.NumBytes(); wend < end { end = wend } // Get internal mapping. bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, hostarch.Write) if err != nil { return 0, err } // Copy to internal mapping. n, err := safemem.CopySeq(bs, srcs) rw.off += n return n, err } // kcovIOWriter implements io.Writer as a basic wrapper over kcovReadWriter. type kcovIOWriter struct { rw *kcovReadWriter } // Write implements io.Writer.Write. 
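// The kcov area written by Kcov.TaskWork above has a simple layout: the first
// uint64 is the number of PCs currently in the buffer, and the following
// entries are the PCs themselves, which is why TaskWork seeks to offset
// 8*(1+count) before appending. The sketch below shows how a hypothetical
// userspace consumer could drain such a buffer; drainKcov is an invented name,
// and a real consumer would operate on the mmap'd kcov region with atomic
// accesses.

package main

import "fmt"

// drainKcov copies out the recorded PCs and resets the count so the buffer
// can be reused for the next batch.
func drainKcov(area []uint64) []uint64 {
	count := area[0]
	if limit := uint64(len(area) - 1); count > limit {
		count = limit
	}
	pcs := append([]uint64(nil), area[1:1+count]...)
	area[0] = 0 // hand the buffer back for the next batch of PCs
	return pcs
}

func main() {
	area := make([]uint64, 8)
	area[0] = 3
	area[1], area[2], area[3] = 0x401000, 0x401010, 0x401024
	fmt.Printf("%#x\n", drainKcov(area)) // [0x401000 0x401010 0x401024]
}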
func (w *kcovIOWriter) Write(p []byte) (int, error) { bs := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p)) n, err := safemem.WriteFullFromBlocks(w.rw.WriteFromBlocks, bs) return int(n), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kcov_unsafe.go000066400000000000000000000016761465435605700250740ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "unsafe" "gvisor.dev/gvisor/pkg/safemem" ) // countBlock provides a safemem.BlockSeq for kcov.count. // // Like k.count, the block returned is protected by k.mu. func (kcov *Kcov) countBlock() safemem.BlockSeq { return safemem.BlockSeqOf(safemem.BlockFromSafePointer(unsafe.Pointer(&kcov.count), int(unsafe.Sizeof(kcov.count)))) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel.go000066400000000000000000002100601465435605700240360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package kernel provides an emulation of the Linux kernel. // // See README.md for a detailed overview. // // Lock order (outermost locks must be taken first): // // Kernel.extMu // ThreadGroup.timerMu // ktime.Timer.mu (for IntervalTimer) and Kernel.cpuClockMu // TaskSet.mu // SignalHandlers.mu // Task.mu // runningTasksMu // // Locking SignalHandlers.mu in multiple SignalHandlers requires locking // TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same // time requires locking all of their signal mutexes first. 
package kernel import ( "errors" "fmt" "io" "path/filepath" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/devutil" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/unimpl" uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/state" "gvisor.dev/gvisor/pkg/state/statefile" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) // IOUringEnabled is set to true when IO_URING is enabled. Added as a global to // allow easy access everywhere. var IOUringEnabled = false // UserCounters is a set of user counters. // // +stateify savable type UserCounters struct { uid auth.KUID rlimitNProc atomicbitops.Uint64 } // incRLimitNProc increments the rlimitNProc counter. func (uc *UserCounters) incRLimitNProc(ctx context.Context) error { lim := limits.FromContext(ctx).Get(limits.ProcessCount) creds := auth.CredentialsFromContext(ctx) nproc := uc.rlimitNProc.Add(1) if nproc > lim.Cur && !creds.HasCapability(linux.CAP_SYS_ADMIN) && !creds.HasCapability(linux.CAP_SYS_RESOURCE) { uc.rlimitNProc.Add(^uint64(0)) return linuxerr.EAGAIN } return nil } // decRLimitNProc decrements the rlimitNProc counter. func (uc *UserCounters) decRLimitNProc() { uc.rlimitNProc.Add(^uint64(0)) } // CgroupMount contains the cgroup mount. These mounts are created for the root // container by default and are stored in the kernel. // // +stateify savable type CgroupMount struct { Fs *vfs.Filesystem Root *vfs.Dentry Mount *vfs.Mount } // Kernel represents an emulated Linux kernel. It must be initialized by calling // Init() or LoadFrom(). // // +stateify savable type Kernel struct { // extMu serializes external changes to the Kernel with calls to // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel // remains frozen for the duration of the call; it requires that the Kernel // is paused as a precondition, which ensures that none of the tasks // running within the Kernel can affect its state, but extMu is required to // ensure that concurrent users of the Kernel *outside* the Kernel's // control cannot affect its state by calling e.g. // Kernel.SendExternalSignal.) extMu sync.Mutex `state:"nosave"` // started is true if Start has been called. Unless otherwise specified, // all Kernel fields become immutable once started becomes true. 
started bool `state:"nosave"` // All of the following fields are immutable unless otherwise specified. // Platform is the platform that is used to execute tasks in the created // Kernel. platform.Platform `state:"nosave"` // mf provides application memory. mf *pgalloc.MemoryFile `state:"nosave"` // See InitKernelArgs for the meaning of these fields. featureSet cpuid.FeatureSet timekeeper *Timekeeper tasks *TaskSet rootUserNamespace *auth.UserNamespace rootNetworkNamespace *inet.Namespace applicationCores uint useHostCores bool extraAuxv []arch.AuxEntry vdso *loader.VDSO vdsoParams *VDSOParamPage rootUTSNamespace *UTSNamespace rootIPCNamespace *IPCNamespace // futexes is the "root" futex.Manager, from which all others are forked. // This is necessary to ensure that shared futexes are coherent across all // tasks, including those created by CreateProcess. futexes *futex.Manager // globalInit is the thread group whose leader has ID 1 in the root PID // namespace. globalInit is stored separately so that it is accessible even // after all tasks in the thread group have exited, such that ID 1 is no // longer mapped. // // globalInit is mutable until it is assigned by the first successful call // to CreateProcess, and is protected by extMu. globalInit *ThreadGroup // syslog is the kernel log. syslog syslog runningTasksMu runningTasksMutex `state:"nosave"` // runningTasks is the total count of tasks currently in // TaskGoroutineRunningSys or TaskGoroutineRunningApp. i.e., they are // not blocked or stopped. // // runningTasks must be accessed atomically. Increments from 0 to 1 are // further protected by runningTasksMu (see incRunningTasks). runningTasks atomicbitops.Int64 // runningTasksCond is signaled when runningTasks is incremented from 0 to 1. // // Invariant: runningTasksCond.L == &runningTasksMu. runningTasksCond sync.Cond `state:"nosave"` // cpuClock is incremented every linux.ClockTick by a goroutine running // kernel.runCPUClockTicker() while runningTasks != 0. // // cpuClock is used to measure task CPU usage, since sampling monotonicClock // twice on every syscall turns out to be unreasonably expensive. This is // similar to how Linux does task CPU accounting on x86 // (CONFIG_IRQ_TIME_ACCOUNTING), although Linux also uses scheduler timing // information to improve resolution // (kernel/sched/cputime.c:cputime_adjust()), which we can't do since // "preeemptive" scheduling is managed by the Go runtime, which doesn't // provide this information. // // cpuClock is mutable, and is accessed using atomic memory operations. cpuClock atomicbitops.Uint64 // cpuClockTickTimer drives increments of cpuClock. cpuClockTickTimer *time.Timer `state:"nosave"` // cpuClockMu is used to make increments of cpuClock, and updates of timers // based on cpuClock, atomic. cpuClockMu cpuClockMutex `state:"nosave"` // cpuClockTickerRunning is true if the goroutine that increments cpuClock is // running and false if it is blocked in runningTasksCond.Wait() or if it // never started. // // cpuClockTickerRunning is protected by runningTasksMu. cpuClockTickerRunning bool // cpuClockTickerWakeCh is sent to to wake the goroutine that increments // cpuClock if it's sleeping between ticks. cpuClockTickerWakeCh chan struct{} `state:"nosave"` // cpuClockTickerStopCond is broadcast when cpuClockTickerRunning transitions // from true to false. // // Invariant: cpuClockTickerStopCond.L == &runningTasksMu. cpuClockTickerStopCond sync.Cond `state:"nosave"` // uniqueID is used to generate unique identifiers. 
// // uniqueID is mutable, and is accessed using atomic memory operations. uniqueID atomicbitops.Uint64 // nextInotifyCookie is a monotonically increasing counter used for // generating unique inotify event cookies. // // nextInotifyCookie is mutable. nextInotifyCookie atomicbitops.Uint32 // netlinkPorts manages allocation of netlink socket port IDs. netlinkPorts *port.Manager // saveStatus is nil if the sandbox has not been saved, errSaved or // errAutoSaved if it has been saved successfully, or the error causing the // sandbox to exit during save. // It is protected by extMu. saveStatus error `state:"nosave"` // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` // sockets records all network sockets in the system. Protected by extMu. sockets map[*vfs.FileDescription]*SocketRecord // nextSocketRecord is the next entry number to use in sockets. Protected // by extMu. nextSocketRecord uint64 // unimplementedSyscallEmitterOnce is used in the initialization of // unimplementedSyscallEmitter. unimplementedSyscallEmitterOnce sync.Once `state:"nosave"` // unimplementedSyscallEmitter is used to emit unimplemented syscall // events. This is initialized lazily on the first unimplemented // syscall. unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"` // SpecialOpts contains special kernel options. SpecialOpts // vfs keeps the filesystem state used across the kernel. vfs vfs.VirtualFilesystem // hostMount is the Mount used for file descriptors that were imported // from the host. hostMount *vfs.Mount // pipeMount is the Mount used for pipes created by the pipe() and pipe2() // syscalls (as opposed to named pipes created by mknod()). pipeMount *vfs.Mount // nsfsMount is the Mount used for namespaces. nsfsMount *vfs.Mount // shmMount is the Mount used for anonymous files created by the // memfd_create() syscalls. It is analogous to Linux's shm_mnt. shmMount *vfs.Mount // socketMount is the Mount used for sockets created by the socket() and // socketpair() syscalls. There are several cases where a socket dentry will // not be contained in socketMount: // 1. Socket files created by mknod() // 2. Socket fds imported from the host (Kernel.hostMount is used for these) // 3. Socket files created by binding Unix sockets to a file path socketMount *vfs.Mount // sysVShmDevID is the device number used by SysV shm segments. In Linux, // SysV shm uses shmem_file_setup() and thus uses shm_mnt's device number. // In gVisor, the shm implementation does not use shmMount, extracting // shmMount's device number is inconvenient, applications accept a // different device number in practice, and using a distinct device number // avoids the possibility of inode number collisions due to the hack // described in shm.Shm.InodeID(). sysVShmDevID uint32 // If set to true, report address space activation waits as if the task is in // external wait so that the watchdog doesn't report the task stuck. SleepForAddressSpaceActivation bool // Exceptions to YAMA ptrace restrictions. Each key-value pair represents a // tracee-tracer relationship. The key is a process (technically, the thread // group leader) that can be traced by any thread that is a descendant of the // value. If the value is nil, then anyone can trace the process represented by // the key. // // ptraceExceptions is protected by the TaskSet mutex. ptraceExceptions map[*Task]*Task // YAMAPtraceScope is the current level of YAMA ptrace restrictions. 
YAMAPtraceScope atomicbitops.Int32 // cgroupRegistry contains the set of active cgroup controllers on the // system. It is controller by cgroupfs. Nil if cgroupfs is unavailable on // the system. cgroupRegistry *CgroupRegistry // cgroupMountsMap maps the cgroup controller names to the cgroup mounts // created for the root container. These mounts are then bind mounted // for other application containers by creating their own container // directories. cgroupMountsMap map[string]*CgroupMount cgroupMountsMapMu cgroupMountsMutex `state:"nosave"` // userCountersMap maps auth.KUID into a set of user counters. userCountersMap map[auth.KUID]*UserCounters userCountersMapMu userCountersMutex `state:"nosave"` // MaxFDLimit specifies the maximum file descriptor number that can be // used by processes. MaxFDLimit atomicbitops.Int32 // devGofers maps containers (using its name) to its device gofer client. devGofers map[string]*devutil.GoferClient `state:"nosave"` devGofersMu sync.Mutex `state:"nosave"` // containerNames store the container name based on their container ID. // Names are preserved between save/restore session, while IDs can change. // // Mapping: cid -> name. // It's protected by extMu. containerNames map[string]string // checkpointMu is used to protect the checkpointing related fields below. checkpointMu sync.Mutex `state:"nosave"` // checkpointCond is used to wait for a checkpoint to complete. It uses // checkpointMu as its mutex. checkpointCond sync.Cond `state:"nosave"` // additionalCheckpointState stores additional state that needs // to be checkpointed. It's protected by checkpointMu. additionalCheckpointState map[any]any // saver implements the Saver interface, which (as of writing) supports // asynchronous checkpointing. It's protected by checkpointMu. saver Saver `state:"nosave"` // checkpointCounter aims to track the number of times the kernel has been // successfully checkpointed. It's updated via calls to OnCheckpointAttempt() // and IncCheckpointCount(). Kernel checkpoint-ers must call these methods // appropriately so the counter is accurate. It's protected by checkpointMu. checkpointCounter uint32 // lastCheckpointStatus is the error value returned from the most recent // checkpoint attempt. If this value is nil, then the `checkpointCounter`-th // checkpoint attempt succeeded and no checkpoint attempt has completed since. // If this value is non-nil, then the `checkpointCounter`-th checkpoint // attempt succeeded, after which at least one more checkpoint attempt was // made and failed with this error. It's protected by checkpointMu. lastCheckpointStatus error `state:"nosave"` } // Saver is an interface for saving the kernel. type Saver interface { SaveAsync() error } // InitKernelArgs holds arguments to Init. type InitKernelArgs struct { // FeatureSet is the emulated CPU feature set. FeatureSet cpuid.FeatureSet // Timekeeper manages time for all tasks in the system. Timekeeper *Timekeeper // RootUserNamespace is the root user namespace. RootUserNamespace *auth.UserNamespace // RootNetworkNamespace is the root network namespace. If nil, no networking // will be available. RootNetworkNamespace *inet.Namespace // ApplicationCores is the number of logical CPUs visible to sandboxed // applications. The set of logical CPU IDs is [0, ApplicationCores); thus // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the // most significant bit in cpu_possible_mask + 1. 
ApplicationCores uint // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it // will be overridden. UseHostCores bool // ExtraAuxv contains additional auxiliary vector entries that are added to // each process by the ELF loader. ExtraAuxv []arch.AuxEntry // Vdso holds the VDSO and its parameter page. Vdso *loader.VDSO // VdsoParams is the VDSO parameter page manager. VdsoParams *VDSOParamPage // RootUTSNamespace is the root UTS namespace. RootUTSNamespace *UTSNamespace // RootIPCNamespace is the root IPC namespace. RootIPCNamespace *IPCNamespace // PIDNamespace is the root PID namespace. PIDNamespace *PIDNamespace // MaxFDLimit specifies the maximum file descriptor number that can be // used by processes. If it is zero, the limit will be set to // unlimited. MaxFDLimit int32 } // Init initialize the Kernel with no tasks. // // Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile // before calling Init. func (k *Kernel) Init(args InitKernelArgs) error { if args.Timekeeper == nil { return fmt.Errorf("args.Timekeeper is nil") } if args.Timekeeper.clocks == nil { return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()") } if args.RootUserNamespace == nil { return fmt.Errorf("args.RootUserNamespace is nil") } if args.ApplicationCores == 0 { return fmt.Errorf("args.ApplicationCores is 0") } k.featureSet = args.FeatureSet k.timekeeper = args.Timekeeper k.tasks = newTaskSet(args.PIDNamespace) k.rootUserNamespace = args.RootUserNamespace k.rootUTSNamespace = args.RootUTSNamespace k.rootIPCNamespace = args.RootIPCNamespace k.rootNetworkNamespace = args.RootNetworkNamespace if k.rootNetworkNamespace == nil { k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil, args.RootUserNamespace) } k.runningTasksCond.L = &k.runningTasksMu k.checkpointCond.L = &k.checkpointMu k.cpuClockTickerWakeCh = make(chan struct{}, 1) k.cpuClockTickerStopCond.L = &k.runningTasksMu k.applicationCores = args.ApplicationCores if args.UseHostCores { k.useHostCores = true maxCPU, err := hostcpu.MaxPossibleCPU() if err != nil { return fmt.Errorf("failed to get maximum CPU number: %v", err) } minAppCores := uint(maxCPU) + 1 if k.applicationCores < minAppCores { log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores) k.applicationCores = minAppCores } } k.extraAuxv = args.ExtraAuxv k.vdso = args.Vdso k.vdsoParams = args.VdsoParams k.futexes = futex.NewManager() k.netlinkPorts = port.New() k.ptraceExceptions = make(map[*Task]*Task) k.YAMAPtraceScope = atomicbitops.FromInt32(linux.YAMA_SCOPE_RELATIONAL) k.userCountersMap = make(map[auth.KUID]*UserCounters) if args.MaxFDLimit == 0 { args.MaxFDLimit = MaxFdLimit } k.MaxFDLimit.Store(args.MaxFDLimit) k.containerNames = make(map[string]string) ctx := k.SupervisorContext() if err := k.vfs.Init(ctx); err != nil { return fmt.Errorf("failed to initialize VFS: %v", err) } err := k.rootIPCNamespace.InitPosixQueues(ctx, &k.vfs, auth.CredentialsFromContext(ctx)) if err != nil { return fmt.Errorf("failed to create mqfs filesystem: %v", err) } pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs) if err != nil { return fmt.Errorf("failed to create pipefs filesystem: %v", err) } defer pipeFilesystem.DecRef(ctx) pipeMount := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) k.pipeMount = pipeMount nsfsFilesystem, err := 
nsfs.NewFilesystem(&k.vfs) if err != nil { return fmt.Errorf("failed to create nsfs filesystem: %v", err) } defer nsfsFilesystem.DecRef(ctx) k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{}) k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace)) k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace)) k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace)) tmpfsOpts := vfs.GetFilesystemOptions{ InternalData: tmpfs.FilesystemOpts{ // See mm/shmem.c:shmem_init() => vfs_kern_mount(flags=SB_KERNMOUNT). // Note how mm/shmem.c:shmem_fill_super() does not provide a default // value for sbinfo->max_blocks when SB_KERNMOUNT is set. DisableDefaultSizeLimit: true, }, InternalMount: true, } tmpfsFilesystem, tmpfsRoot, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace), "", tmpfsOpts) if err != nil { return fmt.Errorf("failed to create tmpfs filesystem: %v", err) } defer tmpfsFilesystem.DecRef(ctx) defer tmpfsRoot.DecRef(ctx) k.shmMount = k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) if err != nil { return fmt.Errorf("failed to create sockfs filesystem: %v", err) } defer socketFilesystem.DecRef(ctx) k.socketMount = k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) sysVShmDevMinor, err := k.vfs.GetAnonBlockDevMinor() if err != nil { return fmt.Errorf("failed to get device number for SysV shm: %v", err) } k.sysVShmDevID = linux.MakeDeviceID(linux.UNNAMED_MAJOR, sysVShmDevMinor) k.sockets = make(map[*vfs.FileDescription]*SocketRecord) k.cgroupRegistry = newCgroupRegistry() return nil } // +stateify savable type privateMemoryFileMetadata struct { owners []string } func savePrivateMFs(ctx context.Context, w io.Writer, pw io.Writer, mfsToSave map[string]*pgalloc.MemoryFile, mfOpts pgalloc.SaveOpts) error { // mfOpts.ExcludeCommittedZeroPages is expected to reflect application // memory usage behavior, but not necessarily usage of private MemoryFiles. mfOpts.ExcludeCommittedZeroPages = false var meta privateMemoryFileMetadata // Generate the order in which private memory files are saved. for fsID := range mfsToSave { meta.owners = append(meta.owners, fsID) } // Save the metadata. if _, err := state.Save(ctx, w, &meta); err != nil { return err } // Followed by the private memory files in order. for _, fsID := range meta.owners { if err := mfsToSave[fsID].SaveTo(ctx, w, pw, mfOpts); err != nil { return err } } return nil } func loadPrivateMFs(ctx context.Context, r io.Reader, pr *statefile.AsyncReader) error { // Load the metadata. var meta privateMemoryFileMetadata if _, err := state.Load(ctx, r, &meta); err != nil { return err } mfmap := pgalloc.MemoryFileMapFromContext(ctx) // Ensure that it is consistent with CtxFilesystemMemoryFileMap. if len(mfmap) != len(meta.owners) { return fmt.Errorf("inconsistent private memory files on restore: savedMFOwners = %v, CtxFilesystemMemoryFileMap = %v", meta.owners, mfmap) } // Load all private memory files. for _, fsID := range meta.owners { mf, ok := mfmap[fsID] if !ok { return fmt.Errorf("saved memory file for %q was not configured on restore", fsID) } if err := mf.LoadFrom(ctx, r, pr); err != nil { return err } } return nil } // SaveTo saves the state of k to w. // // Preconditions: The kernel must be paused throughout the call to SaveTo. 
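// checkpointSketch is a hypothetical, illustrative driver for the save path
// below; it is not part of the gVisor API. It assumes the caller owns the
// Kernel and an output writer, and that a zero-value pgalloc.SaveOpts is
// acceptable. Per the precondition above, the kernel stays paused for the
// whole SaveTo call, and task states are pulled first so that full register
// state is available to the serializer.
func checkpointSketch(ctx context.Context, k *Kernel, w io.Writer) error {
	k.Pause()         // Stop all task goroutines and async I/O.
	defer k.Unpause() // Resume execution whatever the outcome.
	k.ReceiveTaskStates()
	// nil pagesMetadata/pagesFile write page data inline to w.
	return k.SaveTo(ctx, w, nil, nil, pgalloc.SaveOpts{})
}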
func (k *Kernel) SaveTo(ctx context.Context, w io.Writer, pagesMetadata, pagesFile *fd.FD, mfOpts pgalloc.SaveOpts) error { saveStart := time.Now() // Do not allow other Kernel methods to affect it while it's being saved. k.extMu.Lock() defer k.extMu.Unlock() // Stop time. k.pauseTimeLocked(ctx) defer k.resumeTimeLocked(ctx) // Evict all evictable MemoryFile allocations. k.mf.StartEvictions() k.mf.WaitForEvictions() // Discard unsavable mappings, such as those for host file descriptors. if err := k.invalidateUnsavableMappings(ctx); err != nil { return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) } // Capture all private memory files. mfsToSave := make(map[string]*pgalloc.MemoryFile) vfsCtx := context.WithValue(ctx, pgalloc.CtxMemoryFileMap, mfsToSave) // Prepare filesystems for saving. This must be done after // invalidateUnsavableMappings(), since dropping memory mappings may // affect filesystem state (e.g. page cache reference counts). if err := k.vfs.PrepareSave(vfsCtx); err != nil { return err } // Mark all to-be-saved MemoryFiles as savable to inform kernel save below. k.mf.MarkSavable() for _, mf := range mfsToSave { mf.MarkSavable() } // Save the CPUID FeatureSet before the rest of the kernel so we can // verify its compatibility on restore before attempting to restore the // entire kernel, which may fail on an incompatible machine. // // N.B. This will also be saved along with the full kernel save below. cpuidStart := time.Now() if _, err := state.Save(ctx, w, &k.featureSet); err != nil { return err } log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) // Save the timekeeper's state. if rootNS := k.rootNetworkNamespace; rootNS != nil && rootNS.Stack() != nil { // Pause the network stack. netstackPauseStart := time.Now() log.Infof("Pausing root network namespace") k.rootNetworkNamespace.Stack().Pause() defer k.rootNetworkNamespace.Stack().Resume() log.Infof("Pausing root network namespace took [%s].", time.Since(netstackPauseStart)) } // Save the kernel state. kernelStart := time.Now() stats, err := state.Save(ctx, w, k) if err != nil { return err } log.Infof("Kernel save stats: %s", stats.String()) log.Infof("Kernel save took [%s].", time.Since(kernelStart)) // Save the memory files' state. memoryStart := time.Now() pmw := w if pagesMetadata != nil { pmw = pagesMetadata } pw := w if pagesFile != nil { pw = pagesFile } if err := k.mf.SaveTo(ctx, pmw, pw, mfOpts); err != nil { return err } if err := savePrivateMFs(ctx, pmw, pw, mfsToSave, mfOpts); err != nil { return err } log.Infof("Memory files save took [%s].", time.Since(memoryStart)) log.Infof("Overall save took [%s].", time.Since(saveStart)) return nil } // Preconditions: The kernel must be paused. func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { invalidated := make(map[*mm.MemoryManager]struct{}) k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() for t := range k.tasks.Root.tids { // We can skip locking Task.mu here since the kernel is paused. if memMgr := t.image.MemoryManager; memMgr != nil { if _, ok := invalidated[memMgr]; !ok { if err := memMgr.InvalidateUnsavable(ctx); err != nil { return err } invalidated[memMgr] = struct{}{} } } // I really wish we just had a sync.Map of all MMs... if r, ok := t.runState.(*runSyscallAfterExecStop); ok { if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil { return err } } } return nil } // LoadFrom returns a new Kernel loaded from args. 
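// restoreSketch is a hypothetical, illustrative counterpart to LoadFrom below;
// it is not part of the gVisor API. The MemoryFile must be installed before
// LoadFrom (see SetMemoryFile further down); the host network stack and clocks
// are handed in so the restored state can be re-attached to them. Passing nil
// for pagesMetadata, pagesFile, timeReady and vfsOpts is an assumption made
// for brevity.
func restoreSketch(ctx context.Context, k *Kernel, r io.Reader, mf *pgalloc.MemoryFile, netStack inet.Stack, clocks sentrytime.Clocks) error {
	k.SetMemoryFile(mf) // must precede LoadFrom
	return k.LoadFrom(ctx, r, nil, nil, nil, netStack, clocks, nil)
}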
func (k *Kernel) LoadFrom(ctx context.Context, r io.Reader, pagesMetadata, pagesFile *fd.FD, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error { loadStart := time.Now() var ( mfLoadWg sync.WaitGroup mfLoadErr error ) parallelMfLoad := pagesMetadata != nil && pagesFile != nil if parallelMfLoad { // Parallelize MemoryFile load and kernel load. Both are independent. mfLoadWg.Add(1) go func() { defer mfLoadWg.Done() mfLoadErr = k.loadMemoryFiles(ctx, r, pagesMetadata, pagesFile) }() // Defer a Wait() so we wait for k.loadMemoryFiles() to complete even if we // error out without reaching the other Wait() below. defer mfLoadWg.Wait() } k.runningTasksCond.L = &k.runningTasksMu k.checkpointCond.L = &k.checkpointMu k.cpuClockTickerWakeCh = make(chan struct{}, 1) k.cpuClockTickerStopCond.L = &k.runningTasksMu initAppCores := k.applicationCores // Load the pre-saved CPUID FeatureSet. // // N.B. This was also saved along with the full kernel below, so we // don't need to explicitly install it in the Kernel. cpuidStart := time.Now() if _, err := state.Load(ctx, r, &k.featureSet); err != nil { return err } log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) // Verify that the FeatureSet is usable on this host. We do this before // Kernel load so that the explicit CPUID mismatch error has priority // over floating point state restore errors that may occur on load on // an incompatible machine. if err := k.featureSet.CheckHostCompatible(); err != nil { return err } // Load the kernel state. kernelStart := time.Now() stats, err := state.Load(ctx, r, k) if err != nil { return err } log.Infof("Kernel load stats: %s", stats.String()) log.Infof("Kernel load took [%s].", time.Since(kernelStart)) if parallelMfLoad { mfLoadWg.Wait() } else { mfLoadErr = k.loadMemoryFiles(ctx, r, pagesMetadata, pagesFile) } if mfLoadErr != nil { return mfLoadErr } // rootNetworkNamespace should be populated after loading the state file. // Restore the root network stack. k.rootNetworkNamespace.RestoreRootStack(net) k.Timekeeper().SetClocks(clocks, k.vdsoParams) if timeReady != nil { close(timeReady) } if net != nil { net.Restore() } if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil { return err } tcpip.AsyncLoading.Wait() log.Infof("Overall load took [%s] after async work", time.Since(loadStart)) // Applications may size per-cpu structures based on k.applicationCores, so // it can't change across save/restore. When we are virtualizing CPU // numbers, this isn't a problem. However, when we are exposing host CPU // assignments, we can't tolerate an increase in the number of host CPUs, // which could result in getcpu(2) returning CPUs that applications expect // not to exist. if k.useHostCores && initAppCores > k.applicationCores { return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores) } return nil } func (k *Kernel) loadMemoryFiles(ctx context.Context, r io.Reader, pagesMetadata, pagesFile *fd.FD) error { // Load the memory files' state. 
memoryStart := time.Now() pmr := r if pagesMetadata != nil { pmr = pagesMetadata } var pr *statefile.AsyncReader if pagesFile != nil { pr = statefile.NewAsyncReader(pagesFile, 0 /* off */) defer pr.Close() } if err := k.mf.LoadFrom(ctx, pmr, pr); err != nil { return err } if err := loadPrivateMFs(ctx, pmr, pr); err != nil { return err } if pr != nil { if err := pr.Wait(); err != nil { return err } } log.Infof("Memory files load took [%s].", time.Since(memoryStart)) return nil } // UniqueID returns a unique identifier. func (k *Kernel) UniqueID() uint64 { id := k.uniqueID.Add(1) if id == 0 { panic("unique identifier generator wrapped around") } return id } // CreateProcessArgs holds arguments to kernel.CreateProcess. type CreateProcessArgs struct { // Filename is the filename to load as the init binary. // // If this is provided as "", File will be checked, then the file will be // guessed via Argv[0]. Filename string // File is a passed host FD pointing to a file to load as the init binary. // // This is checked if and only if Filename is "". File *vfs.FileDescription // Argv is a list of arguments. Argv []string // Envv is a list of environment variables. Envv []string // WorkingDirectory is the initial working directory. // // This defaults to the root if empty. WorkingDirectory string // Credentials is the initial credentials. Credentials *auth.Credentials // FDTable is the initial set of file descriptors. If CreateProcess succeeds, // it takes a reference on FDTable. FDTable *FDTable // Umask is the initial umask. Umask uint // Limits are the initial resource limits. Limits *limits.LimitSet // MaxSymlinkTraversals is the maximum number of symlinks to follow // during resolution. MaxSymlinkTraversals uint // UTSNamespace is the initial UTS namespace. UTSNamespace *UTSNamespace // IPCNamespace is the initial IPC namespace. IPCNamespace *IPCNamespace // PIDNamespace is the initial PID Namespace. PIDNamespace *PIDNamespace // MountNamespace optionally contains the mount namespace for this // process. If nil, the init process's mount namespace is used. // // Anyone setting MountNamespace must donate a reference (i.e. // increment it). MountNamespace *vfs.MountNamespace // ContainerID is the container that the process belongs to. ContainerID string // InitialCgroups are the cgroups the container is initialized to. InitialCgroups map[Cgroup]struct{} // Origin indicates how the task was first created. Origin TaskOrigin } // NewContext returns a context.Context that represents the task that will be // created by args.NewContext(k). func (args *CreateProcessArgs) NewContext(k *Kernel) context.Context { return &createProcessContext{ Context: context.Background(), kernel: k, args: args, } } // createProcessContext is a context.Context that represents the context // associated with a task that is being created. type createProcessContext struct { context.Context kernel *Kernel args *CreateProcessArgs } // Value implements context.Context.Value. 
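// createProcessSketch is a hypothetical, illustrative use of CreateProcessArgs
// (defined above) by a control-server-like caller; it is not part of the
// gVisor API. The binary path, environment, container ID, umask and symlink
// traversal budget are placeholder values, and namespace/FD-table reference
// counting is deliberately glossed over.
func createProcessSketch(k *Kernel, creds *auth.Credentials, fdt *FDTable, lim *limits.LimitSet) (*ThreadGroup, error) {
	args := CreateProcessArgs{
		Filename:             "/bin/sh", // hypothetical binary
		Argv:                 []string{"/bin/sh"},
		Envv:                 []string{"HOME=/"},
		WorkingDirectory:     "/",
		Credentials:          creds,
		FDTable:              fdt,
		Umask:                0022,
		Limits:               lim,
		MaxSymlinkTraversals: 10,
		UTSNamespace:         k.RootUTSNamespace(),
		IPCNamespace:         k.RootIPCNamespace(),
		PIDNamespace:         k.RootPIDNamespace(),
		ContainerID:          "demo", // hypothetical container ID
	}
	tg, _, err := k.CreateProcess(args)
	if err != nil {
		return nil, err
	}
	// If k.Start() has already been called, the new thread group must be
	// started explicitly.
	k.StartProcess(tg)
	return tg, nil
}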
func (ctx *createProcessContext) Value(key any) any { switch key { case CtxKernel: return ctx.kernel case CtxPIDNamespace: return ctx.args.PIDNamespace case CtxUTSNamespace: utsns := ctx.args.UTSNamespace utsns.IncRef() return utsns case ipc.CtxIPCNamespace: ipcns := ctx.args.IPCNamespace ipcns.IncRef() return ipcns case auth.CtxCredentials: return ctx.args.Credentials case vfs.CtxRoot: if ctx.args.MountNamespace == nil { return nil } root := ctx.args.MountNamespace.Root(ctx) return root case vfs.CtxMountNamespace: if ctx.kernel.globalInit == nil { return nil } mntns := ctx.kernel.GlobalInit().Leader().MountNamespace() mntns.IncRef() return mntns case devutil.CtxDevGoferClient: return ctx.kernel.GetDevGoferClient(ctx.kernel.ContainerName(ctx.args.ContainerID)) case inet.CtxStack: return ctx.kernel.RootNetworkNamespace().Stack() case ktime.CtxRealtimeClock: return ctx.kernel.RealtimeClock() case limits.CtxLimits: return ctx.args.Limits case pgalloc.CtxMemoryCgroupID: return ctx.getMemoryCgroupID() case pgalloc.CtxMemoryFile: return ctx.kernel.mf case platform.CtxPlatform: return ctx.kernel case uniqueid.CtxGlobalUniqueID: return ctx.kernel.UniqueID() case uniqueid.CtxGlobalUniqueIDProvider: return ctx.kernel case uniqueid.CtxInotifyCookie: return ctx.kernel.GenerateInotifyCookie() case unimpl.CtxEvents: return ctx.kernel default: return nil } } func (ctx *createProcessContext) getMemoryCgroupID() uint32 { for cg := range ctx.args.InitialCgroups { for _, ctl := range cg.Controllers() { if ctl.Type() == CgroupControllerMemory { return cg.ID() } } } return InvalidCgroupID } // CreateProcess creates a new task in a new thread group with the given // options. The new task has no parent and is in the root PID namespace. // // If k.Start() has already been called, then the created process must be // started by calling kernel.StartProcess(tg). // // If k.Start() has not yet been called, then the created task will begin // running when k.Start() is called. // // CreateProcess has no analogue in Linux; it is used to create the initial // application task, as well as processes started by the control server. func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) { k.extMu.Lock() defer k.extMu.Unlock() log.Infof("EXEC: %v", args.Argv) ctx := args.NewContext(k) mntns := args.MountNamespace if mntns == nil { if k.globalInit == nil { return nil, 0, fmt.Errorf("mount namespace is nil") } // Add a reference to the namespace, which is transferred to the new process. mntns = k.globalInit.Leader().MountNamespace() mntns.IncRef() } // Get the root directory from the MountNamespace. root := mntns.Root(ctx) defer root.DecRef(ctx) // Grab the working directory. wd := root // Default. if args.WorkingDirectory != "" { pop := vfs.PathOperation{ Root: root, Start: wd, Path: fspath.Parse(args.WorkingDirectory), FollowFinalSymlink: true, } // NOTE(b/236028361): Do not set CheckSearchable flag to true. // Application is allowed to start with a working directory that it can // not access/search. This is consistent with Docker and VFS1. Runc // explicitly allows for this in 6ce2d63a5db6 ("libct/init_linux: retry // chdir to fix EPERM"). As described in the commit, runc unintentionally // allowed this behavior in a couple of releases and applications started // relying on it. So they decided to allow it for backward compatibility. 
var err error wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{}) if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } defer wd.DecRef(ctx) } fsContext := NewFSContext(root, wd, args.Umask) tg := k.NewThreadGroup(args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) cu := cleanup.Make(func() { tg.Release(ctx) }) defer cu.Clean() // Check which file to start from. switch { case args.Filename != "": // If a filename is given, take that. // Set File to nil so we resolve the path in LoadTaskImage. args.File = nil case args.File != nil: // If File is set, take the File provided directly. args.Filename = args.File.MappedName(ctx) default: // Otherwise look at Argv and see if the first argument is a valid path. if len(args.Argv) == 0 { return nil, 0, fmt.Errorf("no filename or command provided") } if !filepath.IsAbs(args.Argv[0]) { return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) } args.Filename = args.Argv[0] } // Create a fresh task context. remainingTraversals := args.MaxSymlinkTraversals loadArgs := loader.LoadArgs{ Root: root, WorkingDir: wd, RemainingTraversals: &remainingTraversals, ResolveFinal: true, Filename: args.Filename, File: args.File, CloseOnExec: false, Argv: args.Argv, Envv: args.Envv, Features: k.featureSet, } image, se := k.LoadTaskImage(ctx, loadArgs) if se != nil { return nil, 0, errors.New(se.String()) } var capData auth.VfsCapData if len(image.FileCaps()) != 0 { var err error capData, err = auth.VfsCapDataOf([]byte(image.FileCaps())) if err != nil { return nil, 0, err } } creds, err := auth.CapsFromVfsCaps(capData, args.Credentials) if err != nil { return nil, 0, err } args.FDTable.IncRef() // Create the task. config := &TaskConfig{ Kernel: k, ThreadGroup: tg, TaskImage: image, FSContext: fsContext, FDTable: args.FDTable, Credentials: creds, NetworkNamespace: k.RootNetworkNamespace(), AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, MountNamespace: mntns, ContainerID: args.ContainerID, InitialCgroups: args.InitialCgroups, UserCounters: k.GetUserCounters(args.Credentials.RealKUID), Origin: args.Origin, // A task with no parent starts out with no session keyring. SessionKeyring: nil, } config.UTSNamespace.IncRef() config.IPCNamespace.IncRef() config.NetworkNamespace.IncRef() t, err := k.tasks.NewTask(ctx, config) if err != nil { return nil, 0, err } t.traceExecEvent(image) // Simulate exec for tracing. // Success. cu.Release() tgid := k.tasks.Root.IDOfThreadGroup(tg) if k.globalInit == nil { k.globalInit = tg } return tg, tgid, nil } // StartProcess starts running a process that was created with CreateProcess. func (k *Kernel) StartProcess(tg *ThreadGroup) { t := tg.Leader() tid := k.tasks.Root.IDOfTask(t) t.Start(tid) } // Start starts execution of all tasks in k. // // Preconditions: Start may be called exactly once. func (k *Kernel) Start() error { k.extMu.Lock() defer k.extMu.Unlock() if k.started { return fmt.Errorf("kernel already started") } k.started = true k.cpuClockTickTimer = time.NewTimer(linux.ClockTick) k.runningTasksMu.Lock() k.cpuClockTickerRunning = true k.runningTasksMu.Unlock() go k.runCPUClockTicker() // If k was created by LoadKernelFrom, timers were stopped during // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, // this is a no-op. 
k.resumeTimeLocked(k.SupervisorContext()) k.tasks.mu.RLock() ts := make([]*Task, 0, len(k.tasks.Root.tids)) for t := range k.tasks.Root.tids { ts = append(ts, t) } k.tasks.mu.RUnlock() // Start task goroutines. // NOTE(b/235349091): We don't actually need the TaskSet mutex, we just // need to make sure we only call t.Start() once for each task. Holding the // mutex for each task start may cause a nested locking error. for _, t := range ts { t.Start(t.ThreadID()) } return nil } // pauseTimeLocked pauses all Timers and Timekeeper updates. // // Preconditions: // - Any task goroutines running in k must be stopped. // - k.extMu must be locked. func (k *Kernel) pauseTimeLocked(ctx context.Context) { // Since all task goroutines have been stopped by precondition, the CPU clock // ticker should stop on its own; wait for it to do so, waking it up from // sleeping between ticks if necessary. k.runningTasksMu.Lock() for k.cpuClockTickerRunning { select { case k.cpuClockTickerWakeCh <- struct{}{}: default: } k.cpuClockTickerStopCond.Wait() } k.runningTasksMu.Unlock() // By precondition, nothing else can be interacting with PIDNamespace.tids // or FDTable.files, so we can iterate them without synchronization. (We // can't hold the TaskSet mutex when pausing thread group timers because // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet // mutex, while holding the Timer mutex.) for t := range k.tasks.Root.tids { if t == t.tg.leader { t.tg.itimerRealTimer.Pause() for _, it := range t.tg.timers { it.PauseTimer() } } // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { t.fdTable.ForEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) bool { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.PauseTimer() } return true }) } } k.timekeeper.PauseUpdates() } // resumeTimeLocked resumes all Timers and Timekeeper updates. If // pauseTimeLocked has not been previously called, resumeTimeLocked has no // effect. // // Preconditions: // - Any task goroutines running in k must be stopped. // - k.extMu must be locked. func (k *Kernel) resumeTimeLocked(ctx context.Context) { // The CPU clock ticker will automatically resume as task goroutines resume // execution. k.timekeeper.ResumeUpdates(k.vdsoParams) for t := range k.tasks.Root.tids { if t == t.tg.leader { t.tg.itimerRealTimer.Resume() for _, it := range t.tg.timers { it.ResumeTimer() } } if t.fdTable != nil { t.fdTable.ForEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) bool { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.ResumeTimer() } return true }) } } } func (k *Kernel) incRunningTasks() { for { tasks := k.runningTasks.Load() if tasks != 0 { // Standard case. Simply increment. if !k.runningTasks.CompareAndSwap(tasks, tasks+1) { continue } return } // Transition from 0 -> 1. k.runningTasksMu.Lock() if k.runningTasks.Load() != 0 { // Raced with another transition and lost. k.runningTasks.Add(1) k.runningTasksMu.Unlock() return } if !k.cpuClockTickerRunning { select { case tickTime := <-k.cpuClockTickTimer.C: // Rearm the timer since we consumed the wakeup. Estimate how much time // remains on the current tick so that periodic workloads interact with // the (periodic) CPU clock ticker in the same way that they would // without the optimization of putting the ticker to sleep. 
missedNS := time.Since(tickTime).Nanoseconds() missedTicks := missedNS / linux.ClockTick.Nanoseconds() thisTickNS := missedNS - missedTicks*linux.ClockTick.Nanoseconds() k.cpuClockTickTimer.Reset(time.Duration(linux.ClockTick.Nanoseconds() - thisTickNS)) // Increment k.cpuClock on the CPU clock ticker goroutine's behalf. // (Whole missed ticks don't matter, and adding them to k.cpuClock will // just confuse the watchdog.) At the time the tick occurred, all task // goroutines were asleep, so there's nothing else to do. This ensures // that our caller (Task.accountTaskGoroutineLeave()) records an // updated k.cpuClock in Task.gosched.Timestamp, so that it's correctly // accounted as having resumed execution in the sentry during this tick // instead of at the end of the previous one. k.cpuClock.Add(1) default: } // We are transitioning from idle to active. Set k.cpuClockTickerRunning // = true here so that if we transition to idle and then active again // before the CPU clock ticker goroutine has a chance to run, the first // call to k.incRunningTasks() at the end of that cycle does not try to // steal k.cpuClockTickTimer.C again, as this would allow workloads that // rapidly cycle between idle and active to starve the CPU clock ticker // of chances to observe task goroutines in a running state and account // their CPU usage. k.cpuClockTickerRunning = true k.runningTasksCond.Signal() } // This store must happen after the increment of k.cpuClock above to ensure // that concurrent calls to Task.accountTaskGoroutineLeave() also observe // the updated k.cpuClock. k.runningTasks.Store(1) k.runningTasksMu.Unlock() return } } func (k *Kernel) decRunningTasks() { tasks := k.runningTasks.Add(-1) if tasks < 0 { panic(fmt.Sprintf("Invalid running count %d", tasks)) } // Nothing to do. The next CPU clock tick will disable the timer if // there is still nothing running. This provides approximately one tick // of slack in which we can switch back and forth between idle and // active without an expensive transition. } // WaitExited blocks until all tasks in k have exited. func (k *Kernel) WaitExited() { k.tasks.liveGoroutines.Wait() } // Kill requests that all tasks in k immediately exit as if group exiting with // status ws. Kill does not wait for tasks to exit. func (k *Kernel) Kill(ws linux.WaitStatus) { k.extMu.Lock() defer k.extMu.Unlock() k.tasks.Kill(ws) } // Pause requests that all tasks in k temporarily stop executing, and blocks // until all tasks and asynchronous I/O operations in k have stopped. Multiple // calls to Pause nest and require an equal number of calls to Unpause to // resume execution. func (k *Kernel) Pause() { k.extMu.Lock() k.tasks.BeginExternalStop() k.extMu.Unlock() k.tasks.runningGoroutines.Wait() k.tasks.aioGoroutines.Wait() } // IsPaused returns true if the kernel is currently paused. func (k *Kernel) IsPaused() bool { return k.tasks.isExternallyStopped() } // ReceiveTaskStates receives full states for all tasks. func (k *Kernel) ReceiveTaskStates() { k.extMu.Lock() k.tasks.PullFullState() k.extMu.Unlock() } // Unpause ends the effect of a previous call to Pause. If Unpause is called // without a matching preceding call to Pause, Unpause may panic. func (k *Kernel) Unpause() { k.extMu.Lock() defer k.extMu.Unlock() k.tasks.EndExternalStop() } // SendExternalSignal injects a signal into the kernel. // // context is used only for debugging to describe how the signal was received. // // Preconditions: Kernel must have an init process. 
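// signalContainerSketch is a hypothetical, illustrative wrapper over the
// external-signal API defined below; it is not part of the gVisor API. The
// SignalInfo layout (in particular the Signo field) is assumed from the
// abi/linux package rather than shown in this file.
func signalContainerSketch(k *Kernel, cid string, sig linux.Signal) error {
	info := &linux.SignalInfo{Signo: int32(sig)}
	return k.SendContainerSignal(cid, info)
}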
func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) { k.extMu.Lock() defer k.extMu.Unlock() k.sendExternalSignal(info, context) } // SendExternalSignalThreadGroup injects a signal into an specific ThreadGroup. // // This function doesn't skip signals like SendExternalSignal does. func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error { k.extMu.Lock() defer k.extMu.Unlock() return tg.SendSignal(info) } // SendExternalSignalProcessGroup sends a signal to all ThreadGroups in the // given process group. // // This function doesn't skip signals like SendExternalSignal does. func (k *Kernel) SendExternalSignalProcessGroup(pg *ProcessGroup, info *linux.SignalInfo) error { k.extMu.Lock() defer k.extMu.Unlock() // If anything goes wrong, we'll return the error, but still try our // best to deliver to other processes in the group. var firstErr error for _, tg := range k.TaskSet().Root.ThreadGroups() { if tg.ProcessGroup() != pg { continue } if err := tg.SendSignal(info); err != nil && firstErr == nil { firstErr = err } } return firstErr } // SendContainerSignal sends the given signal to all processes inside the // namespace that match the given container ID. func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error { k.extMu.Lock() defer k.extMu.Unlock() k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() var lastErr error for tg := range k.tasks.Root.tgids { if tg.leader.ContainerID() == cid { tg.signalHandlers.mu.Lock() infoCopy := *info if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { lastErr = err } tg.signalHandlers.mu.Unlock() } } return lastErr } // RebuildTraceContexts rebuilds the trace context for all tasks. // // Unfortunately, if these are built while tracing is not enabled, then we will // not have meaningful trace data. Rebuilding here ensures that we can do so // after tracing has been enabled. func (k *Kernel) RebuildTraceContexts() { // We need to pause all task goroutines because Task.rebuildTraceContext() // replaces Task.traceContext and Task.traceTask, which are // task-goroutine-exclusive (i.e. the task goroutine assumes that it can // access them without synchronization) for performance. k.Pause() defer k.Unpause() k.extMu.Lock() defer k.extMu.Unlock() k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() for t, tid := range k.tasks.Root.tids { t.rebuildTraceContext(tid) } } // FeatureSet returns the FeatureSet. func (k *Kernel) FeatureSet() cpuid.FeatureSet { return k.featureSet } // Timekeeper returns the Timekeeper. func (k *Kernel) Timekeeper() *Timekeeper { return k.timekeeper } // TaskSet returns the TaskSet. func (k *Kernel) TaskSet() *TaskSet { return k.tasks } // RootUserNamespace returns the root UserNamespace. func (k *Kernel) RootUserNamespace() *auth.UserNamespace { return k.rootUserNamespace } // RootUTSNamespace returns the root UTSNamespace. func (k *Kernel) RootUTSNamespace() *UTSNamespace { return k.rootUTSNamespace } // RootIPCNamespace takes a reference and returns the root IPCNamespace. func (k *Kernel) RootIPCNamespace() *IPCNamespace { return k.rootIPCNamespace } // RootPIDNamespace returns the root PIDNamespace. func (k *Kernel) RootPIDNamespace() *PIDNamespace { return k.tasks.Root } // RootNetworkNamespace returns the root network namespace, always non-nil. 
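// saveStatusSketch is a hypothetical, illustrative reader of SaveStatus
// (defined below), which folds three outcomes into one call; it is not part
// of the gVisor API.
func saveStatusSketch(k *Kernel) string {
	saved, autosaved, err := k.SaveStatus()
	switch {
	case err != nil:
		return "save failed: " + err.Error()
	case saved && autosaved:
		return "auto-save completed"
	case saved:
		return "save completed"
	default:
		return "not saved"
	}
}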
func (k *Kernel) RootNetworkNamespace() *inet.Namespace { return k.rootNetworkNamespace } // GlobalInit returns the thread group with ID 1 in the root PID namespace, or // nil if no such thread group exists. GlobalInit may return a thread group // containing no tasks if the thread group has already exited. func (k *Kernel) GlobalInit() *ThreadGroup { k.extMu.Lock() defer k.extMu.Unlock() return k.globalInit } // TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID namespace. func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) { k.globalInit = tg } // ApplicationCores returns the number of CPUs visible to sandboxed // applications. func (k *Kernel) ApplicationCores() uint { return k.applicationCores } // RealtimeClock returns the application CLOCK_REALTIME clock. func (k *Kernel) RealtimeClock() ktime.Clock { return k.timekeeper.realtimeClock } // MonotonicClock returns the application CLOCK_MONOTONIC clock. func (k *Kernel) MonotonicClock() ktime.Clock { return k.timekeeper.monotonicClock } // CPUClockNow returns the current value of k.cpuClock. func (k *Kernel) CPUClockNow() uint64 { return k.cpuClock.Load() } // Syslog returns the syslog. func (k *Kernel) Syslog() *syslog { return &k.syslog } // GenerateInotifyCookie generates a unique inotify event cookie. // // Returned values may overlap with previously returned values if the value // space is exhausted. 0 is not a valid cookie value, all other values // representable in a uint32 are allowed. func (k *Kernel) GenerateInotifyCookie() uint32 { id := k.nextInotifyCookie.Add(1) // Wrap-around is explicitly allowed for inotify event cookies. if id == 0 { id = k.nextInotifyCookie.Add(1) } return id } // NetlinkPorts returns the netlink port manager. func (k *Kernel) NetlinkPorts() *port.Manager { return k.netlinkPorts } var ( errSaved = errors.New("sandbox has been successfully saved") errAutoSaved = errors.New("sandbox has been successfully auto-saved") ) // SaveStatus returns the sandbox save status. If it was saved successfully, // autosaved indicates whether save was triggered by autosave. If it was not // saved successfully, err indicates the sandbox error that caused the kernel to // exit during save. func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) { k.extMu.Lock() defer k.extMu.Unlock() switch k.saveStatus { case nil: return false, false, nil case errSaved: return true, false, nil case errAutoSaved: return true, true, nil default: return false, false, k.saveStatus } } // SetSaveSuccess sets the flag indicating that save completed successfully, if // no status was already set. func (k *Kernel) SetSaveSuccess(autosave bool) { k.extMu.Lock() defer k.extMu.Unlock() if k.saveStatus == nil { if autosave { k.saveStatus = errAutoSaved } else { k.saveStatus = errSaved } } } // SetSaveError sets the sandbox error that caused the kernel to exit during // save, if one is not already set. func (k *Kernel) SetSaveError(err error) { k.extMu.Lock() defer k.extMu.Unlock() if k.saveStatus == nil { k.saveStatus = err } } // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or // LoadFrom. func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { k.mf = mf } // MemoryFile returns the MemoryFile that provides application memory. func (k *Kernel) MemoryFile() *pgalloc.MemoryFile { return k.mf } // SupervisorContext returns a Context with maximum privileges in k. It should // only be used by goroutines outside the control of the emulated kernel // defined by e. 
// // Callers are responsible for ensuring that the returned Context is not used // concurrently with changes to the Kernel. func (k *Kernel) SupervisorContext() context.Context { return &supervisorContext{ Kernel: k, Logger: log.Log(), } } // SocketRecord represents a socket recorded in Kernel.sockets. // // +stateify savable type SocketRecord struct { k *Kernel Sock *vfs.FileDescription ID uint64 // Socket table entry number. } // RecordSocket adds a socket to the system-wide socket table for // tracking. // // Precondition: Caller must hold a reference to sock. // // Note that the socket table will not hold a reference on the // vfs.FileDescription. func (k *Kernel) RecordSocket(sock *vfs.FileDescription) { k.extMu.Lock() if _, ok := k.sockets[sock]; ok { panic(fmt.Sprintf("Socket %p added twice", sock)) } id := k.nextSocketRecord k.nextSocketRecord++ s := &SocketRecord{ k: k, ID: id, Sock: sock, } k.sockets[sock] = s k.extMu.Unlock() } // DeleteSocket removes a socket from the system-wide socket table. func (k *Kernel) DeleteSocket(sock *vfs.FileDescription) { k.extMu.Lock() delete(k.sockets, sock) k.extMu.Unlock() } // ListSockets returns a snapshot of all sockets. // // Callers of ListSockets() should use SocketRecord.Sock.TryIncRef() // to get a reference on a socket in the table. func (k *Kernel) ListSockets() []*SocketRecord { k.extMu.Lock() var socks []*SocketRecord for _, s := range k.sockets { socks = append(socks, s) } k.extMu.Unlock() return socks } // supervisorContext is a privileged context. type supervisorContext struct { context.NoTask log.Logger *Kernel } // Deadline implements context.Context.Deadline. func (*Kernel) Deadline() (time.Time, bool) { return time.Time{}, false } // Done implements context.Context.Done. func (*Kernel) Done() <-chan struct{} { return nil } // Err implements context.Context.Err. func (*Kernel) Err() error { return nil } // Value implements context.Context. func (ctx *supervisorContext) Value(key any) any { switch key { case CtxCanTrace: // The supervisor context can trace anything. (None of // supervisorContext's users are expected to invoke ptrace, but ptrace // permissions are required for certain file accesses.) return func(*Task, bool) bool { return true } case CtxKernel: return ctx.Kernel case CtxPIDNamespace: return ctx.Kernel.tasks.Root case CtxUTSNamespace: utsns := ctx.Kernel.rootUTSNamespace utsns.IncRef() return utsns case ipc.CtxIPCNamespace: ipcns := ctx.Kernel.rootIPCNamespace ipcns.IncRef() return ipcns case auth.CtxCredentials: // The supervisor context is global root. return auth.NewRootCredentials(ctx.Kernel.rootUserNamespace) case vfs.CtxRoot: if ctx.Kernel.globalInit == nil || ctx.Kernel.globalInit.Leader() == nil { return vfs.VirtualDentry{} } root := ctx.Kernel.GlobalInit().Leader().MountNamespace().Root(ctx) return root case vfs.CtxMountNamespace: if ctx.Kernel.globalInit == nil || ctx.Kernel.globalInit.Leader() == nil { return nil } mntns := ctx.Kernel.GlobalInit().Leader().MountNamespace() mntns.IncRef() return mntns case inet.CtxStack: return ctx.Kernel.RootNetworkNamespace().Stack() case ktime.CtxRealtimeClock: return ctx.Kernel.RealtimeClock() case limits.CtxLimits: // No limits apply. 
return limits.NewLimitSet() case pgalloc.CtxMemoryFile: return ctx.Kernel.mf case platform.CtxPlatform: return ctx.Kernel case uniqueid.CtxGlobalUniqueID: return ctx.Kernel.UniqueID() case uniqueid.CtxGlobalUniqueIDProvider: return ctx.Kernel case uniqueid.CtxInotifyCookie: return ctx.Kernel.GenerateInotifyCookie() case unimpl.CtxEvents: return ctx.Kernel case cpuid.CtxFeatureSet: return ctx.Kernel.featureSet default: return nil } } // Rate limits for the number of unimplemented syscall events. const ( unimplementedSyscallsMaxRate = 100 // events per second unimplementedSyscallBurst = 1000 // events ) // EmitUnimplementedEvent emits an UnimplementedSyscall event via the event // channel. func (k *Kernel) EmitUnimplementedEvent(ctx context.Context, sysno uintptr) { k.unimplementedSyscallEmitterOnce.Do(func() { k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst) }) t := TaskFromContext(ctx) IncrementUnimplementedSyscallCounter(sysno) _, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{ Tid: int32(t.ThreadID()), Registers: t.Arch().StateData().Proto(), }) } // VFS returns the virtual filesystem for the kernel. func (k *Kernel) VFS() *vfs.VirtualFilesystem { return &k.vfs } // SetHostMount sets the hostfs mount. func (k *Kernel) SetHostMount(mnt *vfs.Mount) { if k.hostMount != nil { panic("Kernel.hostMount cannot be set more than once") } k.hostMount = mnt } // AddStateToCheckpoint adds a key-value pair to be additionally checkpointed. func (k *Kernel) AddStateToCheckpoint(key, v any) { k.checkpointMu.Lock() defer k.checkpointMu.Unlock() if k.additionalCheckpointState == nil { k.additionalCheckpointState = make(map[any]any) } k.additionalCheckpointState[key] = v } // PopCheckpointState pops a key-value pair from the additional checkpoint // state. If the key doesn't exist, nil is returned. func (k *Kernel) PopCheckpointState(key any) any { k.checkpointMu.Lock() defer k.checkpointMu.Unlock() if v, ok := k.additionalCheckpointState[key]; ok { delete(k.additionalCheckpointState, key) return v } return nil } // HostMount returns the hostfs mount. func (k *Kernel) HostMount() *vfs.Mount { return k.hostMount } // PipeMount returns the pipefs mount. func (k *Kernel) PipeMount() *vfs.Mount { return k.pipeMount } // GetNamespaceInode returns a new nsfs inode which serves as a reference counter for the namespace. func (k *Kernel) GetNamespaceInode(ctx context.Context, ns vfs.Namespace) refs.TryRefCounter { return nsfs.NewInode(ctx, k.nsfsMount, ns) } // ShmMount returns the tmpfs mount. func (k *Kernel) ShmMount() *vfs.Mount { return k.shmMount } // SocketMount returns the sockfs mount. func (k *Kernel) SocketMount() *vfs.Mount { return k.socketMount } // CgroupRegistry returns the cgroup registry. func (k *Kernel) CgroupRegistry() *CgroupRegistry { return k.cgroupRegistry } // AddCgroupMount adds the cgroup mounts to the cgroupMountsMap. These cgroup // mounts are created during the creation of root container process and the // reference ownership is transferred to the kernel. func (k *Kernel) AddCgroupMount(ctl string, mnt *CgroupMount) { k.cgroupMountsMapMu.Lock() defer k.cgroupMountsMapMu.Unlock() if k.cgroupMountsMap == nil { k.cgroupMountsMap = make(map[string]*CgroupMount) } k.cgroupMountsMap[ctl] = mnt } // GetCgroupMount returns the cgroup mount for the given cgroup controller. 
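// stashStateSketch is a hypothetical, illustrative use of the
// additional-checkpoint-state API above, which acts as a keyed stash carried
// across a checkpoint; it is not part of the gVisor API and the key below is
// made up.
func stashStateSketch(k *Kernel) {
	k.AddStateToCheckpoint("sketch/example-key", "payload to carry across save")
	// ...save/restore happens elsewhere...
	if v := k.PopCheckpointState("sketch/example-key"); v != nil {
		_ = v.(string) // a second Pop for the same key returns nil
	}
}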
func (k *Kernel) GetCgroupMount(ctl string) *CgroupMount { k.cgroupMountsMapMu.Lock() defer k.cgroupMountsMapMu.Unlock() return k.cgroupMountsMap[ctl] } // releaseCgroupMounts releases the cgroup mounts. func (k *Kernel) releaseCgroupMounts(ctx context.Context) { k.cgroupMountsMapMu.Lock() defer k.cgroupMountsMapMu.Unlock() for _, m := range k.cgroupMountsMap { m.Mount.DecRef(ctx) m.Root.DecRef(ctx) m.Fs.DecRef(ctx) } } // Release releases resources owned by k. // // Precondition: This should only be called after the kernel is fully // initialized, e.g. after k.Start() has been called. func (k *Kernel) Release() { ctx := k.SupervisorContext() k.releaseCgroupMounts(ctx) k.hostMount.DecRef(ctx) k.pipeMount.DecRef(ctx) k.nsfsMount.DecRef(ctx) k.shmMount.DecRef(ctx) k.socketMount.DecRef(ctx) k.vfs.Release(ctx) k.timekeeper.Destroy() k.vdso.Release(ctx) k.RootNetworkNamespace().DecRef(ctx) k.rootIPCNamespace.DecRef(ctx) k.rootUTSNamespace.DecRef(ctx) k.cleaupDevGofers() k.mf.Destroy() } // PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup // hierarchy. // // Precondition: root must be a new cgroup with no tasks. This implies the // controllers for root are also new and currently manage no task, which in turn // implies the new cgroup can be populated without migrating tasks between // cgroups. func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) { k.tasks.mu.RLock() k.tasks.forEachTaskLocked(func(t *Task) { if t.exitState != TaskExitNone { return } t.mu.Lock() // A task can be in the cgroup if it has been created after the // cgroup hierarchy was registered. t.enterCgroupIfNotYetLocked(root) t.mu.Unlock() }) k.tasks.mu.RUnlock() } // ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the // hierarchy with the provided id. This is intended for use during hierarchy // teardown, as otherwise the tasks would be orphaned w.r.t to some controllers. func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) { var releasedCGs []Cgroup k.tasks.mu.RLock() // We'll have one cgroup per hierarchy per task. releasedCGs = make([]Cgroup, 0, len(k.tasks.Root.tids)) k.tasks.forEachTaskLocked(func(t *Task) { if t.exitState != TaskExitNone { return } t.mu.Lock() for cg := range t.cgroups { if cg.HierarchyID() == hid { cg.Leave(t) t.ResetMemCgIDFromCgroup(cg) delete(t.cgroups, cg) releasedCGs = append(releasedCGs, cg) // A task can't be part of multiple cgroups from the same // hierarchy, so we can skip checking the rest once we find a // match. break } } t.mu.Unlock() }) k.tasks.mu.RUnlock() for _, c := range releasedCGs { c.decRef() } } // ReplaceFSContextRoots updates root and cwd to `newRoot` in the FSContext // across all tasks whose old root or cwd were `oldRoot`. func (k *Kernel) ReplaceFSContextRoots(ctx context.Context, oldRoot vfs.VirtualDentry, newRoot vfs.VirtualDentry) { k.tasks.mu.RLock() oldRootDecRefs := 0 k.tasks.forEachTaskLocked(func(t *Task) { t.mu.Lock() defer t.mu.Unlock() if fsc := t.fsContext; fsc != nil { fsc.mu.Lock() defer fsc.mu.Unlock() if fsc.root == oldRoot { newRoot.IncRef() oldRootDecRefs++ fsc.root = newRoot } if fsc.cwd == oldRoot { newRoot.IncRef() oldRootDecRefs++ fsc.cwd = newRoot } } }) k.tasks.mu.RUnlock() for i := 0; i < oldRootDecRefs; i++ { oldRoot.DecRef(ctx) } } // GetUserCounters returns the user counters for the given KUID. 
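// asyncCheckpointSketch is a hypothetical, illustrative caller of the
// asynchronous checkpoint bookkeeping defined below (Saver,
// ResetCheckpointStatus, OnCheckpointAttempt, WaitCheckpoint); it is not part
// of the gVisor API. Whatever component completes the save is assumed to call
// OnCheckpointAttempt with the final status.
func asyncCheckpointSketch(k *Kernel) error {
	target := k.CheckpointCount() + 1 // the next successful checkpoint
	k.ResetCheckpointStatus()
	s := k.Saver()
	if s == nil {
		return fmt.Errorf("no Saver registered")
	}
	if err := s.SaveAsync(); err != nil {
		k.OnCheckpointAttempt(err)
		return err
	}
	return k.WaitCheckpoint(target)
}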
func (k *Kernel) GetUserCounters(uid auth.KUID) *UserCounters { k.userCountersMapMu.Lock() defer k.userCountersMapMu.Unlock() if uc, ok := k.userCountersMap[uid]; ok { return uc } uc := &UserCounters{} k.userCountersMap[uid] = uc return uc } // AddDevGofer initializes the dev gofer connection and starts tracking it. // It takes ownership of goferFD. func (k *Kernel) AddDevGofer(contName string, goferFD int) error { client, err := devutil.NewGoferClient(k.SupervisorContext(), contName, goferFD) if err != nil { return err } k.devGofersMu.Lock() defer k.devGofersMu.Unlock() if k.devGofers == nil { k.devGofers = make(map[string]*devutil.GoferClient) } k.devGofers[contName] = client return nil } // RemoveDevGofer closes the dev gofer connection, if one exists, and stops // tracking it. func (k *Kernel) RemoveDevGofer(contName string) { k.devGofersMu.Lock() defer k.devGofersMu.Unlock() client, ok := k.devGofers[contName] if !ok { return } client.Close() delete(k.devGofers, contName) } // GetDevGoferClient implements // devutil.GoferClientProviderFromContext.GetDevGoferClient. func (k *Kernel) GetDevGoferClient(contName string) *devutil.GoferClient { k.devGofersMu.Lock() defer k.devGofersMu.Unlock() return k.devGofers[contName] } func (k *Kernel) cleaupDevGofers() { k.devGofersMu.Lock() defer k.devGofersMu.Unlock() for _, client := range k.devGofers { client.Close() } k.devGofers = nil } // RegisterContainerName registers a container name for a given container ID. func (k *Kernel) RegisterContainerName(cid, containerName string) { k.extMu.Lock() defer k.extMu.Unlock() k.containerNames[cid] = containerName } // RestoreContainerMapping remaps old container IDs to new ones after a restore. // containerIDs maps "name -> new container ID". Note that container names remain // constant between restore sessions. func (k *Kernel) RestoreContainerMapping(containerIDs map[string]string) { k.extMu.Lock() defer k.extMu.Unlock() // Delete mapping from old session and replace with new values. k.containerNames = make(map[string]string) for name, cid := range containerIDs { k.containerNames[cid] = name } } // ContainerName returns the container name for a given container ID. func (k *Kernel) ContainerName(cid string) string { k.extMu.Lock() defer k.extMu.Unlock() return k.containerNames[cid] } // SetSaver sets the kernel's Saver. // Thread-compatible. func (k *Kernel) SetSaver(s Saver) { k.checkpointMu.Lock() defer k.checkpointMu.Unlock() k.saver = s } // Saver returns the kernel's Saver. // Thread-compatible. func (k *Kernel) Saver() Saver { k.checkpointMu.Lock() defer k.checkpointMu.Unlock() return k.saver } // IncCheckpointCount increments the checkpoint counter. func (k *Kernel) IncCheckpointCount() { k.checkpointMu.Lock() defer k.checkpointMu.Unlock() k.checkpointCounter++ } // CheckpointCount returns the current checkpoint count. Note that the result // may be stale by the time the caller uses it. func (k *Kernel) CheckpointCount() uint32 { k.checkpointMu.Lock() defer k.checkpointMu.Unlock() return k.checkpointCounter } // OnCheckpointAttempt is called when a checkpoint attempt is completed. err is // any checkpoint errors that may have occurred. func (k *Kernel) OnCheckpointAttempt(err error) { k.checkpointMu.Lock() defer k.checkpointMu.Unlock() if err == nil { k.checkpointCounter++ } k.lastCheckpointStatus = err k.checkpointCond.Broadcast() } // ResetCheckpointStatus resets the last checkpoint status, indicating a new // checkpoint is in progress. 
Caller must call OnCheckpointAttempt when the // checkpoint attempt is completed. func (k *Kernel) ResetCheckpointStatus() { k.checkpointMu.Lock() defer k.checkpointMu.Unlock() k.lastCheckpointStatus = nil } // WaitCheckpoint waits for the Kernel to have been successfully checkpointed // n-1 times, then waits for either the n-th successful checkpoint (in which // case it returns nil) or any number of failed checkpoints (in which case it // returns an error returned by any such failure). func (k *Kernel) WaitCheckpoint(n uint32) error { if n == 0 { return nil } k.checkpointMu.Lock() defer k.checkpointMu.Unlock() if k.checkpointCounter >= n { // n-th checkpoint already completed successfully. return nil } for k.checkpointCounter < n { if k.checkpointCounter == n-1 && k.lastCheckpointStatus != nil { // n-th checkpoint was attempted but it had failed. return k.lastCheckpointStatus } k.checkpointCond.Wait() } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_abi_autogen_unsafe.go000066400000000000000000000207561465435605700277470ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package kernel import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*ThreadID)(nil) var _ marshal.Marshallable = (*vdsoParams)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. //go:nosplit func (tid *ThreadID) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (tid *ThreadID) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(*tid)) return dst[4:] } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (tid *ThreadID) UnmarshalBytes(src []byte) []byte { *tid = ThreadID(int32(hostarch.ByteOrder.Uint32(src[:4]))) return src[4:] } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (tid *ThreadID) Packed() bool { // Scalar newtypes are always packed. return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (tid *ThreadID) MarshalUnsafe(dst []byte) []byte { size := tid.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(tid), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (tid *ThreadID) UnmarshalUnsafe(src []byte) []byte { size := tid.SizeBytes() gohacks.Memmove(unsafe.Pointer(tid), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (tid *ThreadID) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(tid))) hdr.Len = tid.SizeBytes() hdr.Cap = tid.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that tid // must live until the use above. runtime.KeepAlive(tid) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (tid *ThreadID) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return tid.CopyOutN(cc, addr, tid.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. 
func (tid *ThreadID) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(tid))) hdr.Len = tid.SizeBytes() hdr.Cap = tid.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that tid // must live until the use above. runtime.KeepAlive(tid) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (tid *ThreadID) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return tid.CopyInN(cc, addr, tid.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (tid *ThreadID) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(tid))) hdr.Len = tid.SizeBytes() hdr.Cap = tid.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that tid // must live until the use above. runtime.KeepAlive(tid) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (v *vdsoParams) SizeBytes() int { return 64 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (v *vdsoParams) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.monotonicReady)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.monotonicBaseCycles)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.monotonicBaseRef)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.monotonicFrequency)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.realtimeReady)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.realtimeBaseCycles)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.realtimeBaseRef)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(v.realtimeFrequency)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (v *vdsoParams) UnmarshalBytes(src []byte) []byte { v.monotonicReady = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] v.monotonicBaseCycles = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] v.monotonicBaseRef = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] v.monotonicFrequency = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] v.realtimeReady = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] v.realtimeBaseCycles = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] v.realtimeBaseRef = int64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] v.realtimeFrequency = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (v *vdsoParams) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (v *vdsoParams) MarshalUnsafe(dst []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(v), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
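// threadIDRoundTripSketch is a hypothetical, illustrative exercise of the
// generated Marshallable methods in this file; it is not part of the gVisor
// API. A ThreadID occupies SizeBytes() bytes and round-trips through
// MarshalBytes/UnmarshalBytes, each of which consumes its buffer and returns
// the remainder.
func threadIDRoundTripSketch() bool {
	src := ThreadID(42)
	buf := make([]byte, src.SizeBytes())
	_ = src.MarshalBytes(buf) // fills buf; returns the empty remainder
	var dst ThreadID
	_ = dst.UnmarshalBytes(buf) // reads the same 4 bytes back
	return dst == src
}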
func (v *vdsoParams) UnmarshalUnsafe(src []byte) []byte { size := v.SizeBytes() gohacks.Memmove(unsafe.Pointer(v), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (v *vdsoParams) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (v *vdsoParams) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyOutN(cc, addr, v.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (v *vdsoParams) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (v *vdsoParams) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return v.CopyInN(cc, addr, v.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (v *vdsoParams) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(v))) hdr.Len = v.SizeBytes() hdr.Cap = v.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that v // must live until the use above. runtime.KeepAlive(v) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_amd64_abi_autogen_unsafe.go000066400000000000000000000007351465435605700307350ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build amd64 // +build amd64 package kernel import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_amd64_state_autogen.go000066400000000000000000000001321465435605700277500ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 // +build amd64 package kernel golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_arm64_abi_autogen_unsafe.go000066400000000000000000000007351465435605700307530ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. 
// If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build arm64 // +build arm64 package kernel import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_arm64_state_autogen.go000066400000000000000000000001321465435605700277660ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package kernel golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_impl_abi_autogen_unsafe.go000066400000000000000000000007371465435605700307650ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build !false // +build !false package kernel import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_impl_state_autogen.go000066400000000000000000000013011465435605700277750ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package kernel import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *pidNamespaceData) StateTypeName() string { return "pkg/sentry/kernel.pidNamespaceData" } func (p *pidNamespaceData) StateFields() []string { return []string{} } func (p *pidNamespaceData) beforeSave() {} // +checklocksignore func (p *pidNamespaceData) StateSave(stateSinkObject state.Sink) { p.beforeSave() } func (p *pidNamespaceData) afterLoad(context.Context) {} // +checklocksignore func (p *pidNamespaceData) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func init() { state.Register((*pidNamespaceData)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_opts.go000066400000000000000000000013701465435605700251050ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package kernel // SpecialOpts contains non-standard options for the kernel. // // +stateify savable type SpecialOpts struct{} golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_opts_abi_autogen_unsafe.go000066400000000000000000000007371465435605700310110ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. 
This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build !false // +build !false package kernel import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_opts_state_autogen.go000066400000000000000000000012311465435605700300230ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package kernel import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (s *SpecialOpts) StateTypeName() string { return "pkg/sentry/kernel.SpecialOpts" } func (s *SpecialOpts) StateFields() []string { return []string{} } func (s *SpecialOpts) beforeSave() {} // +checklocksignore func (s *SpecialOpts) StateSave(stateSinkObject state.Sink) { s.beforeSave() } func (s *SpecialOpts) afterLoad(context.Context) {} // +checklocksignore func (s *SpecialOpts) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func init() { state.Register((*SpecialOpts)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_state.go000066400000000000000000000017571465435605700252510ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "context" "gvisor.dev/gvisor/pkg/tcpip" ) // saveDanglingEndpoints is invoked by stateify. func (k *Kernel) saveDanglingEndpoints() []tcpip.Endpoint { return tcpip.GetDanglingEndpoints() } // loadDanglingEndpoints is invoked by stateify. func (k *Kernel) loadDanglingEndpoints(_ context.Context, es []tcpip.Endpoint) { for _, e := range es { tcpip.AddDanglingEndpoint(e) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_state_autogen.go000066400000000000000000002070611465435605700267670ustar00rootroot00000000000000// automatically generated by stateify. 
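// Each type below follows the same stateify pattern seen in the smaller
// generated files above: StateTypeName returns a stable type name,
// StateFields lists the saved fields in index order, and StateSave/StateLoad
// move those fields through a state.Sink or state.Source using the matching
// indices. Fields that need custom handling go through SaveValue/LoadValue
// callbacks (for example Kernel.danglingEndpoints below), and, as in the
// other generated files, types are registered with the state package in init.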
package kernel import ( "context" "gvisor.dev/gvisor/pkg/state" "gvisor.dev/gvisor/pkg/tcpip" ) func (c *Cgroup) StateTypeName() string { return "pkg/sentry/kernel.Cgroup" } func (c *Cgroup) StateFields() []string { return []string{ "Dentry", "CgroupImpl", } } func (c *Cgroup) beforeSave() {} // +checklocksignore func (c *Cgroup) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.Dentry) stateSinkObject.Save(1, &c.CgroupImpl) } func (c *Cgroup) afterLoad(context.Context) {} // +checklocksignore func (c *Cgroup) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.Dentry) stateSourceObject.Load(1, &c.CgroupImpl) } func (h *hierarchy) StateTypeName() string { return "pkg/sentry/kernel.hierarchy" } func (h *hierarchy) StateFields() []string { return []string{ "id", "name", "controllers", "fs", } } func (h *hierarchy) beforeSave() {} // +checklocksignore func (h *hierarchy) StateSave(stateSinkObject state.Sink) { h.beforeSave() stateSinkObject.Save(0, &h.id) stateSinkObject.Save(1, &h.name) stateSinkObject.Save(2, &h.controllers) stateSinkObject.Save(3, &h.fs) } func (h *hierarchy) afterLoad(context.Context) {} // +checklocksignore func (h *hierarchy) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &h.id) stateSourceObject.Load(1, &h.name) stateSourceObject.Load(2, &h.controllers) stateSourceObject.Load(3, &h.fs) } func (r *CgroupRegistry) StateTypeName() string { return "pkg/sentry/kernel.CgroupRegistry" } func (r *CgroupRegistry) StateFields() []string { return []string{ "lastHierarchyID", "lastCgroupID", "controllers", "hierarchies", "hierarchiesByName", "cgroups", } } func (r *CgroupRegistry) beforeSave() {} // +checklocksignore func (r *CgroupRegistry) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.lastHierarchyID) stateSinkObject.Save(1, &r.lastCgroupID) stateSinkObject.Save(2, &r.controllers) stateSinkObject.Save(3, &r.hierarchies) stateSinkObject.Save(4, &r.hierarchiesByName) stateSinkObject.Save(5, &r.cgroups) } func (r *CgroupRegistry) afterLoad(context.Context) {} // +checklocksignore func (r *CgroupRegistry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.lastHierarchyID) stateSourceObject.Load(1, &r.lastCgroupID) stateSourceObject.Load(2, &r.controllers) stateSourceObject.Load(3, &r.hierarchies) stateSourceObject.Load(4, &r.hierarchiesByName) stateSourceObject.Load(5, &r.cgroups) } func (f *FDFlags) StateTypeName() string { return "pkg/sentry/kernel.FDFlags" } func (f *FDFlags) StateFields() []string { return []string{ "CloseOnExec", } } func (f *FDFlags) beforeSave() {} // +checklocksignore func (f *FDFlags) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.CloseOnExec) } func (f *FDFlags) afterLoad(context.Context) {} // +checklocksignore func (f *FDFlags) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.CloseOnExec) } func (d *descriptor) StateTypeName() string { return "pkg/sentry/kernel.descriptor" } func (d *descriptor) StateFields() []string { return []string{ "file", "flags", } } func (d *descriptor) beforeSave() {} // +checklocksignore func (d *descriptor) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.file) stateSinkObject.Save(1, &d.flags) } func (d *descriptor) afterLoad(context.Context) {} // +checklocksignore func (d *descriptor) StateLoad(ctx context.Context, 
stateSourceObject state.Source) { stateSourceObject.Load(0, &d.file) stateSourceObject.Load(1, &d.flags) } func (f *FDTable) StateTypeName() string { return "pkg/sentry/kernel.FDTable" } func (f *FDTable) StateFields() []string { return []string{ "FDTableRefs", "k", "descriptorTable", } } func (f *FDTable) beforeSave() {} // +checklocksignore func (f *FDTable) StateSave(stateSinkObject state.Sink) { f.beforeSave() var descriptorTableValue map[int32]descriptor descriptorTableValue = f.saveDescriptorTable() stateSinkObject.SaveValue(2, descriptorTableValue) stateSinkObject.Save(0, &f.FDTableRefs) stateSinkObject.Save(1, &f.k) } func (f *FDTable) afterLoad(context.Context) {} // +checklocksignore func (f *FDTable) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.FDTableRefs) stateSourceObject.Load(1, &f.k) stateSourceObject.LoadValue(2, new(map[int32]descriptor), func(y any) { f.loadDescriptorTable(ctx, y.(map[int32]descriptor)) }) } func (r *FDTableRefs) StateTypeName() string { return "pkg/sentry/kernel.FDTableRefs" } func (r *FDTableRefs) StateFields() []string { return []string{ "refCount", } } func (r *FDTableRefs) beforeSave() {} // +checklocksignore func (r *FDTableRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *FDTableRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (f *FSContext) StateTypeName() string { return "pkg/sentry/kernel.FSContext" } func (f *FSContext) StateFields() []string { return []string{ "FSContextRefs", "root", "cwd", "umask", } } func (f *FSContext) beforeSave() {} // +checklocksignore func (f *FSContext) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.FSContextRefs) stateSinkObject.Save(1, &f.root) stateSinkObject.Save(2, &f.cwd) stateSinkObject.Save(3, &f.umask) } func (f *FSContext) afterLoad(context.Context) {} // +checklocksignore func (f *FSContext) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.FSContextRefs) stateSourceObject.Load(1, &f.root) stateSourceObject.Load(2, &f.cwd) stateSourceObject.Load(3, &f.umask) } func (r *FSContextRefs) StateTypeName() string { return "pkg/sentry/kernel.FSContextRefs" } func (r *FSContextRefs) StateFields() []string { return []string{ "refCount", } } func (r *FSContextRefs) beforeSave() {} // +checklocksignore func (r *FSContextRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *FSContextRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (i *IPCNamespace) StateTypeName() string { return "pkg/sentry/kernel.IPCNamespace" } func (i *IPCNamespace) StateFields() []string { return []string{ "inode", "userNS", "queues", "semaphores", "shms", "posixQueues", } } func (i *IPCNamespace) beforeSave() {} // +checklocksignore func (i *IPCNamespace) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.inode) stateSinkObject.Save(1, &i.userNS) stateSinkObject.Save(2, &i.queues) stateSinkObject.Save(3, &i.semaphores) stateSinkObject.Save(4, &i.shms) stateSinkObject.Save(5, &i.posixQueues) } func (i *IPCNamespace) afterLoad(context.Context) {} // +checklocksignore func (i *IPCNamespace) 
StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.inode) stateSourceObject.Load(1, &i.userNS) stateSourceObject.Load(2, &i.queues) stateSourceObject.Load(3, &i.semaphores) stateSourceObject.Load(4, &i.shms) stateSourceObject.Load(5, &i.posixQueues) } func (uc *UserCounters) StateTypeName() string { return "pkg/sentry/kernel.UserCounters" } func (uc *UserCounters) StateFields() []string { return []string{ "uid", "rlimitNProc", } } func (uc *UserCounters) beforeSave() {} // +checklocksignore func (uc *UserCounters) StateSave(stateSinkObject state.Sink) { uc.beforeSave() stateSinkObject.Save(0, &uc.uid) stateSinkObject.Save(1, &uc.rlimitNProc) } func (uc *UserCounters) afterLoad(context.Context) {} // +checklocksignore func (uc *UserCounters) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &uc.uid) stateSourceObject.Load(1, &uc.rlimitNProc) } func (c *CgroupMount) StateTypeName() string { return "pkg/sentry/kernel.CgroupMount" } func (c *CgroupMount) StateFields() []string { return []string{ "Fs", "Root", "Mount", } } func (c *CgroupMount) beforeSave() {} // +checklocksignore func (c *CgroupMount) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.Fs) stateSinkObject.Save(1, &c.Root) stateSinkObject.Save(2, &c.Mount) } func (c *CgroupMount) afterLoad(context.Context) {} // +checklocksignore func (c *CgroupMount) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.Fs) stateSourceObject.Load(1, &c.Root) stateSourceObject.Load(2, &c.Mount) } func (k *Kernel) StateTypeName() string { return "pkg/sentry/kernel.Kernel" } func (k *Kernel) StateFields() []string { return []string{ "featureSet", "timekeeper", "tasks", "rootUserNamespace", "rootNetworkNamespace", "applicationCores", "useHostCores", "extraAuxv", "vdso", "vdsoParams", "rootUTSNamespace", "rootIPCNamespace", "futexes", "globalInit", "syslog", "runningTasks", "cpuClock", "cpuClockTickerRunning", "uniqueID", "nextInotifyCookie", "netlinkPorts", "danglingEndpoints", "sockets", "nextSocketRecord", "SpecialOpts", "vfs", "hostMount", "pipeMount", "nsfsMount", "shmMount", "socketMount", "sysVShmDevID", "SleepForAddressSpaceActivation", "ptraceExceptions", "YAMAPtraceScope", "cgroupRegistry", "cgroupMountsMap", "userCountersMap", "MaxFDLimit", "containerNames", "additionalCheckpointState", "checkpointCounter", } } func (k *Kernel) beforeSave() {} // +checklocksignore func (k *Kernel) StateSave(stateSinkObject state.Sink) { k.beforeSave() var danglingEndpointsValue []tcpip.Endpoint danglingEndpointsValue = k.saveDanglingEndpoints() stateSinkObject.SaveValue(21, danglingEndpointsValue) stateSinkObject.Save(0, &k.featureSet) stateSinkObject.Save(1, &k.timekeeper) stateSinkObject.Save(2, &k.tasks) stateSinkObject.Save(3, &k.rootUserNamespace) stateSinkObject.Save(4, &k.rootNetworkNamespace) stateSinkObject.Save(5, &k.applicationCores) stateSinkObject.Save(6, &k.useHostCores) stateSinkObject.Save(7, &k.extraAuxv) stateSinkObject.Save(8, &k.vdso) stateSinkObject.Save(9, &k.vdsoParams) stateSinkObject.Save(10, &k.rootUTSNamespace) stateSinkObject.Save(11, &k.rootIPCNamespace) stateSinkObject.Save(12, &k.futexes) stateSinkObject.Save(13, &k.globalInit) stateSinkObject.Save(14, &k.syslog) stateSinkObject.Save(15, &k.runningTasks) stateSinkObject.Save(16, &k.cpuClock) stateSinkObject.Save(17, &k.cpuClockTickerRunning) stateSinkObject.Save(18, &k.uniqueID) stateSinkObject.Save(19, 
&k.nextInotifyCookie) stateSinkObject.Save(20, &k.netlinkPorts) stateSinkObject.Save(22, &k.sockets) stateSinkObject.Save(23, &k.nextSocketRecord) stateSinkObject.Save(24, &k.SpecialOpts) stateSinkObject.Save(25, &k.vfs) stateSinkObject.Save(26, &k.hostMount) stateSinkObject.Save(27, &k.pipeMount) stateSinkObject.Save(28, &k.nsfsMount) stateSinkObject.Save(29, &k.shmMount) stateSinkObject.Save(30, &k.socketMount) stateSinkObject.Save(31, &k.sysVShmDevID) stateSinkObject.Save(32, &k.SleepForAddressSpaceActivation) stateSinkObject.Save(33, &k.ptraceExceptions) stateSinkObject.Save(34, &k.YAMAPtraceScope) stateSinkObject.Save(35, &k.cgroupRegistry) stateSinkObject.Save(36, &k.cgroupMountsMap) stateSinkObject.Save(37, &k.userCountersMap) stateSinkObject.Save(38, &k.MaxFDLimit) stateSinkObject.Save(39, &k.containerNames) stateSinkObject.Save(40, &k.additionalCheckpointState) stateSinkObject.Save(41, &k.checkpointCounter) } func (k *Kernel) afterLoad(context.Context) {} // +checklocksignore func (k *Kernel) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &k.featureSet) stateSourceObject.Load(1, &k.timekeeper) stateSourceObject.Load(2, &k.tasks) stateSourceObject.Load(3, &k.rootUserNamespace) stateSourceObject.Load(4, &k.rootNetworkNamespace) stateSourceObject.Load(5, &k.applicationCores) stateSourceObject.Load(6, &k.useHostCores) stateSourceObject.Load(7, &k.extraAuxv) stateSourceObject.Load(8, &k.vdso) stateSourceObject.Load(9, &k.vdsoParams) stateSourceObject.Load(10, &k.rootUTSNamespace) stateSourceObject.Load(11, &k.rootIPCNamespace) stateSourceObject.Load(12, &k.futexes) stateSourceObject.Load(13, &k.globalInit) stateSourceObject.Load(14, &k.syslog) stateSourceObject.Load(15, &k.runningTasks) stateSourceObject.Load(16, &k.cpuClock) stateSourceObject.Load(17, &k.cpuClockTickerRunning) stateSourceObject.Load(18, &k.uniqueID) stateSourceObject.Load(19, &k.nextInotifyCookie) stateSourceObject.Load(20, &k.netlinkPorts) stateSourceObject.Load(22, &k.sockets) stateSourceObject.Load(23, &k.nextSocketRecord) stateSourceObject.Load(24, &k.SpecialOpts) stateSourceObject.Load(25, &k.vfs) stateSourceObject.Load(26, &k.hostMount) stateSourceObject.Load(27, &k.pipeMount) stateSourceObject.Load(28, &k.nsfsMount) stateSourceObject.Load(29, &k.shmMount) stateSourceObject.Load(30, &k.socketMount) stateSourceObject.Load(31, &k.sysVShmDevID) stateSourceObject.Load(32, &k.SleepForAddressSpaceActivation) stateSourceObject.Load(33, &k.ptraceExceptions) stateSourceObject.Load(34, &k.YAMAPtraceScope) stateSourceObject.Load(35, &k.cgroupRegistry) stateSourceObject.Load(36, &k.cgroupMountsMap) stateSourceObject.Load(37, &k.userCountersMap) stateSourceObject.Load(38, &k.MaxFDLimit) stateSourceObject.Load(39, &k.containerNames) stateSourceObject.Load(40, &k.additionalCheckpointState) stateSourceObject.Load(41, &k.checkpointCounter) stateSourceObject.LoadValue(21, new([]tcpip.Endpoint), func(y any) { k.loadDanglingEndpoints(ctx, y.([]tcpip.Endpoint)) }) } func (p *privateMemoryFileMetadata) StateTypeName() string { return "pkg/sentry/kernel.privateMemoryFileMetadata" } func (p *privateMemoryFileMetadata) StateFields() []string { return []string{ "owners", } } func (p *privateMemoryFileMetadata) beforeSave() {} // +checklocksignore func (p *privateMemoryFileMetadata) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.owners) } func (p *privateMemoryFileMetadata) afterLoad(context.Context) {} // +checklocksignore func (p *privateMemoryFileMetadata) 
StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.owners) } func (s *SocketRecord) StateTypeName() string { return "pkg/sentry/kernel.SocketRecord" } func (s *SocketRecord) StateFields() []string { return []string{ "k", "Sock", "ID", } } func (s *SocketRecord) beforeSave() {} // +checklocksignore func (s *SocketRecord) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.k) stateSinkObject.Save(1, &s.Sock) stateSinkObject.Save(2, &s.ID) } func (s *SocketRecord) afterLoad(context.Context) {} // +checklocksignore func (s *SocketRecord) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.k) stateSourceObject.Load(1, &s.Sock) stateSourceObject.Load(2, &s.ID) } func (p *pendingSignals) StateTypeName() string { return "pkg/sentry/kernel.pendingSignals" } func (p *pendingSignals) StateFields() []string { return []string{ "signals", } } func (p *pendingSignals) beforeSave() {} // +checklocksignore func (p *pendingSignals) StateSave(stateSinkObject state.Sink) { p.beforeSave() var signalsValue []savedPendingSignal signalsValue = p.saveSignals() stateSinkObject.SaveValue(0, signalsValue) } func (p *pendingSignals) afterLoad(context.Context) {} // +checklocksignore func (p *pendingSignals) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]savedPendingSignal), func(y any) { p.loadSignals(ctx, y.([]savedPendingSignal)) }) } func (p *pendingSignalQueue) StateTypeName() string { return "pkg/sentry/kernel.pendingSignalQueue" } func (p *pendingSignalQueue) StateFields() []string { return []string{ "pendingSignalList", "length", } } func (p *pendingSignalQueue) beforeSave() {} // +checklocksignore func (p *pendingSignalQueue) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.pendingSignalList) stateSinkObject.Save(1, &p.length) } func (p *pendingSignalQueue) afterLoad(context.Context) {} // +checklocksignore func (p *pendingSignalQueue) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.pendingSignalList) stateSourceObject.Load(1, &p.length) } func (p *pendingSignal) StateTypeName() string { return "pkg/sentry/kernel.pendingSignal" } func (p *pendingSignal) StateFields() []string { return []string{ "pendingSignalEntry", "SignalInfo", "timer", } } func (p *pendingSignal) beforeSave() {} // +checklocksignore func (p *pendingSignal) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.pendingSignalEntry) stateSinkObject.Save(1, &p.SignalInfo) stateSinkObject.Save(2, &p.timer) } func (p *pendingSignal) afterLoad(context.Context) {} // +checklocksignore func (p *pendingSignal) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.pendingSignalEntry) stateSourceObject.Load(1, &p.SignalInfo) stateSourceObject.Load(2, &p.timer) } func (l *pendingSignalList) StateTypeName() string { return "pkg/sentry/kernel.pendingSignalList" } func (l *pendingSignalList) StateFields() []string { return []string{ "head", "tail", } } func (l *pendingSignalList) beforeSave() {} // +checklocksignore func (l *pendingSignalList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *pendingSignalList) afterLoad(context.Context) {} // +checklocksignore func (l *pendingSignalList) StateLoad(ctx context.Context, stateSourceObject state.Source) { 
stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *pendingSignalEntry) StateTypeName() string { return "pkg/sentry/kernel.pendingSignalEntry" } func (e *pendingSignalEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *pendingSignalEntry) beforeSave() {} // +checklocksignore func (e *pendingSignalEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *pendingSignalEntry) afterLoad(context.Context) {} // +checklocksignore func (e *pendingSignalEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (s *savedPendingSignal) StateTypeName() string { return "pkg/sentry/kernel.savedPendingSignal" } func (s *savedPendingSignal) StateFields() []string { return []string{ "si", "timer", } } func (s *savedPendingSignal) beforeSave() {} // +checklocksignore func (s *savedPendingSignal) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.si) stateSinkObject.Save(1, &s.timer) } func (s *savedPendingSignal) afterLoad(context.Context) {} // +checklocksignore func (s *savedPendingSignal) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.si) stateSourceObject.Load(1, &s.timer) } func (it *IntervalTimer) StateTypeName() string { return "pkg/sentry/kernel.IntervalTimer" } func (it *IntervalTimer) StateFields() []string { return []string{ "timer", "target", "signo", "id", "sigval", "group", "sigpending", "sigorphan", "overrunCur", "overrunLast", } } func (it *IntervalTimer) beforeSave() {} // +checklocksignore func (it *IntervalTimer) StateSave(stateSinkObject state.Sink) { it.beforeSave() stateSinkObject.Save(0, &it.timer) stateSinkObject.Save(1, &it.target) stateSinkObject.Save(2, &it.signo) stateSinkObject.Save(3, &it.id) stateSinkObject.Save(4, &it.sigval) stateSinkObject.Save(5, &it.group) stateSinkObject.Save(6, &it.sigpending) stateSinkObject.Save(7, &it.sigorphan) stateSinkObject.Save(8, &it.overrunCur) stateSinkObject.Save(9, &it.overrunLast) } func (it *IntervalTimer) afterLoad(context.Context) {} // +checklocksignore func (it *IntervalTimer) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &it.timer) stateSourceObject.Load(1, &it.target) stateSourceObject.Load(2, &it.signo) stateSourceObject.Load(3, &it.id) stateSourceObject.Load(4, &it.sigval) stateSourceObject.Load(5, &it.group) stateSourceObject.Load(6, &it.sigpending) stateSourceObject.Load(7, &it.sigorphan) stateSourceObject.Load(8, &it.overrunCur) stateSourceObject.Load(9, &it.overrunLast) } func (l *processGroupList) StateTypeName() string { return "pkg/sentry/kernel.processGroupList" } func (l *processGroupList) StateFields() []string { return []string{ "head", "tail", } } func (l *processGroupList) beforeSave() {} // +checklocksignore func (l *processGroupList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *processGroupList) afterLoad(context.Context) {} // +checklocksignore func (l *processGroupList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *processGroupEntry) StateTypeName() string { return "pkg/sentry/kernel.processGroupEntry" } func (e *processGroupEntry) StateFields() []string { return []string{ "next", 
"prev", } } func (e *processGroupEntry) beforeSave() {} // +checklocksignore func (e *processGroupEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *processGroupEntry) afterLoad(context.Context) {} // +checklocksignore func (e *processGroupEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (r *ProcessGroupRefs) StateTypeName() string { return "pkg/sentry/kernel.ProcessGroupRefs" } func (r *ProcessGroupRefs) StateFields() []string { return []string{ "refCount", } } func (r *ProcessGroupRefs) beforeSave() {} // +checklocksignore func (r *ProcessGroupRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *ProcessGroupRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (p *ptraceOptions) StateTypeName() string { return "pkg/sentry/kernel.ptraceOptions" } func (p *ptraceOptions) StateFields() []string { return []string{ "ExitKill", "SysGood", "TraceClone", "TraceExec", "TraceExit", "TraceFork", "TraceSeccomp", "TraceVfork", "TraceVforkDone", } } func (p *ptraceOptions) beforeSave() {} // +checklocksignore func (p *ptraceOptions) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.ExitKill) stateSinkObject.Save(1, &p.SysGood) stateSinkObject.Save(2, &p.TraceClone) stateSinkObject.Save(3, &p.TraceExec) stateSinkObject.Save(4, &p.TraceExit) stateSinkObject.Save(5, &p.TraceFork) stateSinkObject.Save(6, &p.TraceSeccomp) stateSinkObject.Save(7, &p.TraceVfork) stateSinkObject.Save(8, &p.TraceVforkDone) } func (p *ptraceOptions) afterLoad(context.Context) {} // +checklocksignore func (p *ptraceOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.ExitKill) stateSourceObject.Load(1, &p.SysGood) stateSourceObject.Load(2, &p.TraceClone) stateSourceObject.Load(3, &p.TraceExec) stateSourceObject.Load(4, &p.TraceExit) stateSourceObject.Load(5, &p.TraceFork) stateSourceObject.Load(6, &p.TraceSeccomp) stateSourceObject.Load(7, &p.TraceVfork) stateSourceObject.Load(8, &p.TraceVforkDone) } func (s *ptraceStop) StateTypeName() string { return "pkg/sentry/kernel.ptraceStop" } func (s *ptraceStop) StateFields() []string { return []string{ "frozen", "listen", } } func (s *ptraceStop) beforeSave() {} // +checklocksignore func (s *ptraceStop) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.frozen) stateSinkObject.Save(1, &s.listen) } func (s *ptraceStop) afterLoad(context.Context) {} // +checklocksignore func (s *ptraceStop) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.frozen) stateSourceObject.Load(1, &s.listen) } func (o *OldRSeqCriticalRegion) StateTypeName() string { return "pkg/sentry/kernel.OldRSeqCriticalRegion" } func (o *OldRSeqCriticalRegion) StateFields() []string { return []string{ "CriticalSection", "Restart", } } func (o *OldRSeqCriticalRegion) beforeSave() {} // +checklocksignore func (o *OldRSeqCriticalRegion) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.CriticalSection) stateSinkObject.Save(1, &o.Restart) } func (o *OldRSeqCriticalRegion) afterLoad(context.Context) {} // +checklocksignore func (o *OldRSeqCriticalRegion) 
StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.CriticalSection) stateSourceObject.Load(1, &o.Restart) } func (ts *taskSeccomp) StateTypeName() string { return "pkg/sentry/kernel.taskSeccomp" } func (ts *taskSeccomp) StateFields() []string { return []string{ "filters", "cache", "cacheAuditNumber", } } func (ts *taskSeccomp) beforeSave() {} // +checklocksignore func (ts *taskSeccomp) StateSave(stateSinkObject state.Sink) { ts.beforeSave() stateSinkObject.Save(0, &ts.filters) stateSinkObject.Save(1, &ts.cache) stateSinkObject.Save(2, &ts.cacheAuditNumber) } func (ts *taskSeccomp) afterLoad(context.Context) {} // +checklocksignore func (ts *taskSeccomp) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ts.filters) stateSourceObject.Load(1, &ts.cache) stateSourceObject.Load(2, &ts.cacheAuditNumber) } func (l *sessionList) StateTypeName() string { return "pkg/sentry/kernel.sessionList" } func (l *sessionList) StateFields() []string { return []string{ "head", "tail", } } func (l *sessionList) beforeSave() {} // +checklocksignore func (l *sessionList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *sessionList) afterLoad(context.Context) {} // +checklocksignore func (l *sessionList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *sessionEntry) StateTypeName() string { return "pkg/sentry/kernel.sessionEntry" } func (e *sessionEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *sessionEntry) beforeSave() {} // +checklocksignore func (e *sessionEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *sessionEntry) afterLoad(context.Context) {} // +checklocksignore func (e *sessionEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (r *SessionRefs) StateTypeName() string { return "pkg/sentry/kernel.SessionRefs" } func (r *SessionRefs) StateFields() []string { return []string{ "refCount", } } func (r *SessionRefs) beforeSave() {} // +checklocksignore func (r *SessionRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *SessionRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (s *Session) StateTypeName() string { return "pkg/sentry/kernel.Session" } func (s *Session) StateFields() []string { return []string{ "SessionRefs", "leader", "id", "foreground", "processGroups", "sessionEntry", } } func (s *Session) beforeSave() {} // +checklocksignore func (s *Session) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.SessionRefs) stateSinkObject.Save(1, &s.leader) stateSinkObject.Save(2, &s.id) stateSinkObject.Save(3, &s.foreground) stateSinkObject.Save(4, &s.processGroups) stateSinkObject.Save(5, &s.sessionEntry) } func (s *Session) afterLoad(context.Context) {} // +checklocksignore func (s *Session) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.SessionRefs) stateSourceObject.Load(1, &s.leader) stateSourceObject.Load(2, &s.id) stateSourceObject.Load(3, &s.foreground) 
stateSourceObject.Load(4, &s.processGroups) stateSourceObject.Load(5, &s.sessionEntry) } func (pg *ProcessGroup) StateTypeName() string { return "pkg/sentry/kernel.ProcessGroup" } func (pg *ProcessGroup) StateFields() []string { return []string{ "refs", "originator", "id", "session", "ancestors", "processGroupEntry", } } func (pg *ProcessGroup) beforeSave() {} // +checklocksignore func (pg *ProcessGroup) StateSave(stateSinkObject state.Sink) { pg.beforeSave() stateSinkObject.Save(0, &pg.refs) stateSinkObject.Save(1, &pg.originator) stateSinkObject.Save(2, &pg.id) stateSinkObject.Save(3, &pg.session) stateSinkObject.Save(4, &pg.ancestors) stateSinkObject.Save(5, &pg.processGroupEntry) } func (pg *ProcessGroup) afterLoad(context.Context) {} // +checklocksignore func (pg *ProcessGroup) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &pg.refs) stateSourceObject.Load(1, &pg.originator) stateSourceObject.Load(2, &pg.id) stateSourceObject.Load(3, &pg.session) stateSourceObject.Load(4, &pg.ancestors) stateSourceObject.Load(5, &pg.processGroupEntry) } func (sh *SignalHandlers) StateTypeName() string { return "pkg/sentry/kernel.SignalHandlers" } func (sh *SignalHandlers) StateFields() []string { return []string{ "actions", } } func (sh *SignalHandlers) beforeSave() {} // +checklocksignore func (sh *SignalHandlers) StateSave(stateSinkObject state.Sink) { sh.beforeSave() stateSinkObject.Save(0, &sh.actions) } func (sh *SignalHandlers) afterLoad(context.Context) {} // +checklocksignore func (sh *SignalHandlers) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &sh.actions) } func (s *syscallTableInfo) StateTypeName() string { return "pkg/sentry/kernel.syscallTableInfo" } func (s *syscallTableInfo) StateFields() []string { return []string{ "OS", "Arch", } } func (s *syscallTableInfo) beforeSave() {} // +checklocksignore func (s *syscallTableInfo) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.OS) stateSinkObject.Save(1, &s.Arch) } func (s *syscallTableInfo) afterLoad(context.Context) {} // +checklocksignore func (s *syscallTableInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.OS) stateSourceObject.Load(1, &s.Arch) } func (s *syslog) StateTypeName() string { return "pkg/sentry/kernel.syslog" } func (s *syslog) StateFields() []string { return []string{ "msg", } } func (s *syslog) beforeSave() {} // +checklocksignore func (s *syslog) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.msg) } func (s *syslog) afterLoad(context.Context) {} // +checklocksignore func (s *syslog) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.msg) } func (t *Task) StateTypeName() string { return "pkg/sentry/kernel.Task" } func (t *Task) StateFields() []string { return []string{ "taskNode", "runState", "taskWorkCount", "taskWork", "haveSyscallReturn", "gosched", "yieldCount", "pendingSignals", "signalMask", "realSignalMask", "haveSavedSignalMask", "savedSignalMask", "signalStack", "signalQueue", "groupStopPending", "groupStopAcknowledged", "trapStopPending", "trapNotifyPending", "stop", "exitStatus", "syscallRestartBlock", "k", "containerID", "image", "fsContext", "fdTable", "vforkParent", "exitState", "exitTracerNotified", "exitTracerAcked", "exitParentNotified", "exitParentAcked", "ptraceTracer", "ptraceTracees", "ptraceSeized", "ptraceOpts", "ptraceSyscallMode", "ptraceSinglestep", 
"ptraceCode", "ptraceSiginfo", "ptraceEventMsg", "ptraceYAMAExceptionAdded", "ioUsage", "creds", "utsns", "ipcns", "mountNamespace", "parentDeathSignal", "seccomp", "cleartid", "allowedCPUMask", "cpu", "niceness", "numaPolicy", "numaNodeMask", "netns", "rseqCPU", "oldRSeqCPUAddr", "rseqAddr", "rseqSignature", "robustList", "startTime", "kcov", "cgroups", "memCgID", "userCounters", "sessionKeyring", "Origin", } } func (t *Task) beforeSave() {} // +checklocksignore func (t *Task) StateSave(stateSinkObject state.Sink) { t.beforeSave() var ptraceTracerValue *Task ptraceTracerValue = t.savePtraceTracer() stateSinkObject.SaveValue(32, ptraceTracerValue) var seccompValue *taskSeccomp seccompValue = t.saveSeccomp() stateSinkObject.SaveValue(48, seccompValue) stateSinkObject.Save(0, &t.taskNode) stateSinkObject.Save(1, &t.runState) stateSinkObject.Save(2, &t.taskWorkCount) stateSinkObject.Save(3, &t.taskWork) stateSinkObject.Save(4, &t.haveSyscallReturn) stateSinkObject.Save(5, &t.gosched) stateSinkObject.Save(6, &t.yieldCount) stateSinkObject.Save(7, &t.pendingSignals) stateSinkObject.Save(8, &t.signalMask) stateSinkObject.Save(9, &t.realSignalMask) stateSinkObject.Save(10, &t.haveSavedSignalMask) stateSinkObject.Save(11, &t.savedSignalMask) stateSinkObject.Save(12, &t.signalStack) stateSinkObject.Save(13, &t.signalQueue) stateSinkObject.Save(14, &t.groupStopPending) stateSinkObject.Save(15, &t.groupStopAcknowledged) stateSinkObject.Save(16, &t.trapStopPending) stateSinkObject.Save(17, &t.trapNotifyPending) stateSinkObject.Save(18, &t.stop) stateSinkObject.Save(19, &t.exitStatus) stateSinkObject.Save(20, &t.syscallRestartBlock) stateSinkObject.Save(21, &t.k) stateSinkObject.Save(22, &t.containerID) stateSinkObject.Save(23, &t.image) stateSinkObject.Save(24, &t.fsContext) stateSinkObject.Save(25, &t.fdTable) stateSinkObject.Save(26, &t.vforkParent) stateSinkObject.Save(27, &t.exitState) stateSinkObject.Save(28, &t.exitTracerNotified) stateSinkObject.Save(29, &t.exitTracerAcked) stateSinkObject.Save(30, &t.exitParentNotified) stateSinkObject.Save(31, &t.exitParentAcked) stateSinkObject.Save(33, &t.ptraceTracees) stateSinkObject.Save(34, &t.ptraceSeized) stateSinkObject.Save(35, &t.ptraceOpts) stateSinkObject.Save(36, &t.ptraceSyscallMode) stateSinkObject.Save(37, &t.ptraceSinglestep) stateSinkObject.Save(38, &t.ptraceCode) stateSinkObject.Save(39, &t.ptraceSiginfo) stateSinkObject.Save(40, &t.ptraceEventMsg) stateSinkObject.Save(41, &t.ptraceYAMAExceptionAdded) stateSinkObject.Save(42, &t.ioUsage) stateSinkObject.Save(43, &t.creds) stateSinkObject.Save(44, &t.utsns) stateSinkObject.Save(45, &t.ipcns) stateSinkObject.Save(46, &t.mountNamespace) stateSinkObject.Save(47, &t.parentDeathSignal) stateSinkObject.Save(49, &t.cleartid) stateSinkObject.Save(50, &t.allowedCPUMask) stateSinkObject.Save(51, &t.cpu) stateSinkObject.Save(52, &t.niceness) stateSinkObject.Save(53, &t.numaPolicy) stateSinkObject.Save(54, &t.numaNodeMask) stateSinkObject.Save(55, &t.netns) stateSinkObject.Save(56, &t.rseqCPU) stateSinkObject.Save(57, &t.oldRSeqCPUAddr) stateSinkObject.Save(58, &t.rseqAddr) stateSinkObject.Save(59, &t.rseqSignature) stateSinkObject.Save(60, &t.robustList) stateSinkObject.Save(61, &t.startTime) stateSinkObject.Save(62, &t.kcov) stateSinkObject.Save(63, &t.cgroups) stateSinkObject.Save(64, &t.memCgID) stateSinkObject.Save(65, &t.userCounters) stateSinkObject.Save(66, &t.sessionKeyring) stateSinkObject.Save(67, &t.Origin) } // +checklocksignore func (t *Task) StateLoad(ctx context.Context, 
stateSourceObject state.Source) { stateSourceObject.Load(0, &t.taskNode) stateSourceObject.Load(1, &t.runState) stateSourceObject.Load(2, &t.taskWorkCount) stateSourceObject.Load(3, &t.taskWork) stateSourceObject.Load(4, &t.haveSyscallReturn) stateSourceObject.Load(5, &t.gosched) stateSourceObject.Load(6, &t.yieldCount) stateSourceObject.Load(7, &t.pendingSignals) stateSourceObject.Load(8, &t.signalMask) stateSourceObject.Load(9, &t.realSignalMask) stateSourceObject.Load(10, &t.haveSavedSignalMask) stateSourceObject.Load(11, &t.savedSignalMask) stateSourceObject.Load(12, &t.signalStack) stateSourceObject.Load(13, &t.signalQueue) stateSourceObject.Load(14, &t.groupStopPending) stateSourceObject.Load(15, &t.groupStopAcknowledged) stateSourceObject.Load(16, &t.trapStopPending) stateSourceObject.Load(17, &t.trapNotifyPending) stateSourceObject.Load(18, &t.stop) stateSourceObject.Load(19, &t.exitStatus) stateSourceObject.Load(20, &t.syscallRestartBlock) stateSourceObject.Load(21, &t.k) stateSourceObject.Load(22, &t.containerID) stateSourceObject.Load(23, &t.image) stateSourceObject.Load(24, &t.fsContext) stateSourceObject.Load(25, &t.fdTable) stateSourceObject.Load(26, &t.vforkParent) stateSourceObject.Load(27, &t.exitState) stateSourceObject.Load(28, &t.exitTracerNotified) stateSourceObject.Load(29, &t.exitTracerAcked) stateSourceObject.Load(30, &t.exitParentNotified) stateSourceObject.Load(31, &t.exitParentAcked) stateSourceObject.Load(33, &t.ptraceTracees) stateSourceObject.Load(34, &t.ptraceSeized) stateSourceObject.Load(35, &t.ptraceOpts) stateSourceObject.Load(36, &t.ptraceSyscallMode) stateSourceObject.Load(37, &t.ptraceSinglestep) stateSourceObject.Load(38, &t.ptraceCode) stateSourceObject.Load(39, &t.ptraceSiginfo) stateSourceObject.Load(40, &t.ptraceEventMsg) stateSourceObject.Load(41, &t.ptraceYAMAExceptionAdded) stateSourceObject.Load(42, &t.ioUsage) stateSourceObject.Load(43, &t.creds) stateSourceObject.Load(44, &t.utsns) stateSourceObject.Load(45, &t.ipcns) stateSourceObject.Load(46, &t.mountNamespace) stateSourceObject.Load(47, &t.parentDeathSignal) stateSourceObject.Load(49, &t.cleartid) stateSourceObject.Load(50, &t.allowedCPUMask) stateSourceObject.Load(51, &t.cpu) stateSourceObject.Load(52, &t.niceness) stateSourceObject.Load(53, &t.numaPolicy) stateSourceObject.Load(54, &t.numaNodeMask) stateSourceObject.Load(55, &t.netns) stateSourceObject.Load(56, &t.rseqCPU) stateSourceObject.Load(57, &t.oldRSeqCPUAddr) stateSourceObject.Load(58, &t.rseqAddr) stateSourceObject.Load(59, &t.rseqSignature) stateSourceObject.Load(60, &t.robustList) stateSourceObject.Load(61, &t.startTime) stateSourceObject.Load(62, &t.kcov) stateSourceObject.Load(63, &t.cgroups) stateSourceObject.Load(64, &t.memCgID) stateSourceObject.Load(65, &t.userCounters) stateSourceObject.Load(66, &t.sessionKeyring) stateSourceObject.Load(67, &t.Origin) stateSourceObject.LoadValue(32, new(*Task), func(y any) { t.loadPtraceTracer(ctx, y.(*Task)) }) stateSourceObject.LoadValue(48, new(*taskSeccomp), func(y any) { t.loadSeccomp(ctx, y.(*taskSeccomp)) }) stateSourceObject.AfterLoad(func() { t.afterLoad(ctx) }) } func (r *runSyscallAfterPtraceEventClone) StateTypeName() string { return "pkg/sentry/kernel.runSyscallAfterPtraceEventClone" } func (r *runSyscallAfterPtraceEventClone) StateFields() []string { return []string{ "vforkChild", "vforkChildTID", } } func (r *runSyscallAfterPtraceEventClone) beforeSave() {} // +checklocksignore func (r *runSyscallAfterPtraceEventClone) StateSave(stateSinkObject state.Sink) { 
r.beforeSave() stateSinkObject.Save(0, &r.vforkChild) stateSinkObject.Save(1, &r.vforkChildTID) } func (r *runSyscallAfterPtraceEventClone) afterLoad(context.Context) {} // +checklocksignore func (r *runSyscallAfterPtraceEventClone) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.vforkChild) stateSourceObject.Load(1, &r.vforkChildTID) } func (r *runSyscallAfterVforkStop) StateTypeName() string { return "pkg/sentry/kernel.runSyscallAfterVforkStop" } func (r *runSyscallAfterVforkStop) StateFields() []string { return []string{ "childTID", } } func (r *runSyscallAfterVforkStop) beforeSave() {} // +checklocksignore func (r *runSyscallAfterVforkStop) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.childTID) } func (r *runSyscallAfterVforkStop) afterLoad(context.Context) {} // +checklocksignore func (r *runSyscallAfterVforkStop) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.childTID) } func (v *vforkStop) StateTypeName() string { return "pkg/sentry/kernel.vforkStop" } func (v *vforkStop) StateFields() []string { return []string{} } func (v *vforkStop) beforeSave() {} // +checklocksignore func (v *vforkStop) StateSave(stateSinkObject state.Sink) { v.beforeSave() } func (v *vforkStop) afterLoad(context.Context) {} // +checklocksignore func (v *vforkStop) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *execStop) StateTypeName() string { return "pkg/sentry/kernel.execStop" } func (e *execStop) StateFields() []string { return []string{} } func (e *execStop) beforeSave() {} // +checklocksignore func (e *execStop) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *execStop) afterLoad(context.Context) {} // +checklocksignore func (e *execStop) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *runSyscallAfterExecStop) StateTypeName() string { return "pkg/sentry/kernel.runSyscallAfterExecStop" } func (r *runSyscallAfterExecStop) StateFields() []string { return []string{ "image", } } func (r *runSyscallAfterExecStop) beforeSave() {} // +checklocksignore func (r *runSyscallAfterExecStop) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.image) } func (r *runSyscallAfterExecStop) afterLoad(context.Context) {} // +checklocksignore func (r *runSyscallAfterExecStop) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.image) } func (r *runExit) StateTypeName() string { return "pkg/sentry/kernel.runExit" } func (r *runExit) StateFields() []string { return []string{} } func (r *runExit) beforeSave() {} // +checklocksignore func (r *runExit) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r *runExit) afterLoad(context.Context) {} // +checklocksignore func (r *runExit) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *runExitMain) StateTypeName() string { return "pkg/sentry/kernel.runExitMain" } func (r *runExitMain) StateFields() []string { return []string{} } func (r *runExitMain) beforeSave() {} // +checklocksignore func (r *runExitMain) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r *runExitMain) afterLoad(context.Context) {} // +checklocksignore func (r *runExitMain) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *runExitNotify) StateTypeName() string { return "pkg/sentry/kernel.runExitNotify" } func (r *runExitNotify) StateFields() []string { return []string{} 
} func (r *runExitNotify) beforeSave() {} // +checklocksignore func (r *runExitNotify) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r *runExitNotify) afterLoad(context.Context) {} // +checklocksignore func (r *runExitNotify) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (image *TaskImage) StateTypeName() string { return "pkg/sentry/kernel.TaskImage" } func (image *TaskImage) StateFields() []string { return []string{ "Name", "Arch", "MemoryManager", "fu", "st", "fileCaps", } } func (image *TaskImage) beforeSave() {} // +checklocksignore func (image *TaskImage) StateSave(stateSinkObject state.Sink) { image.beforeSave() var stValue syscallTableInfo stValue = image.saveSt() stateSinkObject.SaveValue(4, stValue) stateSinkObject.Save(0, &image.Name) stateSinkObject.Save(1, &image.Arch) stateSinkObject.Save(2, &image.MemoryManager) stateSinkObject.Save(3, &image.fu) stateSinkObject.Save(5, &image.fileCaps) } func (image *TaskImage) afterLoad(context.Context) {} // +checklocksignore func (image *TaskImage) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &image.Name) stateSourceObject.Load(1, &image.Arch) stateSourceObject.Load(2, &image.MemoryManager) stateSourceObject.Load(3, &image.fu) stateSourceObject.Load(5, &image.fileCaps) stateSourceObject.LoadValue(4, new(syscallTableInfo), func(y any) { image.loadSt(ctx, y.(syscallTableInfo)) }) } func (l *taskList) StateTypeName() string { return "pkg/sentry/kernel.taskList" } func (l *taskList) StateFields() []string { return []string{ "head", "tail", } } func (l *taskList) beforeSave() {} // +checklocksignore func (l *taskList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *taskList) afterLoad(context.Context) {} // +checklocksignore func (l *taskList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *taskEntry) StateTypeName() string { return "pkg/sentry/kernel.taskEntry" } func (e *taskEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *taskEntry) beforeSave() {} // +checklocksignore func (e *taskEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *taskEntry) afterLoad(context.Context) {} // +checklocksignore func (e *taskEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (app *runApp) StateTypeName() string { return "pkg/sentry/kernel.runApp" } func (app *runApp) StateFields() []string { return []string{} } func (app *runApp) beforeSave() {} // +checklocksignore func (app *runApp) StateSave(stateSinkObject state.Sink) { app.beforeSave() } func (app *runApp) afterLoad(context.Context) {} // +checklocksignore func (app *runApp) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (ts *TaskGoroutineSchedInfo) StateTypeName() string { return "pkg/sentry/kernel.TaskGoroutineSchedInfo" } func (ts *TaskGoroutineSchedInfo) StateFields() []string { return []string{ "Timestamp", "State", "UserTicks", "SysTicks", } } func (ts *TaskGoroutineSchedInfo) beforeSave() {} // +checklocksignore func (ts *TaskGoroutineSchedInfo) StateSave(stateSinkObject state.Sink) { ts.beforeSave() stateSinkObject.Save(0, &ts.Timestamp) stateSinkObject.Save(1, &ts.State) stateSinkObject.Save(2, 
&ts.UserTicks) stateSinkObject.Save(3, &ts.SysTicks) } func (ts *TaskGoroutineSchedInfo) afterLoad(context.Context) {} // +checklocksignore func (ts *TaskGoroutineSchedInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ts.Timestamp) stateSourceObject.Load(1, &ts.State) stateSourceObject.Load(2, &ts.UserTicks) stateSourceObject.Load(3, &ts.SysTicks) } func (tc *taskClock) StateTypeName() string { return "pkg/sentry/kernel.taskClock" } func (tc *taskClock) StateFields() []string { return []string{ "t", "includeSys", } } func (tc *taskClock) beforeSave() {} // +checklocksignore func (tc *taskClock) StateSave(stateSinkObject state.Sink) { tc.beforeSave() stateSinkObject.Save(0, &tc.t) stateSinkObject.Save(1, &tc.includeSys) } func (tc *taskClock) afterLoad(context.Context) {} // +checklocksignore func (tc *taskClock) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &tc.t) stateSourceObject.Load(1, &tc.includeSys) } func (tgc *tgClock) StateTypeName() string { return "pkg/sentry/kernel.tgClock" } func (tgc *tgClock) StateFields() []string { return []string{ "tg", "includeSys", } } func (tgc *tgClock) beforeSave() {} // +checklocksignore func (tgc *tgClock) StateSave(stateSinkObject state.Sink) { tgc.beforeSave() stateSinkObject.Save(0, &tgc.tg) stateSinkObject.Save(1, &tgc.includeSys) } func (tgc *tgClock) afterLoad(context.Context) {} // +checklocksignore func (tgc *tgClock) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &tgc.tg) stateSourceObject.Load(1, &tgc.includeSys) } func (g *groupStop) StateTypeName() string { return "pkg/sentry/kernel.groupStop" } func (g *groupStop) StateFields() []string { return []string{} } func (g *groupStop) beforeSave() {} // +checklocksignore func (g *groupStop) StateSave(stateSinkObject state.Sink) { g.beforeSave() } func (g *groupStop) afterLoad(context.Context) {} // +checklocksignore func (g *groupStop) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *runInterrupt) StateTypeName() string { return "pkg/sentry/kernel.runInterrupt" } func (r *runInterrupt) StateFields() []string { return []string{} } func (r *runInterrupt) beforeSave() {} // +checklocksignore func (r *runInterrupt) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r *runInterrupt) afterLoad(context.Context) {} // +checklocksignore func (r *runInterrupt) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *runInterruptAfterSignalDeliveryStop) StateTypeName() string { return "pkg/sentry/kernel.runInterruptAfterSignalDeliveryStop" } func (r *runInterruptAfterSignalDeliveryStop) StateFields() []string { return []string{} } func (r *runInterruptAfterSignalDeliveryStop) beforeSave() {} // +checklocksignore func (r *runInterruptAfterSignalDeliveryStop) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r *runInterruptAfterSignalDeliveryStop) afterLoad(context.Context) {} // +checklocksignore func (r *runInterruptAfterSignalDeliveryStop) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *runSyscallAfterSyscallEnterStop) StateTypeName() string { return "pkg/sentry/kernel.runSyscallAfterSyscallEnterStop" } func (r *runSyscallAfterSyscallEnterStop) StateFields() []string { return []string{} } func (r *runSyscallAfterSyscallEnterStop) beforeSave() {} // +checklocksignore func (r *runSyscallAfterSyscallEnterStop) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r 
*runSyscallAfterSyscallEnterStop) afterLoad(context.Context) {} // +checklocksignore func (r *runSyscallAfterSyscallEnterStop) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *runSyscallAfterSysemuStop) StateTypeName() string { return "pkg/sentry/kernel.runSyscallAfterSysemuStop" } func (r *runSyscallAfterSysemuStop) StateFields() []string { return []string{} } func (r *runSyscallAfterSysemuStop) beforeSave() {} // +checklocksignore func (r *runSyscallAfterSysemuStop) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r *runSyscallAfterSysemuStop) afterLoad(context.Context) {} // +checklocksignore func (r *runSyscallAfterSysemuStop) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *runSyscallReinvoke) StateTypeName() string { return "pkg/sentry/kernel.runSyscallReinvoke" } func (r *runSyscallReinvoke) StateFields() []string { return []string{} } func (r *runSyscallReinvoke) beforeSave() {} // +checklocksignore func (r *runSyscallReinvoke) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r *runSyscallReinvoke) afterLoad(context.Context) {} // +checklocksignore func (r *runSyscallReinvoke) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *runSyscallExit) StateTypeName() string { return "pkg/sentry/kernel.runSyscallExit" } func (r *runSyscallExit) StateFields() []string { return []string{} } func (r *runSyscallExit) beforeSave() {} // +checklocksignore func (r *runSyscallExit) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r *runSyscallExit) afterLoad(context.Context) {} // +checklocksignore func (r *runSyscallExit) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (tg *ThreadGroup) StateTypeName() string { return "pkg/sentry/kernel.ThreadGroup" } func (tg *ThreadGroup) StateFields() []string { return []string{ "threadGroupNode", "signalHandlers", "pendingSignals", "groupStopDequeued", "groupStopSignal", "groupStopPendingCount", "groupStopComplete", "groupStopWaitable", "groupContNotify", "groupContInterrupted", "groupContWaitable", "exiting", "exitStatus", "terminationSignal", "itimerRealTimer", "itimerVirtSetting", "itimerProfSetting", "rlimitCPUSoftSetting", "cpuTimersEnabled", "timers", "nextTimerID", "exitedCPUStats", "childCPUStats", "ioUsage", "maxRSS", "childMaxRSS", "limits", "processGroup", "execed", "oldRSeqCritical", "tty", "oomScoreAdj", "isChildSubreaper", "hasChildSubreaper", } } func (tg *ThreadGroup) beforeSave() {} // +checklocksignore func (tg *ThreadGroup) StateSave(stateSinkObject state.Sink) { tg.beforeSave() var oldRSeqCriticalValue *OldRSeqCriticalRegion oldRSeqCriticalValue = tg.saveOldRSeqCritical() stateSinkObject.SaveValue(29, oldRSeqCriticalValue) stateSinkObject.Save(0, &tg.threadGroupNode) stateSinkObject.Save(1, &tg.signalHandlers) stateSinkObject.Save(2, &tg.pendingSignals) stateSinkObject.Save(3, &tg.groupStopDequeued) stateSinkObject.Save(4, &tg.groupStopSignal) stateSinkObject.Save(5, &tg.groupStopPendingCount) stateSinkObject.Save(6, &tg.groupStopComplete) stateSinkObject.Save(7, &tg.groupStopWaitable) stateSinkObject.Save(8, &tg.groupContNotify) stateSinkObject.Save(9, &tg.groupContInterrupted) stateSinkObject.Save(10, &tg.groupContWaitable) stateSinkObject.Save(11, &tg.exiting) stateSinkObject.Save(12, &tg.exitStatus) stateSinkObject.Save(13, &tg.terminationSignal) stateSinkObject.Save(14, &tg.itimerRealTimer) stateSinkObject.Save(15, &tg.itimerVirtSetting) stateSinkObject.Save(16, &tg.itimerProfSetting) 
stateSinkObject.Save(17, &tg.rlimitCPUSoftSetting) stateSinkObject.Save(18, &tg.cpuTimersEnabled) stateSinkObject.Save(19, &tg.timers) stateSinkObject.Save(20, &tg.nextTimerID) stateSinkObject.Save(21, &tg.exitedCPUStats) stateSinkObject.Save(22, &tg.childCPUStats) stateSinkObject.Save(23, &tg.ioUsage) stateSinkObject.Save(24, &tg.maxRSS) stateSinkObject.Save(25, &tg.childMaxRSS) stateSinkObject.Save(26, &tg.limits) stateSinkObject.Save(27, &tg.processGroup) stateSinkObject.Save(28, &tg.execed) stateSinkObject.Save(30, &tg.tty) stateSinkObject.Save(31, &tg.oomScoreAdj) stateSinkObject.Save(32, &tg.isChildSubreaper) stateSinkObject.Save(33, &tg.hasChildSubreaper) } func (tg *ThreadGroup) afterLoad(context.Context) {} // +checklocksignore func (tg *ThreadGroup) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &tg.threadGroupNode) stateSourceObject.Load(1, &tg.signalHandlers) stateSourceObject.Load(2, &tg.pendingSignals) stateSourceObject.Load(3, &tg.groupStopDequeued) stateSourceObject.Load(4, &tg.groupStopSignal) stateSourceObject.Load(5, &tg.groupStopPendingCount) stateSourceObject.Load(6, &tg.groupStopComplete) stateSourceObject.Load(7, &tg.groupStopWaitable) stateSourceObject.Load(8, &tg.groupContNotify) stateSourceObject.Load(9, &tg.groupContInterrupted) stateSourceObject.Load(10, &tg.groupContWaitable) stateSourceObject.Load(11, &tg.exiting) stateSourceObject.Load(12, &tg.exitStatus) stateSourceObject.Load(13, &tg.terminationSignal) stateSourceObject.Load(14, &tg.itimerRealTimer) stateSourceObject.Load(15, &tg.itimerVirtSetting) stateSourceObject.Load(16, &tg.itimerProfSetting) stateSourceObject.Load(17, &tg.rlimitCPUSoftSetting) stateSourceObject.Load(18, &tg.cpuTimersEnabled) stateSourceObject.Load(19, &tg.timers) stateSourceObject.Load(20, &tg.nextTimerID) stateSourceObject.Load(21, &tg.exitedCPUStats) stateSourceObject.Load(22, &tg.childCPUStats) stateSourceObject.Load(23, &tg.ioUsage) stateSourceObject.Load(24, &tg.maxRSS) stateSourceObject.Load(25, &tg.childMaxRSS) stateSourceObject.Load(26, &tg.limits) stateSourceObject.Load(27, &tg.processGroup) stateSourceObject.Load(28, &tg.execed) stateSourceObject.Load(30, &tg.tty) stateSourceObject.Load(31, &tg.oomScoreAdj) stateSourceObject.Load(32, &tg.isChildSubreaper) stateSourceObject.Load(33, &tg.hasChildSubreaper) stateSourceObject.LoadValue(29, new(*OldRSeqCriticalRegion), func(y any) { tg.loadOldRSeqCritical(ctx, y.(*OldRSeqCriticalRegion)) }) } func (l *itimerRealListener) StateTypeName() string { return "pkg/sentry/kernel.itimerRealListener" } func (l *itimerRealListener) StateFields() []string { return []string{ "tg", } } func (l *itimerRealListener) beforeSave() {} // +checklocksignore func (l *itimerRealListener) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.tg) } func (l *itimerRealListener) afterLoad(context.Context) {} // +checklocksignore func (l *itimerRealListener) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.tg) } func (ts *TaskSet) StateTypeName() string { return "pkg/sentry/kernel.TaskSet" } func (ts *TaskSet) StateFields() []string { return []string{ "Root", "sessions", } } func (ts *TaskSet) beforeSave() {} // +checklocksignore func (ts *TaskSet) StateSave(stateSinkObject state.Sink) { ts.beforeSave() stateSinkObject.Save(0, &ts.Root) stateSinkObject.Save(1, &ts.sessions) } func (ts *TaskSet) afterLoad(context.Context) {} // +checklocksignore func (ts *TaskSet) StateLoad(ctx 
context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ts.Root) stateSourceObject.Load(1, &ts.sessions) } func (ns *PIDNamespace) StateTypeName() string { return "pkg/sentry/kernel.PIDNamespace" } func (ns *PIDNamespace) StateFields() []string { return []string{ "owner", "parent", "userns", "id", "last", "tasks", "tids", "tgids", "sessions", "sids", "processGroups", "pgids", "exiting", "extra", } } func (ns *PIDNamespace) beforeSave() {} // +checklocksignore func (ns *PIDNamespace) StateSave(stateSinkObject state.Sink) { ns.beforeSave() stateSinkObject.Save(0, &ns.owner) stateSinkObject.Save(1, &ns.parent) stateSinkObject.Save(2, &ns.userns) stateSinkObject.Save(3, &ns.id) stateSinkObject.Save(4, &ns.last) stateSinkObject.Save(5, &ns.tasks) stateSinkObject.Save(6, &ns.tids) stateSinkObject.Save(7, &ns.tgids) stateSinkObject.Save(8, &ns.sessions) stateSinkObject.Save(9, &ns.sids) stateSinkObject.Save(10, &ns.processGroups) stateSinkObject.Save(11, &ns.pgids) stateSinkObject.Save(12, &ns.exiting) stateSinkObject.Save(13, &ns.extra) } func (ns *PIDNamespace) afterLoad(context.Context) {} // +checklocksignore func (ns *PIDNamespace) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ns.owner) stateSourceObject.Load(1, &ns.parent) stateSourceObject.Load(2, &ns.userns) stateSourceObject.Load(3, &ns.id) stateSourceObject.Load(4, &ns.last) stateSourceObject.Load(5, &ns.tasks) stateSourceObject.Load(6, &ns.tids) stateSourceObject.Load(7, &ns.tgids) stateSourceObject.Load(8, &ns.sessions) stateSourceObject.Load(9, &ns.sids) stateSourceObject.Load(10, &ns.processGroups) stateSourceObject.Load(11, &ns.pgids) stateSourceObject.Load(12, &ns.exiting) stateSourceObject.Load(13, &ns.extra) } func (t *threadGroupNode) StateTypeName() string { return "pkg/sentry/kernel.threadGroupNode" } func (t *threadGroupNode) StateFields() []string { return []string{ "pidns", "pidWithinNS", "eventQueue", "leader", "execing", "tasks", "tasksCount", "liveTasks", "activeTasks", } } func (t *threadGroupNode) beforeSave() {} // +checklocksignore func (t *threadGroupNode) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.pidns) stateSinkObject.Save(1, &t.pidWithinNS) stateSinkObject.Save(2, &t.eventQueue) stateSinkObject.Save(3, &t.leader) stateSinkObject.Save(4, &t.execing) stateSinkObject.Save(5, &t.tasks) stateSinkObject.Save(6, &t.tasksCount) stateSinkObject.Save(7, &t.liveTasks) stateSinkObject.Save(8, &t.activeTasks) } func (t *threadGroupNode) afterLoad(context.Context) {} // +checklocksignore func (t *threadGroupNode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.pidns) stateSourceObject.Load(1, &t.pidWithinNS) stateSourceObject.Load(2, &t.eventQueue) stateSourceObject.Load(3, &t.leader) stateSourceObject.Load(4, &t.execing) stateSourceObject.Load(5, &t.tasks) stateSourceObject.Load(6, &t.tasksCount) stateSourceObject.Load(7, &t.liveTasks) stateSourceObject.Load(8, &t.activeTasks) } func (t *taskNode) StateTypeName() string { return "pkg/sentry/kernel.taskNode" } func (t *taskNode) StateFields() []string { return []string{ "tg", "taskEntry", "parent", "children", "childPIDNamespace", } } func (t *taskNode) beforeSave() {} // +checklocksignore func (t *taskNode) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.tg) stateSinkObject.Save(1, &t.taskEntry) stateSinkObject.Save(2, &t.parent) stateSinkObject.Save(3, &t.children) stateSinkObject.Save(4, 
&t.childPIDNamespace) } func (t *taskNode) afterLoad(context.Context) {} // +checklocksignore func (t *taskNode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadWait(0, &t.tg) stateSourceObject.Load(1, &t.taskEntry) stateSourceObject.Load(2, &t.parent) stateSourceObject.Load(3, &t.children) stateSourceObject.Load(4, &t.childPIDNamespace) } func (t *Timekeeper) StateTypeName() string { return "pkg/sentry/kernel.Timekeeper" } func (t *Timekeeper) StateFields() []string { return []string{ "realtimeClock", "monotonicClock", "bootTime", "saveMonotonic", "saveRealtime", } } // +checklocksignore func (t *Timekeeper) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.realtimeClock) stateSinkObject.Save(1, &t.monotonicClock) stateSinkObject.Save(2, &t.bootTime) stateSinkObject.Save(3, &t.saveMonotonic) stateSinkObject.Save(4, &t.saveRealtime) } // +checklocksignore func (t *Timekeeper) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.realtimeClock) stateSourceObject.Load(1, &t.monotonicClock) stateSourceObject.Load(2, &t.bootTime) stateSourceObject.Load(3, &t.saveMonotonic) stateSourceObject.Load(4, &t.saveRealtime) stateSourceObject.AfterLoad(func() { t.afterLoad(ctx) }) } func (tc *timekeeperClock) StateTypeName() string { return "pkg/sentry/kernel.timekeeperClock" } func (tc *timekeeperClock) StateFields() []string { return []string{ "tk", "c", } } func (tc *timekeeperClock) beforeSave() {} // +checklocksignore func (tc *timekeeperClock) StateSave(stateSinkObject state.Sink) { tc.beforeSave() stateSinkObject.Save(0, &tc.tk) stateSinkObject.Save(1, &tc.c) } func (tc *timekeeperClock) afterLoad(context.Context) {} // +checklocksignore func (tc *timekeeperClock) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &tc.tk) stateSourceObject.Load(1, &tc.c) } func (tty *TTY) StateTypeName() string { return "pkg/sentry/kernel.TTY" } func (tty *TTY) StateFields() []string { return []string{ "Index", "tg", } } func (tty *TTY) beforeSave() {} // +checklocksignore func (tty *TTY) StateSave(stateSinkObject state.Sink) { tty.beforeSave() stateSinkObject.Save(0, &tty.Index) stateSinkObject.Save(1, &tty.tg) } func (tty *TTY) afterLoad(context.Context) {} // +checklocksignore func (tty *TTY) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &tty.Index) stateSourceObject.Load(1, &tty.tg) } func (u *UTSNamespace) StateTypeName() string { return "pkg/sentry/kernel.UTSNamespace" } func (u *UTSNamespace) StateFields() []string { return []string{ "hostName", "domainName", "userns", "inode", } } func (u *UTSNamespace) beforeSave() {} // +checklocksignore func (u *UTSNamespace) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.hostName) stateSinkObject.Save(1, &u.domainName) stateSinkObject.Save(2, &u.userns) stateSinkObject.Save(3, &u.inode) } func (u *UTSNamespace) afterLoad(context.Context) {} // +checklocksignore func (u *UTSNamespace) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.hostName) stateSourceObject.Load(1, &u.domainName) stateSourceObject.Load(2, &u.userns) stateSourceObject.Load(3, &u.inode) } func (v *VDSOParamPage) StateTypeName() string { return "pkg/sentry/kernel.VDSOParamPage" } func (v *VDSOParamPage) StateFields() []string { return []string{ "fr", "seq", "copyScratchBuffer", } } func (v *VDSOParamPage) beforeSave() {} // 
+checklocksignore func (v *VDSOParamPage) StateSave(stateSinkObject state.Sink) { v.beforeSave() stateSinkObject.Save(0, &v.fr) stateSinkObject.Save(1, &v.seq) stateSinkObject.Save(2, &v.copyScratchBuffer) } // +checklocksignore func (v *VDSOParamPage) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &v.fr) stateSourceObject.Load(1, &v.seq) stateSourceObject.Load(2, &v.copyScratchBuffer) stateSourceObject.AfterLoad(func() { v.afterLoad(ctx) }) } func init() { state.Register((*Cgroup)(nil)) state.Register((*hierarchy)(nil)) state.Register((*CgroupRegistry)(nil)) state.Register((*FDFlags)(nil)) state.Register((*descriptor)(nil)) state.Register((*FDTable)(nil)) state.Register((*FDTableRefs)(nil)) state.Register((*FSContext)(nil)) state.Register((*FSContextRefs)(nil)) state.Register((*IPCNamespace)(nil)) state.Register((*UserCounters)(nil)) state.Register((*CgroupMount)(nil)) state.Register((*Kernel)(nil)) state.Register((*privateMemoryFileMetadata)(nil)) state.Register((*SocketRecord)(nil)) state.Register((*pendingSignals)(nil)) state.Register((*pendingSignalQueue)(nil)) state.Register((*pendingSignal)(nil)) state.Register((*pendingSignalList)(nil)) state.Register((*pendingSignalEntry)(nil)) state.Register((*savedPendingSignal)(nil)) state.Register((*IntervalTimer)(nil)) state.Register((*processGroupList)(nil)) state.Register((*processGroupEntry)(nil)) state.Register((*ProcessGroupRefs)(nil)) state.Register((*ptraceOptions)(nil)) state.Register((*ptraceStop)(nil)) state.Register((*OldRSeqCriticalRegion)(nil)) state.Register((*taskSeccomp)(nil)) state.Register((*sessionList)(nil)) state.Register((*sessionEntry)(nil)) state.Register((*SessionRefs)(nil)) state.Register((*Session)(nil)) state.Register((*ProcessGroup)(nil)) state.Register((*SignalHandlers)(nil)) state.Register((*syscallTableInfo)(nil)) state.Register((*syslog)(nil)) state.Register((*Task)(nil)) state.Register((*runSyscallAfterPtraceEventClone)(nil)) state.Register((*runSyscallAfterVforkStop)(nil)) state.Register((*vforkStop)(nil)) state.Register((*execStop)(nil)) state.Register((*runSyscallAfterExecStop)(nil)) state.Register((*runExit)(nil)) state.Register((*runExitMain)(nil)) state.Register((*runExitNotify)(nil)) state.Register((*TaskImage)(nil)) state.Register((*taskList)(nil)) state.Register((*taskEntry)(nil)) state.Register((*runApp)(nil)) state.Register((*TaskGoroutineSchedInfo)(nil)) state.Register((*taskClock)(nil)) state.Register((*tgClock)(nil)) state.Register((*groupStop)(nil)) state.Register((*runInterrupt)(nil)) state.Register((*runInterruptAfterSignalDeliveryStop)(nil)) state.Register((*runSyscallAfterSyscallEnterStop)(nil)) state.Register((*runSyscallAfterSysemuStop)(nil)) state.Register((*runSyscallReinvoke)(nil)) state.Register((*runSyscallExit)(nil)) state.Register((*ThreadGroup)(nil)) state.Register((*itimerRealListener)(nil)) state.Register((*TaskSet)(nil)) state.Register((*PIDNamespace)(nil)) state.Register((*threadGroupNode)(nil)) state.Register((*taskNode)(nil)) state.Register((*Timekeeper)(nil)) state.Register((*timekeeperClock)(nil)) state.Register((*TTY)(nil)) state.Register((*UTSNamespace)(nil)) state.Register((*VDSOParamPage)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_unsafe_abi_autogen_unsafe.go000066400000000000000000000001461465435605700312770ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. 
package kernel import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/kernel_unsafe_state_autogen.go000066400000000000000000000050621465435605700303250ustar00rootroot00000000000000// automatically generated by stateify. package kernel import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *descriptorBucketSliceAtomicPtr) StateTypeName() string { return "pkg/sentry/kernel.descriptorBucketSliceAtomicPtr" } func (p *descriptorBucketSliceAtomicPtr) StateFields() []string { return []string{ "ptr", } } func (p *descriptorBucketSliceAtomicPtr) beforeSave() {} // +checklocksignore func (p *descriptorBucketSliceAtomicPtr) StateSave(stateSinkObject state.Sink) { p.beforeSave() var ptrValue *descriptorBucketSlice ptrValue = p.savePtr() stateSinkObject.SaveValue(0, ptrValue) } func (p *descriptorBucketSliceAtomicPtr) afterLoad(context.Context) {} // +checklocksignore func (p *descriptorBucketSliceAtomicPtr) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new(*descriptorBucketSlice), func(y any) { p.loadPtr(ctx, y.(*descriptorBucketSlice)) }) } func (p *descriptorBucketAtomicPtr) StateTypeName() string { return "pkg/sentry/kernel.descriptorBucketAtomicPtr" } func (p *descriptorBucketAtomicPtr) StateFields() []string { return []string{ "ptr", } } func (p *descriptorBucketAtomicPtr) beforeSave() {} // +checklocksignore func (p *descriptorBucketAtomicPtr) StateSave(stateSinkObject state.Sink) { p.beforeSave() var ptrValue *descriptorBucket ptrValue = p.savePtr() stateSinkObject.SaveValue(0, ptrValue) } func (p *descriptorBucketAtomicPtr) afterLoad(context.Context) {} // +checklocksignore func (p *descriptorBucketAtomicPtr) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new(*descriptorBucket), func(y any) { p.loadPtr(ctx, y.(*descriptorBucket)) }) } func (p *descriptorAtomicPtr) StateTypeName() string { return "pkg/sentry/kernel.descriptorAtomicPtr" } func (p *descriptorAtomicPtr) StateFields() []string { return []string{ "ptr", } } func (p *descriptorAtomicPtr) beforeSave() {} // +checklocksignore func (p *descriptorAtomicPtr) StateSave(stateSinkObject state.Sink) { p.beforeSave() var ptrValue *descriptor ptrValue = p.savePtr() stateSinkObject.SaveValue(0, ptrValue) } func (p *descriptorAtomicPtr) afterLoad(context.Context) {} // +checklocksignore func (p *descriptorAtomicPtr) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new(*descriptor), func(y any) { p.loadPtr(ctx, y.(*descriptor)) }) } func init() { state.Register((*descriptorBucketSliceAtomicPtr)(nil)) state.Register((*descriptorBucketAtomicPtr)(nil)) state.Register((*descriptorAtomicPtr)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/memevent/000077500000000000000000000000001465435605700240505ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/memevent/memevent_state_autogen.go000066400000000000000000000000721465435605700311400ustar00rootroot00000000000000// automatically generated by stateify. package memevent golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/memevent/memory_events.go000066400000000000000000000064051465435605700273000ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package memevent implements the memory usage events controller, which // periodically emits events via the eventchannel. package memevent import ( "time" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/kernel" pb "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" ) var totalTicks = metric.MustCreateNewUint64Metric("/memory_events/ticks", metric.Uint64Metadata{ Cumulative: true, Description: "Total number of memory event periods that have elapsed since startup.", }) var totalEvents = metric.MustCreateNewUint64Metric("/memory_events/events", metric.Uint64Metadata{ Cumulative: true, Description: "Total number of memory events emitted.", }) // MemoryEvents describes the configuration for the global memory event emitter. type MemoryEvents struct { k *kernel.Kernel // The period is how often to emit an event. The memory events goroutine // will ensure a minimum of one event is emitted per this period, regardless // of how much memory usage has changed. period time.Duration // Writing to this channel indicates the memory goroutine should stop. stop chan struct{} // done is used to signal when the memory event goroutine has exited. done sync.WaitGroup } // New creates a new MemoryEvents. func New(k *kernel.Kernel, period time.Duration) *MemoryEvents { return &MemoryEvents{ k: k, period: period, stop: make(chan struct{}), } } // Stop stops the memory usage events emitter goroutine. Stop must not be called // concurrently with Start and may only be called once. func (m *MemoryEvents) Stop() { close(m.stop) m.done.Wait() } // Start starts the memory usage events emitter goroutine. Start must not be // called concurrently with Stop and may only be called once. func (m *MemoryEvents) Start() { if m.period == 0 { return } m.done.Add(1) go m.run() // S/R-SAFE: doesn't interact with saved state. } func (m *MemoryEvents) run() { defer m.done.Done() // Emit the first event immediately on startup. totalTicks.Increment() m.emit() ticker := time.NewTicker(m.period) defer ticker.Stop() for { select { case <-m.stop: return case <-ticker.C: totalTicks.Increment() m.emit() } } } func (m *MemoryEvents) emit() { totalPlatform, err := m.k.MemoryFile().TotalUsage() if err != nil { log.Warningf("Failed to fetch memory usage for memory events: %v", err) return } snapshot, _ := usage.MemoryAccounting.Copy() total := totalPlatform + snapshot.Mapped totalEvents.Increment() eventchannel.Emit(&pb.MemoryUsageEvent{ Mapped: snapshot.Mapped, Total: total, }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/memevent/memory_events_go_proto/000077500000000000000000000000001465435605700306545ustar00rootroot00000000000000memory_events.pb.go000066400000000000000000000127051465435605700344250ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/memevent/memory_events_go_proto// Code generated by protoc-gen-go. DO NOT EDIT.
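// Illustrative usage sketch for the memevent package above (the caller shown
// here is hypothetical; only New, Start, and Stop are taken from the source):
// a MemoryEvents emitter is constructed once with the kernel and an emission
// period, started after boot, and stopped at shutdown. Every period it emits
// a MemoryUsageEvent over the event channel.
//
//	me := memevent.New(k, 30*time.Second) // k is a *kernel.Kernel
//	me.Start()                            // spawns the emitter goroutine
//	defer me.Stop()                       // must not be called concurrently with Start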
// versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/sentry/kernel/memevent/memory_events.proto package memory_events_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type MemoryUsageEvent struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Total uint64 `protobuf:"varint,1,opt,name=total,proto3" json:"total,omitempty"` Mapped uint64 `protobuf:"varint,2,opt,name=mapped,proto3" json:"mapped,omitempty"` } func (x *MemoryUsageEvent) Reset() { *x = MemoryUsageEvent{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_kernel_memevent_memory_events_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *MemoryUsageEvent) String() string { return protoimpl.X.MessageStringOf(x) } func (*MemoryUsageEvent) ProtoMessage() {} func (x *MemoryUsageEvent) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_kernel_memevent_memory_events_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use MemoryUsageEvent.ProtoReflect.Descriptor instead. func (*MemoryUsageEvent) Descriptor() ([]byte, []int) { return file_pkg_sentry_kernel_memevent_memory_events_proto_rawDescGZIP(), []int{0} } func (x *MemoryUsageEvent) GetTotal() uint64 { if x != nil { return x.Total } return 0 } func (x *MemoryUsageEvent) GetMapped() uint64 { if x != nil { return x.Mapped } return 0 } var File_pkg_sentry_kernel_memevent_memory_events_proto protoreflect.FileDescriptor var file_pkg_sentry_kernel_memevent_memory_events_proto_rawDesc = []byte{ 0x0a, 0x2e, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x2f, 0x6d, 0x65, 0x6d, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2f, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x5f, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x06, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x22, 0x40, 0x0a, 0x10, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x55, 0x73, 0x61, 0x67, 0x65, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x12, 0x16, 0x0a, 0x06, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x64, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_sentry_kernel_memevent_memory_events_proto_rawDescOnce sync.Once file_pkg_sentry_kernel_memevent_memory_events_proto_rawDescData = file_pkg_sentry_kernel_memevent_memory_events_proto_rawDesc ) func file_pkg_sentry_kernel_memevent_memory_events_proto_rawDescGZIP() []byte { file_pkg_sentry_kernel_memevent_memory_events_proto_rawDescOnce.Do(func() { file_pkg_sentry_kernel_memevent_memory_events_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_sentry_kernel_memevent_memory_events_proto_rawDescData) }) return file_pkg_sentry_kernel_memevent_memory_events_proto_rawDescData } var file_pkg_sentry_kernel_memevent_memory_events_proto_msgTypes = 
make([]protoimpl.MessageInfo, 1) var file_pkg_sentry_kernel_memevent_memory_events_proto_goTypes = []interface{}{ (*MemoryUsageEvent)(nil), // 0: gvisor.MemoryUsageEvent } var file_pkg_sentry_kernel_memevent_memory_events_proto_depIdxs = []int32{ 0, // [0:0] is the sub-list for method output_type 0, // [0:0] is the sub-list for method input_type 0, // [0:0] is the sub-list for extension type_name 0, // [0:0] is the sub-list for extension extendee 0, // [0:0] is the sub-list for field type_name } func init() { file_pkg_sentry_kernel_memevent_memory_events_proto_init() } func file_pkg_sentry_kernel_memevent_memory_events_proto_init() { if File_pkg_sentry_kernel_memevent_memory_events_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_sentry_kernel_memevent_memory_events_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*MemoryUsageEvent); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_sentry_kernel_memevent_memory_events_proto_rawDesc, NumEnums: 0, NumMessages: 1, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_sentry_kernel_memevent_memory_events_proto_goTypes, DependencyIndexes: file_pkg_sentry_kernel_memevent_memory_events_proto_depIdxs, MessageInfos: file_pkg_sentry_kernel_memevent_memory_events_proto_msgTypes, }.Build() File_pkg_sentry_kernel_memevent_memory_events_proto = out.File file_pkg_sentry_kernel_memevent_memory_events_proto_rawDesc = nil file_pkg_sentry_kernel_memevent_memory_events_proto_goTypes = nil file_pkg_sentry_kernel_memevent_memory_events_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/mq/000077500000000000000000000000001465435605700226455ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/mq/message_list.go000066400000000000000000000116311465435605700256550ustar00rootroot00000000000000package mq // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type msgElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (msgElementMapper) linkerFor(elem *Message) *Message { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type msgList struct { head *Message tail *Message } // Reset resets list l to the empty state. func (l *msgList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *msgList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *msgList) Front() *Message { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *msgList) Back() *Message { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. 
// //go:nosplit func (l *msgList) Len() (count int) { for e := l.Front(); e != nil; e = (msgElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *msgList) PushFront(e *Message) { linker := msgElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { msgElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *msgList) PushFrontList(m *msgList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { msgElementMapper{}.linkerFor(l.head).SetPrev(m.tail) msgElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *msgList) PushBack(e *Message) { linker := msgElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { msgElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *msgList) PushBackList(m *msgList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { msgElementMapper{}.linkerFor(l.tail).SetNext(m.head) msgElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *msgList) InsertAfter(b, e *Message) { bLinker := msgElementMapper{}.linkerFor(b) eLinker := msgElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { msgElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *msgList) InsertBefore(a, e *Message) { aLinker := msgElementMapper{}.linkerFor(a) eLinker := msgElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { msgElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *msgList) Remove(e *Message) { linker := msgElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { msgElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { msgElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type msgEntry struct { next *Message prev *Message } // Next returns the entry that follows e in the list. // //go:nosplit func (e *msgEntry) Next() *Message { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *msgEntry) Prev() *Message { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *msgEntry) SetNext(elem *Message) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. 
// //go:nosplit func (e *msgEntry) SetPrev(elem *Message) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/mq/mq.go000066400000000000000000000324661465435605700236240ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package mq provides an implementation for POSIX message queues. package mq import ( "bytes" "fmt" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // AccessType is the access type passed to mq_open. type AccessType int // Possible access types. const ( ReadOnly AccessType = iota WriteOnly ReadWrite ) // MaxName is the maximum size for a queue name. const MaxName = 255 const ( maxPriority = linux.MQ_PRIO_MAX - 1 // Highest possible message priority. maxQueuesDefault = linux.DFLT_QUEUESMAX // Default max number of queues. maxMsgDefault = linux.DFLT_MSG // Default max number of messages per queue. maxMsgMin = linux.MIN_MSGMAX // Min value for max number of messages per queue. maxMsgLimit = linux.DFLT_MSGMAX // Limit for max number of messages per queue. maxMsgHardLimit = linux.HARD_MSGMAX // Hard limit for max number of messages per queue. msgSizeDefault = linux.DFLT_MSGSIZE // Default max message size. msgSizeMin = linux.MIN_MSGSIZEMAX // Min value for max message size. msgSizeLimit = linux.DFLT_MSGSIZEMAX // Limit for max message size. msgSizeHardLimit = linux.HARD_MSGSIZEMAX // Hard limit for max message size. ) // Registry is a POSIX message queue registry. // // Unlike SysV utilities, Registry is not map-based. It uses a provided // RegistryImpl backed by a virtual filesystem to implement registry operations. // // +stateify savable type Registry struct { // userNS is the user namespace containing this registry. Immutable. userNS *auth.UserNamespace // mu protects all fields below. mu sync.Mutex `state:"nosave"` // impl is an implementation of several message queue utilities needed by // the registry. impl should be provided by mqfs. impl RegistryImpl } // RegistryImpl defines utilities needed by a Registry to provide actual // registry implementation. It works mainly as an abstraction layer used by // Registry to avoid dealing directly with the filesystem. RegistryImpl should // be implemented by mqfs and provided to Registry at initialization. type RegistryImpl interface { // Get searches for a queue with the given name, if it exists, the queue is // used to create a new FD, return it and return true. If the queue doesn't // exist, return false and no error. An error is returned if creation fails. 
Get(ctx context.Context, name string, access AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error) // New creates a new inode and file description using the given queue, // inserts the inode into the filesystem tree using the given name, and // returns the file description. An error is returned if creation fails, or // if the name already exists. New(ctx context.Context, name string, q *Queue, access AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) // Unlink removes the queue with given name from the registry, and returns // an error if the name doesn't exist. Unlink(ctx context.Context, name string) error // Destroy destroys the registry. Destroy(context.Context) } // NewRegistry returns a new, initialized message queue registry. NewRegistry // should be called when a new message queue filesystem is created, once per // IPCNamespace. func NewRegistry(userNS *auth.UserNamespace, impl RegistryImpl) *Registry { return &Registry{ userNS: userNS, impl: impl, } } // OpenOpts holds the options passed to FindOrCreate. type OpenOpts struct { Name string Access AccessType Create bool Exclusive bool Block bool } // FindOrCreate creates a new POSIX message queue or opens an existing queue. // See mq_open(2). func (r *Registry) FindOrCreate(ctx context.Context, opts OpenOpts, mode linux.FileMode, attr *linux.MqAttr) (*vfs.FileDescription, error) { // mq_overview(7) mentions that: "Each message queue is identified by a name // of the form '/somename'", but the mq_open(3) man pages mention: // "The mq_open() library function is implemented on top of a system call // of the same name. The library function performs the check that the // name starts with a slash (/), giving the EINVAL error if it does not. // The kernel system call expects name to contain no preceding slash, so // the C library function passes name without the preceding slash (i.e., // name+1) to the system call." // So we don't need to check it. if len(opts.Name) == 0 { return nil, linuxerr.ENOENT } if len(opts.Name) > MaxName { return nil, linuxerr.ENAMETOOLONG } if strings.ContainsRune(opts.Name, '/') { return nil, linuxerr.EACCES } if opts.Name == "." || opts.Name == ".." { return nil, linuxerr.EINVAL } // Construct status flags. var flags uint32 if opts.Block { flags = linux.O_NONBLOCK } switch opts.Access { case ReadOnly: flags = flags | linux.O_RDONLY case WriteOnly: flags = flags | linux.O_WRONLY case ReadWrite: flags = flags | linux.O_RDWR } r.mu.Lock() defer r.mu.Unlock() fd, ok, err := r.impl.Get(ctx, opts.Name, opts.Access, opts.Block, flags) if err != nil { return nil, err } if ok { if opts.Create && opts.Exclusive { // "Both O_CREAT and O_EXCL were specified in oflag, but a queue // with this name already exists." fd.DecRef(ctx) return nil, linuxerr.EEXIST } return fd, nil } if !opts.Create { // "The O_CREAT flag was not specified in oflag, and no queue with this name // exists." return nil, linuxerr.ENOENT } q, err := r.newQueueLocked(auth.CredentialsFromContext(ctx), mode, attr) if err != nil { return nil, err } return r.impl.New(ctx, opts.Name, q, opts.Access, opts.Block, mode.Permissions(), flags) } // newQueueLocked creates a new queue using the given attributes. If attr is nil // return a queue with default values, otherwise use attr to create a new queue, // and return an error if attributes are invalid. 
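// Illustrative sketch of how FindOrCreate above is typically driven (the
// caller and variable names here are hypothetical; in gVisor the mq_open
// syscall path translates the oflag bits into OpenOpts before consulting the
// registry):
//
//	opts := mq.OpenOpts{
//		Name:      "myqueue",    // leading '/' already stripped by the caller
//		Access:    mq.ReadWrite, // from O_RDONLY / O_WRONLY / O_RDWR
//		Create:    true,         // O_CREAT
//		Exclusive: false,        // O_EXCL
//		Block:     true,         // inverse of O_NONBLOCK
//	}
//	fd, err := registry.FindOrCreate(ctx, opts, linux.FileMode(0644), nil) // nil attr selects defaults
//	if err != nil {
//		// handle EEXIST, ENOENT, EINVAL, etc.
//	}
//	defer fd.DecRef(ctx)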
func (r *Registry) newQueueLocked(creds *auth.Credentials, mode linux.FileMode, attr *linux.MqAttr) (*Queue, error) { if attr == nil { return &Queue{ ownerUID: creds.EffectiveKUID, ownerGID: creds.EffectiveKGID, mode: mode, maxMessageCount: int64(maxMsgDefault), maxMessageSize: uint64(msgSizeDefault), }, nil } // "O_CREAT was specified in oflag, and attr was not NULL, but // attr->mq_maxmsg or attr->mq_msgsize was invalid. Both of these fields // must be greater than zero. In a process that is // unprivileged (does not have the CAP_SYS_RESOURCE capability), // attr->mq_maxmsg must be less than or equal to the msg_max limit, and // attr->mq_msgsize must be less than or equal to the msgsize_max limit. // In addition, even in a privileged process, attr->mq_maxmsg cannot // exceed the HARD_MAX limit." - man mq_open(3). if attr.MqMaxmsg <= 0 || attr.MqMsgsize <= 0 { return nil, linuxerr.EINVAL } if attr.MqMaxmsg > maxMsgHardLimit || (!creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, r.userNS) && (attr.MqMaxmsg > maxMsgLimit || attr.MqMsgsize > msgSizeLimit)) { return nil, linuxerr.EINVAL } return &Queue{ ownerUID: creds.EffectiveKUID, ownerGID: creds.EffectiveKGID, mode: mode, maxMessageCount: attr.MqMaxmsg, maxMessageSize: uint64(attr.MqMsgsize), }, nil } // Remove removes the queue with the given name from the registry. See // mq_unlink(2). func (r *Registry) Remove(ctx context.Context, name string) error { if len(name) > MaxName { return linuxerr.ENAMETOOLONG } r.mu.Lock() defer r.mu.Unlock() return r.impl.Unlink(ctx, name) } // Destroy destroys the registry and releases all held references. func (r *Registry) Destroy(ctx context.Context) { r.mu.Lock() defer r.mu.Unlock() r.impl.Destroy(ctx) } // Impl returns RegistryImpl inside r. func (r *Registry) Impl() RegistryImpl { return r.impl } // Queue represents a POSIX message queue. // // +stateify savable type Queue struct { // ownerUID is the queue's owner's UID. Immutable. ownerUID auth.KUID // ownerGID is the queue's owner's GID. Immutable. ownerGID auth.KGID // mode is the queue's access permissions. Immutable. mode linux.FileMode // mu protects all the fields below. mu sync.Mutex `state:"nosave"` // queue is the queue of waiters. queue waiter.Queue // messages is a list of messages currently in the queue. messages msgList // subscriber represents a task registered to receive async notification // from this queue. subscriber *Subscriber // messageCount is the number of messages currently in the queue. messageCount int64 // maxMessageCount is the maximum number of messages that the queue can // hold. maxMessageCount int64 // maxMessageSize is the maximum size of a message held by the queue. maxMessageSize uint64 // byteCount is the number of bytes of data in all messages in the queue. byteCount uint64 } // View is a view into a message queue. Views should only be used in file // descriptions, not in inodes, because we use inodes to retrieve the actual // queue, and only FDs are responsible for providing user functionality. type View interface { // TODO: Add Send and Receive when mq_timedsend(2) and mq_timedreceive(2) // are implemented. // Flush checks if the calling process has attached a notification request // to this queue; if yes, then the request is removed, and another process // can attach a request. Flush(ctx context.Context) waiter.Waitable } // ReaderWriter provides a send and receive view into a queue.
// // +stateify savable type ReaderWriter struct { *Queue block bool } // Reader provides a receive-only view into a queue. // // +stateify savable type Reader struct { *Queue block bool } // Writer provides a send-only view into a queue. // // +stateify savable type Writer struct { *Queue block bool } // NewView creates a new view into a queue and returns it. func NewView(q *Queue, access AccessType, block bool) (View, error) { switch access { case ReadWrite: return ReaderWriter{Queue: q, block: block}, nil case WriteOnly: return Writer{Queue: q, block: block}, nil case ReadOnly: return Reader{Queue: q, block: block}, nil default: // This case can't happen, due to O_RDONLY flag being 0 and O_WRONLY // being 1, so one of them must be true. return nil, linuxerr.EINVAL } } // Message holds a message exchanged through a Queue via mq_timedsend(2) and // mq_timedreceive(2), and additional info relating to the message. // // +stateify savable type Message struct { msgEntry // Text is the message's sent content. Text string // Size is the message's size in bytes. Size uint64 // Priority is the message's priority. Priority uint32 } // Subscriber represents a task registered for async notification from a Queue. // // +stateify savable type Subscriber struct { // TODO: Add fields when mq_notify(2) is implemented. // pid is the PID of the registered task. pid int32 } // Generate implements vfs.DynamicBytesSource.Generate. Queue is used as a // DynamicBytesSource for mqfs's queueInode. func (q *Queue) Generate(ctx context.Context, buf *bytes.Buffer) error { q.mu.Lock() defer q.mu.Unlock() var ( pid int32 method int sigNumber int ) if q.subscriber != nil { pid = q.subscriber.pid // TODO: add method and sigNumber when mq_notify(2) is implemented. } buf.WriteString( fmt.Sprintf("QSIZE:%-10d NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n", q.byteCount, method, sigNumber, pid), ) return nil } // Flush implements View.Flush. func (q *Queue) Flush(ctx context.Context) { q.mu.Lock() defer q.mu.Unlock() pid, ok := auth.ThreadGroupIDFromContext(ctx) if ok { if q.subscriber != nil && pid == q.subscriber.pid { q.subscriber = nil } } } // Readiness implements Waitable.Readiness. func (q *Queue) Readiness(mask waiter.EventMask) waiter.EventMask { q.mu.Lock() defer q.mu.Unlock() events := waiter.EventMask(0) if q.messageCount > 0 { events |= waiter.ReadableEvents } if q.messageCount < q.maxMessageCount { events |= waiter.WritableEvents } return events & mask } // EventRegister implements Waitable.EventRegister. func (q *Queue) EventRegister(e *waiter.Entry) error { q.mu.Lock() defer q.mu.Unlock() q.queue.EventRegister(e) return nil } // EventUnregister implements Waitable.EventUnregister. func (q *Queue) EventUnregister(e *waiter.Entry) { q.mu.Lock() defer q.mu.Unlock() q.queue.EventUnregister(e) } // HasPermissions returns true if the given credentials meet the access // permissions required by the queue. func (q *Queue) HasPermissions(creds *auth.Credentials, req vfs.AccessTypes) bool { perms := uint16(q.mode.Permissions()) if q.ownerUID == creds.EffectiveKUID { perms >>= 6 } else if creds.InGroup(q.ownerGID) { perms >>= 3 } return uint16(req)&perms == uint16(req) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/mq/mq_state_autogen.go000066400000000000000000000152221465435605700265350ustar00rootroot00000000000000// automatically generated by stateify.
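// Worked example for Queue.HasPermissions in mq.go above (the mode and
// requested access are chosen purely for illustration): for a queue with mode
// 0640, the owner compares the request against the permission bits shifted
// right by 6, a group member against the bits shifted right by 3, and
// everyone else against the unshifted bits; access is granted only if every
// requested bit survives the mask.
//
//	perms := uint16(linux.FileMode(0640).Permissions())               // 0b110100000
//	ownerOK := uint16(vfs.MayWrite)&(perms>>6) == uint16(vfs.MayWrite) // true: owner may write
//	groupOK := uint16(vfs.MayWrite)&(perms>>3) == uint16(vfs.MayWrite) // false: group is read-only
//	_, _ = ownerOK, groupOK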
package mq import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (l *msgList) StateTypeName() string { return "pkg/sentry/kernel/mq.msgList" } func (l *msgList) StateFields() []string { return []string{ "head", "tail", } } func (l *msgList) beforeSave() {} // +checklocksignore func (l *msgList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *msgList) afterLoad(context.Context) {} // +checklocksignore func (l *msgList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *msgEntry) StateTypeName() string { return "pkg/sentry/kernel/mq.msgEntry" } func (e *msgEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *msgEntry) beforeSave() {} // +checklocksignore func (e *msgEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *msgEntry) afterLoad(context.Context) {} // +checklocksignore func (e *msgEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (r *Registry) StateTypeName() string { return "pkg/sentry/kernel/mq.Registry" } func (r *Registry) StateFields() []string { return []string{ "userNS", "impl", } } func (r *Registry) beforeSave() {} // +checklocksignore func (r *Registry) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.userNS) stateSinkObject.Save(1, &r.impl) } func (r *Registry) afterLoad(context.Context) {} // +checklocksignore func (r *Registry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.userNS) stateSourceObject.Load(1, &r.impl) } func (q *Queue) StateTypeName() string { return "pkg/sentry/kernel/mq.Queue" } func (q *Queue) StateFields() []string { return []string{ "ownerUID", "ownerGID", "mode", "queue", "messages", "subscriber", "messageCount", "maxMessageCount", "maxMessageSize", "byteCount", } } func (q *Queue) beforeSave() {} // +checklocksignore func (q *Queue) StateSave(stateSinkObject state.Sink) { q.beforeSave() stateSinkObject.Save(0, &q.ownerUID) stateSinkObject.Save(1, &q.ownerGID) stateSinkObject.Save(2, &q.mode) stateSinkObject.Save(3, &q.queue) stateSinkObject.Save(4, &q.messages) stateSinkObject.Save(5, &q.subscriber) stateSinkObject.Save(6, &q.messageCount) stateSinkObject.Save(7, &q.maxMessageCount) stateSinkObject.Save(8, &q.maxMessageSize) stateSinkObject.Save(9, &q.byteCount) } func (q *Queue) afterLoad(context.Context) {} // +checklocksignore func (q *Queue) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &q.ownerUID) stateSourceObject.Load(1, &q.ownerGID) stateSourceObject.Load(2, &q.mode) stateSourceObject.Load(3, &q.queue) stateSourceObject.Load(4, &q.messages) stateSourceObject.Load(5, &q.subscriber) stateSourceObject.Load(6, &q.messageCount) stateSourceObject.Load(7, &q.maxMessageCount) stateSourceObject.Load(8, &q.maxMessageSize) stateSourceObject.Load(9, &q.byteCount) } func (r *ReaderWriter) StateTypeName() string { return "pkg/sentry/kernel/mq.ReaderWriter" } func (r *ReaderWriter) StateFields() []string { return []string{ "Queue", "block", } } func (r *ReaderWriter) beforeSave() {} // +checklocksignore func (r *ReaderWriter) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Queue) stateSinkObject.Save(1, &r.block) } 
func (r *ReaderWriter) afterLoad(context.Context) {} // +checklocksignore func (r *ReaderWriter) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Queue) stateSourceObject.Load(1, &r.block) } func (r *Reader) StateTypeName() string { return "pkg/sentry/kernel/mq.Reader" } func (r *Reader) StateFields() []string { return []string{ "Queue", "block", } } func (r *Reader) beforeSave() {} // +checklocksignore func (r *Reader) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Queue) stateSinkObject.Save(1, &r.block) } func (r *Reader) afterLoad(context.Context) {} // +checklocksignore func (r *Reader) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Queue) stateSourceObject.Load(1, &r.block) } func (w *Writer) StateTypeName() string { return "pkg/sentry/kernel/mq.Writer" } func (w *Writer) StateFields() []string { return []string{ "Queue", "block", } } func (w *Writer) beforeSave() {} // +checklocksignore func (w *Writer) StateSave(stateSinkObject state.Sink) { w.beforeSave() stateSinkObject.Save(0, &w.Queue) stateSinkObject.Save(1, &w.block) } func (w *Writer) afterLoad(context.Context) {} // +checklocksignore func (w *Writer) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &w.Queue) stateSourceObject.Load(1, &w.block) } func (m *Message) StateTypeName() string { return "pkg/sentry/kernel/mq.Message" } func (m *Message) StateFields() []string { return []string{ "msgEntry", "Text", "Size", "Priority", } } func (m *Message) beforeSave() {} // +checklocksignore func (m *Message) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.msgEntry) stateSinkObject.Save(1, &m.Text) stateSinkObject.Save(2, &m.Size) stateSinkObject.Save(3, &m.Priority) } func (m *Message) afterLoad(context.Context) {} // +checklocksignore func (m *Message) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.msgEntry) stateSourceObject.Load(1, &m.Text) stateSourceObject.Load(2, &m.Size) stateSourceObject.Load(3, &m.Priority) } func (s *Subscriber) StateTypeName() string { return "pkg/sentry/kernel/mq.Subscriber" } func (s *Subscriber) StateFields() []string { return []string{ "pid", } } func (s *Subscriber) beforeSave() {} // +checklocksignore func (s *Subscriber) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.pid) } func (s *Subscriber) afterLoad(context.Context) {} // +checklocksignore func (s *Subscriber) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.pid) } func init() { state.Register((*msgList)(nil)) state.Register((*msgEntry)(nil)) state.Register((*Registry)(nil)) state.Register((*Queue)(nil)) state.Register((*ReaderWriter)(nil)) state.Register((*Reader)(nil)) state.Register((*Writer)(nil)) state.Register((*Message)(nil)) state.Register((*Subscriber)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/msgqueue/000077500000000000000000000000001465435605700240635ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/msgqueue/message_list.go000066400000000000000000000116371465435605700271010ustar00rootroot00000000000000package msgqueue // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. 
An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type msgElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (msgElementMapper) linkerFor(elem *Message) *Message { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type msgList struct { head *Message tail *Message } // Reset resets list l to the empty state. func (l *msgList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *msgList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *msgList) Front() *Message { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *msgList) Back() *Message { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *msgList) Len() (count int) { for e := l.Front(); e != nil; e = (msgElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *msgList) PushFront(e *Message) { linker := msgElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { msgElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *msgList) PushFrontList(m *msgList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { msgElementMapper{}.linkerFor(l.head).SetPrev(m.tail) msgElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *msgList) PushBack(e *Message) { linker := msgElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { msgElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *msgList) PushBackList(m *msgList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { msgElementMapper{}.linkerFor(l.tail).SetNext(m.head) msgElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *msgList) InsertAfter(b, e *Message) { bLinker := msgElementMapper{}.linkerFor(b) eLinker := msgElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { msgElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *msgList) InsertBefore(a, e *Message) { aLinker := msgElementMapper{}.linkerFor(a) eLinker := msgElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { msgElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. 
// //go:nosplit func (l *msgList) Remove(e *Message) { linker := msgElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { msgElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { msgElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type msgEntry struct { next *Message prev *Message } // Next returns the entry that follows e in the list. // //go:nosplit func (e *msgEntry) Next() *Message { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *msgEntry) Prev() *Message { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *msgEntry) SetNext(elem *Message) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *msgEntry) SetPrev(elem *Message) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/msgqueue/msgqueue.go000066400000000000000000000421431465435605700262510ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package msgqueue implements System V message queues. package msgqueue import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) const ( // System-wide limit for maximum number of queues. maxQueues = linux.MSGMNI // Maximum size of a queue in bytes. maxQueueBytes = linux.MSGMNB // Maximum size of a message in bytes. maxMessageBytes = linux.MSGMAX ) // Registry contains a set of message queues that can be referenced using keys // or IDs. // // +stateify savable type Registry struct { // mu protects all the fields below. mu sync.Mutex `state:"nosave"` // reg defines basic fields and operations needed for all SysV registries. reg *ipc.Registry } // NewRegistry returns a new Registry ready to be used. func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ reg: ipc.NewRegistry(userNS), } } // Queue represents a SysV message queue, described by sysvipc(7). // // +stateify savable type Queue struct { // registry is the registry owning this queue. Immutable. registry *Registry // mu protects all the fields below. mu sync.Mutex `state:"nosave"` // dead is set to true when a queue is removed from the registry and should // not be used. Operations on the queue should check dead, and return // EIDRM if set to true. dead bool // obj defines basic fields that should be included in all SysV IPC objects. 
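// Permission checks (CheckPermissions) and ownership/ID queries on the queue
// are delegated to obj.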
obj *ipc.Object // senders holds a queue of blocked message senders. Senders are notified // when enough space is available in the queue to insert their message. senders waiter.Queue // receivers holds a queue of blocked receivers. Receivers are notified // when a new message is inserted into the queue and can be received. receivers waiter.Queue // messages is a list of sent messages. messages msgList // sendTime is the last time a msgsnd was performed. sendTime ktime.Time // receiveTime is the last time a msgrcv was performed. receiveTime ktime.Time // changeTime is the last time the queue was modified using msgctl. changeTime ktime.Time // byteCount is the current number of message bytes in the queue. byteCount uint64 // messageCount is the current number of messages in the queue. messageCount uint64 // maxBytes is the maximum allowed number of bytes in the queue, and is also // used as a limit for the number of total possible messages. maxBytes uint64 // sendPID is the PID of the process that performed the last msgsnd. sendPID int32 // receivePID is the PID of the process that performed the last msgrcv. receivePID int32 } // Message represents a message exchanged through a Queue via msgsnd(2) and // msgrcv(2). // // +stateify savable type Message struct { msgEntry // Type is an integer representing the type of the sent message. Type int64 // Text is an untyped block of memory. Text []byte // Size is the size of Text. Size uint64 } func (m *Message) makeCopy() *Message { new := &Message{ Type: m.Type, Size: m.Size, } new.Text = make([]byte, len(m.Text)) copy(new.Text, m.Text) return new } // Blocker is used for blocking Queue.Send, and Queue.Receive calls that serves // as an abstracted version of kernel.Task. kernel.Task is not directly used to // prevent circular dependencies. type Blocker interface { Block(C <-chan struct{}) error } // FindOrCreate creates a new message queue or returns an existing one. See // msgget(2). func (r *Registry) FindOrCreate(ctx context.Context, key ipc.Key, mode linux.FileMode, private, create, exclusive bool) (*Queue, error) { r.mu.Lock() defer r.mu.Unlock() if !private { queue, err := r.reg.Find(ctx, key, mode, create, exclusive) if err != nil { return nil, err } if queue != nil { return queue.(*Queue), nil } } // Check system-wide limits. if r.reg.ObjectCount() >= maxQueues { return nil, linuxerr.ENOSPC } return r.newQueueLocked(ctx, key, auth.CredentialsFromContext(ctx), mode) } // newQueueLocked creates a new queue using the given fields. An error is // returned if there're no more available identifiers. // // Precondition: r.mu must be held. func (r *Registry) newQueueLocked(ctx context.Context, key ipc.Key, creds *auth.Credentials, mode linux.FileMode) (*Queue, error) { q := &Queue{ registry: r, obj: ipc.NewObject(r.reg.UserNS, key, creds, creds, mode), sendTime: ktime.ZeroTime, receiveTime: ktime.ZeroTime, changeTime: ktime.NowFromContext(ctx), maxBytes: maxQueueBytes, } err := r.reg.Register(q) if err != nil { return nil, err } return q, nil } // Remove removes the queue with specified ID. All waiters (readers and // writers) and writers will be awakened and fail. Remove will return an error // if the ID is invalid, or the user doesn't have privileges. func (r *Registry) Remove(id ipc.ID, creds *auth.Credentials) error { r.mu.Lock() defer r.mu.Unlock() r.reg.Remove(id, creds) return nil } // FindByID returns the queue with the specified ID and an error if the ID // doesn't exist. 
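// A minimal lookup sketch (assuming `registry` is the owning *Registry and an
// ipc.ID `id` and context.Context `ctx` are already in scope; error handling
// is abbreviated):
//
//	q, err := registry.FindByID(id)
//	if err != nil {
//		return err // EINVAL: no queue with this ID.
//	}
//	ds, err := q.Stat(ctx) // then use ds to fill the caller's msqid_ds.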
func (r *Registry) FindByID(id ipc.ID) (*Queue, error) { r.mu.Lock() defer r.mu.Unlock() mech := r.reg.FindByID(id) if mech == nil { return nil, linuxerr.EINVAL } return mech.(*Queue), nil } // IPCInfo reports global parameters for message queues. See msgctl(IPC_INFO). func (r *Registry) IPCInfo(ctx context.Context) *linux.MsgInfo { return &linux.MsgInfo{ MsgPool: linux.MSGPOOL, MsgMap: linux.MSGMAP, MsgMax: linux.MSGMAX, MsgMnb: linux.MSGMNB, MsgMni: linux.MSGMNI, MsgSsz: linux.MSGSSZ, MsgTql: linux.MSGTQL, MsgSeg: linux.MSGSEG, } } // MsgInfo reports global parameters for message queues. See msgctl(MSG_INFO). func (r *Registry) MsgInfo(ctx context.Context) *linux.MsgInfo { r.mu.Lock() defer r.mu.Unlock() var messages, bytes uint64 r.reg.ForAllObjects( func(o ipc.Mechanism) { q := o.(*Queue) q.mu.Lock() messages += q.messageCount bytes += q.byteCount q.mu.Unlock() }, ) return &linux.MsgInfo{ MsgPool: int32(r.reg.ObjectCount()), MsgMap: int32(messages), MsgTql: int32(bytes), MsgMax: linux.MSGMAX, MsgMnb: linux.MSGMNB, MsgMni: linux.MSGMNI, MsgSsz: linux.MSGSSZ, MsgSeg: linux.MSGSEG, } } // Send appends a message to the message queue, and returns an error if sending // fails. See msgsnd(2). func (q *Queue) Send(ctx context.Context, m Message, b Blocker, wait bool, pid int32) error { // Try to perform a non-blocking send using queue.append. If EWOULDBLOCK // is returned, start the blocking procedure. Otherwise, return normally. creds := auth.CredentialsFromContext(ctx) // Fast path: first attempt a non-blocking push. if err := q.push(ctx, m, creds, pid); err != linuxerr.EWOULDBLOCK { return err } if !wait { return linuxerr.EAGAIN } // Slow path: at this point, the queue was found to be full, and we were // asked to block. e, ch := waiter.NewChannelEntry(waiter.EventOut) q.senders.EventRegister(&e) defer q.senders.EventUnregister(&e) // Note: we need to check again before blocking the first time since space // may have become available. for { if err := q.push(ctx, m, creds, pid); err != linuxerr.EWOULDBLOCK { return err } if err := b.Block(ch); err != nil { return err } } } // push appends a message to the queue's message list and notifies waiting // receivers that a message has been inserted. It returns an error if adding // the message would cause the queue to exceed its maximum capacity, which can // be used as a signal to block the task. Other errors should be returned as is. func (q *Queue) push(ctx context.Context, m Message, creds *auth.Credentials, pid int32) error { if m.Type <= 0 { return linuxerr.EINVAL } q.mu.Lock() defer q.mu.Unlock() if !q.obj.CheckPermissions(creds, vfs.MayWrite) { // The calling process does not have write permission on the message // queue, and does not have the CAP_IPC_OWNER capability in the user // namespace that governs its IPC namespace. return linuxerr.EACCES } // Queue was removed while the process was waiting. if q.dead { return linuxerr.EIDRM } // Check if sufficient space is available (the queue isn't full.) From // the man pages: // // "A message queue is considered to be full if either of the following // conditions is true: // // • Adding a new message to the queue would cause the total number // of bytes in the queue to exceed the queue's maximum size (the // msg_qbytes field). // // • Adding another message to the queue would cause the total // number of messages in the queue to exceed the queue's maximum // size (the msg_qbytes field). 
This check is necessary to // prevent an unlimited number of zero-length messages being // placed on the queue. Although such messages contain no data, // they nevertheless consume (locked) kernel memory." // // The msg_qbytes field in our implementation is q.maxBytes. if m.Size+q.byteCount > q.maxBytes || q.messageCount+1 > q.maxBytes { return linuxerr.EWOULDBLOCK } // Copy the message into the queue. q.messages.PushBack(&m) q.byteCount += m.Size q.messageCount++ q.sendPID = pid q.sendTime = ktime.NowFromContext(ctx) // Notify receivers about the new message. q.receivers.Notify(waiter.EventIn) return nil } // Receive removes a message from the queue and returns it. See msgrcv(2). func (q *Queue) Receive(ctx context.Context, b Blocker, mType int64, maxSize int64, wait, truncate, except bool, pid int32) (*Message, error) { if maxSize < 0 || maxSize > maxMessageBytes { return nil, linuxerr.EINVAL } max := uint64(maxSize) creds := auth.CredentialsFromContext(ctx) // Fast path: first attempt a non-blocking pop. if msg, err := q.pop(ctx, creds, mType, max, truncate, except, pid); err != linuxerr.EWOULDBLOCK { return msg, err } if !wait { return nil, linuxerr.ENOMSG } // Slow path: at this point, the queue was found to be empty, and we were // asked to block. e, ch := waiter.NewChannelEntry(waiter.EventIn) q.receivers.EventRegister(&e) defer q.receivers.EventUnregister(&e) // Note: we need to check again before blocking the first time since a // message may have become available. for { if msg, err := q.pop(ctx, creds, mType, max, truncate, except, pid); err != linuxerr.EWOULDBLOCK { return msg, err } if err := b.Block(ch); err != nil { return nil, err } } } // pop pops the first message from the queue that matches the given type. It // returns an error for all the cases specified in msgrcv(2). If the queue is // empty or no message of the specified type is available, a EWOULDBLOCK error // is returned, which can then be used as a signal to block the process or fail. func (q *Queue) pop(ctx context.Context, creds *auth.Credentials, mType int64, maxSize uint64, truncate, except bool, pid int32) (*Message, error) { q.mu.Lock() defer q.mu.Unlock() if !q.obj.CheckPermissions(creds, vfs.MayRead) { // The calling process does not have read permission on the message // queue, and does not have the CAP_IPC_OWNER capability in the user // namespace that governs its IPC namespace. return nil, linuxerr.EACCES } // Queue was removed while the process was waiting. if q.dead { return nil, linuxerr.EIDRM } if q.messages.Empty() { return nil, linuxerr.EWOULDBLOCK } // Get a message from the queue. var msg *Message switch { case mType == 0: msg = q.messages.Front() case mType > 0: msg = q.msgOfType(mType, except) case mType < 0: msg = q.msgOfTypeLessThan(-1 * mType) } // If no message exists, return a blocking signal. if msg == nil { return nil, linuxerr.EWOULDBLOCK } // Check message's size is acceptable. if maxSize < msg.Size { if !truncate { return nil, linuxerr.E2BIG } msg.Size = maxSize msg.Text = msg.Text[:maxSize+1] } q.messages.Remove(msg) q.byteCount -= msg.Size q.messageCount-- q.receivePID = pid q.receiveTime = ktime.NowFromContext(ctx) // Notify senders about available space. q.senders.Notify(waiter.EventOut) return msg, nil } // Copy copies a message from the queue without deleting it. If no message // exists, an error is returned. See msgrcv(MSG_COPY). 
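// Note that with MSG_COPY, mType is interpreted as a zero-based position in
// the queue rather than as a message type: msgAtIndex simply walks mType
// entries from the front of the message list.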
func (q *Queue) Copy(mType int64) (*Message, error) { q.mu.Lock() defer q.mu.Unlock() if mType < 0 || q.messages.Empty() { return nil, linuxerr.ENOMSG } msg := q.msgAtIndex(mType) if msg == nil { return nil, linuxerr.ENOMSG } return msg.makeCopy(), nil } // msgOfType returns the first message with the specified type, nil if no // message is found. If except is true, the first message of a type not equal // to mType will be returned. // // Precondition: caller must hold q.mu. func (q *Queue) msgOfType(mType int64, except bool) *Message { if except { for msg := q.messages.Front(); msg != nil; msg = msg.Next() { if msg.Type != mType { return msg } } return nil } for msg := q.messages.Front(); msg != nil; msg = msg.Next() { if msg.Type == mType { return msg } } return nil } // msgOfTypeLessThan return the first message with the lowest type less // than or equal to mType, nil if no such message exists. // // Precondition: caller must hold q.mu. func (q *Queue) msgOfTypeLessThan(mType int64) (m *Message) { min := mType for msg := q.messages.Front(); msg != nil; msg = msg.Next() { if msg.Type <= mType && msg.Type < min { m = msg min = msg.Type } } return m } // msgAtIndex returns a pointer to a message at given index, nil if non exits. // // Precondition: caller must hold q.mu. func (q *Queue) msgAtIndex(mType int64) *Message { msg := q.messages.Front() for ; mType != 0 && msg != nil; mType-- { msg = msg.Next() } return msg } // Set modifies some values of the queue. See msgctl(IPC_SET). func (q *Queue) Set(ctx context.Context, ds *linux.MsqidDS) error { q.mu.Lock() defer q.mu.Unlock() creds := auth.CredentialsFromContext(ctx) if ds.MsgQbytes > maxQueueBytes && !creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, q.obj.UserNS) { // "An attempt (IPC_SET) was made to increase msg_qbytes beyond the // system parameter MSGMNB, but the caller is not privileged (Linux: // does not have the CAP_SYS_RESOURCE capability)." return linuxerr.EPERM } if err := q.obj.Set(ctx, &ds.MsgPerm); err != nil { return err } q.maxBytes = ds.MsgQbytes q.changeTime = ktime.NowFromContext(ctx) return nil } // Stat returns a MsqidDS object filled with information about the queue. See // msgctl(IPC_STAT) and msgctl(MSG_STAT). func (q *Queue) Stat(ctx context.Context) (*linux.MsqidDS, error) { return q.stat(ctx, vfs.MayRead) } // StatAny is similar to Queue.Stat, but doesn't require read permission. See // msgctl(MSG_STAT_ANY). func (q *Queue) StatAny(ctx context.Context) (*linux.MsqidDS, error) { return q.stat(ctx, 0) } // stat returns a MsqidDS object filled with information about the queue. An // error is returned if the user doesn't have the specified permissions. func (q *Queue) stat(ctx context.Context, ats vfs.AccessTypes) (*linux.MsqidDS, error) { q.mu.Lock() defer q.mu.Unlock() creds := auth.CredentialsFromContext(ctx) if !q.obj.CheckPermissions(creds, ats) { // "The caller must have read permission on the message queue." return nil, linuxerr.EACCES } return &linux.MsqidDS{ MsgPerm: linux.IPCPerm{ Key: uint32(q.obj.Key), UID: uint32(creds.UserNamespace.MapFromKUID(q.obj.OwnerUID)), GID: uint32(creds.UserNamespace.MapFromKGID(q.obj.OwnerGID)), CUID: uint32(creds.UserNamespace.MapFromKUID(q.obj.CreatorUID)), CGID: uint32(creds.UserNamespace.MapFromKGID(q.obj.CreatorGID)), Mode: uint16(q.obj.Mode), Seq: 0, // IPC sequences not supported. 
}, MsgStime: q.sendTime.TimeT(), MsgRtime: q.receiveTime.TimeT(), MsgCtime: q.changeTime.TimeT(), MsgCbytes: q.byteCount, MsgQnum: q.messageCount, MsgQbytes: q.maxBytes, MsgLspid: q.sendPID, MsgLrpid: q.receivePID, }, nil } // Lock implements ipc.Mechanism.Lock. func (q *Queue) Lock() { q.mu.Lock() } // Unlock implements ipc.mechanism.Unlock. // // +checklocksignore func (q *Queue) Unlock() { q.mu.Unlock() } // Object implements ipc.Mechanism.Object. func (q *Queue) Object() *ipc.Object { return q.obj } // Destroy implements ipc.Mechanism.Destroy. func (q *Queue) Destroy() { q.dead = true // Notify waiters. Senders and receivers will try to run, and return an // error (EIDRM). Waiters should remove themselves from the queue after // waking up. q.senders.Notify(waiter.EventOut) q.receivers.Notify(waiter.EventIn) } // ID returns queue's ID. func (q *Queue) ID() ipc.ID { return q.obj.ID } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/msgqueue/msgqueue_state_autogen.go000066400000000000000000000107211465435605700311700ustar00rootroot00000000000000// automatically generated by stateify. package msgqueue import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (l *msgList) StateTypeName() string { return "pkg/sentry/kernel/msgqueue.msgList" } func (l *msgList) StateFields() []string { return []string{ "head", "tail", } } func (l *msgList) beforeSave() {} // +checklocksignore func (l *msgList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *msgList) afterLoad(context.Context) {} // +checklocksignore func (l *msgList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *msgEntry) StateTypeName() string { return "pkg/sentry/kernel/msgqueue.msgEntry" } func (e *msgEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *msgEntry) beforeSave() {} // +checklocksignore func (e *msgEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *msgEntry) afterLoad(context.Context) {} // +checklocksignore func (e *msgEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (r *Registry) StateTypeName() string { return "pkg/sentry/kernel/msgqueue.Registry" } func (r *Registry) StateFields() []string { return []string{ "reg", } } func (r *Registry) beforeSave() {} // +checklocksignore func (r *Registry) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.reg) } func (r *Registry) afterLoad(context.Context) {} // +checklocksignore func (r *Registry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.reg) } func (q *Queue) StateTypeName() string { return "pkg/sentry/kernel/msgqueue.Queue" } func (q *Queue) StateFields() []string { return []string{ "registry", "dead", "obj", "senders", "receivers", "messages", "sendTime", "receiveTime", "changeTime", "byteCount", "messageCount", "maxBytes", "sendPID", "receivePID", } } func (q *Queue) beforeSave() {} // +checklocksignore func (q *Queue) StateSave(stateSinkObject state.Sink) { q.beforeSave() stateSinkObject.Save(0, &q.registry) stateSinkObject.Save(1, &q.dead) stateSinkObject.Save(2, &q.obj) stateSinkObject.Save(3, &q.senders) stateSinkObject.Save(4, &q.receivers) stateSinkObject.Save(5, &q.messages) stateSinkObject.Save(6, 
&q.sendTime) stateSinkObject.Save(7, &q.receiveTime) stateSinkObject.Save(8, &q.changeTime) stateSinkObject.Save(9, &q.byteCount) stateSinkObject.Save(10, &q.messageCount) stateSinkObject.Save(11, &q.maxBytes) stateSinkObject.Save(12, &q.sendPID) stateSinkObject.Save(13, &q.receivePID) } func (q *Queue) afterLoad(context.Context) {} // +checklocksignore func (q *Queue) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &q.registry) stateSourceObject.Load(1, &q.dead) stateSourceObject.Load(2, &q.obj) stateSourceObject.Load(3, &q.senders) stateSourceObject.Load(4, &q.receivers) stateSourceObject.Load(5, &q.messages) stateSourceObject.Load(6, &q.sendTime) stateSourceObject.Load(7, &q.receiveTime) stateSourceObject.Load(8, &q.changeTime) stateSourceObject.Load(9, &q.byteCount) stateSourceObject.Load(10, &q.messageCount) stateSourceObject.Load(11, &q.maxBytes) stateSourceObject.Load(12, &q.sendPID) stateSourceObject.Load(13, &q.receivePID) } func (m *Message) StateTypeName() string { return "pkg/sentry/kernel/msgqueue.Message" } func (m *Message) StateFields() []string { return []string{ "msgEntry", "Type", "Text", "Size", } } func (m *Message) beforeSave() {} // +checklocksignore func (m *Message) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.msgEntry) stateSinkObject.Save(1, &m.Type) stateSinkObject.Save(2, &m.Text) stateSinkObject.Save(3, &m.Size) } func (m *Message) afterLoad(context.Context) {} // +checklocksignore func (m *Message) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.msgEntry) stateSourceObject.Load(1, &m.Type) stateSourceObject.Load(2, &m.Text) stateSourceObject.Load(3, &m.Size) } func init() { state.Register((*msgList)(nil)) state.Register((*msgEntry)(nil)) state.Register((*Registry)(nil)) state.Register((*Queue)(nil)) state.Register((*Message)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pending_signals.go000066400000000000000000000113731465435605700257300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" ) const ( // stdSignalCap is the maximum number of instances of a given standard // signal that may be pending. ("[If] multiple instances of a standard // signal are delivered while that signal is currently blocked, then only // one instance is queued.") - signal(7) stdSignalCap = 1 // rtSignalCap is the maximum number of instances of a given realtime // signal that may be pending. // // TODO(igudger): In Linux, the minimum signal queue size is // RLIMIT_SIGPENDING, which is by default max_threads/2. rtSignalCap = 32 ) // pendingSignals holds a collection of pending signals. The zero value of // pendingSignals is a valid empty collection. pendingSignals is thread-unsafe; // users must provide synchronization. 
// // +stateify savable type pendingSignals struct { // signals contains all pending signals. // // Note that signals is zero-indexed, but signal 1 is the first valid // signal, so signals[0] contains signals with signo 1 etc. This offset is // usually handled by using Signal.index(). signals [linux.SignalMaximum]pendingSignalQueue `state:".([]savedPendingSignal)"` // Bit i of pendingSet is set iff there is at least one signal with signo // i+1 pending. pendingSet linux.SignalSet `state:"manual"` } // pendingSignalQueue holds a pendingSignalList for a single signal number. // // +stateify savable type pendingSignalQueue struct { pendingSignalList length int } // +stateify savable type pendingSignal struct { // pendingSignalEntry links into a pendingSignalList. pendingSignalEntry *linux.SignalInfo // If timer is not nil, it is the IntervalTimer which sent this signal. timer *IntervalTimer } // enqueue enqueues the given signal. enqueue returns true on success and false // on failure (if the given signal's queue is full). // // Preconditions: info represents a valid signal. func (p *pendingSignals) enqueue(info *linux.SignalInfo, timer *IntervalTimer) bool { sig := linux.Signal(info.Signo) q := &p.signals[sig.Index()] if sig.IsStandard() { if q.length >= stdSignalCap { return false } } else if q.length >= rtSignalCap { return false } q.pendingSignalList.PushBack(&pendingSignal{SignalInfo: info, timer: timer}) q.length++ p.pendingSet |= linux.SignalSetOf(sig) return true } // dequeue dequeues and returns any pending signal not masked by mask. If no // unmasked signals are pending, dequeue returns nil. func (p *pendingSignals) dequeue(mask linux.SignalSet) *linux.SignalInfo { // "Real-time signals are delivered in a guaranteed order. Multiple // real-time signals of the same type are delivered in the order they were // sent. If different real-time signals are sent to a process, they are // delivered starting with the lowest-numbered signal. (I.e., low-numbered // signals have highest priority.) By contrast, if multiple standard // signals are pending for a process, the order in which they are delivered // is unspecified. If both standard and real-time signals are pending for a // process, POSIX leaves it unspecified which is delivered first. Linux, // like many other implementations, gives priority to standard signals in // this case." - signal(7) lowestPendingUnblockedBit := bits.TrailingZeros64(uint64(p.pendingSet &^ mask)) if lowestPendingUnblockedBit >= linux.SignalMaximum { return nil } return p.dequeueSpecific(linux.Signal(lowestPendingUnblockedBit + 1)) } func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *linux.SignalInfo { q := &p.signals[sig.Index()] ps := q.pendingSignalList.Front() if ps == nil { return nil } q.pendingSignalList.Remove(ps) q.length-- if q.length == 0 { p.pendingSet &^= linux.SignalSetOf(sig) } if ps.timer != nil { ps.timer.updateDequeuedSignalLocked(ps.SignalInfo) } return ps.SignalInfo } // discardSpecific causes all pending signals with number sig to be discarded. 
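// Interval timers that queued any of the discarded signals are notified via
// signalRejectedLocked before the list is reset.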
func (p *pendingSignals) discardSpecific(sig linux.Signal) { q := &p.signals[sig.Index()] for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() { if ps.timer != nil { ps.timer.signalRejectedLocked() } } q.pendingSignalList.Reset() q.length = 0 p.pendingSet &^= linux.SignalSetOf(sig) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pending_signals_list.go000066400000000000000000000126231465435605700267620ustar00rootroot00000000000000package kernel // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type pendingSignalElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (pendingSignalElementMapper) linkerFor(elem *pendingSignal) *pendingSignal { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type pendingSignalList struct { head *pendingSignal tail *pendingSignal } // Reset resets list l to the empty state. func (l *pendingSignalList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *pendingSignalList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *pendingSignalList) Front() *pendingSignal { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *pendingSignalList) Back() *pendingSignal { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *pendingSignalList) Len() (count int) { for e := l.Front(); e != nil; e = (pendingSignalElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *pendingSignalList) PushFront(e *pendingSignal) { linker := pendingSignalElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { pendingSignalElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *pendingSignalList) PushFrontList(m *pendingSignalList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { pendingSignalElementMapper{}.linkerFor(l.head).SetPrev(m.tail) pendingSignalElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *pendingSignalList) PushBack(e *pendingSignal) { linker := pendingSignalElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { pendingSignalElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. 
// //go:nosplit func (l *pendingSignalList) PushBackList(m *pendingSignalList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { pendingSignalElementMapper{}.linkerFor(l.tail).SetNext(m.head) pendingSignalElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *pendingSignalList) InsertAfter(b, e *pendingSignal) { bLinker := pendingSignalElementMapper{}.linkerFor(b) eLinker := pendingSignalElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { pendingSignalElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *pendingSignalList) InsertBefore(a, e *pendingSignal) { aLinker := pendingSignalElementMapper{}.linkerFor(a) eLinker := pendingSignalElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { pendingSignalElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *pendingSignalList) Remove(e *pendingSignal) { linker := pendingSignalElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { pendingSignalElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { pendingSignalElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type pendingSignalEntry struct { next *pendingSignal prev *pendingSignal } // Next returns the entry that follows e in the list. // //go:nosplit func (e *pendingSignalEntry) Next() *pendingSignal { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *pendingSignalEntry) Prev() *pendingSignal { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *pendingSignalEntry) SetNext(elem *pendingSignal) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *pendingSignalEntry) SetPrev(elem *pendingSignal) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pending_signals_state.go000066400000000000000000000024651465435605700271320ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "context" "gvisor.dev/gvisor/pkg/abi/linux" ) // +stateify savable type savedPendingSignal struct { si *linux.SignalInfo timer *IntervalTimer } // saveSignals is invoked by stateify. 
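// It flattens the per-signal-number lists into a single []savedPendingSignal,
// since the intrusive pendingSignalList cannot be serialized directly;
// loadSignals re-enqueues the saved entries on restore.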
func (p *pendingSignals) saveSignals() []savedPendingSignal { var pending []savedPendingSignal for _, q := range p.signals { for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() { pending = append(pending, savedPendingSignal{ si: ps.SignalInfo, timer: ps.timer, }) } } return pending } // loadSignals is invoked by stateify. func (p *pendingSignals) loadSignals(_ context.Context, pending []savedPendingSignal) { for _, sps := range pending { p.enqueue(sps.si, sps.timer) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pipe/000077500000000000000000000000001465435605700231655ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pipe/inode_mutex.go000066400000000000000000000031131465435605700260320ustar00rootroot00000000000000package pipe import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type inodeMutex struct { mu sync.Mutex } var inodeprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var inodelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type inodelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *inodeMutex) Lock() { locking.AddGLock(inodeprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *inodeMutex) NestedLock(i inodelockNameIndex) { locking.AddGLock(inodeprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *inodeMutex) Unlock() { locking.DelGLock(inodeprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *inodeMutex) NestedUnlock(i inodelockNameIndex) { locking.DelGLock(inodeprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func inodeinitLockNames() {} func init() { inodeinitLockNames() inodeprefixIndex = locking.NewMutexClass(reflect.TypeOf(inodeMutex{}), inodelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pipe/pipe.go000066400000000000000000000317301465435605700244550ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package pipe provides a pipe implementation. package pipe import ( "fmt" "io" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/waiter" ) const ( // MinimumPipeSize is a hard limit of the minimum size of a pipe. // It corresponds to fs/pipe.c:pipe_min_size. 
MinimumPipeSize = hostarch.PageSize // MaximumPipeSize is a hard limit on the maximum size of a pipe. // It corresponds to fs/pipe.c:pipe_max_size. MaximumPipeSize = 1048576 // DefaultPipeSize is the system-wide default size of a pipe in bytes. // It corresponds to pipe_fs_i.h:PIPE_DEF_BUFFERS. DefaultPipeSize = 16 * hostarch.PageSize // atomicIOBytes is the maximum number of bytes that the pipe will // guarantee atomic reads or writes atomically. // It corresponds to limits.h:PIPE_BUF. atomicIOBytes = 4096 ) // waitReaders is a wrapper around Pipe. // // This is used for ctx.Block operations that require the synchronization of // readers and writers, along with the careful grabbing and releasing of locks. type waitReaders Pipe // Readiness implements waiter.Waitable.Readiness. func (wq *waitReaders) Readiness(mask waiter.EventMask) waiter.EventMask { return ((*Pipe)(wq)).rwReadiness() & mask } // EventRegister implements waiter.Waitable.EventRegister. func (wq *waitReaders) EventRegister(e *waiter.Entry) error { ((*Pipe)(wq)).queue.EventRegister(e) // Notify synchronously. if ((*Pipe)(wq)).HasReaders() { e.NotifyEvent(waiter.EventInternal) } return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (wq *waitReaders) EventUnregister(e *waiter.Entry) { ((*Pipe)(wq)).queue.EventUnregister(e) } // waitWriters is a wrapper around Pipe. // // This is used for ctx.Block operations that require the synchronization of // readers and writers, along with the careful grabbing and releasing of locks. type waitWriters Pipe // Readiness implements waiter.Waitable.Readiness. func (wq *waitWriters) Readiness(mask waiter.EventMask) waiter.EventMask { return ((*Pipe)(wq)).rwReadiness() & mask } // EventRegister implements waiter.Waitable.EventRegister. func (wq *waitWriters) EventRegister(e *waiter.Entry) error { ((*Pipe)(wq)).queue.EventRegister(e) // Notify synchronously. if ((*Pipe)(wq)).HasWriters() { e.NotifyEvent(waiter.EventInternal) } return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (wq *waitWriters) EventUnregister(e *waiter.Entry) { ((*Pipe)(wq)).queue.EventUnregister(e) } // Pipe is an encapsulation of a platform-independent pipe. // It manages a buffered byte queue shared between a reader/writer // pair. // // +stateify savable type Pipe struct { // queue is the waiter queue. queue waiter.Queue // isNamed indicates whether this is a named pipe. // // This value is immutable. isNamed bool // The number of active readers for this pipe. readers atomicbitops.Int32 // The total number of readers for this pipe. totalReaders atomicbitops.Int32 // The number of active writers for this pipe. writers atomicbitops.Int32 // The total number of writers for this pipe. totalWriters atomicbitops.Int32 // mu protects all pipe internal state below. mu pipeMutex `state:"nosave"` // buf holds the pipe's data. buf is a circular buffer; the first valid // byte in buf is at offset off, and the pipe contains size valid bytes. // bufBlocks contains two identical safemem.Blocks representing buf; this // avoids needing to heap-allocate a new safemem.Block slice when buf is // resized. bufBlockSeq is a safemem.BlockSeq representing bufBlocks. // // These fields are protected by mu. buf []byte bufBlocks [2]safemem.Block `state:"nosave"` bufBlockSeq safemem.BlockSeq `state:"nosave"` off int64 size int64 // max is the maximum size of the pipe in bytes. When this max has been // reached, writers will get EWOULDBLOCK. // // This is protected by mu. 
max int64 // hadWriter indicates if this pipe ever had a writer. Note that this // does not necessarily indicate there is *currently* a writer, just // that there has been a writer at some point since the pipe was // created. // // This is protected by mu. hadWriter bool } // NewPipe initializes and returns a pipe. // // N.B. The size will be bounded. func NewPipe(isNamed bool, sizeBytes int64) *Pipe { var p Pipe initPipe(&p, isNamed, sizeBytes) return &p } func initPipe(pipe *Pipe, isNamed bool, sizeBytes int64) { if sizeBytes < MinimumPipeSize { sizeBytes = MinimumPipeSize } if sizeBytes > MaximumPipeSize { sizeBytes = MaximumPipeSize } pipe.isNamed = isNamed pipe.max = sizeBytes } // peekLocked passes the first count bytes in the pipe, starting at offset off, // to f and returns its result. If fewer than count bytes are available, the // safemem.BlockSeq passed to f will be less than count bytes in length. // // peekLocked does not mutate the pipe; if the read consumes bytes from the // pipe, then the caller is responsible for calling p.consumeLocked() and // p.queue.Notify(waiter.WritableEvents). (The latter must be called with p.mu // unlocked.) // // Preconditions: // - p.mu must be locked. // - This pipe must have readers. // - off <= p.size. func (p *Pipe) peekLocked(off, count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { // Don't block for a zero-length read even if the pipe is empty. if count == 0 { return 0, nil } // Limit the amount of data read to the amount of data in the pipe. if rem := p.size - off; count > rem { if rem == 0 { if !p.HasWriters() { return 0, io.EOF } return 0, linuxerr.ErrWouldBlock } count = rem } // Prepare the view of the data to be read. pipeOff := p.off + off if max := int64(len(p.buf)); pipeOff >= max { pipeOff -= max } bs := p.bufBlockSeq.DropFirst64(uint64(pipeOff)).TakeFirst64(uint64(count)) // Perform the read. done, err := f(bs) return int64(done), err } // consumeLocked consumes the first n bytes in the pipe, such that they will no // longer be visible to future reads. // // Preconditions: // - p.mu must be locked. // - The pipe must contain at least n bytes. func (p *Pipe) consumeLocked(n int64) { p.off += n if max := int64(len(p.buf)); p.off >= max { p.off -= max } p.size -= n } // writeLocked passes a safemem.BlockSeq representing the first count bytes of // unused space in the pipe to f and returns the result. If fewer than count // bytes are free, the safemem.BlockSeq passed to f will be less than count // bytes in length. If the pipe is full or otherwise cannot accommodate a write // of any number of bytes up to count, writeLocked returns ErrWouldBlock // without calling f. // // Unlike peekLocked, writeLocked assumes that f returns the number of bytes // written to the pipe, and increases the number of bytes stored in the pipe // accordingly. Callers are still responsible for calling // p.queue.Notify(waiter.ReadableEvents) with p.mu unlocked. // // Preconditions: // - p.mu must be locked. func (p *Pipe) writeLocked(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { // Can't write to a pipe with no readers. if !p.HasReaders() { return 0, unix.EPIPE } avail := p.max - p.size if avail == 0 { return 0, linuxerr.ErrWouldBlock } short := false if count > avail { // POSIX requires that a write smaller than atomicIOBytes // (PIPE_BUF) be atomic, but requires no atomicity for writes // larger than this. 
if count <= atomicIOBytes { return 0, linuxerr.ErrWouldBlock } count = avail short = true } // Ensure that the buffer is big enough. if newLen, oldCap := p.size+count, int64(len(p.buf)); newLen > oldCap { // Allocate a new buffer. newCap := oldCap * 2 if oldCap == 0 { newCap = 8 // arbitrary; sending individual integers across pipes is relatively common } for newLen > newCap { newCap *= 2 } if newCap > p.max { newCap = p.max } newBuf := make([]byte, newCap) // Copy the old buffer's contents to the beginning of the new one. safemem.CopySeq( safemem.BlockSeqOf(safemem.BlockFromSafeSlice(newBuf)), p.bufBlockSeq.DropFirst64(uint64(p.off)).TakeFirst64(uint64(p.size))) // Switch to the new buffer. p.buf = newBuf p.bufBlocks[0] = safemem.BlockFromSafeSlice(newBuf) p.bufBlocks[1] = p.bufBlocks[0] p.bufBlockSeq = safemem.BlockSeqFromSlice(p.bufBlocks[:]) p.off = 0 } // Prepare the view of the space to be written. woff := p.off + p.size if woff >= int64(len(p.buf)) { woff -= int64(len(p.buf)) } bs := p.bufBlockSeq.DropFirst64(uint64(woff)).TakeFirst64(uint64(count)) // Perform the write. doneU64, err := f(bs) done := int64(doneU64) p.size += done if done < count || err != nil { return done, err } // If we shortened the write, adjust the returned error appropriately. if short { return done, linuxerr.ErrWouldBlock } return done, nil } // rOpen signals a new reader of the pipe. func (p *Pipe) rOpen() { p.readers.Add(1) p.totalReaders.Add(1) // Notify for blocking openers. p.queue.Notify(waiter.EventInternal) } // wOpen signals a new writer of the pipe. func (p *Pipe) wOpen() { p.mu.Lock() p.hadWriter = true p.writers.Add(1) p.totalWriters.Add(1) p.mu.Unlock() // Notify for blocking openers. p.queue.Notify(waiter.EventInternal) } // rClose signals that a reader has closed their end of the pipe. func (p *Pipe) rClose() { if newReaders := p.readers.Add(-1); newReaders < 0 { panic(fmt.Sprintf("Refcounting bug, pipe has negative readers: %v", newReaders)) } } // wClose signals that a writer has closed their end of the pipe. func (p *Pipe) wClose() { if newWriters := p.writers.Add(-1); newWriters < 0 { panic(fmt.Sprintf("Refcounting bug, pipe has negative writers: %v.", newWriters)) } } // HasReaders returns whether the pipe has any active readers. func (p *Pipe) HasReaders() bool { return p.readers.Load() > 0 } // HasWriters returns whether the pipe has any active writers. func (p *Pipe) HasWriters() bool { return p.writers.Load() > 0 } // rReadinessLocked calculates the read readiness. // // Precondition: mu must be held. func (p *Pipe) rReadinessLocked() waiter.EventMask { ready := waiter.EventMask(0) if p.HasReaders() && p.size != 0 { ready |= waiter.ReadableEvents } if !p.HasWriters() && p.hadWriter { // POLLHUP must be suppressed until the pipe has had at least one writer // at some point. Otherwise a reader thread may poll and immediately get // a POLLHUP before the writer ever opens the pipe, which the reader may // interpret as the writer opening then closing the pipe. ready |= waiter.EventHUp } return ready } // rReadiness returns a mask that states whether the read end of the pipe is // ready for reading. func (p *Pipe) rReadiness() waiter.EventMask { p.mu.Lock() defer p.mu.Unlock() return p.rReadinessLocked() } // wReadinessLocked calculates the write readiness. // // Precondition: mu must be held. 
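// WritableEvents is reported while the pipe has writers and free space;
// EventErr is reported once all readers are gone (the case in which
// writeLocked returns EPIPE).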
func (p *Pipe) wReadinessLocked() waiter.EventMask { ready := waiter.EventMask(0) if p.HasWriters() && p.size < p.max { ready |= waiter.WritableEvents } if !p.HasReaders() { ready |= waiter.EventErr } return ready } // wReadiness returns a mask that states whether the write end of the pipe // is ready for writing. func (p *Pipe) wReadiness() waiter.EventMask { p.mu.Lock() defer p.mu.Unlock() return p.wReadinessLocked() } // rwReadiness returns a mask that states whether a read-write handle to the // pipe is ready for IO. func (p *Pipe) rwReadiness() waiter.EventMask { p.mu.Lock() defer p.mu.Unlock() return p.rReadinessLocked() | p.wReadinessLocked() } // EventRegister implements waiter.Waitable.EventRegister. func (p *Pipe) EventRegister(e *waiter.Entry) error { p.queue.EventRegister(e) return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (p *Pipe) EventUnregister(e *waiter.Entry) { p.queue.EventUnregister(e) } // queued returns the amount of queued data. func (p *Pipe) queued() int64 { p.mu.Lock() defer p.mu.Unlock() return p.queuedLocked() } func (p *Pipe) queuedLocked() int64 { return p.size } // SetFifoSize implements fs.FifoSizer.SetFifoSize. func (p *Pipe) SetFifoSize(size int64) (int64, error) { if size < 0 { return 0, linuxerr.EINVAL } if size < MinimumPipeSize { size = MinimumPipeSize // Per spec. } if size > MaximumPipeSize { return 0, linuxerr.EPERM } p.mu.Lock() defer p.mu.Unlock() if size < p.size { return 0, linuxerr.EBUSY } p.max = size return size, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pipe/pipe_mutex.go000066400000000000000000000031541465435605700256760ustar00rootroot00000000000000package pipe import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type pipeMutex struct { mu sync.Mutex } var pipeprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var pipelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type pipelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. const ( pipeLockPipe = pipelockNameIndex(0) ) const () // Lock locks m. // +checklocksignore func (m *pipeMutex) Lock() { locking.AddGLock(pipeprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *pipeMutex) NestedLock(i pipelockNameIndex) { locking.AddGLock(pipeprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *pipeMutex) Unlock() { locking.DelGLock(pipeprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *pipeMutex) NestedUnlock(i pipelockNameIndex) { locking.DelGLock(pipeprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func pipeinitLockNames() { pipelockNames = []string{"pipe"} } func init() { pipeinitLockNames() pipeprefixIndex = locking.NewMutexClass(reflect.TypeOf(pipeMutex{}), pipelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pipe/pipe_state_autogen.go000066400000000000000000000063261465435605700274020ustar00rootroot00000000000000// automatically generated by stateify. 
package pipe import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *Pipe) StateTypeName() string { return "pkg/sentry/kernel/pipe.Pipe" } func (p *Pipe) StateFields() []string { return []string{ "queue", "isNamed", "readers", "totalReaders", "writers", "totalWriters", "buf", "off", "size", "max", "hadWriter", } } func (p *Pipe) beforeSave() {} // +checklocksignore func (p *Pipe) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.queue) stateSinkObject.Save(1, &p.isNamed) stateSinkObject.Save(2, &p.readers) stateSinkObject.Save(3, &p.totalReaders) stateSinkObject.Save(4, &p.writers) stateSinkObject.Save(5, &p.totalWriters) stateSinkObject.Save(6, &p.buf) stateSinkObject.Save(7, &p.off) stateSinkObject.Save(8, &p.size) stateSinkObject.Save(9, &p.max) stateSinkObject.Save(10, &p.hadWriter) } // +checklocksignore func (p *Pipe) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.queue) stateSourceObject.Load(1, &p.isNamed) stateSourceObject.Load(2, &p.readers) stateSourceObject.Load(3, &p.totalReaders) stateSourceObject.Load(4, &p.writers) stateSourceObject.Load(5, &p.totalWriters) stateSourceObject.Load(6, &p.buf) stateSourceObject.Load(7, &p.off) stateSourceObject.Load(8, &p.size) stateSourceObject.Load(9, &p.max) stateSourceObject.Load(10, &p.hadWriter) stateSourceObject.AfterLoad(func() { p.afterLoad(ctx) }) } func (vp *VFSPipe) StateTypeName() string { return "pkg/sentry/kernel/pipe.VFSPipe" } func (vp *VFSPipe) StateFields() []string { return []string{ "pipe", } } func (vp *VFSPipe) beforeSave() {} // +checklocksignore func (vp *VFSPipe) StateSave(stateSinkObject state.Sink) { vp.beforeSave() stateSinkObject.Save(0, &vp.pipe) } func (vp *VFSPipe) afterLoad(context.Context) {} // +checklocksignore func (vp *VFSPipe) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &vp.pipe) } func (fd *VFSPipeFD) StateTypeName() string { return "pkg/sentry/kernel/pipe.VFSPipeFD" } func (fd *VFSPipeFD) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "LockFD", "pipe", "lastAddr", } } func (fd *VFSPipeFD) beforeSave() {} // +checklocksignore func (fd *VFSPipeFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &fd.LockFD) stateSinkObject.Save(4, &fd.pipe) stateSinkObject.Save(5, &fd.lastAddr) } func (fd *VFSPipeFD) afterLoad(context.Context) {} // +checklocksignore func (fd *VFSPipeFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &fd.LockFD) stateSourceObject.Load(4, &fd.pipe) stateSourceObject.Load(5, &fd.lastAddr) } func init() { state.Register((*Pipe)(nil)) state.Register((*VFSPipe)(nil)) state.Register((*VFSPipeFD)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pipe/pipe_unsafe.go000066400000000000000000000024521465435605700260150ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pipe import ( "unsafe" ) // lockTwoPipes locks both x.mu and y.mu in an order that is guaranteed to be // consistent for both lockTwoPipes(x, y) and lockTwoPipes(y, x), such that // concurrent calls cannot deadlock. // // Returns the two pipes in order (first locked pipe, second locked pipe). // The caller should unlock the second pipe first. // // Preconditions: x != y. // +checklocksacquire:x.mu // +checklocksacquire:y.mu func lockTwoPipes(x, y *Pipe) (*Pipe, *Pipe) { // Lock the two pipes in order of increasing address. if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) { x.mu.Lock() y.mu.NestedLock(pipeLockPipe) return x, y } y.mu.Lock() x.mu.NestedLock(pipeLockPipe) return y, x } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pipe/pipe_unsafe_state_autogen.go000066400000000000000000000000661465435605700307360ustar00rootroot00000000000000// automatically generated by stateify. package pipe golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pipe/pipe_util.go000066400000000000000000000105211465435605700255050ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pipe import ( "io" "math" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // This file contains Pipe file functionality that is tied to neither VFS nor // the old fs architecture. // Release cleans up the pipe's state. func (p *Pipe) Release(context.Context) { p.rClose() p.wClose() // Wake up readers and writers. p.queue.Notify(waiter.ReadableEvents | waiter.WritableEvents) } // Read reads from the Pipe into dst. func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error) { n, err := p.read(dst.NumBytes(), func(srcs safemem.BlockSeq) (uint64, error) { var done uint64 for !srcs.IsEmpty() { src := srcs.Head() n, err := dst.CopyOut(ctx, src.ToSlice()) done += uint64(n) if err != nil { return done, err } dst = dst.DropFirst(n) srcs = srcs.Tail() } return done, nil }, true /* removeFromSrc */) if n > 0 { p.queue.Notify(waiter.WritableEvents) } return n, err } func (p *Pipe) read(count int64, f func(srcs safemem.BlockSeq) (uint64, error), removeFromSrc bool) (int64, error) { p.mu.Lock() defer p.mu.Unlock() n, err := p.peekLocked(0, count, f) if n > 0 && removeFromSrc { p.consumeLocked(n) } return n, err } // WriteTo writes to w from the Pipe. 
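// If dup is true, the read data is left in the pipe (tee(2)-style); otherwise
// it is consumed and, if anything was read, waiting writers are notified of
// the freed space (splice(2)-style).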
func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) (int64, error) { n, err := p.read(count, func(srcs safemem.BlockSeq) (uint64, error) { return safemem.FromIOWriter{w}.WriteFromBlocks(srcs) }, !dup /* removeFromSrc */) if n > 0 && !dup { p.queue.Notify(waiter.WritableEvents) } return n, err } // Write writes to the Pipe from src. func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) { n, err := p.write(src.NumBytes(), func(dsts safemem.BlockSeq) (uint64, error) { var done uint64 for !dsts.IsEmpty() { dst := dsts.Head() n, err := src.CopyIn(ctx, dst.ToSlice()) done += uint64(n) if err != nil { return done, err } src = src.DropFirst(n) dsts = dsts.Tail() } return done, nil }) if n > 0 { p.queue.Notify(waiter.ReadableEvents) } if linuxerr.Equals(linuxerr.EPIPE, err) { // If we are returning EPIPE send SIGPIPE to the task. if sendSig := linux.SignalNoInfoFuncFromContext(ctx); sendSig != nil { sendSig(linux.SIGPIPE) } } return n, err } func (p *Pipe) write(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { p.mu.Lock() defer p.mu.Unlock() return p.writeLocked(count, f) } // ReadFrom reads from r to the Pipe. func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, error) { n, err := p.write(count, func(dsts safemem.BlockSeq) (uint64, error) { return safemem.FromIOReader{r}.ReadToBlocks(dsts) }) if n > 0 { p.queue.Notify(waiter.ReadableEvents) } return n, err } // Readiness returns the ready events in the underlying pipe. func (p *Pipe) Readiness(mask waiter.EventMask) waiter.EventMask { return p.rwReadiness() & mask } // Ioctl implements ioctls on the Pipe. func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { // Switch on ioctl request. switch int(args[1].Int()) { case linux.FIONREAD: v := p.queued() if v > math.MaxInt32 { v = math.MaxInt32 // Silently truncate. } // Copy result to userspace. iocc := usermem.IOCopyContext{ IO: io, Ctx: ctx, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, } _, err := primitive.CopyInt32Out(&iocc, args[2].Pointer(), int32(v)) return 0, err default: return 0, unix.ENOTTY } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pipe/save_restore.go000066400000000000000000000015761465435605700262260ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pipe import ( "context" "gvisor.dev/gvisor/pkg/safemem" ) // afterLoad is called by stateify. func (p *Pipe) afterLoad(context.Context) { p.bufBlocks[0] = safemem.BlockFromSafeSlice(p.buf) p.bufBlocks[1] = p.bufBlocks[0] p.bufBlockSeq = safemem.BlockSeqFromSlice(p.bufBlocks[:]) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/pipe/vfs.go000066400000000000000000000370341465435605700243210ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // This file contains types enabling the pipe package to be used with the vfs // package. // VFSPipe represents the actual pipe, analogous to an inode. VFSPipes should // not be copied. // // +stateify savable type VFSPipe struct { // pipe is the underlying pipe. pipe Pipe } // NewVFSPipe returns an initialized VFSPipe. func NewVFSPipe(isNamed bool, sizeBytes int64) *VFSPipe { var vp VFSPipe initPipe(&vp.pipe, isNamed, sizeBytes) return &vp } // ReaderWriterPair returns read-only and write-only FDs for vp. // // Preconditions: statusFlags should not contain an open access mode. func (vp *VFSPipe) ReaderWriterPair(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription, error) { // Connected pipes share the same locks. locks := &vfs.FileLocks{} r, err := vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks) if err != nil { return nil, nil, err } vp.pipe.rOpen() w, err := vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks) if err != nil { r.DecRef(ctx) return nil, nil, err } vp.pipe.wOpen() return r, w, nil } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error { return linuxerr.ESPIPE } // Open opens the pipe represented by vp. func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) { readable := vfs.MayReadFileWithOpenFlags(statusFlags) writable := vfs.MayWriteFileWithOpenFlags(statusFlags) if !readable && !writable { return nil, linuxerr.EINVAL } fd, err := vp.newFD(mnt, vfsd, statusFlags, locks) if err != nil { return nil, err } // Named pipes have special blocking semantics during open: // // "Normally, opening the FIFO blocks until the other end is opened also. A // process can open a FIFO in nonblocking mode. In this case, opening for // read-only will succeed even if no-one has opened on the write side yet, // opening for write-only will fail with ENXIO (no such device or address) // unless the other end has already been opened. Under Linux, opening a // FIFO for read and write will succeed both in blocking and nonblocking // mode. POSIX leaves this behavior undefined. This can be used to open a // FIFO for writing while there are no readers available." - fifo(7) switch { case readable && writable: vp.pipe.rOpen() vp.pipe.wOpen() // Pipes opened for read-write always succeed without blocking. 
case readable: tWriters := vp.pipe.totalWriters.Load() vp.pipe.rOpen() // If this pipe is being opened as blocking and there's no // writer, we have to wait for a writer to open the other end. for vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && tWriters == vp.pipe.totalWriters.Load() { if !ctx.BlockOn((*waitWriters)(&vp.pipe), waiter.EventInternal) { fd.DecRef(ctx) return nil, linuxerr.EINTR } } case writable: tReaders := vp.pipe.totalReaders.Load() vp.pipe.wOpen() for vp.pipe.isNamed && !vp.pipe.HasReaders() && tReaders == vp.pipe.totalReaders.Load() { // Non-blocking, write-only opens fail with ENXIO when the read // side isn't open yet. if statusFlags&linux.O_NONBLOCK != 0 { fd.DecRef(ctx) return nil, linuxerr.ENXIO } if !ctx.BlockOn((*waitReaders)(&vp.pipe), waiter.EventInternal) { fd.DecRef(ctx) return nil, linuxerr.EINTR } } default: panic("invalid pipe flags: must be readable, writable, or both") } return fd, nil } // Preconditions: vp.mu must be held. func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) { fd := &VFSPipeFD{ pipe: &vp.pipe, } fd.LockFD.Init(locks) if err := fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, UseDentryMetadata: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } // VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements // non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to // other FileDescriptions for splice(2) and tee(2). // // +stateify savable type VFSPipeFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.LockFD pipe *Pipe // lastAddr is the last hostarch.Addr at which a call to a // VFSPipeFD.(usermem.IO) method ended. lastAddr is protected by pipe.mu. lastAddr hostarch.Addr } // Release implements vfs.FileDescriptionImpl.Release. func (fd *VFSPipeFD) Release(context.Context) { var event waiter.EventMask if fd.vfsfd.IsReadable() { fd.pipe.rClose() event |= waiter.WritableEvents if !fd.pipe.HasReaders() { event |= waiter.EventErr } } if fd.vfsfd.IsWritable() { fd.pipe.wClose() event |= waiter.ReadableEvents | waiter.EventHUp } if event == 0 { panic("invalid pipe flags: must be readable, writable, or both") } fd.pipe.queue.Notify(event) } // Readiness implements waiter.Waitable.Readiness. func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { switch { case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable(): return fd.pipe.rwReadiness() case fd.vfsfd.IsReadable(): return fd.pipe.rReadiness() case fd.vfsfd.IsWritable(): return fd.pipe.wReadiness() default: panic("pipe FD is neither readable nor writable") } } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error { return linuxerr.ESPIPE } // EventRegister implements waiter.Waitable.EventRegister. func (fd *VFSPipeFD) EventRegister(e *waiter.Entry) error { fd.pipe.EventRegister(e) // Notify synchronously. e.NotifyEvent(fd.Readiness(^waiter.EventMask(0))) return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (fd *VFSPipeFD) EventUnregister(e *waiter.Entry) { fd.pipe.EventUnregister(e) } // Epollable implements FileDescriptionImpl.Epollable. func (fd *VFSPipeFD) Epollable() bool { return true } // Read implements vfs.FileDescriptionImpl.Read. 
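// The fifo(7) open semantics implemented by VFSPipe.Open above are visible to
// userspace. A minimal host-side sketch of the two non-blocking cases follows
// (a standalone example, not part of this package; the path is an arbitrary
// placeholder):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	const path = "/tmp/example.fifo" // hypothetical path
	if err := unix.Mkfifo(path, 0o600); err != nil {
		panic(err)
	}
	defer unix.Unlink(path)

	// Write-only + O_NONBLOCK fails with ENXIO while there is no reader.
	if _, err := unix.Open(path, unix.O_WRONLY|unix.O_NONBLOCK, 0); err != nil {
		fmt.Println("write-only open:", err) // ENXIO
	}

	// Read-only + O_NONBLOCK succeeds even though no writer has opened yet.
	rfd, err := unix.Open(path, unix.O_RDONLY|unix.O_NONBLOCK, 0)
	fmt.Println("read-only open:", rfd, err)
	if err == nil {
		unix.Close(rfd)
	}
}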
func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { return fd.pipe.Read(ctx, dst) } // Write implements vfs.FileDescriptionImpl.Write. func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { return fd.pipe.Write(ctx, src) } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { return fd.pipe.Ioctl(ctx, uio, sysno, args) } // PipeSize implements fcntl(F_GETPIPE_SZ). func (fd *VFSPipeFD) PipeSize() int64 { // Inline Pipe.FifoSize() since we don't have a fs.File. fd.pipe.mu.Lock() defer fd.pipe.mu.Unlock() return fd.pipe.max } // SetPipeSize implements fcntl(F_SETPIPE_SZ). func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { return fd.pipe.SetFifoSize(size) } // SpliceToNonPipe performs a splice operation from fd to a non-pipe file. func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) { fd.pipe.mu.Lock() // Cap the sequence at number of bytes actually available. if count > fd.pipe.size { count = fd.pipe.size } src := usermem.IOSequence{ IO: fd, Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(count)}), } var ( n int64 err error ) fd.lastAddr = 0 if off == -1 { n, err = out.Write(ctx, src, vfs.WriteOptions{}) } else { n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{}) } // Implementations of out.[P]Write() that ignore written data (e.g. // /dev/null) may skip calling src.CopyIn[To](), so: // // - We must call Pipe.consumeLocked() here rather than in fd.CopyIn[To](). // // - We must check if Pipe.peekLocked() would have returned ErrWouldBlock. fd.pipe.consumeLocked(n) if n == 0 && err == nil && fd.pipe.size == 0 && fd.pipe.HasWriters() { err = linuxerr.ErrWouldBlock } fd.pipe.mu.Unlock() if n > 0 { fd.pipe.queue.Notify(waiter.WritableEvents) } return n, err } // SpliceFromNonPipe performs a splice operation from a non-pipe file to fd. func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) { dst := usermem.IOSequence{ IO: fd, Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(count)}), } var ( n int64 err error ) fd.pipe.mu.Lock() fd.lastAddr = 0 if off == -1 { n, err = in.Read(ctx, dst, vfs.ReadOptions{}) } else { n, err = in.PRead(ctx, dst, off, vfs.ReadOptions{}) } fd.pipe.mu.Unlock() if n > 0 { fd.pipe.queue.Notify(waiter.ReadableEvents) } return n, err } // CopyIn implements usermem.IO.CopyIn. Note that it is the caller's // responsibility to call fd.pipe.Notify(waiter.WritableEvents) after the read // is completed. // // Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts usermem.IOOpts) (int, error) { if addr != fd.lastAddr { log.Traceback("Non-sequential VFSPipeFD.CopyIn: lastAddr=%#x addr=%#x", fd.lastAddr, addr) return 0, linuxerr.EINVAL } n, err := fd.pipe.peekLocked(int64(addr), int64(len(dst)), func(srcs safemem.BlockSeq) (uint64, error) { return safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), srcs) }) fd.lastAddr = addr + hostarch.Addr(n) return int(n), err } // CopyOut implements usermem.IO.CopyOut. Note that it is the caller's // responsibility to call fd.pipe.queue.Notify(waiter.ReadableEvents) after the // write is completed. // // Preconditions: fd.pipe.mu must be locked. 
func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts usermem.IOOpts) (int, error) { if addr != fd.lastAddr { log.Traceback("Non-sequential VFSPipeFD.CopyOut: lastAddr=%#x addr=%#x", fd.lastAddr, addr) return 0, linuxerr.EINVAL } n, err := fd.pipe.writeLocked(int64(len(src)), func(dsts safemem.BlockSeq) (uint64, error) { return safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src))) }) fd.lastAddr = addr + hostarch.Addr(n) return int(n), err } // ZeroOut implements usermem.IO.ZeroOut. // // Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { if addr != fd.lastAddr { log.Traceback("Non-sequential VFSPipeFD.ZeroOut: lastAddr=%#x addr=%#x", fd.lastAddr, addr) return 0, linuxerr.EINVAL } n, err := fd.pipe.writeLocked(toZero, func(dsts safemem.BlockSeq) (uint64, error) { return safemem.ZeroSeq(dsts) }) fd.lastAddr = addr + hostarch.Addr(n) return n, err } // CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's // responsibility to call fd.pipe.consumeLocked() and // fd.pipe.queue.Notify(waiter.WritableEvents) after the read is completed. // // Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { total := int64(0) for !ars.IsEmpty() { ar := ars.Head() if ar.Start != fd.lastAddr { log.Traceback("Non-sequential VFSPipeFD.CopyInTo: lastAddr=%#x addr=%#x", fd.lastAddr, ar.Start) return total, linuxerr.EINVAL } n, err := fd.pipe.peekLocked(int64(ar.Start), int64(ar.Length()), func(srcs safemem.BlockSeq) (uint64, error) { return dst.WriteFromBlocks(srcs) }) fd.lastAddr = ar.Start + hostarch.Addr(n) total += n if err != nil { return total, err } ars = ars.Tail() } return total, nil } // CopyOutFrom implements usermem.IO.CopyOutFrom. Note that it is the caller's // responsibility to call fd.pipe.queue.Notify(waiter.ReadableEvents) after the // write is completed. // // Preconditions: fd.pipe.mu must be locked. func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { total := int64(0) for !ars.IsEmpty() { ar := ars.Head() if ar.Start != fd.lastAddr { log.Traceback("Non-sequential VFSPipeFD.CopyOutFrom: lastAddr=%#x addr=%#x", fd.lastAddr, ar.Start) return total, linuxerr.EINVAL } n, err := fd.pipe.writeLocked(int64(ar.Length()), func(dsts safemem.BlockSeq) (uint64, error) { return src.ReadToBlocks(dsts) }) fd.lastAddr = ar.Start + hostarch.Addr(n) total += n if err != nil { return total, err } ars = ars.Tail() } return total, nil } // SwapUint32 implements usermem.IO.SwapUint32. func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { // How did a pipe get passed as the virtual address space to futex(2)? panic("VFSPipeFD.SwapUint32 called unexpectedly") } // CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly") } // LoadUint32 implements usermem.IO.LoadUint32. 
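// The sequential CopyIn/CopyOut/CopyInTo/CopyOutFrom methods above exist so
// that a VFSPipeFD can stand in as the usermem.IO side of splice(2) and
// tee(2). A minimal host-side sketch of the user-visible contrast between the
// two (tee leaves the data in the source pipe) follows; it is a standalone
// example, not part of this package:

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	var src, dst [2]int // index 0: read end, index 1: write end
	if err := unix.Pipe(src[:]); err != nil {
		panic(err)
	}
	if err := unix.Pipe(dst[:]); err != nil {
		panic(err)
	}

	msg := []byte("hello")
	if _, err := unix.Write(src[1], msg); err != nil {
		panic(err)
	}

	// tee(2) copies bytes from src to dst without consuming them from src.
	n, err := unix.Tee(src[0], dst[1], len(msg), 0)
	fmt.Println("tee:", n, err)

	buf := make([]byte, 16)
	nd, _ := unix.Read(dst[0], buf)
	fmt.Printf("dst read: %q\n", buf[:nd]) // "hello"
	ns, _ := unix.Read(src[0], buf)
	fmt.Printf("src still holds: %q\n", buf[:ns]) // "hello"
}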
func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr hostarch.Addr, opts usermem.IOOpts) (uint32, error) { panic("VFSPipeFD.LoadUint32 called unexpectedly") } // Splice reads up to count bytes from src and writes them to dst. It returns // the number of bytes moved. // // Preconditions: count > 0. func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */) } // Tee reads up to count bytes from src and writes them to dst, without // removing the read bytes from src. It returns the number of bytes copied. // // Preconditions: count > 0. func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */) } // Preconditions: count > 0. func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) { if dst.pipe == src.pipe { return 0, linuxerr.EINVAL } firstLocked, secondLocked := lockTwoPipes(dst.pipe, src.pipe) n, err := dst.pipe.writeLocked(count, func(dsts safemem.BlockSeq) (uint64, error) { n, err := src.pipe.peekLocked(0, int64(dsts.NumBytes()), func(srcs safemem.BlockSeq) (uint64, error) { return safemem.CopySeq(dsts, srcs) }) if n > 0 && removeFromSrc { src.pipe.consumeLocked(n) } return uint64(n), err }) secondLocked.mu.NestedUnlock(pipeLockPipe) firstLocked.mu.Unlock() if n > 0 { dst.pipe.queue.Notify(waiter.ReadableEvents) if removeFromSrc { src.pipe.queue.Notify(waiter.WritableEvents) } } return n, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/posixtimer.go000066400000000000000000000213351465435605700247660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) // IntervalTimer represents a POSIX interval timer as described by // timer_create(2). // // +stateify savable type IntervalTimer struct { timer *ktime.Timer // If target is not nil, it receives signo from timer expirations. If group // is true, these signals are thread-group-directed. These fields are // immutable. target *Task signo linux.Signal id linux.TimerID sigval uint64 group bool // If sigpending is true, a signal to target is already queued, and timer // expirations should increment overrunCur instead of sending another // signal. sigpending is protected by target's signal mutex. (If target is // nil, the timer will never send signals, so sigpending will be unused.) sigpending bool // If sigorphan is true, timer's setting has been changed since sigpending // last became true, such that overruns should no longer be counted in the // pending signals si_overrun. sigorphan is protected by target's signal // mutex. sigorphan bool // overrunCur is the number of overruns that have occurred since the last // time a signal was sent. overrunCur is protected by target's signal // mutex. 
overrunCur uint64 // Consider the last signal sent by this timer that has been dequeued. // overrunLast is the number of overruns that occurred between when this // signal was sent and when it was dequeued. Equivalently, overrunLast was // the value of overrunCur when this signal was dequeued. overrunLast is // protected by target's signal mutex. overrunLast uint64 } // DestroyTimer releases it's resources. func (it *IntervalTimer) DestroyTimer() { it.timer.Destroy() it.timerSettingChanged() // A destroyed IntervalTimer is still potentially reachable via a // pendingSignal; nil out timer so that it won't be saved. it.timer = nil } func (it *IntervalTimer) timerSettingChanged() { if it.target == nil { return } it.target.tg.pidns.owner.mu.RLock() defer it.target.tg.pidns.owner.mu.RUnlock() it.target.tg.signalHandlers.mu.Lock() defer it.target.tg.signalHandlers.mu.Unlock() it.sigorphan = true it.overrunCur = 0 it.overrunLast = 0 } // PauseTimer pauses the associated Timer. func (it *IntervalTimer) PauseTimer() { it.timer.Pause() } // ResumeTimer resumes the associated Timer. func (it *IntervalTimer) ResumeTimer() { it.timer.Resume() } // Preconditions: it.target's signal mutex must be locked. func (it *IntervalTimer) updateDequeuedSignalLocked(si *linux.SignalInfo) { it.sigpending = false if it.sigorphan { return } it.overrunLast = it.overrunCur it.overrunCur = 0 si.SetOverrun(saturateI32FromU64(it.overrunLast)) } // Preconditions: it.target's signal mutex must be locked. func (it *IntervalTimer) signalRejectedLocked() { it.sigpending = false if it.sigorphan { return } it.overrunCur++ } // NotifyTimer implements ktime.TimerListener.NotifyTimer. func (it *IntervalTimer) NotifyTimer(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { if it.target == nil { return ktime.Setting{}, false } it.target.tg.pidns.owner.mu.RLock() defer it.target.tg.pidns.owner.mu.RUnlock() it.target.tg.signalHandlers.mu.Lock() defer it.target.tg.signalHandlers.mu.Unlock() if it.sigpending { it.overrunCur += exp return ktime.Setting{}, false } // sigpending must be set before sendSignalTimerLocked() so that it can be // unset if the signal is discarded (in which case sendSignalTimerLocked() // will return nil). it.sigpending = true it.sigorphan = false it.overrunCur += exp - 1 si := &linux.SignalInfo{ Signo: int32(it.signo), Code: linux.SI_TIMER, } si.SetTimerID(it.id) si.SetSigval(it.sigval) // si_overrun is set when the signal is dequeued. if err := it.target.sendSignalTimerLocked(si, it.group, it); err != nil { it.signalRejectedLocked() } return ktime.Setting{}, false } // IntervalTimerCreate implements timer_create(2). func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux.TimerID, error) { t.tg.timerMu.Lock() defer t.tg.timerMu.Unlock() // Allocate a timer ID. var id linux.TimerID end := t.tg.nextTimerID for { id = t.tg.nextTimerID _, ok := t.tg.timers[id] t.tg.nextTimerID++ if t.tg.nextTimerID < 0 { t.tg.nextTimerID = 0 } if !ok { break } if t.tg.nextTimerID == end { return 0, linuxerr.EAGAIN } } // "The implementation of the default case where evp [sic] is NULL is // handled inside glibc, which invokes the underlying system call with a // suitably populated sigevent structure." - timer_create(2). This is // misleading; the timer_create syscall also handles a NULL sevp as // described by the man page // (kernel/time/posix-timers.c:sys_timer_create(), do_timer_create()). 
This // must be handled here instead of the syscall wrapper since sigval is the // timer ID, which isn't available until we allocate it in this function. if sigev == nil { sigev = &linux.Sigevent{ Signo: int32(linux.SIGALRM), Notify: linux.SIGEV_SIGNAL, Value: uint64(id), } } // Construct the timer. it := &IntervalTimer{ id: id, sigval: sigev.Value, } switch sigev.Notify { case linux.SIGEV_NONE: // leave it.target = nil case linux.SIGEV_SIGNAL, linux.SIGEV_THREAD: // POSIX SIGEV_THREAD semantics are implemented in userspace by libc; // to the kernel, SIGEV_THREAD and SIGEV_SIGNAL are equivalent. (See // Linux's kernel/time/posix-timers.c:good_sigevent().) it.target = t.tg.leader it.group = true case linux.SIGEV_THREAD_ID: t.tg.pidns.owner.mu.RLock() target, ok := t.tg.pidns.tasks[ThreadID(sigev.Tid)] t.tg.pidns.owner.mu.RUnlock() if !ok || target.tg != t.tg { return 0, linuxerr.EINVAL } it.target = target default: return 0, linuxerr.EINVAL } if sigev.Notify != linux.SIGEV_NONE { it.signo = linux.Signal(sigev.Signo) if !it.signo.IsValid() { return 0, linuxerr.EINVAL } } it.timer = ktime.NewTimer(c, it) t.tg.timers[id] = it return id, nil } // IntervalTimerDelete implements timer_delete(2). func (t *Task) IntervalTimerDelete(id linux.TimerID) error { t.tg.timerMu.Lock() defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { return linuxerr.EINVAL } delete(t.tg.timers, id) it.DestroyTimer() return nil } // IntervalTimerSettime implements timer_settime(2). func (t *Task) IntervalTimerSettime(id linux.TimerID, its linux.Itimerspec, abs bool) (linux.Itimerspec, error) { t.tg.timerMu.Lock() defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { return linux.Itimerspec{}, linuxerr.EINVAL } newS, err := ktime.SettingFromItimerspec(its, abs, it.timer.Clock()) if err != nil { return linux.Itimerspec{}, err } tm, oldS := it.timer.SwapAnd(newS, it.timerSettingChanged) its = ktime.ItimerspecFromSetting(tm, oldS) return its, nil } // IntervalTimerGettime implements timer_gettime(2). func (t *Task) IntervalTimerGettime(id linux.TimerID) (linux.Itimerspec, error) { t.tg.timerMu.Lock() defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { return linux.Itimerspec{}, linuxerr.EINVAL } tm, s := it.timer.Get() its := ktime.ItimerspecFromSetting(tm, s) return its, nil } // IntervalTimerGetoverrun implements timer_getoverrun(2). // // Preconditions: The caller must be running on the task goroutine. func (t *Task) IntervalTimerGetoverrun(id linux.TimerID) (int32, error) { t.tg.timerMu.Lock() defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { return 0, linuxerr.EINVAL } // By timer_create(2) invariant, either it.target == nil (in which case // it.overrunLast is immutably 0) or t.tg == it.target.tg; and the fact // that t is executing timer_getoverrun(2) means that t.tg can't be // completing execve, so t.tg.signalHandlers can't be changing, allowing us // to lock t.tg.signalHandlers.mu without holding the TaskSet mutex. t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() // This is consistent with Linux after 78c9c4dfbf8c ("posix-timers: // Sanitize overrun handling"). return saturateI32FromU64(it.overrunLast), nil } func saturateI32FromU64(x uint64) int32 { if x > math.MaxInt32 { return math.MaxInt32 } return int32(x) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/process_group_list.go000066400000000000000000000125321465435605700265070ustar00rootroot00000000000000package kernel // ElementMapper provides an identity mapping by default. 
// // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type processGroupElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (processGroupElementMapper) linkerFor(elem *ProcessGroup) *ProcessGroup { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type processGroupList struct { head *ProcessGroup tail *ProcessGroup } // Reset resets list l to the empty state. func (l *processGroupList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *processGroupList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *processGroupList) Front() *ProcessGroup { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *processGroupList) Back() *ProcessGroup { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *processGroupList) Len() (count int) { for e := l.Front(); e != nil; e = (processGroupElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *processGroupList) PushFront(e *ProcessGroup) { linker := processGroupElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { processGroupElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *processGroupList) PushFrontList(m *processGroupList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { processGroupElementMapper{}.linkerFor(l.head).SetPrev(m.tail) processGroupElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *processGroupList) PushBack(e *ProcessGroup) { linker := processGroupElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { processGroupElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *processGroupList) PushBackList(m *processGroupList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { processGroupElementMapper{}.linkerFor(l.tail).SetNext(m.head) processGroupElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. 
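// processGroupList above is a generated instantiation of an intrusive list:
// the prev/next links live on the ProcessGroup elements themselves, so
// insertion and removal are O(1) and allocate nothing. A distilled standalone
// sketch of that idea (node and nodeList are hypothetical names, not part of
// this package):

package main

import "fmt"

// node carries its own links, so list membership costs no extra allocation.
type node struct {
	value      int
	prev, next *node
}

type nodeList struct {
	head, tail *node
}

func (l *nodeList) pushBack(n *node) {
	n.prev, n.next = l.tail, nil
	if l.tail != nil {
		l.tail.next = n
	} else {
		l.head = n
	}
	l.tail = n
}

func (l *nodeList) remove(n *node) {
	if n.prev != nil {
		n.prev.next = n.next
	} else {
		l.head = n.next
	}
	if n.next != nil {
		n.next.prev = n.prev
	} else {
		l.tail = n.prev
	}
	n.prev, n.next = nil, nil
}

func main() {
	var l nodeList
	nodes := make([]*node, 3)
	for i := range nodes {
		nodes[i] = &node{value: i + 1}
		l.pushBack(nodes[i])
	}
	l.remove(nodes[1])
	for e := l.head; e != nil; e = e.next {
		fmt.Println(e.value) // 1, 3
	}
}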
// //go:nosplit func (l *processGroupList) InsertAfter(b, e *ProcessGroup) { bLinker := processGroupElementMapper{}.linkerFor(b) eLinker := processGroupElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { processGroupElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *processGroupList) InsertBefore(a, e *ProcessGroup) { aLinker := processGroupElementMapper{}.linkerFor(a) eLinker := processGroupElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { processGroupElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *processGroupList) Remove(e *ProcessGroup) { linker := processGroupElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { processGroupElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { processGroupElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type processGroupEntry struct { next *ProcessGroup prev *ProcessGroup } // Next returns the entry that follows e in the list. // //go:nosplit func (e *processGroupEntry) Next() *ProcessGroup { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *processGroupEntry) Prev() *ProcessGroup { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *processGroupEntry) SetNext(elem *ProcessGroup) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *processGroupEntry) SetPrev(elem *ProcessGroup) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/process_group_refs.go000066400000000000000000000102651465435605700264740ustar00rootroot00000000000000package kernel import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const ProcessGroupenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var ProcessGroupobj *ProcessGroup // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. 
// // +stateify savable type ProcessGroupRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *ProcessGroupRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *ProcessGroupRefs) RefType() string { return fmt.Sprintf("%T", ProcessGroupobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *ProcessGroupRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *ProcessGroupRefs) LogRefs() bool { return ProcessGroupenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *ProcessGroupRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *ProcessGroupRefs) IncRef() { v := r.refCount.Add(1) if ProcessGroupenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *ProcessGroupRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if ProcessGroupenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *ProcessGroupRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if ProcessGroupenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *ProcessGroupRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/ptrace.go000066400000000000000000001211151465435605700240360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
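// ProcessGroupRefs.TryIncRef above avoids a CompareAndSwap loop by briefly
// adding a "speculative" reference in the upper 32 bits of the counter and
// only converting it to a real reference if the lower 32 bits were non-zero.
// A standalone distillation of that trick using plain sync/atomic follows
// (counter and tryIncRef are hypothetical names, not the generated type):

package main

import (
	"fmt"
	"sync/atomic"
)

const speculativeRef = 1 << 32 // upper 32 bits: speculative, lower 32: real

type counter struct{ v atomic.Int64 }

// tryIncRef succeeds only if at least one real reference is already held.
func (c *counter) tryIncRef() bool {
	if v := c.v.Add(speculativeRef); int32(v) == 0 {
		// The real count was zero: back out the speculative reference.
		c.v.Add(-speculativeRef)
		return false
	}
	// Convert the speculative reference into a real one.
	c.v.Add(-speculativeRef + 1)
	return true
}

func main() {
	var c counter
	fmt.Println(c.tryIncRef()) // false: no real references yet
	c.v.Add(1)                 // analogous to InitRefs
	fmt.Println(c.tryIncRef()) // true
	fmt.Println(c.v.Load())    // 2
}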
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/usermem" ) // ptraceOptions are the subset of options controlling a task's ptrace behavior // that are set by ptrace(PTRACE_SETOPTIONS). // // +stateify savable type ptraceOptions struct { // ExitKill is true if the tracee should be sent SIGKILL when the tracer // exits. ExitKill bool // If SysGood is true, set bit 7 in the signal number for // syscall-entry-stop and syscall-exit-stop traps delivered to this task's // tracer. SysGood bool // TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE // events. TraceClone bool // TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC // events. TraceExec bool // TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT // events. TraceExit bool // TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK // events. TraceFork bool // TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP // events. TraceSeccomp bool // TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK // events. TraceVfork bool // TraceVforkDone is true if the tracer wants to receive // PTRACE_EVENT_VFORK_DONE events. TraceVforkDone bool } // ptraceSyscallMode controls the behavior of a ptraced task at syscall entry // and exit. type ptraceSyscallMode int const ( // ptraceSyscallNone indicates that the task has never ptrace-stopped, or // that it was resumed from its last ptrace-stop by PTRACE_CONT or // PTRACE_DETACH. The task's syscalls will not be intercepted. ptraceSyscallNone ptraceSyscallMode = iota // ptraceSyscallIntercept indicates that the task was resumed from its last // ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a // syscall, a ptrace-stop will occur. ptraceSyscallIntercept // ptraceSyscallEmu indicates that the task was resumed from its last // ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. The next time // the task enters a syscall, the syscall will be skipped, and a // ptrace-stop will occur. ptraceSyscallEmu ) // CanTrace checks that t is permitted to access target's state, as defined by // ptrace(2), subsection "Ptrace access mode checking". If attach is true, it // checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access // mode PTRACE_MODE_READ. // // In Linux, ptrace access restrictions may be configured by LSMs. While we do // not support LSMs, we do add additional restrictions based on the commoncap // and YAMA LSMs. // // TODO(gvisor.dev/issue/212): The result of CanTrace is immediately stale (e.g., a // racing setuid(2) may change traceability). This may pose a risk when a task // changes from traceable to not traceable. This is only problematic across // execve, where privileges may increase. // // We currently do not implement privileged executables (set-user/group-ID bits // and file capabilities), so that case is not reachable. 
func (t *Task) CanTrace(target *Task, attach bool) bool { // "If the calling thread and the target thread are in the same thread // group, access is always allowed." - ptrace(2) // // Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access() // should not deny sub-threads", first released in Linux 3.12), the rule // only applies if t and target are the same task. But, as that commit // message puts it, "[any] security check is pointless when the tasks share // the same ->mm." if t.tg == target.tg { return true } if !t.canTraceStandard(target, attach) { return false } if t.k.YAMAPtraceScope.Load() == linux.YAMA_SCOPE_RELATIONAL { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if !t.canTraceYAMALocked(target) { return false } } return true } // canTraceLocked is the same as CanTrace, except the caller must already hold // the TaskSet mutex (for reading or writing). func (t *Task) canTraceLocked(target *Task, attach bool) bool { if t.tg == target.tg { return true } if !t.canTraceStandard(target, attach) { return false } if t.k.YAMAPtraceScope.Load() == linux.YAMA_SCOPE_RELATIONAL { if !t.canTraceYAMALocked(target) { return false } } return true } // canTraceStandard performs standard ptrace access checks as defined by // kernel/ptrace.c:__ptrace_may_access as well as the commoncap LSM // implementation of the security_ptrace_access_check() interface, which is // always invoked. func (t *Task) canTraceStandard(target *Task, attach bool) bool { // """ // TODO(gvisor.dev/issue/260): 1. If the access mode specifies // PTRACE_MODE_FSCREDS (ED: snipped, doesn't exist until Linux 4.5). // // Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the // caller's real UID and GID for the checks in the next step. (Most APIs // that check the caller's UID and GID use the effective IDs. For // historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs // instead.) // // 2. Deny access if neither of the following is true: // // - The real, effective, and saved-set user IDs of the target match the // caller's user ID, *and* the real, effective, and saved-set group IDs of // the target match the caller's group ID. // // - The caller has the CAP_SYS_PTRACE capability in the user namespace of // the target. // // 3. Deny access if the target process "dumpable" attribute has a value // other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in // prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in // the user namespace of the target process. // // 4. The commoncap LSM performs the following steps: // // a) If the access mode includes PTRACE_MODE_FSCREDS, then use the // caller's effective capability set; otherwise (the access mode specifies // PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set. // // b) Deny access if neither of the following is true: // // - The caller and the target process are in the same user namespace, and // the caller's capabilities are a proper superset of the target process's // permitted capabilities. // // - The caller has the CAP_SYS_PTRACE capability in the target process's // user namespace. // // Note that the commoncap LSM does not distinguish between // PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this // section: "the commoncap LSM ... is always invoked".) 
// """ callerCreds := t.Credentials() targetCreds := target.Credentials() if callerCreds.HasCapabilityIn(linux.CAP_SYS_PTRACE, targetCreds.UserNamespace) { return true } if cuid := callerCreds.RealKUID; cuid != targetCreds.RealKUID || cuid != targetCreds.EffectiveKUID || cuid != targetCreds.SavedKUID { return false } if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID { return false } var targetMM *mm.MemoryManager target.WithMuLocked(func(t *Task) { targetMM = t.MemoryManager() }) if targetMM != nil && targetMM.Dumpability() != mm.UserDumpable { return false } if callerCreds.UserNamespace != targetCreds.UserNamespace { return false } if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 { return false } return true } // canTraceYAMALocked performs ptrace access checks as defined by the YAMA LSM // implementation of the security_ptrace_access_check() interface, with YAMA // configured to mode 1. This is a common default among various Linux // distributions. // // It only permits the tracer to proceed if one of the following conditions is // met: // // a) The tracer is already attached to the tracee. // // b) The target is a descendant of the tracer. // // c) The target has explicitly given permission to the tracer through the // PR_SET_PTRACER prctl. // // d) The tracer has CAP_SYS_PTRACE. // // See security/yama/yama_lsm.c:yama_ptrace_access_check. // // Precondition: the TaskSet mutex must be locked (for reading or writing). func (t *Task) canTraceYAMALocked(target *Task) bool { if tracer := target.Tracer(); tracer != nil { if tracer.tg == t.tg { return true } } if target.isYAMADescendantOfLocked(t) { return true } if target.hasYAMAExceptionForLocked(t) { return true } if t.HasCapabilityIn(linux.CAP_SYS_PTRACE, target.UserNamespace()) { return true } return false } // Determines whether t is considered a descendant of ancestor for the purposes // of YAMA permissions (specifically, whether t's thread group is descended from // ancestor's). // // Precondition: the TaskSet mutex must be locked (for reading or writing). func (t *Task) isYAMADescendantOfLocked(ancestor *Task) bool { walker := t for walker != nil { if walker.tg.leader == ancestor.tg.leader { return true } walker = walker.parent } return false } // Precondition: the TaskSet mutex must be locked (for reading or writing). func (t *Task) hasYAMAExceptionForLocked(tracer *Task) bool { allowed, ok := t.k.ptraceExceptions[t.tg.leader] if !ok { return false } return allowed == nil || tracer.isYAMADescendantOfLocked(allowed) } // ClearYAMAException removes any YAMA exception with t as the tracee. func (t *Task) ClearYAMAException() { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() tracee := t.tg.leader delete(t.k.ptraceExceptions, tracee) } // SetYAMAException creates a YAMA exception allowing all descendants of tracer // to trace t. If tracer is nil, then any task is allowed to trace t. // // If there was an existing exception, it is overwritten with the new one. func (t *Task) SetYAMAException(tracer *Task) { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() tracee := t.tg.leader tracee.ptraceYAMAExceptionAdded = true if tracer != nil { tracer.ptraceYAMAExceptionAdded = true } t.k.ptraceExceptions[tracee] = tracer } // Tracer returns t's ptrace Tracer. func (t *Task) Tracer() *Task { return t.ptraceTracer.Load() } // hasTracer returns true if t has a ptrace tracer attached. 
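// SetYAMAException above models the host's PR_SET_PTRACER prctl, which is how
// a tracee opts in to being traced under YAMA's "relational" scope. A minimal
// host-side sketch of a tracee granting that exception follows (a standalone
// example; prSetPtracer is spelled out locally rather than taken from a
// package constant, and allowing the parent process is just one illustrative
// choice of tracer):

package main

import (
	"os"

	"golang.org/x/sys/unix"
)

// prSetPtracer is Linux's PR_SET_PTRACER prctl option ("Yama" in ASCII).
const prSetPtracer = 0x59616d61

// allowTracer lets the process with the given PID ptrace-attach to this
// process even when /proc/sys/kernel/yama/ptrace_scope is 1.
func allowTracer(pid int) error {
	return unix.Prctl(prSetPtracer, uintptr(pid), 0, 0, 0)
}

func main() {
	// Example: allow our parent (e.g. a debugger that spawned us) to attach.
	if err := allowTracer(os.Getppid()); err != nil {
		panic(err)
	}
}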
func (t *Task) hasTracer() bool { // This isn't just inlined into callers so that if Task.Tracer() turns out // to be too expensive because of e.g. interface conversion, we can switch // to having a separate atomic flag more easily. return t.Tracer() != nil } // ptraceStop is a TaskStop placed on tasks in a ptrace-stop. // // +stateify savable type ptraceStop struct { // If frozen is true, the stopped task's tracer is currently operating on // it, so Task.Kill should not remove the stop. frozen bool // If listen is true, the stopped task's tracer invoked PTRACE_LISTEN, so // ptraceFreeze should fail. listen bool } // Killable implements TaskStop.Killable. func (s *ptraceStop) Killable() bool { return !s.frozen } // beginPtraceStopLocked initiates an unfrozen ptrace-stop on t. If t has been // killed, the stop is skipped, and beginPtraceStopLocked returns false. // // beginPtraceStopLocked does not signal t's tracer or wake it if it is // waiting. // // Preconditions: // - The TaskSet mutex must be locked. // - The caller must be running on the task goroutine. func (t *Task) beginPtraceStopLocked() bool { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() // This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... => // kernel/sched/core.c:__schedule() => signal_pending_state() check, which // is what prevents tasks from entering ptrace-stops after being killed. // Note that if t was SIGKILLed and beingPtraceStopLocked is being called // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before // entering the exit path, so t.killedLocked() will no longer return true. // This is consistent with Linux: "Bugs: ... A SIGKILL signal may still // cause a PTRACE_EVENT_EXIT stop before actual signal death. This may be // changed in the future; SIGKILL is meant to always immediately kill tasks // even under ptrace. Last confirmed on Linux 3.13." - ptrace(2) if t.killedLocked() { return false } t.beginInternalStopLocked(&ptraceStop{}) return true } // Preconditions: The TaskSet mutex must be locked. func (t *Task) ptraceTrapLocked(code int32) { // This is unconditional in ptrace_stop(). t.tg.signalHandlers.mu.Lock() t.trapStopPending = false t.tg.signalHandlers.mu.Unlock() t.ptraceCode = code t.ptraceSiginfo = &linux.SignalInfo{ Signo: int32(linux.SIGTRAP), Code: code, } t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t])) t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) if t.beginPtraceStopLocked() { tracer := t.Tracer() tracer.signalStop(t, linux.CLD_TRAPPED, int32(linux.SIGTRAP)) tracer.tg.eventQueue.Notify(EventTraceeStop) } } // ptraceFreeze checks if t is in a ptraceStop. If so, it freezes the // ptraceStop, temporarily preventing it from being removed by a concurrent // Task.Kill, and returns true. Otherwise it returns false. // // Preconditions: // - The TaskSet mutex must be locked. // - The caller must be running on the task goroutine of t's tracer. func (t *Task) ptraceFreeze() bool { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() if t.stop == nil { return false } s, ok := t.stop.(*ptraceStop) if !ok { return false } if s.listen { return false } s.frozen = true return true } // ptraceUnfreeze ends the effect of a previous successful call to // ptraceFreeze. // // Preconditions: t must be in a frozen ptraceStop. func (t *Task) ptraceUnfreeze() { // t.tg.signalHandlers is stable because t is in a frozen ptrace-stop, // preventing its thread group from completing execve. 
t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.ptraceUnfreezeLocked() } // Preconditions: // - t must be in a frozen ptraceStop. // - t's signal mutex must be locked. func (t *Task) ptraceUnfreezeLocked() { // Do this even if the task has been killed to ensure a panic if t.stop is // nil or not a ptraceStop. t.stop.(*ptraceStop).frozen = false if t.killedLocked() { t.endInternalStopLocked() } } // ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL, // PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on // mode and singlestep. // // Preconditions: t must be in a frozen ptrace stop. // // Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace // stop. func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error { if sig != 0 && !sig.IsValid() { return linuxerr.EIO } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() t.ptraceCode = int32(sig) t.ptraceSyscallMode = mode t.ptraceSinglestep = singlestep t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.endInternalStopLocked() return nil } func (t *Task) ptraceTraceme() error { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if t.hasTracer() { return linuxerr.EPERM } if t.parent == nil { // In Linux, only init can not have a parent, and init is assumed never // to invoke PTRACE_TRACEME. In the sentry, TGID 1 is an arbitrary user // application that may invoke PTRACE_TRACEME; having no parent can // also occur if all tasks in the parent thread group have exited, and // failed to find a living thread group to reparent to. The former case // is treated as if TGID 1 has an exited parent in an invisible // ancestor PID namespace that is an owner of the root user namespace // (and consequently has CAP_SYS_PTRACE), and the latter case is a // special form of the exited parent case below. In either case, // returning nil here is correct. return nil } if !t.parent.canTraceLocked(t, true) { return linuxerr.EPERM } if t.parent.exitState != TaskExitNone { // Fail silently, as if we were successfully attached but then // immediately detached. This is consistent with Linux. return nil } t.ptraceTracer.Store(t.parent) t.parent.ptraceTracees[t] = struct{}{} return nil } // ptraceAttach implements ptrace(PTRACE_ATTACH, target) if seize is false, and // ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller. func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { if t.tg == target.tg { return linuxerr.EPERM } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if !t.canTraceLocked(target, true) { return linuxerr.EPERM } if target.hasTracer() { return linuxerr.EPERM } // Attaching to zombies and dead tasks is not permitted; the exit // notification logic relies on this. Linux allows attaching to PF_EXITING // tasks, though. if target.exitState >= TaskExitZombie { return linuxerr.EPERM } if seize { if err := target.ptraceSetOptionsLocked(opts); err != nil { return linuxerr.EIO } } target.ptraceTracer.Store(t) t.ptraceTracees[target] = struct{}{} target.ptraceSeized = seize target.tg.signalHandlers.mu.Lock() // "Unlike PTRACE_ATTACH, PTRACE_SEIZE does not stop the process." 
- // ptrace(2) if !seize { target.sendSignalLocked(&linux.SignalInfo{ Signo: int32(linux.SIGSTOP), Code: linux.SI_USER, }, false /* group */) } // Undocumented Linux feature: If the tracee is already group-stopped (and // consequently will not report the SIGSTOP just sent), force it to leave // and re-enter the stop so that it will switch to a ptrace-stop. if target.stop == (*groupStop)(nil) { target.trapStopPending = true target.endInternalStopLocked() // TODO(jamieliu): Linux blocks ptrace_attach() until the task has // entered the ptrace-stop (or exited) via JOBCTL_TRAPPING. } target.tg.signalHandlers.mu.Unlock() return nil } // ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the // caller. // // Preconditions: target must be a tracee of t in a frozen ptrace stop. // // Postconditions: If ptraceDetach returns nil, target will no longer be in a // ptrace stop. func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error { if sig != 0 && !sig.IsValid() { return linuxerr.EIO } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() target.ptraceCode = int32(sig) target.forgetTracerLocked() delete(t.ptraceTracees, target) return nil } // exitPtrace is called in the exit path to detach all of t's tracees. func (t *Task) exitPtrace() { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() for target := range t.ptraceTracees { if target.ptraceOpts.ExitKill { target.tg.signalHandlers.mu.Lock() target.sendSignalLocked(&linux.SignalInfo{ Signo: int32(linux.SIGKILL), }, false /* group */) target.tg.signalHandlers.mu.Unlock() } // Leave ptraceCode unchanged so that if the task is ptrace-stopped, it // observes the ptraceCode it set before it entered the stop. I believe // this is consistent with Linux. target.forgetTracerLocked() } clear(t.ptraceTracees) // nil maps cannot be saved if t.ptraceYAMAExceptionAdded { delete(t.k.ptraceExceptions, t) for tracee, tracer := range t.k.ptraceExceptions { if tracer == t { delete(t.k.ptraceExceptions, tracee) } } } } // forgetTracerLocked detaches t's tracer and ensures that t is no longer // ptrace-stopped. // // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) forgetTracerLocked() { t.ptraceSeized = false t.ptraceOpts = ptraceOptions{} t.ptraceSyscallMode = ptraceSyscallNone t.ptraceSinglestep = false t.ptraceTracer.Store(nil) if t.exitTracerNotified && !t.exitTracerAcked { t.exitTracerAcked = true t.exitNotifyLocked(true) } t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() // Unset t.trapStopPending, which might have been set by PTRACE_INTERRUPT. If // it wasn't, it will be reset via t.groupStopPending after the following. t.trapStopPending = false // If t's thread group is in a group stop and t is eligible to participate, // make it do so. This is essentially the reverse of the special case in // ptraceAttach, which converts a group stop to a ptrace stop. ("Handling // of restart from group-stop is currently buggy, but the "as planned" // behavior is to leave tracee stopped and waiting for SIGCONT." - // ptrace(2)) if (t.tg.groupStopComplete || t.tg.groupStopPendingCount != 0) && !t.groupStopPending && t.exitState < TaskExitInitiated { t.groupStopPending = true // t already participated in the group stop when it unset // groupStopPending. t.groupStopAcknowledged = true t.interrupt() } if _, ok := t.stop.(*ptraceStop); ok { t.endInternalStopLocked() } } // ptraceSignalLocked is called after signal dequeueing to check if t should // enter ptrace signal-delivery-stop. 
// // Preconditions: // - The signal mutex must be locked. // - The caller must be running on the task goroutine. // // +checklocks:t.tg.signalHandlers.mu func (t *Task) ptraceSignalLocked(info *linux.SignalInfo) bool { if linux.Signal(info.Signo) == linux.SIGKILL { return false } if !t.hasTracer() { return false } // The tracer might change this signal into a stop signal, in which case // any SIGCONT received after the signal was originally dequeued should // cancel it. This is consistent with Linux. t.tg.groupStopDequeued = true // This is unconditional in ptrace_stop(). t.trapStopPending = false // Can't lock the TaskSet mutex while holding a signal mutex. t.tg.signalHandlers.mu.Unlock() defer t.tg.signalHandlers.mu.Lock() t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() tracer := t.Tracer() if tracer == nil { return false } t.ptraceCode = info.Signo t.ptraceSiginfo = info t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo) if t.beginPtraceStopLocked() { tracer.signalStop(t, linux.CLD_TRAPPED, info.Signo) tracer.tg.eventQueue.Notify(EventTraceeStop) } return true } // ptraceSeccomp is called when a seccomp-bpf filter returns action // SECCOMP_RET_TRACE to check if t should enter PTRACE_EVENT_SECCOMP stop. data // is the lower 16 bits of the filter's return value. func (t *Task) ptraceSeccomp(data uint16) bool { if !t.hasTracer() { return false } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if !t.ptraceOpts.TraceSeccomp { return false } t.Debugf("Entering PTRACE_EVENT_SECCOMP stop") t.ptraceEventLocked(linux.PTRACE_EVENT_SECCOMP, uint64(data)) return true } // ptraceSyscallEnter is called immediately before entering a syscall to check // if t should enter ptrace syscall-enter-stop. func (t *Task) ptraceSyscallEnter() (taskRunState, bool) { if !t.hasTracer() { return nil, false } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() switch t.ptraceSyscallMode { case ptraceSyscallNone: return nil, false case ptraceSyscallIntercept: t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL") t.ptraceSyscallStopLocked() return (*runSyscallAfterSyscallEnterStop)(nil), true case ptraceSyscallEmu: t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU") t.ptraceSyscallStopLocked() return (*runSyscallAfterSysemuStop)(nil), true } panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", t.ptraceSyscallMode)) } // ptraceSyscallExit is called immediately after leaving a syscall to check if // t should enter ptrace syscall-exit-stop. func (t *Task) ptraceSyscallExit() { if !t.hasTracer() { return } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if t.ptraceSyscallMode != ptraceSyscallIntercept { return } t.Debugf("Entering syscall-exit-stop") t.ptraceSyscallStopLocked() } // Preconditions: The TaskSet mutex must be locked. func (t *Task) ptraceSyscallStopLocked() { code := int32(linux.SIGTRAP) if t.ptraceOpts.SysGood { code |= 0x80 } t.ptraceTrapLocked(code) } type ptraceCloneKind int32 const ( // ptraceCloneKindClone represents a call to Task.Clone where // TerminationSignal is not SIGCHLD and Vfork is false. ptraceCloneKindClone ptraceCloneKind = iota // ptraceCloneKindFork represents a call to Task.Clone where // TerminationSignal is SIGCHLD and Vfork is false. ptraceCloneKindFork // ptraceCloneKindVfork represents a call to Task.Clone where Vfork is // true. 
ptraceCloneKindVfork ) // ptraceClone is called at the end of a clone or fork syscall to check if t // should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK // stop. child is the new task. func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, args *linux.CloneArgs) bool { if !t.hasTracer() { return false } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() event := false if args.Flags&linux.CLONE_UNTRACED == 0 { switch kind { case ptraceCloneKindClone: if t.ptraceOpts.TraceClone { t.Debugf("Entering PTRACE_EVENT_CLONE stop") t.ptraceEventLocked(linux.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child])) event = true } case ptraceCloneKindFork: if t.ptraceOpts.TraceFork { t.Debugf("Entering PTRACE_EVENT_FORK stop") t.ptraceEventLocked(linux.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child])) event = true } case ptraceCloneKindVfork: if t.ptraceOpts.TraceVfork { t.Debugf("Entering PTRACE_EVENT_VFORK stop") t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child])) event = true } default: panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind)) } } // "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE // options are in effect, then children created by, respectively, vfork(2) // or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit // signal set to SIGCHLD, and other kinds of clone(2), are automatically // attached to the same tracer which traced their parent. SIGSTOP is // delivered to the children, causing them to enter signal-delivery-stop // after they exit the system call which created them." - ptrace(2) // // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() => // include/linux/ptrace.h:ptrace_init_task(). if event || args.Flags&linux.CLONE_PTRACE != 0 { tracer := t.Tracer() if tracer != nil { child.ptraceTracer.Store(tracer) tracer.ptraceTracees[child] = struct{}{} // "The "seized" behavior ... is inherited by children that are // automatically attached using PTRACE_O_TRACEFORK, // PTRACE_O_TRACEVFORK, and PTRACE_O_TRACECLONE." - ptrace(2) child.ptraceSeized = t.ptraceSeized // "Flags are inherited by new tracees created and "auto-attached" // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or // PTRACE_O_TRACECLONE options." - ptrace(2) child.ptraceOpts = t.ptraceOpts child.tg.signalHandlers.mu.Lock() // "PTRACE_SEIZE: ... Automatically attached children stop with // PTRACE_EVENT_STOP and WSTOPSIG(status) returns SIGTRAP instead // of having SIGSTOP signal delivered to them." - ptrace(2) if child.ptraceSeized { child.trapStopPending = true } else { child.pendingSignals.enqueue(&linux.SignalInfo{ Signo: int32(linux.SIGSTOP), }, nil) } // The child will self-interrupt() when its task goroutine starts // running, so we don't have to. child.tg.signalHandlers.mu.Unlock() } } return event } // ptraceVforkDone is called after the end of a vfork stop to check if t should // enter PTRACE_EVENT_VFORK_DONE stop. child is the new task's thread ID in t's // PID namespace. func (t *Task) ptraceVforkDone(child ThreadID) bool { if !t.hasTracer() { return false } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if !t.ptraceOpts.TraceVforkDone { return false } t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop") t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK_DONE, uint64(child)) return true } // ptraceExec is called at the end of an execve syscall to check if t should // enter PTRACE_EVENT_EXEC stop. 
oldTID is t's thread ID, in its *tracer's* PID // namespace, prior to the execve. (If t did not have a tracer at the time // oldTID was read, oldTID may be 0. This is consistent with Linux.) func (t *Task) ptraceExec(oldTID ThreadID) { if !t.hasTracer() { return } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() // Recheck with the TaskSet mutex locked. Most ptrace points don't need to // do this because detaching resets ptrace options, but PTRACE_EVENT_EXEC // is special because both TraceExec and !TraceExec do something if a // tracer is attached. if !t.hasTracer() { return } if t.ptraceOpts.TraceExec { t.Debugf("Entering PTRACE_EVENT_EXEC stop") t.ptraceEventLocked(linux.PTRACE_EVENT_EXEC, uint64(oldTID)) return } // "If the PTRACE_O_TRACEEXEC option is not in effect for the execing // tracee, and if the tracee was PTRACE_ATTACHed rather that [sic] // PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after // execve(2) returns. This is an ordinary signal (similar to one which can // be generated by `kill -TRAP`, not a special kind of ptrace-stop. // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0 // (SI_USER). This signal may be blocked by signal mask, and thus may be // delivered (much) later." - ptrace(2) if t.ptraceSeized { return } t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.sendSignalLocked(&linux.SignalInfo{ Signo: int32(linux.SIGTRAP), Code: linux.SI_USER, }, false /* group */) } // ptraceExit is called early in the task exit path to check if t should enter // PTRACE_EVENT_EXIT stop. func (t *Task) ptraceExit() { if !t.hasTracer() { return } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if !t.ptraceOpts.TraceExit { return } t.tg.signalHandlers.mu.Lock() status := t.exitStatus t.tg.signalHandlers.mu.Unlock() t.Debugf("Entering PTRACE_EVENT_EXIT stop") t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, uint64(status)) } // Preconditions: The TaskSet mutex must be locked. func (t *Task) ptraceEventLocked(event int32, msg uint64) { t.ptraceEventMsg = msg // """ // PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning // with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An // additional bit is set in the higher byte of the status word: the value // status>>8 will be // // (SIGTRAP | PTRACE_EVENT_foo << 8). // // ... // // """ - ptrace(2) t.ptraceTrapLocked(int32(linux.SIGTRAP) | (event << 8)) } // ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller. func (t *Task) ptraceKill(target *Task) error { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if target.Tracer() != t { return linuxerr.ESRCH } target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() // "This operation is deprecated; do not use it! Instead, send a SIGKILL // directly using kill(2) or tgkill(2). The problem with PTRACE_KILL is // that it requires the tracee to be in signal-delivery-stop, otherwise it // may not work (i.e., may complete successfully but won't kill the // tracee)." 
- ptrace(2) if target.stop == nil { return nil } if _, ok := target.stop.(*ptraceStop); !ok { return nil } target.ptraceCode = int32(linux.SIGKILL) target.endInternalStopLocked() return nil } func (t *Task) ptraceInterrupt(target *Task) error { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if target.Tracer() != t { return linuxerr.ESRCH } if !target.ptraceSeized { return linuxerr.EIO } target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() if target.killedLocked() || target.exitState >= TaskExitInitiated { return nil } target.trapStopPending = true if s, ok := target.stop.(*ptraceStop); ok && s.listen { target.endInternalStopLocked() } target.interrupt() return nil } // Preconditions: // - The TaskSet mutex must be locked for writing. // - t must have a tracer. func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { const valid = uintptr(linux.PTRACE_O_EXITKILL | linux.PTRACE_O_TRACESYSGOOD | linux.PTRACE_O_TRACECLONE | linux.PTRACE_O_TRACEEXEC | linux.PTRACE_O_TRACEEXIT | linux.PTRACE_O_TRACEFORK | linux.PTRACE_O_TRACESECCOMP | linux.PTRACE_O_TRACEVFORK | linux.PTRACE_O_TRACEVFORKDONE) if opts&^valid != 0 { return linuxerr.EINVAL } t.ptraceOpts = ptraceOptions{ ExitKill: opts&linux.PTRACE_O_EXITKILL != 0, SysGood: opts&linux.PTRACE_O_TRACESYSGOOD != 0, TraceClone: opts&linux.PTRACE_O_TRACECLONE != 0, TraceExec: opts&linux.PTRACE_O_TRACEEXEC != 0, TraceExit: opts&linux.PTRACE_O_TRACEEXIT != 0, TraceFork: opts&linux.PTRACE_O_TRACEFORK != 0, TraceSeccomp: opts&linux.PTRACE_O_TRACESECCOMP != 0, TraceVfork: opts&linux.PTRACE_O_TRACEVFORK != 0, TraceVforkDone: opts&linux.PTRACE_O_TRACEVFORKDONE != 0, } return nil } // Ptrace implements the ptrace system call. func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { // PTRACE_TRACEME ignores all other arguments. if req == linux.PTRACE_TRACEME { return t.ptraceTraceme() } // All other ptrace requests operate on a current or future tracee // specified by pid. target := t.tg.pidns.TaskWithID(pid) if target == nil { return linuxerr.ESRCH } // PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already // a tracee. if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE { seize := req == linux.PTRACE_SEIZE if seize && addr != 0 { return linuxerr.EIO } return t.ptraceAttach(target, seize, uintptr(data)) } // PTRACE_KILL and PTRACE_INTERRUPT require that the target is a tracee, // but does not require that it is ptrace-stopped. if req == linux.PTRACE_KILL { return t.ptraceKill(target) } if req == linux.PTRACE_INTERRUPT { return t.ptraceInterrupt(target) } // All other ptrace requests require that the target is a ptrace-stopped // tracee, and freeze the ptrace-stop so the tracee can be operated on. t.tg.pidns.owner.mu.RLock() if target.Tracer() != t { t.tg.pidns.owner.mu.RUnlock() return linuxerr.ESRCH } if !target.ptraceFreeze() { t.tg.pidns.owner.mu.RUnlock() // "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE, // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." - // ptrace(2) return linuxerr.ESRCH } t.tg.pidns.owner.mu.RUnlock() // Even if the target has a ptrace-stop active, the tracee's task goroutine // may not yet have reached Task.doStop; wait for it to do so. This is safe // because there's no way for target to initiate a ptrace-stop and then // block (by calling Task.block) before entering it. 
// // Caveat: If tasks were just restored, the tracee's first call to // Task.Activate (in Task.run) occurs before its first call to Task.doStop, // which may block if the tracer's address space is active. t.UninterruptibleSleepStart(true) target.waitGoroutineStoppedOrExited() t.UninterruptibleSleepFinish(true) // Resuming commands end the ptrace stop, but only if successful. // PTRACE_LISTEN ends the ptrace stop if trapNotifyPending is already set on the // target. switch req { case linux.PTRACE_DETACH: if err := t.ptraceDetach(target, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil case linux.PTRACE_CONT: if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil case linux.PTRACE_SYSCALL: if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil case linux.PTRACE_SINGLESTEP: if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil case linux.PTRACE_SYSEMU: if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil case linux.PTRACE_SYSEMU_SINGLESTEP: if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil case linux.PTRACE_LISTEN: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if !target.ptraceSeized { return linuxerr.EIO } if target.ptraceSiginfo == nil { return linuxerr.EIO } if target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP { return linuxerr.EIO } target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() if target.trapNotifyPending { target.endInternalStopLocked() } else { target.stop.(*ptraceStop).listen = true target.ptraceUnfreezeLocked() } return nil } // All other ptrace requests expect us to unfreeze the stop. defer target.ptraceUnfreeze() switch req { case linux.PTRACE_PEEKTEXT, linux.PTRACE_PEEKDATA: // "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and // PTRACE_PEEKUSER requests have a different API: they store the result // at the address specified by the data parameter, and the return value // is the error flag." - ptrace(2) word := t.Arch().Native(0) if _, err := word.CopyIn(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr); err != nil { return err } _, err := word.CopyOut(t, data) return err case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA: word := t.Arch().Native(uintptr(data)) _, err := word.CopyOut(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr) return err case linux.PTRACE_GETREGSET: // "Read the tracee's registers. addr specifies, in an // architecture-dependent way, the type of registers to be read. ... // data points to a struct iovec, which describes the destination // buffer's location and length. On return, the kernel modifies iov.len // to indicate the actual number of bytes returned." - ptrace(2) ars, err := t.CopyInIovecs(data, 1) if err != nil { return err } ar := ars.Head() n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ Ctx: t, IO: t.MemoryManager(), Addr: ar.Start, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, }, int(ar.Length()), target.Kernel().FeatureSet()) if err != nil { return err } // Update iovecs to represent the range of the written register set. 
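// Per the ptrace(2) text quoted above, iov.len must be updated to the number
// of bytes actually written, so ar is truncated to [ar.Start, ar.Start+n)
// before being copied back out to the tracer.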
end, ok := ar.Start.AddLength(uint64(n)) if !ok { panic(fmt.Sprintf("%#x + %#x overflows. Invalid reg size > %#x", ar.Start, n, ar.Length())) } ar.End = end return t.CopyOutIovecs(data, hostarch.AddrRangeSeqOf(ar)) case linux.PTRACE_SETREGSET: ars, err := t.CopyInIovecs(data, 1) if err != nil { return err } ar := ars.Head() n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ Ctx: t, IO: t.MemoryManager(), Addr: ar.Start, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, }, int(ar.Length()), target.Kernel().FeatureSet()) if err != nil { return err } target.p.FullStateChanged() ar.End -= hostarch.Addr(n) return t.CopyOutIovecs(data, hostarch.AddrRangeSeqOf(ar)) case linux.PTRACE_GETSIGINFO: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if target.ptraceSiginfo == nil { return linuxerr.EINVAL } _, err := target.ptraceSiginfo.CopyOut(t, data) return err case linux.PTRACE_SETSIGINFO: var info linux.SignalInfo if _, err := info.CopyIn(t, data); err != nil { return err } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if target.ptraceSiginfo == nil { return linuxerr.EINVAL } target.ptraceSiginfo = &info return nil case linux.PTRACE_GETSIGMASK: if addr != linux.SignalSetSize { return linuxerr.EINVAL } mask := target.SignalMask() _, err := mask.CopyOut(t, data) return err case linux.PTRACE_SETSIGMASK: if addr != linux.SignalSetSize { return linuxerr.EINVAL } var mask linux.SignalSet if _, err := mask.CopyIn(t, data); err != nil { return err } // The target's task goroutine is stopped, so this is safe: target.SetSignalMask(mask &^ UnblockableSignals) return nil case linux.PTRACE_SETOPTIONS: t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() return target.ptraceSetOptionsLocked(uintptr(data)) case linux.PTRACE_GETEVENTMSG: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() _, err := primitive.CopyUint64Out(t, hostarch.Addr(data), target.ptraceEventMsg) return err // PEEKSIGINFO is unimplemented but seems to have no users anywhere. default: return t.ptraceArch(target, req, addr, data) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/ptrace_amd64.go000066400000000000000000000051011465435605700250250ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/usermem" ) // ptraceArch implements arch-specific ptrace commands. func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) error { switch req { case linux.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER n, err := target.Arch().PtracePeekUser(uintptr(addr)) if err != nil { return err } _, err = n.CopyOut(t, data) return err case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data)) case linux.PTRACE_GETREGS: // "Copy the tracee's general-purpose ... 
registers ... to the address // data in the tracer. ... (addr is ignored.) Note that SPARC systems // have the meaning of data and addr reversed ..." _, err := target.Arch().PtraceGetRegs(&usermem.IOReadWriter{ Ctx: t, IO: t.MemoryManager(), Addr: data, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, }) return err case linux.PTRACE_GETFPREGS: s := target.Arch().FloatingPointData() _, err := target.Arch().FloatingPointData().PtraceGetFPRegs(&usermem.IOReadWriter{ Ctx: t, IO: t.MemoryManager(), Addr: data, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, }, len(*s)) return err case linux.PTRACE_SETREGS: _, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{ Ctx: t, IO: t.MemoryManager(), Addr: data, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, }) if err == nil { target.p.FullStateChanged() } return err case linux.PTRACE_SETFPREGS: s := target.Arch().FloatingPointData() _, err := s.PtraceSetFPRegs(&usermem.IOReadWriter{ Ctx: t, IO: t.MemoryManager(), Addr: data, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, }, len(*s)) if err == nil { target.p.FullStateChanged() } return err default: return linuxerr.EIO } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/ptrace_arm64.go000066400000000000000000000016021465435605700250450ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package kernel import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" ) // ptraceArch implements arch-specific ptrace commands. func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) error { return linuxerr.EIO } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/rseq.go000066400000000000000000000300421465435605700235300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/usermem" ) // Restartable sequences. // // We support two different APIs for restartable sequences. // // 1. The upstream interface added in v4.18. // 2. The interface described in https://lwn.net/Articles/650333/. // // Throughout this file and other parts of the kernel, the latter is referred // to as "old rseq". This interface was never merged upstream, but is supported // for a limited set of applications that use it regardless. 
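// Illustration (not part of the upstream source): a minimal sketch of the
// "old rseq" restart rule that the types and helpers below implement. If a
// thread is preempted, or has a signal delivered to an application handler,
// while its instruction pointer lies inside the registered critical section,
// execution resumes at Restart (with the interrupted IP preserved for the
// application, in r10 on amd64). The helper name below is illustrative only;
// the real logic lives in Task.oldRSeqInterrupt later in this file.
func oldRSeqNeedsRestartSketch(ip hostarch.Addr, r OldRSeqCriticalRegion) bool {
	// A critical section starting at 0 means old rseq is disabled for the
	// thread group (see SetOldRSeqCriticalRegion); Contains reports false
	// for the resulting empty range, so no restart is performed.
	return r.CriticalSection.Contains(ip)
}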
// OldRSeqCriticalRegion describes an old rseq critical region. // // +stateify savable type OldRSeqCriticalRegion struct { // When a task in this thread group has its CPU preempted (as defined by // platform.ErrContextCPUPreempted) or has a signal delivered to an // application handler while its instruction pointer is in CriticalSection, // set the instruction pointer to Restart and application register r10 (on // amd64) to the former instruction pointer. CriticalSection hostarch.AddrRange Restart hostarch.Addr } // RSeqAvailable returns true if t supports (old and new) restartable sequences. func (t *Task) RSeqAvailable() bool { return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption() } // SetRSeq registers addr as this thread's rseq structure. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error { if t.rseqAddr != 0 { if t.rseqAddr != addr { return linuxerr.EINVAL } if t.rseqSignature != signature { return linuxerr.EINVAL } return linuxerr.EBUSY } // rseq must be aligned and correctly sized. if addr&(linux.AlignOfRSeq-1) != 0 { return linuxerr.EINVAL } if length != linux.SizeOfRSeq { return linuxerr.EINVAL } if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok { return linuxerr.EFAULT } t.rseqAddr = addr t.rseqSignature = signature // Initialize the CPUID. // // Linux implicitly does this on return from userspace, where failure // would cause SIGSEGV. if err := t.rseqUpdateCPU(); err != nil { t.rseqAddr = 0 t.rseqSignature = 0 t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return linuxerr.EFAULT } return nil } // ClearRSeq unregisters addr as this thread's rseq structure. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) ClearRSeq(addr hostarch.Addr, length, signature uint32) error { if t.rseqAddr == 0 { return linuxerr.EINVAL } if t.rseqAddr != addr { return linuxerr.EINVAL } if length != linux.SizeOfRSeq { return linuxerr.EINVAL } if t.rseqSignature != signature { return linuxerr.EPERM } if err := t.rseqClearCPU(); err != nil { return err } t.rseqAddr = 0 t.rseqSignature = 0 if t.oldRSeqCPUAddr == 0 { // rseqCPU no longer needed. t.rseqCPU = -1 } return nil } // OldRSeqCriticalRegion returns a copy of t's thread group's current // old restartable sequence. func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion { return *t.tg.oldRSeqCritical.Load() } // SetOldRSeqCriticalRegion replaces t's thread group's old restartable // sequence. // // Preconditions: t.RSeqAvailable() == true. func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error { // These checks are somewhat more lenient than in Linux, which (bizarrely) // requires r.CriticalSection to be non-empty and r.Restart to be // outside of r.CriticalSection, even if r.CriticalSection.Start == 0 // (which disables the critical region). if r.CriticalSection.Start == 0 { r.CriticalSection.End = 0 r.Restart = 0 t.tg.oldRSeqCritical.Store(&r) return nil } if r.CriticalSection.Start >= r.CriticalSection.End { return linuxerr.EINVAL } if r.CriticalSection.Contains(r.Restart) { return linuxerr.EINVAL } // TODO(jamieliu): check that r.CriticalSection and r.Restart are in // the application address range, for consistency with Linux. 
t.tg.oldRSeqCritical.Store(&r) return nil } // OldRSeqCPUAddr returns the address that old rseq will keep updated with t's // CPU number. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) OldRSeqCPUAddr() hostarch.Addr { return t.oldRSeqCPUAddr } // SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with // t's CPU number. // // Preconditions: // - t.RSeqAvailable() == true. // - The caller must be running on the task goroutine. // - t's AddressSpace must be active. func (t *Task) SetOldRSeqCPUAddr(addr hostarch.Addr) error { t.oldRSeqCPUAddr = addr // Check that addr is writable. // // N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's // unfortunate, but unlikely in a correct program. if err := t.rseqUpdateCPU(); err != nil { t.oldRSeqCPUAddr = 0 return linuxerr.EINVAL // yes, EINVAL, not err or EFAULT } return nil } // Preconditions: // - The caller must be running on the task goroutine. // - t's AddressSpace must be active. func (t *Task) rseqUpdateCPU() error { if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 { t.rseqCPU = -1 return nil } t.rseqCPU = int32(hostcpu.GetCPU()) // Update both CPUs, even if one fails. rerr := t.rseqCopyOutCPU() oerr := t.oldRSeqCopyOutCPU() if rerr != nil { return rerr } return oerr } // Preconditions: // - The caller must be running on the task goroutine. // - t's AddressSpace must be active. func (t *Task) oldRSeqCopyOutCPU() error { if t.oldRSeqCPUAddr == 0 { return nil } buf := t.CopyScratchBuffer(4) hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) _, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf) return err } // Preconditions: // - The caller must be running on the task goroutine. // - t's AddressSpace must be active. func (t *Task) rseqCopyOutCPU() error { if t.rseqAddr == 0 { return nil } buf := t.CopyScratchBuffer(8) // CPUIDStart and CPUID are the first two fields in linux.RSeq. hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart hostarch.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID // N.B. This write is not atomic, but since this occurs on the task // goroutine then as long as userspace uses a single-instruction read // it can't see an invalid value. _, err := t.CopyOutBytes(t.rseqAddr, buf) return err } // Preconditions: // - The caller must be running on the task goroutine. // - t's AddressSpace must be active. func (t *Task) rseqClearCPU() error { buf := t.CopyScratchBuffer(8) // CPUIDStart and CPUID are the first two fields in linux.RSeq. hostarch.ByteOrder.PutUint32(buf, 0) // CPUIDStart hostarch.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID // N.B. This write is not atomic, but since this occurs on the task // goroutine then as long as userspace uses a single-instruction read // it can't see an invalid value. _, err := t.CopyOutBytes(t.rseqAddr, buf) return err } // rseqAddrInterrupt checks if IP is in a critical section, and aborts if so. // // This is a bit complex since both the RSeq and RSeqCriticalSection structs // are stored in userspace. So we must: // // 1. Copy in the address of RSeqCriticalSection from RSeq. // 2. Copy in RSeqCriticalSection itself. // 3. Validate critical section struct version, address range, abort address. // 4. Validate the abort signature (4 bytes preceding abort IP match expected // signature). // // 5. Clear address of RSeqCriticalSection from RSeq. // 6. Finally, conditionally abort. // // See kernel/rseq.c:rseq_ip_fixup for reference. 
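//
// As a concrete example (values are illustrative): if the task registered
// rseq with signature 0x53053053 (the value glibc uses on x86), then step 4
// requires that the 4 bytes immediately preceding cs.Abort decode to
// 0x53053053; on a mismatch the task receives SIGSEGV instead of being
// redirected to the abort handler.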
// // Preconditions: // - The caller must be running on the task goroutine. // - t's AddressSpace must be active. func (t *Task) rseqAddrInterrupt() { if t.rseqAddr == 0 { return } critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection) if !ok { // SetRSeq should validate this. panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr)) } if t.Arch().Width() != 8 { // We only handle 64-bit for now. t.Debugf("Only 64-bit rseq supported.") t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } buf := t.CopyScratchBuffer(8) if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil { t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } critAddr := hostarch.Addr(hostarch.ByteOrder.Uint64(buf)) if critAddr == 0 { return } var cs linux.RSeqCriticalSection if _, err := cs.CopyIn(t, critAddr); err != nil { t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } if cs.Version != 0 { t.Debugf("Unknown version in %+v", cs) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } start := hostarch.Addr(cs.Start) critRange, ok := start.ToRange(cs.PostCommitOffset) if !ok { t.Debugf("Invalid start and offset in %+v", cs) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } abort := hostarch.Addr(cs.Abort) if critRange.Contains(abort) { t.Debugf("Abort in critical section in %+v", cs) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } // Verify signature. sigAddr := abort - linux.SizeOfRSeqSignature buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature) if _, err := t.CopyInBytes(sigAddr, buf); err != nil { t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } sig := hostarch.ByteOrder.Uint32(buf) if sig != t.rseqSignature { t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } // Clear the critical section address. // // NOTE(b/143949567): We don't support any rseq flags, so we always // restart if we are in the critical section, and thus *always* clear // critAddrAddr. if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{ AddressSpaceActive: true, }); err != nil { t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } // Finally we can actually decide whether or not to restart. if !critRange.Contains(hostarch.Addr(t.Arch().IP())) { return } t.Arch().SetIP(uintptr(cs.Abort)) } // Preconditions: The caller must be running on the task goroutine. 
func (t *Task) oldRSeqInterrupt() { r := t.tg.oldRSeqCritical.Load() if ip := t.Arch().IP(); r.CriticalSection.Contains(hostarch.Addr(ip)) { t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart) t.Arch().SetIP(uintptr(r.Restart)) t.Arch().SetOldRSeqInterruptedIP(ip) } } // Preconditions: The caller must be running on the task goroutine. func (t *Task) rseqInterrupt() { t.rseqAddrInterrupt() t.oldRSeqInterrupt() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/running_tasks_mutex.go000066400000000000000000000033221465435605700266660ustar00rootroot00000000000000package kernel import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type runningTasksMutex struct { mu sync.Mutex } var runningTasksprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var runningTaskslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type runningTaskslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *runningTasksMutex) Lock() { locking.AddGLock(runningTasksprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *runningTasksMutex) NestedLock(i runningTaskslockNameIndex) { locking.AddGLock(runningTasksprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *runningTasksMutex) Unlock() { locking.DelGLock(runningTasksprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *runningTasksMutex) NestedUnlock(i runningTaskslockNameIndex) { locking.DelGLock(runningTasksprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func runningTasksinitLockNames() {} func init() { runningTasksinitLockNames() runningTasksprefixIndex = locking.NewMutexClass(reflect.TypeOf(runningTasksMutex{}), runningTaskslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/sched/000077500000000000000000000000001465435605700233165ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/sched/cpuset.go000066400000000000000000000054541465435605700251600ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sched import "math/bits" const ( bitsPerByte = 8 bytesPerLong = 8 // only for 64-bit architectures ) // CPUSet contains a bitmap to record CPU information. // // Note that this definition is only correct for little-endian architectures, // since Linux's cpumask_t uses unsigned long. 
type CPUSet []byte // CPUSetSize returns the size in bytes of a CPUSet that can contain num cpus. func CPUSetSize(num uint) uint { // NOTE(b/68859821): Applications may expect that the size of a CPUSet in // bytes is always a multiple of sizeof(unsigned long), since this is true // in Linux. Thus we always round up. bytes := (num + bitsPerByte - 1) / bitsPerByte longs := (bytes + bytesPerLong - 1) / bytesPerLong return longs * bytesPerLong } // NewCPUSet returns a CPUSet for the given number of CPUs which initially // contains no CPUs. func NewCPUSet(num uint) CPUSet { return CPUSet(make([]byte, CPUSetSize(num))) } // NewFullCPUSet returns a CPUSet for the given number of CPUs, all of which // are present in the set. func NewFullCPUSet(num uint) CPUSet { c := NewCPUSet(num) var i uint for ; i < num/bitsPerByte; i++ { c[i] = 0xff } if rem := num % bitsPerByte; rem != 0 { c[i] = (1 << rem) - 1 } return c } // Size returns the size of 'c' in bytes. func (c CPUSet) Size() uint { return uint(len(c)) } // NumCPUs returns how many cpus are set in the CPUSet. func (c CPUSet) NumCPUs() uint { var n int for _, b := range c { n += bits.OnesCount8(b) } return uint(n) } // Copy returns a copy of the CPUSet. func (c CPUSet) Copy() CPUSet { return append(CPUSet(nil), c...) } // Set sets the bit corresponding to cpu. func (c *CPUSet) Set(cpu uint) { (*c)[cpu/bitsPerByte] |= 1 << (cpu % bitsPerByte) } // ClearAbove clears bits corresponding to cpu and all higher cpus. func (c *CPUSet) ClearAbove(cpu uint) { i := cpu / bitsPerByte if i >= c.Size() { return } (*c)[i] &^= 0xff << (cpu % bitsPerByte) clear((*c)[i+1 : c.Size()]) } // ForEachCPU iterates over the CPUSet and calls fn with the cpu index if // it's set. func (c CPUSet) ForEachCPU(fn func(uint)) { for i := uint(0); i < c.Size()*bitsPerByte; i++ { bit := uint(1) << (i & (bitsPerByte - 1)) if uint(c[i/bitsPerByte])&bit == bit { fn(i) } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/sched/sched.go000066400000000000000000000012331465435605700247320ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package sched implements scheduler related features. package sched golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/sched/sched_state_autogen.go000066400000000000000000000000671465435605700276600ustar00rootroot00000000000000// automatically generated by stateify. package sched golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/seccheck.go000066400000000000000000000054561465435605700243410ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/sentry/seccheck" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" ) func getTaskCurrentWorkingDirectory(t *Task) string { // Grab the filesystem context first since it needs tasks.mu to be locked. // It's safe to unlock and use the values obtained here as long as there's // no way to modify root and wd from a separate task. t.k.tasks.mu.RLock() root := t.FSContext().RootDirectory() wd := t.FSContext().WorkingDirectory() t.k.tasks.mu.RUnlock() // Perform VFS operations outside of task mutex to avoid circular locking with // filesystem mutexes. var cwd string if root.Ok() { defer root.DecRef(t) if wd.Ok() { defer wd.DecRef(t) vfsObj := root.Mount().Filesystem().VirtualFilesystem() cwd, _ = vfsObj.PathnameWithDeleted(t, root, wd) } } return cwd } // LoadSeccheckData sets info from the task based on mask. func LoadSeccheckData(t *Task, mask seccheck.FieldMask, info *pb.ContextData) { var cwd string if mask.Contains(seccheck.FieldCtxtCwd) { cwd = getTaskCurrentWorkingDirectory(t) } t.k.tasks.mu.RLock() defer t.k.tasks.mu.RUnlock() LoadSeccheckDataLocked(t, mask, info, cwd) } // LoadSeccheckDataLocked sets info from the task based on mask. // // Preconditions: The TaskSet mutex must be locked. func LoadSeccheckDataLocked(t *Task, mask seccheck.FieldMask, info *pb.ContextData, cwd string) { if mask.Contains(seccheck.FieldCtxtTime) { info.TimeNs = t.k.RealtimeClock().Now().Nanoseconds() } if mask.Contains(seccheck.FieldCtxtThreadID) { info.ThreadId = int32(t.k.tasks.Root.tids[t]) } if mask.Contains(seccheck.FieldCtxtThreadStartTime) { info.ThreadStartTimeNs = t.startTime.Nanoseconds() } if mask.Contains(seccheck.FieldCtxtThreadGroupID) { info.ThreadGroupId = int32(t.k.tasks.Root.tgids[t.tg]) } if mask.Contains(seccheck.FieldCtxtThreadGroupStartTime) { info.ThreadGroupStartTimeNs = t.tg.leader.startTime.Nanoseconds() } if mask.Contains(seccheck.FieldCtxtContainerID) { info.ContainerId = t.tg.leader.ContainerID() } if mask.Contains(seccheck.FieldCtxtCwd) { info.Cwd = cwd } if mask.Contains(seccheck.FieldCtxtProcessName) { info.ProcessName = t.Name() } t.Credentials().LoadSeccheckData(mask, info) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/seccomp.go000066400000000000000000000255761465435605700242270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package kernel import ( "fmt" "reflect" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/abi/sentry" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" ) const ( maxSyscallFilterInstructions = 1 << 15 // uncacheableBPFAction is an invalid seccomp action code. // It is used as a sentinel value in `taskSeccompFilters.cache` to indicate // that a specific syscall number is uncachable. uncacheableBPFAction = linux.SECCOMP_RET_ACTION_FULL ) // taskSeccomp holds seccomp-related data for a `Task`. // // +stateify savable type taskSeccomp struct { // filters is the list of seccomp programs that are applied to the task, // in the order in which they were installed. filters []bpf.Program // cache maps syscall numbers to the action to take for that syscall number. // It is only populated for syscalls where determining this action does not // involve any input data other than the architecture and the syscall // number in any of `filters`. // If any other input is necessary, the cache stores `uncacheableBPFAction` // to indicate that this syscall number's rules are not cacheable. cache [sentry.MaxSyscallNum + 1]linux.BPFAction // cacheAuditNumber is the AUDIT_ARCH_* constant of the task image used // at the time of computing `cache`. cacheAuditNumber uint32 } // copy returns a copy of this `taskSeccomp`. func (ts *taskSeccomp) copy() *taskSeccomp { return &taskSeccomp{ filters: append(([]bpf.Program)(nil), ts.filters...), cacheAuditNumber: ts.cacheAuditNumber, cache: ts.cache, } } // dataAsBPFInput returns a serialized BPF program, only valid on the current task // goroutine. // // Note: this is called for every syscall, which is a very hot path. func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input { buf := t.CopyScratchBuffer(d.SizeBytes()) d.MarshalUnsafe(buf) return buf[:d.SizeBytes()] } func seccompSiginfo(t *Task, errno, sysno int32, ip hostarch.Addr) *linux.SignalInfo { si := &linux.SignalInfo{ Signo: int32(linux.SIGSYS), Errno: errno, Code: linux.SYS_SECCOMP, } si.SetCallAddr(uint64(ip)) si.SetSyscall(sysno) si.SetArch(t.SyscallTable().AuditNumber) return si } // checkSeccompSyscall applies the task's seccomp filters before the execution // of syscall sysno at instruction pointer ip. (These parameters must be passed // in because vsyscalls do not use the values in t.Arch().) // // Preconditions: The caller must be running on the task goroutine. func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) linux.BPFAction { result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip)) action := result & linux.SECCOMP_RET_ACTION switch action { case linux.SECCOMP_RET_TRAP: // "Results in the kernel sending a SIGSYS signal to the triggering // task without executing the system call. ... The SECCOMP_RET_DATA // portion of the return value will be passed as si_errno." - // Documentation/prctl/seccomp_filter.txt t.SendSignal(seccompSiginfo(t, int32(result.Data()), sysno, ip)) // "The return value register will contain an arch-dependent value." In // practice, it's ~always the syscall number. t.Arch().SetReturn(uintptr(sysno)) case linux.SECCOMP_RET_ERRNO: // "Results in the lower 16-bits of the return value being passed to // userland as the errno without executing the system call." 
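// The SECCOMP_RET_DATA portion of the filter's return value becomes the
// errno: the syscall is skipped and its return register is set to -errno.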
t.Arch().SetReturn(-uintptr(result.Data())) case linux.SECCOMP_RET_TRACE: // "When returned, this value will cause the kernel to attempt to // notify a ptrace()-based tracer prior to executing the system call. // If there is no tracer present, -ENOSYS is returned to userland and // the system call is not executed." if !t.ptraceSeccomp(result.Data()) { // This useless-looking temporary is needed because Go. tmp := uintptr(unix.ENOSYS) t.Arch().SetReturn(-tmp) return linux.SECCOMP_RET_ERRNO } case linux.SECCOMP_RET_ALLOW: // "Results in the system call being executed." case linux.SECCOMP_RET_KILL_THREAD: // "Results in the task exiting immediately without executing the // system call. The exit status of the task will be SIGSYS, not // SIGKILL." default: // consistent with Linux return linux.SECCOMP_RET_KILL_THREAD } return action } func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) uint32 { ret := uint32(linux.SECCOMP_RET_ALLOW) ts := t.seccomp.Load() if ts == nil { return ret } arch := t.image.st.AuditNumber if arch == ts.cacheAuditNumber && sysno >= 0 && sysno <= sentry.MaxSyscallNum { if cached := ts.cache[sysno]; cached != uncacheableBPFAction { return uint32(cached) } } data := linux.SeccompData{ Nr: sysno, Arch: arch, InstructionPointer: uint64(ip), } // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so // we can't do any slicing tricks or even use copy/append here. for i, arg := range args { if i >= len(data.Args) { break } data.Args[i] = arg.Uint64() } input := dataAsBPFInput(t, &data) // "Every filter successfully installed will be evaluated (in reverse // order) for each system call the task makes." - kernel/seccomp.c for i := len(ts.filters) - 1; i >= 0; i-- { thisRet, err := bpf.Exec[bpf.NativeEndian](ts.filters[i], input) if err != nil { t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) thisRet = uint32(linux.SECCOMP_RET_KILL_THREAD) } // "If multiple filters exist, the return value for the evaluation of a // given system call will always use the highest precedent value." - // Documentation/prctl/seccomp_filter.txt // // (Note that this contradicts prctl(2): "If the filters permit prctl() // calls, then additional filters can be added; they are run in order // until the first non-allow result is seen." prctl(2) is incorrect.) // // "The ordering ensures that a min_t() over composed return values // always selects the least permissive choice." - // include/uapi/linux/seccomp.h if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) { ret = thisRet } } return ret } // checkFilterCacheability executes `program` on the given `input`, and // checks if its result is cacheable. If it is, it returns that result. func checkFilterCacheability(program bpf.Program, input bpf.Input) (uint32, error) { // Look up Nr and Arch fields, we'll use their offsets later // to verify whether they were accessed. sdType := reflect.TypeOf(linux.SeccompData{}) nrField, ok := sdType.FieldByName("Nr") if !ok { panic("linux.SeccompData.Nr field not found") } archField, ok := sdType.FieldByName("Arch") if !ok { panic("linux.SeccompData.Arch field not found") } exec, err := bpf.InstrumentedExec[bpf.NativeEndian](program, input) if err != nil { return 0, err } for offset, accessed := range exec.InputAccessed { if !accessed { continue // Input byte not accessed by the program. 
} if uintptr(offset) >= nrField.Offset && uintptr(offset) < nrField.Offset+nrField.Type.Size() { continue // The program accessed the "Nr" field, this is OK. } if uintptr(offset) >= archField.Offset && uintptr(offset) < archField.Offset+archField.Type.Size() { continue // The program accessed the "Arch" field, this is OK. } return 0, fmt.Errorf("program accessed byte at offset %d which is not the sysno or arch field", offset) } return exec.ReturnValue, nil } // populateCache recomputes `ts.cache` from `ts.filters`. func (ts *taskSeccomp) populateCache(t *Task) { ts.cacheAuditNumber = t.image.st.AuditNumber sd := linux.SeccompData{} input := bpf.Input(make([]byte, sd.SizeBytes())) for sysno := int32(0); sysno <= sentry.MaxSyscallNum; sysno++ { sd.Nr = sysno sd.Arch = ts.cacheAuditNumber clear(input) sd.MarshalBytes(input) sysnoIsCacheable := true ret := linux.BPFAction(linux.SECCOMP_RET_ALLOW) // See notes in `evaluateSyscallFilters` for how to properly interpret // seccomp filter and results. We use the same approach here: iterate // through filters backwards, and take the smallest result. // If any filter is not cacheable, then we cannot cache the result for // this sysno. for i := len(ts.filters) - 1; i >= 0; i-- { result, cacheErr := checkFilterCacheability(ts.filters[i], input) if cacheErr != nil { sysnoIsCacheable = false break } if (linux.BPFAction(result) & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) { ret = linux.BPFAction(result) } } if sysnoIsCacheable { ts.cache[sysno] = ret } else { ts.cache[sysno] = uncacheableBPFAction } } } // AppendSyscallFilter adds BPF program p as a system call filter. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error { // While syscallFilters are an atomic.Value we must take the mutex to prevent // our read-copy-update from happening while another task is syncing syscall // filters to us, this keeps the filters in a consistent state. t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() // Cap the combined length of all syscall filters (plus a penalty of 4 // instructions per filter beyond the first) to maxSyscallFilterInstructions. // This restriction is inherited from Linux. totalLength := p.Length() newSeccomp := &taskSeccomp{} if ts := t.seccomp.Load(); ts != nil { for _, f := range ts.filters { totalLength += f.Length() + 4 } newSeccomp.filters = append(newSeccomp.filters, ts.filters...) } if totalLength > maxSyscallFilterInstructions { return linuxerr.ENOMEM } newSeccomp.filters = append(newSeccomp.filters, p) newSeccomp.populateCache(t) t.seccomp.Store(newSeccomp) if syncAll { // Note: No new privs is always assumed to be set. for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() { if ot != t { seccompCopy := newSeccomp.copy() seccompCopy.populateCache(ot) ot.seccomp.Store(seccompCopy) } } } return nil } // SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current // seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) // and /proc/[pid]/status. 
func (t *Task) SeccompMode() int { if ts := t.seccomp.Load(); ts != nil && len(ts.filters) > 0 { return linux.SECCOMP_MODE_FILTER } return linux.SECCOMP_MODE_NONE } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/semaphore/000077500000000000000000000000001465435605700242135ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/semaphore/semaphore.go000066400000000000000000000421311465435605700265260ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package semaphore implements System V semaphores. package semaphore import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) const ( // Maximum semaphore value. valueMax = linux.SEMVMX // Maximum number of semaphore sets. setsMax = linux.SEMMNI // Maximum number of semaphores in a semaphore set. semsMax = linux.SEMMSL // Maximum number of semaphores in all semaphore sets. semsTotalMax = linux.SEMMNS ) // Registry maintains a set of semaphores that can be found by key or ID. // // +stateify savable type Registry struct { // mu protects all fields below. mu sync.Mutex `state:"nosave"` // reg defines basic fields and operations needed for all SysV registries. reg *ipc.Registry // indexes maintains a mapping between a set's index in virtual array and // its identifier. indexes map[int32]ipc.ID } // Set represents a set of semaphores that can be operated atomically. // // +stateify savable type Set struct { // registry owning this sem set. Immutable. registry *Registry // mu protects all fields below. mu sync.Mutex `state:"nosave"` obj *ipc.Object opTime ktime.Time changeTime ktime.Time // sems holds all semaphores in the set. The slice itself is immutable after // it's been set, however each 'sem' object in the slice requires 'mu' lock. sems []sem // dead is set to true when the set is removed and can't be reached anymore. // All waiters must wake up and fail when set is dead. dead bool } // sem represents a single semaphore from a set. // // +stateify savable type sem struct { value int16 waiters waiterList `state:"zerovalue"` pid int32 } // waiter represents a caller that is waiting for the semaphore value to // become positive or zero. // // +stateify savable type waiter struct { waiterEntry // value represents how much resource the waiter needs to wake up. // The value is either 0 or negative. value int16 ch chan struct{} } // NewRegistry creates a new semaphore set registry. func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ reg: ipc.NewRegistry(userNS), indexes: make(map[int32]ipc.ID), } } // FindOrCreate searches for a semaphore set that matches 'key'. If not found, // it may create a new one if requested. 
If private is true, key is ignored and // a new set is always created. If create is false, it fails if a set cannot // be found. If exclusive is true, it fails if a set with the same key already // exists. func (r *Registry) FindOrCreate(ctx context.Context, key ipc.Key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) { if nsems < 0 || nsems > semsMax { return nil, linuxerr.EINVAL } r.mu.Lock() defer r.mu.Unlock() if !private { set, err := r.reg.Find(ctx, key, mode, create, exclusive) if err != nil { return nil, err } // Validate semaphore-specific parameters. if set != nil { set := set.(*Set) if nsems > int32(set.Size()) { return nil, linuxerr.EINVAL } return set, nil } } // Zero is only valid if an existing set is found. if nsems == 0 { return nil, linuxerr.EINVAL } // Apply system limits. // // Map reg.objects and map indexes in a registry are of the same size, // check map reg.objects only here for the system limit. if r.reg.ObjectCount() >= setsMax { return nil, linuxerr.ENOSPC } if r.totalSems() > int(semsTotalMax-nsems) { return nil, linuxerr.ENOSPC } // Finally create a new set. return r.newSetLocked(ctx, key, auth.CredentialsFromContext(ctx), mode, nsems) } // IPCInfo returns information about system-wide semaphore limits and parameters. func (r *Registry) IPCInfo() *linux.SemInfo { return &linux.SemInfo{ SemMap: linux.SEMMAP, SemMni: linux.SEMMNI, SemMns: linux.SEMMNS, SemMnu: linux.SEMMNU, SemMsl: linux.SEMMSL, SemOpm: linux.SEMOPM, SemUme: linux.SEMUME, SemUsz: linux.SEMUSZ, SemVmx: linux.SEMVMX, SemAem: linux.SEMAEM, } } // SemInfo returns a seminfo structure containing the same information as // for IPC_INFO, except that SemUsz field returns the number of existing // semaphore sets, and SemAem field returns the number of existing semaphores. func (r *Registry) SemInfo() *linux.SemInfo { r.mu.Lock() defer r.mu.Unlock() info := r.IPCInfo() info.SemUsz = uint32(r.reg.ObjectCount()) info.SemAem = uint32(r.totalSems()) return info } // HighestIndex returns the index of the highest used entry in // the kernel's array. func (r *Registry) HighestIndex() int32 { r.mu.Lock() defer r.mu.Unlock() // By default, highest used index is 0 even though // there is no semaphore set. var highestIndex int32 for index := range r.indexes { if index > highestIndex { highestIndex = index } } return highestIndex } // Remove removes set with give 'id' from the registry and marks the set as // dead. All waiters will be awakened and fail. func (r *Registry) Remove(id ipc.ID, creds *auth.Credentials) error { r.mu.Lock() defer r.mu.Unlock() index, found := r.findIndexByID(id) if !found { return linuxerr.EINVAL } delete(r.indexes, index) r.reg.Remove(id, creds) return nil } // newSetLocked creates a new Set using given fields. An error is returned if there // are no more available identifiers. // // Precondition: r.mu must be held. func (r *Registry) newSetLocked(ctx context.Context, key ipc.Key, creator *auth.Credentials, mode linux.FileMode, nsems int32) (*Set, error) { set := &Set{ registry: r, obj: ipc.NewObject(r.reg.UserNS, ipc.Key(key), creator, creator, mode), changeTime: ktime.NowFromContext(ctx), sems: make([]sem, nsems), } err := r.reg.Register(set) if err != nil { return nil, err } index, found := r.findFirstAvailableIndex() if !found { // See linux, ipc/sem.c:newary(). return nil, linuxerr.ENOSPC } r.indexes[index] = set.obj.ID return set, nil } // FindByID looks up a set given an ID. 
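// Illustrative usage sketch (not part of the original source): roughly how a
// semget(2)-style handler would call Registry.FindOrCreate above. The ctx,
// key and flag values are assumed to come from the calling task and syscall
// arguments; the nsems and mode values here are arbitrary.
func exampleSemget(ctx context.Context, r *Registry, key ipc.Key) (*Set, error) {
	// IPC_CREAT|IPC_EXCL semantics: create a 4-semaphore set with mode 0600,
	// failing if a set with the same key already exists.
	return r.FindOrCreate(ctx, key, 4 /* nsems */, 0600, false /* private */, true /* create */, true /* exclusive */)
}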
func (r *Registry) FindByID(id ipc.ID) *Set { r.mu.Lock() defer r.mu.Unlock() mech := r.reg.FindByID(id) if mech == nil { return nil } return mech.(*Set) } // FindByIndex looks up a set given an index. func (r *Registry) FindByIndex(index int32) *Set { r.mu.Lock() defer r.mu.Unlock() id, present := r.indexes[index] if !present { return nil } return r.reg.FindByID(id).(*Set) } func (r *Registry) findIndexByID(id ipc.ID) (int32, bool) { for k, v := range r.indexes { if v == id { return k, true } } return 0, false } func (r *Registry) findFirstAvailableIndex() (int32, bool) { for index := int32(0); index < setsMax; index++ { if _, present := r.indexes[index]; !present { return index, true } } return 0, false } func (r *Registry) totalSems() int { totalSems := 0 r.reg.ForAllObjects( func(o ipc.Mechanism) { totalSems += o.(*Set).Size() }, ) return totalSems } // ID returns semaphore's ID. func (s *Set) ID() ipc.ID { return s.obj.ID } // Object implements ipc.Mechanism.Object. func (s *Set) Object() *ipc.Object { return s.obj } // Lock implements ipc.Mechanism.Lock. func (s *Set) Lock() { s.mu.Lock() } // Unlock implements ipc.mechanism.Unlock. // // +checklocksignore func (s *Set) Unlock() { s.mu.Unlock() } func (s *Set) findSem(num int32) *sem { if num < 0 || int(num) >= s.Size() { return nil } return &s.sems[num] } // Size returns the number of semaphores in the set. Size is immutable. func (s *Set) Size() int { return len(s.sems) } // Set modifies attributes for a semaphore set. See semctl(IPC_SET). func (s *Set) Set(ctx context.Context, ds *linux.SemidDS) error { s.mu.Lock() defer s.mu.Unlock() if err := s.obj.Set(ctx, &ds.SemPerm); err != nil { return err } s.changeTime = ktime.NowFromContext(ctx) return nil } // GetStat extracts semid_ds information from the set. func (s *Set) GetStat(creds *auth.Credentials) (*linux.SemidDS, error) { // "The calling process must have read permission on the semaphore set." return s.semStat(creds, vfs.MayRead) } // GetStatAny extracts semid_ds information from the set without requiring read access. func (s *Set) GetStatAny(creds *auth.Credentials) (*linux.SemidDS, error) { return s.semStat(creds, 0) } func (s *Set) semStat(creds *auth.Credentials, ats vfs.AccessTypes) (*linux.SemidDS, error) { s.mu.Lock() defer s.mu.Unlock() if !s.obj.CheckPermissions(creds, ats) { return nil, linuxerr.EACCES } return &linux.SemidDS{ SemPerm: linux.IPCPerm{ Key: uint32(s.obj.Key), UID: uint32(creds.UserNamespace.MapFromKUID(s.obj.OwnerUID)), GID: uint32(creds.UserNamespace.MapFromKGID(s.obj.OwnerGID)), CUID: uint32(creds.UserNamespace.MapFromKUID(s.obj.CreatorUID)), CGID: uint32(creds.UserNamespace.MapFromKGID(s.obj.CreatorGID)), Mode: uint16(s.obj.Mode), Seq: 0, // IPC sequence not supported. }, SemOTime: s.opTime.TimeT(), SemCTime: s.changeTime.TimeT(), SemNSems: uint64(s.Size()), }, nil } // SetVal overrides a semaphore value, waking up waiters as needed. func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error { if val < 0 || val > valueMax { return linuxerr.ERANGE } s.mu.Lock() defer s.mu.Unlock() // "The calling process must have alter permission on the semaphore set." if !s.obj.CheckPermissions(creds, vfs.MayWrite) { return linuxerr.EACCES } sem := s.findSem(num) if sem == nil { return linuxerr.ERANGE } // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. 
sem.value = val sem.pid = pid s.changeTime = ktime.NowFromContext(ctx) sem.wakeWaiters() return nil } // SetValAll overrides all semaphores values, waking up waiters as needed. It also // sets semaphore's PID which was fixed in Linux 4.6. // // 'len(vals)' must be equal to 's.Size()'. func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credentials, pid int32) error { if len(vals) != s.Size() { panic(fmt.Sprintf("vals length (%d) different that Set.Size() (%d)", len(vals), s.Size())) } for _, val := range vals { if val > valueMax { return linuxerr.ERANGE } } s.mu.Lock() defer s.mu.Unlock() // "The calling process must have alter permission on the semaphore set." if !s.obj.CheckPermissions(creds, vfs.MayWrite) { return linuxerr.EACCES } for i, val := range vals { sem := &s.sems[i] // TODO(gvisor.dev/issue/137): Clear undo entries in all processes. sem.value = int16(val) sem.pid = pid sem.wakeWaiters() } s.changeTime = ktime.NowFromContext(ctx) return nil } // GetVal returns a semaphore value. func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) { s.mu.Lock() defer s.mu.Unlock() // "The calling process must have read permission on the semaphore set." if !s.obj.CheckPermissions(creds, vfs.MayRead) { return 0, linuxerr.EACCES } sem := s.findSem(num) if sem == nil { return 0, linuxerr.ERANGE } return sem.value, nil } // GetValAll returns value for all semaphores. func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) { s.mu.Lock() defer s.mu.Unlock() // "The calling process must have read permission on the semaphore set." if !s.obj.CheckPermissions(creds, vfs.MayRead) { return nil, linuxerr.EACCES } vals := make([]uint16, s.Size()) for i, sem := range s.sems { vals[i] = uint16(sem.value) } return vals, nil } // GetPID returns the PID set when performing operations in the semaphore. func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) { s.mu.Lock() defer s.mu.Unlock() // "The calling process must have read permission on the semaphore set." if !s.obj.CheckPermissions(creds, vfs.MayRead) { return 0, linuxerr.EACCES } sem := s.findSem(num) if sem == nil { return 0, linuxerr.ERANGE } return sem.pid, nil } func (s *Set) countWaiters(num int32, creds *auth.Credentials, pred func(w *waiter) bool) (uint16, error) { s.mu.Lock() defer s.mu.Unlock() // The calling process must have read permission on the semaphore set. if !s.obj.CheckPermissions(creds, vfs.MayRead) { return 0, linuxerr.EACCES } sem := s.findSem(num) if sem == nil { return 0, linuxerr.ERANGE } var cnt uint16 for w := sem.waiters.Front(); w != nil; w = w.Next() { if pred(w) { cnt++ } } return cnt, nil } // CountZeroWaiters returns number of waiters waiting for the sem's value to increase. func (s *Set) CountZeroWaiters(num int32, creds *auth.Credentials) (uint16, error) { return s.countWaiters(num, creds, func(w *waiter) bool { return w.value == 0 }) } // CountNegativeWaiters returns number of waiters waiting for the sem to go to zero. func (s *Set) CountNegativeWaiters(num int32, creds *auth.Credentials) (uint16, error) { return s.countWaiters(num, creds, func(w *waiter) bool { return w.value < 0 }) } // ExecuteOps attempts to execute a list of operations to the set. It only // succeeds when all operations can be applied. No changes are made if it fails. // // On failure, it may return an error (retries are hopeless) or it may return // a channel that can be waited on before attempting again. 
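// Illustrative sketch (not part of the original source): the retry loop a
// semop(2)-style caller builds around ExecuteOps below. When the operations
// cannot complete yet, ExecuteOps queues a waiter and returns its channel;
// the caller blocks on it and retries. A real caller would block
// interruptibly and call AbortWait with the returned semaphore number if it
// gives up before being woken.
func exampleSemop(ctx context.Context, s *Set, ops []linux.Sembuf, creds *auth.Credentials, pid int32) error {
	for {
		ch, _, err := s.ExecuteOps(ctx, ops, creds, pid)
		if err != nil {
			return err
		}
		if ch == nil {
			return nil // All operations were applied atomically.
		}
		<-ch // Woken by SetVal/SetValAll/ExecuteOps on another task; retry.
	}
}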
func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials, pid int32) (chan struct{}, int32, error) { s.mu.Lock() defer s.mu.Unlock() // Did it race with a removal operation? if s.dead { return nil, 0, linuxerr.EIDRM } // Validate the operations. readOnly := true for _, op := range ops { if s.findSem(int32(op.SemNum)) == nil { return nil, 0, linuxerr.EFBIG } if op.SemOp != 0 { readOnly = false } } ats := vfs.MayRead if !readOnly { ats = vfs.MayWrite } if !s.obj.CheckPermissions(creds, ats) { return nil, 0, linuxerr.EACCES } ch, num, err := s.executeOps(ctx, ops, pid) if err != nil { return nil, 0, err } return ch, num, nil } func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (chan struct{}, int32, error) { // Changes to semaphores go to this slice temporarily until they all succeed. tmpVals := make([]int16, len(s.sems)) for i := range s.sems { tmpVals[i] = s.sems[i].value } for _, op := range ops { sem := &s.sems[op.SemNum] if op.SemOp == 0 { // Handle 'wait for zero' operation. if tmpVals[op.SemNum] != 0 { // Semaphore isn't 0, must wait. if op.SemFlg&linux.IPC_NOWAIT != 0 { return nil, 0, linuxerr.ErrWouldBlock } w := newWaiter(op.SemOp) sem.waiters.PushBack(w) return w.ch, int32(op.SemNum), nil } } else { if op.SemOp < 0 { // Handle 'wait' operation. if -op.SemOp > valueMax { return nil, 0, linuxerr.ERANGE } if -op.SemOp > tmpVals[op.SemNum] { // Not enough resources, must wait. if op.SemFlg&linux.IPC_NOWAIT != 0 { return nil, 0, linuxerr.ErrWouldBlock } w := newWaiter(op.SemOp) sem.waiters.PushBack(w) return w.ch, int32(op.SemNum), nil } } else { // op.SemOp > 0: Handle 'signal' operation. if tmpVals[op.SemNum] > valueMax-op.SemOp { return nil, 0, linuxerr.ERANGE } } tmpVals[op.SemNum] += op.SemOp } } // All operations succeeded, apply them. // TODO(gvisor.dev/issue/137): handle undo operations. for i, v := range tmpVals { s.sems[i].value = v s.sems[i].wakeWaiters() s.sems[i].pid = pid } s.opTime = ktime.NowFromContext(ctx) return nil, 0, nil } // AbortWait notifies that a waiter is giving up and will not wait on the // channel anymore. func (s *Set) AbortWait(num int32, ch chan struct{}) { s.mu.Lock() defer s.mu.Unlock() sem := &s.sems[num] for w := sem.waiters.Front(); w != nil; w = w.Next() { if w.ch == ch { sem.waiters.Remove(w) return } } // Waiter may not be found in case it raced with wakeWaiters(). } // Destroy implements ipc.Mechanism.Destroy. // // Preconditions: Caller must hold 's.mu'. func (s *Set) Destroy() { // Notify all waiters. They will fail on the next attempt to execute // operations and return error. s.dead = true for _, s := range s.sems { for w := s.waiters.Front(); w != nil; w = w.Next() { w.ch <- struct{}{} } s.waiters.Reset() } } func abs(val int16) int16 { if val < 0 { return -val } return val } // wakeWaiters goes over all waiters and checks which of them can be notified. func (s *sem) wakeWaiters() { // Note that this will release all waiters waiting for 0 too. for w := s.waiters.Front(); w != nil; { if s.value < abs(w.value) { // Still blocked, skip it. w = w.Next() continue } w.ch <- struct{}{} old := w w = w.Next() s.waiters.Remove(old) } } func newWaiter(val int16) *waiter { return &waiter{ value: val, ch: make(chan struct{}, 1), } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/semaphore/semaphore_state_autogen.go000066400000000000000000000106551465435605700314560ustar00rootroot00000000000000// automatically generated by stateify. 
package semaphore import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *Registry) StateTypeName() string { return "pkg/sentry/kernel/semaphore.Registry" } func (r *Registry) StateFields() []string { return []string{ "reg", "indexes", } } func (r *Registry) beforeSave() {} // +checklocksignore func (r *Registry) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.reg) stateSinkObject.Save(1, &r.indexes) } func (r *Registry) afterLoad(context.Context) {} // +checklocksignore func (r *Registry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.reg) stateSourceObject.Load(1, &r.indexes) } func (s *Set) StateTypeName() string { return "pkg/sentry/kernel/semaphore.Set" } func (s *Set) StateFields() []string { return []string{ "registry", "obj", "opTime", "changeTime", "sems", "dead", } } func (s *Set) beforeSave() {} // +checklocksignore func (s *Set) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.registry) stateSinkObject.Save(1, &s.obj) stateSinkObject.Save(2, &s.opTime) stateSinkObject.Save(3, &s.changeTime) stateSinkObject.Save(4, &s.sems) stateSinkObject.Save(5, &s.dead) } func (s *Set) afterLoad(context.Context) {} // +checklocksignore func (s *Set) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.registry) stateSourceObject.Load(1, &s.obj) stateSourceObject.Load(2, &s.opTime) stateSourceObject.Load(3, &s.changeTime) stateSourceObject.Load(4, &s.sems) stateSourceObject.Load(5, &s.dead) } func (s *sem) StateTypeName() string { return "pkg/sentry/kernel/semaphore.sem" } func (s *sem) StateFields() []string { return []string{ "value", "pid", } } func (s *sem) beforeSave() {} // +checklocksignore func (s *sem) StateSave(stateSinkObject state.Sink) { s.beforeSave() if !state.IsZeroValue(&s.waiters) { state.Failf("waiters is %#v, expected zero", &s.waiters) } stateSinkObject.Save(0, &s.value) stateSinkObject.Save(1, &s.pid) } func (s *sem) afterLoad(context.Context) {} // +checklocksignore func (s *sem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.value) stateSourceObject.Load(1, &s.pid) } func (w *waiter) StateTypeName() string { return "pkg/sentry/kernel/semaphore.waiter" } func (w *waiter) StateFields() []string { return []string{ "waiterEntry", "value", "ch", } } func (w *waiter) beforeSave() {} // +checklocksignore func (w *waiter) StateSave(stateSinkObject state.Sink) { w.beforeSave() stateSinkObject.Save(0, &w.waiterEntry) stateSinkObject.Save(1, &w.value) stateSinkObject.Save(2, &w.ch) } func (w *waiter) afterLoad(context.Context) {} // +checklocksignore func (w *waiter) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &w.waiterEntry) stateSourceObject.Load(1, &w.value) stateSourceObject.Load(2, &w.ch) } func (l *waiterList) StateTypeName() string { return "pkg/sentry/kernel/semaphore.waiterList" } func (l *waiterList) StateFields() []string { return []string{ "head", "tail", } } func (l *waiterList) beforeSave() {} // +checklocksignore func (l *waiterList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *waiterList) afterLoad(context.Context) {} // +checklocksignore func (l *waiterList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *waiterEntry) StateTypeName() 
string { return "pkg/sentry/kernel/semaphore.waiterEntry" } func (e *waiterEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *waiterEntry) beforeSave() {} // +checklocksignore func (e *waiterEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *waiterEntry) afterLoad(context.Context) {} // +checklocksignore func (e *waiterEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func init() { state.Register((*Registry)(nil)) state.Register((*Set)(nil)) state.Register((*sem)(nil)) state.Register((*waiter)(nil)) state.Register((*waiterList)(nil)) state.Register((*waiterEntry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/semaphore/waiter_list.go000066400000000000000000000120071465435605700270700ustar00rootroot00000000000000package semaphore // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type waiterElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (waiterElementMapper) linkerFor(elem *waiter) *waiter { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type waiterList struct { head *waiter tail *waiter } // Reset resets list l to the empty state. func (l *waiterList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *waiterList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *waiterList) Front() *waiter { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *waiterList) Back() *waiter { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *waiterList) Len() (count int) { for e := l.Front(); e != nil; e = (waiterElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *waiterList) PushFront(e *waiter) { linker := waiterElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { waiterElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *waiterList) PushFrontList(m *waiterList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { waiterElementMapper{}.linkerFor(l.head).SetPrev(m.tail) waiterElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. 
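// Illustrative sketch (not part of the original source): iterating over the
// intrusive list while removing entries. Because waiter embeds waiterEntry,
// no allocation is needed; the next pointer is captured before Remove clears
// the entry's links, the same pattern wakeWaiters uses in semaphore.go.
func exampleDrainWaiters(l *waiterList) {
	for w := l.Front(); w != nil; {
		next := w.Next() // Capture before Remove resets next/prev to nil.
		l.Remove(w)
		w = next
	}
}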
// //go:nosplit func (l *waiterList) PushBack(e *waiter) { linker := waiterElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { waiterElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *waiterList) PushBackList(m *waiterList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { waiterElementMapper{}.linkerFor(l.tail).SetNext(m.head) waiterElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *waiterList) InsertAfter(b, e *waiter) { bLinker := waiterElementMapper{}.linkerFor(b) eLinker := waiterElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { waiterElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *waiterList) InsertBefore(a, e *waiter) { aLinker := waiterElementMapper{}.linkerFor(a) eLinker := waiterElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { waiterElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *waiterList) Remove(e *waiter) { linker := waiterElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { waiterElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { waiterElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type waiterEntry struct { next *waiter prev *waiter } // Next returns the entry that follows e in the list. // //go:nosplit func (e *waiterEntry) Next() *waiter { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *waiterEntry) Prev() *waiter { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *waiterEntry) SetNext(elem *waiter) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *waiterEntry) SetPrev(elem *waiter) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go000066400000000000000000000035051465435605700327510ustar00rootroot00000000000000package kernel import ( "unsafe" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sync" ) // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race // with any writer critical sections in seq. // //go:nosplit func SeqAtomicLoadTaskGoroutineSchedInfo(seq *sync.SeqCount, ptr *TaskGoroutineSchedInfo) TaskGoroutineSchedInfo { for { if val, ok := SeqAtomicTryLoadTaskGoroutineSchedInfo(seq, seq.BeginRead(), ptr); ok { return val } } } // SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section // in seq initiated by a call to seq.BeginRead() that returned epoch. If the // read would race with a writer critical section, SeqAtomicTryLoad returns // (unspecified, false). 
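// Illustrative sketch (not part of the original source): pairing the
// generated writer and reader helpers around a SeqCount-protected
// TaskGoroutineSchedInfo. The store forces racing readers to retry; the load
// retries until it observes a consistent snapshot.
func exampleSchedInfoRoundTrip(seq *sync.SeqCount, ptr *TaskGoroutineSchedInfo, next TaskGoroutineSchedInfo) TaskGoroutineSchedInfo {
	SeqAtomicStoreTaskGoroutineSchedInfo(seq, ptr, next) // Writer critical section.
	return SeqAtomicLoadTaskGoroutineSchedInfo(seq, ptr) // Lock-free reader.
}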
// //go:nosplit func SeqAtomicTryLoadTaskGoroutineSchedInfo(seq *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *TaskGoroutineSchedInfo) (val TaskGoroutineSchedInfo, ok bool) { if sync.RaceEnabled { gohacks.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) } else { val = *ptr } ok = seq.ReadOk(epoch) return } // SeqAtomicStore sets *ptr to a copy of val, ensuring that any racing reader // critical sections are forced to retry. // //go:nosplit func SeqAtomicStoreTaskGoroutineSchedInfo(seq *sync.SeqCount, ptr *TaskGoroutineSchedInfo, val TaskGoroutineSchedInfo) { seq.BeginWrite() SeqAtomicStoreSeqedTaskGoroutineSchedInfo(ptr, val) seq.EndWrite() } // SeqAtomicStoreSeqed sets *ptr to a copy of val. // // Preconditions: ptr is protected by a SeqCount that will be in a writer // critical section throughout the call to SeqAtomicStore. // //go:nosplit func SeqAtomicStoreSeqedTaskGoroutineSchedInfo(ptr *TaskGoroutineSchedInfo, val TaskGoroutineSchedInfo) { if sync.RaceEnabled { gohacks.Memmove(unsafe.Pointer(ptr), unsafe.Pointer(&val), unsafe.Sizeof(val)) } else { *ptr = val } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/session_list.go000066400000000000000000000120751465435605700253020ustar00rootroot00000000000000package kernel // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type sessionElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (sessionElementMapper) linkerFor(elem *Session) *Session { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type sessionList struct { head *Session tail *Session } // Reset resets list l to the empty state. func (l *sessionList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *sessionList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *sessionList) Front() *Session { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *sessionList) Back() *Session { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *sessionList) Len() (count int) { for e := l.Front(); e != nil; e = (sessionElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *sessionList) PushFront(e *Session) { linker := sessionElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { sessionElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. 
// //go:nosplit func (l *sessionList) PushFrontList(m *sessionList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { sessionElementMapper{}.linkerFor(l.head).SetPrev(m.tail) sessionElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *sessionList) PushBack(e *Session) { linker := sessionElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { sessionElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *sessionList) PushBackList(m *sessionList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { sessionElementMapper{}.linkerFor(l.tail).SetNext(m.head) sessionElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *sessionList) InsertAfter(b, e *Session) { bLinker := sessionElementMapper{}.linkerFor(b) eLinker := sessionElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { sessionElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *sessionList) InsertBefore(a, e *Session) { aLinker := sessionElementMapper{}.linkerFor(a) eLinker := sessionElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { sessionElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *sessionList) Remove(e *Session) { linker := sessionElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { sessionElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { sessionElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type sessionEntry struct { next *Session prev *Session } // Next returns the entry that follows e in the list. // //go:nosplit func (e *sessionEntry) Next() *Session { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *sessionEntry) Prev() *Session { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *sessionEntry) SetNext(elem *Session) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *sessionEntry) SetPrev(elem *Session) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/session_refs.go000066400000000000000000000101331465435605700252570ustar00rootroot00000000000000package kernel import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. 
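// Illustrative sketch (not part of the original source): the lifecycle of an
// object using the reference-counting template defined below. destroy is the
// caller-supplied cleanup and, in this flow, runs exactly once, when the last
// reference is dropped.
func exampleRefsLifecycle(r *SessionRefs, destroy func()) {
	r.InitRefs()       // The count starts at one.
	if r.TryIncRef() { // Never blocks; fails only if the count already reached zero.
		r.DecRef(destroy) // Balance the successful TryIncRef.
	}
	r.DecRef(destroy) // Drop the initial reference; destroy runs here.
}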
const SessionenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var Sessionobj *Session // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type SessionRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *SessionRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *SessionRefs) RefType() string { return fmt.Sprintf("%T", Sessionobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *SessionRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *SessionRefs) LogRefs() bool { return SessionenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *SessionRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *SessionRefs) IncRef() { v := r.refCount.Add(1) if SessionenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *SessionRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if SessionenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. 
In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *SessionRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if SessionenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *SessionRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/sessions.go000066400000000000000000000363531465435605700244370ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // SessionID is the public identifier. type SessionID ThreadID // ProcessGroupID is the public identifier. type ProcessGroupID ThreadID // Session contains a leader threadgroup and a list of ProcessGroups. // // +stateify savable type Session struct { SessionRefs // leader is the originator of the Session. // // Note that this may no longer be running (and may be reaped), so the // ID is cached upon initial creation. The leader is still required // however, since its PIDNamespace defines the scope of the Session. // // The leader is immutable. leader *ThreadGroup // id is the cached identifier in the leader's namespace. // // The id is immutable. id SessionID // foreground is the foreground process group. // // This is protected by TaskSet.mu. foreground *ProcessGroup // ProcessGroups is a list of process groups in this Session. This is // protected by TaskSet.mu. processGroups processGroupList // sessionEntry is the embed for TaskSet.sessions. This is protected by // TaskSet.mu. sessionEntry } // DecRef drops a reference. // // Precondition: callers must hold TaskSet.mu for writing. func (s *Session) DecRef() { s.SessionRefs.DecRef(func() { // Remove translations from the leader. for ns := s.leader.pidns; ns != nil; ns = ns.parent { id := ns.sids[s] delete(ns.sids, s) delete(ns.sessions, id) } // Remove from the list of global Sessions. s.leader.pidns.owner.sessions.Remove(s) }) } // ProcessGroup contains an originator threadgroup and a parent Session. // // +stateify savable type ProcessGroup struct { refs ProcessGroupRefs // originator is the originator of the group. // // See note re: leader in Session. The same applies here. // // The originator is immutable. originator *ThreadGroup // id is the cached identifier in the originator's namespace. // // The id is immutable. id ProcessGroupID // Session is the parent Session. // // The session is immutable. session *Session // ancestors is the number of thread groups in this process group whose // parent is in a different process group in the same session. 
// // The name is derived from the fact that process groups where // ancestors is zero are considered "orphans". // // ancestors is protected by TaskSet.mu. ancestors uint32 // processGroupEntry is the embedded entry for Sessions.groups. This is // protected by TaskSet.mu. processGroupEntry } // Originator returns the originator of the process group. func (pg *ProcessGroup) Originator() *ThreadGroup { return pg.originator } // IsOrphan returns true if this process group is an orphan. func (pg *ProcessGroup) IsOrphan() bool { ts := pg.originator.TaskSet() ts.mu.RLock() defer ts.mu.RUnlock() return pg.ancestors == 0 } // incRefWithParent grabs a reference. // // This function is called when this ProcessGroup is being associated with some // new ThreadGroup, tg. parentPG is the ProcessGroup of tg's parent // ThreadGroup. If tg is init, then parentPG may be nil. // // Precondition: callers must hold TaskSet.mu for writing. func (pg *ProcessGroup) incRefWithParent(parentPG *ProcessGroup) { // We acquire an "ancestor" reference in the case of a nil parent. // This is because the process being associated is init, and init can // never be orphaned (we count it as always having an ancestor). if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) { pg.ancestors++ } pg.refs.IncRef() } // decRefWithParent drops a reference. // // parentPG is per incRefWithParent. // // Precondition: callers must hold TaskSet.mu for writing. func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { // See incRefWithParent regarding parent == nil. if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) { pg.ancestors-- } alive := true pg.refs.DecRef(func() { alive = false // don't bother with handleOrphan. // Remove translations from the originator. for ns := pg.originator.pidns; ns != nil; ns = ns.parent { id := ns.pgids[pg] delete(ns.pgids, pg) delete(ns.processGroups, id) } // Remove the list of process groups. pg.session.processGroups.Remove(pg) pg.session.DecRef() }) if alive { pg.handleOrphan() } } // parentPG returns the parent process group. // // Precondition: callers must hold TaskSet.mu. func (tg *ThreadGroup) parentPG() *ProcessGroup { if tg.leader.parent != nil { return tg.leader.parent.tg.processGroup } return nil } // handleOrphan checks whether the process group is an orphan and has any // stopped jobs. If yes, then appropriate signals are delivered to each thread // group within the process group. // // Precondition: callers must hold TaskSet.mu for writing. func (pg *ProcessGroup) handleOrphan() { // Check if this process is an orphan. if pg.ancestors != 0 { return } // See if there are any stopped jobs. hasStopped := false pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) { if tg.processGroup != pg { return } tg.signalHandlers.mu.NestedLock(signalHandlersLockTg) if tg.groupStopComplete { hasStopped = true } tg.signalHandlers.mu.NestedUnlock(signalHandlersLockTg) }) if !hasStopped { return } // Deliver appropriate signals to all thread groups. pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) { if tg.processGroup != pg { return } tg.signalHandlers.mu.NestedLock(signalHandlersLockTg) tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGHUP), true /* group */) tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGCONT), true /* group */) tg.signalHandlers.mu.NestedUnlock(signalHandlersLockTg) }) return } // Session returns the process group's session without taking a reference. 
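// Illustrative sketch (not part of the original source): driving the exported
// ProcessGroup helpers nearby. It loosely mirrors what handleOrphan above
// does for an orphaned group with stopped jobs, but uses the public IsOrphan
// and SendSignal (defined just below), which take the needed locks themselves.
func exampleHangupOrphan(pg *ProcessGroup) error {
	if !pg.IsOrphan() {
		return nil
	}
	if err := pg.SendSignal(SignalInfoPriv(linux.SIGHUP)); err != nil {
		return err
	}
	return pg.SendSignal(SignalInfoPriv(linux.SIGCONT))
}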
func (pg *ProcessGroup) Session() *Session { return pg.session } // SendSignal sends a signal to all processes inside the process group. It is // analogous to kernel/signal.c:kill_pgrp. func (pg *ProcessGroup) SendSignal(info *linux.SignalInfo) error { tasks := pg.originator.TaskSet() tasks.mu.RLock() defer tasks.mu.RUnlock() var lastErr error for tg := range tasks.Root.tgids { if tg.processGroup == pg { tg.signalHandlers.mu.Lock() infoCopy := *info if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { lastErr = err } tg.signalHandlers.mu.Unlock() } } return lastErr } // CreateSession creates a new Session, with the ThreadGroup as the leader. // // EPERM may be returned if either the given ThreadGroup is already a Session // leader, or a ProcessGroup already exists for the ThreadGroup's ID. func (tg *ThreadGroup) CreateSession() (SessionID, error) { tg.pidns.owner.mu.Lock() defer tg.pidns.owner.mu.Unlock() tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() return tg.createSession() } // createSession creates a new session for a threadgroup. // // Precondition: callers must hold TaskSet.mu and the signal mutex for writing. func (tg *ThreadGroup) createSession() (SessionID, error) { // Get the ID for this thread in the current namespace. id := tg.pidns.tgids[tg] // Check if this ThreadGroup already leads a Session, or // if the proposed group is already taken. for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { if s.leader.pidns != tg.pidns { continue } if s.leader == tg { return -1, linuxerr.EPERM } if s.id == SessionID(id) { return -1, linuxerr.EPERM } for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { if pg.id == ProcessGroupID(id) { return -1, linuxerr.EPERM } } } // Create a new Session, with a single reference. sid := SessionID(id) s := &Session{ id: sid, leader: tg, } s.InitRefs() // Create a new ProcessGroup, belonging to that Session. // This also has a single reference (assigned below). // // Note that since this is a new session and a new process group, there // will be zero ancestors for this process group. (It is an orphan at // this point.) pg := &ProcessGroup{ id: ProcessGroupID(id), originator: tg, session: s, ancestors: 0, } pg.refs.InitRefs() // Tie them and return the result. s.processGroups.PushBack(pg) tg.pidns.owner.sessions.PushBack(s) // Leave the current group, and assign the new one. if tg.processGroup != nil { oldParentPG := tg.parentPG() tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { childTG.processGroup.incRefWithParent(pg) childTG.processGroup.decRefWithParent(oldParentPG) }) // If tg.processGroup is an orphan, decRefWithParent will lock // the signal mutex of each thread group in tg.processGroup. // However, tg's signal mutex may already be locked at this // point. We change tg's process group before calling // decRefWithParent to avoid locking tg's signal mutex twice. oldPG := tg.processGroup tg.processGroup = pg oldPG.decRefWithParent(oldParentPG) } else { // The current process group may be nil only in the case of an // unparented thread group (i.e. the init process). This would // not normally occur, but we allow it for the convenience of // CreateSession working from that point. There will be no // child processes. We always say that the very first group // created has ancestors (avoids checks elsewhere). // // Note that this mirrors the parent == nil logic in // incRef/decRef/reparent, which counts nil as an ancestor. 
tg.processGroup = pg tg.processGroup.ancestors++ } // Ensure a translation is added to all namespaces. for ns := tg.pidns; ns != nil; ns = ns.parent { local := ns.tgids[tg] ns.sids[s] = SessionID(local) ns.sessions[SessionID(local)] = s ns.pgids[pg] = ProcessGroupID(local) ns.processGroups[ProcessGroupID(local)] = pg } // Disconnect from the controlling terminal. tg.tty = nil return sid, nil } // CreateProcessGroup creates a new process group. // // An EPERM error will be returned if the ThreadGroup belongs to a different // Session, is a Session leader or the group already exists. func (tg *ThreadGroup) CreateProcessGroup() error { tg.pidns.owner.mu.Lock() defer tg.pidns.owner.mu.Unlock() // Get the ID for this thread in the current namespace. id := tg.pidns.tgids[tg] // Check whether a process still exists or not. if id == 0 { return linuxerr.ESRCH } // Per above, check for a Session leader or existing group. for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() { if s.leader.pidns != tg.pidns { continue } if s.leader == tg { return linuxerr.EPERM } for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() { if pg.id == ProcessGroupID(id) { return linuxerr.EPERM } } } // Create a new ProcessGroup, belonging to the current Session. // // We manually adjust the ancestors if the parent is in the same // session. tg.processGroup.session.IncRef() pg := ProcessGroup{ id: ProcessGroupID(id), originator: tg, session: tg.processGroup.session, } pg.refs.InitRefs() if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { pg.ancestors++ } // Assign the new process group; adjust children. oldParentPG := tg.parentPG() tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { childTG.processGroup.incRefWithParent(&pg) childTG.processGroup.decRefWithParent(oldParentPG) }) tg.processGroup.decRefWithParent(oldParentPG) tg.processGroup = &pg // Add the new process group to the session. pg.session.processGroups.PushBack(&pg) // Ensure this translation is added to all namespaces. for ns := tg.pidns; ns != nil; ns = ns.parent { local := ns.tgids[tg] ns.pgids[&pg] = ProcessGroupID(local) ns.processGroups[ProcessGroupID(local)] = &pg } return nil } // JoinProcessGroup joins an existing process group. // // This function will return EACCES if an exec has been performed since fork // by the given ThreadGroup, and EPERM if the Sessions are not the same or the // group does not exist. // // If checkExec is set, then the join is not permitted after the process has // executed exec at least once. func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID, checkExec bool) error { pidns.owner.mu.Lock() defer pidns.owner.mu.Unlock() // Check whether the process still exists or not. if _, ok := pidns.tgids[tg]; !ok { return linuxerr.ESRCH } // Lookup the ProcessGroup. pg := pidns.processGroups[pgid] if pg == nil { return linuxerr.EPERM } // Disallow the join if an execve has performed, per POSIX. if checkExec && tg.execed { return linuxerr.EACCES } // See if it's in the same session as ours. if pg.session != tg.processGroup.session { return linuxerr.EPERM } // Join the group; adjust children. parentPG := tg.parentPG() pg.incRefWithParent(parentPG) tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) { childTG.processGroup.incRefWithParent(pg) childTG.processGroup.decRefWithParent(tg.processGroup) }) tg.processGroup.decRefWithParent(parentPG) tg.processGroup = pg return nil } // Session returns the ThreadGroup's Session. 
// // A reference is not taken on the session. func (tg *ThreadGroup) Session() *Session { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() return tg.processGroup.session } // IDOfSession returns the Session assigned to s in PID namespace ns. // // If this group isn't visible in this namespace, zero will be returned. It is // the callers responsibility to check that before using this function. func (ns *PIDNamespace) IDOfSession(s *Session) SessionID { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() return ns.sids[s] } // SessionWithID returns the Session with the given ID in the PID namespace ns, // or nil if that given ID is not defined in this namespace. // // A reference is not taken on the session. func (ns *PIDNamespace) SessionWithID(id SessionID) *Session { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() return ns.sessions[id] } // ProcessGroup returns the ThreadGroup's ProcessGroup. // // A reference is not taken on the process group. func (tg *ThreadGroup) ProcessGroup() *ProcessGroup { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() return tg.processGroup } // IDOfProcessGroup returns the process group assigned to pg in PID namespace ns. // // The same constraints apply as IDOfSession. func (ns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() return ns.pgids[pg] } // ProcessGroupWithID returns the ProcessGroup with the given ID in the PID // namespace ns, or nil if that given ID is not defined in this namespace. // // A reference is not taken on the process group. func (ns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() return ns.processGroups[id] } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/shm/000077500000000000000000000000001465435605700230175ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/shm/context.go000066400000000000000000000017571465435605700250440ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package shm import ( "gvisor.dev/gvisor/pkg/context" ) type contextID int const ( // CtxDeviceID is a Context.Value key for kernel.Kernel.sysVShmDevID, which // this package cannot refer to due to dependency cycles. CtxDeviceID contextID = iota ) func deviceIDFromContext(ctx context.Context) (uint32, bool) { v := ctx.Value(CtxDeviceID) if v == nil { return 0, false } return v.(uint32), true } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/shm/shm.go000066400000000000000000000476751465435605700241600ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package shm implements sysv shared memory segments. // // Known missing features: // // - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement // memory locking in general. // // - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy // way to implement hugetlb support on a per-map basis, and it has no impact // on correctness. // // - SHM_NORESERVE for shmget(2) is ignored, the sentry doesn't implement swap // so it's meaningless to reserve space for swap. // // - No per-process segment size enforcement. This feature probably isn't used // much anyways, since Linux sets the per-process limits to the system-wide // limits by default. // // Lock ordering: mm.mappingMu -> shm registry lock -> shm lock package shm import ( goContext "context" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // Registry tracks all shared memory segments in an IPC namespace. The registry // provides the mechanisms for creating and finding segments, and reporting // global shm parameters. // // +stateify savable type Registry struct { // userNS owns the IPC namespace this registry belong to. Immutable. userNS *auth.UserNamespace // mu protects all fields below. mu sync.Mutex `state:"nosave"` // reg defines basic fields and operations needed for all SysV registries. // // Within reg, there are two maps, Objects and KeysToIDs. // // reg.objects holds all referenced segments, which are removed on the last // DecRef. Thus, it cannot itself hold a reference on the Shm. // // Since removal only occurs after the last (unlocked) DecRef, there // exists a short window during which a Shm still exists in Shm, but is // unreferenced. Users must use TryIncRef to determine if the Shm is // still valid. // // keysToIDs maps segment keys to IDs. // // Shms in keysToIDs are guaranteed to be referenced, as they are // removed by disassociateKey before the last DecRef. reg *ipc.Registry // Sum of the sizes of all existing segments rounded up to page size, in // units of page size. totalPages uint64 } // NewRegistry creates a new shm registry. func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ userNS: userNS, reg: ipc.NewRegistry(userNS), } } // FindByID looks up a segment given an ID. // // FindByID returns a reference on Shm. func (r *Registry) FindByID(id ipc.ID) *Shm { r.mu.Lock() defer r.mu.Unlock() mech := r.reg.FindByID(id) if mech == nil { return nil } s := mech.(*Shm) // Take a reference on s. If TryIncRef fails, s has reached the last // DecRef, but hasn't quite been removed from r.reg.objects yet. 
if s != nil && s.TryIncRef() { return s } return nil } // dissociateKey removes the association between a segment and its key, // preventing it from being discovered in the registry. This doesn't necessarily // mean the segment is about to be destroyed. This is analogous to unlinking a // file; the segment can still be used by a process already referencing it, but // cannot be discovered by a new process. func (r *Registry) dissociateKey(s *Shm) { r.mu.Lock() defer r.mu.Unlock() s.mu.Lock() defer s.mu.Unlock() if s.obj.Key != linux.IPC_PRIVATE { r.reg.DissociateKey(s.obj.Key) s.obj.Key = linux.IPC_PRIVATE } } // FindOrCreate looks up or creates a segment in the registry. It's functionally // analogous to open(2). // // FindOrCreate returns a reference on Shm. func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key ipc.Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { // "A new segment was to be created and size is less than SHMMIN or // greater than SHMMAX." - man shmget(2) // // Note that 'private' always implies the creation of a new segment // whether IPC_CREAT is specified or not. return nil, linuxerr.EINVAL } r.mu.Lock() defer r.mu.Unlock() if r.reg.ObjectCount() >= linux.SHMMNI { // "All possible shared memory IDs have been taken (SHMMNI) ..." // - man shmget(2) return nil, linuxerr.ENOSPC } if !private { shm, err := r.reg.Find(ctx, key, mode, create, exclusive) if err != nil { return nil, err } // Validate shm-specific parameters. if shm != nil { shm := shm.(*Shm) if size > shm.size { // "A segment for the given key exists, but size is greater than // the size of that segment." - man shmget(2) return nil, linuxerr.EINVAL } shm.IncRef() return shm, nil } } var sizeAligned uint64 if val, ok := hostarch.Addr(size).RoundUp(); ok { sizeAligned = uint64(val) } else { return nil, linuxerr.EINVAL } if numPages := sizeAligned / hostarch.PageSize; r.totalPages+numPages > linux.SHMALL { // "... allocating a segment of the requested size would cause the // system to exceed the system-wide limit on shared memory (SHMALL)." // - man shmget(2) return nil, linuxerr.ENOSPC } // Need to create a new segment. s, err := r.newShmLocked(ctx, pid, key, auth.CredentialsFromContext(ctx), mode, size) if err != nil { return nil, err } // The initial reference is held by s itself. Take another to return to // the caller. s.IncRef() return s, nil } // newShmLocked creates a new segment in the registry. // // Precondition: Caller must hold r.mu. 
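// Illustrative usage sketch (not part of the original source): roughly how a
// shmget(2)-style handler would call Registry.FindOrCreate above. The pid,
// key and size are assumed to come from the calling task and the syscall
// arguments; the 0600 mode is arbitrary.
func exampleShmget(ctx context.Context, r *Registry, pid int32, key ipc.Key, size uint64) (ipc.ID, error) {
	seg, err := r.FindOrCreate(ctx, pid, key, size, 0600, false /* private */, true /* create */, false /* exclusive */)
	if err != nil {
		return 0, err
	}
	defer seg.DecRef(ctx) // FindOrCreate returns a reference; the registry keeps its own.
	return seg.ID(), nil
}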
func (r *Registry) newShmLocked(ctx context.Context, pid int32, key ipc.Key, creator *auth.Credentials, mode linux.FileMode, size uint64) (*Shm, error) { mf := pgalloc.MemoryFileFromContext(ctx) if mf == nil { panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFile)) } devID, ok := deviceIDFromContext(ctx) if !ok { panic(fmt.Sprintf("context.Context %T lacks value for key %T", ctx, CtxDeviceID)) } effectiveSize := uint64(hostarch.Addr(size).MustRoundUp()) fr, err := mf.Allocate(effectiveSize, pgalloc.AllocOpts{Kind: usage.Anonymous, MemCgID: pgalloc.MemoryCgroupIDFromContext(ctx)}) if err != nil { return nil, err } shm := &Shm{ mf: mf, registry: r, devID: devID, size: size, effectiveSize: effectiveSize, obj: ipc.NewObject(r.reg.UserNS, ipc.Key(key), creator, creator, mode), fr: fr, creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } shm.InitRefs() if err := r.reg.Register(shm); err != nil { return nil, err } r.totalPages += effectiveSize / hostarch.PageSize return shm, nil } // IPCInfo reports global parameters for sysv shared memory segments on this // system. See shmctl(IPC_INFO). func (r *Registry) IPCInfo() *linux.ShmParams { return &linux.ShmParams{ ShmMax: linux.SHMMAX, ShmMin: linux.SHMMIN, ShmMni: linux.SHMMNI, ShmSeg: linux.SHMSEG, ShmAll: linux.SHMALL, } } // ShmInfo reports linux-specific global parameters for sysv shared memory // segments on this system. See shmctl(SHM_INFO). func (r *Registry) ShmInfo() *linux.ShmInfo { r.mu.Lock() defer r.mu.Unlock() return &linux.ShmInfo{ UsedIDs: int32(r.reg.LastIDUsed()), ShmTot: r.totalPages, ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting. ShmSwp: 0, // No reclaim at the moment. } } // remove deletes a segment from this registry, deaccounting the memory used by // the segment. // // Precondition: Must follow a call to r.dissociateKey(s). func (r *Registry) remove(s *Shm) { r.mu.Lock() defer r.mu.Unlock() s.mu.Lock() defer s.mu.Unlock() if s.obj.Key != linux.IPC_PRIVATE { panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked())) } r.reg.DissociateID(s.obj.ID) r.totalPages -= s.effectiveSize / hostarch.PageSize } // Release drops the self-reference of each active shm segment in the registry. // It is called when the kernel.IPCNamespace containing r is being destroyed. func (r *Registry) Release(ctx context.Context) { // Because Shm.DecRef() may acquire the same locks, collect the segments to // release first. Note that this should not race with any updates to r, since // the IPC namespace containing it has no more references. toRelease := make([]*Shm, 0) r.mu.Lock() r.reg.ForAllObjects( func(o ipc.Mechanism) { s := o.(*Shm) s.mu.Lock() if !s.pendingDestruction { toRelease = append(toRelease, s) } s.mu.Unlock() }, ) r.mu.Unlock() for _, s := range toRelease { r.dissociateKey(s) s.DecRef(ctx) } } // Shm represents a single shared memory segment. // // Shm segments are backed directly by an allocation from platform memory. // Segments are always mapped as a whole, greatly simplifying how mappings are // tracked. However note that mremap and munmap calls may cause the vma for a // segment to become fragmented; which requires special care when unmapping a // segment. See mm/shm.go. // // Segments persist until they are explicitly marked for destruction via // MarkDestroyed(). // // Shm implements memmap.Mappable and memmap.MappingIdentity. 
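// exampleAttach is an illustrative sketch (a hypothetical helper, not part of
// the syscall path) of how a shmat(2)-style caller consumes a segment: it asks
// ConfigureAttach for MMapOpts describing a read-write, non-remapping attach,
// which the caller would then hand to the MemoryManager to create the mapping.
func exampleAttach(ctx context.Context, s *Shm, addr hostarch.Addr) (memmap.MMapOpts, error) {
	return s.ConfigureAttach(ctx, addr, AttachOpts{
		Execute:  false, // no PROT_EXEC
		Readonly: false, // read-write attach, i.e. no SHM_RDONLY
		Remap:    false, // no SHM_REMAP, so the mapping address is not fixed
	})
}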
// // +stateify savable type Shm struct { // ShmRefs tracks the number of references to this segment. // // A segment holds a reference to itself until it is marked for // destruction. // // In addition to direct users, the MemoryManager will hold references // via MappingIdentity. ShmRefs mf *pgalloc.MemoryFile `state:"nosave"` // registry points to the shm registry containing this segment. Immutable. registry *Registry // devID is the segment's device ID. Immutable. devID uint32 // size is the requested size of the segment at creation, in // bytes. Immutable. size uint64 // effectiveSize of the segment, rounding up to the next page // boundary. Immutable. // // Invariant: effectiveSize must be a multiple of hostarch.PageSize. effectiveSize uint64 // fr is the offset into mfp.MemoryFile() that backs this contents of this // segment. Immutable. fr memmap.FileRange // mu protects all fields below. mu sync.Mutex `state:"nosave"` obj *ipc.Object // attachTime is updated on every successful shmat. attachTime ktime.Time // detachTime is updated on every successful shmdt. detachTime ktime.Time // changeTime is updated on every successful changes to the segment via // shmctl(IPC_SET). changeTime ktime.Time // creatorPID is the PID of the process that created the segment. creatorPID int32 // lastAttachDetachPID is the pid of the process that issued the last shmat // or shmdt syscall. lastAttachDetachPID int32 // pendingDestruction indicates the segment was marked as destroyed through // shmctl(IPC_RMID). When marked as destroyed, the segment will not be found // in the registry and can no longer be attached. When the last user // detaches from the segment, it is destroyed. pendingDestruction bool } // afterLoad is invoked by stateify. func (s *Shm) afterLoad(ctx goContext.Context) { s.mf = pgalloc.MemoryFileFromContext(ctx) } // ID returns object's ID. func (s *Shm) ID() ipc.ID { return s.obj.ID } // Object implements ipc.Mechanism.Object. func (s *Shm) Object() *ipc.Object { return s.obj } // Destroy implements ipc.Mechanism.Destroy. No work is performed on shm.Destroy // because a different removal mechanism is used in shm. See Shm.MarkDestroyed. func (s *Shm) Destroy() { } // Lock implements ipc.Mechanism.Lock. func (s *Shm) Lock() { s.mu.Lock() } // Unlock implements ipc.mechanism.Unlock. // // +checklocksignore func (s *Shm) Unlock() { s.mu.Unlock() } // Precondition: Caller must hold s.mu. func (s *Shm) debugLocked() string { return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}", s.obj.ID, s.obj.Key, s.size, s.ReadRefs(), s.pendingDestruction) } // MappedName implements memmap.MappingIdentity.MappedName. func (s *Shm) MappedName(ctx context.Context) string { s.mu.Lock() defer s.mu.Unlock() return fmt.Sprintf("SYSV%08d", s.obj.Key) } // DeviceID implements memmap.MappingIdentity.DeviceID. func (s *Shm) DeviceID() uint64 { return uint64(s.devID) } // InodeID implements memmap.MappingIdentity.InodeID. func (s *Shm) InodeID() uint64 { // "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use // this. Changing this will break them." -- Linux, ipc/shm.c:newseg() return uint64(s.obj.ID) } // DecRef drops a reference on s. // // Precondition: Caller must not hold s.mu. func (s *Shm) DecRef(ctx context.Context) { s.ShmRefs.DecRef(func() { s.mf.DecRef(s.fr) s.registry.remove(s) }) } // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm // segments. 
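// exampleMapsName is an illustrative sketch (a hypothetical helper) of how the
// MappingIdentity methods above surface to applications: a segment created
// with key 5 appears in /proc/<pid>/maps under the name "SYSV00000005", and
// the inode column reports the shmid, matching Linux's ipc/shm.c:newseg().
func exampleMapsName(key ipc.Key, id ipc.ID) string {
	// Same formatting as MappedName and InodeID above.
	return fmt.Sprintf("SYSV%08d (inode %d)", key, id)
}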
func (s *Shm) Msync(context.Context, memmap.MappableRange) error { return nil } // AddMapping implements memmap.Mappable.AddMapping. func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) error { s.mu.Lock() defer s.mu.Unlock() s.attachTime = ktime.NowFromContext(ctx) if pid, ok := auth.ThreadGroupIDFromContext(ctx); ok { s.lastAttachDetachPID = pid } else { // AddMapping is called during a syscall, so ctx should always be a task // context. log.Warningf("Adding mapping to %s but couldn't get the current pid; not updating the last attach pid", s.debugLocked()) } return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) { s.mu.Lock() defer s.mu.Unlock() // RemoveMapping may be called during task exit, when ctx // is context.Background. Gracefully handle missing clocks. Failing to // update the detach time in these cases is ok, since no one can observe the // omission. if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { s.detachTime = clock.Now() } // If called from a non-task context we also won't have a threadgroup // id. Silently skip updating the lastAttachDetachPid in that case. if pid, ok := auth.ThreadGroupIDFromContext(ctx); ok { s.lastAttachDetachPID = pid } else { log.Debugf("Couldn't obtain pid when removing mapping to %s, not updating the last detach pid.", s.debugLocked()) } } // CopyMapping implements memmap.Mappable.CopyMapping. func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, hostarch.AddrRange, uint64, bool) error { return nil } // Translate implements memmap.Mappable.Translate. func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > s.fr.Length() { err = &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ { Source: source, File: s.mf, Offset: s.fr.Start + source.Start, Perms: hostarch.AnyAccess, }, }, err } return nil, err } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (s *Shm) InvalidateUnsavable(ctx context.Context) error { return nil } // AttachOpts describes various flags passed to shmat(2). type AttachOpts struct { Execute bool Readonly bool Remap bool } // ConfigureAttach creates an mmap configuration for the segment with the // requested attach options. // // Postconditions: The returned MMapOpts are valid only as long as a reference // continues to be held on s. func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts AttachOpts) (memmap.MMapOpts, error) { s.mu.Lock() defer s.mu.Unlock() if s.pendingDestruction && s.ReadRefs() == 0 { return memmap.MMapOpts{}, linuxerr.EIDRM } creds := auth.CredentialsFromContext(ctx) ats := vfs.MayRead if !opts.Readonly { ats |= vfs.MayWrite } if opts.Execute { ats |= vfs.MayExec } if !s.obj.CheckPermissions(creds, ats) { // "The calling process does not have the required permissions for the // requested attach type, and does not have the CAP_IPC_OWNER capability // in the user namespace that governs its IPC namespace." 
- man shmat(2) return memmap.MMapOpts{}, linuxerr.EACCES } return memmap.MMapOpts{ Length: s.size, Offset: 0, Addr: addr, Fixed: opts.Remap, Perms: hostarch.AccessType{ Read: true, Write: !opts.Readonly, Execute: opts.Execute, }, MaxPerms: hostarch.AnyAccess, Mappable: s, MappingIdentity: s, }, nil } // EffectiveSize returns the size of the underlying shared memory segment. This // may be larger than the requested size at creation, due to rounding to page // boundaries. func (s *Shm) EffectiveSize() uint64 { return s.effectiveSize } // IPCStat returns information about a shm. See shmctl(IPC_STAT). func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { s.mu.Lock() defer s.mu.Unlock() // "The caller must have read permission on the shared memory segment." // - man shmctl(2) creds := auth.CredentialsFromContext(ctx) if !s.obj.CheckPermissions(creds, vfs.MayRead) { // "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow // read access for shmid, and the calling process does not have the // CAP_IPC_OWNER capability in the user namespace that governs its IPC // namespace." - man shmctl(2) return nil, linuxerr.EACCES } var mode uint16 if s.pendingDestruction { mode |= linux.SHM_DEST } // Use the reference count as a rudimentary count of the number of // attaches. We exclude: // // 1. The reference the caller holds. // 2. The self-reference held by s prior to destruction. // // Note that this may still overcount by including transient references // used in concurrent calls. nattach := uint64(s.ReadRefs()) - 1 if !s.pendingDestruction { nattach-- } ds := &linux.ShmidDS{ ShmPerm: linux.IPCPerm{ Key: uint32(s.obj.Key), UID: uint32(creds.UserNamespace.MapFromKUID(s.obj.OwnerUID)), GID: uint32(creds.UserNamespace.MapFromKGID(s.obj.OwnerGID)), CUID: uint32(creds.UserNamespace.MapFromKUID(s.obj.CreatorUID)), CGID: uint32(creds.UserNamespace.MapFromKGID(s.obj.CreatorGID)), Mode: mode | uint16(s.obj.Mode), Seq: 0, // IPC sequences not supported. }, ShmSegsz: s.size, ShmAtime: s.attachTime.TimeT(), ShmDtime: s.detachTime.TimeT(), ShmCtime: s.changeTime.TimeT(), ShmCpid: s.creatorPID, ShmLpid: s.lastAttachDetachPID, ShmNattach: nattach, } return ds, nil } // Set modifies attributes for a segment. See shmctl(IPC_SET). func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { s.mu.Lock() defer s.mu.Unlock() if err := s.obj.Set(ctx, &ds.ShmPerm); err != nil { return err } s.changeTime = ktime.NowFromContext(ctx) return nil } // MarkDestroyed marks a segment for destruction. The segment is actually // destroyed once it has no references. MarkDestroyed may be called multiple // times, and is safe to call after a segment has already been destroyed. See // shmctl(IPC_RMID). func (s *Shm) MarkDestroyed(ctx context.Context) { s.registry.dissociateKey(s) s.mu.Lock() if s.pendingDestruction { s.mu.Unlock() return } s.pendingDestruction = true s.mu.Unlock() // Drop the self-reference so destruction occurs when all // external references are gone. // // N.B. This cannot be the final DecRef, as the caller also // holds a reference. s.DecRef(ctx) return } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/shm/shm_refs.go000066400000000000000000000100171465435605700251530ustar00rootroot00000000000000package shm import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). 
This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const ShmenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var Shmobj *Shm // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type ShmRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *ShmRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *ShmRefs) RefType() string { return fmt.Sprintf("%T", Shmobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *ShmRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *ShmRefs) LogRefs() bool { return ShmenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *ShmRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *ShmRefs) IncRef() { v := r.refCount.Add(1) if ShmenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *ShmRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if ShmenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. 
In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *ShmRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if ShmenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *ShmRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/shm/shm_state_autogen.go000066400000000000000000000061141465435605700270610ustar00rootroot00000000000000// automatically generated by stateify. package shm import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *Registry) StateTypeName() string { return "pkg/sentry/kernel/shm.Registry" } func (r *Registry) StateFields() []string { return []string{ "userNS", "reg", "totalPages", } } func (r *Registry) beforeSave() {} // +checklocksignore func (r *Registry) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.userNS) stateSinkObject.Save(1, &r.reg) stateSinkObject.Save(2, &r.totalPages) } func (r *Registry) afterLoad(context.Context) {} // +checklocksignore func (r *Registry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.userNS) stateSourceObject.Load(1, &r.reg) stateSourceObject.Load(2, &r.totalPages) } func (s *Shm) StateTypeName() string { return "pkg/sentry/kernel/shm.Shm" } func (s *Shm) StateFields() []string { return []string{ "ShmRefs", "registry", "devID", "size", "effectiveSize", "fr", "obj", "attachTime", "detachTime", "changeTime", "creatorPID", "lastAttachDetachPID", "pendingDestruction", } } func (s *Shm) beforeSave() {} // +checklocksignore func (s *Shm) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.ShmRefs) stateSinkObject.Save(1, &s.registry) stateSinkObject.Save(2, &s.devID) stateSinkObject.Save(3, &s.size) stateSinkObject.Save(4, &s.effectiveSize) stateSinkObject.Save(5, &s.fr) stateSinkObject.Save(6, &s.obj) stateSinkObject.Save(7, &s.attachTime) stateSinkObject.Save(8, &s.detachTime) stateSinkObject.Save(9, &s.changeTime) stateSinkObject.Save(10, &s.creatorPID) stateSinkObject.Save(11, &s.lastAttachDetachPID) stateSinkObject.Save(12, &s.pendingDestruction) } // +checklocksignore func (s *Shm) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.ShmRefs) stateSourceObject.Load(1, &s.registry) stateSourceObject.Load(2, &s.devID) stateSourceObject.Load(3, &s.size) stateSourceObject.Load(4, &s.effectiveSize) stateSourceObject.Load(5, &s.fr) stateSourceObject.Load(6, &s.obj) stateSourceObject.Load(7, &s.attachTime) stateSourceObject.Load(8, &s.detachTime) stateSourceObject.Load(9, &s.changeTime) stateSourceObject.Load(10, &s.creatorPID) stateSourceObject.Load(11, &s.lastAttachDetachPID) stateSourceObject.Load(12, &s.pendingDestruction) stateSourceObject.AfterLoad(func() { s.afterLoad(ctx) }) } func (r *ShmRefs) StateTypeName() string { return "pkg/sentry/kernel/shm.ShmRefs" } func (r *ShmRefs) StateFields() []string { return []string{ "refCount", } } func (r *ShmRefs) beforeSave() {} // +checklocksignore func (r *ShmRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *ShmRefs) 
StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func init() { state.Register((*Registry)(nil)) state.Register((*Shm)(nil)) state.Register((*ShmRefs)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/signal.go000066400000000000000000000053321465435605700240370ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/platform" ) // SignalPanic is used to panic the running threads. It is a signal which // cannot be used by the application: it must be caught and ignored by the // runtime (in order to catch possible races). const SignalPanic = linux.SIGUSR2 // sendExternalSignal is called when an asynchronous signal is sent to the // sentry ("in sentry context"). On some platforms, it may also be called when // an asynchronous signal is sent to sandboxed application threads ("in // application context"). // // context is used only for debugging to differentiate these cases. // // Preconditions: Kernel must have an init process. func (k *Kernel) sendExternalSignal(info *linux.SignalInfo, context string) { switch linux.Signal(info.Signo) { case linux.SIGURG: // Sent by the Go 1.14+ runtime for asynchronous goroutine preemption. case platform.SignalInterrupt: // Assume that a call to platform.Context.Interrupt() misfired. case SignalPanic: // SignalPanic is also specially handled in sentry setup to ensure that // it causes a panic even after tasks exit, but SignalPanic may also // be sent here if it is received while in app context. panic("Signal-induced panic") default: log.Infof("Received external signal %d in %s context", info.Signo, context) if k.globalInit == nil { panic(fmt.Sprintf("Received external signal %d before init created", info.Signo)) } k.globalInit.SendSignal(info) } } // SignalInfoPriv returns a SignalInfo equivalent to Linux's SEND_SIG_PRIV. func SignalInfoPriv(sig linux.Signal) *linux.SignalInfo { return &linux.SignalInfo{ Signo: int32(sig), Code: linux.SI_KERNEL, } } // SignalInfoNoInfo returns a SignalInfo equivalent to Linux's SEND_SIG_NOINFO. func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *linux.SignalInfo { info := &linux.SignalInfo{ Signo: int32(sig), Code: linux.SI_USER, } info.SetPID(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg))) info.SetUID(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) return info } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/signal_handlers.go000066400000000000000000000046721465435605700257250ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" ) // SignalHandlers holds information about signal actions. // // +stateify savable type SignalHandlers struct { // mu protects actions, as well as the signal state of all tasks and thread // groups using this SignalHandlers object. (See comment on // ThreadGroup.signalHandlers.) mu signalHandlersMutex `state:"nosave"` // actions is the action to be taken upon receiving each signal. actions map[linux.Signal]linux.SigAction } // NewSignalHandlers returns a new SignalHandlers specifying all default // actions. func NewSignalHandlers() *SignalHandlers { return &SignalHandlers{ actions: make(map[linux.Signal]linux.SigAction), } } // Fork returns a copy of sh for a new thread group. func (sh *SignalHandlers) Fork() *SignalHandlers { sh2 := NewSignalHandlers() sh.mu.Lock() defer sh.mu.Unlock() for sig, act := range sh.actions { sh2.actions[sig] = act } return sh2 } // CopyForExec returns a copy of sh for a thread group that is undergoing an // execve. (See comments in Task.finishExec.) func (sh *SignalHandlers) CopyForExec() *SignalHandlers { sh2 := NewSignalHandlers() sh.mu.Lock() defer sh.mu.Unlock() for sig, act := range sh.actions { if act.Handler == linux.SIG_IGN { sh2.actions[sig] = linux.SigAction{ Handler: linux.SIG_IGN, } } } return sh2 } // IsIgnored returns true if the signal is ignored. func (sh *SignalHandlers) IsIgnored(sig linux.Signal) bool { sh.mu.Lock() defer sh.mu.Unlock() sa, ok := sh.actions[sig] return ok && sa.Handler == linux.SIG_IGN } // dequeueActionLocked returns the SignalAct that should be used to handle sig. // // Preconditions: sh.mu must be locked. func (sh *SignalHandlers) dequeueAction(sig linux.Signal) linux.SigAction { act := sh.actions[sig] if act.Flags&linux.SA_RESETHAND != 0 { delete(sh.actions, sig) } return act } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/signal_handlers_mutex.go000066400000000000000000000035061465435605700271420ustar00rootroot00000000000000package kernel import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type signalHandlersMutex struct { mu sync.Mutex } var signalHandlersprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var signalHandlerslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type signalHandlerslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. const ( signalHandlersLockTg = signalHandlerslockNameIndex(0) ) const () // Lock locks m. // +checklocksignore func (m *signalHandlersMutex) Lock() { locking.AddGLock(signalHandlersprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. 
// +checklocksignore func (m *signalHandlersMutex) NestedLock(i signalHandlerslockNameIndex) { locking.AddGLock(signalHandlersprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *signalHandlersMutex) Unlock() { locking.DelGLock(signalHandlersprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *signalHandlersMutex) NestedUnlock(i signalHandlerslockNameIndex) { locking.DelGLock(signalHandlersprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func signalHandlersinitLockNames() { signalHandlerslockNames = []string{"tg"} } func init() { signalHandlersinitLockNames() signalHandlersprefixIndex = locking.NewMutexClass(reflect.TypeOf(signalHandlersMutex{}), signalHandlerslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/syscalls.go000066400000000000000000000367531465435605700244320ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "strconv" "google.golang.org/protobuf/proto" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/sentry" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/seccheck" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" "gvisor.dev/gvisor/pkg/sync" ) // outOfRangeSyscallNumber is used to represent a syscall number that is out of the // range [0, maxSyscallNum] in monitoring. var outOfRangeSyscallNumber = []*metric.FieldValue{&metric.FieldValue{"-1"}} // SyscallSupportLevel is a syscall support levels. type SyscallSupportLevel int // String returns a human readable representation of the support level. func (l SyscallSupportLevel) String() string { switch l { case SupportUnimplemented: return "Unimplemented" case SupportPartial: return "Partial Support" case SupportFull: return "Full Support" default: return "Undocumented" } } const ( // SupportUndocumented indicates the syscall is not documented yet. SupportUndocumented = iota // SupportUnimplemented indicates the syscall is unimplemented. SupportUnimplemented // SupportPartial indicates the syscall is partially supported. SupportPartial // SupportFull indicates the syscall is fully supported. SupportFull ) // Syscall includes the syscall implementation and compatibility information. type Syscall struct { // Name is the syscall name. Name string // Fn is the implementation of the syscall. Fn SyscallFn // SupportLevel is the level of support implemented in gVisor. SupportLevel SyscallSupportLevel // Note describes the compatibility of the syscall. Note string // URLs is set of URLs to any relevant bugs or issues. URLs []string // PointCallback is an optional callback that converts syscall arguments // to a proto that can be used with seccheck.Sink. 
// Callback functions must follow this naming convention: // PointSyscallNameInCamelCase, e.g. PointReadat, PointRtSigaction. PointCallback SyscallToProto } // SyscallFn is a syscall implementation. type SyscallFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *SyscallControl, error) // MissingFn is a syscall to be called when an implementation is missing. type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) // Possible flags for SyscallFlagsTable.enable. const ( // syscallPresent indicates that this is not a missing syscall. // // This flag is used internally in SyscallFlagsTable. syscallPresent = 1 << iota // StraceEnableLog enables syscall log tracing. StraceEnableLog // StraceEnableEvent enables syscall event tracing. StraceEnableEvent // ExternalBeforeEnable enables the external hook before syscall execution. ExternalBeforeEnable // ExternalAfterEnable enables the external hook after syscall execution. ExternalAfterEnable // SecCheckEnter represents a schematized/enter syscall seccheck event. SecCheckEnter // SecCheckExit represents a schematized/exit syscall seccheck event. SecCheckExit // SecCheckRawEnter represents raw/enter syscall seccheck event. SecCheckRawEnter // SecCheckRawExit represents raw/exit syscall seccheck event. SecCheckRawExit ) // StraceEnableBits combines both strace log and event flags. const StraceEnableBits = StraceEnableLog | StraceEnableEvent // SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall // basis. type SyscallFlagsTable struct { // mu protects writes to the fields below. // // Atomic loads are always allowed. Atomic stores are allowed only // while mu is held. mu sync.Mutex // enable contains the enable bits for each syscall. // // missing syscalls have the same value in enable as missingEnable to // avoid an extra branch in Word. enable [sentry.MaxSyscallNum + 1]atomicbitops.Uint32 // missingEnable contains the enable bits for missing syscalls. missingEnable atomicbitops.Uint32 } // Init initializes the struct, with all syscalls in table set to enable. // // max is the largest syscall number in table. func (e *SyscallFlagsTable) init(table map[uintptr]Syscall) { for num := range table { enableFlags := uint32(syscallPresent) e.enable[num] = atomicbitops.FromUint32(enableFlags) } seccheck.Global.AddSyscallFlagListener(e) e.UpdateSecCheck(&seccheck.Global) } // UpdateSecCheck implements seccheck.SyscallFlagListener. // // It is called when per-syscall seccheck event enablement changes. func (e *SyscallFlagsTable) UpdateSecCheck(state *seccheck.State) { e.mu.Lock() defer e.mu.Unlock() for sysno := uintptr(0); sysno <= sentry.MaxSyscallNum; sysno++ { oldFlags := e.enable[sysno].Load() if !bits.IsOn32(oldFlags, syscallPresent) { continue } flags := oldFlags if state.SyscallEnabled(seccheck.SyscallEnter, sysno) { flags |= SecCheckEnter } else { flags &^= SecCheckEnter } if state.SyscallEnabled(seccheck.SyscallExit, sysno) { flags |= SecCheckExit } else { flags &^= SecCheckExit } if state.SyscallEnabled(seccheck.SyscallRawEnter, sysno) { flags |= SecCheckRawEnter } else { flags &^= SecCheckRawEnter } if state.SyscallEnabled(seccheck.SyscallRawExit, sysno) { flags |= SecCheckRawExit } else { flags &^= SecCheckRawExit } if flags != oldFlags { e.enable[sysno].Store(flags) } } } // Word returns the enable bitfield for sysno. 
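// straceLogEnabled is an illustrative sketch (a hypothetical helper, not used
// by the syscall path) of how the per-syscall bitfield maintained above is
// consumed: load the word for a syscall number and test an individual flag bit.
func straceLogEnabled(e *SyscallFlagsTable, sysno uintptr) bool {
	return bits.IsOn32(e.Word(sysno), StraceEnableLog)
}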
func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { if sysno <= sentry.MaxSyscallNum { return e.enable[sysno].Load() } return e.missingEnable.Load() } // Enable sets enable bit `bit` for all syscalls based on s. // // Syscalls missing from `s` are disabled. // // Syscalls missing from the initial table passed to Init cannot be added as // individual syscalls. If present in s they will be ignored. // // Callers to Word may see either the old or new value while this function // is executing. func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { e.mu.Lock() defer e.mu.Unlock() missingVal := e.missingEnable.Load() if missingEnable { missingVal |= bit } else { missingVal &^= bit } e.missingEnable.Store(missingVal) for num := range e.enable { val := e.enable[num].Load() if !bits.IsOn32(val, syscallPresent) { // Missing. e.enable[num].Store(missingVal) continue } if s[uintptr(num)] { val |= bit } else { val &^= bit } e.enable[num].Store(val) } } // EnableAll sets enable bit bit for all syscalls, present and missing. func (e *SyscallFlagsTable) EnableAll(bit uint32) { e.mu.Lock() defer e.mu.Unlock() missingVal := e.missingEnable.Load() missingVal |= bit e.missingEnable.Store(missingVal) for num := range e.enable { val := e.enable[num].Load() if !bits.IsOn32(val, syscallPresent) { // Missing. e.enable[num].Store(missingVal) continue } val |= bit e.enable[num].Store(val) } } // Stracer traces syscall execution. type Stracer interface { // SyscallEnter is called on syscall entry. // // The returned private data is passed to SyscallExit. SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) any // SyscallExit is called on syscall exit. SyscallExit(context any, t *Task, sysno, rval uintptr, err error) } // SyscallTable is a lookup table of system calls. // // Note that a SyscallTable is not savable directly. Instead, they are saved as // an OS/Arch pair and lookup happens again on restore. type SyscallTable struct { // OS is the operating system that this syscall table implements. OS abi.OS // Arch is the architecture that this syscall table targets. Arch arch.Arch // The OS version that this syscall table implements. Version Version // AuditNumber is a numeric constant that represents the syscall table. If // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by // linux/audit.h. AuditNumber uint32 // Table is the collection of functions. Table map[uintptr]Syscall // lookup is a fixed-size array that holds the syscalls (indexed by // their numbers). It is used for fast look ups. lookup [sentry.MaxSyscallNum + 1]SyscallFn // pointCallbacks is a fixed-size array that holds SyscallToProto callbacks // (indexed by syscall numbers). It is used for fast lookups when // seccheck.Point is enabled for the syscall. pointCallbacks [sentry.MaxSyscallNum + 1]SyscallToProto // Emulate is a collection of instruction addresses to emulate. The // keys are addresses, and the values are system call numbers. Emulate map[hostarch.Addr]uintptr // The function to call in case of a missing system call. Missing MissingFn // Stracer traces this syscall table. Stracer Stracer // External is used to handle an external callback. External func(*Kernel) // ExternalFilterBefore is called before External is called before the syscall is executed. // External is not called if it returns false. ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool // ExternalFilterAfter is called before External is called after the syscall is executed. 
// External is not called if it returns false. ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool // FeatureEnable stores the strace and one-shot enable bits. FeatureEnable SyscallFlagsTable } // MaxSysno returns the largest system call number. func (s *SyscallTable) MaxSysno() (max uintptr) { for num := range s.Table { if num > max { max = num } } return max } // allSyscallTables contains all known tables. var allSyscallTables []*SyscallTable var ( // unimplementedSyscallCounterInit ensures the following fields are only initialized once. unimplementedSyscallCounterInit sync.Once // unimplementedSyscallNumbers maps syscall numbers to their string representation. // Used such that incrementing unimplementedSyscallCounter does not require allocating memory. // Each element in the mapped slices are of length 1, as there is only one field for the // unimplemented syscall counter metric. Allocating a slice is necessary as it is passed as a // variadic argument to the metric library. unimplementedSyscallNumbers map[uintptr][]*metric.FieldValue // unimplementedSyscallCounter tracks the number of times each unimplemented syscall has been // called by the sandboxed application. unimplementedSyscallCounter *metric.Uint64Metric ) // SyscallTables returns a read-only slice of registered SyscallTables. func SyscallTables() []*SyscallTable { return allSyscallTables } // LookupSyscallTable returns the SyscallCall table for the OS/Arch combination. func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { for _, s := range allSyscallTables { if s.OS == os && s.Arch == a { return s, true } } return nil, false } // RegisterSyscallTable registers a new syscall table for use by a Kernel. func RegisterSyscallTable(s *SyscallTable) { if max := s.MaxSysno(); max > sentry.MaxSyscallNum { panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) } if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) } allSyscallTables = append(allSyscallTables, s) unimplementedSyscallCounterInit.Do(func() { allowedValues := make([]*metric.FieldValue, sentry.MaxSyscallNum+2) unimplementedSyscallNumbers = make(map[uintptr][]*metric.FieldValue, len(allowedValues)) for i := uintptr(0); i <= sentry.MaxSyscallNum; i++ { s := &metric.FieldValue{strconv.Itoa(int(i))} allowedValues[i] = s unimplementedSyscallNumbers[i] = []*metric.FieldValue{s} } allowedValues[len(allowedValues)-1] = outOfRangeSyscallNumber[0] unimplementedSyscallCounter = metric.MustCreateNewUint64Metric("/unimplemented_syscalls", metric.Uint64Metadata{ Cumulative: true, Sync: true, Description: "Number of times the application tried to call an unimplemented syscall, broken down by syscall number", Fields: []metric.Field{ metric.NewField("sysno", allowedValues...), }, }) }) s.Init() } // Init initializes the system call table. // // This should normally be called only during registration. func (s *SyscallTable) Init() { if s.Table == nil { // Ensure non-nil lookup table. s.Table = make(map[uintptr]Syscall) } if s.Emulate == nil { // Ensure non-nil emulate table. s.Emulate = make(map[hostarch.Addr]uintptr) } // Initialize the fast-lookup tables. for num, sc := range s.Table { s.lookup[num] = sc.Fn } for num, sc := range s.Table { s.pointCallbacks[num] = sc.PointCallback } // Initialize all features. s.FeatureEnable.init(s.Table) } // Lookup returns the syscall implementation, if one exists. 
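// exampleSyscall is an illustrative sketch (a hypothetical entry, never
// registered) of how a Syscall is described in SyscallTable.Table: a name, a
// support level, and an implementation with the SyscallFn signature.
var exampleSyscall = Syscall{
	Name:         "example",
	SupportLevel: SupportFull,
	Note:         "Placeholder entry for illustration only.",
	Fn: func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *SyscallControl, error) {
		// Succeed with return value 0 and no control transfer.
		return 0, nil, nil
	},
}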
func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { if sysno <= sentry.MaxSyscallNum { return s.lookup[sysno] } return nil } // LookupName looks up a syscall name. func (s *SyscallTable) LookupName(sysno uintptr) string { if sc, ok := s.Table[sysno]; ok { return sc.Name } return fmt.Sprintf("sys_%d", sysno) // Unlikely. } // LookupNo looks up a syscall number by name. func (s *SyscallTable) LookupNo(name string) (uintptr, error) { for i, syscall := range s.Table { if syscall.Name == name { return uintptr(i), nil } } return 0, fmt.Errorf("syscall %q not found", name) } // LookupEmulate looks up an emulation syscall number. func (s *SyscallTable) LookupEmulate(addr hostarch.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] return sysno, ok } // mapLookup is similar to Lookup, except that it only uses the syscall table, // that is, it skips the fast look array. This is available for benchmarking. func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { if sc, ok := s.Table[sysno]; ok { return sc.Fn } return nil } // LookupSyscallToProto looks up the SyscallToProto callback for the given // syscall. It may return nil if none is registered. func (s *SyscallTable) LookupSyscallToProto(sysno uintptr) SyscallToProto { if sysno > sentry.MaxSyscallNum { return nil } return s.pointCallbacks[sysno] } // SyscallToProto is a callback function that converts generic syscall data to // schematized protobuf for the corresponding syscall. type SyscallToProto func(*Task, seccheck.FieldSet, *pb.ContextData, SyscallInfo) (proto.Message, pb.MessageType) // SyscallInfo provides generic information about the syscall. type SyscallInfo struct { Exit bool Sysno uintptr Args arch.SyscallArguments Rval uintptr Errno int } // IncrementUnimplementedSyscallCounter increments the "unimplemented syscall" metric for the given // syscall number. // A syscall table must have been initialized prior to calling this function. // // FIXME(gvisor.dev/issue/10556): checkescape can't distinguish between this // file and files named syscalls.go in other directories, resulting in false // positives, so this function cannot be +checkescape:all. // //go:nosplit func IncrementUnimplementedSyscallCounter(sysno uintptr) { s, found := unimplementedSyscallNumbers[sysno] if !found { s = outOfRangeSyscallNumber } unimplementedSyscallCounter.Increment(s...) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/syscalls_state.go000066400000000000000000000024431465435605700256170ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "context" "fmt" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/sentry/arch" ) // syscallTableInfo is used to reload the SyscallTable. // // +stateify savable type syscallTableInfo struct { OS abi.OS Arch arch.Arch } // saveSt saves the SyscallTable. func (image *TaskImage) saveSt() syscallTableInfo { return syscallTableInfo{ OS: image.st.OS, Arch: image.st.Arch, } } // loadSt loads the SyscallTable. 
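// exampleReloadTable is an illustrative sketch (a hypothetical helper) of the
// restore path described above: only the OS/Arch pair survives a save, and the
// live SyscallTable is re-resolved from the registered tables on load.
func exampleReloadTable(sti syscallTableInfo) (*SyscallTable, bool) {
	return LookupSyscallTable(sti.OS, sti.Arch)
}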
func (image *TaskImage) loadSt(_ context.Context, sti syscallTableInfo) { st, ok := LookupSyscallTable(sti.OS, sti.Arch) if !ok { panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch)) } image.st = st // Save the table reference. } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/syslog.go000066400000000000000000000066421465435605700241070ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "math/rand" "gvisor.dev/gvisor/pkg/sync" ) // syslog represents a sentry-global kernel log. // // Currently, it contains only fun messages for a dmesg easter egg. // // +stateify savable type syslog struct { // mu protects the below. mu sync.Mutex `state:"nosave"` // msg is the syslog message buffer. It is lazily initialized. msg []byte } // Log returns a copy of the syslog. func (s *syslog) Log() []byte { s.mu.Lock() defer s.mu.Unlock() if s.msg != nil { // Already initialized, just return a copy. o := make([]byte, len(s.msg)) copy(o, s.msg) return o } // Not initialized, create message. allMessages := []string{ "Synthesizing system calls...", "Mounting deweydecimalfs...", "Moving files to filing cabinet...", "Digging up root...", "Constructing home...", "Segmenting fault lines...", "Creating bureaucratic processes...", "Searching for needles in stacks...", "Preparing for the zombie uprising...", "Feeding the init monster...", "Creating cloned children...", "Daemonizing children...", "Waiting for children...", "Gathering forks...", "Committing treasure map to memory...", "Reading process obituaries...", "Searching for socket adapter...", "Creating process schedule...", "Generating random numbers by fair dice roll...", "Rewriting operating system in Javascript...", "Reticulating splines...", "Consulting tar man page...", "Forking spaghetti code...", "Checking naughty and nice process list...", "Checking naughty and nice process list...", // Check it up to twice. "Granting licence to kill(2)...", // British spelling for British movie. "Letting the watchdogs out...", "Conjuring /dev/null black hole...", "Adversarially training Redcode AI...", "Singleplexing /dev/ptmx...", "Recruiting cron-ies...", "Verifying that no non-zero bytes made their way into /dev/zero...", "Accelerating teletypewriter to 9600 baud...", } selectMessage := func() string { i := rand.Intn(len(allMessages)) m := allMessages[i] // Delete the selected message. allMessages[i] = allMessages[len(allMessages)-1] allMessages = allMessages[:len(allMessages)-1] return m } const format = "<6>[%11.6f] %s\n" s.msg = append(s.msg, []byte(fmt.Sprintf(format, 0.0, "Starting gVisor..."))...) time := 0.1 for i := 0; i < 10; i++ { time += rand.Float64() / 2 s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...) } time += rand.Float64() / 2 s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up VFS..."))...) 
time += rand.Float64() / 2 s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up FUSE..."))...) time += rand.Float64() / 2 s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...) // Return a copy. o := make([]byte, len(s.msg)) copy(o, s.msg) return o } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task.go000066400000000000000000000745451465435605700235400ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( gocontext "context" "runtime/trace" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // TaskOrigin indicates how the task was initially created. type TaskOrigin int const ( // OriginUnknown indicates that task creation source is not known (or not important). OriginUnknown TaskOrigin = iota // OriginExec indicates that task was created due to an exec request inside a container. OriginExec ) // Task represents a thread of execution in the untrusted app. It // includes registers and any thread-specific state that you would // normally expect. // // Each task is associated with a goroutine, called the task goroutine, that // executes code (application code, system calls, etc.) on behalf of that task. // See Task.run (task_run.go). // // All fields that are "owned by the task goroutine" can only be mutated by the // task goroutine while it is running. The task goroutine does not require // synchronization to read these fields, although it still requires // synchronization as described for those fields to mutate them. // // All fields that are "exclusive to the task goroutine" can only be accessed // by the task goroutine while it is running. The task goroutine does not // require synchronization to read or write these fields. // // +stateify savable type Task struct { taskNode // goid is the task goroutine's ID. goid is owned by the task goroutine, // but since it's used to detect cases where non-task goroutines // incorrectly access state owned by, or exclusive to, the task goroutine, // goid is always accessed using atomic memory operations. goid atomicbitops.Int64 `state:"nosave"` // runState is what the task goroutine is executing if it is not stopped. // If runState is nil, the task goroutine should exit or has exited. // runState is exclusive to the task goroutine. runState taskRunState // taskWorkCount represents the current size of the task work queue. It is // used to avoid acquiring taskWorkMu when the queue is empty. 
taskWorkCount atomicbitops.Int32 // taskWorkMu protects taskWork. taskWorkMu taskWorkMutex `state:"nosave"` // taskWork is a queue of work to be executed before resuming user execution. // It is similar to the task_work mechanism in Linux. // // taskWork is exclusive to the task goroutine. taskWork []TaskWorker // haveSyscallReturn is true if image.Arch().Return() represents a value // returned by a syscall (or set by ptrace after a syscall). // // haveSyscallReturn is exclusive to the task goroutine. haveSyscallReturn bool // interruptChan is notified whenever the task goroutine is interrupted // (usually by a pending signal). interruptChan is effectively a condition // variable that can be used in select statements. // // interruptChan is not saved; because saving interrupts all tasks, // interruptChan is always notified after restore (see Task.run). interruptChan chan struct{} `state:"nosave"` // gosched contains the current scheduling state of the task goroutine. // // gosched is protected by goschedSeq. gosched is owned by the task // goroutine. goschedSeq sync.SeqCount `state:"nosave"` gosched TaskGoroutineSchedInfo // yieldCount is the number of times the task goroutine has called // Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or // Task.Yield(), voluntarily ceasing execution. // // yieldCount is accessed using atomic memory operations. yieldCount is // owned by the task goroutine. yieldCount atomicbitops.Uint64 // pendingSignals is the set of pending signals that may be handled only by // this task. // // pendingSignals is protected by (taskNode.)tg.signalHandlers.mu // (hereafter "the signal mutex"); see comment on // ThreadGroup.signalHandlers. pendingSignals pendingSignals // signalMask is the set of signals whose delivery is currently blocked. // // signalMask is accessed using atomic memory operations, and is protected // by the signal mutex (such that reading signalMask is safe if either the // signal mutex is locked or if atomic memory operations are used, while // writing signalMask requires both). signalMask is owned by the task // goroutine. signalMask atomicbitops.Uint64 // If the task goroutine is currently executing Task.sigtimedwait, // realSignalMask is the previous value of signalMask, which has temporarily // been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0. // // realSignalMask is exclusive to the task goroutine. realSignalMask linux.SignalSet // If haveSavedSignalMask is true, savedSignalMask is the signal mask that // should be applied after the task has either delivered one signal to a // user handler or is about to resume execution in the untrusted // application. // // Both haveSavedSignalMask and savedSignalMask are exclusive to the task // goroutine. haveSavedSignalMask bool savedSignalMask linux.SignalSet // signalStack is the alternate signal stack used by signal handlers for // which the SA_ONSTACK flag is set. // // signalStack is exclusive to the task goroutine. signalStack linux.SignalStack // signalQueue is a set of registered waiters for signal-related events. // // signalQueue is protected by the signalMutex. Note that the task does // not implement all queue methods, specifically the readiness checks. // The task only broadcast a notification on signal delivery. signalQueue waiter.Queue // If groupStopPending is true, the task should participate in a group // stop in the interrupt path. // // groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux. // // groupStopPending is protected by the signal mutex. 
groupStopPending bool // If groupStopAcknowledged is true, the task has already acknowledged that // it is entering the most recent group stop that has been initiated on its // thread group. // // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux. // // groupStopAcknowledged is protected by the signal mutex. groupStopAcknowledged bool // If trapStopPending is true, the task goroutine should enter a // PTRACE_INTERRUPT-induced stop from the interrupt path. // // trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that // Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects // JOBCTL_STOP_PENDING. // // trapStopPending is protected by the signal mutex. trapStopPending bool // If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group // stop has begun or ended since the last time the task entered a // ptrace-stop from the group-stop path. // // trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux. // // trapNotifyPending is protected by the signal mutex. trapNotifyPending bool // If stop is not nil, it is the internally-initiated condition that // currently prevents the task goroutine from running. // // stop is protected by the signal mutex. stop TaskStop // stopCount is the number of active external stops (calls to // Task.BeginExternalStop that have not been paired with a call to // Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is // non-zero if the task goroutine should stop. // // Mutating stopCount requires both locking the signal mutex and using // atomic memory operations. Reading stopCount requires either locking the // signal mutex or using atomic memory operations. This allows Task.doStop // to require only a single atomic read in the common case where stopCount // is 0. // // stopCount is not saved, because external stops cannot be retained across // a save/restore cycle. (Suppose a sentryctl command issues an external // stop; after a save/restore cycle, the restored sentry has no knowledge // of the pre-save sentryctl command, and the stopped task would remain // stopped forever.) stopCount atomicbitops.Int32 `state:"nosave"` // endStopCond is signaled when stopCount transitions to 0. The combination // of stopCount and endStopCond effectively form a sync.WaitGroup, but // WaitGroup provides no way to read its counter value. // // Invariant: endStopCond.L is the signal mutex. (This is not racy because // sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine // calls sync.Cond.Wait; and only the task goroutine can change the // identity of the signal mutex, in Task.finishExec.) endStopCond sync.Cond `state:"nosave"` // exitStatus is the task's exit status. // // exitStatus is protected by the signal mutex. exitStatus linux.WaitStatus // syscallRestartBlock represents a custom restart function to run in // restart_syscall(2) to resume an interrupted syscall. // // syscallRestartBlock is exclusive to the task goroutine. syscallRestartBlock SyscallRestartBlock // p provides the mechanism by which the task runs code in userspace. The p // interface object is immutable. p platform.Context `state:"nosave"` // k is the Kernel that this task belongs to. The k pointer is immutable. k *Kernel // containerID has no equivalent in Linux; it's used by runsc to track all // tasks that belong to a given containers since cgroups aren't implemented. // It's inherited by the children, is immutable, and may be empty. // // NOTE: cgroups can be used to track this when implemented. 
containerID string // mu protects some of the following fields. mu taskMutex `state:"nosave"` // image holds task data provided by the ELF loader. // // image is protected by mu, and is owned by the task goroutine. image TaskImage // fsContext is the task's filesystem context. // // fsContext is protected by mu, and is owned by the task goroutine. fsContext *FSContext // fdTable is the task's file descriptor table. // // fdTable is protected by mu, and is owned by the task goroutine. fdTable *FDTable // If vforkParent is not nil, it is the task that created this task with // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when // this TaskImage is released. // // vforkParent is protected by the TaskSet mutex. vforkParent *Task // exitState is the task's progress through the exit path. // // exitState is protected by the TaskSet mutex. exitState is owned by the // task goroutine. exitState TaskExitState // exitTracerNotified is true if the exit path has either signaled the // task's tracer to indicate the exit, or determined that no such signal is // needed. exitTracerNotified can only be true if exitState is // TaskExitZombie or TaskExitDead. // // exitTracerNotified is protected by the TaskSet mutex. exitTracerNotified bool // exitTracerAcked is true if exitTracerNotified is true and either the // task's tracer has acknowledged the exit notification, or the exit path // has determined that no such notification is needed. // // exitTracerAcked is protected by the TaskSet mutex. exitTracerAcked bool // exitParentNotified is true if the exit path has either signaled the // task's parent to indicate the exit, or determined that no such signal is // needed. exitParentNotified can only be true if exitState is // TaskExitZombie or TaskExitDead. // // exitParentNotified is protected by the TaskSet mutex. exitParentNotified bool // exitParentAcked is true if exitParentNotified is true and either the // task's parent has acknowledged the exit notification, or the exit path // has determined that no such acknowledgment is needed. // // exitParentAcked is protected by the TaskSet mutex. exitParentAcked bool // goroutineStopped is a WaitGroup whose counter value is 1 when the task // goroutine is running and 0 when the task goroutine is stopped or has // exited. goroutineStopped sync.WaitGroup `state:"nosave"` // ptraceTracer is the task that is ptrace-attached to this one. If // ptraceTracer is nil, this task is not being traced. // // ptraceTracer is protected by the TaskSet mutex, and accessed with atomic // operations. This allows paths that wouldn't otherwise lock the TaskSet // mutex, notably the syscall path, to check if ptraceTracer is nil without // additional synchronization. ptraceTracer atomic.Pointer[Task] `state:".(*Task)"` // ptraceTracees is the set of tasks that this task is ptrace-attached to. // // ptraceTracees is protected by the TaskSet mutex. ptraceTracees map[*Task]struct{} // ptraceSeized is true if ptraceTracer attached to this task with // PTRACE_SEIZE. // // ptraceSeized is protected by the TaskSet mutex. ptraceSeized bool // ptraceOpts contains ptrace options explicitly set by the tracer. If // ptraceTracer is nil, ptraceOpts is expected to be the zero value. // // ptraceOpts is protected by the TaskSet mutex. ptraceOpts ptraceOptions // ptraceSyscallMode controls ptrace behavior around syscall entry and // exit. // // ptraceSyscallMode is protected by the TaskSet mutex. 
ptraceSyscallMode ptraceSyscallMode // If ptraceSinglestep is true, the next time the task executes application // code, single-stepping should be enabled. ptraceSinglestep is stored // independently of the architecture-specific trap flag because tracer // detaching (which can happen concurrently with the tracee's execution if // the tracer exits) must disable single-stepping, and the task's // architectural state is implicitly exclusive to the task goroutine (no // synchronization occurs before passing registers to SwitchToApp). // // ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP. // // ptraceSinglestep is protected by the TaskSet mutex. ptraceSinglestep bool // If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the // time that t entered the ptrace stop, reset to 0 when the tracer // acknowledges the stop with a wait*() syscall. Otherwise, it is the // signal number passed to the ptrace operation that ended the last ptrace // stop on this task. In the latter case, the effect of ptraceCode depends // on the nature of the ptrace stop; signal-delivery-stop uses it to // conditionally override ptraceSiginfo, syscall-entry/exit-stops send the // signal to the task after leaving the stop, and PTRACE_EVENT stops and // traced group stops ignore it entirely. // // Linux contextually stores the equivalent of ptraceCode in // task_struct::exit_code. // // ptraceCode is protected by the TaskSet mutex. ptraceCode int32 // ptraceSiginfo is the value returned to the tracer by // ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO). // (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.) // ptraceSiginfo is nil if the task is in a ptraced group-stop (this is // required for PTRACE_GETSIGINFO to return EINVAL during such stops, which // is in turn required to distinguish group stops from other ptrace stops, // per subsection "Group-stop" in ptrace(2)). // // ptraceSiginfo is analogous to Linux's task_struct::last_siginfo. // // ptraceSiginfo is protected by the TaskSet mutex. ptraceSiginfo *linux.SignalInfo // ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to // the tracer by ptrace(PTRACE_GETEVENTMSG). // // ptraceEventMsg is protected by the TaskSet mutex. ptraceEventMsg uint64 // ptraceYAMAExceptionAdded is true if a YAMA exception involving the task has // been added before. This is used during task exit to decide whether we need // to clean up YAMA exceptions. // // ptraceYAMAExceptionAdded is protected by the TaskSet mutex. ptraceYAMAExceptionAdded bool // The struct that holds the IO-related usage. The ioUsage pointer is // immutable. ioUsage *usage.IO // logPrefix is a string containing the task's thread ID in the root PID // namespace, and is prepended to log messages emitted by Task.Infof etc. logPrefix atomic.Pointer[string] `state:"nosave"` // traceContext and traceTask are both used for tracing, and are // updated along with the logPrefix in updateInfoLocked. // // These are exclusive to the task goroutine. traceContext gocontext.Context `state:"nosave"` traceTask *trace.Task `state:"nosave"` // creds is the task's credentials. // // creds.Load() may be called without synchronization. creds.Store() is // serialized by mu. creds is owned by the task goroutine. All // auth.Credentials objects that creds may point to, or have pointed to // in the past, must be treated as immutable. creds auth.AtomicPtrCredentials // utsns is the task's UTS namespace. // // utsns is protected by mu. utsns is owned by the task goroutine. 
utsns *UTSNamespace // ipcns is the task's IPC namespace. // // ipcns is protected by mu. ipcns is owned by the task goroutine. ipcns *IPCNamespace // mountNamespace is the task's mount namespace. // // It is protected by mu. It is owned by the task goroutine. mountNamespace *vfs.MountNamespace // parentDeathSignal is sent to this task's thread group when its parent exits. // // parentDeathSignal is protected by mu. parentDeathSignal linux.Signal // seccomp contains all seccomp-bpf syscall filters applicable to the task. // The type of the atomic is *taskSeccomp. // Writing needs to be protected by the signal mutex. // // seccomp is owned by the task goroutine. seccomp atomic.Pointer[taskSeccomp] `state:".(*taskSeccomp)"` // If cleartid is non-zero, treat it as a pointer to a ThreadID in the // task's virtual address space; when the task exits, set the pointed-to // ThreadID to 0, and wake any futex waiters. // // cleartid is exclusive to the task goroutine. cleartid hostarch.Addr // This is mostly a fake cpumask just for sched_set/getaffinity as we // don't really control the affinity. // // Invariant: allowedCPUMask.Size() == // sched.CPUMaskSize(Kernel.applicationCores). // // allowedCPUMask is protected by mu. allowedCPUMask sched.CPUSet // cpu is the fake cpu number returned by getcpu(2). cpu is ignored // entirely if Kernel.useHostCores is true. cpu atomicbitops.Int32 // This is used to keep track of changes made to a process' priority/niceness. // It is mostly used to provide some reasonable return value from // getpriority(2) after a call to setpriority(2) has been made. // We currently do not actually modify a process' scheduling priority. // NOTE: This represents the userspace view of priority (nice). // This means that the value should be in the range [-20, 19]. // // niceness is protected by mu. niceness int // This is used to track the numa policy for the current thread. This can be // modified through a set_mempolicy(2) syscall. Since we always report a // single numa node, all policies are no-ops. We only track this information // so that we can return reasonable values if the application calls // get_mempolicy(2) after setting a non-default policy. Note that in the // real syscall, nodemask can be longer than a single unsigned long, but we // always report a single node so never need to save more than a single // bit. // // numaPolicy and numaNodeMask are protected by mu. numaPolicy linux.NumaPolicy numaNodeMask uint64 // netns is the task's network namespace. It has to be changed under mu // so that GetNetworkNamespace can take a reference before it is // released. It is changed only from the task goroutine. netns *inet.Namespace // If rseqPreempted is true, before the next call to p.Switch(), // interrupt rseq critical regions as defined by rseqAddr and // tg.oldRSeqCritical and write the task goroutine's CPU number to // rseqAddr/oldRSeqCPUAddr. // // We support two ABIs for restartable sequences: // // 1. The upstream interface added in v4.18, // 2. An "old" interface never merged upstream. In the implementation, // this is referred to as "old rseq". // // rseqPreempted is exclusive to the task goroutine. rseqPreempted bool `state:"nosave"` // rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr. // // If rseq is unused, rseqCPU is -1 for convenient use in // platform.Context.Switch. // // rseqCPU is exclusive to the task goroutine. rseqCPU int32 // oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable. 
// // oldRSeqCPUAddr is exclusive to the task goroutine. oldRSeqCPUAddr hostarch.Addr // rseqAddr is a pointer to the userspace linux.RSeq structure. // // rseqAddr is exclusive to the task goroutine. rseqAddr hostarch.Addr // rseqSignature is the signature that the rseq abort IP must be signed // with. // // rseqSignature is exclusive to the task goroutine. rseqSignature uint32 // copyScratchBuffer is a buffer available to CopyIn/CopyOut // implementations that require an intermediate buffer to copy data // into/out of. It prevents these buffers from being allocated/zeroed in // each syscall and eventually garbage collected. // // copyScratchBuffer is exclusive to the task goroutine. copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"` // blockingTimer is used for blocking timeouts. blockingTimerChan is the // channel that is sent to when blockingTimer fires. // // blockingTimer is exclusive to the task goroutine. blockingTimer *ktime.Timer `state:"nosave"` blockingTimerChan <-chan struct{} `state:"nosave"` // futexWaiter is used for futex(FUTEX_WAIT) syscalls. // // futexWaiter is exclusive to the task goroutine. futexWaiter *futex.Waiter `state:"nosave"` // robustList is a pointer to the head of the tasks's robust futex // list. robustList hostarch.Addr // startTime is the real time at which the task started. It is set when // a Task is created or invokes execve(2). // // startTime is protected by mu. startTime ktime.Time // kcov is the kcov instance providing code coverage owned by this task. // // kcov is exclusive to the task goroutine. kcov *Kcov // cgroups is the set of cgroups this task belongs to. This may be empty if // no cgroup controllers are enabled. Protected by mu. // // +checklocks:mu cgroups map[Cgroup]struct{} // memCgID is the memory cgroup id. memCgID atomicbitops.Uint32 // userCounters is a pointer to a set of user counters. // // The userCounters pointer is exclusive to the task goroutine, but the // userCounters instance must be atomically accessed. userCounters *UserCounters // sessionKeyring is a pointer to the task's session keyring, if set. // It is guaranteed to be of type "keyring". // // +checklocks:mu sessionKeyring *auth.Key // Origin is the origin of the task. Origin TaskOrigin } // Task related metrics var ( // syscallCounter is a metric that tracks how many syscalls the sentry has // executed. syscallCounter = metric.SentryProfiling.MustCreateNewUint64Metric( "/task/syscalls", metric.Uint64Metadata{ Cumulative: true, Description: "The number of syscalls the sentry has executed for the user.", }) // faultCounter is a metric that tracks how many faults the sentry has had to // handle. faultCounter = metric.SentryProfiling.MustCreateNewUint64Metric( "/task/faults", metric.Uint64Metadata{ Cumulative: true, Description: "The number of faults the sentry has handled.", }) ) func (t *Task) savePtraceTracer() *Task { return t.ptraceTracer.Load() } func (t *Task) loadPtraceTracer(_ gocontext.Context, tracer *Task) { t.ptraceTracer.Store(tracer) } func (t *Task) saveSeccomp() *taskSeccomp { return t.seccomp.Load() } func (t *Task) loadSeccomp(_ gocontext.Context, seccompData *taskSeccomp) { t.seccomp.Store(seccompData) } // afterLoad is invoked by stateify. 
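// Illustrative sketch (not part of the original source): the comments on
// fields such as signalMask and stopCount above describe a recurring
// discipline in which writers both hold a mutex and store atomically, so
// readers may either take the mutex or perform a single lock-free atomic
// load. The hypothetical type below shows that discipline using only the
// standard library sync and sync/atomic packages.
type exampleStopCounter struct {
	mu    sync.Mutex   // serializes writers, analogous to the signal mutex
	count atomic.Int32 // read either under mu or via Load()
}

func (c *exampleStopCounter) inc() {
	c.mu.Lock()
	c.count.Add(1) // write path: mutex held and atomic update
	c.mu.Unlock()
}

func (c *exampleStopCounter) isStopped() bool {
	// Lock-free fast path, mirroring how Task.doStop can check stopCount
	// with a single atomic read in the common case where it is zero.
	return c.count.Load() != 0
}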
func (t *Task) afterLoad(gocontext.Context) { t.updateInfoLocked() if ts := t.seccomp.Load(); ts != nil { ts.populateCache(t) } t.interruptChan = make(chan struct{}, 1) t.gosched.State = TaskGoroutineNonexistent if t.stop != nil { t.stopCount = atomicbitops.FromInt32(1) } t.endStopCond.L = &t.tg.signalHandlers.mu t.rseqPreempted = true t.futexWaiter = futex.NewWaiter() t.p = t.k.Platform.NewContext(t.AsyncContext()) } // copyScratchBufferLen is the length of Task.copyScratchBuffer. const copyScratchBufferLen = 144 // sizeof(struct stat) // CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut // functions. It must only be used within those functions and can only be used // by the task goroutine; it exists to improve performance and thus // intentionally lacks any synchronization. // // Callers should pass a constant value as an argument if possible, which will // allow the compiler to inline and optimize out the if statement below. func (t *Task) CopyScratchBuffer(size int) []byte { if size > copyScratchBufferLen { return make([]byte, size) } return t.copyScratchBuffer[:size] } // FutexWaiter returns the Task's futex.Waiter. func (t *Task) FutexWaiter() *futex.Waiter { return t.futexWaiter } // Kernel returns the Kernel containing t. func (t *Task) Kernel() *Kernel { return t.k } // SetClearTID sets t's cleartid. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) SetClearTID(addr hostarch.Addr) { t.cleartid = addr } // SetSyscallRestartBlock sets the restart block for use in // restart_syscall(2). After registering a restart block, a syscall should // return ERESTART_RESTARTBLOCK to request a restart using the block. // // Precondition: The caller must be running on the task goroutine. func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) { t.syscallRestartBlock = r } // SyscallRestartBlock returns the currently registered restart block for use in // restart_syscall(2). This function is *not* idempotent and may be called once // per syscall. This function must not be called if a restart block has not been // registered for the current syscall. // // Precondition: The caller must be running on the task goroutine. func (t *Task) SyscallRestartBlock() SyscallRestartBlock { r := t.syscallRestartBlock // Explicitly set the restart block to nil so that a future syscall can't // accidentally reuse it. t.syscallRestartBlock = nil return r } // IsChrooted returns true if the root directory of t's FSContext is not the // root directory of t's MountNamespace. // // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) IsChrooted() bool { realRoot := t.mountNamespace.Root(t) defer realRoot.DecRef(t) root := t.fsContext.RootDirectory() defer root.DecRef(t) return root != realRoot } // TaskImage returns t's TaskImage. // // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. func (t *Task) TaskImage() *TaskImage { return &t.image } // FSContext returns t's FSContext. FSContext does not take an additional // reference on the returned FSContext. // // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. func (t *Task) FSContext() *FSContext { return t.fsContext } // FDTable returns t's FDTable. FDMTable does not take an additional reference // on the returned FDMap. // // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. 
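// exampleScratchCopy is a hypothetical helper (not part of the original
// source) illustrating the intended use of CopyScratchBuffer documented
// above: the caller passes a constant size so the size check can be
// optimized away and the per-task buffer is reused instead of allocating on
// every syscall.
func exampleScratchCopy(t *Task, src []byte) []byte {
	const size = 16 // constant, and <= copyScratchBufferLen, so no allocation
	buf := t.CopyScratchBuffer(size)
	copy(buf, src)
	return buf
}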
func (t *Task) FDTable() *FDTable { return t.fdTable } // GetFile is a convenience wrapper for t.FDTable().Get. // // Precondition: same as FDTable.Get. func (t *Task) GetFile(fd int32) *vfs.FileDescription { f, _ := t.fdTable.Get(fd) return f } // NewFDs is a convenience wrapper for t.FDTable().NewFDs. // // This automatically passes the task as the context. // // Precondition: same as FDTable. func (t *Task) NewFDs(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) { return t.fdTable.NewFDs(t, fd, files, flags) } // NewFDFrom is a convenience wrapper for t.FDTable().NewFD. // // This automatically passes the task as the context. // // Precondition: same as FDTable.Get. func (t *Task) NewFDFrom(minFD int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { return t.fdTable.NewFD(t, minFD, file, flags) } // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt. // // This automatically passes the task as the context. // // Precondition: same as FDTable. func (t *Task) NewFDAt(fd int32, file *vfs.FileDescription, flags FDFlags) (*vfs.FileDescription, error) { return t.fdTable.NewFDAt(t, fd, file, flags) } // WithMuLocked executes f with t.mu locked. func (t *Task) WithMuLocked(f func(*Task)) { t.mu.Lock() f(t) t.mu.Unlock() } // MountNamespace returns t's MountNamespace. func (t *Task) MountNamespace() *vfs.MountNamespace { t.mu.Lock() defer t.mu.Unlock() return t.mountNamespace } // GetMountNamespace returns t's MountNamespace. A reference is taken on the // returned mount namespace. func (t *Task) GetMountNamespace() *vfs.MountNamespace { t.mu.Lock() defer t.mu.Unlock() mntns := t.mountNamespace if mntns != nil { mntns.IncRef() } return mntns } // ContainerID returns t's container ID. func (t *Task) ContainerID() string { return t.containerID } // RestoreContainerID sets t's container ID in case the restored container ID // is different from when it was saved. func (t *Task) RestoreContainerID(cid string) { t.containerID = cid } // OOMScoreAdj gets the task's thread group's OOM score adjustment. func (t *Task) OOMScoreAdj() int32 { return t.tg.oomScoreAdj.Load() } // SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The // value should be between -1000 and 1000 inclusive. func (t *Task) SetOOMScoreAdj(adj int32) error { if adj > 1000 || adj < -1000 { return linuxerr.EINVAL } t.tg.oomScoreAdj.Store(adj) return nil } // KUID returns t's kuid. func (t *Task) KUID() uint32 { return uint32(t.Credentials().EffectiveKUID) } // KGID returns t's kgid. func (t *Task) KGID() uint32 { return uint32(t.Credentials().EffectiveKGID) } // SetKcov sets the kcov instance associated with t. func (t *Task) SetKcov(k *Kcov) { t.kcov = k } // ResetKcov clears the kcov instance associated with t. func (t *Task) ResetKcov() { if t.kcov != nil { t.kcov.OnTaskExit() t.kcov = nil } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_acct.go000066400000000000000000000123101465435605700245100ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package kernel // Accounting, limits, timers. import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" ) // Getitimer implements getitimer(2). // // Preconditions: The caller must be running on the task goroutine. func (t *Task) Getitimer(id int32) (linux.ItimerVal, error) { var tm ktime.Time var s ktime.Setting switch id { case linux.ITIMER_REAL: tm, s = t.tg.itimerRealTimer.Get() case linux.ITIMER_VIRTUAL: tm = t.tg.UserCPUClock().Now() t.tg.signalHandlers.mu.Lock() s, _ = t.tg.itimerVirtSetting.At(tm) t.tg.signalHandlers.mu.Unlock() case linux.ITIMER_PROF: tm = t.tg.CPUClock().Now() t.tg.signalHandlers.mu.Lock() s, _ = t.tg.itimerProfSetting.At(tm) t.tg.signalHandlers.mu.Unlock() default: return linux.ItimerVal{}, linuxerr.EINVAL } val, iv := ktime.SpecFromSetting(tm, s) return linux.ItimerVal{ Value: linux.DurationToTimeval(val), Interval: linux.DurationToTimeval(iv), }, nil } // Setitimer implements setitimer(2). // // Preconditions: The caller must be running on the task goroutine. func (t *Task) Setitimer(id int32, newitv linux.ItimerVal) (linux.ItimerVal, error) { var tm ktime.Time var olds ktime.Setting switch id { case linux.ITIMER_REAL: news, err := ktime.SettingFromSpec(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), t.tg.itimerRealTimer.Clock()) if err != nil { return linux.ItimerVal{}, err } tm, olds = t.tg.itimerRealTimer.Swap(news) case linux.ITIMER_VIRTUAL: c := t.tg.UserCPUClock() t.k.cpuClockMu.Lock() defer t.k.cpuClockMu.Unlock() tm = c.Now() news, err := ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) if err != nil { return linux.ItimerVal{}, err } t.tg.signalHandlers.mu.Lock() olds = t.tg.itimerVirtSetting t.tg.itimerVirtSetting = news t.tg.updateCPUTimersEnabledLocked() t.tg.signalHandlers.mu.Unlock() case linux.ITIMER_PROF: c := t.tg.CPUClock() t.k.cpuClockMu.Lock() defer t.k.cpuClockMu.Unlock() tm = c.Now() news, err := ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) if err != nil { return linux.ItimerVal{}, err } t.tg.signalHandlers.mu.Lock() olds = t.tg.itimerProfSetting t.tg.itimerProfSetting = news t.tg.updateCPUTimersEnabledLocked() t.tg.signalHandlers.mu.Unlock() default: return linux.ItimerVal{}, linuxerr.EINVAL } oldval, oldiv := ktime.SpecFromSetting(tm, olds) return linux.ItimerVal{ Value: linux.DurationToTimeval(oldval), Interval: linux.DurationToTimeval(oldiv), }, nil } // IOUsage returns the io usage of the thread. func (t *Task) IOUsage() *usage.IO { return t.ioUsage } // IOUsage returns the total io usage of all dead and live threads in the group. func (tg *ThreadGroup) IOUsage() *usage.IO { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() var io usage.IO tg.ioUsage.Clone(&io) // Account for active tasks. for t := tg.tasks.Front(); t != nil; t = t.Next() { io.Accumulate(t.IOUsage()) } return &io } // Name returns t's name. func (t *Task) Name() string { t.mu.Lock() defer t.mu.Unlock() return t.image.Name } // SetName changes t's name. func (t *Task) SetName(name string) { t.mu.Lock() defer t.mu.Unlock() t.image.Name = name t.Debugf("Set thread name to %q", name) } // Limits implements context.Context.Limits. 
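// exampleITimerArmed is a hypothetical helper (not part of the original
// source) showing how Getitimer above is typically consumed: per
// getitimer(2), a zero it_value means the timer is disarmed, so checking the
// returned Value fields is enough to tell whether ITIMER_REAL is pending.
func exampleITimerArmed(t *Task) (bool, error) {
	itv, err := t.Getitimer(linux.ITIMER_REAL)
	if err != nil {
		return false, err
	}
	return itv.Value.Sec != 0 || itv.Value.Usec != 0, nil
}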
func (t *Task) Limits() *limits.LimitSet { return t.ThreadGroup().Limits() } // StartTime returns t's start time. func (t *Task) StartTime() ktime.Time { t.mu.Lock() defer t.mu.Unlock() return t.startTime } // MaxRSS returns the maximum resident set size of the task in bytes. which // should be one of RUSAGE_SELF, RUSAGE_CHILDREN, RUSAGE_THREAD, or // RUSAGE_BOTH. See getrusage(2) for documentation on the behavior of these // flags. func (t *Task) MaxRSS(which int32) uint64 { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() switch which { case linux.RUSAGE_SELF, linux.RUSAGE_THREAD: // If there's an active mm we can use its value. if mm := t.MemoryManager(); mm != nil { if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > t.tg.maxRSS { return mmMaxRSS } } return t.tg.maxRSS case linux.RUSAGE_CHILDREN: return t.tg.childMaxRSS case linux.RUSAGE_BOTH: maxRSS := t.tg.maxRSS if maxRSS < t.tg.childMaxRSS { maxRSS = t.tg.childMaxRSS } if mm := t.MemoryManager(); mm != nil { if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > maxRSS { return mmMaxRSS } } return maxRSS default: // We'll only get here if which is invalid. return 0 } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_block.go000066400000000000000000000175101465435605700246770ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "runtime" "runtime/trace" "time" "gvisor.dev/gvisor/pkg/errors/linuxerr" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // BlockWithTimeout blocks t until an event is received from C, the application // monotonic clock indicates that timeout has elapsed (only if haveTimeout is true), // or t is interrupted. It returns: // // - The remaining timeout, which is guaranteed to be 0 if the timeout expired, // and is unspecified if haveTimeout is false. // // - An error which is nil if an event is received from C, ETIMEDOUT if the timeout // expired, and linuxerr.ErrInterrupted if t is interrupted. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { if !haveTimeout { return timeout, t.block(C, nil) } clock := t.Kernel().MonotonicClock() start := clock.Now() deadline := start.Add(timeout) err := t.BlockWithDeadlineFrom(C, clock, true, deadline) // Timeout, explicitly return a remaining duration of 0. if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return 0, err } // Compute the remaining timeout. Note that even if block() above didn't // return due to a timeout, we may have used up any of the remaining time // since then. We cap the remaining timeout to 0 to make it easier to // directly use the returned duration. end := clock.Now() remainingTimeout := timeout - end.Sub(start) if remainingTimeout < 0 { remainingTimeout = 0 } return remainingTimeout, err } // BlockWithTimeoutOn implements context.Context.BlockWithTimeoutOn. 
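// exampleWaitWithTimeout is a hypothetical helper (not part of the original
// source) showing how callers are expected to interpret BlockWithTimeout's
// three outcomes described above: nil means the event arrived, ETIMEDOUT
// means the timeout expired (with 0 remaining), and ErrInterrupted means the
// task was interrupted, typically so the syscall can be restarted.
func exampleWaitWithTimeout(t *Task, ch chan struct{}, timeout time.Duration) (time.Duration, error) {
	remaining, err := t.BlockWithTimeout(ch, true /* haveTimeout */, timeout)
	switch {
	case err == nil:
		// Event received; remaining reports the unused portion of timeout.
	case linuxerr.Equals(linuxerr.ETIMEDOUT, err):
		// Timed out; remaining is guaranteed to be 0.
	case err == linuxerr.ErrInterrupted:
		// Interrupted by a signal or external stop; callers usually
		// propagate this error so the syscall can be restarted.
	}
	return remaining, err
}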
func (t *Task) BlockWithTimeoutOn(w waiter.Waitable, mask waiter.EventMask, timeout time.Duration) (time.Duration, bool) { e, ch := waiter.NewChannelEntry(mask) w.EventRegister(&e) defer w.EventUnregister(&e) left, err := t.BlockWithTimeout(ch, true, timeout) return left, err == nil } // BlockWithDeadline blocks t until it is woken by an event, the // application monotonic clock indicates a time of deadline (only if // haveDeadline is true), or t is interrupted. It returns nil if an event is // received from C, ETIMEDOUT if the deadline expired, and // linuxerr.ErrInterrupted if t is interrupted. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithDeadline(C <-chan struct{}, haveDeadline bool, deadline ktime.Time) error { return t.BlockWithDeadlineFrom(C, t.Kernel().MonotonicClock(), haveDeadline, deadline) } // BlockWithDeadlineFrom is similar to BlockWithDeadline, except it uses the // passed clock (instead of application monotonic clock). // // Most clients should use BlockWithDeadline or BlockWithTimeout instead. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithDeadlineFrom(C <-chan struct{}, clock ktime.Clock, haveDeadline bool, deadline ktime.Time) error { if !haveDeadline { return t.block(C, nil) } // Start the timeout timer. t.blockingTimer.SetClock(clock, ktime.Setting{ Enabled: true, Next: deadline, }) err := t.block(C, t.blockingTimerChan) // Stop the timeout timer and drain the channel. t.blockingTimer.Swap(ktime.Setting{}) select { case <-t.blockingTimerChan: default: } return err } // Block implements context.Context.Block func (t *Task) Block(C <-chan struct{}) error { return t.block(C, nil) } // BlockOn implements context.Context.BlockOn. func (t *Task) BlockOn(w waiter.Waitable, mask waiter.EventMask) bool { e, ch := waiter.NewChannelEntry(mask) w.EventRegister(&e) defer w.EventUnregister(&e) err := t.Block(ch) return err == nil } // block blocks a task on one of many events. // N.B. defer is too expensive to be used here. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { // This function is very hot; skip this check outside of +race builds. if sync.RaceEnabled { t.assertTaskGoroutine() } // Fast path if the request is already done. select { case <-C: return nil default: } // Deactivate our address space, we don't need it. t.prepareSleep() defer t.completeSleep() // If the request is not completed, but the timer has already expired, // then ensure that we run through a scheduler cycle. This is because // we may see applications relying on timer slack to yield the thread. // For example, they may attempt to sleep for some number of nanoseconds, // and expect that this will actually yield the CPU and sleep for at // least microseconds, e.g.: // https://github.com/LMAX-Exchange/disruptor/commit/6ca210f2bcd23f703c479804d583718e16f43c07 if len(timerChan) > 0 { runtime.Gosched() } region := trace.StartRegion(t.traceContext, blockRegion) select { case <-C: region.End() // Woken by event. return nil case <-t.interruptChan: region.End() // Ensure that Task.interrupted() will return true once we return to // the task run loop. t.interruptSelf() // Return the indicated error on interrupt. return linuxerr.ErrInterrupted case <-timerChan: region.End() // We've timed out. return linuxerr.ETIMEDOUT } } // prepareSleep prepares to sleep. 
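// The "stop the timer, then drain its channel" step in BlockWithDeadlineFrom
// above has a well-known analogue with the standard library's time.Timer,
// sketched below for comparison. This is an illustrative, self-contained
// snippet and is not part of the original source.
func exampleStopAndDrain(timer *time.Timer) {
	if !timer.Stop() {
		// The timer already fired: drain the pending value so a stale
		// expiration is not mistaken for a fresh one on the next wait.
		select {
		case <-timer.C:
		default:
		}
	}
}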
func (t *Task) prepareSleep() { t.assertTaskGoroutine() t.p.PrepareSleep() t.Deactivate() t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible) } // completeSleep reactivates the address space. func (t *Task) completeSleep() { t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible) t.Activate() } // Interrupted implements context.Context.Interrupted. func (t *Task) Interrupted() bool { if t.interrupted() { return true } // Indicate that t's task goroutine is still responsive (i.e. reset the // watchdog timer). t.accountTaskGoroutineRunning() return false } // UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart. func (t *Task) UninterruptibleSleepStart(deactivate bool) { t.assertTaskGoroutine() if deactivate { t.Deactivate() } t.accountTaskGoroutineEnter(TaskGoroutineBlockedUninterruptible) } // UninterruptibleSleepFinish implements context.Context.UninterruptibleSleepFinish. func (t *Task) UninterruptibleSleepFinish(activate bool) { t.accountTaskGoroutineLeave(TaskGoroutineBlockedUninterruptible) if activate { t.Activate() } } // interrupted returns true if interrupt or interruptSelf has been called at // least once since the last call to unsetInterrupted. func (t *Task) interrupted() bool { return len(t.interruptChan) != 0 } // unsetInterrupted causes interrupted to return false until the next call to // interrupt or interruptSelf. func (t *Task) unsetInterrupted() { select { case <-t.interruptChan: default: } } // interrupt unblocks the task and interrupts it if it's currently running in // userspace. func (t *Task) interrupt() { t.interruptSelf() t.p.Interrupt() } // interruptSelf is like Interrupt, but can only be called by the task // goroutine. func (t *Task) interruptSelf() { select { case t.interruptChan <- struct{}{}: default: } // platform.Context.Interrupt() is unnecessary since a task goroutine // calling interruptSelf() cannot also be blocked in // platform.Context.Switch(). } // Interrupt implements context.Blocker.Interrupt. func (t *Task) Interrupt() { t.interrupt() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_cgroup.go000066400000000000000000000175221465435605700251070ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "bytes" "fmt" "sort" "strings" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" ) // EnterInitialCgroups moves t into an initial set of cgroups. // If initCgroups is not nil, the new task will be placed in the specified cgroups. // Otherwise, if parent is not nil, the new task will be placed in the parent's cgroups. // If neither is specified, the new task will be in the root cgroups. // // This is analogous to Linux's kernel/cgroup/cgroup.c:cgroup_css_set_fork(). // // Precondition: t isn't in any cgroups yet, t.cgroups is empty. 
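// exampleUninterruptibleHostOp is a hypothetical helper (not part of the
// original source) showing the usual pairing of UninterruptibleSleepStart and
// UninterruptibleSleepFinish defined above around an operation that must not
// observe task interruption, such as a blocking host call.
func exampleUninterruptibleHostOp(t *Task, op func() error) error {
	t.UninterruptibleSleepStart(true /* deactivate */)
	defer t.UninterruptibleSleepFinish(true /* activate */)
	return op()
}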
func (t *Task) EnterInitialCgroups(parent *Task, initCgroups map[Cgroup]struct{}) { var inherit map[Cgroup]struct{} if initCgroups != nil { inherit = initCgroups } else if parent != nil { parent.mu.Lock() defer parent.mu.Unlock() inherit = parent.cgroups } joinSet := t.k.cgroupRegistry.computeInitialGroups(inherit) t.mu.NestedLock(taskLockChild) defer t.mu.NestedUnlock(taskLockChild) // Transfer ownership of joinSet refs to the task's cgset. t.cgroups = joinSet for c := range t.cgroups { // Since t isn't in any cgroup yet, we can skip the check against // existing cgroups. c.Enter(t) t.SetMemCgIDFromCgroup(c) } } // SetMemCgID sets the given memory cgroup id to the task. func (t *Task) SetMemCgID(memCgID uint32) { t.memCgID.Store(memCgID) } // SetMemCgIDFromCgroup sets the id of the given memory cgroup to the task. func (t *Task) SetMemCgIDFromCgroup(cg Cgroup) { for _, ctl := range cg.Controllers() { if ctl.Type() == CgroupControllerMemory { t.SetMemCgID(cg.ID()) return } } } // ResetMemCgIDFromCgroup sets the memory cgroup id to zero, if the task has // a memory cgroup. func (t *Task) ResetMemCgIDFromCgroup(cg Cgroup) { for _, ctl := range cg.Controllers() { if ctl.Type() == CgroupControllerMemory { t.SetMemCgID(0) return } } } // EnterCgroup moves t into c. func (t *Task) EnterCgroup(c Cgroup) error { newControllers := make(map[CgroupControllerType]struct{}) for _, ctl := range c.Controllers() { newControllers[ctl.Type()] = struct{}{} } t.mu.Lock() defer t.mu.Unlock() for oldCG := range t.cgroups { if oldCG.HierarchyID() == c.HierarchyID() { log.Warningf("Cannot enter new cgroup %v due to conflicting controllers. Try migrate instead?", c) return linuxerr.EBUSY } } // No migration required. t.enterCgroupLocked(c) return nil } // +checklocks:t.mu func (t *Task) enterCgroupLocked(c Cgroup) { c.IncRef() t.cgroups[c] = struct{}{} c.Enter(t) t.SetMemCgIDFromCgroup(c) } // +checklocks:t.mu func (t *Task) enterCgroupIfNotYetLocked(c Cgroup) { if _, ok := t.cgroups[c]; ok { return } t.enterCgroupLocked(c) } // LeaveCgroups removes t out from all its cgroups. func (t *Task) LeaveCgroups() { t.tg.pidns.owner.mu.Lock() // Prevent migration. t.mu.Lock() cgs := t.cgroups t.cgroups = nil for c := range cgs { c.Leave(t) } t.SetMemCgID(0) t.mu.Unlock() t.tg.pidns.owner.mu.Unlock() for c := range cgs { c.decRef() } } // +checklocks:t.mu func (t *Task) findCgroupWithMatchingHierarchyLocked(other Cgroup) (Cgroup, bool) { for c := range t.cgroups { if c.HierarchyID() != other.HierarchyID() { continue } return c, true } return Cgroup{}, false } // CgroupPrepareMigrate starts a cgroup migration for this task to dst. The // migration must be completed through the returned context. func (t *Task) CgroupPrepareMigrate(dst Cgroup) (*CgroupMigrationContext, error) { t.mu.Lock() defer t.mu.Unlock() src, found := t.findCgroupWithMatchingHierarchyLocked(dst) if !found { log.Warningf("Cannot migrate to cgroup %v since task not currently in target hierarchy %v", dst, dst.HierarchyID()) return nil, linuxerr.EINVAL } if err := dst.PrepareMigrate(t, &src); err != nil { return nil, err } return &CgroupMigrationContext{ src: src, dst: dst, t: t, }, nil } // MigrateCgroup migrates all tasks in tg to the dst cgroup. Either all tasks // are migrated, or none are. Atomicity of migrations wrt cgroup membership // (i.e. a task can't switch cgroups mid-migration due to another migration) is // guaranteed because migrations are serialized by TaskSet.mu. 
func (tg *ThreadGroup) MigrateCgroup(dst Cgroup) error { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() var ctxs []*CgroupMigrationContext // Prepare migrations. On partial failure, abort. for t := tg.tasks.Front(); t != nil; t = t.Next() { ctx, err := t.CgroupPrepareMigrate(dst) if err != nil { // Rollback. for _, ctx := range ctxs { ctx.Abort() } return err } ctxs = append(ctxs, ctx) } // All migrations are now guaranteed to succeed. for _, ctx := range ctxs { ctx.Commit() } return nil } // MigrateCgroup migrates this task to the dst cgroup. func (t *Task) MigrateCgroup(dst Cgroup) error { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() ctx, err := t.CgroupPrepareMigrate(dst) if err != nil { return err } ctx.Commit() return nil } // TaskCgroupEntry represents a line in /proc//cgroup, and is used to // format a cgroup for display. type TaskCgroupEntry struct { HierarchyID uint32 `json:"hierarchy_id"` Controllers string `json:"controllers,omitempty"` Path string `json:"path,omitempty"` } // GetCgroupEntries generates the contents of /proc//cgroup as // a TaskCgroupEntry array. func (t *Task) GetCgroupEntries() []TaskCgroupEntry { t.mu.Lock() defer t.mu.Unlock() cgEntries := make([]TaskCgroupEntry, 0, len(t.cgroups)) for c := range t.cgroups { ctls := c.Controllers() ctlNames := make([]string, 0, len(ctls)) // We're guaranteed to have a valid name, a non-empty controller list, // or both. // Explicit hierarchy name, if any. if name := c.Name(); name != "" { ctlNames = append(ctlNames, fmt.Sprintf("name=%s", name)) } // Controllers attached to this hierarchy, if any. for _, ctl := range ctls { ctlNames = append(ctlNames, string(ctl.Type())) } cgEntries = append(cgEntries, TaskCgroupEntry{ HierarchyID: c.HierarchyID(), Controllers: strings.Join(ctlNames, ","), Path: c.Path(), }) } sort.Slice(cgEntries, func(i, j int) bool { return cgEntries[i].HierarchyID > cgEntries[j].HierarchyID }) return cgEntries } // GenerateProcTaskCgroup writes the contents of /proc//cgroup for t to buf. func (t *Task) GenerateProcTaskCgroup(buf *bytes.Buffer) { cgEntries := t.GetCgroupEntries() for _, cgE := range cgEntries { fmt.Fprintf(buf, "%d:%s:%s\n", cgE.HierarchyID, cgE.Controllers, cgE.Path) } } // +checklocks:t.mu func (t *Task) chargeLocked(target *Task, ctl CgroupControllerType, res CgroupResourceType, value int64) (bool, Cgroup, error) { // Due to the uniqueness of controllers on hierarchies, at most one cgroup // in t.cgroups will match. for c := range t.cgroups { err := c.Charge(target, c.Dentry, ctl, res, value) if err == nil { c.IncRef() } return err == nil, c, err } return false, Cgroup{}, nil } // ChargeFor charges t's cgroup on behalf of some other task. Returns // the cgroup that's charged if any. Returned cgroup has an extra ref // that's transferred to the caller. func (t *Task) ChargeFor(other *Task, ctl CgroupControllerType, res CgroupResourceType, value int64) (bool, Cgroup, error) { t.mu.Lock() defer t.mu.Unlock() return t.chargeLocked(other, ctl, res, value) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_clone.go000066400000000000000000000533031465435605700247050ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/seccheck" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) // SupportedCloneFlags is the bitwise OR of all the supported flags for clone. // TODO(b/290826530): Implement CLONE_INTO_CGROUP when cgroups v2 is // implemented. const SupportedCloneFlags = linux.CLONE_VM | linux.CLONE_FS | linux.CLONE_FILES | linux.CLONE_SYSVSEM | linux.CLONE_THREAD | linux.CLONE_SIGHAND | linux.CLONE_CHILD_SETTID | linux.CLONE_NEWPID | linux.CLONE_CHILD_CLEARTID | linux.CLONE_CHILD_SETTID | linux.CLONE_PARENT | linux.CLONE_PARENT_SETTID | linux.CLONE_SETTLS | linux.CLONE_NEWUSER | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNET | linux.CLONE_PTRACE | linux.CLONE_UNTRACED | linux.CLONE_IO | linux.CLONE_VFORK | linux.CLONE_DETACHED | linux.CLONE_NEWNS // Clone implements the clone(2) syscall and returns the thread ID of the new // task in t's PID namespace. Clone may return both a non-zero thread ID and a // non-nil error. // // Preconditions: The caller must be running Task.doSyscallInvoke on the task // goroutine. func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { if args.Flags&^SupportedCloneFlags != 0 { return 0, nil, linuxerr.EINVAL } // Since signal actions may refer to application signal handlers by virtual // address, any set of signal handlers must refer to the same address // space. if args.Flags&(linux.CLONE_SIGHAND|linux.CLONE_VM) == linux.CLONE_SIGHAND { return 0, nil, linuxerr.EINVAL } if args.SetTID != 0 { return 0, nil, linuxerr.ENOTSUP } // In order for the behavior of thread-group-directed signals to be sane, // all tasks in a thread group must share signal handlers. if args.Flags&(linux.CLONE_THREAD|linux.CLONE_SIGHAND) == linux.CLONE_THREAD { return 0, nil, linuxerr.EINVAL } // All tasks in a thread group must be in the same PID namespace. if (args.Flags&linux.CLONE_THREAD != 0) && (args.Flags&linux.CLONE_NEWPID != 0 || t.childPIDNamespace != nil) { return 0, nil, linuxerr.EINVAL } // The two different ways of specifying a new PID namespace are // incompatible. if args.Flags&linux.CLONE_NEWPID != 0 && t.childPIDNamespace != nil { return 0, nil, linuxerr.EINVAL } // Thread groups and FS contexts cannot span user namespaces. if args.Flags&linux.CLONE_NEWUSER != 0 && args.Flags&(linux.CLONE_THREAD|linux.CLONE_FS) != 0 { return 0, nil, linuxerr.EINVAL } // args.ExitSignal must be a valid signal. if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() { return 0, nil, linuxerr.EINVAL } if args.Flags&(linux.CLONE_FS|linux.CLONE_NEWNS) == linux.CLONE_FS|linux.CLONE_NEWNS { return 0, nil, linuxerr.EINVAL } // Pull task registers and FPU state, a cloned task will inherit the // state of the current task. 
if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil { t.Warningf("Unable to pull a full state: %v", err) t.forceSignal(linux.SIGILL, true /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGILL)) return 0, nil, linuxerr.EFAULT } // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a // single clone(2) or unshare(2) call, the user namespace is guaranteed to // be created first, giving the child (clone(2)) or caller (unshare(2)) // privileges over the remaining namespaces created by the call." - // user_namespaces(7) creds := t.Credentials() userns := creds.UserNamespace if args.Flags&linux.CLONE_NEWUSER != 0 { var err error // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and // the caller is in a chroot environment (i.e., the caller's root // directory does not match the root directory of the mount namespace // in which it resides)." - clone(2). Neither chroot(2) nor // user_namespaces(7) document this. if t.IsChrooted() { return 0, nil, linuxerr.EPERM } userns, err = creds.NewChildUserNamespace() if err != nil { return 0, nil, err } } if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { return 0, nil, linuxerr.EPERM } cu := cleanup.Make(func() {}) defer cu.Clean() utsns := t.utsns if args.Flags&linux.CLONE_NEWUTS != 0 { // Note that this must happen after NewUserNamespace so we get // the new userns if there is one. utsns = utsns.Clone(userns) utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, utsns)) } else { utsns.IncRef() } cu.Add(func() { utsns.DecRef(t) }) ipcns := t.ipcns if args.Flags&linux.CLONE_NEWIPC != 0 { ipcns = NewIPCNamespace(userns) ipcns.InitPosixQueues(t, t.k.VFS(), creds) ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, ipcns)) } else { ipcns.IncRef() } cu.Add(func() { ipcns.DecRef(t) }) netns := t.netns if args.Flags&linux.CLONE_NEWNET != 0 { netns = inet.NewNamespace(netns, userns) inode := nsfs.NewInode(t, t.k.nsfsMount, netns) netns.SetInode(inode) } else { netns.IncRef() } cu.Add(func() { netns.DecRef(t) }) // We must hold t.mu to access t.image, but we can't hold it during Fork(), // since TaskImage.Fork()=>mm.Fork() takes mm.addressSpaceMu, which is ordered // above Task.mu. So we copy t.image with t.mu held and call Fork() on the copy. t.mu.Lock() curImage := t.image sessionKeyring := t.sessionKeyring t.mu.Unlock() image, err := curImage.Fork(t, t.k, args.Flags&linux.CLONE_VM != 0) if err != nil { return 0, nil, err } cu.Add(func() { image.release(t) }) if args.Flags&linux.CLONE_NEWUSER != 0 { // If the task is in a new user namespace, it cannot share keys. sessionKeyring = nil } // clone() returns 0 in the child. 
image.Arch.SetReturn(0) if args.Stack != 0 { image.Arch.SetStack(uintptr(args.Stack + args.StackSize)) } if args.Flags&linux.CLONE_SETTLS != 0 { if !image.Arch.SetTLS(uintptr(args.TLS)) { return 0, nil, linuxerr.EPERM } } var fsContext *FSContext if args.Flags&linux.CLONE_FS == 0 || args.Flags&linux.CLONE_NEWNS != 0 { fsContext = t.fsContext.Fork() } else { fsContext = t.fsContext fsContext.IncRef() } mntns := t.mountNamespace if args.Flags&linux.CLONE_NEWNS != 0 { var err error mntns, err = t.k.vfs.CloneMountNamespace(t, creds, mntns, &fsContext.root, &fsContext.cwd, t.k) if err != nil { return 0, nil, err } } else { mntns.IncRef() } cu.Add(func() { mntns.DecRef(t) }) var fdTable *FDTable if args.Flags&linux.CLONE_FILES == 0 { fdTable = t.fdTable.Fork(t, MaxFdLimit) } else { fdTable = t.fdTable fdTable.IncRef() } pidns := t.tg.pidns if t.childPIDNamespace != nil { pidns = t.childPIDNamespace } else if args.Flags&linux.CLONE_NEWPID != 0 { pidns = pidns.NewChild(userns) } tg := t.tg rseqAddr := hostarch.Addr(0) rseqSignature := uint32(0) if args.Flags&linux.CLONE_THREAD == 0 { sh := t.tg.signalHandlers if args.Flags&linux.CLONE_SIGHAND == 0 { sh = sh.Fork() } tg = t.k.NewThreadGroup(pidns, sh, linux.Signal(args.ExitSignal), tg.limits.GetCopy()) tg.oomScoreAdj = atomicbitops.FromInt32(t.tg.oomScoreAdj.Load()) rseqAddr = t.rseqAddr rseqSignature = t.rseqSignature } uc := t.userCounters if uc.uid != creds.RealKUID { uc = t.k.GetUserCounters(creds.RealKUID) } cfg := &TaskConfig{ Kernel: t.k, ThreadGroup: tg, SignalMask: t.SignalMask(), TaskImage: image, FSContext: fsContext, FDTable: fdTable, Credentials: creds, Niceness: t.Niceness(), NetworkNamespace: netns, AllowedCPUMask: t.CPUMask(), UTSNamespace: utsns, IPCNamespace: ipcns, MountNamespace: mntns, RSeqAddr: rseqAddr, RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), UserCounters: uc, SessionKeyring: sessionKeyring, Origin: t.Origin, } if args.Flags&linux.CLONE_THREAD == 0 { cfg.Parent = t } else { cfg.InheritParent = t } nt, err := t.tg.pidns.owner.NewTask(t, cfg) // If NewTask succeeds, we transfer references to nt. If NewTask fails, it does // the cleanup for us. cu.Release() if err != nil { return 0, nil, err } // "A child process created via fork(2) inherits a copy of its parent's // alternate signal stack settings" - sigaltstack(2). // // However kernel/fork.c:copy_process() adds a limitation to this: // "sigaltstack should be cleared when sharing the same VM". if args.Flags&linux.CLONE_VM == 0 || args.Flags&linux.CLONE_VFORK != 0 { nt.SetSignalStack(t.SignalStack()) } if userns != creds.UserNamespace { if err := nt.SetUserNamespace(userns); err != nil { // This shouldn't be possible: userns was created from nt.creds, so // nt should have CAP_SYS_ADMIN in userns. panic("Task.Clone: SetUserNamespace failed: " + err.Error()) } } // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to // nt that it must receive before its task goroutine starts running. tid := nt.k.tasks.Root.IDOfTask(nt) defer nt.Start(tid) if seccheck.Global.Enabled(seccheck.PointClone) { mask, info := getCloneSeccheckInfo(t, nt, args.Flags) if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error { return c.Clone(t, mask, info) }); err != nil { // nt has been visible to the rest of the system since NewTask, so // it may be blocking execve or a group stop, have been notified // for group signal delivery, had children reparented to it, etc. // Thus we can't just drop it on the floor. 
Instead, instruct the // task goroutine to exit immediately, as quietly as possible. nt.exitTracerNotified = true nt.exitTracerAcked = true nt.exitParentNotified = true nt.exitParentAcked = true nt.runState = (*runExitMain)(nil) return 0, nil, err } } // "If fork/clone and execve are allowed by @prog, any child processes will // be constrained to the same filters and system call ABI as the parent." - // Documentation/prctl/seccomp_filter.txt if ts := t.seccomp.Load(); ts != nil { seccompCopy := ts.copy() seccompCopy.populateCache(nt) nt.seccomp.Store(seccompCopy) } else { nt.seccomp.Store(nil) } if args.Flags&linux.CLONE_VFORK != 0 { nt.vforkParent = t } if args.Flags&linux.CLONE_CHILD_CLEARTID != 0 { nt.SetClearTID(hostarch.Addr(args.ChildTID)) } if args.Flags&linux.CLONE_CHILD_SETTID != 0 { ctid := nt.ThreadID() ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(args.ChildTID)) } ntid := t.tg.pidns.IDOfTask(nt) if args.Flags&linux.CLONE_PARENT_SETTID != 0 { ntid.CopyOut(t, hostarch.Addr(args.ParentTID)) } t.traceCloneEvent(tid) kind := ptraceCloneKindClone if args.Flags&linux.CLONE_VFORK != 0 { kind = ptraceCloneKindVfork } else if linux.Signal(args.ExitSignal) == linux.SIGCHLD { kind = ptraceCloneKindFork } if t.ptraceClone(kind, nt, args) { if args.Flags&linux.CLONE_VFORK != 0 { return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil } return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil } if args.Flags&linux.CLONE_VFORK != 0 { t.maybeBeginVforkStop(nt) return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil } return ntid, nil, nil } func getCloneSeccheckInfo(t, nt *Task, flags uint64) (seccheck.FieldSet, *pb.CloneInfo) { fields := seccheck.Global.GetFieldSet(seccheck.PointClone) var cwd string if fields.Context.Contains(seccheck.FieldCtxtCwd) { cwd = getTaskCurrentWorkingDirectory(t) } t.k.tasks.mu.RLock() defer t.k.tasks.mu.RUnlock() info := &pb.CloneInfo{ CreatedThreadId: int32(nt.k.tasks.Root.tids[nt]), CreatedThreadGroupId: int32(nt.k.tasks.Root.tgids[nt.tg]), CreatedThreadStartTimeNs: nt.startTime.Nanoseconds(), Flags: flags, } if !fields.Context.Empty() { info.ContextData = &pb.ContextData{} LoadSeccheckDataLocked(t, fields.Context, info.ContextData, cwd) } return fields, info } // maybeBeginVforkStop checks if a previously-started vfork child is still // running and has not yet released its MM, such that its parent t should enter // a vforkStop. // // Preconditions: The caller must be running on t's task goroutine. func (t *Task) maybeBeginVforkStop(child *Task) { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() if t.killedLocked() { child.vforkParent = nil return } if child.vforkParent == t { t.beginInternalStopLocked((*vforkStop)(nil)) } } func (t *Task) unstopVforkParent() { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if p := t.vforkParent; p != nil { p.tg.signalHandlers.mu.Lock() defer p.tg.signalHandlers.mu.Unlock() if _, ok := p.stop.(*vforkStop); ok { p.endInternalStopLocked() } // Parent no longer needs to be unstopped. t.vforkParent = nil } } // +stateify savable type runSyscallAfterPtraceEventClone struct { vforkChild *Task // If vforkChild is not nil, vforkChildTID is its thread ID in the parent's // PID namespace. vforkChildTID must be stored since the child may exit and // release its TID before the PTRACE_EVENT stop ends. 
vforkChildTID ThreadID } func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { if r.vforkChild != nil { t.maybeBeginVforkStop(r.vforkChild) return &runSyscallAfterVforkStop{r.vforkChildTID} } return (*runSyscallExit)(nil) } // +stateify savable type runSyscallAfterVforkStop struct { // childTID has the same meaning as // runSyscallAfterPtraceEventClone.vforkChildTID. childTID ThreadID } func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { t.ptraceVforkDone(r.childTID) return (*runSyscallExit)(nil) } // Setns reassociates thread with the specified namespace. func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error { d, ok := fd.Dentry().Impl().(*kernfs.Dentry) if !ok { return linuxerr.EINVAL } i, ok := d.Inode().(*nsfs.Inode) if !ok { return linuxerr.EINVAL } switch ns := i.Namespace().(type) { case *inet.Namespace: if flags != 0 && flags != linux.CLONE_NEWNET { return linuxerr.EINVAL } if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) || !t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } oldNS := t.NetworkNamespace() ns.IncRef() t.mu.Lock() t.netns = ns t.mu.Unlock() oldNS.DecRef(t) return nil case *IPCNamespace: if flags != 0 && flags != linux.CLONE_NEWIPC { return linuxerr.EINVAL } if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) || !t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } oldNS := t.IPCNamespace() ns.IncRef() t.mu.Lock() t.ipcns = ns t.mu.Unlock() oldNS.DecRef(t) return nil case *vfs.MountNamespace: if flags != 0 && flags != linux.CLONE_NEWNS { return linuxerr.EINVAL } if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.Owner) || !t.Credentials().HasCapability(linux.CAP_SYS_CHROOT) || !t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } oldFSContext := t.fsContext // The current task has to be an exclusive owner of its fs context. if oldFSContext.ReadRefs() != 1 { return linuxerr.EINVAL } fsContext := oldFSContext.Fork() fsContext.root.DecRef(t) fsContext.cwd.DecRef(t) vd := ns.Root(t) fsContext.root = vd vd.IncRef() fsContext.cwd = vd oldNS := t.mountNamespace ns.IncRef() t.mu.Lock() t.mountNamespace = ns t.fsContext = fsContext t.mu.Unlock() oldNS.DecRef(t) oldFSContext.DecRef(t) return nil case *UTSNamespace: if flags != 0 && flags != linux.CLONE_NEWUTS { return linuxerr.EINVAL } if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) || !t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } oldNS := t.UTSNamespace() ns.IncRef() t.mu.Lock() t.utsns = ns t.mu.Unlock() oldNS.DecRef(t) return nil default: return linuxerr.EINVAL } } // Unshare changes the set of resources t shares with other tasks, as specified // by flags. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) Unshare(flags int32) error { // "CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM can be specified in flags if // the caller is single threaded (i.e., it is not sharing its address space // with another process or thread). In this case, these flags have no // effect. (Note also that specifying CLONE_THREAD automatically implies // CLONE_VM, and specifying CLONE_VM automatically implies CLONE_SIGHAND.) // If the process is multithreaded, then the use of these flags results in // an error." - unshare(2). This is incorrect (cf. // kernel/fork.c:ksys_unshare()): // // - CLONE_THREAD does not imply CLONE_VM. // // - CLONE_SIGHAND implies CLONE_THREAD. 
// // - Only CLONE_VM requires that the caller is not sharing its address // space with another thread. CLONE_SIGHAND requires that the caller is not // sharing its signal handlers, and CLONE_THREAD requires that the caller // is the only thread in its thread group. // // Since we don't count the number of tasks using each address space or set // of signal handlers, we reject CLONE_VM and CLONE_SIGHAND altogether. if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 { return linuxerr.EINVAL } creds := t.Credentials() if flags&linux.CLONE_THREAD != 0 { t.tg.signalHandlers.mu.Lock() if t.tg.tasksCount != 1 { t.tg.signalHandlers.mu.Unlock() return linuxerr.EINVAL } t.tg.signalHandlers.mu.Unlock() // This isn't racy because we're the only living task, and therefore // the only task capable of creating new ones, in our thread group. } if flags&linux.CLONE_NEWUSER != 0 { if t.IsChrooted() { return linuxerr.EPERM } newUserNS, err := creds.NewChildUserNamespace() if err != nil { return err } err = t.SetUserNamespace(newUserNS) if err != nil { return err } // Need to reload creds, because t.SetUserNamespace() changed task credentials. creds = t.Credentials() } haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) if flags&linux.CLONE_NEWPID != 0 { if !haveCapSysAdmin { return linuxerr.EPERM } t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) } if flags&linux.CLONE_NEWNET != 0 { if !haveCapSysAdmin { return linuxerr.EPERM } netns := t.NetworkNamespace() netns = inet.NewNamespace(netns, t.UserNamespace()) netnsInode := nsfs.NewInode(t, t.k.nsfsMount, netns) netns.SetInode(netnsInode) t.mu.Lock() oldNetns := t.netns t.netns = netns t.mu.Unlock() oldNetns.DecRef(t) } cu := cleanup.Cleanup{} // All cu actions have to be executed after releasing t.mu. defer cu.Clean() t.mu.Lock() defer t.mu.Unlock() // Can't defer unlock: DecRefs must occur without holding t.mu. if flags&linux.CLONE_NEWUTS != 0 { if !haveCapSysAdmin { return linuxerr.EPERM } // Note that this must happen after NewUserNamespace, so the // new user namespace is used if there is one. oldUTSNS := t.utsns t.utsns = t.utsns.Clone(creds.UserNamespace) t.utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.utsns)) cu.Add(func() { oldUTSNS.DecRef(t) }) } if flags&linux.CLONE_NEWIPC != 0 { if !haveCapSysAdmin { return linuxerr.EPERM } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" oldIPCNS := t.ipcns t.ipcns = NewIPCNamespace(creds.UserNamespace) t.ipcns.InitPosixQueues(t, t.k.VFS(), creds) t.ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.ipcns)) cu.Add(func() { oldIPCNS.DecRef(t) }) } if flags&linux.CLONE_FILES != 0 { oldFDTable := t.fdTable t.fdTable = oldFDTable.Fork(t, MaxFdLimit) cu.Add(func() { oldFDTable.DecRef(t) }) } if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 { oldFSContext := t.fsContext t.fsContext = oldFSContext.Fork() cu.Add(func() { oldFSContext.DecRef(t) }) } if flags&linux.CLONE_NEWNS != 0 { if !haveCapSysAdmin { return linuxerr.EPERM } oldMountNS := t.mountNamespace mntns, err := t.k.vfs.CloneMountNamespace(t, creds, oldMountNS, &t.fsContext.root, &t.fsContext.cwd, t.k) if err != nil { return err } t.mountNamespace = mntns cu.Add(func() { oldMountNS.DecRef(t) }) } return nil } // UnshareFdTable unshares the FdTable that task t shares with other tasks, up to // maxFd. // // Preconditions: The caller must be running on the task goroutine. 
func (t *Task) UnshareFdTable(maxFd int32) { t.mu.Lock() oldFDTable := t.fdTable t.fdTable = oldFDTable.Fork(t, maxFd) t.mu.Unlock() oldFDTable.DecRef(t) } // vforkStop is a TaskStop imposed on a task that creates a child with // CLONE_VFORK or vfork(2), that ends when the child task ceases to use its // current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so // that the child and parent share mappings until the child execve()s into a // new process image or exits.) // // +stateify savable type vforkStop struct{} // StopIgnoresKill implements TaskStop.Killable. func (*vforkStop) Killable() bool { return true } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_context.go000066400000000000000000000110141465435605700252620ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/devutil" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // Deadline implements context.Context.Deadline. func (*Task) Deadline() (time.Time, bool) { return time.Time{}, false } // Done implements context.Context.Done. func (*Task) Done() <-chan struct{} { return nil } // Err implements context.Context.Err. func (*Task) Err() error { return nil } // Value implements context.Context.Value. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) Value(key any) any { // This function is very hot; skip this check outside of +race builds. 
if sync.RaceEnabled { t.assertTaskGoroutine() } return t.contextValue(key, true /* isTaskGoroutine */) } func (t *Task) contextValue(key any, isTaskGoroutine bool) any { switch key { case CtxCanTrace: return t.CanTrace case CtxKernel: return t.k case CtxPIDNamespace: return t.tg.pidns case CtxUTSNamespace: if !isTaskGoroutine { t.mu.Lock() defer t.mu.Unlock() } utsns := t.utsns utsns.IncRef() return utsns case ipc.CtxIPCNamespace: if !isTaskGoroutine { t.mu.Lock() defer t.mu.Unlock() } ipcns := t.ipcns ipcns.IncRef() return ipcns case CtxTask: return t case auth.CtxCredentials: return t.creds.Load() case auth.CtxThreadGroupID: return int32(t.tg.ID()) case vfs.CtxRoot: if !isTaskGoroutine { t.mu.Lock() defer t.mu.Unlock() } return t.fsContext.RootDirectory() case vfs.CtxMountNamespace: if !isTaskGoroutine { t.mu.Lock() defer t.mu.Unlock() } t.mountNamespace.IncRef() return t.mountNamespace case devutil.CtxDevGoferClient: return t.k.GetDevGoferClient(t.k.ContainerName(t.containerID)) case inet.CtxStack: return t.NetworkContext() case inet.CtxNamespaceByFD: return t.NetworkNamespaceByFD case ktime.CtxRealtimeClock: return t.k.RealtimeClock() case limits.CtxLimits: return t.tg.limits case linux.CtxSignalNoInfoFunc: return func(sig linux.Signal) error { return t.SendSignal(SignalInfoNoInfo(sig, t, t)) } case pgalloc.CtxMemoryCgroupID: return t.memCgID.Load() case pgalloc.CtxMemoryFile: return t.k.mf case platform.CtxPlatform: return t.k case shm.CtxDeviceID: return t.k.sysVShmDevID case uniqueid.CtxGlobalUniqueID: return t.k.UniqueID() case uniqueid.CtxGlobalUniqueIDProvider: return t.k case uniqueid.CtxInotifyCookie: return t.k.GenerateInotifyCookie() case unimpl.CtxEvents: return t.k case cpuid.CtxFeatureSet: return t.k.featureSet default: return nil } } // fallbackContext adds a level of indirection for embedding to resolve // ambiguity for method resolution. We favor context.NoTask. type fallbackTask struct { *Task } // taskAsyncContext implements context.Context for a goroutine that performs // work on behalf of a Task, but is not the task goroutine. type taskAsyncContext struct { context.NoTask fallbackTask } // Value implements context.Context.Value. func (t *taskAsyncContext) Value(key any) any { return t.fallbackTask.contextValue(key, false /* isTaskGoroutine */) } // AsyncContext returns a context.Context representing t. The returned // context.Context is intended for use by goroutines other than t's task // goroutine; for example, signal delivery to t will not interrupt goroutines // that are blocking using the returned context.Context. func (t *Task) AsyncContext() context.Context { return &taskAsyncContext{ fallbackTask: fallbackTask{t}, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_exec.go000066400000000000000000000310521465435605700245260ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel // This file implements the machinery behind the execve() syscall. 
In brief, a // thread executes an execve() by killing all other threads in its thread // group, assuming the leader's identity, and then switching process images. // // This design is effectively mandated by Linux. From ptrace(2): // // """ // execve(2) under ptrace // When one thread in a multithreaded process calls execve(2), the // kernel destroys all other threads in the process, and resets the // thread ID of the execing thread to the thread group ID (process ID). // (Or, to put things another way, when a multithreaded process does an // execve(2), at completion of the call, it appears as though the // execve(2) occurred in the thread group leader, regardless of which // thread did the execve(2).) This resetting of the thread ID looks // very confusing to tracers: // // * All other threads stop in PTRACE_EVENT_EXIT stop, if the // PTRACE_O_TRACEEXIT option was turned on. Then all other threads // except the thread group leader report death as if they exited via // _exit(2) with exit code 0. // // * The execing tracee changes its thread ID while it is in the // execve(2). (Remember, under ptrace, the "pid" returned from // waitpid(2), or fed into ptrace calls, is the tracee's thread ID.) // That is, the tracee's thread ID is reset to be the same as its // process ID, which is the same as the thread group leader's thread // ID. // // * Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC // option was turned on. // // * If the thread group leader has reported its PTRACE_EVENT_EXIT stop // by this time, it appears to the tracer that the dead thread leader // "reappears from nowhere". (Note: the thread group leader does not // report death via WIFEXITED(status) until there is at least one // other live thread. This eliminates the possibility that the // tracer will see it dying and then reappearing.) If the thread // group leader was still alive, for the tracer this may look as if // thread group leader returns from a different system call than it // entered, or even "returned from a system call even though it was // not in any system call". If the thread group leader was not // traced (or was traced by a different tracer), then during // execve(2) it will appear as if it has become a tracee of the // tracer of the execing tracee. // // All of the above effects are the artifacts of the thread ID change in // the tracee. // """ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/seccheck" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // execStop is a TaskStop that a task sets on itself when it wants to execve // and is waiting for the other tasks in its thread group to exit first. // // +stateify savable type execStop struct{} // Killable implements TaskStop.Killable. func (*execStop) Killable() bool { return true } // Execve implements the execve(2) syscall by killing all other tasks in its // thread group and switching to newImage. Execve always takes ownership of // newImage. // // If executable is not nil, it is the first executable file that was loaded in // the process of obtaining newImage, and pathname is a path to it. // // Preconditions: The caller must be running Task.doSyscallInvoke on the task // goroutine. 
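//
// As a rough, illustrative sketch only (not the actual execve(2)/execveat(2)
// handler): a syscall implementation that has already built newImage would
// typically hand it off like this, returning the SyscallControl so the task
// run loop performs the post-exec-stop work. All names other than Execve's
// own parameters are assumptions for the example.
//
//	ctrl, err := t.Execve(newImage, argv, env, executable, pathname)
//	if err != nil {
//		return 0, nil, err
//	}
//	return 0, ctrl, nil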
func (t *Task) Execve(newImage *TaskImage, argv, env []string, executable *vfs.FileDescription, pathname string) (*SyscallControl, error) { cu := cleanup.Make(func() { newImage.release(t) }) defer cu.Clean() // We can't clearly hold kernel package locks while stat'ing executable. if seccheck.Global.Enabled(seccheck.PointExecve) { mask, info := getExecveSeccheckInfo(t, argv, env, executable, pathname) if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error { return c.Execve(t, mask, info) }); err != nil { return nil, err } } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() if t.tg.exiting || t.tg.execing != nil { // We lost to a racing group-exit, kill, or exec from another thread // and should just exit. return nil, linuxerr.EINTR } // Cancel any racing group stops. t.tg.endGroupStopLocked(false) // If the task has any siblings, they have to exit before the exec can // continue. t.tg.execing = t if t.tg.tasks.Front() != t.tg.tasks.Back() { // "[All] other threads except the thread group leader report death as // if they exited via _exit(2) with exit code 0." - ptrace(2) for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { if t != sibling { sibling.killLocked() } } // The last sibling to exit will wake t. t.beginInternalStopLocked((*execStop)(nil)) } cu.Release() return &SyscallControl{next: &runSyscallAfterExecStop{newImage}, ignoreReturn: true}, nil } // The runSyscallAfterExecStop state continues execve(2) after all siblings of // a thread in the execve syscall have exited. // // +stateify savable type runSyscallAfterExecStop struct { image *TaskImage } func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.traceExecEvent(r.image) t.tg.pidns.owner.mu.Lock() t.tg.execing = nil if t.killed() { t.tg.pidns.owner.mu.Unlock() r.image.release(t) return (*runInterrupt)(nil) } // We are the thread group leader now. Save our old thread ID for // PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this // point it will get a PID of 0, but this is consistent with Linux. oldTID := ThreadID(0) if tracer := t.Tracer(); tracer != nil { oldTID = tracer.tg.pidns.tids[t] } t.promoteLocked() // "POSIX timers are not preserved (timer_create(2))." - execve(2). Handle // this first since POSIX timers are protected by the signal mutex, which // we're about to change. Note that we have to stop and destroy timers // without holding any mutexes to avoid circular lock ordering. var its []*IntervalTimer t.tg.signalHandlers.mu.Lock() for _, it := range t.tg.timers { its = append(its, it) } clear(t.tg.timers) t.tg.signalHandlers.mu.Unlock() t.tg.pidns.owner.mu.Unlock() for _, it := range its { it.DestroyTimer() } t.tg.pidns.owner.mu.Lock() // "During an execve(2), the dispositions of handled signals are reset to // the default; the dispositions of ignored signals are left unchanged. ... // [The] signal mask is preserved across execve(2). ... [The] pending // signal set is preserved across an execve(2)." - signal(7) // // Details: // // - If the thread group is sharing its signal handlers with another thread // group via CLONE_SIGHAND, execve forces the signal handlers to be copied // (see Linux's fs/exec.c:de_thread). We're not reference-counting signal // handlers, so we always make a copy. // // - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags, // restorer (if present), and mask are always reset. 
(See Linux's // fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.) t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec() t.endStopCond.L = &t.tg.signalHandlers.mu // "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2) t.signalStack = linux.SignalStack{Flags: linux.SS_DISABLE} // "The termination signal is reset to SIGCHLD (see clone(2))." t.tg.terminationSignal = linux.SIGCHLD // execed indicates that the process can no longer join a process group // in some scenarios (namely, the parent call setpgid(2) on the child). // See the JoinProcessGroup function in sessions.go for more context. t.tg.execed = true // Maximum RSS is preserved across execve(2). t.updateRSSLocked() // Restartable sequence state is discarded. t.rseqPreempted = false t.rseqCPU = -1 t.rseqAddr = 0 t.rseqSignature = 0 t.oldRSeqCPUAddr = 0 t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) t.tg.pidns.owner.mu.Unlock() oldFDTable := t.fdTable t.fdTable = t.fdTable.Fork(t, int32(t.fdTable.CurrentMaxFDs())) oldFDTable.DecRef(t) // Remove FDs with the CloseOnExec flag set. t.fdTable.RemoveIf(t, func(_ *vfs.FileDescription, flags FDFlags) bool { return flags.CloseOnExec }) // Handle the robust futex list. t.exitRobustList() // NOTE(b/30815691): We currently do not implement privileged // executables (set-user/group-ID bits and file capabilities). This // allows us to unconditionally enable user dumpability on the new mm. // See fs/exec.c:setup_new_exec. r.image.MemoryManager.SetDumpability(mm.UserDumpable) // Switch to the new process. t.MemoryManager().Deactivate() t.mu.Lock() // Update credentials to reflect the execve. This should precede switching // MMs to ensure that dumpability has been reset first, if needed. t.updateCredsForExecLocked() oldImage := t.image t.image = *r.image t.mu.Unlock() // Don't hold t.mu while calling t.image.release(), that may // attempt to acquire TaskImage.MemoryManager.mappingMu, a lock order // violation. oldImage.release(t) t.unstopVforkParent() t.p.FullStateChanged() // NOTE(b/30316266): All locks must be dropped prior to calling Activate. t.MemoryManager().Activate(t) t.ptraceExec(oldTID) return (*runSyscallExit)(nil) } // promoteLocked makes t the leader of its thread group. If t is already the // thread group leader, promoteLocked is a no-op. // // Preconditions: // - All other tasks in t's thread group, including the existing leader (if it // is not t), have reached TaskExitZombie. // - The TaskSet mutex must be locked for writing. func (t *Task) promoteLocked() { oldLeader := t.tg.leader if t == oldLeader { return } // Swap the leader's TIDs with the execing task's. The latter will be // released when the old leader is reaped below. for ns := t.tg.pidns; ns != nil; ns = ns.parent { oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader] ns.tids[oldLeader] = oldTID ns.tids[t] = leaderTID ns.tasks[oldTID] = oldLeader ns.tasks[leaderTID] = t // Neither the ThreadGroup nor TGID change, so no need to // update ns.tgids. } // Inherit the old leader's start time. oldStartTime := oldLeader.StartTime() t.mu.Lock() t.startTime = oldStartTime t.mu.Unlock() t.tg.leader = t t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t]) t.updateInfoLocked() // Reap the original leader. If it has a tracer, detach it instead of // waiting for it to acknowledge the original leader's death. 
oldLeader.exitParentNotified = true oldLeader.exitParentAcked = true if tracer := oldLeader.Tracer(); tracer != nil { delete(tracer.ptraceTracees, oldLeader) oldLeader.forgetTracerLocked() // Notify the tracer that it will no longer be receiving these events // from the tracee. tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue) } oldLeader.exitNotifyLocked(false) } func getExecveSeccheckInfo(t *Task, argv, env []string, executable *vfs.FileDescription, pathname string) (seccheck.FieldSet, *pb.ExecveInfo) { fields := seccheck.Global.GetFieldSet(seccheck.PointExecve) info := &pb.ExecveInfo{ Argv: argv, Env: env, } if executable != nil { info.BinaryPath = pathname if fields.Local.Contains(seccheck.FieldSentryExecveBinaryInfo) { statOpts := vfs.StatOptions{ Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID, } if stat, err := executable.Stat(t, statOpts); err == nil { if stat.Mask&(linux.STATX_TYPE|linux.STATX_MODE) == (linux.STATX_TYPE | linux.STATX_MODE) { info.BinaryMode = uint32(stat.Mode) } if stat.Mask&linux.STATX_UID != 0 { info.BinaryUid = stat.UID } if stat.Mask&linux.STATX_GID != 0 { info.BinaryGid = stat.GID } } } } if !fields.Context.Empty() { info.ContextData = &pb.ContextData{} LoadSeccheckData(t, fields.Context, info.ContextData) } return fields, info } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_exit.go000066400000000000000000001252341465435605700245610ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel // This file implements the task exit cycle: // // - Tasks are asynchronously requested to exit with Task.Kill. // // - When able, the task goroutine enters the exit path starting from state // runExit. // // - Other tasks observe completed exits with Task.Wait (which implements the // wait*() family of syscalls). import ( "errors" "fmt" "strconv" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/seccheck" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" "gvisor.dev/gvisor/pkg/waiter" ) // TaskExitState represents a step in the task exit path. // // "Exiting" and "exited" are often ambiguous; prefer to name specific states. type TaskExitState int const ( // TaskExitNone indicates that the task has not begun exiting. TaskExitNone TaskExitState = iota // TaskExitInitiated indicates that the task goroutine has entered the exit // path, and the task is no longer eligible to participate in group stops // or group signal handling. TaskExitInitiated is analogous to Linux's // PF_EXITING. TaskExitInitiated // TaskExitZombie indicates that the task has released its resources, and // the task no longer prevents a sibling thread from completing execve. 
TaskExitZombie // TaskExitDead indicates that the task's thread IDs have been released, // and the task no longer prevents its thread group leader from being // reaped. ("Reaping" refers to the transitioning of a task from // TaskExitZombie to TaskExitDead.) TaskExitDead ) // String implements fmt.Stringer. func (t TaskExitState) String() string { switch t { case TaskExitNone: return "TaskExitNone" case TaskExitInitiated: return "TaskExitInitiated" case TaskExitZombie: return "TaskExitZombie" case TaskExitDead: return "TaskExitDead" default: return strconv.Itoa(int(t)) } } // killLocked marks t as killed by enqueueing a SIGKILL, without causing the // thread-group-affecting side effects SIGKILL usually has. // // Preconditions: The signal mutex must be locked. func (t *Task) killLocked() { // Clear killable stops. if t.stop != nil && t.stop.Killable() { t.endInternalStopLocked() } t.pendingSignals.enqueue(&linux.SignalInfo{ Signo: int32(linux.SIGKILL), // Linux just sets SIGKILL in the pending signal bitmask without // enqueueing an actual siginfo, such that // kernel/signal.c:collect_signal() initializes si_code to SI_USER. Code: linux.SI_USER, }, nil) t.interrupt() } // killed returns true if t has a SIGKILL pending. killed is analogous to // Linux's fatal_signal_pending(). // // Preconditions: The caller must be running on the task goroutine. func (t *Task) killed() bool { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() return t.killedLocked() } func (t *Task) killedLocked() bool { return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0 } // PrepareExit indicates an exit with the given status. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) PrepareExit(ws linux.WaitStatus) { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() last := t.tg.activeTasks == 1 if last { t.prepareGroupExitLocked(ws) return } t.exitStatus = ws } // PrepareGroupExit indicates a group exit with status es to t's thread group. // // PrepareGroupExit is analogous to Linux's do_group_exit(), except that it // does not tail-call do_exit(), except that it *does* set Task.exitStatus. // (Linux does not do so until within do_exit(), since it reuses exit_code for // ptrace.) // // Preconditions: The caller must be running on the task goroutine. func (t *Task) PrepareGroupExit(ws linux.WaitStatus) { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.prepareGroupExitLocked(ws) } // Preconditions: // - The caller must be running on the task goroutine. // - The signal mutex must be locked. func (t *Task) prepareGroupExitLocked(ws linux.WaitStatus) { if t.tg.exiting || t.tg.execing != nil { // Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e. // this "group exit" is being executed by the killed sibling of an // execing task, then Task.Execve never set t.tg.exitStatus, so it's // still the zero value. This is consistent with Linux, both in intent // ("all other threads ... report death as if they exited via _exit(2) // with exit code 0" - ptrace(2), "execve under ptrace") and in // implementation (compare fs/exec.c:de_thread() => // kernel/signal.c:zap_other_threads() and // kernel/exit.c:do_group_exit() => // include/linux/sched.h:signal_group_exit()). 
t.exitStatus = t.tg.exitStatus return } t.tg.exiting = true t.tg.exitStatus = ws t.exitStatus = ws for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { if sibling != t { sibling.killLocked() } } } // Kill requests that all tasks in ts exit as if group exiting with status ws. // Kill does not wait for tasks to exit. // // Kill has no analogue in Linux; it's provided for save/restore only. func (ts *TaskSet) Kill(ws linux.WaitStatus) { ts.mu.Lock() defer ts.mu.Unlock() ts.Root.exiting = true for t := range ts.Root.tids { t.tg.signalHandlers.mu.Lock() if !t.tg.exiting { t.tg.exiting = true t.tg.exitStatus = ws } t.killLocked() t.tg.signalHandlers.mu.Unlock() } } // advanceExitStateLocked checks that t's current exit state is oldExit, then // sets it to newExit. If t's current exit state is not oldExit, // advanceExitStateLocked panics. // // Preconditions: The TaskSet mutex must be locked. func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) { if t.exitState != oldExit { panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState)) } t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit) t.exitState = newExit } // runExit is the entry point into the task exit path. // // +stateify savable type runExit struct{} func (*runExit) execute(t *Task) taskRunState { t.ptraceExit() return (*runExitMain)(nil) } // +stateify savable type runExitMain struct{} func (*runExitMain) execute(t *Task) taskRunState { t.traceExitEvent() if seccheck.Global.Enabled(seccheck.PointTaskExit) { info := &pb.TaskExit{ ExitStatus: int32(t.tg.exitStatus), } fields := seccheck.Global.GetFieldSet(seccheck.PointTaskExit) if !fields.Context.Empty() { info.ContextData = &pb.ContextData{} LoadSeccheckData(t, fields.Context, info.ContextData) } seccheck.Global.SentToSinks(func(c seccheck.Sink) error { return c.TaskExit(t, fields, info) }) } lastExiter := t.exitThreadGroup() t.ResetKcov() // If the task has a cleartid, and the thread group wasn't killed by a // signal, handle that before releasing the MM. if t.cleartid != 0 { t.tg.signalHandlers.mu.Lock() signaled := t.tg.exiting && t.tg.exitStatus.Signaled() t.tg.signalHandlers.mu.Unlock() if !signaled { zero := ThreadID(0) if _, err := zero.CopyOut(t, t.cleartid); err == nil { t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1) } // If the CopyOut fails, there's nothing we can do. } } // Handle the robust futex list. t.exitRobustList() // Deactivate the address space and update max RSS before releasing the // task's MM. t.Deactivate() t.tg.pidns.owner.mu.Lock() t.updateRSSLocked() t.tg.pidns.owner.mu.Unlock() // Release the task image resources. Accessing these fields must be // done with t.mu held, but the mm.DecUsers() call must be done outside // of that lock. t.mu.Lock() mm := t.image.MemoryManager t.image.MemoryManager = nil t.image.fu = nil t.mu.Unlock() mm.DecUsers(t) // Releasing the MM unblocks a blocked CLONE_VFORK parent. t.unstopVforkParent() t.fsContext.DecRef(t) t.fdTable.DecRef(t) // Detach task from all cgroups. This must happen before potentially the // last ref to the cgroupfs mount is dropped below. t.LeaveCgroups() t.mu.Lock() mntns := t.mountNamespace t.mountNamespace = nil utsns := t.utsns t.utsns = nil ipcns := t.ipcns t.ipcns = nil netns := t.netns t.netns = nil t.mu.Unlock() mntns.DecRef(t) utsns.DecRef(t) ipcns.DecRef(t) netns.DecRef(t) // If this is the last task to exit from the thread group, release the // thread group's resources. 
if lastExiter { t.tg.Release(t) } // Detach tracees. t.exitPtrace() // Reparent the task's children. t.exitChildren() // Don't tail-call runExitNotify, as exitChildren may have initiated a stop // to wait for a PID namespace to die. return (*runExitNotify)(nil) } // exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread // group that it is no longer eligible to participate in group activities. It // returns true if t is the last task in its thread group to call // exitThreadGroup. func (t *Task) exitThreadGroup() bool { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() t.tg.signalHandlers.mu.Lock() // Can't defer unlock: see below. t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated) t.tg.activeTasks-- last := t.tg.activeTasks == 0 // Ensure that someone will handle the signals we can't. t.setSignalMaskLocked(^linux.SignalSet(0)) // Check if this task's exit interacts with an initiated group stop. if !t.groupStopPending { t.tg.signalHandlers.mu.Unlock() return last } t.groupStopPending = false sig := t.tg.groupStopSignal notifyParent := t.participateGroupStopLocked() // signalStop must be called with t's signal mutex unlocked. t.tg.signalHandlers.mu.Unlock() if notifyParent && t.tg.leader.parent != nil { t.tg.leader.parent.signalStop(t, linux.CLD_STOPPED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) } return last } func (t *Task) exitChildren() { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() newParent := t.findReparentTargetLocked() if newParent == nil { // "If the init process of a PID namespace terminates, the kernel // terminates all of the processes in the namespace via a SIGKILL // signal." - pid_namespaces(7) t.Debugf("Init process terminating, killing namespace") t.tg.pidns.exiting = true for other := range t.tg.pidns.tgids { if other == t.tg { continue } other.signalHandlers.mu.Lock() other.leader.sendSignalLocked(&linux.SignalInfo{ Signo: int32(linux.SIGKILL), }, true /* group */) other.signalHandlers.mu.Unlock() } // TODO(b/37722272): The init process waits for all processes in the // namespace to exit before completing its own exit // (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all // other tasks in the namespace are dead, except possibly for this // thread group's leader (which can't be reaped until this task exits). } // This is correct even if newParent is nil (it ensures that children don't // wait for a parent to reap them.) for c := range t.children { if sig := c.ParentDeathSignal(); sig != 0 { siginfo := &linux.SignalInfo{ Signo: int32(sig), Code: linux.SI_USER, } siginfo.SetPID(int32(c.tg.pidns.tids[t])) siginfo.SetUID(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow())) c.tg.signalHandlers.mu.Lock() c.sendSignalLocked(siginfo, true /* group */) c.tg.signalHandlers.mu.Unlock() } c.reparentLocked(newParent) if newParent != nil { newParent.children[c] = struct{}{} } } } // findReparentTargetLocked returns the task to which t's children should be // reparented. If no such task exists, findNewParentLocked returns nil. // // This corresponds to Linux's find_new_reaper(). // // Preconditions: The TaskSet mutex must be locked. func (t *Task) findReparentTargetLocked() *Task { // Reparent to any sibling in the same thread group that hasn't begun // exiting. if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil { return t2 } if !t.tg.hasChildSubreaper { // No child subreaper exists. We can immediately return the // init process in this PID namespace if it exists. 
if init := t.tg.pidns.tasks[initTID]; init != nil { return init.tg.anyNonExitingTaskLocked() } return nil } // Walk up the process tree until we either find a subreaper, or we hit // the init process in the PID namespace. for parent := t.parent; parent != nil; parent = parent.parent { if parent.tg.isInitInLocked(parent.PIDNamespace()) { // We found the init process for this pid namespace, // return a task from it. If the init process is // exiting, this might return nil. return parent.tg.anyNonExitingTaskLocked() } if parent.tg.isChildSubreaper { // We found a subreaper process. Return a non-exiting // task if there is one, otherwise keep walking up the // process tree. if target := parent.tg.anyNonExitingTaskLocked(); target != nil { return target } } } return nil } func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task { for t := tg.tasks.Front(); t != nil; t = t.Next() { if t.exitState == TaskExitNone { return t } } return nil } // reparentLocked changes t's parent. The new parent may be nil. // // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) reparentLocked(parent *Task) { oldParent := t.parent t.parent = parent if oldParent != nil { delete(oldParent.children, t) } if parent != nil { parent.children[t] = struct{}{} } // If a thread group leader's parent changes, reset the thread group's // termination signal to SIGCHLD and re-check exit notification. (Compare // kernel/exit.c:reparent_leader().) if t != t.tg.leader { return } if oldParent == nil && parent == nil { return } if oldParent != nil && parent != nil && oldParent.tg == parent.tg { return } t.tg.terminationSignal = linux.SIGCHLD if t.exitParentNotified && !t.exitParentAcked { t.exitParentNotified = false t.exitNotifyLocked(false) } } // When a task exits, other tasks in the system, notably the task's parent and // ptracer, may want to be notified. The exit notification system ensures that // interested tasks receive signals and/or are woken from blocking calls to // wait*() syscalls; these notifications must be resolved before exiting tasks // can be reaped and disappear from the system. // // Each task may have a parent task and/or a tracer task. If both a parent and // a tracer exist, they may be the same task, different tasks in the same // thread group, or tasks in different thread groups. (In the last case, Linux // refers to the task as being ptrace-reparented due to an implementation // detail; we avoid this terminology to avoid confusion.) // // A thread group is *empty* if all non-leader tasks in the thread group are // dead, and the leader is either a zombie or dead. The exit of a thread group // leader is never waitable - by either the parent or tracer - until the thread // group is empty. // // There are a few ways for an exit notification to be resolved: // // - The exit notification may be acknowledged by a call to Task.Wait with // WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall). // // - If the notified party is the parent, and the parent thread group is not // also the tracer thread group, and the notification signal is SIGCHLD, the // parent may explicitly ignore the notification (see quote in exitNotify). // Note that it's possible for the notified party to ignore the signal in other // cases, but the notification is only resolved under the above conditions. // (Actually, there is one exception; see the last paragraph of the "leader, // has tracer, tracer thread group is parent thread group" case below.) 
// // - If the notified party is the parent, and the parent does not exist, the // notification is resolved as if ignored. (This is only possible in the // sentry. In Linux, the only task / thread group without a parent is global // init, and killing global init causes a kernel panic.) // // - If the notified party is a tracer, the tracer may detach the traced task. // (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.) // // In addition, if the notified party is the parent, the parent may exit and // cause the notifying task to be reparented to another thread group. This does // not resolve the notification; instead, the notification must be resent to // the new parent. // // The series of notifications generated for a given task's exit depend on // whether it is a thread group leader; whether the task is ptraced; and, if // so, whether the tracer thread group is the same as the parent thread group. // // - Non-leader, no tracer: No notification is generated; the task is reaped // immediately. // // - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer // notification is resolved (by waiting or detaching), the task is reaped. (For // non-leaders, whether the tracer and parent thread groups are the same is // irrelevant.) // // - Leader, no tracer: The task remains a zombie, with no notification sent, // until all other tasks in the thread group are dead. (In Linux terms, this // condition is indicated by include/linux/sched.h:thread_group_empty(); tasks // are removed from their thread_group list in kernel/exit.c:release_task() => // __exit_signal() => __unhash_process().) Then the thread group's termination // signal is sent to the parent. When the parent notification is resolved (by // waiting or ignoring), the task is reaped. // // - Leader, has tracer, tracer thread group is not parent thread group: // SIGCHLD is sent to the tracer. When the tracer notification is resolved (by // waiting or detaching), and all other tasks in the thread group are dead, the // thread group's termination signal is sent to the parent. (Note that the // tracer cannot resolve the exit notification by waiting until the thread // group is empty.) When the parent notification is resolved, the task is // reaped. // // - Leader, has tracer, tracer thread group is parent thread group: // // If all other tasks in the thread group are dead, the thread group's // termination signal is sent to the parent. At this point, the notification // can only be resolved by waiting. If the parent detaches from the task as a // tracer, the notification is not resolved, but the notification can now be // resolved by waiting or ignoring. When the parent notification is resolved, // the task is reaped. // // If at least one task in the thread group is not dead, SIGCHLD is sent to the // parent. At this point, the notification cannot be resolved at all; once the // thread group becomes empty, it can be resolved only by waiting. If the // parent detaches from the task as a tracer before all remaining tasks die, // then exit notification proceeds as in the case where the leader never had a // tracer. If the parent detaches from the task as a tracer after all remaining // tasks die, the notification is not resolved, but the notification can now be // resolved by waiting or ignoring. When the parent notification is resolved, // the task is reaped. 
// // In both of the above cases, when the parent detaches from the task as a // tracer while the thread group is empty, whether or not the parent resolves // the notification by ignoring it is based on the parent's SIGCHLD signal // action, whether or not the thread group's termination signal is SIGCHLD // (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()). // // There is one final wrinkle: A leader can become a non-leader due to a // sibling execve. In this case, the execing thread detaches the leader's // tracer (if one exists) and reaps the leader immediately. In Linux, this is // in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked(). // +stateify savable type runExitNotify struct{} func (*runExitNotify) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie) t.tg.liveTasks-- // Check if this completes a sibling's execve. if t.tg.execing != nil && t.tg.liveTasks == 1 { // execing blocks the addition of new tasks to the thread group, so // the sole living task must be the execing one. e := t.tg.execing e.tg.signalHandlers.mu.Lock() if _, ok := e.stop.(*execStop); ok { e.endInternalStopLocked() } e.tg.signalHandlers.mu.Unlock() } t.exitNotifyLocked(false) // The task goroutine will now exit. return nil } // exitNotifyLocked is called after changes to t's state that affect exit // notification. // // If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace; // thanks to Linux's haphazard implementation of this functionality, such cases // determine whether parent notifications are ignored based on the parent's // handling of SIGCHLD, regardless of what the exited task's thread group's // termination signal is. // // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) exitNotifyLocked(fromPtraceDetach bool) { if t.exitState != TaskExitZombie { return } if !t.exitTracerNotified { t.exitTracerNotified = true tracer := t.Tracer() if tracer == nil { t.exitTracerAcked = true } else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg { // Don't set exitParentNotified if t is non-leader, even if the // tracer is in the parent thread group, so that if the parent // detaches the following call to exitNotifyLocked passes through // the !exitParentNotified case below and causes t to be reaped // immediately. // // Tracer notification doesn't care about about // SIG_IGN/SA_NOCLDWAIT. tracer.tg.signalHandlers.mu.Lock() tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */) tracer.tg.signalHandlers.mu.Unlock() // Wake EventTraceeStop waiters as well since this task will never // ptrace-stop again. tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop) } else { // t is a leader and the tracer is in the parent thread group. t.exitParentNotified = true sig := linux.SIGCHLD if t.tg.tasksCount == 1 { sig = t.tg.terminationSignal } // This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either // (in Linux, the check in do_notify_parent() is gated by // !tsk->ptrace.) t.parent.tg.signalHandlers.mu.Lock() t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */) t.parent.tg.signalHandlers.mu.Unlock() // See below for rationale for this event mask. 
t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue) } } if t.exitTracerAcked && !t.exitParentNotified { if t != t.tg.leader { t.exitParentNotified = true t.exitParentAcked = true } else if t.tg.tasksCount == 1 { t.exitParentNotified = true if t.parent == nil { t.exitParentAcked = true } else { // "POSIX.1-2001 specifies that if the disposition of SIGCHLD is // set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see // sigaction(2)), then children that terminate do not become // zombies and a call to wait() or waitpid() will block until all // children have terminated, and then fail with errno set to // ECHILD. (The original POSIX standard left the behavior of // setting SIGCHLD to SIG_IGN unspecified. Note that even though // the default disposition of SIGCHLD is "ignore", explicitly // setting the disposition to SIG_IGN results in different // treatment of zombie process children.) Linux 2.6 conforms to // this specification." - wait(2) // // Some undocumented Linux-specific details: // // - All of the above is ignored if the termination signal isn't // SIGCHLD. // // - SA_NOCLDWAIT causes the leader to be immediately reaped, but // does not suppress the SIGCHLD. signalParent := t.tg.terminationSignal.IsValid() t.parent.tg.signalHandlers.mu.Lock() if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach { if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok { if act.Handler == linux.SIG_IGN { t.exitParentAcked = true signalParent = false } else if act.Flags&linux.SA_NOCLDWAIT != 0 { t.exitParentAcked = true } } } if signalParent { t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */) } t.parent.tg.signalHandlers.mu.Unlock() // If a task in the parent was waiting for a child group stop // or continue, it needs to be notified of the exit, because // there may be no remaining eligible tasks (so that wait // should return ECHILD). t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue) } // We don't send exit events for the root process because we don't send // Clone or Exec events for the initial process. if t.tg != t.k.globalInit && seccheck.Global.Enabled(seccheck.PointExitNotifyParent) { mask, info := getExitNotifyParentSeccheckInfo(t) if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error { return c.ExitNotifyParent(t, mask, info) }); err != nil { log.Infof("Ignoring error from ExitNotifyParent point: %v", err) } } } } if t.exitTracerAcked && t.exitParentAcked { t.advanceExitStateLocked(TaskExitZombie, TaskExitDead) for ns := t.tg.pidns; ns != nil; ns = ns.parent { ns.deleteTask(t) } t.userCounters.decRLimitNProc() t.tg.exitedCPUStats.Accumulate(t.CPUStats()) t.tg.ioUsage.Accumulate(t.ioUsage) t.tg.signalHandlers.mu.Lock() t.tg.tasks.Remove(t) t.tg.tasksCount-- tc := t.tg.tasksCount t.tg.signalHandlers.mu.Unlock() if tc == 1 && t != t.tg.leader { // Our fromPtraceDetach doesn't matter here (in Linux terms, this // is via a call to release_task()). t.tg.leader.exitNotifyLocked(false) } else if tc == 0 { t.tg.pidWithinNS.Store(0) t.tg.processGroup.decRefWithParent(t.tg.parentPG()) } if t.parent != nil { delete(t.parent.children, t) // Do not clear t.parent. It may be still be needed after the task has exited // (for example, to perform ptrace access checks on /proc/[pid] files). } } } // Preconditions: The TaskSet mutex must be locked. 
func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *linux.SignalInfo { info := &linux.SignalInfo{ Signo: int32(sig), } info.SetPID(int32(receiver.tg.pidns.tids[t])) info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) if t.exitStatus.Signaled() { info.Code = linux.CLD_KILLED info.SetStatus(int32(t.exitStatus.TerminationSignal())) } else { info.Code = linux.CLD_EXITED info.SetStatus(int32(t.exitStatus.ExitStatus())) } // TODO(b/72102453): Set utime, stime. return info } // Preconditions: The TaskSet mutex must be locked. func getExitNotifyParentSeccheckInfo(t *Task) (seccheck.FieldSet, *pb.ExitNotifyParentInfo) { fields := seccheck.Global.GetFieldSet(seccheck.PointExitNotifyParent) info := &pb.ExitNotifyParentInfo{ ExitStatus: int32(t.tg.exitStatus), } if !fields.Context.Empty() { info.ContextData = &pb.ContextData{} // cwd isn't used for notifyExit seccheck so it's ok to pass an empty // string. LoadSeccheckDataLocked(t, fields.Context, info.ContextData, "") } return fields, info } // ExitStatus returns t's exit status, which is only guaranteed to be // meaningful if t.ExitState() != TaskExitNone. func (t *Task) ExitStatus() linux.WaitStatus { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() return t.exitStatus } // ExitStatus returns the exit status that would be returned by a consuming // wait*() on tg. func (tg *ThreadGroup) ExitStatus() linux.WaitStatus { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() if tg.exiting { return tg.exitStatus } return tg.leader.exitStatus } // TerminationSignal returns the thread group's termination signal, which is // the signal that will be sent to its leader's parent when all threads have // exited. func (tg *ThreadGroup) TerminationSignal() linux.Signal { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() return tg.terminationSignal } // Task events that can be waited for. const ( // EventExit represents an exit notification generated for a child thread // group leader or a tracee under the conditions specified in the comment // above runExitNotify. EventExit waiter.EventMask = 1 << iota // EventChildGroupStop occurs when a child thread group completes a group // stop (i.e. all tasks in the child thread group have entered a stopped // state as a result of a group stop). EventChildGroupStop // EventTraceeStop occurs when a task that is ptraced by a task in the // notified thread group enters a ptrace stop (see ptrace(2)). EventTraceeStop // EventGroupContinue occurs when a child thread group, or a thread group // whose leader is ptraced by a task in the notified thread group, that had // initiated or completed a group stop leaves the group stop, due to the // child thread group or any task in the child thread group being sent // SIGCONT. EventGroupContinue ) // WaitOptions controls the behavior of Task.Wait. type WaitOptions struct { // If SpecificTID is non-zero, only events from the task with thread ID // SpecificTID are eligible to be waited for. SpecificTID is resolved in // the PID namespace of the waiter (the method receiver of Task.Wait). If // no such task exists, or that task would not otherwise be eligible to be // waited for by the waiting task, then there are no waitable tasks and // Wait will return ECHILD. 
SpecificTID ThreadID // If SpecificPGID is non-zero, only events from ThreadGroups with a // matching ProcessGroupID are eligible to be waited for. (Same // constraints as SpecificTID apply.) SpecificPGID ProcessGroupID // Terminology note: Per waitpid(2), "a clone child is one which delivers // no signal, or a signal other than SIGCHLD to its parent upon // termination." In Linux, termination signal is technically a per-task // property rather than a per-thread-group property. However, clone() // forces no termination signal for tasks created with CLONE_THREAD, and // execve() resets the termination signal to SIGCHLD, so all // non-group-leader threads have no termination signal and are therefore // "clone tasks". // If NonCloneTasks is true, events from non-clone tasks are eligible to be // waited for. NonCloneTasks bool // If CloneTasks is true, events from clone tasks are eligible to be waited // for. CloneTasks bool // If SiblingChildren is true, events from children tasks of any task // in the thread group of the waiter are eligible to be waited for. SiblingChildren bool // Events is a bitwise combination of the events defined above that specify // what events are of interest to the call to Wait. Events waiter.EventMask // If ConsumeEvent is true, the Wait should consume the event such that it // cannot be returned by a future Wait. Note that if a task exit is // consumed in this way, in most cases the task will be reaped. ConsumeEvent bool // If BlockInterruptErr is not nil, Wait will block until either an event // is available or there are no tasks that could produce a waitable event; // if that blocking is interrupted, Wait returns BlockInterruptErr. If // BlockInterruptErr is nil, Wait will not block. BlockInterruptErr error } // Preconditions: The TaskSet mutex must be locked (for reading or writing). func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace, tracee bool) bool { if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] { return false } if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] { return false } // Tracees are always eligible. if tracee { return true } if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD { return o.NonCloneTasks } return o.CloneTasks } // ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g. // waitpid(WNOHANG)) that find no waitable events, but determine that waitable // events may exist in the future. (In contrast, if a non-blocking or blocking // Wait determines that there are no tasks that can produce a waitable event, // Task.Wait returns ECHILD.) var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events") // WaitResult contains information about a waited-for event. type WaitResult struct { // Task is the task that reported the event. Task *Task // TID is the thread ID of Task in the PID namespace of the task that // called Wait (that is, the method receiver of the call to Task.Wait). TID // is provided because consuming exit waits cause the thread ID to be // deallocated. TID ThreadID // UID is the real UID of Task in the user namespace of the task that // called Wait. UID auth.UID // Event is exactly one of the events defined above. Event waiter.EventMask // Status is the wait status associated with the event. Status linux.WaitStatus } // Wait waits for an event from a thread group that is a child of t's thread // group, or a task in such a thread group, or a task that is ptraced by t, // subject to the options specified in opts. 
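//
// As a rough, illustrative sketch only (not the actual wait4(2) handler), a
// blocking wait for a specific child's exit might be expressed with the
// options defined above; the pid value here is an assumption for the example,
// and a real handler may choose different Events and interrupt errors:
//
//	wr, err := t.Wait(&WaitOptions{
//		SpecificTID:       ThreadID(pid),
//		Events:            EventExit,
//		NonCloneTasks:     true,
//		ConsumeEvent:      true,
//		BlockInterruptErr: linuxerr.EINTR,
//	})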
func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) { if opts.BlockInterruptErr == nil { return t.waitOnce(opts) } w, ch := waiter.NewChannelEntry(opts.Events) t.tg.eventQueue.EventRegister(&w) defer t.tg.eventQueue.EventUnregister(&w) for { wr, err := t.waitOnce(opts) if err != ErrNoWaitableEvent { // This includes err == nil. return wr, err } if err := t.Block(ch); err != nil { return wr, linuxerr.ConvertIntr(err, opts.BlockInterruptErr) } } } func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) { anyWaitableTasks := false t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if opts.SiblingChildren { // We can wait on the children and tracees of any task in the // same thread group. for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() { wr, any := t.waitParentLocked(opts, parent) if wr != nil { return wr, nil } anyWaitableTasks = anyWaitableTasks || any } } else { // We can only wait on this task. var wr *WaitResult wr, anyWaitableTasks = t.waitParentLocked(opts, t) if wr != nil { return wr, nil } } if anyWaitableTasks { return nil, ErrNoWaitableEvent } return nil, linuxerr.ECHILD } // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, bool) { anyWaitableTasks := false for child := range parent.children { if !opts.matchesTask(child, parent.tg.pidns, false) { continue } // Non-leaders don't notify parents on exit and aren't eligible to // be waited on. if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked { anyWaitableTasks = true if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil { return wr, anyWaitableTasks } } // Check for group stops and continues. Tasks that have passed // TaskExitInitiated can no longer participate in group stops. if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 { continue } if child.exitState >= TaskExitInitiated { continue } // If the waiter is in the same thread group as the task's // tracer, do not report its group stops; they will be reported // as ptrace stops instead. This also skips checking for group // continues, but they'll be checked for when scanning tracees // below. (Per kernel/exit.c:wait_consider_task(): "If a // ptracer wants to distinguish the two events for its own // children, it should create a separate process which takes // the role of real parent.") if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg { continue } anyWaitableTasks = true if opts.Events&EventChildGroupStop != 0 { if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil { return wr, anyWaitableTasks } } if opts.Events&EventGroupContinue != 0 { if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil { return wr, anyWaitableTasks } } } for tracee := range parent.ptraceTracees { if !opts.matchesTask(tracee, parent.tg.pidns, true) { continue } // Non-leaders do notify tracers on exit. 
if opts.Events&EventExit != 0 && !tracee.exitTracerAcked { anyWaitableTasks = true if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil { return wr, anyWaitableTasks } } if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 { continue } if tracee.exitState >= TaskExitInitiated { continue } anyWaitableTasks = true if opts.Events&EventTraceeStop != 0 { if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil { return wr, anyWaitableTasks } } if opts.Events&EventGroupContinue != 0 { if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil { return wr, anyWaitableTasks } } } return nil, anyWaitableTasks } // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult { if asPtracer && !target.exitTracerNotified { return nil } if !asPtracer && !target.exitParentNotified { return nil } // Zombied thread group leaders are never waitable until their thread group // is otherwise empty. Usually this is caught by the // target.exitParentNotified check above, but if t is both (in the thread // group of) target's tracer and parent, asPtracer may be true. if target == target.tg.leader && target.tg.tasksCount != 1 { return nil } pid := t.tg.pidns.tids[target] uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() status := target.exitStatus if !opts.ConsumeEvent { return &WaitResult{ Task: target, TID: pid, UID: uid, Event: EventExit, Status: status, } } // Surprisingly, the exit status reported by a non-consuming wait can // differ from that reported by a consuming wait; the latter will return // the group exit code if one is available. if target.tg.exiting { status = target.tg.exitStatus } // t may be (in the thread group of) target's parent, tracer, or both. We // don't need to check for !exitTracerAcked because tracees are detached // here, and we don't need to check for !exitParentAcked because zombies // will be reaped here. if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified { target.exitTracerAcked = true target.ptraceTracer.Store(nil) delete(t.ptraceTracees, target) } if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified { target.exitParentAcked = true if target == target.tg.leader { // target.tg.exitedCPUStats doesn't include target.CPUStats() yet, // and won't until after target.exitNotifyLocked() (maybe). Include // target.CPUStats() explicitly. This is consistent with Linux, // which accounts an exited task's cputime to its thread group in // kernel/exit.c:release_task() => __exit_signal(), and uses // thread_group_cputime_adjusted() in wait_task_zombie(). t.tg.childCPUStats.Accumulate(target.CPUStats()) t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats) t.tg.childCPUStats.Accumulate(target.tg.childCPUStats) // Update t's child max resident set size. The size will be the maximum // of this thread's size and all its childrens' sizes. if t.tg.childMaxRSS < target.tg.maxRSS { t.tg.childMaxRSS = target.tg.maxRSS } if t.tg.childMaxRSS < target.tg.childMaxRSS { t.tg.childMaxRSS = target.tg.childMaxRSS } } } target.exitNotifyLocked(false) return &WaitResult{ Task: target, TID: pid, UID: uid, Event: EventExit, Status: status, } } // updateRSSLocked updates t.tg.maxRSS. // // Preconditions: The TaskSet mutex must be locked for writing. 
func (t *Task) updateRSSLocked() { if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS { t.tg.maxRSS = mmMaxRSS } } // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult { target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() if !target.tg.groupStopWaitable { return nil } pid := t.tg.pidns.tids[target] uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() sig := target.tg.groupStopSignal if opts.ConsumeEvent { target.tg.groupStopWaitable = false } return &WaitResult{ Task: target, TID: pid, UID: uid, Event: EventChildGroupStop, Status: linux.WaitStatusStopped(uint32(sig)), } } // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult { target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() if !target.tg.groupContWaitable { return nil } pid := t.tg.pidns.tids[target] uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() if opts.ConsumeEvent { target.tg.groupContWaitable = false } return &WaitResult{ Task: target, TID: pid, UID: uid, Event: EventGroupContinue, Status: linux.WaitStatusContinued(), } } // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult { target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() if target.stop == nil { return nil } if _, ok := target.stop.(*ptraceStop); !ok { return nil } if target.ptraceCode == 0 { return nil } pid := t.tg.pidns.tids[target] uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() code := target.ptraceCode if opts.ConsumeEvent { target.ptraceCode = 0 } return &WaitResult{ Task: target, TID: pid, UID: uid, Event: EventTraceeStop, Status: linux.WaitStatusStopped(uint32(code)), } } // ExitState returns t's current progress through the exit path. func (t *Task) ExitState() TaskExitState { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() return t.exitState } // ParentDeathSignal returns t's parent death signal. func (t *Task) ParentDeathSignal() linux.Signal { t.mu.Lock() defer t.mu.Unlock() return t.parentDeathSignal } // SetParentDeathSignal sets t's parent death signal. func (t *Task) SetParentDeathSignal(sig linux.Signal) { t.mu.Lock() defer t.mu.Unlock() t.parentDeathSignal = sig } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_futex.go000066400000000000000000000117771465435605700247510ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/usermem" ) // Futex returns t's futex manager. 
// // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) Futex() *futex.Manager { return t.image.fu } // SwapUint32 implements futex.Target.SwapUint32. func (t *Task) SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) { return t.MemoryManager().SwapUint32(t, addr, new, usermem.IOOpts{ AddressSpaceActive: true, }) } // CompareAndSwapUint32 implements futex.Target.CompareAndSwapUint32. func (t *Task) CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) { return t.MemoryManager().CompareAndSwapUint32(t, addr, old, new, usermem.IOOpts{ AddressSpaceActive: true, }) } // LoadUint32 implements futex.Target.LoadUint32. func (t *Task) LoadUint32(addr hostarch.Addr) (uint32, error) { return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{ AddressSpaceActive: true, }) } // GetSharedKey implements futex.Target.GetSharedKey. func (t *Task) GetSharedKey(addr hostarch.Addr) (futex.Key, error) { return t.MemoryManager().GetSharedFutexKey(t, addr) } // GetRobustList returns the robust futex list address for the task. func (t *Task) GetRobustList() hostarch.Addr { t.mu.Lock() addr := t.robustList t.mu.Unlock() return addr } // SetRobustList sets the robust futex list for the task. func (t *Task) SetRobustList(addr hostarch.Addr) { t.mu.Lock() t.robustList = addr t.mu.Unlock() } // exitRobustList walks the robust futex list, marking locks dead and notifying // wakers. It corresponds to Linux's exit_robust_list(). Following Linux, // errors are silently ignored. func (t *Task) exitRobustList() { t.mu.Lock() addr := t.robustList t.robustList = 0 t.mu.Unlock() if addr == 0 { return } var rl linux.RobustListHead if _, err := rl.CopyIn(t, hostarch.Addr(addr)); err != nil { return } next := primitive.Uint64(rl.List) done := 0 var pendingLockAddr hostarch.Addr if rl.ListOpPending != 0 { pendingLockAddr = hostarch.Addr(rl.ListOpPending + rl.FutexOffset) } // Wake up normal elements. for hostarch.Addr(next) != addr { // We traverse to the next element of the list before we // actually wake anything. This prevents the race where waking // this futex causes a modification of the list. thisLockAddr := hostarch.Addr(uint64(next) + rl.FutexOffset) // Try to decode the next element in the list before waking the // current futex. But don't check the error until after we've // woken the current futex. Linux does it in this order too. _, nextErr := next.CopyIn(t, hostarch.Addr(next)) // Wake up the current futex if it's not pending. if thisLockAddr != pendingLockAddr { t.wakeRobustListOne(thisLockAddr) } // If there was an error copying the next futex, we must bail. if nextErr != nil { break } // This is a user structure, so it could be a massive list, or // even contain a loop if they are trying to mess with us. We // cap traversal to prevent that. done++ if done >= linux.ROBUST_LIST_LIMIT { break } } // Is there a pending entry to wake? if pendingLockAddr != 0 { t.wakeRobustListOne(pendingLockAddr) } } // wakeRobustListOne wakes a single futex from the robust list. func (t *Task) wakeRobustListOne(addr hostarch.Addr) { // Bit 0 in address signals PI futex. pi := addr&1 == 1 addr = addr &^ 1 // Load the futex. f, err := t.LoadUint32(addr) if err != nil { // Can't read this single value? Ignore the problem. // We can wake the other futexes in the list. return } tid := uint32(t.ThreadID()) for { // Is this held by someone else? if f&linux.FUTEX_TID_MASK != tid { return } // This thread is dying and it's holding this futex.
We need to // set the owner died bit and wake up any waiters. newF := (f & linux.FUTEX_WAITERS) | linux.FUTEX_OWNER_DIED if curF, err := t.CompareAndSwapUint32(addr, f, newF); err != nil { return } else if curF != f { // Futex changed out from under us. Try again... f = curF continue } // Wake waiters if there are any. if f&linux.FUTEX_WAITERS != 0 { private := f&linux.FUTEX_PRIVATE_FLAG != 0 if pi { t.Futex().UnlockPI(t, addr, tid, private) return } t.Futex().Wake(t, addr, private, linux.FUTEX_BITSET_MATCH_ANY, 1) } // Done. return } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_identity.go000066400000000000000000000510431465435605700254350ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" ) // Credentials returns t's credentials. // // This value must be considered immutable. func (t *Task) Credentials() *auth.Credentials { return t.creds.Load() } // UserNamespace returns the user namespace associated with the task. func (t *Task) UserNamespace() *auth.UserNamespace { return t.Credentials().UserNamespace } // HasCapabilityIn checks if the task has capability cp in user namespace ns. func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool { return t.Credentials().HasCapabilityIn(cp, ns) } // HasCapability checks if the task has capability cp in its user namespace. func (t *Task) HasCapability(cp linux.Capability) bool { return t.Credentials().HasCapability(cp) } // SetUID implements the semantics of setuid(2). func (t *Task) SetUID(uid auth.UID) error { // setuid considers -1 to be invalid. if !uid.Ok() { return linuxerr.EINVAL } t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() kuid := creds.UserNamespace.MapToKUID(uid) if !kuid.Ok() { return linuxerr.EINVAL } // "setuid() sets the effective user ID of the calling process. If the // effective UID of the caller is root (more precisely: if the caller has // the CAP_SETUID capability), the real UID and saved set-user-ID are also // set." - setuid(2) if creds.HasCapability(linux.CAP_SETUID) { t.setKUIDsUncheckedLocked(kuid, kuid, kuid) return nil } // "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID // capability) and uid does not match the real UID or saved set-user-ID of // the calling process." if kuid != creds.RealKUID && kuid != creds.SavedKUID { return linuxerr.EPERM } t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID) return nil } // SetREUID implements the semantics of setreuid(2). func (t *Task) SetREUID(r, e auth.UID) error { t.mu.Lock() defer t.mu.Unlock() // "Supplying a value of -1 for either the real or effective user ID forces // the system to leave that ID unchanged." 
- setreuid(2) creds := t.Credentials() newR := creds.RealKUID if r.Ok() { newR = creds.UserNamespace.MapToKUID(r) if !newR.Ok() { return linuxerr.EINVAL } } newE := creds.EffectiveKUID if e.Ok() { newE = creds.UserNamespace.MapToKUID(e) if !newE.Ok() { return linuxerr.EINVAL } } if !creds.HasCapability(linux.CAP_SETUID) { // "Unprivileged processes may only set the effective user ID to the // real user ID, the effective user ID, or the saved set-user-ID." if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID { return linuxerr.EPERM } // "Unprivileged users may only set the real user ID to the real user // ID or the effective user ID." if newR != creds.RealKUID && newR != creds.EffectiveKUID { return linuxerr.EPERM } } // "If the real user ID is set (i.e., ruid is not -1) or the effective user // ID is set to a value not equal to the previous real user ID, the saved // set-user-ID will be set to the new effective user ID." newS := creds.SavedKUID if r.Ok() || (e.Ok() && newE != creds.EffectiveKUID) { newS = newE } t.setKUIDsUncheckedLocked(newR, newE, newS) return nil } // SetRESUID implements the semantics of the setresuid(2) syscall. func (t *Task) SetRESUID(r, e, s auth.UID) error { t.mu.Lock() defer t.mu.Unlock() // "Unprivileged user processes may change the real UID, effective UID, and // saved set-user-ID, each to one of: the current real UID, the current // effective UID or the current saved set-user-ID. Privileged processes (on // Linux, those having the CAP_SETUID capability) may set the real UID, // effective UID, and saved set-user-ID to arbitrary values. If one of the // arguments equals -1, the corresponding value is not changed." - // setresuid(2) var err error creds := t.Credentials() newR := creds.RealKUID if r.Ok() { newR, err = creds.UseUID(r) if err != nil { return err } } newE := creds.EffectiveKUID if e.Ok() { newE, err = creds.UseUID(e) if err != nil { return err } } newS := creds.SavedKUID if s.Ok() { newS, err = creds.UseUID(s) if err != nil { return err } } t.setKUIDsUncheckedLocked(newR, newE, newS) return nil } // Preconditions: t.mu must be locked. func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. root := creds.UserNamespace.MapToKUID(auth.RootUID) oldR, oldE, oldS := creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID = newR, newE, newS // "1. If one or more of the real, effective or saved set user IDs was // previously 0, and as a result of the UID changes all of these IDs have a // nonzero value, then all capabilities are cleared from the permitted and // effective capability sets." - capabilities(7) if (oldR == root || oldE == root || oldS == root) && (newR != root && newE != root && newS != root) { // prctl(2): "PR_SET_KEEPCAP: Set the state of the calling thread's // "keep capabilities" flag, which determines whether the thread's permitted // capability set is cleared when a change is made to the // thread's user IDs such that the thread's real UID, effective // UID, and saved set-user-ID all become nonzero when at least // one of them previously had the value 0. By default, the // permitted capability set is cleared when such a change is // made; setting the "keep capabilities" flag prevents it from // being cleared." 
(A thread's effective capability set is always // cleared when such a credential change is made, // regardless of the setting of the "keep capabilities" flag.) if !creds.KeepCaps { creds.PermittedCaps = 0 creds.EffectiveCaps = 0 } } // """ // 2. If the effective user ID is changed from 0 to nonzero, then all // capabilities are cleared from the effective set. // // 3. If the effective user ID is changed from nonzero to 0, then the // permitted set is copied to the effective set. // """ if oldE == root && newE != root { creds.EffectiveCaps = 0 } else if oldE != root && newE == root { creds.EffectiveCaps = creds.PermittedCaps } // "4. If the filesystem user ID is changed from 0 to nonzero (see // setfsuid(2)), then the following capabilities are cleared from the // effective set: ..." // (filesystem UIDs aren't implemented, nor are any of the capabilities in // question) if oldE != newE { // "[dumpability] is reset to the current value contained in // the file /proc/sys/fs/suid_dumpable (which by default has // the value 0), in the following circumstances: The process's // effective user or group ID is changed." - prctl(2) // // (suid_dumpable isn't implemented, so we just use the // default. t.MemoryManager().SetDumpability(mm.NotDumpable) // Not documented, but compare Linux's kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } t.creds.Store(creds) } // SetGID implements the semantics of setgid(2). func (t *Task) SetGID(gid auth.GID) error { if !gid.Ok() { return linuxerr.EINVAL } t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return linuxerr.EINVAL } if creds.HasCapability(linux.CAP_SETGID) { t.setKGIDsUncheckedLocked(kgid, kgid, kgid) return nil } if kgid != creds.RealKGID && kgid != creds.SavedKGID { return linuxerr.EPERM } t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID) return nil } // SetREGID implements the semantics of setregid(2). func (t *Task) SetREGID(r, e auth.GID) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() newR := creds.RealKGID if r.Ok() { newR = creds.UserNamespace.MapToKGID(r) if !newR.Ok() { return linuxerr.EINVAL } } newE := creds.EffectiveKGID if e.Ok() { newE = creds.UserNamespace.MapToKGID(e) if !newE.Ok() { return linuxerr.EINVAL } } if !creds.HasCapability(linux.CAP_SETGID) { if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID { return linuxerr.EPERM } if newR != creds.RealKGID && newR != creds.EffectiveKGID { return linuxerr.EPERM } } newS := creds.SavedKGID if r.Ok() || (e.Ok() && newE != creds.EffectiveKGID) { newS = newE } t.setKGIDsUncheckedLocked(newR, newE, newS) return nil } // SetRESGID implements the semantics of the setresgid(2) syscall. func (t *Task) SetRESGID(r, e, s auth.GID) error { var err error t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() newR := creds.RealKGID if r.Ok() { newR, err = creds.UseGID(r) if err != nil { return err } } newE := creds.EffectiveKGID if e.Ok() { newE, err = creds.UseGID(e) if err != nil { return err } } newS := creds.SavedKGID if s.Ok() { newS, err = creds.UseGID(s) if err != nil { return err } } t.setKGIDsUncheckedLocked(newR, newE, newS) return nil } func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. 
oldE := creds.EffectiveKGID creds.RealKGID, creds.EffectiveKGID, creds.SavedKGID = newR, newE, newS if oldE != newE { // "[dumpability] is reset to the current value contained in // the file /proc/sys/fs/suid_dumpable (which by default has // the value 0), in the following circumstances: The process's // effective user or group ID is changed." - prctl(2) // // (suid_dumpable isn't implemented, so we just use the // default.) t.MemoryManager().SetDumpability(mm.NotDumpable) // Not documented, but compare Linux's // kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } t.creds.Store(creds) } // SetExtraGIDs attempts to change t's supplemental groups. All IDs are // interpreted as being in t's user namespace. func (t *Task) SetExtraGIDs(gids []auth.GID) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() if !creds.HasCapability(linux.CAP_SETGID) { return linuxerr.EPERM } kgids := make([]auth.KGID, len(gids)) for i, gid := range gids { kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return linuxerr.EINVAL } kgids[i] = kgid } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.ExtraKGIDs = kgids t.creds.Store(creds) return nil } // weakCaps is a set of capabilities that can be disabled externally. var weakCaps = auth.CapabilitySetOf(linux.CAP_NET_RAW) // SetCapabilitySets attempts to change t's permitted, inheritable, and // effective capability sets. func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error { t.mu.Lock() defer t.mu.Unlock() // "Permitted: This is a limiting superset for the effective capabilities // that the thread may assume." - capabilities(7) if effective & ^permitted != 0 { return linuxerr.EPERM } creds := t.Credentials() // Don't fail if one or more weak capabilities can't be set, just drop them. mask := (weakCaps & creds.BoundingCaps) | (auth.AllCapabilities &^ weakCaps) permitted &= mask inheritable &= mask effective &= mask // "It is also a limiting superset for the capabilities that may be added // to the inheritable set by a thread that does not have the CAP_SETPCAP // capability in its effective set." if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) { return linuxerr.EPERM } // "If a thread drops a capability from its permitted set, it can never // reacquire that capability (unless it execve(2)s ..." if permitted & ^creds.PermittedCaps != 0 { return linuxerr.EPERM } // "... if a capability is not in the bounding set, then a thread can't add // this capability to its inheritable set, even if it was in its permitted // capabilities ..." if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 { return linuxerr.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.PermittedCaps = permitted creds.InheritableCaps = inheritable creds.EffectiveCaps = effective t.creds.Store(creds) return nil } // DropBoundingCapability attempts to drop capability cp from t's capability // bounding set. func (t *Task) DropBoundingCapability(cp linux.Capability) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() if !creds.HasCapability(linux.CAP_SETPCAP) { return linuxerr.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.BoundingCaps &^= auth.CapabilitySetOf(cp) t.creds.Store(creds) return nil } // SetUserNamespace attempts to move t into ns.
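// For example (a sketch of a possible caller, not taken from this file): an
// unshare(CLONE_NEWUSER)-style path would first create a new namespace owned by
// the caller's credentials and then call t.SetUserNamespace(ns); because the
// caller just created ns, the CAP_SYS_ADMIN check below passes by construction.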
func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() // "A process reassociating itself with a user namespace must have the // CAP_SYS_ADMIN capability in the target user namespace." - setns(2) // // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN // in ns (by rule 3 in auth.Credentials.HasCapability). if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { return linuxerr.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.UserNamespace = ns // "The child process created by clone(2) with the CLONE_NEWUSER flag // starts out with a complete set of capabilities in the new user // namespace. Likewise, a process that creates a new user namespace using // unshare(2) or joins an existing user namespace using setns(2) gains a // full set of capabilities in that namespace." creds.PermittedCaps = auth.AllCapabilities creds.InheritableCaps = 0 creds.EffectiveCaps = auth.AllCapabilities creds.BoundingCaps = auth.AllCapabilities // "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER // flag sets the "securebits" flags (see capabilities(7)) to their default // values (all flags disabled) in the child (for clone(2)) or caller (for // unshare(2), or setns(2)." - user_namespaces(7) creds.KeepCaps = false t.creds.Store(creds) return nil } // SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS. func (t *Task) SetKeepCaps(k bool) { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. creds.KeepCaps = k t.creds.Store(creds) } // updateCredsForExecLocked updates t.creds to reflect an execve(). // // NOTE(b/30815691): We currently do not implement privileged executables // (set-user/group-ID bits and file capabilities). This allows us to make a lot // of simplifying assumptions: // // - We assume the no_new_privs bit (set by prctl(SET_NO_NEW_PRIVS)), which // disables the features we don't support anyway, is always set. This // drastically simplifies this function. // // - We don't set AT_SECURE = 1, because no_new_privs always being set means // that the conditions that require AT_SECURE = 1 never arise. (Compare Linux's // security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().) // // - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since // seccomp-bpf is also allowed if the task has no_new_privs set. // // - Task.ptraceAttach does not serialize with execve as it does in Linux, // since no_new_privs being set has the same effect as the presence of an // unprivileged tracer. // // Preconditions: t.mu must be locked. func (t *Task) updateCredsForExecLocked() { // """ // During an execve(2), the kernel calculates the new capabilities of // the process using the following algorithm: // // P'(permitted) = (P(inheritable) & F(inheritable)) | // (F(permitted) & cap_bset) // // P'(effective) = F(effective) ? P'(permitted) : 0 // // P'(inheritable) = P(inheritable) [i.e., unchanged] // // where: // // P denotes the value of a thread capability set before the // execve(2) // // P' denotes the value of a thread capability set after the // execve(2) // // F denotes a file capability set // // cap_bset is the value of the capability bounding set // // ... // // In order to provide an all-powerful root using capability sets, during // an execve(2): // // 1. 
If a set-user-ID-root program is being executed, or the real user ID // of the process is 0 (root) then the file inheritable and permitted sets // are defined to be all ones (i.e. all capabilities enabled). // // 2. If a set-user-ID-root program is being executed, then the file // effective bit is defined to be one (enabled). // // The upshot of the above rules, combined with the capabilities // transformations described above, is that when a process execve(2)s a // set-user-ID-root program, or when a process with an effective UID of 0 // execve(2)s a program, it gains all capabilities in its permitted and // effective capability sets, except those masked out by the capability // bounding set. // """ - capabilities(7) // (ambient capability sets omitted) // // As the last paragraph implies, the case of "a set-user-ID root program // is being executed" also includes the case where (namespace) root is // executing a non-set-user-ID program; the actual check is just based on // the effective user ID. var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0 fileEffective := false creds := t.Credentials() root := creds.UserNamespace.MapToKUID(auth.RootUID) if creds.EffectiveKUID == root || creds.RealKUID == root { newPermitted = creds.InheritableCaps | creds.BoundingCaps if creds.EffectiveKUID == root { fileEffective = true } } creds = creds.Fork() // The credentials object is immutable. See doc for creds. // Now we enter poorly-documented, somewhat confusing territory. (The // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds // is not very helpful.) My reading of it is: // // If at least one of the following is true: // // A1. The execing task is ptraced, and the tracer did not have // CAP_SYS_PTRACE in the execing task's user namespace at the time of // PTRACE_ATTACH. // // A2. The execing task shares its FS context with at least one task in // another thread group. // // A3. The execing task has no_new_privs set. // // AND at least one of the following is true: // // B1. The new effective user ID (which may come from set-user-ID, or be the // execing task's existing effective user ID) is not equal to the task's // real UID. // // B2. The new effective group ID (which may come from set-group-ID, or be // the execing task's existing effective group ID) is not equal to the // task's real GID. // // B3. The new permitted capability set contains capabilities not in the // task's permitted capability set. // // Then: // // C1. Limit the new permitted capability set to the task's permitted // capability set. // // C2. If either the task does not have CAP_SETUID in its user namespace, or // the task has no_new_privs set, force the new effective UID and GID to // the task's real UID and GID. // // But since no_new_privs is always set (A3 is always true), this becomes // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1 // is a no-op. So we can just do C1 and C2 unconditionally. if creds.EffectiveKUID != creds.RealKUID || creds.EffectiveKGID != creds.RealKGID { creds.EffectiveKUID = creds.RealKUID creds.EffectiveKGID = creds.RealKGID t.parentDeathSignal = 0 } // (Saved set-user-ID is always set to the new effective user ID, and saved // set-group-ID is always set to the new effective group ID, regardless of // the above.) 
creds.SavedKUID = creds.RealKUID creds.SavedKGID = creds.RealKGID creds.PermittedCaps &= newPermitted if fileEffective { creds.EffectiveCaps = creds.PermittedCaps } else { creds.EffectiveCaps = 0 } // prctl(2): The "keep capabilities" value will be reset to 0 on subsequent // calls to execve(2). creds.KeepCaps = false // "The bounding set is inherited at fork(2) from the thread's parent, and // is preserved across an execve(2)". So we're done. t.creds.Store(creds) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_image.go000066400000000000000000000123501465435605700246640ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserr" ) var errNoSyscalls = syserr.New("no syscall table found", errno.ENOEXEC) // Auxmap contains miscellaneous data for the task. type Auxmap map[string]any // TaskImage is the subset of a task's data that is provided by the loader. // // +stateify savable type TaskImage struct { // Name is the thread name set by the prctl(PR_SET_NAME) system call. Name string // Arch is the architecture-specific context (registers, etc.) Arch *arch.Context64 // MemoryManager is the task's address space. MemoryManager *mm.MemoryManager // fu implements futexes in the address space. fu *futex.Manager // st is the task's syscall table. st *SyscallTable `state:".(syscallTableInfo)"` // fileCaps is the image's extended attribute named security.capability. fileCaps string } // FileCaps return the task image's security.capability extended attribute. func (image *TaskImage) FileCaps() string { return image.fileCaps } // release releases all resources held by the TaskImage. release is called by // the task when it execs into a new TaskImage. func (image *TaskImage) release(ctx context.Context) { // Nil out pointers so that if the task is saved after release, it doesn't // follow the pointers to possibly now-invalid objects. if image.MemoryManager != nil { image.MemoryManager.DecUsers(ctx) image.MemoryManager = nil } image.fu = nil } // Fork returns a duplicate of image. The copied TaskImage always has an // independent arch.Context64. If shareAddressSpace is true, the copied // TaskImage shares an address space with the original; otherwise, the copied // TaskImage has an independent address space that is initially a duplicate // of the original's. 
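// For example (a sketch of how a caller might use this; the clone-flag handling
// shown here is an assumption, not taken from this file): a fork(2)-like clone
// would pass shareAddressSpace=false so the child gets an independent duplicate
// of the address space, while a CLONE_VM (thread-like) clone passes true so
// parent and child share one MemoryManager:
//
//	share := cloneFlags&linux.CLONE_VM != 0
//	childImage, err := t.image.Fork(t, t.k, share)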
func (image *TaskImage) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskImage, error) { newImage := &TaskImage{ Name: image.Name, Arch: image.Arch.Fork(), st: image.st, } if shareAddressSpace { newImage.MemoryManager = image.MemoryManager if newImage.MemoryManager != nil { if !newImage.MemoryManager.IncUsers() { // Shouldn't be possible since image.MemoryManager should be a // counted user. panic(fmt.Sprintf("TaskImage.Fork called with userless TaskImage.MemoryManager")) } } newImage.fu = image.fu } else { newMM, err := image.MemoryManager.Fork(ctx) if err != nil { return nil, err } newImage.MemoryManager = newMM newImage.fu = k.futexes.Fork() } return newImage, nil } // Arch returns t's arch.Context64. // // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) Arch() *arch.Context64 { return t.image.Arch } // MemoryManager returns t's MemoryManager. MemoryManager does not take an // additional reference on the returned MM. // // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) MemoryManager() *mm.MemoryManager { return t.image.MemoryManager } // SyscallTable returns t's syscall table. // // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) SyscallTable() *SyscallTable { return t.image.st } // Stack returns the userspace stack. // // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) Stack() *arch.Stack { return &arch.Stack{ Arch: t.Arch(), IO: t.MemoryManager(), Bottom: hostarch.Addr(t.Arch().Stack()), } } // LoadTaskImage loads a specified file into a new TaskImage. // // args.MemoryManager does not need to be set by the caller. func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskImage, *syserr.Error) { // Prepare a new user address space to load into. m := mm.NewMemoryManager(k, k.mf, k.SleepForAddressSpaceActivation) defer m.DecUsers(ctx) args.MemoryManager = m info, err := loader.Load(ctx, args, k.extraAuxv, k.vdso) if err != nil { return nil, err } // Lookup our new syscall table. st, ok := LookupSyscallTable(info.OS, info.Arch.Arch()) if !ok { // No syscall table found. This means that the ELF binary does not match // the architecture. return nil, errNoSyscalls } if !m.IncUsers() { panic("Failed to increment users count on new MM") } return &TaskImage{ Name: info.Name, Arch: info.Arch, MemoryManager: m, fu: k.futexes.Fork(), st: st, fileCaps: info.FileCaps, }, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_key.go000066400000000000000000000103071465435605700243720ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // SessionKeyring returns this Task's session keyring. 
// Session keyrings are inherited from the parent when a task is started. // If the session keyring is unset, it is implicitly initialized. // As such, this function should never return ENOKEY. func (t *Task) SessionKeyring() (*auth.Key, error) { t.mu.Lock() defer t.mu.Unlock() if t.sessionKeyring != nil { // Verify that we still have access to this keyring. creds := t.Credentials() if !creds.HasKeyPermission(t.sessionKeyring, creds.PossessedKeys(t.sessionKeyring, nil, nil), auth.KeySearch) { return nil, linuxerr.EACCES } return t.sessionKeyring, nil } // If we don't have a session keyring, implicitly create one. return t.joinNewSessionKeyringLocked(auth.DefaultSessionKeyringName, auth.DefaultUnnamedSessionKeyringPermissions) } // joinNewSessionKeyringLocked creates a new session keyring with the given // description, and joins it immediately. // Preconditions: t.mu is held. // // +checklocks:t.mu func (t *Task) joinNewSessionKeyringLocked(newKeyDesc string, newKeyPerms auth.KeyPermissions) (*auth.Key, error) { var sessionKeyring *auth.Key err := t.UserNamespace().Keys.Do(func(keySet *auth.LockedKeySet) error { creds := t.Credentials() var err error sessionKeyring, err = keySet.Add(newKeyDesc, creds, newKeyPerms) return err }) if err != nil { return nil, err } t.Debugf("Joining newly-created session keyring with ID %d, permissions %v", sessionKeyring.ID, newKeyPerms) t.sessionKeyring = sessionKeyring return sessionKeyring, nil } // JoinSessionKeyring causes the task to join a keyring with the given // key description (not ID). // If `keyDesc` is nil, then the task joins a newly-instantiated session // keyring instead. func (t *Task) JoinSessionKeyring(keyDesc *string) (*auth.Key, error) { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() possessed := creds.PossessedKeys(t.sessionKeyring, nil, nil) var sessionKeyring *auth.Key newKeyPerms := auth.DefaultUnnamedSessionKeyringPermissions newKeyDesc := auth.DefaultSessionKeyringName if keyDesc != nil { creds.UserNamespace.Keys.ForEach(func(k *auth.Key) bool { if k.Description == *keyDesc && creds.HasKeyPermission(k, possessed, auth.KeySearch) { sessionKeyring = k return true } return false }) if sessionKeyring != nil { t.Debugf("Joining existing session keyring with ID %d", sessionKeyring.ID) t.sessionKeyring = sessionKeyring return sessionKeyring, nil } newKeyDesc = *keyDesc newKeyPerms = auth.DefaultNamedSessionKeyringPermissions } return t.joinNewSessionKeyringLocked(newKeyDesc, newKeyPerms) } // LookupKey looks up a key by ID using this task's credentials. func (t *Task) LookupKey(keyID auth.KeySerial) (*auth.Key, error) { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() key, err := creds.UserNamespace.Keys.Lookup(keyID) if err != nil { return nil, err } if !creds.HasKeyPermission(key, creds.PossessedKeys(t.sessionKeyring, nil, nil), auth.KeySearch) { return nil, linuxerr.EACCES } return key, nil } // SetPermsOnKey sets the permission bits on the given key using the task's // credentials. 
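// For example (illustrative; keyID and perms are assumptions for the sketch), a
// keyctl(KEYCTL_SETPERM)-style flow would look up the key with the caller's
// credentials and then update its permission mask:
//
//	key, err := t.LookupKey(keyID)
//	if err == nil {
//		err = t.SetPermsOnKey(key, perms)
//	}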
func (t *Task) SetPermsOnKey(key *auth.Key, perms auth.KeyPermissions) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() possessed := creds.PossessedKeys(t.sessionKeyring, nil, nil) return creds.UserNamespace.Keys.Do(func(keySet *auth.LockedKeySet) error { if !creds.HasKeyPermission(key, possessed, auth.KeySetAttr) { return linuxerr.EACCES } keySet.SetPerms(key, perms) return nil }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_list.go000066400000000000000000000116221465435605700245560ustar00rootroot00000000000000package kernel // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type taskElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (taskElementMapper) linkerFor(elem *Task) *Task { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type taskList struct { head *Task tail *Task } // Reset resets list l to the empty state. func (l *taskList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *taskList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *taskList) Front() *Task { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *taskList) Back() *Task { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *taskList) Len() (count int) { for e := l.Front(); e != nil; e = (taskElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *taskList) PushFront(e *Task) { linker := taskElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { taskElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *taskList) PushFrontList(m *taskList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { taskElementMapper{}.linkerFor(l.head).SetPrev(m.tail) taskElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *taskList) PushBack(e *Task) { linker := taskElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { taskElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *taskList) PushBackList(m *taskList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { taskElementMapper{}.linkerFor(l.tail).SetNext(m.head) taskElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. 
// //go:nosplit func (l *taskList) InsertAfter(b, e *Task) { bLinker := taskElementMapper{}.linkerFor(b) eLinker := taskElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { taskElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *taskList) InsertBefore(a, e *Task) { aLinker := taskElementMapper{}.linkerFor(a) eLinker := taskElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { taskElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *taskList) Remove(e *Task) { linker := taskElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { taskElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { taskElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type taskEntry struct { next *Task prev *Task } // Next returns the entry that follows e in the list. // //go:nosplit func (e *taskEntry) Next() *Task { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *taskEntry) Prev() *Task { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *taskEntry) SetNext(elem *Task) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *taskEntry) SetPrev(elem *Task) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_log.go000066400000000000000000000171641465435605700243730ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "runtime/trace" "sort" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/usermem" ) const ( // maxStackDebugBytes is the maximum number of user stack bytes that may be // printed by debugDumpStack. maxStackDebugBytes = 1024 // maxCodeDebugBytes is the maximum number of user code bytes that may be // printed by debugDumpCode. maxCodeDebugBytes = 128 ) // Infof logs an formatted info message by calling log.Infof. func (t *Task) Infof(fmt string, v ...any) { if log.IsLogging(log.Info) { log.InfofAtDepth(1, *t.logPrefix.Load()+fmt, v...) } } // Warningf logs a warning string by calling log.Warningf. func (t *Task) Warningf(fmt string, v ...any) { if log.IsLogging(log.Warning) { log.WarningfAtDepth(1, *t.logPrefix.Load()+fmt, v...) } } // Debugf creates a debug string that includes the task ID. 
func (t *Task) Debugf(fmt string, v ...any) { if log.IsLogging(log.Debug) { log.DebugfAtDepth(1, *t.logPrefix.Load()+fmt, v...) } } // IsLogging returns true iff this level is being logged. func (t *Task) IsLogging(level log.Level) bool { return log.IsLogging(level) } // DebugDumpState logs task state at log level debug. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) DebugDumpState() { t.debugDumpRegisters() t.debugDumpStack() t.debugDumpCode() if mm := t.MemoryManager(); mm != nil { t.Debugf("Mappings:\n%s", mm) } t.Debugf("FDTable:\n%s", t.fdTable) } // debugDumpRegisters logs register state at log level debug. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) debugDumpRegisters() { if !t.IsLogging(log.Debug) { return } regmap, err := t.Arch().RegisterMap() if err != nil { t.Debugf("Registers: %v", err) } else { t.Debugf("Registers:") var regs []string for reg := range regmap { regs = append(regs, reg) } sort.Strings(regs) for _, reg := range regs { t.Debugf("%-8s = %016x", reg, regmap[reg]) } } } // debugDumpStack logs user stack contents at log level debug. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) debugDumpStack() { if !t.IsLogging(log.Debug) { return } m := t.MemoryManager() if m == nil { t.Debugf("Memory manager for task is gone, skipping application stack dump.") return } t.Debugf("Stack:") start := hostarch.Addr(t.Arch().Stack()) // Round addr down to a 16-byte boundary. start &= ^hostarch.Addr(15) // Print 16 bytes per line, one byte at a time. for offset := uint64(0); offset < maxStackDebugBytes; offset += 16 { addr, ok := start.AddLength(offset) if !ok { break } var data [16]byte n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ IgnorePermissions: true, }) // Print as much of the line as we can, even if an error was // encountered. if n > 0 { t.Debugf("%x: % x", addr, data[:n]) } if err != nil { t.Debugf("Error reading stack at address %x: %v", addr+hostarch.Addr(n), err) break } } } // debugDumpCode logs user code contents at log level debug. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) debugDumpCode() { if !t.IsLogging(log.Debug) { return } m := t.MemoryManager() if m == nil { t.Debugf("Memory manager for task is gone, skipping application code dump.") return } t.Debugf("Code:") // Print code on both sides of the instruction register. start := hostarch.Addr(t.Arch().IP()) - maxCodeDebugBytes/2 // Round addr down to a 16-byte boundary. start &= ^hostarch.Addr(15) // Print 16 bytes per line, one byte at a time. for offset := uint64(0); offset < maxCodeDebugBytes; offset += 16 { addr, ok := start.AddLength(offset) if !ok { break } var data [16]byte n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ IgnorePermissions: true, }) // Print as much of the line as we can, even if an error was // encountered. if n > 0 { t.Debugf("%x: % x", addr, data[:n]) } if err != nil { t.Debugf("Error reading stack at address %x: %v", addr+hostarch.Addr(n), err) break } } } // trace definitions. // // Note that all region names are prefixed by ':' in order to ensure that they // are lexically ordered before all system calls, which use the naked system // call name (e.g. "read") for maximum clarity. const ( traceCategory = "task" runRegion = ":run" blockRegion = ":block" faultRegion = ":fault" ) // updateInfoLocked updates the task's cached log prefix and tracing // information to reflect its current thread ID. 
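// For example, a task whose IDs agree across the root and its own PID namespace
// logs with a prefix like "[   4:   4] ", while a namespaced task logs both sets
// of IDs, e.g. "[  42(   2):  43(   3)] " (per the Sprintf formats below; the
// specific numbers are made up for illustration).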
// // Preconditions: The task's owning TaskSet.mu must be locked. func (t *Task) updateInfoLocked() { // Log the TID and PID in root pidns and t's pidns. rootPID := t.tg.pidns.owner.Root.tgids[t.tg] rootTID := t.tg.pidns.owner.Root.tids[t] pid := t.tg.pidns.tgids[t.tg] tid := t.tg.pidns.tids[t] if rootPID == pid && rootTID == tid { prefix := fmt.Sprintf("[% 4d:% 4d] ", pid, tid) t.logPrefix.Store(&prefix) } else { prefix := fmt.Sprintf("[% 4d(%4d):% 4d(%4d)] ", rootPID, pid, rootTID, tid) t.logPrefix.Store(&prefix) } t.rebuildTraceContext(rootTID) } // rebuildTraceContext rebuilds the trace context. // // Precondition: the passed tid must be the tid in the root namespace. func (t *Task) rebuildTraceContext(tid ThreadID) { // Re-initialize the trace context. if t.traceTask != nil { t.traceTask.End() } // Note that we define the "task type" to be the dynamic TID. This does // not align perfectly with the documentation for "tasks" in the // tracing package. Tasks may be assumed to be bounded by analysis // tools. However, if we just use a generic "task" type here, then the // "user-defined tasks" page on the tracing dashboard becomes nearly // unusable, as it loads all traces from all tasks. // // We can assume that the number of tasks in the system is not // arbitrarily large (in general it won't be, especially for cases // where we're collecting a brief profile), so using the TID is a // reasonable compromise in this case. t.traceContext, t.traceTask = trace.NewTask(context.Background(), fmt.Sprintf("tid:%d", tid)) } // traceCloneEvent is called when a new task is spawned. // // ntid must be the new task's ThreadID in the root namespace. func (t *Task) traceCloneEvent(ntid ThreadID) { if !trace.IsEnabled() { return } trace.Logf(t.traceContext, traceCategory, "spawn: %d", ntid) } // traceExitEvent is called when a task exits. func (t *Task) traceExitEvent() { if !trace.IsEnabled() { return } trace.Logf(t.traceContext, traceCategory, "exit status: %s", t.exitStatus) } // traceExecEvent is called when a task calls exec. func (t *Task) traceExecEvent(image *TaskImage) { if !trace.IsEnabled() { return } file := image.MemoryManager.Executable() if file == nil { trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") return } defer file.DecRef(t) // traceExecEvent function may be called before the task goroutine // starts, so we must use the async context. name := file.MappedName(t.AsyncContext()) trace.Logf(t.traceContext, traceCategory, "exec: %s", name) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_mutex.go000066400000000000000000000031601465435605700247430ustar00rootroot00000000000000package kernel import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type taskMutex struct { mu sync.Mutex } var taskprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var tasklockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type tasklockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. const ( taskLockChild = tasklockNameIndex(0) ) const () // Lock locks m. // +checklocksignore func (m *taskMutex) Lock() { locking.AddGLock(taskprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. 
// +checklocksignore func (m *taskMutex) NestedLock(i tasklockNameIndex) { locking.AddGLock(taskprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *taskMutex) Unlock() { locking.DelGLock(taskprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *taskMutex) NestedUnlock(i tasklockNameIndex) { locking.DelGLock(taskprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func taskinitLockNames() { tasklockNames = []string{"child"} } func init() { taskinitLockNames() taskprefixIndex = locking.NewMutexClass(reflect.TypeOf(taskMutex{}), tasklockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_net.go000066400000000000000000000044001465435605700243650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs" "gvisor.dev/gvisor/pkg/sentry/inet" ) // IsNetworkNamespaced returns true if t is in a non-root network namespace. func (t *Task) IsNetworkNamespaced() bool { return !t.netns.IsRoot() } // NetworkContext returns the network stack used by the task. NetworkContext // may return nil if no network stack is available. // // TODO(gvisor.dev/issue/1833): Migrate callers of this method to // NetworkNamespace(). func (t *Task) NetworkContext() inet.Stack { return t.netns.Stack() } // NetworkNamespace returns the network namespace observed by the task. func (t *Task) NetworkNamespace() *inet.Namespace { return t.netns } // GetNetworkNamespace takes a reference on the task network namespace and // returns it. It can return nil if the task isn't alive. func (t *Task) GetNetworkNamespace() *inet.Namespace { // t.mu is required to be sure that the network namespace will not be // released. t.mu.Lock() netns := t.netns if netns != nil { netns.IncRef() } t.mu.Unlock() return netns } // NetworkNamespaceByFD returns the network namespace associated with the specified descriptor. func (t *Task) NetworkNamespaceByFD(fd int32) (*inet.Namespace, error) { file := t.GetFile(fd) if file == nil { return nil, linuxerr.EBADF } defer file.DecRef(t) d, ok := file.Dentry().Impl().(*kernfs.Dentry) if !ok { return nil, linuxerr.EINVAL } i, ok := d.Inode().(*nsfs.Inode) if !ok { return nil, linuxerr.EINVAL } ns, ok := i.Namespace().(*inet.Namespace) if !ok { return nil, linuxerr.EINVAL } ns.IncRef() return ns, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_run.go000066400000000000000000000322361465435605700244130ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "runtime" "runtime/trace" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/goid" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/hostcpu" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" ) // A taskRunState is a reified state in the task state machine. See README.md // for details. The canonical list of all run states, as well as transitions // between them, is given in run_states.dot. // // The set of possible states is enumerable and completely defined by the // kernel package, so taskRunState would ideally be represented by a // discriminated union. However, Go does not support sum types. // // Hence, as with TaskStop, data-free taskRunStates should be represented as // typecast nils to avoid unnecessary allocation. type taskRunState interface { // execute executes the code associated with this state over the given task // and returns the following state. If execute returns nil, the task // goroutine should exit. // // It is valid to tail-call a following state's execute to avoid the // overhead of converting the following state to an interface object and // checking for stops, provided that the tail-call cannot recurse. execute(*Task) taskRunState } // run runs the task goroutine. // // threadID a dummy value set to the task's TID in the root PID namespace to // make it visible in stack dumps. A goroutine for a given task can be identified // searching for Task.run()'s argument value. func (t *Task) run(threadID uintptr) { t.goid.Store(goid.Get()) refs.CleanupSync.Add(1) defer refs.CleanupSync.Done() // Construct t.blockingTimer here. We do this here because we can't // reconstruct t.blockingTimer during restore in Task.afterLoad(), because // kernel.timekeeper.SetClocks() hasn't been called yet. blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier() t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier) defer t.blockingTimer.Destroy() t.blockingTimerChan = blockingTimerChan // Activate our address space. t.Activate() // The corresponding t.Deactivate occurs in the exit path // (runExitMain.execute) so that when // Platform.CooperativelySharesAddressSpace() == true, we give up the // AddressSpace before the task goroutine finishes executing. // If this is a newly-started task, it should check for participation in // group stops. If this is a task resuming after restore, it was // interrupted by saving. In either case, the task is initially // interrupted. t.interruptSelf() for { // Explanation for this ordering: // // - A freshly-started task that is stopped should not do anything // before it enters the stop. // // - If taskRunState.execute returns nil, the task goroutine should // exit without checking for a stop. // // - Task.Start won't start Task.run if t.runState is nil, so this // ordering is safe. 
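// As a standalone illustration of the taskRunState pattern described above (a
// sketch, not gVisor code): each state's execute returns the next state, and the
// driver loops until a state returns nil. Data-free states are typed nil pointers,
// mirroring the allocation-avoidance note above.
//
//	type runState interface{ execute() runState }
//
//	type stateA struct{}
//	type stateB struct{}
//
//	// stateA transitions to stateB; stateB ends the loop by returning nil.
//	func (*stateA) execute() runState { return (*stateB)(nil) }
//	func (*stateB) execute() runState { return nil }
//
//	func runLoop(start runState) {
//		for s := start; s != nil; s = s.execute() {
//		}
//	}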
t.doStop() t.runState = t.runState.execute(t) if t.runState == nil { t.accountTaskGoroutineEnter(TaskGoroutineNonexistent) t.goroutineStopped.Done() t.tg.liveGoroutines.Done() t.tg.pidns.owner.liveGoroutines.Done() t.tg.pidns.owner.runningGoroutines.Done() t.p.Release() // Deferring this store triggers a false positive in the race // detector (https://github.com/golang/go/issues/42599). t.goid.Store(0) // Keep argument alive because stack trace for dead variables may not be correct. runtime.KeepAlive(threadID) return } } } // doStop is called by Task.run to block until the task is not stopped. func (t *Task) doStop() { if t.stopCount.Load() == 0 { return } t.Deactivate() // NOTE(b/30316266): t.Activate() must be called without any locks held, so // this defer must precede the defer for unlocking the signal mutex. defer t.Activate() t.accountTaskGoroutineEnter(TaskGoroutineStopped) defer t.accountTaskGoroutineLeave(TaskGoroutineStopped) t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.tg.pidns.owner.runningGoroutines.Add(-1) defer t.tg.pidns.owner.runningGoroutines.Add(1) t.goroutineStopped.Add(-1) defer t.goroutineStopped.Add(1) for t.stopCount.RacyLoad() > 0 { t.endStopCond.Wait() } } // The runApp state checks for interrupts before executing untrusted // application code. // // +stateify savable type runApp struct{} func (app *runApp) execute(t *Task) taskRunState { if t.interrupted() { // Checkpointing instructs tasks to stop by sending an interrupt, so we // must check for stops before entering runInterrupt (instead of // tail-calling it). return (*runInterrupt)(nil) } // Execute any task work callbacks before returning to user space. if t.taskWorkCount.Load() > 0 { t.taskWorkMu.Lock() queue := t.taskWork t.taskWork = nil t.taskWorkCount.Store(0) t.taskWorkMu.Unlock() // Do not hold taskWorkMu while executing task work, which may register // more work. for _, work := range queue { work.TaskWork(t) } } // We're about to switch to the application again. If there's still an // unhandled SyscallRestartErrno that wasn't translated to an EINTR, // restart the syscall that was interrupted. If there's a saved signal // mask, restore it. (Note that restoring the saved signal mask may unblock // a pending signal, causing another interruption, but that signal should // not interact with the interrupted syscall.) if t.haveSyscallReturn { if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil { t.Warningf("Unable to pull a full state: %v", err) t.PrepareExit(linux.WaitStatusExit(int32(ExtractErrno(err, -1)))) return (*runExit)(nil) } if sre, ok := linuxerr.SyscallRestartErrorFromReturn(t.Arch().Return()); ok { if sre == linuxerr.ERESTART_RESTARTBLOCK { t.Debugf("Restarting syscall %d with restart block: not interrupted by handled signal", t.Arch().SyscallNo()) t.Arch().RestartSyscallWithRestartBlock() } else { t.Debugf("Restarting syscall %d: not interrupted by handled signal", t.Arch().SyscallNo()) t.Arch().RestartSyscall() } } t.haveSyscallReturn = false } if t.haveSavedSignalMask { t.SetSignalMask(t.savedSignalMask) t.haveSavedSignalMask = false if t.interrupted() { return (*runInterrupt)(nil) } } // Apply restartable sequences. 
if t.rseqPreempted { t.rseqPreempted = false if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 { t.rseqCPU = int32(hostcpu.GetCPU()) if err := t.rseqCopyOutCPU(); err != nil { t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) t.forceSignal(linux.SIGSEGV, false) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) // Re-enter the task run loop for signal delivery. return (*runApp)(nil) } if err := t.oldRSeqCopyOutCPU(); err != nil { t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err) t.forceSignal(linux.SIGSEGV, false) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) // Re-enter the task run loop for signal delivery. return (*runApp)(nil) } } t.rseqInterrupt() } // Check if we need to enable single-stepping. Tracers expect that the // kernel preserves the value of the single-step flag set by PTRACE_SETREGS // whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this // includes our ptrace platform, by the way), so we should only clear the // single-step flag if we're responsible for setting it. (clearSinglestep // is therefore analogous to Linux's TIF_FORCED_TF.) // // Strictly speaking, we should also not clear the single-step flag if we // single-step through an instruction that sets the single-step flag // (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their // own TF. (Famous last words, I know.) clearSinglestep := false if t.hasTracer() { t.tg.pidns.owner.mu.RLock() if t.ptraceSinglestep { clearSinglestep = !t.Arch().SingleStep() t.Arch().SetSingleStep() } t.tg.pidns.owner.mu.RUnlock() } region := trace.StartRegion(t.traceContext, runRegion) t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU) t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) region.End() if clearSinglestep { t.Arch().ClearSingleStep() } if t.hasTracer() { if e := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); e != nil { t.Warningf("Unable to pull a full state: %v", e) err = e } } switch err { case nil: // Handle application system call. return t.doSyscall() case platform.ErrContextInterrupt: // Interrupted by platform.Context.Interrupt(). Re-enter the run // loop to figure out why. return (*runApp)(nil) case platform.ErrContextSignal: // Looks like a signal has been delivered to us. If it's a synchronous // signal (SEGV, SIGBUS, etc.), it should be sent to the application // thread that received it. sig := linux.Signal(info.Signo) // Was it a fault that we should handle internally? If so, this wasn't // an application-generated signal and we should continue execution // normally. if at.Any() { faultCounter.Increment() region := trace.StartRegion(t.traceContext, faultRegion) addr := hostarch.Addr(info.Addr()) err := t.MemoryManager().HandleUserFault(t, addr, at, hostarch.Addr(t.Arch().Stack())) region.End() if err == nil { // The fault was handled appropriately. // We can resume running the application. return (*runApp)(nil) } // Is this a vsyscall that we need emulate? // // Note that we don't track vsyscalls as part of a // specific trace region. This is because regions don't // stack, and the actual system call will count as a // region. We should be able to easily identify // vsyscalls by having a pair. if at.Execute { if sysno, ok := t.image.st.LookupEmulate(addr); ok { return t.doVsyscall(addr, sysno) } } // Faults are common, log only at debug level. 
t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v sig=%v err=%v", addr, t.Arch().IP(), at, sig, err) t.DebugDumpState() // Continue to signal handling. // // Convert a BusError error to a SIGBUS from a SIGSEGV. All // other info bits stay the same (address, etc.). if _, ok := err.(*memmap.BusError); ok { sig = linux.SIGBUS info.Signo = int32(linux.SIGBUS) } } switch sig { case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP: // Synchronous signal. Send it to ourselves. Assume the signal is // legitimate and force it (work around the signal being ignored or // blocked) like Linux does. Conveniently, this is even the correct // behavior for SIGTRAP from single-stepping. t.forceSignal(linux.Signal(sig), false /* unconditional */) t.SendSignal(info) case platform.SignalInterrupt: // Assume that a call to platform.Context.Interrupt() misfired. case linux.SIGPROF: // It's a profiling interrupt: there's not much // we can do. We've already paid a decent cost // by intercepting the signal, at this point we // simply ignore it. default: // Asynchronous signal. Let the system deal with it. t.k.sendExternalSignal(info, "application") } return (*runApp)(nil) case platform.ErrContextCPUPreempted: // Ensure that rseq critical sections are interrupted and per-thread // CPU values are updated before the next platform.Context.Switch(). t.rseqPreempted = true return (*runApp)(nil) default: // What happened? Can't continue. t.Warningf("Unexpected SwitchToApp error: %v", err) t.PrepareExit(linux.WaitStatusExit(int32(ExtractErrno(err, -1)))) return (*runExit)(nil) } } // assertTaskGoroutine panics if the caller is not running on t's task // goroutine. func (t *Task) assertTaskGoroutine() { if got, want := goid.Get(), t.goid.Load(); got != want { panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want)) } } // GoroutineID returns the ID of t's task goroutine. func (t *Task) GoroutineID() int64 { return t.goid.Load() } // waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits. func (t *Task) waitGoroutineStoppedOrExited() { t.goroutineStopped.Wait() } // WaitExited blocks until all task goroutines in tg have exited. // // WaitExited does not correspond to anything in Linux; it's provided so that // external callers of Kernel.CreateProcess can wait for the created thread // group to terminate. func (tg *ThreadGroup) WaitExited() { tg.liveGoroutines.Wait() } // Yield yields the processor for the calling task. func (t *Task) Yield() { t.yieldCount.Add(1) runtime.Gosched() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_sched.go000066400000000000000000000530521465435605700246740ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel // CPU scheduling, real and fake. 
import ( "fmt" "math/rand" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" ) // TaskGoroutineState is a coarse representation of the current execution // status of a kernel.Task goroutine. type TaskGoroutineState int const ( // TaskGoroutineNonexistent indicates that the task goroutine has either // not yet been created by Task.Start() or has returned from Task.run(). // This must be the zero value for TaskGoroutineState. TaskGoroutineNonexistent TaskGoroutineState = iota // TaskGoroutineRunningSys indicates that the task goroutine is executing // sentry code. TaskGoroutineRunningSys // TaskGoroutineRunningApp indicates that the task goroutine is executing // application code. TaskGoroutineRunningApp // TaskGoroutineBlockedInterruptible indicates that the task goroutine is // blocked in Task.block(), and hence may be woken by Task.interrupt() // (e.g. due to signal delivery). TaskGoroutineBlockedInterruptible // TaskGoroutineBlockedUninterruptible indicates that the task goroutine is // stopped outside of Task.block() and Task.doStop(), and hence cannot be // woken by Task.interrupt(). TaskGoroutineBlockedUninterruptible // TaskGoroutineStopped indicates that the task goroutine is blocked in // Task.doStop(). TaskGoroutineStopped is similar to // TaskGoroutineBlockedUninterruptible, but is a separate state to make it // possible to determine when Task.stop is meaningful. TaskGoroutineStopped ) // TaskGoroutineSchedInfo contains task goroutine scheduling state which must // be read and updated atomically. // // +stateify savable type TaskGoroutineSchedInfo struct { // Timestamp was the value of Kernel.cpuClock when this // TaskGoroutineSchedInfo was last updated. Timestamp uint64 // State is the current state of the task goroutine. State TaskGoroutineState // UserTicks is the amount of time the task goroutine has spent executing // its associated Task's application code, in units of linux.ClockTick. UserTicks uint64 // SysTicks is the amount of time the task goroutine has spent executing in // the sentry, in units of linux.ClockTick. SysTicks uint64 } // userTicksAt returns the extrapolated value of ts.UserTicks after // Kernel.CPUClockNow() indicates a time of now. // // Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is // monotonic, this is satisfied if now is the result of a previous call to // Kernel.CPUClockNow().) This requirement exists because otherwise a racing // change to t.gosched can cause userTicksAt to adjust stats by too much, // making the observed stats non-monotonic. func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 { if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp { // Update stats to reflect execution since the last update. return ts.UserTicks + (now - ts.Timestamp) } return ts.UserTicks } // sysTicksAt returns the extrapolated value of ts.SysTicks after // Kernel.CPUClockNow() indicates a time of now. // // Preconditions: As for userTicksAt. func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 { if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys { return ts.SysTicks + (now - ts.Timestamp) } return ts.SysTicks } // Preconditions: The caller must be running on the task goroutine. 
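// A worked example of the extrapolation above, with made-up numbers: if a task
// entered application code when Kernel.cpuClock read 100 and UserTicks was 40 at
// that point, then userTicksAt(130) reports 40 + (130 - 100) = 70 ticks. Nothing
// is written back; the stored TaskGoroutineSchedInfo only advances in the
// accounting functions that follow, starting with accountTaskGoroutineEnter.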
func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { now := t.k.CPUClockNow() if t.gosched.State != TaskGoroutineRunningSys { panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state)) } t.goschedSeq.BeginWrite() // This function is very hot; avoid defer. t.gosched.SysTicks += now - t.gosched.Timestamp t.gosched.Timestamp = now t.gosched.State = state t.goschedSeq.EndWrite() if state != TaskGoroutineRunningApp { // Task is blocking/stopping. t.k.decRunningTasks() } } // Preconditions: // - The caller must be running on the task goroutine // - The caller must be leaving a state indicated by a previous call to // t.accountTaskGoroutineEnter(state). func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { if state != TaskGoroutineRunningApp { // Task is unblocking/continuing. t.k.incRunningTasks() } now := t.k.CPUClockNow() if t.gosched.State != state { panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys)) } t.goschedSeq.BeginWrite() // This function is very hot; avoid defer. if state == TaskGoroutineRunningApp { t.gosched.UserTicks += now - t.gosched.Timestamp } t.gosched.Timestamp = now t.gosched.State = TaskGoroutineRunningSys t.goschedSeq.EndWrite() } // Preconditions: The caller must be running on the task goroutine. func (t *Task) accountTaskGoroutineRunning() { now := t.k.CPUClockNow() if t.gosched.State != TaskGoroutineRunningSys { panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys)) } t.goschedSeq.BeginWrite() t.gosched.SysTicks += now - t.gosched.Timestamp t.gosched.Timestamp = now t.goschedSeq.EndWrite() } // TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info. // Most clients should use t.CPUStats() instead. func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo { return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched) } // CPUStats returns the CPU usage statistics of t. func (t *Task) CPUStats() usage.CPUStats { return t.cpuStatsAt(t.k.CPUClockNow()) } // Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. func (t *Task) cpuStatsAt(now uint64) usage.CPUStats { tsched := t.TaskGoroutineSchedInfo() return usage.CPUStats{ UserTime: time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)), SysTime: time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)), VoluntarySwitches: t.yieldCount.Load(), } } // CPUStats returns the combined CPU usage statistics of all past and present // threads in tg. func (tg *ThreadGroup) CPUStats() usage.CPUStats { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() // Hack to get a pointer to the Kernel. if tg.leader == nil { // Per comment on tg.leader, this is only possible if nothing in the // ThreadGroup has ever executed anyway. return usage.CPUStats{} } return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow()) } // Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus: // - The TaskSet mutex must be locked. func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats { stats := tg.exitedCPUStats // Account for live tasks. for t := tg.tasks.Front(); t != nil; t = t.Next() { stats.Accumulate(t.cpuStatsAt(now)) } return stats } // JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return // resource usage statistics for all children of [tg] that have terminated and // been waited for. 
These statistics will include the resources used by // grandchildren, and further removed descendants, if all of the intervening // descendants waited on their terminated children." func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() return tg.childCPUStats } // taskClock is a ktime.Clock that measures the time that a task has spent // executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID. // // +stateify savable type taskClock struct { t *Task // If includeSys is true, the taskClock includes both time spent executing // application code as well as time spent in the sentry. Otherwise, the // taskClock includes only time spent executing application code. includeSys bool // Implements waiter.Waitable. TimeUntil wouldn't change its estimation // based on either of the clock events, so there's no event to be // notified for. ktime.NoClockEvents `state:"nosave"` // Implements ktime.Clock.WallTimeUntil. // // As an upper bound, a task's clock cannot advance faster than CPU // time. It would have to execute at a rate of more than 1 task-second // per 1 CPU-second, which isn't possible. ktime.WallRateClock `state:"nosave"` } // UserCPUClock returns a clock measuring the CPU time the task has spent // executing application code. func (t *Task) UserCPUClock() ktime.Clock { return &taskClock{t: t, includeSys: false} } // CPUClock returns a clock measuring the CPU time the task has spent executing // application and "kernel" code. func (t *Task) CPUClock() ktime.Clock { return &taskClock{t: t, includeSys: true} } // Now implements ktime.Clock.Now. func (tc *taskClock) Now() ktime.Time { stats := tc.t.CPUStats() if tc.includeSys { return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) } return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) } // tgClock is a ktime.Clock that measures the time a thread group has spent // executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID. // // +stateify savable type tgClock struct { tg *ThreadGroup // If includeSys is true, the tgClock includes both time spent executing // application code as well as time spent in the sentry. Otherwise, the // tgClock includes only time spent executing application code. includeSys bool // Implements waiter.Waitable. ktime.ClockEventsQueue `state:"nosave"` } // Now implements ktime.Clock.Now. func (tgc *tgClock) Now() ktime.Time { stats := tgc.tg.CPUStats() if tgc.includeSys { return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) } return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) } // WallTimeUntil implements ktime.Clock.WallTimeUntil. func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration { // Thread group CPU time should not exceed wall time * live tasks, since // task goroutines exit after the transition to TaskExitZombie in // runExitNotify. tgc.tg.pidns.owner.mu.RLock() n := tgc.tg.liveTasks tgc.tg.pidns.owner.mu.RUnlock() if n == 0 { if t.Before(now) { return 0 } // The timer tick raced with thread group exit, after which no more // tasks can enter the thread group. So tgc.Now() will never advance // again. Return a large delay; the timer should be stopped long before // it comes again anyway. return time.Hour } // This is a lower bound on the amount of time that can elapse before an // associated timer expires, so returning this value tends to result in a // sequence of closely-spaced ticks just before timer expiry. 
To avoid // this, round up to the nearest ClockTick; CPU usage measurements are // limited to this resolution anyway. remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick } // UserCPUClock returns a ktime.Clock that measures the time that a thread // group has spent executing. func (tg *ThreadGroup) UserCPUClock() ktime.Clock { return &tgClock{tg: tg, includeSys: false} } // CPUClock returns a ktime.Clock that measures the time that a thread group // has spent executing, including sentry time. func (tg *ThreadGroup) CPUClock() ktime.Clock { return &tgClock{tg: tg, includeSys: true} } func (k *Kernel) runCPUClockTicker() { rng := rand.New(rand.NewSource(rand.Int63())) var tgs []*ThreadGroup for { // Stop the CPU clock while nothing is running. if k.runningTasks.Load() == 0 { k.runningTasksMu.Lock() if k.runningTasks.Load() == 0 { k.cpuClockTickerRunning = false k.cpuClockTickerStopCond.Broadcast() k.runningTasksCond.Wait() // k.cpuClockTickerRunning was set to true by our waker // (Kernel.incRunningTasks()). For reasons described there, we must // process at least one CPU clock tick between calls to // k.runningTasksCond.Wait(). } k.runningTasksMu.Unlock() } // Wait for the next CPU clock tick. select { case <-k.cpuClockTickTimer.C: k.cpuClockTickTimer.Reset(linux.ClockTick) case <-k.cpuClockTickerWakeCh: continue } // Advance the CPU clock, and timers based on the CPU clock, atomically // under cpuClockMu. k.cpuClockMu.Lock() now := k.cpuClock.Add(1) // Check thread group CPU timers. tgs = k.tasks.Root.ThreadGroupsAppend(tgs) for _, tg := range tgs { if tg.cpuTimersEnabled.Load() == 0 { continue } k.tasks.mu.RLock() if tg.leader == nil { // No tasks have ever run in this thread group. k.tasks.mu.RUnlock() continue } // Accumulate thread group CPU stats, and randomly select running tasks // using reservoir sampling to receive CPU timer signals. var virtReceiver *Task nrVirtCandidates := 0 var profReceiver *Task nrProfCandidates := 0 tgUserTime := tg.exitedCPUStats.UserTime tgSysTime := tg.exitedCPUStats.SysTime for t := tg.tasks.Front(); t != nil; t = t.Next() { tsched := t.TaskGoroutineSchedInfo() tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)) tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)) switch tsched.State { case TaskGoroutineRunningApp: // Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU // timers. nrVirtCandidates++ if int(randInt31n(rng, int32(nrVirtCandidates))) == 0 { virtReceiver = t } fallthrough case TaskGoroutineRunningSys: // Considered by ITIMER_PROF and RLIMIT_CPU timers. nrProfCandidates++ if int(randInt31n(rng, int32(nrProfCandidates))) == 0 { profReceiver = t } } } tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds()) tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds()) // All of the following are standard (not real-time) signals, which are // automatically deduplicated, so we ignore the number of expirations. tg.signalHandlers.mu.Lock() // It should only be possible for these timers to advance if we found // at least one running task. 
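// The receiver selection above is reservoir sampling with a reservoir of size one:
// the i-th eligible task replaces the current candidate with probability 1/i, so
// each of the n candidates ends up selected with probability 1/n. A minimal
// standalone sketch of the same idea (not gVisor code; assumes a *rand.Rand from
// math/rand):
//
//	func pickOne(rng *rand.Rand, items []int) (choice int, ok bool) {
//		n := 0
//		for _, it := range items {
//			n++
//			if rng.Intn(n) == 0 { // replace with probability 1/n
//				choice, ok = it, true
//			}
//		}
//		return choice, ok
//	}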
if virtReceiver != nil { // ITIMER_VIRTUAL newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow) tg.itimerVirtSetting = newItimerVirtSetting if exp != 0 { virtReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGVTALRM), true) } } if profReceiver != nil { // ITIMER_PROF newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow) tg.itimerProfSetting = newItimerProfSetting if exp != 0 { profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGPROF), true) } // RLIMIT_CPU soft limit newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow) tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting if exp != 0 { profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGXCPU), true) } // RLIMIT_CPU hard limit rlimitCPUMax := tg.limits.Get(limits.CPU).Max if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) { profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true) } } tg.signalHandlers.mu.Unlock() k.tasks.mu.RUnlock() } k.cpuClockMu.Unlock() // Retain tgs between calls to Notify to reduce allocations. for i := range tgs { tgs[i] = nil } tgs = tgs[:0] } } // randInt31n returns a random integer in [0, n). // // randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported. // See that function for details. func randInt31n(rng *rand.Rand, n int32) int32 { v := rng.Uint32() prod := uint64(v) * uint64(n) low := uint32(prod) if low < uint32(n) { thresh := uint32(-n) % uint32(n) for low < thresh { v = rng.Uint32() prod = uint64(v) * uint64(n) low = uint32(prod) } } return int32(prod >> 32) } // NotifyRlimitCPUUpdated is called by setrlimit. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) NotifyRlimitCPUUpdated() { t.k.cpuClockMu.Lock() defer t.k.cpuClockMu.Unlock() t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() rlimitCPU := t.tg.limits.Get(limits.CPU) t.tg.rlimitCPUSoftSetting = ktime.Setting{ Enabled: rlimitCPU.Cur != limits.Infinity, Next: ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()), Period: time.Second, } if rlimitCPU.Max != limits.Infinity { // Check if tg is already over the hard limit. tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow()) tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds()) if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) { t.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true) } } t.tg.updateCPUTimersEnabledLocked() } // Preconditions: The signal mutex must be locked. func (tg *ThreadGroup) updateCPUTimersEnabledLocked() { rlimitCPU := tg.limits.Get(limits.CPU) if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity { tg.cpuTimersEnabled.Store(1) } else { tg.cpuTimersEnabled.Store(0) } } // StateStatus returns a string representation of the task's current state, // appropriate for /proc/[pid]/status. func (t *Task) StateStatus() string { switch s := t.TaskGoroutineSchedInfo().State; s { case TaskGoroutineNonexistent, TaskGoroutineRunningSys: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() switch t.exitState { case TaskExitZombie: return "Z (zombie)" case TaskExitDead: return "X (dead)" default: // The task goroutine can't exit before passing through // runExitNotify, so if s == TaskGoroutineNonexistent, the task has // been created but the task goroutine hasn't yet started. 
The // Linux equivalent is struct task_struct::state == TASK_NEW // (kernel/fork.c:copy_process() => // kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is // masked out by TASK_REPORT for /proc/[pid]/status, leaving only // TASK_RUNNING. return "R (running)" } case TaskGoroutineRunningApp: return "R (running)" case TaskGoroutineBlockedInterruptible: return "S (sleeping)" case TaskGoroutineStopped: t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() switch t.stop.(type) { case *groupStop: return "T (stopped)" case *ptraceStop: return "t (tracing stop)" } fallthrough case TaskGoroutineBlockedUninterruptible: // This is the name Linux uses for TASK_UNINTERRUPTIBLE and // TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL): // fs/proc/array.c:task_state_array. return "D (disk sleep)" default: panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s)) } } // CPUMask returns a copy of t's allowed CPU mask. func (t *Task) CPUMask() sched.CPUSet { t.mu.Lock() defer t.mu.Unlock() return t.allowedCPUMask.Copy() } // SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of // mask. // // Preconditions: mask.Size() == // sched.CPUSetSize(t.Kernel().ApplicationCores()). func (t *Task) SetCPUMask(mask sched.CPUSet) error { if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want { panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want)) } // Remove CPUs in mask above Kernel.applicationCores. mask.ClearAbove(t.k.applicationCores) // Ensure that at least 1 CPU is still allowed. if mask.NumCPUs() == 0 { return linuxerr.EINVAL } if t.k.useHostCores { // No-op; pretend the mask was immediately changed back. return nil } t.tg.pidns.owner.mu.RLock() rootTID := t.tg.pidns.owner.Root.tids[t] t.tg.pidns.owner.mu.RUnlock() t.mu.Lock() defer t.mu.Unlock() t.allowedCPUMask = mask t.cpu.Store(assignCPU(mask, rootTID)) return nil } // CPU returns the cpu id for a given task. func (t *Task) CPU() int32 { if t.k.useHostCores { return int32(hostcpu.GetCPU()) } return t.cpu.Load() } // assignCPU returns the virtualized CPU number for the task with global TID // tid and allowedCPUMask allowed. func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) { // To pretend that threads are evenly distributed to allowed CPUs, choose n // to be less than the number of CPUs in allowed ... n := int(tid) % int(allowed.NumCPUs()) // ... then pick the nth CPU in allowed. allowed.ForEachCPU(func(c uint) { if n == 0 { cpu = int32(c) } n-- }) return cpu } // Niceness returns t's niceness. func (t *Task) Niceness() int { t.mu.Lock() defer t.mu.Unlock() return t.niceness } // Priority returns t's priority. func (t *Task) Priority() int { t.mu.Lock() defer t.mu.Unlock() return t.niceness + 20 } // SetNiceness sets t's niceness to n. func (t *Task) SetNiceness(n int) { t.mu.Lock() defer t.mu.Unlock() t.niceness = n } // NumaPolicy returns t's current numa policy. func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) { t.mu.Lock() defer t.mu.Unlock() return t.numaPolicy, t.numaNodeMask } // SetNumaPolicy sets t's numa policy. func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) { t.mu.Lock() defer t.mu.Unlock() t.numaPolicy = policy t.numaNodeMask = nodeMask } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_signals.go000066400000000000000000001156641465435605700252560ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel // This file defines the behavior of task signal handling. import ( "fmt" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" "gvisor.dev/gvisor/pkg/waiter" ) // SignalAction is an internal signal action. type SignalAction int // Available signal actions. // Note that although we refer the complete set internally, // the application is only capable of using the Default and // Ignore actions from the system call interface. const ( SignalActionTerm SignalAction = iota SignalActionCore SignalActionStop SignalActionIgnore SignalActionHandler ) // Default signal handler actions. Note that for most signals, // (except SIGKILL and SIGSTOP) these can be overridden by the app. var defaultActions = map[linux.Signal]SignalAction{ // POSIX.1-1990 standard. linux.SIGHUP: SignalActionTerm, linux.SIGINT: SignalActionTerm, linux.SIGQUIT: SignalActionCore, linux.SIGILL: SignalActionCore, linux.SIGABRT: SignalActionCore, linux.SIGFPE: SignalActionCore, linux.SIGKILL: SignalActionTerm, // but see ThreadGroup.applySignalSideEffects linux.SIGSEGV: SignalActionCore, linux.SIGPIPE: SignalActionTerm, linux.SIGALRM: SignalActionTerm, linux.SIGTERM: SignalActionTerm, linux.SIGUSR1: SignalActionTerm, linux.SIGUSR2: SignalActionTerm, linux.SIGCHLD: SignalActionIgnore, linux.SIGCONT: SignalActionIgnore, // but see ThreadGroup.applySignalSideEffects linux.SIGSTOP: SignalActionStop, linux.SIGTSTP: SignalActionStop, linux.SIGTTIN: SignalActionStop, linux.SIGTTOU: SignalActionStop, // POSIX.1-2001 standard. linux.SIGBUS: SignalActionCore, linux.SIGPROF: SignalActionTerm, linux.SIGSYS: SignalActionCore, linux.SIGTRAP: SignalActionCore, linux.SIGURG: SignalActionIgnore, linux.SIGVTALRM: SignalActionTerm, linux.SIGXCPU: SignalActionCore, linux.SIGXFSZ: SignalActionCore, // The rest on linux. linux.SIGSTKFLT: SignalActionTerm, linux.SIGIO: SignalActionTerm, linux.SIGPWR: SignalActionTerm, linux.SIGWINCH: SignalActionIgnore, } // computeAction figures out what to do given a signal number // and an linux.SigAction. SIGSTOP always results in a SignalActionStop, // and SIGKILL always results in a SignalActionTerm. // Signal 0 is always ignored as many programs use it for various internal functions // and don't expect it to do anything. // // In the event the signal is not one of these, act.Handler determines what // happens next. // If act.Handler is: // 0, the default action is taken; // 1, the signal is ignored; // anything else, the function returns SignalActionHandler. 
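// For example, using the default table above (handlerAddr stands for any nonzero,
// non-special handler address; a sketch, not an exhaustive list):
//
//	computeAction(linux.SIGCHLD, linux.SigAction{Handler: linux.SIG_DFL}) // SignalActionIgnore (default for SIGCHLD)
//	computeAction(linux.SIGSEGV, linux.SigAction{Handler: linux.SIG_IGN}) // SignalActionIgnore
//	computeAction(linux.SIGKILL, linux.SigAction{Handler: linux.SIG_IGN}) // SignalActionTerm (cannot be overridden)
//	computeAction(linux.SIGUSR1, linux.SigAction{Handler: handlerAddr})   // SignalActionHandler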
func computeAction(sig linux.Signal, act linux.SigAction) SignalAction { switch sig { case linux.SIGSTOP: return SignalActionStop case linux.SIGKILL: return SignalActionTerm case linux.Signal(0): return SignalActionIgnore } switch act.Handler { case linux.SIG_DFL: return defaultActions[sig] case linux.SIG_IGN: return SignalActionIgnore default: return SignalActionHandler } } // UnblockableSignals contains the set of signals which cannot be blocked. var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP) // StopSignals is the set of signals whose default action is SignalActionStop. var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU) // dequeueSignalLocked returns a pending signal that is *not* included in mask. // If there are no pending unmasked signals, dequeueSignalLocked returns nil. // // Preconditions: t.tg.signalHandlers.mu must be locked. func (t *Task) dequeueSignalLocked(mask linux.SignalSet) *linux.SignalInfo { if info := t.pendingSignals.dequeue(mask); info != nil { return info } return t.tg.pendingSignals.dequeue(mask) } // discardSpecificLocked removes all instances of the given signal from all // signal queues in tg. // // Preconditions: The signal mutex must be locked. func (tg *ThreadGroup) discardSpecificLocked(sig linux.Signal) { tg.pendingSignals.discardSpecific(sig) for t := tg.tasks.Front(); t != nil; t = t.Next() { t.pendingSignals.discardSpecific(sig) } } // PendingSignals returns the set of pending signals. func (t *Task) PendingSignals() linux.SignalSet { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() return t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet } // deliverSignal delivers the given signal and returns the following run state. func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRunState { sig := linux.Signal(info.Signo) sigact := computeAction(sig, act) if t.haveSyscallReturn { if sre, ok := linuxerr.SyscallRestartErrorFromReturn(t.Arch().Return()); ok { // Signals that are ignored, cause a thread group stop, or // terminate the thread group do not interact with interrupted // syscalls; in Linux terms, they are never returned to the signal // handling path from get_signal => get_signal_to_deliver. The // behavior of an interrupted syscall is determined by the first // signal that is actually handled (by userspace). if sigact == SignalActionHandler { switch { case sre == linuxerr.ERESTARTNOHAND: fallthrough case sre == linuxerr.ERESTART_RESTARTBLOCK: fallthrough case (sre == linuxerr.ERESTARTSYS && act.Flags&linux.SA_RESTART == 0): t.Debugf("Not restarting syscall %d after error %v: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) t.Arch().SetReturn(uintptr(-ExtractErrno(linuxerr.EINTR, -1))) default: t.Debugf("Restarting syscall %d: interrupted by signal %d", t.Arch().SyscallNo(), info.Signo) t.Arch().RestartSyscall() } } } } switch sigact { case SignalActionTerm, SignalActionCore: // "Default action is to terminate the process." - signal(7) // Emit an event channel messages related to this uncaught signal. ucs := &ucspb.UncaughtSignal{ Tid: int32(t.Kernel().TaskSet().Root.IDOfTask(t)), Pid: int32(t.Kernel().TaskSet().Root.IDOfThreadGroup(t.ThreadGroup())), Registers: t.Arch().StateData().Proto(), SignalNumber: info.Signo, } // Attach an fault address if appropriate. 
switch sig { case linux.SIGSEGV, linux.SIGFPE, linux.SIGILL, linux.SIGTRAP, linux.SIGBUS: ucs.FaultAddr = info.Addr() } t.Debugf("Signal %d, PID: %d, TID: %d, fault addr: %#x: terminating thread group", info.Signo, ucs.Pid, ucs.Tid, ucs.FaultAddr) eventchannel.Emit(ucs) t.PrepareGroupExit(linux.WaitStatusTerminationSignal(sig)) return (*runExit)(nil) case SignalActionStop: // "Default action is to stop the process." t.initiateGroupStop(info) case SignalActionIgnore: // "Default action is to ignore the signal." t.Debugf("Signal %d: ignored", info.Signo) case SignalActionHandler: // Try to deliver the signal to the user-configured handler. t.Debugf("Signal %d: delivering to handler", info.Signo) if err := t.deliverSignalToHandler(info, act); err != nil { // This is not a warning, it can occur during normal operation. t.Debugf("Failed to deliver signal %+v to user handler: %v", info, err) // Send a forced SIGSEGV. If the signal that couldn't be delivered // was a SIGSEGV, force the handler to SIG_DFL. t.forceSignal(linux.SIGSEGV, sig == linux.SIGSEGV /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) } default: panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(sig, act))) } return (*runInterrupt)(nil) } // deliverSignalToHandler changes the task's userspace state to enter the given // user-configured handler for the given signal. func (t *Task) deliverSignalToHandler(info *linux.SignalInfo, act linux.SigAction) error { // Signal delivery to an application handler interrupts restartable // sequences. t.rseqInterrupt() // Are executing on the main stack, // or the provided alternate stack? sp := hostarch.Addr(t.Arch().Stack()) // N.B. This is a *copy* of the alternate stack that the user's signal // handler expects to see in its ucontext (even if it's not in use). alt := t.signalStack if act.Flags&linux.SA_ONSTACK != 0 && alt.IsEnabled() { alt.Flags |= linux.SS_ONSTACK if !alt.Contains(sp) { sp = hostarch.Addr(alt.Top()) } } mm := t.MemoryManager() // Set up the signal handler. If we have a saved signal mask, the signal // handler should run with the current mask, but sigreturn should restore // the saved one. st := &arch.Stack{ Arch: t.Arch(), IO: mm, Bottom: sp, } mask := linux.SignalSet(t.signalMask.Load()) if t.haveSavedSignalMask { mask = t.savedSignalMask } // Set up the restorer. // x86-64 should always uses SA_RESTORER, but this flag is optional on other platforms. // Please see the linux code as reference: // linux/arch/x86/kernel/signal.c:__setup_rt_frame() // If SA_RESTORER is not configured, we can use the sigreturn trampolines // the vdso provides instead. // Please see the linux code as reference: // linux/arch/arm64/kernel/signal.c:setup_return() if act.Flags&linux.SA_RESTORER == 0 { act.Restorer = mm.VDSOSigReturn() } if err := t.Arch().SignalSetup(st, &act, info, &alt, mask, t.k.featureSet); err != nil { return err } t.p.FullStateChanged() t.haveSavedSignalMask = false // Add our signal mask. newMask := linux.SignalSet(t.signalMask.Load()) | act.Mask if act.Flags&linux.SA_NODEFER == 0 { newMask |= linux.SignalSetOf(linux.Signal(info.Signo)) } t.SetSignalMask(newMask) return nil } var ctrlResume = &SyscallControl{ignoreReturn: true} // SignalReturn implements sigreturn(2) (if rt is false) or rt_sigreturn(2) (if // rt is true). func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { st := t.Stack() sigset, alt, err := t.Arch().SignalRestore(st, rt, t.k.featureSet) if err != nil { // sigreturn syscalls never return errors. 
t.Debugf("failed to restore from a signal frame: %v", err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return nil, err } // Attempt to record the given signal stack. Note that we silently // ignore failures here, as does Linux. Only an EFAULT may be // generated, but SignalRestore has already deserialized the entire // frame successfully. t.SetSignalStack(alt) // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. t.SetSignalMask(sigset &^ UnblockableSignals) t.p.FullStateChanged() return ctrlResume, nil } // Sigtimedwait implements the semantics of sigtimedwait(2). // // Preconditions: // - The caller must be running on the task goroutine. // - t.exitState < TaskExitZombie. func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*linux.SignalInfo, error) { // set is the set of signals we're interested in; invert it to get the set // of signals to block. mask := ^(set &^ UnblockableSignals) t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() if info := t.dequeueSignalLocked(mask); info != nil { return info, nil } if timeout == 0 { return nil, linuxerr.EAGAIN } // Unblock signals we're waiting for. Remember the original signal mask so // that Task.sendSignalTimerLocked doesn't discard ignored signals that // we're temporarily unblocking. t.realSignalMask = linux.SignalSet(t.signalMask.RacyLoad()) t.setSignalMaskLocked(t.realSignalMask & mask) // Wait for a timeout or new signal. t.tg.signalHandlers.mu.Unlock() _, err := t.BlockWithTimeout(nil, true, timeout) t.tg.signalHandlers.mu.Lock() // Restore the original signal mask. t.setSignalMaskLocked(t.realSignalMask) t.realSignalMask = 0 if info := t.dequeueSignalLocked(mask); info != nil { return info, nil } if err == linuxerr.ETIMEDOUT { return nil, linuxerr.EAGAIN } return nil, err } // SendSignal sends the given signal to t. // // The following errors may be returned: // // linuxerr.ESRCH - The task has exited. // linuxerr.EINVAL - The signal is not valid. // linuxerr.EAGAIN - THe signal is realtime, and cannot be queued. func (t *Task) SendSignal(info *linux.SignalInfo) error { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() return t.sendSignalLocked(info, false /* group */) } // SendGroupSignal sends the given signal to t's thread group. func (t *Task) SendGroupSignal(info *linux.SignalInfo) error { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() return t.sendSignalLocked(info, true /* group */) } // SendSignal sends the given signal to tg, using tg's leader to determine if // the signal is blocked. func (tg *ThreadGroup) SendSignal(info *linux.SignalInfo) error { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() return tg.leader.sendSignalLocked(info, true /* group */) } func (t *Task) sendSignalLocked(info *linux.SignalInfo, group bool) error { return t.sendSignalTimerLocked(info, group, nil) } func (t *Task) sendSignalTimerLocked(info *linux.SignalInfo, group bool, timer *IntervalTimer) error { if t.exitState == TaskExitDead { return linuxerr.ESRCH } sig := linux.Signal(info.Signo) if sig == 0 { return nil } if !sig.IsValid() { return linuxerr.EINVAL } // Signal side effects apply even if the signal is ultimately discarded. 
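// For example (a sketch of one consequence): an unmasked SIGCONT whose disposition
// is "ignore", sent to an untraced task, is dropped by the queueing logic further
// below without ever being enqueued, yet the applySignalSideEffectsLocked call
// that immediately follows still wakes any group-stopped tasks before that discard
// decision is made.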
t.tg.applySignalSideEffectsLocked(sig) // TODO: "Only signals for which the "init" process has established a // signal handler can be sent to the "init" process by other members of the // PID namespace. This restriction applies even to privileged processes, // and prevents other members of the PID namespace from accidentally // killing the "init" process." - pid_namespaces(7). We don't currently do // this for child namespaces, though we should; we also don't do this for // the root namespace (the same restriction applies to global init on // Linux), where whether or not we should is much murkier. In practice, // most sandboxed applications are not prepared to function as an init // process. // Unmasked, ignored signals are discarded without being queued, unless // they will be visible to a tracer. Even for group signals, it's the // originally-targeted task's signal mask and tracer that matter; compare // Linux's kernel/signal.c:__send_signal() => prepare_signal() => // sig_ignored(). ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore if sigset := linux.SignalSetOf(sig); sigset&linux.SignalSet(t.signalMask.RacyLoad()) == 0 && sigset&t.realSignalMask == 0 && ignored && !t.hasTracer() { t.Debugf("Discarding ignored signal %d", sig) if timer != nil { timer.signalRejectedLocked() } return nil } q := &t.pendingSignals if group { q = &t.tg.pendingSignals } if !q.enqueue(info, timer) { if sig.IsRealtime() { return linuxerr.EAGAIN } t.Debugf("Discarding duplicate signal %d", sig) if timer != nil { timer.signalRejectedLocked() } return nil } // Find a receiver to notify. Note that the task we choose to notify, if // any, may not be the task that actually dequeues and handles the signal; // e.g. a racing signal mask change may cause the notified task to become // ineligible, or a racing sibling task may dequeue the signal first. if t.canReceiveSignalLocked(sig) { t.Debugf("Notified of signal %d", sig) t.interrupt() return nil } if group { if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { nt.Debugf("Notified of group signal %d", sig) nt.interrupt() return nil } } t.Debugf("No task notified of signal %d", sig) return nil } func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { switch { case linux.SignalSetOf(sig)&StopSignals != 0: // Stop signals cause all prior SIGCONT to be discarded. (This is // despite the fact this has little effect since SIGCONT's most // important effect is applied when the signal is sent in the branch // below, not when the signal is delivered.) tg.discardSpecificLocked(linux.SIGCONT) case sig == linux.SIGCONT: // "The SIGCONT signal has a side effect of waking up (all threads of) // a group-stopped process. This side effect happens before // signal-delivery-stop. The tracer can't suppress this side effect (it // can only suppress signal injection, which only causes the SIGCONT // handler to not be executed in the tracee, if such a handler is // installed." - ptrace(2) tg.endGroupStopLocked(true) case sig == linux.SIGKILL: // "SIGKILL does not generate signal-delivery-stop and therefore the // tracer can't suppress it. SIGKILL kills even within system calls // (syscall-exit-stop is not generated prior to death by SIGKILL)." - // ptrace(2) // // Note that this differs from ThreadGroup.requestExit in that it // ignores tg.execing. 
if !tg.exiting { tg.exiting = true tg.exitStatus = linux.WaitStatusTerminationSignal(linux.SIGKILL) } for t := tg.tasks.Front(); t != nil; t = t.Next() { t.killLocked() } } } // canReceiveSignalLocked returns true if t should be interrupted to receive // the given signal. canReceiveSignalLocked is analogous to Linux's // kernel/signal.c:wants_signal(), but see below for divergences. // // Preconditions: The signal mutex must be locked. func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { // Notify that the signal is queued. t.signalQueue.Notify(waiter.EventMask(linux.MakeSignalSet(sig))) // - Do not choose tasks that are blocking the signal. if linux.SignalSetOf(sig)&linux.SignalSet(t.signalMask.RacyLoad()) != 0 { return false } // - No need to check Task.exitState, as the exit path sets every bit in the // signal mask when it transitions from TaskExitNone to TaskExitInitiated. // - No special case for SIGKILL: SIGKILL already interrupted all tasks in the // task group via applySignalSideEffects => killLocked. // - Do not choose stopped tasks, which cannot handle signals. if t.stop != nil { return false } // - Do not choose tasks that have already been interrupted, as they may be // busy handling another signal. if len(t.interruptChan) != 0 { return false } return true } // findSignalReceiverLocked returns a task in tg that should be interrupted to // receive the given signal. If no such task exists, findSignalReceiverLocked // returns nil. // // Linux actually records curr_target to balance the group signal targets. // // Preconditions: The signal mutex must be locked. func (tg *ThreadGroup) findSignalReceiverLocked(sig linux.Signal) *Task { for t := tg.tasks.Front(); t != nil; t = t.Next() { if t.canReceiveSignalLocked(sig) { return t } } return nil } // forceSignal ensures that the task is not ignoring or blocking the given // signal. If unconditional is true, forceSignal takes action even if the // signal isn't being ignored or blocked. func (t *Task) forceSignal(sig linux.Signal, unconditional bool) { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.forceSignalLocked(sig, unconditional) } func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) { blocked := linux.SignalSetOf(sig)&linux.SignalSet(t.signalMask.RacyLoad()) != 0 act := t.tg.signalHandlers.actions[sig] ignored := act.Handler == linux.SIG_IGN if blocked || ignored || unconditional { act.Handler = linux.SIG_DFL t.tg.signalHandlers.actions[sig] = act if blocked { t.setSignalMaskLocked(linux.SignalSet(t.signalMask.RacyLoad()) &^ linux.SignalSetOf(sig)) } } } // SignalMask returns a copy of t's signal mask. func (t *Task) SignalMask() linux.SignalSet { return linux.SignalSet(t.signalMask.Load()) } // SetSignalMask sets t's signal mask. // // Preconditions: // - The caller must be running on the task goroutine. // - t.exitState < TaskExitZombie. func (t *Task) SetSignalMask(mask linux.SignalSet) { // By precondition, t prevents t.tg from completing an execve and mutating // t.tg.signalHandlers, so we can skip the TaskSet mutex. t.tg.signalHandlers.mu.Lock() t.setSignalMaskLocked(mask) t.tg.signalHandlers.mu.Unlock() } // Preconditions: The signal mutex must be locked. 
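// A sketch of typical use of SetSignalMask above (hypothetical caller running on
// the task goroutine). SIGKILL and SIGSTOP can never be blocked, so callers such
// as the sigprocmask path strip them before applying a new mask:
//
//	mask := t.SignalMask() | linux.SignalSetOf(linux.SIGUSR1)
//	t.SetSignalMask(mask &^ UnblockableSignals)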
func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { oldMask := linux.SignalSet(t.signalMask.RacyLoad()) t.signalMask.Store(uint64(mask)) // If the new mask blocks any signals that were not blocked by the old // mask, and at least one such signal is pending in tg.pendingSignals, and // t has been woken, it could be the case that t was woken to handle that // signal, but will no longer do so as a result of its new signal mask, so // we have to pick a replacement. blocked := mask &^ oldMask blockedGroupPending := blocked & t.tg.pendingSignals.pendingSet if blockedGroupPending != 0 && t.interrupted() { linux.ForEachSignal(blockedGroupPending, func(sig linux.Signal) { if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { nt.interrupt() return } }) } // Conversely, if the new mask unblocks any signals that were blocked by // the old mask, and at least one such signal is pending, we may now need // to handle that signal. unblocked := oldMask &^ mask unblockedPending := unblocked & (t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet) if unblockedPending != 0 { t.interruptSelf() } } // SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's // comment). // // Preconditions: The caller must be running on the task goroutine. func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { t.savedSignalMask = mask t.haveSavedSignalMask = true } // SignalStack returns the task-private signal stack. // // By precondition, a full state has to be pulled. func (t *Task) SignalStack() linux.SignalStack { alt := t.signalStack if t.onSignalStack(alt) { alt.Flags |= linux.SS_ONSTACK } return alt } // SigaltStack implements the sigaltstack syscall. func (t *Task) SigaltStack(setaddr hostarch.Addr, oldaddr hostarch.Addr) (*SyscallControl, error) { if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil { t.PrepareGroupExit(linux.WaitStatusTerminationSignal(linux.SIGILL)) return CtrlDoExit, linuxerr.EFAULT } alt := t.SignalStack() if oldaddr != 0 { if _, err := alt.CopyOut(t, oldaddr); err != nil { return nil, err } } if setaddr != 0 { if _, err := alt.CopyIn(t, setaddr); err != nil { return nil, err } // The signal stack cannot be changed if the task is currently // on the stack. This is enforced at the lowest level because // these semantics apply to changing the signal stack via a // ucontext during a signal handler. if !t.SetSignalStack(alt) { return nil, linuxerr.EPERM } } return nil, nil } // onSignalStack returns true if the task is executing on the given signal stack. func (t *Task) onSignalStack(alt linux.SignalStack) bool { sp := hostarch.Addr(t.Arch().Stack()) return alt.Contains(sp) } // SetSignalStack sets the task-private signal stack. // // This value may not be changed if the task is currently executing on the // signal stack, i.e. if t.onSignalStack returns true. In this case, this // function will return false. Otherwise, true is returned. func (t *Task) SetSignalStack(alt linux.SignalStack) bool { // Check that we're not executing on the stack. if t.onSignalStack(t.signalStack) { return false } if alt.Flags&linux.SS_DISABLE != 0 { // Don't record anything beyond the flags. t.signalStack = linux.SignalStack{ Flags: linux.SS_DISABLE, } } else { // Mask out irrelevant parts: only disable matters. alt.Flags &= linux.SS_DISABLE t.signalStack = alt } return true } // SetSigAction atomically sets the thread group's signal action for signal sig // to *actptr (if actptr is not nil) and returns the old signal action. 
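// For example (a sketch): passing a nil actptr just queries the current
// disposition, installing SIG_IGN for SIGCHLD also discards matching pending
// signals per the POSIX rule quoted in the body, and the dispositions of SIGKILL
// and SIGSTOP cannot be changed at all:
//
//	oldact, _ := tg.SetSigAction(linux.SIGUSR1, nil) // read-only query of the current action
//	ign := linux.SigAction{Handler: linux.SIG_IGN}
//	_, _ = tg.SetSigAction(linux.SIGCHLD, &ign)      // ignore SIGCHLD; also discards pending SIGCHLD
//	_, err := tg.SetSigAction(linux.SIGKILL, &ign)   // err == linuxerr.EINVAL
//	_, _ = oldact, err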
func (tg *ThreadGroup) SetSigAction(sig linux.Signal, actptr *linux.SigAction) (linux.SigAction, error) { if !sig.IsValid() { return linux.SigAction{}, linuxerr.EINVAL } tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() sh := tg.signalHandlers sh.mu.Lock() defer sh.mu.Unlock() oldact := sh.actions[sig] if actptr != nil { if sig == linux.SIGKILL || sig == linux.SIGSTOP { return oldact, linuxerr.EINVAL } act := *actptr act.Mask &^= UnblockableSignals sh.actions[sig] = act // From POSIX, by way of Linux: // // "Setting a signal action to SIG_IGN for a signal that is pending // shall cause the pending signal to be discarded, whether or not it is // blocked." // // "Setting a signal action to SIG_DFL for a signal that is pending and // whose default action is to ignore the signal (for example, SIGCHLD), // shall cause the pending signal to be discarded, whether or not it is // blocked." if computeAction(sig, act) == SignalActionIgnore { tg.discardSpecificLocked(sig) } } return oldact, nil } // groupStop is a TaskStop placed on tasks that have received a stop signal // (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from // the ptrace man page.) // // +stateify savable type groupStop struct{} // Killable implements TaskStop.Killable. func (*groupStop) Killable() bool { return true } // initiateGroupStop attempts to initiate a group stop based on a // previously-dequeued stop signal. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) initiateGroupStop(info *linux.SignalInfo) { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() if t.groupStopPending { t.Debugf("Signal %d: not stopping thread group: lost to racing stop signal", info.Signo) return } if !t.tg.groupStopDequeued { t.Debugf("Signal %d: not stopping thread group: lost to racing SIGCONT", info.Signo) return } if t.tg.exiting { t.Debugf("Signal %d: not stopping thread group: lost to racing group exit", info.Signo) return } if t.tg.execing != nil { t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo) return } if !t.tg.groupStopComplete { t.tg.groupStopSignal = linux.Signal(info.Signo) } t.tg.groupStopPendingCount = 0 for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() { if t2.killedLocked() || t2.exitState >= TaskExitInitiated { t2.groupStopPending = false continue } t2.groupStopPending = true t2.groupStopAcknowledged = false if t2.ptraceSeized { t2.trapNotifyPending = true if s, ok := t2.stop.(*ptraceStop); ok && s.listen { t2.endInternalStopLocked() } } t2.interrupt() t.tg.groupStopPendingCount++ } t.Debugf("Signal %d: stopping %d threads in thread group", info.Signo, t.tg.groupStopPendingCount) } // endGroupStopLocked ensures that all prior stop signals received by tg are // not stopping tg and will not stop tg in the future. If broadcast is true, // parent and tracer notification will be scheduled if appropriate. // // Preconditions: The signal mutex must be locked. func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) { // Discard all previously-queued stop signals. 
linux.ForEachSignal(StopSignals, tg.discardSpecificLocked) if tg.groupStopPendingCount == 0 && !tg.groupStopComplete { return } completeStr := "incomplete" if tg.groupStopComplete { completeStr = "complete" } tg.leader.Debugf("Ending %s group stop with %d threads pending", completeStr, tg.groupStopPendingCount) for t := tg.tasks.Front(); t != nil; t = t.Next() { t.groupStopPending = false if t.ptraceSeized { t.trapNotifyPending = true if s, ok := t.stop.(*ptraceStop); ok && s.listen { t.endInternalStopLocked() } } else { if _, ok := t.stop.(*groupStop); ok { t.endInternalStopLocked() } } } if broadcast { // Instead of notifying the parent here, set groupContNotify so that // one of the continuing tasks does so. (Linux does something similar.) // The reason we do this is to keep locking sane. In order to send a // signal to the parent, we need to lock its signal mutex, but we're // already holding tg's signal mutex, and the TaskSet mutex must be // locked for writing for us to hold two signal mutexes. Since we don't // want to require this for endGroupStopLocked (which is called from // signal-sending paths), nor do we want to lose atomicity by releasing // the mutexes we're already holding, just let the continuing thread // group deal with it. tg.groupContNotify = true tg.groupContInterrupted = !tg.groupStopComplete tg.groupContWaitable = true } // Unsetting groupStopDequeued will cause racing calls to initiateGroupStop // to recognize that the group stop has been cancelled. tg.groupStopDequeued = false tg.groupStopSignal = 0 tg.groupStopPendingCount = 0 tg.groupStopComplete = false tg.groupStopWaitable = false } // participateGroupStopLocked is called to handle thread group side effects // after t unsets t.groupStopPending. The caller must handle task side effects // (e.g. placing the task goroutine into the group stop). It returns true if // the caller must notify t.tg.leader's parent of a completed group stop (which // participateGroupStopLocked cannot do due to holding the wrong locks). // // Preconditions: The signal mutex must be locked. func (t *Task) participateGroupStopLocked() bool { if t.groupStopAcknowledged { return false } t.groupStopAcknowledged = true t.tg.groupStopPendingCount-- if t.tg.groupStopPendingCount != 0 { return false } if t.tg.groupStopComplete { return false } t.Debugf("Completing group stop") t.tg.groupStopComplete = true t.tg.groupStopWaitable = true t.tg.groupContNotify = false t.tg.groupContWaitable = false return true } // signalStop sends a signal to t's thread group of a new group stop, group // continue, or ptrace stop, if appropriate. code and status are set in the // signal sent to tg, if any. // // Preconditions: The TaskSet mutex must be locked (for reading or writing). func (t *Task) signalStop(target *Task, code int32, status int32) { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() act, ok := t.tg.signalHandlers.actions[linux.SIGCHLD] if !ok || (act.Handler != linux.SIG_IGN && act.Flags&linux.SA_NOCLDSTOP == 0) { sigchld := &linux.SignalInfo{ Signo: int32(linux.SIGCHLD), Code: code, } sigchld.SetPID(int32(t.tg.pidns.tids[target])) sigchld.SetUID(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) sigchld.SetStatus(status) // TODO(b/72102453): Set utime, stime. t.sendSignalLocked(sigchld, true /* group */) } } // The runInterrupt state handles conditions indicated by interrupts. 
// // +stateify savable type runInterrupt struct{} func (*runInterrupt) execute(t *Task) taskRunState { // Interrupts are de-duplicated (t.unsetInterrupted() will undo the effect // of all previous calls to t.interrupted() regardless of how many such // calls there have been), so early exits from this function must re-enter // the runInterrupt state to check for more interrupt-signaled conditions. t.tg.signalHandlers.mu.Lock() // Did we just leave a group stop? if t.tg.groupContNotify { t.tg.groupContNotify = false sig := t.tg.groupStopSignal intr := t.tg.groupContInterrupted t.tg.signalHandlers.mu.Unlock() t.tg.pidns.owner.mu.RLock() // For consistency with Linux, if the parent and (thread group // leader's) tracer are in the same thread group, deduplicate // notifications. notifyParent := t.tg.leader.parent != nil if tracer := t.tg.leader.Tracer(); tracer != nil { if notifyParent && tracer.tg == t.tg.leader.parent.tg { notifyParent = false } // Sending CLD_STOPPED to the tracer doesn't really make any sense; // the thread group leader may have already entered the stop and // notified its tracer accordingly. But it's consistent with // Linux... if intr { tracer.signalStop(t.tg.leader, linux.CLD_STOPPED, int32(sig)) if !notifyParent { tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop) } else { tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop) } } else { tracer.signalStop(t.tg.leader, linux.CLD_CONTINUED, int32(sig)) tracer.tg.eventQueue.Notify(EventGroupContinue) } } if notifyParent { // If groupContInterrupted, do as Linux does and pretend the group // stop completed just before it ended. The theoretical behavior in // this case would be to send a SIGCHLD indicating the completed // stop, followed by a SIGCHLD indicating the continue. However, // SIGCHLD is a standard signal, so the latter would always be // dropped. Hence sending only the former is equivalent. if intr { t.tg.leader.parent.signalStop(t.tg.leader, linux.CLD_STOPPED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop) } else { t.tg.leader.parent.signalStop(t.tg.leader, linux.CLD_CONTINUED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue) } } t.tg.pidns.owner.mu.RUnlock() return (*runInterrupt)(nil) } // Do we need to enter a group stop or related ptrace stop? This path is // analogous to Linux's kernel/signal.c:get_signal() => do_signal_stop() // (with ptrace enabled) and do_jobctl_trap(). if t.groupStopPending || t.trapStopPending || t.trapNotifyPending { sig := t.tg.groupStopSignal notifyParent := false if t.groupStopPending { t.groupStopPending = false // We care about t.tg.groupStopSignal (for tracer notification) // even if this doesn't complete a group stop, so keep the // value of sig we've already read. notifyParent = t.participateGroupStopLocked() } t.trapStopPending = false t.trapNotifyPending = false // Drop the signal mutex so we can take the TaskSet mutex. t.tg.signalHandlers.mu.Unlock() t.tg.pidns.owner.mu.RLock() if t.tg.leader.parent == nil { notifyParent = false } if tracer := t.Tracer(); tracer != nil { if t.ptraceSeized { if sig == 0 { sig = linux.SIGTRAP } // "If tracee was attached using PTRACE_SEIZE, group-stop is // indicated by PTRACE_EVENT_STOP: status>>16 == // PTRACE_EVENT_STOP. This allows detection of group-stops // without requiring an extra PTRACE_GETSIGINFO call." 
- // "Group-stop", ptrace(2) t.ptraceCode = int32(sig) | linux.PTRACE_EVENT_STOP<<8 t.ptraceSiginfo = &linux.SignalInfo{ Signo: int32(sig), Code: t.ptraceCode, } t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t])) t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) } else { t.ptraceCode = int32(sig) t.ptraceSiginfo = nil } if t.beginPtraceStopLocked() { tracer.signalStop(t, linux.CLD_STOPPED, int32(sig)) // For consistency with Linux, if the parent and tracer are in the // same thread group, deduplicate notification signals. if notifyParent && tracer.tg == t.tg.leader.parent.tg { notifyParent = false tracer.tg.eventQueue.Notify(EventChildGroupStop | EventTraceeStop) } else { tracer.tg.eventQueue.Notify(EventTraceeStop) } } } else { t.tg.signalHandlers.mu.Lock() if !t.killedLocked() { t.beginInternalStopLocked((*groupStop)(nil)) } t.tg.signalHandlers.mu.Unlock() } if notifyParent { t.tg.leader.parent.signalStop(t.tg.leader, linux.CLD_STOPPED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) } t.tg.pidns.owner.mu.RUnlock() return (*runInterrupt)(nil) } // Are there signals pending? if info := t.dequeueSignalLocked(linux.SignalSet(t.signalMask.RacyLoad())); info != nil { if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil { t.PrepareGroupExit(linux.WaitStatusTerminationSignal(linux.SIGILL)) return (*runExit)(nil) } if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 { // Indicate that we've dequeued a stop signal before unlocking the // signal mutex; initiateGroupStop will check for races with // endGroupStopLocked after relocking it. t.tg.groupStopDequeued = true } if t.ptraceSignalLocked(info) { // Dequeueing the signal action must wait until after the // signal-delivery-stop ends since the tracer can change or // suppress the signal. t.tg.signalHandlers.mu.Unlock() return (*runInterruptAfterSignalDeliveryStop)(nil) } act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) t.tg.signalHandlers.mu.Unlock() return t.deliverSignal(info, act) } t.unsetInterrupted() t.tg.signalHandlers.mu.Unlock() return (*runApp)(nil) } // +stateify savable type runInterruptAfterSignalDeliveryStop struct{} func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Lock() // Can't defer unlock: deliverSignal must be called without holding TaskSet // mutex. sig := linux.Signal(t.ptraceCode) defer func() { t.ptraceSiginfo = nil }() if !sig.IsValid() { t.tg.pidns.owner.mu.Unlock() return (*runInterrupt)(nil) } info := t.ptraceSiginfo if sig != linux.Signal(info.Signo) { info.Signo = int32(sig) info.Errno = 0 info.Code = linux.SI_USER // pid isn't a valid field for all signal numbers, but Linux // doesn't care (kernel/signal.c:ptrace_signal()). // // Linux uses t->parent for the tid and uid here, which is the tracer // if it hasn't detached or the real parent otherwise. parent := t.parent if tracer := t.Tracer(); tracer != nil { parent = tracer } if parent == nil { // Tracer has detached and t was created by Kernel.CreateProcess(). // Pretend the parent is in an ancestor PID + user namespace. info.SetPID(0) info.SetUID(int32(auth.OverflowUID)) } else { info.SetPID(int32(t.tg.pidns.tids[parent])) info.SetUID(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) } } t.tg.signalHandlers.mu.Lock() t.tg.pidns.owner.mu.Unlock() // If the signal is masked, re-queue it. 
if linux.SignalSetOf(sig)&linux.SignalSet(t.signalMask.RacyLoad()) != 0 { t.sendSignalLocked(info, false /* group */) t.tg.signalHandlers.mu.Unlock() return (*runInterrupt)(nil) } act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) t.tg.signalHandlers.mu.Unlock() return t.deliverSignal(info, act) } // SignalRegister registers a waiter for pending signals. func (t *Task) SignalRegister(e *waiter.Entry) { t.tg.signalHandlers.mu.Lock() t.signalQueue.EventRegister(e) t.tg.signalHandlers.mu.Unlock() } // SignalUnregister unregisters a waiter for pending signals. func (t *Task) SignalUnregister(e *waiter.Entry) { t.tg.signalHandlers.mu.Lock() t.signalQueue.EventUnregister(e) t.tg.signalHandlers.mu.Unlock() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_start.go000066400000000000000000000267671465435605700247600ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // TaskConfig defines the configuration of a new Task (see below). type TaskConfig struct { // Kernel is the owning Kernel. Kernel *Kernel // Parent is the new task's parent. Parent may be nil. Parent *Task // If InheritParent is not nil, use InheritParent's parent as the new // task's parent. InheritParent *Task // ThreadGroup is the ThreadGroup the new task belongs to. ThreadGroup *ThreadGroup // SignalMask is the new task's initial signal mask. SignalMask linux.SignalSet // TaskImage is the TaskImage of the new task. Ownership of the // TaskImage is transferred to TaskSet.NewTask, whether or not it // succeeds. TaskImage *TaskImage // FSContext is the FSContext of the new task. A reference must be held on // FSContext, which is transferred to TaskSet.NewTask whether or not it // succeeds. FSContext *FSContext // FDTable is the FDTableof the new task. A reference must be held on // FDMap, which is transferred to TaskSet.NewTask whether or not it // succeeds. FDTable *FDTable // Credentials is the Credentials of the new task. Credentials *auth.Credentials // Niceness is the niceness of the new task. Niceness int // NetworkNamespace is the network namespace to be used for the new task. NetworkNamespace *inet.Namespace // AllowedCPUMask contains the cpus that this task can run on. AllowedCPUMask sched.CPUSet // UTSNamespace is the UTSNamespace of the new task. UTSNamespace *UTSNamespace // IPCNamespace is the IPCNamespace of the new task. IPCNamespace *IPCNamespace // MountNamespace is the MountNamespace of the new task. 
MountNamespace *vfs.MountNamespace // RSeqAddr is a pointer to the userspace linux.RSeq structure. RSeqAddr hostarch.Addr // RSeqSignature is the signature that the rseq abort IP must be signed // with. RSeqSignature uint32 // ContainerID is the container the new task belongs to. ContainerID string // InitialCgroups are the cgroups the container is initialised to. InitialCgroups map[Cgroup]struct{} // UserCounters is user resource counters. UserCounters *UserCounters // SessionKeyring is the session keyring associated with the parent task. // It may be nil. SessionKeyring *auth.Key Origin TaskOrigin } // NewTask creates a new task defined by cfg. // // NewTask does not start the returned task; the caller must call Task.Start. // // If successful, NewTask transfers references held by cfg to the new task. // Otherwise, NewTask releases them. func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { var err error cleanup := func() { cfg.TaskImage.release(ctx) cfg.FSContext.DecRef(ctx) cfg.FDTable.DecRef(ctx) cfg.UTSNamespace.DecRef(ctx) cfg.IPCNamespace.DecRef(ctx) cfg.NetworkNamespace.DecRef(ctx) if cfg.MountNamespace != nil { cfg.MountNamespace.DecRef(ctx) } } if err := cfg.UserCounters.incRLimitNProc(ctx); err != nil { cleanup() return nil, err } t, err := ts.newTask(ctx, cfg) if err != nil { cfg.UserCounters.decRLimitNProc() cleanup() return nil, err } return t, nil } // newTask is a helper for TaskSet.NewTask that only takes ownership of parts // of cfg if it succeeds. func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { srcT := TaskFromContext(ctx) tg := cfg.ThreadGroup image := cfg.TaskImage t := &Task{ taskNode: taskNode{ tg: tg, parent: cfg.Parent, children: make(map[*Task]struct{}), }, runState: (*runApp)(nil), interruptChan: make(chan struct{}, 1), signalMask: atomicbitops.FromUint64(uint64(cfg.SignalMask)), signalStack: linux.SignalStack{Flags: linux.SS_DISABLE}, image: *image, fsContext: cfg.FSContext, fdTable: cfg.FDTable, k: cfg.Kernel, ptraceTracees: make(map[*Task]struct{}), allowedCPUMask: cfg.AllowedCPUMask.Copy(), ioUsage: &usage.IO{}, niceness: cfg.Niceness, utsns: cfg.UTSNamespace, ipcns: cfg.IPCNamespace, mountNamespace: cfg.MountNamespace, rseqCPU: -1, rseqAddr: cfg.RSeqAddr, rseqSignature: cfg.RSeqSignature, futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, cgroups: make(map[Cgroup]struct{}), userCounters: cfg.UserCounters, sessionKeyring: cfg.SessionKeyring, Origin: cfg.Origin, } t.netns = cfg.NetworkNamespace t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu // We don't construct t.blockingTimer until Task.run(); see that function // for justification. var ( cg Cgroup charged, committed bool ) // Reserve cgroup PIDs controller charge. This is either committed when the // new task enters the cgroup below, or rolled back on failure. // // We may also get here from a non-task context (for example, when // creating the init task, or from the exec control command). In these cases // we skip charging the pids controller, as non-userspace task creation // bypasses pid limits. if srcT != nil { var err error if charged, cg, err = srcT.ChargeFor(t, CgroupControllerPIDs, CgroupResourcePID, 1); err != nil { return nil, err } if charged { defer func() { if !committed { if err := cg.Charge(t, cg.Dentry, CgroupControllerPIDs, CgroupResourcePID, -1); err != nil { panic(fmt.Sprintf("Failed to clean up PIDs charge on task creation failure: %v", err)) } } // Ref from ChargeFor. 
Note that we need to drop this outside of // TaskSet.mu critical sections. cg.DecRef(ctx) }() } } // Make the new task (and possibly thread group) visible to the rest of // the system atomically. ts.mu.Lock() defer ts.mu.Unlock() tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() if tg.exiting || tg.execing != nil { // If the caller is in the same thread group, then what we return // doesn't matter too much since the caller will exit before it returns // to userspace. If the caller isn't in the same thread group, then // we're in uncharted territory and can return whatever we want. return nil, linuxerr.EINTR } if err := ts.assignTIDsLocked(t); err != nil { return nil, err } // Below this point, newTask is expected not to fail (there is no rollback // of assignTIDsLocked or any of the following). // Logging on t's behalf will panic if t.logPrefix hasn't been // initialized. This is the earliest point at which we can do so // (since t now has thread IDs). t.updateInfoLocked() if cfg.InheritParent != nil { t.parent = cfg.InheritParent.parent } if t.parent != nil { t.parent.children[t] = struct{}{} } // If InitialCgroups is not nil, the new task will be placed in the // specified cgroups. Otherwise, if srcT is not nil, the new task will // be placed in the srcT's cgroups. If neither is specified, the new task // will be in the root cgroups. t.EnterInitialCgroups(srcT, cfg.InitialCgroups) committed = true if tg.leader == nil { // New thread group. tg.leader = t if parentPG := tg.parentPG(); parentPG == nil { tg.createSession() } else { // Inherit the process group and terminal. parentPG.incRefWithParent(parentPG) tg.processGroup = parentPG tg.tty = t.parent.tg.tty } // If our parent is a child subreaper, or if it has a child // subreaper, then this new thread group does as well. if t.parent != nil { tg.hasChildSubreaper = t.parent.tg.isChildSubreaper || t.parent.tg.hasChildSubreaper } } tg.tasks.PushBack(t) tg.tasksCount++ tg.liveTasks++ tg.activeTasks++ // Propagate external TaskSet stops to the new task. t.stopCount = atomicbitops.FromInt32(ts.stopCount) t.mu.Lock() defer t.mu.Unlock() t.cpu = atomicbitops.FromInt32(assignCPU(t.allowedCPUMask, ts.Root.tids[t])) t.startTime = t.k.RealtimeClock().Now() // As a final step, initialize the platform context. This may require // other pieces to be initialized as the task is used the context. t.p = cfg.Kernel.Platform.NewContext(t.AsyncContext()) return t, nil } // assignTIDsLocked ensures that new task t is visible in all PID namespaces in // which it should be visible. // // Preconditions: ts.mu must be locked for writing. func (ts *TaskSet) assignTIDsLocked(t *Task) error { type allocatedTID struct { ns *PIDNamespace tid ThreadID } var allocatedTIDs []allocatedTID var tid ThreadID var err error for ns := t.tg.pidns; ns != nil; ns = ns.parent { if tid, err = ns.allocateTID(); err != nil { break } if err = ns.addTask(t, tid); err != nil { break } allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) } if err != nil { // Failure. Remove the tids we already allocated in descendant // namespaces. for _, a := range allocatedTIDs { a.ns.deleteTask(t) } return err } t.tg.pidWithinNS.Store(int32(t.tg.pidns.tgids[t.tg])) return nil } // allocateTID returns an unused ThreadID from ns. // // Preconditions: ns.owner.mu must be locked for writing. 
func (ns *PIDNamespace) allocateTID() (ThreadID, error) { if ns.exiting { // "In this case, a subsequent fork(2) into this PID namespace will // fail with the error ENOMEM; it is not possible to create a new // processes [sic] in a PID namespace whose init process has // terminated." - pid_namespaces(7) return 0, linuxerr.ENOMEM } tid := ns.last for { // Next. tid++ if tid > TasksLimit { tid = initTID + 1 } // Is it available? tidInUse := func() bool { if _, ok := ns.tasks[tid]; ok { return true } if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok { return true } if _, ok := ns.sessions[SessionID(tid)]; ok { return true } return false }() if !tidInUse { ns.last = tid return tid, nil } // Did we do a full cycle? if tid == ns.last { // No tid available. return 0, linuxerr.EAGAIN } } } // Start starts the task goroutine. Start must be called exactly once for each // task returned by NewTask. // // 'tid' must be the task's TID in the root PID namespace and it's used for // debugging purposes only (set as parameter to Task.run to make it visible // in stack dumps). func (t *Task) Start(tid ThreadID) { // If the task was restored, it may be "starting" after having already exited. if t.runState == nil { return } t.goroutineStopped.Add(1) t.tg.liveGoroutines.Add(1) t.tg.pidns.owner.liveGoroutines.Add(1) t.tg.pidns.owner.runningGoroutines.Add(1) // Task is now running in system mode. t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) // Use the task's TID in the root PID namespace to make it visible in stack dumps. go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_stop.go000066400000000000000000000214551465435605700245750ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel // This file implements task stops, which represent the equivalent of Linux's // uninterruptible sleep states in a way that is compatible with save/restore. // Task stops comprise both internal stops (which form part of the task's // "normal" control flow) and external stops (which do not); see README.md for // details. // // There are multiple interfaces for interacting with stops because there are // multiple cases to consider: // // - A task goroutine can begin a stop on its associated task (e.g. a // vfork() syscall stopping the calling task until the child task releases its // MM). In this case, calling Task.interrupt is both unnecessary (the task // goroutine obviously cannot be blocked in Task.block or executing application // code) and undesirable (as it may spuriously interrupt a in-progress // syscall). // // Beginning internal stops in this case is implemented by // Task.beginInternalStop / Task.beginInternalStopLocked. As of this writing, // there are no instances of this case that begin external stops, except for // autosave; however, autosave terminates the sentry without ending the // external stop, so the spurious interrupt is moot. 
// // - An arbitrary goroutine can begin a stop on an unrelated task (e.g. all // tasks being stopped in preparation for state checkpointing). If the task // goroutine may be in Task.block or executing application code, it must be // interrupted by Task.interrupt for it to actually enter the stop; since, // strictly speaking, we have no way of determining this, we call // Task.interrupt unconditionally. // // Beginning external stops in this case is implemented by // Task.BeginExternalStop. As of this writing, there are no instances of this // case that begin internal stops. // // - An arbitrary goroutine can end a stop on an unrelated task (e.g. an // exiting task resuming a sibling task that has been blocked in an execve() // syscall waiting for other tasks to exit). In this case, Task.endStopCond // must be notified to kick the task goroutine out of Task.doStop. // // Ending internal stops in this case is implemented by // Task.endInternalStopLocked. Ending external stops in this case is // implemented by Task.EndExternalStop. // // - Hypothetically, a task goroutine can end an internal stop on its // associated task. As of this writing, there are no instances of this case. // However, any instances of this case could still use the above functions, // since notifying Task.endStopCond would be unnecessary but harmless. import ( "fmt" ) // A TaskStop is a condition visible to the task control flow graph that // prevents a task goroutine from running or exiting, i.e. an internal stop. // // NOTE(b/30793614): Most TaskStops don't contain any data; they're // distinguished by their type. The obvious way to implement such a TaskStop // is: // // type groupStop struct{} // func (groupStop) Killable() bool { return true } // ... // t.beginInternalStop(groupStop{}) // // However, this doesn't work because the state package can't serialize values, // only pointers. Furthermore, the correctness of save/restore depends on the // ability to pass a TaskStop to endInternalStop that will compare equal to the // TaskStop that was passed to beginInternalStop, even if a save/restore cycle // occurred between the two. As a result, the current idiom is to always use a // typecast nil for data-free TaskStops: // // type groupStop struct{} // func (*groupStop) Killable() bool { return true } // ... // t.beginInternalStop((*groupStop)(nil)) // // This is pretty gross, but the alternatives seem grosser. type TaskStop interface { // Killable returns true if Task.Kill should end the stop prematurely. // Killable is analogous to Linux's TASK_WAKEKILL. Killable() bool } // beginInternalStop indicates the start of an internal stop that applies to t. // // Preconditions: // - The caller must be running on the task goroutine. // - The task must not already be in an internal stop (i.e. t.stop == nil). func (t *Task) beginInternalStop(s TaskStop) { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.beginInternalStopLocked(s) } // Preconditions: Same as beginInternalStop, plus: // - The signal mutex must be locked. func (t *Task) beginInternalStopLocked(s TaskStop) { if t.stop != nil { panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop)) } t.Debugf("Entering internal stop %#v", s) t.stop = s t.beginStopLocked() } // endInternalStopLocked indicates the end of an internal stop that applies to // t. endInternalStopLocked does not wait for the task to resume. 
// // The caller is responsible for ensuring that the internal stop they expect // actually applies to t; this requires holding the signal mutex which protects // t.stop, which is why there is no endInternalStop that locks the signal mutex // for you. // // Preconditions: // - The signal mutex must be locked. // - The task must be in an internal stop (i.e. t.stop != nil). func (t *Task) endInternalStopLocked() { if t.stop == nil { panic("Attempting to leave non-existent internal stop") } t.Debugf("Leaving internal stop %#v", t.stop) t.stop = nil t.endStopLocked() } // BeginExternalStop indicates the start of an external stop that applies to t. // BeginExternalStop does not wait for t's task goroutine to stop. func (t *Task) BeginExternalStop() { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.beginStopLocked() t.interrupt() } // EndExternalStop indicates the end of an external stop started by a previous // call to Task.BeginExternalStop. EndExternalStop does not wait for t's task // goroutine to resume. func (t *Task) EndExternalStop() { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.endStopLocked() } // beginStopLocked increments t.stopCount to indicate that a new internal or // external stop applies to t. // // Preconditions: The signal mutex must be locked. func (t *Task) beginStopLocked() { if newval := t.stopCount.Add(1); newval <= 0 { // Most likely overflow. panic(fmt.Sprintf("Invalid stopCount: %d", newval)) } } // endStopLocked decrements t.stopCount to indicate that an existing internal // or external stop no longer applies to t. // // Preconditions: The signal mutex must be locked. func (t *Task) endStopLocked() { if newval := t.stopCount.Add(-1); newval < 0 { panic(fmt.Sprintf("Invalid stopCount: %d", newval)) } else if newval == 0 { t.endStopCond.Signal() } } // BeginExternalStop indicates the start of an external stop that applies to // all current and future tasks in ts. BeginExternalStop does not wait for // task goroutines to stop. func (ts *TaskSet) BeginExternalStop() { ts.mu.Lock() defer ts.mu.Unlock() ts.stopCount++ if ts.stopCount <= 0 { panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) } if ts.Root == nil { return } for t := range ts.Root.tids { t.tg.signalHandlers.mu.Lock() t.beginStopLocked() t.tg.signalHandlers.mu.Unlock() t.interrupt() } } // PullFullState receives full states for all tasks. func (ts *TaskSet) PullFullState() { ts.mu.Lock() defer ts.mu.Unlock() if ts.Root == nil { return } for t := range ts.Root.tids { t.Activate() if mm := t.MemoryManager(); mm != nil { t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) } t.Deactivate() } } // EndExternalStop indicates the end of an external stop started by a previous // call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task // goroutines to resume. func (ts *TaskSet) EndExternalStop() { ts.mu.Lock() defer ts.mu.Unlock() ts.stopCount-- if ts.stopCount < 0 { panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) } if ts.Root == nil { return } for t := range ts.Root.tids { t.tg.signalHandlers.mu.Lock() t.endStopLocked() t.tg.signalHandlers.mu.Unlock() } } // isExternallyStopped returns true if BeginExternalStop() has been called on // this TaskSet, without a corresponding call to EndExternalStop(). 
func (ts *TaskSet) isExternallyStopped() bool { ts.mu.Lock() defer ts.mu.Unlock() return ts.stopCount > 0 } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_syscall.go000066400000000000000000000403131465435605700252540ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "os" "runtime/trace" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/errors" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/seccheck" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" ) // SyscallRestartBlock represents the restart block for a syscall restartable // with a custom function. It encapsulates the state required to restart a // syscall across a S/R. type SyscallRestartBlock interface { Restart(t *Task) (uintptr, error) } // SyscallControl is returned by syscalls to control the behavior of // Task.doSyscallInvoke. type SyscallControl struct { // next is the state that the task goroutine should switch to. If next is // nil, the task goroutine should continue to syscall exit as usual. next taskRunState // If ignoreReturn is true, Task.doSyscallInvoke should not store any value // in the task's syscall return value register. ignoreReturn bool } var ( // CtrlDoExit is returned by the implementations of the exit and exit_group // syscalls to enter the task exit path directly, skipping syscall exit // tracing. CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true} // ctrlStopAndReinvokeSyscall is returned by syscalls using the external // feature before syscall execution. This causes Task.doSyscallInvoke // to return runSyscallReinvoke, allowing Task.run to check for stops // before immediately re-invoking the syscall (skipping the re-checking // of seccomp filters and ptrace which would confuse userspace // tracing). ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true} // ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at // their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather // than tail-calling it, allowing stops to be checked before syscall exit. ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)} ) func (t *Task) invokeExternal() { t.BeginExternalStop() go func() { // S/R-SAFE: External control flow. 
defer t.EndExternalStop() t.SyscallTable().External(t.Kernel()) }() } func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) { s := t.SyscallTable() fe := s.FeatureEnable.Word(sysno) var straceContext any if bits.IsAnyOn32(fe, StraceEnableBits) { straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe) } if bits.IsAnyOn32(fe, SecCheckRawEnter) { info := pb.Syscall{ Sysno: uint64(sysno), Arg1: args[0].Uint64(), Arg2: args[1].Uint64(), Arg3: args[2].Uint64(), Arg4: args[3].Uint64(), Arg5: args[4].Uint64(), Arg6: args[5].Uint64(), } fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallRawEnter, sysno)) if !fields.Context.Empty() { info.ContextData = &pb.ContextData{} LoadSeccheckData(t, fields.Context, info.ContextData) } seccheck.Global.SentToSinks(func(c seccheck.Sink) error { return c.RawSyscall(t, fields, &info) }) } if bits.IsAnyOn32(fe, SecCheckEnter) { fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallEnter, sysno)) var ctxData *pb.ContextData if !fields.Context.Empty() { ctxData = &pb.ContextData{} LoadSeccheckData(t, fields.Context, ctxData) } info := SyscallInfo{ Sysno: sysno, Args: args, } cb := s.LookupSyscallToProto(sysno) msg, msgType := cb(t, fields, ctxData, info) seccheck.Global.SentToSinks(func(c seccheck.Sink) error { return c.Syscall(t, fields, ctxData, msgType, msg) }) } if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) { t.invokeExternal() // Ensure we check for stops, then invoke the syscall again. ctrl = ctrlStopAndReinvokeSyscall } else { fn := s.Lookup(sysno) var region *trace.Region // Only non-nil if tracing == true. if trace.IsEnabled() { region = trace.StartRegion(t.traceContext, s.LookupName(sysno)) } if fn != nil { // Call our syscall implementation. rval, ctrl, err = fn(t, sysno, args) } else { // Use the missing function if not found. rval, err = t.SyscallTable().Missing(t, sysno, args) } if region != nil { region.End() } } if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) { t.invokeExternal() // Don't reinvoke the unix. 
} if bits.IsAnyOn32(fe, StraceEnableBits) { s.Stracer.SyscallExit(straceContext, t, sysno, rval, err) } if bits.IsAnyOn32(fe, SecCheckRawExit) { info := pb.Syscall{ Sysno: uint64(sysno), Arg1: args[0].Uint64(), Arg2: args[1].Uint64(), Arg3: args[2].Uint64(), Arg4: args[3].Uint64(), Arg5: args[4].Uint64(), Arg6: args[5].Uint64(), Exit: &pb.Exit{ Result: int64(rval), Errorno: int64(ExtractErrno(err, int(sysno))), }, } fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallRawEnter, sysno)) if !fields.Context.Empty() { info.ContextData = &pb.ContextData{} LoadSeccheckData(t, fields.Context, info.ContextData) } seccheck.Global.SentToSinks(func(c seccheck.Sink) error { return c.RawSyscall(t, fields, &info) }) } if bits.IsAnyOn32(fe, SecCheckExit) { fields := seccheck.Global.GetFieldSet(seccheck.GetPointForSyscall(seccheck.SyscallExit, sysno)) var ctxData *pb.ContextData if !fields.Context.Empty() { ctxData = &pb.ContextData{} LoadSeccheckData(t, fields.Context, ctxData) } info := SyscallInfo{ Exit: true, Sysno: sysno, Args: args, Rval: rval, Errno: ExtractErrno(err, int(sysno)), } cb := s.LookupSyscallToProto(sysno) msg, msgType := cb(t, fields, ctxData, info) seccheck.Global.SentToSinks(func(c seccheck.Sink) error { return c.Syscall(t, fields, ctxData, msgType, msg) }) } return } // doSyscall is the entry point for an invocation of a system call specified by // the current state of t's registers. // // The syscall path is very hot; avoid defer. func (t *Task) doSyscall() taskRunState { // Save value of the register which is clobbered in the following // t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64. // // On x86, register rax was shared by syscall number and return // value, and at the entry of the syscall handler, the rax was // saved to regs.orig_rax which was exposed to userspace. // But on arm64, syscall number was passed through X8, and the X0 // was shared by the first syscall argument and return value. The // X0 was saved to regs.orig_x0 which was not exposed to userspace. // So we have to do the same operation here to save the X0 value // into the task context. t.Arch().SyscallSaveOrig() sysno := t.Arch().SyscallNo() args := t.Arch().SyscallArgs() // Tracers expect to see this between when the task traps into the kernel // to perform a syscall and when the syscall is actually invoked. // This useless-looking temporary is needed because Go. tmp := uintptr(unix.ENOSYS) t.Arch().SetReturn(-tmp) // Check seccomp filters. The nil check is for performance (as seccomp use // is rare), not needed for correctness. if t.seccomp.Load() != nil { switch r := t.checkSeccompSyscall(int32(sysno), args, hostarch.Addr(t.Arch().IP())); r { case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: t.Debugf("Syscall %d: denied by seccomp", sysno) return (*runSyscallExit)(nil) case linux.SECCOMP_RET_ALLOW: // ok case linux.SECCOMP_RET_KILL_THREAD: t.Debugf("Syscall %d: killed by seccomp", sysno) t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS)) return (*runExit)(nil) case linux.SECCOMP_RET_TRACE: t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno) return (*runSyscallAfterPtraceEventSeccomp)(nil) default: panic(fmt.Sprintf("Unknown seccomp result %d", r)) } } syscallCounter.Increment() return t.doSyscallEnter(sysno, args) } type runSyscallAfterPtraceEventSeccomp struct{} func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { if t.killed() { // "[S]yscall-exit-stop is not generated prior to death by SIGKILL." 
- // ptrace(2) return (*runInterrupt)(nil) } sysno := t.Arch().SyscallNo() // "The tracer can skip the system call by changing the syscall number to // -1." - Documentation/prctl/seccomp_filter.txt if sysno == ^uintptr(0) { return (*runSyscallExit)(nil).execute(t) } args := t.Arch().SyscallArgs() return t.doSyscallEnter(sysno, args) } func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState { if next, ok := t.ptraceSyscallEnter(); ok { return next } return t.doSyscallInvoke(sysno, args) } // +stateify savable type runSyscallAfterSyscallEnterStop struct{} func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState { if sig := linux.Signal(t.ptraceCode); sig.IsValid() { t.tg.signalHandlers.mu.Lock() t.sendSignalLocked(SignalInfoPriv(sig), false /* group */) t.tg.signalHandlers.mu.Unlock() } if t.killed() { return (*runInterrupt)(nil) } sysno := t.Arch().SyscallNo() if sysno == ^uintptr(0) { return (*runSyscallExit)(nil) } args := t.Arch().SyscallArgs() return t.doSyscallInvoke(sysno, args) } // +stateify savable type runSyscallAfterSysemuStop struct{} func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState { if sig := linux.Signal(t.ptraceCode); sig.IsValid() { t.tg.signalHandlers.mu.Lock() t.sendSignalLocked(SignalInfoPriv(sig), false /* group */) t.tg.signalHandlers.mu.Unlock() } if t.killed() { return (*runInterrupt)(nil) } return (*runSyscallExit)(nil).execute(t) } func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState { rval, ctrl, err := t.executeSyscall(sysno, args) if ctrl != nil { if !ctrl.ignoreReturn { t.Arch().SetReturn(rval) } if ctrl.next != nil { return ctrl.next } } else if err != nil { t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) t.haveSyscallReturn = true } else { t.Arch().SetReturn(rval) } return (*runSyscallExit)(nil).execute(t) } // +stateify savable type runSyscallReinvoke struct{} func (*runSyscallReinvoke) execute(t *Task) taskRunState { if t.killed() { // It's possible that since the last execution, the task has // been forcible killed. Invoking the system call here could // result in an infinite loop if it is again preempted by an // external stop and reinvoked. return (*runInterrupt)(nil) } sysno := t.Arch().SyscallNo() args := t.Arch().SyscallArgs() return t.doSyscallInvoke(sysno, args) } // +stateify savable type runSyscallExit struct{} func (*runSyscallExit) execute(t *Task) taskRunState { t.ptraceSyscallExit() return (*runApp)(nil) } // doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as // indicated by an execution fault at address addr. doVsyscall returns the // task's next run state. func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState { metric.WeirdnessMetric.Increment(&metric.WeirdnessTypeVsyscallCount) // Grab the caller up front, to make sure there's a sensible stack. caller := t.Arch().Native(uintptr(0)) if _, err := caller.CopyIn(t, hostarch.Addr(t.Arch().Stack())); err != nil { t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return (*runApp)(nil) } // For _vsyscalls_, there is no need to translate System V calling convention // to syscall ABI because they both use RDI, RSI, and RDX for the first three // arguments and none of the vsyscalls uses more than two arguments. 
args := t.Arch().SyscallArgs() if t.seccomp.Load() != nil { switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r { case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP: t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller)) return (*runApp)(nil) case linux.SECCOMP_RET_ALLOW: // ok case linux.SECCOMP_RET_TRACE: t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller)) return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller} case linux.SECCOMP_RET_KILL_THREAD: t.Debugf("vsyscall %d: killed by seccomp", sysno) t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS)) return (*runExit)(nil) default: panic(fmt.Sprintf("Unknown seccomp result %d", r)) } } return t.doVsyscallInvoke(sysno, args, caller) } type runVsyscallAfterPtraceEventSeccomp struct { addr hostarch.Addr sysno uintptr caller marshal.Marshallable } func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { if t.killed() { return (*runInterrupt)(nil) } sysno := t.Arch().SyscallNo() // "... the syscall may not be changed to another system call using the // orig_rax register. It may only be changed to -1 order [sic] to skip the // currently emulated call. ... The tracer MUST NOT modify rip or rsp." - // Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip // causes do_exit(SIGSYS), and changing sp is ignored. if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr { t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS)) return (*runExit)(nil) } if sysno == ^uintptr(0) { return (*runApp)(nil) } return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller) } func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState { rval, ctrl, err := t.executeSyscall(sysno, args) if ctrl != nil { t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl) // Set the return value. The stack has already been adjusted. t.Arch().SetReturn(0) } else if err == nil { t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller)) // Set the return value. The stack has already been adjusted. t.Arch().SetReturn(uintptr(rval)) } else { t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err) if linuxerr.Equals(linuxerr.EFAULT, err) { t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) // A return is not emulated in this case. return (*runApp)(nil) } t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) } t.Arch().SetIP(t.Arch().Value(caller)) t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width())) return (*runApp)(nil) } // ExtractErrno extracts an integer error number from the error. // The syscall number is purely for context in the error case. Use -1 if // syscall number is unknown. func ExtractErrno(err error, sysno int) int { switch err := err.(type) { case nil: return 0 case unix.Errno: return int(err) case *errors.Error: return int(linuxerr.ToUnix(err)) case *memmap.BusError: // Bus errors may generate SIGBUS, but for syscalls they still // return EFAULT. See case in task_run.go where the fault is // handled (and the SIGBUS is delivered). 
return int(unix.EFAULT) case *os.PathError: return ExtractErrno(err.Err, sysno) case *os.LinkError: return ExtractErrno(err.Err, sysno) case *os.SyscallError: return ExtractErrno(err.Err, sysno) case *platform.ContextError: return int(err.Errno) default: if errno, ok := linuxerr.TranslateError(err); ok { return int(linuxerr.ToUnix(errno)) } } panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_usermem.go000066400000000000000000000321451465435605700252630ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/usermem" ) const iovecLength = 16 // MAX_RW_COUNT is the maximum size in bytes of a single read or write. // Reads and writes that exceed this size may be silently truncated. // (Linux: include/linux/fs.h:MAX_RW_COUNT) var MAX_RW_COUNT = int(hostarch.Addr(math.MaxInt32).RoundDown()) // Activate ensures that the task has an active address space. func (t *Task) Activate() { if mm := t.MemoryManager(); mm != nil { if err := mm.Activate(t); err != nil { panic("unable to activate mm: " + err.Error()) } } } // Deactivate relinquishes the task's active address space. func (t *Task) Deactivate() { if mm := t.MemoryManager(); mm != nil { mm.Deactivate() } } // CopyInBytes is a fast version of CopyIn if the caller can serialize the // data without reflection and pass in a byte slice. // // This Task's AddressSpace must be active. func (t *Task) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) { return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{ AddressSpaceActive: true, }) } // CopyOutBytes is a fast version of CopyOut if the caller can serialize the // data without reflection and pass in a byte slice. // // This Task's AddressSpace must be active. func (t *Task) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) { return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{ AddressSpaceActive: true, }) } // CopyInString copies a NUL-terminated string of length at most maxlen in from // the task's memory. The copy will fail with syscall.EFAULT if it traverses // user memory that is unmapped or not readable by the user. // // This Task's AddressSpace must be active. func (t *Task) CopyInString(addr hostarch.Addr, maxlen int) (string, error) { return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{ AddressSpaceActive: true, }) } // CopyInVector copies a NULL-terminated vector of strings from the task's // memory. The copy will fail with syscall.EFAULT if it traverses // user memory that is unmapped or not readable by the user. // // maxElemSize is the maximum size of each individual element. 
// // maxTotalSize is the maximum total length of all elements plus the total // number of elements. For example, the following strings correspond to // the following set of sizes: // // { "a", "b", "c" } => 6 (3 for lengths, 3 for elements) // { "abc" } => 4 (3 for length, 1 for elements) // // This Task's AddressSpace must be active. func (t *Task) CopyInVector(addr hostarch.Addr, maxElemSize, maxTotalSize int) ([]string, error) { var v []string for { argAddr := t.Arch().Native(0) if _, err := argAddr.CopyIn(t, addr); err != nil { return v, err } if t.Arch().Value(argAddr) == 0 { break } // Each string has a zero terminating byte counted, so copying out a string // requires at least one byte of space. Also, see the calculation below. if maxTotalSize <= 0 { return nil, linuxerr.ENOMEM } thisMax := maxElemSize if maxTotalSize < thisMax { thisMax = maxTotalSize } arg, err := t.CopyInString(hostarch.Addr(t.Arch().Value(argAddr)), thisMax) if err != nil { return v, err } v = append(v, arg) addr += hostarch.Addr(t.Arch().Width()) maxTotalSize -= len(arg) + 1 } return v, nil } // CopyOutIovecs converts src to an array of struct iovecs and copies it to the // memory mapped at addr for Task. // // Preconditions: Same as usermem.IO.CopyOut, plus: // - The caller must be running on the task goroutine. // - t's AddressSpace must be active. func (t *Task) CopyOutIovecs(addr hostarch.Addr, src hostarch.AddrRangeSeq) error { switch t.Arch().Width() { case 8: if _, ok := addr.AddLength(uint64(src.NumRanges()) * iovecLength); !ok { return linuxerr.EFAULT } b := t.CopyScratchBuffer(iovecLength) for ; !src.IsEmpty(); src = src.Tail() { ar := src.Head() hostarch.ByteOrder.PutUint64(b[0:8], uint64(ar.Start)) hostarch.ByteOrder.PutUint64(b[8:16], uint64(ar.Length())) if _, err := t.CopyOutBytes(addr, b); err != nil { return err } addr += iovecLength } default: return linuxerr.ENOSYS } return nil } // CopyInIovecs copies in IoVecs for Task. // // Preconditions: Same as usermem.IO.CopyIn, plus: // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRangeSeq, error) { // Special case to avoid allocating allocating a single hostaddr.AddrRange. if numIovecs == 1 { return copyInIovec(t, t, addr) } iovecs, err := copyInIovecs(t, t, addr, numIovecs) if err != nil { return hostarch.AddrRangeSeq{}, err } return hostarch.AddrRangeSeqFromSlice(iovecs), nil } // CopyInIovecsAsSlice copies in IoVecs and returns them in a slice. // // Preconditions: Same as usermem.IO.CopyIn, plus: // - The caller must be running on the task goroutine or hold t.mu. // - t's AddressSpace must be active. func (t *Task) CopyInIovecsAsSlice(addr hostarch.Addr, numIovecs int) ([]hostarch.AddrRange, error) { return copyInIovecs(t, t, addr, numIovecs) } func copyInIovec(ctx marshal.CopyContext, t *Task, addr hostarch.Addr) (hostarch.AddrRangeSeq, error) { if err := checkArch(t); err != nil { return hostarch.AddrRangeSeq{}, err } b := ctx.CopyScratchBuffer(iovecLength) ar, err := makeIovec(ctx, t, addr, b) if err != nil { return hostarch.AddrRangeSeq{}, err } return hostarch.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil } // copyInIovecs copies an array of numIovecs struct iovecs from the memory // mapped at addr, converts them to hostarch.AddrRanges, and returns them as a // hostarch.AddrRangeSeq. 
// // copyInIovecs shares the following properties with Linux's // lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector(): // // - If the length of any AddrRange would exceed the range of an ssize_t, // copyInIovecs returns EINVAL. // // - If the length of any AddrRange would cause its end to overflow, // copyInIovecs returns EFAULT. // // - If any AddrRange would include addresses outside the application address // range, copyInIovecs returns EFAULT. // // - The combined length of all AddrRanges is limited to MAX_RW_COUNT. If the // combined length of all AddrRanges would otherwise exceed this amount, ranges // beyond MAX_RW_COUNT are silently truncated. func copyInIovecs(ctx marshal.CopyContext, t *Task, addr hostarch.Addr, numIovecs int) ([]hostarch.AddrRange, error) { if err := checkArch(t); err != nil { return nil, err } if numIovecs == 0 { return nil, nil } var dst []hostarch.AddrRange if numIovecs > 1 { dst = make([]hostarch.AddrRange, 0, numIovecs) } if _, ok := addr.AddLength(uint64(numIovecs) * iovecLength); !ok { return nil, linuxerr.EFAULT } b := ctx.CopyScratchBuffer(iovecLength) for i := 0; i < numIovecs; i++ { ar, err := makeIovec(ctx, t, addr, b) if err != nil { return []hostarch.AddrRange{}, err } dst = append(dst, ar) addr += iovecLength } // Truncate to MAX_RW_COUNT. var total uint64 for i := range dst { dstlen := uint64(dst[i].Length()) if rem := uint64(MAX_RW_COUNT) - total; rem < dstlen { dst[i].End -= hostarch.Addr(dstlen - rem) dstlen = rem } total += dstlen } return dst, nil } func checkArch(t *Task) error { if t.Arch().Width() != 8 { return linuxerr.ENOSYS } return nil } func makeIovec(ctx marshal.CopyContext, t *Task, addr hostarch.Addr, b []byte) (hostarch.AddrRange, error) { if _, err := ctx.CopyInBytes(addr, b); err != nil { return hostarch.AddrRange{}, err } base := hostarch.Addr(hostarch.ByteOrder.Uint64(b[0:8])) length := hostarch.ByteOrder.Uint64(b[8:16]) if length > math.MaxInt64 { return hostarch.AddrRange{}, linuxerr.EINVAL } ar, ok := t.MemoryManager().CheckIORange(base, int64(length)) if !ok { return hostarch.AddrRange{}, linuxerr.EFAULT } return ar, nil } // SingleIOSequence returns a usermem.IOSequence representing [addr, // addr+length) in t's address space. If this contains addresses outside the // application address range, it returns EFAULT. If length exceeds // MAX_RW_COUNT, the range is silently truncated. // // SingleIOSequence is analogous to Linux's // lib/iov_iter.c:import_single_range(). (Note that the non-vectorized read and // write syscalls in Linux do not use import_single_range(). However they check // access_ok() in fs/read_write.c:vfs_read/vfs_write, and overflowing address // ranges are truncated to MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) func (t *Task) SingleIOSequence(addr hostarch.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { if length > MAX_RW_COUNT { length = MAX_RW_COUNT } ar, ok := t.MemoryManager().CheckIORange(addr, int64(length)) if !ok { return usermem.IOSequence{}, linuxerr.EFAULT } return usermem.IOSequence{ IO: t.MemoryManager(), Addrs: hostarch.AddrRangeSeqOf(ar), Opts: opts, }, nil } // IovecsIOSequence returns a usermem.IOSequence representing the array of // iovcnt struct iovecs at addr in t's address space. opts applies to the // returned IOSequence, not the reading of the struct iovec array. // // IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec(). // // Preconditions: Same as Task.CopyInIovecs. 
func (t *Task) IovecsIOSequence(addr hostarch.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { return usermem.IOSequence{}, linuxerr.EINVAL } ars, err := t.CopyInIovecs(addr, iovcnt) if err != nil { return usermem.IOSequence{}, err } return usermem.IOSequence{ IO: t.MemoryManager(), Addrs: ars, Opts: opts, }, nil } type taskCopyContext struct { ctx context.Context t *Task opts usermem.IOOpts } // CopyContext returns a marshal.CopyContext that copies to/from t's address // space using opts. func (t *Task) CopyContext(ctx context.Context, opts usermem.IOOpts) *taskCopyContext { return &taskCopyContext{ ctx: ctx, t: t, opts: opts, } } // CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. func (cc *taskCopyContext) CopyScratchBuffer(size int) []byte { if ctxTask, ok := cc.ctx.(*Task); ok { return ctxTask.CopyScratchBuffer(size) } return make([]byte, size) } func (cc *taskCopyContext) getMemoryManager() (*mm.MemoryManager, error) { tmm := cc.t.MemoryManager() if tmm == nil { return nil, linuxerr.ESRCH } if !tmm.IncUsers() { return nil, linuxerr.EFAULT } return tmm, nil } // CopyInBytes implements marshal.CopyContext.CopyInBytes. // // Preconditions: Same as usermem.IO.CopyIn, plus: // - The caller must be running on the task goroutine or hold the cc.t.mu // - t's AddressSpace must be active. func (cc *taskCopyContext) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) { tmm, err := cc.getMemoryManager() if err != nil { return 0, err } defer tmm.DecUsers(cc.ctx) return tmm.CopyIn(cc.ctx, addr, dst, cc.opts) } // CopyOutBytes implements marshal.CopyContext.CopyOutBytes. // // Preconditions: Same as usermem.IO.CopyOut, plus: // - The caller must be running on the task goroutine or hold the cc.t.mu // - t's AddressSpace must be active. func (cc *taskCopyContext) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) { tmm, err := cc.getMemoryManager() if err != nil { return 0, err } defer tmm.DecUsers(cc.ctx) return tmm.CopyOut(cc.ctx, addr, src, cc.opts) } type ownTaskCopyContext struct { t *Task opts usermem.IOOpts } // OwnCopyContext returns a marshal.CopyContext that copies to/from t's address // space using opts. The returned CopyContext may only be used by t's task // goroutine. // // Since t already implements marshal.CopyContext, this is only needed to // override the usermem.IOOpts used for the copy. func (t *Task) OwnCopyContext(opts usermem.IOOpts) *ownTaskCopyContext { return &ownTaskCopyContext{ t: t, opts: opts, } } // CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. func (cc *ownTaskCopyContext) CopyScratchBuffer(size int) []byte { return cc.t.CopyScratchBuffer(size) } // CopyInBytes implements marshal.CopyContext.CopyInBytes. func (cc *ownTaskCopyContext) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) { return cc.t.MemoryManager().CopyIn(cc.t, addr, dst, cc.opts) } // CopyOutBytes implements marshal.CopyContext.CopyOutBytes. func (cc *ownTaskCopyContext) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) { return cc.t.MemoryManager().CopyOut(cc.t, addr, src, cc.opts) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_work.go000066400000000000000000000025331465435605700245660ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel // TaskWorker is a deferred task. // // This must be savable. type TaskWorker interface { // TaskWork will be executed prior to returning to user space. Note that // TaskWork may call RegisterWork again, but this will not be executed until // the next return to user space, unlike in Linux. This effectively allows // registration of indefinite user return hooks, but not by default. TaskWork(t *Task) } // RegisterWork can be used to register additional task work that will be // performed prior to returning to user space. See TaskWorker.TaskWork for // semantics regarding registration. func (t *Task) RegisterWork(work TaskWorker) { t.taskWorkMu.Lock() defer t.taskWorkMu.Unlock() t.taskWorkCount.Add(1) t.taskWork = append(t.taskWork, work) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/task_work_mutex.go000066400000000000000000000032061465435605700260060ustar00rootroot00000000000000package kernel import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type taskWorkMutex struct { mu sync.Mutex } var taskWorkprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var taskWorklockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type taskWorklockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *taskWorkMutex) Lock() { locking.AddGLock(taskWorkprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *taskWorkMutex) NestedLock(i taskWorklockNameIndex) { locking.AddGLock(taskWorkprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *taskWorkMutex) Unlock() { locking.DelGLock(taskWorkprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *taskWorkMutex) NestedUnlock(i taskWorklockNameIndex) { locking.DelGLock(taskWorkprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func taskWorkinitLockNames() {} func init() { taskWorkinitLockNames() taskWorkprefixIndex = locking.NewMutexClass(reflect.TypeOf(taskWorkMutex{}), taskWorklockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/taskset_mutex.go000066400000000000000000000045631465435605700254670ustar00rootroot00000000000000package kernel import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type taskSetRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var taskSetlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. 
// Values are specified using the "consts" field of go_template_instance. type taskSetlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *taskSetRWMutex) Lock() { locking.AddGLock(taskSetprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *taskSetRWMutex) NestedLock(i taskSetlockNameIndex) { locking.AddGLock(taskSetprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *taskSetRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(taskSetprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *taskSetRWMutex) NestedUnlock(i taskSetlockNameIndex) { m.mu.Unlock() locking.DelGLock(taskSetprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *taskSetRWMutex) RLock() { locking.AddGLock(taskSetprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *taskSetRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(taskSetprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *taskSetRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *taskSetRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *taskSetRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var taskSetprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func taskSetinitLockNames() {} func init() { taskSetinitLockNames() taskSetprefixIndex = locking.NewMutexClass(reflect.TypeOf(taskSetRWMutex{}), taskSetlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/thread_group.go000066400000000000000000000515521465435605700252520ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( goContext "context" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" ) // A ThreadGroup is a logical grouping of tasks that has widespread // significance to other kernel features (e.g. signal handling). ("Thread // groups" are usually called "processes" in userspace documentation.) // // ThreadGroup is a superset of Linux's struct signal_struct. // // +stateify savable type ThreadGroup struct { threadGroupNode // signalHandlers is the set of signal handlers used by every task in this // thread group. 
(signalHandlers may also be shared with other thread // groups.) // // signalHandlers.mu (hereafter "the signal mutex") protects state related // to signal handling, as well as state that usually needs to be atomic // with signal handling, for all ThreadGroups and Tasks using // signalHandlers. (This is analogous to Linux's use of struct // sighand_struct::siglock.) // // The signalHandlers pointer can only be mutated during an execve // (Task.finishExec). Consequently, when it's possible for a task in the // thread group to be completing an execve, signalHandlers is protected by // the owning TaskSet.mu. Otherwise, it is possible to read the // signalHandlers pointer without synchronization. In particular, // completing an execve requires that all other tasks in the thread group // have exited, so task goroutines do not need the owning TaskSet.mu to // read the signalHandlers pointer of their thread groups. signalHandlers *SignalHandlers // pendingSignals is the set of pending signals that may be handled by any // task in this thread group. // // pendingSignals is protected by the signal mutex. pendingSignals pendingSignals // If groupStopDequeued is true, a task in the thread group has dequeued a // stop signal, but has not yet initiated the group stop. // // groupStopDequeued is analogous to Linux's JOBCTL_STOP_DEQUEUED. // // groupStopDequeued is protected by the signal mutex. groupStopDequeued bool // groupStopSignal is the signal that caused a group stop to be initiated. // // groupStopSignal is protected by the signal mutex. groupStopSignal linux.Signal // groupStopPendingCount is the number of active tasks in the thread group // for which Task.groupStopPending is set. // // groupStopPendingCount is analogous to Linux's // signal_struct::group_stop_count. // // groupStopPendingCount is protected by the signal mutex. groupStopPendingCount int // If groupStopComplete is true, groupStopPendingCount transitioned from // non-zero to zero without an intervening SIGCONT. // // groupStopComplete is analogous to Linux's SIGNAL_STOP_STOPPED. // // groupStopComplete is protected by the signal mutex. groupStopComplete bool // If groupStopWaitable is true, the thread group is indicating a waitable // group stop event (as defined by EventChildGroupStop). // // Linux represents the analogous state as SIGNAL_STOP_STOPPED being set // and group_exit_code being non-zero. // // groupStopWaitable is protected by the signal mutex. groupStopWaitable bool // If groupContNotify is true, then a SIGCONT has recently ended a group // stop on this thread group, and the first task to observe it should // notify its parent. groupContInterrupted is true iff SIGCONT ended an // incomplete group stop. If groupContNotify is false, groupContInterrupted is // meaningless. // // Analogues in Linux: // // - groupContNotify && groupContInterrupted is represented by // SIGNAL_CLD_STOPPED. // // - groupContNotify && !groupContInterrupted is represented by // SIGNAL_CLD_CONTINUED. // // - !groupContNotify is represented by neither flag being set. // // groupContNotify and groupContInterrupted are protected by the signal // mutex. groupContNotify bool groupContInterrupted bool // If groupContWaitable is true, the thread group is indicating a waitable // continue event (as defined by EventGroupContinue). // // groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED. // // groupContWaitable is protected by the signal mutex. groupContWaitable bool // exiting is true if all tasks in the ThreadGroup should exit. 
exiting is // analogous to Linux's SIGNAL_GROUP_EXIT. // // exiting is protected by the signal mutex. exiting can only transition // from false to true. exiting bool // exitStatus is the thread group's exit status. // // While exiting is false, exitStatus is protected by the signal mutex. // When exiting becomes true, exitStatus becomes immutable. exitStatus linux.WaitStatus // terminationSignal is the signal that this thread group's leader will // send to its parent when it exits. // // terminationSignal is protected by the TaskSet mutex. terminationSignal linux.Signal // liveGoroutines is the number of non-exited task goroutines in the thread // group. // // liveGoroutines is not saved; it is reset as task goroutines are // restarted by Task.Start. liveGoroutines sync.WaitGroup `state:"nosave"` timerMu threadGroupTimerMutex `state:"nosave"` // itimerRealTimer implements ITIMER_REAL for the thread group. itimerRealTimer *ktime.Timer // itimerVirtSetting is the ITIMER_VIRTUAL setting for the thread group. // // itimerVirtSetting is protected by the signal mutex. itimerVirtSetting ktime.Setting // itimerProfSetting is the ITIMER_PROF setting for the thread group. // // itimerProfSetting is protected by the signal mutex. itimerProfSetting ktime.Setting // rlimitCPUSoftSetting is the setting for RLIMIT_CPU soft limit // notifications for the thread group. // // rlimitCPUSoftSetting is protected by the signal mutex. rlimitCPUSoftSetting ktime.Setting // cpuTimersEnabled is non-zero if itimerVirtSetting.Enabled is true, // itimerProfSetting.Enabled is true, rlimitCPUSoftSetting.Enabled is true, // or limits.Get(CPU) is finite. // // cpuTimersEnabled is protected by the signal mutex. cpuTimersEnabled atomicbitops.Uint32 // timers is the thread group's POSIX interval timers. nextTimerID is the // TimerID at which allocation should begin searching for an unused ID. // // timers and nextTimerID are protected by timerMu. timers map[linux.TimerID]*IntervalTimer nextTimerID linux.TimerID // exitedCPUStats is the CPU usage for all exited tasks in the thread // group. exitedCPUStats is protected by the TaskSet mutex. exitedCPUStats usage.CPUStats // childCPUStats is the CPU usage of all joined descendants of this thread // group. childCPUStats is protected by the TaskSet mutex. childCPUStats usage.CPUStats // ioUsage is the I/O usage for all exited tasks in the thread group. // The ioUsage pointer is immutable. ioUsage *usage.IO // maxRSS is the historical maximum resident set size of the thread group, updated when: // // - A task in the thread group exits, since after all tasks have // exited the MemoryManager is no longer reachable. // // - The thread group completes an execve, since this changes // MemoryManagers. // // maxRSS is protected by the TaskSet mutex. maxRSS uint64 // childMaxRSS is the maximum resident set size in bytes of all joined // descendants of this thread group. // // childMaxRSS is protected by the TaskSet mutex. childMaxRSS uint64 // Resource limits for this ThreadGroup. The limits pointer is immutable. limits *limits.LimitSet // processGroup is the processGroup for this thread group. // // processGroup is protected by the TaskSet mutex. processGroup *ProcessGroup // execed indicates an exec has occurred since creation. This will be // set by finishExec, and new TheadGroups will have this field cleared. // When execed is set, the processGroup may no longer be changed. // // execed is protected by the TaskSet mutex. 
execed bool // oldRSeqCritical is the thread group's old rseq critical region. oldRSeqCritical atomic.Pointer[OldRSeqCriticalRegion] `state:".(*OldRSeqCriticalRegion)"` // tty is the thread group's controlling terminal. If nil, there is no // controlling terminal. // // tty is protected by the signal mutex. tty *TTY // oomScoreAdj is the thread group's OOM score adjustment. This is // currently not used but is maintained for consistency. // TODO(gvisor.dev/issue/1967) oomScoreAdj atomicbitops.Int32 // isChildSubreaper and hasChildSubreaper correspond to Linux's // signal_struct::is_child_subreaper and has_child_subreaper. // // Both fields are protected by the TaskSet mutex. // // Quoting from signal.h: // "PR_SET_CHILD_SUBREAPER marks a process, like a service manager, to // re-parent orphan (double-forking) child processes to this process // instead of 'init'. The service manager is able to receive SIGCHLD // signals and is able to investigate the process until it calls // wait(). All children of this process will inherit a flag if they // should look for a child_subreaper process at exit" isChildSubreaper bool hasChildSubreaper bool } // NewThreadGroup returns a new, empty thread group in PID namespace pidns. The // thread group leader will send its parent terminationSignal when it exits. // The new thread group isn't visible to the system until a task has been // created inside of it by a successful call to TaskSet.NewTask. func (k *Kernel) NewThreadGroup(pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup { tg := &ThreadGroup{ threadGroupNode: threadGroupNode{ pidns: pidns, }, signalHandlers: sh, terminationSignal: terminationSignal, ioUsage: &usage.IO{}, limits: limits, } tg.itimerRealTimer = ktime.NewTimer(k.timekeeper.monotonicClock, &itimerRealListener{tg: tg}) tg.timers = make(map[linux.TimerID]*IntervalTimer) tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) return tg } // saveOldRSeqCritical is invoked by stateify. func (tg *ThreadGroup) saveOldRSeqCritical() *OldRSeqCriticalRegion { return tg.oldRSeqCritical.Load() } // loadOldRSeqCritical is invoked by stateify. func (tg *ThreadGroup) loadOldRSeqCritical(_ goContext.Context, r *OldRSeqCriticalRegion) { tg.oldRSeqCritical.Store(r) } // SignalHandlers returns the signal handlers used by tg. // // Preconditions: The caller must provide the synchronization required to read // tg.signalHandlers, as described in the field's comment. func (tg *ThreadGroup) SignalHandlers() *SignalHandlers { return tg.signalHandlers } // Limits returns tg's limits. func (tg *ThreadGroup) Limits() *limits.LimitSet { return tg.limits } // Release releases the thread group's resources. func (tg *ThreadGroup) Release(ctx context.Context) { // Timers must be destroyed without holding the TaskSet or signal mutexes // since timers send signals with Timer.mu locked. tg.itimerRealTimer.Destroy() var its []*IntervalTimer tg.pidns.owner.mu.Lock() tg.signalHandlers.mu.Lock() for _, it := range tg.timers { its = append(its, it) } clear(tg.timers) // nil maps can't be saved // Disassociate from the tty if we have one. if tg.tty != nil { tg.tty.mu.Lock() if tg.tty.tg == tg { tg.tty.tg = nil } tg.tty.mu.Unlock() tg.tty = nil } tg.signalHandlers.mu.Unlock() tg.pidns.owner.mu.Unlock() for _, it := range its { it.DestroyTimer() } } // forEachChildThreadGroupLocked indicates over all child ThreadGroups. // // Precondition: TaskSet.mu must be held. 
func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) { tg.walkDescendantThreadGroupsLocked(func(child *ThreadGroup) bool { fn(child) // Don't recurse below the immediate children. return false }) } // walkDescendantThreadGroupsLocked recursively walks all descendent // ThreadGroups and executes the visitor function. If visitor returns false for // a given ThreadGroup, then that ThreadGroups descendants are excluded from // further iteration. // // This corresponds to Linux's walk_process_tree. // // Precondition: TaskSet.mu must be held. func (tg *ThreadGroup) walkDescendantThreadGroupsLocked(visitor func(*ThreadGroup) bool) { for t := tg.tasks.Front(); t != nil; t = t.Next() { for child := range t.children { if child == child.tg.leader { if !visitor(child.tg) { // Don't recurse below child. continue } child.tg.walkDescendantThreadGroupsLocked(visitor) } } } } // SetControllingTTY sets tty as the controlling terminal of tg. func (tg *ThreadGroup) SetControllingTTY(tty *TTY, steal bool, isReadable bool) error { tty.mu.Lock() defer tty.mu.Unlock() // We might be asked to set the controlling terminal of multiple // processes, so we lock both the TaskSet and SignalHandlers. tg.pidns.owner.mu.Lock() defer tg.pidns.owner.mu.Unlock() tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() // "The calling process must be a session leader and not have a // controlling terminal already." - tty_ioctl(4) if tg.processGroup.session.leader != tg { return linuxerr.EINVAL } if tg.tty == tty { return nil } else if tg.tty != nil { return linuxerr.EINVAL } creds := auth.CredentialsFromContext(tg.leader) hasAdmin := creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) // "If this terminal is already the controlling terminal of a different // session group, then the ioctl fails with EPERM, unless the caller // has the CAP_SYS_ADMIN capability and arg equals 1, in which case the // terminal is stolen, and all processes that had it as controlling // terminal lose it." - tty_ioctl(4) if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session { // Stealing requires CAP_SYS_ADMIN in the root user namespace. if !hasAdmin || !steal { return linuxerr.EPERM } // Steal the TTY away. Unlike TIOCNOTTY, don't send signals. for othertg := range tg.pidns.owner.Root.tgids { // This won't deadlock by locking tg.signalHandlers // because at this point: // - We only lock signalHandlers if it's in the same // session as the tty's controlling thread group. // - We know that the calling thread group is not in // the same session as the tty's controlling thread // group. if othertg.processGroup.session == tty.tg.processGroup.session { othertg.signalHandlers.mu.NestedLock(signalHandlersLockTg) othertg.tty = nil othertg.signalHandlers.mu.NestedUnlock(signalHandlersLockTg) } } } if !isReadable && !hasAdmin { return linuxerr.EPERM } // Set the controlling terminal and foreground process group. tg.tty = tty tg.processGroup.session.foreground = tg.processGroup // Set this as the controlling process of the terminal. tty.tg = tg return nil } // ReleaseControllingTTY gives up tty as the controlling tty of tg. func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error { tty.mu.Lock() defer tty.mu.Unlock() // We might be asked to set the controlling terminal of multiple // processes, so we lock both the TaskSet and SignalHandlers. tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() // Just below, we may re-lock signalHandlers in order to send signals. 
// Thus we can't defer Unlock here. tg.signalHandlers.mu.Lock() if tg.tty == nil || tg.tty != tty { tg.signalHandlers.mu.Unlock() return linuxerr.ENOTTY } // "If the process was session leader, then send SIGHUP and SIGCONT to // the foreground process group and all processes in the current // session lose their controlling terminal." - tty_ioctl(4) // Remove tty as the controlling tty for each process in the session, // then send them SIGHUP and SIGCONT. // If we're not the session leader, we don't have to do much. if tty.tg != tg { tg.tty = nil tg.signalHandlers.mu.Unlock() return nil } tg.signalHandlers.mu.Unlock() // We're the session leader. SIGHUP and SIGCONT the foreground process // group and remove all controlling terminals in the session. var lastErr error for othertg := range tg.pidns.owner.Root.tgids { if othertg.processGroup.session == tg.processGroup.session { othertg.signalHandlers.mu.Lock() othertg.tty = nil if othertg.processGroup == tg.processGroup.session.foreground { if err := othertg.leader.sendSignalLocked(&linux.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil { lastErr = err } if err := othertg.leader.sendSignalLocked(&linux.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil { lastErr = err } } othertg.signalHandlers.mu.Unlock() } } return lastErr } // ForegroundProcessGroupID returns the foreground process group ID of the // thread group. func (tg *ThreadGroup) ForegroundProcessGroupID(tty *TTY) (ProcessGroupID, error) { tty.mu.Lock() defer tty.mu.Unlock() tg.pidns.owner.mu.Lock() defer tg.pidns.owner.mu.Unlock() tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() // fd must refer to the controlling terminal of the calling process. // See tcgetpgrp(3) if tg.tty != tty { return 0, linuxerr.ENOTTY } return tg.processGroup.session.foreground.id, nil } // SetForegroundProcessGroupID sets the foreground process group of tty to // pgid. func (tg *ThreadGroup) SetForegroundProcessGroupID(tty *TTY, pgid ProcessGroupID) error { tty.mu.Lock() defer tty.mu.Unlock() tg.pidns.owner.mu.Lock() defer tg.pidns.owner.mu.Unlock() tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() // tty must be the controlling terminal. if tg.tty != tty { return linuxerr.ENOTTY } // pgid must be positive. if pgid < 0 { return linuxerr.EINVAL } // pg must not be empty. Empty process groups are removed from their // pid namespaces. pg, ok := tg.pidns.processGroups[pgid] if !ok { return linuxerr.ESRCH } // pg must be part of this process's session. if tg.processGroup.session != pg.session { return linuxerr.EPERM } signalAction := tg.signalHandlers.actions[linux.SIGTTOU] // If the calling process is a member of a background group, a SIGTTOU // signal is sent to all members of this background process group. // We need also need to check whether it is ignoring or blocking SIGTTOU. ignored := signalAction.Handler == linux.SIG_IGN blocked := (linux.SignalSet(tg.leader.signalMask.RacyLoad()) & linux.SignalSetOf(linux.SIGTTOU)) != 0 if tg.processGroup.id != tg.processGroup.session.foreground.id && !ignored && !blocked { tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGTTOU), true) return linuxerr.ERESTARTSYS } tg.processGroup.session.foreground = pg return nil } // SetChildSubreaper marks this ThreadGroup sets the isChildSubreaper field on // this ThreadGroup, and marks all child ThreadGroups as having a subreaper. 
// Recursion stops if we find another subreaper process, which is either a // ThreadGroup with isChildSubreaper bit set, or a ThreadGroup with PID=1 // inside a PID namespace. func (tg *ThreadGroup) SetChildSubreaper(isSubreaper bool) { ts := tg.TaskSet() ts.mu.Lock() defer ts.mu.Unlock() tg.isChildSubreaper = isSubreaper tg.walkDescendantThreadGroupsLocked(func(child *ThreadGroup) bool { // Is this child PID 1 in its PID namespace, or already a // subreaper? if child.isInitInLocked(child.PIDNamespace()) || child.isChildSubreaper { // Don't set hasChildSubreaper, and don't recurse. return false } child.hasChildSubreaper = isSubreaper return true // Recurse. }) } // IsChildSubreaper returns whether this ThreadGroup is a child subreaper. func (tg *ThreadGroup) IsChildSubreaper() bool { ts := tg.TaskSet() ts.mu.RLock() defer ts.mu.RUnlock() return tg.isChildSubreaper } // IsInitIn returns whether this ThreadGroup has TID 1 int the given // PIDNamespace. func (tg *ThreadGroup) IsInitIn(pidns *PIDNamespace) bool { ts := tg.TaskSet() ts.mu.RLock() defer ts.mu.RUnlock() return tg.isInitInLocked(pidns) } // isInitInLocked returns whether this ThreadGroup has TID 1 in the given // PIDNamespace. // // Preconditions: TaskSet.mu must be locked. func (tg *ThreadGroup) isInitInLocked(pidns *PIDNamespace) bool { return pidns.tgids[tg] == initTID } // itimerRealListener implements ktime.Listener for ITIMER_REAL expirations. // // +stateify savable type itimerRealListener struct { tg *ThreadGroup } // NotifyTimer implements ktime.TimerListener.NotifyTimer. func (l *itimerRealListener) NotifyTimer(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { l.tg.SendSignal(SignalInfoPriv(linux.SIGALRM)) return ktime.Setting{}, false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/thread_group_timer_mutex.go000066400000000000000000000034361465435605700276720ustar00rootroot00000000000000package kernel import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type threadGroupTimerMutex struct { mu sync.Mutex } var threadGroupTimerprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var threadGroupTimerlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type threadGroupTimerlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *threadGroupTimerMutex) Lock() { locking.AddGLock(threadGroupTimerprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *threadGroupTimerMutex) NestedLock(i threadGroupTimerlockNameIndex) { locking.AddGLock(threadGroupTimerprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *threadGroupTimerMutex) Unlock() { locking.DelGLock(threadGroupTimerprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *threadGroupTimerMutex) NestedUnlock(i threadGroupTimerlockNameIndex) { locking.DelGLock(threadGroupTimerprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. 
func threadGroupTimerinitLockNames() {} func init() { threadGroupTimerinitLockNames() threadGroupTimerprefixIndex = locking.NewMutexClass(reflect.TypeOf(threadGroupTimerMutex{}), threadGroupTimerlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/threads.go000066400000000000000000000437551465435605700242270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // TasksLimit is the maximum number of threads for untrusted application. // Linux doesn't really limit this directly, rather it is limited by total // memory size, stacks allocated and a global maximum. There's no real reason // for us to limit it either, (esp. since threads are backed by go routines), // and we would expect to hit resource limits long before hitting this number. // However, for correctness, we still check that the user doesn't exceed this // number. // // Note that because of the way futexes are implemented, there *are* in fact // serious restrictions on valid thread IDs. They are limited to 2^30 - 1 // (kernel/fork.c:MAX_THREADS). const TasksLimit = (1 << 16) // ThreadID is a generic thread identifier. // // +marshal type ThreadID int32 // String returns a decimal representation of the ThreadID. func (tid ThreadID) String() string { return fmt.Sprintf("%d", tid) } // initTID is the TID given to the first task added to each PID namespace. The // thread group led by initTID is called the namespace's init process. The // death of a PID namespace's init process causes all tasks visible in that // namespace to be killed. const initTID ThreadID = 1 // A TaskSet comprises all tasks in a system. // // +stateify savable type TaskSet struct { // mu protects all relationships between tasks and thread groups in the // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) mu taskSetRWMutex `state:"nosave"` // Root is the root PID namespace, in which all tasks in the TaskSet are // visible. The Root pointer is immutable. Root *PIDNamespace // sessions is the set of all sessions. sessions sessionList // stopCount is the number of active external stops applicable to all tasks // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been // paired with a call to TaskSet.EndExternalStop). stopCount is protected // by mu. // // stopCount is not saved for the same reason as Task.stopCount; it is // always reset to zero after restore. stopCount int32 `state:"nosave"` // liveGoroutines is the number of non-exited task goroutines in the // TaskSet. // // liveGoroutines is not saved; it is reset as task goroutines are // restarted by Task.Start. liveGoroutines sync.WaitGroup `state:"nosave"` // runningGoroutines is the number of running task goroutines in the // TaskSet. 
// // runningGoroutines is not saved; its counter value is required to be zero // at time of save (but note that this is not necessarily the same thing as // sync.WaitGroup's zero value). runningGoroutines sync.WaitGroup `state:"nosave"` // aioGoroutines is the number of goroutines running async I/O // callbacks. // // aioGoroutines is not saved but is required to be zero at the time of // save. aioGoroutines sync.WaitGroup `state:"nosave"` } // newTaskSet returns a new, empty TaskSet. func newTaskSet(pidns *PIDNamespace) *TaskSet { ts := &TaskSet{Root: pidns} pidns.owner = ts return ts } // ForEachThreadGroup applies f to each thread group in ts. func (ts *TaskSet) ForEachThreadGroup(f func(tg *ThreadGroup)) { ts.mu.RLock() defer ts.mu.RUnlock() ts.forEachThreadGroupLocked(f) } // forEachThreadGroupLocked applies f to each thread group in ts. // // Preconditions: ts.mu must be locked (for reading or writing). func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { for tg := range ts.Root.tgids { f(tg) } } // forEachTaskLocked applies f to each Task in ts. // // Preconditions: ts.mu must be locked (for reading or writing). func (ts *TaskSet) forEachTaskLocked(f func(t *Task)) { for t := range ts.Root.tids { f(t) } } // A PIDNamespace represents a PID namespace, a bimap between thread IDs and // tasks. See the pid_namespaces(7) man page for further details. // // N.B. A task is said to be visible in a PID namespace if the PID namespace // contains a thread ID that maps to that task. // // +stateify savable type PIDNamespace struct { // owner is the TaskSet that this PID namespace belongs to. The owner // pointer is immutable. owner *TaskSet // parent is the PID namespace of the process that created this one. If // this is the root PID namespace, parent is nil. The parent pointer is // immutable. // // Invariant: All tasks that are visible in this namespace are also visible // in all ancestor namespaces. parent *PIDNamespace // userns is the user namespace with which this PID namespace is // associated. Privileged operations on this PID namespace must have // appropriate capabilities in userns. The userns pointer is immutable. userns *auth.UserNamespace // id is a unique ID assigned to the PID namespace. id is immutable. id uint64 // The following fields are protected by owner.mu. // last is the last ThreadID to be allocated in this namespace. last ThreadID // tasks is a mapping from ThreadIDs in this namespace to tasks visible in // the namespace. tasks map[ThreadID]*Task // tids is a mapping from tasks visible in this namespace to their // identifiers in this namespace. tids map[*Task]ThreadID // tgids is a mapping from thread groups visible in this namespace to // their identifiers in this namespace. // // The content of tgids is equivalent to tids[tg.leader]. This exists // primarily as an optimization to quickly find all thread groups. tgids map[*ThreadGroup]ThreadID // sessions is a mapping from SessionIDs in this namespace to sessions // visible in the namespace. sessions map[SessionID]*Session // sids is a mapping from sessions visible in this namespace to their // identifiers in this namespace. sids map[*Session]SessionID // processGroups is a mapping from ProcessGroupIDs in this namespace to // process groups visible in the namespace. processGroups map[ProcessGroupID]*ProcessGroup // pgids is a mapping from process groups visible in this namespace to // their identifiers in this namespace. 
pgids map[*ProcessGroup]ProcessGroupID // exiting indicates that the namespace's init process is exiting or has // exited. exiting bool // pidNamespaceData contains additional per-PID-namespace data. extra pidNamespaceData } func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace { return &PIDNamespace{ owner: ts, parent: parent, userns: userns, id: lastPIDNSID.Add(1), tasks: make(map[ThreadID]*Task), tids: make(map[*Task]ThreadID), tgids: make(map[*ThreadGroup]ThreadID), sessions: make(map[SessionID]*Session), sids: make(map[*Session]SessionID), processGroups: make(map[ProcessGroupID]*ProcessGroup), pgids: make(map[*ProcessGroup]ProcessGroupID), extra: newPIDNamespaceData(), } } // lastPIDNSID is the last value of PIDNamespace.ID assigned to a PID // namespace. // // This is global rather than being per-TaskSet or Kernel because // NewRootPIDNamespace() is called before the Kernel is initialized. var lastPIDNSID atomicbitops.Uint64 // NewRootPIDNamespace creates the root PID namespace. 'owner' is not available // yet when root namespace is created and must be set by caller. func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace { return newPIDNamespace(nil, nil, userns) } // NewChild returns a new, empty PID namespace that is a child of ns. Authority // over the new PID namespace is controlled by userns. func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace { return newPIDNamespace(ns.owner, ns, userns) } // TaskWithID returns the task with thread ID tid in PID namespace ns. If no // task has that TID, TaskWithID returns nil. func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { ns.owner.mu.RLock() t := ns.tasks[tid] ns.owner.mu.RUnlock() return t } // ID returns a non-zero ID that is unique across PID namespaces. func (ns *PIDNamespace) ID() uint64 { return ns.id } // ThreadGroupWithID returns the thread group led by the task with thread ID // tid in PID namespace ns. If no task has that TID, or if the task with that // TID is not a thread group leader, ThreadGroupWithID returns nil. func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() t := ns.tasks[tid] if t == nil { return nil } if t != t.tg.leader { return nil } return t.tg } // IDOfTask returns the TID assigned to the given task in PID namespace ns. If // the task is not visible in that namespace, IDOfTask returns 0. (This return // value is significant in some cases, e.g. getppid() is documented as // returning 0 if the caller's parent is in an ancestor namespace and // consequently not visible to the caller.) If the task is nil, IDOfTask returns // 0. func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID { ns.owner.mu.RLock() id := ns.tids[t] ns.owner.mu.RUnlock() return id } // IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns. // If the task is not visible in that namespace, IDOfThreadGroup returns 0. func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID { ns.owner.mu.RLock() id := ns.tgids[tg] ns.owner.mu.RUnlock() return id } // Tasks returns a snapshot of the tasks in ns. func (ns *PIDNamespace) Tasks() []*Task { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() tasks := make([]*Task, 0, len(ns.tasks)) for t := range ns.tids { tasks = append(tasks, t) } return tasks } // NumTasks returns the number of tasks in ns. 
func (ns *PIDNamespace) NumTasks() int { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() return len(ns.tids) } // NumTasksPerContainer returns the number of tasks in ns that belongs to given container. func (ns *PIDNamespace) NumTasksPerContainer(cid string) int { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() tasks := 0 for t := range ns.tids { if t.ContainerID() == cid { tasks++ } } return tasks } // ThreadGroups returns a snapshot of the thread groups in ns. func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { return ns.ThreadGroupsAppend(nil) } // ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs. func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() for tg := range ns.tgids { tgs = append(tgs, tg) } return tgs } // UserNamespace returns the user namespace associated with PID namespace ns. func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace { return ns.userns } // Root returns the root PID namespace of ns. func (ns *PIDNamespace) Root() *PIDNamespace { return ns.owner.Root } // A threadGroupNode defines the relationship between a thread group and the // rest of the system. Conceptually, threadGroupNode is data belonging to the // owning TaskSet, as if TaskSet contained a field `nodes // map[*ThreadGroup]*threadGroupNode`. However, for practical reasons, // threadGroupNode is embedded in the ThreadGroup it represents. // (threadGroupNode is an anonymous field in ThreadGroup; this is to expose // threadGroupEntry's methods on ThreadGroup to make it implement // threadGroupLinker.) // // +stateify savable type threadGroupNode struct { // pidns is the PID namespace containing the thread group and all of its // member tasks. The pidns pointer is immutable. pidns *PIDNamespace // pidWithinNS the thread ID of the leader of this thread group within pidns. // Useful to avoid using locks when determining a thread group leader's own // TID. pidWithinNS atomicbitops.Int32 // eventQueue is notified whenever a event of interest to Task.Wait occurs // in a child of this thread group, or a ptrace tracee of a task in this // thread group. Events are defined in task_exit.go. eventQueue waiter.Queue // leader is the thread group's leader, which is the oldest task in the // thread group; usually the last task in the thread group to call // execve(), or if no such task exists then the first task in the thread // group, which was created by a call to fork() or clone() without // CLONE_THREAD. Once a thread group has been made visible to the rest of // the system by TaskSet.newTask, leader is never nil. // // Note that it's possible for the leader to exit without causing the rest // of the thread group to exit; in such a case, leader will still be valid // and non-nil, but leader will not be in tasks. // // leader is protected by the TaskSet mutex. leader *Task // If execing is not nil, it is a task in the thread group that has killed // all other tasks so that it can become the thread group leader and // perform an execve. (execing may already be the thread group leader.) // // execing is analogous to Linux's signal_struct::group_exit_task. // // execing is protected by the TaskSet mutex. execing *Task // tasks is all tasks in the thread group that have not yet been reaped. // // tasks is protected by both the TaskSet mutex and the signal mutex: // Mutating tasks requires locking the TaskSet mutex for writing *and* // locking the signal mutex. 
Reading tasks requires locking the TaskSet // mutex *or* locking the signal mutex. tasks taskList // tasksCount is the number of tasks in the thread group that have not yet // been reaped; equivalently, tasksCount is the number of tasks in tasks. // // tasksCount is protected by both the TaskSet mutex and the signal mutex, // as with tasks. tasksCount int // liveTasks is the number of tasks in the thread group that have not yet // reached TaskExitZombie. // // liveTasks is protected by the TaskSet mutex (NOT the signal mutex). liveTasks int // activeTasks is the number of tasks in the thread group that have not yet // reached TaskExitInitiated. // // activeTasks is protected by both the TaskSet mutex and the signal mutex, // as with tasks. activeTasks int } // PIDNamespace returns the PID namespace containing tg. func (tg *ThreadGroup) PIDNamespace() *PIDNamespace { return tg.pidns } // TaskSet returns the TaskSet containing tg. func (tg *ThreadGroup) TaskSet() *TaskSet { return tg.pidns.owner } // Leader returns tg's leader. func (tg *ThreadGroup) Leader() *Task { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() return tg.leader } // Count returns the number of non-exited threads in the group. func (tg *ThreadGroup) Count() int { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() var count int for t := tg.tasks.Front(); t != nil; t = t.Next() { count++ } return count } // MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for // all tasks in tg. func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() var tasks []ThreadID for t := tg.tasks.Front(); t != nil; t = t.Next() { if id, ok := pidns.tids[t]; ok { tasks = append(tasks, id) } } return tasks } // ID returns tg's leader's thread ID in its own PID namespace. // If tg's leader is dead, ID returns 0. func (tg *ThreadGroup) ID() ThreadID { return ThreadID(tg.pidWithinNS.Load()) } // A taskNode defines the relationship between a task and the rest of the // system. The comments on threadGroupNode also apply to taskNode. // // +stateify savable type taskNode struct { // tg is the thread group that this task belongs to. The tg pointer is // immutable. tg *ThreadGroup `state:"wait"` // taskEntry links into tg.tasks. Note that this means that // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread // group. See threadGroupNode.tasks for synchronization info. taskEntry // parent is the task's parent. parent may be nil. // // parent is protected by the TaskSet mutex. parent *Task // children is this task's children. // // children is protected by the TaskSet mutex. children map[*Task]struct{} // If childPIDNamespace is not nil, all new tasks created by this task will // be members of childPIDNamespace rather than this one. (As a corollary, // this task becomes unable to create sibling tasks in the same thread // group.) // // childPIDNamespace is exclusive to the task goroutine. childPIDNamespace *PIDNamespace } // ThreadGroup returns the thread group containing t. func (t *Task) ThreadGroup() *ThreadGroup { return t.tg } // PIDNamespace returns the PID namespace containing t. func (t *Task) PIDNamespace() *PIDNamespace { return t.tg.pidns } // TaskSet returns the TaskSet containing t. func (t *Task) TaskSet() *TaskSet { return t.tg.pidns.owner } // Timekeeper returns the system Timekeeper. func (t *Task) Timekeeper() *Timekeeper { return t.k.timekeeper } // Parent returns t's parent. 
func (t *Task) Parent() *Task { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() return t.parent } // ParentLocked returns t's parent. Caller must ensure t's TaskSet mu // is locked for at least reading. // // +checklocks:t.tg.pidns.owner.mu func (t *Task) ParentLocked() *Task { return t.parent } // ThreadID returns t's thread ID in its own PID namespace. If the task is // dead, ThreadID returns 0. func (t *Task) ThreadID() ThreadID { return t.tg.pidns.IDOfTask(t) } // TGIDInRoot returns t's TGID in the root PID namespace. func (t *Task) TGIDInRoot() ThreadID { return t.tg.pidns.owner.Root.IDOfThreadGroup(t.tg) } // Children returns children of this task. func (t *Task) Children() map[*Task]struct{} { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() children := make(map[*Task]struct{}, len(t.children)) for child, val := range t.children { children[child] = val } return children } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/threads_impl.go000066400000000000000000000026621465435605700252400ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package kernel // pidNamespaceData may contain extra per-PID-namespace data. // +stateify savable type pidNamespaceData struct { } // newPIDNamespaceData returns a new `pidNamespaceData` struct. func newPIDNamespaceData() pidNamespaceData { return pidNamespaceData{} } // addTask adds a Task into this PIDNamespace. // It is always performed under TaskSet lock. func (ns *PIDNamespace) addTask(t *Task, tid ThreadID) error { ns.tasks[tid] = t ns.tids[t] = tid if t.tg.leader == nil { // New thread group. ns.tgids[t.tg] = tid } return nil } // deleteTask deletes a Task from this PIDNamespace. // It is always performed under TaskSet lock. func (ns *PIDNamespace) deleteTask(t *Task) { delete(ns.tasks, ns.tids[t]) delete(ns.tids, t) if t == t.tg.leader || t.tg.leader == nil { delete(ns.tgids, t.tg) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/time/000077500000000000000000000000001465435605700231665ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/time/context.go000066400000000000000000000025121465435605700252010ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package time import ( "gvisor.dev/gvisor/pkg/context" ) // contextID is the time package's type for context.Context.Value keys. 
type contextID int const ( // CtxRealtimeClock is a Context.Value key for the current real time. CtxRealtimeClock contextID = iota ) // RealtimeClockFromContext returns the real time clock associated with context // ctx. func RealtimeClockFromContext(ctx context.Context) Clock { if v := ctx.Value(CtxRealtimeClock); v != nil { return v.(Clock) } return nil } // NowFromContext returns the current real time associated with context ctx. func NowFromContext(ctx context.Context) Time { if clk := RealtimeClockFromContext(ctx); clk != nil { return clk.Now() } panic("encountered context without RealtimeClock") } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/time/seqatomic_clock_unsafe.go000066400000000000000000000031251465435605700302170ustar00rootroot00000000000000package time import ( "unsafe" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sync" ) // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race // with any writer critical sections in seq. // //go:nosplit func SeqAtomicLoadClock(seq *sync.SeqCount, ptr *Clock) Clock { for { if val, ok := SeqAtomicTryLoadClock(seq, seq.BeginRead(), ptr); ok { return val } } } // SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section // in seq initiated by a call to seq.BeginRead() that returned epoch. If the // read would race with a writer critical section, SeqAtomicTryLoad returns // (unspecified, false). // //go:nosplit func SeqAtomicTryLoadClock(seq *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Clock) (val Clock, ok bool) { if sync.RaceEnabled { gohacks.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) } else { val = *ptr } ok = seq.ReadOk(epoch) return } // SeqAtomicStore sets *ptr to a copy of val, ensuring that any racing reader // critical sections are forced to retry. // //go:nosplit func SeqAtomicStoreClock(seq *sync.SeqCount, ptr *Clock, val Clock) { seq.BeginWrite() SeqAtomicStoreSeqedClock(ptr, val) seq.EndWrite() } // SeqAtomicStoreSeqed sets *ptr to a copy of val. // // Preconditions: ptr is protected by a SeqCount that will be in a writer // critical section throughout the call to SeqAtomicStore. // //go:nosplit func SeqAtomicStoreSeqedClock(ptr *Clock, val Clock) { if sync.RaceEnabled { gohacks.Memmove(unsafe.Pointer(ptr), unsafe.Pointer(&val), unsafe.Sizeof(val)) } else { *ptr = val } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/time/time.go000066400000000000000000000523131465435605700244570ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package time defines the Timer type, which provides a periodic timer that // works by sampling a user-provided clock. package time import ( "fmt" "math" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // Events that may be generated by a Clock. const ( // ClockEventSet occurs when a Clock undergoes a discontinuous change. 
ClockEventSet waiter.EventMask = 1 << iota // ClockEventRateIncrease occurs when the rate at which a Clock advances // increases significantly, such that values returned by previous calls to // Clock.WallTimeUntil may be too large. ClockEventRateIncrease ) // Time represents an instant in time with nanosecond precision. // // Time may represent time with respect to any clock and may not have any // meaning in the real world. // // +stateify savable type Time struct { ns int64 } var ( // MinTime is the zero time instant, the lowest possible time that can // be represented by Time. MinTime = Time{ns: math.MinInt64} // MaxTime is the highest possible time that can be represented by // Time. MaxTime = Time{ns: math.MaxInt64} // ZeroTime represents the zero time in an unspecified Clock's domain. ZeroTime = Time{ns: 0} ) const ( // MinDuration is the minimum duration representable by time.Duration. MinDuration = time.Duration(math.MinInt64) // MaxDuration is the maximum duration representable by time.Duration. MaxDuration = time.Duration(math.MaxInt64) ) // FromNanoseconds returns a Time representing the point ns nanoseconds after // an unspecified Clock's zero time. func FromNanoseconds(ns int64) Time { return Time{ns} } // FromSeconds returns a Time representing the point s seconds after an // unspecified Clock's zero time. func FromSeconds(s int64) Time { if s > math.MaxInt64/time.Second.Nanoseconds() { return MaxTime } return Time{s * 1e9} } // FromUnix converts from Unix seconds and nanoseconds to Time, assuming a real // time Unix clock domain. func FromUnix(s int64, ns int64) Time { if s > math.MaxInt64/time.Second.Nanoseconds() { return MaxTime } t := s * 1e9 if t > math.MaxInt64-ns { return MaxTime } return Time{t + ns} } // FromTimespec converts from Linux Timespec to Time. func FromTimespec(ts linux.Timespec) Time { return Time{ts.ToNsecCapped()} } // FromTimeval converts a Linux Timeval to Time. func FromTimeval(tv linux.Timeval) Time { return Time{tv.ToNsecCapped()} } // Nanoseconds returns nanoseconds elapsed since the zero time in t's Clock // domain. If t represents walltime, this is nanoseconds since the Unix epoch. func (t Time) Nanoseconds() int64 { return t.ns } // Microseconds returns microseconds elapsed since the zero time in t's Clock // domain. If t represents walltime, this is microseconds since the Unix epoch. func (t Time) Microseconds() int64 { return t.ns / 1000 } // Seconds returns seconds elapsed since the zero time in t's Clock domain. If // t represents walltime, this is seconds since Unix epoch. func (t Time) Seconds() int64 { return t.Nanoseconds() / time.Second.Nanoseconds() } // Timespec converts Time to a Linux timespec. func (t Time) Timespec() linux.Timespec { return linux.NsecToTimespec(t.Nanoseconds()) } // Unix returns the (seconds, nanoseconds) representation of t such that // seconds*1e9 + nanoseconds = t. func (t Time) Unix() (s int64, ns int64) { s = t.ns / 1e9 ns = t.ns % 1e9 return } // TimeT converts Time to a Linux time_t. func (t Time) TimeT() linux.TimeT { return linux.NsecToTimeT(t.Nanoseconds()) } // Timeval converts Time to a Linux timeval. func (t Time) Timeval() linux.Timeval { return linux.NsecToTimeval(t.Nanoseconds()) } // StatxTimestamp converts Time to a Linux statx_timestamp. func (t Time) StatxTimestamp() linux.StatxTimestamp { return linux.NsecToStatxTimestamp(t.Nanoseconds()) } // Add adds the duration of d to t. 
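
// An illustrative sketch added by the editor (not part of the original
// source): because Add saturates at MaxTime rather than overflowing, a caller
// can compute a deadline and treat MaxTime as "no deadline". The helper name
// is hypothetical.
func exampleDeadline(c Clock, timeout time.Duration) (deadline Time, bounded bool) {
	deadline = c.Now().Add(timeout)
	return deadline, !deadline.Equal(MaxTime)
}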
func (t Time) Add(d time.Duration) Time { if t.ns > 0 && d.Nanoseconds() > math.MaxInt64-int64(t.ns) { return MaxTime } if t.ns < 0 && d.Nanoseconds() < math.MinInt64-int64(t.ns) { return MinTime } return Time{int64(t.ns) + d.Nanoseconds()} } // AddTime adds the duration of u to t. func (t Time) AddTime(u Time) Time { return t.Add(time.Duration(u.ns)) } // Equal reports whether the two times represent the same instant in time. func (t Time) Equal(u Time) bool { return t.ns == u.ns } // Before reports whether the instant t is before the instant u. func (t Time) Before(u Time) bool { return t.ns < u.ns } // After reports whether the instant t is after the instant u. func (t Time) After(u Time) bool { return t.ns > u.ns } // Sub returns the duration of t - u. // // N.B. This measure may not make sense for every Time returned by ktime.Clock. // Callers who need wall time duration can use ktime.Clock.WallTimeUntil to // estimate that wall time. func (t Time) Sub(u Time) time.Duration { dur := time.Duration(int64(t.ns)-int64(u.ns)) * time.Nanosecond switch { case u.Add(dur).Equal(t): return dur case t.Before(u): return MinDuration default: return MaxDuration } } // IsMin returns whether t represents the lowest possible time instant. func (t Time) IsMin() bool { return t == MinTime } // IsZero returns whether t represents the zero time instant in t's Clock domain. func (t Time) IsZero() bool { return t == ZeroTime } // String returns the time represented in nanoseconds as a string. func (t Time) String() string { return fmt.Sprintf("%dns", t.Nanoseconds()) } // A Clock is an abstract time source. type Clock interface { // Now returns the current time in nanoseconds according to the Clock. Now() Time // WallTimeUntil returns the estimated wall time until Now will return a // value greater than or equal to t, given that a recent call to Now // returned now. If t has already passed, WallTimeUntil may return 0 or a // negative value. // // WallTimeUntil must be abstract to support Clocks that do not represent // wall time (e.g. thread group execution timers). Clocks that represent // wall times may embed the WallRateClock type to obtain an appropriate // trivial implementation of WallTimeUntil. // // WallTimeUntil is used to determine when associated Timers should next // check for expirations. Returning too small a value may result in // spurious Timer goroutine wakeups, while returning too large a value may // result in late expirations. Implementations should usually err on the // side of underestimating. WallTimeUntil(t, now Time) time.Duration // Waitable methods may be used to subscribe to Clock events. Waiters will // not be preserved by Save and must be re-established during restore. // // Since Clock events are transient, implementations of // waiter.Waitable.Readiness should return 0. waiter.Waitable } // WallRateClock implements Clock.WallTimeUntil for Clocks that elapse at the // same rate as wall time. type WallRateClock struct{} // WallTimeUntil implements Clock.WallTimeUntil. func (*WallRateClock) WallTimeUntil(t, now Time) time.Duration { return t.Sub(now) } // NoClockEvents implements waiter.Waitable for Clocks that do not generate // events. type NoClockEvents struct{} // Readiness implements waiter.Waitable.Readiness. func (*NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask { return 0 } // EventRegister implements waiter.Waitable.EventRegister. 
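
// An illustrative sketch added by the editor (not part of the original
// source): a trivial Clock that always reports the same instant. Embedding
// WallRateClock and NoClockEvents supplies WallTimeUntil and the
// waiter.Waitable methods, so only Now needs to be written. The type name is
// hypothetical.
type exampleFixedClock struct {
	WallRateClock
	NoClockEvents
	now Time
}

// Now implements Clock.Now for the example type above.
func (c *exampleFixedClock) Now() Time { return c.now }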
func (*NoClockEvents) EventRegister(e *waiter.Entry) error { return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (*NoClockEvents) EventUnregister(e *waiter.Entry) { } // ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and // defining waiter.Waitable.Readiness as required by Clock. type ClockEventsQueue struct { waiter.Queue } // EventRegister implements waiter.Waitable. func (c *ClockEventsQueue) EventRegister(e *waiter.Entry) error { c.Queue.EventRegister(e) return nil } // Readiness implements waiter.Waitable.Readiness. func (*ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { return 0 } // Listener receives expirations from a Timer. type Listener interface { // NotifyTimer is called when its associated Timer expires. exp is the number // of expirations. setting is the next timer Setting. // // Notify is called with the associated Timer's mutex locked, so Notify // must not take any locks that precede Timer.mu in lock order. // // If Notify returns true, the timer will use the returned setting // rather than the passed one. // // Preconditions: exp > 0. NotifyTimer(exp uint64, setting Setting) (newSetting Setting, update bool) } // Setting contains user-controlled mutable Timer properties. // // +stateify savable type Setting struct { // Enabled is true if the timer is running. Enabled bool // Next is the time in nanoseconds of the next expiration. Next Time // Period is the time in nanoseconds between expirations. If Period is // zero, the timer will not automatically restart after expiring. // // Invariant: Period >= 0. Period time.Duration } // SettingFromSpec converts a (value, interval) pair to a Setting based on a // reading from c. value is interpreted as a time relative to c.Now(). func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) { return SettingFromSpecAt(value, interval, c.Now()) } // SettingFromSpecAt converts a (value, interval) pair to a Setting. value is // interpreted as a time relative to now. func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (Setting, error) { if value < 0 { return Setting{}, linuxerr.EINVAL } if value == 0 { return Setting{Period: interval}, nil } return Setting{ Enabled: true, Next: now.Add(value), Period: interval, }, nil } // SettingFromAbsSpec converts a (value, interval) pair to a Setting. value is // interpreted as an absolute time. func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) { if value.Before(ZeroTime) { return Setting{}, linuxerr.EINVAL } if value.IsZero() { return Setting{Period: interval}, nil } return Setting{ Enabled: true, Next: value, Period: interval, }, nil } // SettingFromItimerspec converts a linux.Itimerspec to a Setting. If abs is // true, its.Value is interpreted as an absolute time. Otherwise, it is // interpreted as a time relative to c.Now(). func SettingFromItimerspec(its linux.Itimerspec, abs bool, c Clock) (Setting, error) { if abs { return SettingFromAbsSpec(FromTimespec(its.Value), its.Interval.ToDuration()) } return SettingFromSpec(its.Value.ToDuration(), its.Interval.ToDuration(), c) } // SpecFromSetting converts a timestamp and a Setting to a (relative value, // interval) pair, as used by most Linux syscalls that return a struct // itimerval or struct itimerspec. 
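
// An illustrative sketch added by the editor (not part of the original
// source): it arms a one-shot Setting 100ms in the future relative to clock c
// and then recovers the Linux-style (value, interval) pair from it. The
// helper name is hypothetical.
func exampleOneShot(c Clock) (value, interval time.Duration, err error) {
	s, err := SettingFromSpec(100*time.Millisecond, 0 /* no period: one-shot */, c)
	if err != nil {
		return 0, 0, err
	}
	value, interval = SpecFromSetting(c.Now(), s)
	return value, interval, nil
}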
func SpecFromSetting(now Time, s Setting) (value, period time.Duration) { if !s.Enabled { return 0, s.Period } return s.Next.Sub(now), s.Period } // ItimerspecFromSetting converts a Setting to a linux.Itimerspec. func ItimerspecFromSetting(now Time, s Setting) linux.Itimerspec { val, iv := SpecFromSetting(now, s) return linux.Itimerspec{ Interval: linux.DurationToTimespec(iv), Value: linux.DurationToTimespec(val), } } // At returns an updated Setting and a number of expirations after the // associated Clock indicates a time of now. // // Settings may be created by successive calls to At with decreasing // values of now (i.e. time may appear to go backward). Supporting this is // required to support non-monotonic clocks, as well as allowing // Timer.clock.Now() to be called without holding Timer.mu. func (s Setting) At(now Time) (Setting, uint64) { if !s.Enabled { return s, 0 } if s.Next.After(now) { return s, 0 } if s.Period == 0 { s.Enabled = false return s, 1 } exp := 1 + uint64(now.Sub(s.Next).Nanoseconds())/uint64(s.Period) s.Next = s.Next.Add(time.Duration(uint64(s.Period) * exp)) return s, exp } // Timer is an optionally-periodic timer driven by sampling a user-specified // Clock. Timer's semantics support the requirements of Linux's interval timers // (setitimer(2), timer_create(2), timerfd_create(2)). // // Timers should be created using NewTimer and must be cleaned up by calling // Timer.Destroy when no longer used. // // +stateify savable type Timer struct { // clock is the time source. clock is protected by mu and clockSeq. clockSeq sync.SeqCount `state:"nosave"` clock Clock // listener is notified of expirations. listener is immutable. listener Listener // mu protects the following mutable fields. mu sync.Mutex `state:"nosave"` // setting is the timer setting. setting is protected by mu. setting Setting // paused is true if the Timer is paused. paused is protected by mu. paused bool // kicker is used to wake the Timer goroutine. The kicker pointer is // immutable, but its state is protected by mu. kicker *time.Timer `state:"nosave"` // entry is registered with clock.EventRegister. entry is immutable. // // Per comment in Clock, entry must be re-registered after restore; per // comment in Timer.Load, this is done in Timer.Resume. entry waiter.Entry `state:"nosave"` // events is the channel that will be notified whenever entry receives an // event. It is also closed by Timer.Destroy to instruct the Timer // goroutine to exit. events chan struct{} `state:"nosave"` } // timerTickEvents are Clock events that require the Timer goroutine to Tick // prematurely. const timerTickEvents = ClockEventSet | ClockEventRateIncrease // NewTimer returns a new Timer that will obtain time from clock and send // expirations to listener. The Timer is initially stopped and has no first // expiration or period configured. func NewTimer(clock Clock, listener Listener) *Timer { t := &Timer{ clock: clock, listener: listener, } t.init() return t } // init initializes Timer state that is not preserved across save/restore. If // init has already been called, calling it again is a no-op. // // Preconditions: t.mu must be locked, or the caller must have exclusive access // to t. func (t *Timer) init() { if t.kicker != nil { return } // If t.kicker is nil, the Timer goroutine can't be running, so we can't // race with it. 
t.kicker = time.NewTimer(0) t.entry, t.events = waiter.NewChannelEntry(timerTickEvents) if err := t.clock.EventRegister(&t.entry); err != nil { panic(err) } go t.runGoroutine() // S/R-SAFE: synchronized by t.mu } // Destroy releases resources owned by the Timer. A Destroyed Timer must not be // used again; in particular, a Destroyed Timer should not be Saved. func (t *Timer) Destroy() { // Stop the Timer, ensuring that the Timer goroutine will not call // t.kicker.Reset, before calling t.kicker.Stop. t.mu.Lock() t.setting.Enabled = false t.mu.Unlock() t.kicker.Stop() // Unregister t.entry, ensuring that the Clock will not send to t.events, // before closing t.events to instruct the Timer goroutine to exit. t.clock.EventUnregister(&t.entry) close(t.events) } func (t *Timer) runGoroutine() { for { select { case <-t.kicker.C: case _, ok := <-t.events: if !ok { // Channel closed by Destroy. return } } t.Tick() } } // Tick requests that the Timer immediately check for expirations and // re-evaluate when it should next check for expirations. func (t *Timer) Tick() { // Optimistically read t.Clock().Now() before locking t.mu, as t.clock is // unlikely to change. unlockedClock := t.Clock() now := unlockedClock.Now() t.mu.Lock() defer t.mu.Unlock() if t.paused { return } if t.clock != unlockedClock { now = t.clock.Now() } s, exp := t.setting.At(now) t.setting = s if exp > 0 { if newS, ok := t.listener.NotifyTimer(exp, t.setting); ok { t.setting = newS } } t.resetKickerLocked(now) } // Pause pauses the Timer, ensuring that it does not generate any further // expirations until Resume is called. If the Timer is already paused, Pause // has no effect. func (t *Timer) Pause() { t.mu.Lock() defer t.mu.Unlock() t.paused = true // t.kicker may be nil if we were restored but never resumed. if t.kicker != nil { t.kicker.Stop() } } // Resume ends the effect of Pause. If the Timer is not paused, Resume has no // effect. func (t *Timer) Resume() { t.mu.Lock() defer t.mu.Unlock() if !t.paused { return } t.paused = false // Lazily initialize the Timer. We can't call Timer.init until Timer.Resume // because save/restore will restore Timers before // kernel.Timekeeper.SetClocks() has been called, so if t.clock is backed // by a kernel.Timekeeper then the Timer goroutine will panic if it calls // t.clock.Now(). t.init() // Kick the Timer goroutine in case it was already initialized, but the // Timer goroutine was sleeping. t.kicker.Reset(0) } // Get returns a snapshot of the Timer's current Setting and the time // (according to the Timer's Clock) at which the snapshot was taken. // // Preconditions: The Timer must not be paused (since its Setting cannot // be advanced to the current time while it is paused.) func (t *Timer) Get() (Time, Setting) { // Optimistically read t.Clock().Now() before locking t.mu, as t.clock is // unlikely to change. unlockedClock := t.Clock() now := unlockedClock.Now() t.mu.Lock() defer t.mu.Unlock() if t.paused { panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t)) } if t.clock != unlockedClock { now = t.clock.Now() } s, exp := t.setting.At(now) t.setting = s if exp > 0 { if newS, ok := t.listener.NotifyTimer(exp, t.setting); ok { t.setting = newS } } t.resetKickerLocked(now) return now, s } // Swap atomically changes the Timer's Setting and returns the Timer's previous // Setting and the time (according to the Timer's Clock) at which the snapshot // was taken. Setting s.Enabled to true starts the Timer, while setting // s.Enabled to false stops it. 
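//
// A minimal usage sketch (added by the editor, not part of the original
// source; c is some Clock and l some Listener):
//
//	tmr := NewTimer(c, l)
//	defer tmr.Destroy()
//	// Arm a periodic 10ms timer.
//	tmr.Swap(Setting{
//		Enabled: true,
//		Next:    c.Now().Add(10 * time.Millisecond),
//		Period:  10 * time.Millisecond,
//	})
//	// ... later, disarm it.
//	tmr.Swap(Setting{})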
// // Preconditions: The Timer must not be paused. func (t *Timer) Swap(s Setting) (Time, Setting) { return t.SwapAnd(s, nil) } // SwapAnd atomically changes the Timer's Setting, calls f if it is not nil, // and returns the Timer's previous Setting and the time (according to the // Timer's Clock) at which the Setting was changed. Setting s.Enabled to true // starts the timer, while setting s.Enabled to false stops it. // // Preconditions: // - The Timer must not be paused. // - f cannot call any Timer methods since it is called with the Timer mutex // locked. func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { // Optimistically read t.Clock().Now() before locking t.mu, as t.clock is // unlikely to change. unlockedClock := t.Clock() now := unlockedClock.Now() t.mu.Lock() defer t.mu.Unlock() if t.paused { panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t)) } if t.clock != unlockedClock { now = t.clock.Now() } oldS, oldExp := t.setting.At(now) if oldExp > 0 { t.listener.NotifyTimer(oldExp, oldS) // N.B. The returned Setting doesn't matter because we're about // to overwrite. } if f != nil { f() } newS, newExp := s.At(now) t.setting = newS if newExp > 0 { if newS, ok := t.listener.NotifyTimer(newExp, t.setting); ok { t.setting = newS } } t.resetKickerLocked(now) return now, oldS } // SetClock atomically changes a Timer's Clock and Setting. func (t *Timer) SetClock(c Clock, s Setting) { var now Time if s.Enabled { now = c.Now() } t.mu.Lock() defer t.mu.Unlock() t.setting = s if oldC := t.clock; oldC != c { oldC.EventUnregister(&t.entry) c.EventRegister(&t.entry) t.clockSeq.BeginWrite() t.clock = c t.clockSeq.EndWrite() } t.resetKickerLocked(now) } // Preconditions: t.mu must be locked. func (t *Timer) resetKickerLocked(now Time) { if t.setting.Enabled { // Clock.WallTimeUntil may return a negative value. This is fine; // time.when treats negative Durations as 0. t.kicker.Reset(t.clock.WallTimeUntil(t.setting.Next, now)) } // We don't call t.kicker.Stop if !t.setting.Enabled because in most cases // resetKickerLocked will be called from the Timer goroutine itself, in // which case t.kicker has already fired and t.kicker.Stop will be an // expensive no-op (time.Timer.Stop => time.stopTimer => runtime.stopTimer // => runtime.deltimer). } // Clock returns the Clock used by t. func (t *Timer) Clock() Clock { return SeqAtomicLoadClock(&t.clockSeq, &t.clock) } // ChannelNotifier is a Listener that sends on a channel. // // ChannelNotifier cannot be saved or loaded. type ChannelNotifier chan struct{} // NewChannelNotifier creates a new channel notifier. // // If the notifier is used with a timer, Timer.Destroy will close the channel // returned here. func NewChannelNotifier() (Listener, <-chan struct{}) { tchan := make(chan struct{}, 1) return ChannelNotifier(tchan), tchan } // NotifyTimer implements Listener.NotifyTimer. func (c ChannelNotifier) NotifyTimer(uint64, Setting) (Setting, bool) { select { case c <- struct{}{}: default: } return Setting{}, false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/time/time_state_autogen.go000066400000000000000000000042211465435605700273740ustar00rootroot00000000000000// automatically generated by stateify. 
package time import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (t *Time) StateTypeName() string { return "pkg/sentry/kernel/time.Time" } func (t *Time) StateFields() []string { return []string{ "ns", } } func (t *Time) beforeSave() {} // +checklocksignore func (t *Time) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.ns) } func (t *Time) afterLoad(context.Context) {} // +checklocksignore func (t *Time) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.ns) } func (s *Setting) StateTypeName() string { return "pkg/sentry/kernel/time.Setting" } func (s *Setting) StateFields() []string { return []string{ "Enabled", "Next", "Period", } } func (s *Setting) beforeSave() {} // +checklocksignore func (s *Setting) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.Enabled) stateSinkObject.Save(1, &s.Next) stateSinkObject.Save(2, &s.Period) } func (s *Setting) afterLoad(context.Context) {} // +checklocksignore func (s *Setting) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.Enabled) stateSourceObject.Load(1, &s.Next) stateSourceObject.Load(2, &s.Period) } func (t *Timer) StateTypeName() string { return "pkg/sentry/kernel/time.Timer" } func (t *Timer) StateFields() []string { return []string{ "clock", "listener", "setting", "paused", } } func (t *Timer) beforeSave() {} // +checklocksignore func (t *Timer) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.clock) stateSinkObject.Save(1, &t.listener) stateSinkObject.Save(2, &t.setting) stateSinkObject.Save(3, &t.paused) } func (t *Timer) afterLoad(context.Context) {} // +checklocksignore func (t *Timer) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.clock) stateSourceObject.Load(1, &t.listener) stateSourceObject.Load(2, &t.setting) stateSourceObject.Load(3, &t.paused) } func init() { state.Register((*Time)(nil)) state.Register((*Setting)(nil)) state.Register((*Timer)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/time/time_unsafe_state_autogen.go000066400000000000000000000000661465435605700307400ustar00rootroot00000000000000// automatically generated by stateify. package time golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/time/util.go000066400000000000000000000073651465435605700245050ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package time import ( "sync" "time" ) // AfterFunc waits for duration to elapse according to clock then runs fn. // The timer is started immediately and will fire exactly once. func AfterFunc(clock Clock, duration time.Duration, fn func()) *VariableTimer { timer := &VariableTimer{ clock: clock, } timer.notifier = functionNotifier{ fn: func() { // tcpip.Timer.Stop() explicitly states that the function is called in a // separate goroutine that Stop() does not synchronize with. 
// Timer.Destroy() synchronizes with calls to Listener.NotifyTimer(). // This is semantically meaningful because, in the former case, it's // legal to call tcpip.Timer.Stop() while holding locks that may also be // taken by the function, but this isn't so in the latter case. Most // immediately, Timer calls Listener.NotifyTimer() while holding // Timer.mu. A deadlock occurs without spawning a goroutine: // T1: (Timer expires) // => Timer.Tick() <- Timer.mu.Lock() called // => Listener.NotifyTimer() // => Timer.Stop() // => Timer.Destroy() <- Timer.mu.Lock() called, deadlock! // // Spawning a goroutine avoids the deadlock: // T1: (Timer expires) // => Timer.Tick() <- Timer.mu.Lock() called // => Listener.NotifyTimer() <- Launches T2 // T2: // => Timer.Stop() // => Timer.Destroy() <- Timer.mu.Lock() called, blocks // T1: // => (returns) <- Timer.mu.Unlock() called // T2: // => (continues) <- No deadlock! go func() { timer.Stop() fn() }() }, } timer.Reset(duration) return timer } // VariableTimer is a resettable timer with variable duration expirations. // Implements tcpip.Timer, which does not define a Destroy method; instead, all // resources are released after timer expiration and calls to Timer.Stop. // // Must be created by AfterFunc. type VariableTimer struct { // clock is the time source. clock is immutable. clock Clock // notifier is called when the Timer expires. notifier is immutable. notifier functionNotifier // mu protects t. mu sync.Mutex // t stores the latest running Timer. This is replaced whenever Reset is // called since Timer cannot be restarted once it has been Destroyed by Stop. // // This field is nil iff Stop has been called. t *Timer } // Stop implements tcpip.Timer.Stop. func (r *VariableTimer) Stop() bool { r.mu.Lock() defer r.mu.Unlock() if r.t == nil { return false } _, lastSetting := r.t.Swap(Setting{}) r.t.Destroy() r.t = nil return lastSetting.Enabled } // Reset implements tcpip.Timer.Reset. func (r *VariableTimer) Reset(d time.Duration) { r.mu.Lock() defer r.mu.Unlock() if r.t == nil { r.t = NewTimer(r.clock, &r.notifier) } r.t.Swap(Setting{ Enabled: true, Period: 0, Next: r.clock.Now().Add(d), }) } // functionNotifier is a TimerListener that runs a function. // // functionNotifier cannot be saved or loaded. type functionNotifier struct { fn func() } // NotifyTimer implements ktime.TimerListener.NotifyTimer. func (f *functionNotifier) NotifyTimer(uint64, Setting) (Setting, bool) { f.fn() return Setting{}, false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/timekeeper.go000066400000000000000000000235731465435605700247230ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "time" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/log" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) // Timekeeper manages all of the kernel clocks. 
// // +stateify savable type Timekeeper struct { // clocks are the clock sources. // // These are not saved directly, as the new machine's clock may behave // differently. // // It is set only once, by SetClocks. clocks sentrytime.Clocks `state:"nosave"` // realtimeClock is a ktime.Clock based on timekeeper's Realtime. realtimeClock *timekeeperClock // monotonicClock is a ktime.Clock based on timekeeper's Monotonic. monotonicClock *timekeeperClock // bootTime is the realtime when the system "booted". i.e., when // SetClocks was called in the initial (not restored) run. bootTime ktime.Time // monotonicOffset is the offset to apply to the monotonic clock output // from clocks. // // It is set only once, by SetClocks. monotonicOffset int64 `state:"nosave"` // monotonicLowerBound is the lowerBound for monotonic time. monotonicLowerBound atomicbitops.Int64 `state:"nosave"` // restored, if non-nil, indicates that this Timekeeper was restored // from a state file. The clocks are not set until restored is closed. restored chan struct{} `state:"nosave"` // saveMonotonic is the (offset) value of the monotonic clock at the // time of save. // // It is only valid if restored is non-nil. // // It is only used in SetClocks after restore to compute the new // monotonicOffset. saveMonotonic int64 // saveRealtime is the value of the realtime clock at the time of save. // // It is only valid if restored is non-nil. // // It is only used in SetClocks after restore to compute the new // monotonicOffset. saveRealtime int64 // mu protects destruction with stop and wg. mu sync.Mutex `state:"nosave"` // stop is used to tell the update goroutine to exit. stop chan struct{} `state:"nosave"` // wg is used to indicate that the update goroutine has exited. wg sync.WaitGroup `state:"nosave"` } // NewTimekeeper returns a Timekeeper that is automatically kept up-to-date. // NewTimekeeper does not take ownership of paramPage. // // SetClocks must be called on the returned Timekeeper before it is usable. func NewTimekeeper() *Timekeeper { t := Timekeeper{} t.realtimeClock = &timekeeperClock{tk: &t, c: sentrytime.Realtime} t.monotonicClock = &timekeeperClock{tk: &t, c: sentrytime.Monotonic} return &t } // SetClocks the backing clock source. // // SetClocks must be called before the Timekeeper is used, and it may not be // called more than once, as changing the clock source without extra correction // could cause time discontinuities. // // It must also be called after Load. func (t *Timekeeper) SetClocks(c sentrytime.Clocks, params *VDSOParamPage) { // Update the params, marking them "not ready", as we may need to // restart calibration on this new machine. if t.restored != nil { if err := params.Write(func() vdsoParams { return vdsoParams{} }); err != nil { panic("unable to reset VDSO params: " + err.Error()) } } if t.clocks != nil { panic("SetClocks called on previously-initialized Timekeeper") } t.clocks = c // Compute the offset of the monotonic clock from the base Clocks. // // In a fresh (not restored) sentry, monotonic time starts at zero. // // In a restored sentry, monotonic time jumps forward by approximately // the same amount as real time. There are no guarantees here, we are // just making a best-effort attempt to make it appear that the app // was simply not scheduled for a long period, rather than that the // real time clock was changed. // // If real time went backwards, it remains the same. 
wantMonotonic := int64(0) nowMonotonic, err := t.clocks.GetTime(sentrytime.Monotonic) if err != nil { panic("Unable to get current monotonic time: " + err.Error()) } nowRealtime, err := t.clocks.GetTime(sentrytime.Realtime) if err != nil { panic("Unable to get current realtime: " + err.Error()) } if t.restored != nil { wantMonotonic = t.saveMonotonic elapsed := nowRealtime - t.saveRealtime if elapsed > 0 { wantMonotonic += elapsed } } t.monotonicOffset = wantMonotonic - nowMonotonic if t.restored == nil { // Hold on to the initial "boot" time. t.bootTime = ktime.FromNanoseconds(nowRealtime) } t.mu.Lock() defer t.mu.Unlock() t.startUpdater(params) if t.restored != nil { close(t.restored) } } var _ tcpip.Clock = (*Timekeeper)(nil) // Now implements tcpip.Clock. func (t *Timekeeper) Now() time.Time { nsec, err := t.GetTime(sentrytime.Realtime) if err != nil { panic("timekeeper.GetTime(sentrytime.Realtime): " + err.Error()) } return time.Unix(0, nsec) } // NowMonotonic implements tcpip.Clock. func (t *Timekeeper) NowMonotonic() tcpip.MonotonicTime { nsec, err := t.GetTime(sentrytime.Monotonic) if err != nil { panic("timekeeper.GetTime(sentrytime.Monotonic): " + err.Error()) } var mt tcpip.MonotonicTime return mt.Add(time.Duration(nsec) * time.Nanosecond) } // AfterFunc implements tcpip.Clock. func (t *Timekeeper) AfterFunc(d time.Duration, f func()) tcpip.Timer { return ktime.AfterFunc(t.realtimeClock, d, f) } // startUpdater starts an update goroutine that keeps the clocks updated. // // mu must be held. func (t *Timekeeper) startUpdater(params *VDSOParamPage) { if t.stop != nil { // Timekeeper already started return } t.stop = make(chan struct{}) // Keep the clocks up to date. // // Note that the Go runtime uses host CLOCK_MONOTONIC to service the // timer, so it may run at a *slightly* different rate from the // application CLOCK_MONOTONIC. That is fine, as we only need to update // at approximately this rate. timer := time.NewTicker(sentrytime.ApproxUpdateInterval) t.wg.Add(1) go func() { // S/R-SAFE: stopped during save. defer t.wg.Done() for { // Start with an update immediately, so the clocks are // ready ASAP. // Call Update within a Write block to prevent the VDSO // from using the old params between Update and // Write. if err := params.Write(func() vdsoParams { monotonicParams, monotonicOk, realtimeParams, realtimeOk := t.clocks.Update() var p vdsoParams if monotonicOk { p.monotonicReady = 1 p.monotonicBaseCycles = int64(monotonicParams.BaseCycles) p.monotonicBaseRef = int64(monotonicParams.BaseRef) + t.monotonicOffset p.monotonicFrequency = monotonicParams.Frequency } if realtimeOk { p.realtimeReady = 1 p.realtimeBaseCycles = int64(realtimeParams.BaseCycles) p.realtimeBaseRef = int64(realtimeParams.BaseRef) p.realtimeFrequency = realtimeParams.Frequency } return p }); err != nil { log.Warningf("Unable to update VDSO parameter page: %v", err) } select { case <-timer.C: case <-t.stop: return } } }() } // stopUpdater stops the update goroutine, blocking until it exits. // // mu must be held. func (t *Timekeeper) stopUpdater() { if t.stop == nil { // Updater not running. return } close(t.stop) t.wg.Wait() t.stop = nil } // Destroy destroys the Timekeeper, freeing all associated resources. func (t *Timekeeper) Destroy() { t.mu.Lock() defer t.mu.Unlock() t.stopUpdater() } // PauseUpdates stops clock parameter updates. This should only be used when // Tasks are not running and thus cannot access the clock. 
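
// An illustrative sketch added by the editor (not part of the original
// source): a checkpoint path is expected to stop clock parameter updates
// before saving and restart them afterwards. The helper name, and the
// assumption that save() runs while all tasks are stopped, are hypothetical.
func exampleQuiescentSave(tk *Timekeeper, params *VDSOParamPage, save func() error) error {
	tk.PauseUpdates()
	defer tk.ResumeUpdates(params)
	return save()
}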
func (t *Timekeeper) PauseUpdates() { t.mu.Lock() defer t.mu.Unlock() t.stopUpdater() } // ResumeUpdates restarts clock parameter updates stopped by PauseUpdates. func (t *Timekeeper) ResumeUpdates(params *VDSOParamPage) { t.mu.Lock() defer t.mu.Unlock() t.startUpdater(params) } // GetTime returns the current time in nanoseconds. func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { if t.clocks == nil { if t.restored == nil { panic("Timekeeper used before initialized with SetClocks") } <-t.restored } now, err := t.clocks.GetTime(c) if err == nil && c == sentrytime.Monotonic { now += t.monotonicOffset for { // It's possible that the clock is shaky. This may be due to // platform issues, e.g. the KVM platform relies on the guest // TSC and host TSC, which may not be perfectly in sync. To // work around this issue, ensure that the monotonic time is // always bounded by the last time read. oldLowerBound := t.monotonicLowerBound.Load() if now < oldLowerBound { now = oldLowerBound break } if t.monotonicLowerBound.CompareAndSwap(oldLowerBound, now) { break } } } return now, err } // BootTime returns the system boot real time. func (t *Timekeeper) BootTime() ktime.Time { return t.bootTime } // timekeeperClock is a ktime.Clock that reads time from a // kernel.Timekeeper-managed clock. // // +stateify savable type timekeeperClock struct { tk *Timekeeper c sentrytime.ClockID // Implements ktime.Clock.WallTimeUntil. ktime.WallRateClock `state:"nosave"` // Implements waiter.Waitable. (We have no ability to detect // discontinuities from external changes to CLOCK_REALTIME). ktime.NoClockEvents `state:"nosave"` } // Now implements ktime.Clock.Now. func (tc *timekeeperClock) Now() ktime.Time { now, err := tc.tk.GetTime(tc.c) if err != nil { panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err)) } return ktime.FromNanoseconds(now) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/timekeeper_state.go000066400000000000000000000023671465435605700261210ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "context" "gvisor.dev/gvisor/pkg/sentry/time" ) // beforeSave is invoked by stateify. func (t *Timekeeper) beforeSave() { if t.stop != nil { panic("pauseUpdates must be called before Save") } // N.B. we want the *offset* monotonic time. var err error if t.saveMonotonic, err = t.GetTime(time.Monotonic); err != nil { panic("unable to get current monotonic time: " + err.Error()) } if t.saveRealtime, err = t.GetTime(time.Realtime); err != nil { panic("unable to get current realtime: " + err.Error()) } } // afterLoad is invoked by stateify. func (t *Timekeeper) afterLoad(context.Context) { t.restored = make(chan struct{}) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/tty.go000066400000000000000000000042021465435605700233750ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" ) // TTY defines the relationship between a thread group and its controlling // terminal. // // +stateify savable type TTY struct { // Index is the terminal index. It is immutable. Index uint32 mu sync.Mutex `state:"nosave"` // tg is protected by mu. tg *ThreadGroup } // TTY returns the thread group's controlling terminal. If nil, there is no // controlling terminal. func (tg *ThreadGroup) TTY() *TTY { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() return tg.tty } // SignalForegroundProcessGroup sends the signal to the foreground process // group of the TTY. func (tty *TTY) SignalForegroundProcessGroup(info *linux.SignalInfo) { tty.mu.Lock() defer tty.mu.Unlock() tg := tty.tg if tg == nil { // This TTY is not a controlling thread group. This can happen // if it was opened with O_NOCTTY, or if it failed the checks // on session and leaders in SetControllingTTY(). There is // nothing to signal. return } tg.pidns.owner.mu.Lock() tg.signalHandlers.mu.Lock() fg := tg.processGroup.session.foreground tg.signalHandlers.mu.Unlock() tg.pidns.owner.mu.Unlock() if fg == nil { // Nothing to signal. return } // SendSignal will take TaskSet.mu and signalHandlers.mu, so we cannot // hold them here. if err := fg.SendSignal(info); err != nil { log.Warningf("failed to signal foreground process group (pgid=%d): %v", fg.id, err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/uncaught_signal_go_proto/000077500000000000000000000000001465435605700273135ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/uncaught_signal_go_proto/uncaught_signal.pb.go000066400000000000000000000157411465435605700334250ustar00rootroot00000000000000// Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/sentry/kernel/uncaught_signal.proto package uncaught_signal_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" registers_go_proto "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. 
_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type UncaughtSignal struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Tid int32 `protobuf:"varint,1,opt,name=tid,proto3" json:"tid,omitempty"` Pid int32 `protobuf:"varint,2,opt,name=pid,proto3" json:"pid,omitempty"` Registers *registers_go_proto.Registers `protobuf:"bytes,3,opt,name=registers,proto3" json:"registers,omitempty"` SignalNumber int32 `protobuf:"varint,4,opt,name=signal_number,json=signalNumber,proto3" json:"signal_number,omitempty"` FaultAddr uint64 `protobuf:"varint,5,opt,name=fault_addr,json=faultAddr,proto3" json:"fault_addr,omitempty"` } func (x *UncaughtSignal) Reset() { *x = UncaughtSignal{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_kernel_uncaught_signal_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *UncaughtSignal) String() string { return protoimpl.X.MessageStringOf(x) } func (*UncaughtSignal) ProtoMessage() {} func (x *UncaughtSignal) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_kernel_uncaught_signal_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use UncaughtSignal.ProtoReflect.Descriptor instead. func (*UncaughtSignal) Descriptor() ([]byte, []int) { return file_pkg_sentry_kernel_uncaught_signal_proto_rawDescGZIP(), []int{0} } func (x *UncaughtSignal) GetTid() int32 { if x != nil { return x.Tid } return 0 } func (x *UncaughtSignal) GetPid() int32 { if x != nil { return x.Pid } return 0 } func (x *UncaughtSignal) GetRegisters() *registers_go_proto.Registers { if x != nil { return x.Registers } return nil } func (x *UncaughtSignal) GetSignalNumber() int32 { if x != nil { return x.SignalNumber } return 0 } func (x *UncaughtSignal) GetFaultAddr() uint64 { if x != nil { return x.FaultAddr } return 0 } var File_pkg_sentry_kernel_uncaught_signal_proto protoreflect.FileDescriptor var file_pkg_sentry_kernel_uncaught_signal_proto_rawDesc = []byte{ 0x0a, 0x27, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x2f, 0x75, 0x6e, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x5f, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x06, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x1a, 0x1f, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x61, 0x72, 0x63, 0x68, 0x2f, 0x72, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xa9, 0x01, 0x0a, 0x0e, 0x55, 0x6e, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x12, 0x10, 0x0a, 0x03, 0x74, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, 0x74, 0x69, 0x64, 0x12, 0x10, 0x0a, 0x03, 0x70, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, 0x70, 0x69, 0x64, 0x12, 0x2f, 0x0a, 0x09, 0x72, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x11, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x52, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x52, 0x09, 0x72, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x12, 0x23, 0x0a, 0x0d, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x5f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0c, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x12, 0x1d, 0x0a, 0x0a, 0x66, 0x61, 0x75, 
0x6c, 0x74, 0x5f, 0x61, 0x64, 0x64, 0x72, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x41, 0x64, 0x64, 0x72, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_sentry_kernel_uncaught_signal_proto_rawDescOnce sync.Once file_pkg_sentry_kernel_uncaught_signal_proto_rawDescData = file_pkg_sentry_kernel_uncaught_signal_proto_rawDesc ) func file_pkg_sentry_kernel_uncaught_signal_proto_rawDescGZIP() []byte { file_pkg_sentry_kernel_uncaught_signal_proto_rawDescOnce.Do(func() { file_pkg_sentry_kernel_uncaught_signal_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_sentry_kernel_uncaught_signal_proto_rawDescData) }) return file_pkg_sentry_kernel_uncaught_signal_proto_rawDescData } var file_pkg_sentry_kernel_uncaught_signal_proto_msgTypes = make([]protoimpl.MessageInfo, 1) var file_pkg_sentry_kernel_uncaught_signal_proto_goTypes = []interface{}{ (*UncaughtSignal)(nil), // 0: gvisor.UncaughtSignal (*registers_go_proto.Registers)(nil), // 1: gvisor.Registers } var file_pkg_sentry_kernel_uncaught_signal_proto_depIdxs = []int32{ 1, // 0: gvisor.UncaughtSignal.registers:type_name -> gvisor.Registers 1, // [1:1] is the sub-list for method output_type 1, // [1:1] is the sub-list for method input_type 1, // [1:1] is the sub-list for extension type_name 1, // [1:1] is the sub-list for extension extendee 0, // [0:1] is the sub-list for field type_name } func init() { file_pkg_sentry_kernel_uncaught_signal_proto_init() } func file_pkg_sentry_kernel_uncaught_signal_proto_init() { if File_pkg_sentry_kernel_uncaught_signal_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_sentry_kernel_uncaught_signal_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*UncaughtSignal); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_sentry_kernel_uncaught_signal_proto_rawDesc, NumEnums: 0, NumMessages: 1, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_sentry_kernel_uncaught_signal_proto_goTypes, DependencyIndexes: file_pkg_sentry_kernel_uncaught_signal_proto_depIdxs, MessageInfos: file_pkg_sentry_kernel_uncaught_signal_proto_msgTypes, }.Build() File_pkg_sentry_kernel_uncaught_signal_proto = out.File file_pkg_sentry_kernel_uncaught_signal_proto_rawDesc = nil file_pkg_sentry_kernel_uncaught_signal_proto_goTypes = nil file_pkg_sentry_kernel_uncaught_signal_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/user_counters_mutex.go000066400000000000000000000033221465435605700267010ustar00rootroot00000000000000package kernel import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type userCountersMutex struct { mu sync.Mutex } var userCountersprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var userCounterslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type userCounterslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. 
// +checklocksignore func (m *userCountersMutex) Lock() { locking.AddGLock(userCountersprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *userCountersMutex) NestedLock(i userCounterslockNameIndex) { locking.AddGLock(userCountersprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *userCountersMutex) Unlock() { locking.DelGLock(userCountersprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *userCountersMutex) NestedUnlock(i userCounterslockNameIndex) { locking.DelGLock(userCountersprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func userCountersinitLockNames() {} func init() { userCountersinitLockNames() userCountersprefixIndex = locking.NewMutexClass(reflect.TypeOf(userCountersMutex{}), userCounterslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/uts_namespace.go000066400000000000000000000075411465435605700254150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" ) // UTSNamespace represents a UTS namespace, a holder of two system identifiers: // the hostname and domain name. // // +stateify savable type UTSNamespace struct { // mu protects all fields below. mu sync.Mutex `state:"nosave"` hostName string domainName string // userns is the user namespace associated with the UTSNamespace. // Privileged operations on this UTSNamespace must have appropriate // capabilities in userns. // // userns is immutable. userns *auth.UserNamespace inode *nsfs.Inode } // NewUTSNamespace creates a new UTS namespace. func NewUTSNamespace(hostName, domainName string, userns *auth.UserNamespace) *UTSNamespace { return &UTSNamespace{ hostName: hostName, domainName: domainName, userns: userns, } } // UTSNamespace returns the task's UTS namespace. func (t *Task) UTSNamespace() *UTSNamespace { t.mu.Lock() defer t.mu.Unlock() return t.utsns } // GetUTSNamespace takes a reference on the task UTS namespace and // returns it. It will return nil if the task isn't alive. func (t *Task) GetUTSNamespace() *UTSNamespace { t.mu.Lock() defer t.mu.Unlock() if t.utsns != nil { t.utsns.IncRef() } return t.utsns } // HostName returns the host name of this UTS namespace. func (u *UTSNamespace) HostName() string { u.mu.Lock() defer u.mu.Unlock() return u.hostName } // SetHostName sets the host name of this UTS namespace. func (u *UTSNamespace) SetHostName(host string) { u.mu.Lock() defer u.mu.Unlock() u.hostName = host } // DomainName returns the domain name of this UTS namespace. 
func (u *UTSNamespace) DomainName() string { u.mu.Lock() defer u.mu.Unlock() return u.domainName } // SetDomainName sets the domain name of this UTS namespace. func (u *UTSNamespace) SetDomainName(domain string) { u.mu.Lock() defer u.mu.Unlock() u.domainName = domain } // UserNamespace returns the user namespace associated with this UTS namespace. func (u *UTSNamespace) UserNamespace() *auth.UserNamespace { u.mu.Lock() defer u.mu.Unlock() return u.userns } // Type implements nsfs.Namespace.Type. func (u *UTSNamespace) Type() string { return "uts" } // Destroy implements nsfs.Namespace.Destroy. func (u *UTSNamespace) Destroy(ctx context.Context) {} // SetInode sets the nsfs `inode` to the UTS namespace. func (u *UTSNamespace) SetInode(inode *nsfs.Inode) { u.mu.Lock() defer u.mu.Unlock() u.inode = inode } // GetInode returns the nsfs inode associated with the UTS namespace. func (u *UTSNamespace) GetInode() *nsfs.Inode { u.mu.Lock() defer u.mu.Unlock() return u.inode } // IncRef increments the Namespace's refcount. func (u *UTSNamespace) IncRef() { u.mu.Lock() defer u.mu.Unlock() u.inode.IncRef() } // DecRef decrements the namespace's refcount. func (u *UTSNamespace) DecRef(ctx context.Context) { u.mu.Lock() defer u.mu.Unlock() u.inode.DecRef(ctx) } // Clone makes a copy of this UTS namespace, associating the given user // namespace. func (u *UTSNamespace) Clone(userns *auth.UserNamespace) *UTSNamespace { u.mu.Lock() defer u.mu.Unlock() return &UTSNamespace{ hostName: u.hostName, domainName: u.domainName, userns: userns, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/vdso.go000066400000000000000000000111541465435605700235340ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "context" "fmt" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" ) // vdsoParams are the parameters exposed to the VDSO. // // They are exposed to the VDSO via a parameter page managed by VDSOParamPage, // which also includes a sequence counter. // // +marshal type vdsoParams struct { monotonicReady uint64 monotonicBaseCycles int64 monotonicBaseRef int64 monotonicFrequency uint64 realtimeReady uint64 realtimeBaseCycles int64 realtimeBaseRef int64 realtimeFrequency uint64 } // VDSOParamPage manages a VDSO parameter page. // // Its memory layout looks like: // // type page struct { // // seq is a sequence counter that protects the fields below. // seq uint64 // vdsoParams // } // // Everything in the struct is 8 bytes for easy alignment. // // It must be kept in sync with params in vdso/vdso_time.cc. // // +stateify savable type VDSOParamPage struct { // The parameter page is fr, allocated from mf. mf *pgalloc.MemoryFile `state:"nosave"` fr memmap.FileRange // seq is the current sequence count written to the page. // // A write is in progress if bit 1 of the counter is set. 
// // Timekeeper's updater goroutine may call Write before equality is // checked in state_test_util tests, causing this field to change across // save / restore. seq uint64 // copyScratchBuffer is a temporary buffer used to marshal the params before // copying it to the real parameter page. The parameter page is typically // updated at a moderate frequency of ~O(seconds) throughout the lifetime of // the sentry, so reusing this buffer is a good tradeoff between memory // usage and the cost of allocation. copyScratchBuffer []byte } // afterLoad is invoked by stateify. func (v *VDSOParamPage) afterLoad(ctx context.Context) { v.mf = pgalloc.MemoryFileFromContext(ctx) } // NewVDSOParamPage returns a VDSOParamPage. // // Preconditions: // - fr is a single page allocated from mf. VDSOParamPage does not take // ownership of fr; it must remain allocated for the lifetime of the // VDSOParamPage. // - VDSOParamPage must be the only writer to fr. // - mf.MapInternal(fr) must return a single safemem.Block. func NewVDSOParamPage(mf *pgalloc.MemoryFile, fr memmap.FileRange) *VDSOParamPage { return &VDSOParamPage{ mf: mf, fr: fr, copyScratchBuffer: make([]byte, (*vdsoParams)(nil).SizeBytes()), } } // access returns a mapping of the param page. func (v *VDSOParamPage) access() (safemem.Block, error) { bs, err := v.mf.MapInternal(v.fr, hostarch.ReadWrite) if err != nil { return safemem.Block{}, err } if bs.NumBlocks() != 1 { panic(fmt.Sprintf("Multiple blocks (%d) in VDSO param BlockSeq", bs.NumBlocks())) } return bs.Head(), nil } // incrementSeq increments the sequence counter in the param page. func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error { next := v.seq + 1 old, err := safemem.SwapUint64(paramPage, next) if err != nil { return err } if old != v.seq { return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d; application may hang or get incorrect time from the VDSO", old, v.seq) } v.seq = next return nil } // Write updates the VDSO parameters. // // Write starts a write block, calls f to get the new parameters, writes // out the new parameters, then ends the write block. func (v *VDSOParamPage) Write(f func() vdsoParams) error { paramPage, err := v.access() if err != nil { return err } // Write begin. next := v.seq + 1 if next%2 != 1 { panic("Out-of-order sequence count") } err = v.incrementSeq(paramPage) if err != nil { return err } // Get the new params. p := f() buf := v.copyScratchBuffer[:p.SizeBytes()] p.MarshalUnsafe(buf) // Skip the sequence counter. if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil { panic(fmt.Sprintf("Unable to get set VDSO parameters: %v", err)) } // Write end. return v.incrementSeq(paramPage) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/kernel/version.go000066400000000000000000000022511465435605700242440ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel // Version defines the application-visible system version. 
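
// An illustrative sketch added by the editor (not part of the original
// source): a made-up value showing the shape of the struct defined below,
// including the "#VERSION CONFIG_FLAGS TIMESTAMP" form of the Version field.
var exampleVersion = Version{
	Sysname: "Linux",
	Release: "4.4.0",
	Version: "#1 SMP Sun Jan 10 15:06:54 PST 2016",
}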
type Version struct { // Operating system name (e.g. "Linux"). Sysname string // Operating system release (e.g. "4.4-amd64"). Release string // Operating system version. On Linux this takes the shape // "#VERSION CONFIG_FLAGS TIMESTAMP" // where: // - VERSION is a sequence counter incremented on every successful build // - CONFIG_FLAGS is a space-separated list of major enabled kernel features // (e.g. "SMP" and "PREEMPT") // - TIMESTAMP is the build timestamp as returned by `date` Version string } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/limits/000077500000000000000000000000001465435605700222515ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/limits/context.go000066400000000000000000000024351465435605700242700ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package limits import ( "gvisor.dev/gvisor/pkg/context" ) // contextID is the limit package's type for context.Context.Value keys. type contextID int const ( // CtxLimits is a Context.Value key for a LimitSet. CtxLimits contextID = iota ) // FromContext returns the limits that apply to ctx. func FromContext(ctx context.Context) *LimitSet { if v := ctx.Value(CtxLimits); v != nil { return v.(*LimitSet) } return nil } // FromContextOrDie returns FromContext(ctx) if the latter is not nil. // Otherwise, panic is triggered. func FromContextOrDie(ctx context.Context) *LimitSet { if v := ctx.Value(CtxLimits); v != nil { return v.(*LimitSet) } panic("failed to create limit set from context") } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/limits/limits.go000066400000000000000000000117411465435605700241050ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package limits provides resource limits. package limits import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sync" ) // LimitType defines a type of resource limit. type LimitType int // Set of constants defining the different types of resource limits. const ( CPU LimitType = iota FileSize Data Stack Core Rss ProcessCount NumberOfFiles MemoryLocked AS Locks SignalsPending MessageQueueBytes Nice RealTimePriority Rttime ) // AllLimitTypes contains all types in the order how they are presented in // /proc/pid/limits. 
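//
// For example, a /proc/[pid]/limits-style listing can be rendered by walking
// this slice in order. A minimal sketch, assuming an existing *LimitSet ls
// (the real proc file additionally prints "unlimited" for Infinity values):
//
//	for _, t := range AllLimitTypes {
//		l := ls.Get(t)
//		fmt.Printf("%-25s %20d %20d %10s\n", t.Name(), l.Cur, l.Max, t.Unit())
//	}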
var AllLimitTypes = []LimitType{ CPU, FileSize, Data, Stack, Core, Rss, ProcessCount, NumberOfFiles, MemoryLocked, AS, Locks, SignalsPending, MessageQueueBytes, Nice, RealTimePriority, Rttime, } // Name returns the kernel name of the limit func (lt LimitType) Name() string { switch lt { case CPU: return "Max cpu time" case FileSize: return "Max file size" case Data: return "Max data size" case Stack: return "Max stack size" case Core: return "Max core file size" case Rss: return "Max resident set" case ProcessCount: return "Max processes" case NumberOfFiles: return "Max open files" case MemoryLocked: return "Max locked memory" case AS: return "Max address space" case Locks: return "Max file locks" case SignalsPending: return "Max pending signals" case MessageQueueBytes: return "Max msgqueue size" case Nice: return "Max nice priority" case RealTimePriority: return "Max realtime priority" case Rttime: return "Max realtime timeout" } return "unknown" } // Unit returns the unit string for a limit func (lt LimitType) Unit() string { switch lt { case CPU: return "seconds" case FileSize: return "bytes" case Data: return "bytes" case Stack: return "bytes" case Core: return "bytes" case Rss: return "bytes" case ProcessCount: return "processes" case NumberOfFiles: return "files" case MemoryLocked: return "bytes" case AS: return "bytes" case Locks: return "locks" case SignalsPending: return "signals" case MessageQueueBytes: return "bytes" case Nice: return "" case RealTimePriority: return "" case Rttime: return "us" } return "" } // Infinity is a constant representing a resource with no limit. const Infinity = ^uint64(0) // Limit specifies a system limit. // // +stateify savable type Limit struct { // Cur specifies the current limit. Cur uint64 `json:"cur,omitempty"` // Max specifies the maximum settable limit. Max uint64 `json:"max,omitempty"` } // LimitSet represents the Limits that correspond to each LimitType. // // +stateify savable type LimitSet struct { mu sync.Mutex `state:"nosave"` data map[LimitType]Limit } // NewLimitSet creates a new, empty LimitSet. func NewLimitSet() *LimitSet { return &LimitSet{ data: make(map[LimitType]Limit), } } // GetCopy returns a clone of the LimitSet. func (l *LimitSet) GetCopy() *LimitSet { l.mu.Lock() defer l.mu.Unlock() copyData := make(map[LimitType]Limit) for k, v := range l.data { copyData[k] = v } return &LimitSet{ data: copyData, } } // Get returns the resource limit associated with LimitType t. // If no limit is provided, it defaults to an infinite limit.Infinity. func (l *LimitSet) Get(t LimitType) Limit { l.mu.Lock() defer l.mu.Unlock() s, ok := l.data[t] if !ok { return Limit{Cur: Infinity, Max: Infinity} } return s } // GetCapped returns the current value for the limit, capped as specified. func (l *LimitSet) GetCapped(t LimitType, max uint64) uint64 { s := l.Get(t) if s.Cur == Infinity || s.Cur > max { return max } return s.Cur } // SetUnchecked assigns value v to resource of LimitType t. func (l *LimitSet) SetUnchecked(t LimitType, v Limit) { l.mu.Lock() defer l.mu.Unlock() l.data[t] = v } // Set assigns value v to resource of LimitType t and returns the old value. // privileged should be true only when either the caller has CAP_SYS_RESOURCE // or when creating limits for a new kernel. func (l *LimitSet) Set(t LimitType, v Limit, privileged bool) (Limit, error) { l.mu.Lock() defer l.mu.Unlock() // If a limit is already set, make sure the new limit doesn't // exceed the previous max limit. 
if _, ok := l.data[t]; ok { // Unprivileged users can only lower their hard limits. if l.data[t].Max < v.Max && !privileged { return Limit{}, unix.EPERM } if v.Cur > v.Max { return Limit{}, unix.EINVAL } } old := l.data[t] l.data[t] = v return old, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/limits/limits_state_autogen.go000066400000000000000000000024201465435605700270210ustar00rootroot00000000000000// automatically generated by stateify. package limits import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (l *Limit) StateTypeName() string { return "pkg/sentry/limits.Limit" } func (l *Limit) StateFields() []string { return []string{ "Cur", "Max", } } func (l *Limit) beforeSave() {} // +checklocksignore func (l *Limit) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.Cur) stateSinkObject.Save(1, &l.Max) } func (l *Limit) afterLoad(context.Context) {} // +checklocksignore func (l *Limit) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.Cur) stateSourceObject.Load(1, &l.Max) } func (l *LimitSet) StateTypeName() string { return "pkg/sentry/limits.LimitSet" } func (l *LimitSet) StateFields() []string { return []string{ "data", } } func (l *LimitSet) beforeSave() {} // +checklocksignore func (l *LimitSet) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.data) } func (l *LimitSet) afterLoad(context.Context) {} // +checklocksignore func (l *LimitSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.data) } func init() { state.Register((*Limit)(nil)) state.Register((*LimitSet)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/limits/linux.go000066400000000000000000000070001465435605700237340ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package limits import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" ) // FromLinuxResource maps linux resources to LimitTypes. var FromLinuxResource = map[int]LimitType{ linux.RLIMIT_CPU: CPU, linux.RLIMIT_FSIZE: FileSize, linux.RLIMIT_DATA: Data, linux.RLIMIT_STACK: Stack, linux.RLIMIT_CORE: Core, linux.RLIMIT_RSS: Rss, linux.RLIMIT_NPROC: ProcessCount, linux.RLIMIT_NOFILE: NumberOfFiles, linux.RLIMIT_MEMLOCK: MemoryLocked, linux.RLIMIT_AS: AS, linux.RLIMIT_LOCKS: Locks, linux.RLIMIT_SIGPENDING: SignalsPending, linux.RLIMIT_MSGQUEUE: MessageQueueBytes, linux.RLIMIT_NICE: Nice, linux.RLIMIT_RTPRIO: RealTimePriority, linux.RLIMIT_RTTIME: Rttime, } // FromLinuxResourceName maps from linux resource names to LimitTypes. 
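//
// For example (a sketch; ls and the input specification are assumptions, not
// defined here), a textual "RLIMIT_NOFILE=1024:2048" specification can be
// applied to a LimitSet with:
//
//	lt, ok := FromLinuxResourceName["RLIMIT_NOFILE"]
//	if !ok {
//		return fmt.Errorf("unknown rlimit name")
//	}
//	ls.SetUnchecked(lt, Limit{Cur: 1024, Max: 2048})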
var FromLinuxResourceName = map[string]LimitType{ "RLIMIT_AS": AS, "RLIMIT_CORE": Core, "RLIMIT_CPU": CPU, "RLIMIT_DATA": Data, "RLIMIT_FSIZE": FileSize, "RLIMIT_LOCKS": Locks, "RLIMIT_MEMLOCK": MemoryLocked, "RLIMIT_MSGQUEUE": MessageQueueBytes, "RLIMIT_NICE": Nice, "RLIMIT_NOFILE": NumberOfFiles, "RLIMIT_NPROC": ProcessCount, "RLIMIT_RSS": Rss, "RLIMIT_RTPRIO": RealTimePriority, "RLIMIT_RTTIME": Rttime, "RLIMIT_SIGPENDING": SignalsPending, "RLIMIT_STACK": Stack, } // FromLinux maps linux rlimit values to sentry Limits, being careful to handle // infinities. func FromLinux(rl uint64) uint64 { if rl == linux.RLimInfinity { return Infinity } return rl } // ToLinux maps sentry Limits to linux rlimit values, being careful to handle // infinities. func ToLinux(l uint64) uint64 { if l == Infinity { return linux.RLimInfinity } return l } // NewLinuxLimitSet returns a LimitSet whose values match the default rlimits // in Linux. func NewLinuxLimitSet() (*LimitSet, error) { ls := NewLimitSet() for rlt, rl := range linux.InitRLimits { lt, ok := FromLinuxResource[rlt] if !ok { return nil, fmt.Errorf("unknown rlimit type %v", rlt) } ls.SetUnchecked(lt, Limit{ Cur: FromLinux(rl.Cur), Max: FromLinux(rl.Max), }) } return ls, nil } // NewLinuxDistroLimitSet returns a new LimitSet whose values are typical // for a booted Linux distro. // // Many Linux init systems adjust the default Linux limits to values more // expected by the rest of the userspace. NewLinuxDistroLimitSet returns a // LimitSet with sensible defaults for applications that aren't starting // their own init system. func NewLinuxDistroLimitSet() (*LimitSet, error) { ls, err := NewLinuxLimitSet() if err != nil { return nil, err } // Adjust ProcessCount to a lower value because GNU bash allocates 16 // bytes per proc and OOMs if this number is set too high. Value was // picked arbitrarily. // // 1,048,576 ought to be enough for anyone. l := ls.Get(ProcessCount) l.Cur = 1 << 20 ls.Set(ProcessCount, l, true /* privileged */) return ls, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/loader/000077500000000000000000000000001465435605700222165ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/loader/elf.go000066400000000000000000000511031465435605700233130ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package loader import ( "bytes" "debug/elf" "fmt" "io" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) const ( // elfMagic identifies an ELF file. elfMagic = "\x7fELF" // maxTotalPhdrSize is the maximum combined size of all program // headers. Linux limits this to one page. 
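	//
	// With 4 KiB pages and 56-byte ELF64 program headers this allows at
	// most 73 headers per executable (illustrative arithmetic; the exact
	// bound is whatever hostarch.PageSize / prog64Size evaluates to).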
maxTotalPhdrSize = hostarch.PageSize ) var ( // header64Size is the size of elf.Header64. header64Size = (*linux.ElfHeader64)(nil).SizeBytes() // Prog64Size is the size of elf.Prog64. prog64Size = (*linux.ElfProg64)(nil).SizeBytes() ) func progFlagsAsPerms(f elf.ProgFlag) hostarch.AccessType { var p hostarch.AccessType if f&elf.PF_R == elf.PF_R { p.Read = true } if f&elf.PF_W == elf.PF_W { p.Write = true } if f&elf.PF_X == elf.PF_X { p.Execute = true } return p } // elfInfo contains the metadata needed to load an ELF binary. type elfInfo struct { // os is the target OS of the ELF. os abi.OS // arch is the target architecture of the ELF. arch arch.Arch // entry is the program entry point. entry hostarch.Addr // phdrs are the program headers. phdrs []elf.ProgHeader // phdrSize is the size of a single program header in the ELF. phdrSize int // phdrOff is the offset of the program headers in the file. phdrOff uint64 // sharedObject is true if the ELF represents a shared object. sharedObject bool } type fullReader interface { // ReadFull is the same as vfs.FileDescription.ReadFull. ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) } // parseHeader parse the ELF header, verifying that this is a supported ELF // file and returning the ELF program headers. // // This is similar to elf.NewFile, except that it is more strict about what it // accepts from the ELF, and it doesn't parse unnecessary parts of the file. func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { // Check ident first; it will tell us the endianness of the rest of the // structs. var ident [elf.EI_NIDENT]byte _, err := f.ReadFull(ctx, usermem.BytesIOSequence(ident[:]), 0) if err != nil { log.Infof("Error reading ELF ident: %v", err) // The entire ident array always exists. if err == io.EOF || err == io.ErrUnexpectedEOF { err = linuxerr.ENOEXEC } return elfInfo{}, err } // Only some callers pre-check the ELF magic. if !bytes.Equal(ident[:len(elfMagic)], []byte(elfMagic)) { log.Infof("File is not an ELF") return elfInfo{}, linuxerr.ENOEXEC } // We only support 64-bit, little endian binaries if class := elf.Class(ident[elf.EI_CLASS]); class != elf.ELFCLASS64 { log.Infof("Unsupported ELF class: %v", class) return elfInfo{}, linuxerr.ENOEXEC } if endian := elf.Data(ident[elf.EI_DATA]); endian != elf.ELFDATA2LSB { log.Infof("Unsupported ELF endianness: %v", endian) return elfInfo{}, linuxerr.ENOEXEC } if version := elf.Version(ident[elf.EI_VERSION]); version != elf.EV_CURRENT { log.Infof("Unsupported ELF version: %v", version) return elfInfo{}, linuxerr.ENOEXEC } // EI_OSABI is ignored by Linux, which is the only OS supported. os := abi.Linux var hdr linux.ElfHeader64 hdrBuf := make([]byte, header64Size) _, err = f.ReadFull(ctx, usermem.BytesIOSequence(hdrBuf), 0) if err != nil { log.Infof("Error reading ELF header: %v", err) // The entire header always exists. if err == io.EOF || err == io.ErrUnexpectedEOF { err = linuxerr.ENOEXEC } return elfInfo{}, err } hdr.UnmarshalUnsafe(hdrBuf) // We support amd64 and arm64. 
var a arch.Arch switch machine := elf.Machine(hdr.Machine); machine { case elf.EM_X86_64: a = arch.AMD64 case elf.EM_AARCH64: a = arch.ARM64 default: log.Infof("Unsupported ELF machine %d", machine) return elfInfo{}, linuxerr.ENOEXEC } var sharedObject bool elfType := elf.Type(hdr.Type) switch elfType { case elf.ET_EXEC: sharedObject = false case elf.ET_DYN: sharedObject = true default: log.Infof("Unsupported ELF type %v", elfType) return elfInfo{}, linuxerr.ENOEXEC } if int(hdr.Phentsize) != prog64Size { log.Infof("Unsupported phdr size %d", hdr.Phentsize) return elfInfo{}, linuxerr.ENOEXEC } totalPhdrSize := prog64Size * int(hdr.Phnum) if totalPhdrSize < prog64Size { log.Warningf("No phdrs or total phdr size overflows: prog64Size: %d phnum: %d", prog64Size, int(hdr.Phnum)) return elfInfo{}, linuxerr.ENOEXEC } if totalPhdrSize > maxTotalPhdrSize { log.Infof("Too many phdrs (%d): total size %d > %d", hdr.Phnum, totalPhdrSize, maxTotalPhdrSize) return elfInfo{}, linuxerr.ENOEXEC } if int64(hdr.Phoff) < 0 || int64(hdr.Phoff+uint64(totalPhdrSize)) < 0 { ctx.Infof("Unsupported phdr offset %d", hdr.Phoff) return elfInfo{}, linuxerr.ENOEXEC } phdrBuf := make([]byte, totalPhdrSize) _, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff)) if err != nil { log.Infof("Error reading ELF phdrs: %v", err) // If phdrs were specified, they should all exist. if err == io.EOF || err == io.ErrUnexpectedEOF { err = linuxerr.ENOEXEC } return elfInfo{}, err } phdrs := make([]elf.ProgHeader, hdr.Phnum) for i := range phdrs { var prog64 linux.ElfProg64 phdrBuf = prog64.UnmarshalUnsafe(phdrBuf) phdrs[i] = elf.ProgHeader{ Type: elf.ProgType(prog64.Type), Flags: elf.ProgFlag(prog64.Flags), Off: prog64.Off, Vaddr: prog64.Vaddr, Paddr: prog64.Paddr, Filesz: prog64.Filesz, Memsz: prog64.Memsz, Align: prog64.Align, } } return elfInfo{ os: os, arch: a, entry: hostarch.Addr(hdr.Entry), phdrs: phdrs, phdrOff: hdr.Phoff, phdrSize: prog64Size, sharedObject: sharedObject, }, nil } // mapSegment maps a phdr into the Task. offset is the offset to apply to // phdr.Vaddr. func mapSegment(ctx context.Context, m *mm.MemoryManager, fd *vfs.FileDescription, phdr *elf.ProgHeader, offset hostarch.Addr) error { // We must make a page-aligned mapping. adjust := hostarch.Addr(phdr.Vaddr).PageOffset() addr, ok := offset.AddLength(phdr.Vaddr) if !ok { // If offset != 0 we should have ensured this would fit. ctx.Warningf("Computed segment load address overflows: %#x + %#x", phdr.Vaddr, offset) return linuxerr.ENOEXEC } addr -= hostarch.Addr(adjust) fileSize := phdr.Filesz + adjust if fileSize < phdr.Filesz { ctx.Infof("Computed segment file size overflows: %#x + %#x", phdr.Filesz, adjust) return linuxerr.ENOEXEC } ms, ok := hostarch.Addr(fileSize).RoundUp() if !ok { ctx.Infof("fileSize %#x too large", fileSize) return linuxerr.ENOEXEC } mapSize := uint64(ms) if mapSize > 0 { // This must result in a page-aligned offset. i.e., the original // phdr.Off must have the same alignment as phdr.Vaddr. If that is not // true, MMap will reject the mapping. fileOffset := phdr.Off - adjust prot := progFlagsAsPerms(phdr.Flags) mopts := memmap.MMapOpts{ Length: mapSize, Offset: fileOffset, Addr: addr, Fixed: true, // Linux will happily allow conflicting segments to map over // one another. 
Unmap: true, Private: true, Perms: prot, MaxPerms: hostarch.AnyAccess, } defer func() { if mopts.MappingIdentity != nil { mopts.MappingIdentity.DecRef(ctx) } }() if err := fd.ConfigureMMap(ctx, &mopts); err != nil { ctx.Infof("File is not memory-mappable: %v", err) return err } if _, err := m.MMap(ctx, mopts); err != nil { ctx.Infof("Error mapping PT_LOAD segment %+v at %#x: %v", phdr, addr, err) return err } // We need to clear the end of the last page that exceeds fileSize so // we don't map part of the file beyond fileSize. // // Note that Linux *does not* clear the portion of the first page // before phdr.Off. if mapSize > fileSize { zeroAddr, ok := addr.AddLength(fileSize) if !ok { panic(fmt.Sprintf("successfully mmaped address overflows? %#x + %#x", addr, fileSize)) } zeroSize := int64(mapSize - fileSize) if zeroSize < 0 { panic(fmt.Sprintf("zeroSize too big? %#x", uint64(zeroSize))) } if _, err := m.ZeroOut(ctx, zeroAddr, zeroSize, usermem.IOOpts{IgnorePermissions: true}); err != nil { ctx.Warningf("Failed to zero end of page [%#x, %#x): %v", zeroAddr, zeroAddr+hostarch.Addr(zeroSize), err) return err } } } memSize := phdr.Memsz + adjust if memSize < phdr.Memsz { ctx.Infof("Computed segment mem size overflows: %#x + %#x", phdr.Memsz, adjust) return linuxerr.ENOEXEC } // Allocate more anonymous pages if necessary. if mapSize < memSize { anonAddr, ok := addr.AddLength(mapSize) if !ok { panic(fmt.Sprintf("anonymous memory doesn't fit in pre-sized range? %#x + %#x", addr, mapSize)) } anonSize, ok := hostarch.Addr(memSize - mapSize).RoundUp() if !ok { ctx.Infof("extra anon pages too large: %#x", memSize-mapSize) return linuxerr.ENOEXEC } // N.B. Linux uses vm_brk_flags to map these pages, which only // honors the X bit, always mapping at least RW. ignoring These // pages are not included in the final brk region. prot := hostarch.ReadWrite if phdr.Flags&elf.PF_X == elf.PF_X { prot.Execute = true } if _, err := m.MMap(ctx, memmap.MMapOpts{ Length: uint64(anonSize), Addr: anonAddr, // Fixed without Unmap will fail the mmap if something is // already at addr. Fixed: true, Private: true, Perms: prot, MaxPerms: hostarch.AnyAccess, }); err != nil { ctx.Infof("Error mapping PT_LOAD segment %v anonymous memory: %v", phdr, err) return err } } return nil } // loadedELF describes an ELF that has been successfully loaded. type loadedELF struct { // os is the target OS of the ELF. os abi.OS // arch is the target architecture of the ELF. arch arch.Arch // entry is the entry point of the ELF. entry hostarch.Addr // start is the end of the ELF. start hostarch.Addr // end is the end of the ELF. end hostarch.Addr // interpter is the path to the ELF interpreter. interpreter string // phdrAddr is the address of the ELF program headers. phdrAddr hostarch.Addr // phdrSize is the size of a single program header in the ELF. phdrSize int // phdrNum is the number of program headers. phdrNum int // auxv contains a subset of ELF-specific auxiliary vector entries: // * AT_PHDR // * AT_PHENT // * AT_PHNUM // * AT_BASE // * AT_ENTRY auxv arch.Auxv } // loadParsedELF loads f into mm. // // info is the parsed elfInfo from the header. // // It does not load the ELF interpreter, or return any auxv entries. // // Preconditions: f is an ELF file. 
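//
// In outline, following the code below, loadParsedELF:
//
//  1. Walks the PT_LOAD headers to compute the [start, end) virtual range and
//     records any PT_INTERP interpreter path.
//  2. For shared objects (ET_DYN), reserves an address range near
//     sharedLoadOffset to pick a load base, then rebases start, end and the
//     entry point by that offset.
//  3. Maps each non-empty PT_LOAD segment via mapSegment.
//  4. Returns the resulting loadedELF, including the address of the program
//     headers for the AT_PHDR auxv entry.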
func loadParsedELF(ctx context.Context, m *mm.MemoryManager, fd *vfs.FileDescription, info elfInfo, sharedLoadOffset hostarch.Addr) (loadedELF, error) { first := true var start, end hostarch.Addr var interpreter string for _, phdr := range info.phdrs { switch phdr.Type { case elf.PT_LOAD: vaddr := hostarch.Addr(phdr.Vaddr) if first { first = false start = vaddr } if vaddr < end { // NOTE(b/37474556): Linux allows out-of-order // segments, in violation of the spec. ctx.Infof("PT_LOAD headers out-of-order. %#x < %#x", vaddr, end) return loadedELF{}, linuxerr.ENOEXEC } var ok bool end, ok = vaddr.AddLength(phdr.Memsz) if !ok { ctx.Infof("PT_LOAD header size overflows. %#x + %#x", vaddr, phdr.Memsz) return loadedELF{}, linuxerr.ENOEXEC } case elf.PT_INTERP: if phdr.Filesz < 2 { ctx.Infof("PT_INTERP path too small: %v", phdr.Filesz) return loadedELF{}, linuxerr.ENOEXEC } if phdr.Filesz > linux.PATH_MAX { ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz) return loadedELF{}, linuxerr.ENOEXEC } if int64(phdr.Off) < 0 || int64(phdr.Off+phdr.Filesz) < 0 { ctx.Infof("Unsupported PT_INTERP offset %d", phdr.Off) return loadedELF{}, linuxerr.ENOEXEC } path := make([]byte, phdr.Filesz) _, err := fd.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off)) if err != nil { // If an interpreter was specified, it should exist. ctx.Infof("Error reading PT_INTERP path: %v", err) return loadedELF{}, linuxerr.ENOEXEC } if path[len(path)-1] != 0 { ctx.Infof("PT_INTERP path not NUL-terminated: %v", path) return loadedELF{}, linuxerr.ENOEXEC } // Strip NUL-terminator and everything beyond from // string. Note that there may be a NUL-terminator // before len(path)-1. interpreter = string(path[:bytes.IndexByte(path, '\x00')]) if interpreter == "" { // Linux actually attempts to open_exec("\0"). // open_exec -> do_open_execat fails to check // that name != '\0' before calling // do_filp_open, which thus opens the working // directory. do_open_execat returns EACCES // because the directory is not a regular file. // // We bypass that nonsense and simply // short-circuit with EACCES. Those this does // mean that there may be some edge cases where // the open path would return a different // error. ctx.Infof("PT_INTERP path is empty: %v", path) return loadedELF{}, linuxerr.EACCES } } } // Shared objects don't have fixed load addresses. We need to pick a // base address big enough to fit all segments, so we first create a // mapping for the total size just to find a region that is big enough. // // It is safe to unmap it immediately without racing with another mapping // because we are the only one in control of the MemoryManager. // // Note that the vaddr of the first PT_LOAD segment is ignored when // choosing the load address (even if it is non-zero). The vaddr does // become an offset from that load address. 
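	//
	// For example (illustrative numbers only): if the PT_LOAD segments
	// span [0x0, 0x5000) and the reservation below lands at
	// 0x555555554000, then offset is 0x555555554000 and start, end and
	// the entry point are all shifted by that amount before the segments
	// are mapped.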
var offset hostarch.Addr if info.sharedObject { totalSize := end - start totalSize, ok := totalSize.RoundUp() if !ok { ctx.Infof("ELF PT_LOAD segments too big") return loadedELF{}, linuxerr.ENOEXEC } var err error offset, err = m.MMap(ctx, memmap.MMapOpts{ Length: uint64(totalSize), Addr: sharedLoadOffset, Private: true, }) if err != nil { ctx.Infof("Error allocating address space for shared object: %v", err) return loadedELF{}, err } if err := m.MUnmap(ctx, offset, uint64(totalSize)); err != nil { panic(fmt.Sprintf("Failed to unmap base address: %v", err)) } start, ok = start.AddLength(uint64(offset)) if !ok { ctx.Infof(fmt.Sprintf("Start %#x + offset %#x overflows?", start, offset)) return loadedELF{}, linuxerr.EINVAL } end, ok = end.AddLength(uint64(offset)) if !ok { ctx.Infof(fmt.Sprintf("End %#x + offset %#x overflows?", end, offset)) return loadedELF{}, linuxerr.EINVAL } info.entry, ok = info.entry.AddLength(uint64(offset)) if !ok { ctx.Infof("Entrypoint %#x + offset %#x overflows? Is the entrypoint within a segment?", info.entry, offset) return loadedELF{}, err } } // Map PT_LOAD segments. for _, phdr := range info.phdrs { switch phdr.Type { case elf.PT_LOAD: if phdr.Memsz == 0 { // No need to load segments with size 0, but // they exist in some binaries. continue } if err := mapSegment(ctx, m, fd, &phdr, offset); err != nil { ctx.Infof("Failed to map PT_LOAD segment: %+v", phdr) return loadedELF{}, err } } } // This assumes that the first segment contains the ELF headers. This // may not be true in a malformed ELF, but Linux makes the same // assumption. phdrAddr, ok := start.AddLength(info.phdrOff) if !ok { ctx.Warningf("ELF start address %#x + phdr offset %#x overflows", start, info.phdrOff) phdrAddr = 0 } return loadedELF{ os: info.os, arch: info.arch, entry: info.entry, start: start, end: end, interpreter: interpreter, phdrAddr: phdrAddr, phdrSize: info.phdrSize, phdrNum: len(info.phdrs), }, nil } // loadInitialELF loads f into mm. // // It creates an arch.Context64 for the ELF and prepares the mm for this arch. // // It does not load the ELF interpreter, or return any auxv entries. // // Preconditions: // - f is an ELF file. // - f is the first ELF loaded into m. func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs cpuid.FeatureSet, fd *vfs.FileDescription) (loadedELF, *arch.Context64, error) { info, err := parseHeader(ctx, fd) if err != nil { ctx.Infof("Failed to parse initial ELF: %v", err) return loadedELF{}, nil, err } // Check Image Compatibility. if arch.Host != info.arch { ctx.Warningf("Found mismatch for platform %s with ELF type %s", arch.Host.String(), info.arch.String()) return loadedELF{}, nil, linuxerr.ENOEXEC } // Create the arch.Context64 now so we can prepare the mmap layout before // mapping anything. ac := arch.New(info.arch) l, err := m.SetMmapLayout(ac, limits.FromContext(ctx)) if err != nil { ctx.Warningf("Failed to set mmap layout: %v", err) return loadedELF{}, nil, err } // PIELoadAddress tries to move the ELF out of the way of the default // mmap base to ensure that the initial brk has sufficient space to // grow. le, err := loadParsedELF(ctx, m, fd, info, ac.PIELoadAddress(l)) return le, ac, err } // loadInterpreterELF loads f into mm. // // The interpreter must be for the same OS/Arch as the initial ELF. // // It does not return any auxv entries. // // Preconditions: f is an ELF file. 
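//
// For a typical dynamically linked binary, the PT_INTERP path names the
// system dynamic linker (for example /lib64/ld-linux-x86-64.so.2 on an amd64
// glibc system), which is itself an ET_DYN object and is therefore loaded
// below without a fixed offset.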
func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, fd *vfs.FileDescription, initial loadedELF) (loadedELF, error) { info, err := parseHeader(ctx, fd) if err != nil { if linuxerr.Equals(linuxerr.ENOEXEC, err) { // Bad interpreter. err = linuxerr.ELIBBAD } return loadedELF{}, err } if info.os != initial.os { ctx.Infof("Initial ELF OS %v and interpreter ELF OS %v differ", initial.os, info.os) return loadedELF{}, linuxerr.ELIBBAD } if info.arch != initial.arch { ctx.Infof("Initial ELF arch %v and interpreter ELF arch %v differ", initial.arch, info.arch) return loadedELF{}, linuxerr.ELIBBAD } // The interpreter is not given a load offset, as its location does not // affect brk. return loadParsedELF(ctx, m, fd, info, 0) } // loadELF loads args.File into the Task address space. // // If loadELF returns ErrSwitchFile it should be called again with the returned // path and argv. // // Preconditions: args.File is an ELF file. func loadELF(ctx context.Context, args LoadArgs) (loadedELF, *arch.Context64, error) { bin, ac, err := loadInitialELF(ctx, args.MemoryManager, args.Features, args.File) if err != nil { ctx.Infof("Error loading binary: %v", err) return loadedELF{}, nil, err } var interp loadedELF if bin.interpreter != "" { // Even if we do not allow the final link of the script to be // resolved, the interpreter should still be resolved if it is // a symlink. args.ResolveFinal = true // Refresh the traversal limit. *args.RemainingTraversals = linux.MaxSymlinkTraversals args.Filename = bin.interpreter intFile, err := openPath(ctx, args) if err != nil { ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err) return loadedELF{}, nil, err } defer intFile.DecRef(ctx) interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin) if err != nil { ctx.Infof("Error loading interpreter: %v", err) return loadedELF{}, nil, err } if interp.interpreter != "" { // No recursive interpreters! ctx.Infof("Interpreter requires an interpreter") return loadedELF{}, nil, linuxerr.ENOEXEC } } // ELF-specific auxv entries. bin.auxv = arch.Auxv{ arch.AuxEntry{linux.AT_PHDR, bin.phdrAddr}, arch.AuxEntry{linux.AT_PHENT, hostarch.Addr(bin.phdrSize)}, arch.AuxEntry{linux.AT_PHNUM, hostarch.Addr(bin.phdrNum)}, arch.AuxEntry{linux.AT_ENTRY, bin.entry}, } if bin.interpreter != "" { bin.auxv = append(bin.auxv, arch.AuxEntry{linux.AT_BASE, interp.start}) // Start in the interpreter. // N.B. AT_ENTRY above contains the *original* entry point. bin.entry = interp.entry } else { // Always add AT_BASE even if there is no interpreter. bin.auxv = append(bin.auxv, arch.AuxEntry{linux.AT_BASE, 0}) } return bin, ac, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/loader/interpreter.go000066400000000000000000000060001465435605700251040ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package loader import ( "bytes" "io" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) const ( // interpreterScriptMagic identifies an interpreter script. interpreterScriptMagic = "#!" // interpMaxLineLength is the maximum length for the first line of an // interpreter script. // // From execve(2): "A maximum line length of 127 characters is allowed // for the first line in a #! executable shell script." interpMaxLineLength = 127 ) // parseInterpreterScript returns the interpreter path and argv. func parseInterpreterScript(ctx context.Context, filename string, fd *vfs.FileDescription, argv []string) (newpath string, newargv []string, err error) { line := make([]byte, interpMaxLineLength) n, err := fd.ReadFull(ctx, usermem.BytesIOSequence(line), 0) // Short read is OK. if err != nil && err != io.ErrUnexpectedEOF { if err == io.EOF { err = linuxerr.ENOEXEC } return "", []string{}, err } line = line[:n] if !bytes.Equal(line[:2], []byte(interpreterScriptMagic)) { return "", []string{}, linuxerr.ENOEXEC } // Ignore #!. line = line[2:] // Ignore everything after newline. // Linux silently truncates the remainder of the line if it exceeds // interpMaxLineLength. i := bytes.IndexByte(line, '\n') if i >= 0 { line = line[:i] } // Skip any whitespace before the interpreter. line = bytes.TrimLeft(line, " \t") // Linux only looks for spaces or tabs delimiting the interpreter and // arg. // // execve(2): "On Linux, the entire string following the interpreter // name is passed as a single argument to the interpreter, and this // string can include white space." interp := line var arg []byte i = bytes.IndexAny(line, " \t") if i >= 0 { interp = line[:i] arg = bytes.TrimLeft(line[i:], " \t") } if string(interp) == "" { ctx.Infof("Interpreter script contains no interpreter: %v", line) return "", []string{}, linuxerr.ENOEXEC } // Build the new argument list: // // 1. The interpreter. newargv = append(newargv, string(interp)) // 2. The optional interpreter argument. if len(arg) > 0 { newargv = append(newargv, string(arg)) } // 3. The original arguments. The original argv[0] is replaced with the // full script filename. if len(argv) > 0 { argv[0] = filename } else { argv = []string{filename} } newargv = append(newargv, argv...) return string(interp), newargv, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/loader/loader.go000066400000000000000000000306411465435605700240170ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package loader loads an executable file into a MemoryManager. 
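//
// In broad strokes (see Load below): the executable is opened and its first
// bytes inspected; ELF images are mapped directly, while "#!" interpreter
// scripts are rewritten into an invocation of their interpreter, repeating up
// to maxLoaderAttempts times. Load then sets up the stack, the VDSO mapping
// and the auxiliary vector for the new image.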
package loader import ( "bytes" "fmt" "io" "path" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/usermem" ) const ( securityCapability = linux.XATTR_SECURITY_PREFIX + "capability" ) // LoadArgs holds specifications for an executable file to be loaded. type LoadArgs struct { // MemoryManager is the memory manager to load the executable into. MemoryManager *mm.MemoryManager // RemainingTraversals is the maximum number of symlinks to follow to // resolve Filename. This counter is passed by reference to keep it // updated throughout the call stack. RemainingTraversals *uint // ResolveFinal indicates whether the final link of Filename should be // resolved, if it is a symlink. ResolveFinal bool // Filename is the path for the executable. Filename string // File is an open FD of the executable. If File is not nil, then File will // be loaded and Filename will be ignored. // // The caller is responsible for checking that the user can execute this file. File *vfs.FileDescription // Root is the current filesystem root. Root vfs.VirtualDentry // WorkingDir is the current working directory. WorkingDir vfs.VirtualDentry // If AfterOpen is not nil, it is called after every successful call to // Opener.OpenPath(). AfterOpen func(f *vfs.FileDescription) // CloseOnExec indicates that the executable (or one of its parent // directories) was opened with O_CLOEXEC. If the executable is an // interpreter script, then cause an ENOENT error to occur, since the // script would otherwise be inaccessible to the interpreter. CloseOnExec bool // Argv is the vector of arguments to pass to the executable. Argv []string // Envv is the vector of environment variables to pass to the // executable. Envv []string // Features specifies the CPU feature set for the executable. Features cpuid.FeatureSet } // openPath opens args.Filename and checks that it is valid for loading. // // openPath returns an *fs.Dirent and *fs.File for args.Filename, which is not // installed in the Task FDTable. The caller takes ownership of both. // // args.Filename must be a readable, executable, regular file. func openPath(ctx context.Context, args LoadArgs) (*vfs.FileDescription, error) { if args.Filename == "" { ctx.Infof("cannot open empty name") return nil, linuxerr.ENOENT } // TODO(gvisor.dev/issue/160): Linux requires only execute permission, // not read. However, our backing filesystems may prevent us from reading // the file without read permission. Additionally, a task with a // non-readable executable has additional constraints on access via // ptrace and procfs. 
opts := vfs.OpenOptions{ Flags: linux.O_RDONLY, FileExec: true, } vfsObj := args.Root.Mount().Filesystem().VirtualFilesystem() creds := auth.CredentialsFromContext(ctx) path := fspath.Parse(args.Filename) pop := &vfs.PathOperation{ Root: args.Root, Start: args.WorkingDir, Path: path, FollowFinalSymlink: args.ResolveFinal, } if path.Absolute { pop.Start = args.Root } fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts) if err != nil { return nil, err } if args.AfterOpen != nil { args.AfterOpen(fd) } return fd, nil } // checkIsRegularFile prevents us from trying to execute a directory, pipe, etc. func checkIsRegularFile(ctx context.Context, fd *vfs.FileDescription, filename string) error { stat, err := fd.Stat(ctx, vfs.StatOptions{}) if err != nil { return err } if t := linux.FileMode(stat.Mode).FileType(); t != linux.ModeRegular { ctx.Infof("%q is not a regular file: %v", filename, t) return linuxerr.EACCES } return nil } // allocStack allocates and maps a stack in to any available part of the address space. func allocStack(ctx context.Context, m *mm.MemoryManager, a *arch.Context64) (*arch.Stack, error) { ar, err := m.MapStack(ctx) if err != nil { return nil, err } return &arch.Stack{Arch: a, IO: m, Bottom: ar.End}, nil } const ( // maxLoaderAttempts is the maximum number of attempts to try to load // an interpreter scripts, to prevent loops. 6 (initial + 5 changes) is // what the Linux kernel allows (fs/exec.c:search_binary_handler). maxLoaderAttempts = 6 ) // loadExecutable loads an executable that is pointed to by args.File. The // caller is responsible for checking that the user can execute this file. // If nil, the path args.Filename is resolved and loaded (check that the user // can execute this file is done here in this case). If the executable is an // interpreter script rather than an ELF, the binary of the corresponding // interpreter will be loaded. // // It returns: // - loadedELF, description of the loaded binary // - arch.Context64 matching the binary arch // - fs.Dirent of the binary file // - Possibly updated args.Argv func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, *arch.Context64, *vfs.FileDescription, []string, error) { for i := 0; i < maxLoaderAttempts; i++ { if args.File == nil { var err error args.File, err = openPath(ctx, args) if err != nil { ctx.Infof("Error opening %s: %v", args.Filename, err) return loadedELF{}, nil, nil, nil, err } // Ensure file is release in case the code loops or errors out. defer args.File.DecRef(ctx) } else { if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil { return loadedELF{}, nil, nil, nil, err } } // Check the header. Is this an ELF or interpreter script? var hdr [4]uint8 // N.B. We assume that reading from a regular file cannot block. _, err := args.File.ReadFull(ctx, usermem.BytesIOSequence(hdr[:]), 0) // Allow unexpected EOF, as a valid executable could be only three bytes // (e.g., #!a). if err != nil && err != io.ErrUnexpectedEOF { if err == io.EOF { err = linuxerr.ENOEXEC } return loadedELF{}, nil, nil, nil, err } switch { case bytes.Equal(hdr[:], []byte(elfMagic)): loaded, ac, err := loadELF(ctx, args) if err != nil { ctx.Infof("Error loading ELF: %v", err) return loadedELF{}, nil, nil, nil, err } // An ELF is always terminal. Hold on to file. 
args.File.IncRef() return loaded, ac, args.File, args.Argv, err case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)): if args.CloseOnExec { return loadedELF{}, nil, nil, nil, linuxerr.ENOENT } args.Filename, args.Argv, err = parseInterpreterScript(ctx, args.Filename, args.File, args.Argv) if err != nil { ctx.Infof("Error loading interpreter script: %v", err) return loadedELF{}, nil, nil, nil, err } // Refresh the traversal limit for the interpreter. *args.RemainingTraversals = linux.MaxSymlinkTraversals default: ctx.Infof("Unknown magic: %v", hdr) return loadedELF{}, nil, nil, nil, linuxerr.ENOEXEC } // Set to nil in case we loop on a Interpreter Script. args.File = nil } return loadedELF{}, nil, nil, nil, linuxerr.ELOOP } // ImageInfo represents the information for the loaded image. type ImageInfo struct { // The target operating system of the image. OS abi.OS // AMD64 context. Arch *arch.Context64 // The base name of the binary. Name string // The binary's file capability. FileCaps string } // Load loads args.File into a MemoryManager. If args.File is nil, the path // args.Filename is resolved and loaded instead. // // If Load returns ErrSwitchFile it should be called again with the returned // path and argv. // // Preconditions: // - The Task MemoryManager is empty. // - Load is called on the Task goroutine. func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (ImageInfo, *syserr.Error) { // Load the executable itself. loaded, ac, file, newArgv, err := loadExecutable(ctx, args) if err != nil { return ImageInfo{}, syserr.NewDynamic(fmt.Sprintf("failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) } defer file.DecRef(ctx) xattr, err := file.GetXattr(ctx, &vfs.GetXattrOptions{Name: securityCapability, Size: linux.XATTR_CAPS_SZ_3}) switch { case linuxerr.Equals(linuxerr.ENODATA, err), linuxerr.Equals(linuxerr.ENOTSUP, err): xattr = "" case err != nil: return ImageInfo{}, syserr.NewDynamic(fmt.Sprintf("failed to read file capabilities of %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) } // Load the VDSO. vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded) if err != nil { return ImageInfo{}, syserr.NewDynamic(fmt.Sprintf("error loading VDSO: %v", err), syserr.FromError(err).ToLinux()) } // Setup the heap. brk starts at the next page after the end of the // executable. Userspace can assume that the remainder of the page after // loaded.end is available for its use. e, ok := loaded.end.RoundUp() if !ok { return ImageInfo{}, syserr.NewDynamic(fmt.Sprintf("brk overflows: %#x", loaded.end), errno.ENOEXEC) } args.MemoryManager.BrkSetup(ctx, e) // Allocate our stack. stack, err := allocStack(ctx, args.MemoryManager, ac) if err != nil { return ImageInfo{}, syserr.NewDynamic(fmt.Sprintf("Failed to allocate stack: %v", err), syserr.FromError(err).ToLinux()) } // Push the original filename to the stack, for AT_EXECFN. if _, err := stack.PushNullTerminatedByteSlice([]byte(args.Filename)); err != nil { return ImageInfo{}, syserr.NewDynamic(fmt.Sprintf("Failed to push exec filename: %v", err), syserr.FromError(err).ToLinux()) } execfn := stack.Bottom // Push 16 random bytes on the stack which AT_RANDOM will point to. 
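	// (C libraries typically consume these bytes during startup; glibc,
	// for instance, seeds its stack-protector canary and pointer-mangling
	// guard from AT_RANDOM.)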
var b [16]byte if _, err := rand.Read(b[:]); err != nil { return ImageInfo{}, syserr.NewDynamic(fmt.Sprintf("Failed to read random bytes: %v", err), syserr.FromError(err).ToLinux()) } if _, err = stack.PushNullTerminatedByteSlice(b[:]); err != nil { return ImageInfo{}, syserr.NewDynamic(fmt.Sprintf("Failed to push random bytes: %v", err), syserr.FromError(err).ToLinux()) } random := stack.Bottom c := auth.CredentialsFromContext(ctx) // Add generic auxv entries. auxv := append(loaded.auxv, arch.Auxv{ arch.AuxEntry{linux.AT_UID, hostarch.Addr(c.RealKUID.In(c.UserNamespace).OrOverflow())}, arch.AuxEntry{linux.AT_EUID, hostarch.Addr(c.EffectiveKUID.In(c.UserNamespace).OrOverflow())}, arch.AuxEntry{linux.AT_GID, hostarch.Addr(c.RealKGID.In(c.UserNamespace).OrOverflow())}, arch.AuxEntry{linux.AT_EGID, hostarch.Addr(c.EffectiveKGID.In(c.UserNamespace).OrOverflow())}, // The conditions that require AT_SECURE = 1 never arise. See // kernel.Task.updateCredsForExecLocked. arch.AuxEntry{linux.AT_SECURE, 0}, arch.AuxEntry{linux.AT_CLKTCK, linux.CLOCKS_PER_SEC}, arch.AuxEntry{linux.AT_EXECFN, execfn}, arch.AuxEntry{linux.AT_RANDOM, random}, arch.AuxEntry{linux.AT_PAGESZ, hostarch.PageSize}, arch.AuxEntry{linux.AT_SYSINFO_EHDR, vdsoAddr}, }...) auxv = append(auxv, extraAuxv...) sl, err := stack.Load(newArgv, args.Envv, auxv) if err != nil { return ImageInfo{}, syserr.NewDynamic(fmt.Sprintf("Failed to load stack: %v", err), syserr.FromError(err).ToLinux()) } m := args.MemoryManager m.SetArgvStart(sl.ArgvStart) m.SetArgvEnd(sl.ArgvEnd) m.SetEnvvStart(sl.EnvvStart) m.SetEnvvEnd(sl.EnvvEnd) m.SetAuxv(auxv) m.SetExecutable(ctx, file) m.SetVDSOSigReturn(uint64(vdsoAddr) + vdsoSigreturnOffset - vdsoPrelink) ac.SetIP(uintptr(loaded.entry)) ac.SetStack(uintptr(stack.Bottom)) name := path.Base(args.Filename) if len(name) > linux.TASK_COMM_LEN-1 { name = name[:linux.TASK_COMM_LEN-1] } return ImageInfo{ OS: loaded.os, Arch: ac, Name: name, FileCaps: xattr, }, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/loader/loader_abi_autogen_unsafe.go000066400000000000000000000001461465435605700277120ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package loader import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/loader/loader_state_autogen.go000066400000000000000000000043771465435605700267500ustar00rootroot00000000000000// automatically generated by stateify. 
package loader import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (v *VDSO) StateTypeName() string { return "pkg/sentry/loader.VDSO" } func (v *VDSO) StateFields() []string { return []string{ "ParamPage", "vdso", "os", "arch", "phdrs", } } func (v *VDSO) beforeSave() {} // +checklocksignore func (v *VDSO) StateSave(stateSinkObject state.Sink) { v.beforeSave() var phdrsValue []elfProgHeader phdrsValue = v.savePhdrs() stateSinkObject.SaveValue(4, phdrsValue) stateSinkObject.Save(0, &v.ParamPage) stateSinkObject.Save(1, &v.vdso) stateSinkObject.Save(2, &v.os) stateSinkObject.Save(3, &v.arch) } func (v *VDSO) afterLoad(context.Context) {} // +checklocksignore func (v *VDSO) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &v.ParamPage) stateSourceObject.Load(1, &v.vdso) stateSourceObject.Load(2, &v.os) stateSourceObject.Load(3, &v.arch) stateSourceObject.LoadValue(4, new([]elfProgHeader), func(y any) { v.loadPhdrs(ctx, y.([]elfProgHeader)) }) } func (e *elfProgHeader) StateTypeName() string { return "pkg/sentry/loader.elfProgHeader" } func (e *elfProgHeader) StateFields() []string { return []string{ "Type", "Flags", "Off", "Vaddr", "Paddr", "Filesz", "Memsz", "Align", } } func (e *elfProgHeader) beforeSave() {} // +checklocksignore func (e *elfProgHeader) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.Type) stateSinkObject.Save(1, &e.Flags) stateSinkObject.Save(2, &e.Off) stateSinkObject.Save(3, &e.Vaddr) stateSinkObject.Save(4, &e.Paddr) stateSinkObject.Save(5, &e.Filesz) stateSinkObject.Save(6, &e.Memsz) stateSinkObject.Save(7, &e.Align) } func (e *elfProgHeader) afterLoad(context.Context) {} // +checklocksignore func (e *elfProgHeader) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.Type) stateSourceObject.Load(1, &e.Flags) stateSourceObject.Load(2, &e.Off) stateSourceObject.Load(3, &e.Vaddr) stateSourceObject.Load(4, &e.Paddr) stateSourceObject.Load(5, &e.Filesz) stateSourceObject.Load(6, &e.Memsz) stateSourceObject.Load(7, &e.Align) } func init() { state.Register((*VDSO)(nil)) state.Register((*elfProgHeader)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/loader/vdso.go000066400000000000000000000265541465435605700235340ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package loader import ( "bytes" "debug/elf" "fmt" "io" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/loader/vdsodata" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/usermem" ) const vdsoPrelink = 0xffffffffff700000 type fileContext struct { context.Context } func (f *fileContext) Value(key any) any { switch key { case uniqueid.CtxGlobalUniqueID: return uint64(0) default: return f.Context.Value(key) } } type byteFullReader struct { data []byte } // ReadFull implements fullReader.ReadFull. func (b *byteFullReader) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { return 0, linuxerr.EINVAL } if offset >= int64(len(b.data)) { return 0, io.EOF } n, err := dst.CopyOut(ctx, b.data[offset:]) return int64(n), err } // validateVDSO checks that the VDSO can be loaded by loadVDSO. // // VDSOs are special (see below). Since we are going to map the VDSO directly // rather than using a normal loading process, we require that the PT_LOAD // segments have the same layout in the ELF as they expect to have in memory. // // Namely, this means that we must verify: // - PT_LOAD file offsets are equivalent to the memory offset from the first // segment. // - No extra zeroed space (memsz) is required. // - PT_LOAD segments are in order. // - No two PT_LOAD segments occupy parts of the same page. // - PT_LOAD segments don't extend beyond the end of the file. // // ctx may be nil if f does not need it. func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, error) { info, err := parseHeader(ctx, f) if err != nil { log.Infof("Unable to parse VDSO header: %v", err) return elfInfo{}, err } var first *elf.ProgHeader var prev *elf.ProgHeader var prevEnd hostarch.Addr for i, phdr := range info.phdrs { if phdr.Type != elf.PT_LOAD { continue } if first == nil { first = &info.phdrs[i] if phdr.Off != 0 { log.Warningf("First PT_LOAD segment has non-zero file offset") return elfInfo{}, linuxerr.ENOEXEC } } memoryOffset := phdr.Vaddr - first.Vaddr if memoryOffset != phdr.Off { log.Warningf("PT_LOAD segment memory offset %#x != file offset %#x", memoryOffset, phdr.Off) return elfInfo{}, linuxerr.ENOEXEC } // memsz larger than filesz means that extra zeroed space should be // provided at the end of the segment. Since we are mapping the ELF // directly, we don't want to just overwrite part of the ELF with // zeroes. if phdr.Memsz != phdr.Filesz { log.Warningf("PT_LOAD segment memsz %#x != filesz %#x", phdr.Memsz, phdr.Filesz) return elfInfo{}, linuxerr.ENOEXEC } start := hostarch.Addr(memoryOffset) end, ok := start.AddLength(phdr.Memsz) if !ok { log.Warningf("PT_LOAD segment size overflows: %#x + %#x", start, end) return elfInfo{}, linuxerr.ENOEXEC } if uint64(end) > size { log.Warningf("PT_LOAD segment end %#x extends beyond end of file %#x", end, size) return elfInfo{}, linuxerr.ENOEXEC } if prev != nil { if start < prevEnd { log.Warningf("PT_LOAD segments out of order") return elfInfo{}, linuxerr.ENOEXEC } // We mprotect entire pages, so each segment must be in // its own page. 
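			//
			// For example (illustrative offsets): a previous
			// segment ending at 0x1a00 followed by one starting at
			// 0x1c00 is rejected, because both round down to page
			// 0x1000; the next segment may start no earlier than
			// 0x2000.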
prevEndPage := prevEnd.RoundDown() startPage := start.RoundDown() if prevEndPage >= startPage { log.Warningf("PT_LOAD segments share a page: %#x", prevEndPage) return elfInfo{}, linuxerr.ENOEXEC } } prev = &info.phdrs[i] prevEnd = end } return info, nil } // VDSO describes a VDSO. // // NOTE(mpratt): to support multiple architectures or operating systems, this // would need to contain a VDSO for each. // // +stateify savable type VDSO struct { // ParamPage is the VDSO parameter page. This page should be updated to // inform the VDSO for timekeeping data. ParamPage *mm.SpecialMappable // vdso is the VDSO ELF itself. vdso *mm.SpecialMappable // os is the operating system targeted by the VDSO. os abi.OS // arch is the architecture targeted by the VDSO. arch arch.Arch // phdrs are the VDSO ELF phdrs. phdrs []elf.ProgHeader `state:".([]elfProgHeader)"` } // PrepareVDSO validates the system VDSO and returns a VDSO, containing the // param page for updating by the kernel. func PrepareVDSO(mf *pgalloc.MemoryFile) (*VDSO, error) { vdsoFile := &byteFullReader{data: vdsodata.Binary} // First make sure the VDSO is valid. vdsoFile does not use ctx, so a // nil context can be passed. info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsodata.Binary))) if err != nil { return nil, err } // Then copy it into a VDSO mapping. size, ok := hostarch.Addr(len(vdsodata.Binary)).RoundUp() if !ok { return nil, fmt.Errorf("VDSO size overflows? %#x", len(vdsodata.Binary)) } vdso, err := mf.Allocate(uint64(size), pgalloc.AllocOpts{Kind: usage.System}) if err != nil { return nil, fmt.Errorf("unable to allocate VDSO memory: %v", err) } ims, err := mf.MapInternal(vdso, hostarch.ReadWrite) if err != nil { mf.DecRef(vdso) return nil, fmt.Errorf("unable to map VDSO memory: %v", err) } _, err = safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(vdsodata.Binary))) if err != nil { mf.DecRef(vdso) return nil, fmt.Errorf("unable to copy VDSO into memory: %v", err) } // Finally, allocate a param page for this VDSO. paramPage, err := mf.Allocate(hostarch.PageSize, pgalloc.AllocOpts{Kind: usage.System}) if err != nil { mf.DecRef(vdso) return nil, fmt.Errorf("unable to allocate VDSO param page: %v", err) } return &VDSO{ ParamPage: mm.NewSpecialMappable("[vvar]", mf, paramPage), // TODO(gvisor.dev/issue/157): Don't advertise the VDSO, as // some applications may not be able to handle multiple [vdso] // hints. vdso: mm.NewSpecialMappable("", mf, vdso), os: info.os, arch: info.arch, phdrs: info.phdrs, }, nil } // loadVDSO loads the VDSO into m. // // VDSOs are special. // // VDSOs are fully position independent. However, instead of loading a VDSO // like a normal ELF binary, mapping only the PT_LOAD segments, the Linux // kernel simply directly maps the entire file into process memory, with very // little real ELF parsing. // // NOTE(b/25323870): This means that userspace can, and unfortunately does, // depend on parts of the ELF that would normally not be mapped. To maintain // compatibility with such binaries, we load the VDSO much like Linux. // // loadVDSO takes a reference on the VDSO and parameter page FrameRegions. 
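//
// The resulting layout (base address chosen by the private reservation made
// below) places the parameter page immediately before the VDSO proper:
//
//	addr                     addr + ParamPage.Length()
//	+------------------------+----------------------------------------+
//	|  [vvar] param page     |  [vdso] ELF  (AT_SYSINFO_EHDR -> here)  |
//	+------------------------+----------------------------------------+
//
// The returned address is the start of the [vdso] mapping itself.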
func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) (hostarch.Addr, error) { if v.os != bin.os { ctx.Warningf("Binary ELF OS %v and VDSO ELF OS %v differ", bin.os, v.os) return 0, linuxerr.ENOEXEC } if v.arch != bin.arch { ctx.Warningf("Binary ELF arch %v and VDSO ELF arch %v differ", bin.arch, v.arch) return 0, linuxerr.ENOEXEC } // Reserve address space for the VDSO and its parameter page, which is // mapped just before the VDSO. mapSize := v.vdso.Length() + v.ParamPage.Length() addr, err := m.MMap(ctx, memmap.MMapOpts{ Length: mapSize, Private: true, }) if err != nil { ctx.Infof("Unable to reserve VDSO address space: %v", err) return 0, err } // Now map the param page. _, err = m.MMap(ctx, memmap.MMapOpts{ Length: v.ParamPage.Length(), MappingIdentity: v.ParamPage, Mappable: v.ParamPage, Addr: addr, Fixed: true, Unmap: true, Private: true, Perms: hostarch.Read, MaxPerms: hostarch.Read, }) if err != nil { ctx.Infof("Unable to map VDSO param page: %v", err) return 0, err } // Now map the VDSO itself. vdsoAddr, ok := addr.AddLength(v.ParamPage.Length()) if !ok { panic(fmt.Sprintf("Part of mapped range overflows? %#x + %#x", addr, v.ParamPage.Length())) } _, err = m.MMap(ctx, memmap.MMapOpts{ Length: v.vdso.Length(), MappingIdentity: v.vdso, Mappable: v.vdso, Addr: vdsoAddr, Fixed: true, Unmap: true, Private: true, Perms: hostarch.Read, MaxPerms: hostarch.AnyAccess, }) if err != nil { ctx.Infof("Unable to map VDSO: %v", err) return 0, err } vdsoEnd, ok := vdsoAddr.AddLength(v.vdso.Length()) if !ok { panic(fmt.Sprintf("VDSO mapping overflows? %#x + %#x", vdsoAddr, v.vdso.Length())) } // Set additional protections for the individual segments. var first *elf.ProgHeader for i, phdr := range v.phdrs { if phdr.Type != elf.PT_LOAD { continue } if first == nil { first = &v.phdrs[i] } memoryOffset := phdr.Vaddr - first.Vaddr segAddr, ok := vdsoAddr.AddLength(memoryOffset) if !ok { ctx.Warningf("PT_LOAD segment address overflows: %#x + %#x", segAddr, memoryOffset) return 0, linuxerr.ENOEXEC } segPage := segAddr.RoundDown() segSize := hostarch.Addr(phdr.Memsz) segSize, ok = segSize.AddLength(segAddr.PageOffset()) if !ok { ctx.Warningf("PT_LOAD segment memsize %#x + offset %#x overflows", phdr.Memsz, segAddr.PageOffset()) return 0, linuxerr.ENOEXEC } segSize, ok = segSize.RoundUp() if !ok { ctx.Warningf("PT_LOAD segment size overflows: %#x", phdr.Memsz+segAddr.PageOffset()) return 0, linuxerr.ENOEXEC } segEnd, ok := segPage.AddLength(uint64(segSize)) if !ok { ctx.Warningf("PT_LOAD segment range overflows: %#x + %#x", segAddr, segSize) return 0, linuxerr.ENOEXEC } if segEnd > vdsoEnd { ctx.Warningf("PT_LOAD segment ends beyond VDSO: %#x > %#x", segEnd, vdsoEnd) return 0, linuxerr.ENOEXEC } perms := progFlagsAsPerms(phdr.Flags) if perms != hostarch.Read { if err := m.MProtect(segPage, uint64(segSize), perms, false); err != nil { ctx.Warningf("Unable to set PT_LOAD segment protections %+v at [%#x, %#x): %v", perms, segAddr, segEnd, err) return 0, linuxerr.ENOEXEC } } } return vdsoAddr, nil } // Release drops references on mappings held by v. 
func (v *VDSO) Release(ctx context.Context) { v.ParamPage.DecRef(ctx) v.vdso.DecRef(ctx) } var vdsoSigreturnOffset = func() uint64 { f, err := elf.NewFile(bytes.NewReader(vdsodata.Binary)) if err != nil { panic(fmt.Sprintf("failed to parse vdso.so as ELF file: %v", err)) } syms, err := f.Symbols() if err != nil { panic(fmt.Sprintf("failed to read symbols from vdso.so: %v", err)) } const sigreturnSymbol = "__kernel_rt_sigreturn" for _, sym := range syms { if elf.ST_BIND(sym.Info) != elf.STB_LOCAL && sym.Section != elf.SHN_UNDEF && sym.Name == sigreturnSymbol { return sym.Value } } panic(fmt.Sprintf("no symbol %q in vdso.so", sigreturnSymbol)) }() golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/loader/vdso_state.go000066400000000000000000000023551465435605700247250ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package loader import ( "context" "debug/elf" ) // +stateify savable type elfProgHeader struct { Type elf.ProgType Flags elf.ProgFlag Off uint64 Vaddr uint64 Paddr uint64 Filesz uint64 Memsz uint64 Align uint64 } // savePhdrs is invoked by stateify. func (v *VDSO) savePhdrs() []elfProgHeader { s := make([]elfProgHeader, 0, len(v.phdrs)) for _, h := range v.phdrs { s = append(s, elfProgHeader(h)) } return s } // loadPhdrs is invoked by stateify. func (v *VDSO) loadPhdrs(_ context.Context, s []elfProgHeader) { v.phdrs = make([]elf.ProgHeader, 0, len(s)) for _, h := range s { v.phdrs = append(v.phdrs, elf.ProgHeader(h)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/loader/vdsodata/000077500000000000000000000000001465435605700240235ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/loader/vdsodata/vdsodata.go000066400000000000000000000012331465435605700261560ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package vdsodata contains a compiled VDSO object. package vdsodata golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/loader/vdsodata/vdsodata_amd64.go000066400000000000000000000013751465435605700271600ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package vdsodata import ( _ "embed" ) // Binary contains a compiled code of vdso.so. // //go:embed vdso_amd64.so var Binary []byte golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/loader/vdsodata/vdsodata_arm64.go000066400000000000000000000013751465435605700271760ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package vdsodata import ( _ "embed" ) // Binary contains a compiled code of vdso.so. // //go:embed vdso_arm64.so var Binary []byte golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/memmap/000077500000000000000000000000001465435605700222245ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/memmap/file_range.go000066400000000000000000000033501465435605700246470ustar00rootroot00000000000000package memmap // A Range represents a contiguous range of T. // // +stateify savable type FileRange struct { // Start is the inclusive start of the range. Start uint64 // End is the exclusive end of the range. End uint64 } // WellFormed returns true if r.Start <= r.End. All other methods on a Range // require that the Range is well-formed. // //go:nosplit func (r FileRange) WellFormed() bool { return r.Start <= r.End } // Length returns the length of the range. // //go:nosplit func (r FileRange) Length() uint64 { return r.End - r.Start } // Contains returns true if r contains x. // //go:nosplit func (r FileRange) Contains(x uint64) bool { return r.Start <= x && x < r.End } // Overlaps returns true if r and r2 overlap. // //go:nosplit func (r FileRange) Overlaps(r2 FileRange) bool { return r.Start < r2.End && r2.Start < r.End } // IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is // contained within r. // //go:nosplit func (r FileRange) IsSupersetOf(r2 FileRange) bool { return r.Start <= r2.Start && r.End >= r2.End } // Intersect returns a range consisting of the intersection between r and r2. // If r and r2 do not overlap, Intersect returns a range with unspecified // bounds, but for which Length() == 0. // //go:nosplit func (r FileRange) Intersect(r2 FileRange) FileRange { if r.Start < r2.Start { r.Start = r2.Start } if r.End > r2.End { r.End = r2.End } if r.End < r.Start { r.End = r.Start } return r } // CanSplitAt returns true if it is legal to split a segment spanning the range // r at x; that is, splitting at x would produce two ranges, both of which have // non-zero length. 
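// For example, the range [0x1000, 0x3000) can be split at 0x2000, but not at
// 0x1000 or 0x3000, since splitting at either endpoint would leave one side
// with zero length.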
// //go:nosplit func (r FileRange) CanSplitAt(x uint64) bool { return r.Contains(x) && r.Start < x } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/memmap/mappable_range.go000066400000000000000000000034301465435605700255100ustar00rootroot00000000000000package memmap // A Range represents a contiguous range of T. // // +stateify savable type MappableRange struct { // Start is the inclusive start of the range. Start uint64 // End is the exclusive end of the range. End uint64 } // WellFormed returns true if r.Start <= r.End. All other methods on a Range // require that the Range is well-formed. // //go:nosplit func (r MappableRange) WellFormed() bool { return r.Start <= r.End } // Length returns the length of the range. // //go:nosplit func (r MappableRange) Length() uint64 { return r.End - r.Start } // Contains returns true if r contains x. // //go:nosplit func (r MappableRange) Contains(x uint64) bool { return r.Start <= x && x < r.End } // Overlaps returns true if r and r2 overlap. // //go:nosplit func (r MappableRange) Overlaps(r2 MappableRange) bool { return r.Start < r2.End && r2.Start < r.End } // IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is // contained within r. // //go:nosplit func (r MappableRange) IsSupersetOf(r2 MappableRange) bool { return r.Start <= r2.Start && r.End >= r2.End } // Intersect returns a range consisting of the intersection between r and r2. // If r and r2 do not overlap, Intersect returns a range with unspecified // bounds, but for which Length() == 0. // //go:nosplit func (r MappableRange) Intersect(r2 MappableRange) MappableRange { if r.Start < r2.Start { r.Start = r2.Start } if r.End > r2.End { r.End = r2.End } if r.End < r.Start { r.End = r.Start } return r } // CanSplitAt returns true if it is legal to split a segment spanning the range // r at x; that is, splitting at x would produce two ranges, both of which have // non-zero length. // //go:nosplit func (r MappableRange) CanSplitAt(x uint64) bool { return r.Contains(x) && r.Start < x } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/memmap/mapping_set.go000066400000000000000000000171411465435605700250650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package memmap import ( "fmt" "math" "gvisor.dev/gvisor/pkg/hostarch" ) // MappingSet maps offsets into a Mappable to mappings of those offsets. It is // used to implement Mappable.AddMapping and RemoveMapping for Mappables that // may need to call MappingSpace.Invalidate. // // type MappingSet // MappingsOfRange is the value type of MappingSet, and represents the set of // all mappings of the corresponding MappableRange. // // Using a map offers O(1) lookups in RemoveMapping and // mappingSetFunctions.Merge. type MappingsOfRange map[MappingOfRange]struct{} // MappingOfRange represents a mapping of a MappableRange. 
// // +stateify savable type MappingOfRange struct { MappingSpace MappingSpace AddrRange hostarch.AddrRange Writable bool } func (r MappingOfRange) invalidate(opts InvalidateOpts) { r.MappingSpace.Invalidate(r.AddrRange, opts) } // String implements fmt.Stringer.String. func (r MappingOfRange) String() string { return fmt.Sprintf("%#v", r.AddrRange) } // mappingSetFunctions implements segment.Functions for MappingSet. type mappingSetFunctions struct{} // MinKey implements segment.Functions.MinKey. func (mappingSetFunctions) MinKey() uint64 { return 0 } // MaxKey implements segment.Functions.MaxKey. func (mappingSetFunctions) MaxKey() uint64 { return math.MaxUint64 } // ClearValue implements segment.Functions.ClearValue. func (mappingSetFunctions) ClearValue(v *MappingsOfRange) { *v = MappingsOfRange{} } // Merge implements segment.Functions.Merge. // // Since each value is a map of MappingOfRanges, values can only be merged if // all MappingOfRanges in each map have an exact pair in the other map, forming // one contiguous region. func (mappingSetFunctions) Merge(r1 MappableRange, val1 MappingsOfRange, r2 MappableRange, val2 MappingsOfRange) (MappingsOfRange, bool) { if len(val1) != len(val2) { return nil, false } merged := make(MappingsOfRange, len(val1)) // Each MappingOfRange in val1 must have a matching region in val2, forming // one contiguous region. for k1 := range val1 { // We expect val2 to contain a key that forms a contiguous // region with k1. k2 := MappingOfRange{ MappingSpace: k1.MappingSpace, AddrRange: hostarch.AddrRange{ Start: k1.AddrRange.End, End: k1.AddrRange.End + hostarch.Addr(r2.Length()), }, Writable: k1.Writable, } if _, ok := val2[k2]; !ok { return nil, false } // OK. Add it to the merged map. merged[MappingOfRange{ MappingSpace: k1.MappingSpace, AddrRange: hostarch.AddrRange{ Start: k1.AddrRange.Start, End: k2.AddrRange.End, }, Writable: k1.Writable, }] = struct{}{} } return merged, true } // Split implements segment.Functions.Split. func (mappingSetFunctions) Split(r MappableRange, val MappingsOfRange, split uint64) (MappingsOfRange, MappingsOfRange) { if split <= r.Start || split >= r.End { panic(fmt.Sprintf("split is not within range %v", r)) } m1 := make(MappingsOfRange, len(val)) m2 := make(MappingsOfRange, len(val)) // split is a value in MappableRange, we need the offset into the // corresponding MappingsOfRange. offset := hostarch.Addr(split - r.Start) for k := range val { k1 := MappingOfRange{ MappingSpace: k.MappingSpace, AddrRange: hostarch.AddrRange{ Start: k.AddrRange.Start, End: k.AddrRange.Start + offset, }, Writable: k.Writable, } m1[k1] = struct{}{} k2 := MappingOfRange{ MappingSpace: k.MappingSpace, AddrRange: hostarch.AddrRange{ Start: k.AddrRange.Start + offset, End: k.AddrRange.End, }, Writable: k.Writable, } m2[k2] = struct{}{} } return m1, m2 } // subsetMapping returns the MappingOfRange that maps subsetRange, given that // ms maps wholeRange beginning at addr. // // For instance, suppose wholeRange = [0x0, 0x2000) and addr = 0x4000, // indicating that ms maps addresses [0x4000, 0x6000) to MappableRange [0x0, // 0x2000). Then for subsetRange = [0x1000, 0x2000), subsetMapping returns a // MappingOfRange for which AddrRange = [0x5000, 0x6000). 
func subsetMapping(wholeRange, subsetRange MappableRange, ms MappingSpace, addr hostarch.Addr, writable bool) MappingOfRange { if !wholeRange.IsSupersetOf(subsetRange) { panic(fmt.Sprintf("%v is not a superset of %v", wholeRange, subsetRange)) } offset := subsetRange.Start - wholeRange.Start start := addr + hostarch.Addr(offset) return MappingOfRange{ MappingSpace: ms, AddrRange: hostarch.AddrRange{ Start: start, End: start + hostarch.Addr(subsetRange.Length()), }, Writable: writable, } } // AddMapping adds the given mapping and returns the set of MappableRanges that // previously had no mappings. // // Preconditions: Same as Mappable.AddMapping. func (s *MappingSet) AddMapping(ms MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) []MappableRange { mr := MappableRange{offset, offset + uint64(ar.Length())} var mapped []MappableRange seg, gap := s.Find(mr.Start) for { switch { case seg.Ok() && seg.Start() < mr.End: seg = s.Isolate(seg, mr) seg.Value()[subsetMapping(mr, seg.Range(), ms, ar.Start, writable)] = struct{}{} seg, gap = seg.NextNonEmpty() case gap.Ok() && gap.Start() < mr.End: gapMR := gap.Range().Intersect(mr) mapped = append(mapped, gapMR) // Insert a set and continue from the above case. seg, gap = s.Insert(gap, gapMR, make(MappingsOfRange)), MappingGapIterator{} default: return mapped } } } // RemoveMapping removes the given mapping and returns the set of // MappableRanges that now have no mappings. // // Preconditions: Same as Mappable.RemoveMapping. func (s *MappingSet) RemoveMapping(ms MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) []MappableRange { mr := MappableRange{offset, offset + uint64(ar.Length())} var unmapped []MappableRange seg := s.FindSegment(mr.Start) if !seg.Ok() { panic(fmt.Sprintf("MappingSet.RemoveMapping(%v): no segment containing %#x: %v", mr, mr.Start, s)) } for seg.Ok() && seg.Start() < mr.End { // Ensure this segment is limited to our range. seg = s.Isolate(seg, mr) // Remove this part of the mapping. mappings := seg.Value() delete(mappings, subsetMapping(mr, seg.Range(), ms, ar.Start, writable)) if len(mappings) == 0 { unmapped = append(unmapped, seg.Range()) seg = s.Remove(seg).NextSegment() } else { seg = seg.NextSegment() } } s.MergeOutsideRange(mr) return unmapped } // Invalidate calls MappingSpace.Invalidate for all mappings of offsets in mr. func (s *MappingSet) Invalidate(mr MappableRange, opts InvalidateOpts) { for seg := s.LowerBoundSegment(mr.Start); seg.Ok() && seg.Start() < mr.End; seg = seg.NextSegment() { segMR := seg.Range() for m := range seg.Value() { region := subsetMapping(segMR, segMR.Intersect(mr), m.MappingSpace, m.AddrRange.Start, m.Writable) region.invalidate(opts) } } } // InvalidateAll calls MappingSpace.Invalidate for all mappings of s. func (s *MappingSet) InvalidateAll(opts InvalidateOpts) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { for m := range seg.Value() { m.invalidate(opts) } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/memmap/mapping_set_impl.go000066400000000000000000002027261465435605700261130ustar00rootroot00000000000000package memmap import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const MappingtrackGaps = 0 var _ = uint8(MappingtrackGaps << 7) // Will fail if not zero or one. 
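// The var declaration above is a compile-time guard: for any constant
// greater than 1, the shift by 7 overflows uint8 and the conversion fails to
// compile, so only 0 and 1 are accepted. A minimal standalone sketch of the
// same trick, using a hypothetical constant:
//
//	const hypotheticalFlag = 1
//	var _ = uint8(hypotheticalFlag << 7) // compiles only while hypotheticalFlag is 0 or 1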
// dynamicGap is a type that disappears if trackGaps is 0. type MappingdynamicGap [MappingtrackGaps]uint64 // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *MappingdynamicGap) Get() uint64 { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *MappingdynamicGap) Set(v uint64) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. MappingminDegree = 3 MappingmaxDegree = 2 * MappingminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type MappingSet struct { root Mappingnode `state:".([]MappingFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *MappingSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *MappingSet) IsEmptyRange(r MappableRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *MappingSet) Span() uint64 { var sz uint64 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *MappingSet) SpanRange(r MappableRange) uint64 { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uint64 for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *MappingSet) FirstSegment() MappingIterator { if s.root.nrSegments == 0 { return MappingIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *MappingSet) LastSegment() MappingIterator { if s.root.nrSegments == 0 { return MappingIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *MappingSet) FirstGap() MappingGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return MappingGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *MappingSet) LastGap() MappingGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return MappingGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. 
Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *MappingSet) Find(key uint64) (MappingIterator, MappingGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return MappingIterator{n, i}, MappingGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return MappingIterator{}, MappingGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *MappingSet) FindSegment(key uint64) MappingIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *MappingSet) LowerBoundSegment(min uint64) MappingIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *MappingSet) UpperBoundSegment(max uint64) MappingIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *MappingSet) FindGap(key uint64) MappingGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *MappingSet) LowerBoundGap(min uint64) MappingGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *MappingSet) UpperBoundGap(max uint64) MappingGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *MappingSet) FirstLargeEnoughGap(minSize uint64) MappingGapIterator { if MappingtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *MappingSet) LastLargeEnoughGap(minSize uint64) MappingGapIterator { if MappingtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. 
func (s *MappingSet) LowerBoundLargeEnoughGap(min, minSize uint64) MappingGapIterator { if MappingtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *MappingSet) UpperBoundLargeEnoughGap(max, minSize uint64) MappingGapIterator { if MappingtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. func (s *MappingSet) Insert(gap MappingGapIterator, r MappableRange, val MappingsOfRange) MappingIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (mappingSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := MappingtrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (mappingSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (mappingSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := MappingtrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. 
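//
// As a rough usage sketch for Insert and InsertWithoutMerging (mr and the
// new value are placeholders), the typical pattern is to locate the gap
// first and then insert into it:
//
//	if _, gap := s.Find(mr.Start); gap.Ok() && gap.Range().IsSupersetOf(mr) {
//		seg := s.Insert(gap, mr, make(MappingsOfRange))
//		_ = seg
//	}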
func (s *MappingSet) InsertWithoutMerging(gap MappingGapIterator, r MappableRange, val MappingsOfRange) MappingIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *MappingSet) InsertWithoutMergingUnchecked(gap MappingGapIterator, r MappableRange, val MappingsOfRange) MappingIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := MappingtrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return MappingIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. // // InsertRange searches the set to find the gap to insert into. If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *MappingSet) InsertRange(r MappableRange, val MappingsOfRange) MappingIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *MappingSet) InsertWithoutMergingRange(r MappableRange, val MappingsOfRange) MappingIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. 
If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *MappingSet) TryInsertRange(r MappableRange, val MappingsOfRange) MappingIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return MappingIterator{} } if gap.End() < r.End { return MappingIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. func (s *MappingSet) TryInsertWithoutMergingRange(r MappableRange, val MappingsOfRange) MappingIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return MappingIterator{} } if gap.End() < r.End { return MappingIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *MappingSet) Remove(seg MappingIterator) MappingGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if MappingtrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) mappingSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if MappingtrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(MappingGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *MappingSet) RemoveAll() { s.root = Mappingnode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. 
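//
// For example, if the set contains segments spanning [0x0, 0x2000) and
// [0x3000, 0x5000), RemoveRange([0x1000, 0x4000)) splits both segments,
// removes the overlapping parts, leaves [0x0, 0x1000) and [0x4000, 0x5000),
// and returns an iterator to the resulting gap.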
func (s *MappingSet) RemoveRange(r MappableRange) MappingGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *MappingSet) RemoveFullRange(r MappableRange) MappingGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *MappingSet) Merge(first, second MappingIterator) MappingIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. Otherwise, MergeUnchecked returns a terminal // iterator. // // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *MappingSet) MergeUnchecked(first, second MappingIterator) MappingIterator { if first.End() == second.Start() { if mval, ok := (mappingSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return MappingIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *MappingSet) MergePrev(seg MappingIterator) MappingIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. 
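//
// As a rough sketch of the increasing-key mutation pattern that MergePrev
// (above) is designed for (the range r and the mutation are placeholders,
// mirroring how MutateRange later in this file is written):
//
//	for seg := s.LowerBoundSegmentSplitBefore(r.Start); seg.Ok() && seg.Start() < r.End; {
//		seg = s.SplitAfter(seg, r.End)
//		// ... mutate seg.Value() ...
//		seg = s.MergePrev(seg)
//		seg = seg.NextSegment()
//	}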
func (s *MappingSet) MergeNext(seg MappingIterator) MappingIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *MappingSet) Unisolate(seg MappingIterator) MappingIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *MappingSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. // // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *MappingSet) MergeInsideRange(r MappableRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *MappingSet) MergeOutsideRange(r MappableRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. 
func (s *MappingSet) Split(seg MappingIterator, split uint64) (MappingIterator, MappingIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *MappingSet) SplitUnchecked(seg MappingIterator, split uint64) (MappingIterator, MappingIterator) { val1, val2 := (mappingSetFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), MappableRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter; i.e. SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first. // // Preconditions: start < seg.End(). func (s *MappingSet) SplitBefore(seg MappingIterator, start uint64) MappingIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *MappingSet) SplitAfter(seg MappingIterator, end uint64) MappingIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). 
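//
// A small usage sketch (mr and the mutation are placeholders): to mutate
// only the part of seg that overlaps mr, callers typically write
//
//	seg = s.Isolate(seg, mr)
//	// seg.Range() is now a subset of mr; mutate seg.Value() safely.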
func (s *MappingSet) Isolate(seg MappingIterator, r MappableRange) MappingIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *MappingSet) LowerBoundSegmentSplitBefore(min uint64) MappingIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *MappingSet) UpperBoundSegmentSplitAfter(max uint64) MappingIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. func (s *MappingSet) VisitRange(r MappableRange, f func(seg MappingIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *MappingSet) VisitFullRange(r MappableRange, f func(seg MappingIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *MappingSet) MutateRange(r MappableRange, f func(seg MappingIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. 
func (s *MappingSet) MutateFullRange(r MappableRange, f func(seg MappingIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type Mappingnode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *Mappingnode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. maxGap MappingdynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [MappingmaxDegree - 1]MappableRange values [MappingmaxDegree - 1]MappingsOfRange children [MappingmaxDegree]*Mappingnode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *Mappingnode) firstSegment() MappingIterator { for n.hasChildren { n = n.children[0] } return MappingIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *Mappingnode) lastSegment() MappingIterator { for n.hasChildren { n = n.children[n.nrSegments] } return MappingIterator{n, n.nrSegments - 1} } func (n *Mappingnode) prevSibling() *Mappingnode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *Mappingnode) nextSibling() *Mappingnode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. 
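//
// For example, with minDegree = 3 (so maxDegree = 6), a full node holds 5
// segments; splitting it leaves 2 segments in each of the two resulting
// nodes, and the middle segment moves up into the parent (or into a new
// root, if the full node was the root).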
func (n *Mappingnode) rebalanceBeforeInsert(gap MappingGapIterator) MappingGapIterator { if n.nrSegments < MappingmaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &Mappingnode{ nrSegments: MappingminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &Mappingnode{ nrSegments: MappingminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:MappingminDegree-1], n.keys[:MappingminDegree-1]) copy(left.values[:MappingminDegree-1], n.values[:MappingminDegree-1]) copy(right.keys[:MappingminDegree-1], n.keys[MappingminDegree:]) copy(right.values[:MappingminDegree-1], n.values[MappingminDegree:]) n.keys[0], n.values[0] = n.keys[MappingminDegree-1], n.values[MappingminDegree-1] MappingzeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:MappingminDegree], n.children[:MappingminDegree]) copy(right.children[:MappingminDegree], n.children[MappingminDegree:]) MappingzeroNodeSlice(n.children[2:]) for i := 0; i < MappingminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if MappingtrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < MappingminDegree { return MappingGapIterator{left, gap.index} } return MappingGapIterator{right, gap.index - MappingminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[MappingminDegree-1], n.values[MappingminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &Mappingnode{ nrSegments: MappingminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ copy(sibling.keys[:MappingminDegree-1], n.keys[MappingminDegree:]) copy(sibling.values[:MappingminDegree-1], n.values[MappingminDegree:]) MappingzeroValueSlice(n.values[MappingminDegree-1:]) if n.hasChildren { copy(sibling.children[:MappingminDegree], n.children[MappingminDegree:]) MappingzeroNodeSlice(n.children[MappingminDegree:]) for i := 0; i < MappingminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = MappingminDegree - 1 if MappingtrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < MappingminDegree { return gap } return MappingGapIterator{sibling, gap.index - MappingminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. 
func (n *Mappingnode) rebalanceAfterRemove(gap MappingGapIterator) MappingGapIterator { for { if n.nrSegments >= MappingminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= MappingminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] mappingSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if MappingtrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return MappingGapIterator{n, 0} } if gap.node == n { return MappingGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= MappingminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) mappingSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if MappingtrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return MappingGapIterator{n, n.nrSegments} } return MappingGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return MappingGapIterator{p, gap.index} } if gap.node == right { return MappingGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. 
var left, right *Mappingnode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = MappingGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) mappingSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if MappingtrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *Mappingnode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *Mappingnode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. func (n *Mappingnode) calculateMaxGapLeaf() uint64 { max := MappingGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (MappingGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. func (n *Mappingnode) calculateMaxGapInternal() uint64 { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. 
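//
// Illustrative sketch (not part of the generated code, and assuming the set
// was instantiated with gap tracking enabled): a caller looking for the
// first free range of at least minSize might start from the first gap and
// fall back to NextLargeEnoughGap:
//
//	gap := s.FirstGap()
//	if gap.Range().Length() < minSize {
//		gap = gap.NextLargeEnoughGap(minSize)
//	}
//	if gap.Ok() {
//		// gap.Range() spans at least minSize.
//	}
//
// where s and minSize are hypothetical.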
func (n *Mappingnode) searchFirstLargeEnoughGap(minSize uint64) MappingGapIterator { if n.maxGap.Get() < minSize { return MappingGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := MappingGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *Mappingnode) searchLastLargeEnoughGap(minSize uint64) MappingGapIterator { if n.maxGap.Get() < minSize { return MappingGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := MappingGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type MappingIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *Mappingnode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg MappingIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg MappingIterator) Range() MappableRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg MappingIterator) Start() uint64 { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg MappingIterator) End() uint64 { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. // - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg MappingIterator) SetRangeUnchecked(r MappableRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. 
func (seg MappingIterator) SetRange(r MappableRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg MappingIterator) SetStartUnchecked(start uint64) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg MappingIterator) SetStart(start uint64) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg MappingIterator) SetEndUnchecked(end uint64) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg MappingIterator) SetEnd(end uint64) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg MappingIterator) Value() MappingsOfRange { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg MappingIterator) ValuePtr() *MappingsOfRange { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. func (seg MappingIterator) SetValue(val MappingsOfRange) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. 
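//
// Illustrative sketch (not part of the generated code): walking backwards
// from a segment located with FindSegment down to the start of the set:
//
//	for seg := s.FindSegment(offset); seg.Ok(); seg = seg.PrevSegment() {
//		_ = seg.Range()
//	}
//
// where s and offset are hypothetical; if no segment contains offset, the
// returned iterator is terminal and the loop body never runs.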
func (seg MappingIterator) PrevSegment() MappingIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return MappingIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return MappingIterator{} } return MappingsegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg MappingIterator) NextSegment() MappingIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return MappingIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return MappingIterator{} } return MappingsegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg MappingIterator) PrevGap() MappingGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return MappingGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg MappingIterator) NextGap() MappingGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return MappingGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg MappingIterator) PrevNonEmpty() (MappingIterator, MappingGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, MappingGapIterator{} } return MappingIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg MappingIterator) NextNonEmpty() (MappingIterator, MappingGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, MappingGapIterator{} } return MappingIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type MappingGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). 
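//
// As an illustrative sketch (not part of the generated code), a leaf node
// with nrSegments == 2 exposes three gap positions:
//
//	gap 0   seg 0   gap 1   seg 1   gap 2
//
// i.e. gap i lies immediately before keys[i], and gap nrSegments lies after
// the last key.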
node *Mappingnode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (gap MappingGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap MappingGapIterator) Range() MappableRange { return MappableRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap MappingGapIterator) Start() uint64 { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return mappingSetFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap MappingGapIterator) End() uint64 { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return mappingSetFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap MappingGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap MappingGapIterator) PrevSegment() MappingIterator { return MappingsegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap MappingGapIterator) NextSegment() MappingIterator { return MappingsegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap MappingGapIterator) PrevGap() MappingGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return MappingGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap MappingGapIterator) NextGap() MappingGapIterator { seg := gap.NextSegment() if !seg.Ok() { return MappingGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap MappingGapIterator) NextLargeEnoughGap(minSize uint64) MappingGapIterator { if MappingtrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. 
func (gap MappingGapIterator) nextLargeEnoughGapHelper(minSize uint64) MappingGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return MappingGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap MappingGapIterator) PrevLargeEnoughGap(minSize uint64) MappingGapIterator { if MappingtrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap MappingGapIterator) prevLargeEnoughGapHelper(minSize uint64) MappingGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return MappingGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func MappingsegmentBeforePosition(n *Mappingnode, i int) MappingIterator { for i == 0 { if n.parent == nil { return MappingIterator{} } n, i = n.parent, n.parentIndex } return MappingIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func MappingsegmentAfterPosition(n *Mappingnode, i int) MappingIterator { for i == n.nrSegments { if n.parent == nil { return MappingIterator{} } n, i = n.parent, n.parentIndex } return MappingIterator{n, i} } func MappingzeroValueSlice(slice []MappingsOfRange) { for i := range slice { mappingSetFunctions{}.ClearValue(&slice[i]) } } func MappingzeroNodeSlice(slice []*Mappingnode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. 
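//
// Illustrative note (not part of the generated code): the output is intended
// purely for debugging, e.g.
//
//	log.Printf("mapping set:\n%s", s.String())
//
// with one line per segment, plus maxGap for internal nodes when gap
// tracking is compiled in.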
func (s *MappingSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. func (n *Mappingnode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *Mappingnode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if MappingtrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type MappingFlatSegment struct { Start uint64 End uint64 Value MappingsOfRange } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *MappingSet) ExportSlice() []MappingFlatSegment { var fs []MappingFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, MappingFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *MappingSet) ImportSlice(fs []MappingFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := MappableRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
func (s *MappingSet) segmentTestCheck(expectedSegments int, segFunc func(int, MappableRange, MappingsOfRange) error) error { havePrev := false prev := uint64(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *MappingSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *MappingSet) saveRoot() []MappingFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *MappingSet) loadRoot(_ context.Context, fs []MappingFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/memmap/memmap.go000066400000000000000000000471341465435605700240400ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package memmap defines semantics for memory mappings. package memmap import ( "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" ) // Mappable represents a memory-mappable object, a mutable mapping from uint64 // offsets to (File, uint64 File offset) pairs. // // See mm/mm.go for Mappable's place in the lock order. // // All Mappable methods have the following preconditions: // - hostarch.AddrRanges and MappableRanges must be non-empty (Length() != 0). // - hostarch.Addrs and Mappable offsets must be page-aligned. type Mappable interface { // AddMapping notifies the Mappable of a mapping from addresses ar in ms to // offsets [offset, offset+ar.Length()) in this Mappable. // // The writable flag indicates whether the backing data for a Mappable can // be modified through the mapping. Effectively, this means a shared mapping // where Translate may be called with at.Write == true. This is a property // established at mapping creation and must remain constant throughout the // lifetime of the mapping. // // Preconditions: offset+ar.Length() does not overflow. AddMapping(ctx context.Context, ms MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error // RemoveMapping notifies the Mappable of the removal of a mapping from // addresses ar in ms to offsets [offset, offset+ar.Length()) in this // Mappable. // // Preconditions: // * offset+ar.Length() does not overflow. // * The removed mapping must exist. writable must match the // corresponding call to AddMapping. 
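	//
	// As an illustrative sketch (not part of this interface, and using
	// hypothetical variables), a removal mirrors the earlier AddMapping
	// call for the same range:
	//
	//	if err := m.AddMapping(ctx, ms, ar, offset, writable); err == nil {
	//		// ... when the mapping is later torn down:
	//		m.RemoveMapping(ctx, ms, ar, offset, writable)
	//	}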
RemoveMapping(ctx context.Context, ms MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) // CopyMapping notifies the Mappable of an attempt to copy a mapping in ms // from srcAR to dstAR. For most Mappables, this is equivalent to // AddMapping. Note that it is possible that srcAR.Length() != dstAR.Length(), // and also that srcAR.Length() == 0. // // CopyMapping is only called when a mapping is copied within a given // MappingSpace; it is analogous to Linux's vm_operations_struct::mremap. // // Preconditions: // * offset+srcAR.Length() and offset+dstAR.Length() do not overflow. // * The mapping at srcAR must exist. writable must match the // corresponding call to AddMapping. CopyMapping(ctx context.Context, ms MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error // Translate returns the Mappable's current mappings for at least the range // of offsets specified by required, and at most the range of offsets // specified by optional. at is the set of access types that may be // performed using the returned Translations. If not all required offsets // are translated, it returns a non-nil error explaining why. // // Translations are valid until invalidated by a callback to // MappingSpace.Invalidate or until the caller removes its mapping of the // translated range. Mappable implementations must ensure that at least one // reference is held on all pages in a File that may be the result // of a valid Translation. // // Preconditions: // * required.Length() > 0. // * optional.IsSupersetOf(required). // * required and optional must be page-aligned. // * The caller must have established a mapping for all of the queried // offsets via a previous call to AddMapping. // * The caller is responsible for ensuring that calls to Translate // synchronize with invalidation. // // Postconditions: See CheckTranslateResult. Translate(ctx context.Context, required, optional MappableRange, at hostarch.AccessType) ([]Translation, error) // InvalidateUnsavable requests that the Mappable invalidate Translations // that cannot be preserved across save/restore. // // Invariant: InvalidateUnsavable never races with concurrent calls to any // other Mappable methods. InvalidateUnsavable(ctx context.Context) error } // Translations are returned by Mappable.Translate. type Translation struct { // Source is the translated range in the Mappable. Source MappableRange // File is the mapped file. File File // Offset is the offset into File at which this Translation begins. Offset uint64 // Perms is the set of permissions for which platform.AddressSpace.MapFile // and platform.AddressSpace.MapInternal on this Translation is permitted. Perms hostarch.AccessType } // FileRange returns the FileRange represented by t. func (t Translation) FileRange() FileRange { return FileRange{t.Offset, t.Offset + t.Source.Length()} } // CheckTranslateResult returns an error if (ts, terr) does not satisfy all // postconditions for Mappable.Translate(required, optional, at). // // Preconditions: Same as Mappable.Translate. func CheckTranslateResult(required, optional MappableRange, at hostarch.AccessType, ts []Translation, terr error) error { // Verify that the inputs to Mappable.Translate were valid. 
if !required.WellFormed() || required.Length() == 0 { panic(fmt.Sprintf("invalid required range: %v", required)) } if !hostarch.Addr(required.Start).IsPageAligned() || !hostarch.Addr(required.End).IsPageAligned() { panic(fmt.Sprintf("unaligned required range: %v", required)) } if !optional.IsSupersetOf(required) { panic(fmt.Sprintf("optional range %v is not a superset of required range %v", optional, required)) } if !hostarch.Addr(optional.Start).IsPageAligned() || !hostarch.Addr(optional.End).IsPageAligned() { panic(fmt.Sprintf("unaligned optional range: %v", optional)) } // The first Translation must include required.Start. if len(ts) != 0 && !ts[0].Source.Contains(required.Start) { return fmt.Errorf("first Translation %+v does not cover start of required range %v", ts[0], required) } for i, t := range ts { if !t.Source.WellFormed() || t.Source.Length() == 0 { return fmt.Errorf("Translation %+v has invalid Source", t) } if !hostarch.Addr(t.Source.Start).IsPageAligned() || !hostarch.Addr(t.Source.End).IsPageAligned() { return fmt.Errorf("Translation %+v has unaligned Source", t) } if t.File == nil { return fmt.Errorf("Translation %+v has nil File", t) } if !hostarch.Addr(t.Offset).IsPageAligned() { return fmt.Errorf("Translation %+v has unaligned Offset", t) } // Translations must be contiguous and in increasing order of // Translation.Source. if i > 0 && ts[i-1].Source.End != t.Source.Start { return fmt.Errorf("Translation %+v and Translation %+v are not contiguous", ts[i-1], t) } // At least part of each Translation must be required. if t.Source.Intersect(required).Length() == 0 { return fmt.Errorf("Translation %+v lies entirely outside required range %v", t, required) } // Translations must be constrained to the optional range. if !optional.IsSupersetOf(t.Source) { return fmt.Errorf("Translation %+v lies outside optional range %v", t, optional) } // Each Translation must permit a superset of requested accesses. if !t.Perms.SupersetOf(at) { return fmt.Errorf("Translation %+v does not permit all requested access types %v", t, at) } } // If the set of Translations does not cover the entire required range, // Translate must return a non-nil error explaining why. if terr == nil { if len(ts) == 0 { return fmt.Errorf("no Translations and no error") } if t := ts[len(ts)-1]; !t.Source.Contains(required.End - 1) { return fmt.Errorf("last Translation %+v does not reach end of required range %v, but Translate returned no error", t, required) } } return nil } // BusError may be returned by implementations of Mappable.Translate for errors // that should result in SIGBUS delivery if they cause application page fault // handling to fail. type BusError struct { // Err is the original error. Err error } // Error implements error.Error. func (b *BusError) Error() string { return fmt.Sprintf("BusError: %v", b.Err.Error()) } // MappableRange represents a range of uint64 offsets into a Mappable. // // type MappableRange // String implements fmt.Stringer.String. func (mr MappableRange) String() string { return fmt.Sprintf("[%#x, %#x)", mr.Start, mr.End) } // MappingSpace represents a mutable mapping from hostarch.Addrs to (Mappable, // uint64 offset) pairs. type MappingSpace interface { // Invalidate is called to notify the MappingSpace that values returned by // previous calls to Mappable.Translate for offsets mapped by addresses in // ar are no longer valid. // // Invalidate must not take any locks preceding mm.MemoryManager.activeMu // in the lock order. // // Preconditions: // * ar.Length() != 0. 
// * ar must be page-aligned. Invalidate(ar hostarch.AddrRange, opts InvalidateOpts) } // InvalidateOpts holds options to MappingSpace.Invalidate. type InvalidateOpts struct { // InvalidatePrivate is true if private pages in the invalidated region // should also be discarded, causing their data to be lost. InvalidatePrivate bool } // MappingIdentity controls the lifetime of a Mappable, and provides // information about the Mappable for /proc/[pid]/maps. It is distinct from // Mappable because all Mappables that are coherent must compare equal to // support the implementation of shared futexes, but different // MappingIdentities may represent the same Mappable, in the same way that // multiple fs.Files may represent the same fs.Inode. (This similarity is not // coincidental; fs.File implements MappingIdentity, and some // fs.InodeOperations implement Mappable.) type MappingIdentity interface { // IncRef increments the MappingIdentity's reference count. IncRef() // DecRef decrements the MappingIdentity's reference count. DecRef(ctx context.Context) // MappedName returns the application-visible name shown in // /proc/[pid]/maps. MappedName(ctx context.Context) string // DeviceID returns the device number shown in /proc/[pid]/maps. DeviceID() uint64 // InodeID returns the inode number shown in /proc/[pid]/maps. InodeID() uint64 // Msync has the same semantics as fs.FileOperations.Fsync(ctx, // int64(mr.Start), int64(mr.End-1), fs.SyncData). // (fs.FileOperations.Fsync() takes an inclusive end, but mr.End is // exclusive, hence mr.End-1.) It is defined rather than Fsync so that // implementors don't need to depend on the fs package for fs.SyncType. Msync(ctx context.Context, mr MappableRange) error } // MLockMode specifies the memory locking behavior of a memory mapping. type MLockMode int // Note that the ordering of MLockModes is significant; see // mm.MemoryManager.defMLockMode. const ( // MLockNone specifies that a mapping has no memory locking behavior. // // This must be the zero value for MLockMode. MLockNone MLockMode = iota // MLockEager specifies that a mapping is memory-locked, as by mlock() or // similar. Pages in the mapping should be made, and kept, resident in // physical memory as soon as possible. // // As of this writing, MLockEager does not cause memory-locking to be // requested from the host; it only affects the sentry's memory management // behavior. // // MLockEager is analogous to Linux's VM_LOCKED. MLockEager // MLockLazy specifies that a mapping is memory-locked, as by mlock() or // similar. Pages in the mapping should be kept resident in physical memory // once they have been made resident due to e.g. a page fault. // // As of this writing, MLockLazy does not cause memory-locking to be // requested from the host; in fact, it has virtually no effect, except for // interactions between mlocked pages and other syscalls. // // MLockLazy is analogous to Linux's VM_LOCKED | VM_LOCKONFAULT. MLockLazy ) // MMapOpts specifies a request to create a memory mapping. type MMapOpts struct { // Length is the length of the mapping. Length uint64 // MappingIdentity controls the lifetime of Mappable, and provides // properties of the mapping shown in /proc/[pid]/maps. If MMapOpts is used // to successfully create a memory mapping, a reference is taken on // MappingIdentity. MappingIdentity MappingIdentity // Mappable is the Mappable to be mapped. If Mappable is nil, the mapping // is anonymous. 
If Mappable is not nil, it must remain valid as long as a // reference is held on MappingIdentity. Mappable Mappable // Offset is the offset into Mappable to map. If Mappable is nil, Offset is // ignored. Offset uint64 // Addr is the suggested address for the mapping. Addr hostarch.Addr // Fixed specifies whether this is a fixed mapping (it must be located at // Addr). Fixed bool // Unmap specifies whether existing mappings in the range being mapped may // be replaced. If Unmap is true, Fixed must be true. Unmap bool // If Map32Bit is true, all addresses in the created mapping must fit in a // 32-bit integer. (Note that the "end address" of the mapping, i.e. the // address of the first byte *after* the mapping, need not fit in a 32-bit // integer.) Map32Bit is ignored if Fixed is true. Map32Bit bool // Perms is the set of permissions to the applied to this mapping. Perms hostarch.AccessType // MaxPerms limits the set of permissions that may ever apply to this // mapping. If Mappable is not nil, all memmap.Translations returned by // Mappable.Translate must support all accesses in MaxPerms. // // Preconditions: MaxAccessType should be an effective AccessType, as // access cannot be limited beyond effective AccessTypes. MaxPerms hostarch.AccessType // Private is true if writes to the mapping should be propagated to a copy // that is exclusive to the MemoryManager. Private bool // GrowsDown is true if the mapping should be automatically expanded // downward on guard page faults. GrowsDown bool // Stack is equivalent to MAP_STACK, which has no mandatory semantics in // Linux. Stack bool PlatformEffect MMapPlatformEffect // MLockMode specifies the memory locking behavior of the mapping. MLockMode MLockMode // Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is // empty, MappingIdentity.MappedName() will be used instead. // // TODO(jamieliu): Replace entirely with MappingIdentity? Hint string // Force means to skip validation checks of Addr and Length. It can be // used to create special mappings below mm.layout.MinAddr and // mm.layout.MaxAddr. It has to be used with caution. // // If Force is true, Unmap and Fixed must be true. Force bool // SentryOwnedContent indicates the sentry exclusively controls the // underlying memory backing the mapping thus the memory content is // guaranteed not to be modified outside the sentry's purview. SentryOwnedContent bool } // MMapPlatformEffect is the type of MMapOpts.PlatformEffect. type MMapPlatformEffect uint8 // Possible values for MMapOpts.PlatformEffect: const ( // PlatformEffectDefault indicates that no specific behavior is requested // from the platform. PlatformEffectDefault MMapPlatformEffect = iota // PlatformEffectPopulate indicates that platform mappings should be // established for all pages in the mapping. PlatformEffectPopulate // PlatformEffectCommit is like PlatformEffectPopulate, but also requests // that the platform eagerly commit resources to the mapping, as in // platform.AddressSpace.MapFile(precommit=true). PlatformEffectCommit ) // File represents a host file that may be mapped into an platform.AddressSpace. type File interface { // All pages in a File are reference-counted. // IncRef increments the reference count on all pages in fr and // associates each page with a memCgID (memory cgroup id) to which it // belongs. memCgID will not be changed if the page already exists. // // Preconditions: // * fr.Start and fr.End must be page-aligned. // * fr.Length() > 0. 
// * At least one reference must be held on all pages in fr. (The File // interface does not provide a way to acquire an initial reference; // implementors may define mechanisms for doing so.) IncRef(fr FileRange, memCgID uint32) // DecRef decrements the reference count on all pages in fr. // // Preconditions: // * fr.Start and fr.End must be page-aligned. // * fr.Length() > 0. // * At least one reference must be held on all pages in fr. DecRef(fr FileRange) // MapInternal returns a mapping of the given file offsets in the invoking // process' address space for reading and writing. // // Note that fr.Start and fr.End need not be page-aligned. // // Preconditions: // * fr.Length() > 0. // * At least one reference must be held on all pages in fr. // // Postconditions: The returned mapping is valid as long as at least one // reference is held on the mapped pages. MapInternal(fr FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) // BufferReadAt reads len(dst) bytes from the file into dst, starting at // file offset off. It returns the number of bytes read. Like // io.ReaderAt.ReadAt(), it never returns a short read with a nil error. // // Implementations of File for which MapInternal() never returns // BufferedIOFallbackErr can embed NoBufferedIOFallback to obtain an // appropriate implementation of BufferReadAt. // // Preconditions: // * MapInternal() returned a BufferedIOFallbackErr. // * At least one reference must be held on all read pages. BufferReadAt(off uint64, dst []byte) (uint64, error) // BufferWriteAt writes len(src) bytes src to the file, starting at file // offset off. It returns the number of bytes written. Like // io.WriterAt.WriteAt(), it never returns a short write with a nil error. // // Implementations of File for which MapInternal() never returns // BufferedIOFallbackErr can embed NoBufferedIOFallback to obtain an // appropriate implementation of BufferWriteAt. // // Preconditions: // * MapInternal() returned a BufferedIOFallbackErr. // * At least one reference must be held on all written pages. BufferWriteAt(off uint64, src []byte) (uint64, error) // FD returns the file descriptor represented by the File. // // The only permitted operation on the returned file descriptor is to map // pages from it consistent with the requirements of AddressSpace.MapFile. FD() int } // BufferedIOFallbackErr is returned (by value) by implementations of // File.MapInternal() that cannot succeed, but can still support memory-mapped // I/O by falling back to buffered reads and writes. type BufferedIOFallbackErr struct{} // Error implements error.Error. func (BufferedIOFallbackErr) Error() string { return "memmap.File.MapInternal() is unsupported, fall back to buffered R/W for internally-mapped I/O" } // NoBufferedIOFallback implements File.BufferReadAt() and BufferWriteAt() for // implementations of File for which MapInternal() never returns // BufferedIOFallbackErr. type NoBufferedIOFallback struct{} // BufferReadAt implements File.BufferReadAt. func (NoBufferedIOFallback) BufferReadAt(off uint64, dst []byte) (uint64, error) { panic("unimplemented: memmap.File.MapInternal() should not have returned BufferedIOFallbackErr") } // BufferWriteAt implements File.BufferWriteAt. func (NoBufferedIOFallback) BufferWriteAt(off uint64, src []byte) (uint64, error) { panic("unimplemented: memmap.File.MapInternal() should not have returned BufferedIOFallbackErr") } // FileRange represents a range of uint64 offsets into a File. // // type FileRange // String implements fmt.Stringer.String. 
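//
// Illustrative note (not part of the generated code): a Translation's
// FileRange is derived from its Offset and the length of its Source, so for
// a Translation t,
//
//	fr := t.FileRange() // FileRange{t.Offset, t.Offset + t.Source.Length()}
//
// and fr.String() renders as a half-open hex interval such as
// "[0x1000, 0x3000)".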
func (fr FileRange) String() string { return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/memmap/memmap_impl_state_autogen.go000066400000000000000000000055111465435605700277740ustar00rootroot00000000000000// automatically generated by stateify. package memmap import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (s *MappingSet) StateTypeName() string { return "pkg/sentry/memmap.MappingSet" } func (s *MappingSet) StateFields() []string { return []string{ "root", } } func (s *MappingSet) beforeSave() {} // +checklocksignore func (s *MappingSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []MappingFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *MappingSet) afterLoad(context.Context) {} // +checklocksignore func (s *MappingSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]MappingFlatSegment), func(y any) { s.loadRoot(ctx, y.([]MappingFlatSegment)) }) } func (n *Mappingnode) StateTypeName() string { return "pkg/sentry/memmap.Mappingnode" } func (n *Mappingnode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *Mappingnode) beforeSave() {} // +checklocksignore func (n *Mappingnode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *Mappingnode) afterLoad(context.Context) {} // +checklocksignore func (n *Mappingnode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (m *MappingFlatSegment) StateTypeName() string { return "pkg/sentry/memmap.MappingFlatSegment" } func (m *MappingFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (m *MappingFlatSegment) beforeSave() {} // +checklocksignore func (m *MappingFlatSegment) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.Start) stateSinkObject.Save(1, &m.End) stateSinkObject.Save(2, &m.Value) } func (m *MappingFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (m *MappingFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.Start) stateSourceObject.Load(1, &m.End) stateSourceObject.Load(2, &m.Value) } func init() { state.Register((*MappingSet)(nil)) state.Register((*Mappingnode)(nil)) state.Register((*MappingFlatSegment)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/memmap/memmap_state_autogen.go000066400000000000000000000043661465435605700267620ustar00rootroot00000000000000// automatically generated by stateify. 
package memmap import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (fr *FileRange) StateTypeName() string { return "pkg/sentry/memmap.FileRange" } func (fr *FileRange) StateFields() []string { return []string{ "Start", "End", } } func (fr *FileRange) beforeSave() {} // +checklocksignore func (fr *FileRange) StateSave(stateSinkObject state.Sink) { fr.beforeSave() stateSinkObject.Save(0, &fr.Start) stateSinkObject.Save(1, &fr.End) } func (fr *FileRange) afterLoad(context.Context) {} // +checklocksignore func (fr *FileRange) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fr.Start) stateSourceObject.Load(1, &fr.End) } func (mr *MappableRange) StateTypeName() string { return "pkg/sentry/memmap.MappableRange" } func (mr *MappableRange) StateFields() []string { return []string{ "Start", "End", } } func (mr *MappableRange) beforeSave() {} // +checklocksignore func (mr *MappableRange) StateSave(stateSinkObject state.Sink) { mr.beforeSave() stateSinkObject.Save(0, &mr.Start) stateSinkObject.Save(1, &mr.End) } func (mr *MappableRange) afterLoad(context.Context) {} // +checklocksignore func (mr *MappableRange) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mr.Start) stateSourceObject.Load(1, &mr.End) } func (r *MappingOfRange) StateTypeName() string { return "pkg/sentry/memmap.MappingOfRange" } func (r *MappingOfRange) StateFields() []string { return []string{ "MappingSpace", "AddrRange", "Writable", } } func (r *MappingOfRange) beforeSave() {} // +checklocksignore func (r *MappingOfRange) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.MappingSpace) stateSinkObject.Save(1, &r.AddrRange) stateSinkObject.Save(2, &r.Writable) } func (r *MappingOfRange) afterLoad(context.Context) {} // +checklocksignore func (r *MappingOfRange) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.MappingSpace) stateSourceObject.Load(1, &r.AddrRange) stateSourceObject.Load(2, &r.Writable) } func init() { state.Register((*FileRange)(nil)) state.Register((*MappableRange)(nil)) state.Register((*MappingOfRange)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/000077500000000000000000000000001465435605700213615ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/active_mutex.go000066400000000000000000000046231465435605700244120ustar00rootroot00000000000000package mm import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type activeRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var activelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type activelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. const ( activeLockForked = activelockNameIndex(0) ) const () // Lock locks m. // +checklocksignore func (m *activeRWMutex) Lock() { locking.AddGLock(activeprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *activeRWMutex) NestedLock(i activelockNameIndex) { locking.AddGLock(activeprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. 
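//
// Illustrative note (not part of the generated code): Lock/Unlock (and
// RLock/RUnlock) must be strictly paired so that the validator's
// AddGLock/DelGLock bookkeeping stays balanced, e.g.
//
//	mm.activeMu.Lock()
//	// ... critical section ...
//	mm.activeMu.Unlock()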
// +checklocksignore func (m *activeRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(activeprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *activeRWMutex) NestedUnlock(i activelockNameIndex) { m.mu.Unlock() locking.DelGLock(activeprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *activeRWMutex) RLock() { locking.AddGLock(activeprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *activeRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(activeprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *activeRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *activeRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *activeRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var activeprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func activeinitLockNames() { activelockNames = []string{"forked"} } func init() { activeinitLockNames() activeprefixIndex = locking.NewMutexClass(reflect.TypeOf(activeRWMutex{}), activelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/address_space.go000066400000000000000000000160701465435605700245140ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" ) // AddressSpace returns the platform.AddressSpace bound to mm. // // Preconditions: The caller must have called mm.Activate(). func (mm *MemoryManager) AddressSpace() platform.AddressSpace { if mm.active.Load() == 0 { panic("trying to use inactive address space?") } return mm.as } // Activate ensures this MemoryManager has a platform.AddressSpace. // // The caller must not hold any locks when calling Activate. // // When this MemoryManager is no longer needed by a task, it should call // Deactivate to release the reference. func (mm *MemoryManager) Activate(ctx context.Context) error { // Fast path: the MemoryManager already has an active // platform.AddressSpace, and we just need to indicate that we need it too. for { active := mm.active.Load() if active == 0 { // Fall back to the slow path. break } if mm.active.CompareAndSwap(active, active+1) { return nil } } for { // Slow path: may need to synchronize with other goroutines changing // mm.active to or from zero. mm.activeMu.Lock() // Inline Unlock instead of using a defer for performance since this // method is commonly in the hot-path. // Check if we raced with another goroutine performing activation. 
if mm.active.Load() > 0 { // This can't race; Deactivate can't decrease mm.active from 1 to 0 // without holding activeMu. mm.active.Add(1) mm.activeMu.Unlock() return nil } // Do we have a context? If so, then we never unmapped it. This can // only be the case if !mm.p.CooperativelySchedulesAddressSpace(). if mm.as != nil { mm.active.Store(1) mm.activeMu.Unlock() return nil } // Get a new address space. We must force unmapping by passing nil to // NewAddressSpace if requested. (As in the nil interface object, not a // typed nil.) mappingsID := (any)(mm) if mm.unmapAllOnActivate { mappingsID = nil } as, c, err := mm.p.NewAddressSpace(mappingsID) if err != nil { mm.activeMu.Unlock() return err } if as == nil { // AddressSpace is unavailable, we must wait. // // activeMu must not be held while waiting, as the user of the address // space we are waiting on may attempt to take activeMu. mm.activeMu.Unlock() sleep := mm.p.CooperativelySchedulesAddressSpace() && mm.sleepForActivation if sleep { // Mark this task sleeping while waiting for the address space to // prevent the watchdog from reporting it as a stuck task. ctx.UninterruptibleSleepStart(false) } <-c if sleep { ctx.UninterruptibleSleepFinish(false) } continue } // Okay, we could restore all mappings at this point. // But forget that. Let's just let them fault in. mm.as = as // Unmapping is done, if necessary. mm.unmapAllOnActivate = false // Now that m.as has been assigned, we can set m.active to a non-zero value // to enable the fast path. mm.active.Store(1) mm.activeMu.Unlock() return nil } } // Deactivate releases a reference to the MemoryManager. func (mm *MemoryManager) Deactivate() { // Fast path: this is not the last goroutine to deactivate the // MemoryManager. for { active := mm.active.Load() if active == 1 { // Fall back to the slow path. break } if mm.active.CompareAndSwap(active, active-1) { return } } mm.activeMu.Lock() // Same as Activate. // Still active? if mm.active.Add(-1) > 0 { mm.activeMu.Unlock() return } // Can we hold on to the address space? if !mm.p.CooperativelySchedulesAddressSpace() { mm.activeMu.Unlock() return } // Release the address space. mm.as.Release() // Lost it. mm.as = nil mm.activeMu.Unlock() } // mapASLocked maps addresses in ar into mm.as. // // Preconditions: // - mm.activeMu must be locked. // - mm.as != nil. // - ar.Length() != 0. // - ar must be page-aligned. // - pseg == mm.pmas.LowerBoundSegment(ar.Start). func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar hostarch.AddrRange, platformEffect memmap.MMapPlatformEffect) error { // By default, map entire pmas at a time, under the assumption that there // is no cost to mapping more of a pma than necessary. mapAR := hostarch.AddrRange{0, ^hostarch.Addr(hostarch.PageSize - 1)} if platformEffect != memmap.PlatformEffectDefault { // When explicitly committing, only map ar, since overmapping may incur // unexpected resource usage. When explicitly populating, do the same // since an underlying device file may be sensitive to the mapped // range. mapAR = ar } else if mapUnit := mm.p.MapUnit(); mapUnit != 0 { // Limit the range we map to ar, aligned to mapUnit. mapMask := hostarch.Addr(mapUnit - 1) mapAR.Start = ar.Start &^ mapMask // If rounding ar.End up overflows, just keep the existing mapAR.End. 
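// For example, with a 64 KiB MapUnit and ar = [0x11000, 0x12000), mapMask is // 0xffff, so mapAR.Start is rounded down to 0x10000 and, subject to the // overflow check below, mapAR.End is rounded up to 0x20000.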
if end := (ar.End + mapMask) &^ mapMask; end >= ar.End { mapAR.End = end } } if checkInvariants { if !mapAR.IsSupersetOf(ar) { panic(fmt.Sprintf("mapAR %#v is not a superset of ar %#v", mapAR, ar)) } } // Since this checks ar.End and not mapAR.End, we will never map a pma that // is not required. for pseg.Ok() && pseg.Start() < ar.End { pma := pseg.ValuePtr() pmaAR := pseg.Range() pmaMapAR := pmaAR.Intersect(mapAR) perms := pma.effectivePerms if pma.needCOW { perms.Write = false } if perms.Any() { // MapFile precondition if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, platformEffect == memmap.PlatformEffectCommit); err != nil { return err } } pseg = pseg.NextSegment() } return nil } // unmapASLocked removes all AddressSpace mappings for addresses in ar. // // Preconditions: mm.activeMu must be locked. func (mm *MemoryManager) unmapASLocked(ar hostarch.AddrRange) { if ar.Length() == 0 { return } if mm.as == nil { // No AddressSpace? Force all mappings to be unmapped on the next // Activate. mm.unmapAllOnActivate = true return } // unmapASLocked doesn't require vmas or pmas to exist for ar, so it can be // passed ranges that include addresses that can't be mapped by the // application. ar = ar.Intersect(mm.applicationAddrRange()) // Note that this AddressSpace may or may not be active. If the // platform does not require cooperative sharing of AddressSpaces, they // are retained between Deactivate/Activate calls. Despite not being // active, it is still valid to perform operations on these address // spaces. mm.as.Unmap(ar.Start, uint64(ar.Length())) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/aio_context.go000066400000000000000000000303531465435605700242300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/usermem" ) // aioManager creates and manages asynchronous I/O contexts. // // +stateify savable type aioManager struct { // mu protects below. mu aioManagerMutex `state:"nosave"` // aioContexts is the set of asynchronous I/O contexts. contexts map[uint64]*AIOContext } func (mm *MemoryManager) destroyAIOManager(ctx context.Context) { mm.aioManager.mu.Lock() defer mm.aioManager.mu.Unlock() for id := range mm.aioManager.contexts { mm.destroyAIOContextLocked(ctx, id) } } // newAIOContext creates a new context for asynchronous I/O. // // Returns false if 'id' is currently in use. func (a *aioManager) newAIOContext(events uint32, id uint64) bool { a.mu.Lock() defer a.mu.Unlock() if _, ok := a.contexts[id]; ok { return false } a.contexts[id] = &AIOContext{ requestReady: make(chan struct{}, 1), maxOutstanding: events, } return true } // destroyAIOContext destroys an asynchronous I/O context. 
It doesn't wait // for pending requests to complete. Returns the destroyed AIOContext so it can // be drained. // // Nil is returned if the context does not exist. // // Precondition: mm.aioManager.mu is locked. func (mm *MemoryManager) destroyAIOContextLocked(ctx context.Context, id uint64) *AIOContext { aioCtx, ok := mm.aioManager.contexts[id] if !ok { return nil } delete(mm.aioManager.contexts, id) aioCtx.destroy() return aioCtx } // lookupAIOContext looks up the given context. // // Returns false if the context does not exist. func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) { a.mu.Lock() defer a.mu.Unlock() ctx, ok := a.contexts[id] return ctx, ok } // ioResult is a completed I/O operation. // // +stateify savable type ioResult struct { data any ioEntry } // AIOContext is a single asynchronous I/O context. // // +stateify savable type AIOContext struct { // requestReady is the notification channel used for all requests. requestReady chan struct{} `state:"nosave"` // mu protects below. mu aioContextMutex `state:"nosave"` // results is the set of completed requests. results ioList // maxOutstanding is the maximum number of outstanding entries; this value // is immutable. maxOutstanding uint32 // outstanding is the number of requests outstanding; this will effectively // be the number of entries in the result list or that are expected to be // added to the result list. outstanding uint32 // dead is set when the context is destroyed. dead bool `state:"zerovalue"` } // destroy marks the context dead. func (aio *AIOContext) destroy() { aio.mu.Lock() defer aio.mu.Unlock() aio.dead = true aio.checkForDone() } // Preconditions: ctx.mu must be held by caller. func (aio *AIOContext) checkForDone() { if aio.dead && aio.outstanding == 0 { close(aio.requestReady) aio.requestReady = nil } } // Prepare reserves space for a new request, returning nil if available. // Returns EAGAIN if the context is busy and EINVAL if the context is dead. func (aio *AIOContext) Prepare() error { aio.mu.Lock() defer aio.mu.Unlock() if aio.dead { // Context died after the caller looked it up. return linuxerr.EINVAL } if aio.outstanding >= aio.maxOutstanding { // Context is busy. return linuxerr.EAGAIN } aio.outstanding++ return nil } // PopRequest pops a completed request if available; this function does not do // any blocking. Returns false if no request is available. func (aio *AIOContext) PopRequest() (any, bool) { aio.mu.Lock() defer aio.mu.Unlock() // Is there anything ready? if e := aio.results.Front(); e != nil { if aio.outstanding == 0 { panic("AIOContext outstanding is going negative") } aio.outstanding-- aio.results.Remove(e) aio.checkForDone() return e.data, true } return nil, false } // FinishRequest finishes a pending request. It queues up the data // and notifies listeners. func (aio *AIOContext) FinishRequest(data any) { aio.mu.Lock() defer aio.mu.Unlock() // Push to the list and notify opportunistically. The channel notify // here is guaranteed to be safe because outstanding must be non-zero. // The requestReady channel is only closed when outstanding reaches zero. aio.results.PushBack(&ioResult{data: data}) select { case aio.requestReady <- struct{}{}: default: } } // WaitChannel returns a channel that is notified when an AIO request is // completed. Returns nil if the context is destroyed and there are no more // outstanding requests.
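// // A typical consumer loop looks roughly like the following (illustrative // sketch; submitRequest and handle are placeholders for code that starts the // I/O, eventually calls FinishRequest, and consumes completed results): // // if err := aio.Prepare(); err == nil { // go submitRequest(aio) // } // for ch := aio.WaitChannel(); ch != nil; ch = aio.WaitChannel() { // if data, ok := aio.PopRequest(); ok { // handle(data) // continue // } // <-ch // }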
func (aio *AIOContext) WaitChannel() chan struct{} { aio.mu.Lock() defer aio.mu.Unlock() return aio.requestReady } // Dead returns true if the context has been destroyed. func (aio *AIOContext) Dead() bool { aio.mu.Lock() defer aio.mu.Unlock() return aio.dead } // CancelPendingRequest forgets about a request that hasn't yet completed. func (aio *AIOContext) CancelPendingRequest() { aio.mu.Lock() defer aio.mu.Unlock() if aio.outstanding == 0 { panic("AIOContext outstanding is going negative") } aio.outstanding-- aio.checkForDone() } // Drain drops all completed requests. Pending requests remain untouched. func (aio *AIOContext) Drain() { aio.mu.Lock() defer aio.mu.Unlock() if aio.outstanding == 0 { return } size := uint32(aio.results.Len()) if aio.outstanding < size { panic("AIOContext outstanding is going negative") } aio.outstanding -= size aio.results.Reset() aio.checkForDone() } // aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO // ring buffers. // // +stateify savable type aioMappable struct { aioMappableRefs mf *pgalloc.MemoryFile `state:"nosave"` fr memmap.FileRange } var aioRingBufferSize = uint64(hostarch.Addr(linux.AIORingSize).MustRoundUp()) func newAIOMappable(ctx context.Context, mf *pgalloc.MemoryFile) (*aioMappable, error) { fr, err := mf.Allocate(aioRingBufferSize, pgalloc.AllocOpts{Kind: usage.Anonymous, MemCgID: pgalloc.MemoryCgroupIDFromContext(ctx)}) if err != nil { return nil, err } m := aioMappable{mf: mf, fr: fr} m.InitRefs() return &m, nil } // DecRef implements refs.RefCounter.DecRef. func (m *aioMappable) DecRef(ctx context.Context) { m.aioMappableRefs.DecRef(func() { m.mf.DecRef(m.fr) }) } // MappedName implements memmap.MappingIdentity.MappedName. func (m *aioMappable) MappedName(ctx context.Context) string { return "[aio]" } // DeviceID implements memmap.MappingIdentity.DeviceID. func (m *aioMappable) DeviceID() uint64 { return 0 } // InodeID implements memmap.MappingIdentity.InodeID. func (m *aioMappable) InodeID() uint64 { return 0 } // Msync implements memmap.MappingIdentity.Msync. func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { // Linux: aio_ring_fops.fsync == NULL return linuxerr.EINVAL } // AddMapping implements memmap.Mappable.AddMapping. func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, _ bool) error { // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() // sets VM_DONTEXPAND). if offset != 0 || uint64(ar.Length()) != aioRingBufferSize { return linuxerr.EFAULT } return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (m *aioMappable) RemoveMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, uint64, bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, _ bool) error { // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() // sets VM_DONTEXPAND). if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize { return linuxerr.EFAULT } // Require that the mapping correspond to a live AIOContext. Compare // Linux's fs/aio.c:aio_ring_mremap(). 
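// In effect, an application mremap() of the ring buffer re-keys the context: // the entry stored under the old ring address (srcAR.Start) is moved to the // new address (dstAR.Start) below.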
mm, ok := ms.(*MemoryManager) if !ok { return linuxerr.EINVAL } am := &mm.aioManager am.mu.Lock() defer am.mu.Unlock() oldID := uint64(srcAR.Start) aioCtx, ok := am.contexts[oldID] if !ok { return linuxerr.EINVAL } aioCtx.mu.Lock() defer aioCtx.mu.Unlock() if aioCtx.dead { return linuxerr.EINVAL } // Use the new ID for the AIOContext. am.contexts[uint64(dstAR.Start)] = aioCtx delete(am.contexts, oldID) return nil } // Translate implements memmap.Mappable.Translate. func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > m.fr.Length() { err = &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ { Source: source, File: m.mf, Offset: m.fr.Start + source.Start, Perms: hostarch.AnyAccess, }, }, err } return nil, err } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error { return nil } // NewAIOContext creates a new context for asynchronous I/O. // // NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc(). func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) { // libaio get_ioevents() expects context "handle" to be a valid address. // libaio peeks inside looking for a magic number. This function allocates // a page per context and keeps it set to zeroes to ensure it will not // match AIO_RING_MAGIC and make libaio happy. m, err := newAIOMappable(ctx, mm.mf) if err != nil { return 0, err } defer m.DecRef(ctx) addr, err := mm.MMap(ctx, memmap.MMapOpts{ Length: aioRingBufferSize, MappingIdentity: m, Mappable: m, // Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in // fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC, // user mode should not write to this page. Perms: hostarch.Read, MaxPerms: hostarch.Read, }) if err != nil { return 0, err } id := uint64(addr) if !mm.aioManager.newAIOContext(events, id) { mm.MUnmap(ctx, addr, aioRingBufferSize) return 0, linuxerr.EINVAL } return id, nil } // DestroyAIOContext destroys an asynchronous I/O context. It returns the // destroyed context, or nil if the context does not exist. func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext { if !mm.isValidAddr(ctx, id) { return nil } // Only unmaps after it has ensured that the address is a valid AIO context, // to prevent random memory from being unmapped. // // Note: It's possible to unmap this address and map something else into // the same address. Then it would be unmapping memory that it doesn't own. // This is, however, the way Linux implements AIO. We keep the same [weird] // semantics in case anyone relies on it. mm.MUnmap(ctx, hostarch.Addr(id), aioRingBufferSize) mm.aioManager.mu.Lock() defer mm.aioManager.mu.Unlock() return mm.destroyAIOContextLocked(ctx, id) } // LookupAIOContext looks up the given context. It returns false if the context // does not exist. func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) { aioCtx, ok := mm.aioManager.lookupAIOContext(id) if !ok { return nil, false } // Protect against 'id' that is inaccessible. if !mm.isValidAddr(ctx, id) { return nil, false } return aioCtx, true } // isValidAddr determines if the address `id` is valid. (Linux also reads 4 // bytes from id).
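// // Because NewAIOContext uses the address of the mapped (zeroed) ring-buffer // page as the context id, this probe succeeds for live contexts and fails once // DestroyAIOContext has unmapped that page (unless, as noted there, the // application has since mapped something else at the same address).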
func (mm *MemoryManager) isValidAddr(ctx context.Context, id uint64) bool { var buf [4]byte _, err := mm.CopyIn(ctx, hostarch.Addr(id), buf[:], usermem.IOOpts{}) return err == nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/aio_context_mutex.go000066400000000000000000000032501465435605700254460ustar00rootroot00000000000000package mm import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type aioContextMutex struct { mu sync.Mutex } var aioContextprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var aioContextlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type aioContextlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *aioContextMutex) Lock() { locking.AddGLock(aioContextprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *aioContextMutex) NestedLock(i aioContextlockNameIndex) { locking.AddGLock(aioContextprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *aioContextMutex) Unlock() { locking.DelGLock(aioContextprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *aioContextMutex) NestedUnlock(i aioContextlockNameIndex) { locking.DelGLock(aioContextprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func aioContextinitLockNames() {} func init() { aioContextinitLockNames() aioContextprefixIndex = locking.NewMutexClass(reflect.TypeOf(aioContextMutex{}), aioContextlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/aio_context_state.go000066400000000000000000000016611465435605700254300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "context" "gvisor.dev/gvisor/pkg/sentry/pgalloc" ) // afterLoad is invoked by stateify. func (aio *AIOContext) afterLoad(context.Context) { aio.requestReady = make(chan struct{}, 1) } // afterLoad is invoked by stateify. func (m *aioMappable) afterLoad(ctx context.Context) { m.mf = pgalloc.MemoryFileFromContext(ctx) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/aio_manager_mutex.go000066400000000000000000000032501465435605700253740ustar00rootroot00000000000000package mm import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type aioManagerMutex struct { mu sync.Mutex } var aioManagerprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. 
// Populated in init. var aioManagerlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type aioManagerlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *aioManagerMutex) Lock() { locking.AddGLock(aioManagerprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *aioManagerMutex) NestedLock(i aioManagerlockNameIndex) { locking.AddGLock(aioManagerprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *aioManagerMutex) Unlock() { locking.DelGLock(aioManagerprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *aioManagerMutex) NestedUnlock(i aioManagerlockNameIndex) { locking.DelGLock(aioManagerprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func aioManagerinitLockNames() {} func init() { aioManagerinitLockNames() aioManagerprefixIndex = locking.NewMutexClass(reflect.TypeOf(aioManagerMutex{}), aioManagerlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/aio_mappable_refs.go000066400000000000000000000102371465435605700253430ustar00rootroot00000000000000package mm import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const aioMappableenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var aioMappableobj *aioMappable // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type aioMappableRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *aioMappableRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *aioMappableRefs) RefType() string { return fmt.Sprintf("%T", aioMappableobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. 
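// (Worked example of the refCount packing used in this file: one speculative // reference plus two real references is stored as (1 << 32) + 2, and // TryIncRef's final Add(-speculativeRef + 1) converts its speculative // reference into a real one in a single atomic step.)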
func (r *aioMappableRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *aioMappableRefs) LogRefs() bool { return aioMappableenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *aioMappableRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *aioMappableRefs) IncRef() { v := r.refCount.Add(1) if aioMappableenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *aioMappableRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if aioMappableenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *aioMappableRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if aioMappableenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *aioMappableRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/debug.go000066400000000000000000000050271465435605700230020ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "bytes" "fmt" "gvisor.dev/gvisor/pkg/context" ) const ( // If checkInvariants is true, perform runtime checks for invariants // expected by the mm package. This is normally disabled since MM is a // significant hot path in general, and some such checks (notably // memmap.CheckTranslateResult) are very expensive. checkInvariants = false // If logIOErrors is true, log I/O errors that originate from MM before // converting them to EFAULT. logIOErrors = false ) // String implements fmt.Stringer.String. 
func (mm *MemoryManager) String() string { return mm.DebugString(context.Background()) } // DebugString returns a string containing information about mm for debugging. func (mm *MemoryManager) DebugString(ctx context.Context) string { var b bytes.Buffer // FIXME(b/235153601): Need to replace RLockBypass with RLockBypass // after fixing b/235153601. mm.mappingMu.RLockBypass() defer mm.mappingMu.RUnlockBypass() b.WriteString("VMAs:\n") for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { b.Write(mm.vmaMapsEntryLocked(ctx, vseg)) } mm.activeMu.RLock() defer mm.activeMu.RUnlock() b.WriteString("PMAs:\n") for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { b.Write(pseg.debugStringEntryLocked()) } return string(b.Bytes()) } // Preconditions: mm.activeMu must be locked. func (pseg pmaIterator) debugStringEntryLocked() []byte { var b bytes.Buffer fmt.Fprintf(&b, "%08x-%08x ", pseg.Start(), pseg.End()) pma := pseg.ValuePtr() if pma.effectivePerms.Read { b.WriteByte('r') } else { b.WriteByte('-') } if pma.effectivePerms.Write { if pma.needCOW { b.WriteByte('c') } else { b.WriteByte('w') } } else { b.WriteByte('-') } if pma.effectivePerms.Execute { b.WriteByte('x') } else { b.WriteByte('-') } if pma.private { b.WriteByte('p') } else { b.WriteByte('s') } fmt.Fprintf(&b, " %08x %T\n", pma.off, pma.file) return b.Bytes() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/io.go000066400000000000000000000740721465435605700223310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // There are two supported ways to copy data to/from application virtual // memory: // // 1. Internally-mapped copying: Determine the memmap.File that backs the // copied-to/from virtual address, obtain a mapping of its pages, and read or // write to the mapping. // // 2. AddressSpace copying: If platform.Platform.SupportsAddressSpaceIO() is // true, AddressSpace permissions are applicable, and an AddressSpace is // available, copy directly through the AddressSpace, handling faults as // needed. // // (Given that internally-mapped copying requires that backing memory is always // implemented using a host file descriptor, we could also preadv/pwritev to it // instead. But this would incur a host syscall for each use of the mapped // page, whereas mmap is a one-time cost.) // // The fixed overhead of internally-mapped copying is expected to be higher // than that of AddressSpace copying since the former always needs to translate // addresses, whereas the latter only needs to do so when faults occur. 
// However, the throughput of internally-mapped copying is expected to be // somewhat higher than that of AddressSpace copying due to the high cost of // page faults and because implementations of the latter usually rely on // safecopy, which doesn't use AVX registers. So we prefer to use AddressSpace // copying (when available) for smaller copies, and switch to internally-mapped // copying once a size threshold is exceeded. const ( // copyMapMinBytes is the size threshold for switching to internally-mapped // copying in CopyOut, CopyIn, and ZeroOut. copyMapMinBytes = 32 << 10 // 32 KB // rwMapMinBytes is the size threshold for switching to internally-mapped // copying in CopyOutFrom and CopyInTo. It's lower than copyMapMinBytes // since AddressSpace copying in this case requires additional buffering; // see CopyOutFrom for details. rwMapMinBytes = 512 ) // CheckIORange is similar to hostarch.Addr.ToRange, but applies bounds checks // consistent with Linux's arch/x86/include/asm/uaccess.h:access_ok(). // // Preconditions: length >= 0. func (mm *MemoryManager) CheckIORange(addr hostarch.Addr, length int64) (hostarch.AddrRange, bool) { // Note that access_ok() constrains end even if length == 0. ar, ok := addr.ToRange(uint64(length)) return ar, (ok && ar.End <= mm.layout.MaxAddr) } // checkIOVec applies bound checks consistent with Linux's // arch/x86/include/asm/uaccess.h:access_ok() to ars. func (mm *MemoryManager) checkIOVec(ars hostarch.AddrRangeSeq) bool { for !ars.IsEmpty() { ar := ars.Head() if _, ok := mm.CheckIORange(ar.Start, int64(ar.Length())); !ok { return false } ars = ars.Tail() } return true } func (mm *MemoryManager) asioEnabled(opts usermem.IOOpts) bool { return mm.haveASIO && !opts.IgnorePermissions && opts.AddressSpaceActive } // translateIOError converts errors to EFAULT, as is usually reported for all // I/O errors originating from MM in Linux. func translateIOError(ctx context.Context, err error) error { if err == nil { return nil } if logIOErrors { ctx.Debugf("MM I/O error: %v", err) } return linuxerr.EFAULT } // CopyOut implements usermem.IO.CopyOut. func (mm *MemoryManager) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts usermem.IOOpts) (int, error) { ar, ok := mm.CheckIORange(addr, int64(len(src))) if !ok { return 0, linuxerr.EFAULT } if len(src) == 0 { return 0, nil } // Do AddressSpace IO if applicable. if mm.asioEnabled(opts) && len(src) < copyMapMinBytes { return mm.asCopyOut(ctx, addr, src) } // Go through internal mappings. // NOTE(gvisor.dev/issue/10331): Using mm.withInternalMappings() here means // that if we encounter any memmap.BufferedIOFallbackErrs, this copy will // traverse an unnecessary layer of buffering. This can be fixed by // inlining mm.withInternalMappings() and passing src subslices directly to // memmap.File.BufferWriteAt(). 
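// For illustration: when AddressSpace I/O is usable (mm.haveASIO, // opts.AddressSpaceActive, and permissions are not ignored), a 4 KiB CopyOut // takes the asCopyOut path above, while a 64 KiB CopyOut falls through to the // internal-mapping path below, since copyMapMinBytes is 32 KiB.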
n64, err := mm.withInternalMappings(ctx, ar, hostarch.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { n, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src))) return n, translateIOError(ctx, err) }) return int(n64), err } func (mm *MemoryManager) asCopyOut(ctx context.Context, addr hostarch.Addr, src []byte) (int, error) { var done int for { n, err := mm.as.CopyOut(addr+hostarch.Addr(done), src[done:]) done += n if err == nil { return done, nil } if f, ok := err.(platform.SegmentationFault); ok { ar, _ := addr.ToRange(uint64(len(src))) if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Write); err != nil { return done, err } continue } return done, translateIOError(ctx, err) } } // CopyIn implements usermem.IO.CopyIn. func (mm *MemoryManager) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts usermem.IOOpts) (int, error) { ar, ok := mm.CheckIORange(addr, int64(len(dst))) if !ok { return 0, linuxerr.EFAULT } if len(dst) == 0 { return 0, nil } // Do AddressSpace IO if applicable. if mm.asioEnabled(opts) && len(dst) < copyMapMinBytes { return mm.asCopyIn(ctx, addr, dst) } // Go through internal mappings. // NOTE(gvisor.dev/issue/10331): Using mm.withInternalMappings() here means // that if we encounter any memmap.BufferedIOFallbackErrs, this copy will // traverse an unnecessary layer of buffering. This can be fixed by // inlining mm.withInternalMappings() and passing dst subslices directly to // memmap.File.BufferReadAt(). n64, err := mm.withInternalMappings(ctx, ar, hostarch.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { n, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), ims) return n, translateIOError(ctx, err) }) return int(n64), err } func (mm *MemoryManager) asCopyIn(ctx context.Context, addr hostarch.Addr, dst []byte) (int, error) { var done int for { n, err := mm.as.CopyIn(addr+hostarch.Addr(done), dst[done:]) done += n if err == nil { return done, nil } if f, ok := err.(platform.SegmentationFault); ok { ar, _ := addr.ToRange(uint64(len(dst))) if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Read); err != nil { return done, err } continue } return done, translateIOError(ctx, err) } } // ZeroOut implements usermem.IO.ZeroOut. func (mm *MemoryManager) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { ar, ok := mm.CheckIORange(addr, toZero) if !ok { return 0, linuxerr.EFAULT } if toZero == 0 { return 0, nil } // Do AddressSpace IO if applicable. if mm.asioEnabled(opts) && toZero < copyMapMinBytes { return mm.asZeroOut(ctx, addr, toZero) } // Go through internal mappings. return mm.withInternalMappings(ctx, ar, hostarch.Write, opts.IgnorePermissions, func(dsts safemem.BlockSeq) (uint64, error) { n, err := safemem.ZeroSeq(dsts) return n, translateIOError(ctx, err) }) } func (mm *MemoryManager) asZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64) (int64, error) { var done int64 for { n, err := mm.as.ZeroOut(addr+hostarch.Addr(done), uintptr(toZero-done)) done += int64(n) if err == nil { return done, nil } if f, ok := err.(platform.SegmentationFault); ok { ar, _ := addr.ToRange(uint64(toZero)) if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Write); err != nil { return done, err } continue } return done, translateIOError(ctx, err) } } // CopyOutFrom implements usermem.IO.CopyOutFrom. 
func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { if !mm.checkIOVec(ars) { return 0, linuxerr.EFAULT } if ars.NumBytes() == 0 { return 0, nil } // Do AddressSpace IO if applicable. if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes { // We have to introduce a buffered copy, instead of just passing a // safemem.BlockSeq representing addresses in the AddressSpace to src. // This is because usermem.IO.CopyOutFrom() guarantees that it calls // src.ReadToBlocks() at most once, which is incompatible with handling // faults between calls. In the future, this is probably best resolved // by introducing a CopyOutFrom variant or option that allows it to // call src.ReadToBlocks() any number of times. // // This issue applies to CopyInTo as well. buf := make([]byte, int(ars.NumBytes())) bufN, bufErr := src.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))) var done int64 for done < int64(bufN) { ar := ars.Head() cplen := int64(ar.Length()) if cplen > int64(bufN)-done { cplen = int64(bufN) - done } n, err := mm.asCopyOut(ctx, ar.Start, buf[int(done):int(done+cplen)]) done += int64(n) if err != nil { return done, err } ars = ars.Tail() } // Do not convert errors returned by src to EFAULT. return done, bufErr } // Go through internal mappings. return mm.withVecInternalMappings(ctx, ars, hostarch.Write, opts.IgnorePermissions, src.ReadToBlocks) } // CopyInTo implements usermem.IO.CopyInTo. func (mm *MemoryManager) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { if !mm.checkIOVec(ars) { return 0, linuxerr.EFAULT } if ars.NumBytes() == 0 { return 0, nil } // Do AddressSpace IO if applicable. if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes { buf := make([]byte, int(ars.NumBytes())) var done int var bufErr error for !ars.IsEmpty() { ar := ars.Head() var n int n, bufErr = mm.asCopyIn(ctx, ar.Start, buf[done:done+int(ar.Length())]) done += n if bufErr != nil { break } ars = ars.Tail() } n, err := dst.WriteFromBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:done]))) if err != nil { return int64(n), err } // Do not convert errors returned by dst to EFAULT. return int64(n), bufErr } // Go through internal mappings. return mm.withVecInternalMappings(ctx, ars, hostarch.Read, opts.IgnorePermissions, dst.WriteFromBlocks) } // EnsurePMAsExist attempts to ensure that PMAs exist for the given addr with the // requested length. It returns the length to which it was able to either // initialize PMAs for, or ascertain that PMAs exist for. If this length is // smaller than the requested length it returns an error explaining why. func (mm *MemoryManager) EnsurePMAsExist(ctx context.Context, addr hostarch.Addr, length int64, opts usermem.IOOpts) (int64, error) { ar, ok := mm.CheckIORange(addr, length) if !ok { return 0, linuxerr.EFAULT } n64, err := mm.withInternalMappings(ctx, ar, hostarch.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { return uint64(ims.NumBytes()), nil }) return int64(n64), err } // SwapUint32 implements usermem.IO.SwapUint32. func (mm *MemoryManager) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { ar, ok := mm.CheckIORange(addr, 4) if !ok { return 0, linuxerr.EFAULT } // Do AddressSpace IO if applicable. 
if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { for { old, err := mm.as.SwapUint32(addr, new) if err == nil { return old, nil } if f, ok := err.(platform.SegmentationFault); ok { if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.ReadWrite); err != nil { return 0, err } continue } return 0, translateIOError(ctx, err) } } // Go through internal mappings. var old uint32 _, err := mm.withInternalMappings(ctx, ar, hostarch.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { // Atomicity is unachievable across mappings. return 0, linuxerr.EFAULT } im := ims.Head() var err error old, err = safemem.SwapUint32(im, new) if err != nil { return 0, translateIOError(ctx, err) } // Return the number of bytes read. return 4, nil }) return old, err } // CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { ar, ok := mm.CheckIORange(addr, 4) if !ok { return 0, linuxerr.EFAULT } // Do AddressSpace IO if applicable. if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { for { prev, err := mm.as.CompareAndSwapUint32(addr, old, new) if err == nil { return prev, nil } if f, ok := err.(platform.SegmentationFault); ok { if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.ReadWrite); err != nil { return 0, err } continue } return 0, translateIOError(ctx, err) } } // Go through internal mappings. var prev uint32 _, err := mm.withInternalMappings(ctx, ar, hostarch.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { // Atomicity is unachievable across mappings. return 0, linuxerr.EFAULT } im := ims.Head() var err error prev, err = safemem.CompareAndSwapUint32(im, old, new) if err != nil { return 0, translateIOError(ctx, err) } // Return the number of bytes read. return 4, nil }) return prev, err } // LoadUint32 implements usermem.IO.LoadUint32. func (mm *MemoryManager) LoadUint32(ctx context.Context, addr hostarch.Addr, opts usermem.IOOpts) (uint32, error) { ar, ok := mm.CheckIORange(addr, 4) if !ok { return 0, linuxerr.EFAULT } // Do AddressSpace IO if applicable. if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { for { val, err := mm.as.LoadUint32(addr) if err == nil { return val, nil } if f, ok := err.(platform.SegmentationFault); ok { if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Read); err != nil { return 0, err } continue } return 0, translateIOError(ctx, err) } } // Go through internal mappings. var val uint32 _, err := mm.withInternalMappings(ctx, ar, hostarch.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { // Atomicity is unachievable across mappings. return 0, linuxerr.EFAULT } im := ims.Head() var err error val, err = safemem.LoadUint32(im) if err != nil { return 0, translateIOError(ctx, err) } // Return the number of bytes read. return 4, nil }) return val, err } // handleASIOFault handles a page fault at address addr for an AddressSpaceIO // operation spanning ioar. // // Preconditions: // - mm.as != nil. // - ioar.Length() != 0. // - ioar.Contains(addr). func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr hostarch.Addr, ioar hostarch.AddrRange, at hostarch.AccessType) error { // Try to map all remaining pages in the I/O operation. 
This RoundUp can't // overflow because otherwise it would have been caught by CheckIORange. end, _ := ioar.End.RoundUp() ar := hostarch.AddrRange{addr.RoundDown(), end} // Don't bother trying existingPMAsLocked; in most cases, if we did have // existing pmas, we wouldn't have faulted. // Ensure that we have usable vmas. Here and below, only return early if we // can't map the first (faulting) page; failure to map later pages are // silently ignored. This maximizes partial success. mm.mappingMu.RLock() vseg, vend, err := mm.getVMAsLocked(ctx, ar, at, false) if vendaddr := vend.Start(); vendaddr < ar.End { if vendaddr <= ar.Start { mm.mappingMu.RUnlock() return translateIOError(ctx, err) } ar.End = vendaddr } // Ensure that we have usable pmas. mm.activeMu.Lock() pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, at, true /* callerIndirectCommit */) mm.mappingMu.RUnlock() if pendaddr := pend.Start(); pendaddr < ar.End { if pendaddr <= ar.Start { mm.activeMu.Unlock() return translateIOError(ctx, err) } ar.End = pendaddr } // Downgrade to a read-lock on activeMu since we don't need to mutate pmas // anymore. mm.activeMu.DowngradeLock() err = mm.mapASLocked(pseg, ar, memmap.PlatformEffectDefault) mm.activeMu.RUnlock() return translateIOError(ctx, err) } // withInternalMappings ensures that pmas exist for all addresses in ar, // support access of type (at, ignorePermissions), and have internal mappings // cached. It then calls f with mm.activeMu locked for reading, passing // internal mappings for the subrange of ar for which this property holds. // // withInternalMappings takes a function returning uint64 since many safemem // functions have this property, but returns an int64 since this is usually // more useful for usermem.IO methods. // // Preconditions: 0 < ar.Length() <= math.MaxInt64. func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { // If pmas are already available, we can do IO without touching mm.vmas or // mm.mappingMu. mm.activeMu.RLock() if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, true /* needInternalMappings */); pseg.Ok() { n, err := f(mm.internalMappingsLocked(pseg, ar)) mm.activeMu.RUnlock() // Do not convert errors returned by f to EFAULT. return int64(n), err } mm.activeMu.RUnlock() // Ensure that we have usable vmas. mm.mappingMu.RLock() vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions) if vendaddr := vend.Start(); vendaddr < ar.End { if vendaddr <= ar.Start { mm.mappingMu.RUnlock() return 0, translateIOError(ctx, verr) } ar.End = vendaddr } // Ensure that we have usable pmas. mm.activeMu.Lock() pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at, true /* callerIndirectCommit */) mm.mappingMu.RUnlock() if pendaddr := pend.Start(); pendaddr < ar.End { if pendaddr <= ar.Start { mm.activeMu.Unlock() return 0, translateIOError(ctx, perr) } ar.End = pendaddr } imbs, t, imerr := mm.getIOMappingsLocked(pseg, ar, at) mm.activeMu.DowngradeLock() if imlen := imbs.NumBytes(); imlen < uint64(ar.Length()) { if imlen == 0 { t.flush(0, nil) mm.activeMu.RUnlock() return 0, translateIOError(ctx, imerr) } ar.End = ar.Start + hostarch.Addr(imlen) } // Do I/O. un, err := t.flush(f(imbs)) mm.activeMu.RUnlock() n := int64(un) // Return the first error in order of progress through ar. if err != nil { // Do not convert errors returned by f to EFAULT. 
return n, err } if imerr != nil { return n, translateIOError(ctx, imerr) } if perr != nil { return n, translateIOError(ctx, perr) } return n, translateIOError(ctx, verr) } // withVecInternalMappings ensures that pmas exist for all addresses in ars, // support access of type (at, ignorePermissions), and have internal mappings // cached. It then calls f with mm.activeMu locked for reading, passing // internal mappings for the subset of ars for which this property holds. // // Preconditions: !ars.IsEmpty(). func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { // withInternalMappings is faster than withVecInternalMappings because of // iterator plumbing (this isn't generally practical in the vector case due // to iterator invalidation between AddrRanges). Use it if possible. if ars.NumRanges() == 1 { return mm.withInternalMappings(ctx, ars.Head(), at, ignorePermissions, f) } // If pmas are already available, we can do IO without touching mm.vmas or // mm.mappingMu. mm.activeMu.RLock() if mm.existingVecPMAsLocked(ars, at, ignorePermissions, true /* needInternalMappings */) { n, err := f(mm.vecInternalMappingsLocked(ars)) mm.activeMu.RUnlock() // Do not convert errors returned by f to EFAULT. return int64(n), err } mm.activeMu.RUnlock() // Ensure that we have usable vmas. mm.mappingMu.RLock() vars, verr := mm.getVecVMAsLocked(ctx, ars, at, ignorePermissions) if vars.NumBytes() == 0 { mm.mappingMu.RUnlock() return 0, translateIOError(ctx, verr) } // Ensure that we have usable pmas. mm.activeMu.Lock() pars, perr := mm.getVecPMAsLocked(ctx, vars, at, true /* callerIndirectCommit */) mm.mappingMu.RUnlock() if pars.NumBytes() == 0 { mm.activeMu.Unlock() return 0, translateIOError(ctx, perr) } imbs, t, imerr := mm.getVecIOMappingsLocked(pars, at) mm.activeMu.DowngradeLock() if imbs.NumBytes() == 0 { t.flush(0, nil) mm.activeMu.RUnlock() return 0, translateIOError(ctx, imerr) } // Do I/O. un, err := t.flush(f(imbs)) mm.activeMu.RUnlock() n := int64(un) // Return the first error in order of progress through ars. if err != nil { // Do not convert errors from f to EFAULT. return n, err } if imerr != nil { return n, translateIOError(ctx, imerr) } if perr != nil { return n, translateIOError(ctx, perr) } return n, translateIOError(ctx, verr) } // getIOMappingsLocked returns internal mappings appropriate for I/O for // addresses in ar. If mappings are only available for a strict subset of ar, // the returned error is non-nil. // // ioBufTracker.flush() must be called on the returned ioBufTracker when the // returned mappings are no longer in use, and its return value indicates the // number of bytes actually completed after buffer flushing. Returned mappings // are valid until either mm.activeMu is unlocked or ioBufTracker.flush() is // called. // // Preconditions: // - mm.activeMu must be locked for writing. // - pseg.Range().Contains(ar.Start). // - pmas must exist for all addresses in ar. // - ar.Length() != 0. // // Postconditions: getIOMappingsLocked does not invalidate iterators into mm.pmas. 
func (mm *MemoryManager) getIOMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (safemem.BlockSeq, *ioBufTracker, error) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !pseg.Range().Contains(ar.Start) { panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) } } if ar.End <= pseg.End() { // Since only one pma is involved, we can use pma.internalMappings // directly, avoiding a slice allocation. if err := pseg.getInternalMappingsLocked(); err != nil { if _, ok := err.(memmap.BufferedIOFallbackErr); ok { goto slowPath } return safemem.BlockSeq{}, nil, err } offset := uint64(ar.Start - pseg.Start()) return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length())), nil, nil } slowPath: ims, t, _, err := mm.getIOMappingsTrackedLocked(pseg, ar, at, nil, nil, 0) return safemem.BlockSeqFromSlice(ims), t, err } // getVecIOMappingsLocked returns internal mappings appropriate for I/O for // addresses in ars. If mappings are only available for a strict subset of ar, // the returned error is non-nil. // // ioBufTracker.flush() must be called on the returned ioBufTracker when the // returned mappings are no longer in use, and its return value indicates the // number of bytes actually completed after buffer flushing. Returned mappings // are valid until either mm.activeMu is unlocked or ioBufTracker.flush() is // called. // // Preconditions: // - mm.activeMu must be locked for writing. // - pmas must exist for all addresses in ar. // // Postconditions: getVecIOMappingsLocked does not invalidate iterators into // mm.pmas func (mm *MemoryManager) getVecIOMappingsLocked(ars hostarch.AddrRangeSeq, at hostarch.AccessType) (safemem.BlockSeq, *ioBufTracker, error) { if ars.NumRanges() == 1 { ar := ars.Head() return mm.getIOMappingsLocked(mm.pmas.FindSegment(ar.Start), ar, at) } var ims []safemem.Block var t *ioBufTracker unbufBytes := uint64(0) for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { ar := arsit.Head() if ar.Length() == 0 { continue } var err error ims, t, unbufBytes, err = mm.getIOMappingsTrackedLocked(mm.pmas.FindSegment(ar.Start), ar, at, ims, t, unbufBytes) if err != nil { return safemem.BlockSeqFromSlice(ims), t, err } } return safemem.BlockSeqFromSlice(ims), t, nil } // getIOMappingsTrackedLocked collects internal mappings appropriate for I/O // for addresses in ar, appends them to ims, and returns an updated slice. If // mappings are only available for a strict subset of ar, the returned error is // non-nil. // // If any iterated memmap.Files require buffering for I/O, they are recorded in // an ioBufTracker. Since the ioBufTracker pointer is initially nil (to // minimize overhead for the common case where no memmap.files require // buffering for I/O), getIOMappingsTrackedLocked returns an updated // ioBufTracker pointer. // // unbufBytes is the number of bytes of unbuffered mappings that have been // appended to ims since the last buffered mapping; getIOMappingsTrackedLocked // also returns an updated value for unbufBytes. // // Returned mappings are valid until either mm.activeMu is unlocked or // ioBufTracker.flush() is called. // // Preconditions: // - mm.activeMu must be locked for writing. // - pseg.Range().Contains(ar.Start). // - pmas must exist for all addresses in ar. // - ar.Length() != 0. // // Postconditions: getIOMappingsTrackedLocked does not invalidate iterators // into mm.pmas. 
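// // In outline: for each pma, the cached internal mappings are used directly // when available; if getInternalMappingsLocked returns // memmap.BufferedIOFallbackErr, a pooled byte slice is used instead (filled // via BufferReadAt when the caller will read it), and the ioBufTracker // remembers enough to write dirty buffers back via BufferWriteAt when flush() // is called.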
func (mm *MemoryManager) getIOMappingsTrackedLocked(pseg pmaIterator, ar hostarch.AddrRange, at hostarch.AccessType, ims []safemem.Block, t *ioBufTracker, unbufBytes uint64) ([]safemem.Block, *ioBufTracker, uint64, error) { for { pmaAR := ar.Intersect(pseg.Range()) if err := pseg.getInternalMappingsLocked(); err == nil { // Iterate the subset of the PMA's cached internal mappings that // correspond to pmaAR, and append them to ims. for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pmaAR.Start - pseg.Start())).TakeFirst64(uint64(pmaAR.Length())); !pims.IsEmpty(); pims = pims.Tail() { ims = append(ims, pims.Head()) } unbufBytes += uint64(pmaAR.Length()) } else if _, ok := err.(memmap.BufferedIOFallbackErr); !ok { return ims, t, unbufBytes, err } else { // Fall back to buffered I/O as instructed. if t == nil { t = getIOBufTracker(at.Write) } buf := getByteSlicePtr(int(pmaAR.Length())) pma := pseg.ValuePtr() off := pseg.fileRangeOf(pmaAR).Start // If the caller will read from the buffer, fill it from the file; // otherwise leave it zeroed. if at.Read || at.Execute { var n uint64 n, err = pma.file.BufferReadAt(off, *buf) *buf = (*buf)[:n] } else { err = nil } if len(*buf) != 0 { ims = append(ims, safemem.BlockFromSafeSlice(*buf)) t.bufs = append(t.bufs, ioBuf{ unbufBytesBefore: unbufBytes, file: pma.file, off: off, buf: buf, }) unbufBytes = 0 } if err != nil { return ims, t, unbufBytes, err } } if ar.End <= pseg.End() { return ims, t, unbufBytes, nil } pseg, _ = pseg.NextNonEmpty() } } type ioBuf struct { unbufBytesBefore uint64 file memmap.File off uint64 buf *[]byte } type ioBufTracker struct { write bool bufs []ioBuf } var ioBufTrackerPool = sync.Pool{ New: func() any { return &ioBufTracker{} }, } func getIOBufTracker(write bool) *ioBufTracker { t := ioBufTrackerPool.Get().(*ioBufTracker) t.write = write return t } func putIOBufTracker(t *ioBufTracker) { for i := range t.bufs { t.bufs[i].file = nil putByteSlicePtr(t.bufs[i].buf) t.bufs[i].buf = nil } t.bufs = t.bufs[:0] ioBufTrackerPool.Put(t) } func (t *ioBufTracker) flush(prevN uint64, prevErr error) (uint64, error) { if t == nil { return prevN, prevErr } return t.flushSlow(prevN, prevErr) } func (t *ioBufTracker) flushSlow(prevN uint64, prevErr error) (uint64, error) { defer putIOBufTracker(t) if !t.write { return prevN, prevErr } // Flush dirty buffers to underlying memmap.Files. rem := prevN done := uint64(0) for i := range t.bufs { buf := &t.bufs[i] if rem <= buf.unbufBytesBefore { // The write ended before reaching buf.buf. break } rem -= buf.unbufBytesBefore done += buf.unbufBytesBefore n, err := buf.file.BufferWriteAt(buf.off, (*buf.buf)[:min(len(*buf.buf), int(rem))]) rem -= n done += n if err != nil { return done, err } } // All buffers covered by prevN were written back successfully. return prevN, prevErr } var byteSlicePtrPool sync.Pool // getByteSlicePtr returns a pointer to a byte slice with the given length. The // slice is either newly-allocated or recycled from a previous call to // putByteSlicePtr. The pointer should be passed to putByteSlicePtr when the // slice is no longer in use. func getByteSlicePtr(l int) *[]byte { a := byteSlicePtrPool.Get() if a == nil { s := make([]byte, l) return &s } sp := a.(*[]byte) s := *sp if l <= cap(s) { s = s[:l] } else { s = make([]byte, l) } *sp = s return sp } // putByteSlicePtr marks all of the given's slice capacity reusable by a future // call to getByteSlicePtr. 
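// // The intended pairing is roughly (illustrative sketch): // // buf := getByteSlicePtr(n) // defer putByteSlicePtr(buf) // // getIOMappingsTrackedLocked above allocates its fallback I/O buffers this // way, and putIOBufTracker returns them to the pool once the I/O has been // flushed.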
func putByteSlicePtr(s *[]byte) { byteSlicePtrPool.Put(s) } // truncatedAddrRangeSeq returns a copy of ars, but with the end truncated to // at most address end on AddrRange arsit.Head(). It is used in vector I/O paths to // truncate hostarch.AddrRangeSeq when errors occur. // // Preconditions: // - !arsit.IsEmpty(). // - end <= arsit.Head().End. func truncatedAddrRangeSeq(ars, arsit hostarch.AddrRangeSeq, end hostarch.Addr) hostarch.AddrRangeSeq { ar := arsit.Head() if end <= ar.Start { return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes()) } return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes() + int64(end-ar.Start)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/io_list.go000066400000000000000000000116021465435605700233520ustar00rootroot00000000000000package mm // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type ioElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (ioElementMapper) linkerFor(elem *ioResult) *ioResult { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type ioList struct { head *ioResult tail *ioResult } // Reset resets list l to the empty state. func (l *ioList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *ioList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *ioList) Front() *ioResult { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *ioList) Back() *ioResult { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *ioList) Len() (count int) { for e := l.Front(); e != nil; e = (ioElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *ioList) PushFront(e *ioResult) { linker := ioElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { ioElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *ioList) PushFrontList(m *ioList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { ioElementMapper{}.linkerFor(l.head).SetPrev(m.tail) ioElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *ioList) PushBack(e *ioResult) { linker := ioElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { ioElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. 
// //go:nosplit func (l *ioList) PushBackList(m *ioList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { ioElementMapper{}.linkerFor(l.tail).SetNext(m.head) ioElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *ioList) InsertAfter(b, e *ioResult) { bLinker := ioElementMapper{}.linkerFor(b) eLinker := ioElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { ioElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *ioList) InsertBefore(a, e *ioResult) { aLinker := ioElementMapper{}.linkerFor(a) eLinker := ioElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { ioElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *ioList) Remove(e *ioResult) { linker := ioElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { ioElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { ioElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type ioEntry struct { next *ioResult prev *ioResult } // Next returns the entry that follows e in the list. // //go:nosplit func (e *ioEntry) Next() *ioResult { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *ioEntry) Prev() *ioResult { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *ioEntry) SetNext(elem *ioResult) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *ioEntry) SetPrev(elem *ioResult) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/lifecycle.go000066400000000000000000000226421465435605700236550ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" ) // NewMemoryManager returns a new MemoryManager with no mappings and 1 user. 
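// A hypothetical construction sequence (p, mf, ac, and ls stand for an
// existing platform.Platform, *pgalloc.MemoryFile, *arch.Context64, and
// *limits.LimitSet; in practice the binary loader performs the layout step,
// per the layout field's documentation in mm.go):
//
//	mm := NewMemoryManager(p, mf, false /* sleepForActivation */)
//	if _, err := mm.SetMmapLayout(ac, ls); err != nil {
//		// handle the error
//	}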
func NewMemoryManager(p platform.Platform, mf *pgalloc.MemoryFile, sleepForActivation bool) *MemoryManager { return &MemoryManager{ p: p, mf: mf, haveASIO: p.SupportsAddressSpaceIO(), users: atomicbitops.FromInt32(1), auxv: arch.Auxv{}, dumpability: atomicbitops.FromInt32(int32(UserDumpable)), aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, sleepForActivation: sleepForActivation, } } // SetMmapLayout initializes mm's layout from the given arch.Context64. // // Preconditions: mm contains no mappings and is not used concurrently. func (mm *MemoryManager) SetMmapLayout(ac *arch.Context64, r *limits.LimitSet) (arch.MmapLayout, error) { layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r) if err != nil { return arch.MmapLayout{}, err } mm.layout = layout return layout, nil } // Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or // clone() (without CLONE_VM). func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { mm.AddressSpace().PreFork() defer mm.AddressSpace().PostFork() mm.metadataMu.Lock() defer mm.metadataMu.Unlock() var droppedIDs []memmap.MappingIdentity // This must run after {mm,mm2}.mappingMu.Unlock(). defer func() { for _, id := range droppedIDs { id.DecRef(ctx) } }() mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() mm2 := &MemoryManager{ p: mm.p, mf: mm.mf, haveASIO: mm.haveASIO, layout: mm.layout, users: atomicbitops.FromInt32(1), brk: mm.brk, usageAS: mm.usageAS, dataAS: mm.dataAS, // "The child does not inherit its parent's memory locks (mlock(2), // mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is // MLockNone, both of which are zero values. vma.mlockMode is reset // when copied below. captureInvalidations: true, argv: mm.argv, envv: mm.envv, auxv: append(arch.Auxv(nil), mm.auxv...), // IncRef'd below, once we know that there isn't an error. executable: mm.executable, dumpability: atomicbitops.FromInt32(mm.dumpability.Load()), aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, sleepForActivation: mm.sleepForActivation, vdsoSigReturnAddr: mm.vdsoSigReturnAddr, } // Copy vmas. dontforks := false dstvgap := mm2.vmas.FirstGap() for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { vma := srcvseg.ValuePtr().copy() vmaAR := srcvseg.Range() if vma.dontfork { length := uint64(vmaAR.Length()) mm2.usageAS -= length if vma.isPrivateDataLocked() { mm2.dataAS -= length } dontforks = true continue } // Inform the Mappable, if any, of the new mapping. if vma.mappable != nil { if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil { _, droppedIDs = mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange(), droppedIDs) return nil, err } } if vma.id != nil { vma.id.IncRef() } vma.mlockMode = memmap.MLockNone dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap() // We don't need to update mm2.usageAS since we copied it from mm // above. } // Copy pmas. We have to lock mm.activeMu for writing to make existing // private pmas copy-on-write. We also have to lock mm2.activeMu since // after copying vmas above, memmap.Mappables may call mm2.Invalidate. We // only copy private pmas, since in the common case where fork(2) is // immediately followed by execve(2), copying non-private pmas that can be // regenerated by calling memmap.Mappable.Translate is a waste of time. // (Linux does the same; compare kernel/fork.c:dup_mmap() => // mm/memory.c:copy_page_range().) 
mm.activeMu.Lock() defer mm.activeMu.Unlock() mm2.activeMu.NestedLock(activeLockForked) defer mm2.activeMu.NestedUnlock(activeLockForked) if dontforks { defer mm.pmas.MergeInsideRange(mm.applicationAddrRange()) } srcvseg := mm.vmas.FirstSegment() dstpgap := mm2.pmas.FirstGap() var unmapAR hostarch.AddrRange memCgID := pgalloc.MemoryCgroupIDFromContext(ctx) for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() { pma := srcpseg.ValuePtr() if !pma.private { continue } if dontforks { // Find the 'vma' that contains the starting address // associated with the 'pma' (there must be one). srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start()) if checkInvariants { if !srcvseg.Ok() { panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range())) } if srcpseg.Start() < srcvseg.Start() { panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range())) } } srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range()) if srcvseg.ValuePtr().dontfork { continue } pma = srcpseg.ValuePtr() } if !pma.needCOW { pma.needCOW = true if pma.effectivePerms.Write { // We don't want to unmap the whole address space, even though // doing so would reduce calls to unmapASLocked(), because mm // will most likely continue to be used after the fork, so // unmapping pmas unnecessarily will result in extra page // faults. But we do want to merge consecutive AddrRanges // across pma boundaries. if unmapAR.End == srcpseg.Start() { unmapAR.End = srcpseg.End() } else { if unmapAR.Length() != 0 { mm.unmapASLocked(unmapAR) } unmapAR = srcpseg.Range() } pma.effectivePerms.Write = false } pma.maxPerms.Write = false } fr := srcpseg.fileRange() // srcpseg.ValuePtr().file == mm.mf since pma.private == true. mm.mf.IncRef(fr, memCgID) addrRange := srcpseg.Range() mm2.addRSSLocked(addrRange) dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap() } if unmapAR.Length() != 0 { mm.unmapASLocked(unmapAR) } // Between when we call memmap.Mappable.AddMapping while copying vmas and // when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are // ineffective because the pmas they invalidate haven't yet been copied, // possibly allowing mm2 to get invalidated translations: // // Invalidating Mappable mm.Fork // --------------------- ------- // // mm2.Invalidate() // mm.activeMu.Lock() // mm.Invalidate() /* blocks */ // mm2.activeMu.Lock() // (mm copies invalidated pma to mm2) // // This would technically be both safe (since we only copy private pmas, // which will still hold a reference on their memory) and consistent with // Linux, but we avoid it anyway by setting mm2.captureInvalidations during // construction, causing calls to mm2.Invalidate() to be captured in // mm2.capturedInvalidations, to be replayed after pmas are copied - i.e. // here. mm2.captureInvalidations = false for _, invArgs := range mm2.capturedInvalidations { mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true) } mm2.capturedInvalidations = nil if mm2.executable != nil { mm2.executable.IncRef() } return mm2, nil } // IncUsers increments mm's user count and returns true. If the user count is // already 0, IncUsers does nothing and returns false. func (mm *MemoryManager) IncUsers() bool { for { users := mm.users.Load() if users == 0 { return false } if mm.users.CompareAndSwap(users, users+1) { return true } } } // DecUsers decrements mm's user count. If the user count reaches 0, all // mappings in mm are unmapped. 
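// A hypothetical caller that must keep mm alive across an operation pairs the
// two as follows (ctx is an assumed context.Context):
//
//	if !mm.IncUsers() {
//		return // the user count already reached 0; mm must not be reused
//	}
//	defer mm.DecUsers(ctx)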
func (mm *MemoryManager) DecUsers(ctx context.Context) { if users := mm.users.Add(-1); users > 0 { return } else if users < 0 { panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users)) } mm.destroyAIOManager(ctx) mm.metadataMu.Lock() exe := mm.executable mm.executable = nil mm.metadataMu.Unlock() if exe != nil { exe.DecRef(ctx) } mm.activeMu.Lock() // Sanity check. if mm.active.Load() != 0 { panic("active address space lost?") } // Make sure the AddressSpace is returned. if mm.as != nil { mm.as.Release() mm.as = nil } mm.activeMu.Unlock() var droppedIDs []memmap.MappingIdentity mm.mappingMu.Lock() // If mm is being dropped before mm.SetMmapLayout was called, // mm.applicationAddrRange() will be empty. if ar := mm.applicationAddrRange(); ar.Length() != 0 { _, droppedIDs = mm.unmapLocked(ctx, ar, droppedIDs) } mm.mappingMu.Unlock() for _, id := range droppedIDs { id.DecRef(ctx) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/mapping_mutex.go000066400000000000000000000045571465435605700246000ustar00rootroot00000000000000package mm import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type mappingRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var mappinglockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type mappinglockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *mappingRWMutex) Lock() { locking.AddGLock(mappingprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *mappingRWMutex) NestedLock(i mappinglockNameIndex) { locking.AddGLock(mappingprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *mappingRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(mappingprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *mappingRWMutex) NestedUnlock(i mappinglockNameIndex) { m.mu.Unlock() locking.DelGLock(mappingprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *mappingRWMutex) RLock() { locking.AddGLock(mappingprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *mappingRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(mappingprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *mappingRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *mappingRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *mappingRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var mappingprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. 
func mappinginitLockNames() {} func init() { mappinginitLockNames() mappingprefixIndex = locking.NewMutexClass(reflect.TypeOf(mappingRWMutex{}), mappinglockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/metadata.go000066400000000000000000000115241465435605700234730ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Dumpability describes if and how core dumps should be created. type Dumpability int const ( // NotDumpable indicates that core dumps should never be created. NotDumpable Dumpability = iota // UserDumpable indicates that core dumps should be created, owned by // the current user. UserDumpable // RootDumpable indicates that core dumps should be created, owned by // root. RootDumpable ) // Dumpability returns the dumpability. func (mm *MemoryManager) Dumpability() Dumpability { return Dumpability(mm.dumpability.Load()) } // SetDumpability sets the dumpability. func (mm *MemoryManager) SetDumpability(d Dumpability) { mm.dumpability.Store(int32(d)) } // ArgvStart returns the start of the application argument vector. // // There is no guarantee that this value is sensible w.r.t. ArgvEnd. func (mm *MemoryManager) ArgvStart() hostarch.Addr { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() return mm.argv.Start } // SetArgvStart sets the start of the application argument vector. func (mm *MemoryManager) SetArgvStart(a hostarch.Addr) { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.argv.Start = a } // ArgvEnd returns the end of the application argument vector. // // There is no guarantee that this value is sensible w.r.t. ArgvStart. func (mm *MemoryManager) ArgvEnd() hostarch.Addr { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() return mm.argv.End } // SetArgvEnd sets the end of the application argument vector. func (mm *MemoryManager) SetArgvEnd(a hostarch.Addr) { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.argv.End = a } // EnvvStart returns the start of the application environment vector. // // There is no guarantee that this value is sensible w.r.t. EnvvEnd. func (mm *MemoryManager) EnvvStart() hostarch.Addr { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() return mm.envv.Start } // SetEnvvStart sets the start of the application environment vector. func (mm *MemoryManager) SetEnvvStart(a hostarch.Addr) { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.envv.Start = a } // EnvvEnd returns the end of the application environment vector. // // There is no guarantee that this value is sensible w.r.t. EnvvStart. func (mm *MemoryManager) EnvvEnd() hostarch.Addr { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() return mm.envv.End } // SetEnvvEnd sets the end of the application environment vector. 
func (mm *MemoryManager) SetEnvvEnd(a hostarch.Addr) { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.envv.End = a } // Auxv returns the current map of auxiliary vectors. func (mm *MemoryManager) Auxv() arch.Auxv { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() return append(arch.Auxv(nil), mm.auxv...) } // SetAuxv sets the entire map of auxiliary vectors. func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.auxv = append(arch.Auxv(nil), auxv...) } // Executable returns the executable, if available. // // An additional reference will be taken in the case of a non-nil executable, // which must be released by the caller. func (mm *MemoryManager) Executable() *vfs.FileDescription { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() if mm.executable == nil { return nil } mm.executable.IncRef() return mm.executable } // SetExecutable sets the executable. // // This takes a reference on d. func (mm *MemoryManager) SetExecutable(ctx context.Context, fd *vfs.FileDescription) { mm.metadataMu.Lock() // Grab a new reference. fd.IncRef() // Set the executable. orig := mm.executable mm.executable = fd mm.metadataMu.Unlock() // Release the old reference. // // Do this without holding the lock, since it may wind up doing some // I/O to sync the dirent, etc. if orig != nil { orig.DecRef(ctx) } } // VDSOSigReturn returns the address of vdso_sigreturn. func (mm *MemoryManager) VDSOSigReturn() uint64 { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() return mm.vdsoSigReturnAddr } // SetVDSOSigReturn sets the address of vdso_sigreturn. func (mm *MemoryManager) SetVDSOSigReturn(addr uint64) { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.vdsoSigReturnAddr = addr } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/metadata_mutex.go000066400000000000000000000032021465435605700247070ustar00rootroot00000000000000package mm import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type metadataMutex struct { mu sync.Mutex } var metadataprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var metadatalockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type metadatalockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *metadataMutex) Lock() { locking.AddGLock(metadataprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *metadataMutex) NestedLock(i metadatalockNameIndex) { locking.AddGLock(metadataprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *metadataMutex) Unlock() { locking.DelGLock(metadataprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *metadataMutex) NestedUnlock(i metadatalockNameIndex) { locking.DelGLock(metadataprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. 
func metadatainitLockNames() {} func init() { metadatainitLockNames() metadataprefixIndex = locking.NewMutexClass(reflect.TypeOf(metadataMutex{}), metadatalockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/mm.go000066400000000000000000000337311465435605700223300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package mm provides a memory management subsystem. See README.md for a // detailed overview. // // Lock order: // // fs locks, except for memmap.Mappable locks // mm.MemoryManager.metadataMu // mm.MemoryManager.mappingMu // Locks taken by memmap.MappingIdentity and memmap.Mappable methods other // than Translate // kernel.TaskSet.mu // mm.MemoryManager.activeMu // Locks taken by memmap.Mappable.Translate // platform.AddressSpace locks // memmap.File locks // mm.aioManager.mu // mm.AIOContext.mu // // Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in // multiple mm.MemoryManagers, as it does so in a well-defined order (forked // child first). package mm import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // MapsCallbackFunc has all the parameters required for populating an entry of /proc/[pid]/maps. type MapsCallbackFunc func(start, end hostarch.Addr, permissions hostarch.AccessType, private string, offset uint64, devMajor, devMinor uint32, inode uint64, path string) // MemoryManager implements a virtual address space. // // +stateify savable type MemoryManager struct { // p and mfp are immutable. p platform.Platform // mf is the cached result of mfp.MemoryFile(). // // mf is immutable. mf *pgalloc.MemoryFile `state:"nosave"` // haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from // eliminating an indirect call in the hot I/O path, this makes // MemoryManager.asioEnabled() a leaf function, allowing it to be inlined. // // haveASIO is immutable. haveASIO bool `state:"nosave"` // layout is the memory layout. // // layout is set by the binary loader before the MemoryManager can be used. layout arch.MmapLayout // users is the number of dependencies on the mappings in the MemoryManager. // When the number of references in users reaches zero, all mappings are // unmapped. users atomicbitops.Int32 // mappingMu is analogous to Linux's struct mm_struct::mmap_sem. mappingMu mappingRWMutex `state:"nosave"` // vmas stores virtual memory areas. Since vmas are stored by value, // clients should usually use vmaIterator.ValuePtr() instead of // vmaIterator.Value() to get a pointer to the vma rather than a copy. // // Invariants: vmas are always page-aligned. // // vmas is protected by mappingMu. vmas vmaSet // brk is the mm's brk, which is manipulated using the brk(2) system call. 
// The brk is initially set up by the loader which maps an executable // binary into the mm. // // brk is protected by mappingMu. brk hostarch.AddrRange // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. // // usageAS is protected by mappingMu. usageAS uint64 // lockedAS is the combined size in bytes of all vmas with vma.mlockMode != // memmap.MLockNone. // // lockedAS is protected by mappingMu. lockedAS uint64 // dataAS is the size of private data segments, like mm_struct->data_vm. // It means the vma which is private, writable, not stack. // // dataAS is protected by mappingMu. dataAS uint64 // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or // defMLockMode is greater. // // defMLockMode is protected by mappingMu. defMLockMode memmap.MLockMode // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. activeMu activeRWMutex `state:"nosave"` // pmas stores platform mapping areas used to implement vmas. Since pmas // are stored by value, clients should usually use pmaIterator.ValuePtr() // instead of pmaIterator.Value() to get a pointer to the pma rather than // a copy. // // Inserting or removing segments from pmas should happen along with a // call to mm.insertRSS or mm.removeRSS. // // Invariants: pmas are always page-aligned. If a pma exists for a given // address, a vma must also exist for that address. // // pmas is protected by activeMu. pmas pmaSet // curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is // reported as the MemoryManager's RSS. // // maxRSS should be modified only via insertRSS and removeRSS, not // directly. // // maxRSS is protected by activeMu. curRSS uint64 // maxRSS is the maximum resident set size in bytes of a MemoryManager. // It is tracked as the application adds and removes mappings to pmas. // // maxRSS should be modified only via insertRSS, not directly. // // maxRSS is protected by activeMu. maxRSS uint64 // as is the platform.AddressSpace that pmas are mapped into. active is the // number of contexts that require as to be non-nil; if active == 0, as may // be nil. // // as is protected by activeMu. active is manipulated with atomic memory // operations; transitions to and from zero are additionally protected by // activeMu. (This is because such transitions may need to be atomic with // changes to as.) as platform.AddressSpace `state:"nosave"` active atomicbitops.Int32 `state:"zerovalue"` // unmapAllOnActivate indicates that the next Activate call should activate // an empty AddressSpace. // // This is used to ensure that an AddressSpace cached in // NewAddressSpace is not used after some change in the MemoryManager // or VMAs has made that AddressSpace stale. // // unmapAllOnActivate is protected by activeMu. It must only be set when // there is no active or cached AddressSpace. If as != nil, then // invalidations should be propagated immediately. unmapAllOnActivate bool `state:"nosave"` // If captureInvalidations is true, calls to MM.Invalidate() are recorded // in capturedInvalidations rather than being applied immediately to pmas. // This is to avoid a race condition in MM.Fork(); see that function for // details. // // Both captureInvalidations and capturedInvalidations are protected by // activeMu. Neither need to be saved since captureInvalidations is only // enabled during MM.Fork(), during which saving can't occur. 
captureInvalidations bool `state:"zerovalue"` capturedInvalidations []invalidateArgs `state:"nosave"` // dumpability describes if and how this MemoryManager may be dumped to // userspace. This is read under kernel.TaskSet.mu, so it can't be protected // by metadataMu. dumpability atomicbitops.Int32 metadataMu metadataMutex `state:"nosave"` // argv is the application argv. This is set up by the loader and may be // modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No // requirements apply to argv; we do not require that argv.WellFormed(). // // argv is protected by metadataMu. argv hostarch.AddrRange // envv is the application envv. This is set up by the loader and may be // modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No // requirements apply to envv; we do not require that envv.WellFormed(). // // envv is protected by metadataMu. envv hostarch.AddrRange // auxv is the ELF's auxiliary vector. // // auxv is protected by metadataMu. auxv arch.Auxv // executable is the executable for this MemoryManager. If executable // is not nil, it holds a reference on the Dirent. // // executable is protected by metadataMu. executable *vfs.FileDescription // aioManager keeps track of AIOContexts used for async IOs. AIOManager // must be cloned when CLONE_VM is used. aioManager aioManager // sleepForActivation indicates whether the task should report to be sleeping // before trying to activate the address space. When set to true, delays in // activation are not reported as stuck tasks by the watchdog. sleepForActivation bool // vdsoSigReturnAddr is the address of 'vdso_sigreturn'. vdsoSigReturnAddr uint64 // membarrierPrivateEnabled is non-zero if EnableMembarrierPrivate has // previously been called. Since, as of this writing, // MEMBARRIER_CMD_PRIVATE_EXPEDITED is implemented as a global memory // barrier, membarrierPrivateEnabled has no other effect. membarrierPrivateEnabled atomicbitops.Uint32 // membarrierRSeqEnabled is non-zero if EnableMembarrierRSeq has previously // been called. membarrierRSeqEnabled atomicbitops.Uint32 } // vma represents a virtual memory area. // // Note: new fields added to this struct must be added to vma.Copy and // vmaSetFunctions.Merge. // // +stateify savable type vma struct { // mappable is the virtual memory object mapped by this vma. If mappable is // nil, the vma represents an anonymous mapping. mappable memmap.Mappable // off is the offset into mappable at which this vma begins. If mappable is // nil, off is meaningless. off uint64 // To speedup VMA save/restore, we group and save the following booleans // as a single integer. // realPerms are the memory permissions on this vma, as defined by the // application. realPerms hostarch.AccessType `state:".(int)"` // effectivePerms are the memory permissions on this vma which are // actually used to control access. // // Invariant: effectivePerms == realPerms.Effective(). effectivePerms hostarch.AccessType `state:"manual"` // maxPerms limits the set of permissions that may ever apply to this // memory, as well as accesses for which usermem.IOOpts.IgnorePermissions // is true (e.g. ptrace(PTRACE_POKEDATA)). // // Invariant: maxPerms == maxPerms.Effective(). maxPerms hostarch.AccessType `state:"manual"` // private is true if this is a MAP_PRIVATE mapping, such that writes to // the mapping are propagated to a copy. private bool `state:"manual"` // growsDown is true if the mapping may be automatically extended downward // under certain conditions. If growsDown is true, mappable must be nil. 
// // There is currently no corresponding growsUp flag; in Linux, the only // architectures that can have VM_GROWSUP mappings are ia64, parisc, and // metag, none of which we currently support. growsDown bool `state:"manual"` // isStack is true if this is a MAP_STACK mapping. isStack bool `state:"manual"` // dontfork is the MADV_DONTFORK setting for this vma configured by madvise(). dontfork bool mlockMode memmap.MLockMode // numaPolicy is the NUMA policy for this vma set by mbind(). numaPolicy linux.NumaPolicy // numaNodemask is the NUMA nodemask for this vma set by mbind(). numaNodemask uint64 // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. id memmap.MappingIdentity // If hint is non-empty, it is a description of the vma printed in // /proc/[pid]/maps. hint takes priority over id.MappedName(). hint string // lastFault records the last address that was paged faulted. It hints at // which direction addresses in this vma are being accessed. // // This field can be read atomically, and written with mm.activeMu locked for // writing and mm.mapping locked. lastFault uintptr } func (v *vma) copy() vma { return vma{ mappable: v.mappable, off: v.off, realPerms: v.realPerms, effectivePerms: v.effectivePerms, maxPerms: v.maxPerms, private: v.private, growsDown: v.growsDown, isStack: v.isStack, dontfork: v.dontfork, mlockMode: v.mlockMode, numaPolicy: v.numaPolicy, numaNodemask: v.numaNodemask, id: v.id, hint: v.hint, lastFault: atomic.LoadUintptr(&v.lastFault), } } // pma represents a platform mapping area. // // +stateify savable type pma struct { // file is the file mapped by this pma. Only pmas for which file is of type // pgalloc.MemoryFile may be saved. pmas hold a reference to the // corresponding file range while they exist. file memmap.File `state:".(string)"` // off is the offset into file at which this pma begins. off uint64 // translatePerms is the permissions returned by memmap.Mappable.Translate. // If private is true, translatePerms is hostarch.AnyAccess. translatePerms hostarch.AccessType // effectivePerms is the permissions allowed for non-ignorePermissions // accesses. maxPerms is the permissions allowed for ignorePermissions // accesses. These are vma.effectivePerms and vma.maxPerms respectively, // masked by pma.translatePerms and with Write disallowed if pma.needCOW is // true. // // These are stored in the pma so that the IO implementation can avoid // iterating mm.vmas when pmas already exist. effectivePerms hostarch.AccessType maxPerms hostarch.AccessType // needCOW is true if writes to the mapping must be propagated to a copy. needCOW bool // private is true if this pma represents private memory. // // If private is true, file must be MemoryManager.mfp.MemoryFile(), and // calls to Invalidate for which memmap.InvalidateOpts.InvalidatePrivate is // false should ignore the pma. // // If private is false, this pma caches a translation from the // corresponding vma's memmap.Mappable.Translate. private bool // If huge is true, this pma was returned by a call to MemoryFile.Allocate() // with AllocOpts.Hugepage = true. Note that due to pma splitting, pma may // no longer be hugepage-aligned. // // Invariant: If huge == true, then private == true. huge bool // If internalMappings is not empty, it is the cached return value of // file.MapInternal for the memmap.FileRange mapped by this pma. 
internalMappings safemem.BlockSeq `state:"nosave"` } type invalidateArgs struct { ar hostarch.AddrRange opts memmap.InvalidateOpts } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/mm_state_autogen.go000066400000000000000000000414551465435605700252540ustar00rootroot00000000000000// automatically generated by stateify. package mm import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (a *aioManager) StateTypeName() string { return "pkg/sentry/mm.aioManager" } func (a *aioManager) StateFields() []string { return []string{ "contexts", } } func (a *aioManager) beforeSave() {} // +checklocksignore func (a *aioManager) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.contexts) } func (a *aioManager) afterLoad(context.Context) {} // +checklocksignore func (a *aioManager) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.contexts) } func (i *ioResult) StateTypeName() string { return "pkg/sentry/mm.ioResult" } func (i *ioResult) StateFields() []string { return []string{ "data", "ioEntry", } } func (i *ioResult) beforeSave() {} // +checklocksignore func (i *ioResult) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.data) stateSinkObject.Save(1, &i.ioEntry) } func (i *ioResult) afterLoad(context.Context) {} // +checklocksignore func (i *ioResult) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.data) stateSourceObject.Load(1, &i.ioEntry) } func (aio *AIOContext) StateTypeName() string { return "pkg/sentry/mm.AIOContext" } func (aio *AIOContext) StateFields() []string { return []string{ "results", "maxOutstanding", "outstanding", } } func (aio *AIOContext) beforeSave() {} // +checklocksignore func (aio *AIOContext) StateSave(stateSinkObject state.Sink) { aio.beforeSave() if !state.IsZeroValue(&aio.dead) { state.Failf("dead is %#v, expected zero", &aio.dead) } stateSinkObject.Save(0, &aio.results) stateSinkObject.Save(1, &aio.maxOutstanding) stateSinkObject.Save(2, &aio.outstanding) } // +checklocksignore func (aio *AIOContext) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &aio.results) stateSourceObject.Load(1, &aio.maxOutstanding) stateSourceObject.Load(2, &aio.outstanding) stateSourceObject.AfterLoad(func() { aio.afterLoad(ctx) }) } func (m *aioMappable) StateTypeName() string { return "pkg/sentry/mm.aioMappable" } func (m *aioMappable) StateFields() []string { return []string{ "aioMappableRefs", "fr", } } func (m *aioMappable) beforeSave() {} // +checklocksignore func (m *aioMappable) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.aioMappableRefs) stateSinkObject.Save(1, &m.fr) } // +checklocksignore func (m *aioMappable) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.aioMappableRefs) stateSourceObject.Load(1, &m.fr) stateSourceObject.AfterLoad(func() { m.afterLoad(ctx) }) } func (r *aioMappableRefs) StateTypeName() string { return "pkg/sentry/mm.aioMappableRefs" } func (r *aioMappableRefs) StateFields() []string { return []string{ "refCount", } } func (r *aioMappableRefs) beforeSave() {} // +checklocksignore func (r *aioMappableRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *aioMappableRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { 
r.afterLoad(ctx) }) } func (l *ioList) StateTypeName() string { return "pkg/sentry/mm.ioList" } func (l *ioList) StateFields() []string { return []string{ "head", "tail", } } func (l *ioList) beforeSave() {} // +checklocksignore func (l *ioList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *ioList) afterLoad(context.Context) {} // +checklocksignore func (l *ioList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *ioEntry) StateTypeName() string { return "pkg/sentry/mm.ioEntry" } func (e *ioEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *ioEntry) beforeSave() {} // +checklocksignore func (e *ioEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *ioEntry) afterLoad(context.Context) {} // +checklocksignore func (e *ioEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (mm *MemoryManager) StateTypeName() string { return "pkg/sentry/mm.MemoryManager" } func (mm *MemoryManager) StateFields() []string { return []string{ "p", "layout", "users", "vmas", "brk", "usageAS", "lockedAS", "dataAS", "defMLockMode", "pmas", "curRSS", "maxRSS", "dumpability", "argv", "envv", "auxv", "executable", "aioManager", "sleepForActivation", "vdsoSigReturnAddr", "membarrierPrivateEnabled", "membarrierRSeqEnabled", } } func (mm *MemoryManager) beforeSave() {} // +checklocksignore func (mm *MemoryManager) StateSave(stateSinkObject state.Sink) { mm.beforeSave() if !state.IsZeroValue(&mm.active) { state.Failf("active is %#v, expected zero", &mm.active) } if !state.IsZeroValue(&mm.captureInvalidations) { state.Failf("captureInvalidations is %#v, expected zero", &mm.captureInvalidations) } stateSinkObject.Save(0, &mm.p) stateSinkObject.Save(1, &mm.layout) stateSinkObject.Save(2, &mm.users) stateSinkObject.Save(3, &mm.vmas) stateSinkObject.Save(4, &mm.brk) stateSinkObject.Save(5, &mm.usageAS) stateSinkObject.Save(6, &mm.lockedAS) stateSinkObject.Save(7, &mm.dataAS) stateSinkObject.Save(8, &mm.defMLockMode) stateSinkObject.Save(9, &mm.pmas) stateSinkObject.Save(10, &mm.curRSS) stateSinkObject.Save(11, &mm.maxRSS) stateSinkObject.Save(12, &mm.dumpability) stateSinkObject.Save(13, &mm.argv) stateSinkObject.Save(14, &mm.envv) stateSinkObject.Save(15, &mm.auxv) stateSinkObject.Save(16, &mm.executable) stateSinkObject.Save(17, &mm.aioManager) stateSinkObject.Save(18, &mm.sleepForActivation) stateSinkObject.Save(19, &mm.vdsoSigReturnAddr) stateSinkObject.Save(20, &mm.membarrierPrivateEnabled) stateSinkObject.Save(21, &mm.membarrierRSeqEnabled) } // +checklocksignore func (mm *MemoryManager) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mm.p) stateSourceObject.Load(1, &mm.layout) stateSourceObject.Load(2, &mm.users) stateSourceObject.Load(3, &mm.vmas) stateSourceObject.Load(4, &mm.brk) stateSourceObject.Load(5, &mm.usageAS) stateSourceObject.Load(6, &mm.lockedAS) stateSourceObject.Load(7, &mm.dataAS) stateSourceObject.Load(8, &mm.defMLockMode) stateSourceObject.Load(9, &mm.pmas) stateSourceObject.Load(10, &mm.curRSS) stateSourceObject.Load(11, &mm.maxRSS) stateSourceObject.Load(12, &mm.dumpability) stateSourceObject.Load(13, &mm.argv) stateSourceObject.Load(14, &mm.envv) stateSourceObject.Load(15, 
&mm.auxv) stateSourceObject.Load(16, &mm.executable) stateSourceObject.Load(17, &mm.aioManager) stateSourceObject.Load(18, &mm.sleepForActivation) stateSourceObject.Load(19, &mm.vdsoSigReturnAddr) stateSourceObject.Load(20, &mm.membarrierPrivateEnabled) stateSourceObject.Load(21, &mm.membarrierRSeqEnabled) stateSourceObject.AfterLoad(func() { mm.afterLoad(ctx) }) } func (v *vma) StateTypeName() string { return "pkg/sentry/mm.vma" } func (v *vma) StateFields() []string { return []string{ "mappable", "off", "realPerms", "dontfork", "mlockMode", "numaPolicy", "numaNodemask", "id", "hint", "lastFault", } } func (v *vma) beforeSave() {} // +checklocksignore func (v *vma) StateSave(stateSinkObject state.Sink) { v.beforeSave() var realPermsValue int realPermsValue = v.saveRealPerms() stateSinkObject.SaveValue(2, realPermsValue) stateSinkObject.Save(0, &v.mappable) stateSinkObject.Save(1, &v.off) stateSinkObject.Save(3, &v.dontfork) stateSinkObject.Save(4, &v.mlockMode) stateSinkObject.Save(5, &v.numaPolicy) stateSinkObject.Save(6, &v.numaNodemask) stateSinkObject.Save(7, &v.id) stateSinkObject.Save(8, &v.hint) stateSinkObject.Save(9, &v.lastFault) } func (v *vma) afterLoad(context.Context) {} // +checklocksignore func (v *vma) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &v.mappable) stateSourceObject.Load(1, &v.off) stateSourceObject.Load(3, &v.dontfork) stateSourceObject.Load(4, &v.mlockMode) stateSourceObject.Load(5, &v.numaPolicy) stateSourceObject.Load(6, &v.numaNodemask) stateSourceObject.Load(7, &v.id) stateSourceObject.Load(8, &v.hint) stateSourceObject.Load(9, &v.lastFault) stateSourceObject.LoadValue(2, new(int), func(y any) { v.loadRealPerms(ctx, y.(int)) }) } func (p *pma) StateTypeName() string { return "pkg/sentry/mm.pma" } func (p *pma) StateFields() []string { return []string{ "file", "off", "translatePerms", "effectivePerms", "maxPerms", "needCOW", "private", "huge", } } func (p *pma) beforeSave() {} // +checklocksignore func (p *pma) StateSave(stateSinkObject state.Sink) { p.beforeSave() var fileValue string fileValue = p.saveFile() stateSinkObject.SaveValue(0, fileValue) stateSinkObject.Save(1, &p.off) stateSinkObject.Save(2, &p.translatePerms) stateSinkObject.Save(3, &p.effectivePerms) stateSinkObject.Save(4, &p.maxPerms) stateSinkObject.Save(5, &p.needCOW) stateSinkObject.Save(6, &p.private) stateSinkObject.Save(7, &p.huge) } func (p *pma) afterLoad(context.Context) {} // +checklocksignore func (p *pma) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(1, &p.off) stateSourceObject.Load(2, &p.translatePerms) stateSourceObject.Load(3, &p.effectivePerms) stateSourceObject.Load(4, &p.maxPerms) stateSourceObject.Load(5, &p.needCOW) stateSourceObject.Load(6, &p.private) stateSourceObject.Load(7, &p.huge) stateSourceObject.LoadValue(0, new(string), func(y any) { p.loadFile(ctx, y.(string)) }) } func (s *pmaSet) StateTypeName() string { return "pkg/sentry/mm.pmaSet" } func (s *pmaSet) StateFields() []string { return []string{ "root", } } func (s *pmaSet) beforeSave() {} // +checklocksignore func (s *pmaSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []pmaFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *pmaSet) afterLoad(context.Context) {} // +checklocksignore func (s *pmaSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]pmaFlatSegment), func(y any) { s.loadRoot(ctx, 
y.([]pmaFlatSegment)) }) } func (n *pmanode) StateTypeName() string { return "pkg/sentry/mm.pmanode" } func (n *pmanode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *pmanode) beforeSave() {} // +checklocksignore func (n *pmanode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *pmanode) afterLoad(context.Context) {} // +checklocksignore func (n *pmanode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (p *pmaFlatSegment) StateTypeName() string { return "pkg/sentry/mm.pmaFlatSegment" } func (p *pmaFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (p *pmaFlatSegment) beforeSave() {} // +checklocksignore func (p *pmaFlatSegment) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.Start) stateSinkObject.Save(1, &p.End) stateSinkObject.Save(2, &p.Value) } func (p *pmaFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (p *pmaFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.Start) stateSourceObject.Load(1, &p.End) stateSourceObject.Load(2, &p.Value) } func (m *SpecialMappable) StateTypeName() string { return "pkg/sentry/mm.SpecialMappable" } func (m *SpecialMappable) StateFields() []string { return []string{ "SpecialMappableRefs", "fr", "name", } } func (m *SpecialMappable) beforeSave() {} // +checklocksignore func (m *SpecialMappable) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.SpecialMappableRefs) stateSinkObject.Save(1, &m.fr) stateSinkObject.Save(2, &m.name) } // +checklocksignore func (m *SpecialMappable) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.SpecialMappableRefs) stateSourceObject.Load(1, &m.fr) stateSourceObject.Load(2, &m.name) stateSourceObject.AfterLoad(func() { m.afterLoad(ctx) }) } func (r *SpecialMappableRefs) StateTypeName() string { return "pkg/sentry/mm.SpecialMappableRefs" } func (r *SpecialMappableRefs) StateFields() []string { return []string{ "refCount", } } func (r *SpecialMappableRefs) beforeSave() {} // +checklocksignore func (r *SpecialMappableRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *SpecialMappableRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (s *vmaSet) StateTypeName() string { return "pkg/sentry/mm.vmaSet" } func (s *vmaSet) StateFields() []string { return []string{ "root", } } func (s *vmaSet) beforeSave() {} // +checklocksignore func (s *vmaSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []vmaFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, 
rootValue) } func (s *vmaSet) afterLoad(context.Context) {} // +checklocksignore func (s *vmaSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]vmaFlatSegment), func(y any) { s.loadRoot(ctx, y.([]vmaFlatSegment)) }) } func (n *vmanode) StateTypeName() string { return "pkg/sentry/mm.vmanode" } func (n *vmanode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *vmanode) beforeSave() {} // +checklocksignore func (n *vmanode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *vmanode) afterLoad(context.Context) {} // +checklocksignore func (n *vmanode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (v *vmaFlatSegment) StateTypeName() string { return "pkg/sentry/mm.vmaFlatSegment" } func (v *vmaFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (v *vmaFlatSegment) beforeSave() {} // +checklocksignore func (v *vmaFlatSegment) StateSave(stateSinkObject state.Sink) { v.beforeSave() stateSinkObject.Save(0, &v.Start) stateSinkObject.Save(1, &v.End) stateSinkObject.Save(2, &v.Value) } func (v *vmaFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (v *vmaFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &v.Start) stateSourceObject.Load(1, &v.End) stateSourceObject.Load(2, &v.Value) } func init() { state.Register((*aioManager)(nil)) state.Register((*ioResult)(nil)) state.Register((*AIOContext)(nil)) state.Register((*aioMappable)(nil)) state.Register((*aioMappableRefs)(nil)) state.Register((*ioList)(nil)) state.Register((*ioEntry)(nil)) state.Register((*MemoryManager)(nil)) state.Register((*vma)(nil)) state.Register((*pma)(nil)) state.Register((*pmaSet)(nil)) state.Register((*pmanode)(nil)) state.Register((*pmaFlatSegment)(nil)) state.Register((*SpecialMappable)(nil)) state.Register((*SpecialMappableRefs)(nil)) state.Register((*vmaSet)(nil)) state.Register((*vmanode)(nil)) state.Register((*vmaFlatSegment)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/pma.go000066400000000000000000001101141465435605700224630ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package mm import ( "fmt" "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" ) // existingPMAsLocked checks that pmas exist for all addresses in ar, and // support access of type (at, ignorePermissions). If so, it returns an // iterator to the pma containing ar.Start. Otherwise it returns a terminal // iterator. // // Preconditions: // - mm.activeMu must be locked. // - ar.Length() != 0. func (mm *MemoryManager) existingPMAsLocked(ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } } first := mm.pmas.FindSegment(ar.Start) pseg := first for pseg.Ok() { pma := pseg.ValuePtr() perms := pma.effectivePerms if ignorePermissions { perms = pma.maxPerms } if !perms.SupersetOf(at) { return pmaIterator{} } if needInternalMappings && pma.internalMappings.IsEmpty() { return pmaIterator{} } if ar.End <= pseg.End() { return first } pseg, _ = pseg.NextNonEmpty() } // Ran out of pmas before reaching ar.End. return pmaIterator{} } // existingVecPMAsLocked returns true if pmas exist for all addresses in ars, // and support access of type (at, ignorePermissions). // // Preconditions: mm.activeMu must be locked. func (mm *MemoryManager) existingVecPMAsLocked(ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) bool { for ; !ars.IsEmpty(); ars = ars.Tail() { if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() { return false } } return true } // getPMAsLocked ensures that pmas exist for all addresses in ar, and support // access of type at. It returns: // // - An iterator to the pma containing ar.Start. If no pma contains ar.Start, // the iterator is unspecified. // // - An iterator to the gap after the last pma containing an address in ar. If // pmas exist for no addresses in ar, the iterator is to a gap that begins // before ar.Start. // // - An error that is non-nil if pmas exist for only a subset of ar. // // If callerIndirectCommit is true, the caller of getPMAsLocked will shortly // commit all pages in ar without using the caller's page tables, in the same // sense as pgalloc.AllocateCallerIndirectCommit. // // Preconditions: // - mm.mappingMu must be locked. // - mm.activeMu must be locked for writing. // - ar.Length() != 0. // - vseg.Range().Contains(ar.Start). // - vmas must exist for all addresses in ar, and support accesses of type at // (i.e. permission checks must have been performed against vmas). func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType, callerIndirectCommit bool) (pmaIterator, pmaGapIterator, error) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Ok() { panic("terminal vma iterator") } if !vseg.Range().Contains(ar.Start) { panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) } } // Page-align ar so that all AddrRanges are aligned. 
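// (Illustrative example: with 4 KiB pages, an ar of [0x1001, 0x2fff) becomes
// [0x1000, 0x3000) below. alignerr is set to EFAULT only if rounding ar.End
// up would overflow the address space; in that case the rounded-down end is
// used and the error is reported only after pmas for the rest of ar have been
// obtained successfully.)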
end, ok := ar.End.RoundUp() var alignerr error if !ok { end = ar.End.RoundDown() alignerr = linuxerr.EFAULT } ar = hostarch.AddrRange{ar.Start.RoundDown(), end} pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at, callerIndirectCommit) if pend.Start() <= ar.Start { return pmaIterator{}, pend, perr } // getPMAsInternalLocked may not have returned pstart due to iterator // invalidation. if !pstart.Ok() { pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) } if perr != nil { return pstart, pend, perr } return pstart, pend, alignerr } // getVecPMAsLocked ensures that pmas exist for all addresses in ars, and // support access of type at. It returns the subset of ars for which pmas // exist. If this is not equal to ars, it returns a non-nil error explaining // why. // // Preconditions: // - mm.mappingMu must be locked. // - mm.activeMu must be locked for writing. // - vmas must exist for all addresses in ars, and support accesses of type at // (i.e. permission checks must have been performed against vmas). func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType, callerIndirectCommit bool) (hostarch.AddrRangeSeq, error) { for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { ar := arsit.Head() if ar.Length() == 0 { continue } if checkInvariants { if !ar.WellFormed() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } // Page-align ar so that all AddrRanges are aligned. end, ok := ar.End.RoundUp() var alignerr error if !ok { end = ar.End.RoundDown() alignerr = linuxerr.EFAULT } ar = hostarch.AddrRange{ar.Start.RoundDown(), end} _, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at, callerIndirectCommit) if perr != nil { return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr } if alignerr != nil { return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr } } return ars, nil } // getPMAsInternalLocked is equivalent to getPMAsLocked, with the following // exceptions: // // - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that // is, the returned iterator may be terminal, even if a pma that contains // ar.Start exists). Returning this iterator on a best-effort basis allows // callers that require it to use it when it's cheaply available, while also // avoiding the overhead of retrieving it when it's not. // // - getPMAsInternalLocked additionally requires that ar is page-aligned. // getPMAsInternalLocked is an implementation helper for getPMAsLocked and // getVecPMAsLocked; other clients should call one of those instead. 
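//
// At a high level, getPMAsInternalLocked walks the vmas covering ar and, for
// each hole in the existing pmas, either allocates anonymous memory (for vmas
// with no mappable) or calls memmap.Mappable.Translate (for mapped vmas);
// existing pmas are copied to break copy-on-write, or re-translated for
// additional permissions, as needed.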
func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType, callerIndirectCommit bool) (pmaIterator, pmaGapIterator, error) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Ok() { panic("terminal vma iterator") } if !vseg.Range().Contains(ar.Start) { panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) } } var pfdrs *pendingFileDecRefs defer func() { // must be a closure to avoid evaluating pfdrs immediately pfdrs.Cleanup() }() var unmapAR hostarch.AddrRange defer func() { mm.unmapASLocked(unmapAR) }() vma := vseg.ValuePtr() memCgID := pgalloc.MemoryCgroupIDFromContext(ctx) allocDir := pgalloc.BottomUp if uintptr(ar.Start) < atomic.LoadUintptr(&vma.lastFault) { // Detect cases where memory is accessed downwards and change memory file // allocation order to increase the chances that pages are coalesced. allocDir = pgalloc.TopDown } atomic.StoreUintptr(&vma.lastFault, uintptr(ar.Start)) // Limit the range we allocate to ar, aligned to hugepage boundaries. hugeMaskAR := hugepageAligned(ar) // The range in which we iterate vmas and pmas is still limited to ar, to // ensure that we don't allocate or COW-break a pma we don't need. pseg, pgap := mm.pmas.Find(ar.Start) pstart := pseg for { // Get pmas for this vma. vsegAR := vseg.Range().Intersect(ar) vma := vseg.ValuePtr() pmaLoop: for { switch { case pgap.Ok() && pgap.Start() < vsegAR.End: // Need a pma here. optAR := vseg.Range().Intersect(pgap.Range()) if checkInvariants { if optAR.Length() == 0 { panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap)) } } if vma.mappable == nil { // Private anonymous mappings get pmas by allocating. // The allocated range is limited to ar, expanded to // hugepage alignment. This is done even if the allocation // will not be hugepage-backed, in an attempt to reduce // application page faults (that trap into the sentry) by // creating AddressSpace mappings in advance. allocAR := optAR.Intersect(hugeMaskAR) // Don't back stacks with huge pages due to low utilization // and because they're often fragmented by copy-on-write. huge := mm.mf.HugepagesEnabled() && allocAR.IsHugePageAligned() && !vma.growsDown && !vma.isStack allocOpts := pgalloc.AllocOpts{ Kind: usage.Anonymous, MemCgID: memCgID, Mode: pgalloc.AllocateUncommitted, Huge: huge, Dir: allocDir, } // If the allocation is hugepage-backed and // callerIndirectCommit is true, the caller will commit every // allocated huge page. If the allocation is not // hugepage-backed, the caller won't commit every allocated // page since hugeMaskAR is ar expanded to huge alignment, // unless only one page in optAR falls into the huge page. if callerIndirectCommit && (huge || allocAR.Length() == hostarch.PageSize) { allocOpts.Mode = pgalloc.AllocateCallerIndirectCommit } fr, err := mm.mf.Allocate(uint64(allocAR.Length()), allocOpts) if err != nil { return pstart, pgap, err } if checkInvariants { if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) { panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr)) } } mm.addRSSLocked(allocAR) pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{ file: mm.mf, off: fr.Start, translatePerms: hostarch.AnyAccess, effectivePerms: vma.effectivePerms, maxPerms: vma.maxPerms, // Since we just allocated this memory and have the // only reference, the new pma does not need // copy-on-write. 
private: true, huge: huge, }).NextNonEmpty() pstart = pmaIterator{} // iterators invalidated } else { // Other mappings get pmas by translating. optMR := vseg.mappableRangeOf(optAR) reqAR := optAR.Intersect(ar) reqMR := vseg.mappableRangeOf(reqAR) perms := at if vma.private { // This pma will be copy-on-write; don't require write // permission, but do require read permission to // facilitate the copy. // // If at.Write is true, we will need to break // copy-on-write immediately, which occurs after // translation below. perms.Read = true perms.Write = false } ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms) if checkInvariants { if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil { panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err)) } } // Install a pma for each translation. if len(ts) == 0 { return pstart, pgap, err } pstart = pmaIterator{} // iterators invalidated for _, t := range ts { newpmaAR := vseg.addrRangeOf(t.Source) newpma := pma{ file: t.File, off: t.Offset, translatePerms: t.Perms, effectivePerms: vma.effectivePerms.Intersect(t.Perms), maxPerms: vma.maxPerms.Intersect(t.Perms), } if vma.private { newpma.effectivePerms.Write = false newpma.maxPerms.Write = false newpma.needCOW = true } mm.addRSSLocked(newpmaAR) t.File.IncRef(t.FileRange(), memCgID) // This is valid because memmap.Mappable.Translate is // required to return Translations in increasing // Translation.Source order. pseg = mm.pmas.Insert(pgap, newpmaAR, newpma) pgap = pseg.NextGap() } // The error returned by Translate is only significant if // it occurred before ar.End. if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End { return pstart, pgap, err } // Rewind pseg to the first pma inserted and continue the // loop to check if we need to break copy-on-write. pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{} continue } case pseg.Ok() && pseg.Start() < vsegAR.End: oldpma := pseg.ValuePtr() if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) { // Break copy-on-write by copying. if checkInvariants { if !oldpma.maxPerms.Read { panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma)) } } var copyAR hostarch.AddrRange if vma.effectivePerms.Execute { // The majority of copy-on-write breaks on executable // pages come from: // // - The ELF loader, which must zero out bytes on the // last page of each segment after the end of the // segment. // // - gdb's use of ptrace to insert breakpoints. // // Neither of these cases has enough spatial locality // to benefit from copying nearby pages, so if the vma // is executable, only copy the pages required. copyAR = pseg.Range().Intersect(ar) } else if vma.growsDown || vma.isStack { // In most cases, the new process will not use most of // its stack before exiting or invoking execve(); it is // especially unlikely to return very far down its call // stack, since async-signal-safety concerns in // multithreaded programs prevent the new process from // being able to do much. So only copy up to one page // before and after the pages required. 
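// Note that the bounds checks below also guard against address wraparound:
// if extending the range by one page underflows or overflows, the original
// bound is kept.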
stackMaskAR := ar if newStart := stackMaskAR.Start - hostarch.PageSize; newStart < stackMaskAR.Start { stackMaskAR.Start = newStart } if newEnd := stackMaskAR.End + hostarch.PageSize; newEnd > stackMaskAR.End { stackMaskAR.End = newEnd } copyAR = pseg.Range().Intersect(stackMaskAR) } else { // Hugepage-align the range to be copied, for the same // reasons as for private anonymous allocations. copyAR = pseg.Range().Intersect(hugeMaskAR) } // Get internal mappings from the pma to copy from. if err := pseg.getInternalMappingsLocked(); err != nil { return pstart, pseg.PrevGap(), err } // Copy contents. huge := mm.mf.HugepagesEnabled() && copyAR.IsHugePageAligned() reader := safemem.BlockSeqReader{Blocks: mm.internalMappingsLocked(pseg, copyAR)} fr, err := mm.mf.Allocate(uint64(copyAR.Length()), pgalloc.AllocOpts{ Kind: usage.Anonymous, MemCgID: memCgID, Mode: pgalloc.AllocateAndWritePopulate, Huge: huge, Dir: allocDir, ReaderFunc: reader.ReadToBlocks, }) if _, ok := err.(safecopy.BusError); ok { // If we got SIGBUS during the copy, deliver SIGBUS to // userspace (instead of SIGSEGV) if we're breaking // copy-on-write due to application page fault. err = &memmap.BusError{err} } if fr.Length() == 0 { return pstart, pseg.PrevGap(), err } // Replace the pma with a copy in the part of the address // range where copying was successful. This doesn't change // RSS. copyAR.End = copyAR.Start + hostarch.Addr(fr.Length()) if copyAR != pseg.Range() { pseg = mm.pmas.Isolate(pseg, copyAR) pstart = pmaIterator{} // iterators invalidated } oldpma = pseg.ValuePtr() unmapAR = joinAddrRanges(unmapAR, copyAR) pfdrs = appendPendingFileDecRef(pfdrs, oldpma.file, pseg.fileRange()) oldpma.file = mm.mf oldpma.off = fr.Start oldpma.translatePerms = hostarch.AnyAccess oldpma.effectivePerms = vma.effectivePerms oldpma.maxPerms = vma.maxPerms oldpma.needCOW = false oldpma.private = true oldpma.huge = huge oldpma.internalMappings = safemem.BlockSeq{} // Try to merge the pma with its neighbors. if prev := pseg.PrevSegment(); prev.Ok() { if merged := mm.pmas.Merge(prev, pseg); merged.Ok() { pseg = merged pstart = pmaIterator{} // iterators invalidated } } if next := pseg.NextSegment(); next.Ok() { if merged := mm.pmas.Merge(pseg, next); merged.Ok() { pseg = merged pstart = pmaIterator{} // iterators invalidated } } // The error returned by AllocateAndFill is only // significant if it occurred before ar.End. if err != nil && pseg.End() < ar.End { return pstart, pseg.NextGap(), err } // Ensure pseg and pgap are correct for the next iteration // of the loop. pseg, pgap = pseg.NextNonEmpty() } else if !oldpma.translatePerms.SupersetOf(at) { // Get new pmas (with sufficient permissions) by calling // memmap.Mappable.Translate again. if checkInvariants { if oldpma.private { panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma)) } } // Allow the entire pma to be replaced. optAR := pseg.Range() optMR := vseg.mappableRangeOf(optAR) reqAR := optAR.Intersect(ar) reqMR := vseg.mappableRangeOf(reqAR) perms := oldpma.translatePerms.Union(at) ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms) if checkInvariants { if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil { panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err)) } } // Remove the part of the existing pma covered by new // Translations, then insert new pmas. This doesn't change // RSS. 
if len(ts) == 0 { return pstart, pseg.PrevGap(), err } transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End} transAR := vseg.addrRangeOf(transMR) pseg = mm.pmas.Isolate(pseg, transAR) unmapAR = joinAddrRanges(unmapAR, transAR) pfdrs = appendPendingFileDecRef(pfdrs, pseg.ValuePtr().file, pseg.fileRange()) pgap = mm.pmas.Remove(pseg) pstart = pmaIterator{} // iterators invalidated for _, t := range ts { newpmaAR := vseg.addrRangeOf(t.Source) newpma := pma{ file: t.File, off: t.Offset, translatePerms: t.Perms, effectivePerms: vma.effectivePerms.Intersect(t.Perms), maxPerms: vma.maxPerms.Intersect(t.Perms), } if vma.private { newpma.effectivePerms.Write = false newpma.maxPerms.Write = false newpma.needCOW = true } t.File.IncRef(t.FileRange(), memCgID) pseg = mm.pmas.Insert(pgap, newpmaAR, newpma) pgap = pseg.NextGap() } // The error returned by Translate is only significant if // it occurred before ar.End. if err != nil && pseg.End() < ar.End { return pstart, pgap, err } // Ensure pseg and pgap are correct for the next iteration // of the loop. if pgap.Range().Length() == 0 { pseg, pgap = pgap.NextSegment(), pmaGapIterator{} } else { pseg = pmaIterator{} } } else { // We have a usable pma; continue. pseg, pgap = pseg.NextNonEmpty() } default: break pmaLoop } } // Go to the next vma. if ar.End <= vseg.End() { if pgap.Ok() { return pstart, pgap, nil } return pstart, pseg.PrevGap(), nil } vseg = vseg.NextSegment() } } func hugepageAligned(ar hostarch.AddrRange) hostarch.AddrRange { aligned := hostarch.AddrRange{ar.Start.HugeRoundDown(), ar.End} if end, ok := ar.End.HugeRoundUp(); ok { aligned.End = end } if checkInvariants { if !aligned.IsSupersetOf(ar) { panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar)) } } return aligned } // isPMACopyOnWriteLocked returns true if the contents of the pma represented // by pseg must be copied to a new private pma to be written to. // // If the pma is a copy-on-write private pma, and holds the only reference on // the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory // and update the pma to indicate that it does not require copy-on-write. // // Preconditions: // - vseg.Range().IsSupersetOf(pseg.Range()). // - mm.mappingMu must be locked. // - mm.activeMu must be locked for writing. func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool { pma := pseg.ValuePtr() if !pma.needCOW { return false } if !pma.private { return true } // If we have the only reference on private memory to be copied, just take // ownership of it instead of copying. If we do hold the only reference, // additional references can only be taken by mm.Fork(), which is excluded // by mm.activeMu, so this isn't racy. if mm.mf.HasUniqueRef(pseg.fileRange()) { pma.needCOW = false // pma.private => pma.translatePerms == hostarch.AnyAccess vma := vseg.ValuePtr() pma.effectivePerms = vma.effectivePerms pma.maxPerms = vma.maxPerms return false } return true } // Invalidate implements memmap.MappingSpace.Invalidate. 
func (mm *MemoryManager) Invalidate(ar hostarch.AddrRange, opts memmap.InvalidateOpts) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } mm.activeMu.Lock() defer mm.activeMu.Unlock() if mm.captureInvalidations { mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts}) return } mm.invalidateLocked(ar, opts.InvalidatePrivate, true) } // invalidateLocked removes pmas and AddressSpace mappings of those pmas for // addresses in ar. // // Preconditions: // - mm.activeMu must be locked for writing. // - ar.Length() != 0. // - ar must be page-aligned. func (mm *MemoryManager) invalidateLocked(ar hostarch.AddrRange, invalidatePrivate, invalidateShared bool) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) for pseg.Ok() && pseg.Start() < ar.End { pma := pseg.ValuePtr() if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) { pseg = mm.pmas.Isolate(pseg, ar) pma = pseg.ValuePtr() if !didUnmapAS { // Unmap all of ar, not just pseg.Range(), to minimize host // syscalls. AddressSpace mappings must be removed before // pma.file.DecRef(). // // Note that we do more than just ar here, and extrapolate // to the end of any previous region that we may have mapped. // This is done to ensure that lower layers can fully invalidate // intermediate pagetable pages during the unmap. var unmapAR hostarch.AddrRange if prev := pseg.PrevSegment(); prev.Ok() { unmapAR.Start = prev.End() } else { unmapAR.Start = mm.layout.MinAddr } if last := mm.pmas.LowerBoundSegment(ar.End); last.Ok() { if last.Start() < ar.End { unmapAR.End = ar.End } else { unmapAR.End = last.Start() } } else { unmapAR.End = mm.layout.MaxAddr } mm.unmapASLocked(unmapAR) didUnmapAS = true } mm.removeRSSLocked(pseg.Range()) pma.file.DecRef(pseg.fileRange()) pseg = mm.pmas.Remove(pseg).NextSegment() } else { pseg = pseg.NextSegment() } } } // Pin returns the memmap.File ranges currently mapped by addresses in ar in // mm, acquiring a reference on the returned ranges which the caller must // release by calling Unpin. If not all addresses are mapped, Pin returns a // non-nil error. Note that Pin may return both a non-empty slice of // PinnedRanges and a non-nil error. // // Pin does not prevent mapped ranges from changing, making it unsuitable for // most I/O. It should only be used in contexts that would use get_user_pages() // in the Linux kernel. // // Preconditions: // - ar.Length() != 0. // - ar must be page-aligned. func (mm *MemoryManager) Pin(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool) ([]PinnedRange, error) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } // Ensure that we have usable vmas. mm.mappingMu.RLock() vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions) if vendaddr := vend.Start(); vendaddr < ar.End { if vendaddr <= ar.Start { mm.mappingMu.RUnlock() return nil, verr } ar.End = vendaddr } // Ensure that we have usable pmas. 
mm.activeMu.Lock() pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at, false /* callerIndirectCommit */) mm.mappingMu.RUnlock() if pendaddr := pend.Start(); pendaddr < ar.End { if pendaddr <= ar.Start { mm.activeMu.Unlock() return nil, perr } ar.End = pendaddr } memCgID := pgalloc.MemoryCgroupIDFromContext(ctx) // Gather pmas. var prs []PinnedRange for pseg.Ok() && pseg.Start() < ar.End { psar := pseg.Range().Intersect(ar) f := pseg.ValuePtr().file fr := pseg.fileRangeOf(psar) f.IncRef(fr, memCgID) prs = append(prs, PinnedRange{ Source: psar, File: f, Offset: fr.Start, }) pseg = pseg.NextSegment() } mm.activeMu.Unlock() // Return the first error in order of progress through ar. if perr != nil { return prs, perr } return prs, verr } // PinnedRanges are returned by MemoryManager.Pin. type PinnedRange struct { // Source is the corresponding range of addresses. Source hostarch.AddrRange // File is the mapped file. File memmap.File // Offset is the offset into File at which this PinnedRange begins. Offset uint64 } // FileRange returns the memmap.File offsets mapped by pr. func (pr PinnedRange) FileRange() memmap.FileRange { return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())} } // Unpin releases the reference held by prs. func Unpin(prs []PinnedRange) { for i := range prs { prs[i].File.DecRef(prs[i].FileRange()) } } // movePMAsLocked moves all pmas in oldAR to newAR. // // Preconditions: // - mm.activeMu must be locked for writing. // - oldAR.Length() != 0. // - oldAR.Length() <= newAR.Length(). // - !oldAR.Overlaps(newAR). // - mm.pmas.IsEmptyRange(newAR). // - oldAR and newAR must be page-aligned. func (mm *MemoryManager) movePMAsLocked(oldAR, newAR hostarch.AddrRange) { if checkInvariants { if !oldAR.WellFormed() || oldAR.Length() == 0 || !oldAR.IsPageAligned() { panic(fmt.Sprintf("invalid oldAR: %v", oldAR)) } if !newAR.WellFormed() || newAR.Length() == 0 || !newAR.IsPageAligned() { panic(fmt.Sprintf("invalid newAR: %v", newAR)) } if oldAR.Length() > newAR.Length() { panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR)) } if oldAR.Overlaps(newAR) { panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR)) } // mm.pmas.IsEmptyRange is checked by mm.pmas.Insert. } type movedPMA struct { oldAR hostarch.AddrRange pma pma } var movedPMAs []movedPMA pseg := mm.pmas.LowerBoundSegment(oldAR.Start) for pseg.Ok() && pseg.Start() < oldAR.End { pseg = mm.pmas.Isolate(pseg, oldAR) movedPMAs = append(movedPMAs, movedPMA{ oldAR: pseg.Range(), pma: pseg.Value(), }) pseg = mm.pmas.Remove(pseg).NextSegment() // No RSS change is needed since we're re-inserting the same pmas // below. } off := newAR.Start - oldAR.Start pgap := mm.pmas.FindGap(newAR.Start) for i := range movedPMAs { mpma := &movedPMAs[i] pmaNewAR := hostarch.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off} pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap() } mm.unmapASLocked(oldAR) } // internalMappingsLocked returns cached internal mappings for addresses in ar. // // Preconditions: // - mm.activeMu must be locked. // - While mm.activeMu was locked, a call to // existingPMAsLocked(needInternalMappings=true) succeeded for all // addresses in ar. // - ar.Length() != 0. // - pseg.Range().Contains(ar.Start). 
func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) safemem.BlockSeq { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !pseg.Range().Contains(ar.Start) { panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) } } if ar.End <= pseg.End() { // Since only one pma is involved, we can use pma.internalMappings // directly, avoiding a slice allocation. offset := uint64(ar.Start - pseg.Start()) return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length())) } var ims []safemem.Block for { pr := pseg.Range().Intersect(ar) for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() { ims = append(ims, pims.Head()) } if ar.End <= pseg.End() { break } pseg = pseg.NextSegment() } return safemem.BlockSeqFromSlice(ims) } // vecInternalMappingsLocked returns cached internal mappings for addresses in // ars. // // Preconditions: // - mm.activeMu must be locked. // - While mm.activeMu was locked, a call to // existingVecPMAsLocked(needInternalMappings=true) succeeded for all // addresses in ars. func (mm *MemoryManager) vecInternalMappingsLocked(ars hostarch.AddrRangeSeq) safemem.BlockSeq { var ims []safemem.Block for ; !ars.IsEmpty(); ars = ars.Tail() { ar := ars.Head() if ar.Length() == 0 { continue } for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() { ims = append(ims, pims.Head()) } } return safemem.BlockSeqFromSlice(ims) } // addRSSLocked updates the current and maximum resident set size of a // MemoryManager to reflect the insertion of a pma at ar. // // Preconditions: mm.activeMu must be locked for writing. func (mm *MemoryManager) addRSSLocked(ar hostarch.AddrRange) { mm.curRSS += uint64(ar.Length()) if mm.curRSS > mm.maxRSS { mm.maxRSS = mm.curRSS } } // removeRSSLocked updates the current resident set size of a MemoryManager to // reflect the removal of a pma at ar. // // Preconditions: mm.activeMu must be locked for writing. func (mm *MemoryManager) removeRSSLocked(ar hostarch.AddrRange) { mm.curRSS -= uint64(ar.Length()) } // pmaSetFunctions implements segment.Functions for pmaSet. type pmaSetFunctions struct{} func (pmaSetFunctions) MinKey() hostarch.Addr { return 0 } func (pmaSetFunctions) MaxKey() hostarch.Addr { return ^hostarch.Addr(0) } func (pmaSetFunctions) ClearValue(pma *pma) { pma.file = nil pma.internalMappings = safemem.BlockSeq{} } func (pmaSetFunctions) Merge(ar1 hostarch.AddrRange, pma1 pma, ar2 hostarch.AddrRange, pma2 pma) (pma, bool) { if pma1.file != pma2.file || pma1.off+uint64(ar1.Length()) != pma2.off || pma1.translatePerms != pma2.translatePerms || pma1.effectivePerms != pma2.effectivePerms || pma1.maxPerms != pma2.maxPerms || pma1.needCOW != pma2.needCOW || pma1.private != pma2.private || pma1.huge != pma2.huge { return pma{}, false } // Discard internal mappings instead of trying to merge them, since merging // them requires an allocation and getting them again from the // memmap.File might not. 
pma1.internalMappings = safemem.BlockSeq{} return pma1, true } func (pmaSetFunctions) Split(ar hostarch.AddrRange, p pma, split hostarch.Addr) (pma, pma) { newlen1 := uint64(split - ar.Start) p2 := p p2.off += newlen1 if !p.internalMappings.IsEmpty() { p.internalMappings = p.internalMappings.TakeFirst64(newlen1) p2.internalMappings = p2.internalMappings.DropFirst64(newlen1) } return p, p2 } // findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do // so by scanning linearly backward from pgap. // // Preconditions: // - mm.activeMu must be locked. // - addr <= pgap.Start(). func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr hostarch.Addr, pgap pmaGapIterator) pmaIterator { if checkInvariants { if !pgap.Ok() { panic("terminal pma iterator") } if addr > pgap.Start() { panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start())) } } // Optimistically check if pgap.PrevSegment() is the PMA we're looking for, // which is the case if findOrSeekPrevUpperBoundPMA is called to find the // start of a range containing only a single PMA. if pseg := pgap.PrevSegment(); pseg.Start() <= addr { return pseg } return mm.pmas.UpperBoundSegment(addr) } // getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is // non-empty. // // Preconditions: mm.activeMu must be locked for writing. func (pseg pmaIterator) getInternalMappingsLocked() error { pma := pseg.ValuePtr() if pma.internalMappings.IsEmpty() { // This must use maxPerms (instead of perms) because some permission // constraints are only visible to vmas; for example, mappings of // read-only files have vma.maxPerms.Write unset, but this may not be // visible to the memmap.Mappable. perms := pma.maxPerms // We will never execute application code through an internal mapping. perms.Execute = false ims, err := pma.file.MapInternal(pseg.fileRange(), perms) if err != nil { return err } pma.internalMappings = ims } return nil } func (pseg pmaIterator) fileRange() memmap.FileRange { return pseg.fileRangeOf(pseg.Range()) } // Preconditions: // - pseg.Range().IsSupersetOf(ar). // - ar.Length != 0. func (pseg pmaIterator) fileRangeOf(ar hostarch.AddrRange) memmap.FileRange { if checkInvariants { if !pseg.Ok() { panic("terminal pma iterator") } if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !pseg.Range().IsSupersetOf(ar) { panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range())) } } pma := pseg.ValuePtr() pstart := pseg.Start() return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} } // joinAddrRanges returns the smallest hostarch.AddrRange that is a superset of // both ar1 and ar2. If either ar1 or ar2 have length 0, joinAddrRanges returns // the other range. If both ar1 and ar2 have length 0, joinAddrRanges returns // an unspecified range with length 0. func joinAddrRanges(ar1, ar2 hostarch.AddrRange) hostarch.AddrRange { if ar1.Length() == 0 { return ar2 } if ar2.Length() == 0 { return ar1 } ar := ar1 if ar.Start > ar2.Start { ar.Start = ar2.Start } if ar.End < ar2.End { ar.End = ar2.End } if checkInvariants { if !ar.IsSupersetOf(ar1) || !ar.IsSupersetOf(ar2) { panic(fmt.Sprintf("%v is not a superset of both %v and %v", ar, ar1, ar2)) } } return ar } // pendingFileDecRefs accumulates released memmap.FileRange references so that // calls to memmap.File.DecRef() can occur without holding locks. 
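//
// In getPMAsInternalLocked above, references are accumulated with
// appendPendingFileDecRef while mm.activeMu is held and released by a
// deferred Cleanup; the deferred unmapASLocked runs first (defers run in
// LIFO order), satisfying Cleanup's precondition that no AddressSpace
// ranges are awaiting unmapping.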
type pendingFileDecRefs struct { slice []pendingFileDecRef } type pendingFileDecRef struct { file memmap.File fr memmap.FileRange } var pendingFileDecRefsPool = sync.Pool{ New: func() any { return &pendingFileDecRefs{} }, } func appendPendingFileDecRef(pfdrs *pendingFileDecRefs, file memmap.File, fr memmap.FileRange) *pendingFileDecRefs { if pfdrs == nil { pfdrs = pendingFileDecRefsPool.Get().(*pendingFileDecRefs) } pfdrs.slice = append(pfdrs.slice, pendingFileDecRef{file, fr}) return pfdrs } // Cleanup releases all references accumulated by pfdrs and releases ownership // of pfdrs. pfdrs may be nil. // // Preconditions: No AddressSpace ranges may be awaiting unmapping (since such // ranges may refer to memmap.File pages that will be dropped.) func (pfdrs *pendingFileDecRefs) Cleanup() { if pfdrs == nil { return } for i := range pfdrs.slice { pfdr := &pfdrs.slice[i] pfdr.file.DecRef(pfdr.fr) pfdr.file = nil // allow GC } pfdrs.slice = pfdrs.slice[:0] pendingFileDecRefsPool.Put(pfdrs) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/pma_set.go000066400000000000000000002023311465435605700233410ustar00rootroot00000000000000package mm import ( __generics_imported0 "gvisor.dev/gvisor/pkg/hostarch" ) import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const pmatrackGaps = 0 var _ = uint8(pmatrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type pmadynamicGap [pmatrackGaps]__generics_imported0.Addr // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *pmadynamicGap) Get() __generics_imported0.Addr { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *pmadynamicGap) Set(v __generics_imported0.Addr) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. pmaminDegree = 8 pmamaxDegree = 2 * pmaminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type pmaSet struct { root pmanode `state:".([]pmaFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *pmaSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *pmaSet) IsEmptyRange(r __generics_imported0.AddrRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. 
func (s *pmaSet) Span() __generics_imported0.Addr { var sz __generics_imported0.Addr for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *pmaSet) SpanRange(r __generics_imported0.AddrRange) __generics_imported0.Addr { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz __generics_imported0.Addr for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *pmaSet) FirstSegment() pmaIterator { if s.root.nrSegments == 0 { return pmaIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *pmaSet) LastSegment() pmaIterator { if s.root.nrSegments == 0 { return pmaIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *pmaSet) FirstGap() pmaGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return pmaGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *pmaSet) LastGap() pmaGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return pmaGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *pmaSet) Find(key __generics_imported0.Addr) (pmaIterator, pmaGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return pmaIterator{n, i}, pmaGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return pmaIterator{}, pmaGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *pmaSet) FindSegment(key __generics_imported0.Addr) pmaIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *pmaSet) LowerBoundSegment(min __generics_imported0.Addr) pmaIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *pmaSet) UpperBoundSegment(max __generics_imported0.Addr) pmaIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *pmaSet) FindGap(key __generics_imported0.Addr) pmaGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. 
func (s *pmaSet) LowerBoundGap(min __generics_imported0.Addr) pmaGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *pmaSet) UpperBoundGap(max __generics_imported0.Addr) pmaGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *pmaSet) FirstLargeEnoughGap(minSize __generics_imported0.Addr) pmaGapIterator { if pmatrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *pmaSet) LastLargeEnoughGap(minSize __generics_imported0.Addr) pmaGapIterator { if pmatrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *pmaSet) LowerBoundLargeEnoughGap(min, minSize __generics_imported0.Addr) pmaGapIterator { if pmatrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *pmaSet) UpperBoundLargeEnoughGap(max, minSize __generics_imported0.Addr) pmaGapIterator { if pmatrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. 
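//
// As an illustrative sketch (with r and val assumed to be in scope), this is
// essentially what InsertRange below does after locating the gap itself:
//
//	seg, gap := s.Find(r.Start)
//	if !seg.Ok() && r.End <= gap.End() {
//		seg = s.Insert(gap, r, val)
//	}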
func (s *pmaSet) Insert(gap pmaGapIterator, r __generics_imported0.AddrRange, val pma) pmaIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (pmaSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := pmatrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (pmaSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (pmaSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := pmatrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *pmaSet) InsertWithoutMerging(gap pmaGapIterator, r __generics_imported0.AddrRange, val pma) pmaIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *pmaSet) InsertWithoutMergingUnchecked(gap pmaGapIterator, r __generics_imported0.AddrRange, val pma) pmaIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := pmatrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return pmaIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. // // InsertRange searches the set to find the gap to insert into. 
If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *pmaSet) InsertRange(r __generics_imported0.AddrRange, val pma) pmaIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *pmaSet) InsertWithoutMergingRange(r __generics_imported0.AddrRange, val pma) pmaIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *pmaSet) TryInsertRange(r __generics_imported0.AddrRange, val pma) pmaIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return pmaIterator{} } if gap.End() < r.End { return pmaIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. 
func (s *pmaSet) TryInsertWithoutMergingRange(r __generics_imported0.AddrRange, val pma) pmaIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return pmaIterator{} } if gap.End() < r.End { return pmaIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *pmaSet) Remove(seg pmaIterator) pmaGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if pmatrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) pmaSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if pmatrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(pmaGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *pmaSet) RemoveAll() { s.root = pmanode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *pmaSet) RemoveRange(r __generics_imported0.AddrRange) pmaGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *pmaSet) RemoveFullRange(r __generics_imported0.AddrRange) pmaGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *pmaSet) Merge(first, second pmaIterator) pmaIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. Otherwise, MergeUnchecked returns a terminal // iterator. 
// // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *pmaSet) MergeUnchecked(first, second pmaIterator) pmaIterator { if first.End() == second.Start() { if mval, ok := (pmaSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return pmaIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *pmaSet) MergePrev(seg pmaIterator) pmaIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *pmaSet) MergeNext(seg pmaIterator) pmaIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *pmaSet) Unisolate(seg pmaIterator) pmaIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *pmaSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. 
// // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *pmaSet) MergeInsideRange(r __generics_imported0.AddrRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *pmaSet) MergeOutsideRange(r __generics_imported0.AddrRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *pmaSet) Split(seg pmaIterator, split __generics_imported0.Addr) (pmaIterator, pmaIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *pmaSet) SplitUnchecked(seg pmaIterator, split __generics_imported0.Addr) (pmaIterator, pmaIterator) { val1, val2 := (pmaSetFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.AddrRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first.
// // Preconditions: start < seg.End(). func (s *pmaSet) SplitBefore(seg pmaIterator, start __generics_imported0.Addr) pmaIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *pmaSet) SplitAfter(seg pmaIterator, end __generics_imported0.Addr) pmaIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *pmaSet) Isolate(seg pmaIterator, r __generics_imported0.AddrRange) pmaIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *pmaSet) LowerBoundSegmentSplitBefore(min __generics_imported0.Addr) pmaIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *pmaSet) UpperBoundSegmentSplitAfter(max __generics_imported0.Addr) pmaIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. 
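//
// As an illustrative sketch, the total size of all segments intersecting r
// could be computed much as SpanRange does:
//
//	var total __generics_imported0.Addr
//	s.VisitRange(r, func(seg pmaIterator) bool {
//		total += seg.Range().Intersect(r).Length()
//		return true
//	})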
func (s *pmaSet) VisitRange(r __generics_imported0.AddrRange, f func(seg pmaIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *pmaSet) VisitFullRange(r __generics_imported0.AddrRange, f func(seg pmaIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *pmaSet) MutateRange(r __generics_imported0.AddrRange, f func(seg pmaIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *pmaSet) MutateFullRange(r __generics_imported0.AddrRange, f func(seg pmaIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type pmanode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *pmanode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. 
maxGap pmadynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [pmamaxDegree - 1]__generics_imported0.AddrRange values [pmamaxDegree - 1]pma children [pmamaxDegree]*pmanode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *pmanode) firstSegment() pmaIterator { for n.hasChildren { n = n.children[0] } return pmaIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *pmanode) lastSegment() pmaIterator { for n.hasChildren { n = n.children[n.nrSegments] } return pmaIterator{n, n.nrSegments - 1} } func (n *pmanode) prevSibling() *pmanode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *pmanode) nextSibling() *pmanode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *pmanode) rebalanceBeforeInsert(gap pmaGapIterator) pmaGapIterator { if n.nrSegments < pmamaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &pmanode{ nrSegments: pmaminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &pmanode{ nrSegments: pmaminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:pmaminDegree-1], n.keys[:pmaminDegree-1]) copy(left.values[:pmaminDegree-1], n.values[:pmaminDegree-1]) copy(right.keys[:pmaminDegree-1], n.keys[pmaminDegree:]) copy(right.values[:pmaminDegree-1], n.values[pmaminDegree:]) n.keys[0], n.values[0] = n.keys[pmaminDegree-1], n.values[pmaminDegree-1] pmazeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:pmaminDegree], n.children[:pmaminDegree]) copy(right.children[:pmaminDegree], n.children[pmaminDegree:]) pmazeroNodeSlice(n.children[2:]) for i := 0; i < pmaminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if pmatrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < pmaminDegree { return pmaGapIterator{left, gap.index} } return pmaGapIterator{right, gap.index - pmaminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[pmaminDegree-1], n.values[pmaminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &pmanode{ nrSegments: pmaminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ copy(sibling.keys[:pmaminDegree-1], n.keys[pmaminDegree:]) copy(sibling.values[:pmaminDegree-1], n.values[pmaminDegree:]) pmazeroValueSlice(n.values[pmaminDegree-1:]) if n.hasChildren { copy(sibling.children[:pmaminDegree], 
n.children[pmaminDegree:]) pmazeroNodeSlice(n.children[pmaminDegree:]) for i := 0; i < pmaminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = pmaminDegree - 1 if pmatrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < pmaminDegree { return gap } return pmaGapIterator{sibling, gap.index - pmaminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *pmanode) rebalanceAfterRemove(gap pmaGapIterator) pmaGapIterator { for { if n.nrSegments >= pmaminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= pmaminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] pmaSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if pmatrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return pmaGapIterator{n, 0} } if gap.node == n { return pmaGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= pmaminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) pmaSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if pmatrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return pmaGapIterator{n, n.nrSegments} } return pmaGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], 
right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return pmaGapIterator{p, gap.index} } if gap.node == right { return pmaGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *pmanode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = pmaGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) pmaSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if pmatrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *pmanode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *pmanode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. func (n *pmanode) calculateMaxGapLeaf() __generics_imported0.Addr { max := pmaGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (pmaGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. 
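//
// Editor's note: illustrative sketch, not part of the original source. It
// restates what calculateMaxGapLeaf (above) computes: a leaf holding
// nrSegments sorted, non-overlapping ranges has nrSegments+1 gaps (including
// the two gaps shared with the node's bounds), and maxGap is the largest of
// them. The standalone addr/rng types are hypothetical stand-ins for the
// generated Addr/AddrRange types.
//
//	type addr = uint64
//	type rng struct{ start, end addr } // [start, end)
//
//	// leafMaxGap returns the largest gap among keys, bounded by [lo, hi).
//	func leafMaxGap(lo, hi addr, keys []rng) addr {
//		prev, maxGap := lo, addr(0)
//		for _, k := range keys {
//			if g := k.start - prev; g > maxGap {
//				maxGap = g
//			}
//			prev = k.end
//		}
//		if g := hi - prev; g > maxGap {
//			maxGap = g
//		}
//		return maxGap
//	}
//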
func (n *pmanode) calculateMaxGapInternal() __generics_imported0.Addr { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *pmanode) searchFirstLargeEnoughGap(minSize __generics_imported0.Addr) pmaGapIterator { if n.maxGap.Get() < minSize { return pmaGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := pmaGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *pmanode) searchLastLargeEnoughGap(minSize __generics_imported0.Addr) pmaGapIterator { if n.maxGap.Get() < minSize { return pmaGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := pmaGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type pmaIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *pmanode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg pmaIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg pmaIterator) Range() __generics_imported0.AddrRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg pmaIterator) Start() __generics_imported0.Addr { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg pmaIterator) End() __generics_imported0.Addr { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. // - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg pmaIterator) SetRangeUnchecked(r __generics_imported0.AddrRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. 
If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. func (seg pmaIterator) SetRange(r __generics_imported0.AddrRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg pmaIterator) SetStartUnchecked(start __generics_imported0.Addr) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg pmaIterator) SetStart(start __generics_imported0.Addr) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg pmaIterator) SetEndUnchecked(end __generics_imported0.Addr) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg pmaIterator) SetEnd(end __generics_imported0.Addr) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg pmaIterator) Value() pma { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg pmaIterator) ValuePtr() *pma { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. func (seg pmaIterator) SetValue(val pma) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. 
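//
// Editor's note: illustrative sketch, not part of the original source. It
// shows the discipline implied by the checked mutators above: a segment's end
// may only grow into the free space before the next segment, so a caller
// clamps against the following gap rather than relying on SetEnd's panic. The
// helper name is hypothetical.
//
//	// growSegmentEnd extends seg's end towards newEnd, clamped so that it
//	// never overlaps the following segment.
//	func growSegmentEnd(seg pmaIterator, newEnd __generics_imported0.Addr) {
//		if limit := seg.NextGap().End(); newEnd > limit {
//			newEnd = limit
//		}
//		if newEnd > seg.End() {
//			seg.SetEnd(newEnd)
//		}
//	}
//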
func (seg pmaIterator) PrevSegment() pmaIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return pmaIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return pmaIterator{} } return pmasegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg pmaIterator) NextSegment() pmaIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return pmaIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return pmaIterator{} } return pmasegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg pmaIterator) PrevGap() pmaGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return pmaGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg pmaIterator) NextGap() pmaGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return pmaGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg pmaIterator) PrevNonEmpty() (pmaIterator, pmaGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, pmaGapIterator{} } return pmaIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg pmaIterator) NextNonEmpty() (pmaIterator, pmaGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, pmaGapIterator{} } return pmaIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type pmaGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *pmanode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. 
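//
// Editor's note: illustrative sketch, not part of the original source. It
// shows how NextNonEmpty is meant to be consumed: exactly one of the returned
// iterators is non-terminal, so a caller can walk segments and the non-empty
// gaps that follow them in a single ascending pass. The function name is
// hypothetical.
//
//	func walkSegmentsAndGaps(s *pmaSet, onSeg func(pmaIterator), onGap func(pmaGapIterator)) {
//		seg := s.FirstSegment()
//		for seg.Ok() {
//			onSeg(seg)
//			next, gap := seg.NextNonEmpty()
//			if gap.Ok() {
//				onGap(gap) // non-empty gap after seg (possibly the trailing gap)
//				next = gap.NextSegment()
//			}
//			seg = next
//		}
//	}
//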
func (gap pmaGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap pmaGapIterator) Range() __generics_imported0.AddrRange { return __generics_imported0.AddrRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap pmaGapIterator) Start() __generics_imported0.Addr { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return pmaSetFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap pmaGapIterator) End() __generics_imported0.Addr { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return pmaSetFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap pmaGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap pmaGapIterator) PrevSegment() pmaIterator { return pmasegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap pmaGapIterator) NextSegment() pmaIterator { return pmasegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap pmaGapIterator) PrevGap() pmaGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return pmaGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap pmaGapIterator) NextGap() pmaGapIterator { seg := gap.NextSegment() if !seg.Ok() { return pmaGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap pmaGapIterator) NextLargeEnoughGap(minSize __generics_imported0.Addr) pmaGapIterator { if pmatrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. 
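//
// Editor's note: illustrative sketch, not part of the original source. It
// shows the first-fit search that the maxGap bookkeeping enables: starting
// from the first gap in the set, jump directly to the first gap large enough
// for an allocation instead of scanning every gap. The helper name is
// hypothetical and assumes the set was instantiated with trackGaps enabled.
//
//	// findFirstFit returns the start of the first gap of at least length
//	// bytes, or false if no such gap exists.
//	func findFirstFit(s *pmaSet, length __generics_imported0.Addr) (__generics_imported0.Addr, bool) {
//		gap := s.FirstGap()
//		if gap.Range().Length() < length {
//			// NextLargeEnoughGap does not consider the current gap itself.
//			gap = gap.NextLargeEnoughGap(length)
//		}
//		if !gap.Ok() {
//			return 0, false
//		}
//		return gap.Start(), true
//	}
//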
func (gap pmaGapIterator) nextLargeEnoughGapHelper(minSize __generics_imported0.Addr) pmaGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return pmaGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap pmaGapIterator) PrevLargeEnoughGap(minSize __generics_imported0.Addr) pmaGapIterator { if pmatrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap pmaGapIterator) prevLargeEnoughGapHelper(minSize __generics_imported0.Addr) pmaGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return pmaGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func pmasegmentBeforePosition(n *pmanode, i int) pmaIterator { for i == 0 { if n.parent == nil { return pmaIterator{} } n, i = n.parent, n.parentIndex } return pmaIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func pmasegmentAfterPosition(n *pmanode, i int) pmaIterator { for i == n.nrSegments { if n.parent == nil { return pmaIterator{} } n, i = n.parent, n.parentIndex } return pmaIterator{n, i} } func pmazeroValueSlice(slice []pma) { for i := range slice { pmaSetFunctions{}.ClearValue(&slice[i]) } } func pmazeroNodeSlice(slice []*pmanode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. func (s *pmaSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. 
func (n *pmanode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *pmanode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if pmatrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type pmaFlatSegment struct { Start __generics_imported0.Addr End __generics_imported0.Addr Value pma } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *pmaSet) ExportSlice() []pmaFlatSegment { var fs []pmaFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, pmaFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *pmaSet) ImportSlice(fs []pmaFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := __generics_imported0.AddrRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
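//
// Editor's note: illustrative sketch, not part of the original source. It
// shows the intended round trip through the flat representation above, which
// is also how the set is checkpointed (see saveRoot/loadRoot below). The
// function name is hypothetical.
//
//	// copySet clones src into a fresh set via its flat representation.
//	func copySet(src *pmaSet) (*pmaSet, error) {
//		var dst pmaSet // zero value is an empty set, as ImportSlice requires
//		if err := dst.ImportSlice(src.ExportSlice()); err != nil {
//			return nil, err
//		}
//		return &dst, nil
//	}
//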
func (s *pmaSet) segmentTestCheck(expectedSegments int, segFunc func(int, __generics_imported0.AddrRange, pma) error) error { havePrev := false prev := __generics_imported0.Addr(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *pmaSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *pmaSet) saveRoot() []pmaFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *pmaSet) loadRoot(_ context.Context, fs []pmaFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/procfs.go000066400000000000000000000227621465435605700232150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "bytes" "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/memmap" ) const ( // devMinorBits is the number of minor bits in a device number. Linux: // include/linux/kdev_t.h:MINORBITS devMinorBits = 20 vsyscallEnd = hostarch.Addr(0xffffffffff601000) vsyscallMapsEntry = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n" vsyscallSmapsEntry = vsyscallMapsEntry + "Size: 4 kB\n" + "Rss: 0 kB\n" + "Pss: 0 kB\n" + "Shared_Clean: 0 kB\n" + "Shared_Dirty: 0 kB\n" + "Private_Clean: 0 kB\n" + "Private_Dirty: 0 kB\n" + "Referenced: 0 kB\n" + "Anonymous: 0 kB\n" + "AnonHugePages: 0 kB\n" + "Shared_Hugetlb: 0 kB\n" + "Private_Hugetlb: 0 kB\n" + "Swap: 0 kB\n" + "SwapPss: 0 kB\n" + "KernelPageSize: 4 kB\n" + "MMUPageSize: 4 kB\n" + "Locked: 0 kB\n" + "VmFlags: rd ex \n" ) // MapsCallbackFuncForBuffer creates a /proc/[pid]/maps entry including the trailing newline. func (mm *MemoryManager) MapsCallbackFuncForBuffer(buf *bytes.Buffer) MapsCallbackFunc { return func(start, end hostarch.Addr, permissions hostarch.AccessType, private string, offset uint64, devMajor, devMinor uint32, inode uint64, path string) { // Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() => // stack_guard_page_start(). 
lineLen, err := fmt.Fprintf(buf, "%08x-%08x %s%s %08x %02x:%02x %d ", start, end, permissions, private, offset, devMajor, devMinor, inode) if err != nil { log.Warningf("Failed to write to buffer with error: %v", err) return } if path != "" { // Per linux, we pad until the 74th character. for pad := 73 - lineLen; pad > 0; pad-- { buf.WriteByte(' ') // never returns a non-nil error } buf.WriteString(path) // never returns a non-nil error } buf.WriteByte('\n') // never returns a non-nil error } } // ReadMapsDataInto is called by fsimpl/proc.mapsData.Generate to // implement /proc/[pid]/maps. func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, fn MapsCallbackFunc) { // FIXME(b/235153601): Need to replace RLockBypass with RLock // after fixing b/235153601. mm.mappingMu.RLockBypass() defer mm.mappingMu.RUnlockBypass() var start hostarch.Addr for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { mm.appendVMAMapsEntryLocked(ctx, vseg, fn) } // We always emulate vsyscall, so advertise it here. Everything about a // vsyscall region is static, so just hard code the maps entry since we // don't have a real vma backing it. The vsyscall region is at the end of // the virtual address space so nothing should be mapped after it (if // something is really mapped in the tiny ~10 MiB segment afterwards, we'll // get the sorting on the maps file wrong at worst; but that's not possible // on any current platform). // // Artificially adjust the seqfile handle so we only output vsyscall entry once. if start != vsyscallEnd { fn(hostarch.Addr(0xffffffffff600000), hostarch.Addr(0xffffffffff601000), hostarch.ReadExecute, "p", 0, 0, 0, 0, "[vsyscall]") } } // vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by // vseg, including the trailing newline. // // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { var b bytes.Buffer mm.appendVMAMapsEntryLocked(ctx, vseg, mm.MapsCallbackFuncForBuffer(&b)) return b.Bytes() } // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaIterator, fn MapsCallbackFunc) { vma := vseg.ValuePtr() private := "p" if !vma.private { private = "s" } var dev, ino uint64 if vma.id != nil { dev = vma.id.DeviceID() ino = vma.id.InodeID() } devMajor := uint32(dev >> devMinorBits) devMinor := uint32(dev & ((1 << devMinorBits) - 1)) // Figure out our filename or hint. var path string if vma.hint != "" { path = vma.hint } else if vma.id != nil { // FIXME(jamieliu): We are holding mm.mappingMu here, which is // consistent with Linux's holding mmap_sem in // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path(). // However, it's not clear that fs.File.MappedName() is actually // consistent with this lock order. path = vma.id.MappedName(ctx) } fn(vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino, path) } // ReadSmapsDataInto is called by fsimpl/proc.smapsData.Generate to // implement /proc/[pid]/smaps. func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffer) { // FIXME(b/235153601): Need to replace RLockBypass with RLock // after fixing b/235153601. 
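//
// Editor's note: illustrative sketch, not part of the original source. It
// reproduces the maps-entry layout written by MapsCallbackFuncForBuffer above
// as a standalone helper, using plain integers in place of the sentry types;
// the function name is hypothetical.
//
//	// formatMapsLine renders one /proc/[pid]/maps line, e.g.
//	// "00400000-00452000 r-xp 00000000 08:02 173521       /usr/bin/foo".
//	func formatMapsLine(start, end uint64, perms, private string, off uint64, devMajor, devMinor uint32, ino uint64, path string) string {
//		line := fmt.Sprintf("%08x-%08x %s%s %08x %02x:%02x %d ", start, end, perms, private, off, devMajor, devMinor, ino)
//		if path != "" {
//			for len(line) < 73 { // pad so the path starts at column 74
//				line += " "
//			}
//			line += path
//		}
//		return line + "\n"
//	}
//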
mm.mappingMu.RLockBypass() defer mm.mappingMu.RUnlockBypass() var start hostarch.Addr for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf) } // We always emulate vsyscall, so advertise it here. See // ReadMapsSeqFileData for additional commentary. if start != vsyscallEnd { buf.WriteString(vsyscallSmapsEntry) } } // vmaSmapsEntryLocked returns a /proc/[pid]/smaps entry for the vma iterated // by vseg, including the trailing newline. // // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { var b bytes.Buffer mm.vmaSmapsEntryIntoLocked(ctx, vseg, &b) return b.Bytes() } func (mm *MemoryManager) vmaSmapsEntryIntoLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) { mm.appendVMAMapsEntryLocked(ctx, vseg, mm.MapsCallbackFuncForBuffer(b)) vma := vseg.ValuePtr() // We take mm.activeMu here in each call to vmaSmapsEntryLocked, instead of // requiring it to be locked as a precondition, to reduce the latency // impact of reading /proc/[pid]/smaps on concurrent performance-sensitive // operations requiring activeMu for writing like faults. mm.activeMu.RLock() var rss uint64 var anon uint64 vsegAR := vseg.Range() for pseg := mm.pmas.LowerBoundSegment(vsegAR.Start); pseg.Ok() && pseg.Start() < vsegAR.End; pseg = pseg.NextSegment() { psegAR := pseg.Range().Intersect(vsegAR) size := uint64(psegAR.Length()) rss += size if pseg.ValuePtr().private { anon += size } } mm.activeMu.RUnlock() fmt.Fprintf(b, "Size: %8d kB\n", vseg.Range().Length()/1024) fmt.Fprintf(b, "Rss: %8d kB\n", rss/1024) // Currently we report PSS = RSS, i.e. we pretend each page mapped by a pma // is only mapped by that pma. This avoids having to query memmap.Mappables // for reference count information on each page. As a corollary, all pages // are accounted as "private" whether or not the vma is private; compare // Linux's fs/proc/task_mmu.c:smaps_account(). fmt.Fprintf(b, "Pss: %8d kB\n", rss/1024) fmt.Fprintf(b, "Shared_Clean: %8d kB\n", 0) fmt.Fprintf(b, "Shared_Dirty: %8d kB\n", 0) // Pretend that all pages are dirty if the vma is writable, and clean otherwise. clean := rss if vma.effectivePerms.Write { clean = 0 } fmt.Fprintf(b, "Private_Clean: %8d kB\n", clean/1024) fmt.Fprintf(b, "Private_Dirty: %8d kB\n", (rss-clean)/1024) // Pretend that all pages are "referenced" (recently touched). fmt.Fprintf(b, "Referenced: %8d kB\n", rss/1024) fmt.Fprintf(b, "Anonymous: %8d kB\n", anon/1024) // Hugepages (hugetlb and THP) are not implemented. fmt.Fprintf(b, "AnonHugePages: %8d kB\n", 0) fmt.Fprintf(b, "Shared_Hugetlb: %8d kB\n", 0) fmt.Fprintf(b, "Private_Hugetlb: %7d kB\n", 0) // Swap is not implemented. 
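//
// Editor's note: illustrative sketch, not part of the original source. It
// restates the resident-size accounting performed above: each pma overlapping
// the vma contributes the length of the overlap to Rss, and additionally to
// Anonymous if the pma is private. Plain uint64 spans stand in for the
// sentry's hostarch range types, and Go 1.21's min/max builtins are assumed.
//
//	type span struct{ start, end uint64 } // [start, end)
//
//	func overlapLen(a, b span) uint64 {
//		lo, hi := max(a.start, b.start), min(a.end, b.end)
//		if hi <= lo {
//			return 0
//		}
//		return hi - lo
//	}
//
//	// smapsCounts returns (rss, anon) in bytes for a vma covering vr, given
//	// its backing pmas and whether each pma is private.
//	func smapsCounts(vr span, pmas []span, private []bool) (rss, anon uint64) {
//		for i, p := range pmas {
//			n := overlapLen(p, vr)
//			rss += n
//			if private[i] {
//				anon += n
//			}
//		}
//		return rss, anon
//	}
//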
fmt.Fprintf(b, "Swap: %8d kB\n", 0) fmt.Fprintf(b, "SwapPss: %8d kB\n", 0) fmt.Fprintf(b, "KernelPageSize: %8d kB\n", hostarch.PageSize/1024) fmt.Fprintf(b, "MMUPageSize: %8d kB\n", hostarch.PageSize/1024) locked := rss if vma.mlockMode == memmap.MLockNone { locked = 0 } fmt.Fprintf(b, "Locked: %8d kB\n", locked/1024) b.WriteString("VmFlags: ") if vma.realPerms.Read { b.WriteString("rd ") } if vma.realPerms.Write { b.WriteString("wr ") } if vma.realPerms.Execute { b.WriteString("ex ") } if vma.canWriteMappableLocked() { // VM_SHARED b.WriteString("sh ") } if vma.maxPerms.Read { b.WriteString("mr ") } if vma.maxPerms.Write { b.WriteString("mw ") } if vma.maxPerms.Execute { b.WriteString("me ") } if !vma.private { // VM_MAYSHARE b.WriteString("ms ") } if vma.growsDown { b.WriteString("gd ") } if vma.mlockMode != memmap.MLockNone { // VM_LOCKED b.WriteString("lo ") } if vma.mlockMode == memmap.MLockLazy { // VM_LOCKONFAULT b.WriteString("?? ") // no explicit encoding in fs/proc/task_mmu.c:show_smap_vma_flags() } if vma.private && vma.effectivePerms.Write { // VM_ACCOUNT b.WriteString("ac ") } b.WriteString("\n") } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/save_restore.go000066400000000000000000000075761465435605700244300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( goContext "context" "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/pgalloc" ) // InvalidateUnsavable invokes memmap.Mappable.InvalidateUnsavable on all // Mappables mapped by mm. func (mm *MemoryManager) InvalidateUnsavable(ctx context.Context) error { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { if vma := vseg.ValuePtr(); vma.mappable != nil { if err := vma.mappable.InvalidateUnsavable(ctx); err != nil { return err } } } return nil } // afterLoad is invoked by stateify. func (mm *MemoryManager) afterLoad(ctx goContext.Context) { mm.mf = pgalloc.MemoryFileFromContext(ctx) mm.haveASIO = mm.p.SupportsAddressSpaceIO() } // afterLoad is invoked by stateify. 
func (m *SpecialMappable) afterLoad(ctx goContext.Context) { m.mf = pgalloc.MemoryFileFromContext(ctx) } const ( vmaRealPermsRead = 1 << iota vmaRealPermsWrite vmaRealPermsExecute vmaEffectivePermsRead vmaEffectivePermsWrite vmaEffectivePermsExecute vmaMaxPermsRead vmaMaxPermsWrite vmaMaxPermsExecute vmaPrivate vmaGrowsDown vmaIsStack ) func (v *vma) saveRealPerms() int { var b int if v.realPerms.Read { b |= vmaRealPermsRead } if v.realPerms.Write { b |= vmaRealPermsWrite } if v.realPerms.Execute { b |= vmaRealPermsExecute } if v.effectivePerms.Read { b |= vmaEffectivePermsRead } if v.effectivePerms.Write { b |= vmaEffectivePermsWrite } if v.effectivePerms.Execute { b |= vmaEffectivePermsExecute } if v.maxPerms.Read { b |= vmaMaxPermsRead } if v.maxPerms.Write { b |= vmaMaxPermsWrite } if v.maxPerms.Execute { b |= vmaMaxPermsExecute } if v.private { b |= vmaPrivate } if v.growsDown { b |= vmaGrowsDown } if v.isStack { b |= vmaIsStack } return b } func (v *vma) loadRealPerms(_ goContext.Context, b int) { if b&vmaRealPermsRead > 0 { v.realPerms.Read = true } if b&vmaRealPermsWrite > 0 { v.realPerms.Write = true } if b&vmaRealPermsExecute > 0 { v.realPerms.Execute = true } if b&vmaEffectivePermsRead > 0 { v.effectivePerms.Read = true } if b&vmaEffectivePermsWrite > 0 { v.effectivePerms.Write = true } if b&vmaEffectivePermsExecute > 0 { v.effectivePerms.Execute = true } if b&vmaMaxPermsRead > 0 { v.maxPerms.Read = true } if b&vmaMaxPermsWrite > 0 { v.maxPerms.Write = true } if b&vmaMaxPermsExecute > 0 { v.maxPerms.Execute = true } if b&vmaPrivate > 0 { v.private = true } if b&vmaGrowsDown > 0 { v.growsDown = true } if b&vmaIsStack > 0 { v.isStack = true } } func (p *pma) saveFile() string { mf, ok := p.file.(*pgalloc.MemoryFile) if !ok { // InvalidateUnsavable should have caused all such pmas to be // invalidated. panic(fmt.Sprintf("Can't save pma with non-MemoryFile of type %T", p.file)) } if !mf.IsSavable() { panic(fmt.Sprintf("Can't save pma because its MemoryFile is not savable: %v", mf)) } return mf.RestoreID() } func (p *pma) loadFile(ctx goContext.Context, restoreID string) { if restoreID == "" { p.file = pgalloc.MemoryFileFromContext(ctx) return } mfmap := pgalloc.MemoryFileMapFromContext(ctx) mf, ok := mfmap[restoreID] if !ok { panic(fmt.Sprintf("can't restore pma because its MemoryFile's restore ID %q was not found in CtxMemoryFileMap", restoreID)) } p.file = mf } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/shm.go000066400000000000000000000044541465435605700225060ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" "gvisor.dev/gvisor/pkg/sentry/memmap" ) // DetachShm unmaps a sysv shared memory segment. func (mm *MemoryManager) DetachShm(ctx context.Context, addr hostarch.Addr) error { if addr != addr.RoundDown() { // "... 
shmaddr is not aligned on a page boundary." - man shmdt(2) return linuxerr.EINVAL } var detached *shm.Shm var vgap vmaGapIterator var droppedIDs []memmap.MappingIdentity // This must run after mm.mappingMu.Unlock(). defer func() { for _, id := range droppedIDs { id.DecRef(ctx) } }() mm.mappingMu.Lock() defer mm.mappingMu.Unlock() // Find and remove the first vma containing an address >= addr that maps a // segment originally attached at addr. vseg := mm.vmas.LowerBoundSegment(addr) for vseg.Ok() { vma := vseg.ValuePtr() if shm, ok := vma.mappable.(*shm.Shm); ok && vseg.Start() >= addr && uint64(vseg.Start()-addr) == vma.off { detached = shm vgap, droppedIDs = mm.unmapLocked(ctx, vseg.Range(), droppedIDs) vseg = vgap.NextSegment() break } else { vseg = vseg.NextSegment() } } if detached == nil { // There is no shared memory segment attached at addr. return linuxerr.EINVAL } // Remove all vmas that could have been created by the same attach. end := addr + hostarch.Addr(detached.EffectiveSize()) for vseg.Ok() && vseg.End() <= end { vma := vseg.ValuePtr() if vma.mappable == detached && uint64(vseg.Start()-addr) == vma.off { vgap, droppedIDs = mm.unmapLocked(ctx, vseg.Range(), droppedIDs) vseg = vgap.NextSegment() } else { vseg = vseg.NextSegment() } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/special_mappable.go000066400000000000000000000100711465435605700251700ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" ) // SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with // semantics similar to Linux's mm/mmap.c:_install_special_mapping(), except // that SpecialMappable takes ownership of the memory that it represents // (_install_special_mapping() does not.) // // +stateify savable type SpecialMappable struct { SpecialMappableRefs mf *pgalloc.MemoryFile `state:"nosave"` fr memmap.FileRange name string } // NewSpecialMappable returns a SpecialMappable that owns fr, which represents // offsets in mfp.MemoryFile() that contain the SpecialMappable's data. The // SpecialMappable will use the given name in /proc/[pid]/maps. // // Preconditions: fr.Length() != 0. func NewSpecialMappable(name string, mf *pgalloc.MemoryFile, fr memmap.FileRange) *SpecialMappable { m := SpecialMappable{mf: mf, fr: fr, name: name} m.InitRefs() return &m } // DecRef implements refs.RefCounter.DecRef. func (m *SpecialMappable) DecRef(ctx context.Context) { m.SpecialMappableRefs.DecRef(func() { m.mf.DecRef(m.fr) }) } // MappedName implements memmap.MappingIdentity.MappedName. func (m *SpecialMappable) MappedName(ctx context.Context) string { return m.name } // DeviceID implements memmap.MappingIdentity.DeviceID. func (m *SpecialMappable) DeviceID() uint64 { return 0 } // InodeID implements memmap.MappingIdentity.InodeID. 
func (m *SpecialMappable) InodeID() uint64 { return 0 } // Msync implements memmap.MappingIdentity.Msync. func (m *SpecialMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { // Linux: vm_file is NULL, causing msync to skip it entirely. return nil } // AddMapping implements memmap.Mappable.AddMapping. func (*SpecialMappable) AddMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, uint64, bool) error { return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (*SpecialMappable) RemoveMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, uint64, bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. func (*SpecialMappable) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, hostarch.AddrRange, uint64, bool) error { return nil } // Translate implements memmap.Mappable.Translate. func (m *SpecialMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > m.fr.Length() { err = &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ { Source: source, File: m.mf, Offset: m.fr.Start + source.Start, Perms: hostarch.AnyAccess, }, }, err } return nil, err } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (m *SpecialMappable) InvalidateUnsavable(ctx context.Context) error { // Since data is stored in pgalloc.MemoryFile, the contents of which are // preserved across save/restore, we don't need to do anything. return nil } // FileRange returns the offsets into m.mf that stores the SpecialMappable's // contents. func (m *SpecialMappable) FileRange() memmap.FileRange { return m.fr } // Length returns the length of the SpecialMappable. func (m *SpecialMappable) Length() uint64 { return m.fr.Length() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/special_mappable_refs.go000066400000000000000000000103471465435605700262150ustar00rootroot00000000000000package mm import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const SpecialMappableenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var SpecialMappableobj *SpecialMappable // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. 
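//
// Editor's note: illustrative sketch, not part of the original source. It
// demonstrates the speculative-reference technique described above using only
// the standard library: the upper 32 bits of a single int64 hold speculative
// references and the lower 32 bits hold real references, so TryIncRef avoids
// a compare-and-swap loop. The type name is hypothetical.
//
//	type speculativeRefs struct {
//		count atomic.Int64 // [32-bit speculative]:[32-bit real]
//	}
//
//	func (r *speculativeRefs) TryIncRef() bool {
//		const speculativeRef = 1 << 32
//		if v := r.count.Add(speculativeRef); int32(v) == 0 {
//			// No real references remain; back out the speculative one.
//			r.count.Add(-speculativeRef)
//			return false
//		}
//		// Convert the speculative reference into a real one.
//		r.count.Add(-speculativeRef + 1)
//		return true
//	}
//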
// // +stateify savable type SpecialMappableRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *SpecialMappableRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *SpecialMappableRefs) RefType() string { return fmt.Sprintf("%T", SpecialMappableobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *SpecialMappableRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *SpecialMappableRefs) LogRefs() bool { return SpecialMappableenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *SpecialMappableRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *SpecialMappableRefs) IncRef() { v := r.refCount.Add(1) if SpecialMappableenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *SpecialMappableRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if SpecialMappableenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *SpecialMappableRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if SpecialMappableenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *SpecialMappableRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/syscalls.go000066400000000000000000001222611465435605700235510ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"
	mrand "math/rand"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
)

// HandleUserFault handles an application page fault. sp is the faulting
// application thread's stack pointer.
//
// Preconditions: mm.as != nil.
func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr hostarch.Addr, at hostarch.AccessType, sp hostarch.Addr) error {
	ar, ok := addr.RoundDown().ToRange(hostarch.PageSize)
	if !ok {
		return linuxerr.EFAULT
	}

	// Don't bother trying existingPMAsLocked; in most cases, if we did have
	// existing pmas, we wouldn't have faulted.

	// Ensure that we have a usable vma. Here and below, since we are only
	// asking for a single page, there is no possibility of partial success,
	// and any error is immediately fatal.
	mm.mappingMu.RLock()
	vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false)
	if err != nil {
		mm.mappingMu.RUnlock()
		return err
	}

	// Ensure that we have a usable pma.
	mm.activeMu.Lock()
	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, at, true /* callerIndirectCommit */)
	mm.mappingMu.RUnlock()
	if err != nil {
		mm.activeMu.Unlock()
		return err
	}

	// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
	// anymore.
	mm.activeMu.DowngradeLock()

	// Map the faulted page into the active AddressSpace.
	err = mm.mapASLocked(pseg, ar, memmap.PlatformEffectDefault)
	mm.activeMu.RUnlock()
	return err
}

// MMap establishes a memory mapping.
func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error) {
	if opts.Length == 0 {
		return 0, linuxerr.EINVAL
	}
	length, ok := hostarch.Addr(opts.Length).RoundUp()
	if !ok {
		return 0, linuxerr.ENOMEM
	}
	opts.Length = uint64(length)

	if opts.Mappable != nil {
		// Offset must be aligned.
		if hostarch.Addr(opts.Offset).RoundDown() != hostarch.Addr(opts.Offset) {
			return 0, linuxerr.EINVAL
		}
		// Offset + length must not overflow.
		if end := opts.Offset + opts.Length; end < opts.Offset {
			return 0, linuxerr.EOVERFLOW
		}
	} else {
		opts.Offset = 0
	}

	if opts.Addr.RoundDown() != opts.Addr {
		// MAP_FIXED requires addr to be page-aligned; non-fixed mappings
		// don't.
		if opts.Fixed {
			return 0, linuxerr.EINVAL
		}
		opts.Addr = opts.Addr.RoundDown()
	}

	if !opts.MaxPerms.SupersetOf(opts.Perms) {
		return 0, linuxerr.EACCES
	}
	if opts.Unmap && !opts.Fixed {
		return 0, linuxerr.EINVAL
	}
	if opts.GrowsDown && opts.Mappable != nil {
		return 0, linuxerr.EINVAL
	}

	// Get the new vma.
var droppedIDs []memmap.MappingIdentity mm.mappingMu.Lock() if opts.MLockMode < mm.defMLockMode { opts.MLockMode = mm.defMLockMode } vseg, ar, droppedIDs, err := mm.createVMALocked(ctx, opts, droppedIDs) if err != nil { mm.mappingMu.Unlock() return 0, err } // TODO(jamieliu): In Linux, VM_LOCKONFAULT (which may be set on the new // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => // populate_vma_page_range(). Confirm this behavior. switch { case opts.PlatformEffect >= memmap.PlatformEffectPopulate || opts.MLockMode == memmap.MLockEager: // Get pmas and map as requested. mm.populateVMAAndUnlock(ctx, vseg, ar, opts.PlatformEffect) case opts.Mappable == nil && length <= hostarch.HugePageSize: // NOTE(b/63077076, b/63360184): Get pmas and map eagerly in the hope // that doing so will save on future page faults. We only do this for // anonymous mappings, since otherwise the cost of // memmap.Mappable.Translate is unknown; and only for small mappings, // to avoid needing to allocate large amounts of memory that we may // subsequently need to checkpoint. mm.populateVMAAndUnlock(ctx, vseg, ar, memmap.PlatformEffectDefault) default: mm.mappingMu.Unlock() } for _, id := range droppedIDs { id.DecRef(ctx) } return ar.Start, nil } // populateVMA obtains pmas for addresses in ar in the given vma, and maps them // into mm.as if it is active. // // Preconditions: // - mm.mappingMu must be locked. // - vseg.Range().IsSupersetOf(ar). func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, platformEffect memmap.MMapPlatformEffect) { if !vseg.ValuePtr().effectivePerms.Any() { // Linux doesn't populate inaccessible pages. See // mm/gup.c:populate_vma_page_range. return } mm.activeMu.Lock() // Can't defer mm.activeMu.Unlock(); see below. // Even if we get new pmas, we can't actually map them if we don't have an // AddressSpace. if mm.as == nil { mm.activeMu.Unlock() return } // Ensure that we have usable pmas. pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, hostarch.NoAccess, platformEffect == memmap.PlatformEffectCommit) if err != nil { // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from // mm/gup.c:mm_populate(). If it matters, we'll get it again when // userspace actually tries to use the failing page. mm.activeMu.Unlock() return } // Downgrade to a read-lock on activeMu since we don't need to mutate pmas // anymore. mm.activeMu.DowngradeLock() // As above, errors are silently ignored. mm.mapASLocked(pseg, ar, platformEffect) mm.activeMu.RUnlock() } // populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally // unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is // preferable to populateVMA since it unlocks mm.mappingMu before performing // expensive operations that don't require it to be locked. // // Preconditions: // - mm.mappingMu must be locked for writing. // - vseg.Range().IsSupersetOf(ar). // // Postconditions: mm.mappingMu will be unlocked. // +checklocksrelease:mm.mappingMu func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, platformEffect memmap.MMapPlatformEffect) { // See populateVMA above for commentary. 
if !vseg.ValuePtr().effectivePerms.Any() { mm.mappingMu.Unlock() return } mm.activeMu.Lock() if mm.as == nil { mm.activeMu.Unlock() mm.mappingMu.Unlock() return } // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it // isn't needed at all for mapASLocked. mm.mappingMu.DowngradeLock() pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, hostarch.NoAccess, platformEffect == memmap.PlatformEffectCommit) mm.mappingMu.RUnlock() if err != nil { mm.activeMu.Unlock() return } mm.activeMu.DowngradeLock() mm.mapASLocked(pseg, ar, platformEffect) mm.activeMu.RUnlock() } // MapStack allocates the initial process stack. func (mm *MemoryManager) MapStack(ctx context.Context) (hostarch.AddrRange, error) { // maxStackSize is the maximum supported process stack size in bytes. // // This limit exists because stack growing isn't implemented, so the entire // process stack must be mapped up-front. const maxStackSize = 128 << 20 stackSize := limits.FromContext(ctx).Get(limits.Stack) r, ok := hostarch.Addr(stackSize.Cur).RoundUp() sz := uint64(r) if !ok { // RLIM_INFINITY rounds up to 0. sz = linux.DefaultStackSoftLimit } else if sz > maxStackSize { ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize) sz = maxStackSize } else if sz == 0 { return hostarch.AddrRange{}, linuxerr.ENOMEM } szaddr := hostarch.Addr(sz) ctx.Debugf("Allocating stack with size of %v bytes", sz) // Determine the stack's desired location. Unlike Linux, address // randomization can't be disabled. stackEnd := mm.layout.MaxAddr - hostarch.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown() if stackEnd < szaddr { return hostarch.AddrRange{}, linuxerr.ENOMEM } stackStart := stackEnd - szaddr var droppedIDs []memmap.MappingIdentity var ar hostarch.AddrRange var err error mm.mappingMu.Lock() _, ar, droppedIDs, err = mm.createVMALocked(ctx, memmap.MMapOpts{ Length: sz, Addr: stackStart, Perms: hostarch.ReadWrite, MaxPerms: hostarch.AnyAccess, Private: true, GrowsDown: true, MLockMode: mm.defMLockMode, Hint: "[stack]", }, droppedIDs) mm.mappingMu.Unlock() for _, id := range droppedIDs { id.DecRef(ctx) } return ar, err } // MUnmap implements the semantics of Linux's munmap(2). func (mm *MemoryManager) MUnmap(ctx context.Context, addr hostarch.Addr, length uint64) error { if addr != addr.RoundDown() { return linuxerr.EINVAL } if length == 0 { return linuxerr.EINVAL } la, ok := hostarch.Addr(length).RoundUp() if !ok { return linuxerr.EINVAL } ar, ok := addr.ToRange(uint64(la)) if !ok { return linuxerr.EINVAL } var droppedIDs []memmap.MappingIdentity mm.mappingMu.Lock() _, droppedIDs = mm.unmapLocked(ctx, ar, droppedIDs) mm.mappingMu.Unlock() for _, id := range droppedIDs { id.DecRef(ctx) } return nil } // MRemapOpts specifies options to MRemap. type MRemapOpts struct { // Move controls whether MRemap moves the remapped mapping to a new address. Move MRemapMoveMode // NewAddr is the new address for the remapping. NewAddr is ignored unless // Move is MMRemapMustMove. NewAddr hostarch.Addr } // MRemapMoveMode controls MRemap's moving behavior. type MRemapMoveMode int const ( // MRemapNoMove prevents MRemap from moving the remapped mapping. MRemapNoMove MRemapMoveMode = iota // MRemapMayMove allows MRemap to move the remapped mapping. MRemapMayMove // MRemapMustMove requires MRemap to move the remapped mapping to // MRemapOpts.NewAddr, replacing any existing mappings in the remapped // range. MRemapMustMove ) // MRemap implements the semantics of Linux's mremap(2). 
func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (hostarch.Addr, error) { // "Note that old_address has to be page aligned." - mremap(2) if oldAddr.RoundDown() != oldAddr { return 0, linuxerr.EINVAL } // Linux treats an old_size that rounds up to 0 as 0, which is otherwise a // valid size. However, new_size can't be 0 after rounding. oldSizeAddr, _ := hostarch.Addr(oldSize).RoundUp() oldSize = uint64(oldSizeAddr) newSizeAddr, ok := hostarch.Addr(newSize).RoundUp() if !ok || newSizeAddr == 0 { return 0, linuxerr.EINVAL } newSize = uint64(newSizeAddr) oldEnd, ok := oldAddr.AddLength(oldSize) if !ok { return 0, linuxerr.EINVAL } var droppedIDs []memmap.MappingIdentity // This must run after mm.mappingMu.Unlock(). defer func() { for _, id := range droppedIDs { id.DecRef(ctx) } }() mm.mappingMu.Lock() defer mm.mappingMu.Unlock() // All cases require that a vma exists at oldAddr. vseg := mm.vmas.FindSegment(oldAddr) if !vseg.Ok() { return 0, linuxerr.EFAULT } // Behavior matrix: // // Move | oldSize = 0 | oldSize < newSize | oldSize = newSize | oldSize > newSize // ---------+-------------+-------------------+-------------------+------------------ // NoMove | ENOMEM [1] | Grow in-place | No-op | Shrink in-place // MayMove | Copy [1] | Grow in-place or | No-op | Shrink in-place // | | move | | // MustMove | Copy | Move and grow | Move | Shrink and move // // [1] In-place growth is impossible because the vma at oldAddr already // occupies at least part of the destination. Thus the NoMove case always // fails and the MayMove case always falls back to copying. if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone { // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall, // mremap in Linux does not check mm/mlock.c:can_do_mlock() and // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and // !CAP_IPC_LOCK. mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { return 0, linuxerr.EAGAIN } } } if opts.Move != MRemapMustMove { // Handle no-ops and in-place shrinking. These cases don't care if // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all // (aside from oldAddr). if newSize <= oldSize { if newSize < oldSize { // If oldAddr+oldSize didn't overflow, oldAddr+newSize can't // either. newEnd := oldAddr + hostarch.Addr(newSize) _, droppedIDs = mm.unmapLocked(ctx, hostarch.AddrRange{newEnd, oldEnd}, droppedIDs) } return oldAddr, nil } // Handle in-place growing. // Check that oldEnd maps to the same vma as oldAddr. if vseg.End() < oldEnd { return 0, linuxerr.EFAULT } // "Grow" the existing vma by creating a new mergeable one. 
vma := vseg.ValuePtr() var newOffset uint64 if vma.mappable != nil { newOffset = vseg.mappableRange().End } var vseg vmaIterator var ar hostarch.AddrRange var err error vseg, ar, droppedIDs, err = mm.createVMALocked(ctx, memmap.MMapOpts{ Length: newSize - oldSize, MappingIdentity: vma.id, Mappable: vma.mappable, Offset: newOffset, Addr: oldEnd, Fixed: true, Perms: vma.realPerms, MaxPerms: vma.maxPerms, Private: vma.private, GrowsDown: vma.growsDown, Stack: vma.isStack, MLockMode: vma.mlockMode, Hint: vma.hint, }, droppedIDs) if err == nil { if vma.mlockMode == memmap.MLockEager { mm.populateVMA(ctx, vseg, ar, memmap.PlatformEffectCommit) } return oldAddr, nil } // In-place growth failed. In the MRemapMayMove case, fall through to // copying/moving below. if opts.Move == MRemapNoMove { return 0, err } } // Find a location for the new mapping. var newAR hostarch.AddrRange switch opts.Move { case MRemapMayMove: newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{}) if err != nil { return 0, err } newAR, _ = newAddr.ToRange(newSize) case MRemapMustMove: newAddr := opts.NewAddr if newAddr.RoundDown() != newAddr { return 0, linuxerr.EINVAL } var ok bool newAR, ok = newAddr.ToRange(newSize) if !ok { return 0, linuxerr.EINVAL } if (hostarch.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) { return 0, linuxerr.EINVAL } // Check that the new region is valid. _, err := mm.findAvailableLocked(newSize, findAvailableOpts{ Addr: newAddr, Fixed: true, Unmap: true, }) if err != nil { return 0, err } // Unmap any mappings at the destination. _, droppedIDs = mm.unmapLocked(ctx, newAR, droppedIDs) // If the sizes specify shrinking, unmap everything between the new and // old sizes at the source. Unmapping before the following checks is // correct: compare Linux's mm/mremap.c:mremap_to() => do_munmap(), // vma_to_resize(). if newSize < oldSize { oldNewEnd := oldAddr + hostarch.Addr(newSize) _, droppedIDs = mm.unmapLocked(ctx, hostarch.AddrRange{oldNewEnd, oldEnd}, droppedIDs) oldEnd = oldNewEnd } // unmapLocked may have invalidated vseg; look it up again. vseg = mm.vmas.FindSegment(oldAddr) } oldAR := hostarch.AddrRange{oldAddr, oldEnd} // Check that oldEnd maps to the same vma as oldAddr. if vseg.End() < oldEnd { return 0, linuxerr.EFAULT } // Check against RLIMIT_AS. newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { return 0, linuxerr.ENOMEM } if vma := vseg.ValuePtr(); vma.mappable != nil { // Check that offset+length does not overflow. if vma.off+uint64(newAR.Length()) < vma.off { return 0, linuxerr.EINVAL } // Inform the Mappable, if any, of the new mapping. if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil { return 0, err } } if oldSize == 0 { // Handle copying. // // We can't use createVMALocked because it calls Mappable.AddMapping, // whereas we've already called Mappable.CopyMapping (which is // consistent with Linux). 
vma := vseg.ValuePtr().copy() if vma.mappable != nil { vma.off = vseg.mappableOffsetAt(oldAR.Start) } if vma.id != nil { vma.id.IncRef() } vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS += uint64(newAR.Length()) if vma.isPrivateDataLocked() { mm.dataAS += uint64(newAR.Length()) } if vma.mlockMode != memmap.MLockNone { mm.lockedAS += uint64(newAR.Length()) if vma.mlockMode == memmap.MLockEager { mm.populateVMA(ctx, vseg, newAR, memmap.PlatformEffectCommit) } } return newAR.Start, nil } // Handle moving. // // Remove the existing vma before inserting the new one to minimize // iterator invalidation. We do this directly (instead of calling // removeVMAsLocked) because: // // 1. We can't drop the reference on vma.id, which will be transferred to // the new vma. // // 2. We can't call vma.mappable.RemoveMapping, because pmas are still at // oldAR, so calling RemoveMapping could cause us to miss an invalidation // overlapping oldAR. vseg = mm.vmas.Isolate(vseg, oldAR) vma := vseg.ValuePtr().copy() mm.vmas.Remove(vseg) vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) if vma.isPrivateDataLocked() { mm.dataAS = mm.dataAS - uint64(oldAR.Length()) + uint64(newAR.Length()) } if vma.mlockMode != memmap.MLockNone { mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length()) } // Move pmas. This is technically optional for non-private pmas, which // could just go through memmap.Mappable.Translate again, but it's required // for private pmas. mm.activeMu.Lock() mm.movePMAsLocked(oldAR, newAR) mm.activeMu.Unlock() // Now that pmas have been moved to newAR, we can notify vma.mappable that // oldAR is no longer mapped. if vma.mappable != nil { vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.canWriteMappableLocked()) } if vma.mlockMode == memmap.MLockEager { mm.populateVMA(ctx, vseg, newAR, memmap.PlatformEffectCommit) } return newAR.Start, nil } // MProtect implements the semantics of Linux's mprotect(2). func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms hostarch.AccessType, growsDown bool) error { if addr.RoundDown() != addr { return linuxerr.EINVAL } if length == 0 { return nil } rlength, ok := hostarch.Addr(length).RoundUp() if !ok { return linuxerr.ENOMEM } ar, ok := addr.ToRange(uint64(rlength)) if !ok { return linuxerr.ENOMEM } effectivePerms := realPerms.Effective() mm.mappingMu.Lock() defer mm.mappingMu.Unlock() // Non-growsDown mprotect requires that all of ar is mapped, and stops at // the first non-empty gap. growsDown mprotect requires that the first vma // be growsDown, but does not require it to extend all the way to ar.Start; // vmas after the first must be contiguous but need not be growsDown, like // the non-growsDown case. vseg := mm.vmas.LowerBoundSegment(ar.Start) if !vseg.Ok() { return linuxerr.ENOMEM } if growsDown { if !vseg.ValuePtr().growsDown { return linuxerr.EINVAL } if ar.End <= vseg.Start() { return linuxerr.ENOMEM } ar.Start = vseg.Start() } else { if ar.Start < vseg.Start() { return linuxerr.ENOMEM } } mm.activeMu.Lock() defer mm.activeMu.Unlock() defer func() { mm.vmas.MergeInsideRange(ar) mm.vmas.MergeOutsideRange(ar) mm.pmas.MergeInsideRange(ar) mm.pmas.MergeOutsideRange(ar) }() pseg := mm.pmas.LowerBoundSegment(ar.Start) var didUnmapAS bool for { // Check for permission validity before splitting vmas, for consistency // with Linux. 
if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) { return linuxerr.EACCES } vseg = mm.vmas.Isolate(vseg, ar) // Update vma permissions. vma := vseg.ValuePtr() vmaLength := vseg.Range().Length() if vma.isPrivateDataLocked() { mm.dataAS -= uint64(vmaLength) } vma.realPerms = realPerms vma.effectivePerms = effectivePerms if vma.isPrivateDataLocked() { mm.dataAS += uint64(vmaLength) } // Propagate vma permission changes to pmas. for pseg.Ok() && pseg.Start() < vseg.End() { if pseg.Range().Overlaps(vseg.Range()) { pseg = mm.pmas.Isolate(pseg, vseg.Range()) pma := pseg.ValuePtr() if !effectivePerms.SupersetOf(pma.effectivePerms) && !didUnmapAS { // Unmap all of ar, not just vseg.Range(), to minimize host // syscalls. mm.unmapASLocked(ar) didUnmapAS = true } pma.effectivePerms = effectivePerms.Intersect(pma.translatePerms) if pma.needCOW { pma.effectivePerms.Write = false } } pseg = pseg.NextSegment() } // Continue to the next vma. if ar.End <= vseg.End() { return nil } vseg, _ = vseg.NextNonEmpty() if !vseg.Ok() { return linuxerr.ENOMEM } } } // BrkSetup sets mm's brk address to addr and its brk size to 0. func (mm *MemoryManager) BrkSetup(ctx context.Context, addr hostarch.Addr) { var droppedIDs []memmap.MappingIdentity mm.mappingMu.Lock() // Unmap the existing brk. if mm.brk.Length() != 0 { _, droppedIDs = mm.unmapLocked(ctx, mm.brk, droppedIDs) } mm.brk = hostarch.AddrRange{addr, addr} mm.mappingMu.Unlock() for _, id := range droppedIDs { id.DecRef(ctx) } } // Brk implements the semantics of Linux's brk(2), except that it returns an // error on failure. func (mm *MemoryManager) Brk(ctx context.Context, addr hostarch.Addr) (hostarch.Addr, error) { mm.mappingMu.Lock() // Can't defer mm.mappingMu.Unlock(); see below. if addr < mm.brk.Start { addr = mm.brk.End mm.mappingMu.Unlock() return addr, linuxerr.EINVAL } // TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is // slightly more permissive than the usual data limit. In particular, // this only limits the size of the heap; a true RLIMIT_DATA limits the // size of heap + data + bss. The segment sizes need to be plumbed from // the loader package to fully enforce RLIMIT_DATA. if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { addr = mm.brk.End mm.mappingMu.Unlock() return addr, linuxerr.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() newbrkpg, ok := addr.RoundUp() if !ok { addr = mm.brk.End mm.mappingMu.Unlock() return addr, linuxerr.EFAULT } var vseg vmaIterator var ar hostarch.AddrRange var err error var droppedIDs []memmap.MappingIdentity // This must run after mm.mappingMu.Unlock(). defer func() { for _, id := range droppedIDs { id.DecRef(ctx) } }() switch { case oldbrkpg < newbrkpg: vseg, ar, droppedIDs, err = mm.createVMALocked(ctx, memmap.MMapOpts{ Length: uint64(newbrkpg - oldbrkpg), Addr: oldbrkpg, Fixed: true, // Compare Linux's // arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS. Perms: hostarch.ReadWrite, MaxPerms: hostarch.AnyAccess, Private: true, // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes // mm->def_flags. 
MLockMode: mm.defMLockMode, Hint: "[heap]", }, droppedIDs) if err != nil { addr = mm.brk.End mm.mappingMu.Unlock() return addr, err } mm.brk.End = addr if mm.defMLockMode == memmap.MLockEager { mm.populateVMAAndUnlock(ctx, vseg, ar, memmap.PlatformEffectCommit) } else { mm.mappingMu.Unlock() } case newbrkpg < oldbrkpg: _, droppedIDs = mm.unmapLocked(ctx, hostarch.AddrRange{newbrkpg, oldbrkpg}, droppedIDs) fallthrough default: mm.brk.End = addr mm.mappingMu.Unlock() } return addr, nil } // MLock implements the semantics of Linux's mlock()/mlock2()/munlock(), // depending on mode. func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length uint64, mode memmap.MLockMode) error { // Linux allows this to overflow. la, _ := hostarch.Addr(length + addr.PageOffset()).RoundUp() ar, ok := addr.RoundDown().ToRange(uint64(la)) if !ok { return linuxerr.EINVAL } mm.mappingMu.Lock() // Can't defer mm.mappingMu.Unlock(); see below. if mode != memmap.MLockNone { // Check against RLIMIT_MEMLOCK. if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { mm.mappingMu.Unlock() return linuxerr.EPERM } if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { mm.mappingMu.Unlock() return linuxerr.ENOMEM } } } // Check this after RLIMIT_MEMLOCK for consistency with Linux. if ar.Length() == 0 { mm.mappingMu.Unlock() return nil } // Apply the new mlock mode to vmas. var unmapped bool vseg := mm.vmas.FindSegment(ar.Start) for { if !vseg.Ok() { unmapped = true break } vseg = mm.vmas.Isolate(vseg, ar) vma := vseg.ValuePtr() prevMode := vma.mlockMode vma.mlockMode = mode if mode != memmap.MLockNone && prevMode == memmap.MLockNone { mm.lockedAS += uint64(vseg.Range().Length()) } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone { mm.lockedAS -= uint64(vseg.Range().Length()) } if ar.End <= vseg.End() { break } vseg, _ = vseg.NextNonEmpty() } mm.vmas.MergeInsideRange(ar) mm.vmas.MergeOutsideRange(ar) if unmapped { mm.mappingMu.Unlock() return linuxerr.ENOMEM } if mode == memmap.MLockEager { // Ensure that we have usable pmas. Since we didn't return ENOMEM // above, ar must be fully covered by vmas, so we can just use // NextSegment below. mm.activeMu.Lock() mm.mappingMu.DowngradeLock() for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { if !vseg.ValuePtr().effectivePerms.Any() { // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this // case, which is converted to ENOMEM by mlock. mm.activeMu.Unlock() mm.mappingMu.RUnlock() return linuxerr.ENOMEM } _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), hostarch.NoAccess, true /* callerIndirectCommit */) if err != nil { mm.activeMu.Unlock() mm.mappingMu.RUnlock() // Linux: mm/mlock.c:__mlock_posix_error_return() if linuxerr.Equals(linuxerr.EFAULT, err) { return linuxerr.ENOMEM } if linuxerr.Equals(linuxerr.ENOMEM, err) { return linuxerr.EAGAIN } return err } } // Map pmas into the active AddressSpace, if we have one. mm.mappingMu.RUnlock() if mm.as != nil { mm.activeMu.DowngradeLock() err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, memmap.PlatformEffectCommit) mm.activeMu.RUnlock() if err != nil { return err } } else { mm.activeMu.Unlock() } } else { mm.mappingMu.Unlock() } return nil } // MLockAllOpts holds options to MLockAll. 
type MLockAllOpts struct { // If Current is true, change the memory-locking behavior of all mappings // to Mode. If Future is true, upgrade the memory-locking behavior of all // future mappings to Mode. At least one of Current or Future must be true. Current bool Future bool Mode memmap.MLockMode } // MLockAll implements the semantics of Linux's mlockall()/munlockall(), // depending on opts. func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { if !opts.Current && !opts.Future { return linuxerr.EINVAL } mm.mappingMu.Lock() // Can't defer mm.mappingMu.Unlock(); see below. if opts.Current { if opts.Mode != memmap.MLockNone { // Check against RLIMIT_MEMLOCK. if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { mm.mappingMu.Unlock() return linuxerr.EPERM } if uint64(mm.vmas.Span()) > mlockLimit { mm.mappingMu.Unlock() return linuxerr.ENOMEM } } } for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { vma := vseg.ValuePtr() prevMode := vma.mlockMode vma.mlockMode = opts.Mode if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone { mm.lockedAS += uint64(vseg.Range().Length()) } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone { mm.lockedAS -= uint64(vseg.Range().Length()) } } } if opts.Future { mm.defMLockMode = opts.Mode } if opts.Current && opts.Mode == memmap.MLockEager { // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate() // ignores the return value of __mm_populate(), so all errors below are // ignored. // // Try to get usable pmas. mm.activeMu.Lock() mm.mappingMu.DowngradeLock() for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { if vseg.ValuePtr().effectivePerms.Any() { mm.getPMAsLocked(ctx, vseg, vseg.Range(), hostarch.NoAccess, true /* callerIndirectCommit */) } } // Map all pmas into the active AddressSpace, if we have one. mm.mappingMu.RUnlock() if mm.as != nil { mm.activeMu.DowngradeLock() mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), memmap.PlatformEffectCommit) mm.activeMu.RUnlock() } else { mm.activeMu.Unlock() } } else { mm.mappingMu.Unlock() } return nil } // NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR). func (mm *MemoryManager) NumaPolicy(addr hostarch.Addr) (linux.NumaPolicy, uint64, error) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() vseg := mm.vmas.FindSegment(addr) if !vseg.Ok() { return 0, 0, linuxerr.EFAULT } vma := vseg.ValuePtr() return vma.numaPolicy, vma.numaNodemask, nil } // SetNumaPolicy implements the semantics of Linux's mbind(). func (mm *MemoryManager) SetNumaPolicy(addr hostarch.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error { if !addr.IsPageAligned() { return linuxerr.EINVAL } // Linux allows this to overflow. la, _ := hostarch.Addr(length).RoundUp() ar, ok := addr.ToRange(uint64(la)) if !ok { return linuxerr.EINVAL } if ar.Length() == 0 { return nil } mm.mappingMu.Lock() defer mm.mappingMu.Unlock() defer func() { mm.vmas.MergeInsideRange(ar) mm.vmas.MergeOutsideRange(ar) }() vseg := mm.vmas.LowerBoundSegment(ar.Start) lastEnd := ar.Start for { if !vseg.Ok() || lastEnd < vseg.Start() { // "EFAULT: ... there was an unmapped hole in the specified memory // range specified [sic] by addr and len." 
- mbind(2) return linuxerr.EFAULT } vseg = mm.vmas.Isolate(vseg, ar) vma := vseg.ValuePtr() vma.numaPolicy = policy vma.numaNodemask = nodemask lastEnd = vseg.End() if ar.End <= lastEnd { return nil } vseg, _ = vseg.NextNonEmpty() } } // SetDontFork implements the semantics of madvise MADV_DONTFORK. func (mm *MemoryManager) SetDontFork(addr hostarch.Addr, length uint64, dontfork bool) error { ar, ok := addr.ToRange(length) if !ok { return linuxerr.EINVAL } mm.mappingMu.Lock() defer mm.mappingMu.Unlock() defer func() { mm.vmas.MergeInsideRange(ar) mm.vmas.MergeOutsideRange(ar) }() for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { vseg = mm.vmas.Isolate(vseg, ar) vma := vseg.ValuePtr() vma.dontfork = dontfork } if mm.vmas.SpanRange(ar) != ar.Length() { return linuxerr.ENOMEM } return nil } // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). func (mm *MemoryManager) Decommit(addr hostarch.Addr, length uint64) error { ar, ok := addr.ToRange(length) if !ok { return linuxerr.EINVAL } mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() mm.activeMu.Lock() defer mm.activeMu.Unlock() // This is invalidateLocked(invalidatePrivate=true, invalidateShared=true), // but: // // - We must refuse to invalidate pmas under mlocked vmas. // // - If at least one byte in ar is not covered by a vma, decommit the rest // but return ENOMEM. // // - If we would invalidate only part of a huge page that we own (is not // copy-on-write), use MemoryFile.Decommit() instead to keep the allocated // huge page intact for future use. didUnmapAS := false pseg := mm.pmas.LowerBoundSegment(ar.Start) vseg := mm.vmas.LowerBoundSegment(ar.Start) if !vseg.Ok() { return linuxerr.ENOMEM } hadvgap := ar.Start < vseg.Start() for vseg.Ok() && vseg.Start() < ar.End { vma := vseg.ValuePtr() if vma.mlockMode != memmap.MLockNone { return linuxerr.EINVAL } vsegAR := vseg.Range().Intersect(ar) // pseg should already correspond to either this vma or a later one, // since there can't be a pma without a corresponding vma. if checkInvariants { if pseg.Ok() && pseg.End() <= vsegAR.Start { panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR)) } } for pseg.Ok() && pseg.Start() < vsegAR.End { pma := pseg.ValuePtr() if pma.huge && !mm.isPMACopyOnWriteLocked(vseg, pseg) { psegAR := pseg.Range().Intersect(vsegAR) if !psegAR.IsHugePageAligned() { firstHugeStart := psegAR.Start.HugeRoundDown() firstHugeEnd := firstHugeStart + hostarch.HugePageSize lastWholeHugeEnd := psegAR.End.HugeRoundDown() if firstHugeStart != psegAR.Start { // psegAR.Start is not hugepage-aligned. if psegAR.End <= firstHugeEnd { // All of psegAR falls within a single huge page. mm.mf.Decommit(pseg.fileRangeOf(psegAR)) pseg = pseg.NextSegment() continue } if firstHugeEnd == lastWholeHugeEnd && lastWholeHugeEnd != psegAR.End { // All of psegAR falls within two huge pages, and // psegAR.End is also not hugepage-aligned. The // logic below would handle this correctly, but // would make two separate calls to // MemoryFile.Decommit() for the first and last // huge pages respectively. mm.mf.Decommit(pseg.fileRangeOf(psegAR)) pseg = pseg.NextSegment() continue } mm.mf.Decommit(pseg.fileRangeOf(hostarch.AddrRange{psegAR.Start, firstHugeEnd})) psegAR.Start = firstHugeEnd } // Drop whole huge pages between psegAR.Start (which after the above // is either firstHugeStart or firstHugeEnd) and lastWholeHugeEnd // normally. 
if psegAR.Start < lastWholeHugeEnd { pseg = mm.pmas.Isolate(pseg, hostarch.AddrRange{psegAR.Start, lastWholeHugeEnd}) pma = pseg.ValuePtr() if !didUnmapAS { // Unmap all of ar, not just pseg.Range(), to minimize host // syscalls. AddressSpace mappings must be removed before // pma.file.DecRef(). mm.unmapASLocked(ar) didUnmapAS = true } pma.file.DecRef(pseg.fileRange()) mm.removeRSSLocked(pseg.Range()) pseg = mm.pmas.Remove(pseg).NextSegment() } if lastWholeHugeEnd != psegAR.End { // psegAR.End is not hugepage-aligned. mm.mf.Decommit(pseg.fileRangeOf(hostarch.AddrRange{lastWholeHugeEnd, psegAR.End})) pseg = pseg.NextSegment() } continue } } pseg = mm.pmas.Isolate(pseg, vsegAR) pma = pseg.ValuePtr() if !didUnmapAS { // Unmap all of ar, not just pseg.Range(), to minimize host // syscalls. AddressSpace mappings must be removed before // pma.file.DecRef(). mm.unmapASLocked(ar) didUnmapAS = true } pma.file.DecRef(pseg.fileRange()) mm.removeRSSLocked(pseg.Range()) pseg = mm.pmas.Remove(pseg).NextSegment() } if ar.End <= vseg.End() { break } vgap := vseg.NextGap() if !vgap.IsEmpty() { hadvgap = true } vseg = vgap.NextSegment() } // "If there are some parts of the specified address space that are not // mapped, the Linux version of madvise() ignores them and applies the call // to the rest (but returns ENOMEM from the system call, as it should)." - // madvise(2) if hadvgap { return linuxerr.ENOMEM } return nil } // MSyncOpts holds options to MSync. type MSyncOpts struct { // Sync has the semantics of MS_SYNC. Sync bool // Invalidate has the semantics of MS_INVALIDATE. Invalidate bool } // MSync implements the semantics of Linux's msync(). func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length uint64, opts MSyncOpts) error { if addr != addr.RoundDown() { return linuxerr.EINVAL } if length == 0 { return nil } la, ok := hostarch.Addr(length).RoundUp() if !ok { return linuxerr.ENOMEM } ar, ok := addr.ToRange(uint64(la)) if !ok { return linuxerr.ENOMEM } mm.mappingMu.RLock() // Can't defer mm.mappingMu.RUnlock(); see below. vseg := mm.vmas.LowerBoundSegment(ar.Start) if !vseg.Ok() { mm.mappingMu.RUnlock() return linuxerr.ENOMEM } var unmapped bool lastEnd := ar.Start for { if !vseg.Ok() { mm.mappingMu.RUnlock() unmapped = true break } if lastEnd < vseg.Start() { unmapped = true } lastEnd = vseg.End() vma := vseg.ValuePtr() if opts.Invalidate && vma.mlockMode != memmap.MLockNone { mm.mappingMu.RUnlock() return linuxerr.EBUSY } // It's only possible to have dirtied the Mappable through a shared // mapping. Don't check if the mapping is writable, because mprotect // may have changed this, and also because Linux doesn't. if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private { // We can't call memmap.MappingIdentity.Msync while holding // mm.mappingMu since it may take fs locks that precede it in the // lock order. id.IncRef() mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar)) mm.mappingMu.RUnlock() err := id.Msync(ctx, mr) id.DecRef(ctx) if err != nil { return err } if lastEnd >= ar.End { break } mm.mappingMu.RLock() vseg = mm.vmas.LowerBoundSegment(lastEnd) } else { if lastEnd >= ar.End { mm.mappingMu.RUnlock() break } vseg = vseg.NextSegment() } } if unmapped { return linuxerr.ENOMEM } return nil } // GetSharedFutexKey is used by kernel.Task.GetSharedKey. func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr hostarch.Addr) (futex.Key, error) { ar, ok := addr.ToRange(4) // sizeof(int32). 
if !ok { return futex.Key{}, linuxerr.EFAULT } mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() vseg, _, err := mm.getVMAsLocked(ctx, ar, hostarch.Read, false) if err != nil { return futex.Key{}, err } vma := vseg.ValuePtr() if vma.private { return futex.Key{ Kind: futex.KindSharedPrivate, Offset: uint64(addr), }, nil } if vma.id != nil { vma.id.IncRef() } return futex.Key{ Kind: futex.KindSharedMappable, Mappable: vma.mappable, MappingIdentity: vma.id, Offset: vseg.mappableOffsetAt(addr), }, nil } // VirtualMemorySize returns the combined length in bytes of all mappings in // mm. func (mm *MemoryManager) VirtualMemorySize() uint64 { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() return mm.usageAS } // VirtualMemorySizeRange returns the combined length in bytes of all mappings // in ar in mm. func (mm *MemoryManager) VirtualMemorySizeRange(ar hostarch.AddrRange) uint64 { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() return uint64(mm.vmas.SpanRange(ar)) } // ResidentSetSize returns the value advertised as mm's RSS in bytes. func (mm *MemoryManager) ResidentSetSize() uint64 { mm.activeMu.RLock() defer mm.activeMu.RUnlock() return mm.curRSS } // MaxResidentSetSize returns the value advertised as mm's max RSS in bytes. func (mm *MemoryManager) MaxResidentSetSize() uint64 { mm.activeMu.RLock() defer mm.activeMu.RUnlock() return mm.maxRSS } // VirtualDataSize returns the size of private data segments in mm. func (mm *MemoryManager) VirtualDataSize() uint64 { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() return mm.dataAS } // EnableMembarrierPrivate causes future calls to IsMembarrierPrivateEnabled to // return true. func (mm *MemoryManager) EnableMembarrierPrivate() { mm.membarrierPrivateEnabled.Store(1) } // IsMembarrierPrivateEnabled returns true if mm.EnableMembarrierPrivate() has // previously been called. func (mm *MemoryManager) IsMembarrierPrivateEnabled() bool { return mm.membarrierPrivateEnabled.Load() != 0 } // EnableMembarrierRSeq causes future calls to IsMembarrierRSeqEnabled to // return true. func (mm *MemoryManager) EnableMembarrierRSeq() { mm.membarrierRSeqEnabled.Store(1) } // IsMembarrierRSeqEnabled returns true if mm.EnableMembarrierRSeq() has // previously been called. func (mm *MemoryManager) IsMembarrierRSeqEnabled() bool { return mm.membarrierRSeqEnabled.Load() != 0 } // FindVMAByName finds a vma with the specified name and returns its start address and offset. func (mm *MemoryManager) FindVMAByName(ar hostarch.AddrRange, hint string) (hostarch.Addr, uint64, error) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok(); vseg = vseg.NextSegment() { start := vseg.Start() if !ar.Contains(start) { break } vma := vseg.ValuePtr() if vma.hint == hint { return start, vma.off, nil } } return 0, 0, fmt.Errorf("could not find \"%s\" in %s", hint, ar) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/vma.go000066400000000000000000000471461465435605700225070ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package mm import ( "fmt" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" ) // Caller provides the droppedIDs slice to collect dropped mapping // identities. The caller must drop the references on these identities outside a // mm.mappingMu critical section. droppedIDs has append-like semantics, multiple // calls to functions that drop mapping identities within a scope should reuse // the same slice. // // Preconditions: // - mm.mappingMu must be locked for writing. // - opts must be valid as defined by the checks in MMap. func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts, droppedIDs []memmap.MappingIdentity) (vmaIterator, hostarch.AddrRange, []memmap.MappingIdentity, error) { if opts.MaxPerms != opts.MaxPerms.Effective() { panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms)) } // Find a usable range. addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{ Addr: opts.Addr, Fixed: opts.Fixed, GrowsDown: opts.GrowsDown, Stack: opts.Stack, Private: opts.Private, Unmap: opts.Unmap, Map32Bit: opts.Map32Bit, }) if err != nil { // Can't force without opts.Unmap and opts.Fixed. if opts.Force && opts.Unmap && opts.Fixed { addr = opts.Addr } else { return vmaIterator{}, hostarch.AddrRange{}, droppedIDs, err } } ar, _ := addr.ToRange(opts.Length) // Check against RLIMIT_AS. newUsageAS := mm.usageAS + opts.Length if opts.Unmap { newUsageAS -= uint64(mm.vmas.SpanRange(ar)) } if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { return vmaIterator{}, hostarch.AddrRange{}, droppedIDs, linuxerr.ENOMEM } if opts.MLockMode != memmap.MLockNone { // Check against RLIMIT_MEMLOCK. if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { return vmaIterator{}, hostarch.AddrRange{}, droppedIDs, linuxerr.EPERM } newLockedAS := mm.lockedAS + opts.Length if opts.Unmap { newLockedAS -= mm.mlockedBytesRangeLocked(ar) } if newLockedAS > mlockLimit { return vmaIterator{}, hostarch.AddrRange{}, droppedIDs, linuxerr.EAGAIN } } } // Remove overwritten mappings. This ordering is consistent with Linux: // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), // file->f_op->mmap(). var vgap vmaGapIterator if opts.Unmap { vgap, droppedIDs = mm.unmapLocked(ctx, ar, droppedIDs) } else { vgap = mm.vmas.FindGap(ar.Start) } // Inform the Mappable, if any, of the new mapping. if opts.Mappable != nil { // The expression for writable is vma.canWriteMappableLocked(), but we // don't yet have a vma. if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset, !opts.Private && opts.MaxPerms.Write); err != nil { return vmaIterator{}, hostarch.AddrRange{}, droppedIDs, err } } // Take a reference on opts.MappingIdentity before inserting the vma since // vma merging can drop the reference. if opts.MappingIdentity != nil { opts.MappingIdentity.IncRef() } // Finally insert the vma. 
v := vma{ mappable: opts.Mappable, off: opts.Offset, realPerms: opts.Perms, effectivePerms: opts.Perms.Effective(), maxPerms: opts.MaxPerms, private: opts.Private, growsDown: opts.GrowsDown, isStack: opts.Stack, mlockMode: opts.MLockMode, numaPolicy: linux.MPOL_DEFAULT, id: opts.MappingIdentity, hint: opts.Hint, } vseg := mm.vmas.Insert(vgap, ar, v) mm.usageAS += opts.Length if v.isPrivateDataLocked() { mm.dataAS += opts.Length } if opts.MLockMode != memmap.MLockNone { mm.lockedAS += opts.Length } return vseg, ar, droppedIDs, nil } type findAvailableOpts struct { // These fields are equivalent to those in memmap.MMapOpts, except that: // // - Addr must be page-aligned. // // - Unmap allows existing guard pages in the returned range. Addr hostarch.Addr Fixed bool GrowsDown bool Stack bool Private bool Unmap bool Map32Bit bool } // map32Start/End are the bounds to which MAP_32BIT mappings are constrained, // and are equivalent to Linux's MAP32_BASE and MAP32_MAX respectively. const ( map32Start = 0x40000000 map32End = 0x80000000 ) // findAvailableLocked finds an allocatable range. // // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (hostarch.Addr, error) { if opts.Fixed { opts.Map32Bit = false } allowedAR := mm.applicationAddrRange() if opts.Map32Bit { allowedAR = allowedAR.Intersect(hostarch.AddrRange{map32Start, map32End}) } // Does the provided suggestion work? if ar, ok := opts.Addr.ToRange(length); ok { if allowedAR.IsSupersetOf(ar) { if opts.Unmap { return ar.Start, nil } // Check for the presence of an existing vma or guard page. if vgap := mm.vmas.FindGap(ar.Start); vgap.Ok() && vgap.availableRange().IsSupersetOf(ar) { return ar.Start, nil } } } // Fixed mappings accept only the requested address. if opts.Fixed { return 0, linuxerr.ENOMEM } // Prefer hugepage alignment if a hugepage or more is requested and the vma // will actually be eligible for hugepages. alignment := uint64(hostarch.PageSize) if length >= hostarch.HugePageSize && opts.Private && !opts.GrowsDown && !opts.Stack { alignment = hostarch.HugePageSize } if opts.Map32Bit { return mm.findLowestAvailableLocked(length, alignment, allowedAR) } if mm.layout.DefaultDirection == arch.MmapBottomUp { return mm.findLowestAvailableLocked(length, alignment, hostarch.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr}) } return mm.findHighestAvailableLocked(length, alignment, hostarch.AddrRange{mm.layout.MinAddr, mm.layout.TopDownBase}) } func (mm *MemoryManager) applicationAddrRange() hostarch.AddrRange { return hostarch.AddrRange{mm.layout.MinAddr, mm.layout.MaxAddr} } // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds hostarch.AddrRange) (hostarch.Addr, error) { for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextLargeEnoughGap(hostarch.Addr(length)) { if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { // Can we shift up to match the alignment? if offset := uint64(gr.Start) % alignment; offset != 0 { if uint64(gr.Length()) >= length+alignment-offset { // Yes, we're aligned. return gr.Start + hostarch.Addr(alignment-offset), nil } } // Either aligned perfectly, or can't align it. return gr.Start, nil } } return 0, linuxerr.ENOMEM } // Preconditions: mm.mappingMu must be locked. 
func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds hostarch.AddrRange) (hostarch.Addr, error) { for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevLargeEnoughGap(hostarch.Addr(length)) { if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { // Can we shift down to match the alignment? start := gr.End - hostarch.Addr(length) if offset := uint64(start) % alignment; offset != 0 { if gr.Start <= start-hostarch.Addr(offset) { // Yes, we're aligned. return start - hostarch.Addr(offset), nil } } // Either aligned perfectly, or can't align it. return start, nil } } return 0, linuxerr.ENOMEM } // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) mlockedBytesRangeLocked(ar hostarch.AddrRange) uint64 { var total uint64 for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { if vseg.ValuePtr().mlockMode != memmap.MLockNone { total += uint64(vseg.Range().Intersect(ar).Length()) } } return total } // getVMAsLocked ensures that vmas exist for all addresses in ar, and support // access of type (at, ignorePermissions). It returns: // // - An iterator to the vma containing ar.Start. If no vma contains ar.Start, // the iterator is unspecified. // // - An iterator to the gap after the last vma containing an address in ar. If // vmas exist for no addresses in ar, the iterator is to a gap that begins // before ar.Start. // // - An error that is non-nil if vmas exist for only a subset of ar. // // Preconditions: // - mm.mappingMu must be locked for reading; it may be temporarily unlocked. // - ar.Length() != 0. func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } } // Inline mm.vmas.LowerBoundSegment so that we have the preceding gap if // !vbegin.Ok(). vbegin, vgap := mm.vmas.Find(ar.Start) if !vbegin.Ok() { vbegin = vgap.NextSegment() // vseg.Ok() is checked before entering the following loop. } else { vgap = vbegin.PrevGap() } addr := ar.Start vseg := vbegin for vseg.Ok() { // Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End(). vma := vseg.ValuePtr() if addr < vseg.Start() { // TODO(jamieliu): Implement vma.growsDown here. return vbegin, vgap, linuxerr.EFAULT } perms := vma.effectivePerms if ignorePermissions { perms = vma.maxPerms } if !perms.SupersetOf(at) { return vbegin, vgap, linuxerr.EPERM } addr = vseg.End() vgap = vseg.NextGap() if addr >= ar.End { return vbegin, vgap, nil } vseg = vgap.NextSegment() } // Ran out of vmas before ar.End. return vbegin, vgap, linuxerr.EFAULT } // getVecVMAsLocked ensures that vmas exist for all addresses in ars, and // support access to type of (at, ignorePermissions). It returns the subset of // ars for which vmas exist. If this is not equal to ars, it returns a non-nil // error explaining why. // // Preconditions: mm.mappingMu must be locked for reading; it may be // temporarily unlocked. // // Postconditions: ars is not mutated. 
func (mm *MemoryManager) getVecVMAsLocked(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool) (hostarch.AddrRangeSeq, error) { for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { ar := arsit.Head() if ar.Length() == 0 { continue } if _, vend, err := mm.getVMAsLocked(ctx, ar, at, ignorePermissions); err != nil { return truncatedAddrRangeSeq(ars, arsit, vend.Start()), err } } return ars, nil } // vma extension will not shrink the number of unmapped bytes between the start // of a growsDown vma and the end of its predecessor non-growsDown vma below // guardBytes. // // guardBytes is equivalent to Linux's stack_guard_gap after upstream // 1be7107fbe18 "mm: larger stack guard gap, between vmas". const guardBytes = 256 * hostarch.PageSize // unmapLocked unmaps all addresses in ar and returns the resulting gap in // mm.vmas. // // Caller provides the droppedIDs slice to collect dropped mapping // identities. The caller must drop the references on these identities outside a // mm.mappingMu critical section. droppedIDs has append-like semantics, multiple // calls to functions that drop mapping identities within a scope should reuse // the same slice. // // Preconditions: // - mm.mappingMu must be locked for writing. // - ar.Length() != 0. // - ar must be page-aligned. func (mm *MemoryManager) unmapLocked(ctx context.Context, ar hostarch.AddrRange, droppedIDs []memmap.MappingIdentity) (vmaGapIterator, []memmap.MappingIdentity) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } // AddressSpace mappings and pmas must be invalidated before // mm.removeVMAsLocked() => memmap.Mappable.RemoveMapping(). mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true}) return mm.removeVMAsLocked(ctx, ar, droppedIDs) } // removeVMAsLocked removes vmas for addresses in ar and returns the // resulting gap in mm.vmas. // // Caller provides the droppedIDs slice to collect dropped mapping // identities. The caller must drop the references on these identities outside a // mm.mappingMu critical section. droppedIDs has append-like semantics, multiple // calls to functions that drop mapping identities within a scope should reuse // the same slice. // // Preconditions: // - mm.mappingMu must be locked for writing. // - ar.Length() != 0. // - ar must be page-aligned. func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar hostarch.AddrRange, droppedIDs []memmap.MappingIdentity) (vmaGapIterator, []memmap.MappingIdentity) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } vseg, vgap := mm.vmas.Find(ar.Start) if vgap.Ok() { vseg = vgap.NextSegment() } for vseg.Ok() && vseg.Start() < ar.End { vseg = mm.vmas.Isolate(vseg, ar) vmaAR := vseg.Range() vma := vseg.ValuePtr() if vma.mappable != nil { vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked()) } if vma.id != nil { droppedIDs = append(droppedIDs, vma.id) } mm.usageAS -= uint64(vmaAR.Length()) if vma.isPrivateDataLocked() { mm.dataAS -= uint64(vmaAR.Length()) } if vma.mlockMode != memmap.MLockNone { mm.lockedAS -= uint64(vmaAR.Length()) } vgap = mm.vmas.Remove(vseg) vseg = vgap.NextSegment() } return vgap, droppedIDs } // canWriteMappableLocked returns true if it is possible for vma.mappable to be // written to via this vma, i.e. 
if it is possible that // vma.mappable.Translate(at.Write=true) may be called as a result of this vma. // This includes via I/O with usermem.IOOpts.IgnorePermissions = true, such as // PTRACE_POKEDATA. // // canWriteMappableLocked is equivalent to Linux's VM_SHARED. // // Preconditions: mm.mappingMu must be locked. func (v *vma) canWriteMappableLocked() bool { return !v.private && v.maxPerms.Write } // isPrivateDataLocked identify the data segments - private, writable, not stack // // Preconditions: mm.mappingMu must be locked. func (v *vma) isPrivateDataLocked() bool { return v.realPerms.Write && v.private && !v.growsDown } // vmaSetFunctions implements segment.Functions for vmaSet. type vmaSetFunctions struct{} func (vmaSetFunctions) MinKey() hostarch.Addr { return 0 } func (vmaSetFunctions) MaxKey() hostarch.Addr { return ^hostarch.Addr(0) } func (vmaSetFunctions) ClearValue(vma *vma) { vma.mappable = nil vma.id = nil vma.hint = "" atomic.StoreUintptr(&vma.lastFault, 0) } func (vmaSetFunctions) Merge(ar1 hostarch.AddrRange, vma1 vma, ar2 hostarch.AddrRange, vma2 vma) (vma, bool) { if vma1.mappable != vma2.mappable || (vma1.mappable != nil && vma1.off+uint64(ar1.Length()) != vma2.off) || vma1.realPerms != vma2.realPerms || vma1.maxPerms != vma2.maxPerms || vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || vma1.isStack != vma2.isStack || vma1.mlockMode != vma2.mlockMode || vma1.numaPolicy != vma2.numaPolicy || vma1.numaNodemask != vma2.numaNodemask || vma1.dontfork != vma2.dontfork || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false } if vma2.id != nil { // This DecRef() will never be the final ref, since the vma1 is // currently holding a ref to the same mapping identity. Thus, we don't // need to worry about whether we're in a mm.mappingMu critical section. vma2.id.DecRef(context.Background()) } return vma1, true } func (vmaSetFunctions) Split(ar hostarch.AddrRange, v vma, split hostarch.Addr) (vma, vma) { v2 := v if v2.mappable != nil { v2.off += uint64(split - ar.Start) } if v2.id != nil { v2.id.IncRef() } return v, v2 } // Preconditions: // - vseg.ValuePtr().mappable != nil. // - vseg.Range().Contains(addr). func (vseg vmaIterator) mappableOffsetAt(addr hostarch.Addr) uint64 { if checkInvariants { if !vseg.Ok() { panic("terminal vma iterator") } if vseg.ValuePtr().mappable == nil { panic("Mappable offset is meaningless for anonymous vma") } if !vseg.Range().Contains(addr) { panic(fmt.Sprintf("addr %v out of bounds %v", addr, vseg.Range())) } } vma := vseg.ValuePtr() vstart := vseg.Start() return vma.off + uint64(addr-vstart) } // Preconditions: vseg.ValuePtr().mappable != nil. func (vseg vmaIterator) mappableRange() memmap.MappableRange { return vseg.mappableRangeOf(vseg.Range()) } // Preconditions: // - vseg.ValuePtr().mappable != nil. // - vseg.Range().IsSupersetOf(ar). // - ar.Length() != 0. func (vseg vmaIterator) mappableRangeOf(ar hostarch.AddrRange) memmap.MappableRange { if checkInvariants { if !vseg.Ok() { panic("terminal vma iterator") } if vseg.ValuePtr().mappable == nil { panic("MappableRange is meaningless for anonymous vma") } if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Range().IsSupersetOf(ar) { panic(fmt.Sprintf("ar %v out of bounds %v", ar, vseg.Range())) } } vma := vseg.ValuePtr() vstart := vseg.Start() return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)} } // Preconditions: // - vseg.ValuePtr().mappable != nil. 
// - vseg.mappableRange().IsSupersetOf(mr). // - mr.Length() != 0. func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) hostarch.AddrRange { if checkInvariants { if !vseg.Ok() { panic("terminal vma iterator") } if vseg.ValuePtr().mappable == nil { panic("MappableRange is meaningless for anonymous vma") } if !mr.WellFormed() || mr.Length() == 0 { panic(fmt.Sprintf("invalid mr: %v", mr)) } if !vseg.mappableRange().IsSupersetOf(mr) { panic(fmt.Sprintf("mr %v out of bounds %v", mr, vseg.mappableRange())) } } vma := vseg.ValuePtr() vstart := vseg.Start() return hostarch.AddrRange{vstart + hostarch.Addr(mr.Start-vma.off), vstart + hostarch.Addr(mr.End-vma.off)} } // seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by // scanning linearly forward from vseg. // // Preconditions: // - mm.mappingMu must be locked. // - addr >= vseg.Start(). func (vseg vmaIterator) seekNextLowerBound(addr hostarch.Addr) vmaIterator { if checkInvariants { if !vseg.Ok() { panic("terminal vma iterator") } if addr < vseg.Start() { panic(fmt.Sprintf("can't seek forward to %#x from %#x", addr, vseg.Start())) } } for vseg.Ok() && addr >= vseg.End() { vseg = vseg.NextSegment() } return vseg } // availableRange returns the subset of vgap.Range() in which new vmas may be // created without MMapOpts.Unmap == true. func (vgap vmaGapIterator) availableRange() hostarch.AddrRange { ar := vgap.Range() next := vgap.NextSegment() if !next.Ok() || !next.ValuePtr().growsDown { return ar } // Exclude guard pages. if ar.Length() < guardBytes { return hostarch.AddrRange{ar.Start, ar.Start} } ar.End -= guardBytes return ar } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/mm/vma_set.go000066400000000000000000002023311465435605700233470ustar00rootroot00000000000000package mm import ( __generics_imported0 "gvisor.dev/gvisor/pkg/hostarch" ) import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const vmatrackGaps = 1 var _ = uint8(vmatrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type vmadynamicGap [vmatrackGaps]__generics_imported0.Addr // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *vmadynamicGap) Get() __generics_imported0.Addr { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *vmadynamicGap) Set(v __generics_imported0.Addr) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. vmaminDegree = 8 vmamaxDegree = 2 * vmaminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type vmaSet struct { root vmanode `state:".([]vmaFlatSegment)"` } // IsEmpty returns true if the set contains no segments. 
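//
// An illustrative sketch, not part of the original source: a caller that
// already holds mm.mappingMu could walk every vma in the set like this, where
// logf is a hypothetical logging helper assumed only for the example:
//
//	if !mm.vmas.IsEmpty() {
//		for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
//			logf("vma %v => %+v", vseg.Range(), vseg.Value())
//		}
//	}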
func (s *vmaSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *vmaSet) IsEmptyRange(r __generics_imported0.AddrRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *vmaSet) Span() __generics_imported0.Addr { var sz __generics_imported0.Addr for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *vmaSet) SpanRange(r __generics_imported0.AddrRange) __generics_imported0.Addr { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz __generics_imported0.Addr for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *vmaSet) FirstSegment() vmaIterator { if s.root.nrSegments == 0 { return vmaIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *vmaSet) LastSegment() vmaIterator { if s.root.nrSegments == 0 { return vmaIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *vmaSet) FirstGap() vmaGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return vmaGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *vmaSet) LastGap() vmaGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return vmaGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *vmaSet) Find(key __generics_imported0.Addr) (vmaIterator, vmaGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return vmaIterator{n, i}, vmaGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return vmaIterator{}, vmaGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *vmaSet) FindSegment(key __generics_imported0.Addr) vmaIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *vmaSet) LowerBoundSegment(min __generics_imported0.Addr) vmaIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. 
If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *vmaSet) UpperBoundSegment(max __generics_imported0.Addr) vmaIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *vmaSet) FindGap(key __generics_imported0.Addr) vmaGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *vmaSet) LowerBoundGap(min __generics_imported0.Addr) vmaGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *vmaSet) UpperBoundGap(max __generics_imported0.Addr) vmaGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *vmaSet) FirstLargeEnoughGap(minSize __generics_imported0.Addr) vmaGapIterator { if vmatrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *vmaSet) LastLargeEnoughGap(minSize __generics_imported0.Addr) vmaGapIterator { if vmatrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *vmaSet) LowerBoundLargeEnoughGap(min, minSize __generics_imported0.Addr) vmaGapIterator { if vmatrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *vmaSet) UpperBoundLargeEnoughGap(max, minSize __generics_imported0.Addr) vmaGapIterator { if vmatrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. 
// // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. func (s *vmaSet) Insert(gap vmaGapIterator, r __generics_imported0.AddrRange, val vma) vmaIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (vmaSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := vmatrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (vmaSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (vmaSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := vmatrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *vmaSet) InsertWithoutMerging(gap vmaGapIterator, r __generics_imported0.AddrRange, val vma) vmaIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *vmaSet) InsertWithoutMergingUnchecked(gap vmaGapIterator, r __generics_imported0.AddrRange, val vma) vmaIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := vmatrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return vmaIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). 
All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. // // InsertRange searches the set to find the gap to insert into. If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *vmaSet) InsertRange(r __generics_imported0.AddrRange, val vma) vmaIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *vmaSet) InsertWithoutMergingRange(r __generics_imported0.AddrRange, val vma) vmaIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *vmaSet) TryInsertRange(r __generics_imported0.AddrRange, val vma) vmaIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return vmaIterator{} } if gap.End() < r.End { return vmaIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. 
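//
// An illustrative sketch, not part of the original source, showing the
// non-panicking variant; ar and val are assumed to be a valid
// hostarch.AddrRange and a prepared vma supplied by the caller:
//
//	if seg := s.TryInsertWithoutMergingRange(ar, val); !seg.Ok() {
//		// ar overlapped an existing segment; report the conflict or pick
//		// another range instead of panicking.
//	}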
func (s *vmaSet) TryInsertWithoutMergingRange(r __generics_imported0.AddrRange, val vma) vmaIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return vmaIterator{} } if gap.End() < r.End { return vmaIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *vmaSet) Remove(seg vmaIterator) vmaGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if vmatrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) vmaSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if vmatrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(vmaGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *vmaSet) RemoveAll() { s.root = vmanode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *vmaSet) RemoveRange(r __generics_imported0.AddrRange) vmaGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *vmaSet) RemoveFullRange(r __generics_imported0.AddrRange) vmaGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *vmaSet) Merge(first, second vmaIterator) vmaIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. Otherwise, MergeUnchecked returns a terminal // iterator. 
// // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *vmaSet) MergeUnchecked(first, second vmaIterator) vmaIterator { if first.End() == second.Start() { if mval, ok := (vmaSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return vmaIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *vmaSet) MergePrev(seg vmaIterator) vmaIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *vmaSet) MergeNext(seg vmaIterator) vmaIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *vmaSet) Unisolate(seg vmaIterator) vmaIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *vmaSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. 
// // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *vmaSet) MergeInsideRange(r __generics_imported0.AddrRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *vmaSet) MergeOutsideRange(r __generics_imported0.AddrRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *vmaSet) Split(seg vmaIterator, split __generics_imported0.Addr) (vmaIterator, vmaIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *vmaSet) SplitUnchecked(seg vmaIterator, split __generics_imported0.Addr) (vmaIterator, vmaIterator) { val1, val2 := (vmaSetFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.AddrRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter; i.e. SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first. 
// // Preconditions: start < seg.End(). func (s *vmaSet) SplitBefore(seg vmaIterator, start __generics_imported0.Addr) vmaIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *vmaSet) SplitAfter(seg vmaIterator, end __generics_imported0.Addr) vmaIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *vmaSet) Isolate(seg vmaIterator, r __generics_imported0.AddrRange) vmaIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *vmaSet) LowerBoundSegmentSplitBefore(min __generics_imported0.Addr) vmaIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *vmaSet) UpperBoundSegmentSplitAfter(max __generics_imported0.Addr) vmaIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. 
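//
// An illustrative sketch, not part of the original source: summing the mapped
// bytes that intersect ar without mutating the set (hostarch is imported by
// this generated file under the alias __generics_imported0):
//
//	var mapped hostarch.Addr
//	s.VisitRange(ar, func(seg vmaIterator) bool {
//		mapped += seg.Range().Intersect(ar).Length()
//		return true
//	})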
func (s *vmaSet) VisitRange(r __generics_imported0.AddrRange, f func(seg vmaIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *vmaSet) VisitFullRange(r __generics_imported0.AddrRange, f func(seg vmaIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *vmaSet) MutateRange(r __generics_imported0.AddrRange, f func(seg vmaIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *vmaSet) MutateFullRange(r __generics_imported0.AddrRange, f func(seg vmaIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type vmanode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *vmanode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. 
maxGap vmadynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [vmamaxDegree - 1]__generics_imported0.AddrRange values [vmamaxDegree - 1]vma children [vmamaxDegree]*vmanode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *vmanode) firstSegment() vmaIterator { for n.hasChildren { n = n.children[0] } return vmaIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *vmanode) lastSegment() vmaIterator { for n.hasChildren { n = n.children[n.nrSegments] } return vmaIterator{n, n.nrSegments - 1} } func (n *vmanode) prevSibling() *vmanode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *vmanode) nextSibling() *vmanode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *vmanode) rebalanceBeforeInsert(gap vmaGapIterator) vmaGapIterator { if n.nrSegments < vmamaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &vmanode{ nrSegments: vmaminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &vmanode{ nrSegments: vmaminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:vmaminDegree-1], n.keys[:vmaminDegree-1]) copy(left.values[:vmaminDegree-1], n.values[:vmaminDegree-1]) copy(right.keys[:vmaminDegree-1], n.keys[vmaminDegree:]) copy(right.values[:vmaminDegree-1], n.values[vmaminDegree:]) n.keys[0], n.values[0] = n.keys[vmaminDegree-1], n.values[vmaminDegree-1] vmazeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:vmaminDegree], n.children[:vmaminDegree]) copy(right.children[:vmaminDegree], n.children[vmaminDegree:]) vmazeroNodeSlice(n.children[2:]) for i := 0; i < vmaminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if vmatrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < vmaminDegree { return vmaGapIterator{left, gap.index} } return vmaGapIterator{right, gap.index - vmaminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[vmaminDegree-1], n.values[vmaminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &vmanode{ nrSegments: vmaminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ copy(sibling.keys[:vmaminDegree-1], n.keys[vmaminDegree:]) copy(sibling.values[:vmaminDegree-1], n.values[vmaminDegree:]) vmazeroValueSlice(n.values[vmaminDegree-1:]) if n.hasChildren { copy(sibling.children[:vmaminDegree], 
n.children[vmaminDegree:]) vmazeroNodeSlice(n.children[vmaminDegree:]) for i := 0; i < vmaminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = vmaminDegree - 1 if vmatrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < vmaminDegree { return gap } return vmaGapIterator{sibling, gap.index - vmaminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *vmanode) rebalanceAfterRemove(gap vmaGapIterator) vmaGapIterator { for { if n.nrSegments >= vmaminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= vmaminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] vmaSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if vmatrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return vmaGapIterator{n, 0} } if gap.node == n { return vmaGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= vmaminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) vmaSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if vmatrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return vmaGapIterator{n, n.nrSegments} } return vmaGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], 
right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return vmaGapIterator{p, gap.index} } if gap.node == right { return vmaGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *vmanode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = vmaGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) vmaSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if vmatrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *vmanode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *vmanode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. func (n *vmanode) calculateMaxGapLeaf() __generics_imported0.Addr { max := vmaGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (vmaGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. 
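//
// An illustrative sketch, not part of the original source: the maxGap values
// maintained by these helpers are what allow callers to find free address
// space without scanning every gap, e.g. (minAddr and length are assumed
// inputs):
//
//	if gap := s.LowerBoundLargeEnoughGap(minAddr, length); gap.Ok() {
//		// gap spans at least length bytes and contains an address >= minAddr.
//	}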
func (n *vmanode) calculateMaxGapInternal() __generics_imported0.Addr { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *vmanode) searchFirstLargeEnoughGap(minSize __generics_imported0.Addr) vmaGapIterator { if n.maxGap.Get() < minSize { return vmaGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := vmaGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *vmanode) searchLastLargeEnoughGap(minSize __generics_imported0.Addr) vmaGapIterator { if n.maxGap.Get() < minSize { return vmaGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := vmaGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type vmaIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *vmanode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg vmaIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg vmaIterator) Range() __generics_imported0.AddrRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg vmaIterator) Start() __generics_imported0.Addr { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg vmaIterator) End() __generics_imported0.Addr { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. // - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg vmaIterator) SetRangeUnchecked(r __generics_imported0.AddrRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. 
If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. func (seg vmaIterator) SetRange(r __generics_imported0.AddrRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg vmaIterator) SetStartUnchecked(start __generics_imported0.Addr) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg vmaIterator) SetStart(start __generics_imported0.Addr) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg vmaIterator) SetEndUnchecked(end __generics_imported0.Addr) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg vmaIterator) SetEnd(end __generics_imported0.Addr) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg vmaIterator) Value() vma { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg vmaIterator) ValuePtr() *vma { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. func (seg vmaIterator) SetValue(val vma) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. 
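//
// An illustrative sketch, not part of the original source: walking the set in
// order of decreasing keys:
//
//	for seg := s.LastSegment(); seg.Ok(); seg = seg.PrevSegment() {
//		// Inspect seg.Range() and seg.ValuePtr() here.
//	}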
func (seg vmaIterator) PrevSegment() vmaIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return vmaIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return vmaIterator{} } return vmasegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg vmaIterator) NextSegment() vmaIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return vmaIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return vmaIterator{} } return vmasegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg vmaIterator) PrevGap() vmaGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return vmaGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg vmaIterator) NextGap() vmaGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return vmaGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg vmaIterator) PrevNonEmpty() (vmaIterator, vmaGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, vmaGapIterator{} } return vmaIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg vmaIterator) NextNonEmpty() (vmaIterator, vmaGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, vmaGapIterator{} } return vmaIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type vmaGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *vmanode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. 
func (gap vmaGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap vmaGapIterator) Range() __generics_imported0.AddrRange { return __generics_imported0.AddrRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap vmaGapIterator) Start() __generics_imported0.Addr { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return vmaSetFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap vmaGapIterator) End() __generics_imported0.Addr { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return vmaSetFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap vmaGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap vmaGapIterator) PrevSegment() vmaIterator { return vmasegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap vmaGapIterator) NextSegment() vmaIterator { return vmasegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap vmaGapIterator) PrevGap() vmaGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return vmaGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap vmaGapIterator) NextGap() vmaGapIterator { seg := gap.NextSegment() if !seg.Ok() { return vmaGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap vmaGapIterator) NextLargeEnoughGap(minSize __generics_imported0.Addr) vmaGapIterator { if vmatrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. 
func (gap vmaGapIterator) nextLargeEnoughGapHelper(minSize __generics_imported0.Addr) vmaGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return vmaGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap vmaGapIterator) PrevLargeEnoughGap(minSize __generics_imported0.Addr) vmaGapIterator { if vmatrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap vmaGapIterator) prevLargeEnoughGapHelper(minSize __generics_imported0.Addr) vmaGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return vmaGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func vmasegmentBeforePosition(n *vmanode, i int) vmaIterator { for i == 0 { if n.parent == nil { return vmaIterator{} } n, i = n.parent, n.parentIndex } return vmaIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func vmasegmentAfterPosition(n *vmanode, i int) vmaIterator { for i == n.nrSegments { if n.parent == nil { return vmaIterator{} } n, i = n.parent, n.parentIndex } return vmaIterator{n, i} } func vmazeroValueSlice(slice []vma) { for i := range slice { vmaSetFunctions{}.ClearValue(&slice[i]) } } func vmazeroNodeSlice(slice []*vmanode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. func (s *vmaSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. 
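// Illustrative sketch, not part of the generated file: a first-fit search for
// an unmapped range of at least length bytes using the gap iterator above.
// findFreeRange is a hypothetical helper, and it assumes the set was
// instantiated with trackGaps == 1, since NextLargeEnoughGap panics otherwise.
// This mirrors what the template's FirstLargeEnoughGap packages up.
func findFreeRange(s *vmaSet, length __generics_imported0.Addr) (__generics_imported0.AddrRange, bool) {
	gap := s.FirstGap()
	if gap.Range().Length() < length {
		gap = gap.NextLargeEnoughGap(length)
	}
	if !gap.Ok() {
		return __generics_imported0.AddrRange{}, false
	}
	start := gap.Start()
	return __generics_imported0.AddrRange{Start: start, End: start + length}, true
}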
func (n *vmanode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *vmanode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if vmatrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type vmaFlatSegment struct { Start __generics_imported0.Addr End __generics_imported0.Addr Value vma } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *vmaSet) ExportSlice() []vmaFlatSegment { var fs []vmaFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, vmaFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *vmaSet) ImportSlice(fs []vmaFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := __generics_imported0.AddrRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
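// Illustrative sketch, not part of the generated file: copying one set into
// another through the flat representation, the same path used by the
// saveRoot/loadRoot helpers below for save/restore. cloneInto is a
// hypothetical helper; dst is assumed to be empty, since ImportSlice requires
// an empty destination set.
func cloneInto(dst, src *vmaSet) error {
	// ExportSlice returns segments in ascending key order with
	// non-overlapping ranges, which is exactly what ImportSlice requires.
	return dst.ImportSlice(src.ExportSlice())
}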
func (s *vmaSet) segmentTestCheck(expectedSegments int, segFunc func(int, __generics_imported0.AddrRange, vma) error) error { havePrev := false prev := __generics_imported0.Addr(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *vmaSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *vmaSet) saveRoot() []vmaFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *vmaSet) loadRoot(_ context.Context, fs []vmaFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/000077500000000000000000000000001465435605700223715ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/context.go000066400000000000000000000035301465435605700244050ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pgalloc import ( "context" ) // contextID is this package's type for context.Context.Value keys. type contextID int const ( // CtxMemoryFile is a Context.Value key for a MemoryFile. CtxMemoryFile contextID = iota // CtxMemoryCgroupID is the memory cgroup id which the task belongs to. CtxMemoryCgroupID // CtxMemoryFileMap is a Context.Value key for mapping // MemoryFileOpts.RestoreID to *MemoryFile. This is used for save/restore. CtxMemoryFileMap ) // MemoryFileFromContext returns the MemoryFile used by ctx, or nil if no such // MemoryFile exists. func MemoryFileFromContext(ctx context.Context) *MemoryFile { if v := ctx.Value(CtxMemoryFile); v != nil { return v.(*MemoryFile) } return nil } // MemoryCgroupIDFromContext returns the memory cgroup id of the ctx, or // zero if the ctx does not belong to any memory cgroup. func MemoryCgroupIDFromContext(ctx context.Context) uint32 { if v := ctx.Value(CtxMemoryCgroupID); v != nil { return v.(uint32) } return 0 } // MemoryFileMapFromContext returns the memory file map used by ctx, or nil if // no such map exists. 
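// Illustrative sketch, not part of this file: how a MemoryFile is typically
// made available through a Context and later recovered with
// MemoryFileFromContext. Building the Context with context.WithValue is an
// assumption for demonstration; within gVisor, task and kernel contexts
// supply these values through their own Value implementations.
func exampleWithMemoryFile(ctx context.Context, mf *MemoryFile) context.Context {
	// Callers further down the stack can recover mf with
	// MemoryFileFromContext(ctx).
	return context.WithValue(ctx, CtxMemoryFile, mf)
}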
func MemoryFileMapFromContext(ctx context.Context) map[string]*MemoryFile { if v := ctx.Value(CtxMemoryFileMap); v != nil { return v.(map[string]*MemoryFile) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/evictable_range.go000066400000000000000000000034451465435605700260400ustar00rootroot00000000000000package pgalloc // A Range represents a contiguous range of T. // // +stateify savable type EvictableRange struct { // Start is the inclusive start of the range. Start uint64 // End is the exclusive end of the range. End uint64 } // WellFormed returns true if r.Start <= r.End. All other methods on a Range // require that the Range is well-formed. // //go:nosplit func (r EvictableRange) WellFormed() bool { return r.Start <= r.End } // Length returns the length of the range. // //go:nosplit func (r EvictableRange) Length() uint64 { return r.End - r.Start } // Contains returns true if r contains x. // //go:nosplit func (r EvictableRange) Contains(x uint64) bool { return r.Start <= x && x < r.End } // Overlaps returns true if r and r2 overlap. // //go:nosplit func (r EvictableRange) Overlaps(r2 EvictableRange) bool { return r.Start < r2.End && r2.Start < r.End } // IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is // contained within r. // //go:nosplit func (r EvictableRange) IsSupersetOf(r2 EvictableRange) bool { return r.Start <= r2.Start && r.End >= r2.End } // Intersect returns a range consisting of the intersection between r and r2. // If r and r2 do not overlap, Intersect returns a range with unspecified // bounds, but for which Length() == 0. // //go:nosplit func (r EvictableRange) Intersect(r2 EvictableRange) EvictableRange { if r.Start < r2.Start { r.Start = r2.Start } if r.End > r2.End { r.End = r2.End } if r.End < r.Start { r.End = r.Start } return r } // CanSplitAt returns true if it is legal to split a segment spanning the range // r at x; that is, splitting at x would produce two ranges, both of which have // non-zero length. // //go:nosplit func (r EvictableRange) CanSplitAt(x uint64) bool { return r.Contains(x) && r.Start < x } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/evictable_range_set.go000066400000000000000000002100201465435605700267000ustar00rootroot00000000000000package pgalloc import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const evictableRangetrackGaps = 0 var _ = uint8(evictableRangetrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type evictableRangedynamicGap [evictableRangetrackGaps]uint64 // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *evictableRangedynamicGap) Get() uint64 { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *evictableRangedynamicGap) Set(v uint64) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. 
Higher values of minDegree // usually improve performance, but increase memory usage for small sets. evictableRangeminDegree = 3 evictableRangemaxDegree = 2 * evictableRangeminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type evictableRangeSet struct { root evictableRangenode `state:".([]evictableRangeFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *evictableRangeSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *evictableRangeSet) IsEmptyRange(r EvictableRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *evictableRangeSet) Span() uint64 { var sz uint64 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *evictableRangeSet) SpanRange(r EvictableRange) uint64 { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uint64 for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *evictableRangeSet) FirstSegment() evictableRangeIterator { if s.root.nrSegments == 0 { return evictableRangeIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *evictableRangeSet) LastSegment() evictableRangeIterator { if s.root.nrSegments == 0 { return evictableRangeIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *evictableRangeSet) FirstGap() evictableRangeGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return evictableRangeGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *evictableRangeSet) LastGap() evictableRangeGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return evictableRangeGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *evictableRangeSet) Find(key uint64) (evictableRangeIterator, evictableRangeGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return evictableRangeIterator{n, i}, evictableRangeGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return evictableRangeIterator{}, evictableRangeGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. 
If no // such segment exists, FindSegment returns a terminal iterator. func (s *evictableRangeSet) FindSegment(key uint64) evictableRangeIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *evictableRangeSet) LowerBoundSegment(min uint64) evictableRangeIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *evictableRangeSet) UpperBoundSegment(max uint64) evictableRangeIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *evictableRangeSet) FindGap(key uint64) evictableRangeGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *evictableRangeSet) LowerBoundGap(min uint64) evictableRangeGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *evictableRangeSet) UpperBoundGap(max uint64) evictableRangeGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *evictableRangeSet) FirstLargeEnoughGap(minSize uint64) evictableRangeGapIterator { if evictableRangetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *evictableRangeSet) LastLargeEnoughGap(minSize uint64) evictableRangeGapIterator { if evictableRangetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *evictableRangeSet) LowerBoundLargeEnoughGap(min, minSize uint64) evictableRangeGapIterator { if evictableRangetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. 
func (s *evictableRangeSet) UpperBoundLargeEnoughGap(max, minSize uint64) evictableRangeGapIterator { if evictableRangetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. func (s *evictableRangeSet) Insert(gap evictableRangeGapIterator, r EvictableRange, val evictableRangeSetValue) evictableRangeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (evictableRangeSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := evictableRangetrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (evictableRangeSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (evictableRangeSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := evictableRangetrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *evictableRangeSet) InsertWithoutMerging(gap evictableRangeGapIterator, r EvictableRange, val evictableRangeSetValue) evictableRangeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). 
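// Illustrative sketch, not part of the generated file: inserting a range only
// when nothing in the set overlaps it, using Find to locate the surrounding
// gap first. insertIfFree is a hypothetical helper; TryInsertRange below
// packages up the same behavior.
func insertIfFree(s *evictableRangeSet, r EvictableRange, val evictableRangeSetValue) bool {
	seg, gap := s.Find(r.Start)
	if seg.Ok() {
		// r.Start already lies inside an existing segment.
		return false
	}
	if gap.End() < r.End {
		// r extends into the segment following the gap.
		return false
	}
	s.Insert(gap, r, val)
	return true
}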
func (s *evictableRangeSet) InsertWithoutMergingUnchecked(gap evictableRangeGapIterator, r EvictableRange, val evictableRangeSetValue) evictableRangeIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := evictableRangetrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return evictableRangeIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. // // InsertRange searches the set to find the gap to insert into. If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *evictableRangeSet) InsertRange(r EvictableRange, val evictableRangeSetValue) evictableRangeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *evictableRangeSet) InsertWithoutMergingRange(r EvictableRange, val evictableRangeSetValue) evictableRangeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. 
If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *evictableRangeSet) TryInsertRange(r EvictableRange, val evictableRangeSetValue) evictableRangeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return evictableRangeIterator{} } if gap.End() < r.End { return evictableRangeIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. func (s *evictableRangeSet) TryInsertWithoutMergingRange(r EvictableRange, val evictableRangeSetValue) evictableRangeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return evictableRangeIterator{} } if gap.End() < r.End { return evictableRangeIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *evictableRangeSet) Remove(seg evictableRangeIterator) evictableRangeGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if evictableRangetrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) evictableRangeSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if evictableRangetrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(evictableRangeGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *evictableRangeSet) RemoveAll() { s.root = evictableRangenode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. 
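// Illustrative sketch, not part of the generated file: removing every segment
// intersecting r while doing per-segment work first, the iterate-and-Remove
// pattern recommended above when RemoveRange alone is not enough.
// removeAndVisit is a hypothetical helper; f sees each (possibly split)
// segment's range and value just before it is removed. It relies on
// LowerBoundSegmentSplitBefore and SplitAfter, defined later in this file.
func removeAndVisit(s *evictableRangeSet, r EvictableRange, f func(EvictableRange, evictableRangeSetValue)) {
	seg := s.LowerBoundSegmentSplitBefore(r.Start)
	for seg.Ok() && seg.Start() < r.End {
		// Trim the segment to r before handing it to f and removing it.
		seg = s.SplitAfter(seg, r.End)
		f(seg.Range(), seg.Value())
		gap := s.Remove(seg)
		seg = gap.NextSegment()
	}
}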
func (s *evictableRangeSet) RemoveRange(r EvictableRange) evictableRangeGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *evictableRangeSet) RemoveFullRange(r EvictableRange) evictableRangeGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *evictableRangeSet) Merge(first, second evictableRangeIterator) evictableRangeIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. Otherwise, MergeUnchecked returns a terminal // iterator. // // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *evictableRangeSet) MergeUnchecked(first, second evictableRangeIterator) evictableRangeIterator { if first.End() == second.Start() { if mval, ok := (evictableRangeSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return evictableRangeIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *evictableRangeSet) MergePrev(seg evictableRangeIterator) evictableRangeIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. 
In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *evictableRangeSet) MergeNext(seg evictableRangeIterator) evictableRangeIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *evictableRangeSet) Unisolate(seg evictableRangeIterator) evictableRangeIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *evictableRangeSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. // // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *evictableRangeSet) MergeInsideRange(r EvictableRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *evictableRangeSet) MergeOutsideRange(r EvictableRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. 
// // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *evictableRangeSet) Split(seg evictableRangeIterator, split uint64) (evictableRangeIterator, evictableRangeIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *evictableRangeSet) SplitUnchecked(seg evictableRangeIterator, split uint64) (evictableRangeIterator, evictableRangeIterator) { val1, val2 := (evictableRangeSetFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), EvictableRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter; i.e. SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first. // // Preconditions: start < seg.End(). func (s *evictableRangeSet) SplitBefore(seg evictableRangeIterator, start uint64) evictableRangeIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *evictableRangeSet) SplitAfter(seg evictableRangeIterator, end uint64) evictableRangeIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. 
// // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *evictableRangeSet) Isolate(seg evictableRangeIterator, r EvictableRange) evictableRangeIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *evictableRangeSet) LowerBoundSegmentSplitBefore(min uint64) evictableRangeIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *evictableRangeSet) UpperBoundSegmentSplitAfter(max uint64) evictableRangeIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. func (s *evictableRangeSet) VisitRange(r EvictableRange, f func(seg evictableRangeIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *evictableRangeSet) VisitFullRange(r EvictableRange, f func(seg evictableRangeIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. 
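// Illustrative sketch, not part of the generated file: mutating just the part
// of an existing segment that overlaps r, using Isolate to split at the
// boundaries and Unisolate to re-merge afterwards. mutateWithin is a
// hypothetical helper; seg is assumed to be a segment overlapping r, and f
// must not invalidate iterators into s.
func mutateWithin(s *evictableRangeSet, seg evictableRangeIterator, r EvictableRange, f func(evictableRangeIterator)) evictableRangeIterator {
	// Bound the mutation to the intersection of seg and r.
	seg = s.Isolate(seg, r)
	f(seg)
	// Re-merge with neighbors if f left the segment mergeable again.
	return s.Unisolate(seg)
}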
func (s *evictableRangeSet) MutateRange(r EvictableRange, f func(seg evictableRangeIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *evictableRangeSet) MutateFullRange(r EvictableRange, f func(seg evictableRangeIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type evictableRangenode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *evictableRangenode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. maxGap evictableRangedynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [evictableRangemaxDegree - 1]EvictableRange values [evictableRangemaxDegree - 1]evictableRangeSetValue children [evictableRangemaxDegree]*evictableRangenode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *evictableRangenode) firstSegment() evictableRangeIterator { for n.hasChildren { n = n.children[0] } return evictableRangeIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. 
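// Illustrative sketch, not part of the generated file: applying a value
// change to everything in r with MutateRange, which performs the boundary
// splitting and re-merging described above. touchRange and update are
// hypothetical names; update must not invalidate iterators into s.
func touchRange(s *evictableRangeSet, r EvictableRange, update func(evictableRangeSetValue) evictableRangeSetValue) {
	s.MutateRange(r, func(seg evictableRangeIterator) bool {
		seg.SetValue(update(seg.Value()))
		return true // continue to the next segment in r
	})
}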
func (n *evictableRangenode) lastSegment() evictableRangeIterator { for n.hasChildren { n = n.children[n.nrSegments] } return evictableRangeIterator{n, n.nrSegments - 1} } func (n *evictableRangenode) prevSibling() *evictableRangenode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *evictableRangenode) nextSibling() *evictableRangenode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *evictableRangenode) rebalanceBeforeInsert(gap evictableRangeGapIterator) evictableRangeGapIterator { if n.nrSegments < evictableRangemaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &evictableRangenode{ nrSegments: evictableRangeminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &evictableRangenode{ nrSegments: evictableRangeminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:evictableRangeminDegree-1], n.keys[:evictableRangeminDegree-1]) copy(left.values[:evictableRangeminDegree-1], n.values[:evictableRangeminDegree-1]) copy(right.keys[:evictableRangeminDegree-1], n.keys[evictableRangeminDegree:]) copy(right.values[:evictableRangeminDegree-1], n.values[evictableRangeminDegree:]) n.keys[0], n.values[0] = n.keys[evictableRangeminDegree-1], n.values[evictableRangeminDegree-1] evictableRangezeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:evictableRangeminDegree], n.children[:evictableRangeminDegree]) copy(right.children[:evictableRangeminDegree], n.children[evictableRangeminDegree:]) evictableRangezeroNodeSlice(n.children[2:]) for i := 0; i < evictableRangeminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if evictableRangetrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < evictableRangeminDegree { return evictableRangeGapIterator{left, gap.index} } return evictableRangeGapIterator{right, gap.index - evictableRangeminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[evictableRangeminDegree-1], n.values[evictableRangeminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &evictableRangenode{ nrSegments: evictableRangeminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ copy(sibling.keys[:evictableRangeminDegree-1], n.keys[evictableRangeminDegree:]) copy(sibling.values[:evictableRangeminDegree-1], n.values[evictableRangeminDegree:]) evictableRangezeroValueSlice(n.values[evictableRangeminDegree-1:]) if n.hasChildren { copy(sibling.children[:evictableRangeminDegree], n.children[evictableRangeminDegree:]) 
evictableRangezeroNodeSlice(n.children[evictableRangeminDegree:]) for i := 0; i < evictableRangeminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = evictableRangeminDegree - 1 if evictableRangetrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < evictableRangeminDegree { return gap } return evictableRangeGapIterator{sibling, gap.index - evictableRangeminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *evictableRangenode) rebalanceAfterRemove(gap evictableRangeGapIterator) evictableRangeGapIterator { for { if n.nrSegments >= evictableRangeminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= evictableRangeminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] evictableRangeSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if evictableRangetrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return evictableRangeGapIterator{n, 0} } if gap.node == n { return evictableRangeGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= evictableRangeminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) evictableRangeSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if evictableRangetrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return evictableRangeGapIterator{n, n.nrSegments} } return evictableRangeGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], 
left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return evictableRangeGapIterator{p, gap.index} } if gap.node == right { return evictableRangeGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *evictableRangenode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = evictableRangeGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) evictableRangeSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if evictableRangetrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *evictableRangenode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *evictableRangenode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. 
func (n *evictableRangenode) calculateMaxGapLeaf() uint64 { max := evictableRangeGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (evictableRangeGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. func (n *evictableRangenode) calculateMaxGapInternal() uint64 { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *evictableRangenode) searchFirstLargeEnoughGap(minSize uint64) evictableRangeGapIterator { if n.maxGap.Get() < minSize { return evictableRangeGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := evictableRangeGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *evictableRangenode) searchLastLargeEnoughGap(minSize uint64) evictableRangeGapIterator { if n.maxGap.Get() < minSize { return evictableRangeGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := evictableRangeGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type evictableRangeIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *evictableRangenode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg evictableRangeIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg evictableRangeIterator) Range() EvictableRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg evictableRangeIterator) Start() uint64 { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg evictableRangeIterator) End() uint64 { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. 
// // Preconditions: // - r.Length() > 0. // - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg evictableRangeIterator) SetRangeUnchecked(r EvictableRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. func (seg evictableRangeIterator) SetRange(r EvictableRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg evictableRangeIterator) SetStartUnchecked(start uint64) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg evictableRangeIterator) SetStart(start uint64) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg evictableRangeIterator) SetEndUnchecked(end uint64) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg evictableRangeIterator) SetEnd(end uint64) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg evictableRangeIterator) Value() evictableRangeSetValue { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg evictableRangeIterator) ValuePtr() *evictableRangeSetValue { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. 
This operation does not // invalidate any iterators. func (seg evictableRangeIterator) SetValue(val evictableRangeSetValue) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. func (seg evictableRangeIterator) PrevSegment() evictableRangeIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return evictableRangeIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return evictableRangeIterator{} } return evictableRangesegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg evictableRangeIterator) NextSegment() evictableRangeIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return evictableRangeIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return evictableRangeIterator{} } return evictableRangesegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg evictableRangeIterator) PrevGap() evictableRangeGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return evictableRangeGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg evictableRangeIterator) NextGap() evictableRangeGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return evictableRangeGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg evictableRangeIterator) PrevNonEmpty() (evictableRangeIterator, evictableRangeGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, evictableRangeGapIterator{} } return evictableRangeIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg evictableRangeIterator) NextNonEmpty() (evictableRangeIterator, evictableRangeGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, evictableRangeGapIterator{} } return evictableRangeIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. 
// // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type evictableRangeGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *evictableRangenode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (gap evictableRangeGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap evictableRangeGapIterator) Range() EvictableRange { return EvictableRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap evictableRangeGapIterator) Start() uint64 { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return evictableRangeSetFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap evictableRangeGapIterator) End() uint64 { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return evictableRangeSetFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap evictableRangeGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap evictableRangeGapIterator) PrevSegment() evictableRangeIterator { return evictableRangesegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap evictableRangeGapIterator) NextSegment() evictableRangeIterator { return evictableRangesegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap evictableRangeGapIterator) PrevGap() evictableRangeGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return evictableRangeGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap evictableRangeGapIterator) NextGap() evictableRangeGapIterator { seg := gap.NextSegment() if !seg.Ok() { return evictableRangeGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap evictableRangeGapIterator) NextLargeEnoughGap(minSize uint64) evictableRangeGapIterator { if evictableRangetrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. 
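// Illustrative sketch, not part of the generated template: walks every gap of
// a set in ascending order using the GapIterator methods above, skipping the
// zero-length gaps that separate adjacent segments. Note that an empty set
// still yields its single full-range gap. The helper name and callback are
// hypothetical; it assumes it sits in package pgalloc alongside this generated
// code.
func forEachNonEmptyEvictableGap(s *evictableRangeSet, f func(EvictableRange)) {
	for gap := s.FirstGap(); gap.Ok(); gap = gap.NextGap() {
		if gap.IsEmpty() {
			continue // zero-length gap between two adjacent segments
		}
		f(gap.Range())
	}
}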
// // Preconditions: gap is NOT the trailing gap of a non-leaf node. func (gap evictableRangeGapIterator) nextLargeEnoughGapHelper(minSize uint64) evictableRangeGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return evictableRangeGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap evictableRangeGapIterator) PrevLargeEnoughGap(minSize uint64) evictableRangeGapIterator { if evictableRangetrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap evictableRangeGapIterator) prevLargeEnoughGapHelper(minSize uint64) evictableRangeGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return evictableRangeGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func evictableRangesegmentBeforePosition(n *evictableRangenode, i int) evictableRangeIterator { for i == 0 { if n.parent == nil { return evictableRangeIterator{} } n, i = n.parent, n.parentIndex } return evictableRangeIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. 
func evictableRangesegmentAfterPosition(n *evictableRangenode, i int) evictableRangeIterator { for i == n.nrSegments { if n.parent == nil { return evictableRangeIterator{} } n, i = n.parent, n.parentIndex } return evictableRangeIterator{n, i} } func evictableRangezeroValueSlice(slice []evictableRangeSetValue) { for i := range slice { evictableRangeSetFunctions{}.ClearValue(&slice[i]) } } func evictableRangezeroNodeSlice(slice []*evictableRangenode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. func (s *evictableRangeSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. func (n *evictableRangenode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *evictableRangenode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if evictableRangetrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type evictableRangeFlatSegment struct { Start uint64 End uint64 Value evictableRangeSetValue } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *evictableRangeSet) ExportSlice() []evictableRangeFlatSegment { var fs []evictableRangeFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, evictableRangeFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *evictableRangeSet) ImportSlice(fs []evictableRangeFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := EvictableRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. 
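// Illustrative sketch, not part of the generated template: ExportSlice and
// ImportSlice above form a round trip, which is how the set is saved and
// restored (see saveRoot/loadRoot below). The helper name is hypothetical; it
// assumes it sits in package pgalloc alongside this generated code and that
// dst is empty, since ImportSlice returns an error for a non-empty set.
func copyEvictableRangeSet(dst, src *evictableRangeSet) error {
	// ExportSlice returns segments in ascending key order, which is the
	// order ImportSlice requires.
	return dst.ImportSlice(src.ExportSlice())
}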
// // This should be used only for testing, and has been added to this package for // templating convenience. func (s *evictableRangeSet) segmentTestCheck(expectedSegments int, segFunc func(int, EvictableRange, evictableRangeSetValue) error) error { havePrev := false prev := uint64(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *evictableRangeSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *evictableRangeSet) saveRoot() []evictableRangeFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *evictableRangeSet) loadRoot(_ context.Context, fs []evictableRangeFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/mappings_mutex.go000066400000000000000000000032071465435605700257620ustar00rootroot00000000000000package pgalloc import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type mappingsMutex struct { mu sync.Mutex } var mappingsprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var mappingslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type mappingslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *mappingsMutex) Lock() { locking.AddGLock(mappingsprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *mappingsMutex) NestedLock(i mappingslockNameIndex) { locking.AddGLock(mappingsprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *mappingsMutex) Unlock() { locking.DelGLock(mappingsprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *mappingsMutex) NestedUnlock(i mappingslockNameIndex) { locking.DelGLock(mappingsprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func mappingsinitLockNames() {} func init() { mappingsinitLockNames() mappingsprefixIndex = locking.NewMutexClass(reflect.TypeOf(mappingsMutex{}), mappingslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/memacct_set.go000066400000000000000000002036631465435605700252160ustar00rootroot00000000000000package pgalloc import ( __generics_imported0 "gvisor.dev/gvisor/pkg/sentry/memmap" ) import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. 
// // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const memAccttrackGaps = 1 var _ = uint8(memAccttrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type memAcctdynamicGap [memAccttrackGaps]uint64 // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *memAcctdynamicGap) Get() uint64 { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *memAcctdynamicGap) Set(v uint64) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. memAcctminDegree = 10 memAcctmaxDegree = 2 * memAcctminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type memAcctSet struct { root memAcctnode `state:".([]memAcctFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *memAcctSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *memAcctSet) IsEmptyRange(r __generics_imported0.FileRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *memAcctSet) Span() uint64 { var sz uint64 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *memAcctSet) SpanRange(r __generics_imported0.FileRange) uint64 { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uint64 for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *memAcctSet) FirstSegment() memAcctIterator { if s.root.nrSegments == 0 { return memAcctIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *memAcctSet) LastSegment() memAcctIterator { if s.root.nrSegments == 0 { return memAcctIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *memAcctSet) FirstGap() memAcctGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return memAcctGapIterator{n, 0} } // LastGap returns the last gap in the set. 
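// Illustrative sketch, not part of the generated template: populates a set
// with two disjoint file ranges and queries the accounted spans defined above.
// The function name is hypothetical; it assumes it sits in package pgalloc
// alongside this generated code, uses InsertRange (defined further down in
// this file), and uses memAcctInfo's zero value purely for illustration.
func memAcctSpanExample() (total, windowed uint64) {
	var s memAcctSet
	var info memAcctInfo
	s.InsertRange(__generics_imported0.FileRange{Start: 0x0000, End: 0x1000}, info)
	s.InsertRange(__generics_imported0.FileRange{Start: 0x3000, End: 0x5000}, info)
	total = s.Span() // 0x1000 + 0x2000 = 0x3000
	// SpanRange counts only the parts of segments inside the given range.
	windowed = s.SpanRange(__generics_imported0.FileRange{Start: 0x0800, End: 0x4000}) // 0x800 + 0x1000 = 0x1800
	return total, windowed
}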
func (s *memAcctSet) LastGap() memAcctGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return memAcctGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *memAcctSet) Find(key uint64) (memAcctIterator, memAcctGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return memAcctIterator{n, i}, memAcctGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return memAcctIterator{}, memAcctGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *memAcctSet) FindSegment(key uint64) memAcctIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *memAcctSet) LowerBoundSegment(min uint64) memAcctIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *memAcctSet) UpperBoundSegment(max uint64) memAcctIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *memAcctSet) FindGap(key uint64) memAcctGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *memAcctSet) LowerBoundGap(min uint64) memAcctGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *memAcctSet) UpperBoundGap(max uint64) memAcctGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *memAcctSet) FirstLargeEnoughGap(minSize uint64) memAcctGapIterator { if memAccttrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. 
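// Illustrative sketch, not part of the generated template: the "find a large
// enough gap, then claim its front" pattern that gap tracking (memAccttrackGaps
// above is 1) makes cheap. The function name is hypothetical; it assumes it
// sits in package pgalloc alongside this generated code, that size > 0, and it
// uses Insert, which is defined further down in this file.
func claimFirstFreeRange(s *memAcctSet, size uint64, val memAcctInfo) (__generics_imported0.FileRange, bool) {
	gap := s.FirstLargeEnoughGap(size)
	if !gap.Ok() {
		return __generics_imported0.FileRange{}, false // no gap of at least size bytes
	}
	// Claim the first size bytes of the gap. Insert may coalesce the new
	// segment with its neighbors, so return the resulting segment's range.
	r := __generics_imported0.FileRange{Start: gap.Start(), End: gap.Start() + size}
	seg := s.Insert(gap, r, val)
	return seg.Range(), true
}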
func (s *memAcctSet) LastLargeEnoughGap(minSize uint64) memAcctGapIterator { if memAccttrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *memAcctSet) LowerBoundLargeEnoughGap(min, minSize uint64) memAcctGapIterator { if memAccttrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *memAcctSet) UpperBoundLargeEnoughGap(max, minSize uint64) memAcctGapIterator { if memAccttrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. 
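// Illustrative sketch, not part of the generated template: contrasts Insert,
// which may coalesce the new segment with adjacent segments as described
// above, with InsertWithoutMerging, which never does. Whether the two adjacent
// ranges below end up as one segment or two under Insert depends on how
// memAcctSetFunctions.Merge treats their values. The function name is
// hypothetical; it assumes package pgalloc and uses memAcctInfo's zero value
// purely for illustration.
func countAfterAdjacentInserts(withMerging bool) int {
	var s memAcctSet
	var val memAcctInfo
	r1 := __generics_imported0.FileRange{Start: 0x0000, End: 0x1000}
	r2 := __generics_imported0.FileRange{Start: 0x1000, End: 0x2000} // adjacent to r1
	if withMerging {
		s.Insert(s.FindGap(r1.Start), r1, val)
		s.Insert(s.FindGap(r2.Start), r2, val)
	} else {
		s.InsertWithoutMerging(s.FindGap(r1.Start), r1, val)
		s.InsertWithoutMerging(s.FindGap(r2.Start), r2, val)
	}
	n := 0
	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
		n++
	}
	return n
}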
func (s *memAcctSet) Insert(gap memAcctGapIterator, r __generics_imported0.FileRange, val memAcctInfo) memAcctIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (memAcctSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := memAccttrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (memAcctSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (memAcctSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := memAccttrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *memAcctSet) InsertWithoutMerging(gap memAcctGapIterator, r __generics_imported0.FileRange, val memAcctInfo) memAcctIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *memAcctSet) InsertWithoutMergingUnchecked(gap memAcctGapIterator, r __generics_imported0.FileRange, val memAcctInfo) memAcctIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := memAccttrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return memAcctIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. 
// // InsertRange searches the set to find the gap to insert into. If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *memAcctSet) InsertRange(r __generics_imported0.FileRange, val memAcctInfo) memAcctIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *memAcctSet) InsertWithoutMergingRange(r __generics_imported0.FileRange, val memAcctInfo) memAcctIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *memAcctSet) TryInsertRange(r __generics_imported0.FileRange, val memAcctInfo) memAcctIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return memAcctIterator{} } if gap.End() < r.End { return memAcctIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. 
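// Illustrative sketch, not part of the generated template: TryInsertRange
// above is the non-panicking variant of InsertRange, returning a terminal
// iterator when the requested range overlaps an existing segment. The function
// name is hypothetical; it assumes package pgalloc and uses memAcctInfo's zero
// value purely for illustration.
func tryClaim(s *memAcctSet, r __generics_imported0.FileRange) bool {
	var val memAcctInfo
	seg := s.TryInsertRange(r, val)
	return seg.Ok() // false if r overlapped an existing segment
}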
func (s *memAcctSet) TryInsertWithoutMergingRange(r __generics_imported0.FileRange, val memAcctInfo) memAcctIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return memAcctIterator{} } if gap.End() < r.End { return memAcctIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *memAcctSet) Remove(seg memAcctIterator) memAcctGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if memAccttrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) memAcctSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if memAccttrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(memAcctGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *memAcctSet) RemoveAll() { s.root = memAcctnode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *memAcctSet) RemoveRange(r __generics_imported0.FileRange) memAcctGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *memAcctSet) RemoveFullRange(r __generics_imported0.FileRange) memAcctGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *memAcctSet) Merge(first, second memAcctIterator) memAcctIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. 
Otherwise, MergeUnchecked returns a terminal // iterator. // // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *memAcctSet) MergeUnchecked(first, second memAcctIterator) memAcctIterator { if first.End() == second.Start() { if mval, ok := (memAcctSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return memAcctIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *memAcctSet) MergePrev(seg memAcctIterator) memAcctIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *memAcctSet) MergeNext(seg memAcctIterator) memAcctIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *memAcctSet) Unisolate(seg memAcctIterator) memAcctIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *memAcctSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. 
// // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *memAcctSet) MergeInsideRange(r __generics_imported0.FileRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *memAcctSet) MergeOutsideRange(r __generics_imported0.FileRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *memAcctSet) Split(seg memAcctIterator, split uint64) (memAcctIterator, memAcctIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *memAcctSet) SplitUnchecked(seg memAcctIterator, split uint64) (memAcctIterator, memAcctIterator) { val1, val2 := (memAcctSetFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.FileRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first. 
// // Preconditions: start < seg.End(). func (s *memAcctSet) SplitBefore(seg memAcctIterator, start uint64) memAcctIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *memAcctSet) SplitAfter(seg memAcctIterator, end uint64) memAcctIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *memAcctSet) Isolate(seg memAcctIterator, r __generics_imported0.FileRange) memAcctIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *memAcctSet) LowerBoundSegmentSplitBefore(min uint64) memAcctIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *memAcctSet) UpperBoundSegmentSplitAfter(max uint64) memAcctIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. 
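// Illustrative sketch, not part of the generated template: the
// split/mutate/merge loop that the SplitBefore, SplitAfter and MergePrev
// comments describe, applied to every segment intersecting r; MutateRange,
// defined below, packages up the same pattern. The function and callback names
// are hypothetical; it assumes it sits in package pgalloc alongside this
// generated code.
func mutateSegmentsIn(s *memAcctSet, r __generics_imported0.FileRange, update func(memAcctInfo) memAcctInfo) {
	// Bound the first segment so nothing before r.Start is touched.
	seg := s.LowerBoundSegmentSplitBefore(r.Start)
	for seg.Ok() && seg.Start() < r.End {
		// Bound each segment so nothing at or after r.End is touched.
		seg = s.SplitAfter(seg, r.End)
		seg.SetValue(update(seg.Value()))
		// Merge with the previously mutated predecessor; merging with the
		// not-yet-mutated successor here could cause it to be skipped.
		seg = s.MergePrev(seg)
		seg = seg.NextSegment()
	}
	if seg.Ok() {
		s.MergePrev(seg)
	}
}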
func (s *memAcctSet) VisitRange(r __generics_imported0.FileRange, f func(seg memAcctIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *memAcctSet) VisitFullRange(r __generics_imported0.FileRange, f func(seg memAcctIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *memAcctSet) MutateRange(r __generics_imported0.FileRange, f func(seg memAcctIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *memAcctSet) MutateFullRange(r __generics_imported0.FileRange, f func(seg memAcctIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type memAcctnode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *memAcctnode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. 
maxGap memAcctdynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [memAcctmaxDegree - 1]__generics_imported0.FileRange values [memAcctmaxDegree - 1]memAcctInfo children [memAcctmaxDegree]*memAcctnode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *memAcctnode) firstSegment() memAcctIterator { for n.hasChildren { n = n.children[0] } return memAcctIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *memAcctnode) lastSegment() memAcctIterator { for n.hasChildren { n = n.children[n.nrSegments] } return memAcctIterator{n, n.nrSegments - 1} } func (n *memAcctnode) prevSibling() *memAcctnode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *memAcctnode) nextSibling() *memAcctnode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *memAcctnode) rebalanceBeforeInsert(gap memAcctGapIterator) memAcctGapIterator { if n.nrSegments < memAcctmaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &memAcctnode{ nrSegments: memAcctminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &memAcctnode{ nrSegments: memAcctminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:memAcctminDegree-1], n.keys[:memAcctminDegree-1]) copy(left.values[:memAcctminDegree-1], n.values[:memAcctminDegree-1]) copy(right.keys[:memAcctminDegree-1], n.keys[memAcctminDegree:]) copy(right.values[:memAcctminDegree-1], n.values[memAcctminDegree:]) n.keys[0], n.values[0] = n.keys[memAcctminDegree-1], n.values[memAcctminDegree-1] memAcctzeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:memAcctminDegree], n.children[:memAcctminDegree]) copy(right.children[:memAcctminDegree], n.children[memAcctminDegree:]) memAcctzeroNodeSlice(n.children[2:]) for i := 0; i < memAcctminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if memAccttrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < memAcctminDegree { return memAcctGapIterator{left, gap.index} } return memAcctGapIterator{right, gap.index - memAcctminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[memAcctminDegree-1], n.values[memAcctminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &memAcctnode{ nrSegments: memAcctminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ 
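	// Copy the upper half of n's keys, values, and (if present) children into
	// the new right sibling, then shrink n to the lower half; the median
	// key/value has already been hoisted into the parent above.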
copy(sibling.keys[:memAcctminDegree-1], n.keys[memAcctminDegree:]) copy(sibling.values[:memAcctminDegree-1], n.values[memAcctminDegree:]) memAcctzeroValueSlice(n.values[memAcctminDegree-1:]) if n.hasChildren { copy(sibling.children[:memAcctminDegree], n.children[memAcctminDegree:]) memAcctzeroNodeSlice(n.children[memAcctminDegree:]) for i := 0; i < memAcctminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = memAcctminDegree - 1 if memAccttrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < memAcctminDegree { return gap } return memAcctGapIterator{sibling, gap.index - memAcctminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *memAcctnode) rebalanceAfterRemove(gap memAcctGapIterator) memAcctGapIterator { for { if n.nrSegments >= memAcctminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= memAcctminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] memAcctSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if memAccttrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return memAcctGapIterator{n, 0} } if gap.node == n { return memAcctGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= memAcctminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) memAcctSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if memAccttrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return memAcctGapIterator{n, n.nrSegments} } return memAcctGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = 
left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return memAcctGapIterator{p, gap.index} } if gap.node == right { return memAcctGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *memAcctnode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = memAcctGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) memAcctSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if memAccttrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *memAcctnode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *memAcctnode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. 
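// (Informally: a leaf with nrSegments keys exposes nrSegments+1 gaps,
// including the leading and trailing gaps possibly shared with ancestor
// nodes.)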
func (n *memAcctnode) calculateMaxGapLeaf() uint64 { max := memAcctGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (memAcctGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. func (n *memAcctnode) calculateMaxGapInternal() uint64 { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *memAcctnode) searchFirstLargeEnoughGap(minSize uint64) memAcctGapIterator { if n.maxGap.Get() < minSize { return memAcctGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := memAcctGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *memAcctnode) searchLastLargeEnoughGap(minSize uint64) memAcctGapIterator { if n.maxGap.Get() < minSize { return memAcctGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := memAcctGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type memAcctIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *memAcctnode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg memAcctIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg memAcctIterator) Range() __generics_imported0.FileRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg memAcctIterator) Start() uint64 { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg memAcctIterator) End() uint64 { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. 
// - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg memAcctIterator) SetRangeUnchecked(r __generics_imported0.FileRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. func (seg memAcctIterator) SetRange(r __generics_imported0.FileRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg memAcctIterator) SetStartUnchecked(start uint64) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg memAcctIterator) SetStart(start uint64) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg memAcctIterator) SetEndUnchecked(end uint64) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg memAcctIterator) SetEnd(end uint64) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg memAcctIterator) Value() memAcctInfo { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg memAcctIterator) ValuePtr() *memAcctInfo { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. 
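//
// Note that SetValue does not attempt to merge the iterated segment with its
// neighbors; if merging is desired after a value change, it must be
// requested separately (e.g. via MergePrev/MergeNext).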
func (seg memAcctIterator) SetValue(val memAcctInfo) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. func (seg memAcctIterator) PrevSegment() memAcctIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return memAcctIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return memAcctIterator{} } return memAcctsegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg memAcctIterator) NextSegment() memAcctIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return memAcctIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return memAcctIterator{} } return memAcctsegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg memAcctIterator) PrevGap() memAcctGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return memAcctGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg memAcctIterator) NextGap() memAcctGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return memAcctGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg memAcctIterator) PrevNonEmpty() (memAcctIterator, memAcctGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, memAcctGapIterator{} } return memAcctIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg memAcctIterator) NextNonEmpty() (memAcctIterator, memAcctGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, memAcctGapIterator{} } return memAcctIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. 
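//
// An illustrative (non-authoritative) sketch of visiting every gap in a set:
//
//	var uncovered uint64
//	for gap := s.FirstGap(); gap.Ok(); gap = gap.NextGap() {
//		uncovered += gap.Range().Length()
//	}
//	fmt.Printf("%d bytes are not covered by any segment\n", uncovered)
//
// Empty gaps between adjacent segments contribute zero length to the total.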
type memAcctGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *memAcctnode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (gap memAcctGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap memAcctGapIterator) Range() __generics_imported0.FileRange { return __generics_imported0.FileRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap memAcctGapIterator) Start() uint64 { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return memAcctSetFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap memAcctGapIterator) End() uint64 { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return memAcctSetFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap memAcctGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap memAcctGapIterator) PrevSegment() memAcctIterator { return memAcctsegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap memAcctGapIterator) NextSegment() memAcctIterator { return memAcctsegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap memAcctGapIterator) PrevGap() memAcctGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return memAcctGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap memAcctGapIterator) NextGap() memAcctGapIterator { seg := gap.NextSegment() if !seg.Ok() { return memAcctGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap memAcctGapIterator) NextLargeEnoughGap(minSize uint64) memAcctGapIterator { if memAccttrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. 
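// Roughly: it ascends while the current subtree's maxGap is too small (or a
// leaf's gaps are exhausted), then scans forward through the node's remaining
// gaps or child subtrees, descending via searchFirstLargeEnoughGap; if
// nothing is found, it moves up to the parent and repeats.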
func (gap memAcctGapIterator) nextLargeEnoughGapHelper(minSize uint64) memAcctGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return memAcctGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap memAcctGapIterator) PrevLargeEnoughGap(minSize uint64) memAcctGapIterator { if memAccttrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap memAcctGapIterator) prevLargeEnoughGapHelper(minSize uint64) memAcctGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return memAcctGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func memAcctsegmentBeforePosition(n *memAcctnode, i int) memAcctIterator { for i == 0 { if n.parent == nil { return memAcctIterator{} } n, i = n.parent, n.parentIndex } return memAcctIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func memAcctsegmentAfterPosition(n *memAcctnode, i int) memAcctIterator { for i == n.nrSegments { if n.parent == nil { return memAcctIterator{} } n, i = n.parent, n.parentIndex } return memAcctIterator{n, i} } func memAcctzeroValueSlice(slice []memAcctInfo) { for i := range slice { memAcctSetFunctions{}.ClearValue(&slice[i]) } } func memAcctzeroNodeSlice(slice []*memAcctnode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. 
func (s *memAcctSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. func (n *memAcctnode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *memAcctnode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if memAccttrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type memAcctFlatSegment struct { Start uint64 End uint64 Value memAcctInfo } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *memAcctSet) ExportSlice() []memAcctFlatSegment { var fs []memAcctFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, memAcctFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *memAcctSet) ImportSlice(fs []memAcctFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := __generics_imported0.FileRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
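//
// An illustrative sketch, as it might appear in a hypothetical test (the
// check shown here is invented):
//
//	err := s.segmentTestCheck(2, func(i int, r __generics_imported0.FileRange, v memAcctInfo) error {
//		if v.memCgID == 0 {
//			return fmt.Errorf("segment %d (%v) is not accounted to any memory cgroup", i, r)
//		}
//		return nil
//	})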
func (s *memAcctSet) segmentTestCheck(expectedSegments int, segFunc func(int, __generics_imported0.FileRange, memAcctInfo) error) error { havePrev := false prev := uint64(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *memAcctSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *memAcctSet) saveRoot() []memAcctFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *memAcctSet) loadRoot(_ context.Context, fs []memAcctFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/memory_file_mutex.go000066400000000000000000000032551465435605700264560ustar00rootroot00000000000000package pgalloc import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type memoryFileMutex struct { mu sync.Mutex } var memoryFileprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var memoryFilelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type memoryFilelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *memoryFileMutex) Lock() { locking.AddGLock(memoryFileprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *memoryFileMutex) NestedLock(i memoryFilelockNameIndex) { locking.AddGLock(memoryFileprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *memoryFileMutex) Unlock() { locking.DelGLock(memoryFileprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *memoryFileMutex) NestedUnlock(i memoryFilelockNameIndex) { locking.DelGLock(memoryFileprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func memoryFileinitLockNames() {} func init() { memoryFileinitLockNames() memoryFileprefixIndex = locking.NewMutexClass(reflect.TypeOf(memoryFileMutex{}), memoryFilelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/pgalloc.go000066400000000000000000001766071465435605700243620ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package pgalloc contains the page allocator subsystem, which provides // allocatable memory that may be mapped into application address spaces. package pgalloc import ( "fmt" "math" "os" "strings" "sync/atomic" "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostmm" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" ) const pagesPerHugePage = hostarch.HugePageSize / hostarch.PageSize // MemoryFile is a memmap.File whose pages may be allocated to arbitrary // users. type MemoryFile struct { memmap.NoBufferedIOFallback // MemoryFile owns a single backing file. Each page in the backing file is // considered "committed" or "uncommitted". A page is committed if the host // kernel is spending resources to store its contents and uncommitted // otherwise. This definition includes pages that the host kernel has // swapped. This is intentional; it means that committed pages can only // become uncommitted as a result of MemoryFile's actions, such that page // commitment does not change even if host kernel swapping behavior changes. // // Each page in the MemoryFile is in one of the following logical states, // protected by mu: // // - Void: Pages beyond the backing file's current size cannot store data. // Void pages are uncommitted. Extending the file's size transitions pages // between the old and new sizes from void to free. // // - Free: Free pages are immediately allocatable. Free pages are // uncommitted, and implicitly zeroed. Free pages become used when they are // allocated. // // - Used: Used pages have been allocated and currently have a non-zero // reference count. Used pages may transition from uncommitted to committed // outside of MemoryFile's control, but can only transition from committed // to uncommitted via MemoryFile.Decommit(). The content of used pages is // unknown. Used pages become waste when their reference count becomes // zero. // // - Waste: Waste pages have no users, but cannot be immediately // reallocated since their commitment state and content is unknown. Waste // pages may be uncommitted or committed, but cannot transition between the // two. MemoryFile's releaser goroutine transitions pages from waste to // releasing. Allocations that may return committed pages can transition // pages from waste to used (referred to as "recycling"). // // - Releasing: Releasing pages are waste pages that the releaser goroutine // has removed from waste-tracking, making them ineligible for recycling. // The releaser decommits releasing pages without holding mu, then // transitions them back to free or sub-released with mu locked. 
// // - Sub-release: Sub-released pages are released small pages within a // huge-page-backed allocation where the containing huge page as a whole // has not yet been released, which can arise because references are still // counted at page granularity within huge-page-backed ranges. Sub-released // pages cannot be used for allocations until release of the whole // containing huge page causes it to transition it to free. We assume that // sub-released pages are uncommitted; this isn't necessarily true (see // discussion of khugepaged elsewhere in this file), but the assumption is // consistent with legacy behavior. mu memoryFileMutex // unwasteSmall and unwasteHuge track waste ranges backed by small/huge pages // respectively. Both sets are "inverted"; segments exist for all ranges that // are *not* waste, allowing use of segment.Set gap-tracking to efficiently // find ranges for both release and recycling allocations. // // unwasteSmall and unwasteHuge are protected by mu. unwasteSmall unwasteSet unwasteHuge unwasteSet // haveWaste is true if there may be at least one waste page in the // MemoryFile. // // haveWaste is protected by mu. haveWaste bool // releaseCond is signaled (with mu locked) when haveWaste or destroyed // transitions from false to true. releaseCond sync.Cond // unfreeSmall and unfreeHuge track information for non-free ranges backed // by small/huge pages respectively. Each unfreeSet also contains segments // representing chunks that are backed by a different page size. Gaps in // the sets therefore represent free ranges backed by small/huge pages, // allowing use of segment.Set gap-tracking to efficiently find free ranges // for allocation. // // unfreeSmall and unfreeHuge are protected by mu. unfreeSmall unfreeSet unfreeHuge unfreeSet // subreleased maps hugepage-aligned file offsets to the number of // sub-released small pages within the hugepage beginning at that offset. // subreleased is protected by mu. subreleased map[uint64]uint64 // These fields are used for memory accounting. // // Memory accounting is based on identifying the set of committed pages. // Since we do not have direct access to application page tables (on most // platforms), tracking application accesses to uncommitted pages to detect // commitment would introduce additional page faults, which would be // prohibitively expensive. Instead, we query the host kernel to determine // which pages are committed. // // memAcct tracks memory accounting state, including commitment status, for // each page. Non-empty gaps in memAcct represent pages known to be // uncommitted (void, free, and sub-released pages). // // knownCommittedBytes is the number of bytes in the file known to be // committed, i.e. the span of all segments in memAcct for which // knownCommitted is true. // // commitSeq is a sequence counter used to detect races between scans for // committed pages and concurrent decommitment. // // nextCommitScan is the next time at which UpdateUsage() may scan the // backing file for commitment information. // // All of these fields are protected by mu. memAcct memAcctSet knownCommittedBytes uint64 commitSeq uint64 nextCommitScan time.Time // evictable maps EvictableMemoryUsers to eviction state. // // evictable is protected by mu. evictable map[EvictableMemoryUser]*evictableMemoryUserInfo // evictionWG counts the number of goroutines currently performing evictions. evictionWG sync.WaitGroup // opts holds options passed to NewMemoryFile. opts is immutable. 
opts MemoryFileOpts // savable is true if this MemoryFile will be saved via SaveTo() during // the kernel's SaveTo operation. savable is protected by mu. savable bool // destroyed is set by Destroy to instruct the releaser goroutine to // release all MemoryFile resources and exit. destroyed is protected by mu. destroyed bool // stopNotifyPressure stops memory cgroup pressure level // notifications used to drive eviction. stopNotifyPressure is // immutable. stopNotifyPressure func() // file is the backing file. The file pointer is immutable. file *os.File // chunks holds metadata for each usable chunk in the backing file. // // chunks is at the end of MemoryFile in hopes of placing it on a relatively // quiet cache line, since MapInternal() is by far the hottest path through // pgalloc. // // chunks is protected by mu. chunks slices are immutable. chunks atomic.Pointer[[]chunkInfo] } const ( chunkShift = 30 chunkSize = 1 << chunkShift // 1 GB chunkMask = chunkSize - 1 maxChunks = math.MaxInt64 / chunkSize // because file size is int64 ) // chunkInfo is the value type of MemoryFile.chunks. // // +stateify savable type chunkInfo struct { // mapping is the start address of a mapping of the chunk. // // mapping is immutable. mapping uintptr `state:"nosave"` // huge is true if this chunk is expected to be hugepage-backed and false if // this chunk is expected to be smallpage-backed. // // huge is immutable. huge bool } func (f *MemoryFile) chunksLoad() []chunkInfo { return *f.chunks.Load() } // forEachChunk invokes fn on a sequence of chunks that collectively span all // bytes in fr. In each call, chunkFR is the subset of fr that falls within // chunk. If any call to f returns false, forEachChunk stops iteration and // returns. func (f *MemoryFile) forEachChunk(fr memmap.FileRange, fn func(chunk *chunkInfo, chunkFR memmap.FileRange) bool) { chunks := f.chunksLoad() chunkStart := fr.Start &^ chunkMask i := int(fr.Start / chunkSize) for chunkStart < fr.End { chunkEnd := chunkStart + chunkSize if !fn(&chunks[i], fr.Intersect(memmap.FileRange{chunkStart, chunkEnd})) { return } chunkStart = chunkEnd i++ } } // unwasteInfo is the value type of MemoryFile.unwasteSmall/Huge. // // +stateify savable type unwasteInfo struct{} // unfreeInfo is the value type of MemoryFile.unfreeSmall/Huge. // // +stateify savable type unfreeInfo struct { // refs is the per-page reference count. refs is non-zero for used pages, // and zero for void, waste, releasing, and sub-released pages, as well as // pages backed by a different page size. refs uint64 } // memAcctInfo is the value type of MemoryFile.memAcct. // // +stateify savable type memAcctInfo struct { // kind is the memory accounting type. kind is allocation-dependent for // used pages, and usage.System for void, waste, releasing, and // sub-released pages. kind usage.MemoryKind // memCgID is the memory cgroup ID to which represented pages are accounted. memCgID uint32 // knownCommitted is true if represented pages are definitely committed. // (If knownCommitted is false, represented pages may or may not be // committed; pages that are definitely not committed are represented by // gaps in MemoryFile.memAcct.) knownCommitted bool // If true, represented pages are waste or releasing pages. wasteOrReleasing bool // If knownCommitted is false, commitSeq was the value of // MemoryFile.commitSeq when knownCommitted last transitioned to false. // Otherwise, commitSeq is 0. 
commitSeq uint64 } // An EvictableMemoryUser represents a user of MemoryFile-allocated memory that // may be asked to deallocate that memory in the presence of memory pressure. type EvictableMemoryUser interface { // Evict requests that the EvictableMemoryUser deallocate memory used by // er, which was registered as evictable by a previous call to // MemoryFile.MarkEvictable. // // Evict is not required to deallocate memory. In particular, since pgalloc // must call Evict without holding locks to avoid circular lock ordering, // it is possible that the passed range has already been marked as // unevictable by a racing call to MemoryFile.MarkUnevictable. // Implementations of EvictableMemoryUser must detect such races and handle // them by making Evict have no effect on unevictable ranges. // // After a call to Evict, the MemoryFile will consider the evicted range // unevictable (i.e. it will not call Evict on the same range again) until // informed otherwise by a subsequent call to MarkEvictable. Evict(ctx context.Context, er EvictableRange) } // An EvictableRange represents a range of uint64 offsets in an // EvictableMemoryUser. // // In practice, most EvictableMemoryUsers will probably be implementations of // memmap.Mappable, and EvictableRange therefore corresponds to // memmap.MappableRange. However, this package cannot depend on the memmap // package, since doing so would create a circular dependency. // // type EvictableRange // evictableMemoryUserInfo is the value type of MemoryFile.evictable. type evictableMemoryUserInfo struct { // ranges tracks all evictable ranges for the given user. ranges evictableRangeSet // If evicting is true, there is a goroutine currently evicting all // evictable ranges for this user. evicting bool } // MemoryFileOpts provides options to NewMemoryFile. type MemoryFileOpts struct { // DelayedEviction controls the extent to which the MemoryFile may delay // eviction of evictable allocations. DelayedEviction DelayedEvictionType // If UseHostMemcgPressure is true, use host memory cgroup pressure level // notifications to determine when eviction is necessary. This option has // no effect unless DelayedEviction is DelayedEvictionEnabled. UseHostMemcgPressure bool // DecommitOnDestroy indicates whether the entire host file should be // decommitted on destruction. This is appropriate for host filesystem based // files that need to be explicitly cleaned up to release disk space. DecommitOnDestroy bool // If DisableIMAWorkAround is true, NewMemoryFile will not call // IMAWorkAroundForMemFile(). DisableIMAWorkAround bool // DiskBackedFile indicates that the MemoryFile is backed by a file on disk. DiskBackedFile bool // RestoreID is an opaque string used to reassociate the MemoryFile with its // replacement during restore. RestoreID string // If ExpectHugepages is true, MemoryFile will expect that the host will // attempt to back AllocOpts.Huge == true allocations with huge pages. If // ExpectHugepages is false, MemoryFile will expect that the host will back // all allocations with small pages. ExpectHugepages bool // If AdviseHugepage is true, MemoryFile will request that the host back // AllocOpts.Huge == true allocations with huge pages using MADV_HUGEPAGE. AdviseHugepage bool // If AdviseNoHugepage is true, MemoryFile will request that the host back // AllocOpts.Huge == false allocations with small pages using // MADV_NOHUGEPAGE. 
AdviseNoHugepage bool // If DisableMemoryAccounting is true, memory usage observed by the // MemoryFile will not be reported in usage.MemoryAccounting. DisableMemoryAccounting bool } // DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. type DelayedEvictionType uint8 const ( // DelayedEvictionDefault has unspecified behavior. DelayedEvictionDefault DelayedEvictionType = iota // DelayedEvictionDisabled requires that evictable allocations are evicted // as soon as possible. DelayedEvictionDisabled // DelayedEvictionEnabled requests that the MemoryFile delay eviction of // evictable allocations until doing so is considered necessary to avoid // performance degradation due to host memory pressure, or OOM kills. // // As of this writing, the behavior of DelayedEvictionEnabled depends on // whether or not MemoryFileOpts.UseHostMemcgPressure is enabled: // // - If UseHostMemcgPressure is true, evictions are delayed until memory // pressure is indicated. // // - Otherwise, evictions are only delayed until the releaser goroutine is // out of work (pages to release). DelayedEvictionEnabled // DelayedEvictionManual requires that evictable allocations are only // evicted when MemoryFile.StartEvictions() is called. This is extremely // dangerous outside of tests. DelayedEvictionManual ) // NewMemoryFile creates a MemoryFile backed by the given file. If // NewMemoryFile succeeds, ownership of file is transferred to the returned // MemoryFile. func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { switch opts.DelayedEviction { case DelayedEvictionDefault: opts.DelayedEviction = DelayedEvictionEnabled case DelayedEvictionDisabled, DelayedEvictionManual: opts.UseHostMemcgPressure = false case DelayedEvictionEnabled: // ok default: return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction) } // Truncate the file to 0 bytes first to ensure that it's empty. if err := file.Truncate(0); err != nil { return nil, err } f := &MemoryFile{ opts: opts, file: file, } f.initFields() if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure { stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() { f.mu.Lock() startedAny := f.startEvictionsLocked() f.mu.Unlock() if startedAny { log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure") } }, "low") if err != nil { return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err) } f.stopNotifyPressure = stop } go f.releaserMain() // S/R-SAFE: f.mu if !opts.DisableIMAWorkAround { IMAWorkAroundForMemFile(file.Fd()) } return f, nil } func (f *MemoryFile) initFields() { // Initially, all pages are void. fullFR := memmap.FileRange{0, math.MaxUint64} f.unwasteSmall.InsertRange(fullFR, unwasteInfo{}) f.unwasteHuge.InsertRange(fullFR, unwasteInfo{}) f.releaseCond.L = &f.mu f.unfreeSmall.InsertRange(fullFR, unfreeInfo{}) f.unfreeHuge.InsertRange(fullFR, unfreeInfo{}) f.subreleased = make(map[uint64]uint64) f.evictable = make(map[EvictableMemoryUser]*evictableMemoryUserInfo) chunks := []chunkInfo(nil) f.chunks.Store(&chunks) } // IMAWorkAroundForMemFile works around IMA by immediately creating a temporary // PROT_EXEC mapping, while the backing file is still small. IMA will ignore // any future mappings. // // The Linux kernel contains an optional feature called "Integrity // Measurement Architecture" (IMA). If IMA is enabled, it will checksum // binaries the first time they are mapped PROT_EXEC. 
This is bad news for // executable pages mapped from our backing file, which can grow to // terabytes in (sparse) size. If IMA attempts to checksum a file that // large, it will allocate all of the sparse pages and quickly exhaust all // memory. func IMAWorkAroundForMemFile(fd uintptr) { m, _, errno := unix.Syscall6( unix.SYS_MMAP, 0, hostarch.PageSize, unix.PROT_EXEC, unix.MAP_SHARED, fd, 0) if errno != 0 { // This isn't fatal (IMA may not even be in use). Log the error, but // don't return it. log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno) } else { if _, _, errno := unix.Syscall( unix.SYS_MUNMAP, m, hostarch.PageSize, 0); errno != 0 { panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno)) } } } // Destroy releases all resources used by f. // // Preconditions: All pages allocated by f have been freed. // // Postconditions: None of f's methods may be called after Destroy. func (f *MemoryFile) Destroy() { f.mu.Lock() defer f.mu.Unlock() f.destroyed = true f.releaseCond.Signal() } // Preconditions: f.mu must be locked. func (f *MemoryFile) releaserDestroyLocked() { if !f.destroyed { panic("destroyed is no longer set") } if f.opts.DecommitOnDestroy { if chunks := f.chunksLoad(); len(chunks) != 0 { if err := f.decommitFile(memmap.FileRange{0, uint64(len(chunks)) * chunkSize}); err != nil { panic(fmt.Sprintf("failed to decommit entire memory file during destruction: %v", err)) } } } f.file.Close() // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd // that has possibly been reassigned. f.file = nil chunks := f.chunksLoad() for i := range chunks { chunk := &chunks[i] _, _, errno := unix.Syscall(unix.SYS_MUNMAP, chunk.mapping, chunkSize, 0) if errno != 0 { log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", chunk.mapping, i, errno) } chunk.mapping = 0 } } // AllocOpts are options used in MemoryFile.Allocate. type AllocOpts struct { // Kind is the allocation's memory accounting type. Kind usage.MemoryKind // MemCgID is the memory cgroup ID and the zero value indicates that // the memory will not be accounted to any cgroup. MemCgID uint32 // Mode controls the commitment status of returned pages. Mode AllocationMode // If Huge is true, the allocation should be hugepage-backed if possible. Huge bool // Dir indicates the direction in which offsets are allocated. Dir Direction // If ReaderFunc is provided, the allocated memory is filled by calling it // repeatedly until either length bytes are read or a non-nil error is // returned. It returns the allocated memory, truncated down to the nearest // page. If this is shorter than length bytes due to an error returned by // ReaderFunc, it returns the partially filled fr and error. ReaderFunc safemem.ReaderFunc } // Direction is the type of AllocOpts.Dir. type Direction uint8 const ( // BottomUp allocates offsets in increasing offsets. BottomUp Direction = iota // TopDown allocates offsets in decreasing offsets. TopDown ) // String implements fmt.Stringer. func (d Direction) String() string { switch d { case BottomUp: return "up" case TopDown: return "down" } panic(fmt.Sprintf("invalid direction: %d", d)) } // AllocationMode is the type of AllocOpts.Mode. type AllocationMode int const ( // AllocateUncommitted indicates that MemoryFile.Allocate() must return // uncommitted pages. 
AllocateUncommitted AllocationMode = iota // AllocateCallerIndirectCommit indicates that the caller of // MemoryFile.Allocate() intends to commit all allocated pages, without // using our page tables. Thus, Allocate() may return committed or // uncommitted pages. AllocateCallerIndirectCommit // AllocateAndCommit indicates that MemoryFile.Allocate() must return // committed pages. AllocateAndCommit // AllocateAndWritePopulate indicates that the caller of // MemoryFile.Allocate() intends to commit all allocated pages, using our // page tables. Thus, Allocate() may return committed or uncommitted pages, // and should pre-populate page table entries permitting writing for // mappings of those pages returned by MapInternal(). AllocateAndWritePopulate ) // allocState holds the state of a call to MemoryFile.Allocate(). type allocState struct { length uint64 opts AllocOpts willCommit bool // either us or our caller recycled bool huge bool } // Allocate returns a range of initially-zeroed pages of the given length, with // a single reference on each page held by the caller. When the last reference // on an allocated page is released, ownership of the page is returned to the // MemoryFile, allowing it to be returned by a future call to Allocate. // // Preconditions: // - length > 0. // - length must be page-aligned. // - If opts.Hugepage == true, length must be hugepage-aligned. func (f *MemoryFile) Allocate(length uint64, opts AllocOpts) (memmap.FileRange, error) { if length == 0 || !hostarch.IsPageAligned(length) || (opts.Huge && !hostarch.IsHugePageAligned(length)) { panic(fmt.Sprintf("invalid allocation length: %#x", length)) } alloc := allocState{ length: length, opts: opts, willCommit: opts.Mode != AllocateUncommitted, huge: opts.Huge && f.opts.ExpectHugepages, } fr, err := f.findAllocatableAndMarkUsed(&alloc) if err != nil { return fr, err } var dsts safemem.BlockSeq if alloc.willCommit { needHugeTouch := false if alloc.recycled { // We will need writable page table entries in our address space to // zero these pages. alloc.opts.Mode = AllocateAndWritePopulate } else if alloc.opts.Mode != AllocateAndWritePopulate && ((alloc.huge && f.opts.AdviseHugepage) || (!alloc.huge && f.opts.AdviseNoHugepage)) { // If Mode is AllocateCallerIndirectCommit and we do nothing, the // first access to the allocation may be by the application, // through a platform.AddressSpace, which may not have // MADV_HUGEPAGE (=> vma flag VM_HUGEPAGE) set. Consequently, // shmem_fault() => shmem_get_folio_gfp() will commit a small page. // // If Mode is AllocateAndCommit and we do nothing, the first access // to the allocation is via fallocate(2), which has the same // problem: shmem_fallocate() => shmem_get_folio() => // shmem_get_folio_gfp(vma=NULL). // // khugepaged may eventually collapse the containing // hugepage-aligned region into a huge page when it scans our // mapping (khugepaged_scan_mm_slot() => khugepaged_scan_file()), // but this depends on khugepaged_max_ptes_none, and in addition to // the latency and overhead of doing so, this will incur another // round of page faults. // // If write-populating through our mappings succeeds, then it will // avoid this problem. Otherwise, we need to touch each huge page // through our mappings. // // An analogous problem applies if MADV_NOHUGEPAGE is required // rather than MADV_HUGEPAGE; MADV_NOHUGEPAGE is only enabled if // the file defaults to huge pages, so populating or touching // through our mappings is needed to ensure that the allocation is // small-page-backed. 
In this case, we only need to force // commitment of one small page per huge page to prevent future // page faults within the huge page from faulting a huge page, // though there's nothing we can do about khugepaged. alloc.opts.Mode = AllocateAndWritePopulate needHugeTouch = true } switch alloc.opts.Mode { case AllocateUncommitted, AllocateCallerIndirectCommit: // Nothing for us to do. case AllocateAndCommit: if err := f.commitFile(fr); err != nil { f.DecRef(fr) return memmap.FileRange{}, err } case AllocateAndWritePopulate: dsts, err = f.MapInternal(fr, hostarch.Write) if err != nil { f.DecRef(fr) return memmap.FileRange{}, err } if canPopulate() { rem := dsts for { if !tryPopulate(rem.Head()) { break } rem = rem.Tail() if rem.IsEmpty() { needHugeTouch = false break } } } if alloc.recycled { // The contents of recycled waste pages are initially unknown, so we // need to zero them. f.manuallyZero(fr) } else if needHugeTouch { // We only need to touch a single byte in each huge page. f.forEachMappingSlice(fr, func(bs []byte) { for i := 0; i < len(bs); i += hostarch.HugePageSize { bs[i] = 0 } }) } default: panic(fmt.Sprintf("unknown AllocOpts.Mode %d", alloc.opts.Mode)) } } if alloc.opts.ReaderFunc != nil { if dsts.IsEmpty() { dsts, err = f.MapInternal(fr, hostarch.Write) if err != nil { f.DecRef(fr) return memmap.FileRange{}, err } } n, err := safemem.ReadFullToBlocks(alloc.opts.ReaderFunc, dsts) un := uint64(hostarch.Addr(n).RoundDown()) if un < length { // Free unused memory and update fr to contain only the memory that is // still allocated. f.DecRef(memmap.FileRange{fr.Start + un, fr.End}) fr.End = fr.Start + un } if err != nil { return fr, err } } return fr, nil } func (f *MemoryFile) findAllocatableAndMarkUsed(alloc *allocState) (fr memmap.FileRange, err error) { unwaste := &f.unwasteSmall unfree := &f.unfreeSmall if alloc.huge { unwaste = &f.unwasteHuge unfree = &f.unfreeHuge } f.mu.Lock() defer f.mu.Unlock() if alloc.willCommit { // Try to recycle waste pages, since this avoids the overhead of // decommitting and then committing them again. var uwgap unwasteGapIterator if alloc.opts.Dir == BottomUp { uwgap = unwaste.FirstLargeEnoughGap(alloc.length) } else { uwgap = unwaste.LastLargeEnoughGap(alloc.length) } if uwgap.Ok() { alloc.recycled = true if alloc.opts.Dir == BottomUp { fr = memmap.FileRange{ Start: uwgap.Start(), End: uwgap.Start() + alloc.length, } } else { fr = memmap.FileRange{ Start: uwgap.End() - alloc.length, End: uwgap.End(), } } unwaste.Insert(uwgap, fr, unwasteInfo{}) // Update reference count for these pages from 0 to 1. unfree.MutateFullRange(fr, func(ufseg unfreeIterator) bool { uf := ufseg.ValuePtr() if uf.refs != 0 { panic(fmt.Sprintf("waste pages %v have unexpected refcount %d during recycling of %v\n%s", ufseg.Range(), uf.refs, fr, f.stringLocked())) } uf.refs = 1 return true }) // These pages should all be unknown-commitment or known-committed; // mark them unknown-commitment, for consistency with non-recycling // allocations (below). 
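// Illustrative helper (not part of the original file): the gap-placement
// arithmetic used by findAllocatableAndMarkUsed above, factored out for
// clarity. The Direction type name is an assumption inferred from the
// BottomUp value used above; a bottom-up allocation takes the lowest length
// bytes of a gap, and a top-down allocation takes the highest length bytes.
func exampleRangeFromGap(gapStart, gapEnd, length uint64, dir Direction) memmap.FileRange {
	if dir == BottomUp {
		return memmap.FileRange{Start: gapStart, End: gapStart + length}
	}
	return memmap.FileRange{Start: gapEnd - length, End: gapEnd}
}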
f.memAcct.MutateFullRange(fr, func(maseg memAcctIterator) bool { ma := maseg.ValuePtr() malen := maseg.Range().Length() if ma.knownCommitted { if ma.kind != usage.System { panic(fmt.Sprintf("waste pages %v have unexpected kind %v\n%s", maseg.Range(), ma.kind, f.stringLocked())) } ma.knownCommitted = false ma.commitSeq = 0 f.knownCommittedBytes -= malen if !f.opts.DisableMemoryAccounting { usage.MemoryAccounting.Dec(malen, usage.System, ma.memCgID) } } ma.kind = alloc.opts.Kind ma.memCgID = alloc.opts.MemCgID ma.wasteOrReleasing = false return true }) return } } // No suitable waste pages or we can't use them. retryFree: // Try to allocate free pages from existing chunks. var ufgap unfreeGapIterator if alloc.opts.Dir == BottomUp { ufgap = unfree.FirstLargeEnoughGap(alloc.length) } else { ufgap = unfree.LastLargeEnoughGap(alloc.length) } if !ufgap.Ok() { // Extend the file to create more chunks. err = f.extendChunksLocked(alloc) if err != nil { return } // Retry the allocation using new chunks. goto retryFree } if alloc.opts.Dir == BottomUp { fr = memmap.FileRange{ Start: ufgap.Start(), End: ufgap.Start() + alloc.length, } } else { fr = memmap.FileRange{ Start: ufgap.End() - alloc.length, End: ufgap.End(), } } unfree.Insert(ufgap, fr, unfreeInfo{refs: 1}) // These pages should all be known-decommitted; mark them // unknown-commitment, since they can be concurrently committed by the // allocation's users at any time until deallocation. // // If alloc.willCommit is true, we expect these pages to become committed // in the near future; mark them unknown-commitment anyway, since marking // them committed prematurely makes them more likely to be saved even if // zeroed, unless SaveOpts.ExcludeCommittedZeroPages is enabled. f.memAcct.InsertRange(fr, memAcctInfo{ kind: alloc.opts.Kind, memCgID: alloc.opts.MemCgID, knownCommitted: false, commitSeq: f.commitSeq, }) return } // Preconditions: f.mu must be locked. func (f *MemoryFile) extendChunksLocked(alloc *allocState) error { unfree := &f.unfreeSmall if alloc.huge { unfree = &f.unfreeHuge } oldChunks := f.chunksLoad() oldNrChunks := uint64(len(oldChunks)) oldFileSize := oldNrChunks * chunkSize // Determine how many chunks we need to satisfy alloc. tail := uint64(0) if oldNrChunks != 0 { if lastChunk := oldChunks[oldNrChunks-1]; lastChunk.huge == alloc.huge { // We can use free pages at the end of the current last chunk. if ufgap := unfree.FindGap(oldFileSize - 1); ufgap.Ok() { tail = ufgap.Range().Length() } } } incNrChunks := (alloc.length + chunkMask - tail) / chunkSize incFileSize := incNrChunks * chunkSize newNrChunks := oldNrChunks + incNrChunks if newNrChunks > maxChunks || newNrChunks < oldNrChunks /* overflow */ { return linuxerr.ENOMEM } newFileSize := newNrChunks * chunkSize // Extend the backing file and obtain mappings for the new chunks. If the // backing file is memory-backed, and THP is enabled, Linux will align our // mapping to a hugepage boundary; see // mm/shmem.c:shmem_get_unmapped_area(). // // In tests, f.file may be nil. var mapStart uintptr if f.file != nil { if err := f.file.Truncate(int64(newFileSize)); err != nil { return err } m, _, errno := unix.Syscall6( unix.SYS_MMAP, 0, uintptr(incFileSize), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, f.file.Fd(), uintptr(oldFileSize)) if errno != 0 { return errno } mapStart = m f.madviseChunkMapping(mapStart, uintptr(incFileSize), alloc.huge) } // Update chunk state. 
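// Illustrative sketch (not part of the original file): growing a
// memory-backed file and mapping only the newly appended region, mirroring
// the truncate-then-mmap-at-oldFileSize pattern in extendChunksLocked above.
// The memfd name and the sizes are assumptions for the example.
func exampleGrowAndMapTail() ([]byte, error) {
	fd, err := unix.MemfdCreate("pgalloc-example", 0)
	if err != nil {
		return nil, err
	}
	const oldSize, newSize = 1 << 20, 2 << 20
	if err := unix.Ftruncate(fd, newSize); err != nil {
		return nil, err
	}
	// Map just the appended [oldSize, newSize) region; mappings of earlier
	// regions, if any, remain valid and untouched.
	return unix.Mmap(fd, oldSize, newSize-oldSize, unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED)
}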
newChunks := make([]chunkInfo, newNrChunks, newNrChunks) copy(newChunks, oldChunks) m := mapStart for i := oldNrChunks; i < newNrChunks; i++ { newChunks[i].huge = alloc.huge if f.file != nil { newChunks[i].mapping = m m += chunkSize } } f.chunks.Store(&newChunks) // Mark void pages free. unfree.RemoveFullRange(memmap.FileRange{ Start: oldNrChunks * chunkSize, End: newNrChunks * chunkSize, }) return nil } func (f *MemoryFile) madviseChunkMapping(addr, len uintptr, huge bool) { if huge { if f.opts.AdviseHugepage { _, _, errno := unix.Syscall(unix.SYS_MADVISE, addr, len, unix.MADV_HUGEPAGE) if errno != 0 { // Log this failure but continue. log.Warningf("madvise(%#x, %d, MADV_HUGEPAGE) failed: %s", addr, len, errno) } } } else { if f.opts.AdviseNoHugepage { _, _, errno := unix.Syscall(unix.SYS_MADVISE, addr, len, unix.MADV_NOHUGEPAGE) if errno != 0 { // Log this failure but continue. log.Warningf("madvise(%#x, %d, MADV_NOHUGEPAGE) failed: %s", addr, len, errno) } } } } var mlockDisabled atomicbitops.Uint32 var madvPopulateWriteDisabled atomicbitops.Uint32 func canPopulate() bool { return mlockDisabled.Load() == 0 || madvPopulateWriteDisabled.Load() == 0 } func tryPopulateMadv(b safemem.Block) bool { if madvPopulateWriteDisabled.Load() != 0 { return false } // Only call madvise(MADV_POPULATE_WRITE) if >=2 pages are being populated. // 1 syscall overhead >= 1 page fault overhead. This is because syscalls are // susceptible to additional overheads like seccomp-bpf filters and auditing. if b.Len() <= hostarch.PageSize { return true } _, _, errno := unix.Syscall(unix.SYS_MADVISE, b.Addr(), uintptr(b.Len()), unix.MADV_POPULATE_WRITE) if errno != 0 { if errno == unix.EINVAL { // EINVAL is expected if MADV_POPULATE_WRITE is not supported (Linux <5.14). log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno) } else { log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: madvise failed: %s", errno) } madvPopulateWriteDisabled.Store(1) return false } return true } func tryPopulateMlock(b safemem.Block) bool { if mlockDisabled.Load() != 0 { return false } // Call mlock to populate pages, then munlock to cancel the mlock (but keep // the pages populated). Only do so for hugepage-aligned address ranges to // ensure that splitting the VMA in mlock doesn't split any existing // hugepages. This assumes that two host syscalls, plus the MM overhead of // mlock + munlock, is faster on average than trapping for // HugePageSize/PageSize small page faults. start, ok := hostarch.Addr(b.Addr()).HugeRoundUp() if !ok { return true } end := hostarch.Addr(b.Addr() + uintptr(b.Len())).HugeRoundDown() if start >= end { return true } _, _, errno := unix.Syscall(unix.SYS_MLOCK, uintptr(start), uintptr(end-start), 0) unix.RawSyscall(unix.SYS_MUNLOCK, uintptr(start), uintptr(end-start), 0) if errno != 0 { if errno == unix.ENOMEM || errno == unix.EPERM { // These errors are expected from hitting non-zero RLIMIT_MEMLOCK, or // hitting zero RLIMIT_MEMLOCK without CAP_IPC_LOCK, respectively. log.Infof("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno) } else { log.Warningf("Disabling pgalloc.MemoryFile.AllocateAndFill pre-population: mlock failed: %s", errno) } mlockDisabled.Store(1) return false } return true } func tryPopulate(b safemem.Block) bool { // There are two approaches for populating writable pages: // 1. madvise(MADV_POPULATE_WRITE). 
It has the desired effect: "Populate // (prefault) page tables writable, faulting in all pages in the range // just as if manually writing to each each page". // 2. Call mlock to populate pages, then munlock to cancel the mlock (but // keep the pages populated). // // Prefer the madvise(MADV_POPULATE_WRITE) approach because: // - Only requires 1 syscall, as opposed to 2 syscalls with mlock approach. // - It is faster because it doesn't have to modify vmas like mlock does. // - It works for disk-backed memory mappings too. The mlock approach doesn't // work for disk-backed filesystems (e.g. ext4). This is because // mlock(2) => mm/gup.c:__mm_populate() emulates a read fault on writable // MAP_SHARED mappings. For memory-backed (shmem) files, // mm/mmap.c:vma_set_page_prot() => vma_wants_writenotify() is false, so // the page table entries populated by a read fault are writable. For // disk-backed files, vma_set_page_prot() => vma_wants_writenotify() is // true, so the page table entries populated by a read fault are read-only. if tryPopulateMadv(b) { return true } return tryPopulateMlock(b) } // Decommit uncommits the given pages, causing them to become zeroed. // // Preconditions: // - fr.Start and fr.End must be page-aligned. // - fr.Length() > 0. // - At least one reference must be held on all pages in fr. func (f *MemoryFile) Decommit(fr memmap.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } f.decommitOrManuallyZero(fr) f.mu.Lock() defer f.mu.Unlock() f.memAcct.MutateFullRange(fr, func(maseg memAcctIterator) bool { ma := maseg.ValuePtr() if ma.knownCommitted { ma.knownCommitted = false malen := maseg.Range().Length() f.knownCommittedBytes -= malen if !f.opts.DisableMemoryAccounting { usage.MemoryAccounting.Dec(malen, ma.kind, ma.memCgID) } } // Update commitSeq to invalidate any observations made by // concurrent calls to f.updateUsageLocked(). ma.commitSeq = f.commitSeq return true }) } func (f *MemoryFile) commitFile(fr memmap.FileRange) error { // "The default operation (i.e., mode is zero) of fallocate() allocates the // disk space within the range specified by offset and len." - fallocate(2) return unix.Fallocate( int(f.file.Fd()), 0, // mode int64(fr.Start), int64(fr.Length())) } func (f *MemoryFile) decommitFile(fr memmap.FileRange) error { // "After a successful call, subsequent reads from this range will // return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with // FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2) return unix.Fallocate( int(f.file.Fd()), unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, int64(fr.Start), int64(fr.Length())) } func (f *MemoryFile) manuallyZero(fr memmap.FileRange) { f.forEachMappingSlice(fr, func(bs []byte) { clear(bs) }) } func (f *MemoryFile) decommitOrManuallyZero(fr memmap.FileRange) { if err := f.decommitFile(fr); err != nil { log.Warningf("Failed to decommit %v: %v", fr, err) // Zero the pages manually. This won't reduce memory usage, but at // least ensures that the pages will be zeroed when reallocated. f.manuallyZero(fr) } } // HasUniqueRef returns true if all pages in the given range have exactly one // reference. A return value of false is inherently racy, but if the caller // holds a reference on the given range and is preventing other goroutines from // copying it, then a return value of true is not racy. // // Preconditions: At least one reference must be held on all pages in fr. 
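// Illustrative sketch (not part of the original file) of the host primitives
// discussed above: MADV_POPULATE_WRITE prefaults writable page table entries
// for an existing mapping (as in tryPopulateMadv), while fallocate(2) commits
// or decommits backing pages of the file itself (as in commitFile and
// decommitFile). The mapping m, the fd, and the range are assumptions for
// the example.
func examplePopulateWritable(m []byte) error {
	// Fails with EINVAL on kernels before 5.14, which lack MADV_POPULATE_WRITE.
	return unix.Madvise(m, unix.MADV_POPULATE_WRITE)
}

func exampleCommitThenDecommit(fd int, off, length int64) error {
	// Mode 0 allocates (commits) backing for the range.
	if err := unix.Fallocate(fd, 0, off, length); err != nil {
		return err
	}
	// PUNCH_HOLE|KEEP_SIZE deallocates the range; subsequent reads return zeroes.
	return unix.Fallocate(fd, unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, off, length)
}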
func (f *MemoryFile) HasUniqueRef(fr memmap.FileRange) bool { hasUniqueRef := true f.mu.Lock() defer f.mu.Unlock() f.forEachChunk(fr, func(chunk *chunkInfo, chunkFR memmap.FileRange) bool { unfree := &f.unfreeSmall if chunk.huge { unfree = &f.unfreeHuge } unfree.VisitFullRange(fr, func(ufseg unfreeIterator) bool { if ufseg.ValuePtr().refs != 1 { hasUniqueRef = false return false } return true }) return hasUniqueRef }) return hasUniqueRef } // IncRef implements memmap.File.IncRef. func (f *MemoryFile) IncRef(fr memmap.FileRange, memCgID uint32) { if !fr.WellFormed() || fr.Length() == 0 || !hostarch.IsPageAligned(fr.Start) || !hostarch.IsPageAligned(fr.End) { panic(fmt.Sprintf("invalid range: %v", fr)) } f.mu.Lock() defer f.mu.Unlock() f.forEachChunk(fr, func(chunk *chunkInfo, chunkFR memmap.FileRange) bool { unfree := &f.unfreeSmall if chunk.huge { unfree = &f.unfreeHuge } unfree.MutateFullRange(chunkFR, func(ufseg unfreeIterator) bool { uf := ufseg.ValuePtr() if uf.refs <= 0 { panic(fmt.Sprintf("IncRef(%v) called with %d references on pages %v", fr, uf.refs, ufseg.Range())) } uf.refs++ return true }) return true }) } // DecRef implements memmap.File.DecRef. func (f *MemoryFile) DecRef(fr memmap.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || !hostarch.IsPageAligned(fr.Start) || !hostarch.IsPageAligned(fr.End) { panic(fmt.Sprintf("invalid range: %v", fr)) } f.mu.Lock() defer f.mu.Unlock() haveWaste := false f.forEachChunk(fr, func(chunk *chunkInfo, chunkFR memmap.FileRange) bool { unwaste := &f.unwasteSmall unfree := &f.unfreeSmall if chunk.huge { unwaste = &f.unwasteHuge unfree = &f.unfreeHuge } unfree.MutateFullRange(chunkFR, func(ufseg unfreeIterator) bool { uf := ufseg.ValuePtr() if uf.refs <= 0 { panic(fmt.Sprintf("DecRef(%v) called with %d references on pages %v", fr, uf.refs, ufseg.Range())) } uf.refs-- if uf.refs == 0 { // Mark these pages as waste. wasteFR := ufseg.Range() unwaste.RemoveFullRange(wasteFR) haveWaste = true // Reclassify waste memory as System until it's recycled or // released. f.memAcct.MutateFullRange(wasteFR, func(maseg memAcctIterator) bool { ma := maseg.ValuePtr() if !f.opts.DisableMemoryAccounting && ma.knownCommitted { usage.MemoryAccounting.Move(maseg.Range().Length(), usage.System, ma.kind, ma.memCgID) } ma.kind = usage.System ma.wasteOrReleasing = true return true }) } return true }) return true }) // Wake the releaser if we marked any pages as waste. Leave this until just // before unlocking f.mu. if haveWaste && !f.haveWaste { f.haveWaste = true f.releaseCond.Signal() } } // releaserMain implements the releaser goroutine. func (f *MemoryFile) releaserMain() { f.mu.Lock() MainLoop: for { for { if f.destroyed { f.releaserDestroyLocked() f.mu.Unlock() // This must be called without holding f.mu to avoid circular lock // ordering. if f.stopNotifyPressure != nil { f.stopNotifyPressure() } return } if f.haveWaste { break } if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure { // No work to do. Evict any pending evictable allocations to // get more waste pages before going to sleep. f.startEvictionsLocked() } f.releaseCond.Wait() // releases f.mu while waiting } // Huge pages are relatively rare and expensive due to fragmentation // and the cost of compaction. Fragmentation is expected to increase // over time. Most allocations are done upwards, with the main // exception being thread stacks. So we expect lower offsets to weakly // correlate with older allocations, which are more likely to actually // be hugepage-backed. 
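// Illustrative sketch (not part of the original file): the reference-count
// lifecycle of an allocation as seen through IncRef/DecRef above. mf and
// memCgID are assumptions for the example.
func exampleRefLifecycle(mf *MemoryFile, memCgID uint32) error {
	fr, err := mf.Allocate(hostarch.PageSize, AllocOpts{Kind: usage.Anonymous})
	if err != nil {
		return err
	}
	mf.IncRef(fr, memCgID) // a second user shares the pages: refs == 2
	mf.DecRef(fr)          // first user done: refs == 1
	// Dropping the last reference turns the pages into waste; the releaser
	// goroutine will later recycle or decommit them.
	mf.DecRef(fr)
	return nil
}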
Thus, release from unwasteSmall before // unwasteHuge, and higher offsets before lower ones. for i, unwaste := range []*unwasteSet{&f.unwasteSmall, &f.unwasteHuge} { if uwgap := unwaste.LastLargeEnoughGap(1); uwgap.Ok() { fr := uwgap.Range() // Linux serializes fallocate()s on shmem files, so limit the amount we // release at once to avoid starving Decommit(). const maxReleasingBytes = 128 << 20 // 128 MB if fr.Length() > maxReleasingBytes { fr.Start = fr.End - maxReleasingBytes } unwaste.Insert(uwgap, fr, unwasteInfo{}) f.releaseLocked(fr, i == 1) continue MainLoop } } f.haveWaste = false } } // Preconditions: f.mu must be locked; it may be unlocked and reacquired. func (f *MemoryFile) releaseLocked(fr memmap.FileRange, huge bool) { defer func() { maseg := f.memAcct.LowerBoundSegmentSplitBefore(fr.Start) for maseg.Ok() && maseg.Start() < fr.End { maseg = f.memAcct.SplitAfter(maseg, fr.End) ma := maseg.ValuePtr() if ma.kind != usage.System { panic(fmt.Sprintf("waste pages %v have unexpected kind %v\n%s", maseg.Range(), ma.kind, f.stringLocked())) } if ma.knownCommitted { malen := maseg.Range().Length() f.knownCommittedBytes -= malen if !f.opts.DisableMemoryAccounting { usage.MemoryAccounting.Dec(malen, ma.kind, ma.memCgID) } } maseg = f.memAcct.Remove(maseg).NextSegment() } }() if !huge { // Decommit the range being released, then mark the released range as // freed. f.mu.Unlock() f.decommitOrManuallyZero(fr) f.mu.Lock() f.unfreeSmall.RemoveFullRange(fr) return } // Handle huge pages and sub-release. firstHugeStart := hostarch.HugePageRoundDown(fr.Start) lastHugeStart := hostarch.HugePageRoundDown(fr.End - 1) firstHugeEnd := firstHugeStart + hostarch.HugePageSize lastHugeEnd := lastHugeStart + hostarch.HugePageSize if firstHugeStart == lastHugeStart { // All of fr falls within a single huge page. oldSubrel := f.subreleased[firstHugeStart] incSubrel := fr.Length() / hostarch.PageSize newSubrel := oldSubrel + incSubrel if newSubrel == pagesPerHugePage { // Free this huge page. // // When a small page within a hugepage-backed allocation is // individually deallocated (becomes waste), we decommit it to // reduce memory usage (and for consistency with legacy behavior). // This requires the host to split the containing huge page, if one // exists. khugepaged may later re-assemble the containing huge // page, implicitly re-committing previously-decommitted small // pages as a result. // // Thus: When a huge page is freed, ensure that the whole huge page // is decommitted rather than just the final small page(s), to // ensure that we leave behind an uncommitted hugepage-sized range // with no re-committed small pages. if oldSubrel != 0 { delete(f.subreleased, firstHugeStart) } hugeFR := memmap.FileRange{firstHugeStart, firstHugeEnd} f.mu.Unlock() f.decommitOrManuallyZero(hugeFR) f.mu.Lock() f.unfreeHuge.RemoveFullRange(hugeFR) } else { f.subreleased[firstHugeStart] = newSubrel f.mu.Unlock() f.decommitOrManuallyZero(fr) f.mu.Lock() } return } // fr spans at least two huge pages. Resolve sub-release in the first and // last huge pages; any huge pages in between are decommitted/freed in // full. 
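// Illustrative helper (not part of the original file): the per-huge-page
// sub-release bookkeeping performed by releaseLocked above, for a released
// range that lies entirely within one huge page. It returns true once every
// small page of the containing huge page has been released, i.e. the whole
// huge page can be decommitted and freed. The subreleased map mirrors
// f.subreleased.
func exampleSubrelease(subreleased map[uint64]uint64, fr memmap.FileRange) bool {
	hugeStart := hostarch.HugePageRoundDown(fr.Start)
	newCount := subreleased[hugeStart] + fr.Length()/hostarch.PageSize
	if newCount == pagesPerHugePage {
		delete(subreleased, hugeStart)
		return true
	}
	subreleased[hugeStart] = newCount
	return false
}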
var ( decommitFR memmap.FileRange freeFR memmap.FileRange ) if fr.Start == firstHugeStart { decommitFR.Start = firstHugeStart freeFR.Start = firstHugeStart } else { oldSubrel := f.subreleased[firstHugeStart] incSubrel := (firstHugeEnd - fr.Start) / hostarch.PageSize newSubrel := oldSubrel + incSubrel if newSubrel == pagesPerHugePage { if oldSubrel != 0 { delete(f.subreleased, firstHugeStart) } decommitFR.Start = firstHugeStart freeFR.Start = firstHugeStart } else { decommitFR.Start = fr.Start freeFR.Start = firstHugeEnd } } if fr.End == lastHugeEnd { decommitFR.End = lastHugeEnd freeFR.End = lastHugeEnd } else { oldSubrel := f.subreleased[lastHugeStart] incSubrel := (fr.End - lastHugeStart) / hostarch.PageSize newSubrel := oldSubrel + incSubrel if newSubrel == pagesPerHugePage { if oldSubrel != 0 { delete(f.subreleased, lastHugeStart) } decommitFR.End = lastHugeEnd freeFR.End = lastHugeEnd } else { decommitFR.End = fr.End freeFR.End = lastHugeStart } } f.mu.Unlock() f.decommitOrManuallyZero(decommitFR) f.mu.Lock() if freeFR.Length() != 0 { f.unfreeHuge.RemoveFullRange(freeFR) } } // MapInternal implements memmap.File.MapInternal. func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { if !fr.WellFormed() || fr.Length() == 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } if at.Execute { return safemem.BlockSeq{}, linuxerr.EACCES } chunks := ((fr.End + chunkMask) / chunkSize) - (fr.Start / chunkSize) if chunks == 1 { // Avoid an unnecessary slice allocation. var seq safemem.BlockSeq f.forEachMappingSlice(fr, func(bs []byte) { seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs)) }) return seq, nil } blocks := make([]safemem.Block, 0, chunks) f.forEachMappingSlice(fr, func(bs []byte) { blocks = append(blocks, safemem.BlockFromSafeSlice(bs)) }) return safemem.BlockSeqFromSlice(blocks), nil } // forEachMappingSlice invokes fn on a sequence of byte slices that // collectively map all bytes in fr. func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) { f.forEachChunk(fr, func(chunk *chunkInfo, chunkFR memmap.FileRange) bool { fn(chunk.sliceAt(chunkFR)) return true }) } // MarkEvictable allows f to request memory deallocation by calling // user.Evict(er) in the future. // // Redundantly marking an already-evictable range as evictable has no effect. func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) { f.mu.Lock() defer f.mu.Unlock() info, ok := f.evictable[user] if !ok { info = &evictableMemoryUserInfo{} f.evictable[user] = info } gap := info.ranges.LowerBoundGap(er.Start) for gap.Ok() && gap.Start() < er.End { gapER := gap.Range().Intersect(er) if gapER.Length() == 0 { gap = gap.NextGap() continue } gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap() } if !info.evicting { switch f.opts.DelayedEviction { case DelayedEvictionDisabled: // Kick off eviction immediately. f.startEvictionGoroutineLocked(user, info) case DelayedEvictionEnabled: if !f.opts.UseHostMemcgPressure { // Ensure that the releaser goroutine is running, so that it // can start eviction when necessary. f.releaseCond.Signal() } } } } // MarkUnevictable informs f that user no longer considers er to be evictable, // so the MemoryFile should no longer call user.Evict(er). Note that, per // EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be // called even after MarkUnevictable returns due to race conditions, and // implementations of EvictableMemoryUser must handle this possibility. 
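// Illustrative sketch (not part of the original file): walking the BlockSeq
// returned by MapInternal above with the same Head/Tail/IsEmpty pattern used
// in Allocate, here just to confirm that the internal mappings cover the
// whole range. mf and fr are assumptions for the example.
func exampleMappedBytes(mf *MemoryFile, fr memmap.FileRange) (uint64, error) {
	bs, err := mf.MapInternal(fr, hostarch.Write)
	if err != nil {
		return 0, err
	}
	var total uint64
	for !bs.IsEmpty() {
		total += uint64(bs.Head().Len()) // each block is one contiguous slice of fr
		bs = bs.Tail()
	}
	return total, nil // expected to equal fr.Length()
}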
// // Redundantly marking an already-unevictable range as unevictable has no // effect. func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) { f.mu.Lock() defer f.mu.Unlock() info, ok := f.evictable[user] if !ok { return } info.ranges.RemoveRange(er) // We can only remove info if there's no eviction goroutine running on its // behalf. if !info.evicting && info.ranges.IsEmpty() { delete(f.evictable, user) } } // MarkAllUnevictable informs f that user no longer considers any offsets to be // evictable. It otherwise has the same semantics as MarkUnevictable. func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) { f.mu.Lock() defer f.mu.Unlock() info, ok := f.evictable[user] if !ok { return } info.ranges.RemoveAll() // We can only remove info if there's no eviction goroutine running on its // behalf. if !info.evicting { delete(f.evictable, user) } } // ShouldCacheEvictable returns true if f is meaningfully delaying evictions of // evictable memory, such that it may be advantageous to cache data in // evictable memory. The value returned by ShouldCacheEvictable may change // between calls. func (f *MemoryFile) ShouldCacheEvictable() bool { return f.opts.DelayedEviction == DelayedEvictionManual || f.opts.UseHostMemcgPressure } // UpdateUsage ensures that the memory usage statistics in // usage.MemoryAccounting are up to date. If memCgIDs is nil, all the pages // will be scanned. Else only the pages which belong to the memory cgroup ids // in memCgIDs will be scanned and the memory usage will be updated. func (f *MemoryFile) UpdateUsage(memCgIDs map[uint32]struct{}) error { // If we already know of every committed page, skip scanning. currentUsage, err := f.TotalUsage() if err != nil { return err } f.mu.Lock() defer f.mu.Unlock() if currentUsage == f.knownCommittedBytes { return nil } // Linux updates usage values at CONFIG_HZ; throttle our scans to the same // frequency. startTime := time.Now() if startTime.Before(f.nextCommitScan) { return nil } if memCgIDs == nil { f.nextCommitScan = startTime.Add(time.Second / linux.CLOCKS_PER_SEC) } err = f.updateUsageLocked(memCgIDs, false /* alsoScanCommitted */, mincore) if log.IsLogging(log.Debug) { log.Debugf("UpdateUsage: took %v, currentUsage=%d knownCommittedBytes=%d", time.Since(startTime), currentUsage, f.knownCommittedBytes) } return err } // updateUsageLocked attempts to detect commitment of previously-uncommitted // pages by invoking checkCommitted, and updates memory accounting to reflect // newly-committed pages. If alsoScanCommitted is true, updateUsageLocked also // attempts to detect decommitment of previously-committed pages; this is only // used by save/restore, which optionally temporarily treats zeroed pages as // decommitted in order to skip saving them. // // For each page i in bs, checkCommitted must set committed[i] to 1 if the page // is committed and 0 otherwise. off is the offset at which bs begins. // wasCommitted is true if the page was known-committed before the call to // checkCommitted and false otherwise; wasCommitted can only be true if // alsoScanCommitted is true. // // Precondition: f.mu must be held; it may be unlocked and reacquired. // +checklocks:f.mu func (f *MemoryFile) updateUsageLocked(memCgIDs map[uint32]struct{}, alsoScanCommitted bool, checkCommitted func(bs []byte, committed []byte, off uint64, wasCommitted bool) error) error { // Track if anything changed to elide the merge. 
changedAny := false defer func() { if changedAny { f.memAcct.MergeAll() } }() // Reused mincore buffer. var buf []byte maseg := f.memAcct.FirstSegment() unscannedStart := uint64(0) for maseg.Ok() { ma := maseg.ValuePtr() if ma.wasteOrReleasing { // Skip scanning of waste and releasing pages. This isn't // necessarily correct, since !knownCommitted may have become // committed after the last call to updateUsageLocked(), then // transitioned from used to waste. However, this is consistent // with legacy behavior. maseg = maseg.NextSegment() continue } wasCommitted := ma.knownCommitted if !alsoScanCommitted && wasCommitted { maseg = maseg.NextSegment() continue } // Scan the pages of the given memCgID only. This will avoid scanning // the whole memory file when the memory usage is required only for a // specific cgroup. The total memory usage of all cgroups can be // obtained when memCgIDs is nil. if memCgIDs != nil { if _, ok := memCgIDs[ma.memCgID]; !ok { maseg = maseg.NextSegment() continue } } fr := maseg.Range() if fr.Start < unscannedStart { fr.Start = unscannedStart } var checkErr error f.forEachChunk(fr, func(chunk *chunkInfo, chunkFR memmap.FileRange) bool { s := chunk.sliceAt(chunkFR) // Ensure that we have sufficient buffer for the call (one byte per // page). The length of s must be page-aligned. bufLen := len(s) / hostarch.PageSize if len(buf) < bufLen { buf = make([]byte, bufLen) } // Query for new pages in core. // NOTE(b/165896008): mincore (which is passed as checkCommitted by // f.UpdateUsage()) might take a really long time. So unlock f.mu while // checkCommitted runs. lastCommitSeq := f.commitSeq f.commitSeq++ f.mu.Unlock() // +checklocksforce err := checkCommitted(s, buf, chunkFR.Start, wasCommitted) f.mu.Lock() if err != nil { checkErr = err return false } // Reconcile internal state with buf. Since we temporarily dropped // f.mu, f.memAcct may have changed, and maseg/ma are no longer // valid. If wasCommitted is false, then we are marking ranges that // are now committed; otherwise, we are marking ranges that are now // uncommitted. unchangedVal := byte(0) if wasCommitted { unchangedVal = 1 } maseg = f.memAcct.LowerBoundSegment(chunkFR.Start) for i := 0; i < bufLen; { if buf[i]&0x1 == unchangedVal { i++ continue } // Scan to the end of this changed range. j := i + 1 for ; j < bufLen; j++ { if buf[j]&0x1 == unchangedVal { break } } changedFR := memmap.FileRange{ Start: chunkFR.Start + uint64(i*hostarch.PageSize), End: chunkFR.Start + uint64(j*hostarch.PageSize), } // Advance maseg to changedFR.Start. for maseg.Ok() && maseg.End() <= changedFR.Start { maseg = maseg.NextSegment() } // Update pages overlapping changedFR, but don't mark ranges as // committed if they might have raced with decommit. 
for maseg.Ok() && maseg.Start() < changedFR.End { if !maseg.ValuePtr().wasteOrReleasing && ((!wasCommitted && !maseg.ValuePtr().knownCommitted && ma.commitSeq <= lastCommitSeq) || (wasCommitted && maseg.ValuePtr().knownCommitted)) { maseg = f.memAcct.Isolate(maseg, changedFR) ma := maseg.ValuePtr() amount := maseg.Range().Length() if wasCommitted { ma.knownCommitted = false ma.commitSeq = f.commitSeq f.knownCommittedBytes -= amount if !f.opts.DisableMemoryAccounting { usage.MemoryAccounting.Dec(amount, ma.kind, ma.memCgID) } } else { ma.knownCommitted = true ma.commitSeq = 0 f.knownCommittedBytes += amount if !f.opts.DisableMemoryAccounting { usage.MemoryAccounting.Inc(amount, ma.kind, ma.memCgID) } } changedAny = true } maseg = maseg.NextSegment() } // Continue scanning for changed pages. i = j + 1 } // Don't continue to the next chunk, since while f.mu was unlocked // its memory accounting state could have changed completely. // Instead, continue the outer loop with the first segment after // chunkFR.End. maseg = f.memAcct.LowerBoundSegment(chunkFR.End) unscannedStart = chunkFR.End return false }) if checkErr != nil { return checkErr } } return nil } // TotalUsage returns an aggregate usage for all memory statistics except // Mapped (which is external to MemoryFile). This is generally much cheaper // than UpdateUsage, but will not provide a fine-grained breakdown. func (f *MemoryFile) TotalUsage() (uint64, error) { // Stat the underlying file to discover the underlying usage. stat(2) // always reports the allocated block count in units of 512 bytes. This // includes pages in the page cache and swapped pages. var stat unix.Stat_t if err := unix.Fstat(int(f.file.Fd()), &stat); err != nil { return 0, err } return uint64(stat.Blocks * 512), nil } // TotalSize returns the current size of the backing file in bytes, which is an // upper bound on the amount of memory that can currently be allocated from the // MemoryFile. The value returned by TotalSize is permitted to change. func (f *MemoryFile) TotalSize() uint64 { return uint64(len(f.chunksLoad())) * chunkSize } // File returns the backing file. func (f *MemoryFile) File() *os.File { return f.file } // FD implements memmap.File.FD. func (f *MemoryFile) FD() int { return int(f.file.Fd()) } // IsDiskBacked returns true if f is backed by a file on disk. func (f *MemoryFile) IsDiskBacked() bool { return f.opts.DiskBackedFile } // HugepagesEnabled returns true if the MemoryFile expects to back allocations // for which AllocOpts.Huge == true with huge pages. func (f *MemoryFile) HugepagesEnabled() bool { return f.opts.ExpectHugepages } // String implements fmt.Stringer.String. func (f *MemoryFile) String() string { f.mu.Lock() defer f.mu.Unlock() return f.stringLocked() } // Preconditions: f.mu must be locked. func (f *MemoryFile) stringLocked() string { var b strings.Builder fmt.Fprintf(&b, "unwasteSmall:\n%s", &f.unwasteSmall) if f.opts.ExpectHugepages { fmt.Fprintf(&b, "unwasteHuge:\n%s", &f.unwasteHuge) } fmt.Fprintf(&b, "unfreeSmall:\n%s", &f.unfreeSmall) if f.opts.ExpectHugepages { fmt.Fprintf(&b, "unfreeHuge:\n%s", &f.unfreeHuge) fmt.Fprintf(&b, "subreleased:\n") for off, pgs := range f.subreleased { fmt.Fprintf(&b, "- %#x: %d\n", off, pgs) } } fmt.Fprintf(&b, "memAcct:\n%s", &f.memAcct) return b.String() } // StartEvictions requests that f evict all evictable allocations. It does not // wait for eviction to complete; for this, see MemoryFile.WaitForEvictions. 
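// Illustrative sketch (not part of the original file): the block-count
// arithmetic behind TotalUsage above. stat(2) reports st.Blocks in 512-byte
// units regardless of the filesystem block size, so allocated bytes are
// simply Blocks * 512. The fd is an assumption for the example.
func exampleFileUsageBytes(fd int) (uint64, error) {
	var st unix.Stat_t
	if err := unix.Fstat(fd, &st); err != nil {
		return 0, err
	}
	return uint64(st.Blocks) * 512, nil
}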
func (f *MemoryFile) StartEvictions() { f.mu.Lock() defer f.mu.Unlock() f.startEvictionsLocked() } // Preconditions: f.mu must be locked. func (f *MemoryFile) startEvictionsLocked() bool { startedAny := false for user, info := range f.evictable { // Don't start multiple goroutines to evict the same user's // allocations. if !info.evicting { f.startEvictionGoroutineLocked(user, info) startedAny = true } } return startedAny } // Preconditions: // - info == f.evictable[user]. // - !info.evicting. // - f.mu must be locked. func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) { info.evicting = true f.evictionWG.Add(1) go func() { // S/R-SAFE: f.evictionWG defer f.evictionWG.Done() for { f.mu.Lock() info, ok := f.evictable[user] if !ok { // This shouldn't happen: only this goroutine is permitted // to delete this entry. f.mu.Unlock() panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user)) } if info.ranges.IsEmpty() { delete(f.evictable, user) f.mu.Unlock() return } // Evict from the end of info.ranges, under the assumption that // if ranges in user start being used again (and are // consequently marked unevictable), such uses are more likely // to start from the beginning of user. seg := info.ranges.LastSegment() er := seg.Range() info.ranges.Remove(seg) // user.Evict() must be called without holding f.mu to avoid // circular lock ordering. f.mu.Unlock() user.Evict(context.Background(), er) } }() } // WaitForEvictions blocks until f is no longer evicting any evictable // allocations. func (f *MemoryFile) WaitForEvictions() { f.evictionWG.Wait() } type unwasteSetFunctions struct{} func (unwasteSetFunctions) MinKey() uint64 { return 0 } func (unwasteSetFunctions) MaxKey() uint64 { return math.MaxUint64 } func (unwasteSetFunctions) ClearValue(val *unwasteInfo) { } func (unwasteSetFunctions) Merge(_ memmap.FileRange, val1 unwasteInfo, _ memmap.FileRange, val2 unwasteInfo) (unwasteInfo, bool) { return val1, val1 == val2 } func (unwasteSetFunctions) Split(_ memmap.FileRange, val unwasteInfo, _ uint64) (unwasteInfo, unwasteInfo) { return val, val } type unfreeSetFunctions struct{} func (unfreeSetFunctions) MinKey() uint64 { return 0 } func (unfreeSetFunctions) MaxKey() uint64 { return math.MaxUint64 } func (unfreeSetFunctions) ClearValue(val *unfreeInfo) { } func (unfreeSetFunctions) Merge(_ memmap.FileRange, val1 unfreeInfo, _ memmap.FileRange, val2 unfreeInfo) (unfreeInfo, bool) { return val1, val1 == val2 } func (unfreeSetFunctions) Split(_ memmap.FileRange, val unfreeInfo, _ uint64) (unfreeInfo, unfreeInfo) { return val, val } type memAcctSetFunctions struct{} func (memAcctSetFunctions) MinKey() uint64 { return 0 } func (memAcctSetFunctions) MaxKey() uint64 { return math.MaxUint64 } func (memAcctSetFunctions) ClearValue(val *memAcctInfo) { } func (memAcctSetFunctions) Merge(_ memmap.FileRange, val1 memAcctInfo, _ memmap.FileRange, val2 memAcctInfo) (memAcctInfo, bool) { return val1, val1 == val2 } func (memAcctSetFunctions) Split(_ memmap.FileRange, val memAcctInfo, _ uint64) (memAcctInfo, memAcctInfo) { return val, val } // evictableRangeSetValue is the value type of evictableRangeSet. 
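// Illustrative sketch (not part of the original file): a minimal
// EvictableMemoryUser and how it would be registered with the eviction
// machinery above. The interface is assumed to consist of the Evict method
// invoked by the eviction goroutine; mf and the chosen range are assumptions
// for the example.
type exampleEvictableUser struct{}

func (exampleEvictableUser) Evict(ctx context.Context, er EvictableRange) {
	// A real implementation would deallocate the MemoryFile pages backing
	// [er.Start, er.End) and then mark the range unevictable.
	log.Infof("asked to evict offsets [%#x, %#x)", er.Start, er.End)
}

func exampleEvictionRegistration(mf *MemoryFile) {
	u := exampleEvictableUser{}
	mf.MarkEvictable(u, EvictableRange{Start: 0, End: hostarch.HugePageSize})
	mf.StartEvictions()
	mf.WaitForEvictions()
	mf.MarkAllUnevictable(u)
}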
type evictableRangeSetValue struct{} type evictableRangeSetFunctions struct{} func (evictableRangeSetFunctions) MinKey() uint64 { return 0 } func (evictableRangeSetFunctions) MaxKey() uint64 { return math.MaxUint64 } func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) { } func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) { return evictableRangeSetValue{}, true } func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) { return evictableRangeSetValue{}, evictableRangeSetValue{} } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/pgalloc_state_autogen.go000066400000000000000000000345231465435605700272720ustar00rootroot00000000000000// automatically generated by stateify. package pgalloc import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *EvictableRange) StateTypeName() string { return "pkg/sentry/pgalloc.EvictableRange" } func (r *EvictableRange) StateFields() []string { return []string{ "Start", "End", } } func (r *EvictableRange) beforeSave() {} // +checklocksignore func (r *EvictableRange) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Start) stateSinkObject.Save(1, &r.End) } func (r *EvictableRange) afterLoad(context.Context) {} // +checklocksignore func (r *EvictableRange) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Start) stateSourceObject.Load(1, &r.End) } func (s *evictableRangeSet) StateTypeName() string { return "pkg/sentry/pgalloc.evictableRangeSet" } func (s *evictableRangeSet) StateFields() []string { return []string{ "root", } } func (s *evictableRangeSet) beforeSave() {} // +checklocksignore func (s *evictableRangeSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []evictableRangeFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *evictableRangeSet) afterLoad(context.Context) {} // +checklocksignore func (s *evictableRangeSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]evictableRangeFlatSegment), func(y any) { s.loadRoot(ctx, y.([]evictableRangeFlatSegment)) }) } func (n *evictableRangenode) StateTypeName() string { return "pkg/sentry/pgalloc.evictableRangenode" } func (n *evictableRangenode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *evictableRangenode) beforeSave() {} // +checklocksignore func (n *evictableRangenode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *evictableRangenode) afterLoad(context.Context) {} // +checklocksignore func (n *evictableRangenode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (e 
*evictableRangeFlatSegment) StateTypeName() string { return "pkg/sentry/pgalloc.evictableRangeFlatSegment" } func (e *evictableRangeFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (e *evictableRangeFlatSegment) beforeSave() {} // +checklocksignore func (e *evictableRangeFlatSegment) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.Start) stateSinkObject.Save(1, &e.End) stateSinkObject.Save(2, &e.Value) } func (e *evictableRangeFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (e *evictableRangeFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.Start) stateSourceObject.Load(1, &e.End) stateSourceObject.Load(2, &e.Value) } func (s *memAcctSet) StateTypeName() string { return "pkg/sentry/pgalloc.memAcctSet" } func (s *memAcctSet) StateFields() []string { return []string{ "root", } } func (s *memAcctSet) beforeSave() {} // +checklocksignore func (s *memAcctSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []memAcctFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *memAcctSet) afterLoad(context.Context) {} // +checklocksignore func (s *memAcctSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]memAcctFlatSegment), func(y any) { s.loadRoot(ctx, y.([]memAcctFlatSegment)) }) } func (n *memAcctnode) StateTypeName() string { return "pkg/sentry/pgalloc.memAcctnode" } func (n *memAcctnode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *memAcctnode) beforeSave() {} // +checklocksignore func (n *memAcctnode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *memAcctnode) afterLoad(context.Context) {} // +checklocksignore func (n *memAcctnode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (m *memAcctFlatSegment) StateTypeName() string { return "pkg/sentry/pgalloc.memAcctFlatSegment" } func (m *memAcctFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (m *memAcctFlatSegment) beforeSave() {} // +checklocksignore func (m *memAcctFlatSegment) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.Start) stateSinkObject.Save(1, &m.End) stateSinkObject.Save(2, &m.Value) } func (m *memAcctFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (m *memAcctFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.Start) stateSourceObject.Load(1, &m.End) stateSourceObject.Load(2, &m.Value) } func (c *chunkInfo) StateTypeName() string { return "pkg/sentry/pgalloc.chunkInfo" } func (c *chunkInfo) StateFields() []string { return []string{ "huge", } } func (c *chunkInfo) beforeSave() {} // 
+checklocksignore func (c *chunkInfo) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.huge) } func (c *chunkInfo) afterLoad(context.Context) {} // +checklocksignore func (c *chunkInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.huge) } func (u *unwasteInfo) StateTypeName() string { return "pkg/sentry/pgalloc.unwasteInfo" } func (u *unwasteInfo) StateFields() []string { return []string{} } func (u *unwasteInfo) beforeSave() {} // +checklocksignore func (u *unwasteInfo) StateSave(stateSinkObject state.Sink) { u.beforeSave() } func (u *unwasteInfo) afterLoad(context.Context) {} // +checklocksignore func (u *unwasteInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (u *unfreeInfo) StateTypeName() string { return "pkg/sentry/pgalloc.unfreeInfo" } func (u *unfreeInfo) StateFields() []string { return []string{ "refs", } } func (u *unfreeInfo) beforeSave() {} // +checklocksignore func (u *unfreeInfo) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.refs) } func (u *unfreeInfo) afterLoad(context.Context) {} // +checklocksignore func (u *unfreeInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.refs) } func (m *memAcctInfo) StateTypeName() string { return "pkg/sentry/pgalloc.memAcctInfo" } func (m *memAcctInfo) StateFields() []string { return []string{ "kind", "memCgID", "knownCommitted", "wasteOrReleasing", "commitSeq", } } func (m *memAcctInfo) beforeSave() {} // +checklocksignore func (m *memAcctInfo) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.kind) stateSinkObject.Save(1, &m.memCgID) stateSinkObject.Save(2, &m.knownCommitted) stateSinkObject.Save(3, &m.wasteOrReleasing) stateSinkObject.Save(4, &m.commitSeq) } func (m *memAcctInfo) afterLoad(context.Context) {} // +checklocksignore func (m *memAcctInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.kind) stateSourceObject.Load(1, &m.memCgID) stateSourceObject.Load(2, &m.knownCommitted) stateSourceObject.Load(3, &m.wasteOrReleasing) stateSourceObject.Load(4, &m.commitSeq) } func (s *unfreeSet) StateTypeName() string { return "pkg/sentry/pgalloc.unfreeSet" } func (s *unfreeSet) StateFields() []string { return []string{ "root", } } func (s *unfreeSet) beforeSave() {} // +checklocksignore func (s *unfreeSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []unfreeFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *unfreeSet) afterLoad(context.Context) {} // +checklocksignore func (s *unfreeSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]unfreeFlatSegment), func(y any) { s.loadRoot(ctx, y.([]unfreeFlatSegment)) }) } func (n *unfreenode) StateTypeName() string { return "pkg/sentry/pgalloc.unfreenode" } func (n *unfreenode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *unfreenode) beforeSave() {} // +checklocksignore func (n *unfreenode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) 
stateSinkObject.Save(7, &n.children) } func (n *unfreenode) afterLoad(context.Context) {} // +checklocksignore func (n *unfreenode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (u *unfreeFlatSegment) StateTypeName() string { return "pkg/sentry/pgalloc.unfreeFlatSegment" } func (u *unfreeFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (u *unfreeFlatSegment) beforeSave() {} // +checklocksignore func (u *unfreeFlatSegment) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.Start) stateSinkObject.Save(1, &u.End) stateSinkObject.Save(2, &u.Value) } func (u *unfreeFlatSegment) afterLoad(context.Context) {} // +checklocksignore func (u *unfreeFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.Start) stateSourceObject.Load(1, &u.End) stateSourceObject.Load(2, &u.Value) } func (s *unwasteSet) StateTypeName() string { return "pkg/sentry/pgalloc.unwasteSet" } func (s *unwasteSet) StateFields() []string { return []string{ "root", } } func (s *unwasteSet) beforeSave() {} // +checklocksignore func (s *unwasteSet) StateSave(stateSinkObject state.Sink) { s.beforeSave() var rootValue []unwasteFlatSegment rootValue = s.saveRoot() stateSinkObject.SaveValue(0, rootValue) } func (s *unwasteSet) afterLoad(context.Context) {} // +checklocksignore func (s *unwasteSet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new([]unwasteFlatSegment), func(y any) { s.loadRoot(ctx, y.([]unwasteFlatSegment)) }) } func (n *unwastenode) StateTypeName() string { return "pkg/sentry/pgalloc.unwastenode" } func (n *unwastenode) StateFields() []string { return []string{ "nrSegments", "parent", "parentIndex", "hasChildren", "maxGap", "keys", "values", "children", } } func (n *unwastenode) beforeSave() {} // +checklocksignore func (n *unwastenode) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nrSegments) stateSinkObject.Save(1, &n.parent) stateSinkObject.Save(2, &n.parentIndex) stateSinkObject.Save(3, &n.hasChildren) stateSinkObject.Save(4, &n.maxGap) stateSinkObject.Save(5, &n.keys) stateSinkObject.Save(6, &n.values) stateSinkObject.Save(7, &n.children) } func (n *unwastenode) afterLoad(context.Context) {} // +checklocksignore func (n *unwastenode) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nrSegments) stateSourceObject.Load(1, &n.parent) stateSourceObject.Load(2, &n.parentIndex) stateSourceObject.Load(3, &n.hasChildren) stateSourceObject.Load(4, &n.maxGap) stateSourceObject.Load(5, &n.keys) stateSourceObject.Load(6, &n.values) stateSourceObject.Load(7, &n.children) } func (u *unwasteFlatSegment) StateTypeName() string { return "pkg/sentry/pgalloc.unwasteFlatSegment" } func (u *unwasteFlatSegment) StateFields() []string { return []string{ "Start", "End", "Value", } } func (u *unwasteFlatSegment) beforeSave() {} // +checklocksignore func (u *unwasteFlatSegment) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.Start) stateSinkObject.Save(1, &u.End) stateSinkObject.Save(2, &u.Value) } func (u *unwasteFlatSegment) 
afterLoad(context.Context) {} // +checklocksignore func (u *unwasteFlatSegment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.Start) stateSourceObject.Load(1, &u.End) stateSourceObject.Load(2, &u.Value) } func init() { state.Register((*EvictableRange)(nil)) state.Register((*evictableRangeSet)(nil)) state.Register((*evictableRangenode)(nil)) state.Register((*evictableRangeFlatSegment)(nil)) state.Register((*memAcctSet)(nil)) state.Register((*memAcctnode)(nil)) state.Register((*memAcctFlatSegment)(nil)) state.Register((*chunkInfo)(nil)) state.Register((*unwasteInfo)(nil)) state.Register((*unfreeInfo)(nil)) state.Register((*memAcctInfo)(nil)) state.Register((*unfreeSet)(nil)) state.Register((*unfreenode)(nil)) state.Register((*unfreeFlatSegment)(nil)) state.Register((*unwasteSet)(nil)) state.Register((*unwastenode)(nil)) state.Register((*unwasteFlatSegment)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/pgalloc_unsafe.go000066400000000000000000000022531465435605700257040ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pgalloc import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/memmap" ) // Preconditions: The FileRange represented by c is a superset of fr. func (c *chunkInfo) sliceAt(fr memmap.FileRange) []byte { return unsafe.Slice((*byte)(unsafe.Pointer(c.mapping+uintptr(fr.Start&chunkMask))), fr.Length()) } func mincore(s []byte, buf []byte, off uint64, wasCommitted bool) error { if _, _, errno := unix.RawSyscall( unix.SYS_MINCORE, uintptr(unsafe.Pointer(&s[0])), uintptr(len(s)), uintptr(unsafe.Pointer(&buf[0]))); errno != 0 { return errno } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/pgalloc_unsafe_state_autogen.go000066400000000000000000000000711465435605700306220ustar00rootroot00000000000000// automatically generated by stateify. package pgalloc golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/save_restore.go000066400000000000000000000230271465435605700254250ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
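// Illustrative sketch (not part of the original sources): counting resident
// pages of a mapping with mincore(2), the same primitive wrapped by the
// unexported mincore helper in pgalloc_unsafe.go above. The mapping m is an
// assumption for the example; bit 0 of each result byte indicates that the
// corresponding page is resident in memory.
func exampleCountResident(m []byte) (int, error) {
	vec := make([]byte, (len(m)+hostarch.PageSize-1)/hostarch.PageSize)
	if err := unix.Mincore(m, vec); err != nil {
		return 0, err
	}
	n := 0
	for _, b := range vec {
		if b&1 != 0 {
			n++
		}
	}
	return n, nil
}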
package pgalloc import ( "bytes" "context" "fmt" "io" "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/state" "gvisor.dev/gvisor/pkg/state/statefile" "gvisor.dev/gvisor/pkg/sync" ) // SaveOpts provides options to MemoryFile.SaveTo(). type SaveOpts struct { // If ExcludeCommittedZeroPages is true, SaveTo() will scan both committed // and possibly-committed pages to find zero pages, whose contents are // saved implicitly rather than explicitly to reduce checkpoint size. If // ExcludeCommittedZeroPages is false, SaveTo() will scan only // possibly-committed pages to find zero pages. // // Enabling ExcludeCommittedZeroPages will usually increase the time taken // by SaveTo() (due to the larger number of pages that must be scanned), // but may instead improve SaveTo() and LoadFrom() time, and checkpoint // size, if the application has many committed zero pages. ExcludeCommittedZeroPages bool } // SaveTo writes f's state to the given stream. func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer, pw io.Writer, opts SaveOpts) error { // Wait for memory release. f.mu.Lock() defer f.mu.Unlock() for f.haveWaste { f.mu.Unlock() runtime.Gosched() f.mu.Lock() } // Ensure that there are no pending evictions. if len(f.evictable) != 0 { panic(fmt.Sprintf("evictions still pending for %d users; call StartEvictions and WaitForEvictions before SaveTo", len(f.evictable))) } // Ensure that all pages that contain non-zero bytes are marked // known-committed, since we only store known-committed pages below. zeroPage := make([]byte, hostarch.PageSize) var ( decommitWarnOnce sync.Once decommitPendingFR memmap.FileRange scanTotal uint64 decommitTotal uint64 decommitCount uint64 ) decommitNow := func(fr memmap.FileRange) { decommitTotal += fr.Length() decommitCount++ if err := f.decommitFile(fr); err != nil { // This doesn't impact the correctness of saved memory, it just // means that we're incrementally more likely to OOM. Complain, but // don't abort saving. decommitWarnOnce.Do(func() { log.Warningf("Decommitting MemoryFile offsets %v while saving failed: %v", fr, err) }) } } decommitAddPage := func(off uint64) { // Invariants: // (1) All of decommitPendingFR lies within a single huge page. // (2) decommitPendingFR.End is hugepage-aligned iff // decommitPendingFR.Length() == 0. end := off + hostarch.PageSize if decommitPendingFR.End == off { // Merge with the existing range. By invariants, the page {off, // end} must be within the same huge page as the rest of // decommitPendingFR. decommitPendingFR.End = end } else { // Decommit the existing range and start a new one. if decommitPendingFR.Length() != 0 { decommitNow(decommitPendingFR) } decommitPendingFR = memmap.FileRange{off, end} } // Maintain invariants by decommitting if we've reached the end of the // containing huge page. 
if hostarch.IsHugePageAligned(end) { decommitNow(decommitPendingFR) decommitPendingFR = memmap.FileRange{} } } err := f.updateUsageLocked(nil, opts.ExcludeCommittedZeroPages, func(bs []byte, committed []byte, off uint64, wasCommitted bool) error { scanTotal += uint64(len(bs)) for pgoff := 0; pgoff < len(bs); pgoff += hostarch.PageSize { i := pgoff / hostarch.PageSize pg := bs[pgoff : pgoff+hostarch.PageSize] if !bytes.Equal(pg, zeroPage) { committed[i] = 1 continue } committed[i] = 0 if !wasCommitted { // Reading the page may have caused it to be committed; // decommit it to reduce memory usage. decommitAddPage(off + uint64(pgoff)) } } return nil }) if decommitPendingFR.Length() != 0 { decommitNow(decommitPendingFR) decommitPendingFR = memmap.FileRange{} } if err != nil { return err } log.Debugf("MemoryFile.SaveTo: scanned %d bytes, decommitted %d bytes in %d syscalls", scanTotal, decommitTotal, decommitCount) // Save metadata. if _, err := state.Save(ctx, w, &f.unwasteSmall); err != nil { return err } if _, err := state.Save(ctx, w, &f.unwasteHuge); err != nil { return err } if _, err := state.Save(ctx, w, &f.unfreeSmall); err != nil { return err } if _, err := state.Save(ctx, w, &f.unfreeHuge); err != nil { return err } if _, err := state.Save(ctx, w, &f.subreleased); err != nil { return err } if _, err := state.Save(ctx, w, &f.memAcct); err != nil { return err } if _, err := state.Save(ctx, w, &f.knownCommittedBytes); err != nil { return err } if _, err := state.Save(ctx, w, &f.commitSeq); err != nil { return err } if _, err := state.Save(ctx, w, f.chunks.Load()); err != nil { return err } // Dump out committed pages. for maseg := f.memAcct.FirstSegment(); maseg.Ok(); maseg = maseg.NextSegment() { if !maseg.ValuePtr().knownCommitted { continue } // Write a header to distinguish from objects. if err := state.WriteHeader(w, uint64(maseg.Range().Length()), false); err != nil { return err } // Write out data. var ioErr error f.forEachMappingSlice(maseg.Range(), func(s []byte) { if ioErr != nil { return } _, ioErr = pw.Write(s) }) if ioErr != nil { return ioErr } } return nil } // MarkSavable marks f as savable. func (f *MemoryFile) MarkSavable() { f.mu.Lock() defer f.mu.Unlock() f.savable = true } // IsSavable returns true if f is savable. func (f *MemoryFile) IsSavable() bool { f.mu.Lock() defer f.mu.Unlock() return f.savable } // RestoreID returns the restore ID for f. func (f *MemoryFile) RestoreID() string { return f.opts.RestoreID } // LoadFrom loads MemoryFile state from the given stream. func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader, pr *statefile.AsyncReader) error { // Clear sets since non-empty sets will panic if loaded into. f.unwasteSmall.RemoveAll() f.unwasteHuge.RemoveAll() f.unfreeSmall.RemoveAll() f.unfreeHuge.RemoveAll() f.memAcct.RemoveAll() // Load metadata. 
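// The sequence of Load calls below must mirror the order in which SaveTo
// saved these fields: unwasteSmall, unwasteHuge, unfreeSmall, unfreeHuge,
// subreleased, memAcct, knownCommittedBytes, commitSeq, and finally the
// chunk metadata.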
if _, err := state.Load(ctx, r, &f.unwasteSmall); err != nil { return err } if _, err := state.Load(ctx, r, &f.unwasteHuge); err != nil { return err } if _, err := state.Load(ctx, r, &f.unfreeSmall); err != nil { return err } if _, err := state.Load(ctx, r, &f.unfreeHuge); err != nil { return err } if _, err := state.Load(ctx, r, &f.subreleased); err != nil { return err } if _, err := state.Load(ctx, r, &f.memAcct); err != nil { return err } if _, err := state.Load(ctx, r, &f.knownCommittedBytes); err != nil { return err } if _, err := state.Load(ctx, r, &f.commitSeq); err != nil { return err } var chunks []chunkInfo if _, err := state.Load(ctx, r, &chunks); err != nil { return err } f.chunks.Store(&chunks) if err := f.file.Truncate(int64(len(chunks)) * chunkSize); err != nil { return err } // Obtain chunk mappings, then madvise them concurrently with loading data. var ( madviseEnd atomicbitops.Uint64 madviseChan = make(chan struct{}, 1) madviseWG sync.WaitGroup ) if len(chunks) != 0 { m, _, errno := unix.Syscall6( unix.SYS_MMAP, 0, uintptr(len(chunks)*chunkSize), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, f.file.Fd(), 0) if errno != 0 { return fmt.Errorf("failed to mmap MemoryFile: %w", errno) } for i := range chunks { chunk := &chunks[i] chunk.mapping = m m += chunkSize } madviseWG.Add(1) go func() { defer madviseWG.Done() for i := range chunks { chunk := &chunks[i] f.madviseChunkMapping(chunk.mapping, chunkSize, chunk.huge) madviseEnd.Add(chunkSize) select { case madviseChan <- struct{}{}: default: } } }() } defer madviseWG.Wait() // Load committed pages. for maseg := f.memAcct.FirstSegment(); maseg.Ok(); maseg = maseg.NextSegment() { if !maseg.ValuePtr().knownCommitted { continue } // Verify header. length, object, err := state.ReadHeader(r) if err != nil { return err } if object { // Not expected. return fmt.Errorf("unexpected object") } if expected := uint64(maseg.Range().Length()); length != expected { // Size mismatch. return fmt.Errorf("mismatched segment: expected %d, got %d", expected, length) } // Wait for all chunks spanned by this segment to be madvised. for madviseEnd.Load() < maseg.End() { <-madviseChan } // Read data. var ioErr error f.forEachMappingSlice(maseg.Range(), func(s []byte) { if ioErr != nil { return } if pr != nil { pr.ReadAsync(s) } else { _, ioErr = io.ReadFull(r, s) } }) if ioErr != nil { return ioErr } // Update accounting for restored pages. We need to do this here since // these segments are marked as "known committed", and will be skipped // over on accounting scans. if !f.opts.DisableMemoryAccounting { amount := maseg.Range().Length() usage.MemoryAccounting.Inc(amount, maseg.ValuePtr().kind, maseg.ValuePtr().memCgID) } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/unfree_set.go000066400000000000000000002031011465435605700250540ustar00rootroot00000000000000package pgalloc import ( __generics_imported0 "gvisor.dev/gvisor/pkg/sentry/memmap" ) import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const unfreetrackGaps = 1 var _ = uint8(unfreetrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type unfreedynamicGap [unfreetrackGaps]uint64 // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. 
func (d *unfreedynamicGap) Get() uint64 { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *unfreedynamicGap) Set(v uint64) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. unfreeminDegree = 10 unfreemaxDegree = 2 * unfreeminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type unfreeSet struct { root unfreenode `state:".([]unfreeFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *unfreeSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *unfreeSet) IsEmptyRange(r __generics_imported0.FileRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *unfreeSet) Span() uint64 { var sz uint64 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *unfreeSet) SpanRange(r __generics_imported0.FileRange) uint64 { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uint64 for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *unfreeSet) FirstSegment() unfreeIterator { if s.root.nrSegments == 0 { return unfreeIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *unfreeSet) LastSegment() unfreeIterator { if s.root.nrSegments == 0 { return unfreeIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *unfreeSet) FirstGap() unfreeGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return unfreeGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *unfreeSet) LastGap() unfreeGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return unfreeGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. 
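//
// A minimal caller sketch (hypothetical, not part of this package):
//
//	if seg, gap := s.Find(off); seg.Ok() {
//		_ = seg.Range() // off is covered by an existing segment
//	} else {
//		_ = gap.Range() // off falls within this gap
//	}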
func (s *unfreeSet) Find(key uint64) (unfreeIterator, unfreeGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return unfreeIterator{n, i}, unfreeGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return unfreeIterator{}, unfreeGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *unfreeSet) FindSegment(key uint64) unfreeIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *unfreeSet) LowerBoundSegment(min uint64) unfreeIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *unfreeSet) UpperBoundSegment(max uint64) unfreeIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *unfreeSet) FindGap(key uint64) unfreeGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *unfreeSet) LowerBoundGap(min uint64) unfreeGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *unfreeSet) UpperBoundGap(max uint64) unfreeGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *unfreeSet) FirstLargeEnoughGap(minSize uint64) unfreeGapIterator { if unfreetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *unfreeSet) LastLargeEnoughGap(minSize uint64) unfreeGapIterator { if unfreetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. 
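//
// A hypothetical first-fit sketch, scanning for the first gap at or above
// minOff that is at least length bytes long:
//
//	if gap := s.LowerBoundLargeEnoughGap(minOff, length); gap.Ok() {
//		_ = gap.Range() // a gap of at least length bytes containing a key >= minOff
//	}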
func (s *unfreeSet) LowerBoundLargeEnoughGap(min, minSize uint64) unfreeGapIterator { if unfreetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *unfreeSet) UpperBoundLargeEnoughGap(max, minSize uint64) unfreeGapIterator { if unfreetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. func (s *unfreeSet) Insert(gap unfreeGapIterator, r __generics_imported0.FileRange, val unfreeInfo) unfreeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (unfreeSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := unfreetrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (unfreeSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (unfreeSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := unfreetrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. 
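//
// Unlike Insert, InsertWithoutMerging never coalesces the new segment with
// its neighbors, so the returned iterator always covers exactly r.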
func (s *unfreeSet) InsertWithoutMerging(gap unfreeGapIterator, r __generics_imported0.FileRange, val unfreeInfo) unfreeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *unfreeSet) InsertWithoutMergingUnchecked(gap unfreeGapIterator, r __generics_imported0.FileRange, val unfreeInfo) unfreeIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := unfreetrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return unfreeIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. // // InsertRange searches the set to find the gap to insert into. If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *unfreeSet) InsertRange(r __generics_imported0.FileRange, val unfreeInfo) unfreeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. 
func (s *unfreeSet) InsertWithoutMergingRange(r __generics_imported0.FileRange, val unfreeInfo) unfreeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *unfreeSet) TryInsertRange(r __generics_imported0.FileRange, val unfreeInfo) unfreeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return unfreeIterator{} } if gap.End() < r.End { return unfreeIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. func (s *unfreeSet) TryInsertWithoutMergingRange(r __generics_imported0.FileRange, val unfreeInfo) unfreeIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return unfreeIterator{} } if gap.End() < r.End { return unfreeIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *unfreeSet) Remove(seg unfreeIterator) unfreeGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if unfreetrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) unfreeSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if unfreetrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(unfreeGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. 
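//
// RemoveAll runs in constant time: it discards the root node in place rather
// than removing segments one at a time.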
func (s *unfreeSet) RemoveAll() { s.root = unfreenode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *unfreeSet) RemoveRange(r __generics_imported0.FileRange) unfreeGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *unfreeSet) RemoveFullRange(r __generics_imported0.FileRange) unfreeGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *unfreeSet) Merge(first, second unfreeIterator) unfreeIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. Otherwise, MergeUnchecked returns a terminal // iterator. // // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *unfreeSet) MergeUnchecked(first, second unfreeIterator) unfreeIterator { if first.End() == second.Start() { if mval, ok := (unfreeSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return unfreeIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. 
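//
// A sketch of the usual increasing-key mutation loop (hypothetical; compare
// MutateRange below):
//
//	for seg := s.LowerBoundSegmentSplitBefore(r.Start); seg.Ok() && seg.Start() < r.End; {
//		seg = s.SplitAfter(seg, r.End)
//		// ... mutate seg.ValuePtr() ...
//		seg = s.MergePrev(seg)
//		seg = seg.NextSegment()
//	}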
func (s *unfreeSet) MergePrev(seg unfreeIterator) unfreeIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *unfreeSet) MergeNext(seg unfreeIterator) unfreeIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *unfreeSet) Unisolate(seg unfreeIterator) unfreeIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *unfreeSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. // // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *unfreeSet) MergeInsideRange(r __generics_imported0.FileRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. 
func (s *unfreeSet) MergeOutsideRange(r __generics_imported0.FileRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *unfreeSet) Split(seg unfreeIterator, split uint64) (unfreeIterator, unfreeIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *unfreeSet) SplitUnchecked(seg unfreeIterator, split uint64) (unfreeIterator, unfreeIterator) { val1, val2 := (unfreeSetFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.FileRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter; i.e. SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first. // // Preconditions: start < seg.End(). func (s *unfreeSet) SplitBefore(seg unfreeIterator, start uint64) unfreeIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. 
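//
// For example (illustrative): given a segment spanning [0x1000, 0x5000),
// SplitAfter(seg, 0x3000) leaves the returned iterator on [0x1000, 0x3000)
// and places the remainder [0x3000, 0x5000) in a separate segment.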
func (s *unfreeSet) SplitAfter(seg unfreeIterator, end uint64) unfreeIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *unfreeSet) Isolate(seg unfreeIterator, r __generics_imported0.FileRange) unfreeIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *unfreeSet) LowerBoundSegmentSplitBefore(min uint64) unfreeIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *unfreeSet) UpperBoundSegmentSplitAfter(max uint64) unfreeIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. func (s *unfreeSet) VisitRange(r __generics_imported0.FileRange, f func(seg unfreeIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *unfreeSet) VisitFullRange(r __generics_imported0.FileRange, f func(seg unfreeIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. 
// // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *unfreeSet) MutateRange(r __generics_imported0.FileRange, f func(seg unfreeIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *unfreeSet) MutateFullRange(r __generics_imported0.FileRange, f func(seg unfreeIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type unfreenode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *unfreenode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. maxGap unfreedynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [unfreemaxDegree - 1]__generics_imported0.FileRange values [unfreemaxDegree - 1]unfreeInfo children [unfreemaxDegree]*unfreenode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *unfreenode) firstSegment() unfreeIterator { for n.hasChildren { n = n.children[0] } return unfreeIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. 
func (n *unfreenode) lastSegment() unfreeIterator { for n.hasChildren { n = n.children[n.nrSegments] } return unfreeIterator{n, n.nrSegments - 1} } func (n *unfreenode) prevSibling() *unfreenode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *unfreenode) nextSibling() *unfreenode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *unfreenode) rebalanceBeforeInsert(gap unfreeGapIterator) unfreeGapIterator { if n.nrSegments < unfreemaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &unfreenode{ nrSegments: unfreeminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &unfreenode{ nrSegments: unfreeminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:unfreeminDegree-1], n.keys[:unfreeminDegree-1]) copy(left.values[:unfreeminDegree-1], n.values[:unfreeminDegree-1]) copy(right.keys[:unfreeminDegree-1], n.keys[unfreeminDegree:]) copy(right.values[:unfreeminDegree-1], n.values[unfreeminDegree:]) n.keys[0], n.values[0] = n.keys[unfreeminDegree-1], n.values[unfreeminDegree-1] unfreezeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:unfreeminDegree], n.children[:unfreeminDegree]) copy(right.children[:unfreeminDegree], n.children[unfreeminDegree:]) unfreezeroNodeSlice(n.children[2:]) for i := 0; i < unfreeminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if unfreetrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < unfreeminDegree { return unfreeGapIterator{left, gap.index} } return unfreeGapIterator{right, gap.index - unfreeminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[unfreeminDegree-1], n.values[unfreeminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &unfreenode{ nrSegments: unfreeminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ copy(sibling.keys[:unfreeminDegree-1], n.keys[unfreeminDegree:]) copy(sibling.values[:unfreeminDegree-1], n.values[unfreeminDegree:]) unfreezeroValueSlice(n.values[unfreeminDegree-1:]) if n.hasChildren { copy(sibling.children[:unfreeminDegree], n.children[unfreeminDegree:]) unfreezeroNodeSlice(n.children[unfreeminDegree:]) for i := 0; i < unfreeminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = unfreeminDegree - 1 if unfreetrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < unfreeminDegree { return gap } return unfreeGapIterator{sibling, gap.index - unfreeminDegree} } 
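// The two rebalancing paths are duals: rebalanceBeforeInsert (above) splits
// nodes that have reached maxDegree-1 segments on the way down, while
// rebalanceAfterRemove (below) repairs nodes that have fallen below
// minDegree-1 segments by borrowing from a sibling or merging with one.
//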
// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *unfreenode) rebalanceAfterRemove(gap unfreeGapIterator) unfreeGapIterator { for { if n.nrSegments >= unfreeminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= unfreeminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] unfreeSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if unfreetrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return unfreeGapIterator{n, 0} } if gap.node == n { return unfreeGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= unfreeminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) unfreeSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if unfreetrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return unfreeGapIterator{n, n.nrSegments} } return unfreeGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return unfreeGapIterator{p, gap.index} } 
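// A gap that was in the right child shifts by left.nrSegments+1, past the
// left child's segments and the separating key that were folded into the
// parent above.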
if gap.node == right { return unfreeGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *unfreenode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = unfreeGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) unfreeSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if unfreetrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *unfreenode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *unfreenode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. func (n *unfreenode) calculateMaxGapLeaf() uint64 { max := unfreeGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (unfreeGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. func (n *unfreenode) calculateMaxGapInternal() uint64 { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. 
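//
// The per-node maxGap value lets the search prune entire subtrees whose
// largest gap is smaller than minSize.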
func (n *unfreenode) searchFirstLargeEnoughGap(minSize uint64) unfreeGapIterator { if n.maxGap.Get() < minSize { return unfreeGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := unfreeGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *unfreenode) searchLastLargeEnoughGap(minSize uint64) unfreeGapIterator { if n.maxGap.Get() < minSize { return unfreeGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := unfreeGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type unfreeIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *unfreenode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg unfreeIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg unfreeIterator) Range() __generics_imported0.FileRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg unfreeIterator) Start() uint64 { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg unfreeIterator) End() uint64 { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. // - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg unfreeIterator) SetRangeUnchecked(r __generics_imported0.FileRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. 
func (seg unfreeIterator) SetRange(r __generics_imported0.FileRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg unfreeIterator) SetStartUnchecked(start uint64) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg unfreeIterator) SetStart(start uint64) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg unfreeIterator) SetEndUnchecked(end uint64) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg unfreeIterator) SetEnd(end uint64) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg unfreeIterator) Value() unfreeInfo { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg unfreeIterator) ValuePtr() *unfreeInfo { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. func (seg unfreeIterator) SetValue(val unfreeInfo) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. func (seg unfreeIterator) PrevSegment() unfreeIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return unfreeIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return unfreeIterator{} } return unfreesegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. 
If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg unfreeIterator) NextSegment() unfreeIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return unfreeIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return unfreeIterator{} } return unfreesegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg unfreeIterator) PrevGap() unfreeGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return unfreeGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg unfreeIterator) NextGap() unfreeGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return unfreeGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg unfreeIterator) PrevNonEmpty() (unfreeIterator, unfreeGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, unfreeGapIterator{} } return unfreeIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg unfreeIterator) NextNonEmpty() (unfreeIterator, unfreeGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, unfreeGapIterator{} } return unfreeIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type unfreeGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *unfreenode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (gap unfreeGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. 
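//
// As a hedged sketch, assuming s is an existing *unfreeSet, all gaps
// (including zero-length gaps between adjacent segments) can be walked and
// their spans inspected:
//
//	for gap := s.FirstGap(); gap.Ok(); gap = gap.NextGap() {
//		if !gap.IsEmpty() {
//			_ = gap.Range() // equivalent to FileRange{gap.Start(), gap.End()}
//		}
//	}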
func (gap unfreeGapIterator) Range() __generics_imported0.FileRange { return __generics_imported0.FileRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap unfreeGapIterator) Start() uint64 { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return unfreeSetFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap unfreeGapIterator) End() uint64 { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return unfreeSetFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap unfreeGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap unfreeGapIterator) PrevSegment() unfreeIterator { return unfreesegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap unfreeGapIterator) NextSegment() unfreeIterator { return unfreesegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap unfreeGapIterator) PrevGap() unfreeGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return unfreeGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap unfreeGapIterator) NextGap() unfreeGapIterator { seg := gap.NextSegment() if !seg.Ok() { return unfreeGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap unfreeGapIterator) NextLargeEnoughGap(minSize uint64) unfreeGapIterator { if unfreetrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. func (gap unfreeGapIterator) nextLargeEnoughGapHelper(minSize uint64) unfreeGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return unfreeGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. 
If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap unfreeGapIterator) PrevLargeEnoughGap(minSize uint64) unfreeGapIterator { if unfreetrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap unfreeGapIterator) prevLargeEnoughGapHelper(minSize uint64) unfreeGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return unfreeGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func unfreesegmentBeforePosition(n *unfreenode, i int) unfreeIterator { for i == 0 { if n.parent == nil { return unfreeIterator{} } n, i = n.parent, n.parentIndex } return unfreeIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func unfreesegmentAfterPosition(n *unfreenode, i int) unfreeIterator { for i == n.nrSegments { if n.parent == nil { return unfreeIterator{} } n, i = n.parent, n.parentIndex } return unfreeIterator{n, i} } func unfreezeroValueSlice(slice []unfreeInfo) { for i := range slice { unfreeSetFunctions{}.ClearValue(&slice[i]) } } func unfreezeroNodeSlice(slice []*unfreenode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. func (s *unfreeSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. 
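//
// For debugging, a caller might simply dump the whole tree via the Set-level
// String defined above (assuming s is an existing *unfreeSet):
//
//	fmt.Println(s.String())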
func (n *unfreenode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *unfreenode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if unfreetrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type unfreeFlatSegment struct { Start uint64 End uint64 Value unfreeInfo } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *unfreeSet) ExportSlice() []unfreeFlatSegment { var fs []unfreeFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, unfreeFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *unfreeSet) ImportSlice(fs []unfreeFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := __generics_imported0.FileRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
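//
// A minimal sketch of how a test might call it, assuming s is a *unfreeSet
// expected to hold exactly two segments whose ranges are aligned to an
// illustrative 0x1000-byte granularity:
//
//	err := s.segmentTestCheck(2, func(i int, r __generics_imported0.FileRange, _ unfreeInfo) error {
//		if r.Start%0x1000 != 0 || r.End%0x1000 != 0 {
//			return fmt.Errorf("segment %d has unaligned range %v", i, r)
//		}
//		return nil
//	})
//	_ = err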
func (s *unfreeSet) segmentTestCheck(expectedSegments int, segFunc func(int, __generics_imported0.FileRange, unfreeInfo) error) error { havePrev := false prev := uint64(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *unfreeSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *unfreeSet) saveRoot() []unfreeFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *unfreeSet) loadRoot(_ context.Context, fs []unfreeFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/pgalloc/unwaste_set.go000066400000000000000000002036631465435605700252730ustar00rootroot00000000000000package pgalloc import ( __generics_imported0 "gvisor.dev/gvisor/pkg/sentry/memmap" ) import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const unwastetrackGaps = 1 var _ = uint8(unwastetrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type unwastedynamicGap [unwastetrackGaps]uint64 // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *unwastedynamicGap) Get() uint64 { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *unwastedynamicGap) Set(v uint64) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. unwasteminDegree = 10 unwastemaxDegree = 2 * unwasteminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type unwasteSet struct { root unwastenode `state:".([]unwasteFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *unwasteSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. 
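//
// As an illustrative sketch (assuming s is an existing *unwasteSet), the two
// formulations below agree, but the first needs only a single lookup rather
// than summing every intersecting segment:
//
//	r := __generics_imported0.FileRange{Start: 0x2000, End: 0x4000}
//	empty := s.IsEmptyRange(r)
//	alsoEmpty := s.SpanRange(r) == 0
//	_ = empty == alsoEmpty // always true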
func (s *unwasteSet) IsEmptyRange(r __generics_imported0.FileRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *unwasteSet) Span() uint64 { var sz uint64 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *unwasteSet) SpanRange(r __generics_imported0.FileRange) uint64 { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uint64 for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. func (s *unwasteSet) FirstSegment() unwasteIterator { if s.root.nrSegments == 0 { return unwasteIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *unwasteSet) LastSegment() unwasteIterator { if s.root.nrSegments == 0 { return unwasteIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *unwasteSet) FirstGap() unwasteGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return unwasteGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *unwasteSet) LastGap() unwasteGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return unwasteGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *unwasteSet) Find(key uint64) (unwasteIterator, unwasteGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return unwasteIterator{n, i}, unwasteGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return unwasteIterator{}, unwasteGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *unwasteSet) FindSegment(key uint64) unwasteIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *unwasteSet) LowerBoundSegment(min uint64) unwasteIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *unwasteSet) UpperBoundSegment(max uint64) unwasteIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. 
the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *unwasteSet) FindGap(key uint64) unwasteGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *unwasteSet) LowerBoundGap(min uint64) unwasteGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *unwasteSet) UpperBoundGap(max uint64) unwasteGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *unwasteSet) FirstLargeEnoughGap(minSize uint64) unwasteGapIterator { if unwastetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *unwasteSet) LastLargeEnoughGap(minSize uint64) unwasteGapIterator { if unwastetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *unwasteSet) LowerBoundLargeEnoughGap(min, minSize uint64) unwasteGapIterator { if unwastetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *unwasteSet) UpperBoundLargeEnoughGap(max, minSize uint64) unwasteGapIterator { if unwastetrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. 
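//
// A minimal usage sketch, assuming s is an existing *unwasteSet and val is a
// zero unwasteInfo; the caller first checks that the gap found at the
// (illustrative) key 0x1000 can accommodate the new range:
//
//	var val unwasteInfo
//	if seg, gap := s.Find(0x1000); !seg.Ok() && gap.End() >= 0x3000 {
//		s.Insert(gap, __generics_imported0.FileRange{Start: 0x1000, End: 0x3000}, val)
//	}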
func (s *unwasteSet) Insert(gap unwasteGapIterator, r __generics_imported0.FileRange, val unwasteInfo) unwasteIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (unwasteSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := unwastetrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (unwasteSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (unwasteSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := unwastetrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *unwasteSet) InsertWithoutMerging(gap unwasteGapIterator, r __generics_imported0.FileRange, val unwasteInfo) unwasteIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *unwasteSet) InsertWithoutMergingUnchecked(gap unwasteGapIterator, r __generics_imported0.FileRange, val unwasteInfo) unwasteIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := unwastetrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return unwasteIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. 
// // InsertRange searches the set to find the gap to insert into. If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *unwasteSet) InsertRange(r __generics_imported0.FileRange, val unwasteInfo) unwasteIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *unwasteSet) InsertWithoutMergingRange(r __generics_imported0.FileRange, val unwasteInfo) unwasteIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *unwasteSet) TryInsertRange(r __generics_imported0.FileRange, val unwasteInfo) unwasteIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return unwasteIterator{} } if gap.End() < r.End { return unwasteIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. 
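//
// A hedged usage sketch, assuming s is an existing *unwasteSet and val is a
// zero unwasteInfo; the terminal return value doubles as the "range already
// occupied" signal:
//
//	var val unwasteInfo
//	if seg := s.TryInsertWithoutMergingRange(__generics_imported0.FileRange{Start: 0x5000, End: 0x6000}, val); !seg.Ok() {
//		// Some key in [0x5000, 0x6000) already belongs to an existing segment.
//	}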
func (s *unwasteSet) TryInsertWithoutMergingRange(r __generics_imported0.FileRange, val unwasteInfo) unwasteIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return unwasteIterator{} } if gap.End() < r.End { return unwasteIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *unwasteSet) Remove(seg unwasteIterator) unwasteGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if unwastetrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) unwasteSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if unwastetrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(unwasteGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *unwasteSet) RemoveAll() { s.root = unwastenode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *unwasteSet) RemoveRange(r __generics_imported0.FileRange) unwasteGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *unwasteSet) RemoveFullRange(r __generics_imported0.FileRange) unwasteGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *unwasteSet) Merge(first, second unwasteIterator) unwasteIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. 
Otherwise, MergeUnchecked returns a terminal // iterator. // // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *unwasteSet) MergeUnchecked(first, second unwasteIterator) unwasteIterator { if first.End() == second.Start() { if mval, ok := (unwasteSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return unwasteIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *unwasteSet) MergePrev(seg unwasteIterator) unwasteIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *unwasteSet) MergeNext(seg unwasteIterator) unwasteIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *unwasteSet) Unisolate(seg unwasteIterator) unwasteIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *unwasteSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. 
// // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *unwasteSet) MergeInsideRange(r __generics_imported0.FileRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *unwasteSet) MergeOutsideRange(r __generics_imported0.FileRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *unwasteSet) Split(seg unwasteIterator, split uint64) (unwasteIterator, unwasteIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *unwasteSet) SplitUnchecked(seg unwasteIterator, split uint64) (unwasteIterator, unwasteIterator) { val1, val2 := (unwasteSetFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.FileRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first.
// // Preconditions: start < seg.End(). func (s *unwasteSet) SplitBefore(seg unwasteIterator, start uint64) unwasteIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *unwasteSet) SplitAfter(seg unwasteIterator, end uint64) unwasteIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *unwasteSet) Isolate(seg unwasteIterator, r __generics_imported0.FileRange) unwasteIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *unwasteSet) LowerBoundSegmentSplitBefore(min uint64) unwasteIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *unwasteSet) UpperBoundSegmentSplitAfter(max uint64) unwasteIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. 
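//
// A minimal sketch, assuming s is an existing *unwasteSet; the callback only
// reads, so it cannot invalidate iterators:
//
//	var n int
//	s.VisitRange(__generics_imported0.FileRange{Start: 0, End: 0x10000}, func(seg unwasteIterator) bool {
//		n++ // counted segments may extend partially outside the visited range
//		return true
//	})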
func (s *unwasteSet) VisitRange(r __generics_imported0.FileRange, f func(seg unwasteIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *unwasteSet) VisitFullRange(r __generics_imported0.FileRange, f func(seg unwasteIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *unwasteSet) MutateRange(r __generics_imported0.FileRange, f func(seg unwasteIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *unwasteSet) MutateFullRange(r __generics_imported0.FileRange, f func(seg unwasteIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type unwastenode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *unwastenode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. 
maxGap unwastedynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). keys [unwastemaxDegree - 1]__generics_imported0.FileRange values [unwastemaxDegree - 1]unwasteInfo children [unwastemaxDegree]*unwastenode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *unwastenode) firstSegment() unwasteIterator { for n.hasChildren { n = n.children[0] } return unwasteIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *unwastenode) lastSegment() unwasteIterator { for n.hasChildren { n = n.children[n.nrSegments] } return unwasteIterator{n, n.nrSegments - 1} } func (n *unwastenode) prevSibling() *unwastenode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *unwastenode) nextSibling() *unwastenode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *unwastenode) rebalanceBeforeInsert(gap unwasteGapIterator) unwasteGapIterator { if n.nrSegments < unwastemaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &unwastenode{ nrSegments: unwasteminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &unwastenode{ nrSegments: unwasteminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:unwasteminDegree-1], n.keys[:unwasteminDegree-1]) copy(left.values[:unwasteminDegree-1], n.values[:unwasteminDegree-1]) copy(right.keys[:unwasteminDegree-1], n.keys[unwasteminDegree:]) copy(right.values[:unwasteminDegree-1], n.values[unwasteminDegree:]) n.keys[0], n.values[0] = n.keys[unwasteminDegree-1], n.values[unwasteminDegree-1] unwastezeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:unwasteminDegree], n.children[:unwasteminDegree]) copy(right.children[:unwasteminDegree], n.children[unwasteminDegree:]) unwastezeroNodeSlice(n.children[2:]) for i := 0; i < unwasteminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if unwastetrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < unwasteminDegree { return unwasteGapIterator{left, gap.index} } return unwasteGapIterator{right, gap.index - unwasteminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[unwasteminDegree-1], n.values[unwasteminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &unwastenode{ nrSegments: unwasteminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ 
copy(sibling.keys[:unwasteminDegree-1], n.keys[unwasteminDegree:]) copy(sibling.values[:unwasteminDegree-1], n.values[unwasteminDegree:]) unwastezeroValueSlice(n.values[unwasteminDegree-1:]) if n.hasChildren { copy(sibling.children[:unwasteminDegree], n.children[unwasteminDegree:]) unwastezeroNodeSlice(n.children[unwasteminDegree:]) for i := 0; i < unwasteminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = unwasteminDegree - 1 if unwastetrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < unwasteminDegree { return gap } return unwasteGapIterator{sibling, gap.index - unwasteminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *unwastenode) rebalanceAfterRemove(gap unwasteGapIterator) unwasteGapIterator { for { if n.nrSegments >= unwasteminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= unwasteminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] unwasteSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if unwastetrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return unwasteGapIterator{n, 0} } if gap.node == n { return unwasteGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= unwasteminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) unwasteSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if unwastetrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return unwasteGapIterator{n, n.nrSegments} } return unwasteGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = 
left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return unwasteGapIterator{p, gap.index} } if gap.node == right { return unwasteGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *unwastenode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = unwasteGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) unwasteSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if unwastetrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *unwastenode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *unwastenode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. 
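//
// As an illustrative worked example, a leaf with nrSegments == 2 has three
// gaps: one before keys[0], one between keys[0] and keys[1], and one after
// keys[1]; calculateMaxGapLeaf returns the largest of those three lengths,
// where the outermost gaps may extend to boundaries inherited from ancestor
// nodes (or to MinKey/MaxKey at the edges of the set).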
func (n *unwastenode) calculateMaxGapLeaf() uint64 { max := unwasteGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (unwasteGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. func (n *unwastenode) calculateMaxGapInternal() uint64 { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *unwastenode) searchFirstLargeEnoughGap(minSize uint64) unwasteGapIterator { if n.maxGap.Get() < minSize { return unwasteGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := unwasteGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *unwastenode) searchLastLargeEnoughGap(minSize uint64) unwasteGapIterator { if n.maxGap.Get() < minSize { return unwasteGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := unwasteGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type unwasteIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *unwastenode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg unwasteIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg unwasteIterator) Range() __generics_imported0.FileRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg unwasteIterator) Start() uint64 { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg unwasteIterator) End() uint64 { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. 
// - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg unwasteIterator) SetRangeUnchecked(r __generics_imported0.FileRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. func (seg unwasteIterator) SetRange(r __generics_imported0.FileRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg unwasteIterator) SetStartUnchecked(start uint64) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg unwasteIterator) SetStart(start uint64) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg unwasteIterator) SetEndUnchecked(end uint64) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg unwasteIterator) SetEnd(end uint64) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg unwasteIterator) Value() unwasteInfo { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg unwasteIterator) ValuePtr() *unwasteInfo { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. 
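// Illustrative sketch (not part of the generated template): code in the same
// package can walk every segment with the Iterator methods documented above.
// unwasteExampleTotalBytes is a hypothetical helper that sums the lengths of
// all segment ranges.
func unwasteExampleTotalBytes(s *unwasteSet) uint64 {
	var total uint64
	// FirstSegment/NextSegment visit segments in ascending key order; a
	// terminal iterator (Ok() == false) marks the end of iteration.
	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
		total += seg.Range().Length()
	}
	return total
}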
func (seg unwasteIterator) SetValue(val unwasteInfo) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. func (seg unwasteIterator) PrevSegment() unwasteIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return unwasteIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return unwasteIterator{} } return unwastesegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg unwasteIterator) NextSegment() unwasteIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return unwasteIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return unwasteIterator{} } return unwastesegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg unwasteIterator) PrevGap() unwasteGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return unwasteGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg unwasteIterator) NextGap() unwasteGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return unwasteGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg unwasteIterator) PrevNonEmpty() (unwasteIterator, unwasteGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, unwasteGapIterator{} } return unwasteIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg unwasteIterator) NextNonEmpty() (unwasteIterator, unwasteGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, unwasteGapIterator{} } return unwasteIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. 
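// Illustrative sketch (not part of the generated template): segments and gaps
// alternate, so a full traversal can interleave the two iterator types
// described above. unwasteExampleVisit is a hypothetical helper; visitSeg and
// visitGap are caller-supplied callbacks.
func unwasteExampleVisit(s *unwasteSet, visitSeg func(unwasteIterator), visitGap func(unwasteGapIterator)) {
	// Even an empty set has one gap spanning the entire key range, so start
	// from the first gap and alternate gap, segment, gap, and so on.
	gap := s.FirstGap()
	for {
		visitGap(gap)
		seg := gap.NextSegment()
		if !seg.Ok() {
			// No segment follows the last gap: traversal is complete.
			return
		}
		visitSeg(seg)
		gap = seg.NextGap()
	}
}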
type unwasteGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *unwastenode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (gap unwasteGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap unwasteGapIterator) Range() __generics_imported0.FileRange { return __generics_imported0.FileRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (gap unwasteGapIterator) Start() uint64 { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return unwasteSetFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap unwasteGapIterator) End() uint64 { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return unwasteSetFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap unwasteGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap unwasteGapIterator) PrevSegment() unwasteIterator { return unwastesegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap unwasteGapIterator) NextSegment() unwasteIterator { return unwastesegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap unwasteGapIterator) PrevGap() unwasteGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return unwasteGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap unwasteGapIterator) NextGap() unwasteGapIterator { seg := gap.NextSegment() if !seg.Ok() { return unwasteGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap unwasteGapIterator) NextLargeEnoughGap(minSize uint64) unwasteGapIterator { if unwastetrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. 
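// Illustrative sketch (not part of the generated template): assuming this
// instantiation was generated with trackGaps == 1 (otherwise
// NextLargeEnoughGap panics), code in the same package could find and fill
// the first free range of at least minSize bytes. unwasteExampleAllocate and
// val are hypothetical names.
func unwasteExampleAllocate(s *unwasteSet, minSize uint64, val unwasteInfo) (__generics_imported0.FileRange, bool) {
	// NextLargeEnoughGap does not consider the starting gap itself, so check
	// the first gap before searching forward.
	gap := s.FirstGap()
	if gap.Range().Length() < minSize {
		gap = gap.NextLargeEnoughGap(minSize)
	}
	if !gap.Ok() {
		// No gap of at least minSize bytes exists.
		return __generics_imported0.FileRange{}, false
	}
	r := __generics_imported0.FileRange{gap.Start(), gap.Start() + minSize}
	// The new range lies entirely within the located gap, so it cannot
	// overlap an existing segment.
	s.InsertWithoutMerging(gap, r, val)
	return r, true
}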
func (gap unwasteGapIterator) nextLargeEnoughGapHelper(minSize uint64) unwasteGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return unwasteGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. func (gap unwasteGapIterator) PrevLargeEnoughGap(minSize uint64) unwasteGapIterator { if unwastetrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap unwasteGapIterator) prevLargeEnoughGapHelper(minSize uint64) unwasteGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return unwasteGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func unwastesegmentBeforePosition(n *unwastenode, i int) unwasteIterator { for i == 0 { if n.parent == nil { return unwasteIterator{} } n, i = n.parent, n.parentIndex } return unwasteIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func unwastesegmentAfterPosition(n *unwastenode, i int) unwasteIterator { for i == n.nrSegments { if n.parent == nil { return unwasteIterator{} } n, i = n.parent, n.parentIndex } return unwasteIterator{n, i} } func unwastezeroValueSlice(slice []unwasteInfo) { for i := range slice { unwasteSetFunctions{}.ClearValue(&slice[i]) } } func unwastezeroNodeSlice(slice []*unwastenode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. 
func (s *unwasteSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. func (n *unwastenode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *unwastenode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if unwastetrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type unwasteFlatSegment struct { Start uint64 End uint64 Value unwasteInfo } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *unwasteSet) ExportSlice() []unwasteFlatSegment { var fs []unwasteFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, unwasteFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *unwasteSet) ImportSlice(fs []unwasteFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := __generics_imported0.FileRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
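// Illustrative sketch (not part of the generated template): ExportSlice and
// ImportSlice provide the flat, ascending-order snapshot used by
// saveRoot/loadRoot below. unwasteExampleClone is a hypothetical helper
// showing the round trip; it assumes, as for the other generated sets, that a
// zero-value unwasteSet is an empty set.
func unwasteExampleClone(src *unwasteSet) (*unwasteSet, error) {
	fs := src.ExportSlice()
	// ImportSlice requires an empty destination set and a sorted,
	// non-overlapping slice; ExportSlice guarantees the latter.
	dst := &unwasteSet{}
	if err := dst.ImportSlice(fs); err != nil {
		return nil, err
	}
	return dst, nil
}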
func (s *unwasteSet) segmentTestCheck(expectedSegments int, segFunc func(int, __generics_imported0.FileRange, unwasteInfo) error) error { havePrev := false prev := uint64(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *unwasteSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *unwasteSet) saveRoot() []unwasteFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *unwasteSet) loadRoot(_ context.Context, fs []unwasteFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/000077500000000000000000000000001465435605700225745ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/context.go000066400000000000000000000021161465435605700246070ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package platform import ( "gvisor.dev/gvisor/pkg/context" ) // contextID is the auth package's type for context.Context.Value keys. type contextID int const ( // CtxPlatform is a Context.Value key for a Platform. CtxPlatform contextID = iota ) // FromContext returns the Platform that is used to execute ctx's application // code, or nil if no such Platform exists. func FromContext(ctx context.Context) Platform { if v := ctx.Value(CtxPlatform); v != nil { return v.(Platform) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/cpuid_amd64.go000066400000000000000000000040721465435605700252250ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
//go:build amd64 // +build amd64 package platform import ( "bytes" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/usermem" ) // taskWrapper wraps a context.Context. type taskWrapper struct { context.Context } // emulationContext is used for emulation. // // It wraps an existing context but prioritizes resolution via context.NoTask, // since the task state should not be modified during emulation. However, we // allow logging and other operations to be directed to the correct task. type emulationContext struct { taskWrapper context.NoTask } // TryCPUIDEmulate checks for a CPUID instruction and performs emulation. func TryCPUIDEmulate(ctx context.Context, mm MemoryManager, ac *arch.Context64) bool { s := ac.StateData() inst := make([]byte, len(arch.CPUIDInstruction)) tasklessCtx := emulationContext{ taskWrapper: taskWrapper{ctx}, } if _, err := mm.CopyIn(&tasklessCtx, hostarch.Addr(s.Regs.Rip), inst, usermem.IOOpts{ IgnorePermissions: true, AddressSpaceActive: true, }); err != nil { return false } if !bytes.Equal(inst, arch.CPUIDInstruction[:]) { return false } fs := cpuid.FromContext(ctx) out := fs.Function.Query(cpuid.In{ Eax: uint32(s.Regs.Rax), Ecx: uint32(s.Regs.Rcx), }) s.Regs.Rax = uint64(out.Eax) s.Regs.Rbx = uint64(out.Ebx) s.Regs.Rcx = uint64(out.Ecx) s.Regs.Rdx = uint64(out.Edx) s.Regs.Rip += uint64(len(inst)) return true } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/cpuid_arm64.go000066400000000000000000000015761465435605700252510ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package platform import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" ) // TryCPUIDEmulate always returns false: there is no cpuid. func TryCPUIDEmulate(ctx context.Context, mm MemoryManager, ac *arch.Context64) bool { return false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/interrupt/000077500000000000000000000000001465435605700246305ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/interrupt/interrupt.go000066400000000000000000000051311465435605700272130ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package interrupt provides an interrupt helper. 
package interrupt import ( "fmt" "gvisor.dev/gvisor/pkg/sync" ) // Receiver receives interrupt notifications from a Forwarder. type Receiver interface { // NotifyInterrupt is called when the Receiver receives an interrupt. NotifyInterrupt() } // Forwarder is a helper for delivering delayed signal interruptions. // // This helps platform implementations with Interrupt semantics. type Forwarder struct { // mu protects the below. mu sync.Mutex // dst is the function to be called when NotifyInterrupt() is called. If // dst is nil, pending will be set instead, causing the next call to // Enable() to return false. dst Receiver pending bool } // Enable attempts to enable interrupt forwarding to r. If f has already // received an interrupt, Enable does nothing and returns false. Otherwise, // future calls to f.NotifyInterrupt() cause r.NotifyInterrupt() to be called, // and Enable returns true. // // Usage: // // if !f.Enable(r) { // // There was an interrupt. // return // } // // defer f.Disable() // // Preconditions: // - r must not be nil. // - f must not already be forwarding interrupts to a Receiver. func (f *Forwarder) Enable(r Receiver) bool { if r == nil { panic("nil Receiver") } f.mu.Lock() if f.dst != nil { f.mu.Unlock() panic(fmt.Sprintf("already forwarding interrupts to %+v", f.dst)) } if f.pending { f.pending = false f.mu.Unlock() return false } f.dst = r f.mu.Unlock() return true } // Disable stops interrupt forwarding. If interrupt forwarding is already // disabled, Disable is a no-op. func (f *Forwarder) Disable() { f.mu.Lock() f.dst = nil f.mu.Unlock() } // NotifyInterrupt implements Receiver.NotifyInterrupt. If interrupt forwarding // is enabled, the configured Receiver will be notified. Otherwise the // interrupt will be delivered to the next call to Enable. func (f *Forwarder) NotifyInterrupt() { f.mu.Lock() if f.dst != nil { f.dst.NotifyInterrupt() } else { f.pending = true } f.mu.Unlock() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/interrupt/interrupt_state_autogen.go000066400000000000000000000000731465435605700321350ustar00rootroot00000000000000// automatically generated by stateify. package interrupt golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/000077500000000000000000000000001465435605700233715ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/address_space.go000066400000000000000000000156071465435605700265310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" ) // dirtySet tracks vCPUs for invalidation. type dirtySet struct { vCPUMasks []atomicbitops.Uint64 } // forEach iterates over all CPUs in the dirty set. 
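// Illustrative sketch (not part of the source): a platform-level context can
// register itself as an interrupt.Receiver around a blocking guest entry, as
// the Usage comment on Forwarder.Enable above describes. The exampleContext
// type, runWithInterrupts, and the run callback are hypothetical, and the
// snippet assumes the enclosing package imports
// gvisor.dev/gvisor/pkg/sentry/platform/interrupt.
type exampleContext struct {
	// interrupted is signaled (without blocking) from NotifyInterrupt.
	interrupted chan struct{}
}

// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
func (c *exampleContext) NotifyInterrupt() {
	select {
	case c.interrupted <- struct{}{}:
	default:
	}
}

// runWithInterrupts forwards interrupts to c for the duration of run.
func runWithInterrupts(f *interrupt.Forwarder, c *exampleContext, run func()) {
	// Enable returns false if an interrupt arrived while no Receiver was
	// registered; in that case the guest should not be entered at all.
	if !f.Enable(c) {
		return
	}
	defer f.Disable()
	run()
}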
// //go:nosplit func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) { for index := range ds.vCPUMasks { mask := ds.vCPUMasks[index].Swap(0) if mask != 0 { for bit := 0; bit < 64; bit++ { if mask&(1< 0 { physical, length, ok := translateToPhysical(m.addr) if !ok { panic("unable to translate segment") } if length > m.length { length = m.length } // Ensure that this map has physical mappings. If the page does // not have physical mappings, the KVM module may inject // spurious exceptions when emulation fails (i.e. it tries to // emulate because the RIP is pointed at those pages). as.machine.mapPhysical(physical, length, physicalRegions) // Install the page table mappings. Note that the ordering is // important; if the pagetable mappings were installed before // ensuring the physical pages were available, then some other // thread could theoretically access them. inv = as.pageTables.Map(addr, length, pagetables.MapOpts{ AccessType: at, User: true, }, physical) || inv m.addr += length m.length -= length addr += hostarch.Addr(length) } return inv } // MapFile implements platform.AddressSpace.MapFile. func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error { as.mu.Lock() defer as.mu.Unlock() // Get mappings in the sentry's address space, which are guaranteed to be // valid as long as a reference is held on the mapped pages (which is in // turn required by AddressSpace.MapFile precondition). // // If precommit is true, we will touch mappings to commit them, so ensure // that mappings are readable from sentry context. // // We don't execute from application file-mapped memory, and guest page // tables don't care if we have execute permission (but they do need pages // to be readable). bs, err := f.MapInternal(fr, hostarch.AccessType{ Read: at.Read || at.Execute || precommit, Write: at.Write, }) if err != nil { return err } // See block in mapLocked. as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu) // Map the mappings in the sentry's address space (guest physical memory) // into the application's address space (guest virtual memory). inv := false for !bs.IsEmpty() { b := bs.Head() bs = bs.Tail() // Since fr was page-aligned, b should also be page-aligned. We do the // lookup in our host page tables for this translation. if precommit { s := b.ToSlice() for i := 0; i < len(s); i += hostarch.PageSize { _ = s[i] // Touch to commit. } } // See bluepill_allocator.go. bluepill(as.pageTables.Allocator.(*allocator).cpu) // Perform the mapping. prev := as.mapLocked(addr, hostMapEntry{ addr: b.Addr(), length: uintptr(b.Len()), }, at) inv = inv || prev addr += hostarch.Addr(b.Len()) } if inv { as.invalidate() } return nil } // unmapLocked is an escape-checked wrapped around Unmap. // // +checkescape:hard,stack // //go:nosplit func (as *addressSpace) unmapLocked(addr hostarch.Addr, length uint64) bool { return as.pageTables.Unmap(addr, uintptr(length)) } // Unmap unmaps the given range by calling pagetables.PageTables.Unmap. func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) { as.mu.Lock() defer as.mu.Unlock() // See above & bluepill_allocator.go. as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu) bluepill(as.pageTables.Allocator.(*allocator).cpu) if prev := as.unmapLocked(addr, length); prev { // Invalidate all active vCPUs. 
as.invalidate() // Recycle any freed intermediate pages. as.pageTables.Allocator.Recycle() } } // Release releases the page tables. func (as *addressSpace) Release() { as.Unmap(0, ^uint64(0)) // Free all pages from the allocator. as.pageTables.Allocator.(*allocator).base.Drain() // Drop all cached machine references. as.machine.dropPageTables(as.pageTables) } // PreFork implements platform.AddressSpace.PreFork. func (as *addressSpace) PreFork() {} // PostFork implements platform.AddressSpace.PostFork. func (as *addressSpace) PostFork() {} golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/address_space_amd64.go000066400000000000000000000016441465435605700275200ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm // invalidate is the implementation for Invalidate. func (as *addressSpace) invalidate() { timer := asInvalidateDuration.Start() as.dirtySet.forEach(as.machine, func(c *vCPU) { if c.active.get() == as { // If this happens to be active, c.BounceToKernel() // ... force a kernel transition. } }) timer.Finish() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/address_space_arm64.go000066400000000000000000000014631465435605700275350ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm import ( "gvisor.dev/gvisor/pkg/ring0" ) // invalidate is the implementation for Invalidate. func (as *addressSpace) invalidate() { bluepill(as.pageTables.Allocator.(*allocator).cpu) ring0.FlushTlbAll() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/atomicptr_machine_unsafe.go000066400000000000000000000023251465435605700307510ustar00rootroot00000000000000package kvm import ( "context" "sync/atomic" "unsafe" ) // An AtomicPtr is a pointer to a value of type Value that can be atomically // loaded and stored. The zero value of an AtomicPtr represents nil. // // Note that copying AtomicPtr by value performs a non-atomic read of the // stored pointer, which is unsafe if Store() can be called concurrently; in // this case, do `dst.Store(src.Load())` instead. // // +stateify savable type machineAtomicPtr struct { ptr unsafe.Pointer `state:".(*machine)"` } func (p *machineAtomicPtr) savePtr() *machine { return p.Load() } func (p *machineAtomicPtr) loadPtr(_ context.Context, v *machine) { p.Store(v) } // Load returns the value set by the most recent Store. It returns nil if there // has been no previous call to Store. 
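// Illustrative sketch (not part of the source): as the AtomicPtr comment
// above warns, copying a machineAtomicPtr by value reads ptr non-atomically,
// so a hand-off between two pointers should go through Load and Store.
// transferMachinePtr is a hypothetical helper.
func transferMachinePtr(dst, src *machineAtomicPtr) {
	// Load and Store are individually atomic; the pair is not one atomic
	// operation, which is sufficient for a plain hand-off.
	dst.Store(src.Load())
}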
// //go:nosplit func (p *machineAtomicPtr) Load() *machine { return (*machine)(atomic.LoadPointer(&p.ptr)) } // Store sets the value returned by Load to x. func (p *machineAtomicPtr) Store(x *machine) { atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) } // Swap atomically stores `x` into *p and returns the previous *p value. func (p *machineAtomicPtr) Swap(x *machine) *machine { return (*machine)(atomic.SwapPointer(&p.ptr, (unsafe.Pointer)(x))) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/bluepill.go000066400000000000000000000062511465435605700255340ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sighandling" ) // bluepill enters guest mode. func bluepill(*vCPU) // sighandler is the signal entry point. func sighandler() // dieTrampoline is the assembly trampoline. This calls dieHandler. // // This uses an architecture-specific calling convention, documented in // dieArchSetup and the assembly implementation for dieTrampoline. func dieTrampoline() // Return the start address of the functions above. // // In Go 1.17+, Go references to assembly functions resolve to an ABIInternal // wrapper function rather than the function itself. We must reference from // assembly to get the ABI0 (i.e., primary) address. func addrOfSighandler() uintptr func addrOfDieTrampoline() uintptr var ( // bounceSignal is the signal used for bouncing KVM. // // We use SIGCHLD because it is not masked by the runtime, and // it will be ignored properly by other parts of the kernel. bounceSignal = unix.SIGCHLD // bounceSignalMask has only bounceSignal set. bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1)) // bounce is the interrupt vector used to return to the kernel. bounce = uint32(ring0.VirtualizationException) // savedHandler is a pointer to the previous handler. // // This is called by bluepillHandler. savedHandler uintptr // savedSigsysHandler is a pointer to the previous handler of the SIGSYS signals. savedSigsysHandler uintptr // dieTrampolineAddr is the address of dieTrampoline. dieTrampolineAddr uintptr ) // _SYS_KVM_RETURN_TO_HOST is the system call that is used to transition // to host. const _SYS_KVM_RETURN_TO_HOST = ^uintptr(0) // redpill invokes a syscall with -1. // //go:nosplit func redpill() { unix.RawSyscall(_SYS_KVM_RETURN_TO_HOST, 0, 0, 0) } // dieHandler is called by dieTrampoline. // //go:nosplit func dieHandler(c *vCPU) { throw(c.dieState.message) } // die is called to set the vCPU up to panic. // // This loads vCPU state, and sets up a call for the trampoline. // //go:nosplit func (c *vCPU) die(context *arch.SignalContext64, msg string) { // Save the death message, which will be thrown. c.dieState.message = msg // Setup the trampoline. dieArchSetup(c, context, &c.dieState.guestRegs) } func init() { // Install the handler. 
if err := sighandling.ReplaceSignalHandler(bluepillSignal, addrOfSighandler(), &savedHandler); err != nil { panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) } // Extract the address for the trampoline. dieTrampolineAddr = addrOfDieTrampoline() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/bluepill_allocator.go000066400000000000000000000051261465435605700275740ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm import ( "fmt" "gvisor.dev/gvisor/pkg/ring0/pagetables" ) type allocator struct { base pagetables.RuntimeAllocator // cpu must be set prior to any pagetable operation. // // Due to the way KVM's shadow paging implementation works, // modifications to the page tables while in host mode may not be // trapped, leading to the shadow pages being out of sync. Therefore, // we need to ensure that we are in guest mode for page table // modifications. See the call to bluepill, below. cpu *vCPU } // newAllocator is used to define the allocator. func newAllocator() *allocator { a := new(allocator) a.base.Init() return a } // NewPTEs implements pagetables.Allocator.NewPTEs. // // +checkescape:all // //go:nosplit func (a *allocator) NewPTEs() *pagetables.PTEs { ptes := a.base.NewPTEs() // escapes: bluepill below. if a.cpu != nil { bluepill(a.cpu) } return ptes } // PhysicalFor returns the physical address for a set of PTEs. // // +checkescape:all // //go:nosplit func (a *allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr { virtual := a.base.PhysicalFor(ptes) physical, _, ok := translateToPhysical(virtual) if !ok { panic(fmt.Sprintf("PhysicalFor failed for %p", ptes)) // escapes: panic. } return physical } // LookupPTEs implements pagetables.Allocator.LookupPTEs. // // +checkescape:all // //go:nosplit func (a *allocator) LookupPTEs(physical uintptr) *pagetables.PTEs { virtualStart, physicalStart, _, pr := calculateBluepillFault(physical, physicalRegions) if pr == nil { panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical)) // escapes: panic. } return a.base.LookupPTEs(virtualStart + (physical - physicalStart)) } // FreePTEs implements pagetables.Allocator.FreePTEs. // // +checkescape:all // //go:nosplit func (a *allocator) FreePTEs(ptes *pagetables.PTEs) { a.base.FreePTEs(ptes) // escapes: bluepill below. if a.cpu != nil { bluepill(a.cpu) } } // Recycle implements pagetables.Allocator.Recycle. // //go:nosplit func (a *allocator) Recycle() { a.base.Recycle() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/bluepill_amd64.go000066400000000000000000000121151465435605700265230ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package kvm import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" ) var ( // The action for bluepillSignal is changed by sigaction(). bluepillSignal = unix.SIGSEGV ) // bluepillArchEnter is called during bluepillEnter. // //go:nosplit func bluepillArchEnter(context *arch.SignalContext64) *vCPU { c := vCPUPtr(uintptr(context.Rax)) regs := c.CPU.Registers() regs.R8 = context.R8 regs.R9 = context.R9 regs.R10 = context.R10 regs.R11 = context.R11 regs.R12 = context.R12 regs.R13 = context.R13 regs.R14 = context.R14 regs.R15 = context.R15 regs.Rdi = context.Rdi regs.Rsi = context.Rsi regs.Rbp = context.Rbp regs.Rbx = context.Rbx regs.Rdx = context.Rdx regs.Rax = context.Rax regs.Rcx = context.Rcx regs.Rsp = context.Rsp regs.Rip = context.Rip regs.Eflags = context.Eflags regs.Eflags &^= uint64(ring0.KernelFlagsClear) regs.Eflags |= ring0.KernelFlagsSet regs.Cs = uint64(ring0.Kcode) regs.Ds = uint64(ring0.Udata) regs.Es = uint64(ring0.Udata) regs.Ss = uint64(ring0.Kdata) return c } // hltSanityCheck verifies the current state to detect obvious corruption. // //go:nosplit func (c *vCPU) hltSanityCheck() { vector := c.CPU.Vector() switch ring0.Vector(vector) { case ring0.PageFault: if c.CPU.FaultAddr() < ring0.KernelStartAddress { return } case ring0.DoubleFault: case ring0.GeneralProtectionFault: case ring0.InvalidOpcode: case ring0.MachineCheck: case ring0.VirtualizationException: default: return } printHex([]byte("Vector = "), uint64(c.CPU.Vector())) printHex([]byte("FaultAddr = "), uint64(c.CPU.FaultAddr())) printHex([]byte("rip = "), uint64(c.CPU.Registers().Rip)) printHex([]byte("rsp = "), uint64(c.CPU.Registers().Rsp)) throw("fault") } // KernelSyscall handles kernel syscalls. // // +checkescape:all // //go:nosplit func (c *vCPU) KernelSyscall() { regs := c.Registers() if regs.Rax != ^uint64(0) { regs.Rip -= 2 // Rewind. } // N.B. Since KernelSyscall is called when the kernel makes a syscall, // FS_BASE is already set for correct execution of this function. // // Refresher on syscall/exception handling: // 1. When the sentry is in guest mode and makes a syscall, it goes to // sysenter(), which saves the register state (including RIP of SYSCALL // instruction) to vCPU.registers. // 2. It then calls KernelSyscall, which rewinds the IP and executes // HLT. // 3. HLT does a VM-exit to bluepillHandler, which returns from the // signal handler using vCPU.registers, directly to the SYSCALL // instruction. // 4. Later, when we want to re-use the vCPU (perhaps on a different // host thread), we set the new thread's registers in vCPU.registers // (as opposed to setting the KVM registers with KVM_SET_REGS). // 5. KVM_RUN thus enters the guest with the old register state, // immediately following the HLT instruction, returning here. // 6. We then restore FS_BASE and the full registers from vCPU.register // to return from sysenter() back to the desired bluepill point from // the host. ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment. } // KernelException handles kernel exceptions. 
// // +checkescape:all // //go:nosplit func (c *vCPU) KernelException(vector ring0.Vector) { regs := c.Registers() if vector == ring0.Vector(bounce) { // This go-routine was saved in hr3 and resumed in gr0 with the // userspace flags. Let's adjust flags and skip the interrupt. regs.Eflags &^= uint64(ring0.KernelFlagsClear) regs.Eflags |= ring0.KernelFlagsSet return } // See above. ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment. } // bluepillArchExit is called during bluepillEnter. // //go:nosplit func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { regs := c.CPU.Registers() context.R8 = regs.R8 context.R9 = regs.R9 context.R10 = regs.R10 context.R11 = regs.R11 context.R12 = regs.R12 context.R13 = regs.R13 context.R14 = regs.R14 context.R15 = regs.R15 context.Rdi = regs.Rdi context.Rsi = regs.Rsi context.Rbp = regs.Rbp context.Rbx = regs.Rbx context.Rdx = regs.Rdx context.Rax = regs.Rax context.Rcx = regs.Rcx context.Rsp = regs.Rsp context.Rip = regs.Rip context.Eflags = regs.Eflags c.FloatingPointState().PrepForHostSigframe() // Set the context pointer to the saved floating point state. This is // where the guest data has been serialized, the kernel will restore // from this new pointer value. context.Fpstate = uint64(uintptrValue(c.FloatingPointState().BytePointer())) // escapes: no. } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/bluepill_amd64.s000066400000000000000000000071711465435605700263660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" // VCPU_CPU is the location of the CPU in the vCPU struct. // // This is guaranteed to be zero. #define VCPU_CPU 0x0 // ENTRY_CPU_SELF is the location of the CPU in the entry struct. // // This is sourced from ring0. #define ENTRY_CPU_SELF 272 // +checkoffset ring0 kernelEntry.cpuSelf // Context offsets. // // Only limited use of the context is done in the assembly stub below, most is // done in the Go handlers. However, the RIP must be examined. #define CONTEXT_RAX 0x90 #define CONTEXT_RIP 0xa8 #define CONTEXT_FP 0xe0 // CLI is the literal byte for the disable interrupts instruction. // // This is checked as the source of the fault. #define CLI $0xfa // System call definitions. #define SYS_MMAP 9 // See bluepill.go. TEXT ·bluepill(SB),NOSPLIT|NOFRAME,$0 begin: MOVQ arg+0(FP), AX LEAQ VCPU_CPU(AX), BX // The gorountine stack will be changed in guest which renders // the frame pointer outdated and misleads perf tools. // Disconnect the frame-chain with the zeroed frame pointer // when it is saved in the frame in bluepillHandler(). MOVQ BP, CX MOVQ $0, BP BYTE CLI; MOVQ CX, BP check_vcpu: MOVQ ENTRY_CPU_SELF(GS), CX CMPQ BX, CX JE right_vCPU wrong_vcpu: CALL ·redpill(SB) JMP begin right_vCPU: RET // sighandler: see bluepill.go for documentation. // // The arguments are the following: // // DI - The signal number. // SI - Pointer to siginfo_t structure. // DX - Pointer to ucontext structure. 
// TEXT ·sighandler(SB),NOSPLIT|NOFRAME,$0 // Check if the signal is from the kernel. MOVQ $0x80, CX CMPL CX, 0x8(SI) JNE fallback // Check if RIP is disable interrupts. MOVQ CONTEXT_RIP(DX), CX CMPQ CX, $0x0 JE fallback CMPB 0(CX), CLI JNE fallback // Call the bluepillHandler. PUSHQ DX // First argument (context). CALL ·bluepillHandler(SB) // Call the handler. POPQ DX // Discard the argument. RET fallback: // Jump to the previous signal handler. XORQ CX, CX MOVQ ·savedHandler(SB), AX JMP AX // func addrOfSighandler() uintptr TEXT ·addrOfSighandler(SB), $0-8 MOVQ $·sighandler(SB), AX MOVQ AX, ret+0(FP) RET TEXT ·sigsysHandler(SB),NOSPLIT|NOFRAME,$0 // Check if the signal is from the kernel. MOVQ $1, CX CMPL CX, 0x8(SI) JNE fallback MOVL CONTEXT_RAX(DX), CX CMPL CX, $SYS_MMAP JNE fallback PUSHQ DX // First argument (context). CALL ·seccompMmapHandler(SB) // Call the handler. POPQ DX // Discard the argument. RET fallback: // Jump to the previous signal handler. XORQ CX, CX MOVQ ·savedSigsysHandler(SB), AX JMP AX // func addrOfSighandler() uintptr TEXT ·addrOfSigsysHandler(SB), $0-8 MOVQ $·sigsysHandler(SB), AX MOVQ AX, ret+0(FP) RET // dieTrampoline: see bluepill.go, bluepill_amd64_unsafe.go for documentation. TEXT ·dieTrampoline(SB),NOSPLIT|NOFRAME,$0 PUSHQ BX // First argument (vCPU). PUSHQ AX // Fake the old RIP as caller. JMP ·dieHandler(SB) // func addrOfDieTrampoline() uintptr TEXT ·addrOfDieTrampoline(SB), $0-8 MOVQ $·dieTrampoline(SB), AX MOVQ AX, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go000066400000000000000000000077461465435605700301020ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package kvm import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" ) // dieArchSetup initializes the state for dieTrampoline. // // The amd64 dieTrampoline requires the vCPU to be set in BX, and the last RIP // to be in AX. The trampoline then simulates a call to dieHandler from the // provided RIP. // //go:nosplit func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) { // Reload all registers to have an accurate stack trace when we return // to host mode. This means that the stack should be unwound correctly. if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 { throw(c.dieState.message) } // If the vCPU is in user mode, we set the stack to the stored stack // value in the vCPU itself. We don't want to unwind the user stack. if guestRegs.RFLAGS&ring0.UserFlagsSet == ring0.UserFlagsSet { regs := c.CPU.Registers() context.Rax = regs.Rax context.Rsp = regs.Rsp context.Rbp = regs.Rbp } else { context.Rax = guestRegs.RIP context.Rsp = guestRegs.RSP context.Rbp = guestRegs.RBP context.Eflags = guestRegs.RFLAGS } context.Rbx = uint64(uintptr(unsafe.Pointer(c))) context.Rip = uint64(dieTrampolineAddr) } // getHypercallID returns hypercall ID. 
// //go:nosplit func getHypercallID(addr uintptr) int { return _KVM_HYPERCALL_MAX } // bluepillStopGuest is responsible for injecting interrupt. // //go:nosplit func bluepillStopGuest(c *vCPU) { // Interrupt: we must have requested an interrupt // window; set the interrupt line. if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), KVM_INTERRUPT, uintptr(unsafe.Pointer(&bounce))); errno != 0 { throw("interrupt injection failed") } // Clear previous injection request. c.runData.requestInterruptWindow = 0 } // bluepillSigBus is responsible for injecting NMI to trigger sigbus. // //go:nosplit func bluepillSigBus(c *vCPU) { if _, _, errno := unix.RawSyscall( // escapes: no. unix.SYS_IOCTL, uintptr(c.fd), KVM_NMI, 0); errno != 0 { throw("NMI injection failed") } } // bluepillHandleEnosys is responsible for handling enosys error. // //go:nosplit func bluepillHandleEnosys(c *vCPU) { throw("run failed: ENOSYS") } // bluepillReadyStopGuest checks whether the current vCPU is ready for interrupt injection. // //go:nosplit func bluepillReadyStopGuest(c *vCPU) bool { if c.runData.readyForInterruptInjection == 0 { return false } if c.runData.ifFlag == 0 { // This is impossible if readyForInterruptInjection is 1. throw("interrupts are disabled") } // Disable interrupts if we are in the kernel space. // // When the Sentry switches into the kernel mode, it disables // interrupts. But when goruntime switches on a goroutine which has // been saved in the host mode, it restores flags and this enables // interrupts. See the comment of UserFlagsSet for more details. uregs := userRegs{} err := c.getUserRegisters(&uregs) if err != 0 { throw("failed to get user registers") } if ring0.IsKernelFlags(uregs.RFLAGS) { uregs.RFLAGS &^= ring0.KernelFlagsClear err = c.setUserRegisters(&uregs) if err != 0 { throw("failed to set user registers") } return false } return true } // bluepillArchHandleExit checks architecture specific exitcode. // //go:nosplit func bluepillArchHandleExit(c *vCPU, context unsafe.Pointer) { c.die(bluepillArchContext(context), "unknown") } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/bluepill_arm64.go000066400000000000000000000063201465435605700265420ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package kvm import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" ) var ( // The action for bluepillSignal is changed by sigaction(). bluepillSignal = unix.SIGILL ) // getTLS returns the value of TPIDR_EL0 register. // //go:nosplit func getTLS() (value uint64) // setTLS writes the TPIDR_EL0 value. // //go:nosplit func setTLS(value uint64) // bluepillArchEnter is called during bluepillEnter. 
// //go:nosplit func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) { c = vCPUPtr(uintptr(context.Regs[8])) regs := c.CPU.Registers() regs.Regs = context.Regs regs.Sp = context.Sp regs.Pc = context.Pc regs.Pstate = context.Pstate regs.Pstate &^= uint64(ring0.PsrFlagsClear) regs.Pstate |= ring0.KernelFlagsSet regs.TPIDR_EL0 = getTLS() return } // bluepillArchExit is called during bluepillEnter. // //go:nosplit func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { regs := c.CPU.Registers() context.Regs = regs.Regs context.Sp = regs.Sp context.Pc = regs.Pc context.Pstate = regs.Pstate context.Pstate &^= uint64(ring0.PsrFlagsClear) context.Pstate |= ring0.UserFlagsSet setTLS(regs.TPIDR_EL0) lazyVfp := c.GetLazyVFP() if lazyVfp != 0 { fpsimd := fpsimdPtr(c.FloatingPointState().BytePointer()) // escapes: no context.Fpsimd64.Fpsr = fpsimd.Fpsr context.Fpsimd64.Fpcr = fpsimd.Fpcr context.Fpsimd64.Vregs = fpsimd.Vregs } } // KernelSyscall handles kernel syscalls. // // +checkescape:all // //go:nosplit func (c *vCPU) KernelSyscall() { regs := c.Registers() if regs.Regs[8] != ^uint64(0) { regs.Pc -= 4 // Rewind. } fpDisableTrap := ring0.CPACREL1() if fpDisableTrap != 0 { fpsimd := fpsimdPtr(c.FloatingPointState().BytePointer()) // escapes: no fpcr := ring0.GetFPCR() fpsr := ring0.GetFPSR() fpsimd.Fpcr = uint32(fpcr) fpsimd.Fpsr = uint32(fpsr) ring0.SaveVRegs(c.FloatingPointState().BytePointer()) // escapes: no } ring0.Halt() } // KernelException handles kernel exceptions. // // +checkescape:all // //go:nosplit func (c *vCPU) KernelException(vector ring0.Vector) { regs := c.Registers() if vector == ring0.Vector(bounce) { regs.Pc = 0 } fpDisableTrap := ring0.CPACREL1() if fpDisableTrap != 0 { fpsimd := fpsimdPtr(c.FloatingPointState().BytePointer()) // escapes: no fpcr := ring0.GetFPCR() fpsr := ring0.GetFPSR() fpsimd.Fpcr = uint32(fpcr) fpsimd.Fpsr = uint32(fpsr) ring0.SaveVRegs(c.FloatingPointState().BytePointer()) // escapes: no } ring0.Halt() } // hltSanityCheck verifies the current state to detect obvious corruption. // //go:nosplit func (c *vCPU) hltSanityCheck() { } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/bluepill_arm64.s000066400000000000000000000067071465435605700264100ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" // VCPU_CPU is the location of the CPU in the vCPU struct. // // This is guaranteed to be zero. #define VCPU_CPU 0x0 // CPU_SELF is the self reference in ring0's percpu. // // This is guaranteed to be zero. #define CPU_SELF 0x0 // Context offsets. // // Only limited use of the context is done in the assembly stub below, most is // done in the Go handlers. #define SIGINFO_SIGNO 0x0 #define SIGINFO_CODE 0x8 #define CONTEXT_PC 0x1B8 #define CONTEXT_R0 0xB8 #define SYS_MMAP 222 // getTLS returns the value of TPIDR_EL0 register. TEXT ·getTLS(SB),NOSPLIT,$0-8 MRS TPIDR_EL0, R1 MOVD R1, value+0(FP) RET // setTLS writes the TPIDR_EL0 value. 
TEXT ·setTLS(SB),NOSPLIT,$0-8 MOVD value+0(FP), R1 MSR R1, TPIDR_EL0 RET // See bluepill.go. TEXT ·bluepill(SB),NOSPLIT,$0 begin: MOVD arg+0(FP), R8 MOVD $VCPU_CPU(R8), R9 ORR $0xffff000000000000, R9, R9 // Trigger sigill. // In ring0.Start(), the value of R8 will be stored into tpidr_el1. // When the context was loaded into vcpu successfully, // we will check if the value of R10 and R9 are the same. WORD $0xd538d08a // MRS TPIDR_EL1, R10 check_vcpu: CMP R10, R9 BEQ right_vCPU wrong_vcpu: CALL ·redpill(SB) B begin right_vCPU: RET // sighandler: see bluepill.go for documentation. // // The arguments are the following: // // R0 - The signal number. // R1 - Pointer to siginfo_t structure. // R2 - Pointer to ucontext structure. // TEXT ·sighandler(SB),NOSPLIT,$0 // si_signo should be sigill. MOVD SIGINFO_SIGNO(R1), R7 CMPW $4, R7 BNE fallback MOVD CONTEXT_PC(R2), R7 CMPW $0, R7 BEQ fallback MOVD R2, 8(RSP) BL ·bluepillHandler(SB) // Call the handler. RET fallback: // Jump to the previous signal handler. MOVD ·savedHandler(SB), R7 B (R7) // func addrOfSighandler() uintptr TEXT ·addrOfSighandler(SB), $0-8 MOVD $·sighandler(SB), R0 MOVD R0, ret+0(FP) RET // The arguments are the following: // // R0 - The signal number. // R1 - Pointer to siginfo_t structure. // R2 - Pointer to ucontext structure. // TEXT ·sigsysHandler(SB),NOSPLIT,$0 // si_code should be SYS_SECCOMP. MOVD SIGINFO_CODE(R1), R7 CMPW $1, R7 BNE fallback CMPW $SYS_MMAP, R8 BNE fallback MOVD R2, 8(RSP) BL ·seccompMmapHandler(SB) // Call the handler. RET fallback: // Jump to the previous signal handler. MOVD ·savedHandler(SB), R7 B (R7) // func addrOfSighandler() uintptr TEXT ·addrOfSigsysHandler(SB), $0-8 MOVD $·sigsysHandler(SB), R0 MOVD R0, ret+0(FP) RET // dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation. TEXT ·dieTrampoline(SB),NOSPLIT,$0 // R0: Fake the old PC as caller // R1: First argument (vCPU) MOVD.P R1, 8(RSP) // R1: First argument (vCPU) MOVD.P R0, 8(RSP) // R0: Fake the old PC as caller B ·dieHandler(SB) // func addrOfDieTrampoline() uintptr TEXT ·addrOfDieTrampoline(SB), $0-8 MOVD $·dieTrampoline(SB), R0 MOVD R0, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go000066400000000000000000000110111465435605700300740ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package kvm import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" ) // fpsimdPtr returns a fpsimd64 for the given address. // //go:nosplit func fpsimdPtr(addr *byte) *arch.FpsimdContext { return (*arch.FpsimdContext)(unsafe.Pointer(addr)) } // dieArchSetup initializes the state for dieTrampoline. // // The arm64 dieTrampoline requires the vCPU to be set in R1, and the last PC // to be in R0. The trampoline then simulates a call to dieHandler from the // provided PC. 
// //go:nosplit func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) { // If the vCPU is in user mode, we set the stack to the stored stack // value in the vCPU itself. We don't want to unwind the user stack. if guestRegs.Regs.Pstate&ring0.PsrModeMask == ring0.UserFlagsSet { regs := c.CPU.Registers() context.Regs[0] = regs.Regs[0] context.Sp = regs.Sp context.Regs[29] = regs.Regs[29] // stack base address } else { context.Regs[0] = guestRegs.Regs.Pc context.Sp = guestRegs.Regs.Sp context.Regs[29] = guestRegs.Regs.Regs[29] context.Pstate = guestRegs.Regs.Pstate } context.Regs[1] = uint64(uintptr(unsafe.Pointer(c))) context.Pc = uint64(dieTrampolineAddr) } // bluepillArchFpContext returns the arch-specific fpsimd context. // //go:nosplit func bluepillArchFpContext(context unsafe.Pointer) *arch.FpsimdContext { return &((*arch.SignalContext64)(context).Fpsimd64) } // getHypercallID returns hypercall ID. // // On Arm64, the MMIO address should be 64-bit aligned. // //go:nosplit func getHypercallID(addr uintptr) int { if addr < arm64HypercallMMIOBase || addr >= (arm64HypercallMMIOBase+_AARCH64_HYPERCALL_MMIO_SIZE) { return _KVM_HYPERCALL_MAX } else { return int(((addr) - arm64HypercallMMIOBase) >> 3) } } // bluepillStopGuest is responsible for injecting sError. // //go:nosplit func bluepillStopGuest(c *vCPU) { // vcpuSErrBounce is the event of system error for bouncing KVM. vcpuSErrBounce := &kvmVcpuEvents{ exception: exception{ sErrPending: 1, }, } if _, _, errno := unix.RawSyscall( // escapes: no. unix.SYS_IOCTL, uintptr(c.fd), KVM_SET_VCPU_EVENTS, uintptr(unsafe.Pointer(vcpuSErrBounce))); errno != 0 { throw("bounce sErr injection failed") } } // bluepillSigBus is responsible for injecting sError to trigger sigbus. // //go:nosplit func bluepillSigBus(c *vCPU) { // vcpuSErrNMI is the event of system error to trigger sigbus. vcpuSErrNMI := &kvmVcpuEvents{ exception: exception{ sErrPending: 1, sErrHasEsr: 1, sErrEsr: _ESR_ELx_SERR_NMI, }, } // Host must support ARM64_HAS_RAS_EXTN. if _, _, errno := unix.RawSyscall( // escapes: no. unix.SYS_IOCTL, uintptr(c.fd), KVM_SET_VCPU_EVENTS, uintptr(unsafe.Pointer(vcpuSErrNMI))); errno != 0 { if errno == unix.EINVAL { throw("No ARM64_HAS_RAS_EXTN feature in host.") } throw("nmi sErr injection failed") } } // bluepillExtDabt is responsible for injecting external data abort. // //go:nosplit func bluepillExtDabt(c *vCPU) { // vcpuExtDabt is the event of ext_dabt. vcpuExtDabt := &kvmVcpuEvents{ exception: exception{ extDabtPending: 1, }, } if _, _, errno := unix.RawSyscall( // escapes: no. unix.SYS_IOCTL, uintptr(c.fd), KVM_SET_VCPU_EVENTS, uintptr(unsafe.Pointer(vcpuExtDabt))); errno != 0 { throw("ext_dabt injection failed") } } // bluepillHandleEnosys is responsible for handling enosys error. // //go:nosplit func bluepillHandleEnosys(c *vCPU) { bluepillExtDabt(c) } // bluepillReadyStopGuest checks whether the current vCPU is ready for sError injection. // //go:nosplit func bluepillReadyStopGuest(c *vCPU) bool { return true } // bluepillArchHandleExit checks architecture specific exitcode. // //go:nosplit func bluepillArchHandleExit(c *vCPU, context unsafe.Pointer) { switch c.runData.exitReason { case _KVM_EXIT_ARM_NISV: bluepillExtDabt(c) default: c.die(bluepillArchContext(context), "unknown") } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/bluepill_fault.go000066400000000000000000000102461465435605700267260ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
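// Illustrative sketch (standalone): on arm64 the sentry leaves the guest via
// an MMIO access, and getHypercallID recovers the hypercall number from the
// faulting address by subtracting the 8-byte-aligned MMIO base and shifting
// right by 3. The same arithmetic, with an illustrative base address:
package main

import "fmt"

const (
	hypercallCellSize = 1 << 3 // one 8-byte cell per hypercall
	hypercallMax      = 1      // only hypercall 0 (_KVM_HYPERCALL_VMEXIT) exists
)

// hypercallID returns hypercallMax for addresses outside the window,
// mirroring getHypercallID.
func hypercallID(addr, base uintptr) int {
	if addr < base || addr >= base+hypercallMax*hypercallCellSize {
		return hypercallMax
	}
	return int((addr - base) >> 3)
}

func main() {
	base := uintptr(0xffff000000) // illustrative MMIO base
	fmt.Println(hypercallID(base, base))    // 0: dispatch _KVM_HYPERCALL_VMEXIT
	fmt.Println(hypercallID(base+64, base)) // 1: not a hypercall address
}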
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm import ( "sync/atomic" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/hostarch" ) const ( // faultBlockSize is the size used for servicing memory faults. // // This should be large enough to avoid frequent faults and avoid using // all available KVM slots (~512), but small enough that KVM does not // complain about slot sizes (~4GB). See handleBluepillFault for how // this block is used. faultBlockSize = 2 << 30 // faultBlockMask is the mask for the fault blocks. // // This must be typed to avoid overflow complaints (ugh). faultBlockMask = ^uintptr(faultBlockSize - 1) ) // yield yields the CPU. // //go:nosplit func yield() { unix.RawSyscall(unix.SYS_SCHED_YIELD, 0, 0, 0) } // calculateBluepillFault calculates the fault address range. // //go:nosplit func calculateBluepillFault(physical uintptr, phyRegions []physicalRegion) (virtualStart, physicalStart, length uintptr, pr *physicalRegion) { alignedPhysical := physical &^ uintptr(hostarch.PageSize-1) for i, pr := range phyRegions { end := pr.physical + pr.length if physical < pr.physical || physical >= end { continue } // Adjust the block to match our size. physicalStart = pr.physical + (alignedPhysical-pr.physical)&faultBlockMask virtualStart = pr.virtual + (physicalStart - pr.physical) physicalEnd := physicalStart + faultBlockSize if physicalEnd > end { physicalEnd = end } length = physicalEnd - physicalStart return virtualStart, physicalStart, length, &phyRegions[i] } return 0, 0, 0, nil } // handleBluepillFault handles a physical fault. // // The corresponding virtual address is returned. This may throw on error. // //go:nosplit func handleBluepillFault(m *machine, physical uintptr, phyRegions []physicalRegion) (uintptr, bool) { // Paging fault: we need to map the underlying physical pages for this // fault. This all has to be done in this function because we're in a // signal handler context. (We can't call any functions that might // split the stack.) virtualStart, physicalStart, length, pr := calculateBluepillFault(physical, phyRegions) if pr == nil { return 0, false } // Set the KVM slot. // // First, we need to acquire the exclusive right to set a slot. See // machine.nextSlot for information about the protocol. slot := m.nextSlot.Swap(^uint32(0)) for slot == ^uint32(0) { yield() // Race with another call. slot = m.nextSlot.Swap(^uint32(0)) } flags := _KVM_MEM_FLAGS_NONE if pr.readOnly { flags |= _KVM_MEM_READONLY } errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart, flags) if errno == 0 { // Store the physical address in the slot. This is used to // avoid calls to handleBluepillFault in the future (see // machine.mapPhysical). atomic.StoreUintptr(&m.usedSlots[slot], physicalStart) // Successfully added region; we can increment nextSlot and // allow another set to proceed here. m.nextSlot.Store(slot + 1) return virtualStart + (physical - physicalStart), true } // Release our slot (still available). 
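// Illustrative sketch (standalone): calculateBluepillFault rounds a faulting
// physical address down to a 2GiB block within its physical region, so one
// KVM memory slot serves many later faults. The alignment arithmetic on its
// own, with illustrative addresses:
package main

import "fmt"

const (
	pageSize       = 4096
	faultBlockSize = 2 << 30
	faultBlockMask = ^uintptr(faultBlockSize - 1)
)

// blockStart returns the block-aligned physical start covering physical,
// given a region that begins at regionPhys.
func blockStart(physical, regionPhys uintptr) uintptr {
	alignedPhysical := physical &^ uintptr(pageSize-1)
	return regionPhys + (alignedPhysical-regionPhys)&faultBlockMask
}

func main() {
	// A fault just past 3GiB, in a region starting at 1GiB, is served by the
	// 2GiB block that begins at 3GiB.
	fmt.Printf("%#x\n", blockStart(0xC0001000, 1<<30)) // 0xc0000000
}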
m.nextSlot.Store(slot) switch errno { case unix.EEXIST: // The region already exists. It's possible that we raced with // another vCPU here. We just revert nextSlot and return true, // because this must have been satisfied by some other vCPU. return virtualStart + (physical - physicalStart), true case unix.EINVAL: throw("set memory region failed; out of slots") case unix.ENOMEM: throw("set memory region failed: out of memory") case unix.EFAULT: throw("set memory region failed: invalid physical range") default: throw("set memory region failed: unknown reason") } panic("unreachable") } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/bluepill_unsafe.go000066400000000000000000000152341465435605700270760ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build go1.18 // +build go1.18 // //go:linkname directives type-checked by checklinkname. Any other // non-linkname assumptions outside the Go 1 compatibility guarantee should // have an accompanied vet check or version guard build tag. package kvm import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/arch" ) //go:linkname throw runtime.throw func throw(s string) // vCPUPtr returns a CPU for the given address. // //go:nosplit func vCPUPtr(addr uintptr) *vCPU { return (*vCPU)(unsafe.Pointer(addr)) } // bytePtr returns a bytePtr for the given address. // //go:nosplit func bytePtr(addr uintptr) *byte { return (*byte)(unsafe.Pointer(addr)) } // uintptrValue returns a uintptr for the given address. // //go:nosplit func uintptrValue(addr *byte) uintptr { return (uintptr)(unsafe.Pointer(addr)) } // bluepillArchContext returns the UContext64. // //go:nosplit func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { return &((*arch.UContext64)(context).MContext) } // bluepillHandleHlt is responsible for handling VM-Exit. // //go:nosplit func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { // Increment our counter. c.guestExits.Add(1) // Copy out registers. bluepillArchExit(c, bluepillArchContext(context)) // Return to the vCPUReady state; notify any waiters. user := c.state.Load() & vCPUUser switch c.state.Swap(user) { case user | vCPUGuest: // Expected case. case user | vCPUGuest | vCPUWaiter: c.notify() default: throw("invalid state") } } var hexSyms = []byte("0123456789abcdef") //go:nosplit func printHex(title []byte, val uint64) { var str [18]byte for i := 0; i < 16; i++ { str[16-i] = hexSyms[val&0xf] val = val >> 4 } str[0] = ' ' str[17] = '\n' unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&title[0])), uintptr(len(title))) unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&str)), 18) } // bluepillHandler is called from the signal stub. // // The world may be stopped while this is executing, and it executes on the // signal stack. It should only execute raw system calls and functions that are // explicitly marked go:nosplit. 
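// Illustrative sketch (standalone): nextSlot doubles as a lock. A vCPU claims
// the right to program a memory slot by swapping in ^uint32(0); anyone else
// who sees that sentinel yields and retries until the winner stores slot+1
// (or restores the old value on failure). A model of that protocol:
package main

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
)

var nextSlot atomic.Uint32

// claimSlot spins (yielding) until it owns the slot counter.
func claimSlot() uint32 {
	slot := nextSlot.Swap(^uint32(0))
	for slot == ^uint32(0) {
		runtime.Gosched() // stands in for sched_yield in yield()
		slot = nextSlot.Swap(^uint32(0))
	}
	return slot
}

// releaseSlot publishes the next free slot, letting other claimants proceed.
func releaseSlot(next uint32) { nextSlot.Store(next) }

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			s := claimSlot()
			releaseSlot(s + 1) // as if KVM_SET_USER_MEMORY_REGION succeeded
		}()
	}
	wg.Wait()
	fmt.Println("slots used:", nextSlot.Load()) // 4
}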
// // Ideally, this function should switch to gsignal, as runtime.sigtramp does, // but that is tedious given all the runtime internals. That said, using // gsignal inside a signal handler is not _required_, provided we avoid stack // splits and allocations. Note that calling any splittable function here will // be flaky; if the signal stack is below the G stack then we will trigger a // split and crash. If above, we won't trigger a split. // // +checkescape:all // //go:nosplit func bluepillHandler(context unsafe.Pointer) { // Sanitize the registers; interrupts must always be disabled. c := bluepillArchEnter(bluepillArchContext(context)) // Mark this as guest mode. switch c.state.Swap(vCPUGuest | vCPUUser) { case vCPUUser: // Expected case. case vCPUUser | vCPUWaiter: c.notify() default: throw("invalid state") } for { hostExitCounter.Increment() _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(c.fd), KVM_RUN, 0) // escapes: no. switch errno { case 0: // Expected case. case unix.EINTR: interruptCounter.Increment() // First, we process whatever pending signal // interrupted KVM. Since we're in a signal handler // currently, all signals are masked and the signal // must have been delivered directly to this thread. timeout := unix.Timespec{} sig, _, errno := unix.RawSyscall6( // escapes: no. unix.SYS_RT_SIGTIMEDWAIT, uintptr(unsafe.Pointer(&bounceSignalMask)), 0, // siginfo. uintptr(unsafe.Pointer(&timeout)), // timeout. 8, // sigset size. 0, 0) if errno == unix.EAGAIN { continue } if errno != 0 { throw("error waiting for pending signal") } if sig != uintptr(bounceSignal) { throw("unexpected signal") } // Check whether the current state of the vCPU is ready // for interrupt injection. Because we don't have a // PIC, we can't inject an interrupt while they are // masked. We need to request a window if it's not // ready. if bluepillReadyStopGuest(c) { // Force injection below; the vCPU is ready. c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN } else { c.runData.requestInterruptWindow = 1 continue // Rerun vCPU. } case unix.EFAULT: // If a fault is not serviceable due to the host // backing pages having page permissions, instead of an // MMIO exit we receive EFAULT from the run ioctl. We // always inject an NMI here since we may be in kernel // mode and have interrupts disabled. bluepillSigBus(c) continue // Rerun vCPU. case unix.ENOSYS: bluepillHandleEnosys(c) continue default: throw("run failed") } switch c.runData.exitReason { case _KVM_EXIT_EXCEPTION: c.die(bluepillArchContext(context), "exception") return case _KVM_EXIT_IO: c.die(bluepillArchContext(context), "I/O") return case _KVM_EXIT_INTERNAL_ERROR: // An internal error is typically thrown when emulation // fails. This can occur via the MMIO path below (and // it might fail because we have multiple regions that // are not mapped). We would actually prefer that no // emulation occur, and don't mind at all if it fails. 
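// Illustrative sketch (standalone): with the nosplit/signal-stack constraints
// stripped away, the heart of bluepillHandler is "ioctl(KVM_RUN), then
// dispatch on the exit reason, rerunning on EINTR". The constants mirror
// kvm_const.go; the fd and exit-reason plumbing here is simplified.
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

const (
	kvmRun       = 0xae80 // KVM_RUN
	exitHLT      = 0x5    // _KVM_EXIT_HLT
	exitMMIO     = 0x6    // _KVM_EXIT_MMIO
	exitShutdown = 0x8    // _KVM_EXIT_SHUTDOWN
)

// runOnce issues KVM_RUN until it returns without EINTR, then reports the
// exit reason read from the caller's mmapped kvm_run area.
func runOnce(vcpuFD int, exitReason *uint32) (uint32, error) {
	for {
		_, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(vcpuFD), kvmRun, 0)
		switch errno {
		case 0:
			return *exitReason, nil
		case unix.EINTR:
			continue // a signal interrupted KVM; rerun the vCPU
		default:
			return 0, fmt.Errorf("KVM_RUN failed: %w", errno)
		}
	}
}

func main() {
	// With a real vCPU fd the result would be one of the exit codes above.
	var reason uint32
	if _, err := runOnce(-1, &reason); err != nil {
		fmt.Println(err)
	}
}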
case _KVM_EXIT_HYPERCALL: c.die(bluepillArchContext(context), "hypercall") return case _KVM_EXIT_DEBUG: c.die(bluepillArchContext(context), "debug") return case _KVM_EXIT_HLT: c.hltSanityCheck() bluepillGuestExit(c, context) return case _KVM_EXIT_MMIO: physical := uintptr(c.runData.data[0]) if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT { bluepillGuestExit(c, context) return } c.die(bluepillArchContext(context), "exit_mmio") return case _KVM_EXIT_IRQ_WINDOW_OPEN: bluepillStopGuest(c) case _KVM_EXIT_SHUTDOWN: c.die(bluepillArchContext(context), "shutdown") return case _KVM_EXIT_FAIL_ENTRY: c.die(bluepillArchContext(context), "entry failed") return default: bluepillArchHandleExit(c, context) return } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/context.go000066400000000000000000000101521465435605700254030ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm import ( "gvisor.dev/gvisor/pkg/abi/linux" pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" ) // platformContext is an implementation of the platform context. // // This is a thin wrapper around the machine. type platformContext struct { // machine is the parent machine, and is immutable. machine *machine // info is the linux.SignalInfo cached for this platformContext. info linux.SignalInfo // interrupt is the interrupt platformContext. interrupt interrupt.Forwarder } // tryCPUIDError indicates that CPUID emulation should occur. type tryCPUIDError struct{} // Error implements error.Error. func (tryCPUIDError) Error() string { return "cpuid emulation failed" } // Switch runs the provided platformContext in the given address space. func (c *platformContext) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac *arch.Context64, _ int32) (*linux.SignalInfo, hostarch.AccessType, error) { as := mm.AddressSpace() localAS := as.(*addressSpace) restart: // Grab a vCPU. cpu := c.machine.Get() // Enable interrupts (i.e. calls to vCPU.Notify). if !c.interrupt.Enable(cpu) { c.machine.Put(cpu) // Already preempted. return nil, hostarch.NoAccess, platform.ErrContextInterrupt } // Set the active address space. // // This must be done prior to the call to Touch below. If the address // space is invalidated between this line and the call below, we will // flag on entry anyways. When the active address space below is // cleared, it indicates that we don't need an explicit interrupt and // that the flush can occur naturally on the next user entry. cpu.active.set(localAS) // Prepare switch options. switchOpts := ring0.SwitchOpts{ Registers: &ac.StateData().Regs, FloatingPointState: ac.FloatingPointData(), PageTables: localAS.pageTables, Flush: localAS.Touch(cpu), FullRestore: ac.FullRestore(), } // Take the blue pill. 
at, err := cpu.SwitchToUser(switchOpts, &c.info) // Clear the address space. cpu.active.set(nil) // Increment the number of user exits. cpu.userExits.Add(1) userExitCounter.Increment() // Release resources. c.machine.Put(cpu) // All done. c.interrupt.Disable() if err != nil { if _, ok := err.(tryCPUIDError); ok { // Does emulation work for the CPUID? // // We have to put the current vCPU, because // TryCPUIDEmulate needs to read a user memory and it // has to lock mm.activeMu for that, but it can race // with as.invalidate that bonce all vcpu-s to gr0 and // is called under mm.activeMu too. if platform.TryCPUIDEmulate(ctx, mm, ac) { goto restart } // If not a valid CPUID, then the signal should be // delivered as is and the information is filled. err = platform.ErrContextSignal } } return &c.info, at, err } // Interrupt interrupts the running context. func (c *platformContext) Interrupt() { c.interrupt.NotifyInterrupt() } // Release implements platform.Context.Release(). func (c *platformContext) Release() {} // FullStateChanged implements platform.Context.FullStateChanged. func (c *platformContext) FullStateChanged() {} // PullFullState implements platform.Context.PullFullState. func (c *platformContext) PullFullState(as platform.AddressSpace, ac *arch.Context64) error { return nil } // PrepareSleep implements platform.Context.platform.Context. func (*platformContext) PrepareSleep() {} golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/filters.go000066400000000000000000000037651465435605700254030ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/platform" ) // SeccompInfo returns seccomp information for the KVM platform. func (k *KVM) SeccompInfo() platform.SeccompInfo { return platform.StaticSeccompInfo{ PlatformName: "kvm", Filters: k.archSyscallFilters().Merge(seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_IOCTL: seccomp.Or{ seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(KVM_RUN), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(KVM_SET_USER_MEMORY_REGION), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(KVM_GET_REGS), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(KVM_SET_REGS), }, }, unix.SYS_MEMBARRIER: seccomp.PerArg{ seccomp.EqualTo(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED), seccomp.EqualTo(0), }, unix.SYS_MMAP: seccomp.MatchAll{}, unix.SYS_RT_SIGSUSPEND: seccomp.MatchAll{}, unix.SYS_RT_SIGTIMEDWAIT: seccomp.MatchAll{}, _SYS_KVM_RETURN_TO_HOST: seccomp.MatchAll{}, })), HotSyscalls: hottestSyscalls(), } } // PrecompiledSeccompInfo implements // platform.Constructor.PrecompiledSeccompInfo. 
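// Illustrative sketch (standalone, independent of pkg/seccomp): each PerArg
// clause in SeccompInfo reads as "this syscall is allowed only when argument
// i satisfies check i" (equal to a constant, or a non-negative fd). A toy
// evaluator for that rule shape, to make the semantics concrete:
package main

import "fmt"

type argCheck func(uint64) bool

func equalTo(v uint64) argCheck { return func(a uint64) bool { return a == v } }

func nonNegativeFD() argCheck { return func(a uint64) bool { return int32(a) >= 0 } }

// perArg allows a call only when every check passes for its argument.
func perArg(checks ...argCheck) func([]uint64) bool {
	return func(args []uint64) bool {
		for i, c := range checks {
			if i >= len(args) || !c(args[i]) {
				return false
			}
		}
		return true
	}
}

func main() {
	const kvmRun = 0xae80
	ioctlRule := perArg(nonNegativeFD(), equalTo(kvmRun))
	fmt.Println(ioctlRule([]uint64{3, kvmRun})) // true: ioctl(vcpuFD, KVM_RUN)
	fmt.Println(ioctlRule([]uint64{3, 0x1234})) // false: any other request
}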
func (*constructor) PrecompiledSeccompInfo() []platform.SeccompInfo { return []platform.SeccompInfo{(*KVM)(nil).SeccompInfo()} } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/filters_amd64.go000066400000000000000000000031031465435605700263600ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" ) // archSyscallFilters returns arch-specific syscalls made exclusively by the // KVM platform. func (k *KVM) archSyscallFilters() seccomp.SyscallRules { return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_ARCH_PRCTL: seccomp.Or{ seccomp.PerArg{ seccomp.EqualTo(linux.ARCH_GET_FS), }, seccomp.PerArg{ seccomp.EqualTo(linux.ARCH_GET_GS), }, }, unix.SYS_IOCTL: seccomp.Or{ seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(KVM_INTERRUPT), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(KVM_NMI), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(KVM_GET_REGS), }, }, }) } // hottestSyscalls returns the list of hot syscalls for the KVM platform. func hottestSyscalls() []uintptr { return []uintptr{ unix.SYS_FUTEX, unix.SYS_IOCTL, unix.SYS_RT_SIGRETURN, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/filters_arm64.go000066400000000000000000000022211465435605700263760ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package kvm import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/seccomp" ) // archSyscallFilters returns arch-specific syscalls made exclusively by the // KVM platform. func (*KVM) archSyscallFilters() seccomp.SyscallRules { return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_IOCTL: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(KVM_SET_VCPU_EVENTS), }, }) } // hottestSyscalls returns the list of hot syscalls for the KVM platform. func hottestSyscalls() []uintptr { return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm.go000066400000000000000000000123461465435605700245230ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package kvm provides a kvm-based implementation of the platform interface. package kvm import ( "fmt" "golang.org/x/sys/unix" pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" ) // userMemoryRegion is a region of physical memory. // // This mirrors kvm_memory_region. type userMemoryRegion struct { slot uint32 flags uint32 guestPhysAddr uint64 memorySize uint64 userspaceAddr uint64 } // runData is the run structure. This may be mapped for synchronous register // access (although that doesn't appear to be supported by my kernel at least). // // This mirrors kvm_run. type runData struct { requestInterruptWindow uint8 _ [7]uint8 exitReason uint32 readyForInterruptInjection uint8 ifFlag uint8 _ [2]uint8 cr8 uint64 apicBase uint64 // This is the union data for exits. Interpretation depends entirely on // the exitReason above (see vCPU code for more information). data [32]uint64 } // KVM represents a lightweight VM context. type KVM struct { platform.NoCPUPreemptionDetection // KVM never changes mm_structs. platform.UseHostProcessMemoryBarrier platform.DoesOwnPageTables // machine is the backing VM. machine *machine } var ( globalOnce sync.Once globalErr error ) // OpenDevice opens the KVM device and returns the File. // If the devicePath is empty, it will default to /dev/kvm. func OpenDevice(devicePath string) (*fd.FD, error) { if devicePath == "" { devicePath = "/dev/kvm" } f, err := fd.Open(devicePath, unix.O_RDWR, 0) if err != nil { return nil, fmt.Errorf("error opening KVM device file (%s): %v", devicePath, err) } return f, nil } // New returns a new KVM-based implementation of the platform interface. func New(deviceFile *fd.FD) (*KVM, error) { fd := deviceFile.FD() // Ensure global initialization is done. globalOnce.Do(func() { globalErr = updateGlobalOnce(int(fd)) }) if globalErr != nil { return nil, globalErr } // Create a new VM fd. var ( vm uintptr errno unix.Errno ) for { vm, _, errno = unix.Syscall(unix.SYS_IOCTL, uintptr(fd), KVM_CREATE_VM, 0) if errno == unix.EINTR { continue } if errno != 0 { return nil, fmt.Errorf("creating VM: %v", errno) } break } // We are done with the device file. deviceFile.Close() // Create a VM context. machine, err := newMachine(int(vm)) if err != nil { return nil, err } // All set. return &KVM{ machine: machine, }, nil } // SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. func (*KVM) SupportsAddressSpaceIO() bool { return false } // CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace. func (*KVM) CooperativelySchedulesAddressSpace() bool { return false } // MapUnit implements platform.Platform.MapUnit. func (*KVM) MapUnit() uint64 { // We greedily creates PTEs in MapFile, so extremely large mappings can // be expensive. Not _that_ expensive since we allow super pages, but // even though can get out of hand if you're creating multi-terabyte // mappings. 
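// Illustrative sketch (standalone): New() reduces to a handful of ioctls on
// /dev/kvm: KVM_CREATE_VM for a VM fd, KVM_CREATE_VCPU for each vCPU, and an
// mmap of KVM_GET_VCPU_MMAP_SIZE bytes for the shared kvm_run area (runData).
// A minimal version of that sequence; constants mirror kvm_const.go and error
// handling is abbreviated.
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

const (
	kvmCreateVM        = 0xae01 // KVM_CREATE_VM
	kvmCreateVCPU      = 0xae41 // KVM_CREATE_VCPU
	kvmGetVCPUMmapSize = 0xae04 // KVM_GET_VCPU_MMAP_SIZE
)

func ioctl(fd int, req, arg uintptr) (uintptr, error) {
	r, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), req, arg)
	if errno != 0 {
		return 0, errno
	}
	return r, nil
}

func main() {
	kvmFD, err := unix.Open("/dev/kvm", unix.O_RDWR, 0)
	if err != nil {
		fmt.Println("open /dev/kvm:", err)
		return
	}
	defer unix.Close(kvmFD)

	vmFD, err := ioctl(kvmFD, kvmCreateVM, 0)
	if err != nil {
		fmt.Println("KVM_CREATE_VM:", err)
		return
	}
	vcpuFD, err := ioctl(int(vmFD), kvmCreateVCPU, 0)
	if err != nil {
		fmt.Println("KVM_CREATE_VCPU:", err)
		return
	}
	size, err := ioctl(kvmFD, kvmGetVCPUMmapSize, 0)
	if err != nil {
		fmt.Println("KVM_GET_VCPU_MMAP_SIZE:", err)
		return
	}
	run, err := unix.Mmap(int(vcpuFD), 0, int(size), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED)
	if err != nil {
		fmt.Println("mmap kvm_run:", err)
		return
	}
	fmt.Printf("vm fd %d, vcpu fd %d, kvm_run area %d bytes\n", vmFD, vcpuFD, len(run))
}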
For this reason, we limit mappings to an arbitrary 16MB. return 16 << 20 } // MinUserAddress returns the lowest available address. func (*KVM) MinUserAddress() hostarch.Addr { return hostarch.PageSize } // MaxUserAddress returns the first address that may not be used. func (*KVM) MaxUserAddress() hostarch.Addr { return hostarch.Addr(ring0.MaximumUserAddress) } // NewAddressSpace returns a new pagetable root. func (k *KVM) NewAddressSpace(any) (platform.AddressSpace, <-chan struct{}, error) { // Allocate page tables and install system mappings. pageTables := pagetables.NewWithUpper(newAllocator(), k.machine.upperSharedPageTables, ring0.KernelStartAddress) // Return the new address space. return &addressSpace{ machine: k.machine, pageTables: pageTables, dirtySet: k.machine.newDirtySet(), }, nil, nil } // NewContext returns an interruptible context. func (k *KVM) NewContext(pkgcontext.Context) platform.Context { return &platformContext{ machine: k.machine, } } type constructor struct{} func (*constructor) New(f *fd.FD) (platform.Platform, error) { return New(f) } func (*constructor) OpenDevice(devicePath string) (*fd.FD, error) { return OpenDevice(devicePath) } // Flags implements platform.Constructor.Flags(). func (*constructor) Requirements() platform.Requirements { return platform.Requirements{} } func init() { platform.Register("kvm", &constructor{}) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_amd64.go000066400000000000000000000134151465435605700255140ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package kvm import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" ) // userRegs represents KVM user registers. // // This mirrors kvm_regs. type userRegs struct { RAX uint64 RBX uint64 RCX uint64 RDX uint64 RSI uint64 RDI uint64 RSP uint64 RBP uint64 R8 uint64 R9 uint64 R10 uint64 R11 uint64 R12 uint64 R13 uint64 R14 uint64 R15 uint64 RIP uint64 RFLAGS uint64 } // systemRegs represents KVM system registers. // // This mirrors kvm_sregs. type systemRegs struct { CS segment DS segment ES segment FS segment GS segment SS segment TR segment LDT segment GDT descriptor IDT descriptor CR0 uint64 CR2 uint64 CR3 uint64 CR4 uint64 CR8 uint64 EFER uint64 apicBase uint64 interruptBitmap [(_KVM_NR_INTERRUPTS + 63) / 64]uint64 } // segment is the expanded form of a segment register. // // This mirrors kvm_segment. type segment struct { base uint64 limit uint32 selector uint16 typ uint8 present uint8 DPL uint8 DB uint8 S uint8 L uint8 G uint8 AVL uint8 unusable uint8 _ uint8 } // Clear clears the segment and marks it unusable. func (s *segment) Clear() { *s = segment{unusable: 1} } // selector is a segment selector. type selector uint16 // tobool is a simple helper. func tobool(x ring0.SegmentDescriptorFlags) uint8 { if x != 0 { return 1 } return 0 } // Load loads the segment described by d into the segment s. 
// // The argument sel is recorded as the segment selector index. func (s *segment) Load(d *ring0.SegmentDescriptor, sel ring0.Selector) { flag := d.Flags() if flag&ring0.SegmentDescriptorPresent == 0 { s.Clear() return } s.base = uint64(d.Base()) s.limit = d.Limit() s.typ = uint8((flag>>8)&0xF) | 1 s.S = tobool(flag & ring0.SegmentDescriptorSystem) s.DPL = uint8(d.DPL()) s.present = tobool(flag & ring0.SegmentDescriptorPresent) s.AVL = tobool(flag & ring0.SegmentDescriptorAVL) s.L = tobool(flag & ring0.SegmentDescriptorLong) s.DB = tobool(flag & ring0.SegmentDescriptorDB) s.G = tobool(flag & ring0.SegmentDescriptorG) if s.L != 0 { s.limit = 0xffffffff } s.unusable = 0 s.selector = uint16(sel) } // descriptor describes a region of physical memory. // // It corresponds to the pseudo-descriptor used in the x86 LGDT and LIDT // instructions, and mirrors kvm_dtable. type descriptor struct { base uint64 limit uint16 _ [3]uint16 } // modelControlRegister is an MSR entry. // // This mirrors kvm_msr_entry. type modelControlRegister struct { index uint32 _ uint32 data uint64 } // modelControlRegisers is a collection of MSRs. // // This mirrors kvm_msrs. type modelControlRegisters struct { nmsrs uint32 _ uint32 entries [16]modelControlRegister } // cpuidEntry is a single CPUID entry. // // This mirrors kvm_cpuid_entry2. type cpuidEntry struct { function uint32 index uint32 flags uint32 eax uint32 ebx uint32 ecx uint32 edx uint32 _ [3]uint32 } // cpuidEntries is a collection of CPUID entries. // // This mirrors kvm_cpuid2. type cpuidEntries struct { nr uint32 _ uint32 entries [_KVM_NR_CPUID_ENTRIES]cpuidEntry } // Query implements cpuid.Function.Query. func (c *cpuidEntries) Query(in cpuid.In) (out cpuid.Out) { for i := 0; i < int(c.nr); i++ { if c.entries[i].function == in.Eax && c.entries[i].index == in.Ecx { out.Eax = c.entries[i].eax out.Ebx = c.entries[i].ebx out.Ecx = c.entries[i].ecx out.Edx = c.entries[i].edx return } } return } // Set implements cpuid.ChangeableSet.Set. func (c *cpuidEntries) Set(in cpuid.In, out cpuid.Out) { i := 0 for ; i < int(c.nr); i++ { if c.entries[i].function == in.Eax && c.entries[i].index == in.Ecx { break } } if i == _KVM_NR_CPUID_ENTRIES { panic("exceeded KVM_NR_CPUID_ENTRIES") } c.entries[i].eax = out.Eax c.entries[i].ebx = out.Ebx c.entries[i].ecx = out.Ecx c.entries[i].edx = out.Edx if i == int(c.nr) { c.nr++ } } // updateGlobalOnce does global initialization. It has to be called only once. func updateGlobalOnce(fd int) error { fpu.InitHostState() bitsForScaling = getBitsForScaling() if err := updateSystemValues(int(fd)); err != nil { return err } fs := cpuid.FeatureSet{ Function: &cpuidSupported, } // Calculate whether guestPCID is supported. hasGuestPCID = fs.HasFeature(cpuid.X86FeaturePCID) // Create a static feature set from the KVM entries. Then, we // explicitly set OSXSAVE, since this does not come in the feature // entries, but can be provided when the relevant CR4 bit is set. s := &cpuidSupported if cpuid.HostFeatureSet().UseXsave() { cpuid.X86FeatureOSXSAVE.Set(s) } // Explicitly disable nested virtualization. Since we don't provide // any virtualization APIs, there is no need to enable this feature. cpuid.X86FeatureVMX.Unset(s) cpuid.X86FeatureSVM.Unset(s) ring0.Init(cpuid.FeatureSet{ Function: s, }) physicalInit() return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_amd64_state_autogen.go000066400000000000000000000001651465435605700304340ustar00rootroot00000000000000// automatically generated by stateify. 
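// Illustrative sketch (standalone): Query is a linear scan over the
// KVM-provided CPUID table keyed on (EAX function, ECX index), with unknown
// leaves reading as zero. The same lookup over a plain slice:
package main

import "fmt"

type cpuidIn struct{ eax, ecx uint32 }
type cpuidOut struct{ eax, ebx, ecx, edx uint32 }

type entry struct {
	function, index uint32
	out             cpuidOut
}

// query mirrors cpuidEntries.Query.
func query(entries []entry, in cpuidIn) cpuidOut {
	for _, e := range entries {
		if e.function == in.eax && e.index == in.ecx {
			return e.out
		}
	}
	return cpuidOut{} // unknown leaves are all zeroes
}

func main() {
	table := []entry{{function: 0x1, index: 0, out: cpuidOut{eax: 0x000806ea}}}
	fmt.Printf("%#x\n", query(table, cpuidIn{eax: 0x1}).eax) // 0x806ea
}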
//go:build amd64 && amd64 && amd64 // +build amd64,amd64,amd64 package kvm golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go000066400000000000000000000033241465435605700270530ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package kvm import ( "fmt" "unsafe" "golang.org/x/sys/unix" ) var ( runDataSize int hasGuestPCID bool cpuidSupported = cpuidEntries{nr: _KVM_NR_CPUID_ENTRIES} ) func updateSystemValues(fd int) error { // Extract the mmap size. sz, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(fd), KVM_GET_VCPU_MMAP_SIZE, 0) if errno != 0 { return fmt.Errorf("getting VCPU mmap size: %v", errno) } // Save the data. runDataSize = int(sz) // Must do the dance to figure out the number of entries. _, _, errno = unix.RawSyscall( unix.SYS_IOCTL, uintptr(fd), KVM_GET_SUPPORTED_CPUID, uintptr(unsafe.Pointer(&cpuidSupported))) if errno != 0 && errno != unix.ENOMEM { // Some other error occurred. return fmt.Errorf("getting supported CPUID: %v", errno) } // The number should now be correct. _, _, errno = unix.RawSyscall( unix.SYS_IOCTL, uintptr(fd), KVM_GET_SUPPORTED_CPUID, uintptr(unsafe.Pointer(&cpuidSupported))) if errno != 0 { // Didn't work with the right number. return fmt.Errorf("getting supported CPUID (2nd attempt): %v", errno) } // Success. return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_amd64_unsafe_state_autogen.go000066400000000000000000000001651465435605700317750ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 && amd64 && amd64 // +build amd64,amd64,amd64 package kvm golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_arm64.go000066400000000000000000000035461465435605700255360ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package kvm import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/arch" ) type kvmOneReg struct { id uint64 addr uint64 } // arm64HypercallMMIOBase is MMIO base address used to dispatch hypercalls. 
var arm64HypercallMMIOBase uintptr const KVM_NR_SPSR = 5 type userFpsimdState struct { vregs [64]uint64 fpsr uint32 fpcr uint32 reserved [2]uint32 } type userRegs struct { Regs arch.Registers sp_el1 uint64 elr_el1 uint64 spsr [KVM_NR_SPSR]uint64 fpRegs userFpsimdState } type exception struct { sErrPending uint8 sErrHasEsr uint8 extDabtPending uint8 pad [5]uint8 sErrEsr uint64 } type kvmVcpuEvents struct { exception rsvd [12]uint32 } // updateGlobalOnce does global initialization. It has to be called only once. func updateGlobalOnce(fd int) error { err := updateSystemValues(int(fd)) ring0.Init() physicalInit() // The linux.Task represents the possible largest task size, which the UserspaceSize shouldn't be larger than. if linux.TaskSize < ring0.UserspaceSize { return fmt.Errorf("gVisor doesn't support 3-level page tables on KVM platform. Try to recompile the kernel with CONFIG_ARM64_VA_BITS_48") } return err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_arm64_state_autogen.go000066400000000000000000000002041465435605700304440ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 && arm64 && arm64 && arm64 // +build arm64,arm64,arm64,arm64 package kvm golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go000066400000000000000000000020431465435605700270660ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package kvm import ( "fmt" "golang.org/x/sys/unix" ) var ( runDataSize int hasGuestPCID bool ) func updateSystemValues(fd int) error { // Extract the mmap size. sz, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(fd), KVM_GET_VCPU_MMAP_SIZE, 0) if errno != 0 { return fmt.Errorf("getting VCPU mmap size: %v", errno) } // Save the data. runDataSize = int(sz) hasGuestPCID = true // Success. return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_arm64_unsafe_state_autogen.go000066400000000000000000000001651465435605700320130ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 && arm64 && arm64 // +build arm64,arm64,arm64 package kvm golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_const.go000066400000000000000000000057071465435605700257340ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm // KVM ioctls. 
// // Only the ioctls we need in Go appear here; some additional ioctls are used // within the assembly stubs (KVM_INTERRUPT, etc.). const ( KVM_CREATE_VM = 0xae01 KVM_GET_VCPU_MMAP_SIZE = 0xae04 KVM_CREATE_VCPU = 0xae41 KVM_SET_TSS_ADDR = 0xae47 KVM_RUN = 0xae80 KVM_NMI = 0xae9a KVM_CHECK_EXTENSION = 0xae03 KVM_GET_TSC_KHZ = 0xaea3 KVM_SET_TSC_KHZ = 0xaea2 KVM_INTERRUPT = 0x4004ae86 KVM_SET_MSRS = 0x4008ae89 KVM_SET_USER_MEMORY_REGION = 0x4020ae46 KVM_SET_REGS = 0x4090ae82 KVM_SET_SREGS = 0x4138ae84 KVM_GET_MSRS = 0xc008ae88 KVM_GET_REGS = 0x8090ae81 KVM_GET_SREGS = 0x8138ae83 KVM_GET_SUPPORTED_CPUID = 0xc008ae05 KVM_SET_CPUID2 = 0x4008ae90 KVM_SET_SIGNAL_MASK = 0x4004ae8b KVM_GET_VCPU_EVENTS = 0x8040ae9f KVM_SET_VCPU_EVENTS = 0x4040aea0 KVM_SET_DEVICE_ATTR = 0x4018aee1 ) // KVM exit reasons. const ( _KVM_EXIT_EXCEPTION = 0x1 _KVM_EXIT_IO = 0x2 _KVM_EXIT_HYPERCALL = 0x3 _KVM_EXIT_DEBUG = 0x4 _KVM_EXIT_HLT = 0x5 _KVM_EXIT_MMIO = 0x6 _KVM_EXIT_IRQ_WINDOW_OPEN = 0x7 _KVM_EXIT_SHUTDOWN = 0x8 _KVM_EXIT_FAIL_ENTRY = 0x9 _KVM_EXIT_INTERNAL_ERROR = 0x11 _KVM_EXIT_SYSTEM_EVENT = 0x18 _KVM_EXIT_ARM_NISV = 0x1c ) // KVM capability options. const ( _KVM_CAP_MAX_MEMSLOTS = 0x0a _KVM_CAP_MAX_VCPUS = 0x42 _KVM_CAP_ARM_VM_IPA_SIZE = 0xa5 _KVM_CAP_VCPU_EVENTS = 0x29 _KVM_CAP_ARM_INJECT_SERROR_ESR = 0x9e _KVM_CAP_TSC_CONTROL = 0x3c ) // KVM limits. const ( _KVM_NR_MEMSLOTS = 0x100 _KVM_NR_VCPUS = 0xff _KVM_NR_INTERRUPTS = 0x100 _KVM_NR_CPUID_ENTRIES = 0x100 ) // KVM kvm_memory_region::flags. const ( _KVM_MEM_LOG_DIRTY_PAGES = uint32(1) << 0 _KVM_MEM_READONLY = uint32(1) << 1 _KVM_MEM_FLAGS_NONE = uint32(0) ) // KVM hypercall list. // // Canonical list of hypercalls supported. const ( // On amd64, it uses 'HLT' to leave the guest. // // Unlike amd64, arm64 can only uses mmio_exit/psci to leave the guest. // // _KVM_HYPERCALL_VMEXIT is only used on arm64 for now. _KVM_HYPERCALL_VMEXIT int = iota _KVM_HYPERCALL_MAX ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_const_amd64.go000066400000000000000000000012751465435605700267230ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm // KVM ioctls for amd64. const ( _KVM_VCPU_TSC_CTRL = 0x0 _KVM_VCPU_TSC_OFFSET = 0x0 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_const_arm64.go000066400000000000000000000123771465435605700267460ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
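// Illustrative sketch (standalone): the ioctl request values in kvm_const.go
// follow the standard Linux _IOC encoding: bits 0-7 are the command number,
// 8-15 the type ('\xae' for KVM), 16-29 the argument size, and 30-31 the
// direction. Decoding KVM_SET_USER_MEMORY_REGION (0x4020ae46) recovers
// "write, 0x20-byte argument, type 0xae, nr 0x46", which matches the 32-byte
// userMemoryRegion struct in kvm.go:
package main

import "fmt"

// decode splits an ioctl request into its _IOC fields.
func decode(req uint32) (dir, size, typ, nr uint32) {
	nr = req & 0xff
	typ = (req >> 8) & 0xff
	size = (req >> 16) & 0x3fff
	dir = req >> 30
	return
}

func main() {
	dir, size, typ, nr := decode(0x4020ae46) // KVM_SET_USER_MEMORY_REGION
	fmt.Printf("dir=%d size=%#x type=%#x nr=%#x\n", dir, size, typ, nr)
	// dir=1 size=0x20 type=0xae nr=0x46
}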
package kvm // KVM ioctls for Arm64. const ( _KVM_GET_ONE_REG = 0x4010aeab _KVM_SET_ONE_REG = 0x4010aeac _KVM_ARM_TARGET_GENERIC_V8 = 5 _KVM_ARM_PREFERRED_TARGET = 0x8020aeaf _KVM_ARM_VCPU_INIT = 0x4020aeae _KVM_ARM64_REGS_PSTATE = 0x6030000000100042 _KVM_ARM64_REGS_SP_EL1 = 0x6030000000100044 _KVM_ARM64_REGS_R0 = 0x6030000000100000 _KVM_ARM64_REGS_R1 = 0x6030000000100002 _KVM_ARM64_REGS_R2 = 0x6030000000100004 _KVM_ARM64_REGS_R3 = 0x6030000000100006 _KVM_ARM64_REGS_R8 = 0x6030000000100010 _KVM_ARM64_REGS_R18 = 0x6030000000100024 _KVM_ARM64_REGS_PC = 0x6030000000100040 _KVM_ARM64_REGS_MAIR_EL1 = 0x603000000013c510 _KVM_ARM64_REGS_TCR_EL1 = 0x603000000013c102 _KVM_ARM64_REGS_TTBR0_EL1 = 0x603000000013c100 _KVM_ARM64_REGS_TTBR1_EL1 = 0x603000000013c101 _KVM_ARM64_REGS_SCTLR_EL1 = 0x603000000013c080 _KVM_ARM64_REGS_CPACR_EL1 = 0x603000000013c082 _KVM_ARM64_REGS_VBAR_EL1 = 0x603000000013c600 _KVM_ARM64_REGS_TIMER_CNT = 0x603000000013df1a _KVM_ARM64_REGS_CNTFRQ_EL0 = 0x603000000013df00 _KVM_ARM64_REGS_MDSCR_EL1 = 0x6030000000138012 _KVM_ARM64_REGS_CNTKCTL_EL1 = 0x603000000013c708 _KVM_ARM64_REGS_TPIDR_EL1 = 0x603000000013c684 ) // Arm64: Architectural Feature Access Control Register EL1. const ( _FPEN_NOTRAP = 3 _FPEN_SHIFT = 20 ) // Arm64: System Control Register EL1. const ( _SCTLR_M = 1 << 0 _SCTLR_C = 1 << 2 _SCTLR_I = 1 << 12 _SCTLR_DZE = 1 << 14 _SCTLR_UCT = 1 << 15 _SCTLR_UCI = 1 << 26 _SCTLR_EL1_DEFAULT = _SCTLR_M | _SCTLR_C | _SCTLR_I | _SCTLR_UCT | _SCTLR_UCI | _SCTLR_DZE ) // Arm64: Counter-timer Kernel Control Register el1. const ( _CNTKCTL_EL0PCTEN = 1 << 0 _CNTKCTL_EL0VCTEN = 1 << 1 _CNTKCTL_EL1_DEFAULT = _CNTKCTL_EL0PCTEN | _CNTKCTL_EL0VCTEN ) // Arm64: Translation Control Register EL1. const ( _TCR_IPS_40BITS = 2 << 32 // PA=40 _TCR_IPS_48BITS = 5 << 32 // PA=48 _TCR_T0SZ_OFFSET = 0 _TCR_T1SZ_OFFSET = 16 _TCR_IRGN0_SHIFT = 8 _TCR_IRGN1_SHIFT = 24 _TCR_ORGN0_SHIFT = 10 _TCR_ORGN1_SHIFT = 26 _TCR_SH0_SHIFT = 12 _TCR_SH1_SHIFT = 28 _TCR_TG0_SHIFT = 14 _TCR_TG1_SHIFT = 30 _TCR_T0SZ_VA48 = 64 - 48 // VA=48 _TCR_T1SZ_VA48 = 64 - 48 // VA=48 _TCR_A1 = 1 << 22 _TCR_ASID16 = 1 << 36 _TCR_TBI0 = 1 << 37 _TCR_TXSZ_VA48 = (_TCR_T0SZ_VA48 << _TCR_T0SZ_OFFSET) | (_TCR_T1SZ_VA48 << _TCR_T1SZ_OFFSET) _TCR_TG0_4K = 0 << _TCR_TG0_SHIFT // 4K _TCR_TG0_64K = 1 << _TCR_TG0_SHIFT // 64K _TCR_TG1_4K = 2 << _TCR_TG1_SHIFT _TCR_TG_FLAGS = _TCR_TG0_4K | _TCR_TG1_4K _TCR_IRGN0_WBWA = 1 << _TCR_IRGN0_SHIFT _TCR_IRGN1_WBWA = 1 << _TCR_IRGN1_SHIFT _TCR_IRGN_WBWA = _TCR_IRGN0_WBWA | _TCR_IRGN1_WBWA _TCR_ORGN0_WBWA = 1 << _TCR_ORGN0_SHIFT _TCR_ORGN1_WBWA = 1 << _TCR_ORGN1_SHIFT _TCR_ORGN_WBWA = _TCR_ORGN0_WBWA | _TCR_ORGN1_WBWA _TCR_SHARED = (3 << _TCR_SH0_SHIFT) | (3 << _TCR_SH1_SHIFT) _TCR_CACHE_FLAGS = _TCR_IRGN_WBWA | _TCR_ORGN_WBWA ) // Arm64: Memory Attribute Indirection Register EL1. 
const ( _MT_DEVICE_nGnRnE = 0 _MT_DEVICE_nGnRE = 1 _MT_DEVICE_GRE = 2 _MT_NORMAL_NC = 3 _MT_NORMAL = 4 _MT_NORMAL_WT = 5 _MT_ATTR_DEVICE_nGnRnE = 0x00 _MT_ATTR_DEVICE_nGnRE = 0x04 _MT_ATTR_DEVICE_GRE = 0x0c _MT_ATTR_NORMAL_NC = 0x44 _MT_ATTR_NORMAL_WT = 0xbb _MT_ATTR_NORMAL = 0xff _MT_ATTR_MASK = 0xff _MT_EL1_INIT = (_MT_ATTR_DEVICE_nGnRnE << (_MT_DEVICE_nGnRnE * 8)) | (_MT_ATTR_DEVICE_nGnRE << (_MT_DEVICE_nGnRE * 8)) | (_MT_ATTR_DEVICE_GRE << (_MT_DEVICE_GRE * 8)) | (_MT_ATTR_NORMAL_NC << (_MT_NORMAL_NC * 8)) | (_MT_ATTR_NORMAL << (_MT_NORMAL * 8)) | (_MT_ATTR_NORMAL_WT << (_MT_NORMAL_WT * 8)) ) const ( _KVM_ARM_VCPU_POWER_OFF = 0 // CPU is started in OFF state _KVM_ARM_VCPU_PSCI_0_2 = 2 // CPU uses PSCI v0.2 ) // Arm64: Exception Syndrome Register EL1. const ( _ESR_ELx_EC_SHIFT = 26 _ESR_ELx_EC_MASK = 0x3F << _ESR_ELx_EC_SHIFT _ESR_ELx_EC_IMP_DEF = 0x1f _ESR_ELx_EC_IABT_LOW = 0x20 _ESR_ELx_EC_IABT_CUR = 0x21 _ESR_ELx_EC_PC_ALIGN = 0x22 _ESR_ELx_CM = 1 << 8 _ESR_ELx_WNR = 1 << 6 _ESR_ELx_FSC = 0x3F _ESR_SEGV_MAPERR_L0 = 0x4 _ESR_SEGV_MAPERR_L1 = 0x5 _ESR_SEGV_MAPERR_L2 = 0x6 _ESR_SEGV_MAPERR_L3 = 0x7 _ESR_SEGV_ACCERR_L1 = 0x9 _ESR_SEGV_ACCERR_L2 = 0xa _ESR_SEGV_ACCERR_L3 = 0xb _ESR_SEGV_PEMERR_L1 = 0xd _ESR_SEGV_PEMERR_L2 = 0xe _ESR_SEGV_PEMERR_L3 = 0xf // Custom ISS field definitions for system error. _ESR_ELx_SERR_NMI = 0x1 ) // Arm64: MMIO base address used to dispatch hypercalls. const ( // on Arm64, the MMIO address must be 64-bit aligned. // Currently, we only need 1 hypercall: hypercall_vmexit. _AARCH64_HYPERCALL_MMIO_SIZE = 1 << 3 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_profiling.go000066400000000000000000000016661465435605700265770ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build kvm_profiling // +build kvm_profiling package kvm import ( "gvisor.dev/gvisor/pkg/metric" ) // KVMProfiling is a builder that produces conditionally compiled metrics. // Metrics made from this are compiled and active at runtime when the // "kvm_profiling" go-tag is specified at compilation. var KVMProfiling = metric.RealMetricBuilder{} golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_profiling_fake.go000066400000000000000000000016701465435605700275600ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
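// Illustrative sketch (standalone): the _ESR_ELx_* constants carve the arm64
// Exception Syndrome Register into an exception class (bits 26-31) and a
// fault status code (bits 0-5); the KVM platform uses these to turn guest
// faults into signals. Extracting both fields from a sample ESR:
package main

import "fmt"

const (
	esrECShift = 26 // _ESR_ELx_EC_SHIFT
	esrECMask  = 0x3f << esrECShift
	esrFSCMask = 0x3f // _ESR_ELx_FSC
	ecIABTLow  = 0x20 // _ESR_ELx_EC_IABT_LOW: instruction abort from a lower EL
)

// decodeESR returns the exception class and fault status code.
func decodeESR(esr uint64) (ec, fsc uint64) {
	return (esr & esrECMask) >> esrECShift, esr & esrFSCMask
}

func main() {
	// An instruction abort (EC=0x20) with a level-3 translation fault
	// (0x7, i.e. _ESR_SEGV_MAPERR_L3).
	esr := uint64(ecIABTLow)<<esrECShift | 0x7
	ec, fsc := decodeESR(esr)
	fmt.Printf("EC=%#x FSC=%#x\n", ec, fsc) // EC=0x20 FSC=0x7
}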
//go:build !kvm_profiling // +build !kvm_profiling package kvm import ( "gvisor.dev/gvisor/pkg/metric" ) // KVMProfiling is a builder that produces conditionally compiled metrics. // Metrics made from this are compiled and active at runtime when the // "kvm_profiling" go-tag is specified at compilation. var KVMProfiling = metric.FakeMetricBuilder{} golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_state_autogen.go000066400000000000000000000002101465435605700274300ustar00rootroot00000000000000// automatically generated by stateify. //go:build kvm_profiling && !kvm_profiling // +build kvm_profiling,!kvm_profiling package kvm golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/kvm_unsafe_state_autogen.go000066400000000000000000000016241465435605700310030ustar00rootroot00000000000000// automatically generated by stateify. //go:build go1.18 && go1.18 // +build go1.18,go1.18 package kvm import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *machineAtomicPtr) StateTypeName() string { return "pkg/sentry/platform/kvm.machineAtomicPtr" } func (p *machineAtomicPtr) StateFields() []string { return []string{ "ptr", } } func (p *machineAtomicPtr) beforeSave() {} // +checklocksignore func (p *machineAtomicPtr) StateSave(stateSinkObject state.Sink) { p.beforeSave() var ptrValue *machine ptrValue = p.savePtr() stateSinkObject.SaveValue(0, ptrValue) } func (p *machineAtomicPtr) afterLoad(context.Context) {} // +checklocksignore func (p *machineAtomicPtr) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadValue(0, new(*machine), func(y any) { p.loadPtr(ctx, y.(*machine)) }) } func init() { state.Register((*machineAtomicPtr)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/machine.go000066400000000000000000000606041465435605700253320ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm import ( "fmt" "runtime" gosync "sync" "sync/atomic" "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/hosttid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/seccomp" ktime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sighandling" "gvisor.dev/gvisor/pkg/sync" ) // machine contains state associated with the VM as a whole. type machine struct { // fd is the vm fd. fd int // machinePoolIndex is the index in the machinePool array. machinePoolIndex uint32 // nextSlot is the next slot for setMemoryRegion. // // If nextSlot is ^uint32(0), then slots are currently being updated, and the // caller should retry. nextSlot atomicbitops.Uint32 // upperSharedPageTables tracks the read-only shared upper of all the pagetables. upperSharedPageTables *pagetables.PageTables // kernel is the set of global structures. kernel ring0.Kernel // mu protects vCPUs. 
mu sync.RWMutex // available is notified when vCPUs are available. available sync.Cond // vCPUsByTID are the machine vCPUs. // // These are populated dynamically. vCPUsByTID map[uint64]*vCPU // vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID. vCPUsByID []*vCPU // usedVCPUs is the number of vCPUs that have been used from the // vCPUsByID pool. usedVCPUs int // maxVCPUs is the maximum number of vCPUs supported by the machine. maxVCPUs int // maxSlots is the maximum number of memory slots supported by the machine. maxSlots int // tscControl checks whether cpu supports TSC scaling tscControl bool // usedSlots is the set of used physical addresses (not sorted). usedSlots []uintptr } const ( // vCPUReady is an alias for all the below clear. vCPUReady uint32 = 0 // vCPUser indicates that the vCPU is in or about to enter user mode. vCPUUser uint32 = 1 << 0 // vCPUGuest indicates the vCPU is in guest mode. vCPUGuest uint32 = 1 << 1 // vCPUWaiter indicates that there is a waiter. // // If this is set, then notify must be called on any state transitions. vCPUWaiter uint32 = 1 << 2 ) // Field values for the get_vcpu metric acquisition path used. var ( getVCPUAcquisitionFastReused = metric.FieldValue{"fast_reused"} getVCPUAcquisitionReused = metric.FieldValue{"reused"} getVCPUAcquisitionUnused = metric.FieldValue{"unused"} getVCPUAcquisitionStolen = metric.FieldValue{"stolen"} ) var ( // hostExitCounter is a metric that tracks how many times the sentry // performed a host to guest world switch. hostExitCounter = KVMProfiling.MustCreateNewUint64Metric( "/kvm/host_exits", metric.Uint64Metadata{ Cumulative: true, Description: "The number of times the sentry performed a host to guest world switch.", }) // userExitCounter is a metric that tracks how many times the sentry has // had an exit from userspace. Analogous to vCPU.userExits. userExitCounter = KVMProfiling.MustCreateNewUint64Metric( "/kvm/user_exits", metric.Uint64Metadata{ Cumulative: true, Description: "The number of times the sentry has had an exit from userspace.", }) // interruptCounter is a metric that tracks how many times execution returned // to the KVM host to handle a pending signal. interruptCounter = KVMProfiling.MustCreateNewUint64Metric( "/kvm/interrupts", metric.Uint64Metadata{ Cumulative: true, Description: "The number of times the signal handler was invoked.", }) // mmapCallCounter is a metric that tracks how many times the function // seccompMmapSyscall has been called. mmapCallCounter = KVMProfiling.MustCreateNewUint64Metric( "/kvm/mmap_calls", metric.Uint64Metadata{ Cumulative: true, Description: "The number of times seccompMmapSyscall has been called.", }) // getVCPUCounter is a metric that tracks how many times different paths of // machine.Get() are triggered. getVCPUCounter = KVMProfiling.MustCreateNewUint64Metric( "/kvm/get_vcpu", metric.Uint64Metadata{ Cumulative: true, Description: "The number of times that machine.Get() was called, split by path the function took.", Fields: []metric.Field{ metric.NewField("acquisition_type", &getVCPUAcquisitionFastReused, &getVCPUAcquisitionReused, &getVCPUAcquisitionUnused, &getVCPUAcquisitionStolen), }, }) // asInvalidateDuration are durations of calling addressSpace.invalidate(). asInvalidateDuration = KVMProfiling.MustCreateNewTimerMetric("/kvm/address_space_invalidate", metric.NewExponentialBucketer(15, uint64(time.Nanosecond*100), 1, 2), "Duration of calling addressSpace.invalidate().") ) // vCPU is a single KVM vCPU. 
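// // The state field is a bitmask of the vCPUUser, vCPUGuest and vCPUWaiter flags above: vCPUUser marks that the vCPU is in (or about to enter) user mode, vCPUGuest that it is executing in guest mode, and vCPUWaiter that notify must be called on the next state transition.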
type vCPU struct { // CPU is the kernel CPU data. // // This must be the first element of this structure, it is referenced // by the bluepill code (see bluepill_amd64.s). ring0.CPU // id is the vCPU id. id int // fd is the vCPU fd. fd int // tid is the last set tid. tid atomicbitops.Uint64 // userExits is the count of user exits. userExits atomicbitops.Uint64 // guestExits is the count of guest to host world switches. guestExits atomicbitops.Uint64 // faults is a count of world faults (informational only). faults uint32 // state is the vCPU state. // // This is a bitmask of the three fields (vCPU*) described above. state atomicbitops.Uint32 // runData for this vCPU. runData *runData // machine associated with this vCPU. machine *machine // active is the current addressSpace: this is set and read atomically, // it is used to elide unnecessary interrupts due to invalidations. active atomicAddressSpace // vCPUArchState is the architecture-specific state. vCPUArchState // dieState holds state related to vCPU death. dieState dieState } type dieState struct { // message is thrown from die. message string // guestRegs is used to store register state during vCPU.die() to prevent // allocation inside nosplit function. guestRegs userRegs } // createVCPU creates and returns a new vCPU. // // Precondition: mu must be held. func (m *machine) createVCPU(id int) *vCPU { // Create the vCPU. fd, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CREATE_VCPU, uintptr(id)) if errno != 0 { panic(fmt.Sprintf("error creating new vCPU: %v", errno)) } c := &vCPU{ id: id, fd: int(fd), machine: m, } c.CPU.Init(&m.kernel, c.id, c) m.vCPUsByID[c.id] = c // Ensure the signal mask is correct. if err := c.setSignalMask(); err != nil { panic(fmt.Sprintf("error setting signal mask: %v", err)) } // Map the run data. runData, err := mapRunData(int(fd)) if err != nil { panic(fmt.Sprintf("error mapping run data: %v", err)) } c.runData = runData // Initialize architecture state. if err := c.initArchState(); err != nil { panic(fmt.Sprintf("error initialization vCPU state: %v", err)) } return c // Done. } // newMachine returns a new VM context. func newMachine(vm int) (*machine, error) { // Create the machine. m := &machine{fd: vm} m.available.L = &m.mu // Pull the maximum vCPUs. m.getMaxVCPU() log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) m.vCPUsByTID = make(map[uint64]*vCPU) m.vCPUsByID = make([]*vCPU, m.maxVCPUs) m.kernel.Init(m.maxVCPUs) // Pull the maximum slots. maxSlots, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS) if errno != 0 { m.maxSlots = _KVM_NR_MEMSLOTS } else { m.maxSlots = int(maxSlots) } log.Debugf("The maximum number of slots is %d.", m.maxSlots) m.usedSlots = make([]uintptr, m.maxSlots) // Check TSC Scaling hasTSCControl, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_TSC_CONTROL) m.tscControl = errno == 0 && hasTSCControl == 1 log.Debugf("TSC scaling support: %t.", m.tscControl) // Create the upper shared pagetables and kernel(sentry) pagetables. m.upperSharedPageTables = pagetables.New(newAllocator()) m.mapUpperHalf(m.upperSharedPageTables) m.upperSharedPageTables.Allocator.(*allocator).base.Drain() m.upperSharedPageTables.MarkReadOnlyShared() m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress) // Install seccomp rules to trap runtime mmap system calls. They will // be handled by seccompMmapHandler. 
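// Trapping mmap ensures that memory the Go runtime maps later is registered with the VM (installed as KVM memory slots via handleBluepillFault) before it can be touched from guest mode.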
seccompMmapRules(m) // Apply the physical mappings. Note that these mappings may point to // guest physical addresses that are not actually available. These // physical pages are mapped on demand, see kernel_unsafe.go. applyPhysicalRegions(func(pr physicalRegion) bool { // Map everything in the lower half. m.kernel.PageTables.Map( hostarch.Addr(pr.virtual), pr.length, pagetables.MapOpts{AccessType: hostarch.ReadWrite}, pr.physical) return true // Keep iterating. }) // Ensure that the currently mapped virtual regions are actually // available in the VM. Note that this doesn't guarantee no future // faults, however it should guarantee that everything is available to // ensure successful vCPU entry. mapRegion := func(vr virtualRegion, flags uint32) { for virtual := vr.virtual; virtual < vr.virtual+vr.length; { physical, length, ok := translateToPhysical(virtual) if !ok { // This must be an invalid region that was // knocked out by creation of the physical map. return } if virtual+length > vr.virtual+vr.length { // Cap the length to the end of the area. length = vr.virtual + vr.length - virtual } // Update page tables for executable mappings. if vr.accessType.Execute { if vr.accessType.Write { panic(fmt.Sprintf("executable mapping can't be writable: %#v", vr)) } m.kernel.PageTables.Map( hostarch.Addr(virtual), length, pagetables.MapOpts{AccessType: vr.accessType}, physical) } // Ensure the physical range is mapped. m.mapPhysical(physical, length, physicalRegions) virtual += length } } // handleBluepillFault takes the slot spinlock and it is called from // seccompMmapHandler, so here we have to guarantee that mmap is not // called while we hold the slot spinlock. disableAsyncPreemption() applyVirtualRegions(func(vr virtualRegion) { if excludeVirtualRegion(vr) { return // skip region. } // Take into account that the stack can grow down. if vr.filename == "[stack]" { vr.virtual -= 1 << 20 vr.length += 1 << 20 } mapRegion(vr, 0) }) enableAsyncPreemption() // Initialize architecture state. if err := m.initArchState(); err != nil { m.Destroy() return nil, err } // Ensure the machine is cleaned up properly. runtime.SetFinalizer(m, (*machine).Destroy) return m, nil } // hasSlot returns true if the given address is mapped. // // This must be done via a linear scan. // //go:nosplit func (m *machine) hasSlot(physical uintptr) bool { slotLen := int(m.nextSlot.Load()) // When slots are being updated, nextSlot is ^uint32(0). As this situation // is less likely happen, we just set the slotLen to m.maxSlots, and scan // the whole usedSlots array. if slotLen == int(^uint32(0)) { slotLen = m.maxSlots } for i := 0; i < slotLen; i++ { if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical { return true } } return false } // mapPhysical checks for the mapping of a physical range, and installs one if // not available. This attempts to be efficient for calls in the hot path. // // This throws on error. // //go:nosplit func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion) { for end := physical + length; physical < end; { _, physicalStart, length, pr := calculateBluepillFault(physical, phyRegions) if pr == nil { // Should never happen. throw("mapPhysical on unknown physical address") } // Is this already mapped? Check the usedSlots. if !m.hasSlot(physicalStart) { if _, ok := handleBluepillFault(m, physical, phyRegions); !ok { throw("handleBluepillFault failed") } } // Move to the next chunk. physical = physicalStart + length } } // Destroy frees associated resources. 
// // Destroy should only be called once all active users of the machine are gone. // The machine object should not be used after calling Destroy. // // Precondition: all vCPUs must be returned to the machine. func (m *machine) Destroy() { runtime.SetFinalizer(m, nil) // Destroy vCPUs. for _, c := range m.vCPUsByID { if c == nil { continue } // Ensure the vCPU is not still running in guest mode. This is // possible iff teardown has been done by other threads, and // somehow a single thread has not executed any system calls. c.BounceToHost() // Note that the runData may not be mapped if an error occurs // during the middle of initialization. if c.runData != nil { if err := unmapRunData(c.runData); err != nil { panic(fmt.Sprintf("error unmapping rundata: %v", err)) } } if err := unix.Close(int(c.fd)); err != nil { panic(fmt.Sprintf("error closing vCPU fd: %v", err)) } } machinePool[m.machinePoolIndex].Store(nil) seccompMmapSync() // vCPUs are gone: teardown machine state. if err := unix.Close(m.fd); err != nil { panic(fmt.Sprintf("error closing VM fd: %v", err)) } } // Get gets an available vCPU. // // This will return with the OS thread locked. // // It is guaranteed that if any OS thread TID is in guest, m.vCPUs[TID] points // to the vCPU in which the OS thread TID is running. So if Get() returns with // the current context in guest, the vCPU of it must be the same as what // Get() returns. func (m *machine) Get() *vCPU { m.mu.RLock() runtime.LockOSThread() tid := hosttid.Current() // Check for an exact match. if c := m.vCPUsByTID[tid]; c != nil { c.lock() m.mu.RUnlock() getVCPUCounter.Increment(&getVCPUAcquisitionFastReused) return c } // The happy path failed. We now proceed to acquire an exclusive lock // (because the vCPU map may change), and scan all available vCPUs. // In this case, we first unlock the OS thread. Otherwise, if mu is // not available, the current system thread will be parked and a new // system thread spawned. We avoid this situation by simply refreshing // tid after relocking the system thread. m.mu.RUnlock() runtime.UnlockOSThread() m.mu.Lock() runtime.LockOSThread() tid = hosttid.Current() // Recheck for an exact match. if c := m.vCPUsByTID[tid]; c != nil { c.lock() m.mu.Unlock() getVCPUCounter.Increment(&getVCPUAcquisitionReused) return c } for { // Get vCPU from the m.vCPUsByID pool. if m.usedVCPUs < m.maxVCPUs { c := m.vCPUsByID[m.usedVCPUs] m.usedVCPUs++ c.lock() m.vCPUsByTID[tid] = c m.mu.Unlock() c.loadSegments(tid) getVCPUCounter.Increment(&getVCPUAcquisitionUnused) return c } // Scan for an available vCPU. for origTID, c := range m.vCPUsByTID { if c.state.CompareAndSwap(vCPUReady, vCPUUser) { delete(m.vCPUsByTID, origTID) m.vCPUsByTID[tid] = c m.mu.Unlock() c.loadSegments(tid) getVCPUCounter.Increment(&getVCPUAcquisitionUnused) return c } } // Scan for something not in user mode. for origTID, c := range m.vCPUsByTID { if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) { continue } // The vCPU is not be able to transition to // vCPUGuest|vCPUWaiter or to vCPUUser because that // transition requires holding the machine mutex, as we // do now. There is no path to register a waiter on // just the vCPUReady state. for { c.waitUntilNot(vCPUGuest | vCPUWaiter) if c.state.CompareAndSwap(vCPUReady, vCPUUser) { break } } // Steal the vCPU. delete(m.vCPUsByTID, origTID) m.vCPUsByTID[tid] = c m.mu.Unlock() c.loadSegments(tid) getVCPUCounter.Increment(&getVCPUAcquisitionStolen) return c } // Everything is executing in user mode. 
Wait until something // is available. Note that signaling the condition variable // will have the extra effect of kicking the vCPUs out of guest // mode if that's where they were. m.available.Wait() } } // Put puts the current vCPU. func (m *machine) Put(c *vCPU) { c.unlock() runtime.UnlockOSThread() m.mu.RLock() m.available.Signal() m.mu.RUnlock() } // newDirtySet returns a new dirty set. func (m *machine) newDirtySet() *dirtySet { return &dirtySet{ vCPUMasks: make([]atomicbitops.Uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64), } } // dropPageTables drops cached page table entries. func (m *machine) dropPageTables(pt *pagetables.PageTables) { m.mu.Lock() defer m.mu.Unlock() // Clear from all PCIDs. for _, c := range m.vCPUsByID { if c != nil && c.PCIDs != nil { c.PCIDs.Drop(pt) } } } // lock marks the vCPU as in user mode. // // This should only be called directly when known to be safe, i.e. when // the vCPU is owned by the current TID with no chance of theft. // //go:nosplit func (c *vCPU) lock() { atomicbitops.OrUint32(&c.state, vCPUUser) } // unlock clears the vCPUUser bit. // //go:nosplit func (c *vCPU) unlock() { origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) if origState == vCPUUser|vCPUGuest { // Happy path: no exits are forced, and we can continue // executing on our merry way with a single atomic access. return } // Clear the lock. for { state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser) if state == origState { break } origState = state } switch origState { case vCPUUser: // Normal state. case vCPUUser | vCPUGuest | vCPUWaiter: // Force a transition: this must trigger a notification when we // return from guest mode. We must clear vCPUWaiter here // anyways, because BounceToKernel will force a transition only // from ring3 to ring0, which will not clear this bit. Halt may // workaround the issue, but if there is no exception or // syscall in this period, BounceToKernel will hang. atomicbitops.AndUint32(&c.state, ^vCPUWaiter) c.notify() case vCPUUser | vCPUWaiter: // Waiting for the lock to be released; the responsibility is // on us to notify the waiter and clear the associated bit. atomicbitops.AndUint32(&c.state, ^vCPUWaiter) c.notify() default: panic("invalid state") } } // NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. // //go:nosplit func (c *vCPU) NotifyInterrupt() { c.BounceToKernel() } // pid is used below in bounce. var pid = unix.Getpid() // bounce forces a return to the kernel or to host mode. // // This effectively unwinds the state machine. func (c *vCPU) bounce(forceGuestExit bool) { origGuestExits := c.guestExits.Load() origUserExits := c.userExits.Load() for { switch state := c.state.Load(); state { case vCPUReady, vCPUWaiter: // There is nothing to be done, we're already in the // kernel pre-acquisition. The Bounce criteria have // been satisfied. return case vCPUUser: // We need to register a waiter for the actual guest // transition. When the transition takes place, then we // can inject an interrupt to ensure a return to host // mode. c.state.CompareAndSwap(state, state|vCPUWaiter) case vCPUUser | vCPUWaiter: // Wait for the transition to guest mode. This should // come from the bluepill handler. c.waitUntilNot(state) case vCPUGuest, vCPUUser | vCPUGuest: if state == vCPUGuest && !forceGuestExit { // The vCPU is already not acquired, so there's // no need to do a fresh injection here. return } // The vCPU is in user or kernel mode. 
Attempt to // register a notification on change. if !c.state.CompareAndSwap(state, state|vCPUWaiter) { break // Retry. } for { // We need to spin here until the signal is // delivered, because Tgkill can return EAGAIN // under memory pressure. Since we already // marked ourselves as a waiter, we need to // ensure that a signal is actually delivered. if err := unix.Tgkill(pid, int(c.tid.Load()), bounceSignal); err == nil { break } else if err.(unix.Errno) == unix.EAGAIN { continue } else { // Nothing else should be returned by tgkill. panic(fmt.Sprintf("unexpected tgkill error: %v", err)) } } case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter: if state == vCPUGuest|vCPUWaiter && !forceGuestExit { // See above. return } // Wait for the transition. This again should happen // from the bluepill handler, but on the way out. c.waitUntilNot(state) default: // Should not happen: the above is exhaustive. panic("invalid state") } // Check if we've missed the state transition, but // we can safely return at this point in time. newGuestExits := c.guestExits.Load() newUserExits := c.userExits.Load() if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) { return } } } // BounceToKernel ensures that the vCPU bounces back to the kernel. // //go:nosplit func (c *vCPU) BounceToKernel() { c.bounce(false) } // BounceToHost ensures that the vCPU is in host mode. // //go:nosplit func (c *vCPU) BounceToHost() { c.bounce(true) } // setSystemTimeLegacy calibrates and sets an approximate system time. func (c *vCPU) setSystemTimeLegacy() error { const minIterations = 10 minimum := uint64(0) for iter := 0; ; iter++ { // Try to set the TSC to an estimate of where it will be // on the host during a "fast" system call iteration. start := uint64(ktime.Rdtsc()) if err := c.setTSC(start + (minimum / 2)); err != nil { return err } // See if this is our new minimum call time. Note that this // serves two functions: one, we make sure that we are // accurately predicting the offset we need to set. Second, we // don't want to do the final set on a slow call, which could // produce a really bad result. end := uint64(ktime.Rdtsc()) if end < start { continue // Totally bogus: unstable TSC? } current := end - start if current < minimum || iter == 0 { minimum = current // Set our new minimum. } // Is this past minIterations and within ~10% of minimum? upperThreshold := (((minimum << 3) + minimum) >> 3) if iter >= minIterations && current <= upperThreshold { return nil } } } const machinePoolSize = 16 // machinePool is enumerated from the seccompMmapHandler signal handler var ( machinePool [machinePoolSize]machineAtomicPtr machinePoolLen atomicbitops.Uint32 machinePoolMu sync.Mutex seccompMmapRulesOnce gosync.Once ) func sigsysHandler() func addrOfSigsysHandler() uintptr // seccompMmapRules adds seccomp rules to trap mmap system calls that will be // handled in seccompMmapHandler. func seccompMmapRules(m *machine) { seccompMmapRulesOnce.Do(func() { // Install the handler. if err := sighandling.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil { panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) } rules := []seccomp.RuleSet{ // Trap mmap system calls and handle them in sigsysGoHandler { Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_MMAP: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.MaskedEqual(unix.PROT_EXEC, 0), /* MAP_DENYWRITE is ignored and used only for filtering. 
*/ seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0), }, }), Action: linux.SECCOMP_RET_TRAP, }, } instrs, _, err := seccomp.BuildProgram(rules, seccomp.ProgramOptions{ DefaultAction: linux.SECCOMP_RET_ALLOW, BadArchAction: linux.SECCOMP_RET_ALLOW, }) if err != nil { panic(fmt.Sprintf("failed to build rules: %v", err)) } // Perform the actual installation. if err := seccomp.SetFilter(instrs); err != nil { panic(fmt.Sprintf("failed to set filter: %v", err)) } }) machinePoolMu.Lock() n := machinePoolLen.Load() i := uint32(0) for ; i < n; i++ { if machinePool[i].Load() == nil { break } } if i == n { if i == machinePoolSize { machinePoolMu.Unlock() panic("machinePool is full") } machinePoolLen.Add(1) } machinePool[i].Store(m) m.machinePoolIndex = i machinePoolMu.Unlock() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/machine_amd64.go000066400000000000000000000402051465435605700263200ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package kvm import ( "fmt" "math/big" "reflect" "runtime" "runtime/debug" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/platform" ktime "gvisor.dev/gvisor/pkg/sentry/time" ) // initArchState initializes architecture-specific state. func (m *machine) initArchState() error { // Set the legacy TSS address. This address is covered by the reserved // range (up to 4GB). In fact, this is a main reason it exists. if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(m.fd), KVM_SET_TSS_ADDR, uintptr(reservedMemory-(3*hostarch.PageSize))); errno != 0 { return errno } // Initialize all vCPUs to minimize kvm ioctl-s allowed by seccomp filters. m.mu.Lock() for i := 0; i < m.maxVCPUs; i++ { m.createVCPU(i) } m.mu.Unlock() c := m.Get() defer m.Put(c) // Enable CPUID faulting, if possible. Note that this also serves as a // basic platform sanity tests, since we will enter guest mode for the // first time here. The recovery is necessary, since if we fail to read // the platform info register, we will retry to host mode and // ultimately need to handle a segmentation fault. old := debug.SetPanicOnFault(true) defer func() { recover() debug.SetPanicOnFault(old) }() bluepill(c) ring0.SetCPUIDFaulting(true) return nil } type vCPUArchState struct { // PCIDs is the set of PCIDs for this vCPU. // // This starts above fixedKernelPCID. PCIDs *pagetables.PCIDs } const ( // fixedKernelPCID is a fixed kernel PCID used for the kernel page // tables. We must start allocating user PCIDs above this in order to // avoid any conflict (see below). fixedKernelPCID = 1 // poolPCIDs is the number of PCIDs to record in the database. As this // grows, assignment can take longer, since it is a simple linear scan. 
// Beyond a relatively small number, there are likely few perform // benefits, since the TLB has likely long since lost any translations // from more than a few PCIDs past. poolPCIDs = 8 ) // initArchState initializes architecture-specific state. func (c *vCPU) initArchState() error { var ( kernelSystemRegs systemRegs kernelUserRegs userRegs ) // Set base control registers. kernelSystemRegs.CR0 = c.CR0() kernelSystemRegs.CR4 = c.CR4() kernelSystemRegs.EFER = c.EFER() // Set the IDT & GDT in the registers. kernelSystemRegs.IDT.base, kernelSystemRegs.IDT.limit = c.IDT() kernelSystemRegs.GDT.base, kernelSystemRegs.GDT.limit = c.GDT() kernelSystemRegs.CS.Load(&ring0.KernelCodeSegment, ring0.Kcode) kernelSystemRegs.DS.Load(&ring0.UserDataSegment, ring0.Udata) kernelSystemRegs.ES.Load(&ring0.UserDataSegment, ring0.Udata) kernelSystemRegs.SS.Load(&ring0.KernelDataSegment, ring0.Kdata) kernelSystemRegs.FS.Load(&ring0.UserDataSegment, ring0.Udata) kernelSystemRegs.GS.Load(&ring0.UserDataSegment, ring0.Udata) tssBase, tssLimit, tss := c.TSS() kernelSystemRegs.TR.Load(tss, ring0.Tss) kernelSystemRegs.TR.base = tssBase kernelSystemRegs.TR.limit = uint32(tssLimit) // Point to kernel page tables, with no initial PCID. kernelSystemRegs.CR3 = c.machine.kernel.PageTables.CR3(false, 0) // Initialize the PCID database. if hasGuestPCID { // Note that NewPCIDs may return a nil table here, in which // case we simply don't use PCID support (see below). In // practice, this should not happen, however. c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs) } // Set the CPUID; this is required before setting system registers, // since KVM will reject several CR4 bits if the CPUID does not // indicate the support is available. if err := c.setCPUID(); err != nil { return err } // Set the entrypoint for the kernel. kernelUserRegs.RIP = uint64(ring0.AddrOfStart()) kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) kernelUserRegs.RSP = c.StackTop() kernelUserRegs.RFLAGS = ring0.KernelFlagsSet // Set the system registers. if err := c.setSystemRegisters(&kernelSystemRegs); err != nil { return err } // Set the user registers. if errno := c.setUserRegisters(&kernelUserRegs); errno != 0 { return fmt.Errorf("error setting user registers: %v", errno) } // Set the time offset to the host native time. return c.setSystemTime() } // bitsForScaling returns the bits available for storing the fraction component // of the TSC scaling ratio. // It is set using getBitsForScaling when the KVM platform is initialized. var bitsForScaling int64 // getBitsForScaling returns the bits available for storing the fraction component // of the TSC scaling ratio. This allows us to replicate the (bad) math done by // the kernel below in scaledTSC, and ensure we can compute an exact zero // offset in setSystemTime. // // These constants correspond to kvm_tsc_scaling_ratio_frac_bits. func getBitsForScaling() int64 { fs := cpuid.HostFeatureSet() if fs.Intel() { return 48 // See vmx.c (kvm sources). } else if fs.AMD() { return 32 // See svm.c (svm sources). } else { return 63 // Unknown: theoretical maximum. } } // scaledTSC returns the host TSC scaled by the given frequency. // // This assumes a current frequency of 1. We require only the unitless ratio of // rawFreq to some current frequency. See setSystemTime for context. // // The kernel math guarantees that all bits of the multiplication and division // will be correctly preserved and applied. However, it is not possible to // actually store the ratio correctly. 
So we need to use the same schema in // order to calculate the scaled frequency and get the same result. // // We can assume that the current frequency is (1), so we are calculating a // strict inverse of this value. This simplifies this function considerably. // // Roughly, the returned value "scaledTSC" will have: // scaledTSC/hostTSC == 1/rawFreq // //go:nosplit func scaledTSC(rawFreq uintptr) int64 { scale := int64(1 << bitsForScaling) ratio := big.NewInt(scale / int64(rawFreq)) ratio.Mul(ratio, big.NewInt(int64(ktime.Rdtsc()))) ratio.Div(ratio, big.NewInt(scale)) return ratio.Int64() } // setSystemTime sets the vCPU to the system time. func (c *vCPU) setSystemTime() error { // Attempt to set the offset directly. This is supported as of Linux 5.16, // or commit 828ca89628bfcb1b8f27535025f69dd00eb55207. if err := c.setTSCOffset(); err == nil { return err } // If tsc scaling is not supported, fallback to legacy mode. if !c.machine.tscControl { return c.setSystemTimeLegacy() } // First, scale down the clock frequency to the lowest value allowed by // the API itself. How low we can go depends on the underlying // hardware, but it is typically ~1/2^48 for Intel, ~1/2^32 for AMD. // Even the lower bound here will take a 4GHz frequency down to 1Hz, // meaning that everything should be able to handle a Khz setting of 1 // with bits to spare. // // Note that reducing the clock does not typically require special // capabilities as it is emulated in KVM. We don't actually use this // capability, but it means that this method should be robust to // different hardware configurations. rawFreq, err := c.getTSCFreq() if err != nil { return c.setSystemTimeLegacy() } if err := c.setTSCFreq(1); err != nil { return c.setSystemTimeLegacy() } // Always restore the original frequency. defer func() { if err := c.setTSCFreq(rawFreq); err != nil { panic(err.Error()) } }() // Attempt to set the system time in this compressed world. The // calculation for offset normally looks like: // // offset = target_tsc - kvm_scale_tsc(vcpu, rdtsc()); // // So as long as the kvm_scale_tsc component is constant before and // after the call to set the TSC value (and it is passes as the // target_tsc), we will compute an offset value of zero. // // This is effectively cheating to make our "setSystemTime" call so // unbelievably, incredibly fast that we do it "instantly" and all the // calculations result in an offset of zero. lastTSC := scaledTSC(rawFreq) for { if err := c.setTSC(uint64(lastTSC)); err != nil { return err } nextTSC := scaledTSC(rawFreq) if lastTSC == nextTSC { return nil } lastTSC = nextTSC // Try again. } } // nonCanonical generates a canonical address return. // //go:nosplit func nonCanonical(addr uint64, signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) { *info = linux.SignalInfo{ Signo: signal, Code: linux.SI_KERNEL, } info.SetAddr(addr) // Include address. return hostarch.NoAccess, platform.ErrContextSignal } // fault generates an appropriate fault return. // //go:nosplit func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) { bluepill(c) // Probably no-op, but may not be. faultAddr := ring0.ReadCR2() code, user := c.ErrorCode() if !user { // The last fault serviced by this CPU was not a user // fault, so we can't reliably trust the faultAddr or // the code provided here. We need to re-execute. return hostarch.NoAccess, platform.ErrContextInterrupt } // Reset the pointed SignalInfo. 
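// The hardware error code is decoded below: on x86, bit 1 set means the fault was a write and bit 4 set means an instruction fetch; for SIGSEGV, pure reads are reported as SEGV_MAPERR and write/execute faults as SEGV_ACCERR.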
*info = linux.SignalInfo{Signo: signal} info.SetAddr(uint64(faultAddr)) accessType := hostarch.AccessType{} if signal == int32(unix.SIGSEGV) { accessType = hostarch.AccessType{ Read: code&(1<<1) == 0, Write: code&(1<<1) != 0, Execute: code&(1<<4) != 0, } } if !accessType.Write && !accessType.Execute { info.Code = 1 // SEGV_MAPERR. } else { info.Code = 2 // SEGV_ACCERR. } return accessType, platform.ErrContextSignal } //go:nosplit //go:noinline func loadByte(ptr *byte) byte { return *ptr } // SwitchToUser unpacks architectural-details. func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) { // Check for canonical addresses. if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Rip) { return nonCanonical(regs.Rip, int32(unix.SIGSEGV), info) } else if !ring0.IsCanonical(regs.Rsp) { return nonCanonical(regs.Rsp, int32(unix.SIGBUS), info) } else if !ring0.IsCanonical(regs.Fs_base) { return nonCanonical(regs.Fs_base, int32(unix.SIGBUS), info) } else if !ring0.IsCanonical(regs.Gs_base) { return nonCanonical(regs.Gs_base, int32(unix.SIGBUS), info) } // Assign PCIDs. if c.PCIDs != nil { var requireFlushPCID bool // Force a flush? switchOpts.UserPCID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables) switchOpts.KernelPCID = fixedKernelPCID switchOpts.Flush = switchOpts.Flush || requireFlushPCID } // See below. var vector ring0.Vector // Past this point, stack growth can cause system calls (and a break // from guest mode). So we need to ensure that between the bluepill // call here and the switch call immediately below, no additional // allocations occur. entersyscall() bluepill(c) vector = c.CPU.SwitchToUser(switchOpts) exitsyscall() switch vector { case ring0.Syscall, ring0.SyscallInt80: // Fast path: system call executed. return hostarch.NoAccess, nil case ring0.PageFault: return c.fault(int32(unix.SIGSEGV), info) case ring0.Debug, ring0.Breakpoint: *info = linux.SignalInfo{ Signo: int32(unix.SIGTRAP), Code: 1, // TRAP_BRKPT (breakpoint). } info.SetAddr(switchOpts.Registers.Rip) // Include address. return hostarch.AccessType{}, platform.ErrContextSignal case ring0.GeneralProtectionFault, ring0.SegmentNotPresent, ring0.BoundRangeExceeded, ring0.InvalidTSS, ring0.StackSegmentFault: *info = linux.SignalInfo{ Signo: int32(unix.SIGSEGV), Code: linux.SI_KERNEL, } info.SetAddr(switchOpts.Registers.Rip) // Include address. if vector == ring0.GeneralProtectionFault { // When CPUID faulting is enabled, we will generate a #GP(0) when // userspace executes a CPUID instruction. This is handled above, // because we need to be able to map and read user memory. return hostarch.AccessType{}, tryCPUIDError{} } return hostarch.AccessType{}, platform.ErrContextSignal case ring0.InvalidOpcode: *info = linux.SignalInfo{ Signo: int32(unix.SIGILL), Code: 1, // ILL_ILLOPC (illegal opcode). } info.SetAddr(switchOpts.Registers.Rip) // Include address. return hostarch.AccessType{}, platform.ErrContextSignal case ring0.DivideByZero: *info = linux.SignalInfo{ Signo: int32(unix.SIGFPE), Code: 1, // FPE_INTDIV (divide by zero). } info.SetAddr(switchOpts.Registers.Rip) // Include address. return hostarch.AccessType{}, platform.ErrContextSignal case ring0.Overflow: *info = linux.SignalInfo{ Signo: int32(unix.SIGFPE), Code: 2, // FPE_INTOVF (integer overflow). } info.SetAddr(switchOpts.Registers.Rip) // Include address. 
return hostarch.AccessType{}, platform.ErrContextSignal case ring0.X87FloatingPointException, ring0.SIMDFloatingPointException: *info = linux.SignalInfo{ Signo: int32(unix.SIGFPE), Code: 7, // FPE_FLTINV (invalid operation). } info.SetAddr(switchOpts.Registers.Rip) // Include address. return hostarch.AccessType{}, platform.ErrContextSignal case ring0.Vector(bounce): // ring0.VirtualizationException return hostarch.NoAccess, platform.ErrContextInterrupt case ring0.AlignmentCheck: *info = linux.SignalInfo{ Signo: int32(unix.SIGBUS), Code: 2, // BUS_ADRERR (physical address does not exist). } return hostarch.NoAccess, platform.ErrContextSignal case ring0.NMI: // An NMI is generated only when a fault is not servicable by // KVM itself, so we think some mapping is writeable but it's // really not. This could happen, e.g. if some file is // truncated (and would generate a SIGBUS) and we map it // directly into the instance. return c.fault(int32(unix.SIGBUS), info) case ring0.DeviceNotAvailable, ring0.DoubleFault, ring0.CoprocessorSegmentOverrun, ring0.MachineCheck, ring0.SecurityException: fallthrough default: panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) } } func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { // Map all the executable regions so that all the entry functions // are mapped in the upper half. if err := applyVirtualRegions(func(vr virtualRegion) { if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" { return } if vr.accessType.Execute { r := vr.region physical, length, ok := translateToPhysical(r.virtual) if !ok || length < r.length { panic("impossible translation") } pageTable.Map( hostarch.Addr(ring0.KernelStartAddress|r.virtual), r.length, pagetables.MapOpts{AccessType: hostarch.Execute, Global: true}, physical) } }); err != nil { panic(fmt.Sprintf("error parsing /proc/self/maps: %v", err)) } for start, end := range m.kernel.EntryRegions() { regionLen := end - start physical, length, ok := translateToPhysical(start) if !ok || length < regionLen { panic("impossible translation") } pageTable.Map( hostarch.Addr(ring0.KernelStartAddress|start), regionLen, pagetables.MapOpts{AccessType: hostarch.ReadWrite, Global: true}, physical) } } // getMaxVCPU get max vCPU number func (m *machine) getMaxVCPU() { maxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) if errno != 0 { m.maxVCPUs = _KVM_NR_VCPUS } else { m.maxVCPUs = int(maxVCPUs) } // The goal here is to avoid vCPU contentions for reasonable workloads. // But "reasonable" isn't defined well in this case. Let's say that CPU // overcommit with factor 2 is still acceptable. We allocate a set of // vCPU for each goruntime processor (P) and two sets of vCPUs to run // user code. rCPUs := runtime.GOMAXPROCS(0) if 3*rCPUs < m.maxVCPUs { m.maxVCPUs = 3 * rCPUs } } func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion { return physicalRegions } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/machine_amd64_unsafe.go000066400000000000000000000114761465435605700276710ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package kvm import ( "fmt" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) // loadSegments copies the current segments. // // This may be called from within the signal context and throws on error. // //go:nosplit func (c *vCPU) loadSegments(tid uint64) { if _, _, errno := unix.RawSyscall( unix.SYS_ARCH_PRCTL, linux.ARCH_GET_FS, uintptr(unsafe.Pointer(&c.CPU.Registers().Fs_base)), 0); errno != 0 { throw("getting FS segment") } if _, _, errno := unix.RawSyscall( unix.SYS_ARCH_PRCTL, linux.ARCH_GET_GS, uintptr(unsafe.Pointer(&c.CPU.Registers().Gs_base)), 0); errno != 0 { throw("getting GS segment") } c.tid.Store(tid) } // setCPUID sets the CPUID to be used by the guest. func (c *vCPU) setCPUID() error { if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), KVM_SET_CPUID2, uintptr(unsafe.Pointer(&cpuidSupported))); errno != 0 { return fmt.Errorf("error setting CPUID: %v", errno) } return nil } // getTSCFreq gets the TSC frequency. // // If mustSucceed is true, then this function panics on error. func (c *vCPU) getTSCFreq() (uintptr, error) { rawFreq, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), KVM_GET_TSC_KHZ, 0 /* ignored */) if errno != 0 { return 0, errno } return rawFreq, nil } // setTSCFreq sets the TSC frequency. func (c *vCPU) setTSCFreq(freq uintptr) error { if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), KVM_SET_TSC_KHZ, freq /* khz */); errno != 0 { return fmt.Errorf("error setting TSC frequency: %v", errno) } return nil } // setTSCOffset sets the TSC offset to zero. func (c *vCPU) setTSCOffset() error { offset := uint64(0) da := struct { flags uint32 group uint32 attr uint64 addr unsafe.Pointer }{ group: _KVM_VCPU_TSC_CTRL, attr: _KVM_VCPU_TSC_OFFSET, addr: unsafe.Pointer(&offset), } if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), KVM_SET_DEVICE_ATTR, uintptr(unsafe.Pointer(&da))); errno != 0 { return fmt.Errorf("error setting tsc offset: %v", errno) } return nil } // setTSC sets the TSC value. func (c *vCPU) setTSC(value uint64) error { const _MSR_IA32_TSC = 0x00000010 registers := modelControlRegisters{ nmsrs: 1, } registers.entries[0].index = _MSR_IA32_TSC registers.entries[0].data = value if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), KVM_SET_MSRS, uintptr(unsafe.Pointer(®isters))); errno != 0 { return fmt.Errorf("error setting tsc: %v", errno) } return nil } // setUserRegisters sets user registers in the vCPU. // //go:nosplit func (c *vCPU) setUserRegisters(uregs *userRegs) unix.Errno { if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), KVM_SET_REGS, uintptr(unsafe.Pointer(uregs))); errno != 0 { return errno } return 0 } // getUserRegisters reloads user registers in the vCPU. // // This is safe to call from a nosplit context. // //go:nosplit func (c *vCPU) getUserRegisters(uregs *userRegs) unix.Errno { if _, _, errno := unix.RawSyscall( // escapes: no. unix.SYS_IOCTL, uintptr(c.fd), KVM_GET_REGS, uintptr(unsafe.Pointer(uregs))); errno != 0 { return errno } return 0 } // setSystemRegisters sets system registers. 
func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), KVM_SET_SREGS, uintptr(unsafe.Pointer(sregs))); errno != 0 { return fmt.Errorf("error setting system registers: %v", errno) } return nil } // getSystemRegisters sets system registers. // //go:nosplit func (c *vCPU) getSystemRegisters(sregs *systemRegs) unix.Errno { if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), KVM_GET_SREGS, uintptr(unsafe.Pointer(sregs))); errno != 0 { return errno } return 0 } //go:nosplit func seccompMmapSyscall(context unsafe.Pointer) (uintptr, uintptr, unix.Errno) { ctx := bluepillArchContext(context) // MAP_DENYWRITE is deprecated and ignored by kernel. We use it only for seccomp filters. addr, _, e := unix.RawSyscall6(uintptr(ctx.Rax), uintptr(ctx.Rdi), uintptr(ctx.Rsi), uintptr(ctx.Rdx), uintptr(ctx.R10)|unix.MAP_DENYWRITE, uintptr(ctx.R8), uintptr(ctx.R9)) ctx.Rax = uint64(addr) return addr, uintptr(ctx.Rsi), e } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/machine_arm64.go000066400000000000000000000140151465435605700263360ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package kvm import ( "fmt" "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/platform" ) type vCPUArchState struct { // PCIDs is the set of PCIDs for this vCPU. // // This starts above fixedKernelPCID. PCIDs *pagetables.PCIDs } const ( // fixedKernelPCID is a fixed kernel PCID used for the kernel page // tables. We must start allocating user PCIDs above this in order to // avoid any conflict (see below). fixedKernelPCID = 1 // poolPCIDs is the number of PCIDs to record in the database. As this // grows, assignment can take longer, since it is a simple linear scan. // Beyond a relatively small number, there are likely few perform // benefits, since the TLB has likely long since lost any translations // from more than a few PCIDs past. poolPCIDs = 128 ) func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { applyPhysicalRegions(func(pr physicalRegion) bool { pageTable.Map( hostarch.Addr(ring0.KernelStartAddress|pr.virtual), pr.length, pagetables.MapOpts{AccessType: hostarch.AnyAccess, Global: true}, pr.physical) return true // Keep iterating. }) } // archPhysicalRegions fills readOnlyGuestRegions and allocates separate // physical regions form them. func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion { rdRegions := []virtualRegion{} if err := applyVirtualRegions(func(vr virtualRegion) { if excludeVirtualRegion(vr) { return // skip region. } // Skip PROT_NONE mappings. Go-runtime uses them as place // holders for future read-write mappings. 
if !vr.accessType.Write && vr.accessType.Read { rdRegions = append(rdRegions, vr) } }); err != nil { panic(fmt.Sprintf("error parsing /proc/self/maps: %v", err)) } // Add an unreachable region. rdRegions = append(rdRegions, virtualRegion{ region: region{ virtual: 0xffffffffffffffff, length: 0, }, }) var regions []physicalRegion addValidRegion := func(r *physicalRegion, virtual, length uintptr, readOnly bool) { if length == 0 { return } regions = append(regions, physicalRegion{ region: region{ virtual: virtual, length: length, }, physical: r.physical + (virtual - r.virtual), readOnly: readOnly, }) } i := 0 for _, pr := range physicalRegions { start := pr.virtual end := pr.virtual + pr.length for start < end { rdRegion := rdRegions[i].region rdStart := rdRegion.virtual rdEnd := rdRegion.virtual + rdRegion.length if rdEnd <= start { i++ continue } if rdStart > start { newEnd := rdStart if end < rdStart { newEnd = end } addValidRegion(&pr, start, newEnd-start, false) start = rdStart continue } if rdEnd < end { addValidRegion(&pr, start, rdEnd-start, true) start = rdEnd continue } addValidRegion(&pr, start, end-start, start >= rdStart && end <= rdEnd) start = end } } return regions } // nonCanonical generates a canonical address return. // //go:nosplit func nonCanonical(addr uint64, signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) { *info = linux.SignalInfo{ Signo: signal, Code: linux.SI_KERNEL, } info.SetAddr(addr) // Include address. return hostarch.NoAccess, platform.ErrContextSignal } // isInstructionAbort returns true if it is an instruction abort. // //go:nosplit func isInstructionAbort(code uint64) bool { value := (code & _ESR_ELx_EC_MASK) >> _ESR_ELx_EC_SHIFT return value == _ESR_ELx_EC_IABT_LOW } // isWriteFault returns whether it is a write fault. // //go:nosplit func isWriteFault(code uint64) bool { if isInstructionAbort(code) { return false } return (code & _ESR_ELx_WNR) != 0 } // fault generates an appropriate fault return. // //go:nosplit func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) { bluepill(c) // Probably no-op, but may not be. faultAddr := c.FaultAddr() code, user := c.ErrorCode() if !user { // The last fault serviced by this CPU was not a user // fault, so we can't reliably trust the faultAddr or // the code provided here. We need to re-execute. return hostarch.NoAccess, platform.ErrContextInterrupt } // Reset the pointed SignalInfo. *info = linux.SignalInfo{Signo: signal} info.SetAddr(uint64(faultAddr)) accessType := hostarch.AccessType{} if signal == int32(unix.SIGSEGV) { accessType = hostarch.AccessType{ Read: !isWriteFault(uint64(code)), Write: isWriteFault(uint64(code)), Execute: isInstructionAbort(uint64(code)), } } ret := code & _ESR_ELx_FSC switch ret { case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3: info.Code = 1 //SEGV_MAPERR case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3: info.Code = 2 // SEGV_ACCERR. default: info.Code = 2 } return accessType, platform.ErrContextSignal } // getMaxVCPU get max vCPU number func (m *machine) getMaxVCPU() { rmaxVCPUs := runtime.NumCPU() smaxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) // compare the max vcpu number from runtime and syscall, use smaller one. 
if errno != 0 { m.maxVCPUs = rmaxVCPUs } else { if rmaxVCPUs < int(smaxVCPUs) { m.maxVCPUs = rmaxVCPUs } else { m.maxVCPUs = int(smaxVCPUs) } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/machine_arm64_unsafe.go000066400000000000000000000234511465435605700277030ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package kvm import ( "fmt" "reflect" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/platform" ktime "gvisor.dev/gvisor/pkg/sentry/time" ) type kvmVcpuInit struct { target uint32 features [7]uint32 } var vcpuInit kvmVcpuInit // initArchState initializes architecture-specific state. func (m *machine) initArchState() error { if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(m.fd), _KVM_ARM_PREFERRED_TARGET, uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 { panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET failed: %v", errno)) } // Initialize all vCPUs on ARM64, while this does not happen on x86_64. // The reason for the difference is that ARM64 and x86_64 have different KVM timer mechanisms. // If we create vCPU dynamically on ARM64, the timer for vCPU would mess up for a short time. // For more detail, please refer to https://github.com/google/gvisor/issues/5739 m.mu.Lock() for i := 0; i < m.maxVCPUs; i++ { m.createVCPU(i) } m.mu.Unlock() return nil } // initArchState initializes architecture-specific state. 
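// // It programs the EL1 system registers (TCR_EL1, MAIR_EL1, TTBR0/1_EL1, SCTLR_EL1, CPACR_EL1, VBAR_EL1, ...) and the initial PC/SP via setOneRegister (the KVM_SET_ONE_REG ioctl) before the vCPU runs for the first time.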
func (c *vCPU) initArchState() error { var ( reg kvmOneReg data uint64 regGet kvmOneReg dataGet uint64 ) reg.addr = uint64(reflect.ValueOf(&data).Pointer()) regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer()) vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2) if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), _KVM_ARM_VCPU_INIT, uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 { panic(fmt.Sprintf("error setting KVM_ARM_VCPU_INIT failed: %v", errno)) } // tcr_el1 data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS reg.id = _KVM_ARM64_REGS_TCR_EL1 if err := c.setOneRegister(®); err != nil { return err } // mair_el1 data = _MT_EL1_INIT reg.id = _KVM_ARM64_REGS_MAIR_EL1 if err := c.setOneRegister(®); err != nil { return err } // ttbr0_el1 data = c.machine.kernel.PageTables.TTBR0_EL1(false, 0) reg.id = _KVM_ARM64_REGS_TTBR0_EL1 if err := c.setOneRegister(®); err != nil { return err } c.SetTtbr0Kvm(uintptr(data)) // ttbr1_el1 data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0) reg.id = _KVM_ARM64_REGS_TTBR1_EL1 if err := c.setOneRegister(®); err != nil { return err } // cntkctl_el1 data = _CNTKCTL_EL1_DEFAULT reg.id = _KVM_ARM64_REGS_CNTKCTL_EL1 if err := c.setOneRegister(®); err != nil { return err } // cpacr_el1 data = 0 reg.id = _KVM_ARM64_REGS_CPACR_EL1 if err := c.setOneRegister(®); err != nil { return err } // sctlr_el1 data = _SCTLR_EL1_DEFAULT reg.id = _KVM_ARM64_REGS_SCTLR_EL1 if err := c.setOneRegister(®); err != nil { return err } // tpidr_el1 reg.id = _KVM_ARM64_REGS_TPIDR_EL1 data = uint64(reflect.ValueOf(&c.CPU).Pointer() | ring0.KernelStartAddress) if err := c.setOneRegister(®); err != nil { return err } // sp_el1 data = c.CPU.StackTop() reg.id = _KVM_ARM64_REGS_SP_EL1 if err := c.setOneRegister(®); err != nil { return err } // pc reg.id = _KVM_ARM64_REGS_PC data = uint64(ring0.AddrOfStart()) if err := c.setOneRegister(®); err != nil { return err } // vbar_el1 reg.id = _KVM_ARM64_REGS_VBAR_EL1 vectorLocation := ring0.AddrOfVectors() data = uint64(ring0.KernelStartAddress | vectorLocation) if err := c.setOneRegister(®); err != nil { return err } // Use the address of the exception vector table as // the MMIO address base. vectorLocationPhys, _, _ := translateToPhysical(vectorLocation) arm64HypercallMMIOBase = vectorLocationPhys // Initialize the PCID database. if hasGuestPCID { // Note that NewPCIDs may return a nil table here, in which // case we simply don't use PCID support (see below). In // practice, this should not happen, however. c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs) } return c.setSystemTime() } // setTSC sets the counter Virtual Offset. func (c *vCPU) setTSC(value uint64) error { var ( reg kvmOneReg data uint64 ) reg.addr = uint64(reflect.ValueOf(&data).Pointer()) reg.id = _KVM_ARM64_REGS_TIMER_CNT data = uint64(value) if err := c.setOneRegister(®); err != nil { return err } return nil } // getTSC gets the counter Physical Counter minus Virtual Offset. func (c *vCPU) getTSC() error { var ( reg kvmOneReg data uint64 ) reg.addr = uint64(reflect.ValueOf(&data).Pointer()) reg.id = _KVM_ARM64_REGS_TIMER_CNT if err := c.getOneRegister(®); err != nil { return err } return nil } // setSystemTime sets the vCPU to the system time. func (c *vCPU) setSystemTime() error { const minIterations = 10 minimum := uint64(0) for iter := 0; ; iter++ { // Use get the TSC to an estimate of where it will be // on the host during a "fast" system call iteration. 
// replace getTSC to another setOneRegister syscall can get more accurate value? start := uint64(ktime.Rdtsc()) if err := c.getTSC(); err != nil { return err } // See if this is our new minimum call time. Note that this // serves two functions: one, we make sure that we are // accurately predicting the offset we need to set. Second, we // don't want to do the final set on a slow call, which could // produce a really bad result. end := uint64(ktime.Rdtsc()) if end < start { continue // Totally bogus: unstable TSC? } current := end - start if current < minimum || iter == 0 { minimum = current // Set our new minimum. } // Is this past minIterations and within ~10% of minimum? upperThreshold := (((minimum << 3) + minimum) >> 3) if iter >= minIterations && (current <= upperThreshold || minimum < 50) { // Try to set the TSC if err := c.setTSC(end + (minimum / 2)); err != nil { return err } return nil } } } //go:nosplit func (c *vCPU) loadSegments(tid uint64) { // TODO(gvisor.dev/issue/1238): TLS is not supported. // Get TLS from tpidr_el0. c.tid.Store(tid) } func (c *vCPU) setOneRegister(reg *kvmOneReg) error { if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), _KVM_SET_ONE_REG, uintptr(unsafe.Pointer(reg))); errno != 0 { return fmt.Errorf("error setting one register: %v", errno) } return nil } func (c *vCPU) getOneRegister(reg *kvmOneReg) error { if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), _KVM_GET_ONE_REG, uintptr(unsafe.Pointer(reg))); errno != 0 { return fmt.Errorf("error getting one register: %v", errno) } return nil } // SwitchToUser unpacks architectural-details. func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) { // Check for canonical addresses. if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Pc) { return nonCanonical(regs.Pc, int32(unix.SIGSEGV), info) } else if !ring0.IsCanonical(regs.Sp) { return nonCanonical(regs.Sp, int32(unix.SIGSEGV), info) } // Assign PCIDs. if c.PCIDs != nil { var requireFlushPCID bool // Force a flush? switchOpts.UserASID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables) switchOpts.Flush = switchOpts.Flush || requireFlushPCID } var vector ring0.Vector ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0) c.SetTtbr0App(uintptr(ttbr0App)) // Full context-switch supporting for Arm64. // The Arm64 user-mode execution state consists of: // x0-x30 // PC, SP, PSTATE // V0-V31: 32 128-bit registers for floating point, and simd // FPSR, FPCR // TPIDR_EL0, used for TLS appRegs := switchOpts.Registers c.SetAppAddr(ring0.KernelStartAddress | uintptr(unsafe.Pointer(appRegs))) entersyscall() bluepill(c) vector = c.CPU.SwitchToUser(switchOpts) exitsyscall() switch vector { case ring0.Syscall: // Fast path: system call executed. return hostarch.NoAccess, nil case ring0.PageFault: return c.fault(int32(unix.SIGSEGV), info) case ring0.El0ErrNMI: return c.fault(int32(unix.SIGBUS), info) case ring0.Vector(bounce): // ring0.VirtualizationException. return hostarch.NoAccess, platform.ErrContextInterrupt case ring0.El0SyncUndef: return c.fault(int32(unix.SIGILL), info) case ring0.El0SyncDbg: *info = linux.SignalInfo{ Signo: int32(unix.SIGTRAP), Code: 1, // TRAP_BRKPT (breakpoint). } info.SetAddr(switchOpts.Registers.Pc) // Include address. return hostarch.AccessType{}, platform.ErrContextSignal case ring0.El0SyncSpPc: *info = linux.SignalInfo{ Signo: int32(unix.SIGBUS), Code: 2, // BUS_ADRERR (physical address does not exist). 
} return hostarch.NoAccess, platform.ErrContextSignal case ring0.El0SyncSys, ring0.El0SyncWfx: return hostarch.NoAccess, nil // skip for now. default: panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) } } //go:nosplit func seccompMmapSyscall(context unsafe.Pointer) (uintptr, uintptr, unix.Errno) { ctx := bluepillArchContext(context) // MAP_DENYWRITE is deprecated and ignored by kernel. We use it only for seccomp filters. addr, _, e := unix.RawSyscall6(uintptr(ctx.Regs[8]), uintptr(ctx.Regs[0]), uintptr(ctx.Regs[1]), uintptr(ctx.Regs[2]), uintptr(ctx.Regs[3])|unix.MAP_DENYWRITE, uintptr(ctx.Regs[4]), uintptr(ctx.Regs[5])) ctx.Regs[0] = uint64(addr) return addr, uintptr(ctx.Regs[1]), e } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/machine_unsafe.go000066400000000000000000000164121465435605700266710ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build go1.18 // +build go1.18 // //go:linkname directives type-checked by checklinkname. Any other // non-linkname assumptions outside the Go 1 compatibility guarantee should // have an accompanied vet check or version guard build tag. package kvm import ( "fmt" "math" "runtime" "sync/atomic" "syscall" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" ) //go:linkname entersyscall runtime.entersyscall func entersyscall() //go:linkname exitsyscall runtime.exitsyscall func exitsyscall() // setMemoryRegion initializes a region. // // This may be called from bluepillHandler, and therefore returns an errno // directly (instead of wrapping in an error) to avoid allocations. // //go:nosplit func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr, flags uint32) unix.Errno { userRegion := userMemoryRegion{ slot: uint32(slot), flags: uint32(flags), guestPhysAddr: uint64(physical), memorySize: uint64(length), userspaceAddr: uint64(virtual), } // Set the region. // Note: syscall.RawSyscall is used to fit the nosplit stack limit. _, _, errno := syscall.RawSyscall( unix.SYS_IOCTL, uintptr(m.fd), KVM_SET_USER_MEMORY_REGION, uintptr(unsafe.Pointer(&userRegion))) return errno } // mapRunData maps the vCPU run data. func mapRunData(fd int) (*runData, error) { r, _, errno := unix.RawSyscall6( unix.SYS_MMAP, 0, uintptr(runDataSize), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, uintptr(fd), 0) if errno != 0 { return nil, fmt.Errorf("error mapping runData: %v", errno) } return (*runData)(unsafe.Pointer(r)), nil } // unmapRunData unmaps the vCPU run data. func unmapRunData(r *runData) error { if _, _, errno := unix.RawSyscall( unix.SYS_MUNMAP, uintptr(unsafe.Pointer(r)), uintptr(runDataSize), 0); errno != 0 { return fmt.Errorf("error unmapping runData: %v", errno) } return nil } // atomicAddressSpace is an atomic address space pointer. type atomicAddressSpace struct { pointer unsafe.Pointer } // set sets the address space value. 
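// The pointer is stored and loaded atomically so that nosplit readers (see // get below) can observe it safely without taking any locks.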
// //go:nosplit func (a *atomicAddressSpace) set(as *addressSpace) { atomic.StorePointer(&a.pointer, unsafe.Pointer(as)) } // get gets the address space value. // // Note that this should be considered best-effort, and may have changed by the // time this function returns. // //go:nosplit func (a *atomicAddressSpace) get() *addressSpace { return (*addressSpace)(atomic.LoadPointer(&a.pointer)) } // notify notifies that the vCPU has transitioned modes. // // This may be called by a signal handler and therefore throws on error. // //go:nosplit func (c *vCPU) notify() { _, _, errno := unix.RawSyscall6( // escapes: no. unix.SYS_FUTEX, uintptr(unsafe.Pointer(&c.state)), linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG, math.MaxInt32, // Number of waiters. 0, 0, 0) if errno != 0 { throw("futex wake error") } } // waitUntilNot waits for the vCPU to transition modes. // // The state should have been previously set to vCPUWaiter after performing an // appropriate action to cause a transition (e.g. interrupt injection). // // This panics on error. func (c *vCPU) waitUntilNot(state uint32) { _, _, errno := unix.Syscall6( unix.SYS_FUTEX, uintptr(unsafe.Pointer(&c.state)), linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG, uintptr(state), 0, 0, 0) if errno != 0 && errno != unix.EINTR && errno != unix.EAGAIN { panic("futex wait error") } } // setSignalMask sets the vCPU signal mask. // // This must be called prior to running the vCPU. func (c *vCPU) setSignalMask() error { // The layout of this structure implies that it will not necessarily be // the same layout chosen by the Go compiler. It gets fudged here. var data struct { length uint32 mask1 uint32 mask2 uint32 _ uint32 } data.length = 8 // Fixed sigset size. data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) data.mask2 = ^uint32(bounceSignalMask >> 32) if _, _, errno := unix.RawSyscall( unix.SYS_IOCTL, uintptr(c.fd), KVM_SET_SIGNAL_MASK, uintptr(unsafe.Pointer(&data))); errno != 0 { return fmt.Errorf("error setting signal mask: %v", errno) } return nil } // seccompMmapHandlerCnt is a number of currently running seccompMmapHandler // instances. var seccompMmapHandlerCnt atomicbitops.Int64 // seccompMmapSync waits for all currently runnuing seccompMmapHandler // instances. // // The standard locking primitives can't be used in this case since // seccompMmapHandler is executed in a signal handler context. // // It can be implemented by using FUTEX calls, but it will require to call // FUTEX_WAKE from seccompMmapHandler. Consider machine.Destroy is called only // once, and the probability is racing with seccompMmapHandler is very low the // spinlock-like way looks more reasonable. func seccompMmapSync() { for seccompMmapHandlerCnt.Load() != 0 { runtime.Gosched() } } // seccompMmapHandler is a signal handler for runtime mmap system calls // that are trapped by seccomp. // // It executes the mmap syscall with specified arguments and maps a new region // to the guest. // //go:nosplit func seccompMmapHandler(context unsafe.Pointer) { mmapCallCounter.Increment() addr, length, errno := seccompMmapSyscall(context) if errno != 0 { return } seccompMmapHandlerCnt.Add(1) for i := uint32(0); i < machinePoolLen.Load(); i++ { m := machinePool[i].Load() if m == nil { continue } // Map the new region to the guest. vr := region{ virtual: addr, length: length, } for virtual := vr.virtual; virtual < vr.virtual+vr.length; { physical, length, ok := translateToPhysical(virtual) if !ok { // This must be an invalid region that was // knocked out by creation of the physical map. 
return } if virtual+length > vr.virtual+vr.length { // Cap the length to the end of the area. length = vr.virtual + vr.length - virtual } // Ensure the physical range is mapped. m.mapPhysical(physical, length, physicalRegions) virtual += length } } seccompMmapHandlerCnt.Add(-1) } // disableAsyncPreemption disables asynchronous preemption of go-routines. func disableAsyncPreemption() { set := linux.MakeSignalSet(linux.SIGURG) _, _, errno := unix.RawSyscall6(unix.SYS_RT_SIGPROCMASK, linux.SIG_BLOCK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0) if errno != 0 { panic(fmt.Sprintf("sigprocmask failed: %d", errno)) } } // enableAsyncPreemption enables asynchronous preemption of go-routines. func enableAsyncPreemption() { set := linux.MakeSignalSet(linux.SIGURG) _, _, errno := unix.RawSyscall6(unix.SYS_RT_SIGPROCMASK, linux.SIG_UNBLOCK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0) if errno != 0 { panic(fmt.Sprintf("sigprocmask failed: %d", errno)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/physical_map.go000066400000000000000000000165571465435605700264070ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm import ( "fmt" "sort" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/ring0" ) type region struct { virtual uintptr length uintptr } type physicalRegion struct { region physical uintptr readOnly bool } // physicalRegions contains a list of available physical regions. // // The physical value used in physicalRegions is a number indicating the // physical offset, aligned appropriately and starting above reservedMemory. var physicalRegions []physicalRegion // fillAddressSpace fills the host address space with PROT_NONE mappings until // we have a host address space size that is less than or equal to the physical // address space. This allows us to have an injective host virtual to guest // physical mapping. // // The excluded regions are returned. func fillAddressSpace() (excludedRegions []region) { // We can cut vSize in half, because the kernel will be using the top // half and we ignore it while constructing mappings. It's as if we've // already excluded half the possible addresses. vSize := ring0.UserspaceSize // We exclude reservedMemory below from our physical memory size, so it // needs to be dropped here as well. Otherwise, we could end up with // physical addresses that are beyond what is mapped. pSize := uintptr(1) << ring0.PhysicalAddressBits pSize -= reservedMemory // Add specifically excluded regions; see excludeVirtualRegion. if err := applyVirtualRegions(func(vr virtualRegion) { if excludeVirtualRegion(vr) { excludedRegions = append(excludedRegions, vr.region) vSize -= vr.length log.Infof("excluded: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length) } }); err != nil { panic(fmt.Sprintf("error parsing /proc/self/maps: %v", err)) } // Do we need any more work? 
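// If the remaining virtual span already fits within the physical address // space, an injective host-virtual to guest-physical mapping exists as-is and // no filler mappings are required.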
if vSize < pSize { return excludedRegions } // Calculate the required space and fill it. // // Note carefully that we add faultBlockSize to required up front, and // on each iteration of the loop below (i.e. each new physical region // we define), we add faultBlockSize again. This is done because the // computation of physical regions will ensure proper alignments with // faultBlockSize, potentially causing up to faultBlockSize bytes in // internal fragmentation for each physical region. So we need to // account for this properly during allocation. requiredAddr, ok := hostarch.Addr(vSize - pSize + faultBlockSize).RoundUp() if !ok { panic(fmt.Sprintf( "overflow for vSize (%x) - pSize (%x) + faultBlockSize (%x)", vSize, pSize, faultBlockSize)) } required := uintptr(requiredAddr) current := required // Attempted mmap size. for filled := uintptr(0); filled < required && current > 0; { addr, _, errno := unix.RawSyscall6( unix.SYS_MMAP, 0, // Suggested address. current, unix.PROT_NONE, unix.MAP_ANONYMOUS|unix.MAP_PRIVATE|unix.MAP_NORESERVE, 0, 0) if errno != 0 { // One page is the smallest mapping that can be allocated. if current == hostarch.PageSize { current = 0 break } // Attempt half the size; overflow not possible. currentAddr, _ := hostarch.Addr(current >> 1).RoundUp() current = uintptr(currentAddr) continue } // We filled a block. filled += current // Check whether a new region is merged with a previous one. for i := range excludedRegions { if excludedRegions[i].virtual == addr+current { excludedRegions[i].virtual = addr excludedRegions[i].length += current addr = 0 break } if excludedRegions[i].virtual+excludedRegions[i].length == addr { excludedRegions[i].length += current addr = 0 break } } if addr != 0 { excludedRegions = append(excludedRegions, region{ virtual: addr, length: current, }) // See comment above. if filled != required { required += faultBlockSize } } } if current == 0 { panic("filling address space failed") } sort.Slice(excludedRegions, func(i, j int) bool { return excludedRegions[i].virtual < excludedRegions[j].virtual }) for _, r := range excludedRegions { log.Infof("region: virtual [%x,%x)", r.virtual, r.virtual+r.length) } return excludedRegions } // computePhysicalRegions computes physical regions. func computePhysicalRegions(excludedRegions []region) (physicalRegions []physicalRegion) { physical := uintptr(reservedMemory) addValidRegion := func(virtual, length uintptr) { if length == 0 { return } if virtual == 0 { virtual += hostarch.PageSize length -= hostarch.PageSize } if end := virtual + length; end > ring0.MaximumUserAddress { length -= (end - ring0.MaximumUserAddress) } if length == 0 { return } // Round physical up to the same alignment as the virtual // address (with respect to faultBlockSize). if offset := virtual &^ faultBlockMask; physical&^faultBlockMask != offset { if newPhysical := (physical & faultBlockMask) + offset; newPhysical > physical { physical = newPhysical // Round up by only a little bit. } else { physical = ((physical + faultBlockSize) & faultBlockMask) + offset } } physicalRegions = append(physicalRegions, physicalRegion{ region: region{ virtual: virtual, length: length, }, physical: physical, }) physical += length } lastExcludedEnd := uintptr(0) for _, r := range excludedRegions { addValidRegion(lastExcludedEnd, r.virtual-lastExcludedEnd) lastExcludedEnd = r.virtual + r.length } addValidRegion(lastExcludedEnd, ring0.MaximumUserAddress-lastExcludedEnd) // Do arch-specific actions on physical regions. 
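// (On architectures that need it, this hook may, for example, split regions // or flag some of them read-only; see the readOnly field on physicalRegion.)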
physicalRegions = archPhysicalRegions(physicalRegions) // Dump our all physical regions. for _, r := range physicalRegions { log.Infof("physicalRegion: virtual [%x,%x) => physical [%x,%x)", r.virtual, r.virtual+r.length, r.physical, r.physical+r.length) } return physicalRegions } // physicalInit initializes physical address mappings. func physicalInit() { physicalRegions = computePhysicalRegions(fillAddressSpace()) } // applyPhysicalRegions applies the given function on physical regions. // // Iteration continues as long as true is returned. The return value is the // return from the last call to fn, or true if there are no entries. // // Precondition: physicalInit must have been called. func applyPhysicalRegions(fn func(pr physicalRegion) bool) bool { for _, pr := range physicalRegions { if !fn(pr) { return false } } return true } // translateToPhysical translates the given virtual address. // // Precondition: physicalInit must have been called. // //go:nosplit func translateToPhysical(virtual uintptr) (physical uintptr, length uintptr, ok bool) { for _, pr := range physicalRegions { if pr.virtual <= virtual && virtual < pr.virtual+pr.length { physical = pr.physical + (virtual - pr.virtual) length = pr.length - (virtual - pr.virtual) ok = true return } } return } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/physical_map_amd64.go000066400000000000000000000015011465435605700273610ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm const ( // reservedMemory is a chunk of physical memory reserved starting at // physical address zero. There are some special pages in this region, // so we just call the whole thing off. reservedMemory = 0x100000000 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/physical_map_arm64.go000066400000000000000000000012001465435605700273730ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm const ( reservedMemory = 0 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/kvm/virtual_map.go000066400000000000000000000055071465435605700262520ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kvm import ( "bufio" "fmt" "io" "os" "regexp" "strconv" "gvisor.dev/gvisor/pkg/hostarch" ) type virtualRegion struct { region accessType hostarch.AccessType shared bool offset uintptr filename string } // mapsLine matches a single line from /proc/PID/maps. var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2,3}:[0-9a-f]{2,} [0-9]+\\s+(.*)") // excludeRegion returns true if these regions should be excluded from the // physical map. Virtual regions need to be excluded if get_user_pages will // fail on those addresses, preventing KVM from satisfying EPT faults. // // This is called by the physical map functions, not applyVirtualRegions. func excludeVirtualRegion(r virtualRegion) bool { return false } // applyVirtualRegions parses the process maps file. // // Unlike mappedRegions, these are not consistent over time. func applyVirtualRegions(fn func(vr virtualRegion)) error { // Open /proc/self/maps. f, err := os.Open("/proc/self/maps") if err != nil { return err } defer f.Close() // Parse all entries. r := bufio.NewReader(f) for { b, err := r.ReadBytes('\n') if b != nil && len(b) > 0 { m := mapsLine.FindSubmatch(b) if m == nil { // This should not happen: kernel bug? return fmt.Errorf("badly formed line: %v", string(b)) } start, err := strconv.ParseUint(string(m[1]), 16, 64) if err != nil { return fmt.Errorf("bad start address: %v", string(b)) } end, err := strconv.ParseUint(string(m[2]), 16, 64) if err != nil { return fmt.Errorf("bad end address: %v", string(b)) } read := m[3][0] == 'r' write := m[3][1] == 'w' execute := m[3][2] == 'x' shared := m[3][3] == 's' offset, err := strconv.ParseUint(string(m[4]), 16, 64) if err != nil { return fmt.Errorf("bad offset: %v", string(b)) } fn(virtualRegion{ region: region{ virtual: uintptr(start), length: uintptr(end - start), }, accessType: hostarch.AccessType{ Read: read, Write: write, Execute: execute, }, shared: shared, offset: uintptr(offset), filename: string(m[5]), }) } if err != nil && err == io.EOF { break } else if err != nil { return err } } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/mmap_min_addr.go000066400000000000000000000034601465435605700257150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package platform import ( "fmt" "io/ioutil" "strconv" "strings" "gvisor.dev/gvisor/pkg/hostarch" ) // systemMMapMinAddrSource is the source file. const systemMMapMinAddrSource = "/proc/sys/vm/mmap_min_addr" // systemMMapMinAddr is the system's minimum map address. 
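// It is populated once, at package init() time, from /proc/sys/vm/mmap_min_addr.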
var systemMMapMinAddr uint64 // SystemMMapMinAddr returns the minimum system address. func SystemMMapMinAddr() hostarch.Addr { return hostarch.Addr(systemMMapMinAddr) } // MMapMinAddr is a size zero struct that implements MinUserAddress based on // the system minimum address. It is suitable for embedding in platforms that // rely on the system mmap, and thus require the system minimum. type MMapMinAddr struct { } // MinUserAddress implements platform.MinUserAddresss. func (*MMapMinAddr) MinUserAddress() hostarch.Addr { return SystemMMapMinAddr() } func init() { // Open the source file. b, err := ioutil.ReadFile(systemMMapMinAddrSource) if err != nil { panic(fmt.Sprintf("couldn't open %s: %v", systemMMapMinAddrSource, err)) } // Parse the result. systemMMapMinAddr, err = strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64) if err != nil { panic(fmt.Sprintf("couldn't parse %s from %s: %v", string(b), systemMMapMinAddrSource, err)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/platform.go000066400000000000000000000547431465435605700247640ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package platform provides a Platform abstraction. // // See Platform for more information. package platform import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/seccomp/precompiledseccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/hostmm" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/usermem" ) // Platform provides abstractions for execution contexts (Context, // AddressSpace). type Platform interface { // SupportsAddressSpaceIO returns true if AddressSpaces returned by this // Platform support AddressSpaceIO methods. // // The value returned by SupportsAddressSpaceIO is guaranteed to remain // unchanged over the lifetime of the Platform. SupportsAddressSpaceIO() bool // CooperativelySchedulesAddressSpace returns true if the Platform has a // limited number of AddressSpaces, such that mm.MemoryManager.Deactivate // should call AddressSpace.Release when there are no goroutines that // require the mm.MemoryManager to have an active AddressSpace. // // The value returned by CooperativelySchedulesAddressSpace is guaranteed // to remain unchanged over the lifetime of the Platform. CooperativelySchedulesAddressSpace() bool // DetectsCPUPreemption returns true if Contexts returned by the Platform // can reliably return ErrContextCPUPreempted. DetectsCPUPreemption() bool // HaveGlobalMemoryBarrier returns true if the GlobalMemoryBarrier method // is supported. HaveGlobalMemoryBarrier() bool // OwnsPageTables returns true if the Platform implementation manages any // page tables directly (rather than via host mmap(2) etc.) 
As of this // writing, this property is relevant because the AddressSpace interface // does not support specification of memory type (cacheability), such that // host FDs specifying memory types (e.g. device drivers) can only set them // correctly in host-managed page tables. OwnsPageTables() bool // MapUnit returns the alignment used for optional mappings into this // platform's AddressSpaces. Higher values indicate lower per-page costs // for AddressSpace.MapFile. As a special case, a MapUnit of 0 indicates // that the cost of AddressSpace.MapFile is effectively independent of the // number of pages mapped. If MapUnit is non-zero, it must be a power-of-2 // multiple of hostarch.PageSize. MapUnit() uint64 // MinUserAddress returns the minimum mappable address on this // platform. MinUserAddress() hostarch.Addr // MaxUserAddress returns the maximum mappable address on this // platform. MaxUserAddress() hostarch.Addr // NewAddressSpace returns a new memory context for this platform. // // If mappingsID is not nil, the platform may assume that (1) all calls // to NewAddressSpace with the same mappingsID represent the same // (mutable) set of mappings, and (2) the set of mappings has not // changed since the last time AddressSpace.Release was called on an // AddressSpace returned by a call to NewAddressSpace with the same // mappingsID. // // If a new AddressSpace cannot be created immediately, a nil // AddressSpace is returned, along with channel that is closed when // the caller should retry a call to NewAddressSpace. // // In general, this blocking behavior only occurs when // CooperativelySchedulesAddressSpace (above) returns false. NewAddressSpace(mappingsID any) (AddressSpace, <-chan struct{}, error) // NewContext returns a new execution context. NewContext(context.Context) Context // PreemptAllCPUs causes all concurrent calls to Context.Switch(), as well // as the first following call to Context.Switch() for each Context, to // return ErrContextCPUPreempted. // // PreemptAllCPUs is only supported if DetectsCPUPremption() == true. // Platforms for which this does not hold may panic if PreemptAllCPUs is // called. PreemptAllCPUs() error // GlobalMemoryBarrier blocks until all threads running application code // (via Context.Switch) and all task goroutines "have passed through a // state where all memory accesses to user-space addresses match program // order between entry to and return from [GlobalMemoryBarrier]", as for // membarrier(2). // // Preconditions: HaveGlobalMemoryBarrier() == true. GlobalMemoryBarrier() error // SeccompInfo returns seccomp-related information about this platform. SeccompInfo() SeccompInfo } // NoCPUPreemptionDetection implements Platform.DetectsCPUPreemption and // dependent methods for Platforms that do not support this feature. type NoCPUPreemptionDetection struct{} // DetectsCPUPreemption implements Platform.DetectsCPUPreemption. func (NoCPUPreemptionDetection) DetectsCPUPreemption() bool { return false } // PreemptAllCPUs implements Platform.PreemptAllCPUs. func (NoCPUPreemptionDetection) PreemptAllCPUs() error { panic("This platform does not support CPU preemption detection") } // UseHostGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and // Platform.GlobalMemoryBarrier by invoking equivalent functionality on the // host. type UseHostGlobalMemoryBarrier struct{} // HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier. 
func (UseHostGlobalMemoryBarrier) HaveGlobalMemoryBarrier() bool { return hostmm.HaveGlobalMemoryBarrier() } // GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier. func (UseHostGlobalMemoryBarrier) GlobalMemoryBarrier() error { return hostmm.GlobalMemoryBarrier() } // UseHostProcessMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and // Platform.GlobalMemoryBarrier by invoking a process-local memory barrier. // This is faster than UseHostGlobalMemoryBarrier, but is only appropriate for // platforms for which application code executes while using the sentry's // mm_struct. type UseHostProcessMemoryBarrier struct{} // HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier. func (UseHostProcessMemoryBarrier) HaveGlobalMemoryBarrier() bool { // Fall back to a global memory barrier if a process-local one isn't // available. return hostmm.HaveProcessMemoryBarrier() || hostmm.HaveGlobalMemoryBarrier() } // GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier. func (UseHostProcessMemoryBarrier) GlobalMemoryBarrier() error { if hostmm.HaveProcessMemoryBarrier() { return hostmm.ProcessMemoryBarrier() } return hostmm.GlobalMemoryBarrier() } // DoesOwnPageTables implements Platform.OwnsPageTables in the positive. type DoesOwnPageTables struct{} // OwnsPageTables implements Platform.OwnsPageTables. func (DoesOwnPageTables) OwnsPageTables() bool { return true } // DoesNotOwnPageTables implements Platform.OwnsPageTables in the negative. type DoesNotOwnPageTables struct{} // OwnsPageTables implements Platform.OwnsPageTables. func (DoesNotOwnPageTables) OwnsPageTables() bool { return false } // MemoryManager represents an abstraction above the platform address space // which manages memory mappings and their contents. type MemoryManager interface { //usermem.IO provides access to the contents of a virtual memory space. usermem.IO // MMap establishes a memory mapping. MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error) // AddressSpace returns the AddressSpace bound to mm. AddressSpace() AddressSpace // FindVMAByName finds a vma with the specified name. FindVMAByName(ar hostarch.AddrRange, hint string) (hostarch.Addr, uint64, error) } // Context represents the execution context for a single thread. type Context interface { // Switch resumes execution of the thread specified by the arch.Context64 // in the provided address space. This call will block while the thread // is executing. // // If cpu is non-negative, and it is not the number of the CPU that the // thread executes on, Context should return ErrContextCPUPreempted. cpu // can only be non-negative if Platform.DetectsCPUPreemption() is true; // Contexts from Platforms for which this does not hold may ignore cpu, or // panic if cpu is non-negative. // // Switch may return one of the following special errors: // // - nil: The Context invoked a system call. // // - ErrContextSignal: The Context was interrupted by a signal. The // returned *linux.SignalInfo contains information about the signal. If // linux.SignalInfo.Signo == SIGSEGV, the returned hostarch.AccessType // contains the access type of the triggering fault. The caller owns // the returned SignalInfo. // // - ErrContextInterrupt: The Context was interrupted by a call to // Interrupt(). Switch() may return ErrContextInterrupt spuriously. In // particular, most implementations of Interrupt() will cause the first // following call to Switch() to return ErrContextInterrupt if there is no // concurrent call to Switch(). 
// // - ErrContextCPUPreempted: See the definition of that error for details. Switch(ctx context.Context, mm MemoryManager, ac *arch.Context64, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error) // PullFullState() pulls a full state of the application thread. // // A platform can support lazy loading/restoring of a thread state // which includes registers and a floating point state. // // For example, when the Sentry handles a system call, it may have only // syscall arguments without other registers and a floating point // state. And in this case, if the Sentry will need to construct a // signal frame to call a signal handler, it will need to call // PullFullState() to load all registers and FPU state. // // Preconditions: The caller must be running on the task goroutine. PullFullState(as AddressSpace, ac *arch.Context64) error // FullStateChanged() indicates that a thread state has been changed by // the Sentry. This happens in case of the rt_sigreturn, execve, etc. // // First, it indicates that the Sentry has the full state of the thread // and PullFullState() has to do nothing if it is called after // FullStateChanged(). // // Second, it forces restoring the full state of the application // thread. A platform can support lazy loading/restoring of a thread // state. This means that if the Sentry has not changed a thread state, // the platform may not restore it. // // Preconditions: The caller must be running on the task goroutine. FullStateChanged() // Interrupt interrupts a concurrent call to Switch(), causing it to return // ErrContextInterrupt. Interrupt() // Release() releases any resources associated with this context. Release() // PrepareSleep() is called when the tread switches to the // interruptible sleep state. PrepareSleep() } // ContextError is one of the possible errors returned by Context.Switch(). type ContextError struct { // Err is the underlying error. Err error // Errno is an approximation of what type of error this is supposed to // be as defined by the linux errnos. Errno unix.Errno } func (e *ContextError) Error() string { return e.Err.Error() } var ( // ErrContextSignal is returned by Context.Switch() to indicate that the // Context was interrupted by a signal. ErrContextSignal = fmt.Errorf("interrupted by signal") // ErrContextInterrupt is returned by Context.Switch() to indicate that the // Context was interrupted by a call to Context.Interrupt(). ErrContextInterrupt = fmt.Errorf("interrupted by platform.Context.Interrupt()") // ErrContextCPUPreempted is returned by Context.Switch() to indicate that // one of the following occurred: // // - The CPU executing the Context is not the CPU passed to // Context.Switch(). // // - The CPU executing the Context may have executed another Context since // the last time it executed this one; or the CPU has previously executed // another Context, and has never executed this one. // // - Platform.PreemptAllCPUs() was called since the last return from // Context.Switch(). ErrContextCPUPreempted = fmt.Errorf("interrupted by CPU preemption") ) // SignalInterrupt is a signal reserved for use by implementations of // Context.Interrupt(). The sentry guarantees that it will ignore delivery of // this signal both to Contexts and to the sentry itself, under the assumption // that they originate from races with Context.Interrupt(). // // NOTE(b/23420492): The Go runtime only guarantees that a small subset // of signals will be always be unblocked on all threads, one of which // is SIGCHLD. 
const SignalInterrupt = linux.SIGCHLD // AddressSpace represents a virtual address space in which a Context can // execute. type AddressSpace interface { // MapFile creates a shared mapping of offsets fr from f at address addr. // Any existing overlapping mappings are silently replaced. // // If precommit is true, the platform should eagerly commit resources (e.g. // physical memory) to the mapping. The precommit flag is advisory and // implementations may choose to ignore it. // // Preconditions: // * addr and fr must be page-aligned. // * fr.Length() > 0. // * at.Any() == true. // * At least one reference must be held on all pages in fr, and must // continue to be held as long as pages are mapped. MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error // Unmap unmaps the given range. // // Preconditions: // * addr is page-aligned. // * length > 0. Unmap(addr hostarch.Addr, length uint64) // Release releases this address space. After releasing, a new AddressSpace // must be acquired via platform.NewAddressSpace(). Release() // PreFork() is called before creating a copy of AddressSpace. This // guarantees that this address space will be in a consistent state. PreFork() // PostFork() is called after creating a copy of AddressSpace. PostFork() // AddressSpaceIO methods are supported iff the associated platform's // Platform.SupportsAddressSpaceIO() == true. AddressSpaces for which this // does not hold may panic if AddressSpaceIO methods are invoked. AddressSpaceIO } // AddressSpaceIO supports IO through the memory mappings installed in an // AddressSpace. // // AddressSpaceIO implementors are responsible for ensuring that address ranges // are application-mappable. type AddressSpaceIO interface { // CopyOut copies len(src) bytes from src to the memory mapped at addr. It // returns the number of bytes copied. If the number of bytes copied is < // len(src), it returns a non-nil error explaining why. CopyOut(addr hostarch.Addr, src []byte) (int, error) // CopyIn copies len(dst) bytes from the memory mapped at addr to dst. // It returns the number of bytes copied. If the number of bytes copied is // < len(dst), it returns a non-nil error explaining why. CopyIn(addr hostarch.Addr, dst []byte) (int, error) // ZeroOut sets toZero bytes to 0, starting at addr. It returns the number // of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a // non-nil error explaining why. ZeroOut(addr hostarch.Addr, toZero uintptr) (uintptr, error) // SwapUint32 atomically sets the uint32 value at addr to new and returns // the previous value. // // Preconditions: addr must be aligned to a 4-byte boundary. SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) // CompareAndSwapUint32 atomically compares the uint32 value at addr to // old; if they are equal, the value in memory is replaced by new. In // either case, the previous value stored in memory is returned. // // Preconditions: addr must be aligned to a 4-byte boundary. CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) // LoadUint32 atomically loads the uint32 value at addr and returns it. // // Preconditions: addr must be aligned to a 4-byte boundary. LoadUint32(addr hostarch.Addr) (uint32, error) } // NoAddressSpaceIO implements AddressSpaceIO methods by panicking. type NoAddressSpaceIO struct{} // CopyOut implements AddressSpaceIO.CopyOut. 
func (NoAddressSpaceIO) CopyOut(addr hostarch.Addr, src []byte) (int, error) { panic("This platform does not support AddressSpaceIO") } // CopyIn implements AddressSpaceIO.CopyIn. func (NoAddressSpaceIO) CopyIn(addr hostarch.Addr, dst []byte) (int, error) { panic("This platform does not support AddressSpaceIO") } // ZeroOut implements AddressSpaceIO.ZeroOut. func (NoAddressSpaceIO) ZeroOut(addr hostarch.Addr, toZero uintptr) (uintptr, error) { panic("This platform does not support AddressSpaceIO") } // SwapUint32 implements AddressSpaceIO.SwapUint32. func (NoAddressSpaceIO) SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) { panic("This platform does not support AddressSpaceIO") } // CompareAndSwapUint32 implements AddressSpaceIO.CompareAndSwapUint32. func (NoAddressSpaceIO) CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) { panic("This platform does not support AddressSpaceIO") } // LoadUint32 implements AddressSpaceIO.LoadUint32. func (NoAddressSpaceIO) LoadUint32(addr hostarch.Addr) (uint32, error) { panic("This platform does not support AddressSpaceIO") } // SegmentationFault is an error returned by AddressSpaceIO methods when IO // fails due to access of an unmapped page, or a mapped page with insufficient // permissions. type SegmentationFault struct { // Addr is the address at which the fault occurred. Addr hostarch.Addr } // Error implements error.Error. func (f SegmentationFault) Error() string { return fmt.Sprintf("segmentation fault at %#x", f.Addr) } // Requirements is used to specify platform specific requirements. type Requirements struct { // RequiresCurrentPIDNS indicates that the sandbox has to be started in the // current pid namespace. RequiresCurrentPIDNS bool // RequiresCapSysPtrace indicates that the sandbox has to be started with // the CAP_SYS_PTRACE capability. RequiresCapSysPtrace bool } // SeccompInfo represents seccomp-bpf data for a given platform. type SeccompInfo interface { // Variables returns a map from named variables to the value they should // have with the platform as currently initialized. // Variables are known only at runtime, but are not part of a platform's // configuration. For example, the KVM platform having an FD representing // the KVM VM is a variable: it is only known at runtime, but does not // change the structure of the syscall rules. // The set of variable names must be static regardless of platform // configuration. Variables() precompiledseccomp.Values // ConfigKey returns a string that uniquely represents the set of // configuration information from which syscall rules are derived, // other than variables or CPU architecture. // This should at least contain the platform name. // If syscall rules are dependent on the platform's configuration, // this should return a string that encapsulates the values of these // configuration options. // For example, if some option of the platform causes it to require a // new syscall to be allowed, this option should be part of this string. ConfigKey() string // SyscallFilters returns syscalls made exclusively by this platform. // `vars` maps variable names (as returned by `Variables()`) to values, // and **the rules should depend on `vars`**. These will not necessarily // map to the result of calling `Variables()` on the current `SeccompInfo`; // during seccomp rule precompilation, these will be set to placeholder // values. 
SyscallFilters(vars precompiledseccomp.Values) seccomp.SyscallRules // HottestSyscalls returns the list of syscall numbers that this platform // calls most often, most-frequently-called first. No more than a dozen // syscalls. Returning an empty or a nil slice is OK. // This is used to produce a more efficient seccomp-bpf program that can // check for the most frequently called syscalls first. // What matters here is only the frequency at which a syscall is called, // not the total amount of CPU time that is used to process it in the host // kernel. HottestSyscalls() []uintptr } // StaticSeccompInfo implements `SeccompInfo` for platforms which don't have // any configuration or variables. type StaticSeccompInfo struct { // PlatformName is the platform name. PlatformName string // Filters is the platform's syscall filters. Filters seccomp.SyscallRules // HotSyscalls is the list of syscalls numbers that this platform // calls most often, most-frequently-called first. // See `SeccompInfo.HottestSyscalls` for more. HotSyscalls []uintptr } // Variables implements `SeccompInfo.Variables`. func (StaticSeccompInfo) Variables() precompiledseccomp.Values { return nil } // ConfigKey implements `SeccompInfo.ConfigKey`. func (s StaticSeccompInfo) ConfigKey() string { return s.PlatformName } // SyscallFilters implements `SeccompInfo.SyscallFilters`. func (s StaticSeccompInfo) SyscallFilters(precompiledseccomp.Values) seccomp.SyscallRules { return s.Filters } // HottestSyscalls implements `SeccompInfo.HottestSyscalls`. func (s StaticSeccompInfo) HottestSyscalls() []uintptr { return s.HotSyscalls } // Constructor represents a platform type. type Constructor interface { // New returns a new platform instance. // // Arguments: // // * deviceFile - the device file (e.g. /dev/kvm for the KVM platform). New(deviceFile *fd.FD) (Platform, error) // OpenDevice opens the path to the device used by the platform. // Passing in an empty string will use the default path for the device, // e.g. "/dev/kvm" for the KVM platform. OpenDevice(devicePath string) (*fd.FD, error) // Requirements returns platform specific requirements. Requirements() Requirements // PrecompiledSeccompInfo returns a list of `SeccompInfo`s that is // useful to precompile into the Sentry. PrecompiledSeccompInfo() []SeccompInfo } // platforms contains all available platform types. var platforms = map[string]Constructor{} // Register registers a new platform type. func Register(name string, platform Constructor) { platforms[name] = platform } // List lists available platforms. func List() (available []string) { for name := range platforms { available = append(available, name) } return } // Lookup looks up the platform constructor by name. func Lookup(name string) (Constructor, error) { p, ok := platforms[name] if !ok { return nil, fmt.Errorf("unknown platform: %v", name) } return p, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/platform_amd64_state_autogen.go000066400000000000000000000001341465435605700306620ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 // +build amd64 package platform golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/platform_arm64_state_autogen.go000066400000000000000000000001341465435605700307000ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build arm64 // +build arm64 package platform golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/platform_state_autogen.go000066400000000000000000000000721465435605700276700ustar00rootroot00000000000000// automatically generated by stateify. package platform golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/000077500000000000000000000000001465435605700240525ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/filters.go000066400000000000000000000024551465435605700260570ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ptrace import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/platform" ) // SeccompInfo returns seccomp information for the ptrace platform. func (*PTrace) SeccompInfo() platform.SeccompInfo { return platform.StaticSeccompInfo{ PlatformName: "ptrace", Filters: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_PTRACE: seccomp.MatchAll{}, unix.SYS_TGKILL: seccomp.MatchAll{}, unix.SYS_WAIT4: seccomp.MatchAll{}, }), } } // PrecompiledSeccompInfo implements // platform.Constructor.PrecompiledSeccompInfo. func (*constructor) PrecompiledSeccompInfo() []platform.SeccompInfo { return []platform.SeccompInfo{(*PTrace)(nil).SeccompInfo()} } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/ptrace.go000066400000000000000000000206341465435605700256640ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package ptrace provides a ptrace-based implementation of the platform // interface. This is useful for development and testing purposes primarily, // and runs on stock kernels without special permissions. // // In a nutshell, it works as follows: // // The creation of a new address space creates a new child process with a single // thread which is traced by a single goroutine. // // A context is just a collection of temporary variables. Calling Switch on a // context does the following: // // Locks the runtime thread. // // Looks up a traced subprocess thread for the current runtime thread. If // none exists, the dedicated goroutine is asked to create a new stopped // thread in the subprocess. This stopped subprocess thread is then traced // by the current thread and this information is stored for subsequent // switches. 
// // The context is then bound with information about the subprocess thread // so that the context may be appropriately interrupted via a signal. // // The requested operation is performed in the traced subprocess thread // (e.g. set registers, execute, return). // // Lock order: // // subprocess.mu // context.mu package ptrace import ( "gvisor.dev/gvisor/pkg/abi/linux" pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" "gvisor.dev/gvisor/pkg/sync" ) var ( // stubStart is the link address for our stub, and determines the // maximum user address. This is valid only after a call to stubInit. // // We attempt to link the stub here, and adjust downward as needed. stubStart uintptr = stubInitAddress // stubEnd is the first byte past the end of the stub, as with // stubStart this is valid only after a call to stubInit. stubEnd uintptr // stubInitialized controls one-time stub initialization. stubInitialized sync.Once ) type context struct { archContext // signalInfo is the signal info, if and when a signal is received. signalInfo linux.SignalInfo // interrupt is the interrupt context. interrupt interrupt.Forwarder // mu protects the following fields. mu sync.Mutex // If lastFaultSP is non-nil, the last context switch was due to a fault // received while executing lastFaultSP. Only context.Switch may set // lastFaultSP to a non-nil value. lastFaultSP *subprocess // lastFaultAddr is the last faulting address; this is only meaningful if // lastFaultSP is non-nil. lastFaultAddr hostarch.Addr // lastFaultIP is the address of the last faulting instruction; // this is also only meaningful if lastFaultSP is non-nil. lastFaultIP hostarch.Addr } // NewContext implements platform.Platform.NewContext. func (*PTrace) NewContext(ctx pkgcontext.Context) platform.Context { c := new(context) c.archContext.init(ctx) return c } // Switch runs the provided context in the given address space. func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac *arch.Context64, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error) { as := mm.AddressSpace() s := as.(*subprocess) restart: isSyscall := s.switchToApp(c, ac) var ( faultSP *subprocess faultAddr hostarch.Addr faultIP hostarch.Addr ) if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV { faultSP = s faultAddr = hostarch.Addr(c.signalInfo.Addr()) faultIP = hostarch.Addr(ac.IP()) } // Update the context to reflect the outcome of this context switch. c.mu.Lock() lastFaultSP := c.lastFaultSP lastFaultAddr := c.lastFaultAddr lastFaultIP := c.lastFaultIP // At this point, c may not yet be in s.contexts, so c.lastFaultSP won't be // updated by s.Unmap(). This is fine; we only need to synchronize with // calls to s.Unmap() that occur after the handling of this fault. c.lastFaultSP = faultSP c.lastFaultAddr = faultAddr c.lastFaultIP = faultIP c.mu.Unlock() // Update subprocesses to reflect the outcome of this context switch. if lastFaultSP != faultSP { if lastFaultSP != nil { lastFaultSP.mu.Lock() delete(lastFaultSP.contexts, c) lastFaultSP.mu.Unlock() } if faultSP != nil { faultSP.mu.Lock() faultSP.contexts[c] = struct{}{} faultSP.mu.Unlock() } } if isSyscall { return nil, hostarch.NoAccess, nil } si := c.signalInfo if faultSP == nil { // Non-fault signal. 
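// There is no fault address to classify, so report it to the sentry as a // plain signal.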
return &si, hostarch.NoAccess, platform.ErrContextSignal } // See if this can be handled as a CPUID instruction. if linux.Signal(si.Signo) == linux.SIGSEGV && platform.TryCPUIDEmulate(ctx, mm, ac) { goto restart } // Got a page fault. Ideally, we'd get real fault type here, but ptrace // doesn't expose this information. Instead, we use a simple heuristic: // // It was an instruction fault iff the faulting addr == instruction // pointer. // // It was a write fault if the fault is immediately repeated. at := hostarch.Read if faultAddr == faultIP { at.Execute = true } if lastFaultSP == faultSP && lastFaultAddr == faultAddr && lastFaultIP == faultIP { at.Write = true } // Handle as a signal. return &si, at, platform.ErrContextSignal } // Interrupt interrupts the running guest application associated with this context. func (c *context) Interrupt() { c.interrupt.NotifyInterrupt() } // Release implements platform.Context.Release(). func (c *context) Release() {} // FullStateChanged implements platform.Context.FullStateChanged. func (c *context) FullStateChanged() {} // PullFullState implements platform.Context.PullFullState. func (c *context) PullFullState(as platform.AddressSpace, ac *arch.Context64) error { return nil } // PrepareSleep implements platform.Context.platform.PrepareSleep. func (*context) PrepareSleep() {} // PTrace represents a collection of ptrace subprocesses. type PTrace struct { platform.MMapMinAddr platform.NoCPUPreemptionDetection platform.UseHostGlobalMemoryBarrier platform.DoesNotOwnPageTables } // New returns a new ptrace-based implementation of the platform interface. func New() (*PTrace, error) { stubInitialized.Do(func() { // Initialize the stub. stubInit() // Create the master process for the global pool. This must be // done before initializing any other processes. master, err := newSubprocess(createStub) if err != nil { // Should never happen. panic("unable to initialize ptrace master: " + err.Error()) } // Set the master on the globalPool. globalPool.master = master }) return &PTrace{}, nil } // SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. func (*PTrace) SupportsAddressSpaceIO() bool { return false } // CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace. func (*PTrace) CooperativelySchedulesAddressSpace() bool { return false } // MapUnit implements platform.Platform.MapUnit. func (*PTrace) MapUnit() uint64 { // The host kernel manages page tables and arbitrary-sized mappings // have effectively the same cost. return 0 } // MaxUserAddress returns the first address that may not be used by user // applications. func (*PTrace) MaxUserAddress() hostarch.Addr { return hostarch.Addr(stubStart) } // NewAddressSpace returns a new subprocess. func (p *PTrace) NewAddressSpace(any) (platform.AddressSpace, <-chan struct{}, error) { as, err := newSubprocess(globalPool.master.createStub) return as, nil, err } type constructor struct{} func (*constructor) New(*fd.FD) (platform.Platform, error) { return New() } func (*constructor) OpenDevice(_ string) (*fd.FD, error) { return nil, nil } // Flags implements platform.Constructor.Flags(). func (*constructor) Requirements() platform.Requirements { // TODO(b/75837838): Also set a new PID namespace so that we limit // access to other host processes. 
return platform.Requirements{ RequiresCapSysPtrace: true, RequiresCurrentPIDNS: true, } } func init() { platform.Register("ptrace", &constructor{}) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/ptrace_amd64.go000066400000000000000000000040301465435605700266470ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package ptrace import ( "gvisor.dev/gvisor/pkg/abi/linux" pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/arch" ) // archContext is architecture-specific context. type archContext struct { // fpLen is the size of the floating point state. fpLen int // useXsave indicates whether or not xsave is in use. useXsave bool } // init initializes the archContext. func (a *archContext) init(ctx pkgcontext.Context) { fs := cpuid.FromContext(ctx) fpLen, _ := fs.ExtendedStateSize() useXsave := fs.UseXsave() a.fpLen = int(fpLen) a.useXsave = useXsave } // floatingPointLength returns the length of floating point state. func (a *archContext) floatingPointLength() uint64 { return uint64(a.fpLen) } // floatingPointRegSet returns the register set to fetch. func (a *archContext) floatingPointRegSet() uintptr { if a.useXsave { return linux.NT_X86_XSTATE } return linux.NT_PRFPREG } func stackPointer(r *arch.Registers) uintptr { return uintptr(r.Rsp) } // x86 use the fs_base register to store the TLS pointer which can be // get/set in "func (t *thread) get/setRegs(regs *arch.Registers)". // So both of the get/setTLS() operations are noop here. // getTLS gets the thread local storage register. func (t *thread) getTLS(tls *uint64) error { return nil } // setTLS sets the thread local storage register. func (t *thread) setTLS(tls *uint64) error { return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/ptrace_amd64_state_autogen.go000066400000000000000000000001511465435605700315710ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 && amd64 // +build amd64,amd64 package ptrace golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/ptrace_arm64.go000066400000000000000000000026531465435605700266760ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package ptrace import ( "gvisor.dev/gvisor/pkg/abi/linux" pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/arch" ) // archContext is architecture-specific context. type archContext struct { // fpLen is the size of the floating point state. fpLen int } // init initializes the archContext. func (a *archContext) init(ctx pkgcontext.Context) { fs := cpuid.FromContext(ctx) fpLen, _ := fs.ExtendedStateSize() a.fpLen = int(fpLen) } // floatingPointLength returns the length of floating point state. func (a *archContext) floatingPointLength() uint64 { return uint64(a.fpLen) } // floatingPointRegSet returns the register set to fetch. func (a *archContext) floatingPointRegSet() uintptr { return linux.NT_PRFPREG } func stackPointer(r *arch.Registers) uintptr { return uintptr(r.Sp) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/ptrace_arm64_state_autogen.go000066400000000000000000000001321465435605700316060ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package ptrace golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe.go000066400000000000000000000030011465435605700302230ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package ptrace import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) // getTLS gets the thread local storage register. func (t *thread) getTLS(tls *uint64) error { iovec := unix.Iovec{ Base: (*byte)(unsafe.Pointer(tls)), Len: uint64(unsafe.Sizeof(*tls)), } _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_GETREGSET, uintptr(t.tid), linux.NT_ARM_TLS, uintptr(unsafe.Pointer(&iovec)), 0, 0) if errno != 0 { return errno } return nil } // setTLS sets the thread local storage register. func (t *thread) setTLS(tls *uint64) error { iovec := unix.Iovec{ Base: (*byte)(unsafe.Pointer(tls)), Len: uint64(unsafe.Sizeof(*tls)), } _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_SETREGSET, uintptr(t.tid), linux.NT_ARM_TLS, uintptr(unsafe.Pointer(&iovec)), 0, 0) if errno != 0 { return errno } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe_state_autogen.go000066400000000000000000000001321465435605700331470ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package ptrace golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/ptrace_linux_state_autogen.go000066400000000000000000000001321465435605700320140ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux // +build linux package ptrace golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/ptrace_linux_unsafe_state_autogen.go000066400000000000000000000002041465435605700333550ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build linux && (amd64 || arm64) // +build linux // +build amd64 arm64 package ptrace golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/ptrace_state_autogen.go000066400000000000000000000000701465435605700305760ustar00rootroot00000000000000// automatically generated by stateify. package ptrace golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/ptrace_unsafe.go000066400000000000000000000104361465435605700272240ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ptrace import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" ) // getRegs gets the general purpose register set. func (t *thread) getRegs(regs *arch.Registers) error { iovec := unix.Iovec{ Base: (*byte)(unsafe.Pointer(regs)), Len: uint64(unsafe.Sizeof(*regs)), } _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_GETREGSET, uintptr(t.tid), linux.NT_PRSTATUS, uintptr(unsafe.Pointer(&iovec)), 0, 0) if errno != 0 { return errno } return nil } // setRegs sets the general purpose register set. func (t *thread) setRegs(regs *arch.Registers) error { iovec := unix.Iovec{ Base: (*byte)(unsafe.Pointer(regs)), Len: uint64(unsafe.Sizeof(*regs)), } _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_SETREGSET, uintptr(t.tid), linux.NT_PRSTATUS, uintptr(unsafe.Pointer(&iovec)), 0, 0) if errno != 0 { return errno } return nil } // getFPRegs gets the floating-point data via the GETREGSET ptrace unix. func (t *thread) getFPRegs(fpState *fpu.State, ac *archContext) error { iovec := unix.Iovec{ Base: fpState.BytePointer(), Len: ac.floatingPointLength(), } _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_GETREGSET, uintptr(t.tid), ac.floatingPointRegSet(), uintptr(unsafe.Pointer(&iovec)), 0, 0) if errno != 0 { return errno } return nil } // setFPRegs sets the floating-point data via the SETREGSET ptrace unix. func (t *thread) setFPRegs(fpState *fpu.State, ac *archContext) error { iovec := unix.Iovec{ Base: fpState.BytePointer(), Len: ac.floatingPointLength(), } _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_SETREGSET, uintptr(t.tid), ac.floatingPointRegSet(), uintptr(unsafe.Pointer(&iovec)), 0, 0) if errno != 0 { return errno } return nil } // getSignalInfo retrieves information about the signal that caused the stop. func (t *thread) getSignalInfo(si *linux.SignalInfo) error { _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_GETSIGINFO, uintptr(t.tid), 0, uintptr(unsafe.Pointer(si)), 0, 0) if errno != 0 { return errno } return nil } // clone creates a new thread from this one. // // The returned thread will be stopped and available for any system thread to // call attach on it. // // Precondition: the OS thread must be locked and own t. 
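// A minimal usage sketch for clone (not part of the original source), modeled
// on the request-handling goroutine in newSubprocess; the name newT and the
// panic messages are illustrative placeholders:
//
//	runtime.LockOSThread()
//	defer runtime.UnlockOSThread()
//	newT, err := t.clone()
//	if err != nil {
//		panic(fmt.Sprintf("clone failed: %v", err))
//	}
//	// The new thread starts with SIGSTOP pending (CLONE_PTRACE), so wait
//	// for it before detaching or otherwise using the thread.
//	if sig := newT.wait(stopped); sig != unix.SIGSTOP {
//		panic(fmt.Sprintf("expected SIGSTOP, got %v", sig))
//	}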
func (t *thread) clone() (*thread, error) { r, ok := hostarch.Addr(stackPointer(&t.initRegs)).RoundUp() if !ok { return nil, unix.EINVAL } rval, err := t.syscallIgnoreInterrupt( &t.initRegs, unix.SYS_CLONE, arch.SyscallArgument{Value: uintptr( unix.CLONE_FILES | unix.CLONE_FS | unix.CLONE_SIGHAND | unix.CLONE_THREAD | unix.CLONE_PTRACE | unix.CLONE_VM)}, // The stack pointer is just made up, but we have it be // something sensible so the kernel doesn't think we're // up to no good. Which we are. arch.SyscallArgument{Value: uintptr(r)}, arch.SyscallArgument{}, arch.SyscallArgument{}, // We use these registers initially, but really they // could be anything. We're going to stop immediately. arch.SyscallArgument{Value: uintptr(unsafe.Pointer(&t.initRegs))}) if err != nil { return nil, err } return &thread{ tgid: t.tgid, tid: int32(rval), cpu: ^uint32(0), }, nil } // getEventMessage retrieves a message about the ptrace event that just happened. func (t *thread) getEventMessage() (uintptr, error) { var msg uintptr _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_GETEVENTMSG, uintptr(t.tid), 0, uintptr(unsafe.Pointer(&msg)), 0, 0) if errno != 0 { return msg, errno } return msg, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/ptrace_unsafe_state_autogen.go000066400000000000000000000001341465435605700321400ustar00rootroot00000000000000// automatically generated by stateify. //go:build go1.18 // +build go1.18 package ptrace golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/stub_amd64.s000066400000000000000000000055761465435605700262230ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "funcdata.h" #include "textflag.h" #define SYS_GETPID 39 #define SYS_EXIT 60 #define SYS_KILL 62 #define SYS_GETPPID 110 #define SYS_PRCTL 157 #define SIGKILL 9 #define SIGSTOP 19 #define PR_SET_PDEATHSIG 1 // stub bootstraps the child and sends itself SIGSTOP to wait for attach. // // R15 contains the expected PPID. R15 is used instead of a more typical DI // since syscalls will clobber DI and createStub wants to pass a new PPID to // grandchildren. // // This should not be used outside the context of a new ptrace child (as the // function is otherwise a bunch of nonsense). TEXT ·stub(SB),NOSPLIT|NOFRAME,$0 begin: // N.B. This loop only executes in the context of a single-threaded // fork child. MOVQ $SYS_PRCTL, AX MOVQ $PR_SET_PDEATHSIG, DI MOVQ $SIGKILL, SI SYSCALL CMPQ AX, $0 JNE error // If the parent already died before we called PR_SET_DEATHSIG then // we'll have an unexpected PPID. MOVQ $SYS_GETPPID, AX SYSCALL CMPQ AX, $0 JL error CMPQ AX, R15 JNE parent_dead MOVQ $SYS_GETPID, AX SYSCALL CMPQ AX, $0 JL error MOVQ $0, BX // SIGSTOP to wait for attach. // // The SYSCALL instruction will be used for future syscall injection by // thread.syscall. MOVQ AX, DI MOVQ $SYS_KILL, AX MOVQ $SIGSTOP, SI SYSCALL // The sentry sets BX to 1 when creating stub process. 
CMPQ BX, $1 JE clone // Notify the Sentry that syscall exited. done: INT $3 // Be paranoid. JMP done clone: // subprocess.createStub clones a new stub process that is untraced, // thus executing this code. We setup the PDEATHSIG before SIGSTOPing // ourselves for attach by the tracer. // // R15 has been updated with the expected PPID. CMPQ AX, $0 JE begin // The clone syscall returns a non-zero value. JMP done error: // Exit with -errno. MOVQ AX, DI NEGQ DI MOVQ $SYS_EXIT, AX SYSCALL HLT parent_dead: MOVQ $SYS_EXIT, AX MOVQ $1, DI SYSCALL HLT // func addrOfStub() uintptr TEXT ·addrOfStub(SB), $0-8 MOVQ $·stub(SB), AX MOVQ AX, ret+0(FP) RET // stubCall calls the stub function at the given address with the given PPID. // // This is a distinct function because stub, above, may be mapped at any // arbitrary location, and stub has a specific binary API (see above). TEXT ·stubCall(SB),NOSPLIT|NOFRAME,$0-16 MOVQ addr+0(FP), AX MOVQ pid+8(FP), R15 JMP AX golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/stub_arm64.s000066400000000000000000000052411465435605700262260ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "funcdata.h" #include "textflag.h" #define SYS_GETPID 172 #define SYS_EXIT 93 #define SYS_KILL 129 #define SYS_GETPPID 173 #define SYS_PRCTL 167 #define SIGKILL 9 #define SIGSTOP 19 #define PR_SET_PDEATHSIG 1 // stub bootstraps the child and sends itself SIGSTOP to wait for attach. // // R7 contains the expected PPID. // // This should not be used outside the context of a new ptrace child (as the // function is otherwise a bunch of nonsense). TEXT ·stub(SB),NOSPLIT,$0 begin: // N.B. This loop only executes in the context of a single-threaded // fork child. MOVD $SYS_PRCTL, R8 MOVD $PR_SET_PDEATHSIG, R0 MOVD $SIGKILL, R1 SVC CMN $4095, R0 BCS error // If the parent already died before we called PR_SET_DEATHSIG then // we'll have an unexpected PPID. MOVD $SYS_GETPPID, R8 SVC CMP R0, R7 BNE parent_dead MOVD $SYS_GETPID, R8 SVC CMP $0x0, R0 BLT error MOVD $0, R9 // SIGSTOP to wait for attach. // // The SYSCALL instruction will be used for future syscall injection by // thread.syscall. MOVD $SYS_KILL, R8 MOVD $SIGSTOP, R1 SVC // The sentry sets R9 to 1 when creating stub process. CMP $1, R9 BEQ clone done: // Notify the Sentry that syscall exited. BRK $3 B done // Be paranoid. clone: // subprocess.createStub clones a new stub process that is untraced, // thus executing this code. We setup the PDEATHSIG before SIGSTOPing // ourselves for attach by the tracer. // // R7 has been updated with the expected PPID. CMP $0, R0 BEQ begin // The clone system call returned a non-zero value. B done error: // Exit with -errno. NEG R0, R0 MOVD $SYS_EXIT, R8 SVC HLT parent_dead: MOVD $SYS_EXIT, R8 MOVD $1, R0 SVC HLT // func addrOfStub() uintptr TEXT ·addrOfStub(SB), $0-8 MOVD $·stub(SB), R0 MOVD R0, ret+0(FP) RET // stubCall calls the stub function at the given address with the given PPID. 
// // This is a distinct function because stub, above, may be mapped at any // arbitrary location, and stub has a specific binary API (see above). TEXT ·stubCall(SB),NOSPLIT,$0-16 MOVD addr+0(FP), R0 MOVD pid+8(FP), R7 B (R0) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/stub_unsafe.go000066400000000000000000000057241465435605700267270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ptrace import ( "reflect" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safecopy" ) // stub is defined in arch-specific assembly. func stub() // addrOfStub returns the start address of stub. // // In Go 1.17+, Go references to assembly functions resolve to an ABIInternal // wrapper function rather than the function itself. We must reference from // assembly to get the ABI0 (i.e., primary) address. func addrOfStub() uintptr // stubCall calls the stub at the given address with the given pid. func stubCall(addr, pid uintptr) // unsafeSlice returns a slice for the given address and length. func unsafeSlice(addr uintptr, length int) (slice []byte) { sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) sh.Data = addr sh.Len = length sh.Cap = length return } // stubInit initializes the stub. func stubInit() { // Grab the existing stub. stubBegin := addrOfStub() stubLen := int(safecopy.FindEndAddress(stubBegin) - stubBegin) stubSlice := unsafeSlice(stubBegin, stubLen) mapLen := uintptr(stubLen) if offset := mapLen % hostarch.PageSize; offset != 0 { mapLen += hostarch.PageSize - offset } for stubStart > 0 { // Map the target address for the stub. // // We don't use FIXED here because we don't want to unmap // something that may have been there already. We just walk // down the address space until we find a place where the stub // can be placed. addr, _, errno := unix.RawSyscall6( unix.SYS_MMAP, stubStart, mapLen, unix.PROT_WRITE|unix.PROT_READ, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS, 0 /* fd */, 0 /* offset */) if addr != stubStart || errno != 0 { if addr != 0 { // Unmap the region we've mapped accidentally. unix.RawSyscall(unix.SYS_MUNMAP, addr, mapLen, 0) } // Attempt to begin at a lower address. stubStart -= uintptr(hostarch.PageSize) continue } // Copy the stub to the address. targetSlice := unsafeSlice(addr, stubLen) copy(targetSlice, stubSlice) // Make the stub executable. if _, _, errno := unix.RawSyscall( unix.SYS_MPROTECT, stubStart, mapLen, unix.PROT_EXEC|unix.PROT_READ); errno != 0 { panic("mprotect failed: " + errno.Error()) } // Set the end. stubEnd = stubStart + mapLen return } // This will happen only if we exhaust the entire address // space, and it will take a long, long time. panic("failed to map stub") } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/subprocess.go000066400000000000000000000501371465435605700265770ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ptrace import ( "fmt" "os" "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/hosttid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" ) var ( // maximumUserAddress is the largest possible user address. maximumUserAddress = linux.TaskSize // stubInitAddress is the initial attempt link address for the stub. stubInitAddress = linux.TaskSize ) // Linux kernel errnos which "should never be seen by user programs", but will // be revealed to ptrace syscall exit tracing. // // These constants are only used in subprocess.go. const ( ERESTARTSYS = unix.Errno(512) ERESTARTNOINTR = unix.Errno(513) ERESTARTNOHAND = unix.Errno(514) ) // globalPool exists to solve two distinct problems: // // 1) Subprocesses can't always be killed properly (see Release). // // 2) Any seccomp filters that have been installed will apply to subprocesses // created here. Therefore we use the intermediary (master), which is created // on initialization of the platform. var globalPool struct { mu sync.Mutex master *subprocess available []*subprocess } // thread is a traced thread; it is a thread identifier. // // This is a convenience type for defining ptrace operations. type thread struct { tgid int32 tid int32 cpu uint32 // initRegs are the initial registers for the first thread. // // These are used for the register set for system calls. initRegs arch.Registers } // threadPool is a collection of threads. type threadPool struct { // mu protects below. mu sync.RWMutex // threads is the collection of threads. // // This map is indexed by system TID (the calling thread); which will // be the tracer for the given *thread, and therefore capable of using // relevant ptrace calls. threads map[int32]*thread } // lookupOrCreate looks up a given thread or creates one. // // newThread will generally be subprocess.newThread. // // Precondition: the runtime OS thread must be locked. func (tp *threadPool) lookupOrCreate(currentTID int32, newThread func() *thread) *thread { // The overwhelming common case is that the thread is already created. // Optimistically attempt the lookup by only locking for reading. tp.mu.RLock() t, ok := tp.threads[currentTID] tp.mu.RUnlock() if ok { return t } tp.mu.Lock() defer tp.mu.Unlock() // Another goroutine might have created the thread for currentTID in between // mu.RUnlock() and mu.Lock(). if t, ok = tp.threads[currentTID]; ok { return t } // Before creating a new thread, see if we can find a thread // whose system tid has disappeared. // // TODO(b/77216482): Other parts of this package depend on // threads never exiting. for origTID, t := range tp.threads { // Signal zero is an easy existence check. if err := unix.Tgkill(unix.Getpid(), int(origTID), 0); err != nil { // This thread has been abandoned; reuse it. 
delete(tp.threads, origTID) tp.threads[currentTID] = t return t } } // Create a new thread. t = newThread() tp.threads[currentTID] = t return t } // subprocess is a collection of threads being traced. type subprocess struct { platform.NoAddressSpaceIO // requests is used to signal creation of new threads. requests chan chan *thread // sysemuThreads are reserved for emulation. sysemuThreads threadPool // syscallThreads are reserved for syscalls (except clone, which is // handled in the dedicated goroutine corresponding to requests above). syscallThreads threadPool // mu protects the following fields. mu sync.Mutex // contexts is the set of contexts for which it's possible that // context.lastFaultSP == this subprocess. contexts map[*context]struct{} } // newSubprocess returns a usable subprocess. // // This will either be a newly created subprocess, or one from the global pool. // The create function will be called in the latter case, which is guaranteed // to happen with the runtime thread locked. func newSubprocess(create func() (*thread, error)) (*subprocess, error) { // See Release. globalPool.mu.Lock() if len(globalPool.available) > 0 { sp := globalPool.available[len(globalPool.available)-1] globalPool.available = globalPool.available[:len(globalPool.available)-1] globalPool.mu.Unlock() return sp, nil } globalPool.mu.Unlock() // The following goroutine is responsible for creating the first traced // thread, and responding to requests to make additional threads in the // traced process. The process will be killed and reaped when the // request channel is closed, which happens in Release below. errChan := make(chan error) requests := make(chan chan *thread) go func() { // S/R-SAFE: Platform-related. runtime.LockOSThread() defer runtime.UnlockOSThread() // Initialize the first thread. firstThread, err := create() if err != nil { errChan <- err return } firstThread.grabInitRegs() // Ready to handle requests. errChan <- nil // Wait for requests to create threads. for r := range requests { t, err := firstThread.clone() if err != nil { // Should not happen: not recoverable. panic(fmt.Sprintf("error initializing first thread: %v", err)) } // Since the new thread was created with // clone(CLONE_PTRACE), it will begin execution with // SIGSTOP pending and with this thread as its tracer. // (Hopefully nobody tgkilled it with a signal < // SIGSTOP before the SIGSTOP was delivered, in which // case that signal would be delivered before SIGSTOP.) if sig := t.wait(stopped); sig != unix.SIGSTOP { panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig)) } // Detach the thread. t.detach() t.initRegs = firstThread.initRegs // Return the thread. r <- t } // Requests should never be closed. panic("unreachable") }() // Wait until error or readiness. if err := <-errChan; err != nil { return nil, err } // Ready. sp := &subprocess{ requests: requests, sysemuThreads: threadPool{ threads: make(map[int32]*thread), }, syscallThreads: threadPool{ threads: make(map[int32]*thread), }, contexts: make(map[*context]struct{}), } sp.unmap() return sp, nil } // unmap unmaps non-stub regions of the process. // // This will panic on failure (which should never happen). func (s *subprocess) unmap() { s.Unmap(0, uint64(stubStart)) if maximumUserAddress != stubEnd { s.Unmap(hostarch.Addr(stubEnd), uint64(maximumUserAddress-stubEnd)) } } // Release kills the subprocess. // // Just kidding! 
We can't safely coordinate the detaching of all the // tracees (since the tracers are random runtime threads, and the process // won't exit until tracers have been notified). // // Therefore we simply unmap everything in the subprocess and return it to the // globalPool. This has the added benefit of reducing creation time for new // subprocesses. func (s *subprocess) Release() { go func() { // S/R-SAFE: Platform. s.unmap() globalPool.mu.Lock() globalPool.available = append(globalPool.available, s) globalPool.mu.Unlock() }() } // newThread creates a new traced thread. // // Precondition: the OS thread must be locked. func (s *subprocess) newThread() *thread { // Ask the first thread to create a new one. r := make(chan *thread) s.requests <- r t := <-r // Attach the subprocess to this one. t.attach() // Return the new thread, which is now bound. return t } // attach attaches to the thread. func (t *thread) attach() { if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_ATTACH, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("unable to attach: %v", errno)) } // PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already // stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of // newSubprocess), so we always expect to see signal-delivery-stop with // SIGSTOP. if sig := t.wait(stopped); sig != unix.SIGSTOP { panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig)) } // Initialize options. t.init() } func (t *thread) grabInitRegs() { // Grab registers. // // Note that we adjust the current register RIP value to be just before // the current system call executed. This depends on the definition of // the stub itself. if err := t.getRegs(&t.initRegs); err != nil { panic(fmt.Sprintf("ptrace get regs failed: %v", err)) } t.adjustInitRegsRip() } // detach detaches from the thread. // // Because the SIGSTOP is not suppressed, the thread will enter group-stop. func (t *thread) detach() { if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(unix.SIGSTOP), 0, 0); errno != 0 { panic(fmt.Sprintf("can't detach new clone: %v", errno)) } } // waitOutcome is used for wait below. type waitOutcome int const ( // stopped indicates that the process was stopped. stopped waitOutcome = iota // killed indicates that the process was killed. killed ) func (t *thread) dumpAndPanic(message string) { var regs arch.Registers message += "\n" if err := t.getRegs(&regs); err == nil { message += dumpRegs(&regs) } else { log.Warningf("unable to get registers: %v", err) } message += fmt.Sprintf("stubStart\t = %016x\n", stubStart) panic(message) } func (t *thread) unexpectedStubExit() { msg, err := t.getEventMessage() status := unix.WaitStatus(msg) if status.Signaled() && status.Signal() == unix.SIGKILL { // SIGKILL can only be sent by a user or the OOM-killer. In both // these cases, we don't need to panic. There is no reason to // think that something is wrong in gVisor. log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid) pid := os.Getpid() unix.Tgkill(pid, pid, unix.Signal(unix.SIGKILL)) } t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err)) } // wait waits for a stop event. // // Precondition: outcome is a valid waitOutcome.
func (t *thread) wait(outcome waitOutcome) unix.Signal { var status unix.WaitStatus for { r, err := unix.Wait4(int(t.tid), &status, unix.WALL|unix.WUNTRACED, nil) if err == unix.EINTR || err == unix.EAGAIN { // Wait was interrupted; wait again. continue } else if err != nil { panic(fmt.Sprintf("ptrace wait failed: %v", err)) } if int(r) != int(t.tid) { panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid)) } switch outcome { case stopped: if !status.Stopped() { t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status)) } stopSig := status.StopSignal() if stopSig == 0 { continue // Spurious stop. } if stopSig == unix.SIGTRAP { if status.TrapCause() == unix.PTRACE_EVENT_EXIT { t.unexpectedStubExit() } // Re-encode the trap cause the way it's expected. return stopSig | unix.Signal(status.TrapCause()<<8) } // Not a trap signal. return stopSig case killed: if !status.Exited() && !status.Signaled() { t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status)) } return unix.Signal(status.ExitStatus()) default: // Should not happen. t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome)) } } } // destroy kills the thread. // // Note that this should not be used in the general case; the death of threads // will typically cause the death of the parent. This is a utility method for // manually created threads. func (t *thread) destroy() { t.detach() unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(unix.SIGKILL)) t.wait(killed) } // init initializes trace options. func (t *thread) init() { // Set the TRACESYSGOOD option to differentiate real SIGTRAP. // set PTRACE_O_EXITKILL to ensure that the unexpected exit of the // sentry will immediately kill the associated stubs. const PTRACE_O_EXITKILL = 0x100000 _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_SETOPTIONS, uintptr(t.tid), 0, unix.PTRACE_O_TRACESYSGOOD|unix.PTRACE_O_TRACEEXIT|PTRACE_O_EXITKILL, 0, 0) if errno != 0 { panic(fmt.Sprintf("ptrace set options failed: %v", errno)) } } // syscall executes a system call cycle in the traced context. // // This is _not_ for use by application system calls, rather it is for use when // a system call must be injected into the remote context (e.g. mmap, munmap). // Note that clones are handled separately. func (t *thread) syscall(regs *arch.Registers) (uintptr, error) { // Set registers. if err := t.setRegs(regs); err != nil { panic(fmt.Sprintf("ptrace set regs failed: %v", err)) } for { // Execute the syscall instruction. The task has to stop on the // trap instruction which is right after the syscall // instruction. if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) } sig := t.wait(stopped) if sig == unix.SIGTRAP { // Reached syscall-enter-stop. break } else { // Some other signal caused a thread stop; ignore. if sig != unix.SIGSTOP && sig != unix.SIGCHLD { log.Warningf("The thread %d:%d has been interrupted by %d", t.tgid, t.tid, sig) } continue } } // Grab registers. if err := t.getRegs(regs); err != nil { panic(fmt.Sprintf("ptrace get regs failed: %v", err)) } return syscallReturnValue(regs) } // syscallIgnoreInterrupt ignores interrupts on the system call thread and // restarts the syscall if the kernel indicates that should happen. 
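// An illustrative injection sketch (not part of the original source), modeled
// on how subprocess.syscall and the address-space operations in this file
// drive this helper; addr and length are placeholder values:
//
//	rval, err := t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_MUNMAP,
//		arch.SyscallArgument{Value: uintptr(addr)},
//		arch.SyscallArgument{Value: uintptr(length)})
//
// The loop below simply retries whenever the injected call is reported as
// interrupted with one of the ERESTART* errnos, which the kernel exposes to
// ptrace syscall exit tracing.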
func (t *thread) syscallIgnoreInterrupt( initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) { for { regs := createSyscallRegs(initRegs, sysno, args...) rval, err := t.syscall(&regs) switch err { case ERESTARTSYS: continue case ERESTARTNOINTR: continue case ERESTARTNOHAND: continue default: return rval, err } } } // NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. func (t *thread) NotifyInterrupt() { unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(platform.SignalInterrupt)) } // switchToApp is called from the main SwitchToApp entrypoint. // // This function returns true on a system call, false on a signal. func (s *subprocess) switchToApp(c *context, ac *arch.Context64) bool { // Lock the thread for ptrace operations. runtime.LockOSThread() defer runtime.UnlockOSThread() // Extract floating point state. fpState := ac.FloatingPointData() // Grab our thread from the pool. currentTID := int32(hosttid.Current()) t := s.sysemuThreads.lookupOrCreate(currentTID, s.newThread) // Reset necessary registers. regs := &ac.StateData().Regs t.resetSysemuRegs(regs) // Extract TLS register tls := uint64(ac.TLS()) // Check for interrupts, and ensure that future interrupts will signal t. if !c.interrupt.Enable(t) { // Pending interrupt; simulate. c.signalInfo = linux.SignalInfo{Signo: int32(platform.SignalInterrupt)} return false } defer c.interrupt.Disable() // Set registers. if err := t.setRegs(regs); err != nil { panic(fmt.Sprintf("ptrace set regs (%+v) failed: %v", regs, err)) } if err := t.setFPRegs(fpState, &c.archContext); err != nil { panic(fmt.Sprintf("ptrace set fpregs (%+v) failed: %v", fpState, err)) } if err := t.setTLS(&tls); err != nil { panic(fmt.Sprintf("ptrace set tls (%+v) failed: %v", tls, err)) } for { // Start running until the next system call. if isSingleStepping(regs) { if _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_SYSEMU_SINGLESTEP, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) } } else { if _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) } } // Wait for the syscall-enter stop. sig := t.wait(stopped) if sig == unix.SIGSTOP { // SIGSTOP was delivered to another thread in the same thread // group, which initiated another group stop. Just ignore it. continue } // Refresh all registers. if err := t.getRegs(regs); err != nil { panic(fmt.Sprintf("ptrace get regs failed: %v", err)) } if err := t.getFPRegs(fpState, &c.archContext); err != nil { panic(fmt.Sprintf("ptrace get fpregs failed: %v", err)) } if err := t.getTLS(&tls); err != nil { panic(fmt.Sprintf("ptrace get tls failed: %v", err)) } if !ac.SetTLS(uintptr(tls)) { panic(fmt.Sprintf("tls value %v is invalid", tls)) } // Is it a system call? if sig == (syscallEvent | unix.SIGTRAP) { s.arm64SyscallWorkaround(t, regs) // Ensure registers are sane. updateSyscallRegs(regs) return true } // Grab signal information. if err := t.getSignalInfo(&c.signalInfo); err != nil { // Should never happen. panic(fmt.Sprintf("ptrace get signal info failed: %v", err)) } // We have a signal. We verify, however, that the signal was // either delivered from the kernel or from this process. We // don't respect other signals. if c.signalInfo.Code > 0 { // The signal was generated by the kernel. We inspect // the signal information, and may patch it in order to // facilitate vsyscall emulation. See patchSignalInfo.
patchSignalInfo(regs, &c.signalInfo) return false } else if c.signalInfo.Code <= 0 && c.signalInfo.PID() == int32(os.Getpid()) { // The signal was generated by this process. That means // that it was an interrupt or something else that we // should bail for. Note that we ignore signals // generated by other processes. return false } } } // syscall executes the given system call without handling interruptions. func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) { // Grab a thread. runtime.LockOSThread() defer runtime.UnlockOSThread() currentTID := int32(hosttid.Current()) t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread) return t.syscallIgnoreInterrupt(&t.initRegs, sysno, args...) } // MapFile implements platform.AddressSpace.MapFile. func (s *subprocess) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error { var flags int if precommit { flags |= unix.MAP_POPULATE } _, err := s.syscall( unix.SYS_MMAP, arch.SyscallArgument{Value: uintptr(addr)}, arch.SyscallArgument{Value: uintptr(fr.Length())}, arch.SyscallArgument{Value: uintptr(at.Prot())}, arch.SyscallArgument{Value: uintptr(flags | unix.MAP_SHARED | unix.MAP_FIXED)}, arch.SyscallArgument{Value: uintptr(f.FD())}, arch.SyscallArgument{Value: uintptr(fr.Start)}) return err } // Unmap implements platform.AddressSpace.Unmap. func (s *subprocess) Unmap(addr hostarch.Addr, length uint64) { ar, ok := addr.ToRange(length) if !ok { panic(fmt.Sprintf("addr %#x + length %#x overflows", addr, length)) } s.mu.Lock() for c := range s.contexts { c.mu.Lock() if c.lastFaultSP == s && ar.Contains(c.lastFaultAddr) { // Forget the last fault so that if c faults again, the fault isn't // incorrectly reported as a write fault. If this is being called // due to munmap() of the corresponding vma, handling of the second // fault will fail anyway. c.lastFaultSP = nil delete(s.contexts, c) } c.mu.Unlock() } s.mu.Unlock() _, err := s.syscall( unix.SYS_MUNMAP, arch.SyscallArgument{Value: uintptr(addr)}, arch.SyscallArgument{Value: uintptr(length)}) if err != nil { // We never expect this to happen. panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err)) } } // PreFork implements platform.AddressSpace.PreFork. func (s *subprocess) PreFork() {} // PostFork implements platform.AddressSpace.PostFork. func (s *subprocess) PostFork() {} golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/subprocess_amd64.go000066400000000000000000000203721465435605700275700ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package ptrace import ( "fmt" "strings" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" ) const ( // initRegsRipAdjustment is the size of the syscall instruction. initRegsRipAdjustment = 2 ) // resetSysemuRegs sets up emulation registers. 
// // This should be called prior to calling sysemu. func (t *thread) resetSysemuRegs(regs *arch.Registers) { regs.Cs = t.initRegs.Cs regs.Ss = t.initRegs.Ss regs.Ds = t.initRegs.Ds regs.Es = t.initRegs.Es regs.Fs = t.initRegs.Fs regs.Gs = t.initRegs.Gs } // createSyscallRegs sets up syscall registers. // // This should be called to generate registers for a system call. func createSyscallRegs(initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) arch.Registers { // Copy initial registers. regs := *initRegs // Set our syscall number. regs.Rax = uint64(sysno) if len(args) >= 1 { regs.Rdi = args[0].Uint64() } if len(args) >= 2 { regs.Rsi = args[1].Uint64() } if len(args) >= 3 { regs.Rdx = args[2].Uint64() } if len(args) >= 4 { regs.R10 = args[3].Uint64() } if len(args) >= 5 { regs.R8 = args[4].Uint64() } if len(args) >= 6 { regs.R9 = args[5].Uint64() } return regs } // isSingleStepping determines if the registers indicate single-stepping. func isSingleStepping(regs *arch.Registers) bool { return (regs.Eflags & arch.X86TrapFlag) != 0 } // updateSyscallRegs updates registers after finishing sysemu. func updateSyscallRegs(regs *arch.Registers) { // Ptrace puts -ENOSYS in rax on syscall-enter-stop. regs.Rax = regs.Orig_rax } // syscallReturnValue extracts a sensible return from registers. func syscallReturnValue(regs *arch.Registers) (uintptr, error) { rval := int64(regs.Rax) if rval < 0 { return 0, unix.Errno(-rval) } return uintptr(rval), nil } func dumpRegs(regs *arch.Registers) string { var m strings.Builder fmt.Fprintf(&m, "Registers:\n") fmt.Fprintf(&m, "\tR15\t = %016x\n", regs.R15) fmt.Fprintf(&m, "\tR14\t = %016x\n", regs.R14) fmt.Fprintf(&m, "\tR13\t = %016x\n", regs.R13) fmt.Fprintf(&m, "\tR12\t = %016x\n", regs.R12) fmt.Fprintf(&m, "\tRbp\t = %016x\n", regs.Rbp) fmt.Fprintf(&m, "\tRbx\t = %016x\n", regs.Rbx) fmt.Fprintf(&m, "\tR11\t = %016x\n", regs.R11) fmt.Fprintf(&m, "\tR10\t = %016x\n", regs.R10) fmt.Fprintf(&m, "\tR9\t = %016x\n", regs.R9) fmt.Fprintf(&m, "\tR8\t = %016x\n", regs.R8) fmt.Fprintf(&m, "\tRax\t = %016x\n", regs.Rax) fmt.Fprintf(&m, "\tRcx\t = %016x\n", regs.Rcx) fmt.Fprintf(&m, "\tRdx\t = %016x\n", regs.Rdx) fmt.Fprintf(&m, "\tRsi\t = %016x\n", regs.Rsi) fmt.Fprintf(&m, "\tRdi\t = %016x\n", regs.Rdi) fmt.Fprintf(&m, "\tOrig_rax = %016x\n", regs.Orig_rax) fmt.Fprintf(&m, "\tRip\t = %016x\n", regs.Rip) fmt.Fprintf(&m, "\tCs\t = %016x\n", regs.Cs) fmt.Fprintf(&m, "\tEflags\t = %016x\n", regs.Eflags) fmt.Fprintf(&m, "\tRsp\t = %016x\n", regs.Rsp) fmt.Fprintf(&m, "\tSs\t = %016x\n", regs.Ss) fmt.Fprintf(&m, "\tFs_base\t = %016x\n", regs.Fs_base) fmt.Fprintf(&m, "\tGs_base\t = %016x\n", regs.Gs_base) fmt.Fprintf(&m, "\tDs\t = %016x\n", regs.Ds) fmt.Fprintf(&m, "\tEs\t = %016x\n", regs.Es) fmt.Fprintf(&m, "\tFs\t = %016x\n", regs.Fs) fmt.Fprintf(&m, "\tGs\t = %016x\n", regs.Gs) return m.String() } // adjustInitregsRip adjust the current register RIP value to // be just before the system call instruction execution func (t *thread) adjustInitRegsRip() { t.initRegs.Rip -= initRegsRipAdjustment } // Pass the expected PPID to the child via R15 when creating stub process. func initChildProcessPPID(initregs *arch.Registers, ppid int32) { initregs.R15 = uint64(ppid) // Rbx has to be set to 1 when creating stub process. initregs.Rbx = 1 } // patchSignalInfo patches the signal info to account for hitting the seccomp // filters from vsyscall emulation, specified below. 
We allow for SIGSYS as a // synchronous trap, but patch the structure to appear like a SIGSEGV with the // Rip as the faulting address. // // Note that this should only be called after verifying that the signalInfo has // been generated by the kernel. func patchSignalInfo(regs *arch.Registers, signalInfo *linux.SignalInfo) { if linux.Signal(signalInfo.Signo) == linux.SIGSYS { signalInfo.Signo = int32(linux.SIGSEGV) // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered // with the si_call_addr field pointing to the current RIP. This field // aligns with the si_addr field for a SIGSEGV, so we don't need to touch // anything there. We do need to unwind emulation however, so we set the // instruction pointer to the faulting value, and "unpop" the stack. regs.Rip = signalInfo.Addr() regs.Rsp -= 8 } } // enableCpuidFault enables cpuid-faulting. // // This may fail on older kernels or hardware, so we just disregard the result. // Host CPUID will be enabled. // // This is safe to call in an afterFork context. // //go:norace //go:nosplit func enableCpuidFault() { unix.RawSyscall6(unix.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0) } // appendArchSeccompRules appends architecture-specific seccomp rules when creating the BPF program. // Ref attachedThread() for more detail. func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet { rules = append(rules, // Rules for trapping vsyscall access. seccomp.RuleSet{ Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_GETTIMEOFDAY: seccomp.MatchAll{}, unix.SYS_TIME: seccomp.MatchAll{}, unix.SYS_GETCPU: seccomp.MatchAll{}, // SYS_GETCPU was not defined in package syscall on amd64. }), Action: linux.SECCOMP_RET_TRAP, Vsyscall: true, }) if defaultAction != linux.SECCOMP_RET_ALLOW { rules = append(rules, seccomp.RuleSet{ Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_ARCH_PRCTL: seccomp.PerArg{ seccomp.EqualTo(linux.ARCH_SET_CPUID), seccomp.EqualTo(0), }, }), Action: linux.SECCOMP_RET_ALLOW, }) } return rules } // probeSeccomp returns true iff seccomp is run after ptrace notifications, // which is generally the case for kernel version >= 4.8. This check is dynamic // because this behavior may have been backported to older kernels. // // See createStub for more information. // // Precondition: the runtime OS thread must be locked. func probeSeccomp() bool { // Create a completely new, destroyable process. t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO) if err != nil { panic(fmt.Sprintf("seccomp probe failed: %v", err)) } defer t.destroy() // Set registers to the yield system call. This call is not allowed // by the filters specified in the attachedThread function. regs := createSyscallRegs(&t.initRegs, unix.SYS_SCHED_YIELD) if err := t.setRegs(&regs); err != nil { panic(fmt.Sprintf("ptrace set regs failed: %v", err)) } for { // Attempt an emulation. if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) } sig := t.wait(stopped) if sig == (syscallEvent | unix.SIGTRAP) { // Did the seccomp errno hook already run? This would // indicate that seccomp is first in line and we're // less than 4.8. if err := t.getRegs(&regs); err != nil { panic(fmt.Sprintf("ptrace get-regs failed: %v", err)) } if _, err := syscallReturnValue(&regs); err == nil { // The seccomp errno mode ran first, and reset // the error in the registers.
return false } // The seccomp hook did not run yet, and therefore it // is safe to use RET_KILL mode for dispatched calls. return true } } } func (s *subprocess) arm64SyscallWorkaround(t *thread, regs *arch.Registers) { } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/subprocess_arm64.go000066400000000000000000000143171465435605700276100ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package ptrace import ( "fmt" "strings" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" ) const ( // initRegsRipAdjustment is the size of the svc instruction. initRegsRipAdjustment = 4 ) // resetSysemuRegs sets up emulation registers. // // This should be called prior to calling sysemu. func (t *thread) resetSysemuRegs(regs *arch.Registers) { } // createSyscallRegs sets up syscall registers. // // This should be called to generate registers for a system call. func createSyscallRegs(initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) arch.Registers { // Copy initial registers (Pc, Sp, etc.). regs := *initRegs // Set our syscall number. // r8 for the syscall number. // r0-r6 is used to store the parameters. regs.Regs[8] = uint64(sysno) if len(args) >= 1 { regs.Regs[0] = args[0].Uint64() } if len(args) >= 2 { regs.Regs[1] = args[1].Uint64() } if len(args) >= 3 { regs.Regs[2] = args[2].Uint64() } if len(args) >= 4 { regs.Regs[3] = args[3].Uint64() } if len(args) >= 5 { regs.Regs[4] = args[4].Uint64() } if len(args) >= 6 { regs.Regs[5] = args[5].Uint64() } return regs } // isSingleStepping determines if the registers indicate single-stepping. func isSingleStepping(regs *arch.Registers) bool { // Refer to the ARM SDM D2.12.3: software step state machine // return (regs.Pstate.SS == 1) && (MDSCR_EL1.SS == 1). // // Since the host Linux kernel will set MDSCR_EL1.SS on our behalf // when we call a single-step ptrace command, we only need to check // the Pstate.SS bit here. return (regs.Pstate & arch.ARMTrapFlag) != 0 } // updateSyscallRegs updates registers after finishing sysemu. func updateSyscallRegs(regs *arch.Registers) { // No special work is necessary. return } // syscallReturnValue extracts a sensible return from registers. 
func syscallReturnValue(regs *arch.Registers) (uintptr, error) { rval := int64(regs.Regs[0]) if rval < 0 { return 0, unix.Errno(-rval) } return uintptr(rval), nil } func dumpRegs(regs *arch.Registers) string { var m strings.Builder fmt.Fprintf(&m, "Registers:\n") for i := 0; i < 31; i++ { fmt.Fprintf(&m, "\tRegs[%d]\t = %016x\n", i, regs.Regs[i]) } fmt.Fprintf(&m, "\tSp\t = %016x\n", regs.Sp) fmt.Fprintf(&m, "\tPc\t = %016x\n", regs.Pc) fmt.Fprintf(&m, "\tPstate\t = %016x\n", regs.Pstate) return m.String() } // adjustInitregsRip adjust the current register RIP value to // be just before the system call instruction execution func (t *thread) adjustInitRegsRip() { t.initRegs.Pc -= initRegsRipAdjustment } // Pass the expected PPID to the child via X7 when creating stub process func initChildProcessPPID(initregs *arch.Registers, ppid int32) { initregs.Regs[7] = uint64(ppid) // R9 has to be set to 1 when creating stub process. initregs.Regs[9] = 1 } // patchSignalInfo patches the signal info to account for hitting the seccomp // filters from vsyscall emulation, specified below. We allow for SIGSYS as a // synchronous trap, but patch the structure to appear like a SIGSEGV with the // Rip as the faulting address. // // Note that this should only be called after verifying that the signalInfo has // been generated by the kernel. func patchSignalInfo(regs *arch.Registers, signalInfo *linux.SignalInfo) { if linux.Signal(signalInfo.Signo) == linux.SIGSYS { signalInfo.Signo = int32(linux.SIGSEGV) // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered // with the si_call_addr field pointing to the current RIP. This field // aligns with the si_addr field for a SIGSEGV, so we don't need to touch // anything there. We do need to unwind emulation however, so we set the // instruction pointer to the faulting value, and "unpop" the stack. regs.Pc = signalInfo.Addr() regs.Sp -= 8 } } // Noop on arm64. // //go:nosplit func enableCpuidFault() { } // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program. // Ref attachedThread() for more detail. func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet { return rules } // probeSeccomp returns true if seccomp is run after ptrace notifications, // which is generally the case for kernel version >= 4.8. // // On arm64, the support of PTRACE_SYSEMU was added in the 5.3 kernel, so // probeSeccomp can always return true. func probeSeccomp() bool { return true } func (s *subprocess) arm64SyscallWorkaround(t *thread, regs *arch.Registers) { // On ARM64, when ptrace stops on a system call, it uses the x7 // register to indicate whether the stop has been signalled from // syscall entry or syscall exit. This means that we can't get a value // of this register and we can't change it. More details are in the // comment for tracehook_report_syscall in arch/arm64/kernel/ptrace.c. // // This happens only if we stop on a system call, so let's queue a // signal, resume a stub thread and catch it on a signal handling. t.NotifyInterrupt() for { if _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) } // Wait for the syscall-enter stop. sig := t.wait(stopped) if sig == unix.SIGSTOP { // SIGSTOP was delivered to another thread in the same thread // group, which initiated another group stop. Just ignore it. 
continue } if sig == (syscallEvent | unix.SIGTRAP) { t.dumpAndPanic(fmt.Sprintf("unexpected syscall event")) } break } if err := t.getRegs(regs); err != nil { panic(fmt.Sprintf("ptrace get regs failed: %v", err)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/subprocess_linux.go000066400000000000000000000220171465435605700300120ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package ptrace import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/hosttid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" ) const syscallEvent unix.Signal = 0x80 // createStub creates a fresh stub process. // // Precondition: the runtime OS thread must be locked. func createStub() (*thread, error) { // The exact interactions of ptrace and seccomp are complex, and // changed in recent kernel versions. Before commit 93e35efb8de45, the // seccomp check is done before the ptrace emulation check. This means // that any calls not matching this list will trigger the seccomp // default action instead of notifying ptrace. // // After commit 93e35efb8de45, the seccomp check is done after the // ptrace emulation check. This simplifies using SYSEMU, since seccomp // will never run for emulation. Seccomp will only run for injected // system calls, and thus we can use RET_KILL as our violation action. var defaultAction linux.BPFAction if probeSeccomp() { log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)") defaultAction = linux.SECCOMP_RET_KILL_THREAD } else { // We must rely on SYSEMU behavior; tracing with SYSEMU is broken. log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)") defaultAction = linux.SECCOMP_RET_ALLOW } // When creating the new child process, we specify SIGKILL as the // signal to deliver when the child exits. We never expect a subprocess // to exit; they are pooled and reused. This is done to ensure that if // a subprocess is OOM-killed, this process (and all other stubs, // transitively) will be killed as well. It's simply not possible to // safely handle a single stub getting killed: the exact state of // execution is unknown and not recoverable. // // In addition, we set the PTRACE_O_TRACEEXIT option to log more // information about a stub process when it receives a fatal signal. return attachedThread(uintptr(unix.SIGKILL)|unix.CLONE_FILES, defaultAction) } // attachedThread returns a new attached thread. // // Precondition: the runtime OS thread must be locked. func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) { // Create a BPF program that allows only the system calls needed by the // stub and all its children. This is used to create child stubs // (below), so we must include the ability to fork, but otherwise lock // down available calls only to what is needed.
rules := []seccomp.RuleSet{} if defaultAction != linux.SECCOMP_RET_ALLOW { rules = append(rules, seccomp.RuleSet{ Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_CLONE: seccomp.Or{ // Allow creation of new subprocesses (used by the master). seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.SIGKILL)}, // Allow creation of new threads within a single address space (used by address spaces). seccomp.PerArg{ seccomp.EqualTo( unix.CLONE_FILES | unix.CLONE_FS | unix.CLONE_SIGHAND | unix.CLONE_THREAD | unix.CLONE_PTRACE | unix.CLONE_VM)}, }, // For the initial process creation. unix.SYS_WAIT4: seccomp.MatchAll{}, unix.SYS_EXIT: seccomp.MatchAll{}, // For the stub prctl dance (all). unix.SYS_PRCTL: seccomp.PerArg{seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)}, unix.SYS_GETPPID: seccomp.MatchAll{}, // For the stub to stop itself (all). unix.SYS_GETPID: seccomp.MatchAll{}, unix.SYS_KILL: seccomp.PerArg{seccomp.AnyValue{}, seccomp.EqualTo(unix.SIGSTOP)}, // Injected to support the address space operations. unix.SYS_MMAP: seccomp.MatchAll{}, unix.SYS_MUNMAP: seccomp.MatchAll{}, }), Action: linux.SECCOMP_RET_ALLOW, }) } rules = appendArchSeccompRules(rules, defaultAction) instrs, _, err := seccomp.BuildProgram(rules, seccomp.ProgramOptions{ DefaultAction: defaultAction, BadArchAction: defaultAction, }) if err != nil { return nil, err } return forkStub(flags, instrs) } // In the child, this function must not acquire any locks, because they might // have been locked at the time of the fork. This means no rescheduling, no // malloc calls, and no new stack segments. For the same reason compiler does // not race instrument it. // //go:norace func forkStub(flags uintptr, instrs []bpf.Instruction) (*thread, error) { // Declare all variables up front in order to ensure that there's no // need for allocations between beforeFork & afterFork. var ( pid uintptr ppid uintptr errno unix.Errno ) // Remember the current ppid for the pdeathsig race. ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0) // Among other things, beforeFork masks all signals. beforeFork() // Do the clone. pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0) if errno != 0 { afterFork() return nil, errno } // Is this the parent? if pid != 0 { // Among other things, restore signal mask. afterFork() // Initialize the first thread. t := &thread{ tgid: int32(pid), tid: int32(pid), cpu: ^uint32(0), } if sig := t.wait(stopped); sig != unix.SIGSTOP { return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig) } t.attach() t.grabInitRegs() return t, nil } // Move the stub to a new session (and thus a new process group). This // prevents the stub from getting PTY job control signals intended only // for the sentry process. We must call this before restoring signal // mask. if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 { unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) } // afterForkInChild resets all signals to their default dispositions // and restores the signal mask to its pre-fork state. afterForkInChild() // Explicitly unmask all signals to ensure that the tracer can see // them. if errno := unmaskAllSignals(); errno != 0 { unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) } // Set an aggressive BPF filter for the stub and all it's children. See // the description of the BPF program built above. if errno := seccomp.SetFilterInChild(instrs); errno != 0 { unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) } // Enable cpuid-faulting. 
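	// (On arm64 this is a no-op, since CPUID faulting is an x86 feature; see
	// the arm64 implementation of enableCpuidFault in this package.)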
enableCpuidFault() // Call the stub; should not return. stubCall(stubStart, ppid) panic("unreachable") } // createStub creates a stub processes as a child of an existing subprocesses. // // Precondition: the runtime OS thread must be locked. func (s *subprocess) createStub() (*thread, error) { // There's no need to lock the runtime thread here, as this can only be // called from a context that is already locked. currentTID := int32(hosttid.Current()) t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread) // Pass the expected PPID to the child via R15. regs := t.initRegs initChildProcessPPID(®s, t.tgid) // Call fork in a subprocess. // // The new child must set up PDEATHSIG to ensure it dies if this // process dies. Since this process could die at any time, this cannot // be done via instrumentation from here. // // Instead, we create the child untraced, which will do the PDEATHSIG // setup and then SIGSTOP itself for our attach below. // // See above re: SIGKILL. pid, err := t.syscallIgnoreInterrupt( ®s, unix.SYS_CLONE, arch.SyscallArgument{Value: uintptr(unix.SIGKILL | unix.CLONE_FILES)}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}) if err != nil { return nil, fmt.Errorf("creating stub process: %v", err) } // Wait for child to enter group-stop, so we don't stop its // bootstrapping work with t.attach below. // // We unfortunately don't have a handy part of memory to write the wait // status. If the wait succeeds, we'll assume that it was the SIGSTOP. // If the child actually exited, the attach below will fail. _, err = t.syscallIgnoreInterrupt( &t.initRegs, unix.SYS_WAIT4, arch.SyscallArgument{Value: uintptr(pid)}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: unix.WALL | unix.WUNTRACED}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}) if err != nil { return nil, fmt.Errorf("waiting on stub process: %v", err) } childT := &thread{ tgid: int32(pid), tid: int32(pid), cpu: ^uint32(0), } childT.attach() return childT, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go000066400000000000000000000020671465435605700313560ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux && (amd64 || arm64) // +build linux // +build amd64 arm64 package ptrace import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) // unmaskAllSignals unmasks all signals on the current thread. 
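// It is called on the stub child's fork path (see forkStub), so, like the
// rest of that path, it must not allocate memory or grow the stack (hence the
// go:norace and go:nosplit annotations below).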
// //go:norace //go:nosplit func unmaskAllSignals() unix.Errno { var set linux.SignalSet _, _, errno := unix.RawSyscall6(unix.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0) return errno } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/ptrace/subprocess_unsafe.go000066400000000000000000000021601465435605700301310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build go1.18 // +build go1.18 // //go:linkname directives type-checked by checklinkname. Any other // non-linkname assumptions outside the Go 1 compatibility guarantee should // have an accompanied vet check or version guard build tag. package ptrace import ( _ "unsafe" // required for go:linkname. ) //go:linkname beforeFork syscall.runtime_BeforeFork func beforeFork() //go:linkname afterFork syscall.runtime_AfterFork func afterFork() //go:linkname afterForkInChild syscall.runtime_AfterForkInChild func afterForkInChild() golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/000077500000000000000000000000001465435605700243015ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/context_list.go000066400000000000000000000122441465435605700273520ustar00rootroot00000000000000package systrap // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type contextElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (contextElementMapper) linkerFor(elem *sharedContext) *sharedContext { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type contextList struct { head *sharedContext tail *sharedContext } // Reset resets list l to the empty state. func (l *contextList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *contextList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *contextList) Front() *sharedContext { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *contextList) Back() *sharedContext { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. 
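// Callers that need the length on a hot path typically track it separately
// instead; fastPathDispatcher, for example, maintains its own nr counter
// rather than calling Len.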
// //go:nosplit func (l *contextList) Len() (count int) { for e := l.Front(); e != nil; e = (contextElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *contextList) PushFront(e *sharedContext) { linker := contextElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { contextElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *contextList) PushFrontList(m *contextList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { contextElementMapper{}.linkerFor(l.head).SetPrev(m.tail) contextElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *contextList) PushBack(e *sharedContext) { linker := contextElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { contextElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *contextList) PushBackList(m *contextList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { contextElementMapper{}.linkerFor(l.tail).SetNext(m.head) contextElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *contextList) InsertAfter(b, e *sharedContext) { bLinker := contextElementMapper{}.linkerFor(b) eLinker := contextElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { contextElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *contextList) InsertBefore(a, e *sharedContext) { aLinker := contextElementMapper{}.linkerFor(a) eLinker := contextElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { contextElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *contextList) Remove(e *sharedContext) { linker := contextElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { contextElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { contextElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type contextEntry struct { next *sharedContext prev *sharedContext } // Next returns the entry that follows e in the list. // //go:nosplit func (e *contextEntry) Next() *sharedContext { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *contextEntry) Prev() *sharedContext { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *contextEntry) SetNext(elem *sharedContext) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. 
// //go:nosplit func (e *contextEntry) SetPrev(elem *sharedContext) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/context_queue.go000066400000000000000000000107741465435605700275310ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "sync/atomic" "gvisor.dev/gvisor/pkg/sentry/platform" ) // LINT.IfChange const ( // maxEntries is the size of the ringbuffer. maxContextQueueEntries uint32 = uint32(maxGuestContexts) + 1 ) type queuedContext struct { contextID uint32 threadID uint32 } // contextQueue is a structure shared with the each stub thread that is used to // signal to stub threads which contexts are ready to resume running. // // It is a lockless ringbuffer where threads try to police themselves on whether // they should continue waiting for a context or go to sleep if they are // unneeded. type contextQueue struct { // start is an index used for taking contexts out of the ringbuffer. start uint32 // end is an index used for putting new contexts into the ringbuffer. end uint32 // numActiveThreads indicates to the sentry how many stubs are running. // It is changed only by stub threads. numActiveThreads uint32 // numSpinningThreads indicates to the sentry how many stubs are waiting // to receive a context from the queue, and are not doing useful work. numSpinningThreads uint32 // numThreadsToWakeup is the number of threads requested by Sentry to wake up. // The Sentry increments it and stub threads decrements. numThreadsToWakeup uint32 // numActiveContext is a number of running and waiting contexts numActiveContexts uint32 // numAwakeContexts is the number of awake contexts. It includes all // active contexts and contexts that are running in the Sentry. numAwakeContexts uint32 fastPathDisabled uint32 usedFastPath uint32 ringbuffer [maxContextQueueEntries]uint64 } const ( // Each element of a contextQueue ring buffer is a sum of its index // shifted by CQ_INDEX_SHIFT and context_id. contextQueueIndexShift = 32 ) // LINT.ThenChange(./sysmsg/sysmsg_lib.c) func (q *contextQueue) init() { for i := uint32(0); i < maxContextQueueEntries; i++ { q.ringbuffer[i] = uint64(invalidContextID) } // Allow tests to trigger overflows of start and end. 
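	// start and end are free-running counters; every consumer reduces them
	// modulo maxContextQueueEntries, so starting them near the uint32 wrap
	// point only exercises overflow handling and does not affect correctness.
	// Each ring buffer element packs a producer index and a context ID as
	//   v := (uint64(idx) << contextQueueIndexShift) + uint64(contextID)
	// so the two halves can be recovered with uint32(v) and
	// uint32(v >> contextQueueIndexShift) respectively.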
idx := ^uint32(0) - maxContextQueueEntries*4 atomic.StoreUint32(&q.start, idx) atomic.StoreUint32(&q.end, idx) atomic.StoreUint32(&q.numActiveThreads, 0) atomic.StoreUint32(&q.numSpinningThreads, 0) atomic.StoreUint32(&q.numThreadsToWakeup, 0) atomic.StoreUint32(&q.numActiveContexts, 0) atomic.StoreUint32(&q.numAwakeContexts, 0) atomic.StoreUint32(&q.fastPathDisabled, 1) atomic.StoreUint32(&q.usedFastPath, 0) } func (q *contextQueue) isEmpty() bool { return atomic.LoadUint32(&q.start) == atomic.LoadUint32(&q.end) } func (q *contextQueue) queuedContexts() uint32 { return (atomic.LoadUint32(&q.end) + maxContextQueueEntries - atomic.LoadUint32(&q.start)) % maxContextQueueEntries } // add puts the given ctx onto the context queue, and records a state of // the subprocess after insertion to see if there are more active stub threads // or more waiting contexts. func (q *contextQueue) add(ctx *sharedContext) *platform.ContextError { ctx.startWaitingTS = cputicks() if fastpath.stubFastPath() { q.enableFastPath() } else { q.disableFastPath() } contextID := ctx.contextID atomic.AddUint32(&q.numActiveContexts, 1) next := atomic.AddUint32(&q.end, 1) if (next % maxContextQueueEntries) == (atomic.LoadUint32(&q.start) % maxContextQueueEntries) { // reachable only in case of corrupted memory return corruptedSharedMemoryErr("context queue is full, indicates tampering with queue counters") } idx := next - 1 next = idx % maxContextQueueEntries v := (uint64(idx) << contextQueueIndexShift) + uint64(contextID) atomic.StoreUint64(&q.ringbuffer[next], v) if atomic.SwapUint32(&q.usedFastPath, 0) != 0 { fastpath.usedStubFastPath.Store(true) } return nil } func (q *contextQueue) disableFastPath() { atomic.StoreUint32(&q.fastPathDisabled, 1) } func (q *contextQueue) enableFastPath() { atomic.StoreUint32(&q.fastPathDisabled, 0) } func (q *contextQueue) fastPathEnabled() bool { return atomic.LoadUint32(&q.fastPathDisabled) == 0 } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/context_queue_unsafe.go000066400000000000000000000015351465435605700310650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) func (q *contextQueue) wakeupSysmsgThread() { unix.RawSyscall6(unix.SYS_FUTEX, uintptr(unsafe.Pointer(&q.numThreadsToWakeup)), linux.FUTEX_WAKE, 1, 0, 0, 0) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/filters.go000066400000000000000000000103131465435605700262760ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/seccomp/precompiledseccomp" "gvisor.dev/gvisor/pkg/sentry/platform" ) // sysmsgThreadPriorityVarName is the seccomp filter variable name used to // encode the sysmsg thread priority. const sysmsgThreadPriorityVarName = "systrap_sysmsg_thread_priority" // systrapSeccomp implements platform.SeccompInfo. type systrapSeccomp struct{} // Variables implements `platform.SeccompInfo.Variables`. func (systrapSeccomp) Variables() precompiledseccomp.Values { initSysmsgThreadPriority() vars := precompiledseccomp.Values{} vars.SetUint64(sysmsgThreadPriorityVarName, uint64(sysmsgThreadPriority)) return vars } // ConfigKey implements `platform.SeccompInfo.ConfigKey`. func (systrapSeccomp) ConfigKey() string { return "systrap" } // SyscallFilters implements `platform.SeccompInfo.SyscallFilters`. func (systrapSeccomp) SyscallFilters(vars precompiledseccomp.Values) seccomp.SyscallRules { return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_PTRACE: seccomp.Or{ seccomp.PerArg{ seccomp.EqualTo(unix.PTRACE_ATTACH), }, seccomp.PerArg{ seccomp.EqualTo(unix.PTRACE_CONT), seccomp.AnyValue{}, seccomp.EqualTo(0), seccomp.EqualTo(0), }, seccomp.PerArg{ seccomp.EqualTo(unix.PTRACE_GETEVENTMSG), }, seccomp.PerArg{ seccomp.EqualTo(unix.PTRACE_GETREGSET), seccomp.AnyValue{}, seccomp.EqualTo(linux.NT_PRSTATUS), }, seccomp.PerArg{ seccomp.EqualTo(unix.PTRACE_GETSIGINFO), }, seccomp.PerArg{ seccomp.EqualTo(unix.PTRACE_SETOPTIONS), seccomp.AnyValue{}, seccomp.EqualTo(0), seccomp.EqualTo(unix.PTRACE_O_TRACESYSGOOD | unix.PTRACE_O_TRACEEXIT | unix.PTRACE_O_EXITKILL), }, seccomp.PerArg{ seccomp.EqualTo(unix.PTRACE_SETREGSET), seccomp.AnyValue{}, seccomp.EqualTo(linux.NT_PRSTATUS), }, seccomp.PerArg{ seccomp.EqualTo(linux.PTRACE_SETSIGMASK), seccomp.AnyValue{}, seccomp.EqualTo(8), }, seccomp.PerArg{ seccomp.EqualTo(unix.PTRACE_SYSEMU), seccomp.AnyValue{}, seccomp.EqualTo(0), seccomp.EqualTo(0), }, seccomp.PerArg{ seccomp.EqualTo(unix.PTRACE_DETACH), }, }, unix.SYS_TGKILL: seccomp.MatchAll{}, unix.SYS_WAIT4: seccomp.MatchAll{}, unix.SYS_IOCTL: seccomp.Or{ seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.SECCOMP_IOCTL_NOTIF_RECV), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.SECCOMP_IOCTL_NOTIF_SEND), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(linux.SECCOMP_IOCTL_NOTIF_SET_FLAGS), seccomp.EqualTo(linux.SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP), }, }, unix.SYS_WAITID: seccomp.PerArg{ seccomp.EqualTo(unix.P_PID), seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.WEXITED | unix.WNOHANG | unix.WNOWAIT), }, unix.SYS_SETPRIORITY: seccomp.PerArg{ seccomp.EqualTo(unix.PRIO_PROCESS), seccomp.AnyValue{}, seccomp.EqualTo(vars.GetUint64(sysmsgThreadPriorityVarName)), }, }).Merge(archSyscallFilters()) } // HottestSyscalls implements `platform.SeccompInfo.HottestSyscalls`. func (systrapSeccomp) HottestSyscalls() []uintptr { return hottestSyscalls() } // SeccompInfo returns seccomp filter info for the systrap platform. 
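// The rules returned by SyscallFilters are merged with archSyscallFilters()
// and can be precompiled ahead of time; the only run-time input is the sysmsg
// thread priority, which is plumbed through as the seccomp program variable
// named by sysmsgThreadPriorityVarName.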
func (p *Systrap) SeccompInfo() platform.SeccompInfo { return systrapSeccomp{} } // PrecompiledSeccompInfo implements // platform.Constructor.PrecompiledSeccompInfo. func (*constructor) PrecompiledSeccompInfo() []platform.SeccompInfo { return []platform.SeccompInfo{(*Systrap)(nil).SeccompInfo()} } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/filters_amd64.go000066400000000000000000000020741465435605700272760ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package systrap import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/seccomp" ) // archSyscallFilters returns architecture-specific syscalls made exclusively // by the systrap platform. func archSyscallFilters() seccomp.SyscallRules { return seccomp.SyscallRules{} } // hottestSyscalls returns the hottest syscalls used by the Systrap platform. func hottestSyscalls() []uintptr { return []uintptr{ unix.SYS_FUTEX, unix.SYS_NANOSLEEP, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/filters_arm64.go000066400000000000000000000026001465435605700273070ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package systrap import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" ) // archSyscallFilters returns architecture-specific syscalls made exclusively // by the systrap platform. func archSyscallFilters() seccomp.SyscallRules { return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_PTRACE: seccomp.Or{ seccomp.PerArg{ seccomp.EqualTo(unix.PTRACE_GETREGSET), seccomp.AnyValue{}, seccomp.EqualTo(linux.NT_ARM_TLS), }, seccomp.PerArg{ seccomp.EqualTo(unix.PTRACE_SETREGSET), seccomp.AnyValue{}, seccomp.EqualTo(linux.NT_ARM_TLS), }, }, }) } // hottestSyscalls returns the hottest syscalls used by the Systrap platform. func hottestSyscalls() []uintptr { return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/lib_amd64.s000066400000000000000000000014421465435605700262270ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "funcdata.h" #include "textflag.h" TEXT ·spinloop(SB),NOSPLIT|NOFRAME,$0 PAUSE RET TEXT ·cputicks(SB),NOSPLIT|NOFRAME,$0-8 LFENCE RDTSC SHLQ $32, DX ADDQ DX, AX MOVQ AX, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/lib_arm64.s000066400000000000000000000014341465435605700262460ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "funcdata.h" #include "textflag.h" TEXT ·spinloop(SB),NOSPLIT,$0 YIELD RET TEXT ·cputicks(SB),NOSPLIT,$0-8 ISB $15 WORD $0xd53be040 //MRS CNTVCT_EL0, R0 MOVD R0, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/metrics.go000066400000000000000000000546051465435605700263100ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "time" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/metric" ) // This file contains all logic related to context switch latency metrics. // // Latency metrics are the main method by which fastpath for both stub threads // and the sentry is enabled and disabled. We measure latency in CPU cycles. // // The high level overview of metric collection looks like this: // 1a) When a context is switched from the sentry to the stub, the sentry // records the time it was put into the context queue. // 1b) When a stub thread picks up the context from the context queue, the stub // thread records the time when it's about to switch back to user code. // Getting the diff between these timestamps gives us the stub-bound latency. // // 2a) When a stub thread gives back a context to the sentry for handling, // it records the time just before notifying the sentry task goroutine. // 2b) When the task goroutine sees that it has been notified, it records the // time. // Getting the diff between these timestamps gives us the sentry-bound latency. // // 3) Both latencies are recorded at once via recordLatency(). This means // there is a delay on getting stubBoundLatencies. 
In practice this should not // matter that much due to our relatively large latency measurement periods. // // There is a bucket array for each latency type, where each bucket is of size // `bucketIncrements`. Latencies are collected in time periods of length // `recordingPeriod`, and measurements for the current period are stored // in the `latencies` variable. type latencyBuckets [numLatencyBuckets]atomicbitops.Uint64 type cpuTicks uint64 const ( numLatencyBuckets = 80 bucketIncrements = 2048 // minNecessaryRecordings defines the minimum amount of recordings we // want to see in latencyBuckets in order to get a reasonable median. minNecessaryRecordings = 5 ) // neverEnableFastPath is used for completely disabling the fast path. // It is set once so doesn't need any synchronizations. var neverEnableFastPath bool // latencyRecorder is used to collect latency metrics. type latencyRecorder struct { stubBound latencyBuckets sentryBound latencyBuckets } // latencies stores the latency counts for the current measurement period. var latencies latencyRecorder // record increments the correct bucket assigned to the given latency l. // //go:nosplit func (b *latencyBuckets) record(l cpuTicks) { bucket := l / bucketIncrements if bucket >= numLatencyBuckets { bucket = numLatencyBuckets - 1 } b[bucket].Add(1) } // getMedian returns a latency measure in the range of // [bucketIncrements, numLatencyBuckets * bucketIncrements], or 0 if unable to // find a median in the latencyBuckets. func (b *latencyBuckets) getMedian() cpuTicks { i := 0 j := numLatencyBuckets - 1 var totalForwards, totalBackwards uint64 for i <= j { if totalForwards < totalBackwards { totalForwards += b[i].Load() i++ } else { totalBackwards += b[j].Load() j-- } } if totalForwards+totalBackwards < minNecessaryRecordings { return 0 } return cpuTicks(max(uint64(i), 1) * bucketIncrements) } // merge combines two latencyBuckets instances. func (b *latencyBuckets) merge(other *latencyBuckets) { for i := 0; i < numLatencyBuckets; i++ { b[i].Add(other[i].Load()) } } // reset zeroes all buckets. func (b *latencyBuckets) reset() { for i := 0; i < numLatencyBuckets; i++ { b[i].Store(0) } } // recordLatency records the latency of both the sentry->stub and the // stub->sentry context switches. // For the stub->sentry context switch, the final timestamp is taken by this // function. // Preconditions: // - ctx.isAcked() is true. // //go:nosplit func (sc *sharedContext) recordLatency() { // Record stub->sentry latency. sentryBoundLatency := sc.getStateChangedTimeDiff() if sentryBoundLatency != 0 { latencies.sentryBound.record(sentryBoundLatency) } // Record sentry->stub latency. stubBoundLatency := sc.getAckedTimeDiff() if stubBoundLatency != 0 { latencies.stubBound.record(stubBoundLatency) } updateDebugMetrics(stubBoundLatency, sentryBoundLatency) } // When a measurement period ends, the latencies are used to determine the fast // path state. Fastpath is independently enabled for both the sentry and stub // threads, and is modeled as the following state machine: // // +----------StubFPOff,SentryFPOff-------+ // | ^ ^ | // V | | V // +-->StubFPOn,SentryFPOff StubFPOff,SentryFPOn<--+ // | | ^ | ^ | // | V | V | | // | StubFPOn,SentryFPOn StubFPOn,SentryFPOn | // | LastEnabledSentryFP LastEnabledStubFP | // | | | | // | | | | // | +---------> StubFPOn,SentryFPOn <-------+ | // | | | | // |______________________________| |___________________________| // // The default state is to have both stub and sentry fastpath OFF. 
// A state transition to enable one fastpath is done when // fpState.(stub|sentry)FPBackoff reaches 0. (stub|sentry)FPBackoff is // decremented every recording period that the corresponding fastpath is // disabled. // A state transition to disable one fastpath is decided through the predicates // shouldDisableStubFP or shouldDisableSentryFP, and activated with // disableStubFP or disableSentryFP. // // Why have 3 states for both FPs being ON? The logic behind that is to do with // the fact that fastpaths are interdependent. Enabling one fastpath can have // negative effects on the latency metrics of the other in the event that there // are not enough CPUs to run the fastpath. So it's very possible that the system // finds itself in a state where it's beneficial to run one fastpath but not the // other based on the workload it's doing. For this case, we need to remember // what the last stable state was to return to, because the metrics will likely // be bad enough for both sides to be eligible for being disabled. // // Once the system establishes that having both the stub and sentry fastpath ON // is acceptable, it does prioritize disabling stub fastpath over disabling // sentry fastpath, because the sentry fastpath at most takes one thread to spin. const ( recordingPeriod = 400 * time.Microsecond fastPathBackoffMin = 2 maxRecentFPFailures = 9 numConsecutiveFailsToDisableFP = 2 ) // fastPathState is used to keep track of long term metrics that span beyond // one measurement period. type fastPathState struct { // stubBoundBaselineLatency and sentryBoundBaselineLatency record all // latency measures recorded during periods when their respective // fastpath was OFF. stubBoundBaselineLatency latencyBuckets sentryBoundBaselineLatency latencyBuckets // stubFPBackoff and sentryFPBackoff are the periods remaining until // the system attempts to use the fastpath again. stubFPBackoff int sentryFPBackoff int // stubFPRecentFailures and sentryFPRecentFailures are counters in the // range [0, maxRecentFPFailures] that are incremented by // disable(Stub|Sentry)FP and decremented by (stub|sentry)FPSuccess. // They are used to set the backoffs. stubFPRecentFailures int sentryFPRecentFailures int consecutiveStubFPFailures int consecutiveSentryFPFailures int _ [hostarch.CacheLineSize]byte // stubFastPathEnabled is a global flag referenced in other parts of // systrap to determine if the stub fast path is enabled or not. stubFastPathEnabled atomicbitops.Bool _ [hostarch.CacheLineSize]byte // sentryFastPathEnabled is a global flag referenced in other parts of // systrap to determine if the sentry fastpath is enabled or not. sentryFastPathEnabled atomicbitops.Bool _ [hostarch.CacheLineSize]byte // nrMaxAwakeStubThreads is the maximum number of awake stub threads over // all subprocesses at the this moment. nrMaxAwakeStubThreads atomicbitops.Uint32 // usedStubFastPath and usedSentryFastPath are reset every recording // period, and are populated in case the system actually used the // fastpath (i.e. stub or dispatcher spun for some time without work). _ [hostarch.CacheLineSize]byte usedStubFastPath atomicbitops.Bool _ [hostarch.CacheLineSize]byte usedSentryFastPath atomicbitops.Bool _ [hostarch.CacheLineSize]byte // curState is the current fastpath state function, which is called at // the end of every recording period. 
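	// It is one of sentryOffStubOff, sentryOnStubOff, sentryOffStubOn,
	// sentryOnStubOnLastEnabledSentry, sentryOnStubOnLastEnabledStub or
	// sentryOnStubOn, corresponding to the state machine described above.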
curState func(*fastPathState) } var ( fastpath = fastPathState{ stubFPBackoff: fastPathBackoffMin, sentryFPBackoff: fastPathBackoffMin, curState: sentryOffStubOff, } // fastPathContextLimit is the maximum number of contexts after which the fast // path in stub threads is disabled. Its value can be higher than the number of // CPU-s, because the Sentry is running with higher priority than stub threads, // deepSleepTimeout is much shorter than the Linux scheduler timeslice, so the // only thing that matters here is whether the Sentry handles syscall faster // than the overhead of scheduling another stub thread. // // It is set after maxSysmsgThreads is initialized. fastPathContextLimit = uint32(0) ) // controlFastPath is used to spawn a goroutine when creating the Systrap // platform. func controlFastPath() { fastPathContextLimit = uint32(maxSysmsgThreads * 2) for { time.Sleep(recordingPeriod) fastpath.curState(&fastpath) // Reset FP trackers. fastpath.usedStubFastPath.Store(false) fastpath.usedSentryFastPath.Store(false) } } // getBackoff returns the number of recording periods that fastpath should remain // disabled for, based on the num of recentFailures. func getBackoff(recentFailures int) int { return 1 << recentFailures } //go:nosplit func (s *fastPathState) sentryFastPath() bool { return s.sentryFastPathEnabled.Load() } //go:nosplit func (s *fastPathState) stubFastPath() bool { return s.stubFastPathEnabled.Load() && (s.nrMaxAwakeStubThreads.Load() <= fastPathContextLimit) } // enableSentryFP is a wrapper to unconditionally enable sentry FP and increment // a debug metric. func (s *fastPathState) enableSentryFP() { s.sentryFastPathEnabled.Store(true) numTimesSentryFastPathEnabled.Increment() } // disableSentryFP returns true if the sentry fastpath was able to be disabled. // // It takes two calls to disableSentryFP without any calls to sentryFPSuccess in // between to disable the sentry fastpath. This is done in order to mitigate the // effects of outlier measures due to rdtsc inaccuracies. func (s *fastPathState) disableSentryFP() bool { s.consecutiveSentryFPFailures++ if s.consecutiveSentryFPFailures < numConsecutiveFailsToDisableFP { return false } s.consecutiveSentryFPFailures = 0 s.sentryFastPathEnabled.Store(false) numTimesSentryFastPathDisabled.Increment() s.sentryFPBackoff = getBackoff(s.sentryFPRecentFailures) s.sentryFPRecentFailures = min(maxRecentFPFailures, s.sentryFPRecentFailures+1) return true } // enableStubFP is a wrapper to unconditionally enable stub FP and increment // a debug metric. func (s *fastPathState) enableStubFP() { s.stubFastPathEnabled.Store(true) numTimesStubFastPathEnabled.Increment() } // disableStubFP returns true if the stub fastpath was able to be disabled. // // It takes two calls to disableStubFP without any calls to stubFPSuccess in // between to disable the stub fastpath. This is done in order to mitigate the // effects of outlier measures due to rdtsc inaccuracies. 
func (s *fastPathState) disableStubFP() bool { s.consecutiveStubFPFailures++ if s.consecutiveStubFPFailures < numConsecutiveFailsToDisableFP { return false } s.consecutiveStubFPFailures = 0 s.stubFastPathEnabled.Store(false) numTimesStubFastPathDisabled.Increment() s.stubFPBackoff = getBackoff(s.stubFPRecentFailures) s.stubFPRecentFailures = min(maxRecentFPFailures, s.stubFPRecentFailures+1) return true } func (s *fastPathState) sentryFPSuccess() { s.sentryFPRecentFailures = max(0, s.sentryFPRecentFailures-1) s.consecutiveSentryFPFailures = 0 } func (s *fastPathState) stubFPSuccess() { s.stubFPRecentFailures = max(0, s.stubFPRecentFailures-1) s.consecutiveStubFPFailures = 0 } // shouldDisableSentryFP returns true if the metrics indicate sentry fastpath // should be disabled. func (s *fastPathState) shouldDisableSentryFP(stubMedian, sentryMedian cpuTicks) bool { if !s.usedSentryFastPath.Load() { return false } stubBaseline := s.stubBoundBaselineLatency.getMedian() sentryBaseline := s.sentryBoundBaselineLatency.getMedian() if sentryMedian < sentryBaseline { // Assume the number of productive stubs is the core count on the // system, not counting the 1 core taken by the dispatcher for // the fast path. n := cpuTicks(maxSysmsgThreads - 1) // If the sentry fastpath is causing the stub latency to be // higher than normal, the point at which it's considered to be // too high is when the time saved via the sentry fastpath is // less than the time lost via higher stub latency (with some // error margin). Assume that all possible stub threads are // active for this comparison. diff := (sentryBaseline - sentryMedian) * n errorMargin := stubBaseline / 8 return (stubMedian > stubBaseline) && (stubMedian-stubBaseline) > (diff+errorMargin) } // Running the fastpath resulted in higher sentry latency than baseline? // This does not happen often, but it is an indication that the fastpath // wasn't used to full effect: for example the dispatcher kept changing, // and that there was not enough CPU to place a new dispatcher fast // enough. // // If there isn't enough CPU we will most likely see large stub latency // regressions, and should disable the fastpath. return stubMedian > (stubBaseline + stubBaseline/2) } // shouldDisableStubFP returns true if the metrics indicate stub fastpath should // be disabled. func (s *fastPathState) shouldDisableStubFP(stubMedian, sentryMedian cpuTicks) bool { if !s.usedStubFastPath.Load() { return false } stubBaseline := s.stubBoundBaselineLatency.getMedian() sentryBaseline := s.sentryBoundBaselineLatency.getMedian() if stubMedian < stubBaseline { // If the stub fastpath is causing the sentry latency to be // higher than normal, the point at which it's considered to be // too high is when the time saved via the stub fastpath is // less than the time lost via higher sentry latency (with some // error margin). Unlike the stub latency, the sentry latency is // largely dependent on one thread (the dispatcher). diff := stubBaseline - stubMedian errorMargin := sentryBaseline / 8 return (sentryMedian > sentryBaseline) && (sentryMedian-sentryBaseline) > (diff+errorMargin) } // Running the fastpath resulted in higher stub latency than baseline? // This is either an indication that there isn't enough CPU to schedule // stub threads to run the fastpath, or the user workload has changed to // be such that it returns less often to the sentry. // // If there isn't enough CPU we will most likely see large sentry latency // regressions, and should disable the fastpath. 
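	// "Large" here means more than 1.5x the recorded baseline; smaller
	// regressions are tolerated as measurement noise.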
return sentryMedian > (sentryBaseline + sentryBaseline/2) } // The following functions are used for state transitions in the sentry/stub // fastpath state machine described above. func sentryOffStubOff(s *fastPathState) { if neverEnableFastPath { return } periodStubBoundMedian := latencies.stubBound.getMedian() s.stubBoundBaselineLatency.merge(&latencies.stubBound) latencies.stubBound.reset() if periodStubBoundMedian != 0 { s.stubFPBackoff = max(s.stubFPBackoff-1, 0) } periodSentryBoundMedian := latencies.sentryBound.getMedian() s.sentryBoundBaselineLatency.merge(&latencies.sentryBound) latencies.sentryBound.reset() if periodSentryBoundMedian != 0 { s.sentryFPBackoff = max(s.sentryFPBackoff-1, 0) } if s.sentryFPBackoff == 0 { s.enableSentryFP() s.curState = sentryOnStubOff } else if s.stubFPBackoff == 0 { s.enableStubFP() s.curState = sentryOffStubOn } } func sentryOnStubOff(s *fastPathState) { periodStubBoundMedian := latencies.stubBound.getMedian() periodSentryBoundMedian := latencies.sentryBound.getMedian() if periodStubBoundMedian == 0 || periodSentryBoundMedian == 0 { return } if s.shouldDisableSentryFP(periodStubBoundMedian, periodSentryBoundMedian) { if s.disableSentryFP() { s.curState = sentryOffStubOff } } else { s.sentryFPSuccess() // If we are going to keep sentry FP on that means stub latency // was fine; update the baseline. s.stubBoundBaselineLatency.merge(&latencies.stubBound) latencies.stubBound.reset() s.stubFPBackoff = max(s.stubFPBackoff-1, 0) if s.stubFPBackoff == 0 { s.enableStubFP() s.curState = sentryOnStubOnLastEnabledStub } } latencies.sentryBound.reset() } func sentryOffStubOn(s *fastPathState) { periodStubBoundMedian := latencies.stubBound.getMedian() periodSentryBoundMedian := latencies.sentryBound.getMedian() if periodStubBoundMedian == 0 || periodSentryBoundMedian == 0 { return } if s.shouldDisableStubFP(periodStubBoundMedian, periodSentryBoundMedian) { if s.disableStubFP() { s.curState = sentryOffStubOff } } else { s.stubFPSuccess() s.sentryBoundBaselineLatency.merge(&latencies.sentryBound) latencies.sentryBound.reset() s.sentryFPBackoff = max(s.sentryFPBackoff-1, 0) if s.sentryFPBackoff == 0 { s.enableSentryFP() s.curState = sentryOnStubOnLastEnabledSentry } } latencies.stubBound.reset() } func sentryOnStubOnLastEnabledSentry(s *fastPathState) { periodStubBoundMedian := latencies.stubBound.getMedian() periodSentryBoundMedian := latencies.sentryBound.getMedian() if periodStubBoundMedian == 0 || periodSentryBoundMedian == 0 { return } latencies.stubBound.reset() latencies.sentryBound.reset() if s.shouldDisableSentryFP(periodStubBoundMedian, periodSentryBoundMedian) { if s.disableSentryFP() { s.curState = sentryOffStubOn } } else { s.curState = sentryOnStubOn s.sentryFPSuccess() s.stubFPSuccess() } } func sentryOnStubOnLastEnabledStub(s *fastPathState) { periodStubBoundMedian := latencies.stubBound.getMedian() periodSentryBoundMedian := latencies.sentryBound.getMedian() if periodStubBoundMedian == 0 || periodSentryBoundMedian == 0 { return } latencies.stubBound.reset() latencies.sentryBound.reset() if s.shouldDisableStubFP(periodStubBoundMedian, periodSentryBoundMedian) { if s.disableStubFP() { s.curState = sentryOnStubOff } } else { s.curState = sentryOnStubOn s.sentryFPSuccess() s.stubFPSuccess() } } func sentryOnStubOn(s *fastPathState) { periodStubBoundMedian := latencies.stubBound.getMedian() periodSentryBoundMedian := latencies.sentryBound.getMedian() if periodStubBoundMedian == 0 || periodSentryBoundMedian == 0 { return } latencies.stubBound.reset() 
latencies.sentryBound.reset() // Prioritize disabling stub fastpath over sentry fastpath, since sentry // only spins with one thread. if s.shouldDisableStubFP(periodStubBoundMedian, periodSentryBoundMedian) { if s.disableStubFP() { s.curState = sentryOnStubOff } } else if s.shouldDisableSentryFP(latencies.stubBound.getMedian(), latencies.sentryBound.getMedian()) { if s.disableSentryFP() { s.curState = sentryOffStubOn } } else { s.sentryFPSuccess() s.stubFPSuccess() } } // Profiling metrics intended for debugging purposes. var ( numTimesSentryFastPathDisabled = SystrapProfiling.MustCreateNewUint64Metric("/systrap/numTimesSentryFastPathDisabled", metric.Uint64Metadata{Cumulative: true}) numTimesSentryFastPathEnabled = SystrapProfiling.MustCreateNewUint64Metric("/systrap/numTimesSentryFastPathEnabled", metric.Uint64Metadata{Cumulative: true}) numTimesStubFastPathDisabled = SystrapProfiling.MustCreateNewUint64Metric("/systrap/numTimesStubFastPathDisabled", metric.Uint64Metadata{Cumulative: true}) numTimesStubFastPathEnabled = SystrapProfiling.MustCreateNewUint64Metric("/systrap/numTimesStubFastPathEnabled", metric.Uint64Metadata{Cumulative: true}) numTimesStubKicked = SystrapProfiling.MustCreateNewUint64Metric("/systrap/numTimesStubKicked", metric.Uint64Metadata{Cumulative: true}) stubLatWithin1kUS = SystrapProfiling.MustCreateNewUint64Metric("/systrap/stubLatWithin1kUS", metric.Uint64Metadata{Cumulative: true}) stubLatWithin5kUS = SystrapProfiling.MustCreateNewUint64Metric("/systrap/stubLatWithin5kUS", metric.Uint64Metadata{Cumulative: true}) stubLatWithin10kUS = SystrapProfiling.MustCreateNewUint64Metric("/systrap/stubLatWithin10kUS", metric.Uint64Metadata{Cumulative: true}) stubLatWithin20kUS = SystrapProfiling.MustCreateNewUint64Metric("/systrap/stubLatWithin20kUS", metric.Uint64Metadata{Cumulative: true}) stubLatWithin40kUS = SystrapProfiling.MustCreateNewUint64Metric("/systrap/stubLatWithin40kUS", metric.Uint64Metadata{Cumulative: true}) stubLatGreater40kUS = SystrapProfiling.MustCreateNewUint64Metric("/systrap/stubLatGreater40kUS", metric.Uint64Metadata{Cumulative: true}) sentryLatWithin1kUS = SystrapProfiling.MustCreateNewUint64Metric("/systrap/sentryLatWithin1kUS", metric.Uint64Metadata{Cumulative: true}) sentryLatWithin5kUS = SystrapProfiling.MustCreateNewUint64Metric("/systrap/sentryLatWithin5kUS", metric.Uint64Metadata{Cumulative: true}) sentryLatWithin10kUS = SystrapProfiling.MustCreateNewUint64Metric("/systrap/sentryLatWithin10kUS", metric.Uint64Metadata{Cumulative: true}) sentryLatWithin20kUS = SystrapProfiling.MustCreateNewUint64Metric("/systrap/sentryLatWithin20kUS", metric.Uint64Metadata{Cumulative: true}) sentryLatWithin40kUS = SystrapProfiling.MustCreateNewUint64Metric("/systrap/sentryLatWithin40kUS", metric.Uint64Metadata{Cumulative: true}) sentryLatGreater40kUS = SystrapProfiling.MustCreateNewUint64Metric("/systrap/sentryLatGreater40kUS", metric.Uint64Metadata{Cumulative: true}) ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/shared_context.go000066400000000000000000000260401465435605700276440ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "fmt" "strconv" "sync" "sync/atomic" "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" "gvisor.dev/gvisor/pkg/syncevent" ) const ( ackReset uint64 = 0 stateChangedReset uint64 = 0 ) // sharedContext is an abstraction for interactions that the sentry has to // perform with memory shared between it and the stub threads used for contexts. // // Any access to shared memory should most likely have a getter/setter through // this struct. This is due to the following reasons: // - The memory needs to be read or modified atomically because there is no // (trusted) synchronization between the sentry and the stub processes. // - Data read from shared memory may require validation before it can be used. type sharedContext struct { contextEntry // subprocess is the subprocess that this sharedContext instance belongs to. subprocess *subprocess // contextID is the ID corresponding to the sysmsg.ThreadContext memory slot // that is used for this sharedContext. contextID uint32 // shared is the handle to the shared memory that the sentry task go-routine // reads from and writes to. // NOTE: Using this handle directly without a getter from this function should // most likely be avoided due to concerns listed above. shared *sysmsg.ThreadContext // sync is used by the context go-routine to wait for events from the // dispatcher. sync syncevent.Waiter startWaitingTS int64 kicked bool // The task associated with the context fell asleep. sleeping bool } // String returns the ID of this shared context. func (sc *sharedContext) String() string { return strconv.Itoa(int(sc.contextID)) } const ( // sharedContextReady indicates that a context has new events. sharedContextReady = syncevent.Set(1 << iota) // sharedContextKicked indicates that a new stub thread should be woken up. sharedContextKicked // sharedContextSlowPath indicates that a context has to be waited for in the // slow path. sharedContextSlowPath // sharedContextDispatch indicates that a context go-routine has to start the wait loop. sharedContextDispatch ) func (s *subprocess) getSharedContext() (*sharedContext, error) { s.mu.Lock() defer s.mu.Unlock() id, ok := s.threadContextPool.Get() if !ok { return nil, fmt.Errorf("subprocess has too many active tasks (%d); failed to create a new one", maxGuestContexts) } s.IncRef() sc := sharedContext{ subprocess: s, contextID: uint32(id), shared: s.getThreadContextFromID(id), } sc.shared.Init(invalidThreadID) sc.sync.Init() sc.sleeping = true return &sc, nil } func (sc *sharedContext) release() { if sc == nil { return } if !sc.sleeping { sc.subprocess.decAwakeContexts() } sc.subprocess.threadContextPool.Put(uint64(sc.contextID)) sc.subprocess.DecRef(sc.subprocess.release) } func (sc *sharedContext) isActiveInSubprocess(s *subprocess) bool { if sc == nil { return false } return sc.subprocess == s } // NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. 
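// It marks the shared context as interrupted and, if a stub thread is
// currently known to be running it, sends SignalInterrupt to that thread via
// tgkill so that the context is handed back to the sentry promptly.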
func (sc *sharedContext) NotifyInterrupt() { // If this context is not being worked on right now we need to mark it as // interrupted so the next executor does not start working on it. atomic.StoreUint32(&sc.shared.Interrupt, 1) if sc.threadID() == invalidThreadID { return } sc.subprocess.sysmsgThreadsMu.Lock() defer sc.subprocess.sysmsgThreadsMu.Unlock() threadID := atomic.LoadUint32(&sc.shared.ThreadID) sysmsgThread, ok := sc.subprocess.sysmsgThreads[threadID] if !ok { // This is either an invalidThreadID or another garbage value; either way we // don't know which thread to interrupt; best we can do is mark the context. return } t := sysmsgThread.thread if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(platform.SignalInterrupt)); e != 0 { panic(fmt.Sprintf("failed to interrupt the child process %d: %v", t.tid, e)) } } func (sc *sharedContext) state() sysmsg.ContextState { return sc.shared.State.Get() } func (sc *sharedContext) setState(state sysmsg.ContextState) { sc.shared.State.Set(state) } func (sc *sharedContext) setInterrupt() { atomic.StoreUint32(&sc.shared.Interrupt, 1) } func (sc *sharedContext) clearInterrupt() { atomic.StoreUint32(&sc.shared.Interrupt, 0) } func (sc *sharedContext) setFPStateChanged() { atomic.StoreUint64(&sc.shared.FPStateChanged, 1) } func (sc *sharedContext) threadID() uint32 { return atomic.LoadUint32(&sc.shared.ThreadID) } // EnableSentryFastPath indicates that the polling mode is enabled for the // Sentry. It has to be called before putting the context into the context queue. func (sc *sharedContext) enableSentryFastPath() { atomic.StoreUint32(&sc.shared.SentryFastPath, 1) } // DisableSentryFastPath indicates that the polling mode for the sentry is // disabled for the Sentry. func (sc *sharedContext) disableSentryFastPath() { atomic.StoreUint32(&sc.shared.SentryFastPath, 0) } func (sc *sharedContext) isAcked() bool { return atomic.LoadUint64(&sc.shared.AckedTime) != ackReset } // getAckedTimeDiff returns the time difference between when this context was // put into the context queue, and when this context was acked by a stub thread. // Precondition: must be called after isAcked() == true. // //go:nosplit func (sc *sharedContext) getAckedTimeDiff() cpuTicks { ackedAt := atomic.LoadUint64(&sc.shared.AckedTime) if ackedAt < uint64(sc.startWaitingTS) { log.Infof("likely memory tampering detected: found a condition where ackedAt (%d) < startWaitingTS (%d)", ackedAt, uint64(sc.startWaitingTS)) return 0 } return cpuTicks(ackedAt - uint64(sc.startWaitingTS)) } // getStateChangedTimeDiff returns the time difference between the time the // context state got changed by a stub thread, and now. 
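// As with getAckedTimeDiff, the loaded timestamp is validated first: a value
// that appears to come from the future is logged as likely memory tampering
// and reported as 0.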
// //go:nosplit func (sc *sharedContext) getStateChangedTimeDiff() cpuTicks { changedAt := atomic.LoadUint64(&sc.shared.StateChangedTime) now := uint64(cputicks()) if now < changedAt { log.Infof("likely memory tampering detected: found a condition where now (%d) < changedAt (%d)", now, changedAt) return 0 } return cpuTicks(now - changedAt) } func (sc *sharedContext) resetLatencyMeasures() { atomic.StoreUint64(&sc.shared.AckedTime, ackReset) atomic.StoreUint64(&sc.shared.StateChangedTime, stateChangedReset) } const ( contextPreemptTimeoutNsec = 10 * 1000 * 1000 // 10ms contextCheckupTimeoutSec = 5 stuckContextTimeout = 30 * time.Second ) var errDeadSubprocess = fmt.Errorf("subprocess died") func (sc *sharedContext) sleepOnState(state sysmsg.ContextState) error { timeout := unix.Timespec{ Sec: 0, Nsec: contextPreemptTimeoutNsec, } sentInterruptOnce := false deadline := time.Now().Add(stuckContextTimeout) for sc.state() == state { errno := sc.shared.SleepOnState(state, &timeout) if errno == 0 { continue } if errno != unix.ETIMEDOUT { panic(fmt.Sprintf("error waiting for state: %v", errno)) } if !sc.subprocess.alive() { return errDeadSubprocess } if time.Now().After(deadline) { log.Warningf("Systrap task goroutine has been waiting on ThreadContext.State futex too long. ThreadContext: %v", sc) } if sentInterruptOnce { log.Warningf("The context is still running: %v", sc) continue } if !sc.isAcked() || sc.subprocess.contextQueue.isEmpty() { continue } sc.NotifyInterrupt() sentInterruptOnce = true timeout.Sec = contextCheckupTimeoutSec timeout.Nsec = 0 } return nil } type fastPathDispatcher struct { // list is used only from the loop method and so it isn't protected by // any lock. list contextList mu sync.Mutex // nr is the number of contexts in the queue. // +checklocks:mu nr int // entrants contains new contexts that haven't been added to `list` yet. // +checklocks:mu entrants contextList } var dispatcher fastPathDispatcher const ( // deepSleepTimeout is the timeout after which both stub threads and the // dispatcher consider whether to stop polling. They need to have elapsed // this timeout twice in a row in order to stop, so the actual timeout // can be considered to be (deepSleepTimeout*2). Falling asleep after two // shorter timeouts instead of one long timeout is done in order to // mitigate the effects of rdtsc inaccuracies. // // The value is 20µs for 2GHz CPU. 40µs matches the sentry<->stub // round trip in the pure deep sleep case. deepSleepTimeout = uint64(40000) handshakeTimeout = uint64(1000) ) // loop is processing contexts in the queue. Only one instance of it can be // running, because it has exclusive access to the list. // // target is the context associated with the current go-routine. func (q *fastPathDispatcher) loop(target *sharedContext) { done := false processed := 0 firstTimeout := false slowPath := false startedSpinning := cputicks() for { var ctx, next *sharedContext q.mu.Lock() q.nr -= processed // Add new contexts to the list. q.list.PushBackList(&q.entrants) ctx = q.list.Front() q.mu.Unlock() if done { if ctx != nil { // Wake up the next go-routine to run the loop. 
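				// Handing sharedContextDispatch to the front context makes
				// its goroutine the next dispatcher; waitFor calls loop again
				// when it observes this event.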
ctx.sync.Receiver().Notify(sharedContextDispatch) } break } slowPath = !fastpath.sentryFastPath() || slowPath processed = 0 now := cputicks() for ctx = q.list.Front(); ctx != nil; ctx = next { next = ctx.Next() event := sharedContextReady if ctx.state() == sysmsg.ContextStateNone { if slowPath { event = sharedContextSlowPath } else if !ctx.kicked && uint64(now-ctx.startWaitingTS) > handshakeTimeout { if ctx.isAcked() { ctx.kicked = true continue } event = sharedContextKicked } else { continue } } processed++ q.list.Remove(ctx) if ctx == target { done = true } ctx.sync.Receiver().Notify(event) } if processed != 0 { startedSpinning = now firstTimeout = false } else { fastpath.usedSentryFastPath.Store(true) } // If dispatcher has been spinning for too long, send this // dispatcher to sleep. if uint64(now-startedSpinning) > deepSleepTimeout { slowPath = firstTimeout firstTimeout = true } yield() } } func (q *fastPathDispatcher) waitFor(ctx *sharedContext) syncevent.Set { events := syncevent.NoEvents q.mu.Lock() q.entrants.PushBack(ctx) q.nr++ if q.nr == 1 { events = sharedContextDispatch } q.mu.Unlock() for { if events&sharedContextDispatch != 0 { ctx.sync.Ack(sharedContextDispatch) q.loop(ctx) } events = ctx.sync.WaitAndAckAll() if events&sharedContextDispatch == 0 { break } } return events } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/shared_context_norace.go000066400000000000000000000013231465435605700311700ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !race // +build !race package systrap import ( "gvisor.dev/gvisor/pkg/sync" ) func yield() { sync.Goyield() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/shared_context_race.go000066400000000000000000000013621465435605700306360ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build race // +build race package systrap // yield is just a stub because sync.Goyield() is very expensive with the race // detector. func yield() {} golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/stub_amd64.s000066400000000000000000000153661465435605700264500ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "funcdata.h" #include "textflag.h" #define SYS_GETPID 39 // +checkconst unix SYS_GETPID #define SYS_EXIT 60 // +checkconst unix SYS_EXIT #define SYS_KILL 62 // +checkconst unix SYS_KILL #define SYS_GETPPID 110 // +checkconst unix SYS_GETPPID #define SIGKILL 9 // +checkconst unix SIGKILL #define SIGSTOP 19 // +checkconst unix SIGSTOP #define SYS_PRCTL 157 // +checkconst unix SYS_PRCTL #define SYS_EXIT_GROUP 231 // +checkconst unix SYS_EXIT_GROUP #define PR_SET_PDEATHSIG 1 // +checkconst unix PR_SET_PDEATHSIG #define SYS_FUTEX 202 // +checkconst unix SYS_FUTEX #define FUTEX_WAKE 1 // +checkconst linux FUTEX_WAKE #define FUTEX_WAIT 0 // +checkconst linux FUTEX_WAIT #define NEW_STUB 1 // +checkconst . _NEW_STUB #define RUN_SYSCALL_LOOP 5 // +checkconst . _RUN_SYSCALL_LOOP #define RUN_SECCOMP_LOOP 6 // +checkconst . _RUN_SECCOMP_LOOP // syscallSentryMessage offsets. #define SENTRY_MESSAGE_STATE 0 // +checkoffset . syscallSentryMessage.state #define SENTRY_MESSAGE_SYSNO 8 // +checkoffset . syscallSentryMessage.sysno #define SENTRY_MESSAGE_ARGS 16 // +checkoffset . syscallSentryMessage.args #define SENTRY_MESSAGE_ARG0 (SENTRY_MESSAGE_ARGS + 0*8) #define SENTRY_MESSAGE_ARG1 (SENTRY_MESSAGE_ARGS + 1*8) #define SENTRY_MESSAGE_ARG2 (SENTRY_MESSAGE_ARGS + 2*8) #define SENTRY_MESSAGE_ARG3 (SENTRY_MESSAGE_ARGS + 3*8) #define SENTRY_MESSAGE_ARG4 (SENTRY_MESSAGE_ARGS + 4*8) #define SENTRY_MESSAGE_ARG5 (SENTRY_MESSAGE_ARGS + 5*8) // syscallStubMessage offsets. #define STUB_MESSAGE_OFFSET 4096 // +checkconst . syscallStubMessageOffset #define STUB_MESSAGE_RET 0 // +checkoffset . syscallStubMessage.ret // initStubProcess bootstraps the child and sends itself SIGSTOP to wait for attach. // // R15 contains the expected PPID. R15 is used instead of a more typical DI // since syscalls will clobber DI and createStub wants to pass a new PPID to // grandchildren. // // This should not be used outside the context of a new ptrace child (as the // function is otherwise a bunch of nonsense). TEXT ·initStubProcess(SB),NOSPLIT|NOFRAME,$0 begin: // N.B. This loop only executes in the context of a single-threaded // fork child. MOVQ $SYS_PRCTL, AX MOVQ $PR_SET_PDEATHSIG, DI MOVQ $SIGKILL, SI SYSCALL CMPQ AX, $0 JNE error // If the parent already died before we called PR_SET_DEATHSIG then // we'll have an unexpected PPID. MOVQ $SYS_GETPPID, AX SYSCALL CMPQ AX, $0 JL error CMPQ AX, R15 JNE parent_dead MOVQ $SYS_GETPID, AX SYSCALL CMPQ AX, $0 JL error MOVQ $0, BX // SIGSTOP to wait for attach. // // The SYSCALL instruction will be used for future syscall injection by // thread.syscall. MOVQ AX, DI MOVQ $SYS_KILL, AX MOVQ $SIGSTOP, SI SYSCALL // The sentry sets BX to $NEW_STUB when creating stub process. CMPQ BX, $NEW_STUB JE clone // The sentry sets BX to $RUN_SYSCALL_LOOP when requesting a syscall // thread. CMPQ BX, $RUN_SYSCALL_LOOP JE syscall_loop CMPQ BX, $RUN_SECCOMP_LOOP JE seccomp_loop // Notify the Sentry that syscall exited. done: INT $3 // Be paranoid. JMP done clone: // subprocess.createStub clones a new stub process that is untraced, // thus executing this code. 
We setup the PDEATHSIG before SIGSTOPing // ourselves for attach by the tracer. // // R15 has been updated with the expected PPID. CMPQ AX, $0 JE begin // The clone syscall returns a non-zero value. JMP done error: // Exit with -errno. MOVQ AX, DI NEGQ DI MOVQ $SYS_EXIT, AX SYSCALL HLT parent_dead: MOVQ $SYS_EXIT, AX MOVQ $1, DI SYSCALL HLT // syscall_loop handles requests from the Sentry to execute syscalls. // Look at syscall_thread for more details. // // syscall_loop is running without using the stack because it can be // compromised by sysmsg (guest) threads that run in the same address // space. syscall_loop: // while (sentryMessage->state != R13) { // futex(sentryMessage->state, FUTEX_WAIT, 0, NULL, NULL, 0); // } MOVQ R12, DI MOVQ $FUTEX_WAIT, SI MOVQ $0, R10 MOVQ $0, R8 MOVQ $0, R9 wait_for_syscall: MOVL SENTRY_MESSAGE_STATE(DI), DX CMPL DX, R13 JE execute_syscall MOVQ $SYS_FUTEX, AX SYSCALL JMP wait_for_syscall execute_syscall: // ret = syscall(sysno, args...) MOVQ SENTRY_MESSAGE_SYSNO(R12), AX MOVQ SENTRY_MESSAGE_ARG0(R12), DI MOVQ SENTRY_MESSAGE_ARG1(R12), SI MOVQ SENTRY_MESSAGE_ARG2(R12), DX MOVQ SENTRY_MESSAGE_ARG3(R12), R10 MOVQ SENTRY_MESSAGE_ARG4(R12), R8 MOVQ SENTRY_MESSAGE_ARG5(R12), R9 SYSCALL // stubMessage->ret = ret MOVQ AX, (STUB_MESSAGE_OFFSET + STUB_MESSAGE_RET)(R12) // for { // if futex(sentryMessage->state, FUTEX_WAKE, 1) == 1 { // break; // } // } MOVQ R12, DI MOVQ $FUTEX_WAKE, SI MOVQ $1, DX MOVQ $0, R10 MOVQ $0, R8 MOVQ $0, R9 wake_up_sentry: MOVQ $SYS_FUTEX, AX SYSCALL // futex returns the number of waiters that were woken up. If futex // returns 0 here, it means that the Sentry has not called futex_wait // yet and we need to try again. The value of sentryMessage->state // isn't changed, so futex_wake is the only way to wake up the Sentry. CMPQ AX, $1 JNE wake_up_sentry INCL R13 JMP syscall_loop seccomp_loop: // SYS_EXIT_GROUP triggers seccomp notifications. MOVQ $SYS_EXIT_GROUP, AX SYSCALL // ret = syscall(sysno, args...) MOVQ SENTRY_MESSAGE_SYSNO(R12), AX MOVQ SENTRY_MESSAGE_ARG0(R12), DI MOVQ SENTRY_MESSAGE_ARG1(R12), SI MOVQ SENTRY_MESSAGE_ARG2(R12), DX MOVQ SENTRY_MESSAGE_ARG3(R12), R10 MOVQ SENTRY_MESSAGE_ARG4(R12), R8 MOVQ SENTRY_MESSAGE_ARG5(R12), R9 SYSCALL // stubMessage->ret = ret MOVQ AX, (STUB_MESSAGE_OFFSET + STUB_MESSAGE_RET)(R12) // for { // if futex(sentryMessage->state, FUTEX_WAKE, 1) == 1 { // break; // } // } MOVQ R12, DI MOVQ $FUTEX_WAKE, SI MOVQ $1, DX MOVQ $0, R10 MOVQ $0, R8 MOVQ $0, R9 JMP seccomp_loop // func addrOfInitStubProcess() uintptr TEXT ·addrOfInitStubProcess(SB), $0-8 MOVQ $·initStubProcess(SB), AX MOVQ AX, ret+0(FP) RET // stubCall calls the stub function at the given address with the given PPID. // // This is a distinct function because stub, above, may be mapped at any // arbitrary location, and stub has a specific binary API (see above). TEXT ·stubCall(SB),NOSPLIT|NOFRAME,$0-16 MOVQ addr+0(FP), AX MOVQ pid+8(FP), R15 JMP AX golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/stub_arm64.s000066400000000000000000000145061465435605700264610ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "funcdata.h" #include "textflag.h" #define SYS_GETPID 172 // +checkconst unix SYS_GETPID #define SYS_EXIT 93 // +checkconst unix SYS_EXIT #define SYS_KILL 129 // +checkconst unix SYS_KILL #define SYS_GETPPID 173 // +checkconst unix SYS_GETPPID #define SIGKILL 9 // +checkconst unix SIGKILL #define SIGSTOP 19 // +checkconst unix SIGSTOP #define SYS_PRCTL 167 // +checkconst unix SYS_PRCTL #define SYS_EXIT_GROUP 94 // +checkconst unix SYS_EXIT_GROUP #define PR_SET_PDEATHSIG 1 // +checkconst unix PR_SET_PDEATHSIG #define SYS_FUTEX 98 // +checkconst unix SYS_FUTEX #define FUTEX_WAKE 1 // +checkconst linux FUTEX_WAKE #define FUTEX_WAIT 0 // +checkconst linux FUTEX_WAIT #define NEW_STUB 1 // +checkconst . _NEW_STUB #define RUN_SYSCALL_LOOP 5 // +checkconst . _RUN_SYSCALL_LOOP #define RUN_SECCOMP_LOOP 6 // +checkconst . _RUN_SECCOMP_LOOP // syscallSentryMessage offsets. #define SENTRY_MESSAGE_STATE 0 // +checkoffset . syscallSentryMessage.state #define SENTRY_MESSAGE_SYSNO 8 // +checkoffset . syscallSentryMessage.sysno #define SENTRY_MESSAGE_ARGS 16 // +checkoffset . syscallSentryMessage.args #define SENTRY_MESSAGE_ARG0 (SENTRY_MESSAGE_ARGS + 0*8) #define SENTRY_MESSAGE_ARG1 (SENTRY_MESSAGE_ARGS + 1*8) #define SENTRY_MESSAGE_ARG2 (SENTRY_MESSAGE_ARGS + 2*8) #define SENTRY_MESSAGE_ARG3 (SENTRY_MESSAGE_ARGS + 3*8) #define SENTRY_MESSAGE_ARG4 (SENTRY_MESSAGE_ARGS + 4*8) #define SENTRY_MESSAGE_ARG5 (SENTRY_MESSAGE_ARGS + 5*8) // syscallStubMessage offsets. #define STUB_MESSAGE_OFFSET 4096 // +checkconst . syscallStubMessageOffset #define STUB_MESSAGE_RET 0 // +checkoffset . syscallStubMessage.ret // initStubProcess bootstraps the child and sends itself SIGSTOP to wait for attach. // // R7 contains the expected PPID. // // This should not be used outside the context of a new ptrace child (as the // function is otherwise a bunch of nonsense). TEXT ·initStubProcess(SB),NOSPLIT,$0 begin: // N.B. This loop only executes in the context of a single-threaded // fork child. MOVD $SYS_PRCTL, R8 MOVD $PR_SET_PDEATHSIG, R0 MOVD $SIGKILL, R1 SVC CMN $4095, R0 BCS error // If the parent already died before we called PR_SET_DEATHSIG then // we'll have an unexpected PPID. MOVD $SYS_GETPPID, R8 SVC CMP R0, R7 BNE parent_dead MOVD $SYS_GETPID, R8 SVC CMP $0x0, R0 BLT error MOVD $0, R9 // SIGSTOP to wait for attach. // // The SYSCALL instruction will be used for future syscall injection by // thread.syscall. MOVD $SYS_KILL, R8 MOVD $SIGSTOP, R1 SVC // The sentry sets R9 to $NEW_STUB when creating stub process. CMP $NEW_STUB, R9 BEQ clone // The sentry sets R9 to $RUN_SYSCALL_LOOP when creating a new syscall // thread. CMP $RUN_SYSCALL_LOOP, R9 BEQ syscall_loop CMP $RUN_SECCOMP_LOOP, R9 BEQ seccomp_loop done: // Notify the Sentry that syscall exited. BRK $3 B done // Be paranoid. clone: // subprocess.createStub clones a new stub process that is untraced, // thus executing this code. We setup the PDEATHSIG before SIGSTOPing // ourselves for attach by the tracer. // // R7 has been updated with the expected PPID. CMP $0, R0 BEQ begin // The clone system call returned a non-zero value. 
B done error: // Exit with -errno. NEG R0, R0 MOVD $SYS_EXIT, R8 SVC HLT parent_dead: MOVD $SYS_EXIT, R8 MOVD $1, R0 SVC HLT // syscall_loop handles requests from the Sentry to execute syscalls. // Look at syscall_thread for more details. // // syscall_loop is running without using the stack because it can be // compromised by sysmsg (guest) threads that run in the same address // space. syscall_loop: // while (sentryMessage->state != R13) { // futex(sentryMessage->state, FUTEX_WAIT, 0, NULL, NULL, 0); // } MOVD R12, R0 MOVD $FUTEX_WAIT, R1 MOVD $0, R3 MOVD $0, R4 MOVD $0, R5 wait_for_syscall: // Move the sentry message state to R2. MOVW SENTRY_MESSAGE_STATE(R12), R2 CMPW R2, R13 BEQ execute_syscall MOVD $SYS_FUTEX, R8 SVC JMP wait_for_syscall execute_syscall: MOVD SENTRY_MESSAGE_SYSNO(R12), R8 MOVD SENTRY_MESSAGE_ARG0(R12), R0 MOVD SENTRY_MESSAGE_ARG1(R12), R1 MOVD SENTRY_MESSAGE_ARG2(R12), R2 MOVD SENTRY_MESSAGE_ARG3(R12), R3 MOVD SENTRY_MESSAGE_ARG4(R12), R4 MOVD SENTRY_MESSAGE_ARG5(R12), R5 SVC // stubMessage->ret = ret MOVD R0, (STUB_MESSAGE_OFFSET + STUB_MESSAGE_RET)(R12) // for { // if futex(sentryMessage->state, FUTEX_WAKE, 1) == 1 { // break; // } // } MOVD $FUTEX_WAKE, R1 MOVD $1, R2 MOVD $0, R3 MOVD $0, R4 MOVD $0, R5 MOVD $SYS_FUTEX, R8 wake_up_sentry: MOVD R12, R0 SVC // futex returns the number of waiters that were woken up. If futex // returns 0 here, it means that the Sentry has not called futex_wait // yet and we need to try again. The value of sentryMessage->state // isn't changed, so futex_wake is the only way to wake up the Sentry. CMP $1, R0 BNE wake_up_sentry ADDW $1, R13, R13 JMP syscall_loop seccomp_loop: // SYS_EXIT_GROUP triggers seccomp notifications. MOVD $SYS_EXIT_GROUP, R8 SVC MOVD SENTRY_MESSAGE_SYSNO(R12), R8 MOVD SENTRY_MESSAGE_ARG0(R12), R0 MOVD SENTRY_MESSAGE_ARG1(R12), R1 MOVD SENTRY_MESSAGE_ARG2(R12), R2 MOVD SENTRY_MESSAGE_ARG3(R12), R3 MOVD SENTRY_MESSAGE_ARG4(R12), R4 MOVD SENTRY_MESSAGE_ARG5(R12), R5 SVC // stubMessage->ret = ret MOVD R0, (STUB_MESSAGE_OFFSET + STUB_MESSAGE_RET)(R12) JMP seccomp_loop // func addrOfInitStubProcess() uintptr TEXT ·addrOfInitStubProcess(SB), $0-8 MOVD $·initStubProcess(SB), R0 MOVD R0, ret+0(FP) RET // stubCall calls the stub function at the given address with the given PPID. // // This is a distinct function because stub, above, may be mapped at any // arbitrary location, and stub has a specific binary API (see above). TEXT ·stubCall(SB),NOSPLIT,$0-16 MOVD addr+0(FP), R0 MOVD pid+8(FP), R7 B (R0) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/stub_defs.go000066400000000000000000000017011465435605700266050ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( // Required for fact extraction. _ "golang.org/x/sys/unix" _ "gvisor.dev/gvisor/pkg/abi/linux" ) // _NEW_STUB is the value of the BX register when a new stub thread is created. 
const _NEW_STUB = 1 // _RUN_SYSCALL_LOOP is the value of the BX register when the syscall loop is executed. const _RUN_SYSCALL_LOOP = 5 const _RUN_SECCOMP_LOOP = 6 golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/stub_unsafe.go000066400000000000000000000226561465435605700271570ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "math/rand" "reflect" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" ) // initStubProcess is defined in arch-specific assembly. func initStubProcess() // addrOfInitStubProcess returns the start address of initStubProcess. // // In Go 1.17+, Go references to assembly functions resolve to an ABIInternal // wrapper function rather than the function itself. We must reference from // assembly to get the ABI0 (i.e., primary) address. func addrOfInitStubProcess() uintptr // stubCall calls the stub at the given address with the given pid. func stubCall(addr, pid uintptr) // unsafeSlice returns a slice for the given address and length. func unsafeSlice(addr uintptr, length int) (slice []byte) { sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) sh.Data = addr sh.Len = length sh.Cap = length return } // prepareSeccompRules compiles the stub process seccomp filters and fills in // the sock_fprog structure, so the stub process only needs to call the // seccomp system call to apply these filters. // //go:nosplit func prepareSeccompRules(stubSysmsgStart, stubSysmsgRules, stubSysmsgRulesLen, stubSyscallRules, stubSyscallRulesLen uintptr) { instrs := sysmsgThreadRules(stubSysmsgStart) copySeccompRulesToStub(instrs, stubSysmsgRules, stubSysmsgRulesLen) instrs = sysmsgSyscallNotifyRules() copySeccompRulesToStub(instrs, stubSyscallRules, stubSyscallRulesLen) } func copySeccompRulesToStub(instrs []bpf.Instruction, stubAddr, size uintptr) { progLen := len(instrs) * int(unsafe.Sizeof(bpf.Instruction{})) progPtr := stubAddr + unsafe.Sizeof(linux.SockFprog{}) if progLen+int(unsafe.Sizeof(linux.SockFprog{})) > int(size) { panic("not enough space for sysmsg seccomp rules") } var targetSlice []bpf.Instruction sh := (*reflect.SliceHeader)(unsafe.Pointer(&targetSlice)) sh.Data = progPtr sh.Cap = len(instrs) sh.Len = sh.Cap copy(targetSlice, instrs) // stubSysmsgRules and progPtr are addresses from a stub mapping which // is mapped once and never moved, so it is safe to use unsafe.Pointer // this way for them. sockProg := (*linux.SockFprog)(unsafe.Pointer(stubAddr)) sockProg.Len = uint16(len(instrs)) sockProg.Filter = (*linux.BPFInstruction)(unsafe.Pointer(progPtr)) // Make the seccomp rules stub read-only.
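// The region now holds a linux.SockFprog header at stubAddr followed by the
// BPF instructions, so the stub only has to pass stubAddr as the filter
// argument of seccomp(2). Making the region read-only ensures that threads
// sharing the stub address space cannot rewrite the filters before they are
// installed.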
if _, _, errno := unix.RawSyscall( unix.SYS_MPROTECT, stubAddr, size, unix.PROT_READ); errno != 0 { panic("mprotect failed: " + errno.Error()) } } // stubInit allocates and initializes the stub memory region which includes: // - the stub code to do initial initialization of a stub process. // - the sysmsg signal handler code to notify sentry about new events such as // system calls, memory faults, etc. // - precompiled seccomp rules to trap application system calls. // - reserved space for stub-thread stack regions. func stubInit() { // *--------stubStart-------------------* // |--------stubInitProcess-------------| // | stub code to init stub processes | // |--------stubSysmsgStart-------------| // | sysmsg code | // |--------stubSysmsgRuleStart---------| // | precompiled sysmsg seccomp rules | // |--------guard page------------------| // |--------random gap------------------| // | | // |--------stubSysmsgStack-------------| // | Reserved space for per-thread | // | sysmsg stacks. | // |----------stubContextQueue----------| // | Shared ringbuffer queue for stubs | // | to select the next context. | // |--------stubThreadContextRegion-----| // | Reserved space for thread contexts | // *------------------------------------* // Grab the existing stub. procStubBegin := addrOfInitStubProcess() procStubLen := int(safecopy.FindEndAddress(procStubBegin) - procStubBegin) procStubSlice := unsafeSlice(procStubBegin, procStubLen) mapLen, _ := hostarch.PageRoundUp(uintptr(procStubLen)) stubSysmsgStart = mapLen stubSysmsgLen := len(sysmsg.SighandlerBlob) mapLen, _ = hostarch.PageRoundUp(mapLen + uintptr(stubSysmsgLen)) stubSysmsgRules = mapLen stubSysmsgRulesLen = hostarch.PageSize * 2 mapLen += stubSysmsgRulesLen stubSyscallRules = mapLen stubSyscallRulesLen = hostarch.PageSize mapLen += stubSyscallRulesLen stubROMapEnd = mapLen // Add a guard page. mapLen += hostarch.PageSize stubSysmsgStack = mapLen // Allocate maxGuestThreads plus ONE because each per-thread stack // has to be aligned to sysmsg.PerThreadMemSize. // Look at sysmsg/sighandler.c:sysmsg_addr() for more details. mapLen, _ = hostarch.PageRoundUp(mapLen + sysmsg.PerThreadMemSize*(uintptr(maxChildThreads+1))) // Allocate context queue region stubContextQueueRegion = mapLen stubContextQueueRegionLen, _ = hostarch.PageRoundUp(unsafe.Sizeof(contextQueue{})) mapLen += stubContextQueueRegionLen stubSpinningThreadQueueAddr = mapLen mapLen += sysmsg.SpinningQueueMemSize // Allocate thread context region stubContextRegion = mapLen stubContextRegionLen = sysmsg.AllocatedSizeofThreadContextStruct * (maxGuestContexts + 1) mapLen, _ = hostarch.PageRoundUp(mapLen + stubContextRegionLen) // Randomize stubStart address. randomOffset := uintptr(rand.Uint64() * hostarch.PageSize) maxRandomOffset := maxRandomOffsetOfStubAddress - mapLen stubStart = uintptr(0) for offset := uintptr(0); offset < maxRandomOffset; offset += hostarch.PageSize { stubStart = maxStubUserAddress + (randomOffset+offset)%maxRandomOffset // Map the target address for the stub. // // We don't use FIXED here because we don't want to unmap // something that may have been there already. We just walk // down the address space until we find a place where the stub // can be placed. addr, _, _ := unix.RawSyscall6( unix.SYS_MMAP, stubStart, stubROMapEnd, unix.PROT_WRITE|unix.PROT_READ, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS, 0 /* fd */, 0 /* offset */) if addr == stubStart { break } if addr != 0 { // Unmap the region we've mapped accidentally. 
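// The kernel placed the mapping at a different address than requested, so
// this attempt is unusable; release it and retry with the next offset.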
unix.RawSyscall(unix.SYS_MUNMAP, addr, stubROMapEnd, 0) } stubStart = uintptr(0) } if stubStart == 0 { // This will happen only if we exhaust the entire address // space, and it will take a long, long time. panic("failed to map stub") } // Randomize stubSysmsgStack address. gap := uintptr(rand.Uint64()) * hostarch.PageSize % (maximumUserAddress - stubStart - mapLen) stubSysmsgStack += uintptr(gap) stubContextQueueRegion += uintptr(gap) stubContextRegion += uintptr(gap) // Copy the stub to the address. targetSlice := unsafeSlice(stubStart, procStubLen) copy(targetSlice, procStubSlice) stubInitProcess = stubStart stubSysmsgStart += stubStart stubSysmsgStack += stubStart stubROMapEnd += stubStart stubContextQueueRegion += stubStart stubSpinningThreadQueueAddr += stubStart stubContextRegion += stubStart // Align stubSysmsgStack to the per-thread stack size. // Look at sysmsg/sighandler.c:sysmsg_addr() for more details. if offset := stubSysmsgStack % sysmsg.PerThreadMemSize; offset != 0 { stubSysmsgStack += sysmsg.PerThreadMemSize - offset } stubSysmsgRules += stubStart stubSyscallRules += stubStart targetSlice = unsafeSlice(stubSysmsgStart, stubSysmsgLen) copy(targetSlice, sysmsg.SighandlerBlob) // Initialize stub globals p := (*uint64)(unsafe.Pointer(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_deep_sleep_timeout))) *p = deepSleepTimeout p = (*uint64)(unsafe.Pointer(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_context_region))) *p = uint64(stubContextRegion) p = (*uint64)(unsafe.Pointer(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_stub_start))) *p = uint64(stubStart) archState := (*sysmsg.ArchState)(unsafe.Pointer(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_arch_state))) archState.Init() p = (*uint64)(unsafe.Pointer(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_context_queue_addr))) *p = uint64(stubContextQueueRegion) p = (*uint64)(unsafe.Pointer(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_spinning_queue_addr))) *p = uint64(stubSpinningThreadQueueAddr) prepareSeccompRules(stubSysmsgStart, stubSysmsgRules, stubSysmsgRulesLen, stubSyscallRules, stubSyscallRulesLen) // Make the stub executable. if _, _, errno := unix.RawSyscall( unix.SYS_MPROTECT, stubStart, stubROMapEnd-stubStart, unix.PROT_EXEC|unix.PROT_READ); errno != 0 { panic("mprotect failed: " + errno.Error()) } // Set the end. stubEnd = stubStart + mapLen + uintptr(gap) log.Debugf("stubStart %x stubSysmsgStart %x stubSysmsgStack %x, stubContextQueue %x, stubThreadContextRegion %x, mapLen %x", stubStart, stubSysmsgStart, stubSysmsgStack, stubContextQueueRegion, stubContextRegion, mapLen) log.Debugf(archState.String()) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/subprocess.go000066400000000000000000001127671465435605700270360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package systrap import ( "fmt" "os" "runtime" "sync" "sync/atomic" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/pool" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/usertrap" "gvisor.dev/gvisor/pkg/sentry/usage" ) var ( // globalPool tracks all subprocesses in various state: active or available for // reuse. globalPool = subprocessPool{} // maximumUserAddress is the largest possible user address. maximumUserAddress = linux.TaskSize // stubInitAddress is the initial attempt link address for the stub. stubInitAddress = linux.TaskSize // maxRandomOffsetOfStubAddress is the maximum offset for randomizing a // stub address. It is set to the default value of mm.mmap_rnd_bits. // // Note: Tools like ThreadSanitizer don't like when the memory layout // is changed significantly. maxRandomOffsetOfStubAddress = (linux.TaskSize >> 7) & ^(uintptr(hostarch.PageSize) - 1) // maxStubUserAddress is the largest possible user address for // processes running inside gVisor. It is fixed because // * we don't want to reveal a stub address. // * it has to be the same across checkpoint/restore. maxStubUserAddress = maximumUserAddress - maxRandomOffsetOfStubAddress ) // Linux kernel errnos which "should never be seen by user programs", but will // be revealed to ptrace syscall exit tracing. // // These constants are only used in subprocess.go. const ( ERESTARTSYS = unix.Errno(512) ERESTARTNOINTR = unix.Errno(513) ERESTARTNOHAND = unix.Errno(514) ) // thread is a traced thread; it is a thread identifier. // // This is a convenience type for defining ptrace operations. type thread struct { tgid int32 tid int32 // sysmsgStackID is a stack ID in subprocess.sysmsgStackPool. sysmsgStackID uint64 // initRegs are the initial registers for the first thread. // // These are used for the register set for system calls. initRegs arch.Registers logPrefix atomic.Pointer[string] } // requestThread is used to request a new sysmsg thread. A thread identifier will // be sent into the thread channel. type requestThread struct { thread chan *thread } // requestStub is used to request a new stub process. type requestStub struct { done chan *thread } // maxSysmsgThreads is the maximum number of sysmsg threads that a subprocess // can create. It is based on GOMAXPROCS and set once, so it must be set after // GOMAXPROCS has been adjusted (see loader.go:Args.NumCPU). var maxSysmsgThreads = 0 // maxChildThreads is the max number of all child system threads that a // subprocess can create, including sysmsg threads. var maxChildThreads = 0 const ( // maxGuestContexts specifies the maximum number of task contexts that a // subprocess can handle. maxGuestContexts = 4095 // invalidContextID specifies an invalid ID. invalidContextID uint32 = 0xfefefefe // invalidThreadID is used to indicate that a context is not being worked on by // any sysmsg thread. invalidThreadID uint32 = 0xfefefefe ) // subprocess is a collection of threads being traced. type subprocess struct { platform.NoAddressSpaceIO subprocessRefs // requests is used to signal creation of new threads. requests chan any // sysmsgInitRegs is used to reset sysemu regs. 
sysmsgInitRegs arch.Registers // mu protects the following fields. mu sync.Mutex // faultedContexts is the set of contexts for which it's possible that // platformContext.lastFaultSP == this subprocess. faultedContexts map[*platformContext]struct{} // sysmsgStackPool is a pool of available sysmsg stacks. sysmsgStackPool pool.Pool // threadContextPool is a pool of available sysmsg.ThreadContext IDs. threadContextPool pool.Pool // threadContextRegion defines the ThreadContext memory region start // within the sentry address space. threadContextRegion uintptr // memoryFile is used to allocate a sysmsg stack which is shared // between a stub process and the Sentry. memoryFile *pgalloc.MemoryFile // usertrap is the state of the usertrap table which contains syscall // trampolines. usertrap *usertrap.State syscallThreadMu sync.Mutex syscallThread *syscallThread // sysmsgThreadsMu protects sysmsgThreads and numSysmsgThreads sysmsgThreadsMu sync.Mutex // sysmsgThreads is a collection of all active sysmsg threads in the // subprocess. sysmsgThreads map[uint32]*sysmsgThread // numSysmsgThreads counts the number of active sysmsg threads; we use a // counter instead of using len(sysmsgThreads) because we need to synchronize // how many threads get created _before_ the creation happens. numSysmsgThreads int // contextQueue is a queue of all contexts that are ready to switch back to // user mode. contextQueue *contextQueue // dead indicates whether the subprocess is alive or not. dead atomicbitops.Bool } var seccompNotifyIsSupported = false func initSeccompNotify() { _, _, errno := unix.Syscall(seccomp.SYS_SECCOMP, linux.SECCOMP_SET_MODE_FILTER, linux.SECCOMP_FILTER_FLAG_NEW_LISTENER, 0) switch errno { case unix.EFAULT: // seccomp unotify is supported. case unix.EINVAL: log.Warningf("Seccomp user-space notification mechanism isn't " + "supported by the kernel (available since Linux 5.0).") default: panic(fmt.Sprintf("seccomp returns unexpected code: %d", errno)) } } func (s *subprocess) initSyscallThread(ptraceThread *thread, seccompNotify bool) error { s.syscallThreadMu.Lock() defer s.syscallThreadMu.Unlock() id, ok := s.sysmsgStackPool.Get() if !ok { panic("unable to allocate a sysmsg stub thread") } ptraceThread.sysmsgStackID = id t := syscallThread{ subproc: s, thread: ptraceThread, } if err := t.init(seccompNotify); err != nil { panic(fmt.Sprintf("failed to create a syscall thread")) } s.syscallThread = &t s.syscallThread.detach() return nil } func handlePtraceSyscallRequestError(req any, format string, values ...any) { switch req.(type) { case requestThread: req.(requestThread).thread <- nil case requestStub: req.(requestStub).done <- nil } log.Warningf("handlePtraceSyscallRequest failed: "+format, values...) } // handlePtraceSyscallRequest executes system calls that can't be run via // syscallThread without using ptrace. Look at the description of syscallThread // to get more details about its limitations. 
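// Two request types are supported: requestThread clones a new traced thread
// off the syscall thread, and requestStub creates a new stub process.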
func (s *subprocess) handlePtraceSyscallRequest(req any) { s.syscallThreadMu.Lock() defer s.syscallThreadMu.Unlock() runtime.LockOSThread() defer runtime.UnlockOSThread() if err := s.syscallThread.attach(); err != nil { handlePtraceSyscallRequestError(req, err.Error()) return } defer s.syscallThread.detach() ptraceThread := s.syscallThread.thread switch r := req.(type) { case requestThread: t, err := ptraceThread.clone() if err != nil { handlePtraceSyscallRequestError(req, "error initializing thread: %v", err) return } // Since the new thread was created with // clone(CLONE_PTRACE), it will begin execution with // SIGSTOP pending and with this thread as its tracer. // (Hopefully nobody tgkilled it with a signal < // SIGSTOP before the SIGSTOP was delivered, in which // case that signal would be delivered before SIGSTOP.) if sig := t.wait(stopped); sig != unix.SIGSTOP { handlePtraceSyscallRequestError(req, "error waiting for new clone: expected SIGSTOP, got %v", sig) return } t.initRegs = ptraceThread.initRegs // Set the parent death signal to SIGKILL. _, err = t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_PRCTL, arch.SyscallArgument{Value: linux.PR_SET_PDEATHSIG}, arch.SyscallArgument{Value: uintptr(unix.SIGKILL)}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, ) if err != nil { handlePtraceSyscallRequestError(req, "prctl: %v", err) return } id, ok := s.sysmsgStackPool.Get() if !ok { handlePtraceSyscallRequestError(req, "unable to allocate a sysmsg stub thread") return } t.sysmsgStackID = id if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(unix.SIGSTOP)); e != 0 { handlePtraceSyscallRequestError(req, "tkill failed: %v", e) return } // Detach the thread. t.detach() // Return the thread. r.thread <- t case requestStub: t, err := ptraceThread.createStub() if err != nil { handlePtraceSyscallRequestError(req, "unable to create a stub process: %v", err) return } r.done <- t } } // newSubprocess returns a usable subprocess. // // This will either be a newly created subprocess, or one from the global pool. // The create function will be called in the former case, which is guaranteed // to happen with the runtime thread locked. // // seccompNotify indicates the way of communicating with syscall threads. // If it is false, futexes are used. Otherwise, seccomp-unotify is used. // seccomp-unotify can't be used for the source pool process, because it is a // parent of all other stub processes, but only one filter can be installed // with SECCOMP_FILTER_FLAG_NEW_LISTENER. func newSubprocess(create func() (*thread, error), memoryFile *pgalloc.MemoryFile, seccompNotify bool) (*subprocess, error) { if sp := globalPool.fetchAvailable(); sp != nil { sp.subprocessRefs.InitRefs() sp.usertrap = usertrap.New() return sp, nil } // The following goroutine is responsible for creating the first traced // thread, and responding to requests to make additional threads in the // traced process. The process will be killed and reaped when the // request channel is closed, which happens in Release below. requests := make(chan any) // Ready.
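// Requests are served one at a time by the goroutine started below, and the
// caller blocks on its reply channel. As a rough sketch, createSysmsgThread
// obtains a new traced thread like this:
//
//	r := requestThread{thread: make(chan *thread)}
//	s.requests <- r
//	t := <-r.thread // nil if the request failed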
sp := &subprocess{ requests: requests, faultedContexts: make(map[*platformContext]struct{}), sysmsgStackPool: pool.Pool{Start: 0, Limit: uint64(maxChildThreads)}, threadContextPool: pool.Pool{Start: 0, Limit: maxGuestContexts}, memoryFile: memoryFile, sysmsgThreads: make(map[uint32]*sysmsgThread), } sp.subprocessRefs.InitRefs() runtime.LockOSThread() defer runtime.UnlockOSThread() // Initialize the syscall thread. ptraceThread, err := create() if err != nil { return nil, err } sp.sysmsgInitRegs = ptraceThread.initRegs if err := sp.initSyscallThread(ptraceThread, seccompNotify); err != nil { return nil, err } go func() { // S/R-SAFE: Platform-related. // Wait for requests to create threads. for req := range requests { sp.handlePtraceSyscallRequest(req) } // Requests should never be closed. panic("unreachable") }() sp.unmap() sp.usertrap = usertrap.New() sp.mapSharedRegions() sp.mapPrivateRegions() // The main stub doesn't need sysmsg threads. if seccompNotify { // Create the initial sysmsg thread. atomic.AddUint32(&sp.contextQueue.numThreadsToWakeup, 1) if err := sp.createSysmsgThread(); err != nil { return nil, err } sp.numSysmsgThreads++ } return sp, nil } // mapSharedRegions maps the shared regions that are used between the subprocess // and ALL of the subsequently created sysmsg threads into both the sentry and // the syscall thread. // // Should be called before any sysmsg threads are created. // Initializes s.contextQueue and s.threadContextRegion. func (s *subprocess) mapSharedRegions() { if s.contextQueue != nil || s.threadContextRegion != 0 { panic("contextQueue or threadContextRegion was already initialized") } opts := pgalloc.AllocOpts{ Kind: usage.System, Dir: pgalloc.TopDown, } // Map shared regions into the sentry. contextQueueFR, contextQueue := mmapContextQueueForSentry(s.memoryFile, opts) contextQueue.init() // Map thread context region into the syscall thread. _, err := s.syscallThread.syscall( unix.SYS_MMAP, arch.SyscallArgument{Value: uintptr(stubContextQueueRegion)}, arch.SyscallArgument{Value: uintptr(contextQueueFR.Length())}, arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)}, arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)}, arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())}, arch.SyscallArgument{Value: uintptr(contextQueueFR.Start)}) if err != nil { panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err)) } s.contextQueue = contextQueue // Map thread context region into the sentry. threadContextFR, err := s.memoryFile.Allocate(uint64(stubContextRegionLen), opts) if err != nil { panic(fmt.Sprintf("failed to allocate a new subprocess context memory region")) } sentryThreadContextRegionAddr, _, errno := unix.RawSyscall6( unix.SYS_MMAP, 0, uintptr(threadContextFR.Length()), unix.PROT_WRITE|unix.PROT_READ, unix.MAP_SHARED|unix.MAP_FILE, uintptr(s.memoryFile.FD()), uintptr(threadContextFR.Start)) if errno != 0 { panic(fmt.Sprintf("mmap failed for subprocess context memory region: %v", errno)) } // Map thread context region into the syscall thread. 
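// The same memoryFile range backs the sentry mapping made just above, so the
// sysmsg.ThreadContext structures are shared between the Sentry and the stub.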
if _, err := s.syscallThread.syscall( unix.SYS_MMAP, arch.SyscallArgument{Value: uintptr(stubContextRegion)}, arch.SyscallArgument{Value: uintptr(threadContextFR.Length())}, arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)}, arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)}, arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())}, arch.SyscallArgument{Value: uintptr(threadContextFR.Start)}); err != nil { panic(fmt.Sprintf("failed to mmap thread context region into syscall thread: %v", err)) } s.threadContextRegion = sentryThreadContextRegionAddr } func (s *subprocess) mapPrivateRegions() { _, err := s.syscallThread.syscall( unix.SYS_MMAP, arch.SyscallArgument{Value: uintptr(stubSpinningThreadQueueAddr)}, arch.SyscallArgument{Value: uintptr(sysmsg.SpinningQueueMemSize)}, arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)}, arch.SyscallArgument{Value: uintptr(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_FIXED)}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}) if err != nil { panic(fmt.Sprintf("failed to mmap spinning queue region into syscall thread: %v", err)) } } // unmap unmaps non-stub regions of the process. // // This will panic on failure (which should never happen). func (s *subprocess) unmap() { s.Unmap(0, uint64(stubStart)) if maximumUserAddress != stubEnd { s.Unmap(hostarch.Addr(stubEnd), uint64(maximumUserAddress-stubEnd)) } } // Release kills the subprocess. // // Just kidding! We can't safely coordinate the detaching of all the // tracees (since the tracers are random runtime threads, and the process // won't exit until tracers have been notified). // // Therefore we simply unmap everything in the subprocess and return it to the // globalPool. This has the added benefit of reducing creation time for new // subprocesses. func (s *subprocess) Release() { if !s.alive() { return } s.unmap() s.DecRef(s.release) } // release returns the subprocess to the global pool. func (s *subprocess) release() { if s.alive() { globalPool.markAvailable(s) return } if s.syscallThread != nil && s.syscallThread.seccompNotify != nil { s.syscallThread.seccompNotify.Close() } } // attach attaches to the thread. func (t *thread) attach() error { if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_ATTACH, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { return fmt.Errorf("unable to attach: %v", errno) } // PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already // stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of // newSubprocess), so we always expect to see signal-delivery-stop with // SIGSTOP. if sig := t.wait(stopped); sig != unix.SIGSTOP { return fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig) } // Initialize options. t.init() return nil } func (t *thread) grabInitRegs() { // Grab registers. // // Note that we adjust the current register RIP value to be just before // the current system call executed. This depends on the definition of // the stub itself. if err := t.getRegs(&t.initRegs); err != nil { panic(fmt.Sprintf("ptrace get regs failed: %v", err)) } t.adjustInitRegsRip() t.initRegs.SetStackPointer(0) } // detach detaches from the thread. // // Because the SIGSTOP is not suppressed, the thread will enter group-stop.
func (t *thread) detach() { if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(unix.SIGSTOP), 0, 0); errno != 0 { panic(fmt.Sprintf("can't detach new clone: %v", errno)) } } // waitOutcome is used for wait below. type waitOutcome int const ( // stopped indicates that the process was stopped. stopped waitOutcome = iota // killed indicates that the process was killed. killed ) func (t *thread) loadLogPrefix() *string { p := t.logPrefix.Load() if p == nil { prefix := fmt.Sprintf("[% 4d:% 4d] ", t.tgid, t.tid) t.logPrefix.Store(&prefix) p = &prefix } return p } // Debugf logs with the debugging severity. func (t *thread) Debugf(format string, v ...any) { if log.IsLogging(log.Debug) { log.DebugfAtDepth(1, *t.loadLogPrefix()+format, v...) } } // Warningf logs with the warning severity. func (t *thread) Warningf(format string, v ...any) { if log.IsLogging(log.Warning) { log.WarningfAtDepth(1, *t.loadLogPrefix()+format, v...) } } func (t *thread) dumpAndPanic(message string) { var regs arch.Registers message += "\n" if err := t.getRegs(®s); err == nil { message += dumpRegs(®s) } else { log.Warningf("unable to get registers: %v", err) } message += fmt.Sprintf("stubStart\t = %016x\n", stubStart) panic(message) } func (t *thread) dumpRegs(message string) { var regs arch.Registers message += "\n" if err := t.getRegs(®s); err == nil { message += dumpRegs(®s) } else { log.Warningf("unable to get registers: %v", err) } log.Infof("%s", message) } func (t *thread) unexpectedStubExit() { msg, err := t.getEventMessage() status := unix.WaitStatus(msg) if status.Signaled() && status.Signal() == unix.SIGKILL { // SIGKILL can be only sent by a user or OOM-killer. In both // these cases, we don't need to panic. There is no reasons to // think that something wrong in gVisor. log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid) pid := os.Getpid() unix.Tgkill(pid, pid, unix.Signal(unix.SIGKILL)) } t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err)) } // wait waits for a stop event. // // Precondition: outcome is a valid waitOutcome. func (t *thread) wait(outcome waitOutcome) unix.Signal { var status unix.WaitStatus for { r, err := unix.Wait4(int(t.tid), &status, unix.WALL|unix.WUNTRACED, nil) if err == unix.EINTR || err == unix.EAGAIN { // Wait was interrupted; wait again. continue } else if err != nil { panic(fmt.Sprintf("ptrace wait failed: %v", err)) } if int(r) != int(t.tid) { panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid)) } switch outcome { case stopped: if !status.Stopped() { t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status)) } stopSig := status.StopSignal() if stopSig == 0 { continue // Spurious stop. } if stopSig == unix.SIGTRAP { if status.TrapCause() == unix.PTRACE_EVENT_EXIT { t.unexpectedStubExit() } // Re-encode the trap cause the way it's expected. return stopSig | unix.Signal(status.TrapCause()<<8) } // Not a trap signal. return stopSig case killed: if !status.Exited() && !status.Signaled() { t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status)) } return unix.Signal(status.ExitStatus()) default: // Should not happen. t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome)) } } } // kill kills the thread; func (t *thread) kill() { unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(unix.SIGKILL)) } // destroy kills and waits on the thread. 
// // Note that this should not be used in the general case; the death of threads // will typically cause the death of the parent. This is a utility method for // manually created threads. func (t *thread) destroy() { t.detach() unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(unix.SIGKILL)) t.wait(killed) } // init initializes trace options. func (t *thread) init() { // Set the TRACESYSGOOD option to differentiate real SIGTRAP. // set PTRACE_O_EXITKILL to ensure that the unexpected exit of the // sentry will immediately kill the associated stubs. _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_SETOPTIONS, uintptr(t.tid), 0, unix.PTRACE_O_TRACESYSGOOD|unix.PTRACE_O_TRACEEXIT|unix.PTRACE_O_EXITKILL, 0, 0) if errno != 0 { panic(fmt.Sprintf("ptrace set options failed: %v", errno)) } } // syscall executes a system call cycle in the traced context. // // This is _not_ for use by application system calls, rather it is for use when // a system call must be injected into the remote context (e.g. mmap, munmap). // Note that clones are handled separately. func (t *thread) syscall(regs *arch.Registers) (uintptr, error) { // Set registers. if err := t.setRegs(regs); err != nil { panic(fmt.Sprintf("ptrace set regs failed: %v", err)) } for { // Execute the syscall instruction. The task has to stop on the // trap instruction which is right after the syscall // instruction. if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) } sig := t.wait(stopped) if sig == unix.SIGTRAP { // Reached syscall-enter-stop. break } else { // Some other signal caused a thread stop; ignore. if sig != unix.SIGSTOP && sig != unix.SIGCHLD { log.Warningf("The thread %d:%d has been interrupted by %d", t.tgid, t.tid, sig) } continue } } // Grab registers. if err := t.getRegs(regs); err != nil { panic(fmt.Sprintf("ptrace get regs failed: %v", err)) } return syscallReturnValue(regs) } // syscallIgnoreInterrupt ignores interrupts on the system call thread and // restarts the syscall if the kernel indicates that should happen. func (t *thread) syscallIgnoreInterrupt( initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) { for { regs := createSyscallRegs(initRegs, sysno, args...) rval, err := t.syscall(®s) switch err { case ERESTARTSYS: continue case ERESTARTNOINTR: continue case ERESTARTNOHAND: continue default: return rval, err } } } // NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. func (t *thread) NotifyInterrupt() { unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(platform.SignalInterrupt)) } func (s *subprocess) incAwakeContexts() { nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, 1) if nr > uint32(maxSysmsgThreads) { return } fastpath.nrMaxAwakeStubThreads.Add(1) } func (s *subprocess) decAwakeContexts() { nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, ^uint32(0)) if nr >= uint32(maxSysmsgThreads) { return } fastpath.nrMaxAwakeStubThreads.Add(^uint32(0)) } // switchToApp is called from the main SwitchToApp entrypoint. // // This function returns true on a system call, false on a signal. // The second return value is true if a syscall instruction can be replaced on // a function call. func (s *subprocess) switchToApp(c *platformContext, ac *arch.Context64) (isSyscall bool, shouldPatchSyscall bool, err *platform.ContextError) { // Reset necessary registers. 
regs := &ac.StateData().Regs s.resetSysemuRegs(regs) ctx := c.sharedContext ctx.shared.Regs = regs.PtraceRegs restoreArchSpecificState(ctx.shared, ac) // Check for interrupts, and ensure that future interrupts signal the context. if !c.interrupt.Enable(c.sharedContext) { // Pending interrupt; simulate. ctx.clearInterrupt() c.signalInfo = linux.SignalInfo{Signo: int32(platform.SignalInterrupt)} return false, false, nil } defer func() { ctx.clearInterrupt() c.interrupt.Disable() }() restoreFPState(ctx, c, ac) // Place the context onto the context queue. if ctx.sleeping { ctx.sleeping = false s.incAwakeContexts() } ctx.setState(sysmsg.ContextStateNone) if err := s.contextQueue.add(ctx); err != nil { return false, false, err } if err := s.waitOnState(ctx); err != nil { return false, false, corruptedSharedMemoryErr(err.Error()) } // Check if there's been an error. threadID := ctx.threadID() if threadID != invalidThreadID { if sysThread, ok := s.sysmsgThreads[threadID]; ok && sysThread.msg.Err != 0 { return false, false, sysThread.msg.ConvertSysmsgErr() } return false, false, corruptedSharedMemoryErr(fmt.Sprintf("found unexpected ThreadContext.ThreadID field, expected %d found %d", invalidThreadID, threadID)) } // Copy register state locally. regs.PtraceRegs = ctx.shared.Regs retrieveArchSpecificState(ctx.shared, ac) c.needToPullFullState = true // We have a signal. We verify however, that the signal was // either delivered from the kernel or from this process. We // don't respect other signals. c.signalInfo = ctx.shared.SignalInfo ctxState := ctx.state() if ctxState == sysmsg.ContextStateSyscallCanBePatched { ctxState = sysmsg.ContextStateSyscall shouldPatchSyscall = true } if ctxState == sysmsg.ContextStateSyscall || ctxState == sysmsg.ContextStateSyscallTrap { if maybePatchSignalInfo(regs, &c.signalInfo) { return false, false, nil } updateSyscallRegs(regs) return true, shouldPatchSyscall, nil } else if ctxState != sysmsg.ContextStateFault { return false, false, corruptedSharedMemoryErr(fmt.Sprintf("unknown context state: %v", ctxState)) } return false, false, nil } func (s *subprocess) waitOnState(ctx *sharedContext) error { ctx.kicked = false slowPath := false if !s.contextQueue.fastPathEnabled() || atomic.LoadUint32(&s.contextQueue.numActiveThreads) == 0 { ctx.kicked = s.kickSysmsgThread() } for curState := ctx.state(); curState == sysmsg.ContextStateNone; curState = ctx.state() { if !slowPath { events := dispatcher.waitFor(ctx) if events&sharedContextKicked != 0 { if ctx.kicked { continue } if ctx.isAcked() { ctx.kicked = true continue } s.kickSysmsgThread() ctx.kicked = true continue } if events&sharedContextSlowPath != 0 { ctx.disableSentryFastPath() slowPath = true continue } } else { // If the context already received a handshake then it knows it's being // worked on. if !ctx.kicked && !ctx.isAcked() { ctx.kicked = s.kickSysmsgThread() } if err := ctx.sleepOnState(curState); err != nil { return err } } } ctx.recordLatency() ctx.resetLatencyMeasures() ctx.enableSentryFastPath() return nil } // canKickSysmsgThread returns true if a new thread can be kicked. // The second return value is the expected number of threads after kicking a // new one. func (s *subprocess) canKickSysmsgThread() (bool, uint32) { // numActiveContexts and numActiveThreads can be changed from stub // threads that handles the contextQueue without any locks. The idea // here is that any stub thread that gets CPU time can make some // progress. 
In stub threads, we can use only spinlock-like // synchronizations, but they don't work well because a thread that // holds a lock can be preempted by another thread that is waiting for // the same lock. nrActiveThreads := atomic.LoadUint32(&s.contextQueue.numActiveThreads) nrThreadsToWakeup := atomic.LoadUint32(&s.contextQueue.numThreadsToWakeup) nrActiveContexts := atomic.LoadUint32(&s.contextQueue.numActiveContexts) nrActiveThreads += nrThreadsToWakeup + 1 if nrActiveThreads > nrActiveContexts { // This can happen when one or more stub threads are // waiting for cpu time. The host probably has more // running tasks than a number of cpu-s. return false, nrActiveThreads } return true, nrActiveThreads } // kickSysmsgThread returns true if it was able to wake up or create a new sysmsg // stub thread. func (s *subprocess) kickSysmsgThread() bool { kick, _ := s.canKickSysmsgThread() if !kick { return false } s.sysmsgThreadsMu.Lock() kick, nrThreads := s.canKickSysmsgThread() if !kick { s.sysmsgThreadsMu.Unlock() return false } numTimesStubKicked.Increment() atomic.AddUint32(&s.contextQueue.numThreadsToWakeup, 1) if s.numSysmsgThreads < maxSysmsgThreads && s.numSysmsgThreads < int(nrThreads) { s.numSysmsgThreads++ s.sysmsgThreadsMu.Unlock() if err := s.createSysmsgThread(); err != nil { log.Warningf("Unable to create a new stub thread: %s", err) s.sysmsgThreadsMu.Lock() s.numSysmsgThreads-- s.sysmsgThreadsMu.Unlock() } } else { s.sysmsgThreadsMu.Unlock() } s.contextQueue.wakeupSysmsgThread() return true } // syscall executes the given system call without handling interruptions. func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) { s.syscallThreadMu.Lock() defer s.syscallThreadMu.Unlock() return s.syscallThread.syscall(sysno, args...) } // MapFile implements platform.AddressSpace.MapFile. func (s *subprocess) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error { var flags int if precommit { flags |= unix.MAP_POPULATE } _, err := s.syscall( unix.SYS_MMAP, arch.SyscallArgument{Value: uintptr(addr)}, arch.SyscallArgument{Value: uintptr(fr.Length())}, arch.SyscallArgument{Value: uintptr(at.Prot())}, arch.SyscallArgument{Value: uintptr(flags | unix.MAP_SHARED | unix.MAP_FIXED)}, arch.SyscallArgument{Value: uintptr(f.FD())}, arch.SyscallArgument{Value: uintptr(fr.Start)}) return err } // Unmap implements platform.AddressSpace.Unmap. func (s *subprocess) Unmap(addr hostarch.Addr, length uint64) { ar, ok := addr.ToRange(length) if !ok { panic(fmt.Sprintf("addr %#x + length %#x overflows", addr, length)) } s.mu.Lock() for c := range s.faultedContexts { c.mu.Lock() if c.lastFaultSP == s && ar.Contains(c.lastFaultAddr) { // Forget the last fault so that if c faults again, the fault isn't // incorrectly reported as a write fault. If this is being called // due to munmap() of the corresponding vma, handling of the second // fault will fail anyway. c.lastFaultSP = nil delete(s.faultedContexts, c) } c.mu.Unlock() } s.mu.Unlock() _, err := s.syscall( unix.SYS_MUNMAP, arch.SyscallArgument{Value: uintptr(addr)}, arch.SyscallArgument{Value: uintptr(length)}) if err != nil && err != errDeadSubprocess { // We never expect this to happen. 
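// munmap on a live subprocess should always succeed here; errDeadSubprocess
// is tolerated above because a dead subprocess takes its mappings with it.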
panic(fmt.Sprintf("munmap(%x, %x) failed: %v", addr, length, err)) } } func (s *subprocess) PullFullState(c *platformContext, ac *arch.Context64) error { if !c.sharedContext.isActiveInSubprocess(s) { panic("Attempted to PullFullState for context that is not used in subprocess") } saveFPState(c.sharedContext, ac) return nil } var ( sysmsgThreadPriorityOnce sync.Once sysmsgThreadPriority int ) // initSysmsgThreadPriority looks at the current priority of the process // and updates `sysmsgThreadPriority` accordingly. func initSysmsgThreadPriority() { sysmsgThreadPriorityOnce.Do(func() { prio, err := unix.Getpriority(unix.PRIO_PROCESS, 0) if err != nil { panic("unable to get current scheduling priority") } // Sysmsg threads are executed with a priority one lower than the Sentry. sysmsgThreadPriority = 20 - prio + 1 }) } // createSysmsgThread creates a new sysmsg thread. // The thread starts processing any available context in the context queue. func (s *subprocess) createSysmsgThread() error { // Create a new seccomp process. var r requestThread r.thread = make(chan *thread) s.requests <- r p := <-r.thread if p == nil { return fmt.Errorf("createSysmsgThread: failed to get clone") } runtime.LockOSThread() defer runtime.UnlockOSThread() if err := p.attach(); err != nil { return err } // Skip SIGSTOP. if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(p.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace cont failed: %v", errno)) } sig := p.wait(stopped) if sig != unix.SIGSTOP { panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig)) } // Allocate a new stack for the BPF process. opts := pgalloc.AllocOpts{ Kind: usage.System, Dir: pgalloc.TopDown, } fr, err := s.memoryFile.Allocate(uint64(sysmsg.PerThreadSharedStackSize), opts) if err != nil { // TODO(b/144063246): Need to fail the clone system call. panic(fmt.Sprintf("failed to allocate a new stack: %v", err)) } sysThread := &sysmsgThread{ thread: p, subproc: s, stackRange: fr, } // Use the sysmsgStackID as a handle on this thread instead of host tid in // order to be able to reliably specify invalidThreadID. threadID := uint32(p.sysmsgStackID) // Map the stack into the sentry. sentryStackAddr, _, errno := unix.RawSyscall6( unix.SYS_MMAP, 0, sysmsg.PerThreadSharedStackSize, unix.PROT_WRITE|unix.PROT_READ, unix.MAP_SHARED|unix.MAP_FILE, uintptr(s.memoryFile.FD()), uintptr(fr.Start)) if errno != 0 { panic(fmt.Sprintf("mmap failed: %v", errno)) } // Before installing the stub syscall filters, we need to call a few // system calls (e.g. sigaltstack, sigaction) which have in-memory // arguments. We need to prevent other stub threads from changing these // parameters, so let's map the future BPF stack as read-only and // fill syscall arguments from the Sentry. sysmsgStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadSharedStackOffset err = sysThread.mapStack(sysmsgStackAddr, true) if err != nil { panic(fmt.Sprintf("mmap failed: %v", err)) } sysThread.init(sentryStackAddr, sysmsgStackAddr) // Map the stack into the BPF process. err = sysThread.mapStack(sysmsgStackAddr, false) if err != nil { s.memoryFile.DecRef(fr) panic(fmt.Sprintf("mmap failed: %v", err)) } // Map the private stack into the BPF process.
privateStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadPrivateStackOffset err = sysThread.mapPrivateStack(privateStackAddr, sysmsg.PerThreadPrivateStackSize) if err != nil { s.memoryFile.DecRef(fr) panic(fmt.Sprintf("mmap failed: %v", err)) } sysThread.setMsg(sysmsg.StackAddrToMsg(sentryStackAddr)) sysThread.msg.Init(threadID) sysThread.msg.Self = uint64(sysmsgStackAddr + sysmsg.MsgOffsetFromSharedStack) sysThread.msg.SyshandlerStack = uint64(sysmsg.StackAddrToSyshandlerStack(sysThread.sysmsgPerThreadMemAddr())) sysThread.msg.Syshandler = uint64(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_syshandler)) sysThread.msg.State.Set(sysmsg.ThreadStateInitializing) if err := unix.Setpriority(unix.PRIO_PROCESS, int(p.tid), sysmsgThreadPriority); err != nil { log.Warningf("Unable to change priority of a stub thread: %s", err) } // Install a pre-compiled seccomp rules for the BPF process. _, err = p.syscallIgnoreInterrupt(&p.initRegs, unix.SYS_PRCTL, arch.SyscallArgument{Value: uintptr(linux.PR_SET_NO_NEW_PRIVS)}, arch.SyscallArgument{Value: uintptr(1)}, arch.SyscallArgument{Value: uintptr(0)}, arch.SyscallArgument{Value: uintptr(0)}, arch.SyscallArgument{Value: uintptr(0)}, arch.SyscallArgument{Value: uintptr(0)}) if err != nil { panic(fmt.Sprintf("prctl(PR_SET_NO_NEW_PRIVS) failed: %v", err)) } _, err = p.syscallIgnoreInterrupt(&p.initRegs, seccomp.SYS_SECCOMP, arch.SyscallArgument{Value: uintptr(linux.SECCOMP_SET_MODE_FILTER)}, arch.SyscallArgument{Value: uintptr(0)}, arch.SyscallArgument{Value: stubSysmsgRules}) if err != nil { panic(fmt.Sprintf("seccomp failed: %v", err)) } // Prepare to start the BPF process. tregs := &arch.Registers{} s.resetSysemuRegs(tregs) setArchSpecificRegs(sysThread, tregs) if err := p.setRegs(tregs); err != nil { panic(fmt.Sprintf("ptrace set regs failed: %v", err)) } archSpecificSysmsgThreadInit(sysThread) // Skip SIGSTOP. if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(p.tgid), uintptr(p.tid), uintptr(unix.SIGCONT)); e != 0 { panic(fmt.Sprintf("tkill failed: %v", e)) } // Resume the BPF process. if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(p.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("can't detach new clone: %v", errno)) } s.sysmsgThreadsMu.Lock() s.sysmsgThreads[threadID] = sysThread s.sysmsgThreadsMu.Unlock() return nil } // PreFork implements platform.AddressSpace.PreFork. // We need to take the usertrap lock to be sure that fork() will not be in the // middle of applying a binary patch. func (s *subprocess) PreFork() { s.usertrap.PreFork() } // PostFork implements platform.AddressSpace.PostFork. func (s *subprocess) PostFork() { s.usertrap.PostFork() // +checklocksforce: PreFork acquires, above. } // activateContext activates the context in this subprocess. // No-op if the context is already active within the subprocess; if not, // deactivates it from its last subprocess. func (s *subprocess) activateContext(c *platformContext) error { if !c.sharedContext.isActiveInSubprocess(s) { c.sharedContext.release() c.sharedContext = nil shared, err := s.getSharedContext() if err != nil { return err } c.sharedContext = shared } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/subprocess_amd64.go000066400000000000000000000165121465435605700300200ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package systrap import ( "fmt" "strings" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" ) const ( // initRegsRipAdjustment is the size of the syscall instruction. initRegsRipAdjustment = 2 ) // resetSysemuRegs sets up emulation registers. // // This should be called prior to calling sysemu. func (s *subprocess) resetSysemuRegs(regs *arch.Registers) { regs.Cs = s.sysmsgInitRegs.Cs regs.Ss = s.sysmsgInitRegs.Ss regs.Ds = s.sysmsgInitRegs.Ds regs.Es = s.sysmsgInitRegs.Es regs.Fs = s.sysmsgInitRegs.Fs regs.Gs = s.sysmsgInitRegs.Gs } // createSyscallRegs sets up syscall registers. // // This should be called to generate registers for a system call. func createSyscallRegs(initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) arch.Registers { // Copy initial registers. regs := *initRegs // Set our syscall number. regs.Rax = uint64(sysno) if len(args) >= 1 { regs.Rdi = args[0].Uint64() } if len(args) >= 2 { regs.Rsi = args[1].Uint64() } if len(args) >= 3 { regs.Rdx = args[2].Uint64() } if len(args) >= 4 { regs.R10 = args[3].Uint64() } if len(args) >= 5 { regs.R8 = args[4].Uint64() } if len(args) >= 6 { regs.R9 = args[5].Uint64() } return regs } // updateSyscallRegs updates registers after finishing sysemu. func updateSyscallRegs(regs *arch.Registers) { // Ptrace puts -ENOSYS in rax on syscall-enter-stop. regs.Rax = regs.Orig_rax } // syscallReturnValue extracts a sensible return from registers. 
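// For example, a raw Rax value of -2 (0xfffffffffffffffe) is converted to the error ENOENT, while a non-negative value such as 3 is returned unchanged with a nil error (e.g. a newly allocated file descriptor).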
func syscallReturnValue(regs *arch.Registers) (uintptr, error) { rval := int64(regs.Rax) if rval < 0 { return 0, unix.Errno(-rval) } return uintptr(rval), nil } func dumpRegs(regs *arch.Registers) string { var m strings.Builder fmt.Fprintf(&m, "Registers:\n") fmt.Fprintf(&m, "\tR15\t = %016x\n", regs.R15) fmt.Fprintf(&m, "\tR14\t = %016x\n", regs.R14) fmt.Fprintf(&m, "\tR13\t = %016x\n", regs.R13) fmt.Fprintf(&m, "\tR12\t = %016x\n", regs.R12) fmt.Fprintf(&m, "\tRbp\t = %016x\n", regs.Rbp) fmt.Fprintf(&m, "\tRbx\t = %016x\n", regs.Rbx) fmt.Fprintf(&m, "\tR11\t = %016x\n", regs.R11) fmt.Fprintf(&m, "\tR10\t = %016x\n", regs.R10) fmt.Fprintf(&m, "\tR9\t = %016x\n", regs.R9) fmt.Fprintf(&m, "\tR8\t = %016x\n", regs.R8) fmt.Fprintf(&m, "\tRax\t = %016x\n", regs.Rax) fmt.Fprintf(&m, "\tRcx\t = %016x\n", regs.Rcx) fmt.Fprintf(&m, "\tRdx\t = %016x\n", regs.Rdx) fmt.Fprintf(&m, "\tRsi\t = %016x\n", regs.Rsi) fmt.Fprintf(&m, "\tRdi\t = %016x\n", regs.Rdi) fmt.Fprintf(&m, "\tOrig_rax = %016x\n", regs.Orig_rax) fmt.Fprintf(&m, "\tRip\t = %016x\n", regs.Rip) fmt.Fprintf(&m, "\tCs\t = %016x\n", regs.Cs) fmt.Fprintf(&m, "\tEflags\t = %016x\n", regs.Eflags) fmt.Fprintf(&m, "\tRsp\t = %016x\n", regs.Rsp) fmt.Fprintf(&m, "\tSs\t = %016x\n", regs.Ss) fmt.Fprintf(&m, "\tFs_base\t = %016x\n", regs.Fs_base) fmt.Fprintf(&m, "\tGs_base\t = %016x\n", regs.Gs_base) fmt.Fprintf(&m, "\tDs\t = %016x\n", regs.Ds) fmt.Fprintf(&m, "\tEs\t = %016x\n", regs.Es) fmt.Fprintf(&m, "\tFs\t = %016x\n", regs.Fs) fmt.Fprintf(&m, "\tGs\t = %016x\n", regs.Gs) return m.String() } // adjustInitregsRip adjust the current register RIP value to // be just before the system call instruction execution func (t *thread) adjustInitRegsRip() { t.initRegs.Rip -= initRegsRipAdjustment } // Pass the expected PPID to the child via R15 when creating stub process. func initChildProcessPPID(initregs *arch.Registers, ppid int32) { // Rbx has to be set to 1 when creating stub process. initregs.Rbx = _NEW_STUB } // patchSignalInfo patches the signal info to account for hitting the seccomp // filters from vsyscall emulation, specified below. We allow for SIGSYS as a // synchronous trap, but patch the structure to appear like a SIGSEGV with the // Rip as the faulting address. // // Note that this should only be called after verifying that the signalInfo has // been generated by the kernel. // Returns true if the signal info was patched, false otherwise. func maybePatchSignalInfo(regs *arch.Registers, signalInfo *linux.SignalInfo) bool { if signalInfo.Addr() < linux.VSyscallStartAddr || signalInfo.Addr() >= linux.VSyscallEndAddr { return false } // The syscall event was triggered from vsyscall emulation. signalInfo.Signo = int32(linux.SIGSEGV) // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered // with the si_call_addr field pointing to the current RIP. This field // aligns with the si_addr field for a SIGSEGV, so we don't need to touch // anything there. We do need to unwind emulation however, so we set the // instruction pointer to the faulting value, and "unpop" the stack. regs.Rip = signalInfo.Addr() regs.Rsp -= 8 return true } // enableCpuidFault enables cpuid-faulting. // // This may fail on older kernels or hardware, so we just disregard the result. // Host CPUID will be enabled. // // This is safe to call in an afterFork context. 
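// With CPUID faulting enabled, a CPUID instruction executed in the stub traps with SIGSEGV instead of reading the host CPU directly, which lets the Sentry present its own view of the CPU features to the sandboxed application.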
// //go:nosplit //go:norace func enableCpuidFault() { unix.RawSyscall6(unix.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0) } // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program. // Ref attachedThread() for more detail. func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet { return append(rules, []seccomp.RuleSet{ // Rules for trapping vsyscall access. { Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_GETTIMEOFDAY: seccomp.MatchAll{}, unix.SYS_TIME: seccomp.MatchAll{}, unix.SYS_GETCPU: seccomp.MatchAll{}, // SYS_GETCPU was not defined in package syscall on amd64. }), Action: linux.SECCOMP_RET_TRAP, Vsyscall: true, }, { Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_ARCH_PRCTL: seccomp.Or{ seccomp.PerArg{seccomp.EqualTo(linux.ARCH_SET_CPUID), seccomp.EqualTo(0)}, seccomp.PerArg{seccomp.EqualTo(linux.ARCH_SET_FS)}, seccomp.PerArg{seccomp.EqualTo(linux.ARCH_GET_FS)}, }, }), Action: linux.SECCOMP_RET_ALLOW, }, }...) } func restoreArchSpecificState(ctx *sysmsg.ThreadContext, ac *arch.Context64) { } func setArchSpecificRegs(sysThread *sysmsgThread, regs *arch.Registers) { // Set the start function and initial stack. regs.PtraceRegs.Rip = uint64(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_start)) regs.PtraceRegs.Rsp = uint64(sysmsg.StackAddrToSyshandlerStack(sysThread.sysmsgPerThreadMemAddr())) // Set gs_base; this is the only time we set it and we don't expect it to ever // change for any thread. regs.Gs_base = sysThread.msg.Self } func retrieveArchSpecificState(ctx *sysmsg.ThreadContext, ac *arch.Context64) { } func archSpecificSysmsgThreadInit(sysThread *sysmsgThread) { } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/subprocess_arm64.go000066400000000000000000000132551465435605700300370ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package systrap import ( "fmt" "strings" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" ) const ( // initRegsRipAdjustment is the size of the svc instruction. initRegsRipAdjustment = 4 ) // resetSysemuRegs sets up emulation registers. // // This should be called prior to calling sysemu. func (s *subprocess) resetSysemuRegs(regs *arch.Registers) { } // createSyscallRegs sets up syscall registers. // // This should be called to generate registers for a system call. func createSyscallRegs(initRegs *arch.Registers, sysno uintptr, args ...arch.SyscallArgument) arch.Registers { // Copy initial registers (Pc, Sp, etc.). regs := *initRegs // Set our syscall number. // r8 for the syscall number. // r0-r6 is used to store the parameters. 
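// Note: only x0-x5 actually receive arguments below, since the arm64 Linux syscall ABI passes at most six arguments; the syscall number goes in x8.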
regs.Regs[8] = uint64(sysno) if len(args) >= 1 { regs.Regs[0] = args[0].Uint64() } if len(args) >= 2 { regs.Regs[1] = args[1].Uint64() } if len(args) >= 3 { regs.Regs[2] = args[2].Uint64() } if len(args) >= 4 { regs.Regs[3] = args[3].Uint64() } if len(args) >= 5 { regs.Regs[4] = args[4].Uint64() } if len(args) >= 6 { regs.Regs[5] = args[5].Uint64() } return regs } // updateSyscallRegs updates registers after finishing sysemu. func updateSyscallRegs(regs *arch.Registers) { // No special work is necessary. return } // syscallReturnValue extracts a sensible return from registers. func syscallReturnValue(regs *arch.Registers) (uintptr, error) { rval := int64(regs.Regs[0]) if rval < 0 { return 0, unix.Errno(-rval) } return uintptr(rval), nil } func dumpRegs(regs *arch.Registers) string { var m strings.Builder fmt.Fprintf(&m, "Registers:\n") for i := 0; i < 31; i++ { fmt.Fprintf(&m, "\tRegs[%d]\t = %016x\n", i, regs.Regs[i]) } fmt.Fprintf(&m, "\tSp\t = %016x\n", regs.Sp) fmt.Fprintf(&m, "\tPc\t = %016x\n", regs.Pc) fmt.Fprintf(&m, "\tPstate\t = %016x\n", regs.Pstate) return m.String() } // adjustInitregsRip adjust the current register RIP value to // be just before the system call instruction execution. func (t *thread) adjustInitRegsRip() { t.initRegs.Pc -= initRegsRipAdjustment } // Pass the expected PPID to the child via X7 when creating stub process func initChildProcessPPID(initregs *arch.Registers, ppid int32) { // R9 has to be set to 1 when creating stub process. initregs.Regs[9] = _NEW_STUB } func maybePatchSignalInfo(regs *arch.Registers, signalInfo *linux.SignalInfo) (patched bool) { // vsyscall emulation is not supported on ARM64. No need to patch anything. return false } // Noop on arm64. // //go:nosplit func enableCpuidFault() { } // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program. // Ref attachedThread() for more detail. func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet { return rules } // probeSeccomp returns true if seccomp is run after ptrace notifications, // which is generally the case for kernel version >= 4.8. // // On arm64, the support of PTRACE_SYSEMU was added in the 5.3 kernel, so // probeSeccomp can always return true. func probeSeccomp() bool { return true } func (s *subprocess) arm64SyscallWorkaround(t *thread, regs *arch.Registers) { // On ARM64, when ptrace stops on a system call, it uses the x7 // register to indicate whether the stop has been signalled from // syscall entry or syscall exit. This means that we can't get a value // of this register and we can't change it. More details are in the // comment for tracehook_report_syscall in arch/arm64/kernel/ptrace.c. // // This happens only if we stop on a system call, so let's queue a // signal, resume a stub thread and catch it on a signal handling. t.NotifyInterrupt() for { if _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { panic(fmt.Sprintf("ptrace sysemu failed: %v", errno)) } // Wait for the syscall-enter stop. sig := t.wait(stopped) if sig == unix.SIGSTOP { // SIGSTOP was delivered to another thread in the same thread // group, which initiated another group stop. Just ignore it. 
continue } if sig == (syscallEvent | unix.SIGTRAP) { t.dumpAndPanic(fmt.Sprintf("unexpected syscall event")) } break } if err := t.getRegs(regs); err != nil { panic(fmt.Sprintf("ptrace get regs failed: %v", err)) } } func restoreArchSpecificState(ctx *sysmsg.ThreadContext, ac *arch.Context64) { ctx.TLS = uint64(ac.TLS()) } func setArchSpecificRegs(sysThread *sysmsgThread, regs *arch.Registers) { } func retrieveArchSpecificState(ctx *sysmsg.ThreadContext, ac *arch.Context64) { if !ac.SetTLS(uintptr(ctx.TLS)) { panic(fmt.Sprintf("ac.SetTLS(%+v) failed", ctx.TLS)) } } func archSpecificSysmsgThreadInit(sysThread *sysmsgThread) { // Send a fake event to stop the BPF process so that it enters the sighandler. if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(sysThread.thread.tgid), uintptr(sysThread.thread.tid), uintptr(unix.SIGSEGV)); e != 0 { panic(fmt.Sprintf("tkill failed: %v", e)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/subprocess_linux.go000066400000000000000000000233111465435605700302370ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package systrap import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" ) const syscallEvent unix.Signal = 0x80 // createStub creates a fresh stub processes. // // Precondition: the runtime OS thread must be locked. func createStub() (*thread, error) { // When creating the new child process, we specify SIGKILL as the // signal to deliver when the child exits. We never expect a subprocess // to exit; they are pooled and reused. This is done to ensure that if // a subprocess is OOM-killed, this process (and all other stubs, // transitively) will be killed as well. It's simply not possible to // safely handle a single stub getting killed: the exact state of // execution is unknown and not recoverable. return attachedThread(unix.CLONE_FILES|uintptr(unix.SIGCHLD), linux.SECCOMP_RET_TRAP) } // attachedThread returns a new attached thread. // // Precondition: the runtime OS thread must be locked. func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) { // Create a BPF program that allows only the system calls needed by the // stub and all its children. This is used to create child stubs // (below), so we must include the ability to fork, but otherwise lock // down available calls only to what is needed. rules := []seccomp.RuleSet{} if defaultAction != linux.SECCOMP_RET_ALLOW { ruleSet := seccomp.RuleSet{ Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_CLONE: seccomp.Or{ // Allow creation of new subprocesses (used by the master). seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.CLONE_PARENT | unix.SIGCHLD)}, seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.SIGCHLD)}, // Allow creation of new sysmsg thread. 
seccomp.PerArg{seccomp.EqualTo( unix.CLONE_FILES | unix.CLONE_FS | unix.CLONE_VM | unix.CLONE_PTRACE | linux.SIGKILL)}, // Allow creation of new threads within a single address space (used by address spaces). seccomp.PerArg{seccomp.EqualTo( unix.CLONE_FILES | unix.CLONE_FS | unix.CLONE_SIGHAND | unix.CLONE_THREAD | unix.CLONE_PTRACE | unix.CLONE_VM)}, }, // For the initial process creation. unix.SYS_WAIT4: seccomp.MatchAll{}, unix.SYS_EXIT: seccomp.MatchAll{}, // For the stub prctl dance (all). unix.SYS_PRCTL: seccomp.Or{ seccomp.PerArg{seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)}, seccomp.PerArg{seccomp.EqualTo(linux.PR_SET_NO_NEW_PRIVS), seccomp.EqualTo(1)}, }, unix.SYS_GETPPID: seccomp.MatchAll{}, // For the stub to stop itself (all). unix.SYS_GETPID: seccomp.MatchAll{}, unix.SYS_KILL: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.SIGSTOP), }, // Injected to support the address space operations. unix.SYS_MMAP: seccomp.MatchAll{}, unix.SYS_MUNMAP: seccomp.MatchAll{}, // For sysmsg threads. Look at sysmsg/sighandler.c for more details. unix.SYS_RT_SIGRETURN: seccomp.MatchAll{}, unix.SYS_SCHED_YIELD: seccomp.MatchAll{}, unix.SYS_FUTEX: seccomp.Or{ seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(linux.FUTEX_WAIT), seccomp.AnyValue{}, seccomp.AnyValue{}, }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(linux.FUTEX_WAKE), seccomp.AnyValue{}, seccomp.AnyValue{}, }, }, unix.SYS_SIGALTSTACK: seccomp.MatchAll{}, unix.SYS_TKILL: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.SIGSTOP), }, unix.SYS_GETTID: seccomp.MatchAll{}, unix.SYS_EXIT_GROUP: seccomp.MatchAll{}, seccomp.SYS_SECCOMP: seccomp.Or{ seccomp.PerArg{ seccomp.EqualTo(linux.SECCOMP_SET_MODE_FILTER), seccomp.EqualTo(0), seccomp.AnyValue{}, }, seccomp.PerArg{ seccomp.EqualTo(linux.SECCOMP_SET_MODE_FILTER), seccomp.EqualTo(linux.SECCOMP_FILTER_FLAG_NEW_LISTENER), seccomp.AnyValue{}, }, }, }), Action: linux.SECCOMP_RET_ALLOW, } rules = append(rules, ruleSet) rules = appendArchSeccompRules(rules) } instrs, _, err := seccomp.BuildProgram(rules, seccomp.ProgramOptions{ DefaultAction: defaultAction, BadArchAction: defaultAction, }) if err != nil { return nil, err } return forkStub(flags, instrs) } // In the child, this function must not acquire any locks, because they might // have been locked at the time of the fork. This means no rescheduling, no // malloc calls, and no new stack segments. For the same reason compiler does // not race instrument it. // //go:norace func forkStub(flags uintptr, instrs []bpf.Instruction) (*thread, error) { // Declare all variables up front in order to ensure that there's no // need for allocations between beforeFork & afterFork. var ( pid uintptr ppid uintptr errno unix.Errno ) // Remember the current ppid for the pdeathsig race. ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0) // Among other things, beforeFork masks all signals. beforeFork() // Do the clone. pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0) if errno != 0 { afterFork() return nil, errno } // Is this the parent? if pid != 0 { // Among other things, restore signal mask. afterFork() // Initialize the first thread. 
t := &thread{ tgid: int32(pid), tid: int32(pid), } if sig := t.wait(stopped); sig != unix.SIGSTOP { return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig) } if err := t.attach(); err != nil { return nil, err } t.grabInitRegs() _, err := t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_MUNMAP, arch.SyscallArgument{Value: stubROMapEnd}, arch.SyscallArgument{Value: maximumUserAddress - stubROMapEnd}) if err != nil { return nil, err } return t, nil } // Move the stub to a new session (and thus a new process group). This // prevents the stub from getting PTY job control signals intended only // for the sentry process. We must call this before restoring signal // mask. if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 { unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) } // afterForkInChild resets all signals to their default dispositions // and restores the signal mask to its pre-fork state. afterForkInChild() if errno := sysmsgSigactions(stubSysmsgStart); errno != 0 { unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) } // Explicitly unmask all signals to ensure that the tracer can see // them. if errno := unmaskAllSignals(); errno != 0 { unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) } // Set an aggressive BPF filter for the stub and all its children. See // the description of the BPF program built above. if errno := seccomp.SetFilterInChild(instrs); errno != 0 { unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) } // Enable cpuid-faulting. enableCpuidFault() // Call the stub; should not return. stubCall(stubInitProcess, ppid) panic("unreachable") } // createStub creates a stub process as a child of an existing subprocess. // // Precondition: the runtime OS thread must be locked. func (t *thread) createStub() (*thread, error) { // There's no need to lock the runtime thread here, as this can only be // called from a context that is already locked. // Pass the expected PPID to the child via R15. regs := t.initRegs initChildProcessPPID(&regs, t.tgid) // Call fork in a subprocess. // // The new child must set up PDEATHSIG to ensure it dies if this // process dies. Since this process could die at any time, this cannot // be done via instrumentation from here. // // Instead, we create the child untraced, which will do the PDEATHSIG // setup and then SIGSTOP itself for our attach below. // // See above re: SIGKILL. pid, err := t.syscallIgnoreInterrupt( &regs, unix.SYS_CLONE, arch.SyscallArgument{Value: uintptr(unix.CLONE_FILES | unix.CLONE_PARENT | uintptr(unix.SIGCHLD))}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}) if err != nil { return nil, fmt.Errorf("creating stub process: %v", err) } // Wait for child to enter group-stop, so we don't stop its // bootstrapping work with t.attach below. // // We unfortunately don't have a handy part of memory to write the wait // status. If the wait succeeds, we'll assume that it was the SIGSTOP. // If the child actually exited, the attach below will fail.
_, err = unix.Wait4(int(pid), nil, unix.WALL|unix.WUNTRACED, nil) if err != nil { return nil, fmt.Errorf("waiting on stub process: %v", err) } childT := &thread{ tgid: int32(pid), tid: int32(pid), } return childT, nil } func (s *subprocess) createStub() (*thread, error) { req := requestStub{} req.done = make(chan *thread, 1) s.requests <- req childT := <-req.done if childT == nil { return nil, fmt.Errorf("createStub: failed to get clone") } if err := childT.attach(); err != nil { return nil, err } childT.grabInitRegs() return childT, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/subprocess_linux_unsafe.go000066400000000000000000000033461465435605700316060ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || linux // +build amd64 linux package systrap import ( "sync" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) // maskPool contains reusable CPU masks for setting affinity. Unfortunately, // runtime.NumCPU doesn't actually record the number of CPUs on the system, it // just records the number of CPUs available in the scheduler affinity set at // startup. This may a) change over time and b) gives a number far lower than // the maximum indexable CPU. To prevent lots of allocation in the hot path, we // use a pool to store large masks that we can reuse during bind. var maskPool = sync.Pool{ New: func() any { const maxCPUs = 1024 // Not a hard limit; see below. return make([]uintptr, maxCPUs/64) }, } // unmaskAllSignals unmasks all signals on the current thread. // // It is called in a child process after fork(), so the race instrumentation // has to be disabled. // //go:nosplit //go:norace func unmaskAllSignals() unix.Errno { var set linux.SignalSet _, _, errno := unix.RawSyscall6(unix.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0) return errno } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/subprocess_pool.go000066400000000000000000000032471465435605700300570ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "sync" ) // subprocessPool exists to solve these distinct problems: // // 1) Subprocesses can't always be killed properly (see subprocess.Release). // In general it's helpful to be able to reuse subprocesses, but we must observe // the subprocess lifecycle before we can do so (e.g. 
should wait for all // contexts to be released). // // 2) Any seccomp filters that have been installed will apply to subprocesses // created here. Therefore we use the intermediary (source), which is created // on initialization of the platform. type subprocessPool struct { mu sync.Mutex source *subprocess // available stores all subprocesses that are available for reuse. // +checklocks:mu available []*subprocess } func (p *subprocessPool) markAvailable(s *subprocess) { p.mu.Lock() defer p.mu.Unlock() p.available = append(p.available, s) } func (p *subprocessPool) fetchAvailable() *subprocess { p.mu.Lock() defer p.mu.Unlock() if len(p.available) > 0 { s := p.available[len(p.available)-1] p.available = p.available[:len(p.available)-1] return s } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/subprocess_refs.go000066400000000000000000000102221465435605700300340ustar00rootroot00000000000000package systrap import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const subprocessenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var subprocessobj *subprocess // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type subprocessRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *subprocessRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *subprocessRefs) RefType() string { return fmt.Sprintf("%T", subprocessobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *subprocessRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *subprocessRefs) LogRefs() bool { return subprocessenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *subprocessRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. 
// //go:nosplit func (r *subprocessRefs) IncRef() { v := r.refCount.Add(1) if subprocessenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *subprocessRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if subprocessenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *subprocessRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if subprocessenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *subprocessRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/subprocess_unsafe.go000066400000000000000000000102051465435605700303570ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build go1.18 // +build go1.18 // //go:linkname directives type-checked by checklinkname. Any other // non-linkname assumptions outside the Go 1 compatibility guarantee should // have an accompanied vet check or version guard build tag. package systrap import ( "fmt" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" ) //go:linkname beforeFork syscall.runtime_BeforeFork func beforeFork() //go:linkname afterFork syscall.runtime_AfterFork func afterFork() //go:linkname afterForkInChild syscall.runtime_AfterForkInChild func afterForkInChild() // cputicks is implemented in assembly. func cputicks() int64 // spinloop is implemented in assembly. func spinloop() // getThreadContextFromID returns a ThreadContext struct that corresponds to the // given ID. // // Precondition: cid must be a valid thread context ID that has a mapping for it // that exists in s.contexts. 
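// The returned pointer refers directly into the shared thread-context region: slot cid begins at threadContextRegion + cid*AllocatedSizeofThreadContextStruct, so no copy of the ThreadContext is made.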
func (s *subprocess) getThreadContextFromID(cid uint64) *sysmsg.ThreadContext { tcSlot := s.threadContextRegion + uintptr(cid)*sysmsg.AllocatedSizeofThreadContextStruct return (*sysmsg.ThreadContext)(unsafe.Pointer(tcSlot)) } func mmapContextQueueForSentry(memoryFile *pgalloc.MemoryFile, opts pgalloc.AllocOpts) (memmap.FileRange, *contextQueue) { fr, err := memoryFile.Allocate(uint64(stubContextQueueRegionLen), opts) if err != nil { panic(fmt.Sprintf("failed to allocate a new subprocess context memory region")) } addr, _, errno := unix.RawSyscall6( unix.SYS_MMAP, 0, uintptr(fr.Length()), unix.PROT_WRITE|unix.PROT_READ, unix.MAP_SHARED|unix.MAP_FILE, uintptr(memoryFile.FD()), uintptr(fr.Start)) if errno != 0 { panic(fmt.Sprintf("mmap failed for subprocess context memory region: %v", errno)) } return fr, (*contextQueue)(unsafe.Pointer(addr)) } func saveFPState(ctx *sharedContext, ac *arch.Context64) { fpState := ac.FloatingPointData().BytePointer() dst := unsafeSlice(uintptr(unsafe.Pointer(fpState)), archState.FpLen()) src := ctx.shared.FPState[:] copy(dst, src) } // restoreFPStateDecoupledContext writes FPState from c to the thread context // shared memory region if there is any need to do so. func restoreFPState(ctx *sharedContext, c *platformContext, ac *arch.Context64) { if !c.needRestoreFPState { return } c.needRestoreFPState = false ctx.setFPStateChanged() fpState := ac.FloatingPointData().BytePointer() src := unsafeSlice(uintptr(unsafe.Pointer(fpState)), archState.FpLen()) dst := ctx.shared.FPState[:] copy(dst, src) } // alive returns true if the subprocess is alive. func (s *subprocess) alive() bool { if s.dead.Load() { return false } // Wait4 doesn't support WNOWAIT, but here is no other way to find out // whether a process exited or was stopped by ptrace. siginfo := linux.SignalInfo{} _, _, errno := unix.Syscall6( unix.SYS_WAITID, unix.P_PID, uintptr(s.syscallThread.thread.tid), uintptr(unsafe.Pointer(&siginfo)), uintptr(unix.WEXITED|unix.WNOHANG|unix.WNOWAIT), 0, 0) if errno == 0 && siginfo.PID() == 0 { return true } if errno == 0 && siginfo.Code != linux.CLD_EXITED && siginfo.Code != linux.CLD_KILLED { return true } // The process is dead, let's collect its zombie. wstatus := unix.WaitStatus(0) pid, err := unix.Wait4(int(s.syscallThread.thread.tid), &wstatus, unix.WNOHANG, nil) log.Warningf("the subprocess %d exited (status: %s, err %s)", pid, wstatus, err) s.dead.Store(true) return false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/syscall_thread.go000066400000000000000000000200051465435605700276260ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package systrap import ( "fmt" "os" "sync/atomic" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" "gvisor.dev/gvisor/pkg/sentry/usage" ) // The syscall message consists of sentry and stub messages. const syscallThreadMessageSize = hostarch.PageSize * 2 // syscallThread implements the process of calling syscalls in a stub process. // // Each syscall thread owns a shared memory region to communicate with the // Sentry. This region consists of two pages. The first page called // sentryMessage is mapped as read-only in the stub address space. The second // page called stubMessage is mapped as read-write in the stub process. // // Any memory regions that are mapped as read-write in a stub address space can // be changed from a user code. This means that we can't trust the content of // stubMessage, but it is used to receive a syscall return code. Therefore // syscallThread can be used only in these cases: // - If a system call never fails (e.g munmap). // - If a system call has to return only one know value or if it fails, // it doesn't not reveal any data (e.g. mmap). type syscallThread struct { // subproc is a link to the subprocess which is used to call native // system calls and track when a sysmsg thread has to be recreated. // Look at getSysmsgThread() for more details. subproc *subprocess // thread is a thread identifier. thread *thread // stackRange is the range for the sentry syscall message in the memory // file. stackRange memmap.FileRange // sentryAddr is the address of the shared memory region in the Sentry // address space. sentryAddr uintptr // stubAddr is the address of the shared memory region in the stub // address space. stubAddr uintptr // sentryMessage is the first page of the share message that can't be // modified by the stub thread. sentryMessage *syscallSentryMessage // stubMessage is the second page of the shared message that can be // modified by the stub thread. stubMessage *syscallStubMessage seccompNotify *os.File seccompNotifyResp linux.SeccompNotifResp } func (t *syscallThread) init(seccompNotify bool) error { // Allocate a new shared memory message. opts := pgalloc.AllocOpts{ Kind: usage.System, Dir: pgalloc.TopDown, } fr, err := t.subproc.memoryFile.Allocate(syscallThreadMessageSize, opts) if err != nil { return err } t.stackRange = fr t.stubAddr = stubSysmsgStack + sysmsg.PerThreadMemSize*uintptr(t.thread.sysmsgStackID) err = t.mapMessageIntoStub() if err != nil { t.destroy() return err } if seccompNotify && seccompNotifyIsSupported { if t.seccompNotify, err = t.installSeccompNotify(); err != nil { t.destroy() return fmt.Errorf("failed to install seccomp notify rules: %w", err) } } // Map the stack into the sentry. 
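// The stub already has this same memoryFile range mapped (see mapMessageIntoStub above), so this second mapping gives the Sentry and the stub a shared view of the sentryMessage/stubMessage pages.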
sentryAddr, _, errno := unix.RawSyscall6( unix.SYS_MMAP, 0, syscallThreadMessageSize, unix.PROT_WRITE|unix.PROT_READ, unix.MAP_SHARED|unix.MAP_FILE, uintptr(t.subproc.memoryFile.FD()), uintptr(fr.Start)) if errno != 0 { t.destroy() return fmt.Errorf("mmap failed: %v", errno) } t.sentryAddr = sentryAddr t.initRequestReplyAddresses(sentryAddr) return nil } func (t *syscallThread) destroy() { if t.sentryAddr != 0 { _, _, errno := unix.RawSyscall6( unix.SYS_MUNMAP, t.sentryAddr, syscallThreadMessageSize, 0, 0, 0, 0) if errno != 0 { panic(fmt.Sprintf("munmap failed: %v", errno)) } } if t.stubAddr != 0 { _, err := t.thread.syscallIgnoreInterrupt(&t.thread.initRegs, unix.SYS_MUNMAP, arch.SyscallArgument{Value: t.stubAddr}, arch.SyscallArgument{Value: uintptr(syscallThreadMessageSize)}) if err != nil { panic(fmt.Sprintf("munmap failed: %v", err)) } } t.subproc.memoryFile.DecRef(t.stackRange) t.subproc.sysmsgStackPool.Put(t.thread.sysmsgStackID) } func (t *syscallThread) installSeccompNotify() (*os.File, error) { fd, err := t.thread.syscallIgnoreInterrupt(&t.thread.initRegs, seccomp.SYS_SECCOMP, arch.SyscallArgument{Value: uintptr(linux.SECCOMP_SET_MODE_FILTER)}, arch.SyscallArgument{Value: uintptr(linux.SECCOMP_FILTER_FLAG_NEW_LISTENER)}, arch.SyscallArgument{Value: stubSyscallRules}) if err != nil { return nil, err } _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, fd, linux.SECCOMP_IOCTL_NOTIF_SET_FLAGS, linux.SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) if errno != 0 { t.thread.Debugf("failed to set SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP") } return os.NewFile(fd, "seccomp_notify"), nil } // mapMessageIntoStub maps the syscall message into the stub process address space. func (t *syscallThread) mapMessageIntoStub() error { // Map sentryMessage as read-only. _, err := t.thread.syscallIgnoreInterrupt(&t.thread.initRegs, unix.SYS_MMAP, arch.SyscallArgument{Value: t.stubAddr}, arch.SyscallArgument{Value: uintptr(hostarch.PageSize)}, arch.SyscallArgument{Value: uintptr(unix.PROT_READ)}, arch.SyscallArgument{Value: unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED}, arch.SyscallArgument{Value: uintptr(t.subproc.memoryFile.FD())}, arch.SyscallArgument{Value: uintptr(t.stackRange.Start)}) if err != nil { return err } // Map stubMessage as read-write. _, err = t.thread.syscallIgnoreInterrupt(&t.thread.initRegs, unix.SYS_MMAP, arch.SyscallArgument{Value: t.stubAddr + syscallStubMessageOffset}, arch.SyscallArgument{Value: uintptr(hostarch.PageSize)}, arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)}, arch.SyscallArgument{Value: unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED}, arch.SyscallArgument{Value: uintptr(t.subproc.memoryFile.FD())}, arch.SyscallArgument{Value: uintptr(t.stackRange.Start + hostarch.PageSize)}) return err } // attach attaches to the stub thread with ptrace and unblocks signals. func (t *syscallThread) attach() error { if err := t.thread.attach(); err != nil { return err } // We need to unblock signals, because the TRAP signal is used to run // syscalls via ptrace.
t.unmaskAllSignalsAttached() return nil } const maxErrno = 4095 func (t *syscallThread) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) { if t.subproc.dead.Load() { return 0, errDeadSubprocess } sentryMsg := t.sentryMessage stubMsg := t.stubMessage sentryMsg.sysno = uint64(sysno) for i := 0; i < len(sentryMsg.args); i++ { if i < len(args) { sentryMsg.args[i] = uint64(args[i].Value) } else { sentryMsg.args[i] = 0 } } if t.seccompNotify != nil { if errno := t.kickSeccompNotify(); errno != 0 { t.thread.kill() t.thread.Warningf("failed sending request to syscall thread: %s", errno) return 0, errDeadSubprocess } if err := t.waitForSeccompNotify(); err != nil { t.thread.Warningf("failed waiting for seccomp notify: %s", err) return 0, errDeadSubprocess } } else { // Notify the syscall thread about a new syscall request. atomic.AddUint32(&sentryMsg.state, 1) futexWakeUint32(&sentryMsg.state) // Wait for reply. // // The futex waits on sentryMsg.state, which isn't changed again by the // Sentry, so it returns only when the other side calls FUTEX_WAKE. futexWaitWake(&sentryMsg.state, atomic.LoadUint32(&sentryMsg.state)) } errno := -uintptr(stubMsg.ret) if errno > 0 && errno < maxErrno { return 0, fmt.Errorf("stub syscall (%x, %#v) failed with %w", sysno, args, unix.Errno(errno)) } return uintptr(stubMsg.ret), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/syscall_thread_amd64.go000066400000000000000000000031201465435605700306200ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package systrap import ( "fmt" "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/arch" ) func (t *syscallThread) detach() { p := t.thread // The syscall thread can't handle any signals and doesn't expect to // receive anything. t.maskAllSignalsAttached() regs := p.initRegs regs.Rsp = 0 regs.R12 = uint64(t.stubAddr) regs.R13 = uint64(t.sentryMessage.state + 1) if t.seccompNotify != nil { regs.Rbx = _RUN_SECCOMP_LOOP } else { regs.Rbx = _RUN_SYSCALL_LOOP } // Skip the syscall instruction. regs.Rip += arch.SyscallWidth if err := p.setRegs(&regs); err != nil { panic(fmt.Sprintf("ptrace set regs failed: %v", err)) } p.detach() if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(p.tgid), uintptr(p.tid), uintptr(unix.SIGCONT)); e != 0 { panic(fmt.Sprintf("tkill failed: %v", e)) } runtime.UnlockOSThread() if t.seccompNotify != nil { if err := t.waitForSeccompNotify(); err != nil { panic(fmt.Sprintf("%s", err)) } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/syscall_thread_arm64.go000066400000000000000000000031401465435605700306400ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package systrap import ( "fmt" "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/arch" ) func (t *syscallThread) detach() { p := t.thread // The syscall thread can't handle any signals and doesn't expect to // receive anything. t.maskAllSignalsAttached() regs := p.initRegs regs.Sp = 0 regs.Regs[12] = uint64(t.stubAddr) regs.Regs[13] = uint64(t.sentryMessage.state + 1) if t.seccompNotify != nil { regs.Regs[9] = _RUN_SECCOMP_LOOP } else { regs.Regs[9] = _RUN_SYSCALL_LOOP } // Skip the syscall instruction. regs.Pc += arch.SyscallWidth if err := p.setRegs(&regs); err != nil { panic(fmt.Sprintf("ptrace set regs failed: %v", err)) } p.detach() if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(p.tgid), uintptr(p.tid), uintptr(unix.SIGCONT)); e != 0 { panic(fmt.Sprintf("tkill failed: %v", e)) } runtime.UnlockOSThread() if t.seccompNotify != nil { if err := t.waitForSeccompNotify(); err != nil { panic(fmt.Sprintf("%s", err)) } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/syscall_thread_defs.go000066400000000000000000000022721465435605700306350ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "gvisor.dev/gvisor/pkg/hostarch" ) const syscallStubMessageOffset = hostarch.PageSize // syscallSentryMessage is a shared message that can be changed only from the // Sentry and a stub process can only read it. type syscallSentryMessage struct { state uint32 unused uint32 sysno uint64 args [6]uint64 } // syscallStubMessage is a shared message that can be changed from a stub // process. It is used to notify the Sentry that a requested system call has // been executed. // // Attention: It can be compromised by user threads. type syscallStubMessage struct { ret uint64 } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/syscall_thread_unsafe.go000066400000000000000000000071301465435605700311730ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
package systrap import ( "fmt" "sync/atomic" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) func (t *syscallThread) initRequestReplyAddresses(sentryStackAddr uintptr) { // These are safe as these addresses are mmapped and never moved/gced. sentryMessage := (*syscallSentryMessage)(unsafe.Pointer(sentryStackAddr)) stubMessage := (*syscallStubMessage)(unsafe.Pointer(sentryStackAddr + syscallStubMessageOffset)) atomic.StoreUint32(&sentryMessage.state, 0) t.sentryMessage = sentryMessage t.stubMessage = stubMessage } // maskAllSignals blocks all signals. func (t *syscallThread) maskAllSignalsAttached() { p := t.thread mask := ^uint64(0) if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, linux.PTRACE_SETSIGMASK, uintptr(p.tid), 8, uintptr(unsafe.Pointer(&mask)), 0, 0); errno != 0 { panic(fmt.Sprintf("unable to setmask: %v", errno)) } } // unmaskAllSignals unblocks all signals. func (t *syscallThread) unmaskAllSignalsAttached() { p := t.thread mask := uint64(0) if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, linux.PTRACE_SETSIGMASK, uintptr(p.tid), 8, uintptr(unsafe.Pointer(&mask)), 0, 0); errno != 0 { panic(fmt.Sprintf("unable to setmask: %v", errno)) } } func futexWakeUint32(addr *uint32) error { if _, _, e := unix.RawSyscall6(unix.SYS_FUTEX, uintptr(unsafe.Pointer(addr)), linux.FUTEX_WAKE, 1, 0, 0, 0); e != 0 { return fmt.Errorf("failed to FUTEX_WAKE: %v", e) } return nil } func futexWaitForUint32(addr *uint32, targetValue uint32) error { for { val := atomic.LoadUint32(addr) if val == targetValue { break } _, _, e := unix.Syscall6(unix.SYS_FUTEX, uintptr(unsafe.Pointer(addr)), linux.FUTEX_WAIT, uintptr(val), 0, 0, 0) if e != 0 && e != unix.EAGAIN && e != unix.EINTR { return fmt.Errorf("failed to FUTEX_WAIT: %v", e) } } return nil } // futexWaitWake waits when other side will call FUTEX_WAKE. A value of the // futex word has to be equal to futexValue and it must not be changed. func futexWaitWake(futexAddr *uint32, futexValue uint32) error { for { _, _, e := unix.Syscall6(unix.SYS_FUTEX, uintptr(unsafe.Pointer(futexAddr)), linux.FUTEX_WAIT, uintptr(futexValue), 0, 0, 0) if e == 0 { break } if e != unix.EAGAIN && e != unix.EINTR { return fmt.Errorf("failed to FUTEX_WAIT: %v", e) } } return nil } func (t *syscallThread) kickSeccompNotify() unix.Errno { _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(t.seccompNotify.Fd()), uintptr(linux.SECCOMP_IOCTL_NOTIF_SEND), uintptr(unsafe.Pointer(&t.seccompNotifyResp))) return errno } func (t *syscallThread) waitForSeccompNotify() error { for { req := linux.SeccompNotif{} _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(t.seccompNotify.Fd()), uintptr(linux.SECCOMP_IOCTL_NOTIF_RECV), uintptr(unsafe.Pointer(&req))) if errno == 0 { t.seccompNotifyResp.ID = req.ID break } if errno == unix.EINTR && t.subproc.alive() { continue } t.thread.kill() return fmt.Errorf("failed getting response from syscall thread : %w", errno) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/000077500000000000000000000000001465435605700256265ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/atomic.h000066400000000000000000000024021465435605700272510ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef THIRD_PARTY_GVISOR_PKG_SENTRY_PLATFORM_SYSTRAP_SYSMSG_ATOMIC_H_ #define THIRD_PARTY_GVISOR_PKG_SENTRY_PLATFORM_SYSTRAP_SYSMSG_ATOMIC_H_ #define atomic_load(p) __atomic_load_n(p, __ATOMIC_ACQUIRE) #define atomic_store(p, val) __atomic_store_n(p, val, __ATOMIC_RELEASE) #define atomic_compare_exchange(p, old, val) \ __atomic_compare_exchange_n(p, old, val, false, __ATOMIC_ACQ_REL, \ __ATOMIC_ACQUIRE) #define atomic_add(p, val) __atomic_add_fetch(p, val, __ATOMIC_ACQ_REL) #define atomic_sub(p, val) __atomic_sub_fetch(p, val, __ATOMIC_ACQ_REL) #endif // THIRD_PARTY_GVISOR_PKG_SENTRY_PLATFORM_SYSTRAP_SYSMSG_ATOMIC_H_ golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/gen_offsets_go.sh000066400000000000000000000020331465435605700311470ustar00rootroot00000000000000# Copyright 2020 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # This script generates a golang file which contains: # * byte array with the sysmsg stub binary blob. # * set of variables with addresses of exported symbols. #!/bin/bash set -e set -u FILE=$1 NAME=$2 PREFIX=${NAME}_blob_offset__ BLOB=${NAME}_blob OBJNAME=$3 AWK_CMD='$2 ~ /^[tBCTA]$/ { print "var '$PREFIX'" $3 " = 0x" $1 }' cat << EOF /* Autogenerated by $0, do not edit */ package sysmsg EOF nm "$OBJNAME" | grep "__export_" | tr . _ | awk "$AWK_CMD" golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/pie.lds.S000066400000000000000000000023501465435605700273100ustar00rootroot00000000000000/* Copyright 2020 The gVisor Authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ SECTIONS { .crblob 0x0 : { *(.head.text) *(.text*) . = ALIGN(32); *(.data*) . = ALIGN(32); *(COMMON*) . = ALIGN(32); *(.rodata*) . = ALIGN(32); *(.bss*) . = ALIGN(32); *(.got*) . = ALIGN(32); *(.debug*) . 
= ALIGN(32); } =0x00000000, /DISCARD/ : { *(.interp) *(.gnu.hash) *(.hash) *(.dynamic) *(.dynsym) *(.dynstr) *(.rela.dyn) *(.eh_frame) *(.note.gnu.property) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sighandler_amd64.c000066400000000000000000000445531465435605700311200ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include "atomic.h" #include "sysmsg.h" #include "sysmsg_offsets.h" #include "sysmsg_offsets_amd64.h" // TODO(b/271631387): These globals are shared between AMD64 and ARM64; move to // sysmsg_lib.c. struct arch_state __export_arch_state; uint64_t __export_stub_start; long __syscall(long n, long a1, long a2, long a3, long a4, long a5, long a6) { unsigned long ret; register long r10 __asm__("r10") = a4; register long r8 __asm__("r8") = a5; register long r9 __asm__("r9") = a6; __asm__ __volatile__("syscall" : "=a"(ret) : "a"(n), "D"(a1), "S"(a2), "d"(a3), "r"(r10), "r"(r8), "r"(r9) : "rcx", "r11", "memory"); return ret; } long sys_futex(uint32_t *addr, int op, int val, struct __kernel_timespec *tv, uint32_t *addr2, int val3) { return __syscall(__NR_futex, (long)addr, (long)op, (long)val, (long)tv, (long)addr2, (long)val3); } union csgsfs { uint64_t csgsfs; // REG_CSGSFS struct { uint16_t cs; uint16_t gs; uint16_t fs; uint16_t ss; }; }; static void gregs_to_ptregs(ucontext_t *ucontext, struct user_regs_struct *ptregs) { union csgsfs csgsfs = {.csgsfs = ucontext->uc_mcontext.gregs[REG_CSGSFS]}; // Set all registers except: // * fs_base and gs_base, because they can be only changed by arch_prctl. // * DS and ES are not used on x86_64. 
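  // The kernel packs the segment selectors into the single 64-bit REG_CSGSFS
  // slot of gregs[]; the csgsfs union defined above splits that slot back into
  // the 16-bit cs/gs/fs/ss values copied below.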
ptregs->r15 = ucontext->uc_mcontext.gregs[REG_R15]; ptregs->r14 = ucontext->uc_mcontext.gregs[REG_R14]; ptregs->r13 = ucontext->uc_mcontext.gregs[REG_R13]; ptregs->r12 = ucontext->uc_mcontext.gregs[REG_R12]; ptregs->rbp = ucontext->uc_mcontext.gregs[REG_RBP]; ptregs->rbx = ucontext->uc_mcontext.gregs[REG_RBX]; ptregs->r11 = ucontext->uc_mcontext.gregs[REG_R11]; ptregs->r10 = ucontext->uc_mcontext.gregs[REG_R10]; ptregs->r9 = ucontext->uc_mcontext.gregs[REG_R9]; ptregs->r8 = ucontext->uc_mcontext.gregs[REG_R8]; ptregs->rax = ucontext->uc_mcontext.gregs[REG_RAX]; ptregs->rcx = ucontext->uc_mcontext.gregs[REG_RCX]; ptregs->rdx = ucontext->uc_mcontext.gregs[REG_RDX]; ptregs->rsi = ucontext->uc_mcontext.gregs[REG_RSI]; ptregs->rdi = ucontext->uc_mcontext.gregs[REG_RDI]; ptregs->rip = ucontext->uc_mcontext.gregs[REG_RIP]; ptregs->eflags = ucontext->uc_mcontext.gregs[REG_EFL]; ptregs->rsp = ucontext->uc_mcontext.gregs[REG_RSP]; ptregs->cs = csgsfs.cs; ptregs->ss = csgsfs.ss; ptregs->fs = csgsfs.fs; ptregs->gs = csgsfs.gs; } static void ptregs_to_gregs(ucontext_t *ucontext, struct user_regs_struct *ptregs) { union csgsfs csgsfs = {.csgsfs = ucontext->uc_mcontext.gregs[REG_CSGSFS]}; ucontext->uc_mcontext.gregs[REG_R15] = ptregs->r15; ucontext->uc_mcontext.gregs[REG_R14] = ptregs->r14; ucontext->uc_mcontext.gregs[REG_R13] = ptregs->r13; ucontext->uc_mcontext.gregs[REG_R12] = ptregs->r12; ucontext->uc_mcontext.gregs[REG_RBP] = ptregs->rbp; ucontext->uc_mcontext.gregs[REG_RBX] = ptregs->rbx; ucontext->uc_mcontext.gregs[REG_R11] = ptregs->r11; ucontext->uc_mcontext.gregs[REG_R10] = ptregs->r10; ucontext->uc_mcontext.gregs[REG_R9] = ptregs->r9; ucontext->uc_mcontext.gregs[REG_R8] = ptregs->r8; ucontext->uc_mcontext.gregs[REG_RAX] = ptregs->rax; ucontext->uc_mcontext.gregs[REG_RCX] = ptregs->rcx; ucontext->uc_mcontext.gregs[REG_RDX] = ptregs->rdx; ucontext->uc_mcontext.gregs[REG_RSI] = ptregs->rsi; ucontext->uc_mcontext.gregs[REG_RDI] = ptregs->rdi; ucontext->uc_mcontext.gregs[REG_RIP] = ptregs->rip; ucontext->uc_mcontext.gregs[REG_EFL] = ptregs->eflags; ucontext->uc_mcontext.gregs[REG_RSP] = ptregs->rsp; csgsfs.cs = ptregs->cs; csgsfs.ss = ptregs->ss; csgsfs.fs = ptregs->fs; csgsfs.gs = ptregs->gs; ucontext->uc_mcontext.gregs[REG_CSGSFS] = csgsfs.csgsfs; } // get_fsbase writes the current thread's fsbase value to ptregs. static uint64_t get_fsbase(void) { uint64_t fsbase; if (__export_arch_state.fsgsbase) { asm volatile("rdfsbase %0" : "=r"(fsbase)); } else { int ret = __syscall(__NR_arch_prctl, ARCH_GET_FS, (long)&fsbase, 0, 0, 0, 0); if (ret) { panic(STUB_ERROR_ARCH_PRCTL, ret); } } return fsbase; } // set_fsbase sets the current thread's fsbase to the fsbase value in ptregs. static void set_fsbase(uint64_t fsbase) { if (__export_arch_state.fsgsbase) { asm volatile("wrfsbase %0" : : "r"(fsbase) : "memory"); } else { int ret = __syscall(__NR_arch_prctl, ARCH_SET_FS, fsbase, 0, 0, 0, 0); if (ret) { panic(STUB_ERROR_ARCH_PRCTL, ret); } } } // switch_context_amd64 is a wrapper of switch_context() which does checks // specific to amd64. struct thread_context *switch_context_amd64( struct sysmsg *sysmsg, struct thread_context *ctx, enum context_state new_context_state) { struct thread_context *old_ctx = sysmsg->context; for (;;) { ctx = switch_context(sysmsg, ctx, new_context_state); // After setting THREAD_STATE_NONE, syshandled can be interrupted by // SIGCHLD. In this case, we consider that the current context contains // the actual state and sighandler can take control on it. 
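    // From this point the context holds the authoritative state, so it is
    // safe to drop back to THREAD_STATE_NONE and let SIGCHLD preempt this
    // thread; an interrupt that hit the context while it was still queued is
    // reported back to the sentry as a fault by the branch below.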
atomic_store(&sysmsg->state, THREAD_STATE_NONE); if (atomic_load(&ctx->interrupt) != 0) { atomic_store(&sysmsg->state, THREAD_STATE_PREP); // This context got interrupted while it was waiting in the queue. // Setup all the necessary bits to let the sentry know this context has // switched back because of it. atomic_store(&ctx->interrupt, 0); new_context_state = CONTEXT_STATE_FAULT; ctx->signo = SIGCHLD; ctx->siginfo.si_signo = SIGCHLD; ctx->ptregs.orig_rax = -1; } else { break; } } if (old_ctx != ctx || ctx->last_thread_id != sysmsg->thread_id) { ctx->fpstate_changed = 1; } return ctx; } static void prep_fpstate_for_sigframe(void *buf, uint32_t user_size, bool use_xsave); void __export_sighandler(int signo, siginfo_t *siginfo, void *_ucontext) { ucontext_t *ucontext = _ucontext; void *sp = sysmsg_sp(); struct sysmsg *sysmsg = sysmsg_addr(sp); if (sysmsg != sysmsg->self) panic(STUB_ERROR_BAD_SYSMSG, 0); int32_t thread_state = atomic_load(&sysmsg->state); if (thread_state == THREAD_STATE_INITIALIZING) { // This thread was interrupted before it even had a context. return; } struct thread_context *ctx = sysmsg->context; // If the current thread is in syshandler, an interrupt has to be postponed, // because sysmsg can't be changed. if (signo == SIGCHLD && thread_state != THREAD_STATE_NONE) { return; } // Handle faults in syshandler. if ((signo == SIGSEGV || signo == SIGBUS) && sysmsg->fault_jump) { ucontext->uc_mcontext.gregs[REG_RIP] += sysmsg->fault_jump; sysmsg->fault_jump = 0; return; } long fs_base = get_fsbase(); ctx->signo = signo; ctx->siginfo = *siginfo; // syshandler sets THREAD_STATE_NONE right before it starts resuming a // context. It means the context contains the actual state, and the state of // the stub thread is incomplete. if (signo != SIGCHLD || ucontext->uc_mcontext.gregs[REG_RIP] < __export_stub_start) { ctx->ptregs.fs_base = fs_base; gregs_to_ptregs(ucontext, &ctx->ptregs); memcpy(ctx->fpstate, (uint8_t *)ucontext->uc_mcontext.fpregs, __export_arch_state.fp_len); atomic_store(&ctx->fpstate_changed, 0); } enum context_state ctx_state = CONTEXT_STATE_INVALID; switch (signo) { case SIGSYS: { ctx_state = CONTEXT_STATE_SYSCALL; // Check whether this syscall can be replaced on a function call or not. // If a syscall instruction set is "mov sysno, %eax, syscall", it can be // replaced on a function call which works much faster. // Look at pkg/sentry/usertrap for more details. if (siginfo->si_arch == AUDIT_ARCH_X86_64) { uint8_t *rip = (uint8_t *)ctx->ptregs.rip; // FIXME(b/144063246): Even if all five bytes before the syscall // instruction match the "mov sysno, %eax" instruction, they can be a // part of a longer instruction. Here is not easy way to decode x86 // instructions in reverse. uint64_t syscall_code_int[2]; uint8_t *syscall_code = (uint8_t *)&syscall_code_int[0]; // We need to receive 5 bytes before the syscall instruction, but they // are not aligned, so we can't read them atomically. Let's read them // twice. If the second copy will not contain the FAULT_OPCODE, this // will mean that the first copy is in the consistent state. for (int i = 0; i < 2; i++) { // fault_jump is set to the size of "mov (%rbx)" which is 3 bytes. atomic_store(&sysmsg->fault_jump, 3); asm volatile("movq (%1), %0\n" : "=a"(syscall_code_int[i]) : "b"(rip - 8) : "cc", "memory"); atomic_store(&sysmsg->fault_jump, 0); } // The mov instruction is 5 bytes: b8 . // The syscall instruction is 2 bytes: 0f 05. 
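      // Layout of the 8 bytes loaded above (they end right at rip, which
      // points just past the syscall instruction):
      //   byte 0     tail of the preceding instruction (ignored)
      //   byte 1     0xb8                  mov $imm32, %eax
      //   bytes 2-5  imm32                 the syscall number
      //   bytes 6-7  0x0f 0x05             syscall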
uint32_t sysno = *(uint32_t *)(syscall_code + 2); int need_trap = *(syscall_code + 6) == 0x0f && // syscall *(syscall_code + 7) == 0x05 && *(syscall_code + 1) == 0xb8 && // mov sysno, %eax sysno == siginfo->si_syscall && sysno == ctx->ptregs.rax; // Restart syscall if it has been patched by another thread. When a // syscall instruction set is replaced on a function call, all threads // have to call it via the function call. Otherwise the syscall will not // be restarted properly if it will be interrupted by signal. syscall_code = (uint8_t *)&syscall_code_int[1]; uint8_t syscall_opcode = *(syscall_code + 6); // A binary patch is built so that the first byte of the syscall // instruction is changed on the invalid instruction. If we meet this // case, this means that another thread has been patched this syscall // and we need to restart it. if (syscall_opcode == FAULT_OPCODE) { ucontext->uc_mcontext.gregs[REG_RIP] -= 7; return; } if (need_trap) { // This syscall can be replaced on the function call. ctx_state = CONTEXT_STATE_SYSCALL_NEED_TRAP; } } ctx->ptregs.orig_rax = ctx->ptregs.rax; ctx->ptregs.rax = (unsigned long)-ENOSYS; if (siginfo->si_arch != AUDIT_ARCH_X86_64) // gVisor doesn't support x32 system calls, so let's change the syscall // number so that it returns ENOSYS. ctx->ptregs.orig_rax += 0x86000000; break; } case SIGCHLD: case SIGSEGV: case SIGBUS: case SIGFPE: case SIGTRAP: case SIGILL: ctx->ptregs.orig_rax = -1; ctx_state = CONTEXT_STATE_FAULT; break; default: return; } ctx = switch_context_amd64(sysmsg, ctx, ctx_state); if (fs_base != ctx->ptregs.fs_base) { set_fsbase(ctx->ptregs.fs_base); } if (atomic_load(&ctx->fpstate_changed)) { prep_fpstate_for_sigframe( ctx->fpstate, __export_arch_state.fp_len, __export_arch_state.xsave_mode != XSAVE_MODE_FXSAVE); ucontext->uc_mcontext.fpregs = (void *)ctx->fpstate; } ptregs_to_gregs(ucontext, &ctx->ptregs); } void __syshandler() { struct sysmsg *sysmsg; asm volatile("movq %%gs:0, %0\n" : "=r"(sysmsg) : :); // SYSMSG_STATE_PREP is set to postpone interrupts. Look at // __export_sighandler for more details. int state = atomic_load(&sysmsg->state); if (state != THREAD_STATE_PREP) panic(STUB_ERROR_BAD_THREAD_STATE, 0); struct thread_context *ctx = sysmsg->context; enum context_state ctx_state = CONTEXT_STATE_SYSCALL_TRAP; ctx->signo = SIGSYS; ctx->siginfo.si_addr = 0; ctx->siginfo.si_syscall = ctx->ptregs.rax; ctx->ptregs.rax = (unsigned long)-ENOSYS; long fs_base = get_fsbase(); ctx->ptregs.fs_base = fs_base; ctx = switch_context_amd64(sysmsg, ctx, ctx_state); // switch_context_amd64 changed sysmsg->state to THREAD_STATE_NONE, so we can // only resume the current process, all other actions are // prohibited after this point. if (fs_base != ctx->ptregs.fs_base) { set_fsbase(ctx->ptregs.fs_base); } } void __export_start(struct sysmsg *sysmsg, void *_ucontext) { init_new_thread(); asm volatile("movq %%gs:0, %0\n" : "=r"(sysmsg) : :); if (sysmsg->self != sysmsg) { panic(STUB_ERROR_BAD_SYSMSG, 0); } struct thread_context *ctx = switch_context_amd64(sysmsg, NULL, CONTEXT_STATE_INVALID); restore_state(sysmsg, ctx, _ucontext); } // asm_restore_state is implemented in syshandler_amd64.S void asm_restore_state(); // On x86 restore_state jumps straight to user code and does not return. 
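// Only fs_base has to be re-applied from C; the general-purpose registers,
// the stack and the FPU state are restored by asm_restore_state() in
// syshandler_amd64.S, which finishes with a jump to sysmsg->ret_addr.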
void restore_state(struct sysmsg *sysmsg, struct thread_context *ctx, void *unused) { set_fsbase(ctx->ptregs.fs_base); asm_restore_state(); } void verify_offsets_amd64() { #define PTREGS_OFFSET offsetof(struct thread_context, ptregs) BUILD_BUG_ON(offsetof_thread_context_ptregs != PTREGS_OFFSET); BUILD_BUG_ON(offsetof_thread_context_ptregs_r15 != (offsetof(struct user_regs_struct, r15) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_r14 != (offsetof(struct user_regs_struct, r14) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_r13 != (offsetof(struct user_regs_struct, r13) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_r12 != (offsetof(struct user_regs_struct, r12) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_rbp != (offsetof(struct user_regs_struct, rbp) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_rbx != (offsetof(struct user_regs_struct, rbx) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_r11 != (offsetof(struct user_regs_struct, r11) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_r10 != (offsetof(struct user_regs_struct, r10) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_r9 != (offsetof(struct user_regs_struct, r9) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_r8 != (offsetof(struct user_regs_struct, r8) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_rax != (offsetof(struct user_regs_struct, rax) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_rcx != (offsetof(struct user_regs_struct, rcx) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_rdx != (offsetof(struct user_regs_struct, rdx) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_rsi != (offsetof(struct user_regs_struct, rsi) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_rdi != (offsetof(struct user_regs_struct, rdi) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_orig_rax != (offsetof(struct user_regs_struct, orig_rax) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_rip != (offsetof(struct user_regs_struct, rip) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_cs != (offsetof(struct user_regs_struct, cs) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_eflags != (offsetof(struct user_regs_struct, eflags) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_rsp != (offsetof(struct user_regs_struct, rsp) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_ss != (offsetof(struct user_regs_struct, ss) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_fs_base != (offsetof(struct user_regs_struct, fs_base) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_gs_base != (offsetof(struct user_regs_struct, gs_base) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_ds != (offsetof(struct user_regs_struct, ds) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_es != (offsetof(struct user_regs_struct, es) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_fs != (offsetof(struct user_regs_struct, fs) + PTREGS_OFFSET)); BUILD_BUG_ON(offsetof_thread_context_ptregs_gs != (offsetof(struct user_regs_struct, gs) + PTREGS_OFFSET)); #undef PTREGS_OFFSET } // asm/sigcontext.h conflicts with signal.h. 
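// The two structures below are local copies of the kernel's _fpx_sw_bytes and
// _fpstate layouts, redeclared here so that this file can avoid including
// asm/sigcontext.h.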
struct __fpx_sw_bytes { uint32_t magic1; uint32_t extended_size; uint64_t xfeatures; uint32_t xstate_size; uint32_t padding[7]; }; struct __fpstate { uint16_t cwd; uint16_t swd; uint16_t twd; uint16_t fop; uint64_t rip; uint64_t rdp; uint32_t mxcsr; uint32_t mxcsr_mask; uint32_t st_space[32]; uint32_t xmm_space[64]; uint32_t reserved2[12]; struct __fpx_sw_bytes sw_reserved; }; // The kernel expects to see some additional info in an FPU state. More details // can be found in arch/x86/kernel/fpu/signal.c:check_xstate_in_sigframe. static void prep_fpstate_for_sigframe(void *buf, uint32_t user_size, bool use_xsave) { struct __fpstate *fpstate = buf; struct __fpx_sw_bytes *sw_bytes = &fpstate->sw_reserved; sw_bytes->magic1 = FP_XSTATE_MAGIC1; sw_bytes->extended_size = user_size + FP_XSTATE_MAGIC2_SIZE; sw_bytes->xfeatures = ~(0ULL) ^ (XCR0_DISABLED_MASK); sw_bytes->xstate_size = user_size; *(uint32_t *)(buf + user_size) = use_xsave ? FP_XSTATE_MAGIC2 : 0; } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sighandler_arm64.c000066400000000000000000000143411465435605700311260ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include "atomic.h" #include "sysmsg.h" #include "sysmsg_offsets.h" // TODO(b/271631387): These globals are shared between AMD64 and ARM64; move to // sysmsg_lib.c. struct arch_state __export_arch_state; uint64_t __export_stub_start; long __syscall(long n, long a1, long a2, long a3, long a4, long a5, long a6) { // ARM64 syscall interface passes the syscall number in x8 and the 6 arguments // in x0-x5. The return value is in x0. // // See: https://man7.org/linux/man-pages/man2/syscall.2.html register long x8 __asm__("x8") = n; register long x0 __asm__("x0") = a1; register long x1 __asm__("x1") = a2; register long x2 __asm__("x2") = a3; register long x3 __asm__("x3") = a4; register long x4 __asm__("x4") = a5; register long x5 __asm__("x5") = a6; __asm__ __volatile__("svc #0" : "=r"(x0) : "r"(x8), "0"(x0), "r"(x1), "r"(x2), "r"(x3), "r"(x4), "r"(x5) : "memory", "cc"); return x0; } static __inline void set_tls(uint64_t tls) { __asm__("msr tpidr_el0,%0" : : "r"(tls)); } static __inline uint64_t get_tls() { uint64_t tls; __asm__("mrs %0,tpidr_el0" : "=r"(tls)); return tls; } long sys_futex(uint32_t *addr, int op, int val, struct __kernel_timespec *tv, uint32_t *addr2, int val3) { return __syscall(__NR_futex, (long)addr, (long)op, (long)val, (long)tv, (long)addr2, (long)val3); } static void gregs_to_ptregs(ucontext_t *ucontext, struct user_regs_struct *ptregs) { // Set all registers. 
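  // arm64 has 31 general-purpose registers (x0-x30, x30 being the link
  // register), hence the loop bound below; sp, pc and pstate live in separate
  // mcontext fields and are copied explicitly.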
for (int i = 0; i < 31; i++ ) { ptregs->regs[i] = ucontext->uc_mcontext.regs[i]; } ptregs->sp = ucontext->uc_mcontext.sp; ptregs->pc = ucontext->uc_mcontext.pc; ptregs->pstate = ucontext->uc_mcontext.pstate; } static void ptregs_to_gregs(ucontext_t *ucontext, struct user_regs_struct *ptregs) { for (int i = 0; i < 31; i++ ) { ucontext->uc_mcontext.regs[i] = ptregs->regs[i]; } ucontext->uc_mcontext.sp = ptregs->sp; ucontext->uc_mcontext.pc = ptregs->pc; ucontext->uc_mcontext.pstate = ptregs->pstate; } void __export_start(struct sysmsg *sysmsg, void *_ucontext) { panic(0x11111111, 0); } void __export_sighandler(int signo, siginfo_t *siginfo, void *_ucontext) { ucontext_t *ucontext = _ucontext; void *sp = sysmsg_sp(); struct sysmsg *sysmsg = sysmsg_addr(sp); if (sysmsg != sysmsg->self) panic(STUB_ERROR_BAD_SYSMSG, 0); int32_t thread_state = atomic_load(&sysmsg->state); uint32_t ctx_state = CONTEXT_STATE_INVALID; struct thread_context *ctx = NULL, *old_ctx = NULL; if (thread_state == THREAD_STATE_INITIALIZING) { // Find a new context and exit to restore it. init_new_thread(); goto init; } ctx = sysmsg->context; old_ctx = sysmsg->context; ctx->signo = signo; gregs_to_ptregs(ucontext, &ctx->ptregs); // Signal frames for ARM64 include 8 byte magic header before the floating // point context. // // See: arch/arm64/include/uapi/asm/sigcontext.h const uint64_t kSigframeMagicHeaderLen = sizeof(struct _aarch64_ctx); // Verify the header. if (((uint32_t *)&ucontext->uc_mcontext.__reserved)[0] != FPSIMD_MAGIC) { panic(STUB_ERROR_FPSTATE_BAD_HEADER, ((uint32_t *)&ucontext->uc_mcontext.__reserved)[0]); } uint8_t *fpStatePointer = (uint8_t *)&ucontext->uc_mcontext.__reserved + kSigframeMagicHeaderLen; memcpy(ctx->fpstate, fpStatePointer, __export_arch_state.fp_len); ctx->tls = get_tls(); ctx->siginfo = *siginfo; switch (signo) { case SIGSYS: { ctx_state = CONTEXT_STATE_SYSCALL; if (siginfo->si_arch != AUDIT_ARCH_AARCH64) { // gVisor doesn't support x32 system calls, so let's change the syscall // number so that it returns ENOSYS. The value added here is just a // random large number which is large enough to not match any existing // syscall number in linux. ctx->ptregs.regs[8] += 0x86000000; } break; } case SIGCHLD: case SIGSEGV: case SIGBUS: case SIGFPE: case SIGTRAP: case SIGILL: ctx_state = CONTEXT_STATE_FAULT; break; default: return; } init: for (;;) { ctx = switch_context(sysmsg, ctx, ctx_state); if (atomic_load(&ctx->interrupt) != 0) { // This context got interrupted while it was waiting in the queue. // Setup all the necessary bits to let the sentry know this context has // switched back because of it. atomic_store(&ctx->interrupt, 0); ctx_state = CONTEXT_STATE_FAULT; ctx->signo = SIGCHLD; ctx->siginfo.si_signo = SIGCHLD; } else { break; } } if (old_ctx != ctx || ctx->last_thread_id != sysmsg->thread_id) { ctx->fpstate_changed = 1; } restore_state(sysmsg, ctx, _ucontext); } // On ARM restore_state sets up a correct restore from the sighandler by // populating _ucontext. 
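// Unlike the amd64 version it returns normally: the kernel's rt_sigreturn
// path then reloads the registers and FP state written back into the
// ucontext here, while TLS is restored directly via tpidr_el0.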
void restore_state(struct sysmsg *sysmsg, struct thread_context *ctx, void *_ucontext) { ucontext_t *ucontext = _ucontext; struct fpsimd_context *fpctx = (struct fpsimd_context *)&ucontext->uc_mcontext.__reserved; uint8_t *fpStatePointer = (uint8_t *)&fpctx->fpsr; if (atomic_load(&ctx->fpstate_changed)) { memcpy(fpStatePointer, ctx->fpstate, __export_arch_state.fp_len); } ptregs_to_gregs(ucontext, &ctx->ptregs); set_tls(ctx->tls); atomic_store(&sysmsg->state, THREAD_STATE_NONE); } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sigrestorer_amd64.S000066400000000000000000000014421465435605700313160ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include .global __export_restore_rt; .type __export_restore_rt, @function; __export_restore_rt: movq $__NR_rt_sigreturn, %rax syscall .size __export_restore_rt,.-__export_restore_rt golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sigrestorer_arm64.S000066400000000000000000000014351465435605700313360ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include .global __export_restore_rt; .type __export_restore_rt, @function; __export_restore_rt: mov x8, __NR_rt_sigreturn svc #0 .size __export_restore_rt,.-__export_restore_rt golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/syshandler_amd64.S000066400000000000000000000157751465435605700311400ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "sysmsg_offsets.h" #include "sysmsg_offsets_amd64.h" // Helper macros: //////////////////////////////////////// // prepare_enter_syshandler does the following: // - saves all registers that are restorable onto the thread_context struct. // - loads the address of the thread_context struct into %rcx. .macro prepare_enter_syshandler // Syshandler clobbers rflags (load_thread_context_addr does so for example). 
// Therefore save it as the first thing we do. pushfq // load_thread_context_addr overwrites %rcx. push %rcx movq %gs:offsetof_sysmsg_context, %rcx // Registers listed in order as written in ptregs: movq %r15, offsetof_thread_context_ptregs_r15(%rcx) movq %r14, offsetof_thread_context_ptregs_r14(%rcx) movq %r13, offsetof_thread_context_ptregs_r13(%rcx) movq %r12, offsetof_thread_context_ptregs_r12(%rcx) movq %rbp, offsetof_thread_context_ptregs_rbp(%rcx) movq %rbx, offsetof_thread_context_ptregs_rbx(%rcx) movq %r11, offsetof_thread_context_ptregs_r11(%rcx) movq %r10, offsetof_thread_context_ptregs_r10(%rcx) movq %r9, offsetof_thread_context_ptregs_r9(%rcx) movq %r8, offsetof_thread_context_ptregs_r8(%rcx) movq %rax, offsetof_thread_context_ptregs_rax(%rcx) pop %r15 movq %r15, offsetof_thread_context_ptregs_rcx(%rcx) movq %rdx, offsetof_thread_context_ptregs_rdx(%rcx) movq %rsi, offsetof_thread_context_ptregs_rsi(%rcx) movq %rdi, offsetof_thread_context_ptregs_rdi(%rcx) movq %rax, offsetof_thread_context_ptregs_orig_rax(%rcx) movw %cs, offsetof_thread_context_ptregs_cs(%rcx) movw %ss, offsetof_thread_context_ptregs_ss(%rcx) // Don't bother save/restoring ds/es on amd64 // movw %ds, offsetof_thread_context_ptregs_ds(%rcx) // movw %es, offsetof_thread_context_ptregs_es(%rcx) movw %fs, offsetof_thread_context_ptregs_fs(%rcx) movw %gs, offsetof_thread_context_ptregs_gs(%rcx) pop %rax movq %rax, offsetof_thread_context_ptregs_eflags(%rcx) movq %gs:offsetof_sysmsg_app_stack, %r8 movq %r8, offsetof_thread_context_ptregs_rsp(%rcx) movq %gs:offsetof_sysmsg_ret_addr, %r9 movq %r9, offsetof_thread_context_ptregs_rip(%rcx) .endm // prepare_exit_syshandler assumes that: // - the memory address of the thread_context is loaded in %rcx. // prepare_exit_syshandler does the following: // - sets sysmsg->ret_addr // - restores all registers that were saved inside the thread_context struct except for // %rsp and rflags. // - %rcx will be restored as well, and will no longer contain the memory address to the // thread context. // - puts user %rsp and rflags onto the syshandler stack (in that order). rflags cannot // be restored at this point because syshandler will clobber it before it exits. .macro prepare_exit_syshandler movq offsetof_thread_context_ptregs_rsp(%rcx), %rax push %rax movq offsetof_thread_context_ptregs_eflags(%rcx), %rbx push %rbx // set sysmsg->ret_addr movq offsetof_thread_context_ptregs_rip(%rcx), %r9 movq %r9, %gs:offsetof_sysmsg_ret_addr // Restore segments. Because restoring segments is slow, restore them only if necessary. 
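  // Each selector is compared against the saved value first so that the slow
  // segment-register write only happens when the selector actually changed.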
movw %fs, %dx cmpw %dx, offsetof_thread_context_ptregs_fs(%rcx) je restored_fs movw offsetof_thread_context_ptregs_fs(%rcx), %fs restored_fs: movw %gs, %si cmpw %si, offsetof_thread_context_ptregs_gs(%rcx) je restored_gs movw offsetof_thread_context_ptregs_gs(%rcx), %gs restored_gs: // Restore other GP registers movq offsetof_thread_context_ptregs_r15(%rcx), %r15 movq offsetof_thread_context_ptregs_r14(%rcx), %r14 movq offsetof_thread_context_ptregs_r13(%rcx), %r13 movq offsetof_thread_context_ptregs_r12(%rcx), %r12 movq offsetof_thread_context_ptregs_rbp(%rcx), %rbp movq offsetof_thread_context_ptregs_rbx(%rcx), %rbx movq offsetof_thread_context_ptregs_r11(%rcx), %r11 movq offsetof_thread_context_ptregs_r10(%rcx), %r10 movq offsetof_thread_context_ptregs_r9(%rcx), %r9 movq offsetof_thread_context_ptregs_r8(%rcx), %r8 movq offsetof_thread_context_ptregs_rax(%rcx), %rax // %rcx restored last movq offsetof_thread_context_ptregs_rdx(%rcx), %rdx movq offsetof_thread_context_ptregs_rsi(%rcx), %rsi movq offsetof_thread_context_ptregs_rdi(%rcx), %rdi movq offsetof_thread_context_ptregs_rcx(%rcx), %rcx .endm // save_fpstate saves the current fpstate onto thread_context.fpstate. // It assumes that: // - the memory address of the thread_context is loaded in %rcx. .macro save_fpstate lea offsetof_thread_context_fpstate(%rcx), %rdi movl $XCR0_EAX, %eax movl $XCR0_EDX, %edx movl __export_arch_state+offsetof_arch_state_xsave_mode(%rip), %esi cmpl $XSAVE_MODE_XSAVEOPT, %esi jl use_xsave xsaveopt (%rdi) jmp fpu_saved use_xsave: cmpl $XSAVE_MODE_XSAVE, %esi jl use_fxsave xsave (%rdi) jmp fpu_saved use_fxsave: fxsave (%rdi) fpu_saved: .endm // restore_fpstate restores the fpstate previously saved onto thread_context.fpstate. // It assumes that: // - the memory address of the thread_context is loaded in %rcx. .macro restore_fpstate // We only need to restore fpstate if we were signalled that it changed (syshandler // does not modify fpstate). cmpl $0, offsetof_thread_context_fpstate_changed(%rcx) je fpu_restored lea offsetof_thread_context_fpstate(%rcx), %rdi mov __export_arch_state+offsetof_arch_state_xsave_mode(%rip), %eax cmpl $XSAVE_MODE_FXSAVE, %eax jz use_fxrstor use_xrstor: movl $XCR0_EAX, %eax movl $XCR0_EDX, %edx xrstor (%rdi) jmp fpu_restored use_fxrstor: fxrstor (%rdi) fpu_restored: .endm // Syshandler: //////////////////////////////////////// .globl __export_syshandler; .type __export_syshandler, @function; .align 4, 0x00; __export_syshandler: // The start of this function is in a usertrap trampoline: // mov sysmsg.ThreadStatePrep, %gs:offset(msg.State) // mov %rsp,%gs:0x20 // msg.AppStack // mov %gs:0x18,%rsp // msg.SyshandlerStack // movabs $ret_addr, %rax // mov %rax,%gs:0x8 // msg.RetAddr // mov sysno,%eax // jmpq *%gs:0x10 // msg.Syshandler prepare_enter_syshandler save_fpstate callq __syshandler .globl asm_restore_state; .type asm_restore_state, @function; asm_restore_state: // thread_context may have changed, therefore we reload it into %rcx anew. movq %gs:offsetof_sysmsg_context, %rcx restore_fpstate prepare_exit_syshandler // Now syshandler is exiting for good; restore user rflags and %rsp. popfq movq 0(%rsp), %rsp jmp *%gs:offsetof_sysmsg_ret_addr // msg->ret_addr .size __export_syshandler, . - __export_syshandler golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/syshandler_arm64.S000066400000000000000000000021601465435605700311360ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "sysmsg_offsets.h" .globl __export_syshandler; .type __export_syshandler, @function; .align 4, 0x00; // syshandler is not implemented for ARM64 yet. __export_syshandler: // BRK will generate an Debug Exception which cannot be masked. // See: https://developer.arm.com/documentation/102120/0100/Debug-exceptions // The immediate unsigned operand needs to be <= 0xffff. // See: https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/BRK BRK #0xdead .size __export_syshandler, . - __export_syshandler golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sysmsg.go000066400000000000000000000331521465435605700275060ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package sysmsg provides a stub signal handler and a communication protocol // between stub threads and the Sentry. // // Note that this package is allowlisted for use of sync/atomic. // // +checkalignedignore package sysmsg import ( "fmt" "strings" "sync/atomic" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/platform" ) // LINT.IfChange // Per-thread stack layout: // // *------------* // | guard page | // |------------| // | | // | sysstack | // | | // *------------* // | guard page | // |------------| // | | // | ^ | // | / \ | // | | | // | altstack | // |------------| // | sysmsg | // *------------* const ( // PerThreadMemSize is the size of a per-thread memory region. PerThreadMemSize = 8 * hostarch.PageSize // GuardSize is the size of an unmapped region which is placed right // before the signal stack. GuardSize = hostarch.PageSize PerThreadPrivateStackOffset = GuardSize PerThreadPrivateStackSize = 2 * hostarch.PageSize // PerThreadStackSharedSize is the size of a per-thread stack region. PerThreadSharedStackSize = 4 * hostarch.PageSize PerThreadSharedStackOffset = 4 * hostarch.PageSize // MsgOffsetFromStack is the offset of the Msg structure on // the thread stack. MsgOffsetFromSharedStack = PerThreadMemSize - hostarch.PageSize - PerThreadSharedStackOffset // SpinningQueueMemSize is the size of a spinning queue memory region. SpinningQueueMemSize = hostarch.PageSize ) // StackAddrToMsg returns an address of a sysmsg structure. func StackAddrToMsg(sp uintptr) uintptr { return sp + MsgOffsetFromSharedStack } // StackAddrToSyshandlerStack returns an address of a syshandler stack. 
func StackAddrToSyshandlerStack(sp uintptr) uintptr { return sp + PerThreadPrivateStackOffset + PerThreadPrivateStackSize } // MsgToStackAddr returns a start address of a stack. func MsgToStackAddr(msg uintptr) uintptr { return msg - MsgOffsetFromSharedStack } // ThreadState is used to store a state of the sysmsg thread. type ThreadState uint32 // Set atomicaly sets the state value. func (s *ThreadState) Set(state ThreadState) { atomic.StoreUint32((*uint32)(s), uint32(state)) } // CompareAndSwap atomicaly compares and swaps the state value. func (s *ThreadState) CompareAndSwap(old, state ThreadState) bool { return atomic.CompareAndSwapUint32((*uint32)(s), uint32(old), uint32(state)) } // Get returns the current state value. // //go:nosplit func (s *ThreadState) Get() ThreadState { return ThreadState(atomic.LoadUint32((*uint32)(s))) } const ( // ThreadStateNone means that the thread is executing the user workload. ThreadStateNone ThreadState = iota // ThreadStateDone means that last event has been handled and the stub thread // can be resumed. ThreadStateDone // ThreadStatePrep means that syshandler started filling the sysmsg struct. ThreadStatePrep // ThreadStateAsleep means that this thread fell asleep because there was not // enough contexts to process in the context queue. ThreadStateAsleep // ThreadStateInitializing is only set once at sysmsg thread creation time. It // is used to tell the signal handler that the thread does not yet have a // context. ThreadStateInitializing ) // Msg contains the current state of the sysmsg thread. type Msg struct { // The next batch of fields is used to call the syshandler stub // function. A system call can be replaced with a function call. When // a function call is executed, it can't change the current process // stack, so it needs to save stack and instruction registers, switch // on its syshandler stack and call the jmp instruction to the syshandler // address. // // Self is a pointer to itself in a process address space. Self uint64 // RetAddr is a return address from the syshandler function. RetAddr uint64 // Syshandler is an address of the syshandler function. Syshandler uint64 // SyshandlerStack is an address of the thread syshandler stack. SyshandlerStack uint64 // AppStack is a value of the stack register before calling the syshandler // function. AppStack uint64 // interrupt is non-zero if there is a postponed interrupt. interrupt uint32 // State indicates to the sentry what the sysmsg thread is doing at a given // moment. State ThreadState // Context is a pointer to the ThreadContext struct that the current sysmsg // thread is processing. Context uint64 // FaultJump is the size of a faulted instruction. FaultJump int32 // Err is the error value with which the {sig|sys}handler crashes the stub // thread (see sysmsg.h:__panic). Err int32 // ErrAdditional is an error value that gives additional information // about the panic. ErrAdditional int32 // Line is the code line on which the {sig|sys}handler crashed the stub thread // (see sysmsg.h:panic). Line int32 // Debug is a variable to use to get visibility into the stub from the sentry. Debug uint64 // ThreadID is the ID of the sysmsg thread. ThreadID uint32 } // ContextState defines the reason the context has exited back to the sentry, // or ContextStateNone if running/ready-to-run. type ContextState uint32 // Set atomicaly sets the state value. func (s *ContextState) Set(state ContextState) { atomic.StoreUint32((*uint32)(s), uint32(state)) } // Get returns the current state value. 
// //go:nosplit func (s *ContextState) Get() ContextState { return ContextState(atomic.LoadUint32((*uint32)(s))) } // Context State types. const ( // ContextStateNone means that is either running in the user task or is ready // to run in the user task. ContextStateNone ContextState = iota // ContextStateSyscall means that a syscall event is triggered from the // sighandler. ContextStateSyscall // ContextStateFault means that there is a fault event that needs to be // handled. ContextStateFault // ContextStateSyscallTrap means that a syscall event is triggered from // a function call (syshandler). ContextStateSyscallTrap // ContextStateSyscallCanBePatched means that the syscall can be replaced // with a function call. ContextStateSyscallCanBePatched // ContextStateInvalid is an invalid state that the sentry should never see. ContextStateInvalid ) const ( // MaxFPStateLen is the largest possible FPState that we will save. // Note: This value was chosen to be able to fit ThreadContext into one page. MaxFPStateLen uint32 = 3584 // AllocatedSizeofThreadContextStruct defines how much memory to allocate for // one instance of ThreadContext. // We over allocate the memory for it because: // - The next instances needs to align to 64 bytes for purposes of xsave. // - It's nice to align it to the page boundary. AllocatedSizeofThreadContextStruct uintptr = 4096 ) // ThreadContext contains the current context of the sysmsg thread. The struct // facilitates switching contexts by allowing the sentry to switch pointers to // this struct as it needs to. type ThreadContext struct { // FPState is a region of memory where: // - syshandler saves FPU state to using xsave/fxsave // - sighandler copies FPU state to from ucontext->uc_mcontext.fpregs // Note that xsave requires this region of memory to be 64 byte aligned; // therefore allocations of ThreadContext must be too. FPState [MaxFPStateLen]byte // FPStateChanged is set to true when the stub thread needs to restore FPState // because the sentry changed it. FPStateChanged uint64 // Regs is the context's GP register set. The {sig|sys}handler will save and // restore the user app's registers here. Regs linux.PtraceRegs // SignalInfo is the siginfo struct. SignalInfo linux.SignalInfo // Signo is the signal that the stub is requesting the sentry to handle. Signo int64 // State indicates the reason why the context has exited back to the sentry. State ContextState // Interrupt is set to indicate that this context has been interrupted. Interrupt uint32 // ThreadID is the ID of the sysmsg thread that's currently working on the // context. ThreadID uint32 // LastThreadID is the ID of the previous sysmsg thread that ran the context // (not the one currently working on it). This field is used by sysmsg threads // to detect whether fpstate may have changed since the last time they ran a // context. LastThreadID uint32 // SentryFastPath is used to indicate to the stub thread that the sentry // goroutine used for this thread context is busy-polling for a response // instead of using FUTEX_WAIT. SentryFastPath uint32 // AckedTime is used by sysmsg threads to signal to the sentry that this context // has been picked up from the context queue and is actively being worked on. // The stub thread puts down the timestamp at which it has started processing // this context. 
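	// The value is presumably the stub's raw tick counter (rdtsc on x86,
	// cntvct_el0 on arm64), so it is only meaningful relative to other
	// stub-recorded timestamps.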
AckedTime uint64 // StateChangedTime is the time when the ThreadContext.State changed, as // recorded by the stub thread when it gave it back to the sentry // (the sentry does not populate this field except to reset it). StateChangedTime uint64 // TLS is a pointer to a thread local storage. // It is is only populated on ARM64. TLS uint64 // Debug is a variable to use to get visibility into the stub from the sentry. Debug uint64 } // StubError are values that represent known stub-thread failure modes. // Since these errors originate from the stub threads, look at // sysmsg.h:stub_error. type StubError int32 const ( // StubErrorBadSysmsg indicates sysmsg->self did not match sysmsg. StubErrorBadSysmsg StubError = 0x0bad0000 + iota // StubErrorBadThreadState indicates sysmsg->state was invalid. StubErrorBadThreadState // StubErrorBadSpinningQueueDecref indicates stubs removed more threads // from spinning queue than were put in. StubErrorBadSpinningQueueDecref // StubErrorArchPrctl indicates an error when calling arch_prctl. StubErrorArchPrctl // StubErrorFutex indicates an error when calling futex. StubErrorFutex // StubErrorBadContextID indicates a context received from the context // queue was of unexpected value. StubErrorBadContextID // StubErrorFpStateBadHeader indicates that the floating point state // header did not match the expected value. StubErrorFpStateBadHeader ) // LINT.ThenChange(sysmsg.h) // Init initializes the message. func (m *Msg) Init(threadID uint32) { m.Err = 0 m.ErrAdditional = 0 m.Line = -1 m.ThreadID = threadID m.Context = 0 } // Init initializes the ThreadContext instance. func (c *ThreadContext) Init(initialThreadID uint32) { c.FPStateChanged = 1 c.Regs = linux.PtraceRegs{} c.Signo = 0 c.SignalInfo = linux.SignalInfo{} c.State = ContextStateNone c.ThreadID = initialThreadID } // ConvertSysmsgErr converts m.Err to platform.ContextError. 
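// It is intended for the case where a stub thread has crashed itself via
// sysmsg.h:__panic, i.e. when Err, ErrAdditional and Line describe the
// failure that should be surfaced to the sentry.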
func (m *Msg) ConvertSysmsgErr() *platform.ContextError { err := &platform.ContextError{ Errno: unix.EPERM, } const prefix = "systrap stub thread failure:" suffix := fmt.Sprintf("(failed on line %d; %s)", atomic.LoadInt32(&m.Line), m.String()) switch StubError(atomic.LoadInt32(&m.Err)) { case StubErrorBadSysmsg: err.Err = fmt.Errorf("%s sysmsg->self did not match sysmsg during sig/sys-handler %s", prefix, suffix) case StubErrorBadThreadState: err.Err = fmt.Errorf("%s sysmsg->state was invalid during sys-handler %s", prefix, suffix) case StubErrorBadSpinningQueueDecref: err.Err = fmt.Errorf("%s imbalanced use of spinning queue %s", prefix, suffix) case StubErrorArchPrctl: err.Err = fmt.Errorf("%s arch_prctl error=0x%x %s", prefix, atomic.LoadInt32(&m.ErrAdditional), suffix) case StubErrorFutex: err.Err = fmt.Errorf("%s futex error=0x%x %s", prefix, atomic.LoadInt32(&m.ErrAdditional), suffix) case StubErrorBadContextID: err.Err = fmt.Errorf("%s unexpected context ID (%d) from context queue %s", prefix, atomic.LoadInt32(&m.ErrAdditional), suffix) case StubErrorFpStateBadHeader: err.Err = fmt.Errorf("%s FP state context magic header (%d) does not match expected FPSIMD_MAGIC %s", prefix, atomic.LoadInt32(&m.ErrAdditional), suffix) default: err.Err = fmt.Errorf("%s unknown reason (0x%x) (possible shared memory corruption) %s", prefix, atomic.LoadInt32(&m.Err), suffix) } return err } func (m *Msg) String() string { var b strings.Builder fmt.Fprintf(&b, "sysmsg.Msg{msg: %x state %d", m.Self, m.State) fmt.Fprintf(&b, " err %x line %d debug %x", m.Err, m.Line, m.Debug) fmt.Fprintf(&b, " app stack %x", m.AppStack) fmt.Fprintf(&b, " context %x", m.Context) fmt.Fprintf(&b, " ThreadID %d", m.ThreadID) b.WriteString("}") return b.String() } func (c *ThreadContext) String() string { var b strings.Builder fmt.Fprintf(&b, "sysmsg.ThreadContext{state %d", c.State.Get()) fmt.Fprintf(&b, " fault addr %x syscall %d", c.SignalInfo.Addr(), c.SignalInfo.Syscall()) fmt.Fprintf(&b, " ip %x sp %x", c.Regs.InstructionPointer(), c.Regs.StackPointer()) fmt.Fprintf(&b, " FPStateChanged %d Regs %+v", c.FPStateChanged, c.Regs) fmt.Fprintf(&b, " Interrupt %d", c.Interrupt) fmt.Fprintf(&b, " ThreadID %d LastThreadID %d", c.ThreadID, c.LastThreadID) fmt.Fprintf(&b, " SentryFastPath %d Acked %d", c.SentryFastPath, c.AckedTime) fmt.Fprintf(&b, " signo: %d, siginfo: %+v", c.Signo, c.SignalInfo) fmt.Fprintf(&b, " debug %d", atomic.LoadUint64(&c.Debug)) b.WriteString("}") return b.String() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sysmsg.h000066400000000000000000000127511465435605700273320ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
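// This header is the C view of the Go definitions in sysmsg.go; the
// LINT.IfChange/LINT.ThenChange markers below tie the two copies together so
// that layout changes are always made on both sides.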
#ifndef THIRD_PARTY_GVISOR_PKG_SENTRY_PLATFORM_SYSTRAP_SYSMSG_SYSMSG_H_ #define THIRD_PARTY_GVISOR_PKG_SENTRY_PLATFORM_SYSTRAP_SYSMSG_SYSMSG_H_ #include #include #include #include "sysmsg_offsets.h" // NOLINT #if defined(__x86_64__) // LINT.IfChange struct arch_state { uint32_t xsave_mode; uint32_t fp_len; uint32_t fsgsbase; }; // LINT.ThenChange(sysmsg_amd64.go) #else // LINT.IfChange struct arch_state { uint32_t fp_len; }; // LINT.ThenChange(sysmsg_arm64.go) #endif // LINT.IfChange enum thread_state { THREAD_STATE_NONE, THREAD_STATE_DONE, THREAD_STATE_PREP, THREAD_STATE_ASLEEP, THREAD_STATE_INITIALIZING, }; struct thread_context; // sysmsg contains the current state of the sysmsg thread. See: sysmsg.go:Msg struct sysmsg { struct sysmsg *self; uint64_t ret_addr; uint64_t syshandler; uint64_t syshandler_stack; uint64_t app_stack; uint32_t interrupt; uint32_t state; struct thread_context *context; // The fields above have offsets defined in sysmsg_offsets*.h int32_t fault_jump; int32_t err; int32_t err_additional; int32_t err_line; uint64_t debug; uint32_t thread_id; }; enum context_state { CONTEXT_STATE_NONE, CONTEXT_STATE_SYSCALL, CONTEXT_STATE_FAULT, CONTEXT_STATE_SYSCALL_TRAP, CONTEXT_STATE_SYSCALL_NEED_TRAP, CONTEXT_STATE_INVALID, }; // thread_context contains the current context of the sysmsg thread. // See sysmsg.go:SysThreadContext struct thread_context { uint8_t fpstate[MAX_FPSTATE_LEN]; uint64_t fpstate_changed; struct user_regs_struct ptregs; // The fields above have offsets defined in sysmsg_offsets*.h siginfo_t siginfo; int64_t signo; uint32_t state; uint32_t interrupt; uint32_t thread_id; uint32_t last_thread_id; uint32_t sentry_fast_path; uint64_t acked_time; uint64_t state_changed_time; uint64_t tls; uint64_t debug; }; enum stub_error { STUB_ERROR_BAD_SYSMSG = 0x0bad0000, STUB_ERROR_BAD_THREAD_STATE, STUB_ERROR_SPINNING_QUEUE_DECREF, STUB_ERROR_ARCH_PRCTL, STUB_ERROR_FUTEX, STUB_ERROR_BAD_CONTEXT_ID, STUB_ERROR_FPSTATE_BAD_HEADER, }; #ifndef PAGE_SIZE #define PAGE_SIZE 4096 #endif #define PER_THREAD_MEM_SIZE (8 * PAGE_SIZE) #define GUARD_SIZE (PAGE_SIZE) #define MSG_OFFSET_FROM_START (PER_THREAD_MEM_SIZE - PAGE_SIZE) #define SPINNING_QUEUE_MEM_SIZE PAGE_SIZE // LINT.ThenChange(sysmsg.go) #define FAULT_OPCODE 0x06 // "push %es" on x32 and invalid opcode on x64. #define __stringify_1(x...) #x #define __stringify(x...) 
__stringify_1(x) #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)])) extern uint64_t __export_pr_sched_core; extern uint64_t __export_deep_sleep_timeout; extern struct arch_state __export_arch_state; struct context_queue; extern struct context_queue *__export_context_queue_addr; // NOLINTBEGIN(runtime/int) static void *sysmsg_sp() { volatile int p; void *sp = (struct sysmsg *)(((long)&p) / PER_THREAD_MEM_SIZE * PER_THREAD_MEM_SIZE); _Static_assert( sizeof(struct sysmsg) < (PER_THREAD_MEM_SIZE - MSG_OFFSET_FROM_START), "The sysmsg structure is too big."); return sp; } static struct sysmsg *sysmsg_addr(void *sp) { return (struct sysmsg *)(sp + MSG_OFFSET_FROM_START); } long __syscall(long n, long a1, long a2, long a3, long a4, long a5, long a6); struct __kernel_timespec; long sys_futex(uint32_t *addr, int op, int val, struct __kernel_timespec *tv, uint32_t *addr2, int val3); static void __panic(int err, int err_additional, long line) { void *sp = sysmsg_sp(); struct sysmsg *sysmsg = sysmsg_addr(sp); struct thread_context *ctx = sysmsg->context; sysmsg->err = err; sysmsg->err_additional = err_additional; sysmsg->err_line = line; // Wake up the goroutine waiting on the current context. __atomic_store_n(&ctx->state, CONTEXT_STATE_FAULT, __ATOMIC_RELEASE); sys_futex(&ctx->state, FUTEX_WAKE, 1, NULL, NULL, 666); // crash the stub process. // // Normal user processes cannot map addresses lower than vm.mmap_min_addr // which is usually > 4K. So writing to an address <4K should crash the // process with a segfault. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" *(int *)(line % 4096) = err; #pragma GCC diagnostic pop } void memcpy(uint8_t *dest, uint8_t *src, size_t n); void __export_start(struct sysmsg *sysmsg, void *_ucontext); void restore_state(struct sysmsg *sysmsg, struct thread_context *ctx, void *_ucontext); struct thread_context *switch_context(struct sysmsg *sysmsg, struct thread_context *ctx, enum context_state new_context_state); int wait_state(struct sysmsg *sysmsg, enum thread_state new_thread_state); void init_new_thread(void); #define panic(err, err_additional) __panic(err, err_additional, __LINE__) // NOLINTEND(runtime/int) #endif // THIRD_PARTY_GVISOR_PKG_SENTRY_PLATFORM_SYSTRAP_SYSMSG_SYSMSG_H_ golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sysmsg_amd64.go000066400000000000000000000040121465435605700304720ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package sysmsg import ( _ "embed" "fmt" "strings" "gvisor.dev/gvisor/pkg/cpuid" ) // SighandlerBlob contains the compiled code of the sysmsg signal handler. // //go:embed sighandler.built-in.amd64.bin var SighandlerBlob []byte // ArchState defines variables specific to the architecture being // used. 
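// Its fields mirror sysmsg.h:struct arch_state (xsave_mode, fp_len,
// fsgsbase), which the stub reads through __export_arch_state.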
type ArchState struct { xsaveMode uint32 fpLen uint32 fsgsbase uint32 } // The linux kernel does not allow using xsavec from userspace, so we are limited // to xsaveopt. // See arch/x86/kernel/fpu/xstate.c:validate_user_xstate_header for details. const ( fxsave = iota xsave xsaveopt ) // Init initializes the arch specific state. func (s *ArchState) Init() { fs := cpuid.HostFeatureSet() fpLenUint, _ := fs.ExtendedStateSize() // TODO(gvisor.dev/issues/9896): Implement AMX Support. s.fpLen = uint32(fpLenUint - fs.AMXExtendedStateSize()) if fs.UseXsaveopt() { s.xsaveMode = xsaveopt } else if fs.UseXsave() { s.xsaveMode = xsave } else { s.xsaveMode = fxsave } if fs.UseFSGSBASE() { s.fsgsbase = 1 } } // FpLen returns the FP state length for AMD64. func (s *ArchState) FpLen() int { return int(s.fpLen) } func (s *ArchState) String() string { var b strings.Builder fmt.Fprintf(&b, "sysmsg.ArchState{") fmt.Fprintf(&b, " xsaveMode %d", s.xsaveMode) fmt.Fprintf(&b, " fsgsbase %d", s.fsgsbase) fmt.Fprintf(&b, " fpLen %d", s.fpLen) b.WriteString(" }") return b.String() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sysmsg_amd64_state_autogen.go000066400000000000000000000001321465435605700334130ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 // +build amd64 package sysmsg golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sysmsg_arm64.go000066400000000000000000000026431465435605700305200ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package sysmsg import ( _ "embed" "fmt" "strings" "gvisor.dev/gvisor/pkg/cpuid" ) // SighandlerBlob contains the compiled code of the sysmsg signal handler. // //go:embed sighandler.built-in.arm64.bin var SighandlerBlob []byte // ArchState defines variables specific to the architecture being // used. type ArchState struct { fpLen uint32 } // Init initializes the arch specific state. func (s *ArchState) Init() { fs := cpuid.HostFeatureSet() fpLenUint, _ := fs.ExtendedStateSize() s.fpLen = uint32(fpLenUint) } // FpLen returns the FP state length for ARM. func (s *ArchState) FpLen() int { return int(s.fpLen) } func (s *ArchState) String() string { var b strings.Builder fmt.Fprintf(&b, "sysmsg.ArchState{") fmt.Fprintf(&b, " fpLen %d", s.fpLen) b.WriteString(" }") return b.String() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sysmsg_arm64_state_autogen.go000066400000000000000000000001321465435605700334310ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package sysmsg golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sysmsg_lib.c000066400000000000000000000333271465435605700301550ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include "atomic.h" #include "sysmsg.h" // __export_deep_sleep_timeout is the timeout after which the stub thread stops // polling and fall asleep. uint64_t __export_deep_sleep_timeout; // LINT.IfChange #define MAX_GUEST_CONTEXTS (4095) #define MAX_CONTEXT_QUEUE_ENTRIES (MAX_GUEST_CONTEXTS + 1) #define INVALID_CONTEXT_ID 0xfefefefe #define INVALID_THREAD_ID 0xfefefefe // Each element of a context_queue ring buffer is a sum of its index shifted by // CQ_INDEX_SHIFT and context_id. #define CQ_INDEX_SHIFT 32 #define CQ_CONTEXT_MASK ((1UL << CQ_INDEX_SHIFT) - 1) // See systrap/context_queue.go struct context_queue { uint32_t start; uint32_t end; uint32_t num_active_threads; uint32_t num_spinning_threads; uint32_t num_threads_to_wakeup; uint32_t num_active_contexts; uint32_t num_awake_contexts; uint32_t fast_path_disabled; uint32_t used_fast_path; uint64_t ringbuffer[MAX_CONTEXT_QUEUE_ENTRIES]; }; struct context_queue *__export_context_queue_addr; // LINT.ThenChange(../context_queue.go) uint32_t is_empty(struct context_queue *queue) { return atomic_load(&queue->start) == atomic_load(&queue->end); } int32_t queued_contexts(struct context_queue *queue) { return (atomic_load(&queue->end) + MAX_CONTEXT_QUEUE_ENTRIES - atomic_load(&queue->start)) % MAX_CONTEXT_QUEUE_ENTRIES; } #if defined(__x86_64__) static __inline__ unsigned long rdtsc(void) { unsigned h, l; __asm__ __volatile__("rdtsc" : "=a"(l), "=d"(h)); return ((unsigned long)l) | (((unsigned long)h) << 32); } static __inline__ void spinloop(void) { asm("pause"); } #elif defined(__aarch64__) static __inline__ unsigned long rdtsc(void) { long val; asm volatile("mrs %0, cntvct_el0" : "=r"(val)); return val; } static __inline__ void spinloop(void) { asm volatile("yield" : : : "memory"); } #endif void *__export_context_region; static struct thread_context *thread_context_addr(uint32_t tcid) { return (struct thread_context *)(__export_context_region + tcid * ALLOCATED_SIZEOF_THREAD_CONTEXT_STRUCT); } void memcpy(uint8_t *dest, uint8_t *src, size_t n) { for (size_t i = 0; i < n; i += 1) { dest[i] = src[i]; } } // The spinning queue is a queue of spinning threads. It solves the // fragmentation problem. The idea is to minimize the number of threads // processing requests. We can't control how system threads are scheduled, so // can't distribute requests efficiently. The spinning queue emulates virtual // threads sorted by their spinning time. // // This queue is lock-less to be sure that any thread scheduled out // from CPU doesn't block others. // // The size of the queue must be a divisor of 2^32, because queue indexes are // calculated as modules of uint32 values. #define SPINNING_QUEUE_SIZE 256 // MAX_RE_ENQUEUE defines the amount of time a given entry in the spinning queue // needs to reach timeout in order to be removed. Re-enqueuing a timeout is done // in order to mitigate rdtsc inaccuracies. 
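// In effect, an entry must hit the timeout MAX_RE_ENQUEUE times before // spinning_queue_remove_first() removes it for good; earlier expirations only // re-enqueue it with a fresh start time.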
#define MAX_RE_ENQUEUE 2 struct spinning_queue { uint32_t len; uint32_t start; uint32_t end; uint64_t start_times[SPINNING_QUEUE_SIZE]; uint8_t num_times_re_enqueued[SPINNING_QUEUE_SIZE]; }; struct spinning_queue *__export_spinning_queue_addr; // spinning_queue_push adds a new thread to the queue. It returns false if the // queue is full, or if re_enqueue_times has reached MAX_RE_ENQUEUE. static bool spinning_queue_push(uint8_t re_enqueue_times) __attribute__((warn_unused_result)); static bool spinning_queue_push(uint8_t re_enqueue_times) { struct spinning_queue *queue = __export_spinning_queue_addr; uint32_t idx, end, len; BUILD_BUG_ON(sizeof(struct spinning_queue) > SPINNING_QUEUE_MEM_SIZE); if (re_enqueue_times >= MAX_RE_ENQUEUE) { return false; } len = atomic_add(&queue->len, 1); if (len > SPINNING_QUEUE_SIZE) { atomic_sub(&queue->len, 1); return false; } end = atomic_add(&queue->end, 1); idx = end - 1; atomic_store(&queue->num_times_re_enqueued[idx % SPINNING_QUEUE_SIZE], re_enqueue_times); atomic_store(&queue->start_times[idx % SPINNING_QUEUE_SIZE], rdtsc()); return true; } // spinning_queue_pop() removes one thread from a queue that has been spinning // the shortest time. // However it doesn't take into account the spinning re-enqueue. static void spinning_queue_pop() { struct spinning_queue *queue = __export_spinning_queue_addr; atomic_sub(&queue->end, 1); atomic_sub(&queue->len, 1); } // spinning_queue_remove_first removes one thread from a queue that has been // spinning longer than others and longer than a specified timeout. // // If `timeout` is zero, it always removes one element and never returns false. // // Returns true if one thread has been removed from the queue. static bool spinning_queue_remove_first(uint64_t timeout) __attribute__((warn_unused_result)); static bool spinning_queue_remove_first(uint64_t timeout) { struct spinning_queue *queue = __export_spinning_queue_addr; uint64_t ts; uint8_t re_enqueue = 0; while (1) { uint32_t idx, qidx; idx = atomic_load(&queue->start); qidx = idx % SPINNING_QUEUE_SIZE; ts = atomic_load(&queue->start_times[qidx]); if (ts == 0) continue; if (rdtsc() - ts < timeout) return false; if (idx != atomic_load(&queue->start)) continue; // Lose the race. re_enqueue = atomic_load(&queue->num_times_re_enqueued[qidx]); if (atomic_compare_exchange(&queue->start_times[qidx], &ts, 0)) { atomic_add(&queue->start, 1); break; } } atomic_sub(&queue->len, 1); if (timeout == 0) return true; return !spinning_queue_push(re_enqueue + 1); } struct thread_context *queue_get_context(struct sysmsg *sysmsg) { struct context_queue *queue = __export_context_queue_addr; // Indexes should not jump when start or end are overflowed. BUILD_BUG_ON(UINT32_MAX % MAX_CONTEXT_QUEUE_ENTRIES != MAX_CONTEXT_QUEUE_ENTRIES - 1); while (!is_empty(queue)) { uint64_t idx = atomic_load(&queue->start); uint32_t next = idx % MAX_CONTEXT_QUEUE_ENTRIES; uint64_t v = atomic_load(&queue->ringbuffer[next]); // We need to check the index to be sure that a ring buffer hasn't been // recycled. 
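// An entry enqueued for context C at ring index I is stored as // (I << CQ_INDEX_SHIFT) + C, so a mismatch in the upper half means the slot // has already been reused by a later enqueue.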
if ((v >> CQ_INDEX_SHIFT) != idx) continue; if (!atomic_compare_exchange(&queue->ringbuffer[next], &v, INVALID_CONTEXT_ID)) { continue; } uint32_t context_id = v & CQ_CONTEXT_MASK; if (context_id == INVALID_CONTEXT_ID) continue; atomic_add(&queue->start, 1); if (context_id > MAX_GUEST_CONTEXTS) { panic(STUB_ERROR_BAD_CONTEXT_ID, context_id); } struct thread_context *ctx = thread_context_addr(context_id); sysmsg->context = ctx; atomic_store(&ctx->acked_time, rdtsc()); atomic_store(&ctx->thread_id, sysmsg->thread_id); return ctx; } return NULL; } // get_context_fast sets nr_active_threads_p only if it deactivates the thread. static struct thread_context *get_context_fast(struct sysmsg *sysmsg, struct context_queue *queue, uint32_t *nr_active_threads_p) { uint32_t nr_active_threads, nr_awake_contexts; if (!spinning_queue_push(0)) return NULL; atomic_store(&queue->used_fast_path, 1); while (1) { struct thread_context *ctx; ctx = queue_get_context(sysmsg); if (ctx) { spinning_queue_pop(); return ctx; } if (atomic_load(&queue->fast_path_disabled) != 0) { if (!spinning_queue_remove_first(0)) panic(STUB_ERROR_SPINNING_QUEUE_DECREF, 0); break; } nr_active_threads = atomic_load(&queue->num_active_threads); nr_awake_contexts = atomic_load(&queue->num_awake_contexts); if (nr_awake_contexts < nr_active_threads) { if (atomic_compare_exchange(&queue->num_active_threads, &nr_active_threads, nr_active_threads - 1)) { nr_active_threads -= 1; if (!spinning_queue_remove_first(0)) panic(STUB_ERROR_SPINNING_QUEUE_DECREF, 0); *nr_active_threads_p = nr_active_threads; break; } } if (spinning_queue_remove_first(__export_deep_sleep_timeout)) { break; } spinloop(); } return NULL; } #define NR_IF_THREAD_IS_ACTIVE (~0) static bool try_to_dec_threads_to_wakeup(struct context_queue *queue) { while (1) { uint32_t nr = atomic_load(&queue->num_threads_to_wakeup); if (nr == 0) { return false; } if (atomic_compare_exchange(&queue->num_threads_to_wakeup, &nr, nr - 1)) { return true; }; } } void init_new_thread() { struct context_queue *queue = __export_context_queue_addr; atomic_add(&queue->num_active_threads, 1); try_to_dec_threads_to_wakeup(queue); } // get_context retrieves a context that is ready to be restored to the user. // This populates sysmsg->thread_context_id. struct thread_context *get_context(struct sysmsg *sysmsg) { struct context_queue *queue = __export_context_queue_addr; uint32_t nr_active_threads; struct thread_context *ctx; for (;;) { atomic_add(&queue->num_spinning_threads, 1); // Change sysmsg thread state just to indicate thread is not asleep. atomic_store(&sysmsg->state, THREAD_STATE_PREP); ctx = queue_get_context(sysmsg); if (ctx) { goto exit; } bool fast_path_enabled = atomic_load(&queue->fast_path_disabled) == 0; nr_active_threads = NR_IF_THREAD_IS_ACTIVE; if (fast_path_enabled) { ctx = get_context_fast(sysmsg, queue, &nr_active_threads); if (ctx) goto exit; } if (nr_active_threads == NR_IF_THREAD_IS_ACTIVE) { nr_active_threads = atomic_sub(&queue->num_active_threads, 1); } atomic_sub(&queue->num_spinning_threads, 1); atomic_store(&sysmsg->state, THREAD_STATE_ASLEEP); uint32_t nr_active_contexts = atomic_load(&queue->num_active_contexts); // We have to make another attempt to get a context here to prevent TOCTTOU // races with waitOnState and kickSysmsgThread. There are two assumptions: // * If the queue isn't empty, one or more threads have to be active. // * A new thread isn't kicked, if the number of active threads are not less // than a number of active contexts. 
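// If the recheck below still finds nothing, it is safe to block on the // num_threads_to_wakeup futex in the loop that follows.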
if (nr_active_threads < nr_active_contexts) { ctx = queue_get_context(sysmsg); if (ctx) { atomic_store(&sysmsg->state, THREAD_STATE_PREP); atomic_add(&queue->num_active_threads, 1); return ctx; } } while (1) { if (!try_to_dec_threads_to_wakeup(queue)) { sys_futex(&queue->num_threads_to_wakeup, FUTEX_WAIT, 0, NULL, NULL, 0); continue; } // Mark this thread as being active only if it can get a context. ctx = queue_get_context(sysmsg); if (ctx) { atomic_store(&sysmsg->state, THREAD_STATE_PREP); atomic_add(&queue->num_active_threads, 1); return ctx; } } } exit: atomic_sub(&queue->num_spinning_threads, 1); return ctx; } // switch_context signals the sentry that the old context is ready to be worked // on and retrieves a new context to switch to. struct thread_context *switch_context(struct sysmsg *sysmsg, struct thread_context *ctx, enum context_state new_context_state) { struct context_queue *queue = __export_context_queue_addr; if (ctx) { atomic_sub(&queue->num_active_contexts, 1); atomic_store(&ctx->thread_id, INVALID_THREAD_ID); atomic_store(&ctx->last_thread_id, sysmsg->thread_id); atomic_store(&ctx->state_changed_time, rdtsc()); atomic_store(&ctx->state, new_context_state); if (atomic_load(&ctx->sentry_fast_path) == 0) { int ret = sys_futex(&ctx->state, FUTEX_WAKE, 1, NULL, NULL, 0); if (ret < 0) { panic(STUB_ERROR_FUTEX, ret); } } } return get_context(sysmsg); } void verify_offsets() { BUILD_BUG_ON(offsetof_sysmsg_self != offsetof(struct sysmsg, self)); BUILD_BUG_ON(offsetof_sysmsg_ret_addr != offsetof(struct sysmsg, ret_addr)); BUILD_BUG_ON(offsetof_sysmsg_syshandler != offsetof(struct sysmsg, syshandler)); BUILD_BUG_ON(offsetof_sysmsg_syshandler_stack != offsetof(struct sysmsg, syshandler_stack)); BUILD_BUG_ON(offsetof_sysmsg_app_stack != offsetof(struct sysmsg, app_stack)); BUILD_BUG_ON(offsetof_sysmsg_interrupt != offsetof(struct sysmsg, interrupt)); BUILD_BUG_ON(offsetof_sysmsg_state != offsetof(struct sysmsg, state)); BUILD_BUG_ON(offsetof_sysmsg_context != offsetof(struct sysmsg, context)); BUILD_BUG_ON(offsetof_thread_context_fpstate != offsetof(struct thread_context, fpstate)); BUILD_BUG_ON(offsetof_thread_context_fpstate_changed != offsetof(struct thread_context, fpstate_changed)); BUILD_BUG_ON(offsetof_thread_context_ptregs != offsetof(struct thread_context, ptregs)); BUILD_BUG_ON(kTHREAD_STATE_NONE != THREAD_STATE_NONE); BUILD_BUG_ON(sizeof(struct thread_context) > ALLOCATED_SIZEOF_THREAD_CONTEXT_STRUCT); } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sysmsg_offsets.h000066400000000000000000000047001465435605700310560ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef THIRD_PARTY_GVISOR_PKG_SENTRY_PLATFORM_SYSTRAP_SYSMSG_SYSMSG_OFFSETS_H_ #define THIRD_PARTY_GVISOR_PKG_SENTRY_PLATFORM_SYSTRAP_SYSMSG_SYSMSG_OFFSETS_H_ // FAULT_OPCODE is the opcode of the invalid instruction that is used to replace // the first byte of the syscall instruction. 
More details in the description // for the pkg/sentry/platform/systrap/usertrap package. #define FAULT_OPCODE 0x06 // The value for XCR0 is defined to xsave/xrstor everything except for PKRU and // AMX regions. // TODO(gvisor.dev/issues/9896): Implement AMX support. // TODO(gvisor.dev/issues/10087): Implement PKRU support. #define XCR0_DISABLED_MASK ((1 << 9) | (1 << 17) | (1 << 18)) #define XCR0_EAX (0xffffffff ^ XCR0_DISABLED_MASK) #define XCR0_EDX 0xffffffff // LINT.IfChange #define MAX_FPSTATE_LEN 3584 // Note: To be explicit, 2^12 = 4096; if ALLOCATED_SIZEOF_THREAD_CONTEXT_STRUCT // is changed, make sure to change the code that relies on the bitshift. #define ALLOCATED_SIZEOF_THREAD_CONTEXT_STRUCT 4096 #define THREAD_CONTEXT_STRUCT_BITSHIFT 12 // LINT.ThenChange(sysmsg.go) // LINT.IfChange // Define offsets in the struct sysmsg to use them in assembly files. // Each offset has to have BUILD_BUG_ON in sighandler.c. #define offsetof_sysmsg_self 0x0 #define offsetof_sysmsg_ret_addr 0x8 #define offsetof_sysmsg_syshandler 0x10 #define offsetof_sysmsg_syshandler_stack 0x18 #define offsetof_sysmsg_app_stack 0x20 #define offsetof_sysmsg_interrupt 0x28 #define offsetof_sysmsg_state 0x2c #define offsetof_sysmsg_context 0x30 #define offsetof_thread_context_fpstate 0x0 #define offsetof_thread_context_fpstate_changed MAX_FPSTATE_LEN #define offsetof_thread_context_ptregs 0x8 + MAX_FPSTATE_LEN #define kTHREAD_STATE_NONE 0 #define kTHREAD_STATE_INTERRUPT 3 // LINT.ThenChange(sysmsg.h, sysmsg_lib.c) #endif // THIRD_PARTY_GVISOR_PKG_SENTRY_PLATFORM_SYSTRAP_SYSMSG_SYSMSG_OFFSETS_H_ golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sysmsg_offsets_amd64.h000066400000000000000000000067661465435605700320670ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
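// Like sysmsg_offsets.h, this header hard-codes struct offsets (the amd64 // arch_state fields and each ptregs register inside struct thread_context) so // that assembly code can use them; the LINT markers name the files that must // stay in sync.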
#ifndef THIRD_PARTY_GVISOR_PKG_SENTRY_PLATFORM_SYSTRAP_SYSMSG_SYSMSG_OFFSETS_AMD64_H_ #define THIRD_PARTY_GVISOR_PKG_SENTRY_PLATFORM_SYSTRAP_SYSMSG_SYSMSG_OFFSETS_AMD64_H_ // LINT.IfChange #define offsetof_arch_state_xsave_mode (0x0) #define offsetof_arch_state_fpLen (0x4) #define offsetof_arch_state_fsgsbase (0x8) #define XSAVE_MODE_FXSAVE (0x0) #define XSAVE_MODE_XSAVE (0x1) #define XSAVE_MODE_XSAVEOPT (0x2) // LINT.ThenChange(sysmsg.h, sysmsg_amd64.go) // LINT.IfChange #define offsetof_thread_context_ptregs_r15 \ (0x0 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_r14 \ (0x8 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_r13 \ (0x10 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_r12 \ (0x18 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_rbp \ (0x20 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_rbx \ (0x28 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_r11 \ (0x30 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_r10 \ (0x38 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_r9 \ (0x40 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_r8 \ (0x48 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_rax \ (0x50 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_rcx \ (0x58 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_rdx \ (0x60 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_rsi \ (0x68 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_rdi \ (0x70 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_orig_rax \ (0x78 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_rip \ (0x80 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_cs \ (0x88 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_eflags \ (0x90 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_rsp \ (0x98 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_ss \ (0xa0 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_fs_base \ (0xa8 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_gs_base \ (0xb0 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_ds \ (0xb8 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_es \ (0xc0 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_fs \ (0xc8 + offsetof_thread_context_ptregs) #define offsetof_thread_context_ptregs_gs \ (0xd0 + offsetof_thread_context_ptregs) // LINT.ThenChange(sysmsg.h, sighandler_amd64.c) #endif // THIRD_PARTY_GVISOR_PKG_SENTRY_PLATFORM_SYSTRAP_SYSMSG_SYSMSG_OFFSETS_AMD64_H_ golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sysmsg_state_autogen.go000066400000000000000000000000701465435605700324210ustar00rootroot00000000000000// automatically generated by stateify. package sysmsg golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg/sysmsg_unsafe.go000066400000000000000000000026501465435605700310460ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sysmsg import ( "syscall" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) // SleepOnState makes the caller sleep on the ThreadContext.State futex. func (c *ThreadContext) SleepOnState(curState ContextState, timeout *unix.Timespec) syscall.Errno { _, _, errno := unix.Syscall6(unix.SYS_FUTEX, uintptr(unsafe.Pointer(&c.State)), linux.FUTEX_WAIT, uintptr(curState), uintptr(unsafe.Pointer(timeout)), 0, 0) if errno == unix.EAGAIN || errno == unix.EINTR { errno = 0 } return errno } // WakeSysmsgThread calls futex wake on Sysmsg.State. func (m *Msg) WakeSysmsgThread() (bool, syscall.Errno) { if !m.State.CompareAndSwap(ThreadStateAsleep, ThreadStatePrep) { return false, 0 } _, _, e := unix.RawSyscall6(unix.SYS_FUTEX, uintptr(unsafe.Pointer(&m.State)), linux.FUTEX_WAKE, 1, 0, 0, 0) return true, e } sysmsg_unsafe_state_autogen.go000066400000000000000000000000701465435605700337030ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg// automatically generated by stateify. package sysmsg golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg_thread.go000066400000000000000000000127221465435605700275100ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" ) // sysmsgThread describes a sysmsg stub thread which isn't traced // and communicates with the Sentry via the sysmsg protocol. // // This type of thread is used to execute user processes. type sysmsgThread struct { // subproc is a link to the subprocess which is used to call native // system calls. subproc *subprocess // thread is a thread identifier. thread *thread // msg is a pointer to a shared sysmsg structure in the Sentry address // space which is used to communicate with the thread. msg *sysmsg.Msg // context is the last context that ran on this thread. context *platformContext // stackRange is a sysmsg stack in the memory file. stackRange memmap.FileRange // fpuStateToMsgOffset is the offset of a thread fpu state relative to sysmsg. fpuStateToMsgOffset uint64 } // sysmsgPerThreadMemAddr returns a sysmsg stack address in the thread address // space. 
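// Stacks are laid out back to back starting at stubSysmsgStack, one // sysmsg.PerThreadMemSize slot per sysmsg stack ID.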
func (p *sysmsgThread) sysmsgPerThreadMemAddr() uintptr { return stubSysmsgStack + sysmsg.PerThreadMemSize*uintptr(p.thread.sysmsgStackID) } // mapStack maps a sysmsg stack into the thread address space. func (p *sysmsgThread) mapStack(addr uintptr, readOnly bool) error { prot := uintptr(unix.PROT_READ) if !readOnly { prot |= unix.PROT_WRITE } _, err := p.thread.syscallIgnoreInterrupt(&p.thread.initRegs, unix.SYS_MMAP, arch.SyscallArgument{Value: addr}, arch.SyscallArgument{Value: uintptr(p.stackRange.Length())}, arch.SyscallArgument{Value: prot}, arch.SyscallArgument{Value: unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED}, arch.SyscallArgument{Value: uintptr(p.subproc.memoryFile.FD())}, arch.SyscallArgument{Value: uintptr(p.stackRange.Start)}) return err } // mapPrivateStack maps a private stack into the thread address space. func (p *sysmsgThread) mapPrivateStack(addr uintptr, size uintptr) error { prot := uintptr(unix.PROT_READ | unix.PROT_WRITE) _, err := p.thread.syscallIgnoreInterrupt(&p.thread.initRegs, unix.SYS_MMAP, arch.SyscallArgument{Value: addr}, arch.SyscallArgument{Value: size}, arch.SyscallArgument{Value: prot}, arch.SyscallArgument{Value: unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_FIXED}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}) return err } func (p *sysmsgThread) Debugf(format string, v ...any) { if !log.IsLogging(log.Debug) { return } msg := p.msg postfix := fmt.Sprintf(": %s", msg) p.thread.Debugf(format+postfix, v...) } func sysmsgSyscallNotifyRules() []bpf.Instruction { rules := []seccomp.RuleSet{ seccomp.RuleSet{ Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_EXIT_GROUP: seccomp.MatchAll{}, }), Action: linux.SECCOMP_RET_USER_NOTIF, }, } instrs, _, err := seccomp.BuildProgram(rules, seccomp.ProgramOptions{ DefaultAction: linux.SECCOMP_RET_ALLOW, BadArchAction: linux.SECCOMP_RET_ALLOW, }) if err != nil { panic(fmt.Sprintf("failed to build rules for sysmsg threads: %v", err)) } return instrs } func sysmsgThreadRules(stubStart uintptr) []bpf.Instruction { rules := []seccomp.RuleSet{} rules = appendSysThreadArchSeccompRules(rules) rules = append(rules, []seccomp.RuleSet{ // Allow instructions from the sysmsg code stub, which is limited by one page. { Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_FUTEX: seccomp.Or{ seccomp.PerArg{ seccomp.GreaterThan(stubStart), seccomp.EqualTo(linux.FUTEX_WAKE), seccomp.EqualTo(1), seccomp.EqualTo(0), seccomp.EqualTo(0), seccomp.EqualTo(0), seccomp.GreaterThan(stubStart), // rip }, seccomp.PerArg{ seccomp.GreaterThan(stubStart), seccomp.EqualTo(linux.FUTEX_WAIT), seccomp.AnyValue{}, seccomp.EqualTo(0), seccomp.EqualTo(0), seccomp.EqualTo(0), seccomp.GreaterThan(stubStart), // rip }, }, unix.SYS_RT_SIGRETURN: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.GreaterThan(stubStart), // rip }, unix.SYS_SCHED_YIELD: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.GreaterThan(stubStart), // rip }, }), Action: linux.SECCOMP_RET_ALLOW, }, }...) 
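// Any syscall that does not match the allow-list above falls through to // SECCOMP_RET_TRAP and is delivered as SIGSYS, which the sysmsg signal handler // hands back to the Sentry.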
instrs, _, err := seccomp.BuildProgram(rules, seccomp.ProgramOptions{ DefaultAction: linux.SECCOMP_RET_TRAP, BadArchAction: linux.SECCOMP_RET_TRAP, }) if err != nil { panic(fmt.Sprintf("failed to build rules for sysmsg threads: %v", err)) } return instrs } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg_thread_amd64.go000066400000000000000000000035411465435605700305020ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" ) func appendSysThreadArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet { return append(rules, []seccomp.RuleSet{ { // Rules for trapping vsyscall access. Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_GETTIMEOFDAY: seccomp.MatchAll{}, unix.SYS_TIME: seccomp.MatchAll{}, unix.SYS_GETCPU: seccomp.MatchAll{}, // SYS_GETCPU was not defined in package syscall on amd64. }), Action: linux.SECCOMP_RET_TRAP, Vsyscall: true, }, { Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_ARCH_PRCTL: seccomp.Or{ seccomp.PerArg{ seccomp.EqualTo(linux.ARCH_SET_FS), seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.GreaterThan(stubStart), // rip }, seccomp.PerArg{ seccomp.EqualTo(linux.ARCH_GET_FS), seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.GreaterThan(stubStart), // rip }, }, }), Action: linux.SECCOMP_RET_ALLOW, }, }...) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg_thread_arm64.go000066400000000000000000000013601465435605700305150ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import "gvisor.dev/gvisor/pkg/seccomp" func appendSysThreadArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet { return rules } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/sysmsg_thread_unsafe.go000066400000000000000000000057061465435605700310550ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "fmt" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" ) func (p *sysmsgThread) unmapStackFromSentry() { _, _, errno := unix.RawSyscall(unix.SYS_MUNMAP, sysmsg.MsgToStackAddr(uintptr(unsafe.Pointer(p.msg))), sysmsg.PerThreadSharedStackSize, 0) if errno != 0 { panic("failed to unmap: " + errno.Error()) } } func (p *sysmsgThread) setMsg(addr uintptr) { // add is always from the stub mapping which is mapped once and never // moved, so it is safe to use unsafe.Pointer here. p.msg = (*sysmsg.Msg)(unsafe.Pointer(addr)) } func (p *sysmsgThread) init(sentryAddr, guestAddr uintptr) { t := p.thread // Set the sysmsg signal stack. // // sentryAddr is from the stub mapping which is mapped once and never // moved, so it is safe to use unsafe.Pointer here. alt := (*linux.SignalStack)(unsafe.Pointer(sentryAddr)) *alt = linux.SignalStack{} alt.Addr = uint64(guestAddr) alt.Size = uint64(sysmsg.MsgOffsetFromSharedStack) _, err := t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_SIGALTSTACK, arch.SyscallArgument{Value: guestAddr}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, arch.SyscallArgument{Value: 0}, ) if err != nil { panic(fmt.Sprintf("sigaltstack: %v", err)) } } // sysmsgSigactions installs signal handles for signals which can be triggered // by stubProcess and have to be handled by Sentry. // // It is called in a child process after fork(), so the race instrumentation // has to be disabled. // //go:nosplit //go:norace func sysmsgSigactions(stubSysmsgStart uintptr) unix.Errno { act := linux.SigAction{ Handler: uint64(stubSysmsgStart) + uint64(sysmsg.Sighandler_blob_offset____export_sighandler), Flags: linux.SA_ONSTACK | linux.SA_RESTORER | linux.SA_SIGINFO, Restorer: uint64(stubSysmsgStart) + uint64(sysmsg.Sighandler_blob_offset____export_restore_rt), Mask: 1<<(linux.SIGCHLD-1) | 1<<(linux.SIGSYS-1), } for _, s := range []unix.Signal{ unix.SIGSYS, unix.SIGBUS, unix.SIGFPE, unix.SIGILL, unix.SIGCHLD, unix.SIGTRAP, unix.SIGSEGV, } { _, _, errno := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(s), uintptr(unsafe.Pointer(&act)), 0, 8, 0, 0) if errno != 0 { return errno } } return 0 } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap.go000066400000000000000000000322641465435605700263440ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// Package systrap provides a seccomp-based implementation of the platform // interface. // // In a nutshell, it works as follows: // // The creation of a new address space creates a new child processes. // // The creation of a new stub thread creates a new system thread with a // specified address space. To initialize this thread, the following action // will be done: // - install a signal stack which is shared with the Sentry. // - install a signal handler for SYS, BUS, FPE, CHLD, TRAP, SEGV signals. // This signal handler is a key part of the systrap platform. Any stub event // which has to be handled in a privilege mode (by the Sentry) triggers one of // previous signals. The signal handler is running on the separate stack which // is shared with the Sentry. There is the sysmsg structure to synchronize the // Sentry and a stub thread. // - install seccomp filters to trap user system calls. // - send a fake SIGSEGV to stop the thread in the signal handler. // // A platformContext is just a collection of temporary variables. Calling Switch on a // platformContext does the following: // // Set up proper registers and an FPU state on a stub signal frame. // Wake up a stub thread by changing sysmsg->stage and calling FUTEX_WAKE. // Wait for new stub event by polling sysmsg->stage. // // Lock order: // // subprocessPool.mu // subprocess.mu // platformContext.mu // // +checkalignedignore package systrap import ( "fmt" "os" "runtime" "sync" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/usertrap" ) var ( // stubStart is the link address for our stub, and determines the // maximum user address. This is valid only after a call to stubInit. // // We attempt to link the stub here, and adjust downward as needed. stubStart uintptr = stubInitAddress stubInitProcess uintptr // Memory region to store thread specific stacks. stubSysmsgStack uintptr stubSysmsgStart uintptr stubSysmsgEnd uintptr // Memory region to store the contextQueue. stubContextQueueRegion uintptr stubContextQueueRegionLen uintptr // Memory region to store instances of sysmsg.ThreadContext. stubContextRegion uintptr stubContextRegionLen uintptr // The memory blob with precompiled seccomp rules. stubSysmsgRules uintptr stubSysmsgRulesLen uintptr stubSyscallRules uintptr stubSyscallRulesLen uintptr stubSpinningThreadQueueAddr uintptr stubSpinningThreadQueueSize uintptr // stubROMapEnd is the end address of the read-only stub region that // contains the code and precompiled seccomp rules. stubROMapEnd uintptr // stubEnd is the first byte past the end of the stub, as with // stubStart this is valid only after a call to stubInit. stubEnd uintptr // stubInitialized controls one-time stub initialization. stubInitialized sync.Once // latencyMonitoring controls one-time initialization of the fastpath // control goroutine. latencyMonitoring sync.Once // archState stores architecture-specific details used in the platform. archState sysmsg.ArchState ) // platformContext is an implementation of the platform context. type platformContext struct { // signalInfo is the signal info, if and when a signal is received. 
signalInfo linux.SignalInfo // interrupt is the interrupt platformContext. interrupt interrupt.Forwarder // sharedContext is everything related to this platformContext that is resident in // shared memory with the stub thread. // sharedContext is only accessed on the Task goroutine, therefore it is not // mutex protected. sharedContext *sharedContext // mu protects the following fields. mu sync.Mutex // If lastFaultSP is non-nil, the last platformContext switch was due to a fault // received while executing lastFaultSP. Only platformContext.Switch may set // lastFaultSP to a non-nil value. lastFaultSP *subprocess // lastFaultAddr is the last faulting address; this is only meaningful if // lastFaultSP is non-nil. lastFaultAddr hostarch.Addr // lastFaultIP is the address of the last faulting instruction; // this is also only meaningful if lastFaultSP is non-nil. lastFaultIP hostarch.Addr // needRestoreFPState indicates that the FPU state has been changed by // the Sentry and has to be updated on the stub thread. needRestoreFPState bool // needToPullFullState indicates that the Sentry doesn't have a full // state of the thread. needToPullFullState bool } // PullFullState implements platform.Context.PullFullState. func (c *platformContext) PullFullState(as platform.AddressSpace, ac *arch.Context64) error { if !c.needToPullFullState { return nil } s := as.(*subprocess) if err := s.PullFullState(c, ac); err != nil { return err } c.needToPullFullState = false return nil } // FullStateChanged implements platform.Context.FullStateChanged. func (c *platformContext) FullStateChanged() { c.needRestoreFPState = true c.needToPullFullState = false } // Switch runs the provided platformContext in the given address space. func (c *platformContext) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac *arch.Context64, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error) { as := mm.AddressSpace() s := as.(*subprocess) if err := s.activateContext(c); err != nil { return nil, hostarch.NoAccess, err } restart: isSyscall, needPatch, err := s.switchToApp(c, ac) if err != nil { return nil, hostarch.NoAccess, err } if needPatch { s.usertrap.PatchSyscall(ctx, ac, mm) } if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGILL { err := s.usertrap.HandleFault(ctx, ac, mm) if err == usertrap.ErrFaultSyscall { isSyscall = true } else if err == usertrap.ErrFaultRestart { goto restart } else if err != nil { ctx.Warningf("usertrap.HandleFault failed: %v", err) } } var ( faultSP *subprocess faultAddr hostarch.Addr faultIP hostarch.Addr ) if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV { faultSP = s faultAddr = hostarch.Addr(c.signalInfo.Addr()) faultIP = hostarch.Addr(ac.IP()) } // Update the platformContext to reflect the outcome of this context switch. c.mu.Lock() lastFaultSP := c.lastFaultSP lastFaultAddr := c.lastFaultAddr lastFaultIP := c.lastFaultIP // At this point, c may not yet be in s.faultedContexts, so c.lastFaultSP won't // be updated by s.Unmap(). This is fine; we only need to synchronize with // calls to s.Unmap() that occur after the handling of this fault. c.lastFaultSP = faultSP c.lastFaultAddr = faultAddr c.lastFaultIP = faultIP c.mu.Unlock() // Update subprocesses to reflect the outcome of this context switch. 
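// Move this context between the faultedContexts sets of the old and new // faulting subprocesses (see the note above about synchronizing with // s.Unmap()).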
if lastFaultSP != faultSP { if lastFaultSP != nil { lastFaultSP.mu.Lock() delete(lastFaultSP.faultedContexts, c) lastFaultSP.mu.Unlock() } if faultSP != nil { faultSP.mu.Lock() faultSP.faultedContexts[c] = struct{}{} faultSP.mu.Unlock() } } if isSyscall { return nil, hostarch.NoAccess, nil } si := c.signalInfo if faultSP == nil { // Non-fault signal. return &si, hostarch.NoAccess, platform.ErrContextSignal } // See if this can be handled as a CPUID exception. if linux.Signal(si.Signo) == linux.SIGSEGV && platform.TryCPUIDEmulate(ctx, mm, ac) { goto restart } // Got a page fault. Ideally, we'd get real fault type here, but ptrace // doesn't expose this information. Instead, we use a simple heuristic: // // It was an instruction fault iff the faulting addr == instruction // pointer. // // It was a write fault if the fault is immediately repeated. at := hostarch.Read if faultAddr == faultIP { at.Execute = true } if lastFaultSP == faultSP && lastFaultAddr == faultAddr && lastFaultIP == faultIP { at.Write = true } // Handle as a signal. return &si, at, platform.ErrContextSignal } // Interrupt interrupts the running guest application associated with this platformContext. func (c *platformContext) Interrupt() { c.interrupt.NotifyInterrupt() } // Release releases all platform resources used by the platformContext. func (c *platformContext) Release() { if c.sharedContext != nil { c.sharedContext.release() c.sharedContext = nil } } // PrepareSleep implements platform.Context.platform.PrepareSleep. func (c *platformContext) PrepareSleep() { ctx := c.sharedContext if ctx == nil { return } if !ctx.sleeping { ctx.sleeping = true ctx.subprocess.decAwakeContexts() } } // Systrap represents a collection of seccomp subprocesses. type Systrap struct { platform.NoCPUPreemptionDetection platform.UseHostGlobalMemoryBarrier platform.DoesNotOwnPageTables // memoryFile is used to create a stub sysmsg stack // which is shared with the Sentry. memoryFile *pgalloc.MemoryFile } // MinUserAddress implements platform.MinUserAddress. func (*Systrap) MinUserAddress() hostarch.Addr { return platform.SystemMMapMinAddr() } // New returns a new seccomp-based implementation of the platform interface. func New() (*Systrap, error) { if maxSysmsgThreads == 0 { // CPUID information has been initialized at this point. archState.Init() // GOMAXPROCS has been set at this point. maxSysmsgThreads = runtime.GOMAXPROCS(0) // Account for syscall thread. maxChildThreads = maxSysmsgThreads + 1 } mf, err := createMemoryFile() if err != nil { return nil, err } stubInitialized.Do(func() { // Don't use sentry and stub fast paths if here is just one cpu. neverEnableFastPath = min(runtime.NumCPU(), runtime.GOMAXPROCS(0)) == 1 // Initialize the stub. stubInit() // Create the source process for the global pool. This must be // done before initializing any other processes. source, err := newSubprocess(createStub, mf, false) if err != nil { // Should never happen. panic("unable to initialize systrap source: " + err.Error()) } // The source subprocess is never released explicitly by a MM. source.DecRef(nil) globalPool.source = source initSysmsgThreadPriority() initSeccompNotify() }) latencyMonitoring.Do(func() { go controlFastPath() }) return &Systrap{memoryFile: mf}, nil } // SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. func (*Systrap) SupportsAddressSpaceIO() bool { return false } // CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace. 
func (*Systrap) CooperativelySchedulesAddressSpace() bool { return false } // MapUnit implements platform.Platform.MapUnit. func (*Systrap) MapUnit() uint64 { // The host kernel manages page tables and arbitrary-sized mappings // have effectively the same cost. return 0 } // MaxUserAddress returns the first address that may not be used by user // applications. func (*Systrap) MaxUserAddress() hostarch.Addr { return hostarch.Addr(maxStubUserAddress) } // NewAddressSpace returns a new subprocess. func (p *Systrap) NewAddressSpace(any) (platform.AddressSpace, <-chan struct{}, error) { as, err := newSubprocess(globalPool.source.createStub, p.memoryFile, true) return as, nil, err } // NewContext returns an interruptible platformContext. func (*Systrap) NewContext(ctx pkgcontext.Context) platform.Context { return &platformContext{ needRestoreFPState: true, needToPullFullState: false, } } type constructor struct{} func (*constructor) New(_ *fd.FD) (platform.Platform, error) { return New() } func (*constructor) OpenDevice(_ string) (*fd.FD, error) { return nil, nil } // Requirements implements platform.Constructor.Requirements(). func (*constructor) Requirements() platform.Requirements { // TODO(b/75837838): Also set a new PID namespace so that we limit // access to other host processes. return platform.Requirements{ RequiresCapSysPtrace: true, RequiresCurrentPIDNS: true, } } func init() { platform.Register("systrap", &constructor{}) } func createMemoryFile() (*pgalloc.MemoryFile, error) { const memfileName = "systrap-memory" fd, err := memutil.CreateMemFD(memfileName, 0) if err != nil { return nil, fmt.Errorf("error creating memfd: %v", err) } memfile := os.NewFile(uintptr(fd), memfileName) mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) } return mf, nil } func corruptedSharedMemoryErr(additional string) *platform.ContextError { return &platform.ContextError{ Err: fmt.Errorf("systrap corrupted memory: %s", additional), Errno: unix.EPERM, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_amd64.go000066400000000000000000000021771465435605700273370ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "gvisor.dev/gvisor/pkg/sentry/arch" ) func stackPointer(r *arch.Registers) uintptr { return uintptr(r.Rsp) } // x86 use the fs_base register to store the TLS pointer which can be // get/set in "func (t *thread) get/setRegs(regs *arch.Registers)". // So both of the get/setTLS() operations are noop here. // getTLS gets the thread local storage register. func (t *thread) getTLS(tls *uint64) error { return nil } // setTLS sets the thread local storage register. 
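// Like getTLS above, this is a no-op on amd64: fs_base is saved and restored // together with the general-purpose registers.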
func (t *thread) setTLS(tls *uint64) error { return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_amd64_state_autogen.go000066400000000000000000000001711465435605700322510ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 && amd64 && amd64 // +build amd64,amd64,amd64 package systrap golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_arm64.go000066400000000000000000000013361465435605700273510ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "gvisor.dev/gvisor/pkg/sentry/arch" ) func stackPointer(r *arch.Registers) uintptr { return uintptr(r.Sp) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_arm64_state_autogen.go000066400000000000000000000001711465435605700322670ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 && arm64 && arm64 // +build arm64,arm64,arm64 package systrap golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_arm64_unsafe.go000066400000000000000000000030021465435605700307020ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package systrap import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) // getTLS gets the thread local storage register. func (t *thread) getTLS(tls *uint64) error { iovec := unix.Iovec{ Base: (*byte)(unsafe.Pointer(tls)), Len: uint64(unsafe.Sizeof(*tls)), } _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_GETREGSET, uintptr(t.tid), linux.NT_ARM_TLS, uintptr(unsafe.Pointer(&iovec)), 0, 0) if errno != 0 { return errno } return nil } // setTLS sets the thread local storage register. func (t *thread) setTLS(tls *uint64) error { iovec := unix.Iovec{ Base: (*byte)(unsafe.Pointer(tls)), Len: uint64(unsafe.Sizeof(*tls)), } _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_SETREGSET, uintptr(t.tid), linux.NT_ARM_TLS, uintptr(unsafe.Pointer(&iovec)), 0, 0) if errno != 0 { return errno } return nil } systrap_arm64_unsafe_state_autogen.go000066400000000000000000000001331465435605700335470ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap// automatically generated by stateify. 
//go:build arm64 // +build arm64 package systrap golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_linux_state_autogen.go000066400000000000000000000001331465435605700324730ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux // +build linux package systrap systrap_linux_unsafe_state_autogen.go000066400000000000000000000001521465435605700337560ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap// automatically generated by stateify. //go:build amd64 || linux // +build amd64 linux package systrap golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_norace_state_autogen.go000066400000000000000000000001331465435605700326030ustar00rootroot00000000000000// automatically generated by stateify. //go:build !race // +build !race package systrap golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_profiling.go000066400000000000000000000035361465435605700304150ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build systrap_profiling // +build systrap_profiling package systrap import ( "gvisor.dev/gvisor/pkg/metric" ) // SystrapProfiling is a builder that produces conditionally compiled metrics. // Metrics made from this are compiled and active at runtime when the // "systrap_profiling" go-tag is specified at compilation. var SystrapProfiling = metric.RealMetricBuilder{} //go:nosplit func updateDebugMetrics(stubBoundLat, sentryBoundLat cpuTicks) { if stubBoundLat == 0 { } else if stubBoundLat < 2000 { stubLatWithin1kUS.Increment() } else if stubBoundLat < 10000 { stubLatWithin5kUS.Increment() } else if stubBoundLat < 20000 { stubLatWithin10kUS.Increment() } else if stubBoundLat < 40000 { stubLatWithin20kUS.Increment() } else if stubBoundLat < 80000 { stubLatWithin40kUS.Increment() } else { stubLatGreater40kUS.Increment() } if sentryBoundLat == 0 { } else if sentryBoundLat < 2000 { sentryLatWithin1kUS.Increment() } else if sentryBoundLat < 10000 { sentryLatWithin5kUS.Increment() } else if sentryBoundLat < 20000 { sentryLatWithin10kUS.Increment() } else if sentryBoundLat < 40000 { sentryLatWithin20kUS.Increment() } else if sentryBoundLat < 80000 { sentryLatWithin40kUS.Increment() } else { sentryLatGreater40kUS.Increment() } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_profiling_fake.go000066400000000000000000000020401465435605700313700ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. //go:build !systrap_profiling // +build !systrap_profiling package systrap import ( "gvisor.dev/gvisor/pkg/metric" ) // SystrapProfiling is a builder that produces conditionally compiled metrics. // Metrics made from this are compiled and active at runtime when the // "systrap_profiling" go-tag is specified at compilation. var SystrapProfiling = metric.FakeMetricBuilder{} //go:nosplit func updateDebugMetrics(stubBoundLat, sentryBoundLat cpuTicks) {} golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_race_state_autogen.go000066400000000000000000000001311465435605700322440ustar00rootroot00000000000000// automatically generated by stateify. //go:build race // +build race package systrap golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_state_autogen.go000066400000000000000000000042451465435605700312640ustar00rootroot00000000000000// automatically generated by stateify. //go:build systrap_profiling && !systrap_profiling // +build systrap_profiling,!systrap_profiling package systrap import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (l *contextList) StateTypeName() string { return "pkg/sentry/platform/systrap.contextList" } func (l *contextList) StateFields() []string { return []string{ "head", "tail", } } func (l *contextList) beforeSave() {} // +checklocksignore func (l *contextList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *contextList) afterLoad(context.Context) {} // +checklocksignore func (l *contextList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *contextEntry) StateTypeName() string { return "pkg/sentry/platform/systrap.contextEntry" } func (e *contextEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *contextEntry) beforeSave() {} // +checklocksignore func (e *contextEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *contextEntry) afterLoad(context.Context) {} // +checklocksignore func (e *contextEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (r *subprocessRefs) StateTypeName() string { return "pkg/sentry/platform/systrap.subprocessRefs" } func (r *subprocessRefs) StateFields() []string { return []string{ "refCount", } } func (r *subprocessRefs) beforeSave() {} // +checklocksignore func (r *subprocessRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *subprocessRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func init() { state.Register((*contextList)(nil)) state.Register((*contextEntry)(nil)) state.Register((*subprocessRefs)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_unsafe.go000066400000000000000000000071441465435605700277040ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package systrap import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" ) // getRegs gets the general purpose register set. func (t *thread) getRegs(regs *arch.Registers) error { iovec := unix.Iovec{ Base: (*byte)(unsafe.Pointer(regs)), Len: uint64(unsafe.Sizeof(*regs)), } _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_GETREGSET, uintptr(t.tid), linux.NT_PRSTATUS, uintptr(unsafe.Pointer(&iovec)), 0, 0) if errno != 0 { return errno } return nil } // setRegs sets the general purpose register set. func (t *thread) setRegs(regs *arch.Registers) error { iovec := unix.Iovec{ Base: (*byte)(unsafe.Pointer(regs)), Len: uint64(unsafe.Sizeof(*regs)), } _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_SETREGSET, uintptr(t.tid), linux.NT_PRSTATUS, uintptr(unsafe.Pointer(&iovec)), 0, 0) if errno != 0 { return errno } return nil } // getSignalInfo retrieves information about the signal that caused the stop. func (t *thread) getSignalInfo(si *linux.SignalInfo) error { _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_GETSIGINFO, uintptr(t.tid), 0, uintptr(unsafe.Pointer(si)), 0, 0) if errno != 0 { return errno } return nil } // clone creates a new sysmsg thread from this one. // // The returned thread will be stopped and available for any system thread to // call attach on it. // // Precondition: the OS thread must be locked and own t. func (t *thread) clone() (*thread, error) { r, ok := hostarch.Addr(stackPointer(&t.initRegs)).RoundUp() if !ok { return nil, unix.EINVAL } var flags uintptr // Create a sysmsg thread. // // CLONE_THREAD isn't set, because a stub process has SIGSTOP // in its queue. A sysmsg thread will not be traced by ptrace, // so it will be stopped immediately if it will share signal // queue with its stub process. flags = uintptr( unix.CLONE_FILES | unix.CLONE_FS | unix.CLONE_PTRACE | unix.CLONE_VM | linux.SIGKILL) rval, err := t.syscallIgnoreInterrupt( &t.initRegs, unix.SYS_CLONE, arch.SyscallArgument{Value: flags}, // The stack pointer is just made up, but we have it be // something sensible so the kernel doesn't think we're // up to no good. Which we are. arch.SyscallArgument{Value: uintptr(r)}, arch.SyscallArgument{}, arch.SyscallArgument{}, // We use these registers initially, but really they // could be anything. We're going to stop immediately. arch.SyscallArgument{Value: uintptr(unsafe.Pointer(&t.initRegs))}) if err != nil { return nil, err } return &thread{ tgid: int32(rval), tid: int32(rval), }, nil } // getEventMessage retrieves a message about the ptrace event that just happened. func (t *thread) getEventMessage() (uintptr, error) { var msg uintptr _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_GETEVENTMSG, uintptr(t.tid), 0, uintptr(unsafe.Pointer(&msg)), 0, 0) if errno != 0 { return msg, errno } return msg, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/systrap_unsafe_state_autogen.go000066400000000000000000000001351465435605700326170ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build go1.18 // +build go1.18 package systrap golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap/000077500000000000000000000000001465435605700261465ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap/usertrap.go000066400000000000000000000044041465435605700303440ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package usertrap implements the library to replace syscall instructions with // function calls. // // The most often used pattern of performing a system call is a sequence of two // instruction: mov sysno, %eax; syscall. The size of the mov instruction is 5 // bytes and the size of the syscall instruction is 2 bytes. These two // instruction can be replaced with a single jmp instruction with an absolute // address below 2 gigabytes. // // Here is a few tricks: // - The GS register is used to access a per-thread memory. // - The syscall instruction is replaced with the "jmp *%ds:offset" instruction. // On x86_64, ds is always zero. offset is a 32-bit signed integer. This // means that a service mapping for a table with syscall trampolines has to // be mapped below 2GB. // - We can't touch a process stack, so we have to use the jmp instruction // instead of callq and generate a new function call for each replaced // instruction. Each trampoline contains a syscall number and an return // address. // - The address for the syscall table is set so that the syscall instruction // is replaced on an invalid instruction. This allows us to handle races // when two threads are executing the same syscall concurrently. And this // allows us to restart a syscall if it has been interrupted by a signal. // // +checkalignedignore package usertrap import "fmt" var ( // ErrFaultRestart indicates that the current stub thread has to be restarted. ErrFaultRestart = fmt.Errorf("need to restart stub thread") // ErrFaultSyscall indicates that the current fault has to be handled as a system call. ErrFaultSyscall = fmt.Errorf("need to handle as syscall") ) usertrap_abi_autogen_unsafe.go000066400000000000000000000001501465435605700341550ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap// Automatically generated marshal implementation. See tools/go_marshal. package usertrap import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap/usertrap_amd64.go000066400000000000000000000261601465435605700313420ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package usertrap import ( "encoding/binary" "fmt" "math/rand" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // trapNR is the maximum number of traps what can fit in the trap table. const trapNR = 256 // trapSize is the size of one trap. const trapSize = 80 var ( // jmpInst is the binary code of "jmp *addr". jmpInst = [7]byte{0xff, 0x24, 0x25, 0, 0, 0, 0} jmpInstOpcodeLen = 3 // faultInst is the single byte invalid instruction. faultInst = [1]byte{0x6} // faultInstOffset is the offset of the syscall instruction. faultInstOffset = uintptr(5) ) type memoryManager interface { usermem.IO MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error) FindVMAByName(ar hostarch.AddrRange, hint string) (hostarch.Addr, uint64, error) } // State represents the current state of the trap table. // // +stateify savable type State struct { mu sync.RWMutex `state:"nosave"` nextTrap uint32 tableAddr hostarch.Addr } // New returns the new state structure. func New() *State { return &State{} } // +marshal type header struct { nextTrap uint32 } func (s *State) trapAddr(trap uint32) hostarch.Addr { return s.tableAddr + hostarch.Addr(trapSize*trap) } // newTrapLocked allocates a new trap entry. // // Preconditions: s.mu must be locked. func (s *State) newTrapLocked(ctx context.Context, mm memoryManager) (hostarch.Addr, error) { var hdr header task := kernel.TaskFromContext(ctx) if task == nil { return 0, fmt.Errorf("no task found") } // s.nextTrap is zero if it isn't initialized. Here are three cases // when this can happen: // * A usertrap vma has not been mapped yet. // * The address space has been forked. // * The address space has been restored. // nextTrap is saved on the usertrap vma to handle the third and second // cases. if s.nextTrap == 0 { addr, off, err := mm.FindVMAByName(trapTableAddrRange, tableHint) if off != 0 { return 0, fmt.Errorf("the usertrap vma has been overmounted") } if err != nil { // The usertrap table has not been mapped yet. addr := hostarch.Addr(rand.Int63n(int64(trapTableAddrRange.Length()-trapTableSize))).RoundDown() + trapTableAddrRange.Start ctx.Debugf("Map a usertrap vma at %x", addr) if err := loadUsertrap(ctx, mm, addr); err != nil { return 0, err } // The first cell in the table is used to save an index of a // next unused trap. s.nextTrap = 1 s.tableAddr = addr } else if _, err := hdr.CopyIn(task.OwnCopyContext(usermem.IOOpts{AddressSpaceActive: false}), addr); err != nil { return 0, err } else { // Read an index of a next unused trap. s.nextTrap = hdr.nextTrap s.tableAddr = addr } } ctx.Debugf("Allocate a new trap: %p %d", s, s.nextTrap) if s.nextTrap >= trapNR { ctx.Warningf("No space in the trap table") return 0, fmt.Errorf("no space in the trap table") } trap := s.nextTrap s.nextTrap++ // An entire trap has to be on the same page to avoid memory faults. 
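// To make the check below concrete (assuming the usual 4KiB page size;
// tableAddr is page-aligned, so offsets within the table behave the same as
// absolute addresses): with trapSize = 80, trap 51 would start at offset 4080
// and end at 4160, straddling a page boundary, so the allocator skips that
// slot and hands out trap 52 at offset 4160 instead. A minimal sketch of the
// same test, expressed over table offsets:
//
//	straddles := func(trap uint32) bool {
//		off := hostarch.Addr(trapSize * trap)
//		return off/hostarch.PageSize != (off+trapSize)/hostarch.PageSize
//	}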
addr := s.trapAddr(trap) if addr/hostarch.PageSize != (addr+trapSize)/hostarch.PageSize { trap = s.nextTrap s.nextTrap++ } hdr = header{ nextTrap: s.nextTrap, } if _, err := hdr.CopyOut(task.OwnCopyContext(usermem.IOOpts{IgnorePermissions: true}), s.tableAddr); err != nil { return 0, err } return s.trapAddr(trap), nil } // trapTableAddrRange is the range where a trap table can be placed. // // The value has to be below 2GB and the high two bytes has to be an invalid // instruction. In case of 0x60000, the high two bytes is 0x6. This is "push // es" in x86 and the bad instruction on x64. var trapTableAddrRange = hostarch.AddrRange{Start: 0x60000, End: 0x70000} const ( trapTableSize = hostarch.Addr(trapNR * trapSize) tableHint = "[usertrap]" ) // LoadUsertrap maps the usertrap table into the address space. func loadUsertrap(ctx context.Context, mm memoryManager, addr hostarch.Addr) error { size, _ := hostarch.Addr(trapTableSize).RoundUp() // Force is true because Addr is below MinUserAddress. _, err := mm.MMap(ctx, memmap.MMapOpts{ Force: true, Unmap: true, Fixed: true, Addr: addr, Length: uint64(size), Private: true, Hint: tableHint, MLockMode: memmap.MLockEager, Perms: hostarch.AccessType{ Write: false, Read: true, Execute: true, }, MaxPerms: hostarch.AccessType{ Write: true, Read: true, Execute: true, }, }) if err != nil { return err } return nil } // PatchSyscall changes the syscall instruction into a function call. func (s *State) PatchSyscall(ctx context.Context, ac *arch.Context64, mm memoryManager) error { task := kernel.TaskFromContext(ctx) if task == nil { return fmt.Errorf("no task found") } s.mu.Lock() defer s.mu.Unlock() sysno := ac.SyscallNo() patchAddr := ac.IP() - uintptr(len(jmpInst)) prevCode := make([]uint8, len(jmpInst)) if _, err := primitive.CopyUint8SliceIn(task.OwnCopyContext(usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(patchAddr), prevCode); err != nil { return err } // Check that another thread has not patched this syscall yet. // 0xb8 is the first byte of "mov sysno, %eax". if prevCode[0] == uint8(0xb8) { ctx.Debugf("Found the pattern at ip %x:sysno %d", patchAddr, sysno) trapAddr, err := s.addTrapLocked(ctx, ac, mm, uint32(sysno)) if trapAddr == 0 || err != nil { ctx.Warningf("Failed to add a new trap: %v", err) return nil } // Replace "mov sysno, %eax; syscall" with "jmp trapAddr". newCode := make([]uint8, len(jmpInst)) copy(newCode[:jmpInstOpcodeLen], jmpInst[:jmpInstOpcodeLen]) binary.LittleEndian.PutUint32(newCode[jmpInstOpcodeLen:], uint32(trapAddr)) ctx.Debugf("Apply the binary patch addr %x trap addr %x (%v -> %v)", patchAddr, trapAddr, prevCode, newCode) ignorePermContext := task.OwnCopyContext(usermem.IOOpts{IgnorePermissions: true}) // The patch can't be applied atomically, so we need to // guarantee that in each moment other threads will read a // valid set of instructions, detect any inconsistent states // and restart the patched code if so. // // A subtle aspect is the address at which the user trap table // is always mapped which is 0x60000. The first byte of this is // 0x06 which is an invalid opcode. That’s why when we // overwrite all the bytes but the first 1 in the second step // it works fine since the jump address still writes a 0x6 at // the location of the first byte of syscall instruction that // we are removing and any threads reading the instructions // will still fault at the same place. 
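//
// As a concrete illustration (the trampoline address 0x60050 used below is
// only an example value inside trapTableAddrRange; the real address depends
// on where the trap is allocated), patching "mov $1, %eax; syscall" changes
// the seven bytes as follows:
//
//	b8 01 00 00 00 0f 05   original: mov $1,%eax; syscall
//	b8 01 00 00 00 06 05   step 1: fault byte over the syscall opcode
//	b8 24 25 50 00 06 00   step 2: every byte of the jmp except the first
//	ff 24 25 50 00 06 00   step 3: the first byte, completing jmp *0x60050
//
// Offset 5 holds 0x6 from step 1 onwards: first as the one-byte fault
// instruction, then as part of the little-endian trampoline address.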
// // Another subtle aspect is the second step is done using a // regular non-atomic write which means a thread decoding the // mov instruction could read a garbage value of the immediate // operand for the ‘mov sysyno, %eax” instruction. But it // doesn’t matter since we don’t change the first byte which is // the one that contains the opcode. Also since the thread will // fault on the 0x6 right after and will be restarted with the // patched code the mov reading a garbage immediate operand // doesn’t impact correctness. // The patch is applied in three steps: // // The first step is to replace the first byte of the syscall // instruction by one-byte invalid instruction (0x06), so that // other threads which have passed the mov instruction fault on // the invalid instruction and restart a patched code. faultInstB := primitive.ByteSlice(faultInst[:]) if _, err := faultInstB.CopyOut(ignorePermContext, hostarch.Addr(patchAddr+faultInstOffset)); err != nil { return err } // The second step is to replace all bytes except the first one // which is the opcode of the mov instruction, so that the first // five bytes remain "mov XXX, %rax". if _, err := primitive.CopyUint8SliceOut(ignorePermContext, hostarch.Addr(patchAddr+1), newCode[1:]); err != nil { return err } // The final step is to replace the first byte of the patch. // After this point, all threads will read the valid jmp // instruction. if _, err := primitive.CopyUint8SliceOut(ignorePermContext, hostarch.Addr(patchAddr), newCode[0:1]); err != nil { return err } } return nil } // HandleFault handles a fault on a patched syscall instruction. // // When we replace a system call with a function call, we replace two // instructions with one instruction. This means that here can be a thread // which called the first instruction, then another thread applied a binary // patch and the first thread calls the second instruction. // // To handle this case, the function call (jmp) instruction is constructed so // that the first byte of the syscall instruction is changed with the one-byte // invalid instruction (0x6). And in case of the race, the first thread will // fault on the invalid instruction and HandleFault will restart the function // call. func (s *State) HandleFault(ctx context.Context, ac *arch.Context64, mm memoryManager) error { task := kernel.TaskFromContext(ctx) if task == nil { return fmt.Errorf("no task found") } s.mu.RLock() defer s.mu.RUnlock() code := make([]uint8, len(jmpInst)) ip := ac.IP() - faultInstOffset if _, err := primitive.CopyUint8SliceIn(task.OwnCopyContext(usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(ip), code); err != nil { return err } for i := 0; i < jmpInstOpcodeLen; i++ { if code[i] != jmpInst[i] { return nil } } for i := 0; i < len(faultInst); i++ { if code[i+int(faultInstOffset)] != faultInst[i] { return nil } } regs := &ac.StateData().Regs if regs.Rax == uint64(unix.SYS_RESTART_SYSCALL) { // restart_syscall is usually set by the Sentry to restart a // system call after interruption by a stop signal. The Sentry // sets RAX and moves RIP back on the size of the syscall // instruction. // // RAX can't be set to SYS_RESTART_SYSCALL due to a race with // injecting a function call, because neither of the two first // bytes are equal to proper bytes of jmpInst. regs.Orig_rax = regs.Rax regs.Rip += arch.SyscallWidth return ErrFaultSyscall } ac.SetIP(ip) return ErrFaultRestart } // PreFork locks the trap table for reading. 
This call guarantees that the trap // table will not be changed before the next PostFork call. // +checklocksacquireread:s.mu func (s *State) PreFork() { s.mu.RLock() } // PostFork unlocks the trap table. // +checklocksreleaseread:s.mu func (s *State) PostFork() { s.mu.RUnlock() } usertrap_amd64_abi_autogen_unsafe.go000066400000000000000000000101141465435605700351510ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build amd64 // +build amd64 package usertrap import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*header)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. func (h *header) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (h *header) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(h.nextTrap)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (h *header) UnmarshalBytes(src []byte) []byte { h.nextTrap = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (h *header) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (h *header) MarshalUnsafe(dst []byte) []byte { size := h.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(h), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (h *header) UnmarshalUnsafe(src []byte) []byte { size := h.SizeBytes() gohacks.Memmove(unsafe.Pointer(h), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (h *header) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(h))) hdr.Len = h.SizeBytes() hdr.Cap = h.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that h // must live until the use above. runtime.KeepAlive(h) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (h *header) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return h.CopyOutN(cc, addr, h.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (h *header) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(h))) hdr.Len = h.SizeBytes() hdr.Cap = h.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that h // must live until the use above. runtime.KeepAlive(h) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (h *header) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return h.CopyInN(cc, addr, h.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (h *header) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(h))) hdr.Len = h.SizeBytes() hdr.Cap = h.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that h // must live until the use above. runtime.KeepAlive(h) // escapes: replaced by intrinsic. return int64(length), err } usertrap_amd64_state_autogen.go000066400000000000000000000014711465435605700342030ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap// automatically generated by stateify. //go:build amd64 // +build amd64 package usertrap import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (s *State) StateTypeName() string { return "pkg/sentry/platform/systrap/usertrap.State" } func (s *State) StateFields() []string { return []string{ "nextTrap", "tableAddr", } } func (s *State) beforeSave() {} // +checklocksignore func (s *State) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.nextTrap) stateSinkObject.Save(1, &s.tableAddr) } func (s *State) afterLoad(context.Context) {} // +checklocksignore func (s *State) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.nextTrap) stateSourceObject.Load(1, &s.tableAddr) } func init() { state.Register((*State)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap/usertrap_amd64_unsafe.go000066400000000000000000000073051465435605700327030ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package usertrap import ( "encoding/binary" "unsafe" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" "gvisor.dev/gvisor/pkg/usermem" ) // addTrapLocked constructs a trampoline for a specified syscall. // // mm.UserTrap.Lock has to be taken. func (s *State) addTrapLocked(ctx context.Context, ac *arch.Context64, mm memoryManager, sysno uint32) (uint64, error) { trapAddr, err := s.newTrapLocked(ctx, mm) if err != nil { return 0, err } // First eight bytes is an address which points to the 9th byte, they // are used as an argument for the jmp instruction. // // Then here is the code of the syscall trampoline. // First, we need to lock the sysmsg struct by setting StatePrep. 
This // is used to synchronise with sighandler which uses the same struct // sysmsg. And we need to guarantee that the current thread will not be // interrupted in syshandler, because the sysmsg struct isn't saved on // S/R. // A thread stack can't be change, so the call instruction can't be // used and we need to save values of stack and instruction registers, // switch to the syshandler stack and call the jmp instruction to // syshandler: // mov sysmsg.ThreadStatePrep, %gs:offset(msg.State) // mov %rsp,%gs:0x20 // msg.AppStack // mov %gs:0x18,%rsp // msg.SyshandlerStack // movabs $ret_addr, %rax // mov %rax,%gs:0x8 // msg.RetAddr // mov sysno,%eax // jmpq *%gs:0x10 // msg.Syshandler trap := []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // msg.State = sysmsg.ThreadStatePrep /*08*/ 0x65, 0xc7, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mov $X, %gs:OFFSET /*20*/ 0x65, 0x48, 0x89, 0x24, 0x25, 0x20, 0x00, 0x00, 0x00, // mov %rsp,%gs:0x20 /*29*/ 0x65, 0x48, 0x8b, 0x24, 0x25, 0x18, 0x00, 0x00, 0x00, // mov %gs:0x18,%rsp /*38*/ 0x48, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // movabs $ret_addr, %rax /*48*/ 0x65, 0x48, 0x89, 0x04, 0x25, 0x08, 0x00, 0x00, 0x00, // mov %rax,%gs:0x8 /*57*/ 0xb8, 0x00, 0x00, 0x00, 0x00, // mov sysno,%eax /*62*/ 0x65, 0xff, 0x24, 0x25, 0x10, 0x00, 0x00, 0x00, // jmpq *%gs:0x10 } binary.LittleEndian.PutUint64(trap[40:48], uint64(ac.IP())) binary.LittleEndian.PutUint32(trap[58:62], sysno) binary.LittleEndian.PutUint64(trap[:8], uint64(trapAddr)+8) var msg *sysmsg.Msg binary.LittleEndian.PutUint32(trap[12:16], uint32(unsafe.Offsetof(msg.State))) binary.LittleEndian.PutUint32(trap[16:20], uint32(sysmsg.ThreadStatePrep)) binary.LittleEndian.PutUint32(trap[25:29], uint32(unsafe.Offsetof(msg.AppStack))) binary.LittleEndian.PutUint32(trap[34:38], uint32(unsafe.Offsetof(msg.SyshandlerStack))) binary.LittleEndian.PutUint32(trap[53:57], uint32(unsafe.Offsetof(msg.RetAddr))) binary.LittleEndian.PutUint32(trap[66:70], uint32(unsafe.Offsetof(msg.Syshandler))) iocc := usermem.IOCopyContext{ Ctx: ctx, IO: mm, Opts: usermem.IOOpts{ IgnorePermissions: true, }, } _, err = primitive.CopyByteSliceOut(&iocc, trapAddr, trap[:]) return uint64(trapAddr), err } usertrap_amd64_unsafe_abi_autogen_unsafe.go000066400000000000000000000007371465435605700365240ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build amd64 // +build amd64 package usertrap import ( ) usertrap_amd64_unsafe_state_autogen.go000066400000000000000000000001341465435605700355370ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap// automatically generated by stateify. //go:build amd64 // +build amd64 package usertrap golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap/usertrap_arm64.go000066400000000000000000000036261465435605700313620ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package usertrap import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/usermem" ) // trapNR is the maximum number of traps what can fit in the trap table. const trapNR = 256 // trapSize is the size of one trap. const trapSize = 80 // TrapTableSize returns the maximum size of a trap table. func TrapTableSize() uintptr { return uintptr(trapNR * trapSize) } type memoryManager interface { usermem.IO MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error) } // State represents the current state of the trap table. // // +stateify savable type State struct { } // New returns the new state structure. func New() *State { return &State{} } func (*State) PatchSyscall(ctx context.Context, ac *arch.Context64, mm memoryManager) (restart bool, err error) { return false /* restart */, nil } // HandleFault handles a fault on a patched syscall instruction. func (*State) HandleFault(ctx context.Context, ac *arch.Context64, mm memoryManager) error { return nil } // PreFork does nothing on arm64 as syscall trapping is not supported. func (*State) PreFork() { } // PostFork does nothing on arm64 as syscall trapping is not supported. func (*State) PostFork() { } usertrap_arm64_abi_autogen_unsafe.go000066400000000000000000000007371465435605700352010ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build arm64 // +build arm64 package usertrap import ( ) usertrap_arm64_state_autogen.go000066400000000000000000000011741465435605700342210ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap// automatically generated by stateify. //go:build arm64 // +build arm64 package usertrap import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (s *State) StateTypeName() string { return "pkg/sentry/platform/systrap/usertrap.State" } func (s *State) StateFields() []string { return []string{} } func (s *State) beforeSave() {} // +checklocksignore func (s *State) StateSave(stateSinkObject state.Sink) { s.beforeSave() } func (s *State) afterLoad(context.Context) {} // +checklocksignore func (s *State) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func init() { state.Register((*State)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/platform/systrap/usertrap/usertrap_state_autogen.go000066400000000000000000000000721465435605700332630ustar00rootroot00000000000000// automatically generated by stateify. 
package usertrap golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/000077500000000000000000000000001465435605700225205ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/config.go000066400000000000000000000163731465435605700243260ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package seccheck import ( "fmt" "os" "sync" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" ) // DefaultSessionName is the name of the only session that can exist in the // system for now. When multiple sessions are supported, this can be removed. const DefaultSessionName = "Default" var ( sessionsMu = sync.Mutex{} sessions = make(map[string]*State) ) var sessionCounter = metric.MustCreateNewUint64Metric("/trace/sessions_created", metric.Uint64Metadata{ Cumulative: true, Description: "Counts the number of trace sessions created.", }) // SessionConfig describes a new session configuration. A session consists of a // set of points to be enabled and sinks where the points are sent to. type SessionConfig struct { // Name is the unique session name. Name string `json:"name,omitempty"` // Points is the set of points to enable in this session. Points []PointConfig `json:"points,omitempty"` // IgnoreMissing skips point and optional/context fields not found. This can // be used to apply a single configuration file with newer points/fields with // older versions which do not have them yet. Note that it may hide typos in // the configuration. // // This field does NOT apply to sinks. IgnoreMissing bool `json:"ignore_missing,omitempty"` // Sinks are the sinks that will process the points enabled above. Sinks []SinkConfig `json:"sinks,omitempty"` } // PointConfig describes a point to be enabled in a given session. type PointConfig struct { // Name is the point to be enabled. The point must exist in the system. Name string `json:"name,omitempty"` // OptionalFields is the list of optional fields to collect from the point. OptionalFields []string `json:"optional_fields,omitempty"` // ContextFields is the list of context fields to collect. ContextFields []string `json:"context_fields,omitempty"` } // SinkConfig describes the sink that will process the points in a given // session. type SinkConfig struct { // Name is the sink to be created. The sink must exist in the system. Name string `json:"name,omitempty"` // Config is a opaque json object that is passed to the sink. Config map[string]any `json:"config,omitempty"` // IgnoreSetupError makes errors during sink setup to be ignored. Otherwise, // failures will prevent the container from starting. IgnoreSetupError bool `json:"ignore_setup_error,omitempty"` // Status is the runtime status for the sink. Status SinkStatus `json:"status,omitempty"` // FD is the endpoint returned from Setup. It may be nil. FD *fd.FD `json:"-"` } // Create reads the session configuration and applies it to the system. 
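//
// For illustration, a minimal session description in the JSON form implied by
// the struct tags above could look like the following. The point name and its
// fields are taken from the registered "container/start" point; the sink name
// "remote" is only an example and must match a sink registered in Sinks:
//
//	{
//	  "name": "Default",
//	  "points": [
//	    {
//	      "name": "container/start",
//	      "optional_fields": ["env"],
//	      "context_fields": ["time", "container_id"]
//	    }
//	  ],
//	  "sinks": [
//	    {"name": "remote"}
//	  ]
//	}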
func Create(conf *SessionConfig, force bool) error { log.Debugf("Creating seccheck: %+v", conf) sessionsMu.Lock() defer sessionsMu.Unlock() if _, ok := sessions[conf.Name]; ok { if !force { return fmt.Errorf("session %q already exists", conf.Name) } if err := deleteLocked(conf.Name); err != nil { return err } log.Infof("Trace session %q was deleted to be replaced", conf.Name) } if conf.Name != DefaultSessionName { return fmt.Errorf(`only a single "Default" session is supported`) } state := &Global var reqs []PointReq for _, ptConfig := range conf.Points { desc, err := findPointDesc(ptConfig.Name) if err != nil { if conf.IgnoreMissing { log.Warningf("Skipping point %q: %v", ptConfig.Name, err) continue } return err } req := PointReq{Pt: desc.ID} mask, err := setFields(ptConfig.OptionalFields, desc.OptionalFields, conf.IgnoreMissing) if err != nil { return fmt.Errorf("configuring point %q: %w", ptConfig.Name, err) } req.Fields.Local = mask mask, err = setFields(ptConfig.ContextFields, desc.ContextFields, conf.IgnoreMissing) if err != nil { return fmt.Errorf("configuring point %q: %w", ptConfig.Name, err) } req.Fields.Context = mask reqs = append(reqs, req) } for _, sinkConfig := range conf.Sinks { desc, err := findSinkDesc(sinkConfig.Name) if err != nil { return err } sink, err := desc.New(sinkConfig.Config, sinkConfig.FD) if err != nil { return fmt.Errorf("creating event sink: %w", err) } state.AppendSink(sink, reqs) } sessions[conf.Name] = state sessionCounter.Increment() return nil } // SetupSinks runs the setup step of all sinks in the configuration. func SetupSinks(sinks []SinkConfig) ([]*os.File, error) { var files []*os.File for _, sink := range sinks { sinkFile, err := setupSink(sink) if err != nil { if !sink.IgnoreSetupError { return nil, err } log.Warningf("Ignoring sink setup failure: %v", err) // Set sinkFile is nil and append it to the list to ensure the file // order is preserved. sinkFile = nil } files = append(files, sinkFile) } return files, nil } // setupSink runs the setup step for a given sink. func setupSink(config SinkConfig) (*os.File, error) { sink, err := findSinkDesc(config.Name) if err != nil { return nil, err } if sink.Setup == nil { return nil, nil } return sink.Setup(config.Config) } // Delete deletes an existing session. func Delete(name string) error { sessionsMu.Lock() defer sessionsMu.Unlock() return deleteLocked(name) } // +checklocks:sessionsMu func deleteLocked(name string) error { session := sessions[name] if session == nil { return fmt.Errorf("session %q not found", name) } session.clearSink() delete(sessions, name) return nil } // List lists all existing sessions. func List(out *[]SessionConfig) { sessionsMu.Lock() defer sessionsMu.Unlock() for name, state := range sessions { // Only report session name. Consider adding rest of the fields as needed. 
session := SessionConfig{Name: name} for _, sink := range state.getSinks() { session.Sinks = append(session.Sinks, SinkConfig{ Name: sink.Name(), Status: sink.Status(), }) } *out = append(*out, session) } } func findPointDesc(name string) (PointDesc, error) { if desc, ok := Points[name]; ok { return desc, nil } return PointDesc{}, fmt.Errorf("point %q not found", name) } func findField(name string, fields []FieldDesc) (FieldDesc, error) { for _, f := range fields { if f.Name == name { return f, nil } } return FieldDesc{}, fmt.Errorf("field %q not found", name) } func setFields(names []string, fields []FieldDesc, ignoreMissing bool) (FieldMask, error) { fm := FieldMask{} for _, name := range names { desc, err := findField(name, fields) if err != nil { if ignoreMissing { log.Warningf("Skipping field %q: %v", name, err) continue } return FieldMask{}, err } fm.Add(desc.ID) } return fm, nil } func findSinkDesc(name string) (SinkDesc, error) { if desc, ok := Sinks[name]; ok { return desc, nil } return SinkDesc{}, fmt.Errorf("sink %q not found", name) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/metadata.go000066400000000000000000000172611465435605700246360ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package seccheck import ( "fmt" "os" "path" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sync" ) // PointX represents the checkpoint X. const ( PointClone Point = iota PointContainerStart PointExecve PointExitNotifyParent PointTaskExit // Add new Points above this line. pointLengthBeforeSyscalls ) // FieldCtxtX represents a data field that comes from the Context. const ( FieldCtxtContainerID Field = iota FieldCtxtCredentials FieldCtxtCwd FieldCtxtProcessName FieldCtxtThreadGroupID FieldCtxtThreadGroupStartTime FieldCtxtThreadID FieldCtxtThreadStartTime FieldCtxtTime ) // Fields for container/start point. const ( // FieldContainerStartEnv is an optional field to collect list of environment // variables set for the container start process. FieldContainerStartEnv Field = iota ) // Fields for sentry/execve point. const ( // FieldSentryExecveBinaryInfo is an optional field to collect information // about the binary being executed. FieldSentryExecveBinaryInfo Field = iota ) // Points is a map with all the trace points registered in the system. var Points = map[string]PointDesc{} // Sinks is a map with all the sinks registered in the system. var Sinks = map[string]SinkDesc{} // defaultContextFields are the fields present in most trace points. 
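//
// Each Name below is what a PointConfig.ContextFields entry refers to. As a
// small sketch (reusing setFields from config.go in this package), enabling
// two of these fields for a point would look like:
//
//	mask, err := setFields([]string{"time", "container_id"}, defaultContextFields, false)
//	// on success, mask has FieldCtxtTime and FieldCtxtContainerID set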
var defaultContextFields = []FieldDesc{ { ID: FieldCtxtTime, Name: "time", }, { ID: FieldCtxtThreadID, Name: "thread_id", }, { ID: FieldCtxtThreadStartTime, Name: "task_start_time", }, { ID: FieldCtxtThreadGroupID, Name: "group_id", }, { ID: FieldCtxtThreadGroupStartTime, Name: "thread_group_start_time", }, { ID: FieldCtxtContainerID, Name: "container_id", }, { ID: FieldCtxtCredentials, Name: "credentials", }, { ID: FieldCtxtCwd, Name: "cwd", }, { ID: FieldCtxtProcessName, Name: "process_name", }, } // SinkDesc describes a sink that is available to be configured. type SinkDesc struct { // Name is a unique identifier for the sink. Name string // Setup is called outside the protection of the sandbox. This is done to // allow the sink to do whatever is necessary to set it up. If it returns a // file, this file is donated to the sandbox and passed to the sink when New // is called. config is an opaque json object passed to the sink. Setup func(config map[string]any) (*os.File, error) // New creates a new sink. config is an opaque json object passed to the sink. // endpoing is a file descriptor to the file returned in Setup. It's set to -1 // if Setup returned nil. New func(config map[string]any, endpoint *fd.FD) (Sink, error) } // RegisterSink registers a new sink to make it discoverable. func RegisterSink(sink SinkDesc) { if _, ok := Sinks[sink.Name]; ok { panic(fmt.Sprintf("Sink %q already registered", sink.Name)) } Sinks[sink.Name] = sink } // PointDesc describes a Point that is available to be configured. // Schema for these points are defined in pkg/sentry/seccheck/points/. type PointDesc struct { // ID is the point unique identifier. ID Point // Name is the point unique name. Convention is to use the following format: // namespace/name // Examples: container/start, sentry/clone, etc. Name string // OptionalFields is a list of fields that are available in the point, but not // collected unless specified when the Point is configured. // Examples: fd_path, data for read/write Points, etc. OptionalFields []FieldDesc // ContextFields is a list of fields that can be collected from the context, // but are not collected unless specified when the Point is configured. // Examples: container_id, PID, etc. ContextFields []FieldDesc } // FieldDesc describes an optional/context field that is available to be // configured. type FieldDesc struct { // ID is the numeric identifier of the field. ID Field // Name is the unique field name. 
Name string } func registerPoint(pt PointDesc) { if _, ok := Points[pt.Name]; ok { panic(fmt.Sprintf("Point %q already registered", pt.Name)) } if err := validateFields(pt.OptionalFields); err != nil { panic(err) } if err := validateFields(pt.ContextFields); err != nil { panic(err) } Points[pt.Name] = pt } func validateFields(fields []FieldDesc) error { ids := make(map[Field]FieldDesc) names := make(map[string]FieldDesc) for _, f := range fields { if other, ok := names[f.Name]; ok { return fmt.Errorf("field %q has repeated name with field %q", f.Name, other.Name) } if other, ok := ids[f.ID]; ok { return fmt.Errorf("field %q has repeated ID (%d) with field %q", f.Name, f.ID, other.Name) } names[f.Name] = f ids[f.ID] = f } return nil } func addRawSyscallPoint(sysno uintptr) { addSyscallPointHelper(SyscallRawEnter, sysno, fmt.Sprintf("sysno/%d", sysno), nil) } func addSyscallPoint(sysno uintptr, name string, optionalFields []FieldDesc) { addSyscallPointHelper(SyscallEnter, sysno, name, optionalFields) } func addSyscallPointHelper(typ SyscallType, sysno uintptr, name string, optionalFields []FieldDesc) { registerPoint(PointDesc{ ID: GetPointForSyscall(typ, sysno), Name: path.Join("syscall", name, "enter"), OptionalFields: optionalFields, ContextFields: defaultContextFields, }) registerPoint(PointDesc{ ID: GetPointForSyscall(typ+1, sysno), Name: path.Join("syscall", name, "exit"), OptionalFields: optionalFields, ContextFields: defaultContextFields, }) } // genericInit initializes non-architecture-specific Points available in the system. func genericInit() { // Points from the container namespace. registerPoint(PointDesc{ ID: PointContainerStart, Name: "container/start", OptionalFields: []FieldDesc{ { ID: FieldContainerStartEnv, Name: "env", }, }, ContextFields: defaultContextFields, }) // Points from the sentry namespace. registerPoint(PointDesc{ ID: PointClone, Name: "sentry/clone", ContextFields: defaultContextFields, }) registerPoint(PointDesc{ ID: PointExecve, Name: "sentry/execve", OptionalFields: []FieldDesc{ { ID: FieldSentryExecveBinaryInfo, Name: "binary_info", }, }, ContextFields: defaultContextFields, }) registerPoint(PointDesc{ ID: PointExitNotifyParent, Name: "sentry/exit_notify_parent", ContextFields: []FieldDesc{ { ID: FieldCtxtTime, Name: "time", }, { ID: FieldCtxtThreadID, Name: "thread_id", }, { ID: FieldCtxtThreadStartTime, Name: "task_start_time", }, { ID: FieldCtxtThreadGroupID, Name: "group_id", }, { ID: FieldCtxtThreadGroupStartTime, Name: "thread_group_start_time", }, { ID: FieldCtxtContainerID, Name: "container_id", }, { ID: FieldCtxtCredentials, Name: "credentials", }, { ID: FieldCtxtProcessName, Name: "process_name", }, }, }) registerPoint(PointDesc{ ID: PointTaskExit, Name: "sentry/task_exit", ContextFields: defaultContextFields, }) } var initOnce sync.Once // Initialize initializes the Points available in the system. // Must be called prior to using any of them. func Initialize() { initOnce.Do(func() { genericInit() archInit() }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/metadata_amd64.go000066400000000000000000000114451465435605700256270ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package seccheck // archInit registers syscall trace points metadata. // Keep them sorted by syscall number. func archInit() { addSyscallPoint(0, "read", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(1, "write", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(2, "open", nil) addSyscallPoint(3, "close", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(17, "pread64", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(18, "pwrite64", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(19, "readv", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(20, "writev", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(22, "pipe", nil) addSyscallPoint(32, "dup", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(33, "dup2", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(41, "socket", nil) addSyscallPoint(42, "connect", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(43, "accept", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(49, "bind", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(53, "socketpair", nil) addSyscallPoint(56, "clone", nil) addSyscallPoint(57, "fork", nil) addSyscallPoint(58, "vfork", nil) addSyscallPoint(59, "execve", []FieldDesc{ { ID: FieldSyscallExecveEnvv, Name: "envv", }, }) addSyscallPoint(72, "fcntl", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(85, "creat", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(80, "chdir", nil) addSyscallPoint(81, "fchdir", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(105, "setuid", nil) addSyscallPoint(106, "setgid", nil) addSyscallPoint(112, "setsid", nil) addSyscallPoint(117, "setresuid", nil) addSyscallPoint(119, "setresgid", nil) addSyscallPoint(161, "chroot", nil) addSyscallPoint(253, "inotify_init", nil) addSyscallPoint(254, "inotify_add_watch", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(255, "inotify_rm_watch", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(257, "openat", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(282, "signalfd", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(283, "timerfd_create", nil) addSyscallPoint(284, "eventfd", nil) addSyscallPoint(286, "timerfd_settime", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(287, "timerfd_gettime", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(288, "accept4", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(289, "signalfd4", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(290, "eventfd2", nil) addSyscallPoint(292, "dup3", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(293, "pipe2", 
nil) addSyscallPoint(294, "inotify_init1", nil) addSyscallPoint(295, "preadv", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(296, "pwritev", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(302, "prlimit64", nil) addSyscallPoint(322, "execveat", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, { ID: FieldSyscallExecveEnvv, Name: "envv", }, }) addSyscallPoint(327, "preadv2", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(328, "pwritev2", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) const lastSyscallInTable = 441 for i := 0; i <= lastSyscallInTable; i++ { addRawSyscallPoint(uintptr(i)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/metadata_arm64.go000066400000000000000000000104211465435605700256360ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package seccheck // archInit registers syscall trace points metadata. // Keep them sorted by syscall number. func archInit() { addSyscallPoint(19, "eventfd2", nil) addSyscallPoint(23, "dup", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(24, "dup3", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(25, "fcntl", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(26, "inotify_init1", nil) addSyscallPoint(27, "inotify_add_watch", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(28, "inotify_rm_watch", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(49, "chdir", nil) addSyscallPoint(50, "fchdir", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(51, "chroot", nil) addSyscallPoint(56, "openat", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(57, "close", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(59, "pipe2", nil) addSyscallPoint(63, "read", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(64, "write", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(65, "readv", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(66, "writev", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(67, "pread64", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(68, "pwrite64", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(69, "preadv", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(70, "pwritev", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(74, "signalfd4", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(85, "timerfd_create", nil) addSyscallPoint(86, "timerfd_settime", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(87, "timerfd_gettime", []FieldDesc{ { ID: 
FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(144, "setgid", nil) addSyscallPoint(146, "setuid", nil) addSyscallPoint(147, "setresuid", nil) addSyscallPoint(149, "setresgid", nil) addSyscallPoint(157, "setsid", nil) addSyscallPoint(198, "socket", nil) addSyscallPoint(199, "socketpair", nil) addSyscallPoint(200, "bind", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(202, "accept", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(203, "connect", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(220, "clone", nil) addSyscallPoint(221, "execve", []FieldDesc{ { ID: FieldSyscallExecveEnvv, Name: "envv", }, }) addSyscallPoint(242, "accept4", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(261, "prlimit64", nil) addSyscallPoint(281, "execveat", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, { ID: FieldSyscallExecveEnvv, Name: "envv", }, }) addSyscallPoint(286, "preadv2", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) addSyscallPoint(287, "pwritev2", []FieldDesc{ { ID: FieldSyscallPath, Name: "fd_path", }, }) const lastSyscallInTable = 441 for i := 0; i <= lastSyscallInTable; i++ { addRawSyscallPoint(uintptr(i)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/points/000077500000000000000000000000001465435605700240345ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/points/points_go_proto/000077500000000000000000000000001465435605700272605ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/points/points_go_proto/common.pb.go000066400000000000000000000645451465435605700315150ustar00rootroot00000000000000// Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/sentry/seccheck/points/common.proto package points_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. 
_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type MessageType int32 const ( MessageType_MESSAGE_UNKNOWN MessageType = 0 MessageType_MESSAGE_CONTAINER_START MessageType = 1 MessageType_MESSAGE_SENTRY_CLONE MessageType = 2 MessageType_MESSAGE_SENTRY_EXEC MessageType = 3 MessageType_MESSAGE_SENTRY_EXIT_NOTIFY_PARENT MessageType = 4 MessageType_MESSAGE_SENTRY_TASK_EXIT MessageType = 5 MessageType_MESSAGE_SYSCALL_RAW MessageType = 6 MessageType_MESSAGE_SYSCALL_OPEN MessageType = 7 MessageType_MESSAGE_SYSCALL_CLOSE MessageType = 8 MessageType_MESSAGE_SYSCALL_READ MessageType = 9 MessageType_MESSAGE_SYSCALL_CONNECT MessageType = 10 MessageType_MESSAGE_SYSCALL_EXECVE MessageType = 11 MessageType_MESSAGE_SYSCALL_SOCKET MessageType = 12 MessageType_MESSAGE_SYSCALL_CHDIR MessageType = 13 MessageType_MESSAGE_SYSCALL_SETID MessageType = 14 MessageType_MESSAGE_SYSCALL_SETRESID MessageType = 15 MessageType_MESSAGE_SYSCALL_PRLIMIT64 MessageType = 16 MessageType_MESSAGE_SYSCALL_PIPE MessageType = 17 MessageType_MESSAGE_SYSCALL_FCNTL MessageType = 18 MessageType_MESSAGE_SYSCALL_DUP MessageType = 19 MessageType_MESSAGE_SYSCALL_SIGNALFD MessageType = 20 MessageType_MESSAGE_SYSCALL_CHROOT MessageType = 21 MessageType_MESSAGE_SYSCALL_EVENTFD MessageType = 22 MessageType_MESSAGE_SYSCALL_CLONE MessageType = 23 MessageType_MESSAGE_SYSCALL_BIND MessageType = 24 MessageType_MESSAGE_SYSCALL_ACCEPT MessageType = 25 MessageType_MESSAGE_SYSCALL_TIMERFD_CREATE MessageType = 26 MessageType_MESSAGE_SYSCALL_TIMERFD_SETTIME MessageType = 27 MessageType_MESSAGE_SYSCALL_TIMERFD_GETTIME MessageType = 28 MessageType_MESSAGE_SYSCALL_FORK MessageType = 29 MessageType_MESSAGE_SYSCALL_INOTIFY_INIT MessageType = 30 MessageType_MESSAGE_SYSCALL_INOTIFY_ADD_WATCH MessageType = 31 MessageType_MESSAGE_SYSCALL_INOTIFY_RM_WATCH MessageType = 32 MessageType_MESSAGE_SYSCALL_SOCKETPAIR MessageType = 33 MessageType_MESSAGE_SYSCALL_WRITE MessageType = 34 ) // Enum value maps for MessageType. 
var ( MessageType_name = map[int32]string{ 0: "MESSAGE_UNKNOWN", 1: "MESSAGE_CONTAINER_START", 2: "MESSAGE_SENTRY_CLONE", 3: "MESSAGE_SENTRY_EXEC", 4: "MESSAGE_SENTRY_EXIT_NOTIFY_PARENT", 5: "MESSAGE_SENTRY_TASK_EXIT", 6: "MESSAGE_SYSCALL_RAW", 7: "MESSAGE_SYSCALL_OPEN", 8: "MESSAGE_SYSCALL_CLOSE", 9: "MESSAGE_SYSCALL_READ", 10: "MESSAGE_SYSCALL_CONNECT", 11: "MESSAGE_SYSCALL_EXECVE", 12: "MESSAGE_SYSCALL_SOCKET", 13: "MESSAGE_SYSCALL_CHDIR", 14: "MESSAGE_SYSCALL_SETID", 15: "MESSAGE_SYSCALL_SETRESID", 16: "MESSAGE_SYSCALL_PRLIMIT64", 17: "MESSAGE_SYSCALL_PIPE", 18: "MESSAGE_SYSCALL_FCNTL", 19: "MESSAGE_SYSCALL_DUP", 20: "MESSAGE_SYSCALL_SIGNALFD", 21: "MESSAGE_SYSCALL_CHROOT", 22: "MESSAGE_SYSCALL_EVENTFD", 23: "MESSAGE_SYSCALL_CLONE", 24: "MESSAGE_SYSCALL_BIND", 25: "MESSAGE_SYSCALL_ACCEPT", 26: "MESSAGE_SYSCALL_TIMERFD_CREATE", 27: "MESSAGE_SYSCALL_TIMERFD_SETTIME", 28: "MESSAGE_SYSCALL_TIMERFD_GETTIME", 29: "MESSAGE_SYSCALL_FORK", 30: "MESSAGE_SYSCALL_INOTIFY_INIT", 31: "MESSAGE_SYSCALL_INOTIFY_ADD_WATCH", 32: "MESSAGE_SYSCALL_INOTIFY_RM_WATCH", 33: "MESSAGE_SYSCALL_SOCKETPAIR", 34: "MESSAGE_SYSCALL_WRITE", } MessageType_value = map[string]int32{ "MESSAGE_UNKNOWN": 0, "MESSAGE_CONTAINER_START": 1, "MESSAGE_SENTRY_CLONE": 2, "MESSAGE_SENTRY_EXEC": 3, "MESSAGE_SENTRY_EXIT_NOTIFY_PARENT": 4, "MESSAGE_SENTRY_TASK_EXIT": 5, "MESSAGE_SYSCALL_RAW": 6, "MESSAGE_SYSCALL_OPEN": 7, "MESSAGE_SYSCALL_CLOSE": 8, "MESSAGE_SYSCALL_READ": 9, "MESSAGE_SYSCALL_CONNECT": 10, "MESSAGE_SYSCALL_EXECVE": 11, "MESSAGE_SYSCALL_SOCKET": 12, "MESSAGE_SYSCALL_CHDIR": 13, "MESSAGE_SYSCALL_SETID": 14, "MESSAGE_SYSCALL_SETRESID": 15, "MESSAGE_SYSCALL_PRLIMIT64": 16, "MESSAGE_SYSCALL_PIPE": 17, "MESSAGE_SYSCALL_FCNTL": 18, "MESSAGE_SYSCALL_DUP": 19, "MESSAGE_SYSCALL_SIGNALFD": 20, "MESSAGE_SYSCALL_CHROOT": 21, "MESSAGE_SYSCALL_EVENTFD": 22, "MESSAGE_SYSCALL_CLONE": 23, "MESSAGE_SYSCALL_BIND": 24, "MESSAGE_SYSCALL_ACCEPT": 25, "MESSAGE_SYSCALL_TIMERFD_CREATE": 26, "MESSAGE_SYSCALL_TIMERFD_SETTIME": 27, "MESSAGE_SYSCALL_TIMERFD_GETTIME": 28, "MESSAGE_SYSCALL_FORK": 29, "MESSAGE_SYSCALL_INOTIFY_INIT": 30, "MESSAGE_SYSCALL_INOTIFY_ADD_WATCH": 31, "MESSAGE_SYSCALL_INOTIFY_RM_WATCH": 32, "MESSAGE_SYSCALL_SOCKETPAIR": 33, "MESSAGE_SYSCALL_WRITE": 34, } ) func (x MessageType) Enum() *MessageType { p := new(MessageType) *p = x return p } func (x MessageType) String() string { return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) } func (MessageType) Descriptor() protoreflect.EnumDescriptor { return file_pkg_sentry_seccheck_points_common_proto_enumTypes[0].Descriptor() } func (MessageType) Type() protoreflect.EnumType { return &file_pkg_sentry_seccheck_points_common_proto_enumTypes[0] } func (x MessageType) Number() protoreflect.EnumNumber { return protoreflect.EnumNumber(x) } // Deprecated: Use MessageType.Descriptor instead. 
func (MessageType) EnumDescriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_common_proto_rawDescGZIP(), []int{0} } type Handshake struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Version uint32 `protobuf:"varint,1,opt,name=version,proto3" json:"version,omitempty"` } func (x *Handshake) Reset() { *x = Handshake{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_common_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Handshake) String() string { return protoimpl.X.MessageStringOf(x) } func (*Handshake) ProtoMessage() {} func (x *Handshake) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_common_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Handshake.ProtoReflect.Descriptor instead. func (*Handshake) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_common_proto_rawDescGZIP(), []int{0} } func (x *Handshake) GetVersion() uint32 { if x != nil { return x.Version } return 0 } type Credentials struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields RealUid uint32 `protobuf:"varint,1,opt,name=real_uid,json=realUid,proto3" json:"real_uid,omitempty"` EffectiveUid uint32 `protobuf:"varint,2,opt,name=effective_uid,json=effectiveUid,proto3" json:"effective_uid,omitempty"` SavedUid uint32 `protobuf:"varint,3,opt,name=saved_uid,json=savedUid,proto3" json:"saved_uid,omitempty"` RealGid uint32 `protobuf:"varint,4,opt,name=real_gid,json=realGid,proto3" json:"real_gid,omitempty"` EffectiveGid uint32 `protobuf:"varint,5,opt,name=effective_gid,json=effectiveGid,proto3" json:"effective_gid,omitempty"` SavedGid uint32 `protobuf:"varint,6,opt,name=saved_gid,json=savedGid,proto3" json:"saved_gid,omitempty"` } func (x *Credentials) Reset() { *x = Credentials{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_common_proto_msgTypes[1] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Credentials) String() string { return protoimpl.X.MessageStringOf(x) } func (*Credentials) ProtoMessage() {} func (x *Credentials) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_common_proto_msgTypes[1] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Credentials.ProtoReflect.Descriptor instead. 
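// Editor's note (illustrative, not part of the generated code): the generated
// message types can be serialized with the standard
// google.golang.org/protobuf/proto package. A minimal sketch, assuming the
// package import path from the repository layout and an arbitrary version
// value of 1 for illustration only:
//
//	package main
//
//	import (
//		"fmt"
//
//		"google.golang.org/protobuf/proto"
//		pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
//	)
//
//	func main() {
//		// Handshake carries a single version field (see the struct above).
//		hs := &pb.Handshake{Version: 1}
//		raw, err := proto.Marshal(hs)
//		if err != nil {
//			panic(err)
//		}
//		fmt.Printf("marshaled Handshake: %d bytes\n", len(raw))
//	}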
func (*Credentials) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_common_proto_rawDescGZIP(), []int{1} } func (x *Credentials) GetRealUid() uint32 { if x != nil { return x.RealUid } return 0 } func (x *Credentials) GetEffectiveUid() uint32 { if x != nil { return x.EffectiveUid } return 0 } func (x *Credentials) GetSavedUid() uint32 { if x != nil { return x.SavedUid } return 0 } func (x *Credentials) GetRealGid() uint32 { if x != nil { return x.RealGid } return 0 } func (x *Credentials) GetEffectiveGid() uint32 { if x != nil { return x.EffectiveGid } return 0 } func (x *Credentials) GetSavedGid() uint32 { if x != nil { return x.SavedGid } return 0 } type ContextData struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields TimeNs int64 `protobuf:"varint,1,opt,name=time_ns,json=timeNs,proto3" json:"time_ns,omitempty"` ThreadId int32 `protobuf:"varint,2,opt,name=thread_id,json=threadId,proto3" json:"thread_id,omitempty"` ThreadStartTimeNs int64 `protobuf:"varint,3,opt,name=thread_start_time_ns,json=threadStartTimeNs,proto3" json:"thread_start_time_ns,omitempty"` ThreadGroupId int32 `protobuf:"varint,4,opt,name=thread_group_id,json=threadGroupId,proto3" json:"thread_group_id,omitempty"` ThreadGroupStartTimeNs int64 `protobuf:"varint,5,opt,name=thread_group_start_time_ns,json=threadGroupStartTimeNs,proto3" json:"thread_group_start_time_ns,omitempty"` ContainerId string `protobuf:"bytes,6,opt,name=container_id,json=containerId,proto3" json:"container_id,omitempty"` Credentials *Credentials `protobuf:"bytes,7,opt,name=credentials,proto3" json:"credentials,omitempty"` Cwd string `protobuf:"bytes,8,opt,name=cwd,proto3" json:"cwd,omitempty"` ProcessName string `protobuf:"bytes,9,opt,name=process_name,json=processName,proto3" json:"process_name,omitempty"` } func (x *ContextData) Reset() { *x = ContextData{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_common_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *ContextData) String() string { return protoimpl.X.MessageStringOf(x) } func (*ContextData) ProtoMessage() {} func (x *ContextData) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_common_proto_msgTypes[2] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use ContextData.ProtoReflect.Descriptor instead. 
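// Editor's note (illustrative, not part of the generated code): the generated
// getters check the receiver for nil and return zero values, so optional
// sub-messages such as ContextData.credentials can be read without explicit
// nil checks. A small self-contained sketch (import path assumed from the
// repository layout):
//
//	package main
//
//	import (
//		"fmt"
//
//		pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
//	)
//
//	func main() {
//		var creds *pb.Credentials
//		fmt.Println(creds.GetRealUid()) // prints 0: safe even though creds is nil
//
//		ctx := &pb.ContextData{ContainerId: "example"}
//		// GetCredentials returns nil here, and the chained getter returns 0.
//		fmt.Println(ctx.GetCredentials().GetEffectiveUid())
//	}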
func (*ContextData) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_common_proto_rawDescGZIP(), []int{2} } func (x *ContextData) GetTimeNs() int64 { if x != nil { return x.TimeNs } return 0 } func (x *ContextData) GetThreadId() int32 { if x != nil { return x.ThreadId } return 0 } func (x *ContextData) GetThreadStartTimeNs() int64 { if x != nil { return x.ThreadStartTimeNs } return 0 } func (x *ContextData) GetThreadGroupId() int32 { if x != nil { return x.ThreadGroupId } return 0 } func (x *ContextData) GetThreadGroupStartTimeNs() int64 { if x != nil { return x.ThreadGroupStartTimeNs } return 0 } func (x *ContextData) GetContainerId() string { if x != nil { return x.ContainerId } return "" } func (x *ContextData) GetCredentials() *Credentials { if x != nil { return x.Credentials } return nil } func (x *ContextData) GetCwd() string { if x != nil { return x.Cwd } return "" } func (x *ContextData) GetProcessName() string { if x != nil { return x.ProcessName } return "" } var File_pkg_sentry_seccheck_points_common_proto protoreflect.FileDescriptor var file_pkg_sentry_seccheck_points_common_proto_rawDesc = []byte{ 0x0a, 0x27, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x73, 0x65, 0x63, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x2f, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x73, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x0d, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x22, 0x25, 0x0a, 0x09, 0x48, 0x61, 0x6e, 0x64, 0x73, 0x68, 0x61, 0x6b, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x22, 0xc7, 0x01, 0x0a, 0x0b, 0x43, 0x72, 0x65, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x73, 0x12, 0x19, 0x0a, 0x08, 0x72, 0x65, 0x61, 0x6c, 0x5f, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x72, 0x65, 0x61, 0x6c, 0x55, 0x69, 0x64, 0x12, 0x23, 0x0a, 0x0d, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74, 0x69, 0x76, 0x65, 0x5f, 0x75, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0c, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74, 0x69, 0x76, 0x65, 0x55, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x61, 0x76, 0x65, 0x64, 0x5f, 0x75, 0x69, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x61, 0x76, 0x65, 0x64, 0x55, 0x69, 0x64, 0x12, 0x19, 0x0a, 0x08, 0x72, 0x65, 0x61, 0x6c, 0x5f, 0x67, 0x69, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x72, 0x65, 0x61, 0x6c, 0x47, 0x69, 0x64, 0x12, 0x23, 0x0a, 0x0d, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74, 0x69, 0x76, 0x65, 0x5f, 0x67, 0x69, 0x64, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0c, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74, 0x69, 0x76, 0x65, 0x47, 0x69, 0x64, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x61, 0x76, 0x65, 0x64, 0x5f, 0x67, 0x69, 0x64, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x08, 0x73, 0x61, 0x76, 0x65, 0x64, 0x47, 0x69, 0x64, 0x22, 0xee, 0x02, 0x0a, 0x0b, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x17, 0x0a, 0x07, 0x74, 0x69, 0x6d, 0x65, 0x5f, 0x6e, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x03, 0x52, 0x06, 0x74, 0x69, 0x6d, 0x65, 0x4e, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x08, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x12, 0x2f, 0x0a, 0x14, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x5f, 0x73, 0x74, 0x61, 0x72, 0x74, 0x5f, 0x74, 0x69, 0x6d, 0x65, 0x5f, 0x6e, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x03, 0x52, 0x11, 0x74, 0x68, 
0x72, 0x65, 0x61, 0x64, 0x53, 0x74, 0x61, 0x72, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x4e, 0x73, 0x12, 0x26, 0x0a, 0x0f, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0d, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x49, 0x64, 0x12, 0x3a, 0x0a, 0x1a, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x73, 0x74, 0x61, 0x72, 0x74, 0x5f, 0x74, 0x69, 0x6d, 0x65, 0x5f, 0x6e, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x03, 0x52, 0x16, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x53, 0x74, 0x61, 0x72, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x4e, 0x73, 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x5f, 0x69, 0x64, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x49, 0x64, 0x12, 0x3c, 0x0a, 0x0b, 0x63, 0x72, 0x65, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x73, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x72, 0x65, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x73, 0x52, 0x0b, 0x63, 0x72, 0x65, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x73, 0x12, 0x10, 0x0a, 0x03, 0x63, 0x77, 0x64, 0x18, 0x08, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x63, 0x77, 0x64, 0x12, 0x21, 0x0a, 0x0c, 0x70, 0x72, 0x6f, 0x63, 0x65, 0x73, 0x73, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x09, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x70, 0x72, 0x6f, 0x63, 0x65, 0x73, 0x73, 0x4e, 0x61, 0x6d, 0x65, 0x2a, 0x8f, 0x08, 0x0a, 0x0b, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x54, 0x79, 0x70, 0x65, 0x12, 0x13, 0x0a, 0x0f, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x1b, 0x0a, 0x17, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x43, 0x4f, 0x4e, 0x54, 0x41, 0x49, 0x4e, 0x45, 0x52, 0x5f, 0x53, 0x54, 0x41, 0x52, 0x54, 0x10, 0x01, 0x12, 0x18, 0x0a, 0x14, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x45, 0x4e, 0x54, 0x52, 0x59, 0x5f, 0x43, 0x4c, 0x4f, 0x4e, 0x45, 0x10, 0x02, 0x12, 0x17, 0x0a, 0x13, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x45, 0x4e, 0x54, 0x52, 0x59, 0x5f, 0x45, 0x58, 0x45, 0x43, 0x10, 0x03, 0x12, 0x25, 0x0a, 0x21, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x45, 0x4e, 0x54, 0x52, 0x59, 0x5f, 0x45, 0x58, 0x49, 0x54, 0x5f, 0x4e, 0x4f, 0x54, 0x49, 0x46, 0x59, 0x5f, 0x50, 0x41, 0x52, 0x45, 0x4e, 0x54, 0x10, 0x04, 0x12, 0x1c, 0x0a, 0x18, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x45, 0x4e, 0x54, 0x52, 0x59, 0x5f, 0x54, 0x41, 0x53, 0x4b, 0x5f, 0x45, 0x58, 0x49, 0x54, 0x10, 0x05, 0x12, 0x17, 0x0a, 0x13, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x52, 0x41, 0x57, 0x10, 0x06, 0x12, 0x18, 0x0a, 0x14, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x4f, 0x50, 0x45, 0x4e, 0x10, 0x07, 0x12, 0x19, 0x0a, 0x15, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x43, 0x4c, 0x4f, 0x53, 0x45, 0x10, 0x08, 0x12, 0x18, 0x0a, 0x14, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x52, 0x45, 0x41, 0x44, 0x10, 0x09, 0x12, 0x1b, 0x0a, 0x17, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x4e, 0x45, 0x43, 0x54, 0x10, 0x0a, 0x12, 0x1a, 0x0a, 0x16, 0x4d, 0x45, 0x53, 0x53, 
0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x45, 0x58, 0x45, 0x43, 0x56, 0x45, 0x10, 0x0b, 0x12, 0x1a, 0x0a, 0x16, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x53, 0x4f, 0x43, 0x4b, 0x45, 0x54, 0x10, 0x0c, 0x12, 0x19, 0x0a, 0x15, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x43, 0x48, 0x44, 0x49, 0x52, 0x10, 0x0d, 0x12, 0x19, 0x0a, 0x15, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x53, 0x45, 0x54, 0x49, 0x44, 0x10, 0x0e, 0x12, 0x1c, 0x0a, 0x18, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x53, 0x45, 0x54, 0x52, 0x45, 0x53, 0x49, 0x44, 0x10, 0x0f, 0x12, 0x1d, 0x0a, 0x19, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x50, 0x52, 0x4c, 0x49, 0x4d, 0x49, 0x54, 0x36, 0x34, 0x10, 0x10, 0x12, 0x18, 0x0a, 0x14, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x50, 0x49, 0x50, 0x45, 0x10, 0x11, 0x12, 0x19, 0x0a, 0x15, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x46, 0x43, 0x4e, 0x54, 0x4c, 0x10, 0x12, 0x12, 0x17, 0x0a, 0x13, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x44, 0x55, 0x50, 0x10, 0x13, 0x12, 0x1c, 0x0a, 0x18, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x53, 0x49, 0x47, 0x4e, 0x41, 0x4c, 0x46, 0x44, 0x10, 0x14, 0x12, 0x1a, 0x0a, 0x16, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x43, 0x48, 0x52, 0x4f, 0x4f, 0x54, 0x10, 0x15, 0x12, 0x1b, 0x0a, 0x17, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x45, 0x56, 0x45, 0x4e, 0x54, 0x46, 0x44, 0x10, 0x16, 0x12, 0x19, 0x0a, 0x15, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x43, 0x4c, 0x4f, 0x4e, 0x45, 0x10, 0x17, 0x12, 0x18, 0x0a, 0x14, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x42, 0x49, 0x4e, 0x44, 0x10, 0x18, 0x12, 0x1a, 0x0a, 0x16, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x41, 0x43, 0x43, 0x45, 0x50, 0x54, 0x10, 0x19, 0x12, 0x22, 0x0a, 0x1e, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x52, 0x46, 0x44, 0x5f, 0x43, 0x52, 0x45, 0x41, 0x54, 0x45, 0x10, 0x1a, 0x12, 0x23, 0x0a, 0x1f, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x52, 0x46, 0x44, 0x5f, 0x53, 0x45, 0x54, 0x54, 0x49, 0x4d, 0x45, 0x10, 0x1b, 0x12, 0x23, 0x0a, 0x1f, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x52, 0x46, 0x44, 0x5f, 0x47, 0x45, 0x54, 0x54, 0x49, 0x4d, 0x45, 0x10, 0x1c, 0x12, 0x18, 0x0a, 0x14, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x46, 0x4f, 0x52, 0x4b, 0x10, 0x1d, 0x12, 0x20, 0x0a, 0x1c, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x49, 0x4e, 0x4f, 0x54, 0x49, 0x46, 0x59, 0x5f, 0x49, 0x4e, 0x49, 0x54, 0x10, 0x1e, 0x12, 0x25, 0x0a, 0x21, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 
0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x49, 0x4e, 0x4f, 0x54, 0x49, 0x46, 0x59, 0x5f, 0x41, 0x44, 0x44, 0x5f, 0x57, 0x41, 0x54, 0x43, 0x48, 0x10, 0x1f, 0x12, 0x24, 0x0a, 0x20, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x49, 0x4e, 0x4f, 0x54, 0x49, 0x46, 0x59, 0x5f, 0x52, 0x4d, 0x5f, 0x57, 0x41, 0x54, 0x43, 0x48, 0x10, 0x20, 0x12, 0x1e, 0x0a, 0x1a, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x53, 0x4f, 0x43, 0x4b, 0x45, 0x54, 0x50, 0x41, 0x49, 0x52, 0x10, 0x21, 0x12, 0x19, 0x0a, 0x15, 0x4d, 0x45, 0x53, 0x53, 0x41, 0x47, 0x45, 0x5f, 0x53, 0x59, 0x53, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x57, 0x52, 0x49, 0x54, 0x45, 0x10, 0x22, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_sentry_seccheck_points_common_proto_rawDescOnce sync.Once file_pkg_sentry_seccheck_points_common_proto_rawDescData = file_pkg_sentry_seccheck_points_common_proto_rawDesc ) func file_pkg_sentry_seccheck_points_common_proto_rawDescGZIP() []byte { file_pkg_sentry_seccheck_points_common_proto_rawDescOnce.Do(func() { file_pkg_sentry_seccheck_points_common_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_sentry_seccheck_points_common_proto_rawDescData) }) return file_pkg_sentry_seccheck_points_common_proto_rawDescData } var file_pkg_sentry_seccheck_points_common_proto_enumTypes = make([]protoimpl.EnumInfo, 1) var file_pkg_sentry_seccheck_points_common_proto_msgTypes = make([]protoimpl.MessageInfo, 3) var file_pkg_sentry_seccheck_points_common_proto_goTypes = []interface{}{ (MessageType)(0), // 0: gvisor.common.MessageType (*Handshake)(nil), // 1: gvisor.common.Handshake (*Credentials)(nil), // 2: gvisor.common.Credentials (*ContextData)(nil), // 3: gvisor.common.ContextData } var file_pkg_sentry_seccheck_points_common_proto_depIdxs = []int32{ 2, // 0: gvisor.common.ContextData.credentials:type_name -> gvisor.common.Credentials 1, // [1:1] is the sub-list for method output_type 1, // [1:1] is the sub-list for method input_type 1, // [1:1] is the sub-list for extension type_name 1, // [1:1] is the sub-list for extension extendee 0, // [0:1] is the sub-list for field type_name } func init() { file_pkg_sentry_seccheck_points_common_proto_init() } func file_pkg_sentry_seccheck_points_common_proto_init() { if File_pkg_sentry_seccheck_points_common_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_sentry_seccheck_points_common_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Handshake); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_common_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Credentials); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_common_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*ContextData); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_sentry_seccheck_points_common_proto_rawDesc, NumEnums: 1, NumMessages: 3, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_sentry_seccheck_points_common_proto_goTypes, DependencyIndexes: 
file_pkg_sentry_seccheck_points_common_proto_depIdxs, EnumInfos: file_pkg_sentry_seccheck_points_common_proto_enumTypes, MessageInfos: file_pkg_sentry_seccheck_points_common_proto_msgTypes, }.Build() File_pkg_sentry_seccheck_points_common_proto = out.File file_pkg_sentry_seccheck_points_common_proto_rawDesc = nil file_pkg_sentry_seccheck_points_common_proto_goTypes = nil file_pkg_sentry_seccheck_points_common_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/points/points_go_proto/container.pb.go000066400000000000000000000160541465435605700321770ustar00rootroot00000000000000// Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/sentry/seccheck/points/container.proto package points_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type Start struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Id string `protobuf:"bytes,2,opt,name=id,proto3" json:"id,omitempty"` Cwd string `protobuf:"bytes,3,opt,name=cwd,proto3" json:"cwd,omitempty"` Args []string `protobuf:"bytes,4,rep,name=args,proto3" json:"args,omitempty"` Env []string `protobuf:"bytes,5,rep,name=env,proto3" json:"env,omitempty"` Terminal bool `protobuf:"varint,6,opt,name=terminal,proto3" json:"terminal,omitempty"` } func (x *Start) Reset() { *x = Start{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_container_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Start) String() string { return protoimpl.X.MessageStringOf(x) } func (*Start) ProtoMessage() {} func (x *Start) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_container_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Start.ProtoReflect.Descriptor instead. 
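// Editor's note (illustrative, not part of the generated code): a minimal
// sketch of building the container Start point defined above. The import
// path and the concrete field values are assumptions for the example only.
//
//	package main
//
//	import (
//		"fmt"
//
//		"google.golang.org/protobuf/proto"
//		pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
//	)
//
//	func main() {
//		start := &pb.Start{
//			ContextData: &pb.ContextData{ContainerId: "abc123"},
//			Id:          "abc123",
//			Cwd:         "/",
//			Args:        []string{"/bin/sh"},
//			Terminal:    true,
//		}
//		raw, err := proto.Marshal(start)
//		if err != nil {
//			panic(err)
//		}
//		fmt.Printf("container start point: %d bytes\n", len(raw))
//	}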
func (*Start) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_container_proto_rawDescGZIP(), []int{0} } func (x *Start) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Start) GetId() string { if x != nil { return x.Id } return "" } func (x *Start) GetCwd() string { if x != nil { return x.Cwd } return "" } func (x *Start) GetArgs() []string { if x != nil { return x.Args } return nil } func (x *Start) GetEnv() []string { if x != nil { return x.Env } return nil } func (x *Start) GetTerminal() bool { if x != nil { return x.Terminal } return false } var File_pkg_sentry_seccheck_points_container_proto protoreflect.FileDescriptor var file_pkg_sentry_seccheck_points_container_proto_rawDesc = []byte{ 0x0a, 0x2a, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x73, 0x65, 0x63, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x2f, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x73, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x10, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x1a, 0x27, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x73, 0x65, 0x63, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x2f, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x73, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xaa, 0x01, 0x0a, 0x05, 0x53, 0x74, 0x61, 0x72, 0x74, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x10, 0x0a, 0x03, 0x63, 0x77, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x63, 0x77, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x61, 0x72, 0x67, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x09, 0x52, 0x04, 0x61, 0x72, 0x67, 0x73, 0x12, 0x10, 0x0a, 0x03, 0x65, 0x6e, 0x76, 0x18, 0x05, 0x20, 0x03, 0x28, 0x09, 0x52, 0x03, 0x65, 0x6e, 0x76, 0x12, 0x1a, 0x0a, 0x08, 0x74, 0x65, 0x72, 0x6d, 0x69, 0x6e, 0x61, 0x6c, 0x18, 0x06, 0x20, 0x01, 0x28, 0x08, 0x52, 0x08, 0x74, 0x65, 0x72, 0x6d, 0x69, 0x6e, 0x61, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_sentry_seccheck_points_container_proto_rawDescOnce sync.Once file_pkg_sentry_seccheck_points_container_proto_rawDescData = file_pkg_sentry_seccheck_points_container_proto_rawDesc ) func file_pkg_sentry_seccheck_points_container_proto_rawDescGZIP() []byte { file_pkg_sentry_seccheck_points_container_proto_rawDescOnce.Do(func() { file_pkg_sentry_seccheck_points_container_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_sentry_seccheck_points_container_proto_rawDescData) }) return file_pkg_sentry_seccheck_points_container_proto_rawDescData } var file_pkg_sentry_seccheck_points_container_proto_msgTypes = make([]protoimpl.MessageInfo, 1) var file_pkg_sentry_seccheck_points_container_proto_goTypes = []interface{}{ (*Start)(nil), // 0: gvisor.container.Start (*ContextData)(nil), // 1: gvisor.common.ContextData } var file_pkg_sentry_seccheck_points_container_proto_depIdxs = []int32{ 1, // 0: gvisor.container.Start.context_data:type_name -> gvisor.common.ContextData 1, // [1:1] is the sub-list for method output_type 1, // [1:1] is the sub-list for method 
input_type 1, // [1:1] is the sub-list for extension type_name 1, // [1:1] is the sub-list for extension extendee 0, // [0:1] is the sub-list for field type_name } func init() { file_pkg_sentry_seccheck_points_container_proto_init() } func file_pkg_sentry_seccheck_points_container_proto_init() { if File_pkg_sentry_seccheck_points_container_proto != nil { return } file_pkg_sentry_seccheck_points_common_proto_init() if !protoimpl.UnsafeEnabled { file_pkg_sentry_seccheck_points_container_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Start); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_sentry_seccheck_points_container_proto_rawDesc, NumEnums: 0, NumMessages: 1, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_sentry_seccheck_points_container_proto_goTypes, DependencyIndexes: file_pkg_sentry_seccheck_points_container_proto_depIdxs, MessageInfos: file_pkg_sentry_seccheck_points_container_proto_msgTypes, }.Build() File_pkg_sentry_seccheck_points_container_proto = out.File file_pkg_sentry_seccheck_points_container_proto_rawDesc = nil file_pkg_sentry_seccheck_points_container_proto_goTypes = nil file_pkg_sentry_seccheck_points_container_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/points/points_go_proto/sentry.pb.go000066400000000000000000000431571465435605700315450ustar00rootroot00000000000000// Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/sentry/seccheck/points/sentry.proto package points_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. 
_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type CloneInfo struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` CreatedThreadId int32 `protobuf:"varint,3,opt,name=created_thread_id,json=createdThreadId,proto3" json:"created_thread_id,omitempty"` CreatedThreadGroupId int32 `protobuf:"varint,4,opt,name=created_thread_group_id,json=createdThreadGroupId,proto3" json:"created_thread_group_id,omitempty"` CreatedThreadStartTimeNs int64 `protobuf:"varint,5,opt,name=created_thread_start_time_ns,json=createdThreadStartTimeNs,proto3" json:"created_thread_start_time_ns,omitempty"` Flags uint64 `protobuf:"varint,6,opt,name=flags,proto3" json:"flags,omitempty"` } func (x *CloneInfo) Reset() { *x = CloneInfo{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_sentry_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *CloneInfo) String() string { return protoimpl.X.MessageStringOf(x) } func (*CloneInfo) ProtoMessage() {} func (x *CloneInfo) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_sentry_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use CloneInfo.ProtoReflect.Descriptor instead. func (*CloneInfo) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_sentry_proto_rawDescGZIP(), []int{0} } func (x *CloneInfo) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *CloneInfo) GetCreatedThreadId() int32 { if x != nil { return x.CreatedThreadId } return 0 } func (x *CloneInfo) GetCreatedThreadGroupId() int32 { if x != nil { return x.CreatedThreadGroupId } return 0 } func (x *CloneInfo) GetCreatedThreadStartTimeNs() int64 { if x != nil { return x.CreatedThreadStartTimeNs } return 0 } func (x *CloneInfo) GetFlags() uint64 { if x != nil { return x.Flags } return 0 } type ExecveInfo struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` BinaryPath string `protobuf:"bytes,2,opt,name=binary_path,json=binaryPath,proto3" json:"binary_path,omitempty"` Argv []string `protobuf:"bytes,3,rep,name=argv,proto3" json:"argv,omitempty"` Env []string `protobuf:"bytes,4,rep,name=env,proto3" json:"env,omitempty"` BinaryMode uint32 `protobuf:"varint,5,opt,name=binary_mode,json=binaryMode,proto3" json:"binary_mode,omitempty"` BinaryUid uint32 `protobuf:"varint,6,opt,name=binary_uid,json=binaryUid,proto3" json:"binary_uid,omitempty"` BinaryGid uint32 `protobuf:"varint,7,opt,name=binary_gid,json=binaryGid,proto3" json:"binary_gid,omitempty"` BinarySha256 []byte `protobuf:"bytes,8,opt,name=binary_sha256,json=binarySha256,proto3" json:"binary_sha256,omitempty"` } func (x *ExecveInfo) Reset() { *x = ExecveInfo{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_sentry_proto_msgTypes[1] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *ExecveInfo) String() string { return protoimpl.X.MessageStringOf(x) } func (*ExecveInfo) ProtoMessage() {} func (x *ExecveInfo) ProtoReflect() 
protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_sentry_proto_msgTypes[1] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use ExecveInfo.ProtoReflect.Descriptor instead. func (*ExecveInfo) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_sentry_proto_rawDescGZIP(), []int{1} } func (x *ExecveInfo) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *ExecveInfo) GetBinaryPath() string { if x != nil { return x.BinaryPath } return "" } func (x *ExecveInfo) GetArgv() []string { if x != nil { return x.Argv } return nil } func (x *ExecveInfo) GetEnv() []string { if x != nil { return x.Env } return nil } func (x *ExecveInfo) GetBinaryMode() uint32 { if x != nil { return x.BinaryMode } return 0 } func (x *ExecveInfo) GetBinaryUid() uint32 { if x != nil { return x.BinaryUid } return 0 } func (x *ExecveInfo) GetBinaryGid() uint32 { if x != nil { return x.BinaryGid } return 0 } func (x *ExecveInfo) GetBinarySha256() []byte { if x != nil { return x.BinarySha256 } return nil } type ExitNotifyParentInfo struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` ExitStatus int32 `protobuf:"varint,2,opt,name=exit_status,json=exitStatus,proto3" json:"exit_status,omitempty"` } func (x *ExitNotifyParentInfo) Reset() { *x = ExitNotifyParentInfo{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_sentry_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *ExitNotifyParentInfo) String() string { return protoimpl.X.MessageStringOf(x) } func (*ExitNotifyParentInfo) ProtoMessage() {} func (x *ExitNotifyParentInfo) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_sentry_proto_msgTypes[2] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use ExitNotifyParentInfo.ProtoReflect.Descriptor instead. 
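// Editor's note (illustrative, not part of the generated code): a sketch of
// reading the ExecveInfo fields defined above, e.g. after receiving a sentry
// exec point. The import path and the field values are assumptions for the
// example; the raw bytes would normally come from whatever transport
// delivers the point.
//
//	package main
//
//	import (
//		"encoding/hex"
//		"fmt"
//
//		"google.golang.org/protobuf/proto"
//		pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
//	)
//
//	func main() {
//		raw, err := proto.Marshal(&pb.ExecveInfo{
//			BinaryPath: "/bin/true",
//			Argv:       []string{"true"},
//		})
//		if err != nil {
//			panic(err)
//		}
//		var info pb.ExecveInfo
//		if err := proto.Unmarshal(raw, &info); err != nil {
//			panic(err)
//		}
//		fmt.Printf("exec %q argv=%v sha256=%s\n", info.GetBinaryPath(),
//			info.GetArgv(), hex.EncodeToString(info.GetBinarySha256()))
//	}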
func (*ExitNotifyParentInfo) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_sentry_proto_rawDescGZIP(), []int{2} } func (x *ExitNotifyParentInfo) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *ExitNotifyParentInfo) GetExitStatus() int32 { if x != nil { return x.ExitStatus } return 0 } type TaskExit struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` ExitStatus int32 `protobuf:"varint,2,opt,name=exit_status,json=exitStatus,proto3" json:"exit_status,omitempty"` } func (x *TaskExit) Reset() { *x = TaskExit{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_sentry_proto_msgTypes[3] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *TaskExit) String() string { return protoimpl.X.MessageStringOf(x) } func (*TaskExit) ProtoMessage() {} func (x *TaskExit) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_sentry_proto_msgTypes[3] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use TaskExit.ProtoReflect.Descriptor instead. func (*TaskExit) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_sentry_proto_rawDescGZIP(), []int{3} } func (x *TaskExit) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *TaskExit) GetExitStatus() int32 { if x != nil { return x.ExitStatus } return 0 } var File_pkg_sentry_seccheck_points_sentry_proto protoreflect.FileDescriptor var file_pkg_sentry_seccheck_points_sentry_proto_rawDesc = []byte{ 0x0a, 0x27, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x73, 0x65, 0x63, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x2f, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x73, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x0d, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x1a, 0x27, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x73, 0x65, 0x63, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x2f, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x73, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0x83, 0x02, 0x0a, 0x09, 0x43, 0x6c, 0x6f, 0x6e, 0x65, 0x49, 0x6e, 0x66, 0x6f, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x2a, 0x0a, 0x11, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x5f, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x5f, 0x69, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0f, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x12, 0x35, 0x0a, 0x17, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x5f, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x14, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x49, 0x64, 0x12, 0x3e, 0x0a, 0x1c, 0x63, 0x72, 
0x65, 0x61, 0x74, 0x65, 0x64, 0x5f, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x5f, 0x73, 0x74, 0x61, 0x72, 0x74, 0x5f, 0x74, 0x69, 0x6d, 0x65, 0x5f, 0x6e, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x03, 0x52, 0x18, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x64, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x53, 0x74, 0x61, 0x72, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x4e, 0x73, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x22, 0x96, 0x02, 0x0a, 0x0a, 0x45, 0x78, 0x65, 0x63, 0x76, 0x65, 0x49, 0x6e, 0x66, 0x6f, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x1f, 0x0a, 0x0b, 0x62, 0x69, 0x6e, 0x61, 0x72, 0x79, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x62, 0x69, 0x6e, 0x61, 0x72, 0x79, 0x50, 0x61, 0x74, 0x68, 0x12, 0x12, 0x0a, 0x04, 0x61, 0x72, 0x67, 0x76, 0x18, 0x03, 0x20, 0x03, 0x28, 0x09, 0x52, 0x04, 0x61, 0x72, 0x67, 0x76, 0x12, 0x10, 0x0a, 0x03, 0x65, 0x6e, 0x76, 0x18, 0x04, 0x20, 0x03, 0x28, 0x09, 0x52, 0x03, 0x65, 0x6e, 0x76, 0x12, 0x1f, 0x0a, 0x0b, 0x62, 0x69, 0x6e, 0x61, 0x72, 0x79, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0a, 0x62, 0x69, 0x6e, 0x61, 0x72, 0x79, 0x4d, 0x6f, 0x64, 0x65, 0x12, 0x1d, 0x0a, 0x0a, 0x62, 0x69, 0x6e, 0x61, 0x72, 0x79, 0x5f, 0x75, 0x69, 0x64, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x09, 0x62, 0x69, 0x6e, 0x61, 0x72, 0x79, 0x55, 0x69, 0x64, 0x12, 0x1d, 0x0a, 0x0a, 0x62, 0x69, 0x6e, 0x61, 0x72, 0x79, 0x5f, 0x67, 0x69, 0x64, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x09, 0x62, 0x69, 0x6e, 0x61, 0x72, 0x79, 0x47, 0x69, 0x64, 0x12, 0x23, 0x0a, 0x0d, 0x62, 0x69, 0x6e, 0x61, 0x72, 0x79, 0x5f, 0x73, 0x68, 0x61, 0x32, 0x35, 0x36, 0x18, 0x08, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0c, 0x62, 0x69, 0x6e, 0x61, 0x72, 0x79, 0x53, 0x68, 0x61, 0x32, 0x35, 0x36, 0x22, 0x76, 0x0a, 0x14, 0x45, 0x78, 0x69, 0x74, 0x4e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x50, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x49, 0x6e, 0x66, 0x6f, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x1f, 0x0a, 0x0b, 0x65, 0x78, 0x69, 0x74, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0a, 0x65, 0x78, 0x69, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x6a, 0x0a, 0x08, 0x54, 0x61, 0x73, 0x6b, 0x45, 0x78, 0x69, 0x74, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x1f, 0x0a, 0x0b, 0x65, 0x78, 0x69, 0x74, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0a, 0x65, 0x78, 0x69, 0x74, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x62, 0x06, 0x70, 
0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_sentry_seccheck_points_sentry_proto_rawDescOnce sync.Once file_pkg_sentry_seccheck_points_sentry_proto_rawDescData = file_pkg_sentry_seccheck_points_sentry_proto_rawDesc ) func file_pkg_sentry_seccheck_points_sentry_proto_rawDescGZIP() []byte { file_pkg_sentry_seccheck_points_sentry_proto_rawDescOnce.Do(func() { file_pkg_sentry_seccheck_points_sentry_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_sentry_seccheck_points_sentry_proto_rawDescData) }) return file_pkg_sentry_seccheck_points_sentry_proto_rawDescData } var file_pkg_sentry_seccheck_points_sentry_proto_msgTypes = make([]protoimpl.MessageInfo, 4) var file_pkg_sentry_seccheck_points_sentry_proto_goTypes = []interface{}{ (*CloneInfo)(nil), // 0: gvisor.sentry.CloneInfo (*ExecveInfo)(nil), // 1: gvisor.sentry.ExecveInfo (*ExitNotifyParentInfo)(nil), // 2: gvisor.sentry.ExitNotifyParentInfo (*TaskExit)(nil), // 3: gvisor.sentry.TaskExit (*ContextData)(nil), // 4: gvisor.common.ContextData } var file_pkg_sentry_seccheck_points_sentry_proto_depIdxs = []int32{ 4, // 0: gvisor.sentry.CloneInfo.context_data:type_name -> gvisor.common.ContextData 4, // 1: gvisor.sentry.ExecveInfo.context_data:type_name -> gvisor.common.ContextData 4, // 2: gvisor.sentry.ExitNotifyParentInfo.context_data:type_name -> gvisor.common.ContextData 4, // 3: gvisor.sentry.TaskExit.context_data:type_name -> gvisor.common.ContextData 4, // [4:4] is the sub-list for method output_type 4, // [4:4] is the sub-list for method input_type 4, // [4:4] is the sub-list for extension type_name 4, // [4:4] is the sub-list for extension extendee 0, // [0:4] is the sub-list for field type_name } func init() { file_pkg_sentry_seccheck_points_sentry_proto_init() } func file_pkg_sentry_seccheck_points_sentry_proto_init() { if File_pkg_sentry_seccheck_points_sentry_proto != nil { return } file_pkg_sentry_seccheck_points_common_proto_init() if !protoimpl.UnsafeEnabled { file_pkg_sentry_seccheck_points_sentry_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*CloneInfo); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_sentry_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*ExecveInfo); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_sentry_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*ExitNotifyParentInfo); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_sentry_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*TaskExit); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_sentry_seccheck_points_sentry_proto_rawDesc, NumEnums: 0, NumMessages: 4, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_sentry_seccheck_points_sentry_proto_goTypes, DependencyIndexes: file_pkg_sentry_seccheck_points_sentry_proto_depIdxs, MessageInfos: file_pkg_sentry_seccheck_points_sentry_proto_msgTypes, }.Build() File_pkg_sentry_seccheck_points_sentry_proto = out.File file_pkg_sentry_seccheck_points_sentry_proto_rawDesc = nil 
file_pkg_sentry_seccheck_points_sentry_proto_goTypes = nil file_pkg_sentry_seccheck_points_sentry_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/points/points_go_proto/syscall.pb.go000066400000000000000000003767701465435605700317050ustar00rootroot00000000000000// Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/sentry/seccheck/points/syscall.proto package points_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type Exit struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Result int64 `protobuf:"varint,1,opt,name=result,proto3" json:"result,omitempty"` Errorno int64 `protobuf:"varint,2,opt,name=errorno,proto3" json:"errorno,omitempty"` } func (x *Exit) Reset() { *x = Exit{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Exit) String() string { return protoimpl.X.MessageStringOf(x) } func (*Exit) ProtoMessage() {} func (x *Exit) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Exit.ProtoReflect.Descriptor instead. 
func (*Exit) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{0} } func (x *Exit) GetResult() int64 { if x != nil { return x.Result } return 0 } func (x *Exit) GetErrorno() int64 { if x != nil { return x.Errorno } return 0 } type Syscall struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,4,opt,name=sysno,proto3" json:"sysno,omitempty"` Arg1 uint64 `protobuf:"varint,5,opt,name=arg1,proto3" json:"arg1,omitempty"` Arg2 uint64 `protobuf:"varint,6,opt,name=arg2,proto3" json:"arg2,omitempty"` Arg3 uint64 `protobuf:"varint,7,opt,name=arg3,proto3" json:"arg3,omitempty"` Arg4 uint64 `protobuf:"varint,8,opt,name=arg4,proto3" json:"arg4,omitempty"` Arg5 uint64 `protobuf:"varint,9,opt,name=arg5,proto3" json:"arg5,omitempty"` Arg6 uint64 `protobuf:"varint,10,opt,name=arg6,proto3" json:"arg6,omitempty"` } func (x *Syscall) Reset() { *x = Syscall{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[1] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Syscall) String() string { return protoimpl.X.MessageStringOf(x) } func (*Syscall) ProtoMessage() {} func (x *Syscall) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[1] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Syscall.ProtoReflect.Descriptor instead. 
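// Editor's note (illustrative, not part of the generated code): the Syscall
// message above carries the syscall number, its six untyped arguments, the
// shared ContextData, and an optional Exit result. A minimal round-trip
// sketch, with the import path assumed from the repository layout:
//
//	package main
//
//	import (
//		"fmt"
//
//		"google.golang.org/protobuf/proto"
//		pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
//	)
//
//	func main() {
//		raw, err := proto.Marshal(&pb.Syscall{
//			Sysno: 1,
//			Arg1:  3,
//			Exit:  &pb.Exit{Result: 42},
//		})
//		if err != nil {
//			panic(err)
//		}
//		var sc pb.Syscall
//		if err := proto.Unmarshal(raw, &sc); err != nil {
//			panic(err)
//		}
//		// The Exit getters are nil-safe, so this works even when exit is absent.
//		fmt.Printf("sysno=%d arg1=%d result=%d errno=%d\n",
//			sc.GetSysno(), sc.GetArg1(), sc.GetExit().GetResult(), sc.GetExit().GetErrorno())
//	}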
func (*Syscall) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{1} } func (x *Syscall) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Syscall) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Syscall) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Syscall) GetArg1() uint64 { if x != nil { return x.Arg1 } return 0 } func (x *Syscall) GetArg2() uint64 { if x != nil { return x.Arg2 } return 0 } func (x *Syscall) GetArg3() uint64 { if x != nil { return x.Arg3 } return 0 } func (x *Syscall) GetArg4() uint64 { if x != nil { return x.Arg4 } return 0 } func (x *Syscall) GetArg5() uint64 { if x != nil { return x.Arg5 } return 0 } func (x *Syscall) GetArg6() uint64 { if x != nil { return x.Arg6 } return 0 } type Open struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int64 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Pathname string `protobuf:"bytes,6,opt,name=pathname,proto3" json:"pathname,omitempty"` Flags uint32 `protobuf:"varint,7,opt,name=flags,proto3" json:"flags,omitempty"` Mode uint32 `protobuf:"varint,8,opt,name=mode,proto3" json:"mode,omitempty"` } func (x *Open) Reset() { *x = Open{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Open) String() string { return protoimpl.X.MessageStringOf(x) } func (*Open) ProtoMessage() {} func (x *Open) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[2] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Open.ProtoReflect.Descriptor instead. 
func (*Open) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{2} } func (x *Open) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Open) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Open) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Open) GetFd() int64 { if x != nil { return x.Fd } return 0 } func (x *Open) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *Open) GetPathname() string { if x != nil { return x.Pathname } return "" } func (x *Open) GetFlags() uint32 { if x != nil { return x.Flags } return 0 } func (x *Open) GetMode() uint32 { if x != nil { return x.Mode } return 0 } type Close struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int64 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` } func (x *Close) Reset() { *x = Close{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[3] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Close) String() string { return protoimpl.X.MessageStringOf(x) } func (*Close) ProtoMessage() {} func (x *Close) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[3] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Close.ProtoReflect.Descriptor instead. 
func (*Close) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{3} } func (x *Close) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Close) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Close) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Close) GetFd() int64 { if x != nil { return x.Fd } return 0 } func (x *Close) GetFdPath() string { if x != nil { return x.FdPath } return "" } type Read struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int64 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Count uint64 `protobuf:"varint,6,opt,name=count,proto3" json:"count,omitempty"` HasOffset bool `protobuf:"varint,7,opt,name=has_offset,json=hasOffset,proto3" json:"has_offset,omitempty"` Offset int64 `protobuf:"varint,8,opt,name=offset,proto3" json:"offset,omitempty"` Flags uint32 `protobuf:"varint,9,opt,name=flags,proto3" json:"flags,omitempty"` } func (x *Read) Reset() { *x = Read{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[4] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Read) String() string { return protoimpl.X.MessageStringOf(x) } func (*Read) ProtoMessage() {} func (x *Read) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[4] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Read.ProtoReflect.Descriptor instead. 
func (*Read) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{4} } func (x *Read) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Read) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Read) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Read) GetFd() int64 { if x != nil { return x.Fd } return 0 } func (x *Read) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *Read) GetCount() uint64 { if x != nil { return x.Count } return 0 } func (x *Read) GetHasOffset() bool { if x != nil { return x.HasOffset } return false } func (x *Read) GetOffset() int64 { if x != nil { return x.Offset } return 0 } func (x *Read) GetFlags() uint32 { if x != nil { return x.Flags } return 0 } type Write struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int64 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Count uint64 `protobuf:"varint,6,opt,name=count,proto3" json:"count,omitempty"` HasOffset bool `protobuf:"varint,7,opt,name=has_offset,json=hasOffset,proto3" json:"has_offset,omitempty"` Offset int64 `protobuf:"varint,8,opt,name=offset,proto3" json:"offset,omitempty"` Flags uint32 `protobuf:"varint,9,opt,name=flags,proto3" json:"flags,omitempty"` } func (x *Write) Reset() { *x = Write{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[5] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Write) String() string { return protoimpl.X.MessageStringOf(x) } func (*Write) ProtoMessage() {} func (x *Write) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[5] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Write.ProtoReflect.Descriptor instead. 
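// Editor's note (illustrative, not part of the generated code): Read and
// Write share the same shape; has_offset/offset presumably distinguish the
// offset-taking variants (e.g. pread64/pwrite64) from plain read/write, but
// that interpretation is an assumption here, not something stated in this
// file. A small sketch of consuming those fields (import path assumed from
// the repository layout):
//
//	package main
//
//	import (
//		"fmt"
//
//		pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
//	)
//
//	func main() {
//		r := &pb.Read{Fd: 3, Count: 4096, HasOffset: true, Offset: 512}
//		if r.GetHasOffset() {
//			fmt.Printf("read fd=%d count=%d at offset %d\n", r.GetFd(), r.GetCount(), r.GetOffset())
//		} else {
//			fmt.Printf("read fd=%d count=%d\n", r.GetFd(), r.GetCount())
//		}
//	}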
func (*Write) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{5} } func (x *Write) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Write) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Write) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Write) GetFd() int64 { if x != nil { return x.Fd } return 0 } func (x *Write) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *Write) GetCount() uint64 { if x != nil { return x.Count } return 0 } func (x *Write) GetHasOffset() bool { if x != nil { return x.HasOffset } return false } func (x *Write) GetOffset() int64 { if x != nil { return x.Offset } return 0 } func (x *Write) GetFlags() uint32 { if x != nil { return x.Flags } return 0 } type Connect struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int64 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Address []byte `protobuf:"bytes,6,opt,name=address,proto3" json:"address,omitempty"` } func (x *Connect) Reset() { *x = Connect{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[6] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Connect) String() string { return protoimpl.X.MessageStringOf(x) } func (*Connect) ProtoMessage() {} func (x *Connect) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[6] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Connect.ProtoReflect.Descriptor instead. 
func (*Connect) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{6} } func (x *Connect) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Connect) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Connect) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Connect) GetFd() int64 { if x != nil { return x.Fd } return 0 } func (x *Connect) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *Connect) GetAddress() []byte { if x != nil { return x.Address } return nil } type Execve struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int64 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Pathname string `protobuf:"bytes,6,opt,name=pathname,proto3" json:"pathname,omitempty"` Argv []string `protobuf:"bytes,7,rep,name=argv,proto3" json:"argv,omitempty"` Envv []string `protobuf:"bytes,8,rep,name=envv,proto3" json:"envv,omitempty"` Flags uint32 `protobuf:"varint,9,opt,name=flags,proto3" json:"flags,omitempty"` } func (x *Execve) Reset() { *x = Execve{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[7] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Execve) String() string { return protoimpl.X.MessageStringOf(x) } func (*Execve) ProtoMessage() {} func (x *Execve) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[7] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Execve.ProtoReflect.Descriptor instead. 
func (*Execve) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{7} } func (x *Execve) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Execve) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Execve) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Execve) GetFd() int64 { if x != nil { return x.Fd } return 0 } func (x *Execve) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *Execve) GetPathname() string { if x != nil { return x.Pathname } return "" } func (x *Execve) GetArgv() []string { if x != nil { return x.Argv } return nil } func (x *Execve) GetEnvv() []string { if x != nil { return x.Envv } return nil } func (x *Execve) GetFlags() uint32 { if x != nil { return x.Flags } return 0 } type Socket struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Domain int32 `protobuf:"varint,4,opt,name=domain,proto3" json:"domain,omitempty"` Type int32 `protobuf:"varint,5,opt,name=type,proto3" json:"type,omitempty"` Protocol int32 `protobuf:"varint,6,opt,name=protocol,proto3" json:"protocol,omitempty"` } func (x *Socket) Reset() { *x = Socket{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[8] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Socket) String() string { return protoimpl.X.MessageStringOf(x) } func (*Socket) ProtoMessage() {} func (x *Socket) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[8] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Socket.ProtoReflect.Descriptor instead. 
func (*Socket) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{8} } func (x *Socket) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Socket) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Socket) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Socket) GetDomain() int32 { if x != nil { return x.Domain } return 0 } func (x *Socket) GetType() int32 { if x != nil { return x.Type } return 0 } func (x *Socket) GetProtocol() int32 { if x != nil { return x.Protocol } return 0 } type Chdir struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int64 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Pathname string `protobuf:"bytes,6,opt,name=pathname,proto3" json:"pathname,omitempty"` } func (x *Chdir) Reset() { *x = Chdir{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[9] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Chdir) String() string { return protoimpl.X.MessageStringOf(x) } func (*Chdir) ProtoMessage() {} func (x *Chdir) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[9] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Chdir.ProtoReflect.Descriptor instead. 
func (*Chdir) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{9} } func (x *Chdir) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Chdir) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Chdir) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Chdir) GetFd() int64 { if x != nil { return x.Fd } return 0 } func (x *Chdir) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *Chdir) GetPathname() string { if x != nil { return x.Pathname } return "" } type Setresid struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Rid uint32 `protobuf:"varint,4,opt,name=rid,proto3" json:"rid,omitempty"` Eid uint32 `protobuf:"varint,5,opt,name=eid,proto3" json:"eid,omitempty"` Sid uint32 `protobuf:"varint,6,opt,name=sid,proto3" json:"sid,omitempty"` } func (x *Setresid) Reset() { *x = Setresid{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[10] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Setresid) String() string { return protoimpl.X.MessageStringOf(x) } func (*Setresid) ProtoMessage() {} func (x *Setresid) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[10] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Setresid.ProtoReflect.Descriptor instead. 
func (*Setresid) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{10} } func (x *Setresid) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Setresid) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Setresid) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Setresid) GetRid() uint32 { if x != nil { return x.Rid } return 0 } func (x *Setresid) GetEid() uint32 { if x != nil { return x.Eid } return 0 } func (x *Setresid) GetSid() uint32 { if x != nil { return x.Sid } return 0 } type Setid struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Id uint32 `protobuf:"varint,4,opt,name=id,proto3" json:"id,omitempty"` } func (x *Setid) Reset() { *x = Setid{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[11] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Setid) String() string { return protoimpl.X.MessageStringOf(x) } func (*Setid) ProtoMessage() {} func (x *Setid) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[11] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Setid.ProtoReflect.Descriptor instead. func (*Setid) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{11} } func (x *Setid) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Setid) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Setid) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Setid) GetId() uint32 { if x != nil { return x.Id } return 0 } type StructRlimit struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Cur uint64 `protobuf:"varint,1,opt,name=cur,proto3" json:"cur,omitempty"` Max uint64 `protobuf:"varint,2,opt,name=max,proto3" json:"max,omitempty"` } func (x *StructRlimit) Reset() { *x = StructRlimit{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[12] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *StructRlimit) String() string { return protoimpl.X.MessageStringOf(x) } func (*StructRlimit) ProtoMessage() {} func (x *StructRlimit) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[12] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use StructRlimit.ProtoReflect.Descriptor instead. 
func (*StructRlimit) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{12} } func (x *StructRlimit) GetCur() uint64 { if x != nil { return x.Cur } return 0 } func (x *StructRlimit) GetMax() uint64 { if x != nil { return x.Max } return 0 } type Prlimit struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Pid int32 `protobuf:"varint,4,opt,name=pid,proto3" json:"pid,omitempty"` Resource int64 `protobuf:"varint,5,opt,name=resource,proto3" json:"resource,omitempty"` NewLimit *StructRlimit `protobuf:"bytes,6,opt,name=new_limit,json=newLimit,proto3" json:"new_limit,omitempty"` OldLimit *StructRlimit `protobuf:"bytes,7,opt,name=old_limit,json=oldLimit,proto3" json:"old_limit,omitempty"` } func (x *Prlimit) Reset() { *x = Prlimit{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[13] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Prlimit) String() string { return protoimpl.X.MessageStringOf(x) } func (*Prlimit) ProtoMessage() {} func (x *Prlimit) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[13] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Prlimit.ProtoReflect.Descriptor instead. 
func (*Prlimit) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{13} } func (x *Prlimit) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Prlimit) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Prlimit) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Prlimit) GetPid() int32 { if x != nil { return x.Pid } return 0 } func (x *Prlimit) GetResource() int64 { if x != nil { return x.Resource } return 0 } func (x *Prlimit) GetNewLimit() *StructRlimit { if x != nil { return x.NewLimit } return nil } func (x *Prlimit) GetOldLimit() *StructRlimit { if x != nil { return x.OldLimit } return nil } type Pipe struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Flags uint32 `protobuf:"varint,4,opt,name=flags,proto3" json:"flags,omitempty"` Reader int32 `protobuf:"varint,5,opt,name=reader,proto3" json:"reader,omitempty"` Writer int32 `protobuf:"varint,6,opt,name=writer,proto3" json:"writer,omitempty"` } func (x *Pipe) Reset() { *x = Pipe{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[14] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Pipe) String() string { return protoimpl.X.MessageStringOf(x) } func (*Pipe) ProtoMessage() {} func (x *Pipe) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[14] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Pipe.ProtoReflect.Descriptor instead. 
func (*Pipe) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{14} } func (x *Pipe) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Pipe) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Pipe) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Pipe) GetFlags() uint32 { if x != nil { return x.Flags } return 0 } func (x *Pipe) GetReader() int32 { if x != nil { return x.Reader } return 0 } func (x *Pipe) GetWriter() int32 { if x != nil { return x.Writer } return 0 } type Fcntl struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int32 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Cmd int32 `protobuf:"varint,6,opt,name=cmd,proto3" json:"cmd,omitempty"` Args int64 `protobuf:"varint,7,opt,name=args,proto3" json:"args,omitempty"` } func (x *Fcntl) Reset() { *x = Fcntl{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[15] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Fcntl) String() string { return protoimpl.X.MessageStringOf(x) } func (*Fcntl) ProtoMessage() {} func (x *Fcntl) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[15] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Fcntl.ProtoReflect.Descriptor instead. 
func (*Fcntl) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{15} } func (x *Fcntl) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Fcntl) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Fcntl) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Fcntl) GetFd() int32 { if x != nil { return x.Fd } return 0 } func (x *Fcntl) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *Fcntl) GetCmd() int32 { if x != nil { return x.Cmd } return 0 } func (x *Fcntl) GetArgs() int64 { if x != nil { return x.Args } return 0 } type Dup struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` OldFd int32 `protobuf:"varint,4,opt,name=old_fd,json=oldFd,proto3" json:"old_fd,omitempty"` NewFd int32 `protobuf:"varint,5,opt,name=new_fd,json=newFd,proto3" json:"new_fd,omitempty"` FdPath string `protobuf:"bytes,6,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Flags uint32 `protobuf:"varint,7,opt,name=flags,proto3" json:"flags,omitempty"` } func (x *Dup) Reset() { *x = Dup{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[16] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Dup) String() string { return protoimpl.X.MessageStringOf(x) } func (*Dup) ProtoMessage() {} func (x *Dup) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[16] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Dup.ProtoReflect.Descriptor instead. 
func (*Dup) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{16} } func (x *Dup) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Dup) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Dup) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Dup) GetOldFd() int32 { if x != nil { return x.OldFd } return 0 } func (x *Dup) GetNewFd() int32 { if x != nil { return x.NewFd } return 0 } func (x *Dup) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *Dup) GetFlags() uint32 { if x != nil { return x.Flags } return 0 } type Signalfd struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int32 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Sigset uint64 `protobuf:"varint,6,opt,name=sigset,proto3" json:"sigset,omitempty"` Flags int32 `protobuf:"varint,7,opt,name=flags,proto3" json:"flags,omitempty"` } func (x *Signalfd) Reset() { *x = Signalfd{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[17] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Signalfd) String() string { return protoimpl.X.MessageStringOf(x) } func (*Signalfd) ProtoMessage() {} func (x *Signalfd) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[17] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Signalfd.ProtoReflect.Descriptor instead. 
func (*Signalfd) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{17} } func (x *Signalfd) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Signalfd) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Signalfd) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Signalfd) GetFd() int32 { if x != nil { return x.Fd } return 0 } func (x *Signalfd) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *Signalfd) GetSigset() uint64 { if x != nil { return x.Sigset } return 0 } func (x *Signalfd) GetFlags() int32 { if x != nil { return x.Flags } return 0 } type Chroot struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Pathname string `protobuf:"bytes,4,opt,name=pathname,proto3" json:"pathname,omitempty"` } func (x *Chroot) Reset() { *x = Chroot{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[18] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Chroot) String() string { return protoimpl.X.MessageStringOf(x) } func (*Chroot) ProtoMessage() {} func (x *Chroot) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[18] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Chroot.ProtoReflect.Descriptor instead. 
func (*Chroot) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{18} } func (x *Chroot) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Chroot) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Chroot) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Chroot) GetPathname() string { if x != nil { return x.Pathname } return "" } type Eventfd struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Val int32 `protobuf:"varint,4,opt,name=val,proto3" json:"val,omitempty"` Flags uint32 `protobuf:"varint,5,opt,name=flags,proto3" json:"flags,omitempty"` } func (x *Eventfd) Reset() { *x = Eventfd{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[19] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Eventfd) String() string { return protoimpl.X.MessageStringOf(x) } func (*Eventfd) ProtoMessage() {} func (x *Eventfd) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[19] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Eventfd.ProtoReflect.Descriptor instead. func (*Eventfd) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{19} } func (x *Eventfd) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Eventfd) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Eventfd) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Eventfd) GetVal() int32 { if x != nil { return x.Val } return 0 } func (x *Eventfd) GetFlags() uint32 { if x != nil { return x.Flags } return 0 } type Clone struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Flags uint64 `protobuf:"varint,4,opt,name=flags,proto3" json:"flags,omitempty"` Stack uint64 `protobuf:"varint,5,opt,name=stack,proto3" json:"stack,omitempty"` NewTid uint64 `protobuf:"varint,6,opt,name=new_tid,json=newTid,proto3" json:"new_tid,omitempty"` Tls uint64 `protobuf:"varint,7,opt,name=tls,proto3" json:"tls,omitempty"` } func (x *Clone) Reset() { *x = Clone{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[20] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Clone) String() string { return protoimpl.X.MessageStringOf(x) } func (*Clone) ProtoMessage() {} func (x *Clone) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[20] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() 
== nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Clone.ProtoReflect.Descriptor instead. func (*Clone) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{20} } func (x *Clone) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Clone) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Clone) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Clone) GetFlags() uint64 { if x != nil { return x.Flags } return 0 } func (x *Clone) GetStack() uint64 { if x != nil { return x.Stack } return 0 } func (x *Clone) GetNewTid() uint64 { if x != nil { return x.NewTid } return 0 } func (x *Clone) GetTls() uint64 { if x != nil { return x.Tls } return 0 } type Bind struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int32 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Address []byte `protobuf:"bytes,6,opt,name=address,proto3" json:"address,omitempty"` } func (x *Bind) Reset() { *x = Bind{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[21] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Bind) String() string { return protoimpl.X.MessageStringOf(x) } func (*Bind) ProtoMessage() {} func (x *Bind) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[21] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Bind.ProtoReflect.Descriptor instead. 
func (*Bind) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{21} } func (x *Bind) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Bind) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Bind) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Bind) GetFd() int32 { if x != nil { return x.Fd } return 0 } func (x *Bind) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *Bind) GetAddress() []byte { if x != nil { return x.Address } return nil } type Accept struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int32 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Address []byte `protobuf:"bytes,6,opt,name=address,proto3" json:"address,omitempty"` Flags int32 `protobuf:"varint,7,opt,name=flags,proto3" json:"flags,omitempty"` } func (x *Accept) Reset() { *x = Accept{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[22] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Accept) String() string { return protoimpl.X.MessageStringOf(x) } func (*Accept) ProtoMessage() {} func (x *Accept) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[22] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Accept.ProtoReflect.Descriptor instead. 
func (*Accept) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{22} } func (x *Accept) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Accept) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Accept) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *Accept) GetFd() int32 { if x != nil { return x.Fd } return 0 } func (x *Accept) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *Accept) GetAddress() []byte { if x != nil { return x.Address } return nil } func (x *Accept) GetFlags() int32 { if x != nil { return x.Flags } return 0 } type TimerfdCreate struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` ClockId int32 `protobuf:"varint,4,opt,name=clock_id,json=clockId,proto3" json:"clock_id,omitempty"` Flags int32 `protobuf:"varint,5,opt,name=flags,proto3" json:"flags,omitempty"` } func (x *TimerfdCreate) Reset() { *x = TimerfdCreate{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[23] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *TimerfdCreate) String() string { return protoimpl.X.MessageStringOf(x) } func (*TimerfdCreate) ProtoMessage() {} func (x *TimerfdCreate) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[23] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use TimerfdCreate.ProtoReflect.Descriptor instead. 
func (*TimerfdCreate) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{23} } func (x *TimerfdCreate) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *TimerfdCreate) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *TimerfdCreate) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *TimerfdCreate) GetClockId() int32 { if x != nil { return x.ClockId } return 0 } func (x *TimerfdCreate) GetFlags() int32 { if x != nil { return x.Flags } return 0 } type Timespec struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Sec int64 `protobuf:"varint,1,opt,name=sec,proto3" json:"sec,omitempty"` Nsec int64 `protobuf:"varint,2,opt,name=nsec,proto3" json:"nsec,omitempty"` } func (x *Timespec) Reset() { *x = Timespec{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[24] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Timespec) String() string { return protoimpl.X.MessageStringOf(x) } func (*Timespec) ProtoMessage() {} func (x *Timespec) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[24] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Timespec.ProtoReflect.Descriptor instead. func (*Timespec) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{24} } func (x *Timespec) GetSec() int64 { if x != nil { return x.Sec } return 0 } func (x *Timespec) GetNsec() int64 { if x != nil { return x.Nsec } return 0 } type ItimerSpec struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Interval *Timespec `protobuf:"bytes,1,opt,name=interval,proto3" json:"interval,omitempty"` Value *Timespec `protobuf:"bytes,2,opt,name=value,proto3" json:"value,omitempty"` } func (x *ItimerSpec) Reset() { *x = ItimerSpec{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[25] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *ItimerSpec) String() string { return protoimpl.X.MessageStringOf(x) } func (*ItimerSpec) ProtoMessage() {} func (x *ItimerSpec) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[25] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use ItimerSpec.ProtoReflect.Descriptor instead. 
func (*ItimerSpec) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{25} } func (x *ItimerSpec) GetInterval() *Timespec { if x != nil { return x.Interval } return nil } func (x *ItimerSpec) GetValue() *Timespec { if x != nil { return x.Value } return nil } type TimerfdSetTime struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int32 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Flags int32 `protobuf:"varint,6,opt,name=flags,proto3" json:"flags,omitempty"` NewValue *ItimerSpec `protobuf:"bytes,7,opt,name=new_value,json=newValue,proto3" json:"new_value,omitempty"` OldValue *ItimerSpec `protobuf:"bytes,8,opt,name=old_value,json=oldValue,proto3" json:"old_value,omitempty"` } func (x *TimerfdSetTime) Reset() { *x = TimerfdSetTime{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[26] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *TimerfdSetTime) String() string { return protoimpl.X.MessageStringOf(x) } func (*TimerfdSetTime) ProtoMessage() {} func (x *TimerfdSetTime) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[26] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use TimerfdSetTime.ProtoReflect.Descriptor instead. 
func (*TimerfdSetTime) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{26} } func (x *TimerfdSetTime) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *TimerfdSetTime) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *TimerfdSetTime) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *TimerfdSetTime) GetFd() int32 { if x != nil { return x.Fd } return 0 } func (x *TimerfdSetTime) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *TimerfdSetTime) GetFlags() int32 { if x != nil { return x.Flags } return 0 } func (x *TimerfdSetTime) GetNewValue() *ItimerSpec { if x != nil { return x.NewValue } return nil } func (x *TimerfdSetTime) GetOldValue() *ItimerSpec { if x != nil { return x.OldValue } return nil } type TimerfdGetTime struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int32 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` CurValue *ItimerSpec `protobuf:"bytes,6,opt,name=cur_value,json=curValue,proto3" json:"cur_value,omitempty"` } func (x *TimerfdGetTime) Reset() { *x = TimerfdGetTime{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[27] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *TimerfdGetTime) String() string { return protoimpl.X.MessageStringOf(x) } func (*TimerfdGetTime) ProtoMessage() {} func (x *TimerfdGetTime) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[27] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use TimerfdGetTime.ProtoReflect.Descriptor instead. 
func (*TimerfdGetTime) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{27} } func (x *TimerfdGetTime) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *TimerfdGetTime) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *TimerfdGetTime) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *TimerfdGetTime) GetFd() int32 { if x != nil { return x.Fd } return 0 } func (x *TimerfdGetTime) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *TimerfdGetTime) GetCurValue() *ItimerSpec { if x != nil { return x.CurValue } return nil } type Fork struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` } func (x *Fork) Reset() { *x = Fork{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[28] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Fork) String() string { return protoimpl.X.MessageStringOf(x) } func (*Fork) ProtoMessage() {} func (x *Fork) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[28] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Fork.ProtoReflect.Descriptor instead. func (*Fork) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{28} } func (x *Fork) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *Fork) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *Fork) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } type InotifyInit struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Flags int32 `protobuf:"varint,4,opt,name=flags,proto3" json:"flags,omitempty"` } func (x *InotifyInit) Reset() { *x = InotifyInit{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[29] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *InotifyInit) String() string { return protoimpl.X.MessageStringOf(x) } func (*InotifyInit) ProtoMessage() {} func (x *InotifyInit) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[29] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use InotifyInit.ProtoReflect.Descriptor instead. 
func (*InotifyInit) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{29} } func (x *InotifyInit) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *InotifyInit) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *InotifyInit) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *InotifyInit) GetFlags() int32 { if x != nil { return x.Flags } return 0 } type InotifyAddWatch struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int32 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Pathname string `protobuf:"bytes,6,opt,name=pathname,proto3" json:"pathname,omitempty"` Mask uint32 `protobuf:"varint,7,opt,name=mask,proto3" json:"mask,omitempty"` } func (x *InotifyAddWatch) Reset() { *x = InotifyAddWatch{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[30] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *InotifyAddWatch) String() string { return protoimpl.X.MessageStringOf(x) } func (*InotifyAddWatch) ProtoMessage() {} func (x *InotifyAddWatch) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[30] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use InotifyAddWatch.ProtoReflect.Descriptor instead. 
func (*InotifyAddWatch) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{30} } func (x *InotifyAddWatch) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *InotifyAddWatch) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *InotifyAddWatch) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *InotifyAddWatch) GetFd() int32 { if x != nil { return x.Fd } return 0 } func (x *InotifyAddWatch) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *InotifyAddWatch) GetPathname() string { if x != nil { return x.Pathname } return "" } func (x *InotifyAddWatch) GetMask() uint32 { if x != nil { return x.Mask } return 0 } type InotifyRmWatch struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Fd int32 `protobuf:"varint,4,opt,name=fd,proto3" json:"fd,omitempty"` FdPath string `protobuf:"bytes,5,opt,name=fd_path,json=fdPath,proto3" json:"fd_path,omitempty"` Wd int32 `protobuf:"varint,6,opt,name=wd,proto3" json:"wd,omitempty"` } func (x *InotifyRmWatch) Reset() { *x = InotifyRmWatch{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[31] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *InotifyRmWatch) String() string { return protoimpl.X.MessageStringOf(x) } func (*InotifyRmWatch) ProtoMessage() {} func (x *InotifyRmWatch) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[31] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use InotifyRmWatch.ProtoReflect.Descriptor instead. 
func (*InotifyRmWatch) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{31} } func (x *InotifyRmWatch) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *InotifyRmWatch) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *InotifyRmWatch) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *InotifyRmWatch) GetFd() int32 { if x != nil { return x.Fd } return 0 } func (x *InotifyRmWatch) GetFdPath() string { if x != nil { return x.FdPath } return "" } func (x *InotifyRmWatch) GetWd() int32 { if x != nil { return x.Wd } return 0 } type SocketPair struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields ContextData *ContextData `protobuf:"bytes,1,opt,name=context_data,json=contextData,proto3" json:"context_data,omitempty"` Exit *Exit `protobuf:"bytes,2,opt,name=exit,proto3" json:"exit,omitempty"` Sysno uint64 `protobuf:"varint,3,opt,name=sysno,proto3" json:"sysno,omitempty"` Domain int32 `protobuf:"varint,4,opt,name=domain,proto3" json:"domain,omitempty"` Type int32 `protobuf:"varint,5,opt,name=type,proto3" json:"type,omitempty"` Protocol int32 `protobuf:"varint,6,opt,name=protocol,proto3" json:"protocol,omitempty"` Socket1 int32 `protobuf:"varint,7,opt,name=socket1,proto3" json:"socket1,omitempty"` Socket2 int32 `protobuf:"varint,8,opt,name=socket2,proto3" json:"socket2,omitempty"` } func (x *SocketPair) Reset() { *x = SocketPair{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[32] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *SocketPair) String() string { return protoimpl.X.MessageStringOf(x) } func (*SocketPair) ProtoMessage() {} func (x *SocketPair) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[32] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use SocketPair.ProtoReflect.Descriptor instead. 
func (*SocketPair) Descriptor() ([]byte, []int) { return file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP(), []int{32} } func (x *SocketPair) GetContextData() *ContextData { if x != nil { return x.ContextData } return nil } func (x *SocketPair) GetExit() *Exit { if x != nil { return x.Exit } return nil } func (x *SocketPair) GetSysno() uint64 { if x != nil { return x.Sysno } return 0 } func (x *SocketPair) GetDomain() int32 { if x != nil { return x.Domain } return 0 } func (x *SocketPair) GetType() int32 { if x != nil { return x.Type } return 0 } func (x *SocketPair) GetProtocol() int32 { if x != nil { return x.Protocol } return 0 } func (x *SocketPair) GetSocket1() int32 { if x != nil { return x.Socket1 } return 0 } func (x *SocketPair) GetSocket2() int32 { if x != nil { return x.Socket2 } return 0 } var File_pkg_sentry_seccheck_points_syscall_proto protoreflect.FileDescriptor var file_pkg_sentry_seccheck_points_syscall_proto_rawDesc = []byte{ 0x0a, 0x28, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x73, 0x65, 0x63, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x2f, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x73, 0x2f, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x0e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x1a, 0x27, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x73, 0x65, 0x63, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x2f, 0x70, 0x6f, 0x69, 0x6e, 0x74, 0x73, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0x38, 0x0a, 0x04, 0x45, 0x78, 0x69, 0x74, 0x12, 0x16, 0x0a, 0x06, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x03, 0x52, 0x06, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x18, 0x0a, 0x07, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x6e, 0x6f, 0x18, 0x02, 0x20, 0x01, 0x28, 0x03, 0x52, 0x07, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x6e, 0x6f, 0x22, 0x80, 0x02, 0x0a, 0x07, 0x53, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x12, 0x0a, 0x04, 0x61, 0x72, 0x67, 0x31, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x61, 0x72, 0x67, 0x31, 0x12, 0x12, 0x0a, 0x04, 0x61, 0x72, 0x67, 0x32, 0x18, 0x06, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x61, 0x72, 0x67, 0x32, 0x12, 0x12, 0x0a, 0x04, 0x61, 0x72, 0x67, 0x33, 0x18, 0x07, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x61, 0x72, 0x67, 0x33, 0x12, 0x12, 0x0a, 0x04, 0x61, 0x72, 0x67, 0x34, 0x18, 0x08, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x61, 0x72, 0x67, 0x34, 0x12, 0x12, 0x0a, 0x04, 0x61, 0x72, 0x67, 0x35, 0x18, 0x09, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x61, 0x72, 0x67, 0x35, 0x12, 0x12, 0x0a, 0x04, 0x61, 0x72, 0x67, 0x36, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x61, 0x72, 0x67, 0x36, 0x22, 0xf4, 0x01, 0x0a, 0x04, 0x4f, 0x70, 0x65, 0x6e, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 
0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x1a, 0x0a, 0x08, 0x70, 0x61, 0x74, 0x68, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x70, 0x61, 0x74, 0x68, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x6d, 0x6f, 0x64, 0x65, 0x18, 0x08, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x6d, 0x6f, 0x64, 0x65, 0x22, 0xaf, 0x01, 0x0a, 0x05, 0x43, 0x6c, 0x6f, 0x73, 0x65, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x22, 0x91, 0x02, 0x0a, 0x04, 0x52, 0x65, 0x61, 0x64, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x14, 0x0a, 0x05, 0x63, 
0x6f, 0x75, 0x6e, 0x74, 0x18, 0x06, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x12, 0x1d, 0x0a, 0x0a, 0x68, 0x61, 0x73, 0x5f, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x18, 0x07, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x68, 0x61, 0x73, 0x4f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x12, 0x16, 0x0a, 0x06, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x18, 0x08, 0x20, 0x01, 0x28, 0x03, 0x52, 0x06, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x09, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x22, 0x92, 0x02, 0x0a, 0x05, 0x57, 0x72, 0x69, 0x74, 0x65, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x14, 0x0a, 0x05, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x18, 0x06, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x12, 0x1d, 0x0a, 0x0a, 0x68, 0x61, 0x73, 0x5f, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x18, 0x07, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x68, 0x61, 0x73, 0x4f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x12, 0x16, 0x0a, 0x06, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x18, 0x08, 0x20, 0x01, 0x28, 0x03, 0x52, 0x06, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x09, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x22, 0xcb, 0x01, 0x0a, 0x07, 0x43, 0x6f, 0x6e, 0x6e, 0x65, 0x63, 0x74, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x18, 0x0a, 0x07, 0x61, 0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x07, 0x61, 0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x22, 0x8a, 0x02, 0x0a, 0x06, 0x45, 0x78, 0x65, 0x63, 0x76, 0x65, 0x12, 0x3d, 0x0a, 0x0c, 
0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x1a, 0x0a, 0x08, 0x70, 0x61, 0x74, 0x68, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x70, 0x61, 0x74, 0x68, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x61, 0x72, 0x67, 0x76, 0x18, 0x07, 0x20, 0x03, 0x28, 0x09, 0x52, 0x04, 0x61, 0x72, 0x67, 0x76, 0x12, 0x12, 0x0a, 0x04, 0x65, 0x6e, 0x76, 0x76, 0x18, 0x08, 0x20, 0x03, 0x28, 0x09, 0x52, 0x04, 0x65, 0x6e, 0x76, 0x76, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x09, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x22, 0xcf, 0x01, 0x0a, 0x06, 0x53, 0x6f, 0x63, 0x6b, 0x65, 0x74, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x16, 0x0a, 0x06, 0x64, 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x64, 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x12, 0x12, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x05, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x12, 0x1a, 0x0a, 0x08, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x18, 0x06, 0x20, 0x01, 0x28, 0x05, 0x52, 0x08, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x22, 0xcb, 0x01, 0x0a, 0x05, 0x43, 0x68, 0x64, 0x69, 0x72, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 
0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x1a, 0x0a, 0x08, 0x70, 0x61, 0x74, 0x68, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x70, 0x61, 0x74, 0x68, 0x6e, 0x61, 0x6d, 0x65, 0x22, 0xbf, 0x01, 0x0a, 0x08, 0x53, 0x65, 0x74, 0x72, 0x65, 0x73, 0x69, 0x64, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x10, 0x0a, 0x03, 0x72, 0x69, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x03, 0x72, 0x69, 0x64, 0x12, 0x10, 0x0a, 0x03, 0x65, 0x69, 0x64, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x03, 0x65, 0x69, 0x64, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x69, 0x64, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x03, 0x73, 0x69, 0x64, 0x22, 0x96, 0x01, 0x0a, 0x05, 0x53, 0x65, 0x74, 0x69, 0x64, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x02, 0x69, 0x64, 0x22, 0x32, 0x0a, 0x0c, 0x53, 0x74, 0x72, 0x75, 0x63, 0x74, 0x52, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x12, 0x10, 0x0a, 0x03, 0x63, 0x75, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x63, 0x75, 0x72, 0x12, 0x10, 0x0a, 0x03, 0x6d, 0x61, 0x78, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x6d, 0x61, 0x78, 0x22, 0xac, 0x02, 0x0a, 0x07, 0x50, 0x72, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 
0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x10, 0x0a, 0x03, 0x70, 0x69, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, 0x70, 0x69, 0x64, 0x12, 0x1a, 0x0a, 0x08, 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x03, 0x52, 0x08, 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x12, 0x39, 0x0a, 0x09, 0x6e, 0x65, 0x77, 0x5f, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1c, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x53, 0x74, 0x72, 0x75, 0x63, 0x74, 0x52, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x52, 0x08, 0x6e, 0x65, 0x77, 0x4c, 0x69, 0x6d, 0x69, 0x74, 0x12, 0x39, 0x0a, 0x09, 0x6f, 0x6c, 0x64, 0x5f, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1c, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x53, 0x74, 0x72, 0x75, 0x63, 0x74, 0x52, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x52, 0x08, 0x6f, 0x6c, 0x64, 0x4c, 0x69, 0x6d, 0x69, 0x74, 0x22, 0xcb, 0x01, 0x0a, 0x04, 0x50, 0x69, 0x70, 0x65, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x18, 0x05, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x12, 0x16, 0x0a, 0x06, 0x77, 0x72, 0x69, 0x74, 0x65, 0x72, 0x18, 0x06, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x77, 0x72, 0x69, 0x74, 0x65, 0x72, 0x22, 0xd5, 0x01, 0x0a, 0x05, 0x46, 0x63, 0x6e, 0x74, 0x6c, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x10, 0x0a, 0x03, 0x63, 0x6d, 0x64, 0x18, 0x06, 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, 0x63, 0x6d, 0x64, 0x12, 0x12, 
0x0a, 0x04, 0x61, 0x72, 0x67, 0x73, 0x18, 0x07, 0x20, 0x01, 0x28, 0x03, 0x52, 0x04, 0x61, 0x72, 0x67, 0x73, 0x22, 0xe1, 0x01, 0x0a, 0x03, 0x44, 0x75, 0x70, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x15, 0x0a, 0x06, 0x6f, 0x6c, 0x64, 0x5f, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x6f, 0x6c, 0x64, 0x46, 0x64, 0x12, 0x15, 0x0a, 0x06, 0x6e, 0x65, 0x77, 0x5f, 0x66, 0x64, 0x18, 0x05, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x6e, 0x65, 0x77, 0x46, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x22, 0xe0, 0x01, 0x0a, 0x08, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x66, 0x64, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x69, 0x67, 0x73, 0x65, 0x74, 0x18, 0x06, 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, 0x73, 0x69, 0x67, 0x73, 0x65, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x07, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x22, 0xa3, 0x01, 0x0a, 0x06, 0x43, 0x68, 0x72, 0x6f, 0x6f, 0x74, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 
0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x1a, 0x0a, 0x08, 0x70, 0x61, 0x74, 0x68, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x70, 0x61, 0x74, 0x68, 0x6e, 0x61, 0x6d, 0x65, 0x22, 0xb0, 0x01, 0x0a, 0x07, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x66, 0x64, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x10, 0x0a, 0x03, 0x76, 0x61, 0x6c, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, 0x76, 0x61, 0x6c, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x22, 0xdd, 0x01, 0x0a, 0x05, 0x43, 0x6c, 0x6f, 0x6e, 0x65, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x12, 0x17, 0x0a, 0x07, 0x6e, 0x65, 0x77, 0x5f, 0x74, 0x69, 0x64, 0x18, 0x06, 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, 0x6e, 0x65, 0x77, 0x54, 0x69, 0x64, 0x12, 0x10, 0x0a, 0x03, 0x74, 0x6c, 0x73, 0x18, 0x07, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x74, 0x6c, 0x73, 0x22, 0xc8, 0x01, 0x0a, 0x04, 0x42, 0x69, 0x6e, 0x64, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 
0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x18, 0x0a, 0x07, 0x61, 0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x07, 0x61, 0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x22, 0xe0, 0x01, 0x0a, 0x06, 0x41, 0x63, 0x63, 0x65, 0x70, 0x74, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x18, 0x0a, 0x07, 0x61, 0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x07, 0x61, 0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x07, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x22, 0xbf, 0x01, 0x0a, 0x0d, 0x54, 0x69, 0x6d, 0x65, 0x72, 0x66, 0x64, 0x43, 0x72, 0x65, 0x61, 0x74, 0x65, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x19, 0x0a, 0x08, 0x63, 0x6c, 0x6f, 0x63, 0x6b, 0x5f, 0x69, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x07, 0x63, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x22, 0x30, 0x0a, 0x08, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x70, 0x65, 0x63, 0x12, 0x10, 0x0a, 0x03, 0x73, 0x65, 0x63, 0x18, 0x01, 0x20, 0x01, 0x28, 0x03, 0x52, 0x03, 0x73, 0x65, 0x63, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x73, 0x65, 0x63, 0x18, 0x02, 0x20, 0x01, 0x28, 0x03, 0x52, 0x04, 0x6e, 0x73, 0x65, 0x63, 0x22, 0x72, 0x0a, 0x0a, 0x49, 0x74, 0x69, 0x6d, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x34, 0x0a, 0x08, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x76, 0x61, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x70, 0x65, 0x63, 0x52, 0x08, 
0x69, 0x6e, 0x74, 0x65, 0x72, 0x76, 0x61, 0x6c, 0x12, 0x2e, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x70, 0x65, 0x63, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0xc0, 0x02, 0x0a, 0x0e, 0x54, 0x69, 0x6d, 0x65, 0x72, 0x66, 0x64, 0x53, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x12, 0x37, 0x0a, 0x09, 0x6e, 0x65, 0x77, 0x5f, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x49, 0x74, 0x69, 0x6d, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x08, 0x6e, 0x65, 0x77, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x12, 0x37, 0x0a, 0x09, 0x6f, 0x6c, 0x64, 0x5f, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x08, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x49, 0x74, 0x69, 0x6d, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x08, 0x6f, 0x6c, 0x64, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x22, 0xf1, 0x01, 0x0a, 0x0e, 0x54, 0x69, 0x6d, 0x65, 0x72, 0x66, 0x64, 0x47, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x37, 0x0a, 0x09, 0x63, 0x75, 0x72, 0x5f, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 
0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x49, 0x74, 0x69, 0x6d, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x08, 0x63, 0x75, 0x72, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x85, 0x01, 0x0a, 0x04, 0x46, 0x6f, 0x72, 0x6b, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x22, 0xa2, 0x01, 0x0a, 0x0b, 0x49, 0x6e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x49, 0x6e, 0x69, 0x74, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x14, 0x0a, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x22, 0xe9, 0x01, 0x0a, 0x0f, 0x49, 0x6e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x41, 0x64, 0x64, 0x57, 0x61, 0x74, 0x63, 0x68, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x1a, 0x0a, 0x08, 0x70, 0x61, 0x74, 0x68, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x70, 0x61, 0x74, 0x68, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x6d, 0x61, 0x73, 0x6b, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x04, 0x6d, 0x61, 0x73, 0x6b, 0x22, 0xc8, 0x01, 0x0a, 0x0e, 0x49, 0x6e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x52, 0x6d, 0x57, 0x61, 0x74, 0x63, 0x68, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 
0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x02, 0x66, 0x64, 0x12, 0x17, 0x0a, 0x07, 0x66, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x66, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x0e, 0x0a, 0x02, 0x77, 0x64, 0x18, 0x06, 0x20, 0x01, 0x28, 0x05, 0x52, 0x02, 0x77, 0x64, 0x22, 0x87, 0x02, 0x0a, 0x0a, 0x53, 0x6f, 0x63, 0x6b, 0x65, 0x74, 0x50, 0x61, 0x69, 0x72, 0x12, 0x3d, 0x0a, 0x0c, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2e, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x14, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x45, 0x78, 0x69, 0x74, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x79, 0x73, 0x6e, 0x6f, 0x12, 0x16, 0x0a, 0x06, 0x64, 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x64, 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x12, 0x12, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x05, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x12, 0x1a, 0x0a, 0x08, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x18, 0x06, 0x20, 0x01, 0x28, 0x05, 0x52, 0x08, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x12, 0x18, 0x0a, 0x07, 0x73, 0x6f, 0x63, 0x6b, 0x65, 0x74, 0x31, 0x18, 0x07, 0x20, 0x01, 0x28, 0x05, 0x52, 0x07, 0x73, 0x6f, 0x63, 0x6b, 0x65, 0x74, 0x31, 0x12, 0x18, 0x0a, 0x07, 0x73, 0x6f, 0x63, 0x6b, 0x65, 0x74, 0x32, 0x18, 0x08, 0x20, 0x01, 0x28, 0x05, 0x52, 0x07, 0x73, 0x6f, 0x63, 0x6b, 0x65, 0x74, 0x32, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_sentry_seccheck_points_syscall_proto_rawDescOnce sync.Once file_pkg_sentry_seccheck_points_syscall_proto_rawDescData = file_pkg_sentry_seccheck_points_syscall_proto_rawDesc ) func file_pkg_sentry_seccheck_points_syscall_proto_rawDescGZIP() []byte { file_pkg_sentry_seccheck_points_syscall_proto_rawDescOnce.Do(func() { file_pkg_sentry_seccheck_points_syscall_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_sentry_seccheck_points_syscall_proto_rawDescData) }) return file_pkg_sentry_seccheck_points_syscall_proto_rawDescData } var file_pkg_sentry_seccheck_points_syscall_proto_msgTypes = make([]protoimpl.MessageInfo, 33) var file_pkg_sentry_seccheck_points_syscall_proto_goTypes = []interface{}{ (*Exit)(nil), // 0: gvisor.syscall.Exit (*Syscall)(nil), // 1: gvisor.syscall.Syscall (*Open)(nil), // 2: 
gvisor.syscall.Open (*Close)(nil), // 3: gvisor.syscall.Close (*Read)(nil), // 4: gvisor.syscall.Read (*Write)(nil), // 5: gvisor.syscall.Write (*Connect)(nil), // 6: gvisor.syscall.Connect (*Execve)(nil), // 7: gvisor.syscall.Execve (*Socket)(nil), // 8: gvisor.syscall.Socket (*Chdir)(nil), // 9: gvisor.syscall.Chdir (*Setresid)(nil), // 10: gvisor.syscall.Setresid (*Setid)(nil), // 11: gvisor.syscall.Setid (*StructRlimit)(nil), // 12: gvisor.syscall.StructRlimit (*Prlimit)(nil), // 13: gvisor.syscall.Prlimit (*Pipe)(nil), // 14: gvisor.syscall.Pipe (*Fcntl)(nil), // 15: gvisor.syscall.Fcntl (*Dup)(nil), // 16: gvisor.syscall.Dup (*Signalfd)(nil), // 17: gvisor.syscall.Signalfd (*Chroot)(nil), // 18: gvisor.syscall.Chroot (*Eventfd)(nil), // 19: gvisor.syscall.Eventfd (*Clone)(nil), // 20: gvisor.syscall.Clone (*Bind)(nil), // 21: gvisor.syscall.Bind (*Accept)(nil), // 22: gvisor.syscall.Accept (*TimerfdCreate)(nil), // 23: gvisor.syscall.TimerfdCreate (*Timespec)(nil), // 24: gvisor.syscall.Timespec (*ItimerSpec)(nil), // 25: gvisor.syscall.ItimerSpec (*TimerfdSetTime)(nil), // 26: gvisor.syscall.TimerfdSetTime (*TimerfdGetTime)(nil), // 27: gvisor.syscall.TimerfdGetTime (*Fork)(nil), // 28: gvisor.syscall.Fork (*InotifyInit)(nil), // 29: gvisor.syscall.InotifyInit (*InotifyAddWatch)(nil), // 30: gvisor.syscall.InotifyAddWatch (*InotifyRmWatch)(nil), // 31: gvisor.syscall.InotifyRmWatch (*SocketPair)(nil), // 32: gvisor.syscall.SocketPair (*ContextData)(nil), // 33: gvisor.common.ContextData } var file_pkg_sentry_seccheck_points_syscall_proto_depIdxs = []int32{ 33, // 0: gvisor.syscall.Syscall.context_data:type_name -> gvisor.common.ContextData 0, // 1: gvisor.syscall.Syscall.exit:type_name -> gvisor.syscall.Exit 33, // 2: gvisor.syscall.Open.context_data:type_name -> gvisor.common.ContextData 0, // 3: gvisor.syscall.Open.exit:type_name -> gvisor.syscall.Exit 33, // 4: gvisor.syscall.Close.context_data:type_name -> gvisor.common.ContextData 0, // 5: gvisor.syscall.Close.exit:type_name -> gvisor.syscall.Exit 33, // 6: gvisor.syscall.Read.context_data:type_name -> gvisor.common.ContextData 0, // 7: gvisor.syscall.Read.exit:type_name -> gvisor.syscall.Exit 33, // 8: gvisor.syscall.Write.context_data:type_name -> gvisor.common.ContextData 0, // 9: gvisor.syscall.Write.exit:type_name -> gvisor.syscall.Exit 33, // 10: gvisor.syscall.Connect.context_data:type_name -> gvisor.common.ContextData 0, // 11: gvisor.syscall.Connect.exit:type_name -> gvisor.syscall.Exit 33, // 12: gvisor.syscall.Execve.context_data:type_name -> gvisor.common.ContextData 0, // 13: gvisor.syscall.Execve.exit:type_name -> gvisor.syscall.Exit 33, // 14: gvisor.syscall.Socket.context_data:type_name -> gvisor.common.ContextData 0, // 15: gvisor.syscall.Socket.exit:type_name -> gvisor.syscall.Exit 33, // 16: gvisor.syscall.Chdir.context_data:type_name -> gvisor.common.ContextData 0, // 17: gvisor.syscall.Chdir.exit:type_name -> gvisor.syscall.Exit 33, // 18: gvisor.syscall.Setresid.context_data:type_name -> gvisor.common.ContextData 0, // 19: gvisor.syscall.Setresid.exit:type_name -> gvisor.syscall.Exit 33, // 20: gvisor.syscall.Setid.context_data:type_name -> gvisor.common.ContextData 0, // 21: gvisor.syscall.Setid.exit:type_name -> gvisor.syscall.Exit 33, // 22: gvisor.syscall.Prlimit.context_data:type_name -> gvisor.common.ContextData 0, // 23: gvisor.syscall.Prlimit.exit:type_name -> gvisor.syscall.Exit 12, // 24: gvisor.syscall.Prlimit.new_limit:type_name -> gvisor.syscall.StructRlimit 12, // 25: 
gvisor.syscall.Prlimit.old_limit:type_name -> gvisor.syscall.StructRlimit 33, // 26: gvisor.syscall.Pipe.context_data:type_name -> gvisor.common.ContextData 0, // 27: gvisor.syscall.Pipe.exit:type_name -> gvisor.syscall.Exit 33, // 28: gvisor.syscall.Fcntl.context_data:type_name -> gvisor.common.ContextData 0, // 29: gvisor.syscall.Fcntl.exit:type_name -> gvisor.syscall.Exit 33, // 30: gvisor.syscall.Dup.context_data:type_name -> gvisor.common.ContextData 0, // 31: gvisor.syscall.Dup.exit:type_name -> gvisor.syscall.Exit 33, // 32: gvisor.syscall.Signalfd.context_data:type_name -> gvisor.common.ContextData 0, // 33: gvisor.syscall.Signalfd.exit:type_name -> gvisor.syscall.Exit 33, // 34: gvisor.syscall.Chroot.context_data:type_name -> gvisor.common.ContextData 0, // 35: gvisor.syscall.Chroot.exit:type_name -> gvisor.syscall.Exit 33, // 36: gvisor.syscall.Eventfd.context_data:type_name -> gvisor.common.ContextData 0, // 37: gvisor.syscall.Eventfd.exit:type_name -> gvisor.syscall.Exit 33, // 38: gvisor.syscall.Clone.context_data:type_name -> gvisor.common.ContextData 0, // 39: gvisor.syscall.Clone.exit:type_name -> gvisor.syscall.Exit 33, // 40: gvisor.syscall.Bind.context_data:type_name -> gvisor.common.ContextData 0, // 41: gvisor.syscall.Bind.exit:type_name -> gvisor.syscall.Exit 33, // 42: gvisor.syscall.Accept.context_data:type_name -> gvisor.common.ContextData 0, // 43: gvisor.syscall.Accept.exit:type_name -> gvisor.syscall.Exit 33, // 44: gvisor.syscall.TimerfdCreate.context_data:type_name -> gvisor.common.ContextData 0, // 45: gvisor.syscall.TimerfdCreate.exit:type_name -> gvisor.syscall.Exit 24, // 46: gvisor.syscall.ItimerSpec.interval:type_name -> gvisor.syscall.Timespec 24, // 47: gvisor.syscall.ItimerSpec.value:type_name -> gvisor.syscall.Timespec 33, // 48: gvisor.syscall.TimerfdSetTime.context_data:type_name -> gvisor.common.ContextData 0, // 49: gvisor.syscall.TimerfdSetTime.exit:type_name -> gvisor.syscall.Exit 25, // 50: gvisor.syscall.TimerfdSetTime.new_value:type_name -> gvisor.syscall.ItimerSpec 25, // 51: gvisor.syscall.TimerfdSetTime.old_value:type_name -> gvisor.syscall.ItimerSpec 33, // 52: gvisor.syscall.TimerfdGetTime.context_data:type_name -> gvisor.common.ContextData 0, // 53: gvisor.syscall.TimerfdGetTime.exit:type_name -> gvisor.syscall.Exit 25, // 54: gvisor.syscall.TimerfdGetTime.cur_value:type_name -> gvisor.syscall.ItimerSpec 33, // 55: gvisor.syscall.Fork.context_data:type_name -> gvisor.common.ContextData 0, // 56: gvisor.syscall.Fork.exit:type_name -> gvisor.syscall.Exit 33, // 57: gvisor.syscall.InotifyInit.context_data:type_name -> gvisor.common.ContextData 0, // 58: gvisor.syscall.InotifyInit.exit:type_name -> gvisor.syscall.Exit 33, // 59: gvisor.syscall.InotifyAddWatch.context_data:type_name -> gvisor.common.ContextData 0, // 60: gvisor.syscall.InotifyAddWatch.exit:type_name -> gvisor.syscall.Exit 33, // 61: gvisor.syscall.InotifyRmWatch.context_data:type_name -> gvisor.common.ContextData 0, // 62: gvisor.syscall.InotifyRmWatch.exit:type_name -> gvisor.syscall.Exit 33, // 63: gvisor.syscall.SocketPair.context_data:type_name -> gvisor.common.ContextData 0, // 64: gvisor.syscall.SocketPair.exit:type_name -> gvisor.syscall.Exit 65, // [65:65] is the sub-list for method output_type 65, // [65:65] is the sub-list for method input_type 65, // [65:65] is the sub-list for extension type_name 65, // [65:65] is the sub-list for extension extendee 0, // [0:65] is the sub-list for field type_name } func init() { 
file_pkg_sentry_seccheck_points_syscall_proto_init() } func file_pkg_sentry_seccheck_points_syscall_proto_init() { if File_pkg_sentry_seccheck_points_syscall_proto != nil { return } file_pkg_sentry_seccheck_points_common_proto_init() if !protoimpl.UnsafeEnabled { file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Exit); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Syscall); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Open); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Close); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Read); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Write); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Connect); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[7].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Execve); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[8].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Socket); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[9].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Chdir); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[10].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Setresid); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[11].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Setid); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[12].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*StructRlimit); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[13].Exporter = func(v 
interface{}, i int) interface{} { switch v := v.(*Prlimit); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[14].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Pipe); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[15].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Fcntl); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[16].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Dup); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[17].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Signalfd); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[18].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Chroot); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[19].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Eventfd); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[20].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Clone); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[21].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Bind); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[22].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Accept); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[23].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*TimerfdCreate); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[24].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Timespec); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[25].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*ItimerSpec); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[26].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*TimerfdSetTime); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[27].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*TimerfdGetTime); 
i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[28].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Fork); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[29].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*InotifyInit); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[30].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*InotifyAddWatch); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[31].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*InotifyRmWatch); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_seccheck_points_syscall_proto_msgTypes[32].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*SocketPair); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_sentry_seccheck_points_syscall_proto_rawDesc, NumEnums: 0, NumMessages: 33, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_sentry_seccheck_points_syscall_proto_goTypes, DependencyIndexes: file_pkg_sentry_seccheck_points_syscall_proto_depIdxs, MessageInfos: file_pkg_sentry_seccheck_points_syscall_proto_msgTypes, }.Build() File_pkg_sentry_seccheck_points_syscall_proto = out.File file_pkg_sentry_seccheck_points_syscall_proto_rawDesc = nil file_pkg_sentry_seccheck_points_syscall_proto_goTypes = nil file_pkg_sentry_seccheck_points_syscall_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/seccheck.go000066400000000000000000000233601465435605700246230ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package seccheck defines a structure for dynamically-configured security // checks in the sentry. package seccheck import ( "google.golang.org/protobuf/proto" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" "gvisor.dev/gvisor/pkg/sync" ) // A Point represents a checkpoint, a point at which a security check occurs. type Point uint // PointX represents the checkpoint X. const ( totalPoints = int(pointLengthBeforeSyscalls) + syscallPoints numPointsPerUint32 = 32 numPointBitmaskUint32s = (totalPoints-1)/numPointsPerUint32 + 1 ) // FieldSet contains all optional fields to be collected by a given Point. 
type FieldSet struct { // Local indicates which optional fields from the Point that needs to be // collected, e.g. resolving path from an FD, or collecting a large field. Local FieldMask // Context indicates which optional fields from the Context that needs to be // collected, e.g. PID, credentials, current time. Context FieldMask } // Field represents the index of a single optional field to be collect for a // Point. type Field uint // FieldMask is a bitmask with a single bit representing an optional field to be // collected. The meaning of each bit varies per point. The mask is currently // limited to 64 fields. If more are needed, FieldMask can be expanded to // support additional fields. type FieldMask struct { mask uint64 } // MakeFieldMask creates a FieldMask from a set of Fields. func MakeFieldMask(fields ...Field) FieldMask { var m FieldMask for _, field := range fields { m.Add(field) } return m } // Contains returns true if the mask contains the Field. func (fm *FieldMask) Contains(field Field) bool { return fm.mask&(1<= pointLengthBeforeSyscalls { updateSyscalls = true } s.pointFields[req.Pt] = req.Fields } if updateSyscalls { for _, listener := range s.syscallFlagListeners { listener.UpdateSecCheck(s) } } } func (s *State) clearSink() { s.registrationMu.Lock() defer s.registrationMu.Unlock() updateSyscalls := false for i := range s.enabledPoints { s.enabledPoints[i].Store(0) // We use i+1 here because we want to check the last bit that may have been changed within i. if Point((i+1)*numPointsPerUint32) >= pointLengthBeforeSyscalls { updateSyscalls = true } } if updateSyscalls { for _, listener := range s.syscallFlagListeners { listener.UpdateSecCheck(s) } } s.pointFields = nil oldSinks := s.getSinks() s.registrationSeq.BeginWrite() s.sinks = nil s.registrationSeq.EndWrite() for _, sink := range oldSinks { sink.Stop() } } // AddSyscallFlagListener adds a listener to the State. // // The listener will be notified whenever syscall point enablement changes. func (s *State) AddSyscallFlagListener(listener SyscallFlagListener) { s.registrationMu.Lock() defer s.registrationMu.Unlock() s.syscallFlagListeners = append(s.syscallFlagListeners, listener) } // Enabled returns true if any Sink is registered for the given checkpoint. func (s *State) Enabled(p Point) bool { word, bit := p/numPointsPerUint32, p%numPointsPerUint32 if int(word) >= len(s.enabledPoints) { return false } return s.enabledPoints[word].Load()&(uint32(1)< r.maxBackoff { return nil, fmt.Errorf("initial backoff (%v) cannot be larger than max backoff (%v)", r.initialBackoff, r.maxBackoff) } log.Debugf("Remote sink created, endpoint FD: %d, %+v", r.endpoint.FD(), r) return r, nil } func (*remote) Name() string { return name } func (r *remote) Status() seccheck.SinkStatus { return seccheck.SinkStatus{ DroppedCount: uint64(r.droppedCount.Load()), } } // Stop implements seccheck.Sink. func (r *remote) Stop() { if r.endpoint != nil { // It's possible to race with Point firing, but in the worst case they will // simply fail to be delivered. 
		r.endpoint.Close()
	}
}

func (r *remote) write(msg proto.Message, msgType pb.MessageType) {
	out, err := proto.Marshal(msg)
	if err != nil {
		log.Debugf("Marshal(%+v): %v", msg, err)
		return
	}
	hdr := wire.Header{
		HeaderSize:   uint16(wire.HeaderStructSize),
		DroppedCount: r.droppedCount.Load(),
		MessageType:  uint16(msgType),
	}
	var hdrOut [wire.HeaderStructSize]byte
	hdr.MarshalUnsafe(hdrOut[:])
	backoff := r.initialBackoff
	for i := 0; ; i++ {
		_, err := unix.Writev(r.endpoint.FD(), [][]byte{hdrOut[:], out})
		if err == nil {
			// Write succeeded, we're done!
			return
		}
		if !errors.Is(err, unix.EAGAIN) || i >= r.retries {
			log.Debugf("Write failed, dropping point: %v", err)
			r.droppedCount.Add(1)
			return
		}
		log.Debugf("Write failed, retrying (%d/%d) in %v: %v", i+1, r.retries, backoff, err)
		time.Sleep(backoff)
		backoff *= 2
		if r.maxBackoff > 0 && backoff > r.maxBackoff {
			backoff = r.maxBackoff
		}
	}
}

// Clone implements seccheck.Sink.
func (r *remote) Clone(_ context.Context, _ seccheck.FieldSet, info *pb.CloneInfo) error {
	r.write(info, pb.MessageType_MESSAGE_SENTRY_CLONE)
	return nil
}

// Execve implements seccheck.Sink.
func (r *remote) Execve(_ context.Context, _ seccheck.FieldSet, info *pb.ExecveInfo) error {
	r.write(info, pb.MessageType_MESSAGE_SENTRY_EXEC)
	return nil
}

// ExitNotifyParent implements seccheck.Sink.
func (r *remote) ExitNotifyParent(_ context.Context, _ seccheck.FieldSet, info *pb.ExitNotifyParentInfo) error {
	r.write(info, pb.MessageType_MESSAGE_SENTRY_EXIT_NOTIFY_PARENT)
	return nil
}

// TaskExit implements seccheck.Sink.
func (r *remote) TaskExit(_ context.Context, _ seccheck.FieldSet, info *pb.TaskExit) error {
	r.write(info, pb.MessageType_MESSAGE_SENTRY_TASK_EXIT)
	return nil
}

// ContainerStart implements seccheck.Sink.
func (r *remote) ContainerStart(_ context.Context, _ seccheck.FieldSet, info *pb.Start) error {
	r.write(info, pb.MessageType_MESSAGE_CONTAINER_START)
	return nil
}

// RawSyscall implements seccheck.Sink.
func (r *remote) RawSyscall(_ context.Context, _ seccheck.FieldSet, info *pb.Syscall) error {
	r.write(info, pb.MessageType_MESSAGE_SYSCALL_RAW)
	return nil
}

// Syscall implements seccheck.Sink.
func (r *remote) Syscall(ctx context.Context, fields seccheck.FieldSet, ctxData *pb.ContextData, msgType pb.MessageType, msg proto.Message) error {
	r.write(msg, msgType)
	return nil
}
golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/sinks/remote/remote_state_autogen.go000066400000000000000000000000701465435605700317030ustar00rootroot00000000000000
// automatically generated by stateify.

package remote
golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/sinks/remote/wire/000077500000000000000000000000001465435605700261105ustar00rootroot00000000000000
golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/sinks/remote/wire/wire.go000066400000000000000000000034571465435605700274140ustar00rootroot00000000000000
// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package wire defines structs used in the wire format for the remote checker.
package wire // CurrentVersion is the current wire and protocol version. const CurrentVersion = 1 // HeaderStructSize size of header struct in bytes. const HeaderStructSize = 8 // Header is used to describe the message being sent to the remote process. // // 0 --------- 16 ---------- 32 ----------- 64 -----------+ // | HeaderSize | MessageType | DroppedCount | Payload... | // +---- 16 ----+---- 16 -----+----- 32 -----+------------+ // // +marshal type Header struct { // HeaderSize is the size of the header in bytes. The payload comes // immediately after the header. The length is needed to allow the header to // expand in the future without breaking remotes that do not yet understand // the new fields. HeaderSize uint16 // MessageType describes the payload. It must be one of the pb.MessageType // values and determine how the payload is interpreted. This is more efficient // than using protobuf.Any because Any uses the full protobuf name to identify // the type. MessageType uint16 // DroppedCount is the number of points that failed to be written and had to // be dropped. It wraps around after max(uint32). DroppedCount uint32 } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/sinks/remote/wire/wire_abi_autogen_unsafe.go000066400000000000000000000100371465435605700333040ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package wire import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*Header)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. func (h *Header) SizeBytes() int { return 8 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (h *Header) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(h.HeaderSize)) dst = dst[2:] hostarch.ByteOrder.PutUint16(dst[:2], uint16(h.MessageType)) dst = dst[2:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(h.DroppedCount)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (h *Header) UnmarshalBytes(src []byte) []byte { h.HeaderSize = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] h.MessageType = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] h.DroppedCount = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (h *Header) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (h *Header) MarshalUnsafe(dst []byte) []byte { size := h.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(h), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (h *Header) UnmarshalUnsafe(src []byte) []byte { size := h.SizeBytes() gohacks.Memmove(unsafe.Pointer(h), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (h *Header) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(h))) hdr.Len = h.SizeBytes() hdr.Cap = h.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. 
// Since we bypassed the compiler's escape analysis, indicate that h // must live until the use above. runtime.KeepAlive(h) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (h *Header) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return h.CopyOutN(cc, addr, h.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (h *Header) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(h))) hdr.Len = h.SizeBytes() hdr.Cap = h.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that h // must live until the use above. runtime.KeepAlive(h) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (h *Header) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return h.CopyInN(cc, addr, h.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (h *Header) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(h))) hdr.Len = h.SizeBytes() hdr.Cap = h.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that h // must live until the use above. runtime.KeepAlive(h) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/sinks/remote/wire/wire_state_autogen.go000066400000000000000000000000661465435605700323310ustar00rootroot00000000000000// automatically generated by stateify. package wire golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/seccheck/syscall.go000066400000000000000000000057771465435605700245410ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package seccheck import ( "gvisor.dev/gvisor/pkg/abi/sentry" ) // SyscallType is an enum that denotes different types of syscall points. There // are 2 types of syscall point: fully-schematized and raw. Schematizes are // points that have syscall specific format, e.g. open => {path, flags, mode}. // Raw uses a generic schema that contains syscall number and 6 arguments. Each // of these type have a corresponding enter and exit points. Exit points include // return value and errno information. type SyscallType int const ( // SyscallEnter represents schematized/enter syscall. SyscallEnter SyscallType = iota // SyscallExit represents schematized/exit syscall. SyscallExit // SyscallRawEnter represents raw/enter syscall. SyscallRawEnter // SyscallRawExit represents raw/exit syscall. 
SyscallRawExit syscallTypesCount ) // SyscallFlagListener is an interface that is notified when syscall point enablement changes. // // It is used to notify the kernel's syscall table about syscall points, without introducing a // direct dependency on it. type SyscallFlagListener interface { // UpdateSecCheck is called each time the system call point enablement may have changed. // This is called with seccheck.State.mu held, so it is expected to be fast and not re-entrant // with seccheck.State functions that attempt to re-lock it. UpdateSecCheck(state *State) } const ( syscallPoints = (sentry.MaxSyscallNum + 1) * int(syscallTypesCount) ) // Fields that are common for many syscalls. const ( // FieldSyscallPath is an optional field to collect path from an FD. Given // that many syscalls operate on FDs, this const is used across syscalls. FieldSyscallPath Field = iota ) // Fields for execve*(2) syscalls. const ( // FieldSyscallExecveEnvv is an optional field to collect list of environment // variables. Start after FieldSyscallPath because execveat(2) can collect // path from FD. FieldSyscallExecveEnvv = FieldSyscallPath + 1 ) // GetPointForSyscall translates the syscall number to the corresponding Point. func GetPointForSyscall(typ SyscallType, sysno uintptr) Point { return Point(sysno)*Point(syscallTypesCount) + Point(typ) + pointLengthBeforeSyscalls } // SyscallEnabled checks if the corresponding point for the syscall is enabled. func (s *State) SyscallEnabled(typ SyscallType, sysno uintptr) bool { // Prevent overflow. if sysno >= sentry.MaxSyscallNum { return false } return s.Enabled(GetPointForSyscall(typ, sysno)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/000077500000000000000000000000001465435605700222405ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/control/000077500000000000000000000000001465435605700237205ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/control/control.go000066400000000000000000000512241465435605700257330ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package control provides internal representations of socket control // messages. package control import ( "math" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // SCMCredentials represents a SCM_CREDENTIALS socket control message. type SCMCredentials interface { transport.CredentialsControlMessage // Credentials returns properly namespaced values for the sender's pid, uid // and gid. 
Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) } // scmCredentials represents an SCM_CREDENTIALS socket control message. // // +stateify savable type scmCredentials struct { t *kernel.Task kuid auth.KUID kgid auth.KGID } // NewSCMCredentials creates a new SCM_CREDENTIALS socket control message // representation. func NewSCMCredentials(t *kernel.Task, cred linux.ControlMessageCredentials) (SCMCredentials, error) { tcred := t.Credentials() kuid, err := tcred.UseUID(auth.UID(cred.UID)) if err != nil { return nil, err } kgid, err := tcred.UseGID(auth.GID(cred.GID)) if err != nil { return nil, err } if kernel.ThreadID(cred.PID) != t.ThreadGroup().ID() && !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.PIDNamespace().UserNamespace()) { return nil, linuxerr.EPERM } return &scmCredentials{t, kuid, kgid}, nil } // Equals implements transport.CredentialsControlMessage.Equals. func (c *scmCredentials) Equals(oc transport.CredentialsControlMessage) bool { if oc, _ := oc.(*scmCredentials); oc != nil && *c == *oc { return true } return false } func putUint64(buf []byte, n uint64) []byte { hostarch.ByteOrder.PutUint64(buf[len(buf):len(buf)+8], n) return buf[:len(buf)+8] } func putUint32(buf []byte, n uint32) []byte { hostarch.ByteOrder.PutUint32(buf[len(buf):len(buf)+4], n) return buf[:len(buf)+4] } // putCmsg writes a control message header and as much data as will fit into // the unused capacity of a buffer. func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([]byte, int) { space := bits.AlignDown(cap(buf)-len(buf), 4) // We can't write to space that doesn't exist, so if we are going to align // the available space, we must align down. // // align must be >= 4 and each data int32 is 4 bytes. The length of the // header is already aligned, so if we align to the width of the data there // are two cases: // 1. The aligned length is less than the length of the header. The // unaligned length was also less than the length of the header, so we // can't write anything. // 2. The aligned length is greater than or equal to the length of the // header. We can write the header plus zero or more bytes of data. We can't // write a partial int32, so the length of the message will be // min(aligned length, header + data). if space < linux.SizeOfControlMessageHeader { flags |= linux.MSG_CTRUNC return buf, flags } length := 4*len(data) + linux.SizeOfControlMessageHeader if length > space { length = space } buf = putUint64(buf, uint64(length)) buf = putUint32(buf, linux.SOL_SOCKET) buf = putUint32(buf, msgType) for _, d := range data { if len(buf)+4 > cap(buf) { flags |= linux.MSG_CTRUNC break } buf = putUint32(buf, uint32(d)) } return alignSlice(buf, align), flags } func putCmsgStruct(buf []byte, msgLevel, msgType uint32, align uint, data marshal.Marshallable) []byte { if cap(buf)-len(buf) < linux.SizeOfControlMessageHeader { return buf } ob := buf buf = putUint64(buf, uint64(linux.SizeOfControlMessageHeader)) buf = putUint32(buf, msgLevel) buf = putUint32(buf, msgType) hdrBuf := buf buf = append(buf, marshal.Marshal(data)...) // If the control message data brought us over capacity, omit it. if cap(buf) != cap(ob) { return hdrBuf } // Update control message length to include data. putUint64(ob, uint64(len(buf)-len(ob))) return alignSlice(buf, align) } // Credentials implements SCMCredentials.Credentials. 
func (c *scmCredentials) Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) { // "When a process's user and group IDs are passed over a UNIX domain // socket to a process in a different user namespace (see the description // of SCM_CREDENTIALS in unix(7)), they are translated into the // corresponding values as per the receiving process's user and group ID // mappings." - user_namespaces(7) pid := t.PIDNamespace().IDOfTask(c.t) uid := c.kuid.In(t.UserNamespace()).OrOverflow() gid := c.kgid.In(t.UserNamespace()).OrOverflow() return pid, uid, gid } // PackCredentials packs the credentials in the control message (or default // credentials if none) into a buffer. func PackCredentials(t *kernel.Task, creds SCMCredentials, buf []byte, flags int) ([]byte, int) { align := t.Arch().Width() // Default credentials if none are available. pid := kernel.ThreadID(0) uid := auth.UID(auth.NobodyKUID) gid := auth.GID(auth.NobodyKGID) if creds != nil { pid, uid, gid = creds.Credentials(t) } c := []int32{int32(pid), int32(uid), int32(gid)} return putCmsg(buf, flags, linux.SCM_CREDENTIALS, align, c) } // alignSlice extends a slice's length (up to the capacity) to align it. func alignSlice(buf []byte, align uint) []byte { aligned := bits.AlignUp(len(buf), align) if aligned > cap(buf) { // Linux allows unaligned data if there isn't room for alignment. // Since there isn't room for alignment, there isn't room for any // additional messages either. return buf } return buf[:aligned] } // PackTimestamp packs a SO_TIMESTAMP socket control message. func PackTimestamp(t *kernel.Task, timestamp time.Time, buf []byte) []byte { timestampP := linux.NsecToTimeval(timestamp.UnixNano()) return putCmsgStruct( buf, linux.SOL_SOCKET, linux.SO_TIMESTAMP, t.Arch().Width(), ×tampP, ) } // PackInq packs a TCP_INQ socket control message. func PackInq(t *kernel.Task, inq int32, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_TCP, linux.TCP_INQ, t.Arch().Width(), primitive.AllocateInt32(inq), ) } // PackTOS packs an IP_TOS socket control message. func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IP, linux.IP_TOS, t.Arch().Width(), primitive.AllocateUint8(tos), ) } // PackTClass packs an IPV6_TCLASS socket control message. func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IPV6, linux.IPV6_TCLASS, t.Arch().Width(), primitive.AllocateUint32(tClass), ) } // PackTTL packs an IP_TTL socket control message. func PackTTL(t *kernel.Task, ttl uint32, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IP, linux.IP_TTL, t.Arch().Width(), primitive.AllocateUint32(ttl), ) } // PackHopLimit packs an IPV6_HOPLIMIT socket control message. func PackHopLimit(t *kernel.Task, hoplimit uint32, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IPV6, linux.IPV6_HOPLIMIT, t.Arch().Width(), primitive.AllocateUint32(hoplimit), ) } // PackIPPacketInfo packs an IP_PKTINFO socket control message. func PackIPPacketInfo(t *kernel.Task, packetInfo *linux.ControlMessageIPPacketInfo, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IP, linux.IP_PKTINFO, t.Arch().Width(), packetInfo, ) } // PackIPv6PacketInfo packs an IPV6_PKTINFO socket control message. 
func PackIPv6PacketInfo(t *kernel.Task, packetInfo *linux.ControlMessageIPv6PacketInfo, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IPV6, linux.IPV6_PKTINFO, t.Arch().Width(), packetInfo, ) } // PackOriginalDstAddress packs an IP_RECVORIGINALDSTADDR socket control message. func PackOriginalDstAddress(t *kernel.Task, originalDstAddress linux.SockAddr, buf []byte) []byte { var level uint32 var optType uint32 switch originalDstAddress.(type) { case *linux.SockAddrInet: level = linux.SOL_IP optType = linux.IP_RECVORIGDSTADDR case *linux.SockAddrInet6: level = linux.SOL_IPV6 optType = linux.IPV6_RECVORIGDSTADDR default: panic("invalid address type, must be an IP address for IP_RECVORIGINALDSTADDR cmsg") } return putCmsgStruct( buf, level, optType, t.Arch().Width(), originalDstAddress) } // PackSockExtendedErr packs an IP*_RECVERR socket control message. func PackSockExtendedErr(t *kernel.Task, sockErr linux.SockErrCMsg, buf []byte) []byte { return putCmsgStruct( buf, sockErr.CMsgLevel(), sockErr.CMsgType(), t.Arch().Width(), sockErr, ) } // PackControlMessages packs control messages into the given buffer. // // We skip control messages specific to Unix domain sockets. // // Note that some control messages may be truncated if they do not fit under // the capacity of buf. func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byte) []byte { if cmsgs.IP.HasTimestamp { buf = PackTimestamp(t, cmsgs.IP.Timestamp, buf) } if cmsgs.IP.HasInq { // In Linux, TCP_CM_INQ is added after SO_TIMESTAMP. buf = PackInq(t, cmsgs.IP.Inq, buf) } if cmsgs.IP.HasTOS { buf = PackTOS(t, cmsgs.IP.TOS, buf) } if cmsgs.IP.HasTTL { buf = PackTTL(t, cmsgs.IP.TTL, buf) } if cmsgs.IP.HasTClass { buf = PackTClass(t, cmsgs.IP.TClass, buf) } if cmsgs.IP.HasHopLimit { buf = PackHopLimit(t, cmsgs.IP.HopLimit, buf) } if cmsgs.IP.HasIPPacketInfo { buf = PackIPPacketInfo(t, &cmsgs.IP.PacketInfo, buf) } if cmsgs.IP.HasIPv6PacketInfo { buf = PackIPv6PacketInfo(t, &cmsgs.IP.IPv6PacketInfo, buf) } if cmsgs.IP.OriginalDstAddress != nil { buf = PackOriginalDstAddress(t, cmsgs.IP.OriginalDstAddress, buf) } if cmsgs.IP.SockErr != nil { buf = PackSockExtendedErr(t, cmsgs.IP.SockErr, buf) } return buf } // cmsgSpace is equivalent to CMSG_SPACE in Linux. func cmsgSpace(t *kernel.Task, dataLen int) int { return linux.SizeOfControlMessageHeader + bits.AlignUp(dataLen, t.Arch().Width()) } // CmsgsSpace returns the number of bytes needed to fit the control messages // represented in cmsgs. func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int { space := 0 if cmsgs.IP.HasTimestamp { space += cmsgSpace(t, linux.SizeOfTimeval) } if cmsgs.IP.HasInq { space += cmsgSpace(t, linux.SizeOfControlMessageInq) } if cmsgs.IP.HasTOS { space += cmsgSpace(t, linux.SizeOfControlMessageTOS) } if cmsgs.IP.HasTTL { space += cmsgSpace(t, linux.SizeOfControlMessageTTL) } if cmsgs.IP.HasTClass { space += cmsgSpace(t, linux.SizeOfControlMessageTClass) } if cmsgs.IP.HasHopLimit { space += cmsgSpace(t, linux.SizeOfControlMessageHopLimit) } if cmsgs.IP.HasIPPacketInfo { space += cmsgSpace(t, linux.SizeOfControlMessageIPPacketInfo) } if cmsgs.IP.HasIPv6PacketInfo { space += cmsgSpace(t, linux.SizeOfControlMessageIPv6PacketInfo) } if cmsgs.IP.OriginalDstAddress != nil { space += cmsgSpace(t, cmsgs.IP.OriginalDstAddress.SizeBytes()) } if cmsgs.IP.SockErr != nil { space += cmsgSpace(t, cmsgs.IP.SockErr.SizeBytes()) } return space } // Parse parses a raw socket control message into portable objects. 
// TODO(https://gvisor.dev/issue/7188): Parse is only called on raw cmsg that // are used when sending a messages. We should fail with EINVAL when we find a // non-sendable control messages (such as IP_RECVERR). And the function should // be renamed to reflect that. func Parse(t *kernel.Task, socketOrEndpoint any, buf []byte, width uint) (socket.ControlMessages, error) { var ( cmsgs socket.ControlMessages fds []primitive.Int32 ) for len(buf) > 0 { if linux.SizeOfControlMessageHeader > len(buf) { return cmsgs, linuxerr.EINVAL } var h linux.ControlMessageHeader buf = h.UnmarshalUnsafe(buf) if h.Length < uint64(linux.SizeOfControlMessageHeader) { return socket.ControlMessages{}, linuxerr.EINVAL } length := int(h.Length) - linux.SizeOfControlMessageHeader if length < 0 || length > len(buf) { return socket.ControlMessages{}, linuxerr.EINVAL } switch h.Level { case linux.SOL_SOCKET: switch h.Type { case linux.SCM_RIGHTS: rightsSize := bits.AlignDown(length, linux.SizeOfControlMessageRight) numRights := rightsSize / linux.SizeOfControlMessageRight if len(fds)+numRights > linux.SCM_MAX_FD { return socket.ControlMessages{}, linuxerr.EINVAL } curFDs := make([]primitive.Int32, numRights) primitive.UnmarshalUnsafeInt32Slice(curFDs, buf[:rightsSize]) fds = append(fds, curFDs...) case linux.SCM_CREDENTIALS: if length < linux.SizeOfControlMessageCredentials { return socket.ControlMessages{}, linuxerr.EINVAL } var creds linux.ControlMessageCredentials creds.UnmarshalUnsafe(buf) scmCreds, err := NewSCMCredentials(t, creds) if err != nil { return socket.ControlMessages{}, err } cmsgs.Unix.Credentials = scmCreds case linux.SO_TIMESTAMP: if length < linux.SizeOfTimeval { return socket.ControlMessages{}, linuxerr.EINVAL } var ts linux.Timeval ts.UnmarshalUnsafe(buf) cmsgs.IP.Timestamp = ts.ToTime() cmsgs.IP.HasTimestamp = true default: // Unknown message type. 
return socket.ControlMessages{}, linuxerr.EINVAL } case linux.SOL_IP: switch h.Type { case linux.IP_TOS: if length < linux.SizeOfControlMessageTOS { return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasTOS = true var tos primitive.Uint8 tos.UnmarshalUnsafe(buf) cmsgs.IP.TOS = uint8(tos) case linux.IP_TTL: if length < linux.SizeOfControlMessageTTL { return socket.ControlMessages{}, linuxerr.EINVAL } var ttl primitive.Uint32 ttl.UnmarshalUnsafe(buf) if ttl == 0 || ttl > math.MaxUint8 { return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.TTL = uint32(ttl) cmsgs.IP.HasTTL = true case linux.IP_PKTINFO: if length < linux.SizeOfControlMessageIPPacketInfo { return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasIPPacketInfo = true var packetInfo linux.ControlMessageIPPacketInfo packetInfo.UnmarshalUnsafe(buf) cmsgs.IP.PacketInfo = packetInfo case linux.IP_RECVORIGDSTADDR: var addr linux.SockAddrInet if length < addr.SizeBytes() { return socket.ControlMessages{}, linuxerr.EINVAL } addr.UnmarshalUnsafe(buf) cmsgs.IP.OriginalDstAddress = &addr case linux.IP_RECVERR: var errCmsg linux.SockErrCMsgIPv4 if length < errCmsg.SizeBytes() { return socket.ControlMessages{}, linuxerr.EINVAL } errCmsg.UnmarshalBytes(buf) cmsgs.IP.SockErr = &errCmsg default: return socket.ControlMessages{}, linuxerr.EINVAL } case linux.SOL_IPV6: switch h.Type { case linux.IPV6_TCLASS: if length < linux.SizeOfControlMessageTClass { return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasTClass = true var tclass primitive.Uint32 tclass.UnmarshalUnsafe(buf) cmsgs.IP.TClass = uint32(tclass) case linux.IPV6_PKTINFO: if length < linux.SizeOfControlMessageIPv6PacketInfo { return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasIPv6PacketInfo = true var packetInfo linux.ControlMessageIPv6PacketInfo packetInfo.UnmarshalUnsafe(buf) cmsgs.IP.IPv6PacketInfo = packetInfo case linux.IPV6_HOPLIMIT: if length < linux.SizeOfControlMessageHopLimit { return socket.ControlMessages{}, linuxerr.EINVAL } var hoplimit primitive.Uint32 hoplimit.UnmarshalUnsafe(buf) if hoplimit > math.MaxUint8 { return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasHopLimit = true cmsgs.IP.HopLimit = uint32(hoplimit) case linux.IPV6_RECVORIGDSTADDR: var addr linux.SockAddrInet6 if length < addr.SizeBytes() { return socket.ControlMessages{}, linuxerr.EINVAL } addr.UnmarshalUnsafe(buf) cmsgs.IP.OriginalDstAddress = &addr case linux.IPV6_RECVERR: var errCmsg linux.SockErrCMsgIPv6 if length < errCmsg.SizeBytes() { return socket.ControlMessages{}, linuxerr.EINVAL } errCmsg.UnmarshalBytes(buf) cmsgs.IP.SockErr = &errCmsg default: return socket.ControlMessages{}, linuxerr.EINVAL } default: return socket.ControlMessages{}, linuxerr.EINVAL } if shift := bits.AlignUp(length, width); shift > len(buf) { buf = buf[:0] } else { buf = buf[shift:] } } if cmsgs.Unix.Credentials == nil { cmsgs.Unix.Credentials = makeCreds(t, socketOrEndpoint) } if len(fds) > 0 { rights, err := NewSCMRights(t, fds) if err != nil { return socket.ControlMessages{}, err } cmsgs.Unix.Rights = rights } return cmsgs, nil } func makeCreds(t *kernel.Task, socketOrEndpoint any) SCMCredentials { if t == nil || socketOrEndpoint == nil { return nil } if cr, ok := socketOrEndpoint.(transport.Credentialer); ok && (cr.Passcred() || cr.ConnectedPasscred()) { return MakeCreds(t) } return nil } // MakeCreds creates default SCMCredentials. 
func MakeCreds(t *kernel.Task) SCMCredentials { if t == nil { return nil } tcred := t.Credentials() return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID} } // New creates default control messages if needed. func New(t *kernel.Task, socketOrEndpoint any) transport.ControlMessages { return transport.ControlMessages{ Credentials: makeCreds(t, socketOrEndpoint), } } // SCMRights represents a SCM_RIGHTS socket control message. // // +stateify savable type SCMRights interface { transport.RightsControlMessage // Files returns up to max RightsFiles. // // Returned files are consumed and ownership is transferred to the caller. // Subsequent calls to Files will return the next files. Files(ctx context.Context, max int) (rf RightsFiles, truncated bool) } // RightsFiles represents a SCM_RIGHTS socket control message. A reference // is maintained for each vfs.FileDescription and is release either when an FD // is created or when the Release method is called. // // +stateify savable type RightsFiles []*vfs.FileDescription // NewSCMRights creates a new SCM_RIGHTS socket control message // representation using local sentry FDs. func NewSCMRights(t *kernel.Task, fds []primitive.Int32) (SCMRights, error) { files := make(RightsFiles, 0, len(fds)) for _, fd := range fds { file := t.GetFile(int32(fd)) if file == nil { files.Release(t) return nil, linuxerr.EBADF } files = append(files, file) } return &files, nil } // Files implements SCMRights.Files. func (fs *RightsFiles) Files(ctx context.Context, max int) (RightsFiles, bool) { n := max var trunc bool if l := len(*fs); n > l { n = l } else if n < l { trunc = true } rf := (*fs)[:n] *fs = (*fs)[n:] return rf, trunc } // Clone implements transport.RightsControlMessage.Clone. func (fs *RightsFiles) Clone() transport.RightsControlMessage { nfs := append(RightsFiles(nil), *fs...) for _, nf := range nfs { nf.IncRef() } return &nfs } // Release implements transport.RightsControlMessage.Release. func (fs *RightsFiles) Release(ctx context.Context) { for _, f := range *fs { f.DecRef(ctx) } *fs = nil } // rightsFDs gets up to the specified maximum number of FDs. func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32, bool) { files, trunc := rights.Files(t, max) fds := make([]int32, 0, len(files)) for i := 0; i < max && len(files) > 0; i++ { fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{ CloseOnExec: cloexec, }) files[0].DecRef(t) files = files[1:] if err != nil { t.Warningf("Error inserting FD: %v", err) // This is what Linux does. break } fds = append(fds, int32(fd)) } return fds, trunc } // PackRights packs as many FDs as will fit into the unused capacity of buf. func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte, flags int) ([]byte, int) { maxFDs := (cap(buf) - len(buf) - linux.SizeOfControlMessageHeader) / 4 // Linux does not return any FDs if none fit. if maxFDs <= 0 { flags |= linux.MSG_CTRUNC return buf, flags } fds, trunc := rightsFDs(t, rights, cloexec, maxFDs) if trunc { flags |= linux.MSG_CTRUNC } align := t.Arch().Width() return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/control/control_state_autogen.go000066400000000000000000000021061465435605700306500ustar00rootroot00000000000000// automatically generated by stateify. 
package control import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (c *scmCredentials) StateTypeName() string { return "pkg/sentry/socket/control.scmCredentials" } func (c *scmCredentials) StateFields() []string { return []string{ "t", "kuid", "kgid", } } func (c *scmCredentials) beforeSave() {} // +checklocksignore func (c *scmCredentials) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.t) stateSinkObject.Save(1, &c.kuid) stateSinkObject.Save(2, &c.kgid) } func (c *scmCredentials) afterLoad(context.Context) {} // +checklocksignore func (c *scmCredentials) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.t) stateSourceObject.Load(1, &c.kuid) stateSourceObject.Load(2, &c.kgid) } func (fs *RightsFiles) StateTypeName() string { return "pkg/sentry/socket/control.RightsFiles" } func (fs *RightsFiles) StateFields() []string { return nil } func init() { state.Register((*scmCredentials)(nil)) state.Register((*RightsFiles)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/hostinet/000077500000000000000000000000001465435605700240755ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/hostinet/hostinet.go000066400000000000000000000013051465435605700262600ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package hostinet implements AF_INET and AF_INET6 sockets using the host's // network stack. package hostinet golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/hostinet/hostinet_impl_state_autogen.go000066400000000000000000000001361465435605700322240ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package hostinet golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/hostinet/hostinet_state_autogen.go000066400000000000000000000032741465435605700312110ustar00rootroot00000000000000// automatically generated by stateify. 
package hostinet import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (s *Socket) StateTypeName() string { return "pkg/sentry/socket/hostinet.Socket" } func (s *Socket) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "LockFD", "DentryMetadataFileDescriptionImpl", "SendReceiveTimeout", "family", "stype", "protocol", "queue", "fd", "recvClosed", } } func (s *Socket) beforeSave() {} // +checklocksignore func (s *Socket) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.vfsfd) stateSinkObject.Save(1, &s.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &s.LockFD) stateSinkObject.Save(3, &s.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(4, &s.SendReceiveTimeout) stateSinkObject.Save(5, &s.family) stateSinkObject.Save(6, &s.stype) stateSinkObject.Save(7, &s.protocol) stateSinkObject.Save(8, &s.queue) stateSinkObject.Save(9, &s.fd) stateSinkObject.Save(10, &s.recvClosed) } func (s *Socket) afterLoad(context.Context) {} // +checklocksignore func (s *Socket) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.vfsfd) stateSourceObject.Load(1, &s.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &s.LockFD) stateSourceObject.Load(3, &s.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(4, &s.SendReceiveTimeout) stateSourceObject.Load(5, &s.family) stateSourceObject.Load(6, &s.stype) stateSourceObject.Load(7, &s.protocol) stateSourceObject.Load(8, &s.queue) stateSourceObject.Load(9, &s.fd) stateSourceObject.Load(10, &s.recvClosed) } func init() { state.Register((*Socket)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/hostinet/hostinet_unsafe_state_autogen.go000066400000000000000000000000721465435605700325430ustar00rootroot00000000000000// automatically generated by stateify. package hostinet golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/hostinet/netlink.go000066400000000000000000000215321465435605700260730ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package hostinet import ( "bytes" "fmt" "syscall" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/tcpip" ) func getInterfaces() (map[int32]inet.Interface, error) { data, err := syscall.NetlinkRIB(unix.RTM_GETLINK, syscall.AF_UNSPEC) if err != nil { return nil, err } msgs, err := syscall.ParseNetlinkMessage(data) if err != nil { return nil, err } ifs := make(map[int32]inet.Interface, len(msgs)) for _, msg := range msgs { if msg.Header.Type != unix.RTM_NEWLINK { continue } if len(msg.Data) < unix.SizeofIfInfomsg { return nil, fmt.Errorf("RTM_GETLINK returned RTM_NEWLINK message with invalid data length (%d bytes, expected at least %d bytes)", len(msg.Data), unix.SizeofIfInfomsg) } var ifinfo linux.InterfaceInfoMessage ifinfo.UnmarshalUnsafe(msg.Data) inetIF := inet.Interface{ DeviceType: ifinfo.Type, Flags: ifinfo.Flags, } // Not clearly documented: syscall.ParseNetlinkRouteAttr will check the // syscall.NetlinkMessage.Header.Type and skip the struct ifinfomsg // accordingly. attrs, err := syscall.ParseNetlinkRouteAttr(&msg) if err != nil { return nil, fmt.Errorf("RTM_GETLINK returned RTM_NEWLINK message with invalid rtattrs: %v", err) } for _, attr := range attrs { switch attr.Attr.Type { case unix.IFLA_ADDRESS: inetIF.Addr = attr.Value case unix.IFLA_IFNAME: inetIF.Name = string(attr.Value[:len(attr.Value)-1]) } } ifs[ifinfo.Index] = inetIF } return ifs, nil } func getInterfaceAddrs() (map[int32][]inet.InterfaceAddr, error) { data, err := syscall.NetlinkRIB(unix.RTM_GETADDR, syscall.AF_UNSPEC) if err != nil { return nil, err } msgs, err := syscall.ParseNetlinkMessage(data) if err != nil { return nil, err } addrs := make(map[int32][]inet.InterfaceAddr, len(msgs)) for _, msg := range msgs { if msg.Header.Type != unix.RTM_NEWADDR { continue } if len(msg.Data) < unix.SizeofIfAddrmsg { return nil, fmt.Errorf("RTM_GETADDR returned RTM_NEWADDR message with invalid data length (%d bytes, expected at least %d bytes)", len(msg.Data), unix.SizeofIfAddrmsg) } var ifaddr linux.InterfaceAddrMessage ifaddr.UnmarshalUnsafe(msg.Data) inetAddr := inet.InterfaceAddr{ Family: ifaddr.Family, PrefixLen: ifaddr.PrefixLen, Flags: ifaddr.Flags, } attrs, err := syscall.ParseNetlinkRouteAttr(&msg) if err != nil { return nil, fmt.Errorf("RTM_GETADDR returned RTM_NEWADDR message with invalid rtattrs: %v", err) } for _, attr := range attrs { switch attr.Attr.Type { case unix.IFA_ADDRESS: inetAddr.Addr = attr.Value } } addrs[int32(ifaddr.Index)] = append(addrs[int32(ifaddr.Index)], inetAddr) } return addrs, nil } func getRoutes() ([]inet.Route, error) { data, err := syscall.NetlinkRIB(unix.RTM_GETROUTE, syscall.AF_UNSPEC) if err != nil { return nil, err } msgs, err := syscall.ParseNetlinkMessage(data) if err != nil { return nil, err } routes, err := extractRoutes(msgs) if err != nil { return nil, err } return routes, nil } // extractRoutes populates the given routes slice with the data from the host // route table. 
func extractRoutes(routeMsgs []syscall.NetlinkMessage) ([]inet.Route, error) { var routes []inet.Route for _, routeMsg := range routeMsgs { if routeMsg.Header.Type != unix.RTM_NEWROUTE { continue } var ifRoute linux.RouteMessage ifRoute.UnmarshalUnsafe(routeMsg.Data) inetRoute := inet.Route{ Family: ifRoute.Family, DstLen: ifRoute.DstLen, SrcLen: ifRoute.SrcLen, TOS: ifRoute.TOS, Table: ifRoute.Table, Protocol: ifRoute.Protocol, Scope: ifRoute.Scope, Type: ifRoute.Type, Flags: ifRoute.Flags, } // Not clearly documented: syscall.ParseNetlinkRouteAttr will check the // syscall.NetlinkMessage.Header.Type and skip the struct rtmsg // accordingly. attrs, err := syscall.ParseNetlinkRouteAttr(&routeMsg) if err != nil { return nil, fmt.Errorf("RTM_GETROUTE returned RTM_NEWROUTE message with invalid rtattrs: %v", err) } for _, attr := range attrs { switch attr.Attr.Type { case unix.RTA_DST: inetRoute.DstAddr = attr.Value case unix.RTA_SRC: inetRoute.SrcAddr = attr.Value case unix.RTA_GATEWAY: inetRoute.GatewayAddr = attr.Value case unix.RTA_OIF: expected := int(binary.Size(inetRoute.OutputInterface)) if len(attr.Value) != expected { return nil, fmt.Errorf("RTM_GETROUTE returned RTM_NEWROUTE message with invalid attribute data length (%d bytes, expected %d bytes)", len(attr.Value), expected) } var outputIF primitive.Int32 outputIF.UnmarshalUnsafe(attr.Value) inetRoute.OutputInterface = int32(outputIF) } } routes = append(routes, inetRoute) } return routes, nil } // doNetlinkRouteRequest is a more general form of syscall.NetlinkRIB that // allows sending arbitrary (marshallable) structs to the netlink socket. func doNetlinkRouteRequest(msgs []marshal.Marshallable) error { s, err := unix.Socket(unix.AF_NETLINK, unix.SOCK_RAW|unix.SOCK_CLOEXEC, unix.NETLINK_ROUTE) if err != nil { return err } defer syscall.Close(s) sa := syscall.SockaddrNetlink{Family: unix.AF_NETLINK} if err := syscall.Bind(s, &sa); err != nil { return err } b := marshal.MarshalAll(msgs) if err := syscall.Sendto(s, b, 0, &sa); err != nil { return err } lsa, err := syscall.Getsockname(s) if err != nil { return err } lsanl, ok := lsa.(*syscall.SockaddrNetlink) if !ok { return linuxerr.EINVAL } rbNew := make([]byte, hostarch.PageSize) done: for { rb := rbNew nr, _, err := syscall.Recvfrom(s, rb, 0) if err != nil { return err } if nr < linux.NetlinkMessageHeaderSize { return linuxerr.EINVAL } rb = rb[:nr] msgs, err := syscall.ParseNetlinkMessage(rb) if err != nil { return err } for _, m := range msgs { if m.Header.Seq != 1 || m.Header.Pid != lsanl.Pid { return linuxerr.EINVAL } if m.Header.Type == linux.NLMSG_DONE { break done } if m.Header.Type == linux.NLMSG_ERROR { errno, err := binary.ReadUint32(bytes.NewReader(m.Data[0:4]), hostarch.ByteOrder) if err != nil { return err } if errno == 0 { break done } return linuxerr.ErrorFromUnix(unix.Errno(-errno)) } } } return nil } func removeInterface(idx int32) error { // [ NetlinkMessageHeader | InterfaceInfoMessage ] hdr := linux.NetlinkMessageHeader{ Type: linux.RTM_DELLINK, Flags: linux.NLM_F_REQUEST | linux.NLM_F_ACK, Seq: 1, } infoMsg := linux.InterfaceInfoMessage{ Family: linux.AF_UNSPEC, Index: idx, } msgs := []marshal.Marshallable{ &hdr, &infoMsg, } hdr.Length = uint32(marshal.TotalSize(msgs)) return doNetlinkRouteRequest(msgs) } func doNetlinkInterfaceRequest(typ, flags uint16, idx uint32, addr inet.InterfaceAddr) error { // [ NetlinkMessageHeader | InterfaceAddrMessage | RtAttr | localAddr | RtAttr | peerAddr ] hdr := linux.NetlinkMessageHeader{ Type: typ, Flags: flags | 
linux.NLM_F_REQUEST | linux.NLM_F_ACK, Seq: 1, } infoMsg := linux.InterfaceAddrMessage{ Family: addr.Family, Index: idx, PrefixLen: addr.PrefixLen, Flags: addr.Flags, } // Local address. localAddr := tcpip.AddrFromSlice(addr.Addr) if addr.Family == linux.AF_INET { localAddr = localAddr.To4() } rtLocal := linux.RtAttr{ Len: linux.SizeOfRtAttr + uint16(localAddr.Len()), Type: linux.IFA_LOCAL, } localAddrBs := primitive.ByteSlice(localAddr.AsSlice()) // Peer is always the local address for us. rtPeer := linux.RtAttr{ Len: linux.SizeOfRtAttr + uint16(localAddr.Len()), Type: linux.IFA_ADDRESS, } peerAddrBs := primitive.ByteSlice(localAddr.AsSlice()) msgs := []marshal.Marshallable{ &hdr, &infoMsg, &rtLocal, &localAddrBs, &rtPeer, &peerAddrBs, } hdr.Length = uint32(marshal.TotalSize(msgs)) return doNetlinkRouteRequest(msgs) } func addInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { return doNetlinkInterfaceRequest(linux.RTM_NEWADDR, linux.NLM_F_CREATE, uint32(idx), addr) } func removeInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { return doNetlinkInterfaceRequest(linux.RTM_DELADDR, 0, uint32(idx), addr) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/hostinet/socket.go000066400000000000000000000645441465435605700257310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hostinet import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const ( // sizeofSockaddr is the size in bytes of the largest sockaddr type // supported by this package. sizeofSockaddr = unix.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in) // maxControlLen is the maximum size of a control message buffer used in a // recvmsg or sendmsg unix. maxControlLen = 1024 ) // AllowedSocketType is a tuple of socket family, type, and protocol. type AllowedSocketType struct { Family int Type int // Protocol of AllowAllProtocols indicates that all protocols are // allowed. Protocol int } // AllowAllProtocols indicates that all protocols are allowed by the stack and // in the syscall filters. var AllowAllProtocols = -1 // AllowedSocketTypes are the socket types which are supported by hostinet. // These are used to validate the arguments to socket(), and also to generate // syscall filters. 
var AllowedSocketTypes = []AllowedSocketType{ // Family, Type, Protocol. {unix.AF_INET, unix.SOCK_STREAM, unix.IPPROTO_TCP}, {unix.AF_INET, unix.SOCK_DGRAM, unix.IPPROTO_UDP}, {unix.AF_INET, unix.SOCK_DGRAM, unix.IPPROTO_ICMP}, {unix.AF_INET6, unix.SOCK_STREAM, unix.IPPROTO_TCP}, {unix.AF_INET6, unix.SOCK_DGRAM, unix.IPPROTO_UDP}, {unix.AF_INET6, unix.SOCK_DGRAM, unix.IPPROTO_ICMPV6}, } // AllowedRawSocketTypes are the socket types which are supported by hostinet // with raw sockets enabled. var AllowedRawSocketTypes = []AllowedSocketType{ {unix.AF_INET, unix.SOCK_RAW, unix.IPPROTO_RAW}, {unix.AF_INET, unix.SOCK_RAW, unix.IPPROTO_TCP}, {unix.AF_INET, unix.SOCK_RAW, unix.IPPROTO_UDP}, {unix.AF_INET, unix.SOCK_RAW, unix.IPPROTO_ICMP}, {unix.AF_INET6, unix.SOCK_RAW, unix.IPPROTO_RAW}, {unix.AF_INET6, unix.SOCK_RAW, unix.IPPROTO_TCP}, {unix.AF_INET6, unix.SOCK_RAW, unix.IPPROTO_UDP}, {unix.AF_INET6, unix.SOCK_RAW, unix.IPPROTO_ICMPV6}, // AF_PACKET do not allow Write or SendMsg. {unix.AF_PACKET, unix.SOCK_DGRAM, AllowAllProtocols}, {unix.AF_PACKET, unix.SOCK_RAW, AllowAllProtocols}, } // Socket implements socket.Socket (and by extension, vfs.FileDescriptionImpl) // for host sockets. // // +stateify savable type Socket struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD // We store metadata for hostinet sockets internally. Technically, we should // access metadata (e.g. through stat, chmod) on the host for correctness, // but this is not very useful for inet socket fds, which do not belong to a // concrete file anyway. vfs.DentryMetadataFileDescriptionImpl socket.SendReceiveTimeout family int // Read-only. stype linux.SockType // Read-only. protocol int // Read-only. queue waiter.Queue // fd is the host socket fd. It must have O_NONBLOCK, so that operations // will return EWOULDBLOCK instead of blocking on the host. This allows us to // handle blocking behavior independently in the sentry. fd int // recvClosed indicates that the socket has been shutdown for reading // (SHUT_RD or SHUT_RDWR). recvClosed atomicbitops.Bool } var _ = socket.Socket(&Socket{}) func newSocket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) { mnt := t.Kernel().SocketMount() d := sockfs.NewDentry(t, mnt) defer d.DecRef(t) s := &Socket{ family: family, stype: stype, protocol: protocol, fd: fd, } s.LockFD.Init(&vfs.FileLocks{}) if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { return nil, syserr.FromError(err) } vfsfd := &s.vfsfd if err := vfsfd.Init(s, linux.O_RDWR|(flags&linux.O_NONBLOCK), mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, UseDentryMetadata: true, }); err != nil { fdnotifier.RemoveFD(int32(s.fd)) return nil, syserr.FromError(err) } return vfsfd, nil } // Release implements vfs.FileDescriptionImpl.Release. func (s *Socket) Release(ctx context.Context) { kernel.KernelFromContext(ctx).DeleteSocket(&s.vfsfd) fdnotifier.RemoveFD(int32(s.fd)) _ = unix.Close(s.fd) } // Epollable implements FileDescriptionImpl.Epollable. func (s *Socket) Epollable() bool { return true } // Ioctl implements vfs.FileDescriptionImpl. func (s *Socket) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { return ioctl(ctx, s.fd, uio, sysno, args) } // PRead implements vfs.FileDescriptionImpl.PRead. 
func (s *Socket) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, linuxerr.ESPIPE } // Read implements vfs.FileDescriptionImpl. func (s *Socket) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, linuxerr.EOPNOTSUPP } reader := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) defer hostfd.PutReadWriterAt(reader) n, err := dst.CopyOutFrom(ctx, reader) return int64(n), err } // PWrite implements vfs.FileDescriptionImpl. func (s *Socket) PWrite(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return 0, linuxerr.ESPIPE } // Write implements vfs.FileDescriptionImpl. func (s *Socket) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { if s.family == linux.AF_PACKET { // Don't allow Write for AF_PACKET. return 0, linuxerr.EACCES } // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, linuxerr.EOPNOTSUPP } writer := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) defer hostfd.PutReadWriterAt(writer) n, err := src.CopyInTo(ctx, writer) return int64(n), err } type socketProvider struct { family int } // Socket implements socket.Provider.Socket. func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { // Check that we are using the host network stack. netCtx := t.NetworkContext() if netCtx == nil { return nil, nil } stack, ok := netCtx.(*Stack) if !ok { return nil, nil } stype := stypeflags & linux.SOCK_TYPE_MASK // Raw and packet sockets require CAP_NET_RAW. if stype == linux.SOCK_RAW || p.family == linux.AF_PACKET { if creds := auth.CredentialsFromContext(t); !creds.HasCapability(linux.CAP_NET_RAW) { return nil, syserr.ErrNotPermitted } } // Convert generic IPPROTO_IP protocol to the actual protocol depending // on family and type. if protocol == linux.IPPROTO_IP && (p.family == linux.AF_INET || p.family == linux.AF_INET6) { switch stype { case linux.SOCK_STREAM: protocol = linux.IPPROTO_TCP case linux.SOCK_DGRAM: protocol = linux.IPPROTO_UDP } } // Validate the socket based on family, type, and protocol. var supported bool for _, allowed := range stack.allowedSocketTypes { isAllowedFamily := p.family == allowed.Family isAllowedType := int(stype) == allowed.Type isAllowedProtocol := protocol == allowed.Protocol || allowed.Protocol == AllowAllProtocols if isAllowedFamily && isAllowedType && isAllowedProtocol { supported = true break } } if !supported { // Return nil error here to give other socket providers a // chance to create this socket. return nil, nil } // Conservatively ignore all flags specified by the application and add // SOCK_NONBLOCK since socketOperations requires it. st := int(stype) | unix.SOCK_NONBLOCK | unix.SOCK_CLOEXEC fd, err := unix.Socket(p.family, st, protocol) if err != nil { return nil, syserr.FromError(err) } return newSocket(t, p.family, stype, protocol, fd, uint32(stypeflags&unix.SOCK_NONBLOCK)) } // Pair implements socket.Provider.Pair. func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { // Not supported by AF_INET/AF_INET6. return nil, nil, nil } // Readiness implements waiter.Waitable.Readiness. 
func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(int32(s.fd), mask) } // EventRegister implements waiter.Waitable.EventRegister. func (s *Socket) EventRegister(e *waiter.Entry) error { s.queue.EventRegister(e) if err := fdnotifier.UpdateFD(int32(s.fd)); err != nil { s.queue.EventUnregister(e) return err } return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (s *Socket) EventUnregister(e *waiter.Entry) { s.queue.EventUnregister(e) if err := fdnotifier.UpdateFD(int32(s.fd)); err != nil { panic(err) } } // Connect implements socket.Socket.Connect. func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { if len(sockaddr) > sizeofSockaddr { sockaddr = sockaddr[:sizeofSockaddr] } _, _, errno := unix.Syscall(unix.SYS_CONNECT, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr))) if errno == 0 { return nil } // The host socket is always non-blocking, so we expect connect to // return EINPROGRESS. If we are emulating a blocking socket, we will // wait for the connect to complete below. // But if we are not emulating a blocking socket, or if we got some // other error, then return it now. if errno != unix.EINPROGRESS || !blocking { return syserr.FromError(translateIOSyscallError(errno)) } // "EINPROGRESS: The socket is nonblocking and the connection cannot be // completed immediately. It is possible to select(2) or poll(2) for // completion by selecting the socket for writing. After select(2) // indicates writability, use getsockopt(2) to read the SO_ERROR option at // level SOL-SOCKET to determine whether connect() completed successfully // (SO_ERROR is zero) or unsuccessfully (SO_ERROR is one of the usual error // codes listed here, explaining the reason for the failure)." - connect(2) writableMask := waiter.WritableEvents e, ch := waiter.NewChannelEntry(writableMask) s.EventRegister(&e) defer s.EventUnregister(&e) if s.Readiness(writableMask)&writableMask == 0 { if err := t.Block(ch); err != nil { return syserr.FromError(err) } } val, err := unix.GetsockoptInt(s.fd, unix.SOL_SOCKET, unix.SO_ERROR) if err != nil { return syserr.FromError(err) } if val != 0 { return syserr.FromError(unix.Errno(uintptr(val))) } // It seems like we are all good now, but Linux has left the socket // state as CONNECTING (not CONNECTED). This is a strange quirk of // non-blocking sockets. See tcp_finish_connect() which sets tcp state // but not socket state. // // Sockets in the CONNECTING state can call connect() a second time, // whereas CONNECTED sockets will reject the second connect() call. // Because we are emulating a blocking socket, we want a subsequent // connect() call to fail. So we must kick Linux to update the socket // to state CONNECTED, which we can do by calling connect() a second // time ourselves. _, _, errno = unix.Syscall(unix.SYS_CONNECT, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr))) if errno != 0 && errno != unix.EALREADY { return syserr.FromError(translateIOSyscallError(errno)) } return nil } // Accept implements socket.Socket.Accept. 
func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { var peerAddr linux.SockAddr var peerAddrBuf []byte var peerAddrlen uint32 var peerAddrPtr *byte var peerAddrlenPtr *uint32 if peerRequested { peerAddrBuf = make([]byte, sizeofSockaddr) peerAddrlen = uint32(len(peerAddrBuf)) peerAddrPtr = &peerAddrBuf[0] peerAddrlenPtr = &peerAddrlen } // Conservatively ignore all flags specified by the application and add // SOCK_NONBLOCK since socketOpsCommon requires it. fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC) if blocking { var ch chan struct{} for linuxerr.Equals(linuxerr.ErrWouldBlock, syscallErr) { if ch != nil { if syscallErr = t.Block(ch); syscallErr != nil { break } } else { var e waiter.Entry e, ch = waiter.NewChannelEntry(waiter.ReadableEvents | waiter.EventHUp | waiter.EventErr) s.EventRegister(&e) defer s.EventUnregister(&e) } fd, syscallErr = accept4(s.fd, peerAddrPtr, peerAddrlenPtr, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC) } } if peerRequested { peerAddr = socket.UnmarshalSockAddr(s.family, peerAddrBuf[:peerAddrlen]) } if syscallErr != nil { return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr) } var ( kfd int32 kerr error ) f, err := newSocket(t, s.family, s.stype, s.protocol, fd, uint32(flags&unix.SOCK_NONBLOCK)) if err != nil { _ = unix.Close(fd) return 0, nil, 0, err } defer f.DecRef(t) kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{ CloseOnExec: flags&unix.SOCK_CLOEXEC != 0, }) t.Kernel().RecordSocket(f) return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr) } // Bind implements socket.Socket.Bind. func (s *Socket) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error { if len(sockaddr) > sizeofSockaddr { sockaddr = sockaddr[:sizeofSockaddr] } _, _, errno := unix.Syscall(unix.SYS_BIND, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr))) if errno != 0 { return syserr.FromError(errno) } return nil } // Listen implements socket.Socket.Listen. func (s *Socket) Listen(_ *kernel.Task, backlog int) *syserr.Error { return syserr.FromError(unix.Listen(s.fd, backlog)) } // Shutdown implements socket.Socket.Shutdown. func (s *Socket) Shutdown(_ *kernel.Task, how int) *syserr.Error { switch how { case unix.SHUT_RD, unix.SHUT_RDWR: // Mark the socket as closed for reading. s.recvClosed.Store(true) fallthrough case unix.SHUT_WR: return syserr.FromError(unix.Shutdown(s.fd, how)) default: return syserr.ErrInvalidArgument } } func (s *Socket) recvMsgFromHost(iovs []unix.Iovec, flags int, senderRequested bool, controlLen uint64) (uint64, int, []byte, []byte, error) { // We always do a non-blocking recv*(). 
sysflags := flags | unix.MSG_DONTWAIT msg := unix.Msghdr{} if len(iovs) > 0 { msg.Iov = &iovs[0] msg.Iovlen = uint64(len(iovs)) } var senderAddrBuf []byte if senderRequested { senderAddrBuf = make([]byte, sizeofSockaddr) msg.Name = &senderAddrBuf[0] msg.Namelen = uint32(sizeofSockaddr) } var controlBuf []byte if controlLen > 0 { if controlLen > maxControlLen { controlLen = maxControlLen } controlBuf = make([]byte, controlLen) msg.Control = &controlBuf[0] msg.Controllen = controlLen } n, err := recvmsg(s.fd, &msg, sysflags) if err != nil { return 0 /* n */, 0 /* mFlags */, nil /* senderAddrBuf */, nil /* controlBuf */, err } return n, int(msg.Flags), senderAddrBuf[:msg.Namelen], controlBuf[:msg.Controllen], err } const allowedRecvMsgFlags = unix.MSG_CTRUNC | unix.MSG_DONTWAIT | unix.MSG_ERRQUEUE | unix.MSG_OOB | unix.MSG_PEEK | unix.MSG_TRUNC | unix.MSG_WAITALL // RecvMsg implements socket.Socket.RecvMsg. func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { // Only allow known and safe flags. if flags&^allowedRecvMsgFlags != 0 { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument } var senderAddrBuf []byte var controlBuf []byte var msgFlags int copyToDst := func() (int64, error) { var n uint64 var err error if dst.NumBytes() == 0 { // We want to make the recvmsg(2) call to the host even if dst is empty // to fetch control messages, sender address or errors if any occur. n, msgFlags, senderAddrBuf, controlBuf, err = s.recvMsgFromHost(nil, flags, senderRequested, controlLen) return int64(n), err } recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { // Refuse to do anything if any part of dst.Addrs was unusable. if uint64(dst.NumBytes()) != dsts.NumBytes() { return 0, nil } if dsts.IsEmpty() { return 0, nil } n, msgFlags, senderAddrBuf, controlBuf, err = s.recvMsgFromHost(safemem.IovecsFromBlockSeq(dsts), flags, senderRequested, controlLen) return n, err }) return dst.CopyOutFrom(t, recvmsgToBlocks) } var ch chan struct{} n, err := copyToDst() // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT. if flags&(unix.MSG_DONTWAIT|unix.MSG_ERRQUEUE) == 0 { for linuxerr.Equals(linuxerr.ErrWouldBlock, err) { // We only expect blocking to come from the actual syscall, in which // case it can't have returned any data. if n != 0 { panic(fmt.Sprintf("CopyOutFrom: got (%d, %v), wanted (0, %v)", n, err, err)) } // Are we closed for reading? No sense in trying to read if so. if s.recvClosed.Load() { break } if ch != nil { if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = linuxerr.ErrWouldBlock } break } } else { var e waiter.Entry e, ch = waiter.NewChannelEntry(waiter.ReadableEvents | waiter.EventRdHUp | waiter.EventHUp | waiter.EventErr) s.EventRegister(&e) defer s.EventUnregister(&e) } n, err = copyToDst() } } if err != nil { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } // In some circumstances (like MSG_PEEK specified), the sender address // field is purposefully ignored. recvMsgFromHost will return an empty // senderAddrBuf in those cases. 
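// (recvMsgFromHost truncates senderAddrBuf to the Namelen reported by the
// host, so an ignored sender address surfaces here as a zero-length slice.)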
var senderAddr linux.SockAddr if senderRequested && len(senderAddrBuf) > 0 { senderAddr = socket.UnmarshalSockAddr(s.family, senderAddrBuf) } unixControlMessages, err := unix.ParseSocketControlMessage(controlBuf) if err != nil { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), parseUnixControlMessages(unixControlMessages), nil } func parseUnixControlMessages(unixControlMessages []unix.SocketControlMessage) socket.ControlMessages { controlMessages := socket.ControlMessages{} for _, unixCmsg := range unixControlMessages { switch unixCmsg.Header.Level { case linux.SOL_SOCKET: switch unixCmsg.Header.Type { case linux.SO_TIMESTAMP: controlMessages.IP.HasTimestamp = true ts := linux.Timeval{} ts.UnmarshalUnsafe(unixCmsg.Data) controlMessages.IP.Timestamp = ts.ToTime() } case linux.SOL_IP: switch unixCmsg.Header.Type { case linux.IP_TOS: controlMessages.IP.HasTOS = true var tos primitive.Uint8 tos.UnmarshalUnsafe(unixCmsg.Data) controlMessages.IP.TOS = uint8(tos) case linux.IP_TTL: controlMessages.IP.HasTTL = true var ttl primitive.Uint32 ttl.UnmarshalUnsafe(unixCmsg.Data) controlMessages.IP.TTL = uint32(ttl) case linux.IP_PKTINFO: controlMessages.IP.HasIPPacketInfo = true var packetInfo linux.ControlMessageIPPacketInfo packetInfo.UnmarshalUnsafe(unixCmsg.Data) controlMessages.IP.PacketInfo = packetInfo case linux.IP_RECVORIGDSTADDR: var addr linux.SockAddrInet addr.UnmarshalUnsafe(unixCmsg.Data) controlMessages.IP.OriginalDstAddress = &addr case unix.IP_RECVERR: var errCmsg linux.SockErrCMsgIPv4 errCmsg.UnmarshalBytes(unixCmsg.Data) controlMessages.IP.SockErr = &errCmsg } case linux.SOL_IPV6: switch unixCmsg.Header.Type { case linux.IPV6_TCLASS: controlMessages.IP.HasTClass = true var tclass primitive.Uint32 tclass.UnmarshalUnsafe(unixCmsg.Data) controlMessages.IP.TClass = uint32(tclass) case linux.IPV6_PKTINFO: controlMessages.IP.HasIPv6PacketInfo = true var packetInfo linux.ControlMessageIPv6PacketInfo packetInfo.UnmarshalUnsafe(unixCmsg.Data) controlMessages.IP.IPv6PacketInfo = packetInfo case linux.IPV6_HOPLIMIT: controlMessages.IP.HasHopLimit = true var hoplimit primitive.Uint32 hoplimit.UnmarshalUnsafe(unixCmsg.Data) controlMessages.IP.HopLimit = uint32(hoplimit) case linux.IPV6_RECVORIGDSTADDR: var addr linux.SockAddrInet6 addr.UnmarshalUnsafe(unixCmsg.Data) controlMessages.IP.OriginalDstAddress = &addr case unix.IPV6_RECVERR: var errCmsg linux.SockErrCMsgIPv6 errCmsg.UnmarshalBytes(unixCmsg.Data) controlMessages.IP.SockErr = &errCmsg } case linux.SOL_TCP: switch unixCmsg.Header.Type { case linux.TCP_INQ: controlMessages.IP.HasInq = true var inq primitive.Int32 inq.UnmarshalUnsafe(unixCmsg.Data) controlMessages.IP.Inq = int32(inq) } } } return controlMessages } const allowedSendMsgFlags = unix.MSG_DONTWAIT | unix.MSG_EOR | unix.MSG_FASTOPEN | unix.MSG_MORE | unix.MSG_NOSIGNAL | unix.MSG_OOB // SendMsg implements socket.Socket.SendMsg. func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { if s.family == linux.AF_PACKET { // Don't allow SendMesg for AF_PACKET. return 0, syserr.ErrPermissionDenied } // Only allow known and safe flags. if flags&^allowedSendMsgFlags != 0 { return 0, syserr.ErrInvalidArgument } // If the src is zero-length, call SENDTO directly with a null buffer in // order to generate poll/epoll notifications. 
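// Note that this fast path carries no control messages; those are only packed
// and passed to sendmsg(2) in the general path below.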
if src.NumBytes() == 0 { sysflags := flags | unix.MSG_DONTWAIT n, _, errno := unix.Syscall6(unix.SYS_SENDTO, uintptr(s.fd), 0, 0, uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to))) if errno != 0 { return 0, syserr.FromError(errno) } return int(n), nil } space := uint64(control.CmsgsSpace(t, controlMessages)) if space > maxControlLen { space = maxControlLen } controlBuf := make([]byte, 0, space) // PackControlMessages will append up to space bytes to controlBuf. controlBuf = control.PackControlMessages(t, controlMessages, controlBuf) sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) { // Refuse to do anything if any part of src.Addrs was unusable. if uint64(src.NumBytes()) != srcs.NumBytes() { return 0, nil } if srcs.IsEmpty() && len(controlBuf) == 0 { return 0, nil } // We always do a non-blocking send*(). sysflags := flags | unix.MSG_DONTWAIT if srcs.NumBlocks() == 1 && len(controlBuf) == 0 { // Skip allocating []unix.Iovec. src := srcs.Head() n, _, errno := unix.Syscall6(unix.SYS_SENDTO, uintptr(s.fd), src.Addr(), uintptr(src.Len()), uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to))) if errno != 0 { return 0, translateIOSyscallError(errno) } return uint64(n), nil } iovs := safemem.IovecsFromBlockSeq(srcs) msg := unix.Msghdr{ Iov: &iovs[0], Iovlen: uint64(len(iovs)), } if len(to) != 0 { msg.Name = &to[0] msg.Namelen = uint32(len(to)) } if len(controlBuf) != 0 { msg.Control = &controlBuf[0] msg.Controllen = uint64(len(controlBuf)) } return sendmsg(s.fd, &msg, sysflags) }) var ch chan struct{} n, err := src.CopyInTo(t, sendmsgFromBlocks) if flags&unix.MSG_DONTWAIT == 0 { for linuxerr.Equals(linuxerr.ErrWouldBlock, err) { // We only expect blocking to come from the actual syscall, in which // case it can't have returned any data. if n != 0 { panic(fmt.Sprintf("CopyInTo: got (%d, %v), wanted (0, %v)", n, err, err)) } if ch != nil { if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = linuxerr.ErrWouldBlock } break } } else { var e waiter.Entry e, ch = waiter.NewChannelEntry(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr) s.EventRegister(&e) defer s.EventUnregister(&e) } n, err = src.CopyInTo(t, sendmsgFromBlocks) } } return int(n), syserr.FromError(err) } func translateIOSyscallError(err error) error { if err == unix.EAGAIN || err == unix.EWOULDBLOCK { return linuxerr.ErrWouldBlock } return err } // State implements socket.Socket.State. func (s *Socket) State() uint32 { info := linux.TCPInfo{} buf := make([]byte, linux.SizeOfTCPInfo) var err error buf, err = getsockopt(s.fd, unix.SOL_TCP, unix.TCP_INFO, buf) if err != nil { if err != unix.ENOPROTOOPT { log.Warningf("Failed to get TCP socket info from %+v: %v", s, err) } // For non-TCP sockets, silently ignore the failure. return 0 } if len(buf) != linux.SizeOfTCPInfo { // Unmarshal below will panic if getsockopt returns a buffer of // unexpected size. log.Warningf("Failed to get TCP socket info from %+v: getsockopt(2) returned %d bytes, expecting %d bytes.", s, len(buf), linux.SizeOfTCPInfo) return 0 } info.UnmarshalUnsafe(buf[:info.SizeBytes()]) return uint32(info.State) } // Type implements socket.Socket.Type. func (s *Socket) Type() (family int, skType linux.SockType, protocol int) { return s.family, s.stype, s.protocol } func init() { // Register all families in AllowedSocketTypes and AllowedRawSocket // types. If we don't allow raw sockets, they will be rejected in the // Socket call. 
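// Several allowed socket types may share an address family, so dedupe on
// family before registering a provider for it.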
registered := make(map[int]struct{}) for _, sockType := range append(AllowedSocketTypes, AllowedRawSocketTypes...) { fam := sockType.Family if _, ok := registered[fam]; ok { continue } socket.RegisterProvider(fam, &socketProvider{fam}) registered[fam] = struct{}{} } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/hostinet/socket_unsafe.go000066400000000000000000000201511465435605700272540ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hostinet import ( "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/usermem" ) func firstBytePtr(bs []byte) unsafe.Pointer { if len(bs) == 0 { return nil } return unsafe.Pointer(&bs[0]) } // Preconditions: len(dsts) != 0. func readv(fd int, dsts []unix.Iovec) (uint64, error) { n, _, errno := unix.Syscall(unix.SYS_READV, uintptr(fd), uintptr(unsafe.Pointer(&dsts[0])), uintptr(len(dsts))) if errno != 0 { return 0, translateIOSyscallError(errno) } return uint64(n), nil } // Preconditions: len(srcs) != 0. func writev(fd int, srcs []unix.Iovec) (uint64, error) { n, _, errno := unix.Syscall(unix.SYS_WRITEV, uintptr(fd), uintptr(unsafe.Pointer(&srcs[0])), uintptr(len(srcs))) if errno != 0 { return 0, translateIOSyscallError(errno) } return uint64(n), nil } func ioctl(ctx context.Context, fd int, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { switch cmd := uintptr(args[1].Int()); cmd { case unix.TIOCINQ, unix.TIOCOUTQ: var val int32 if _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), cmd, uintptr(unsafe.Pointer(&val))); errno != 0 { return 0, translateIOSyscallError(errno) } var buf [4]byte hostarch.ByteOrder.PutUint32(buf[:], uint32(val)) _, err := io.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{ AddressSpaceActive: true, }) return 0, err case linux.SIOCGIFFLAGS, linux.SIOCGIFHWADDR, linux.SIOCGIFINDEX, linux.SIOCGIFMTU, linux.SIOCGIFNAME, linux.SIOCGIFNETMASK, linux.SIOCGIFTXQLEN: cc := &usermem.IOCopyContext{ Ctx: ctx, IO: io, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, } var ifr linux.IFReq if _, err := ifr.CopyIn(cc, args[2].Pointer()); err != nil { return 0, err } if _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), cmd, uintptr(unsafe.Pointer(&ifr))); errno != 0 { return 0, translateIOSyscallError(errno) } _, err := ifr.CopyOut(cc, args[2].Pointer()) return 0, err case linux.SIOCGIFCONF: cc := &usermem.IOCopyContext{ Ctx: ctx, IO: io, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, } var ifc linux.IFConf if _, err := ifc.CopyIn(cc, args[2].Pointer()); err != nil { return 0, err } // The user's ifconf can have a nullable pointer to a buffer. Use a Sentry array if non-null. 
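// That is: build a shadow IFConf whose Ptr points at sentry-owned memory,
// issue the ioctl against that copy, then copy the filled buffer and the
// updated length back out to the application's ifconf.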
ifcNested := linux.IFConf{Len: ifc.Len} var ifcBuf []byte if ifc.Ptr != 0 { ifcBuf = make([]byte, ifc.Len) ifcNested.Ptr = uint64(uintptr(unsafe.Pointer(&ifcBuf[0]))) } if _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), cmd, uintptr(unsafe.Pointer(&ifcNested))); errno != 0 { return 0, translateIOSyscallError(errno) } // Copy out the buffer if it was non-null. if ifc.Ptr != 0 { if _, err := cc.CopyOutBytes(hostarch.Addr(ifc.Ptr), ifcBuf); err != nil { return 0, err } } ifc.Len = ifcNested.Len _, err := ifc.CopyOut(cc, args[2].Pointer()) return 0, err case linux.SIOCETHTOOL: cc := &usermem.IOCopyContext{ Ctx: ctx, IO: io, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, } var ifr linux.IFReq if _, err := ifr.CopyIn(cc, args[2].Pointer()); err != nil { return 0, err } // SIOCETHTOOL commands specify the subcommand in the first 32 bytes pointed // to by ifr.ifr_data. We need to copy it in first to understand the actual // structure pointed by ifr.ifr_data. ifrData := hostarch.Addr(hostarch.ByteOrder.Uint64(ifr.Data[:8])) var ethtoolCmd linux.EthtoolCmd if _, err := ethtoolCmd.CopyIn(cc, ifrData); err != nil { return 0, err } // We only support ETHTOOL_GFEATURES. if ethtoolCmd != linux.ETHTOOL_GFEATURES { return 0, linuxerr.EOPNOTSUPP } var gfeatures linux.EthtoolGFeatures if _, err := gfeatures.CopyIn(cc, ifrData); err != nil { return 0, err } // Find the requested device. stk := inet.StackFromContext(ctx) if stk == nil { return 0, linuxerr.ENODEV } var ( iface inet.Interface found bool ) for _, iface = range stk.Interfaces() { if iface.Name == ifr.Name() { found = true break } } if !found { return 0, linuxerr.ENODEV } // Copy out the feature blocks to the memory pointed to by ifrData. blksToCopy := int(gfeatures.Size) if blksToCopy > len(iface.Features) { blksToCopy = len(iface.Features) } gfeatures.Size = uint32(blksToCopy) if _, err := gfeatures.CopyOut(cc, ifrData); err != nil { return 0, err } next, ok := ifrData.AddLength(uint64(unsafe.Sizeof(linux.EthtoolGFeatures{}))) for i := 0; i < blksToCopy; i++ { if !ok { return 0, linuxerr.EFAULT } if _, err := iface.Features[i].CopyOut(cc, next); err != nil { return 0, err } next, ok = next.AddLength(uint64(unsafe.Sizeof(linux.EthtoolGetFeaturesBlock{}))) } return 0, nil default: return 0, linuxerr.ENOTTY } } func accept4(fd int, addr *byte, addrlen *uint32, flags int) (int, error) { afd, _, errno := unix.Syscall6(unix.SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(addr)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0) if errno != 0 { return 0, translateIOSyscallError(errno) } return int(afd), nil } func getsockopt(fd int, level, name int, opt []byte) ([]byte, error) { optlen32 := int32(len(opt)) _, _, errno := unix.Syscall6(unix.SYS_GETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(firstBytePtr(opt)), uintptr(unsafe.Pointer(&optlen32)), 0) if errno != 0 { return nil, errno } return opt[:optlen32], nil } // GetSockName implements socket.Socket.GetSockName. func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr := make([]byte, sizeofSockaddr) addrlen := uint32(len(addr)) _, _, errno := unix.Syscall(unix.SYS_GETSOCKNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) if errno != 0 { return nil, 0, syserr.FromError(errno) } return socket.UnmarshalSockAddr(s.family, addr), addrlen, nil } // GetPeerName implements socket.Socket.GetPeerName. 
func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr := make([]byte, sizeofSockaddr) addrlen := uint32(len(addr)) _, _, errno := unix.Syscall(unix.SYS_GETPEERNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) if errno != 0 { return nil, 0, syserr.FromError(errno) } return socket.UnmarshalSockAddr(s.family, addr), addrlen, nil } func recvfrom(fd int, dst []byte, flags int, from *[]byte) (uint64, error) { fromLen := uint32(len(*from)) n, _, errno := unix.Syscall6(unix.SYS_RECVFROM, uintptr(fd), uintptr(firstBytePtr(dst)), uintptr(len(dst)), uintptr(flags), uintptr(firstBytePtr(*from)), uintptr(unsafe.Pointer(&fromLen))) if errno != 0 { return 0, translateIOSyscallError(errno) } *from = (*from)[:fromLen] return uint64(n), nil } func recvmsg(fd int, msg *unix.Msghdr, flags int) (uint64, error) { n, _, errno := unix.Syscall(unix.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(msg)), uintptr(flags)) if errno != 0 { return 0, translateIOSyscallError(errno) } return uint64(n), nil } func sendmsg(fd int, msg *unix.Msghdr, flags int) (uint64, error) { n, _, errno := unix.Syscall(unix.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(msg)), uintptr(flags)) if errno != 0 { return 0, translateIOSyscallError(errno) } return uint64(n), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/hostinet/sockopt.go000066400000000000000000000234011465435605700261060ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hostinet import ( "fmt" "sync" "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" ) const ( sizeofInt16 = 2 sizeofInt32 = 4 ) // SockOpt is used to generate get/setsockopt handlers and filters. type SockOpt struct { // Level the socket option applies to. Level uint64 // Name of the option. Name uint64 // Size of the parameter. A size of 0 indicates that any size is // allowed (used for string or other variable-length types). Size uint64 // Support getsockopt on this option. AllowGet bool // Support setsockopt on this option. AllowSet bool } // SockOpts are the socket options supported by hostinet by making syscalls to the host. // // Note the following socket options are supported but do not need syscalls to // the host, so do not appear on this list: // - SO_TYPE, SO_PROTOCOL, SO_DOMAIN are handled at the syscall level in // syscalls/sys_socket.go. // - SO_SNDTIMEOU, SO_RCVTIMEO are handled internally by setting the embedded // socket.SendReceiveTimeout. 
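//
// Each entry below reads as {Level, Name, Size, AllowGet, AllowSet}. For
// example, the SO_RCVBUF entry permits both getsockopt(2) and setsockopt(2)
// with a 4-byte argument, while SO_ERROR is get-only.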
var SockOpts = []SockOpt{ {linux.SOL_IP, linux.IP_ADD_MEMBERSHIP, 0, false, true}, {linux.SOL_IP, linux.IP_DROP_MEMBERSHIP, 0, false, true}, {linux.SOL_IP, linux.IP_HDRINCL, sizeofInt32, true, true}, {linux.SOL_IP, linux.IP_MULTICAST_IF, 0 /* kernel allows multiple structures to be passed */, true, true}, {linux.SOL_IP, linux.IP_MULTICAST_LOOP, 0 /* can be 32-bit int or 8-bit uint */, true, true}, {linux.SOL_IP, linux.IP_MULTICAST_TTL, 0 /* can be 32-bit int or 8-bit uint */, true, true}, {linux.SOL_IP, linux.IP_MTU_DISCOVER, 0 /* can be 32-bit int or 8-bit uint */, true, true}, {linux.SOL_IP, linux.IP_PKTINFO, sizeofInt32, true, true}, {linux.SOL_IP, linux.IP_RECVERR, sizeofInt32, true, true}, {linux.SOL_IP, linux.IP_RECVORIGDSTADDR, sizeofInt32, true, true}, {linux.SOL_IP, linux.IP_RECVTOS, sizeofInt32, true, true}, {linux.SOL_IP, linux.IP_RECVTTL, sizeofInt32, true, true}, {linux.SOL_IP, linux.IP_TOS, 0 /* Can be 32, 16, or 8 bits */, true, true}, {linux.SOL_IP, linux.IP_TTL, sizeofInt32, true, true}, {linux.SOL_IPV6, linux.IPV6_CHECKSUM, sizeofInt32, true, true}, {linux.SOL_IPV6, linux.IPV6_MULTICAST_HOPS, sizeofInt32, true, true}, {linux.SOL_IPV6, linux.IPV6_RECVERR, sizeofInt32, true, true}, {linux.SOL_IPV6, linux.IPV6_RECVHOPLIMIT, sizeofInt32, true, true}, {linux.SOL_IPV6, linux.IPV6_RECVORIGDSTADDR, sizeofInt32, true, true}, {linux.SOL_IPV6, linux.IPV6_RECVPKTINFO, sizeofInt32, true, true}, {linux.SOL_IPV6, linux.IPV6_RECVTCLASS, sizeofInt32, true, true}, {linux.SOL_IPV6, linux.IPV6_TCLASS, sizeofInt32, true, true}, {linux.SOL_IPV6, linux.IPV6_UNICAST_HOPS, sizeofInt32, true, true}, {linux.SOL_IPV6, linux.IPV6_V6ONLY, sizeofInt32, true, true}, {linux.SOL_SOCKET, linux.SO_ACCEPTCONN, sizeofInt32, true, true}, {linux.SOL_SOCKET, linux.SO_BINDTODEVICE, 0, true, true}, {linux.SOL_SOCKET, linux.SO_BROADCAST, sizeofInt32, true, true}, {linux.SOL_SOCKET, linux.SO_ERROR, sizeofInt32, true, false}, {linux.SOL_SOCKET, linux.SO_KEEPALIVE, sizeofInt32, true, true}, {linux.SOL_SOCKET, linux.SO_LINGER, linux.SizeOfLinger, true, true}, {linux.SOL_SOCKET, linux.SO_NO_CHECK, sizeofInt32, true, true}, {linux.SOL_SOCKET, linux.SO_OOBINLINE, sizeofInt32, true, true}, {linux.SOL_SOCKET, linux.SO_PASSCRED, sizeofInt32, true, true}, {linux.SOL_SOCKET, linux.SO_RCVBUF, sizeofInt32, true, true}, {linux.SOL_SOCKET, linux.SO_RCVBUFFORCE, sizeofInt32, false, true}, {linux.SOL_SOCKET, linux.SO_RCVLOWAT, sizeofInt32, true, true}, {linux.SOL_SOCKET, linux.SO_REUSEADDR, sizeofInt32, true, true}, {linux.SOL_SOCKET, linux.SO_REUSEPORT, sizeofInt32, true, true}, {linux.SOL_SOCKET, linux.SO_SNDBUF, sizeofInt32, true, true}, {linux.SOL_SOCKET, linux.SO_TIMESTAMP, sizeofInt32, true, true}, {linux.SOL_TCP, linux.TCP_CONGESTION, 0 /* string */, true, true}, {linux.SOL_TCP, linux.TCP_CORK, sizeofInt32, true, true}, {linux.SOL_TCP, linux.TCP_DEFER_ACCEPT, sizeofInt32, true, true}, {linux.SOL_TCP, linux.TCP_INFO, uint64(linux.SizeOfTCPInfo), true, false}, {linux.SOL_TCP, linux.TCP_INQ, sizeofInt32, true, true}, {linux.SOL_TCP, linux.TCP_KEEPCNT, sizeofInt32, true, true}, {linux.SOL_TCP, linux.TCP_KEEPIDLE, sizeofInt32, true, true}, {linux.SOL_TCP, linux.TCP_KEEPINTVL, sizeofInt32, true, true}, {linux.SOL_TCP, linux.TCP_LINGER2, sizeofInt32, true, true}, {linux.SOL_TCP, linux.TCP_MAXSEG, sizeofInt32, true, true}, {linux.SOL_TCP, linux.TCP_NODELAY, sizeofInt32, true, true}, {linux.SOL_TCP, linux.TCP_QUICKACK, sizeofInt32, true, true}, {linux.SOL_TCP, linux.TCP_SYNCNT, sizeofInt32, true, true}, {linux.SOL_TCP, 
linux.TCP_USER_TIMEOUT, sizeofInt32, true, true}, {linux.SOL_TCP, linux.TCP_WINDOW_CLAMP, sizeofInt32, true, true}, {linux.SOL_ICMPV6, linux.ICMPV6_FILTER, uint64(linux.SizeOfICMP6Filter), true, true}, } // sockOptMap is a map of {level, name} -> SockOpts. It is an optimization for // looking up SockOpts by level and name. The map is initialized in the first // call to Get/SetSockOpt. var ( sockOptMap map[levelName]SockOpt sockOptMapOnce sync.Once ) type levelName struct { level uint64 name uint64 } func initSockOptMap(t *kernel.Task) { opts := append(SockOpts, extraSockOpts(t)...) sockOptMap = make(map[levelName]SockOpt, len(opts)) for _, opt := range opts { ln := levelName{opt.Level, opt.Name} if _, ok := sockOptMap[ln]; ok { panic(fmt.Sprintf("multiple sockopts with level=%d and name=%d", opt.Level, opt.Name)) } sockOptMap[ln] = opt } } // GetSockOpt implements socket.Socket.GetSockOpt. func (s *Socket) GetSockOpt(t *kernel.Task, level, name int, optValAddr hostarch.Addr, optLen int) (marshal.Marshallable, *syserr.Error) { sockOptMapOnce.Do(func() { initSockOptMap(t) }) if optLen < 0 { return nil, syserr.ErrInvalidArgument } // Special case send/recv timeouts since those are handled internally. if level == linux.SOL_SOCKET { switch name { case linux.SO_RCVTIMEO: recvTimeout := linux.NsecToTimeval(s.RecvTimeout()) return &recvTimeout, nil case linux.SO_SNDTIMEO: sndTimeout := linux.NsecToTimeval(s.SendTimeout()) return &sndTimeout, nil } } sockOpt, ok := sockOptMap[levelName{uint64(level), uint64(name)}] if !ok { return nil, syserr.ErrProtocolNotAvailable } if !sockOpt.AllowGet { return nil, syserr.ErrInvalidArgument } var opt []byte if sockOpt.Size > 0 { // Validate size of input buffer. if uint64(optLen) < sockOpt.Size { // Special case for options that allow smaller buffers. // // To keep the syscall filters simple and restrictive, // we use the full buffer size when calling the host, // but truncate before returning to the application. switch { case level == linux.SOL_TCP && name == linux.TCP_INFO: // Allow smaller buffer. case level == linux.SOL_ICMPV6 && name == linux.ICMPV6_FILTER: // Allow smaller buffer. case level == linux.SOL_IP && name == linux.IP_TTL: // Allow smaller buffer. case level == linux.SOL_IPV6 && name == linux.IPV6_TCLASS: // Allow smaller buffer. default: return nil, syserr.ErrInvalidArgument } } opt = make([]byte, sockOpt.Size) } else { // No size checking. This is probably a string. Use the size // they gave us. opt = make([]byte, optLen) } if err := preGetSockOpt(t, level, name, optValAddr, opt); err != nil { return nil, syserr.FromError(err) } var err error opt, err = getsockopt(s.fd, level, name, opt) if err != nil { return nil, syserr.FromError(err) } opt = postGetSockOpt(t, level, name, opt) // If option allows a smaller buffer, truncate it to desired size. if uint64(optLen) < sockOpt.Size { opt = opt[:optLen] } optP := primitive.ByteSlice(opt) return &optP, nil } // SetSockOpt implements socket.Socket.SetSockOpt. func (s *Socket) SetSockOpt(t *kernel.Task, level, name int, opt []byte) *syserr.Error { sockOptMapOnce.Do(func() { initSockOptMap(t) }) // Special case send/recv timeouts since those are handled internally. 
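// As in GetSockOpt above, SO_RCVTIMEO and SO_SNDTIMEO never reach the host
// socket; they only update the embedded SendReceiveTimeout.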
if level == linux.SOL_SOCKET { switch name { case linux.SO_RCVTIMEO: optLen := linux.SizeOfTimeval var v linux.Timeval v.UnmarshalBytes(opt[:optLen]) if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { return syserr.ErrDomain } s.SetRecvTimeout(v.ToNsecCapped()) return nil case linux.SO_SNDTIMEO: optLen := linux.SizeOfTimeval var v linux.Timeval v.UnmarshalBytes(opt[:optLen]) if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { return syserr.ErrDomain } s.SetSendTimeout(v.ToNsecCapped()) return nil } } sockOpt, ok := sockOptMap[levelName{uint64(level), uint64(name)}] if !ok { // Pretend to accept socket options we don't understand. This // seems dangerous, but it's what netstack does... return nil } if !sockOpt.AllowSet { return syserr.ErrInvalidArgument } if sockOpt.Size > 0 { if uint64(len(opt)) < sockOpt.Size { return syserr.ErrInvalidArgument } opt = opt[:sockOpt.Size] } if _, _, errno := unix.Syscall6(unix.SYS_SETSOCKOPT, uintptr(s.fd), uintptr(level), uintptr(name), uintptr(firstBytePtr(opt)), uintptr(len(opt)), 0); errno != 0 { return syserr.FromError(errno) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/hostinet/sockopt_impl.go000066400000000000000000000017461465435605700271370ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package hostinet import ( "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) func extraSockOpts(t *kernel.Task) []SockOpt { return nil } func preGetSockOpt(t *kernel.Task, level, name int, optValAddr hostarch.Addr, opt []byte) error { return nil } func postGetSockOpt(t *kernel.Task, level, name int, opt []byte) []byte { return opt } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/hostinet/stack.go000066400000000000000000000274621465435605700255440ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package hostinet import ( "fmt" "io" "io/ioutil" "os" "reflect" "strconv" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/nlmsg" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) var defaultRecvBufSize = inet.TCPBufferSize{ Min: 4096, Default: 87380, Max: 6291456, } var defaultSendBufSize = inet.TCPBufferSize{ Min: 4096, Default: 16384, Max: 4194304, } // Stack implements inet.Stack for host sockets. type Stack struct { // Stack is immutable. supportsIPv6 bool tcpRecovery inet.TCPLossRecovery tcpRecvBufSize inet.TCPBufferSize tcpSendBufSize inet.TCPBufferSize tcpSACKEnabled bool netDevFile *os.File netSNMPFile *os.File // allowedSocketTypes is the list of allowed socket types allowedSocketTypes []AllowedSocketType } // Destroy implements inet.Stack.Destroy. func (*Stack) Destroy() { } // NewStack returns an empty Stack containing no configuration. func NewStack() *Stack { return &Stack{} } // Configure sets up the stack using the current state of the host network. func (s *Stack) Configure(allowRawSockets bool) error { if _, err := os.Stat("/proc/net/if_inet6"); err == nil { s.supportsIPv6 = true } s.tcpRecvBufSize = defaultRecvBufSize if tcpRMem, err := readTCPBufferSizeFile("/proc/sys/net/ipv4/tcp_rmem"); err == nil { s.tcpRecvBufSize = tcpRMem } else { log.Warningf("Failed to read TCP receive buffer size, using default values") } s.tcpSendBufSize = defaultSendBufSize if tcpWMem, err := readTCPBufferSizeFile("/proc/sys/net/ipv4/tcp_wmem"); err == nil { s.tcpSendBufSize = tcpWMem } else { log.Warningf("Failed to read TCP send buffer size, using default values") } // SACK is important for performance and even compatibility, assume it's // enabled if we can't find the actual value. s.tcpSACKEnabled = true if sack, err := ioutil.ReadFile("/proc/sys/net/ipv4/tcp_sack"); err == nil { s.tcpSACKEnabled = strings.TrimSpace(string(sack)) != "0" } else { log.Warningf("Failed to read if TCP SACK if enabled, setting to true") } if f, err := os.Open("/proc/net/dev"); err != nil { log.Warningf("Failed to open /proc/net/dev: %v", err) } else { s.netDevFile = f } if f, err := os.Open("/proc/net/snmp"); err != nil { log.Warningf("Failed to open /proc/net/snmp: %v", err) } else { s.netSNMPFile = f } s.allowedSocketTypes = AllowedSocketTypes if allowRawSockets { s.allowedSocketTypes = append(s.allowedSocketTypes, AllowedRawSocketTypes...) } return nil } func readTCPBufferSizeFile(filename string) (inet.TCPBufferSize, error) { contents, err := ioutil.ReadFile(filename) if err != nil { return inet.TCPBufferSize{}, fmt.Errorf("failed to read %s: %v", filename, err) } ioseq := usermem.BytesIOSequence(contents) fields := make([]int32, 3) if n, err := usermem.CopyInt32StringsInVec(context.Background(), ioseq.IO, ioseq.Addrs, fields, ioseq.Opts); n != ioseq.NumBytes() || err != nil { return inet.TCPBufferSize{}, fmt.Errorf("failed to parse %s (%q): got %v after %d/%d bytes", filename, contents, err, n, ioseq.NumBytes()) } return inet.TCPBufferSize{ Min: int(fields[0]), Default: int(fields[1]), Max: int(fields[2]), }, nil } // Interfaces implements inet.Stack.Interfaces. 
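// The interface map is re-queried from the host on every call, keyed by host
// interface index; nil is returned if either the interface query or the
// per-interface feature query fails.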
func (s *Stack) Interfaces() map[int32]inet.Interface { ifs, err := getInterfaces() if err != nil { log.Warningf("could not get host interface: %v", err) return nil } // query interface features for each of the host interfaces. if err := queryInterfaceFeatures(ifs); err != nil { log.Warningf("could not query host interfaces: %v", err) return nil } return ifs } // RemoveInterface implements inet.Stack.RemoveInterface. func (*Stack) RemoveInterface(idx int32) error { return removeInterface(idx) } // InterfaceAddrs implements inet.Stack.InterfaceAddrs. func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { addrs, err := getInterfaceAddrs() if err != nil { log.Warningf("failed to get host interface addresses: %v", err) return nil } return addrs } // SetInterface implements inet.Stack.SetInterface. func (s *Stack) SetInterface(ctx context.Context, msg *nlmsg.Message) *syserr.Error { var ifinfomsg linux.InterfaceInfoMessage attrs, ok := msg.GetData(&ifinfomsg) if !ok { return syserr.ErrInvalidArgument } for !attrs.Empty() { // The index is unspecified, search by the interface name. ahdr, value, rest, ok := attrs.ParseFirst() if !ok { return syserr.ErrInvalidArgument } attrs = rest switch ahdr.Type { case linux.IFLA_IFNAME: if len(value) < 1 { return syserr.ErrInvalidArgument } if ifinfomsg.Index != 0 { // Device name changing isn't supported yet. return syserr.ErrNotSupported } ifname := string(value[:len(value)-1]) for idx, ifa := range s.Interfaces() { if ifname == ifa.Name { ifinfomsg.Index = idx break } } default: ctx.Warningf("unexpected attribute: %x", ahdr.Type) return syserr.ErrNotSupported } } if ifinfomsg.Index == 0 { return syserr.ErrNoDevice } flags := msg.Header().Flags if flags&(linux.NLM_F_EXCL|linux.NLM_F_REPLACE) != 0 { return syserr.ErrExists } if ifinfomsg.Flags != 0 || ifinfomsg.Change != 0 { if ifinfomsg.Change & ^uint32(linux.IFF_UP) != 0 { ctx.Warningf("Unsupported ifi_change flags: %x", ifinfomsg.Change) return syserr.ErrInvalidArgument } if ifinfomsg.Flags & ^uint32(linux.IFF_UP) != 0 { ctx.Warningf("Unsupported ifi_flags: %x", ifinfomsg.Change) return syserr.ErrInvalidArgument } // Netstack interfaces are always up. } return nil } // AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. func (*Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { return addInterfaceAddr(idx, addr) } // RemoveInterfaceAddr implements inet.Stack.RemoveInterfaceAddr. func (*Stack) RemoveInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { return removeInterfaceAddr(idx, addr) } // SupportsIPv6 implements inet.Stack.SupportsIPv6. func (s *Stack) SupportsIPv6() bool { return s.supportsIPv6 } // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { return s.tcpRecvBufSize, nil } // SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. func (*Stack) SetTCPReceiveBufferSize(inet.TCPBufferSize) error { return linuxerr.EACCES } // TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { return s.tcpSendBufSize, nil } // SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. func (*Stack) SetTCPSendBufferSize(inet.TCPBufferSize) error { return linuxerr.EACCES } // TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. func (s *Stack) TCPSACKEnabled() (bool, error) { return s.tcpSACKEnabled, nil } // SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. 
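// Like the other Set* methods here, hostinet never modifies host-wide TCP
// settings, so this returns EACCES.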
func (*Stack) SetTCPSACKEnabled(bool) error { return linuxerr.EACCES } // TCPRecovery implements inet.Stack.TCPRecovery. func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { return s.tcpRecovery, nil } // SetTCPRecovery implements inet.Stack.SetTCPRecovery. func (*Stack) SetTCPRecovery(inet.TCPLossRecovery) error { return linuxerr.EACCES } // getLine reads one line from proc file, with specified prefix. // The last argument, withHeader, specifies if it contains line header. func getLine(f *os.File, prefix string, withHeader bool) string { data := make([]byte, 4096) if _, err := f.Seek(0, 0); err != nil { return "" } if _, err := io.ReadFull(f, data); err != io.ErrUnexpectedEOF { return "" } prefix = prefix + ":" lines := strings.Split(string(data), "\n") for _, l := range lines { l = strings.TrimSpace(l) if strings.HasPrefix(l, prefix) { if withHeader { withHeader = false continue } return l } } return "" } func toSlice(i any) []uint64 { v := reflect.Indirect(reflect.ValueOf(i)) return v.Slice(0, v.Len()).Interface().([]uint64) } // Statistics implements inet.Stack.Statistics. func (s *Stack) Statistics(stat any, arg string) error { var ( snmpTCP bool rawLine string sliceStat []uint64 ) switch stat.(type) { case *inet.StatDev: if s.netDevFile == nil { return fmt.Errorf("/proc/net/dev is not opened for hostinet") } rawLine = getLine(s.netDevFile, arg, false /* with no header */) case *inet.StatSNMPIP, *inet.StatSNMPICMP, *inet.StatSNMPICMPMSG, *inet.StatSNMPTCP, *inet.StatSNMPUDP, *inet.StatSNMPUDPLite: if s.netSNMPFile == nil { return fmt.Errorf("/proc/net/snmp is not opened for hostinet") } rawLine = getLine(s.netSNMPFile, arg, true) default: return syserr.ErrEndpointOperation.ToError() } if rawLine == "" { return fmt.Errorf("failed to get raw line") } parts := strings.SplitN(rawLine, ":", 2) if len(parts) != 2 { return fmt.Errorf("failed to get prefix from: %q", rawLine) } sliceStat = toSlice(stat) fields := strings.Fields(strings.TrimSpace(parts[1])) if len(fields) != len(sliceStat) { return fmt.Errorf("failed to parse fields: %q", rawLine) } if _, ok := stat.(*inet.StatSNMPTCP); ok { snmpTCP = true } for i := 0; i < len(sliceStat); i++ { var err error if snmpTCP && i == 3 { var tmp int64 // MaxConn field is signed, RFC 2012. tmp, err = strconv.ParseInt(fields[i], 10, 64) sliceStat[i] = uint64(tmp) // Convert back to int before use. } else { sliceStat[i], err = strconv.ParseUint(fields[i], 10, 64) } if err != nil { return fmt.Errorf("failed to parse field %d from: %q, %v", i, rawLine, err) } } return nil } // RouteTable implements inet.Stack.RouteTable. func (s *Stack) RouteTable() []inet.Route { routes, err := getRoutes() if err != nil { log.Warningf("failed to get routes: %v", err) return nil } // Prepend empty route. return append([]inet.Route(nil), routes...) } // NewRoute implements inet.Stack.NewRoute. func (*Stack) NewRoute(context.Context, *nlmsg.Message) *syserr.Error { // TODO(b/343524351): implements RTM_NEWROUTE for hostinet. return syserr.ErrNotSupported } // Pause implements inet.Stack.Pause. func (*Stack) Pause() {} // Restore implements inet.Stack.Restore. func (*Stack) Restore() {} // Resume implements inet.Stack.Resume. func (*Stack) Resume() {} // RegisteredEndpoints implements inet.Stack.RegisteredEndpoints. func (*Stack) RegisteredEndpoints() []stack.TransportEndpoint { return nil } // CleanupEndpoints implements inet.Stack.CleanupEndpoints. 
func (*Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil } // RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. func (*Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} // SetForwarding implements inet.Stack.SetForwarding. func (*Stack) SetForwarding(tcpip.NetworkProtocolNumber, bool) error { return linuxerr.EACCES } // PortRange implements inet.Stack.PortRange. func (*Stack) PortRange() (uint16, uint16) { // Use the default Linux values per net/ipv4/af_inet.c:inet_init_net(). return 32768, 60999 } // SetPortRange implements inet.Stack.SetPortRange. func (*Stack) SetPortRange(uint16, uint16) error { return linuxerr.EACCES } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/hostinet/stack_unsafe.go000066400000000000000000000066551465435605700271060ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hostinet import ( "runtime" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/inet" ) func queryInterfaceFeatures(interfaces map[int32]inet.Interface) error { fd, err := queryFD() if err != nil { return err } defer unix.Close(fd) for idx, nic := range interfaces { var ifr linux.IFReq copy(ifr.IFName[:], nic.Name) var gfeatures linux.EthtoolGFeatures // Each feature block is sufficient to query 32 features, the linux // kernel today supports upto 64 features per device. Technically it // can support more in the future but this is sufficient for our use // right now. const ( numFeatureBlocks = 2 ifrDataSz = unsafe.Sizeof(linux.EthtoolGFeatures{}) + numFeatureBlocks*unsafe.Sizeof(linux.EthtoolGetFeaturesBlock{}) ) featureBlocks := make([]linux.EthtoolGetFeaturesBlock, numFeatureBlocks) b := make([]byte, ifrDataSz) gfeatures.Cmd = uint32(linux.ETHTOOL_GFEATURES) gfeatures.Size = numFeatureBlocks gfeatures.MarshalBytes(b) next := b[unsafe.Sizeof(linux.EthtoolGFeatures{}):] for i := 0; i < numFeatureBlocks; i++ { featureBlocks[i].MarshalBytes(next) next = next[unsafe.Sizeof(linux.EthtoolGetFeaturesBlock{}):] } // Technically the next two lines are not safe as Go GC can technically move // b to a new location and the pointer value stored in ifr.Data could point // to random memory. But the reality today is that Go GC is not a moving GC // so this is essentially safe as of today. // // TODO(b/209014118): Use Pin API when available in Go runtime to make this // safe. dataPtr := unsafe.Pointer(&b[0]) hostarch.ByteOrder.PutUint64(ifr.Data[:8], uint64(uintptr(dataPtr))) if _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), unix.SIOCETHTOOL, uintptr(unsafe.Pointer(&ifr))); errno != 0 { return errno } // Unmarshall the features back. gfeatures.UnmarshalBytes(b) next = b[unsafe.Sizeof(linux.EthtoolGFeatures{}):] for i := 0; i < int(gfeatures.Size); i++ { featureBlocks[i].UnmarshalBytes(next) next = next[unsafe.Sizeof(linux.EthtoolGetFeaturesBlock{}):] } // Store the queried features. 
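// interfaces[idx] yields a copy of the Interface value, so attach the feature
// blocks to the copy and write the updated value back into the map.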
iface := interfaces[idx] iface.Features = make([]linux.EthtoolGetFeaturesBlock, gfeatures.Size) copy(iface.Features, featureBlocks) interfaces[idx] = iface // This ensures b is not garbage collected before this point to ensure that // the slice is not collected before the syscall returns and we copy out the // data. runtime.KeepAlive(b) } return nil } func queryFD() (int, error) { // Try both AF_INET and AF_INET6 in case only one is supported. var fd int var err error for _, family := range []int{unix.AF_INET6, unix.AF_INET} { fd, err = unix.Socket(family, unix.SOCK_STREAM, 0) if err == nil { return fd, err } } return fd, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/000077500000000000000000000000001465435605700242345ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/dnat.go000066400000000000000000000224641465435605700255210ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netfilter import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // DNATTargetName is used to mark targets as DNAT targets. DNAT targets should // be reached for only NAT table. These targets will change the source port // and/or IP for packets. const DNATTargetName = "DNAT" type dnatTarget struct { stack.DNATTarget revision uint8 } func (dt *dnatTarget) id() targetID { return targetID{ name: DNATTargetName, networkProtocol: dt.NetworkProtocol, revision: dt.revision, } } type dnatTargetMakerV4 struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (dt *dnatTargetMakerV4) id() targetID { return targetID{ name: DNATTargetName, networkProtocol: dt.NetworkProtocol, } } func (*dnatTargetMakerV4) marshal(target target) []byte { dt := target.(*dnatTarget) // This is a dnat target named dnat. 
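// The marshaled form mirrors what iptables userspace passes for -j DNAT: an
// xt_entry_target header followed by a single compat range whose flags record
// whether the address and/or the port is rewritten.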
xt := linux.XTNATTargetV0{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTNATTargetV0, }, } copy(xt.Target.Name[:], DNATTargetName) if dt.ChangeAddress { xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_MAP_IPS } if dt.ChangePort { xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED } xt.NfRange.RangeSize = 1 xt.NfRange.RangeIPV4.MinPort = htons(dt.Port) xt.NfRange.RangeIPV4.MaxPort = xt.NfRange.RangeIPV4.MinPort copy(xt.NfRange.RangeIPV4.MinIP[:], dt.Addr.AsSlice()) copy(xt.NfRange.RangeIPV4.MaxIP[:], dt.Addr.AsSlice()) return marshal.Marshal(&xt) } func (*dnatTargetMakerV4) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if len(buf) < linux.SizeOfXTNATTargetV0 { nflog("dnatTargetMakerV4: buf has insufficient size for dnat target %d", len(buf)) return nil, syserr.ErrInvalidArgument } if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { nflog("dnatTargetMakerV4: bad proto %d", p) return nil, syserr.ErrInvalidArgument } var dt linux.XTNATTargetV0 dt.UnmarshalUnsafe(buf) // Copy linux.XTNATTargetV0 to stack.DNATTarget. target := dnatTarget{DNATTarget: stack.DNATTarget{ NetworkProtocol: filter.NetworkProtocol(), }} // RangeSize should be 1. nfRange := dt.NfRange if nfRange.RangeSize != 1 { nflog("dnatTargetMakerV4: bad rangesize %d", nfRange.RangeSize) return nil, syserr.ErrInvalidArgument } if nfRange.RangeIPV4.MinPort == 0 { nflog("dnatTargetMakerV4: dnat target needs to specify a non-zero port") return nil, syserr.ErrInvalidArgument } if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { nflog("dnatTargetMakerV4: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) return nil, syserr.ErrInvalidArgument } if nfRange.RangeIPV4.MinIP != nfRange.RangeIPV4.MaxIP { nflog("dnatTargetMakerV4: MinIP != MaxIP (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) return nil, syserr.ErrInvalidArgument } if nfRange.RangeIPV4.Flags&^(linux.NF_NAT_RANGE_MAP_IPS|linux.NF_NAT_RANGE_PROTO_SPECIFIED) != 0 { nflog("dnatTargetMakerV4: unknown flags used (%x)", nfRange.RangeIPV4.Flags) return nil, syserr.ErrInvalidArgument } target.ChangeAddress = nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_MAP_IPS != 0 target.ChangePort = nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED != 0 target.Addr = tcpip.AddrFrom4(nfRange.RangeIPV4.MinIP) target.Port = ntohs(nfRange.RangeIPV4.MinPort) return &target, nil } type dnatTargetMakerR1 struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (dt *dnatTargetMakerR1) id() targetID { return targetID{ name: DNATTargetName, networkProtocol: dt.NetworkProtocol, revision: 1, } } func (*dnatTargetMakerR1) marshal(target target) []byte { dt := target.(*dnatTarget) nt := linux.XTNATTargetV1{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTNATTargetV1, Revision: 1, }, } copy(nt.Target.Name[:], DNATTargetName) if dt.ChangeAddress { nt.Range.Flags |= linux.NF_NAT_RANGE_MAP_IPS } if dt.ChangePort { nt.Range.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED } copy(nt.Range.MinAddr[:], dt.Addr.AsSlice()) copy(nt.Range.MaxAddr[:], dt.Addr.AsSlice()) nt.Range.MinProto = htons(dt.Port) nt.Range.MaxProto = nt.Range.MinProto return marshal.Marshal(&nt) } func (dt *dnatTargetMakerR1) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if size := linux.SizeOfXTNATTargetV1; len(buf) < size { nflog("dnatTargetMakerR1: buf has insufficient size (%d) for DNAT target (%d)", len(buf), size) return nil, syserr.ErrInvalidArgument } if p 
:= filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { nflog("dnatTargetMakerR1: bad proto %d", p) return nil, syserr.ErrInvalidArgument } var natRange linux.NFNATRange natRange.UnmarshalUnsafe(buf[linux.SizeOfXTEntryTarget:]) if natRange.MinAddr != natRange.MaxAddr { nflog("dnatTargetMakerR1: MinAddr and MaxAddr are different") return nil, syserr.ErrInvalidArgument } if natRange.MinProto != natRange.MaxProto { nflog("dnatTargetMakerR1: MinProto and MaxProto are different") return nil, syserr.ErrInvalidArgument } if natRange.Flags&^(linux.NF_NAT_RANGE_MAP_IPS|linux.NF_NAT_RANGE_PROTO_SPECIFIED) != 0 { nflog("dnatTargetMakerR1: invalid flags used (%x)", natRange.Flags) return nil, syserr.ErrInvalidArgument } target := dnatTarget{ DNATTarget: stack.DNATTarget{ NetworkProtocol: filter.NetworkProtocol(), Port: ntohs(natRange.MinProto), ChangeAddress: natRange.Flags&linux.NF_NAT_RANGE_MAP_IPS != 0, ChangePort: natRange.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED != 0, }, revision: 1, } switch dt.NetworkProtocol { case header.IPv4ProtocolNumber: target.DNATTarget.Addr = tcpip.AddrFrom4Slice(natRange.MinAddr[:4]) case header.IPv6ProtocolNumber: target.DNATTarget.Addr = tcpip.AddrFrom16(natRange.MinAddr) default: panic(fmt.Sprintf("invalid protocol number: %d", dt.NetworkProtocol)) } return &target, nil } type dnatTargetMakerR2 struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (dt *dnatTargetMakerR2) id() targetID { return targetID{ name: DNATTargetName, networkProtocol: dt.NetworkProtocol, revision: 2, } } func (*dnatTargetMakerR2) marshal(target target) []byte { dt := target.(*dnatTarget) nt := linux.XTNATTargetV2{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTNATTargetV1, Revision: 2, }, } copy(nt.Target.Name[:], DNATTargetName) if dt.ChangeAddress { nt.Range.Flags |= linux.NF_NAT_RANGE_MAP_IPS } if dt.ChangePort { nt.Range.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED } copy(nt.Range.MinAddr[:], dt.Addr.AsSlice()) copy(nt.Range.MaxAddr[:], dt.Addr.AsSlice()) nt.Range.MinProto = htons(dt.Port) nt.Range.MaxProto = nt.Range.MinProto return marshal.Marshal(&nt) } func (dt *dnatTargetMakerR2) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { nflog("dnatTargetMakerR2 unmarshal") if size := linux.SizeOfXTNATTargetV2; len(buf) < size { nflog("dnatTargetMakerR2: buf has insufficient size (%d) for DNAT target (%d)", len(buf), size) return nil, syserr.ErrInvalidArgument } if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { nflog("dnatTargetMakerR2: bad proto %d", p) return nil, syserr.ErrInvalidArgument } var natRange linux.NFNATRange2 natRange.UnmarshalUnsafe(buf[linux.SizeOfXTEntryTarget:]) if natRange.MinAddr != natRange.MaxAddr { nflog("dnatTargetMakerR2: MinAddr and MaxAddr are different") return nil, syserr.ErrInvalidArgument } if natRange.MinProto != natRange.MaxProto { nflog("dnatTargetMakerR2: MinProto and MaxProto are different") return nil, syserr.ErrInvalidArgument } if natRange.BaseProto != 0 { nflog("dnatTargetMakerR2: BaseProto is nonzero") return nil, syserr.ErrInvalidArgument } if natRange.Flags&^(linux.NF_NAT_RANGE_MAP_IPS|linux.NF_NAT_RANGE_PROTO_SPECIFIED) != 0 { nflog("dnatTargetMakerR2: invalid flags used (%x)", natRange.Flags) return nil, syserr.ErrInvalidArgument } target := dnatTarget{ DNATTarget: stack.DNATTarget{ NetworkProtocol: filter.NetworkProtocol(), Port: ntohs(natRange.MinProto), ChangeAddress: natRange.Flags&linux.NF_NAT_RANGE_MAP_IPS != 0, ChangePort: 
natRange.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED != 0, }, revision: 2, } switch dt.NetworkProtocol { case header.IPv4ProtocolNumber: target.DNATTarget.Addr = tcpip.AddrFrom4Slice(natRange.MinAddr[:4]) case header.IPv6ProtocolNumber: target.DNATTarget.Addr = tcpip.AddrFrom16(natRange.MinAddr) default: panic(fmt.Sprintf("invalid protocol number: %d", dt.NetworkProtocol)) } return &target, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/extensions.go000066400000000000000000000144201465435605700267630ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netfilter import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // matchMaker knows how to (un)marshal the matcher named name(). type matchMaker interface { // name is the matcher name as stored in the xt_entry_match struct. name() string // revision is the match revision as stored in the xt_entry_match // struct. revision() uint8 // marshal converts from a stack.Matcher to an ABI struct. marshal(matcher matcher) []byte // unmarshal converts from the ABI matcher struct to an // stack.Matcher. unmarshal(mapper IDMapper, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) } type matchKey struct { name string revision uint8 } func key(mm matchMaker) matchKey { return matchKey{ name: mm.name(), revision: mm.revision(), } } type matcher interface { name() string revision() uint8 } // matchMakers maps the name of supported matchers to the matchMaker that // marshals and unmarshals it. It is immutable after package initialization. var matchMakers = map[matchKey]matchMaker{} // registermatchMaker should be called by match extensions to register them // with the netfilter package. func registerMatchMaker(mm matchMaker) { if _, ok := matchMakers[key(mm)]; ok { panic(fmt.Sprintf("Multiple matches registered with key %+v.", key(mm))) } matchMakers[key(mm)] = mm } func marshalMatcher(mr stack.Matcher) []byte { matcher := mr.(matcher) key := matchKey{ name: matcher.name(), revision: matcher.revision(), } matchMaker, ok := matchMakers[key] if !ok { panic(fmt.Sprintf("Unknown matcher of type %T.", matcher)) } return matchMaker.marshal(matcher) } // marshalEntryMatch creates a marshalled XTEntryMatch with the given name and // data appended at the end. func marshalEntryMatch(name string, data []byte) []byte { nflog("marshaling matcher %q", name) // We have to pad this struct size to a multiple of 8 bytes. 
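// For example, a header-plus-data length of 42 bytes is rounded up to 48; the
// padding bytes remain zero in buf.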
size := bits.AlignUp(linux.SizeOfXTEntryMatch+len(data), 8) matcher := linux.KernelXTEntryMatch{ XTEntryMatch: linux.XTEntryMatch{ MatchSize: uint16(size), }, Data: data, } copy(matcher.Name[:], name) buf := make([]byte, size) bufRemain := matcher.XTEntryMatch.MarshalUnsafe(buf) copy(bufRemain, matcher.Data) return buf } func unmarshalMatcher(mapper IDMapper, match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf []byte) (stack.Matcher, error) { key := matchKey{ name: match.Name.String(), revision: match.Revision, } matchMaker, ok := matchMakers[key] if !ok { return nil, fmt.Errorf("unsupported matcher with name %q and revision %d", match.Name.String(), match.Revision) } return matchMaker.unmarshal(mapper, buf, filter) } // targetMaker knows how to (un)marshal a target. Once registered, // marshalTarget and unmarshalTarget can be used. type targetMaker interface { // id uniquely identifies the target. id() targetID // marshal converts from a target to an ABI struct. marshal(target target) []byte // unmarshal converts from the ABI matcher struct to a target. unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) } // A targetID uniquely identifies a target. type targetID struct { // name is the target name as stored in the xt_entry_target struct. name string // networkProtocol is the protocol to which the target applies. networkProtocol tcpip.NetworkProtocolNumber // revision is the version of the target. revision uint8 } // target extends a stack.Target, allowing it to be used with the extension // system. The sentry only uses targets, never stack.Targets directly. type target interface { stack.Target id() targetID } // targetMakers maps the targetID of supported targets to the targetMaker that // marshals and unmarshals it. It is immutable after package initialization. var targetMakers = map[targetID]targetMaker{} // targetRevision returns the maximum supported version of the matcher with // name `name` up to rev, and whether any such matcher with that name exists. func targetRevision(name string, netProto tcpip.NetworkProtocolNumber, rev uint8) (uint8, bool) { tid := targetID{ name: name, networkProtocol: netProto, revision: rev, } if _, ok := targetMakers[tid]; ok { return rev, true } // Return the highest supported revision. var found bool var ret uint8 for _, cur := range targetMakers { curID := cur.id() if name == curID.name && netProto == curID.networkProtocol { found = true if curID.revision > ret { ret = uint8(curID.revision) } } } return ret, found } // registerTargetMaker should be called by target extensions to register them // with the netfilter package. func registerTargetMaker(tm targetMaker) { if _, ok := targetMakers[tm.id()]; ok { panic(fmt.Sprintf("multiple targets registered with name %q.", tm.id())) } targetMakers[tm.id()] = tm } func marshalTarget(tgt stack.Target) []byte { // The sentry only uses targets, never stack.Targets directly. 
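// The type assertion below is expected to be safe: every target held in
// netstack's tables is either created by this package or wrapped by
// DefaultLinuxTables, so it implements the extended target interface.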
target := tgt.(target) targetMaker, ok := targetMakers[target.id()] if !ok { panic(fmt.Sprintf("unknown target of type %T with id %+v.", target, target.id())) } return targetMaker.marshal(target) } func unmarshalTarget(target linux.XTEntryTarget, filter stack.IPHeaderFilter, buf []byte) (target, *syserr.Error) { tid := targetID{ name: target.Name.String(), networkProtocol: filter.NetworkProtocol(), revision: target.Revision, } targetMaker, ok := targetMakers[tid] if !ok { nflog("unsupported target with name %q, proto %d, and revision %d", target.Name.String(), tid.networkProtocol, tid.revision) return nil, syserr.ErrInvalidArgument } return targetMaker.unmarshal(buf, filter) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/ipv4.go000066400000000000000000000225271465435605700254550ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netfilter import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // emptyIPv4Filter is for comparison with a rule's filters to determine whether // it is also empty. It is immutable. var emptyIPv4Filter = stack.IPHeaderFilter{ Dst: tcpip.AddrFrom4([4]byte{0x00, 0x00, 0x00, 0x00}), DstMask: tcpip.AddrFrom4([4]byte{0x00, 0x00, 0x00, 0x00}), Src: tcpip.AddrFrom4([4]byte{0x00, 0x00, 0x00, 0x00}), SrcMask: tcpip.AddrFrom4([4]byte{0x00, 0x00, 0x00, 0x00}), } // convertNetstackToBinary4 converts the iptables as stored in netstack to the // format expected by the iptables tool. Linux stores each table as a binary // blob that can only be traversed by parsing a little data, reading some // offsets, jumping to those offsets, parsing again, etc. func convertNetstackToBinary4(stk *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) { // The table name has to fit in the struct. if linux.XT_TABLE_MAXNAMELEN < len(tablename) { return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename) } id, ok := nameToID[tablename.String()] if !ok { return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename) } // Setup the info struct. entries, info := getEntries4(stk.IPTables().GetTable(id, false), tablename) return entries, info, nil } func getEntries4(table stack.Table, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo) { var info linux.IPTGetinfo var entries linux.KernelIPTGetEntries copy(info.Name[:], tablename[:]) copy(entries.Name[:], info.Name[:]) info.ValidHooks = table.ValidHooks() for ruleIdx, rule := range table.Rules { nflog("convert to binary: current offset: %d", entries.Size) setHooksAndUnderflow(&info, table, entries.Size, ruleIdx) // Each rule corresponds to an entry. 
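// On the wire, an entry is a fixed-size struct ipt_entry immediately
// followed by its serialized matchers and then its target, so
// TargetOffset and NextOffset both start at the header size and grow as
// extensions are appended below.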
entry := linux.KernelIPTEntry{ Entry: linux.IPTEntry{ IP: linux.IPTIP{ Protocol: uint16(rule.Filter.Protocol), }, NextOffset: linux.SizeOfIPTEntry, TargetOffset: linux.SizeOfIPTEntry, }, } copy(entry.Entry.IP.Dst[:], rule.Filter.Dst.AsSlice()) copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask.AsSlice()) copy(entry.Entry.IP.Src[:], rule.Filter.Src.AsSlice()) copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask.AsSlice()) copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface) copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) copy(entry.Entry.IP.InputInterface[:], rule.Filter.InputInterface) copy(entry.Entry.IP.InputInterfaceMask[:], rule.Filter.InputInterfaceMask) if rule.Filter.DstInvert { entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP } if rule.Filter.SrcInvert { entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP } if rule.Filter.OutputInterfaceInvert { entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT } for _, matcher := range rule.Matchers { // Serialize the matcher and add it to the // entry. serialized := marshalMatcher(matcher) nflog("convert to binary: matcher serialized as: %v", serialized) if len(serialized)%8 != 0 { panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) } entry.Elems = append(entry.Elems, serialized...) entry.Entry.NextOffset += uint16(len(serialized)) entry.Entry.TargetOffset += uint16(len(serialized)) } // Serialize and append the target. serialized := marshalTarget(rule.Target) if len(serialized)%8 != 0 { panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target)) } entry.Elems = append(entry.Elems, serialized...) entry.Entry.NextOffset += uint16(len(serialized)) nflog("convert to binary: adding entry: %+v", entry) entries.Size += uint32(entry.Entry.NextOffset) entries.Entrytable = append(entries.Entrytable, entry) info.NumEntries++ } info.Size = entries.Size nflog("convert to binary: finished with an marshalled size of %d", info.Size) return entries, info } func modifyEntries4(mapper IDMapper, stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { nflog("set entries: setting entries in table %q", replace.Name.String()) // Convert input into a list of rules and their offsets. var offset uint32 // offsets maps rule byte offsets to their position in table.Rules. offsets := map[uint32]int{} for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { nflog("set entries: processing entry at offset %d", offset) // Get the struct ipt_entry. if len(optVal) < linux.SizeOfIPTEntry { nflog("optVal has insufficient size for entry %d", len(optVal)) return nil, syserr.ErrInvalidArgument } initialOptValLen := len(optVal) var entry linux.IPTEntry optVal = entry.UnmarshalUnsafe(optVal) if entry.TargetOffset < linux.SizeOfIPTEntry { nflog("entry has too-small target offset %d", entry.TargetOffset) return nil, syserr.ErrInvalidArgument } filter, err := filterFromIPTIP(entry.IP) if err != nil { nflog("bad iptip: %v", err) return nil, syserr.ErrInvalidArgument } // Get matchers. matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry if len(optVal) < int(matchersSize) { nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) return nil, syserr.ErrInvalidArgument } matchers, err := parseMatchers(mapper, filter, optVal[:matchersSize]) if err != nil { nflog("failed to parse matchers: %v", err) return nil, syserr.ErrInvalidArgument } optVal = optVal[matchersSize:] // Get the target of the rule. 
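// The target occupies the bytes between TargetOffset and NextOffset;
// anything past NextOffset belongs to the next entry.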
targetSize := entry.NextOffset - entry.TargetOffset if len(optVal) < int(targetSize) { nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) return nil, syserr.ErrInvalidArgument } rule := stack.Rule{ Filter: filter, Matchers: matchers, } { target, err := parseTarget(filter, optVal[:targetSize], false /* ipv6 */) if err != nil { nflog("failed to parse target: %v", err) return nil, err } rule.Target = target } optVal = optVal[targetSize:] table.Rules = append(table.Rules, rule) offsets[offset] = int(entryIdx) offset += uint32(entry.NextOffset) if initialOptValLen-len(optVal) != int(entry.NextOffset) { nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) return nil, syserr.ErrInvalidArgument } } return offsets, nil } func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { if containsUnsupportedFields4(iptip) { return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) } if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize { return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) } if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize { return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) } return stack.IPHeaderFilter{ Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), // A Protocol value of 0 indicates all protocols match. CheckProtocol: iptip.Protocol != 0, Dst: tcpip.AddrFrom4(iptip.Dst), DstMask: tcpip.AddrFrom4(iptip.DstMask), DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0, Src: tcpip.AddrFrom4(iptip.Src), SrcMask: tcpip.AddrFrom4(iptip.SrcMask), SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0, InputInterface: string(trimNullBytes(iptip.InputInterface[:])), InputInterfaceMask: string(trimNullBytes(iptip.InputInterfaceMask[:])), InputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_IN != 0, OutputInterface: string(trimNullBytes(iptip.OutputInterface[:])), OutputInterfaceMask: string(trimNullBytes(iptip.OutputInterfaceMask[:])), OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0, }, nil } func containsUnsupportedFields4(iptip linux.IPTIP) bool { // The following features are supported: // - Protocol // - Dst and DstMask // - Src and SrcMask // - The inverse destination IP check flag // - InputInterface, InputInterfaceMask and its inverse. // - OutputInterface, OutputInterfaceMask and its inverse. const flagMask = 0 // Disable any supported inverse flags. const inverseMask = linux.IPT_INV_DSTIP | linux.IPT_INV_SRCIP | linux.IPT_INV_VIA_IN | linux.IPT_INV_VIA_OUT return iptip.Flags&^flagMask != 0 || iptip.InverseFlags&^inverseMask != 0 } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/ipv6.go000066400000000000000000000235561465435605700254620ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netfilter import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // emptyIPv6Filter is for comparison with a rule's filters to determine whether // it is also empty. It is immutable. var emptyIPv6Filter = stack.IPHeaderFilter{ Dst: tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), DstMask: tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), Src: tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), SrcMask: tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), } // convertNetstackToBinary6 converts the ip6tables as stored in netstack to the // format expected by the iptables tool. Linux stores each table as a binary // blob that can only be traversed by parsing a little data, reading some // offsets, jumping to those offsets, parsing again, etc. func convertNetstackToBinary6(stk *stack.Stack, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo, error) { // The table name has to fit in the struct. if linux.XT_TABLE_MAXNAMELEN < len(tablename) { return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename) } id, ok := nameToID[tablename.String()] if !ok { return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename) } // Setup the info struct, which is the same in IPv4 and IPv6. entries, info := getEntries6(stk.IPTables().GetTable(id, true), tablename) return entries, info, nil } func getEntries6(table stack.Table, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo) { var info linux.IPTGetinfo var entries linux.KernelIP6TGetEntries copy(info.Name[:], tablename[:]) copy(entries.Name[:], info.Name[:]) info.ValidHooks = table.ValidHooks() for ruleIdx, rule := range table.Rules { nflog("convert to binary: current offset: %d", entries.Size) setHooksAndUnderflow(&info, table, entries.Size, ruleIdx) // Each rule corresponds to an entry. 
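// As in the IPv4 case, an entry is a fixed-size ip6t_entry header
// followed by serialized matchers and a target, with TargetOffset and
// NextOffset tracking where each part ends.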
entry := linux.KernelIP6TEntry{ Entry: linux.IP6TEntry{ IPv6: linux.IP6TIP{ Protocol: uint16(rule.Filter.Protocol), }, NextOffset: linux.SizeOfIP6TEntry, TargetOffset: linux.SizeOfIP6TEntry, }, } copy(entry.Entry.IPv6.Dst[:], rule.Filter.Dst.AsSlice()) copy(entry.Entry.IPv6.DstMask[:], rule.Filter.DstMask.AsSlice()) copy(entry.Entry.IPv6.Src[:], rule.Filter.Src.AsSlice()) copy(entry.Entry.IPv6.SrcMask[:], rule.Filter.SrcMask.AsSlice()) copy(entry.Entry.IPv6.OutputInterface[:], rule.Filter.OutputInterface) copy(entry.Entry.IPv6.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) copy(entry.Entry.IPv6.InputInterface[:], rule.Filter.InputInterface) copy(entry.Entry.IPv6.InputInterfaceMask[:], rule.Filter.InputInterfaceMask) if rule.Filter.DstInvert { entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_DSTIP } if rule.Filter.SrcInvert { entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_SRCIP } if rule.Filter.OutputInterfaceInvert { entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_VIA_OUT } if rule.Filter.CheckProtocol { entry.Entry.IPv6.Flags |= linux.IP6T_F_PROTO } for _, matcher := range rule.Matchers { // Serialize the matcher and add it to the // entry. serialized := marshalMatcher(matcher) nflog("convert to binary: matcher serialized as: %v", serialized) if len(serialized)%8 != 0 { panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) } entry.Elems = append(entry.Elems, serialized...) entry.Entry.NextOffset += uint16(len(serialized)) entry.Entry.TargetOffset += uint16(len(serialized)) } // Serialize and append the target. serialized := marshalTarget(rule.Target) if len(serialized)%8 != 0 { panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target)) } entry.Elems = append(entry.Elems, serialized...) entry.Entry.NextOffset += uint16(len(serialized)) nflog("convert to binary: adding entry: %+v", entry) entries.Size += uint32(entry.Entry.NextOffset) entries.Entrytable = append(entries.Entrytable, entry) info.NumEntries++ } info.Size = entries.Size nflog("convert to binary: finished with an marshalled size of %d", info.Size) return entries, info } func modifyEntries6(mapper IDMapper, stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { nflog("set entries: setting entries in table %q", replace.Name.String()) // Convert input into a list of rules and their offsets. var offset uint32 // offsets maps rule byte offsets to their position in table.Rules. offsets := map[uint32]int{} for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { nflog("set entries: processing entry at offset %d", offset) // Get the struct ipt_entry. if len(optVal) < linux.SizeOfIP6TEntry { nflog("optVal has insufficient size for entry %d", len(optVal)) return nil, syserr.ErrInvalidArgument } initialOptValLen := len(optVal) var entry linux.IP6TEntry optVal = entry.UnmarshalUnsafe(optVal) if entry.TargetOffset < linux.SizeOfIP6TEntry { nflog("entry has too-small target offset %d", entry.TargetOffset) return nil, syserr.ErrInvalidArgument } filter, err := filterFromIP6TIP(entry.IPv6) if err != nil { nflog("bad iptip: %v", err) return nil, syserr.ErrInvalidArgument } // Get matchers. 
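// Matchers sit between the fixed-size entry header and TargetOffset, so
// their combined size is TargetOffset minus the header size.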
matchersSize := entry.TargetOffset - linux.SizeOfIP6TEntry if len(optVal) < int(matchersSize) { nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) return nil, syserr.ErrInvalidArgument } matchers, err := parseMatchers(mapper, filter, optVal[:matchersSize]) if err != nil { nflog("failed to parse matchers: %v", err) return nil, syserr.ErrInvalidArgument } optVal = optVal[matchersSize:] // Get the target of the rule. targetSize := entry.NextOffset - entry.TargetOffset if len(optVal) < int(targetSize) { nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) return nil, syserr.ErrInvalidArgument } rule := stack.Rule{ Filter: filter, Matchers: matchers, } { target, err := parseTarget(filter, optVal[:targetSize], true /* ipv6 */) if err != nil { nflog("failed to parse target: %v", err) return nil, err } rule.Target = target } optVal = optVal[targetSize:] table.Rules = append(table.Rules, rule) offsets[offset] = int(entryIdx) offset += uint32(entry.NextOffset) if initialOptValLen-len(optVal) != int(entry.NextOffset) { nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) return nil, syserr.ErrInvalidArgument } } return offsets, nil } func filterFromIP6TIP(iptip linux.IP6TIP) (stack.IPHeaderFilter, error) { if containsUnsupportedFields6(iptip) { return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) } if len(iptip.Dst) != header.IPv6AddressSize || len(iptip.DstMask) != header.IPv6AddressSize { return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) } if len(iptip.Src) != header.IPv6AddressSize || len(iptip.SrcMask) != header.IPv6AddressSize { return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) } return stack.IPHeaderFilter{ Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), // In ip6tables a flag controls whether to check the protocol. CheckProtocol: iptip.Flags&linux.IP6T_F_PROTO != 0, Dst: tcpip.AddrFrom16(iptip.Dst), DstMask: tcpip.AddrFrom16(iptip.DstMask), DstInvert: iptip.InverseFlags&linux.IP6T_INV_DSTIP != 0, Src: tcpip.AddrFrom16(iptip.Src), SrcMask: tcpip.AddrFrom16(iptip.SrcMask), SrcInvert: iptip.InverseFlags&linux.IP6T_INV_SRCIP != 0, InputInterface: string(trimNullBytes(iptip.InputInterface[:])), InputInterfaceMask: string(trimNullBytes(iptip.InputInterfaceMask[:])), InputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_IN != 0, OutputInterface: string(trimNullBytes(iptip.OutputInterface[:])), OutputInterfaceMask: string(trimNullBytes(iptip.OutputInterfaceMask[:])), OutputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_OUT != 0, }, nil } func containsUnsupportedFields6(iptip linux.IP6TIP) bool { // The following features are supported: // - Protocol // - Dst and DstMask // - Src and SrcMask // - The inverse destination IP check flag // - InputInterface, InputInterfaceMask and its inverse. // - OutputInterface, OutputInterfaceMask and its inverse. const flagMask = linux.IP6T_F_PROTO // Disable any supported inverse flags. 
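// Any inverse flag outside this mask (or a nonzero TOS) marks the filter
// as unsupported, causing the rule to be rejected.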
const inverseMask = linux.IP6T_INV_DSTIP | linux.IP6T_INV_SRCIP | linux.IP6T_INV_VIA_IN | linux.IP6T_INV_VIA_OUT return iptip.Flags&^flagMask != 0 || iptip.InverseFlags&^inverseMask != 0 || iptip.TOS != 0 } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/netfilter.go000066400000000000000000000306311465435605700265620ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package netfilter helps the sentry interact with netstack's netfilter // capabilities. package netfilter import ( "bytes" "errors" "fmt" "math/rand" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // enableLogging controls whether to log the (de)serialization of netfilter // structs between userspace and netstack. These logs are useful when // developing iptables, but can pollute sentry logs otherwise. const enableLogging = true // nflog logs messages related to the writing and reading of iptables. func nflog(format string, args ...any) { if enableLogging && log.IsLogging(log.Debug) { log.Debugf("netfilter: "+format, args...) } } // Table names. const ( natTable = "nat" mangleTable = "mangle" filterTable = "filter" ) // nameToID is immutable. var nameToID = map[string]stack.TableID{ natTable: stack.NATID, mangleTable: stack.MangleID, filterTable: stack.FilterID, } // DefaultLinuxTables returns the rules of stack.DefaultTables() wrapped for // compatibility with netfilter extensions. func DefaultLinuxTables(clock tcpip.Clock, rand *rand.Rand) *stack.IPTables { tables := stack.DefaultTables(clock, rand) tables.VisitTargets(func(oldTarget stack.Target) stack.Target { switch val := oldTarget.(type) { case *stack.AcceptTarget: return &acceptTarget{AcceptTarget: *val} case *stack.DropTarget: return &dropTarget{DropTarget: *val} case *stack.ErrorTarget: return &errorTarget{ErrorTarget: *val} case *stack.UserChainTarget: return &userChainTarget{UserChainTarget: *val} case *stack.ReturnTarget: return &returnTarget{ReturnTarget: *val} case *stack.RedirectTarget: return &redirectTarget{RedirectTarget: *val} default: panic(fmt.Sprintf("Unknown rule in default iptables of type %T", val)) } }) return tables } // GetInfo returns information about iptables. func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr hostarch.Addr, ipv6 bool) (linux.IPTGetinfo, *syserr.Error) { // Read in the struct and table name. 
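// Userspace passes an IPTGetinfo with only the table name filled in; the
// remaining fields (valid hooks, hook entry and underflow offsets, entry
// count and size) are populated below from netstack's copy of the table.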
var info linux.IPTGetinfo if _, err := info.CopyIn(t, outPtr); err != nil { return linux.IPTGetinfo{}, syserr.FromError(err) } var err error if ipv6 { _, info, err = convertNetstackToBinary6(stack, info.Name) } else { _, info, err = convertNetstackToBinary4(stack, info.Name) } if err != nil { nflog("couldn't convert iptables: %v", err) return linux.IPTGetinfo{}, syserr.ErrInvalidArgument } nflog("returning info: %+v", info) return info, nil } // GetEntries4 returns netstack's iptables rules. func GetEntries4(t *kernel.Task, stack *stack.Stack, outPtr hostarch.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) { // Read in the struct and table name. var userEntries linux.IPTGetEntries if _, err := userEntries.CopyIn(t, outPtr); err != nil { nflog("couldn't copy in entries %q", userEntries.Name) return linux.KernelIPTGetEntries{}, syserr.FromError(err) } // Convert netstack's iptables rules to something that the iptables // tool can understand. entries, _, err := convertNetstackToBinary4(stack, userEntries.Name) if err != nil { nflog("couldn't read entries: %v", err) return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument } if entries.SizeBytes() > outLen { nflog("insufficient GetEntries output size: %d", uintptr(outLen)) return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument } return entries, nil } // GetEntries6 returns netstack's ip6tables rules. func GetEntries6(t *kernel.Task, stack *stack.Stack, outPtr hostarch.Addr, outLen int) (linux.KernelIP6TGetEntries, *syserr.Error) { // Read in the struct and table name. IPv4 and IPv6 utilize structs // with the same layout. var userEntries linux.IPTGetEntries if _, err := userEntries.CopyIn(t, outPtr); err != nil { nflog("couldn't copy in entries %q", userEntries.Name) return linux.KernelIP6TGetEntries{}, syserr.FromError(err) } // Convert netstack's iptables rules to something that the iptables // tool can understand. entries, _, err := convertNetstackToBinary6(stack, userEntries.Name) if err != nil { nflog("couldn't read entries: %v", err) return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument } if entries.SizeBytes() > outLen { nflog("insufficient GetEntries output size: %d", uintptr(outLen)) return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument } return entries, nil } // setHooksAndUnderflow checks whether the rule at ruleIdx is a hook entrypoint // or underflow, in which case it fills in info.HookEntry and info.Underflows. func setHooksAndUnderflow(info *linux.IPTGetinfo, table stack.Table, offset uint32, ruleIdx int) { // Is this a chain entry point? for hook, hookRuleIdx := range table.BuiltinChains { if hookRuleIdx == ruleIdx { nflog("convert to binary: found hook %d at offset %d", hook, offset) info.HookEntry[hook] = offset } } // Is this a chain underflow point? for underflow, underflowRuleIdx := range table.Underflows { if underflowRuleIdx == ruleIdx { nflog("convert to binary: found underflow %d at offset %d", underflow, offset) info.Underflow[underflow] = offset } } } // An IDMapper maps UIDs and GIDs to KUIDs and KGIDs. type IDMapper interface { MapToKUID(uid auth.UID) auth.KUID MapToKGID(uid auth.GID) auth.KGID } // SetEntries sets iptables rules for a single table. See // net/ipv4/netfilter/ip_tables.c:translate_table for reference. 
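// The optVal blob is an IPTReplace header followed by NumEntries
// serialized entries; hook entry points and underflows are expressed as
// byte offsets into that blob, which is why modifyEntries4/6 return a map
// from byte offsets to rule indices.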
func SetEntries(mapper IDMapper, stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { var replace linux.IPTReplace optVal = replace.UnmarshalBytes(optVal) var table stack.Table switch replace.Name.String() { case filterTable: table = stack.EmptyFilterTable() case natTable: table = stack.EmptyNATTable() default: nflog("unknown iptables table %q", replace.Name.String()) return syserr.ErrInvalidArgument } var err *syserr.Error var offsets map[uint32]int if ipv6 { offsets, err = modifyEntries6(mapper, stk, optVal, &replace, &table) } else { offsets, err = modifyEntries4(mapper, stk, optVal, &replace, &table) } if err != nil { return err } // Go through the list of supported hooks for this table and, for each // one, set the rule it corresponds to. for hook := range replace.HookEntry { if table.ValidHooks()&(1< 0 { nflog("set entries: optVal has len %d", len(optVal)) // Get the XTEntryMatch. if len(optVal) < linux.SizeOfXTEntryMatch { return nil, fmt.Errorf("optVal has insufficient size for entry match: %d", len(optVal)) } var match linux.XTEntryMatch match.UnmarshalUnsafe(optVal) nflog("set entries: parsed entry match %q: %+v", match.Name.String(), match) // Check some invariants. if match.MatchSize < linux.SizeOfXTEntryMatch { return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch) } if len(optVal) < int(match.MatchSize) { return nil, fmt.Errorf("optVal has insufficient size for match: %d", len(optVal)) } // Parse the specific matcher. matcher, err := unmarshalMatcher(mapper, match, filter, optVal[linux.SizeOfXTEntryMatch:match.MatchSize]) if err != nil { return nil, fmt.Errorf("failed to create matcher: %v", err) } matchers = append(matchers, matcher) // TODO(gvisor.dev/issue/6167): Check the revision field. optVal = optVal[match.MatchSize:] } if len(optVal) != 0 { return nil, errors.New("optVal should be exhausted after parsing matchers") } return matchers, nil } func validUnderflow(rule stack.Rule, ipv6 bool) bool { if len(rule.Matchers) != 0 { return false } if (ipv6 && rule.Filter != emptyIPv6Filter) || (!ipv6 && rule.Filter != emptyIPv4Filter) { return false } switch rule.Target.(type) { case *acceptTarget, *dropTarget: return true default: return false } } func isUnconditionalAccept(rule stack.Rule, ipv6 bool) bool { if !validUnderflow(rule, ipv6) { return false } _, ok := rule.Target.(*acceptTarget) return ok } func hookFromLinux(hook int) stack.Hook { switch hook { case linux.NF_INET_PRE_ROUTING: return stack.Prerouting case linux.NF_INET_LOCAL_IN: return stack.Input case linux.NF_INET_FORWARD: return stack.Forward case linux.NF_INET_LOCAL_OUT: return stack.Output case linux.NF_INET_POST_ROUTING: return stack.Postrouting } panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook)) } // TargetRevision returns a linux.XTGetRevision for a given target. It sets // Revision to the highest supported value, unless the provided revision number // is larger. func TargetRevision(t *kernel.Task, revPtr hostarch.Addr, netProto tcpip.NetworkProtocolNumber) (linux.XTGetRevision, *syserr.Error) { // Read in the target name and version. var rev linux.XTGetRevision if _, err := rev.CopyIn(t, revPtr); err != nil { return linux.XTGetRevision{}, syserr.FromError(err) } maxSupported, ok := targetRevision(rev.Name.String(), netProto, rev.Revision) if !ok { // Return ENOENT if there's no target with that name. 
return linux.XTGetRevision{}, syserr.ErrNoFileOrDir } if maxSupported < rev.Revision { // Return EPROTONOSUPPORT if we have an insufficient revision. return linux.XTGetRevision{}, syserr.ErrProtocolNotSupported } return rev, nil } func trimNullBytes(b []byte) []byte { n := bytes.IndexByte(b, 0) if n == -1 { n = len(b) } return b[:n] } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/netfilter_abi_autogen_unsafe.go000066400000000000000000000001511465435605700324520ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package netfilter import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/netfilter_state_autogen.go000066400000000000000000000166511465435605700315120ustar00rootroot00000000000000// automatically generated by stateify. package netfilter import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (at *acceptTarget) StateTypeName() string { return "pkg/sentry/socket/netfilter.acceptTarget" } func (at *acceptTarget) StateFields() []string { return []string{ "AcceptTarget", } } func (at *acceptTarget) beforeSave() {} // +checklocksignore func (at *acceptTarget) StateSave(stateSinkObject state.Sink) { at.beforeSave() stateSinkObject.Save(0, &at.AcceptTarget) } func (at *acceptTarget) afterLoad(context.Context) {} // +checklocksignore func (at *acceptTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &at.AcceptTarget) } func (dt *dropTarget) StateTypeName() string { return "pkg/sentry/socket/netfilter.dropTarget" } func (dt *dropTarget) StateFields() []string { return []string{ "DropTarget", } } func (dt *dropTarget) beforeSave() {} // +checklocksignore func (dt *dropTarget) StateSave(stateSinkObject state.Sink) { dt.beforeSave() stateSinkObject.Save(0, &dt.DropTarget) } func (dt *dropTarget) afterLoad(context.Context) {} // +checklocksignore func (dt *dropTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &dt.DropTarget) } func (et *errorTarget) StateTypeName() string { return "pkg/sentry/socket/netfilter.errorTarget" } func (et *errorTarget) StateFields() []string { return []string{ "ErrorTarget", } } func (et *errorTarget) beforeSave() {} // +checklocksignore func (et *errorTarget) StateSave(stateSinkObject state.Sink) { et.beforeSave() stateSinkObject.Save(0, &et.ErrorTarget) } func (et *errorTarget) afterLoad(context.Context) {} // +checklocksignore func (et *errorTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &et.ErrorTarget) } func (uc *userChainTarget) StateTypeName() string { return "pkg/sentry/socket/netfilter.userChainTarget" } func (uc *userChainTarget) StateFields() []string { return []string{ "UserChainTarget", } } func (uc *userChainTarget) beforeSave() {} // +checklocksignore func (uc *userChainTarget) StateSave(stateSinkObject state.Sink) { uc.beforeSave() stateSinkObject.Save(0, &uc.UserChainTarget) } func (uc *userChainTarget) afterLoad(context.Context) {} // +checklocksignore func (uc *userChainTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &uc.UserChainTarget) } func (rt *returnTarget) StateTypeName() string { return "pkg/sentry/socket/netfilter.returnTarget" } func (rt *returnTarget) StateFields() []string { return []string{ "ReturnTarget", } } func (rt *returnTarget) beforeSave() {} // +checklocksignore func (rt *returnTarget) StateSave(stateSinkObject state.Sink) { rt.beforeSave() 
stateSinkObject.Save(0, &rt.ReturnTarget) } func (rt *returnTarget) afterLoad(context.Context) {} // +checklocksignore func (rt *returnTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rt.ReturnTarget) } func (rt *redirectTarget) StateTypeName() string { return "pkg/sentry/socket/netfilter.redirectTarget" } func (rt *redirectTarget) StateFields() []string { return []string{ "RedirectTarget", "addr", } } func (rt *redirectTarget) beforeSave() {} // +checklocksignore func (rt *redirectTarget) StateSave(stateSinkObject state.Sink) { rt.beforeSave() stateSinkObject.Save(0, &rt.RedirectTarget) stateSinkObject.Save(1, &rt.addr) } func (rt *redirectTarget) afterLoad(context.Context) {} // +checklocksignore func (rt *redirectTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rt.RedirectTarget) stateSourceObject.Load(1, &rt.addr) } func (sm *standardTargetMaker) StateTypeName() string { return "pkg/sentry/socket/netfilter.standardTargetMaker" } func (sm *standardTargetMaker) StateFields() []string { return []string{ "NetworkProtocol", } } func (sm *standardTargetMaker) beforeSave() {} // +checklocksignore func (sm *standardTargetMaker) StateSave(stateSinkObject state.Sink) { sm.beforeSave() stateSinkObject.Save(0, &sm.NetworkProtocol) } func (sm *standardTargetMaker) afterLoad(context.Context) {} // +checklocksignore func (sm *standardTargetMaker) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &sm.NetworkProtocol) } func (em *errorTargetMaker) StateTypeName() string { return "pkg/sentry/socket/netfilter.errorTargetMaker" } func (em *errorTargetMaker) StateFields() []string { return []string{ "NetworkProtocol", } } func (em *errorTargetMaker) beforeSave() {} // +checklocksignore func (em *errorTargetMaker) StateSave(stateSinkObject state.Sink) { em.beforeSave() stateSinkObject.Save(0, &em.NetworkProtocol) } func (em *errorTargetMaker) afterLoad(context.Context) {} // +checklocksignore func (em *errorTargetMaker) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &em.NetworkProtocol) } func (rm *redirectTargetMaker) StateTypeName() string { return "pkg/sentry/socket/netfilter.redirectTargetMaker" } func (rm *redirectTargetMaker) StateFields() []string { return []string{ "NetworkProtocol", } } func (rm *redirectTargetMaker) beforeSave() {} // +checklocksignore func (rm *redirectTargetMaker) StateSave(stateSinkObject state.Sink) { rm.beforeSave() stateSinkObject.Save(0, &rm.NetworkProtocol) } func (rm *redirectTargetMaker) afterLoad(context.Context) {} // +checklocksignore func (rm *redirectTargetMaker) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rm.NetworkProtocol) } func (rm *nfNATTargetMaker) StateTypeName() string { return "pkg/sentry/socket/netfilter.nfNATTargetMaker" } func (rm *nfNATTargetMaker) StateFields() []string { return []string{ "NetworkProtocol", } } func (rm *nfNATTargetMaker) beforeSave() {} // +checklocksignore func (rm *nfNATTargetMaker) StateSave(stateSinkObject state.Sink) { rm.beforeSave() stateSinkObject.Save(0, &rm.NetworkProtocol) } func (rm *nfNATTargetMaker) afterLoad(context.Context) {} // +checklocksignore func (rm *nfNATTargetMaker) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rm.NetworkProtocol) } func (jt *JumpTarget) StateTypeName() string { return "pkg/sentry/socket/netfilter.JumpTarget" } func 
(jt *JumpTarget) StateFields() []string { return []string{ "Offset", "RuleNum", "NetworkProtocol", } } func (jt *JumpTarget) beforeSave() {} // +checklocksignore func (jt *JumpTarget) StateSave(stateSinkObject state.Sink) { jt.beforeSave() stateSinkObject.Save(0, &jt.Offset) stateSinkObject.Save(1, &jt.RuleNum) stateSinkObject.Save(2, &jt.NetworkProtocol) } func (jt *JumpTarget) afterLoad(context.Context) {} // +checklocksignore func (jt *JumpTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &jt.Offset) stateSourceObject.Load(1, &jt.RuleNum) stateSourceObject.Load(2, &jt.NetworkProtocol) } func init() { state.Register((*acceptTarget)(nil)) state.Register((*dropTarget)(nil)) state.Register((*errorTarget)(nil)) state.Register((*userChainTarget)(nil)) state.Register((*returnTarget)(nil)) state.Register((*redirectTarget)(nil)) state.Register((*standardTargetMaker)(nil)) state.Register((*errorTargetMaker)(nil)) state.Register((*redirectTargetMaker)(nil)) state.Register((*nfNATTargetMaker)(nil)) state.Register((*JumpTarget)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/owner_matcher.go000066400000000000000000000073361465435605700274310ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netfilter import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const matcherNameOwner = "owner" func init() { registerMatchMaker(ownerMarshaler{}) } // ownerMarshaler implements matchMaker for owner matching. type ownerMarshaler struct{} // name implements matchMaker.name. func (ownerMarshaler) name() string { return matcherNameOwner } func (ownerMarshaler) revision() uint8 { return 0 } // marshal implements matchMaker.marshal. func (ownerMarshaler) marshal(mr matcher) []byte { matcher := mr.(*OwnerMatcher) iptOwnerInfo := linux.IPTOwnerInfo{ UID: uint32(matcher.uid), GID: uint32(matcher.gid), } // Support for UID and GID match. if matcher.matchUID { iptOwnerInfo.Match = linux.XT_OWNER_UID if matcher.invertUID { iptOwnerInfo.Invert = linux.XT_OWNER_UID } } if matcher.matchGID { iptOwnerInfo.Match |= linux.XT_OWNER_GID if matcher.invertGID { iptOwnerInfo.Invert |= linux.XT_OWNER_GID } } buf := marshal.Marshal(&iptOwnerInfo) return marshalEntryMatch(matcherNameOwner, buf) } // unmarshal implements matchMaker.unmarshal. func (ownerMarshaler) unmarshal(mapper IDMapper, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { if len(buf) < linux.SizeOfIPTOwnerInfo { return nil, fmt.Errorf("buf has insufficient size for owner match: %d", len(buf)) } // For alignment reasons, the match's total size may // exceed what's strictly necessary to hold matchData. 
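// UnmarshalUnsafe only consumes the leading SizeOfIPTOwnerInfo bytes, so
// any trailing alignment padding in buf is ignored.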
var matchData linux.IPTOwnerInfo matchData.UnmarshalUnsafe(buf) nflog("parsed IPTOwnerInfo: %+v", matchData) var owner OwnerMatcher owner.uid = mapper.MapToKUID(auth.UID(matchData.UID)) owner.gid = mapper.MapToKGID(auth.GID(matchData.GID)) // Check flags. if matchData.Match&linux.XT_OWNER_UID != 0 { owner.matchUID = true if matchData.Invert&linux.XT_OWNER_UID != 0 { owner.invertUID = true } } if matchData.Match&linux.XT_OWNER_GID != 0 { owner.matchGID = true if matchData.Invert&linux.XT_OWNER_GID != 0 { owner.invertGID = true } } return &owner, nil } // OwnerMatcher matches against a UID and/or GID. type OwnerMatcher struct { uid auth.KUID gid auth.KGID matchUID bool matchGID bool invertUID bool invertGID bool } // name implements matcher.name. func (*OwnerMatcher) name() string { return matcherNameOwner } func (*OwnerMatcher) revision() uint8 { return 0 } // Match implements Matcher.Match. func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { // Support only for OUTPUT chain. if hook != stack.Output { return false, true } // If the packet owner is not set, drop the packet. if pkt.Owner == nil { return false, true } var matches bool // Check for UID match. if om.matchUID { if auth.KUID(pkt.Owner.KUID()) == om.uid { matches = true } if matches == om.invertUID { return false, false } } // Check for GID match. if om.matchGID { matches = false if auth.KGID(pkt.Owner.KGID()) == om.gid { matches = true } if matches == om.invertGID { return false, false } } return true, false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/owner_matcher_v1.go000066400000000000000000000076111465435605700300330ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netfilter import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/tcpip/stack" ) func init() { registerMatchMaker(ownerMarshalerV1{}) } // ownerMarshalerV1 implements matchMaker for owner matching. type ownerMarshalerV1 struct{} // name implements matchMaker.name. func (ownerMarshalerV1) name() string { return matcherNameOwner } func (ownerMarshalerV1) revision() uint8 { return 1 } // marshal implements matchMaker.marshal. func (ownerMarshalerV1) marshal(mr matcher) []byte { matcher := mr.(*OwnerMatcherV1) ownerInfo := linux.XTOwnerMatchInfo{ UIDMin: uint32(matcher.uid), UIDMax: uint32(matcher.uid), GIDMin: uint32(matcher.gid), GIDMax: uint32(matcher.gid), } // Support for UID and GID match. if matcher.matchUID { ownerInfo.Match |= linux.XT_OWNER_UID } if matcher.matchGID { ownerInfo.Match |= linux.XT_OWNER_GID } if matcher.invertUID { ownerInfo.Invert |= linux.XT_OWNER_UID } if matcher.invertGID { ownerInfo.Invert |= linux.XT_OWNER_GID } buf := marshal.Marshal(&ownerInfo) return marshalEntryMatch(matcherNameOwner, buf) } // unmarshal implements matchMaker.unmarshal. 
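// Revision 1 expresses UID and GID ranges; netstack matches a single ID,
// so only the minimum of each range is used and differing min/max values
// are merely logged.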
func (ownerMarshalerV1) unmarshal(mapper IDMapper, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { if len(buf) < linux.SizeOfXTOwnerMatchInfo { return nil, fmt.Errorf("buf has insufficient size for owner match: %d", len(buf)) } // For alignment reasons, the match's total size may // exceed what's strictly necessary to hold matchData. var matchData linux.XTOwnerMatchInfo matchData.UnmarshalUnsafe(buf) nflog("parsed XTOwnerMatchInfo: %+v", matchData) if matchData.UIDMin != matchData.UIDMax { nflog("owner v1 doesn't support differing UID min/max") } if matchData.GIDMin != matchData.GIDMax { nflog("owner v1 doesn't support differing GID min/max") } owner := OwnerMatcherV1{ uid: mapper.MapToKUID(auth.UID(matchData.UIDMin)), gid: mapper.MapToKGID(auth.GID(matchData.GIDMin)), matchUID: matchData.Match&linux.XT_OWNER_UID != 0, matchGID: matchData.Match&linux.XT_OWNER_GID != 0, invertUID: matchData.Invert&linux.XT_OWNER_UID != 0, invertGID: matchData.Invert&linux.XT_OWNER_GID != 0, } return &owner, nil } // OwnerMatcherV1 matches against a UID and/or GID. type OwnerMatcherV1 struct { uid auth.KUID gid auth.KGID matchUID bool matchGID bool invertUID bool invertGID bool } // name implements matcher.name. func (*OwnerMatcherV1) name() string { return matcherNameOwner } func (*OwnerMatcherV1) revision() uint8 { return 1 } // Match implements Matcher.Match. func (om *OwnerMatcherV1) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { // Support only for OUTPUT chain. if hook != stack.Output { return false, true } // If the packet owner is not set, drop the packet. if pkt.Owner == nil { return false, true } var matches bool // Check for UID match. if om.matchUID { if auth.KUID(pkt.Owner.KUID()) == om.uid { matches = true } if matches == om.invertUID { return false, false } } // Check for GID match. if om.matchGID { matches = false if auth.KGID(pkt.Owner.KGID()) == om.gid { matches = true } if matches == om.invertGID { return false, false } } return true, false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/snat.go000066400000000000000000000221611465435605700255320ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netfilter import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // SNATTargetName is used to mark targets as SNAT targets. SNAT targets should // be reached for only NAT table. These targets will change the source port // and/or IP for packets. 
const SNATTargetName = "SNAT" type snatTarget struct { stack.SNATTarget revision uint8 } func (st *snatTarget) id() targetID { return targetID{ name: SNATTargetName, networkProtocol: st.NetworkProtocol, revision: st.revision, } } type snatTargetMakerV4 struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (st *snatTargetMakerV4) id() targetID { return targetID{ name: SNATTargetName, networkProtocol: st.NetworkProtocol, } } func (*snatTargetMakerV4) marshal(target target) []byte { st := target.(*snatTarget) // This is a snat target named snat. xt := linux.XTNATTargetV0{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTNATTargetV0, }, } copy(xt.Target.Name[:], SNATTargetName) if st.ChangeAddress { xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_MAP_IPS } if st.ChangePort { xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED } xt.NfRange.RangeSize = 1 xt.NfRange.RangeIPV4.MinPort = htons(st.Port) xt.NfRange.RangeIPV4.MaxPort = xt.NfRange.RangeIPV4.MinPort copy(xt.NfRange.RangeIPV4.MinIP[:], st.Addr.AsSlice()) copy(xt.NfRange.RangeIPV4.MaxIP[:], st.Addr.AsSlice()) return marshal.Marshal(&xt) } func (*snatTargetMakerV4) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if len(buf) < linux.SizeOfXTNATTargetV0 { nflog("snatTargetMakerV4: buf has insufficient size for snat target %d", len(buf)) return nil, syserr.ErrInvalidArgument } if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { nflog("snatTargetMakerV4: bad proto %d", p) return nil, syserr.ErrInvalidArgument } var st linux.XTNATTargetV0 st.UnmarshalUnsafe(buf) // Copy linux.XTNATTargetV0 to stack.SNATTarget. target := snatTarget{SNATTarget: stack.SNATTarget{ NetworkProtocol: filter.NetworkProtocol(), }} // RangeSize should be 1. 
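// The V0 target embeds what Linux models as a multi-range with a single
// IPv4 entry, so a RangeSize other than 1, or a range whose min and max
// differ, is rejected below.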
nfRange := st.NfRange if nfRange.RangeSize != 1 { nflog("snatTargetMakerV4: bad rangesize %d", nfRange.RangeSize) return nil, syserr.ErrInvalidArgument } if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { nflog("snatTargetMakerV4: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) return nil, syserr.ErrInvalidArgument } if nfRange.RangeIPV4.MinIP != nfRange.RangeIPV4.MaxIP { nflog("snatTargetMakerV4: MinIP != MaxIP (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) return nil, syserr.ErrInvalidArgument } if nfRange.RangeIPV4.Flags&^(linux.NF_NAT_RANGE_MAP_IPS|linux.NF_NAT_RANGE_PROTO_SPECIFIED) != 0 { nflog("snatTargetMakerV4: unknown flags used (%x)", nfRange.RangeIPV4.Flags) return nil, syserr.ErrInvalidArgument } target.ChangeAddress = nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_MAP_IPS != 0 target.ChangePort = nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED != 0 target.Addr = tcpip.AddrFrom4(nfRange.RangeIPV4.MinIP) target.Port = ntohs(nfRange.RangeIPV4.MinPort) return &target, nil } type snatTargetMakerR1 struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (st *snatTargetMakerR1) id() targetID { return targetID{ name: SNATTargetName, networkProtocol: st.NetworkProtocol, revision: 1, } } func (*snatTargetMakerR1) marshal(target target) []byte { st := target.(*snatTarget) nt := linux.XTNATTargetV1{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTNATTargetV1, Revision: 1, }, } copy(nt.Target.Name[:], SNATTargetName) if st.ChangeAddress { nt.Range.Flags |= linux.NF_NAT_RANGE_MAP_IPS } if st.ChangePort { nt.Range.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED } copy(nt.Range.MinAddr[:], st.Addr.AsSlice()) copy(nt.Range.MaxAddr[:], st.Addr.AsSlice()) nt.Range.MinProto = htons(st.Port) nt.Range.MaxProto = nt.Range.MinProto return marshal.Marshal(&nt) } func (st *snatTargetMakerR1) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if size := linux.SizeOfXTNATTargetV1; len(buf) < size { nflog("snatTargetMakerR1: buf has insufficient size (%d) for SNAT target (%d)", len(buf), size) return nil, syserr.ErrInvalidArgument } if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { nflog("snatTargetMakerR1: bad proto %d", p) return nil, syserr.ErrInvalidArgument } var natRange linux.NFNATRange natRange.UnmarshalUnsafe(buf[linux.SizeOfXTEntryTarget:]) if natRange.MinAddr != natRange.MaxAddr { nflog("snatTargetMakerR1: MinAddr and MaxAddr are different") return nil, syserr.ErrInvalidArgument } if natRange.MinProto != natRange.MaxProto { nflog("snatTargetMakerR1: MinProto and MaxProto are different") return nil, syserr.ErrInvalidArgument } if natRange.Flags&^(linux.NF_NAT_RANGE_MAP_IPS|linux.NF_NAT_RANGE_PROTO_SPECIFIED) != 0 { nflog("snatTargetMakerR1: unknown flags used (%x)", natRange.Flags) return nil, syserr.ErrInvalidArgument } target := snatTarget{ SNATTarget: stack.SNATTarget{ NetworkProtocol: filter.NetworkProtocol(), Port: ntohs(natRange.MinProto), ChangeAddress: natRange.Flags&linux.NF_NAT_RANGE_MAP_IPS != 0, ChangePort: natRange.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED != 0, }, revision: 1, } switch st.NetworkProtocol { case header.IPv4ProtocolNumber: target.SNATTarget.Addr = tcpip.AddrFrom4Slice(natRange.MinAddr[:4]) case header.IPv6ProtocolNumber: target.SNATTarget.Addr = tcpip.AddrFrom16(natRange.MinAddr) default: panic(fmt.Sprintf("invalid protocol number: %d", st.NetworkProtocol)) } return &target, nil } type snatTargetMakerR2 struct { 
NetworkProtocol tcpip.NetworkProtocolNumber } func (st *snatTargetMakerR2) id() targetID { return targetID{ name: SNATTargetName, networkProtocol: st.NetworkProtocol, revision: 2, } } func (*snatTargetMakerR2) marshal(target target) []byte { st := target.(*snatTarget) nt := linux.XTNATTargetV2{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTNATTargetV1, Revision: 2, }, } copy(nt.Target.Name[:], SNATTargetName) if st.ChangeAddress { nt.Range.Flags |= linux.NF_NAT_RANGE_MAP_IPS } if st.ChangePort { nt.Range.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED } copy(nt.Range.MinAddr[:], st.Addr.AsSlice()) copy(nt.Range.MaxAddr[:], st.Addr.AsSlice()) nt.Range.MinProto = htons(st.Port) nt.Range.MaxProto = nt.Range.MinProto return marshal.Marshal(&nt) } func (st *snatTargetMakerR2) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if size := linux.SizeOfXTNATTargetV2; len(buf) < size { nflog("snatTargetMakerR2: buf has insufficient size (%d) for SNAT target (%d)", len(buf), size) return nil, syserr.ErrInvalidArgument } if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { nflog("snatTargetMakerR2: bad proto %d", p) return nil, syserr.ErrInvalidArgument } var natRange linux.NFNATRange2 natRange.UnmarshalUnsafe(buf[linux.SizeOfXTEntryTarget:]) if natRange.MinAddr != natRange.MaxAddr { nflog("snatTargetMakerR2: MinAddr and MaxAddr are different") return nil, syserr.ErrInvalidArgument } if natRange.MinProto != natRange.MaxProto { nflog("snatTargetMakerR2: MinProto and MaxProto are different") return nil, syserr.ErrInvalidArgument } if natRange.BaseProto != 0 { nflog("snatTargetMakerR2: BaseProto is nonzero") return nil, syserr.ErrInvalidArgument } if natRange.Flags&^(linux.NF_NAT_RANGE_MAP_IPS|linux.NF_NAT_RANGE_PROTO_SPECIFIED) != 0 { nflog("snatTargetMakerR1: unknown flags used (%x)", natRange.Flags) return nil, syserr.ErrInvalidArgument } target := snatTarget{ SNATTarget: stack.SNATTarget{ NetworkProtocol: filter.NetworkProtocol(), Port: ntohs(natRange.MinProto), ChangeAddress: natRange.Flags&linux.NF_NAT_RANGE_MAP_IPS != 0, ChangePort: natRange.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED != 0, }, revision: 2, } switch st.NetworkProtocol { case header.IPv4ProtocolNumber: target.SNATTarget.Addr = tcpip.AddrFrom4Slice(natRange.MinAddr[:4]) case header.IPv6ProtocolNumber: target.SNATTarget.Addr = tcpip.AddrFrom16(natRange.MinAddr) default: panic(fmt.Sprintf("invalid protocol number: %d", st.NetworkProtocol)) } return &target, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/targets.go000066400000000000000000000353021465435605700262370ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package netfilter import ( "encoding/binary" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // ErrorTargetName is used to mark targets as error targets. Error targets // shouldn't be reached - an error has occurred if we fall through to one. const ErrorTargetName = "ERROR" // RedirectTargetName is used to mark targets as redirect targets. Redirect // targets should be reached for only NAT and Mangle tables. These targets will // change the destination port and/or IP for packets. const RedirectTargetName = "REDIRECT" func init() { // Standard targets include ACCEPT, DROP, RETURN, and JUMP. registerTargetMaker(&standardTargetMaker{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&standardTargetMaker{ NetworkProtocol: header.IPv6ProtocolNumber, }) // Both user chains and actual errors are represented in iptables by // error targets. registerTargetMaker(&errorTargetMaker{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&errorTargetMaker{ NetworkProtocol: header.IPv6ProtocolNumber, }) // REDIRECT targets. registerTargetMaker(&redirectTargetMaker{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&nfNATTargetMaker{ NetworkProtocol: header.IPv6ProtocolNumber, }) // SNAT targets. registerTargetMaker(&snatTargetMakerV4{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&snatTargetMakerR1{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&snatTargetMakerR1{ NetworkProtocol: header.IPv6ProtocolNumber, }) registerTargetMaker(&snatTargetMakerR2{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&snatTargetMakerR2{ NetworkProtocol: header.IPv6ProtocolNumber, }) // DNAT targets. registerTargetMaker(&dnatTargetMakerV4{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&dnatTargetMakerR1{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&dnatTargetMakerR1{ NetworkProtocol: header.IPv6ProtocolNumber, }) registerTargetMaker(&dnatTargetMakerR2{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&dnatTargetMakerR2{ NetworkProtocol: header.IPv6ProtocolNumber, }) } // The stack package provides some basic, useful targets for us. The following // types wrap them for compatibility with the extension system. // +stateify savable type acceptTarget struct { stack.AcceptTarget } func (at *acceptTarget) id() targetID { return targetID{ networkProtocol: at.NetworkProtocol, } } // +stateify savable type dropTarget struct { stack.DropTarget } func (dt *dropTarget) id() targetID { return targetID{ networkProtocol: dt.NetworkProtocol, } } // +stateify savable type errorTarget struct { stack.ErrorTarget } func (et *errorTarget) id() targetID { return targetID{ name: ErrorTargetName, networkProtocol: et.NetworkProtocol, } } // +stateify savable type userChainTarget struct { stack.UserChainTarget } func (uc *userChainTarget) id() targetID { return targetID{ name: ErrorTargetName, networkProtocol: uc.NetworkProtocol, } } // +stateify savable type returnTarget struct { stack.ReturnTarget } func (rt *returnTarget) id() targetID { return targetID{ networkProtocol: rt.NetworkProtocol, } } // +stateify savable type redirectTarget struct { stack.RedirectTarget // addr must be (un)marshalled when reading and writing the target to // userspace, but does not affect behavior. 
addr tcpip.Address } func (rt *redirectTarget) id() targetID { return targetID{ name: RedirectTargetName, networkProtocol: rt.NetworkProtocol, } } // +stateify savable type standardTargetMaker struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (sm *standardTargetMaker) id() targetID { // Standard targets have the empty string as a name and no revisions. return targetID{ networkProtocol: sm.NetworkProtocol, } } func (*standardTargetMaker) marshal(target target) []byte { // Translate verdicts the same way as the iptables tool. var verdict int32 switch tg := target.(type) { case *acceptTarget: verdict = -linux.NF_ACCEPT - 1 case *dropTarget: verdict = -linux.NF_DROP - 1 case *returnTarget: verdict = linux.NF_RETURN case *JumpTarget: verdict = int32(tg.Offset) default: panic(fmt.Errorf("unknown target of type %T", target)) } // The target's name will be the empty string. xt := linux.XTStandardTarget{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTStandardTarget, }, Verdict: verdict, } return marshal.Marshal(&xt) } func (*standardTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if len(buf) != linux.SizeOfXTStandardTarget { nflog("buf has wrong size for standard target %d", len(buf)) return nil, syserr.ErrInvalidArgument } var standardTarget linux.XTStandardTarget standardTarget.UnmarshalUnsafe(buf) if standardTarget.Verdict < 0 { // A Verdict < 0 indicates a non-jump verdict. return translateToStandardTarget(standardTarget.Verdict, filter.NetworkProtocol()) } // A verdict >= 0 indicates a jump. return &JumpTarget{ Offset: uint32(standardTarget.Verdict), NetworkProtocol: filter.NetworkProtocol(), }, nil } // +stateify savable type errorTargetMaker struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (em *errorTargetMaker) id() targetID { // Error targets have no revision. return targetID{ name: ErrorTargetName, networkProtocol: em.NetworkProtocol, } } func (*errorTargetMaker) marshal(target target) []byte { var errorName string switch tg := target.(type) { case *errorTarget: errorName = ErrorTargetName case *userChainTarget: errorName = tg.Name default: panic(fmt.Sprintf("errorMakerTarget cannot marshal unknown type %T", target)) } // This is an error target named error xt := linux.XTErrorTarget{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTErrorTarget, }, } copy(xt.Name[:], errorName) copy(xt.Target.Name[:], ErrorTargetName) return marshal.Marshal(&xt) } func (*errorTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if len(buf) != linux.SizeOfXTErrorTarget { nflog("buf has insufficient size for error target %d", len(buf)) return nil, syserr.ErrInvalidArgument } var errTgt linux.XTErrorTarget errTgt.UnmarshalUnsafe(buf) // Error targets are used in 2 cases: // * An actual error case. These rules have an error named // ErrorTargetName. The last entry of the table is usually an error // case to catch any packets that somehow fall through every rule. // * To mark the start of a user defined chain. These // rules have an error with the name of the chain. switch name := errTgt.Name.String(); name { case ErrorTargetName: return &errorTarget{stack.ErrorTarget{ NetworkProtocol: filter.NetworkProtocol(), }}, nil default: // User defined chain. 
return &userChainTarget{stack.UserChainTarget{ Name: name, NetworkProtocol: filter.NetworkProtocol(), }}, nil } } // +stateify savable type redirectTargetMaker struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (rm *redirectTargetMaker) id() targetID { return targetID{ name: RedirectTargetName, networkProtocol: rm.NetworkProtocol, } } func (*redirectTargetMaker) marshal(target target) []byte { rt := target.(*redirectTarget) // This is a redirect target named redirect xt := linux.XTRedirectTarget{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTRedirectTarget, }, } copy(xt.Target.Name[:], RedirectTargetName) xt.NfRange.RangeSize = 1 xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED xt.NfRange.RangeIPV4.MinPort = htons(rt.Port) xt.NfRange.RangeIPV4.MaxPort = xt.NfRange.RangeIPV4.MinPort return marshal.Marshal(&xt) } func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if len(buf) < linux.SizeOfXTRedirectTarget { nflog("redirectTargetMaker: buf has insufficient size for redirect target %d", len(buf)) return nil, syserr.ErrInvalidArgument } if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { nflog("redirectTargetMaker: bad proto %d", p) return nil, syserr.ErrInvalidArgument } var rt linux.XTRedirectTarget rt.UnmarshalUnsafe(buf) // Copy linux.XTRedirectTarget to stack.RedirectTarget. target := redirectTarget{RedirectTarget: stack.RedirectTarget{ NetworkProtocol: filter.NetworkProtocol(), }} // RangeSize should be 1. nfRange := rt.NfRange if nfRange.RangeSize != 1 { nflog("redirectTargetMaker: bad rangesize %d", nfRange.RangeSize) return nil, syserr.ErrInvalidArgument } // Also check if we need to map ports or IP. // For now, redirect target only supports destination port change. // Port range and IP range are not supported yet. 
	if nfRange.RangeIPV4.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED {
		nflog("redirectTargetMaker: invalid range flags %d", nfRange.RangeIPV4.Flags)
		return nil, syserr.ErrInvalidArgument
	}
	if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort {
		nflog("redirectTargetMaker: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort)
		return nil, syserr.ErrInvalidArgument
	}
	if nfRange.RangeIPV4.MinIP != nfRange.RangeIPV4.MaxIP {
		nflog("redirectTargetMaker: MinIP != MaxIP (%d, %d)", nfRange.RangeIPV4.MinIP, nfRange.RangeIPV4.MaxIP)
		return nil, syserr.ErrInvalidArgument
	}

	target.addr = tcpip.AddrFrom4(nfRange.RangeIPV4.MinIP)
	target.Port = ntohs(nfRange.RangeIPV4.MinPort)

	return &target, nil
}

// +stateify savable
type nfNATTargetMaker struct {
	NetworkProtocol tcpip.NetworkProtocolNumber
}

func (rm *nfNATTargetMaker) id() targetID {
	return targetID{
		name:            RedirectTargetName,
		networkProtocol: rm.NetworkProtocol,
	}
}

func (*nfNATTargetMaker) marshal(target target) []byte {
	rt := target.(*redirectTarget)
	nt := linux.XTNATTargetV1{
		Target: linux.XTEntryTarget{
			TargetSize: linux.SizeOfXTNATTargetV1,
		},
		Range: linux.NFNATRange{
			Flags: linux.NF_NAT_RANGE_PROTO_SPECIFIED,
		},
	}
	copy(nt.Target.Name[:], RedirectTargetName)
	copy(nt.Range.MinAddr[:], rt.addr.AsSlice())
	copy(nt.Range.MaxAddr[:], rt.addr.AsSlice())
	nt.Range.MinProto = htons(rt.Port)
	nt.Range.MaxProto = nt.Range.MinProto

	return marshal.Marshal(&nt)
}

func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) {
	if size := linux.SizeOfXTNATTargetV1; len(buf) < size {
		nflog("nfNATTargetMaker: buf has insufficient size (%d) for nfNAT target (%d)", len(buf), size)
		return nil, syserr.ErrInvalidArgument
	}

	if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber {
		nflog("nfNATTargetMaker: bad proto %d", p)
		return nil, syserr.ErrInvalidArgument
	}

	var natRange linux.NFNATRange
	natRange.UnmarshalUnsafe(buf[linux.SizeOfXTEntryTarget:])

	// We don't support port or address ranges.
	if natRange.MinAddr != natRange.MaxAddr {
		nflog("nfNATTargetMaker: MinAddr and MaxAddr are different")
		return nil, syserr.ErrInvalidArgument
	}
	if natRange.MinProto != natRange.MaxProto {
		nflog("nfNATTargetMaker: MinProto and MaxProto are different")
		return nil, syserr.ErrInvalidArgument
	}

	// For now, redirect target only supports destination change.
	if natRange.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED {
		nflog("nfNATTargetMaker: invalid range flags %d", natRange.Flags)
		return nil, syserr.ErrInvalidArgument
	}

	target := redirectTarget{
		RedirectTarget: stack.RedirectTarget{
			NetworkProtocol: filter.NetworkProtocol(),
			Port:            ntohs(natRange.MinProto),
		},
		addr: tcpip.AddrFrom16(natRange.MinAddr),
	}

	return &target, nil
}

// translateToStandardTarget translates from the value in a
// linux.XTStandardTarget to a stack.Verdict.
func translateToStandardTarget(val int32, netProto tcpip.NetworkProtocolNumber) (target, *syserr.Error) {
	switch val {
	case -linux.NF_ACCEPT - 1:
		return &acceptTarget{stack.AcceptTarget{
			NetworkProtocol: netProto,
		}}, nil
	case -linux.NF_DROP - 1:
		return &dropTarget{stack.DropTarget{
			NetworkProtocol: netProto,
		}}, nil
	case -linux.NF_QUEUE - 1:
		nflog("unsupported iptables verdict QUEUE")
		return nil, syserr.ErrInvalidArgument
	case linux.NF_RETURN:
		return &returnTarget{stack.ReturnTarget{
			NetworkProtocol: netProto,
		}}, nil
	default:
		nflog("unknown iptables verdict %d", val)
		return nil, syserr.ErrInvalidArgument
	}
}

// parseTarget parses a target from optVal.
optVal should contain only the // target. func parseTarget(filter stack.IPHeaderFilter, optVal []byte, ipv6 bool) (stack.Target, *syserr.Error) { nflog("set entries: parsing target of size %d", len(optVal)) if len(optVal) < linux.SizeOfXTEntryTarget { nflog("optVal has insufficient size for entry target %d", len(optVal)) return nil, syserr.ErrInvalidArgument } var target linux.XTEntryTarget // Do not advance optVal as targetMake.unmarshal() may unmarshal // XTEntryTarget again but with some added fields. target.UnmarshalUnsafe(optVal) return unmarshalTarget(target, filter, optVal) } // JumpTarget implements stack.Target. // // +stateify savable type JumpTarget struct { // Offset is the byte offset of the rule to jump to. It is used for // marshaling and unmarshaling. Offset uint32 // RuleNum is the rule to jump to. RuleNum int // NetworkProtocol is the network protocol the target is used with. NetworkProtocol tcpip.NetworkProtocolNumber } // ID implements Target.ID. func (jt *JumpTarget) id() targetID { return targetID{ networkProtocol: jt.NetworkProtocol, } } // Action implements stack.Target.Action. func (jt *JumpTarget) Action(*stack.PacketBuffer, stack.Hook, *stack.Route, stack.AddressableEndpoint) (stack.RuleVerdict, int) { return stack.RuleJump, jt.RuleNum } func ntohs(port uint16) uint16 { buf := make([]byte, 2) binary.BigEndian.PutUint16(buf, port) return hostarch.ByteOrder.Uint16(buf) } func htons(port uint16) uint16 { buf := make([]byte, 2) hostarch.ByteOrder.PutUint16(buf, port) return binary.BigEndian.Uint16(buf) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/tcp_matcher.go000066400000000000000000000120231465435605700270520ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netfilter import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const matcherNameTCP = "tcp" func init() { registerMatchMaker(tcpMarshaler{}) } // tcpMarshaler implements matchMaker for TCP matching. type tcpMarshaler struct{} // name implements matchMaker.name. func (tcpMarshaler) name() string { return matcherNameTCP } func (tcpMarshaler) revision() uint8 { return 0 } // marshal implements matchMaker.marshal. func (tcpMarshaler) marshal(mr matcher) []byte { matcher := mr.(*TCPMatcher) xttcp := linux.XTTCP{ SourcePortStart: matcher.sourcePortStart, SourcePortEnd: matcher.sourcePortEnd, DestinationPortStart: matcher.destinationPortStart, DestinationPortEnd: matcher.destinationPortEnd, FlagMask: matcher.flagMask, FlagCompare: matcher.flagCompare, InverseFlags: matcher.inverseFlags, } return marshalEntryMatch(matcherNameTCP, marshal.Marshal(&xttcp)) } // unmarshal implements matchMaker.unmarshal. 
func (tcpMarshaler) unmarshal(_ IDMapper, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { if len(buf) < linux.SizeOfXTTCP { return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf)) } // For alignment reasons, the match's total size may // exceed what's strictly necessary to hold matchData. var matchData linux.XTTCP matchData.UnmarshalUnsafe(buf) nflog("parseMatchers: parsed XTTCP: %+v", matchData) // Only support inverse dport/sport if matchData.Option != 0 || matchData.InverseFlags > 2 { return nil, fmt.Errorf("unsupported TCP matcher flags set") } if filter.Protocol != header.TCPProtocolNumber { return nil, fmt.Errorf("TCP matching is only valid for protocol %d", header.TCPProtocolNumber) } return &TCPMatcher{ sourcePortStart: matchData.SourcePortStart, sourcePortEnd: matchData.SourcePortEnd, destinationPortStart: matchData.DestinationPortStart, destinationPortEnd: matchData.DestinationPortEnd, flagMask: matchData.FlagMask, flagCompare: matchData.FlagCompare, inverseFlags: matchData.InverseFlags, }, nil } // TCPMatcher matches TCP packets and their headers. It implements Matcher. type TCPMatcher struct { sourcePortStart uint16 sourcePortEnd uint16 destinationPortStart uint16 destinationPortEnd uint16 flagMask uint8 flagCompare uint8 inverseFlags uint8 } // name implements matcher.name. func (*TCPMatcher) name() string { return matcherNameTCP } func (*TCPMatcher) revision() uint8 { return 0 } // Match implements Matcher.Match. func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: netHeader := header.IPv4(pkt.NetworkHeader().Slice()) if netHeader.TransportProtocol() != header.TCPProtocolNumber { return false, false } // We don't match fragments. if frag := netHeader.FragmentOffset(); frag != 0 { if frag == 1 { return false, true } return false, false } case header.IPv6ProtocolNumber: // As in Linux, we do not perform an IPv6 fragment check. See // xt_action_param.fragoff in // include/linux/netfilter/x_tables.h. if header.IPv6(pkt.NetworkHeader().Slice()).TransportProtocol() != header.TCPProtocolNumber { return false, false } default: // We don't know the network protocol. return false, false } tcpHeader := header.TCP(pkt.TransportHeader().Slice()) if len(tcpHeader) < header.TCPMinimumSize { // There's no valid TCP header here, so we drop the packet immediately. return false, true } // Check whether the source and destination ports are within the // matching range. // Take into account inverseFlags for DSTPT & SRCPT only sPort := tcpHeader.SourcePort() sPortMatch := sPort < tm.sourcePortStart || tm.sourcePortEnd < sPort sPortMatch = sPortMatch != (tm.inverseFlags&linux.XT_TCP_INV_SRCPT == linux.XT_TCP_INV_SRCPT) if sPortMatch { return false, false } dPort := tcpHeader.DestinationPort() dPortMatch := dPort < tm.destinationPortStart || tm.destinationPortEnd < dPort dPortMatch = dPortMatch != (tm.inverseFlags&linux.XT_TCP_INV_DSTPT == linux.XT_TCP_INV_DSTPT) if dPortMatch { return false, false } // Check the flags. if uint8(tcpHeader.Flags())&tm.flagMask != tm.flagCompare { return false, false } return true, false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netfilter/udp_matcher.go000066400000000000000000000104161465435605700270600ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netfilter import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const matcherNameUDP = "udp" func init() { registerMatchMaker(udpMarshaler{}) } // udpMarshaler implements matchMaker for UDP matching. type udpMarshaler struct{} // name implements matchMaker.name. func (udpMarshaler) name() string { return matcherNameUDP } func (udpMarshaler) revision() uint8 { return 0 } // marshal implements matchMaker.marshal. func (udpMarshaler) marshal(mr matcher) []byte { matcher := mr.(*UDPMatcher) xtudp := linux.XTUDP{ SourcePortStart: matcher.sourcePortStart, SourcePortEnd: matcher.sourcePortEnd, DestinationPortStart: matcher.destinationPortStart, DestinationPortEnd: matcher.destinationPortEnd, } return marshalEntryMatch(matcherNameUDP, marshal.Marshal(&xtudp)) } // unmarshal implements matchMaker.unmarshal. func (udpMarshaler) unmarshal(_ IDMapper, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { if len(buf) < linux.SizeOfXTUDP { return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf)) } // For alignment reasons, the match's total size may exceed what's // strictly necessary to hold matchData. var matchData linux.XTUDP matchData.UnmarshalUnsafe(buf) nflog("parseMatchers: parsed XTUDP: %+v", matchData) if matchData.InverseFlags != 0 { return nil, fmt.Errorf("unsupported UDP matcher inverse flags set") } if filter.Protocol != header.UDPProtocolNumber { return nil, fmt.Errorf("UDP matching is only valid for protocol %d", header.UDPProtocolNumber) } return &UDPMatcher{ sourcePortStart: matchData.SourcePortStart, sourcePortEnd: matchData.SourcePortEnd, destinationPortStart: matchData.DestinationPortStart, destinationPortEnd: matchData.DestinationPortEnd, }, nil } // UDPMatcher matches UDP packets and their headers. It implements Matcher. type UDPMatcher struct { sourcePortStart uint16 sourcePortEnd uint16 destinationPortStart uint16 destinationPortEnd uint16 } // name implements Matcher.name. func (*UDPMatcher) name() string { return matcherNameUDP } func (*UDPMatcher) revision() uint8 { return 0 } // Match implements Matcher.Match. func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: netHeader := header.IPv4(pkt.NetworkHeader().Slice()) if netHeader.TransportProtocol() != header.UDPProtocolNumber { return false, false } // We don't match fragments. if frag := netHeader.FragmentOffset(); frag != 0 { if frag == 1 { return false, true } return false, false } case header.IPv6ProtocolNumber: // As in Linux, we do not perform an IPv6 fragment check. See // xt_action_param.fragoff in // include/linux/netfilter/x_tables.h. if header.IPv6(pkt.NetworkHeader().Slice()).TransportProtocol() != header.UDPProtocolNumber { return false, false } default: // We don't know the network protocol. 
return false, false } udpHeader := header.UDP(pkt.TransportHeader().Slice()) if len(udpHeader) < header.UDPMinimumSize { // There's no valid UDP header here, so we drop the packet immediately. return false, true } // Check whether the source and destination ports are within the // matching range. if sourcePort := udpHeader.SourcePort(); sourcePort < um.sourcePortStart || um.sourcePortEnd < sourcePort { return false, false } if destinationPort := udpHeader.DestinationPort(); destinationPort < um.destinationPortStart || um.destinationPortEnd < destinationPort { return false, false } return true, false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/000077500000000000000000000000001465435605700237045ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/netlink_state_autogen.go000066400000000000000000000050341465435605700306230ustar00rootroot00000000000000// automatically generated by stateify. package netlink import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (s *Socket) StateTypeName() string { return "pkg/sentry/socket/netlink.Socket" } func (s *Socket) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "LockFD", "SendReceiveTimeout", "ports", "protocol", "skType", "ep", "connection", "bound", "portID", "sendBufferSize", "filter", "netns", } } func (s *Socket) beforeSave() {} // +checklocksignore func (s *Socket) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.vfsfd) stateSinkObject.Save(1, &s.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &s.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &s.LockFD) stateSinkObject.Save(4, &s.SendReceiveTimeout) stateSinkObject.Save(5, &s.ports) stateSinkObject.Save(6, &s.protocol) stateSinkObject.Save(7, &s.skType) stateSinkObject.Save(8, &s.ep) stateSinkObject.Save(9, &s.connection) stateSinkObject.Save(10, &s.bound) stateSinkObject.Save(11, &s.portID) stateSinkObject.Save(12, &s.sendBufferSize) stateSinkObject.Save(13, &s.filter) stateSinkObject.Save(14, &s.netns) } func (s *Socket) afterLoad(context.Context) {} // +checklocksignore func (s *Socket) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.vfsfd) stateSourceObject.Load(1, &s.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &s.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &s.LockFD) stateSourceObject.Load(4, &s.SendReceiveTimeout) stateSourceObject.Load(5, &s.ports) stateSourceObject.Load(6, &s.protocol) stateSourceObject.Load(7, &s.skType) stateSourceObject.Load(8, &s.ep) stateSourceObject.Load(9, &s.connection) stateSourceObject.Load(10, &s.bound) stateSourceObject.Load(11, &s.portID) stateSourceObject.Load(12, &s.sendBufferSize) stateSourceObject.Load(13, &s.filter) stateSourceObject.Load(14, &s.netns) } func (k *kernelSCM) StateTypeName() string { return "pkg/sentry/socket/netlink.kernelSCM" } func (k *kernelSCM) StateFields() []string { return []string{} } func (k *kernelSCM) beforeSave() {} // +checklocksignore func (k *kernelSCM) StateSave(stateSinkObject state.Sink) { k.beforeSave() } func (k *kernelSCM) afterLoad(context.Context) {} // +checklocksignore func (k *kernelSCM) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func init() { state.Register((*Socket)(nil)) state.Register((*kernelSCM)(nil)) } 
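// Illustrative sketch (not part of the gVisor sources): the TCP and UDP
// matchers above match ports with a closed-range check, and the TCP matcher
// additionally honors the XT_TCP_INV_SRCPT/XT_TCP_INV_DSTPT inverse flags by
// XOR-ing the "port is outside the range" result with the inverse bit. The
// standalone program below reproduces just that decision; matchPort is a
// hypothetical helper written only for illustration.
package main

import "fmt"

// matchPort reports whether port matches the closed range [start, end],
// honoring an iptables-style inverse flag: with inverse set, ports outside
// the range match instead.
func matchPort(port, start, end uint16, inverse bool) bool {
	outside := port < start || end < port
	// The matchers above compute a "miss" as (outside != inverse) and bail
	// out when it is true, so a rule matches when that expression is false.
	return outside == inverse
}

func main() {
	fmt.Println(matchPort(443, 1, 1023, false))  // true: inside the range
	fmt.Println(matchPort(8080, 1, 1023, false)) // false: outside the range
	fmt.Println(matchPort(8080, 1, 1023, true))  // true: inverse flag set
}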
golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/nlmsg/000077500000000000000000000000001465435605700250245ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/nlmsg/message.go000066400000000000000000000210411465435605700267750ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package nlmsg provides helpers to parse and construct netlink messages. package nlmsg import ( "fmt" "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" ) // alignPad returns the length of padding required for alignment. // // Preconditions: align is a power of two. func alignPad(length int, align uint) int { return bits.AlignUp(length, align) - length } // Message contains a complete serialized netlink message. type Message struct { hdr linux.NetlinkMessageHeader buf []byte } // NewMessage creates a new Message containing the passed header. // // The header length will be updated by Finalize. func NewMessage(hdr linux.NetlinkMessageHeader) *Message { return &Message{ hdr: hdr, buf: marshal.Marshal(&hdr), } } // ParseMessage parses the first message seen at buf, returning the rest of the // buffer. If message is malformed, ok of false is returned. For last message, // padding check is loose, if there isn't enough padding, whole buf is consumed // and ok is set to true. func ParseMessage(buf []byte) (msg *Message, rest []byte, ok bool) { b := BytesView(buf) hdrBytes, ok := b.Extract(linux.NetlinkMessageHeaderSize) if !ok { return } var hdr linux.NetlinkMessageHeader hdr.UnmarshalUnsafe(hdrBytes) // Msg portion. totalMsgLen := int(hdr.Length) _, ok = b.Extract(totalMsgLen - linux.NetlinkMessageHeaderSize) if !ok { return } // Padding. numPad := alignPad(totalMsgLen, linux.NLMSG_ALIGNTO) // Linux permits the last message not being aligned, just consume all of it. // Ref: net/netlink/af_netlink.c:netlink_rcv_skb if numPad > len(b) { numPad = len(b) } _, ok = b.Extract(numPad) if !ok { return } return &Message{ hdr: hdr, buf: buf[:totalMsgLen], }, []byte(b), true } // Header returns the header of this message. func (m *Message) Header() linux.NetlinkMessageHeader { return m.hdr } // GetData unmarshals the payload message header from this netlink message, and // returns the attributes portion. func (m *Message) GetData(msg marshal.Marshallable) (AttrsView, bool) { b := BytesView(m.buf) _, ok := b.Extract(linux.NetlinkMessageHeaderSize) if !ok { return nil, false } size := msg.SizeBytes() msgBytes, ok := b.Extract(size) if !ok { return nil, false } msg.UnmarshalUnsafe(msgBytes) numPad := alignPad(linux.NetlinkMessageHeaderSize+size, linux.NLMSG_ALIGNTO) // Linux permits the last message not being aligned, just consume all of it. 
// Ref: net/netlink/af_netlink.c:netlink_rcv_skb if numPad > len(b) { numPad = len(b) } _, ok = b.Extract(numPad) if !ok { return nil, false } return AttrsView(b), true } // Finalize returns the []byte containing the entire message, with the total // length set in the message header. The Message must not be modified after // calling Finalize. func (m *Message) Finalize() []byte { // Update length, which is the first 4 bytes of the header. hostarch.ByteOrder.PutUint32(m.buf, uint32(len(m.buf))) // Align the message. Note that the message length in the header (set // above) is the useful length of the message, not the total aligned // length. See net/netlink/af_netlink.c:__nlmsg_put. aligned := bits.AlignUp(len(m.buf), linux.NLMSG_ALIGNTO) m.putZeros(aligned - len(m.buf)) return m.buf } // putZeros adds n zeros to the message. func (m *Message) putZeros(n int) { for n > 0 { m.buf = append(m.buf, 0) n-- } } // Put serializes v into the message. func (m *Message) Put(v marshal.Marshallable) { m.buf = append(m.buf, marshal.Marshal(v)...) } // PutAttr adds v to the message as a netlink attribute. // // Preconditions: The serialized attribute (linux.NetlinkAttrHeaderSize + // v.SizeBytes()) fits in math.MaxUint16 bytes. func (m *Message) PutAttr(atype uint16, v marshal.Marshallable) { l := linux.NetlinkAttrHeaderSize + v.SizeBytes() if l > math.MaxUint16 { panic(fmt.Sprintf("attribute too large: %d", l)) } m.Put(&linux.NetlinkAttrHeader{ Type: atype, Length: uint16(l), }) m.Put(v) // Align the attribute. aligned := bits.AlignUp(l, linux.NLA_ALIGNTO) m.putZeros(aligned - l) } // PutAttrString adds s to the message as a netlink attribute. func (m *Message) PutAttrString(atype uint16, s string) { l := linux.NetlinkAttrHeaderSize + len(s) + 1 m.Put(&linux.NetlinkAttrHeader{ Type: atype, Length: uint16(l), }) // String + NUL-termination. m.Put(primitive.AsByteSlice([]byte(s))) m.putZeros(1) // Align the attribute. aligned := bits.AlignUp(l, linux.NLA_ALIGNTO) m.putZeros(aligned - l) } // MessageSet contains a series of netlink messages. type MessageSet struct { // Multi indicates that this a multi-part message, to be terminated by // NLMSG_DONE. NLMSG_DONE is sent even if the set contains only one // Message. // // If Multi is set, all added messages will have NLM_F_MULTI set. Multi bool // PortID is the destination port for all messages. PortID int32 // Seq is the sequence counter for all messages in the set. Seq uint32 // Messages contains the messages in the set. Messages []*Message } // NewMessageSet creates a new MessageSet. // // portID is the destination port to set as PortID in all messages. // // seq is the sequence counter to set as seq in all messages in the set. func NewMessageSet(portID int32, seq uint32) *MessageSet { return &MessageSet{ PortID: portID, Seq: seq, } } // AddMessage adds a new message to the set and returns it for further // additions. // // The passed header will have Seq, PortID and the multi flag set // automatically. func (ms *MessageSet) AddMessage(hdr linux.NetlinkMessageHeader) *Message { hdr.Seq = ms.Seq hdr.PortID = uint32(ms.PortID) if ms.Multi { hdr.Flags |= linux.NLM_F_MULTI } m := NewMessage(hdr) ms.Messages = append(ms.Messages, m) return m } // AttrsView is a view into the attributes portion of a netlink message. type AttrsView []byte // Empty returns whether there is no attribute left in v. func (v AttrsView) Empty() bool { return len(v) == 0 } // ParseFirst parses first netlink attribute at the beginning of v. 
func (v AttrsView) ParseFirst() (hdr linux.NetlinkAttrHeader, value []byte, rest AttrsView, ok bool) { b := BytesView(v) hdrBytes, ok := b.Extract(linux.NetlinkAttrHeaderSize) if !ok { return } hdr.UnmarshalUnsafe(hdrBytes) value, ok = b.Extract(int(hdr.Length) - linux.NetlinkAttrHeaderSize) if !ok { return } _, ok = b.Extract(alignPad(int(hdr.Length), linux.NLA_ALIGNTO)) if !ok { return } return hdr, value, AttrsView(b), ok } // Parse parses netlink attributes. func (v AttrsView) Parse() (map[uint16]BytesView, bool) { attrs := make(map[uint16]BytesView) attrsView := v for !attrsView.Empty() { // The index is unspecified, search by the interface name. ahdr, value, rest, ok := attrsView.ParseFirst() if !ok { return nil, false } attrsView = rest attrs[ahdr.Type] = BytesView(value) } return attrs, true } // BytesView supports extracting data from a byte slice with bounds checking. type BytesView []byte // Extract removes the first n bytes from v and returns it. If n is out of // bounds, it returns false. func (v *BytesView) Extract(n int) ([]byte, bool) { if n < 0 || n > len(*v) { return nil, false } extracted := (*v)[:n] *v = (*v)[n:] return extracted, true } // String converts the raw attribute value to string. func (v *BytesView) String() string { b := []byte(*v) if len(b) == 0 { return "" } if b[len(b)-1] == 0 { b = b[:len(b)-1] } return string(b) } // Uint32 converts the raw attribute value to uint32. func (v *BytesView) Uint32() (uint32, bool) { attr := []byte(*v) val := primitive.Uint32(0) if len(attr) != val.SizeBytes() { return 0, false } val.UnmarshalBytes(attr) return uint32(val), true } // Int32 converts the raw attribute value to int32. func (v *BytesView) Int32() (int32, bool) { attr := []byte(*v) val := primitive.Int32(0) if len(attr) != val.SizeBytes() { return 0, false } val.UnmarshalBytes(attr) return int32(val), true } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/nlmsg/nlmsg_state_autogen.go000066400000000000000000000000671465435605700314200ustar00rootroot00000000000000// automatically generated by stateify. package nlmsg golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/port/000077500000000000000000000000001465435605700246705ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/port/port.go000066400000000000000000000057731465435605700262170ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package port provides port ID allocation for netlink sockets. // // A netlink port is any int32 value. Positive ports are typically equivalent // to the PID of the binding process. If that port is unavailable, negative // ports are searched to find a free port that will not conflict with other // PIDS. package port import ( "fmt" "math" "math/rand" "gvisor.dev/gvisor/pkg/sync" ) // maxPorts is a sanity limit on the maximum number of ports to allocate per // protocol. const maxPorts = 10000 // Manager allocates netlink port IDs. 
// // +stateify savable type Manager struct { // mu protects the fields below. mu sync.Mutex `state:"nosave"` // ports contains a map of allocated ports for each protocol. ports map[int]map[int32]struct{} } // New creates a new Manager. func New() *Manager { return &Manager{ ports: make(map[int]map[int32]struct{}), } } // Allocate reserves a new port ID for protocol. hint will be taken if // available. func (m *Manager) Allocate(protocol int, hint int32) (int32, bool) { m.mu.Lock() defer m.mu.Unlock() proto, ok := m.ports[protocol] if !ok { proto = make(map[int32]struct{}) // Port 0 is reserved for the kernel. proto[0] = struct{}{} m.ports[protocol] = proto } if len(proto) >= maxPorts { return 0, false } if _, ok := proto[hint]; !ok { // Hint is available, reserve it. proto[hint] = struct{}{} return hint, true } // Search for any free port in [math.MinInt32, -4096). The positive // port space is left open for pid-based allocations. This behavior is // consistent with Linux. start := int32(math.MinInt32 + rand.Int63n(math.MaxInt32-4096+1)) curr := start for { if _, ok := proto[curr]; !ok { proto[curr] = struct{}{} return curr, true } curr-- if curr >= -4096 { curr = -4097 } if curr == start { // Nothing found. We should always find a free port // because maxPorts < -4096 - MinInt32. panic(fmt.Sprintf("No free port found in %+v", proto)) } } } // Release frees the specified port for protocol. // // Preconditions: port is already allocated. func (m *Manager) Release(protocol int, port int32) { m.mu.Lock() defer m.mu.Unlock() proto, ok := m.ports[protocol] if !ok { panic(fmt.Sprintf("Released port %d for protocol %d which has no allocations", port, protocol)) } if _, ok := proto[port]; !ok { panic(fmt.Sprintf("Released port %d for protocol %d is not allocated", port, protocol)) } delete(proto, port) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/port/port_state_autogen.go000066400000000000000000000012651465435605700311310ustar00rootroot00000000000000// automatically generated by stateify. package port import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (m *Manager) StateTypeName() string { return "pkg/sentry/socket/netlink/port.Manager" } func (m *Manager) StateFields() []string { return []string{ "ports", } } func (m *Manager) beforeSave() {} // +checklocksignore func (m *Manager) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.ports) } func (m *Manager) afterLoad(context.Context) {} // +checklocksignore func (m *Manager) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.ports) } func init() { state.Register((*Manager)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/provider.go000066400000000000000000000076401465435605700260740ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
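// Illustrative usage sketch (not part of the gVisor sources) for the
// port.Manager defined above. It assumes the package is importable at the
// path used throughout this archive
// (gvisor.dev/gvisor/pkg/sentry/socket/netlink/port), and the protocol number
// is an arbitrary example value. As the comments above describe, Allocate
// hands out the hint (normally the binding task's PID) when it is free and
// otherwise falls back to a random port in [math.MinInt32, -4096); Release
// returns a port to the pool.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
)

func main() {
	const protocol = 0 // e.g. NETLINK_ROUTE; any int key works here.

	m := port.New()

	// The first caller gets its hint.
	first, ok := m.Allocate(protocol, 1234)
	fmt.Println(first, ok) // 1234 true

	// A second caller asking for the same hint receives a negative
	// fallback port, which cannot collide with PID-based ports.
	second, ok := m.Allocate(protocol, 1234)
	fmt.Println(second < -4096, ok) // true true

	// Ports are returned to the manager when their socket is released.
	m.Release(protocol, first)
	m.Release(protocol, second)
}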
package netlink import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/nlmsg" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" ) // Protocol is the implementation of a netlink socket protocol. type Protocol interface { // Protocol returns the Linux netlink protocol value. Protocol() int // CanSend returns true if this protocol may ever send messages. // // TODO(gvisor.dev/issue/1119): This is a workaround to allow // advertising support for otherwise unimplemented features on sockets // that will never send messages, thus making those features no-ops. CanSend() bool // ProcessMessage processes a single message from userspace. // // If err == nil, any messages added to ms will be sent back to the // other end of the socket. Setting ms.Multi will cause an NLMSG_DONE // message to be sent even if ms contains no messages. ProcessMessage(ctx context.Context, s *Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error } // Provider is a function that creates a new Protocol for a specific netlink // protocol. // // Note that this is distinct from socket.Provider, which is used for all // socket families. type Provider func(t *kernel.Task) (Protocol, *syserr.Error) // protocols holds a map of all known address protocols and their provider. var protocols = make(map[int]Provider) // RegisterProvider registers the provider of a given address protocol so that // netlink sockets of that type can be created via socket(2). // // Preconditions: May only be called before any netlink sockets are created. func RegisterProvider(protocol int, provider Provider) { if p, ok := protocols[protocol]; ok { panic(fmt.Sprintf("Netlink protocol %d already provided by %+v", protocol, p)) } protocols[protocol] = provider } // socketProvider implements socket.Provider. type socketProvider struct { } // Socket implements socket.Provider.Socket. func (*socketProvider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { // Netlink sockets must be specified as datagram or raw, but they // behave the same regardless of type. if stype != linux.SOCK_DGRAM && stype != linux.SOCK_RAW { return nil, syserr.ErrSocketNotSupported } provider, ok := protocols[protocol] if !ok { return nil, syserr.ErrProtocolNotSupported } p, err := provider(t) if err != nil { return nil, err } s, err := New(t, stype, p) if err != nil { return nil, err } vfsfd := &s.vfsfd mnt := t.Kernel().SocketMount() d := sockfs.NewDentry(t, mnt) defer d.DecRef(t) if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, UseDentryMetadata: true, }); err != nil { return nil, syserr.FromError(err) } return vfsfd, nil } // Pair implements socket.Provider.Pair by returning an error. func (*socketProvider) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { // Netlink sockets never supports creating socket pairs. return nil, nil, syserr.ErrNotSupported } // init registers the socket provider. 
func init() { socket.RegisterProvider(linux.AF_NETLINK, &socketProvider{}) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/route/000077500000000000000000000000001465435605700250425ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/route/protocol.go000066400000000000000000000417151465435605700272420ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package route provides a NETLINK_ROUTE socket protocol. package route import ( "bytes" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/nlmsg" "gvisor.dev/gvisor/pkg/syserr" ) // commandKind describes the operational class of a message type. // // The route message types use the lower 2 bits of the type to describe class // of command. type commandKind int const ( kindNew commandKind = 0x0 kindDel commandKind = 0x1 kindGet commandKind = 0x2 kindSet commandKind = 0x3 ) func typeKind(typ uint16) commandKind { return commandKind(typ & 0x3) } // Protocol implements netlink.Protocol. // // +stateify savable type Protocol struct{} var _ netlink.Protocol = (*Protocol)(nil) // NewProtocol creates a NETLINK_ROUTE netlink.Protocol. func NewProtocol(t *kernel.Task) (netlink.Protocol, *syserr.Error) { return &Protocol{}, nil } // Protocol implements netlink.Protocol.Protocol. func (p *Protocol) Protocol() int { return linux.NETLINK_ROUTE } // CanSend implements netlink.Protocol.CanSend. func (p *Protocol) CanSend() bool { return true } // dumpLinks handles RTM_GETLINK dump requests. func (p *Protocol) dumpLinks(ctx context.Context, s *netlink.Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error { // NLM_F_DUMP + RTM_GETLINK messages are supposed to include an // ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some // userspace applications (including glibc) still include rtgenmsg. // Linux has a workaround based on the total message length. // // We don't bother to check for either, since we don't support any // extra attributes that may be included anyways. // // The message may also contain netlink attribute IFLA_EXT_MASK, which // we don't support. // The RTM_GETLINK dump response is a set of messages each containing // an InterfaceInfoMessage followed by a set of netlink attributes. // We always send back an NLMSG_DONE. ms.Multi = true stack := s.Stack() if stack == nil { // No network devices. return nil } for idx, i := range stack.Interfaces() { addNewLinkMessage(ms, idx, i) } return nil } // getLinks handles RTM_GETLINK requests. 
func (p *Protocol) getLink(ctx context.Context, s *netlink.Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error {
	stack := s.Stack()
	if stack == nil {
		// No network devices.
		return nil
	}

	// Parse message.
	var ifi linux.InterfaceInfoMessage
	attrs, ok := msg.GetData(&ifi)
	if !ok {
		return syserr.ErrInvalidArgument
	}

	// Parse attributes.
	var byName []byte
	for !attrs.Empty() {
		ahdr, value, rest, ok := attrs.ParseFirst()
		if !ok {
			return syserr.ErrInvalidArgument
		}
		attrs = rest

		switch ahdr.Type {
		case linux.IFLA_IFNAME:
			if len(value) < 1 {
				return syserr.ErrInvalidArgument
			}
			byName = value[:len(value)-1]

			// TODO(gvisor.dev/issue/578): Support IFLA_EXT_MASK.
		}
	}

	found := false
	for idx, i := range stack.Interfaces() {
		switch {
		case ifi.Index > 0:
			if idx != ifi.Index {
				continue
			}
		case byName != nil:
			if string(byName) != i.Name {
				continue
			}
		default:
			// Criteria not specified.
			return syserr.ErrInvalidArgument
		}
		addNewLinkMessage(ms, idx, i)
		found = true
		break
	}
	if !found {
		return syserr.ErrNoDevice
	}
	return nil
}

// newLink handles RTM_NEWLINK requests.
func (p *Protocol) newLink(ctx context.Context, s *netlink.Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error {
	stack := s.Stack()
	if stack == nil {
		// No network stack.
		return syserr.ErrProtocolNotSupported
	}

	return stack.SetInterface(ctx, msg)
}

// setLink handles RTM_SETLINK requests.
func (p *Protocol) setLink(ctx context.Context, s *netlink.Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error {
	stack := s.Stack()
	if stack == nil {
		// No network stack.
		return syserr.ErrProtocolNotSupported
	}

	if msg.Header().Flags&linux.NLM_F_CREATE == linux.NLM_F_CREATE {
		return syserr.ErrInvalidArgument
	}
	return stack.SetInterface(ctx, msg)
}

// delLink handles RTM_DELLINK requests.
func (p *Protocol) delLink(ctx context.Context, s *netlink.Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error {
	stack := s.Stack()
	if stack == nil {
		// No network stack.
		return syserr.ErrProtocolNotSupported
	}

	var ifinfomsg linux.InterfaceInfoMessage
	attrs, ok := msg.GetData(&ifinfomsg)
	if !ok {
		return syserr.ErrInvalidArgument
	}
	if ifinfomsg.Index == 0 {
		// The index is unspecified, search by the interface name.
		ahdr, value, _, ok := attrs.ParseFirst()
		if !ok {
			return syserr.ErrInvalidArgument
		}
		switch ahdr.Type {
		case linux.IFLA_IFNAME:
			if len(value) < 1 {
				return syserr.ErrInvalidArgument
			}
			ifname := string(value[:len(value)-1])
			for idx, ifa := range stack.Interfaces() {
				if ifname == ifa.Name {
					ifinfomsg.Index = idx
					break
				}
			}
		default:
			return syserr.ErrInvalidArgument
		}
		if ifinfomsg.Index == 0 {
			return syserr.ErrNoDevice
		}
	}
	return syserr.FromError(stack.RemoveInterface(ifinfomsg.Index))
}

// addNewLinkMessage appends an RTM_NEWLINK message for the given interface
// into the message set.
func addNewLinkMessage(ms *nlmsg.MessageSet, idx int32, i inet.Interface) {
	m := ms.AddMessage(linux.NetlinkMessageHeader{
		Type: linux.RTM_NEWLINK,
	})

	m.Put(&linux.InterfaceInfoMessage{
		Family: linux.AF_UNSPEC,
		Type:   i.DeviceType,
		Index:  idx,
		Flags:  i.Flags,
	})

	m.PutAttrString(linux.IFLA_IFNAME, i.Name)
	m.PutAttr(linux.IFLA_MTU, primitive.AllocateUint32(i.MTU))

	mac := make([]byte, 6)
	brd := mac
	if len(i.Addr) > 0 {
		mac = i.Addr
		brd = bytes.Repeat([]byte{0xff}, len(i.Addr))
	}
	m.PutAttr(linux.IFLA_ADDRESS, primitive.AsByteSlice(mac))
	m.PutAttr(linux.IFLA_BROADCAST, primitive.AsByteSlice(brd))

	// TODO(gvisor.dev/issue/578): There are many more attributes.
}

// dumpAddrs handles RTM_GETADDR dump requests.
func (p *Protocol) dumpAddrs(ctx context.Context, s *netlink.Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error { // RTM_GETADDR dump requests need not contain anything more than the // netlink header and 1 byte protocol family common to all // NETLINK_ROUTE requests. // // TODO(b/68878065): Filter output by passed protocol family. // The RTM_GETADDR dump response is a set of RTM_NEWADDR messages each // containing an InterfaceAddrMessage followed by a set of netlink // attributes. // We always send back an NLMSG_DONE. ms.Multi = true stack := s.Stack() if stack == nil { // No network devices. return nil } for id, as := range stack.InterfaceAddrs() { for _, a := range as { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.RTM_NEWADDR, }) m.Put(&linux.InterfaceAddrMessage{ Family: a.Family, PrefixLen: a.PrefixLen, Index: uint32(id), }) addr := primitive.ByteSlice([]byte(a.Addr)) m.PutAttr(linux.IFA_LOCAL, &addr) m.PutAttr(linux.IFA_ADDRESS, &addr) // TODO(gvisor.dev/issue/578): There are many more attributes. } } return nil } // commonPrefixLen reports the length of the longest IP address prefix. // This is a simplified version from Golang's src/net/addrselect.go. func commonPrefixLen(a, b []byte) (cpl int) { for len(a) > 0 { if a[0] == b[0] { cpl += 8 a = a[1:] b = b[1:] continue } bits := 8 ab, bb := a[0], b[0] for { ab >>= 1 bb >>= 1 bits-- if ab == bb { cpl += bits return } } } return } // fillRoute returns the Route using LPM algorithm. Refer to Linux's // net/ipv4/route.c:rt_fill_info(). func fillRoute(routes []inet.Route, addr []byte) (inet.Route, *syserr.Error) { family := uint8(linux.AF_INET) if len(addr) != 4 { family = linux.AF_INET6 } idx := -1 // Index of the Route rule to be returned. idxDef := -1 // Index of the default route rule. prefix := 0 // Current longest prefix. for i, route := range routes { if route.Family != family { continue } if len(route.GatewayAddr) > 0 && route.DstLen == 0 { idxDef = i continue } cpl := commonPrefixLen(addr, route.DstAddr) if cpl < int(route.DstLen) { continue } cpl = int(route.DstLen) if cpl > prefix { idx = i prefix = cpl } } if idx == -1 { idx = idxDef } if idx == -1 { return inet.Route{}, syserr.ErrHostUnreachable } route := routes[idx] if family == linux.AF_INET { route.DstLen = 32 } else { route.DstLen = 128 } route.DstAddr = addr route.Flags |= linux.RTM_F_CLONED // This route is cloned. return route, nil } // parseForDestination parses a message as format of RouteMessage-RtAttr-dst. func parseForDestination(msg *nlmsg.Message) ([]byte, *syserr.Error) { var rtMsg linux.RouteMessage attrs, ok := msg.GetData(&rtMsg) if !ok { return nil, syserr.ErrInvalidArgument } // iproute2 added the RTM_F_LOOKUP_TABLE flag in version v4.4.0. See // commit bc234301af12. Note we don't check this flag for backward // compatibility. if rtMsg.Flags != 0 && rtMsg.Flags != linux.RTM_F_LOOKUP_TABLE { return nil, syserr.ErrNotSupported } // Expect first attribute is RTA_DST. if hdr, value, _, ok := attrs.ParseFirst(); ok && hdr.Type == linux.RTA_DST { return value, nil } return nil, syserr.ErrInvalidArgument } // newRoute handles RTM_NEWROUTE requests. func (p *Protocol) newRoute(ctx context.Context, s *netlink.Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error { stack := s.Stack() if stack == nil { // No network routes. 
return syserr.ErrProtocolNotSupported } if msg.Header().Flags&linux.NLM_F_REQUEST != linux.NLM_F_REQUEST { return syserr.ErrProtocolNotSupported } return stack.NewRoute(ctx, msg) } // dumpRoutes handles RTM_GETROUTE requests. func (p *Protocol) dumpRoutes(ctx context.Context, s *netlink.Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error { // RTM_GETROUTE dump requests need not contain anything more than the // netlink header and 1 byte protocol family common to all // NETLINK_ROUTE requests. stack := s.Stack() if stack == nil { // No network routes. return nil } hdr := msg.Header() routeTables := stack.RouteTable() if hdr.Flags == linux.NLM_F_REQUEST { dst, err := parseForDestination(msg) if err != nil { return err } route, err := fillRoute(routeTables, dst) if err != nil { // TODO(gvisor.dev/issue/1237): return NLMSG_ERROR with ENETUNREACH. return syserr.ErrNotSupported } routeTables = append([]inet.Route{}, route) } else if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP { // We always send back an NLMSG_DONE. ms.Multi = true } else { // TODO(b/68878065): Only above cases are supported. return syserr.ErrNotSupported } for _, rt := range routeTables { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.RTM_NEWROUTE, }) m.Put(&linux.RouteMessage{ Family: rt.Family, DstLen: rt.DstLen, SrcLen: rt.SrcLen, TOS: rt.TOS, // Always return the main table since we don't have multiple // routing tables. Table: linux.RT_TABLE_MAIN, Protocol: rt.Protocol, Scope: rt.Scope, Type: rt.Type, Flags: rt.Flags, }) m.PutAttr(254, primitive.AsByteSlice([]byte{123})) if rt.DstLen > 0 { m.PutAttr(linux.RTA_DST, primitive.AsByteSlice(rt.DstAddr)) } if rt.SrcLen > 0 { m.PutAttr(linux.RTA_SRC, primitive.AsByteSlice(rt.SrcAddr)) } if rt.OutputInterface != 0 { m.PutAttr(linux.RTA_OIF, primitive.AllocateInt32(rt.OutputInterface)) } if len(rt.GatewayAddr) > 0 { m.PutAttr(linux.RTA_GATEWAY, primitive.AsByteSlice(rt.GatewayAddr)) } // TODO(gvisor.dev/issue/578): There are many more attributes. } return nil } // newAddr handles RTM_NEWADDR requests. func (p *Protocol) newAddr(ctx context.Context, s *netlink.Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error { stack := s.Stack() if stack == nil { // No network stack. return syserr.ErrProtocolNotSupported } var ifa linux.InterfaceAddrMessage attrs, ok := msg.GetData(&ifa) if !ok { return syserr.ErrInvalidArgument } for !attrs.Empty() { ahdr, value, rest, ok := attrs.ParseFirst() if !ok { return syserr.ErrInvalidArgument } attrs = rest // NOTE: A netlink message will contain multiple header attributes. // Both the IFA_ADDRESS and IFA_LOCAL attributes are typically sent // with IFA_ADDRESS being a prefix address and IFA_LOCAL being the // local interface address. We add the local interface address here // and ignore the IFA_ADDRESS. switch ahdr.Type { case linux.IFA_LOCAL: err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{ Family: ifa.Family, PrefixLen: ifa.PrefixLen, Flags: ifa.Flags, Addr: value, }) if linuxerr.Equals(linuxerr.EEXIST, err) { flags := msg.Header().Flags if flags&linux.NLM_F_EXCL != 0 { return syserr.ErrExists } } else if err != nil { return syserr.ErrInvalidArgument } case linux.IFA_ADDRESS: case linux.IFA_BROADCAST: // TODO(b/340929168): support IFA_BROADCAST. The standard // broadcast address (the last IP address of the subnet) is // used by default. default: ctx.Warningf("Unknown attribute: %v", ahdr.Type) return syserr.ErrNotSupported } } return nil } // delAddr handles RTM_DELADDR requests. 
func (p *Protocol) delAddr(ctx context.Context, s *netlink.Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error { stack := s.Stack() if stack == nil { // No network stack. return syserr.ErrProtocolNotSupported } var ifa linux.InterfaceAddrMessage attrs, ok := msg.GetData(&ifa) if !ok { return syserr.ErrInvalidArgument } for !attrs.Empty() { ahdr, value, rest, ok := attrs.ParseFirst() if !ok { return syserr.ErrInvalidArgument } attrs = rest // NOTE: A netlink message will contain multiple header attributes. // Both the IFA_ADDRESS and IFA_LOCAL attributes are typically sent // with IFA_ADDRESS being a prefix address and IFA_LOCAL being the // local interface address. We use the local interface address to // remove the address and ignore the IFA_ADDRESS. switch ahdr.Type { case linux.IFA_LOCAL: err := stack.RemoveInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{ Family: ifa.Family, PrefixLen: ifa.PrefixLen, Flags: ifa.Flags, Addr: value, }) if err != nil { return syserr.ErrBadLocalAddress } case linux.IFA_ADDRESS: default: return syserr.ErrNotSupported } } return nil } // ProcessMessage implements netlink.Protocol.ProcessMessage. func (p *Protocol) ProcessMessage(ctx context.Context, s *netlink.Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error { hdr := msg.Header() // All messages start with a 1 byte protocol family. var family primitive.Uint8 if _, ok := msg.GetData(&family); !ok { // Linux ignores messages missing the protocol family. See // net/core/rtnetlink.c:rtnetlink_rcv_msg. return nil } // Non-GET message types require CAP_NET_ADMIN. if typeKind(hdr.Type) != kindGet { creds := auth.CredentialsFromContext(ctx) if !creds.HasCapability(linux.CAP_NET_ADMIN) { return syserr.ErrPermissionDenied } } if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP { // TODO(b/68878065): Only the dump variant of the types below are // supported. switch hdr.Type { case linux.RTM_GETLINK: return p.dumpLinks(ctx, s, msg, ms) case linux.RTM_GETADDR: return p.dumpAddrs(ctx, s, msg, ms) case linux.RTM_GETROUTE: return p.dumpRoutes(ctx, s, msg, ms) default: return syserr.ErrNotSupported } } else if hdr.Flags&linux.NLM_F_REQUEST == linux.NLM_F_REQUEST { switch hdr.Type { case linux.RTM_NEWLINK: return p.newLink(ctx, s, msg, ms) case linux.RTM_GETLINK: return p.getLink(ctx, s, msg, ms) case linux.RTM_DELLINK: return p.delLink(ctx, s, msg, ms) case linux.RTM_SETLINK: // RTM_NEWLINK is backward compatible to RTM_SETLINK. return p.setLink(ctx, s, msg, ms) case linux.RTM_NEWROUTE: return p.newRoute(ctx, s, msg, ms) case linux.RTM_GETROUTE: return p.dumpRoutes(ctx, s, msg, ms) case linux.RTM_NEWADDR: return p.newAddr(ctx, s, msg, ms) case linux.RTM_DELADDR: return p.delAddr(ctx, s, msg, ms) default: return syserr.ErrNotSupported } } return syserr.ErrNotSupported } // init registers the NETLINK_ROUTE provider. func init() { netlink.RegisterProvider(linux.NETLINK_ROUTE, NewProtocol) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/route/route_state_autogen.go000066400000000000000000000011521465435605700314500ustar00rootroot00000000000000// automatically generated by stateify. 
package route import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *Protocol) StateTypeName() string { return "pkg/sentry/socket/netlink/route.Protocol" } func (p *Protocol) StateFields() []string { return []string{} } func (p *Protocol) beforeSave() {} // +checklocksignore func (p *Protocol) StateSave(stateSinkObject state.Sink) { p.beforeSave() } func (p *Protocol) afterLoad(context.Context) {} // +checklocksignore func (p *Protocol) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func init() { state.Register((*Protocol)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/socket.go000066400000000000000000000553031465435605700255310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package netlink provides core functionality for netlink sockets. package netlink import ( "io" "math" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/nlmsg" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const sizeOfInt32 int = 4 const ( // minBufferSize is the smallest size of a send buffer. minSendBufferSize = 4 << 10 // 4096 bytes. // defaultSendBufferSize is the default size for the send buffer. defaultSendBufferSize = 16 * 1024 // maxBufferSize is the largest size a send buffer can grow to. maxSendBufferSize = 4 << 20 // 4MB ) var errNoFilter = syserr.New("no filter attached", errno.ENOENT) // Socket is the base socket type for netlink sockets. // // This implementation only supports userspace sending and receiving messages // to/from the kernel. // // Socket implements socket.Socket and transport.Credentialer. // // +stateify savable type Socket struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.LockFD socket.SendReceiveTimeout // ports provides netlink port allocation. ports *port.Manager // protocol is the netlink protocol implementation. protocol Protocol // skType is the socket type. This is either SOCK_DGRAM or SOCK_RAW for // netlink sockets. skType linux.SockType // ep is a datagram unix endpoint used to buffer messages sent from the // kernel to userspace. RecvMsg reads messages from this endpoint. 
ep transport.Endpoint // connection is the kernel's connection to ep, used to write messages // sent to userspace. connection transport.ConnectedEndpoint // mu protects the fields below. mu sync.Mutex `state:"nosave"` // bound indicates that portid is valid. bound bool // portID is the port ID allocated for this socket. portID int32 // sendBufferSize is the send buffer "size". We don't actually have a // fixed buffer but only consume this many bytes. sendBufferSize uint32 // filter indicates that this socket has a BPF filter "installed". // // TODO(gvisor.dev/issue/1119): We don't actually support filtering, // this is just bookkeeping for tracking add/remove. filter bool // netns is the network namespace associated with the socket. netns *inet.Namespace } var _ socket.Socket = (*Socket)(nil) var _ transport.Credentialer = (*Socket)(nil) // New creates a new Socket. func New(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socket, *syserr.Error) { // Datagram endpoint used to buffer kernel -> user messages. ep := transport.NewConnectionless(t) // Bind the endpoint for good measure so we can connect to it. The // bound address will never be exposed. if err := ep.Bind(transport.Address{Addr: "dummy"}); err != nil { ep.Close(t) return nil, err } // Create a connection from which the kernel can write messages. connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) if err != nil { ep.Close(t) return nil, err } fd := &Socket{ ports: t.Kernel().NetlinkPorts(), protocol: protocol, skType: skType, ep: ep, connection: connection, sendBufferSize: defaultSendBufferSize, netns: t.GetNetworkNamespace(), } fd.LockFD.Init(&vfs.FileLocks{}) return fd, nil } // Stack returns the network stack associated with the socket. func (s *Socket) Stack() inet.Stack { return s.netns.Stack() } // Release implements vfs.FileDescriptionImpl.Release. func (s *Socket) Release(ctx context.Context) { t := kernel.TaskFromContext(ctx) t.Kernel().DeleteSocket(&s.vfsfd) s.connection.Release(ctx) s.ep.Close(ctx) if s.bound { s.ports.Release(s.protocol.Protocol(), s.portID) } s.netns.DecRef(ctx) } // Epollable implements FileDescriptionImpl.Epollable. func (s *Socket) Epollable() bool { return true } // Ioctl implements vfs.FileDescriptionImpl. func (*Socket) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { // TODO(b/68878065): no ioctls supported. return 0, linuxerr.ENOTTY } // PRead implements vfs.FileDescriptionImpl. func (s *Socket) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, linuxerr.ESPIPE } // Read implements vfs.FileDescriptionImpl. func (s *Socket) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { return 0, nil } r := unix.EndpointReader{ Endpoint: s.ep, } n, err := dst.CopyOutFrom(ctx, &r) if r.Notify != nil { r.Notify() } return n, err } // PWrite implements vfs.FileDescriptionImpl. func (s *Socket) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return 0, linuxerr.ESPIPE } // Write implements vfs.FileDescriptionImpl. func (s *Socket) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. 
// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, linuxerr.EOPNOTSUPP } n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) return int64(n), err.ToError() } // Readiness implements waiter.Waitable.Readiness. func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask { // ep holds messages to be read and thus handles EventIn readiness. ready := s.ep.Readiness(mask) if mask&waiter.WritableEvents != 0 { // sendMsg handles messages synchronously and is thus always // ready for writing. ready |= waiter.WritableEvents } return ready } // EventRegister implements waiter.Waitable.EventRegister. func (s *Socket) EventRegister(e *waiter.Entry) error { return s.ep.EventRegister(e) // Writable readiness never changes, so no registration is needed. } // EventUnregister implements waiter.Waitable.EventUnregister. func (s *Socket) EventUnregister(e *waiter.Entry) { s.ep.EventUnregister(e) } // Passcred implements transport.Credentialer.Passcred. func (s *Socket) Passcred() bool { return s.ep.SocketOptions().GetPassCred() } // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. func (s *Socket) ConnectedPasscred() bool { // This socket is connected to the kernel, which doesn't need creds. // // This is arbitrary, as ConnectedPasscred on this type has no callers. return false } // ExtractSockAddr extracts the SockAddrNetlink from b. func ExtractSockAddr(b []byte) (*linux.SockAddrNetlink, *syserr.Error) { if len(b) < linux.SockAddrNetlinkSize { return nil, syserr.ErrBadAddress } var sa linux.SockAddrNetlink sa.UnmarshalUnsafe(b) if sa.Family != linux.AF_NETLINK { return nil, syserr.ErrInvalidArgument } return &sa, nil } // bindPort binds this socket to a port, preferring 'port' if it is available. // // port of 0 defaults to the ThreadGroup ID. // // Preconditions: mu is held. func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error { if s.bound { // Re-binding is only allowed if the port doesn't change. if port != s.portID { return syserr.ErrInvalidArgument } return nil } if port == 0 { port = int32(t.ThreadGroup().ID()) } port, ok := s.ports.Allocate(s.protocol.Protocol(), port) if !ok { return syserr.ErrBusy } s.portID = port s.bound = true return nil } // Bind implements socket.Socket.Bind. func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { a, err := ExtractSockAddr(sockaddr) if err != nil { return err } // No support for multicast groups yet. if a.Groups != 0 { return syserr.ErrPermissionDenied } s.mu.Lock() defer s.mu.Unlock() return s.bindPort(t, int32(a.PortID)) } // Connect implements socket.Socket.Connect. func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { a, err := ExtractSockAddr(sockaddr) if err != nil { return err } // No support for multicast groups yet. if a.Groups != 0 { return syserr.ErrPermissionDenied } s.mu.Lock() defer s.mu.Unlock() if a.PortID == 0 { // Netlink sockets default to connected to the kernel, but // connecting anyways automatically binds if not already bound. if !s.bound { // Pass port 0 to get an auto-selected port ID. return s.bindPort(t, 0) } return nil } // We don't support non-kernel destination ports. Linux returns EPERM // if applications attempt to do this without NL_CFG_F_NONROOT_SEND, so // we emulate that. return syserr.ErrPermissionDenied } // Accept implements socket.Socket.Accept. 
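// Example (illustrative sketch, not part of the original gVisor source;
// assumes the golang.org/x/sys/unix import): the autobind path in bindPort
// above, as seen from userspace. Binding with Pid 0 asks the kernel to choose
// the port ID, which this implementation defaults to the caller's
// thread-group ID.
func exampleAutobind() (uint32, error) {
	fd, err := unix.Socket(unix.AF_NETLINK, unix.SOCK_RAW, unix.NETLINK_ROUTE)
	if err != nil {
		return 0, err
	}
	defer unix.Close(fd)
	// Groups must be zero: Bind above rejects multicast group membership
	// with EPERM.
	if err := unix.Bind(fd, &unix.SockaddrNetlink{Family: unix.AF_NETLINK}); err != nil {
		return 0, err
	}
	// Getsockname reports the port ID that bindPort allocated.
	sa, err := unix.Getsockname(fd)
	if err != nil {
		return 0, err
	}
	return sa.(*unix.SockaddrNetlink).Pid, nil
}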
func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { // Netlink sockets never support accept. return 0, nil, 0, syserr.ErrNotSupported } // Listen implements socket.Socket.Listen. func (s *Socket) Listen(t *kernel.Task, backlog int) *syserr.Error { // Netlink sockets never support listen. return syserr.ErrNotSupported } // Shutdown implements socket.Socket.Shutdown. func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error { // Netlink sockets never support shutdown. return syserr.ErrNotSupported } // GetSockOpt implements socket.Socket.GetSockOpt. func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { switch level { case linux.SOL_SOCKET: switch name { case linux.SO_SNDBUF: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } s.mu.Lock() defer s.mu.Unlock() return primitive.AllocateInt32(int32(s.sendBufferSize)), nil case linux.SO_RCVBUF: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } // We don't have limit on receiving size. return primitive.AllocateInt32(math.MaxInt32), nil case linux.SO_PASSCRED: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } var passcred primitive.Int32 if s.Passcred() { passcred = 1 } return &passcred, nil case linux.SO_SNDTIMEO: if outLen < linux.SizeOfTimeval { return nil, syserr.ErrInvalidArgument } sendTimeout := linux.NsecToTimeval(s.SendTimeout()) return &sendTimeout, nil case linux.SO_RCVTIMEO: if outLen < linux.SizeOfTimeval { return nil, syserr.ErrInvalidArgument } recvTimeout := linux.NsecToTimeval(s.RecvTimeout()) return &recvTimeout, nil } case linux.SOL_NETLINK: switch name { case linux.NETLINK_BROADCAST_ERROR, linux.NETLINK_CAP_ACK, linux.NETLINK_DUMP_STRICT_CHK, linux.NETLINK_EXT_ACK, linux.NETLINK_LIST_MEMBERSHIPS, linux.NETLINK_NO_ENOBUFS, linux.NETLINK_PKTINFO: // Not supported. } } // TODO(b/68878065): other sockopts are not supported. return nil, syserr.ErrProtocolNotAvailable } // SetSockOpt implements socket.Socket.SetSockOpt. func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { switch level { case linux.SOL_SOCKET: switch name { case linux.SO_SNDBUF: if len(opt) < sizeOfInt32 { return syserr.ErrInvalidArgument } size := hostarch.ByteOrder.Uint32(opt) if size < minSendBufferSize { size = minSendBufferSize } else if size > maxSendBufferSize { size = maxSendBufferSize } s.mu.Lock() s.sendBufferSize = size s.mu.Unlock() return nil case linux.SO_RCVBUF: if len(opt) < sizeOfInt32 { return syserr.ErrInvalidArgument } // We don't have limit on receiving size. So just accept anything as // valid for compatibility. return nil case linux.SO_PASSCRED: if len(opt) < sizeOfInt32 { return syserr.ErrInvalidArgument } passcred := hostarch.ByteOrder.Uint32(opt) s.ep.SocketOptions().SetPassCred(passcred != 0) return nil case linux.SO_ATTACH_FILTER: // TODO(gvisor.dev/issue/1119): We don't actually // support filtering. If this socket can't ever send // messages, then there is nothing to filter and we can // advertise support. Otherwise, be conservative and // return an error. if s.protocol.CanSend() { return syserr.ErrProtocolNotAvailable } s.mu.Lock() s.filter = true s.mu.Unlock() return nil case linux.SO_DETACH_FILTER: // TODO(gvisor.dev/issue/1119): See above. 
if s.protocol.CanSend() { return syserr.ErrProtocolNotAvailable } s.mu.Lock() filter := s.filter s.filter = false s.mu.Unlock() if !filter { return errNoFilter } return nil case linux.SO_SNDTIMEO: if len(opt) < linux.SizeOfTimeval { return syserr.ErrInvalidArgument } var v linux.Timeval v.UnmarshalBytes(opt) if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { return syserr.ErrDomain } s.SetSendTimeout(v.ToNsecCapped()) return nil case linux.SO_RCVTIMEO: if len(opt) < linux.SizeOfTimeval { return syserr.ErrInvalidArgument } var v linux.Timeval v.UnmarshalBytes(opt) if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { return syserr.ErrDomain } s.SetRecvTimeout(v.ToNsecCapped()) return nil } case linux.SOL_NETLINK: switch name { case linux.NETLINK_ADD_MEMBERSHIP, linux.NETLINK_BROADCAST_ERROR, linux.NETLINK_CAP_ACK, linux.NETLINK_DROP_MEMBERSHIP, linux.NETLINK_DUMP_STRICT_CHK, linux.NETLINK_EXT_ACK, linux.NETLINK_LISTEN_ALL_NSID, linux.NETLINK_NO_ENOBUFS, linux.NETLINK_PKTINFO: // Not supported. } } // TODO(b/68878065): other sockopts are not supported. return syserr.ErrProtocolNotAvailable } // GetSockName implements socket.Socket.GetSockName. func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { s.mu.Lock() defer s.mu.Unlock() sa := &linux.SockAddrNetlink{ Family: linux.AF_NETLINK, PortID: uint32(s.portID), } return sa, uint32(sa.SizeBytes()), nil } // GetPeerName implements socket.Socket.GetPeerName. func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { sa := &linux.SockAddrNetlink{ Family: linux.AF_NETLINK, // TODO(b/68878065): Support non-kernel peers. For now the peer // must be the kernel. PortID: 0, } return sa, uint32(sa.SizeBytes()), nil } // RecvMsg implements socket.Socket.RecvMsg. func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { from := &linux.SockAddrNetlink{ Family: linux.AF_NETLINK, PortID: 0, } fromLen := uint32(from.SizeBytes()) trunc := flags&linux.MSG_TRUNC != 0 r := unix.EndpointReader{ Ctx: t, Endpoint: s.ep, Peek: flags&linux.MSG_PEEK != 0, } doRead := func() (int64, error) { return dst.CopyOutFrom(t, &r) } // If MSG_TRUNC is set with a zero byte destination then we still need // to read the message and discard it, or in the case where MSG_PEEK is // set, leave it be. In both cases the full message length must be // returned. if trunc && dst.Addrs.NumBytes() == 0 { doRead = func() (int64, error) { err := r.Truncate() // Always return zero for bytes read since the destination size is // zero. return 0, err } } if n, err := doRead(); err != linuxerr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { var mflags int if n < int64(r.MsgSize) { mflags |= linux.MSG_TRUNC } if trunc { n = int64(r.MsgSize) } return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err) } // We'll have to block. Register for notification and keep trying to // receive all the data. 
e, ch := waiter.NewChannelEntry(waiter.ReadableEvents) if err := s.EventRegister(&e); err != nil { return 0, 0, from, fromLen, socket.ControlMessages{}, syserr.FromError(err) } defer s.EventUnregister(&e) for { if n, err := doRead(); err != linuxerr.ErrWouldBlock { var mflags int if n < int64(r.MsgSize) { mflags |= linux.MSG_TRUNC } if trunc { n = int64(r.MsgSize) } return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err) } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } // kernelSCM implements control.SCMCredentials with credentials that represent // the kernel itself rather than a Task. // // +stateify savable type kernelSCM struct{} // Equals implements transport.CredentialsControlMessage.Equals. func (kernelSCM) Equals(oc transport.CredentialsControlMessage) bool { _, ok := oc.(kernelSCM) return ok } // Credentials implements control.SCMCredentials.Credentials. func (kernelSCM) Credentials(*kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) { return 0, auth.RootUID, auth.RootGID } // kernelCreds is the concrete version of kernelSCM used in all creds. var kernelCreds = &kernelSCM{} // sendResponse sends the response messages in ms back to userspace. func (s *Socket) sendResponse(ctx context.Context, ms *nlmsg.MessageSet) *syserr.Error { // Linux combines multiple netlink messages into a single datagram. bufs := make([][]byte, 0, len(ms.Messages)) for _, m := range ms.Messages { bufs = append(bufs, m.Finalize()) } // All messages are from the kernel. cms := transport.ControlMessages{ Credentials: kernelCreds, } if len(bufs) > 0 { // RecvMsg never receives the address, so we don't need to send // one. _, notify, err := s.connection.Send(ctx, bufs, cms, transport.Address{}) // If the buffer is full, we simply drop messages, just like // Linux. if err != nil && err != syserr.ErrWouldBlock { return err } if notify { s.connection.SendNotify() } } // N.B. multi-part messages should still send NLMSG_DONE even if // nlmsg.MessageSet contains no messages. // // N.B. NLMSG_DONE is always sent in a different datagram. See // net/netlink/af_netlink.c:netlink_dump. if ms.Multi { m := nlmsg.NewMessage(linux.NetlinkMessageHeader{ Type: linux.NLMSG_DONE, Flags: linux.NLM_F_MULTI, Seq: ms.Seq, PortID: uint32(ms.PortID), }) // Add the dump_done_errno payload. m.Put(primitive.AllocateInt64(0)) _, notify, err := s.connection.Send(ctx, [][]byte{m.Finalize()}, cms, transport.Address{}) if err != nil && err != syserr.ErrWouldBlock { return err } if notify { s.connection.SendNotify() } } return nil } func dumpErrorMessage(hdr linux.NetlinkMessageHeader, ms *nlmsg.MessageSet, err *syserr.Error) { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.NLMSG_ERROR, }) m.Put(&linux.NetlinkErrorMessage{ Error: int32(-err.ToLinux()), Header: hdr, }) } func dumpAckMessage(hdr linux.NetlinkMessageHeader, ms *nlmsg.MessageSet) { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.NLMSG_ERROR, }) m.Put(&linux.NetlinkErrorMessage{ Error: 0, Header: hdr, }) } // processMessages handles each message in buf, passing it to the protocol // handler for final handling. func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error { for len(buf) > 0 { msg, rest, ok := nlmsg.ParseMessage(buf) if !ok { // Linux ignores messages that are too short. 
See // net/netlink/af_netlink.c:netlink_rcv_skb. break } buf = rest hdr := msg.Header() // Ignore control messages. if hdr.Type < linux.NLMSG_MIN_TYPE { continue } ms := nlmsg.NewMessageSet(s.portID, hdr.Seq) if err := s.protocol.ProcessMessage(ctx, s, msg, ms); err != nil { dumpErrorMessage(hdr, ms, err) } else if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK { dumpAckMessage(hdr, ms) } if err := s.sendResponse(ctx, ms); err != nil { return err } } return nil } // sendMsg is the core of message send, used for SendMsg and Write. func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { dstPort := int32(0) if len(to) != 0 { a, err := ExtractSockAddr(to) if err != nil { return 0, err } // No support for multicast groups yet. if a.Groups != 0 { return 0, syserr.ErrPermissionDenied } dstPort = int32(a.PortID) } if dstPort != 0 { // Non-kernel destinations not supported yet. Treat as if // NL_CFG_F_NONROOT_SEND is not set. return 0, syserr.ErrPermissionDenied } s.mu.Lock() defer s.mu.Unlock() // For simplicity, and consistency with Linux, we copy in the entire // message up front. if src.NumBytes() > int64(s.sendBufferSize) { return 0, syserr.ErrMessageTooLong } buf := make([]byte, src.NumBytes()) n, err := src.CopyIn(ctx, buf) // io.EOF can be only returned if src is a file, this means that // sendMsg is called from splice and the error has to be ignored in // this case. if err == io.EOF { err = nil } if err != nil { // Don't partially consume messages. return 0, syserr.FromError(err) } if err := s.processMessages(ctx, buf); err != nil { return 0, err } return n, nil } // SendMsg implements socket.Socket.SendMsg. func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { return s.sendMsg(t, src, to, flags, controlMessages) } // State implements socket.Socket.State. func (s *Socket) State() uint32 { return s.ep.State() } // Type implements socket.Socket.Type. func (s *Socket) Type() (family int, skType linux.SockType, protocol int) { return linux.AF_NETLINK, s.skType, s.protocol.Protocol() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/uevent/000077500000000000000000000000001465435605700252125ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/uevent/protocol.go000066400000000000000000000037121465435605700274050ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package uevent provides a NETLINK_KOBJECT_UEVENT socket protocol. // // NETLINK_KOBJECT_UEVENT sockets send udev-style device events. gVisor does // not support any device events, so these sockets never send any messages. 
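// Example (illustrative sketch, not part of the original gVisor source;
// assumes the golang.org/x/sys/unix import): a client can still open a
// NETLINK_KOBJECT_UEVENT socket under gVisor, but because no device events
// are ever generated, reads never return data (EAGAIN on a non-blocking
// socket). Multicast group membership is rejected here, so Groups is left
// zero.
func exampleUeventSocket() (int, error) {
	fd, err := unix.Socket(unix.AF_NETLINK, unix.SOCK_RAW|unix.SOCK_NONBLOCK, unix.NETLINK_KOBJECT_UEVENT)
	if err != nil {
		return -1, err
	}
	if err := unix.Bind(fd, &unix.SockaddrNetlink{Family: unix.AF_NETLINK}); err != nil {
		unix.Close(fd)
		return -1, err
	}
	return fd, nil
}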
package uevent import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/nlmsg" "gvisor.dev/gvisor/pkg/syserr" ) // Protocol implements netlink.Protocol. // // +stateify savable type Protocol struct{} var _ netlink.Protocol = (*Protocol)(nil) // NewProtocol creates a NETLINK_KOBJECT_UEVENT netlink.Protocol. func NewProtocol(t *kernel.Task) (netlink.Protocol, *syserr.Error) { return &Protocol{}, nil } // Protocol implements netlink.Protocol.Protocol. func (p *Protocol) Protocol() int { return linux.NETLINK_KOBJECT_UEVENT } // CanSend implements netlink.Protocol.CanSend. func (p *Protocol) CanSend() bool { return false } // ProcessMessage implements netlink.Protocol.ProcessMessage. func (p *Protocol) ProcessMessage(ctx context.Context, s *netlink.Socket, msg *nlmsg.Message, ms *nlmsg.MessageSet) *syserr.Error { // Silently ignore all messages. return nil } // init registers the NETLINK_KOBJECT_UEVENT provider. func init() { netlink.RegisterProvider(linux.NETLINK_KOBJECT_UEVENT, NewProtocol) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netlink/uevent/uevent_state_autogen.go000066400000000000000000000011541465435605700317720ustar00rootroot00000000000000// automatically generated by stateify. package uevent import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *Protocol) StateTypeName() string { return "pkg/sentry/socket/netlink/uevent.Protocol" } func (p *Protocol) StateFields() []string { return []string{} } func (p *Protocol) beforeSave() {} // +checklocksignore func (p *Protocol) StateSave(stateSinkObject state.Sink) { p.beforeSave() } func (p *Protocol) afterLoad(context.Context) {} // +checklocksignore func (p *Protocol) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func init() { state.Register((*Protocol)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netstack/000077500000000000000000000000001465435605700240545ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netstack/events_go_proto/000077500000000000000000000000001465435605700272705ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netstack/events_go_proto/events.pb.go000066400000000000000000000121761465435605700315320ustar00rootroot00000000000000// Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/sentry/socket/netstack/events.proto package events_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. 
_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type SentryTcpListenEvent struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Port *int32 `protobuf:"varint,1,opt,name=port,proto3,oneof" json:"port,omitempty"` } func (x *SentryTcpListenEvent) Reset() { *x = SentryTcpListenEvent{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_socket_netstack_events_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *SentryTcpListenEvent) String() string { return protoimpl.X.MessageStringOf(x) } func (*SentryTcpListenEvent) ProtoMessage() {} func (x *SentryTcpListenEvent) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_socket_netstack_events_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use SentryTcpListenEvent.ProtoReflect.Descriptor instead. func (*SentryTcpListenEvent) Descriptor() ([]byte, []int) { return file_pkg_sentry_socket_netstack_events_proto_rawDescGZIP(), []int{0} } func (x *SentryTcpListenEvent) GetPort() int32 { if x != nil && x.Port != nil { return *x.Port } return 0 } var File_pkg_sentry_socket_netstack_events_proto protoreflect.FileDescriptor var file_pkg_sentry_socket_netstack_events_proto_rawDesc = []byte{ 0x0a, 0x27, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x73, 0x6f, 0x63, 0x6b, 0x65, 0x74, 0x2f, 0x6e, 0x65, 0x74, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x06, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x22, 0x38, 0x0a, 0x14, 0x53, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x54, 0x63, 0x70, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x12, 0x17, 0x0a, 0x04, 0x70, 0x6f, 0x72, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x48, 0x00, 0x52, 0x04, 0x70, 0x6f, 0x72, 0x74, 0x88, 0x01, 0x01, 0x42, 0x07, 0x0a, 0x05, 0x5f, 0x70, 0x6f, 0x72, 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_sentry_socket_netstack_events_proto_rawDescOnce sync.Once file_pkg_sentry_socket_netstack_events_proto_rawDescData = file_pkg_sentry_socket_netstack_events_proto_rawDesc ) func file_pkg_sentry_socket_netstack_events_proto_rawDescGZIP() []byte { file_pkg_sentry_socket_netstack_events_proto_rawDescOnce.Do(func() { file_pkg_sentry_socket_netstack_events_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_sentry_socket_netstack_events_proto_rawDescData) }) return file_pkg_sentry_socket_netstack_events_proto_rawDescData } var file_pkg_sentry_socket_netstack_events_proto_msgTypes = make([]protoimpl.MessageInfo, 1) var file_pkg_sentry_socket_netstack_events_proto_goTypes = []interface{}{ (*SentryTcpListenEvent)(nil), // 0: gvisor.SentryTcpListenEvent } var file_pkg_sentry_socket_netstack_events_proto_depIdxs = []int32{ 0, // [0:0] is the sub-list for method output_type 0, // [0:0] is the sub-list for method input_type 0, // [0:0] is the sub-list for extension type_name 0, // [0:0] is the sub-list for extension extendee 0, // [0:0] is the sub-list for field type_name } func init() { file_pkg_sentry_socket_netstack_events_proto_init() } func file_pkg_sentry_socket_netstack_events_proto_init() { if File_pkg_sentry_socket_netstack_events_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_sentry_socket_netstack_events_proto_msgTypes[0].Exporter = func(v interface{}, i int) 
interface{} { switch v := v.(*SentryTcpListenEvent); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } file_pkg_sentry_socket_netstack_events_proto_msgTypes[0].OneofWrappers = []interface{}{} type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_sentry_socket_netstack_events_proto_rawDesc, NumEnums: 0, NumMessages: 1, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_sentry_socket_netstack_events_proto_goTypes, DependencyIndexes: file_pkg_sentry_socket_netstack_events_proto_depIdxs, MessageInfos: file_pkg_sentry_socket_netstack_events_proto_msgTypes, }.Build() File_pkg_sentry_socket_netstack_events_proto = out.File file_pkg_sentry_socket_netstack_events_proto_rawDesc = nil file_pkg_sentry_socket_netstack_events_proto_goTypes = nil file_pkg_sentry_socket_netstack_events_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netstack/netstack.go000066400000000000000000003450331465435605700262270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package netstack provides an implementation of the socket.Socket interface // that is backed by a tcpip.Endpoint. // // It does not depend on any particular endpoint implementation, and thus can // be used to expose certain endpoints to the sentry while leaving others out, // for example, TCP endpoints and Unix-domain endpoints. // // Lock ordering: netstack => mm: ioSequenceReadWriter copies user memory inside // tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during // this operation. 
package netstack import ( "bytes" "encoding/binary" "fmt" "io" "io/ioutil" "math" "reflect" "time" "golang.org/x/sys/unix" "google.golang.org/protobuf/proto" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" epb "gvisor.dev/gvisor/pkg/sentry/socket/netstack/events_go_proto" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const bitsPerUint32 = 32 // statCounterValue returns a function usable as callback function when defining a gVisor Sentry // metric that contains the value counted by the StatCounter. // This avoids a dependency loop in the tcpip package. func statCounterValue(cm *tcpip.StatCounter) func(...*metric.FieldValue) uint64 { return func(...*metric.FieldValue) uint64 { return cm.Value() } } func mustCreateMetric(name, description string) *tcpip.StatCounter { var cm tcpip.StatCounter metric.MustRegisterCustomUint64Metric(name, metric.Uint64Metadata{ Cumulative: true, Description: description, }, statCounterValue(&cm)) return &cm } func mustCreateGauge(name, description string) *tcpip.StatCounter { var cm tcpip.StatCounter metric.MustRegisterCustomUint64Metric(name, metric.Uint64Metadata{ Cumulative: false, Description: description, }, statCounterValue(&cm)) return &cm } // Metrics contains metrics exported by netstack. 
var Metrics = tcpip.Stats{ DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped at the transport layer."), NICs: tcpip.NICStats{ MalformedL4RcvdPackets: mustCreateMetric("/netstack/nic/malformed_l4_received_packets", "Number of packets received that failed L4 header parsing."), Tx: tcpip.NICPacketStats{ Packets: mustCreateMetric("/netstack/nic/tx/packets", "Number of packets transmitted."), Bytes: mustCreateMetric("/netstack/nic/tx/bytes", "Number of bytes transmitted."), }, TxPacketsDroppedNoBufferSpace: mustCreateMetric("/netstack/nic/tx_packets_dropped_no_buffer_space", "Number of TX packets dropped as a result of no buffer space errors."), Rx: tcpip.NICPacketStats{ Packets: mustCreateMetric("/netstack/nic/rx/packets", "Number of packets received."), Bytes: mustCreateMetric("/netstack/nic/rx/bytes", "Number of bytes received."), }, DisabledRx: tcpip.NICPacketStats{ Packets: mustCreateMetric("/netstack/nic/disabled_rx/packets", "Number of packets received on disabled NICs."), Bytes: mustCreateMetric("/netstack/nic/disabled_rx/bytes", "Number of bytes received on disabled NICs."), }, Neighbor: tcpip.NICNeighborStats{ UnreachableEntryLookups: mustCreateMetric("/netstack/nic/neighbor/unreachable_entry_loopups", "Number of lookups performed on a neighbor entry in Unreachable state."), DroppedConfirmationForNoninitiatedNeighbor: mustCreateMetric("/netstack/nic/neighbor/dropped_confirmation_for_noninitiated_neighbor", "Number of advertisements received that don't match an entry in the neighbor cache."), DroppedInvalidLinkAddressConfirmations: mustCreateMetric("/netstack/nic/neighbor/dropped_invalid_link_address_confirmations", "Number of advertisements dropped because they have empty source link-layer addresses"), }, }, ICMP: tcpip.ICMPStats{ V4: tcpip.ICMPv4Stats{ PacketsSent: tcpip.ICMPv4SentPacketStats{ ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_request", "Number of ICMPv4 echo request packets sent."), EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Number of ICMPv4 echo reply packets sent."), DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Number of ICMPv4 destination unreachable packets sent."), SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Number of ICMPv4 source quench packets sent."), Redirect: mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Number of ICMPv4 redirect packets sent."), TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Number of ICMPv4 time exceeded packets sent."), ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Number of ICMPv4 parameter problem packets sent."), Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Number of ICMPv4 timestamp packets sent."), TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Number of ICMPv4 timestamp reply packets sent."), InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Number of ICMPv4 information request packets sent."), InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Number of ICMPv4 information reply packets sent."), }, Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Number of ICMPv4 packets dropped due to link layer errors."), RateLimited: mustCreateMetric("/netstack/icmp/v4/packets_sent/rate_limited", "Number of ICMPv4 packets dropped due 
to rate limit being exceeded."), }, PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{ ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_request", "Number of ICMPv4 echo request packets received."), EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Number of ICMPv4 echo reply packets received."), DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Number of ICMPv4 destination unreachable packets received."), SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Number of ICMPv4 source quench packets received."), Redirect: mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Number of ICMPv4 redirect packets received."), TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Number of ICMPv4 time exceeded packets received."), ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Number of ICMPv4 parameter problem packets received."), Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Number of ICMPv4 timestamp packets received."), TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Number of ICMPv4 timestamp reply packets received."), InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Number of ICMPv4 information request packets received."), InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Number of ICMPv4 information reply packets received."), }, Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Number of ICMPv4 packets received that the transport layer could not parse."), }, }, V6: tcpip.ICMPv6Stats{ PacketsSent: tcpip.ICMPv6SentPacketStats{ ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Number of ICMPv6 echo request packets sent."), EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Number of ICMPv6 echo reply packets sent."), DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Number of ICMPv6 destination unreachable packets sent."), PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Number of ICMPv6 packet too big packets sent."), TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Number of ICMPv6 time exceeded packets sent."), ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Number of ICMPv6 parameter problem packets sent."), RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Number of ICMPv6 router solicit packets sent."), RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Number of ICMPv6 router advert packets sent."), NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets sent."), NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Number of ICMPv6 neighbor advert packets sent."), RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Number of ICMPv6 redirect message packets sent."), MulticastListenerQuery: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_query", "Number of ICMPv6 multicast listener query packets sent."), MulticastListenerReport: 
mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."), MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."), }, Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Number of ICMPv6 packets dropped due to link layer errors."), RateLimited: mustCreateMetric("/netstack/icmp/v6/packets_sent/rate_limited", "Number of ICMPv6 packets dropped due to rate limit being exceeded."), }, PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{ ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Number of ICMPv6 echo request packets received."), EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Number of ICMPv6 echo reply packets received."), DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Number of ICMPv6 destination unreachable packets received."), PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Number of ICMPv6 packet too big packets received."), TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Number of ICMPv6 time exceeded packets received."), ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Number of ICMPv6 parameter problem packets received."), RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Number of ICMPv6 router solicit packets received."), RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Number of ICMPv6 router advert packets received."), NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets received."), NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Number of ICMPv6 neighbor advert packets received."), RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Number of ICMPv6 redirect message packets received."), MulticastListenerQuery: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_query", "Number of ICMPv6 multicast listener query packets received."), MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."), MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."), }, Unrecognized: mustCreateMetric("/netstack/icmp/v6/packets_received/unrecognized", "Number of ICMPv6 packets received that the transport layer does not know how to parse."), Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Number of ICMPv6 packets received that the transport layer could not parse."), RouterOnlyPacketsDroppedByHost: mustCreateMetric("/netstack/icmp/v6/packets_received/router_only_packets_dropped_by_host", "Number of ICMPv6 packets dropped due to being router-specific packets."), }, }, }, IGMP: tcpip.IGMPStats{ PacketsSent: tcpip.IGMPSentPacketStats{ IGMPPacketStats: tcpip.IGMPPacketStats{ MembershipQuery: mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Number of IGMP Membership Query messages sent."), V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", 
"Number of IGMPv1 Membership Report messages sent."), V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", "Number of IGMPv2 Membership Report messages sent."), LeaveGroup: mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Number of IGMP Leave Group messages sent."), }, Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Number of IGMP packets dropped due to link layer errors."), }, PacketsReceived: tcpip.IGMPReceivedPacketStats{ IGMPPacketStats: tcpip.IGMPPacketStats{ MembershipQuery: mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Number of IGMP Membership Query messages received."), V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Number of IGMPv1 Membership Report messages received."), V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Number of IGMPv2 Membership Report messages received."), LeaveGroup: mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Number of IGMP Leave Group messages received."), }, Invalid: mustCreateMetric("/netstack/igmp/packets_received/invalid", "Number of IGMP packets received that could not be parsed."), ChecksumErrors: mustCreateMetric("/netstack/igmp/packets_received/checksum_errors", "Number of received IGMP packets with bad checksums."), Unrecognized: mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Number of unrecognized IGMP packets received."), }, }, IP: tcpip.IPStats{ PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Number of IP packets received from the link layer in nic.DeliverNetworkPacket."), DisabledPacketsReceived: mustCreateMetric("/netstack/ip/disabled_packets_received", "Number of IP packets received from the link layer when the IP layer is disabled."), InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Number of IP packets received with an unknown or invalid destination address."), InvalidSourceAddressesReceived: mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Number of IP packets received with an unknown or invalid source address."), PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."), PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Number of IP packets sent via WritePacket."), OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Number of IP packets which failed to write to a link-layer endpoint."), MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Number of IP packets which failed IP header validation checks."), MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Number of IP fragments which failed IP fragment validation checks."), IPTablesPreroutingDropped: mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Number of IP packets dropped in the Prerouting chain."), IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Number of IP packets dropped in the Input chain."), IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Number of IP packets dropped in the Output chain."), OptionTimestampReceived: mustCreateMetric("/netstack/ip/options/timestamp_received", "Number of timestamp options found in received IP packets."), OptionRecordRouteReceived: 
mustCreateMetric("/netstack/ip/options/record_route_received", "Number of record route options found in received IP packets."), OptionRouterAlertReceived: mustCreateMetric("/netstack/ip/options/router_alert_received", "Number of router alert options found in received IP packets."), OptionUnknownReceived: mustCreateMetric("/netstack/ip/options/unknown_received", "Number of unknown options found in received IP packets."), Forwarding: tcpip.IPForwardingStats{ Unrouteable: mustCreateMetric("/netstack/ip/forwarding/unrouteable", "Number of IP packets received which couldn't be routed and thus were not forwarded."), ExhaustedTTL: mustCreateMetric("/netstack/ip/forwarding/exhausted_ttl", "Number of IP packets received which could not be forwarded due to an exhausted TTL."), LinkLocalSource: mustCreateMetric("/netstack/ip/forwarding/link_local_source_address", "Number of IP packets received which could not be forwarded due to a link-local source address."), LinkLocalDestination: mustCreateMetric("/netstack/ip/forwarding/link_local_destination_address", "Number of IP packets received which could not be forwarded due to a link-local destination address."), ExtensionHeaderProblem: mustCreateMetric("/netstack/ip/forwarding/extension_header_problem", "Number of IP packets received which could not be forwarded due to a problem processing their IPv6 extension headers."), PacketTooBig: mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not be forwarded because they could not fit within the outgoing MTU."), HostUnreachable: mustCreateMetric("/netstack/ip/forwarding/host_unreachable", "Number of IP packets received which could not be forwarded due to unresolvable next hop."), Errors: mustCreateMetric("/netstack/ip/forwarding/errors", "Number of IP packets which couldn't be forwarded."), }, }, ARP: tcpip.ARPStats{ PacketsReceived: mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."), DisabledPacketsReceived: mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."), MalformedPacketsReceived: mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."), RequestsReceived: mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."), RequestsReceivedUnknownTargetAddress: mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."), OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."), OutgoingRequestBadLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."), OutgoingRequestsDropped: mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."), OutgoingRequestsSent: mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."), RepliesReceived: mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."), OutgoingRepliesDropped: mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a 
link-layer endpoint."), OutgoingRepliesSent: mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."), }, TCP: tcpip.TCPStats{ ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), CurrentEstablished: mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."), CurrentConnected: mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."), EstablishedResets: mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"), EstablishedClosed: mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."), EstablishedTimedout: mustCreateMetric("/netstack/tcp/established_timedout", "Number of times an established connection was reset because of keep-alive time out."), ListenOverflowSynDrop: mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."), ListenOverflowAckDrop: mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."), ListenOverflowSynCookieSent: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."), ListenOverflowSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."), ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie was received."), FailedConnectionAttempts: mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."), ValidSegmentsReceived: mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."), InvalidSegmentsReceived: mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."), SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."), SegmentSendErrors: mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."), ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."), ResetsReceived: mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."), Retransmits: mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."), FastRecovery: mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."), SACKRecovery: mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."), TLPRecovery: mustCreateMetric("/netstack/tcp/tlp_recovery", "Number of times tail loss probe triggers recovery from tail loss."), SlowStartRetransmits: mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow 
start mode."), FastRetransmit: mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."), Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), FailedPortReservations: mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."), SegmentsAckedWithDSACK: mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."), SpuriousRecovery: mustCreateMetric("/netstack/tcp/spurious_recovery", "Number of times the connection entered loss recovery spuriously."), SpuriousRTORecovery: mustCreateMetric("/netstack/tcp/spurious_rto_recovery", "Number of times the connection entered RTO spuriously."), ForwardMaxInFlightDrop: mustCreateMetric("/netstack/tcp/forward_max_in_flight_drop", "Number of connection requests dropped due to exceeding in-flight limit."), }, UDP: tcpip.UDPStats{ PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), UnknownPortErrors: mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."), ReceiveBufferErrors: mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."), MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."), PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."), PacketSendErrors: mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."), ChecksumErrors: mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."), }, } // DefaultTTL is linux's default TTL. All network protocols in all stacks used // with this package must have this value set as their default TTL. const DefaultTTL = 64 const sizeOfInt32 int = 4 var errStackType = syserr.New("expected but did not receive a netstack.Stack", errno.EINVAL) // commonEndpoint represents the intersection of a tcpip.Endpoint and a // transport.Endpoint. type commonEndpoint interface { // Readiness implements tcpip.Endpoint.Readiness and // transport.Endpoint.Readiness. Readiness(mask waiter.EventMask) waiter.EventMask // SetSockOpt implements tcpip.Endpoint.SetSockOpt and // transport.Endpoint.SetSockOpt. SetSockOpt(tcpip.SettableSocketOption) tcpip.Error // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and // transport.Endpoint.SetSockOptInt. SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error // GetSockOpt implements tcpip.Endpoint.GetSockOpt and // transport.Endpoint.GetSockOpt. GetSockOpt(tcpip.GettableSocketOption) tcpip.Error // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and // transport.Endpoint.GetSockOpt. GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) // State returns a socket's lifecycle state. The returned value is // protocol-specific and is primarily used for diagnostics. State() uint32 // LastError implements tcpip.Endpoint.LastError and // transport.Endpoint.LastError. LastError() tcpip.Error // SocketOptions implements tcpip.Endpoint.SocketOptions and // transport.Endpoint.SocketOptions. 
SocketOptions() *tcpip.SocketOptions } // sock encapsulates all the state needed to represent a network stack // endpoint in the kernel context. // // +stateify savable type sock struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.LockFD socket.SendReceiveTimeout *waiter.Queue family int Endpoint tcpip.Endpoint skType linux.SockType protocol int namespace *inet.Namespace mu sync.Mutex `state:"nosave"` // readWriter is an optimization to avoid allocations. // +checklocks:mu readWriter usermem.IOSequenceReadWriter `state:"nosave"` // readMu protects access to the below fields. readMu sync.Mutex `state:"nosave"` // sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps // of returned messages can be returned via control messages. When // false, the same timestamp is instead stored and can be read via the // SIOCGSTAMP ioctl. It is protected by readMu. See socket(7). sockOptTimestamp bool // timestampValid indicates whether timestamp for SIOCGSTAMP has been // set. It is protected by readMu. timestampValid bool // timestamp holds the timestamp to use with SIOCTSTAMP. It is only // valid when timestampValid is true. It is protected by readMu. timestamp time.Time `state:".(int64)"` // TODO(b/153685824): Move this to SocketOptions. // sockOptInq corresponds to TCP_INQ. sockOptInq bool } var _ = socket.Socket(&sock{}) // New creates a new endpoint socket. func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) { if skType == linux.SOCK_STREAM { endpoint.SocketOptions().SetDelayOption(true) } mnt := t.Kernel().SocketMount() d := sockfs.NewDentry(t, mnt) defer d.DecRef(t) namespace := t.NetworkNamespace() s := &sock{ Queue: queue, family: family, Endpoint: endpoint, skType: skType, protocol: protocol, namespace: namespace, } s.LockFD.Init(&vfs.FileLocks{}) vfsfd := &s.vfsfd if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, UseDentryMetadata: true, }); err != nil { return nil, syserr.FromError(err) } namespace.IncRef() return vfsfd, nil } // Release implements vfs.FileDescriptionImpl.Release. func (s *sock) Release(ctx context.Context) { kernel.KernelFromContext(ctx).DeleteSocket(&s.vfsfd) e, ch := waiter.NewChannelEntry(waiter.EventHUp | waiter.EventErr) s.EventRegister(&e) defer s.EventUnregister(&e) s.Endpoint.Close() // SO_LINGER option is valid only for TCP. For other socket types // return after endpoint close. if family, skType, _ := s.Type(); skType == linux.SOCK_STREAM && (family == linux.AF_INET || family == linux.AF_INET6) { v := s.Endpoint.SocketOptions().GetLinger() // The case for zero timeout is handled in tcp endpoint close function. // Close is blocked until either: // 1. The endpoint state is not in any of the states: FIN-WAIT1, // CLOSING and LAST_ACK. // 2. Timeout is reached. if v.Enabled && v.Timeout != 0 { t := kernel.TaskFromContext(ctx) start := t.Kernel().MonotonicClock().Now() deadline := start.Add(v.Timeout) _ = t.BlockWithDeadline(ch, true, deadline) } } s.namespace.DecRef(ctx) } // Epollable implements FileDescriptionImpl.Epollable. func (s *sock) Epollable() bool { return true } // Read implements vfs.FileDescriptionImpl. func (s *sock) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 
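// Until that TODO is resolved, any non-zero flag (including RWF_NOWAIT
// itself) is rejected with EOPNOTSUPP below rather than being ignored.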
if opts.Flags != 0 { return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { return 0, nil } n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) if err == syserr.ErrWouldBlock { return int64(n), linuxerr.ErrWouldBlock } if err != nil { return 0, err.ToError() } return int64(n), nil } // Write implements vfs.FileDescriptionImpl. func (s *sock) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, linuxerr.EOPNOTSUPP } var n int64 var err tcpip.Error switch s.Endpoint.(type) { case *tcp.Endpoint: s.mu.Lock() s.readWriter.Init(ctx, src) n, err = s.Endpoint.Write(&s.readWriter, tcpip.WriteOptions{}) s.mu.Unlock() default: n, err = s.Endpoint.Write(src.Reader(ctx), tcpip.WriteOptions{}) } if _, ok := err.(*tcpip.ErrWouldBlock); ok { return 0, linuxerr.ErrWouldBlock } if err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } if n < src.NumBytes() { return n, linuxerr.ErrWouldBlock } return n, nil } // Accept implements the linux syscall accept(2) for sockets backed by // tcpip.Endpoint. func (s *sock) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { // Issue the accept request to get the new endpoint. var peerAddr *tcpip.FullAddress if peerRequested { peerAddr = &tcpip.FullAddress{} } ep, wq, terr := s.Endpoint.Accept(peerAddr) if terr != nil { if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking { return 0, nil, 0, syserr.TranslateNetstackError(terr) } var err *syserr.Error ep, wq, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } } ns, err := New(t, s.family, s.skType, s.protocol, wq, ep) if err != nil { return 0, nil, 0, err } defer ns.DecRef(t) if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil { return 0, nil, 0, syserr.FromError(err) } var addr linux.SockAddr var addrLen uint32 if peerAddr != nil { // Get address of the peer and write it to peer slice. addr, addrLen = socket.ConvertAddress(s.family, *peerAddr) } fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, }) t.Kernel().RecordSocket(ns) return fd, addr, addrLen, syserr.FromError(e) } // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // tcpip.Endpoint. func (s *sock) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is // implemented specifically for netstack.Socket rather than // commonEndpoint. commonEndpoint should be extended to support socket // options where the implementation is not shared, as unix sockets need // their own support for SO_TIMESTAMP. 
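// Illustrative sketch (not part of the original source): a sandboxed
// application enables this path, e.g. via golang.org/x/sys/unix, with
//
//	// fd is a hypothetical connected socket descriptor.
//	_ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_TIMESTAMP, 1)
//
// after which receive timestamps are delivered as SCM_TIMESTAMP control
// messages; with the option left disabled, the most recent receive timestamp
// is instead cached for the SIOCGSTAMP ioctl (see the sock struct fields
// above).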
if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } val := primitive.Int32(0) s.readMu.Lock() defer s.readMu.Unlock() if s.sockOptTimestamp { val = 1 } return &val, nil } if level == linux.SOL_TCP && name == linux.TCP_INQ { if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } val := primitive.Int32(0) s.readMu.Lock() defer s.readMu.Unlock() if s.sockOptInq { val = 1 } return &val, nil } return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen) } // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by // tcpip.Endpoint. func (s *sock) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is // implemented specifically for netstack.Socket rather than // commonEndpoint. commonEndpoint should be extended to support socket // options where the implementation is not shared, as unix sockets need // their own support for SO_TIMESTAMP. if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } s.readMu.Lock() defer s.readMu.Unlock() s.sockOptTimestamp = hostarch.ByteOrder.Uint32(optVal) != 0 return nil } if level == linux.SOL_TCP && name == linux.TCP_INQ { if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } s.readMu.Lock() defer s.readMu.Unlock() s.sockOptInq = hostarch.ByteOrder.Uint32(optVal) != 0 return nil } return SetSockOpt(t, s, s.Endpoint, level, name, optVal) } var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes() var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes() var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes() // minSockAddrLen returns the minimum length in bytes of a socket address for // the socket's family. func (s *sock) minSockAddrLen() int { const addressFamilySize = 2 switch s.family { case linux.AF_UNIX: return addressFamilySize case linux.AF_INET: return sockAddrInetSize case linux.AF_INET6: return sockAddrInet6Size case linux.AF_PACKET: return sockAddrLinkSize case linux.AF_UNSPEC: return addressFamilySize default: panic(fmt.Sprintf("s.family unrecognized = %d", s.family)) } } func (s *sock) isPacketBased() bool { return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW } // Readiness returns a mask of ready events for socket s. func (s *sock) Readiness(mask waiter.EventMask) waiter.EventMask { return s.Endpoint.Readiness(mask) } // checkFamily returns true iff the specified address family may be used with // the socket. // // If exact is true, then the specified address family must be an exact match // with the socket's family. func (s *sock) checkFamily(family uint16, exact bool) bool { if family == uint16(s.family) { return true } if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 { if !s.Endpoint.SocketOptions().GetV6Only() { return true } } return false } // mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the // receiver's family is AF_INET6. // // This is a hack to work around the fact that both IPv4 and IPv6 ANY are // represented by the empty string. // // TODO(gvisor.dev/issue/1556): remove this function. 
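// For example, connecting an AF_INET6 dual-stack socket with an AF_INET
// sockaddr carrying INADDR_ANY is rewritten here to the IPv4-mapped wildcard
// ::ffff:0.0.0.0 before the address reaches the endpoint.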
func (s *sock) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { if addr.Addr.BitLen() == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET { addr.Addr = tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}) } return addr } // Connect implements the linux syscall connect(2) for sockets backed by // tpcip.Endpoint. func (s *sock) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { addr, family, err := socket.AddressAndFamily(sockaddr) if err != nil { return err } if family == linux.AF_UNSPEC { err := s.Endpoint.Disconnect() if _, ok := err.(*tcpip.ErrNotSupported); ok { return syserr.ErrAddressFamilyNotSupported } return syserr.TranslateNetstackError(err) } if !s.checkFamily(family, false /* exact */) { return syserr.ErrInvalidArgument } addr = s.mapFamily(addr, family) // Always return right away in the non-blocking case. if !blocking { return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) } // Register for notification when the endpoint becomes writable, then // initiate the connection. e, ch := waiter.NewChannelEntry(waiter.WritableEvents) s.EventRegister(&e) defer s.EventUnregister(&e) switch err := s.Endpoint.Connect(addr); err.(type) { case *tcpip.ErrConnectStarted, *tcpip.ErrAlreadyConnecting: case *tcpip.ErrNoPortAvailable: if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM { // TCP unlike UDP returns EADDRNOTAVAIL when it can't // find an available local ephemeral port. return syserr.ErrAddressNotAvailable } return syserr.TranslateNetstackError(err) default: return syserr.TranslateNetstackError(err) } // It's pending, so we have to wait for a notification, and fetch the // result once the wait completes. if err := t.Block(ch); err != nil { return syserr.FromError(err) } // Call Connect() again after blocking to find connect's result. return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) } // Bind implements the linux syscall bind(2) for sockets backed by // tcpip.Endpoint. func (s *sock) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error { if len(sockaddr) < 2 { return syserr.ErrInvalidArgument } family := hostarch.ByteOrder.Uint16(sockaddr) var addr tcpip.FullAddress // Bind for AF_PACKET requires only family, protocol and ifindex. // In function AddressAndFamily, we check the address length which is // not needed for AF_PACKET bind. if family == linux.AF_PACKET { var a linux.SockAddrLink if len(sockaddr) < sockAddrLinkSize { return syserr.ErrInvalidArgument } a.UnmarshalBytes(sockaddr) addr = tcpip.FullAddress{ NIC: tcpip.NICID(a.InterfaceIndex), Addr: tcpip.AddrFrom16Slice(append( a.HardwareAddr[:header.EthernetAddressSize], []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}..., )), Port: socket.Ntohs(a.Protocol), } } else { if s.minSockAddrLen() > len(sockaddr) { return syserr.ErrInvalidArgument } var err *syserr.Error addr, family, err = socket.AddressAndFamily(sockaddr) if err != nil { return err } if !s.checkFamily(family, true /* exact */) { return syserr.ErrAddressFamilyNotSupported } addr = s.mapFamily(addr, family) } // Issue the bind request to the endpoint. err := s.Endpoint.Bind(addr) if _, ok := err.(*tcpip.ErrNoPortAvailable); ok { // Bind always returns EADDRINUSE irrespective of if the specified port was // already bound or if an ephemeral port was requested but none were // available. 
// // *tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because // UDP connect returns EAGAIN on ephemeral port exhaustion. // // TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion. err = &tcpip.ErrPortInUse{} } return syserr.TranslateNetstackError(err) } // Listen implements the linux syscall listen(2) for sockets backed by // tcpip.Endpoint. func (s *sock) Listen(_ *kernel.Task, backlog int) *syserr.Error { if err := s.Endpoint.Listen(backlog); err != nil { return syserr.TranslateNetstackError(err) } if !socket.IsTCP(s) { return nil } // Emit SentryTCPListenEvent with the bound port for tcp sockets. addr, err := s.Endpoint.GetLocalAddress() if err != nil { panic(fmt.Sprintf("GetLocalAddress failed for tcp socket: %s", err)) } eventchannel.Emit(&epb.SentryTcpListenEvent{ Port: proto.Int32(int32(addr.Port)), }) return nil } // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accept, it will block until one becomes ready. func (s *sock) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { // Register for notifications. e, ch := waiter.NewChannelEntry(waiter.ReadableEvents) s.EventRegister(&e) defer s.EventUnregister(&e) // Try to accept the connection again; if it fails, then wait until we // get a notification. for { ep, wq, err := s.Endpoint.Accept(peerAddr) if _, ok := err.(*tcpip.ErrWouldBlock); !ok { return ep, wq, syserr.TranslateNetstackError(err) } if err := t.Block(ch); err != nil { return nil, nil, syserr.FromError(err) } } } // ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags. func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) { var f tcpip.ShutdownFlags switch how { case linux.SHUT_RD: f = tcpip.ShutdownRead case linux.SHUT_WR: f = tcpip.ShutdownWrite case linux.SHUT_RDWR: f = tcpip.ShutdownRead | tcpip.ShutdownWrite default: return 0, syserr.ErrInvalidArgument } return f, nil } // Shutdown implements the linux syscall shutdown(2) for sockets backed by // tcpip.Endpoint. func (s *sock) Shutdown(_ *kernel.Task, how int) *syserr.Error { f, err := ConvertShutdown(how) if err != nil { return err } // Issue shutdown request. return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f)) } // GetSockOpt can be used to implement the linux syscall getsockopt(2) for // sockets backed by a commonEndpoint. func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { switch level { case linux.SOL_SOCKET: return getSockOptSocket(t, s, ep, family, skType, name, outLen) case linux.SOL_TCP: return getSockOptTCP(t, s, ep, name, outLen) case linux.SOL_IPV6: return getSockOptIPv6(t, s, ep, name, outPtr, outLen) case linux.SOL_IP: return getSockOptIP(t, s, ep, name, outPtr, outLen, family) case linux.SOL_ICMPV6: return getSockOptICMPv6(t, s, ep, name, outLen) case linux.SOL_UDP, linux.SOL_RAW, linux.SOL_PACKET: // Not supported. } return nil, syserr.ErrProtocolNotAvailable } func boolToInt32(v bool) int32 { if v { return 1 } return 0 } // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, _ linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) { // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. 
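// Each case below follows the same pattern: check that outLen can hold the
// option's natural size, fetch the value from ep.SocketOptions() (or from the
// socket itself for timeouts), and return it as a marshallable primitive for
// the caller to copy out to userspace.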
switch name { case linux.SO_ERROR: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } // Get the last error and convert it. err := ep.SocketOptions().GetLastError() if err == nil { optP := primitive.Int32(0) return &optP, nil } optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux()) return &optP, nil case linux.SO_PEERCRED: if family != linux.AF_UNIX || outLen < unix.SizeofUcred { return nil, syserr.ErrInvalidArgument } tcred := t.Credentials() creds := linux.ControlMessageCredentials{ PID: int32(t.ThreadGroup().ID()), UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), } return &creds, nil case linux.SO_PASSCRED: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred())) return &v, nil case linux.SO_SNDBUF: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } size := ep.SocketOptions().GetSendBufferSize() if size > math.MaxInt32 { size = math.MaxInt32 } sizeP := primitive.Int32(size) return &sizeP, nil case linux.SO_RCVBUF: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } size := ep.SocketOptions().GetReceiveBufferSize() if size > math.MaxInt32 { size = math.MaxInt32 } sizeP := primitive.Int32(size) return &sizeP, nil case linux.SO_REUSEADDR: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReuseAddress())) return &v, nil case linux.SO_REUSEPORT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReusePort())) return &v, nil case linux.SO_BINDTODEVICE: v := ep.SocketOptions().GetBindToDevice() if v == 0 { var b primitive.ByteSlice return &b, nil } if outLen < linux.IFNAMSIZ { return nil, syserr.ErrInvalidArgument } s := t.NetworkContext() if s == nil { return nil, syserr.ErrNoDevice } nic, ok := s.Interfaces()[int32(v)] if !ok { // The NICID no longer indicates a valid interface, probably because that // interface was removed. return nil, syserr.ErrUnknownDevice } name := primitive.ByteSlice(append([]byte(nic.Name), 0)) return &name, nil case linux.SO_BROADCAST: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetBroadcast())) return &v, nil case linux.SO_KEEPALIVE: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetKeepAlive())) return &v, nil case linux.SO_LINGER: if outLen < linux.SizeOfLinger { return nil, syserr.ErrInvalidArgument } var linger linux.Linger v := ep.SocketOptions().GetLinger() if v.Enabled { linger.OnOff = 1 } linger.Linger = int32(v.Timeout.Seconds()) return &linger, nil case linux.SO_SNDTIMEO: // TODO(igudger): Linux allows shorter lengths for partial results. if outLen < linux.SizeOfTimeval { return nil, syserr.ErrInvalidArgument } sendTimeout := linux.NsecToTimeval(s.SendTimeout()) return &sendTimeout, nil case linux.SO_RCVTIMEO: // TODO(igudger): Linux allows shorter lengths for partial results. 
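// As with SO_SNDTIMEO above, the timeout is stored internally in nanoseconds
// and converted back to a struct timeval here.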
if outLen < linux.SizeOfTimeval { return nil, syserr.ErrInvalidArgument } recvTimeout := linux.NsecToTimeval(s.RecvTimeout()) return &recvTimeout, nil case linux.SO_OOBINLINE: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetOutOfBandInline())) return &v, nil case linux.SO_NO_CHECK: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetNoChecksum())) return &v, nil case linux.SO_ACCEPTCONN: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetAcceptConn())) return &v, nil case linux.SO_RCVLOWAT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(ep.SocketOptions().GetRcvlowat()) return &v, nil } return nil, syserr.ErrProtocolNotAvailable } // getSockOptTCP implements GetSockOpt when level is SOL_TCP. func getSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) { if !socket.IsTCP(s) { return nil, syserr.ErrUnknownProtocolOption } switch name { case linux.TCP_NODELAY: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(!ep.SocketOptions().GetDelayOption())) return &v, nil case linux.TCP_CORK: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetCorkOption())) return &v, nil case linux.TCP_QUICKACK: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetQuickAck())) return &v, nil case linux.TCP_MAXSEG: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.MaxSegOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } vP := primitive.Int32(v) return &vP, nil case linux.TCP_KEEPIDLE: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } var v tcpip.KeepaliveIdleOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second) return &keepAliveIdle, nil case linux.TCP_KEEPINTVL: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } var v tcpip.KeepaliveIntervalOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second) return &keepAliveInterval, nil case linux.TCP_KEEPCNT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } vP := primitive.Int32(v) return &vP, nil case linux.TCP_USER_TIMEOUT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } var v tcpip.TCPUserTimeoutOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond) return &tcpUserTimeout, nil case linux.TCP_INFO: var v tcpip.TCPInfoOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } info := linux.TCPInfo{ State: uint8(v.State), RTO: uint32(v.RTO / time.Microsecond), RTT: uint32(v.RTT / time.Microsecond), RTTVar: uint32(v.RTTVar / time.Microsecond), SndSsthresh: v.SndSsthresh, SndCwnd: v.SndCwnd, } switch v.CcState { case tcpip.RTORecovery: info.CaState = 
linux.TCP_CA_Loss case tcpip.FastRecovery, tcpip.SACKRecovery: info.CaState = linux.TCP_CA_Recovery case tcpip.Disorder: info.CaState = linux.TCP_CA_Disorder case tcpip.Open: info.CaState = linux.TCP_CA_Open } // In netstack reorderSeen is updated only when RACK is enabled. // We only track whether the reordering is seen, which is // different than Linux where reorderSeen is not specific to // RACK and is incremented when a reordering event is seen. if v.ReorderSeen { info.ReordSeen = 1 } // Linux truncates the output binary to outLen. buf := t.CopyScratchBuffer(info.SizeBytes()) info.MarshalUnsafe(buf) if len(buf) > outLen { buf = buf[:outLen] } bufP := primitive.ByteSlice(buf) return &bufP, nil case linux.TCP_CC_INFO, linux.TCP_NOTSENT_LOWAT, linux.TCP_ZEROCOPY_RECEIVE: // Not supported. case linux.TCP_CONGESTION: if outLen <= 0 { return nil, syserr.ErrInvalidArgument } var v tcpip.CongestionControlOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } // We match linux behaviour here where it returns the lower of // TCP_CA_NAME_MAX bytes or the value of the option length. // // This is Linux's net/tcp.h TCP_CA_NAME_MAX. const tcpCANameMax = 16 toCopy := tcpCANameMax if outLen < tcpCANameMax { toCopy = outLen } b := make([]byte, toCopy) copy(b, v) bP := primitive.ByteSlice(b) return &bP, nil case linux.TCP_LINGER2: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } var v tcpip.TCPLingerTimeoutOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } var lingerTimeout primitive.Int32 if v >= 0 { lingerTimeout = primitive.Int32(time.Duration(v) / time.Second) } else { lingerTimeout = -1 } return &lingerTimeout, nil case linux.TCP_DEFER_ACCEPT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } var v tcpip.TCPDeferAcceptOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second) return &tcpDeferAccept, nil case linux.TCP_SYNCNT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } vP := primitive.Int32(v) return &vP, nil case linux.TCP_WINDOW_CLAMP: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } vP := primitive.Int32(v) return &vP, nil } return nil, syserr.ErrProtocolNotAvailable } func getSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outLen int) (marshal.Marshallable, *syserr.Error) { if _, ok := ep.(tcpip.Endpoint); !ok { log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) return nil, syserr.ErrUnknownProtocolOption } if family, _, _ := s.Type(); family != linux.AF_INET6 { return nil, syserr.ErrNotSupported } switch name { case linux.ICMPV6_FILTER: var v tcpip.ICMPv6Filter if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } filter := linux.ICMP6Filter{Filter: v.DenyType} // Linux truncates the output to outLen. 
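// This mirrors the TCP_INFO handling above: marshal the full structure into a
// scratch buffer and clip the result to the caller-supplied length instead of
// failing on short buffers.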
buf := t.CopyScratchBuffer(filter.SizeBytes()) filter.MarshalUnsafe(buf) if len(buf) > outLen { buf = buf[:outLen] } bufP := primitive.ByteSlice(buf) return &bufP, nil } return nil, syserr.ErrProtocolNotAvailable } func defaultTTL(t *kernel.Task, network tcpip.NetworkProtocolNumber) (primitive.Int32, tcpip.Error) { var opt tcpip.DefaultTTLOption stack := inet.StackFromContext(t) if err := stack.(*Stack).Stack.NetworkProtocolOption(network, &opt); err != nil { return 0, err } return primitive.Int32(opt), nil } // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6. func getSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { if _, ok := ep.(tcpip.Endpoint); !ok { log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) return nil, syserr.ErrUnknownProtocolOption } family, skType, _ := s.Type() if family != linux.AF_INET6 { return nil, syserr.ErrNotSupported } switch name { case linux.IPV6_CHECKSUM: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.IPv6Checksum) if err != nil { return nil, syserr.TranslateNetstackError(err) } vP := primitive.Int32(v) return &vP, nil case linux.IPV6_V6ONLY: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetV6Only())) return &v, nil case linux.IPV6_UNICAST_HOPS: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.IPv6HopLimitOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } // Fill in the default value, if needed. vP := primitive.Int32(v) if vP == -1 { vP, err = defaultTTL(t, header.IPv6ProtocolNumber) if err != nil { return nil, syserr.TranslateNetstackError(err) } } return &vP, nil case linux.IPV6_RECVHOPLIMIT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveHopLimit())) return &v, nil case linux.IPV6_PATHMTU: // Not supported. case linux.IPV6_TCLASS: // Length handling for parity with Linux. if outLen == 0 { var b primitive.ByteSlice return &b, nil } v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } uintv := primitive.Uint32(v) // Linux truncates the output binary to outLen. ib := t.CopyScratchBuffer(uintv.SizeBytes()) uintv.MarshalUnsafe(ib) // Handle cases where outLen is lesser than sizeOfInt32. 
if len(ib) > outLen { ib = ib[:outLen] } ibP := primitive.ByteSlice(ib) return &ibP, nil case linux.IPV6_RECVTCLASS: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTClass())) return &v, nil case linux.IPV6_RECVERR: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6RecvError())) return &v, nil case linux.IPV6_RECVORIGDSTADDR: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) return &v, nil case linux.IPV6_RECVPKTINFO: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6ReceivePacketInfo())) return &v, nil case linux.IP6T_ORIGINAL_DST: if outLen < sockAddrInet6Size { return nil, syserr.ErrInvalidArgument } var v tcpip.OriginalDestinationOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } a, _ := socket.ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v)) return a.(*linux.SockAddrInet6), nil case linux.IP6T_SO_GET_INFO: if outLen < linux.SizeOfIPTGetinfo { return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv6 sockets. if skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } stk := inet.StackFromContext(t) if stk == nil { return nil, syserr.ErrNoDevice } info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, true) if err != nil { return nil, err } return &info, nil case linux.IP6T_SO_GET_ENTRIES: // IPTGetEntries is reused for IPv6. if outLen < linux.SizeOfIPTGetEntries { return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv6 sockets. if skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } stk := inet.StackFromContext(t) if stk == nil { return nil, syserr.ErrNoDevice } entries, err := netfilter.GetEntries6(t, stk.(*Stack).Stack, outPtr, outLen) if err != nil { return nil, err } return &entries, nil case linux.IP6T_SO_GET_REVISION_TARGET: if outLen < linux.SizeOfXTGetRevision { return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv6 sockets. if skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } stk := inet.StackFromContext(t) if stk == nil { return nil, syserr.ErrNoDevice } ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber) if err != nil { return nil, err } return &ret, nil } return nil, syserr.ErrProtocolNotAvailable } // getSockOptIP implements GetSockOpt when level is SOL_IP. func getSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, _ int) (marshal.Marshallable, *syserr.Error) { if _, ok := ep.(tcpip.Endpoint); !ok { log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d, endpoint = %T", name, ep) return nil, syserr.ErrUnknownProtocolOption } switch name { case linux.IP_TTL: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.IPv4TTLOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } // Fill in the default value, if needed. 
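// A stored value of 0 means the socket never overrode its TTL, so the
// stack-wide default (DefaultTTL, i.e. 64) is reported instead, mirroring
// Linux, which reports the namespace default TTL for an untouched socket.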
vP := primitive.Int32(v) if vP == 0 { vP, err = defaultTTL(t, header.IPv4ProtocolNumber) if err != nil { return nil, syserr.TranslateNetstackError(err) } } return &vP, nil case linux.IP_RECVTTL: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTTL())) return &v, nil case linux.IP_MULTICAST_TTL: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } vP := primitive.Int32(v) return &vP, nil case linux.IP_MULTICAST_IF: if outLen < len(linux.InetAddr{}) { return nil, syserr.ErrInvalidArgument } var v tcpip.MulticastInterfaceOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr}) return &a.(*linux.SockAddrInet).Addr, nil case linux.IP_MULTICAST_LOOP: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetMulticastLoop())) return &v, nil case linux.IP_TOS: // Length handling for parity with Linux. if outLen == 0 { var b primitive.ByteSlice return &b, nil } v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } if outLen < sizeOfInt32 { vP := primitive.Uint8(v) return &vP, nil } vP := primitive.Int32(v) return &vP, nil case linux.IP_RECVTOS: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTOS())) return &v, nil case linux.IP_RECVERR: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv4RecvError())) return &v, nil case linux.IP_PKTINFO: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceivePacketInfo())) return &v, nil case linux.IP_HDRINCL: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetHeaderIncluded())) return &v, nil case linux.IP_RECVORIGDSTADDR: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) return &v, nil case linux.SO_ORIGINAL_DST: if outLen < sockAddrInetSize { return nil, syserr.ErrInvalidArgument } var v tcpip.OriginalDestinationOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress(v)) return a.(*linux.SockAddrInet), nil case linux.IPT_SO_GET_INFO: if outLen < linux.SizeOfIPTGetinfo { return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv4 sockets. if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } stk := inet.StackFromContext(t) if stk == nil { return nil, syserr.ErrNoDevice } info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, false) if err != nil { return nil, err } return &info, nil case linux.IPT_SO_GET_ENTRIES: if outLen < linux.SizeOfIPTGetEntries { return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv4 sockets. 
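// As with IPT_SO_GET_INFO above, iptables state is only exposed to raw
// AF_INET sockets; any other socket gets ENOPROTOOPT.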
if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } stk := inet.StackFromContext(t) if stk == nil { return nil, syserr.ErrNoDevice } entries, err := netfilter.GetEntries4(t, stk.(*Stack).Stack, outPtr, outLen) if err != nil { return nil, err } return &entries, nil case linux.IPT_SO_GET_REVISION_TARGET: if outLen < linux.SizeOfXTGetRevision { return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv4 sockets. if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } stk := inet.StackFromContext(t) if stk == nil { return nil, syserr.ErrNoDevice } ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber) if err != nil { return nil, err } return &ret, nil case linux.IP_MTU_DISCOVER: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.MTUDiscoverOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } switch tcpip.PMTUDStrategy(v) { case tcpip.PMTUDiscoveryWant: v = linux.IP_PMTUDISC_WANT case tcpip.PMTUDiscoveryDont: v = linux.IP_PMTUDISC_DONT case tcpip.PMTUDiscoveryDo: v = linux.IP_PMTUDISC_DO case tcpip.PMTUDiscoveryProbe: v = linux.IP_PMTUDISC_PROBE default: panic(fmt.Errorf("unknown PMTUD option: %d", v)) } vP := primitive.Int32(v) return &vP, nil } return nil, syserr.ErrProtocolNotAvailable } // SetSockOpt can be used to implement the linux syscall setsockopt(2) for // sockets backed by a commonEndpoint. func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { switch level { case linux.SOL_SOCKET: return setSockOptSocket(t, s, ep, name, optVal) case linux.SOL_TCP: return setSockOptTCP(t, s, ep, name, optVal) case linux.SOL_ICMPV6: return setSockOptICMPv6(t, s, ep, name, optVal) case linux.SOL_IPV6: return setSockOptIPv6(t, s, ep, name, optVal) case linux.SOL_IP: return setSockOptIP(t, s, ep, name, optVal) case linux.SOL_PACKET: // gVisor doesn't support any SOL_PACKET options just return not // supported. Returning nil here will result in tcpdump thinking AF_PACKET // features are supported and proceed to use them and break. return syserr.ErrProtocolNotAvailable case linux.SOL_UDP, linux.SOL_RAW: // Not supported. } return nil } func clampBufSize(newSz, min, max int64, ignoreMax bool) int64 { // packetOverheadFactor is used to multiply the value provided by the user on // a setsockopt(2) for setting the send/receive buffer sizes sockets. const packetOverheadFactor = 2 if !ignoreMax && newSz > max { newSz = max } if newSz < math.MaxInt32/packetOverheadFactor { newSz *= packetOverheadFactor if newSz < min { newSz = min } } else { newSz = math.MaxInt32 } return newSz } // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. 
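//
// For SO_SNDBUF/SO_RCVBUF the requested size is first capped at the
// stack-provided maximum (skipped for SO_RCVBUFFORCE when the caller holds
// CAP_NET_ADMIN), then doubled by clampBufSize's packetOverheadFactor, and
// finally raised to at least the minimum. As an illustrative example, with
// limits of 4KiB..4MiB a request of 64KiB is stored as 128KiB.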
func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { case linux.SO_SNDBUF: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) min, max := ep.SocketOptions().SendBufferLimits() clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) ep.SocketOptions().SetSendBufferSize(clamped, true /* notify */) return nil case linux.SO_RCVBUF: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) min, max := ep.SocketOptions().ReceiveBufferLimits() clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) return nil case linux.SO_RCVBUFFORCE: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } if creds := auth.CredentialsFromContext(t); !creds.HasCapability(linux.CAP_NET_ADMIN) { return syserr.ErrNotPermitted } v := hostarch.ByteOrder.Uint32(optVal) min, max := ep.SocketOptions().ReceiveBufferLimits() clamped := clampBufSize(int64(v), min, max, true /* ignoreMax */) ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) return nil case linux.SO_REUSEADDR: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetReuseAddress(v != 0) return nil case linux.SO_REUSEPORT: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetReusePort(v != 0) return nil case linux.SO_BINDTODEVICE: n := bytes.IndexByte(optVal, 0) if n == -1 { n = len(optVal) } name := string(optVal[:n]) if name == "" { return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(0)) } s := t.NetworkContext() if s == nil { return syserr.ErrNoDevice } for nicID, nic := range s.Interfaces() { if nic.Name == name { return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(nicID)) } } return syserr.ErrUnknownDevice case linux.SO_BROADCAST: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetBroadcast(v != 0) return nil case linux.SO_PASSCRED: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetPassCred(v != 0) return nil case linux.SO_KEEPALIVE: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetKeepAlive(v != 0) return nil case linux.SO_SNDTIMEO: if len(optVal) < linux.SizeOfTimeval { return syserr.ErrInvalidArgument } var v linux.Timeval v.UnmarshalBytes(optVal) if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { return syserr.ErrDomain } s.SetSendTimeout(v.ToNsecCapped()) return nil case linux.SO_RCVTIMEO: if len(optVal) < linux.SizeOfTimeval { return syserr.ErrInvalidArgument } var v linux.Timeval v.UnmarshalBytes(optVal) if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { return syserr.ErrDomain } s.SetRecvTimeout(v.ToNsecCapped()) return nil case linux.SO_OOBINLINE: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetOutOfBandInline(v != 0) return nil case linux.SO_NO_CHECK: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetNoChecksum(v != 0) return nil case linux.SO_LINGER: if len(optVal) < 
linux.SizeOfLinger { return syserr.ErrInvalidArgument } var v linux.Linger v.UnmarshalBytes(optVal) ep.SocketOptions().SetLinger(tcpip.LingerOption{ Enabled: v.OnOff != 0, Timeout: time.Second * time.Duration(v.Linger), }) return nil case linux.SO_DETACH_FILTER: // optval is ignored. var v tcpip.SocketDetachFilterOption return syserr.TranslateNetstackError(ep.SetSockOpt(&v)) // TODO(b/226603727): Add support for SO_RCVLOWAT option. For now, only // the unsupported syscall message is removed. case linux.SO_RCVLOWAT: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetRcvlowat(int32(v)) return nil } return nil } // setSockOptTCP implements SetSockOpt when level is SOL_TCP. func setSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { if !socket.IsTCP(s) { return syserr.ErrUnknownProtocolOption } switch name { case linux.TCP_NODELAY: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetDelayOption(v == 0) return nil case linux.TCP_CORK: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetCorkOption(v != 0) return nil case linux.TCP_QUICKACK: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetQuickAck(v != 0) return nil case linux.TCP_MAXSEG: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v))) case linux.TCP_KEEPIDLE: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) if v < 1 || v > linux.MAX_TCP_KEEPIDLE { return syserr.ErrInvalidArgument } opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v)) return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_KEEPINTVL: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) if v < 1 || v > linux.MAX_TCP_KEEPINTVL { return syserr.ErrInvalidArgument } opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)) return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_KEEPCNT: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) if v < 1 || v > linux.MAX_TCP_KEEPCNT { return syserr.ErrInvalidArgument } return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v))) case linux.TCP_USER_TIMEOUT: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := int32(hostarch.ByteOrder.Uint32(optVal)) if v < 0 { return syserr.ErrInvalidArgument } opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v)) return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_CONGESTION: v := tcpip.CongestionControlOption(optVal) if err := ep.SetSockOpt(&v); err != nil { return syserr.TranslateNetstackError(err) } return nil case linux.TCP_LINGER2: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := int32(hostarch.ByteOrder.Uint32(optVal)) opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)) return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_DEFER_ACCEPT: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := int32(hostarch.ByteOrder.Uint32(optVal)) if v < 0 { v = 
0 } opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)) return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_SYNCNT: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v))) case linux.TCP_WINDOW_CLAMP: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v))) case linux.TCP_REPAIR_OPTIONS: // Not supported. } return nil } func setSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { if _, ok := ep.(tcpip.Endpoint); !ok { log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) return syserr.ErrUnknownProtocolOption } if family, _, _ := s.Type(); family != linux.AF_INET6 { return syserr.ErrUnknownProtocolOption } switch name { case linux.ICMPV6_FILTER: var req linux.ICMP6Filter if len(optVal) < req.SizeBytes() { return syserr.ErrInvalidArgument } req.UnmarshalUnsafe(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.ICMPv6Filter{DenyType: req.Filter})) } return nil } // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6. func setSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { if _, ok := ep.(tcpip.Endpoint); !ok { log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) return syserr.ErrUnknownProtocolOption } family, _, _ := s.Type() if family != linux.AF_INET6 { return syserr.ErrUnknownProtocolOption } switch name { case linux.IPV6_CHECKSUM: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } // int may not be 32-bits so we cast the uint32 to an int32 before casting // to an int. return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6Checksum, int(int32(hostarch.ByteOrder.Uint32(optVal))))) case linux.IPV6_V6ONLY: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } if socket.IsTCP(s) && tcp.EndpointState(ep.State()) != tcp.StateInitial { return syserr.ErrInvalidEndpointState } else if socket.IsUDP(s) && transport.DatagramEndpointState(ep.State()) != transport.DatagramEndpointStateInitial { return syserr.ErrInvalidEndpointState } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetV6Only(v != 0) return nil case linux.IPV6_ADD_MEMBERSHIP: req, err := copyInMulticastV6Request(optVal) if err != nil { return err } return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr), })) case linux.IPV6_DROP_MEMBERSHIP: req, err := copyInMulticastV6Request(optVal) if err != nil { return err } return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr), })) case linux.IPV6_IPSEC_POLICY, linux.IPV6_JOIN_ANYCAST, linux.IPV6_LEAVE_ANYCAST, // TODO(b/148887420): Add support for IPV6_PKTINFO. linux.IPV6_PKTINFO, linux.IPV6_ROUTER_ALERT, linux.IPV6_XFRM_POLICY, linux.MCAST_BLOCK_SOURCE, linux.MCAST_JOIN_GROUP, linux.MCAST_JOIN_SOURCE_GROUP, linux.MCAST_LEAVE_GROUP, linux.MCAST_LEAVE_SOURCE_GROUP, linux.MCAST_UNBLOCK_SOURCE: // Not supported. 
case linux.IPV6_RECVORIGDSTADDR: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := int32(hostarch.ByteOrder.Uint32(optVal)) ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) return nil case linux.IPV6_RECVPKTINFO: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := int32(hostarch.ByteOrder.Uint32(optVal)) ep.SocketOptions().SetIPv6ReceivePacketInfo(v != 0) return nil case linux.IPV6_UNICAST_HOPS: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := int32(hostarch.ByteOrder.Uint32(optVal)) if v < -1 || v > 255 { return syserr.ErrInvalidArgument } return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6HopLimitOption, int(v))) case linux.IPV6_RECVHOPLIMIT: v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetReceiveHopLimit(v != 0) return nil case linux.IPV6_TCLASS: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := int32(hostarch.ByteOrder.Uint32(optVal)) if v < -1 || v > 255 { return syserr.ErrInvalidArgument } if v == -1 { v = 0 } return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v))) case linux.IPV6_RECVTCLASS: v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetReceiveTClass(v != 0) return nil case linux.IPV6_RECVERR: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetIPv6RecvError(v != 0) return nil case linux.IP6T_SO_SET_REPLACE: if len(optVal) < linux.SizeOfIP6TReplace { return syserr.ErrInvalidArgument } // Only valid for raw IPv6 sockets. if !socket.IsRaw(s) { return syserr.ErrProtocolNotAvailable } stk := inet.StackFromContext(t) if stk == nil { return syserr.ErrNoDevice } // Stack must be a netstack stack. return netfilter.SetEntries(t.Credentials().UserNamespace, stk.(*Stack).Stack, optVal, true) case linux.IP6T_SO_SET_ADD_COUNTERS: log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported") return nil } return nil } var ( inetMulticastRequestSize = (*linux.InetMulticastRequest)(nil).SizeBytes() inetMulticastRequestWithNICSize = (*linux.InetMulticastRequestWithNIC)(nil).SizeBytes() inet6MulticastRequestSize = (*linux.Inet6MulticastRequest)(nil).SizeBytes() ) // copyInMulticastRequest copies in a variable-size multicast request. The // kernel determines which structure was passed by its length. IP_MULTICAST_IF // supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and // IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this, // allowAddr controls whether in_addr is accepted or rejected. 
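//
// Concretely, using the sizes of the corresponding linux structs: a buffer
// shorter than a 4-byte in_addr is rejected outright; one shorter than
// ip_mreq is treated as a bare in_addr (only if allowAddr is set); one at
// least as large as ip_mreqn is parsed as ip_mreqn; anything in between is
// parsed as ip_mreq, leaving the interface index zero.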
func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) { if len(optVal) < len(linux.InetAddr{}) { return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument } if len(optVal) < inetMulticastRequestSize { if !allowAddr { return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument } var req linux.InetMulticastRequestWithNIC copy(req.InterfaceAddr[:], optVal) return req, nil } if len(optVal) >= inetMulticastRequestWithNICSize { var req linux.InetMulticastRequestWithNIC req.UnmarshalUnsafe(optVal) return req, nil } var req linux.InetMulticastRequestWithNIC req.InetMulticastRequest.UnmarshalUnsafe(optVal) return req, nil } func copyInMulticastV6Request(optVal []byte) (linux.Inet6MulticastRequest, *syserr.Error) { if len(optVal) < inet6MulticastRequestSize { return linux.Inet6MulticastRequest{}, syserr.ErrInvalidArgument } var req linux.Inet6MulticastRequest req.UnmarshalUnsafe(optVal) return req, nil } // parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf. // // net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options. func parseIntOrChar(buf []byte) (int32, *syserr.Error) { if len(buf) == 0 { return 0, syserr.ErrInvalidArgument } if len(buf) >= sizeOfInt32 { return int32(hostarch.ByteOrder.Uint32(buf)), nil } return int32(buf[0]), nil } // setSockOptIP implements SetSockOpt when level is SOL_IP. func setSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { if _, ok := ep.(tcpip.Endpoint); !ok { log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d, endpoint = %T", name, ep) return syserr.ErrUnknownProtocolOption } switch name { case linux.IP_MULTICAST_TTL: v, err := parseIntOrChar(optVal) if err != nil { return err } if v == -1 { // Linux translates -1 to 1. v = 1 } if v < 0 || v > 255 { return syserr.ErrInvalidArgument } return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v))) case linux.IP_ADD_MEMBERSHIP: req, err := copyInMulticastRequest(optVal, false /* allowAddr */) if err != nil { return err } return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), // TODO(igudger): Change AddMembership to use the standard // any address representation. InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr), MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr), })) case linux.IP_DROP_MEMBERSHIP: req, err := copyInMulticastRequest(optVal, false /* allowAddr */) if err != nil { return err } return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), // TODO(igudger): Change DropMembership to use the standard // any address representation. InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr), MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr), })) case linux.IP_MULTICAST_IF: req, err := copyInMulticastRequest(optVal, true /* allowAddr */) if err != nil { return err } return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{ NIC: tcpip.NICID(req.InterfaceIndex), InterfaceAddr: socket.BytesToIPAddress(req.InterfaceAddr[:]), })) case linux.IP_MULTICAST_LOOP: v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetMulticastLoop(v != 0) return nil case linux.MCAST_JOIN_GROUP: // FIXME(b/124219304): Implement MCAST_JOIN_GROUP. 
return syserr.ErrInvalidArgument case linux.IP_TTL: v, err := parseIntOrChar(optVal) if err != nil { return err } // -1 means default TTL. if v == -1 { v = 0 } else if v < 1 || v > 255 { return syserr.ErrInvalidArgument } return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TTLOption, int(v))) case linux.IP_RECVTTL: v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetReceiveTTL(v != 0) return nil case linux.IP_TOS: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v))) case linux.IP_RECVTOS: v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetReceiveTOS(v != 0) return nil case linux.IP_RECVERR: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetIPv4RecvError(v != 0) return nil case linux.IP_PKTINFO: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetReceivePacketInfo(v != 0) return nil case linux.IP_HDRINCL: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetHeaderIncluded(v != 0) return nil case linux.IP_RECVORIGDSTADDR: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) return nil case linux.IPT_SO_SET_REPLACE: if len(optVal) < linux.SizeOfIPTReplace { return syserr.ErrInvalidArgument } // Only valid for raw IPv4 sockets. if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { return syserr.ErrProtocolNotAvailable } stk := inet.StackFromContext(t) if stk == nil { return syserr.ErrNoDevice } // Stack must be a netstack stack. return netfilter.SetEntries(t.Credentials().UserNamespace, stk.(*Stack).Stack, optVal, false) case linux.IPT_SO_SET_ADD_COUNTERS: log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported") return nil case linux.IP_MTU_DISCOVER: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } switch v { case linux.IP_PMTUDISC_DONT: v = int32(tcpip.PMTUDiscoveryDont) case linux.IP_PMTUDISC_WANT: v = int32(tcpip.PMTUDiscoveryWant) case linux.IP_PMTUDISC_DO: v = int32(tcpip.PMTUDiscoveryDo) case linux.IP_PMTUDISC_PROBE: v = int32(tcpip.PMTUDiscoveryProbe) default: return syserr.ErrNotSupported } return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MTUDiscoverOption, int(v))) case linux.IP_ADD_SOURCE_MEMBERSHIP, linux.IP_BIND_ADDRESS_NO_PORT, linux.IP_BLOCK_SOURCE, linux.IP_CHECKSUM, linux.IP_DROP_SOURCE_MEMBERSHIP, linux.IP_FREEBIND, linux.IP_IPSEC_POLICY, linux.IP_MINTTL, linux.IP_MSFILTER, linux.IP_MULTICAST_ALL, linux.IP_NODEFRAG, linux.IP_OPTIONS, linux.IP_PASSSEC, linux.IP_RECVFRAGSIZE, linux.IP_RECVOPTS, linux.IP_RETOPTS, linux.IP_TRANSPARENT, linux.IP_UNBLOCK_SOURCE, linux.IP_UNICAST_IF, linux.IP_XFRM_POLICY, linux.MCAST_BLOCK_SOURCE, linux.MCAST_JOIN_SOURCE_GROUP, linux.MCAST_LEAVE_GROUP, linux.MCAST_LEAVE_SOURCE_GROUP, linux.MCAST_MSFILTER, linux.MCAST_UNBLOCK_SOURCE: // Not supported. } return nil } // GetSockName implements the linux syscall getsockname(2) for sockets backed by // tcpip.Endpoint. 
func (s *sock) GetSockName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetLocalAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) } a, l := socket.ConvertAddress(s.family, addr) return a, l, nil } // GetPeerName implements the linux syscall getpeername(2) for sockets backed by // tcpip.Endpoint. func (s *sock) GetPeerName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetRemoteAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) } a, l := socket.ConvertAddress(s.family, addr) return a, l, nil } func (s *sock) fillCmsgInq(cmsg *socket.ControlMessages) { if !s.sockOptInq { return } rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) if err != nil { return } cmsg.IP.HasInq = true cmsg.IP.Inq = int32(rcvBufUsed) } func toLinuxPacketType(pktType tcpip.PacketType) uint8 { switch pktType { case tcpip.PacketHost: return linux.PACKET_HOST case tcpip.PacketOtherHost: return linux.PACKET_OTHERHOST case tcpip.PacketOutgoing: return linux.PACKET_OUTGOING case tcpip.PacketBroadcast: return linux.PACKET_BROADCAST case tcpip.PacketMulticast: return linux.PACKET_MULTICAST default: panic(fmt.Sprintf("unknown packet type: %d", pktType)) } } // nonBlockingRead issues a non-blocking read. // // TODO(b/78348848): Support timestamps for stream sockets. func (s *sock) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { isPacket := s.isPacketBased() readOptions := tcpip.ReadOptions{ Peek: peek, NeedRemoteAddr: senderRequested, NeedLinkPacketInfo: isPacket, } // TCP sockets discard the data if MSG_TRUNC is set. // // This behavior is documented in man 7 tcp: // Since version 2.4, Linux supports the use of MSG_TRUNC in the flags // argument of recv(2) (and recvmsg(2)). This flag causes the received // bytes of data to be discarded, rather than passed back in a // caller-supplied buffer. var w io.Writer var res tcpip.ReadResult var err tcpip.Error s.readMu.Lock() defer s.readMu.Unlock() if !isPacket && trunc { w = &tcpip.LimitedWriter{ W: ioutil.Discard, N: dst.NumBytes(), } res, err = s.Endpoint.Read(w, readOptions) } else { switch s.Endpoint.(type) { case *tcp.Endpoint: s.mu.Lock() s.readWriter.Init(ctx, dst) res, err = s.Endpoint.Read(&s.readWriter, readOptions) s.mu.Unlock() default: res, err = s.Endpoint.Read(dst.Writer(ctx), readOptions) } } if _, ok := err.(*tcpip.ErrBadBuffer); ok && dst.NumBytes() == 0 { err = nil } if err != nil { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) } // Set the control message, even if 0 bytes were read. s.updateTimestamp(res.ControlMessages) if isPacket { var addr linux.SockAddr var addrLen uint32 if senderRequested { addr, addrLen = socket.ConvertAddress(s.family, res.RemoteAddr) switch v := addr.(type) { case *linux.SockAddrLink: v.Protocol = socket.Htons(uint16(res.LinkPacketInfo.Protocol)) v.PacketType = toLinuxPacketType(res.LinkPacketInfo.PktType) } } msgLen := res.Count if trunc { msgLen = res.Total } var flags int if res.Total > res.Count { flags |= linux.MSG_TRUNC } return msgLen, flags, addr, addrLen, s.netstackToLinuxControlMessages(res.ControlMessages), nil } if peek { // MSG_TRUNC with MSG_PEEK on a TCP socket returns the // amount that could be read, and does not write to buffer. if trunc { // TCP endpoint does not return the total bytes in buffer as numTotal. 
// We need to query it from socket option. rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) if err != nil { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) } msgLen := int(dst.NumBytes()) if msgLen > rql { msgLen = rql } return msgLen, 0, nil, 0, socket.ControlMessages{}, nil } } else if n := res.Count; n != 0 { s.Endpoint.ModerateRecvBuf(n) } cmsg := s.netstackToLinuxControlMessages(res.ControlMessages) s.fillCmsgInq(&cmsg) return res.Count, 0, nil, 0, cmsg, syserr.TranslateNetstackError(err) } func (s *sock) netstackToLinuxControlMessages(cm tcpip.ReceivableControlMessages) socket.ControlMessages { readCM := socket.NewIPControlMessages(s.family, cm) return socket.ControlMessages{ IP: socket.IPControlMessages{ HasTimestamp: readCM.HasTimestamp && s.sockOptTimestamp, Timestamp: readCM.Timestamp, HasInq: readCM.HasInq, Inq: readCM.Inq, HasTOS: readCM.HasTOS, TOS: readCM.TOS, HasTClass: readCM.HasTClass, TClass: readCM.TClass, HasTTL: readCM.HasTTL, TTL: readCM.TTL, HasHopLimit: readCM.HasHopLimit, HopLimit: readCM.HopLimit, HasIPPacketInfo: readCM.HasIPPacketInfo, PacketInfo: readCM.PacketInfo, HasIPv6PacketInfo: readCM.HasIPv6PacketInfo, IPv6PacketInfo: readCM.IPv6PacketInfo, OriginalDstAddress: readCM.OriginalDstAddress, SockErr: readCM.SockErr, }, } } func (s *sock) linuxToNetstackControlMessages(cm socket.ControlMessages) tcpip.SendableControlMessages { return tcpip.SendableControlMessages{ HasTTL: cm.IP.HasTTL, TTL: uint8(cm.IP.TTL), HasHopLimit: cm.IP.HasHopLimit, HopLimit: uint8(cm.IP.HopLimit), } } // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after // successfully writing packet data out to userspace. // // Precondition: s.readMu must be locked. func (s *sock) updateTimestamp(cm tcpip.ReceivableControlMessages) { // Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled. if !s.sockOptTimestamp { s.timestampValid = true s.timestamp = cm.Timestamp } } // dequeueErr is analogous to net/core/skbuff.c:sock_dequeue_err_skb(). func (s *sock) dequeueErr() *tcpip.SockError { so := s.Endpoint.SocketOptions() err := so.DequeueErr() if err == nil { return nil } // Update socket error to reflect ICMP errors in queue. if nextErr := so.PeekErr(); nextErr != nil && nextErr.Cause.Origin().IsICMPErr() { so.SetLastError(nextErr.Err) } else if err.Cause.Origin().IsICMPErr() { so.SetLastError(nil) } return err } // addrFamilyFromNetProto returns the address family identifier for the given // network protocol. func addrFamilyFromNetProto(net tcpip.NetworkProtocolNumber) int { switch net { case header.IPv4ProtocolNumber: return linux.AF_INET case header.IPv6ProtocolNumber: return linux.AF_INET6 default: panic(fmt.Sprintf("invalid net proto for addr family inference: %d", net)) } } // recvErr handles MSG_ERRQUEUE for recvmsg(2). // This is analogous to net/ipv4/ip_sockglue.c:ip_recv_error(). func (s *sock) recvErr(t *kernel.Task, dst usermem.IOSequence) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { sockErr := s.dequeueErr() if sockErr == nil { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } if sockErr.Payload != nil { defer sockErr.Payload.Release() } // The payload of the original packet that caused the error is passed as // normal data via msg_iovec. 
-- recvmsg(2) msgFlags := linux.MSG_ERRQUEUE if int(dst.NumBytes()) < sockErr.Payload.Size() { msgFlags |= linux.MSG_TRUNC } n, err := dst.CopyOut(t, sockErr.Payload.AsSlice()) // The original destination address of the datagram that caused the error is // supplied via msg_name. -- recvmsg(2) dstAddr, dstAddrLen := socket.ConvertAddress(addrFamilyFromNetProto(sockErr.NetProto), sockErr.Dst) cmgs := socket.ControlMessages{IP: socket.NewIPControlMessages(s.family, tcpip.ReceivableControlMessages{SockErr: sockErr})} return n, msgFlags, dstAddr, dstAddrLen, cmgs, syserr.FromError(err) } // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // tcpip.Endpoint. func (s *sock) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, _ uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { if flags&linux.MSG_ERRQUEUE != 0 { return s.recvErr(t, dst) } trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 waitAll := flags&linux.MSG_WAITALL != 0 if senderRequested && !s.isPacketBased() { // Stream sockets ignore the sender address. senderRequested = false } n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 { // In this situation we should return EAGAIN. return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } if err != nil && (err != syserr.ErrWouldBlock || dontWait) { // Read failed and we should not retry. return 0, 0, nil, 0, socket.ControlMessages{}, err } if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) { // We got all the data we need. return } // Don't overwrite any data we received. dst = dst.DropFirst(n) // We'll have to block. Register for notifications and keep trying to // send all the data. e, ch := waiter.NewChannelEntry(waiter.ReadableEvents) s.EventRegister(&e) defer s.EventUnregister(&e) for { var rn int rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) n += rn if err != nil && err != syserr.ErrWouldBlock { // Always stop on errors other than would block as we generally // won't be able to get any more data. Eat the error if we got // any data. if n > 0 { err = nil } return } if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) { // We got all the data we need. return } dst = dst.DropFirst(rn) if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if n > 0 { return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil } if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } // SendMsg implements the linux syscall sendmsg(2) for sockets backed by // tcpip.Endpoint. func (s *sock) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Reject Unix control messages. 
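// controlMessages.Unix carries AF_UNIX-only payloads (for example passed file
// descriptors); netstack-backed inet sockets cannot transmit them, so a
// non-empty set is treated as an invalid argument below.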
if !controlMessages.Unix.Empty() { return 0, syserr.ErrInvalidArgument } var addr *tcpip.FullAddress if len(to) > 0 { addrBuf, family, err := socket.AddressAndFamily(to) if err != nil { return 0, err } if !s.checkFamily(family, false /* exact */) { return 0, syserr.ErrInvalidArgument } addrBuf = s.mapFamily(addrBuf, family) addr = &addrBuf } opts := tcpip.WriteOptions{ To: addr, More: flags&linux.MSG_MORE != 0, EndOfRecord: flags&linux.MSG_EOR != 0, ControlMessages: s.linuxToNetstackControlMessages(controlMessages), } r := src.Reader(t) var ( total int64 entry waiter.Entry ch <-chan struct{} ) for { n, err := s.Endpoint.Write(r, opts) total += n if flags&linux.MSG_DONTWAIT != 0 { return int(total), syserr.TranslateNetstackError(err) } block := true switch err.(type) { case nil: block = total != src.NumBytes() case *tcpip.ErrWouldBlock: default: block = false } if block { if ch == nil { // We'll have to block. Register for notification and keep trying to // send all the data. entry, ch = waiter.NewChannelEntry(waiter.WritableEvents) s.EventRegister(&entry) defer s.EventUnregister(&entry) } else { // Don't wait immediately after registration in case more data // became available between when we last checked and when we setup // the notification. if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return int(total), syserr.ErrTryAgain } // handleIOError will consume errors from t.Block if needed. return int(total), syserr.FromError(err) } } continue } return int(total), syserr.TranslateNetstackError(err) } } // Ioctl implements vfs.FileDescriptionImpl. func (s *sock) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { t := kernel.TaskFromContext(ctx) if t == nil { panic("ioctl(2) may only be called from a task goroutine") } // SIOCGSTAMP is implemented by netstack rather than all commonEndpoint // sockets. // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. switch args[1].Int() { case linux.SIOCGSTAMP: s.readMu.Lock() defer s.readMu.Unlock() if !s.timestampValid { return 0, linuxerr.ENOENT } tv := linux.NsecToTimeval(s.timestamp.UnixNano()) _, err := tv.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCINQ: v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) if terr != nil { return 0, syserr.TranslateNetstackError(terr).ToError() } if v > math.MaxInt32 { v = math.MaxInt32 } // Copy result to userspace. vP := primitive.Int32(v) _, err := vP.CopyOut(t, args[2].Pointer()) return 0, err } return Ioctl(ctx, s.Endpoint, uio, sysno, args) } // Ioctl performs a socket ioctl. 
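// It covers the interface queries shared by netstack sockets (SIOCGIF*,
// SIOCGIFCONF) and the queue-length probes TIOCINQ/TIOCOUTQ; any other
// request falls through to ENOTTY.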
func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { t := kernel.TaskFromContext(ctx) if t == nil { panic("ioctl(2) may only be called from a task goroutine") } switch arg := int(args[1].Int()); arg { case linux.SIOCGIFFLAGS, linux.SIOCGIFADDR, linux.SIOCGIFBRDADDR, linux.SIOCGIFDSTADDR, linux.SIOCGIFHWADDR, linux.SIOCGIFINDEX, linux.SIOCGIFMAP, linux.SIOCGIFMETRIC, linux.SIOCGIFMTU, linux.SIOCGIFNAME, linux.SIOCGIFNETMASK, linux.SIOCGIFTXQLEN, linux.SIOCETHTOOL: var ifr linux.IFReq if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil { return 0, err } if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil { return 0, err.ToError() } _, err := ifr.CopyOut(t, args[2].Pointer()) return 0, err case linux.SIOCGIFCONF: // Return a list of interface addresses or the buffer size // necessary to hold the list. var ifc linux.IFConf if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil { return 0, err } if err := ifconfIoctl(ctx, t, io, &ifc); err != nil { return 0, err } _, err := ifc.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCINQ: v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption) if terr != nil { return 0, syserr.TranslateNetstackError(terr).ToError() } if v > math.MaxInt32 { v = math.MaxInt32 } // Copy result to userspace. vP := primitive.Int32(v) _, err := vP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCOUTQ: v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption) if terr != nil { return 0, syserr.TranslateNetstackError(terr).ToError() } if v > math.MaxInt32 { v = math.MaxInt32 } // Copy result to userspace. vP := primitive.Int32(v) _, err := vP.CopyOut(t, args[2].Pointer()) return 0, err case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: // Not supported. } return 0, linuxerr.ENOTTY } // interfaceIoctl implements interface requests. func interfaceIoctl(ctx context.Context, _ usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error { var ( iface inet.Interface index int32 found bool ) // Find the relevant device. stk := inet.StackFromContext(ctx) if stk == nil { return syserr.ErrNoDevice } // SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to // identify a device. if arg == linux.SIOCGIFNAME { // Gets the name of the interface given the interface index // stored in ifr_ifindex. index = int32(hostarch.ByteOrder.Uint32(ifr.Data[:4])) if iface, ok := stk.Interfaces()[index]; ok { ifr.SetName(iface.Name) return nil } return syserr.ErrNoDevice } // Find the relevant device. for index, iface = range stk.Interfaces() { if iface.Name == ifr.Name() { found = true break } } if !found { return syserr.ErrNoDevice } switch arg { case linux.SIOCGIFINDEX: // Copy out the index to the data. hostarch.ByteOrder.PutUint32(ifr.Data[:], uint32(index)) case linux.SIOCGIFHWADDR: // Copy the hardware address out. // // Refer: https://linux.die.net/man/7/netdevice // SIOCGIFHWADDR, SIOCSIFHWADDR // // Get or set the hardware address of a device using // ifr_hwaddr. The hardware address is specified in a struct // sockaddr. sa_family contains the ARPHRD_* device type, // sa_data the L2 hardware address starting from byte 0. Setting // the hardware address is a privileged operation. hostarch.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType) n := copy(ifr.Data[2:], iface.Addr) for i := 2 + n; i < len(ifr.Data); i++ { ifr.Data[i] = 0 // Clear padding. 
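// The resulting ifr_hwaddr follows the struct sockaddr layout described
// above: bytes 0-1 hold the ARPHRD_* device type, the link-layer address
// starts at byte 2, and the remainder of the buffer is zeroed here.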
} case linux.SIOCGIFFLAGS: f, err := interfaceStatusFlags(stk, iface.Name) if err != nil { return err } // Drop the flags that don't fit in the size that we need to return. This // matches Linux behavior. hostarch.ByteOrder.PutUint16(ifr.Data[:2], uint16(f)) case linux.SIOCGIFADDR: // Copy the IPv4 address out. for _, addr := range stk.InterfaceAddrs()[index] { // This ioctl is only compatible with AF_INET addresses. if addr.Family != linux.AF_INET { continue } copy(ifr.Data[4:8], addr.Addr) break } case linux.SIOCGIFMETRIC: // Gets the metric of the device. As per netdevice(7), this // always just sets ifr_metric to 0. hostarch.ByteOrder.PutUint32(ifr.Data[:4], 0) case linux.SIOCGIFMTU: // Gets the MTU of the device. hostarch.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU) case linux.SIOCGIFMAP: // Gets the hardware parameters of the device. // TODO(gvisor.dev/issue/505): Implement. case linux.SIOCGIFTXQLEN: // Gets the transmit queue length of the device. // TODO(gvisor.dev/issue/505): Implement. case linux.SIOCGIFDSTADDR: // Gets the destination address of a point-to-point device. // TODO(gvisor.dev/issue/505): Implement. case linux.SIOCGIFBRDADDR: // Gets the broadcast address of a device. // TODO(gvisor.dev/issue/505): Implement. case linux.SIOCGIFNETMASK: // Gets the network mask of a device. for _, addr := range stk.InterfaceAddrs()[index] { // This ioctl is only compatible with AF_INET addresses. if addr.Family != linux.AF_INET { continue } // Populate ifr.ifr_netmask (type sockaddr). hostarch.ByteOrder.PutUint16(ifr.Data[0:], uint16(linux.AF_INET)) hostarch.ByteOrder.PutUint16(ifr.Data[2:], 0) var mask uint32 = 0xffffffff << (32 - addr.PrefixLen) // Netmask is expected to be returned as a big endian // value. binary.BigEndian.PutUint32(ifr.Data[4:8], mask) break } case linux.SIOCETHTOOL: // Stubbed out for now, Ideally we should implement the required // sub-commands for ETHTOOL // // See: // https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c return syserr.ErrEndpointOperation default: // Not a valid call. return syserr.ErrInvalidArgument } return nil } // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl. func ifconfIoctl(ctx context.Context, t *kernel.Task, _ usermem.IO, ifc *linux.IFConf) error { // If Ptr is NULL, return the necessary buffer size via Len. // Otherwise, write up to Len bytes starting at Ptr containing ifreq // structs. stk := inet.StackFromContext(ctx) if stk == nil { return syserr.ErrNoDevice.ToError() } if ifc.Ptr == 0 { ifc.Len = int32(len(stk.Interfaces())) * int32(linux.SizeOfIFReq) return nil } max := ifc.Len ifc.Len = 0 for key, ifaceAddrs := range stk.InterfaceAddrs() { iface := stk.Interfaces()[key] for _, ifaceAddr := range ifaceAddrs { // Don't write past the end of the buffer. if ifc.Len+int32(linux.SizeOfIFReq) > max { break } if ifaceAddr.Family != linux.AF_INET { continue } // Populate ifr.ifr_addr. ifr := linux.IFReq{} ifr.SetName(iface.Name) hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family)) hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0) copy(ifr.Data[4:8], ifaceAddr.Addr[:4]) // Copy the ifr to userspace. dst := uintptr(ifc.Ptr) + uintptr(ifc.Len) ifc.Len += int32(linux.SizeOfIFReq) if _, err := ifr.CopyOut(t, hostarch.Addr(dst)); err != nil { return err } } } return nil } // interfaceStatusFlags returns status flags for an interface in the stack. // Flag values and meanings are described in greater detail in netdevice(7) in // the SIOCGIFFLAGS section. 
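// As a rough illustration, an up-and-running loopback NIC maps to
// IFF_UP|IFF_LOWER_UP|IFF_RUNNING|IFF_LOOPBACK (see nicStateFlagsToLinux).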
func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) { // We should only ever be passed a netstack.Stack. epstack, ok := stack.(*Stack) if !ok { return 0, errStackType } // Find the NIC corresponding to this interface. for _, info := range epstack.Stack.NICInfo() { if info.Name == name { return nicStateFlagsToLinux(info.Flags), nil } } return 0, syserr.ErrNoDevice } func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { var rv uint32 if f.Up { rv |= linux.IFF_UP | linux.IFF_LOWER_UP } if f.Running { rv |= linux.IFF_RUNNING } if f.Promiscuous { rv |= linux.IFF_PROMISC } if f.Loopback { rv |= linux.IFF_LOOPBACK } return rv } // State implements socket.Socket.State. State translates the internal state // returned by netstack to values defined by Linux. func (s *sock) State() uint32 { if s.family != linux.AF_INET && s.family != linux.AF_INET6 { // States not implemented for this socket's family. return 0 } switch { case socket.IsTCP(s): // TCP socket. switch tcp.EndpointState(s.Endpoint.State()) { case tcp.StateEstablished: return linux.TCP_ESTABLISHED case tcp.StateSynSent: return linux.TCP_SYN_SENT case tcp.StateSynRecv: return linux.TCP_SYN_RECV case tcp.StateFinWait1: return linux.TCP_FIN_WAIT1 case tcp.StateFinWait2: return linux.TCP_FIN_WAIT2 case tcp.StateTimeWait: return linux.TCP_TIME_WAIT case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError: return linux.TCP_CLOSE case tcp.StateCloseWait: return linux.TCP_CLOSE_WAIT case tcp.StateLastAck: return linux.TCP_LAST_ACK case tcp.StateListen: return linux.TCP_LISTEN case tcp.StateClosing: return linux.TCP_CLOSING default: // Internal or unknown state. return 0 } case socket.IsUDP(s): // UDP socket. switch transport.DatagramEndpointState(s.Endpoint.State()) { case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateBound, transport.DatagramEndpointStateClosed: return linux.TCP_CLOSE case transport.DatagramEndpointStateConnected: return linux.TCP_ESTABLISHED default: return 0 } case socket.IsICMP(s): // We don't support this yet. case socket.IsRaw(s): // We don't support this yet. default: // Unknown transport protocol, how did we make this socket? log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem()) return 0 } return 0 } // Type implements socket.Socket.Type. func (s *sock) Type() (family int, skType linux.SockType, protocol int) { return s.family, s.skType, s.protocol } // EventRegister implements waiter.Waitable. func (s *sock) EventRegister(e *waiter.Entry) error { s.Queue.EventRegister(e) return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (s *sock) EventUnregister(e *waiter.Entry) { s.Queue.EventUnregister(e) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netstack/netstack_state.go000066400000000000000000000016051465435605700274210ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package netstack import ( "context" "time" ) func (s *sock) saveTimestamp() int64 { s.readMu.Lock() defer s.readMu.Unlock() return s.timestamp.UnixNano() } func (s *sock) loadTimestamp(_ context.Context, nsec int64) { s.readMu.Lock() defer s.readMu.Unlock() s.timestamp = time.Unix(0, nsec) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netstack/netstack_state_autogen.go000066400000000000000000000052731465435605700311500ustar00rootroot00000000000000// automatically generated by stateify. package netstack import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (s *sock) StateTypeName() string { return "pkg/sentry/socket/netstack.sock" } func (s *sock) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "LockFD", "SendReceiveTimeout", "Queue", "family", "Endpoint", "skType", "protocol", "namespace", "sockOptTimestamp", "timestampValid", "timestamp", "sockOptInq", } } func (s *sock) beforeSave() {} // +checklocksignore func (s *sock) StateSave(stateSinkObject state.Sink) { s.beforeSave() var timestampValue int64 timestampValue = s.saveTimestamp() stateSinkObject.SaveValue(13, timestampValue) stateSinkObject.Save(0, &s.vfsfd) stateSinkObject.Save(1, &s.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &s.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &s.LockFD) stateSinkObject.Save(4, &s.SendReceiveTimeout) stateSinkObject.Save(5, &s.Queue) stateSinkObject.Save(6, &s.family) stateSinkObject.Save(7, &s.Endpoint) stateSinkObject.Save(8, &s.skType) stateSinkObject.Save(9, &s.protocol) stateSinkObject.Save(10, &s.namespace) stateSinkObject.Save(11, &s.sockOptTimestamp) stateSinkObject.Save(12, &s.timestampValid) stateSinkObject.Save(14, &s.sockOptInq) } func (s *sock) afterLoad(context.Context) {} // +checklocksignore func (s *sock) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.vfsfd) stateSourceObject.Load(1, &s.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &s.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &s.LockFD) stateSourceObject.Load(4, &s.SendReceiveTimeout) stateSourceObject.Load(5, &s.Queue) stateSourceObject.Load(6, &s.family) stateSourceObject.Load(7, &s.Endpoint) stateSourceObject.Load(8, &s.skType) stateSourceObject.Load(9, &s.protocol) stateSourceObject.Load(10, &s.namespace) stateSourceObject.Load(11, &s.sockOptTimestamp) stateSourceObject.Load(12, &s.timestampValid) stateSourceObject.Load(14, &s.sockOptInq) stateSourceObject.LoadValue(13, new(int64), func(y any) { s.loadTimestamp(ctx, y.(int64)) }) } func (s *Stack) StateTypeName() string { return "pkg/sentry/socket/netstack.Stack" } func (s *Stack) StateFields() []string { return []string{} } func (s *Stack) beforeSave() {} // +checklocksignore func (s *Stack) StateSave(stateSinkObject state.Sink) { s.beforeSave() } // +checklocksignore func (s *Stack) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.AfterLoad(func() { s.afterLoad(ctx) }) } func init() { state.Register((*sock)(nil)) state.Register((*Stack)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netstack/provider.go000066400000000000000000000143761465435605700262500ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netstack import ( "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/waiter" ) // provider is an inet socket provider. type provider struct { family int netProto tcpip.NetworkProtocolNumber } var rawMissingLogger = log.BasicRateLimitedLogger(time.Minute) // getTransportProtocol figures out transport protocol. Currently only TCP, // UDP, and ICMP are supported. The bool return value is true when this socket // is associated with a transport protocol. This is only false for SOCK_RAW, // IPPROTO_IP sockets. func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol int) (tcpip.TransportProtocolNumber, bool, *syserr.Error) { switch stype { case linux.SOCK_STREAM: if protocol != 0 && protocol != unix.IPPROTO_TCP { return 0, true, syserr.ErrInvalidArgument } return tcp.ProtocolNumber, true, nil case linux.SOCK_DGRAM: switch protocol { case 0, unix.IPPROTO_UDP: return udp.ProtocolNumber, true, nil case unix.IPPROTO_ICMP: return header.ICMPv4ProtocolNumber, true, nil case unix.IPPROTO_ICMPV6: return header.ICMPv6ProtocolNumber, true, nil } case linux.SOCK_RAW: // Raw sockets require CAP_NET_RAW. creds := auth.CredentialsFromContext(ctx) if !creds.HasCapability(linux.CAP_NET_RAW) { rawMissingLogger.Infof("A process tried to create a raw socket without CAP_NET_RAW. Should the container config enable CAP_NET_RAW?") return 0, true, syserr.ErrNotPermitted } switch protocol { case unix.IPPROTO_ICMP: return header.ICMPv4ProtocolNumber, true, nil case unix.IPPROTO_ICMPV6: return header.ICMPv6ProtocolNumber, true, nil case unix.IPPROTO_UDP: return header.UDPProtocolNumber, true, nil case unix.IPPROTO_TCP: return header.TCPProtocolNumber, true, nil // IPPROTO_RAW signifies that the raw socket isn't assigned to // a transport protocol. Users will be able to write packets' // IP headers and won't receive anything. case unix.IPPROTO_RAW: return tcpip.TransportProtocolNumber(0), false, nil } } return 0, true, syserr.ErrProtocolNotSupported } // Socket creates a new socket object for the AF_INET, AF_INET6, or AF_PACKET // family. func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { // Fail right away if we don't have a stack. stack := t.NetworkContext() if stack == nil { // Don't propagate an error here. Instead, allow the socket // code to continue searching for another provider. return nil, nil } eps, ok := stack.(*Stack) if !ok { return nil, nil } // Packet sockets are handled separately, since they are neither INET // nor INET6 specific. 
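// For non-packet families the transport protocol is derived from the
// (type, protocol) pair further below; illustrative examples:
//
//	socket(AF_INET, SOCK_STREAM, 0)            -> TCP endpoint
//	socket(AF_INET, SOCK_DGRAM,  IPPROTO_ICMP) -> ping-style ICMPv4 endpoint
//	socket(AF_INET, SOCK_RAW,    IPPROTO_RAW)  -> raw endpoint with no associated
//	                                              transport (requires CAP_NET_RAW)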
if p.family == linux.AF_PACKET { return packetSocket(t, eps, stype, protocol) } // Figure out the transport protocol. transProto, associated, err := getTransportProtocol(t, stype, protocol) if err != nil { return nil, err } // Create the endpoint. var ep tcpip.Endpoint var e tcpip.Error wq := &waiter.Queue{} if stype == linux.SOCK_RAW { ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) } else { ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq) // Assign task to PacketOwner interface to get the UID and GID for // iptables owner matching. if e == nil { ep.SetOwner(t) } } if e != nil { return nil, syserr.TranslateNetstackError(e) } return New(t, p.family, stype, int(transProto), wq, ep) } func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { // Packet sockets require CAP_NET_RAW. creds := auth.CredentialsFromContext(t) if !creds.HasCapability(linux.CAP_NET_RAW) { rawMissingLogger.Infof("A process tried to create a raw socket without CAP_NET_RAW. Should the container config enable CAP_NET_RAW?") return nil, syserr.ErrNotPermitted } // "cooked" packets don't contain link layer information. var cooked bool switch stype { case linux.SOCK_DGRAM: cooked = true case linux.SOCK_RAW: cooked = false default: return nil, syserr.ErrProtocolNotSupported } // protocol is passed in network byte order, but netstack wants it in // host order. netProto := tcpip.NetworkProtocolNumber(socket.Ntohs(uint16(protocol))) wq := &waiter.Queue{} ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq) if err != nil { if _, ok := err.(*tcpip.ErrNotPermitted); ok { rawMissingLogger.Infof("A process tried to create a raw socket, which is disabled by default. Should the runtime config enable --net-raw?") } return nil, syserr.TranslateNetstackError(err) } return New(t, linux.AF_PACKET, stype, protocol, wq, ep) } // Pair just returns nil sockets (not supported). func (*provider) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { return nil, nil, nil } // init registers socket providers for AF_INET, AF_INET6, and AF_PACKET. func init() { // Providers backed by netstack. p := []provider{ { family: linux.AF_INET, netProto: ipv4.ProtocolNumber, }, { family: linux.AF_INET6, netProto: ipv6.ProtocolNumber, }, { family: linux.AF_PACKET, }, } for i := range p { socket.RegisterProvider(p[i].family, &p[i]) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netstack/save_restore.go000066400000000000000000000015761465435605700271150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netstack import ( "context" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // afterLoad is invoked by stateify. 
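// It reattaches the underlying netstack stack.Stack from the restore
// context; restoring without one is treated as fatal.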
func (s *Stack) afterLoad(ctx context.Context) { s.Stack = stack.RestoreStackFromContext(ctx) if s.Stack == nil { panic("can't restore without netstack/tcpip/stack.Stack") } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netstack/stack.go000066400000000000000000000660671465435605700255270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netstack import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/nlmsg" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket" "gvisor.dev/gvisor/pkg/tcpip/link/veth" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" ) // Stack implements inet.Stack for netstack/tcpip/stack.Stack. // // +stateify savable type Stack struct { Stack *stack.Stack `state:"manual"` } // Destroy implements inet.Stack.Destroy. func (s *Stack) Destroy() { s.Stack.Close() refs.CleanupSync.Add(1) go func() { s.Stack.Wait() refs.CleanupSync.Done() }() } // SupportsIPv6 implements Stack.SupportsIPv6. func (s *Stack) SupportsIPv6() bool { return s.Stack.CheckNetworkProtocol(ipv6.ProtocolNumber) } // Converts Netstack's ARPHardwareType to equivalent linux constants. func toLinuxARPHardwareType(t header.ARPHardwareType) uint16 { switch t { case header.ARPHardwareNone: return linux.ARPHRD_NONE case header.ARPHardwareLoopback: return linux.ARPHRD_LOOPBACK case header.ARPHardwareEther: return linux.ARPHRD_ETHER default: panic(fmt.Sprintf("unknown ARPHRD type: %d", t)) } } // Interfaces implements inet.Stack.Interfaces. func (s *Stack) Interfaces() map[int32]inet.Interface { is := make(map[int32]inet.Interface) for id, ni := range s.Stack.NICInfo() { is[int32(id)] = inet.Interface{ Name: ni.Name, Addr: []byte(ni.LinkAddress), Flags: uint32(nicStateFlagsToLinux(ni.Flags)), DeviceType: toLinuxARPHardwareType(ni.ARPHardwareType), MTU: ni.MTU, } } return is } // RemoveInterface implements inet.Stack.RemoveInterface. func (s *Stack) RemoveInterface(idx int32) error { nic := tcpip.NICID(idx) nicInfo, ok := s.Stack.NICInfo()[nic] if !ok { return syserr.ErrUnknownNICID.ToError() } // Don't allow removing the loopback interface. if nicInfo.Flags.Loopback { return syserr.ErrNotSupported.ToError() } return syserr.TranslateNetstackError(s.Stack.RemoveNIC(nic)).ToError() } // SetInterface implements inet.Stack.SetInterface. 
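// The message is a netlink link request: an InterfaceInfoMessage followed by
// IFLA_* attributes. A zero interface index combined with NLM_F_CREATE creates
// a new link (currently "veth" or "bridge"); otherwise the attributes are
// applied to the existing NIC via setLink.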
func (s *Stack) SetInterface(ctx context.Context, msg *nlmsg.Message) *syserr.Error { var ifinfomsg linux.InterfaceInfoMessage attrsView, ok := msg.GetData(&ifinfomsg) if !ok { return syserr.ErrInvalidArgument } attrs, ok := attrsView.Parse() if !ok { return syserr.ErrInvalidArgument } ifname := "" for attr := range attrs { value := attrs[attr] switch attr { case linux.IFLA_IFNAME: if len(value) < 1 { return syserr.ErrInvalidArgument } if ifinfomsg.Index == 0 { ifname = value.String() for idx, ifa := range s.Interfaces() { if ifname == ifa.Name { ifinfomsg.Index = idx break } } } case linux.IFLA_MASTER: case linux.IFLA_LINKINFO: case linux.IFLA_ADDRESS: case linux.IFLA_MTU: case linux.IFLA_NET_NS_FD: case linux.IFLA_TXQLEN: default: ctx.Warningf("unexpected attribute: %x", attr) return syserr.ErrNotSupported } } flags := msg.Header().Flags if ifinfomsg.Index == 0 { if flags&linux.NLM_F_CREATE != 0 { return s.newInterface(ctx, msg, attrs) } return syserr.ErrNoDevice } if flags&(linux.NLM_F_EXCL|linux.NLM_F_REPLACE) != 0 { return syserr.ErrExists } if ifinfomsg.Flags != 0 || ifinfomsg.Change != 0 { if ifinfomsg.Change & ^uint32(linux.IFF_UP) != 0 { ctx.Warningf("Unsupported ifi_change flags: %x", ifinfomsg.Change) return syserr.ErrInvalidArgument } if ifinfomsg.Flags & ^uint32(linux.IFF_UP) != 0 { ctx.Warningf("Unsupported ifi_flags: %x", ifinfomsg.Change) return syserr.ErrInvalidArgument } // Netstack interfaces are always up. } return s.setLink(ctx, tcpip.NICID(ifinfomsg.Index), attrs) } func (s *Stack) setLink(ctx context.Context, id tcpip.NICID, linkAttrs map[uint16]nlmsg.BytesView) *syserr.Error { // IFLA_NET_NS_FD has to be handled first, because other parameters may be reseted. if v, ok := linkAttrs[linux.IFLA_NET_NS_FD]; ok { fd, ok := v.Uint32() if !ok { return syserr.ErrInvalidArgument } f := inet.NamespaceByFDFromContext(ctx) if f == nil { return syserr.ErrInvalidArgument } ns, err := f(int32(fd)) if err != nil { return syserr.FromError(err) } defer ns.DecRef(ctx) peer := ns.Stack().(*Stack) if peer.Stack != s.Stack { var err tcpip.Error id, err = s.Stack.SetNICStack(id, peer.Stack) if err != nil { return syserr.TranslateNetstackError(err) } } } for t, v := range linkAttrs { switch t { case linux.IFLA_MASTER: master, ok := v.Uint32() if !ok { return syserr.ErrInvalidArgument } if master != 0 { if err := s.Stack.SetNICCoordinator(id, tcpip.NICID(master)); err != nil { return syserr.TranslateNetstackError(err) } } case linux.IFLA_ADDRESS: if len(v) != tcpip.LinkAddressSize { return syserr.ErrInvalidArgument } addr := tcpip.LinkAddress(v) if err := s.Stack.SetNICAddress(id, addr); err != nil { return syserr.TranslateNetstackError(err) } case linux.IFLA_IFNAME: if err := s.Stack.SetNICName(id, v.String()); err != nil { return syserr.TranslateNetstackError(err) } case linux.IFLA_MTU: mtu, ok := v.Uint32() if !ok { return syserr.ErrInvalidArgument } if err := s.Stack.SetNICMTU(id, mtu); err != nil { return syserr.TranslateNetstackError(err) } case linux.IFLA_TXQLEN: // TODO(b/340388892): support IFLA_TXQLEN. 
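// Accepted but not applied yet; see the TODO above.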
} } return nil } const defaultMTU = 1500 func (s *Stack) newVeth(ctx context.Context, linkAttrs map[uint16]nlmsg.BytesView, linkInfoAttrs map[uint16]nlmsg.BytesView) *syserr.Error { var ( linkInfoData map[uint16]nlmsg.BytesView ifinfomsg linux.InterfaceInfoMessage peerLinkAttrs map[uint16]nlmsg.BytesView ) peerStack := s peerName := "" ifname := "" if v, ok := linkAttrs[linux.IFLA_IFNAME]; ok { ifname = v.String() } if value, ok := linkInfoAttrs[linux.IFLA_INFO_DATA]; ok { linkInfoData, ok = nlmsg.AttrsView(value).Parse() if !ok { return syserr.ErrInvalidArgument } if v, ok := linkInfoData[linux.VETH_INFO_PEER]; ok { attrsView := nlmsg.AttrsView(v[ifinfomsg.SizeBytes():]) if !ok { return syserr.ErrInvalidArgument } peerLinkAttrs, ok = attrsView.Parse() if !ok { return syserr.ErrInvalidArgument } if v, ok = peerLinkAttrs[linux.IFLA_IFNAME]; ok { peerName = v.String() } if v, ok = peerLinkAttrs[linux.IFLA_NET_NS_FD]; ok { fd, ok := v.Uint32() if !ok { return syserr.ErrInvalidArgument } f := inet.NamespaceByFDFromContext(ctx) if f == nil { return syserr.ErrInvalidArgument } ns, err := f(int32(fd)) if err != nil { return syserr.FromError(err) } defer ns.DecRef(ctx) peerStack = ns.Stack().(*Stack) } } } ep, peerEP := veth.NewPair(defaultMTU) id := s.Stack.NextNICID() peerID := peerStack.Stack.NextNICID() if ifname == "" { ifname = fmt.Sprintf("veth%d", id) } err := s.Stack.CreateNICWithOptions(id, packetsocket.New(ethernet.New(ep)), stack.NICOptions{ Name: ifname, }) if err != nil { return syserr.TranslateNetstackError(err) } if err := s.setLink(ctx, id, linkAttrs); err != nil { peerEP.Close() return err } if peerName == "" { peerName = fmt.Sprintf("veth%d", peerID) } err = peerStack.Stack.CreateNICWithOptions(peerID, packetsocket.New(ethernet.New(peerEP)), stack.NICOptions{ Name: peerName, }) if err != nil { peerEP.Close() return syserr.TranslateNetstackError(err) } if peerLinkAttrs != nil { if err := peerStack.setLink(ctx, peerID, peerLinkAttrs); err != nil { peerStack.Stack.RemoveNIC(peerID) peerEP.Close() return err } } return nil } func (s *Stack) newBridge(ctx context.Context, linkAttrs map[uint16]nlmsg.BytesView, linkInfoAttrs map[uint16]nlmsg.BytesView) *syserr.Error { ifname := "" if v, ok := linkAttrs[linux.IFLA_IFNAME]; ok { ifname = v.String() } ep := stack.NewBridgeEndpoint(defaultMTU) id := s.Stack.NextNICID() err := s.Stack.CreateNICWithOptions(id, ep, stack.NICOptions{ Name: ifname, }) if err != nil { return syserr.TranslateNetstackError(err) } if err := s.setLink(ctx, id, linkAttrs); err != nil { return err } return nil } func (s *Stack) newInterface(ctx context.Context, msg *nlmsg.Message, linkAttrs map[uint16]nlmsg.BytesView) *syserr.Error { var ( linkInfoAttrs map[uint16]nlmsg.BytesView kind string ) if v, ok := linkAttrs[linux.IFLA_LINKINFO]; ok { linkInfoAttrs, ok = nlmsg.AttrsView(v).Parse() if !ok { return syserr.ErrInvalidArgument } for attr := range linkInfoAttrs { value := linkInfoAttrs[attr] switch attr { case linux.IFLA_INFO_KIND: kind = value.String() case linux.IFLA_INFO_DATA: default: ctx.Warningf("unexpected link info attribute: %x", attr) return syserr.ErrNotSupported } } } switch kind { case "": return syserr.ErrInvalidArgument case "bridge": return s.newBridge(ctx, linkAttrs, linkInfoAttrs) case "veth": return s.newVeth(ctx, linkAttrs, linkInfoAttrs) } return syserr.ErrNotSupported } // InterfaceAddrs implements inet.Stack.InterfaceAddrs. 
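// It returns the AF_INET and AF_INET6 addresses assigned to each NIC, keyed
// by interface index; addresses of unknown protocols are skipped with a warning.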
func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { nicAddrs := make(map[int32][]inet.InterfaceAddr) for id, ni := range s.Stack.NICInfo() { var addrs []inet.InterfaceAddr for _, a := range ni.ProtocolAddresses { var family uint8 switch a.Protocol { case ipv4.ProtocolNumber: family = linux.AF_INET case ipv6.ProtocolNumber: family = linux.AF_INET6 default: log.Warningf("Unknown network protocol in %+v", a) continue } addrCopy := a.AddressWithPrefix.Address addrs = append(addrs, inet.InterfaceAddr{ Family: family, PrefixLen: uint8(a.AddressWithPrefix.PrefixLen), Addr: addrCopy.AsSlice(), // TODO(b/68878065): Other fields. }) } nicAddrs[int32(id)] = addrs } return nicAddrs } // convertAddr converts an InterfaceAddr to a ProtocolAddress. func convertAddr(addr inet.InterfaceAddr) (tcpip.ProtocolAddress, error) { var ( protocol tcpip.NetworkProtocolNumber address tcpip.Address protocolAddress tcpip.ProtocolAddress ) switch addr.Family { case linux.AF_INET: if len(addr.Addr) != header.IPv4AddressSize { return protocolAddress, linuxerr.EINVAL } if addr.PrefixLen > header.IPv4AddressSize*8 { return protocolAddress, linuxerr.EINVAL } protocol = ipv4.ProtocolNumber address = tcpip.AddrFrom4Slice(addr.Addr) case linux.AF_INET6: if len(addr.Addr) != header.IPv6AddressSize { return protocolAddress, linuxerr.EINVAL } if addr.PrefixLen > header.IPv6AddressSize*8 { return protocolAddress, linuxerr.EINVAL } protocol = ipv6.ProtocolNumber address = tcpip.AddrFrom16Slice(addr.Addr) default: return protocolAddress, linuxerr.ENOTSUP } protocolAddress = tcpip.ProtocolAddress{ Protocol: protocol, AddressWithPrefix: tcpip.AddressWithPrefix{ Address: address, PrefixLen: int(addr.PrefixLen), }, } return protocolAddress, nil } // AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { protocolAddress, err := convertAddr(addr) if err != nil { return err } // Attach address to interface. nicID := tcpip.NICID(idx) if err := s.Stack.AddProtocolAddress(nicID, protocolAddress, stack.AddressProperties{}); err != nil { return syserr.TranslateNetstackError(err).ToError() } // Add route for local network if it doesn't exist already. localRoute := tcpip.Route{ Destination: protocolAddress.AddressWithPrefix.Subnet(), Gateway: tcpip.Address{}, // No gateway for local network. NIC: nicID, } for _, rt := range s.Stack.GetRouteTable() { if rt.Equal(localRoute) { return nil } } // Local route does not exist yet. Add it. s.Stack.AddRoute(localRoute) return nil } // RemoveInterfaceAddr implements inet.Stack.RemoveInterfaceAddr. func (s *Stack) RemoveInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { protocolAddress, err := convertAddr(addr) if err != nil { return err } // Remove addresses matching the address and prefix. nicID := tcpip.NICID(idx) if err := s.Stack.RemoveAddress(nicID, protocolAddress.AddressWithPrefix.Address); err != nil { return syserr.TranslateNetstackError(err).ToError() } // Remove the corresponding local network route if it exists. localRoute := tcpip.Route{ Destination: protocolAddress.AddressWithPrefix.Subnet(), Gateway: tcpip.Address{}, // No gateway for local network. NIC: nicID, } s.Stack.RemoveRoutes(func(rt tcpip.Route) bool { return rt.Equal(localRoute) }) return nil } // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. 
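// It reports the stack-wide minimum, default, and maximum TCP receive buffer
// sizes (the tcp_rmem-style triple) from the TCP protocol options.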
func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { var rs tcpip.TCPReceiveBufferSizeRangeOption err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &rs) return inet.TCPBufferSize{ Min: rs.Min, Default: rs.Default, Max: rs.Max, }, syserr.TranslateNetstackError(err).ToError() } // SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { rs := tcpip.TCPReceiveBufferSizeRangeOption{ Min: size.Min, Default: size.Default, Max: size.Max, } return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &rs)).ToError() } // TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { var ss tcpip.TCPSendBufferSizeRangeOption err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &ss) return inet.TCPBufferSize{ Min: ss.Min, Default: ss.Default, Max: ss.Max, }, syserr.TranslateNetstackError(err).ToError() } // SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { ss := tcpip.TCPSendBufferSizeRangeOption{ Min: size.Min, Default: size.Default, Max: size.Max, } return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &ss)).ToError() } // TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. func (s *Stack) TCPSACKEnabled() (bool, error) { var sack tcpip.TCPSACKEnabled err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &sack) return bool(sack), syserr.TranslateNetstackError(err).ToError() } // SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. func (s *Stack) SetTCPSACKEnabled(enabled bool) error { opt := tcpip.TCPSACKEnabled(enabled) return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError() } // TCPRecovery implements inet.Stack.TCPRecovery. func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { var recovery tcpip.TCPRecovery if err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &recovery); err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } return inet.TCPLossRecovery(recovery), nil } // SetTCPRecovery implements inet.Stack.SetTCPRecovery. func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error { opt := tcpip.TCPRecovery(recovery) return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError() } // Statistics implements inet.Stack.Statistics. func (s *Stack) Statistics(stat any, arg string) error { switch stats := stat.(type) { case *inet.StatDev: for _, ni := range s.Stack.NICInfo() { if ni.Name != arg { continue } // TODO(gvisor.dev/issue/2103) Support stubbed stats. *stats = inet.StatDev{ // Receive section. ni.Stats.Rx.Bytes.Value(), // bytes. ni.Stats.Rx.Packets.Value(), // packets. 0, // errs. 0, // drop. 0, // fifo. 0, // frame. 0, // compressed. 0, // multicast. // Transmit section. ni.Stats.Tx.Bytes.Value(), // bytes. ni.Stats.Tx.Packets.Value(), // packets. 0, // errs. 0, // drop. 0, // fifo. 0, // colls. 0, // carrier. 0, // compressed. } break } case *inet.StatSNMPIP: ip := Metrics.IP // TODO(gvisor.dev/issue/969) Support stubbed stats. *stats = inet.StatSNMPIP{ 0, // Ip/Forwarding. 0, // Ip/DefaultTTL. ip.PacketsReceived.Value(), // InReceives. 0, // Ip/InHdrErrors. ip.InvalidDestinationAddressesReceived.Value(), // InAddrErrors. 0, // Ip/ForwDatagrams. 0, // Ip/InUnknownProtos. 0, // Ip/InDiscards. 
ip.PacketsDelivered.Value(), // InDelivers. ip.PacketsSent.Value(), // OutRequests. ip.OutgoingPacketErrors.Value(), // OutDiscards. 0, // Ip/OutNoRoutes. 0, // Support Ip/ReasmTimeout. 0, // Support Ip/ReasmReqds. 0, // Support Ip/ReasmOKs. 0, // Support Ip/ReasmFails. 0, // Support Ip/FragOKs. 0, // Support Ip/FragFails. 0, // Support Ip/FragCreates. } case *inet.StatSNMPICMP: in := Metrics.ICMP.V4.PacketsReceived.ICMPv4PacketStats out := Metrics.ICMP.V4.PacketsSent.ICMPv4PacketStats // TODO(gvisor.dev/issue/969) Support stubbed stats. *stats = inet.StatSNMPICMP{ 0, // Icmp/InMsgs. Metrics.ICMP.V4.PacketsSent.Dropped.Value(), // InErrors. 0, // Icmp/InCsumErrors. in.DstUnreachable.Value(), // InDestUnreachs. in.TimeExceeded.Value(), // InTimeExcds. in.ParamProblem.Value(), // InParmProbs. in.SrcQuench.Value(), // InSrcQuenchs. in.Redirect.Value(), // InRedirects. in.EchoRequest.Value(), // InEchos. in.EchoReply.Value(), // InEchoReps. in.Timestamp.Value(), // InTimestamps. in.TimestampReply.Value(), // InTimestampReps. in.InfoRequest.Value(), // InAddrMasks. in.InfoReply.Value(), // InAddrMaskReps. 0, // Icmp/OutMsgs. Metrics.ICMP.V4.PacketsReceived.Invalid.Value(), // OutErrors. out.DstUnreachable.Value(), // OutDestUnreachs. out.TimeExceeded.Value(), // OutTimeExcds. out.ParamProblem.Value(), // OutParmProbs. out.SrcQuench.Value(), // OutSrcQuenchs. out.Redirect.Value(), // OutRedirects. out.EchoRequest.Value(), // OutEchos. out.EchoReply.Value(), // OutEchoReps. out.Timestamp.Value(), // OutTimestamps. out.TimestampReply.Value(), // OutTimestampReps. out.InfoRequest.Value(), // OutAddrMasks. out.InfoReply.Value(), // OutAddrMaskReps. } case *inet.StatSNMPTCP: tcp := Metrics.TCP // RFC 2012 (updates 1213): SNMPv2-MIB-TCP. *stats = inet.StatSNMPTCP{ 1, // RtoAlgorithm. 200, // RtoMin. 120000, // RtoMax. (1<<64 - 1), // MaxConn. tcp.ActiveConnectionOpenings.Value(), // ActiveOpens. tcp.PassiveConnectionOpenings.Value(), // PassiveOpens. tcp.FailedConnectionAttempts.Value(), // AttemptFails. tcp.EstablishedResets.Value(), // EstabResets. tcp.CurrentEstablished.Value(), // CurrEstab. tcp.ValidSegmentsReceived.Value(), // InSegs. tcp.SegmentsSent.Value(), // OutSegs. tcp.Retransmits.Value(), // RetransSegs. tcp.InvalidSegmentsReceived.Value(), // InErrs. tcp.ResetsSent.Value(), // OutRsts. tcp.ChecksumErrors.Value(), // InCsumErrors. } case *inet.StatSNMPUDP: udp := Metrics.UDP // TODO(gvisor.dev/issue/969) Support stubbed stats. *stats = inet.StatSNMPUDP{ udp.PacketsReceived.Value(), // InDatagrams. udp.UnknownPortErrors.Value(), // NoPorts. 0, // Udp/InErrors. udp.PacketsSent.Value(), // OutDatagrams. udp.ReceiveBufferErrors.Value(), // RcvbufErrors. 0, // Udp/SndbufErrors. udp.ChecksumErrors.Value(), // Udp/InCsumErrors. 0, // Udp/IgnoredMulti. } default: return syserr.ErrEndpointOperation.ToError() } return nil } // RouteTable implements inet.Stack.RouteTable. func (s *Stack) RouteTable() []inet.Route { var routeTable []inet.Route for _, rt := range s.Stack.GetRouteTable() { var family uint8 switch rt.Destination.ID().BitLen() { case header.IPv4AddressSizeBits: family = linux.AF_INET case header.IPv6AddressSizeBits: family = linux.AF_INET6 default: log.Warningf("Unknown network protocol in route %+v", rt) continue } dstAddr := rt.Destination.ID() routeTable = append(routeTable, inet.Route{ Family: family, DstLen: uint8(rt.Destination.Prefix()), // The CIDR prefix for the destination. // Always return unspecified protocol since we have no notion of // protocol for routes. 
Protocol: linux.RTPROT_UNSPEC, // Set statically to LINK scope for now. // // TODO(gvisor.dev/issue/595): Set scope for routes. Scope: linux.RT_SCOPE_LINK, Type: linux.RTN_UNICAST, DstAddr: dstAddr.AsSlice(), OutputInterface: int32(rt.NIC), GatewayAddr: rt.Gateway.AsSlice(), }) } return routeTable } // NewRoute implements inet.Stack.NewRoute. func (s *Stack) NewRoute(ctx context.Context, msg *nlmsg.Message) *syserr.Error { var routeMsg linux.RouteMessage attrs, ok := msg.GetData(&routeMsg) if !ok { return syserr.ErrInvalidArgument } route := inet.Route{ Family: routeMsg.Family, DstLen: routeMsg.DstLen, SrcLen: routeMsg.SrcLen, TOS: routeMsg.TOS, Table: routeMsg.Table, Protocol: routeMsg.Protocol, Scope: routeMsg.Scope, Type: routeMsg.Type, Flags: routeMsg.Flags, } for !attrs.Empty() { ahdr, value, rest, ok := attrs.ParseFirst() if !ok { return syserr.ErrInvalidArgument } attrs = rest switch ahdr.Type { case linux.RTA_DST: if len(value) < 1 { return syserr.ErrInvalidArgument } route.DstAddr = value case linux.RTA_SRC: if len(value) < 1 { return syserr.ErrInvalidArgument } route.SrcAddr = value case linux.RTA_OIF: oif := nlmsg.BytesView(value) outputInterface, ok := oif.Int32() if !ok { return syserr.ErrInvalidArgument } if _, exist := s.Interfaces()[outputInterface]; !exist { return syserr.ErrNoDevice } route.OutputInterface = outputInterface case linux.RTA_GATEWAY: if len(value) < 1 { return syserr.ErrInvalidArgument } route.GatewayAddr = value case linux.RTA_PRIORITY: default: ctx.Warningf("Unknown attribute: %v", ahdr.Type) return syserr.ErrNotSupported } } var dest tcpip.Subnet // When no destination address is provided, the new route might be the default route. if route.DstAddr == nil { if route.GatewayAddr == nil { return syserr.ErrInvalidArgument } switch len(route.GatewayAddr) { case header.IPv4AddressSize: subnet, err := tcpip.NewSubnet(tcpip.AddrFromSlice(tcpip.IPv4Zero), tcpip.MaskFromBytes(tcpip.IPv4Zero)) if err != nil { return syserr.ErrInvalidArgument } dest = subnet case header.IPv6AddressSize: subnet, err := tcpip.NewSubnet(tcpip.AddrFromSlice(tcpip.IPv6Zero), tcpip.MaskFromBytes(tcpip.IPv6Zero)) if err != nil { return syserr.ErrInvalidArgument } dest = subnet default: return syserr.ErrInvalidArgument } } else { dest = tcpip.AddressWithPrefix{ Address: tcpip.AddrFromSlice(route.DstAddr), PrefixLen: int(route.DstLen)}.Subnet() } localRoute := tcpip.Route{ Destination: dest, Gateway: tcpip.AddrFromSlice(route.GatewayAddr), NIC: tcpip.NICID(route.OutputInterface), } if len(route.SrcAddr) != 0 { localRoute.SourceHint = tcpip.AddrFromSlice(route.SrcAddr) } found := false for _, rt := range s.Stack.GetRouteTable() { if localRoute.Equal(rt) { found = true break } } flags := msg.Header().Flags switch { case !found && flags&linux.NLM_F_CREATE == linux.NLM_F_CREATE: s.Stack.AddRoute(localRoute) case found && flags&linux.NLM_F_REPLACE != linux.NLM_F_REPLACE: return syserr.ErrExists } if flags&linux.NLM_F_REPLACE == linux.NLM_F_REPLACE { s.Stack.ReplaceRoute(localRoute) } return nil } // IPTables returns the stack's iptables. func (s *Stack) IPTables() (*stack.IPTables, error) { return s.Stack.IPTables(), nil } // Pause implements inet.Stack.Pause. func (s *Stack) Pause() { s.Stack.Pause() } // Restore implements inet.Stack.Restore. func (s *Stack) Restore() { s.Stack.Restore() } // Resume implements inet.Stack.Resume. func (s *Stack) Resume() { s.Stack.Resume() } // RegisteredEndpoints implements inet.Stack.RegisteredEndpoints. 
func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint { return s.Stack.RegisteredEndpoints() } // CleanupEndpoints implements inet.Stack.CleanupEndpoints. func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return s.Stack.CleanupEndpoints() } // RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. func (s *Stack) RestoreCleanupEndpoints(es []stack.TransportEndpoint) { s.Stack.RestoreCleanupEndpoints(es) } // SetForwarding implements inet.Stack.SetForwarding. func (s *Stack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error { if err := s.Stack.SetForwardingDefaultAndAllNICs(protocol, enable); err != nil { return fmt.Errorf("SetForwardingDefaultAndAllNICs(%d, %t): %s", protocol, enable, err) } return nil } // PortRange implements inet.Stack.PortRange. func (s *Stack) PortRange() (uint16, uint16) { return s.Stack.PortRange() } // SetPortRange implements inet.Stack.SetPortRange. func (s *Stack) SetPortRange(start uint16, end uint16) error { return syserr.TranslateNetstackError(s.Stack.SetPortRange(start, end)).ToError() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/netstack/tun.go000066400000000000000000000030711465435605700252120ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netstack import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/tcpip/link/tun" ) // TUNFlagsToLinux converts a tun.Flags to Linux TUN flags. func TUNFlagsToLinux(flags tun.Flags) uint16 { ret := uint16(linux.IFF_NOFILTER) if flags.TAP { ret |= linux.IFF_TAP } if flags.TUN { ret |= linux.IFF_TUN } if flags.NoPacketInfo { ret |= linux.IFF_NO_PI } return ret } // LinuxToTUNFlags converts Linux TUN flags to a tun.Flags. func LinuxToTUNFlags(flags uint16) (tun.Flags, error) { // Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) // when there is no sk_filter. See __tun_chr_ioctl() in // net/drivers/tun.c. if flags&^uint16(linux.IFF_TUN|linux.IFF_TAP|linux.IFF_NO_PI|linux.IFF_ONE_QUEUE) != 0 { return tun.Flags{}, linuxerr.EINVAL } return tun.Flags{ TUN: flags&linux.IFF_TUN != 0, TAP: flags&linux.IFF_TAP != 0, NoPacketInfo: flags&linux.IFF_NO_PI != 0, }, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/socket.go000066400000000000000000000477131465435605700240730ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// Package socket provides the interfaces that need to be provided by socket // implementations and providers, as well as per family demultiplexing of socket // creation. package socket import ( "bytes" "fmt" "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/usermem" ) // ControlMessages represents the union of unix control messages and tcpip // control messages. type ControlMessages struct { Unix transport.ControlMessages IP IPControlMessages } // packetInfoToLinux converts IPPacketInfo from tcpip format to Linux format. func packetInfoToLinux(packetInfo tcpip.IPPacketInfo) linux.ControlMessageIPPacketInfo { var p linux.ControlMessageIPPacketInfo p.NIC = int32(packetInfo.NIC) copy(p.LocalAddr[:], packetInfo.LocalAddr.AsSlice()) copy(p.DestinationAddr[:], packetInfo.DestinationAddr.AsSlice()) return p } // ipv6PacketInfoToLinux converts IPv6PacketInfo from tcpip format to Linux // format. func ipv6PacketInfoToLinux(packetInfo tcpip.IPv6PacketInfo) linux.ControlMessageIPv6PacketInfo { var p linux.ControlMessageIPv6PacketInfo if n := copy(p.Addr[:], packetInfo.Addr.AsSlice()); n != len(p.Addr) { panic(fmt.Sprintf("got copy(%x, %x) = %d, want = %d", p.Addr, packetInfo.Addr, n, len(p.Addr))) } p.NIC = uint32(packetInfo.NIC) return p } // errOriginToLinux maps tcpip socket origin to Linux socket origin constants. func errOriginToLinux(origin tcpip.SockErrOrigin) uint8 { switch origin { case tcpip.SockExtErrorOriginNone: return linux.SO_EE_ORIGIN_NONE case tcpip.SockExtErrorOriginLocal: return linux.SO_EE_ORIGIN_LOCAL case tcpip.SockExtErrorOriginICMP: return linux.SO_EE_ORIGIN_ICMP case tcpip.SockExtErrorOriginICMP6: return linux.SO_EE_ORIGIN_ICMP6 default: panic(fmt.Sprintf("unknown socket origin: %d", origin)) } } // sockErrCmsgToLinux converts SockError control message from tcpip format to // Linux format. func sockErrCmsgToLinux(sockErr *tcpip.SockError) linux.SockErrCMsg { if sockErr == nil { return nil } ee := linux.SockExtendedErr{ Errno: uint32(syserr.TranslateNetstackError(sockErr.Err).ToLinux()), Origin: errOriginToLinux(sockErr.Cause.Origin()), Type: sockErr.Cause.Type(), Code: sockErr.Cause.Code(), Info: sockErr.Cause.Info(), } switch sockErr.NetProto { case header.IPv4ProtocolNumber: errMsg := &linux.SockErrCMsgIPv4{SockExtendedErr: ee} if len(sockErr.Offender.Addr.AsSlice()) > 0 { addr, _ := ConvertAddress(linux.AF_INET, sockErr.Offender) errMsg.Offender = *addr.(*linux.SockAddrInet) } return errMsg case header.IPv6ProtocolNumber: errMsg := &linux.SockErrCMsgIPv6{SockExtendedErr: ee} if len(sockErr.Offender.Addr.AsSlice()) > 0 { addr, _ := ConvertAddress(linux.AF_INET6, sockErr.Offender) errMsg.Offender = *addr.(*linux.SockAddrInet6) } return errMsg default: panic(fmt.Sprintf("invalid net proto for creating SockErrCMsg: %d", sockErr.NetProto)) } } // NewIPControlMessages converts the tcpip.ReceivableControlMessages (which does // not have Linux specific format) to Linux format. 
func NewIPControlMessages(family int, cmgs tcpip.ReceivableControlMessages) IPControlMessages { var orgDstAddr linux.SockAddr if cmgs.HasOriginalDstAddress { orgDstAddr, _ = ConvertAddress(family, cmgs.OriginalDstAddress) } cm := IPControlMessages{ HasTimestamp: cmgs.HasTimestamp, Timestamp: cmgs.Timestamp, HasInq: cmgs.HasInq, Inq: cmgs.Inq, HasTOS: cmgs.HasTOS, TOS: cmgs.TOS, HasTTL: cmgs.HasTTL, TTL: uint32(cmgs.TTL), HasHopLimit: cmgs.HasHopLimit, HopLimit: uint32(cmgs.HopLimit), HasTClass: cmgs.HasTClass, TClass: cmgs.TClass, HasIPPacketInfo: cmgs.HasIPPacketInfo, PacketInfo: packetInfoToLinux(cmgs.PacketInfo), HasIPv6PacketInfo: cmgs.HasIPv6PacketInfo, OriginalDstAddress: orgDstAddr, SockErr: sockErrCmsgToLinux(cmgs.SockErr), } if cm.HasIPv6PacketInfo { cm.IPv6PacketInfo = ipv6PacketInfoToLinux(cmgs.IPv6PacketInfo) } return cm } // IPControlMessages contains socket control messages for IP sockets. // This can contain Linux specific structures unlike tcpip.ControlMessages. // // +stateify savable type IPControlMessages struct { // HasTimestamp indicates whether Timestamp is valid/set. HasTimestamp bool // Timestamp is the time that the last packet used to create the read data // was received. Timestamp time.Time `state:".(int64)"` // HasInq indicates whether Inq is valid/set. HasInq bool // Inq is the number of bytes ready to be received. Inq int32 // HasTOS indicates whether Tos is valid/set. HasTOS bool // TOS is the IPv4 type of service of the associated packet. TOS uint8 // HasTTL indicates whether TTL is valid/set. HasTTL bool // TTL is the IPv4 Time To Live of the associated packet. TTL uint32 // HasHopLimit indicates whether HopLimit is valid/set. HasHopLimit bool // HopLimit is the IPv6 Hop Limit of the associated packet. HopLimit uint32 // HasTClass indicates whether TClass is valid/set. HasTClass bool // TClass is the IPv6 traffic class of the associated packet. TClass uint32 // HasIPPacketInfo indicates whether PacketInfo is set. HasIPPacketInfo bool // PacketInfo holds interface and address data on an incoming packet. PacketInfo linux.ControlMessageIPPacketInfo // HasIPv6PacketInfo indicates whether IPv6PacketInfo is set. HasIPv6PacketInfo bool // PacketInfo holds interface and address data on an incoming packet. IPv6PacketInfo linux.ControlMessageIPv6PacketInfo // OriginalDestinationAddress holds the original destination address // and port of the incoming packet. OriginalDstAddress linux.SockAddr // SockErr is the dequeued socket error on recvmsg(MSG_ERRQUEUE). SockErr linux.SockErrCMsg } // Release releases Unix domain socket credentials and rights. func (c *ControlMessages) Release(ctx context.Context) { c.Unix.Release(ctx) } // Socket is an interface containing socket syscalls used by the syscall // layer to redirect them to the appropriate implementation. type Socket interface { vfs.FileDescriptionImpl // Connect implements the connect(2) linux unix. Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error // Accept implements the accept4(2) linux unix. // Returns fd, real peer address length and error. Real peer address // length is only set if len(peer) > 0. Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) // Bind implements the bind(2) linux unix. Bind(t *kernel.Task, sockaddr []byte) *syserr.Error // Listen implements the listen(2) linux unix. Listen(t *kernel.Task, backlog int) *syserr.Error // Shutdown implements the shutdown(2) linux unix. 
Shutdown(t *kernel.Task, how int) *syserr.Error // GetSockOpt implements the getsockopt(2) linux unix. GetSockOpt(t *kernel.Task, level int, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) // SetSockOpt implements the setsockopt(2) linux unix. SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error // GetSockName implements the getsockname(2) linux unix. // // addrLen is the address length to be returned to the application, not // necessarily the actual length of the address. GetSockName(t *kernel.Task) (addr linux.SockAddr, addrLen uint32, err *syserr.Error) // GetPeerName implements the getpeername(2) linux unix. // // addrLen is the address length to be returned to the application, not // necessarily the actual length of the address. GetPeerName(t *kernel.Task) (addr linux.SockAddr, addrLen uint32, err *syserr.Error) // RecvMsg implements the recvmsg(2) linux unix. // // senderAddrLen is the address length to be returned to the application, // not necessarily the actual length of the address. // // flags control how RecvMsg should be completed. msgFlags indicate how // the RecvMsg call was completed. Note that control message truncation // may still be required even if the MSG_CTRUNC bit is not set in // msgFlags. In that case, the caller should set MSG_CTRUNC appropriately. // // If err != nil, the recv was not successful. RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error) // SendMsg implements the sendmsg(2) linux unix. SendMsg does not take // ownership of the ControlMessage on error. // // If n > 0, err will either be nil or an error from t.Block. SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages ControlMessages) (n int, err *syserr.Error) // SetRecvTimeout sets the timeout (in ns) for recv operations. Zero means // no timeout, and negative means DONTWAIT. SetRecvTimeout(nanoseconds int64) // RecvTimeout gets the current timeout (in ns) for recv operations. Zero // means no timeout, and negative means DONTWAIT. RecvTimeout() int64 // SetSendTimeout sets the timeout (in ns) for send operations. Zero means // no timeout, and negative means DONTWAIT. SetSendTimeout(nanoseconds int64) // SendTimeout gets the current timeout (in ns) for send operations. Zero // means no timeout, and negative means DONTWAIT. SendTimeout() int64 // State returns the current state of the socket, as represented by Linux in // procfs. The returned state value is protocol-specific. State() uint32 // Type returns the family, socket type and protocol of the socket. Type() (family int, skType linux.SockType, protocol int) } // Provider is the interface implemented by providers of sockets for // specific address families (e.g., AF_INET). type Provider interface { // Socket creates a new socket. // // If a nil Socket _and_ a nil error is returned, it means that the // protocol is not supported. A non-nil error should only be returned // if the protocol is supported, but an error occurs during creation. Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) // Pair creates a pair of connected sockets. // // See Socket for error information. 
Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) } // families holds a map of all known address families and their providers. var families = make(map[int][]Provider) // RegisterProvider registers the provider of a given address family so that // sockets of that type can be created via socket() and/or socketpair() // syscalls. // // This should only be called during the initialization of the address family. func RegisterProvider(family int, provider Provider) { families[family] = append(families[family], provider) } // New creates a new socket with the given family, type and protocol. func New(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { for _, p := range families[family] { s, err := p.Socket(t, stype, protocol) if err != nil { return nil, err } if s != nil { t.Kernel().RecordSocket(s) return s, nil } } return nil, syserr.ErrAddressFamilyNotSupported } // Pair creates a new connected socket pair with the given family, type and // protocol. func Pair(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { providers, ok := families[family] if !ok { return nil, nil, syserr.ErrAddressFamilyNotSupported } for _, p := range providers { s1, s2, err := p.Pair(t, stype, protocol) if err != nil { return nil, nil, err } if s1 != nil && s2 != nil { k := t.Kernel() k.RecordSocket(s1) k.RecordSocket(s2) return s1, s2, nil } } return nil, nil, syserr.ErrSocketNotSupported } // SendReceiveTimeout stores timeouts for send and receive calls. // // It is meant to be embedded into Socket implementations to help satisfy the // interface. // // Care must be taken when copying SendReceiveTimeout as it contains atomic // variables. // // +stateify savable type SendReceiveTimeout struct { // send is length of the send timeout in nanoseconds. // // send must be accessed atomically. send atomicbitops.Int64 // recv is length of the receive timeout in nanoseconds. // // recv must be accessed atomically. recv atomicbitops.Int64 } // SetRecvTimeout implements Socket.SetRecvTimeout. func (to *SendReceiveTimeout) SetRecvTimeout(nanoseconds int64) { to.recv.Store(nanoseconds) } // RecvTimeout implements Socket.RecvTimeout. func (to *SendReceiveTimeout) RecvTimeout() int64 { return to.recv.Load() } // SetSendTimeout implements Socket.SetSendTimeout. func (to *SendReceiveTimeout) SetSendTimeout(nanoseconds int64) { to.send.Store(nanoseconds) } // SendTimeout implements Socket.SendTimeout. func (to *SendReceiveTimeout) SendTimeout() int64 { return to.send.Load() } // UnmarshalSockAddr unmarshals memory representing a struct sockaddr to one of // the ABI socket address types. // // Precondition: data must be long enough to represent a socket address of the // given family. 
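//
// A hedged usage sketch (buf is a hypothetical byte slice holding a struct
// sockaddr_in copied in from the application; the concrete type follows the
// switch below):
//
//	sa := UnmarshalSockAddr(unix.AF_INET, buf).(*linux.SockAddrInet)
//	port := Ntohs(sa.Port) // convert the port from network to host byte order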
func UnmarshalSockAddr(family int, data []byte) linux.SockAddr { switch family { case unix.AF_INET: var addr linux.SockAddrInet addr.UnmarshalUnsafe(data) return &addr case unix.AF_INET6: var addr linux.SockAddrInet6 addr.UnmarshalUnsafe(data) return &addr case unix.AF_UNIX: var addr linux.SockAddrUnix addr.UnmarshalUnsafe(data) return &addr case unix.AF_NETLINK: var addr linux.SockAddrNetlink addr.UnmarshalUnsafe(data) return &addr case unix.AF_PACKET: var addr linux.SockAddrLink addr.UnmarshalUnsafe(data) return &addr default: panic(fmt.Sprintf("Unsupported socket family %v", family)) } } var sockAddrLinkSize = (&linux.SockAddrLink{}).SizeBytes() var sockAddrInetSize = (&linux.SockAddrInet{}).SizeBytes() var sockAddrInet6Size = (&linux.SockAddrInet6{}).SizeBytes() // Ntohs converts a 16-bit number from network byte order to host byte order. It // assumes that the host is little endian. func Ntohs(v uint16) uint16 { return v<<8 | v>>8 } // Htons converts a 16-bit number from host byte order to network byte order. It // assumes that the host is little endian. func Htons(v uint16) uint16 { return Ntohs(v) } // isLinkLocal determines if the given IPv6 address is link-local. This is the // case when it has the fe80::/10 prefix. This check is used to determine when // the NICID is relevant for a given IPv6 address. func isLinkLocal(addr tcpip.Address) bool { addrBytes := addr.AsSlice() return len(addrBytes) >= 2 && addrBytes[0] == 0xfe && addrBytes[1]&0xc0 == 0x80 } // ConvertAddress converts the given address to a native format. func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) { switch family { case linux.AF_INET: var out linux.SockAddrInet copy(out.Addr[:], addr.Addr.AsSlice()) out.Family = linux.AF_INET out.Port = Htons(addr.Port) return &out, uint32(sockAddrInetSize) case linux.AF_INET6: var out linux.SockAddrInet6 addrBytes := addr.Addr.AsSlice() if len(addrBytes) == header.IPv4AddressSize { // Copy address in v4-mapped format. copy(out.Addr[12:], addrBytes) out.Addr[10] = 0xff out.Addr[11] = 0xff } else { copy(out.Addr[:], addrBytes) } out.Family = linux.AF_INET6 out.Port = Htons(addr.Port) if isLinkLocal(addr.Addr) { out.Scope_id = uint32(addr.NIC) } return &out, uint32(sockAddrInet6Size) case linux.AF_PACKET: var out linux.SockAddrLink out.Family = linux.AF_PACKET out.InterfaceIndex = int32(addr.NIC) out.HardwareAddrLen = header.EthernetAddressSize copy(out.HardwareAddr[:], addr.LinkAddr) return &out, uint32(sockAddrLinkSize) default: return nil, 0 } } // BytesToIPAddress converts an IPv4 or IPv6 address from the user to the // netstack representation taking any addresses into account. func BytesToIPAddress(addr []byte) tcpip.Address { if bytes.Equal(addr, make([]byte, 4)) || bytes.Equal(addr, make([]byte, 16)) { return tcpip.Address{} } return tcpip.AddrFromSlice(addr) } // AddressAndFamily reads an sockaddr struct from the given address and // converts it to the FullAddress format. It supports AF_UNIX, AF_INET, // AF_INET6, and AF_PACKET addresses. // // AddressAndFamily returns an address and its family. func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { // Make sure we have at least 2 bytes for the address family. if len(addr) < 2 { return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument } // Get the rest of the fields based on the address family. 
switch family := hostarch.ByteOrder.Uint16(addr); family { case linux.AF_INET: var a linux.SockAddrInet if len(addr) < sockAddrInetSize { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } a.UnmarshalUnsafe(addr) out := tcpip.FullAddress{ Addr: BytesToIPAddress(a.Addr[:]), Port: Ntohs(a.Port), } return out, family, nil case linux.AF_INET6: var a linux.SockAddrInet6 if len(addr) < sockAddrInet6Size { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } a.UnmarshalUnsafe(addr) out := tcpip.FullAddress{ Addr: BytesToIPAddress(a.Addr[:]), Port: Ntohs(a.Port), } if isLinkLocal(out.Addr) { out.NIC = tcpip.NICID(a.Scope_id) } return out, family, nil case linux.AF_PACKET: var a linux.SockAddrLink if len(addr) < sockAddrLinkSize { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } a.UnmarshalUnsafe(addr) // TODO(https://gvisor.dev/issue/6530): Do not assume all interfaces have // an ethernet address. if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } return tcpip.FullAddress{ NIC: tcpip.NICID(a.InterfaceIndex), LinkAddr: tcpip.LinkAddress(a.HardwareAddr[:a.HardwareAddrLen]), Port: Ntohs(a.Protocol), }, family, nil case linux.AF_UNSPEC: return tcpip.FullAddress{}, family, nil default: return tcpip.FullAddress{}, 0, syserr.ErrAddressFamilyNotSupported } } // IsTCP returns true if the socket is a TCP socket. func IsTCP(s Socket) bool { fam, typ, proto := s.Type() if fam != linux.AF_INET && fam != linux.AF_INET6 { return false } return typ == linux.SOCK_STREAM && (proto == 0 || proto == linux.IPPROTO_TCP) } // IsUDP returns true if the socket is a UDP socket. func IsUDP(s Socket) bool { fam, typ, proto := s.Type() if fam != linux.AF_INET && fam != linux.AF_INET6 { return false } return typ == linux.SOCK_DGRAM && (proto == 0 || proto == linux.IPPROTO_UDP) } // IsICMP returns true if the socket is an ICMP socket. func IsICMP(s Socket) bool { fam, typ, proto := s.Type() if fam != linux.AF_INET && fam != linux.AF_INET6 { return false } return typ == linux.SOCK_DGRAM && (proto == linux.IPPROTO_ICMP || proto == linux.IPPROTO_ICMPV6) } // IsRaw returns true if the socket is a raw socket. func IsRaw(s Socket) bool { fam, typ, _ := s.Type() if fam != linux.AF_INET && fam != linux.AF_INET6 { return false } return typ == linux.SOCK_RAW } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/socket_state.go000066400000000000000000000015111465435605700252550ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package socket import ( "context" "time" ) func (i *IPControlMessages) saveTimestamp() int64 { return i.Timestamp.UnixNano() } func (i *IPControlMessages) loadTimestamp(_ context.Context, nsec int64) { i.Timestamp = time.Unix(0, nsec) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/socket_state_autogen.go000066400000000000000000000062731465435605700270110ustar00rootroot00000000000000// automatically generated by stateify. package socket import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (i *IPControlMessages) StateTypeName() string { return "pkg/sentry/socket.IPControlMessages" } func (i *IPControlMessages) StateFields() []string { return []string{ "HasTimestamp", "Timestamp", "HasInq", "Inq", "HasTOS", "TOS", "HasTTL", "TTL", "HasHopLimit", "HopLimit", "HasTClass", "TClass", "HasIPPacketInfo", "PacketInfo", "HasIPv6PacketInfo", "IPv6PacketInfo", "OriginalDstAddress", "SockErr", } } func (i *IPControlMessages) beforeSave() {} // +checklocksignore func (i *IPControlMessages) StateSave(stateSinkObject state.Sink) { i.beforeSave() var TimestampValue int64 TimestampValue = i.saveTimestamp() stateSinkObject.SaveValue(1, TimestampValue) stateSinkObject.Save(0, &i.HasTimestamp) stateSinkObject.Save(2, &i.HasInq) stateSinkObject.Save(3, &i.Inq) stateSinkObject.Save(4, &i.HasTOS) stateSinkObject.Save(5, &i.TOS) stateSinkObject.Save(6, &i.HasTTL) stateSinkObject.Save(7, &i.TTL) stateSinkObject.Save(8, &i.HasHopLimit) stateSinkObject.Save(9, &i.HopLimit) stateSinkObject.Save(10, &i.HasTClass) stateSinkObject.Save(11, &i.TClass) stateSinkObject.Save(12, &i.HasIPPacketInfo) stateSinkObject.Save(13, &i.PacketInfo) stateSinkObject.Save(14, &i.HasIPv6PacketInfo) stateSinkObject.Save(15, &i.IPv6PacketInfo) stateSinkObject.Save(16, &i.OriginalDstAddress) stateSinkObject.Save(17, &i.SockErr) } func (i *IPControlMessages) afterLoad(context.Context) {} // +checklocksignore func (i *IPControlMessages) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.HasTimestamp) stateSourceObject.Load(2, &i.HasInq) stateSourceObject.Load(3, &i.Inq) stateSourceObject.Load(4, &i.HasTOS) stateSourceObject.Load(5, &i.TOS) stateSourceObject.Load(6, &i.HasTTL) stateSourceObject.Load(7, &i.TTL) stateSourceObject.Load(8, &i.HasHopLimit) stateSourceObject.Load(9, &i.HopLimit) stateSourceObject.Load(10, &i.HasTClass) stateSourceObject.Load(11, &i.TClass) stateSourceObject.Load(12, &i.HasIPPacketInfo) stateSourceObject.Load(13, &i.PacketInfo) stateSourceObject.Load(14, &i.HasIPv6PacketInfo) stateSourceObject.Load(15, &i.IPv6PacketInfo) stateSourceObject.Load(16, &i.OriginalDstAddress) stateSourceObject.Load(17, &i.SockErr) stateSourceObject.LoadValue(1, new(int64), func(y any) { i.loadTimestamp(ctx, y.(int64)) }) } func (to *SendReceiveTimeout) StateTypeName() string { return "pkg/sentry/socket.SendReceiveTimeout" } func (to *SendReceiveTimeout) StateFields() []string { return []string{ "send", "recv", } } func (to *SendReceiveTimeout) beforeSave() {} // +checklocksignore func (to *SendReceiveTimeout) StateSave(stateSinkObject state.Sink) { to.beforeSave() stateSinkObject.Save(0, &to.send) stateSinkObject.Save(1, &to.recv) } func (to *SendReceiveTimeout) afterLoad(context.Context) {} // +checklocksignore func (to *SendReceiveTimeout) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &to.send) stateSourceObject.Load(1, &to.recv) } func init() { state.Register((*IPControlMessages)(nil)) state.Register((*SendReceiveTimeout)(nil)) } 
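// The following is an illustrative, standalone sketch (not part of the gVisor
// sources) of the round-trip performed by saveTimestamp/loadTimestamp above:
// the control-message timestamp is persisted as int64 nanoseconds and rebuilt
// with time.Unix(0, nsec) on load.
package main

import (
	"fmt"
	"time"
)

func main() {
	orig := time.Now()
	// Save: encode the timestamp as nanoseconds since the Unix epoch.
	nsec := orig.UnixNano()
	// Load: rebuild the time.Time from the saved value.
	restored := time.Unix(0, nsec)
	fmt.Println(restored.UnixNano() == nsec) // true: the instant round-trips exactly
}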
golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/000077500000000000000000000000001465435605700232235ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/io.go000066400000000000000000000101621465435605700241610ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package unix import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) // EndpointWriter implements safemem.Writer that writes to a transport.Endpoint. // // EndpointWriter is not thread-safe. type EndpointWriter struct { Ctx context.Context // Endpoint is the transport.Endpoint to write to. Endpoint transport.Endpoint // Control is the control messages to send. Control transport.ControlMessages // To is the endpoint to send to. May be nil. To transport.BoundEndpoint // Notify is the receiver.SendNotify notification callback that is set // by WriteFromBlocks and should be called without mm.activeMu held // (i.e. after CopyOut completes). Notify func() } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. func (w *EndpointWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { return safemem.FromVecWriterFunc{func(bufs [][]byte) (int64, error) { n, notify, err := w.Endpoint.SendMsg(w.Ctx, bufs, w.Control, w.To) w.Notify = notify if err != nil { return int64(n), err.ToError() } return int64(n), nil }}.WriteFromBlocks(srcs) } // EndpointReader implements safemem.Reader that reads from a // transport.Endpoint. // // EndpointReader is not thread-safe. type EndpointReader struct { Ctx context.Context // Endpoint is the transport.Endpoint to read from. Endpoint transport.Endpoint // Creds indicates if credential control messages are requested. Creds bool // NumRights is the number of SCM_RIGHTS FDs requested. NumRights int // Peek indicates that the data should not be consumed from the // endpoint. Peek bool // MsgSize is the size of the message that was read from. For stream // sockets, it is the amount read. MsgSize int64 // From will be set with the address read from. From transport.Address // Control contains the received control messages. Control transport.ControlMessages // UnusedRights is a slice of unused RightsControlMessage that must be // Release()d before this EndpointReader is discarded. UnusedRights []transport.RightsControlMessage // ControlTrunc indicates that SCM_RIGHTS FDs were discarded based on // the value of NumRights. ControlTrunc bool // Notify is the ConnectedEndpoint.RecvNotify callback that is set by // ReadToBlocks and should be called without mm.activeMu held (i.e. // after CopyIn completes). Notify func() } // Truncate calls RecvMsg on the endpoint without writing to a destination. 
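//
// A hedged usage sketch (ctx and ep are assumed to be supplied by the caller):
//
//	r := &EndpointReader{Ctx: ctx, Endpoint: ep, Peek: true}
//	if err := r.Truncate(); err == nil {
//		// r.MsgSize now holds the size of the message; with Peek set, the
//		// data remains queued on the endpoint.
//	}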
func (r *EndpointReader) Truncate() error { args := transport.RecvArgs{ Creds: r.Creds, NumRights: r.NumRights, Peek: r.Peek, } out, notify, err := r.Endpoint.RecvMsg(r.Ctx, [][]byte{}, args) r.MsgSize = out.MsgLen r.Control = out.Control r.ControlTrunc = out.ControlTrunc r.UnusedRights = out.UnusedRights r.From = out.Source if notify != nil { notify() } if err != nil { return err.ToError() } return nil } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (r *EndpointReader) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { return safemem.FromVecReaderFunc{func(bufs [][]byte) (int64, error) { args := transport.RecvArgs{ Creds: r.Creds, NumRights: r.NumRights, Peek: r.Peek, } out, notify, err := r.Endpoint.RecvMsg(r.Ctx, bufs, args) r.MsgSize = out.MsgLen r.Control = out.Control r.ControlTrunc = out.ControlTrunc r.UnusedRights = out.UnusedRights r.From = out.Source r.Notify = notify if err != nil { return int64(out.RecvLen), err.ToError() } return int64(out.RecvLen), nil }}.ReadToBlocks(dsts) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/socket_refs.go000066400000000000000000000101071465435605700260600ustar00rootroot00000000000000package unix import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const socketenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var socketobj *Socket // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type socketRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *socketRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *socketRefs) RefType() string { return fmt.Sprintf("%T", socketobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *socketRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *socketRefs) LogRefs() bool { return socketenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. 
func (r *socketRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *socketRefs) IncRef() { v := r.refCount.Add(1) if socketenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *socketRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if socketenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *socketRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if socketenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *socketRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/000077500000000000000000000000001465435605700252575ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/connectioned.go000066400000000000000000000462211465435605700302630ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package transport import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/waiter" ) type locker interface { Lock() Unlock() NestedLock(endpointlockNameIndex) NestedUnlock(endpointlockNameIndex) } // A ConnectingEndpoint is a connectioned unix endpoint that is attempting to // establish a bidirectional connection with a BoundEndpoint. type ConnectingEndpoint interface { // ID returns the endpoint's globally unique identifier. This identifier // must be used to determine locking order if more than one endpoint is // to be locked in the same codepath. The endpoint with the smaller // identifier must be locked before endpoints with larger identifiers. 
ID() uint64 // Passcred implements socket.Credentialer.Passcred. Passcred() bool // Type returns the socket type, typically either SockStream or // SockSeqpacket. The connection attempt must be aborted if this // value doesn't match the BoundEndpoint's type. Type() linux.SockType // GetLocalAddress returns the bound path. GetLocalAddress() (Address, tcpip.Error) // Locker protects the following methods. While locked, only the holder of // the lock can change the return value of the protected methods. locker // Connected returns true iff the ConnectingEndpoint is in the connected // state. ConnectingEndpoints can only be connected to a single endpoint, // so the connection attempt must be aborted if this returns true. Connected() bool // ListeningLocked returns true iff the ConnectingEndpoint is in the // listening state. ConnectingEndpoints cannot make connections while // listening, so the connection attempt must be aborted if this returns // true. ListeningLocked() bool // WaiterQueue returns a pointer to the endpoint's waiter queue. WaiterQueue() *waiter.Queue } // connectionedEndpoint is a Unix-domain connected or connectable endpoint and implements // ConnectingEndpoint, BoundEndpoint and tcpip.Endpoint. // // connectionedEndpoints must be in connected state in order to transfer data. // // This implementation includes STREAM and SEQPACKET Unix sockets created with // socket(2), accept(2) or socketpair(2) and dgram unix sockets created with // socketpair(2). See unix_connectionless.go for the implementation of DGRAM // Unix sockets created with socket(2). // // The state is much simpler than a TCP endpoint, so it is not encoded // explicitly. Instead we enforce the following invariants: // // receiver != nil, connected != nil => connected. // path != "" && acceptedChan == nil => bound, not listening. // path != "" && acceptedChan != nil => bound and listening. // // Only one of these will be true at any moment. // // +stateify savable type connectionedEndpoint struct { baseEndpoint // id is the unique endpoint identifier. This is used exclusively for // lock ordering within connect. id uint64 // idGenerator is used to generate new unique endpoint identifiers. idGenerator uniqueid.Provider // stype is used by connecting sockets to ensure that they are the // same type. The value is typically either tcpip.SockSeqpacket or // tcpip.SockStream. stype linux.SockType // acceptedChan is per the TCP endpoint implementation. Note that the // sockets in this channel are _already in the connected state_, and // have another associated connectionedEndpoint. // // If nil, then no listen call has been made. acceptedChan chan *connectionedEndpoint `state:".([]*connectionedEndpoint)"` // boundSocketFD corresponds to a bound socket on the host filesystem // that may listen and accept incoming connections. // // boundSocketFD is protected by baseEndpoint.mu. boundSocketFD BoundSocketFD } var ( _ = BoundEndpoint((*connectionedEndpoint)(nil)) _ = Endpoint((*connectionedEndpoint)(nil)) ) // NewConnectioned creates a new unbound connectionedEndpoint. 
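//
// A hedged usage sketch (uidProvider is any uniqueid.Provider implementation
// supplied by the caller):
//
//	ep := NewConnectioned(ctx, linux.SOCK_STREAM, uidProvider)
//	defer ep.Close(ctx)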
func NewConnectioned(ctx context.Context, stype linux.SockType, uid uniqueid.Provider) Endpoint { return newConnectioned(ctx, stype, uid) } func newConnectioned(ctx context.Context, stype linux.SockType, uid uniqueid.Provider) *connectionedEndpoint { ep := &connectionedEndpoint{ baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, id: uid.UniqueID(), idGenerator: uid, stype: stype, } ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) ep.ops.SetSendBufferSize(defaultBufferSize, false /* notify */) ep.ops.SetReceiveBufferSize(defaultBufferSize, false /* notify */) return ep } // NewPair allocates a new pair of connected unix-domain connectionedEndpoints. func NewPair(ctx context.Context, stype linux.SockType, uid uniqueid.Provider) (Endpoint, Endpoint) { a := newConnectioned(ctx, stype, uid) b := newConnectioned(ctx, stype, uid) q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: defaultBufferSize} q1.InitRefs() q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: defaultBufferSize} q2.InitRefs() if stype == linux.SOCK_STREAM { a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} b.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q2}} } else { a.receiver = &queueReceiver{q1} b.receiver = &queueReceiver{q2} } q2.IncRef() a.connected = &connectedEndpoint{ endpoint: b, writeQueue: q2, } q1.IncRef() b.connected = &connectedEndpoint{ endpoint: a, writeQueue: q1, } return a, b } // NewExternal creates a new externally backed Endpoint. It behaves like a // socketpair. func NewExternal(stype linux.SockType, uid uniqueid.Provider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint { ep := &connectionedEndpoint{ baseEndpoint: baseEndpoint{Queue: queue, receiver: receiver, connected: connected}, id: uid.UniqueID(), idGenerator: uid, stype: stype, } ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) ep.ops.SetSendBufferSize(connected.SendMaxQueueSize(), false /* notify */) ep.ops.SetReceiveBufferSize(defaultBufferSize, false /* notify */) return ep } // ID implements ConnectingEndpoint.ID. func (e *connectionedEndpoint) ID() uint64 { return e.id } // Type implements ConnectingEndpoint.Type and Endpoint.Type. func (e *connectionedEndpoint) Type() linux.SockType { return e.stype } // WaiterQueue implements ConnectingEndpoint.WaiterQueue. func (e *connectionedEndpoint) WaiterQueue() *waiter.Queue { return e.Queue } // isBound returns true iff the connectionedEndpoint is bound (but not // listening). func (e *connectionedEndpoint) isBound() bool { return e.path != "" && e.acceptedChan == nil } // Listening implements ConnectingEndpoint.Listening. func (e *connectionedEndpoint) Listening() bool { e.Lock() defer e.Unlock() return e.ListeningLocked() } func (e *connectionedEndpoint) ListeningLocked() bool { return e.acceptedChan != nil } // Close puts the connectionedEndpoint in a closed state and frees all // resources associated with it. // // The socket will be a fresh state after a call to close and may be reused. // That is, close may be used to "unbind" or "disconnect" the socket in error // paths. func (e *connectionedEndpoint) Close(ctx context.Context) { var acceptedChan chan *connectionedEndpoint e.Lock() var ( c ConnectedEndpoint r Receiver ) switch { case e.Connected(): e.connected.CloseSend() e.receiver.CloseRecv() // Still have unread data? If yes, we set this into the write // end so that the peer can get ECONNRESET) when it does read. 
if e.receiver.RecvQueuedSize() > 0 { e.connected.CloseUnread() } c = e.connected r = e.receiver e.connected = nil e.receiver = nil case e.isBound(): e.path = "" case e.ListeningLocked(): close(e.acceptedChan) acceptedChan = e.acceptedChan e.acceptedChan = nil e.path = "" } e.Unlock() if acceptedChan != nil { for n := range acceptedChan { n.Close(ctx) } } if c != nil { c.CloseNotify() c.Release(ctx) } e.ResetBoundSocketFD(ctx) if r != nil { r.CloseNotify() r.Release(ctx) } } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error { if ce.Type() != e.stype { return syserr.ErrWrongProtocolForSocket } // Check if ce is e to avoid a deadlock. if ce, ok := ce.(*connectionedEndpoint); ok && ce == e { return syserr.ErrInvalidEndpointState } // Do a dance to safely acquire locks on both endpoints. if e.id < ce.ID() { e.Lock() ce.NestedLock(endpointLockHigherid) } else { ce.Lock() e.NestedLock(endpointLockHigherid) } // Check connecting state. if ce.Connected() { e.NestedUnlock(endpointLockHigherid) ce.Unlock() return syserr.ErrAlreadyConnected } if ce.ListeningLocked() { e.NestedUnlock(endpointLockHigherid) ce.Unlock() return syserr.ErrInvalidEndpointState } // Check bound state. if !e.ListeningLocked() { e.NestedUnlock(endpointLockHigherid) ce.Unlock() return syserr.ErrConnectionRefused } // Create a newly bound connectionedEndpoint. ne := &connectionedEndpoint{ baseEndpoint: baseEndpoint{ path: e.path, Queue: &waiter.Queue{}, }, id: e.idGenerator.UniqueID(), idGenerator: e.idGenerator, stype: e.stype, } ne.ops.InitHandler(ne, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) ne.ops.SetSendBufferSize(defaultBufferSize, false /* notify */) ne.ops.SetReceiveBufferSize(defaultBufferSize, false /* notify */) ne.SocketOptions().SetPassCred(e.SocketOptions().GetPassCred()) readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: defaultBufferSize} readQueue.InitRefs() ne.connected = &connectedEndpoint{ endpoint: ce, writeQueue: readQueue, } // Make sure the accepted endpoint inherits this listening socket's SO_SNDBUF. writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: e.ops.GetSendBufferSize()} writeQueue.InitRefs() if e.stype == linux.SOCK_STREAM { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { ne.receiver = &queueReceiver{readQueue: writeQueue} } select { case e.acceptedChan <- ne: // Commit state. writeQueue.IncRef() connected := &connectedEndpoint{ endpoint: ne, writeQueue: writeQueue, } readQueue.IncRef() if e.stype == linux.SOCK_STREAM { returnConnect(&streamQueueReceiver{queueReceiver: queueReceiver{readQueue: readQueue}}, connected) } else { returnConnect(&queueReceiver{readQueue: readQueue}, connected) } // Notify can deadlock if we are holding these locks. e.NestedUnlock(endpointLockHigherid) ce.Unlock() // Notify on both ends. e.Notify(waiter.ReadableEvents) ce.WaiterQueue().Notify(waiter.WritableEvents) return nil default: // Busy; return EAGAIN per spec. e.NestedUnlock(endpointLockHigherid) ce.Unlock() ne.Close(ctx) return syserr.ErrTryAgain } } // UnidirectionalConnect implements BoundEndpoint.UnidirectionalConnect. 
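//
// Connection-oriented (STREAM/SEQPACKET) endpoints only support bidirectional
// connections, so this always returns ErrConnectionRefused.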
func (e *connectionedEndpoint) UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error) { return nil, syserr.ErrConnectionRefused } // Connect attempts to directly connect to another Endpoint. // Implements Endpoint.Connect. func (e *connectionedEndpoint) Connect(ctx context.Context, server BoundEndpoint) *syserr.Error { returnConnect := func(r Receiver, ce ConnectedEndpoint) { e.receiver = r e.connected = ce // Make sure the newly created connected endpoint's write queue is updated // to reflect this endpoint's send buffer size. if bufSz := e.connected.SetSendBufferSize(e.ops.GetSendBufferSize()); bufSz != e.ops.GetSendBufferSize() { e.ops.SetSendBufferSize(bufSz, false /* notify */) e.ops.SetReceiveBufferSize(bufSz, false /* notify */) } } return server.BidirectionalConnect(ctx, e, returnConnect) } // Listen starts listening on the connection. func (e *connectionedEndpoint) Listen(ctx context.Context, backlog int) *syserr.Error { e.Lock() defer e.Unlock() if e.ListeningLocked() { // Adjust the size of the channel iff we can fix existing // pending connections into the new one. if len(e.acceptedChan) > backlog { return syserr.ErrInvalidEndpointState } origChan := e.acceptedChan e.acceptedChan = make(chan *connectionedEndpoint, backlog) close(origChan) for ep := range origChan { e.acceptedChan <- ep } if e.boundSocketFD != nil { if err := e.boundSocketFD.Listen(ctx, int32(backlog)); err != nil { return syserr.FromError(err) } } return nil } if !e.isBound() { return syserr.ErrInvalidEndpointState } // Normal case. e.acceptedChan = make(chan *connectionedEndpoint, backlog) if e.boundSocketFD != nil { if err := e.boundSocketFD.Listen(ctx, int32(backlog)); err != nil { return syserr.FromError(err) } } return nil } // Accept accepts a new connection. func (e *connectionedEndpoint) Accept(ctx context.Context, peerAddr *Address) (Endpoint, *syserr.Error) { e.Lock() if !e.ListeningLocked() { e.Unlock() return nil, syserr.ErrInvalidEndpointState } ne, err := e.getAcceptedEndpointLocked(ctx) e.Unlock() if err != nil { return nil, err } if peerAddr != nil { ne.Lock() c := ne.connected ne.Unlock() if c != nil { addr, err := c.GetLocalAddress() if err != nil { return nil, syserr.TranslateNetstackError(err) } *peerAddr = addr } } return ne, nil } // Preconditions: // - e.Listening() // - e is locked. func (e *connectionedEndpoint) getAcceptedEndpointLocked(ctx context.Context) (*connectionedEndpoint, *syserr.Error) { // Accept connections from within the sentry first, since this avoids // an RPC to the gofer on the common path. select { case ne := <-e.acceptedChan: return ne, nil default: // No internal connections. } if e.boundSocketFD == nil { return nil, syserr.ErrWouldBlock } // Check for external connections. nfd, err := e.boundSocketFD.Accept(ctx) if err == unix.EWOULDBLOCK { return nil, syserr.ErrWouldBlock } if err != nil { return nil, syserr.FromError(err) } q := &waiter.Queue{} scme, serr := NewSCMEndpoint(nfd, q, e.path) if serr != nil { unix.Close(nfd) return nil, serr } scme.Init() return NewExternal(e.stype, e.idGenerator, q, scme, scme).(*connectionedEndpoint), nil } // Bind binds the connection. // // For Unix connectionedEndpoints, this _only sets the address associated with // the socket_. Work associated with sockets in the filesystem or finding those // sockets must be done by a higher level. // // Bind will fail only if the socket is connected, bound or the passed address // is invalid (the empty string). 
func (e *connectionedEndpoint) Bind(addr Address) *syserr.Error { e.Lock() defer e.Unlock() if e.isBound() || e.ListeningLocked() { return syserr.ErrAlreadyBound } if addr.Addr == "" { // The empty string is not permitted. return syserr.ErrBadLocalAddress } // Save the bound address. e.path = addr.Addr return nil } // SendMsg writes data and a control message to the endpoint's peer. // This method does not block if the data cannot be written. func (e *connectionedEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, func(), *syserr.Error) { // Stream sockets do not support specifying the endpoint. Seqpacket // sockets ignore the passed endpoint. if e.stype == linux.SOCK_STREAM && to != nil { return 0, nil, syserr.ErrNotSupported } return e.baseEndpoint.SendMsg(ctx, data, c, to) } func (e *connectionedEndpoint) isBoundSocketReadable() bool { if e.boundSocketFD == nil { return false } return fdnotifier.NonBlockingPoll(e.boundSocketFD.NotificationFD(), waiter.ReadableEvents)&waiter.ReadableEvents != 0 } // Readiness returns the current readiness of the connectionedEndpoint. For // example, if waiter.EventIn is set, the connectionedEndpoint is immediately // readable. func (e *connectionedEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask { e.Lock() defer e.Unlock() ready := waiter.EventMask(0) switch { case e.Connected(): if mask&waiter.ReadableEvents != 0 && e.receiver.Readable() { ready |= waiter.ReadableEvents } if mask&waiter.WritableEvents != 0 && e.connected.Writable() { ready |= waiter.WritableEvents } if mask&(waiter.EventHUp|waiter.EventRdHUp) != 0 && e.receiver.IsRecvClosed() { ready |= waiter.EventRdHUp if mask&waiter.EventHUp != 0 && e.connected.IsSendClosed() { ready |= waiter.EventHUp } } case e.ListeningLocked(): if mask&waiter.ReadableEvents != 0 && (len(e.acceptedChan) > 0 || e.isBoundSocketReadable()) { ready |= waiter.ReadableEvents } } return ready } // State implements socket.Socket.State. func (e *connectionedEndpoint) State() uint32 { e.Lock() defer e.Unlock() if e.Connected() { return linux.SS_CONNECTED } return linux.SS_UNCONNECTED } // OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize. func (e *connectionedEndpoint) OnSetSendBufferSize(v int64) (newSz int64) { e.Lock() defer e.Unlock() if e.Connected() { return e.baseEndpoint.connected.SetSendBufferSize(v) } return v } // WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters. func (e *connectionedEndpoint) WakeupWriters() {} // SetBoundSocketFD implement HostBountEndpoint.SetBoundSocketFD. func (e *connectionedEndpoint) SetBoundSocketFD(ctx context.Context, bsFD BoundSocketFD) error { e.Lock() defer e.Unlock() if e.path != "" || e.boundSocketFD != nil { bsFD.Close(ctx) return syserr.ErrAlreadyBound.ToError() } e.boundSocketFD = bsFD fdnotifier.AddFD(bsFD.NotificationFD(), e.Queue) return nil } // SetBoundSocketFD implement HostBountEndpoint.ResetBoundSocketFD. func (e *connectionedEndpoint) ResetBoundSocketFD(ctx context.Context) { e.Lock() defer e.Unlock() if e.boundSocketFD == nil { return } fdnotifier.RemoveFD(e.boundSocketFD.NotificationFD()) e.boundSocketFD.Close(ctx) e.boundSocketFD = nil } // EventRegister implements waiter.Waitable.EventRegister. 
func (e *connectionedEndpoint) EventRegister(we *waiter.Entry) error { if err := e.baseEndpoint.EventRegister(we); err != nil { return err } e.Lock() bsFD := e.boundSocketFD e.Unlock() if bsFD != nil { fdnotifier.UpdateFD(bsFD.NotificationFD()) } return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (e *connectionedEndpoint) EventUnregister(we *waiter.Entry) { e.baseEndpoint.EventUnregister(we) e.Lock() bsFD := e.boundSocketFD e.Unlock() if bsFD != nil { fdnotifier.UpdateFD(bsFD.NotificationFD()) } } func (e *connectionedEndpoint) GetAcceptConn() bool { return e.Listening() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/connectioned_state.go000066400000000000000000000046151465435605700314640ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package transport import "context" // saveAcceptedChan is invoked by stateify. func (e *connectionedEndpoint) saveAcceptedChan() []*connectionedEndpoint { // If acceptedChan is nil (i.e. we are not listening) then we will save nil. // Otherwise we create a (possibly empty) slice of the values in acceptedChan and // save that. var acceptedSlice []*connectionedEndpoint if e.acceptedChan != nil { // Swap out acceptedChan with a new empty channel of the same capacity. saveChan := e.acceptedChan e.acceptedChan = make(chan *connectionedEndpoint, cap(saveChan)) // Create a new slice with the same len and capacity as the channel. acceptedSlice = make([]*connectionedEndpoint, len(saveChan), cap(saveChan)) // Drain acceptedChan into saveSlice, and fill up the new acceptChan at the // same time. for i := range acceptedSlice { ep := <-saveChan acceptedSlice[i] = ep e.acceptedChan <- ep } close(saveChan) } return acceptedSlice } // loadAcceptedChan is invoked by stateify. func (e *connectionedEndpoint) loadAcceptedChan(_ context.Context, acceptedSlice []*connectionedEndpoint) { // If acceptedSlice is nil, then acceptedChan should also be nil. if acceptedSlice != nil { // Otherwise, create a new channel with the same capacity as acceptedSlice. e.acceptedChan = make(chan *connectionedEndpoint, cap(acceptedSlice)) // Seed the channel with values from acceptedSlice. for _, ep := range acceptedSlice { e.acceptedChan <- ep } } } // beforeSave is invoked by stateify. func (e *connectionedEndpoint) beforeSave() { if e.boundSocketFD != nil { panic("Cannot save endpoint with bound host socket") } } // afterLoad is invoked by stateify. func (e *connectionedEndpoint) afterLoad(context.Context) { e.ops.InitHandler(e, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/connectionless.go000066400000000000000000000142671465435605700306460ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package transport import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" ) // connectionlessEndpoint is a unix endpoint for unix sockets that support operating in // a connectionless fashion. // // Specifically, this means datagram unix sockets not created with // socketpair(2). // // +stateify savable type connectionlessEndpoint struct { baseEndpoint } var ( _ = BoundEndpoint((*connectionlessEndpoint)(nil)) _ = Endpoint((*connectionlessEndpoint)(nil)) ) // NewConnectionless creates a new unbound dgram endpoint. func NewConnectionless(ctx context.Context) Endpoint { ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}} q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: defaultBufferSize} q.InitRefs() ep.receiver = &queueReceiver{readQueue: &q} ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) ep.ops.SetSendBufferSize(defaultBufferSize, false /* notify */) ep.ops.SetReceiveBufferSize(defaultBufferSize, false /* notify */) return ep } // isBound returns true iff the endpoint is bound. func (e *connectionlessEndpoint) isBound() bool { return e.path != "" } // Close puts the endpoint in a closed state and frees all resources associated // with it. func (e *connectionlessEndpoint) Close(ctx context.Context) { e.Lock() connected := e.connected e.connected = nil if e.isBound() { e.path = "" } e.receiver.CloseRecv() r := e.receiver e.receiver = nil e.Unlock() if connected != nil { connected.Release(ctx) } r.CloseNotify() r.Release(ctx) } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. func (e *connectionlessEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error { return syserr.ErrConnectionRefused } // UnidirectionalConnect implements BoundEndpoint.UnidirectionalConnect. func (e *connectionlessEndpoint) UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error) { e.Lock() r := e.receiver e.Unlock() if r == nil { return nil, syserr.ErrConnectionRefused } q := r.(*queueReceiver).readQueue if !q.TryIncRef() { return nil, syserr.ErrConnectionRefused } return &connectedEndpoint{ endpoint: e, writeQueue: q, }, nil } // SendMsg writes data and a control message to the specified endpoint. // This method does not block if the data cannot be written. func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, func(), *syserr.Error) { if to == nil { return e.baseEndpoint.SendMsg(ctx, data, c, nil) } connected, err := to.UnidirectionalConnect(ctx) if err != nil { return 0, nil, syserr.ErrInvalidEndpointState } defer connected.Release(ctx) e.Lock() n, notify, err := connected.Send(ctx, data, c, Address{Addr: e.path}) e.Unlock() var notifyFn func() if notify { notifyFn = connected.SendNotify } return n, notifyFn, err } // Type implements Endpoint.Type. 
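//
// Connectionless endpoints are always datagram sockets, so this reports
// linux.SOCK_DGRAM unconditionally. As a hypothetical sketch (not taken from
// the original sources) of how the API in this file fits together, assuming
// ctx is a valid context.Context and peer is a BoundEndpoint obtained from a
// higher layer, a caller might do roughly:
//
//	ep := NewConnectionless(ctx)
//	if serr := ep.Bind(Address{Addr: "example-address"}); serr != nil {
//		return serr
//	}
//	_, notify, serr := ep.SendMsg(ctx, [][]byte{[]byte("ping")}, ControlMessages{}, peer)
//	if notify != nil {
//		notify() // wake the peer's readers after a successful enqueue
//	}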
func (e *connectionlessEndpoint) Type() linux.SockType { return linux.SOCK_DGRAM } // Connect attempts to connect directly to server. func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoint) *syserr.Error { connected, err := server.UnidirectionalConnect(ctx) if err != nil { return err } e.Lock() if e.connected != nil { e.connected.Release(ctx) } e.connected = connected e.Unlock() return nil } // Listen starts listening on the connection. func (*connectionlessEndpoint) Listen(context.Context, int) *syserr.Error { return syserr.ErrNotSupported } // Accept accepts a new connection. func (*connectionlessEndpoint) Accept(context.Context, *Address) (Endpoint, *syserr.Error) { return nil, syserr.ErrNotSupported } // Bind binds the connection. // // For Unix endpoints, this _only sets the address associated with the socket_. // Work associated with sockets in the filesystem or finding those sockets must // be done by a higher level. // // Bind will fail only if the socket is connected, bound or the passed address // is invalid (the empty string). func (e *connectionlessEndpoint) Bind(addr Address) *syserr.Error { e.Lock() defer e.Unlock() if e.isBound() { return syserr.ErrAlreadyBound } if addr.Addr == "" { // The empty string is not permitted. return syserr.ErrBadLocalAddress } // Save the bound address. e.path = addr.Addr return nil } // Readiness returns the current readiness of the endpoint. For example, if // waiter.EventIn is set, the endpoint is immediately readable. func (e *connectionlessEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask { e.Lock() defer e.Unlock() ready := waiter.EventMask(0) if mask&waiter.ReadableEvents != 0 && e.receiver.Readable() { ready |= waiter.ReadableEvents } if e.Connected() { if mask&waiter.WritableEvents != 0 && e.connected.Writable() { ready |= waiter.WritableEvents } } return ready } // State implements socket.Socket.State. func (e *connectionlessEndpoint) State() uint32 { e.Lock() defer e.Unlock() switch { case e.isBound(): return linux.SS_UNCONNECTED case e.Connected(): return linux.SS_CONNECTING default: return linux.SS_DISCONNECTING } } // OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize. func (e *connectionlessEndpoint) OnSetSendBufferSize(v int64) (newSz int64) { e.Lock() defer e.Unlock() if e.Connected() { return e.baseEndpoint.connected.SetSendBufferSize(v) } return v } // WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters. func (e *connectionlessEndpoint) WakeupWriters() {} golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/connectionless_state.go000066400000000000000000000014631465435605700320400ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package transport import "context" // afterLoad is invoked by stateify. 
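//
// The socket options handler is not saved, so restore re-installs it here,
// mirroring what NewConnectionless does when the endpoint is first created.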
func (e *connectionlessEndpoint) afterLoad(context.Context) { e.ops.InitHandler(e, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/endpoint_mutex.go000066400000000000000000000033211465435605700306470ustar00rootroot00000000000000package transport import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type endpointMutex struct { mu sync.Mutex } var endpointprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var endpointlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type endpointlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. const ( endpointLockHigherid = endpointlockNameIndex(0) ) const () // Lock locks m. // +checklocksignore func (m *endpointMutex) Lock() { locking.AddGLock(endpointprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *endpointMutex) NestedLock(i endpointlockNameIndex) { locking.AddGLock(endpointprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *endpointMutex) Unlock() { locking.DelGLock(endpointprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *endpointMutex) NestedUnlock(i endpointlockNameIndex) { locking.DelGLock(endpointprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func endpointinitLockNames() { endpointlockNames = []string{"higherID"} } func init() { endpointinitLockNames() endpointprefixIndex = locking.NewMutexClass(reflect.TypeOf(endpointMutex{}), endpointlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/host.go000066400000000000000000000313371465435605700265720ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package transport import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/waiter" ) // SCMRights implements RightsControlMessage with host FDs. type SCMRights struct { FDs []int } // Clone implements RightsControlMessage.Clone. func (c *SCMRights) Clone() RightsControlMessage { // Host rights never need to be cloned. return nil } // Release implements RightsControlMessage.Release. 
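//
// Release closes every donated host FD and clears the slice. As a rough,
// illustrative sketch (fd being some host FD received elsewhere), rights are
// typically carried inside a ControlMessages value and released with it:
//
//	rights := &SCMRights{FDs: []int{fd}}
//	cm := ControlMessages{Rights: rights}
//	defer cm.Release(ctx)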
func (c *SCMRights) Release(ctx context.Context) { for _, fd := range c.FDs { unix.Close(fd) } c.FDs = nil } // HostConnectedEndpoint is an implementation of ConnectedEndpoint and // Receiver. It is backed by a host fd that was imported at sentry startup. // This fd is shared with a hostfs inode, which retains ownership of it. // // HostConnectedEndpoint is saveable, since we expect that the host will // provide the same fd upon restore. // // As of this writing, we only allow Unix sockets to be imported. // // +stateify savable type HostConnectedEndpoint struct { HostConnectedEndpointRefs // mu protects fd below. mu sync.RWMutex `state:"nosave"` // fd is the host fd backing this endpoint. fd int // addr is the address at which this endpoint is bound. addr string // sndbuf is the size of the send buffer. // // N.B. When this is smaller than the host size, we present it via // GetSockOpt and message splitting/rejection in SendMsg, but do not // prevent lots of small messages from filling the real send buffer // size on the host. sndbuf atomicbitops.Int64 `state:"nosave"` // stype is the type of Unix socket. stype linux.SockType // rdShutdown is true if receptions have been shutdown with SHUT_RD. rdShutdown atomicbitops.Bool // wrShutdown is true if transmissions have been shutdown with SHUT_WR. wrShutdown atomicbitops.Bool } // init performs initialization required for creating new // HostConnectedEndpoints and for restoring them. func (c *HostConnectedEndpoint) init() *syserr.Error { c.InitRefs() return c.initFromOptions() } func (c *HostConnectedEndpoint) initFromOptions() *syserr.Error { family, err := unix.GetsockoptInt(c.fd, unix.SOL_SOCKET, unix.SO_DOMAIN) if err != nil { return syserr.FromError(err) } if family != unix.AF_UNIX { // We only allow Unix sockets. return syserr.ErrInvalidEndpointState } stype, err := unix.GetsockoptInt(c.fd, unix.SOL_SOCKET, unix.SO_TYPE) if err != nil { return syserr.FromError(err) } if err := unix.SetNonblock(c.fd, true); err != nil { return syserr.FromError(err) } sndbuf, err := unix.GetsockoptInt(c.fd, unix.SOL_SOCKET, unix.SO_SNDBUF) if err != nil { return syserr.FromError(err) } c.stype = linux.SockType(stype) c.sndbuf.Store(int64(sndbuf)) return nil } // NewHostConnectedEndpoint creates a new HostConnectedEndpoint backed by a // host fd imported at sentry startup. // // The caller is responsible for calling Init(). Additionally, Release needs to // be called twice because HostConnectedEndpoint is both a Receiver and // HostConnectedEndpoint. func NewHostConnectedEndpoint(hostFD int, addr string) (*HostConnectedEndpoint, *syserr.Error) { e := HostConnectedEndpoint{ fd: hostFD, addr: addr, } if err := e.init(); err != nil { return nil, err } // HostConnectedEndpointRefs start off with a single reference. We need two. e.IncRef() return &e, nil } // SockType returns the underlying socket type. func (c *HostConnectedEndpoint) SockType() linux.SockType { return c.stype } // Send implements ConnectedEndpoint.Send. func (c *HostConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages ControlMessages, from Address) (int64, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() if !controlMessages.Empty() { return 0, false, syserr.ErrInvalidEndpointState } // Since stream sockets don't preserve message boundaries, we can write // only as much of the message as fits in the send buffer. 
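// The short-write handling below has two halves: a short write on a stream
// socket is reported as EAGAIN so the caller can block and retry the
// remainder, while any other error after a partial write (n > 0) is
// suppressed, since the bytes already handed to the host cannot be unwritten.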
truncate := c.stype == linux.SOCK_STREAM n, totalLen, err := fdWriteVec(c.fd, data, c.SendMaxQueueSize(), truncate) if n < totalLen && err == nil { // The host only returns a short write if it would otherwise // block (and only for stream sockets). err = linuxerr.EAGAIN } if n > 0 && !linuxerr.Equals(linuxerr.EAGAIN, err) { // The caller may need to block to send more data, but // otherwise there isn't anything that can be done about an // error with a partial write. err = nil } // There is no need for the callee to call SendNotify because fdWriteVec // uses the host's sendmsg(2) and the host kernel's queue. return n, false, syserr.FromError(err) } // SendNotify implements ConnectedEndpoint.SendNotify. func (c *HostConnectedEndpoint) SendNotify() {} // CloseSend implements ConnectedEndpoint.CloseSend. func (c *HostConnectedEndpoint) CloseSend() { c.mu.Lock() defer c.mu.Unlock() if err := unix.Shutdown(c.fd, unix.SHUT_WR); err != nil { // A well-formed UDS shutdown can't fail. See // net/unix/af_unix.c:unix_shutdown. panic(fmt.Sprintf("failed write shutdown on host socket %+v: %v", c, err)) } c.wrShutdown.Store(true) } // CloseNotify implements ConnectedEndpoint.CloseNotify. func (c *HostConnectedEndpoint) CloseNotify() {} // IsSendClosed implements ConnectedEndpoint.IsSendClosed. func (c *HostConnectedEndpoint) IsSendClosed() bool { return c.wrShutdown.Load() } // Writable implements ConnectedEndpoint.Writable. func (c *HostConnectedEndpoint) Writable() bool { c.mu.RLock() defer c.mu.RUnlock() return fdnotifier.NonBlockingPoll(int32(c.fd), waiter.WritableEvents)&waiter.WritableEvents != 0 } // Passcred implements ConnectedEndpoint.Passcred. func (c *HostConnectedEndpoint) Passcred() bool { // We don't support credential passing for host sockets. return false } // GetLocalAddress implements ConnectedEndpoint.GetLocalAddress. func (c *HostConnectedEndpoint) GetLocalAddress() (Address, tcpip.Error) { return Address{Addr: c.addr}, nil } // EventUpdate implements ConnectedEndpoint.EventUpdate. func (c *HostConnectedEndpoint) EventUpdate() error { c.mu.RLock() defer c.mu.RUnlock() if c.fd != -1 { if err := fdnotifier.UpdateFD(int32(c.fd)); err != nil { return err } } return nil } // Recv implements Receiver.Recv. func (c *HostConnectedEndpoint) Recv(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() var cm unet.ControlMessage if args.NumRights > 0 { cm.EnableFDs(int(args.NumRights)) } // N.B. Unix sockets don't have a receive buffer, the send buffer // serves both purposes. out := RecvOutput{Source: Address{Addr: c.addr}} var err error var controlLen uint64 out.RecvLen, out.MsgLen, controlLen, out.ControlTrunc, err = fdReadVec(c.fd, data, []byte(cm), args.Peek, c.RecvMaxQueueSize()) if out.RecvLen > 0 && err != nil { // We got some data, so all we need to do on error is return // the data that we got. Short reads are fine, no need to // block. err = nil } if err != nil { return RecvOutput{}, false, syserr.FromError(err) } // There is no need for the callee to call RecvNotify because fdReadVec uses // the host's recvmsg(2) and the host kernel's queue. // Trim the control data if we received less than the full amount. if controlLen < uint64(len(cm)) { cm = cm[:controlLen] } // Avoid extra allocations in the case where there isn't any control data. 
if len(cm) == 0 { return out, false, nil } fds, err := cm.ExtractFDs() if err != nil { return RecvOutput{}, false, syserr.FromError(err) } if len(fds) == 0 { return out, false, nil } out.Control = ControlMessages{ Rights: &SCMRights{fds}, } return out, false, nil } // RecvNotify implements Receiver.RecvNotify. func (c *HostConnectedEndpoint) RecvNotify() {} // CloseRecv implements Receiver.CloseRecv. func (c *HostConnectedEndpoint) CloseRecv() { c.mu.Lock() defer c.mu.Unlock() if err := unix.Shutdown(c.fd, unix.SHUT_RD); err != nil { // A well-formed UDS shutdown can't fail. See // net/unix/af_unix.c:unix_shutdown. panic(fmt.Sprintf("failed read shutdown on host socket %+v: %v", c, err)) } c.rdShutdown.Store(true) } // IsRecvClosed implements Receiver.IsRecvClosed. func (c *HostConnectedEndpoint) IsRecvClosed() bool { return c.rdShutdown.Load() } // Readable implements Receiver.Readable. func (c *HostConnectedEndpoint) Readable() bool { c.mu.RLock() defer c.mu.RUnlock() return fdnotifier.NonBlockingPoll(int32(c.fd), waiter.ReadableEvents)&waiter.ReadableEvents != 0 } // SendQueuedSize implements Receiver.SendQueuedSize. func (c *HostConnectedEndpoint) SendQueuedSize() int64 { // TODO(gvisor.dev/issue/273): SendQueuedSize isn't supported for host // sockets because we don't allow the sentry to call ioctl(2). return -1 } // RecvQueuedSize implements Receiver.RecvQueuedSize. func (c *HostConnectedEndpoint) RecvQueuedSize() int64 { // TODO(gvisor.dev/issue/273): RecvQueuedSize isn't supported for host // sockets because we don't allow the sentry to call ioctl(2). return -1 } // SendMaxQueueSize implements Receiver.SendMaxQueueSize. func (c *HostConnectedEndpoint) SendMaxQueueSize() int64 { return c.sndbuf.Load() } // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. func (c *HostConnectedEndpoint) RecvMaxQueueSize() int64 { // N.B. Unix sockets don't use the receive buffer. We'll claim it is // the same size as the send buffer. return c.sndbuf.Load() } func (c *HostConnectedEndpoint) destroyLocked() { c.fd = -1 } // Release implements ConnectedEndpoint.Release and Receiver.Release. func (c *HostConnectedEndpoint) Release(ctx context.Context) { c.DecRef(func() { c.mu.Lock() c.destroyLocked() c.mu.Unlock() }) } // CloseUnread implements ConnectedEndpoint.CloseUnread. func (c *HostConnectedEndpoint) CloseUnread() {} // SetSendBufferSize implements ConnectedEndpoint.SetSendBufferSize. func (c *HostConnectedEndpoint) SetSendBufferSize(v int64) (newSz int64) { // gVisor does not permit setting of SO_SNDBUF for host backed unix // domain sockets. return c.sndbuf.Load() } // SetReceiveBufferSize implements ConnectedEndpoint.SetReceiveBufferSize. func (c *HostConnectedEndpoint) SetReceiveBufferSize(v int64) (newSz int64) { // gVisor does not permit setting of SO_RCVBUF for host backed unix // domain sockets. Receive buffer does not have any effect for unix // sockets and we claim to be the same as send buffer. return c.sndbuf.Load() } // SCMConnectedEndpoint represents an endpoint backed by a host fd that was // passed through a gofer Unix socket. It resembles HostConnectedEndpoint, with the // following differences: // - SCMConnectedEndpoint is not saveable, because the host cannot guarantee // the same descriptor number across S/R. // - SCMConnectedEndpoint holds ownership of its fd and notification queue. type SCMConnectedEndpoint struct { HostConnectedEndpoint queue *waiter.Queue } // Init will do the initialization required without holding other locks. 
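//
// Concretely, Init registers the endpoint's host FD with fdnotifier against
// the queue supplied at construction so that host readiness events reach
// waiters. Per NewSCMEndpoint below, callers are expected to invoke it before
// using the endpoint; roughly (illustrative only, error handling elided):
//
//	ep, serr := NewSCMEndpoint(hostFD, queue, addr)
//	if serr != nil { /* handle */ }
//	if err := ep.Init(); err != nil { /* handle */ }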
func (e *SCMConnectedEndpoint) Init() error { return fdnotifier.AddFD(int32(e.fd), e.queue) } // Release implements ConnectedEndpoint.Release and Receiver.Release. func (e *SCMConnectedEndpoint) Release(ctx context.Context) { e.DecRef(func() { e.mu.Lock() fdnotifier.RemoveFD(int32(e.fd)) if err := unix.Close(e.fd); err != nil { log.Warningf("Failed to close host fd %d: %v", err) } e.destroyLocked() e.mu.Unlock() }) } // NewSCMEndpoint creates a new SCMConnectedEndpoint backed by a host fd that // was passed through a Unix socket. // // The caller is responsible for calling Init(). Additionally, Release needs to // be called twice because ConnectedEndpoint is both a Receiver and // ConnectedEndpoint. func NewSCMEndpoint(hostFD int, queue *waiter.Queue, addr string) (*SCMConnectedEndpoint, *syserr.Error) { e := SCMConnectedEndpoint{ HostConnectedEndpoint: HostConnectedEndpoint{ fd: hostFD, addr: addr, }, queue: queue, } if err := e.init(); err != nil { return nil, err } // e starts off with a single reference. We need two. e.IncRef() return &e, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/host_connected_endpoint_refs.go000066400000000000000000000105321465435605700335250ustar00rootroot00000000000000package transport import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const HostConnectedEndpointenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var HostConnectedEndpointobj *HostConnectedEndpoint // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type HostConnectedEndpointRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *HostConnectedEndpointRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *HostConnectedEndpointRefs) RefType() string { return fmt.Sprintf("%T", HostConnectedEndpointobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *HostConnectedEndpointRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. 
func (r *HostConnectedEndpointRefs) LogRefs() bool { return HostConnectedEndpointenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *HostConnectedEndpointRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *HostConnectedEndpointRefs) IncRef() { v := r.refCount.Add(1) if HostConnectedEndpointenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *HostConnectedEndpointRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if HostConnectedEndpointenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *HostConnectedEndpointRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if HostConnectedEndpointenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *HostConnectedEndpointRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/host_iovec.go000066400000000000000000000051211465435605700277470ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package transport import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/hostfd" ) // copyToMulti copies as many bytes from src to dst as possible. func copyToMulti(dst [][]byte, src []byte) { for _, d := range dst { done := copy(d, src) src = src[done:] if len(src) == 0 { break } } } // copyFromMulti copies as many bytes from src to dst as possible. func copyFromMulti(dst []byte, src [][]byte) { for _, s := range src { done := copy(dst, s) dst = dst[done:] if len(dst) == 0 { break } } } // buildIovec builds an iovec slice from the given []byte slice. // // If truncate, truncate bufs > maxlen. 
Otherwise, immediately return an error. // // If length < the total length of bufs, err indicates why, even when returning // a truncated iovec. // // If intermediate != nil, iovecs references intermediate rather than bufs and // the caller must copy to/from bufs as necessary. func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovecs []unix.Iovec, intermediate []byte, err error) { var iovsRequired int for _, b := range bufs { length += int64(len(b)) if len(b) > 0 { iovsRequired++ } } stopLen := length if length > maxlen { if truncate { stopLen = maxlen err = linuxerr.EAGAIN } else { return 0, nil, nil, linuxerr.EMSGSIZE } } if iovsRequired > hostfd.MaxSendRecvMsgIov { // The kernel will reject our call if we pass this many iovs. // Use a single intermediate buffer instead. b := make([]byte, stopLen) return stopLen, []unix.Iovec{{ Base: &b[0], Len: uint64(stopLen), }}, b, err } var total int64 iovecs = make([]unix.Iovec, 0, iovsRequired) for i := range bufs { l := len(bufs[i]) if l == 0 { continue } stop := int64(l) if total+stop > stopLen { stop = stopLen - total } iovecs = append(iovecs, unix.Iovec{ Base: &bufs[i][0], Len: uint64(stop), }) total += stop if total >= stopLen { break } } return total, iovecs, nil, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/host_unsafe.go000066400000000000000000000057311465435605700301320ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package transport import ( "unsafe" "golang.org/x/sys/unix" ) // fdReadVec receives from fd to bufs. // // If the total length of bufs is > maxlen, fdReadVec will do a partial read // and err will indicate why the message was truncated. func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (readLen int64, msgLen int64, controlLen uint64, controlTrunc bool, err error) { flags := uintptr(unix.MSG_DONTWAIT | unix.MSG_TRUNC) if peek { flags |= unix.MSG_PEEK } // Always truncate the receive buffer. All socket types will truncate // received messages. length, iovecs, intermediate, err := buildIovec(bufs, maxlen, true) if err != nil && len(iovecs) == 0 { // No partial write to do, return error immediately. return 0, 0, 0, false, err } var msg unix.Msghdr if len(control) != 0 { msg.Control = &control[0] msg.Controllen = uint64(len(control)) } if len(iovecs) != 0 { msg.Iov = &iovecs[0] msg.Iovlen = uint64(len(iovecs)) } rawN, _, e := unix.RawSyscall(unix.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags) if e != 0 { // N.B. prioritize the syscall error over the buildIovec error. return 0, 0, 0, false, e } n := int64(rawN) // Copy data back to bufs. if intermediate != nil { copyToMulti(bufs, intermediate) } controlTrunc = msg.Flags&unix.MSG_CTRUNC == unix.MSG_CTRUNC if n > length { return length, n, msg.Controllen, controlTrunc, nil } return n, n, msg.Controllen, controlTrunc, nil } // fdWriteVec sends from bufs to fd. 
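//
// Like fdReadVec above, it goes through buildIovec: when the number of
// non-empty buffers exceeds hostfd.MaxSendRecvMsgIov, the data is staged
// through a single intermediate buffer (via copyFromMulti) before the
// sendmsg(2) call, since the host would reject an oversized iovec array.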
// // If the total length of bufs is > maxlen && truncate, fdWriteVec will do a // partial write and err will indicate why the message was truncated. func fdWriteVec(fd int, bufs [][]byte, maxlen int64, truncate bool) (int64, int64, error) { length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate) if err != nil && len(iovecs) == 0 { // No partial write to do, return error immediately. return 0, length, err } // Copy data to intermediate buf. if intermediate != nil { copyFromMulti(intermediate, bufs) } var msg unix.Msghdr if len(iovecs) > 0 { msg.Iov = &iovecs[0] msg.Iovlen = uint64(len(iovecs)) } n, _, e := unix.RawSyscall(unix.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), unix.MSG_DONTWAIT|unix.MSG_NOSIGNAL) if e != 0 { // N.B. prioritize the syscall error over the buildIovec error. return 0, length, e } return int64(n), length, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/queue.go000066400000000000000000000142501465435605700267340ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package transport import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" ) // queue is a buffer queue. // // +stateify savable type queue struct { queueRefs ReaderQueue *waiter.Queue WriterQueue *waiter.Queue mu queueMutex `state:"nosave"` closed atomicbitops.Bool unread bool used int64 limit int64 dataList messageList } // Close closes q for reading and writing. It is immediately not writable and // will become unreadable when no more data is pending. // // Both the read and write queues must be notified after closing: // q.ReaderQueue.Notify(waiter.ReadableEvents) // q.WriterQueue.Notify(waiter.WritableEvents) func (q *queue) Close() { q.mu.Lock() q.closed.Store(true) q.mu.Unlock() } func (q *queue) isClosed() bool { return q.closed.Load() } // Reset empties the queue and Releases all of the Entries. // // Both the read and write queues must be notified after resetting: // q.ReaderQueue.Notify(waiter.ReadableEvents) // q.WriterQueue.Notify(waiter.WritableEvents) func (q *queue) Reset(ctx context.Context) { q.mu.Lock() dataList := q.dataList q.dataList.Reset() q.used = 0 q.mu.Unlock() for cur := dataList.Front(); cur != nil; cur = cur.Next() { cur.Release(ctx) } } // DecRef implements RefCounter.DecRef. func (q *queue) DecRef(ctx context.Context) { q.queueRefs.DecRef(func() { // We don't need to notify after resetting because no one cares about // this queue after all references have been dropped. q.Reset(ctx) }) } // IsReadable determines if q is currently readable. func (q *queue) IsReadable() bool { q.mu.Lock() defer q.mu.Unlock() return q.closed.RacyLoad() || q.dataList.Front() != nil } // bufWritable returns true if there is space for writing. // // N.B. Linux only considers a unix socket "writable" if >75% of the buffer is // free. // // See net/unix/af_unix.c:unix_writeable. 
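//
// Equivalently, the queue is writable only while used < limit/4; for example,
// with a 64 KiB limit the queue stops reporting writable once 16 KiB of data
// are queued.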
func (q *queue) bufWritable() bool { return 4*q.used < q.limit } // IsWritable determines if q is currently writable. func (q *queue) IsWritable() bool { q.mu.Lock() defer q.mu.Unlock() return q.closed.RacyLoad() || q.bufWritable() } // Enqueue adds an entry to the data queue if room is available. // // If discardEmpty is true and there are zero bytes of data, the packet is // dropped. // // If truncate is true, Enqueue may truncate the message before enqueuing it. // Otherwise, the entire message must fit. If l is less than the size of data, // err indicates why. // // If notify is true, ReaderQueue.Notify must be called: // q.ReaderQueue.Notify(waiter.ReadableEvents) func (q *queue) Enqueue(ctx context.Context, data [][]byte, c ControlMessages, from Address, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) { q.mu.Lock() if q.closed.RacyLoad() { q.mu.Unlock() return 0, false, syserr.ErrClosedForSend } for _, d := range data { l += int64(len(d)) } if discardEmpty && l == 0 { q.mu.Unlock() c.Release(ctx) return 0, false, nil } free := q.limit - q.used if l > free && truncate { if free <= 0 { // Message can't fit right now. q.mu.Unlock() return 0, false, syserr.ErrWouldBlock } l = free err = syserr.ErrWouldBlock } if l > q.limit { // Message is too big to ever fit. q.mu.Unlock() return 0, false, syserr.ErrMessageTooLong } if l > free { // Message can't fit right now, and could not be truncated. q.mu.Unlock() return 0, false, syserr.ErrWouldBlock } // Aggregate l bytes of data. This will truncate the data if l is less than // the total bytes held in data. v := make([]byte, l) for i, b := 0, v; i < len(data) && len(b) > 0; i++ { n := copy(b, data[i]) b = b[n:] } notify = true q.used += l q.dataList.PushBack(&message{ Data: v, Control: c, Address: from, }) q.mu.Unlock() return l, notify, err } // Dequeue removes the first entry in the data queue, if one exists. // // If notify is true, WriterQueue.Notify must be called: // q.WriterQueue.Notify(waiter.WritableEvents) func (q *queue) Dequeue() (e *message, notify bool, err *syserr.Error) { q.mu.Lock() if q.dataList.Front() == nil { err := syserr.ErrWouldBlock if q.closed.RacyLoad() { err = syserr.ErrClosedForReceive if q.unread { err = syserr.ErrConnectionReset } } q.mu.Unlock() return nil, false, err } e = q.dataList.Front() q.dataList.Remove(e) q.used -= e.Length() notify = q.bufWritable() q.mu.Unlock() return e, notify, nil } // Peek returns the first entry in the data queue, if one exists. func (q *queue) Peek() (*message, *syserr.Error) { q.mu.Lock() defer q.mu.Unlock() if q.dataList.Front() == nil { err := syserr.ErrWouldBlock if q.closed.RacyLoad() { if err = syserr.ErrClosedForReceive; q.unread { err = syserr.ErrConnectionReset } } return nil, err } return q.dataList.Front().Peek(), nil } // QueuedSize returns the number of bytes currently in the queue, that is, the // number of readable bytes. func (q *queue) QueuedSize() int64 { q.mu.Lock() defer q.mu.Unlock() return q.used } // MaxQueueSize returns the maximum number of bytes storable in the queue. func (q *queue) MaxQueueSize() int64 { q.mu.Lock() defer q.mu.Unlock() return q.limit } // SetMaxQueueSize sets the maximum number of bytes storable in the queue. func (q *queue) SetMaxQueueSize(v int64) { q.mu.Lock() defer q.mu.Unlock() q.limit = v } // CloseUnread sets flag to indicate that the peer is closed (not shutdown) // with unread data. So if read on this queue shall return ECONNRESET error. 
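//
// In other words, once the queue is closed and drained, Dequeue and Peek
// above report syserr.ErrConnectionReset instead of ErrClosedForReceive.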
func (q *queue) CloseUnread() { q.mu.Lock() defer q.mu.Unlock() q.unread = true } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/queue_mutex.go000066400000000000000000000031201465435605700301500ustar00rootroot00000000000000package transport import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type queueMutex struct { mu sync.Mutex } var queueprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var queuelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type queuelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *queueMutex) Lock() { locking.AddGLock(queueprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *queueMutex) NestedLock(i queuelockNameIndex) { locking.AddGLock(queueprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *queueMutex) Unlock() { locking.DelGLock(queueprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *queueMutex) NestedUnlock(i queuelockNameIndex) { locking.DelGLock(queueprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func queueinitLockNames() {} func init() { queueinitLockNames() queueprefixIndex = locking.NewMutexClass(reflect.TypeOf(queueMutex{}), queuelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/queue_refs.go000066400000000000000000000100721465435605700277510ustar00rootroot00000000000000package transport import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const queueenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var queueobj *queue // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type queueRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. 
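//
// For example, TryIncRef below adds 1<<32 (a speculative reference); if the
// low 32 bits of the result are zero there were no real references, so the
// speculative reference is rolled back and false is returned, all without a
// CompareAndSwap loop.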
refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *queueRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *queueRefs) RefType() string { return fmt.Sprintf("%T", queueobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *queueRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *queueRefs) LogRefs() bool { return queueenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *queueRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *queueRefs) IncRef() { v := r.refCount.Add(1) if queueenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *queueRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if queueenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *queueRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if queueenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *queueRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/save_restore.go000066400000000000000000000015221465435605700303070ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package transport import ( "context" "fmt" ) // afterLoad is invoked by stateify. 
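//
// On restore, the endpoint re-queries the restored host FD (domain, type,
// SO_SNDBUF, non-blocking mode) via initFromOptions. This relies on the host
// providing an equivalent FD across save/restore, as noted on
// HostConnectedEndpoint, and panics if that re-initialization fails.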
func (c *HostConnectedEndpoint) afterLoad(context.Context) { if err := c.initFromOptions(); err != nil { panic(fmt.Sprintf("initFromOptions failed: %v", err)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/stream_queue_receiver_mutex.go000066400000000000000000000035321465435605700334160ustar00rootroot00000000000000package transport import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type streamQueueReceiverMutex struct { mu sync.Mutex } var streamQueueReceiverprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var streamQueueReceiverlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type streamQueueReceiverlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *streamQueueReceiverMutex) Lock() { locking.AddGLock(streamQueueReceiverprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *streamQueueReceiverMutex) NestedLock(i streamQueueReceiverlockNameIndex) { locking.AddGLock(streamQueueReceiverprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *streamQueueReceiverMutex) Unlock() { locking.DelGLock(streamQueueReceiverprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *streamQueueReceiverMutex) NestedUnlock(i streamQueueReceiverlockNameIndex) { locking.DelGLock(streamQueueReceiverprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func streamQueueReceiverinitLockNames() {} func init() { streamQueueReceiverinitLockNames() streamQueueReceiverprefixIndex = locking.NewMutexClass(reflect.TypeOf(streamQueueReceiverMutex{}), streamQueueReceiverlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/transport_message_list.go000066400000000000000000000121001465435605700323730ustar00rootroot00000000000000package transport // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type messageElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (messageElementMapper) linkerFor(elem *message) *message { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type messageList struct { head *message tail *message } // Reset resets list l to the empty state. func (l *messageList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. 
// //go:nosplit func (l *messageList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *messageList) Front() *message { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *messageList) Back() *message { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *messageList) Len() (count int) { for e := l.Front(); e != nil; e = (messageElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *messageList) PushFront(e *message) { linker := messageElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { messageElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *messageList) PushFrontList(m *messageList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { messageElementMapper{}.linkerFor(l.head).SetPrev(m.tail) messageElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *messageList) PushBack(e *message) { linker := messageElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { messageElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *messageList) PushBackList(m *messageList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { messageElementMapper{}.linkerFor(l.tail).SetNext(m.head) messageElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *messageList) InsertAfter(b, e *message) { bLinker := messageElementMapper{}.linkerFor(b) eLinker := messageElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { messageElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *messageList) InsertBefore(a, e *message) { aLinker := messageElementMapper{}.linkerFor(a) eLinker := messageElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { messageElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *messageList) Remove(e *message) { linker := messageElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { messageElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { messageElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type messageEntry struct { next *message prev *message } // Next returns the entry that follows e in the list. // //go:nosplit func (e *messageEntry) Next() *message { return e.next } // Prev returns the entry that precedes e in the list. 
// //go:nosplit func (e *messageEntry) Prev() *message { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *messageEntry) SetNext(elem *message) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *messageEntry) SetPrev(elem *message) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/transport_state_autogen.go000066400000000000000000000307211465435605700325670ustar00rootroot00000000000000// automatically generated by stateify. package transport import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *connectionedEndpoint) StateTypeName() string { return "pkg/sentry/socket/unix/transport.connectionedEndpoint" } func (e *connectionedEndpoint) StateFields() []string { return []string{ "baseEndpoint", "id", "idGenerator", "stype", "acceptedChan", "boundSocketFD", } } // +checklocksignore func (e *connectionedEndpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() var acceptedChanValue []*connectionedEndpoint acceptedChanValue = e.saveAcceptedChan() stateSinkObject.SaveValue(4, acceptedChanValue) stateSinkObject.Save(0, &e.baseEndpoint) stateSinkObject.Save(1, &e.id) stateSinkObject.Save(2, &e.idGenerator) stateSinkObject.Save(3, &e.stype) stateSinkObject.Save(5, &e.boundSocketFD) } // +checklocksignore func (e *connectionedEndpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.baseEndpoint) stateSourceObject.Load(1, &e.id) stateSourceObject.Load(2, &e.idGenerator) stateSourceObject.Load(3, &e.stype) stateSourceObject.Load(5, &e.boundSocketFD) stateSourceObject.LoadValue(4, new([]*connectionedEndpoint), func(y any) { e.loadAcceptedChan(ctx, y.([]*connectionedEndpoint)) }) stateSourceObject.AfterLoad(func() { e.afterLoad(ctx) }) } func (e *connectionlessEndpoint) StateTypeName() string { return "pkg/sentry/socket/unix/transport.connectionlessEndpoint" } func (e *connectionlessEndpoint) StateFields() []string { return []string{ "baseEndpoint", } } func (e *connectionlessEndpoint) beforeSave() {} // +checklocksignore func (e *connectionlessEndpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.baseEndpoint) } // +checklocksignore func (e *connectionlessEndpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.baseEndpoint) stateSourceObject.AfterLoad(func() { e.afterLoad(ctx) }) } func (c *HostConnectedEndpoint) StateTypeName() string { return "pkg/sentry/socket/unix/transport.HostConnectedEndpoint" } func (c *HostConnectedEndpoint) StateFields() []string { return []string{ "HostConnectedEndpointRefs", "fd", "addr", "stype", "rdShutdown", "wrShutdown", } } func (c *HostConnectedEndpoint) beforeSave() {} // +checklocksignore func (c *HostConnectedEndpoint) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.HostConnectedEndpointRefs) stateSinkObject.Save(1, &c.fd) stateSinkObject.Save(2, &c.addr) stateSinkObject.Save(3, &c.stype) stateSinkObject.Save(4, &c.rdShutdown) stateSinkObject.Save(5, &c.wrShutdown) } // +checklocksignore func (c *HostConnectedEndpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.HostConnectedEndpointRefs) stateSourceObject.Load(1, &c.fd) stateSourceObject.Load(2, &c.addr) stateSourceObject.Load(3, &c.stype) stateSourceObject.Load(4, &c.rdShutdown) stateSourceObject.Load(5, &c.wrShutdown) 
stateSourceObject.AfterLoad(func() { c.afterLoad(ctx) }) } func (r *HostConnectedEndpointRefs) StateTypeName() string { return "pkg/sentry/socket/unix/transport.HostConnectedEndpointRefs" } func (r *HostConnectedEndpointRefs) StateFields() []string { return []string{ "refCount", } } func (r *HostConnectedEndpointRefs) beforeSave() {} // +checklocksignore func (r *HostConnectedEndpointRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *HostConnectedEndpointRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (q *queue) StateTypeName() string { return "pkg/sentry/socket/unix/transport.queue" } func (q *queue) StateFields() []string { return []string{ "queueRefs", "ReaderQueue", "WriterQueue", "closed", "unread", "used", "limit", "dataList", } } func (q *queue) beforeSave() {} // +checklocksignore func (q *queue) StateSave(stateSinkObject state.Sink) { q.beforeSave() stateSinkObject.Save(0, &q.queueRefs) stateSinkObject.Save(1, &q.ReaderQueue) stateSinkObject.Save(2, &q.WriterQueue) stateSinkObject.Save(3, &q.closed) stateSinkObject.Save(4, &q.unread) stateSinkObject.Save(5, &q.used) stateSinkObject.Save(6, &q.limit) stateSinkObject.Save(7, &q.dataList) } func (q *queue) afterLoad(context.Context) {} // +checklocksignore func (q *queue) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &q.queueRefs) stateSourceObject.Load(1, &q.ReaderQueue) stateSourceObject.Load(2, &q.WriterQueue) stateSourceObject.Load(3, &q.closed) stateSourceObject.Load(4, &q.unread) stateSourceObject.Load(5, &q.used) stateSourceObject.Load(6, &q.limit) stateSourceObject.Load(7, &q.dataList) } func (r *queueRefs) StateTypeName() string { return "pkg/sentry/socket/unix/transport.queueRefs" } func (r *queueRefs) StateFields() []string { return []string{ "refCount", } } func (r *queueRefs) beforeSave() {} // +checklocksignore func (r *queueRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *queueRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (l *messageList) StateTypeName() string { return "pkg/sentry/socket/unix/transport.messageList" } func (l *messageList) StateFields() []string { return []string{ "head", "tail", } } func (l *messageList) beforeSave() {} // +checklocksignore func (l *messageList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *messageList) afterLoad(context.Context) {} // +checklocksignore func (l *messageList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *messageEntry) StateTypeName() string { return "pkg/sentry/socket/unix/transport.messageEntry" } func (e *messageEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *messageEntry) beforeSave() {} // +checklocksignore func (e *messageEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *messageEntry) afterLoad(context.Context) {} // +checklocksignore func (e *messageEntry) StateLoad(ctx context.Context, stateSourceObject 
state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (c *ControlMessages) StateTypeName() string { return "pkg/sentry/socket/unix/transport.ControlMessages" } func (c *ControlMessages) StateFields() []string { return []string{ "Rights", "Credentials", } } func (c *ControlMessages) beforeSave() {} // +checklocksignore func (c *ControlMessages) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.Rights) stateSinkObject.Save(1, &c.Credentials) } func (c *ControlMessages) afterLoad(context.Context) {} // +checklocksignore func (c *ControlMessages) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.Rights) stateSourceObject.Load(1, &c.Credentials) } func (m *message) StateTypeName() string { return "pkg/sentry/socket/unix/transport.message" } func (m *message) StateFields() []string { return []string{ "messageEntry", "Data", "Control", "Address", } } func (m *message) beforeSave() {} // +checklocksignore func (m *message) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.messageEntry) stateSinkObject.Save(1, &m.Data) stateSinkObject.Save(2, &m.Control) stateSinkObject.Save(3, &m.Address) } func (m *message) afterLoad(context.Context) {} // +checklocksignore func (m *message) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.messageEntry) stateSourceObject.Load(1, &m.Data) stateSourceObject.Load(2, &m.Control) stateSourceObject.Load(3, &m.Address) } func (a *Address) StateTypeName() string { return "pkg/sentry/socket/unix/transport.Address" } func (a *Address) StateFields() []string { return []string{ "Addr", } } func (a *Address) beforeSave() {} // +checklocksignore func (a *Address) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.Addr) } func (a *Address) afterLoad(context.Context) {} // +checklocksignore func (a *Address) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.Addr) } func (q *queueReceiver) StateTypeName() string { return "pkg/sentry/socket/unix/transport.queueReceiver" } func (q *queueReceiver) StateFields() []string { return []string{ "readQueue", } } func (q *queueReceiver) beforeSave() {} // +checklocksignore func (q *queueReceiver) StateSave(stateSinkObject state.Sink) { q.beforeSave() stateSinkObject.Save(0, &q.readQueue) } func (q *queueReceiver) afterLoad(context.Context) {} // +checklocksignore func (q *queueReceiver) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &q.readQueue) } func (q *streamQueueReceiver) StateTypeName() string { return "pkg/sentry/socket/unix/transport.streamQueueReceiver" } func (q *streamQueueReceiver) StateFields() []string { return []string{ "queueReceiver", "buffer", "control", "addr", } } func (q *streamQueueReceiver) beforeSave() {} // +checklocksignore func (q *streamQueueReceiver) StateSave(stateSinkObject state.Sink) { q.beforeSave() stateSinkObject.Save(0, &q.queueReceiver) stateSinkObject.Save(1, &q.buffer) stateSinkObject.Save(2, &q.control) stateSinkObject.Save(3, &q.addr) } func (q *streamQueueReceiver) afterLoad(context.Context) {} // +checklocksignore func (q *streamQueueReceiver) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &q.queueReceiver) stateSourceObject.Load(1, &q.buffer) stateSourceObject.Load(2, &q.control) stateSourceObject.Load(3, &q.addr) } func (e *connectedEndpoint) 
StateTypeName() string { return "pkg/sentry/socket/unix/transport.connectedEndpoint" } func (e *connectedEndpoint) StateFields() []string { return []string{ "endpoint", "writeQueue", } } func (e *connectedEndpoint) beforeSave() {} // +checklocksignore func (e *connectedEndpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.endpoint) stateSinkObject.Save(1, &e.writeQueue) } func (e *connectedEndpoint) afterLoad(context.Context) {} // +checklocksignore func (e *connectedEndpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.endpoint) stateSourceObject.Load(1, &e.writeQueue) } func (e *baseEndpoint) StateTypeName() string { return "pkg/sentry/socket/unix/transport.baseEndpoint" } func (e *baseEndpoint) StateFields() []string { return []string{ "Queue", "DefaultSocketOptionsHandler", "receiver", "connected", "path", "ops", } } func (e *baseEndpoint) beforeSave() {} // +checklocksignore func (e *baseEndpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.Queue) stateSinkObject.Save(1, &e.DefaultSocketOptionsHandler) stateSinkObject.Save(2, &e.receiver) stateSinkObject.Save(3, &e.connected) stateSinkObject.Save(4, &e.path) stateSinkObject.Save(5, &e.ops) } func (e *baseEndpoint) afterLoad(context.Context) {} // +checklocksignore func (e *baseEndpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.Queue) stateSourceObject.Load(1, &e.DefaultSocketOptionsHandler) stateSourceObject.Load(2, &e.receiver) stateSourceObject.Load(3, &e.connected) stateSourceObject.Load(4, &e.path) stateSourceObject.Load(5, &e.ops) } func init() { state.Register((*connectionedEndpoint)(nil)) state.Register((*connectionlessEndpoint)(nil)) state.Register((*HostConnectedEndpoint)(nil)) state.Register((*HostConnectedEndpointRefs)(nil)) state.Register((*queue)(nil)) state.Register((*queueRefs)(nil)) state.Register((*messageList)(nil)) state.Register((*messageEntry)(nil)) state.Register((*ControlMessages)(nil)) state.Register((*message)(nil)) state.Register((*Address)(nil)) state.Register((*queueReceiver)(nil)) state.Register((*streamQueueReceiver)(nil)) state.Register((*connectedEndpoint)(nil)) state.Register((*baseEndpoint)(nil)) } transport_unsafe_state_autogen.go000066400000000000000000000000731465435605700340460ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport// automatically generated by stateify. package transport golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/transport/unix.go000066400000000000000000001016001465435605700265670ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package transport contains the implementation of Unix endpoints. 
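//
// The central pieces defined below are the Endpoint interface (sendmsg,
// recvmsg, connect, listen, accept and friends), the Receiver and
// ConnectedEndpoint halves that move messages over an in-sentry queue, and
// send/receive buffer limits that mirror Linux's net.core.{w,r}mem defaults.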
package transport import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/waiter" ) const ( // The minimum size of the send/receive buffers. minimumBufferSize = 4 << 10 // 4 KiB (match default in linux) // The default size of the send/receive buffers. defaultBufferSize = 208 << 10 // 208 KiB (default in linux for net.core.wmem_default) // The maximum permitted size for the send/receive buffers. maxBufferSize = 4 << 20 // 4 MiB 4 MiB (default in linux for net.core.wmem_max) ) // A RightsControlMessage is a control message containing FDs. // // +stateify savable type RightsControlMessage interface { // Clone returns a copy of the RightsControlMessage. Clone() RightsControlMessage // Release releases any resources owned by the RightsControlMessage. Release(ctx context.Context) } // A CredentialsControlMessage is a control message containing Unix credentials. type CredentialsControlMessage interface { // Equals returns true iff the two messages are equal. Equals(CredentialsControlMessage) bool } // A ControlMessages represents a collection of socket control messages. // // +stateify savable type ControlMessages struct { // Rights is a control message containing FDs. Rights RightsControlMessage // Credentials is a control message containing Unix credentials. Credentials CredentialsControlMessage } // Empty returns true iff the ControlMessages does not contain either // credentials or rights. func (c *ControlMessages) Empty() bool { return c.Rights == nil && c.Credentials == nil } // Clone clones both the credentials and the rights. func (c *ControlMessages) Clone() ControlMessages { cm := ControlMessages{} if c.Rights != nil { cm.Rights = c.Rights.Clone() } cm.Credentials = c.Credentials return cm } // Release releases both the credentials and the rights. func (c *ControlMessages) Release(ctx context.Context) { if c.Rights != nil { c.Rights.Release(ctx) } *c = ControlMessages{} } // RecvArgs are the arguments to Endpoint.RecvMsg and Receiver.Recv. type RecvArgs struct { // Creds indicates if credential control messages are requested by the // caller. This is useful for determining if control messages can be // coalesced. Creds is a hint and can be safely ignored by the // implementation if no coalescing is possible. It is fine to return // credential control messages when none were requested or to not // return credential control messages when they were requested. Creds bool // NumRights is the number of SCM_RIGHTS FDs requested by the caller. // This is useful if one must allocate a buffer to receive a SCM_RIGHTS // message or determine if control messages can be coalesced. numRights // is a hint and can be safely ignored by the implementation if the // number of available SCM_RIGHTS FDs is known and no coalescing is // possible. It is fine for the returned number of SCM_RIGHTS FDs to be // either higher or lower than the requested number. NumRights int // If Peek is true, no data should be consumed from the Endpoint. Any and // all data returned from a peek should be available in the next call to // Recv or RecvMsg. Peek bool } // RecvOutput is the output from Endpoint.RecvMsg and Receiver.Recv. type RecvOutput struct { // RecvLen is the number of bytes copied into RecvArgs.Data. RecvLen int64 // MsgLen is the length of the read message consumed for datagram Endpoints. // MsgLen is always the same as RecvLen for stream Endpoints. 
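//
// For example, a 100-byte datagram received into a 10-byte buffer yields
// RecvLen == 10 and MsgLen == 100, while a stream endpoint reading the same
// data reports both as 10.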
MsgLen int64 // Source is the source address we received from. Source Address // Control is the ControlMessages read. Control ControlMessages // ControlTrunc indicates that the NumRights hint was used to receive // fewer than the total available SCM_RIGHTS FDs. Additional truncation // may be required by the caller. ControlTrunc bool // UnusedRights is a slice of unused RightsControlMessage which should // be Release()d. UnusedRights []RightsControlMessage } // Endpoint is the interface implemented by Unix transport protocol // implementations that expose functionality like sendmsg, recvmsg, connect, // etc. to Unix socket implementations. type Endpoint interface { Credentialer waiter.Waitable // Close puts the endpoint in a closed state and frees all resources // associated with it. Close(ctx context.Context) // RecvMsg reads data and a control message from the endpoint. This method // does not block if there is no data pending. // // The returned callback should be called if not nil. RecvMsg(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, func(), *syserr.Error) // SendMsg writes data and a control message to the endpoint's peer. // This method does not block if the data cannot be written. // // SendMsg does not take ownership of any of its arguments on error. // // If set, notify is a callback that should be called after RecvMesg // completes without mm.activeMu held. SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (int64, func(), *syserr.Error) // Connect connects this endpoint directly to another. // // This should be called on the client endpoint, and the (bound) // endpoint passed in as a parameter. // // The error codes are the same as Connect. Connect(ctx context.Context, server BoundEndpoint) *syserr.Error // Shutdown closes the read and/or write end of the endpoint connection // to its peer. Shutdown(flags tcpip.ShutdownFlags) *syserr.Error // Listen puts the endpoint in "listen" mode, which allows it to accept // new connections. Listen(ctx context.Context, backlog int) *syserr.Error // Accept returns a new endpoint if a peer has established a connection // to an endpoint previously set to listen mode. This method does not // block if no new connections are available. // // The returned Queue is the wait queue for the newly created endpoint. // // peerAddr if not nil will be populated with the address of the connected // peer on a successful accept. Accept(ctx context.Context, peerAddr *Address) (Endpoint, *syserr.Error) // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. Bind(address Address) *syserr.Error // Type return the socket type, typically either SockStream, SockDgram // or SockSeqpacket. Type() linux.SockType // GetLocalAddress returns the address to which the endpoint is bound. GetLocalAddress() (Address, tcpip.Error) // GetRemoteAddress returns the address to which the endpoint is // connected. GetRemoteAddress() (Address, tcpip.Error) // SetSockOpt sets a socket option. SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error // SetSockOptInt sets a socket option for simple cases when a value has // the int type. SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error // GetSockOpt gets a socket option. GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error // GetSockOptInt gets a socket option for simple cases when a return // value has the int type. 
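//
// A minimal caller sketch (illustrative only, assuming an Endpoint value ep
// and that the option is supported):
//
//	if queued, err := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption); err == nil {
//		_ = queued // bytes currently waiting to be read
//	}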
GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) // State returns the current state of the socket, as represented by Linux in // procfs. State() uint32 // LastError clears and returns the last error reported by the endpoint. LastError() tcpip.Error // SocketOptions returns the structure which contains all the socket // level options. SocketOptions() *tcpip.SocketOptions } // A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket // option. type Credentialer interface { // Passcred returns whether or not the SO_PASSCRED socket option is // enabled on this end. Passcred() bool // ConnectedPasscred returns whether or not the SO_PASSCRED socket option // is enabled on the connected end. ConnectedPasscred() bool } // A BoundEndpoint is a unix endpoint that can be connected to. type BoundEndpoint interface { // BidirectionalConnect establishes a bi-directional connection between two // unix endpoints in an all-or-nothing manner. If an error occurs during // connecting, the state of neither endpoint should be modified. // // In order for an endpoint to establish such a bidirectional connection // with a BoundEndpoint, the endpoint calls the BidirectionalConnect method // on the BoundEndpoint and sends a representation of itself (the // ConnectingEndpoint) and a callback (returnConnect) to receive the // connection information (Receiver and ConnectedEndpoint) upon a // successful connect. The callback should only be called on a successful // connect. // // For a connection attempt to be successful, the ConnectingEndpoint must // be unconnected and not listening and the BoundEndpoint whose // BidirectionalConnect method is being called must be listening. // // This method will return syserr.ErrConnectionRefused on endpoints with a // type that isn't SockStream or SockSeqpacket. BidirectionalConnect(ctx context.Context, ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error // UnidirectionalConnect establishes a write-only connection to a unix // endpoint. // // An endpoint which calls UnidirectionalConnect and supports it itself must // not hold its own lock when calling UnidirectionalConnect. // // This method will return syserr.ErrConnectionRefused on a non-SockDgram // endpoint. UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error) // Passcred returns whether or not the SO_PASSCRED socket option is // enabled on this end. Passcred() bool // Release releases any resources held by the BoundEndpoint. It must be // called before dropping all references to a BoundEndpoint returned by a // function. Release(ctx context.Context) } // HostBoundEndpoint is an interface that endpoints can implement if they support // binding listening and accepting connections from a bound Unix domain socket // on the host. type HostBoundEndpoint interface { // SetBoundSocketFD will be called on supporting endpoints after // binding a socket on the host filesystem. Implementations should // delegate Listen and Accept calls to the BoundSocketFD. The ownership // of bsFD is transferred to the endpoint. SetBoundSocketFD(ctx context.Context, bsFD BoundSocketFD) error // ResetBoundSocketFD cleans up the BoundSocketFD set by the last successful // SetBoundSocketFD call. ResetBoundSocketFD(ctx context.Context) } // BoundSocketFD is an interface that wraps a socket FD that was bind(2)-ed. // It allows to listen and accept on that socket. type BoundSocketFD interface { // Close closes the socket FD. 
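//
// An illustrative accept flow over this interface (a sketch only, assuming a
// BoundSocketFD value bsFD and a context ctx):
//
//	if err := bsFD.Listen(ctx, 10); err == nil {
//		if fd, err := bsFD.Accept(ctx); err == nil {
//			_ = fd // host FD of the accepted connection
//		}
//	}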
Close(ctx context.Context) // NotificationFD is a host FD that can be used to notify when new clients // connect to the socket. NotificationFD() int32 // Listen is analogous to listen(2). Listen(ctx context.Context, backlog int32) error // Accept is analogous to accept(2). Accept(ctx context.Context) (int, error) } // message represents a message passed over a Unix domain socket. // // +stateify savable type message struct { messageEntry // Data is the Message payload. Data []byte // Control is auxiliary control message data that goes along with the // data. Control ControlMessages // Address is the bound address of the endpoint that sent the message. // // If the endpoint that sent the message is not bound, the Address is // the empty string. Address Address } // Length returns number of bytes stored in the message. func (m *message) Length() int64 { return int64(len(m.Data)) } // Release releases any resources held by the message. func (m *message) Release(ctx context.Context) { m.Control.Release(ctx) } // Peek returns a copy of the message. func (m *message) Peek() *message { return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address} } // Truncate reduces the length of the message payload to n bytes. // // Preconditions: n <= m.Length(). func (m *message) Truncate(n int64) { m.Data = m.Data[:n] } // A Receiver can be used to receive Messages. type Receiver interface { // Recv receives a single message. This method does not block. // // notify indicates if RecvNotify should be called. Recv(ctx context.Context, data [][]byte, args RecvArgs) (out RecvOutput, notify bool, err *syserr.Error) // RecvNotify notifies the Receiver of a successful Recv. This must not be // called while holding any endpoint locks. RecvNotify() // CloseRecv prevents the receiving of additional Messages. // // After CloseRecv is called, CloseNotify must also be called. CloseRecv() // CloseNotify notifies the Receiver of recv being closed. This must not be // called while holding any endpoint locks. CloseNotify() // IsRecvClosed returns true if reception of additional messages is closed. IsRecvClosed() bool // Readable returns if messages should be attempted to be received. This // includes when read has been shutdown. Readable() bool // RecvQueuedSize returns the total amount of data currently receivable. // RecvQueuedSize should return -1 if the operation isn't supported. RecvQueuedSize() int64 // RecvMaxQueueSize returns maximum value for RecvQueuedSize. // RecvMaxQueueSize should return -1 if the operation isn't supported. RecvMaxQueueSize() int64 // Release releases any resources owned by the Receiver. It should be // called before dropping all references to a Receiver. Release(ctx context.Context) } // Address is a unix socket address. // // +stateify savable type Address struct { Addr string } // queueReceiver implements Receiver for datagram sockets. // // +stateify savable type queueReceiver struct { readQueue *queue } // Recv implements Receiver.Recv. 
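//
// Each call consumes (or, with args.Peek, copies without consuming) at most
// one queued datagram; bytes that do not fit in data are dropped, which shows
// up as out.RecvLen < out.MsgLen. A minimal caller sketch (illustrative only,
// assuming a *queueReceiver q and a context ctx):
//
//	data := [][]byte{make([]byte, 512)}
//	out, notify, err := q.Recv(ctx, data, RecvArgs{})
//	if err == nil && notify {
//		q.RecvNotify()
//	}
//	_ = out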
func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, bool, *syserr.Error) { var m *message var notify bool var err *syserr.Error if args.Peek { m, err = q.readQueue.Peek() } else { m, notify, err = q.readQueue.Dequeue() } if err != nil { return RecvOutput{}, false, err } src := []byte(m.Data) var copied int64 for i := 0; i < len(data) && len(src) > 0; i++ { n := copy(data[i], src) copied += int64(n) src = src[n:] } out := RecvOutput{ RecvLen: copied, MsgLen: int64(len(m.Data)), Control: m.Control, Source: m.Address, } return out, notify, nil } // RecvNotify implements Receiver.RecvNotify. func (q *queueReceiver) RecvNotify() { q.readQueue.WriterQueue.Notify(waiter.WritableEvents) } // CloseNotify implements Receiver.CloseNotify. func (q *queueReceiver) CloseNotify() { q.readQueue.ReaderQueue.Notify(waiter.ReadableEvents) q.readQueue.WriterQueue.Notify(waiter.WritableEvents) } // CloseRecv implements Receiver.CloseRecv. func (q *queueReceiver) CloseRecv() { q.readQueue.Close() } // IsRecvClosed implements Receiver.IsRecvClosed. func (q *queueReceiver) IsRecvClosed() bool { return q.readQueue.isClosed() } // Readable implements Receiver.Readable. func (q *queueReceiver) Readable() bool { return q.readQueue.IsReadable() } // RecvQueuedSize implements Receiver.RecvQueuedSize. func (q *queueReceiver) RecvQueuedSize() int64 { return q.readQueue.QueuedSize() } // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. func (q *queueReceiver) RecvMaxQueueSize() int64 { return q.readQueue.MaxQueueSize() } // Release implements Receiver.Release. func (q *queueReceiver) Release(ctx context.Context) { q.readQueue.DecRef(ctx) } // streamQueueReceiver implements Receiver for stream sockets. // // +stateify savable type streamQueueReceiver struct { queueReceiver mu streamQueueReceiverMutex `state:"nosave"` buffer []byte control ControlMessages addr Address } func vecCopy(data [][]byte, buf []byte) (int64, [][]byte, []byte) { var copied int64 for len(data) > 0 && len(buf) > 0 { n := copy(data[0], buf) copied += int64(n) buf = buf[n:] data[0] = data[0][n:] if len(data[0]) == 0 { data = data[1:] } } return copied, data, buf } // Readable implements Receiver.Readable. func (q *streamQueueReceiver) Readable() bool { q.mu.Lock() bl := len(q.buffer) r := q.readQueue.IsReadable() q.mu.Unlock() // We're readable if we have data in our buffer or if the queue receiver is // readable. return bl > 0 || r } // RecvQueuedSize implements Receiver.RecvQueuedSize. func (q *streamQueueReceiver) RecvQueuedSize() int64 { q.mu.Lock() bl := len(q.buffer) qs := q.readQueue.QueuedSize() q.mu.Unlock() return int64(bl) + qs } // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { // The RecvMaxQueueSize() is the readQueue's MaxQueueSize() plus the largest // message we can buffer which is also the largest message we can receive. return 2 * q.readQueue.MaxQueueSize() } // Recv implements Receiver.Recv. func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, bool, *syserr.Error) { q.mu.Lock() defer q.mu.Unlock() var notify bool // If we have no data in the endpoint, we need to get some. if len(q.buffer) == 0 { // Load the next message into a buffer, even if we are peeking. Peeking // won't consume the message, so it will be still available to be read // the next time Recv() is called. 
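// (The message is moved off the shared queue into q.buffer here; a peek then
// copies out of q.buffer without advancing it, so the same bytes remain
// visible to the next Recv call.)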
m, n, err := q.readQueue.Dequeue() if err != nil { return RecvOutput{}, false, err } notify = n q.buffer = []byte(m.Data) q.control = m.Control q.addr = m.Address } var copied int64 if args.Peek { // Don't consume control message if we are peeking. c := q.control.Clone() // Don't consume data since we are peeking. copied, _, _ = vecCopy(data, q.buffer) out := RecvOutput{ RecvLen: copied, MsgLen: copied, Control: c, Source: q.addr, } return out, notify, nil } // Consume data and control message since we are not peeking. copied, data, q.buffer = vecCopy(data, q.buffer) // Save the original state of q.control. c := q.control // Remove rights from q.control and leave behind just the creds. q.control.Rights = nil if !args.Creds { c.Credentials = nil } var out RecvOutput if c.Rights != nil && args.NumRights == 0 { // We won't use these rights. out.UnusedRights = append(out.UnusedRights, c.Rights) c.Rights = nil out.ControlTrunc = true } haveRights := c.Rights != nil // If we have more capacity for data and haven't received any usable // rights. // // Linux never coalesces rights control messages. for !haveRights && len(data) > 0 { // Get a message from the readQueue. m, n, err := q.readQueue.Dequeue() if err != nil { // We already got some data, so ignore this error. This will // manifest as a short read to the user, which is what Linux // does. break } notify = notify || n q.buffer = []byte(m.Data) q.control = m.Control q.addr = m.Address if args.Creds { if (q.control.Credentials == nil) != (c.Credentials == nil) { // One message has credentials, the other does not. break } if q.control.Credentials != nil && c.Credentials != nil && !q.control.Credentials.Equals(c.Credentials) { // Both messages have credentials, but they don't match. break } } if args.NumRights != 0 && c.Rights != nil && q.control.Rights != nil { // Both messages have rights. break } var cpd int64 cpd, data, q.buffer = vecCopy(data, q.buffer) copied += cpd if cpd == 0 { // data was actually full. break } if q.control.Rights != nil { // Consume rights. if args.NumRights == 0 { out.ControlTrunc = true out.UnusedRights = append(out.UnusedRights, q.control.Rights) } else { c.Rights = q.control.Rights haveRights = true } q.control.Rights = nil } } out.MsgLen = copied out.RecvLen = copied out.Source = q.addr out.Control = c return out, notify, nil } // Release implements Receiver.Release. func (q *streamQueueReceiver) Release(ctx context.Context) { q.queueReceiver.Release(ctx) q.control.Release(ctx) } // A ConnectedEndpoint is an Endpoint that can be used to send Messages. type ConnectedEndpoint interface { // Passcred implements Endpoint.Passcred. Passcred() bool // GetLocalAddress implements Endpoint.GetLocalAddress. GetLocalAddress() (Address, tcpip.Error) // Send sends a single message. This method does not block. // // notify indicates if SendNotify should be called. // // syserr.ErrWouldBlock can be returned along with a partial write if // the caller should block to send the rest of the data. Send(ctx context.Context, data [][]byte, c ControlMessages, from Address) (n int64, notify bool, err *syserr.Error) // SendNotify notifies the ConnectedEndpoint of a successful Send. This // must not be called while holding any endpoint locks. SendNotify() // CloseSend prevents the sending of additional Messages. // // After CloseSend is call, CloseNotify must also be called. CloseSend() // CloseNotify notifies the ConnectedEndpoint of send being closed. This // must not be called while holding any endpoint locks. 
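//
// A typical send pattern (an illustrative sketch, assuming ce, ctx and data
// are in scope and that no endpoint locks are held at the notify step):
//
//	n, notify, err := ce.Send(ctx, data, ControlMessages{}, Address{})
//	if notify {
//		ce.SendNotify() // only after endpoint locks are released
//	}
//	_, _ = n, err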
CloseNotify() // IsSendClosed returns true if transmission of additional messages is closed. IsSendClosed() bool // Writable returns if messages should be attempted to be sent. This // includes when write has been shutdown. Writable() bool // EventUpdate lets the ConnectedEndpoint know that event registrations // have changed. EventUpdate() error // SendQueuedSize returns the total amount of data currently queued for // sending. SendQueuedSize should return -1 if the operation isn't // supported. SendQueuedSize() int64 // SendMaxQueueSize returns maximum value for SendQueuedSize. // SendMaxQueueSize should return -1 if the operation isn't supported. SendMaxQueueSize() int64 // Release releases any resources owned by the ConnectedEndpoint. It should // be called before dropping all references to a ConnectedEndpoint. Release(ctx context.Context) // CloseUnread sets the fact that this end is closed with unread data to // the peer socket. CloseUnread() // SetSendBufferSize is called when the endpoint's send buffer size is // changed. SetSendBufferSize(v int64) (newSz int64) } // +stateify savable type connectedEndpoint struct { // endpoint represents the subset of the Endpoint functionality needed by // the connectedEndpoint. It is implemented by both connectionedEndpoint // and connectionlessEndpoint and allows the use of types which don't // fully implement Endpoint. endpoint interface { // Passcred implements Endpoint.Passcred. Passcred() bool // GetLocalAddress implements Endpoint.GetLocalAddress. GetLocalAddress() (Address, tcpip.Error) // Type implements Endpoint.Type. Type() linux.SockType } writeQueue *queue } // Passcred implements ConnectedEndpoint.Passcred. func (e *connectedEndpoint) Passcred() bool { return e.endpoint.Passcred() } // GetLocalAddress implements ConnectedEndpoint.GetLocalAddress. func (e *connectedEndpoint) GetLocalAddress() (Address, tcpip.Error) { return e.endpoint.GetLocalAddress() } // Send implements ConnectedEndpoint.Send. func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from Address) (int64, bool, *syserr.Error) { discardEmpty := false truncate := false if e.endpoint.Type() == linux.SOCK_STREAM { // Discard empty stream packets. Since stream sockets don't // preserve message boundaries, sending zero bytes is a no-op. // In Linux, the receiver actually uses a zero-length receive // as an indication that the stream was closed. discardEmpty = true // Since stream sockets don't preserve message boundaries, we // can write only as much of the message as fits in the queue. truncate = true } return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate) } // SendNotify implements ConnectedEndpoint.SendNotify. func (e *connectedEndpoint) SendNotify() { e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents) } // CloseNotify implements ConnectedEndpoint.CloseNotify. func (e *connectedEndpoint) CloseNotify() { e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents) e.writeQueue.WriterQueue.Notify(waiter.WritableEvents) } // CloseSend implements ConnectedEndpoint.CloseSend. func (e *connectedEndpoint) CloseSend() { e.writeQueue.Close() } // IsSendClosed implements ConnectedEndpoint.IsSendClosed. func (e *connectedEndpoint) IsSendClosed() bool { return e.writeQueue.isClosed() } // Writable implements ConnectedEndpoint.Writable. func (e *connectedEndpoint) Writable() bool { return e.writeQueue.IsWritable() } // EventUpdate implements ConnectedEndpoint.EventUpdate. 
func (*connectedEndpoint) EventUpdate() error { return nil } // SendQueuedSize implements ConnectedEndpoint.SendQueuedSize. func (e *connectedEndpoint) SendQueuedSize() int64 { return e.writeQueue.QueuedSize() } // SendMaxQueueSize implements ConnectedEndpoint.SendMaxQueueSize. func (e *connectedEndpoint) SendMaxQueueSize() int64 { return e.writeQueue.MaxQueueSize() } // Release implements ConnectedEndpoint.Release. func (e *connectedEndpoint) Release(ctx context.Context) { e.writeQueue.DecRef(ctx) } // CloseUnread implements ConnectedEndpoint.CloseUnread. func (e *connectedEndpoint) CloseUnread() { e.writeQueue.CloseUnread() } // SetSendBufferSize implements ConnectedEndpoint.SetSendBufferSize. // SetSendBufferSize sets the send buffer size for the write queue to the // specified value. func (e *connectedEndpoint) SetSendBufferSize(v int64) (newSz int64) { e.writeQueue.SetMaxQueueSize(v) return v } // baseEndpoint is an embeddable unix endpoint base used in both the connected // and connectionless unix domain socket Endpoint implementations. // // Not to be used on its own. // // +stateify savable type baseEndpoint struct { *waiter.Queue tcpip.DefaultSocketOptionsHandler // Mutex protects the below fields. // // See the lock ordering comment in package kernel/epoll regarding when // this lock can safely be held. endpointMutex `state:"nosave"` // receiver allows Messages to be received. receiver Receiver // connected allows messages to be sent and state information about the // connected endpoint to be read. connected ConnectedEndpoint // path is not empty if the endpoint has been bound, // or may be used if the endpoint is connected. path string // ops is used to get socket level options. ops tcpip.SocketOptions } // EventRegister implements waiter.Waitable.EventRegister. func (e *baseEndpoint) EventRegister(we *waiter.Entry) error { e.Queue.EventRegister(we) e.Lock() c := e.connected e.Unlock() if c != nil { if err := c.EventUpdate(); err != nil { return err } } return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (e *baseEndpoint) EventUnregister(we *waiter.Entry) { e.Queue.EventUnregister(we) e.Lock() c := e.connected e.Unlock() if c != nil { c.EventUpdate() } } // Passcred implements Credentialer.Passcred. func (e *baseEndpoint) Passcred() bool { return e.SocketOptions().GetPassCred() } // ConnectedPasscred implements Credentialer.ConnectedPasscred. func (e *baseEndpoint) ConnectedPasscred() bool { e.Lock() defer e.Unlock() return e.connected != nil && e.connected.Passcred() } // Connected implements ConnectingEndpoint.Connected. // // Preconditions: e.mu must be held. func (e *baseEndpoint) Connected() bool { return e.receiver != nil && e.connected != nil } // RecvMsg reads data and a control message from the endpoint. func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, func(), *syserr.Error) { e.Lock() receiver := e.receiver e.Unlock() if receiver == nil { return RecvOutput{}, nil, syserr.ErrNotConnected } out, notify, err := receiver.Recv(ctx, data, args) if err != nil { return RecvOutput{}, nil, err } if notify { return out, receiver.RecvNotify, nil } return out, nil, nil } // SendMsg writes data and a control message to the endpoint's peer. // This method does not block if the data cannot be written. 
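//
// A caller-side sketch (illustrative only, assuming a connected *baseEndpoint
// e, a context ctx and a payload []byte):
//
//	n, notifyFn, serr := e.SendMsg(ctx, [][]byte{payload}, ControlMessages{}, nil)
//	if notifyFn != nil {
//		notifyFn() // invoked without endpoint locks held
//	}
//	_, _ = n, serr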
func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, func(), *syserr.Error) { e.Lock() if !e.Connected() { e.Unlock() return 0, nil, syserr.ErrNotConnected } if to != nil { e.Unlock() return 0, nil, syserr.ErrAlreadyConnected } connected := e.connected n, notify, err := connected.Send(ctx, data, c, Address{Addr: e.path}) e.Unlock() var notifyFn func() if notify { notifyFn = connected.SendNotify } return n, notifyFn, err } // SetSockOpt sets a socket option. func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { return nil } func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { log.Warningf("Unsupported socket option: %d", opt) return nil } func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: v := 0 e.Lock() if !e.Connected() { e.Unlock() return -1, &tcpip.ErrNotConnected{} } v = int(e.receiver.RecvQueuedSize()) e.Unlock() if v < 0 { return -1, &tcpip.ErrQueueSizeNotSupported{} } return v, nil case tcpip.SendQueueSizeOption: e.Lock() if !e.Connected() { e.Unlock() return -1, &tcpip.ErrNotConnected{} } v := e.connected.SendQueuedSize() e.Unlock() if v < 0 { return -1, &tcpip.ErrQueueSizeNotSupported{} } return int(v), nil default: log.Warningf("Unsupported socket option: %d", opt) return -1, &tcpip.ErrUnknownProtocolOption{} } } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { log.Warningf("Unsupported socket option: %T", opt) return &tcpip.ErrUnknownProtocolOption{} } // LastError implements Endpoint.LastError. func (*baseEndpoint) LastError() tcpip.Error { return nil } // SocketOptions implements Endpoint.SocketOptions. func (e *baseEndpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } // Shutdown closes the read and/or write end of the endpoint connection to its // peer. func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error { e.Lock() if !e.Connected() { e.Unlock() return syserr.ErrNotConnected } var ( r = e.receiver c = e.connected shutdownRead = flags&tcpip.ShutdownRead != 0 shutdownWrite = flags&tcpip.ShutdownWrite != 0 ) if shutdownRead { r.CloseRecv() } if shutdownWrite { c.CloseSend() } e.Unlock() // Don't hold e.Mutex while calling CloseNotify. if shutdownRead { r.CloseNotify() } if shutdownWrite { c.CloseNotify() } return nil } // GetLocalAddress returns the bound path. func (e *baseEndpoint) GetLocalAddress() (Address, tcpip.Error) { e.Lock() defer e.Unlock() return Address{Addr: e.path}, nil } // GetRemoteAddress returns the local address of the connected endpoint (if // available). func (e *baseEndpoint) GetRemoteAddress() (Address, tcpip.Error) { e.Lock() c := e.connected e.Unlock() if c != nil { return c.GetLocalAddress() } return Address{}, &tcpip.ErrNotConnected{} } // Release implements BoundEndpoint.Release. func (*baseEndpoint) Release(context.Context) { // Binding a baseEndpoint doesn't take a reference. } // stackHandler is just a stub implementation of tcpip.StackHandler to provide // when initializing socketoptions. type stackHandler struct { } // Option implements tcpip.StackHandler. func (h *stackHandler) Option(option any) tcpip.Error { panic("unimplemented") } // TransportProtocolOption implements tcpip.StackHandler. 
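// Like Option above, this is not expected to be reached for AF_UNIX endpoints:
// the unix implementation supplies its own buffer limits through
// getSendBufferLimits and getReceiveBufferLimits below rather than consulting
// a network stack.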
func (h *stackHandler) TransportProtocolOption(proto tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) tcpip.Error { panic("unimplemented") } // getSendBufferLimits implements tcpip.GetSendBufferLimits. // // AF_UNIX sockets buffer sizes are not tied to the networking stack/namespace // in linux but are bound by net.core.(wmem|rmem)_(max|default). // // In gVisor net.core sysctls today are not exposed or if exposed are currently // tied to the networking stack in use. This makes it complicated for AF_UNIX // when we are in a new namespace w/ no networking stack. As a result for now we // define default/max values here in the unix socket implementation itself. func getSendBufferLimits(tcpip.StackHandler) tcpip.SendBufferSizeOption { return tcpip.SendBufferSizeOption{ Min: minimumBufferSize, Default: defaultBufferSize, Max: maxBufferSize, } } // getReceiveBufferLimits implements tcpip.GetReceiveBufferLimits. // // We define min, max and default values for unix socket implementation. Unix // sockets do not use receive buffer. func getReceiveBufferLimits(tcpip.StackHandler) tcpip.ReceiveBufferSizeOption { return tcpip.ReceiveBufferSizeOption{ Min: minimumBufferSize, Default: defaultBufferSize, Max: maxBufferSize, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/unix.go000066400000000000000000000577441465435605700245560ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package unix provides an implementation of the socket.Socket interface for // the AF_UNIX protocol family. package unix import ( "bytes" "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // Socket implements socket.Socket (and by extension, // vfs.FileDescriptionImpl) for Unix sockets. // // +stateify savable type Socket struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.LockFD socket.SendReceiveTimeout socketRefs namespace *inet.Namespace ep transport.Endpoint stype linux.SockType // abstractName and abstractNamespace indicate the name and namespace of the // socket if it is bound to an abstract socket namespace. Once the socket is // bound, they cannot be modified. 
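// (Illustratively: an abstract address is one whose sockaddr path begins with
// a NUL byte, e.g. "\x00example"; abstractName then holds the bytes after
// that leading NUL, as set in Bind below.)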
abstractName string abstractBound bool } var _ = socket.Socket(&Socket{}) // NewSockfsFile creates a new socket file in the global sockfs mount and // returns a corresponding file description. func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { mnt := t.Kernel().SocketMount() d := sockfs.NewDentry(t, mnt) defer d.DecRef(t) ns := t.GetNetworkNamespace() fd, err := NewFileDescription(ep, stype, linux.O_RDWR, ns, mnt, d, &vfs.FileLocks{}) if err != nil { ns.DecRef(t) return nil, syserr.FromError(err) } return fd, nil } // NewFileDescription creates and returns a socket file description // corresponding to the given mount and dentry. func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, ns *inet.Namespace, mnt *vfs.Mount, d *vfs.Dentry, locks *vfs.FileLocks) (*vfs.FileDescription, error) { // You can create AF_UNIX, SOCK_RAW sockets. They're the same as // SOCK_DGRAM and don't require CAP_NET_RAW. if stype == linux.SOCK_RAW { stype = linux.SOCK_DGRAM } sock := &Socket{ ep: ep, stype: stype, namespace: ns, } sock.InitRefs() sock.LockFD.Init(locks) vfsfd := &sock.vfsfd if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, UseDentryMetadata: true, }); err != nil { return nil, err } return vfsfd, nil } // DecRef implements RefCounter.DecRef. func (s *Socket) DecRef(ctx context.Context) { s.socketRefs.DecRef(func() { kernel.KernelFromContext(ctx).DeleteSocket(&s.vfsfd) s.ep.Close(ctx) if s.abstractBound { s.namespace.AbstractSockets().Remove(s.abstractName, s) } if s.namespace != nil { s.namespace.DecRef(ctx) } }) } // Release implements vfs.FileDescriptionImpl.Release. func (s *Socket) Release(ctx context.Context) { // Release only decrements a reference on s because s may be referenced in // the abstract socket namespace. s.DecRef(ctx) } // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // a transport.Endpoint. func (s *Socket) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen) } // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accept, it will block until one becomes ready. func (s *Socket) blockingAccept(t *kernel.Task, peerAddr *transport.Address) (transport.Endpoint, *syserr.Error) { // Register for notifications. e, ch := waiter.NewChannelEntry(waiter.ReadableEvents) s.EventRegister(&e) defer s.EventUnregister(&e) // Try to accept the connection; if it fails, then wait until we get a // notification. for { if ep, err := s.ep.Accept(t, peerAddr); err != syserr.ErrWouldBlock { return ep, err } if err := t.Block(ch); err != nil { return nil, syserr.FromError(err) } } } // Accept implements the linux syscall accept(2) for sockets backed by // a transport.Endpoint. 
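//
// Accept first attempts a non-blocking accept on the underlying endpoint and,
// when blocking is requested and no connection is pending, falls back to
// blockingAccept above.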
func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { var peerAddr *transport.Address if peerRequested { peerAddr = &transport.Address{} } ep, err := s.ep.Accept(t, peerAddr) if err != nil { if err != syserr.ErrWouldBlock || !blocking { return 0, nil, 0, err } var err *syserr.Error ep, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } } ns, err := NewSockfsFile(t, ep, s.stype) if err != nil { return 0, nil, 0, err } defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK) } var addr linux.SockAddr var addrLen uint32 if peerAddr != nil { addr, addrLen = convertAddress(*peerAddr) } fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, }) if e != nil { return 0, nil, 0, syserr.FromError(e) } t.Kernel().RecordSocket(ns) return fd, addr, addrLen, nil } // Bind implements the linux syscall bind(2) for unix sockets. func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { p, e := extractPath(sockaddr) if e != nil { return e } bep, ok := s.ep.(transport.BoundEndpoint) if !ok { // This socket can't be bound. return syserr.ErrInvalidArgument } // If path is empty, the socket is autobound to an abstract address. if len(p) == 0 || p[0] == 0 { // Abstract socket. See net/unix/af_unix.c:unix_bind_abstract(). asn := s.namespace.AbstractSockets() p, err := asn.Bind(t, p, bep, s) if err != nil { return err } name := p[1:] if err := s.ep.Bind(transport.Address{Addr: p}); err != nil { asn.Remove(name, s) return err } // The socket has been successfully bound. We can update the following. s.abstractName = name s.abstractBound = true return nil } // See net/unix/af_unix.c:unix_bind_bsd(). path := fspath.Parse(p) root := t.FSContext().RootDirectory() defer root.DecRef(t) start := root relPath := !path.Absolute if relPath { start = t.FSContext().WorkingDirectory() defer start.DecRef(t) } pop := vfs.PathOperation{ Root: root, Start: start, Path: path, } stat, err := s.vfsfd.Stat(t, vfs.StatOptions{Mask: linux.STATX_MODE}) if err != nil { return syserr.FromError(err) } err = t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{ Mode: linux.FileMode(linux.S_IFSOCK | uint(stat.Mode)&^t.FSContext().Umask()), Endpoint: bep, }) if linuxerr.Equals(linuxerr.EEXIST, err) { return syserr.ErrAddressInUse } if err != nil { return syserr.FromError(err) } if err := s.ep.Bind(transport.Address{Addr: p}); err != nil { if unlinkErr := t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &pop); unlinkErr != nil { log.Warningf("failed to unlink socket file created for bind(%q): %v", p, unlinkErr) } return err } return nil } // Ioctl implements vfs.FileDescriptionImpl. func (s *Socket) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { return netstack.Ioctl(ctx, s.ep, uio, sysno, args) } // PRead implements vfs.FileDescriptionImpl. func (s *Socket) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, linuxerr.ESPIPE } // Read implements vfs.FileDescriptionImpl. func (s *Socket) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 
if opts.Flags != 0 { return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { return 0, nil } r := &EndpointReader{ Ctx: ctx, Endpoint: s.ep, NumRights: 0, Peek: false, } n, err := dst.CopyOutFrom(ctx, r) if r.Notify != nil { r.Notify() } // Drop any unused rights messages. for _, rm := range r.UnusedRights { rm.Release(ctx) } // Drop control messages. r.Control.Release(ctx) return n, err } // PWrite implements vfs.FileDescriptionImpl. func (s *Socket) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return 0, linuxerr.ESPIPE } // Write implements vfs.FileDescriptionImpl. func (s *Socket) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, linuxerr.EOPNOTSUPP } t := kernel.TaskFromContext(ctx) ctrl := control.New(t, s.ep) if src.NumBytes() == 0 { nInt, notify, err := s.ep.SendMsg(ctx, [][]byte{}, ctrl, nil) if notify != nil { notify() } return int64(nInt), err.ToError() } w := &EndpointWriter{ Ctx: ctx, Endpoint: s.ep, Control: ctrl, To: nil, } n, err := src.CopyInTo(ctx, w) if w.Notify != nil { w.Notify() } return n, err } // Epollable implements FileDescriptionImpl.Epollable. func (s *Socket) Epollable() bool { return true } // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by // a transport.Endpoint. func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) } // provider is a unix domain socket provider. type provider struct{} func (*provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { // Check arguments. if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { return nil, syserr.ErrProtocolNotSupported } // Create the endpoint and socket. var ep transport.Endpoint switch stype { case linux.SOCK_DGRAM, linux.SOCK_RAW: ep = transport.NewConnectionless(t) case linux.SOCK_SEQPACKET, linux.SOCK_STREAM: ep = transport.NewConnectioned(t, stype, t.Kernel()) default: return nil, syserr.ErrInvalidArgument } f, err := NewSockfsFile(t, ep, stype) if err != nil { ep.Close(t) return nil, err } return f, nil } // Pair creates a new pair of AF_UNIX connected sockets. func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { // Check arguments. if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { return nil, nil, syserr.ErrProtocolNotSupported } switch stype { case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW: // Ok default: return nil, nil, syserr.ErrInvalidArgument } // Create the endpoints and sockets. ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) s1, err := NewSockfsFile(t, ep1, stype) if err != nil { ep1.Close(t) ep2.Close(t) return nil, nil, err } s2, err := NewSockfsFile(t, ep2, stype) if err != nil { s1.DecRef(t) ep2.Close(t) return nil, nil, err } return s1, s2, nil } func (s *Socket) isPacket() bool { switch s.stype { case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: return true case linux.SOCK_STREAM: return false default: // We shouldn't have allowed any other socket types during creation. panic(fmt.Sprintf("Invalid socket type %d", s.stype)) } } // Endpoint extracts the transport.Endpoint. 
func (s *Socket) Endpoint() transport.Endpoint { return s.ep } // extractPath extracts and validates the address. func extractPath(sockaddr []byte) (string, *syserr.Error) { addr, family, err := addressAndFamily(sockaddr) if err != nil { if err == syserr.ErrAddressFamilyNotSupported { err = syserr.ErrInvalidArgument } return "", err } if family != linux.AF_UNIX { return "", syserr.ErrInvalidArgument } // The address is trimmed by GetAddress. p := addr.Addr if len(p) > 0 && p[len(p)-1] == '/' { // Weird, they tried to bind '/a/b/c/'? return "", syserr.ErrIsDir } return p, nil } func addressAndFamily(addr []byte) (transport.Address, uint16, *syserr.Error) { // Make sure we have at least 2 bytes for the address family. if len(addr) < 2 { return transport.Address{}, 0, syserr.ErrInvalidArgument } // Get the rest of the fields based on the address family. switch family := hostarch.ByteOrder.Uint16(addr); family { case linux.AF_UNIX: path := addr[2:] if len(path) > linux.UnixPathMax { return transport.Address{}, family, syserr.ErrInvalidArgument } // Drop the terminating NUL (if one exists) and everything after // it for filesystem (non-abstract) addresses. if len(path) > 0 && path[0] != 0 { if n := bytes.IndexByte(path[1:], 0); n >= 0 { path = path[:n+1] } } return transport.Address{ Addr: string(path), }, family, nil } return transport.Address{}, 0, syserr.ErrAddressFamilyNotSupported } // GetPeerName implements the linux syscall getpeername(2) for sockets backed by // a transport.Endpoint. func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.ep.GetRemoteAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) } a, l := convertAddress(addr) return a, l, nil } // GetSockName implements the linux syscall getsockname(2) for sockets backed by // a transport.Endpoint. func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.ep.GetLocalAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) } a, l := convertAddress(addr) return a, l, nil } // Listen implements the linux syscall listen(2) for sockets backed by // a transport.Endpoint. func (s *Socket) Listen(t *kernel.Task, backlog int) *syserr.Error { return s.ep.Listen(t, backlog) } // extractEndpoint retrieves the transport.BoundEndpoint associated with a Unix // socket path. The Release must be called on the transport.BoundEndpoint when // the caller is done with it. func (s *Socket) extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, *syserr.Error) { path, err := extractPath(sockaddr) if err != nil { return nil, err } if path == "" { // Not allowed. return nil, syserr.ErrInvalidArgument } // Is it abstract? if path[0] == 0 { ep := s.namespace.AbstractSockets().BoundEndpoint(path[1:]) if ep == nil { // No socket found. return nil, syserr.ErrConnectionRefused } return ep, nil } p := fspath.Parse(path) root := t.FSContext().RootDirectory() start := root relPath := !p.Absolute if relPath { start = t.FSContext().WorkingDirectory() } pop := vfs.PathOperation{ Root: root, Start: start, Path: p, FollowFinalSymlink: true, } ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path}) root.DecRef(t) if relPath { start.DecRef(t) } if e != nil { return nil, syserr.FromError(e) } return ep, nil } // Connect implements the linux syscall connect(2) for unix sockets. 
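//
// The sockaddr is resolved via extractEndpoint above (an abstract-namespace
// lookup or a filesystem BoundEndpointAt walk) before the endpoint-level
// connect is attempted.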
func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { ep, err := s.extractEndpoint(t, sockaddr) if err != nil { return err } defer ep.Release(t) // Connect the server endpoint. err = s.ep.Connect(t, ep) if err == syserr.ErrWrongProtocolForSocket { // Linux for abstract sockets returns ErrConnectionRefused // instead of ErrWrongProtocolForSocket. path, _ := extractPath(sockaddr) if len(path) > 0 && path[0] == 0 { err = syserr.ErrConnectionRefused } } return err } // SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by // a transport.Endpoint. func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { w := EndpointWriter{ Ctx: t, Endpoint: s.ep, Control: controlMessages.Unix, To: nil, } if len(to) > 0 { switch s.stype { case linux.SOCK_SEQPACKET: // to is ignored. case linux.SOCK_STREAM: if s.State() == linux.SS_CONNECTED { return 0, syserr.ErrAlreadyConnected } return 0, syserr.ErrNotSupported default: ep, err := s.extractEndpoint(t, to) if err != nil { return 0, err } defer ep.Release(t) w.To = ep if ep.Passcred() && w.Control.Credentials == nil { w.Control.Credentials = control.MakeCreds(t) } } } n, err := src.CopyInTo(t, &w) if w.Notify != nil { w.Notify() } if err != linuxerr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { return int(n), syserr.FromError(err) } // Only send SCM Rights once (see net/unix/af_unix.c:unix_stream_sendmsg). w.Control.Rights = nil // We'll have to block. Register for notification and keep trying to // send all the data. e, ch := waiter.NewChannelEntry(waiter.WritableEvents) s.EventRegister(&e) defer s.EventUnregister(&e) total := n for { // Shorten src to reflect bytes previously written. src = src.DropFirst64(n) n, err = src.CopyInTo(t, &w) if w.Notify != nil { w.Notify() } total += n if err != linuxerr.ErrWouldBlock { break } if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = linuxerr.ErrWouldBlock } break } } return int(total), syserr.FromError(err) } // Passcred implements transport.Credentialer.Passcred. func (s *Socket) Passcred() bool { return s.ep.Passcred() } // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. func (s *Socket) ConnectedPasscred() bool { return s.ep.ConnectedPasscred() } // Readiness implements waiter.Waitable.Readiness. func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask { return s.ep.Readiness(mask) } // EventRegister implements waiter.Waitable.EventRegister. func (s *Socket) EventRegister(e *waiter.Entry) error { return s.ep.EventRegister(e) } // EventUnregister implements waiter.Waitable.EventUnregister. func (s *Socket) EventUnregister(e *waiter.Entry) { s.ep.EventUnregister(e) } // Shutdown implements the linux syscall shutdown(2) for sockets backed by // a transport.Endpoint. func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error { f, err := netstack.ConvertShutdown(how) if err != nil { return err } // Issue shutdown request. return s.ep.Shutdown(f) } // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // a transport.Endpoint. 
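//
// It honors MSG_PEEK, MSG_TRUNC, MSG_DONTWAIT and MSG_WAITALL, and reports
// control-message truncation through MSG_CTRUNC, as handled in the body below.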
func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 waitAll := flags&linux.MSG_WAITALL != 0 isPacket := s.isPacket() // Calculate the number of FDs for which we have space and if we are // requesting credentials. var wantCreds bool rightsLen := int(controlDataLen) - unix.SizeofCmsghdr if s.Passcred() { // Credentials take priority if they are enabled and there is space. wantCreds = rightsLen > 0 if !wantCreds { msgFlags |= linux.MSG_CTRUNC } credLen := unix.CmsgSpace(unix.SizeofUcred) rightsLen -= credLen } // FDs are 32 bit (4 byte) ints. numRights := rightsLen / 4 if numRights < 0 { numRights = 0 } r := EndpointReader{ Ctx: t, Endpoint: s.ep, Creds: wantCreds, NumRights: numRights, Peek: peek, } doRead := func() (int64, error) { n, err := dst.CopyOutFrom(t, &r) if r.Notify != nil { r.Notify() } return n, err } // Drop any unused rights messages after reading. defer func() { for _, rm := range r.UnusedRights { rm.Release(t) } }() // If MSG_TRUNC is set with a zero byte destination then we still need // to read the message and discard it, or in the case where MSG_PEEK is // set, leave it be. In both cases the full message length must be // returned. if trunc && dst.Addrs.NumBytes() == 0 { doRead = func() (int64, error) { err := r.Truncate() // Always return zero for bytes read since the destination size is // zero. return 0, err } } var total int64 if n, err := doRead(); err != linuxerr.ErrWouldBlock || dontWait { var from linux.SockAddr var fromLen uint32 if senderRequested && len([]byte(r.From.Addr)) != 0 { from, fromLen = convertAddress(r.From) } if r.ControlTrunc { msgFlags |= linux.MSG_CTRUNC } if err != nil || dontWait || !waitAll || isPacket || n >= dst.NumBytes() { if isPacket && n < int64(r.MsgSize) { msgFlags |= linux.MSG_TRUNC } if trunc { n = int64(r.MsgSize) } return int(n), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) } // Don't overwrite any data we received. dst = dst.DropFirst64(n) total += n } // We'll have to block. Register for notification and keep trying to // send all the data. e, ch := waiter.NewChannelEntry(waiter.ReadableEvents) s.EventRegister(&e) defer s.EventUnregister(&e) for { if n, err := doRead(); err != linuxerr.ErrWouldBlock { var from linux.SockAddr var fromLen uint32 if senderRequested { from, fromLen = convertAddress(r.From) } if r.ControlTrunc { msgFlags |= linux.MSG_CTRUNC } if trunc { // n and r.MsgSize are the same for streams. total += int64(r.MsgSize) } else { total += n } streamPeerClosed := s.stype == linux.SOCK_STREAM && n == 0 && err == nil if err != nil || !waitAll || isPacket || n >= dst.NumBytes() || streamPeerClosed { if total > 0 { err = nil } if isPacket && n < int64(r.MsgSize) { msgFlags |= linux.MSG_TRUNC } return int(total), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) } // Don't overwrite any data we received. 
dst = dst.DropFirst64(n) } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if total > 0 { err = nil } if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return int(total), msgFlags, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } return int(total), msgFlags, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } // State implements socket.Socket.State. func (s *Socket) State() uint32 { return s.ep.State() } // Type implements socket.Socket.Type. func (s *Socket) Type() (family int, skType linux.SockType, protocol int) { // Unix domain sockets always have a protocol of 0. return linux.AF_UNIX, s.stype, 0 } func convertAddress(addr transport.Address) (linux.SockAddr, uint32) { var out linux.SockAddrUnix out.Family = linux.AF_UNIX l := len([]byte(addr.Addr)) for i := 0; i < l; i++ { out.Path[i] = int8(addr.Addr[i]) } // Linux returns the used length of the address struct (including the // null terminator) for filesystem paths. The Family field is 2 bytes. // It is sometimes allowed to exclude the null terminator if the // address length is the max. Abstract and empty paths always return // the full exact length. if l == 0 || out.Path[0] == 0 || l == len(out.Path) { return &out, uint32(2 + l) } return &out, uint32(3 + l) } func init() { socket.RegisterProvider(linux.AF_UNIX, &provider{}) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/socket/unix/unix_state_autogen.go000066400000000000000000000044671465435605700274720ustar00rootroot00000000000000// automatically generated by stateify. package unix import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *socketRefs) StateTypeName() string { return "pkg/sentry/socket/unix.socketRefs" } func (r *socketRefs) StateFields() []string { return []string{ "refCount", } } func (r *socketRefs) beforeSave() {} // +checklocksignore func (r *socketRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *socketRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (s *Socket) StateTypeName() string { return "pkg/sentry/socket/unix.Socket" } func (s *Socket) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "LockFD", "SendReceiveTimeout", "socketRefs", "namespace", "ep", "stype", "abstractName", "abstractBound", } } func (s *Socket) beforeSave() {} // +checklocksignore func (s *Socket) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.vfsfd) stateSinkObject.Save(1, &s.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &s.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &s.LockFD) stateSinkObject.Save(4, &s.SendReceiveTimeout) stateSinkObject.Save(5, &s.socketRefs) stateSinkObject.Save(6, &s.namespace) stateSinkObject.Save(7, &s.ep) stateSinkObject.Save(8, &s.stype) stateSinkObject.Save(9, &s.abstractName) stateSinkObject.Save(10, &s.abstractBound) } func (s *Socket) afterLoad(context.Context) {} // +checklocksignore func (s *Socket) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.vfsfd) stateSourceObject.Load(1, &s.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &s.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &s.LockFD) stateSourceObject.Load(4, &s.SendReceiveTimeout) stateSourceObject.Load(5, &s.socketRefs) stateSourceObject.Load(6, &s.namespace) 
stateSourceObject.Load(7, &s.ep) stateSourceObject.Load(8, &s.stype) stateSourceObject.Load(9, &s.abstractName) stateSourceObject.Load(10, &s.abstractBound) } func init() { state.Register((*socketRefs)(nil)) state.Register((*Socket)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/state/000077500000000000000000000000001465435605700220705ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/state/state.go000066400000000000000000000105571465435605700235470ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package state provides high-level state wrappers. package state import ( "fmt" "io" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/state/statefile" ) var previousMetadata map[string]string // ErrStateFile is returned when an error is encountered writing the statefile // (which may occur during open or close calls in addition to write). type ErrStateFile struct { err error } // Error implements error.Error(). func (e ErrStateFile) Error() string { return fmt.Sprintf("statefile error: %v", e.err) } // SaveOpts contains save-related options. type SaveOpts struct { // Destination is the save target. Destination io.Writer // PagesMetadata is the file into which MemoryFile metadata is stored if // PagesMetadata is non-nil. Otherwise this content is stored in Destination. PagesMetadata *fd.FD // PagesFile is the file in which all MemoryFile pages are stored if // PagesFile is non-nil. Otherwise this content is stored in Destination. PagesFile *fd.FD // Key is used for state integrity check. Key []byte // Metadata is save metadata. Metadata map[string]string // MemoryFileSaveOpts is passed to calls to pgalloc.MemoryFile.SaveTo(). MemoryFileSaveOpts pgalloc.SaveOpts // Callback is called prior to unpause, with any save error. Callback func(err error) // Resume indicates if the statefile is used for save-resume. Resume bool } // Save saves the system state. func (opts SaveOpts) Save(ctx context.Context, k *kernel.Kernel, w *watchdog.Watchdog) error { log.Infof("Sandbox save started, pausing all tasks.") k.Pause() k.ReceiveTaskStates() defer func() { k.Unpause() log.Infof("Tasks resumed after save.") }() w.Stop() defer w.Start() // Supplement the metadata. if opts.Metadata == nil { opts.Metadata = make(map[string]string) } addSaveMetadata(opts.Metadata) // Open the statefile. wc, err := statefile.NewWriter(opts.Destination, opts.Key, opts.Metadata) if err != nil { err = ErrStateFile{err} } else { // Save the kernel. err = k.SaveTo(ctx, wc, opts.PagesMetadata, opts.PagesFile, opts.MemoryFileSaveOpts) // ENOSPC is a state file error. 
This error can only come from // writing the state file, and not from fs.FileOperations.Fsync // because we wrap those in kernel.TaskSet.flushWritesToFiles. if linuxerr.Equals(linuxerr.ENOSPC, err) { err = ErrStateFile{err} } if closeErr := wc.Close(); err == nil && closeErr != nil { err = ErrStateFile{closeErr} } } opts.Callback(err) return err } // LoadOpts contains load-related options. type LoadOpts struct { // Source is the load source. Source io.Reader // PagesMetadata is the file into which MemoryFile metadata is stored if // PagesMetadata is non-nil. Otherwise this content is stored in Source. PagesMetadata *fd.FD // PagesFile is the file in which all MemoryFile pages are stored if // PagesFile is non-nil. Otherwise this content is stored in Source. PagesFile *fd.FD // Key is used for state integrity check. Key []byte } // Load loads the given kernel, setting the provided platform and stack. func (opts LoadOpts) Load(ctx context.Context, k *kernel.Kernel, timeReady chan struct{}, n inet.Stack, clocks time.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error { // Open the file. r, m, err := statefile.NewReader(opts.Source, opts.Key) if err != nil { return ErrStateFile{err} } previousMetadata = m // Restore the Kernel object graph. return k.LoadFrom(ctx, r, opts.PagesMetadata, opts.PagesFile, timeReady, n, clocks, vfsOpts) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/state/state_metadata.go000066400000000000000000000023161465435605700254010ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package state import ( "fmt" "time" "gvisor.dev/gvisor/pkg/log" ) // The save metadata keys for timestamp. const ( cpuUsage = "cpu_usage" metadataTimestamp = "timestamp" ) func addSaveMetadata(m map[string]string) { t, err := CPUTime() if err != nil { log.Warningf("Error getting cpu time: %v", err) } if previousMetadata != nil { p, err := time.ParseDuration(previousMetadata[cpuUsage]) if err != nil { log.Warningf("Error parsing previous runs' cpu time: %v", err) } t += p } m[cpuUsage] = t.String() m[metadataTimestamp] = fmt.Sprintf("%v", time.Now()) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/state/state_state_autogen.go000066400000000000000000000001331465435605700264560ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package state golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/state/state_unsafe.go000066400000000000000000000021351465435605700251010ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package state import ( "fmt" "time" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) // CPUTime returns the CPU time usage by Sentry and app. func CPUTime() (time.Duration, error) { var ts unix.Timespec _, _, errno := unix.RawSyscall(unix.SYS_CLOCK_GETTIME, uintptr(linux.CLOCK_PROCESS_CPUTIME_ID), uintptr(unsafe.Pointer(&ts)), 0) if errno != 0 { return 0, fmt.Errorf("failed calling clock_gettime(CLOCK_PROCESS_CPUTIME_ID): errno=%d", errno) } return time.Duration(ts.Nano()), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/state/state_unsafe_state_autogen.go000066400000000000000000000000671465435605700300250ustar00rootroot00000000000000// automatically generated by stateify. package state golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/000077500000000000000000000000001465435605700222315ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/capability.go000066400000000000000000000070431465435605700247050ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package strace import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" ) // CapabilityBitset is the set of capabilities in a bitset. 
var CapabilityBitset = abi.FlagSet{ { Flag: 1 << uint32(linux.CAP_CHOWN), Name: "CAP_CHOWN", }, { Flag: 1 << uint32(linux.CAP_DAC_OVERRIDE), Name: "CAP_DAC_OVERRIDE", }, { Flag: 1 << uint32(linux.CAP_DAC_READ_SEARCH), Name: "CAP_DAC_READ_SEARCH", }, { Flag: 1 << uint32(linux.CAP_FOWNER), Name: "CAP_FOWNER", }, { Flag: 1 << uint32(linux.CAP_FSETID), Name: "CAP_FSETID", }, { Flag: 1 << uint32(linux.CAP_KILL), Name: "CAP_KILL", }, { Flag: 1 << uint32(linux.CAP_SETGID), Name: "CAP_SETGID", }, { Flag: 1 << uint32(linux.CAP_SETUID), Name: "CAP_SETUID", }, { Flag: 1 << uint32(linux.CAP_SETPCAP), Name: "CAP_SETPCAP", }, { Flag: 1 << uint32(linux.CAP_LINUX_IMMUTABLE), Name: "CAP_LINUX_IMMUTABLE", }, { Flag: 1 << uint32(linux.CAP_NET_BIND_SERVICE), Name: "CAP_NET_BIND_SERVICE", }, { Flag: 1 << uint32(linux.CAP_NET_BROADCAST), Name: "CAP_NET_BROADCAST", }, { Flag: 1 << uint32(linux.CAP_NET_ADMIN), Name: "CAP_NET_ADMIN", }, { Flag: 1 << uint32(linux.CAP_NET_RAW), Name: "CAP_NET_RAW", }, { Flag: 1 << uint32(linux.CAP_IPC_LOCK), Name: "CAP_IPC_LOCK", }, { Flag: 1 << uint32(linux.CAP_IPC_OWNER), Name: "CAP_IPC_OWNER", }, { Flag: 1 << uint32(linux.CAP_SYS_MODULE), Name: "CAP_SYS_MODULE", }, { Flag: 1 << uint32(linux.CAP_SYS_RAWIO), Name: "CAP_SYS_RAWIO", }, { Flag: 1 << uint32(linux.CAP_SYS_CHROOT), Name: "CAP_SYS_CHROOT", }, { Flag: 1 << uint32(linux.CAP_SYS_PTRACE), Name: "CAP_SYS_PTRACE", }, { Flag: 1 << uint32(linux.CAP_SYS_PACCT), Name: "CAP_SYS_PACCT", }, { Flag: 1 << uint32(linux.CAP_SYS_ADMIN), Name: "CAP_SYS_ADMIN", }, { Flag: 1 << uint32(linux.CAP_SYS_BOOT), Name: "CAP_SYS_BOOT", }, { Flag: 1 << uint32(linux.CAP_SYS_NICE), Name: "CAP_SYS_NICE", }, { Flag: 1 << uint32(linux.CAP_SYS_RESOURCE), Name: "CAP_SYS_RESOURCE", }, { Flag: 1 << uint32(linux.CAP_SYS_TIME), Name: "CAP_SYS_TIME", }, { Flag: 1 << uint32(linux.CAP_SYS_TTY_CONFIG), Name: "CAP_SYS_TTY_CONFIG", }, { Flag: 1 << uint32(linux.CAP_MKNOD), Name: "CAP_MKNOD", }, { Flag: 1 << uint32(linux.CAP_LEASE), Name: "CAP_LEASE", }, { Flag: 1 << uint32(linux.CAP_AUDIT_WRITE), Name: "CAP_AUDIT_WRITE", }, { Flag: 1 << uint32(linux.CAP_AUDIT_CONTROL), Name: "CAP_AUDIT_CONTROL", }, { Flag: 1 << uint32(linux.CAP_SETFCAP), Name: "CAP_SETFCAP", }, { Flag: 1 << uint32(linux.CAP_MAC_OVERRIDE), Name: "CAP_MAC_OVERRIDE", }, { Flag: 1 << uint32(linux.CAP_MAC_ADMIN), Name: "CAP_MAC_ADMIN", }, { Flag: 1 << uint32(linux.CAP_SYSLOG), Name: "CAP_SYSLOG", }, { Flag: 1 << uint32(linux.CAP_WAKE_ALARM), Name: "CAP_WAKE_ALARM", }, { Flag: 1 << uint32(linux.CAP_BLOCK_SUSPEND), Name: "CAP_BLOCK_SUSPEND", }, { Flag: 1 << uint32(linux.CAP_AUDIT_READ), Name: "CAP_AUDIT_READ", }, } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/clone.go000066400000000000000000000040551465435605700236640ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package strace import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" ) // CloneFlagSet is the set of clone(2) flags. 
var CloneFlagSet = abi.FlagSet{ { Flag: linux.CLONE_VM, Name: "CLONE_VM", }, { Flag: linux.CLONE_FS, Name: "CLONE_FS", }, { Flag: linux.CLONE_FILES, Name: "CLONE_FILES", }, { Flag: linux.CLONE_SIGHAND, Name: "CLONE_SIGHAND", }, { Flag: linux.CLONE_PTRACE, Name: "CLONE_PTRACE", }, { Flag: linux.CLONE_VFORK, Name: "CLONE_VFORK", }, { Flag: linux.CLONE_PARENT, Name: "CLONE_PARENT", }, { Flag: linux.CLONE_THREAD, Name: "CLONE_THREAD", }, { Flag: linux.CLONE_NEWNS, Name: "CLONE_NEWNS", }, { Flag: linux.CLONE_SYSVSEM, Name: "CLONE_SYSVSEM", }, { Flag: linux.CLONE_SETTLS, Name: "CLONE_SETTLS", }, { Flag: linux.CLONE_PARENT_SETTID, Name: "CLONE_PARENT_SETTID", }, { Flag: linux.CLONE_CHILD_CLEARTID, Name: "CLONE_CHILD_CLEARTID", }, { Flag: linux.CLONE_DETACHED, Name: "CLONE_DETACHED", }, { Flag: linux.CLONE_UNTRACED, Name: "CLONE_UNTRACED", }, { Flag: linux.CLONE_CHILD_SETTID, Name: "CLONE_CHILD_SETTID", }, { Flag: linux.CLONE_NEWUTS, Name: "CLONE_NEWUTS", }, { Flag: linux.CLONE_NEWIPC, Name: "CLONE_NEWIPC", }, { Flag: linux.CLONE_NEWUSER, Name: "CLONE_NEWUSER", }, { Flag: linux.CLONE_NEWPID, Name: "CLONE_NEWPID", }, { Flag: linux.CLONE_NEWNET, Name: "CLONE_NEWNET", }, { Flag: linux.CLONE_IO, Name: "CLONE_IO", }, } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/close_range.go000066400000000000000000000016631465435605700250470ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package strace import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" ) // CloseRangeFlagSet is the set of close_range(2) flags. var CloseRangeFlagSet = abi.FlagSet{ { Flag: uint64(linux.CLOSE_RANGE_CLOEXEC), Name: "CLOSE_RANGE_CLOEXEC", }, { Flag: uint64(linux.CLOSE_RANGE_UNSHARE), Name: "CLOSE_RANGE_UNSHARE", }, } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/epoll.go000066400000000000000000000056041465435605700237000ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package strace import ( "fmt" "strings" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/hostarch" ) func epollEvent(t *kernel.Task, eventAddr hostarch.Addr) string { var e linux.EpollEvent if _, err := e.CopyIn(t, eventAddr); err != nil { return fmt.Sprintf("%#x {error reading event: %v}", eventAddr, err) } var sb strings.Builder fmt.Fprintf(&sb, "%#x ", eventAddr) writeEpollEvent(&sb, e) return sb.String() } func epollEvents(t *kernel.Task, eventsAddr hostarch.Addr, numEvents, maxBytes uint64) string { var sb strings.Builder fmt.Fprintf(&sb, "%#x {", eventsAddr) addr := eventsAddr for i := uint64(0); i < numEvents; i++ { var e linux.EpollEvent if _, err := e.CopyIn(t, addr); err != nil { fmt.Fprintf(&sb, "{error reading event at %#x: %v}", addr, err) continue } writeEpollEvent(&sb, e) if uint64(sb.Len()) >= maxBytes { sb.WriteString("...") break } // Allowing addr to overflow is consistent with Linux, and harmless; if // this isn't the last iteration of the loop, the next call to CopyIn // will just fail with EFAULT. addr, _ = addr.AddLength(uint64(linux.SizeOfEpollEvent)) } sb.WriteString("}") return sb.String() } func writeEpollEvent(sb *strings.Builder, e linux.EpollEvent) { events := epollEventEvents.Parse(uint64(e.Events)) fmt.Fprintf(sb, "{events=%s data=[%#x, %#x]}", events, e.Data[0], e.Data[1]) } var epollCtlOps = abi.ValueSet{ linux.EPOLL_CTL_ADD: "EPOLL_CTL_ADD", linux.EPOLL_CTL_DEL: "EPOLL_CTL_DEL", linux.EPOLL_CTL_MOD: "EPOLL_CTL_MOD", } var epollEventEvents = abi.FlagSet{ {Flag: linux.EPOLLIN, Name: "EPOLLIN"}, {Flag: linux.EPOLLPRI, Name: "EPOLLPRI"}, {Flag: linux.EPOLLOUT, Name: "EPOLLOUT"}, {Flag: linux.EPOLLERR, Name: "EPOLLERR"}, {Flag: linux.EPOLLHUP, Name: "EPOLLHUP"}, {Flag: linux.EPOLLRDNORM, Name: "EPOLLRDNORM"}, {Flag: linux.EPOLLRDBAND, Name: "EPOLLRDBAND"}, {Flag: linux.EPOLLWRNORM, Name: "EPOLLWRNORM"}, {Flag: linux.EPOLLWRBAND, Name: "EPOLLWRBAND"}, {Flag: linux.EPOLLMSG, Name: "EPOLLMSG"}, {Flag: linux.EPOLLRDHUP, Name: "EPOLLRDHUP"}, {Flag: linux.EPOLLEXCLUSIVE, Name: "EPOLLEXCLUSIVE"}, {Flag: linux.EPOLLWAKEUP, Name: "EPOLLWAKEUP"}, {Flag: linux.EPOLLONESHOT, Name: "EPOLLONESHOT"}, {Flag: linux.EPOLLET, Name: "EPOLLET"}, } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/futex.go000066400000000000000000000034031465435605700237130ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package strace import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" ) // FutexCmd are the possible futex(2) commands. 
var FutexCmd = abi.ValueSet{ linux.FUTEX_WAIT: "FUTEX_WAIT", linux.FUTEX_WAKE: "FUTEX_WAKE", linux.FUTEX_FD: "FUTEX_FD", linux.FUTEX_REQUEUE: "FUTEX_REQUEUE", linux.FUTEX_CMP_REQUEUE: "FUTEX_CMP_REQUEUE", linux.FUTEX_WAKE_OP: "FUTEX_WAKE_OP", linux.FUTEX_LOCK_PI: "FUTEX_LOCK_PI", linux.FUTEX_UNLOCK_PI: "FUTEX_UNLOCK_PI", linux.FUTEX_TRYLOCK_PI: "FUTEX_TRYLOCK_PI", linux.FUTEX_WAIT_BITSET: "FUTEX_WAIT_BITSET", linux.FUTEX_WAKE_BITSET: "FUTEX_WAKE_BITSET", linux.FUTEX_WAIT_REQUEUE_PI: "FUTEX_WAIT_REQUEUE_PI", linux.FUTEX_CMP_REQUEUE_PI: "FUTEX_CMP_REQUEUE_PI", } func futex(op uint64) string { cmd := op &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME) clockRealtime := (op & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME private := (op & linux.FUTEX_PRIVATE_FLAG) == linux.FUTEX_PRIVATE_FLAG s := FutexCmd.Parse(cmd) if clockRealtime { s += "|FUTEX_CLOCK_REALTIME" } if private { s += "|FUTEX_PRIVATE_FLAG" } return s } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/linux64_amd64.go000066400000000000000000000456701465435605700251000ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package strace import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/sentry/arch" ) // linuxAMD64 provides a mapping of the Linux amd64 syscalls and their argument // types for display / formatting. 
var linuxAMD64 = SyscallMap{ 0: makeSyscallInfo("read", FD, ReadBuffer, Hex), 1: makeSyscallInfo("write", FD, WriteBuffer, Hex), 2: makeSyscallInfo("open", Path, OpenFlags, Mode), 3: makeSyscallInfo("close", FD), 4: makeSyscallInfo("stat", Path, Stat), 5: makeSyscallInfo("fstat", FD, Stat), 6: makeSyscallInfo("lstat", Path, Stat), 7: makeSyscallInfo("poll", PollFDs, Hex, Hex), 8: makeSyscallInfo("lseek", Hex, Hex, Hex), 9: makeSyscallInfo("mmap", Hex, Hex, MmapProt, MmapFlags, FD, Hex), 10: makeSyscallInfo("mprotect", Hex, Hex, Hex), 11: makeSyscallInfo("munmap", Hex, Hex), 12: makeSyscallInfo("brk", Hex), 13: makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction, Hex), 14: makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex), 15: makeSyscallInfo("rt_sigreturn"), 16: makeSyscallInfo("ioctl", FD, Hex, Hex), 17: makeSyscallInfo("pread64", FD, ReadBuffer, Hex, Hex), 18: makeSyscallInfo("pwrite64", FD, WriteBuffer, Hex, Hex), 19: makeSyscallInfo("readv", FD, ReadIOVec, Hex), 20: makeSyscallInfo("writev", FD, WriteIOVec, Hex), 21: makeSyscallInfo("access", Path, Oct), 22: makeSyscallInfo("pipe", PipeFDs), 23: makeSyscallInfo("select", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timeval), 24: makeSyscallInfo("sched_yield"), 25: makeSyscallInfo("mremap", Hex, Hex, Hex, Hex, Hex), 26: makeSyscallInfo("msync", Hex, Hex, Hex), 27: makeSyscallInfo("mincore", Hex, Hex, Hex), 28: makeSyscallInfo("madvise", Hex, Hex, Hex), 29: makeSyscallInfo("shmget", Hex, Hex, Hex), 30: makeSyscallInfo("shmat", Hex, Hex, Hex), 31: makeSyscallInfo("shmctl", Hex, Hex, Hex), 32: makeSyscallInfo("dup", FD), 33: makeSyscallInfo("dup2", FD, FD), 34: makeSyscallInfo("pause"), 35: makeSyscallInfo("nanosleep", Timespec, PostTimespec), 36: makeSyscallInfo("getitimer", ItimerType, PostItimerVal), 37: makeSyscallInfo("alarm", Hex), 38: makeSyscallInfo("setitimer", ItimerType, ItimerVal, PostItimerVal), 39: makeSyscallInfo("getpid"), 40: makeSyscallInfo("sendfile", FD, FD, Hex, Hex), 41: makeSyscallInfo("socket", SockFamily, SockType, SockProtocol), 42: makeSyscallInfo("connect", FD, SockAddr, Hex), 43: makeSyscallInfo("accept", FD, PostSockAddr, SockLen), 44: makeSyscallInfo("sendto", FD, Hex, Hex, Hex, SockAddr, Hex), 45: makeSyscallInfo("recvfrom", FD, Hex, Hex, Hex, PostSockAddr, SockLen), 46: makeSyscallInfo("sendmsg", FD, SendMsgHdr, Hex), 47: makeSyscallInfo("recvmsg", FD, RecvMsgHdr, Hex), 48: makeSyscallInfo("shutdown", FD, Hex), 49: makeSyscallInfo("bind", FD, SockAddr, Hex), 50: makeSyscallInfo("listen", FD, Hex), 51: makeSyscallInfo("getsockname", FD, PostSockAddr, SockLen), 52: makeSyscallInfo("getpeername", FD, PostSockAddr, SockLen), 53: makeSyscallInfo("socketpair", SockFamily, SockType, SockProtocol, Hex), 54: makeSyscallInfo("setsockopt", FD, SockOptLevel, SockOptName, SetSockOptVal, Hex /* length by value, not a pointer */), 55: makeSyscallInfo("getsockopt", FD, SockOptLevel, SockOptName, GetSockOptVal, SockLen), 56: makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex), 57: makeSyscallInfo("fork"), 58: makeSyscallInfo("vfork"), 59: makeSyscallInfo("execve", Path, ExecveStringVector, ExecveStringVector), 60: makeSyscallInfo("exit", Hex), 61: makeSyscallInfo("wait4", Hex, Hex, Hex, Rusage), 62: makeSyscallInfo("kill", Hex, Signal), 63: makeSyscallInfo("uname", Uname), 64: makeSyscallInfo("semget", Hex, Hex, Hex), 65: makeSyscallInfo("semop", Hex, Hex, Hex), 66: makeSyscallInfo("semctl", Hex, Hex, Hex, Hex), 67: makeSyscallInfo("shmdt", Hex), 68: makeSyscallInfo("msgget", Hex, 
Hex), 69: makeSyscallInfo("msgsnd", Hex, Hex, Hex, Hex), 70: makeSyscallInfo("msgrcv", Hex, Hex, Hex, Hex, Hex), 71: makeSyscallInfo("msgctl", Hex, Hex, Hex), 72: makeSyscallInfo("fcntl", FD, Hex, Hex), 73: makeSyscallInfo("flock", FD, Hex), 74: makeSyscallInfo("fsync", FD), 75: makeSyscallInfo("fdatasync", FD), 76: makeSyscallInfo("truncate", Path, Hex), 77: makeSyscallInfo("ftruncate", FD, Hex), 78: makeSyscallInfo("getdents", FD, Hex, Hex), 79: makeSyscallInfo("getcwd", PostPath, Hex), 80: makeSyscallInfo("chdir", Path), 81: makeSyscallInfo("fchdir", FD), 82: makeSyscallInfo("rename", Path, Path), 83: makeSyscallInfo("mkdir", Path, Oct), 84: makeSyscallInfo("rmdir", Path), 85: makeSyscallInfo("creat", Path, Oct), 86: makeSyscallInfo("link", Path, Path), 87: makeSyscallInfo("unlink", Path), 88: makeSyscallInfo("symlink", Path, Path), 89: makeSyscallInfo("readlink", Path, ReadBuffer, Hex), 90: makeSyscallInfo("chmod", Path, Mode), 91: makeSyscallInfo("fchmod", FD, Mode), 92: makeSyscallInfo("chown", Path, Hex, Hex), 93: makeSyscallInfo("fchown", FD, Hex, Hex), 94: makeSyscallInfo("lchown", Path, Hex, Hex), 95: makeSyscallInfo("umask", Hex), 96: makeSyscallInfo("gettimeofday", Timeval, Hex), 97: makeSyscallInfo("getrlimit", Hex, Hex), 98: makeSyscallInfo("getrusage", Hex, Rusage), 99: makeSyscallInfo("sysinfo", Hex), 100: makeSyscallInfo("times", Hex), 101: makeSyscallInfo("ptrace", PtraceRequest, Hex, Hex, Hex), 102: makeSyscallInfo("getuid"), 103: makeSyscallInfo("syslog", Hex, Hex, Hex), 104: makeSyscallInfo("getgid"), 105: makeSyscallInfo("setuid", Hex), 106: makeSyscallInfo("setgid", Hex), 107: makeSyscallInfo("geteuid"), 108: makeSyscallInfo("getegid"), 109: makeSyscallInfo("setpgid", Hex, Hex), 110: makeSyscallInfo("getppid"), 111: makeSyscallInfo("getpgrp"), 112: makeSyscallInfo("setsid"), 113: makeSyscallInfo("setreuid", Hex, Hex), 114: makeSyscallInfo("setregid", Hex, Hex), 115: makeSyscallInfo("getgroups", Hex, Hex), 116: makeSyscallInfo("setgroups", Hex, Hex), 117: makeSyscallInfo("setresuid", Hex, Hex, Hex), 118: makeSyscallInfo("getresuid", Hex, Hex, Hex), 119: makeSyscallInfo("setresgid", Hex, Hex, Hex), 120: makeSyscallInfo("getresgid", Hex, Hex, Hex), 121: makeSyscallInfo("getpgid", Hex), 122: makeSyscallInfo("setfsuid", Hex), 123: makeSyscallInfo("setfsgid", Hex), 124: makeSyscallInfo("getsid", Hex), 125: makeSyscallInfo("capget", CapHeader, PostCapData), 126: makeSyscallInfo("capset", CapHeader, CapData), 127: makeSyscallInfo("rt_sigpending", Hex), 128: makeSyscallInfo("rt_sigtimedwait", SigSet, Hex, Timespec, Hex), 129: makeSyscallInfo("rt_sigqueueinfo", Hex, Signal, Hex), 130: makeSyscallInfo("rt_sigsuspend", Hex), 131: makeSyscallInfo("sigaltstack", Hex, Hex), 132: makeSyscallInfo("utime", Path, Utimbuf), 133: makeSyscallInfo("mknod", Path, Mode, Hex), 134: makeSyscallInfo("uselib", Hex), 135: makeSyscallInfo("personality", Hex), 136: makeSyscallInfo("ustat", Hex, Hex), 137: makeSyscallInfo("statfs", Path, Hex), 138: makeSyscallInfo("fstatfs", FD, Hex), 139: makeSyscallInfo("sysfs", Hex, Hex, Hex), 140: makeSyscallInfo("getpriority", Hex, Hex), 141: makeSyscallInfo("setpriority", Hex, Hex, Hex), 142: makeSyscallInfo("sched_setparam", Hex, Hex), 143: makeSyscallInfo("sched_getparam", Hex, Hex), 144: makeSyscallInfo("sched_setscheduler", Hex, Hex, Hex), 145: makeSyscallInfo("sched_getscheduler", Hex), 146: makeSyscallInfo("sched_get_priority_max", Hex), 147: makeSyscallInfo("sched_get_priority_min", Hex), 148: makeSyscallInfo("sched_rr_get_interval", Hex, Hex), 149: 
makeSyscallInfo("mlock", Hex, Hex), 150: makeSyscallInfo("munlock", Hex, Hex), 151: makeSyscallInfo("mlockall", Hex), 152: makeSyscallInfo("munlockall"), 153: makeSyscallInfo("vhangup"), 154: makeSyscallInfo("modify_ldt", Hex, Hex, Hex), 155: makeSyscallInfo("pivot_root", Path, Path), 156: makeSyscallInfo("_sysctl", Hex), 157: makeSyscallInfo("prctl", Hex, Hex, Hex, Hex, Hex), 158: makeSyscallInfo("arch_prctl", Hex, Hex), 159: makeSyscallInfo("adjtimex", Hex), 160: makeSyscallInfo("setrlimit", Hex, Hex), 161: makeSyscallInfo("chroot", Path), 162: makeSyscallInfo("sync"), 163: makeSyscallInfo("acct", Hex), 164: makeSyscallInfo("settimeofday", Timeval, Hex), 165: makeSyscallInfo("mount", Path, Path, Path, Hex, Path), 166: makeSyscallInfo("umount2", Path, Hex), 167: makeSyscallInfo("swapon", Hex, Hex), 168: makeSyscallInfo("swapoff", Hex), 169: makeSyscallInfo("reboot", Hex, Hex, Hex, Hex), 170: makeSyscallInfo("sethostname", Hex, Hex), 171: makeSyscallInfo("setdomainname", Hex, Hex), 172: makeSyscallInfo("iopl", Hex), 173: makeSyscallInfo("ioperm", Hex, Hex, Hex), 174: makeSyscallInfo("create_module", Path, Hex), 175: makeSyscallInfo("init_module", Hex, Hex, Hex), 176: makeSyscallInfo("delete_module", Hex, Hex), 177: makeSyscallInfo("get_kernel_syms", Hex), // 178: query_module (only present in Linux < 2.6) 179: makeSyscallInfo("quotactl", Hex, Hex, Hex, Hex), 180: makeSyscallInfo("nfsservctl", Hex, Hex, Hex), // 181: getpmsg (not implemented in the Linux kernel) // 182: putpmsg (not implemented in the Linux kernel) // 183: afs_syscall (not implemented in the Linux kernel) // 184: tuxcall (not implemented in the Linux kernel) // 185: security (not implemented in the Linux kernel) 186: makeSyscallInfo("gettid"), 187: makeSyscallInfo("readahead", Hex, Hex, Hex), 188: makeSyscallInfo("setxattr", Path, Path, Hex, Hex, Hex), 189: makeSyscallInfo("lsetxattr", Path, Path, Hex, Hex, Hex), 190: makeSyscallInfo("fsetxattr", FD, Path, Hex, Hex, Hex), 191: makeSyscallInfo("getxattr", Path, Path, Hex, Hex), 192: makeSyscallInfo("lgetxattr", Path, Path, Hex, Hex), 193: makeSyscallInfo("fgetxattr", FD, Path, Hex, Hex), 194: makeSyscallInfo("listxattr", Path, Path, Hex), 195: makeSyscallInfo("llistxattr", Path, Path, Hex), 196: makeSyscallInfo("flistxattr", FD, Path, Hex), 197: makeSyscallInfo("removexattr", Path, Path), 198: makeSyscallInfo("lremovexattr", Path, Path), 199: makeSyscallInfo("fremovexattr", FD, Path), 200: makeSyscallInfo("tkill", Hex, Signal), 201: makeSyscallInfo("time", Hex), 202: makeSyscallInfo("futex", Hex, FutexOp, Hex, Timespec, Hex, Hex), 203: makeSyscallInfo("sched_setaffinity", Hex, Hex, Hex), 204: makeSyscallInfo("sched_getaffinity", Hex, Hex, Hex), 205: makeSyscallInfo("set_thread_area", Hex), 206: makeSyscallInfo("io_setup", Hex, Hex), 207: makeSyscallInfo("io_destroy", Hex), 208: makeSyscallInfo("io_getevents", Hex, Hex, Hex, Hex, Timespec), 209: makeSyscallInfo("io_submit", Hex, Hex, Hex), 210: makeSyscallInfo("io_cancel", Hex, Hex, Hex), 211: makeSyscallInfo("get_thread_area", Hex), 212: makeSyscallInfo("lookup_dcookie", Hex, Hex, Hex), 213: makeSyscallInfo("epoll_create", Hex), // 214: epoll_ctl_old (not implemented in the Linux kernel) // 215: epoll_wait_old (not implemented in the Linux kernel) 216: makeSyscallInfo("remap_file_pages", Hex, Hex, Hex, Hex, Hex), 217: makeSyscallInfo("getdents64", FD, Hex, Hex), 218: makeSyscallInfo("set_tid_address", Hex), 219: makeSyscallInfo("restart_syscall"), 220: makeSyscallInfo("semtimedop", Hex, Hex, Hex, Hex), 221: 
makeSyscallInfo("fadvise64", FD, Hex, Hex, Hex), 222: makeSyscallInfo("timer_create", Hex, Hex, Hex), 223: makeSyscallInfo("timer_settime", Hex, Hex, ItimerSpec, PostItimerSpec), 224: makeSyscallInfo("timer_gettime", Hex, PostItimerSpec), 225: makeSyscallInfo("timer_getoverrun", Hex), 226: makeSyscallInfo("timer_delete", Hex), 227: makeSyscallInfo("clock_settime", Hex, Timespec), 228: makeSyscallInfo("clock_gettime", Hex, PostTimespec), 229: makeSyscallInfo("clock_getres", Hex, PostTimespec), 230: makeSyscallInfo("clock_nanosleep", Hex, Hex, Timespec, PostTimespec), 231: makeSyscallInfo("exit_group", Hex), 232: makeSyscallInfo("epoll_wait", FD, EpollEvents, Hex, Hex), 233: makeSyscallInfo("epoll_ctl", FD, EpollCtlOp, FD, EpollEvent), 234: makeSyscallInfo("tgkill", Hex, Hex, Signal), 235: makeSyscallInfo("utimes", Path, Timeval), // 236: vserver (not implemented in the Linux kernel) 237: makeSyscallInfo("mbind", Hex, Hex, Hex, Hex, Hex, Hex), 238: makeSyscallInfo("set_mempolicy", Hex, Hex, Hex), 239: makeSyscallInfo("get_mempolicy", Hex, Hex, Hex, Hex, Hex), 240: makeSyscallInfo("mq_open", Hex, Hex, Hex, Hex), 241: makeSyscallInfo("mq_unlink", Hex), 242: makeSyscallInfo("mq_timedsend", Hex, Hex, Hex, Hex, Hex), 243: makeSyscallInfo("mq_timedreceive", Hex, Hex, Hex, Hex, Hex), 244: makeSyscallInfo("mq_notify", Hex, Hex), 245: makeSyscallInfo("mq_getsetattr", Hex, Hex, Hex), 246: makeSyscallInfo("kexec_load", Hex, Hex, Hex, Hex), 247: makeSyscallInfo("waitid", Hex, Hex, Hex, Hex, Rusage), 248: makeSyscallInfo("add_key", Hex, Hex, Hex, Hex, Hex), 249: makeSyscallInfo("request_key", Hex, Hex, Hex, Hex), 250: makeSyscallInfo("keyctl", Hex, Hex, Hex, Hex, Hex), 251: makeSyscallInfo("ioprio_set", Hex, Hex, Hex), 252: makeSyscallInfo("ioprio_get", Hex, Hex), 253: makeSyscallInfo("inotify_init"), 254: makeSyscallInfo("inotify_add_watch", Hex, Path, Hex), 255: makeSyscallInfo("inotify_rm_watch", Hex, Hex), 256: makeSyscallInfo("migrate_pages", Hex, Hex, Hex, Hex), 257: makeSyscallInfo("openat", FD, Path, OpenFlags, Mode), 258: makeSyscallInfo("mkdirat", FD, Path, Hex), 259: makeSyscallInfo("mknodat", FD, Path, Mode, Hex), 260: makeSyscallInfo("fchownat", FD, Path, Hex, Hex, Hex), 261: makeSyscallInfo("futimesat", FD, Path, Hex), 262: makeSyscallInfo("newfstatat", FD, Path, Stat, Hex), 263: makeSyscallInfo("unlinkat", FD, Path, Hex), 264: makeSyscallInfo("renameat", FD, Path, Hex, Path), 265: makeSyscallInfo("linkat", FD, Path, Hex, Path, Hex), 266: makeSyscallInfo("symlinkat", Path, FD, Path), 267: makeSyscallInfo("readlinkat", FD, Path, ReadBuffer, Hex), 268: makeSyscallInfo("fchmodat", FD, Path, Mode), 269: makeSyscallInfo("faccessat", FD, Path, Oct, Hex), 270: makeSyscallInfo("pselect6", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timespec, SigSet), 271: makeSyscallInfo("ppoll", PollFDs, Hex, Timespec, SigSet, Hex), 272: makeSyscallInfo("unshare", CloneFlags), 273: makeSyscallInfo("set_robust_list", Hex, Hex), 274: makeSyscallInfo("get_robust_list", Hex, Hex, Hex), 275: makeSyscallInfo("splice", FD, Hex, FD, Hex, Hex, Hex), 276: makeSyscallInfo("tee", FD, FD, Hex, Hex), 277: makeSyscallInfo("sync_file_range", FD, Hex, Hex, Hex), 278: makeSyscallInfo("vmsplice", FD, Hex, Hex, Hex), 279: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex), 280: makeSyscallInfo("utimensat", FD, Path, UTimeTimespec, Hex), 281: makeSyscallInfo("epoll_pwait", FD, EpollEvents, Hex, Hex, SigSet, Hex), 282: makeSyscallInfo("signalfd", Hex, Hex, Hex), 283: makeSyscallInfo("timerfd_create", Hex, Hex), 284: 
makeSyscallInfo("eventfd", Hex), 285: makeSyscallInfo("fallocate", FD, Hex, Hex, Hex), 286: makeSyscallInfo("timerfd_settime", FD, Hex, ItimerSpec, PostItimerSpec), 287: makeSyscallInfo("timerfd_gettime", FD, PostItimerSpec), 288: makeSyscallInfo("accept4", FD, PostSockAddr, SockLen, SockFlags), 289: makeSyscallInfo("signalfd4", Hex, Hex, Hex, Hex), 290: makeSyscallInfo("eventfd2", Hex, Hex), 291: makeSyscallInfo("epoll_create1", Hex), 292: makeSyscallInfo("dup3", FD, FD, Hex), 293: makeSyscallInfo("pipe2", PipeFDs, Hex), 294: makeSyscallInfo("inotify_init1", Hex), 295: makeSyscallInfo("preadv", FD, ReadIOVec, Hex, Hex), 296: makeSyscallInfo("pwritev", FD, WriteIOVec, Hex, Hex), 297: makeSyscallInfo("rt_tgsigqueueinfo", Hex, Hex, Signal, Hex), 298: makeSyscallInfo("perf_event_open", Hex, Hex, Hex, Hex, Hex), 299: makeSyscallInfo("recvmmsg", FD, Hex, Hex, Hex, Hex), 300: makeSyscallInfo("fanotify_init", Hex, Hex), 301: makeSyscallInfo("fanotify_mark", Hex, Hex, Hex, Hex, Hex), 302: makeSyscallInfo("prlimit64", Hex, Hex, Hex, Hex), 303: makeSyscallInfo("name_to_handle_at", FD, Hex, Hex, Hex, Hex), 304: makeSyscallInfo("open_by_handle_at", FD, Hex, Hex), 305: makeSyscallInfo("clock_adjtime", Hex, Hex), 306: makeSyscallInfo("syncfs", FD), 307: makeSyscallInfo("sendmmsg", FD, Hex, Hex, Hex), 308: makeSyscallInfo("setns", FD, Hex), 309: makeSyscallInfo("getcpu", Hex, Hex, Hex), 310: makeSyscallInfo("process_vm_readv", Hex, ReadIOVec, Hex, IOVec, Hex, Hex), 311: makeSyscallInfo("process_vm_writev", Hex, IOVec, Hex, WriteIOVec, Hex, Hex), 312: makeSyscallInfo("kcmp", Hex, Hex, Hex, Hex, Hex), 313: makeSyscallInfo("finit_module", Hex, Hex, Hex), 314: makeSyscallInfo("sched_setattr", Hex, Hex, Hex), 315: makeSyscallInfo("sched_getattr", Hex, Hex, Hex), 316: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex), 317: makeSyscallInfo("seccomp", Hex, Hex, Hex), 318: makeSyscallInfo("getrandom", Hex, Hex, Hex), 319: makeSyscallInfo("memfd_create", Path, Hex), // Not quite a path, but close. 320: makeSyscallInfo("kexec_file_load", FD, FD, Hex, Hex, Hex), 321: makeSyscallInfo("bpf", Hex, Hex, Hex), 322: makeSyscallInfo("execveat", FD, Path, ExecveStringVector, ExecveStringVector, Hex), 323: makeSyscallInfo("userfaultfd", Hex), 324: makeSyscallInfo("membarrier", Hex, Hex), 325: makeSyscallInfo("mlock2", Hex, Hex, Hex), 326: makeSyscallInfo("copy_file_range", FD, Hex, FD, Hex, Hex, Hex), 327: makeSyscallInfo("preadv2", FD, ReadIOVec, Hex, Hex, Hex), 328: makeSyscallInfo("pwritev2", FD, WriteIOVec, Hex, Hex, Hex), 329: makeSyscallInfo("pkey_mprotect", Hex, Hex, Hex, Hex), 330: makeSyscallInfo("pkey_alloc", Hex, Hex), 331: makeSyscallInfo("pkey_free", Hex), 332: makeSyscallInfo("statx", FD, Path, Hex, Hex, Hex), 333: makeSyscallInfo("io_pgetevents", Hex, Hex, Hex, Hex, Timespec, SigSet), 334: makeSyscallInfo("rseq", Hex, Hex, Hex, Hex), 424: makeSyscallInfo("pidfd_send_signal", FD, Signal, Hex, Hex), 425: makeSyscallInfo("io_uring_setup", Hex, Hex), 426: makeSyscallInfo("io_uring_enter", FD, Hex, Hex, Hex, SigSet, Hex), 427: makeSyscallInfo("io_uring_register", FD, Hex, Hex, Hex), 428: makeSyscallInfo("open_tree", FD, Path, Hex), 429: makeSyscallInfo("move_mount", FD, Path, FD, Path, Hex), 430: makeSyscallInfo("fsopen", Path, Hex), // Not quite a path, but close. 
431: makeSyscallInfo("fsconfig", FD, Hex, Hex, Hex, Hex), 432: makeSyscallInfo("fsmount", FD, Hex, Hex), 433: makeSyscallInfo("fspick", FD, Path, Hex), 434: makeSyscallInfo("pidfd_open", Hex, Hex), 435: makeSyscallInfo("clone3", Hex, Hex), 436: makeSyscallInfo("close_range", FD, FD, CloseRangeFlags), 439: makeSyscallInfo("faccessat2", FD, Path, Oct, Hex), 441: makeSyscallInfo("epoll_pwait2", FD, EpollEvents, Hex, Timespec, SigSet), } func init() { syscallTables = append(syscallTables, syscallTable{ os: abi.Linux, arch: arch.AMD64, syscalls: linuxAMD64, }, ) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/linux64_arm64.go000066400000000000000000000377231465435605700251160ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package strace import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/sentry/arch" ) // linuxARM64 provides a mapping of the Linux arm64 syscalls and their argument // types for display / formatting. var linuxARM64 = SyscallMap{ 0: makeSyscallInfo("io_setup", Hex, Hex), 1: makeSyscallInfo("io_destroy", Hex), 2: makeSyscallInfo("io_submit", Hex, Hex, Hex), 3: makeSyscallInfo("io_cancel", Hex, Hex, Hex), 4: makeSyscallInfo("io_getevents", Hex, Hex, Hex, Hex, Timespec), 5: makeSyscallInfo("setxattr", Path, Path, Hex, Hex, Hex), 6: makeSyscallInfo("lsetxattr", Path, Path, Hex, Hex, Hex), 7: makeSyscallInfo("fsetxattr", FD, Path, Hex, Hex, Hex), 8: makeSyscallInfo("getxattr", Path, Path, Hex, Hex), 9: makeSyscallInfo("lgetxattr", Path, Path, Hex, Hex), 10: makeSyscallInfo("fgetxattr", FD, Path, Hex, Hex), 11: makeSyscallInfo("listxattr", Path, Path, Hex), 12: makeSyscallInfo("llistxattr", Path, Path, Hex), 13: makeSyscallInfo("flistxattr", FD, Path, Hex), 14: makeSyscallInfo("removexattr", Path, Path), 15: makeSyscallInfo("lremovexattr", Path, Path), 16: makeSyscallInfo("fremovexattr", FD, Path), 17: makeSyscallInfo("getcwd", PostPath, Hex), 18: makeSyscallInfo("lookup_dcookie", Hex, Hex, Hex), 19: makeSyscallInfo("eventfd2", Hex, Hex), 20: makeSyscallInfo("epoll_create1", Hex), 21: makeSyscallInfo("epoll_ctl", FD, EpollCtlOp, FD, EpollEvent), 22: makeSyscallInfo("epoll_pwait", FD, EpollEvents, Hex, Hex, SigSet, Hex), 23: makeSyscallInfo("dup", FD), 24: makeSyscallInfo("dup3", FD, FD, Hex), 25: makeSyscallInfo("fcntl", FD, Hex, Hex), 26: makeSyscallInfo("inotify_init1", Hex), 27: makeSyscallInfo("inotify_add_watch", Hex, Path, Hex), 28: makeSyscallInfo("inotify_rm_watch", Hex, Hex), 29: makeSyscallInfo("ioctl", FD, Hex, Hex), 30: makeSyscallInfo("ioprio_set", Hex, Hex, Hex), 31: makeSyscallInfo("ioprio_get", Hex, Hex), 32: makeSyscallInfo("flock", FD, Hex), 33: makeSyscallInfo("mknodat", FD, Path, Mode, Hex), 34: makeSyscallInfo("mkdirat", FD, Path, Hex), 35: makeSyscallInfo("unlinkat", FD, Path, Hex), 36: makeSyscallInfo("symlinkat", Path, FD, Path), 37: makeSyscallInfo("linkat", FD, Path, Hex, Path, Hex), 38: makeSyscallInfo("renameat", FD, Path, Hex, Path), 39: 
makeSyscallInfo("umount2", Path, Hex), 40: makeSyscallInfo("mount", Path, Path, Path, Hex, Path), 41: makeSyscallInfo("pivot_root", Path, Path), 42: makeSyscallInfo("nfsservctl", Hex, Hex, Hex), 43: makeSyscallInfo("statfs", Path, Hex), 44: makeSyscallInfo("fstatfs", FD, Hex), 45: makeSyscallInfo("truncate", Path, Hex), 46: makeSyscallInfo("ftruncate", FD, Hex), 47: makeSyscallInfo("fallocate", FD, Hex, Hex, Hex), 48: makeSyscallInfo("faccessat", FD, Path, Oct, Hex), 49: makeSyscallInfo("chdir", Path), 50: makeSyscallInfo("fchdir", FD), 51: makeSyscallInfo("chroot", Path), 52: makeSyscallInfo("fchmod", FD, Mode), 53: makeSyscallInfo("fchmodat", FD, Path, Mode), 54: makeSyscallInfo("fchownat", FD, Path, Hex, Hex, Hex), 55: makeSyscallInfo("fchown", FD, Hex, Hex), 56: makeSyscallInfo("openat", FD, Path, OpenFlags, Mode), 57: makeSyscallInfo("close", FD), 58: makeSyscallInfo("vhangup"), 59: makeSyscallInfo("pipe2", PipeFDs, Hex), 60: makeSyscallInfo("quotactl", Hex, Hex, Hex, Hex), 61: makeSyscallInfo("getdents64", FD, Hex, Hex), 62: makeSyscallInfo("lseek", Hex, Hex, Hex), 63: makeSyscallInfo("read", FD, ReadBuffer, Hex), 64: makeSyscallInfo("write", FD, WriteBuffer, Hex), 65: makeSyscallInfo("readv", FD, ReadIOVec, Hex), 66: makeSyscallInfo("writev", FD, WriteIOVec, Hex), 67: makeSyscallInfo("pread64", FD, ReadBuffer, Hex, Hex), 68: makeSyscallInfo("pwrite64", FD, WriteBuffer, Hex, Hex), 69: makeSyscallInfo("preadv", FD, ReadIOVec, Hex, Hex), 70: makeSyscallInfo("pwritev", FD, WriteIOVec, Hex, Hex), 71: makeSyscallInfo("sendfile", FD, FD, Hex, Hex), 72: makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex), 73: makeSyscallInfo("ppoll", PollFDs, Hex, Timespec, SigSet, Hex), 74: makeSyscallInfo("signalfd4", Hex, Hex, Hex, Hex), 75: makeSyscallInfo("vmsplice", FD, Hex, Hex, Hex), 76: makeSyscallInfo("splice", FD, Hex, FD, Hex, Hex, Hex), 77: makeSyscallInfo("tee", FD, FD, Hex, Hex), 78: makeSyscallInfo("readlinkat", FD, Path, ReadBuffer, Hex), 79: makeSyscallInfo("fstatat", FD, Path, Stat, Hex), 80: makeSyscallInfo("fstat", FD, Stat), 81: makeSyscallInfo("sync"), 82: makeSyscallInfo("fsync", FD), 83: makeSyscallInfo("fdatasync", FD), 84: makeSyscallInfo("sync_file_range", FD, Hex, Hex, Hex), 85: makeSyscallInfo("timerfd_create", Hex, Hex), 86: makeSyscallInfo("timerfd_settime", FD, Hex, ItimerSpec, PostItimerSpec), 87: makeSyscallInfo("timerfd_gettime", FD, PostItimerSpec), 88: makeSyscallInfo("utimensat", FD, Path, UTimeTimespec, Hex), 89: makeSyscallInfo("acct", Hex), 90: makeSyscallInfo("capget", CapHeader, PostCapData), 91: makeSyscallInfo("capset", CapHeader, CapData), 92: makeSyscallInfo("personality", Hex), 93: makeSyscallInfo("exit", Hex), 94: makeSyscallInfo("exit_group", Hex), 95: makeSyscallInfo("waitid", Hex, Hex, Hex, Hex, Rusage), 96: makeSyscallInfo("set_tid_address", Hex), 97: makeSyscallInfo("unshare", CloneFlags), 98: makeSyscallInfo("futex", Hex, FutexOp, Hex, Timespec, Hex, Hex), 99: makeSyscallInfo("set_robust_list", Hex, Hex), 100: makeSyscallInfo("get_robust_list", Hex, Hex, Hex), 101: makeSyscallInfo("nanosleep", Timespec, PostTimespec), 102: makeSyscallInfo("getitimer", ItimerType, PostItimerVal), 103: makeSyscallInfo("setitimer", ItimerType, ItimerVal, PostItimerVal), 104: makeSyscallInfo("kexec_load", Hex, Hex, Hex, Hex), 105: makeSyscallInfo("init_module", Hex, Hex, Hex), 106: makeSyscallInfo("delete_module", Hex, Hex), 107: makeSyscallInfo("timer_create", Hex, Hex, Hex), 108: makeSyscallInfo("timer_gettime", Hex, PostItimerSpec), 109: 
makeSyscallInfo("timer_getoverrun", Hex), 110: makeSyscallInfo("timer_settime", Hex, Hex, ItimerSpec, PostItimerSpec), 111: makeSyscallInfo("timer_delete", Hex), 112: makeSyscallInfo("clock_settime", Hex, Timespec), 113: makeSyscallInfo("clock_gettime", Hex, PostTimespec), 114: makeSyscallInfo("clock_getres", Hex, PostTimespec), 115: makeSyscallInfo("clock_nanosleep", Hex, Hex, Timespec, PostTimespec), 116: makeSyscallInfo("syslog", Hex, Hex, Hex), 117: makeSyscallInfo("ptrace", PtraceRequest, Hex, Hex, Hex), 118: makeSyscallInfo("sched_setparam", Hex, Hex), 119: makeSyscallInfo("sched_setscheduler", Hex, Hex, Hex), 120: makeSyscallInfo("sched_getscheduler", Hex), 121: makeSyscallInfo("sched_getparam", Hex, Hex), 122: makeSyscallInfo("sched_setaffinity", Hex, Hex, Hex), 123: makeSyscallInfo("sched_getaffinity", Hex, Hex, Hex), 124: makeSyscallInfo("sched_yield"), 125: makeSyscallInfo("sched_get_priority_max", Hex), 126: makeSyscallInfo("sched_get_priority_min", Hex), 127: makeSyscallInfo("sched_rr_get_interval", Hex, Hex), 128: makeSyscallInfo("restart_syscall"), 129: makeSyscallInfo("kill", Hex, Signal), 130: makeSyscallInfo("tkill", Hex, Signal), 131: makeSyscallInfo("tgkill", Hex, Hex, Signal), 132: makeSyscallInfo("sigaltstack", Hex, Hex), 133: makeSyscallInfo("rt_sigsuspend", Hex), 134: makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction, Hex), 135: makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex), 136: makeSyscallInfo("rt_sigpending", Hex), 137: makeSyscallInfo("rt_sigtimedwait", SigSet, Hex, Timespec, Hex), 138: makeSyscallInfo("rt_sigqueueinfo", Hex, Signal, Hex), 139: makeSyscallInfo("rt_sigreturn"), 140: makeSyscallInfo("setpriority", Hex, Hex, Hex), 141: makeSyscallInfo("getpriority", Hex, Hex), 142: makeSyscallInfo("reboot", Hex, Hex, Hex, Hex), 143: makeSyscallInfo("setregid", Hex, Hex), 144: makeSyscallInfo("setgid", Hex), 145: makeSyscallInfo("setreuid", Hex, Hex), 146: makeSyscallInfo("setuid", Hex), 147: makeSyscallInfo("setresuid", Hex, Hex, Hex), 148: makeSyscallInfo("getresuid", Hex, Hex, Hex), 149: makeSyscallInfo("setresgid", Hex, Hex, Hex), 150: makeSyscallInfo("getresgid", Hex, Hex, Hex), 151: makeSyscallInfo("setfsuid", Hex), 152: makeSyscallInfo("setfsgid", Hex), 153: makeSyscallInfo("times", Hex), 154: makeSyscallInfo("setpgid", Hex, Hex), 155: makeSyscallInfo("getpgid", Hex), 156: makeSyscallInfo("getsid", Hex), 157: makeSyscallInfo("setsid"), 158: makeSyscallInfo("getgroups", Hex, Hex), 159: makeSyscallInfo("setgroups", Hex, Hex), 160: makeSyscallInfo("uname", Uname), 161: makeSyscallInfo("sethostname", Hex, Hex), 162: makeSyscallInfo("setdomainname", Hex, Hex), 163: makeSyscallInfo("getrlimit", Hex, Hex), 164: makeSyscallInfo("setrlimit", Hex, Hex), 165: makeSyscallInfo("getrusage", Hex, Rusage), 166: makeSyscallInfo("umask", Hex), 167: makeSyscallInfo("prctl", Hex, Hex, Hex, Hex, Hex), 168: makeSyscallInfo("getcpu", Hex, Hex, Hex), 169: makeSyscallInfo("gettimeofday", Timeval, Hex), 170: makeSyscallInfo("settimeofday", Timeval, Hex), 171: makeSyscallInfo("adjtimex", Hex), 172: makeSyscallInfo("getpid"), 173: makeSyscallInfo("getppid"), 174: makeSyscallInfo("getuid"), 175: makeSyscallInfo("geteuid"), 176: makeSyscallInfo("getgid"), 177: makeSyscallInfo("getegid"), 178: makeSyscallInfo("gettid"), 179: makeSyscallInfo("sysinfo", Hex), 180: makeSyscallInfo("mq_open", Hex, Hex, Hex, Hex), 181: makeSyscallInfo("mq_unlink", Hex), 182: makeSyscallInfo("mq_timedsend", Hex, Hex, Hex, Hex, Hex), 183: 
makeSyscallInfo("mq_timedreceive", Hex, Hex, Hex, Hex, Hex), 184: makeSyscallInfo("mq_notify", Hex, Hex), 185: makeSyscallInfo("mq_getsetattr", Hex, Hex, Hex), 186: makeSyscallInfo("msgget", Hex, Hex), 187: makeSyscallInfo("msgctl", Hex, Hex, Hex), 188: makeSyscallInfo("msgrcv", Hex, Hex, Hex, Hex, Hex), 189: makeSyscallInfo("msgsnd", Hex, Hex, Hex, Hex), 190: makeSyscallInfo("semget", Hex, Hex, Hex), 191: makeSyscallInfo("semctl", Hex, Hex, Hex, Hex), 192: makeSyscallInfo("semtimedop", Hex, Hex, Hex, Hex), 193: makeSyscallInfo("semop", Hex, Hex, Hex), 194: makeSyscallInfo("shmget", Hex, Hex, Hex), 195: makeSyscallInfo("shmctl", Hex, Hex, Hex), 196: makeSyscallInfo("shmat", Hex, Hex, Hex), 197: makeSyscallInfo("shmdt", Hex), 198: makeSyscallInfo("socket", SockFamily, SockType, SockProtocol), 199: makeSyscallInfo("socketpair", SockFamily, SockType, SockProtocol, Hex), 200: makeSyscallInfo("bind", FD, SockAddr, Hex), 201: makeSyscallInfo("listen", FD, Hex), 202: makeSyscallInfo("accept", FD, PostSockAddr, SockLen), 203: makeSyscallInfo("connect", FD, SockAddr, Hex), 204: makeSyscallInfo("getsockname", FD, PostSockAddr, SockLen), 205: makeSyscallInfo("getpeername", FD, PostSockAddr, SockLen), 206: makeSyscallInfo("sendto", FD, Hex, Hex, Hex, SockAddr, Hex), 207: makeSyscallInfo("recvfrom", FD, Hex, Hex, Hex, PostSockAddr, SockLen), 208: makeSyscallInfo("setsockopt", FD, Hex, Hex, Hex, Hex), 209: makeSyscallInfo("getsockopt", FD, Hex, Hex, Hex, Hex), 210: makeSyscallInfo("shutdown", FD, Hex), 211: makeSyscallInfo("sendmsg", FD, SendMsgHdr, Hex), 212: makeSyscallInfo("recvmsg", FD, RecvMsgHdr, Hex), 213: makeSyscallInfo("readahead", Hex, Hex, Hex), 214: makeSyscallInfo("brk", Hex), 215: makeSyscallInfo("munmap", Hex, Hex), 216: makeSyscallInfo("mremap", Hex, Hex, Hex, Hex, Hex), 217: makeSyscallInfo("add_key", Hex, Hex, Hex, Hex, Hex), 218: makeSyscallInfo("request_key", Hex, Hex, Hex, Hex), 219: makeSyscallInfo("keyctl", Hex, Hex, Hex, Hex, Hex), 220: makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex), 221: makeSyscallInfo("execve", Path, ExecveStringVector, ExecveStringVector), 222: makeSyscallInfo("mmap", Hex, Hex, MmapProt, MmapFlags, FD, Hex), 223: makeSyscallInfo("fadvise64", FD, Hex, Hex, Hex), 224: makeSyscallInfo("swapon", Hex, Hex), 225: makeSyscallInfo("swapoff", Hex), 226: makeSyscallInfo("mprotect", Hex, Hex, Hex), 227: makeSyscallInfo("msync", Hex, Hex, Hex), 228: makeSyscallInfo("mlock", Hex, Hex), 229: makeSyscallInfo("munlock", Hex, Hex), 230: makeSyscallInfo("mlockall", Hex), 231: makeSyscallInfo("munlockall"), 232: makeSyscallInfo("mincore", Hex, Hex, Hex), 233: makeSyscallInfo("madvise", Hex, Hex, Hex), 234: makeSyscallInfo("remap_file_pages", Hex, Hex, Hex, Hex, Hex), 235: makeSyscallInfo("mbind", Hex, Hex, Hex, Hex, Hex, Hex), 236: makeSyscallInfo("get_mempolicy", Hex, Hex, Hex, Hex, Hex), 237: makeSyscallInfo("set_mempolicy", Hex, Hex, Hex), 238: makeSyscallInfo("migrate_pages", Hex, Hex, Hex, Hex), 239: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex), 240: makeSyscallInfo("rt_tgsigqueueinfo", Hex, Hex, Signal, Hex), 241: makeSyscallInfo("perf_event_open", Hex, Hex, Hex, Hex, Hex), 242: makeSyscallInfo("accept4", FD, PostSockAddr, SockLen, SockFlags), 243: makeSyscallInfo("recvmmsg", FD, Hex, Hex, Hex, Hex), 260: makeSyscallInfo("wait4", Hex, Hex, Hex, Rusage), 261: makeSyscallInfo("prlimit64", Hex, Hex, Hex, Hex), 262: makeSyscallInfo("fanotify_init", Hex, Hex), 263: makeSyscallInfo("fanotify_mark", Hex, Hex, Hex, Hex, Hex), 264: 
makeSyscallInfo("name_to_handle_at", FD, Hex, Hex, Hex, Hex), 265: makeSyscallInfo("open_by_handle_at", FD, Hex, Hex), 266: makeSyscallInfo("clock_adjtime", Hex, Hex), 267: makeSyscallInfo("syncfs", FD), 268: makeSyscallInfo("setns", FD, Hex), 269: makeSyscallInfo("sendmmsg", FD, Hex, Hex, Hex), 270: makeSyscallInfo("process_vm_readv", Hex, ReadIOVec, Hex, IOVec, Hex, Hex), 271: makeSyscallInfo("process_vm_writev", Hex, IOVec, Hex, WriteIOVec, Hex, Hex), 272: makeSyscallInfo("kcmp", Hex, Hex, Hex, Hex, Hex), 273: makeSyscallInfo("finit_module", Hex, Hex, Hex), 274: makeSyscallInfo("sched_setattr", Hex, Hex, Hex), 275: makeSyscallInfo("sched_getattr", Hex, Hex, Hex), 276: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex), 277: makeSyscallInfo("seccomp", Hex, Hex, Hex), 278: makeSyscallInfo("getrandom", Hex, Hex, Hex), 279: makeSyscallInfo("memfd_create", Path, Hex), 280: makeSyscallInfo("bpf", Hex, Hex, Hex), 281: makeSyscallInfo("execveat", FD, Path, Hex, Hex, Hex), 282: makeSyscallInfo("userfaultfd", Hex), 283: makeSyscallInfo("membarrier", Hex), 284: makeSyscallInfo("mlock2", Hex, Hex, Hex), 285: makeSyscallInfo("copy_file_range", FD, Hex, FD, Hex, Hex, Hex), 286: makeSyscallInfo("preadv2", FD, ReadIOVec, Hex, Hex, Hex), 287: makeSyscallInfo("pwritev2", FD, WriteIOVec, Hex, Hex, Hex), 291: makeSyscallInfo("statx", FD, Path, Hex, Hex, Hex), 292: makeSyscallInfo("io_pgetevents", Hex, Hex, Hex, Hex, Timespec, SigSet), 293: makeSyscallInfo("rseq", Hex, Hex, Hex, Hex), 424: makeSyscallInfo("pidfd_send_signal", FD, Signal, Hex, Hex), 425: makeSyscallInfo("io_uring_setup", Hex, Hex), 426: makeSyscallInfo("io_uring_enter", FD, Hex, Hex, Hex, SigSet, Hex), 427: makeSyscallInfo("io_uring_register", FD, Hex, Hex, Hex), 428: makeSyscallInfo("open_tree", FD, Path, Hex), 429: makeSyscallInfo("move_mount", FD, Path, FD, Path, Hex), 430: makeSyscallInfo("fsopen", Path, Hex), // Not quite a path, but close. 431: makeSyscallInfo("fsconfig", FD, Hex, Hex, Hex, Hex), 432: makeSyscallInfo("fsmount", FD, Hex, Hex), 433: makeSyscallInfo("fspick", FD, Path, Hex), 434: makeSyscallInfo("pidfd_open", Hex, Hex), 435: makeSyscallInfo("clone3", Hex, Hex), 436: makeSyscallInfo("close_range", FD, FD, CloseRangeFlags), 439: makeSyscallInfo("faccessat2", FD, Path, Oct, Hex), 441: makeSyscallInfo("epoll_pwait2", FD, EpollEvents, Hex, Timespec, SigSet), } func init() { syscallTables = append(syscallTables, syscallTable{ os: abi.Linux, arch: arch.ARM64, syscalls: linuxARM64}) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/mmap.go000066400000000000000000000033651465435605700235210ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package strace import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" ) // ProtectionFlagSet represents the protection to mmap(2). 
var ProtectionFlagSet = abi.FlagSet{ { Flag: linux.PROT_READ, Name: "PROT_READ", }, { Flag: linux.PROT_WRITE, Name: "PROT_WRITE", }, { Flag: linux.PROT_EXEC, Name: "PROT_EXEC", }, } // MmapFlagSet is the set of mmap(2) flags. var MmapFlagSet = abi.FlagSet{ { Flag: linux.MAP_SHARED, Name: "MAP_SHARED", }, { Flag: linux.MAP_PRIVATE, Name: "MAP_PRIVATE", }, { Flag: linux.MAP_FIXED, Name: "MAP_FIXED", }, { Flag: linux.MAP_ANONYMOUS, Name: "MAP_ANONYMOUS", }, { Flag: linux.MAP_GROWSDOWN, Name: "MAP_GROWSDOWN", }, { Flag: linux.MAP_DENYWRITE, Name: "MAP_DENYWRITE", }, { Flag: linux.MAP_EXECUTABLE, Name: "MAP_EXECUTABLE", }, { Flag: linux.MAP_LOCKED, Name: "MAP_LOCKED", }, { Flag: linux.MAP_NORESERVE, Name: "MAP_NORESERVE", }, { Flag: linux.MAP_POPULATE, Name: "MAP_POPULATE", }, { Flag: linux.MAP_NONBLOCK, Name: "MAP_NONBLOCK", }, { Flag: linux.MAP_STACK, Name: "MAP_STACK", }, { Flag: linux.MAP_HUGETLB, Name: "MAP_HUGETLB", }, } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/open.go000066400000000000000000000035371465435605700235310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package strace import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" ) // OpenMode represents the mode to open(2) a file. var OpenMode = abi.ValueSet{ linux.O_RDWR: "O_RDWR", linux.O_WRONLY: "O_WRONLY", linux.O_RDONLY: "O_RDONLY", } // OpenFlagSet is the set of open(2) flags. var OpenFlagSet = abi.FlagSet{ { Flag: linux.O_APPEND, Name: "O_APPEND", }, { Flag: linux.O_ASYNC, Name: "O_ASYNC", }, { Flag: linux.O_CLOEXEC, Name: "O_CLOEXEC", }, { Flag: linux.O_CREAT, Name: "O_CREAT", }, { Flag: linux.O_DIRECT, Name: "O_DIRECT", }, { Flag: linux.O_DIRECTORY, Name: "O_DIRECTORY", }, { Flag: linux.O_EXCL, Name: "O_EXCL", }, { Flag: linux.O_NOATIME, Name: "O_NOATIME", }, { Flag: linux.O_NOCTTY, Name: "O_NOCTTY", }, { Flag: linux.O_NOFOLLOW, Name: "O_NOFOLLOW", }, { Flag: linux.O_NONBLOCK, Name: "O_NONBLOCK", }, { Flag: 0x200000, // O_PATH Name: "O_PATH", }, { Flag: linux.O_SYNC, Name: "O_SYNC", }, { Flag: linux.O_TMPFILE, Name: "O_TMPFILE", }, { Flag: linux.O_TRUNC, Name: "O_TRUNC", }, } func open(val uint64) string { s := OpenMode.Parse(val & linux.O_ACCMODE) if flags := OpenFlagSet.Parse(val &^ linux.O_ACCMODE); flags != "" { s += "|" + flags } return s } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/poll.go000066400000000000000000000043551465435605700235350ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package strace import ( "fmt" "strings" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/hostarch" ) // PollEventSet is the set of poll(2) event flags. var PollEventSet = abi.FlagSet{ {Flag: linux.POLLIN, Name: "POLLIN"}, {Flag: linux.POLLPRI, Name: "POLLPRI"}, {Flag: linux.POLLOUT, Name: "POLLOUT"}, {Flag: linux.POLLERR, Name: "POLLERR"}, {Flag: linux.POLLHUP, Name: "POLLHUP"}, {Flag: linux.POLLNVAL, Name: "POLLNVAL"}, {Flag: linux.POLLRDNORM, Name: "POLLRDNORM"}, {Flag: linux.POLLRDBAND, Name: "POLLRDBAND"}, {Flag: linux.POLLWRNORM, Name: "POLLWRNORM"}, {Flag: linux.POLLWRBAND, Name: "POLLWRBAND"}, {Flag: linux.POLLMSG, Name: "POLLMSG"}, {Flag: linux.POLLREMOVE, Name: "POLLREMOVE"}, {Flag: linux.POLLRDHUP, Name: "POLLRDHUP"}, {Flag: linux.POLLFREE, Name: "POLLFREE"}, {Flag: linux.POLL_BUSY_LOOP, Name: "POLL_BUSY_LOOP"}, } func pollFD(t *kernel.Task, pfd *linux.PollFD, post bool) string { revents := "..." if post { revents = PollEventSet.Parse(uint64(pfd.REvents)) } return fmt.Sprintf("{FD: %s, Events: %s, REvents: %s}", fd(t, pfd.FD), PollEventSet.Parse(uint64(pfd.Events)), revents) } func pollFDs(t *kernel.Task, addr hostarch.Addr, nfds uint, post bool) string { if addr == 0 { return "null" } pfds, err := slinux.CopyInPollFDs(t, addr, nfds) if err != nil { return fmt.Sprintf("%#x (error decoding pollfds: %s)", addr, err) } s := make([]string, 0, len(pfds)) for i := range pfds { s = append(s, pollFD(t, &pfds[i], post)) } return fmt.Sprintf("%#x [%s]", addr, strings.Join(s, ", ")) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/ptrace.go000066400000000000000000000053541465435605700240450ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package strace import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" ) // PtraceRequestSet are the possible ptrace(2) requests. 
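//
// Editorial note (not upstream documentation): PtraceRequestSet is an
// abi.ValueSet, and the PtraceRequest argument format in strace.go resolves
// the first ptrace(2) argument with PtraceRequestSet.Parse(args[0].Uint64()).
// A known request such as PTRACE_ATTACH prints by name, while a request
// number missing from this map is assumed to fall back to ValueSet's numeric
// (hex) formatting.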
var PtraceRequestSet = abi.ValueSet{ linux.PTRACE_TRACEME: "PTRACE_TRACEME", linux.PTRACE_PEEKTEXT: "PTRACE_PEEKTEXT", linux.PTRACE_PEEKDATA: "PTRACE_PEEKDATA", linux.PTRACE_PEEKUSR: "PTRACE_PEEKUSR", linux.PTRACE_POKETEXT: "PTRACE_POKETEXT", linux.PTRACE_POKEDATA: "PTRACE_POKEDATA", linux.PTRACE_POKEUSR: "PTRACE_POKEUSR", linux.PTRACE_CONT: "PTRACE_CONT", linux.PTRACE_KILL: "PTRACE_KILL", linux.PTRACE_SINGLESTEP: "PTRACE_SINGLESTEP", linux.PTRACE_ATTACH: "PTRACE_ATTACH", linux.PTRACE_DETACH: "PTRACE_DETACH", linux.PTRACE_SYSCALL: "PTRACE_SYSCALL", linux.PTRACE_SETOPTIONS: "PTRACE_SETOPTIONS", linux.PTRACE_GETEVENTMSG: "PTRACE_GETEVENTMSG", linux.PTRACE_GETSIGINFO: "PTRACE_GETSIGINFO", linux.PTRACE_SETSIGINFO: "PTRACE_SETSIGINFO", linux.PTRACE_GETREGSET: "PTRACE_GETREGSET", linux.PTRACE_SETREGSET: "PTRACE_SETREGSET", linux.PTRACE_SEIZE: "PTRACE_SEIZE", linux.PTRACE_INTERRUPT: "PTRACE_INTERRUPT", linux.PTRACE_LISTEN: "PTRACE_LISTEN", linux.PTRACE_PEEKSIGINFO: "PTRACE_PEEKSIGINFO", linux.PTRACE_GETSIGMASK: "PTRACE_GETSIGMASK", linux.PTRACE_SETSIGMASK: "PTRACE_SETSIGMASK", linux.PTRACE_GETREGS: "PTRACE_GETREGS", linux.PTRACE_SETREGS: "PTRACE_SETREGS", linux.PTRACE_GETFPREGS: "PTRACE_GETFPREGS", linux.PTRACE_SETFPREGS: "PTRACE_SETFPREGS", linux.PTRACE_GETFPXREGS: "PTRACE_GETFPXREGS", linux.PTRACE_SETFPXREGS: "PTRACE_SETFPXREGS", linux.PTRACE_OLDSETOPTIONS: "PTRACE_OLDSETOPTIONS", linux.PTRACE_GET_THREAD_AREA: "PTRACE_GET_THREAD_AREA", linux.PTRACE_SET_THREAD_AREA: "PTRACE_SET_THREAD_AREA", linux.PTRACE_ARCH_PRCTL: "PTRACE_ARCH_PRCTL", linux.PTRACE_SYSEMU: "PTRACE_SYSEMU", linux.PTRACE_SYSEMU_SINGLESTEP: "PTRACE_SYSEMU_SINGLESTEP", linux.PTRACE_SINGLEBLOCK: "PTRACE_SINGLEBLOCK", } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/select.go000066400000000000000000000027231465435605700240430ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package strace import ( "fmt" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/hostarch" ) func fdsFromSet(t *kernel.Task, set []byte) []int { var fds []int // Append n if the n-th bit is 1. for i, v := range set { for j := 0; j < 8; j++ { if (v>>j)&1 == 1 { fds = append(fds, i*8+j) } } } return fds } func fdSet(t *kernel.Task, nfds int, addr hostarch.Addr) string { if nfds < 0 { return fmt.Sprintf("%#x (negative nfds)", addr) } if addr == 0 { return "null" } // Calculate the size of the fd set (one bit per fd). nBytes := (nfds + 7) / 8 nBitsInLastPartialByte := nfds % 8 set, err := linux.CopyInFDSet(t, addr, nBytes, nBitsInLastPartialByte) if err != nil { return fmt.Sprintf("%#x (error decoding fdset: %s)", addr, err) } return fmt.Sprintf("%#x %v", addr, fdsFromSet(t, set)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/signal.go000066400000000000000000000074161465435605700240450ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package strace import ( "fmt" "strings" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/hostarch" ) // signalNames contains the names of all named signals. var signalNames = abi.ValueSet{ uint64(linux.SIGABRT): "SIGABRT", uint64(linux.SIGALRM): "SIGALRM", uint64(linux.SIGBUS): "SIGBUS", uint64(linux.SIGCHLD): "SIGCHLD", uint64(linux.SIGCONT): "SIGCONT", uint64(linux.SIGFPE): "SIGFPE", uint64(linux.SIGHUP): "SIGHUP", uint64(linux.SIGILL): "SIGILL", uint64(linux.SIGINT): "SIGINT", uint64(linux.SIGIO): "SIGIO", uint64(linux.SIGKILL): "SIGKILL", uint64(linux.SIGPIPE): "SIGPIPE", uint64(linux.SIGPROF): "SIGPROF", uint64(linux.SIGPWR): "SIGPWR", uint64(linux.SIGQUIT): "SIGQUIT", uint64(linux.SIGSEGV): "SIGSEGV", uint64(linux.SIGSTKFLT): "SIGSTKFLT", uint64(linux.SIGSTOP): "SIGSTOP", uint64(linux.SIGSYS): "SIGSYS", uint64(linux.SIGTERM): "SIGTERM", uint64(linux.SIGTRAP): "SIGTRAP", uint64(linux.SIGTSTP): "SIGTSTP", uint64(linux.SIGTTIN): "SIGTTIN", uint64(linux.SIGTTOU): "SIGTTOU", uint64(linux.SIGURG): "SIGURG", uint64(linux.SIGUSR1): "SIGUSR1", uint64(linux.SIGUSR2): "SIGUSR2", uint64(linux.SIGVTALRM): "SIGVTALRM", uint64(linux.SIGWINCH): "SIGWINCH", uint64(linux.SIGXCPU): "SIGXCPU", uint64(linux.SIGXFSZ): "SIGXFSZ", } var signalMaskActions = abi.ValueSet{ linux.SIG_BLOCK: "SIG_BLOCK", linux.SIG_UNBLOCK: "SIG_UNBLOCK", linux.SIG_SETMASK: "SIG_SETMASK", } var sigActionFlags = abi.FlagSet{ { Flag: linux.SA_NOCLDSTOP, Name: "SA_NOCLDSTOP", }, { Flag: linux.SA_NOCLDWAIT, Name: "SA_NOCLDWAIT", }, { Flag: linux.SA_SIGINFO, Name: "SA_SIGINFO", }, { Flag: linux.SA_RESTORER, Name: "SA_RESTORER", }, { Flag: linux.SA_ONSTACK, Name: "SA_ONSTACK", }, { Flag: linux.SA_RESTART, Name: "SA_RESTART", }, { Flag: linux.SA_NODEFER, Name: "SA_NODEFER", }, { Flag: linux.SA_RESETHAND, Name: "SA_RESETHAND", }, } func sigSet(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "null" } var b [linux.SignalSetSize]byte if _, err := t.CopyInBytes(addr, b[:]); err != nil { return fmt.Sprintf("%#x (error copying sigset: %v)", addr, err) } set := linux.SignalSet(hostarch.ByteOrder.Uint64(b[:])) return fmt.Sprintf("%#x %s", addr, formatSigSet(set)) } func formatSigSet(set linux.SignalSet) string { var signals []string linux.ForEachSignal(set, func(sig linux.Signal) { signals = append(signals, signalNames.ParseDecimal(uint64(sig))) }) return fmt.Sprintf("[%v]", strings.Join(signals, " ")) } func sigAction(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "null" } var sa linux.SigAction if _, err := sa.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error copying sigaction: %v)", addr, err) } var handler string switch sa.Handler { case linux.SIG_IGN: handler = "SIG_IGN" case linux.SIG_DFL: handler = "SIG_DFL" default: handler = fmt.Sprintf("%#x", sa.Handler) } return fmt.Sprintf("%#x {Handler: %s, Flags: %s, Restorer: %#x, Mask: %s}", addr, handler, 
sigActionFlags.Parse(sa.Flags), sa.Restorer, formatSigSet(sa.Mask)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/socket.go000066400000000000000000000536731465435605700240660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package strace import ( "fmt" "strings" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" ) // SocketFamily are the possible socket(2) families. var SocketFamily = abi.ValueSet{ linux.AF_UNSPEC: "AF_UNSPEC", linux.AF_UNIX: "AF_UNIX", linux.AF_INET: "AF_INET", linux.AF_AX25: "AF_AX25", linux.AF_IPX: "AF_IPX", linux.AF_APPLETALK: "AF_APPLETALK", linux.AF_NETROM: "AF_NETROM", linux.AF_BRIDGE: "AF_BRIDGE", linux.AF_ATMPVC: "AF_ATMPVC", linux.AF_X25: "AF_X25", linux.AF_INET6: "AF_INET6", linux.AF_ROSE: "AF_ROSE", linux.AF_DECnet: "AF_DECnet", linux.AF_NETBEUI: "AF_NETBEUI", linux.AF_SECURITY: "AF_SECURITY", linux.AF_KEY: "AF_KEY", linux.AF_NETLINK: "AF_NETLINK", linux.AF_PACKET: "AF_PACKET", linux.AF_ASH: "AF_ASH", linux.AF_ECONET: "AF_ECONET", linux.AF_ATMSVC: "AF_ATMSVC", linux.AF_RDS: "AF_RDS", linux.AF_SNA: "AF_SNA", linux.AF_IRDA: "AF_IRDA", linux.AF_PPPOX: "AF_PPPOX", linux.AF_WANPIPE: "AF_WANPIPE", linux.AF_LLC: "AF_LLC", linux.AF_IB: "AF_IB", linux.AF_MPLS: "AF_MPLS", linux.AF_CAN: "AF_CAN", linux.AF_TIPC: "AF_TIPC", linux.AF_BLUETOOTH: "AF_BLUETOOTH", linux.AF_IUCV: "AF_IUCV", linux.AF_RXRPC: "AF_RXRPC", linux.AF_ISDN: "AF_ISDN", linux.AF_PHONET: "AF_PHONET", linux.AF_IEEE802154: "AF_IEEE802154", linux.AF_CAIF: "AF_CAIF", linux.AF_ALG: "AF_ALG", linux.AF_NFC: "AF_NFC", linux.AF_VSOCK: "AF_VSOCK", } // SocketType are the possible socket(2) types. var SocketType = abi.ValueSet{ uint64(linux.SOCK_STREAM): "SOCK_STREAM", uint64(linux.SOCK_DGRAM): "SOCK_DGRAM", uint64(linux.SOCK_RAW): "SOCK_RAW", uint64(linux.SOCK_RDM): "SOCK_RDM", uint64(linux.SOCK_SEQPACKET): "SOCK_SEQPACKET", uint64(linux.SOCK_DCCP): "SOCK_DCCP", uint64(linux.SOCK_PACKET): "SOCK_PACKET", } // SocketFlagSet are the possible socket(2) flags. var SocketFlagSet = abi.FlagSet{ { Flag: linux.SOCK_CLOEXEC, Name: "SOCK_CLOEXEC", }, { Flag: linux.SOCK_NONBLOCK, Name: "SOCK_NONBLOCK", }, } // ipProtocol are the possible socket(2) types for INET and INET6 sockets. 
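//
// Editorial note (not upstream documentation): these names are consulted only
// for the families registered in SocketProtocol below; see sockProtocol in
// this file for the lookup. For example, a socket(AF_INET, SOCK_STREAM,
// IPPROTO_TCP) call would be rendered with "IPPROTO_TCP", while a protocol
// for a family without a registered ValueSet falls back to a hex number.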
var ipProtocol = abi.ValueSet{ linux.IPPROTO_IP: "IPPROTO_IP", linux.IPPROTO_ICMP: "IPPROTO_ICMP", linux.IPPROTO_ICMPV6: "IPPROTO_ICMPV6", linux.IPPROTO_IGMP: "IPPROTO_IGMP", linux.IPPROTO_IPIP: "IPPROTO_IPIP", linux.IPPROTO_TCP: "IPPROTO_TCP", linux.IPPROTO_EGP: "IPPROTO_EGP", linux.IPPROTO_PUP: "IPPROTO_PUP", linux.IPPROTO_UDP: "IPPROTO_UDP", linux.IPPROTO_IDP: "IPPROTO_IDP", linux.IPPROTO_TP: "IPPROTO_TP", linux.IPPROTO_DCCP: "IPPROTO_DCCP", linux.IPPROTO_IPV6: "IPPROTO_IPV6", linux.IPPROTO_RSVP: "IPPROTO_RSVP", linux.IPPROTO_GRE: "IPPROTO_GRE", linux.IPPROTO_ESP: "IPPROTO_ESP", linux.IPPROTO_AH: "IPPROTO_AH", linux.IPPROTO_MTP: "IPPROTO_MTP", linux.IPPROTO_BEETPH: "IPPROTO_BEETPH", linux.IPPROTO_ENCAP: "IPPROTO_ENCAP", linux.IPPROTO_PIM: "IPPROTO_PIM", linux.IPPROTO_COMP: "IPPROTO_COMP", linux.IPPROTO_SCTP: "IPPROTO_SCTP", linux.IPPROTO_UDPLITE: "IPPROTO_UDPLITE", linux.IPPROTO_MPLS: "IPPROTO_MPLS", linux.IPPROTO_RAW: "IPPROTO_RAW", } // SocketProtocol are the possible socket(2) protocols for each protocol family. var SocketProtocol = map[int32]abi.ValueSet{ linux.AF_INET: ipProtocol, linux.AF_INET6: ipProtocol, linux.AF_NETLINK: { linux.NETLINK_ROUTE: "NETLINK_ROUTE", linux.NETLINK_UNUSED: "NETLINK_UNUSED", linux.NETLINK_USERSOCK: "NETLINK_USERSOCK", linux.NETLINK_FIREWALL: "NETLINK_FIREWALL", linux.NETLINK_SOCK_DIAG: "NETLINK_SOCK_DIAG", linux.NETLINK_NFLOG: "NETLINK_NFLOG", linux.NETLINK_XFRM: "NETLINK_XFRM", linux.NETLINK_SELINUX: "NETLINK_SELINUX", linux.NETLINK_ISCSI: "NETLINK_ISCSI", linux.NETLINK_AUDIT: "NETLINK_AUDIT", linux.NETLINK_FIB_LOOKUP: "NETLINK_FIB_LOOKUP", linux.NETLINK_CONNECTOR: "NETLINK_CONNECTOR", linux.NETLINK_NETFILTER: "NETLINK_NETFILTER", linux.NETLINK_IP6_FW: "NETLINK_IP6_FW", linux.NETLINK_DNRTMSG: "NETLINK_DNRTMSG", linux.NETLINK_KOBJECT_UEVENT: "NETLINK_KOBJECT_UEVENT", linux.NETLINK_GENERIC: "NETLINK_GENERIC", linux.NETLINK_SCSITRANSPORT: "NETLINK_SCSITRANSPORT", linux.NETLINK_ECRYPTFS: "NETLINK_ECRYPTFS", linux.NETLINK_RDMA: "NETLINK_RDMA", linux.NETLINK_CRYPTO: "NETLINK_CRYPTO", }, } var controlMessageType = map[int32]string{ linux.SCM_RIGHTS: "SCM_RIGHTS", linux.SCM_CREDENTIALS: "SCM_CREDENTIALS", linux.SO_TIMESTAMP: "SO_TIMESTAMP", } func unmarshalControlMessageRights(src []byte) []primitive.Int32 { count := len(src) / linux.SizeOfControlMessageRight cmr := make([]primitive.Int32, count) primitive.UnmarshalUnsafeInt32Slice(cmr, src) return cmr } func cmsghdr(t *kernel.Task, addr hostarch.Addr, length uint64, maxBytes uint64) string { if length > maxBytes { return fmt.Sprintf("%#x (error decoding control: invalid length (%d))", addr, length) } buf := make([]byte, length) if _, err := t.CopyInBytes(addr, buf); err != nil { return fmt.Sprintf("%#x (error decoding control: %v)", addr, err) } var strs []string for len(buf) > 0 { if linux.SizeOfControlMessageHeader > len(buf) { strs = append(strs, "{invalid control message (too short)}") break } var h linux.ControlMessageHeader buf = h.UnmarshalUnsafe(buf) var skipData bool level := "SOL_SOCKET" if h.Level != linux.SOL_SOCKET { skipData = true level = fmt.Sprint(h.Level) } typ, ok := controlMessageType[h.Type] if !ok { skipData = true typ = fmt.Sprint(h.Type) } width := t.Arch().Width() length := int(h.Length) - linux.SizeOfControlMessageHeader if length > len(buf) { strs = append(strs, fmt.Sprintf( "{level=%s, type=%s, length=%d, content extends beyond buffer}", level, typ, h.Length, )) break } if length < 0 { strs = append(strs, fmt.Sprintf( "{level=%s, type=%s, length=%d, content too short}", level, 
typ, h.Length, )) break } if skipData { strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length)) } else { switch h.Type { case linux.SCM_RIGHTS: rightsSize := bits.AlignDown(length, linux.SizeOfControlMessageRight) fds := unmarshalControlMessageRights(buf[:rightsSize]) rights := make([]string, 0, len(fds)) for _, fd := range fds { rights = append(rights, fmt.Sprint(fd)) } strs = append(strs, fmt.Sprintf( "{level=%s, type=%s, length=%d, content: %s}", level, typ, h.Length, strings.Join(rights, ","), )) case linux.SCM_CREDENTIALS: if length < linux.SizeOfControlMessageCredentials { strs = append(strs, fmt.Sprintf( "{level=%s, type=%s, length=%d, content too short}", level, typ, h.Length, )) break } var creds linux.ControlMessageCredentials creds.UnmarshalUnsafe(buf) strs = append(strs, fmt.Sprintf( "{level=%s, type=%s, length=%d, pid: %d, uid: %d, gid: %d}", level, typ, h.Length, creds.PID, creds.UID, creds.GID, )) case linux.SO_TIMESTAMP: if length < linux.SizeOfTimeval { strs = append(strs, fmt.Sprintf( "{level=%s, type=%s, length=%d, content too short}", level, typ, h.Length, )) break } var tv linux.Timeval tv.UnmarshalUnsafe(buf) strs = append(strs, fmt.Sprintf( "{level=%s, type=%s, length=%d, Sec: %d, Usec: %d}", level, typ, h.Length, tv.Sec, tv.Usec, )) default: panic("unreachable") } } if shift := bits.AlignUp(length, width); shift > len(buf) { buf = buf[:0] } else { buf = buf[shift:] } } return fmt.Sprintf("%#x %s", addr, strings.Join(strs, ", ")) } func msghdr(t *kernel.Task, addr hostarch.Addr, printContent bool, maxBytes uint64) string { var msg slinux.MessageHeader64 if _, err := msg.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding msghdr: %v)", addr, err) } s := fmt.Sprintf( "%#x {name=%#x, namelen=%d, iovecs=%s", addr, msg.Name, msg.NameLen, iovecs(t, hostarch.Addr(msg.Iov), int(msg.IovLen), printContent, maxBytes), ) if printContent { s = fmt.Sprintf("%s, control={%s}", s, cmsghdr(t, hostarch.Addr(msg.Control), msg.ControlLen, maxBytes)) } else { s = fmt.Sprintf("%s, control=%#x, control_len=%d", s, msg.Control, msg.ControlLen) } return fmt.Sprintf("%s, flags=%d}", s, msg.Flags) } func sockAddr(t *kernel.Task, addr hostarch.Addr, length uint32) string { if addr == 0 { return "null" } b, err := slinux.CaptureAddress(t, addr, length) if err != nil { return fmt.Sprintf("%#x {error reading address: %v}", addr, err) } // Extract address family. 
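	// Editorial note: the leading two bytes of every struct sockaddr are the
	// sa_family_t field, which the code below reads with
	// hostarch.ByteOrder.Uint16; buffers shorter than that are reported as
	// "address too short" rather than decoded.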
if len(b) < 2 { return fmt.Sprintf("%#x {address too short: %d bytes}", addr, len(b)) } family := hostarch.ByteOrder.Uint16(b) familyStr := SocketFamily.Parse(uint64(family)) switch family { case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX: fa, _, err := socket.AddressAndFamily(b) if err != nil { return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err) } if family == linux.AF_UNIX { return fmt.Sprintf("%#x {Family: %s, Addr: %q}", addr, familyStr, string(fa.Addr.AsSlice())) } return fmt.Sprintf("%#x {Family: %s, Addr: %v, Port: %d}", addr, familyStr, fa.Addr, fa.Port) case linux.AF_NETLINK: sa, err := netlink.ExtractSockAddr(b) if err != nil { return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err) } return fmt.Sprintf("%#x {Family: %s, PortID: %d, Groups: %d}", addr, familyStr, sa.PortID, sa.Groups) default: return fmt.Sprintf("%#x {Family: %s, family addr format unknown}", addr, familyStr) } } func postSockAddr(t *kernel.Task, addr hostarch.Addr, lengthPtr hostarch.Addr) string { if addr == 0 { return "null" } if lengthPtr == 0 { return fmt.Sprintf("%#x {length null}", addr) } l, err := copySockLen(t, lengthPtr) if err != nil { return fmt.Sprintf("%#x {error reading length: %v}", addr, err) } return sockAddr(t, addr, l) } func copySockLen(t *kernel.Task, addr hostarch.Addr) (uint32, error) { // socklen_t is 32-bits. var l primitive.Uint32 _, err := l.CopyIn(t, addr) return uint32(l), err } func sockLenPointer(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "null" } l, err := copySockLen(t, addr) if err != nil { return fmt.Sprintf("%#x {error reading length: %v}", addr, err) } return fmt.Sprintf("%#x {length=%v}", addr, l) } func sockType(stype int32) string { s := SocketType.Parse(uint64(stype & linux.SOCK_TYPE_MASK)) if flags := SocketFlagSet.Parse(uint64(stype &^ linux.SOCK_TYPE_MASK)); flags != "" { s += "|" + flags } return s } func sockProtocol(family, protocol int32) string { protocols, ok := SocketProtocol[family] if !ok { return fmt.Sprintf("%#x", protocol) } return protocols.Parse(uint64(protocol)) } func sockFlags(flags int32) string { if flags == 0 { return "0" } return SocketFlagSet.Parse(uint64(flags)) } func getSockOptVal(t *kernel.Task, level, optname uint64, optVal hostarch.Addr, optLen hostarch.Addr, maximumBlobSize uint, rval uintptr) string { if int(rval) < 0 { return hexNum(uint64(optVal)) } if optVal == 0 { return "null" } l, err := copySockLen(t, optLen) if err != nil { return fmt.Sprintf("%#x {error reading length: %v}", optLen, err) } return sockOptVal(t, level, optname, optVal, uint64(l), maximumBlobSize) } func sockOptVal(t *kernel.Task, level, optname uint64, optVal hostarch.Addr, optLen uint64, maximumBlobSize uint) string { switch optLen { case 1: var v primitive.Uint8 _, err := v.CopyIn(t, optVal) if err != nil { return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) } return fmt.Sprintf("%#x {value=%v}", optVal, v) case 2: var v primitive.Uint16 _, err := v.CopyIn(t, optVal) if err != nil { return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) } return fmt.Sprintf("%#x {value=%v}", optVal, v) case 4: var v primitive.Uint32 _, err := v.CopyIn(t, optVal) if err != nil { return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) } return fmt.Sprintf("%#x {value=%v}", optVal, v) default: return dump(t, optVal, uint(optLen), maximumBlobSize, true /* content */) } } var sockOptLevels = abi.ValueSet{ linux.SOL_IP: "SOL_IP", linux.SOL_SOCKET: 
"SOL_SOCKET", linux.SOL_TCP: "SOL_TCP", linux.SOL_UDP: "SOL_UDP", linux.SOL_IPV6: "SOL_IPV6", linux.SOL_ICMPV6: "SOL_ICMPV6", linux.SOL_RAW: "SOL_RAW", linux.SOL_PACKET: "SOL_PACKET", linux.SOL_NETLINK: "SOL_NETLINK", } var sockOptNames = map[uint64]abi.ValueSet{ linux.SOL_IP: { linux.IP_TTL: "IP_TTL", linux.IP_MULTICAST_TTL: "IP_MULTICAST_TTL", linux.IP_MULTICAST_IF: "IP_MULTICAST_IF", linux.IP_MULTICAST_LOOP: "IP_MULTICAST_LOOP", linux.IP_TOS: "IP_TOS", linux.IP_RECVTOS: "IP_RECVTOS", linux.IPT_SO_GET_INFO: "IPT_SO_GET_INFO", linux.IPT_SO_GET_ENTRIES: "IPT_SO_GET_ENTRIES", linux.IP_ADD_MEMBERSHIP: "IP_ADD_MEMBERSHIP", linux.IP_DROP_MEMBERSHIP: "IP_DROP_MEMBERSHIP", linux.MCAST_JOIN_GROUP: "MCAST_JOIN_GROUP", linux.IP_ADD_SOURCE_MEMBERSHIP: "IP_ADD_SOURCE_MEMBERSHIP", linux.IP_BIND_ADDRESS_NO_PORT: "IP_BIND_ADDRESS_NO_PORT", linux.IP_BLOCK_SOURCE: "IP_BLOCK_SOURCE", linux.IP_CHECKSUM: "IP_CHECKSUM", linux.IP_DROP_SOURCE_MEMBERSHIP: "IP_DROP_SOURCE_MEMBERSHIP", linux.IP_FREEBIND: "IP_FREEBIND", linux.IP_HDRINCL: "IP_HDRINCL", linux.IP_IPSEC_POLICY: "IP_IPSEC_POLICY", linux.IP_MINTTL: "IP_MINTTL", linux.IP_MSFILTER: "IP_MSFILTER", linux.IP_MTU_DISCOVER: "IP_MTU_DISCOVER", linux.IP_MULTICAST_ALL: "IP_MULTICAST_ALL", linux.IP_NODEFRAG: "IP_NODEFRAG", linux.IP_OPTIONS: "IP_OPTIONS", linux.IP_PASSSEC: "IP_PASSSEC", linux.IP_PKTINFO: "IP_PKTINFO", linux.IP_RECVERR: "IP_RECVERR", linux.IP_RECVFRAGSIZE: "IP_RECVFRAGSIZE", linux.IP_RECVOPTS: "IP_RECVOPTS", linux.IP_RECVORIGDSTADDR: "IP_RECVORIGDSTADDR", linux.IP_RECVTTL: "IP_RECVTTL", linux.IP_RETOPTS: "IP_RETOPTS", linux.IP_TRANSPARENT: "IP_TRANSPARENT", linux.IP_UNBLOCK_SOURCE: "IP_UNBLOCK_SOURCE", linux.IP_UNICAST_IF: "IP_UNICAST_IF", linux.IP_XFRM_POLICY: "IP_XFRM_POLICY", linux.MCAST_BLOCK_SOURCE: "MCAST_BLOCK_SOURCE", linux.MCAST_JOIN_SOURCE_GROUP: "MCAST_JOIN_SOURCE_GROUP", linux.MCAST_LEAVE_GROUP: "MCAST_LEAVE_GROUP", linux.MCAST_LEAVE_SOURCE_GROUP: "MCAST_LEAVE_SOURCE_GROUP", linux.MCAST_MSFILTER: "MCAST_MSFILTER", linux.MCAST_UNBLOCK_SOURCE: "MCAST_UNBLOCK_SOURCE", linux.IP_ROUTER_ALERT: "IP_ROUTER_ALERT", linux.IP_PKTOPTIONS: "IP_PKTOPTIONS", linux.IP_MTU: "IP_MTU", linux.SO_ORIGINAL_DST: "SO_ORIGINAL_DST", }, linux.SOL_SOCKET: { linux.SO_ERROR: "SO_ERROR", linux.SO_PEERCRED: "SO_PEERCRED", linux.SO_PASSCRED: "SO_PASSCRED", linux.SO_SNDBUF: "SO_SNDBUF", linux.SO_RCVBUF: "SO_RCVBUF", linux.SO_REUSEADDR: "SO_REUSEADDR", linux.SO_REUSEPORT: "SO_REUSEPORT", linux.SO_BINDTODEVICE: "SO_BINDTODEVICE", linux.SO_BROADCAST: "SO_BROADCAST", linux.SO_KEEPALIVE: "SO_KEEPALIVE", linux.SO_LINGER: "SO_LINGER", linux.SO_SNDTIMEO: "SO_SNDTIMEO", linux.SO_RCVTIMEO: "SO_RCVTIMEO", linux.SO_OOBINLINE: "SO_OOBINLINE", linux.SO_TIMESTAMP: "SO_TIMESTAMP", linux.SO_ACCEPTCONN: "SO_ACCEPTCONN", }, linux.SOL_TCP: { linux.TCP_NODELAY: "TCP_NODELAY", linux.TCP_CORK: "TCP_CORK", linux.TCP_QUICKACK: "TCP_QUICKACK", linux.TCP_MAXSEG: "TCP_MAXSEG", linux.TCP_KEEPIDLE: "TCP_KEEPIDLE", linux.TCP_KEEPINTVL: "TCP_KEEPINTVL", linux.TCP_USER_TIMEOUT: "TCP_USER_TIMEOUT", linux.TCP_INFO: "TCP_INFO", linux.TCP_CC_INFO: "TCP_CC_INFO", linux.TCP_NOTSENT_LOWAT: "TCP_NOTSENT_LOWAT", linux.TCP_ZEROCOPY_RECEIVE: "TCP_ZEROCOPY_RECEIVE", linux.TCP_CONGESTION: "TCP_CONGESTION", linux.TCP_LINGER2: "TCP_LINGER2", linux.TCP_DEFER_ACCEPT: "TCP_DEFER_ACCEPT", linux.TCP_REPAIR_OPTIONS: "TCP_REPAIR_OPTIONS", linux.TCP_INQ: "TCP_INQ", linux.TCP_FASTOPEN: "TCP_FASTOPEN", linux.TCP_FASTOPEN_CONNECT: "TCP_FASTOPEN_CONNECT", linux.TCP_FASTOPEN_KEY: "TCP_FASTOPEN_KEY", linux.TCP_FASTOPEN_NO_COOKIE: 
"TCP_FASTOPEN_NO_COOKIE", linux.TCP_KEEPCNT: "TCP_KEEPCNT", linux.TCP_QUEUE_SEQ: "TCP_QUEUE_SEQ", linux.TCP_REPAIR: "TCP_REPAIR", linux.TCP_REPAIR_QUEUE: "TCP_REPAIR_QUEUE", linux.TCP_REPAIR_WINDOW: "TCP_REPAIR_WINDOW", linux.TCP_SAVED_SYN: "TCP_SAVED_SYN", linux.TCP_SAVE_SYN: "TCP_SAVE_SYN", linux.TCP_SYNCNT: "TCP_SYNCNT", linux.TCP_THIN_DUPACK: "TCP_THIN_DUPACK", linux.TCP_THIN_LINEAR_TIMEOUTS: "TCP_THIN_LINEAR_TIMEOUTS", linux.TCP_TIMESTAMP: "TCP_TIMESTAMP", linux.TCP_ULP: "TCP_ULP", linux.TCP_WINDOW_CLAMP: "TCP_WINDOW_CLAMP", }, linux.SOL_IPV6: { linux.IPV6_V6ONLY: "IPV6_V6ONLY", linux.IPV6_PATHMTU: "IPV6_PATHMTU", linux.IPV6_TCLASS: "IPV6_TCLASS", linux.IPV6_ADD_MEMBERSHIP: "IPV6_ADD_MEMBERSHIP", linux.IPV6_DROP_MEMBERSHIP: "IPV6_DROP_MEMBERSHIP", linux.IPV6_IPSEC_POLICY: "IPV6_IPSEC_POLICY", linux.IPV6_JOIN_ANYCAST: "IPV6_JOIN_ANYCAST", linux.IPV6_LEAVE_ANYCAST: "IPV6_LEAVE_ANYCAST", linux.IPV6_PKTINFO: "IPV6_PKTINFO", linux.IPV6_ROUTER_ALERT: "IPV6_ROUTER_ALERT", linux.IPV6_XFRM_POLICY: "IPV6_XFRM_POLICY", linux.MCAST_BLOCK_SOURCE: "MCAST_BLOCK_SOURCE", linux.MCAST_JOIN_GROUP: "MCAST_JOIN_GROUP", linux.MCAST_JOIN_SOURCE_GROUP: "MCAST_JOIN_SOURCE_GROUP", linux.MCAST_LEAVE_GROUP: "MCAST_LEAVE_GROUP", linux.MCAST_LEAVE_SOURCE_GROUP: "MCAST_LEAVE_SOURCE_GROUP", linux.MCAST_UNBLOCK_SOURCE: "MCAST_UNBLOCK_SOURCE", linux.IPV6_2292DSTOPTS: "IPV6_2292DSTOPTS", linux.IPV6_2292HOPLIMIT: "IPV6_2292HOPLIMIT", linux.IPV6_2292HOPOPTS: "IPV6_2292HOPOPTS", linux.IPV6_2292PKTINFO: "IPV6_2292PKTINFO", linux.IPV6_2292PKTOPTIONS: "IPV6_2292PKTOPTIONS", linux.IPV6_2292RTHDR: "IPV6_2292RTHDR", linux.IPV6_ADDR_PREFERENCES: "IPV6_ADDR_PREFERENCES", linux.IPV6_AUTOFLOWLABEL: "IPV6_AUTOFLOWLABEL", linux.IPV6_DONTFRAG: "IPV6_DONTFRAG", linux.IPV6_DSTOPTS: "IPV6_DSTOPTS", linux.IPV6_FLOWINFO: "IPV6_FLOWINFO", linux.IPV6_FLOWINFO_SEND: "IPV6_FLOWINFO_SEND", linux.IPV6_FLOWLABEL_MGR: "IPV6_FLOWLABEL_MGR", linux.IPV6_FREEBIND: "IPV6_FREEBIND", linux.IPV6_HOPOPTS: "IPV6_HOPOPTS", linux.IPV6_MINHOPCOUNT: "IPV6_MINHOPCOUNT", linux.IPV6_MTU: "IPV6_MTU", linux.IPV6_MTU_DISCOVER: "IPV6_MTU_DISCOVER", linux.IPV6_MULTICAST_ALL: "IPV6_MULTICAST_ALL", linux.IPV6_MULTICAST_HOPS: "IPV6_MULTICAST_HOPS", linux.IPV6_MULTICAST_IF: "IPV6_MULTICAST_IF", linux.IPV6_MULTICAST_LOOP: "IPV6_MULTICAST_LOOP", linux.IPV6_RECVDSTOPTS: "IPV6_RECVDSTOPTS", linux.IPV6_RECVERR: "IPV6_RECVERR", linux.IPV6_RECVFRAGSIZE: "IPV6_RECVFRAGSIZE", linux.IPV6_RECVHOPLIMIT: "IPV6_RECVHOPLIMIT", linux.IPV6_RECVHOPOPTS: "IPV6_RECVHOPOPTS", linux.IPV6_RECVORIGDSTADDR: "IPV6_RECVORIGDSTADDR", linux.IPV6_RECVPATHMTU: "IPV6_RECVPATHMTU", linux.IPV6_RECVPKTINFO: "IPV6_RECVPKTINFO", linux.IPV6_RECVRTHDR: "IPV6_RECVRTHDR", linux.IPV6_RECVTCLASS: "IPV6_RECVTCLASS", linux.IPV6_RTHDR: "IPV6_RTHDR", linux.IPV6_RTHDRDSTOPTS: "IPV6_RTHDRDSTOPTS", linux.IPV6_TRANSPARENT: "IPV6_TRANSPARENT", linux.IPV6_UNICAST_HOPS: "IPV6_UNICAST_HOPS", linux.IPV6_UNICAST_IF: "IPV6_UNICAST_IF", linux.MCAST_MSFILTER: "MCAST_MSFILTER", linux.IPV6_ADDRFORM: "IPV6_ADDRFORM", linux.IP6T_SO_GET_INFO: "IP6T_SO_GET_INFO", linux.IP6T_SO_GET_ENTRIES: "IP6T_SO_GET_ENTRIES", }, linux.SOL_NETLINK: { linux.NETLINK_BROADCAST_ERROR: "NETLINK_BROADCAST_ERROR", linux.NETLINK_CAP_ACK: "NETLINK_CAP_ACK", linux.NETLINK_DUMP_STRICT_CHK: "NETLINK_DUMP_STRICT_CHK", linux.NETLINK_EXT_ACK: "NETLINK_EXT_ACK", linux.NETLINK_LIST_MEMBERSHIPS: "NETLINK_LIST_MEMBERSHIPS", linux.NETLINK_NO_ENOBUFS: "NETLINK_NO_ENOBUFS", linux.NETLINK_PKTINFO: "NETLINK_PKTINFO", }, } 
golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/strace.go000066400000000000000000000637041465435605700240530ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package strace implements the logic to print out the input and the return value // of each traced syscall. package strace import ( "fmt" "strconv" "strings" "time" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" pb "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/hostarch" ) // DefaultLogMaximumSize is the default LogMaximumSize. const DefaultLogMaximumSize = 1024 // LogMaximumSize determines the maximum display size for data blobs (read, // write, etc.). var LogMaximumSize uint = DefaultLogMaximumSize // EventMaximumSize determines the maximum size for data blobs (read, write, // etc.) sent over the event channel. Default is 0 because most clients cannot // do anything useful with binary text dump of byte array arguments. var EventMaximumSize uint // LogAppDataAllowed is set to true when printing application data in strace // logs is allowed. var LogAppDataAllowed = true // ItimerTypes are the possible itimer types. var ItimerTypes = abi.ValueSet{ linux.ITIMER_REAL: "ITIMER_REAL", linux.ITIMER_VIRTUAL: "ITIMER_VIRTUAL", linux.ITIMER_PROF: "ITIMER_PROF", } func hexNum(num uint64) string { return "0x" + strconv.FormatUint(num, 16) } func hexArg(arg arch.SyscallArgument) string { return hexNum(arg.Uint64()) } func iovecs(t *kernel.Task, addr hostarch.Addr, iovcnt int, printContent bool, maxBytes uint64) string { if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { return fmt.Sprintf("%#x (error decoding iovecs: invalid iovcnt)", addr) } ars, err := t.CopyInIovecs(addr, iovcnt) if err != nil { return fmt.Sprintf("%#x (error decoding iovecs: %v)", addr, err) } var totalBytes uint64 var truncated bool iovs := make([]string, iovcnt) for i := 0; !ars.IsEmpty(); i, ars = i+1, ars.Tail() { ar := ars.Head() if ar.Length() == 0 || !printContent { iovs[i] = fmt.Sprintf("{base=%#x, len=%d}", ar.Start, ar.Length()) continue } size := uint64(ar.Length()) if truncated || totalBytes+size > maxBytes { truncated = true size = maxBytes - totalBytes } else { totalBytes += uint64(ar.Length()) } b := make([]byte, size) amt, err := t.CopyInBytes(ar.Start, b) if err != nil { iovs[i] = fmt.Sprintf("{base=%#x, len=%d, %q..., error decoding string: %v}", ar.Start, ar.Length(), b[:amt], err) continue } dot := "" if truncated { // Indicate truncation. dot = "..." 
} iovs[i] = fmt.Sprintf("{base=%#x, len=%d, %q%s}", ar.Start, ar.Length(), b[:amt], dot) } return fmt.Sprintf("%#x %s", addr, strings.Join(iovs, ", ")) } func dump(t *kernel.Task, addr hostarch.Addr, size uint, maximumBlobSize uint, printContent bool) string { if !printContent { return fmt.Sprintf("{base=%#x, len=%d}", addr, size) } origSize := size if size > maximumBlobSize { size = maximumBlobSize } if size == 0 { return "" } b := make([]byte, size) amt, err := t.CopyInBytes(addr, b) if err != nil { return fmt.Sprintf("%#x (error decoding string: %s)", addr, err) } dot := "" if uint(amt) < origSize { // ... if we truncated the dump. dot = "..." } return fmt.Sprintf("%#x %q%s", addr, b[:amt], dot) } func path(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "" } path, err := t.CopyInString(addr, linux.PATH_MAX) if err != nil { return fmt.Sprintf("%#x (error decoding path: %s)", addr, err) } return fmt.Sprintf("%#x %s", addr, path) } func fd(t *kernel.Task, fd int32) string { root := t.FSContext().RootDirectory() defer root.DecRef(t) vfsObj := root.Mount().Filesystem().VirtualFilesystem() if fd == linux.AT_FDCWD { wd := t.FSContext().WorkingDirectory() defer wd.DecRef(t) name, _ := vfsObj.PathnameWithDeleted(t, root, wd) return fmt.Sprintf("AT_FDCWD %s", name) } file := t.GetFile(fd) if file == nil { // Cast FD to uint64 to avoid printing negative hex. return fmt.Sprintf("%#x (bad FD)", uint64(fd)) } defer file.DecRef(t) name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry()) return fmt.Sprintf("%#x %s", fd, name) } func fdpair(t *kernel.Task, addr hostarch.Addr) string { var fds [2]int32 _, err := primitive.CopyInt32SliceIn(t, addr, fds[:]) if err != nil { return fmt.Sprintf("%#x (error decoding fds: %s)", addr, err) } return fmt.Sprintf("%#x [%d %d]", addr, fds[0], fds[1]) } func uname(t *kernel.Task, addr hostarch.Addr) string { var u linux.UtsName if _, err := u.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding utsname: %s)", addr, err) } return fmt.Sprintf("%#x %s", addr, u) } func utimensTimespec(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "null" } var tim linux.Timespec if _, err := tim.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err) } var ns string switch tim.Nsec { case linux.UTIME_NOW: ns = "UTIME_NOW" case linux.UTIME_OMIT: ns = "UTIME_OMIT" default: ns = fmt.Sprintf("%v", tim.Nsec) } return fmt.Sprintf("%#x {sec=%v nsec=%s}", addr, tim.Sec, ns) } func timespec(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "null" } var tim linux.Timespec if _, err := tim.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err) } return fmt.Sprintf("%#x {sec=%v nsec=%v}", addr, tim.Sec, tim.Nsec) } func timeval(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "null" } var tim linux.Timeval if _, err := tim.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding timeval: %s)", addr, err) } return fmt.Sprintf("%#x {sec=%v usec=%v}", addr, tim.Sec, tim.Usec) } func utimbuf(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "null" } var utim linux.Utime if _, err := utim.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding utimbuf: %s)", addr, err) } return fmt.Sprintf("%#x {actime=%v, modtime=%v}", addr, utim.Actime, utim.Modtime) } func stat(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "null" } var stat linux.Stat if _, err := 
stat.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding stat: %s)", addr, err) } return fmt.Sprintf("%#x {dev=%d, ino=%d, mode=%s, nlink=%d, uid=%d, gid=%d, rdev=%d, size=%d, blksize=%d, blocks=%d, atime=%s, mtime=%s, ctime=%s}", addr, stat.Dev, stat.Ino, linux.FileMode(stat.Mode), stat.Nlink, stat.UID, stat.GID, stat.Rdev, stat.Size, stat.Blksize, stat.Blocks, time.Unix(stat.ATime.Sec, stat.ATime.Nsec), time.Unix(stat.MTime.Sec, stat.MTime.Nsec), time.Unix(stat.CTime.Sec, stat.CTime.Nsec)) } func itimerval(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "null" } interval := timeval(t, addr) value := timeval(t, addr+hostarch.Addr((*linux.Timeval)(nil).SizeBytes())) return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value) } func itimerspec(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "null" } interval := timespec(t, addr) value := timespec(t, addr+hostarch.Addr((*linux.Timespec)(nil).SizeBytes())) return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value) } func stringVector(t *kernel.Task, addr hostarch.Addr) string { vec, err := t.CopyInVector(addr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize) if err != nil { return fmt.Sprintf("%#x {error copying vector: %v}", addr, err) } s := fmt.Sprintf("%#x [", addr) for i, v := range vec { if i != 0 { s += ", " } s += fmt.Sprintf("%q", v) } s += "]" return s } func rusage(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "null" } var ru linux.Rusage if _, err := ru.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding rusage: %s)", addr, err) } return fmt.Sprintf("%#x %+v", addr, ru) } func capHeader(t *kernel.Task, addr hostarch.Addr) string { if addr == 0 { return "null" } var hdr linux.CapUserHeader if _, err := hdr.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding header: %s)", addr, err) } var version string switch hdr.Version { case linux.LINUX_CAPABILITY_VERSION_1: version = "1" case linux.LINUX_CAPABILITY_VERSION_2: version = "2" case linux.LINUX_CAPABILITY_VERSION_3: version = "3" default: version = strconv.FormatUint(uint64(hdr.Version), 16) } return fmt.Sprintf("%#x {Version: %s, Pid: %d}", addr, version, hdr.Pid) } func capData(t *kernel.Task, hdrAddr, dataAddr hostarch.Addr) string { if dataAddr == 0 { return "null" } var hdr linux.CapUserHeader if _, err := hdr.CopyIn(t, hdrAddr); err != nil { return fmt.Sprintf("%#x (error decoding header: %v)", dataAddr, err) } var p, i, e uint64 switch hdr.Version { case linux.LINUX_CAPABILITY_VERSION_1: var data linux.CapUserData if _, err := data.CopyIn(t, dataAddr); err != nil { return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err) } p = uint64(data.Permitted) i = uint64(data.Inheritable) e = uint64(data.Effective) case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: var data [2]linux.CapUserData if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil { return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err) } p = uint64(data[0].Permitted) | (uint64(data[1].Permitted) << 32) i = uint64(data[0].Inheritable) | (uint64(data[1].Inheritable) << 32) e = uint64(data[0].Effective) | (uint64(data[1].Effective) << 32) default: return fmt.Sprintf("%#x (unknown version %d)", dataAddr, hdr.Version) } return fmt.Sprintf("%#x {Permitted: %s, Inheritable: %s, Effective: %s}", dataAddr, CapabilityBitset.Parse(p), CapabilityBitset.Parse(i), CapabilityBitset.Parse(e)) } // pre fills in the pre-execution 
arguments for a system call. If an argument // cannot be interpreted before the system call is executed, then a hex value // will be used. Note that a full output slice will always be provided, that is // len(return) == len(args). func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlobSize uint) []string { var output []string for arg := range args { if arg >= len(i.format) { break } switch i.format[arg] { case FD: output = append(output, fd(t, args[arg].Int())) case WriteBuffer: output = append(output, dump(t, args[arg].Pointer(), args[arg+1].SizeT(), maximumBlobSize, LogAppDataAllowed /* content */)) case WriteIOVec: output = append(output, iovecs(t, args[arg].Pointer(), int(args[arg+1].Int()), LogAppDataAllowed /* content */, uint64(maximumBlobSize))) case ReadIOVec, IOVec: output = append(output, iovecs(t, args[arg].Pointer(), int(args[arg+1].Int()), false /* content */, uint64(maximumBlobSize))) case SendMsgHdr: output = append(output, msghdr(t, args[arg].Pointer(), LogAppDataAllowed /* content */, uint64(maximumBlobSize))) case RecvMsgHdr: output = append(output, msghdr(t, args[arg].Pointer(), false /* content */, uint64(maximumBlobSize))) case Path: output = append(output, path(t, args[arg].Pointer())) case ExecveStringVector: output = append(output, stringVector(t, args[arg].Pointer())) case SetSockOptVal: output = append(output, sockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Uint64() /* optLen */, maximumBlobSize)) case SockOptLevel: output = append(output, sockOptLevels.Parse(args[arg].Uint64())) case SockOptName: output = append(output, sockOptNames[args[arg-1].Uint64() /* level */].Parse(args[arg].Uint64())) case SockAddr: output = append(output, sockAddr(t, args[arg].Pointer(), uint32(args[arg+1].Uint64()))) case SockLen: output = append(output, sockLenPointer(t, args[arg].Pointer())) case SockFamily: output = append(output, SocketFamily.Parse(uint64(args[arg].Int()))) case SockType: output = append(output, sockType(args[arg].Int())) case SockProtocol: output = append(output, sockProtocol(args[arg-2].Int(), args[arg].Int())) case SockFlags: output = append(output, sockFlags(args[arg].Int())) case Timespec: output = append(output, timespec(t, args[arg].Pointer())) case UTimeTimespec: output = append(output, utimensTimespec(t, args[arg].Pointer())) case ItimerVal: output = append(output, itimerval(t, args[arg].Pointer())) case ItimerSpec: output = append(output, itimerspec(t, args[arg].Pointer())) case Timeval: output = append(output, timeval(t, args[arg].Pointer())) case Utimbuf: output = append(output, utimbuf(t, args[arg].Pointer())) case CloneFlags: output = append(output, CloneFlagSet.Parse(uint64(args[arg].Uint()))) case OpenFlags: output = append(output, open(uint64(args[arg].Uint()))) case Mode: output = append(output, linux.FileMode(args[arg].ModeT()).String()) case FutexOp: output = append(output, futex(uint64(args[arg].Uint()))) case PtraceRequest: output = append(output, PtraceRequestSet.Parse(args[arg].Uint64())) case ItimerType: output = append(output, ItimerTypes.Parse(uint64(args[arg].Int()))) case Signal: output = append(output, signalNames.ParseDecimal(args[arg].Uint64())) case SignalMaskAction: output = append(output, signalMaskActions.Parse(uint64(args[arg].Int()))) case SigSet: output = append(output, sigSet(t, args[arg].Pointer())) case SigAction: output = append(output, sigAction(t, args[arg].Pointer())) case CapHeader: output = append(output, 
capHeader(t, args[arg].Pointer())) case CapData: output = append(output, capData(t, args[arg-1].Pointer(), args[arg].Pointer())) case PollFDs: output = append(output, pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), false)) case EpollCtlOp: output = append(output, epollCtlOps.Parse(uint64(args[arg].Int()))) case EpollEvent: output = append(output, epollEvent(t, args[arg].Pointer())) case EpollEvents: output = append(output, epollEvents(t, args[arg].Pointer(), 0 /* numEvents */, uint64(maximumBlobSize))) case SelectFDSet: output = append(output, fdSet(t, int(args[0].Int()), args[arg].Pointer())) case MmapProt: output = append(output, ProtectionFlagSet.Parse(uint64(args[arg].Uint()))) case MmapFlags: output = append(output, MmapFlagSet.Parse(uint64(args[arg].Uint()))) case CloseRangeFlags: output = append(output, CloseRangeFlagSet.Parse(uint64(args[arg].Uint()))) case Oct: output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8)) case Hex: fallthrough default: output = append(output, hexArg(args[arg])) } } return output } // post fills in the post-execution arguments for a system call. This modifies // the given output slice in place with arguments that may only be interpreted // after the system call has been executed. func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uintptr, output []string, maximumBlobSize uint) { for arg := range output { if arg >= len(i.format) { break } switch i.format[arg] { case ReadBuffer: output[arg] = dump(t, args[arg].Pointer(), uint(rval), maximumBlobSize, LogAppDataAllowed /* content */) case ReadIOVec: printLength := uint64(rval) if printLength > uint64(maximumBlobSize) { printLength = uint64(maximumBlobSize) } output[arg] = iovecs(t, args[arg].Pointer(), int(args[arg+1].Int()), LogAppDataAllowed /* content */, printLength) case WriteIOVec, IOVec, WriteBuffer: // We already have a big blast from write. output[arg] = "..." 
case SendMsgHdr: output[arg] = msghdr(t, args[arg].Pointer(), false /* content */, uint64(maximumBlobSize)) case RecvMsgHdr: output[arg] = msghdr(t, args[arg].Pointer(), LogAppDataAllowed /* content */, uint64(maximumBlobSize)) case PostPath: output[arg] = path(t, args[arg].Pointer()) case PipeFDs: output[arg] = fdpair(t, args[arg].Pointer()) case Uname: output[arg] = uname(t, args[arg].Pointer()) case Stat: output[arg] = stat(t, args[arg].Pointer()) case PostSockAddr: output[arg] = postSockAddr(t, args[arg].Pointer(), args[arg+1].Pointer()) case SockLen: output[arg] = sockLenPointer(t, args[arg].Pointer()) case PostTimespec: output[arg] = timespec(t, args[arg].Pointer()) case PostItimerVal: output[arg] = itimerval(t, args[arg].Pointer()) case PostItimerSpec: output[arg] = itimerspec(t, args[arg].Pointer()) case Timeval: output[arg] = timeval(t, args[arg].Pointer()) case Rusage: output[arg] = rusage(t, args[arg].Pointer()) case PostSigSet: output[arg] = sigSet(t, args[arg].Pointer()) case PostSigAction: output[arg] = sigAction(t, args[arg].Pointer()) case PostCapData: output[arg] = capData(t, args[arg-1].Pointer(), args[arg].Pointer()) case PollFDs: output[arg] = pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), true) case EpollEvents: output[arg] = epollEvents(t, args[arg].Pointer(), uint64(rval), uint64(maximumBlobSize)) case GetSockOptVal: output[arg] = getSockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Pointer() /* optLen */, maximumBlobSize, rval) case SetSockOptVal: // No need to print the value again. While it usually // isn't, the string version of this arg can be long. output[arg] = hexArg(args[arg]) } } } // printEntry prints the given system call entry. func (i *SyscallInfo) printEnter(t *kernel.Task, args arch.SyscallArguments) []string { output := i.pre(t, args, LogMaximumSize) switch len(output) { case 0: t.Infof("%s E %s()", t.Name(), i.name) case 1: t.Infof("%s E %s(%s)", t.Name(), i.name, output[0]) case 2: t.Infof("%s E %s(%s, %s)", t.Name(), i.name, output[0], output[1]) case 3: t.Infof("%s E %s(%s, %s, %s)", t.Name(), i.name, output[0], output[1], output[2]) case 4: t.Infof("%s E %s(%s, %s, %s, %s)", t.Name(), i.name, output[0], output[1], output[2], output[3]) case 5: t.Infof("%s E %s(%s, %s, %s, %s, %s)", t.Name(), i.name, output[0], output[1], output[2], output[3], output[4]) case 6: t.Infof("%s E %s(%s, %s, %s, %s, %s, %s)", t.Name(), i.name, output[0], output[1], output[2], output[3], output[4], output[5]) } return output } // printExit prints the given system call exit. func (i *SyscallInfo) printExit(t *kernel.Task, elapsed time.Duration, output []string, args arch.SyscallArguments, retval uintptr, err error, errno int) { var rval string if err == nil { // Fill in the output after successful execution. 
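		// Editorial note (illustrative, not upstream): with the formats
		// below, a successful call logs a line shaped roughly like
		//   <task> X read(0x3 /some/file, 0x... "data", 0x1000) = 16 (0x10) (12.3µs)
		// while the error branch appends "errno=<n> (<text>)" before the
		// elapsed time.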
i.post(t, args, retval, output, LogMaximumSize) rval = fmt.Sprintf("%d (%#x) (%v)", retval, retval, elapsed) } else { rval = fmt.Sprintf("%d (%#x) errno=%d (%s) (%v)", retval, retval, errno, err, elapsed) } switch len(output) { case 0: t.Infof("%s X %s() = %s", t.Name(), i.name, rval) case 1: t.Infof("%s X %s(%s) = %s", t.Name(), i.name, output[0], rval) case 2: t.Infof("%s X %s(%s, %s) = %s", t.Name(), i.name, output[0], output[1], rval) case 3: t.Infof("%s X %s(%s, %s, %s) = %s", t.Name(), i.name, output[0], output[1], output[2], rval) case 4: t.Infof("%s X %s(%s, %s, %s, %s) = %s", t.Name(), i.name, output[0], output[1], output[2], output[3], rval) case 5: t.Infof("%s X %s(%s, %s, %s, %s, %s) = %s", t.Name(), i.name, output[0], output[1], output[2], output[3], output[4], rval) case 6: t.Infof("%s X %s(%s, %s, %s, %s, %s, %s) = %s", t.Name(), i.name, output[0], output[1], output[2], output[3], output[4], output[5], rval) } } // sendEnter sends the syscall enter to event log. func (i *SyscallInfo) sendEnter(t *kernel.Task, args arch.SyscallArguments) []string { output := i.pre(t, args, EventMaximumSize) event := pb.Strace{ Process: t.Name(), Function: i.name, Info: &pb.Strace_Enter{ Enter: &pb.StraceEnter{}, }, } for _, arg := range output { event.Args = append(event.Args, arg) } eventchannel.Emit(&event) return output } // sendExit sends the syscall exit to event log. func (i *SyscallInfo) sendExit(t *kernel.Task, elapsed time.Duration, output []string, args arch.SyscallArguments, rval uintptr, err error, errno int) { if err == nil { // Fill in the output after successful execution. i.post(t, args, rval, output, EventMaximumSize) } exit := &pb.StraceExit{ Return: fmt.Sprintf("%#x", rval), ElapsedNs: elapsed.Nanoseconds(), } if err != nil { exit.Error = err.Error() exit.ErrNo = int64(errno) } event := pb.Strace{ Process: t.Name(), Function: i.name, Info: &pb.Strace_Exit{Exit: exit}, } for _, arg := range output { event.Args = append(event.Args, arg) } eventchannel.Emit(&event) } type syscallContext struct { info SyscallInfo args arch.SyscallArguments start time.Time logOutput []string eventOutput []string flags uint32 } // SyscallEnter implements kernel.Stracer.SyscallEnter. It logs the syscall // entry trace. func (s SyscallMap) SyscallEnter(t *kernel.Task, sysno uintptr, args arch.SyscallArguments, flags uint32) any { info, ok := s[sysno] if !ok { info = SyscallInfo{ name: fmt.Sprintf("sys_%d", sysno), format: defaultFormat, } } var output, eventOutput []string if bits.IsOn32(flags, kernel.StraceEnableLog) { output = info.printEnter(t, args) } if bits.IsOn32(flags, kernel.StraceEnableEvent) { eventOutput = info.sendEnter(t, args) } return &syscallContext{ info: info, args: args, start: time.Now(), logOutput: output, eventOutput: eventOutput, flags: flags, } } // SyscallExit implements kernel.Stracer.SyscallExit. It logs the syscall // exit trace. func (s SyscallMap) SyscallExit(context any, t *kernel.Task, sysno, rval uintptr, err error) { errno := kernel.ExtractErrno(err, int(sysno)) c := context.(*syscallContext) elapsed := time.Since(c.start) if bits.IsOn32(c.flags, kernel.StraceEnableLog) { c.info.printExit(t, elapsed, c.logOutput, c.args, rval, err, errno) } if bits.IsOn32(c.flags, kernel.StraceEnableEvent) { c.info.sendExit(t, elapsed, c.eventOutput, c.args, rval, err, errno) } } // ConvertToSysnoMap converts the names to a map keyed on the syscall number // and value set to true. // // The map is in a convenient format to pass to SyscallFlagsTable.Enable(). 
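//
// Illustrative example (editorial note): for a table in which "read" is
// syscall number 63, ConvertToSysnoMap([]string{"read"}) would return
// map[uintptr]bool{63: true}; a name with no entry in the table yields an
// error of the form `syscall "..." not found`, and a nil slice is treated as
// "no list" and returns a nil map.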
func (s SyscallMap) ConvertToSysnoMap(syscalls []string) (map[uintptr]bool, error) { if syscalls == nil { // Sentinel: no list. return nil, nil } l := make(map[uintptr]bool) for _, sc := range syscalls { // Try to match this system call. sysno, ok := s.ConvertToSysno(sc) if !ok { return nil, fmt.Errorf("syscall %q not found", sc) } l[sysno] = true } // Success. return l, nil } // ConvertToSysno converts the name to system call number. Returns false // if syscall with same name is not found. func (s SyscallMap) ConvertToSysno(syscall string) (uintptr, bool) { for sysno, info := range s { if info.name != "" && info.name == syscall { return sysno, true } } return 0, false } // Name returns the syscall name. func (s SyscallMap) Name(sysno uintptr) string { if info, ok := s[sysno]; ok { return info.name } return fmt.Sprintf("sys_%d", sysno) } // Initialize prepares all syscall tables for use by this package. // // N.B. This is not in an init function because we can't be sure all syscall // tables are registered with the kernel when init runs. func Initialize() { for _, table := range kernel.SyscallTables() { // Is this known? sys, ok := Lookup(table.OS, table.Arch) if !ok { continue } table.Stracer = sys } } // SinkType defines where to send straces to. type SinkType uint32 const ( // SinkTypeLog sends straces to text log SinkTypeLog SinkType = 1 << iota // SinkTypeEvent sends strace to event log SinkTypeEvent ) func convertToSyscallFlag(sinks SinkType) uint32 { ret := uint32(0) if bits.IsOn32(uint32(sinks), uint32(SinkTypeLog)) { ret |= kernel.StraceEnableLog } if bits.IsOn32(uint32(sinks), uint32(SinkTypeEvent)) { ret |= kernel.StraceEnableEvent } return ret } // Enable enables the syscalls in allowlist in all syscall tables. // // Preconditions: Initialize has been called. func Enable(allowlist []string, sinks SinkType) error { flags := convertToSyscallFlag(sinks) for _, table := range kernel.SyscallTables() { // Is this known? sys, ok := Lookup(table.OS, table.Arch) if !ok { continue } // Convert to a set of system calls numbers. wl, err := sys.ConvertToSysnoMap(allowlist) if err != nil { return err } table.FeatureEnable.Enable(flags, wl, true) } // Done. return nil } // Disable will disable Strace for all system calls and missing syscalls. // // Preconditions: Initialize has been called. func Disable(sinks SinkType) { flags := convertToSyscallFlag(sinks) for _, table := range kernel.SyscallTables() { // Strace will be disabled for all syscalls including missing. table.FeatureEnable.Enable(flags, nil, false) } } // EnableAll enables all syscalls in all syscall tables. // // Preconditions: Initialize has been called. func EnableAll(sinks SinkType) { flags := convertToSyscallFlag(sinks) for _, table := range kernel.SyscallTables() { // Is this known? if _, ok := Lookup(table.OS, table.Arch); !ok { continue } table.FeatureEnable.EnableAll(flags) } } func init() { t, ok := Lookup(abi.Host, arch.Host) if ok { // Provide the native table as the lookup for seccomp // debugging. This is best-effort. This is provided this way to // avoid dependencies from seccomp to this package. seccomp.SyscallName = t.Name } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/strace_amd64_state_autogen.go000066400000000000000000000001321465435605700277520ustar00rootroot00000000000000// automatically generated by stateify. 
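// Usage sketch (illustrative only, not part of the generated file below): with the package-level API defined in strace.go above, a runtime would typically do something like:
//
//	strace.Initialize()
//	if err := strace.Enable([]string{"openat", "read"}, strace.SinkTypeLog); err != nil {
//		// an unknown syscall name in the allowlist is reported here
//	}
//	// or trace every syscall to both sinks:
//	strace.EnableAll(strace.SinkTypeLog | strace.SinkTypeEvent)
//
// Enable and EnableAll require Initialize to have been called first, since they look up the per-OS/arch tables registered with the kernel.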
//go:build amd64 // +build amd64 package strace golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/strace_arm64_state_autogen.go000066400000000000000000000001321465435605700277700ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package strace golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/strace_go_proto/000077500000000000000000000000001465435605700254225ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/strace_go_proto/strace.pb.go000066400000000000000000000253341465435605700276410ustar00rootroot00000000000000// Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/sentry/strace/strace.proto package strace_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type Strace struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Process string `protobuf:"bytes,1,opt,name=process,proto3" json:"process,omitempty"` Function string `protobuf:"bytes,2,opt,name=function,proto3" json:"function,omitempty"` Args []string `protobuf:"bytes,3,rep,name=args,proto3" json:"args,omitempty"` // Types that are assignable to Info: // // *Strace_Enter // *Strace_Exit Info isStrace_Info `protobuf_oneof:"info"` } func (x *Strace) Reset() { *x = Strace{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_strace_strace_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *Strace) String() string { return protoimpl.X.MessageStringOf(x) } func (*Strace) ProtoMessage() {} func (x *Strace) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_strace_strace_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use Strace.ProtoReflect.Descriptor instead. 
func (*Strace) Descriptor() ([]byte, []int) { return file_pkg_sentry_strace_strace_proto_rawDescGZIP(), []int{0} } func (x *Strace) GetProcess() string { if x != nil { return x.Process } return "" } func (x *Strace) GetFunction() string { if x != nil { return x.Function } return "" } func (x *Strace) GetArgs() []string { if x != nil { return x.Args } return nil } func (m *Strace) GetInfo() isStrace_Info { if m != nil { return m.Info } return nil } func (x *Strace) GetEnter() *StraceEnter { if x, ok := x.GetInfo().(*Strace_Enter); ok { return x.Enter } return nil } func (x *Strace) GetExit() *StraceExit { if x, ok := x.GetInfo().(*Strace_Exit); ok { return x.Exit } return nil } type isStrace_Info interface { isStrace_Info() } type Strace_Enter struct { Enter *StraceEnter `protobuf:"bytes,4,opt,name=enter,proto3,oneof"` } type Strace_Exit struct { Exit *StraceExit `protobuf:"bytes,5,opt,name=exit,proto3,oneof"` } func (*Strace_Enter) isStrace_Info() {} func (*Strace_Exit) isStrace_Info() {} type StraceEnter struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields } func (x *StraceEnter) Reset() { *x = StraceEnter{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_strace_strace_proto_msgTypes[1] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *StraceEnter) String() string { return protoimpl.X.MessageStringOf(x) } func (*StraceEnter) ProtoMessage() {} func (x *StraceEnter) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_strace_strace_proto_msgTypes[1] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use StraceEnter.ProtoReflect.Descriptor instead. func (*StraceEnter) Descriptor() ([]byte, []int) { return file_pkg_sentry_strace_strace_proto_rawDescGZIP(), []int{1} } type StraceExit struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Return string `protobuf:"bytes,1,opt,name=return,proto3" json:"return,omitempty"` Error string `protobuf:"bytes,2,opt,name=error,proto3" json:"error,omitempty"` ErrNo int64 `protobuf:"varint,3,opt,name=err_no,json=errNo,proto3" json:"err_no,omitempty"` ElapsedNs int64 `protobuf:"varint,4,opt,name=elapsed_ns,json=elapsedNs,proto3" json:"elapsed_ns,omitempty"` } func (x *StraceExit) Reset() { *x = StraceExit{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_strace_strace_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *StraceExit) String() string { return protoimpl.X.MessageStringOf(x) } func (*StraceExit) ProtoMessage() {} func (x *StraceExit) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_strace_strace_proto_msgTypes[2] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use StraceExit.ProtoReflect.Descriptor instead. 
func (*StraceExit) Descriptor() ([]byte, []int) { return file_pkg_sentry_strace_strace_proto_rawDescGZIP(), []int{2} } func (x *StraceExit) GetReturn() string { if x != nil { return x.Return } return "" } func (x *StraceExit) GetError() string { if x != nil { return x.Error } return "" } func (x *StraceExit) GetErrNo() int64 { if x != nil { return x.ErrNo } return 0 } func (x *StraceExit) GetElapsedNs() int64 { if x != nil { return x.ElapsedNs } return 0 } var File_pkg_sentry_strace_strace_proto protoreflect.FileDescriptor var file_pkg_sentry_strace_strace_proto_rawDesc = []byte{ 0x0a, 0x1e, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x73, 0x74, 0x72, 0x61, 0x63, 0x65, 0x2f, 0x73, 0x74, 0x72, 0x61, 0x63, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x06, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x22, 0xb1, 0x01, 0x0a, 0x06, 0x53, 0x74, 0x72, 0x61, 0x63, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x70, 0x72, 0x6f, 0x63, 0x65, 0x73, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x70, 0x72, 0x6f, 0x63, 0x65, 0x73, 0x73, 0x12, 0x1a, 0x0a, 0x08, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x12, 0x0a, 0x04, 0x61, 0x72, 0x67, 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x09, 0x52, 0x04, 0x61, 0x72, 0x67, 0x73, 0x12, 0x2b, 0x0a, 0x05, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x13, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x53, 0x74, 0x72, 0x61, 0x63, 0x65, 0x45, 0x6e, 0x74, 0x65, 0x72, 0x48, 0x00, 0x52, 0x05, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x12, 0x28, 0x0a, 0x04, 0x65, 0x78, 0x69, 0x74, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x53, 0x74, 0x72, 0x61, 0x63, 0x65, 0x45, 0x78, 0x69, 0x74, 0x48, 0x00, 0x52, 0x04, 0x65, 0x78, 0x69, 0x74, 0x42, 0x06, 0x0a, 0x04, 0x69, 0x6e, 0x66, 0x6f, 0x22, 0x0d, 0x0a, 0x0b, 0x53, 0x74, 0x72, 0x61, 0x63, 0x65, 0x45, 0x6e, 0x74, 0x65, 0x72, 0x22, 0x70, 0x0a, 0x0a, 0x53, 0x74, 0x72, 0x61, 0x63, 0x65, 0x45, 0x78, 0x69, 0x74, 0x12, 0x16, 0x0a, 0x06, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x12, 0x14, 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x12, 0x15, 0x0a, 0x06, 0x65, 0x72, 0x72, 0x5f, 0x6e, 0x6f, 0x18, 0x03, 0x20, 0x01, 0x28, 0x03, 0x52, 0x05, 0x65, 0x72, 0x72, 0x4e, 0x6f, 0x12, 0x1d, 0x0a, 0x0a, 0x65, 0x6c, 0x61, 0x70, 0x73, 0x65, 0x64, 0x5f, 0x6e, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, 0x52, 0x09, 0x65, 0x6c, 0x61, 0x70, 0x73, 0x65, 0x64, 0x4e, 0x73, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_sentry_strace_strace_proto_rawDescOnce sync.Once file_pkg_sentry_strace_strace_proto_rawDescData = file_pkg_sentry_strace_strace_proto_rawDesc ) func file_pkg_sentry_strace_strace_proto_rawDescGZIP() []byte { file_pkg_sentry_strace_strace_proto_rawDescOnce.Do(func() { file_pkg_sentry_strace_strace_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_sentry_strace_strace_proto_rawDescData) }) return file_pkg_sentry_strace_strace_proto_rawDescData } var file_pkg_sentry_strace_strace_proto_msgTypes = make([]protoimpl.MessageInfo, 3) var file_pkg_sentry_strace_strace_proto_goTypes = []interface{}{ (*Strace)(nil), // 0: gvisor.Strace (*StraceEnter)(nil), // 1: gvisor.StraceEnter (*StraceExit)(nil), // 2: gvisor.StraceExit } var file_pkg_sentry_strace_strace_proto_depIdxs = []int32{ 1, // 0: 
gvisor.Strace.enter:type_name -> gvisor.StraceEnter 2, // 1: gvisor.Strace.exit:type_name -> gvisor.StraceExit 2, // [2:2] is the sub-list for method output_type 2, // [2:2] is the sub-list for method input_type 2, // [2:2] is the sub-list for extension type_name 2, // [2:2] is the sub-list for extension extendee 0, // [0:2] is the sub-list for field type_name } func init() { file_pkg_sentry_strace_strace_proto_init() } func file_pkg_sentry_strace_strace_proto_init() { if File_pkg_sentry_strace_strace_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_sentry_strace_strace_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*Strace); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_strace_strace_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*StraceEnter); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_sentry_strace_strace_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*StraceExit); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } file_pkg_sentry_strace_strace_proto_msgTypes[0].OneofWrappers = []interface{}{ (*Strace_Enter)(nil), (*Strace_Exit)(nil), } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_sentry_strace_strace_proto_rawDesc, NumEnums: 0, NumMessages: 3, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_sentry_strace_strace_proto_goTypes, DependencyIndexes: file_pkg_sentry_strace_strace_proto_depIdxs, MessageInfos: file_pkg_sentry_strace_strace_proto_msgTypes, }.Build() File_pkg_sentry_strace_strace_proto = out.File file_pkg_sentry_strace_strace_proto_rawDesc = nil file_pkg_sentry_strace_strace_proto_goTypes = nil file_pkg_sentry_strace_strace_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/strace_state_autogen.go000066400000000000000000000000701465435605700267600ustar00rootroot00000000000000// automatically generated by stateify. package strace golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/strace/syscalls.go000066400000000000000000000174541465435605700244300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package strace import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // FormatSpecifier values describe how an individual syscall argument should be // formatted. type FormatSpecifier int // Valid FormatSpecifiers. // // Unless otherwise specified, values are formatted before syscall execution // and not updated after syscall execution (the same value is output). const ( // Hex is just a hexadecimal number. Hex FormatSpecifier = iota // Oct is just an octal number. Oct // FD is a file descriptor. 
FD // ReadBuffer is a buffer for a read-style call. The syscall return // value is used for the length. // // Formatted after syscall execution. ReadBuffer // WriteBuffer is a buffer for a write-style call. The following arg is // used for the length. // // Contents omitted after syscall execution. WriteBuffer // ReadIOVec is a pointer to a struct iovec for a writev-style call. // The following arg is used for the length. The return value is used // for the total length. // // Complete contents only formatted after syscall execution. ReadIOVec // WriteIOVec is a pointer to a struct iovec for a writev-style call. // The following arg is used for the length. // // Complete contents only formatted before syscall execution, omitted // after. WriteIOVec // IOVec is a generic pointer to a struct iovec. Contents are not dumped. IOVec // SendMsgHdr is a pointer to a struct msghdr for a sendmsg-style call. // Contents formatted only before syscall execution, omitted after. SendMsgHdr // RecvMsgHdr is a pointer to a struct msghdr for a recvmsg-style call. // Contents formatted only after syscall execution. RecvMsgHdr // Path is a pointer to a char* path. Path // PostPath is a pointer to a char* path, formatted after syscall // execution. PostPath // ExecveStringVector is a NULL-terminated array of strings. Enforces // the maximum execve array length. ExecveStringVector // PipeFDs is an array of two FDs, formatted after syscall execution. PipeFDs // Uname is a pointer to a struct uname, formatted after syscall execution. Uname // Stat is a pointer to a struct stat, formatted after syscall execution. Stat // SockAddr is a pointer to a struct sockaddr. The following arg is // used for length. SockAddr // PostSockAddr is a pointer to a struct sockaddr, formatted after // syscall execution. The following arg is a pointer to the socklen_t // length. PostSockAddr // SockLen is a pointer to a socklen_t, formatted before and after // syscall execution. SockLen // SockFamily is a socket protocol family value. SockFamily // SockType is a socket type and flags value. SockType // SockProtocol is a socket protocol value. Argument n-2 is the socket // protocol family. SockProtocol // SockFlags are socket flags. SockFlags // Timespec is a pointer to a struct timespec. Timespec // PostTimespec is a pointer to a struct timespec, formatted after // syscall execution. PostTimespec // UTimeTimespec is a pointer to a struct timespec. Formatting includes // UTIME_NOW and UTIME_OMIT. UTimeTimespec // ItimerVal is a pointer to a struct itimerval. ItimerVal // PostItimerVal is a pointer to a struct itimerval, formatted after // syscall execution. PostItimerVal // ItimerSpec is a pointer to a struct itimerspec. ItimerSpec // PostItimerSpec is a pointer to a struct itimerspec, formatted after // syscall execution. PostItimerSpec // Timeval is a pointer to a struct timeval, formatted before and after // syscall execution. Timeval // Utimbuf is a pointer to a struct utimbuf. Utimbuf // Rusage is a struct rusage, formatted after syscall execution. Rusage // CloneFlags are clone(2) flags. CloneFlags // OpenFlags are open(2) flags. OpenFlags // Mode is a mode_t. Mode // FutexOp is the futex(2) operation. FutexOp // PtraceRequest is the ptrace(2) request. PtraceRequest // ItimerType is an itimer type (ITIMER_REAL, etc). ItimerType // Signal is a signal number. Signal // SignalMaskAction is a signal mask action passed to rt_sigprocmask(2). SignalMaskAction // SigSet is a signal set. 
SigSet // PostSigSet is a signal set, formatted after syscall execution. PostSigSet // SigAction is a struct sigaction. SigAction // PostSigAction is a struct sigaction, formatted after syscall execution. PostSigAction // CapHeader is a cap_user_header_t. CapHeader // CapData is the data argument to capget(2)/capset(2). The previous // argument must be CapHeader. CapData // PostCapData is the data argument to capget(2)/capset(2), formatted // after syscall execution. The previous argument must be CapHeader. PostCapData // PollFDs is an array of struct pollfd. The number of entries in the // array is in the next argument. PollFDs // SelectFDSet is an fd_set argument in select(2)/pselect(2). The // number of FDs represented must be the first argument. SelectFDSet // GetSockOptVal is the optval argument in getsockopt(2). // // Formatted after syscall execution. GetSockOptVal // SetSockOptVal is the optval argument in setsockopt(2). // // Contents omitted after syscall execution. SetSockOptVal // SockOptLevel is the level argument in getsockopt(2) and // setsockopt(2). SockOptLevel // SockOptLevel is the optname argument in getsockopt(2) and // setsockopt(2). SockOptName // EpollCtlOp is the op argument to epoll_ctl(2). EpollCtlOp // EpollEvent is the event argument in epoll_ctl(2). EpollEvent // EpollEvents is an array of struct epoll_event. It is the events // argument in epoll_wait(2)/epoll_pwait(2). EpollEvents // MmapProt is the protection argument in mmap(2). MmapProt // MmapFlags is the flags argument in mmap(2). MmapFlags // CloseRangeFlags are close_range(2) flags. CloseRangeFlags ) // defaultFormat is the syscall argument format to use if the actual format is // not known. It formats all six arguments as hex. var defaultFormat = []FormatSpecifier{Hex, Hex, Hex, Hex, Hex, Hex} // SyscallInfo captures the name and printing format of a syscall. type SyscallInfo struct { // name is the name of the syscall. name string // format contains the format specifiers for each argument. // // Syscall calls can have up to six arguments. Arguments without a // corresponding entry in format will not be printed. format []FormatSpecifier } // makeSyscallInfo returns a SyscallInfo for a syscall. func makeSyscallInfo(name string, f ...FormatSpecifier) SyscallInfo { return SyscallInfo{name: name, format: f} } // SyscallMap maps syscalls into names and printing formats. type SyscallMap map[uintptr]SyscallInfo var _ kernel.Stracer = (SyscallMap)(nil) // syscallTable contains the syscalls for a specific OS/Arch. type syscallTable struct { // os is the operating system this table targets. os abi.OS // arch is the architecture this table targets. arch arch.Arch // syscalls contains the syscall mappings. syscalls SyscallMap } var syscallTables []syscallTable // Lookup returns the SyscallMap for the OS/Arch combination. The returned map // must not be changed. func Lookup(os abi.OS, a arch.Arch) (SyscallMap, bool) { for _, s := range syscallTables { if s.os == os && s.arch == a { return s.syscalls, true } } return nil, false } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/000077500000000000000000000000001465435605700226055ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/000077500000000000000000000000001465435605700237445ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/error.go000066400000000000000000000134701465435605700254310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) var ( partialResultOnce sync.Once ) // incrementPartialResultMetric increments PartialResultMetric by calling // Increment(). This is added as the func Do() which is called below requires // us to pass a function which does not take any arguments, whereas Increment() // takes a variadic number of arguments. func incrementPartialResultMetric() { metric.WeirdnessMetric.Increment(&metric.WeirdnessTypePartialResult) } // HandleIOError handles special error cases for partial results. For some // errors, we may consume the error and return only the partial read/write. // // op and f are used only for panics. func HandleIOError(ctx context.Context, partialResult bool, ioerr, intr error, op string, f *vfs.FileDescription) error { known, err := handleIOErrorImpl(ctx, partialResult, ioerr, intr, op) if err != nil { return err } if !known { // An unknown error is encountered with a partial read/write. fs := f.Mount().Filesystem().VirtualFilesystem() root := vfs.RootFromContext(ctx) name, _ := fs.PathnameWithDeleted(ctx, root, f.VirtualDentry()) log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, ioerr, ioerr, op, name) partialResultOnce.Do(incrementPartialResultMetric) } return nil } // handleIOError handles special error cases for partial results. For some // errors, we may consume the error and return only the partial read/write. // // Returns false if error is unknown. func handleIOErrorImpl(ctx context.Context, partialResult bool, errOrig, intr error, op string) (bool, error) { if errOrig == nil { // Typical successful syscall. return true, nil } // Translate error, if possible, to consolidate errors from other packages // into a smaller set of errors from linuxerr package. translatedErr := errOrig if errno, ok := linuxerr.TranslateError(errOrig); ok { translatedErr = errno } switch { case translatedErr == io.EOF: // EOF is always consumed. If this is a partial read/write // (result != 0), the application will see that, otherwise // they will see 0. return true, nil case linuxerr.Equals(linuxerr.EFBIG, translatedErr): t := kernel.TaskFromContext(ctx) if t == nil { panic("I/O error should only occur from a context associated with a Task") } // Ignore partialResult because this error only applies to // normal files, and for those files we cannot accumulate // write results. // // Do not consume the error and return it as EFBIG. // Simultaneously send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) return true, linuxerr.EFBIG case linuxerr.Equals(linuxerr.EINTR, translatedErr): // The syscall was interrupted. 
Return nil if it completed // partially, otherwise return the error code that the syscall // needs (to indicate to the kernel what it should do). if partialResult { return true, nil } return true, intr } if !partialResult { // Typical syscall error. return true, errOrig } switch { case linuxerr.Equals(linuxerr.EINTR, translatedErr): // Syscall interrupted, but completed a partial // read/write. Like ErrWouldBlock, since we have a // partial read/write, we consume the error and return // the partial result. return true, nil case linuxerr.Equals(linuxerr.EFAULT, translatedErr): // EFAULT is only shown the user if nothing was // read/written. If we read something (this case), they see // a partial read/write. They will then presumably try again // with an incremented buffer, which will EFAULT with // result == 0. return true, nil case linuxerr.Equals(linuxerr.EPIPE, translatedErr): // Writes to a pipe or socket will return EPIPE if the other // side is gone. The partial write is returned. EPIPE will be // returned on the next call. // // TODO(gvisor.dev/issue/161): In some cases SIGPIPE should // also be sent to the application. return true, nil case linuxerr.Equals(linuxerr.ENOSPC, translatedErr): // Similar to EPIPE. Return what we wrote this time, and let // ENOSPC be returned on the next call. return true, nil case linuxerr.Equals(linuxerr.ECONNRESET, translatedErr): fallthrough case linuxerr.Equals(linuxerr.ECONNABORTED, translatedErr): fallthrough case linuxerr.Equals(linuxerr.ETIMEDOUT, translatedErr): // For TCP sendfile connections, we may have a reset, abort or timeout. But // we should just return the partial result. The next call will return the // error without a partial IO operation. return true, nil case linuxerr.Equals(linuxerr.EWOULDBLOCK, translatedErr): // Syscall would block, but completed a partial read/write. // This case should only be returned by IssueIO for nonblocking // files. Since we have a partial read/write, we consume // ErrWouldBlock, returning the partial result. return true, nil case linuxerr.IsRestartError(translatedErr): // Identical to the EINTR case. return true, nil } // Error is unknown and cannot be properly handled. return false, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/linux64.go000066400000000000000000001413261465435605700256130ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package linux provides syscall tables for amd64 and arm64 Linux. package linux import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls" ) const ( // LinuxSysname is the OS name advertised by gVisor. LinuxSysname = "Linux" // LinuxRelease is the Linux release version number advertised by gVisor. LinuxRelease = "4.4.0" // LinuxVersion is the version info advertised by gVisor. 
LinuxVersion = "#1 SMP Sun Jan 10 15:06:54 PST 2016" ) // AMD64 is a table of Linux amd64 syscall API with the corresponding syscall // numbers from Linux 4.4. var AMD64 = &kernel.SyscallTable{ OS: abi.Linux, Arch: arch.AMD64, Version: kernel.Version{ // Version 4.4 is chosen as a stable, longterm version of Linux, which // guides the interface provided by this syscall table. The build // version is that for a clean build with default kernel config, at 5 // minutes after v4.4 was tagged. Sysname: LinuxSysname, Release: LinuxRelease, Version: LinuxVersion, }, AuditNumber: linux.AUDIT_ARCH_X86_64, Table: map[uintptr]kernel.Syscall{ 0: syscalls.SupportedPoint("read", Read, PointRead), 1: syscalls.SupportedPoint("write", Write, PointWrite), 2: syscalls.SupportedPoint("open", Open, PointOpen), 3: syscalls.SupportedPoint("close", Close, PointClose), 4: syscalls.Supported("stat", Stat), 5: syscalls.Supported("fstat", Fstat), 6: syscalls.Supported("lstat", Lstat), 7: syscalls.Supported("poll", Poll), 8: syscalls.Supported("lseek", Lseek), 9: syscalls.Supported("mmap", Mmap), 10: syscalls.Supported("mprotect", Mprotect), 11: syscalls.Supported("munmap", Munmap), 12: syscalls.Supported("brk", Brk), 13: syscalls.Supported("rt_sigaction", RtSigaction), 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask), 15: syscalls.Supported("rt_sigreturn", RtSigreturn), 16: syscalls.Supported("ioctl", Ioctl), 17: syscalls.SupportedPoint("pread64", Pread64, PointPread64), 18: syscalls.SupportedPoint("pwrite64", Pwrite64, PointPwrite64), 19: syscalls.SupportedPoint("readv", Readv, PointReadv), 20: syscalls.SupportedPoint("writev", Writev, PointWritev), 21: syscalls.Supported("access", Access), 22: syscalls.SupportedPoint("pipe", Pipe, PointPipe), 23: syscalls.Supported("select", Select), 24: syscalls.Supported("sched_yield", SchedYield), 25: syscalls.Supported("mremap", Mremap), 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. 
Other advice is ignored.", nil), 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), 32: syscalls.SupportedPoint("dup", Dup, PointDup), 33: syscalls.SupportedPoint("dup2", Dup2, PointDup2), 34: syscalls.Supported("pause", Pause), 35: syscalls.Supported("nanosleep", Nanosleep), 36: syscalls.Supported("getitimer", Getitimer), 37: syscalls.Supported("alarm", Alarm), 38: syscalls.Supported("setitimer", Setitimer), 39: syscalls.Supported("getpid", Getpid), 40: syscalls.Supported("sendfile", Sendfile), 41: syscalls.SupportedPoint("socket", Socket, PointSocket), 42: syscalls.SupportedPoint("connect", Connect, PointConnect), 43: syscalls.SupportedPoint("accept", Accept, PointAccept), 44: syscalls.Supported("sendto", SendTo), 45: syscalls.Supported("recvfrom", RecvFrom), 46: syscalls.Supported("sendmsg", SendMsg), 47: syscalls.Supported("recvmsg", RecvMsg), 48: syscalls.Supported("shutdown", Shutdown), 49: syscalls.SupportedPoint("bind", Bind, PointBind), 50: syscalls.Supported("listen", Listen), 51: syscalls.Supported("getsockname", GetSockName), 52: syscalls.Supported("getpeername", GetPeerName), 53: syscalls.SupportedPoint("socketpair", SocketPair, PointSocketpair), 54: syscalls.Supported("setsockopt", SetSockOpt), 55: syscalls.Supported("getsockopt", GetSockOpt), 56: syscalls.PartiallySupportedPoint("clone", Clone, PointClone, "Options CLONE_PIDFD, CLONE_NEWCGROUP, CLONE_PARENT, CLONE_NEWTIME, CLONE_CLEAR_SIGHAND, and CLONE_SYSVSEM not supported.", nil), 57: syscalls.SupportedPoint("fork", Fork, PointFork), 58: syscalls.SupportedPoint("vfork", Vfork, PointVfork), 59: syscalls.SupportedPoint("execve", Execve, PointExecve), 60: syscalls.Supported("exit", Exit), 61: syscalls.Supported("wait4", Wait4), 62: syscalls.Supported("kill", Kill), 63: syscalls.Supported("uname", Uname), 64: syscalls.Supported("semget", Semget), 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), 66: syscalls.Supported("semctl", Semctl), 67: syscalls.Supported("shmdt", Shmdt), 68: syscalls.Supported("msgget", Msgget), 69: syscalls.Supported("msgsnd", Msgsnd), 70: syscalls.Supported("msgrcv", Msgrcv), 71: syscalls.Supported("msgctl", Msgctl), 72: syscalls.SupportedPoint("fcntl", Fcntl, PointFcntl), 73: syscalls.Supported("flock", Flock), 74: syscalls.Supported("fsync", Fsync), 75: syscalls.Supported("fdatasync", Fdatasync), 76: syscalls.Supported("truncate", Truncate), 77: syscalls.Supported("ftruncate", Ftruncate), 78: syscalls.Supported("getdents", Getdents), 79: syscalls.Supported("getcwd", Getcwd), 80: syscalls.SupportedPoint("chdir", Chdir, PointChdir), 81: syscalls.SupportedPoint("fchdir", Fchdir, PointFchdir), 82: syscalls.Supported("rename", Rename), 83: syscalls.Supported("mkdir", Mkdir), 84: syscalls.Supported("rmdir", Rmdir), 85: syscalls.SupportedPoint("creat", Creat, PointCreat), 86: syscalls.Supported("link", Link), 87: syscalls.Supported("unlink", Unlink), 88: syscalls.Supported("symlink", Symlink), 89: syscalls.Supported("readlink", Readlink), 90: syscalls.Supported("chmod", Chmod), 91: syscalls.Supported("fchmod", Fchmod), 92: syscalls.Supported("chown", Chown), 93: syscalls.Supported("fchown", Fchown), 94: syscalls.Supported("lchown", Lchown), 95: syscalls.Supported("umask", Umask), 96: syscalls.Supported("gettimeofday", Gettimeofday), 
97: syscalls.Supported("getrlimit", Getrlimit), 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), 100: syscalls.Supported("times", Times), 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), 102: syscalls.Supported("getuid", Getuid), 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), 104: syscalls.Supported("getgid", Getgid), 105: syscalls.SupportedPoint("setuid", Setuid, PointSetuid), 106: syscalls.SupportedPoint("setgid", Setgid, PointSetgid), 107: syscalls.Supported("geteuid", Geteuid), 108: syscalls.Supported("getegid", Getegid), 109: syscalls.Supported("setpgid", Setpgid), 110: syscalls.Supported("getppid", Getppid), 111: syscalls.Supported("getpgrp", Getpgrp), 112: syscalls.SupportedPoint("setsid", Setsid, PointSetsid), 113: syscalls.Supported("setreuid", Setreuid), 114: syscalls.Supported("setregid", Setregid), 115: syscalls.Supported("getgroups", Getgroups), 116: syscalls.Supported("setgroups", Setgroups), 117: syscalls.SupportedPoint("setresuid", Setresuid, PointSetresuid), 118: syscalls.Supported("getresuid", Getresuid), 119: syscalls.SupportedPoint("setresgid", Setresgid, PointSetresgid), 120: syscalls.Supported("getresgid", Getresgid), 121: syscalls.Supported("getpgid", Getpgid), 122: syscalls.ErrorWithEvent("setfsuid", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) 123: syscalls.ErrorWithEvent("setfsgid", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) 124: syscalls.Supported("getsid", Getsid), 125: syscalls.Supported("capget", Capget), 126: syscalls.Supported("capset", Capset), 127: syscalls.Supported("rt_sigpending", RtSigpending), 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend), 131: syscalls.Supported("sigaltstack", Sigaltstack), 132: syscalls.Supported("utime", Utime), 133: syscalls.Supported("mknod", Mknod), 134: syscalls.Error("uselib", linuxerr.ENOSYS, "Obsolete", nil), 135: syscalls.ErrorWithEvent("personality", linuxerr.EINVAL, "Unable to change personality.", nil), 136: syscalls.ErrorWithEvent("ustat", linuxerr.ENOSYS, "Needs filesystem support.", nil), 137: syscalls.Supported("statfs", Statfs), 138: syscalls.Supported("fstatfs", Fstatfs), 139: syscalls.ErrorWithEvent("sysfs", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/165"}), 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), 147: syscalls.PartiallySupported("sched_get_priority_min", 
SchedGetPriorityMin, "Stub implementation.", nil), 148: syscalls.ErrorWithEvent("sched_rr_get_interval", linuxerr.EPERM, "", nil), 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), 154: syscalls.Error("modify_ldt", linuxerr.EPERM, "", nil), 155: syscalls.Supported("pivot_root", PivotRoot), 156: syscalls.Error("sysctl", linuxerr.EPERM, "Deprecated. Use /proc/sys instead.", nil), 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), 161: syscalls.SupportedPoint("chroot", Chroot, PointChroot), 162: syscalls.Supported("sync", Sync), 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), 165: syscalls.Supported("mount", Mount), 166: syscalls.Supported("umount2", Umount2), 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), 170: syscalls.Supported("sethostname", Sethostname), 171: syscalls.Supported("setdomainname", Setdomainname), 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil), 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil), 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), 177: syscalls.Error("get_kernel_syms", linuxerr.ENOSYS, "Not supported in Linux > 2.6.", nil), 178: syscalls.Error("query_module", linuxerr.ENOSYS, "Not supported in Linux > 2.6.", nil), 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations 180: syscalls.Error("nfsservctl", linuxerr.ENOSYS, "Removed after Linux 3.1.", nil), 181: syscalls.Error("getpmsg", linuxerr.ENOSYS, "Not implemented in Linux.", nil), 182: syscalls.Error("putpmsg", linuxerr.ENOSYS, "Not implemented in Linux.", nil), 183: syscalls.PartiallySupported("afs_syscall", AFSSyscall, "Test implementation.", nil), 184: syscalls.Error("tuxcall", linuxerr.ENOSYS, "Not implemented in Linux.", nil), 185: syscalls.Error("security", linuxerr.ENOSYS, "Not implemented in Linux.", nil), 186: syscalls.Supported("gettid", Gettid), 187: syscalls.Supported("readahead", Readahead), 188: syscalls.Supported("setxattr", SetXattr), 189: syscalls.Supported("lsetxattr", Lsetxattr), 190: syscalls.Supported("fsetxattr", Fsetxattr), 191: syscalls.Supported("getxattr", GetXattr), 192: syscalls.Supported("lgetxattr", Lgetxattr), 193: syscalls.Supported("fgetxattr", Fgetxattr), 194: syscalls.Supported("listxattr", ListXattr), 195: syscalls.Supported("llistxattr", Llistxattr), 196: 
syscalls.Supported("flistxattr", Flistxattr), 197: syscalls.Supported("removexattr", RemoveXattr), 198: syscalls.Supported("lremovexattr", Lremovexattr), 199: syscalls.Supported("fremovexattr", Fremovexattr), 200: syscalls.Supported("tkill", Tkill), 201: syscalls.Supported("time", Time), 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), 205: syscalls.Error("set_thread_area", linuxerr.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 211: syscalls.Error("get_thread_area", linuxerr.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), 213: syscalls.Supported("epoll_create", EpollCreate), 214: syscalls.ErrorWithEvent("epoll_ctl_old", linuxerr.ENOSYS, "Deprecated.", nil), 215: syscalls.ErrorWithEvent("epoll_wait_old", linuxerr.ENOSYS, "Deprecated.", nil), 216: syscalls.ErrorWithEvent("remap_file_pages", linuxerr.ENOSYS, "Deprecated since Linux 3.16.", nil), 217: syscalls.Supported("getdents64", Getdents64), 218: syscalls.Supported("set_tid_address", SetTidAddress), 219: syscalls.Supported("restart_syscall", RestartSyscall), 220: syscalls.Supported("semtimedop", Semtimedop), 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "The syscall is 'supported', but ignores all provided advice.", nil), 222: syscalls.Supported("timer_create", TimerCreate), 223: syscalls.Supported("timer_settime", TimerSettime), 224: syscalls.Supported("timer_gettime", TimerGettime), 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun), 226: syscalls.Supported("timer_delete", TimerDelete), 227: syscalls.Supported("clock_settime", ClockSettime), 228: syscalls.Supported("clock_gettime", ClockGettime), 229: syscalls.Supported("clock_getres", ClockGetres), 230: syscalls.Supported("clock_nanosleep", ClockNanosleep), 231: syscalls.Supported("exit_group", ExitGroup), 232: syscalls.Supported("epoll_wait", EpollWait), 233: syscalls.Supported("epoll_ctl", EpollCtl), 234: syscalls.Supported("tgkill", Tgkill), 235: syscalls.Supported("utimes", Utimes), 236: syscalls.Error("vserver", linuxerr.ENOSYS, "Not implemented by Linux", nil), 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. 
Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), 240: syscalls.Supported("mq_open", MqOpen), 241: syscalls.Supported("mq_unlink", MqUnlink), 242: syscalls.ErrorWithEvent("mq_timedsend", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 243: syscalls.ErrorWithEvent("mq_timedreceive", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 244: syscalls.ErrorWithEvent("mq_notify", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 245: syscalls.ErrorWithEvent("mq_getsetattr", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), 247: syscalls.Supported("waitid", Waitid), 248: syscalls.Error("add_key", linuxerr.EACCES, "Not available to user.", nil), 249: syscalls.Error("request_key", linuxerr.EACCES, "Not available to user.", nil), 250: syscalls.PartiallySupported("keyctl", Keyctl, "Only supports session keyrings with zero keys in them.", nil), 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 253: syscalls.PartiallySupportedPoint("inotify_init", InotifyInit, PointInotifyInit, "inotify events are only available inside the sandbox.", nil), 254: syscalls.PartiallySupportedPoint("inotify_add_watch", InotifyAddWatch, PointInotifyAddWatch, "inotify events are only available inside the sandbox.", nil), 255: syscalls.PartiallySupportedPoint("inotify_rm_watch", InotifyRmWatch, PointInotifyRmWatch, "inotify events are only available inside the sandbox.", nil), 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), 257: syscalls.SupportedPoint("openat", Openat, PointOpenat), 258: syscalls.Supported("mkdirat", Mkdirat), 259: syscalls.Supported("mknodat", Mknodat), 260: syscalls.Supported("fchownat", Fchownat), 261: syscalls.Supported("futimesat", Futimesat), 262: syscalls.Supported("newfstatat", Newfstatat), 263: syscalls.Supported("unlinkat", Unlinkat), 264: syscalls.Supported("renameat", Renameat), 265: syscalls.Supported("linkat", Linkat), 266: syscalls.Supported("symlinkat", Symlinkat), 267: syscalls.Supported("readlinkat", Readlinkat), 268: syscalls.Supported("fchmodat", Fchmodat), 269: syscalls.Supported("faccessat", Faccessat), 270: syscalls.Supported("pselect6", Pselect6), 271: syscalls.Supported("ppoll", Ppoll), 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. 
Network namespaces supported but must be empty.", nil), 273: syscalls.Supported("set_robust_list", SetRobustList), 274: syscalls.Supported("get_robust_list", GetRobustList), 275: syscalls.Supported("splice", Splice), 276: syscalls.Supported("tee", Tee), 277: syscalls.Supported("sync_file_range", SyncFileRange), 278: syscalls.ErrorWithEvent("vmsplice", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) 280: syscalls.Supported("utimensat", Utimensat), 281: syscalls.Supported("epoll_pwait", EpollPwait), 282: syscalls.SupportedPoint("signalfd", Signalfd, PointSignalfd), 283: syscalls.SupportedPoint("timerfd_create", TimerfdCreate, PointTimerfdCreate), 284: syscalls.SupportedPoint("eventfd", Eventfd, PointEventfd), 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), 286: syscalls.SupportedPoint("timerfd_settime", TimerfdSettime, PointTimerfdSettime), 287: syscalls.SupportedPoint("timerfd_gettime", TimerfdGettime, PointTimerfdGettime), 288: syscalls.SupportedPoint("accept4", Accept4, PointAccept4), 289: syscalls.SupportedPoint("signalfd4", Signalfd4, PointSignalfd4), 290: syscalls.SupportedPoint("eventfd2", Eventfd2, PointEventfd2), 291: syscalls.Supported("epoll_create1", EpollCreate1), 292: syscalls.SupportedPoint("dup3", Dup3, PointDup3), 293: syscalls.SupportedPoint("pipe2", Pipe2, PointPipe2), 294: syscalls.PartiallySupportedPoint("inotify_init1", InotifyInit1, PointInotifyInit1, "inotify events are only available inside the sandbox.", nil), 295: syscalls.SupportedPoint("preadv", Preadv, PointPreadv), 296: syscalls.SupportedPoint("pwritev", Pwritev, PointPwritev), 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), 298: syscalls.ErrorWithEvent("perf_event_open", linuxerr.ENODEV, "No support for perf counters", nil), 299: syscalls.Supported("recvmmsg", RecvMMsg), 300: syscalls.ErrorWithEvent("fanotify_init", linuxerr.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 301: syscalls.ErrorWithEvent("fanotify_mark", linuxerr.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 302: syscalls.SupportedPoint("prlimit64", Prlimit64, PointPrlimit64), 303: syscalls.Error("name_to_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 304: syscalls.Error("open_by_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), 306: syscalls.Supported("syncfs", Syncfs), 307: syscalls.Supported("sendmmsg", SendMMsg), 308: syscalls.Supported("setns", Setns), 309: syscalls.Supported("getcpu", Getcpu), 310: syscalls.Supported("process_vm_readv", ProcessVMReadv), 311: syscalls.Supported("process_vm_writev", ProcessVMWritev), 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), 314: syscalls.ErrorWithEvent("sched_setattr", linuxerr.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) 315: syscalls.ErrorWithEvent("sched_getattr", linuxerr.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) 316: syscalls.Supported("renameat2", Renameat2), 317: syscalls.Supported("seccomp", Seccomp), 318: syscalls.Supported("getrandom", GetRandom), 319: syscalls.Supported("memfd_create", MemfdCreate), 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), 321: 
syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), 322: syscalls.SupportedPoint("execveat", Execveat, PointExecveat), 323: syscalls.ErrorWithEvent("userfaultfd", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) 324: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil), 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls implemented after 325 are "backports" from versions // of Linux after 4.4. 326: syscalls.ErrorWithEvent("copy_file_range", linuxerr.ENOSYS, "", nil), 327: syscalls.SupportedPoint("preadv2", Preadv2, PointPreadv2), 328: syscalls.SupportedPoint("pwritev2", Pwritev2, PointPwritev2), 329: syscalls.ErrorWithEvent("pkey_mprotect", linuxerr.ENOSYS, "", nil), 330: syscalls.ErrorWithEvent("pkey_alloc", linuxerr.ENOSYS, "", nil), 331: syscalls.ErrorWithEvent("pkey_free", linuxerr.ENOSYS, "", nil), 332: syscalls.Supported("statx", Statx), 333: syscalls.ErrorWithEvent("io_pgetevents", linuxerr.ENOSYS, "", nil), 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), // Linux skips ahead to syscall 424 to sync numbers between arches. 424: syscalls.ErrorWithEvent("pidfd_send_signal", linuxerr.ENOSYS, "", nil), 425: syscalls.PartiallySupported("io_uring_setup", IOUringSetup, "Not all flags and functionality supported.", nil), 426: syscalls.PartiallySupported("io_uring_enter", IOUringEnter, "Not all flags and functionality supported.", nil), 427: syscalls.ErrorWithEvent("io_uring_register", linuxerr.ENOSYS, "", nil), 428: syscalls.ErrorWithEvent("open_tree", linuxerr.ENOSYS, "", nil), 429: syscalls.ErrorWithEvent("move_mount", linuxerr.ENOSYS, "", nil), 430: syscalls.ErrorWithEvent("fsopen", linuxerr.ENOSYS, "", nil), 431: syscalls.ErrorWithEvent("fsconfig", linuxerr.ENOSYS, "", nil), 432: syscalls.ErrorWithEvent("fsmount", linuxerr.ENOSYS, "", nil), 433: syscalls.ErrorWithEvent("fspick", linuxerr.ENOSYS, "", nil), 434: syscalls.ErrorWithEvent("pidfd_open", linuxerr.ENOSYS, "", nil), 435: syscalls.PartiallySupported("clone3", Clone3, "Options CLONE_PIDFD, CLONE_NEWCGROUP, CLONE_INTO_CGROUP, CLONE_NEWTIME, CLONE_CLEAR_SIGHAND, CLONE_PARENT, CLONE_SYSVSEM and, SetTid are not supported.", nil), 436: syscalls.Supported("close_range", CloseRange), 439: syscalls.Supported("faccessat2", Faccessat2), 441: syscalls.Supported("epoll_pwait2", EpollPwait2), }, Emulate: map[hostarch.Addr]uintptr{ 0xffffffffff600000: 96, // vsyscall gettimeofday(2) 0xffffffffff600400: 201, // vsyscall time(2) 0xffffffffff600800: 309, // vsyscall getcpu(2) }, Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { t.Kernel().EmitUnimplementedEvent(t, sysno) return 0, linuxerr.ENOSYS }, } // ARM64 is a table of Linux arm64 syscall API with the corresponding syscall // numbers from Linux 4.4. var ARM64 = &kernel.SyscallTable{ OS: abi.Linux, Arch: arch.ARM64, Version: kernel.Version{ Sysname: LinuxSysname, Release: LinuxRelease, Version: LinuxVersion, }, AuditNumber: linux.AUDIT_ARCH_AARCH64, Table: map[uintptr]kernel.Syscall{ 0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. 
User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), 5: syscalls.Supported("setxattr", SetXattr), 6: syscalls.Supported("lsetxattr", Lsetxattr), 7: syscalls.Supported("fsetxattr", Fsetxattr), 8: syscalls.Supported("getxattr", GetXattr), 9: syscalls.Supported("lgetxattr", Lgetxattr), 10: syscalls.Supported("fgetxattr", Fgetxattr), 11: syscalls.Supported("listxattr", ListXattr), 12: syscalls.Supported("llistxattr", Llistxattr), 13: syscalls.Supported("flistxattr", Flistxattr), 14: syscalls.Supported("removexattr", RemoveXattr), 15: syscalls.Supported("lremovexattr", Lremovexattr), 16: syscalls.Supported("fremovexattr", Fremovexattr), 17: syscalls.Supported("getcwd", Getcwd), 18: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), 19: syscalls.SupportedPoint("eventfd2", Eventfd2, PointEventfd2), 20: syscalls.Supported("epoll_create1", EpollCreate1), 21: syscalls.Supported("epoll_ctl", EpollCtl), 22: syscalls.Supported("epoll_pwait", EpollPwait), 23: syscalls.SupportedPoint("dup", Dup, PointDup), 24: syscalls.SupportedPoint("dup3", Dup3, PointDup3), 25: syscalls.SupportedPoint("fcntl", Fcntl, PointFcntl), 26: syscalls.PartiallySupportedPoint("inotify_init1", InotifyInit1, PointInotifyInit1, "inotify events are only available inside the sandbox.", nil), 27: syscalls.PartiallySupportedPoint("inotify_add_watch", InotifyAddWatch, PointInotifyAddWatch, "inotify events are only available inside the sandbox.", nil), 28: syscalls.PartiallySupportedPoint("inotify_rm_watch", InotifyRmWatch, PointInotifyRmWatch, "inotify events are only available inside the sandbox.", nil), 29: syscalls.Supported("ioctl", Ioctl), 30: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 31: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 32: syscalls.Supported("flock", Flock), 33: syscalls.Supported("mknodat", Mknodat), 34: syscalls.Supported("mkdirat", Mkdirat), 35: syscalls.Supported("unlinkat", Unlinkat), 36: syscalls.Supported("symlinkat", Symlinkat), 37: syscalls.Supported("linkat", Linkat), 38: syscalls.Supported("renameat", Renameat), 39: syscalls.Supported("umount2", Umount2), 40: syscalls.Supported("mount", Mount), 41: syscalls.Supported("pivot_root", PivotRoot), 42: syscalls.Error("nfsservctl", linuxerr.ENOSYS, "Removed after Linux 3.1.", nil), 43: syscalls.Supported("statfs", Statfs), 44: syscalls.Supported("fstatfs", Fstatfs), 45: syscalls.Supported("truncate", Truncate), 46: syscalls.Supported("ftruncate", Ftruncate), 47: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), 48: syscalls.Supported("faccessat", Faccessat), 49: syscalls.SupportedPoint("chdir", Chdir, PointChdir), 50: syscalls.SupportedPoint("fchdir", Fchdir, PointFchdir), 51: syscalls.SupportedPoint("chroot", Chroot, PointChroot), 52: syscalls.Supported("fchmod", Fchmod), 53: syscalls.Supported("fchmodat", 
Fchmodat), 54: syscalls.Supported("fchownat", Fchownat), 55: syscalls.Supported("fchown", Fchown), 56: syscalls.SupportedPoint("openat", Openat, PointOpenat), 57: syscalls.SupportedPoint("close", Close, PointClose), 58: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), 59: syscalls.SupportedPoint("pipe2", Pipe2, PointPipe2), 60: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations 61: syscalls.Supported("getdents64", Getdents64), 62: syscalls.Supported("lseek", Lseek), 63: syscalls.SupportedPoint("read", Read, PointRead), 64: syscalls.SupportedPoint("write", Write, PointWrite), 65: syscalls.SupportedPoint("readv", Readv, PointReadv), 66: syscalls.SupportedPoint("writev", Writev, PointWritev), 67: syscalls.SupportedPoint("pread64", Pread64, PointPread64), 68: syscalls.SupportedPoint("pwrite64", Pwrite64, PointPwrite64), 69: syscalls.SupportedPoint("preadv", Preadv, PointPreadv), 70: syscalls.SupportedPoint("pwritev", Pwritev, PointPwritev), 71: syscalls.Supported("sendfile", Sendfile), 72: syscalls.Supported("pselect6", Pselect6), 73: syscalls.Supported("ppoll", Ppoll), 74: syscalls.SupportedPoint("signalfd4", Signalfd4, PointSignalfd4), 75: syscalls.ErrorWithEvent("vmsplice", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) 76: syscalls.Supported("splice", Splice), 77: syscalls.Supported("tee", Tee), 78: syscalls.Supported("readlinkat", Readlinkat), 79: syscalls.Supported("newfstatat", Newfstatat), 80: syscalls.Supported("fstat", Fstat), 81: syscalls.Supported("sync", Sync), 82: syscalls.Supported("fsync", Fsync), 83: syscalls.Supported("fdatasync", Fdatasync), 84: syscalls.Supported("sync_file_range", SyncFileRange), 85: syscalls.SupportedPoint("timerfd_create", TimerfdCreate, PointTimerfdCreate), 86: syscalls.SupportedPoint("timerfd_settime", TimerfdSettime, PointTimerfdSettime), 87: syscalls.SupportedPoint("timerfd_gettime", TimerfdGettime, PointTimerfdGettime), 88: syscalls.Supported("utimensat", Utimensat), 89: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), 90: syscalls.Supported("capget", Capget), 91: syscalls.Supported("capset", Capset), 92: syscalls.ErrorWithEvent("personality", linuxerr.EINVAL, "Unable to change personality.", nil), 93: syscalls.Supported("exit", Exit), 94: syscalls.Supported("exit_group", ExitGroup), 95: syscalls.Supported("waitid", Waitid), 96: syscalls.Supported("set_tid_address", SetTidAddress), 97: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. 
Network namespaces supported but must be empty.", nil), 98: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), 99: syscalls.Supported("set_robust_list", SetRobustList), 100: syscalls.Supported("get_robust_list", GetRobustList), 101: syscalls.Supported("nanosleep", Nanosleep), 102: syscalls.Supported("getitimer", Getitimer), 103: syscalls.Supported("setitimer", Setitimer), 104: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), 105: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), 106: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), 107: syscalls.Supported("timer_create", TimerCreate), 108: syscalls.Supported("timer_gettime", TimerGettime), 109: syscalls.Supported("timer_getoverrun", TimerGetoverrun), 110: syscalls.Supported("timer_settime", TimerSettime), 111: syscalls.Supported("timer_delete", TimerDelete), 112: syscalls.Supported("clock_settime", ClockSettime), 113: syscalls.Supported("clock_gettime", ClockGettime), 114: syscalls.Supported("clock_getres", ClockGetres), 115: syscalls.Supported("clock_nanosleep", ClockNanosleep), 116: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), 117: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), 118: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), 119: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), 120: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), 121: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), 122: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), 123: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), 124: syscalls.Supported("sched_yield", SchedYield), 125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), 126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), 127: syscalls.ErrorWithEvent("sched_rr_get_interval", linuxerr.EPERM, "", nil), 128: syscalls.Supported("restart_syscall", RestartSyscall), 129: syscalls.Supported("kill", Kill), 130: syscalls.Supported("tkill", Tkill), 131: syscalls.Supported("tgkill", Tgkill), 132: syscalls.Supported("sigaltstack", Sigaltstack), 133: syscalls.Supported("rt_sigsuspend", RtSigsuspend), 134: syscalls.Supported("rt_sigaction", RtSigaction), 135: syscalls.Supported("rt_sigprocmask", RtSigprocmask), 136: syscalls.Supported("rt_sigpending", RtSigpending), 137: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), 138: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), 139: syscalls.Supported("rt_sigreturn", RtSigreturn), 140: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), 141: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), 142: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), 143: syscalls.Supported("setregid", Setregid), 144: syscalls.SupportedPoint("setgid", Setgid, PointSetgid), 145: syscalls.Supported("setreuid", Setreuid), 146: syscalls.SupportedPoint("setuid", Setuid, PointSetuid), 147: syscalls.SupportedPoint("setresuid", Setresuid, PointSetresuid), 148: syscalls.Supported("getresuid", Getresuid), 149: syscalls.SupportedPoint("setresgid", 
Setresgid, PointSetresgid), 150: syscalls.Supported("getresgid", Getresgid), 151: syscalls.ErrorWithEvent("setfsuid", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) 152: syscalls.ErrorWithEvent("setfsgid", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) 153: syscalls.Supported("times", Times), 154: syscalls.Supported("setpgid", Setpgid), 155: syscalls.Supported("getpgid", Getpgid), 156: syscalls.Supported("getsid", Getsid), 157: syscalls.SupportedPoint("setsid", Setsid, PointSetsid), 158: syscalls.Supported("getgroups", Getgroups), 159: syscalls.Supported("setgroups", Setgroups), 160: syscalls.Supported("uname", Uname), 161: syscalls.Supported("sethostname", Sethostname), 162: syscalls.Supported("setdomainname", Setdomainname), 163: syscalls.Supported("getrlimit", Getrlimit), 164: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), 165: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), 166: syscalls.Supported("umask", Umask), 167: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), 168: syscalls.Supported("getcpu", Getcpu), 169: syscalls.Supported("gettimeofday", Gettimeofday), 170: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), 171: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), 172: syscalls.Supported("getpid", Getpid), 173: syscalls.Supported("getppid", Getppid), 174: syscalls.Supported("getuid", Getuid), 175: syscalls.Supported("geteuid", Geteuid), 176: syscalls.Supported("getgid", Getgid), 177: syscalls.Supported("getegid", Getegid), 178: syscalls.Supported("gettid", Gettid), 179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), 180: syscalls.Supported("mq_open", MqOpen), 181: syscalls.Supported("mq_unlink", MqUnlink), 182: syscalls.ErrorWithEvent("mq_timedsend", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 183: syscalls.ErrorWithEvent("mq_timedreceive", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 184: syscalls.ErrorWithEvent("mq_notify", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 185: syscalls.ErrorWithEvent("mq_getsetattr", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) 186: syscalls.Supported("msgget", Msgget), 187: syscalls.Supported("msgctl", Msgctl), 188: syscalls.Supported("msgrcv", Msgrcv), 189: syscalls.Supported("msgsnd", Msgsnd), 190: syscalls.Supported("semget", Semget), 191: syscalls.Supported("semctl", Semctl), 192: syscalls.Supported("semtimedop", Semtimedop), 193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), 194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), 195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), 196: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), 197: syscalls.Supported("shmdt", Shmdt), 198: syscalls.SupportedPoint("socket", Socket, PointSocket), 199: syscalls.SupportedPoint("socketpair", SocketPair, PointSocketpair), 200: syscalls.SupportedPoint("bind", Bind, PointBind), 201: syscalls.Supported("listen", Listen), 202: syscalls.SupportedPoint("accept", Accept, 
PointAccept), 203: syscalls.SupportedPoint("connect", Connect, PointConnect), 204: syscalls.Supported("getsockname", GetSockName), 205: syscalls.Supported("getpeername", GetPeerName), 206: syscalls.Supported("sendto", SendTo), 207: syscalls.Supported("recvfrom", RecvFrom), 208: syscalls.Supported("setsockopt", SetSockOpt), 209: syscalls.Supported("getsockopt", GetSockOpt), 210: syscalls.Supported("shutdown", Shutdown), 211: syscalls.Supported("sendmsg", SendMsg), 212: syscalls.Supported("recvmsg", RecvMsg), 213: syscalls.Supported("readahead", Readahead), 214: syscalls.Supported("brk", Brk), 215: syscalls.Supported("munmap", Munmap), 216: syscalls.Supported("mremap", Mremap), 217: syscalls.Error("add_key", linuxerr.EACCES, "Not available to user.", nil), 218: syscalls.Error("request_key", linuxerr.EACCES, "Not available to user.", nil), 219: syscalls.PartiallySupported("keyctl", Keyctl, "Only supports session keyrings with zero keys in them.", nil), 220: syscalls.PartiallySupportedPoint("clone", Clone, PointClone, "Options CLONE_PIDFD, CLONE_NEWCGROUP, CLONE_PARENT, CLONE_NEWTIME, CLONE_CLEAR_SIGHAND, and CLONE_SYSVSEM not supported.", nil), 221: syscalls.SupportedPoint("execve", Execve, PointExecve), 222: syscalls.Supported("mmap", Mmap), 223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), 224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), 225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), 226: syscalls.Supported("mprotect", Mprotect), 227: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), 228: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 229: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 230: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), 232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), 233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), 234: syscalls.ErrorWithEvent("remap_file_pages", linuxerr.ENOSYS, "Deprecated since Linux 3.16.", nil), 235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. 
Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), 236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), 237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), 238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), 239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) 240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), 241: syscalls.ErrorWithEvent("perf_event_open", linuxerr.ENODEV, "No support for perf counters", nil), 242: syscalls.SupportedPoint("accept4", Accept4, PointAccept4), 243: syscalls.Supported("recvmmsg", RecvMMsg), 260: syscalls.Supported("wait4", Wait4), 261: syscalls.SupportedPoint("prlimit64", Prlimit64, PointPrlimit64), 262: syscalls.ErrorWithEvent("fanotify_init", linuxerr.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 263: syscalls.ErrorWithEvent("fanotify_mark", linuxerr.ENOSYS, "Needs CONFIG_FANOTIFY", nil), 264: syscalls.Error("name_to_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 265: syscalls.Error("open_by_handle_at", linuxerr.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), 266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), 267: syscalls.Supported("syncfs", Syncfs), 268: syscalls.Supported("setns", Setns), 269: syscalls.Supported("sendmmsg", SendMMsg), 270: syscalls.Supported("process_vm_readv", ProcessVMReadv), 271: syscalls.Supported("process_vm_writev", ProcessVMWritev), 272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), 273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), 274: syscalls.ErrorWithEvent("sched_setattr", linuxerr.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) 275: syscalls.ErrorWithEvent("sched_getattr", linuxerr.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) 276: syscalls.Supported("renameat2", Renameat2), 277: syscalls.Supported("seccomp", Seccomp), 278: syscalls.Supported("getrandom", GetRandom), 279: syscalls.Supported("memfd_create", MemfdCreate), 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), 281: syscalls.SupportedPoint("execveat", Execveat, PointExecveat), 282: syscalls.ErrorWithEvent("userfaultfd", linuxerr.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) 283: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil), 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls after 284 are "backports" from versions of Linux after 4.4. 285: syscalls.ErrorWithEvent("copy_file_range", linuxerr.ENOSYS, "", nil), 286: syscalls.SupportedPoint("preadv2", Preadv2, PointPreadv2), 287: syscalls.SupportedPoint("pwritev2", Pwritev2, PointPwritev2), 288: syscalls.ErrorWithEvent("pkey_mprotect", linuxerr.ENOSYS, "", nil), 289: syscalls.ErrorWithEvent("pkey_alloc", linuxerr.ENOSYS, "", nil), 290: syscalls.ErrorWithEvent("pkey_free", linuxerr.ENOSYS, "", nil), 291: syscalls.Supported("statx", Statx), 292: syscalls.ErrorWithEvent("io_pgetevents", linuxerr.ENOSYS, "", nil), 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), // Linux skips ahead to syscall 424 to sync numbers between arches. 
424: syscalls.ErrorWithEvent("pidfd_send_signal", linuxerr.ENOSYS, "", nil), 425: syscalls.PartiallySupported("io_uring_setup", IOUringSetup, "Not all flags and functionality supported.", nil), 426: syscalls.PartiallySupported("io_uring_enter", IOUringEnter, "Not all flags and functionality supported.", nil), 427: syscalls.ErrorWithEvent("io_uring_register", linuxerr.ENOSYS, "", nil), 428: syscalls.ErrorWithEvent("open_tree", linuxerr.ENOSYS, "", nil), 429: syscalls.ErrorWithEvent("move_mount", linuxerr.ENOSYS, "", nil), 430: syscalls.ErrorWithEvent("fsopen", linuxerr.ENOSYS, "", nil), 431: syscalls.ErrorWithEvent("fsconfig", linuxerr.ENOSYS, "", nil), 432: syscalls.ErrorWithEvent("fsmount", linuxerr.ENOSYS, "", nil), 433: syscalls.ErrorWithEvent("fspick", linuxerr.ENOSYS, "", nil), 434: syscalls.ErrorWithEvent("pidfd_open", linuxerr.ENOSYS, "", nil), 435: syscalls.PartiallySupported("clone3", Clone3, "Options CLONE_PIDFD, CLONE_NEWCGROUP, CLONE_INTO_CGROUP, CLONE_NEWTIME, CLONE_CLEAR_SIGHAND, CLONE_PARENT, CLONE_SYSVSEM and clone_args.set_tid are not supported.", nil), 436: syscalls.Supported("close_range", CloseRange), 439: syscalls.Supported("faccessat2", Faccessat2), 441: syscalls.Supported("epoll_pwait2", EpollPwait2), }, Emulate: map[hostarch.Addr]uintptr{}, Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { t.Kernel().EmitUnimplementedEvent(t, sysno) return 0, linuxerr.ENOSYS }, } func init() { kernel.RegisterSyscallTable(AMD64) kernel.RegisterSyscallTable(ARM64) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/linux_abi_autogen_unsafe.go000066400000000000000000000614451465435605700313420ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. package linux import ( "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "io" "reflect" "runtime" "unsafe" ) // Marshallable types used by this file. var _ marshal.Marshallable = (*MessageHeader64)(nil) var _ marshal.Marshallable = (*SchedParam)(nil) var _ marshal.Marshallable = (*multipleMessageHeader64)(nil) var _ marshal.Marshallable = (*rlimit64)(nil) var _ marshal.Marshallable = (*sigSetWithSize)(nil) var _ marshal.Marshallable = (*userSockFprog)(nil) // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *sigSetWithSize) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *sigSetWithSize) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.sigsetAddr)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(s.sizeofSigset)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *sigSetWithSize) UnmarshalBytes(src []byte) []byte { s.sigsetAddr = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] s.sizeofSigset = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *sigSetWithSize) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *sigSetWithSize) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
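// Illustrative sketch (not part of the gVisor sources above): the AMD64 and
// ARM64 tables map Linux syscall numbers to handlers and fall back to a
// Missing callback that reports ENOSYS for absent entries. The minimal,
// self-contained program below shows that dispatch shape using only the
// standard library; the table, handler and dispatch names are hypothetical
// simplifications of kernel.SyscallTable.
package main

import (
	"errors"
	"fmt"
)

// handler stands in for a syscall implementation.
type handler func(args []uintptr) (uintptr, error)

// errNoSys mimics returning ENOSYS for unimplemented syscall numbers.
var errNoSys = errors.New("function not implemented (ENOSYS)")

// table maps syscall numbers to handlers, like the Table fields above.
var table = map[uintptr]handler{
	1: func(args []uintptr) (uintptr, error) { return uintptr(len(args)), nil }, // toy "write"
}

// dispatch looks up a handler and otherwise reports an ENOSYS-style error,
// mirroring the Missing callback in the tables above.
func dispatch(sysno uintptr, args []uintptr) (uintptr, error) {
	if h, ok := table[sysno]; ok {
		return h(args)
	}
	return 0, fmt.Errorf("sysno %d: %w", sysno, errNoSys)
}

func main() {
	if _, err := dispatch(9999, nil); err != nil {
		fmt.Println(err) // sysno 9999: function not implemented (ENOSYS)
	}
}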
func (s *sigSetWithSize) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (s *sigSetWithSize) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *sigSetWithSize) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *sigSetWithSize) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *sigSetWithSize) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *sigSetWithSize) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (r *rlimit64) SizeBytes() int { return 16 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (r *rlimit64) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.Cur)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(r.Max)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (r *rlimit64) UnmarshalBytes(src []byte) []byte { r.Cur = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] r.Max = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (r *rlimit64) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (r *rlimit64) MarshalUnsafe(dst []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(r), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (r *rlimit64) UnmarshalUnsafe(src []byte) []byte { size := r.SizeBytes() gohacks.Memmove(unsafe.Pointer(r), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (r *rlimit64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (r *rlimit64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyOutN(cc, addr, r.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (r *rlimit64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (r *rlimit64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return r.CopyInN(cc, addr, r.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (r *rlimit64) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(r))) hdr.Len = r.SizeBytes() hdr.Cap = r.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that r // must live until the use above. runtime.KeepAlive(r) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (s *SchedParam) SizeBytes() int { return 4 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (s *SchedParam) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint32(dst[:4], uint32(s.schedPriority)) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (s *SchedParam) UnmarshalBytes(src []byte) []byte { s.schedPriority = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (s *SchedParam) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (s *SchedParam) MarshalUnsafe(dst []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(s), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (s *SchedParam) UnmarshalUnsafe(src []byte) []byte { size := s.SizeBytes() gohacks.Memmove(unsafe.Pointer(s), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. 
func (s *SchedParam) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (s *SchedParam) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyOutN(cc, addr, s.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (s *SchedParam) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (s *SchedParam) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return s.CopyInN(cc, addr, s.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (s *SchedParam) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(s))) hdr.Len = s.SizeBytes() hdr.Cap = s.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that s // must live until the use above. runtime.KeepAlive(s) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (u *userSockFprog) SizeBytes() int { return 10 + 1*6 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (u *userSockFprog) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint16(dst[:2], uint16(u.Len)) dst = dst[2:] // Padding: dst[:sizeof(byte)*6] ~= [6]byte{0} dst = dst[1*(6):] hostarch.ByteOrder.PutUint64(dst[:8], uint64(u.Filter)) dst = dst[8:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (u *userSockFprog) UnmarshalBytes(src []byte) []byte { u.Len = uint16(hostarch.ByteOrder.Uint16(src[:2])) src = src[2:] // Padding: ~ copy([6]byte(u._), src[:sizeof(byte)*6]) src = src[1*(6):] u.Filter = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (u *userSockFprog) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (u *userSockFprog) MarshalUnsafe(dst []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(u), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. 
func (u *userSockFprog) UnmarshalUnsafe(src []byte) []byte { size := u.SizeBytes() gohacks.Memmove(unsafe.Pointer(u), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (u *userSockFprog) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (u *userSockFprog) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyOutN(cc, addr, u.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (u *userSockFprog) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (u *userSockFprog) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return u.CopyInN(cc, addr, u.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (u *userSockFprog) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(u))) hdr.Len = u.SizeBytes() hdr.Cap = u.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that u // must live until the use above. runtime.KeepAlive(u) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. func (m *MessageHeader64) SizeBytes() int { return 56 } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (m *MessageHeader64) MarshalBytes(dst []byte) []byte { hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.Name)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.NameLen)) dst = dst[4:] // Padding: dst[:sizeof(uint32)] ~= uint32(0) dst = dst[4:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.Iov)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.IovLen)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.Control)) dst = dst[8:] hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.ControlLen)) dst = dst[8:] hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.Flags)) dst = dst[4:] // Padding: dst[:sizeof(int32)] ~= int32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (m *MessageHeader64) UnmarshalBytes(src []byte) []byte { m.Name = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] m.NameLen = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ uint32 ~= src[:sizeof(uint32)] src = src[4:] m.Iov = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] m.IovLen = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] m.Control = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] m.ControlLen = uint64(hostarch.ByteOrder.Uint64(src[:8])) src = src[8:] m.Flags = int32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ int32 ~= src[:sizeof(int32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (m *MessageHeader64) Packed() bool { return true } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (m *MessageHeader64) MarshalUnsafe(dst []byte) []byte { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(m), uintptr(size)) return dst[size:] } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (m *MessageHeader64) UnmarshalUnsafe(src []byte) []byte { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(m), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // CopyOutN implements marshal.Marshallable.CopyOutN. func (m *MessageHeader64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (m *MessageHeader64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyOutN(cc, addr, m.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (m *MessageHeader64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (m *MessageHeader64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyInN(cc, addr, m.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (m *MessageHeader64) WriteTo(writer io.Writer) (int64, error) { // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return int64(length), err } // SizeBytes implements marshal.Marshallable.SizeBytes. 
func (m *multipleMessageHeader64) SizeBytes() int { return 8 + (*MessageHeader64)(nil).SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (m *multipleMessageHeader64) MarshalBytes(dst []byte) []byte { dst = m.msgHdr.MarshalUnsafe(dst) hostarch.ByteOrder.PutUint32(dst[:4], uint32(m.msgLen)) dst = dst[4:] // Padding: dst[:sizeof(int32)] ~= int32(0) dst = dst[4:] return dst } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (m *multipleMessageHeader64) UnmarshalBytes(src []byte) []byte { src = m.msgHdr.UnmarshalUnsafe(src) m.msgLen = uint32(hostarch.ByteOrder.Uint32(src[:4])) src = src[4:] // Padding: var _ int32 ~= src[:sizeof(int32)] src = src[4:] return src } // Packed implements marshal.Marshallable.Packed. //go:nosplit func (m *multipleMessageHeader64) Packed() bool { return m.msgHdr.Packed() } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (m *multipleMessageHeader64) MarshalUnsafe(dst []byte) []byte { if m.msgHdr.Packed() { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(m), uintptr(size)) return dst[size:] } // Type multipleMessageHeader64 doesn't have a packed layout in memory, fallback to MarshalBytes. return m.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (m *multipleMessageHeader64) UnmarshalUnsafe(src []byte) []byte { if m.msgHdr.Packed() { size := m.SizeBytes() gohacks.Memmove(unsafe.Pointer(m), unsafe.Pointer(&src[0]), uintptr(size)) return src[size:] } // Type multipleMessageHeader64 doesn't have a packed layout in memory, fallback to UnmarshalBytes. return m.UnmarshalBytes(src) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (m *multipleMessageHeader64) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !m.msgHdr.Packed() { // Type multipleMessageHeader64 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := cc.CopyScratchBuffer(m.SizeBytes()) // escapes: okay. m.MarshalBytes(buf) // escapes: fallback. return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyOut implements marshal.Marshallable.CopyOut. func (m *multipleMessageHeader64) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyOutN(cc, addr, m.SizeBytes()) } // CopyInN implements marshal.Marshallable.CopyInN. func (m *multipleMessageHeader64) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { if !m.msgHdr.Packed() { // Type multipleMessageHeader64 doesn't have a packed layout in memory, fall back to UnmarshalBytes. buf := cc.CopyScratchBuffer(m.SizeBytes()) // escapes: okay. length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Unmarshal unconditionally. If we had a short copy-in, this results in a // partially unmarshalled struct. m.UnmarshalBytes(buf) // escapes: fallback. return length, err } // Construct a slice backed by dst's underlying memory. 
var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay. // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return length, err } // CopyIn implements marshal.Marshallable.CopyIn. func (m *multipleMessageHeader64) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return m.CopyInN(cc, addr, m.SizeBytes()) } // WriteTo implements io.WriterTo.WriteTo. func (m *multipleMessageHeader64) WriteTo(writer io.Writer) (int64, error) { if !m.msgHdr.Packed() { // Type multipleMessageHeader64 doesn't have a packed layout in memory, fall back to MarshalBytes. buf := make([]byte, m.SizeBytes()) m.MarshalBytes(buf) length, err := writer.Write(buf) return int64(length), err } // Construct a slice backed by dst's underlying memory. var buf []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) hdr.Len = m.SizeBytes() hdr.Cap = m.SizeBytes() length, err := writer.Write(buf) // Since we bypassed the compiler's escape analysis, indicate that m // must live until the use above. runtime.KeepAlive(m) // escapes: replaced by intrinsic. return int64(length), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/linux_amd64_abi_autogen_unsafe.go000066400000000000000000000007721465435605700323310ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build amd64 && amd64 && amd64 // +build amd64,amd64,amd64 package linux import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/linux_amd64_state_autogen.go000066400000000000000000000001671465435605700313530ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 && amd64 && amd64 // +build amd64,amd64,amd64 package linux golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/linux_arm64_abi_autogen_unsafe.go000066400000000000000000000007721465435605700323470ustar00rootroot00000000000000// Automatically generated marshal implementation. See tools/go_marshal. // If there are issues with build constraint aggregation, see // tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here // come from the input set of files used to generate this file. This input set // is filtered based on pre-defined file suffixes related to build constraints, // see tools/defs.bzl:calculate_sets(). //go:build arm64 && arm64 && arm64 // +build arm64,arm64,arm64 package linux import ( ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/linux_arm64_state_autogen.go000066400000000000000000000001671465435605700313710ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 && arm64 && arm64 // +build arm64,arm64,arm64 package linux golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/linux_state_autogen.go000066400000000000000000000053611465435605700303610ustar00rootroot00000000000000// automatically generated by stateify. 
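// Illustrative sketch (not part of the gVisor sources above): the generated
// MarshalBytes/UnmarshalBytes methods serialize fixed-layout structs field by
// field into a byte slice and return the unused tail. The self-contained
// program below shows that round-trip pattern with encoding/binary; the
// rlimitLike type and the little-endian byte order are assumptions for
// illustration (the real code uses hostarch.ByteOrder and per-type generated
// methods).
package main

import (
	"encoding/binary"
	"fmt"
)

// rlimitLike mirrors the shape of rlimit64 above: two 8-byte fields.
type rlimitLike struct {
	Cur uint64
	Max uint64
}

// marshalBytes writes the fields into dst and returns the remainder,
// following the same "consume the prefix, return the tail" convention.
func (r *rlimitLike) marshalBytes(dst []byte) []byte {
	binary.LittleEndian.PutUint64(dst[:8], r.Cur)
	dst = dst[8:]
	binary.LittleEndian.PutUint64(dst[:8], r.Max)
	return dst[8:]
}

// unmarshalBytes is the inverse: it reads the fields back out of src.
func (r *rlimitLike) unmarshalBytes(src []byte) []byte {
	r.Cur = binary.LittleEndian.Uint64(src[:8])
	src = src[8:]
	r.Max = binary.LittleEndian.Uint64(src[:8])
	return src[8:]
}

func main() {
	in := rlimitLike{Cur: 1024, Max: 4096}
	buf := make([]byte, 16)
	in.marshalBytes(buf)

	var out rlimitLike
	out.unmarshalBytes(buf)
	fmt.Printf("round trip: %+v\n", out) // round trip: {Cur:1024 Max:4096}
}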
package linux import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (f *futexWaitRestartBlock) StateTypeName() string { return "pkg/sentry/syscalls/linux.futexWaitRestartBlock" } func (f *futexWaitRestartBlock) StateFields() []string { return []string{ "duration", "addr", "private", "val", "mask", } } func (f *futexWaitRestartBlock) beforeSave() {} // +checklocksignore func (f *futexWaitRestartBlock) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.duration) stateSinkObject.Save(1, &f.addr) stateSinkObject.Save(2, &f.private) stateSinkObject.Save(3, &f.val) stateSinkObject.Save(4, &f.mask) } func (f *futexWaitRestartBlock) afterLoad(context.Context) {} // +checklocksignore func (f *futexWaitRestartBlock) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.duration) stateSourceObject.Load(1, &f.addr) stateSourceObject.Load(2, &f.private) stateSourceObject.Load(3, &f.val) stateSourceObject.Load(4, &f.mask) } func (p *pollRestartBlock) StateTypeName() string { return "pkg/sentry/syscalls/linux.pollRestartBlock" } func (p *pollRestartBlock) StateFields() []string { return []string{ "pfdAddr", "nfds", "timeout", } } func (p *pollRestartBlock) beforeSave() {} // +checklocksignore func (p *pollRestartBlock) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.pfdAddr) stateSinkObject.Save(1, &p.nfds) stateSinkObject.Save(2, &p.timeout) } func (p *pollRestartBlock) afterLoad(context.Context) {} // +checklocksignore func (p *pollRestartBlock) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.pfdAddr) stateSourceObject.Load(1, &p.nfds) stateSourceObject.Load(2, &p.timeout) } func (n *clockNanosleepRestartBlock) StateTypeName() string { return "pkg/sentry/syscalls/linux.clockNanosleepRestartBlock" } func (n *clockNanosleepRestartBlock) StateFields() []string { return []string{ "c", "end", "rem", } } func (n *clockNanosleepRestartBlock) beforeSave() {} // +checklocksignore func (n *clockNanosleepRestartBlock) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.c) stateSinkObject.Save(1, &n.end) stateSinkObject.Save(2, &n.rem) } func (n *clockNanosleepRestartBlock) afterLoad(context.Context) {} // +checklocksignore func (n *clockNanosleepRestartBlock) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.c) stateSourceObject.Load(1, &n.end) stateSourceObject.Load(2, &n.rem) } func init() { state.Register((*futexWaitRestartBlock)(nil)) state.Register((*pollRestartBlock)(nil)) state.Register((*clockNanosleepRestartBlock)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/path.go000066400000000000000000000050571465435605700252360ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
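// Illustrative sketch (not part of the gVisor sources above): the
// stateify-generated code checkpoints each restart block by saving its fields,
// by index, into a sink and later loading them back from a source. The
// program below reproduces that index-addressed save/load pattern with a toy
// sink and source backed by maps; the sink, source and pollState names are
// hypothetical simplifications of gvisor.dev/gvisor/pkg/state.
package main

import "fmt"

// sink collects field values keyed by field index, like state.Sink above.
type sink map[int]any

// source hands the values back out, like state.Source above.
type source map[int]any

// pollState mirrors the shape of pollRestartBlock: address, count, timeout.
type pollState struct {
	pfdAddr uint64
	nfds    uint
	timeout int64
}

// stateSave writes each field under a stable index.
func (p *pollState) stateSave(s sink) {
	s[0] = p.pfdAddr
	s[1] = p.nfds
	s[2] = p.timeout
}

// stateLoad restores each field from the same indices.
func (p *pollState) stateLoad(s source) {
	p.pfdAddr = s[0].(uint64)
	p.nfds = s[1].(uint)
	p.timeout = s[2].(int64)
}

func main() {
	saved := sink{}
	(&pollState{pfdAddr: 0x7f00, nfds: 3, timeout: -1}).stateSave(saved)

	var restored pollState
	restored.stateLoad(source(saved))
	fmt.Printf("restored: %+v\n", restored) // restored: {pfdAddr:32512 nfds:3 timeout:-1}
}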
package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" ) func copyInPath(t *kernel.Task, addr hostarch.Addr) (fspath.Path, error) { pathname, err := t.CopyInString(addr, linux.PATH_MAX) if err != nil { return fspath.Path{}, err } return fspath.Parse(pathname), nil } type taskPathOperation struct { pop vfs.PathOperation haveStartRef bool } func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) { root := t.FSContext().RootDirectory() start := root haveStartRef := false if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { root.DecRef(t) return taskPathOperation{}, linuxerr.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectory() haveStartRef = true } else { dirfile := t.GetFile(dirfd) if dirfile == nil { root.DecRef(t) return taskPathOperation{}, linuxerr.EBADF } start = dirfile.VirtualDentry() start.IncRef() haveStartRef = true dirfile.DecRef(t) } } return taskPathOperation{ pop: vfs.PathOperation{ Root: root, Start: start, Path: path, FollowFinalSymlink: bool(shouldFollowFinalSymlink), }, haveStartRef: haveStartRef, }, nil } func (tpop *taskPathOperation) Release(t *kernel.Task) { tpop.pop.Root.DecRef(t) if tpop.haveStartRef { tpop.pop.Start.DecRef(t) tpop.haveStartRef = false } } type shouldAllowEmptyPath bool const ( disallowEmptyPath shouldAllowEmptyPath = false allowEmptyPath shouldAllowEmptyPath = true ) type shouldFollowFinalSymlink bool const ( nofollowFinalSymlink shouldFollowFinalSymlink = false followFinalSymlink shouldFollowFinalSymlink = true ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/points.go000066400000000000000000001011711465435605700256100ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
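// Illustrative sketch (not part of the gVisor sources above):
// getTaskPathOperation picks the starting directory for a lookup: the root
// for absolute paths, the working directory for AT_FDCWD, or the directory
// referred to by dirfd otherwise, and rejects empty paths unless explicitly
// allowed. The program below captures only that branching with plain strings;
// startDir, fdDirs and the error values are hypothetical stand-ins, not
// gVisor types.
package main

import (
	"errors"
	"fmt"
	"strings"
)

const atFDCWD = -100 // numeric value of linux.AT_FDCWD

var (
	errNoEnt = errors.New("no such file or directory (ENOENT)")
	errBadF  = errors.New("bad file descriptor (EBADF)")
)

// startDir decides where a relative lookup begins, mirroring the branches in
// getTaskPathOperation.
func startDir(dirfd int, path, cwd string, fdDirs map[int]string, allowEmpty bool) (string, error) {
	if strings.HasPrefix(path, "/") {
		return "/", nil // absolute paths always start at the root
	}
	if path == "" && !allowEmpty {
		return "", errNoEnt
	}
	if dirfd == atFDCWD {
		return cwd, nil
	}
	dir, ok := fdDirs[dirfd]
	if !ok {
		return "", errBadF
	}
	return dir, nil
}

func main() {
	fdDirs := map[int]string{3: "/tmp"}
	start, _ := startDir(3, "file.txt", "/home/user", fdDirs, false)
	fmt.Println(start) // /tmp
}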
package linux import ( "fmt" "google.golang.org/protobuf/proto" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/seccheck" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" "gvisor.dev/gvisor/pkg/usermem" ) func newExitMaybe(info kernel.SyscallInfo) *pb.Exit { if !info.Exit { return nil } return &pb.Exit{ Result: int64(info.Rval), Errorno: int64(info.Errno), } } func getFilePath(t *kernel.Task, fd int32) string { if fd < 0 { return "" } fdt := t.FDTable() if fdt == nil { return "[err: no FD table]" } file, _ := fdt.Get(fd) if file == nil { return "[err: FD not found]" } defer file.DecRef(t) root := t.MountNamespace().Root(t) defer root.DecRef(t) path, err := t.Kernel().VFS().PathnameWithDeleted(t, root, file.VirtualDentry()) if err != nil { return fmt.Sprintf("[err: %v]", err) } return path } func getIovecSize(t *kernel.Task, addr hostarch.Addr, iovcnt int) uint64 { dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{AddressSpaceActive: true}) if err != nil { return 0 } return uint64(dst.NumBytes()) } // PointOpen converts open(2) syscall to proto. func PointOpen(t *kernel.Task, _ seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Open{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: linux.AT_FDCWD, Flags: info.Args[1].Uint(), Mode: uint32(info.Args[2].ModeT()), } addr := info.Args[0].Pointer() if addr > 0 { path, err := t.CopyInString(addr, linux.PATH_MAX) if err == nil { // if NO error p.Pathname = path } } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_OPEN } // PointOpenat converts openat(2) syscall to proto. func PointOpenat(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Open{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), Flags: info.Args[2].Uint(), } addr := info.Args[1].Pointer() if addr > 0 { path, err := t.CopyInString(addr, linux.PATH_MAX) if err == nil { // if NO error p.Pathname = path } } if p.Flags&linux.O_CREAT != 0 { p.Mode = uint32(info.Args[3].ModeT()) } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_OPEN } // PointCreat converts creat(2) syscall to proto. func PointCreat(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Open{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: linux.AT_FDCWD, Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_TRUNC, Mode: uint32(info.Args[1].ModeT()), } addr := info.Args[0].Pointer() if addr > 0 { path, err := t.CopyInString(addr, linux.PATH_MAX) if err == nil { // if NO error p.Pathname = path } } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_OPEN } // PointClose converts close(2) syscall to proto. 
func PointClose(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Close{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_CLOSE } // PointRead converts read(2) syscall to proto. func PointRead(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Read{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), Count: uint64(info.Args[2].SizeT()), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_READ } // PointPread64 converts pread64(2) syscall to proto. func PointPread64(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Read{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), Count: uint64(info.Args[2].SizeT()), HasOffset: true, Offset: info.Args[3].Int64(), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_READ } // PointReadv converts readv(2) syscall to proto. func PointReadv(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Read{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), Count: getIovecSize(t, info.Args[1].Pointer(), int(info.Args[2].Int())), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_READ } // PointPreadv converts preadv(2) syscall to proto. func PointPreadv(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Read{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), Count: getIovecSize(t, info.Args[1].Pointer(), int(info.Args[2].Int())), HasOffset: true, Offset: info.Args[3].Int64(), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_READ } // PointPreadv2 converts preadv2(2) syscall to proto. func PointPreadv2(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Read{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), Count: getIovecSize(t, info.Args[1].Pointer(), int(info.Args[2].Int())), HasOffset: true, Offset: info.Args[3].Int64(), Flags: info.Args[5].Uint(), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_READ } // PointWrite converts write(2) syscall to proto. 
func PointWrite(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Write{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), Count: uint64(info.Args[2].SizeT()), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_WRITE } // PointPwrite64 converts pwrite64(2) syscall to proto. func PointPwrite64(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Write{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), Count: uint64(info.Args[2].SizeT()), HasOffset: true, Offset: info.Args[3].Int64(), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_WRITE } // PointWritev converts writev(2) syscall to proto. func PointWritev(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Write{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), Count: getIovecSize(t, info.Args[1].Pointer(), int(info.Args[2].Int())), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_WRITE } // PointPwritev converts pwritev(2) syscall to proto. func PointPwritev(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Write{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), Count: getIovecSize(t, info.Args[1].Pointer(), int(info.Args[2].Int())), HasOffset: true, Offset: info.Args[3].Int64(), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_WRITE } // PointPwritev2 converts pwritev2(2) syscall to proto. func PointPwritev2(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Write{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), Count: getIovecSize(t, info.Args[1].Pointer(), int(info.Args[2].Int())), HasOffset: true, Offset: info.Args[3].Int64(), Flags: info.Args[5].Uint(), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_WRITE } // PointSocket converts socket(2) syscall to proto. func PointSocket(_ *kernel.Task, _ seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Socket{ ContextData: cxtData, Sysno: uint64(info.Sysno), Domain: info.Args[0].Int(), Type: info.Args[1].Int(), Protocol: info.Args[2].Int(), } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_SOCKET } // PointConnect converts connect(2) syscall to proto. 
func PointConnect(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Connect{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), } addr := info.Args[1].Pointer() addrlen := info.Args[2].Uint() p.Address, _ = CaptureAddress(t, addr, addrlen) if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_CONNECT } // PointExecve converts execve(2) syscall to proto. func PointExecve(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Execve{ ContextData: cxtData, Sysno: uint64(info.Sysno), } if pathname, err := t.CopyInString(info.Args[0].Pointer(), linux.PATH_MAX); err == nil { // if NO error p.Pathname = pathname } if argvAddr := info.Args[1].Pointer(); argvAddr != 0 { if argv, err := t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize); err == nil { // if NO error p.Argv = argv } } if fields.Local.Contains(seccheck.FieldSyscallExecveEnvv) { if envvAddr := info.Args[2].Pointer(); envvAddr != 0 { if envv, err := t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize); err == nil { // if NO error p.Envv = envv } } } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_EXECVE } // PointExecveat converts execveat(2) syscall to proto. func PointExecveat(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Execve{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: int64(info.Args[0].Int()), Flags: info.Args[4].Uint(), } if pathname, err := t.CopyInString(info.Args[1].Pointer(), linux.PATH_MAX); err == nil { // if NO error p.Pathname = pathname } if argvAddr := info.Args[2].Pointer(); argvAddr != 0 { if argv, err := t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize); err == nil { // if NO error p.Argv = argv } } if fields.Local.Contains(seccheck.FieldSyscallExecveEnvv) { if envvAddr := info.Args[3].Pointer(); envvAddr != 0 { if envv, err := t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize); err == nil { // if NO error p.Envv = envv } } } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_EXECVE } // pointChdirHelper converts chdir(2) and fchdir(2) syscall to proto. func pointChdirHelper(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo, fd int64, path hostarch.Addr) (proto.Message, pb.MessageType) { p := &pb.Chdir{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: fd, } if path > 0 { pathname, err := t.CopyInString(path, linux.PATH_MAX) if err == nil { // if NO error p.Pathname = pathname } } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_CHDIR } // PointChdir calls pointChdirHelper to convert chdir(2) syscall to proto. func PointChdir(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { path := info.Args[0].Pointer() return pointChdirHelper(t, fields, cxtData, info, linux.AT_FDCWD, path) } // PointFchdir calls pointChdirHelper to convert fchdir(2) syscall to proto. 
func PointFchdir(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { fd := int64(info.Args[0].Int()) path := info.Args[1].Pointer() return pointChdirHelper(t, fields, cxtData, info, fd, path) } // pointSetidHelper converts setuid(2) and setgid(2) syscall to proto. func pointSetidHelper(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo, id uint32) (proto.Message, pb.MessageType) { p := &pb.Setid{ ContextData: cxtData, Sysno: uint64(info.Sysno), Id: id, } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_SETID } // PointSetuid calls pointSetidHelper to convert setuid(2) syscall to proto. func PointSetuid(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { id := info.Args[0].Uint() return pointSetidHelper(t, fields, cxtData, info, id) } // PointSetgid calls pointSetidHelper to convert setgid(2) syscall to proto. func PointSetgid(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { id := info.Args[0].Uint() return pointSetidHelper(t, fields, cxtData, info, id) } // PointSetsid calls pointSetidHelper to convert setsid(2) syscall to proto. func PointSetsid(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { return pointSetidHelper(t, fields, cxtData, info, 0) } // pointSetresidHelper converts setresuid(2) and setresgid(2) syscall to proto. func pointSetresidHelper(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Setresid{ ContextData: cxtData, Sysno: uint64(info.Sysno), Rid: info.Args[0].Uint(), Eid: info.Args[1].Uint(), Sid: info.Args[2].Uint(), } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_SETRESID } // PointSetresuid calls pointSetresidHelper to convert setresuid(2) syscall to proto. func PointSetresuid(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { return pointSetresidHelper(t, fields, cxtData, info) } // PointSetresgid calls pointSetresidHelper to convert setresgid(2) syscall to proto. func PointSetresgid(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { return pointSetresidHelper(t, fields, cxtData, info) } func rlimits(rlimit rlimit64) *pb.StructRlimit { limit := rlimit.toLimit() return &pb.StructRlimit{ Cur: limit.Cur, Max: limit.Max, } } // PointPrlimit64 call converts prlimit64(2) syscall to proto. func PointPrlimit64(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Prlimit{ ContextData: cxtData, Sysno: uint64(info.Sysno), Pid: int32(info.Args[0].Int()), Resource: info.Args[1].Int64(), } if newRlimitAddr := info.Args[2].Pointer(); newRlimitAddr != 0 { var nrl rlimit64 if err := nrl.copyIn(t, newRlimitAddr); err == nil { // if NO error p.NewLimit = rlimits(nrl) } } if oldRlimitAddr := info.Args[3].Pointer(); oldRlimitAddr != 0 { var orl rlimit64 if err := orl.copyIn(t, oldRlimitAddr); err == nil { // if NO error p.OldLimit = rlimits(orl) } } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_PRLIMIT64 } // pipeHelper converts pipe(2) and pipe2(2) syscall to proto. 
func pipeHelper(t *kernel.Task, cxtData *pb.ContextData, info kernel.SyscallInfo, flags uint32) (proto.Message, pb.MessageType) { p := &pb.Pipe{ ContextData: cxtData, Sysno: uint64(info.Sysno), Flags: flags, } if info.Exit { if pipeFDAddr := info.Args[0].Pointer(); pipeFDAddr != 0 { var pipeFDs [2]int32 if _, err := primitive.CopyInt32SliceIn(t, pipeFDAddr, pipeFDs[:]); err == nil { // if NO error p.Reader = pipeFDs[0] p.Writer = pipeFDs[1] } } } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_PIPE } // PointPipe calls pipeHelper to convert pipe(2) syscall to proto. func PointPipe(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { return pipeHelper(t, cxtData, info, 0) } // PointPipe2 calls pipeHelper to convert pipe2(2) syscall to proto. func PointPipe2(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { flags := info.Args[1].Uint() return pipeHelper(t, cxtData, info, flags) } // eventfdHelper converts eventfd(2) and eventfd2(2) syscall to proto. func eventfdHelper(cxtData *pb.ContextData, info kernel.SyscallInfo, flags uint32) (proto.Message, pb.MessageType) { p := &pb.Eventfd{ ContextData: cxtData, Sysno: uint64(info.Sysno), Val: int32(info.Args[0].Int()), Flags: flags, } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_EVENTFD } // PointEventfd calls eventfdHelper to convert eventfd(2) syscall to proto. func PointEventfd(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { return eventfdHelper(cxtData, info, 0) } // PointEventfd2 calls eventfdHelper to convert eventfd2(2) syscall to proto. func PointEventfd2(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { flags := info.Args[1].Uint() return eventfdHelper(cxtData, info, flags) } // PointFcntl converts fcntl(2) syscall to proto. func PointFcntl(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Fcntl{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: info.Args[0].Int(), Cmd: info.Args[1].Int(), Args: info.Args[2].Int64(), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_FCNTL } // pointDupHelper converts dup(2), dup2(2), and dup3(2) syscall to proto. func pointDupHelper(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo, oldFD, newFD int32, flags uint32) (proto.Message, pb.MessageType) { p := &pb.Dup{ ContextData: cxtData, Sysno: uint64(info.Sysno), OldFd: oldFD, NewFd: newFD, Flags: flags, } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.OldFd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_DUP } // PointDup calls pointDupHelper to convert dup(2) syscall to proto. func PointDup(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { oldFD := info.Args[0].Int() return pointDupHelper(t, fields, cxtData, info, oldFD, 0, 0) } // PointDup2 calls pointDupHelper to convert dup2(2) syscall to proto. 
func PointDup2(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { oldFD := info.Args[0].Int() newFD := info.Args[1].Int() return pointDupHelper(t, fields, cxtData, info, oldFD, newFD, 0) } // PointDup3 calls pointDupHelper to convert dup3(2) syscall to proto. func PointDup3(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { oldFD := info.Args[0].Int() newFD := info.Args[1].Int() flags := info.Args[2].Uint() return pointDupHelper(t, fields, cxtData, info, oldFD, newFD, flags) } // signalfdHelper converts signalfd(2) and signalfd4(2) syscall to proto. func signalfdHelper(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo, flags int32) (proto.Message, pb.MessageType) { p := &pb.Signalfd{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: info.Args[0].Int(), Flags: flags, } sigset := info.Args[1].Pointer() sigsetsize := info.Args[2].SizeT() mask, err := copyInSigSet(t, sigset, sigsetsize) if err == nil { // if NO error p.Sigset = uint64(mask) } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_SIGNALFD } // PointSignalfd calls signalfdHelper to convert signalfd(2) syscall to proto. func PointSignalfd(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { return signalfdHelper(t, fields, cxtData, info, 0) } // PointSignalfd4 calls signalfdHelper to convert signalfd4(2) syscall to proto. func PointSignalfd4(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { flags := info.Args[3].Int() return signalfdHelper(t, fields, cxtData, info, flags) } // PointChroot converts chroot(2) syscall to proto. func PointChroot(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Chroot{ ContextData: cxtData, Sysno: uint64(info.Sysno), } if pathname, err := t.CopyInString(info.Args[0].Pointer(), linux.PATH_MAX); err == nil { // if NO error p.Pathname = pathname } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_CHROOT } // PointClone converts clone(2) syscall to proto. func PointClone(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Clone{ ContextData: cxtData, Sysno: uint64(info.Sysno), Flags: info.Args[0].Uint64(), Stack: uint64(info.Args[1].Pointer()), Tls: uint64(info.Args[4].Pointer()), } var parTid kernel.ThreadID parentTidAddr := info.Args[2].Pointer() if _, err := parTid.CopyIn(t, parentTidAddr); err == nil { // if NO error p.NewTid = uint64(parTid) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_CLONE } // PointBind converts bind(2) syscall to proto. 
func PointBind(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Bind{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: info.Args[0].Int(), } addr := info.Args[1].Pointer() addrLen := info.Args[2].Uint() if address, err := CaptureAddress(t, addr, addrLen); err == nil { // if NO error p.Address = address } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_BIND } func acceptHelper(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo, flags int32) (proto.Message, pb.MessageType) { p := &pb.Accept{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: info.Args[0].Int(), Flags: flags, } addr := info.Args[1].Pointer() if addrLenPointer := info.Args[2].Pointer(); addrLenPointer != 0 { var addrLen uint32 if _, err := primitive.CopyUint32In(t, addrLenPointer, &addrLen); err == nil { // if NO error if address, err := CaptureAddress(t, addr, addrLen); err == nil { // if NO error p.Address = address } } } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_ACCEPT } // PointAccept converts accept(2) syscall to proto. func PointAccept(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { return acceptHelper(t, fields, cxtData, info, 0) } // PointAccept4 converts accept4(2) syscall to proto. func PointAccept4(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { flags := info.Args[3].Int() return acceptHelper(t, fields, cxtData, info, flags) } // PointTimerfdCreate converts timerfd_create(2) syscall to proto. func PointTimerfdCreate(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.TimerfdCreate{ ContextData: cxtData, Sysno: uint64(info.Sysno), ClockId: info.Args[0].Int(), Flags: info.Args[1].Int(), } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_TIMERFD_CREATE } func getValues(values linux.Timespec) *pb.Timespec { return &pb.Timespec{ Sec: values.Sec, Nsec: values.Nsec, } } // PointTimerfdSettime converts timerfd_settime(2) syscall to proto. func PointTimerfdSettime(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.TimerfdSetTime{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: info.Args[0].Int(), Flags: info.Args[1].Int(), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } var newVal linux.Itimerspec if newValAddr := info.Args[2].Pointer(); newValAddr != 0 { if _, err := newVal.CopyIn(t, newValAddr); err == nil { p.NewValue = &pb.ItimerSpec{ Interval: getValues(newVal.Interval), Value: getValues(newVal.Value), } } } if info.Exit { var oldVal linux.Itimerspec if oldValAddr := info.Args[3].Pointer(); oldValAddr != 0 { if _, err := oldVal.CopyIn(t, oldValAddr); err == nil { p.OldValue = &pb.ItimerSpec{ Interval: getValues(oldVal.Interval), Value: getValues(oldVal.Value), } } } } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_TIMERFD_SETTIME } // PointTimerfdGettime converts timerfd_gettime(2) syscall to proto. 
func PointTimerfdGettime(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.TimerfdGetTime{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: info.Args[0].Int(), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } if curValAddr := info.Args[1].Pointer(); curValAddr != 0 { var curVal linux.Itimerspec if _, err := curVal.CopyIn(t, curValAddr); err == nil { p.CurValue = &pb.ItimerSpec{ Interval: getValues(curVal.Interval), Value: getValues(curVal.Value), } } } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_TIMERFD_GETTIME } // pointForkHelper converts fork(2) and vfork(2) syscall to proto. func pointForkHelper(cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.Fork{ ContextData: cxtData, Sysno: uint64(info.Sysno), } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_FORK } // PointFork converts fork(2) syscall to proto. func PointFork(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { return pointForkHelper(cxtData, info) } // PointVfork converts vfork(2) syscall to proto. func PointVfork(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { return pointForkHelper(cxtData, info) } // pointInotifyInitHelper converts inotify_init(2) and inotify_init1(2) syscall to proto. func pointInotifyInitHelper(cxtData *pb.ContextData, info kernel.SyscallInfo, flags int32) (proto.Message, pb.MessageType) { p := &pb.InotifyInit{ ContextData: cxtData, Sysno: uint64(info.Sysno), Flags: flags, } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_INOTIFY_INIT } // PointInotifyInit converts inotify_init(2) syscall to proto. func PointInotifyInit(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { return pointInotifyInitHelper(cxtData, info, 0) } // PointInotifyInit1 converts inotify_init1(2) syscall to proto. func PointInotifyInit1(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { flags := info.Args[0].Int() return pointInotifyInitHelper(cxtData, info, flags) } // PointInotifyAddWatch converts inotify_add_watch(2) syscall to proto. func PointInotifyAddWatch(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.InotifyAddWatch{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: info.Args[0].Int(), Mask: info.Args[2].Uint(), } if pathAddr := info.Args[1].Pointer(); pathAddr > 0 { p.Pathname, _ = t.CopyInString(pathAddr, linux.PATH_MAX) } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_INOTIFY_ADD_WATCH } // PointInotifyRmWatch converts inotify_rm_watch(2) syscall to proto. 
func PointInotifyRmWatch(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.InotifyRmWatch{ ContextData: cxtData, Sysno: uint64(info.Sysno), Fd: info.Args[0].Int(), Wd: info.Args[2].Int(), } if fields.Local.Contains(seccheck.FieldSyscallPath) { p.FdPath = getFilePath(t, int32(p.Fd)) } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_INOTIFY_RM_WATCH } // PointSocketpair converts socketpair(2) syscall to proto. func PointSocketpair(t *kernel.Task, fields seccheck.FieldSet, cxtData *pb.ContextData, info kernel.SyscallInfo) (proto.Message, pb.MessageType) { p := &pb.SocketPair{ ContextData: cxtData, Sysno: uint64(info.Sysno), Domain: info.Args[0].Int(), Type: info.Args[1].Int(), Protocol: info.Args[2].Int(), } if info.Exit { sockets := info.Args[3].Pointer() var fds [2]int32 if _, err := primitive.CopyInt32SliceIn(t, sockets, fds[:]); err == nil { // if NO error p.Socket1 = fds[0] p.Socket2 = fds[1] } } p.Exit = newExitMaybe(info) return p, pb.MessageType_MESSAGE_SYSCALL_SOCKETPAIR } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sigset.go000066400000000000000000000041111465435605700255660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // copyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and // STOP are clear. func copyInSigSet(t *kernel.Task, sigSetAddr hostarch.Addr, size uint) (linux.SignalSet, error) { if size != linux.SignalSetSize { return 0, linuxerr.EINVAL } b := t.CopyScratchBuffer(8) if _, err := t.CopyInBytes(sigSetAddr, b); err != nil { return 0, err } mask := hostarch.ByteOrder.Uint64(b[:]) return linux.SignalSet(mask) &^ kernel.UnblockableSignals, nil } // copyOutSigSet copies out a sigset_t. func copyOutSigSet(t *kernel.Task, sigSetAddr hostarch.Addr, mask linux.SignalSet) error { b := t.CopyScratchBuffer(8) hostarch.ByteOrder.PutUint64(b, uint64(mask)) _, err := t.CopyOutBytes(sigSetAddr, b) return err } // copyInSigSetWithSize copies in a structure as below // // struct { // sigset_t* sigset_addr; // size_t sizeof_sigset; // }; // // and returns sigset_addr and size. func copyInSigSetWithSize(t *kernel.Task, addr hostarch.Addr) (hostarch.Addr, uint, error) { switch t.Arch().Width() { case 8: in := t.CopyScratchBuffer(16) if _, err := t.CopyInBytes(addr, in); err != nil { return 0, 0, err } maskAddr := hostarch.Addr(hostarch.ByteOrder.Uint64(in[0:])) maskSize := uint(hostarch.ByteOrder.Uint64(in[8:])) return maskAddr, maskSize, nil default: return 0, 0, linuxerr.ENOSYS } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_afs_syscall.go000066400000000000000000000030301465435605700274700ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) var afsSyscallPanic = atomicbitops.Bool{} // SetAFSSyscallPanic sets the panic behaviour of afs_syscall. // Should only be called based on the config value of TESTONLY-afs-syscall-panic. func SetAFSSyscallPanic(v bool) { if v { log.Warningf("AFSSyscallPanic is set. User workloads may trigger sentry panics.") } afsSyscallPanic.Store(v) } // AFSSyscall is a gVisor specific implementation of afs_syscall: // - if TESTONLY-afs-syscall-panic flag is set it triggers a panic. func AFSSyscall(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if afsSyscallPanic.Load() { panic("User workload triggered a panic via afs_syscall. This panic is intentional.") } return 0, nil, linuxerr.ENOSYS } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_aio.go000066400000000000000000000240601465435605700257430ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) // IoSetup implements linux syscall io_setup(2). func IoSetup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nrEvents := args[0].Int() idAddr := args[1].Pointer() // Linux uses the native long as the aio ID. // // The context pointer _must_ be zero initially. var idIn uint64 if _, err := primitive.CopyUint64In(t, idAddr, &idIn); err != nil { return 0, nil, err } if idIn != 0 { return 0, nil, linuxerr.EINVAL } id, err := t.MemoryManager().NewAIOContext(t, uint32(nrEvents)) if err != nil { return 0, nil, err } // Copy out the new ID. if _, err := primitive.CopyUint64Out(t, idAddr, id); err != nil { t.MemoryManager().DestroyAIOContext(t, id) return 0, nil, err } return 0, nil, nil } // IoDestroy implements linux syscall io_destroy(2). 
func IoDestroy(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := args[0].Uint64() ctx := t.MemoryManager().DestroyAIOContext(t, id) if ctx == nil { // Does not exist. return 0, nil, linuxerr.EINVAL } // Drain completed requests and wait for pending requests until there are no // more. for { ctx.Drain() ch := ctx.WaitChannel() if ch == nil { // No more requests, we're done. return 0, nil, nil } // The task cannot be interrupted during the wait. Equivalent to // TASK_UNINTERRUPTIBLE in Linux. t.UninterruptibleSleepStart(true /* deactivate */) <-ch t.UninterruptibleSleepFinish(true /* activate */) } } // IoGetevents implements linux syscall io_getevents(2). func IoGetevents(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := args[0].Uint64() minEvents := args[1].Int() events := args[2].Int() eventsAddr := args[3].Pointer() timespecAddr := args[4].Pointer() // Sanity check arguments. if minEvents < 0 || minEvents > events { return 0, nil, linuxerr.EINVAL } ctx, ok := t.MemoryManager().LookupAIOContext(t, id) if !ok { return 0, nil, linuxerr.EINVAL } // Setup the timeout. var haveDeadline bool var deadline ktime.Time if timespecAddr != 0 { d, err := copyTimespecIn(t, timespecAddr) if err != nil { return 0, nil, err } if !d.Valid() { return 0, nil, linuxerr.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(d.ToDuration()) haveDeadline = true } // Loop over all requests. for count := int32(0); count < events; count++ { // Get a request, per semantics. var v any if count >= minEvents { var ok bool v, ok = ctx.PopRequest() if !ok { return uintptr(count), nil, nil } } else { var err error v, err = waitForRequest(ctx, t, haveDeadline, deadline) if err != nil { if count > 0 || linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return uintptr(count), nil, nil } return 0, nil, linuxerr.ConvertIntr(err, linuxerr.EINTR) } } ev := v.(*linux.IOEvent) // Copy out the result. if _, err := ev.CopyOut(t, eventsAddr); err != nil { if count > 0 { return uintptr(count), nil, nil } // Nothing done. return 0, nil, err } // Keep rolling. eventsAddr += hostarch.Addr(linux.IOEventSize) } // Everything finished. return uintptr(events), nil, nil } func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (any, error) { for { if v, ok := ctx.PopRequest(); ok { // Request was readily available. Just return it. return v, nil } // Need to wait for request completion. done := ctx.WaitChannel() if done == nil { // Context has been destroyed. return nil, linuxerr.EINVAL } if err := t.BlockWithDeadline(done, haveDeadline, deadline); err != nil { return nil, err } } } // memoryFor returns appropriate memory for the given callback. func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { bytes := int(cb.Bytes) if bytes < 0 { // Linux also requires that this field fit in ssize_t. return usermem.IOSequence{}, linuxerr.EINVAL } // Since this I/O will be asynchronous with respect to t's task goroutine, // we have no guarantee that t's AddressSpace will be active during the // I/O. 
switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: return t.SingleIOSequence(hostarch.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: return t.IovecsIOSequence(hostarch.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: return usermem.IOSequence{}, nil default: // Not a supported command. return usermem.IOSequence{}, linuxerr.EINVAL } } // IoCancel implements linux syscall io_cancel(2). // // It is not presently supported (ENOSYS indicates no support on this // architecture). func IoCancel(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, linuxerr.ENOSYS } // IoSubmit implements linux syscall io_submit(2). func IoSubmit(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := args[0].Uint64() nrEvents := args[1].Int() addr := args[2].Pointer() if nrEvents < 0 { return 0, nil, linuxerr.EINVAL } for i := int32(0); i < nrEvents; i++ { // Copy in the callback address. var cbAddr hostarch.Addr switch t.Arch().Width() { case 8: var cbAddrP primitive.Uint64 if _, err := cbAddrP.CopyIn(t, addr); err != nil { if i > 0 { // Some successful. return uintptr(i), nil, nil } // Nothing done. return 0, nil, err } cbAddr = hostarch.Addr(cbAddrP) default: return 0, nil, linuxerr.ENOSYS } // Copy in this callback. var cb linux.IOCallback if _, err := cb.CopyIn(t, cbAddr); err != nil { if i > 0 { // Some have been successful. return uintptr(i), nil, nil } // Nothing done. return 0, nil, err } // Process this callback. if err := submitCallback(t, id, &cb, cbAddr); err != nil { if i > 0 { // Partial success. return uintptr(i), nil, nil } // Nothing done. return 0, nil, err } // Advance to the next one. addr += hostarch.Addr(t.Arch().Width()) } return uintptr(nrEvents), nil, nil } // submitCallback processes a single callback. func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr hostarch.Addr) error { if cb.Reserved2 != 0 { return linuxerr.EINVAL } fd := t.GetFile(cb.FD) if fd == nil { return linuxerr.EBADF } defer fd.DecRef(t) // Was there an eventFD? Extract it. var eventFD *vfs.FileDescription if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { eventFD = t.GetFile(cb.ResFD) if eventFD == nil { return linuxerr.EBADF } defer eventFD.DecRef(t) // Check that it is an eventfd. if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok { return linuxerr.EINVAL } } ioseq, err := memoryFor(t, cb) if err != nil { return err } // Check offset for reads/writes. switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: if cb.Offset < 0 { return linuxerr.EINVAL } } // Prepare the request. aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id) if !ok { return linuxerr.EINVAL } if err := aioCtx.Prepare(); err != nil { return err } if eventFD != nil { // The request is set. Make sure there's a ref on the file. // // This is necessary when the callback executes on completion, // which is also what will release this reference. eventFD.IncRef() } // Perform the request asynchronously. 
fd.IncRef() t.QueueAIO(getAIOCallback(t, fd, eventFD, cbAddr, cb, ioseq, aioCtx)) return nil } func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr hostarch.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback { return func(ctx context.Context) { // Release references after completing the callback. defer fd.DecRef(ctx) if eventFD != nil { defer eventFD.DecRef(ctx) } if aioCtx.Dead() { aioCtx.CancelPendingRequest() return } ev := &linux.IOEvent{ Data: cb.Data, Obj: uint64(cbAddr), } var err error switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{}) case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{}) case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC: err = fd.Sync(ctx) } // Update the result. if err != nil { err = HandleIOError(ctx, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd) ev.Result = -int64(kernel.ExtractErrno(err, 0)) } // Queue the result for delivery. aioCtx.FinishRequest(ev) // Notify the event file if one was specified. This needs to happen // *after* queueing the result to avoid racing with the thread we may // wake up. if eventFD != nil { eventFD.Impl().(*eventfd.EventFileDescription).Signal(1) } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_capability.go000066400000000000000000000110251465435605700273110ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) func lookupCaps(t *kernel.Task, tid kernel.ThreadID) (permitted, inheritable, effective auth.CapabilitySet, err error) { if tid < 0 { err = linuxerr.EINVAL return } if tid > 0 { t = t.PIDNamespace().TaskWithID(tid) } if t == nil { err = linuxerr.ESRCH return } creds := t.Credentials() permitted, inheritable, effective = creds.PermittedCaps, creds.InheritableCaps, creds.EffectiveCaps return } // Capget implements Linux syscall capget. func Capget(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { hdrAddr := args[0].Pointer() dataAddr := args[1].Pointer() var hdr linux.CapUserHeader if _, err := hdr.CopyIn(t, hdrAddr); err != nil { return 0, nil, err } // hdr.Pid doesn't need to be valid if this capget() is a "version probe" // (hdr.Version is unrecognized and dataAddr is null), so we can't do the // lookup yet. 
switch hdr.Version { case linux.LINUX_CAPABILITY_VERSION_1: if dataAddr == 0 { return 0, nil, nil } p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid)) if err != nil { return 0, nil, err } data := linux.CapUserData{ Effective: uint32(e), Permitted: uint32(p), Inheritable: uint32(i), } _, err = data.CopyOut(t, dataAddr) return 0, nil, err case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: if dataAddr == 0 { return 0, nil, nil } p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid)) if err != nil { return 0, nil, err } data := [2]linux.CapUserData{ { Effective: uint32(e), Permitted: uint32(p), Inheritable: uint32(i), }, { Effective: uint32(e >> 32), Permitted: uint32(p >> 32), Inheritable: uint32(i >> 32), }, } _, err = linux.CopyCapUserDataSliceOut(t, dataAddr, data[:]) return 0, nil, err default: hdr.Version = linux.HighestCapabilityVersion if _, err := hdr.CopyOut(t, hdrAddr); err != nil { return 0, nil, err } if dataAddr != 0 { return 0, nil, linuxerr.EINVAL } return 0, nil, nil } } // Capset implements Linux syscall capset. func Capset(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { hdrAddr := args[0].Pointer() dataAddr := args[1].Pointer() var hdr linux.CapUserHeader if _, err := hdr.CopyIn(t, hdrAddr); err != nil { return 0, nil, err } switch hdr.Version { case linux.LINUX_CAPABILITY_VERSION_1: if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() { return 0, nil, linuxerr.EPERM } var data linux.CapUserData if _, err := data.CopyIn(t, dataAddr); err != nil { return 0, nil, err } p := auth.CapabilitySet(data.Permitted) & auth.AllCapabilities i := auth.CapabilitySet(data.Inheritable) & auth.AllCapabilities e := auth.CapabilitySet(data.Effective) & auth.AllCapabilities return 0, nil, t.SetCapabilitySets(p, i, e) case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() { return 0, nil, linuxerr.EPERM } var data [2]linux.CapUserData if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil { return 0, nil, err } p := (auth.CapabilitySet(data[0].Permitted) | (auth.CapabilitySet(data[1].Permitted) << 32)) & auth.AllCapabilities i := (auth.CapabilitySet(data[0].Inheritable) | (auth.CapabilitySet(data[1].Inheritable) << 32)) & auth.AllCapabilities e := (auth.CapabilitySet(data[0].Effective) | (auth.CapabilitySet(data[1].Effective) << 32)) & auth.AllCapabilities return 0, nil, t.SetCapabilitySets(p, i, e) default: hdr.Version = linux.HighestCapabilityVersion if _, err := hdr.CopyOut(t, hdrAddr); err != nil { return 0, nil, err } return 0, nil, linuxerr.EINVAL } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_clone_amd64.go000066400000000000000000000023521465435605700272660ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
//go:build amd64 // +build amd64 package linux import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // Clone implements linux syscall clone(2). // sys_clone has so many flavors. We implement the default one in linux 3.11 // x86_64: // // sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val) func Clone(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := int(args[0].Int()) stack := args[1].Pointer() parentTID := args[2].Pointer() childTID := args[3].Pointer() tls := args[4].Pointer() return clone(t, flags, stack, parentTID, childTID, tls) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_clone_arm64.go000066400000000000000000000024631465435605700273070ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package linux import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // Clone implements linux syscall clone(2). // sys_clone has so many flavors, and we implement the default one in linux 3.11 // arm64(kernel/fork.c with CONFIG_CLONE_BACKWARDS defined in the config file): // // sys_clone(clone_flags, newsp, parent_tidptr, tls_val, child_tidptr) func Clone(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := int(args[0].Int()) stack := args[1].Pointer() parentTID := args[2].Pointer() tls := args[3].Pointer() childTID := args[4].Pointer() return clone(t, flags, stack, parentTID, childTID, tls) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_epoll.go000066400000000000000000000153671465435605700263200ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "math" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/waiter" ) var sizeofEpollEvent = (*linux.EpollEvent)(nil).SizeBytes() // EpollCreate1 implements Linux syscall epoll_create1(2). 
func EpollCreate1(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() if flags&^linux.EPOLL_CLOEXEC != 0 { return 0, nil, linuxerr.EINVAL } file, err := t.Kernel().VFS().NewEpollInstanceFD(t) if err != nil { return 0, nil, err } defer file.DecRef(t) fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0, }) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil } // EpollCreate implements Linux syscall epoll_create(2). func EpollCreate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { size := args[0].Int() // "Since Linux 2.6.8, the size argument is ignored, but must be greater // than zero" - epoll_create(2) if size <= 0 { return 0, nil, linuxerr.EINVAL } file, err := t.Kernel().VFS().NewEpollInstanceFD(t) if err != nil { return 0, nil, err } defer file.DecRef(t) fd, err := t.NewFDFrom(0, file, kernel.FDFlags{}) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil } // EpollCtl implements Linux syscall epoll_ctl(2). func EpollCtl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { epfd := args[0].Int() op := args[1].Int() fd := args[2].Int() eventAddr := args[3].Pointer() epfile := t.GetFile(epfd) if epfile == nil { return 0, nil, linuxerr.EBADF } defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { return 0, nil, linuxerr.EINVAL } file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if epfile == file { return 0, nil, linuxerr.EINVAL } var event linux.EpollEvent switch op { case linux.EPOLL_CTL_ADD: if _, err := event.CopyIn(t, eventAddr); err != nil { return 0, nil, err } return 0, nil, ep.AddInterest(file, fd, event) case linux.EPOLL_CTL_DEL: return 0, nil, ep.DeleteInterest(file, fd) case linux.EPOLL_CTL_MOD: if _, err := event.CopyIn(t, eventAddr); err != nil { return 0, nil, err } return 0, nil, ep.ModifyInterest(file, fd, event) default: return 0, nil, linuxerr.EINVAL } } func waitEpoll(t *kernel.Task, epfd int32, eventsAddr hostarch.Addr, maxEvents int, timeoutInNanos int64) (uintptr, *kernel.SyscallControl, error) { var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS { return 0, nil, linuxerr.EINVAL } epfile := t.GetFile(epfd) if epfile == nil { return 0, nil, linuxerr.EBADF } defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { return 0, nil, linuxerr.EINVAL } // Allocate space for a few events on the stack for the common case in // which we don't have too many events. var ( eventsArr [16]linux.EpollEvent ch chan struct{} haveDeadline bool deadline ktime.Time ) for { events := ep.ReadEvents(eventsArr[:0], maxEvents) if len(events) != 0 { copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events) copiedEvents := copiedBytes / sizeofEpollEvent // rounded down if copiedEvents != 0 { return uintptr(copiedEvents), nil, nil } return 0, nil, err } if timeoutInNanos == 0 { return 0, nil, nil } // In the first iteration of this loop, register with the epoll // instance for readability events, but then immediately continue the // loop since we need to retry ReadEvents() before blocking. In all // subsequent iterations, block until events are available, the timeout // expires, or an interrupt arrives. 
if ch == nil { var w waiter.Entry w, ch = waiter.NewChannelEntry(waiter.ReadableEvents) if err := epfile.EventRegister(&w); err != nil { return 0, nil, err } defer epfile.EventUnregister(&w) } else { // Set up the timer if a timeout was specified. if timeoutInNanos > 0 && !haveDeadline { timeoutDur := time.Duration(timeoutInNanos) * time.Nanosecond deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur) haveDeadline = true } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = nil } return 0, nil, err } } } } // EpollWait implements Linux syscall epoll_wait(2). func EpollWait(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { epfd := args[0].Int() eventsAddr := args[1].Pointer() maxEvents := int(args[2].Int()) timeoutInNanos := int64(args[3].Int()) * 1000000 return waitEpoll(t, epfd, eventsAddr, maxEvents, timeoutInNanos) } // EpollPwait implements Linux syscall epoll_pwait(2). func EpollPwait(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { maskAddr := args[4].Pointer() maskSize := uint(args[5].Uint()) if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { return 0, nil, err } return EpollWait(t, sysno, args) } // EpollPwait2 implements Linux syscall epoll_pwait(2). func EpollPwait2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { epfd := args[0].Int() eventsAddr := args[1].Pointer() maxEvents := int(args[2].Int()) timeoutPtr := args[3].Pointer() maskAddr := args[4].Pointer() maskSize := uint(args[5].Uint()) haveTimeout := timeoutPtr != 0 var timeoutInNanos int64 = -1 if haveTimeout { var timeout linux.Timespec if _, err := timeout.CopyIn(t, timeoutPtr); err != nil { return 0, nil, err } timeoutInNanos = timeout.ToNsec() } if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { return 0, nil, err } return waitEpoll(t, epfd, eventsAddr, maxEvents, timeoutInNanos) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_eventfd.go000066400000000000000000000035601465435605700266300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // Eventfd2 implements linux syscall eventfd2(2). 
func Eventfd2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { initVal := uint64(args[0].Uint()) flags := uint(args[1].Uint()) allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC) if flags & ^allOps != 0 { return 0, nil, linuxerr.EINVAL } vfsObj := t.Kernel().VFS() fileFlags := uint32(linux.O_RDWR) if flags&linux.EFD_NONBLOCK != 0 { fileFlags |= linux.O_NONBLOCK } semMode := flags&linux.EFD_SEMAPHORE != 0 eventfd, err := eventfd.New(t, vfsObj, initVal, semMode, fileFlags) if err != nil { return 0, nil, err } defer eventfd.DecRef(t) fd, err := t.NewFDFrom(0, eventfd, kernel.FDFlags{ CloseOnExec: flags&linux.EFD_CLOEXEC != 0, }) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil } // Eventfd implements linux syscall eventfd(2). func Eventfd(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { args[1].Value = 0 return Eventfd2(t, sysno, args) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_file.go000066400000000000000000001413471465435605700261220ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Mknod implements Linux syscall mknod(2). func Mknod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := args[1].ModeT() dev := args[2].Uint() return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev) } // Mknodat implements Linux syscall mknodat(2). func Mknodat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() addr := args[1].Pointer() mode := args[2].ModeT() dev := args[3].Uint() return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev) } func mknodat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode linux.FileMode, dev uint32) error { path, err := copyInPath(t, addr) if err != nil { return err } tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) // "Zero file type is equivalent to type S_IFREG." 
- mknod(2) if mode.FileType() == 0 { mode |= linux.ModeRegular } major, minor := linux.DecodeDeviceID(dev) return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{ Mode: mode &^ linux.FileMode(t.FSContext().Umask()), DevMajor: uint32(major), DevMinor: minor, }) } // Open implements Linux syscall open(2). func Open(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() flags := args[1].Uint() mode := args[2].ModeT() return openat(t, linux.AT_FDCWD, addr, flags, mode) } // Openat implements Linux syscall openat(2). func Openat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() addr := args[1].Pointer() flags := args[2].Uint() mode := args[3].ModeT() return openat(t, dirfd, addr, flags, mode) } // Creat implements Linux syscall creat(2). func Creat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := args[1].ModeT() return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode) } func openat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) { path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0)) if err != nil { return 0, nil, err } defer tpop.Release(t) file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{ Flags: flags | linux.O_LARGEFILE, Mode: linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()), }) if err != nil { return 0, nil, err } defer file.DecRef(t) fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, }) return uintptr(fd), nil, err } // Access implements Linux syscall access(2). func Access(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := args[1].ModeT() return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode, 0 /* flags */) } // Faccessat implements Linux syscall faccessat(2). func Faccessat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() addr := args[1].Pointer() mode := args[2].ModeT() return 0, nil, accessAt(t, dirfd, addr, mode, 0 /* flags */) } // Faccessat2 implements Linux syscall faccessat2(2). func Faccessat2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() addr := args[1].Pointer() mode := args[2].ModeT() flags := args[3].Int() return 0, nil, accessAt(t, dirfd, addr, mode, flags) } func accessAt(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint, flags int32) error { const rOK = 4 const wOK = 2 const xOK = 1 // Sanity check the mode. if mode&^(rOK|wOK|xOK) != 0 { return linuxerr.EINVAL } // faccessat2(2) isn't documented as supporting AT_EMPTY_PATH, but it does. 
if flags&^(linux.AT_EACCESS|linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH) != 0 { return linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) if err != nil { return err } tpop, err := getTaskPathOperation(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0)) if err != nil { return err } defer tpop.Release(t) creds := t.Credentials() if flags&linux.AT_EACCESS == 0 { // access(2) and faccessat(2) check permissions using real // UID/GID, not effective UID/GID. // // "access() needs to use the real uid/gid, not the effective // uid/gid. We do this by temporarily clearing all FS-related // capabilities and switching the fsuid/fsgid around to the // real ones." -fs/open.c:faccessat creds = creds.Fork() creds.EffectiveKUID = creds.RealKUID creds.EffectiveKGID = creds.RealKGID if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID { creds.EffectiveCaps = creds.PermittedCaps } else { creds.EffectiveCaps = 0 } } return t.Kernel().VFS().AccessAt(t, creds, vfs.AccessTypes(mode), &tpop.pop) } // Ioctl implements Linux syscall ioctl(2). func Ioctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { return 0, nil, linuxerr.EBADF } // Handle ioctls that apply to all FDs. switch args[1].Int() { case linux.FIONCLEX: t.FDTable().SetFlags(t, fd, kernel.FDFlags{ CloseOnExec: false, }) return 0, nil, nil case linux.FIOCLEX: t.FDTable().SetFlags(t, fd, kernel.FDFlags{ CloseOnExec: true, }) return 0, nil, nil case linux.FIONBIO: var set int32 if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } flags := file.StatusFlags() if set != 0 { flags |= linux.O_NONBLOCK } else { flags &^= linux.O_NONBLOCK } return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags) case linux.FIOASYNC: var set int32 if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } flags := file.StatusFlags() if set != 0 { flags |= linux.O_ASYNC } else { flags &^= linux.O_ASYNC } file.SetStatusFlags(t, t.Credentials(), flags) return 0, nil, nil case linux.FIOGETOWN, linux.SIOCGPGRP: var who int32 owner, hasOwner := getAsyncOwner(t, file) if hasOwner { if owner.Type == linux.F_OWNER_PGRP { who = -owner.PID } else { who = owner.PID } } _, err := primitive.CopyInt32Out(t, args[2].Pointer(), who) return 0, nil, err case linux.FIOSETOWN, linux.SIOCSPGRP: var who int32 if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &who); err != nil { return 0, nil, err } ownerType := int32(linux.F_OWNER_PID) if who < 0 { // Check for overflow before flipping the sign. if who-1 > who { return 0, nil, linuxerr.EINVAL } ownerType = linux.F_OWNER_PGRP who = -who } return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who) } ret, err := file.Ioctl(t, t.MemoryManager(), sysno, args) return ret, nil, err } // Getcwd implements Linux syscall getcwd(2). func Getcwd(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() size := args[1].SizeT() root := t.FSContext().RootDirectory() wd := t.FSContext().WorkingDirectory() s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd) root.DecRef(t) wd.DecRef(t) if err != nil { return 0, nil, err } // Note this is >= because we need a terminator. 
if uint(len(s)) >= size { return 0, nil, linuxerr.ERANGE } // Construct a byte slice containing a NUL terminator. buf := t.CopyScratchBuffer(len(s) + 1) copy(buf, s) buf[len(buf)-1] = 0 // Write the pathname slice. n, err := t.CopyOutBytes(addr, buf) if err != nil { return 0, nil, err } return uintptr(n), nil, nil } // Chdir implements Linux syscall chdir(2). func Chdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() path, err := copyInPath(t, addr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { return 0, nil, err } t.FSContext().SetWorkingDirectory(t, vd) vd.DecRef(t) return 0, nil, nil } // Fchdir implements Linux syscall fchdir(2). func Fchdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { return 0, nil, err } t.FSContext().SetWorkingDirectory(t, vd) vd.DecRef(t) return 0, nil, nil } // Chroot implements Linux syscall chroot(2). func Chroot(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() if !t.HasCapability(linux.CAP_SYS_CHROOT) { return 0, nil, linuxerr.EPERM } path, err := copyInPath(t, addr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { return 0, nil, err } t.FSContext().SetRootDirectory(t, vd) vd.DecRef(t) return 0, nil, nil } // PivotRoot implements Linux syscall pivot_root(2). func PivotRoot(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr1 := args[0].Pointer() addr2 := args[1].Pointer() if !t.HasCapability(linux.CAP_SYS_ADMIN) { return 0, nil, linuxerr.EPERM } newRootPath, err := copyInPath(t, addr1) if err != nil { return 0, nil, err } newRootTpop, err := getTaskPathOperation(t, linux.AT_FDCWD, newRootPath, disallowEmptyPath, followFinalSymlink) if err != nil { return 0, nil, err } defer newRootTpop.Release(t) putOldPath, err := copyInPath(t, addr2) if err != nil { return 0, nil, err } putOldTpop, err := getTaskPathOperation(t, linux.AT_FDCWD, putOldPath, disallowEmptyPath, followFinalSymlink) if err != nil { return 0, nil, err } defer putOldTpop.Release(t) newRoot, oldRoot, err := t.Kernel().VFS().PivotRoot(t, t.Credentials(), &newRootTpop.pop, &putOldTpop.pop) if err != nil { return 0, nil, err } defer newRoot.DecRef(t) defer oldRoot.DecRef(t) t.Kernel().ReplaceFSContextRoots(t, oldRoot, newRoot) return 0, nil, nil } // Close implements Linux syscall close(2). 
func Close(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() // Note that Remove provides a reference on the file that we may use to // flush. It is still active until we drop the final reference below // (and other reference-holding operations complete). file := t.FDTable().Remove(t, fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) err := file.OnClose(t) return 0, nil, HandleIOError(t, false /* partial */, err, linuxerr.EINTR, "close", file) } // CloseRange implements linux syscall close_range(2). func CloseRange(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { first := args[0].Uint() last := args[1].Uint() flags := args[2].Uint() if (first > last) || (last > math.MaxInt32) { return 0, nil, linuxerr.EINVAL } if (flags & ^(linux.CLOSE_RANGE_CLOEXEC | linux.CLOSE_RANGE_UNSHARE)) != 0 { return 0, nil, linuxerr.EINVAL } cloexec := flags & linux.CLOSE_RANGE_CLOEXEC unshare := flags & linux.CLOSE_RANGE_UNSHARE if unshare != 0 { // If possible, we don't want to copy FDs to the new unshared table, because those FDs will // be promptly closed and no longer used. So in the case where we know the range extends all // the way to the end of the FdTable, we can simply copy the FdTable only up to the start of // the range that we are closing. if cloexec == 0 && int32(last) >= t.FDTable().GetLastFd() { t.UnshareFdTable(int32(first)) } else { t.UnshareFdTable(math.MaxInt32) } } if cloexec != 0 { flagToApply := kernel.FDFlags{ CloseOnExec: true, } t.FDTable().SetFlagsForRange(t.AsyncContext(), int32(first), int32(last), flagToApply) return 0, nil, nil } fdTable := t.FDTable() fd := int32(first) for { fd, file := fdTable.RemoveNextInRange(t, fd, int32(last)) if file == nil { break } fd++ // Per the close_range(2) documentation, errors upon closing file descriptors are ignored. _ = file.OnClose(t) file.DecRef(t) } return 0, nil, nil } // Dup implements Linux syscall dup(2). func Dup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) if err != nil { return 0, nil, linuxerr.EMFILE } return uintptr(newFD), nil, nil } // Dup2 implements Linux syscall dup2(2). func Dup2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldfd := args[0].Int() newfd := args[1].Int() if oldfd == newfd { // As long as oldfd is valid, dup2() does nothing and returns newfd. file := t.GetFile(oldfd) if file == nil { return 0, nil, linuxerr.EBADF } file.DecRef(t) return uintptr(newfd), nil, nil } return dup3(t, oldfd, newfd, 0) } // Dup3 implements Linux syscall dup3(2). 
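// Unlike dup2(2), dup3(2) fails with EINVAL when oldfd equals newfd, and it accepts O_CLOEXEC to atomically set the close-on-exec flag on the new descriptor.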
func Dup3(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldfd := args[0].Int() newfd := args[1].Int() flags := args[2].Uint() if oldfd == newfd { return 0, nil, linuxerr.EINVAL } return dup3(t, oldfd, newfd, flags) } func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) { if flags&^linux.O_CLOEXEC != 0 { return 0, nil, linuxerr.EINVAL } file := t.GetFile(oldfd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) df, err := t.NewFDAt(newfd, file, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, }) if linuxerr.Equals(linuxerr.EMFILE, err) { err = linuxerr.EBADF } if err != nil { return 0, nil, err } if df != nil { // "If the file descriptor newfd was previously open, it is closed // before being reused; the close is performed silently (i.e., any // errors during the close are not reported by dup2())." - dup(2) _ = df.OnClose(t) df.DecRef(t) } return uintptr(newfd), nil, nil } // Fcntl implements linux syscall fcntl(2). func Fcntl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() cmd := args[1].Int() file, flags := t.FDTable().Get(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC, linux.F_GETFD, linux.F_SETFD, linux.F_GETFL: // allowed default: return 0, nil, linuxerr.EBADF } } switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: minfd := args[2].Int() fd, err := t.NewFDFrom(minfd, file, kernel.FDFlags{ CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC, }) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil case linux.F_GETFD: return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) return 0, nil, err case linux.F_GETFL: return uintptr(file.StatusFlags()), nil, nil case linux.F_SETFL: return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) case linux.F_GETOWN: owner, hasOwner := getAsyncOwner(t, file) if !hasOwner { return 0, nil, nil } if owner.Type == linux.F_OWNER_PGRP { return uintptr(-owner.PID), nil, nil } return uintptr(owner.PID), nil, nil case linux.F_SETOWN: who := args[2].Int() ownerType := int32(linux.F_OWNER_PID) if who < 0 { // Check for overflow before flipping the sign. 
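// (who == math.MinInt32 has no positive counterpart in int32; who-1 exceeds who only when the subtraction wraps around, so that value is rejected with EINVAL before negation.)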
if who-1 > who { return 0, nil, linuxerr.EINVAL } ownerType = linux.F_OWNER_PGRP who = -who } return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who) case linux.F_GETOWN_EX: owner, hasOwner := getAsyncOwner(t, file) if !hasOwner { return 0, nil, nil } _, err := owner.CopyOut(t, args[2].Pointer()) return 0, nil, err case linux.F_SETOWN_EX: var owner linux.FOwnerEx _, err := owner.CopyIn(t, args[2].Pointer()) if err != nil { return 0, nil, err } return 0, nil, setAsyncOwner(t, int(fd), file, owner.Type, owner.PID) case linux.F_SETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { return 0, nil, linuxerr.EBADF } n, err := pipefile.SetPipeSize(int64(args[2].Int())) if err != nil { return 0, nil, err } return uintptr(n), nil, nil case linux.F_GETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { return 0, nil, linuxerr.EBADF } return uintptr(pipefile.PipeSize()), nil, nil case linux.F_GET_SEALS: val, err := tmpfs.GetSeals(file) return uintptr(val), nil, err case linux.F_ADD_SEALS: if !file.IsWritable() { return 0, nil, linuxerr.EPERM } err := tmpfs.AddSeals(file, args[2].Uint()) return 0, nil, err case linux.F_SETLK: return 0, nil, posixLock(t, args, file, false /* ofd */, false /* block */) case linux.F_SETLKW: return 0, nil, posixLock(t, args, file, false /* ofd */, true /* block */) case linux.F_GETLK: return 0, nil, posixTestLock(t, args, file, false /* ofd */) case linux.F_OFD_SETLK: return 0, nil, posixLock(t, args, file, true /* ofd */, false /* block */) case linux.F_OFD_SETLKW: return 0, nil, posixLock(t, args, file, true /* ofd */, true /* block */) case linux.F_OFD_GETLK: return 0, nil, posixTestLock(t, args, file, true /* ofd */) case linux.F_GETSIG: a := file.AsyncHandler() if a == nil { // Default behavior aka SIGIO. return 0, nil, nil } return uintptr(a.(*fasync.FileAsync).Signal()), nil, nil case linux.F_SETSIG: a, err := file.SetAsyncHandler(fasync.New(int(fd))) if err != nil { return 0, nil, err } async := a.(*fasync.FileAsync) return 0, nil, async.SetSignal(linux.Signal(args[2].Int())) default: // Everything else is not yet supported. return 0, nil, linuxerr.EINVAL } } func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) { a := fd.AsyncHandler() if a == nil { return linux.FOwnerEx{}, false } ot, otg, opg := a.(*fasync.FileAsync).Owner() switch { case ot != nil: return linux.FOwnerEx{ Type: linux.F_OWNER_TID, PID: int32(t.PIDNamespace().IDOfTask(ot)), }, true case otg != nil: return linux.FOwnerEx{ Type: linux.F_OWNER_PID, PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)), }, true case opg != nil: return linux.FOwnerEx{ Type: linux.F_OWNER_PGRP, PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)), }, true default: return linux.FOwnerEx{}, true } } func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, pid int32) error { switch ownerType { case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP: // Acceptable type. 
default: return linuxerr.EINVAL } a, err := file.SetAsyncHandler(fasync.New(fd)) if err != nil { return err } async := a.(*fasync.FileAsync) if pid == 0 { async.ClearOwner() return nil } switch ownerType { case linux.F_OWNER_TID: task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) if task == nil { return linuxerr.ESRCH } async.SetOwnerTask(t, task) return nil case linux.F_OWNER_PID: tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid)) if tg == nil { return linuxerr.ESRCH } async.SetOwnerThreadGroup(t, tg) return nil case linux.F_OWNER_PGRP: pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid)) if pg == nil { return linuxerr.ESRCH } async.SetOwnerProcessGroup(t, pg) return nil default: return linuxerr.EINVAL } } func posixTestLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, ofd bool) error { // Copy in the lock request. flockAddr := args[2].Pointer() var flock linux.Flock if _, err := flock.CopyIn(t, flockAddr); err != nil { return err } var typ lock.LockType switch flock.Type { case linux.F_RDLCK: typ = lock.ReadLock case linux.F_WRLCK: typ = lock.WriteLock default: return linuxerr.EINVAL } r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) if err != nil { return err } uid := lock.UniqueID(t.FDTable()) if ofd { uid = lock.UniqueID(file) } newFlock, err := file.TestPOSIX(t, uid, typ, r) if err != nil { return err } if !ofd { newFlock.PID = translatePID(t.PIDNamespace().Root(), t.PIDNamespace(), newFlock.PID) } if _, err = newFlock.CopyOut(t, flockAddr); err != nil { return err } return nil } // translatePID translates a pid from one namespace to another. Note that this // may race with task termination/creation, in which case the original task // corresponding to pid may no longer exist. This is used to implement the // F_GETLK fcntl, which has the same potential race in Linux as well (i.e., // there is no synchronization between retrieving the lock PID and translating // it). See fs/locks.c:posix_lock_to_flock. func translatePID(old, new *kernel.PIDNamespace, pid int32) int32 { return int32(new.IDOfTask(old.TaskWithID(kernel.ThreadID(pid)))) } func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, ofd bool, block bool) error { // Copy in the lock request. flockAddr := args[2].Pointer() var flock linux.Flock if _, err := flock.CopyIn(t, flockAddr); err != nil { return err } if ofd && flock.PID != 0 { return linuxerr.EINVAL } uid := lock.UniqueID(t.FDTable()) pid := int32(t.TGIDInRoot()) if ofd { uid = lock.UniqueID(file) pid = -1 } r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) if err != nil { return err } switch flock.Type { case linux.F_RDLCK: if !file.IsReadable() { return linuxerr.EBADF } return file.LockPOSIX(t, uid, pid, lock.ReadLock, r, block) case linux.F_WRLCK: if !file.IsWritable() { return linuxerr.EBADF } return file.LockPOSIX(t, uid, pid, lock.WriteLock, r, block) case linux.F_UNLCK: return file.UnlockPOSIX(t, uid, r) default: return linuxerr.EINVAL } } // Fadvise64 implements fadvise64(2). // This implementation currently ignores the provided advice. func Fadvise64(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() length := args[2].Int64() advice := args[3].Int() // Note: offset is allowed to be negative. 
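// The offset argument (args[1]) is intentionally never read: all advice is ignored below, so only length and advice need to be validated.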
if length < 0 { return 0, nil, linuxerr.EINVAL } file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { return 0, nil, linuxerr.EBADF } // If the FD refers to a pipe or FIFO, return error. if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { return 0, nil, linuxerr.ESPIPE } switch advice { case linux.POSIX_FADV_NORMAL: case linux.POSIX_FADV_RANDOM: case linux.POSIX_FADV_SEQUENTIAL: case linux.POSIX_FADV_WILLNEED: case linux.POSIX_FADV_DONTNEED: case linux.POSIX_FADV_NOREUSE: default: return 0, nil, linuxerr.EINVAL } // Sure, whatever. return 0, nil, nil } // Mkdir implements Linux syscall mkdir(2). func Mkdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := args[1].ModeT() return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode) } // Mkdirat implements Linux syscall mkdirat(2). func Mkdirat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() addr := args[1].Pointer() mode := args[2].ModeT() return 0, nil, mkdirat(t, dirfd, addr, mode) } func mkdirat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode uint) error { path, err := copyInPath(t, addr) if err != nil { return err } tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{ Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()), }) } // Rmdir implements Linux syscall rmdir(2). func Rmdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr) } func rmdirat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error { path, err := copyInPath(t, pathAddr) if err != nil { return err } tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop) } // Symlink implements Linux syscall symlink(2). func Symlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { targetAddr := args[0].Pointer() linkpathAddr := args[1].Pointer() return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr) } // Symlinkat implements Linux syscall symlinkat(2). func Symlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { targetAddr := args[0].Pointer() newdirfd := args[1].Int() linkpathAddr := args[2].Pointer() return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr) } func symlinkat(t *kernel.Task, targetAddr hostarch.Addr, newdirfd int32, linkpathAddr hostarch.Addr) error { target, err := t.CopyInString(targetAddr, linux.PATH_MAX) if err != nil { return err } if len(target) == 0 { return linuxerr.ENOENT } linkpath, err := copyInPath(t, linkpathAddr) if err != nil { return err } tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target) } // Link implements Linux syscall link(2). 
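// link(2) never follows a symlink in oldpath. linkat(2) follows it only when AT_SYMLINK_FOLLOW is set, and AT_EMPTY_PATH additionally requires CAP_DAC_READ_SEARCH (otherwise ENOENT is returned).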
func Link(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldpathAddr := args[0].Pointer() newpathAddr := args[1].Pointer() return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) } // Linkat implements Linux syscall linkat(2). func Linkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { olddirfd := args[0].Int() oldpathAddr := args[1].Pointer() newdirfd := args[2].Int() newpathAddr := args[3].Pointer() flags := args[4].Int() return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) } func linkat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags int32) error { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 { return linuxerr.EINVAL } if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) { return linuxerr.ENOENT } oldpath, err := copyInPath(t, oldpathAddr) if err != nil { return err } oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0)) if err != nil { return err } defer oldtpop.Release(t) newpath, err := copyInPath(t, newpathAddr) if err != nil { return err } newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer newtpop.Release(t) return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop) } // Readlinkat implements Linux syscall readlinkat(2). func Readlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() bufAddr := args[2].Pointer() size := args[3].SizeT() return readlinkat(t, dirfd, pathAddr, bufAddr, size) } // Readlink implements Linux syscall readlink(2). func Readlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() bufAddr := args[1].Pointer() size := args[2].SizeT() return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size) } func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr hostarch.Addr, size uint) (uintptr, *kernel.SyscallControl, error) { if int(size) <= 0 { return 0, nil, linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } // "Since Linux 2.6.39, pathname can be an empty string, in which case the // call operates on the symbolic link referred to by dirfd ..." - // readlinkat(2) tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop) if err != nil { return 0, nil, err } if len(target) > int(size) { target = target[:size] } n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target)) if n == 0 { return 0, nil, err } return uintptr(n), nil, nil } // Unlink implements Linux syscall unlink(2). 
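// unlink(2) never follows a trailing symlink; the link itself is removed. unlinkat(2) with AT_REMOVEDIR instead behaves like rmdir(2) (see Unlinkat below).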
func Unlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr) } func unlinkat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error { path, err := copyInPath(t, pathAddr) if err != nil { return err } tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop) } // Unlinkat implements Linux syscall unlinkat(2). func Unlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() flags := args[2].Int() if flags&^linux.AT_REMOVEDIR != 0 { return 0, nil, linuxerr.EINVAL } if flags&linux.AT_REMOVEDIR != 0 { return 0, nil, rmdirat(t, dirfd, pathAddr) } return 0, nil, unlinkat(t, dirfd, pathAddr) } func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { root := t.FSContext().RootDirectory() defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { return linuxerr.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectory() defer start.DecRef(t) } else { dirfile := t.GetFile(dirfd) if dirfile == nil { return linuxerr.EBADF } if !path.HasComponents() && dirfile.StatusFlags()&linux.O_PATH == 0 { // For empty path, use FileDescription.SetStat() instead of // VirtualFilesystem.SetStatAt(), since the former may be able to use // opened file state to expedite the SetStat. Skip this optimization // for FDs with O_PATH, since the FD impl always returns EBADF. err := dirfile.SetStat(t, *opts) dirfile.DecRef(t) return err } start = dirfile.VirtualDentry() start.IncRef() defer start.DecRef(t) dirfile.DecRef(t) } } return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{ Root: root, Start: start, Path: path, FollowFinalSymlink: bool(shouldFollowFinalSymlink), }, opts) } func handleSetSizeError(t *kernel.Task, err error) error { if err == linuxerr.ErrExceedsFileSizeLimit { // Convert error to EFBIG and send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) return linuxerr.EFBIG } return err } // Truncate implements Linux syscall truncate(2). func Truncate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].Int64() if length < 0 { return 0, nil, linuxerr.EINVAL } path, err := copyInPath(t, addr) if err != nil { return 0, nil, err } err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_SIZE, Size: uint64(length), }, NeedWritePerm: true, }) return 0, nil, handleSetSizeError(t, err) } // Ftruncate implements Linux syscall ftruncate(2). 
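// Unlike truncate(2), which checks write permission on the path (NeedWritePerm above), ftruncate(2) requires a file description opened for writing and fails with EINVAL otherwise.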
func Ftruncate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() length := args[1].Int64() if length < 0 { return 0, nil, linuxerr.EINVAL } file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if !file.IsWritable() { return 0, nil, linuxerr.EINVAL } err := file.SetStat(t, vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_SIZE, Size: uint64(length), }, }) return 0, nil, handleSetSizeError(t, err) } // Umask implements linux syscall umask(2). func Umask(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { mask := args[0].ModeT() mask = t.FSContext().SwapUmask(mask & 0777) return uintptr(mask), nil, nil } // Chown implements Linux syscall chown(2). func Chown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() owner := args[1].Int() group := args[2].Int() return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */) } // Lchown implements Linux syscall lchown(2). func Lchown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() owner := args[1].Int() group := args[2].Int() return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW) } // Fchownat implements Linux syscall fchownat(2). func Fchownat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() owner := args[2].Int() group := args[3].Int() flags := args[4].Int() return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags) } func fchownat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, owner, group, flags int32) error { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { return linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) if err != nil { return err } var opts vfs.SetStatOptions if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { return err } return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) } func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error { userns := t.UserNamespace() if owner != -1 { kuid := userns.MapToKUID(auth.UID(owner)) if !kuid.Ok() { return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_UID opts.Stat.UID = uint32(kuid) } if group != -1 { kgid := userns.MapToKGID(auth.GID(group)) if !kgid.Ok() { return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_GID opts.Stat.GID = uint32(kgid) } return nil } // Fchown implements Linux syscall fchown(2). func Fchown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() owner := args[1].Int() group := args[2].Int() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) var opts vfs.SetStatOptions if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { return 0, nil, err } return 0, nil, file.SetStat(t, opts) } const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX // Chmod implements Linux syscall chmod(2). 
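// Only the bits in chmodMask are honored: the permission bits plus setuid, setgid, and the sticky bit. Any other bits in the requested mode are ignored.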
func Chmod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() mode := args[1].ModeT() return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode) } // Fchmodat implements Linux syscall fchmodat(2). func Fchmodat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() mode := args[2].ModeT() return 0, nil, fchmodat(t, dirfd, pathAddr, mode) } func fchmodat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint) error { path, err := copyInPath(t, pathAddr) if err != nil { return err } return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_MODE, Mode: uint16(mode & chmodMask), }, }) } // Fchmod implements Linux syscall fchmod(2). func Fchmod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() mode := args[1].ModeT() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) return 0, nil, file.SetStat(t, vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_MODE, Mode: uint16(mode & chmodMask), }, }) } // Utime implements Linux syscall utime(2). func Utime(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() timesAddr := args[1].Pointer() opts := vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_ATIME | linux.STATX_MTIME, }, } if timesAddr == 0 { opts.Stat.Atime.Nsec = linux.UTIME_NOW opts.Stat.Mtime.Nsec = linux.UTIME_NOW } else { var times linux.Utime if _, err := times.CopyIn(t, timesAddr); err != nil { return 0, nil, err } opts.Stat.Atime.Sec = times.Actime opts.Stat.Mtime.Sec = times.Modtime } return 0, nil, utimes(t, linux.AT_FDCWD, pathAddr, followFinalSymlink, &opts) } // Utimes implements Linux syscall utimes(2). func Utimes(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() timesAddr := args[1].Pointer() var opts vfs.SetStatOptions if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { return 0, nil, err } return 0, nil, utimes(t, linux.AT_FDCWD, pathAddr, followFinalSymlink, &opts) } // Futimesat implements Linux syscall futimesat(2). 
func Futimesat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() timesAddr := args[2].Pointer() var opts vfs.SetStatOptions if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { return 0, nil, err } return 0, nil, utimes(t, dirfd, pathAddr, followFinalSymlink, &opts) } func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error { if timesAddr == 0 { opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME opts.Stat.Atime.Nsec = linux.UTIME_NOW opts.Stat.Mtime.Nsec = linux.UTIME_NOW return nil } var times [2]linux.Timeval if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil { return err } if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 { return linuxerr.EINVAL } opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME opts.Stat.Atime = linux.StatxTimestamp{ Sec: times[0].Sec, Nsec: uint32(times[0].Usec * 1000), } opts.Stat.Mtime = linux.StatxTimestamp{ Sec: times[1].Sec, Nsec: uint32(times[1].Usec * 1000), } return nil } // Utimensat implements Linux syscall utimensat(2). func Utimensat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() timesAddr := args[2].Pointer() flags := args[3].Int() // Linux requires that the UTIME_OMIT check occur before flags. var opts vfs.SetStatOptions if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { return 0, nil, err } if opts.Stat.Mask == 0 { return 0, nil, nil } if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { return 0, nil, linuxerr.EINVAL } return 0, nil, utimes(t, dirfd, pathAddr, shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) } func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error { if timesAddr == 0 { opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME opts.Stat.Atime.Nsec = linux.UTIME_NOW opts.Stat.Mtime.Nsec = linux.UTIME_NOW return nil } var times [2]linux.Timespec if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil { return err } if times[0].Nsec != linux.UTIME_OMIT { if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) { return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_ATIME opts.Stat.Atime = linux.StatxTimestamp{ Sec: times[0].Sec, Nsec: uint32(times[0].Nsec), } } if times[1].Nsec != linux.UTIME_OMIT { if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) { return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_MTIME opts.Stat.Mtime = linux.StatxTimestamp{ Sec: times[1].Sec, Nsec: uint32(times[1].Nsec), } } return nil } // Analogous to fs/utimes.c:do_utimes(). func utimes(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { // "If filename is NULL and dfd refers to an open file, then operate on the // file. Otherwise look up filename, possibly using dfd as a starting // point." 
- fs/utimes.c:do_utimes() if dirfd != linux.AT_FDCWD && pathAddr == 0 { file := t.GetFile(dirfd) if file == nil { return linuxerr.EBADF } defer file.DecRef(t) return file.SetStat(t, *opts) } path, err := copyInPath(t, pathAddr) if err != nil { return err } return setstatat(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink, opts) } // Rename implements Linux syscall rename(2). func Rename(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldpathAddr := args[0].Pointer() newpathAddr := args[1].Pointer() return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) } // Renameat implements Linux syscall renameat(2). func Renameat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { olddirfd := args[0].Int() oldpathAddr := args[1].Pointer() newdirfd := args[2].Int() newpathAddr := args[3].Pointer() return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */) } // Renameat2 implements Linux syscall renameat2(2). func Renameat2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { olddirfd := args[0].Int() oldpathAddr := args[1].Pointer() newdirfd := args[2].Int() newpathAddr := args[3].Pointer() flags := args[4].Uint() return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) } func renameat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags uint32) error { oldpath, err := copyInPath(t, oldpathAddr) if err != nil { return err } // "If oldpath refers to a symbolic link, the link is renamed" - rename(2) oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer oldtpop.Release(t) newpath, err := copyInPath(t, newpathAddr) if err != nil { return err } newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer newtpop.Release(t) return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{ Flags: flags, }) } // Fallocate implements linux system call fallocate(2). func Fallocate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() mode := args[1].Uint64() offset := args[2].Int64() length := args[3].Int64() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if !file.IsWritable() { return 0, nil, linuxerr.EBADF } if mode != 0 { return 0, nil, linuxerr.ENOTSUP } if offset < 0 || length <= 0 { return 0, nil, linuxerr.EINVAL } size := offset + length if size < 0 { return 0, nil, linuxerr.EFBIG } limit := limits.FromContext(t).Get(limits.FileSize).Cur if uint64(size) >= limit { t.SendSignal(&linux.SignalInfo{ Signo: int32(linux.SIGXFSZ), Code: linux.SI_USER, }) return 0, nil, linuxerr.EFBIG } return 0, nil, file.Allocate(t, mode, uint64(offset), uint64(length)) } // Flock implements linux syscall flock(2). func Flock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() operation := args[1].Int() file := t.GetFile(fd) if file == nil { // flock(2): EBADF fd is not an open file descriptor. 
return 0, nil, linuxerr.EBADF } defer file.DecRef(t) nonblocking := operation&linux.LOCK_NB != 0 operation &^= linux.LOCK_NB switch operation { case linux.LOCK_EX: if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.WriteLock, !nonblocking /* block */); err != nil { return 0, nil, err } case linux.LOCK_SH: if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.ReadLock, !nonblocking /* block */); err != nil { return 0, nil, err } case linux.LOCK_UN: if err := file.UnlockBSD(t); err != nil { return 0, nil, err } default: // flock(2): EINVAL operation is invalid. return 0, nil, linuxerr.EINVAL } return 0, nil, nil } const ( memfdPrefix = "memfd:" memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) ) // MemfdCreate implements the linux syscall memfd_create(2). func MemfdCreate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() flags := args[1].Uint() if flags&^memfdAllFlags != 0 { // Unknown bits in flags. return 0, nil, linuxerr.EINVAL } allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 cloExec := flags&linux.MFD_CLOEXEC != 0 name, err := t.CopyInString(addr, memfdMaxNameLen) if err != nil { return 0, nil, err } shmMount := t.Kernel().ShmMount() file, err := tmpfs.NewMemfd(t, t.Credentials(), shmMount, allowSeals, memfdPrefix+name) if err != nil { return 0, nil, err } defer file.DecRef(t) fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: cloExec, }) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_futex.go000066400000000000000000000225551465435605700263350ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) // futexWaitRestartBlock encapsulates the state required to restart futex(2) // via restart_syscall(2). // // +stateify savable type futexWaitRestartBlock struct { duration time.Duration // addr stored as uint64 since uintptr is not save-able. addr uint64 private bool val uint32 mask uint32 } // Restart implements kernel.SyscallRestartBlock.Restart. func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) { return futexWaitDuration(t, f.duration, false, hostarch.Addr(f.addr), f.private, f.val, f.mask) } // futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is // complete. // // The wait blocks forever if forever is true, otherwise it blocks until ts. // // If blocking is interrupted, the syscall is restarted with the original // arguments. 
func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) { w := t.FutexWaiter() err := t.Futex().WaitPrepare(w, t, addr, private, val, mask) if err != nil { return 0, err } if forever { err = t.Block(w.C) } else if clockRealtime { err = t.BlockWithDeadlineFrom(w.C, t.Kernel().RealtimeClock(), true, ktime.FromTimespec(ts)) } else { err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts)) } t.Futex().WaitComplete(w, t) return 0, linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS) } // futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is // complete. // // The wait blocks forever if forever is true, otherwise it blocks for // duration. // // If blocking is interrupted, forever determines how to restart the // syscall. If forever is true, the syscall is restarted with the original // arguments. If forever is false, duration is a relative timeout and the // syscall is restarted with the remaining timeout. func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) { w := t.FutexWaiter() err := t.Futex().WaitPrepare(w, t, addr, private, val, mask) if err != nil { return 0, err } remaining, err := t.BlockWithTimeout(w.C, !forever, duration) t.Futex().WaitComplete(w, t) if err == nil { return 0, nil } // The wait was unsuccessful for some reason other than interruption. Simply // forward the error. if err != linuxerr.ErrInterrupted { return 0, err } // The wait was interrupted and we need to restart. Decide how. // The wait had no timeout, so restart with the original arguments. if forever { return 0, linuxerr.ERESTARTSYS } // The wait duration was relative, restart with the remaining duration. t.SetSyscallRestartBlock(&futexWaitRestartBlock{ duration: remaining, addr: uint64(addr), private: private, val: val, mask: mask, }) return 0, linuxerr.ERESTART_RESTARTBLOCK } func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool) error { w := t.FutexWaiter() locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false) if err != nil { return err } if locked { // Futex acquired, we're done! return nil } if forever { err = t.Block(w.C) } else { err = t.BlockWithDeadlineFrom(w.C, t.Kernel().RealtimeClock(), true, ktime.FromTimespec(ts)) } t.Futex().WaitComplete(w, t) return linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS) } func tryLockPI(t *kernel.Task, addr hostarch.Addr, private bool) error { w := t.FutexWaiter() locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true) if err != nil { return err } if !locked { return linuxerr.EWOULDBLOCK } return nil } // Futex implements linux syscall futex(2). // It provides a method for a program to wait for a value at a given address to // change, and a method to wake up anyone waiting on a particular address. 
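// FUTEX_WAIT interprets its timeout as a relative duration, while FUTEX_WAIT_BITSET interprets it as an absolute time on CLOCK_MONOTONIC (or CLOCK_REALTIME when FUTEX_CLOCK_REALTIME is set); a NULL timeout pointer means block indefinitely in both cases.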
func Futex(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() futexOp := args[1].Int() val := int(args[2].Int()) nreq := int(args[3].Int()) timeout := args[3].Pointer() naddr := args[4].Pointer() val3 := args[5].Int() cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME) private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0 clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME mask := uint32(val3) switch cmd { case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET: // WAIT{_BITSET} wait forever if the timeout isn't passed. forever := (timeout == 0) var timespec linux.Timespec if !forever { var err error timespec, err = copyTimespecIn(t, timeout) if err != nil { return 0, nil, err } } switch cmd { case linux.FUTEX_WAIT: // WAIT uses a relative timeout. mask = linux.FUTEX_BITSET_MATCH_ANY var timeoutDur time.Duration if !forever { timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond } n, err := futexWaitDuration(t, timeoutDur, forever, addr, private, uint32(val), mask) return n, nil, err case linux.FUTEX_WAIT_BITSET: // WAIT_BITSET uses an absolute timeout which is either // CLOCK_MONOTONIC or CLOCK_REALTIME. if mask == 0 { return 0, nil, linuxerr.EINVAL } n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask) return n, nil, err default: panic("unreachable") } case linux.FUTEX_WAKE: mask = ^uint32(0) fallthrough case linux.FUTEX_WAKE_BITSET: if mask == 0 { return 0, nil, linuxerr.EINVAL } if val <= 0 { // The Linux kernel wakes one waiter even if val is // non-positive. val = 1 } n, err := t.Futex().Wake(t, addr, private, mask, val) return uintptr(n), nil, err case linux.FUTEX_REQUEUE: n, err := t.Futex().Requeue(t, addr, naddr, private, val, nreq) return uintptr(n), nil, err case linux.FUTEX_CMP_REQUEUE: // 'val3' contains the value to be checked at 'addr' and // 'val' is the number of waiters that should be woken up. nval := uint32(val3) n, err := t.Futex().RequeueCmp(t, addr, naddr, private, nval, val, nreq) return uintptr(n), nil, err case linux.FUTEX_WAKE_OP: op := uint32(val3) if val <= 0 { // The Linux kernel wakes one waiter even if val is // non-positive. val = 1 } n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op) return uintptr(n), nil, err case linux.FUTEX_LOCK_PI: forever := (timeout == 0) var timespec linux.Timespec if !forever { var err error timespec, err = copyTimespecIn(t, timeout) if err != nil { return 0, nil, err } } err := futexLockPI(t, timespec, forever, addr, private) return 0, nil, err case linux.FUTEX_TRYLOCK_PI: err := tryLockPI(t, addr, private) return 0, nil, err case linux.FUTEX_UNLOCK_PI: err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private) return 0, nil, err case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: t.Kernel().EmitUnimplementedEvent(t, sysno) return 0, nil, linuxerr.ENOSYS default: // We don't even know about this command. return 0, nil, linuxerr.ENOSYS } } // SetRobustList implements linux syscall set_robust_list(2). func SetRobustList(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // Despite the syscall using the name 'pid' for this variable, it is // very much a tid. 
head := args[0].Pointer() length := args[1].SizeT() if length != uint(linux.SizeOfRobustListHead) { return 0, nil, linuxerr.EINVAL } t.SetRobustList(head) return 0, nil, nil } // GetRobustList implements linux syscall get_robust_list(2). func GetRobustList(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // Despite the syscall using the name 'pid' for this variable, it is // very much a tid. tid := args[0].Int() headAddr := args[1].Pointer() sizeAddr := args[2].Pointer() if tid < 0 { return 0, nil, linuxerr.EINVAL } ot := t if tid != 0 { if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil { return 0, nil, linuxerr.ESRCH } } // Copy out head pointer. head := t.Arch().Native(uintptr(ot.GetRobustList())) if _, err := head.CopyOut(t, headAddr); err != nil { return 0, nil, err } // Copy out size, which is a constant. Note that while size isn't // an address, it is defined as the arch-dependent size_t, so it // needs to be converted to a native-sized int. size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead)) if _, err := size.CopyOut(t, sizeAddr); err != nil { return 0, nil, err } return 0, nil, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_getdents.go000066400000000000000000000145731465435605700270200ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // Getdents implements Linux syscall getdents(2). func Getdents(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getdents(t, args, false /* isGetdents64 */) } // Getdents64 implements Linux syscall getdents64(2). func Getdents64(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getdents(t, args, true /* isGetdents64 */) } // DirentStructBytesWithoutName is enough to fit (struct linux_dirent) and // (struct linux_dirent64) without accounting for the name parameter. const DirentStructBytesWithoutName = 8 + 8 + 2 + 1 + 1 func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() size := args[2].Int() if size < DirentStructBytesWithoutName { return 0, nil, linuxerr.EINVAL } file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // We want to be sure of the allowed buffer size before calling IterDirents, // because this function depends on IterDirents saving state of which dirent // was the last one that was successfully operated on. 
allowedSize, err := t.MemoryManager().EnsurePMAsExist(t, addr, int64(size), usermem.IOOpts{ AddressSpaceActive: true, }) if allowedSize == 0 { return 0, nil, err } cb := getGetdentsCallback(t, int(allowedSize), int(size), isGetdents64) err = file.IterDirents(t, cb) n, _ := t.CopyOutBytes(addr, cb.buf[:cb.copied]) putGetdentsCallback(cb) // Only report an error in case we didn't copy anything. // If we did manage to give _something_ to the caller then the correct // behaviour is to return success. if n == 0 { return 0, nil, err } return uintptr(n), nil, nil } type getdentsCallback struct { t *kernel.Task buf []byte copied int userReportedSize int isGetdents64 bool } var getdentsCallbackPool = sync.Pool{ New: func() any { return &getdentsCallback{} }, } func getGetdentsCallback(t *kernel.Task, size int, userReportedSize int, isGetdents64 bool) *getdentsCallback { cb := getdentsCallbackPool.Get().(*getdentsCallback) buf := cb.buf if cap(buf) < size { buf = make([]byte, size) } else { buf = buf[:size] } *cb = getdentsCallback{ t: t, buf: buf, copied: 0, userReportedSize: userReportedSize, isGetdents64: isGetdents64, } return cb } func putGetdentsCallback(cb *getdentsCallback) { cb.t = nil cb.buf = cb.buf[:0] getdentsCallbackPool.Put(cb) } // Handle implements vfs.IterDirentsCallback.Handle. func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { remaining := len(cb.buf) - cb.copied if cb.isGetdents64 { // struct linux_dirent64 { // ino64_t d_ino; /* 64-bit inode number */ // off64_t d_off; /* 64-bit offset to next structure */ // unsigned short d_reclen; /* Size of this dirent */ // unsigned char d_type; /* File type */ // char d_name[]; /* Filename (null-terminated) */ // }; size := DirentStructBytesWithoutName + len(dirent.Name) size = (size + 7) &^ 7 // round up to multiple of 8 if size > remaining { // This is only needed to imitate Linux, since it writes out to the user // as it's iterating over dirs. We don't do that because we can't take // the mm.mappingMu while holding the filesystem mutex. if cb.copied == 0 && cb.userReportedSize >= size { return linuxerr.EFAULT } return linuxerr.EINVAL } buf := cb.buf[cb.copied : cb.copied+size] hostarch.ByteOrder.PutUint64(buf[0:8], dirent.Ino) hostarch.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff)) hostarch.ByteOrder.PutUint16(buf[16:18], uint16(size)) buf[18] = dirent.Type copy(buf[19:], dirent.Name) // Zero out all remaining bytes in buf, including the NUL terminator // after dirent.Name. 
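// For example, a 3-byte name gives size = 20 + 3 = 23, rounded up to 24, so two trailing bytes are zeroed: the NUL terminator and one padding byte.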
bufTail := buf[19+len(dirent.Name):] clear(bufTail) cb.copied += size } else { // struct linux_dirent { // unsigned long d_ino; /* Inode number */ // unsigned long d_off; /* Offset to next linux_dirent */ // unsigned short d_reclen; /* Length of this linux_dirent */ // char d_name[]; /* Filename (null-terminated) */ // /* length is actually (d_reclen - 2 - // offsetof(struct linux_dirent, d_name)) */ // /* // char pad; // Zero padding byte // char d_type; // File type (only since Linux // // 2.6.4); offset is (d_reclen - 1) // */ // }; if cb.t.Arch().Width() != 8 { panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width())) } size := DirentStructBytesWithoutName + len(dirent.Name) size = (size + 7) &^ 7 // round up to multiple of sizeof(long) if size > remaining { if cb.copied == 0 && cb.userReportedSize >= size { return linuxerr.EFAULT } return linuxerr.EINVAL } buf := cb.buf[cb.copied : cb.copied+size] hostarch.ByteOrder.PutUint64(buf[0:8], dirent.Ino) hostarch.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff)) hostarch.ByteOrder.PutUint16(buf[16:18], uint16(size)) copy(buf[18:], dirent.Name) // Zero out all remaining bytes in buf, including the NUL terminator // after dirent.Name and the zero padding byte between the name and // dirent type. bufTail := buf[18+len(dirent.Name) : size-1] clear(bufTail) buf[size-1] = dirent.Type cb.copied += size } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_identity.go000066400000000000000000000142561465435605700270320ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) const ( // As NGROUPS_MAX in include/uapi/linux/limits.h. maxNGroups = 65536 ) // Getuid implements the Linux syscall getuid. func Getuid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { c := t.Credentials() ruid := c.RealKUID.In(c.UserNamespace).OrOverflow() return uintptr(ruid), nil, nil } // Geteuid implements the Linux syscall geteuid. func Geteuid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { c := t.Credentials() euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow() return uintptr(euid), nil, nil } // Getresuid implements the Linux syscall getresuid. 
func Getresuid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { ruidAddr := args[0].Pointer() euidAddr := args[1].Pointer() suidAddr := args[2].Pointer() c := t.Credentials() ruid := c.RealKUID.In(c.UserNamespace).OrOverflow() euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow() suid := c.SavedKUID.In(c.UserNamespace).OrOverflow() if _, err := ruid.CopyOut(t, ruidAddr); err != nil { return 0, nil, err } if _, err := euid.CopyOut(t, euidAddr); err != nil { return 0, nil, err } if _, err := suid.CopyOut(t, suidAddr); err != nil { return 0, nil, err } return 0, nil, nil } // Getgid implements the Linux syscall getgid. func Getgid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { c := t.Credentials() rgid := c.RealKGID.In(c.UserNamespace).OrOverflow() return uintptr(rgid), nil, nil } // Getegid implements the Linux syscall getegid. func Getegid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { c := t.Credentials() egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow() return uintptr(egid), nil, nil } // Getresgid implements the Linux syscall getresgid. func Getresgid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { rgidAddr := args[0].Pointer() egidAddr := args[1].Pointer() sgidAddr := args[2].Pointer() c := t.Credentials() rgid := c.RealKGID.In(c.UserNamespace).OrOverflow() egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow() sgid := c.SavedKGID.In(c.UserNamespace).OrOverflow() if _, err := rgid.CopyOut(t, rgidAddr); err != nil { return 0, nil, err } if _, err := egid.CopyOut(t, egidAddr); err != nil { return 0, nil, err } if _, err := sgid.CopyOut(t, sgidAddr); err != nil { return 0, nil, err } return 0, nil, nil } // Setuid implements the Linux syscall setuid. func Setuid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { uid := auth.UID(args[0].Int()) return 0, nil, t.SetUID(uid) } // Setreuid implements the Linux syscall setreuid. func Setreuid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { ruid := auth.UID(args[0].Int()) euid := auth.UID(args[1].Int()) return 0, nil, t.SetREUID(ruid, euid) } // Setresuid implements the Linux syscall setreuid. func Setresuid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { ruid := auth.UID(args[0].Int()) euid := auth.UID(args[1].Int()) suid := auth.UID(args[2].Int()) return 0, nil, t.SetRESUID(ruid, euid, suid) } // Setgid implements the Linux syscall setgid. func Setgid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { gid := auth.GID(args[0].Int()) return 0, nil, t.SetGID(gid) } // Setregid implements the Linux syscall setregid. func Setregid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { rgid := auth.GID(args[0].Int()) egid := auth.GID(args[1].Int()) return 0, nil, t.SetREGID(rgid, egid) } // Setresgid implements the Linux syscall setregid. func Setresgid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { rgid := auth.GID(args[0].Int()) egid := auth.GID(args[1].Int()) sgid := auth.GID(args[2].Int()) return 0, nil, t.SetRESGID(rgid, egid, sgid) } // Getgroups implements the Linux syscall getgroups. 
func Getgroups(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { size := int(args[0].Int()) if size < 0 { return 0, nil, linuxerr.EINVAL } kgids := t.Credentials().ExtraKGIDs // "If size is zero, list is not modified, but the total number of // supplementary group IDs for the process is returned." - getgroups(2) if size == 0 { return uintptr(len(kgids)), nil, nil } if size < len(kgids) { return 0, nil, linuxerr.EINVAL } gids := make([]auth.GID, len(kgids)) for i, kgid := range kgids { gids[i] = kgid.In(t.UserNamespace()).OrOverflow() } if _, err := auth.CopyGIDSliceOut(t, args[1].Pointer(), gids); err != nil { return 0, nil, err } return uintptr(len(gids)), nil, nil } // Setgroups implements the Linux syscall setgroups. func Setgroups(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { size := args[0].Int() if size < 0 || size > maxNGroups { return 0, nil, linuxerr.EINVAL } if size == 0 { return 0, nil, t.SetExtraGIDs(nil) } gids := make([]auth.GID, size) if _, err := auth.CopyGIDSliceIn(t, args[1].Pointer(), gids); err != nil { return 0, nil, err } return 0, nil, t.SetExtraGIDs(gids) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_inotify.go000066400000000000000000000072071465435605700266600ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" ) const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC // InotifyInit1 implements the inotify_init1() syscalls. func InotifyInit1(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() if flags&^allFlags != 0 { return 0, nil, linuxerr.EINVAL } ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags)) if err != nil { return 0, nil, err } defer ino.DecRef(t) fd, err := t.NewFDFrom(0, ino, kernel.FDFlags{ CloseOnExec: flags&linux.IN_CLOEXEC != 0, }) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil } // InotifyInit implements the inotify_init() syscalls. func InotifyInit(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { args[0].Value = 0 return InotifyInit1(t, sysno, args) } // fdToInotify resolves an fd to an inotify object. If successful, the file will // have an extra ref and the caller is responsible for releasing the ref. func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) { f := t.GetFile(fd) if f == nil { // Invalid fd. return nil, nil, linuxerr.EBADF } ino, ok := f.Impl().(*vfs.Inotify) if !ok { // Not an inotify fd. f.DecRef(t) return nil, nil, linuxerr.EINVAL } return ino, f, nil } // InotifyAddWatch implements the inotify_add_watch() syscall. 
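// -- Editor's illustrative example (not part of the gVisor sources) --------
// A minimal userspace counterpart to the inotify syscalls implemented here,
// assuming golang.org/x/sys/unix; the watched path /tmp is arbitrary:
//
//	package main
//
//	import (
//		"fmt"
//		"log"
//
//		"golang.org/x/sys/unix"
//	)
//
//	func main() {
//		fd, err := unix.InotifyInit1(unix.IN_CLOEXEC) // inotify_init1(2)
//		if err != nil {
//			log.Fatal(err)
//		}
//		defer unix.Close(fd)
//
//		// inotify_add_watch(2); the returned watch descriptor is what
//		// inotify_rm_watch(2) later takes.
//		wd, err := unix.InotifyAddWatch(fd, "/tmp", unix.IN_CREATE)
//		if err != nil {
//			log.Fatal(err)
//		}
//		fmt.Println("watch descriptor:", wd)
//
//		// Events are read from fd as packed struct inotify_event records.
//		buf := make([]byte, 4096)
//		n, err := unix.Read(fd, buf)
//		fmt.Println("read", n, "bytes of events, err:", err)
//	}
// ---------------------------------------------------------------------------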
func InotifyAddWatch(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() mask := args[2].Uint() // "EINVAL: The given event mask contains no valid events." // -- inotify_add_watch(2) if mask&linux.ALL_INOTIFY_BITS == 0 { return 0, nil, linuxerr.EINVAL } // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link." // -- inotify(7) follow := followFinalSymlink if mask&linux.IN_DONT_FOLLOW != 0 { follow = nofollowFinalSymlink } ino, f, err := fdToInotify(t, fd) if err != nil { return 0, nil, err } defer f.DecRef(t) path, err := copyInPath(t, addr) if err != nil { return 0, nil, err } if mask&linux.IN_ONLYDIR != 0 { path.Dir = true } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow) if err != nil { return 0, nil, err } defer tpop.Release(t) d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{}) if err != nil { return 0, nil, err } defer d.DecRef(t) return uintptr(ino.AddWatch(d.Dentry(), mask)), nil, nil } // InotifyRmWatch implements the inotify_rm_watch() syscall. func InotifyRmWatch(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() wd := args[1].Int() ino, f, err := fdToInotify(t, fd) if err != nil { return 0, nil, err } defer f.DecRef(t) return 0, nil, ino.RmWatch(t, wd) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_iouring.go000066400000000000000000000071401465435605700266470ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/iouringfs" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // IOUringSetup implements linux syscall io_uring_setup(2). func IOUringSetup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if !kernel.IOUringEnabled { return 0, nil, linuxerr.ENOSYS } entries := uint32(args[0].Uint()) paramsAddr := args[1].Pointer() var params linux.IOUringParams if entries == 0 { return 0, nil, linuxerr.EINVAL } if _, err := params.CopyIn(t, paramsAddr); err != nil { return 0, nil, err } for i := int(0); i < len(params.Resv); i++ { if params.Resv[i] != 0 { return 0, nil, linuxerr.EINVAL } } // List of currently supported flags in our IO_URING implementation. const supportedFlags = 0 // Currently support none // Since we don't implement everything, we fail explicitly on flags that are unimplemented. 
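// -- Editor's illustrative example (not part of the gVisor sources) --------
// The check that follows relies on the identity
//
//	flags | supported == supported   <=>   flags &^ supported == 0
//
// i.e. OR-ing the requested flags into the supported set must not introduce
// any new bits. A tiny standalone demonstration:
//
//	package main
//
//	import "fmt"
//
//	func main() {
//		const supported = uint32(0b0011)
//		for _, flags := range []uint32{0b0001, 0b0100} {
//			ok := flags|supported == supported // same as flags&^supported == 0
//			fmt.Printf("flags=%04b supported=%v\n", flags, ok)
//		}
//	}
// ---------------------------------------------------------------------------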
if params.Flags|supportedFlags != supportedFlags { return 0, nil, linuxerr.EINVAL } vfsObj := t.Kernel().VFS() iouringfd, err := iouringfs.New(t, vfsObj, entries, ¶ms) if err != nil { // return 0, nil, err return 0, nil, linuxerr.EPERM } defer iouringfd.DecRef(t) fd, err := t.NewFDFrom(0, iouringfd, kernel.FDFlags{ // O_CLOEXEC is always set up. See io_uring/io_uring.c:io_uring_install_fd(). CloseOnExec: true, }) if err != nil { return 0, nil, err } if _, err := params.CopyOut(t, paramsAddr); err != nil { return 0, nil, err } return uintptr(fd), nil, nil } // IOUringEnter implements linux syscall io_uring_enter(2). func IOUringEnter(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if !kernel.IOUringEnabled { return 0, nil, linuxerr.ENOSYS } fd := int32(args[0].Int()) toSubmit := uint32(args[1].Uint()) minComplete := uint32(args[2].Uint()) flags := uint32(args[3].Uint()) sigSet := args[4].Pointer() ret := -1 // List of currently supported flags for io_uring_enter(2). const supportedFlags = linux.IORING_ENTER_GETEVENTS // Since we don't implement everything, we fail explicitly on flags that are unimplemented. if flags|supportedFlags != supportedFlags { return uintptr(ret), nil, linuxerr.EINVAL } // Currently don't support replacing an existing signal mask. if sigSet != hostarch.Addr(0) { return uintptr(ret), nil, linuxerr.EFAULT } // If a user requested to submit zero SQEs, then we don't process any and return right away. if toSubmit == 0 { return uintptr(ret), nil, nil } file := t.GetFile(fd) if file == nil { return uintptr(ret), nil, linuxerr.EBADF } defer file.DecRef(t) iouringfd, ok := file.Impl().(*iouringfs.FileDescription) if !ok { return uintptr(ret), nil, linuxerr.EBADF } ret, err := iouringfd.ProcessSubmissions(t, toSubmit, minComplete, flags) if err != nil { return uintptr(ret), nil, err } return uintptr(ret), nil, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_key.go000066400000000000000000000111631465435605700257630ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // Keyctl implements Linux syscall keyctl(2). 
func Keyctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { switch args[0].Int() { case linux.KEYCTL_GET_KEYRING_ID: return keyCtlGetKeyringID(t, args) case linux.KEYCTL_DESCRIBE: return keyctlDescribe(t, args) case linux.KEYCTL_JOIN_SESSION_KEYRING: return keyctlJoinSessionKeyring(t, args) case linux.KEYCTL_SETPERM: return keyctlSetPerm(t, args) } log.Debugf("Unimplemented keyctl operation: %d", args[0].Int()) kernel.IncrementUnimplementedSyscallCounter(sysno) return 0, nil, linuxerr.ENOSYS } // keyCtlGetKeyringID implements keyctl(2) with operation // KEYCTL_GET_KEYRING_ID. func keyCtlGetKeyringID(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { keyID := auth.KeySerial(args[1].Int()) var key *auth.Key var err error if keyID > 0 { // Not a special key ID, so return as-is. return uintptr(keyID), nil, nil } switch keyID { case linux.KEY_SPEC_SESSION_KEYRING: key, err = t.SessionKeyring() default: if keyID <= 0 { // Other special key IDs are not implemented. return 0, nil, linuxerr.ENOSYS } // For positive key IDs, KEYCTL_GET_KEYRING_ID can be used as an existence // and permissions check. key, err = t.LookupKey(keyID) } if err != nil { return 0, nil, err } return uintptr(key.ID), nil, nil } // keyctlDescribe implements keyctl(2) with operation KEYCTL_DESCRIBE. func keyctlDescribe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { keyID := auth.KeySerial(args[1].Int()) bufPtr := args[2].Pointer() bufSize := args[3].SizeT() // Get address range to write to. if bufSize > math.MaxInt32 { bufSize = math.MaxInt32 } var key *auth.Key var err error switch keyID { case linux.KEY_SPEC_SESSION_KEYRING: key, err = t.SessionKeyring() default: key, err = t.LookupKey(keyID) } if err != nil { return 0, nil, err } uid := t.UserNamespace().MapFromKUID(key.KUID()) gid := t.UserNamespace().MapFromKGID(key.KGID()) keyDesc := fmt.Sprintf("%s;%d;%d;%08x;%s\x00", key.Type(), uid, gid, uint64(key.Permissions()), key.Description) if bufSize > 0 { toWrite := uint(len(keyDesc)) if toWrite > bufSize { toWrite = bufSize } _, err = t.CopyOutBytes(bufPtr, []byte(keyDesc)[:toWrite]) } // The KEYCTL_DESCRIBE operation returns the length of the full string, // regardless of whether or not it was fully written out to userspace. // It includes the zero byte at the end in the returned length. return uintptr(len(keyDesc)), nil, err } // keyctlJoinSessionKeyring implements keyctl(2) with operation // KEYCTL_JOIN_SESSION_KEYRING. func keyctlJoinSessionKeyring(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { keyDescPtr := args[1].Pointer() var key *auth.Key var err error if keyDescPtr == 0 { // Creating an anonymous keyring. key, err = t.JoinSessionKeyring(nil) } else { // Joining a named keyring. Read in its description. var keyringDesc string keyringDesc, err = t.CopyInString(keyDescPtr, auth.MaxKeyDescSize) if err != nil { return 0, nil, err } key, err = t.JoinSessionKeyring(&keyringDesc) } if err != nil { return 0, nil, err } return uintptr(key.ID), nil, nil } // keyctlSetPerm implements keyctl(2) with operation KEYCTL_SETPERM. 
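// -- Editor's illustrative example (not part of the gVisor sources) --------
// A userspace sketch of the keyctl operations implemented above, assuming
// the golang.org/x/sys/unix wrappers; the keyring name is made up:
//
//	package main
//
//	import (
//		"fmt"
//		"log"
//
//		"golang.org/x/sys/unix"
//	)
//
//	func main() {
//		// KEYCTL_JOIN_SESSION_KEYRING with a named keyring.
//		ringID, err := unix.KeyctlJoinSessionKeyring("example-session")
//		if err != nil {
//			log.Fatal(err)
//		}
//
//		// KEYCTL_DESCRIBE returns "type;uid;gid;perm;description", matching
//		// the string format built by keyctlDescribe above.
//		desc, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, ringID)
//		fmt.Println(desc, err)
//	}
// ---------------------------------------------------------------------------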
func keyctlSetPerm(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { keyID := auth.KeySerial(args[1].Int()) newPerms := auth.KeyPermissions(args[2].Uint64()) var key *auth.Key var err error switch keyID { case linux.KEY_SPEC_SESSION_KEYRING: key, err = t.SessionKeyring() default: key, err = t.UserNamespace().Keys.Lookup(keyID) } if err != nil { return 0, nil, err } return 0, nil, t.SetPermsOnKey(key, newPerms) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_membarrier.go000066400000000000000000000065501465435605700273240ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // Membarrier implements syscall membarrier(2). func Membarrier(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { cmd := args[0].Int() flags := args[1].Uint() switch cmd { case linux.MEMBARRIER_CMD_QUERY: if flags != 0 { return 0, nil, linuxerr.EINVAL } var supportedCommands uintptr if t.Kernel().Platform.HaveGlobalMemoryBarrier() { supportedCommands |= linux.MEMBARRIER_CMD_GLOBAL | linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED | linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED | linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED | linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED } if t.RSeqAvailable() { supportedCommands |= linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ | linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ } return supportedCommands, nil, nil case linux.MEMBARRIER_CMD_GLOBAL, linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED: if flags != 0 { return 0, nil, linuxerr.EINVAL } if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { return 0, nil, linuxerr.EINVAL } if cmd == linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED && !t.MemoryManager().IsMembarrierPrivateEnabled() { return 0, nil, linuxerr.EPERM } return 0, nil, t.Kernel().Platform.GlobalMemoryBarrier() case linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: if flags != 0 { return 0, nil, linuxerr.EINVAL } if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { return 0, nil, linuxerr.EINVAL } // no-op return 0, nil, nil case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: if flags != 0 { return 0, nil, linuxerr.EINVAL } if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { return 0, nil, linuxerr.EINVAL } t.MemoryManager().EnableMembarrierPrivate() return 0, nil, nil case linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: if flags&^linux.MEMBARRIER_CMD_FLAG_CPU != 0 { return 0, nil, linuxerr.EINVAL } if !t.RSeqAvailable() { return 0, nil, linuxerr.EINVAL } if !t.MemoryManager().IsMembarrierRSeqEnabled() { return 0, nil, linuxerr.EPERM } // MEMBARRIER_CMD_FLAG_CPU and cpu_id are ignored since we don't have // the ability to preempt specific CPUs. 
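// -- Editor's illustrative example (not part of the gVisor sources) --------
// From userspace, the set of membarrier commands a kernel (or this sentry)
// supports is discovered with MEMBARRIER_CMD_QUERY. The sketch below uses a
// raw syscall because no dedicated wrapper is assumed; it relies only on the
// golang.org/x/sys/unix SYS_MEMBARRIER number, and MEMBARRIER_CMD_QUERY is 0
// in the Linux UAPI:
//
//	package main
//
//	import (
//		"fmt"
//
//		"golang.org/x/sys/unix"
//	)
//
//	func main() {
//		const membarrierCmdQuery = 0 // MEMBARRIER_CMD_QUERY
//		mask, _, errno := unix.Syscall(unix.SYS_MEMBARRIER, membarrierCmdQuery, 0, 0)
//		if errno != 0 {
//			fmt.Println("membarrier:", errno)
//			return
//		}
//		fmt.Printf("supported command bitmask: %#x\n", mask)
//	}
// ---------------------------------------------------------------------------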
return 0, nil, t.Kernel().Platform.PreemptAllCPUs() case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: if flags != 0 { return 0, nil, linuxerr.EINVAL } if !t.RSeqAvailable() { return 0, nil, linuxerr.EINVAL } t.MemoryManager().EnableMembarrierRSeq() return 0, nil, nil default: // Probably a command we don't implement. t.Kernel().EmitUnimplementedEvent(t, sysno) return 0, nil, linuxerr.EINVAL } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_mempolicy.go000066400000000000000000000251201465435605700271670ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/usermem" ) // We unconditionally report a single NUMA node. This also means that our // "nodemask_t" is a single unsigned long (uint64). const ( maxNodes = 1 allowedNodemask = (1 << maxNodes) - 1 ) func copyInNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32) (uint64, error) { // "nodemask points to a bit mask of node IDs that contains up to maxnode // bits. The bit mask size is rounded to the next multiple of // sizeof(unsigned long), but the kernel will use bits only up to maxnode. // A NULL value of nodemask or a maxnode value of zero specifies the empty // set of nodes. If the value of maxnode is zero, the nodemask argument is // ignored." - set_mempolicy(2). Unfortunately, most of this is inaccurate // because of what appears to be a bug: mm/mempolicy.c:get_nodes() uses // maxnode-1, not maxnode, as the number of bits. bits := maxnode - 1 if bits > hostarch.PageSize*8 { // also handles overflow from maxnode == 0 return 0, linuxerr.EINVAL } if bits == 0 { return 0, nil } // Copy in the whole nodemask. numUint64 := (bits + 63) / 64 buf := t.CopyScratchBuffer(int(numUint64) * 8) if _, err := t.CopyInBytes(addr, buf); err != nil { return 0, err } val := hostarch.ByteOrder.Uint64(buf) // Check that only allowed bits in the first unsigned long in the nodemask // are set. if val&^allowedNodemask != 0 { return 0, linuxerr.EINVAL } // Check that all remaining bits in the nodemask are 0. for i := 8; i < len(buf); i++ { if buf[i] != 0 { return 0, linuxerr.EINVAL } } return val, nil } func copyOutNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32, val uint64) error { // mm/mempolicy.c:copy_nodes_to_user() also uses maxnode-1 as the number of // bits. bits := maxnode - 1 if bits > hostarch.PageSize*8 { // also handles overflow from maxnode == 0 return linuxerr.EINVAL } if bits == 0 { return nil } // Copy out the first unsigned long in the nodemask. buf := t.CopyScratchBuffer(8) hostarch.ByteOrder.PutUint64(buf, val) if _, err := t.CopyOutBytes(addr, buf); err != nil { return err } // Zero out remaining unsigned longs in the nodemask. 
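// -- Editor's illustrative example (not part of the gVisor sources) --------
// As the comments above note, Linux treats maxnode-1 (not maxnode) as the
// number of nodemask bits, and the mask is processed in 64-bit "unsigned
// long" words. A standalone sketch of that arithmetic:
//
//	package main
//
//	import "fmt"
//
//	func main() {
//		for _, maxnode := range []uint32{1, 2, 65, 129} {
//			bits := maxnode - 1       // kernel quirk: one less than maxnode
//			words := (bits + 63) / 64 // uint64 words needed for the mask
//			fmt.Printf("maxnode=%3d -> bits=%3d words=%d\n", maxnode, bits, words)
//		}
//	}
// ---------------------------------------------------------------------------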
if bits > 64 { remAddr, ok := addr.AddLength(8) if !ok { return linuxerr.EFAULT } remUint64 := (bits - 1) / 64 if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{ AddressSpaceActive: true, }); err != nil { return err } } return nil } // GetMempolicy implements the syscall get_mempolicy(2). func GetMempolicy(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { mode := args[0].Pointer() nodemask := args[1].Pointer() maxnode := args[2].Uint() addr := args[3].Pointer() flags := args[4].Uint() if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 { return 0, nil, linuxerr.EINVAL } nodeFlag := flags&linux.MPOL_F_NODE != 0 addrFlag := flags&linux.MPOL_F_ADDR != 0 memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0 // "EINVAL: The value specified by maxnode is less than the number of node // IDs supported by the system." - get_mempolicy(2) if nodemask != 0 && maxnode < maxNodes { return 0, nil, linuxerr.EINVAL } // "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is // ignored and the set of nodes (memories) that the thread is allowed to // specify in subsequent calls to mbind(2) or set_mempolicy(2) (in the // absence of any mode flags) is returned in nodemask." if memsAllowed { // "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either // MPOL_F_ADDR or MPOL_F_NODE." if nodeFlag || addrFlag { return 0, nil, linuxerr.EINVAL } if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil { return 0, nil, err } return 0, nil, nil } // "If flags specifies MPOL_F_ADDR, then information is returned about the // policy governing the memory address given in addr. ... If the mode // argument is not NULL, then get_mempolicy() will store the policy mode // and any optional mode flags of the requested NUMA policy in the location // pointed to by this argument. If nodemask is not NULL, then the nodemask // associated with the policy will be stored in the location pointed to by // this argument." if addrFlag { policy, nodemaskVal, err := t.MemoryManager().NumaPolicy(addr) if err != nil { return 0, nil, err } if nodeFlag { // "If flags specifies both MPOL_F_NODE and MPOL_F_ADDR, // get_mempolicy() will return the node ID of the node on which the // address addr is allocated into the location pointed to by mode. // If no page has yet been allocated for the specified address, // get_mempolicy() will allocate a page as if the thread had // performed a read (load) access to that address, and return the // ID of the node where that page was allocated." buf := t.CopyScratchBuffer(1) _, err := t.CopyInBytes(addr, buf) if err != nil { return 0, nil, err } policy = linux.MPOL_DEFAULT // maxNodes == 1 } if mode != 0 { if _, err := policy.CopyOut(t, mode); err != nil { return 0, nil, err } } if nodemask != 0 { if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil { return 0, nil, err } } return 0, nil, nil } // "EINVAL: ... flags specified MPOL_F_ADDR and addr is NULL, or flags did // not specify MPOL_F_ADDR and addr is not NULL." This is partially // inaccurate: if flags specifies MPOL_F_ADDR, // mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will // just (usually) fail to find a VMA at address 0 and return EFAULT. 
if addr != 0 { return 0, nil, linuxerr.EINVAL } // "If flags is specified as 0, then information about the calling thread's // default policy (as set by set_mempolicy(2)) is returned, in the buffers // pointed to by mode and nodemask. ... If flags specifies MPOL_F_NODE, but // not MPOL_F_ADDR, and the thread's current policy is MPOL_INTERLEAVE, // then get_mempolicy() will return in the location pointed to by a // non-NULL mode argument, the node ID of the next node that will be used // for interleaving of internal kernel pages allocated on behalf of the // thread." policy, nodemaskVal := t.NumaPolicy() if nodeFlag { if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE { return 0, nil, linuxerr.EINVAL } policy = linux.MPOL_DEFAULT // maxNodes == 1 } if mode != 0 { if _, err := policy.CopyOut(t, mode); err != nil { return 0, nil, err } } if nodemask != 0 { if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil { return 0, nil, err } } return 0, nil, nil } // SetMempolicy implements the syscall set_mempolicy(2). func SetMempolicy(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { modeWithFlags := linux.NumaPolicy(args[0].Int()) nodemask := args[1].Pointer() maxnode := args[2].Uint() modeWithFlags, nodemaskVal, err := copyInMempolicyNodemask(t, modeWithFlags, nodemask, maxnode) if err != nil { return 0, nil, err } t.SetNumaPolicy(modeWithFlags, nodemaskVal) return 0, nil, nil } // Mbind implements the syscall mbind(2). func Mbind(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].Uint64() mode := linux.NumaPolicy(args[2].Int()) nodemask := args[3].Pointer() maxnode := args[4].Uint() flags := args[5].Uint() if flags&^linux.MPOL_MF_VALID != 0 { return 0, nil, linuxerr.EINVAL } // "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be // privileged (CAP_SYS_NICE) to use this flag." - mbind(2) if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) { return 0, nil, linuxerr.EPERM } mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode) if err != nil { return 0, nil, err } // Since we claim to have only a single node, all flags can be ignored // (since all pages must already be on that single node). err = t.MemoryManager().SetNumaPolicy(addr, length, mode, nodemaskVal) return 0, nil, err } func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nodemask hostarch.Addr, maxnode uint32) (linux.NumaPolicy, uint64, error) { flags := linux.NumaPolicy(modeWithFlags & linux.MPOL_MODE_FLAGS) mode := linux.NumaPolicy(modeWithFlags &^ linux.MPOL_MODE_FLAGS) if flags == linux.MPOL_MODE_FLAGS { // Can't specify both mode flags simultaneously. return 0, 0, linuxerr.EINVAL } if mode < 0 || mode >= linux.MPOL_MAX { // Must specify a valid mode. return 0, 0, linuxerr.EINVAL } var nodemaskVal uint64 if nodemask != 0 { var err error nodemaskVal, err = copyInNodemask(t, nodemask, maxnode) if err != nil { return 0, 0, err } } switch mode { case linux.MPOL_DEFAULT: // "nodemask must be specified as NULL." - set_mempolicy(2). This is inaccurate; // Linux allows a nodemask to be specified, as long as it is empty. if nodemaskVal != 0 { return 0, 0, linuxerr.EINVAL } case linux.MPOL_BIND, linux.MPOL_INTERLEAVE: // These require a non-empty nodemask. 
if nodemaskVal == 0 { return 0, 0, linuxerr.EINVAL } case linux.MPOL_PREFERRED: // This permits an empty nodemask, as long as no flags are set. if nodemaskVal == 0 { if flags != 0 { return 0, 0, linuxerr.EINVAL } // On newer Linux versions, MPOL_PREFERRED is implemented as MPOL_LOCAL // when node set is empty. See 7858d7bca7fb ("mm/mempolicy: don't handle // MPOL_LOCAL like a fake MPOL_PREFERRED policy"). mode = linux.MPOL_LOCAL } case linux.MPOL_LOCAL: // This requires an empty nodemask and no flags set. if nodemaskVal != 0 || flags != 0 { return 0, 0, linuxerr.EINVAL } default: // Unknown mode, which we should have rejected above. panic(fmt.Sprintf("unknown mode: %v", mode)) } return mode | flags, nodemaskVal, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_mmap.go000066400000000000000000000256171465435605700261360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "bytes" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" ) // Brk implements linux syscall brk(2). func Brk(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr, _ := t.MemoryManager().Brk(t, args[0].Pointer()) // "However, the actual Linux system call returns the new program break on // success. On failure, the system call returns the current break." - // brk(2) return uintptr(addr), nil, nil } // Mmap implements Linux syscall mmap(2). func Mmap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { prot := args[2].Int() flags := args[3].Int() fd := args[4].Int() fixed := flags&linux.MAP_FIXED != 0 private := flags&linux.MAP_PRIVATE != 0 shared := flags&linux.MAP_SHARED != 0 anon := flags&linux.MAP_ANONYMOUS != 0 map32bit := flags&linux.MAP_32BIT != 0 // Require exactly one of MAP_PRIVATE and MAP_SHARED. if private == shared { return 0, nil, linuxerr.EINVAL } opts := memmap.MMapOpts{ Length: args[1].Uint64(), Offset: args[5].Uint64(), Addr: args[0].Pointer(), Fixed: fixed, Unmap: fixed, Map32Bit: map32bit, Private: private, Perms: hostarch.AccessType{ Read: linux.PROT_READ&prot != 0, Write: linux.PROT_WRITE&prot != 0, Execute: linux.PROT_EXEC&prot != 0, }, MaxPerms: hostarch.AnyAccess, GrowsDown: linux.MAP_GROWSDOWN&flags != 0, Stack: linux.MAP_STACK&flags != 0, } if linux.MAP_POPULATE&flags != 0 { opts.PlatformEffect = memmap.PlatformEffectCommit } if linux.MAP_LOCKED&flags != 0 { opts.MLockMode = memmap.MLockEager } defer func() { if opts.MappingIdentity != nil { opts.MappingIdentity.DecRef(t) } }() if !anon { // Convert the passed FD to a file reference. 
file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // mmap unconditionally requires that the FD is readable. if !file.IsReadable() { return 0, nil, linuxerr.EACCES } // MAP_SHARED requires that the FD be writable for PROT_WRITE. if shared && !file.IsWritable() { opts.MaxPerms.Write = false } if err := file.ConfigureMMap(t, &opts); err != nil { return 0, nil, err } } else if shared { // Back shared anonymous mappings with an anonymous tmpfs file. opts.Offset = 0 file, err := tmpfs.NewZeroFile(t, t.Credentials(), t.Kernel().ShmMount(), opts.Length) if err != nil { return 0, nil, err } defer file.DecRef(t) if err := file.ConfigureMMap(t, &opts); err != nil { return 0, nil, err } } rv, err := t.MemoryManager().MMap(t, opts) return uintptr(rv), nil, err } // Munmap implements linux syscall munmap(2). func Munmap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64()) } // Mremap implements linux syscall mremap(2). func Mremap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldAddr := args[0].Pointer() oldSize := args[1].Uint64() newSize := args[2].Uint64() flags := args[3].Uint64() newAddr := args[4].Pointer() if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 { return 0, nil, linuxerr.EINVAL } mayMove := flags&linux.MREMAP_MAYMOVE != 0 fixed := flags&linux.MREMAP_FIXED != 0 var moveMode mm.MRemapMoveMode switch { case !mayMove && !fixed: moveMode = mm.MRemapNoMove case mayMove && !fixed: moveMode = mm.MRemapMayMove case mayMove && fixed: moveMode = mm.MRemapMustMove case !mayMove && fixed: // "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be // specified." - mremap(2) return 0, nil, linuxerr.EINVAL } rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{ Move: moveMode, NewAddr: newAddr, }) return uintptr(rv), nil, err } // Mprotect implements linux syscall mprotect(2). func Mprotect(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { length := args[1].Uint64() prot := args[2].Int() err := t.MemoryManager().MProtect(args[0].Pointer(), length, hostarch.AccessType{ Read: linux.PROT_READ&prot != 0, Write: linux.PROT_WRITE&prot != 0, Execute: linux.PROT_EXEC&prot != 0, }, linux.PROT_GROWSDOWN&prot != 0) return 0, nil, err } // Madvise implements linux syscall madvise(2). func Madvise(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := uint64(args[1].SizeT()) adv := args[2].Int() // "The Linux implementation requires that the address addr be // page-aligned, and allows length to be zero." - madvise(2) if addr.RoundDown() != addr { return 0, nil, linuxerr.EINVAL } if length == 0 { return 0, nil, nil } // Not explicitly stated: length need not be page-aligned. 
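// -- Editor's illustrative example (not part of the gVisor sources) --------
// A userspace sketch of mmap(2) plus madvise(MADV_DONTNEED), the advice that
// the Madvise implementation here forwards to Decommit(); it assumes
// golang.org/x/sys/unix and uses a 4 KiB mapping length for simplicity:
//
//	package main
//
//	import (
//		"fmt"
//		"log"
//
//		"golang.org/x/sys/unix"
//	)
//
//	func main() {
//		// Private anonymous mapping of one page.
//		b, err := unix.Mmap(-1, 0, 4096,
//			unix.PROT_READ|unix.PROT_WRITE,
//			unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
//		if err != nil {
//			log.Fatal(err)
//		}
//		b[0] = 42
//
//		// MADV_DONTNEED discards the contents of the range.
//		if err := unix.Madvise(b, unix.MADV_DONTNEED); err != nil {
//			log.Fatal(err)
//		}
//		fmt.Println("after MADV_DONTNEED:", b[0]) // anonymous memory reads back as 0
//
//		if err := unix.Munmap(b); err != nil {
//			log.Fatal(err)
//		}
//	}
// ---------------------------------------------------------------------------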
lenAddr, ok := hostarch.Addr(length).RoundUp() if !ok { return 0, nil, linuxerr.EINVAL } length = uint64(lenAddr) switch adv { case linux.MADV_DONTNEED: return 0, nil, t.MemoryManager().Decommit(addr, length) case linux.MADV_DOFORK: return 0, nil, t.MemoryManager().SetDontFork(addr, length, false) case linux.MADV_DONTFORK: return 0, nil, t.MemoryManager().SetDontFork(addr, length, true) case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE: fallthrough case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE: fallthrough case linux.MADV_DONTDUMP, linux.MADV_DODUMP: // TODO(b/72045799): Core dumping isn't implemented, so these are // no-ops. fallthrough case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED: // Do nothing, we totally ignore the suggestions above. return 0, nil, nil case linux.MADV_REMOVE: // These "suggestions" have application-visible side effects, so we // have to indicate that we don't support them. return 0, nil, linuxerr.ENOSYS case linux.MADV_HWPOISON: // Only privileged processes are allowed to poison pages. return 0, nil, linuxerr.EPERM default: // If adv is not a valid value tell the caller. return 0, nil, linuxerr.EINVAL } } // Mincore implements the syscall mincore(2). func Mincore(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].SizeT() vec := args[2].Pointer() if addr != addr.RoundDown() { return 0, nil, linuxerr.EINVAL } // "The length argument need not be a multiple of the page size, but since // residency information is returned for whole pages, length is effectively // rounded up to the next multiple of the page size." - mincore(2) la, ok := hostarch.Addr(length).RoundUp() if !ok { return 0, nil, linuxerr.ENOMEM } ar, ok := addr.ToRange(uint64(la)) if !ok { return 0, nil, linuxerr.ENOMEM } // Pretend that all mapped pages are "resident in core". mapped := t.MemoryManager().VirtualMemorySizeRange(ar) // "ENOMEM: addr to addr + length contained unmapped memory." if mapped != uint64(la) { return 0, nil, linuxerr.ENOMEM } resident := bytes.Repeat([]byte{1}, int(mapped/hostarch.PageSize)) _, err := t.CopyOutBytes(vec, resident) return 0, nil, err } // Msync implements Linux syscall msync(2). func Msync(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].SizeT() flags := args[2].Int() // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, // and may additionally include the MS_INVALIDATE bit. ... However, Linux // permits a call to msync() that specifies neither of these flags, with // semantics that are (currently) equivalent to specifying MS_ASYNC." - // msync(2) if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 { return 0, nil, linuxerr.EINVAL } sync := flags&linux.MS_SYNC != 0 if sync && flags&linux.MS_ASYNC != 0 { return 0, nil, linuxerr.EINVAL } err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ Sync: sync, Invalidate: flags&linux.MS_INVALIDATE != 0, }) // MSync calls fsync, the same interrupt conversion rules apply, see // mm/msync.c, fsync POSIX.1-2008. return 0, nil, linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS) } // Mlock implements linux syscall mlock(2). 
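// -- Editor's illustrative example (not part of the gVisor sources) --------
// A userspace sketch of mlock(2)/munlock(2) on an anonymous mapping,
// assuming golang.org/x/sys/unix (locking is subject to RLIMIT_MEMLOCK):
//
//	package main
//
//	import (
//		"log"
//
//		"golang.org/x/sys/unix"
//	)
//
//	func main() {
//		b, err := unix.Mmap(-1, 0, 4096,
//			unix.PROT_READ|unix.PROT_WRITE,
//			unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
//		if err != nil {
//			log.Fatal(err)
//		}
//		defer unix.Munmap(b)
//
//		if err := unix.Mlock(b); err != nil { // mlock(2)
//			log.Fatal(err)
//		}
//		if err := unix.Munlock(b); err != nil { // munlock(2)
//			log.Fatal(err)
//		}
//	}
// ---------------------------------------------------------------------------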
func Mlock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].SizeT() return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager) } // Mlock2 implements linux syscall mlock2(2). func Mlock2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].SizeT() flags := args[2].Int() if flags&^(linux.MLOCK_ONFAULT) != 0 { return 0, nil, linuxerr.EINVAL } mode := memmap.MLockEager if flags&linux.MLOCK_ONFAULT != 0 { mode = memmap.MLockLazy } return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode) } // Munlock implements linux syscall munlock(2). func Munlock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].SizeT() return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone) } // Mlockall implements linux syscall mlockall(2). func Mlockall(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { return 0, nil, linuxerr.EINVAL } mode := memmap.MLockEager if flags&linux.MCL_ONFAULT != 0 { mode = memmap.MLockLazy } return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ Current: flags&linux.MCL_CURRENT != 0, Future: flags&linux.MCL_FUTURE != 0, Mode: mode, }) } // Munlockall implements linux syscall munlockall(2). func Munlockall(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ Current: true, Future: true, Mode: memmap.MLockNone, }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_mount.go000066400000000000000000000131141465435605700263330ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Mount implements Linux syscall mount(2). func Mount(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { sourceAddr := args[0].Pointer() targetAddr := args[1].Pointer() typeAddr := args[2].Pointer() flags := args[3].Uint64() dataAddr := args[4].Pointer() // Must have CAP_SYS_ADMIN in the current mount namespace's associated user // namespace. creds := t.Credentials() if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().Owner) { return 0, nil, linuxerr.EPERM } // Ignore magic value that was required before Linux 2.4. 
if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL { flags = flags &^ linux.MS_MGC_MSK } // Silently allow MS_NOSUID, since we don't implement set-id bits anyway. const unsupported = linux.MS_UNBINDABLE | linux.MS_MOVE | linux.MS_NODIRATIME // Linux just allows passing any flags to mount(2) - it won't fail when // unknown or unsupported flags are passed. Since we don't implement // everything, we fail explicitly on flags that are unimplemented. if flags&(unsupported) != 0 { return 0, nil, linuxerr.EINVAL } // For null-terminated strings related to mount(2), Linux copies in at most // a page worth of data. See fs/namespace.c:copy_mount_string(). targetPath, err := copyInPath(t, targetAddr) if err != nil { return 0, nil, err } target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, followFinalSymlink) if err != nil { return 0, nil, err } defer target.Release(t) var opts vfs.MountOptions if flags&(linux.MS_NOATIME|linux.MS_STRICTATIME) == linux.MS_NOATIME { opts.Flags.NoATime = true } if flags&linux.MS_NOEXEC == linux.MS_NOEXEC { opts.Flags.NoExec = true } if flags&linux.MS_NODEV == linux.MS_NODEV { opts.Flags.NoDev = true } if flags&linux.MS_NOSUID == linux.MS_NOSUID { opts.Flags.NoSUID = true } if flags&linux.MS_RDONLY == linux.MS_RDONLY { opts.ReadOnly = true } data := "" if dataAddr != 0 { // In Linux, a full page is always copied in regardless of null // character placement, and the address is passed to each file system. // Most file systems always treat this data as a string, though, and so // do all of the ones we implement. data, err = t.CopyInString(dataAddr, hostarch.PageSize) if err != nil { return 0, nil, err } } opts.GetFilesystemOptions.Data = data switch { case flags&linux.MS_REMOUNT != 0: // When MS_REMOUNT is specified, the flags and data should match the values used in the original mount() call, // except for those parameters that are being changed. // // The src and filesystem type are ignored for MS_REMOUNT. return 0, nil, t.Kernel().VFS().RemountAt(t, creds, &target.pop, &opts) case flags&linux.MS_BIND != 0: sourcePath, err := copyInPath(t, sourceAddr) if err != nil { return 0, nil, err } var sourceTpop taskPathOperation sourceTpop, err = getTaskPathOperation(t, linux.AT_FDCWD, sourcePath, disallowEmptyPath, followFinalSymlink) if err != nil { return 0, nil, err } defer sourceTpop.Release(t) return 0, nil, t.Kernel().VFS().BindAt(t, creds, &sourceTpop.pop, &target.pop, flags&linux.MS_REC != 0) case flags&(linux.MS_SHARED|linux.MS_PRIVATE|linux.MS_SLAVE|linux.MS_UNBINDABLE) != 0: return 0, nil, t.Kernel().VFS().SetMountPropagationAt(t, creds, &target.pop, uint32(flags)) } // Only copy in source, fstype, and data if we are doing a normal mount. var source string if sourceAddr != 0 { source, err = t.CopyInString(sourceAddr, hostarch.PageSize) if err != nil { return 0, nil, err } } fsType, err := t.CopyInString(typeAddr, hostarch.PageSize) if err != nil { return 0, nil, err } _, err = t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) return 0, nil, err } // Umount2 implements Linux syscall umount2(2). func Umount2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() flags := args[1].Int() // Must have CAP_SYS_ADMIN in the mount namespace's associated user // namespace. // // Currently, this is always the init task's user namespace. 
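// -- Editor's illustrative example (not part of the gVisor sources) --------
// A userspace sketch of the bind-mount path handled above (MS_BIND|MS_REC)
// and of a lazy unmount, assuming golang.org/x/sys/unix; the /tmp/src and
// /tmp/dst paths are made up, must already exist, and CAP_SYS_ADMIN is
// required:
//
//	package main
//
//	import (
//		"log"
//
//		"golang.org/x/sys/unix"
//	)
//
//	func main() {
//		// mount(2) with MS_BIND|MS_REC: recursively bind /tmp/src onto /tmp/dst.
//		if err := unix.Mount("/tmp/src", "/tmp/dst", "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
//			log.Fatal(err)
//		}
//		// umount2(2) with MNT_DETACH (a lazy unmount) to undo it.
//		if err := unix.Unmount("/tmp/dst", unix.MNT_DETACH); err != nil {
//			log.Fatal(err)
//		}
//	}
// ---------------------------------------------------------------------------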
creds := t.Credentials() if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().Owner) { return 0, nil, linuxerr.EPERM } const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE if flags&unsupported != 0 { return 0, nil, linuxerr.EINVAL } path, err := copyInPath(t, addr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.UMOUNT_NOFOLLOW == 0)) if err != nil { return 0, nil, err } defer tpop.Release(t) opts := vfs.UmountOptions{ Flags: uint32(flags &^ linux.UMOUNT_NOFOLLOW), } return 0, nil, t.Kernel().VFS().UmountAt(t, creds, &tpop.pop, &opts) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_mq.go000066400000000000000000000052571465435605700256170ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/mq" ) // MqOpen implements mq_open(2). func MqOpen(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nameAddr := args[0].Pointer() flag := args[1].Int() mode := args[2].ModeT() attrAddr := args[3].Pointer() name, err := t.CopyInString(nameAddr, mq.MaxName) if err != nil { return 0, nil, err } rOnly := flag&linux.O_RDONLY == linux.O_RDONLY wOnly := flag&linux.O_WRONLY == linux.O_WRONLY readWrite := flag&linux.O_RDWR == linux.O_RDWR create := flag&linux.O_CREAT == linux.O_CREAT exclusive := flag&linux.O_EXCL == linux.O_EXCL block := flag&linux.O_NONBLOCK != linux.O_NONBLOCK var attr linux.MqAttr var attrPtr *linux.MqAttr if attrAddr != 0 { if _, err := attr.CopyIn(t, attrAddr); err != nil { return 0, nil, err } attrPtr = &attr } opts := openOpts(name, rOnly, wOnly, readWrite, create, exclusive, block) r := t.IPCNamespace().PosixQueues() queue, err := r.FindOrCreate(t, opts, linux.FileMode(mode), attrPtr) if err != nil { return 0, nil, err } defer queue.DecRef(t) fd, err := t.NewFDFrom(0, queue, kernel.FDFlags{ CloseOnExec: flag&linux.O_CLOEXEC != 0, }) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil } // MqUnlink implements mq_unlink(2). 
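// -- Editor's illustrative example (not part of the gVisor sources) --------
// Because O_RDONLY is 0, the rOnly test in MqOpen above ("flag&O_RDONLY ==
// O_RDONLY") is true for every flag value, which is why openOpts() below
// must check O_RDWR and O_WRONLY first and fall back to read-only last.
// Userspace code typically masks with O_ACCMODE instead; a standalone sketch
// assuming golang.org/x/sys/unix:
//
//	package main
//
//	import (
//		"fmt"
//
//		"golang.org/x/sys/unix"
//	)
//
//	func main() {
//		for _, flag := range []int{unix.O_RDONLY, unix.O_WRONLY, unix.O_RDWR | unix.O_NONBLOCK} {
//			switch flag & unix.O_ACCMODE {
//			case unix.O_RDWR:
//				fmt.Println("read-write")
//			case unix.O_WRONLY:
//				fmt.Println("write-only")
//			default: // O_RDONLY == 0
//				fmt.Println("read-only")
//			}
//		}
//	}
// ---------------------------------------------------------------------------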
func MqUnlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nameAddr := args[0].Pointer() name, err := t.CopyInString(nameAddr, mq.MaxName) if err != nil { return 0, nil, err } return 0, nil, t.IPCNamespace().PosixQueues().Remove(t, name) } func openOpts(name string, rOnly, wOnly, readWrite, create, exclusive, block bool) mq.OpenOpts { var access mq.AccessType switch { case readWrite: access = mq.ReadWrite case wOnly: access = mq.WriteOnly case rOnly: access = mq.ReadOnly } return mq.OpenOpts{ Name: name, Access: access, Create: create, Exclusive: exclusive, Block: block, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_msgqueue.go000066400000000000000000000124451465435605700270320ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" "gvisor.dev/gvisor/pkg/sentry/kernel/msgqueue" ) // Msgget implements msgget(2). func Msgget(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { key := ipc.Key(args[0].Int()) flag := args[1].Int() private := key == linux.IPC_PRIVATE create := flag&linux.IPC_CREAT == linux.IPC_CREAT exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL mode := linux.FileMode(flag & 0777) r := t.IPCNamespace().MsgqueueRegistry() queue, err := r.FindOrCreate(t, key, mode, private, create, exclusive) if err != nil { return 0, nil, err } return uintptr(queue.ID()), nil, nil } // Msgsnd implements msgsnd(2). func Msgsnd(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := ipc.ID(args[0].Int()) msgAddr := args[1].Pointer() size := args[2].Int64() flag := args[3].Int() if size < 0 || size > linux.MSGMAX { return 0, nil, linuxerr.EINVAL } wait := flag&linux.IPC_NOWAIT != linux.IPC_NOWAIT pid := int32(t.ThreadGroup().ID()) buf := linux.MsgBuf{ Text: make([]byte, size), } if _, err := buf.CopyIn(t, msgAddr); err != nil { return 0, nil, err } queue, err := t.IPCNamespace().MsgqueueRegistry().FindByID(id) if err != nil { return 0, nil, err } msg := msgqueue.Message{ Type: int64(buf.Type), Text: buf.Text, Size: uint64(size), } return 0, nil, queue.Send(t, msg, t, wait, pid) } // Msgrcv implements msgrcv(2). 
func Msgrcv(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := ipc.ID(args[0].Int()) msgAddr := args[1].Pointer() size := args[2].Int64() mType := args[3].Int64() flag := args[4].Int() wait := flag&linux.IPC_NOWAIT != linux.IPC_NOWAIT except := flag&linux.MSG_EXCEPT == linux.MSG_EXCEPT truncate := flag&linux.MSG_NOERROR == linux.MSG_NOERROR msgCopy := flag&linux.MSG_COPY == linux.MSG_COPY msg, err := receive(t, id, mType, size, msgCopy, wait, truncate, except) if err != nil { return 0, nil, err } buf := linux.MsgBuf{ Type: primitive.Int64(msg.Type), Text: msg.Text, } if _, err := buf.CopyOut(t, msgAddr); err != nil { return 0, nil, err } return uintptr(msg.Size), nil, nil } // receive returns a message from the queue with the given ID. If msgCopy is // true, a message is copied from the queue without being removed. Otherwise, // a message is removed from the queue and returned. func receive(t *kernel.Task, id ipc.ID, mType int64, maxSize int64, msgCopy, wait, truncate, except bool) (*msgqueue.Message, error) { pid := int32(t.ThreadGroup().ID()) queue, err := t.IPCNamespace().MsgqueueRegistry().FindByID(id) if err != nil { return nil, err } if msgCopy { if wait || except { return nil, linuxerr.EINVAL } return queue.Copy(mType) } return queue.Receive(t, t, mType, maxSize, wait, truncate, except, pid) } // Msgctl implements msgctl(2). func Msgctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := ipc.ID(args[0].Int()) cmd := args[1].Int() buf := args[2].Pointer() creds := auth.CredentialsFromContext(t) r := t.IPCNamespace().MsgqueueRegistry() switch cmd { case linux.IPC_INFO: info := r.IPCInfo(t) _, err := info.CopyOut(t, buf) return 0, nil, err case linux.MSG_INFO: msgInfo := r.MsgInfo(t) _, err := msgInfo.CopyOut(t, buf) return 0, nil, err case linux.IPC_RMID: return 0, nil, r.Remove(id, creds) } // Remaining commands use a queue. queue, err := r.FindByID(id) if err != nil { return 0, nil, err } switch cmd { case linux.MSG_STAT: // Technically, we should be treating id as "an index into the kernel's // internal array that maintains information about all shared memory // segments on the system". Since we don't track segments in an array, // we'll just pretend the msqid is the index and do the same thing as // IPC_STAT. Linux also uses the index as the msqid. fallthrough case linux.IPC_STAT: stat, err := queue.Stat(t) if err != nil { return 0, nil, err } _, err = stat.CopyOut(t, buf) return 0, nil, err case linux.MSG_STAT_ANY: stat, err := queue.StatAny(t) if err != nil { return 0, nil, err } _, err = stat.CopyOut(t, buf) return 0, nil, err case linux.IPC_SET: var ds linux.MsqidDS if _, err := ds.CopyIn(t, buf); err != nil { return 0, nil, linuxerr.EINVAL } err := queue.Set(t, &ds) return 0, nil, err default: return 0, nil, linuxerr.EINVAL } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_pipe.go000066400000000000000000000040441465435605700261300ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Pipe implements Linux syscall pipe(2). func Pipe(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() return 0, nil, pipe2(t, addr, 0) } // Pipe2 implements Linux syscall pipe2(2). func Pipe2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() flags := args[1].Int() return 0, nil, pipe2(t, addr, flags) } func pipe2(t *kernel.Task, addr hostarch.Addr, flags int32) error { if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { return linuxerr.EINVAL } r, w, err := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK)) if err != nil { return err } defer r.DecRef(t) defer w.DecRef(t) fds, err := t.NewFDs(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, }) if err != nil { return err } if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil { for _, fd := range fds { if file := t.FDTable().Remove(t, fd); file != nil { file.DecRef(t) } } return err } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_poll.go000066400000000000000000000375321465435605700261510ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/waiter" ) // fileCap is the maximum allowable files for poll & select. This has no // equivalent in Linux; it exists in gVisor since allocation failure in Go is // unrecoverable. const fileCap = 1024 * 1024 // Masks for "readable", "writable", and "exceptional" events as defined by // select(2). const ( // selectReadEvents is analogous to the Linux kernel's // fs/select.c:POLLIN_SET. selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR // selectWriteEvents is analogous to the Linux kernel's // fs/select.c:POLLOUT_SET. selectWriteEvents = linux.POLLOUT | linux.POLLERR // selectExceptEvents is analogous to the Linux kernel's // fs/select.c:POLLEX_SET. selectExceptEvents = linux.POLLPRI ) // pollState tracks the associated file description and waiter of a PollFD. 
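// -- Editor's illustrative example (not part of the gVisor sources) --------
// A userspace sketch combining pipe2(2) (implemented above) with poll(2)
// (implemented below), assuming golang.org/x/sys/unix:
//
//	package main
//
//	import (
//		"fmt"
//		"log"
//
//		"golang.org/x/sys/unix"
//	)
//
//	func main() {
//		// pipe2(2) with O_CLOEXEC.
//		p := make([]int, 2)
//		if err := unix.Pipe2(p, unix.O_CLOEXEC); err != nil {
//			log.Fatal(err)
//		}
//		defer unix.Close(p[0])
//		defer unix.Close(p[1])
//
//		// Make the read end readable, then poll(2) it with a 100ms timeout.
//		if _, err := unix.Write(p[1], []byte("x")); err != nil {
//			log.Fatal(err)
//		}
//		fds := []unix.PollFd{{Fd: int32(p[0]), Events: unix.POLLIN}}
//		n, err := unix.Poll(fds, 100)
//		fmt.Println("ready:", n, "revents:", fds[0].Revents, "err:", err)
//	}
// ---------------------------------------------------------------------------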
type pollState struct { file *vfs.FileDescription waiter waiter.Entry } // initReadiness gets the current ready mask for the file represented by the FD // stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is // used to register with the file for event notifications, and a reference to // the file is stored in "state". func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) error { if pfd.FD < 0 { pfd.REvents = 0 return nil } file := t.GetFile(pfd.FD) if file == nil { pfd.REvents = linux.POLLNVAL return nil } if ch == nil { defer file.DecRef(t) } else { state.file = file state.waiter.Init(waiter.ChannelNotifier(ch), waiter.EventMaskFromLinux(uint32(pfd.Events))) if err := file.EventRegister(&state.waiter); err != nil { return err } } r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events))) pfd.REvents = int16(r.ToLinux()) & pfd.Events return nil } // releaseState releases all the pollState in "state". func releaseState(t *kernel.Task, state []pollState) { for i := range state { if state[i].file != nil { state[i].file.EventUnregister(&state[i].waiter) state[i].file.DecRef(t) } } } // pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout" // when "timeout" is greater than zero. // // pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or // positive if interrupted by a signal. func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) { var ch chan struct{} if timeout != 0 { ch = make(chan struct{}, 1) } // Register for event notification in the files involved if we may // block (timeout not zero). Once we find a file that has a non-zero // result, we stop registering for events but still go through all files // to get their ready masks. state := make([]pollState, len(pfd)) defer releaseState(t, state) n := uintptr(0) for i := range pfd { if err := initReadiness(t, &pfd[i], &state[i], ch); err != nil { return timeout, 0, err } if pfd[i].REvents != 0 { n++ ch = nil } } if timeout == 0 { return timeout, n, nil } haveTimeout := timeout >= 0 for n == 0 { var err error // Wait for a notification. timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout) if err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = nil } return timeout, 0, err } // We got notified, count how many files are ready. If none, // then this was a spurious notification, and we just go back // to sleep with the remaining timeout. for i := range state { if state[i].file == nil { continue } r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events))) rl := int16(r.ToLinux()) & pfd[i].Events if rl != 0 { pfd[i].REvents = rl n++ } } } return timeout, n, nil } // CopyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. func CopyInPollFDs(t *kernel.Task, addr hostarch.Addr, nfds uint) ([]linux.PollFD, error) { if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { return nil, linuxerr.EINVAL } pfd := make([]linux.PollFD, nfds) if nfds > 0 { if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil { return nil, err } } return pfd, nil } func doPoll(t *kernel.Task, addr hostarch.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { pfd, err := CopyInPollFDs(t, addr, nfds) if err != nil { return timeout, 0, err } // Compatibility warning: Linux adds POLLHUP and POLLERR just before // polling, in fs/select.c:do_pollfd(). 
Since pfd is copied out after // polling, changing event masks here is an application-visible difference. // (Linux also doesn't copy out event masks at all, only revents.) for i := range pfd { pfd[i].Events |= linux.POLLHUP | linux.POLLERR } remainingTimeout, n, err := pollBlock(t, pfd, timeout) err = linuxerr.ConvertIntr(err, linuxerr.EINTR) // The poll entries are copied out regardless of whether // any are set or not. This aligns with the Linux behavior. if nfds > 0 && err == nil { if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil { return remainingTimeout, 0, err } } return remainingTimeout, n, err } // CopyInFDSet copies an fd set from select(2)/pselect(2). func CopyInFDSet(t *kernel.Task, addr hostarch.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) { set := make([]byte, nBytes) if addr != 0 { if _, err := t.CopyInBytes(addr, set); err != nil { return nil, err } // If we only use part of the last byte, mask out the extraneous bits. // // N.B. This only works on little-endian architectures. if nBitsInLastPartialByte != 0 { set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte } } return set, nil } func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Addr, timeout time.Duration) (uintptr, error) { if nfds < 0 || nfds > fileCap { return 0, linuxerr.EINVAL } // Calculate the size of the fd sets (one bit per fd). nBytes := (nfds + 7) / 8 nBitsInLastPartialByte := nfds % 8 // Capture all the provided input vectors. r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte) if err != nil { return 0, err } w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte) if err != nil { return 0, err } e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte) if err != nil { return 0, err } // Count how many FDs are actually being requested so that we can build // a PollFD array. fdCount := 0 for i := 0; i < nBytes; i++ { v := r[i] | w[i] | e[i] for v != 0 { v &= (v - 1) fdCount++ } } // Build the PollFD array. pfd := make([]linux.PollFD, 0, fdCount) var fd int32 for i := 0; i < nBytes; i++ { rV, wV, eV := r[i], w[i], e[i] v := rV | wV | eV m := byte(1) for j := 0; j < 8; j++ { if (v & m) != 0 { // Make sure the fd is valid and decrement the reference // immediately to ensure we don't leak. Note, another thread // might be about to close fd. This is racy, but that's // OK. Linux is racy in the same way. file := t.GetFile(fd) if file == nil { return 0, linuxerr.EBADF } file.DecRef(t) var mask int16 if (rV & m) != 0 { mask |= selectReadEvents } if (wV & m) != 0 { mask |= selectWriteEvents } if (eV & m) != 0 { mask |= selectExceptEvents } pfd = append(pfd, linux.PollFD{ FD: fd, Events: mask, }) } fd++ m <<= 1 } } // Do the syscall, then count the number of bits set. if _, _, err = pollBlock(t, pfd, timeout); err != nil { return 0, linuxerr.ConvertIntr(err, linuxerr.EINTR) } // r, w, and e are currently event mask bitsets; unset bits corresponding // to events that *didn't* occur. bitSetCount := uintptr(0) for idx := range pfd { events := pfd[idx].REvents i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8) m := byte(1) << j if r[i]&m != 0 { if (events & selectReadEvents) != 0 { bitSetCount++ } else { r[i] &^= m } } if w[i]&m != 0 { if (events & selectWriteEvents) != 0 { bitSetCount++ } else { w[i] &^= m } } if e[i]&m != 0 { if (events & selectExceptEvents) != 0 { bitSetCount++ } else { e[i] &^= m } } } // Copy updated vectors back. 
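// For example, with nfds == 10 each set is nBytes == 2 bytes long, and fd 9
// occupies byte 1, bit 1 of each set (FD/8 and FD%8); after the loop above
// that bit survives in r only if POLLIN, POLLHUP, or POLLERR was reported for
// fd 9.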
if readFDs != 0 { if _, err := t.CopyOutBytes(readFDs, r); err != nil { return 0, err } } if writeFDs != 0 { if _, err := t.CopyOutBytes(writeFDs, w); err != nil { return 0, err } } if exceptFDs != 0 { if _, err := t.CopyOutBytes(exceptFDs, e); err != nil { return 0, err } } return bitSetCount, nil } // timeoutRemaining returns the amount of time remaining for the specified // timeout or 0 if it has elapsed. // // startNs must be from CLOCK_MONOTONIC. func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration { now := t.Kernel().MonotonicClock().Now() remaining := timeout - now.Sub(startNs) if remaining < 0 { remaining = 0 } return remaining } // copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr. // // startNs must be from CLOCK_MONOTONIC. func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr hostarch.Addr) error { if timeout <= 0 { return nil } remaining := timeoutRemaining(t, startNs, timeout) tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds()) _, err := tsRemaining.CopyOut(t, timespecAddr) return err } // copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr. // // startNs must be from CLOCK_MONOTONIC. func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr hostarch.Addr) error { if timeout <= 0 { return nil } remaining := timeoutRemaining(t, startNs, timeout) tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds()) _, err := tvRemaining.CopyOut(t, timevalAddr) return err } // pollRestartBlock encapsulates the state required to restart poll(2) via // restart_syscall(2). // // +stateify savable type pollRestartBlock struct { pfdAddr hostarch.Addr nfds uint timeout time.Duration } // Restart implements kernel.SyscallRestartBlock.Restart. func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) { return poll(t, p.pfdAddr, p.nfds, p.timeout) } func poll(t *kernel.Task, pfdAddr hostarch.Addr, nfds uint, timeout time.Duration) (uintptr, error) { remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout) // On an interrupt poll(2) is restarted with the remaining timeout. if linuxerr.Equals(linuxerr.EINTR, err) { t.SetSyscallRestartBlock(&pollRestartBlock{ pfdAddr: pfdAddr, nfds: nfds, timeout: remainingTimeout, }) return 0, linuxerr.ERESTART_RESTARTBLOCK } return n, err } // Poll implements linux syscall poll(2). func Poll(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pfdAddr := args[0].Pointer() nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. timeout := time.Duration(args[2].Int()) * time.Millisecond n, err := poll(t, pfdAddr, nfds, timeout) return n, nil, err } // Ppoll implements linux syscall ppoll(2). func Ppoll(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pfdAddr := args[0].Pointer() nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. 
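// Unlike poll(2), ppoll(2) takes a struct timespec timeout and an optional
// signal mask that is installed for the duration of the call (via
// setTempSignalSet below); when a timeout was supplied, the remaining time is
// copied back to timespecAddr before returning. A minimal userspace sketch of
// this ABI using golang.org/x/sys/unix (illustrative only; connFd is a
// placeholder fd):
//
//	fds := []unix.PollFd{{Fd: int32(connFd), Events: unix.POLLIN}}
//	ts := unix.Timespec{Sec: 1}
//	n, err := unix.Ppoll(fds, &ts, nil)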
timespecAddr := args[2].Pointer() maskAddr := args[3].Pointer() maskSize := uint(args[4].Uint()) timeout, err := copyTimespecInToDuration(t, timespecAddr) if err != nil { return 0, nil, err } var startNs ktime.Time if timeout > 0 { startNs = t.Kernel().MonotonicClock().Now() } if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { return 0, nil, err } _, n, err := doPoll(t, pfdAddr, nfds, timeout) copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) // doPoll returns EINTR if interrupted, but ppoll is normally restartable // if interrupted by something other than a signal handled by the // application (i.e. returns ERESTARTNOHAND). However, if // copyOutTimespecRemaining failed, then the restarted ppoll would use the // wrong timeout, so the error should be left as EINTR. // // Note that this means that if err is nil but copyErr is not, copyErr is // ignored. This is consistent with Linux. if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { err = linuxerr.ERESTARTNOHAND } return n, nil, err } // Select implements linux syscall select(2). func Select(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nfds := int(args[0].Int()) // select(2) uses an int. readFDs := args[1].Pointer() writeFDs := args[2].Pointer() exceptFDs := args[3].Pointer() timevalAddr := args[4].Pointer() // Use a negative Duration to indicate "no timeout". timeout := time.Duration(-1) if timevalAddr != 0 { var timeval linux.Timeval if _, err := timeval.CopyIn(t, timevalAddr); err != nil { return 0, nil, err } if timeval.Sec < 0 || timeval.Usec < 0 { return 0, nil, linuxerr.EINVAL } timeout = time.Duration(timeval.ToNsecCapped()) } startNs := t.Kernel().MonotonicClock().Now() n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) // See comment in Ppoll. if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { err = linuxerr.ERESTARTNOHAND } return n, nil, err } // +marshal type sigSetWithSize struct { sigsetAddr uint64 sizeofSigset uint64 } // Pselect6 implements linux syscall pselect6(2). func Pselect6(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nfds := int(args[0].Int()) // select(2) uses an int. readFDs := args[1].Pointer() writeFDs := args[2].Pointer() exceptFDs := args[3].Pointer() timespecAddr := args[4].Pointer() maskWithSizeAddr := args[5].Pointer() timeout, err := copyTimespecInToDuration(t, timespecAddr) if err != nil { return 0, nil, err } var startNs ktime.Time if timeout > 0 { startNs = t.Kernel().MonotonicClock().Now() } if maskWithSizeAddr != 0 { if t.Arch().Width() != 8 { panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width())) } var maskStruct sigSetWithSize if _, err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil { return 0, nil, err } if err := setTempSignalSet(t, hostarch.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil { return 0, nil, err } } n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) // See comment in Ppoll. 
if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { err = linuxerr.ERESTARTNOHAND } return n, nil, err } func setTempSignalSet(t *kernel.Task, maskAddr hostarch.Addr, maskSize uint) error { if maskAddr == 0 { return nil } if maskSize != linux.SignalSetSize { return linuxerr.EINVAL } var mask linux.SignalSet if _, err := mask.CopyIn(t, maskAddr); err != nil { return err } mask &^= kernel.UnblockableSignals oldmask := t.SignalMask() t.SetSignalMask(mask) t.SetSavedSignalMask(oldmask) return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_prctl.go000066400000000000000000000155041465435605700263220ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Prctl implements linux syscall prctl(2). // It has a list of subfunctions which operate on the process. The arguments are // all based on each subfunction. func Prctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { option := args[0].Int() switch option { case linux.PR_SET_PDEATHSIG: sig := linux.Signal(args[1].Int()) if sig != 0 && !sig.IsValid() { return 0, nil, linuxerr.EINVAL } t.SetParentDeathSignal(sig) return 0, nil, nil case linux.PR_GET_PDEATHSIG: _, err := primitive.CopyInt32Out(t, args[1].Pointer(), int32(t.ParentDeathSignal())) return 0, nil, err case linux.PR_GET_DUMPABLE: d := t.MemoryManager().Dumpability() switch d { case mm.NotDumpable: return linux.SUID_DUMP_DISABLE, nil, nil case mm.UserDumpable: return linux.SUID_DUMP_USER, nil, nil case mm.RootDumpable: return linux.SUID_DUMP_ROOT, nil, nil default: panic(fmt.Sprintf("Unknown dumpability %v", d)) } case linux.PR_SET_DUMPABLE: var d mm.Dumpability switch args[1].Int() { case linux.SUID_DUMP_DISABLE: d = mm.NotDumpable case linux.SUID_DUMP_USER: d = mm.UserDumpable default: // N.B. Userspace may not pass SUID_DUMP_ROOT. return 0, nil, linuxerr.EINVAL } t.MemoryManager().SetDumpability(d) return 0, nil, nil case linux.PR_GET_KEEPCAPS: if t.Credentials().KeepCaps { return 1, nil, nil } return 0, nil, nil case linux.PR_SET_KEEPCAPS: val := args[1].Int() // prctl(2): arg2 must be either 0 (permitted capabilities are cleared) // or 1 (permitted capabilities are kept). 
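// For example, prctl(PR_SET_KEEPCAPS, 1) lets a process keep its permitted
// capabilities when it switches from root to a non-zero UID; the setting is
// cleared again on execve.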
if val == 0 { t.SetKeepCaps(false) } else if val == 1 { t.SetKeepCaps(true) } else { return 0, nil, linuxerr.EINVAL } return 0, nil, nil case linux.PR_SET_NAME: addr := args[1].Pointer() name, err := t.CopyInString(addr, linux.TASK_COMM_LEN-1) if err != nil && !linuxerr.Equals(linuxerr.ENAMETOOLONG, err) { return 0, nil, err } t.SetName(name) case linux.PR_GET_NAME: addr := args[1].Pointer() buf := t.CopyScratchBuffer(linux.TASK_COMM_LEN) len := copy(buf, t.Name()) if len < linux.TASK_COMM_LEN { buf[len] = 0 len++ } _, err := t.CopyOutBytes(addr, buf[:len]) if err != nil { return 0, nil, err } case linux.PR_SET_MM: if !t.HasCapability(linux.CAP_SYS_RESOURCE) { return 0, nil, linuxerr.EPERM } switch args[1].Int() { case linux.PR_SET_MM_EXE_FILE: fd := args[2].Int() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // They trying to set exe to a non-file? stat, err := file.Stat(t, vfs.StatOptions{Mask: linux.STATX_TYPE}) if err != nil { return 0, nil, err } if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeRegular { return 0, nil, linuxerr.EBADF } // Set the underlying executable. t.MemoryManager().SetExecutable(t, file) case linux.PR_SET_MM_AUXV, linux.PR_SET_MM_START_CODE, linux.PR_SET_MM_END_CODE, linux.PR_SET_MM_START_DATA, linux.PR_SET_MM_END_DATA, linux.PR_SET_MM_START_STACK, linux.PR_SET_MM_START_BRK, linux.PR_SET_MM_BRK, linux.PR_SET_MM_ARG_START, linux.PR_SET_MM_ARG_END, linux.PR_SET_MM_ENV_START, linux.PR_SET_MM_ENV_END: t.Kernel().EmitUnimplementedEvent(t, sysno) fallthrough default: return 0, nil, linuxerr.EINVAL } case linux.PR_SET_NO_NEW_PRIVS: if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { return 0, nil, linuxerr.EINVAL } // PR_SET_NO_NEW_PRIVS is assumed to always be set. // See kernel.Task.updateCredsForExecLocked. return 0, nil, nil case linux.PR_GET_NO_NEW_PRIVS: if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 { return 0, nil, linuxerr.EINVAL } return 1, nil, nil case linux.PR_SET_PTRACER: pid := args[1].Int() switch pid { case 0: t.ClearYAMAException() return 0, nil, nil case linux.PR_SET_PTRACER_ANY: t.SetYAMAException(nil) return 0, nil, nil default: tracer := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) if tracer == nil { return 0, nil, linuxerr.EINVAL } t.SetYAMAException(tracer) return 0, nil, nil } case linux.PR_SET_SECCOMP: if args[1].Int() != linux.SECCOMP_MODE_FILTER { // Unsupported mode. return 0, nil, linuxerr.EINVAL } return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer()) case linux.PR_GET_SECCOMP: return uintptr(t.SeccompMode()), nil, nil case linux.PR_CAPBSET_READ: cp := linux.Capability(args[1].Uint64()) if !cp.Ok() { return 0, nil, linuxerr.EINVAL } var rv uintptr if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 { rv = 1 } return rv, nil, nil case linux.PR_CAPBSET_DROP: cp := linux.Capability(args[1].Uint64()) if !cp.Ok() { return 0, nil, linuxerr.EINVAL } return 0, nil, t.DropBoundingCapability(cp) case linux.PR_SET_CHILD_SUBREAPER: // "If arg2 is nonzero, set the "child subreaper" attribute of // the calling process; if arg2 is zero, unset the attribute." 
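// A subreaper fills the role of init(1) for its descendants: when a process
// in the subtree is orphaned, it is reparented to the nearest living
// subreaper ancestor rather than to init, and that subreaper then collects
// its exit status via wait(2).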
isSubreaper := args[1].Int() != 0 t.ThreadGroup().SetChildSubreaper(isSubreaper) return 0, nil, nil case linux.PR_GET_CHILD_SUBREAPER: var isSubreaper int32 if t.ThreadGroup().IsChildSubreaper() { isSubreaper = 1 } _, err := primitive.CopyInt32Out(t, args[1].Pointer(), isSubreaper) return 0, nil, err case linux.PR_GET_TIMING, linux.PR_SET_TIMING, linux.PR_GET_TSC, linux.PR_SET_TSC, linux.PR_TASK_PERF_EVENTS_DISABLE, linux.PR_TASK_PERF_EVENTS_ENABLE, linux.PR_GET_TIMERSLACK, linux.PR_SET_TIMERSLACK, linux.PR_MCE_KILL, linux.PR_MCE_KILL_GET, linux.PR_GET_TID_ADDRESS, linux.PR_GET_THP_DISABLE, linux.PR_SET_THP_DISABLE, linux.PR_MPX_ENABLE_MANAGEMENT, linux.PR_MPX_DISABLE_MANAGEMENT: t.Kernel().EmitUnimplementedEvent(t, sysno) fallthrough default: return 0, nil, linuxerr.EINVAL } return 0, nil, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_process_vm.go000066400000000000000000000140511465435605700273520ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/usermem" ) type processVMOpType int const ( processVMOpRead = iota processVMOpWrite ) // ProcessVMReadv implements process_vm_readv(2). func ProcessVMReadv(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return processVMOp(t, args, processVMOpRead) } // ProcessVMWritev implements process_vm_writev(2). func ProcessVMWritev(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return processVMOp(t, args, processVMOpWrite) } func processVMOp(t *kernel.Task, args arch.SyscallArguments, op processVMOpType) (uintptr, *kernel.SyscallControl, error) { pid := kernel.ThreadID(args[0].Int()) lvec := hostarch.Addr(args[1].Pointer()) liovcnt := int(args[2].Int64()) rvec := hostarch.Addr(args[3].Pointer()) riovcnt := int(args[4].Int64()) flags := args[5].Int() // Parse the flags. switch { case flags != 0 || liovcnt < 0 || riovcnt < 0 || liovcnt > linux.UIO_MAXIOV || riovcnt > linux.UIO_MAXIOV: return 0, nil, linuxerr.EINVAL case liovcnt == 0 || riovcnt == 0: return 0, nil, nil case lvec == 0 || rvec == 0: return 0, nil, linuxerr.EFAULT } // Local process is always the current task (t). Remote process is the // pid specified in the syscall arguments. It is allowed to be the same // as the caller process. remoteTask := t.PIDNamespace().TaskWithID(pid) if remoteTask == nil { return 0, nil, linuxerr.ESRCH } // man 2 process_vm_read: "Permission to read from or write to another // process is governed by a ptrace access mode // PTRACE_MODE_ATTACH_REALCREDS check; see ptrace(2)." 
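// In other words, the caller needs the same access to the target that a
// ptrace attach would require; when that check fails, the operation is
// rejected with EPERM below.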
if !t.CanTrace(remoteTask, true /* attach */) { return 0, nil, linuxerr.EPERM } // Figure out which processes and arguments (local or remote) are for // writing and which are for reading, based on the operation. var opArgs processVMOpArgs switch op { case processVMOpRead: // Read from remote process and write into local. opArgs = processVMOpArgs{ readCtx: remoteTask.CopyContext(t, usermem.IOOpts{}), readAddr: rvec, readIovecCount: riovcnt, writeCtx: t.CopyContext(t, usermem.IOOpts{AddressSpaceActive: true}), writeAddr: lvec, writeIovecCount: liovcnt, } case processVMOpWrite: // Read from local process and write into remote. opArgs = processVMOpArgs{ readCtx: t.CopyContext(t, usermem.IOOpts{AddressSpaceActive: true}), readAddr: lvec, readIovecCount: liovcnt, writeCtx: remoteTask.CopyContext(t, usermem.IOOpts{}), writeAddr: rvec, writeIovecCount: riovcnt, } default: panic(fmt.Sprintf("unknown process vm op type: %v", op)) } var ( n int err error ) if t == remoteTask { // No need to lock remote process's task mutex since it is the // same as this process. n, err = doProcessVMOpMaybeLocked(t, opArgs) } else { // Need to take remote process's task mutex to pin // remoteTask.MemoryManager(). remoteTask.WithMuLocked(func(*kernel.Task) { if remoteTask.MemoryManager() == nil { err = linuxerr.ESRCH return } n, err = doProcessVMOpMaybeLocked(t, opArgs) }) } if n == 0 && err != nil { return 0, nil, err } return uintptr(n), nil, nil } type processVMOpArgs struct { readCtx marshal.CopyContext readAddr hostarch.Addr readIovecCount int writeCtx marshal.CopyContext writeAddr hostarch.Addr writeIovecCount int } // maxScratchBufferSize is the maximum size of a scratch buffer. It should be // sufficiently large to minimizing the number of trips through MM. const maxScratchBufferSize = 1 << 20 func doProcessVMOpMaybeLocked(t *kernel.Task, args processVMOpArgs) (int, error) { // Copy IOVecs in to kernel. readIovecs, err := t.CopyInIovecsAsSlice(args.readAddr, args.readIovecCount) if err != nil { return 0, err } writeIovecs, err := t.CopyInIovecsAsSlice(args.writeAddr, args.writeIovecCount) if err != nil { return 0, err } // Get scratch buffer from the calling task. // Size should be max be size of largest read iovec. var bufSize int for _, readIovec := range readIovecs { if int(readIovec.Length()) > bufSize { bufSize = int(readIovec.Length()) } } if bufSize > maxScratchBufferSize { bufSize = maxScratchBufferSize } buf := t.CopyScratchBuffer(bufSize) // Number of bytes written. var n int for len(readIovecs) != 0 && len(writeIovecs) != 0 { readIovec := readIovecs[0] length := readIovec.Length() if length == 0 { readIovecs = readIovecs[1:] continue } if length > maxScratchBufferSize { length = maxScratchBufferSize } buf = buf[0:int(length)] bytes, err := args.readCtx.CopyInBytes(readIovec.Start, buf) if bytes == 0 { return n, err } readIovecs[0].Start += hostarch.Addr(bytes) start := 0 for bytes > start && len(writeIovecs) > 0 { writeLength := int(writeIovecs[0].Length()) if writeLength == 0 { writeIovecs = writeIovecs[1:] continue } if writeLength > (bytes - start) { writeLength = bytes - start } out, err := args.writeCtx.CopyOutBytes(writeIovecs[0].Start, buf[start:writeLength+start]) n += out start += out if out != writeLength { return n, err } writeIovecs[0].Start += hostarch.Addr(out) } } return n, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_random.go000066400000000000000000000040151465435605700264510ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "math" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/usermem" ) const ( _GRND_NONBLOCK = 0x1 _GRND_RANDOM = 0x2 ) // GetRandom implements the linux syscall getrandom(2). // // In a multi-tenant/shared environment, the only valid implementation is to // fetch data from the urandom pool, otherwise starvation attacks become // possible. The urandom pool is also expected to have plenty of entropy, thus // the GRND_RANDOM flag is ignored. The GRND_NONBLOCK flag does not apply, as // the pool will already be initialized. func GetRandom(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].SizeT() flags := args[2].Int() // Flags are checked for validity but otherwise ignored. See above. if flags & ^(_GRND_NONBLOCK|_GRND_RANDOM) != 0 { return 0, nil, linuxerr.EINVAL } if length > math.MaxInt32 { length = math.MaxInt32 } ar, ok := addr.ToRange(uint64(length)) if !ok { return 0, nil, linuxerr.EFAULT } n, err := t.MemoryManager().CopyOutFrom(t, hostarch.AddrRangeSeqOf(ar), safemem.FromIOReader{rand.Reader}, usermem.IOOpts{ AddressSpaceActive: true, }) if n > 0 { return uintptr(n), nil, nil } return 0, nil, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_read_write.go000066400000000000000000000402441465435605700273220ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const ( eventMaskRead = waiter.EventRdNorm | waiter.EventIn | waiter.EventHUp | waiter.EventErr | waiter.EventRdHUp eventMaskWrite = waiter.EventWrNorm | waiter.EventOut | waiter.EventHUp | waiter.EventErr ) // Read implements Linux syscall read(2). 
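// When the file was opened O_NONBLOCK, the call does not wait for data;
// otherwise the read helper below registers a waiter on the file and retries
// until the read completes, a socket receive timeout (if any) expires, or the
// task is interrupted. See read and blockPolicy below.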
func Read(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the size is legitimate. si := int(size) if si < 0 { return 0, nil, linuxerr.EINVAL } // Get the destination of the read. dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := read(t, file, dst, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "read", file) } // Readv implements Linux syscall readv(2). func Readv(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Get the destination of the read. dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := read(t, file, dst, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "readv", file) } func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { n, err := file.Read(t, dst, opts) if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { return n, err } // Register for notifications. w, ch := waiter.NewChannelEntry(eventMaskRead) if err := file.EventRegister(&w); err != nil { return n, err } total := n for { // Shorten dst to reflect bytes previously read. dst = dst.DropFirst(int(n)) // Issue the request and break out if it completes with anything other than // "would block". n, err = file.Read(t, dst, opts) total += n if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = linuxerr.ErrWouldBlock } break } } file.EventUnregister(&w) return total, err } // Pread64 implements Linux syscall pread64(2). func Pread64(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() offset := args[3].Int64() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { return 0, nil, linuxerr.EINVAL } // Check that the size is legitimate. si := int(size) if si < 0 { return 0, nil, linuxerr.EINVAL } // Get the destination of the read. dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pread64", file) } // Preadv implements Linux syscall preadv(2). 
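// Preadv is the vectored form of pread64(2): it scatters the read into the
// iovcnt buffers described by the struct iovec array at addr, reading from
// the given offset without changing the file offset.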
func Preadv(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { return 0, nil, linuxerr.EINVAL } // Get the destination of the read. dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv", file) } // Preadv2 implements Linux syscall preadv2(2). func Preadv2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // While the glibc signature is // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) // the actual syscall // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142) // splits the offset argument into a high/low value for compatibility with // 32-bit architectures. The flags argument is the 6th argument (index 5). fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() flags := args[5].Int() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { return 0, nil, linuxerr.EINVAL } // Get the destination of the read. dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } opts := vfs.ReadOptions{ Flags: uint32(flags), } var n int64 if offset == -1 { n, err = read(t, file, dst, opts) } else { n, err = pread(t, file, dst, offset, opts) } t.IOUsage().AccountReadSyscall(n) return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv2", file) } func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { n, err := file.PRead(t, dst, offset, opts) if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { return n, err } // Register for notifications. w, ch := waiter.NewChannelEntry(eventMaskRead) if err := file.EventRegister(&w); err != nil { return n, err } total := n for { // Shorten dst to reflect bytes previously read. dst = dst.DropFirst(int(n)) // Issue the request and break out if it completes with anything other than // "would block". n, err = file.PRead(t, dst, offset+total, opts) total += n if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = linuxerr.ErrWouldBlock } break } } file.EventUnregister(&w) return total, err } // Write implements Linux syscall write(2). func Write(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the size is legitimate. si := int(size) if si < 0 { return 0, nil, linuxerr.EINVAL } // Get the source of the write. 
src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := write(t, file, src, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "write", file) } // Writev implements Linux syscall writev(2). func Writev(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Get the source of the write. src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := write(t, file, src, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "writev", file) } func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { n, err := file.Write(t, src, opts) if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { return n, err } // Register for notifications. w, ch := waiter.NewChannelEntry(eventMaskWrite) if err := file.EventRegister(&w); err != nil { return n, err } total := n for { // Shorten src to reflect bytes previously written. src = src.DropFirst(int(n)) // Issue the request and break out if it completes with anything other than // "would block". n, err = file.Write(t, src, opts) total += n if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = linuxerr.ErrWouldBlock } break } } file.EventUnregister(&w) return total, err } // Pwrite64 implements Linux syscall pwrite64(2). func Pwrite64(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() offset := args[3].Int64() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { return 0, nil, linuxerr.EINVAL } // Check that the size is legitimate. si := int(size) if si < 0 { return 0, nil, linuxerr.EINVAL } // Get the source of the write. src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwrite64", file) } // Pwritev implements Linux syscall pwritev(2). func Pwritev(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { return 0, nil, linuxerr.EINVAL } // Get the source of the write. 
src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev", file) } // Pwritev2 implements Linux syscall pwritev2(2). func Pwritev2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // While the glibc signature is // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) // the actual syscall // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162) // splits the offset argument into a high/low value for compatibility with // 32-bit architectures. The flags argument is the 6th argument (index 5). fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() flags := args[5].Int() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { return 0, nil, linuxerr.EINVAL } // Get the source of the write. src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } opts := vfs.WriteOptions{ Flags: uint32(flags), } var n int64 if offset == -1 { n, err = write(t, file, src, opts) } else { n, err = pwrite(t, file, src, offset, opts) } t.IOUsage().AccountWriteSyscall(n) return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev2", file) } func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, err := file.PWrite(t, src, offset, opts) if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { return n, err } // Register for notifications. w, ch := waiter.NewChannelEntry(eventMaskWrite) if err := file.EventRegister(&w); err != nil { return n, err } total := n for { // Shorten src to reflect bytes previously written. src = src.DropFirst(int(n)) // Issue the request and break out if it completes with anything other than // "would block". n, err = file.PWrite(t, src, offset+total, opts) total += n if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = linuxerr.ErrWouldBlock } break } } file.EventUnregister(&w) return total, err } func blockPolicy(t *kernel.Task, file *vfs.FileDescription) (allowBlock bool, deadline ktime.Time, hasDeadline bool) { if file.StatusFlags()&linux.O_NONBLOCK != 0 { return false, ktime.Time{}, false } // Sockets support read/write timeouts. if s, ok := file.Impl().(socket.Socket); ok { dl := s.RecvTimeout() if dl < 0 { return false, ktime.Time{}, false } if dl > 0 { return true, t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond), true } } return true, ktime.Time{}, false } // Lseek implements Linux syscall lseek(2).
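// The offset and whence arguments are passed straight through to the file
// implementation's Seek; whence is one of SEEK_SET, SEEK_CUR, or SEEK_END
// (plus SEEK_DATA/SEEK_HOLE where the file supports them), and the new offset
// is returned on success.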
func Lseek(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() offset := args[1].Int64() whence := args[2].Int() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) newoff, err := file.Seek(t, offset, whence) return uintptr(newoff), nil, err } // Readahead implements readahead(2). func Readahead(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() offset := args[1].Int64() size := args[2].SizeT() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is readable. if !file.IsReadable() { return 0, nil, linuxerr.EBADF } // Check that the size is valid. if int(size) < 0 { return 0, nil, linuxerr.EINVAL } // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { return 0, nil, linuxerr.EINVAL } // Return EINVAL; if the underlying file type does not support readahead, // then Linux will return EINVAL to indicate as much. In the future, we // may extend this function to actually support readahead hints. return 0, nil, linuxerr.EINVAL } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_rlimit.go000066400000000000000000000147371465435605700265050ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" ) // rlimit describes an implementation of 'struct rlimit', which may vary from // system-to-system. type rlimit interface { marshal.Marshallable // toLimit converts an rlimit to a limits.Limit. toLimit() *limits.Limit // fromLimit converts a limits.Limit to an rlimit. fromLimit(lim limits.Limit) } // newRlimit returns the appropriate rlimit type for 'struct rlimit' on this system. func newRlimit(t *kernel.Task) (rlimit, error) { switch t.Arch().Width() { case 8: // On 64-bit system, struct rlimit and struct rlimit64 are identical. return &rlimit64{}, nil default: return nil, linuxerr.ENOSYS } } // +marshal type rlimit64 struct { Cur uint64 Max uint64 } func (r *rlimit64) toLimit() *limits.Limit { return &limits.Limit{ Cur: limits.FromLinux(r.Cur), Max: limits.FromLinux(r.Max), } } func (r *rlimit64) fromLimit(lim limits.Limit) { *r = rlimit64{ Cur: limits.ToLinux(lim.Cur), Max: limits.ToLinux(lim.Max), } } func (r *rlimit64) copyIn(t *kernel.Task, addr hostarch.Addr) error { _, err := r.CopyIn(t, addr) return err } func (r *rlimit64) copyOut(t *kernel.Task, addr hostarch.Addr) error { _, err := r.CopyOut(t, addr) return err } func makeRlimit64(lim limits.Limit) *rlimit64 { return &rlimit64{Cur: lim.Cur, Max: lim.Max} } // setableLimits is the set of supported setable limits. 
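// Reads of any known resource always succeed; prlimit64 below refuses to
// change a limit that is not in this set and fails with EPERM.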
var setableLimits = map[limits.LimitType]struct{}{ limits.NumberOfFiles: {}, limits.AS: {}, limits.CPU: {}, limits.Data: {}, limits.FileSize: {}, limits.MemoryLocked: {}, limits.Stack: {}, // RSS can be set, but it's not enforced because Linux doesn't enforce it // either: "This limit has effect only in Linux 2.4.x, x < 30" limits.Rss: {}, // These are not enforced, but we include them here to avoid returning // EPERM, since some apps expect them to succeed. limits.Core: {}, limits.ProcessCount: {}, } func prlimit64(t *kernel.Task, resource limits.LimitType, newLim *limits.Limit) (limits.Limit, error) { if newLim == nil { return t.ThreadGroup().Limits().Get(resource), nil } if _, ok := setableLimits[resource]; !ok { return limits.Limit{}, linuxerr.EPERM } switch resource { case limits.NumberOfFiles: if newLim.Max > uint64(t.Kernel().MaxFDLimit.Load()) { return limits.Limit{}, linuxerr.EPERM } } // "A privileged process (under Linux: one with the CAP_SYS_RESOURCE // capability in the initial user namespace) may make arbitrary changes // to either limit value." privileged := t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.Kernel().RootUserNamespace()) oldLim, err := t.ThreadGroup().Limits().Set(resource, *newLim, privileged) if err != nil { return limits.Limit{}, err } if resource == limits.CPU { t.NotifyRlimitCPUUpdated() } return oldLim, nil } // Getrlimit implements linux syscall getrlimit(2). func Getrlimit(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { resource, ok := limits.FromLinuxResource[int(args[0].Int())] if !ok { // Return err; unknown limit. return 0, nil, linuxerr.EINVAL } addr := args[1].Pointer() rlim, err := newRlimit(t) if err != nil { return 0, nil, err } lim, err := prlimit64(t, resource, nil) if err != nil { return 0, nil, err } rlim.fromLimit(lim) _, err = rlim.CopyOut(t, addr) return 0, nil, err } // Setrlimit implements linux syscall setrlimit(2). func Setrlimit(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { resource, ok := limits.FromLinuxResource[int(args[0].Int())] if !ok { // Return err; unknown limit. return 0, nil, linuxerr.EINVAL } addr := args[1].Pointer() rlim, err := newRlimit(t) if err != nil { return 0, nil, err } if _, err := rlim.CopyIn(t, addr); err != nil { return 0, nil, linuxerr.EFAULT } _, err = prlimit64(t, resource, rlim.toLimit()) return 0, nil, err } // Prlimit64 implements linux syscall prlimit64(2). func Prlimit64(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tid := kernel.ThreadID(args[0].Int()) resource, ok := limits.FromLinuxResource[int(args[1].Int())] if !ok { // Return err; unknown limit. return 0, nil, linuxerr.EINVAL } newRlimAddr := args[2].Pointer() oldRlimAddr := args[3].Pointer() var newLim *limits.Limit if newRlimAddr != 0 { var nrl rlimit64 if err := nrl.copyIn(t, newRlimAddr); err != nil { return 0, nil, linuxerr.EFAULT } newLim = nrl.toLimit() } if tid < 0 { return 0, nil, linuxerr.EINVAL } ot := t if tid > 0 { if ot = t.PIDNamespace().TaskWithID(tid); ot == nil { return 0, nil, linuxerr.ESRCH } } // "To set or get the resources of a process other than itself, the caller // must have the CAP_SYS_RESOURCE capability, or the real, effective, and // saved set user IDs of the target process must match the real user ID of // the caller and the real, effective, and saved set group IDs of the // target process must match the real group ID of the caller." 
if ot != t && !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) { cred, tcred := t.Credentials(), ot.Credentials() if cred.RealKUID != tcred.RealKUID || cred.RealKUID != tcred.EffectiveKUID || cred.RealKUID != tcred.SavedKUID || cred.RealKGID != tcred.RealKGID || cred.RealKGID != tcred.EffectiveKGID || cred.RealKGID != tcred.SavedKGID { return 0, nil, linuxerr.EPERM } } oldLim, err := prlimit64(ot, resource, newLim) if err != nil { return 0, nil, err } if oldRlimAddr != 0 { if err := makeRlimit64(oldLim).copyOut(t, oldRlimAddr); err != nil { return 0, nil, linuxerr.EFAULT } } return 0, nil, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_rseq.go000066400000000000000000000027101465435605700261430ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // RSeq implements syscall rseq(2). func RSeq(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].Uint() flags := args[2].Int() signature := args[3].Uint() if !t.RSeqAvailable() { // Event for applications that want rseq on a configuration // that doesn't support them. t.Kernel().EmitUnimplementedEvent(t, sysno) return 0, nil, linuxerr.ENOSYS } switch flags { case 0: // Register. return 0, nil, t.SetRSeq(addr, length, signature) case linux.RSEQ_FLAG_UNREGISTER: return 0, nil, t.ClearRSeq(addr, length, signature) default: // Unknown flag. return 0, nil, linuxerr.EINVAL } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_rusage.go000066400000000000000000000075731465435605700264730ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usage" ) func getrusage(t *kernel.Task, which int32) linux.Rusage { var cs usage.CPUStats switch which { case linux.RUSAGE_SELF: cs = t.ThreadGroup().CPUStats() case linux.RUSAGE_CHILDREN: cs = t.ThreadGroup().JoinedChildCPUStats() case linux.RUSAGE_THREAD: cs = t.CPUStats() case linux.RUSAGE_BOTH: tg := t.ThreadGroup() cs = tg.CPUStats() cs.Accumulate(tg.JoinedChildCPUStats()) } return linux.Rusage{ UTime: linux.NsecToTimeval(cs.UserTime.Nanoseconds()), STime: linux.NsecToTimeval(cs.SysTime.Nanoseconds()), NVCSw: int64(cs.VoluntarySwitches), MaxRSS: int64(t.MaxRSS(which) / 1024), } } // Getrusage implements linux syscall getrusage(2). // // marked "y" are supported now // marked "*" are not used on Linux // marked "p" are pending for support // // y struct timeval ru_utime; /* user CPU time used */ // y struct timeval ru_stime; /* system CPU time used */ // p long ru_maxrss; /* maximum resident set size */ // * long ru_ixrss; /* integral shared memory size */ // * long ru_idrss; /* integral unshared data size */ // * long ru_isrss; /* integral unshared stack size */ // p long ru_minflt; /* page reclaims (soft page faults) */ // p long ru_majflt; /* page faults (hard page faults) */ // * long ru_nswap; /* swaps */ // p long ru_inblock; /* block input operations */ // p long ru_oublock; /* block output operations */ // * long ru_msgsnd; /* IPC messages sent */ // * long ru_msgrcv; /* IPC messages received */ // * long ru_nsignals; /* signals received */ // y long ru_nvcsw; /* voluntary context switches */ // y long ru_nivcsw; /* involuntary context switches */ func Getrusage(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { which := args[0].Int() addr := args[1].Pointer() if which != linux.RUSAGE_SELF && which != linux.RUSAGE_CHILDREN && which != linux.RUSAGE_THREAD { return 0, nil, linuxerr.EINVAL } ru := getrusage(t, which) _, err := ru.CopyOut(t, addr) return 0, nil, err } // Times implements linux syscall times(2). func Times(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() // Calculate the ticks first, and figure out if any additional work is // necessary. Linux allows for a NULL addr, in which case only the // return value is meaningful. We don't need to do anything else. ticks := uintptr(ktime.NowFromContext(t).Nanoseconds() / linux.ClockTick.Nanoseconds()) if addr == 0 { return ticks, nil, nil } cs1 := t.ThreadGroup().CPUStats() cs2 := t.ThreadGroup().JoinedChildCPUStats() r := linux.Tms{ UTime: linux.ClockTFromDuration(cs1.UserTime), STime: linux.ClockTFromDuration(cs1.SysTime), CUTime: linux.ClockTFromDuration(cs2.UserTime), CSTime: linux.ClockTFromDuration(cs2.SysTime), } if _, err := r.CopyOut(t, addr); err != nil { return 0, nil, err } return ticks, nil, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_sched.go000066400000000000000000000060641465435605700262650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) const ( onlyScheduler = linux.SCHED_NORMAL onlyPriority = 0 ) // SchedParam replicates struct sched_param in sched.h. // // +marshal type SchedParam struct { schedPriority int32 } // SchedGetparam implements linux syscall sched_getparam(2). func SchedGetparam(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := args[0].Int() param := args[1].Pointer() if param == 0 { return 0, nil, linuxerr.EINVAL } if pid < 0 { return 0, nil, linuxerr.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { return 0, nil, linuxerr.ESRCH } r := SchedParam{schedPriority: onlyPriority} if _, err := r.CopyOut(t, param); err != nil { return 0, nil, err } return 0, nil, nil } // SchedGetscheduler implements linux syscall sched_getscheduler(2). func SchedGetscheduler(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := args[0].Int() if pid < 0 { return 0, nil, linuxerr.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { return 0, nil, linuxerr.ESRCH } return onlyScheduler, nil, nil } // SchedSetscheduler implements linux syscall sched_setscheduler(2). func SchedSetscheduler(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := args[0].Int() policy := args[1].Int() param := args[2].Pointer() if pid < 0 { return 0, nil, linuxerr.EINVAL } if policy != onlyScheduler { return 0, nil, linuxerr.EINVAL } if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil { return 0, nil, linuxerr.ESRCH } var r SchedParam if _, err := r.CopyIn(t, param); err != nil { return 0, nil, linuxerr.EINVAL } if r.schedPriority != onlyPriority { return 0, nil, linuxerr.EINVAL } return 0, nil, nil } // SchedGetPriorityMax implements linux syscall sched_get_priority_max(2). func SchedGetPriorityMax(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return onlyPriority, nil, nil } // SchedGetPriorityMin implements linux syscall sched_get_priority_min(2). func SchedGetPriorityMin(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return onlyPriority, nil, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_seccomp.go000066400000000000000000000055321465435605700266270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // userSockFprog is equivalent to Linux's struct sock_fprog on amd64. // // +marshal type userSockFprog struct { // Len is the length of the filter in BPF instructions. Len uint16 _ [6]byte // padding for alignment // Filter is a user pointer to the struct sock_filter array that makes up // the filter program. Filter is a uint64 rather than a hostarch.Addr // because hostarch.Addr is actually uintptr, which is not a fixed-size // type. Filter uint64 } // seccomp applies a seccomp policy to the current task. func seccomp(t *kernel.Task, mode, flags uint64, addr hostarch.Addr) error { // We only support SECCOMP_SET_MODE_FILTER at the moment. if mode != linux.SECCOMP_SET_MODE_FILTER { // Unsupported mode. return linuxerr.EINVAL } tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0 // The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC. if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 { // Unsupported flag. return linuxerr.EINVAL } var fprog userSockFprog if _, err := fprog.CopyIn(t, addr); err != nil { return err } if fprog.Len == 0 || fprog.Len > bpf.MaxInstructions { // If the filter is already over the maximum number of instructions, // do not go further and attempt to optimize the bytecode to make it // smaller. return linuxerr.EINVAL } filter := make([]linux.BPFInstruction, int(fprog.Len)) if _, err := linux.CopyBPFInstructionSliceIn(t, hostarch.Addr(fprog.Filter), filter); err != nil { return err } bpfFilter := make([]bpf.Instruction, len(filter)) for i, ins := range filter { bpfFilter[i] = bpf.Instruction(ins) } compiledFilter, err := bpf.Compile(bpfFilter, true /* optimize */) if err != nil { t.Debugf("Invalid seccomp-bpf filter: %v", err) return linuxerr.EINVAL } return t.AppendSyscallFilter(compiledFilter, tsync) } // Seccomp implements linux syscall seccomp(2). func Seccomp(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, seccomp(t, args[0].Uint64(), args[1].Uint64(), args[2].Pointer()) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_sem.go000066400000000000000000000236231465435605700257630ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package linux import ( "math" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ) const opsMax = 500 // SEMOPM // Semget handles: semget(key_t key, int nsems, int semflg) func Semget(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { key := ipc.Key(args[0].Int()) nsems := args[1].Int() flag := args[2].Int() private := key == linux.IPC_PRIVATE create := flag&linux.IPC_CREAT == linux.IPC_CREAT exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL mode := linux.FileMode(flag & 0777) r := t.IPCNamespace().SemaphoreRegistry() set, err := r.FindOrCreate(t, key, nsems, mode, private, create, exclusive) if err != nil { return 0, nil, err } return uintptr(set.ID()), nil, nil } // Semtimedop handles: semop(int semid, struct sembuf *sops, size_t nsops, const struct timespec *timeout) func Semtimedop(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // If the timeout argument is NULL, then semtimedop() behaves exactly like semop(). if args[3].Pointer() == 0 { return Semop(t, sysno, args) } id := ipc.ID(args[0].Int()) sembufAddr := args[1].Pointer() nsops := args[2].SizeT() timespecAddr := args[3].Pointer() if nsops <= 0 { return 0, nil, linuxerr.EINVAL } if nsops > opsMax { return 0, nil, linuxerr.E2BIG } ops := make([]linux.Sembuf, nsops) if _, err := linux.CopySembufSliceIn(t, sembufAddr, ops); err != nil { return 0, nil, err } var timeout linux.Timespec if _, err := timeout.CopyIn(t, timespecAddr); err != nil { return 0, nil, err } if timeout.Sec < 0 || timeout.Nsec < 0 || timeout.Nsec >= 1e9 { return 0, nil, linuxerr.EINVAL } if err := semTimedOp(t, id, ops, true, timeout.ToDuration()); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return 0, nil, linuxerr.EAGAIN } return 0, nil, err } return 0, nil, nil } // Semop handles: semop(int semid, struct sembuf *sops, size_t nsops) func Semop(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := ipc.ID(args[0].Int()) sembufAddr := args[1].Pointer() nsops := args[2].SizeT() if nsops <= 0 { return 0, nil, linuxerr.EINVAL } if nsops > opsMax { return 0, nil, linuxerr.E2BIG } ops := make([]linux.Sembuf, nsops) if _, err := linux.CopySembufSliceIn(t, sembufAddr, ops); err != nil { return 0, nil, err } return 0, nil, semTimedOp(t, id, ops, false, time.Second) } func semTimedOp(t *kernel.Task, id ipc.ID, ops []linux.Sembuf, haveTimeout bool, timeout time.Duration) error { set := t.IPCNamespace().SemaphoreRegistry().FindByID(id) if set == nil { return linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) for { ch, num, err := set.ExecuteOps(t, ops, creds, int32(pid)) if ch == nil || err != nil { return err } if _, err = t.BlockWithTimeout(ch, haveTimeout, timeout); err != nil { set.AbortWait(num, ch) return err } } } // Semctl handles: semctl(int semid, int semnum, int cmd, ...) 
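//
// As a rough, illustrative guest-side sketch (assuming golang.org/x/sys/unix
// for the raw syscall plumbing; the numeric command values are the standard
// <sys/sem.h> constants), the SETVAL/GETVAL commands handled below could be
// exercised with:
//
//	// Create a private set with one semaphore, set it to 5, read it back.
//	id, _, _ := unix.Syscall(unix.SYS_SEMGET, 0 /* IPC_PRIVATE */, 1, 0o1000|0o600 /* IPC_CREAT|0600 */)
//	unix.Syscall6(unix.SYS_SEMCTL, id, 0, 16 /* SETVAL */, 5, 0, 0)
//	val, _, _ := unix.Syscall6(unix.SYS_SEMCTL, id, 0, 12 /* GETVAL */, 0, 0, 0)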
func Semctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := ipc.ID(args[0].Int()) num := args[1].Int() cmd := args[2].Int() switch cmd { case linux.SETVAL: val := args[3].Int() if val > math.MaxInt16 { return 0, nil, linuxerr.ERANGE } return 0, nil, setVal(t, id, num, int16(val)) case linux.SETALL: array := args[3].Pointer() return 0, nil, setValAll(t, id, array) case linux.GETVAL: v, err := getVal(t, id, num) return uintptr(v), nil, err case linux.GETALL: array := args[3].Pointer() return 0, nil, getValAll(t, id, array) case linux.IPC_RMID: return 0, nil, remove(t, id) case linux.IPC_SET: arg := args[3].Pointer() var s linux.SemidDS if _, err := s.CopyIn(t, arg); err != nil { return 0, nil, err } return 0, nil, ipcSet(t, id, &s) case linux.GETPID: v, err := getPID(t, id, num) return uintptr(v), nil, err case linux.IPC_STAT: arg := args[3].Pointer() ds, err := ipcStat(t, id) if err == nil { _, err = ds.CopyOut(t, arg) } return 0, nil, err case linux.GETZCNT: v, err := getZCnt(t, id, num) return uintptr(v), nil, err case linux.GETNCNT: v, err := getNCnt(t, id, num) return uintptr(v), nil, err case linux.IPC_INFO: buf := args[3].Pointer() r := t.IPCNamespace().SemaphoreRegistry() info := r.IPCInfo() if _, err := info.CopyOut(t, buf); err != nil { return 0, nil, err } return uintptr(r.HighestIndex()), nil, nil case linux.SEM_INFO: buf := args[3].Pointer() r := t.IPCNamespace().SemaphoreRegistry() info := r.SemInfo() if _, err := info.CopyOut(t, buf); err != nil { return 0, nil, err } return uintptr(r.HighestIndex()), nil, nil case linux.SEM_STAT: arg := args[3].Pointer() // id is an index in SEM_STAT. semid, ds, err := semStat(t, int32(id)) if err != nil { return 0, nil, err } if _, err := ds.CopyOut(t, arg); err != nil { return 0, nil, err } return uintptr(semid), nil, err case linux.SEM_STAT_ANY: arg := args[3].Pointer() // id is an index in SEM_STAT. 
semid, ds, err := semStatAny(t, int32(id)) if err != nil { return 0, nil, err } if _, err := ds.CopyOut(t, arg); err != nil { return 0, nil, err } return uintptr(semid), nil, err default: return 0, nil, linuxerr.EINVAL } } func remove(t *kernel.Task, id ipc.ID) error { r := t.IPCNamespace().SemaphoreRegistry() creds := auth.CredentialsFromContext(t) return r.Remove(id, creds) } func ipcSet(t *kernel.Task, id ipc.ID, ds *linux.SemidDS) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { return linuxerr.EINVAL } return set.Set(t, ds) } func ipcStat(t *kernel.Task, id ipc.ID) (*linux.SemidDS, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { return nil, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) return set.GetStat(creds) } func semStat(t *kernel.Task, index int32) (int32, *linux.SemidDS, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByIndex(index) if set == nil { return 0, nil, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) ds, err := set.GetStat(creds) if err != nil { return 0, ds, err } return int32(set.ID()), ds, nil } func semStatAny(t *kernel.Task, index int32) (int32, *linux.SemidDS, error) { set := t.IPCNamespace().SemaphoreRegistry().FindByIndex(index) if set == nil { return 0, nil, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) ds, err := set.GetStatAny(creds) if err != nil { return 0, ds, err } return int32(set.ID()), ds, nil } func setVal(t *kernel.Task, id ipc.ID, num int32, val int16) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { return linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) return set.SetVal(t, num, val, creds, int32(pid)) } func setValAll(t *kernel.Task, id ipc.ID, array hostarch.Addr) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { return linuxerr.EINVAL } vals := make([]uint16, set.Size()) if _, err := primitive.CopyUint16SliceIn(t, array, vals); err != nil { return err } creds := auth.CredentialsFromContext(t) pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup()) return set.SetValAll(t, vals, creds, int32(pid)) } func getVal(t *kernel.Task, id ipc.ID, num int32) (int16, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { return 0, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) return set.GetVal(num, creds) } func getValAll(t *kernel.Task, id ipc.ID, array hostarch.Addr) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { return linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) vals, err := set.GetValAll(creds) if err != nil { return err } _, err = primitive.CopyUint16SliceOut(t, array, vals) return err } func getPID(t *kernel.Task, id ipc.ID, num int32) (int32, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { return 0, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) gpid, err := set.GetPID(num, creds) if err != nil { return 0, err } // Convert pid from init namespace to the caller's namespace. 
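	// The sempid recorded by the semaphore set (see setVal/semTimedOp above,
	// which pass the operator's thread group ID in the root PID namespace) is
	// translated into the caller's PID namespace here; if that thread group
	// is not visible from the caller's namespace, 0 is returned, matching
	// Linux.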
tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(gpid)) if tg == nil { return 0, nil } return int32(tg.ID()), nil } func getZCnt(t *kernel.Task, id ipc.ID, num int32) (uint16, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { return 0, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) return set.CountZeroWaiters(num, creds) } func getNCnt(t *kernel.Task, id ipc.ID, num int32) (uint16, error) { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) if set == nil { return 0, linuxerr.EINVAL } creds := auth.CredentialsFromContext(t) return set.CountNegativeWaiters(num, creds) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_shm.go000066400000000000000000000110411465435605700257550ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" ) // Shmget implements shmget(2). func Shmget(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { key := ipc.Key(args[0].Int()) size := uint64(args[1].SizeT()) flag := args[2].Int() private := key == linux.IPC_PRIVATE create := flag&linux.IPC_CREAT == linux.IPC_CREAT exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL mode := linux.FileMode(flag & 0777) pid := int32(t.ThreadGroup().ID()) r := t.IPCNamespace().ShmRegistry() segment, err := r.FindOrCreate(t, pid, key, size, mode, private, create, exclusive) if err != nil { return 0, nil, err } defer segment.DecRef(t) return uintptr(segment.ID()), nil, nil } // findSegment retrieves a shm segment by the given id. // // findSegment returns a reference on Shm. func findSegment(t *kernel.Task, id ipc.ID) (*shm.Shm, error) { r := t.IPCNamespace().ShmRegistry() segment := r.FindByID(id) if segment == nil { // No segment with provided id. return nil, linuxerr.EINVAL } return segment, nil } // Shmat implements shmat(2). func Shmat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := ipc.ID(args[0].Int()) addr := args[1].Pointer() flag := args[2].Int() segment, err := findSegment(t, id) if err != nil { return 0, nil, linuxerr.EINVAL } defer segment.DecRef(t) opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{ Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC, Readonly: flag&linux.SHM_RDONLY == linux.SHM_RDONLY, Remap: flag&linux.SHM_REMAP == linux.SHM_REMAP, }) if err != nil { return 0, nil, err } addr, err = t.MemoryManager().MMap(t, opts) return uintptr(addr), nil, err } // Shmdt implements shmdt(2). 
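//
// A minimal guest-side sequence that ends up here (an illustrative sketch,
// assuming the SysV shm helpers in golang.org/x/sys/unix) is:
//
//	id, _ := unix.SysvShmGet(0 /* IPC_PRIVATE */, 4096, unix.IPC_CREAT|0o600)
//	buf, _ := unix.SysvShmAttach(id, 0, 0)
//	// ... use buf ...
//	_ = unix.SysvShmDetach(buf) // shmdt(2), handled by Shmdt below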
func Shmdt(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() err := t.MemoryManager().DetachShm(t, addr) return 0, nil, err } // Shmctl implements shmctl(2). func Shmctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := ipc.ID(args[0].Int()) cmd := args[1].Int() buf := args[2].Pointer() r := t.IPCNamespace().ShmRegistry() switch cmd { case linux.SHM_STAT: // Technically, we should be treating id as "an index into the kernel's // internal array that maintains information about all shared memory // segments on the system". Since we don't track segments in an array, // we'll just pretend the shmid is the index and do the same thing as // IPC_STAT. Linux also uses the index as the shmid. fallthrough case linux.IPC_STAT: segment, err := findSegment(t, id) if err != nil { return 0, nil, linuxerr.EINVAL } defer segment.DecRef(t) stat, err := segment.IPCStat(t) if err == nil { _, err = stat.CopyOut(t, buf) } return 0, nil, err case linux.IPC_INFO: params := r.IPCInfo() _, err := params.CopyOut(t, buf) return 0, nil, err case linux.SHM_INFO: info := r.ShmInfo() _, err := info.CopyOut(t, buf) return 0, nil, err } // Remaining commands refer to a specific segment. segment, err := findSegment(t, id) if err != nil { return 0, nil, linuxerr.EINVAL } defer segment.DecRef(t) switch cmd { case linux.IPC_SET: var ds linux.ShmidDS if _, err = ds.CopyIn(t, buf); err != nil { return 0, nil, err } err := segment.Set(t, &ds) return 0, nil, err case linux.IPC_RMID: segment.MarkDestroyed(t) return 0, nil, nil case linux.SHM_LOCK, linux.SHM_UNLOCK: // We currently do not support memory locking anywhere. // mlock(2)/munlock(2) are currently stubbed out as no-ops so do the // same here. t.Kernel().EmitUnimplementedEvent(t, sysno) return 0, nil, nil default: return 0, nil, linuxerr.EINVAL } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_signal.go000066400000000000000000000430121465435605700264460ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "math" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/signalfd" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // "For a process to have permission to send a signal it must // - either be privileged (CAP_KILL), or // - the real or effective user ID of the sending process must be equal to the // // real or saved set-user-ID of the target process. // // In the case of SIGCONT it suffices when the sending and receiving processes // belong to the same session." - kill(2) // // Equivalent to kernel/signal.c:check_kill_permission. 
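//
// Condensed, the check implemented below is:
//
//	sameThreadGroup ||
//		capKillInTargetUserns ||
//		euid == target.suid || euid == target.ruid ||
//		ruid == target.suid || ruid == target.ruid ||
//		(sig == SIGCONT && sameSession)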
func mayKill(t *kernel.Task, target *kernel.Task, sig linux.Signal) bool { // kernel/signal.c:check_kill_permission also allows a signal if the // sending and receiving tasks share a thread group, which is not // mentioned in kill(2) since kill does not allow task-level // granularity in signal sending. if t.ThreadGroup() == target.ThreadGroup() { return true } if t.HasCapabilityIn(linux.CAP_KILL, target.UserNamespace()) { return true } creds := t.Credentials() tcreds := target.Credentials() if creds.EffectiveKUID == tcreds.SavedKUID || creds.EffectiveKUID == tcreds.RealKUID || creds.RealKUID == tcreds.SavedKUID || creds.RealKUID == tcreds.RealKUID { return true } if sig == linux.SIGCONT && target.ThreadGroup().Session() == t.ThreadGroup().Session() { return true } return false } // Kill implements linux syscall kill(2). func Kill(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := kernel.ThreadID(args[0].Int()) sig := linux.Signal(args[1].Int()) switch { case pid > 0: // "If pid is positive, then signal sig is sent to the process with the // ID specified by pid." - kill(2) // This loops to handle races with execve where target dies between // TaskWithID and SendGroupSignal. Compare Linux's // kernel/signal.c:kill_pid_info(). for { target := t.PIDNamespace().TaskWithID(pid) if target == nil { return 0, nil, linuxerr.ESRCH } if !mayKill(t, target, sig) { return 0, nil, linuxerr.EPERM } info := &linux.SignalInfo{ Signo: int32(sig), Code: linux.SI_USER, } info.SetPID(int32(target.PIDNamespace().IDOfTask(t))) info.SetUID(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow())) if err := target.SendGroupSignal(info); !linuxerr.Equals(linuxerr.ESRCH, err) { return 0, nil, err } } case pid == -1: // "If pid equals -1, then sig is sent to every process for which the // calling process has permission to send signals, except for process 1 // (init), but see below. ... POSIX.1-2001 requires that kill(-1,sig) // send sig to all processes that the calling process may send signals // to, except possibly for some implementation-defined system // processes. Linux allows a process to signal itself, but on Linux the // call kill(-1,sig) does not signal the calling process." var ( lastErr error delivered int ) for _, tg := range t.PIDNamespace().ThreadGroups() { if tg == t.ThreadGroup() { continue } // Don't send the signal to the init process in t's PID namespace. if tg.IsInitIn(t.PIDNamespace()) { continue } // If pid == -1, the returned error is the last non-EPERM error // from any call to group_send_sig_info. if !mayKill(t, tg.Leader(), sig) { continue } // Here and below, whether or not kill returns an error may // depend on the iteration order. We at least implement the // semantics documented by the man page: "On success (at least // one signal was sent), zero is returned." info := &linux.SignalInfo{ Signo: int32(sig), Code: linux.SI_USER, } info.SetPID(int32(tg.PIDNamespace().IDOfTask(t))) info.SetUID(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) err := tg.SendSignal(info) if linuxerr.Equals(linuxerr.ESRCH, err) { // ESRCH is ignored because it means the task // exited while we were iterating. This is a // race which would not normally exist on // Linux, so we suppress it. 
continue } delivered++ if err != nil { lastErr = err } } if delivered > 0 { return 0, nil, lastErr } return 0, nil, linuxerr.ESRCH default: // "If pid equals 0, then sig is sent to every process in the process // group of the calling process." // // "If pid is less than -1, then sig is sent to every process // in the process group whose ID is -pid." pgid := kernel.ProcessGroupID(-pid) if pgid == 0 { pgid = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup()) } // If pid != -1 (i.e. signalling a process group), the returned error // is the last error from any call to group_send_sig_info. lastErr := error(linuxerr.ESRCH) for _, tg := range t.PIDNamespace().ThreadGroups() { if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid { if !mayKill(t, tg.Leader(), sig) { lastErr = linuxerr.EPERM continue } info := &linux.SignalInfo{ Signo: int32(sig), Code: linux.SI_USER, } info.SetPID(int32(tg.PIDNamespace().IDOfTask(t))) info.SetUID(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) // See note above regarding ESRCH race above. if err := tg.SendSignal(info); !linuxerr.Equals(linuxerr.ESRCH, err) { lastErr = err } } } return 0, nil, lastErr } } func tkillSigInfo(sender, receiver *kernel.Task, sig linux.Signal) *linux.SignalInfo { info := &linux.SignalInfo{ Signo: int32(sig), Code: linux.SI_TKILL, } info.SetPID(int32(receiver.PIDNamespace().IDOfThreadGroup(sender.ThreadGroup()))) info.SetUID(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) return info } // Tkill implements linux syscall tkill(2). func Tkill(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tid := kernel.ThreadID(args[0].Int()) sig := linux.Signal(args[1].Int()) // N.B. Inconsistent with man page, linux actually rejects calls with // tid <=0 by EINVAL. This isn't the same for all signal calls. if tid <= 0 { return 0, nil, linuxerr.EINVAL } target := t.PIDNamespace().TaskWithID(tid) if target == nil { return 0, nil, linuxerr.ESRCH } if !mayKill(t, target, sig) { return 0, nil, linuxerr.EPERM } return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) } // Tgkill implements linux syscall tgkill(2). func Tgkill(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tgid := kernel.ThreadID(args[0].Int()) tid := kernel.ThreadID(args[1].Int()) sig := linux.Signal(args[2].Int()) // N.B. Inconsistent with man page, linux actually rejects calls with // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls. if tgid <= 0 || tid <= 0 { return 0, nil, linuxerr.EINVAL } targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) target := t.PIDNamespace().TaskWithID(tid) if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { return 0, nil, linuxerr.ESRCH } if !mayKill(t, target, sig) { return 0, nil, linuxerr.EPERM } return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) } // RtSigaction implements linux syscall rt_sigaction(2). 
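//
// Note that most guest programs reach this handler through libc or their
// language runtime rather than by issuing rt_sigaction directly; e.g. a Go
// guest installing a handler (an illustrative sketch):
//
//	c := make(chan os.Signal, 1)
//	signal.Notify(c, syscall.SIGUSR1) // the runtime issues rt_sigaction(2)
//	<-c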
func RtSigaction(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { sig := linux.Signal(args[0].Int()) newactarg := args[1].Pointer() oldactarg := args[2].Pointer() sigsetsize := args[3].SizeT() if sigsetsize != linux.SignalSetSize { return 0, nil, linuxerr.EINVAL } var newactptr *linux.SigAction if newactarg != 0 { var newact linux.SigAction if _, err := newact.CopyIn(t, newactarg); err != nil { return 0, nil, err } newactptr = &newact } oldact, err := t.ThreadGroup().SetSigAction(sig, newactptr) if err != nil { return 0, nil, err } if oldactarg != 0 { if _, err := oldact.CopyOut(t, oldactarg); err != nil { return 0, nil, err } } return 0, nil, nil } // Sigreturn implements linux syscall sigreturn(2). func Sigreturn(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { ctrl, err := t.SignalReturn(false) return 0, ctrl, err } // RtSigreturn implements linux syscall rt_sigreturn(2). func RtSigreturn(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { ctrl, err := t.SignalReturn(true) return 0, ctrl, err } // RtSigprocmask implements linux syscall rt_sigprocmask(2). func RtSigprocmask(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { how := args[0].Int() setaddr := args[1].Pointer() oldaddr := args[2].Pointer() sigsetsize := args[3].SizeT() if sigsetsize != linux.SignalSetSize { return 0, nil, linuxerr.EINVAL } oldmask := t.SignalMask() if setaddr != 0 { mask, err := copyInSigSet(t, setaddr, sigsetsize) if err != nil { return 0, nil, err } switch how { case linux.SIG_BLOCK: t.SetSignalMask(oldmask | mask) case linux.SIG_UNBLOCK: t.SetSignalMask(oldmask &^ mask) case linux.SIG_SETMASK: t.SetSignalMask(mask) default: return 0, nil, linuxerr.EINVAL } } if oldaddr != 0 { return 0, nil, copyOutSigSet(t, oldaddr, oldmask) } return 0, nil, nil } // Sigaltstack implements linux syscall sigaltstack(2). func Sigaltstack(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { setaddr := args[0].Pointer() oldaddr := args[1].Pointer() ctrl, err := t.SigaltStack(setaddr, oldaddr) return 0, ctrl, err } // Pause implements linux syscall pause(2). func Pause(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, linuxerr.ConvertIntr(t.Block(nil), linuxerr.ERESTARTNOHAND) } // RtSigpending implements linux syscall rt_sigpending(2). func RtSigpending(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() pending := t.PendingSignals() _, err := pending.CopyOut(t, addr) return 0, nil, err } // RtSigtimedwait implements linux syscall rt_sigtimedwait(2). 
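//
// The flow below is: copy in the signal mask, treat a missing timespec as an
// effectively unbounded wait (math.MaxInt64 nanoseconds), block in
// Task.Sigtimedwait, and on success optionally copy the siginfo_t back out
// before returning the signal number.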
func RtSigtimedwait(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { sigset := args[0].Pointer() siginfo := args[1].Pointer() timespec := args[2].Pointer() sigsetsize := args[3].SizeT() mask, err := copyInSigSet(t, sigset, sigsetsize) if err != nil { return 0, nil, err } var timeout time.Duration if timespec != 0 { d, err := copyTimespecIn(t, timespec) if err != nil { return 0, nil, err } if !d.Valid() { return 0, nil, linuxerr.EINVAL } timeout = time.Duration(d.ToNsecCapped()) } else { timeout = time.Duration(math.MaxInt64) } si, err := t.Sigtimedwait(mask, timeout) if err != nil { return 0, nil, err } if siginfo != 0 { si.FixSignalCodeForUser() if _, err := si.CopyOut(t, siginfo); err != nil { return 0, nil, err } } return uintptr(si.Signo), nil, nil } // RtSigqueueinfo implements linux syscall rt_sigqueueinfo(2). func RtSigqueueinfo(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := kernel.ThreadID(args[0].Int()) sig := linux.Signal(args[1].Int()) infoAddr := args[2].Pointer() // Copy in the info. // // We must ensure that the Signo is set (Linux overrides this in the // same way), and that the code is in the allowed set. This same logic // appears below in RtSigtgqueueinfo and should be kept in sync. var info linux.SignalInfo if _, err := info.CopyIn(t, infoAddr); err != nil { return 0, nil, err } info.Signo = int32(sig) // This must loop to handle the race with execve described in Kill. for { // Deliver to the given task's thread group. target := t.PIDNamespace().TaskWithID(pid) if target == nil { return 0, nil, linuxerr.ESRCH } // If the sender is not the receiver, it can't use si_codes used by the // kernel or SI_TKILL. if (info.Code >= 0 || info.Code == linux.SI_TKILL) && target != t { return 0, nil, linuxerr.EPERM } if !mayKill(t, target, sig) { return 0, nil, linuxerr.EPERM } if err := target.SendGroupSignal(&info); !linuxerr.Equals(linuxerr.ESRCH, err) { return 0, nil, err } } } // RtTgsigqueueinfo implements linux syscall rt_tgsigqueueinfo(2). func RtTgsigqueueinfo(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tgid := kernel.ThreadID(args[0].Int()) tid := kernel.ThreadID(args[1].Int()) sig := linux.Signal(args[2].Int()) infoAddr := args[3].Pointer() // N.B. Inconsistent with man page, linux actually rejects calls with // tgid/tid <=0 by EINVAL. This isn't the same for all signal calls. if tgid <= 0 || tid <= 0 { return 0, nil, linuxerr.EINVAL } // Copy in the info. See RtSigqueueinfo above. var info linux.SignalInfo if _, err := info.CopyIn(t, infoAddr); err != nil { return 0, nil, err } info.Signo = int32(sig) // Deliver to the given task. targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) target := t.PIDNamespace().TaskWithID(tid) if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { return 0, nil, linuxerr.ESRCH } // If the sender is not the receiver, it can't use si_codes used by the // kernel or SI_TKILL. if (info.Code >= 0 || info.Code == linux.SI_TKILL) && target != t { return 0, nil, linuxerr.EPERM } if !mayKill(t, target, sig) { return 0, nil, linuxerr.EPERM } return 0, nil, target.SendSignal(&info) } // RtSigsuspend implements linux syscall rt_sigsuspend(2). func RtSigsuspend(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { sigset := args[0].Pointer() // Copy in the signal mask. 
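	// rt_sigsuspend(2) installs this mask atomically for the duration of the
	// wait and restores the caller's original mask once the interrupted call
	// returns to userspace; SetSavedSignalMask below is what arranges that
	// restore.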
var mask linux.SignalSet if _, err := mask.CopyIn(t, sigset); err != nil { return 0, nil, err } mask &^= kernel.UnblockableSignals // Swap the mask. oldmask := t.SignalMask() t.SetSignalMask(mask) t.SetSavedSignalMask(oldmask) // Perform the wait. return 0, nil, linuxerr.ConvertIntr(t.Block(nil), linuxerr.ERESTARTNOHAND) } // RestartSyscall implements the linux syscall restart_syscall(2). func RestartSyscall(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if r := t.SyscallRestartBlock(); r != nil { n, err := r.Restart(t) return n, nil, err } // The restart block should never be nil here, but it's possible // ERESTART_RESTARTBLOCK was set by ptrace without the current syscall // setting up a restart block. If ptrace didn't manipulate the return value, // finding a nil restart block is a bug. Linux ensures that the restart // function is never null by (re)initializing it with one that translates // the restart into EINTR. We'll emulate that behaviour. t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?") return 0, nil, linuxerr.EINTR } // sharedSignalfd is shared between the two calls. func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) { // Copy in the signal mask. mask, err := copyInSigSet(t, sigset, sigsetsize) if err != nil { return 0, nil, err } // Always check for valid flags, even if not creating. if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 { return 0, nil, linuxerr.EINVAL } // Is this a change to an existing signalfd? // // The spec indicates that this should adjust the mask. if fd != -1 { file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Is this a signalfd? if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok { sfd.SetMask(mask) return 0, nil, nil } // Not a signalfd. return 0, nil, linuxerr.EINVAL } fileFlags := uint32(linux.O_RDWR) if flags&linux.SFD_NONBLOCK != 0 { fileFlags |= linux.O_NONBLOCK } // Create a new file. vfsObj := t.Kernel().VFS() file, err := signalfd.New(vfsObj, t, mask, fileFlags) if err != nil { return 0, nil, err } defer file.DecRef(t) // Create a new descriptor. fd, err = t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.SFD_CLOEXEC != 0, }) if err != nil { return 0, nil, err } // Done. return uintptr(fd), nil, nil } // Signalfd implements the linux syscall signalfd(2). func Signalfd(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() sigset := args[1].Pointer() sigsetsize := args[2].SizeT() return sharedSignalfd(t, fd, sigset, sigsetsize, 0) } // Signalfd4 implements the linux syscall signalfd4(2). func Signalfd4(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() sigset := args[1].Pointer() sigsetsize := args[2].SizeT() flags := args[3].Int() return sharedSignalfd(t, fd, sigset, sigsetsize, flags) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_socket.go000066400000000000000000001000311465435605700264540ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/usermem" ) // maxAddrLen is the maximum socket address length we're willing to accept. const maxAddrLen = 200 // maxOptLen is the maximum sockopt parameter length we're willing to accept. const maxOptLen = 1024 * 8 // maxControlLen is the maximum length of the msghdr.msg_control buffer we're // willing to accept. Note that this limit is smaller than Linux, which allows // buffers upto INT_MAX. const maxControlLen = 10 * 1024 * 1024 // maxListenBacklog is the maximum limit of listen backlog supported. const maxListenBacklog = 1024 // nameLenOffset is the offset from the start of the MessageHeader64 struct to // the NameLen field. const nameLenOffset = 8 // controlLenOffset is the offset form the start of the MessageHeader64 struct // to the ControlLen field. const controlLenOffset = 40 // flagsOffset is the offset form the start of the MessageHeader64 struct // to the Flags field. const flagsOffset = 48 const sizeOfInt32 = 4 // messageHeader64Len is the length of a MessageHeader64 struct. var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes()) // multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct. var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes()) // baseRecvFlags are the flags that are accepted across recvmsg(2), // recvmmsg(2), and recvfrom(2). const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC // MessageHeader64 is the 64-bit representation of the msghdr struct used in // the recvmsg and sendmsg syscalls. // // +marshal type MessageHeader64 struct { // Name is the optional pointer to a network address buffer. Name uint64 // NameLen is the length of the buffer pointed to by Name. NameLen uint32 _ uint32 // Iov is a pointer to an array of io vectors that describe the memory // locations involved in the io operation. Iov uint64 // IovLen is the length of the array pointed to by Iov. IovLen uint64 // Control is the optional pointer to ancillary control data. Control uint64 // ControlLen is the length of the data pointed to by Control. ControlLen uint64 // Flags on the sent/received message. Flags int32 _ int32 } // multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in // the recvmmsg and sendmmsg syscalls. 
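//
// For reference, the corresponding C layout on 64-bit platforms is roughly:
//
//	struct mmsghdr {
//		struct msghdr msg_hdr;
//		unsigned int  msg_len; // bytes transferred for this message
//	};                             // padded out to an 8-byte multiple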
// // +marshal type multipleMessageHeader64 struct { msgHdr MessageHeader64 msgLen uint32 _ int32 } // CaptureAddress allocates memory for and copies a socket address structure // from the untrusted address space range. func CaptureAddress(t *kernel.Task, addr hostarch.Addr, addrlen uint32) ([]byte, error) { if addrlen > maxAddrLen { return nil, linuxerr.EINVAL } addrBuf := make([]byte, addrlen) if _, err := t.CopyInBytes(addr, addrBuf); err != nil { return nil, err } return addrBuf, nil } // writeAddress writes a sockaddr structure and its length to an output buffer // in the unstrusted address space range. If the address is bigger than the // buffer, it is truncated. func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr hostarch.Addr, addrLenPtr hostarch.Addr) error { // Get the buffer length. var bufLen uint32 if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil { return err } if int32(bufLen) < 0 { return linuxerr.EINVAL } // Write the length unconditionally. if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil { return err } if addr == nil { return nil } if bufLen > addrLen { bufLen = addrLen } // Copy as much of the address as will fit in the buffer. encodedAddr := t.CopyScratchBuffer(addr.SizeBytes()) addr.MarshalUnsafe(encodedAddr) if bufLen > uint32(len(encodedAddr)) { bufLen = uint32(len(encodedAddr)) } _, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)]) return err } // Socket implements the linux syscall socket(2). func Socket(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { domain := int(args[0].Int()) stype := args[1].Int() protocol := int(args[2].Int()) // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { return 0, nil, linuxerr.EINVAL } // Create the new socket. s, e := socket.New(t, domain, linux.SockType(stype&0xf), protocol) if e != nil { return 0, nil, e.ToError() } defer s.DecRef(t) if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil { return 0, nil, err } fd, err := t.NewFDFrom(0, s, kernel.FDFlags{ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, }) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil } // SocketPair implements the linux syscall socketpair(2). func SocketPair(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { domain := int(args[0].Int()) stype := args[1].Int() protocol := int(args[2].Int()) addr := args[3].Pointer() // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { return 0, nil, linuxerr.EINVAL } // Create the socket pair. s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol) if e != nil { return 0, nil, e.ToError() } // Adding to the FD table will cause an extra reference to be acquired. defer s1.DecRef(t) defer s2.DecRef(t) nonblocking := uint32(stype & linux.SOCK_NONBLOCK) if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { return 0, nil, err } if err := s2.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { return 0, nil, err } // Create the FDs for the sockets. 
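	// If copying the new descriptor numbers out to userspace fails, both FDs
	// are removed from the table again below so that nothing is leaked into
	// the task.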
flags := kernel.FDFlags{ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, } fds, err := t.NewFDs(0, []*vfs.FileDescription{s1, s2}, flags) if err != nil { return 0, nil, err } if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil { for _, fd := range fds { if file := t.FDTable().Remove(t, fd); file != nil { file.DecRef(t) } } return 0, nil, err } return 0, nil, nil } // Connect implements the linux syscall connect(2). func Connect(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Uint() // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, nil, linuxerr.ENOTSOCK } // Capture address and call syscall implementation. a, err := CaptureAddress(t, addr, addrlen) if err != nil { return 0, nil, err } blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 return 0, nil, linuxerr.ConvertIntr(s.Connect(t, a, blocking).ToError(), linuxerr.ERESTARTSYS) } // accept is the implementation of the accept syscall. It is called by accept // and accept4 syscall handlers. func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, flags int) (uintptr, error) { // Check that no unsupported flags are passed in. if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, linuxerr.ENOTSOCK } // Call the syscall implementation for this socket, then copy the // output address if one is specified. blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 peerRequested := addrLen != 0 nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) if e != nil { return 0, linuxerr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } if peerRequested { // NOTE(magi): Linux does not give you an error if it can't // write the data back out so neither do we. if err := writeAddress(t, peer, peerLen, addr, addrLen); linuxerr.Equals(linuxerr.EINVAL, err) { return 0, err } } return uintptr(nfd), nil } // Accept4 implements the linux syscall accept4(2). func Accept4(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() flags := int(args[3].Int()) n, err := accept(t, fd, addr, addrlen, flags) return n, nil, err } // Accept implements the linux syscall accept(2). func Accept(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() n, err := accept(t, fd, addr, addrlen, 0) return n, nil, err } // Bind implements the linux syscall bind(2). func Bind(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Uint() // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, nil, linuxerr.ENOTSOCK } // Capture address and call syscall implementation. 
a, err := CaptureAddress(t, addr, addrlen) if err != nil { return 0, nil, err } return 0, nil, s.Bind(t, a).ToError() } // Listen implements the linux syscall listen(2). func Listen(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() backlog := args[1].Uint() // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, nil, linuxerr.ENOTSOCK } if backlog > maxListenBacklog { // Linux treats incoming backlog as uint with a limit defined by // sysctl_somaxconn. // https://github.com/torvalds/linux/blob/7acac4b3196/net/socket.c#L1666 backlog = maxListenBacklog } // Accept one more than the configured listen backlog to keep in parity with // Linux. Ref, because of missing equality check here: // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/sock.h#L937 // // In case of unix domain sockets, the following check // https://github.com/torvalds/linux/blob/7d6beb71da3/net/unix/af_unix.c#L1293 // will allow 1 connect through since it checks for a receive queue len > // backlog and not >=. backlog++ return 0, nil, s.Listen(t, int(backlog)).ToError() } // Shutdown implements the linux syscall shutdown(2). func Shutdown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() how := args[1].Int() // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, nil, linuxerr.ENOTSOCK } // Validate how, then call syscall implementation. switch how { case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: default: return 0, nil, linuxerr.EINVAL } return 0, nil, s.Shutdown(t, int(how)).ToError() } // GetSockOpt implements the linux syscall getsockopt(2). func GetSockOpt(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() level := args[1].Int() name := args[2].Int() optValAddr := args[3].Pointer() optLenAddr := args[4].Pointer() // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, nil, linuxerr.ENOTSOCK } // Read the length. Reject negative values. var optLen int32 if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil { return 0, nil, err } if optLen < 0 { return 0, nil, linuxerr.EINVAL } // Call syscall implementation then copy both value and value len out. v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen)) if e != nil { return 0, nil, e.ToError() } if _, err := primitive.CopyInt32Out(t, optLenAddr, int32(v.SizeBytes())); err != nil { return 0, nil, err } if v != nil { if _, err := v.CopyOut(t, optValAddr); err != nil { return 0, nil, err } } return 0, nil, nil } // getSockOpt tries to handle common socket options, or dispatches to a specific // socket implementation. 
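//
// SO_TYPE, SO_DOMAIN and SO_PROTOCOL are answered generically from
// Socket.Type(), since every socket records them at creation time; all other
// options are forwarded to the implementation. A guest-side probe of this
// fast path (an illustrative sketch, via golang.org/x/sys/unix) would be:
//
//	typ, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_TYPE)
//	// typ == unix.SOCK_STREAM for a TCP socket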
func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr hostarch.Addr, len int) (marshal.Marshallable, *syserr.Error) { if level == linux.SOL_SOCKET { switch name { case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL: if len < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } } switch name { case linux.SO_TYPE: _, skType, _ := s.Type() v := primitive.Int32(skType) return &v, nil case linux.SO_DOMAIN: family, _, _ := s.Type() v := primitive.Int32(family) return &v, nil case linux.SO_PROTOCOL: _, _, protocol := s.Type() v := primitive.Int32(protocol) return &v, nil } } return s.GetSockOpt(t, level, name, optValAddr, len) } // SetSockOpt implements the linux syscall setsockopt(2). // // Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket. func SetSockOpt(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() level := args[1].Int() name := args[2].Int() optValAddr := args[3].Pointer() optLen := args[4].Int() // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, nil, linuxerr.ENOTSOCK } if optLen < 0 { return 0, nil, linuxerr.EINVAL } if optLen > maxOptLen { return 0, nil, linuxerr.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) if _, err := t.CopyInBytes(optValAddr, buf); err != nil { return 0, nil, err } // Call syscall implementation. if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil { return 0, nil, err.ToError() } return 0, nil, nil } // GetSockName implements the linux syscall getsockname(2). func GetSockName(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, nil, linuxerr.ENOTSOCK } // Get the socket name and copy it to the caller. v, vl, err := s.GetSockName(t) if err != nil { return 0, nil, err.ToError() } return 0, nil, writeAddress(t, v, vl, addr, addrlen) } // GetPeerName implements the linux syscall getpeername(2). func GetPeerName(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, nil, linuxerr.ENOTSOCK } // Get the socket peer name and copy it to the caller. v, vl, err := s.GetPeerName(t) if err != nil { return 0, nil, err.ToError() } return 0, nil, writeAddress(t, v, vl, addr, addrlen) } // RecvMsg implements the linux syscall recvmsg(2). func RecvMsg(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() msgPtr := args[1].Pointer() flags := args[2].Int() if t.Arch().Width() != 8 { // We only handle 64-bit for now. return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. 
s, ok := file.Impl().(socket.Socket) if !ok { return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { return 0, nil, linuxerr.EINVAL } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { flags |= linux.MSG_DONTWAIT } var haveDeadline bool var deadline ktime.Time if dl := s.RecvTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true } else if dl < 0 { flags |= linux.MSG_DONTWAIT } n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline) return n, nil, err } // RecvMMsg implements the linux syscall recvmmsg(2). func RecvMMsg(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() msgPtr := args[1].Pointer() vlen := args[2].Uint() flags := args[3].Int() toPtr := args[4].Pointer() if t.Arch().Width() != 8 { // We only handle 64-bit for now. return 0, nil, linuxerr.EINVAL } if vlen > linux.UIO_MAXIOV { vlen = linux.UIO_MAXIOV } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, nil, linuxerr.ENOTSOCK } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { flags |= linux.MSG_DONTWAIT } var haveDeadline bool var deadline ktime.Time if toPtr != 0 { var ts linux.Timespec if _, err := ts.CopyIn(t, toPtr); err != nil { return 0, nil, err } if !ts.Valid() { return 0, nil, linuxerr.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) haveDeadline = true } if !haveDeadline { if dl := s.RecvTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true } else if dl < 0 { flags |= linux.MSG_DONTWAIT } } var count uint32 var err error for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { return 0, nil, linuxerr.EFAULT } var n uintptr if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { break } // Copy the received length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { return 0, nil, linuxerr.EFAULT } if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break } count++ } if count == 0 { return 0, nil, err } return uintptr(count), nil, nil } func getSCMRights(t *kernel.Task, rights transport.RightsControlMessage) control.SCMRights { switch v := rights.(type) { case control.SCMRights: return v case *transport.SCMRights: rf := control.RightsFiles(fdsToHostFiles(t, v.FDs)) return &rf default: panic(fmt.Sprintf("rights of type %T must be *transport.SCMRights or implement SCMRights", rights)) } } // If an error is encountered, only files created before the error will be // returned. This is what Linux does. func fdsToHostFiles(ctx context.Context, fds []int) []*vfs.FileDescription { files := make([]*vfs.FileDescription, 0, len(fds)) for _, fd := range fds { // Get flags. We do it here because they may be modified // by subsequent functions. fileFlags, _, errno := unix.Syscall(unix.SYS_FCNTL, uintptr(fd), unix.F_GETFL, 0) if errno != 0 { ctx.Warningf("Error retrieving host FD flags: %v", error(errno)) break } // Create the file backed by hostFD. 
file, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, &host.NewFDOptions{}) if err != nil { ctx.Warningf("Error creating file from host FD: %v", err) break } if err := file.SetStatusFlags(ctx, auth.CredentialsFromContext(ctx), uint32(fileFlags&linux.O_NONBLOCK)); err != nil { ctx.Warningf("Error setting flags on host FD file: %v", err) break } files = append(files, file) } return files } func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr hostarch.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) { // Capture the message header and io vectors. var msg MessageHeader64 if _, err := msg.CopyIn(t, msgPtr); err != nil { return 0, err } if msg.IovLen > linux.UIO_MAXIOV { return 0, linuxerr.EMSGSIZE } dst, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, err } // Fast path when no control message nor name buffers are provided. if msg.ControlLen == 0 && msg.NameLen == 0 { n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) if err != nil { return 0, linuxerr.ConvertIntr(err.ToError(), linuxerr.ERESTARTSYS) } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC cms.Release(t) } if int(msg.Flags) != mflags { // Copy out the flags to the caller. if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } } return uintptr(n), nil } if msg.ControlLen > maxControlLen { return 0, linuxerr.ENOBUFS } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { return 0, linuxerr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } defer cms.Release(t) controlData := make([]byte, 0, msg.ControlLen) controlData = control.PackControlMessages(t, cms, controlData) if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() { creds, _ := cms.Unix.Credentials.(control.SCMCredentials) controlData, mflags = control.PackCredentials(t, creds, controlData, mflags) } if cms.Unix.Rights != nil { cms.Unix.Rights = getSCMRights(t, cms.Unix.Rights) controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) } // Copy the address to the caller. if msg.NameLen != 0 { if err := writeAddress(t, sender, senderLen, hostarch.Addr(msg.Name), hostarch.Addr(msgPtr+nameLenOffset)); err != nil { return 0, err } } // Copy the control data to the caller. if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { return 0, err } if len(controlData) > 0 { if _, err := t.CopyOutBytes(hostarch.Addr(msg.Control), controlData); err != nil { return 0, err } } // Copy out the flags to the caller. if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } return uintptr(n), nil } // recvFrom is the implementation of the recvfrom syscall. It is called by // recvfrom and recv syscall handlers. func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLenPtr hostarch.Addr) (uintptr, error) { if int(bufLen) < 0 { return 0, linuxerr.EINVAL } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. 
s, ok := file.Impl().(socket.Socket) if !ok { return 0, linuxerr.ENOTSOCK } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { flags |= linux.MSG_DONTWAIT } dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, err } var haveDeadline bool var deadline ktime.Time if dl := s.RecvTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true } else if dl < 0 { flags |= linux.MSG_DONTWAIT } n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) cm.Release(t) if e != nil { return 0, linuxerr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) } // Copy the address to the caller. if nameLenPtr != 0 { if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil { return 0, err } } return uintptr(n), nil } // RecvFrom implements the linux syscall recvfrom(2). func RecvFrom(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() bufPtr := args[1].Pointer() bufLen := args[2].Uint64() flags := args[3].Int() namePtr := args[4].Pointer() nameLenPtr := args[5].Pointer() n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr) return n, nil, err } // SendMsg implements the linux syscall sendmsg(2). func SendMsg(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() msgPtr := args[1].Pointer() flags := args[2].Int() if t.Arch().Width() != 8 { // We only handle 64-bit for now. return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { return 0, nil, linuxerr.EINVAL } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { flags |= linux.MSG_DONTWAIT } n, err := sendSingleMsg(t, s, file, msgPtr, flags) return n, nil, err } // SendMMsg implements the linux syscall sendmmsg(2). func SendMMsg(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() msgPtr := args[1].Pointer() vlen := args[2].Uint() flags := args[3].Int() if t.Arch().Width() != 8 { // We only handle 64-bit for now. return 0, nil, linuxerr.EINVAL } if vlen > linux.UIO_MAXIOV { vlen = linux.UIO_MAXIOV } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, nil, linuxerr.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { return 0, nil, linuxerr.EINVAL } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { flags |= linux.MSG_DONTWAIT } var count uint32 var err error for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { return 0, nil, linuxerr.EFAULT } var n uintptr if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { break } // Copy the received length to the caller. 
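		// The per-message msg_len field sits immediately after the embedded
		// msghdr within struct mmsghdr, hence the messageHeader64Len offset
		// used to locate it.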
lp, ok := mp.AddLength(messageHeader64Len) if !ok { return 0, nil, linuxerr.EFAULT } if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break } count++ } if count == 0 { return 0, nil, err } return uintptr(count), nil, nil } func sendSingleMsg(t *kernel.Task, s socket.Socket, file *vfs.FileDescription, msgPtr hostarch.Addr, flags int32) (uintptr, error) { // Capture the message header. var msg MessageHeader64 if _, err := msg.CopyIn(t, msgPtr); err != nil { return 0, err } var controlData []byte if msg.ControlLen > 0 { // Put an upper bound to prevent large allocations. if msg.ControlLen > maxControlLen { return 0, linuxerr.ENOBUFS } controlData = make([]byte, msg.ControlLen) if _, err := t.CopyInBytes(hostarch.Addr(msg.Control), controlData); err != nil { return 0, err } } // Read the destination address if one is specified. var to []byte if msg.NameLen != 0 { var err error to, err = CaptureAddress(t, hostarch.Addr(msg.Name), msg.NameLen) if err != nil { return 0, err } } // Read data then call the sendmsg implementation. if msg.IovLen > linux.UIO_MAXIOV { return 0, linuxerr.EMSGSIZE } src, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, err } controlMessages, err := control.Parse(t, s, controlData, t.Arch().Width()) if err != nil { return 0, err } var haveDeadline bool var deadline ktime.Time if dl := s.SendTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true } else if dl < 0 { flags |= linux.MSG_DONTWAIT } // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = HandleIOError(t, n != 0, e.ToError(), linuxerr.ERESTARTSYS, "sendmsg", file) // Control messages should be released on error as well as for zero-length // messages, which are discarded by the receiver. if n == 0 || err != nil { controlMessages.Release(t) } return uintptr(n), err } // sendTo is the implementation of the sendto syscall. It is called by sendto // and send syscall handlers. func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLen uint32) (uintptr, error) { bl := int(bufLen) if bl < 0 { return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFile(fd) if file == nil { return 0, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.Socket) if !ok { return 0, linuxerr.ENOTSOCK } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { flags |= linux.MSG_DONTWAIT } // Read the destination address if one is specified. var to []byte var err error if namePtr != 0 { to, err = CaptureAddress(t, namePtr, nameLen) if err != nil { return 0, err } } src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, err } var haveDeadline bool var deadline ktime.Time if dl := s.SendTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true } else if dl < 0 { flags |= linux.MSG_DONTWAIT } // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s)}) return uintptr(n), HandleIOError(t, n != 0, e.ToError(), linuxerr.ERESTARTSYS, "sendto", file) } // SendTo implements the linux syscall sendto(2). 
func SendTo(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() bufPtr := args[1].Pointer() bufLen := args[2].Uint64() flags := args[3].Int() namePtr := args[4].Pointer() nameLen := args[5].Uint() n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen) return n, nil, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_splice.go000066400000000000000000000351511465435605700264550ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // Splice implements Linux syscall splice(2). func Splice(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { inFD := args[0].Int() inOffsetPtr := args[1].Pointer() outFD := args[2].Int() outOffsetPtr := args[3].Pointer() count := int64(args[4].SizeT()) flags := args[5].Int() if count == 0 { return 0, nil, nil } if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } if count < 0 { return 0, nil, linuxerr.EINVAL } // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { return 0, nil, linuxerr.EINVAL } // Get file descriptions. inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) outFile := t.GetFile(outFD) if outFile == nil { return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { return 0, nil, linuxerr.EBADF } if outFile.Options().DenySpliceIn { return 0, nil, linuxerr.EINVAL } // The operation is non-blocking if anything is non-blocking. // // N.B. This is a rather simplistic heuristic that avoids some // poor edge case behavior since the exact semantics here are // underspecified and vary between versions of Linux itself. nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) // At least one file description must represent a pipe. inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) if !inIsPipe && !outIsPipe { return 0, nil, linuxerr.EINVAL } // Copy in offsets. 
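// As on Linux, an offset pointer may only be supplied for a non-pipe end: passing one for a pipe end fails with ESPIPE, and negative offsets are rejected with EINVAL. The offsets read here are advanced locally as data moves and are copied back to userspace after the transfer loop below completes.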
inOffset := int64(-1) if inOffsetPtr != 0 { if inIsPipe { return 0, nil, linuxerr.ESPIPE } if inFile.Options().DenyPRead { return 0, nil, linuxerr.EINVAL } if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil { return 0, nil, err } if inOffset < 0 { return 0, nil, linuxerr.EINVAL } } outOffset := int64(-1) if outOffsetPtr != 0 { if outIsPipe { return 0, nil, linuxerr.ESPIPE } if outFile.Options().DenyPWrite { return 0, nil, linuxerr.EINVAL } if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil { return 0, nil, err } if outOffset < 0 { return 0, nil, linuxerr.EINVAL } } // Move data. var ( n int64 err error ) dw := dualWaiter{ inFile: inFile, outFile: outFile, } defer dw.destroy() for { // If both input and output are pipes, delegate to the pipe // implementation. Otherwise, exactly one end is a pipe, which // we ensure is consistently ordered after the non-pipe FD's // locks by passing the pipe FD as usermem.IO to the non-pipe // end. switch { case inIsPipe && outIsPipe: n, err = pipe.Splice(t, outPipeFD, inPipeFD, count) case inIsPipe: n, err = inPipeFD.SpliceToNonPipe(t, outFile, outOffset, count) if outOffset != -1 { outOffset += n } case outIsPipe: n, err = outPipeFD.SpliceFromNonPipe(t, inFile, inOffset, count) if inOffset != -1 { inOffset += n } default: panic("at least one end of splice must be a pipe") } if n != 0 || !linuxerr.Equals(linuxerr.ErrWouldBlock, err) || nonBlock { break } if err = dw.waitForBoth(t); err != nil { break } } // Copy updated offsets out. if inOffsetPtr != 0 { if _, err := primitive.CopyInt64Out(t, inOffsetPtr, inOffset); err != nil { return 0, nil, err } } if outOffsetPtr != 0 { if _, err := primitive.CopyInt64Out(t, outOffsetPtr, outOffset); err != nil { return 0, nil, err } } // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "splice", outFile) } // Tee implements Linux syscall tee(2). func Tee(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { inFD := args[0].Int() outFD := args[1].Int() count := int64(args[2].SizeT()) flags := args[3].Int() if count == 0 { return 0, nil, nil } if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } if count < 0 { return 0, nil, linuxerr.EINVAL } // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { return 0, nil, linuxerr.EINVAL } // Get file descriptions. inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) outFile := t.GetFile(outFD) if outFile == nil { return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { return 0, nil, linuxerr.EBADF } if outFile.Options().DenySpliceIn { return 0, nil, linuxerr.EINVAL } // The operation is non-blocking if anything is non-blocking. // // N.B. This is a rather simplistic heuristic that avoids some // poor edge case behavior since the exact semantics here are // underspecified and vary between versions of Linux itself. nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) // Both file descriptions must represent pipes. 
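// Unlike splice(2), tee(2) takes no offsets and does not consume data from the input pipe; it only duplicates up to count bytes from one pipe into the other, so anything other than a pipe on both ends is rejected with EINVAL below.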
inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) if !inIsPipe || !outIsPipe { return 0, nil, linuxerr.EINVAL } // Copy data. var ( n int64 err error ) dw := dualWaiter{ inFile: inFile, outFile: outFile, } defer dw.destroy() for { n, err = pipe.Tee(t, outPipeFD, inPipeFD, count) if n != 0 || !linuxerr.Equals(linuxerr.ErrWouldBlock, err) || nonBlock { break } if err = dw.waitForBoth(t); err != nil { break } } if n != 0 { // If a partial write is completed, the error is dropped. Log it here. if err != nil && err != io.EOF && !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { log.Debugf("tee completed a partial write with error: %v", err) err = nil } } // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "tee", inFile) } // Sendfile implements linux system call sendfile(2). func Sendfile(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { outFD := args[0].Int() inFD := args[1].Int() offsetAddr := args[2].Pointer() count := int64(args[3].SizeT()) inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) if !inFile.IsReadable() { return 0, nil, linuxerr.EBADF } outFile := t.GetFile(outFD) if outFile == nil { return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) if !outFile.IsWritable() { return 0, nil, linuxerr.EBADF } if outFile.Options().DenySpliceIn { return 0, nil, linuxerr.EINVAL } // Verify that the outFile Append flag is not set. if outFile.StatusFlags()&linux.O_APPEND != 0 { return 0, nil, linuxerr.EINVAL } // Verify that inFile is a regular file or block device. This is a // requirement; the same check appears in Linux // (fs/splice.c:splice_direct_to_actor). if stat, err := inFile.Stat(t, vfs.StatOptions{Mask: linux.STATX_TYPE}); err != nil { return 0, nil, err } else if stat.Mask&linux.STATX_TYPE == 0 || (stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) { return 0, nil, linuxerr.EINVAL } // Copy offset if it exists. offset := int64(-1) if offsetAddr != 0 { if inFile.Options().DenyPRead { return 0, nil, linuxerr.ESPIPE } var offsetP primitive.Int64 if _, err := offsetP.CopyIn(t, offsetAddr); err != nil { return 0, nil, err } offset = int64(offsetP) if offset < 0 { return 0, nil, linuxerr.EINVAL } if offset+count < 0 { return 0, nil, linuxerr.EINVAL } } // Validate count. This must come after offset checks. if count < 0 { return 0, nil, linuxerr.EINVAL } if count == 0 { return 0, nil, nil } if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } // Copy data. var ( total int64 err error ) dw := dualWaiter{ inFile: inFile, outFile: outFile, } defer dw.destroy() outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) // Reading from input file should never block, since it is regular or // block device. We only need to check if writing to the output file // can block. 
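// Two paths follow: when the output is a pipe, data is spliced from the input file directly into the pipe; otherwise the transfer falls back to a read/write loop through an intermediate buffer whose size is capped at the maximum pipe size, repeating the internal-pipe bound that Linux's sendfile imposes.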
nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0 if outIsPipe { for { var n int64 n, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count-total) if offset != -1 { offset += n } total += n if total == count { break } if err == nil && t.Interrupted() { err = linuxerr.ErrInterrupted break } if linuxerr.Equals(linuxerr.ErrWouldBlock, err) && !nonBlock { err = dw.waitForBoth(t) } if err != nil { break } } } else { // Read inFile to buffer, then write the contents to outFile. // // The buffer size has to be limited to avoid large memory // allocations and long delays. In Linux, the buffer size is // limited by the size of an internal pipe. Here, we repeat this // behavior. bufSize := count if bufSize > pipe.MaximumPipeSize { bufSize = pipe.MaximumPipeSize } buf := make([]byte, bufSize) for { if int64(len(buf)) > count-total { buf = buf[:count-total] } var readN int64 if offset != -1 { readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{}) offset += readN } else { readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) } // Write all of the bytes that we read. This may need // multiple write calls to complete. wbuf := buf[:readN] for len(wbuf) > 0 { var writeN int64 writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{}) wbuf = wbuf[writeN:] if linuxerr.Equals(linuxerr.ErrWouldBlock, err) && !nonBlock { err = dw.waitForOut(t) } if err != nil { // We didn't complete the write. Only report the bytes that were actually // written, and rewind offsets as needed. notWritten := int64(len(wbuf)) readN -= notWritten if offset == -1 { // We modified the offset of the input file itself during the read // operation. Rewind it. if _, seekErr := inFile.Seek(t, -notWritten, linux.SEEK_CUR); seekErr != nil { // Log the error but don't return it, since the write has already // completed successfully. log.Warningf("failed to roll back input file offset: %v", seekErr) } } else { // The sendfile call was provided an offset parameter that should be // adjusted to reflect the number of bytes sent. Rewind it. offset -= notWritten } break } } total += readN if total == count { break } if err == nil && t.Interrupted() { err = linuxerr.ErrInterrupted break } if linuxerr.Equals(linuxerr.ErrWouldBlock, err) && !nonBlock { err = dw.waitForBoth(t) } if err != nil { break } } } if offsetAddr != 0 { // Copy out the new offset. offsetP := primitive.Uint64(offset) if _, err := offsetP.CopyOut(t, offsetAddr); err != nil { return 0, nil, err } } if total != 0 { if err != nil && err != io.EOF && !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { // If a partial write is completed, the error is dropped. Log it here. log.Debugf("sendfile completed a partial write with error: %v", err) err = nil } } // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. return uintptr(total), nil, HandleIOError(t, total != 0, err, linuxerr.ERESTARTSYS, "sendfile", inFile) } // dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not // thread-safe, and does not take a reference on the vfs.FileDescriptions. // // Users must call destroy() when finished. type dualWaiter struct { inFile *vfs.FileDescription outFile *vfs.FileDescription inW waiter.Entry inCh chan struct{} outW waiter.Entry outCh chan struct{} } // waitForBoth waits for both dw.inFile and dw.outFile to be ready.
func (dw *dualWaiter) waitForBoth(t *kernel.Task) error { if dw.inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { if dw.inCh == nil { dw.inW, dw.inCh = waiter.NewChannelEntry(eventMaskRead) if err := dw.inFile.EventRegister(&dw.inW); err != nil { return err } // We might be ready now. Try again before blocking. return nil } if err := t.Block(dw.inCh); err != nil { return err } } return dw.waitForOut(t) } // waitForOut waits for dw.outFile to be ready for writing. func (dw *dualWaiter) waitForOut(t *kernel.Task) error { // Don't bother checking readiness of the outFile, because it's not a // guarantee that it won't return EWOULDBLOCK. Both pipes and eventfds // can be "ready" but will reject writes of certain sizes with // EWOULDBLOCK. See b/172075629, b/170743336. if dw.outCh == nil { dw.outW, dw.outCh = waiter.NewChannelEntry(eventMaskWrite) if err := dw.outFile.EventRegister(&dw.outW); err != nil { return err } // We might be ready to write now. Try again before blocking. return nil } return t.Block(dw.outCh) } // destroy cleans up resources held by dw. No more calls to wait* can occur // after destroy is called. func (dw *dualWaiter) destroy() { if dw.inCh != nil { dw.inFile.EventUnregister(&dw.inW) dw.inCh = nil } if dw.outCh != nil { dw.outFile.EventUnregister(&dw.outW) dw.outCh = nil } dw.inFile = nil dw.outFile = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_stat.go000066400000000000000000000177001465435605700261510ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Stat implements Linux syscall stat(2). func Stat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() statAddr := args[1].Pointer() return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, 0 /* flags */) } // Lstat implements Linux syscall lstat(2). func Lstat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() statAddr := args[1].Pointer() return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, linux.AT_SYMLINK_NOFOLLOW) } // Newfstatat implements Linux syscall newfstatat, which backs fstatat(2).
func Newfstatat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() statAddr := args[2].Pointer() flags := args[3].Int() return 0, nil, fstatat(t, dirfd, pathAddr, statAddr, flags) } func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr hostarch.Addr, flags int32) error { // TODO(b/270247637): gVisor does not yet support automount, so // AT_NO_AUTOMOUNT flag is a no-op. flags &= ^linux.AT_NO_AUTOMOUNT if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { return linuxerr.EINVAL } opts := vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, } path, err := copyInPath(t, pathAddr) if err != nil { return err } root := t.FSContext().RootDirectory() defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { return linuxerr.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectory() defer start.DecRef(t) } else { dirfile := t.GetFile(dirfd) if dirfile == nil { return linuxerr.EBADF } if !path.HasComponents() { // Use FileDescription.Stat() instead of // VirtualFilesystem.StatAt() for fstatat(fd, ""), since the // former may be able to use opened file state to expedite the // Stat. statx, err := dirfile.Stat(t, opts) dirfile.DecRef(t) if err != nil { return err } var stat linux.Stat convertStatxToUserStat(t, &statx, &stat) _, err = stat.CopyOut(t, statAddr) return err } start = dirfile.VirtualDentry() start.IncRef() defer start.DecRef(t) dirfile.DecRef(t) } } statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{ Root: root, Start: start, Path: path, FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, }, &opts) if err != nil { return err } var stat linux.Stat convertStatxToUserStat(t, &statx, &stat) _, err = stat.CopyOut(t, statAddr) return err } func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec { return linux.Timespec{ Sec: sxts.Sec, Nsec: int64(sxts.Nsec), } } // Fstat implements Linux syscall fstat(2). func Fstat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() statAddr := args[1].Pointer() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) statx, err := file.Stat(t, vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, }) if err != nil { return 0, nil, err } var stat linux.Stat convertStatxToUserStat(t, &statx, &stat) _, err = stat.CopyOut(t, statAddr) return 0, nil, err } // Statx implements Linux syscall statx(2). func Statx(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() flags := args[2].Int() mask := args[3].Uint() statxAddr := args[4].Pointer() // TODO(b/270247637): gVisor does not yet support automount, so // AT_NO_AUTOMOUNT flag is a no-op. flags &= ^linux.AT_NO_AUTOMOUNT if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 { return 0, nil, linuxerr.EINVAL } // Make sure that only one sync type option is set. 
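// AT_STATX_SYNC_TYPE covers the AT_STATX_FORCE_SYNC and AT_STATX_DONT_SYNC bits; requiring the masked value below to be zero or a power of two rejects requests that set both, matching Linux's statx(2).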
syncType := uint32(flags & linux.AT_STATX_SYNC_TYPE) if syncType != 0 && !bits.IsPowerOfTwo32(syncType) { return 0, nil, linuxerr.EINVAL } if mask&linux.STATX__RESERVED != 0 { return 0, nil, linuxerr.EINVAL } opts := vfs.StatOptions{ Mask: mask, Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE), } path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } root := t.FSContext().RootDirectory() defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { return 0, nil, linuxerr.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectory() defer start.DecRef(t) } else { dirfile := t.GetFile(dirfd) if dirfile == nil { return 0, nil, linuxerr.EBADF } if !path.HasComponents() { // Use FileDescription.Stat() instead of // VirtualFilesystem.StatAt() for statx(fd, ""), since the // former may be able to use opened file state to expedite the // Stat. statx, err := dirfile.Stat(t, opts) dirfile.DecRef(t) if err != nil { return 0, nil, err } userifyStatx(t, &statx) _, err = statx.CopyOut(t, statxAddr) return 0, nil, err } start = dirfile.VirtualDentry() start.IncRef() defer start.DecRef(t) dirfile.DecRef(t) } } statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{ Root: root, Start: start, Path: path, FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, }, &opts) if err != nil { return 0, nil, err } userifyStatx(t, &statx) _, err = statx.CopyOut(t, statxAddr) return 0, nil, err } func userifyStatx(t *kernel.Task, statx *linux.Statx) { userns := t.UserNamespace() statx.UID = uint32(auth.KUID(statx.UID).In(userns).OrOverflow()) statx.GID = uint32(auth.KGID(statx.GID).In(userns).OrOverflow()) } // Statfs implements Linux syscall statfs(2). func Statfs(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() bufAddr := args[1].Pointer() path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) if err != nil { return 0, nil, err } _, err = statfs.CopyOut(t, bufAddr) return 0, nil, err } // Fstatfs implements Linux syscall fstatfs(2). func Fstatfs(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() bufAddr := args[1].Pointer() tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) if err != nil { return 0, nil, err } _, err = statfs.CopyOut(t, bufAddr) return 0, nil, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_stat_amd64.go000066400000000000000000000033511465435605700271410ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // This takes both input and output as pointer arguments to avoid copying large // structs. func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) { // Linux just copies fields from struct kstat without regard to struct // kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too. userns := t.UserNamespace() *stat = linux.Stat{ Dev: uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)), Ino: statx.Ino, Nlink: uint64(statx.Nlink), Mode: uint32(statx.Mode), UID: uint32(auth.KUID(statx.UID).In(userns).OrOverflow()), GID: uint32(auth.KGID(statx.GID).In(userns).OrOverflow()), Rdev: uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)), Size: int64(statx.Size), Blksize: int64(statx.Blksize), Blocks: int64(statx.Blocks), ATime: timespecFromStatxTimestamp(statx.Atime), MTime: timespecFromStatxTimestamp(statx.Mtime), CTime: timespecFromStatxTimestamp(statx.Ctime), } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_stat_arm64.go000066400000000000000000000033511465435605700271570ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // This takes both input and output as pointer arguments to avoid copying large // structs. func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) { // Linux just copies fields from struct kstat without regard to struct // kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too. userns := t.UserNamespace() *stat = linux.Stat{ Dev: uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)), Ino: statx.Ino, Nlink: uint32(statx.Nlink), Mode: uint32(statx.Mode), UID: uint32(auth.KUID(statx.UID).In(userns).OrOverflow()), GID: uint32(auth.KGID(statx.GID).In(userns).OrOverflow()), Rdev: uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)), Size: int64(statx.Size), Blksize: int32(statx.Blksize), Blocks: int64(statx.Blocks), ATime: timespecFromStatxTimestamp(statx.Atime), MTime: timespecFromStatxTimestamp(statx.Mtime), CTime: timespecFromStatxTimestamp(statx.Ctime), } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_sync.go000066400000000000000000000103441465435605700261470ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // Sync implements Linux syscall sync(2). func Sync(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, t.Kernel().VFS().SyncAllFilesystems(t) } // Syncfs implements Linux syscall syncfs(2). func Syncfs(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { return 0, nil, linuxerr.EBADF } return 0, nil, file.SyncFS(t) } // Fsync implements Linux syscall fsync(2). func Fsync(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) return 0, nil, file.Sync(t) } // Fdatasync implements Linux syscall fdatasync(2). func Fdatasync(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata. return Fsync(t, sysno, args) } // SyncFileRange implements Linux syscall sync_file_range(2). func SyncFileRange(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() offset := args[1].Int64() nbytes := args[2].Int64() flags := args[3].Uint() // Check for negative values and overflow. if offset < 0 || offset+nbytes < 0 { return 0, nil, linuxerr.EINVAL } if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { return 0, nil, linuxerr.EINVAL } file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support // is a full-file sync, i.e. fsync(2). As a result, there are severe // limitations on how much we support sync_file_range: // - In Linux, sync_file_range(2) doesn't write out the file's metadata, even // if the file size is changed. We do. // - We always sync the entire file instead of [offset, offset+nbytes). // - We do not support the use of WAIT_BEFORE without WAIT_AFTER. For // correctness, we would have to perform a write-out every time WAIT_BEFORE // was used, but this would be much more expensive than expected if there // were no write-out operations in progress. // - Whenever WAIT_AFTER is used, we sync the file. // - Ignore WRITE. If this flag is used with WAIT_AFTER, then the file will // be synced anyway. If this flag is used without WAIT_AFTER, then it is // safe (and less expensive) to do nothing, because the syscall will not // wait for the write-out to complete--we only need to make sure that the // next time WAIT_BEFORE or WAIT_AFTER are used, the write-out completes. 
// - According to fs/sync.c, WAIT_BEFORE|WAIT_AFTER "will detect any I/O // errors or ENOSPC conditions and will return those to the caller, after // clearing the EIO and ENOSPC flags in the address_space." We don't do // this. if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 { t.Kernel().EmitUnimplementedEvent(t, sysno) return 0, nil, linuxerr.ENOSYS } if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 { if err := file.Sync(t); err != nil { return 0, nil, linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS) } } return 0, nil, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_sysinfo.go000066400000000000000000000031351465435605700266650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usage" ) // Sysinfo implements Linux syscall sysinfo(2). func Sysinfo(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mf := t.Kernel().MemoryFile() mfUsage, err := mf.TotalUsage() if err != nil { return 0, nil, err } memStats, _ := usage.MemoryAccounting.Copy() totalUsage := mfUsage + memStats.Mapped totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) memFree := totalSize - totalUsage if memFree > totalSize { // Underflow. memFree = 0 } // Only a subset of the fields in sysinfo_t make sense to return. si := linux.Sysinfo{ Procs: uint16(t.Kernel().TaskSet().Root.NumTasks()), Uptime: t.Kernel().MonotonicClock().Now().Seconds(), TotalRAM: totalSize, FreeRAM: memFree, Unit: 1, } _, err = si.CopyOut(t, addr) return 0, nil, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_syslog.go000066400000000000000000000031741465435605700265160ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) const ( _SYSLOG_ACTION_READ_ALL = 3 _SYSLOG_ACTION_SIZE_BUFFER = 10 ) // logBufLen is the default syslog buffer size on Linux. const logBufLen = 1 << 17 // Syslog implements part of Linux syscall syslog. // // Only the unprivileged commands are implemented, allowing applications to // read a fun dmesg. 
func Syslog(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { command := args[0].Int() buf := args[1].Pointer() size := int(args[2].Int()) switch command { case _SYSLOG_ACTION_READ_ALL: if size < 0 { return 0, nil, linuxerr.EINVAL } if size > logBufLen { size = logBufLen } log := t.Kernel().Syslog().Log() if len(log) > size { log = log[:size] } n, err := t.CopyOutBytes(buf, log) return uintptr(n), nil, err case _SYSLOG_ACTION_SIZE_BUFFER: return logBufLen, nil, nil default: return 0, nil, linuxerr.ENOSYS } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_thread.go000066400000000000000000000575701465435605700264560ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/seccheck" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) var ( // ExecMaxTotalSize is the maximum length of all argv and envv entries. // // N.B. The behavior here is different than Linux. Linux provides a limit on // individual arguments of 32 pages, and an aggregate limit of at least 32 pages // but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement // any behavior based on the stack size, and instead provide a fixed hard-limit of // 2 MB (which should work well given that 8 MB stack limits are common). ExecMaxTotalSize = 2 * 1024 * 1024 // ExecMaxElemSize is the maximum length of a single argv or envv entry. ExecMaxElemSize = 32 * hostarch.PageSize ) // Getppid implements linux syscall getppid(2). func Getppid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { parent := t.Parent() if parent == nil { return 0, nil, nil } return uintptr(t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())), nil, nil } // Getpid implements linux syscall getpid(2). func Getpid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return uintptr(t.ThreadGroup().ID()), nil, nil } // Gettid implements linux syscall gettid(2). func Gettid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return uintptr(t.ThreadID()), nil, nil } // Execve implements linux syscall execve(2). func Execve(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathnameAddr := args[0].Pointer() argvAddr := args[1].Pointer() envvAddr := args[2].Pointer() return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */) } // Execveat implements linux syscall execveat(2). 
func Execveat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathnameAddr := args[1].Pointer() argvAddr := args[2].Pointer() envvAddr := args[3].Pointer() flags := args[4].Int() return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags) } func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr hostarch.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { return 0, nil, linuxerr.EINVAL } pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX) if err != nil { return 0, nil, err } var argv, envv []string if argvAddr != 0 { var err error argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize) if err != nil { return 0, nil, err } } if envvAddr != 0 { var err error envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize) if err != nil { return 0, nil, err } } root := t.FSContext().RootDirectory() defer root.DecRef(t) var executable *vfs.FileDescription defer func() { if executable != nil { executable.DecRef(t) } }() closeOnExec := false if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute { // We must open the executable ourselves since dirfd is used as the // starting point while resolving path, but the task working directory // is used as the starting point while resolving interpreters (Linux: // fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() => // do_open_execat(fd=AT_FDCWD)), and the loader package is currently // incapable of handling this correctly. if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { return 0, nil, linuxerr.ENOENT } dirfile, dirfileFlags := t.FDTable().Get(dirfd) if dirfile == nil { return 0, nil, linuxerr.EBADF } start := dirfile.VirtualDentry() start.IncRef() dirfile.DecRef(t) closeOnExec = dirfileFlags.CloseOnExec file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{ Root: root, Start: start, Path: path, FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, }, &vfs.OpenOptions{ Flags: linux.O_RDONLY, FileExec: true, }) start.DecRef(t) if err != nil { return 0, nil, err } executable = file pathname = executable.MappedName(t) } // Load the new TaskImage. wd := t.FSContext().WorkingDirectory() defer wd.DecRef(t) remainingTraversals := uint(linux.MaxSymlinkTraversals) loadArgs := loader.LoadArgs{ Root: root, WorkingDir: wd, RemainingTraversals: &remainingTraversals, ResolveFinal: flags&linux.AT_SYMLINK_NOFOLLOW == 0, Filename: pathname, File: executable, CloseOnExec: closeOnExec, Argv: argv, Envv: envv, Features: t.Kernel().FeatureSet(), } if seccheck.Global.Enabled(seccheck.PointExecve) { // Retain the first executable file that is opened (which may open // multiple executable files while resolving interpreter scripts). if executable == nil { loadArgs.AfterOpen = func(f *vfs.FileDescription) { if executable == nil { f.IncRef() executable = f pathname = executable.MappedName(t) } } } } image, se := t.Kernel().LoadTaskImage(t, loadArgs) if se != nil { return 0, nil, se.ToError() } ctrl, err := t.Execve(image, argv, envv, executable, pathname) return 0, ctrl, err } // Exit implements linux syscall exit(2). func Exit(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { status := args[0].Int() t.PrepareExit(linux.WaitStatusExit(status & 0xff)) return 0, kernel.CtrlDoExit, nil } // ExitGroup implements linux syscall exit_group(2). 
func ExitGroup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { status := args[0].Int() t.PrepareGroupExit(linux.WaitStatusExit(status & 0xff)) return 0, kernel.CtrlDoExit, nil } // clone is used by Clone, Fork, and VFork. func clone(t *kernel.Task, flags int, stack hostarch.Addr, parentTID hostarch.Addr, childTID hostarch.Addr, tls hostarch.Addr) (uintptr, *kernel.SyscallControl, error) { args := linux.CloneArgs{ Flags: uint64(uint32(flags) &^ linux.CSIGNAL), ChildTID: uint64(childTID), ParentTID: uint64(parentTID), ExitSignal: uint64(flags & linux.CSIGNAL), Stack: uint64(stack), TLS: uint64(tls), } ntid, ctrl, err := t.Clone(&args) return uintptr(ntid), ctrl, err } // Fork implements Linux syscall fork(2). func Fork(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // "A call to fork() is equivalent to a call to clone(2) specifying flags // as just SIGCHLD." - fork(2) return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0) } // Vfork implements Linux syscall vfork(2). func Vfork(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // """ // A call to vfork() is equivalent to calling clone(2) with flags specified as: // // CLONE_VM | CLONE_VFORK | SIGCHLD // """ - vfork(2) return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0) } // Clone3 implements linux syscall clone3(2). func Clone3(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { cloneArgsPointer := args[0].Pointer() size := args[1].SizeT() if int(size) < linux.CLONE_ARGS_SIZE_VER0 || int(size) > linux.CLONE_ARGS_SIZE_VER2 { return 0, nil, linuxerr.EINVAL } var cloneArgs linux.CloneArgs if cloneArgsPointer != 0 { if _, err := cloneArgs.CopyInN(t, cloneArgsPointer, int(size)); err != nil { return 0, nil, err } } ntid, ctrl, err := t.Clone(&cloneArgs) if err != nil { return 0, nil, err } return uintptr(ntid), ctrl, err } // parseCommonWaitOptions applies the options common to wait4 and waitid to // wopts. func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { switch options & (linux.WCLONE | linux.WALL) { case 0: wopts.NonCloneTasks = true case linux.WCLONE: wopts.CloneTasks = true case linux.WALL: wopts.NonCloneTasks = true wopts.CloneTasks = true default: return linuxerr.EINVAL } if options&linux.WCONTINUED != 0 { wopts.Events |= kernel.EventGroupContinue } if options&linux.WNOHANG == 0 { wopts.BlockInterruptErr = linuxerr.ERESTARTSYS } if options&linux.WNOTHREAD == 0 { wopts.SiblingChildren = true } return nil } // wait4 waits for the given child process to exit. func wait4(t *kernel.Task, pid int, statusAddr hostarch.Addr, options int, rusageAddr hostarch.Addr) (uintptr, error) { if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { return 0, linuxerr.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventExit | kernel.EventTraceeStop, ConsumeEvent: true, } // There are four cases to consider: // // pid < -1 any child process whose process group ID is equal to the absolute value of pid // pid == -1 any child process // pid == 0 any child process whose process group ID is equal to that of the calling process // pid > 0 the child whose process ID is equal to the value of pid switch { case pid < -1: wopts.SpecificPGID = kernel.ProcessGroupID(-pid) case pid == -1: // Any process is the default. 
case pid == 0: wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup()) default: wopts.SpecificTID = kernel.ThreadID(pid) } if err := parseCommonWaitOptions(&wopts, options); err != nil { return 0, err } if options&linux.WUNTRACED != 0 { wopts.Events |= kernel.EventChildGroupStop } wr, err := t.Wait(&wopts) if err != nil { if err == kernel.ErrNoWaitableEvent { return 0, nil } return 0, err } if statusAddr != 0 { if _, err := primitive.CopyUint32Out(t, statusAddr, uint32(wr.Status)); err != nil { return 0, err } } if rusageAddr != 0 { ru := getrusage(wr.Task, linux.RUSAGE_BOTH) if _, err := ru.CopyOut(t, rusageAddr); err != nil { return 0, err } } return uintptr(wr.TID), nil } // Wait4 implements linux syscall wait4(2). func Wait4(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := int(args[0].Int()) statusAddr := args[1].Pointer() options := int(args[2].Uint()) rusageAddr := args[3].Pointer() n, err := wait4(t, pid, statusAddr, options, rusageAddr) return n, nil, err } // WaitPid implements linux syscall waitpid(2). func WaitPid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := int(args[0].Int()) statusAddr := args[1].Pointer() options := int(args[2].Uint()) n, err := wait4(t, pid, statusAddr, options, 0) return n, nil, err } // Waitid implements linux syscall waitid(2). func Waitid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { idtype := args[0].Int() id := args[1].Int() infop := args[2].Pointer() options := int(args[3].Uint()) rusageAddr := args[4].Pointer() if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { return 0, nil, linuxerr.EINVAL } if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { return 0, nil, linuxerr.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventTraceeStop, ConsumeEvent: options&linux.WNOWAIT == 0, } switch idtype { case linux.P_ALL: case linux.P_PID: wopts.SpecificTID = kernel.ThreadID(id) case linux.P_PGID: wopts.SpecificPGID = kernel.ProcessGroupID(id) default: return 0, nil, linuxerr.EINVAL } if err := parseCommonWaitOptions(&wopts, options); err != nil { return 0, nil, err } if options&linux.WEXITED != 0 { wopts.Events |= kernel.EventExit } if options&linux.WSTOPPED != 0 { wopts.Events |= kernel.EventChildGroupStop } wr, err := t.Wait(&wopts) if err != nil { if err == kernel.ErrNoWaitableEvent { err = nil // "If WNOHANG was specified in options and there were no children // in a waitable state, then waitid() returns 0 immediately and the // state of the siginfo_t structure pointed to by infop is // unspecified." - waitid(2). But Linux's waitid actually zeroes // out the fields it would set for a successful waitid in this case // as well. 
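// Copying out a zero-valued SignalInfo below reproduces that behavior: with WNOHANG and no waitable children, the caller observes si_signo, si_pid, si_uid, si_status, and si_code all cleared.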
if infop != 0 { var si linux.SignalInfo _, err = si.CopyOut(t, infop) } } return 0, nil, err } if rusageAddr != 0 { ru := getrusage(wr.Task, linux.RUSAGE_BOTH) if _, err := ru.CopyOut(t, rusageAddr); err != nil { return 0, nil, err } } if infop == 0 { return 0, nil, nil } si := linux.SignalInfo{ Signo: int32(linux.SIGCHLD), } si.SetPID(int32(wr.TID)) si.SetUID(int32(wr.UID)) s := wr.Status switch { case s.Exited(): si.Code = linux.CLD_EXITED si.SetStatus(int32(s.ExitStatus())) case s.Signaled(): if s.CoreDumped() { si.Code = linux.CLD_DUMPED } else { si.Code = linux.CLD_KILLED } si.SetStatus(int32(s.TerminationSignal())) case s.Stopped(): if wr.Event == kernel.EventTraceeStop { si.Code = linux.CLD_TRAPPED si.SetStatus(int32(s.PtraceEvent())) } else { si.Code = linux.CLD_STOPPED si.SetStatus(int32(s.StopSignal())) } case s.Continued(): si.Code = linux.CLD_CONTINUED si.SetStatus(int32(linux.SIGCONT)) default: t.Warningf("waitid got incomprehensible wait status %d", s) } _, err = si.CopyOut(t, infop) return 0, nil, err } // SetTidAddress implements linux syscall set_tid_address(2). func SetTidAddress(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() // Always succeed, return caller's tid. t.SetClearTID(addr) return uintptr(t.ThreadID()), nil, nil } // Setns implements linux syscall setns(2). func Setns(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) flags := args[1].Int() return 0, nil, t.Setns(file, flags) } // Unshare implements linux syscall unshare(2). func Unshare(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2) if flags&linux.CLONE_NEWPID != 0 { flags |= linux.CLONE_THREAD } // "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since // Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS." if flags&linux.CLONE_NEWUSER != 0 { flags |= linux.CLONE_THREAD | linux.CLONE_FS } return 0, nil, t.Unshare(flags) } // SchedYield implements linux syscall sched_yield(2). func SchedYield(t *kernel.Task, sysno uintptr, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { t.Yield() return 0, nil, nil } // SchedSetaffinity implements linux syscall sched_setaffinity(2). func SchedSetaffinity(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tid := args[0].Int() size := args[1].SizeT() maskAddr := args[2].Pointer() var task *kernel.Task if tid == 0 { task = t } else { task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) if task == nil { return 0, nil, linuxerr.ESRCH } } mask := sched.NewCPUSet(t.Kernel().ApplicationCores()) if size > mask.Size() { size = mask.Size() } if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil { return 0, nil, err } return 0, nil, task.SetCPUMask(mask) } // SchedGetaffinity implements linux syscall sched_getaffinity(2). func SchedGetaffinity(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tid := args[0].Int() size := args[1].SizeT() maskAddr := args[2].Pointer() // This limitation is because linux stores the cpumask // in an array of "unsigned long" so the buffer needs to // be a multiple of the word size. 
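// For example, with an 8-byte word size a cpusetsize of 7 fails the alignment check below with EINVAL, while 8, 16, or 128 bytes are accepted as long as the buffer is also large enough to hold the full mask (checked further down).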
if size&(t.Arch().Width()-1) > 0 { return 0, nil, linuxerr.EINVAL } var task *kernel.Task if tid == 0 { task = t } else { task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) if task == nil { return 0, nil, linuxerr.ESRCH } } mask := task.CPUMask() // The buffer needs to be big enough to hold a cpumask with // all possible cpus. if size < mask.Size() { return 0, nil, linuxerr.EINVAL } _, err := t.CopyOutBytes(maskAddr, mask) // NOTE: The syscall interface is slightly different than the glibc // interface. The raw sched_getaffinity syscall returns the number of // bytes used to represent a cpu mask. return uintptr(mask.Size()), nil, err } // Getcpu implements linux syscall getcpu(2). func Getcpu(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { cpu := args[0].Pointer() node := args[1].Pointer() // third argument to this system call is nowadays unused. if cpu != 0 { if _, err := primitive.CopyInt32Out(t, cpu, t.CPU()); err != nil { return 0, nil, err } } // We always return node 0. if node != 0 { if _, err := t.MemoryManager().ZeroOut(t, node, 4, usermem.IOOpts{ AddressSpaceActive: true, }); err != nil { return 0, nil, err } } return 0, nil, nil } // Setpgid implements the linux syscall setpgid(2). func Setpgid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // Note that throughout this function, pgid is interpreted with respect // to t's namespace, not with respect to the selected ThreadGroup's // namespace (which may be different). pid := kernel.ThreadID(args[0].Int()) pgid := kernel.ProcessGroupID(args[1].Int()) // "If pid is zero, then the process ID of the calling process is used." tg := t.ThreadGroup() if pid != 0 { ot := t.PIDNamespace().TaskWithID(pid) if ot == nil { return 0, nil, linuxerr.ESRCH } tg = ot.ThreadGroup() if tg.Leader() != ot { return 0, nil, linuxerr.EINVAL } // Setpgid only operates on child threadgroups. if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) { return 0, nil, linuxerr.ESRCH } } // "If pgid is zero, then the PGID of the process specified by pid is made // the same as its process ID." defaultPGID := kernel.ProcessGroupID(t.PIDNamespace().IDOfThreadGroup(tg)) if pgid == 0 { pgid = defaultPGID } else if pgid < 0 { return 0, nil, linuxerr.EINVAL } // If the pgid is the same as the group, then create a new one. Otherwise, // we attempt to join an existing process group. if pgid == defaultPGID { // For convenience, errors line up with Linux syscall API. if err := tg.CreateProcessGroup(); err != nil { // Is the process group already as expected? If so, // just return success. This is the same behavior as // Linux. if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == defaultPGID { return 0, nil, nil } return 0, nil, err } } else { // Same as CreateProcessGroup, above. if err := tg.JoinProcessGroup(t.PIDNamespace(), pgid, tg != t.ThreadGroup()); err != nil { // See above. if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid { return 0, nil, nil } return 0, nil, err } } // Success. return 0, nil, nil } // Getpgrp implements the linux syscall getpgrp(2). func Getpgrp(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return uintptr(t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), nil, nil } // Getpgid implements the linux syscall getpgid(2). 
func Getpgid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tid := kernel.ThreadID(args[0].Int()) if tid == 0 { return Getpgrp(t, sysno, args) } target := t.PIDNamespace().TaskWithID(tid) if target == nil { return 0, nil, linuxerr.ESRCH } return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil } // Setsid implements the linux syscall setsid(2). func Setsid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { sid, err := t.ThreadGroup().CreateSession() if err != nil { return 0, nil, err } return uintptr(sid), nil, nil } // Getsid implements the linux syscall getsid(2). func Getsid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tid := kernel.ThreadID(args[0].Int()) if tid == 0 { return uintptr(t.PIDNamespace().IDOfSession(t.ThreadGroup().Session())), nil, nil } target := t.PIDNamespace().TaskWithID(tid) if target == nil { return 0, nil, linuxerr.ESRCH } return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil } // Getpriority pretends to implement the linux syscall getpriority(2). // // This is a stub; real priorities require a full scheduler. func Getpriority(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { which := args[0].Int() who := kernel.ThreadID(args[1].Int()) switch which { case linux.PRIO_PROCESS: // Look for who, return ESRCH if not found. var task *kernel.Task if who == 0 { task = t } else { task = t.PIDNamespace().TaskWithID(who) } if task == nil { return 0, nil, linuxerr.ESRCH } // From kernel/sys.c:getpriority: // "To avoid negative return values, 'getpriority()' // will not return the normal nice-value, but a negated // value that has been offset by 20" return uintptr(20 - task.Niceness()), nil, nil case linux.PRIO_USER: fallthrough case linux.PRIO_PGRP: // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: return 0, nil, linuxerr.EINVAL } } // Setpriority pretends to implement the linux syscall setpriority(2). // // This is a stub; real priorities require a full scheduler. func Setpriority(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { which := args[0].Int() who := kernel.ThreadID(args[1].Int()) niceval := int(args[2].Int()) // In the kernel's implementation, values outside the range // of [-20, 19] are truncated to these minimum and maximum // values. if niceval < -20 /* min niceval */ { niceval = -20 } else if niceval > 19 /* max niceval */ { niceval = 19 } switch which { case linux.PRIO_PROCESS: // Look for who, return ESRCH if not found. var task *kernel.Task if who == 0 { task = t } else { task = t.PIDNamespace().TaskWithID(who) } if task == nil { return 0, nil, linuxerr.ESRCH } task.SetNiceness(niceval) case linux.PRIO_USER: fallthrough case linux.PRIO_PGRP: // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: return 0, nil, linuxerr.EINVAL } return 0, nil, nil } // Ptrace implements linux system call ptrace(2). 
func Ptrace(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { req := args[0].Int64() pid := kernel.ThreadID(args[1].Int()) addr := args[2].Pointer() data := args[3].Pointer() return 0, nil, t.Ptrace(req, pid, addr, data) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_time.go000066400000000000000000000213141465435605700261300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) // The most significant 29 bits hold either a pid or a file descriptor. func pidOfClockID(c int32) kernel.ThreadID { return kernel.ThreadID(^(c >> 3)) } // whichCPUClock returns one of CPUCLOCK_PERF, CPUCLOCK_VIRT, CPUCLOCK_SCHED or // CLOCK_FD. func whichCPUClock(c int32) int32 { return c & linux.CPUCLOCK_CLOCK_MASK } // isCPUClockPerThread returns true if the CPUCLOCK_PERTHREAD bit is set in the // clock id. func isCPUClockPerThread(c int32) bool { return c&linux.CPUCLOCK_PERTHREAD_MASK != 0 } // isValidCPUClock returns checks that the cpu clock id is valid. func isValidCPUClock(c int32) bool { // Bits 0, 1, and 2 cannot all be set. if c&7 == 7 { return false } if whichCPUClock(c) >= linux.CPUCLOCK_MAX { return false } return true } // targetTask returns the kernel.Task for the given clock id. func targetTask(t *kernel.Task, c int32) *kernel.Task { pid := pidOfClockID(c) if pid == 0 { return t } return t.PIDNamespace().TaskWithID(pid) } // ClockGetres implements linux syscall clock_getres(2). func ClockGetres(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { clockID := int32(args[0].Int()) addr := args[1].Pointer() r := linux.Timespec{ Sec: 0, Nsec: 1, } if _, err := getClock(t, clockID); err != nil { return 0, nil, linuxerr.EINVAL } if addr == 0 { // Don't need to copy out. return 0, nil, nil } return 0, nil, copyTimespecOut(t, addr, &r) } type cpuClocker interface { UserCPUClock() ktime.Clock CPUClock() ktime.Clock } func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) { if clockID < 0 { if !isValidCPUClock(clockID) { return nil, linuxerr.EINVAL } targetTask := targetTask(t, clockID) if targetTask == nil { return nil, linuxerr.EINVAL } var target cpuClocker if isCPUClockPerThread(clockID) { target = targetTask } else { target = targetTask.ThreadGroup() } switch whichCPUClock(clockID) { case linux.CPUCLOCK_VIRT: return target.UserCPUClock(), nil case linux.CPUCLOCK_PROF, linux.CPUCLOCK_SCHED: // CPUCLOCK_SCHED is approximated by CPUCLOCK_PROF. 
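		// For example, a clock ID of ((^2)<<3)|CPUCLOCK_SCHED (i.e. -22)
		// names the CPU clock of the thread group containing TID 2;
		// like CPUCLOCK_PROF it is served by target.CPUClock() below.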
return target.CPUClock(), nil default: return nil, linuxerr.EINVAL } } switch clockID { case linux.CLOCK_REALTIME, linux.CLOCK_REALTIME_COARSE: return t.Kernel().RealtimeClock(), nil case linux.CLOCK_MONOTONIC, linux.CLOCK_MONOTONIC_COARSE, linux.CLOCK_MONOTONIC_RAW, linux.CLOCK_BOOTTIME: // CLOCK_MONOTONIC approximates CLOCK_MONOTONIC_RAW. // CLOCK_BOOTTIME is internally mapped to CLOCK_MONOTONIC, as: // - CLOCK_BOOTTIME should behave as CLOCK_MONOTONIC while also // including suspend time. // - gVisor has no concept of suspend/resume. // - CLOCK_MONOTONIC already includes save/restore time, which is // the closest to suspend time. return t.Kernel().MonotonicClock(), nil case linux.CLOCK_PROCESS_CPUTIME_ID: return t.ThreadGroup().CPUClock(), nil case linux.CLOCK_THREAD_CPUTIME_ID: return t.CPUClock(), nil default: return nil, linuxerr.EINVAL } } // ClockGettime implements linux syscall clock_gettime(2). func ClockGettime(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { clockID := int32(args[0].Int()) addr := args[1].Pointer() c, err := getClock(t, clockID) if err != nil { return 0, nil, err } ts := c.Now().Timespec() return 0, nil, copyTimespecOut(t, addr, &ts) } // ClockSettime implements linux syscall clock_settime(2). func ClockSettime(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, linuxerr.EPERM } // Time implements linux syscall time(2). func Time(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() r := t.Kernel().RealtimeClock().Now().TimeT() if addr == hostarch.Addr(0) { return uintptr(r), nil, nil } if _, err := r.CopyOut(t, addr); err != nil { return 0, nil, err } return uintptr(r), nil, nil } // Nanosleep implements linux syscall Nanosleep(2). func Nanosleep(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() rem := args[1].Pointer() ts, err := copyTimespecIn(t, addr) if err != nil { return 0, nil, err } if !ts.Valid() { return 0, nil, linuxerr.EINVAL } // Just like linux, we cap the timeout with the max number that int64 can // represent which is roughly 292 years. dur := time.Duration(ts.ToNsecCapped()) * time.Nanosecond c := t.Kernel().MonotonicClock() return 0, nil, clockNanosleepUntil(t, c, c.Now().Add(dur), rem, true) } // ClockNanosleep implements linux syscall clock_nanosleep(2). func ClockNanosleep(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { clockID := int32(args[0].Int()) flags := args[1].Int() addr := args[2].Pointer() rem := args[3].Pointer() req, err := copyTimespecIn(t, addr) if err != nil { return 0, nil, err } if !req.Valid() { return 0, nil, linuxerr.EINVAL } // Only allow clock constants also allowed by Linux. (CLOCK_TAI is // unimplemented.) if clockID > 0 { if clockID != linux.CLOCK_REALTIME && clockID != linux.CLOCK_MONOTONIC && clockID != linux.CLOCK_BOOTTIME && clockID != linux.CLOCK_PROCESS_CPUTIME_ID { return 0, nil, linuxerr.EINVAL } } c, err := getClock(t, clockID) if err != nil { return 0, nil, err } if flags&linux.TIMER_ABSTIME != 0 { return 0, nil, clockNanosleepUntil(t, c, ktime.FromTimespec(req), 0, false) } dur := time.Duration(req.ToNsecCapped()) * time.Nanosecond return 0, nil, clockNanosleepUntil(t, c, c.Now().Add(dur), rem, true) } // clockNanosleepUntil blocks until a specified time. 
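//
// end is interpreted as an absolute time on clock c; callers express relative
// sleeps as c.Now().Add(dur).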
// // If blocking is interrupted, the syscall is restarted with the original // arguments. func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, end ktime.Time, rem hostarch.Addr, needRestartBlock bool) error { err := t.BlockWithDeadlineFrom(nil, c, true, end) switch { case linuxerr.Equals(linuxerr.ETIMEDOUT, err): // Slept for entire timeout. return nil case err == linuxerr.ErrInterrupted: // Interrupted. remaining := end.Sub(c.Now()) if remaining <= 0 { return nil } // Copy out remaining time. if rem != 0 { timeleft := linux.NsecToTimespec(remaining.Nanoseconds()) if err := copyTimespecOut(t, rem, &timeleft); err != nil { return err } } if needRestartBlock { // Arrange for a restart with the remaining duration. t.SetSyscallRestartBlock(&clockNanosleepRestartBlock{ c: c, end: end, rem: rem, }) return linuxerr.ERESTART_RESTARTBLOCK } return linuxerr.ERESTARTNOHAND default: panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err)) } } // clockNanosleepRestartBlock encapsulates the state required to restart // clock_nanosleep(2) via restart_syscall(2). // // +stateify savable type clockNanosleepRestartBlock struct { c ktime.Clock end ktime.Time rem hostarch.Addr } // Restart implements kernel.SyscallRestartBlock.Restart. func (n *clockNanosleepRestartBlock) Restart(t *kernel.Task) (uintptr, error) { return 0, clockNanosleepUntil(t, n.c, n.end, n.rem, true) } // Gettimeofday implements linux syscall gettimeofday(2). func Gettimeofday(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tv := args[0].Pointer() tz := args[1].Pointer() if tv != hostarch.Addr(0) { nowTv := t.Kernel().RealtimeClock().Now().Timeval() if err := copyTimevalOut(t, tv, &nowTv); err != nil { return 0, nil, err } } if tz != hostarch.Addr(0) { // Ask the time package for the timezone. _, offset := time.Now().Zone() // This int32 array mimics linux's struct timezone. timezone := []int32{-int32(offset) / 60, 0} _, err := primitive.CopyInt32SliceOut(t, tz, timezone) return 0, nil, err } return 0, nil, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_timer.go000066400000000000000000000125121465435605700263120ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) const nsecPerSec = int64(time.Second) // Getitimer implements linux syscall getitimer(2). func Getitimer(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if t.Arch().Width() != 8 { // Definition of linux.ItimerVal assumes 64-bit architecture. return 0, nil, linuxerr.ENOSYS } timerID := args[0].Int() addr := args[1].Pointer() olditv, err := t.Getitimer(timerID) if err != nil { return 0, nil, err } // A NULL address is allowed, in which case no copy out takes place. 
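	// For example, getitimer(ITIMER_REAL, NULL) returns 0 here without
	// writing anything back to user memory.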
if addr == 0 { return 0, nil, nil } _, err = olditv.CopyOut(t, addr) return 0, nil, err } // Setitimer implements linux syscall setitimer(2). func Setitimer(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if t.Arch().Width() != 8 { // Definition of linux.ItimerVal assumes 64-bit architecture. return 0, nil, linuxerr.ENOSYS } timerID := args[0].Int() newAddr := args[1].Pointer() oldAddr := args[2].Pointer() var newitv linux.ItimerVal // A NULL address is allowed because because Linux allows // setitimer(which, NULL, &old_value) which disables the timer. There is a // KERN_WARN message saying this misfeature will be removed. However, that // hasn't happened as of 3.19, so we continue to support it. if newAddr != 0 { if _, err := newitv.CopyIn(t, newAddr); err != nil { return 0, nil, err } } olditv, err := t.Setitimer(timerID, newitv) if err != nil { return 0, nil, err } // A NULL address is allowed, in which case no copy out takes place. if oldAddr == 0 { return 0, nil, nil } _, err = olditv.CopyOut(t, oldAddr) return 0, nil, err } // Alarm implements linux syscall alarm(2). func Alarm(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { duration := time.Duration(args[0].Uint()) * time.Second olditv, err := t.Setitimer(linux.ITIMER_REAL, linux.ItimerVal{ Value: linux.DurationToTimeval(duration), }) if err != nil { return 0, nil, err } olddur := olditv.Value.ToDuration() secs := olddur.Round(time.Second).Nanoseconds() / nsecPerSec if secs == 0 && olddur != 0 { // We can't return 0 if an alarm was previously scheduled. secs = 1 } return uintptr(secs), nil, nil } // TimerCreate implements linux syscall timer_create(2). func TimerCreate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { clockID := args[0].Int() sevp := args[1].Pointer() timerIDp := args[2].Pointer() c, err := getClock(t, clockID) if err != nil { return 0, nil, err } var sev *linux.Sigevent if sevp != 0 { sev = &linux.Sigevent{} if _, err = sev.CopyIn(t, sevp); err != nil { return 0, nil, err } } id, err := t.IntervalTimerCreate(c, sev) if err != nil { return 0, nil, err } if _, err := id.CopyOut(t, timerIDp); err != nil { t.IntervalTimerDelete(id) return 0, nil, err } return 0, nil, nil } // TimerSettime implements linux syscall timer_settime(2). func TimerSettime(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { timerID := linux.TimerID(args[0].Value) flags := args[1].Int() newValAddr := args[2].Pointer() oldValAddr := args[3].Pointer() var newVal linux.Itimerspec if _, err := newVal.CopyIn(t, newValAddr); err != nil { return 0, nil, err } oldVal, err := t.IntervalTimerSettime(timerID, newVal, flags&linux.TIMER_ABSTIME != 0) if err != nil { return 0, nil, err } if oldValAddr != 0 { _, err = oldVal.CopyOut(t, oldValAddr) return 0, nil, err } return 0, nil, nil } // TimerGettime implements linux syscall timer_gettime(2). func TimerGettime(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { timerID := linux.TimerID(args[0].Value) curValAddr := args[1].Pointer() curVal, err := t.IntervalTimerGettime(timerID) if err != nil { return 0, nil, err } _, err = curVal.CopyOut(t, curValAddr) return 0, nil, err } // TimerGetoverrun implements linux syscall timer_getoverrun(2). 
func TimerGetoverrun(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { timerID := linux.TimerID(args[0].Value) o, err := t.IntervalTimerGetoverrun(timerID) if err != nil { return 0, nil, err } return uintptr(o), nil, nil } // TimerDelete implements linux syscall timer_delete(2). func TimerDelete(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { timerID := linux.TimerID(args[0].Value) return 0, nil, t.IntervalTimerDelete(timerID) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_timerfd.go000066400000000000000000000071441465435605700266310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) // TimerfdCreate implements Linux syscall timerfd_create(2). func TimerfdCreate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { clockID := args[0].Int() flags := args[1].Int() if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 { return 0, nil, linuxerr.EINVAL } // Timerfds aren't writable per se (their implementation of Write just // returns EINVAL), but they are "opened for writing", which is necessary // to actually reach said implementation of Write. fileFlags := uint32(linux.O_RDWR) if flags&linux.TFD_NONBLOCK != 0 { fileFlags |= linux.O_NONBLOCK } var clock ktime.Clock switch clockID { case linux.CLOCK_REALTIME: clock = t.Kernel().RealtimeClock() case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME: clock = t.Kernel().MonotonicClock() default: return 0, nil, linuxerr.EINVAL } vfsObj := t.Kernel().VFS() file, err := timerfd.New(t, vfsObj, clock, fileFlags) if err != nil { return 0, nil, err } defer file.DecRef(t) fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.TFD_CLOEXEC != 0, }) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil } // TimerfdSettime implements Linux syscall timerfd_settime(2). 
func TimerfdSettime(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() flags := args[1].Int() newValAddr := args[2].Pointer() oldValAddr := args[3].Pointer() if flags&^(linux.TFD_TIMER_ABSTIME) != 0 { return 0, nil, linuxerr.EINVAL } file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { return 0, nil, linuxerr.EINVAL } var newVal linux.Itimerspec if _, err := newVal.CopyIn(t, newValAddr); err != nil { return 0, nil, err } newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock()) if err != nil { return 0, nil, err } tm, oldS := tfd.SetTime(newS) if oldValAddr != 0 { oldVal := ktime.ItimerspecFromSetting(tm, oldS) if _, err := oldVal.CopyOut(t, oldValAddr); err != nil { return 0, nil, err } } return 0, nil, nil } // TimerfdGettime implements Linux syscall timerfd_gettime(2). func TimerfdGettime(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() curValAddr := args[1].Pointer() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { return 0, nil, linuxerr.EINVAL } tm, s := tfd.GetTime() curVal := ktime.ItimerspecFromSetting(tm, s) _, err := curVal.CopyOut(t, curValAddr) return 0, nil, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_tls_amd64.go000066400000000000000000000032321465435605700267660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // ArchPrctl implements linux syscall arch_prctl(2). // It sets architecture-specific process or thread state for t. func ArchPrctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { switch args[0].Int() { case linux.ARCH_GET_FS: addr := args[1].Pointer() fsbase := t.Arch().TLS() switch t.Arch().Width() { case 8: if _, err := primitive.CopyUint64Out(t, addr, uint64(fsbase)); err != nil { return 0, nil, err } default: return 0, nil, linuxerr.ENOSYS } case linux.ARCH_SET_FS: fsbase := args[1].Uint64() if !t.Arch().SetTLS(uintptr(fsbase)) { return 0, nil, linuxerr.EPERM } case linux.ARCH_GET_GS, linux.ARCH_SET_GS: t.Kernel().EmitUnimplementedEvent(t, sysno) fallthrough default: return 0, nil, linuxerr.EINVAL } return 0, nil, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_tls_arm64.go000066400000000000000000000017051465435605700270070ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package linux import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // ArchPrctl is not defined for ARM64. func ArchPrctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, linuxerr.ENOSYS } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_utsname.go000066400000000000000000000052311465435605700266460ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // Uname implements linux syscall uname. func Uname(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { version := t.SyscallTable().Version uts := t.UTSNamespace() // Fill in structure fields. var u linux.UtsName copy(u.Sysname[:], version.Sysname) copy(u.Nodename[:], uts.HostName()) copy(u.Release[:], version.Release) copy(u.Version[:], version.Version) // build tag above. switch t.SyscallTable().Arch { case arch.AMD64: copy(u.Machine[:], "x86_64") case arch.ARM64: copy(u.Machine[:], "aarch64") default: copy(u.Machine[:], "unknown") } copy(u.Domainname[:], uts.DomainName()) // Copy out the result. va := args[0].Pointer() _, err := u.CopyOut(t, va) return 0, nil, err } // Setdomainname implements Linux syscall setdomainname. func Setdomainname(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nameAddr := args[0].Pointer() size := args[1].Int() utsns := t.UTSNamespace() if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) { return 0, nil, linuxerr.EPERM } if size < 0 || size > linux.UTSLen { return 0, nil, linuxerr.EINVAL } name, err := t.CopyInString(nameAddr, int(size)) if err != nil { return 0, nil, err } utsns.SetDomainName(name) return 0, nil, nil } // Sethostname implements Linux syscall sethostname. 
func Sethostname(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nameAddr := args[0].Pointer() size := args[1].Int() utsns := t.UTSNamespace() if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, utsns.UserNamespace()) { return 0, nil, linuxerr.EPERM } if size < 0 || size > linux.UTSLen { return 0, nil, linuxerr.EINVAL } name := make([]byte, size) if _, err := t.CopyInBytes(nameAddr, name); err != nil { return 0, nil, err } utsns.SetHostName(string(name)) return 0, nil, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/sys_xattr.go000066400000000000000000000236301465435605700263370ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "bytes" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // ListXattr implements Linux syscall listxattr(2). func ListXattr(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return listxattr(t, args, followFinalSymlink) } // Llistxattr implements Linux syscall llistxattr(2). func Llistxattr(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return listxattr(t, args, nofollowFinalSymlink) } func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() listAddr := args[1].Pointer() size := args[2].SizeT() path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) names, err := t.Kernel().VFS().ListXattrAt(t, t.Credentials(), &tpop.pop, uint64(size)) if err != nil { return 0, nil, err } n, err := copyOutXattrNameList(t, listAddr, size, names) if err != nil { return 0, nil, err } return uintptr(n), nil, nil } // Flistxattr implements Linux syscall flistxattr(2). func Flistxattr(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() listAddr := args[1].Pointer() size := args[2].SizeT() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) names, err := file.ListXattr(t, uint64(size)) if err != nil { return 0, nil, err } n, err := copyOutXattrNameList(t, listAddr, size, names) if err != nil { return 0, nil, err } return uintptr(n), nil, nil } // GetXattr implements Linux syscall getxattr(2). func GetXattr(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getxattr(t, args, followFinalSymlink) } // Lgetxattr implements Linux syscall lgetxattr(2). 
func Lgetxattr(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getxattr(t, args, nofollowFinalSymlink) } func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() nameAddr := args[1].Pointer() valueAddr := args[2].Pointer() size := args[3].SizeT() if size > linux.XATTR_SIZE_MAX { size = linux.XATTR_SIZE_MAX } path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return 0, nil, err } value, err := t.Kernel().VFS().GetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetXattrOptions{ Name: name, Size: uint64(size), }) if err != nil { return 0, nil, err } n, err := copyOutXattrValue(t, valueAddr, size, value) if err != nil { return 0, nil, err } return uintptr(n), nil, nil } // Fgetxattr implements Linux syscall fgetxattr(2). func Fgetxattr(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() nameAddr := args[1].Pointer() valueAddr := args[2].Pointer() size := args[3].SizeT() if size > linux.XATTR_SIZE_MAX { size = linux.XATTR_SIZE_MAX } file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return 0, nil, err } value, err := file.GetXattr(t, &vfs.GetXattrOptions{Name: name, Size: uint64(size)}) if err != nil { return 0, nil, err } n, err := copyOutXattrValue(t, valueAddr, size, value) if err != nil { return 0, nil, err } return uintptr(n), nil, nil } // SetXattr implements Linux syscall setxattr(2). func SetXattr(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, setxattr(t, args, followFinalSymlink) } // Lsetxattr implements Linux syscall lsetxattr(2). func Lsetxattr(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, setxattr(t, args, nofollowFinalSymlink) } func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error { pathAddr := args[0].Pointer() nameAddr := args[1].Pointer() valueAddr := args[2].Pointer() size := args[3].SizeT() flags := args[4].Int() if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { return linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) if err != nil { return err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return err } value, err := copyInXattrValue(t, valueAddr, size) if err != nil { return err } return t.Kernel().VFS().SetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetXattrOptions{ Name: name, Value: value, Flags: uint32(flags), }) } // Fsetxattr implements Linux syscall fsetxattr(2). 
func Fsetxattr(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() nameAddr := args[1].Pointer() valueAddr := args[2].Pointer() size := args[3].SizeT() flags := args[4].Int() if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { return 0, nil, linuxerr.EINVAL } file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return 0, nil, err } value, err := copyInXattrValue(t, valueAddr, size) if err != nil { return 0, nil, err } return 0, nil, file.SetXattr(t, &vfs.SetXattrOptions{ Name: name, Value: value, Flags: uint32(flags), }) } // RemoveXattr implements Linux syscall removexattr(2). func RemoveXattr(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, removexattr(t, args, followFinalSymlink) } // Lremovexattr implements Linux syscall lremovexattr(2). func Lremovexattr(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, removexattr(t, args, nofollowFinalSymlink) } func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error { pathAddr := args[0].Pointer() nameAddr := args[1].Pointer() path, err := copyInPath(t, pathAddr) if err != nil { return err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return err } return t.Kernel().VFS().RemoveXattrAt(t, t.Credentials(), &tpop.pop, name) } // Fremovexattr implements Linux syscall fremovexattr(2). func Fremovexattr(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() nameAddr := args[1].Pointer() file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return 0, nil, err } return 0, nil, file.RemoveXattr(t, name) } func copyInXattrName(t *kernel.Task, nameAddr hostarch.Addr) (string, error) { name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) if err != nil { if linuxerr.Equals(linuxerr.ENAMETOOLONG, err) { return "", linuxerr.ERANGE } return "", err } if len(name) == 0 { return "", linuxerr.ERANGE } return name, nil } func copyOutXattrNameList(t *kernel.Task, listAddr hostarch.Addr, size uint, names []string) (int, error) { if size > linux.XATTR_LIST_MAX { size = linux.XATTR_LIST_MAX } var buf bytes.Buffer for _, name := range names { buf.WriteString(name) buf.WriteByte(0) } if size == 0 { // Return the size that would be required to accommodate the list. return buf.Len(), nil } if buf.Len() > int(size) { if size >= linux.XATTR_LIST_MAX { return 0, linuxerr.E2BIG } return 0, linuxerr.ERANGE } return t.CopyOutBytes(listAddr, buf.Bytes()) } func copyInXattrValue(t *kernel.Task, valueAddr hostarch.Addr, size uint) (string, error) { if size > linux.XATTR_SIZE_MAX { return "", linuxerr.E2BIG } buf := make([]byte, size) if _, err := t.CopyInBytes(valueAddr, buf); err != nil { return "", err } return gohacks.StringFromImmutableBytes(buf), nil } func copyOutXattrValue(t *kernel.Task, valueAddr hostarch.Addr, size uint, value string) (int, error) { if size == 0 { // Return the size that would be required to accommodate the value. 
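		// This supports the usual sizing probe, e.g. getxattr(path, name,
		// NULL, 0), which reports the value's length without copying it.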
return len(value), nil } if len(value) > int(size) { if size >= linux.XATTR_SIZE_MAX { return 0, linuxerr.E2BIG } return 0, linuxerr.ERANGE } return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/linux/timespec.go000066400000000000000000000065401465435605700261110ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // copyTimespecIn copies a Timespec from the untrusted app range to the kernel. func copyTimespecIn(t *kernel.Task, addr hostarch.Addr) (linux.Timespec, error) { switch t.Arch().Width() { case 8: ts := linux.Timespec{} in := t.CopyScratchBuffer(16) _, err := t.CopyInBytes(addr, in) if err != nil { return ts, err } ts.Sec = int64(hostarch.ByteOrder.Uint64(in[0:])) ts.Nsec = int64(hostarch.ByteOrder.Uint64(in[8:])) return ts, nil default: return linux.Timespec{}, linuxerr.ENOSYS } } // copyTimespecOut copies a Timespec to the untrusted app range. func copyTimespecOut(t *kernel.Task, addr hostarch.Addr, ts *linux.Timespec) error { switch t.Arch().Width() { case 8: out := t.CopyScratchBuffer(16) hostarch.ByteOrder.PutUint64(out[0:], uint64(ts.Sec)) hostarch.ByteOrder.PutUint64(out[8:], uint64(ts.Nsec)) _, err := t.CopyOutBytes(addr, out) return err default: return linuxerr.ENOSYS } } // copyTimevalIn copies a Timeval from the untrusted app range to the kernel. func copyTimevalIn(t *kernel.Task, addr hostarch.Addr) (linux.Timeval, error) { switch t.Arch().Width() { case 8: tv := linux.Timeval{} in := t.CopyScratchBuffer(16) _, err := t.CopyInBytes(addr, in) if err != nil { return tv, err } tv.Sec = int64(hostarch.ByteOrder.Uint64(in[0:])) tv.Usec = int64(hostarch.ByteOrder.Uint64(in[8:])) return tv, nil default: return linux.Timeval{}, linuxerr.ENOSYS } } // copyTimevalOut copies a Timeval to the untrusted app range. func copyTimevalOut(t *kernel.Task, addr hostarch.Addr, tv *linux.Timeval) error { switch t.Arch().Width() { case 8: out := t.CopyScratchBuffer(16) hostarch.ByteOrder.PutUint64(out[0:], uint64(tv.Sec)) hostarch.ByteOrder.PutUint64(out[8:], uint64(tv.Usec)) _, err := t.CopyOutBytes(addr, out) return err default: return linuxerr.ENOSYS } } // copyTimespecInToDuration copies a Timespec from the untrusted app range, // validates it and converts it to a Duration. // // If the Timespec is larger than what can be represented in a Duration, the // returned value is the maximum that Duration will allow. // // If timespecAddr is NULL, the returned value is negative. func copyTimespecInToDuration(t *kernel.Task, timespecAddr hostarch.Addr) (time.Duration, error) { // Use a negative Duration to indicate "no timeout". 
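	// Callers (e.g. ppoll(2)) treat a negative duration as "block
	// indefinitely", matching the behavior of a NULL timespec in Linux.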
timeout := time.Duration(-1) if timespecAddr != 0 { var timespec linux.Timespec if _, err := timespec.CopyIn(t, timespecAddr); err != nil { return 0, err } if !timespec.Valid() { return 0, linuxerr.EINVAL } timeout = time.Duration(timespec.ToNsecCapped()) } return timeout, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/syscalls.go000066400000000000000000000110011465435605700247620ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package syscalls is the interface from the application to the kernel. // Traditionally, syscalls is the interface that is used by applications to // request services from the kernel of a operating system. We provide a // user-mode kernel that needs to handle those requests coming from unmodified // applications. Therefore, we still use the term "syscalls" to denote this // interface. // // Note that the stubs in this package may merely provide the interface, not // the actual implementation. It just makes writing syscall stubs // straightforward. package syscalls import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // Supported returns a syscall that is fully supported. func Supported(name string, fn kernel.SyscallFn) kernel.Syscall { return kernel.Syscall{ Name: name, Fn: fn, SupportLevel: kernel.SupportFull, Note: "Fully Supported.", } } // SupportedPoint returns a syscall that is fully supported with a corresponding // seccheck.Point. func SupportedPoint(name string, fn kernel.SyscallFn, cb kernel.SyscallToProto) kernel.Syscall { sys := Supported(name, fn) sys.PointCallback = cb return sys } // PartiallySupported returns a syscall that has a partial implementation. func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []string) kernel.Syscall { return kernel.Syscall{ Name: name, Fn: fn, SupportLevel: kernel.SupportPartial, Note: note, URLs: urls, } } // PartiallySupportedPoint returns a syscall that has a partial implementation // with a corresponding seccheck.Point. func PartiallySupportedPoint(name string, fn kernel.SyscallFn, cb kernel.SyscallToProto, note string, urls []string) kernel.Syscall { sys := PartiallySupported(name, fn, note, urls) sys.PointCallback = cb return sys } // Error returns a syscall handler that will always give the passed error. func Error(name string, err error, note string, urls []string) kernel.Syscall { if note != "" { note = note + "; " } return kernel.Syscall{ Name: name, Fn: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { kernel.IncrementUnimplementedSyscallCounter(sysno) return 0, nil, err }, SupportLevel: kernel.SupportUnimplemented, Note: fmt.Sprintf("%sReturns %q.", note, err.Error()), URLs: urls, } } // ErrorWithEvent gives a syscall function that sends an unimplemented // syscall event via the event channel and returns the passed error. 
func ErrorWithEvent(name string, err error, note string, urls []string) kernel.Syscall { if note != "" { note = note + "; " } return kernel.Syscall{ Name: name, Fn: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { t.Kernel().EmitUnimplementedEvent(t, sysno) return 0, nil, err }, SupportLevel: kernel.SupportUnimplemented, Note: fmt.Sprintf("%sReturns %q.", note, err.Error()), URLs: urls, } } // CapError gives a syscall function that checks for capability c. If the task // has the capability, it returns ENOSYS, otherwise EPERM. To unprivileged // tasks, it will seem like there is an implementation. func CapError(name string, c linux.Capability, note string, urls []string) kernel.Syscall { if note != "" { note = note + "; " } return kernel.Syscall{ Name: name, Fn: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if !t.HasCapability(c) { return 0, nil, linuxerr.EPERM } t.Kernel().EmitUnimplementedEvent(t, sysno) return 0, nil, linuxerr.ENOSYS }, SupportLevel: kernel.SupportUnimplemented, Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, linuxerr.EPERM, c.String(), linuxerr.ENOSYS), URLs: urls, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/syscalls/syscalls_state_autogen.go000066400000000000000000000000721465435605700277120ustar00rootroot00000000000000// automatically generated by stateify. package syscalls golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/000077500000000000000000000000001465435605700217065ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/arith_arm64.go000066400000000000000000000026161465435605700243620ustar00rootroot00000000000000// Copyright 2009 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // This file provides a generic Go implementation of uint128 divided by uint64. // The code is derived from Go's generic math/big.divWW_g // (src/math/big/arith.go), but is only used on ARM64. package time import "math/bits" type word uint const ( _W = bits.UintSize // word size in bits _W2 = _W / 2 // half word size in bits _B2 = 1 << _W2 // half digit base _M2 = _B2 - 1 // half digit mask ) // nlz returns the number of leading zeros in x. // Wraps bits.LeadingZeros call for convenience. func nlz(x word) uint { return uint(bits.LeadingZeros(uint(x))) } // q = (u1<<_W + u0 - r)/y // Adapted from Warren, Hacker's Delight, p. 152. func divWW(u1, u0, v word) (q, r word) { if u1 >= v { return 1<<_W - 1, 1<<_W - 1 } s := nlz(v) v <<= s vn1 := v >> _W2 vn0 := v & _M2 un32 := u1<>(_W-s) un10 := u0 << s un1 := un10 >> _W2 un0 := un10 & _M2 q1 := un32 / vn1 rhat := un32 - q1*vn1 for q1 >= _B2 || q1*vn0 > _B2*rhat+un1 { q1-- rhat += vn1 if rhat >= _B2 { break } } un21 := un32*_B2 + un1 - q1*v q0 := un21 / vn1 rhat = un21 - q0*vn1 for q0 >= _B2 || q0*vn0 > _B2*rhat+un0 { q0-- rhat += vn1 if rhat >= _B2 { break } } return q1*_B2 + q0, (un21*_B2 + un0 - q0*v) >> s } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/calibrated_clock.go000066400000000000000000000170361465435605700255110ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package time provides a calibrated clock synchronized to a system reference // clock. package time import ( "time" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sync" ) // CalibratedClock implements a clock that tracks a reference clock. // // Users should call Update at regular intervals of around approxUpdateInterval // to ensure that the clock does not drift significantly from the reference // clock. type CalibratedClock struct { // mu protects the fields below. // TODO(mpratt): consider a sequence counter for read locking. mu sync.RWMutex // ref sample the reference clock that this clock is calibrated // against. ref *sampler // ready indicates that the fields below are ready for use calculating // time. ready bool // params are the current timekeeping parameters. params Parameters // errorNS is the estimated clock error in nanoseconds. errorNS ReferenceNS } // NewCalibratedClock creates a CalibratedClock that tracks the given ClockID. func NewCalibratedClock(c ClockID) *CalibratedClock { return &CalibratedClock{ ref: newSampler(c), } } // Debugf logs at debug level. func (c *CalibratedClock) Debugf(format string, v ...any) { if log.IsLogging(log.Debug) { args := []any{c.ref.clockID} args = append(args, v...) log.Debugf("CalibratedClock(%v): "+format, args...) } } // Infof logs at debug level. func (c *CalibratedClock) Infof(format string, v ...any) { if log.IsLogging(log.Info) { args := []any{c.ref.clockID} args = append(args, v...) log.Infof("CalibratedClock(%v): "+format, args...) } } // Warningf logs at debug level. func (c *CalibratedClock) Warningf(format string, v ...any) { if log.IsLogging(log.Warning) { args := []any{c.ref.clockID} args = append(args, v...) log.Warningf("CalibratedClock(%v): "+format, args...) } } // reset forces the clock to restart the calibration process, logging the // passed message. func (c *CalibratedClock) reset(str string, v ...any) { c.mu.Lock() defer c.mu.Unlock() c.resetLocked(str, v...) } // resetLocked is equivalent to reset with c.mu already held for writing. func (c *CalibratedClock) resetLocked(str string, v ...any) { c.Warningf(str+" Resetting clock; time may jump.", v...) c.ready = false c.ref.Reset() metric.WeirdnessMetric.Increment(&metric.WeirdnessTypeTimeFallback) } // updateParams updates the timekeeping parameters based on the passed // parameters. // // actual is the actual estimated timekeeping parameters. The stored parameters // may need to be adjusted slightly from these values to compensate for error. // // Preconditions: c.mu must be held for writing. func (c *CalibratedClock) updateParams(actual Parameters) { if !c.ready { // At initial calibration there is nothing to correct. c.params = actual c.ready = true c.Infof("ready") return } // Otherwise, adjust the params to correct for errors. newParams, errorNS, err := errorAdjust(c.params, actual, actual.BaseCycles) if err != nil { // Something is very wrong. Reset and try again from the // beginning. 
c.resetLocked("Unable to update params: %v.", err) return } logErrorAdjustment(c.ref.clockID, errorNS, c.params, newParams) if errorNS.Magnitude() >= MaxClockError { // We should never get such extreme error, something is very // wrong. Reset everything and start again. // // N.B. logErrorAdjustment will have already logged the error // at warning level. // // TODO(mpratt): We could allow Realtime clock jumps here. c.resetLocked("Extreme clock error.") return } c.params = newParams c.errorNS = errorNS } // Update runs the update step of the clock, updating its synchronization with // the reference clock. // // Update returns timekeeping and true with the new timekeeping parameters if // the clock is calibrated. Update should be called regularly to prevent the // clock from getting significantly out of sync from the reference clock. // // The returned timekeeping parameters are invalidated on the next call to // Update. func (c *CalibratedClock) Update() (Parameters, bool) { c.mu.Lock() defer c.mu.Unlock() if err := c.ref.Sample(); err != nil { c.resetLocked("Unable to update calibrated clock: %v.", err) return Parameters{}, false } oldest, newest, ok := c.ref.Range() if !ok { // Not ready yet. return Parameters{}, false } minCount := uint64(newest.before - oldest.after) maxCount := uint64(newest.after - oldest.before) refInterval := uint64(newest.ref - oldest.ref) // freq hz = count / (interval ns) * (nsPerS ns) / (1 s) nsPerS := uint64(time.Second.Nanoseconds()) minHz, ok := muldiv64(minCount, nsPerS, refInterval) if !ok { c.resetLocked("Unable to update calibrated clock: (%v - %v) * %v / %v overflows.", newest.before, oldest.after, nsPerS, refInterval) return Parameters{}, false } maxHz, ok := muldiv64(maxCount, nsPerS, refInterval) if !ok { c.resetLocked("Unable to update calibrated clock: (%v - %v) * %v / %v overflows.", newest.after, oldest.before, nsPerS, refInterval) return Parameters{}, false } c.updateParams(Parameters{ Frequency: (minHz + maxHz) / 2, BaseRef: newest.ref, BaseCycles: newest.after, }) return c.params, true } // GetTime returns the current time based on the clock calibration. func (c *CalibratedClock) GetTime() (int64, error) { c.mu.RLock() if !c.ready { // Fallback to a syscall. now, err := c.ref.Syscall() c.mu.RUnlock() return int64(now), err } now := c.ref.Cycles() v, ok := c.params.ComputeTime(now) if !ok { // Something is seriously wrong with the clock. Try // again with syscalls. c.resetLocked("Time computation overflowed. params = %+v, now = %v.", c.params, now) now, err := c.ref.Syscall() c.mu.RUnlock() return int64(now), err } c.mu.RUnlock() return v, nil } // CalibratedClocks contains calibrated monotonic and realtime clocks. // // TODO(mpratt): We know that Linux runs the monotonic and realtime clocks at // the same rate, so rather than tracking both individually, we could do one // calibration for both clocks. type CalibratedClocks struct { // monotonic is the clock tracking the system monotonic clock. monotonic *CalibratedClock // realtime is the realtime equivalent of monotonic. realtime *CalibratedClock } // NewCalibratedClocks creates a CalibratedClocks. func NewCalibratedClocks() *CalibratedClocks { return &CalibratedClocks{ monotonic: NewCalibratedClock(Monotonic), realtime: NewCalibratedClock(Realtime), } } // Update implements Clocks.Update. 
func (c *CalibratedClocks) Update() (Parameters, bool, Parameters, bool) { monotonicParams, monotonicOk := c.monotonic.Update() realtimeParams, realtimeOk := c.realtime.Update() return monotonicParams, monotonicOk, realtimeParams, realtimeOk } // GetTime implements Clocks.GetTime. func (c *CalibratedClocks) GetTime(id ClockID) (int64, error) { switch id { case Monotonic: return c.monotonic.GetTime() case Realtime: return c.realtime.GetTime() default: return 0, linuxerr.EINVAL } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/clock_id.go000066400000000000000000000017511465435605700240100ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package time import ( "strconv" ) // ClockID is a Linux clock identifier. type ClockID int32 // These are the supported Linux clock identifiers. const ( Realtime ClockID = iota Monotonic ) // String implements fmt.Stringer.String. func (c ClockID) String() string { switch c { case Realtime: return "Realtime" case Monotonic: return "Monotonic" default: return strconv.Itoa(int(c)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/clocks.go000066400000000000000000000023211465435605700235110ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package time // Clocks represents a clock source that contains both a monotonic and realtime // clock. type Clocks interface { // Update performs an update step, keeping the clocks in sync with the // reference host clocks, and returning the new timekeeping parameters. // // Update should be called at approximately ApproxUpdateInterval. Update() (monotonicParams Parameters, monotonicOk bool, realtimeParam Parameters, realtimeOk bool) // GetTime returns the current time in nanoseconds for the given clock. // // Clocks implementations must support at least Monotonic and // Realtime. GetTime(c ClockID) (int64, error) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/muldiv_amd64.s000066400000000000000000000023061465435605700243660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" // Documentation is available in parameters.go. // // func muldiv64(value, multiplier, divisor uint64) (uint64, bool) TEXT ·muldiv64(SB),NOSPLIT|NOFRAME,$0-33 MOVQ value+0(FP), AX MOVQ multiplier+8(FP), BX MOVQ divisor+16(FP), CX // Multiply AX*BX and store result in DX:AX. MULQ BX // If divisor <= (value*multiplier) / 2^64, then the division will overflow. // // (value*multiplier) / 2^64 is DX:AX >> 64, or simply DX. CMPQ CX, DX JLE overflow // Divide DX:AX by CX. DIVQ CX MOVQ AX, ret+24(FP) MOVB $1, ret1+32(FP) RET overflow: MOVQ $0, ret+24(FP) MOVB $0, ret1+32(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/muldiv_arm64.s000066400000000000000000000024051465435605700244040ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "funcdata.h" #include "textflag.h" // Documentation is available in parameters.go. // // func muldiv64(value, multiplier, divisor uint64) (uint64, bool) TEXT ·muldiv64(SB),NOSPLIT,$40-33 GO_ARGS NO_LOCAL_POINTERS MOVD value+0(FP), R0 MOVD multiplier+8(FP), R1 MOVD divisor+16(FP), R2 UMULH R0, R1, R3 MUL R0, R1, R4 CMP R2, R3 BHS overflow MOVD R3, 8(RSP) MOVD R4, 16(RSP) MOVD R2, 24(RSP) CALL ·divWW(SB) MOVD 32(RSP), R0 MOVD R0, ret+24(FP) MOVD $1, R0 MOVB R0, ret1+32(FP) RET overflow: MOVD ZR, ret+24(FP) MOVB ZR, ret1+32(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/parameters.go000066400000000000000000000207261465435605700244070ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package time import ( "fmt" "time" "gvisor.dev/gvisor/pkg/log" ) const ( // ApproxUpdateInterval is the approximate interval that parameters // should be updated at. // // Error correction assumes that the next update will occur after this // much time. // // If an update occurs before ApproxUpdateInterval passes, it has no // adverse effect on error correction behavior. 
// // If an update occurs after ApproxUpdateInterval passes, the clock // will overshoot its error correction target and begin accumulating // error in the other direction. // // If updates occur after more than 2*ApproxUpdateInterval passes, the // clock becomes unstable, accumulating more error than it had // originally. Repeated updates after more than 2*ApproxUpdateInterval // will cause unbounded increases in error. // // These statements assume that the host clock does not change. Actual // error will depend upon host clock changes. // // TODO(b/68779214): make error correction more robust to delayed // updates. ApproxUpdateInterval = 1 * time.Second // MaxClockError is the maximum amount of error that the clocks will // try to correct. // // This limit: // // * Puts a limit on cases of otherwise unbounded increases in error. // // * Avoids unreasonably large frequency adjustments required to // correct large errors over a single update interval. MaxClockError = ReferenceNS(ApproxUpdateInterval) / 4 ) // Parameters are the timekeeping parameters needed to compute the current // time. type Parameters struct { // BaseCycles was the TSC counter value when the time was BaseRef. BaseCycles TSCValue // BaseRef is the reference clock time in nanoseconds corresponding to // BaseCycles. BaseRef ReferenceNS // Frequency is the frequency of the cycle clock in Hertz. Frequency uint64 } // muldiv64 multiplies two 64-bit numbers, then divides the result by another // 64-bit number. // // It requires that the result fit in 64 bits, but doesn't require that // intermediate values do; in particular, the result of the multiplication may // require 128 bits. // // It returns !ok if divisor is zero or the result does not fit in 64 bits. func muldiv64(value, multiplier, divisor uint64) (uint64, bool) // ComputeTime calculates the current time from a "now" TSC value. // // time = ref + (now - base) / f func (p Parameters) ComputeTime(nowCycles TSCValue) (int64, bool) { diffCycles := nowCycles - p.BaseCycles if diffCycles < 0 { log.Warningf("now cycles %v < base cycles %v", nowCycles, p.BaseCycles) diffCycles = 0 } // Overflow "won't ever happen". If diffCycles is the max value // (2^63 - 1), then to overflow, // // frequency <= ((2^63 - 1) * 10^9) / 2^64 = 500Mhz // // A TSC running at 2GHz takes 201 years to reach 2^63-1. 805 years at // 500MHz. diffNS, ok := muldiv64(uint64(diffCycles), uint64(time.Second.Nanoseconds()), p.Frequency) return int64(uint64(p.BaseRef) + diffNS), ok } // errorAdjust returns a new Parameters struct "adjusted" that satisfies: // // 1. adjusted.ComputeTime(now) = prevParams.ComputeTime(now) // - i.e., the current time does not jump. // // 2. adjusted.ComputeTime(TSC at next update) = newParams.ComputeTime(TSC at next update) // - i.e., Any error between prevParams and newParams will be corrected over // the course of the next update period. // // errorAdjust also returns the current clock error. // // Preconditions: // - newParams.BaseCycles >= prevParams.BaseCycles; i.e., TSC must not go // backwards. // - newParams.BaseCycles <= now; i.e., the new parameters be computed at or // before now. func errorAdjust(prevParams Parameters, newParams Parameters, now TSCValue) (Parameters, ReferenceNS, error) { if newParams.BaseCycles < prevParams.BaseCycles { // Oh dear! Something is very wrong. 
return Parameters{}, 0, fmt.Errorf("TSC went backwards in updated clock params: %v < %v", newParams.BaseCycles, prevParams.BaseCycles) } if newParams.BaseCycles > now { return Parameters{}, 0, fmt.Errorf("parameters contain base cycles later than now: %v > %v", newParams.BaseCycles, now) } intervalNS := int64(ApproxUpdateInterval.Nanoseconds()) nsPerSec := uint64(time.Second.Nanoseconds()) // Current time as computed by prevParams. oldNowNS, ok := prevParams.ComputeTime(now) if !ok { return Parameters{}, 0, fmt.Errorf("old now time computation overflowed. params = %+v, now = %v", prevParams, now) } // We expect the update ticker to run based on this clock (i.e., it has // been using prevParams and will use the returned adjusted // parameters). Hence it will decide to fire intervalNS from the // current (oldNowNS) "now". nextNS := oldNowNS + intervalNS if nextNS <= int64(newParams.BaseRef) { // The next update time already passed before the new // parameters were created! We definitely can't correct the // error by then. return Parameters{}, 0, fmt.Errorf("unable to correct error in single period. oldNowNS = %v, nextNS = %v, p = %v", oldNowNS, nextNS, newParams) } // For what TSC value next will newParams.ComputeTime(next) = nextNS? // // Solve ComputeTime for next: // // next = newParams.Frequency * (nextNS - newParams.BaseRef) + newParams.BaseCycles c, ok := muldiv64(newParams.Frequency, uint64(nextNS-int64(newParams.BaseRef)), nsPerSec) if !ok { return Parameters{}, 0, fmt.Errorf("%v * (%v - %v) / %v overflows", newParams.Frequency, nextNS, newParams.BaseRef, nsPerSec) } cycles := TSCValue(c) next := cycles + newParams.BaseCycles if next <= now { // The next update time already passed now with the new // parameters! We can't correct the error in a single period. return Parameters{}, 0, fmt.Errorf("unable to correct error in single period. oldNowNS = %v, nextNS = %v, now = %v, next = %v", oldNowNS, nextNS, now, next) } // We want to solve for parameters that satisfy: // // adjusted.ComputeTime(now) = oldNowNS // // adjusted.ComputeTime(next) = nextNS // // i.e., the current time does not change, but by the time we reach // next we reach the same time as newParams. // We choose to keep BaseCycles fixed. adjusted := Parameters{ BaseCycles: newParams.BaseCycles, } // We want a slope such that time goes from oldNowNS to nextNS when // we reach next. // // In other words, cycles should increase by next - now in the next // interval. cycles = next - now ns := intervalNS // adjusted.Frequency = cycles / ns adjusted.Frequency, ok = muldiv64(uint64(cycles), nsPerSec, uint64(ns)) if !ok { return Parameters{}, 0, fmt.Errorf("(%v - %v) * %v / %v overflows", next, now, nsPerSec, ns) } // Now choose a base reference such that the current time remains the // same. Note that this is just ComputeTime, solving for BaseRef: // // oldNowNS = BaseRef + (now - BaseCycles) / Frequency // BaseRef = oldNowNS - (now - BaseCycles) / Frequency diffNS, ok := muldiv64(uint64(now-adjusted.BaseCycles), nsPerSec, adjusted.Frequency) if !ok { return Parameters{}, 0, fmt.Errorf("(%v - %v) * %v / %v overflows", now, adjusted.BaseCycles, nsPerSec, adjusted.Frequency) } adjusted.BaseRef = ReferenceNS(oldNowNS - int64(diffNS)) // The error is the difference between the current time and what the // new parameters say the current time should be. newNowNS, ok := newParams.ComputeTime(now) if !ok { return Parameters{}, 0, fmt.Errorf("new now time computation overflowed. 
params = %+v, now = %v", newParams, now) } errorNS := ReferenceNS(oldNowNS - newNowNS) return adjusted, errorNS, nil } // logErrorAdjustment logs the clock error and associated error correction // frequency adjustment. // // The log level is determined by the error severity. func logErrorAdjustment(clock ClockID, errorNS ReferenceNS, orig, adjusted Parameters) { magNS := int64(errorNS.Magnitude()) if magNS <= time.Millisecond.Nanoseconds() { // Don't log small errors. return } log.Warningf("Clock(%v): error: %v ns, adjusted frequency from %v Hz to %v Hz", clock, errorNS, orig.Frequency, adjusted.Frequency) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/sampler.go000066400000000000000000000143171465435605700237060ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package time import ( "errors" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" ) const ( // maxSampleLoops is the maximum number of times to try to get a clock sample // under the expected overhead. maxSampleLoops = 5 // maxSamples is the maximum number of samples to collect. maxSamples = 10 ) // errOverheadTooHigh is returned from sampler.Sample if the syscall // overhead is too high. var errOverheadTooHigh = errors.New("time syscall overhead exceeds maximum") // TSCValue is a value from the TSC. type TSCValue int64 // Rdtsc reads the TSC. // // Intel SDM, Vol 3, Ch 17.15: // "The RDTSC instruction reads the time-stamp counter and is guaranteed to // return a monotonically increasing unique value whenever executed, except for // a 64-bit counter wraparound. Intel guarantees that the time-stamp counter // will not wraparound within 10 years after being reset." // // We use int64, so we have 5 years before wrap-around. func Rdtsc() TSCValue // ReferenceNS are nanoseconds in the reference clock domain. // int64 gives us ~290 years before this overflows. type ReferenceNS int64 // Magnitude returns the absolute value of r. func (r ReferenceNS) Magnitude() ReferenceNS { if r < 0 { return -r } return r } // cycleClock is a TSC-based cycle clock. type cycleClock interface { // Cycles returns a count value from the TSC. Cycles() TSCValue } // tscCycleClock is a cycleClock that uses the real TSC. type tscCycleClock struct{} // Cycles implements cycleClock.Cycles. func (tscCycleClock) Cycles() TSCValue { return Rdtsc() } // sample contains a sample from the reference clock, with TSC values from // before and after the reference clock value was captured. type sample struct { before TSCValue after TSCValue ref ReferenceNS } // Overhead returns the sample overhead in TSC cycles. func (s *sample) Overhead() TSCValue { return s.after - s.before } // referenceClocks collects individual samples from a reference clock ID and // TSC. type referenceClocks interface { cycleClock // Sample returns a single sample from the reference clock ID. Sample(c ClockID) (sample, error) } // sampler collects samples from a reference system clock, minimizing // the overhead in each sample. 
type sampler struct { // clockID is the reference clock ID (e.g., CLOCK_MONOTONIC). clockID ClockID // clocks provides raw samples. clocks referenceClocks // overhead is the estimated sample overhead in TSC cycles. overhead TSCValue // samples is a ring buffer of the latest samples collected. samples []sample } // newSampler creates a sampler for clockID. func newSampler(c ClockID) *sampler { return &sampler{ clockID: c, clocks: syscallTSCReferenceClocks{}, overhead: defaultOverheadCycles, } } // Reset discards previously collected clock samples. func (s *sampler) Reset() { s.overhead = defaultOverheadCycles s.samples = []sample{} } // lowOverheadSample returns a reference clock sample with minimized syscall overhead. func (s *sampler) lowOverheadSample() (sample, error) { for { for i := 0; i < maxSampleLoops; i++ { samp, err := s.clocks.Sample(s.clockID) if err != nil { return sample{}, err } if samp.before > samp.after { log.Warningf("TSC went backwards: %v > %v", samp.before, samp.after) continue } if samp.Overhead() <= s.overhead { return samp, nil } } // Couldn't get a sample with the current overhead. Increase it. newOverhead := 2 * s.overhead if newOverhead > maxOverheadCycles { // We'll give it one more shot with the max overhead. if s.overhead == maxOverheadCycles { return sample{}, errOverheadTooHigh } newOverhead = maxOverheadCycles } s.overhead = newOverhead log.Debugf("Time: Adjusting syscall overhead up to %v", s.overhead) } } // Sample collects a reference clock sample. func (s *sampler) Sample() error { sample, err := s.lowOverheadSample() if err != nil { return err } s.samples = append(s.samples, sample) if len(s.samples) > maxSamples { s.samples = s.samples[1:] } // If the 4 most recent samples all have an overhead less than half the // expected overhead, adjust downwards. if len(s.samples) < 4 { return nil } for _, sample := range s.samples[len(s.samples)-4:] { if sample.Overhead() > s.overhead/2 { return nil } } s.overhead -= s.overhead / 8 log.Debugf("Time: Adjusting syscall overhead down to %v", s.overhead) return nil } // Syscall returns the current raw reference time without storing TSC // samples. func (s *sampler) Syscall() (ReferenceNS, error) { sample, err := s.clocks.Sample(s.clockID) if err != nil { return 0, err } return sample.ref, nil } // Cycles returns a raw TSC value. func (s *sampler) Cycles() TSCValue { return s.clocks.Cycles() } // Range returns the widest range of clock samples available. func (s *sampler) Range() (sample, sample, bool) { if len(s.samples) < 2 { return sample{}, sample{}, false } return s.samples[0], s.samples[len(s.samples)-1], true } // syscallTSCReferenceClocks is the standard referenceClocks, collecting // samples using CLOCK_GETTIME and RDTSC. type syscallTSCReferenceClocks struct { tscCycleClock } // Sample implements sampler.Sample. func (syscallTSCReferenceClocks) Sample(c ClockID) (sample, error) { var s sample s.before = Rdtsc() // Don't call clockGettime to avoid a call which may call morestack. var ts unix.Timespec vdsoClockGettime(c, &ts) s.after = Rdtsc() s.ref = ReferenceNS(ts.Nano()) return s, nil } // clockGettime calls SYS_CLOCK_GETTIME, returning time in nanoseconds. func clockGettime(c ClockID) (ReferenceNS, error) { var ts unix.Timespec vdsoClockGettime(c, &ts) return ReferenceNS(ts.Nano()), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/sampler_amd64.go000066400000000000000000000016601465435605700246760ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package time const ( // defaultOverheadTSC is the default estimated syscall overhead in TSC cycles. // It is further refined as syscalls are made. defaultOverheadCycles = 1 * 1000 // maxOverheadCycles is the maximum allowed syscall overhead in TSC cycles. maxOverheadCycles = 100 * defaultOverheadCycles ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/sampler_arm64.go000066400000000000000000000032011465435605700247050ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package time // getCNTFRQ get ARM counter-timer frequency func getCNTFRQ() TSCValue // getDefaultArchOverheadCycles get default OverheadCycles based on // ARM counter-timer frequency. Usually ARM counter-timer frequency // is range from 1-50Mhz which is much less than that on x86, so we // calibrate defaultOverheadCycles for ARM. func getDefaultArchOverheadCycles() TSCValue { // estimated the clock frequency on x86 is 1Ghz. // 1Ghz divided by counter-timer frequency of ARM to get // frqRatio. defaultOverheadCycles of ARM equals to that on // x86 divided by frqRatio cntfrq := getCNTFRQ() frqRatio := 1000000000 / float64(cntfrq) overheadCycles := (1 * 1000) / frqRatio return TSCValue(overheadCycles) } // defaultOverheadTSC is the default estimated syscall overhead in TSC cycles. // It is further refined as syscalls are made. var defaultOverheadCycles = getDefaultArchOverheadCycles() // maxOverheadCycles is the maximum allowed syscall overhead in TSC cycles. var maxOverheadCycles = 100 * defaultOverheadCycles golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/seqatomic_parameters_unsafe.go000066400000000000000000000032331465435605700300070ustar00rootroot00000000000000package time import ( "unsafe" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sync" ) // SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race // with any writer critical sections in seq. // //go:nosplit func SeqAtomicLoadParameters(seq *sync.SeqCount, ptr *Parameters) Parameters { for { if val, ok := SeqAtomicTryLoadParameters(seq, seq.BeginRead(), ptr); ok { return val } } } // SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section // in seq initiated by a call to seq.BeginRead() that returned epoch. If the // read would race with a writer critical section, SeqAtomicTryLoad returns // (unspecified, false). 
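//
// A typical pairing, sketched for illustration (seq, params and newParams are
// placeholders rather than variables from this package): a writer publishes
// new Parameters with SeqAtomicStoreParameters while readers take untorn
// snapshots with SeqAtomicLoadParameters, which retries via this function.
//
//	// Writer side.
//	SeqAtomicStoreParameters(&seq, &params, newParams)
//	// Reader side.
//	snap := SeqAtomicLoadParameters(&seq, &params)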
// //go:nosplit func SeqAtomicTryLoadParameters(seq *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Parameters) (val Parameters, ok bool) { if sync.RaceEnabled { gohacks.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) } else { val = *ptr } ok = seq.ReadOk(epoch) return } // SeqAtomicStore sets *ptr to a copy of val, ensuring that any racing reader // critical sections are forced to retry. // //go:nosplit func SeqAtomicStoreParameters(seq *sync.SeqCount, ptr *Parameters, val Parameters) { seq.BeginWrite() SeqAtomicStoreSeqedParameters(ptr, val) seq.EndWrite() } // SeqAtomicStoreSeqed sets *ptr to a copy of val. // // Preconditions: ptr is protected by a SeqCount that will be in a writer // critical section throughout the call to SeqAtomicStore. // //go:nosplit func SeqAtomicStoreSeqedParameters(ptr *Parameters, val Parameters) { if sync.RaceEnabled { gohacks.Memmove(unsafe.Pointer(ptr), unsafe.Pointer(&val), unsafe.Sizeof(val)) } else { *ptr = val } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/time_amd64_state_autogen.go000066400000000000000000000001301465435605700271020ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 // +build amd64 package time golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/time_arm64_state_autogen.go000066400000000000000000000001301465435605700271200ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package time golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/time_state_autogen.go000066400000000000000000000000661465435605700261170ustar00rootroot00000000000000// automatically generated by stateify. package time golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/time_unsafe_state_autogen.go000066400000000000000000000000661465435605700274600ustar00rootroot00000000000000// automatically generated by stateify. package time golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/tsc_amd64.s000066400000000000000000000016661465435605700236670ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" TEXT ·Rdtsc(SB),NOSPLIT|NOFRAME,$0-8 // N.B. We need LFENCE on Intel, AMD is more complicated. // Modern AMD CPUs with modern kernels make LFENCE behave like it does // on Intel with MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT. MFENCE is // otherwise needed on AMD. LFENCE RDTSC SHLQ $32, DX ADDQ DX, AX MOVQ AX, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/tsc_arm64.s000066400000000000000000000016021465435605700236730ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" TEXT ·Rdtsc(SB),NOSPLIT,$0-8 // Get the virtual counter. ISB $15 WORD $0xd53be040 //MRS CNTVCT_EL0, R0 MOVD R0, ret+0(FP) RET TEXT ·getCNTFRQ(SB),NOSPLIT,$0-8 // Get the virtual counter frequency. WORD $0xd53be000 //MRS CNTFRQ_EL0, R0 MOVD R0, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/vdso.go000066400000000000000000000013061465435605700232100ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package time import ( "golang.org/x/sys/unix" ) func vdsoClockGettime(clockid ClockID, ts *unix.Timespec) int golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/vdso_amd64.s000066400000000000000000000016411465435605700240420ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "textflag.h" #define SYS_clock_gettime 228 TEXT ·vdsoClockGettime(SB),NOSPLIT|NOFRAME,$0-24 MOVL clockid+0(FP), DI MOVQ ts+8(FP), SI MOVQ runtime·vdsoClockgettimeSym(SB), AX CMPQ AX, $0 JEQ fallback CALL AX MOVQ AX, ret+16(FP) RET fallback: MOVQ $SYS_clock_gettime, AX SYSCALL MOVQ AX, ret+16(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/time/vdso_arm64.s000066400000000000000000000016161465435605700240620ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
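// The arm64 routine below has the same shape as the amd64 version above: call
// the vDSO's clock_gettime if the runtime resolved its symbol, otherwise fall
// back to the raw system call. Roughly, as an illustrative Go-flavored sketch
// (rawSyscall is a placeholder name):
//
//	if vdsoClockgettimeSym != 0 {
//		ret = vdsoClockgettimeSym(clockid, ts) // fast path, no kernel entry
//	} else {
//		ret = rawSyscall(SYS_clock_gettime, clockid, ts)
//	}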
#include "textflag.h" #define SYS_clock_gettime 113 TEXT ·vdsoClockGettime(SB), NOSPLIT, $0-24 MOVW clockid+0(FP), R0 MOVD ts+8(FP), R1 MOVD runtime·vdsoClockgettimeSym(SB), R2 CBZ R2, fallback BL (R2) MOVD R0, ret+16(FP) RET fallback: MOVD $SYS_clock_gettime, R8 SVC MOVD R0, ret+16(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/unimpl/000077500000000000000000000000001465435605700222545ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/unimpl/events.go000066400000000000000000000026471465435605700241200ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package unimpl contains interface to emit events about unimplemented // features. package unimpl import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" ) // contextID is the events package's type for context.Context.Value keys. type contextID int const ( // CtxEvents is a Context.Value key for a Events. CtxEvents contextID = iota ) // Events interface defines method to emit unsupported events. type Events interface { EmitUnimplementedEvent(ctx context.Context, sysno uintptr) } // EmitUnimplementedEvent emits unsupported syscall event to the context. func EmitUnimplementedEvent(ctx context.Context, sysno uintptr) { e := ctx.Value(CtxEvents) if e == nil { log.Warningf("Context.Value(CtxEvents) not present, unimplemented syscall event not reported.") return } e.(Events).EmitUnimplementedEvent(ctx, sysno) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/unimpl/unimpl_state_autogen.go000066400000000000000000000000701465435605700270260ustar00rootroot00000000000000// automatically generated by stateify. package unimpl golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/unimpl/unimplemented_syscall_go_proto/000077500000000000000000000000001465435605700305645ustar00rootroot00000000000000unimplemented_syscall.pb.go000066400000000000000000000141531465435605700360400ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/unimpl/unimplemented_syscall_go_proto// Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/sentry/unimpl/unimplemented_syscall.proto package unimplemented_syscall_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" registers_go_proto "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. 
_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type UnimplementedSyscall struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Tid int32 `protobuf:"varint,1,opt,name=tid,proto3" json:"tid,omitempty"` Registers *registers_go_proto.Registers `protobuf:"bytes,2,opt,name=registers,proto3" json:"registers,omitempty"` } func (x *UnimplementedSyscall) Reset() { *x = UnimplementedSyscall{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_unimpl_unimplemented_syscall_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *UnimplementedSyscall) String() string { return protoimpl.X.MessageStringOf(x) } func (*UnimplementedSyscall) ProtoMessage() {} func (x *UnimplementedSyscall) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_unimpl_unimplemented_syscall_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use UnimplementedSyscall.ProtoReflect.Descriptor instead. func (*UnimplementedSyscall) Descriptor() ([]byte, []int) { return file_pkg_sentry_unimpl_unimplemented_syscall_proto_rawDescGZIP(), []int{0} } func (x *UnimplementedSyscall) GetTid() int32 { if x != nil { return x.Tid } return 0 } func (x *UnimplementedSyscall) GetRegisters() *registers_go_proto.Registers { if x != nil { return x.Registers } return nil } var File_pkg_sentry_unimpl_unimplemented_syscall_proto protoreflect.FileDescriptor var file_pkg_sentry_unimpl_unimplemented_syscall_proto_rawDesc = []byte{ 0x0a, 0x2d, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x75, 0x6e, 0x69, 0x6d, 0x70, 0x6c, 0x2f, 0x75, 0x6e, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x65, 0x64, 0x5f, 0x73, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x06, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x1a, 0x1f, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x61, 0x72, 0x63, 0x68, 0x2f, 0x72, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0x59, 0x0a, 0x14, 0x55, 0x6e, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x65, 0x64, 0x53, 0x79, 0x73, 0x63, 0x61, 0x6c, 0x6c, 0x12, 0x10, 0x0a, 0x03, 0x74, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, 0x74, 0x69, 0x64, 0x12, 0x2f, 0x0a, 0x09, 0x72, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x11, 0x2e, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x2e, 0x52, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x52, 0x09, 0x72, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, 0x73, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_sentry_unimpl_unimplemented_syscall_proto_rawDescOnce sync.Once file_pkg_sentry_unimpl_unimplemented_syscall_proto_rawDescData = file_pkg_sentry_unimpl_unimplemented_syscall_proto_rawDesc ) func file_pkg_sentry_unimpl_unimplemented_syscall_proto_rawDescGZIP() []byte { file_pkg_sentry_unimpl_unimplemented_syscall_proto_rawDescOnce.Do(func() { file_pkg_sentry_unimpl_unimplemented_syscall_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_sentry_unimpl_unimplemented_syscall_proto_rawDescData) }) return file_pkg_sentry_unimpl_unimplemented_syscall_proto_rawDescData } var file_pkg_sentry_unimpl_unimplemented_syscall_proto_msgTypes = make([]protoimpl.MessageInfo, 1) var 
file_pkg_sentry_unimpl_unimplemented_syscall_proto_goTypes = []interface{}{ (*UnimplementedSyscall)(nil), // 0: gvisor.UnimplementedSyscall (*registers_go_proto.Registers)(nil), // 1: gvisor.Registers } var file_pkg_sentry_unimpl_unimplemented_syscall_proto_depIdxs = []int32{ 1, // 0: gvisor.UnimplementedSyscall.registers:type_name -> gvisor.Registers 1, // [1:1] is the sub-list for method output_type 1, // [1:1] is the sub-list for method input_type 1, // [1:1] is the sub-list for extension type_name 1, // [1:1] is the sub-list for extension extendee 0, // [0:1] is the sub-list for field type_name } func init() { file_pkg_sentry_unimpl_unimplemented_syscall_proto_init() } func file_pkg_sentry_unimpl_unimplemented_syscall_proto_init() { if File_pkg_sentry_unimpl_unimplemented_syscall_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_sentry_unimpl_unimplemented_syscall_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*UnimplementedSyscall); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_sentry_unimpl_unimplemented_syscall_proto_rawDesc, NumEnums: 0, NumMessages: 1, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_sentry_unimpl_unimplemented_syscall_proto_goTypes, DependencyIndexes: file_pkg_sentry_unimpl_unimplemented_syscall_proto_depIdxs, MessageInfos: file_pkg_sentry_unimpl_unimplemented_syscall_proto_msgTypes, }.Build() File_pkg_sentry_unimpl_unimplemented_syscall_proto = out.File file_pkg_sentry_unimpl_unimplemented_syscall_proto_rawDesc = nil file_pkg_sentry_unimpl_unimplemented_syscall_proto_goTypes = nil file_pkg_sentry_unimpl_unimplemented_syscall_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/uniqueid/000077500000000000000000000000001465435605700225735ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/uniqueid/context.go000066400000000000000000000036431465435605700246140ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package uniqueid defines context.Context keys for obtaining system-wide // unique identifiers. package uniqueid import ( "gvisor.dev/gvisor/pkg/context" ) // contextID is the kernel package's type for context.Context.Value keys. type contextID int const ( // CtxGlobalUniqueID is a Context.Value key for a system-wide // unique identifier. CtxGlobalUniqueID contextID = iota // CtxGlobalUniqueIDProvider is a Context.Value key for a // system-wide unique identifier generator. CtxGlobalUniqueIDProvider // CtxInotifyCookie is a Context.Value key for a unique inotify // event cookie. CtxInotifyCookie ) // Provider generates a sequence of unique identifiers useful for, // among other things, lock ordering. type Provider interface { // UniqueID returns a new unique identifier. 
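	//
	// For illustration (ctx is assumed to be a Context whose
	// CtxGlobalUniqueIDProvider value has been populated), a consumer
	// would obtain IDs via:
	//
	//	p := GlobalProviderFromContext(ctx)
	//	id := p.UniqueID()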
UniqueID() uint64 } // GlobalFromContext returns a system-wide unique identifier from ctx. func GlobalFromContext(ctx context.Context) uint64 { return ctx.Value(CtxGlobalUniqueID).(uint64) } // GlobalProviderFromContext returns a system-wide unique identifier from ctx. func GlobalProviderFromContext(ctx context.Context) Provider { return ctx.Value(CtxGlobalUniqueIDProvider).(Provider) } // InotifyCookie generates a unique inotify event cookie from ctx. func InotifyCookie(ctx context.Context) uint32 { return ctx.Value(CtxInotifyCookie).(uint32) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/uniqueid/uniqueid_state_autogen.go000066400000000000000000000000721465435605700276660ustar00rootroot00000000000000// automatically generated by stateify. package uniqueid golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/usage/000077500000000000000000000000001465435605700220545ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/usage/cpu.go000066400000000000000000000034141465435605700231740ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package usage import ( "time" ) // CPUStats contains the subset of struct rusage fields that relate to CPU // scheduling. // // +stateify savable type CPUStats struct { // UserTime is the amount of time spent executing application code. UserTime time.Duration // SysTime is the amount of time spent executing sentry code. SysTime time.Duration // VoluntarySwitches is the number of times control has been voluntarily // ceded due to blocking, etc. VoluntarySwitches uint64 // InvoluntarySwitches (struct rusage::ru_nivcsw) is unsupported, since // "preemptive" scheduling is managed by the Go runtime, which doesn't // provide this information. } // Accumulate adds s2 to s. func (s *CPUStats) Accumulate(s2 CPUStats) { s.UserTime += s2.UserTime s.SysTime += s2.SysTime s.VoluntarySwitches += s2.VoluntarySwitches } // DifferenceSince computes s - earlierSample. // // Precondition: s >= earlierSample. func (s *CPUStats) DifferenceSince(earlierSample CPUStats) CPUStats { return CPUStats{ UserTime: s.UserTime - earlierSample.UserTime, SysTime: s.SysTime - earlierSample.SysTime, VoluntarySwitches: s.VoluntarySwitches - earlierSample.VoluntarySwitches, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/usage/io.go000066400000000000000000000057571465435605700230300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package usage import "gvisor.dev/gvisor/pkg/atomicbitops" // IO contains I/O-related statistics. // // +stateify savable type IO struct { // CharsRead is the number of bytes read by read syscalls. CharsRead atomicbitops.Uint64 // CharsWritten is the number of bytes written by write syscalls. CharsWritten atomicbitops.Uint64 // ReadSyscalls is the number of read syscalls. ReadSyscalls atomicbitops.Uint64 // WriteSyscalls is the number of write syscalls. WriteSyscalls atomicbitops.Uint64 // The following counter is only meaningful when Sentry has internal // pagecache. // BytesRead is the number of bytes actually read into pagecache. BytesRead atomicbitops.Uint64 // BytesWritten is the number of bytes actually written from pagecache. BytesWritten atomicbitops.Uint64 // BytesWriteCancelled is the number of bytes not written out due to // truncation. BytesWriteCancelled atomicbitops.Uint64 } // Clone turns other into a clone of i. func (i *IO) Clone(other *IO) { other.CharsRead.Store(i.CharsRead.Load()) other.CharsWritten.Store(i.CharsWritten.Load()) other.ReadSyscalls.Store(i.ReadSyscalls.Load()) other.WriteSyscalls.Store(i.WriteSyscalls.Load()) other.BytesRead.Store(i.BytesRead.Load()) other.BytesWritten.Store(i.BytesWritten.Load()) other.BytesWriteCancelled.Store(i.BytesWriteCancelled.Load()) } // AccountReadSyscall does the accounting for a read syscall. func (i *IO) AccountReadSyscall(bytes int64) { i.ReadSyscalls.Add(1) if bytes > 0 { i.CharsRead.Add(uint64(bytes)) } } // AccountWriteSyscall does the accounting for a write syscall. func (i *IO) AccountWriteSyscall(bytes int64) { i.WriteSyscalls.Add(1) if bytes > 0 { i.CharsWritten.Add(uint64(bytes)) } } // AccountReadIO does the accounting for a read IO into the file system. func (i *IO) AccountReadIO(bytes int64) { if bytes > 0 { i.BytesRead.Add(uint64(bytes)) } } // AccountWriteIO does the accounting for a write IO into the file system. func (i *IO) AccountWriteIO(bytes int64) { if bytes > 0 { i.BytesWritten.Add(uint64(bytes)) } } // Accumulate adds up io usages. func (i *IO) Accumulate(io *IO) { i.CharsRead.Add(io.CharsRead.Load()) i.CharsWritten.Add(io.CharsWritten.Load()) i.ReadSyscalls.Add(io.ReadSyscalls.Load()) i.WriteSyscalls.Add(io.WriteSyscalls.Load()) i.BytesRead.Add(io.BytesRead.Load()) i.BytesWritten.Add(io.BytesWritten.Load()) i.BytesWriteCancelled.Add(io.BytesWriteCancelled.Load()) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/usage/memory.go000066400000000000000000000305231465435605700237160ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package usage import ( "fmt" "os" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/sync" ) // MemoryKind represents a type of memory used by the application. 
// // For efficiency reasons, it is assumed that the Memory implementation is // responsible for specific stats (documented below), and those may be reported // in aggregate independently. See the platform.Memory interface as well as the // control.Usage.Collect method for more information. type MemoryKind int const ( // System represents miscellaneous system memory. This may include // memory that is in the process of being reclaimed, system caches, // page tables, swap, etc. // // This memory kind is backed by platform memory. System MemoryKind = iota // Anonymous represents anonymous application memory. // // This memory kind is backed by platform memory. Anonymous // PageCache represents memory allocated to back sandbox-visible files that // do not have a local fd. The contents of these files are buffered in // memory to support application mmaps. // // This memory kind is backed by platform memory. PageCache // Tmpfs represents memory used by the sandbox-visible tmpfs. // // This memory kind is backed by platform memory. Tmpfs // Ramdiskfs represents memory used by the ramdiskfs. // // This memory kind is backed by platform memory. Ramdiskfs // Mapped represents memory related to files which have a local fd on the // host, and thus can be directly mapped. Typically these are files backed // by gofers with donated-fd support. Note that this value may not track the // exact amount of memory used by mapping on the host, because we don't have // any visibility into the host kernel memory management. In particular, // once we map some part of a host file, the host kernel is free to // arbitrarily populate/decommit the pages, which it may do for various // reasons (ex. host memory reclaim, NUMA balancing). // // This memory kind is backed by the host pagecache, via host mmaps. Mapped ) // memoryStats tracks application memory usage in bytes. All fields correspond to the // memory category with the same name. This object is thread-safe if accessed // through the provided methods. The public fields may be safely accessed // directly on a copy of the object obtained from Memory.Copy(). type memoryStats struct { System atomicbitops.Uint64 Anonymous atomicbitops.Uint64 PageCache atomicbitops.Uint64 Tmpfs atomicbitops.Uint64 Mapped atomicbitops.Uint64 Ramdiskfs atomicbitops.Uint64 } // incLocked adds a usage of 'val' bytes from memory category 'kind'. // // Precondition: must be called when locked. func (ms *memoryStats) incLocked(val uint64, kind MemoryKind) { switch kind { case System: ms.System.Add(val) case Anonymous: ms.Anonymous.Add(val) case PageCache: ms.PageCache.Add(val) case Mapped: ms.Mapped.Add(val) case Tmpfs: ms.Tmpfs.Add(val) case Ramdiskfs: ms.Ramdiskfs.Add(val) default: panic(fmt.Sprintf("invalid memory kind: %v", kind)) } } // decLocked removes a usage of 'val' bytes from memory category 'kind'. // // Precondition: must be called when locked. func (ms *memoryStats) decLocked(val uint64, kind MemoryKind) { switch kind { case System: ms.System.Add(^(val - 1)) case Anonymous: ms.Anonymous.Add(^(val - 1)) case PageCache: ms.PageCache.Add(^(val - 1)) case Mapped: ms.Mapped.Add(^(val - 1)) case Tmpfs: ms.Tmpfs.Add(^(val - 1)) case Ramdiskfs: ms.Ramdiskfs.Add(^(val - 1)) default: panic(fmt.Sprintf("invalid memory kind: %v", kind)) } } // totalLocked returns a total usage. // // Precondition: must be called when locked. 
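//
// Note on the arithmetic used by decLocked above: the counters are unsigned,
// so subtraction is performed by adding the two's complement. For example
// (value chosen purely for illustration), with val = 4096:
//
//	^(val - 1) == ^uint64(4095) == 0xFFFFFFFFFFFFF000 (i.e. 2^64 - 4096)
//
// so counter.Add(^(val - 1)) wraps around, leaving the counter reduced by 4096.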
func (ms *memoryStats) totalLocked() (total uint64) { total += ms.System.RacyLoad() total += ms.Anonymous.RacyLoad() total += ms.PageCache.RacyLoad() total += ms.Mapped.RacyLoad() total += ms.Tmpfs.RacyLoad() total += ms.Ramdiskfs.RacyLoad() return } // copyLocked returns a copy of the structure. // // Precondition: must be called when locked. func (ms *memoryStats) copyLocked() MemoryStats { return MemoryStats{ System: ms.System.RacyLoad(), Anonymous: ms.Anonymous.RacyLoad(), PageCache: ms.PageCache.RacyLoad(), Tmpfs: ms.Tmpfs.RacyLoad(), Mapped: ms.Mapped.RacyLoad(), Ramdiskfs: ms.Ramdiskfs.RacyLoad(), } } // MemoryStats tracks application memory usage in bytes. All fields correspond // to the memory category with the same name. type MemoryStats struct { System uint64 Anonymous uint64 PageCache uint64 Tmpfs uint64 Mapped uint64 Ramdiskfs uint64 } // RTMemoryStats contains the memory usage values that need to be directly // exposed through a shared memory file for real-time access. These are // categories not backed by platform memory. For details about how this works, // see the memory accounting docs. // // N.B. Please keep the struct in sync with the API. Notably, changes to this // struct requires a version bump and addition of compatibility logic in the // control server. As a special-case, adding fields without re-ordering existing // ones do not require a version bump because the mapped page we use is // initially zeroed. Any added field will be ignored by an older API and will be // zero if read by a newer API. type RTMemoryStats struct { RTMapped atomicbitops.Uint64 } // MemoryLocked is Memory with access methods. type MemoryLocked struct { mu memoryMutex // memoryStats records the memory stats. memoryStats // RTMemoryStats records the memory stats that need to be exposed through // shared page. *RTMemoryStats // File is the backing file storing the memory stats. File *os.File // MemCgIDToMemStats is the map of cgroup ids to memory stats. MemCgIDToMemStats map[uint32]*memoryStats } var ( initOnce sync.Once initErr error ) // Init initializes global 'MemoryAccounting'. func Init() error { initOnce.Do(func() { initErr = func() error { const name = "memory-usage" fd, err := memutil.CreateMemFD(name, 0) if err != nil { return fmt.Errorf("error creating usage file: %v", err) } file := os.NewFile(uintptr(fd), name) if err := file.Truncate(int64(RTMemoryStatsSize)); err != nil { return fmt.Errorf("error truncating usage file: %v", err) } // Note: We rely on the returned page being initially zeroed. This will // always be the case for a newly mapped page from /dev/shm. If we obtain // the shared memory through some other means in the future, we may have to // explicitly zero the page. mmap, err := memutil.MapFile(0, RTMemoryStatsSize, unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, file.Fd(), 0) if err != nil { return fmt.Errorf("error mapping usage file: %v", err) } MemoryAccounting = &MemoryLocked{ File: file, RTMemoryStats: RTMemoryStatsPointer(mmap), MemCgIDToMemStats: make(map[uint32]*memoryStats), } return nil }() }) return initErr } // MemoryAccounting is the global memory stats. // // There is no need to save or restore the global memory accounting object, // because individual frame kinds are saved and charged only when they become // resident. 
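//
// A minimal usage sketch (illustrative only; the 4 KiB size and the memCgID
// value are placeholders): the package is initialized once with Init, after
// which bytes are charged and uncharged against a MemoryKind, optionally
// scoped to a memory cgroup ID.
//
//	if err := Init(); err != nil {
//		return err
//	}
//	MemoryAccounting.Inc(4096, Anonymous, memCgID)
//	// ... later, when the memory is freed:
//	MemoryAccounting.Dec(4096, Anonymous, memCgID)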
var MemoryAccounting *MemoryLocked func (m *MemoryLocked) incLockedPerCg(val uint64, kind MemoryKind, memCgID uint32) { if _, ok := m.MemCgIDToMemStats[memCgID]; !ok { m.MemCgIDToMemStats[memCgID] = &memoryStats{} } ms := m.MemCgIDToMemStats[memCgID] ms.incLocked(val, kind) } // Inc adds an additional usage of 'val' bytes to memory category 'kind' for a // cgroup with id 'memCgID'. If 'memCgID' is zero, the memory is accounted only // for the total memory usage. // // This method is thread-safe. func (m *MemoryLocked) Inc(val uint64, kind MemoryKind, memCgID uint32) { m.mu.Lock() defer m.mu.Unlock() m.incLocked(val, kind) if memCgID != 0 { m.incLockedPerCg(val, kind, memCgID) } // If the memory category is 'Mapped', update RTMapped. if kind == Mapped { m.RTMapped.Add(val) } } func (m *MemoryLocked) decLockedPerCg(val uint64, kind MemoryKind, memCgID uint32) { if _, ok := m.MemCgIDToMemStats[memCgID]; !ok { panic(fmt.Sprintf("invalid memory cgroup id: %v", memCgID)) } ms := m.MemCgIDToMemStats[memCgID] ms.decLocked(val, kind) } // Dec removes a usage of 'val' bytes from memory category 'kind' for a cgroup // with id 'memCgID'. If 'memCgID' is zero, the memory is removed only from the // total usage. // // This method is thread-safe. func (m *MemoryLocked) Dec(val uint64, kind MemoryKind, memCgID uint32) { m.mu.Lock() defer m.mu.Unlock() m.decLocked(val, kind) if memCgID != 0 { m.decLockedPerCg(val, kind, memCgID) } // If the memory category is 'Mapped', update RTMapped. if kind == Mapped { m.RTMapped.Add(^(val - 1)) } } // Move moves a usage of 'val' bytes from 'from' to 'to' for a cgroup with // id 'memCgID'. // // This method is thread-safe. func (m *MemoryLocked) Move(val uint64, to MemoryKind, from MemoryKind, memCgID uint32) { m.mu.Lock() defer m.mu.Unlock() // Just call decLocked and incLocked directly. We held the Lock to // protect against concurrent callers to Total(). m.decLocked(val, from) m.incLocked(val, to) if memCgID != 0 { m.decLockedPerCg(val, from, memCgID) m.incLockedPerCg(val, to, memCgID) } } // Total returns a total memory usage. // // This method is thread-safe. func (m *MemoryLocked) Total() uint64 { m.mu.Lock() defer m.mu.Unlock() return m.totalLocked() } // TotalPerCg returns a total memory usage for a cgroup. // // This method is thread-safe. func (m *MemoryLocked) TotalPerCg(memCgID uint32) uint64 { m.mu.Lock() defer m.mu.Unlock() // Total memory usage including the sentry memory. if memCgID == 0 { return m.totalLocked() } // Memory usage for all cgroups except sentry memory. ms, ok := m.MemCgIDToMemStats[memCgID] if !ok { return 0 } return ms.totalLocked() } // Copy returns a copy of the structure with a total. // // This method is thread-safe. func (m *MemoryLocked) Copy() (MemoryStats, uint64) { m.mu.Lock() defer m.mu.Unlock() return m.copyLocked(), m.totalLocked() } // CopyPerCg returns a copy of the structure with a total for a cgroup. // // This method is thread-safe. func (m *MemoryLocked) CopyPerCg(memCgID uint32) (MemoryStats, uint64) { m.mu.Lock() defer m.mu.Unlock() // Total memory usage including the sentry memory. if memCgID == 0 { return m.copyLocked(), m.totalLocked() } // Memory usage for all cgroups except sentry memory. ms, ok := m.MemCgIDToMemStats[memCgID] if !ok { return MemoryStats{}, 0 } return ms.copyLocked(), ms.totalLocked() } // These options control how much total memory the is reported to the // application. They may only be set before the application starts executing, // and must not be modified. 
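//
// As a worked example of how these bounds interact with TotalMemory below
// (the inputs are chosen purely for illustration): with the default 2 GiB
// minimum and no maximum, TotalMemory(1<<30, 3<<30) first raises memSize to
// the 2 GiB minimum, then, because used (3 GiB) still exceeds it, rounds up to
// the next power of two and reports 4 GiB, so that MemFree never reads as zero.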
var ( // MinimumTotalMemoryBytes is the minimum reported total system memory. MinimumTotalMemoryBytes uint64 = 2 << 30 // 2 GB // MaximumTotalMemoryBytes is the maximum reported total system memory. // The 0 value indicates no maximum. MaximumTotalMemoryBytes uint64 ) // TotalMemory returns the "total usable memory" available. // // This number doesn't really have a true value so it's based on the following // inputs and further bounded to be above the MinumumTotalMemoryBytes and below // MaximumTotalMemoryBytes. // // memSize should be the platform.Memory size reported by platform.Memory.TotalSize() // used is the total memory reported by MemoryLocked.Total() func TotalMemory(memSize, used uint64) uint64 { if memSize < MinimumTotalMemoryBytes { memSize = MinimumTotalMemoryBytes } if memSize < used { memSize = used // Bump memSize to the next largest power of 2, if one exists, so // that MemFree isn't 0. if msb := bits.MostSignificantOne64(memSize); msb < 63 { memSize = uint64(1) << (uint(msb) + 1) } } if MaximumTotalMemoryBytes > 0 && memSize > MaximumTotalMemoryBytes { memSize = MaximumTotalMemoryBytes } return memSize } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/usage/memory_mutex.go000066400000000000000000000031371465435605700251410ustar00rootroot00000000000000package usage import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type memoryMutex struct { mu sync.Mutex } var memoryprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var memorylockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type memorylockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *memoryMutex) Lock() { locking.AddGLock(memoryprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *memoryMutex) NestedLock(i memorylockNameIndex) { locking.AddGLock(memoryprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *memoryMutex) Unlock() { locking.DelGLock(memoryprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *memoryMutex) NestedUnlock(i memorylockNameIndex) { locking.DelGLock(memoryprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func memoryinitLockNames() {} func init() { memoryinitLockNames() memoryprefixIndex = locking.NewMutexClass(reflect.TypeOf(memoryMutex{}), memorylockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/usage/memory_unsafe.go000066400000000000000000000016311465435605700252550ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package usage import ( "unsafe" ) // RTMemoryStatsSize is the size of the RTMemoryStats struct. var RTMemoryStatsSize = unsafe.Sizeof(RTMemoryStats{}) // RTMemoryStatsPointer casts addr to a RTMemoryStats pointer. func RTMemoryStatsPointer(addr uintptr) *RTMemoryStats { return (*RTMemoryStats)(unsafe.Pointer(addr)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/usage/usage.go000066400000000000000000000012401465435605700235040ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package usage provides representations of resource usage. package usage golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/usage/usage_state_autogen.go000066400000000000000000000040251465435605700264320ustar00rootroot00000000000000// automatically generated by stateify. package usage import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (s *CPUStats) StateTypeName() string { return "pkg/sentry/usage.CPUStats" } func (s *CPUStats) StateFields() []string { return []string{ "UserTime", "SysTime", "VoluntarySwitches", } } func (s *CPUStats) beforeSave() {} // +checklocksignore func (s *CPUStats) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.UserTime) stateSinkObject.Save(1, &s.SysTime) stateSinkObject.Save(2, &s.VoluntarySwitches) } func (s *CPUStats) afterLoad(context.Context) {} // +checklocksignore func (s *CPUStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.UserTime) stateSourceObject.Load(1, &s.SysTime) stateSourceObject.Load(2, &s.VoluntarySwitches) } func (i *IO) StateTypeName() string { return "pkg/sentry/usage.IO" } func (i *IO) StateFields() []string { return []string{ "CharsRead", "CharsWritten", "ReadSyscalls", "WriteSyscalls", "BytesRead", "BytesWritten", "BytesWriteCancelled", } } func (i *IO) beforeSave() {} // +checklocksignore func (i *IO) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.CharsRead) stateSinkObject.Save(1, &i.CharsWritten) stateSinkObject.Save(2, &i.ReadSyscalls) stateSinkObject.Save(3, &i.WriteSyscalls) stateSinkObject.Save(4, &i.BytesRead) stateSinkObject.Save(5, &i.BytesWritten) stateSinkObject.Save(6, &i.BytesWriteCancelled) } func (i *IO) afterLoad(context.Context) {} // +checklocksignore func (i *IO) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.CharsRead) stateSourceObject.Load(1, &i.CharsWritten) stateSourceObject.Load(2, &i.ReadSyscalls) stateSourceObject.Load(3, &i.WriteSyscalls) stateSourceObject.Load(4, &i.BytesRead) stateSourceObject.Load(5, &i.BytesWritten) stateSourceObject.Load(6, &i.BytesWriteCancelled) } func init() { state.Register((*CPUStats)(nil)) state.Register((*IO)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/usage/usage_unsafe_state_autogen.go000066400000000000000000000000671465435605700277750ustar00rootroot00000000000000// 
automatically generated by stateify. package usage golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/000077500000000000000000000000001465435605700215465ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/anonfs.go000066400000000000000000000242441465435605700233670ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) // NewAnonVirtualDentry returns a VirtualDentry with the given synthetic name, // consistent with Linux's fs/anon_inodes.c:anon_inode_getfile(). References // are taken on the returned VirtualDentry. func (vfs *VirtualFilesystem) NewAnonVirtualDentry(name string) VirtualDentry { d := anonDentry{ name: name, } d.vfsd.Init(&d) vfs.anonMount.IncRef() // anonDentry no-ops refcounting. return VirtualDentry{ mount: vfs.anonMount, dentry: &d.vfsd, } } const ( anonfsBlockSize = hostarch.PageSize // via fs/libfs.c:pseudo_fs_fill_super() // Mode, UID, and GID for a generic anonfs file. anonFileMode = 0600 // no type is correct anonFileUID = auth.RootKUID anonFileGID = auth.RootKGID ) // anonFilesystemType implements FilesystemType. // // +stateify savable type anonFilesystemType struct{} // GetFilesystem implements FilesystemType.GetFilesystem. func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *auth.Credentials, string, GetFilesystemOptions) (*Filesystem, *Dentry, error) { panic("cannot instaniate an anon filesystem") } // Name implements FilesystemType.Name. func (anonFilesystemType) Name() string { return "none" } // Release implemenents FilesystemType.Release. func (anonFilesystemType) Release(ctx context.Context) {} // anonFilesystem is the implementation of FilesystemImpl that backs // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). // // Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl // methods that would require an anonDentry to be a directory return ENOTDIR. // // +stateify savable type anonFilesystem struct { vfsfs Filesystem devMinor uint32 } // +stateify savable type anonDentry struct { vfsd Dentry name string // Inotify watches for this dentry. Note that anonfs doesn't allow hardlinks // and the dentry lifetime matches exactly with the file lifetime so it is // okay to have the watches in the dentry itself. watches Watches } // Release implements FilesystemImpl.Release. func (fs *anonFilesystem) Release(ctx context.Context) { } // Sync implements FilesystemImpl.Sync. func (fs *anonFilesystem) Sync(ctx context.Context) error { return nil } // AccessAt implements vfs.Filesystem.Impl.AccessAt. 
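// Since every anonfs node is a non-directory with the fixed 0600, root-owned
// mode above, the check below reduces to GenericCheckPermissions against
// those constants. For context, an illustrative caller-side sketch of how
// anonymous dentries are typically produced and wired into a file (not part
// of the original source; vfsObj, fd, impl and the "[my-anon]" name are
// placeholders):
//
//	vd := vfsObj.NewAnonVirtualDentry("[my-anon]")
//	defer vd.DecRef(ctx)
//	if err := fd.Init(impl, linux.O_RDWR, vd.Mount(), vd.Dentry(),
//		&FileDescriptionOptions{UseDentryMetadata: true}); err != nil {
//		return err
//	}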
func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error { if !rp.Done() || rp.MustBeDir() { return linuxerr.ENOTDIR } return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID) } // GetDentryAt implements FilesystemImpl.GetDentryAt. func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) { if !rp.Done() || rp.MustBeDir() { return nil, linuxerr.ENOTDIR } if opts.CheckSearchable { return nil, linuxerr.ENOTDIR } // anonDentry no-ops refcounting. return rp.Start(), nil } // GetParentDentryAt implements FilesystemImpl.GetParentDentryAt. func (fs *anonFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) { if !rp.Final() { return nil, linuxerr.ENOTDIR } // anonDentry no-ops refcounting. return rp.Start(), nil } // LinkAt implements FilesystemImpl.LinkAt. func (fs *anonFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // MkdirAt implements FilesystemImpl.MkdirAt. func (fs *anonFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // MknodAt implements FilesystemImpl.MknodAt. func (fs *anonFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // OpenAt implements FilesystemImpl.OpenAt. func (fs *anonFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) { if !rp.Done() || rp.MustBeDir() { return nil, linuxerr.ENOTDIR } return nil, linuxerr.ENODEV } // ReadlinkAt implements FilesystemImpl.ReadlinkAt. func (fs *anonFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) { if !rp.Done() || rp.MustBeDir() { return "", linuxerr.ENOTDIR } return "", linuxerr.EINVAL } // RenameAt implements FilesystemImpl.RenameAt. func (fs *anonFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // RmdirAt implements FilesystemImpl.RmdirAt. func (fs *anonFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // SetStatAt implements FilesystemImpl.SetStatAt. func (fs *anonFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error { if !rp.Done() || rp.MustBeDir() { return linuxerr.ENOTDIR } // Linux actually permits anon_inode_inode's metadata to be set, which is // visible to all users of anon_inode_inode. We just silently ignore // metadata changes. return nil } // StatAt implements FilesystemImpl.StatAt. func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) { if !rp.Done() || rp.MustBeDir() { return linux.Statx{}, linuxerr.ENOTDIR } // See fs/anon_inodes.c:anon_inode_init() => fs/libfs.c:alloc_anon_inode(). 
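	// The result is deliberately fixed: a zero-sized node with a single link,
	// owned by root, reported on the anonymous (UNNAMED_MAJOR, fs.devMinor)
	// device, mirroring the Linux constants referenced above.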
return linux.Statx{ Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, Blksize: anonfsBlockSize, Nlink: 1, UID: uint32(anonFileUID), GID: uint32(anonFileGID), Mode: anonFileMode, Ino: 1, Size: 0, Blocks: 0, DevMajor: linux.UNNAMED_MAJOR, DevMinor: fs.devMinor, }, nil } // StatFSAt implements FilesystemImpl.StatFSAt. func (fs *anonFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) { if !rp.Done() || rp.MustBeDir() { return linux.Statfs{}, linuxerr.ENOTDIR } return linux.Statfs{ Type: linux.ANON_INODE_FS_MAGIC, BlockSize: anonfsBlockSize, }, nil } // SymlinkAt implements FilesystemImpl.SymlinkAt. func (fs *anonFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // UnlinkAt implements FilesystemImpl.UnlinkAt. func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) { if !rp.Final() { return nil, linuxerr.ENOTDIR } if err := GenericCheckPermissions(rp.Credentials(), MayWrite, anonFileMode, anonFileUID, anonFileGID); err != nil { return nil, err } return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements FilesystemImpl.ListXattrAt. func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) { if !rp.Done() || rp.MustBeDir() { return nil, linuxerr.ENOTDIR } return nil, nil } // GetXattrAt implements FilesystemImpl.GetXattrAt. func (fs *anonFilesystem) GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) { if !rp.Done() || rp.MustBeDir() { return "", linuxerr.ENOTDIR } return "", linuxerr.ENOTSUP } // SetXattrAt implements FilesystemImpl.SetXattrAt. func (fs *anonFilesystem) SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error { if !rp.Done() || rp.MustBeDir() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // RemoveXattrAt implements FilesystemImpl.RemoveXattrAt. func (fs *anonFilesystem) RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error { if !rp.Done() || rp.MustBeDir() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // PrependPath implements FilesystemImpl.PrependPath. func (fs *anonFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error { b.PrependComponent(fmt.Sprintf("anon_inode:%s", vd.dentry.impl.(*anonDentry).name)) return PrependPathSyntheticError{} } // MountOptions implements FilesystemImpl.MountOptions. func (fs *anonFilesystem) MountOptions() string { return "" } // IsDescendant implements FilesystemImpl.IsDescendant. func (fs *anonFilesystem) IsDescendant(vfsroot, vd VirtualDentry) bool { return vfsroot == vd } // IncRef implements DentryImpl.IncRef. func (d *anonDentry) IncRef() { // no-op } // TryIncRef implements DentryImpl.TryIncRef. func (d *anonDentry) TryIncRef() bool { return true } // DecRef implements DentryImpl.DecRef. func (d *anonDentry) DecRef(ctx context.Context) { // no-op } // InotifyWithParent implements DentryImpl.InotifyWithParent. 
func (d *anonDentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) { // d.parent doesn't exist. d.watches.Notify(ctx, "", events, cookie, et, false /* unlinked */) } // Watches implements DentryImpl.Watches. func (d *anonDentry) Watches() *Watches { return &d.watches } // OnZeroWatches implements Dentry.OnZeroWatches. func (d *anonDentry) OnZeroWatches(context.Context) {} golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/context.go000066400000000000000000000062121465435605700235620ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( goContext "context" "gvisor.dev/gvisor/pkg/context" ) // contextID is this package's type for context.Context.Value keys. type contextID int const ( // CtxMountNamespace is a Context.Value key for a MountNamespace. CtxMountNamespace contextID = iota // CtxRoot is a Context.Value key for a VFS root. CtxRoot // CtxRestoreFilesystemFDMap is a Context.Value key for a map[string]int // mapping filesystem unique IDs (cf. gofer.InternalFilesystemOptions.UniqueID) // to host FDs. CtxRestoreFilesystemFDMap ) // MountNamespaceFromContext returns the MountNamespace used by ctx. If ctx is // not associated with a MountNamespace, MountNamespaceFromContext returns nil. // // A reference is taken on the returned MountNamespace. func MountNamespaceFromContext(ctx goContext.Context) *MountNamespace { if v := ctx.Value(CtxMountNamespace); v != nil { return v.(*MountNamespace) } return nil } // RestoreFilesystemFDMapFromContext returns the RestoreFilesystemFDMap used // by ctx. If ctx is not associated with a RestoreFilesystemFDMap, returns nil. func RestoreFilesystemFDMapFromContext(ctx goContext.Context) map[RestoreID]int { fdmap, ok := ctx.Value(CtxRestoreFilesystemFDMap).(map[RestoreID]int) if !ok { return nil } return fdmap } type mountNamespaceContext struct { context.Context mntns *MountNamespace } // Value implements Context.Value. func (mc mountNamespaceContext) Value(key any) any { switch key { case CtxMountNamespace: mc.mntns.IncRef() return mc.mntns default: return mc.Context.Value(key) } } // WithMountNamespace returns a copy of ctx with the given MountNamespace. func WithMountNamespace(ctx context.Context, mntns *MountNamespace) context.Context { return &mountNamespaceContext{ Context: ctx, mntns: mntns, } } // RootFromContext returns the VFS root used by ctx. It takes a reference on // the returned VirtualDentry. If ctx does not have a specific VFS root, // RootFromContext returns a zero-value VirtualDentry. func RootFromContext(ctx goContext.Context) VirtualDentry { if v := ctx.Value(CtxRoot); v != nil { return v.(VirtualDentry) } return VirtualDentry{} } type rootContext struct { context.Context root VirtualDentry } // WithRoot returns a copy of ctx with the given root. func WithRoot(ctx context.Context, root VirtualDentry) context.Context { return &rootContext{ Context: ctx, root: root, } } // Value implements Context.Value. 
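// It returns the stored root with a fresh reference (via IncRef) for the
// CtxRoot key and defers all other keys to the embedded Context. An
// illustrative caller-side sketch (not part of the original source; parentCtx
// and rootVD are placeholders):
//
//	ctx := WithRoot(parentCtx, rootVD)
//	root := RootFromContext(ctx) // returns rootVD with a new reference
//	defer root.DecRef(ctx)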
func (rc rootContext) Value(key any) any { switch key { case CtxRoot: rc.root.IncRef() return rc.root default: return rc.Context.Value(key) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/debug.go000066400000000000000000000015601465435605700231650ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !check_invariants // +build !check_invariants package vfs const ( // If checkInvariants is true, perform runtime checks for invariants // expected by the vfs package. This is disabled for non-test binaries since // VFS is often a hot path. checkInvariants = false ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/debug_testonly.go000066400000000000000000000013471465435605700251310ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build check_invariants // +build check_invariants package vfs const ( // Set checkInvariants to true for tests. checkInvariants = true ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/dentry.go000066400000000000000000000313361465435605700234100ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" ) // Dentry represents a node in a Filesystem tree at which a file exists. // // Dentries are reference-counted. Unless otherwise specified, all Dentry // methods require that a reference is held. // // Dentry is loosely analogous to Linux's struct dentry, but: // // - VFS does not associate Dentries with inodes. gVisor interacts primarily // with filesystems that are accessed through filesystem APIs (as opposed to // raw block devices); many such APIs support only paths and file descriptors, // and not inodes. 
Furthermore, when parties outside the scope of VFS can // rename inodes on such filesystems, VFS generally cannot "follow" the rename, // both due to synchronization issues and because it may not even be able to // name the destination path; this implies that it would in fact be incorrect // for Dentries to be associated with inodes on such filesystems. Consequently, // operations that are inode operations in Linux are FilesystemImpl methods // and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do // support inodes may store appropriate state in implementations of DentryImpl. // // - VFS does not require that Dentries are instantiated for all paths accessed // through VFS, only those that are tracked beyond the scope of a single // Filesystem operation. This includes file descriptions, mount points, mount // roots, process working directories, and chroots. This avoids instantiation // of Dentries for operations on mutable remote filesystems that can't actually // cache any state in the Dentry. // // - VFS does not track filesystem structure (i.e. relationships between // Dentries), since both the relevant state and synchronization are // filesystem-specific. // // - For the reasons above, VFS is not directly responsible for managing Dentry // lifetime. Dentry reference counts only indicate the extent to which VFS // requires Dentries to exist; Filesystems may elect to cache or discard // Dentries with zero references. // // +stateify savable type Dentry struct { // mu synchronizes deletion/invalidation and mounting over this Dentry. mu sync.Mutex `state:"nosave"` // dead is true if the file represented by this Dentry has been deleted (by // CommitDeleteDentry or CommitRenameReplaceDentry) or invalidated (by // InvalidateDentry). dead is protected by mu. dead bool // evictable is set by the VFS layer or filesystems like overlayfs as a hint // that this dentry will not be accessed hence forth. So filesystems that // cache dentries locally can use this hint to release the dentry when all // references are dropped. evictable is protected by mu. evictable bool // mounts is the number of Mounts for which this Dentry is Mount.point. mounts atomicbitops.Uint32 // impl is the DentryImpl associated with this Dentry. impl is immutable. // This should be the last field in Dentry. impl DentryImpl } // Init must be called before first use of d. func (d *Dentry) Init(impl DentryImpl) { d.impl = impl } // Impl returns the DentryImpl associated with d. func (d *Dentry) Impl() DentryImpl { return d.impl } // DentryImpl contains implementation details for a Dentry. Implementations of // DentryImpl should contain their associated Dentry by value as their first // field. // // +stateify savable type DentryImpl interface { // IncRef increments the Dentry's reference count. A Dentry with a non-zero // reference count must remain coherent with the state of the filesystem. IncRef() // TryIncRef increments the Dentry's reference count and returns true. If // the Dentry's reference count is zero, TryIncRef may do nothing and // return false. (It is also permitted to succeed if it can restore the // guarantee that the Dentry is coherent with the state of the filesystem.) // // TryIncRef does not require that a reference is held on the Dentry. TryIncRef() bool // DecRef decrements the Dentry's reference count. DecRef(ctx context.Context) // InotifyWithParent notifies all watches on the targets represented by this // dentry and its parent. 
The parent's watches are notified first, followed // by this dentry's. // // InotifyWithParent automatically adds the IN_ISDIR flag for dentries // representing directories. // // Note that the events may not actually propagate up to the user, depending // on the event masks. InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) // Watches returns the set of inotify watches for the file corresponding to // the Dentry. Dentries that are hard links to the same underlying file // share the same watches. // // The caller does not need to hold a reference on the dentry. Watches() *Watches // OnZeroWatches is called whenever the number of watches on a dentry drops // to zero. This is needed by some FilesystemImpls (e.g. gofer) to manage // dentry lifetime. // // The caller does not need to hold a reference on the dentry. OnZeroWatches // may acquire inotify locks, so to prevent deadlock, no inotify locks should // be held by the caller. OnZeroWatches(ctx context.Context) } // IncRef increments d's reference count. func (d *Dentry) IncRef() { d.impl.IncRef() } // TryIncRef increments d's reference count and returns true. If d's reference // count is zero, TryIncRef may instead do nothing and return false. func (d *Dentry) TryIncRef() bool { return d.impl.TryIncRef() } // DecRef decrements d's reference count. func (d *Dentry) DecRef(ctx context.Context) { d.impl.DecRef(ctx) } // IsDead returns true if d has been deleted or invalidated by its owning // filesystem. func (d *Dentry) IsDead() bool { d.mu.Lock() defer d.mu.Unlock() return d.dead } // IsEvictable returns true if d is evictable from filesystem dentry cache. func (d *Dentry) IsEvictable() bool { d.mu.Lock() defer d.mu.Unlock() return d.evictable } // MarkEvictable marks d as evictable. func (d *Dentry) MarkEvictable() { d.mu.Lock() defer d.mu.Unlock() d.evictable = true } func (d *Dentry) isMounted() bool { return d.mounts.Load() != 0 } // InotifyWithParent notifies all watches on the targets represented by d and // its parent of events. func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) { d.impl.InotifyWithParent(ctx, events, cookie, et) } // Watches returns the set of inotify watches associated with d. func (d *Dentry) Watches() *Watches { return d.impl.Watches() } // OnZeroWatches performs cleanup tasks whenever the number of watches on a // dentry drops to zero. func (d *Dentry) OnZeroWatches(ctx context.Context) { d.impl.OnZeroWatches(ctx) } // The following functions are exported so that filesystem implementations can // use them. The vfs package, and users of VFS, should not call these // functions. // PrepareDeleteDentry must be called before attempting to delete the file // represented by d. If PrepareDeleteDentry succeeds, the caller must call // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. // +checklocksacquire:d.mu func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { vfs.lockMounts() defer vfs.unlockMounts(context.Background()) if mntns.mountpoints[d] != 0 { return linuxerr.EBUSY // +checklocksforce: inconsistent return. } d.mu.Lock() // Return with d.mu locked to block attempts to mount over it; it will be // unlocked by AbortDeleteDentry or CommitDeleteDentry. return nil } // AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion // fails. 
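// An illustrative sketch of the overall deletion protocol (not part of the
// original source; vfsObj, mntns and deleteUnderlyingFile are placeholders,
// error handling abbreviated):
//
//	if err := vfsObj.PrepareDeleteDentry(mntns, d); err != nil {
//		return err
//	}
//	if err := deleteUnderlyingFile(ctx, d); err != nil { // hypothetical FS work
//		vfsObj.AbortDeleteDentry(d)
//		return err
//	}
//	for _, r := range vfsObj.CommitDeleteDentry(ctx, d) {
//		r.DecRef(ctx) // release mounts that were covering d
//	}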
// +checklocksrelease:d.mu func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { d.mu.Unlock() } // CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion // succeeds. If d is mounted, the method returns a list of Virtual Dentries // mounted on d that the caller is responsible for DecRefing. // +checklocksrelease:d.mu func (vfs *VirtualFilesystem) CommitDeleteDentry(ctx context.Context, d *Dentry) []refs.RefCounter { d.dead = true d.mu.Unlock() if d.isMounted() { return vfs.forgetDeadMountpoint(ctx, d) } return nil } // InvalidateDentry is called when d ceases to represent the file it formerly // did for reasons outside of VFS' control (e.g. d represents the local state // of a file on a remote filesystem on which the file has already been // deleted). If d is mounted, the method returns a list of Virtual Dentries // mounted on d that the caller is responsible for DecRefing. func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) []refs.RefCounter { d.mu.Lock() d.dead = true d.mu.Unlock() if d.isMounted() { return vfs.forgetDeadMountpoint(ctx, d) } return nil } // PrepareRenameDentry must be called before attempting to rename the file // represented by from. If to is not nil, it represents the file that will be // replaced or exchanged by the rename. If PrepareRenameDentry succeeds, the // caller must call AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry depending on the rename's outcome. // // Preconditions: // - If to is not nil, it must be a child Dentry from the same Filesystem. // - from != to. // // +checklocksacquire:from.mu // +checklocksacquire:to.mu func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { vfs.lockMounts() defer vfs.unlockMounts(context.Background()) if mntns.mountpoints[from] != 0 { return linuxerr.EBUSY // +checklocksforce: no locks acquired. } if to != nil { if mntns.mountpoints[to] != 0 { return linuxerr.EBUSY // +checklocksforce: no locks acquired. } to.mu.Lock() } from.mu.Lock() // Return with from.mu and to.mu locked, which will be unlocked by // AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry. return nil // +checklocksforce: to may not be acquired. } // AbortRenameDentry must be called after PrepareRenameDentry if the rename // fails. // +checklocksrelease:from.mu // +checklocksrelease:to.mu func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { from.mu.Unlock() if to != nil { to.mu.Unlock() } } // CommitRenameReplaceDentry must be called after the file represented by from // is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file // that was replaced by from. If to is mounted, the method returns a list of // Virtual Dentries mounted on to that the caller is responsible for DecRefing. // // Preconditions: PrepareRenameDentry was previously called on from and to. // +checklocksrelease:from.mu // +checklocksrelease:to.mu func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, from, to *Dentry) []refs.RefCounter { from.mu.Unlock() if to != nil { to.dead = true to.mu.Unlock() if to.isMounted() { return vfs.forgetDeadMountpoint(ctx, to) } } return nil } // CommitRenameExchangeDentry must be called after the files represented by // from and to are exchanged by rename(RENAME_EXCHANGE). // // Preconditions: PrepareRenameDentry was previously called on from and to. 
// +checklocksrelease:from.mu // +checklocksrelease:to.mu func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { from.mu.Unlock() to.mu.Unlock() } // forgetDeadMountpoint is called when a mount point is deleted or invalidated // to umount all mounts using it in all other mount namespaces. If skipDecRef // is true, the method returns a list of reference counted objects with an // an extra reference. // // forgetDeadMountpoint is analogous to Linux's // fs/namespace.c:__detach_mounts(). func (vfs *VirtualFilesystem) forgetDeadMountpoint(ctx context.Context, d *Dentry) []refs.RefCounter { vfs.lockMounts() defer vfs.unlockMounts(ctx) for mnt := range vfs.mountpoints[d] { // If umounted is true, the mount point has already been decrefed by umount // so we don't need to release the reference again here. if mnt.umounted { vfs.mounts.seq.BeginWrite() vfs.disconnectLocked(mnt) vfs.delayDecRef(mnt) vfs.mounts.seq.EndWrite() } else { vfs.umountTreeLocked(mnt, &umountRecursiveOptions{}) } } return vfs.PopDelayedDecRefs() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/device.go000066400000000000000000000130621465435605700233360ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" ) // DeviceKind indicates whether a device is a block or character device. // // +stateify savable type DeviceKind uint32 const ( // BlockDevice indicates a block device. BlockDevice DeviceKind = iota // CharDevice indicates a character device. CharDevice ) // String implements fmt.Stringer.String. func (kind DeviceKind) String() string { switch kind { case BlockDevice: return "block" case CharDevice: return "character" default: return fmt.Sprintf("invalid device kind %d", kind) } } // +stateify savable type devTuple struct { kind DeviceKind major uint32 minor uint32 } // A Device backs device special files. type Device interface { // Open returns a FileDescription representing this device. Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error) } // +stateify savable type registeredDevice struct { dev Device opts RegisterDeviceOptions } // RegisterDeviceOptions contains options to // VirtualFilesystem.RegisterDevice(). // // +stateify savable type RegisterDeviceOptions struct { // GroupName is the name shown for this device registration in // /proc/devices. If GroupName is empty, this registration will not be // shown in /proc/devices. GroupName string // Pathname is the name for the device file of this device in /dev directory. // If Pathname is empty, then no device file is created. Pathname string // FilePerms are the permission bits to create the device file with. Only // used if Pathname is provided. FilePerms uint16 } // RegisterDevice registers the given Device in vfs with the given major and // minor device numbers. 
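// Registration fails if another Device is already registered for the same
// (kind, major, minor) tuple. An illustrative sketch (not part of the
// original source; vfsObj, myDev and the device numbers are placeholders):
//
//	err := vfsObj.RegisterDevice(CharDevice, 10, 200, myDev, &RegisterDeviceOptions{
//		GroupName: "misc",
//	})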
func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error { tup := devTuple{kind, major, minor} vfs.devicesMu.Lock() defer vfs.devicesMu.Unlock() if existing, ok := vfs.devices[tup]; ok { return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev) } vfs.devices[tup] = ®isteredDevice{ dev: dev, opts: *opts, } return nil } // ForEachDevice calls the given callback for each registered device. func (vfs *VirtualFilesystem) ForEachDevice(cb func(pathname string, kind DeviceKind, major, minor uint32, perms uint16) error) error { vfs.devicesMu.Lock() defer vfs.devicesMu.Unlock() for tup, dev := range vfs.devices { if err := cb(dev.opts.Pathname, tup.kind, tup.major, tup.minor, dev.opts.FilePerms); err != nil { return err } } return nil } // OpenDeviceSpecialFile returns a FileDescription representing the given // device. func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) { tup := devTuple{kind, major, minor} vfs.devicesMu.RLock() defer vfs.devicesMu.RUnlock() rd, ok := vfs.devices[tup] if !ok { return nil, linuxerr.ENXIO } return rd.dev.Open(ctx, mnt, d, *opts) } // GetDynamicCharDevMajor allocates and returns an unused major device number // for a character device or set of character devices. func (vfs *VirtualFilesystem) GetDynamicCharDevMajor() (uint32, error) { vfs.dynCharDevMajorMu.Lock() defer vfs.dynCharDevMajorMu.Unlock() // Compare Linux's fs/char_dev.c:find_dynamic_major(). for major := uint32(254); major >= 234; major-- { if _, ok := vfs.dynCharDevMajorUsed[major]; !ok { vfs.dynCharDevMajorUsed[major] = struct{}{} return major, nil } } for major := uint32(511); major >= 384; major-- { if _, ok := vfs.dynCharDevMajorUsed[major]; !ok { vfs.dynCharDevMajorUsed[major] = struct{}{} return major, nil } } return 0, linuxerr.EBUSY } // PutDynamicCharDevMajor deallocates a major device number returned by a // previous call to GetDynamicCharDevMajor. func (vfs *VirtualFilesystem) PutDynamicCharDevMajor(major uint32) { vfs.dynCharDevMajorMu.Lock() defer vfs.dynCharDevMajorMu.Unlock() delete(vfs.dynCharDevMajorUsed, major) } // GetAnonBlockDevMinor allocates and returns an unused minor device number for // an "anonymous" block device with major number UNNAMED_MAJOR. func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) { vfs.anonBlockDevMinorMu.Lock() defer vfs.anonBlockDevMinorMu.Unlock() minor := vfs.anonBlockDevMinorNext const maxDevMinor = (1 << 20) - 1 for minor < maxDevMinor { if _, ok := vfs.anonBlockDevMinor[minor]; !ok { vfs.anonBlockDevMinor[minor] = struct{}{} vfs.anonBlockDevMinorNext = minor + 1 return minor, nil } minor++ } return 0, linuxerr.EMFILE } // PutAnonBlockDevMinor deallocates a minor device number returned by a // previous call to GetAnonBlockDevMinor. func (vfs *VirtualFilesystem) PutAnonBlockDevMinor(minor uint32) { vfs.anonBlockDevMinorMu.Lock() defer vfs.anonBlockDevMinorMu.Unlock() delete(vfs.anonBlockDevMinor, minor) if minor < vfs.anonBlockDevMinorNext { vfs.anonBlockDevMinorNext = minor } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/epoll.go000066400000000000000000000341001465435605700232060ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // epollCycleMu serializes attempts to register EpollInstances with other // EpollInstances in order to check for cycles. var epollCycleMu sync.Mutex // EpollInstance represents an epoll instance, as described by epoll(7). // // +stateify savable type EpollInstance struct { vfsfd FileDescription FileDescriptionDefaultImpl DentryMetadataFileDescriptionImpl NoLockFD // q holds waiters on this EpollInstance. q waiter.Queue // interestMu protects interest and most fields in registered // epollInterests. interestMu is analogous to Linux's struct // eventpoll::mtx. interestMu sync.Mutex `state:"nosave"` // interest is the set of file descriptors that are registered with the // EpollInstance for monitoring. interest map[epollInterestKey]*epollInterest // readyMu protects ready, readySeq, epollInterest.ready, and // epollInterest.epollInterestEntry. ready is analogous to Linux's struct // eventpoll::lock. readyMu epollReadyInstanceMutex `state:"nosave"` // ready is the set of file descriptors that may be "ready" for I/O. Note // that this must be an ordered list, not a map: "If more than maxevents // file descriptors are ready when epoll_wait() is called, then successive // epoll_wait() calls will round robin through the set of ready file // descriptors. This behavior helps avoid starvation scenarios, where a // process fails to notice that additional file descriptors are ready // because it focuses on a set of file descriptors that are already known // to be ready." - epoll_wait(2) ready epollInterestList // readySeq is used to detect calls to epollInterest.NotifyEvent() while // Readiness() or ReadEvents() are running with readyMu unlocked. readySeq // is protected by both interestMu and readyMu; reading requires either // mutex to be locked, but mutation requires both mutexes to be locked. readySeq uint32 } // +stateify savable type epollInterestKey struct { // file is the registered FileDescription. No reference is held on file; // instead, when the last reference is dropped, FileDescription.DecRef() // removes the FileDescription from all EpollInstances. file is immutable. file *FileDescription // num is the file descriptor number with which this entry was registered. // num is immutable. num int32 } // epollInterest represents an EpollInstance's interest in a file descriptor. // // +stateify savable type epollInterest struct { // epoll is the owning EpollInstance. epoll is immutable. epoll *EpollInstance `state:"wait"` // key is the file to which this epollInterest applies. key is immutable. key epollInterestKey // waiter is registered with key.file. entry is protected by // epoll.interestMu. waiter waiter.Entry // mask is the event mask associated with this registration, including // flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.interestMu. 
mask uint32 // ready is true if epollInterestEntry is linked into epoll.ready. readySeq // is the value of epoll.readySeq when NotifyEvent() was last called. // ready, epollInterestEntry, and readySeq are protected by epoll.readyMu. ready bool epollInterestEntry readySeq uint32 // userData is the struct epoll_event::data associated with this // epollInterest. userData is protected by epoll.interestMu. userData [2]int32 } // NewEpollInstanceFD returns a FileDescription representing a new epoll // instance. A reference is taken on the returned FileDescription. func (vfs *VirtualFilesystem) NewEpollInstanceFD(ctx context.Context) (*FileDescription, error) { vd := vfs.NewAnonVirtualDentry("[eventpoll]") defer vd.DecRef(ctx) ep := &EpollInstance{ interest: make(map[epollInterestKey]*epollInterest), } if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, UseDentryMetadata: true, }); err != nil { return nil, err } return &ep.vfsfd, nil } // Release implements FileDescriptionImpl.Release. func (ep *EpollInstance) Release(ctx context.Context) { // Unregister all polled fds. ep.interestMu.Lock() defer ep.interestMu.Unlock() for key, epi := range ep.interest { file := key.file file.epollMu.Lock() delete(file.epolls, epi) file.epollMu.Unlock() file.EventUnregister(&epi.waiter) } ep.interest = nil } // Readiness implements waiter.Waitable.Readiness. func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask { if mask&waiter.ReadableEvents == 0 { return 0 } // We can't call FileDescription.Readiness() while holding ep.readyMu. // Instead, hold ep.interestMu to prevent changes to the set of // epollInterests, then temporarily move all epollInterests already on // ep.ready to a local list that we can iterate without holding ep.readyMu. // epollInterest.ready is left set to true so that // epollInterest.NotifyEvent() doesn't touch epollInterestEntry. ep.interestMu.Lock() defer ep.interestMu.Unlock() var ( ready epollInterestList notReady epollInterestList ) ep.readyMu.Lock() ready.PushBackList(&ep.ready) ep.readySeq++ ep.readyMu.Unlock() if ready.Empty() { return 0 } defer func() { notify := false ep.readyMu.Lock() ep.ready.PushFrontList(&ready) var next *epollInterest for epi := notReady.Front(); epi != nil; epi = next { next = epi.Next() if epi.readySeq == ep.readySeq { // epi.NotifyEvent() was called while we were running. notReady.Remove(epi) ep.ready.PushBack(epi) notify = true } else { epi.ready = false } } ep.readyMu.Unlock() if notify { ep.q.Notify(waiter.ReadableEvents) } }() var next *epollInterest for epi := ready.Front(); epi != nil; epi = next { next = epi.Next() wmask := waiter.EventMaskFromLinux(epi.mask) if epi.key.file.Readiness(wmask)&wmask != 0 { return waiter.ReadableEvents } // epi.key.file was readied spuriously; leave it off of ep.ready. ready.Remove(epi) notReady.PushBack(epi) } return 0 } // EventRegister implements waiter.Waitable.EventRegister. func (ep *EpollInstance) EventRegister(e *waiter.Entry) error { ep.q.EventRegister(e) return nil } // EventUnregister implements waiter.Waitable.EventUnregister. func (ep *EpollInstance) EventUnregister(e *waiter.Entry) { ep.q.EventUnregister(e) } // Epollable implements FileDescriptionImpl.Epollable. func (ep *EpollInstance) Epollable() bool { return true } // Seek implements FileDescriptionImpl.Seek. 
func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek return 0, nil } // AddInterest implements the semantics of EPOLL_CTL_ADD. // // Preconditions: A reference must be held on file. func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error { if !file.Epollable() { return linuxerr.EPERM } // Check for cyclic polling if necessary. subep, _ := file.impl.(*EpollInstance) if subep != nil { epollCycleMu.Lock() // epollCycleMu must be locked for the rest of AddInterest to ensure // that cyclic polling is not introduced after the check. defer epollCycleMu.Unlock() if subep.mightPoll(ep) { return linuxerr.ELOOP } } ep.interestMu.Lock() defer ep.interestMu.Unlock() // Fail if the key is already registered. key := epollInterestKey{ file: file, num: num, } if _, ok := ep.interest[key]; ok { return linuxerr.EEXIST } // Register interest in file. mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP epi := &epollInterest{ epoll: ep, key: key, mask: mask, userData: event.Data, } ep.interest[key] = epi wmask := waiter.EventMaskFromLinux(mask) epi.waiter.Init(epi, wmask) if err := file.EventRegister(&epi.waiter); err != nil { return err } // Check if the file is already ready. if m := file.Readiness(wmask) & wmask; m != 0 { epi.NotifyEvent(m) } // Add epi to file.epolls so that it is removed when the last // FileDescription reference is dropped. file.epollMu.Lock() if file.epolls == nil { file.epolls = make(map[*epollInterest]struct{}) } file.epolls[epi] = struct{}{} file.epollMu.Unlock() return nil } func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool { return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS } func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool { ep.interestMu.Lock() defer ep.interestMu.Unlock() for key := range ep.interest { nextep, ok := key.file.impl.(*EpollInstance) if !ok { continue } if nextep == ep2 { return true } if remainingRecursion == 0 { return true } if nextep.mightPollRecursive(ep2, remainingRecursion-1) { return true } } return false } // ModifyInterest implements the semantics of EPOLL_CTL_MOD. // // Preconditions: A reference must be held on file. func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error { ep.interestMu.Lock() defer ep.interestMu.Unlock() // Fail if the key is not already registered. epi, ok := ep.interest[epollInterestKey{ file: file, num: num, }] if !ok { return linuxerr.ENOENT } // Update epi for the next call to ep.ReadEvents(). mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP epi.mask = mask epi.userData = event.Data // Re-register with the new mask. file.EventUnregister(&epi.waiter) wmask := waiter.EventMaskFromLinux(mask) epi.waiter.Init(epi, wmask) if err := file.EventRegister(&epi.waiter); err != nil { return err } // Check if the file is already ready with the new mask. if m := file.Readiness(wmask) & wmask; m != 0 { epi.NotifyEvent(m) } return nil } // DeleteInterest implements the semantics of EPOLL_CTL_DEL. // // Preconditions: A reference must be held on file. func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error { ep.interestMu.Lock() defer ep.interestMu.Unlock() // Fail if the key is not already registered. 
epi, ok := ep.interest[epollInterestKey{ file: file, num: num, }] if !ok { return linuxerr.ENOENT } // Unregister from the file so that epi will no longer be readied. file.EventUnregister(&epi.waiter) // Forget about epi. ep.removeLocked(epi) file.epollMu.Lock() delete(file.epolls, epi) file.epollMu.Unlock() return nil } // NotifyEvent implements waiter.EventListener.NotifyEvent. func (epi *epollInterest) NotifyEvent(waiter.EventMask) { newReady := false epi.epoll.readyMu.Lock() if !epi.ready { newReady = true epi.ready = true epi.epoll.ready.PushBack(epi) } epi.readySeq = epi.epoll.readySeq epi.epoll.readyMu.Unlock() if newReady { epi.epoll.q.Notify(waiter.ReadableEvents) } } // Preconditions: ep.interestMu must be locked. func (ep *EpollInstance) removeLocked(epi *epollInterest) { delete(ep.interest, epi.key) ep.readyMu.Lock() if epi.ready { epi.ready = false ep.ready.Remove(epi) } ep.readyMu.Unlock() } // ReadEvents appends up to maxReady events to events and returns the updated // slice of events. func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent { // We can't call FileDescription.Readiness() while holding ep.readyMu. // Instead, hold ep.interestMu to prevent changes to the set of // epollInterests, then temporarily move all epollInterests already on // ep.ready to a local list that we can iterate without holding ep.readyMu. // epollInterest.ready is left set to true so that // epollInterest.NotifyEvent() doesn't touch epollInterestEntry. ep.interestMu.Lock() defer ep.interestMu.Unlock() var ( ready epollInterestList notReady epollInterestList requeue epollInterestList ) ep.readyMu.Lock() ready.PushBackList(&ep.ready) ep.readySeq++ ep.readyMu.Unlock() if ready.Empty() { return nil } defer func() { notify := false ep.readyMu.Lock() // epollInterests that we never checked are re-inserted at the start of // ep.ready. epollInterests that were ready are re-inserted at the end // for reasons described by EpollInstance.ready. ep.ready.PushFrontList(&ready) var next *epollInterest for epi := notReady.Front(); epi != nil; epi = next { next = epi.Next() if epi.readySeq == ep.readySeq { // epi.NotifyEvent() was called while we were running. notReady.Remove(epi) ep.ready.PushBack(epi) notify = true } else { epi.ready = false } } ep.ready.PushBackList(&requeue) ep.readyMu.Unlock() if notify { ep.q.Notify(waiter.ReadableEvents) } }() i := 0 var next *epollInterest for epi := ready.Front(); epi != nil; epi = next { next = epi.Next() // Regardless of what else happens, epi is initially removed from the // ready list. ready.Remove(epi) wmask := waiter.EventMaskFromLinux(epi.mask) ievents := epi.key.file.Readiness(wmask) & wmask if ievents == 0 { // Leave epi off the ready list. notReady.PushBack(epi) continue } // Determine what we should do with epi. switch { case epi.mask&linux.EPOLLONESHOT != 0: // Clear all events from the mask; they must be re-added by // EPOLL_CTL_MOD. epi.mask &= linux.EP_PRIVATE_BITS fallthrough case epi.mask&linux.EPOLLET != 0: // Leave epi off the ready list. notReady.PushBack(epi) default: // Queue epi to be moved to the end of the ready list. requeue.PushBack(epi) } // Report ievents. 
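		// Each entry carries the Linux representation of the ready mask and
		// echoes the epoll_event data registered for this interest.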
events = append(events, linux.EpollEvent{ Events: ievents.ToLinux(), Data: epi.userData, }) i++ if i == maxEvents { break } } return events } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/epoll_instance_mutex.go000066400000000000000000000035011465435605700263150ustar00rootroot00000000000000package vfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type epollReadyInstanceMutex struct { mu sync.Mutex } var epollReadyInstanceprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var epollReadyInstancelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type epollReadyInstancelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *epollReadyInstanceMutex) Lock() { locking.AddGLock(epollReadyInstanceprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *epollReadyInstanceMutex) NestedLock(i epollReadyInstancelockNameIndex) { locking.AddGLock(epollReadyInstanceprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *epollReadyInstanceMutex) Unlock() { locking.DelGLock(epollReadyInstanceprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *epollReadyInstanceMutex) NestedUnlock(i epollReadyInstancelockNameIndex) { locking.DelGLock(epollReadyInstanceprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func epollReadyInstanceinitLockNames() {} func init() { epollReadyInstanceinitLockNames() epollReadyInstanceprefixIndex = locking.NewMutexClass(reflect.TypeOf(epollReadyInstanceMutex{}), epollReadyInstancelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/epoll_interest_list.go000066400000000000000000000126201465435605700261610ustar00rootroot00000000000000package vfs // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type epollInterestElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (epollInterestElementMapper) linkerFor(elem *epollInterest) *epollInterest { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type epollInterestList struct { head *epollInterest tail *epollInterest } // Reset resets list l to the empty state. func (l *epollInterestList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. 
// //go:nosplit func (l *epollInterestList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *epollInterestList) Front() *epollInterest { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *epollInterestList) Back() *epollInterest { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *epollInterestList) Len() (count int) { for e := l.Front(); e != nil; e = (epollInterestElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *epollInterestList) PushFront(e *epollInterest) { linker := epollInterestElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { epollInterestElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *epollInterestList) PushFrontList(m *epollInterestList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { epollInterestElementMapper{}.linkerFor(l.head).SetPrev(m.tail) epollInterestElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *epollInterestList) PushBack(e *epollInterest) { linker := epollInterestElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { epollInterestElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *epollInterestList) PushBackList(m *epollInterestList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { epollInterestElementMapper{}.linkerFor(l.tail).SetNext(m.head) epollInterestElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *epollInterestList) InsertAfter(b, e *epollInterest) { bLinker := epollInterestElementMapper{}.linkerFor(b) eLinker := epollInterestElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { epollInterestElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *epollInterestList) InsertBefore(a, e *epollInterest) { aLinker := epollInterestElementMapper{}.linkerFor(a) eLinker := epollInterestElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { epollInterestElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *epollInterestList) Remove(e *epollInterest) { linker := epollInterestElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { epollInterestElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { epollInterestElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. 
// // +stateify savable type epollInterestEntry struct { next *epollInterest prev *epollInterest } // Next returns the entry that follows e in the list. // //go:nosplit func (e *epollInterestEntry) Next() *epollInterest { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *epollInterestEntry) Prev() *epollInterest { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *epollInterestEntry) SetNext(elem *epollInterest) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *epollInterestEntry) SetPrev(elem *epollInterest) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/epoll_mutex.go000066400000000000000000000031121465435605700244270ustar00rootroot00000000000000package vfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type epollMutex struct { mu sync.Mutex } var epollprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var epolllockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type epolllockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *epollMutex) Lock() { locking.AddGLock(epollprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *epollMutex) NestedLock(i epolllockNameIndex) { locking.AddGLock(epollprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *epollMutex) Unlock() { locking.DelGLock(epollprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *epollMutex) NestedUnlock(i epolllockNameIndex) { locking.DelGLock(epollprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func epollinitLockNames() {} func init() { epollinitLockNames() epollprefixIndex = locking.NewMutexClass(reflect.TypeOf(epollMutex{}), epolllockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/event_list.go000066400000000000000000000117101465435605700242510ustar00rootroot00000000000000package vfs // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type eventElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (eventElementMapper) linkerFor(elem *Event) *Event { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. 
// } // // +stateify savable type eventList struct { head *Event tail *Event } // Reset resets list l to the empty state. func (l *eventList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *eventList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *eventList) Front() *Event { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *eventList) Back() *Event { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *eventList) Len() (count int) { for e := l.Front(); e != nil; e = (eventElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *eventList) PushFront(e *Event) { linker := eventElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { eventElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *eventList) PushFrontList(m *eventList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { eventElementMapper{}.linkerFor(l.head).SetPrev(m.tail) eventElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *eventList) PushBack(e *Event) { linker := eventElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { eventElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *eventList) PushBackList(m *eventList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { eventElementMapper{}.linkerFor(l.tail).SetNext(m.head) eventElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *eventList) InsertAfter(b, e *Event) { bLinker := eventElementMapper{}.linkerFor(b) eLinker := eventElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { eventElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *eventList) InsertBefore(a, e *Event) { aLinker := eventElementMapper{}.linkerFor(a) eLinker := eventElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { eventElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *eventList) Remove(e *Event) { linker := eventElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { eventElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { eventElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. 
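// The following is an illustrative sketch, not part of the original generated
// file: it shows how a generated intrusive list such as eventList is typically
// used, assuming the element type (*Event) embeds the corresponding entry so
// that Next/Prev resolve through the identity ElementMapper. The helper name
// is hypothetical.
func exampleEventListUsage(events []*Event) int {
	var l eventList // the zero value is an empty, ready-to-use list
	for _, e := range events {
		l.PushBack(e) // O(1), no allocation
	}
	// Iterate front to back, as described in the List doc comment above.
	n := 0
	for e := l.Front(); e != nil; e = e.Next() {
		n++
	}
	// Remove is O(1) given a pointer to the element.
	for !l.Empty() {
		l.Remove(l.Front())
	}
	return n
}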
// // +stateify savable type eventEntry struct { next *Event prev *Event } // Next returns the entry that follows e in the list. // //go:nosplit func (e *eventEntry) Next() *Event { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *eventEntry) Prev() *Event { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *eventEntry) SetNext(elem *Event) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *eventEntry) SetPrev(elem *Event) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/events_go_proto/000077500000000000000000000000001465435605700247625ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/events_go_proto/events.pb.go000066400000000000000000000112441465435605700272170ustar00rootroot00000000000000// Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.31.0 // protoc v5.26.1 // source: pkg/sentry/vfs/events.proto package events_go_proto import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type SentryMountPromiseBlockEvent struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Path string `protobuf:"bytes,1,opt,name=path,proto3" json:"path,omitempty"` } func (x *SentryMountPromiseBlockEvent) Reset() { *x = SentryMountPromiseBlockEvent{} if protoimpl.UnsafeEnabled { mi := &file_pkg_sentry_vfs_events_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *SentryMountPromiseBlockEvent) String() string { return protoimpl.X.MessageStringOf(x) } func (*SentryMountPromiseBlockEvent) ProtoMessage() {} func (x *SentryMountPromiseBlockEvent) ProtoReflect() protoreflect.Message { mi := &file_pkg_sentry_vfs_events_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use SentryMountPromiseBlockEvent.ProtoReflect.Descriptor instead. 
func (*SentryMountPromiseBlockEvent) Descriptor() ([]byte, []int) { return file_pkg_sentry_vfs_events_proto_rawDescGZIP(), []int{0} } func (x *SentryMountPromiseBlockEvent) GetPath() string { if x != nil { return x.Path } return "" } var File_pkg_sentry_vfs_events_proto protoreflect.FileDescriptor var file_pkg_sentry_vfs_events_proto_rawDesc = []byte{ 0x0a, 0x1b, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x2f, 0x76, 0x66, 0x73, 0x2f, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x06, 0x67, 0x76, 0x69, 0x73, 0x6f, 0x72, 0x22, 0x32, 0x0a, 0x1c, 0x53, 0x65, 0x6e, 0x74, 0x72, 0x79, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x42, 0x6c, 0x6f, 0x63, 0x6b, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x70, 0x61, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x70, 0x61, 0x74, 0x68, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_sentry_vfs_events_proto_rawDescOnce sync.Once file_pkg_sentry_vfs_events_proto_rawDescData = file_pkg_sentry_vfs_events_proto_rawDesc ) func file_pkg_sentry_vfs_events_proto_rawDescGZIP() []byte { file_pkg_sentry_vfs_events_proto_rawDescOnce.Do(func() { file_pkg_sentry_vfs_events_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_sentry_vfs_events_proto_rawDescData) }) return file_pkg_sentry_vfs_events_proto_rawDescData } var file_pkg_sentry_vfs_events_proto_msgTypes = make([]protoimpl.MessageInfo, 1) var file_pkg_sentry_vfs_events_proto_goTypes = []interface{}{ (*SentryMountPromiseBlockEvent)(nil), // 0: gvisor.SentryMountPromiseBlockEvent } var file_pkg_sentry_vfs_events_proto_depIdxs = []int32{ 0, // [0:0] is the sub-list for method output_type 0, // [0:0] is the sub-list for method input_type 0, // [0:0] is the sub-list for extension type_name 0, // [0:0] is the sub-list for extension extendee 0, // [0:0] is the sub-list for field type_name } func init() { file_pkg_sentry_vfs_events_proto_init() } func file_pkg_sentry_vfs_events_proto_init() { if File_pkg_sentry_vfs_events_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_sentry_vfs_events_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*SentryMountPromiseBlockEvent); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_sentry_vfs_events_proto_rawDesc, NumEnums: 0, NumMessages: 1, NumExtensions: 0, NumServices: 0, }, GoTypes: file_pkg_sentry_vfs_events_proto_goTypes, DependencyIndexes: file_pkg_sentry_vfs_events_proto_depIdxs, MessageInfos: file_pkg_sentry_vfs_events_proto_msgTypes, }.Build() File_pkg_sentry_vfs_events_proto = out.File file_pkg_sentry_vfs_events_proto_rawDesc = nil file_pkg_sentry_vfs_events_proto_goTypes = nil file_pkg_sentry_vfs_events_proto_depIdxs = nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/file_description.go000066400000000000000000001026511465435605700254240ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/lock" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // A FileDescription represents an open file description, which is the entity // referred to by a file descriptor (POSIX.1-2017 3.258 "Open File // Description"). // // FileDescriptions are reference-counted. Unless otherwise specified, all // FileDescription methods require that a reference is held. // // FileDescription is analogous to Linux's struct file. // // +stateify savable type FileDescription struct { FileDescriptionRefs // flagsMu protects `statusFlags` and `asyncHandler` below. flagsMu sync.Mutex `state:"nosave"` // statusFlags contains status flags, "initialized by open(2) and possibly // modified by fcntl()" - fcntl(2). statusFlags can be read using atomic // memory operations when it does not need to be synchronized with an // access to asyncHandler. statusFlags atomicbitops.Uint32 // asyncHandler handles O_ASYNC signal generation. It is set with the // F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must // also be set by fcntl(2). asyncHandler FileAsync // epolls is the set of epollInterests registered for this FileDescription. // epolls is protected by epollMu. epollMu epollMutex `state:"nosave"` epolls map[*epollInterest]struct{} // vd is the filesystem location at which this FileDescription was opened. // A reference is held on vd. vd is immutable. vd VirtualDentry // opts contains options passed to FileDescription.Init(). opts is // immutable. opts FileDescriptionOptions // readable is MayReadFileWithOpenFlags(statusFlags). readable is // immutable. // // readable is analogous to Linux's FMODE_READ. readable bool // writable is MayWriteFileWithOpenFlags(statusFlags). If writable is true, // the FileDescription holds a write count on vd.mount. writable is // immutable. // // writable is analogous to Linux's FMODE_WRITE. writable bool usedLockBSD atomicbitops.Uint32 // impl is the FileDescriptionImpl associated with this Filesystem. impl is // immutable. This should be the last field in FileDescription. impl FileDescriptionImpl } // FileDescriptionOptions contains options to FileDescription.Init(). // // +stateify savable type FileDescriptionOptions struct { // If AllowDirectIO is true, allow O_DIRECT to be set on the file. AllowDirectIO bool // If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE. DenyPRead bool // If DenyPWrite is true, calls to FileDescription.PWrite() return // ESPIPE. 
DenyPWrite bool // If UseDentryMetadata is true, calls to FileDescription methods that // interact with file and filesystem metadata (Stat, SetStat, StatFS, // ListXattr, GetXattr, SetXattr, RemoveXattr) are implemented by calling // the corresponding FilesystemImpl methods instead of the corresponding // FileDescriptionImpl methods. // // UseDentryMetadata is intended for file descriptions that are implemented // outside of individual filesystems, such as pipes, sockets, and device // special files. FileDescriptions for which UseDentryMetadata is true may // embed DentryMetadataFileDescriptionImpl to obtain appropriate // implementations of FileDescriptionImpl methods that should not be // called. UseDentryMetadata bool // If DenySpliceIn is true, splice into descriptor isn't allowed. DenySpliceIn bool } // FileCreationFlags are the set of flags passed to FileDescription.Init() but // omitted from FileDescription.StatusFlags(). const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC // Init must be called before first use of fd. If it succeeds, it takes // references on mnt and d. flags is the initial file description flags, which // is usually the full set of flags passed to open(2). func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error { writable := MayWriteFileWithOpenFlags(flags) if writable { if err := mnt.CheckBeginWrite(); err != nil { return err } } fd.InitRefs() // Remove "file creation flags" to mirror the behavior from file.f_flags in // fs/open.c:do_dentry_open. fd.statusFlags = atomicbitops.FromUint32(flags &^ FileCreationFlags) fd.vd = VirtualDentry{ mount: mnt, dentry: d, } mnt.IncRef() d.IncRef() fd.opts = *opts fd.readable = MayReadFileWithOpenFlags(flags) fd.writable = writable fd.impl = impl return nil } // DecRef decrements fd's reference count. func (fd *FileDescription) DecRef(ctx context.Context) { fd.FileDescriptionRefs.DecRef(func() { // Generate inotify events. ev := uint32(linux.IN_CLOSE_NOWRITE) if fd.IsWritable() { ev = linux.IN_CLOSE_WRITE } fd.Dentry().InotifyWithParent(ctx, ev, 0, PathEvent) // Unregister fd from all epoll instances. fd.epollMu.Lock() epolls := fd.epolls fd.epolls = nil fd.epollMu.Unlock() for epi := range epolls { ep := epi.epoll ep.interestMu.Lock() // Check that epi has not been concurrently unregistered by // EpollInstance.DeleteInterest() or EpollInstance.Release(). if _, ok := ep.interest[epi.key]; ok { fd.EventUnregister(&epi.waiter) ep.removeLocked(epi) } ep.interestMu.Unlock() } // If BSD locks were used, release any lock that it may have acquired. if fd.usedLockBSD.Load() != 0 { fd.impl.UnlockBSD(context.Background(), fd) } // Unlock any OFD locks. if fd.impl.SupportsLocks() { fd.impl.UnlockPOSIX(ctx, fd, lock.LockRange{0, lock.LockEOF}) } // Release implementation resources. fd.impl.Release(ctx) if fd.writable { fd.vd.mount.EndWrite() } fd.vd.DecRef(ctx) fd.flagsMu.Lock() if fd.statusFlags.RacyLoad()&linux.O_ASYNC != 0 && fd.asyncHandler != nil { fd.impl.UnregisterFileAsyncHandler(fd) } fd.asyncHandler = nil fd.flagsMu.Unlock() }) } // Mount returns the mount on which fd was opened. It does not take a reference // on the returned Mount. func (fd *FileDescription) Mount() *Mount { return fd.vd.mount } // Dentry returns the dentry at which fd was opened. It does not take a // reference on the returned Dentry. 
func (fd *FileDescription) Dentry() *Dentry { return fd.vd.dentry } // VirtualDentry returns the location at which fd was opened. It does not take // a reference on the returned VirtualDentry. func (fd *FileDescription) VirtualDentry() VirtualDentry { return fd.vd } // Options returns the options passed to fd.Init(). func (fd *FileDescription) Options() FileDescriptionOptions { return fd.opts } // StatusFlags returns file description status flags, as for fcntl(F_GETFL). func (fd *FileDescription) StatusFlags() uint32 { return fd.statusFlags.Load() } // SetStatusFlags sets file description status flags, as for fcntl(F_SETFL). func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error { // Compare Linux's fs/fcntl.c:setfl(). oldFlags := fd.StatusFlags() // Linux documents this check as "O_APPEND cannot be cleared if the file is // marked as append-only and the file is open for write", which would make // sense. However, the check as actually implemented seems to be "O_APPEND // cannot be changed if the file is marked as append-only". if (flags^oldFlags)&linux.O_APPEND != 0 { stat, err := fd.Stat(ctx, StatOptions{ // There is no mask bit for stx_attributes. Mask: 0, // Linux just reads inode::i_flags directly. Sync: linux.AT_STATX_DONT_SYNC, }) if err != nil { return err } if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) { return linuxerr.EPERM } } if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) { stat, err := fd.Stat(ctx, StatOptions{ Mask: linux.STATX_UID, // Linux's inode_owner_or_capable() just reads inode::i_uid // directly. Sync: linux.AT_STATX_DONT_SYNC, }) if err != nil { return err } if stat.Mask&linux.STATX_UID == 0 { return linuxerr.EPERM } if !CanActAsOwner(creds, auth.KUID(stat.UID)) { return linuxerr.EPERM } } if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO { return linuxerr.EINVAL } // TODO(gvisor.dev/issue/1035): FileDescriptionImpl.SetOAsync()? const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK fd.flagsMu.Lock() defer fd.flagsMu.Unlock() if fd.asyncHandler != nil { // Use fd.statusFlags instead of oldFlags, which may have become outdated, // to avoid double registering/unregistering. if fd.statusFlags.RacyLoad()&linux.O_ASYNC == 0 && flags&linux.O_ASYNC != 0 { if err := fd.impl.RegisterFileAsyncHandler(fd); err != nil { return err } } else if fd.statusFlags.RacyLoad()&linux.O_ASYNC != 0 && flags&linux.O_ASYNC == 0 { fd.impl.UnregisterFileAsyncHandler(fd) } } fd.statusFlags.Store((oldFlags &^ settableFlags) | (flags & settableFlags)) return nil } // IsReadable returns true if fd was opened for reading. func (fd *FileDescription) IsReadable() bool { return fd.readable } // IsWritable returns true if fd was opened for writing. func (fd *FileDescription) IsWritable() bool { return fd.writable } // Impl returns the FileDescriptionImpl associated with fd. func (fd *FileDescription) Impl() FileDescriptionImpl { return fd.impl } // FileDescriptionImpl contains implementation details for an FileDescription. // Implementations of FileDescriptionImpl should contain their associated // FileDescription by value as their first field. // // For all functions that return linux.Statx, Statx.Uid and Statx.Gid will // be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and // auth.KGID respectively). // // All methods may return errors not specified. 
// // FileDescriptionImpl is analogous to Linux's struct file_operations. type FileDescriptionImpl interface { // Release is called when the associated FileDescription reaches zero // references. Release(ctx context.Context) // OnClose is called when a file descriptor representing the // FileDescription is closed. Note that returning a non-nil error does not // prevent the file descriptor from being closed. OnClose(ctx context.Context) error // Stat returns metadata for the file represented by the FileDescription. Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) // SetStat updates metadata for the file represented by the // FileDescription. Implementations are responsible for checking if the // operation can be performed (see vfs.CheckSetStat() for common checks). SetStat(ctx context.Context, opts SetStatOptions) error // StatFS returns metadata for the filesystem containing the file // represented by the FileDescription. StatFS(ctx context.Context) (linux.Statfs, error) // Allocate grows the file to offset + length bytes. // Only mode == 0 is supported currently. // // Allocate should return EISDIR on directories, ESPIPE on pipes, and ENODEV on // other files where it is not supported. // // Preconditions: The FileDescription was opened for writing. Allocate(ctx context.Context, mode, offset, length uint64) error // waiter.Waitable methods may be used to poll for I/O events. waiter.Waitable // Epollable indicates whether this file can be used with epoll_ctl(2). Epollable() bool // PRead reads from the file into dst, starting at the given offset, and // returns the number of bytes read. PRead is permitted to return partial // reads with a nil error. // // Errors: // // - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP. // // Preconditions: // * The FileDescription was opened for reading. // * FileDescriptionOptions.DenyPRead == false. PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) // Read is similar to PRead, but does not specify an offset. // // For files with an implicit FileDescription offset (e.g. regular files), // Read begins at the FileDescription offset, and advances the offset by // the number of bytes read; note that POSIX 2.9.7 "Thread Interactions // with Regular File Operations" requires that all operations that may // mutate the FileDescription offset are serialized. // // Errors: // // - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP. // // Preconditions: The FileDescription was opened for reading. Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) // PWrite writes src to the file, starting at the given offset, and returns // the number of bytes written. PWrite is permitted to return partial // writes with a nil error. // // As in Linux (but not POSIX), if O_APPEND is in effect for the // FileDescription, PWrite should ignore the offset and append data to the // end of the file. // // Errors: // // - If opts.Flags specifies unsupported options, PWrite returns // EOPNOTSUPP. // // Preconditions: // * The FileDescription was opened for writing. // * FileDescriptionOptions.DenyPWrite == false. PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) // Write is similar to PWrite, but does not specify an offset, which is // implied as for Read. 
// // Write is a FileDescriptionImpl method, instead of a wrapper around // PWrite that uses a FileDescription offset, to make it possible for // remote filesystems to implement O_APPEND correctly (i.e. atomically with // respect to writers outside the scope of VFS). // // Errors: // // - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP. // // Preconditions: The FileDescription was opened for writing. Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) // IterDirents invokes cb on each entry in the directory represented by the // FileDescription. If IterDirents has been called since the last call to // Seek, it continues iteration from the end of the last call. IterDirents(ctx context.Context, cb IterDirentsCallback) error // Seek changes the FileDescription offset (assuming one exists) and // returns its new value. // // For directories, if whence == SEEK_SET and offset == 0, the caller is // rewinddir(), such that Seek "shall also cause the directory stream to // refer to the current state of the corresponding directory" - // POSIX.1-2017. Seek(ctx context.Context, offset int64, whence int32) (int64, error) // Sync requests that cached state associated with the file represented by // the FileDescription is synchronized with persistent storage, and blocks // until this is complete. Sync(ctx context.Context) error // ConfigureMMap mutates opts to implement mmap(2) for the file. Most // implementations that support memory mapping can call // GenericConfigureMMap with the appropriate memmap.Mappable. ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error // Ioctl implements the ioctl(2) syscall. Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) // ListXattr returns all extended attribute names for the file. ListXattr(ctx context.Context, size uint64) ([]string, error) // GetXattr returns the value associated with the given extended attribute // for the file. GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) // SetXattr changes the value associated with the given extended attribute // for the file. SetXattr(ctx context.Context, opts SetXattrOptions) error // RemoveXattr removes the given extended attribute from the file. RemoveXattr(ctx context.Context, name string) error // SupportsLocks indicates whether file locks are supported. SupportsLocks() bool // LockBSD tries to acquire a BSD-style advisory file lock. LockBSD(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, block bool) error // UnlockBSD releases a BSD-style advisory file lock. UnlockBSD(ctx context.Context, uid lock.UniqueID) error // LockPOSIX tries to acquire a POSIX-style advisory file lock. LockPOSIX(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, r lock.LockRange, block bool) error // UnlockPOSIX releases a POSIX-style advisory file lock. UnlockPOSIX(ctx context.Context, uid lock.UniqueID, ComputeLockRange lock.LockRange) error // TestPOSIX returns information about whether the specified lock can be held, in the style of the F_GETLK fcntl. TestPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, r lock.LockRange) (linux.Flock, error) RegisterFileAsyncHandler(fd *FileDescription) error UnregisterFileAsyncHandler(fd *FileDescription) } // Dirent holds the information contained in struct linux_dirent64. // // +stateify savable type Dirent struct { // Name is the filename. Name string // Type is the file type, a linux.DT_* constant. 
Type uint8 // Ino is the inode number. Ino uint64 // NextOff is the offset of the *next* Dirent in the directory; that is, // FileDescription.Seek(NextOff, SEEK_SET) (as called by seekdir(3)) will // cause the next call to FileDescription.IterDirents() to yield the next // Dirent. (The offset of the first Dirent in a directory is always 0.) NextOff int64 } // IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents. type IterDirentsCallback interface { // Handle handles the given iterated Dirent. If Handle returns a non-nil // error, FileDescriptionImpl.IterDirents must stop iteration and return // the error; the next call to FileDescriptionImpl.IterDirents should // restart with the same Dirent. Handle(dirent Dirent) error } // IterDirentsCallbackFunc implements IterDirentsCallback for a function with // the semantics of IterDirentsCallback.Handle. type IterDirentsCallbackFunc func(dirent Dirent) error // Handle implements IterDirentsCallback.Handle. func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error { return f(dirent) } // OnClose is called when a file descriptor representing the FileDescription is // closed. Returning a non-nil error should not prevent the file descriptor // from being closed. func (fd *FileDescription) OnClose(ctx context.Context) error { return fd.impl.OnClose(ctx) } // Stat returns metadata for the file represented by fd. func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts) rp.Release(ctx) return stat, err } return fd.impl.Stat(ctx, opts) } // SetStat updates metadata for the file represented by fd. func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts) rp.Release(ctx) return err } if err := fd.impl.SetStat(ctx, opts); err != nil { return err } if ev := InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { fd.Dentry().InotifyWithParent(ctx, ev, 0, InodeEvent) } return nil } // StatFS returns metadata for the filesystem containing the file represented // by fd. func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp) rp.Release(ctx) return statfs, err } return fd.impl.StatFS(ctx) } // Allocate grows file represented by FileDescription to offset + length bytes. func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { if !fd.IsWritable() { return linuxerr.EBADF } if err := fd.impl.Allocate(ctx, mode, offset, length); err != nil { return err } fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent) return nil } // Readiness implements waiter.Waitable.Readiness. // // It returns fd's I/O readiness. func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return fd.impl.Readiness(mask) } // EventRegister implements waiter.Waitable.EventRegister. // // It registers e for I/O readiness events in mask. 
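// Illustrative sketch (not part of the original file): using
// IterDirentsCallbackFunc to adapt a closure to the IterDirentsCallback
// interface and collect entry names from a directory FD. The helper name is
// hypothetical; the error-handling contract follows the comments above.
func exampleCollectDirentNames(ctx context.Context, fd *FileDescription) ([]string, error) {
	var names []string
	cb := IterDirentsCallbackFunc(func(dirent Dirent) error {
		names = append(names, dirent.Name)
		return nil // returning a non-nil error would stop iteration
	})
	if err := fd.IterDirents(ctx, cb); err != nil {
		return nil, err
	}
	return names, nil
}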
func (fd *FileDescription) EventRegister(e *waiter.Entry) error { return fd.impl.EventRegister(e) } // EventUnregister implements waiter.Waitable.EventUnregister. // // It unregisters e for I/O readiness events. func (fd *FileDescription) EventUnregister(e *waiter.Entry) { fd.impl.EventUnregister(e) } // Epollable returns whether this file can be used with epoll_ctl(2). func (fd *FileDescription) Epollable() bool { return fd.impl.Epollable() } // PRead reads from the file represented by fd into dst, starting at the given // offset, and returns the number of bytes read. PRead is permitted to return // partial reads with a nil error. func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { if fd.opts.DenyPRead { return 0, linuxerr.ESPIPE } if !fd.readable { return 0, linuxerr.EBADF } start := fsmetric.StartReadWait() n, err := fd.impl.PRead(ctx, dst, offset, opts) if n > 0 { fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent) } fsmetric.Reads.Increment() fsmetric.FinishReadWait(fsmetric.ReadWait, start) return n, err } // Read is similar to PRead, but does not specify an offset. func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { if !fd.readable { return 0, linuxerr.EBADF } start := fsmetric.StartReadWait() n, err := fd.impl.Read(ctx, dst, opts) if n > 0 { fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent) } fsmetric.Reads.Increment() fsmetric.FinishReadWait(fsmetric.ReadWait, start) return n, err } // PWrite writes src to the file represented by fd, starting at the given // offset, and returns the number of bytes written. PWrite is permitted to // return partial writes with a nil error. func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { if fd.opts.DenyPWrite { return 0, linuxerr.ESPIPE } if !fd.writable { return 0, linuxerr.EBADF } n, err := fd.impl.PWrite(ctx, src, offset, opts) if n > 0 { fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent) } return n, err } // Write is similar to PWrite, but does not specify an offset. func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { if !fd.writable { return 0, linuxerr.EBADF } n, err := fd.impl.Write(ctx, src, opts) if n > 0 { fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent) } return n, err } // IterDirents invokes cb on each entry in the directory represented by fd. If // IterDirents has been called since the last call to Seek, it continues // iteration from the end of the last call. func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error { defer fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent) return fd.impl.IterDirents(ctx, cb) } // Seek changes fd's offset (assuming one exists) and returns its new value. func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return fd.impl.Seek(ctx, offset, whence) } // Sync has the semantics of fsync(2). func (fd *FileDescription) Sync(ctx context.Context) error { return fd.impl.Sync(ctx) } // ConfigureMMap mutates opts to implement mmap(2) for the file represented by // fd. func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return fd.impl.ConfigureMMap(ctx, opts) } // Ioctl implements the ioctl(2) syscall. 
func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { return fd.impl.Ioctl(ctx, uio, sysno, args) } // ListXattr returns all extended attribute names for the file represented by // fd. // // If the size of the list (including a NUL terminating byte after every entry) // would exceed size, ERANGE may be returned. Note that implementations // are free to ignore size entirely and return without error). In all cases, // if size is 0, the list should be returned without error, regardless of size. func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) names, err := fd.vd.mount.fs.impl.ListXattrAt(ctx, rp, size) rp.Release(ctx) return names, err } names, err := fd.impl.ListXattr(ctx, size) if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { // Linux doesn't actually return EOPNOTSUPP in this case; instead, // fs/xattr.c:vfs_listxattr() falls back to allowing the security // subsystem to return security extended attributes, which by default // don't exist. return nil, nil } return names, err } // GetXattr returns the value associated with the given extended attribute for // the file represented by fd. // // If the size of the return value exceeds opts.Size, ERANGE may be returned // (note that implementations are free to ignore opts.Size entirely and return // without error). In all cases, if opts.Size is 0, the value should be // returned without error, regardless of size. func (fd *FileDescription) GetXattr(ctx context.Context, opts *GetXattrOptions) (string, error) { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) val, err := fd.vd.mount.fs.impl.GetXattrAt(ctx, rp, *opts) rp.Release(ctx) return val, err } return fd.impl.GetXattr(ctx, *opts) } // SetXattr changes the value associated with the given extended attribute for // the file represented by fd. func (fd *FileDescription) SetXattr(ctx context.Context, opts *SetXattrOptions) error { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) err := fd.vd.mount.fs.impl.SetXattrAt(ctx, rp, *opts) rp.Release(ctx) return err } if err := fd.impl.SetXattr(ctx, *opts); err != nil { return err } fd.Dentry().InotifyWithParent(ctx, linux.IN_ATTRIB, 0, InodeEvent) return nil } // RemoveXattr removes the given extended attribute from the file represented // by fd. func (fd *FileDescription) RemoveXattr(ctx context.Context, name string) error { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) err := fd.vd.mount.fs.impl.RemoveXattrAt(ctx, rp, name) rp.Release(ctx) return err } if err := fd.impl.RemoveXattr(ctx, name); err != nil { return err } fd.Dentry().InotifyWithParent(ctx, linux.IN_ATTRIB, 0, InodeEvent) return nil } // SyncFS instructs the filesystem containing fd to execute the semantics of // syncfs(2). func (fd *FileDescription) SyncFS(ctx context.Context) error { return fd.vd.mount.fs.impl.Sync(ctx) } // MappedName implements memmap.MappingIdentity.MappedName. 
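// Illustrative sketch (hypothetical helper, not in the original file):
// fetching an extended attribute value using the "size 0" convention
// documented above, which requests the value regardless of its length.
func exampleReadXattr(ctx context.Context, fd *FileDescription, name string) (string, error) {
	return fd.GetXattr(ctx, &GetXattrOptions{Name: name, Size: 0})
}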
func (fd *FileDescription) MappedName(ctx context.Context) string { vfsroot := RootFromContext(ctx) s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd) if vfsroot.Ok() { vfsroot.DecRef(ctx) } return s } // DeviceID implements memmap.MappingIdentity.DeviceID. func (fd *FileDescription) DeviceID() uint64 { stat, err := fd.Stat(context.Background(), StatOptions{ // There is no STATX_DEV; we assume that Stat will return it if it's // available regardless of mask. Mask: 0, // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev // directly. Sync: linux.AT_STATX_DONT_SYNC, }) if err != nil { return 0 } return uint64(linux.MakeDeviceID(uint16(stat.DevMajor), stat.DevMinor)) } // InodeID implements memmap.MappingIdentity.InodeID. func (fd *FileDescription) InodeID() uint64 { stat, err := fd.Stat(context.Background(), StatOptions{ Mask: linux.STATX_INO, // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly. Sync: linux.AT_STATX_DONT_SYNC, }) if err != nil || stat.Mask&linux.STATX_INO == 0 { return 0 } return stat.Ino } // Msync implements memmap.MappingIdentity.Msync. func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error { return fd.Sync(ctx) } // SupportsLocks indicates whether file locks are supported. func (fd *FileDescription) SupportsLocks() bool { return fd.impl.SupportsLocks() } // LockBSD tries to acquire a BSD-style advisory file lock. func (fd *FileDescription) LockBSD(ctx context.Context, ownerPID int32, lockType lock.LockType, block bool) error { fd.usedLockBSD.Store(1) return fd.impl.LockBSD(ctx, fd, ownerPID, lockType, block) } // UnlockBSD releases a BSD-style advisory file lock. func (fd *FileDescription) UnlockBSD(ctx context.Context) error { return fd.impl.UnlockBSD(ctx, fd) } // LockPOSIX locks a POSIX-style file range lock. func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, r lock.LockRange, block bool) error { return fd.impl.LockPOSIX(ctx, uid, ownerPID, t, r, block) } // UnlockPOSIX unlocks a POSIX-style file range lock. func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, r lock.LockRange) error { return fd.impl.UnlockPOSIX(ctx, uid, r) } // TestPOSIX returns information about whether the specified lock can be held. func (fd *FileDescription) TestPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, r lock.LockRange) (linux.Flock, error) { return fd.impl.TestPOSIX(ctx, uid, t, r) } // ComputeLockRange computes the range of a file lock based on the given values. func (fd *FileDescription) ComputeLockRange(ctx context.Context, start uint64, length uint64, whence int16) (lock.LockRange, error) { var off int64 switch whence { case linux.SEEK_SET: off = 0 case linux.SEEK_CUR: // Note that Linux does not hold any mutexes while retrieving the file // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR) if err != nil { return lock.LockRange{}, err } off = curOff case linux.SEEK_END: stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE}) if err != nil { return lock.LockRange{}, err } off = int64(stat.Size) default: return lock.LockRange{}, linuxerr.EINVAL } return lock.ComputeRange(int64(start), int64(length), off) } // ReadFull read all contents from the file. 
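// Illustrative sketch (not part of the original file): acquiring a POSIX
// write lock over an fcntl-style (start, length, whence) triple by first
// resolving it to an absolute byte range with ComputeLockRange. The helper is
// hypothetical; it assumes the caller uses the FileDescription itself as the
// lock owner (lock.UniqueID), as the OFD-style unlock in DecRef above does.
func exampleLockRange(ctx context.Context, fd *FileDescription, ownerPID int32, start, length uint64) error {
	rng, err := fd.ComputeLockRange(ctx, start, length, linux.SEEK_SET)
	if err != nil {
		return err
	}
	// block=true: wait until the lock can be granted.
	return fd.LockPOSIX(ctx, fd, ownerPID, lock.WriteLock, rng, true)
}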
func (fd *FileDescription) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { var total int64 for dst.NumBytes() > 0 { n, err := fd.PRead(ctx, dst, offset+total, ReadOptions{}) total += n if err == io.EOF && total != 0 { return total, io.ErrUnexpectedEOF } else if err != nil { return total, err } dst = dst.DropFirst64(n) } return total, nil } // A FileAsync sends signals to its owner when w is ready for IO. This is only // implemented by pkg/sentry/fasync:FileAsync, but we unfortunately need this // interface to avoid circular dependencies. type FileAsync interface { Register(w waiter.Waitable) error Unregister(w waiter.Waitable) } // AsyncHandler returns the FileAsync for fd. func (fd *FileDescription) AsyncHandler() FileAsync { fd.flagsMu.Lock() defer fd.flagsMu.Unlock() return fd.asyncHandler } // SetAsyncHandler sets fd.asyncHandler if it has not been set before and // returns it. func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) (FileAsync, error) { fd.flagsMu.Lock() defer fd.flagsMu.Unlock() if fd.asyncHandler == nil { fd.asyncHandler = newHandler() if fd.statusFlags.RacyLoad()&linux.O_ASYNC != 0 { if err := fd.impl.RegisterFileAsyncHandler(fd); err != nil { return nil, err } } } return fd.asyncHandler, nil } // CopyRegularFileData copies data from srcFD to dstFD until reading from srcFD // returns EOF or an error. It returns the number of bytes copied. func CopyRegularFileData(ctx context.Context, dstFD, srcFD *FileDescription) (int64, error) { done := int64(0) buf := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size for { readN, readErr := srcFD.Read(ctx, buf, ReadOptions{}) if readErr != nil && readErr != io.EOF { return done, readErr } src := buf.TakeFirst64(readN) for src.NumBytes() != 0 { writeN, writeErr := dstFD.Write(ctx, src, WriteOptions{}) done += writeN src = src.DropFirst64(writeN) if writeErr != nil { return done, writeErr } } if readErr == io.EOF { return done, nil } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/file_description_impl_util.go000066400000000000000000000467621465435605700275140ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "bytes" goContext "context" "io" "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" fslock "gvisor.dev/gvisor/pkg/sentry/fsimpl/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // The following design pattern is strongly recommended for filesystem // implementations to adapt: // - Have a local fileDescription struct (containing FileDescription) which // embeds FileDescriptionDefaultImpl and overrides the default methods // which are common to all fd implementations for that filesystem like // StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc. 
// - This should be embedded in all file description implementations as the // first field by value. // - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl. // FileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl to obtain implementations of many FileDescriptionImpl // methods with default behavior analogous to Linux's. // // +stateify savable type FileDescriptionDefaultImpl struct{} // OnClose implements FileDescriptionImpl.OnClose analogously to // file_operations::flush == NULL in Linux. func (FileDescriptionDefaultImpl) OnClose(ctx context.Context) error { return nil } // StatFS implements FileDescriptionImpl.StatFS analogously to // super_operations::statfs == NULL in Linux. func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, error) { return linux.Statfs{}, linuxerr.ENOSYS } // Allocate implements FileDescriptionImpl.Allocate analogously to // fallocate called on an invalid type of file in Linux. // // Note that directories can rely on this implementation even though they // should technically return EISDIR. Allocate should never be called for a // directory, because it requires a writable fd. func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { return linuxerr.ENODEV } // Readiness implements waiter.Waitable.Readiness analogously to // file_operations::poll == NULL in Linux. func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask { // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK return waiter.ReadableEvents | waiter.WritableEvents } // EventRegister implements waiter.Waitable.EventRegister analogously to // file_operations::poll == NULL in Linux. func (FileDescriptionDefaultImpl) EventRegister(e *waiter.Entry) error { return nil } // EventUnregister implements waiter.Waitable.EventUnregister analogously to // file_operations::poll == NULL in Linux. func (FileDescriptionDefaultImpl) EventUnregister(e *waiter.Entry) { } // Epollable implements FileDescriptionImpl.Epollable. func (FileDescriptionDefaultImpl) Epollable() bool { return false } // PRead implements FileDescriptionImpl.PRead analogously to // file_operations::read == file_operations::read_iter == NULL in Linux. func (FileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, linuxerr.EINVAL } // Read implements FileDescriptionImpl.Read analogously to // file_operations::read == file_operations::read_iter == NULL in Linux. func (FileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { return 0, linuxerr.EINVAL } // PWrite implements FileDescriptionImpl.PWrite analogously to // file_operations::write == file_operations::write_iter == NULL in Linux. func (FileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { return 0, linuxerr.EINVAL } // Write implements FileDescriptionImpl.Write analogously to // file_operations::write == file_operations::write_iter == NULL in Linux. func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { return 0, linuxerr.EINVAL } // IterDirents implements FileDescriptionImpl.IterDirents analogously to // file_operations::iterate == file_operations::iterate_shared == NULL in // Linux. 
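// Illustrative sketch (not part of the original file): the embedding pattern
// described at the top of this file, shown end to end for a hypothetical fd
// implementation. Only Release, Stat, and SetStat are written by hand; every
// other FileDescriptionImpl method is inherited from FileDescriptionDefaultImpl
// and NoLockFD. This is a sketch, not a real gVisor filesystem.
type exampleFD struct {
	vfsfd FileDescription
	FileDescriptionDefaultImpl
	NoLockFD
}

// Release implements FileDescriptionImpl.Release.
func (fd *exampleFD) Release(ctx context.Context) {}

// Stat implements FileDescriptionImpl.Stat.
func (fd *exampleFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
	return linux.Statx{}, nil // a real implementation fills in metadata here
}

// SetStat implements FileDescriptionImpl.SetStat.
func (fd *exampleFD) SetStat(ctx context.Context, opts SetStatOptions) error {
	return linuxerr.EPERM
}

// newExampleFD wires the implementation to VFS via FileDescription.Init,
// which takes references on mnt and d on success.
func newExampleFD(mnt *Mount, d *Dentry, flags uint32) (*FileDescription, error) {
	fd := &exampleFD{}
	if err := fd.vfsfd.Init(fd, flags, mnt, d, &FileDescriptionOptions{}); err != nil {
		return nil, err
	}
	return &fd.vfsfd, nil
}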
func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error { return linuxerr.ENOTDIR } // Seek implements FileDescriptionImpl.Seek analogously to // file_operations::llseek == NULL in Linux. func (FileDescriptionDefaultImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return 0, linuxerr.ESPIPE } // Sync implements FileDescriptionImpl.Sync analogously to // file_operations::fsync == NULL in Linux. func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error { return linuxerr.EINVAL } // ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to // file_operations::mmap == NULL in Linux. func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return linuxerr.ENODEV } // Ioctl implements FileDescriptionImpl.Ioctl analogously to // file_operations::unlocked_ioctl == NULL in Linux. func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { return 0, linuxerr.ENOTTY } // ListXattr implements FileDescriptionImpl.ListXattr analogously to // inode_operations::listxattr == NULL in Linux. func (FileDescriptionDefaultImpl) ListXattr(ctx context.Context, size uint64) ([]string, error) { // This isn't exactly accurate; see FileDescription.ListXattr. return nil, linuxerr.ENOTSUP } // GetXattr implements FileDescriptionImpl.GetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. func (FileDescriptionDefaultImpl) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { return "", linuxerr.ENOTSUP } // SetXattr implements FileDescriptionImpl.SetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. func (FileDescriptionDefaultImpl) SetXattr(ctx context.Context, opts SetXattrOptions) error { return linuxerr.ENOTSUP } // RemoveXattr implements FileDescriptionImpl.RemoveXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. func (FileDescriptionDefaultImpl) RemoveXattr(ctx context.Context, name string) error { return linuxerr.ENOTSUP } // RegisterFileAsyncHandler implements FileDescriptionImpl.RegisterFileAsyncHandler. func (FileDescriptionDefaultImpl) RegisterFileAsyncHandler(fd *FileDescription) error { return fd.asyncHandler.Register(fd) } // UnregisterFileAsyncHandler implements FileDescriptionImpl.UnregisterFileAsyncHandler. func (FileDescriptionDefaultImpl) UnregisterFileAsyncHandler(fd *FileDescription) { fd.asyncHandler.Unregister(fd) } // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. // // +stateify savable type DirectoryFileDescriptionDefaultImpl struct{} // Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate. func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { return linuxerr.EISDIR } // PRead implements FileDescriptionImpl.PRead. func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, linuxerr.EISDIR } // Read implements FileDescriptionImpl.Read. func (DirectoryFileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { return 0, linuxerr.EISDIR } // PWrite implements FileDescriptionImpl.PWrite. 
func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { return 0, linuxerr.EISDIR } // Write implements FileDescriptionImpl.Write. func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { return 0, linuxerr.EISDIR } // DentryMetadataFileDescriptionImpl may be embedded by implementations of // FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is // true to obtain implementations of Stat and SetStat that panic. // // +stateify savable type DentryMetadataFileDescriptionImpl struct{} // Stat implements FileDescriptionImpl.Stat. func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { panic("illegal call to DentryMetadataFileDescriptionImpl.Stat") } // SetStat implements FileDescriptionImpl.SetStat. func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error { panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat") } // DynamicBytesSource represents a data source for a // DynamicBytesFileDescriptionImpl. // // +stateify savable type DynamicBytesSource interface { // Generate writes the file's contents to buf. Generate(ctx context.Context, buf *bytes.Buffer) error } // StaticData implements DynamicBytesSource over a static string. // // +stateify savable type StaticData struct { Data string } // Generate implements DynamicBytesSource. func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString(s.Data) return nil } // WritableDynamicBytesSource extends DynamicBytesSource to allow writes to the // underlying source. type WritableDynamicBytesSource interface { DynamicBytesSource // Write sends writes to the source. Write(ctx context.Context, fd *FileDescription, src usermem.IOSequence, offset int64) (int64, error) } // DynamicBytesFileDescriptionImpl may be embedded by implementations of // FileDescriptionImpl that represent read-only regular files whose contents // are backed by a bytes.Buffer that is regenerated when necessary, consistent // with Linux's fs/seq_file.c:single_open(). // // If data additionally implements WritableDynamicBytesSource, writes are // dispatched to the implementer. The source data is not automatically modified. // // DynamicBytesFileDescriptionImpl.Init() must be called before first // use. // // +stateify savable type DynamicBytesFileDescriptionImpl struct { vfsfd *FileDescription // immutable data DynamicBytesSource // immutable mu sync.Mutex `state:"nosave"` // protects the following fields buf bytes.Buffer `state:".([]byte)"` off int64 lastRead int64 // offset at which the last Read, PRead, or Seek ended } func (fd *DynamicBytesFileDescriptionImpl) saveBuf() []byte { return fd.buf.Bytes() } func (fd *DynamicBytesFileDescriptionImpl) loadBuf(_ goContext.Context, p []byte) { fd.buf.Write(p) } // Init must be called before first use. func (fd *DynamicBytesFileDescriptionImpl) Init(vfsfd *FileDescription, data DynamicBytesSource) { fd.vfsfd = vfsfd fd.data = data } // Preconditions: fd.mu must be locked. func (fd *DynamicBytesFileDescriptionImpl) preadLocked(ctx context.Context, dst usermem.IOSequence, offset int64, opts *ReadOptions) (int64, error) { // Regenerate the buffer if it's empty, or before pread() at a new offset. // Compare fs/seq_file.c:seq_read() => traverse(). 
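	// A read at an offset other than where the last read or seek ended drops
	// the cached contents and falls through to regenerate them; an empty cache
	// is regenerated in place. If Generate fails, the cache and lastRead are
	// reset and the error is returned without updating the offset.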
switch { case offset != fd.lastRead: fd.buf.Reset() fallthrough case fd.buf.Len() == 0: if err := fd.data.Generate(ctx, &fd.buf); err != nil { fd.buf.Reset() // fd.off is not updated in this case. fd.lastRead = 0 return 0, err } } bs := fd.buf.Bytes() if offset >= int64(len(bs)) { return 0, io.EOF } n, err := dst.CopyOut(ctx, bs[offset:]) fd.lastRead = offset + int64(n) return int64(n), err } // PRead implements FileDescriptionImpl.PRead. func (fd *DynamicBytesFileDescriptionImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { fd.mu.Lock() n, err := fd.preadLocked(ctx, dst, offset, &opts) fd.mu.Unlock() return n, err } // Read implements FileDescriptionImpl.Read. func (fd *DynamicBytesFileDescriptionImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { fd.mu.Lock() n, err := fd.preadLocked(ctx, dst, fd.off, &opts) fd.off += n fd.mu.Unlock() return n, err } // Seek implements FileDescriptionImpl.Seek. func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() switch whence { case linux.SEEK_SET: // Use offset as given. case linux.SEEK_CUR: offset += fd.off default: // fs/seq_file:seq_lseek() rejects SEEK_END etc. return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } if offset != fd.lastRead { // Regenerate the file's contents immediately. Compare // fs/seq_file.c:seq_lseek() => traverse(). fd.buf.Reset() if err := fd.data.Generate(ctx, &fd.buf); err != nil { fd.buf.Reset() fd.off = 0 fd.lastRead = 0 return 0, err } fd.lastRead = offset } fd.off = offset return offset, nil } // Preconditions: fd.mu must be locked. func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { return 0, linuxerr.EOPNOTSUPP } limit, err := CheckLimit(ctx, offset, src.NumBytes()) if err != nil { return 0, err } src = src.TakeFirst64(limit) writable, ok := fd.data.(WritableDynamicBytesSource) if !ok { return 0, linuxerr.EIO } n, err := writable.Write(ctx, fd.vfsfd, src, offset) if err != nil { return 0, err } // Invalidate cached data that might exist prior to this call. fd.buf.Reset() return n, nil } // PWrite implements FileDescriptionImpl.PWrite. func (fd *DynamicBytesFileDescriptionImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { fd.mu.Lock() n, err := fd.pwriteLocked(ctx, src, offset, opts) fd.mu.Unlock() return n, err } // Write implements FileDescriptionImpl.Write. func (fd *DynamicBytesFileDescriptionImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { fd.mu.Lock() n, err := fd.pwriteLocked(ctx, src, fd.off, opts) fd.off += n fd.mu.Unlock() return n, err } // GenericConfigureMMap may be used by most implementations of // FileDescriptionImpl.ConfigureMMap. func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error { if opts.Offset+opts.Length > math.MaxInt64 { return linuxerr.EOVERFLOW } opts.Mappable = m opts.MappingIdentity = fd fd.IncRef() return nil } // LockFD may be used by most implementations of FileDescriptionImpl.Lock* // functions. Caller must call Init(). // // +stateify savable type LockFD struct { locks *FileLocks } // SupportsLocks implements FileDescriptionImpl.SupportsLocks. 
func (LockFD) SupportsLocks() bool { return true } // Init initializes fd with FileLocks to use. func (fd *LockFD) Init(locks *FileLocks) { fd.locks = locks } // Locks returns the locks associated with this file. func (fd *LockFD) Locks() *FileLocks { return fd.locks } // LockBSD implements FileDescriptionImpl.LockBSD. func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block bool) error { return fd.locks.LockBSD(ctx, uid, ownerPID, t, block) } // UnlockBSD implements FileDescriptionImpl.UnlockBSD. func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { fd.locks.UnlockBSD(uid) return nil } // LockPOSIX implements FileDescriptionImpl.LockPOSIX. func (fd *LockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block bool) error { return fd.locks.LockPOSIX(ctx, uid, ownerPID, t, r, block) } // UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. func (fd *LockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { return fd.locks.UnlockPOSIX(ctx, uid, r) } // TestPOSIX implements FileDescriptionImpl.TestPOSIX. func (fd *LockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { return fd.locks.TestPOSIX(ctx, uid, t, r) } // NoAsyncEventFD implements [Un]RegisterFileAsyncHandler of FileDescriptionImpl. type NoAsyncEventFD struct{} // RegisterFileAsyncHandler implements FileDescriptionImpl.RegisterFileAsyncHandler. func (NoAsyncEventFD) RegisterFileAsyncHandler(fd *FileDescription) error { return nil } // UnregisterFileAsyncHandler implements FileDescriptionImpl.UnregisterFileAsyncHandler. func (NoAsyncEventFD) UnregisterFileAsyncHandler(fd *FileDescription) { } // NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface // returning ENOLCK. // // +stateify savable type NoLockFD struct{} // SupportsLocks implements FileDescriptionImpl.SupportsLocks. func (NoLockFD) SupportsLocks() bool { return false } // LockBSD implements FileDescriptionImpl.LockBSD. func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block bool) error { return linuxerr.ENOLCK } // UnlockBSD implements FileDescriptionImpl.UnlockBSD. func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { return linuxerr.ENOLCK } // LockPOSIX implements FileDescriptionImpl.LockPOSIX. func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block bool) error { return linuxerr.ENOLCK } // UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { return linuxerr.ENOLCK } // TestPOSIX implements FileDescriptionImpl.TestPOSIX. func (NoLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { return linux.Flock{}, linuxerr.ENOLCK } // BadLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface // returning EBADF. // // +stateify savable type BadLockFD struct{} // SupportsLocks implements FileDescriptionImpl.SupportsLocks. func (BadLockFD) SupportsLocks() bool { return false } // LockBSD implements FileDescriptionImpl.LockBSD. 
func (BadLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block bool) error { return linuxerr.EBADF } // UnlockBSD implements FileDescriptionImpl.UnlockBSD. func (BadLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { return linuxerr.EBADF } // LockPOSIX implements FileDescriptionImpl.LockPOSIX. func (BadLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block bool) error { return linuxerr.EBADF } // UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. func (BadLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { return linuxerr.EBADF } // TestPOSIX implements FileDescriptionImpl.TestPOSIX. func (BadLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { return linux.Flock{}, linuxerr.EBADF } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/file_description_refs.go000066400000000000000000000103501465435605700264350ustar00rootroot00000000000000package vfs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const FileDescriptionenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var FileDescriptionobj *FileDescription // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type FileDescriptionRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *FileDescriptionRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *FileDescriptionRefs) RefType() string { return fmt.Sprintf("%T", FileDescriptionobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *FileDescriptionRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *FileDescriptionRefs) LogRefs() bool { return FileDescriptionenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. 
func (r *FileDescriptionRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *FileDescriptionRefs) IncRef() { v := r.refCount.Add(1) if FileDescriptionenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *FileDescriptionRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if FileDescriptionenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *FileDescriptionRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if FileDescriptionenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *FileDescriptionRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/filesystem.go000066400000000000000000000520131465435605700242620ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) // A Filesystem is a tree of nodes represented by Dentries, which forms part of // a VirtualFilesystem. // // Filesystems are reference-counted. Unless otherwise specified, all // Filesystem methods require that a reference is held. // // Filesystem is analogous to Linux's struct super_block. // // +stateify savable type Filesystem struct { FilesystemRefs // vfs is the VirtualFilesystem that uses this Filesystem. vfs is // immutable. vfs *VirtualFilesystem // fsType is the FilesystemType of this Filesystem. fsType FilesystemType // impl is the FilesystemImpl associated with this Filesystem. impl is // immutable. This should be the last field in Dentry. impl FilesystemImpl } // Init must be called before first use of fs. 
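//
// Illustrative sketch only (not part of the original source): a concrete
// filesystem that embeds Filesystem by value would typically wire it up from
// its FilesystemType.GetFilesystem implementation roughly as follows, where
// myFilesystem, myFSType, root and the vfsfs/vfsd field names are hypothetical:
//
//	fs := &myFilesystem{}
//	fs.vfsfs.Init(vfsObj, &myFSType{}, fs)
//	root := fs.newRootDentry()
//	return &fs.vfsfs, &root.vfsd, nil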
func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) { fs.InitRefs() fs.vfs = vfsObj fs.fsType = fsType fs.impl = impl vfsObj.filesystemsMu.Lock() vfsObj.filesystems[fs] = struct{}{} vfsObj.filesystemsMu.Unlock() } // FilesystemType returns the FilesystemType for this Filesystem. func (fs *Filesystem) FilesystemType() FilesystemType { return fs.fsType } // VirtualFilesystem returns the containing VirtualFilesystem. func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem { return fs.vfs } // Impl returns the FilesystemImpl associated with fs. func (fs *Filesystem) Impl() FilesystemImpl { return fs.impl } // DecRef decrements fs' reference count. func (fs *Filesystem) DecRef(ctx context.Context) { fs.FilesystemRefs.DecRef(func() { fs.vfs.filesystemsMu.Lock() delete(fs.vfs.filesystems, fs) fs.vfs.filesystemsMu.Unlock() fs.impl.Release(ctx) }) } // FilesystemImpl contains implementation details for a Filesystem. // Implementations of FilesystemImpl should contain their associated Filesystem // by value as their first field. // // All methods that take a ResolvingPath must resolve the path before // performing any other checks, including rejection of the operation if not // supported by the FilesystemImpl. This is because the final FilesystemImpl // (responsible for actually implementing the operation) isn't known until path // resolution is complete. // // Unless otherwise specified, FilesystemImpl methods are responsible for // performing permission checks. In many cases, vfs package functions in // permissions.go may be used to help perform these checks. // // When multiple specified error conditions apply to a given method call, the // implementation may return any applicable errno unless otherwise specified, // but returning the earliest error specified is preferable to maximize // compatibility with Linux. // // All methods may return errors not specified, notably including: // // - ENOENT if a required path component does not exist. // // - ENOTDIR if an intermediate path component is not a directory. // // - Errors from vfs-package functions (ResolvingPath.Resolve*(), // Mount.CheckBeginWrite(), permission-checking functions, etc.) // // For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid // should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID // and auth.KGID respectively). // // FilesystemImpl combines elements of Linux's struct super_operations and // struct inode_operations, for reasons described in the documentation for // Dentry. type FilesystemImpl interface { // Release is called when the associated Filesystem reaches zero // references. Release(ctx context.Context) // Sync "causes all pending modifications to filesystem metadata and cached // file data to be written to the underlying [filesystem]", as by syncfs(2). Sync(ctx context.Context) error // AccessAt checks whether a user with creds can access the file at rp. AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error // GetDentryAt returns a Dentry representing the file at rp. A reference is // taken on the returned Dentry. // // GetDentryAt does not correspond directly to a Linux syscall; it is used // in the implementation of: // // - Syscalls that need to resolve two paths: link(), linkat(). // // - Syscalls that need to refer to a filesystem position outside the // context of a file description: chdir(), fchdir(), chroot(), mount(), // umount(). 
GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) // GetParentDentryAt returns a Dentry representing the directory at the // second-to-last path component in rp. (Note that, despite the name, this // is not necessarily the parent directory of the file at rp, since the // last path component in rp may be "." or "..".) A reference is taken on // the returned Dentry. // // GetParentDentryAt does not correspond directly to a Linux syscall; it is // used in the implementation of the rename() family of syscalls, which // must resolve the parent directories of two paths. // // Preconditions: !rp.Done(). // // Postconditions: If GetParentDentryAt returns a nil error, then // rp.Final(). If GetParentDentryAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) // LinkAt creates a hard link at rp representing the same file as vd. It // does not take ownership of references on vd. // // Errors: // // - If the last path component in rp is "." or "..", LinkAt returns // EEXIST. // // - If a file already exists at rp, LinkAt returns EEXIST. // // - If rp.MustBeDir(), LinkAt returns ENOENT. // // - If the directory in which the link would be created has been removed // by RmdirAt or RenameAt, LinkAt returns ENOENT. // // - If rp.Mount != vd.Mount(), LinkAt returns EXDEV. // // - If vd represents a directory, LinkAt returns EPERM. // // - If vd represents a file for which all existing links have been // removed, or a file created by open(O_TMPFILE|O_EXCL), LinkAt returns // ENOENT. Equivalently, if vd represents a file with a link count of 0 not // created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If LinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error // MkdirAt creates a directory at rp. // // Errors: // // - If the last path component in rp is "." or "..", MkdirAt returns // EEXIST. // // - If a file already exists at rp, MkdirAt returns EEXIST. // // - If the directory in which the new directory would be created has been // removed by RmdirAt or RenameAt, MkdirAt returns ENOENT. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If MkdirAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error // MknodAt creates a regular file, device special file, or named pipe at // rp. // // Errors: // // - If the last path component in rp is "." or "..", MknodAt returns // EEXIST. // // - If a file already exists at rp, MknodAt returns EEXIST. // // - If rp.MustBeDir(), MknodAt returns ENOENT. // // - If the directory in which the file would be created has been removed // by RmdirAt or RenameAt, MknodAt returns ENOENT. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If MknodAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error // OpenAt returns an FileDescription providing access to the file at rp. A // reference is taken on the returned FileDescription. 
// // Errors: // // - If opts.Flags specifies O_TMPFILE and this feature is unsupported by // the implementation, OpenAt returns EOPNOTSUPP. (All other unsupported // features are silently ignored, consistently with Linux's open*(2).) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) // ReadlinkAt returns the target of the symbolic link at rp. // // Errors: // // - If the file at rp is not a symbolic link, ReadlinkAt returns EINVAL. ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) // RenameAt renames the file named oldName in directory oldParentVD to rp. // It does not take ownership of references on oldParentVD. // // Errors [1]: // // - If opts.Flags specifies unsupported options, RenameAt returns EINVAL. // // - If the last path component in rp is "." or "..", and opts.Flags // contains RENAME_NOREPLACE, RenameAt returns EEXIST. // // - If the last path component in rp is "." or "..", and opts.Flags does // not contain RENAME_NOREPLACE, RenameAt returns EBUSY. // // - If rp.Mount != oldParentVD.Mount(), RenameAt returns EXDEV. // // - If the renamed file is not a directory, and opts.MustBeDir is true, // RenameAt returns ENOTDIR. // // - If renaming would replace an existing file and opts.Flags contains // RENAME_NOREPLACE, RenameAt returns EEXIST. // // - If there is no existing file at rp and opts.Flags contains // RENAME_EXCHANGE, RenameAt returns ENOENT. // // - If there is an existing non-directory file at rp, and rp.MustBeDir() // is true, RenameAt returns ENOTDIR. // // - If the renamed file is not a directory, opts.Flags does not contain // RENAME_EXCHANGE, and rp.MustBeDir() is true, RenameAt returns ENOTDIR. // (This check is not subsumed by the check for directory replacement below // since it applies even if there is no file to replace.) // // - If the renamed file is a directory, and the new parent directory of // the renamed file is either the renamed directory or a descendant // subdirectory of the renamed directory, RenameAt returns EINVAL. // // - If renaming would exchange the renamed file with an ancestor directory // of the renamed file, RenameAt returns EINVAL. // // - If renaming would replace an ancestor directory of the renamed file, // RenameAt returns ENOTEMPTY. (This check would be subsumed by the // non-empty directory check below; however, this check takes place before // the self-rename check.) // // - If the renamed file would replace or exchange with itself (i.e. the // source and destination paths resolve to the same file), RenameAt returns // nil, skipping the checks described below. // // - If the source or destination directory is not writable by the provider // of rp.Credentials(), RenameAt returns EACCES. // // - If the renamed file is a directory, and renaming would replace a // non-directory file, RenameAt returns ENOTDIR. // // - If the renamed file is not a directory, and renaming would replace a // directory, RenameAt returns EISDIR. // // - If the new parent directory of the renamed file has been removed by // RmdirAt or a preceding call to RenameAt, RenameAt returns ENOENT. // // - If the renamed file is a directory, it is not writable by the // provider of rp.Credentials(), and the source and destination parent // directories are different, RenameAt returns EACCES. (This is nominally // required to change the ".." entry in the renamed directory.) // // - If renaming would replace a non-empty directory, RenameAt returns // ENOTEMPTY. // // Preconditions: // * !rp.Done(). 
// * For the final path component in rp, !rp.ShouldFollowSymlink(). // * oldParentVD.Dentry() was obtained from a previous call to // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). // * oldName is not "." or "..". // // Postconditions: If RenameAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). // // [1] "The worst of all namespace operations - renaming directory. // "Perverted" doesn't even start to describe it. Somebody in UCB had a // heck of a trip..." - fs/namei.c:vfs_rename() RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error // RmdirAt removes the directory at rp. // // Errors: // // - If the last path component in rp is ".", RmdirAt returns EINVAL. // // - If the last path component in rp is "..", RmdirAt returns ENOTEMPTY. // // - If no file exists at rp, RmdirAt returns ENOENT. // // - If the file at rp exists but is not a directory, RmdirAt returns // ENOTDIR. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If RmdirAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). RmdirAt(ctx context.Context, rp *ResolvingPath) error // SetStatAt updates metadata for the file at the given path. Implementations // are responsible for checking if the operation can be performed // (see vfs.CheckSetStat() for common checks). // // Errors: // // - If opts specifies unsupported options, SetStatAt returns EINVAL. SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error // StatAt returns metadata for the file at rp. StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) // StatFSAt returns metadata for the filesystem containing the file at rp. // (This method takes a path because a FilesystemImpl may consist of any // number of constituent filesystems.) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) // SymlinkAt creates a symbolic link at rp referring to the given target. // // Errors: // // - If the last path component in rp is "." or "..", SymlinkAt returns // EEXIST. // // - If a file already exists at rp, SymlinkAt returns EEXIST. // // - If rp.MustBeDir(), SymlinkAt returns ENOENT. // // - If the directory in which the symbolic link would be created has been // removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If SymlinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error // UnlinkAt removes the file at rp. // // Errors: // // - If the last path component in rp is "." or "..", UnlinkAt returns // EISDIR. // // - If no file exists at rp, UnlinkAt returns ENOENT. // // - If rp.MustBeDir(), and the file at rp exists and is not a directory, // UnlinkAt returns ENOTDIR. // // - If the file at rp exists but is a directory, UnlinkAt returns EISDIR. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If UnlinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). UnlinkAt(ctx context.Context, rp *ResolvingPath) error // ListXattrAt returns all extended attribute names for the file at rp. 
// // Errors: // // - If extended attributes are not supported by the filesystem, // ListXattrAt returns ENOTSUP. // // - If the size of the list (including a NUL terminating byte after every // entry) would exceed size, ERANGE may be returned. Note that // implementations are free to ignore size entirely and return without // error). In all cases, if size is 0, the list should be returned without // error, regardless of size. ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) // GetXattrAt returns the value associated with the given extended // attribute for the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, GetXattrAt // returns ENOTSUP. // // - If an extended attribute named opts.Name does not exist, ENODATA is // returned. // // - If the size of the return value exceeds opts.Size, ERANGE may be // returned (note that implementations are free to ignore opts.Size entirely // and return without error). In all cases, if opts.Size is 0, the value // should be returned without error, regardless of size. GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) // SetXattrAt changes the value associated with the given extended // attribute for the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, SetXattrAt // returns ENOTSUP. // // - If XATTR_CREATE is set in opts.Flag and opts.Name already exists, // EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist, // ENODATA is returned. SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error // RemoveXattrAt removes the given extended attribute from the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, // RemoveXattrAt returns ENOTSUP. // // - If name does not exist, ENODATA is returned. RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error // BoundEndpointAt returns the Unix socket endpoint bound at the path rp. // // Errors: // // - If the file does not have write permissions, then BoundEndpointAt // returns EACCES. // // - If a non-socket file exists at rp, then BoundEndpointAt returns // ECONNREFUSED. BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) // PrependPath prepends a path from vd to vd.Mount().Root() to b. // // If vfsroot.Ok(), it is the contextual VFS root; if it is encountered // before vd.Mount().Root(), PrependPath should stop prepending path // components and return a PrependPathAtVFSRootError. // // If traversal of vd.Dentry()'s ancestors encounters an independent // ("root") Dentry that is not vd.Mount().Root() (i.e. vd.Dentry() is not a // descendant of vd.Mount().Root()), PrependPath should stop prepending // path components and return a PrependPathAtNonMountRootError. // // Filesystems for which Dentries do not have meaningful paths may prepend // an arbitrary descriptive string to b and then return a // PrependPathSyntheticError. // // Most implementations can acquire the appropriate locks to ensure that // Dentry.Name() and Dentry.Parent() are fixed for vd.Dentry() and all of // its ancestors, then call GenericPrependPath. // // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl. PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error // MountOptions returns mount options for the current filesystem. 
This // should only return options specific to the filesystem (i.e. don't return // "ro", "rw", etc). Options should be returned as a comma-separated string, // similar to the input to the 5th argument to mount. // // If the implementation has no filesystem-specific options, it should // return the empty string. MountOptions() string // IsDescendant returns true if vd is a descendant of vfsroot or if vd and // vfsroot are the same dentry. The method does not take filesystem locks when // accessing the parents of each dentry, so it's possible for parents to be // mutated concurrently during a call to IsDescendant. Callers should take // appropriate caution when using this method. // // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl. IsDescendant(vfsroot, vd VirtualDentry) bool } // PrependPathAtVFSRootError is returned by implementations of // FilesystemImpl.PrependPath() when they encounter the contextual VFS root. // // +stateify savable type PrependPathAtVFSRootError struct{} // Error implements error.Error. func (PrependPathAtVFSRootError) Error() string { return "vfs.FilesystemImpl.PrependPath() reached VFS root" } // PrependPathAtNonMountRootError is returned by implementations of // FilesystemImpl.PrependPath() when they encounter an independent ancestor // Dentry that is not the Mount root. // // +stateify savable type PrependPathAtNonMountRootError struct{} // Error implements error.Error. func (PrependPathAtNonMountRootError) Error() string { return "vfs.FilesystemImpl.PrependPath() reached root other than Mount root" } // PrependPathSyntheticError is returned by implementations of // FilesystemImpl.PrependPath() for which prepended names do not represent real // paths. // // +stateify savable type PrependPathSyntheticError struct{} // Error implements error.Error. func (PrependPathSyntheticError) Error() string { return "vfs.FilesystemImpl.PrependPath() prepended synthetic name" } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/filesystem_impl_util.go000066400000000000000000000034301465435605700263370ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" ) // GenericParseMountOptions parses a comma-separated list of options of the // form "key" or "key=value", where neither key nor value contain commas, and // returns it as a map. If str contains duplicate keys, then the last value // wins. For example: // // str = "key0=value0,key1,key2=value2,key0=value3" -> map{'key0':'value3','key1':'','key2':'value2'} // // GenericParseMountOptions is not appropriate if values may contain commas, // e.g. in the case of the mpol mount option for tmpfs(5).
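//
// Illustrative sketch of typical use from a FilesystemType.GetFilesystem
// implementation (the "mode" option and its handling are hypothetical):
//
//	mopts := GenericParseMountOptions(opts.Data)
//	if v, ok := mopts["mode"]; ok {
//		// Parse and apply the hypothetical "mode" option from v here.
//		_ = v
//	}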
func GenericParseMountOptions(str string) map[string]string { m := make(map[string]string) for _, opt := range strings.Split(str, ",") { if len(opt) > 0 { res := strings.SplitN(opt, "=", 2) if len(res) == 2 { m[res[0]] = res[1] } else { m[opt] = "" } } } return m } // GenericStatFS returns a statfs struct filled with the common fields for a // general filesystem. This is analogous to Linux's fs/libfs.cs:simple_statfs(). func GenericStatFS(fsMagic uint64) linux.Statfs { return linux.Statfs{ Type: fsMagic, BlockSize: hostarch.PageSize, NameLength: linux.NAME_MAX, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/filesystem_refs.go000066400000000000000000000102161465435605700253000ustar00rootroot00000000000000package vfs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const FilesystemenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var Filesystemobj *Filesystem // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type FilesystemRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *FilesystemRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *FilesystemRefs) RefType() string { return fmt.Sprintf("%T", Filesystemobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *FilesystemRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *FilesystemRefs) LogRefs() bool { return FilesystemenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *FilesystemRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *FilesystemRefs) IncRef() { v := r.refCount.Add(1) if FilesystemenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. 
// // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *FilesystemRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if FilesystemenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *FilesystemRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if FilesystemenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *FilesystemRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/filesystem_type.go000066400000000000000000000112231465435605700253210ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "bytes" "fmt" "strings" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // A FilesystemType constructs filesystems. // // FilesystemType is analogous to Linux's struct file_system_type. type FilesystemType interface { // GetFilesystem returns a Filesystem configured by the given options, // along with its mount root. A reference is taken on the returned // Filesystem and Dentry whose ownership is transferred to the caller. GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) // Name returns the name of this FilesystemType. Name() string // Release releases all resources held by this FilesystemType. Release(ctx context.Context) } // GetFilesystemOptions contains options to FilesystemType.GetFilesystem. type GetFilesystemOptions struct { // InternalMount indicates whether the mount operation is coming from the // application, i.e. through mount(2). If InternalMount is true, allow the use // of filesystem types for which RegisterFilesystemTypeOptions.AllowUserMount // == false. InternalMount bool // Data is the string passed as the 5th argument to mount(2), which is // usually a comma-separated list of filesystem-specific mount options. Data string // InternalData holds opaque FilesystemType-specific data. 
There is // intentionally no way for applications to specify InternalData; if it is // not nil, the call to GetFilesystem originates from within the sentry. InternalData any } // +stateify savable type registeredFilesystemType struct { fsType FilesystemType opts RegisterFilesystemTypeOptions } // RegisterFilesystemTypeOptions contains options to // VirtualFilesystem.RegisterFilesystem(). // // +stateify savable type RegisterFilesystemTypeOptions struct { // AllowUserMount determines whether users are allowed to mount a file system // of this type, i.e. through mount(2). If AllowUserMount is true, allow calls // to VirtualFilesystem.MountAt() for which MountOptions.InternalMount == false // to use this filesystem type. AllowUserMount bool // If AllowUserList is true, make this filesystem type visible in // /proc/filesystems. AllowUserList bool // If RequiresDevice is true, indicate that mounting this filesystem // requires a block device as the mount source in /proc/filesystems. RequiresDevice bool } // RegisterFilesystemType registers the given FilesystemType in vfs with the // given name. func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) error { vfs.fsTypesMu.Lock() defer vfs.fsTypesMu.Unlock() if existing, ok := vfs.fsTypes[name]; ok { return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing.fsType) } vfs.fsTypes[name] = ®isteredFilesystemType{ fsType: fsType, opts: *opts, } return nil } // MustRegisterFilesystemType is equivalent to RegisterFilesystemType but // panics on failure. func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) { if err := vfs.RegisterFilesystemType(name, fsType, opts); err != nil { panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err)) } } func (vfs *VirtualFilesystem) getFilesystemType(name string) *registeredFilesystemType { vfs.fsTypesMu.RLock() defer vfs.fsTypesMu.RUnlock() fsname := name // Fetch a meaningful part of name if there is a dot in the name // and use left part of a string as fname. if strings.Index(name, ".") != -1 { fsname = strings.Split(name, ".")[0] } return vfs.fsTypes[fsname] } // GenerateProcFilesystems emits the contents of /proc/filesystems for vfs to // buf. func (vfs *VirtualFilesystem) GenerateProcFilesystems(buf *bytes.Buffer) { vfs.fsTypesMu.RLock() defer vfs.fsTypesMu.RUnlock() for name, rft := range vfs.fsTypes { if !rft.opts.AllowUserList { continue } var nodev string if !rft.opts.RequiresDevice { nodev = "nodev" } fmt.Fprintf(buf, "%s\t%s\n", nodev, name) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/inotify.go000066400000000000000000000555411465435605700235700ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package vfs import ( "bytes" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // inotifyEventBaseSize is the base size of linux's struct inotify_event. This // must be a power of 2 for rounding below. const inotifyEventBaseSize = 16 // EventType defines different kinds of inotify events. // // The way events are labelled appears somewhat arbitrary, but they must match // Linux so that IN_EXCL_UNLINK behaves as it does in Linux. // // +stateify savable type EventType uint8 // PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and // FSNOTIFY_EVENT_INODE in Linux. const ( PathEvent EventType = iota InodeEvent EventType = iota ) // Inotify represents an inotify instance created by inotify_init(2) or // inotify_init1(2). Inotify implements FileDescriptionImpl. // // +stateify savable type Inotify struct { vfsfd FileDescription FileDescriptionDefaultImpl DentryMetadataFileDescriptionImpl NoLockFD // Unique identifier for this inotify instance. We don't just reuse the // inotify fd because fds can be duped. These should not be exposed to the // user, since we may aggressively reuse an id on S/R. id uint64 // queue is used to notify interested parties when the inotify instance // becomes readable or writable. queue waiter.Queue // evMu *only* protects the events list. We need a separate lock while // queuing events: using mu may violate lock ordering, since at that point // the calling goroutine may already hold Watches.mu. evMu inotifyEventMutex `state:"nosave"` // A list of pending events for this inotify instance. Protected by evMu. events eventList // A scratch buffer, used to serialize inotify events. Allocate this // ahead of time for the sake of performance. Protected by evMu. scratch []byte // mu protects the fields below. mu inotifyMutex `state:"nosave"` // nextWatchMinusOne is used to allocate watch descriptors on this Inotify // instance. Note that Linux starts numbering watch descriptors from 1. nextWatchMinusOne int32 // Map from watch descriptors to watch objects. watches map[int32]*Watch } var _ FileDescriptionImpl = (*Inotify)(nil) // NewInotifyFD constructs a new Inotify instance. func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) { // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs. flags &^= linux.O_CLOEXEC if flags&^linux.O_NONBLOCK != 0 { return nil, linuxerr.EINVAL } id := uniqueid.GlobalFromContext(ctx) vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id)) defer vd.DecRef(ctx) fd := &Inotify{ id: id, scratch: make([]byte, inotifyEventBaseSize), watches: make(map[int32]*Watch), } if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ UseDentryMetadata: true, DenyPRead: true, DenyPWrite: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } // Release implements FileDescriptionImpl.Release. Release removes all // watches and frees all resources for an inotify instance. func (i *Inotify) Release(ctx context.Context) { var ds []*Dentry // We need to hold i.mu to avoid a race with concurrent calls to // Inotify.handleDeletion from Watches.
There's no risk of Watches // accessing this Inotify after the destructor ends, because we remove all // references to it below. i.mu.Lock() for _, w := range i.watches { // Remove references to the watch from the watches set on the target. We // don't need to worry about the references from i.watches, since this // file description is about to be destroyed. d := w.target ws := d.Watches() // Watchable dentries should never return a nil watch set. if ws == nil { panic("Cannot remove watch from an unwatchable dentry") } ws.Remove(i.id) if ws.Size() == 0 { ds = append(ds, d) } } i.mu.Unlock() for _, d := range ds { d.OnZeroWatches(ctx) } } // Allocate implements FileDescription.Allocate. func (i *Inotify) Allocate(ctx context.Context, mode, offset, length uint64) error { panic("Allocate should not be called on read-only inotify fds") } // EventRegister implements waiter.Waitable. func (i *Inotify) EventRegister(e *waiter.Entry) error { i.queue.EventRegister(e) return nil } // EventUnregister implements waiter.Waitable. func (i *Inotify) EventUnregister(e *waiter.Entry) { i.queue.EventUnregister(e) } // Readiness implements waiter.Waitable.Readiness. // // Readiness indicates whether there are pending events for an inotify instance. func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { ready := waiter.EventMask(0) i.evMu.Lock() defer i.evMu.Unlock() if !i.events.Empty() { ready |= waiter.ReadableEvents } return mask & ready } // Epollable implements FileDescriptionImpl.Epollable. func (i *Inotify) Epollable() bool { return true } // PRead implements FileDescriptionImpl.PRead. func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, linuxerr.ESPIPE } // PWrite implements FileDescriptionImpl.PWrite. func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { return 0, linuxerr.ESPIPE } // Write implements FileDescriptionImpl.Write. func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { return 0, linuxerr.EBADF } // Read implements FileDescriptionImpl.Read. func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { if dst.NumBytes() < inotifyEventBaseSize { return 0, linuxerr.EINVAL } i.evMu.Lock() defer i.evMu.Unlock() if i.events.Empty() { // Nothing to read yet, tell caller to block. return 0, linuxerr.ErrWouldBlock } var writeLen int64 for it := i.events.Front(); it != nil; { // Advance `it` before the element is removed from the list, or else // it.Next() will always be nil. event := it it = it.Next() // Does the buffer have enough remaining space to hold the event we're // about to write out? if dst.NumBytes() < int64(event.sizeOf()) { if writeLen > 0 { // Buffer wasn't big enough for all pending events, but we did // write some events out. return writeLen, nil } return 0, linuxerr.EINVAL } // Linux always dequeues an available event as long as there's enough // buffer space to copy it out, even if the copy below fails. Emulate // this behaviour. i.events.Remove(event) // Buffer has enough space, copy event to the read buffer. n, err := event.CopyTo(ctx, i.scratch, dst) if err != nil { return 0, err } writeLen += n dst = dst.DropFirst64(n) } return writeLen, nil } // Ioctl implements FileDescriptionImpl.Ioctl. 
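//
// Only FIONREAD is supported: it reports the total number of bytes that the
// currently queued events would occupy if read now, mirroring Linux's inotify
// behavior. Any other request fails with ENOTTY.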
func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { switch args[1].Int() { case linux.FIONREAD: i.evMu.Lock() var n uint32 for e := i.events.Front(); e != nil; e = e.Next() { n += uint32(e.sizeOf()) } i.evMu.Unlock() var buf [4]byte hostarch.ByteOrder.PutUint32(buf[:], n) _, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) return 0, err default: return 0, linuxerr.ENOTTY } } func (i *Inotify) queueEvent(ev *Event) { i.evMu.Lock() // Check if we should coalesce the event we're about to queue with the last // one currently in the queue. Events are coalesced if they are identical. if last := i.events.Back(); last != nil { if ev.equals(last) { // "Coalesce" the two events by simply not queuing the new one. We // don't need to raise a waiter.EventIn notification because no new // data is available for reading. i.evMu.Unlock() return } } i.events.PushBack(ev) // Release mutex before notifying waiters because we don't control what they // can do. i.evMu.Unlock() i.queue.Notify(waiter.ReadableEvents) } // newWatchLocked creates and adds a new watch to target. // // Precondition: i.mu must be locked. ws must be the watch set for target d. func (i *Inotify) newWatchLocked(d *Dentry, ws *Watches, mask uint32) *Watch { w := &Watch{ owner: i, wd: i.nextWatchIDLocked(), target: d, mask: atomicbitops.FromUint32(mask), } // Hold the watch in this inotify instance as well as the watch set on the // target. i.watches[w.wd] = w ws.Add(w) return w } // nextWatchIDLocked allocates and returns a new watch descriptor. // // Precondition: i.mu must be locked. func (i *Inotify) nextWatchIDLocked() int32 { i.nextWatchMinusOne++ return i.nextWatchMinusOne } // AddWatch constructs a new inotify watch and adds it to the target. It // returns the watch descriptor returned by inotify_add_watch(2). // // The caller must hold a reference on target. func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { // Note: Locking this inotify instance protects the result returned by // Lookup() below. With the lock held, we know for sure the lookup result // won't become stale because it's impossible for *this* instance to // add/remove watches on target. i.mu.Lock() defer i.mu.Unlock() ws := target.Watches() // Does the target already have a watch from this inotify instance? if existing := ws.Lookup(i.id); existing != nil { newmask := mask if mask&linux.IN_MASK_ADD != 0 { // "Add (OR) events to watch mask for this pathname if it already // exists (instead of replacing mask)." -- inotify(7) newmask |= existing.mask.Load() } existing.mask.Store(newmask) return existing.wd } // No existing watch, create a new watch. w := i.newWatchLocked(target, ws, mask) return w.wd } // RmWatch looks up an inotify watch for the given 'wd' and configures the // target to stop sending events to this inotify instance. func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { i.mu.Lock() // Find the watch we were asked to remove. w, ok := i.watches[wd] if !ok { i.mu.Unlock() return linuxerr.EINVAL } // Remove the watch from this instance. delete(i.watches, wd) // Remove the watch from the watch target. ws := w.target.Watches() // AddWatch ensures that w.target has a non-nil watch set. if ws == nil { panic("Watched dentry cannot have nil watch set") } ws.Remove(w.OwnerID()) remaining := ws.Size() i.mu.Unlock() if remaining == 0 { w.target.OnZeroWatches(ctx) } // Generate the event for the removal.
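// Linux reports watch removal to the owner as an IN_IGNORED event, whether the
// watch was removed explicitly (as here, via inotify_rm_watch(2)), because the
// target was deleted (see Watches.HandleDeletion), or because a one-shot watch
// fired.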
i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0)) return nil } // Watches is the collection of all inotify watches on a single file. // // +stateify savable type Watches struct { // mu protects the fields below. mu sync.RWMutex `state:"nosave"` // ws is the map of active watches in this collection, keyed by the inotify // instance id of the owner. ws map[uint64]*Watch } // Size returns the number of watches held by w. func (w *Watches) Size() int { w.mu.Lock() defer w.mu.Unlock() return len(w.ws) } // Lookup returns the watch owned by an inotify instance with the given id. // Returns nil if no such watch exists. // // Precondition: the inotify instance with the given id must be locked to // prevent the returned watch from being concurrently modified or replaced in // Inotify.watches. func (w *Watches) Lookup(id uint64) *Watch { w.mu.Lock() defer w.mu.Unlock() return w.ws[id] } // Add adds watch into this set of watches. // // Precondition: the inotify instance with the given id must be locked. func (w *Watches) Add(watch *Watch) { w.mu.Lock() defer w.mu.Unlock() owner := watch.OwnerID() // Sanity check, we should never have two watches for one owner on the // same target. if _, exists := w.ws[owner]; exists { panic(fmt.Sprintf("Watch collision with ID %+v", owner)) } if w.ws == nil { w.ws = make(map[uint64]*Watch) } w.ws[owner] = watch } // Remove removes a watch with the given id from this set of watches and // releases it. The caller is responsible for generating any watch removal // event, as appropriate. The provided id must match an existing watch in this // collection. // // Precondition: the inotify instance with the given id must be locked. func (w *Watches) Remove(id uint64) { w.mu.Lock() defer w.mu.Unlock() if w.ws == nil { // This watch set is being destroyed. The thread executing the // destructor is already in the process of deleting all our watches. We // got here with no references on the target because we raced with the // destructor notifying all the watch owners of destruction. See the // comment in Watches.HandleDeletion for why this race exists. return } // It is possible for w.Remove() to be called for the same watch multiple // times. See the treatment of one-shot watches in Watches.Notify(). if _, ok := w.ws[id]; ok { delete(w.ws, id) } } // Notify queues a new event with watches in this set. Watches with // IN_EXCL_UNLINK are skipped if the event is coming from a child that has been // unlinked. func (w *Watches) Notify(ctx context.Context, name string, events, cookie uint32, et EventType, unlinked bool) { var hasExpired bool w.mu.RLock() for _, watch := range w.ws { if unlinked && watch.ExcludeUnlinked() && et == PathEvent { continue } if watch.Notify(name, events, cookie) { hasExpired = true } } w.mu.RUnlock() if hasExpired { w.cleanupExpiredWatches(ctx) } } // This function is relatively expensive and should only be called where there // are expired watches. func (w *Watches) cleanupExpiredWatches(ctx context.Context) { // Because of lock ordering, we cannot acquire Inotify.mu for each watch // owner while holding w.mu. As a result, store expired watches locally // before removing. var toRemove []*Watch w.mu.RLock() for _, watch := range w.ws { if watch.expired.Load() == 1 { toRemove = append(toRemove, watch) } } w.mu.RUnlock() for _, watch := range toRemove { watch.owner.RmWatch(ctx, watch.wd) } } // HandleDeletion is called when the watch target is destroyed. 
Clear the // watch set, detach watches from the inotify instances they belong to, and // generate the appropriate events. func (w *Watches) HandleDeletion(ctx context.Context) { w.Notify(ctx, "", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) // As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for // the owner of each watch being deleted. Instead, atomically store the // watches map in a local variable and set it to nil so we can iterate over // it with the assurance that there will be no concurrent accesses. var ws map[uint64]*Watch w.mu.Lock() ws = w.ws w.ws = nil w.mu.Unlock() // Remove each watch from its owner's watch set, and generate a corresponding // watch removal event. for _, watch := range ws { i := watch.owner i.mu.Lock() _, found := i.watches[watch.wd] delete(i.watches, watch.wd) // Release mutex before notifying waiters because we don't control what // they can do. i.mu.Unlock() // If watch was not found, it was removed from the inotify instance before // we could get to it, in which case we should not generate an event. if found { i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) } } } // Watch represent a particular inotify watch created by inotify_add_watch. // // +stateify savable type Watch struct { // Inotify instance which owns this watch. // // This field is immutable after creation. owner *Inotify // Descriptor for this watch. This is unique across an inotify instance. // // This field is immutable after creation. wd int32 // target is a dentry representing the watch target. Its watch set contains this watch. // // This field is immutable after creation. target *Dentry // Events being monitored via this watch. mask atomicbitops.Uint32 // expired is set to 1 to indicate that this watch is a one-shot that has // already sent a notification and therefore can be removed. expired atomicbitops.Int32 } // OwnerID returns the id of the inotify instance that owns this watch. func (w *Watch) OwnerID() uint64 { return w.owner.id } // ExcludeUnlinked indicates whether the watched object should continue to be // notified of events originating from a path that has been unlinked. // // For example, if "foo/bar" is opened and then unlinked, operations on the // open fd may be ignored by watches on "foo" and "foo/bar" with IN_EXCL_UNLINK. func (w *Watch) ExcludeUnlinked() bool { return w.mask.Load()&linux.IN_EXCL_UNLINK != 0 } // Notify queues a new event on this watch. Returns true if this is a one-shot // watch that should be deleted, after this event was successfully queued. func (w *Watch) Notify(name string, events uint32, cookie uint32) bool { if w.expired.Load() == 1 { // This is a one-shot watch that is already in the process of being // removed. This may happen if a second event reaches the watch target // before this watch has been removed. return false } mask := w.mask.Load() if mask&events == 0 { // We weren't watching for this event. return false } // Event mask should include bits matched from the watch plus all control // event bits. unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS effectiveMask := unmaskableBits | mask matchedEvents := effectiveMask & events w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie)) if mask&linux.IN_ONESHOT != 0 { w.expired.Store(1) return true } return false } // Event represents a struct inotify_event from linux. // // +stateify savable type Event struct { eventEntry wd int32 mask uint32 cookie uint32 // len is computed based on the name field is set automatically by // Event.setName. 
It should be 0 when no name is set; otherwise it is the // length of the name slice. len uint32 // The name field has special padding requirements and should only be set by // calling Event.setName. name []byte } func newEvent(wd int32, name string, events, cookie uint32) *Event { e := &Event{ wd: wd, mask: events, cookie: cookie, } if name != "" { e.setName(name) } return e } // paddedBytes converts a go string to a null-terminated c-string, padded with // null bytes to a total size of 'l'. 'l' must be large enough for all the bytes // in the 's' plus at least one null byte. func paddedBytes(s string, l uint32) []byte { if l < uint32(len(s)+1) { panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!") } b := make([]byte, l) copy(b, s) // b was zero-value initialized during make(), so the rest of the slice is // already filled with null bytes. return b } // setName sets the optional name for this event. func (e *Event) setName(name string) { // We need to pad the name such that the entire event length ends up a // multiple of inotifyEventBaseSize. unpaddedLen := len(name) + 1 // Round up to nearest multiple of inotifyEventBaseSize. e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1)) // Make sure we haven't overflowed and wrapped around when rounding. if unpaddedLen > int(e.len) { panic("Overflow when rounding inotify event size, the 'name' field was too big.") } e.name = paddedBytes(name, e.len) } func (e *Event) sizeOf() int { s := inotifyEventBaseSize + int(e.len) if s < inotifyEventBaseSize { panic("Overflowed event size") } return s } // CopyTo serializes this event to dst. buf is used as a scratch buffer to // construct the output. We use a buffer allocated ahead of time for // performance. buf must be at least inotifyEventBaseSize bytes. func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) { hostarch.ByteOrder.PutUint32(buf[0:], uint32(e.wd)) hostarch.ByteOrder.PutUint32(buf[4:], e.mask) hostarch.ByteOrder.PutUint32(buf[8:], e.cookie) hostarch.ByteOrder.PutUint32(buf[12:], e.len) writeLen := 0 n, err := dst.CopyOut(ctx, buf) if err != nil { return 0, err } writeLen += n dst = dst.DropFirst(n) if e.len > 0 { n, err = dst.CopyOut(ctx, e.name) if err != nil { return 0, err } writeLen += n } // Santiy check. if writeLen != e.sizeOf() { panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen)) } return int64(writeLen), nil } func (e *Event) equals(other *Event) bool { return e.wd == other.wd && e.mask == other.mask && e.cookie == other.cookie && e.len == other.len && bytes.Equal(e.name, other.name) } // InotifyEventFromStatMask generates the appropriate events for an operation // that set the stats specified in mask. func InotifyEventFromStatMask(mask uint32) uint32 { var ev uint32 if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 { ev |= linux.IN_ATTRIB } if mask&linux.STATX_SIZE != 0 { ev |= linux.IN_MODIFY } if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) { // Both times indicates a utime(s) call. ev |= linux.IN_ATTRIB } else if mask&linux.STATX_ATIME != 0 { ev |= linux.IN_ACCESS } else if mask&linux.STATX_MTIME != 0 { ev |= linux.IN_MODIFY } return ev } // InotifyRemoveChild sends the appropriate notifications to the watch sets of // the child being removed and its parent. 
Note that unlike most pairs of // parent/child notifications, the child is notified first in this case. func InotifyRemoveChild(ctx context.Context, self, parent *Watches, name string) { if self != nil { self.Notify(ctx, "", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */) } if parent != nil { parent.Notify(ctx, name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */) } } // InotifyRename sends the appropriate notifications to the watch sets of the // file being renamed and its old/new parents. func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) { var dirEv uint32 if isDir { dirEv = linux.IN_ISDIR } cookie := uniqueid.InotifyCookie(ctx) if oldParent != nil { oldParent.Notify(ctx, oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */) } if newParent != nil { newParent.Notify(ctx, newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */) } // Somewhat surprisingly, self move events do not have a cookie. if renamed != nil { renamed.Notify(ctx, "", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/inotify_event_mutex.go000066400000000000000000000033171465435605700262050ustar00rootroot00000000000000package vfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type inotifyEventMutex struct { mu sync.Mutex } var inotifyEventprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var inotifyEventlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type inotifyEventlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *inotifyEventMutex) Lock() { locking.AddGLock(inotifyEventprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *inotifyEventMutex) NestedLock(i inotifyEventlockNameIndex) { locking.AddGLock(inotifyEventprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *inotifyEventMutex) Unlock() { locking.DelGLock(inotifyEventprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *inotifyEventMutex) NestedUnlock(i inotifyEventlockNameIndex) { locking.DelGLock(inotifyEventprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func inotifyEventinitLockNames() {} func init() { inotifyEventinitLockNames() inotifyEventprefixIndex = locking.NewMutexClass(reflect.TypeOf(inotifyEventMutex{}), inotifyEventlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/inotify_mutex.go000066400000000000000000000031601465435605700250000ustar00rootroot00000000000000package vfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type inotifyMutex struct { mu sync.Mutex } var inotifyprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. 
var inotifylockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type inotifylockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *inotifyMutex) Lock() { locking.AddGLock(inotifyprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *inotifyMutex) NestedLock(i inotifylockNameIndex) { locking.AddGLock(inotifyprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *inotifyMutex) Unlock() { locking.DelGLock(inotifyprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *inotifyMutex) NestedUnlock(i inotifylockNameIndex) { locking.DelGLock(inotifyprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func inotifyinitLockNames() {} func init() { inotifyinitLockNames() inotifyprefixIndex = locking.NewMutexClass(reflect.TypeOf(inotifyMutex{}), inotifylockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/lock.go000066400000000000000000000062151465435605700230310ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" fslock "gvisor.dev/gvisor/pkg/sentry/fsimpl/lock" ) // FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2) // and flock(2) respectively in Linux. It can be embedded into various file // implementations that support locking. // // Note that in Linux these two types of locks are _not_ cooperative, because // race and deadlock conditions make merging them prohibitive. We do the same // and keep them oblivious to each other. // // +stateify savable type FileLocks struct { // bsd is a set of BSD-style advisory file wide locks, see flock(2). bsd fslock.Locks // posix is a set of POSIX-style regional advisory locks, see fcntl(2). posix fslock.Locks } // LockBSD tries to acquire a BSD-style lock on the entire file. func (fl *FileLocks) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerID int32, t fslock.LockType, block bool) error { if err := fl.bsd.LockRegion(ctx, uid, ownerID, t, fslock.LockRange{0, fslock.LockEOF}, false, block); err == nil || err == linuxerr.ErrWouldBlock { return err } return linuxerr.ERESTARTSYS } // UnlockBSD releases a BSD-style lock on the entire file. // // This operation is always successful, even if there did not exist a lock on // the requested region held by uid in the first place. 
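// BSD (flock-style) and POSIX (fcntl-style) locks live in separate lock sets
// and never conflict with one another, mirroring Linux. A minimal usage
// sketch against this API (identifiers and the lock-type constant are
// illustrative; uid is typically the FileDescription that took the lock):
//
//	var fl FileLocks
//	if err := fl.LockBSD(ctx, fd, ownerPID, fslock.WriteLock, true /* block */); err != nil {
//		return err
//	}
//	// ... exclusive access to the whole file ...
//	fl.UnlockBSD(fd)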
func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF}) } // LockPOSIX tries to acquire a POSIX-style lock on a file region. func (fl *FileLocks) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block bool) error { _, ofd := uid.(*FileDescription) if err := fl.posix.LockRegion(ctx, uid, ownerPID, t, r, ofd, block); err == nil || err == linuxerr.ErrWouldBlock { return err } return linuxerr.ERESTARTSYS } // UnlockPOSIX releases a POSIX-style lock on a file region. // // This operation is always successful, even if there did not exist a lock on // the requested region held by uid in the first place. func (fl *FileLocks) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { fl.posix.UnlockRegion(uid, r) return nil } // TestPOSIX returns information about whether the specified lock can be held, in the style of the F_GETLK fcntl. func (fl *FileLocks) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { _, ofd := uid.(*FileDescription) return fl.posix.TestRegion(ctx, uid, t, r, ofd), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/memxattr/000077500000000000000000000000001465435605700234075ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/memxattr/memxattr_state_autogen.go000066400000000000000000000014751465435605700305300ustar00rootroot00000000000000// automatically generated by stateify. package memxattr import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (x *SimpleExtendedAttributes) StateTypeName() string { return "pkg/sentry/vfs/memxattr.SimpleExtendedAttributes" } func (x *SimpleExtendedAttributes) StateFields() []string { return []string{ "xattrs", } } func (x *SimpleExtendedAttributes) beforeSave() {} // +checklocksignore func (x *SimpleExtendedAttributes) StateSave(stateSinkObject state.Sink) { x.beforeSave() stateSinkObject.Save(0, &x.xattrs) } func (x *SimpleExtendedAttributes) afterLoad(context.Context) {} // +checklocksignore func (x *SimpleExtendedAttributes) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &x.xattrs) } func init() { state.Register((*SimpleExtendedAttributes)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/memxattr/xattr.go000066400000000000000000000073761465435605700251150ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package memxattr provides a default, in-memory extended attribute // implementation. package memxattr import ( "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // SimpleExtendedAttributes implements extended attributes using a map of // names to values. 
// // SimpleExtendedAttributes calls vfs.CheckXattrPermissions, so callers are not // required to do so. // // +stateify savable type SimpleExtendedAttributes struct { // mu protects the below fields. mu sync.RWMutex `state:"nosave"` xattrs map[string]string } // GetXattr returns the value at 'name'. func (x *SimpleExtendedAttributes) GetXattr(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, opts *vfs.GetXattrOptions) (string, error) { if err := vfs.CheckXattrPermissions(creds, vfs.MayRead, mode, kuid, opts.Name); err != nil { return "", err } x.mu.RLock() value, ok := x.xattrs[opts.Name] x.mu.RUnlock() if !ok { return "", linuxerr.ENODATA } // Check that the size of the buffer provided in getxattr(2) is large enough // to contain the value. if opts.Size != 0 && uint64(len(value)) > opts.Size { return "", linuxerr.ERANGE } return value, nil } // SetXattr sets 'value' at 'name'. func (x *SimpleExtendedAttributes) SetXattr(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, opts *vfs.SetXattrOptions) error { if err := vfs.CheckXattrPermissions(creds, vfs.MayWrite, mode, kuid, opts.Name); err != nil { return err } x.mu.Lock() defer x.mu.Unlock() if x.xattrs == nil { if opts.Flags&linux.XATTR_REPLACE != 0 { return linuxerr.ENODATA } x.xattrs = make(map[string]string) } _, ok := x.xattrs[opts.Name] if ok && opts.Flags&linux.XATTR_CREATE != 0 { return linuxerr.EEXIST } if !ok && opts.Flags&linux.XATTR_REPLACE != 0 { return linuxerr.ENODATA } x.xattrs[opts.Name] = opts.Value return nil } // ListXattr returns all names in xattrs. func (x *SimpleExtendedAttributes) ListXattr(creds *auth.Credentials, size uint64) ([]string, error) { // Keep track of the size of the buffer needed in listxattr(2) for the list. listSize := 0 x.mu.RLock() names := make([]string, 0, len(x.xattrs)) haveCap := creds.HasCapability(linux.CAP_SYS_ADMIN) for n := range x.xattrs { // Hide extended attributes in the "trusted" namespace from // non-privileged users. This is consistent with Linux's // fs/xattr.c:simple_xattr_list(). if !haveCap && strings.HasPrefix(n, linux.XATTR_TRUSTED_PREFIX) { continue } names = append(names, n) // Add one byte per null terminator. listSize += len(n) + 1 } x.mu.RUnlock() if size != 0 && uint64(listSize) > size { return nil, linuxerr.ERANGE } return names, nil } // RemoveXattr removes the xattr at 'name'. func (x *SimpleExtendedAttributes) RemoveXattr(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, name string) error { if err := vfs.CheckXattrPermissions(creds, vfs.MayWrite, mode, kuid, name); err != nil { return err } x.mu.Lock() defer x.mu.Unlock() if _, ok := x.xattrs[name]; !ok { return linuxerr.ENODATA } delete(x.xattrs, name) return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/mount.go000066400000000000000000001375251465435605700232540ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package vfs import ( "bytes" "fmt" "math" "sort" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // MountMax is the maximum number of mounts allowed. In Linux this can be // configured by the user at /proc/sys/fs/mount-max, but the default is // 100,000. We set the gVisor limit to 10,000. const ( MountMax = 10000 nsfsName = "nsfs" cgroupFsName = "cgroup" ) // A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem // (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem // (Mount.fs), which applies to path resolution in the context of a particular // Mount (Mount.key.parent). // // Mounts are reference-counted. Unless otherwise specified, all Mount methods // require that a reference is held. // // Mount and Filesystem are distinct types because it's possible for a single // Filesystem to be mounted at multiple locations and/or in multiple mount // namespaces. // // Mount is analogous to Linux's struct mount. (gVisor does not distinguish // between struct mount and struct vfsmount.) // // +stateify savable type Mount struct { // vfs, fs, root are immutable. References are held on fs and root. // Note that for a disconnected mount, root may be nil. // // Invariant: if not nil, root belongs to fs. vfs *VirtualFilesystem fs *Filesystem root *Dentry // ID is the immutable mount ID. ID uint64 // Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except // for MS_RDONLY which is tracked in "writers". flags is protected by // VirtualFilesystem.mountMu. flags MountFlags // key is protected by VirtualFilesystem.mountMu and // VirtualFilesystem.mounts.seq, and may be nil. References are held on // key.parent and key.point if they are not nil. // // Invariant: key.parent != nil iff key.point != nil. key.point belongs to // key.parent.fs. key mountKey `state:".(VirtualDentry)"` // ns is the namespace in which this Mount was mounted. ns is protected by // VirtualFilesystem.mountMu. ns *MountNamespace // The lower 63 bits of refs are a reference count. The MSB of refs is set // if the Mount has been eagerly umounted, as by umount(2) without the // MNT_DETACH flag. refs is accessed using atomic memory operations. refs atomicbitops.Int64 // children is the set of all Mounts for which Mount.key.parent is this // Mount. children is protected by VirtualFilesystem.mountMu. children map[*Mount]struct{} // isShared indicates this mount has the MS_SHARED propagation type. isShared bool // sharedEntry is an entry in a circular list (ring) of mounts in a shared // peer group. sharedEntry mountEntry // followerList is a list of mounts which has this mount as its leader. followerList followerList // followerEntry is an entry in a followerList. followerEntry // leader is the mount that this mount receives propagation events from. leader *Mount // groupID is the ID for this mount's shared peer group. If the mount is not // in a peer group, this is 0. groupID uint32 // umounted is true if VFS.umountRecursiveLocked() has been called on this // Mount. VirtualFilesystem does not hold a reference on Mounts for which // umounted is true. umounted is protected by VirtualFilesystem.mountMu. umounted bool // locked is true if the mount cannot be unmounted in the current mount // namespace. It is analogous to MNT_LOCKED in Linux. 
locked bool // The lower 63 bits of writers is the number of calls to // Mount.CheckBeginWrite() that have not yet been paired with a call to // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. // writers is accessed using atomic memory operations. writers atomicbitops.Int64 } func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount { mnt := &Mount{ ID: vfs.lastMountID.Add(1), flags: opts.Flags, vfs: vfs, fs: fs, root: root, ns: mntns, locked: opts.Locked, isShared: false, refs: atomicbitops.FromInt64(1), } if opts.ReadOnly { mnt.setReadOnlyLocked(true) } mnt.sharedEntry.Init(mnt) refs.Register(mnt) return mnt } // Options returns a copy of the MountOptions currently applicable to mnt. func (mnt *Mount) Options() MountOptions { mnt.vfs.lockMounts() defer mnt.vfs.unlockMounts(context.Background()) return MountOptions{ Flags: mnt.flags, ReadOnly: mnt.ReadOnlyLocked(), } } // setMountOptions sets mnt's options to the given opts. // // Preconditions: // - vfs.mountMu must be locked. func (mnt *Mount) setMountOptions(opts *MountOptions) error { if opts == nil { return linuxerr.EINVAL } if err := mnt.setReadOnlyLocked(opts.ReadOnly); err != nil { return err } mnt.flags = opts.Flags return nil } // MountFlags returns a bit mask that indicates mount options. func (mnt *Mount) MountFlags() uint64 { mnt.vfs.lockMounts() defer mnt.vfs.unlockMounts(context.Background()) var flags uint64 if mnt.flags.NoExec { flags |= linux.ST_NOEXEC } if mnt.flags.NoATime { flags |= linux.ST_NOATIME } if mnt.flags.NoDev { flags |= linux.ST_NODEV } if mnt.flags.NoSUID { flags |= linux.ST_NOSUID } if mnt.ReadOnlyLocked() { flags |= linux.ST_RDONLY } return flags } func (mnt *Mount) isFollower() bool { return mnt.leader != nil } func (mnt *Mount) neverConnected() bool { return mnt.ns == nil } // coveringMount returns a mount that completely covers mnt if it exists and nil // otherwise. A mount that covers another is one that is the only child of its // parent and whose mountpoint is its parent's root. func (mnt *Mount) coveringMount() *Mount { if len(mnt.children) != 1 { return nil } // Get the child from the children map. var child *Mount for child = range mnt.children { break } if child.point() != mnt.root { return nil } return child } // validInMountNS checks if the mount is valid in the current mount namespace. This includes // checking if has previously been unmounted. It is analogous to fs/namespace.c:check_mnt() in // Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) validInMountNS(ctx context.Context, mnt *Mount) bool { if mntns := MountNamespaceFromContext(ctx); mntns != nil { vfs.delayDecRef(mntns) return mnt.ns == mntns && !mnt.umounted } return false } // NewFilesystem creates a new filesystem object not yet associated with any // mounts. It can be installed into the filesystem tree with ConnectMountAt. // Note that only the filesystem-specific mount options from opts are used by // this function, mount flags are ignored. To set mount flags, pass them to a // corresponding ConnectMountAt. 
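// Sketch of the expected call sequence (error handling elided; target is a
// *PathOperation constructed by the caller), mirroring what MountAt does
// below via MountDisconnected:
//
//	mnt, err := vfsObj.MountDisconnected(ctx, creds, "none", "tmpfs", opts)
//	err = vfsObj.ConnectMountAt(ctx, creds, mnt, target)
//	mnt.DecRef(ctx) // once connected, the VFS holds its own reference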
func (vfs *VirtualFilesystem) NewFilesystem(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*Filesystem, *Dentry, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { return nil, nil, linuxerr.ENODEV } if !opts.GetFilesystemOptions.InternalMount && !rft.opts.AllowUserMount { return nil, nil, linuxerr.ENODEV } return rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) } // NewDisconnectedMount returns a Mount representing fs with the given root // (which may be nil). The new Mount is not associated with any MountNamespace // and is not connected to any other Mounts. References are taken on fs and // root. func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) *Mount { fs.IncRef() if root != nil { root.IncRef() } return newMount(vfs, fs, root, nil /* mntns */, opts) } // MountDisconnected creates a Filesystem configured by the given arguments, // then returns a Mount representing it. The new Mount is not associated with // any MountNamespace and is not connected to any other Mounts. func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) { fs, root, err := vfs.NewFilesystem(ctx, creds, source, fsTypeName, opts) if err != nil { return nil, err } return newMount(vfs, fs, root, nil /* mntns */, opts), nil } // attachTreeLocked attaches the mount tree at mnt to mp and propagates the mount to mp.mount's // peers and followers. This method consumes the reference on mp. It is analogous to // fs/namespace.c:attach_recursive_mnt() in Linux. The mount point mp must have its dentry locked // before calling attachTreeLocked. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) attachTreeLocked(ctx context.Context, mnt *Mount, mp VirtualDentry) error { cleanup := cleanup.Make(func() { vfs.cleanupGroupIDs(mnt.submountsLocked()) // +checklocksforce mp.dentry.mu.Unlock() vfs.delayDecRef(mp) }) defer cleanup.Clean() // This is equivalent to checking for SB_NOUSER in Linux, which is set on all // anon mounts and sentry-internal filesystems like pipefs. if mp.mount.neverConnected() { return linuxerr.EINVAL } defer func() { mp.mount.ns.pending = 0 }() if err := mp.mount.ns.checkMountCount(ctx, mnt); err != nil { return err } var ( propMnts map[*Mount]struct{} err error ) if mp.mount.isShared { if err := vfs.allocMountGroupIDs(mnt, true); err != nil { return err } propMnts, err = vfs.doPropagation(ctx, mnt, mp) if err != nil { for pmnt := range propMnts { if !pmnt.parent().neverConnected() { pmnt.parent().ns.pending -= pmnt.countSubmountsLocked() } vfs.abortUncommitedMount(ctx, pmnt) } return err } } cleanup.Release() if mp.mount.isShared { for _, m := range mnt.submountsLocked() { m.isShared = true } } vfs.mounts.seq.BeginWrite() vfs.connectLocked(mnt, mp, mp.mount.ns) vfs.mounts.seq.EndWrite() mp.dentry.mu.Unlock() vfs.commitChildren(ctx, mnt) var owner *auth.UserNamespace if mntns := MountNamespaceFromContext(ctx); mntns != nil { owner = mntns.Owner mntns.DecRef(ctx) } for pmnt := range propMnts { vfs.commitMount(ctx, pmnt) if pmnt.parent().ns.Owner != owner { vfs.lockMountTree(pmnt) } pmnt.locked = false } return nil } // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) lockMountTree(mnt *Mount) { for _, m := range mnt.submountsLocked() { // TODO(b/315839347): Add equivalents for MNT_LOCK_ATIME, // MNT_LOCK_READONLY, etc. 
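// Marking every submount as locked mirrors Linux's MNT_LOCKED: when a mount
// tree propagates into a mount namespace owned by a different (typically
// less privileged) user namespace, that namespace must not be able to peel
// off child mounts one by one and expose paths the covering mounts were
// hiding. From inside the receiving namespace the effect is (sketch):
//
//	umount2("/propagated/child", 0)    // EINVAL while the child is locked
//	umount2("/propagated", MNT_DETACH) // detaching the whole tree is allowed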
m.locked = true } } // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) mountHasLockedChildren(mnt *Mount, vd VirtualDentry) bool { for child := range mnt.children { mp := child.getKey() if !mp.mount.fs.Impl().IsDescendant(vd, mp) { continue } if child.locked { return true } } return false } // ConnectMountAt connects mnt at the path represented by target. // // Preconditions: mnt must be disconnected. func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error { // We can't hold vfs.mountMu while calling FilesystemImpl methods due to // lock ordering. vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) if err != nil { return err } vfs.lockMounts() defer vfs.unlockMounts(ctx) mp, err := vfs.lockMountpoint(vd) if err != nil { return err } if mp.mount.neverConnected() || mp.mount.umounted { mp.dentry.mu.Unlock() vfs.delayDecRef(mp) return linuxerr.EINVAL } return vfs.attachTreeLocked(ctx, mnt, mp) } // lockMountpoint returns VirtualDentry with a locked Dentry. If vd is a // mountpoint, the method returns a VirtualDentry with a locked Dentry that is // the top most mount stacked on that Dentry. This method consumes a reference // on vd and returns a VirtualDentry with an extra reference. It is analogous to // fs/namespace.c:do_lock_mount() in Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) lockMountpoint(vd VirtualDentry) (VirtualDentry, error) { vd.dentry.mu.Lock() for { if vd.mount.umounted || vd.dentry.dead { vd.dentry.mu.Unlock() vfs.delayDecRef(vd) return VirtualDentry{}, linuxerr.ENOENT } // vd might have been mounted over between vfs.GetDentryAt() and // vfs.mountMu.Lock(). if !vd.dentry.isMounted() { break } nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry) if nextmnt == nil { break } // It's possible that nextmnt has been umounted but not disconnected, // in which case vfs no longer holds a reference on it, and the last // reference may be concurrently dropped even though we're holding // vfs.mountMu. if !nextmnt.tryIncMountedRef() { break } // This can't fail since we're holding vfs.mountMu. nextmnt.root.IncRef() vd.dentry.mu.Unlock() vfs.delayDecRef(vd) vd = VirtualDentry{ mount: nextmnt, dentry: nextmnt.root, } vd.dentry.mu.Lock() } return vd, nil } // CloneMountAt returns a new mount with the same fs, specified root and // mount options. If mount options are nil, mnt's options are copied. The clone // is added to mnt's peer group if mnt is shared. If not the clone is in a // shared peer group by itself. func (vfs *VirtualFilesystem) CloneMountAt(mnt *Mount, root *Dentry, mopts *MountOptions) (*Mount, error) { vfs.lockMounts() defer vfs.unlockMounts(context.Background()) return vfs.cloneMount(mnt, root, mopts, makeSharedClone) } // cloneMount returns a new mount with mnt.fs as the filesystem and root as the // root, with a propagation type specified by cloneType. The returned mount has // an extra reference. If mopts is nil, use the options found in mnt. // This method is analogous to fs/namespace.c:clone_mnt() in Linux. 
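// How cloneType maps onto propagation state, per the body below (sketch):
//
//	makeSharedClone       - clone joins mnt's shared peer group (allocating a
//	                        group ID if needed) and is itself marked shared
//	makeFollowerClone     - clone becomes a follower of mnt (the leader/follower
//	                        relationship is Linux's MS_SLAVE)
//	sharedToFollowerClone - a shared source is downgraded: the clone follows
//	                        mnt instead of joining its peer group
//	makePrivateClone      - clone gets no peer group and no leader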
// // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) cloneMount(mnt *Mount, root *Dentry, mopts *MountOptions, cloneType int) (*Mount, error) { opts := mopts if opts == nil { opts = &MountOptions{ Flags: mnt.flags, ReadOnly: mnt.ReadOnlyLocked(), } } clone := vfs.NewDisconnectedMount(mnt.fs, root, opts) if cloneType&(makeFollowerClone|makePrivateClone|sharedToFollowerClone) != 0 { clone.groupID = 0 } else { clone.groupID = mnt.groupID } if cloneType&makeSharedClone != 0 && clone.groupID == 0 { if err := vfs.allocateGroupID(clone); err != nil { vfs.delayDecRef(clone) return nil, err } } clone.isShared = mnt.isShared clone.locked = mnt.locked if cloneType&makeFollowerClone != 0 || (cloneType&sharedToFollowerClone != 0 && mnt.isShared) { mnt.followerList.PushFront(clone) clone.leader = mnt clone.isShared = false } else if cloneType&makePrivateClone == 0 { if cloneType&makeSharedClone != 0 || mnt.isShared { mnt.sharedEntry.Add(&clone.sharedEntry) } if mnt.isFollower() { mnt.leader.followerList.InsertAfter(mnt, clone) } clone.leader = mnt.leader } else { clone.isShared = false } if cloneType&makeSharedClone != 0 { clone.isShared = true } return clone, nil } type cloneTreeNode struct { prevMount *Mount parentMount *Mount } // cloneMountTree creates a copy of mnt's tree with the specified root // dentry at root. The new descendants are added to mnt's children list but are // not connected with call to connectLocked. // `cloneFunc` is a callback that is executed for each cloned mount. // This method is analogous to fs/namespace.c:copy_tree() in Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) cloneMountTree(ctx context.Context, mnt *Mount, root *Dentry, cloneType int, cloneFunc func(ctx context.Context, oldmnt, newMnt *Mount)) (*Mount, error) { clone, err := vfs.cloneMount(mnt, root, nil, cloneType) if err != nil { return nil, err } if cloneFunc != nil { cloneFunc(ctx, mnt, clone) } queue := []cloneTreeNode{{mnt, clone}} for len(queue) != 0 { p := queue[len(queue)-1] queue = queue[:len(queue)-1] for c := range p.prevMount.children { if mp := c.getKey(); p.prevMount == mnt && !mp.mount.fs.Impl().IsDescendant(VirtualDentry{mnt, root}, mp) { continue } m, err := vfs.cloneMount(c, c.root, nil, cloneType) if err != nil { vfs.abortUncommitedMount(ctx, clone) return nil, err } mp := VirtualDentry{ mount: p.parentMount, dentry: c.point(), } mp.IncRef() m.setKey(mp) if p.parentMount.children == nil { p.parentMount.children = make(map[*Mount]struct{}) } p.parentMount.children[m] = struct{}{} if len(c.children) != 0 { queue = append(queue, cloneTreeNode{c, m}) } if cloneFunc != nil { cloneFunc(ctx, c, m) } } } return clone, nil } // BindAt creates a clone of the source path's parent mount and mounts it at // the target path. The new mount's root dentry is one pointed to by the source // path. func (vfs *VirtualFilesystem) BindAt(ctx context.Context, creds *auth.Credentials, source, target *PathOperation, recursive bool) error { sourceVd, err := vfs.GetDentryAt(ctx, creds, source, &GetDentryOptions{}) if err != nil { return err } defer sourceVd.DecRef(ctx) targetVd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) if err != nil { return err } vfs.lockMounts() defer vfs.unlockMounts(ctx) mp, err := vfs.lockMountpoint(targetVd) if err != nil { return err } cleanup := cleanup.Make(func() { mp.dentry.mu.Unlock() vfs.delayDecRef(mp) // +checklocksforce }) defer cleanup.Clean() // Namespace mounts can be binded to other mount points. 
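// BindAt corresponds to mount(2) with MS_BIND (plus MS_REC when recursive is
// true): the source's existing mount is cloned rather than a new filesystem
// being created. Guest-side sketch:
//
//	mount("/src", "/dst", "", MS_BIND, NULL)        // bind a single mount
//	mount("/src", "/dst", "", MS_BIND|MS_REC, NULL) // bind the whole subtree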
fsName := sourceVd.mount.Filesystem().FilesystemType().Name() if !vfs.validInMountNS(ctx, sourceVd.mount) && fsName != nsfsName && fsName != cgroupFsName { return linuxerr.EINVAL } if !vfs.validInMountNS(ctx, mp.mount) { return linuxerr.EINVAL } var clone *Mount if recursive { clone, err = vfs.cloneMountTree(ctx, sourceVd.mount, sourceVd.dentry, 0, nil) } else { if vfs.mountHasLockedChildren(sourceVd.mount, sourceVd) { return linuxerr.EINVAL } clone, err = vfs.cloneMount(sourceVd.mount, sourceVd.dentry, nil, 0) } if err != nil { return err } cleanup.Release() vfs.delayDecRef(clone) clone.locked = false if err := vfs.attachTreeLocked(ctx, clone, mp); err != nil { vfs.abortUncomittedChildren(ctx, clone) return err } return nil } // RemountAt changes the mountflags and data of an existing mount without having to unmount and remount the filesystem. func (vfs *VirtualFilesystem) RemountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MountOptions) error { vd, err := vfs.getMountpoint(ctx, creds, pop) if err != nil { return err } defer vd.DecRef(ctx) vfs.lockMounts() defer vfs.unlockMounts(ctx) mnt := vd.Mount() if !vfs.validInMountNS(ctx, mnt) { return linuxerr.EINVAL } return mnt.setMountOptions(opts) } // MountAt creates and mounts a Filesystem configured by the given arguments. // The VirtualFilesystem will hold a reference to the Mount until it is // unmounted. // // This method returns the mounted Mount without a reference, for convenience // during VFS setup when there is no chance of racing with unmount. func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) { mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts) if err != nil { return nil, err } defer mnt.DecRef(ctx) if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { return nil, err } return mnt, nil } // UmountAt removes the Mount at the given path. func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error { if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 { return linuxerr.EINVAL } // MNT_FORCE is currently unimplemented except for the permission check. // Force unmounting specifically requires CAP_SYS_ADMIN in the root user // namespace, and not in the owner user namespace for the target mount. See // fs/namespace.c:SYSCALL_DEFINE2(umount, ...) if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { return linuxerr.EPERM } vd, err := vfs.getMountpoint(ctx, creds, pop) if err != nil { return err } defer vd.DecRef(ctx) vfs.lockMounts() defer vfs.unlockMounts(ctx) if vd.mount.locked { return linuxerr.EINVAL } if !vfs.validInMountNS(ctx, vd.mount) { return linuxerr.EINVAL } if vd.mount == vd.mount.ns.root { return linuxerr.EINVAL } if opts.Flags&linux.MNT_DETACH == 0 && vfs.arePropMountsBusy(vd.mount) { return linuxerr.EBUSY } // TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's // root, which we don't implement yet (we'll just fail it since the caller // holds a reference on it). vfs.umountTreeLocked(vd.mount, &umountRecursiveOptions{ eager: opts.Flags&linux.MNT_DETACH == 0, disconnectHierarchy: true, propagate: true, }) return nil } // mountHasExpectedRefs checks that mnt has the correct number of references // before a umount. It is analogous to fs/pnode.c:do_refcount_check(). 
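// Any reference beyond the expected ones (open file descriptions, working
// directories, and so on) marks the mount as busy, which is what makes a
// non-lazy umount fail. Guest-visible behaviour (sketch):
//
//	umount2("/mnt", 0)          // EBUSY while something under /mnt is in use
//	umount2("/mnt", MNT_DETACH) // succeeds; teardown happens lazily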
// // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) mountHasExpectedRefs(mnt *Mount) bool { expectedRefs := int64(1) if !mnt.umounted { expectedRefs++ } if mnt.coveringMount() != nil { expectedRefs++ } return mnt.refs.Load()&^math.MinInt64 == expectedRefs // mask out MSB } // +stateify savable type umountRecursiveOptions struct { // If eager is true, ensure that future calls to Mount.tryIncMountedRef() // on umounted mounts fail. // // eager is analogous to Linux's UMOUNT_SYNC. eager bool // If disconnectHierarchy is true, Mounts that are umounted hierarchically // should be disconnected from their parents. (Mounts whose parents are not // umounted, which in most cases means the Mount passed to the initial call // to umountRecursiveLocked, are unconditionally disconnected for // consistency with Linux.) // // disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED. disconnectHierarchy bool // If propagate is true, mounts located at the same point on the mount's // parent's peers and follows will also be umounted if they do not have any // children. // // propagate is analogous to Linux's UMOUNT_PROPAGATE. propagate bool } // shouldUmount returns if this mount should be disconnected from its parent. // It is analogous to fs/namespace.c:disconnect_mount() in Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) shouldUmount(mnt *Mount, opts *umountRecursiveOptions) bool { // Always disconnect when it's not a lazy unmount. if opts.eager { return true } // If a mount does not have a parent, it won't be disconnected but will be // DecRef-ed. if mnt.parent() == nil { return true } // Always unmount if the parent is not marked as unmounted. if !mnt.parent().umounted { return true } // If the parent is marked as unmounted, we can only unmount is // UMOUNT_CONNECTED is false. if !opts.disconnectHierarchy { return false } if mnt.locked { return false } return true } // umountTreeLocked marks mnt and its descendants as umounted. // // umountTreeLocked is analogous to Linux's fs/namespace.c:umount_tree(). // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) umountTreeLocked(mnt *Mount, opts *umountRecursiveOptions) { if opts.propagate { vfs.unlockPropagationMounts(mnt) } umountMnts := mnt.submountsLocked() for _, mnt := range umountMnts { vfs.umount(mnt) } if opts.propagate { umountMnts = append(umountMnts, vfs.propagateUmount(umountMnts)...) } vfs.mounts.seq.BeginWrite() for _, mnt := range umountMnts { if opts.eager { for { refs := mnt.refs.Load() if refs < 0 { break } if mnt.refs.CompareAndSwap(refs, refs|math.MinInt64) { break } } } if mnt.parent() != nil { vfs.delayDecRef(mnt.getKey()) if vfs.shouldUmount(mnt, opts) { vfs.disconnectLocked(mnt) } else { // Restore mnt in it's parent children list with a reference, but leave // it marked as unmounted. These partly unmounted mounts are cleaned up // in vfs.forgetDeadMountpoints and Mount.destroy. We keep the extra // reference on the mount but remove a reference on the mount point so // that mount.Destroy is called when there are no other references on // the parent. mnt.IncRef() mnt.parent().children[mnt] = struct{}{} } } vfs.setPropagation(mnt, linux.MS_PRIVATE) } vfs.mounts.seq.EndWrite() } // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) umount(mnt *Mount) { if !mnt.umounted { mnt.umounted = true vfs.delayDecRef(mnt) } if parent := mnt.parent(); parent != nil { delete(parent.children, mnt) } } // changeMountpoint disconnects mnt from its current mount point and connects // it to mp. 
It must be called from a vfs.mounts.seq writer critical section. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) changeMountpoint(mnt *Mount, mp VirtualDentry) { mp.dentry.mu.Lock() vfs.delayDecRef(vfs.disconnectLocked(mnt)) vfs.delayDecRef(mnt) mp.IncRef() vfs.connectLocked(mnt, mp, mp.mount.ns) mp.dentry.mu.Unlock() } // connectLocked makes vd the mount parent/point for mnt. It consumes // references held by vd. // // Preconditions: // - vfs.mountMu must be locked. // - vfs.mounts.seq must be in a writer critical section. // - d.mu must be locked. // - mnt.parent() == nil or mnt.parent().children doesn't contain mnt. // i.e. mnt must not already be connected. func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) { if checkInvariants { if mnt.parent() != nil && mnt.parent().children != nil { if _, ok := mnt.parent().children[mnt]; ok { panic("VFS.connectLocked called on connected mount") } } } mnt.IncRef() // dropped by vfs.umount(). mnt.setKey(vd) if vd.mount.children == nil { vd.mount.children = make(map[*Mount]struct{}) } vd.mount.children[mnt] = struct{}{} vd.dentry.mounts.Add(1) mnt.ns = mntns mntns.mountpoints[vd.dentry]++ mntns.mounts++ vfs.mounts.insertSeqed(mnt) vfsmpmounts, ok := vfs.mountpoints[vd.dentry] if !ok { vfsmpmounts = make(map[*Mount]struct{}) vfs.mountpoints[vd.dentry] = vfsmpmounts } vfsmpmounts[mnt] = struct{}{} vfs.maybeResolveMountPromise(vd) } // disconnectLocked makes vd have no mount parent/point and returns its old // mount parent/point with a reference held. // // Preconditions: // - vfs.mountMu must be locked. // - vfs.mounts.seq must be in a writer critical section. // - mnt.parent() != nil. func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry { vd := mnt.getKey() if checkInvariants { if vd.mount == nil { panic("VFS.disconnectLocked called on disconnected mount") } if mnt.ns.mountpoints[vd.dentry] == 0 { panic("VFS.disconnectLocked called on dentry with zero mountpoints.") } if mnt.ns.mounts == 0 { panic("VFS.disconnectLocked called on namespace with zero mounts.") } } delete(vd.mount.children, mnt) vd.dentry.mounts.Add(math.MaxUint32) // -1 mnt.ns.mountpoints[vd.dentry]-- mnt.ns.mounts-- if mnt.ns.mountpoints[vd.dentry] == 0 { delete(mnt.ns.mountpoints, vd.dentry) } vfs.mounts.removeSeqed(mnt) mnt.setKey(VirtualDentry{}) // Clear mnt.key. vfsmpmounts := vfs.mountpoints[vd.dentry] delete(vfsmpmounts, mnt) if len(vfsmpmounts) == 0 { delete(vfs.mountpoints, vd.dentry) } return vd } // tryIncMountedRef increments mnt's reference count and returns true. If mnt's // reference count is already zero, or has been eagerly umounted, // tryIncMountedRef does nothing and returns false. // // tryIncMountedRef does not require that a reference is held on mnt. func (mnt *Mount) tryIncMountedRef() bool { for { r := mnt.refs.Load() if r <= 0 { // r < 0 => MSB set => eagerly unmounted return false } if mnt.refs.CompareAndSwap(r, r+1) { if mnt.LogRefs() { refs.LogTryIncRef(mnt, r+1) } return true } } } // IncRef increments mnt's reference count. func (mnt *Mount) IncRef() { // In general, negative values for mnt.refs are valid because the MSB is // the eager-unmount bit. r := mnt.refs.Add(1) if mnt.LogRefs() { refs.LogIncRef(mnt, r) } } // DecRef decrements mnt's reference count. 
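// The count and the eager-unmount flag share one int64: the low 63 bits are
// the reference count and the most significant bit is set by a non-detaching
// umount, which is why the destruction test below masks the MSB out. A
// minimal sketch of the encoding (illustrative values only):
//
//	r := int64(2)              // two live references
//	r |= math.MinInt64         // eagerly unmounted; tryIncMountedRef now fails
//	live := r &^ math.MinInt64 // == 2; destroy the Mount when this reaches 0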
func (mnt *Mount) DecRef(ctx context.Context) { r := mnt.refs.Add(-1) if mnt.LogRefs() { refs.LogDecRef(mnt, r) } if r&^math.MinInt64 == 0 { // mask out MSB refs.Unregister(mnt) mnt.destroy(ctx) } } func (mnt *Mount) destroy(ctx context.Context) { mnt.vfs.lockMounts() defer mnt.vfs.unlockMounts(ctx) if mnt.parent() != nil { mnt.vfs.mounts.seq.BeginWrite() vd := mnt.vfs.disconnectLocked(mnt) if vd.Ok() { mnt.vfs.delayDecRef(vd) } mnt.vfs.mounts.seq.EndWrite() } // Cleanup any leftover children. The mount point has already been decref'd in // umount so we just need to clean up the actual mounts. if len(mnt.children) != 0 { mnt.vfs.mounts.seq.BeginWrite() for child := range mnt.children { if checkInvariants { if !child.umounted { panic("children of a mount that has no references should already be marked as unmounted.") } } mnt.vfs.disconnectLocked(child) mnt.vfs.delayDecRef(child) } mnt.vfs.mounts.seq.EndWrite() } if mnt.root != nil { mnt.vfs.delayDecRef(mnt.root) } mnt.vfs.delayDecRef(mnt.fs) } // RefType implements refs.CheckedObject.Type. func (mnt *Mount) RefType() string { return "vfs.Mount" } // LeakMessage implements refs.CheckedObject.LeakMessage. func (mnt *Mount) LeakMessage() string { return fmt.Sprintf("[vfs.Mount %p] reference count of %d instead of 0", mnt, mnt.refs.Load()) } // LogRefs implements refs.CheckedObject.LogRefs. // // This should only be set to true for debugging purposes, as it can generate an // extremely large amount of output and drastically degrade performance. func (mnt *Mount) LogRefs() bool { return false } // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes // a reference on the returned Mount. If (mnt, d) is not a mount point, // getMountAt returns nil. // // getMountAt is analogous to Linux's fs/namei.c:follow_mount(). // // Preconditions: References are held on mnt and d. func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount { // The first mount is special-cased: // // - The caller is assumed to have checked d.isMounted() already. (This // isn't a precondition because it doesn't matter for correctness.) // // - We return nil, instead of mnt, if there is no mount at (mnt, d). // // - We don't drop the caller's references on mnt and d. retryFirst: next := vfs.mounts.Lookup(mnt, d) if next == nil { return nil } if !next.tryIncMountedRef() { // Raced with umount. goto retryFirst } mnt = next d = next.root // We don't need to take Dentry refs anywhere in this function because // Mounts hold references on Mount.root, which is immutable. for d.isMounted() { next := vfs.mounts.Lookup(mnt, d) if next == nil { break } if !next.tryIncMountedRef() { // Raced with umount. continue } mnt.DecRef(ctx) mnt = next d = next.root } return mnt } // getMountpoint returns the top mount for the given path. // If the path is not a mountpoint, it returns an error. // // The returned VirtualDentry has an extra reference. func (vfs *VirtualFilesystem) getMountpoint(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, error) { vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) if err != nil { return VirtualDentry{}, err } // Linux passes the LOOKUP_MOUNPOINT flag to user_path_at in ksys_umount to // resolve to the toppmost mount in the stack located at the specified path. // vfs.GetMountAt() imitates this behavior. See fs/namei.c:user_path_at(...) // and fs/namespace.c:ksys_umount(...). 
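// Stacked-mount example (sketch): if tmpfs A is mounted at /mnt and tmpfs B
// is then mounted at /mnt as well, the path now resolves to B's root, so the
// first umount("/mnt") removes B and a second one removes A. The lookup
// below selects that topmost mount.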
if vd.dentry.isMounted() { if mnt := vfs.getMountAt(ctx, vd.mount, vd.dentry); mnt != nil { vd.mount.DecRef(ctx) vd.mount = mnt } } else if vd.dentry != vd.mount.root { vd.DecRef(ctx) return VirtualDentry{}, linuxerr.EINVAL } return vd, nil } // getMountpointAt returns the mount point for the stack of Mounts including // mnt. It takes a reference on the returned VirtualDentry. If no such mount // point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil). // // Preconditions: // - References are held on mnt and root. // - vfsroot is not (mnt, mnt.root). func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry { // The first mount is special-cased: // // - The caller must have already checked mnt against vfsroot. // // - We return nil, instead of mnt, if there is no mount point for mnt. // // - We don't drop the caller's reference on mnt. retryFirst: epoch := vfs.mounts.seq.BeginRead() parent, point := mnt.parent(), mnt.point() if !vfs.mounts.seq.ReadOk(epoch) { goto retryFirst } if parent == nil { return VirtualDentry{} } if !parent.tryIncMountedRef() { // Raced with umount. goto retryFirst } if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can only // happen due to a racing change to Mount.key. parent.DecRef(ctx) goto retryFirst } if !vfs.mounts.seq.ReadOk(epoch) { point.DecRef(ctx) parent.DecRef(ctx) goto retryFirst } mnt = parent d := point for { if mnt == vfsroot.mount && d == vfsroot.dentry { break } if d != mnt.root { break } retryNotFirst: epoch := vfs.mounts.seq.BeginRead() parent, point := mnt.parent(), mnt.point() if !vfs.mounts.seq.ReadOk(epoch) { goto retryNotFirst } if parent == nil { break } if !parent.tryIncMountedRef() { // Raced with umount. goto retryNotFirst } if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can // only happen due to a racing change to Mount.key. parent.DecRef(ctx) goto retryNotFirst } if !vfs.mounts.seq.ReadOk(epoch) { point.DecRef(ctx) parent.DecRef(ctx) goto retryNotFirst } d.DecRef(ctx) mnt.DecRef(ctx) mnt = parent d = point } return VirtualDentry{mnt, d} } // PivotRoot makes location pointed to by newRootPop the root of the current // namespace, and moves the current root to the location pointed to by // putOldPop. If the operation is successful, it returns virtual dentries for // the new root and the old root with an extra reference taken. func (vfs *VirtualFilesystem) PivotRoot(ctx context.Context, creds *auth.Credentials, newRootPop *PathOperation, putOldPop *PathOperation) (newRoot, oldRoot VirtualDentry, err error) { newRoot, err = vfs.GetDentryAt(ctx, creds, newRootPop, &GetDentryOptions{CheckSearchable: true}) if err != nil { return } defer newRoot.DecRef(ctx) oldRoot = RootFromContext(ctx) defer oldRoot.DecRef(ctx) putOldVd, err := vfs.GetDentryAt(ctx, creds, putOldPop, &GetDentryOptions{CheckSearchable: true}) if err != nil { return } vfs.lockMounts() defer vfs.unlockMounts(ctx) putOld, err := vfs.lockMountpoint(putOldVd) if err != nil { return } vfs.delayDecRef(putOld) cleanup := cleanup.Make(func() { putOld.dentry.mu.Unlock() }) defer cleanup.Clean() // Neither new_root nor put_old can be on the same mount as the current // root mount. if newRoot.mount == oldRoot.mount || putOld.mount == oldRoot.mount { return newRoot, oldRoot, linuxerr.EBUSY } // new_root must be a mountpoint. if newRoot.mount.root != newRoot.dentry { return newRoot, oldRoot, linuxerr.EINVAL } // new_root must not be locked. 
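// Taken together, the checks in this function mirror pivot_root(2). The
// canonical caller sequence (e.g. a container runtime switching to its new
// root) is, as a sketch:
//
//	mount(new_root, new_root, NULL, MS_BIND|MS_REC, NULL) // ensure a mount point
//	syscall(SYS_pivot_root, new_root, put_old)
//	chdir("/")
//	umount2(put_old, MNT_DETACH)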
if newRoot.mount.locked { return newRoot, oldRoot, linuxerr.EINVAL } // put_old must be at or underneath new_root. if !vfs.isPathReachable(ctx, newRoot, putOld) { return newRoot, oldRoot, linuxerr.EINVAL } // the new root must be at or underneath the current root. if !vfs.isPathReachable(ctx, oldRoot, newRoot) { return newRoot, oldRoot, linuxerr.EINVAL } // The current root directory must be a mountpoint // (in the case it has been chrooted). if oldRoot.mount.root != oldRoot.dentry { return newRoot, oldRoot, linuxerr.EINVAL } // The current root and the new root must be in the context's mount namespace. if !vfs.validInMountNS(ctx, oldRoot.mount) || !vfs.validInMountNS(ctx, newRoot.mount) { return newRoot, oldRoot, linuxerr.EINVAL } // The current root and the new root cannot be on the rootfs mount. if oldRoot.mount.parent() == nil || newRoot.mount.parent() == nil { return newRoot, oldRoot, linuxerr.EINVAL } // Either the mount point at new_root, or the parent mount of that mount // point, has propagation type MS_SHARED. if newRootParent := newRoot.mount.parent(); newRoot.mount.isShared || newRootParent.isShared { return newRoot, oldRoot, linuxerr.EINVAL } // put_old is a mount point and has the propagation type MS_SHARED. if putOld.mount.root == putOld.dentry && putOld.mount.isShared { return newRoot, oldRoot, linuxerr.EINVAL } cleanup.Release() vfs.mounts.seq.BeginWrite() mp := vfs.disconnectLocked(newRoot.mount) vfs.delayDecRef(mp) rootMp := vfs.disconnectLocked(oldRoot.mount) if oldRoot.mount.locked { newRoot.mount.locked = true oldRoot.mount.locked = false } putOld.IncRef() vfs.connectLocked(oldRoot.mount, putOld, putOld.mount.ns) putOld.dentry.mu.Unlock() rootMp.dentry.mu.Lock() vfs.connectLocked(newRoot.mount, rootMp, rootMp.mount.ns) rootMp.dentry.mu.Unlock() vfs.mounts.seq.EndWrite() vfs.delayDecRef(newRoot.mount) vfs.delayDecRef(oldRoot.mount) newRoot.IncRef() oldRoot.IncRef() return } // SetMountReadOnly sets the mount as ReadOnly. func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error { vfs.lockMounts() defer vfs.unlockMounts(context.Background()) return mnt.setReadOnlyLocked(ro) } // CheckBeginWrite increments the counter of in-progress write operations on // mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns // EROFS. // // If CheckBeginWrite succeeds, EndWrite must be called when the write // operation is finished. func (mnt *Mount) CheckBeginWrite() error { if mnt.writers.Add(1) < 0 { mnt.writers.Add(-1) return linuxerr.EROFS } return nil } // EndWrite indicates that a write operation signaled by a previous successful // call to CheckBeginWrite has finished. func (mnt *Mount) EndWrite() { mnt.writers.Add(-1) } // Preconditions: VirtualFilesystem.mountMu must be locked. func (mnt *Mount) setReadOnlyLocked(ro bool) error { if oldRO := mnt.writers.Load() < 0; oldRO == ro { return nil } if ro { if !mnt.writers.CompareAndSwap(0, math.MinInt64) { return linuxerr.EBUSY } return nil } // Unset MSB without dropping any temporary increments from failed calls to // mnt.CheckBeginWrite(). mnt.writers.Add(math.MinInt64) return nil } // ReadOnly returns true if mount is readonly. func (mnt *Mount) ReadOnly() bool { mnt.vfs.lockMounts() defer mnt.vfs.unlockMounts(context.Background()) return mnt.writers.Load() < 0 } // ReadOnlyLocked returns true if mount is readonly. // // Preconditions: VirtualFilesystem.mountMu must be locked. func (mnt *Mount) ReadOnlyLocked() bool { return mnt.writers.Load() < 0 } // Filesystem returns the mounted Filesystem. 
It does not take a reference on // the returned Filesystem. func (mnt *Mount) Filesystem() *Filesystem { return mnt.fs } // submountsLocked returns this Mount and all Mounts that are descendents of // it. // // Precondition: mnt.vfs.mountMu must be held. func (mnt *Mount) submountsLocked() []*Mount { mounts := []*Mount{mnt} for m := range mnt.children { mounts = append(mounts, m.submountsLocked()...) } return mounts } // countSubmountsLocked returns mnt's total number of descendants including // uncommitted descendants. // // Precondition: mnt.vfs.mountMu must be held. func (mnt *Mount) countSubmountsLocked() uint32 { mounts := uint32(1) for m := range mnt.children { mounts += m.countSubmountsLocked() } return mounts } // Root returns the mount's root. It does not take a reference on the returned // Dentry. func (mnt *Mount) Root() *Dentry { return mnt.root } // GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf. // // Preconditions: taskRootDir.Ok(). func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { rootMnt := taskRootDir.mount vfs.lockMounts() mounts := rootMnt.submountsLocked() // Take a reference on mounts since we need to drop vfs.mountMu before // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()). for _, mnt := range mounts { mnt.IncRef() } vfs.unlockMounts(ctx) defer func() { for _, mnt := range mounts { mnt.DecRef(ctx) } }() sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) for _, mnt := range mounts { // Get the path to this mount relative to task root. mntRootVD := VirtualDentry{ mount: mnt, dentry: mnt.root, } path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) if err != nil { // For some reason we didn't get a path. Log a warning // and run with empty path. ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err) path = "" } if path == "" { // Either an error occurred, or path is not reachable // from root. break } mntOpts := mnt.Options() opts := "rw" if mntOpts.ReadOnly { opts = "ro" } if mntOpts.Flags.NoATime { opts = ",noatime" } if mntOpts.Flags.NoExec { opts += ",noexec" } if mopts := mnt.fs.Impl().MountOptions(); mopts != "" { opts += "," + mopts } // Format: // // // The "needs dump" and "fsck order" flags are always 0, which // is allowed. fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0) } } // GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to // buf. // // Preconditions: taskRootDir.Ok(). func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { rootMnt := taskRootDir.mount vfs.lockMounts() mounts := rootMnt.submountsLocked() // Take a reference on mounts since we need to drop vfs.mountMu before // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or // vfs.StatAt() (=> FilesystemImpl.StatAt()). for _, mnt := range mounts { mnt.IncRef() } vfs.unlockMounts(ctx) defer func() { for _, mnt := range mounts { mnt.DecRef(ctx) } }() sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) creds := auth.CredentialsFromContext(ctx) for _, mnt := range mounts { // Get the path to this mount relative to task root. mntRootVD := VirtualDentry{ mount: mnt, dentry: mnt.root, } pathFromRoot, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) if err != nil { // For some reason we didn't get a path. 
Log a warning // and run with empty path. ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err) continue } if pathFromRoot == "" { // The path is not reachable from root. continue } var pathFromFS string pathFromFS, err = vfs.PathnameInFilesystem(ctx, mntRootVD) if err != nil { // For some reason we didn't get a path. Log a warning // and run with empty path. ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err) continue } if pathFromFS == "" { // The path is not reachable from root. continue } // Stat the mount root to get the major/minor device numbers. pop := &PathOperation{ Root: mntRootVD, Start: mntRootVD, } statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{}) if err != nil { // Well that's not good. Ignore this mount. ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err) continue } // Format: // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) // (1) Mount ID. fmt.Fprintf(buf, "%d ", mnt.ID) // (2) Parent ID (or this ID if there is no parent). // Note that even if the call to mnt.parent() races with Mount // destruction (which is possible since we're not holding vfs.mountMu), // its Mount.ID will still be valid. pID := mnt.ID if p := mnt.parent(); p != nil { pID = p.ID } fmt.Fprintf(buf, "%d ", pID) // (3) Major:Minor device ID. We don't have a superblock, so we // just use the root inode device number. fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor) // (4) Root: the pathname of the directory in the filesystem // which forms the root of this mount. fmt.Fprintf(buf, "%s ", manglePath(pathFromFS)) // (5) Mount point (relative to process root). fmt.Fprintf(buf, "%s ", manglePath(pathFromRoot)) // (6) Mount options. opts := "rw" if mnt.ReadOnly() { opts = "ro" } if mnt.flags.NoATime { opts = ",noatime" } if mnt.flags.NoExec { opts += ",noexec" } fmt.Fprintf(buf, "%s ", opts) // (7) Optional fields: zero or more fields of the form "tag[:value]". fmt.Fprintf(buf, "%s", vfs.generateOptionalTags(ctx, mnt, taskRootDir)) // (8) Separator: the end of the optional fields is marked by a single hyphen. fmt.Fprintf(buf, "- ") // (9) Filesystem type. fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name()) // (10) Mount source: filesystem-specific information or "none". fmt.Fprintf(buf, "none ") // (11) Superblock options, and final newline. fmt.Fprintf(buf, "%s\n", superBlockOpts(pathFromRoot, mnt)) } } // manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents. // See Linux fs/seq_file.c:mangle_path. func manglePath(p string) string { r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134") return r.Replace(p) } // superBlockOpts returns the super block options string for the mount at // the given path. func superBlockOpts(mountPath string, mnt *Mount) string { // Compose super block options by combining global mount flags with // FS-specific mount options. opts := "rw" if mnt.ReadOnly() { opts = "ro" } if mopts := mnt.fs.Impl().MountOptions(); mopts != "" { opts += "," + mopts } // NOTE(b/147673608): If the mount is a ramdisk-based fake cgroupfs, we also // need to include the cgroup name in the options. For now we just read that // from the path. Note that this is only possible when "cgroup" isn't // registered as a valid filesystem type. 
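// A small runnable demo, not gVisor code, of the escaping performed by
// manglePath above (mirroring Linux's fs/seq_file.c:mangle_path): spaces,
// tabs, newlines and backslashes in mount paths become octal escapes so the
// space-separated /proc/[pid]/mountinfo fields stay unambiguous. Only the
// standard library is assumed; the example paths are hypothetical.
package main

import (
	"fmt"
	"strings"
)

func main() {
	mangle := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
	fmt.Println(mangle.Replace("/mnt/my disk")) // /mnt/my\040disk
	fmt.Println(mangle.Replace("/odd\\path"))   // /odd\134path
}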
// // TODO(gvisor.dev/issue/190): Once we removed fake cgroupfs support, we // should remove this. if cgroupfs := mnt.vfs.getFilesystemType("cgroup"); cgroupfs != nil && cgroupfs.opts.AllowUserMount { // Real cgroupfs available. return opts } if mnt.fs.FilesystemType().Name() == "cgroup" { splitPath := strings.Split(mountPath, "/") cgroupType := splitPath[len(splitPath)-1] opts += "," + cgroupType } return opts } func (vfs *VirtualFilesystem) generateOptionalTags(ctx context.Context, mnt *Mount, root VirtualDentry) string { vfs.lockMounts() defer vfs.unlockMounts(ctx) // TODO(b/305893463): Support MS_UNBINDABLE propagation type. var optionalSb strings.Builder if mnt.isShared { optionalSb.WriteString(fmt.Sprintf("shared:%d ", mnt.groupID)) } if mnt.isFollower() { // Per man mount_namespaces(7), propagate_from should not be // included in optional tags if the leader "is the immediate leader of the // mount, or if there is no dominant peer group under the same root". A // dominant peer group is the nearest reachable mount in the leader/follower // chain. optionalSb.WriteString(fmt.Sprintf("master:%d ", mnt.leader.groupID)) var dominant *Mount for m := mnt.leader; m != nil; m = m.leader { if dominant = vfs.peerUnderRoot(ctx, m, mnt.ns, root); dominant != nil { break } } if dominant != nil && dominant != mnt.leader { optionalSb.WriteString(fmt.Sprintf("propagate_from:%d ", dominant.groupID)) } } return optionalSb.String() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/mount_list.go000066400000000000000000000121001465435605700242640ustar00rootroot00000000000000package vfs // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type followerElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (followerElementMapper) linkerFor(elem *Mount) *Mount { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type followerList struct { head *Mount tail *Mount } // Reset resets list l to the empty state. func (l *followerList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *followerList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *followerList) Front() *Mount { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *followerList) Back() *Mount { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *followerList) Len() (count int) { for e := l.Front(); e != nil; e = (followerElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. 
// //go:nosplit func (l *followerList) PushFront(e *Mount) { linker := followerElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { followerElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *followerList) PushFrontList(m *followerList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { followerElementMapper{}.linkerFor(l.head).SetPrev(m.tail) followerElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *followerList) PushBack(e *Mount) { linker := followerElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { followerElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *followerList) PushBackList(m *followerList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { followerElementMapper{}.linkerFor(l.tail).SetNext(m.head) followerElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *followerList) InsertAfter(b, e *Mount) { bLinker := followerElementMapper{}.linkerFor(b) eLinker := followerElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { followerElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *followerList) InsertBefore(a, e *Mount) { aLinker := followerElementMapper{}.linkerFor(a) eLinker := followerElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { followerElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *followerList) Remove(e *Mount) { linker := followerElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { followerElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { followerElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type followerEntry struct { next *Mount prev *Mount } // Next returns the entry that follows e in the list. // //go:nosplit func (e *followerEntry) Next() *Mount { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *followerEntry) Prev() *Mount { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *followerEntry) SetNext(elem *Mount) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. 
// //go:nosplit func (e *followerEntry) SetPrev(elem *Mount) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/mount_namespace_refs.go000066400000000000000000000102011465435605700262640ustar00rootroot00000000000000package vfs import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const namespaceenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var namespaceobj *MountNamespace // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type namespaceRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *namespaceRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *namespaceRefs) RefType() string { return fmt.Sprintf("%T", namespaceobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *namespaceRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *namespaceRefs) LogRefs() bool { return namespaceenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *namespaceRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *namespaceRefs) IncRef() { v := r.refCount.Add(1) if namespaceenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. 
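// The followerList/followerEntry pair above is a generated intrusive list:
// each Mount carries its own prev/next links, so list membership costs no
// extra allocation. The sketch below is illustrative only (not gVisor code)
// and shows the same pattern on a toy node type with hypothetical names.
package main

import "fmt"

// node plays the role of Mount: the links live inside the element itself.
type node struct {
	name       string
	next, prev *node
}

// list mirrors the head/tail shape of followerList.
type list struct{ head, tail *node }

func (l *list) pushBack(n *node) {
	n.next, n.prev = nil, l.tail
	if l.tail != nil {
		l.tail.next = n
	} else {
		l.head = n
	}
	l.tail = n
}

func (l *list) remove(n *node) {
	if n.prev != nil {
		n.prev.next = n.next
	} else if l.head == n {
		l.head = n.next
	}
	if n.next != nil {
		n.next.prev = n.prev
	} else if l.tail == n {
		l.tail = n.prev
	}
	n.next, n.prev = nil, nil
}

func main() {
	var l list
	a, b, c := &node{name: "a"}, &node{name: "b"}, &node{name: "c"}
	l.pushBack(a)
	l.pushBack(b)
	l.pushBack(c)
	l.remove(b)
	for e := l.head; e != nil; e = e.next {
		fmt.Print(e.name, " ")
	}
	fmt.Println() // prints: a c
}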
// //go:nosplit func (r *namespaceRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if namespaceenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *namespaceRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if namespaceenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *namespaceRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/mount_ring.go000066400000000000000000000022641465435605700242620ustar00rootroot00000000000000package vfs // Entry is an element in the circular linked list. // // +stateify savable type mountEntry struct { next *mountEntry prev *mountEntry container *Mount } // Init instantiates an Element to be an item in a ring (circularly-linked // list). // //go:nosplit func (e *mountEntry) Init(container *Mount) { e.next = e e.prev = e e.container = container } // Add adds new to old's ring. // //go:nosplit func (e *mountEntry) Add(new *mountEntry) { next := e.next prev := e next.prev = new new.next = next new.prev = prev e.next = new } // Remove removes e from its ring and reinitializes it. // //go:nosplit func (e *mountEntry) Remove() { next := e.next prev := e.prev next.prev = prev prev.next = next e.Init(e.container) } // Empty returns true if there are no other elements in the ring. // //go:nosplit func (e *mountEntry) Empty() bool { return e.next == e } // Next returns the next containing object pointed to by the list. // //go:nosplit func (e *mountEntry) Next() *Mount { return e.next.container } // Prev returns the previous containing object pointed to by the list. // //go:nosplit func (e *mountEntry) Prev() *Mount { return e.prev.container } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/mount_unsafe.go000066400000000000000000000306661465435605700246130ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "fmt" "math/bits" "sync/atomic" "unsafe" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sync" ) // mountKey represents the location at which a Mount is mounted. 
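// The sketch below is illustrative only (not part of the gVisor sources). It
// reproduces the speculative-reference scheme of namespaceRefs above on the
// standard library's sync/atomic.Int64: TryIncRef first takes a speculative
// reference in the high 32 bits, drops it if the real count (low 32 bits) is
// already zero, and otherwise converts it to a real reference without a CAS
// loop. Panics stand in for gVisor's leak-checking hooks; all names are
// hypothetical.
package main

import (
	"fmt"
	"sync/atomic"
)

const speculativeRef = 1 << 32

type refCount struct{ v atomic.Int64 }

func (r *refCount) IncRef() {
	if r.v.Add(1) <= 1 {
		panic("IncRef on released object")
	}
}

func (r *refCount) TryIncRef() bool {
	if v := r.v.Add(speculativeRef); int32(v) == 0 {
		r.v.Add(-speculativeRef) // object already released; back out
		return false
	}
	r.v.Add(-speculativeRef + 1) // convert speculative ref to a real one
	return true
}

func (r *refCount) DecRef(destroy func()) {
	switch v := r.v.Add(-1); {
	case v < 0:
		panic("DecRef on released object")
	case v == 0:
		if destroy != nil {
			destroy()
		}
	}
}

func main() {
	r := &refCount{}
	r.v.Store(1) // InitRefs: one initial reference
	r.IncRef()
	fmt.Println(r.TryIncRef()) // true
	destroy := func() { fmt.Println("destroyed") }
	r.DecRef(destroy)
	r.DecRef(destroy)
	r.DecRef(destroy)          // prints "destroyed"
	fmt.Println(r.TryIncRef()) // false: the object is already gone
}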
It is // structurally identical to VirtualDentry, but stores its fields as // unsafe.Pointer since mutators synchronize with VFS path traversal using // seqcounts. // // This is explicitly not savable. type mountKey struct { parent unsafe.Pointer // *Mount point unsafe.Pointer // *Dentry } var ( mountKeyHasher = sync.MapKeyHasher(map[mountKey]struct{}(nil)) mountKeySeed = sync.RandUintptr() ) func (k *mountKey) hash() uintptr { return mountKeyHasher(gohacks.Noescape(unsafe.Pointer(k)), mountKeySeed) } func (mnt *Mount) parent() *Mount { return (*Mount)(atomic.LoadPointer(&mnt.key.parent)) } func (mnt *Mount) point() *Dentry { return (*Dentry)(atomic.LoadPointer(&mnt.key.point)) } func (mnt *Mount) getKey() VirtualDentry { return VirtualDentry{ mount: mnt.parent(), dentry: mnt.point(), } } // Invariant: mnt.key.parent == nil. vd.Ok(). func (mnt *Mount) setKey(vd VirtualDentry) { atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount)) atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry)) } // mountTable maps (mount parent, mount point) pairs to mounts. It supports // efficient concurrent lookup, even in the presence of concurrent mutators // (provided mutation is sufficiently uncommon). // // mountTable.Init() must be called on new mountTables before use. type mountTable struct { // mountTable is implemented as a seqcount-protected hash table that // resolves collisions with linear probing, featuring Robin Hood insertion // and backward shift deletion. These minimize probe length variance, // significantly improving the performance of linear probing at high load // factors. (mountTable doesn't use bucketing, which is the other major // technique commonly used in high-performance hash tables; the efficiency // of bucketing is largely due to SIMD lookup, and Go lacks both SIMD // intrinsics and inline assembly, limiting the performance of this // approach.) seq sync.SeqCount `state:"nosave"` // size holds both length (number of elements) and capacity (number of // slots): capacity is stored as its base-2 log (referred to as order) in // the least significant bits of size, and length is stored in the // remaining bits. Go defines bit shifts >= width of shifted unsigned // operand as shifting to 0, which differs from x86's SHL, so the Go // compiler inserts a bounds check for each bit shift unless we mask order // anyway (cf. runtime.bucketShift()), and length isn't used by lookup; // thus this bit packing gets us more bits for the length (vs. storing // length and cap in separate uint32s) for ~free. size atomicbitops.Uint64 slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init } type mountSlot struct { // We don't store keys in slots; instead, we just check Mount.parent and // Mount.point directly. Any practical use of lookup will need to touch // Mounts anyway, and comparing hashes means that false positives are // extremely rare, so this isn't an extra cache line touch overall. value unsafe.Pointer // *Mount hash uintptr } const ( mtSizeOrderBits = 6 // log2 of pointer size in bits mtSizeOrderMask = (1 << mtSizeOrderBits) - 1 mtSizeOrderOne = 1 mtSizeLenLSB = mtSizeOrderBits mtSizeLenOne = 1 << mtSizeLenLSB mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB mountSlotBytes = unsafe.Sizeof(mountSlot{}) mountKeyBytes = unsafe.Sizeof(mountKey{}) // Tuning parameters. // // Essentially every mountTable will contain at least /proc, /sys, and // /dev/shm, so there is ~no reason for mtInitCap to be < 4. 
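// A worked example, illustrative only (not gVisor code), of the size packing
// described above: the capacity's base-2 log (order) lives in the low
// mtSizeOrderBits bits and the element count in the remaining bits, and the
// 13/16 maximum load factor decides when insertSeqed must double the table.
// The constant values mirror those defined for mountTable; the pack helper
// and the program around it are hypothetical.
package main

import "fmt"

const (
	orderBits  = 6                    // mtSizeOrderBits: log2 of pointer size in bits
	orderMask  = (1 << orderBits) - 1 // mtSizeOrderMask
	maxLoadNum = 13                   // mtMaxLoadNum
	maxLoadDen = 16                   // mtMaxLoadDen
)

// pack stores the capacity order in the low bits and the length above them,
// as mountTable.size does.
func pack(length, order uint64) uint64 { return length<<orderBits | order }

func main() {
	size := pack(12, 4) // 12 elements in a 1<<4 = 16 slot table
	length := size >> orderBits
	order := size & orderMask
	fmt.Println(length, order, uint64(1)<<order) // 12 4 16

	// insertSeqed's growth test: (len+1)*maxLoadDen <= maxLoadNum<<order.
	fmt.Println((length+1)*maxLoadDen <= maxLoadNum<<order) // true: 13*16 <= 208, insert in place
	length++
	fmt.Println((length+1)*maxLoadDen <= maxLoadNum<<order) // false: 14*16 > 208, double to 32 slots
}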
mtInitOrder = 2 mtInitCap = 1 << mtInitOrder mtMaxLoadNum = 13 mtMaxLoadDen = 16 ) func init() { // We can't just define mtSizeOrderBits as follows because Go doesn't have // constexpr. if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) { panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits)) } if bits.OnesCount(uint(mountSlotBytes)) != 1 { panic(fmt.Sprintf("sizeof(mountSlotBytes) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes)) } if mtInitCap <= 1 { panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap)) } if mtMaxLoadNum >= mtMaxLoadDen { panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen)) } } // Init must be called exactly once on each mountTable before use. func (mt *mountTable) Init() { mt.size = atomicbitops.FromUint64(mtInitOrder) mt.slots = newMountTableSlots(mtInitCap) } func newMountTableSlots(cap uintptr) unsafe.Pointer { slice := make([]mountSlot, cap, cap) return unsafe.Pointer(&slice[0]) } // Lookup returns the Mount with the given parent, mounted at the given point. // If no such Mount exists, Lookup returns nil. // // Lookup may be called even if there are concurrent mutators of mt. func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount { key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)} hash := key.hash() loop: for { epoch := mt.seq.BeginRead() size := mt.size.Load() slots := atomic.LoadPointer(&mt.slots) if !mt.seq.ReadOk(epoch) { continue } tcap := uintptr(1) << (size & mtSizeOrderMask) mask := tcap - 1 off := (hash & mask) * mountSlotBytes offmask := mask * mountSlotBytes for { // This avoids bounds checking. slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) slotValue := atomic.LoadPointer(&slot.value) slotHash := atomic.LoadUintptr(&slot.hash) if !mt.seq.ReadOk(epoch) { // The element we're looking for might have been moved into a // slot we've previously checked, so restart entirely. continue loop } if slotValue == nil { return nil } if slotHash == hash { mount := (*Mount)(slotValue) var mountKey mountKey mountKey.parent = atomic.LoadPointer(&mount.key.parent) mountKey.point = atomic.LoadPointer(&mount.key.point) if !mt.seq.ReadOk(epoch) { continue loop } if key == mountKey { return mount } } off = (off + mountSlotBytes) & offmask } } } // Range calls f on each Mount in mt. If f returns false, Range stops iteration // and returns immediately. func (mt *mountTable) Range(f func(*Mount) bool) { tcap := uintptr(1) << (mt.size.Load() & mtSizeOrderMask) slotPtr := mt.slots last := unsafe.Pointer(uintptr(mt.slots) + ((tcap - 1) * mountSlotBytes)) for { slot := (*mountSlot)(slotPtr) if slot.value != nil { if !f((*Mount)(slot.value)) { return } } if slotPtr == last { return } slotPtr = unsafe.Pointer(uintptr(slotPtr) + mountSlotBytes) } } // Insert inserts the given mount into mt. // // Preconditions: mt must not already contain a Mount with the same mount point // and parent. func (mt *mountTable) Insert(mount *Mount) { mt.seq.BeginWrite() mt.insertSeqed(mount) mt.seq.EndWrite() } // insertSeqed inserts the given mount into mt. // // Preconditions: // - mt.seq must be in a writer critical section. // - mt must not already contain a Mount with the same mount point and parent. 
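// mountTable.Lookup above tolerates concurrent rehashing by wrapping its
// probe in a seqcount read section and restarting whenever a writer
// intervened. The sketch below is illustrative only (not gVisor code): it
// shows the same retry shape with a plain atomic counter (odd = write in
// progress), deliberately ignoring the memory-model subtleties that gVisor's
// real sync.SeqCount handles; all names are hypothetical.
package main

import (
	"fmt"
	"sync/atomic"
)

type seqCount struct{ seq atomic.Uint32 }

// BeginRead spins until no write is in progress and returns the epoch.
func (s *seqCount) BeginRead() uint32 {
	for {
		if epoch := s.seq.Load(); epoch%2 == 0 {
			return epoch
		}
	}
}

// ReadOk reports whether no write happened since BeginRead returned epoch.
func (s *seqCount) ReadOk(epoch uint32) bool { return s.seq.Load() == epoch }

func (s *seqCount) BeginWrite() { s.seq.Add(1) }
func (s *seqCount) EndWrite()   { s.seq.Add(1) }

type pair struct{ x, y int } // the data protected by the seqcount

func main() {
	var sc seqCount
	p := pair{1, 1}

	// Reader: retry until a consistent snapshot is observed, as Lookup does.
	read := func() pair {
		for {
			epoch := sc.BeginRead()
			snapshot := p
			if sc.ReadOk(epoch) {
				return snapshot
			}
		}
	}

	sc.BeginWrite()
	p = pair{2, 2}
	sc.EndWrite()
	fmt.Println(read()) // {2 2}
}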
func (mt *mountTable) insertSeqed(mount *Mount) { hash := mount.key.hash() // We're under the maximum load factor if: // // (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen // (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap tlen := mt.size.RacyLoad() >> mtSizeLenLSB order := mt.size.RacyLoad() & mtSizeOrderMask tcap := uintptr(1) << order if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) { // Atomically insert the new element into the table. mt.size.Add(mtSizeLenOne) mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash) return } // Otherwise, we have to expand. Double the number of slots in the new // table. newOrder := order + 1 if newOrder > mtSizeOrderMask { panic("mount table size overflow") } newCap := uintptr(1) << newOrder newSlots := newMountTableSlots(newCap) // Copy existing elements to the new table. oldCur := mt.slots // Go does not permit pointers to the end of allocated objects, so we // must use a pointer to the last element of the old table. The // following expression is equivalent to // `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2 // arithmetic instructions instead of 3. oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes)) for { oldSlot := (*mountSlot)(oldCur) if oldSlot.value != nil { mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash) } if oldCur == oldLast { break } oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes) } // Insert the new element into the new table. mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash) // Switch to the new table. mt.size.Add(mtSizeLenOne | mtSizeOrderOne) atomic.StorePointer(&mt.slots, newSlots) } // Preconditions: // - There are no concurrent mutators of the table (slots, cap). // - If the table is visible to readers, then mt.seq must be in a writer // critical section. // - cap must be a power of 2. func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) { mask := cap - 1 off := (hash & mask) * mountSlotBytes offmask := mask * mountSlotBytes disp := uintptr(0) for { slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) slotValue := slot.value if slotValue == nil { atomic.StorePointer(&slot.value, value) atomic.StoreUintptr(&slot.hash, hash) return } // If we've been displaced farther from our first-probed slot than the // element stored in this one, swap elements and switch to inserting // the replaced one. (This is Robin Hood insertion.) slotHash := slot.hash slotDisp := ((off / mountSlotBytes) - slotHash) & mask if disp > slotDisp { atomic.StorePointer(&slot.value, value) atomic.StoreUintptr(&slot.hash, hash) value = slotValue hash = slotHash disp = slotDisp } off = (off + mountSlotBytes) & offmask disp++ } } // Remove removes the given mount from mt. // // Preconditions: // - mt must contain mount. // - mount.key should be valid. func (mt *mountTable) Remove(mount *Mount) { mt.seq.BeginWrite() mt.removeSeqed(mount) mt.seq.EndWrite() } // removeSeqed removes the given mount from mt. // // Preconditions same as Remove() plus: // - mt.seq must be in a writer critical section. func (mt *mountTable) removeSeqed(mount *Mount) { hash := mount.key.hash() tcap := uintptr(1) << (mt.size.RacyLoad() & mtSizeOrderMask) mask := tcap - 1 slots := mt.slots off := (hash & mask) * mountSlotBytes offmask := mask * mountSlotBytes for { slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) slotValue := slot.value if slotValue == unsafe.Pointer(mount) { // Found the element to remove. 
Move all subsequent elements // backward until we either find an empty slot, or an element that // is already in its first-probed slot. (This is backward shift // deletion.) for { nextOff := (off + mountSlotBytes) & offmask nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff)) nextSlotValue := nextSlot.value if nextSlotValue == nil { break } nextSlotHash := nextSlot.hash if (nextOff / mountSlotBytes) == (nextSlotHash & mask) { break } atomic.StorePointer(&slot.value, nextSlotValue) atomic.StoreUintptr(&slot.hash, nextSlotHash) off = nextOff slot = nextSlot } atomic.StorePointer(&slot.value, nil) mt.size.Add(mtSizeLenNegOne) return } if checkInvariants && slotValue == nil { panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount)) } off = (off + mountSlotBytes) & offmask } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/namespace.go000066400000000000000000000163521465435605700240400ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // A MountNamespace is a collection of Mounts.// // MountNamespaces are reference-counted. Unless otherwise specified, all // MountNamespace methods require that a reference is held. // // MountNamespace is analogous to Linux's struct mnt_namespace. // // +stateify savable type MountNamespace struct { // Refs is the reference count for this mount namespace. Refs refs.TryRefCounter // Owner is the usernamespace that owns this mount namespace. Owner *auth.UserNamespace // root is the MountNamespace's root mount. root *Mount // mountpoints maps all Dentries which are mount points in this namespace // to the number of Mounts for which they are mount points. mountpoints is // protected by VirtualFilesystem.mountMu. // // mountpoints is used to determine if a Dentry can be moved or removed // (which requires that the Dentry is not a mount point in the calling // namespace). // // mountpoints is maintained even if there are no references held on the // MountNamespace; this is required to ensure that // VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate // correctly on unreferenced MountNamespaces. mountpoints map[*Dentry]uint32 // mounts is the total number of mounts in this mount namespace. mounts uint32 // pending is the total number of pending mounts in this mount namespace. pending uint32 } // Namespace is the namespace interface. type Namespace interface { Type() string Destroy(ctx context.Context) } // NewMountNamespace returns a new mount namespace with a root filesystem // configured by the given arguments. A reference is taken on the returned // MountNamespace. // // If nsfs is nil, the default reference counter is used. 
func (vfs *VirtualFilesystem) NewMountNamespace( ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions, nsfs NamespaceInodeGetter, ) (*MountNamespace, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { ctx.Warningf("Unknown filesystem type: %s", fsTypeName) return nil, linuxerr.ENODEV } fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { return nil, err } return vfs.NewMountNamespaceFrom(ctx, creds, fs, root, opts, nsfs), nil } type namespaceDefaultRefs struct { namespaceRefs destroy func(ctx context.Context) } func (r *namespaceDefaultRefs) DecRef(ctx context.Context) { r.namespaceRefs.DecRef( func() { r.destroy(ctx) }, ) } // NewMountNamespaceFrom constructs a new mount namespace from an existing // filesystem and its root dentry. This is similar to NewMountNamespace, but // uses an existing filesystem instead of constructing a new one. func (vfs *VirtualFilesystem) NewMountNamespaceFrom( ctx context.Context, creds *auth.Credentials, fs *Filesystem, root *Dentry, opts *MountOptions, nsfs NamespaceInodeGetter, ) *MountNamespace { mntns := &MountNamespace{ Owner: creds.UserNamespace, mountpoints: make(map[*Dentry]uint32), } if nsfs == nil { refs := &namespaceDefaultRefs{destroy: mntns.Destroy} refs.InitRefs() mntns.Refs = refs } else { mntns.Refs = nsfs.GetNamespaceInode(ctx, mntns) } mntns.root = newMount(vfs, fs, root, mntns, opts) return mntns } type cloneEntry struct { prevMount *Mount parentMount *Mount } // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) updateRootAndCWD(ctx context.Context, root *VirtualDentry, cwd *VirtualDentry, src *Mount, dst *Mount) { if root.mount == src { vfs.delayDecRef(root.mount) root.mount = dst root.mount.IncRef() } if cwd.mount == src { vfs.delayDecRef(cwd.mount) cwd.mount = dst cwd.mount.IncRef() } } // NamespaceInodeGetter is an interface that provides the GetNamespaceInode method. type NamespaceInodeGetter interface { GetNamespaceInode(ctx context.Context, ns Namespace) refs.TryRefCounter } // CloneMountNamespace makes a copy of the specified mount namespace. // // If `root` or `cwd` have mounts in the old namespace, they will be replaced // with proper mounts from the new namespace. func (vfs *VirtualFilesystem) CloneMountNamespace( ctx context.Context, creds *auth.Credentials, ns *MountNamespace, root *VirtualDentry, cwd *VirtualDentry, nsfs NamespaceInodeGetter, ) (*MountNamespace, error) { newns := &MountNamespace{ Owner: creds.UserNamespace, mountpoints: make(map[*Dentry]uint32), } newns.Refs = nsfs.GetNamespaceInode(ctx, newns) vfs.lockMounts() defer vfs.unlockMounts(ctx) cloneType := 0 if ns.Owner != newns.Owner { cloneType = sharedToFollowerClone } newRoot, err := vfs.cloneMountTree(ctx, ns.root, ns.root.root, cloneType, func(ctx context.Context, src, dst *Mount) { vfs.updateRootAndCWD(ctx, root, cwd, src, dst) // +checklocksforce: vfs.mountMu is locked. }) if err != nil { newns.DecRef(ctx) return nil, err } newns.root = newRoot newns.root.ns = newns vfs.commitChildren(ctx, newRoot) if ns.Owner != newns.Owner { vfs.lockMountTree(newRoot) } return newns, nil } // Destroy implements nsfs.Namespace.Destroy. func (mntns *MountNamespace) Destroy(ctx context.Context) { vfs := mntns.root.fs.VirtualFilesystem() vfs.lockMounts() vfs.umountTreeLocked(mntns.root, &umountRecursiveOptions{ disconnectHierarchy: true, }) vfs.unlockMounts(ctx) } // Type implements nsfs.Namespace.Type. 
func (mntns *MountNamespace) Type() string { return "mnt" } // IncRef increments mntns' refcount. func (mntns *MountNamespace) IncRef() { mntns.Refs.IncRef() } // DecRef decrements mntns' reference count. func (mntns *MountNamespace) DecRef(ctx context.Context) { mntns.Refs.DecRef(ctx) } // TryIncRef attempts to increment mntns' reference count. func (mntns *MountNamespace) TryIncRef() bool { return mntns.Refs.TryIncRef() } // Root returns mntns' root. If the root is over-mounted, it returns the top // mount. func (mntns *MountNamespace) Root(ctx context.Context) VirtualDentry { vfs := mntns.root.fs.VirtualFilesystem() vd := VirtualDentry{ mount: mntns.root, dentry: mntns.root.root, } vd.IncRef() if !vd.dentry.isMounted() { return vd } m := vfs.getMountAt(ctx, vd.mount, vd.dentry) if m == nil { return vd } vd.DecRef(ctx) vd.mount = m vd.dentry = m.root vd.dentry.IncRef() return vd } func (mntns *MountNamespace) checkMountCount(ctx context.Context, mnt *Mount) error { if mntns.mounts > MountMax { return linuxerr.ENOSPC } if mntns.mounts+mntns.pending > MountMax { return linuxerr.ENOSPC } mnts := mnt.countSubmountsLocked() if mntns.mounts+mntns.pending+mnts > MountMax { return linuxerr.ENOSPC } mntns.pending += mnts return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/opath.go000066400000000000000000000122541465435605700232140ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/usermem" ) // opathFD implements FileDescriptionImpl for a file description opened with O_PATH. // // +stateify savable type opathFD struct { vfsfd FileDescription FileDescriptionDefaultImpl BadLockFD } // Release implements FileDescriptionImpl.Release. func (fd *opathFD) Release(context.Context) { // noop } // Allocate implements FileDescriptionImpl.Allocate. func (fd *opathFD) Allocate(ctx context.Context, mode, offset, length uint64) error { return linuxerr.EBADF } // PRead implements FileDescriptionImpl.PRead. func (fd *opathFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, linuxerr.EBADF } // Read implements FileDescriptionImpl.Read. func (fd *opathFD) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { return 0, linuxerr.EBADF } // PWrite implements FileDescriptionImpl.PWrite. func (fd *opathFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { return 0, linuxerr.EBADF } // Write implements FileDescriptionImpl.Write. func (fd *opathFD) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { return 0, linuxerr.EBADF } // Ioctl implements FileDescriptionImpl.Ioctl. 
func (fd *opathFD) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { return 0, linuxerr.EBADF } // IterDirents implements FileDescriptionImpl.IterDirents. func (fd *opathFD) IterDirents(ctx context.Context, cb IterDirentsCallback) error { return linuxerr.EBADF } // Seek implements FileDescriptionImpl.Seek. func (fd *opathFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return 0, linuxerr.EBADF } // ConfigureMMap implements FileDescriptionImpl.ConfigureMMap. func (fd *opathFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return linuxerr.EBADF } // ListXattr implements FileDescriptionImpl.ListXattr. func (fd *opathFD) ListXattr(ctx context.Context, size uint64) ([]string, error) { return nil, linuxerr.EBADF } // GetXattr implements FileDescriptionImpl.GetXattr. func (fd *opathFD) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { return "", linuxerr.EBADF } // SetXattr implements FileDescriptionImpl.SetXattr. func (fd *opathFD) SetXattr(ctx context.Context, opts SetXattrOptions) error { return linuxerr.EBADF } // RemoveXattr implements FileDescriptionImpl.RemoveXattr. func (fd *opathFD) RemoveXattr(ctx context.Context, name string) error { return linuxerr.EBADF } // Sync implements FileDescriptionImpl.Sync. func (fd *opathFD) Sync(ctx context.Context) error { return linuxerr.EBADF } // SetStat implements FileDescriptionImpl.SetStat. func (fd *opathFD) SetStat(ctx context.Context, opts SetStatOptions) error { return linuxerr.EBADF } // Stat implements FileDescriptionImpl.Stat. func (fd *opathFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { vfsObj := fd.vfsfd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vfsfd.vd, Start: fd.vfsfd.vd, }) stat, err := fd.vfsfd.vd.mount.fs.impl.StatAt(ctx, rp, opts) rp.Release(ctx) return stat, err } // StatFS returns metadata for the filesystem containing the file represented // by fd. func (fd *opathFD) StatFS(ctx context.Context) (linux.Statfs, error) { vfsObj := fd.vfsfd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vfsfd.vd, Start: fd.vfsfd.vd, }) statfs, err := fd.vfsfd.vd.mount.fs.impl.StatFSAt(ctx, rp) rp.Release(ctx) return statfs, err } func (vfs *VirtualFilesystem) openOPathFD(ctx context.Context, creds *auth.Credentials, pop *PathOperation, flags uint32) (*FileDescription, error) { vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) if err != nil { return nil, err } defer vd.DecRef(ctx) if flags&linux.O_DIRECTORY != 0 { stat, err := vfs.StatAt(ctx, creds, &PathOperation{ Root: vd, Start: vd, }, &StatOptions{ Mask: linux.STATX_MODE, }) if err != nil { return nil, err } if stat.Mode&linux.S_IFDIR == 0 { return nil, linuxerr.ENOTDIR } } fd := &opathFD{} if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/options.go000066400000000000000000000236761465435605700236060ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) // GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and // FilesystemImpl.GetDentryAt(). // // +stateify savable type GetDentryOptions struct { // If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that // the returned Dentry is a directory for which creds has search // permission. CheckSearchable bool } // MkdirOptions contains options to VirtualFilesystem.MkdirAt() and // FilesystemImpl.MkdirAt(). // // +stateify savable type MkdirOptions struct { // Mode is the file mode bits for the created directory. Mode linux.FileMode // If ForSyntheticMountpoint is true, FilesystemImpl.MkdirAt() may create // the given directory in memory only (as opposed to persistent storage). // The created directory should be able to support the creation of // subdirectories with ForSyntheticMountpoint == true. It does not need to // support the creation of subdirectories with ForSyntheticMountpoint == // false, or files of other types. // // FilesystemImpls are permitted to ignore the ForSyntheticMountpoint // option. // // The ForSyntheticMountpoint option exists because, unlike mount(2), the // OCI Runtime Specification permits the specification of mount points that // do not exist, under the expectation that container runtimes will create // them. (More accurately, the OCI Runtime Specification completely fails // to document this feature, but it's implemented by runc.) // ForSyntheticMountpoint allows such mount points to be created even when // the underlying persistent filesystem is immutable. ForSyntheticMountpoint bool } // MknodOptions contains options to VirtualFilesystem.MknodAt() and // FilesystemImpl.MknodAt(). // // +stateify savable type MknodOptions struct { // Mode is the file type and mode bits for the created file. Mode linux.FileMode // If Mode specifies a character or block device special file, DevMajor and // DevMinor are the major and minor device numbers for the created device. DevMajor uint32 DevMinor uint32 // Endpoint is the endpoint to bind to the created file, if a socket file is // being created for bind(2) on a Unix domain socket. Endpoint transport.BoundEndpoint } // MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC. // MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers. // // +stateify savable type MountFlags struct { // NoExec is equivalent to MS_NOEXEC. NoExec bool // NoATime is equivalent to MS_NOATIME and indicates that the // filesystem should not update access time in-place. NoATime bool // NoDev is equivalent to MS_NODEV and indicates that the // filesystem should not allow access to devices (special files). // TODO(gVisor.dev/issue/3186): respect this flag in non FUSE // filesystems. NoDev bool // NoSUID is equivalent to MS_NOSUID and indicates that the // filesystem should not honor set-user-ID and set-group-ID bits or // file capabilities when executing programs. 
NoSUID bool } // MountOptions contains options to VirtualFilesystem.MountAt(), and VirtualFilesystem.RemountAt() // // +stateify savable type MountOptions struct { // Flags contains flags as specified for mount(2), e.g. MS_NOEXEC. Flags MountFlags // ReadOnly is equivalent to MS_RDONLY. ReadOnly bool // GetFilesystemOptions contains options to FilesystemType.GetFilesystem(). GetFilesystemOptions GetFilesystemOptions // Locked determines whether to lock this mount so it cannot be unmounted by // normal user processes. Locked bool } // OpenOptions contains options to VirtualFilesystem.OpenAt() and // FilesystemImpl.OpenAt(). // // +stateify savable type OpenOptions struct { // Flags contains access mode and flags as specified for open(2). // // FilesystemImpls are responsible for implementing the following flags: // O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC, // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_SYNC, O_TMPFILE, and // O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and // O_NOFOLLOW. VFS users are responsible for handling O_CLOEXEC, since file // descriptors are mostly outside the scope of VFS. Flags uint32 // If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the // created file. Mode linux.FileMode // FileExec is set when the file is being opened to be executed. // VirtualFilesystem.OpenAt() checks that the caller has execute permissions // on the file, that the file is a regular file, and that the mount doesn't // have MS_NOEXEC set. FileExec bool } // ReadOptions contains options to FileDescription.PRead(), // FileDescriptionImpl.PRead(), FileDescription.Read(), and // FileDescriptionImpl.Read(). // // +stateify savable type ReadOptions struct { // Flags contains flags as specified for preadv2(2). Flags uint32 } // RenameOptions contains options to VirtualFilesystem.RenameAt() and // FilesystemImpl.RenameAt(). // // +stateify savable type RenameOptions struct { // Flags contains flags as specified for renameat2(2). Flags uint32 // If MustBeDir is true, the renamed file must be a directory. MustBeDir bool } // SetStatOptions contains options to VirtualFilesystem.SetStatAt(), // FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and // FileDescriptionImpl.SetStat(). // // +stateify savable type SetStatOptions struct { // Stat is the metadata that should be set. Only fields indicated by // Stat.Mask should be set. // // If Stat specifies that a timestamp should be set, // FilesystemImpl.SetStatAt() and FileDescriptionImpl.SetStat() must // special-case StatxTimestamp.Nsec == UTIME_NOW as described by // utimensat(2); however, they do not need to check for StatxTimestamp.Nsec // == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask // instead). Stat linux.Statx // NeedWritePerm indicates that write permission on the file is needed for // this operation. This is needed for truncate(2) (note that ftruncate(2) // does not require the same check--instead, it checks that the fd is // writable). NeedWritePerm bool } // BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt() // and FilesystemImpl.BoundEndpointAt(). // // +stateify savable type BoundEndpointOptions struct { // Addr is the path of the file whose socket endpoint is being retrieved. // It is generally irrelevant: most endpoints are stored at a dentry that // was created through a bind syscall, so the path can be stored on creation. 
// However, if the endpoint was created in FilesystemImpl.BoundEndpointAt(), // then we may not know what the original bind address was. // // For example, if connect(2) is called with address "foo" which corresponds // a remote named socket in goferfs, we need to generate an endpoint wrapping // that file. In this case, we can use Addr to set the endpoint address to // "foo". Note that Addr is only a best-effort attempt--we still do not know // the exact address that was used on the remote fs to bind the socket (it // may have been "foo", "./foo", etc.). Addr string } // GetXattrOptions contains options to VirtualFilesystem.GetXattrAt(), // FilesystemImpl.GetXattrAt(), FileDescription.GetXattr(), and // FileDescriptionImpl.GetXattr(). // // +stateify savable type GetXattrOptions struct { // Name is the name of the extended attribute to retrieve. Name string // Size is the maximum value size that the caller will tolerate. If the value // is larger than size, getxattr methods may return ERANGE, but they are also // free to ignore the hint entirely (i.e. the value returned may be larger // than size). All size checking is done independently at the syscall layer. Size uint64 } // SetXattrOptions contains options to VirtualFilesystem.SetXattrAt(), // FilesystemImpl.SetXattrAt(), FileDescription.SetXattr(), and // FileDescriptionImpl.SetXattr(). // // +stateify savable type SetXattrOptions struct { // Name is the name of the extended attribute being mutated. Name string // Value is the extended attribute's new value. Value string // Flags contains flags as specified for setxattr/lsetxattr/fsetxattr(2). Flags uint32 } // StatOptions contains options to VirtualFilesystem.StatAt(), // FilesystemImpl.StatAt(), FileDescription.Stat(), and // FileDescriptionImpl.Stat(). // // +stateify savable type StatOptions struct { // Mask is the set of fields in the returned Statx that the FilesystemImpl // or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask. // // The FilesystemImpl or FileDescriptionImpl may return fields not // requested in Mask, and may fail to return fields requested in Mask that // are not supported by the underlying filesystem implementation, without // returning an error. Mask uint32 // Sync specifies the synchronization required, and is one of // linux.AT_STATX_SYNC_AS_STAT (which is 0, and therefore the default), // linux.AT_STATX_SYNC_FORCE_SYNC, or linux.AT_STATX_SYNC_DONT_SYNC. Sync uint32 } // UmountOptions contains options to VirtualFilesystem.UmountAt(). // // +stateify savable type UmountOptions struct { // Flags contains flags as specified for umount2(2). Flags uint32 } // WriteOptions contains options to FileDescription.PWrite(), // FileDescriptionImpl.PWrite(), FileDescription.Write(), and // FileDescriptionImpl.Write(). // // +stateify savable type WriteOptions struct { // Flags contains flags as specified for pwritev2(2). Flags uint32 } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/pathname.go000066400000000000000000000140011465435605700236660ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sync" ) var fspathBuilderPool = sync.Pool{ New: func() any { return &fspath.Builder{} }, } func getFSPathBuilder() *fspath.Builder { return fspathBuilderPool.Get().(*fspath.Builder) } func putFSPathBuilder(b *fspath.Builder) { // No methods can be called on b after b.String(), so reset it to its zero // value (as returned by fspathBuilderPool.New) instead. *b = fspath.Builder{} fspathBuilderPool.Put(b) } // PathnameWithDeleted returns an absolute pathname to vd, consistent with // Linux's d_path(). In particular, if vd.Dentry() has been disowned, // PathnameWithDeleted appends " (deleted)" to the returned pathname. func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { b := getFSPathBuilder() defer putFSPathBuilder(b) haveRef := false defer func() { if haveRef { vd.DecRef(ctx) } }() origD := vd.dentry loop: for { err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b) switch err.(type) { case nil: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { // genericfstree.PrependPath() will have returned // PrependPathAtVFSRootError in this case since it checks // against vfsroot before mnt.root, but other implementations // of FilesystemImpl.PrependPath() may return nil instead. break loop } nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { break loop } if haveRef { vd.DecRef(ctx) } vd = nextVD haveRef = true // continue loop case PrependPathSyntheticError: // Skip prepending "/" and appending " (deleted)". return b.String(), nil case PrependPathAtVFSRootError, PrependPathAtNonMountRootError: break loop default: return "", err } } b.PrependByte('/') if origD.IsDead() { b.AppendString(" (deleted)") } return b.String(), nil } // PathnameReachable returns an absolute pathname to vd, consistent with // Linux's __d_path() (as used by seq_path_root()). If vfsroot.Ok() and vd is // not reachable from vfsroot, such that seq_path_root() would return SEQ_SKIP // (causing the entire containing entry to be skipped), PathnameReachable // returns ("", nil). func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { b := getFSPathBuilder() defer putFSPathBuilder(b) haveRef := false defer func() { if haveRef { vd.DecRef(ctx) } }() loop: for { err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b) switch err.(type) { case nil: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { break loop } nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { return "", nil } if haveRef { vd.DecRef(ctx) } vd = nextVD haveRef = true case PrependPathAtVFSRootError: break loop case PrependPathAtNonMountRootError, PrependPathSyntheticError: return "", nil default: return "", err } } b.PrependByte('/') return b.String(), nil } // PathnameInFilesystem returns an absolute path to vd relative to vd's // Filesystem root. It also appends //deleted to for disowned entries. It is // equivalent to Linux's dentry_path(). 
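// PathnameWithDeleted and its siblings above discover a dentry's path
// leaf-first, walking up mounts while each filesystem prepends components
// into a pooled fspath.Builder. The sketch below is illustrative only (not
// gVisor code): a toy prepending builder recycled through a sync.Pool, in
// the spirit of fspathBuilderPool; the builder type, its methods, and the
// example components are hypothetical.
package main

import (
	"fmt"
	"sync"
)

// prependBuilder is a stand-in for fspath.Builder: components arrive
// leaf-first, so they are prepended rather than appended.
type prependBuilder struct{ parts []string }

func (b *prependBuilder) PrependComponent(c string) {
	b.parts = append([]string{c}, b.parts...)
}

func (b *prependBuilder) String() string {
	out := ""
	for _, p := range b.parts {
		out += "/" + p
	}
	return out
}

// builderPool mirrors fspathBuilderPool: builders are reused across calls
// and reset to their zero value before being put back.
var builderPool = sync.Pool{New: func() any { return &prependBuilder{} }}

func getBuilder() *prependBuilder  { return builderPool.Get().(*prependBuilder) }
func putBuilder(b *prependBuilder) { *b = prependBuilder{}; builderPool.Put(b) }

func main() {
	b := getBuilder()
	defer putBuilder(b)
	for _, name := range []string{"mounts", "self", "proc"} { // leaf first
		b.PrependComponent(name)
	}
	fmt.Println(b.String()) // /proc/self/mounts
}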
func (vfs *VirtualFilesystem) PathnameInFilesystem(ctx context.Context, vd VirtualDentry) (string, error) { b := getFSPathBuilder() defer putFSPathBuilder(b) if vd.dentry.IsDead() { b.PrependString("//deleted") } if err := vd.mount.fs.impl.PrependPath(ctx, VirtualDentry{}, VirtualDentry{dentry: vd.dentry}, b); err != nil { // PrependPath returns an error if it encounters a filesystem root before // the provided vfsroot. We don't provide a vfsroot, so encountering this // error is expected and can be ignored. switch err.(type) { case PrependPathAtNonMountRootError: default: return "", err } } b.PrependByte('/') return b.String(), nil } // PathnameForGetcwd returns an absolute pathname to vd, consistent with // Linux's sys_getcwd(). func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { if vd.dentry.IsDead() { return "", linuxerr.ENOENT } b := getFSPathBuilder() defer putFSPathBuilder(b) haveRef := false defer func() { if haveRef { vd.DecRef(ctx) } }() unreachable := false loop: for { err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b) switch err.(type) { case nil: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { break loop } nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { unreachable = true break loop } if haveRef { vd.DecRef(ctx) } vd = nextVD haveRef = true case PrependPathAtVFSRootError: break loop case PrependPathAtNonMountRootError, PrependPathSyntheticError: unreachable = true break loop default: return "", err } } b.PrependByte('/') if unreachable { b.PrependString("(unreachable)") } return b.String(), nil } // As of this writing, we do not have equivalents to: // // - d_absolute_path(), which returns EINVAL if (effectively) any call to // FilesystemImpl.PrependPath() would return PrependPathAtNonMountRootError. // // - dentry_path(), which does not walk up mounts (and only returns the path // relative to Filesystem root), but also appends "//deleted" for disowned // Dentries. // // These should be added as necessary. golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/permissions.go000066400000000000000000000275371465435605700244660ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "math" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" ) // AccessTypes is a bitmask of Unix file permissions. // // +stateify savable type AccessTypes uint16 // Bits in AccessTypes. const ( MayExec AccessTypes = 1 MayWrite AccessTypes = 2 MayRead AccessTypes = 4 ) // OnlyRead returns true if access _only_ allows read. func (a AccessTypes) OnlyRead() bool { return a == MayRead } // MayRead returns true if access allows read. func (a AccessTypes) MayRead() bool { return a&MayRead != 0 } // MayWrite returns true if access allows write. 
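//
// For illustration (a sketch, not taken from the original sources),
// AccessTypes values compose as an ordinary bitmask:
//
//	ats := MayRead | MayWrite
//	_ = ats.MayRead()  // true
//	_ = ats.MayWrite() // true
//	_ = ats.MayExec()  // false
//	_ = ats.OnlyRead() // false, since write access is also requested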
func (a AccessTypes) MayWrite() bool { return a&MayWrite != 0 } // MayExec returns true if access allows exec. func (a AccessTypes) MayExec() bool { return a&MayExec != 0 } // GenericCheckPermissions checks that creds has the given access rights on a // file with the given permissions, UID, and GID, subject to the rules of // fs/namei.c:generic_permission(). func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { // Check permission bits. perms := uint16(mode.Permissions()) if creds.EffectiveKUID == kuid { perms >>= 6 } else if creds.InGroup(kgid) { perms >>= 3 } if uint16(ats)&perms == uint16(ats) { // All permission bits match, access granted. return nil } // Caller capabilities require that the file's KUID and KGID are mapped in // the caller's user namespace; compare // kernel/capability.c:privileged_wrt_inode_uidgid(). if !kuid.In(creds.UserNamespace).Ok() || !kgid.In(creds.UserNamespace).Ok() { return linuxerr.EACCES } // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary // directories, and read arbitrary non-directory files. if (mode.IsDir() && !ats.MayWrite()) || ats.OnlyRead() { if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) { return nil } } // CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write // access to non-directory files, and execute access to non-directory files // for which at least one execute bit is set. if mode.IsDir() || !ats.MayExec() || (mode.Permissions()&0111 != 0) { if creds.HasCapability(linux.CAP_DAC_OVERRIDE) { return nil } } return linuxerr.EACCES } // MayLink determines whether creating a hard link to a file with the given // mode, kuid, and kgid is permitted. // // This corresponds to Linux's fs/namei.c:may_linkat. func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { // Source inode owner can hardlink all they like; otherwise, it must be a // safe source. if CanActAsOwner(creds, kuid) { return nil } // Only regular files can be hard linked. if mode.FileType() != linux.S_IFREG { return linuxerr.EPERM } // Setuid files should not get pinned to the filesystem. if mode&linux.S_ISUID != 0 { return linuxerr.EPERM } // Executable setgid files should not get pinned to the filesystem, but we // don't support S_IXGRP anyway. // Hardlinking to unreadable or unwritable sources is dangerous. if err := GenericCheckPermissions(creds, MayRead|MayWrite, mode, kuid, kgid); err != nil { return linuxerr.EPERM } return nil } // AccessTypesForOpenFlags returns the access types required to open a file // with the given OpenOptions.Flags. Note that this is NOT the same thing as // the set of accesses permitted for the opened file: // // - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it // mutates the file), but does not permit writing to the open file description // thereafter. // // - "Linux reserves the special, nonstandard access mode 3 (binary 11) in // flags to mean: check for read and write permission on the file and return a // file descriptor that can't be used for reading or writing." - open(2). Thus // AccessTypesForOpenFlags returns MayRead|MayWrite in this case. // // Use May{Read,Write}FileWithOpenFlags() for these checks instead. 
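//
// For illustration (hypothetical inputs, not taken from the original sources):
//
//	AccessTypesForOpenFlags(&OpenOptions{Flags: linux.O_RDONLY})                 // MayRead
//	AccessTypesForOpenFlags(&OpenOptions{Flags: linux.O_RDONLY | linux.O_TRUNC}) // MayRead|MayWrite
//	AccessTypesForOpenFlags(&OpenOptions{Flags: linux.O_WRONLY})                 // MayWrite
//	AccessTypesForOpenFlags(&OpenOptions{Flags: linux.O_RDWR, FileExec: true})   // MayRead|MayWrite|MayExec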
func AccessTypesForOpenFlags(opts *OpenOptions) AccessTypes { ats := AccessTypes(0) if opts.FileExec { ats |= MayExec } switch opts.Flags & linux.O_ACCMODE { case linux.O_RDONLY: if opts.Flags&linux.O_TRUNC != 0 { return ats | MayRead | MayWrite } return ats | MayRead case linux.O_WRONLY: return ats | MayWrite default: return ats | MayRead | MayWrite } } // MayReadFileWithOpenFlags returns true if a file with the given open flags // should be readable. func MayReadFileWithOpenFlags(flags uint32) bool { switch flags & linux.O_ACCMODE { case linux.O_RDONLY, linux.O_RDWR: return true default: return false } } // MayWriteFileWithOpenFlags returns true if a file with the given open flags // should be writable. func MayWriteFileWithOpenFlags(flags uint32) bool { switch flags & linux.O_ACCMODE { case linux.O_WRONLY, linux.O_RDWR: return true default: return false } } // CheckSetStat checks that creds has permission to change the metadata of a // file with the given permissions, UID, and GID as specified by stat, subject // to the rules of Linux's fs/attr.c:setattr_prepare(). func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOptions, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { stat := &opts.Stat if stat.Mask&linux.STATX_SIZE != 0 { limit, err := CheckLimit(ctx, 0, int64(stat.Size)) if err != nil { return err } if limit < int64(stat.Size) { return linuxerr.ErrExceedsFileSizeLimit } } if stat.Mask&linux.STATX_MODE != 0 { if !CanActAsOwner(creds, kuid) { return linuxerr.EPERM } // TODO(b/30815691): "If the calling process is not privileged (Linux: // does not have the CAP_FSETID capability), and the group of the file // does not match the effective group ID of the process or one of its // supplementary group IDs, the S_ISGID bit will be turned off, but // this will not cause an error to be returned." - chmod(2) } if stat.Mask&linux.STATX_UID != 0 { if !((creds.EffectiveKUID == kuid && auth.KUID(stat.UID) == kuid) || HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) { return linuxerr.EPERM } } if stat.Mask&linux.STATX_GID != 0 { if !((creds.EffectiveKUID == kuid && creds.InGroup(auth.KGID(stat.GID))) || HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) { return linuxerr.EPERM } } if opts.NeedWritePerm && !creds.HasCapability(linux.CAP_DAC_OVERRIDE) { if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil { return err } } if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 { if !CanActAsOwner(creds, kuid) { if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) || (stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW) || (stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) { return linuxerr.EPERM } if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil { return err } } } return nil } // CheckDeleteSticky checks whether the sticky bit is set on a directory with // the given file mode, and if so, checks whether creds has permission to // remove a file owned by childKUID from a directory with the given mode. // CheckDeleteSticky is consistent with fs/linux.h:check_sticky(). 
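//
// As a hypothetical example (not taken from the original sources): in a sticky,
// world-writable directory such as /tmp (mode 01777), an unprivileged caller
// may remove files it owns, or any file if it owns the directory itself;
// removing another user's file otherwise requires CAP_FOWNER in a user
// namespace that maps the file's owner and group, and fails with EPERM
// without it.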
func CheckDeleteSticky(creds *auth.Credentials, parentMode linux.FileMode, parentKUID auth.KUID, childKUID auth.KUID, childKGID auth.KGID) error { if parentMode&linux.ModeSticky == 0 { return nil } if creds.EffectiveKUID == childKUID || creds.EffectiveKUID == parentKUID || HasCapabilityOnFile(creds, linux.CAP_FOWNER, childKUID, childKGID) { return nil } return linuxerr.EPERM } // CanActAsOwner returns true if creds can act as the owner of a file with the // given owning UID, consistent with Linux's // fs/inode.c:inode_owner_or_capable(). func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool { if creds.EffectiveKUID == kuid { return true } return creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(kuid).Ok() } // HasCapabilityOnFile returns true if creds has the given capability with // respect to a file with the given owning UID and GID, consistent with Linux's // kernel/capability.c:capable_wrt_inode_uidgid(). func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool { return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok() } // CheckLimit enforces file size rlimits. It returns error if the write // operation must not proceed. Otherwise it returns the max length allowed to // without violating the limit. func CheckLimit(ctx context.Context, offset, size int64) (int64, error) { fileSizeLimit := limits.FromContextOrDie(ctx).Get(limits.FileSize).Cur if fileSizeLimit > math.MaxInt64 { return size, nil } if offset >= int64(fileSizeLimit) { return 0, linuxerr.ErrExceedsFileSizeLimit } remaining := int64(fileSizeLimit) - offset if remaining < size { return remaining, nil } return size, nil } // CheckXattrPermissions checks permissions for extended attribute access. // This is analogous to fs/xattr.c:xattr_permission(). Some key differences: // - Does not check for read-only filesystem property. // - Does not check inode immutability or append only mode. In both cases EPERM // must be returned by filesystem implementations. // - Does not do inode permission checks. Filesystem implementations should // handle inode permission checks as they may differ across implementations. func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, name string) error { switch { case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX): // The trusted.* namespace can only be accessed by privileged // users. if creds.HasCapability(linux.CAP_SYS_ADMIN) { return nil } if ats.MayWrite() { return linuxerr.EPERM } return linuxerr.ENODATA case strings.HasPrefix(name, linux.XATTR_USER_PREFIX): // In the user.* namespace, only regular files and directories can have // extended attributes. For sticky directories, only the owner and // privileged users can write attributes. filetype := mode.FileType() if filetype != linux.ModeRegular && filetype != linux.ModeDirectory { if ats.MayWrite() { return linuxerr.EPERM } return linuxerr.ENODATA } if filetype == linux.ModeDirectory && mode&linux.ModeSticky != 0 && ats.MayWrite() && !CanActAsOwner(creds, kuid) { return linuxerr.EPERM } case strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX): if ats.MayRead() { return nil } return linuxerr.EOPNOTSUPP } return nil } // ClearSUIDAndSGID clears the setuid and/or setgid bits after a chown or write. // Depending on the mode, neither bit, only the setuid bit, or both are cleared. 
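//
// For illustration (hypothetical modes, not taken from the original sources):
//
//	ClearSUIDAndSGID(uint32(linux.ModeSetUID) | 0755)                  // 0755
//	ClearSUIDAndSGID(uint32(linux.ModeSetUID|linux.ModeSetGID) | 0775) // 0775 (group-exec is set, so setgid is cleared too)
//	ClearSUIDAndSGID(uint32(linux.ModeSetGID) | 0664)                  // setgid is kept: the file is not group-executable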
func ClearSUIDAndSGID(mode uint32) uint32 { // Directories don't have their bits changed. if mode&linux.ModeDirectory == linux.ModeDirectory { return mode } // Changing owners always disables the setuid bit. It disables // the setgid bit when the file is executable. mode &= ^uint32(linux.ModeSetUID) if sgid := uint32(linux.ModeSetGID | linux.ModeGroupExec); mode&sgid == sgid { mode &= ^uint32(linux.ModeSetGID) } return mode } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/propagation.go000066400000000000000000000512071465435605700244250ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) const ( // The following constants are possible bits for the cloneType argument to // VirtualFilesystem.cloneMount() and related functions. // Analogous to CL_MAKE_SHARED in Linux. makeSharedClone = 1 << iota // Analogous to CL_SLAVE in Linux. makeFollowerClone // Analogous to CL_PRIVATE in Linux. makePrivateClone // Analogous to CL_SHARED_TO_SLAVE in Linux. sharedToFollowerClone propagationFlags = linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE | linux.MS_UNBINDABLE ) // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) commitChildren(ctx context.Context, mnt *Mount) { for c := range mnt.children { if c.neverConnected() { vfs.commitMount(ctx, c) } } } // commitMount attaches mnt to the parent and mountpoint specified by its // mountKey and recursively does the same for all of mnt's descendants. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) commitMount(ctx context.Context, mnt *Mount) { mp := mnt.getKey() // If there is already a mount at this (parent, point), disconnect it from its // parent and reconnect it to mnt once mnt has been connected. child := vfs.mounts.Lookup(mp.mount, mp.dentry) vfs.mounts.seq.BeginWrite() if child != nil { vfs.delayDecRef(vfs.disconnectLocked(child)) } mp.dentry.mu.Lock() vfs.connectLocked(mnt, mp, mp.mount.ns) mp.dentry.mu.Unlock() vfs.delayDecRef(mnt) if child != nil { newmp := VirtualDentry{mnt, mnt.root} newmp.IncRef() newmp.dentry.mu.Lock() vfs.connectLocked(child, newmp, newmp.mount.ns) newmp.dentry.mu.Unlock() vfs.delayDecRef(child) } vfs.mounts.seq.EndWrite() vfs.commitChildren(ctx, mnt) } // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) abortUncomittedChildren(ctx context.Context, mnt *Mount) { for c := range mnt.children { if c.neverConnected() { vfs.abortUncommitedMount(ctx, c) delete(mnt.children, c) } } } // abortUncommitedMount releases references on mnt and all its descendants. // // Prerequisite: mnt is not connected, i.e. mnt.ns == nil. 
// +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) abortUncommitedMount(ctx context.Context, mnt *Mount) { vfs.delayDecRef(mnt) vfs.delayDecRef(mnt.getKey()) mnt.setKey(VirtualDentry{}) vfs.setPropagation(mnt, linux.MS_PRIVATE) vfs.abortUncomittedChildren(ctx, mnt) } // SetMountPropagationAt changes the propagation type of the mount pointed to by // pop. func (vfs *VirtualFilesystem) SetMountPropagationAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, propFlag uint32) error { recursive := propFlag&linux.MS_REC != 0 propFlag &= propagationFlags // Check if flags is a power of 2. If not then more than one flag is set. if !bits.IsPowerOfTwo32(propFlag) { return linuxerr.EINVAL } vd, err := vfs.getMountpoint(ctx, creds, pop) if err != nil { return err } defer vd.DecRef(ctx) vfs.SetMountPropagation(vd.mount, propFlag, recursive) return nil } // SetMountPropagation changes the propagation type of the mount. func (vfs *VirtualFilesystem) SetMountPropagation(mnt *Mount, propFlag uint32, recursive bool) error { vfs.lockMounts() defer vfs.unlockMounts(context.Background()) if propFlag == linux.MS_SHARED { if err := vfs.allocMountGroupIDs(mnt, recursive); err != nil { return err } } if !recursive { vfs.setPropagation(mnt, propFlag) return nil } for _, m := range mnt.submountsLocked() { vfs.setPropagation(m, propFlag) } return nil } // setPropagation sets the propagation on mnt for a propagation type. This // method is analogous to fs/pnode.c:change_mnt_propagation() in Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) setPropagation(mnt *Mount, propFlags uint32) { if propFlags == linux.MS_SHARED { mnt.isShared = true return } // pflag is MS_PRIVATE, MS_SLAVE, or MS_UNBINDABLE. The algorithm is the same // for MS_PRIVATE/MS_SLAVE/MS_UNBINDABLE, except that in the // private/unbindable case we clear the leader and followerEntry after the // procedure is finished. var leader *Mount if mnt.sharedEntry.Empty() { // If mnt is shared and in a peer group with only itself, just make it // private. if mnt.isShared { vfs.freeGroupID(mnt) mnt.isShared = false } // If mnt is not a follower to any other mount, make all of its followers // also private. leader = mnt.leader if leader == nil { for !mnt.followerList.Empty() { f := mnt.followerList.Front() mnt.followerList.Remove(f) f.leader = nil } } } else { // Pick a suitable new leader. Linux chooses the first peer that shares a // root dentry, or any peer if none matches that criteria. leader = mnt.sharedEntry.Next() for m := mnt.sharedEntry.Next(); m != mnt; m = m.sharedEntry.Next() { if m.root == mnt.root { leader = m break } } // Clear out mnt's shared attributes. mnt.sharedEntry.Remove() mnt.groupID = 0 mnt.isShared = false } // Transfer all of mnt's followers to the new leader. for f := mnt.followerList.Front(); f != nil; f = f.followerEntry.Next() { f.leader = leader } // Remove mnt from its current follower list and add it to the new leader. if mnt.leader != nil { mnt.leader.followerList.Remove(mnt) } if leader != nil && propFlags == linux.MS_SLAVE { leader.followerList.PushFront(mnt) mnt.leader = leader } else { mnt.leader = nil } // Add mnts followers to leader's follower list. This also links all their // followerEntry together. 
if !mnt.followerList.Empty() && leader != nil { leader.followerList.PushBackList(&mnt.followerList) } } type propState struct { origSrc *Mount prevSrc *Mount prevDst *Mount dstLeader *Mount propList map[*Mount]struct{} visitedLeaders map[*Mount]struct{} } // doPropagation returns a list of propagated mounts with their mount points // set. The mounts are clones of src and have an extra reference taken. If // propagation fails at any point, the method returns all the mounts propagated // up until that point so they can be properly released. This method is // analogous to fs/pnode.c:propagate_mnt() in Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) doPropagation(ctx context.Context, src *Mount, dst VirtualDentry) (map[*Mount]struct{}, error) { if !dst.mount.isShared { return nil, nil } s := propState{ origSrc: src, prevSrc: src, prevDst: dst.mount, dstLeader: dst.mount.leader, propList: map[*Mount]struct{}{}, visitedLeaders: map[*Mount]struct{}{}, } for peer := dst.mount.sharedEntry.Next(); peer != dst.mount; peer = peer.sharedEntry.Next() { if err := vfs.propagateMount(ctx, peer, dst.dentry, &s); err != nil { return s.propList, err } } for follower := nextFollowerPeerGroup(dst.mount, dst.mount); follower != nil; follower = nextFollowerPeerGroup(follower, dst.mount) { peer := follower for { if err := vfs.propagateMount(ctx, peer, dst.dentry, &s); err != nil { return s.propList, err } peer = peer.sharedEntry.Next() if peer == follower { break } } } return s.propList, nil } // peers returns if two mounts are in the same peer group. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) peers(m1, m2 *Mount) bool { return m1.groupID == m2.groupID && m1.groupID != 0 } // propagateMount propagates state.srcMount to dstMount at dstPoint. // This method is analogous to fs/pnode.c:propagate_one() in Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) propagateMount(ctx context.Context, dstMnt *Mount, dstPoint *Dentry, state *propState) error { // Skip newly added mounts. if dstMnt.neverConnected() || dstMnt.umounted { return nil } mp := VirtualDentry{mount: dstMnt, dentry: dstPoint} if !mp.mount.fs.Impl().IsDescendant(VirtualDentry{dstMnt, dstMnt.root}, mp) { return nil } cloneType := 0 if vfs.peers(dstMnt, state.prevDst) { cloneType = makeSharedClone } else { done := false // Get the most recent leader that we've propagated from in the tree. var leader, underLeader *Mount for underLeader = dstMnt; ; underLeader = leader { leader = underLeader.leader if _, ok := state.visitedLeaders[leader]; ok { break } if leader == state.dstLeader { break } } for { parent := state.prevSrc.parent() // Check that prevSrc is a follower, not a peer of the original. if vfs.peers(state.prevSrc, state.origSrc) { break } // Check if the mount prvSrc attached to (aka parent) has the same leader // as the most recently visited leader in the mount tree. done = parent.leader == leader // If the leader under the most recently visited leader is not peers with // the mount prevSrc attached to, then it's not part of this propagation // tree and we need to traverse up the tree to get to the real src. if done && vfs.peers(underLeader, parent) { break } // Traverse back up the propagation tree to get the proper src. We only // want to propagate from this mount's leader or peers of that leader. 
state.prevSrc = state.prevSrc.leader if done { break } } cloneType = makeFollowerClone if dstMnt.isShared { cloneType |= makeSharedClone } } clone, err := vfs.cloneMountTree(ctx, state.prevSrc, state.prevSrc.root, cloneType, nil) if err != nil { return err } mp.IncRef() clone.setKey(mp) state.propList[clone] = struct{}{} if dstMnt.leader != state.dstLeader { state.visitedLeaders[dstMnt.leader] = struct{}{} } state.prevDst = dstMnt state.prevSrc = clone return dstMnt.ns.checkMountCount(ctx, clone) } // nextFollowerPeerGroup iterates through the propagation tree and returns the // first mount in each follower peer group under mnt. Once all the groups // have been iterated through the method returns nil. This method is analogous // to fs/pnode.c:next_group() in Linux. func nextFollowerPeerGroup(mnt *Mount, start *Mount) *Mount { for { // If mnt has any followers, this loop returns that follower. Otherwise mnt // is updated until it is the last peer in its peer group. This has the // effect of moving down the propagation tree until the bottommost follower. // After that the loop moves across peers (if possible) to the last peer // in the group. for { if !mnt.neverConnected() && !mnt.followerList.Empty() { return mnt.followerList.Front() } next := mnt.sharedEntry.Next() if mnt.groupID == start.groupID { if next == start { return nil } // If mnt is shared+slave, its next follower will be the same as its // next peer. } else if mnt.isFollower() && mnt.followerEntry.Next() != next { break } mnt = next } // At this point mnt is the last peer in its shared+slave peer group. // This loop returns the next follower in mnt's leader's follower list. Once // the list of followers is exhausted it sets mnt to be the leader and // breaks out of the loop. This has the effect of moving across the tree // branches until all branches are exhausted. Then it moves up the tree to // the parent. for { leader := mnt.leader if mnt.followerEntry.Next() != nil { return mnt.followerEntry.Next() } mnt = leader.sharedEntry.Next() if leader.groupID == start.groupID { break } if leader.followerEntry.Next() == mnt { break } mnt = leader } if mnt == start { return nil } } } // nextPropMount iterates through the propagation tree rooted at start. It // returns nil when there are no more mounts in the tree. Otherwise, it returns // the next mount in the tree. It is analogous to fs/pnode.c:propagation_next() // in Linux. func nextPropMount(mnt, start *Mount) *Mount { m := mnt if !m.neverConnected() && !m.followerList.Empty() { return m.followerList.Front() } for { leader := m.leader if leader == start.leader { next := m.sharedEntry.Next() if next == start { return nil } return next } else if m.followerEntry.Next() != nil { return m.followerEntry.Next() } m = leader } } // arePropMountsBusy checks if all the mounts that mnt's parents propagate to // have the correct number of references before a call to umount. It is // analogous to fs/pnode.c:propagate_mount_busy() in Linux. 
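//
// As a hypothetical scenario (not taken from the original sources): if mnt's
// parent is a shared mount with peers or followers in other mount namespaces,
// a umount of mnt also affects the mounts propagated to the same mount point
// under those peers and followers. arePropMountsBusy reports whether any of
// the mounts involved still holds more references than expected (for example
// because of open file descriptions), in which case the umount cannot proceed.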
// // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) arePropMountsBusy(mnt *Mount) bool { parent := mnt.parent() if parent == nil { return !vfs.mountHasExpectedRefs(mnt) } if len(mnt.children) != 0 || !vfs.mountHasExpectedRefs(mnt) { return true } for m := nextPropMount(parent, parent); m != nil; m = nextPropMount(m, parent) { child := vfs.mounts.Lookup(m, mnt.point()) if child == nil { continue } if len(child.children) != 0 && child.coveringMount() == nil { continue } if !vfs.mountHasExpectedRefs(child) { return true } } return false } // allocateGroupID populates mnt.groupID with a new group id if one is // available, and returns an error otherwise. If the group ID bitmap is full, // double the size of the bitmap before allocating the new group id. It is // analogous to fs/namespace.c:mnt_alloc_group_id() in Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) allocateGroupID(mnt *Mount) error { groupID, err := vfs.groupIDBitmap.FirstZero(1) if err != nil { if err := vfs.groupIDBitmap.Grow(uint32(vfs.groupIDBitmap.Size())); err != nil { return linuxerr.ENOSPC } groupID, err = vfs.groupIDBitmap.FirstZero(1) if err != nil { return err } } vfs.groupIDBitmap.Add(groupID) mnt.groupID = groupID return nil } // freeGroupID marks a groupID as available for reuse. It is analogous to // fs/namespace.c:mnt_release_group_id() in Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) freeGroupID(mnt *Mount) { vfs.groupIDBitmap.Remove(mnt.groupID) mnt.groupID = 0 } // cleanupGroupIDs zeroes out all of the mounts' groupIDs and returns them // to the pool of available ids. It is analogous to // fs/namespace.c:cleanup_group_ids() in Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) cleanupGroupIDs(mnts []*Mount) { for _, m := range mnts { if m.groupID != 0 && !m.isShared { vfs.freeGroupID(m) } } } // allocMountGroupIDs allocates a new group id for mnt. If recursive is true, it // also allocates a new group id for all mounts children. It is analogous to // fs/namespace.c:invent_group_ids() in Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) allocMountGroupIDs(mnt *Mount, recursive bool) error { var mnts []*Mount if recursive { mnts = mnt.submountsLocked() } else { mnts = []*Mount{mnt} } for _, m := range mnts { if m.groupID == 0 && !m.isShared { if err := vfs.allocateGroupID(m); err != nil { vfs.cleanupGroupIDs(mnts) return err } } } return nil } // propagateUmount returns a list of mounts that the umount of mnts propagates // to. // // Prerequisites: all the mounts in mnts have had vfs.umount() called on them. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) propagateUmount(mnts []*Mount) []*Mount { const ( umountVisited = iota umountRestore ) var toUmount []*Mount noChildren := make(map[*Mount]struct{}) // Processed contains all the mounts that the algorithm has processed so far. // If the mount maps to umountRestore, it should be restored after processing // all the mounts. This happens in cases where a mount was speculatively // unmounted that had children or is a cover mount. processed := make(map[*Mount]int) // Iterate through the mounts from the leafs back to the root. for i := len(mnts) - 1; i >= 0; i-- { mnt := mnts[i] // If a mount has already been visited we know all its peers and followers // have been visited so there's no need to visit them again. 
if _, ok := processed[mnt]; ok { continue } processed[mnt] = umountVisited parent := mnt.parent() if parent == nil { continue } for m := nextPropMount(parent, parent); m != nil; m = nextPropMount(m, parent) { child := vfs.mounts.Lookup(m, mnt.point()) if child == nil { continue } if _, ok := processed[child]; ok { // If the child has been visited we know its peer group and followers // have all been visited so there's no need to visit them again. We can // skip this propagation subtree by setting the iterator to be the last // mount in the follower group. if !child.followerList.Empty() { m = child.followerList.Back() } continue } else if child.umounted { // If this child has already been marked for unmounting, just mark it // as visited and move on. This means it was either part of the original // mount list passed to this method or was umounted from another mount's // propagation. In either case we can consider all its peers and // followers as visited. processed[child] = umountVisited continue } // This loop starts at the child we are propagating the umount to and // iterates through the child's parents. It continues as until it // encounters a parent that's been visited. loop: for { if _, ok := noChildren[child]; ok || child.umounted { break } // If there are any children that have mountpoint != parent's root then // the current mount cannot be unmounted. for gchild := range child.children { if gchild.point() == child.root { continue } _, isProcessed := processed[gchild] _, hasNoChildren := noChildren[gchild] if isProcessed && hasNoChildren { continue } processed[child] = umountRestore break loop } if child.locked { processed[child] = umountRestore noChildren[child] = struct{}{} } else { vfs.umount(child) toUmount = append(toUmount, child) } // If this parent was a mount that had to be restored because it had // children, it might be safe to umount now that its child is gone. If // it has been visited then it's already being umounted. child = child.parent() if _, ok := processed[child]; !ok { break } } } } // Add all the children of mounts marked for umount to the umount list. This // excludes "cover" mounts (mounts whose mount point is equal to their // parent's root) which will be reparented in the next step. for i := 0; i < len(toUmount); i++ { umount := toUmount[i] for child := range umount.children { if child.point() == umount.root { processed[child] = umountRestore } else { vfs.umount(child) toUmount = append(toUmount, child) } } } vfs.mounts.seq.BeginWrite() for m, status := range processed { if status == umountVisited { continue } mp := m.getKey() for mp.mount.umounted { mp = mp.mount.getKey() } if mp != m.getKey() { vfs.changeMountpoint(m, mp) } } vfs.mounts.seq.EndWrite() return toUmount } // unlockPropagationMounts sets locked to false for every mount that a umount // of mnt propagates to. It is analogous to fs/pnode.c:propagate_mount_unlock() // in Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) unlockPropagationMounts(mnt *Mount) { parent := mnt.parent() if parent == nil { return } for m := nextPropMount(parent, parent); m != nil; m = nextPropMount(m, parent) { child := vfs.mounts.Lookup(m, mnt.point()) if child == nil { continue } child.locked = false } } // peerUnderRoot iterates through mnt's peers until it finds a mount that is in // ns and is reachable from root. This method is analogous to // fs/pnode.c:get_peer_under_root() in Linux. 
// // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) peerUnderRoot(ctx context.Context, mnt *Mount, ns *MountNamespace, root VirtualDentry) *Mount { m := mnt for { if m.ns == ns { if vfs.isPathReachable(ctx, root, VirtualDentry{mnt, mnt.root}) { return m } } m = m.sharedEntry.Next() if m == mnt { break } } return nil } // isPathReachable returns true if vd is reachable from vfsroot. It is analogous // to fs/namespace.c:is_path_reachable() in Linux. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) isPathReachable(ctx context.Context, vfsroot VirtualDentry, vd VirtualDentry) bool { for vd.mount != vfsroot.mount && vd.mount.parent() != nil { vd = vd.mount.getKey() } return vd.mount == vfsroot.mount && vd.mount.fs.Impl().IsDescendant(vfsroot, vd) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/resolving_path.go000066400000000000000000000366771465435605700251440ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" ) // ResolvingPath represents the state of an in-progress path resolution, shared // between VFS and FilesystemImpl methods that take a path. // // From the perspective of FilesystemImpl methods, a ResolvingPath represents a // starting Dentry on the associated Filesystem (on which a reference is // already held), a stream of path components relative to that Dentry, and // elements of the invoking Context that are commonly required by // FilesystemImpl methods. // // ResolvingPath is loosely analogous to Linux's struct nameidata. // // +stateify savable type ResolvingPath struct { vfs *VirtualFilesystem root VirtualDentry // refs borrowed from PathOperation mount *Mount start *Dentry pit fspath.Iterator flags uint16 mustBeDir bool // final file must be a directory? symlinks uint8 // number of symlinks traversed curPart uint8 // index into parts creds *auth.Credentials // Data associated with resolve*Errors, stored in ResolvingPath so that // those errors don't need to allocate. nextMount *Mount // ref held if not nil nextStart *Dentry // ref held if not nil absSymlinkTarget fspath.Path // ResolvingPath tracks relative paths, which is updated whenever a relative // symlink is encountered. parts [1 + linux.MaxSymlinkTraversals]fspath.Iterator } const ( rpflagsHaveMountRef = 1 << iota // do we hold a reference on mount? rpflagsHaveStartRef // do we hold a reference on start? rpflagsFollowFinalSymlink // same as PathOperation.FollowFinalSymlink ) func init() { if maxParts := len(ResolvingPath{}.parts); maxParts > 255 { panic(fmt.Sprintf("uint8 is insufficient to accommodate len(ResolvingPath.parts) (%d)", maxParts)) } } // Error types that communicate state from the FilesystemImpl-caller, // VFS-callee side of path resolution (i.e. 
errors returned by // ResolvingPath.Resolve*()) to the VFS-caller, FilesystemImpl-callee side // (i.e. VFS methods => ResolvingPath.handleError()). These are empty structs // rather than error values because Go doesn't support non-primitive constants, // so error "constants" are really mutable vars, necessitating somewhat // expensive interface object comparisons. // +stateify savable type resolveMountRootOrJumpError struct{} // Error implements error.Error. func (resolveMountRootOrJumpError) Error() string { return "resolving mount root or jump" } // +stateify savable type resolveMountPointError struct{} // Error implements error.Error. func (resolveMountPointError) Error() string { return "resolving mount point" } // +stateify savable type resolveAbsSymlinkError struct{} // Error implements error.Error. func (resolveAbsSymlinkError) Error() string { return "resolving absolute symlink" } var resolvingPathPool = sync.Pool{ New: func() any { return &ResolvingPath{} }, } // getResolvingPath gets a new ResolvingPath from the pool. Caller must call // ResolvingPath.Release() when done. func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) *ResolvingPath { rp := resolvingPathPool.Get().(*ResolvingPath) rp.vfs = vfs rp.root = pop.Root rp.mount = pop.Start.mount rp.start = pop.Start.dentry rp.pit = pop.Path.Begin rp.flags = 0 if pop.FollowFinalSymlink { rp.flags |= rpflagsFollowFinalSymlink } rp.mustBeDir = pop.Path.Dir rp.symlinks = 0 rp.curPart = 0 rp.creds = creds rp.parts[0] = pop.Path.Begin return rp } // Copy creates another ResolvingPath with the same state as the original. // Copies are independent, using the copy does not change the original and // vice-versa. // // Caller must call Resease() when done. func (rp *ResolvingPath) Copy() *ResolvingPath { copy := resolvingPathPool.Get().(*ResolvingPath) *copy = *rp // All fields all shallow copiable. // Take extra reference for the copy if the original had them. if copy.flags&rpflagsHaveStartRef != 0 { copy.start.IncRef() } if copy.flags&rpflagsHaveMountRef != 0 { copy.mount.IncRef() } // Reset error state. copy.nextStart = nil copy.nextMount = nil return copy } // Release decrements references if needed and returns the object to the pool. func (rp *ResolvingPath) Release(ctx context.Context) { rp.root = VirtualDentry{} rp.decRefStartAndMount(ctx) rp.mount = nil rp.start = nil rp.releaseErrorState(ctx) resolvingPathPool.Put(rp) } func (rp *ResolvingPath) decRefStartAndMount(ctx context.Context) { if rp.flags&rpflagsHaveStartRef != 0 { rp.start.DecRef(ctx) } if rp.flags&rpflagsHaveMountRef != 0 { rp.mount.DecRef(ctx) } } func (rp *ResolvingPath) releaseErrorState(ctx context.Context) { if rp.nextStart != nil { rp.nextStart.DecRef(ctx) rp.nextStart = nil } if rp.nextMount != nil { rp.nextMount.DecRef(ctx) rp.nextMount = nil } } // VirtualFilesystem returns the containing VirtualFilesystem. func (rp *ResolvingPath) VirtualFilesystem() *VirtualFilesystem { return rp.vfs } // Credentials returns the credentials of rp's provider. func (rp *ResolvingPath) Credentials() *auth.Credentials { return rp.creds } // Mount returns the Mount on which path resolution is currently occurring. It // does not take a reference on the returned Mount. func (rp *ResolvingPath) Mount() *Mount { return rp.mount } // Start returns the starting Dentry represented by rp. It does not take a // reference on the returned Dentry. 
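//
// A minimal sketch (not taken from the original sources) of how a
// FilesystemImpl method typically consumes the component stream while
// walking from rp.Start():
//
//	d := rp.Start()
//	for !rp.Done() {
//		name := rp.Component()
//		// ... resolve name under d, handling "..", symlinks via
//		// rp.HandleSymlink(), and mount points via rp.CheckMount() ...
//		rp.Advance()
//	}
//	// d is now the resolved dentry; rp.MustBeDir() tells whether it
//	// must be a directory.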
func (rp *ResolvingPath) Start() *Dentry { return rp.start } // Done returns true if there are no remaining path components in the stream // represented by rp. func (rp *ResolvingPath) Done() bool { // We don't need to check for rp.curPart == 0 because rp.Advance() won't // set rp.pit to a terminal iterator otherwise. return !rp.pit.Ok() } // Final returns true if there is exactly one remaining path component in the // stream represented by rp. // // Preconditions: !rp.Done(). func (rp *ResolvingPath) Final() bool { return rp.curPart == 0 && !rp.pit.NextOk() } // Component returns the current path component in the stream represented by // rp. // // Preconditions: !rp.Done(). func (rp *ResolvingPath) Component() string { if checkInvariants { if !rp.pit.Ok() { panic("ResolvingPath.Component() called at end of relative path") } } return rp.pit.String() } // Advance advances the stream of path components represented by rp. // // Preconditions: !rp.Done(). func (rp *ResolvingPath) Advance() { if checkInvariants { if !rp.pit.Ok() { panic("ResolvingPath.Advance() called at end of relative path") } } next := rp.pit.Next() if next.Ok() || rp.curPart == 0 { // have next component, or at end of path rp.pit = next } else { // at end of path segment, continue with next one rp.curPart-- rp.pit = rp.parts[rp.curPart] } } // GetComponents emits all the remaining path components in rp. It does *not* // update rp state. It halts if emit() returns false. If excludeLast is true, // then the last path component is not emitted. func (rp *ResolvingPath) GetComponents(excludeLast bool, emit func(string) bool) { // Copy rp state. cur := rp.pit curPart := rp.curPart for cur.Ok() { if excludeLast && curPart == 0 && !cur.NextOk() { break } if !emit(cur.String()) { break } cur = cur.Next() if !cur.Ok() && curPart > 0 { curPart-- cur = rp.parts[curPart] } } } // CheckRoot is called before resolving the parent of the Dentry d. If the // Dentry is contextually a VFS root, such that path resolution should treat // d's parent as itself, CheckRoot returns (true, nil). If the Dentry is the // root of a non-root mount, such that path resolution should switch to another // Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path // resolution should resolve d's parent normally, and CheckRoot returns (false, // nil). func (rp *ResolvingPath) CheckRoot(ctx context.Context, d *Dentry) (bool, error) { if d == rp.root.dentry && rp.mount == rp.root.mount { // At contextual VFS root (due to e.g. chroot(2)). return true, nil } else if d == rp.mount.root { // At mount root ... vd := rp.vfs.getMountpointAt(ctx, rp.mount, rp.root) if vd.Ok() { // ... of non-root mount. rp.nextMount = vd.mount rp.nextStart = vd.dentry return false, resolveMountRootOrJumpError{} } // ... of root mount. return true, nil } return false, nil } // CheckMount is called after resolving the parent or child of another Dentry // to d. If d is a mount point, such that path resolution should switch to // another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount // returns nil. func (rp *ResolvingPath) CheckMount(ctx context.Context, d *Dentry) error { if !d.isMounted() { return nil } if mnt := rp.vfs.getMountAt(ctx, rp.mount, d); mnt != nil { rp.nextMount = mnt return resolveMountPointError{} } return nil } // ShouldFollowSymlink returns true if, supposing that the current path // component in pcs represents a symbolic link, the symbolic link should be // followed. 
// // If path is terminated with '/', the '/' is considered the last element and // any symlink before that is followed: // // - For most non-creating walks, the last path component is handled by // fs/namei.c:lookup_last(), which sets LOOKUP_FOLLOW if the first byte // after the path component is non-NULL (which is only possible if it's '/') // and the path component is of type LAST_NORM. // // - For open/openat/openat2 without O_CREAT, the last path component is // handled by fs/namei.c:do_last(), which does the same, though without the // LAST_NORM check. // // Preconditions: !rp.Done(). func (rp *ResolvingPath) ShouldFollowSymlink() bool { // Non-final symlinks are always followed. Paths terminated with '/' are also // always followed. return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() || rp.MustBeDir() } // HandleSymlink is called when the current path component is a symbolic link // to the given target. If the calling Filesystem method should continue path // traversal, HandleSymlink updates the path component stream to reflect the // symlink target and returns nil. Otherwise it returns a non-nil error. It // also returns whether the symlink was successfully followed, which can be // true even when a non-nil error like resolveAbsSymlinkError is returned. // // Preconditions: !rp.Done(). // // Postconditions: If HandleSymlink returns a nil error, then !rp.Done(). func (rp *ResolvingPath) HandleSymlink(target string) (bool, error) { if rp.symlinks >= linux.MaxSymlinkTraversals { return false, linuxerr.ELOOP } if len(target) == 0 { return false, linuxerr.ENOENT } rp.symlinks++ targetPath := fspath.Parse(target) if targetPath.Absolute { rp.absSymlinkTarget = targetPath return true, resolveAbsSymlinkError{} } // Consume the path component that represented the symlink. rp.Advance() // Prepend the symlink target to the relative path. if checkInvariants { if !targetPath.HasComponents() { panic(fmt.Sprintf("non-empty pathname %q parsed to relative path with no components", target)) } } rp.relpathPrepend(targetPath) return true, nil } // Preconditions: path.HasComponents(). func (rp *ResolvingPath) relpathPrepend(path fspath.Path) { if rp.pit.Ok() { rp.parts[rp.curPart] = rp.pit rp.pit = path.Begin rp.curPart++ } else { // The symlink was the final path component, so now the symlink target // is the whole path. rp.pit = path.Begin // Symlink targets can set rp.mustBeDir (if they end in a trailing /), // but can't unset it. if path.Dir { rp.mustBeDir = true } } } // HandleJump is called when the current path component is a "magic" link to // the given VirtualDentry, like /proc/[pid]/fd/[fd]. If the calling Filesystem // method should continue path traversal, HandleJump updates the path // component stream to reflect the magic link target and returns nil. Otherwise // it returns a non-nil error. It also returns whether the magic link was // followed, which can be true even when a non-nil error like // resolveMountRootOrJumpError is returned. // // Preconditions: !rp.Done(). func (rp *ResolvingPath) HandleJump(target VirtualDentry) (bool, error) { if rp.symlinks >= linux.MaxSymlinkTraversals { return false, linuxerr.ELOOP } rp.symlinks++ // Consume the path component that represented the magic link. rp.Advance() // Unconditionally return a resolveMountRootOrJumpError, even if the Mount // isn't changing, to force restarting at the new Dentry. 
target.IncRef() rp.nextMount = target.mount rp.nextStart = target.dentry return true, resolveMountRootOrJumpError{} } func (rp *ResolvingPath) handleError(ctx context.Context, err error) bool { switch err.(type) { case resolveMountRootOrJumpError: // Switch to the new Mount. We hold references on the Mount and Dentry. rp.decRefStartAndMount(ctx) rp.mount = rp.nextMount rp.start = rp.nextStart rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef rp.nextMount = nil rp.nextStart = nil // Don't consume the path component that caused us to traverse // through the mount root - i.e. the ".." - because we still need to // resolve the mount point's parent in the new FilesystemImpl. // // Restart path resolution on the new Mount. Don't bother calling // rp.releaseErrorState() since we already set nextMount and nextStart // to nil above. return true case resolveMountPointError: // Switch to the new Mount. We hold a reference on the Mount, but // borrow the reference on the mount root from the Mount. rp.decRefStartAndMount(ctx) rp.mount = rp.nextMount rp.start = rp.nextMount.root rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef rp.nextMount = nil // Consume the path component that represented the mount point. rp.Advance() // Restart path resolution on the new Mount. rp.releaseErrorState(ctx) return true case resolveAbsSymlinkError: // Switch to the new Mount. References are borrowed from rp.root. rp.decRefStartAndMount(ctx) rp.mount = rp.root.mount rp.start = rp.root.dentry rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef // Consume the path component that represented the symlink. rp.Advance() if rp.absSymlinkTarget.HasComponents() { // Prepend the symlink target to the relative path. rp.relpathPrepend(rp.absSymlinkTarget) } // Restart path resolution on the new Mount. rp.releaseErrorState(ctx) return true default: // Not an error we can handle. return false } } // canHandleError returns true if err is an error returned by rp.Resolve*() // that rp.handleError() may attempt to handle. func (rp *ResolvingPath) canHandleError(err error) bool { switch err.(type) { case resolveMountRootOrJumpError, resolveMountPointError, resolveAbsSymlinkError: return true default: return false } } // MustBeDir returns true if the file traversed by rp must be a directory. func (rp *ResolvingPath) MustBeDir() bool { return rp.mustBeDir } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/save_restore.go000066400000000000000000000121021465435605700245720ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( goContext "context" "fmt" "sync/atomic" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/waiter" ) // ErrCorruption indicates a failed restore due to external file system state in // corruption. type ErrCorruption struct { // Err is the wrapped error. Err error } // Error returns a sensible description of the restore error. 
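//
// A hypothetical usage sketch (not taken from the original sources): a
// filesystem's CompleteRestore implementation might wrap a validation failure
// as
//
//	return vfs.ErrCorruption{Err: fmt.Errorf("file size changed across restore")}
//
// so that callers can distinguish corrupted external state from other errors.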
func (e ErrCorruption) Error() string { return "restore failed due to external file system state in corruption: " + e.Err.Error() } // FilesystemImplSaveRestoreExtension is an optional extension to // FilesystemImpl. type FilesystemImplSaveRestoreExtension interface { // PrepareSave prepares this filesystem for serialization. PrepareSave(ctx context.Context) error // CompleteRestore completes restoration from checkpoint for this // filesystem after deserialization. CompleteRestore(ctx context.Context, opts CompleteRestoreOptions) error } // PrepareSave prepares all filesystems for serialization. func (vfs *VirtualFilesystem) PrepareSave(ctx context.Context) error { for fs := range vfs.getFilesystems() { if ext, ok := fs.impl.(FilesystemImplSaveRestoreExtension); ok { if err := ext.PrepareSave(ctx); err != nil { fs.DecRef(ctx) return err } } fs.DecRef(ctx) } return nil } // CompleteRestore completes restoration from checkpoint for all filesystems // after deserialization. func (vfs *VirtualFilesystem) CompleteRestore(ctx context.Context, opts *CompleteRestoreOptions) error { for fs := range vfs.getFilesystems() { if ext, ok := fs.impl.(FilesystemImplSaveRestoreExtension); ok { if err := ext.CompleteRestore(ctx, *opts); err != nil { fs.DecRef(ctx) return err } } fs.DecRef(ctx) } return nil } // CompleteRestoreOptions contains options to // VirtualFilesystem.CompleteRestore() and // FilesystemImplSaveRestoreExtension.CompleteRestore(). type CompleteRestoreOptions struct { // If ValidateFileSizes is true, filesystem implementations backed by // remote filesystems should verify that file sizes have not changed // between checkpoint and restore. ValidateFileSizes bool // If ValidateFileModificationTimestamps is true, filesystem // implementations backed by remote filesystems should validate that file // mtimes have not changed between checkpoint and restore. ValidateFileModificationTimestamps bool } // saveMounts is called by stateify. func (vfs *VirtualFilesystem) saveMounts() []*Mount { if atomic.LoadPointer(&vfs.mounts.slots) == nil { // vfs.Init() was never called. return nil } var mounts []*Mount vfs.mounts.Range(func(mount *Mount) bool { mounts = append(mounts, mount) return true }) return mounts } // saveKey is called by stateify. func (mnt *Mount) saveKey() VirtualDentry { return mnt.getKey() } // saveMountPromises is called by stateify. func (vfs *VirtualFilesystem) saveMountPromises() map[VirtualDentry]*mountPromise { m := make(map[VirtualDentry]*mountPromise) vfs.mountPromises.Range(func(key any, val any) bool { m[key.(VirtualDentry)] = val.(*mountPromise) return true }) return m } // loadMounts is called by stateify. func (vfs *VirtualFilesystem) loadMounts(_ goContext.Context, mounts []*Mount) { if mounts == nil { return } vfs.mounts.Init() for _, mount := range mounts { vfs.mounts.Insert(mount) } } // loadKey is called by stateify. func (mnt *Mount) loadKey(_ goContext.Context, vd VirtualDentry) { mnt.setKey(vd) } // loadMountPromises is called by stateify. func (vfs *VirtualFilesystem) loadMountPromises(_ goContext.Context, mps map[VirtualDentry]*mountPromise) { for vd, mp := range mps { vfs.mountPromises.Store(vd, mp) } } // afterLoad is called by stateify. func (mnt *Mount) afterLoad(goContext.Context) { if mnt.refs.Load() != 0 { refs.Register(mnt) } } // afterLoad is called by stateify. func (epi *epollInterest) afterLoad(goContext.Context) { // Mark all epollInterests as ready after restore so that the next call to // EpollInstance.ReadEvents() rechecks their readiness. 
epi.waiter.NotifyEvent(waiter.EventMaskFromLinux(epi.mask)) } // RestoreID is a unique ID that is used to identify resources between save/restore sessions. // Example of resources are host files, gofer connection for mount points, etc. // // +stateify savable type RestoreID struct { // ContainerName is the name of the container that the resource belongs to. ContainerName string // Path is the path of the resource. Path string } func (f RestoreID) String() string { return fmt.Sprintf("%s:%s", f.ContainerName, f.Path) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/vfs.go000066400000000000000000001063471465435605700227060ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package vfs implements a virtual filesystem layer. // // Lock order: // // EpollInstance.interestMu // FileDescription.epollMu // Locks acquired by FilesystemImpl/FileDescriptionImpl methods (except IsDescendant) // VirtualFilesystem.mountMu // Dentry.mu // Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry // VirtualFilesystem.filesystemsMu // fdnotifier.notifier.mu // EpollInstance.readyMu // Inotify.mu // Watches.mu // Inotify.evMu // VirtualFilesystem.fsTypesMu // // Locking Dentry.mu in multiple Dentries requires holding // VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple // EpollInstances requires holding epollCycleMu. // // FilesystemImpl locks are not held during calls to FilesystemImpl.IsDescendant // since it's called under mountMu. It's possible for concurrent mutation // to dentry ancestors during calls IsDescendant. Callers should take // appropriate caution when using this method. package vfs import ( "fmt" "path" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/bitmap" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" epb "gvisor.dev/gvisor/pkg/sentry/vfs/events_go_proto" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // How long to wait for a mount promise before proceeding with the VFS // operation. This should be configurable by the user eventually. const mountPromiseTimeout = 30 * time.Second type mountPromise struct { wq *waiter.Queue resolved atomicbitops.Bool } // A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. // // There is no analogue to the VirtualFilesystem type in Linux, as the // equivalent state in Linux is global. // // +stateify savable type VirtualFilesystem struct { // mountMu serializes mount mutations. // // mountMu is analogous to Linux's namespace_sem. mountMu virtualFilesystemMutex `state:"nosave"` // mounts maps (mount parent, mount point) pairs to mounts. 
(Since mounts // are uniquely namespaced, including mount parent in the key correctly // handles both bind mounts and mount namespaces; Linux does the same.) // Synchronization between mutators and readers is provided by mounts.seq; // synchronization between mutators is provided by mountMu. // // mounts is used to follow mount points during path traversal. We use a // single table rather than per-Dentry tables to reduce size (and therefore // cache footprint) for the vast majority of Dentries that are not mount // points. // // mounts is analogous to Linux's mount_hashtable. mounts mountTable `state:".([]*Mount)"` // mountpoints maps mount points to mounts at those points in all // namespaces. mountpoints is protected by mountMu. // // mountpoints is used to find mounts that must be umounted due to // removal of a mount point Dentry from another mount namespace. ("A file // or directory that is a mount point in one namespace that is not a mount // point in another namespace, may be renamed, unlinked, or removed // (rmdir(2)) in the mount namespace in which it is not a mount point // (subject to the usual permission checks)." - mount_namespaces(7)) // // mountpoints is analogous to Linux's mountpoint_hashtable. mountpoints map[*Dentry]map[*Mount]struct{} // lastMountID is the last allocated mount ID. lastMountID is accessed // using atomic memory operations. lastMountID atomicbitops.Uint64 // anonMount is a Mount, not included in mounts or mountpoints, // representing an anonFilesystem. anonMount is used to back // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). // anonMount is immutable. // // anonMount is analogous to Linux's anon_inode_mnt. anonMount *Mount // devices contains all registered Devices. devices is protected by // devicesMu. devicesMu sync.RWMutex `state:"nosave"` devices map[devTuple]*registeredDevice // dynCharDevMajorUsed contains all allocated dynamic character device // major numbers. dynCharDevMajor is protected by dynCharDevMajorMu. dynCharDevMajorMu sync.Mutex `state:"nosave"` dynCharDevMajorUsed map[uint32]struct{} // anonBlockDevMinor contains all allocated anonymous block device minor // numbers. anonBlockDevMinorNext is a lower bound for the smallest // unallocated anonymous block device number. anonBlockDevMinorNext and // anonBlockDevMinor are protected by anonBlockDevMinorMu. anonBlockDevMinorMu sync.Mutex `state:"nosave"` anonBlockDevMinorNext uint32 anonBlockDevMinor map[uint32]struct{} // fsTypes contains all registered FilesystemTypes. fsTypes is protected by // fsTypesMu. fsTypesMu sync.RWMutex `state:"nosave"` fsTypes map[string]*registeredFilesystemType // filesystems contains all Filesystems. filesystems is protected by // filesystemsMu. filesystemsMu sync.Mutex `state:"nosave"` filesystems map[*Filesystem]struct{} // groupIDBitmap tracks which mount group IDs are available for allocation. groupIDBitmap bitmap.Bitmap // mountPromises contains all unresolved mount promises. mountPromises sync.Map `state:".(map[VirtualDentry]*mountPromise)"` // toDecRef contains all the reference counted objects that needed to be // DecRefd while mountMu was held. It is cleared every time unlockMounts is // called and protected by mountMu. // // +checklocks:mountMu toDecRef map[refs.RefCounter]int } // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. 
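// Editor's note: the following usage sketch is an illustration added during
// editing, not part of the original source. It shows the expected lifecycle:
// a VirtualFilesystem is initialized exactly once before filesystem types are
// registered or mounts are created. The surrounding ctx is assumed to be a
// context.Context obtained by the caller.
//
//	vfsObj := &VirtualFilesystem{}
//	if err := vfsObj.Init(ctx); err != nil {
//		panic(fmt.Sprintf("VFS init failed: %v", err))
//	}
//	// vfsObj can now register FilesystemTypes and create mounts.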
func (vfs *VirtualFilesystem) Init(ctx context.Context) error { if vfs.mountpoints != nil { panic("VFS already initialized") } vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{}) vfs.devices = make(map[devTuple]*registeredDevice) vfs.dynCharDevMajorUsed = make(map[uint32]struct{}) vfs.anonBlockDevMinorNext = 1 vfs.anonBlockDevMinor = make(map[uint32]struct{}) vfs.fsTypes = make(map[string]*registeredFilesystemType) vfs.filesystems = make(map[*Filesystem]struct{}) vfs.mounts.Init() vfs.groupIDBitmap = bitmap.New(1024) vfs.mountMu.Lock() vfs.toDecRef = make(map[refs.RefCounter]int) vfs.mountMu.Unlock() // Construct vfs.anonMount. anonfsDevMinor, err := vfs.GetAnonBlockDevMinor() if err != nil { // This shouldn't be possible since anonBlockDevMinorNext was // initialized to 1 above (no device numbers have been allocated yet). panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err)) } anonfs := anonFilesystem{ devMinor: anonfsDevMinor, } anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs) defer anonfs.vfsfs.DecRef(ctx) anonMount := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{}) vfs.anonMount = anonMount return nil } // Release drops references on filesystem objects held by vfs. // // Precondition: This must be called after VFS.Init() has succeeded. func (vfs *VirtualFilesystem) Release(ctx context.Context) { vfs.anonMount.DecRef(ctx) for _, fst := range vfs.fsTypes { fst.fsType.Release(ctx) } } // PathOperation specifies the path operated on by a VFS method. // // PathOperation is passed to VFS methods by pointer to reduce memory copying: // it's somewhat large and should never escape. (Options structs are passed by // pointer to VFS and FileDescription methods for the same reason.) // // +stateify savable type PathOperation struct { // Root is the VFS root. References on Root are borrowed from the provider // of the PathOperation. // // Invariants: Root.Ok(). Root VirtualDentry // Start is the starting point for the path traversal. References on Start // are borrowed from the provider of the PathOperation (i.e. the caller of // the VFS method to which the PathOperation was passed). // // Invariants: Start.Ok(). If Path.Absolute, then Start == Root. Start VirtualDentry // Path is the pathname traversed by this operation. Path fspath.Path // If FollowFinalSymlink is true, and the Dentry traversed by the final // path component represents a symbolic link, the symbolic link should be // followed. FollowFinalSymlink bool } // AccessAt checks whether a user with creds has access to the file at // the given path. func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error { rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats) if err == nil { rp.Release(ctx) return nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // GetDentryAt returns a VirtualDentry representing the given path, at which a // file must exist. A reference is taken on the returned VirtualDentry. 
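// Editor's note: illustrative sketch (not in the original source) of a typical
// GetDentryAt call. vfsObj, root, creds, and ctx are assumed to be obtained by
// the caller; the returned VirtualDentry must be DecRef'd when no longer
// needed.
//
//	pop := &PathOperation{
//		Root:               root,
//		Start:              root,
//		Path:               fspath.Parse("/etc/hosts"),
//		FollowFinalSymlink: true,
//	}
//	vd, err := vfsObj.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
//	if err != nil {
//		return err
//	}
//	defer vd.DecRef(ctx)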
func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) if err == nil { vd := VirtualDentry{ mount: rp.mount, dentry: d, } rp.mount.IncRef() rp.Release(ctx) return vd, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return VirtualDentry{}, err } } } // Preconditions: pop.Path.Begin.Ok(). func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) { rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp) if err == nil { parentVD := VirtualDentry{ mount: rp.mount, dentry: parent, } rp.mount.IncRef() name := rp.Component() rp.Release(ctx) return parentVD, name, nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return VirtualDentry{}, "", err } } } // LinkAt creates a hard link at newpop representing the existing file at // oldpop. func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error { oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) if err != nil { return err } if !newpop.Path.Begin.Ok() { oldVD.DecRef(ctx) if newpop.Path.Absolute { return linuxerr.EEXIST } return linuxerr.ENOENT } if newpop.FollowFinalSymlink { oldVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, newpop) for { vfs.maybeBlockOnMountPromise(ctx, rp) err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) if err == nil { rp.Release(ctx) oldVD.DecRef(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) oldVD.DecRef(ctx) return err } } } // MkdirAt creates a directory at the given path. func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { if !pop.Path.Begin.Ok() { // pop.Path should not be empty in operations that create/delete files. // This is consistent with mkdirat(dirfd, "", mode). if pop.Path.Absolute { return linuxerr.EEXIST } return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink") return linuxerr.EINVAL } // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is // also honored." - mkdir(2) opts.Mode &= 0777 | linux.S_ISVTX rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // MknodAt creates a file of the given mode at the given path. It returns an // error from the linuxerr package. 
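// Editor's note: hedged sketch (added during editing, not part of the original
// source) of creating a FIFO with MknodAt. As with MkdirAt above, the path in
// pop must be non-empty and must not follow a final symlink; the Mode field
// shown follows the pattern of the other *Options structs in this package.
//
//	err := vfsObj.MknodAt(ctx, creds, pop, &MknodOptions{
//		Mode: linux.S_IFIFO | 0644,
//	})
//	if linuxerr.Equals(linuxerr.EEXIST, err) {
//		// In this sketch an existing file at pop is treated as non-fatal.
//		err = nil
//	}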
func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { if !pop.Path.Begin.Ok() { // pop.Path should not be empty in operations that create/delete files. // This is consistent with mknodat(dirfd, "", mode, dev). if pop.Path.Absolute { return linuxerr.EEXIST } return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink") return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // OpenAt returns a FileDescription providing access to the file at the given // path. A reference is taken on the returned FileDescription. func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { fsmetric.Opens.Increment() // Remove: // // - O_CLOEXEC, which affects file descriptors and therefore must be // handled outside of VFS. // // - Unknown flags. opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. if opts.Flags&linux.O_SYNC != 0 { opts.Flags |= linux.O_DSYNC } // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified // with O_DIRECTORY and a writable access mode (to ensure that it fails on // filesystem implementations that do not support it). if opts.Flags&linux.O_TMPFILE != 0 { if opts.Flags&linux.O_DIRECTORY == 0 { return nil, linuxerr.EINVAL } if opts.Flags&linux.O_CREAT != 0 { return nil, linuxerr.EINVAL } if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { return nil, linuxerr.EINVAL } } // O_PATH causes most other flags to be ignored. if opts.Flags&linux.O_PATH != 0 { opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH } // "On Linux, the following bits are also honored in mode: [S_ISUID, // S_ISGID, S_ISVTX]" - open(2) opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX if opts.Flags&linux.O_NOFOLLOW != 0 { pop.FollowFinalSymlink = false } if opts.Flags&linux.O_PATH != 0 { return vfs.openOPathFD(ctx, creds, pop, opts.Flags) } rp := vfs.getResolvingPath(creds, pop) if opts.Flags&linux.O_DIRECTORY != 0 { rp.mustBeDir = true } for { vfs.maybeBlockOnMountPromise(ctx, rp) fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) if opts.FileExec { if fd.Mount().Options().Flags.NoExec { fd.DecRef(ctx) return nil, linuxerr.EACCES } // Only a regular file can be executed. 
stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) if err != nil { fd.DecRef(ctx) return nil, err } if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { fd.DecRef(ctx) return nil, linuxerr.EACCES } } fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent) return fd, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return nil, err } } } // ReadlinkAt returns the target of the symbolic link at the given path. func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) { rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) if err == nil { rp.Release(ctx) return target, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return "", err } } } // RenameAt renames the file at oldpop to newpop. func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { if !oldpop.Path.Begin.Ok() { if oldpop.Path.Absolute { return linuxerr.EBUSY } return linuxerr.ENOENT } if oldpop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink") return linuxerr.EINVAL } oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop) if err != nil { return err } if oldName == "." || oldName == ".." { oldParentVD.DecRef(ctx) return linuxerr.EBUSY } if len(oldName) > linux.NAME_MAX { oldParentVD.DecRef(ctx) return linuxerr.ENAMETOOLONG } if !newpop.Path.Begin.Ok() { oldParentVD.DecRef(ctx) if newpop.Path.Absolute { return linuxerr.EBUSY } return linuxerr.ENOENT } if newpop.FollowFinalSymlink { oldParentVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, newpop) renameOpts := *opts if oldpop.Path.Dir { renameOpts.MustBeDir = true } for { vfs.maybeBlockOnMountPromise(ctx, rp) err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) if err == nil { rp.Release(ctx) oldParentVD.DecRef(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) oldParentVD.DecRef(ctx) return err } } } // RmdirAt removes the directory at the given path. func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { if !pop.Path.Begin.Ok() { // pop.Path should not be empty in operations that create/delete files. // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR). if pop.Path.Absolute { return linuxerr.EBUSY } return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink") return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) err := rp.mount.fs.impl.RmdirAt(ctx, rp) if err == nil { rp.Release(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // SetStatAt changes metadata for the file at the given path. 
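// Editor's note: illustrative chmod-like sketch (not part of the original
// source). SetStatOptions is assumed to wrap a linux.Statx whose Mask selects
// the fields to apply, mirroring statx(2); vfsObj, creds, ctx, and pop are
// assumed to come from the caller.
//
//	err := vfsObj.SetStatAt(ctx, creds, pop, &SetStatOptions{
//		Stat: linux.Statx{
//			Mask: linux.STATX_MODE,
//			Mode: 0o600,
//		},
//	})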
func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error { rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // StatAt returns metadata for the file at the given path. func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return stat, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return linux.Statx{}, err } } } // StatFSAt returns metadata for the filesystem containing the file at the // given path. func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) { rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) if err == nil { statfs.Flags |= rp.mount.MountFlags() rp.Release(ctx) return statfs, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return linux.Statfs{}, err } } } // SymlinkAt creates a symbolic link at the given path with the given target. func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { if !pop.Path.Begin.Ok() { // pop.Path should not be empty in operations that create/delete files. // This is consistent with symlinkat(oldpath, newdirfd, ""). if pop.Path.Absolute { return linuxerr.EEXIST } return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink") return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) if err == nil { rp.Release(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // UnlinkAt deletes the non-directory file at the given path. func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { if !pop.Path.Begin.Ok() { // pop.Path should not be empty in operations that create/delete files. // This is consistent with unlinkat(dirfd, "", 0). if pop.Path.Absolute { return linuxerr.EBUSY } return linuxerr.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink") return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) err := rp.mount.fs.impl.UnlinkAt(ctx, rp) if err == nil { rp.Release(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // BoundEndpointAt gets the bound endpoint at the given path, if one exists. 
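// Editor's note: hedged sketch (added during editing, not in the original
// source) of looking up a bound Unix domain socket endpoint. The Addr field of
// BoundEndpointOptions is assumed to carry the connecting socket's address;
// treat the field names here as assumptions rather than a definitive API.
//
//	bep, err := vfsObj.BoundEndpointAt(ctx, creds, pop, &BoundEndpointOptions{
//		Addr: "/run/app.sock",
//	})
//	if err != nil {
//		return err
//	}
//	_ = bep // bep can then be passed to the connecting transport endpoint.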
func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) { rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return bep, nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return nil, err } } } // ListXattrAt returns all extended attribute names for the file at the given // path. func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size) if err == nil { rp.Release(ctx) return names, nil } if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { // Linux doesn't actually return EOPNOTSUPP in this case; instead, // fs/xattr.c:vfs_listxattr() falls back to allowing the security // subsystem to return security extended attributes, which by // default don't exist. rp.Release(ctx) return nil, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return nil, err } } } // GetXattrAt returns the value associated with the given extended attribute // for the file at the given path. func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) { rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return val, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return "", err } } } // SetXattrAt changes the value associated with the given extended attribute // for the file at the given path. func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error { rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // RemoveXattrAt removes the given extended attribute from the file at rp. func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { rp := vfs.getResolvingPath(creds, pop) for { vfs.maybeBlockOnMountPromise(ctx, rp) err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name) if err == nil { rp.Release(ctx) return nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // SyncAllFilesystems has the semantics of Linux's sync(2). func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { var retErr error for fs := range vfs.getFilesystems() { if err := fs.impl.Sync(ctx); err != nil && retErr == nil { retErr = err } fs.DecRef(ctx) } return retErr } func (vfs *VirtualFilesystem) getFilesystems() map[*Filesystem]struct{} { fss := make(map[*Filesystem]struct{}) vfs.filesystemsMu.Lock() defer vfs.filesystemsMu.Unlock() for fs := range vfs.filesystems { if !fs.TryIncRef() { continue } fss[fs] = struct{}{} } return fss } // MkdirAllAt recursively creates non-existent directories on the given path // (including the last component). 
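// Editor's note: usage sketch (not part of the original source). MkdirAllAt
// behaves like "mkdir -p" relative to root; MakeSyntheticMountpoint below uses
// it to prepare mount point directories. vfsObj, root, creds, and ctx are
// assumed to be provided by the caller.
//
//	opts := &MkdirOptions{Mode: 0o755}
//	if err := vfsObj.MkdirAllAt(ctx, "/var/run/app", root, creds, opts, true /* mustBeDir */); err != nil {
//		return err
//	}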
func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions, mustBeDir bool) error { pop := &PathOperation{ Root: root, Start: root, Path: fspath.Parse(currentPath), } stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE}) switch { case err == nil: if mustBeDir && (stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory) { return linuxerr.ENOTDIR } // Directory already exists. return nil case linuxerr.Equals(linuxerr.ENOENT, err): // Expected, we will create the dir. default: return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err) } // Recurse to ensure parent is created and then create the final directory. if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts, true /* mustBeDir */); err != nil { return err } if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil { return fmt.Errorf("failed to create directory %q: %w", currentPath, err) } return nil } // MakeSyntheticMountpoint creates parent directories of target if they do not // exist and attempts to create a directory for the mountpoint. If a // non-directory file already exists there then we allow it. func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error { mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} // Make sure the parent directory of target exists. if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts, true /* mustBeDir */); err != nil { return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err) } // Attempt to mkdir the final component. If a file (of any type) exists // then we allow mounting on top of that because we do not require the // target to be an existing directory, unlike Linux mount(2). if err := vfs.MkdirAllAt(ctx, target, root, creds, mkdirOpts, false /* mustBeDir */); err != nil { return fmt.Errorf("failed to create mountpoint %q: %w", target, err) } return nil } func (vfs *VirtualFilesystem) getMountPromise(vd VirtualDentry) *mountPromise { if mp, ok := vfs.mountPromises.Load(vd); ok { return mp.(*mountPromise) } return nil } // RegisterMountPromise marks vd as a mount promise. This means any VFS // operation on vd will be blocked until another process mounts over it or the // mount promise times out. func (vfs *VirtualFilesystem) RegisterMountPromise(vd VirtualDentry) error { if _, loaded := vfs.mountPromises.LoadOrStore(vd, &mountPromise{wq: &waiter.Queue{}}); loaded { return fmt.Errorf("mount promise already registered for %v", vd) } return nil } // Emit a SentryMountPromiseBlockEvent and wait for the mount promise to be // resolved or time out. func (vfs *VirtualFilesystem) maybeBlockOnMountPromise(ctx context.Context, rp *ResolvingPath) { vd := VirtualDentry{rp.mount, rp.start} mp := vfs.getMountPromise(vd) if mp == nil { return } else if mp.resolved.Load() { vfs.updateResolvingPathForMountPromise(ctx, rp) return } e, ch := waiter.NewChannelEntry(waiter.EventOut) mp.wq.EventRegister(&e) defer mp.wq.EventUnregister(&e) var ( path string err error ) // Unblock waiter entries that were created after this mount promise was // resolved by a racing thread.
if mp.resolved.Load() { close(ch) } else { root := RootFromContext(ctx) defer root.DecRef(ctx) path, err = vfs.PathnameReachable(ctx, root, vd) if err != nil { panic(fmt.Sprintf("could not reach %v from root", rp.Component())) } if path == "" { log.Warningf("Attempting to block for a mount promise on an empty path.") return } eventchannel.Emit(&epb.SentryMountPromiseBlockEvent{Path: path}) } select { case <-ch: vfs.updateResolvingPathForMountPromise(ctx, rp) case <-time.After(mountPromiseTimeout): panic(fmt.Sprintf("mount promise for %s timed out, unable to proceed", path)) } } func (vfs *VirtualFilesystem) updateResolvingPathForMountPromise(ctx context.Context, rp *ResolvingPath) { newMnt := vfs.getMountAt(ctx, rp.mount, rp.start) rp.mount = newMnt rp.start = newMnt.root rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef } func (vfs *VirtualFilesystem) maybeResolveMountPromise(vd VirtualDentry) { if mp := vfs.getMountPromise(vd); mp != nil { mp.resolved.Store(true) mp.wq.Notify(waiter.EventOut) } } // PopDelayedDecRefs transfers the ownership of vfs.toDecRef to the caller via // the returned list. It is the caller's responsibility to DecRef these objects // later. They must be DecRef'd outside of mountMu. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) PopDelayedDecRefs() []refs.RefCounter { var rcs []refs.RefCounter for rc, refs := range vfs.toDecRef { for i := 0; i < refs; i++ { rcs = append(rcs, rc) } } clear(vfs.toDecRef) return rcs } // delayDecRef saves a reference counted object so that it can be DecRef'd // outside of vfs.mountMu. This is necessary because filesystem locks possibly // taken by DentryImpl.DecRef() may precede vfs.mountMu in the lock order, and // Mount.DecRef() may lock vfs.mountMu. // // +checklocks:vfs.mountMu func (vfs *VirtualFilesystem) delayDecRef(rc refs.RefCounter) { vfs.toDecRef[rc]++ } // Use this instead of vfs.mountMu.Lock(). // // +checklocksacquire:vfs.mountMu func (vfs *VirtualFilesystem) lockMounts() { vfs.mountMu.Lock() } // Use this instead of vfs.mountMu.Unlock(). This method DecRefs any reference // counted objects that were collected while mountMu was held. // // +checklocksrelease:vfs.mountMu func (vfs *VirtualFilesystem) unlockMounts(ctx context.Context) { if len(vfs.toDecRef) == 0 { vfs.mountMu.Unlock() return } toDecRef := vfs.toDecRef // Can't use `clear` here as this would reference the same map as `toDecRef`. vfs.toDecRef = map[refs.RefCounter]int{} vfs.mountMu.Unlock() for rc, refs := range toDecRef { for i := 0; i < refs; i++ { rc.DecRef(ctx) } } } // A VirtualDentry represents a node in a VFS tree, by combining a Dentry // (which represents a node in a Filesystem's tree) and a Mount (which // represents the Filesystem's position in a VFS mount tree). // // VirtualDentry's semantics are similar to those of a Go interface object // representing a pointer: it is a copyable value type that represents // references to another entity. The zero value of VirtualDentry is an "empty // VirtualDentry", directly analogous to a nil interface object. // VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless // otherwise specified, all other VirtualDentry methods require // VirtualDentry.Ok() == true. // // Mounts and Dentries are reference-counted, requiring that users call // VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to // references on the Mount and Dentry referred to by a VirtualDentry as // references on the VirtualDentry itself.
Unless otherwise specified, all // VirtualDentry methods require that a reference is held on the VirtualDentry. // // VirtualDentry is analogous to Linux's struct path. // // +stateify savable type VirtualDentry struct { mount *Mount dentry *Dentry } // MakeVirtualDentry creates a VirtualDentry. func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry { return VirtualDentry{ mount: mount, dentry: dentry, } } // Ok returns true if vd is not empty. It does not require that a reference is // held. func (vd VirtualDentry) Ok() bool { return vd.mount != nil } // IncRef increments the reference counts on the Mount and Dentry represented // by vd. func (vd VirtualDentry) IncRef() { vd.mount.IncRef() vd.dentry.IncRef() } // DecRef decrements the reference counts on the Mount and Dentry represented // by vd. func (vd VirtualDentry) DecRef(ctx context.Context) { vd.dentry.DecRef(ctx) vd.mount.DecRef(ctx) } // Mount returns the Mount associated with vd. It does not take a reference on // the returned Mount. func (vd VirtualDentry) Mount() *Mount { return vd.mount } // Dentry returns the Dentry associated with vd. It does not take a reference // on the returned Dentry. func (vd VirtualDentry) Dentry() *Dentry { return vd.dentry } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/vfs_state_autogen.go000066400000000000000000001600231465435605700256170ustar00rootroot00000000000000// automatically generated by stateify. //go:build !check_invariants // +build !check_invariants package vfs import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (a *anonFilesystemType) StateTypeName() string { return "pkg/sentry/vfs.anonFilesystemType" } func (a *anonFilesystemType) StateFields() []string { return []string{} } func (a *anonFilesystemType) beforeSave() {} // +checklocksignore func (a *anonFilesystemType) StateSave(stateSinkObject state.Sink) { a.beforeSave() } func (a *anonFilesystemType) afterLoad(context.Context) {} // +checklocksignore func (a *anonFilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (fs *anonFilesystem) StateTypeName() string { return "pkg/sentry/vfs.anonFilesystem" } func (fs *anonFilesystem) StateFields() []string { return []string{ "vfsfs", "devMinor", } } func (fs *anonFilesystem) beforeSave() {} // +checklocksignore func (fs *anonFilesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.vfsfs) stateSinkObject.Save(1, &fs.devMinor) } func (fs *anonFilesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *anonFilesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.vfsfs) stateSourceObject.Load(1, &fs.devMinor) } func (d *anonDentry) StateTypeName() string { return "pkg/sentry/vfs.anonDentry" } func (d *anonDentry) StateFields() []string { return []string{ "vfsd", "name", "watches", } } func (d *anonDentry) beforeSave() {} // +checklocksignore func (d *anonDentry) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.vfsd) stateSinkObject.Save(1, &d.name) stateSinkObject.Save(2, &d.watches) } func (d *anonDentry) afterLoad(context.Context) {} // +checklocksignore func (d *anonDentry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.vfsd) stateSourceObject.Load(1, &d.name) stateSourceObject.Load(2, &d.watches) } func (d *Dentry) StateTypeName() string { return "pkg/sentry/vfs.Dentry" } func (d *Dentry) StateFields() []string { return []string{ "dead", "evictable", 
"mounts", "impl", } } func (d *Dentry) beforeSave() {} // +checklocksignore func (d *Dentry) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.dead) stateSinkObject.Save(1, &d.evictable) stateSinkObject.Save(2, &d.mounts) stateSinkObject.Save(3, &d.impl) } func (d *Dentry) afterLoad(context.Context) {} // +checklocksignore func (d *Dentry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.dead) stateSourceObject.Load(1, &d.evictable) stateSourceObject.Load(2, &d.mounts) stateSourceObject.Load(3, &d.impl) } func (kind *DeviceKind) StateTypeName() string { return "pkg/sentry/vfs.DeviceKind" } func (kind *DeviceKind) StateFields() []string { return nil } func (d *devTuple) StateTypeName() string { return "pkg/sentry/vfs.devTuple" } func (d *devTuple) StateFields() []string { return []string{ "kind", "major", "minor", } } func (d *devTuple) beforeSave() {} // +checklocksignore func (d *devTuple) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.kind) stateSinkObject.Save(1, &d.major) stateSinkObject.Save(2, &d.minor) } func (d *devTuple) afterLoad(context.Context) {} // +checklocksignore func (d *devTuple) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.kind) stateSourceObject.Load(1, &d.major) stateSourceObject.Load(2, &d.minor) } func (r *registeredDevice) StateTypeName() string { return "pkg/sentry/vfs.registeredDevice" } func (r *registeredDevice) StateFields() []string { return []string{ "dev", "opts", } } func (r *registeredDevice) beforeSave() {} // +checklocksignore func (r *registeredDevice) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.dev) stateSinkObject.Save(1, &r.opts) } func (r *registeredDevice) afterLoad(context.Context) {} // +checklocksignore func (r *registeredDevice) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.dev) stateSourceObject.Load(1, &r.opts) } func (r *RegisterDeviceOptions) StateTypeName() string { return "pkg/sentry/vfs.RegisterDeviceOptions" } func (r *RegisterDeviceOptions) StateFields() []string { return []string{ "GroupName", "Pathname", "FilePerms", } } func (r *RegisterDeviceOptions) beforeSave() {} // +checklocksignore func (r *RegisterDeviceOptions) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.GroupName) stateSinkObject.Save(1, &r.Pathname) stateSinkObject.Save(2, &r.FilePerms) } func (r *RegisterDeviceOptions) afterLoad(context.Context) {} // +checklocksignore func (r *RegisterDeviceOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.GroupName) stateSourceObject.Load(1, &r.Pathname) stateSourceObject.Load(2, &r.FilePerms) } func (ep *EpollInstance) StateTypeName() string { return "pkg/sentry/vfs.EpollInstance" } func (ep *EpollInstance) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", "q", "interest", "ready", "readySeq", } } func (ep *EpollInstance) beforeSave() {} // +checklocksignore func (ep *EpollInstance) StateSave(stateSinkObject state.Sink) { ep.beforeSave() stateSinkObject.Save(0, &ep.vfsfd) stateSinkObject.Save(1, &ep.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &ep.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &ep.NoLockFD) stateSinkObject.Save(4, &ep.q) stateSinkObject.Save(5, &ep.interest) stateSinkObject.Save(6, 
&ep.ready) stateSinkObject.Save(7, &ep.readySeq) } func (ep *EpollInstance) afterLoad(context.Context) {} // +checklocksignore func (ep *EpollInstance) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ep.vfsfd) stateSourceObject.Load(1, &ep.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &ep.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &ep.NoLockFD) stateSourceObject.Load(4, &ep.q) stateSourceObject.Load(5, &ep.interest) stateSourceObject.Load(6, &ep.ready) stateSourceObject.Load(7, &ep.readySeq) } func (e *epollInterestKey) StateTypeName() string { return "pkg/sentry/vfs.epollInterestKey" } func (e *epollInterestKey) StateFields() []string { return []string{ "file", "num", } } func (e *epollInterestKey) beforeSave() {} // +checklocksignore func (e *epollInterestKey) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.file) stateSinkObject.Save(1, &e.num) } func (e *epollInterestKey) afterLoad(context.Context) {} // +checklocksignore func (e *epollInterestKey) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.file) stateSourceObject.Load(1, &e.num) } func (epi *epollInterest) StateTypeName() string { return "pkg/sentry/vfs.epollInterest" } func (epi *epollInterest) StateFields() []string { return []string{ "epoll", "key", "waiter", "mask", "ready", "epollInterestEntry", "readySeq", "userData", } } func (epi *epollInterest) beforeSave() {} // +checklocksignore func (epi *epollInterest) StateSave(stateSinkObject state.Sink) { epi.beforeSave() stateSinkObject.Save(0, &epi.epoll) stateSinkObject.Save(1, &epi.key) stateSinkObject.Save(2, &epi.waiter) stateSinkObject.Save(3, &epi.mask) stateSinkObject.Save(4, &epi.ready) stateSinkObject.Save(5, &epi.epollInterestEntry) stateSinkObject.Save(6, &epi.readySeq) stateSinkObject.Save(7, &epi.userData) } // +checklocksignore func (epi *epollInterest) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadWait(0, &epi.epoll) stateSourceObject.Load(1, &epi.key) stateSourceObject.Load(2, &epi.waiter) stateSourceObject.Load(3, &epi.mask) stateSourceObject.Load(4, &epi.ready) stateSourceObject.Load(5, &epi.epollInterestEntry) stateSourceObject.Load(6, &epi.readySeq) stateSourceObject.Load(7, &epi.userData) stateSourceObject.AfterLoad(func() { epi.afterLoad(ctx) }) } func (l *epollInterestList) StateTypeName() string { return "pkg/sentry/vfs.epollInterestList" } func (l *epollInterestList) StateFields() []string { return []string{ "head", "tail", } } func (l *epollInterestList) beforeSave() {} // +checklocksignore func (l *epollInterestList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *epollInterestList) afterLoad(context.Context) {} // +checklocksignore func (l *epollInterestList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *epollInterestEntry) StateTypeName() string { return "pkg/sentry/vfs.epollInterestEntry" } func (e *epollInterestEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *epollInterestEntry) beforeSave() {} // +checklocksignore func (e *epollInterestEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *epollInterestEntry) afterLoad(context.Context) {} // +checklocksignore func (e 
*epollInterestEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (l *eventList) StateTypeName() string { return "pkg/sentry/vfs.eventList" } func (l *eventList) StateFields() []string { return []string{ "head", "tail", } } func (l *eventList) beforeSave() {} // +checklocksignore func (l *eventList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *eventList) afterLoad(context.Context) {} // +checklocksignore func (l *eventList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *eventEntry) StateTypeName() string { return "pkg/sentry/vfs.eventEntry" } func (e *eventEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *eventEntry) beforeSave() {} // +checklocksignore func (e *eventEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *eventEntry) afterLoad(context.Context) {} // +checklocksignore func (e *eventEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (fd *FileDescription) StateTypeName() string { return "pkg/sentry/vfs.FileDescription" } func (fd *FileDescription) StateFields() []string { return []string{ "FileDescriptionRefs", "statusFlags", "asyncHandler", "epolls", "vd", "opts", "readable", "writable", "usedLockBSD", "impl", } } func (fd *FileDescription) beforeSave() {} // +checklocksignore func (fd *FileDescription) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.FileDescriptionRefs) stateSinkObject.Save(1, &fd.statusFlags) stateSinkObject.Save(2, &fd.asyncHandler) stateSinkObject.Save(3, &fd.epolls) stateSinkObject.Save(4, &fd.vd) stateSinkObject.Save(5, &fd.opts) stateSinkObject.Save(6, &fd.readable) stateSinkObject.Save(7, &fd.writable) stateSinkObject.Save(8, &fd.usedLockBSD) stateSinkObject.Save(9, &fd.impl) } func (fd *FileDescription) afterLoad(context.Context) {} // +checklocksignore func (fd *FileDescription) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.FileDescriptionRefs) stateSourceObject.Load(1, &fd.statusFlags) stateSourceObject.Load(2, &fd.asyncHandler) stateSourceObject.Load(3, &fd.epolls) stateSourceObject.Load(4, &fd.vd) stateSourceObject.Load(5, &fd.opts) stateSourceObject.Load(6, &fd.readable) stateSourceObject.Load(7, &fd.writable) stateSourceObject.Load(8, &fd.usedLockBSD) stateSourceObject.Load(9, &fd.impl) } func (f *FileDescriptionOptions) StateTypeName() string { return "pkg/sentry/vfs.FileDescriptionOptions" } func (f *FileDescriptionOptions) StateFields() []string { return []string{ "AllowDirectIO", "DenyPRead", "DenyPWrite", "UseDentryMetadata", "DenySpliceIn", } } func (f *FileDescriptionOptions) beforeSave() {} // +checklocksignore func (f *FileDescriptionOptions) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.AllowDirectIO) stateSinkObject.Save(1, &f.DenyPRead) stateSinkObject.Save(2, &f.DenyPWrite) stateSinkObject.Save(3, &f.UseDentryMetadata) stateSinkObject.Save(4, &f.DenySpliceIn) } func (f *FileDescriptionOptions) afterLoad(context.Context) {} // +checklocksignore func (f *FileDescriptionOptions) StateLoad(ctx context.Context, stateSourceObject 
state.Source) { stateSourceObject.Load(0, &f.AllowDirectIO) stateSourceObject.Load(1, &f.DenyPRead) stateSourceObject.Load(2, &f.DenyPWrite) stateSourceObject.Load(3, &f.UseDentryMetadata) stateSourceObject.Load(4, &f.DenySpliceIn) } func (d *Dirent) StateTypeName() string { return "pkg/sentry/vfs.Dirent" } func (d *Dirent) StateFields() []string { return []string{ "Name", "Type", "Ino", "NextOff", } } func (d *Dirent) beforeSave() {} // +checklocksignore func (d *Dirent) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.Name) stateSinkObject.Save(1, &d.Type) stateSinkObject.Save(2, &d.Ino) stateSinkObject.Save(3, &d.NextOff) } func (d *Dirent) afterLoad(context.Context) {} // +checklocksignore func (d *Dirent) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.Name) stateSourceObject.Load(1, &d.Type) stateSourceObject.Load(2, &d.Ino) stateSourceObject.Load(3, &d.NextOff) } func (f *FileDescriptionDefaultImpl) StateTypeName() string { return "pkg/sentry/vfs.FileDescriptionDefaultImpl" } func (f *FileDescriptionDefaultImpl) StateFields() []string { return []string{} } func (f *FileDescriptionDefaultImpl) beforeSave() {} // +checklocksignore func (f *FileDescriptionDefaultImpl) StateSave(stateSinkObject state.Sink) { f.beforeSave() } func (f *FileDescriptionDefaultImpl) afterLoad(context.Context) {} // +checklocksignore func (f *FileDescriptionDefaultImpl) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (d *DirectoryFileDescriptionDefaultImpl) StateTypeName() string { return "pkg/sentry/vfs.DirectoryFileDescriptionDefaultImpl" } func (d *DirectoryFileDescriptionDefaultImpl) StateFields() []string { return []string{} } func (d *DirectoryFileDescriptionDefaultImpl) beforeSave() {} // +checklocksignore func (d *DirectoryFileDescriptionDefaultImpl) StateSave(stateSinkObject state.Sink) { d.beforeSave() } func (d *DirectoryFileDescriptionDefaultImpl) afterLoad(context.Context) {} // +checklocksignore func (d *DirectoryFileDescriptionDefaultImpl) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (d *DentryMetadataFileDescriptionImpl) StateTypeName() string { return "pkg/sentry/vfs.DentryMetadataFileDescriptionImpl" } func (d *DentryMetadataFileDescriptionImpl) StateFields() []string { return []string{} } func (d *DentryMetadataFileDescriptionImpl) beforeSave() {} // +checklocksignore func (d *DentryMetadataFileDescriptionImpl) StateSave(stateSinkObject state.Sink) { d.beforeSave() } func (d *DentryMetadataFileDescriptionImpl) afterLoad(context.Context) {} // +checklocksignore func (d *DentryMetadataFileDescriptionImpl) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (s *StaticData) StateTypeName() string { return "pkg/sentry/vfs.StaticData" } func (s *StaticData) StateFields() []string { return []string{ "Data", } } func (s *StaticData) beforeSave() {} // +checklocksignore func (s *StaticData) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.Data) } func (s *StaticData) afterLoad(context.Context) {} // +checklocksignore func (s *StaticData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.Data) } func (fd *DynamicBytesFileDescriptionImpl) StateTypeName() string { return "pkg/sentry/vfs.DynamicBytesFileDescriptionImpl" } func (fd *DynamicBytesFileDescriptionImpl) StateFields() []string { return []string{ "vfsfd", "data", "buf", "off", "lastRead", } } func (fd 
*DynamicBytesFileDescriptionImpl) beforeSave() {} // +checklocksignore func (fd *DynamicBytesFileDescriptionImpl) StateSave(stateSinkObject state.Sink) { fd.beforeSave() var bufValue []byte bufValue = fd.saveBuf() stateSinkObject.SaveValue(2, bufValue) stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.data) stateSinkObject.Save(3, &fd.off) stateSinkObject.Save(4, &fd.lastRead) } func (fd *DynamicBytesFileDescriptionImpl) afterLoad(context.Context) {} // +checklocksignore func (fd *DynamicBytesFileDescriptionImpl) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.data) stateSourceObject.Load(3, &fd.off) stateSourceObject.Load(4, &fd.lastRead) stateSourceObject.LoadValue(2, new([]byte), func(y any) { fd.loadBuf(ctx, y.([]byte)) }) } func (fd *LockFD) StateTypeName() string { return "pkg/sentry/vfs.LockFD" } func (fd *LockFD) StateFields() []string { return []string{ "locks", } } func (fd *LockFD) beforeSave() {} // +checklocksignore func (fd *LockFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.locks) } func (fd *LockFD) afterLoad(context.Context) {} // +checklocksignore func (fd *LockFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.locks) } func (n *NoLockFD) StateTypeName() string { return "pkg/sentry/vfs.NoLockFD" } func (n *NoLockFD) StateFields() []string { return []string{} } func (n *NoLockFD) beforeSave() {} // +checklocksignore func (n *NoLockFD) StateSave(stateSinkObject state.Sink) { n.beforeSave() } func (n *NoLockFD) afterLoad(context.Context) {} // +checklocksignore func (n *NoLockFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (b *BadLockFD) StateTypeName() string { return "pkg/sentry/vfs.BadLockFD" } func (b *BadLockFD) StateFields() []string { return []string{} } func (b *BadLockFD) beforeSave() {} // +checklocksignore func (b *BadLockFD) StateSave(stateSinkObject state.Sink) { b.beforeSave() } func (b *BadLockFD) afterLoad(context.Context) {} // +checklocksignore func (b *BadLockFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *FileDescriptionRefs) StateTypeName() string { return "pkg/sentry/vfs.FileDescriptionRefs" } func (r *FileDescriptionRefs) StateFields() []string { return []string{ "refCount", } } func (r *FileDescriptionRefs) beforeSave() {} // +checklocksignore func (r *FileDescriptionRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *FileDescriptionRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (fs *Filesystem) StateTypeName() string { return "pkg/sentry/vfs.Filesystem" } func (fs *Filesystem) StateFields() []string { return []string{ "FilesystemRefs", "vfs", "fsType", "impl", } } func (fs *Filesystem) beforeSave() {} // +checklocksignore func (fs *Filesystem) StateSave(stateSinkObject state.Sink) { fs.beforeSave() stateSinkObject.Save(0, &fs.FilesystemRefs) stateSinkObject.Save(1, &fs.vfs) stateSinkObject.Save(2, &fs.fsType) stateSinkObject.Save(3, &fs.impl) } func (fs *Filesystem) afterLoad(context.Context) {} // +checklocksignore func (fs *Filesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fs.FilesystemRefs) stateSourceObject.Load(1, &fs.vfs) stateSourceObject.Load(2, 
&fs.fsType) stateSourceObject.Load(3, &fs.impl) } func (p *PrependPathAtVFSRootError) StateTypeName() string { return "pkg/sentry/vfs.PrependPathAtVFSRootError" } func (p *PrependPathAtVFSRootError) StateFields() []string { return []string{} } func (p *PrependPathAtVFSRootError) beforeSave() {} // +checklocksignore func (p *PrependPathAtVFSRootError) StateSave(stateSinkObject state.Sink) { p.beforeSave() } func (p *PrependPathAtVFSRootError) afterLoad(context.Context) {} // +checklocksignore func (p *PrependPathAtVFSRootError) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (p *PrependPathAtNonMountRootError) StateTypeName() string { return "pkg/sentry/vfs.PrependPathAtNonMountRootError" } func (p *PrependPathAtNonMountRootError) StateFields() []string { return []string{} } func (p *PrependPathAtNonMountRootError) beforeSave() {} // +checklocksignore func (p *PrependPathAtNonMountRootError) StateSave(stateSinkObject state.Sink) { p.beforeSave() } func (p *PrependPathAtNonMountRootError) afterLoad(context.Context) {} // +checklocksignore func (p *PrependPathAtNonMountRootError) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (p *PrependPathSyntheticError) StateTypeName() string { return "pkg/sentry/vfs.PrependPathSyntheticError" } func (p *PrependPathSyntheticError) StateFields() []string { return []string{} } func (p *PrependPathSyntheticError) beforeSave() {} // +checklocksignore func (p *PrependPathSyntheticError) StateSave(stateSinkObject state.Sink) { p.beforeSave() } func (p *PrependPathSyntheticError) afterLoad(context.Context) {} // +checklocksignore func (p *PrependPathSyntheticError) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *FilesystemRefs) StateTypeName() string { return "pkg/sentry/vfs.FilesystemRefs" } func (r *FilesystemRefs) StateFields() []string { return []string{ "refCount", } } func (r *FilesystemRefs) beforeSave() {} // +checklocksignore func (r *FilesystemRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *FilesystemRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (r *registeredFilesystemType) StateTypeName() string { return "pkg/sentry/vfs.registeredFilesystemType" } func (r *registeredFilesystemType) StateFields() []string { return []string{ "fsType", "opts", } } func (r *registeredFilesystemType) beforeSave() {} // +checklocksignore func (r *registeredFilesystemType) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.fsType) stateSinkObject.Save(1, &r.opts) } func (r *registeredFilesystemType) afterLoad(context.Context) {} // +checklocksignore func (r *registeredFilesystemType) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.fsType) stateSourceObject.Load(1, &r.opts) } func (r *RegisterFilesystemTypeOptions) StateTypeName() string { return "pkg/sentry/vfs.RegisterFilesystemTypeOptions" } func (r *RegisterFilesystemTypeOptions) StateFields() []string { return []string{ "AllowUserMount", "AllowUserList", "RequiresDevice", } } func (r *RegisterFilesystemTypeOptions) beforeSave() {} // +checklocksignore func (r *RegisterFilesystemTypeOptions) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.AllowUserMount) stateSinkObject.Save(1, &r.AllowUserList) stateSinkObject.Save(2, 
&r.RequiresDevice) } func (r *RegisterFilesystemTypeOptions) afterLoad(context.Context) {} // +checklocksignore func (r *RegisterFilesystemTypeOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.AllowUserMount) stateSourceObject.Load(1, &r.AllowUserList) stateSourceObject.Load(2, &r.RequiresDevice) } func (e *EventType) StateTypeName() string { return "pkg/sentry/vfs.EventType" } func (e *EventType) StateFields() []string { return nil } func (i *Inotify) StateTypeName() string { return "pkg/sentry/vfs.Inotify" } func (i *Inotify) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "DentryMetadataFileDescriptionImpl", "NoLockFD", "id", "queue", "events", "scratch", "nextWatchMinusOne", "watches", } } func (i *Inotify) beforeSave() {} // +checklocksignore func (i *Inotify) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.vfsfd) stateSinkObject.Save(1, &i.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &i.DentryMetadataFileDescriptionImpl) stateSinkObject.Save(3, &i.NoLockFD) stateSinkObject.Save(4, &i.id) stateSinkObject.Save(5, &i.queue) stateSinkObject.Save(6, &i.events) stateSinkObject.Save(7, &i.scratch) stateSinkObject.Save(8, &i.nextWatchMinusOne) stateSinkObject.Save(9, &i.watches) } func (i *Inotify) afterLoad(context.Context) {} // +checklocksignore func (i *Inotify) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.vfsfd) stateSourceObject.Load(1, &i.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &i.DentryMetadataFileDescriptionImpl) stateSourceObject.Load(3, &i.NoLockFD) stateSourceObject.Load(4, &i.id) stateSourceObject.Load(5, &i.queue) stateSourceObject.Load(6, &i.events) stateSourceObject.Load(7, &i.scratch) stateSourceObject.Load(8, &i.nextWatchMinusOne) stateSourceObject.Load(9, &i.watches) } func (w *Watches) StateTypeName() string { return "pkg/sentry/vfs.Watches" } func (w *Watches) StateFields() []string { return []string{ "ws", } } func (w *Watches) beforeSave() {} // +checklocksignore func (w *Watches) StateSave(stateSinkObject state.Sink) { w.beforeSave() stateSinkObject.Save(0, &w.ws) } func (w *Watches) afterLoad(context.Context) {} // +checklocksignore func (w *Watches) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &w.ws) } func (w *Watch) StateTypeName() string { return "pkg/sentry/vfs.Watch" } func (w *Watch) StateFields() []string { return []string{ "owner", "wd", "target", "mask", "expired", } } func (w *Watch) beforeSave() {} // +checklocksignore func (w *Watch) StateSave(stateSinkObject state.Sink) { w.beforeSave() stateSinkObject.Save(0, &w.owner) stateSinkObject.Save(1, &w.wd) stateSinkObject.Save(2, &w.target) stateSinkObject.Save(3, &w.mask) stateSinkObject.Save(4, &w.expired) } func (w *Watch) afterLoad(context.Context) {} // +checklocksignore func (w *Watch) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &w.owner) stateSourceObject.Load(1, &w.wd) stateSourceObject.Load(2, &w.target) stateSourceObject.Load(3, &w.mask) stateSourceObject.Load(4, &w.expired) } func (e *Event) StateTypeName() string { return "pkg/sentry/vfs.Event" } func (e *Event) StateFields() []string { return []string{ "eventEntry", "wd", "mask", "cookie", "len", "name", } } func (e *Event) beforeSave() {} // +checklocksignore func (e *Event) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.eventEntry) 
stateSinkObject.Save(1, &e.wd) stateSinkObject.Save(2, &e.mask) stateSinkObject.Save(3, &e.cookie) stateSinkObject.Save(4, &e.len) stateSinkObject.Save(5, &e.name) } func (e *Event) afterLoad(context.Context) {} // +checklocksignore func (e *Event) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.eventEntry) stateSourceObject.Load(1, &e.wd) stateSourceObject.Load(2, &e.mask) stateSourceObject.Load(3, &e.cookie) stateSourceObject.Load(4, &e.len) stateSourceObject.Load(5, &e.name) } func (fl *FileLocks) StateTypeName() string { return "pkg/sentry/vfs.FileLocks" } func (fl *FileLocks) StateFields() []string { return []string{ "bsd", "posix", } } func (fl *FileLocks) beforeSave() {} // +checklocksignore func (fl *FileLocks) StateSave(stateSinkObject state.Sink) { fl.beforeSave() stateSinkObject.Save(0, &fl.bsd) stateSinkObject.Save(1, &fl.posix) } func (fl *FileLocks) afterLoad(context.Context) {} // +checklocksignore func (fl *FileLocks) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fl.bsd) stateSourceObject.Load(1, &fl.posix) } func (mnt *Mount) StateTypeName() string { return "pkg/sentry/vfs.Mount" } func (mnt *Mount) StateFields() []string { return []string{ "vfs", "fs", "root", "ID", "flags", "key", "ns", "refs", "children", "isShared", "sharedEntry", "followerList", "followerEntry", "leader", "groupID", "umounted", "locked", "writers", } } func (mnt *Mount) beforeSave() {} // +checklocksignore func (mnt *Mount) StateSave(stateSinkObject state.Sink) { mnt.beforeSave() var keyValue VirtualDentry keyValue = mnt.saveKey() stateSinkObject.SaveValue(5, keyValue) stateSinkObject.Save(0, &mnt.vfs) stateSinkObject.Save(1, &mnt.fs) stateSinkObject.Save(2, &mnt.root) stateSinkObject.Save(3, &mnt.ID) stateSinkObject.Save(4, &mnt.flags) stateSinkObject.Save(6, &mnt.ns) stateSinkObject.Save(7, &mnt.refs) stateSinkObject.Save(8, &mnt.children) stateSinkObject.Save(9, &mnt.isShared) stateSinkObject.Save(10, &mnt.sharedEntry) stateSinkObject.Save(11, &mnt.followerList) stateSinkObject.Save(12, &mnt.followerEntry) stateSinkObject.Save(13, &mnt.leader) stateSinkObject.Save(14, &mnt.groupID) stateSinkObject.Save(15, &mnt.umounted) stateSinkObject.Save(16, &mnt.locked) stateSinkObject.Save(17, &mnt.writers) } // +checklocksignore func (mnt *Mount) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mnt.vfs) stateSourceObject.Load(1, &mnt.fs) stateSourceObject.Load(2, &mnt.root) stateSourceObject.Load(3, &mnt.ID) stateSourceObject.Load(4, &mnt.flags) stateSourceObject.Load(6, &mnt.ns) stateSourceObject.Load(7, &mnt.refs) stateSourceObject.Load(8, &mnt.children) stateSourceObject.Load(9, &mnt.isShared) stateSourceObject.Load(10, &mnt.sharedEntry) stateSourceObject.Load(11, &mnt.followerList) stateSourceObject.Load(12, &mnt.followerEntry) stateSourceObject.Load(13, &mnt.leader) stateSourceObject.Load(14, &mnt.groupID) stateSourceObject.Load(15, &mnt.umounted) stateSourceObject.Load(16, &mnt.locked) stateSourceObject.Load(17, &mnt.writers) stateSourceObject.LoadValue(5, new(VirtualDentry), func(y any) { mnt.loadKey(ctx, y.(VirtualDentry)) }) stateSourceObject.AfterLoad(func() { mnt.afterLoad(ctx) }) } func (u *umountRecursiveOptions) StateTypeName() string { return "pkg/sentry/vfs.umountRecursiveOptions" } func (u *umountRecursiveOptions) StateFields() []string { return []string{ "eager", "disconnectHierarchy", "propagate", } } func (u *umountRecursiveOptions) beforeSave() {} // 
+checklocksignore func (u *umountRecursiveOptions) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.eager) stateSinkObject.Save(1, &u.disconnectHierarchy) stateSinkObject.Save(2, &u.propagate) } func (u *umountRecursiveOptions) afterLoad(context.Context) {} // +checklocksignore func (u *umountRecursiveOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.eager) stateSourceObject.Load(1, &u.disconnectHierarchy) stateSourceObject.Load(2, &u.propagate) } func (l *followerList) StateTypeName() string { return "pkg/sentry/vfs.followerList" } func (l *followerList) StateFields() []string { return []string{ "head", "tail", } } func (l *followerList) beforeSave() {} // +checklocksignore func (l *followerList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *followerList) afterLoad(context.Context) {} // +checklocksignore func (l *followerList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *followerEntry) StateTypeName() string { return "pkg/sentry/vfs.followerEntry" } func (e *followerEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *followerEntry) beforeSave() {} // +checklocksignore func (e *followerEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *followerEntry) afterLoad(context.Context) {} // +checklocksignore func (e *followerEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (r *namespaceRefs) StateTypeName() string { return "pkg/sentry/vfs.namespaceRefs" } func (r *namespaceRefs) StateFields() []string { return []string{ "refCount", } } func (r *namespaceRefs) beforeSave() {} // +checklocksignore func (r *namespaceRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *namespaceRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (e *mountEntry) StateTypeName() string { return "pkg/sentry/vfs.mountEntry" } func (e *mountEntry) StateFields() []string { return []string{ "next", "prev", "container", } } func (e *mountEntry) beforeSave() {} // +checklocksignore func (e *mountEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) stateSinkObject.Save(2, &e.container) } func (e *mountEntry) afterLoad(context.Context) {} // +checklocksignore func (e *mountEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) stateSourceObject.Load(2, &e.container) } func (mntns *MountNamespace) StateTypeName() string { return "pkg/sentry/vfs.MountNamespace" } func (mntns *MountNamespace) StateFields() []string { return []string{ "Refs", "Owner", "root", "mountpoints", "mounts", "pending", } } func (mntns *MountNamespace) beforeSave() {} // +checklocksignore func (mntns *MountNamespace) StateSave(stateSinkObject state.Sink) { mntns.beforeSave() stateSinkObject.Save(0, &mntns.Refs) stateSinkObject.Save(1, &mntns.Owner) stateSinkObject.Save(2, &mntns.root) stateSinkObject.Save(3, 
&mntns.mountpoints) stateSinkObject.Save(4, &mntns.mounts) stateSinkObject.Save(5, &mntns.pending) } func (mntns *MountNamespace) afterLoad(context.Context) {} // +checklocksignore func (mntns *MountNamespace) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mntns.Refs) stateSourceObject.Load(1, &mntns.Owner) stateSourceObject.Load(2, &mntns.root) stateSourceObject.Load(3, &mntns.mountpoints) stateSourceObject.Load(4, &mntns.mounts) stateSourceObject.Load(5, &mntns.pending) } func (fd *opathFD) StateTypeName() string { return "pkg/sentry/vfs.opathFD" } func (fd *opathFD) StateFields() []string { return []string{ "vfsfd", "FileDescriptionDefaultImpl", "BadLockFD", } } func (fd *opathFD) beforeSave() {} // +checklocksignore func (fd *opathFD) StateSave(stateSinkObject state.Sink) { fd.beforeSave() stateSinkObject.Save(0, &fd.vfsfd) stateSinkObject.Save(1, &fd.FileDescriptionDefaultImpl) stateSinkObject.Save(2, &fd.BadLockFD) } func (fd *opathFD) afterLoad(context.Context) {} // +checklocksignore func (fd *opathFD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fd.vfsfd) stateSourceObject.Load(1, &fd.FileDescriptionDefaultImpl) stateSourceObject.Load(2, &fd.BadLockFD) } func (g *GetDentryOptions) StateTypeName() string { return "pkg/sentry/vfs.GetDentryOptions" } func (g *GetDentryOptions) StateFields() []string { return []string{ "CheckSearchable", } } func (g *GetDentryOptions) beforeSave() {} // +checklocksignore func (g *GetDentryOptions) StateSave(stateSinkObject state.Sink) { g.beforeSave() stateSinkObject.Save(0, &g.CheckSearchable) } func (g *GetDentryOptions) afterLoad(context.Context) {} // +checklocksignore func (g *GetDentryOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &g.CheckSearchable) } func (m *MkdirOptions) StateTypeName() string { return "pkg/sentry/vfs.MkdirOptions" } func (m *MkdirOptions) StateFields() []string { return []string{ "Mode", "ForSyntheticMountpoint", } } func (m *MkdirOptions) beforeSave() {} // +checklocksignore func (m *MkdirOptions) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.Mode) stateSinkObject.Save(1, &m.ForSyntheticMountpoint) } func (m *MkdirOptions) afterLoad(context.Context) {} // +checklocksignore func (m *MkdirOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.Mode) stateSourceObject.Load(1, &m.ForSyntheticMountpoint) } func (m *MknodOptions) StateTypeName() string { return "pkg/sentry/vfs.MknodOptions" } func (m *MknodOptions) StateFields() []string { return []string{ "Mode", "DevMajor", "DevMinor", "Endpoint", } } func (m *MknodOptions) beforeSave() {} // +checklocksignore func (m *MknodOptions) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.Mode) stateSinkObject.Save(1, &m.DevMajor) stateSinkObject.Save(2, &m.DevMinor) stateSinkObject.Save(3, &m.Endpoint) } func (m *MknodOptions) afterLoad(context.Context) {} // +checklocksignore func (m *MknodOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.Mode) stateSourceObject.Load(1, &m.DevMajor) stateSourceObject.Load(2, &m.DevMinor) stateSourceObject.Load(3, &m.Endpoint) } func (m *MountFlags) StateTypeName() string { return "pkg/sentry/vfs.MountFlags" } func (m *MountFlags) StateFields() []string { return []string{ "NoExec", "NoATime", "NoDev", "NoSUID", } } func (m *MountFlags) 
beforeSave() {} // +checklocksignore func (m *MountFlags) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.NoExec) stateSinkObject.Save(1, &m.NoATime) stateSinkObject.Save(2, &m.NoDev) stateSinkObject.Save(3, &m.NoSUID) } func (m *MountFlags) afterLoad(context.Context) {} // +checklocksignore func (m *MountFlags) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.NoExec) stateSourceObject.Load(1, &m.NoATime) stateSourceObject.Load(2, &m.NoDev) stateSourceObject.Load(3, &m.NoSUID) } func (m *MountOptions) StateTypeName() string { return "pkg/sentry/vfs.MountOptions" } func (m *MountOptions) StateFields() []string { return []string{ "Flags", "ReadOnly", "GetFilesystemOptions", "Locked", } } func (m *MountOptions) beforeSave() {} // +checklocksignore func (m *MountOptions) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.Flags) stateSinkObject.Save(1, &m.ReadOnly) stateSinkObject.Save(2, &m.GetFilesystemOptions) stateSinkObject.Save(3, &m.Locked) } func (m *MountOptions) afterLoad(context.Context) {} // +checklocksignore func (m *MountOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.Flags) stateSourceObject.Load(1, &m.ReadOnly) stateSourceObject.Load(2, &m.GetFilesystemOptions) stateSourceObject.Load(3, &m.Locked) } func (o *OpenOptions) StateTypeName() string { return "pkg/sentry/vfs.OpenOptions" } func (o *OpenOptions) StateFields() []string { return []string{ "Flags", "Mode", "FileExec", } } func (o *OpenOptions) beforeSave() {} // +checklocksignore func (o *OpenOptions) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.Flags) stateSinkObject.Save(1, &o.Mode) stateSinkObject.Save(2, &o.FileExec) } func (o *OpenOptions) afterLoad(context.Context) {} // +checklocksignore func (o *OpenOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.Flags) stateSourceObject.Load(1, &o.Mode) stateSourceObject.Load(2, &o.FileExec) } func (r *ReadOptions) StateTypeName() string { return "pkg/sentry/vfs.ReadOptions" } func (r *ReadOptions) StateFields() []string { return []string{ "Flags", } } func (r *ReadOptions) beforeSave() {} // +checklocksignore func (r *ReadOptions) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Flags) } func (r *ReadOptions) afterLoad(context.Context) {} // +checklocksignore func (r *ReadOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Flags) } func (r *RenameOptions) StateTypeName() string { return "pkg/sentry/vfs.RenameOptions" } func (r *RenameOptions) StateFields() []string { return []string{ "Flags", "MustBeDir", } } func (r *RenameOptions) beforeSave() {} // +checklocksignore func (r *RenameOptions) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Flags) stateSinkObject.Save(1, &r.MustBeDir) } func (r *RenameOptions) afterLoad(context.Context) {} // +checklocksignore func (r *RenameOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Flags) stateSourceObject.Load(1, &r.MustBeDir) } func (s *SetStatOptions) StateTypeName() string { return "pkg/sentry/vfs.SetStatOptions" } func (s *SetStatOptions) StateFields() []string { return []string{ "Stat", "NeedWritePerm", } } func (s *SetStatOptions) beforeSave() {} // +checklocksignore func (s *SetStatOptions) 
StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.Stat) stateSinkObject.Save(1, &s.NeedWritePerm) } func (s *SetStatOptions) afterLoad(context.Context) {} // +checklocksignore func (s *SetStatOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.Stat) stateSourceObject.Load(1, &s.NeedWritePerm) } func (b *BoundEndpointOptions) StateTypeName() string { return "pkg/sentry/vfs.BoundEndpointOptions" } func (b *BoundEndpointOptions) StateFields() []string { return []string{ "Addr", } } func (b *BoundEndpointOptions) beforeSave() {} // +checklocksignore func (b *BoundEndpointOptions) StateSave(stateSinkObject state.Sink) { b.beforeSave() stateSinkObject.Save(0, &b.Addr) } func (b *BoundEndpointOptions) afterLoad(context.Context) {} // +checklocksignore func (b *BoundEndpointOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &b.Addr) } func (g *GetXattrOptions) StateTypeName() string { return "pkg/sentry/vfs.GetXattrOptions" } func (g *GetXattrOptions) StateFields() []string { return []string{ "Name", "Size", } } func (g *GetXattrOptions) beforeSave() {} // +checklocksignore func (g *GetXattrOptions) StateSave(stateSinkObject state.Sink) { g.beforeSave() stateSinkObject.Save(0, &g.Name) stateSinkObject.Save(1, &g.Size) } func (g *GetXattrOptions) afterLoad(context.Context) {} // +checklocksignore func (g *GetXattrOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &g.Name) stateSourceObject.Load(1, &g.Size) } func (s *SetXattrOptions) StateTypeName() string { return "pkg/sentry/vfs.SetXattrOptions" } func (s *SetXattrOptions) StateFields() []string { return []string{ "Name", "Value", "Flags", } } func (s *SetXattrOptions) beforeSave() {} // +checklocksignore func (s *SetXattrOptions) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.Name) stateSinkObject.Save(1, &s.Value) stateSinkObject.Save(2, &s.Flags) } func (s *SetXattrOptions) afterLoad(context.Context) {} // +checklocksignore func (s *SetXattrOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.Name) stateSourceObject.Load(1, &s.Value) stateSourceObject.Load(2, &s.Flags) } func (s *StatOptions) StateTypeName() string { return "pkg/sentry/vfs.StatOptions" } func (s *StatOptions) StateFields() []string { return []string{ "Mask", "Sync", } } func (s *StatOptions) beforeSave() {} // +checklocksignore func (s *StatOptions) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.Mask) stateSinkObject.Save(1, &s.Sync) } func (s *StatOptions) afterLoad(context.Context) {} // +checklocksignore func (s *StatOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.Mask) stateSourceObject.Load(1, &s.Sync) } func (u *UmountOptions) StateTypeName() string { return "pkg/sentry/vfs.UmountOptions" } func (u *UmountOptions) StateFields() []string { return []string{ "Flags", } } func (u *UmountOptions) beforeSave() {} // +checklocksignore func (u *UmountOptions) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.Flags) } func (u *UmountOptions) afterLoad(context.Context) {} // +checklocksignore func (u *UmountOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.Flags) } func (w *WriteOptions) StateTypeName() string { return "pkg/sentry/vfs.WriteOptions" 
} func (w *WriteOptions) StateFields() []string { return []string{ "Flags", } } func (w *WriteOptions) beforeSave() {} // +checklocksignore func (w *WriteOptions) StateSave(stateSinkObject state.Sink) { w.beforeSave() stateSinkObject.Save(0, &w.Flags) } func (w *WriteOptions) afterLoad(context.Context) {} // +checklocksignore func (w *WriteOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &w.Flags) } func (a *AccessTypes) StateTypeName() string { return "pkg/sentry/vfs.AccessTypes" } func (a *AccessTypes) StateFields() []string { return nil } func (rp *ResolvingPath) StateTypeName() string { return "pkg/sentry/vfs.ResolvingPath" } func (rp *ResolvingPath) StateFields() []string { return []string{ "vfs", "root", "mount", "start", "pit", "flags", "mustBeDir", "symlinks", "curPart", "creds", "nextMount", "nextStart", "absSymlinkTarget", "parts", } } func (rp *ResolvingPath) beforeSave() {} // +checklocksignore func (rp *ResolvingPath) StateSave(stateSinkObject state.Sink) { rp.beforeSave() stateSinkObject.Save(0, &rp.vfs) stateSinkObject.Save(1, &rp.root) stateSinkObject.Save(2, &rp.mount) stateSinkObject.Save(3, &rp.start) stateSinkObject.Save(4, &rp.pit) stateSinkObject.Save(5, &rp.flags) stateSinkObject.Save(6, &rp.mustBeDir) stateSinkObject.Save(7, &rp.symlinks) stateSinkObject.Save(8, &rp.curPart) stateSinkObject.Save(9, &rp.creds) stateSinkObject.Save(10, &rp.nextMount) stateSinkObject.Save(11, &rp.nextStart) stateSinkObject.Save(12, &rp.absSymlinkTarget) stateSinkObject.Save(13, &rp.parts) } func (rp *ResolvingPath) afterLoad(context.Context) {} // +checklocksignore func (rp *ResolvingPath) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rp.vfs) stateSourceObject.Load(1, &rp.root) stateSourceObject.Load(2, &rp.mount) stateSourceObject.Load(3, &rp.start) stateSourceObject.Load(4, &rp.pit) stateSourceObject.Load(5, &rp.flags) stateSourceObject.Load(6, &rp.mustBeDir) stateSourceObject.Load(7, &rp.symlinks) stateSourceObject.Load(8, &rp.curPart) stateSourceObject.Load(9, &rp.creds) stateSourceObject.Load(10, &rp.nextMount) stateSourceObject.Load(11, &rp.nextStart) stateSourceObject.Load(12, &rp.absSymlinkTarget) stateSourceObject.Load(13, &rp.parts) } func (r *resolveMountRootOrJumpError) StateTypeName() string { return "pkg/sentry/vfs.resolveMountRootOrJumpError" } func (r *resolveMountRootOrJumpError) StateFields() []string { return []string{} } func (r *resolveMountRootOrJumpError) beforeSave() {} // +checklocksignore func (r *resolveMountRootOrJumpError) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r *resolveMountRootOrJumpError) afterLoad(context.Context) {} // +checklocksignore func (r *resolveMountRootOrJumpError) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *resolveMountPointError) StateTypeName() string { return "pkg/sentry/vfs.resolveMountPointError" } func (r *resolveMountPointError) StateFields() []string { return []string{} } func (r *resolveMountPointError) beforeSave() {} // +checklocksignore func (r *resolveMountPointError) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r *resolveMountPointError) afterLoad(context.Context) {} // +checklocksignore func (r *resolveMountPointError) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (r *resolveAbsSymlinkError) StateTypeName() string { return "pkg/sentry/vfs.resolveAbsSymlinkError" } func (r *resolveAbsSymlinkError) StateFields() []string { return 
[]string{} } func (r *resolveAbsSymlinkError) beforeSave() {} // +checklocksignore func (r *resolveAbsSymlinkError) StateSave(stateSinkObject state.Sink) { r.beforeSave() } func (r *resolveAbsSymlinkError) afterLoad(context.Context) {} // +checklocksignore func (r *resolveAbsSymlinkError) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (f *RestoreID) StateTypeName() string { return "pkg/sentry/vfs.RestoreID" } func (f *RestoreID) StateFields() []string { return []string{ "ContainerName", "Path", } } func (f *RestoreID) beforeSave() {} // +checklocksignore func (f *RestoreID) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.ContainerName) stateSinkObject.Save(1, &f.Path) } func (f *RestoreID) afterLoad(context.Context) {} // +checklocksignore func (f *RestoreID) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.ContainerName) stateSourceObject.Load(1, &f.Path) } func (vfs *VirtualFilesystem) StateTypeName() string { return "pkg/sentry/vfs.VirtualFilesystem" } func (vfs *VirtualFilesystem) StateFields() []string { return []string{ "mounts", "mountpoints", "lastMountID", "anonMount", "devices", "dynCharDevMajorUsed", "anonBlockDevMinorNext", "anonBlockDevMinor", "fsTypes", "filesystems", "groupIDBitmap", "mountPromises", "toDecRef", } } func (vfs *VirtualFilesystem) beforeSave() {} // +checklocksignore func (vfs *VirtualFilesystem) StateSave(stateSinkObject state.Sink) { vfs.beforeSave() var mountsValue []*Mount mountsValue = vfs.saveMounts() stateSinkObject.SaveValue(0, mountsValue) var mountPromisesValue map[VirtualDentry]*mountPromise mountPromisesValue = vfs.saveMountPromises() stateSinkObject.SaveValue(11, mountPromisesValue) stateSinkObject.Save(1, &vfs.mountpoints) stateSinkObject.Save(2, &vfs.lastMountID) stateSinkObject.Save(3, &vfs.anonMount) stateSinkObject.Save(4, &vfs.devices) stateSinkObject.Save(5, &vfs.dynCharDevMajorUsed) stateSinkObject.Save(6, &vfs.anonBlockDevMinorNext) stateSinkObject.Save(7, &vfs.anonBlockDevMinor) stateSinkObject.Save(8, &vfs.fsTypes) stateSinkObject.Save(9, &vfs.filesystems) stateSinkObject.Save(10, &vfs.groupIDBitmap) stateSinkObject.Save(12, &vfs.toDecRef) } func (vfs *VirtualFilesystem) afterLoad(context.Context) {} // +checklocksignore func (vfs *VirtualFilesystem) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(1, &vfs.mountpoints) stateSourceObject.Load(2, &vfs.lastMountID) stateSourceObject.Load(3, &vfs.anonMount) stateSourceObject.Load(4, &vfs.devices) stateSourceObject.Load(5, &vfs.dynCharDevMajorUsed) stateSourceObject.Load(6, &vfs.anonBlockDevMinorNext) stateSourceObject.Load(7, &vfs.anonBlockDevMinor) stateSourceObject.Load(8, &vfs.fsTypes) stateSourceObject.Load(9, &vfs.filesystems) stateSourceObject.Load(10, &vfs.groupIDBitmap) stateSourceObject.Load(12, &vfs.toDecRef) stateSourceObject.LoadValue(0, new([]*Mount), func(y any) { vfs.loadMounts(ctx, y.([]*Mount)) }) stateSourceObject.LoadValue(11, new(map[VirtualDentry]*mountPromise), func(y any) { vfs.loadMountPromises(ctx, y.(map[VirtualDentry]*mountPromise)) }) } func (p *PathOperation) StateTypeName() string { return "pkg/sentry/vfs.PathOperation" } func (p *PathOperation) StateFields() []string { return []string{ "Root", "Start", "Path", "FollowFinalSymlink", } } func (p *PathOperation) beforeSave() {} // +checklocksignore func (p *PathOperation) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.Root) 
stateSinkObject.Save(1, &p.Start) stateSinkObject.Save(2, &p.Path) stateSinkObject.Save(3, &p.FollowFinalSymlink) } func (p *PathOperation) afterLoad(context.Context) {} // +checklocksignore func (p *PathOperation) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.Root) stateSourceObject.Load(1, &p.Start) stateSourceObject.Load(2, &p.Path) stateSourceObject.Load(3, &p.FollowFinalSymlink) } func (vd *VirtualDentry) StateTypeName() string { return "pkg/sentry/vfs.VirtualDentry" } func (vd *VirtualDentry) StateFields() []string { return []string{ "mount", "dentry", } } func (vd *VirtualDentry) beforeSave() {} // +checklocksignore func (vd *VirtualDentry) StateSave(stateSinkObject state.Sink) { vd.beforeSave() stateSinkObject.Save(0, &vd.mount) stateSinkObject.Save(1, &vd.dentry) } func (vd *VirtualDentry) afterLoad(context.Context) {} // +checklocksignore func (vd *VirtualDentry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &vd.mount) stateSourceObject.Load(1, &vd.dentry) } func init() { state.Register((*anonFilesystemType)(nil)) state.Register((*anonFilesystem)(nil)) state.Register((*anonDentry)(nil)) state.Register((*Dentry)(nil)) state.Register((*DeviceKind)(nil)) state.Register((*devTuple)(nil)) state.Register((*registeredDevice)(nil)) state.Register((*RegisterDeviceOptions)(nil)) state.Register((*EpollInstance)(nil)) state.Register((*epollInterestKey)(nil)) state.Register((*epollInterest)(nil)) state.Register((*epollInterestList)(nil)) state.Register((*epollInterestEntry)(nil)) state.Register((*eventList)(nil)) state.Register((*eventEntry)(nil)) state.Register((*FileDescription)(nil)) state.Register((*FileDescriptionOptions)(nil)) state.Register((*Dirent)(nil)) state.Register((*FileDescriptionDefaultImpl)(nil)) state.Register((*DirectoryFileDescriptionDefaultImpl)(nil)) state.Register((*DentryMetadataFileDescriptionImpl)(nil)) state.Register((*StaticData)(nil)) state.Register((*DynamicBytesFileDescriptionImpl)(nil)) state.Register((*LockFD)(nil)) state.Register((*NoLockFD)(nil)) state.Register((*BadLockFD)(nil)) state.Register((*FileDescriptionRefs)(nil)) state.Register((*Filesystem)(nil)) state.Register((*PrependPathAtVFSRootError)(nil)) state.Register((*PrependPathAtNonMountRootError)(nil)) state.Register((*PrependPathSyntheticError)(nil)) state.Register((*FilesystemRefs)(nil)) state.Register((*registeredFilesystemType)(nil)) state.Register((*RegisterFilesystemTypeOptions)(nil)) state.Register((*EventType)(nil)) state.Register((*Inotify)(nil)) state.Register((*Watches)(nil)) state.Register((*Watch)(nil)) state.Register((*Event)(nil)) state.Register((*FileLocks)(nil)) state.Register((*Mount)(nil)) state.Register((*umountRecursiveOptions)(nil)) state.Register((*followerList)(nil)) state.Register((*followerEntry)(nil)) state.Register((*namespaceRefs)(nil)) state.Register((*mountEntry)(nil)) state.Register((*MountNamespace)(nil)) state.Register((*opathFD)(nil)) state.Register((*GetDentryOptions)(nil)) state.Register((*MkdirOptions)(nil)) state.Register((*MknodOptions)(nil)) state.Register((*MountFlags)(nil)) state.Register((*MountOptions)(nil)) state.Register((*OpenOptions)(nil)) state.Register((*ReadOptions)(nil)) state.Register((*RenameOptions)(nil)) state.Register((*SetStatOptions)(nil)) state.Register((*BoundEndpointOptions)(nil)) state.Register((*GetXattrOptions)(nil)) state.Register((*SetXattrOptions)(nil)) state.Register((*StatOptions)(nil)) state.Register((*UmountOptions)(nil)) 
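// Illustrative sketch (hypothetical type, not generated by stateify): a type becomes savable by
// providing the same method set seen throughout this file and by being registered exactly like
// the entries in this init function. Assuming a struct `counter` with a single field `n`:
//
//	func (c *counter) StateTypeName() string { return "pkg/sentry/vfs.counter" }
//	func (c *counter) StateFields() []string { return []string{"n"} }
//	func (c *counter) beforeSave() {}
//	func (c *counter) StateSave(s state.Sink) { c.beforeSave(); s.Save(0, &c.n) }
//	func (c *counter) afterLoad(context.Context) {}
//	func (c *counter) StateLoad(ctx context.Context, s state.Source) { s.Load(0, &c.n) }
//	func init() { state.Register((*counter)(nil)) }
//
// In practice the stateify tool emits all of this automatically, as it did for this file.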
state.Register((*WriteOptions)(nil)) state.Register((*AccessTypes)(nil)) state.Register((*ResolvingPath)(nil)) state.Register((*resolveMountRootOrJumpError)(nil)) state.Register((*resolveMountPointError)(nil)) state.Register((*resolveAbsSymlinkError)(nil)) state.Register((*RestoreID)(nil)) state.Register((*VirtualFilesystem)(nil)) state.Register((*PathOperation)(nil)) state.Register((*VirtualDentry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/vfs_testonly_state_autogen.go000066400000000000000000000001551465435605700275570ustar00rootroot00000000000000// automatically generated by stateify. //go:build check_invariants // +build check_invariants package vfs golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/vfs_unsafe_state_autogen.go000066400000000000000000000000651465435605700271570ustar00rootroot00000000000000// automatically generated by stateify. package vfs golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/vfs/virtual_filesystem_mutex.go000066400000000000000000000034561465435605700272610ustar00rootroot00000000000000package vfs import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type virtualFilesystemMutex struct { mu sync.Mutex } var virtualFilesystemprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var virtualFilesystemlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type virtualFilesystemlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *virtualFilesystemMutex) Lock() { locking.AddGLock(virtualFilesystemprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *virtualFilesystemMutex) NestedLock(i virtualFilesystemlockNameIndex) { locking.AddGLock(virtualFilesystemprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *virtualFilesystemMutex) Unlock() { locking.DelGLock(virtualFilesystemprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *virtualFilesystemMutex) NestedUnlock(i virtualFilesystemlockNameIndex) { locking.DelGLock(virtualFilesystemprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func virtualFilesysteminitLockNames() {} func init() { virtualFilesysteminitLockNames() virtualFilesystemprefixIndex = locking.NewMutexClass(reflect.TypeOf(virtualFilesystemMutex{}), virtualFilesystemlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/watchdog/000077500000000000000000000000001465435605700225505ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/watchdog/watchdog.go000066400000000000000000000270561465435605700247110ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package watchdog is responsible for monitoring the sentry for tasks that may // potentially be stuck or looping indeterminately, causing hard-to-debug hangs in // the untrusted app. // // It works by periodically querying all tasks to check whether they are in user // mode (RunUser), kernel mode (RunSys), or blocked in the kernel (OffCPU). Tasks // that have been running in kernel mode for a long time in the same syscall // without blocking are considered stuck and are reported. // // When a stuck task is detected, the watchdog can take one of the following actions: // 1. LogWarning: Logs a warning message followed by a stack dump of all goroutines. // If a task continues to be stuck, the message will repeat every minute, unless // a new stuck task is detected // 2. Panic: same as above, followed by panic() package watchdog import ( "bytes" "fmt" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sync" ) // Opts configures the watchdog. type Opts struct { // TaskTimeout is the amount of time to allow a task to execute the // same syscall without blocking before it's declared stuck. TaskTimeout time.Duration // TaskTimeoutAction indicates what action to take when a stuck task // is detected. TaskTimeoutAction Action // StartupTimeout is the amount of time to allow between watchdog // creation and calling watchdog.Start. StartupTimeout time.Duration // StartupTimeoutAction indicates what action to take when // watchdog.Start is not called within the timeout. StartupTimeoutAction Action } // DefaultOpts is a default set of options for the watchdog. var DefaultOpts = Opts{ // Task timeout. TaskTimeout: 3 * time.Minute, TaskTimeoutAction: LogWarning, // Startup timeout. StartupTimeout: 30 * time.Second, StartupTimeoutAction: LogWarning, } // descheduleThreshold is the amount of time scheduling needs to be off before the entire wait period // is discounted from task's last update time. It's set high enough that small scheduling delays won't // trigger it. const descheduleThreshold = 1 * time.Second // Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck. var stackDumpSameTaskPeriod = time.Minute // Action defines what action to take when a stuck task is detected. type Action int const ( // LogWarning logs a warning message followed by a stack trace. LogWarning Action = iota // Panic will do the same logging as LogWarning and panic(). Panic ) // Set implements flag.Value. func (a *Action) Set(v string) error { switch v { case "log", "logwarning": *a = LogWarning case "panic": *a = Panic default: return fmt.Errorf("invalid watchdog action %q", v) } return nil } // Get implements flag.Value. func (a *Action) Get() any { return *a } // String returns Action's string representation.
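// Illustrative sketch (caller's view, not part of this package): the watchdog is typically built
// from these options and started once the kernel is ready. `k` is assumed to be an initialized
// *kernel.Kernel and the option values below are arbitrary:
//
//	opts := watchdog.DefaultOpts
//	opts.TaskTimeoutAction = watchdog.Panic
//	w := watchdog.New(k, opts)
//	w.Start()
//	defer w.Stop()
//
// Because *Action implements flag.Value, it can also be wired to a command-line flag, e.g.
// flag.Var(&opts.TaskTimeoutAction, "watchdog-action", "log or panic") (flag name hypothetical).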
func (a Action) String() string { switch a { case LogWarning: return "logWarning" case Panic: return "panic" default: panic(fmt.Sprintf("Invalid watchdog action: %d", a)) } } // Watchdog is the main watchdog class. It controls a goroutine that periodically // analyses all tasks and reports if any of them appear to be stuck. type Watchdog struct { // Configuration options are embedded. Opts // period indicates how often to check all tasks. It's calculated based on // opts.TaskTimeout. period time.Duration // k is where the tasks come from. k *kernel.Kernel // stop is used to notify the watchdog that it should stop. stop chan struct{} // done is used to notify when the watchdog has stopped. done chan struct{} // offenders map contains all tasks that are currently stuck. offenders map[*kernel.Task]*offender // lastStackDump tracks the last time a stack dump was generated to prevent // spamming the log. lastStackDump time.Time // lastRun is set to the last time the watchdog executed a monitoring loop. lastRun ktime.Time // mu protects the fields below. mu sync.Mutex // running is true if the watchdog is running. running bool // startCalled is true if Start has ever been called. It remains true // even if Stop is called. startCalled bool } type offender struct { lastUpdateTime ktime.Time } // New creates a new watchdog. func New(k *kernel.Kernel, opts Opts) *Watchdog { // 4 is arbitrary, just don't want to prolong 'TaskTimeout' too much. period := opts.TaskTimeout / 4 w := &Watchdog{ Opts: opts, k: k, period: period, offenders: make(map[*kernel.Task]*offender), stop: make(chan struct{}), done: make(chan struct{}), } // Handle StartupTimeout if it exists. if w.StartupTimeout > 0 { log.Infof("Watchdog waiting %v for startup", w.StartupTimeout) go w.waitForStart() // S/R-SAFE: watchdog is stopped during save and restarted after restore. } return w } // Start starts the watchdog. func (w *Watchdog) Start() { w.mu.Lock() defer w.mu.Unlock() w.startCalled = true if w.running { return } if w.TaskTimeout == 0 { log.Infof("Watchdog task timeout disabled") return } w.lastRun = w.k.MonotonicClock().Now() log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.TaskTimeout, w.TaskTimeoutAction) go w.loop() // S/R-SAFE: watchdog is stopped during save and restarted after restore. w.running = true } // Stop requests the watchdog to stop and waits for it. func (w *Watchdog) Stop() { if w.TaskTimeout == 0 { return } w.mu.Lock() defer w.mu.Unlock() if !w.running { return } log.Infof("Stopping watchdog") w.stop <- struct{}{} <-w.done w.running = false log.Infof("Watchdog stopped") } // waitForStart waits for Start to be called and takes action if it does not // happen within the startup timeout. func (w *Watchdog) waitForStart() { <-time.After(w.StartupTimeout) w.mu.Lock() defer w.mu.Unlock() if w.startCalled { // We are fine. return } metric.WeirdnessMetric.Increment(&metric.WeirdnessTypeWatchdogStuckStartup) var buf bytes.Buffer buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout)) w.doAction(w.StartupTimeoutAction, false, &buf) } // loop is the main watchdog routine. It only returns when 'Stop()' is called. func (w *Watchdog) loop() { // Loop until someone stops it. for { select { case <-w.stop: w.done <- struct{}{} return case <-time.After(w.period): w.runTurn() } } } // runTurn runs a single pass over all tasks and reports anything it finds. func (w *Watchdog) runTurn() { // Someone needs to watch the watchdog.
The call below can get stuck if there // is a deadlock affecting root's PID namespace mutex. Run it in a goroutine // and report if it takes too long to return. var tasks []*kernel.Task done := make(chan struct{}) go func() { // S/R-SAFE: watchdog is stopped and restarted during S/R. tasks = w.k.TaskSet().Root.Tasks() close(done) }() select { case <-done: case <-time.After(w.TaskTimeout): // Report if the watchdog is not making progress. // No one is watching the watchdog watcher though. w.reportStuckWatchdog() <-done } newOffenders := make(map[*kernel.Task]*offender) newTaskFound := false now := ktime.FromNanoseconds(int64(w.k.CPUClockNow() * uint64(linux.ClockTick))) // The process may be running with a low CPU limit, making tasks appear stuck because // they are starved of CPU cycles. An estimate is that tasks could have been starved // since the last time the watchdog ran. If the watchdog detects that scheduling // is off, it will discount the entire duration since last run from 'lastUpdateTime'. discount := time.Duration(0) if now.Sub(w.lastRun.Add(w.period)) > descheduleThreshold { discount = now.Sub(w.lastRun) } w.lastRun = now log.Infof("Watchdog starting loop, tasks: %d, discount: %v", len(tasks), discount) for _, t := range tasks { tsched := t.TaskGoroutineSchedInfo() // An offender is a task running inside the kernel for longer than the specified timeout. if tsched.State == kernel.TaskGoroutineRunningSys { lastUpdateTime := ktime.FromNanoseconds(int64(tsched.Timestamp * uint64(linux.ClockTick))) elapsed := now.Sub(lastUpdateTime) - discount if elapsed > w.TaskTimeout { tc, ok := w.offenders[t] if !ok { // New stuck task detected. // // Note that tasks blocked doing IO may be considered stuck in kernel, // unless they are surrounded by // Task.UninterruptibleSleepStart/Finish. tc = &offender{lastUpdateTime: lastUpdateTime} metric.WeirdnessMetric.Increment(&metric.WeirdnessTypeWatchdogStuckTasks) newTaskFound = true } newOffenders[t] = tc } } } if len(newOffenders) > 0 { w.report(newOffenders, newTaskFound, now) } // Remember which tasks have been reported. w.offenders = newOffenders } // report takes appropriate action when a stuck task is detected. func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound bool, now ktime.Time) { var buf bytes.Buffer buf.WriteString(fmt.Sprintf("Sentry detected %d stuck task(s):\n", len(offenders))) for t, o := range offenders { tid := w.k.TaskSet().Root.IDOfTask(t) buf.WriteString(fmt.Sprintf("\tTask tid: %v (goroutine %d), entered RunSys state %v ago.\n", tid, t.GoroutineID(), now.Sub(o.lastUpdateTime))) } buf.WriteString("Search for 'goroutine ' in the stack dump to find the offending goroutine(s)") // Force stack dump only if a new task is detected. w.doAction(w.TaskTimeoutAction, newTaskFound, &buf) } func (w *Watchdog) reportStuckWatchdog() { var buf bytes.Buffer buf.WriteString("Watchdog goroutine is stuck") w.doAction(w.TaskTimeoutAction, false, &buf) } // doAction will take the given action. If the action is LogWarning, the stack // is not always dumped to the log to prevent log flooding. "forceStack" // guarantees that the stack will be dumped regardless. func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) { switch action { case LogWarning: // Dump stack only if forced or some time has passed since the last time a // stack dump was generated.
if !forceStack && time.Since(w.lastStackDump) < stackDumpSameTaskPeriod { msg.WriteString("\n...[stack dump skipped]...") log.Warningf(msg.String()) return } log.TracebackAll(msg.String()) w.lastStackDump = time.Now() case Panic: // Panic will skip over running tasks, which is likely the culprit here. So manually // dump all stacks before panic'ing. log.TracebackAll(msg.String()) // Attempt to flush metrics, timeout and move on in case metrics are stuck as well. metricsEmitted := make(chan struct{}, 1) go func() { // S/R-SAFE: watchdog is stopped during save and restarted after restore. // Flush metrics before killing process. metric.EmitMetricUpdate() metricsEmitted <- struct{}{} }() select { case <-metricsEmitted: case <-time.After(1 * time.Second): } panic(fmt.Sprintf("%s\nStack for running G's are skipped while panicking.", msg.String())) default: panic(fmt.Sprintf("Unknown watchdog action %v", action)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sentry/watchdog/watchdog_state_autogen.go000066400000000000000000000000721465435605700276200ustar00rootroot00000000000000// automatically generated by stateify. package watchdog golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/000077500000000000000000000000001465435605700203645ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/extension/000077500000000000000000000000001465435605700224005ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/extension/extension.go000066400000000000000000000035751465435605700247550ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package extension provides an extension to the shim. package extension import ( "context" "github.com/containerd/containerd/pkg/process" "github.com/containerd/containerd/runtime/v2/task" ) // NewExtension registers an extension constructor. It may return nil, nil to indicate that the // extension should not handle this task request. Returning an error will fail the task request. var NewExtension func(ctx context.Context, next TaskServiceExt, req *task.CreateTaskRequest) (TaskServiceExt, error) // RestoreRequest is a request to restore a container. It extends // task.StartRequest with restore functionality. type RestoreRequest struct { Start task.StartRequest Conf RestoreConfig } // Process extends process.Process with extra restore functionality. type Process interface { process.Process // Restore restores the container from a snapshot. Restore(context.Context, *RestoreConfig) error } // RestoreConfig is the configuration for a restore request. type RestoreConfig struct { ImagePath string Direct bool } // TaskServiceExt extends TaskRequest with extra functionality required by the shim. 
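// Illustrative sketch (hypothetical wrapper, not part of this package): an extension can assign
// NewExtension, for example from an init function, and wrap the `next` service. Returning
// (nil, nil) lets the regular shim implementation handle the request. `shouldHandle` and
// `myExtension` are hypothetical names:
//
//	func init() {
//		extension.NewExtension = func(ctx context.Context, next extension.TaskServiceExt, req *task.CreateTaskRequest) (extension.TaskServiceExt, error) {
//			if !shouldHandle(req) {
//				return nil, nil // Fall back to the default task service.
//			}
//			return &myExtension{TaskServiceExt: next}, nil // myExtension embeds TaskServiceExt.
//		}
//	}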
type TaskServiceExt interface { task.TaskService Cleanup(ctx context.Context) (*task.DeleteResponse, error) Restore(ctx context.Context, req *RestoreRequest) (*task.StartResponse, error) } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/extension/extension_state_autogen.go000066400000000000000000000000731465435605700276650ustar00rootroot00000000000000// automatically generated by stateify. package extension golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/proc/000077500000000000000000000000001465435605700213275ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/proc/deleted_state.go000066400000000000000000000035251465435605700244710ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "context" "fmt" "github.com/containerd/console" "github.com/containerd/containerd/errdefs" runc "github.com/containerd/go-runc" "gvisor.dev/gvisor/pkg/shim/extension" ) type deletedState struct{} func (*deletedState) Resize(console.WinSize) error { return fmt.Errorf("cannot resize a deleted container/process") } func (*deletedState) Start(context.Context, *extension.RestoreConfig) error { return fmt.Errorf("cannot start a deleted container/process") } func (*deletedState) Delete(context.Context) error { return fmt.Errorf("cannot delete a deleted container/process: %w", errdefs.ErrNotFound) } func (*deletedState) Kill(_ context.Context, signal uint32, _ bool) error { return handleStoppedKill(signal) } func (*deletedState) SetExited(int) {} func (*deletedState) Exec(context.Context, string, *ExecConfig) (extension.Process, error) { return nil, fmt.Errorf("cannot exec in a deleted state") } func (s *deletedState) State(context.Context) (string, error) { // There is no "deleted" state, closest one is stopped. return "stopped", nil } func (s *deletedState) Stats(context.Context, string) (*runc.Stats, error) { return nil, fmt.Errorf("cannot stat a stopped container/process") } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/proc/exec.go000066400000000000000000000162231465435605700226060ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package proc import ( "context" "fmt" "io" "os" "path/filepath" "sync" "time" "github.com/containerd/console" "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/log" "github.com/containerd/containerd/pkg/stdio" "github.com/containerd/fifo" runc "github.com/containerd/go-runc" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/shim/extension" "gvisor.dev/gvisor/pkg/shim/runsccmd" ) type execProcess struct { wg sync.WaitGroup execState execState mu sync.Mutex id string console console.Console io runc.IO status int exited time.Time pid int internalPid int closers []io.Closer stdin io.Closer stdio stdio.Stdio path string spec specs.Process parent *Init waitBlock chan struct{} } func (e *execProcess) Wait() { <-e.waitBlock } func (e *execProcess) ID() string { return e.id } func (e *execProcess) Pid() int { e.mu.Lock() defer e.mu.Unlock() return e.pid } func (e *execProcess) ExitStatus() int { e.mu.Lock() defer e.mu.Unlock() return e.status } func (e *execProcess) ExitedAt() time.Time { e.mu.Lock() defer e.mu.Unlock() return e.exited } func (e *execProcess) SetExited(status int) { e.mu.Lock() defer e.mu.Unlock() e.execState.SetExited(status) } func (e *execProcess) setExited(status int) { if !e.exited.IsZero() { log.L.Debugf("Exec: status already set to %d, ignoring status: %d", e.status, status) return } log.L.Debugf("Exec: setting status: %d", status) e.status = status e.exited = time.Now() e.parent.Platform.ShutdownConsole(context.Background(), e.console) close(e.waitBlock) } func (e *execProcess) Delete(ctx context.Context) error { e.mu.Lock() defer e.mu.Unlock() return e.execState.Delete(ctx) } func (e *execProcess) delete() { e.wg.Wait() if e.io != nil { for _, c := range e.closers { c.Close() } e.io.Close() } } func (e *execProcess) Resize(ws console.WinSize) error { e.mu.Lock() defer e.mu.Unlock() return e.execState.Resize(ws) } func (e *execProcess) resize(ws console.WinSize) error { if e.console == nil { return nil } return e.console.Resize(ws) } func (e *execProcess) Kill(ctx context.Context, sig uint32, _ bool) error { e.mu.Lock() defer e.mu.Unlock() return e.execState.Kill(ctx, sig, false) } func (e *execProcess) kill(ctx context.Context, sig uint32, _ bool) error { internalPid := e.internalPid if internalPid == 0 { return nil } opts := runsccmd.KillOpts{Pid: internalPid} if err := e.parent.runtime.Kill(ctx, e.parent.id, int(sig), &opts); err != nil { return fmt.Errorf("%s: %w", err.Error(), errdefs.ErrNotFound) } return nil } func (e *execProcess) Stdin() io.Closer { return e.stdin } func (e *execProcess) Stdio() stdio.Stdio { return e.stdio } func (e *execProcess) Start(ctx context.Context) error { e.mu.Lock() defer e.mu.Unlock() return e.execState.Start(ctx, nil /* restoreConf */) } func (e *execProcess) start(ctx context.Context) error { var socket *runc.Socket switch { case e.stdio.Terminal: s, err := runc.NewTempConsoleSocket() if err != nil { return fmt.Errorf("failed to create runc console socket: %w", err) } defer s.Close() socket = s case e.stdio.IsNull(): io, err := runc.NewNullIO() if err != nil { return fmt.Errorf("creating new NULL IO: %w", err) } e.io = io default: io, err := runc.NewPipeIO(e.parent.IoUID, e.parent.IoGID, withConditionalIO(e.stdio)) if err != nil { return fmt.Errorf("failed to create runc io pipes: %w", err) } e.io = io } opts := &runsccmd.ExecOpts{ PidFile: filepath.Join(e.path, fmt.Sprintf("%s.pid", e.id)), InternalPidFile: 
filepath.Join(e.path, fmt.Sprintf("%s-internal.pid", e.id)), IO: e.io, Detach: true, } defer func() { _ = os.Remove(opts.PidFile) _ = os.Remove(opts.InternalPidFile) }() if socket != nil { opts.ConsoleSocket = socket } eventCh := e.parent.Monitor.Subscribe() cu := cleanup.Make(func() { e.parent.Monitor.Unsubscribe(eventCh) }) defer cu.Clean() if err := e.parent.runtime.Exec(ctx, e.parent.id, e.spec, opts); err != nil { close(e.waitBlock) return e.parent.runtimeError(err, "OCI runtime exec failed") } if e.stdio.Stdin != "" { sc, err := fifo.OpenFifo(context.Background(), e.stdio.Stdin, unix.O_WRONLY|unix.O_NONBLOCK, 0) if err != nil { return fmt.Errorf("failed to open stdin fifo %s: %w", e.stdio.Stdin, err) } e.closers = append(e.closers, sc) e.stdin = sc } ctx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() if socket != nil { console, err := socket.ReceiveMaster() if err != nil { return fmt.Errorf("failed to retrieve console master: %w", err) } if e.console, err = e.parent.Platform.CopyConsole(ctx, console, e.stdio.Stdin, e.stdio.Stdout, e.stdio.Stderr, &e.wg); err != nil { return fmt.Errorf("failed to start console copy: %w", err) } } else if !e.stdio.IsNull() { if err := copyPipes(ctx, e.io, e.stdio.Stdin, e.stdio.Stdout, e.stdio.Stderr, &e.wg); err != nil { return fmt.Errorf("failed to start io pipe copy: %w", err) } } pid, err := runc.ReadPidFile(opts.PidFile) if err != nil { return fmt.Errorf("failed to retrieve OCI runtime exec pid: %w", err) } e.pid = pid internalPid, err := runc.ReadPidFile(opts.InternalPidFile) if err != nil { return fmt.Errorf("failed to retrieve OCI runtime exec internal pid: %w", err) } e.internalPid = internalPid go func() { defer e.parent.Monitor.Unsubscribe(eventCh) for event := range eventCh { if event.Pid == e.pid { ExitCh <- Exit{ Timestamp: event.Timestamp, ID: e.id, Status: event.Status, } break } } }() cu.Release() // cancel cleanup on success. return nil } func (e *execProcess) Restore(context.Context, *extension.RestoreConfig) error { return fmt.Errorf("cannot restore an exec'd process") } func (e *execProcess) Status(context.Context) (string, error) { e.mu.Lock() defer e.mu.Unlock() // if we don't have a pid then the exec process has just been created if e.pid == 0 { return "created", nil } // This checks that `runsc exec` process is still running. This process has // the same lifetime as the process executing inside the container. So instead // of calling `runsc kill --pid`, just do a quick check that `runsc exec` is // still running. if err := unix.Kill(e.pid, 0); err != nil { // Can't signal the process, it must have exited. return "stopped", nil } return "running", nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/proc/exec_state.go000066400000000000000000000075341465435605700240130ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
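// Illustrative note (not part of the original file): execProcess.Status above relies on the
// standard signal-0 probe, where kill(pid, 0) performs existence and permission checks without
// delivering a signal. A hypothetical helper showing the idiom:
//
//	func pidAlive(pid int) bool {
//		// Signal 0 only validates that the PID can be signaled; nothing is delivered.
//		return unix.Kill(pid, 0) == nil
//	}
//
// Treating any error as "stopped" is reasonable here because the shim owns the `runsc exec`
// process, whose lifetime mirrors the process running inside the container.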
package proc import ( "context" "fmt" "github.com/containerd/console" "gvisor.dev/gvisor/pkg/shim/extension" ) type execState interface { Resize(console.WinSize) error Start(context.Context, *extension.RestoreConfig) error Delete(context.Context) error Kill(context.Context, uint32, bool) error SetExited(int) } type execCreatedState struct { p *execProcess } func (s *execCreatedState) name() string { return "created" } func (s *execCreatedState) transition(transition stateTransition) { switch transition { case running: s.p.execState = &execRunningState{p: s.p} case stopped: s.p.execState = &execStoppedState{p: s.p} case deleted: s.p.execState = &deletedState{} default: panic(fmt.Sprintf("invalid state transition %q to %q", s.name(), transition)) } } func (s *execCreatedState) Resize(ws console.WinSize) error { return s.p.resize(ws) } func (s *execCreatedState) Start(ctx context.Context, restoreConf *extension.RestoreConfig) error { if restoreConf != nil { return fmt.Errorf("cannot restore an exec'd process") } if err := s.p.start(ctx); err != nil { return err } s.transition(running) return nil } func (s *execCreatedState) Delete(context.Context) error { s.p.delete() s.transition(deleted) return nil } func (s *execCreatedState) Kill(ctx context.Context, sig uint32, all bool) error { return s.p.kill(ctx, sig, all) } func (s *execCreatedState) SetExited(status int) { s.p.setExited(status) s.transition(stopped) } type execRunningState struct { p *execProcess } func (s *execRunningState) name() string { return "running" } func (s *execRunningState) transition(transition stateTransition) { switch transition { case stopped: s.p.execState = &execStoppedState{p: s.p} default: panic(fmt.Sprintf("invalid state transition %q to %q", s.name(), transition)) } } func (s *execRunningState) Resize(ws console.WinSize) error { return s.p.resize(ws) } func (s *execRunningState) Start(context.Context, *extension.RestoreConfig) error { return fmt.Errorf("cannot start a running process") } func (s *execRunningState) Delete(context.Context) error { return fmt.Errorf("cannot delete a running process") } func (s *execRunningState) Kill(ctx context.Context, sig uint32, all bool) error { return s.p.kill(ctx, sig, all) } func (s *execRunningState) SetExited(status int) { s.p.setExited(status) s.transition(stopped) } type execStoppedState struct { p *execProcess } func (s *execStoppedState) name() string { return "stopped" } func (s *execStoppedState) transition(transition stateTransition) { switch transition { case deleted: s.p.execState = &deletedState{} default: panic(fmt.Sprintf("invalid state transition %q to %q", s.name(), transition)) } } func (s *execStoppedState) Resize(console.WinSize) error { return fmt.Errorf("cannot resize a stopped container") } func (s *execStoppedState) Start(context.Context, *extension.RestoreConfig) error { return fmt.Errorf("cannot start a stopped process") } func (s *execStoppedState) Delete(context.Context) error { s.p.delete() s.transition(deleted) return nil } func (s *execStoppedState) Kill(_ context.Context, sig uint32, _ bool) error { return handleStoppedKill(sig) } func (s *execStoppedState) SetExited(int) { // no op } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/proc/init.go000066400000000000000000000304571465435605700226320ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "context" "encoding/json" "fmt" "io" "path/filepath" "strings" "sync" "time" "github.com/containerd/console" "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/log" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/pkg/stdio" "github.com/containerd/fifo" runc "github.com/containerd/go-runc" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/shim/extension" "gvisor.dev/gvisor/pkg/shim/runsccmd" "gvisor.dev/gvisor/pkg/shim/utils" ) const statusStopped = "stopped" // Init represents an initial process for a container. type Init struct { wg sync.WaitGroup initState initState // mu is used to ensure that `Start()` and `Exited()` calls return in // the right order when invoked in separate go routines. This is the // case within the shim implementation as it makes use of the reaper // interface. mu sync.Mutex waitBlock chan struct{} WorkDir string id string Bundle string console console.Console Platform stdio.Platform io runc.IO runtime *runsccmd.Runsc status int exited time.Time pid int closers []io.Closer stdin io.Closer stdio stdio.Stdio Rootfs string IoUID int IoGID int Sandbox bool UserLog string Monitor ProcessMonitor } // NewRunsc returns a new runsc instance for a process. func NewRunsc(root, path, namespace, runtime string, config map[string]string, spec *specs.Spec) *runsccmd.Runsc { if root == "" { root = RunscRoot } return &runsccmd.Runsc{ Command: runtime, PdeathSignal: unix.SIGKILL, Log: filepath.Join(path, "log.json"), LogFormat: runc.JSON, PanicLog: utils.PanicLogPath(spec), Root: filepath.Join(root, namespace), Config: config, } } // New returns a new init process. func New(id string, runtime *runsccmd.Runsc, stdio stdio.Stdio) *Init { p := &Init{ id: id, runtime: runtime, stdio: stdio, status: 0, waitBlock: make(chan struct{}), } p.initState = &createdState{p: p} return p } // Create the process with the provided config. func (p *Init) Create(ctx context.Context, r *CreateConfig) (err error) { var socket *runc.Socket if r.Terminal { if socket, err = runc.NewTempConsoleSocket(); err != nil { return fmt.Errorf("failed to create OCI runtime console socket: %w", err) } defer socket.Close() } else if hasNoIO(r) { if p.io, err = runc.NewNullIO(); err != nil { return fmt.Errorf("creating new NULL IO: %w", err) } } else { if p.io, err = runc.NewPipeIO(p.IoUID, p.IoGID, withConditionalIO(p.stdio)); err != nil { return fmt.Errorf("failed to create OCI runtime io pipes: %w", err) } } // pidFile is the file that will contain the sandbox pid. pidFile := filepath.Join(p.Bundle, "init.pid") opts := &runsccmd.CreateOpts{ PidFile: pidFile, } if socket != nil { opts.ConsoleSocket = socket } if p.Sandbox { opts.IO = p.io // UserLog is only useful for sandbox. 
opts.UserLog = p.UserLog } if err := p.runtime.Create(ctx, r.ID, r.Bundle, opts); err != nil { return p.runtimeError(err, "OCI runtime create failed") } if r.Stdin != "" { sc, err := fifo.OpenFifo(context.Background(), r.Stdin, unix.O_WRONLY|unix.O_NONBLOCK, 0) if err != nil { return fmt.Errorf("failed to open stdin fifo %s: %w", r.Stdin, err) } p.stdin = sc p.closers = append(p.closers, sc) } ctx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() if socket != nil { console, err := socket.ReceiveMaster() if err != nil { return fmt.Errorf("failed to retrieve console master: %w", err) } console, err = p.Platform.CopyConsole(ctx, console, r.Stdin, r.Stdout, r.Stderr, &p.wg) if err != nil { return fmt.Errorf("failed to start console copy: %w", err) } p.console = console } else if !hasNoIO(r) { if err := copyPipes(ctx, p.io, r.Stdin, r.Stdout, r.Stderr, &p.wg); err != nil { return fmt.Errorf("failed to start io pipe copy: %w", err) } } pid, err := runc.ReadPidFile(pidFile) if err != nil { return fmt.Errorf("failed to retrieve OCI runtime container pid: %w", err) } p.pid = pid return nil } // Wait waits for the process to exit. func (p *Init) Wait() { <-p.waitBlock } // ID returns the ID of the process. func (p *Init) ID() string { return p.id } // Pid returns the PID of the process. func (p *Init) Pid() int { return p.pid } // ExitStatus returns the exit status of the process. func (p *Init) ExitStatus() int { p.mu.Lock() defer p.mu.Unlock() return p.status } // ExitedAt returns the time when the process exited. func (p *Init) ExitedAt() time.Time { p.mu.Lock() defer p.mu.Unlock() return p.exited } // Status returns the status of the process. func (p *Init) Status(ctx context.Context) (string, error) { p.mu.Lock() defer p.mu.Unlock() return p.initState.State(ctx) } func (p *Init) state(ctx context.Context) (string, error) { c, err := p.runtime.State(ctx, p.id) if err != nil { if strings.Contains(err.Error(), "does not exist") { return statusStopped, nil } return "", p.runtimeError(err, "OCI runtime state failed") } return p.convertStatus(c.Status), nil } // Start starts the init process. func (p *Init) Start(ctx context.Context) error { p.mu.Lock() defer p.mu.Unlock() return p.initState.Start(ctx, nil /* restoreConf */) } func (p *Init) start(ctx context.Context, restoreConf *extension.RestoreConfig) error { var cio runc.IO if !p.Sandbox { cio = p.io } if restoreConf == nil { if err := p.runtime.Start(ctx, p.id, cio); err != nil { return p.runtimeError(err, "OCI runtime start failed") } } else { if err := p.runtime.Restore(ctx, p.id, cio, &runsccmd.RestoreOpts{ ImagePath: restoreConf.ImagePath, Direct: restoreConf.Direct, }); err != nil { return p.runtimeError(err, "OCI runtime restore failed") } } go func() { status, err := p.runtime.Wait(context.Background(), p.id) if err != nil { log.G(ctx).WithError(err).Errorf("Failed to wait for container %q", p.id) p.killAllLocked(ctx) status = internalErrorCode } ExitCh <- Exit{ Timestamp: time.Now(), ID: p.id, Status: status, } }() return nil } // Restore restores the container from a snapshot. func (p *Init) Restore(ctx context.Context, conf *extension.RestoreConfig) error { p.mu.Lock() defer p.mu.Unlock() return p.initState.Start(ctx, conf) } // SetExited set the exit status of the init process. 
func (p *Init) SetExited(status int) { p.mu.Lock() defer p.mu.Unlock() p.initState.SetExited(status) } func (p *Init) setExited(status int) { if !p.exited.IsZero() { log.L.Debugf("Status already set to %d, ignoring status: %d", p.status, status) return } log.L.Debugf("Setting status: %d", status) p.exited = time.Now() p.status = status p.Platform.ShutdownConsole(context.Background(), p.console) close(p.waitBlock) } // Delete deletes the init process. func (p *Init) Delete(ctx context.Context) error { p.mu.Lock() defer p.mu.Unlock() return p.initState.Delete(ctx) } func (p *Init) delete(ctx context.Context) error { p.killAllLocked(ctx) p.wg.Wait() err := p.runtime.Delete(ctx, p.id, nil) if err != nil { // ignore errors if a runtime has already deleted the process // but we still hold metadata and pipes // // this is common during a checkpoint, runc will delete the container state // after a checkpoint and the container will no longer exist within runc if strings.Contains(err.Error(), "does not exist") { err = nil } else { err = p.runtimeError(err, "failed to delete task") } } if p.io != nil { for _, c := range p.closers { c.Close() } p.io.Close() } if err2 := mount.UnmountAll(p.Rootfs, 0); err2 != nil { log.G(ctx).WithError(err2).Warn("failed to cleanup rootfs mount") if err == nil { err = fmt.Errorf("failed rootfs umount: %w", err2) } } return err } // Resize resizes the init processes console. func (p *Init) Resize(ws console.WinSize) error { p.mu.Lock() defer p.mu.Unlock() if p.console == nil { return nil } return p.console.Resize(ws) } func (p *Init) resize(ws console.WinSize) error { if p.console == nil { return nil } return p.console.Resize(ws) } // Kill kills the init process. func (p *Init) Kill(ctx context.Context, signal uint32, all bool) error { p.mu.Lock() defer p.mu.Unlock() return p.initState.Kill(ctx, signal, all) } func (p *Init) kill(ctx context.Context, signal uint32, all bool) error { var ( killErr error backoff = 100 * time.Millisecond ) const timeout = time.Second for start := time.Now(); time.Since(start) < timeout; { state, err := p.initState.State(ctx) if err != nil { return p.runtimeError(err, "OCI runtime state failed") } // For runsc, signal only works when container is running state. // If the container is not in running state, directly return // "no such process" if state == statusStopped { return fmt.Errorf("no such process: %w", errdefs.ErrNotFound) } killErr = p.runtime.Kill(ctx, p.id, int(signal), &runsccmd.KillOpts{All: all}) if killErr == nil { return nil } time.Sleep(backoff) backoff *= 2 } return p.runtimeError(killErr, "kill timeout") } // KillAll kills all processes belonging to the init process. If // `runsc kill --all` returns error, assume the container has already stopped. func (p *Init) KillAll(context context.Context) { p.mu.Lock() defer p.mu.Unlock() p.killAllLocked(context) } func (p *Init) killAllLocked(context context.Context) { if err := p.runtime.Kill(context, p.id, int(unix.SIGKILL), &runsccmd.KillOpts{All: true}); err != nil { log.L.Warningf("Ignoring error killing container %q: %v", p.id, err) } } // Stdin returns the stdin of the process. func (p *Init) Stdin() io.Closer { return p.stdin } // Runtime returns the OCI runtime configured for the init process. func (p *Init) Runtime() *runsccmd.Runsc { return p.runtime } // Exec returns a new child process. 
func (p *Init) Exec(ctx context.Context, path string, r *ExecConfig) (extension.Process, error) { p.mu.Lock() defer p.mu.Unlock() return p.initState.Exec(ctx, path, r) } // exec returns a new exec'd process. func (p *Init) exec(path string, r *ExecConfig) (extension.Process, error) { var spec specs.Process if err := json.Unmarshal(r.Spec.Value, &spec); err != nil { return nil, err } spec.Terminal = r.Terminal e := &execProcess{ id: r.ID, path: path, parent: p, spec: spec, stdio: stdio.Stdio{ Stdin: r.Stdin, Stdout: r.Stdout, Stderr: r.Stderr, Terminal: r.Terminal, }, waitBlock: make(chan struct{}), } e.execState = &execCreatedState{p: e} return e, nil } func (p *Init) Stats(ctx context.Context, id string) (*runc.Stats, error) { p.mu.Lock() defer p.mu.Unlock() return p.initState.Stats(ctx, id) } func (p *Init) stats(ctx context.Context, id string) (*runc.Stats, error) { return p.Runtime().Stats(ctx, id) } // Stdio returns the stdio of the process. func (p *Init) Stdio() stdio.Stdio { return p.stdio } func (p *Init) runtimeError(rErr error, msg string) error { if rErr == nil { return nil } rMsg, err := getLastRuntimeError(p.runtime) switch { case err != nil: return fmt.Errorf("%s: %w (unable to retrieve OCI runtime error: %v)", msg, rErr, err) case rMsg == "": return fmt.Errorf("%s: %w", msg, rErr) default: return fmt.Errorf("%s: %s", msg, rMsg) } } func (p *Init) convertStatus(status string) string { if status == "created" && !p.Sandbox && p.status == internalErrorCode { // Treat start failure state for non-root container as stopped. return statusStopped } return status } func withConditionalIO(c stdio.Stdio) runc.IOOpt { return func(o *runc.IOOption) { o.OpenStdin = c.Stdin != "" o.OpenStdout = c.Stdout != "" o.OpenStderr = c.Stderr != "" } } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/proc/init_state.go000066400000000000000000000142501465435605700240230ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "context" "fmt" "github.com/containerd/containerd/errdefs" runc "github.com/containerd/go-runc" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/shim/extension" ) type stateTransition int const ( running stateTransition = iota stopped deleted ) func (s stateTransition) String() string { switch s { case running: return "running" case stopped: return "stopped" case deleted: return "deleted" default: panic(fmt.Sprintf("unknown state: %d", s)) } } type initState interface { // Start starts the process. If RestoreConfig is provided, the process is // restored using the checkpoint image provided in the config. 
Start(context.Context, *extension.RestoreConfig) error Delete(context.Context) error Exec(context.Context, string, *ExecConfig) (extension.Process, error) State(ctx context.Context) (string, error) Stats(context.Context, string) (*runc.Stats, error) Kill(context.Context, uint32, bool) error SetExited(int) } type createdState struct { p *Init } func (s *createdState) name() string { return "created" } func (s *createdState) transition(transition stateTransition) { switch transition { case running: s.p.initState = &runningState{p: s.p} case stopped: s.p.initState = &stoppedState{process: s.p} case deleted: s.p.initState = &deletedState{} default: panic(fmt.Sprintf("invalid state transition %q to %q", s.name(), transition)) } } func (s *createdState) Start(ctx context.Context, restoreConf *extension.RestoreConfig) error { if err := s.p.start(ctx, restoreConf); err != nil { // Containerd doesn't allow deleting container in created state. // However, for gVisor, a non-root container in created state can // only go to running state. If the container can't be started/restored, // it can only stay in created state, and never be deleted. // To work around that, we treat non-root container in start/restore // failure state as stopped. if !s.p.Sandbox { s.p.io.Close() s.p.setExited(internalErrorCode) s.transition(stopped) } return err } s.transition(running) return nil } func (s *createdState) Delete(ctx context.Context) error { if err := s.p.delete(ctx); err != nil { return err } s.transition(deleted) return nil } func (s *createdState) Kill(ctx context.Context, sig uint32, all bool) error { return s.p.kill(ctx, sig, all) } func (s *createdState) SetExited(status int) { s.p.setExited(status) s.transition(stopped) } func (s *createdState) Exec(ctx context.Context, path string, r *ExecConfig) (extension.Process, error) { return s.p.exec(path, r) } func (s *createdState) State(ctx context.Context) (string, error) { state, err := s.p.state(ctx) if err == nil && state == statusStopped { s.transition(stopped) } return state, err } func (s *createdState) Stats(ctx context.Context, id string) (*runc.Stats, error) { return s.p.stats(ctx, id) } type runningState struct { p *Init } func (s *runningState) name() string { return "running" } func (s *runningState) transition(transition stateTransition) { switch transition { case stopped: s.p.initState = &stoppedState{process: s.p} default: panic(fmt.Sprintf("invalid state transition %q to %q", s.name(), transition)) } } func (s *runningState) Start(context.Context, *extension.RestoreConfig) error { return fmt.Errorf("cannot start a running container") } func (s *runningState) Delete(ctx context.Context) error { return fmt.Errorf("cannot delete a running container") } func (s *runningState) Kill(ctx context.Context, sig uint32, all bool) error { return s.p.kill(ctx, sig, all) } func (s *runningState) SetExited(status int) { s.p.setExited(status) s.transition(stopped) } func (s *runningState) Exec(_ context.Context, path string, r *ExecConfig) (extension.Process, error) { return s.p.exec(path, r) } func (s *runningState) State(ctx context.Context) (string, error) { state, err := s.p.state(ctx) if err == nil && state == "stopped" { s.transition(stopped) } return state, err } func (s *runningState) Stats(ctx context.Context, id string) (*runc.Stats, error) { return s.p.stats(ctx, id) } type stoppedState struct { process *Init } func (s *stoppedState) name() string { return "stopped" } func (s *stoppedState) transition(transition stateTransition) { switch transition { 
case deleted: s.process.initState = &deletedState{} default: panic(fmt.Sprintf("invalid state transition %q to %q", s.name(), transition)) } } func (s *stoppedState) Start(context.Context, *extension.RestoreConfig) error { return fmt.Errorf("cannot start a stopped container") } func (s *stoppedState) Delete(ctx context.Context) error { if err := s.process.delete(ctx); err != nil { return err } s.transition(deleted) return nil } func (s *stoppedState) Kill(_ context.Context, signal uint32, _ bool) error { return handleStoppedKill(signal) } func (s *stoppedState) SetExited(status int) { s.process.setExited(status) } func (s *stoppedState) Exec(context.Context, string, *ExecConfig) (extension.Process, error) { return nil, fmt.Errorf("cannot exec in a stopped state") } func (s *stoppedState) State(context.Context) (string, error) { return "stopped", nil } func (s *stoppedState) Stats(context.Context, string) (*runc.Stats, error) { return nil, fmt.Errorf("cannot stat a stopped container") } func handleStoppedKill(signal uint32) error { switch unix.Signal(signal) { case unix.SIGTERM, unix.SIGKILL: // Container is already stopped, so everything inside the container has // already been killed. return nil default: return errdefs.ToGRPCf(errdefs.ErrNotFound, "process not found") } } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/proc/io.go000066400000000000000000000075351465435605700222770ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "context" "fmt" "io" "os" "sync" "github.com/containerd/containerd/log" "github.com/containerd/fifo" runc "github.com/containerd/go-runc" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" ) // TODO(random-liu): This file can be a util. 
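The exec and init state types above share the same guarded-transition pattern: each state accepts only the lifecycle moves that are legal from it and panics on anything else, so an impossible jump (for example stopped back to running) fails loudly instead of silently corrupting the process lifecycle. Below is a stand-alone sketch of that pattern under assumed names; procState, legal, and transition are illustrative identifiers, not gVisor code.

package main

import "fmt"

type procState int

const (
	created procState = iota
	running
	stopped
	deleted
)

// legal lists the transitions permitted from each state, mirroring the
// created -> running -> stopped -> deleted lifecycle used by the shim.
var legal = map[procState][]procState{
	created: {running, stopped, deleted},
	running: {stopped},
	stopped: {deleted},
}

// transition returns the new state if the move is allowed and panics
// otherwise, matching the panic-on-invalid-transition behavior above.
func transition(from, to procState) procState {
	for _, t := range legal[from] {
		if t == to {
			return to
		}
	}
	panic(fmt.Sprintf("invalid state transition %d to %d", from, to))
}

func main() {
	s := created
	s = transition(s, running)
	s = transition(s, stopped)
	s = transition(s, deleted)
	fmt.Println(s)
}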
var bufPool = sync.Pool{ New: func() any { buffer := make([]byte, 32<<10) return &buffer }, } func copyPipes(ctx context.Context, rio runc.IO, stdin, stdout, stderr string, wg *sync.WaitGroup) error { var sameFile *countingWriteCloser for _, i := range []struct { name string dest func(wc io.WriteCloser, rc io.Closer) }{ { name: stdout, dest: func(wc io.WriteCloser, rc io.Closer) { wg.Add(1) go func() { p := bufPool.Get().(*[]byte) defer bufPool.Put(p) if _, err := io.CopyBuffer(wc, rio.Stdout(), *p); err != nil { log.G(ctx).Warn("error copying stdout") } wg.Done() wc.Close() if rc != nil { rc.Close() } }() }, }, { name: stderr, dest: func(wc io.WriteCloser, rc io.Closer) { wg.Add(1) go func() { p := bufPool.Get().(*[]byte) defer bufPool.Put(p) if _, err := io.CopyBuffer(wc, rio.Stderr(), *p); err != nil { log.G(ctx).Warn("error copying stderr") } wg.Done() wc.Close() if rc != nil { rc.Close() } }() }, }, } { ok, err := isFifo(i.name) if err != nil { return err } var ( fw io.WriteCloser fr io.Closer ) if ok { if fw, err = fifo.OpenFifo(ctx, i.name, unix.O_WRONLY, 0); err != nil { return fmt.Errorf("gvisor-containerd-shim: opening %s failed: %s", i.name, err) } if fr, err = fifo.OpenFifo(ctx, i.name, unix.O_RDONLY, 0); err != nil { return fmt.Errorf("gvisor-containerd-shim: opening %s failed: %s", i.name, err) } } else { if sameFile != nil { sameFile.count.Add(1) i.dest(sameFile, nil) continue } if fw, err = os.OpenFile(i.name, unix.O_WRONLY|unix.O_APPEND, 0); err != nil { return fmt.Errorf("gvisor-containerd-shim: opening %s failed: %s", i.name, err) } if stdout == stderr { sameFile = &countingWriteCloser{ WriteCloser: fw, count: atomicbitops.FromInt64(1), } } } i.dest(fw, fr) } if stdin == "" { return nil } f, err := fifo.OpenFifo(context.Background(), stdin, unix.O_RDONLY|unix.O_NONBLOCK, 0) if err != nil { return fmt.Errorf("gvisor-containerd-shim: opening %s failed: %s", stdin, err) } go func() { p := bufPool.Get().(*[]byte) defer bufPool.Put(p) io.CopyBuffer(rio.Stdin(), f, *p) rio.Stdin().Close() f.Close() }() return nil } // countingWriteCloser masks io.Closer() until close has been invoked a certain number of times. type countingWriteCloser struct { io.WriteCloser count atomicbitops.Int64 } func (c *countingWriteCloser) Close() error { if c.count.Add(-1) > 0 { return nil } return c.WriteCloser.Close() } // isFifo checks if a file is a fifo. // // If the file does not exist then it returns false. func isFifo(path string) (bool, error) { stat, err := os.Stat(path) if err != nil { if os.IsNotExist(err) { return false, nil } return false, err } if stat.Mode()&os.ModeNamedPipe == os.ModeNamedPipe { return true, nil } return false, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/proc/proc.go000066400000000000000000000015621465435605700226250ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
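copyPipes above handles the case where stdout and stderr name the same file by wrapping it in a countingWriteCloser, so the shared handle is closed only after both copy goroutines have released it. The following is a minimal sketch of that reference-counting close using the standard library's sync/atomic rather than gVisor's atomicbitops package; refCountedCloser is an illustrative name, not shim code.

package main

import (
	"fmt"
	"io"
	"os"
	"sync/atomic"
)

// refCountedCloser masks Close until it has been called once per holder.
type refCountedCloser struct {
	io.WriteCloser
	refs atomic.Int64
}

// Close decrements the reference count and closes the underlying writer
// only when the last reference is released.
func (c *refCountedCloser) Close() error {
	if c.refs.Add(-1) > 0 {
		return nil
	}
	return c.WriteCloser.Close()
}

func main() {
	f, err := os.CreateTemp("", "demo")
	if err != nil {
		panic(err)
	}
	c := &refCountedCloser{WriteCloser: f}
	c.refs.Store(2) // stdout and stderr share the same destination.

	c.Close() // First close: the file stays open.
	_, writeErr := f.WriteString("still writable\n")
	fmt.Println(writeErr == nil) // true

	c.Close() // Second close: the underlying file is closed.
}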
// Package proc is responsible to manage the communication between the shim and // the sandbox process running the container. package proc // RunscRoot is the path to the root runsc state directory. const RunscRoot = "/run/containerd/runsc" golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/proc/proc_state_autogen.go000066400000000000000000000000661465435605700255450ustar00rootroot00000000000000// automatically generated by stateify. package proc golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/proc/types.go000066400000000000000000000031131465435605700230200ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "time" runc "github.com/containerd/go-runc" "github.com/gogo/protobuf/types" ) // Mount holds filesystem mount configuration. type Mount struct { Type string Source string Target string Options []string } // CreateConfig hold task creation configuration. type CreateConfig struct { ID string Bundle string Runtime string Rootfs []Mount Terminal bool Stdin string Stdout string Stderr string } // ExecConfig holds exec creation configuration. type ExecConfig struct { ID string Terminal bool Stdin string Stdout string Stderr string Spec *types.Any } // Exit is the type of exit events. type Exit struct { Timestamp time.Time ID string Status int } // ProcessMonitor monitors process exit changes. type ProcessMonitor interface { // Subscribe to process exit changes Subscribe() chan runc.Exit // Unsubscribe to process exit changes Unsubscribe(c chan runc.Exit) } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/proc/utils.go000066400000000000000000000031161465435605700230170ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "encoding/json" "io" "os" "strings" "time" "gvisor.dev/gvisor/pkg/shim/runsccmd" ) const ( internalErrorCode = 128 bufferSize = 32 ) // ExitCh is the exit events channel for containers and exec processes // inside the sandbox. var ExitCh = make(chan Exit, bufferSize) // TODO(mlaventure): move to runc package? 
func getLastRuntimeError(r *runsccmd.Runsc) (string, error) { if r.Log == "" { return "", nil } f, err := os.OpenFile(r.Log, os.O_RDONLY, 0400) if err != nil { return "", err } var ( errMsg string log struct { Level string Msg string Time time.Time } ) dec := json.NewDecoder(f) for err = nil; err == nil; { if err = dec.Decode(&log); err != nil && err != io.EOF { return "", err } if log.Level == "error" { errMsg = strings.TrimSpace(log.Msg) } } return errMsg, nil } func hasNoIO(r *CreateConfig) bool { return r.Stdin == "" && r.Stdout == "" && r.Stderr == "" } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsc/000077500000000000000000000000001465435605700215165ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsc/api.go000066400000000000000000000014231465435605700226160ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package runsc import ( "github.com/containerd/containerd/api/events" ) // TaskOOM is an alias for events.TaskOOM. type TaskOOM = events.TaskOOM golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsc/debug.go000066400000000000000000000023101465435605700231270ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package runsc import ( "os" "os/signal" "runtime" "sync" "syscall" "github.com/containerd/containerd/log" ) var once sync.Once func setDebugSigHandler() { once.Do(func() { dumpCh := make(chan os.Signal, 1) signal.Notify(dumpCh, syscall.SIGUSR2) go func() { buf := make([]byte, 10240) for range dumpCh { for { n := runtime.Stack(buf, true) if n >= len(buf) { buf = make([]byte, 2*len(buf)) continue } log.L.Debugf("User requested stack trace:\n%s", buf[:n]) } } }() log.L.Debugf("For full process dump run: kill -%d %d", syscall.SIGUSR2, os.Getpid()) }) } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsc/epoll.go000066400000000000000000000054561465435605700231720ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package runsc import ( "context" "fmt" "sync" "github.com/containerd/cgroups" "github.com/containerd/containerd/events" "github.com/containerd/containerd/runtime" "golang.org/x/sys/unix" ) func newOOMEpoller(publisher events.Publisher) (*epoller, error) { fd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC) if err != nil { return nil, err } return &epoller{ fd: fd, publisher: publisher, set: make(map[uintptr]*item), }, nil } type epoller struct { mu sync.Mutex fd int publisher events.Publisher set map[uintptr]*item } type item struct { id string cg cgroups.Cgroup } func (e *epoller) Close() error { return unix.Close(e.fd) } func (e *epoller) run(ctx context.Context) { var events [128]unix.EpollEvent for { select { case <-ctx.Done(): e.Close() return default: n, err := unix.EpollWait(e.fd, events[:], -1) if err != nil { if err == unix.EINTR || err == unix.EAGAIN { continue } // Should not happen. panic(fmt.Errorf("cgroups: epoll wait: %w", err)) } for i := 0; i < n; i++ { e.process(ctx, uintptr(events[i].Fd)) } } } } func (e *epoller) add(id string, cgx any) error { e.mu.Lock() defer e.mu.Unlock() cg, ok := cgx.(cgroups.Cgroup) if !ok { return fmt.Errorf("expected cgroups.Cgroup, got: %T", cgx) } fd, err := cg.OOMEventFD() if err != nil { return err } e.set[fd] = &item{ id: id, cg: cg, } event := unix.EpollEvent{ Fd: int32(fd), Events: unix.EPOLLHUP | unix.EPOLLIN | unix.EPOLLERR, } return unix.EpollCtl(e.fd, unix.EPOLL_CTL_ADD, int(fd), &event) } func (e *epoller) process(ctx context.Context, fd uintptr) { flush(fd) e.mu.Lock() i, ok := e.set[fd] if !ok { e.mu.Unlock() return } e.mu.Unlock() if i.cg.State() == cgroups.Deleted { e.mu.Lock() delete(e.set, fd) e.mu.Unlock() unix.Close(int(fd)) return } if err := e.publisher.Publish(ctx, runtime.TaskOOMEventTopic, &TaskOOM{ ContainerID: i.id, }); err != nil { // Should not happen. panic(fmt.Errorf("publish OOM event: %w", err)) } } func flush(fd uintptr) error { var buf [8]byte _, err := unix.Read(int(fd), buf[:]) return err } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsc/oom_v2.go000066400000000000000000000057151465435605700232560ustar00rootroot00000000000000// Copyright The containerd Authors. // Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
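The cgroups v1 OOM poller above registers each container's OOM eventfd with a single epoll instance and blocks in EpollWait, publishing a TaskOOM event whenever one of the fds becomes readable. The sketch below shows that eventfd/epoll plumbing in isolation; it is Linux-only, writes to the eventfd itself instead of waiting for a real OOM, and none of it is shim code.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	epfd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC)
	if err != nil {
		panic(err)
	}
	defer unix.Close(epfd)

	// In the shim this fd comes from the cgroup's OOMEventFD(); here we
	// create a plain eventfd so the example is self-contained.
	efd, err := unix.Eventfd(0, unix.EFD_CLOEXEC)
	if err != nil {
		panic(err)
	}
	defer unix.Close(efd)

	ev := unix.EpollEvent{
		Fd:     int32(efd),
		Events: unix.EPOLLIN | unix.EPOLLHUP | unix.EPOLLERR,
	}
	if err := unix.EpollCtl(epfd, unix.EPOLL_CTL_ADD, efd, &ev); err != nil {
		panic(err)
	}

	// Simulate the kernel posting an event by writing an 8-byte counter.
	buf := []byte{1, 0, 0, 0, 0, 0, 0, 0}
	if _, err := unix.Write(efd, buf); err != nil {
		panic(err)
	}

	events := make([]unix.EpollEvent, 8)
	n, err := unix.EpollWait(epfd, events, -1)
	if err != nil {
		panic(err)
	}
	fmt.Printf("epoll reported %d ready fd(s), first fd=%d\n", n, events[0].Fd)
}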
//go:build linux // +build linux package runsc import ( "context" "fmt" cgroupsv2 "github.com/containerd/cgroups/v2" "github.com/containerd/containerd/runtime" "github.com/containerd/containerd/runtime/v2/shim" "github.com/sirupsen/logrus" ) // newOOMv2Epoller returns an implementation that listens to OOM events // from a container's cgroups v2. This is copied from containerd to avoid // having to upgrade containerd package just to get it func newOOMv2Poller(publisher shim.Publisher) (oomPoller, error) { return &watcherV2{ itemCh: make(chan itemV2), publisher: publisher, }, nil } // watcher implementation for handling OOM events from a container's cgroup type watcherV2 struct { itemCh chan itemV2 publisher shim.Publisher } type itemV2 struct { id string ev cgroupsv2.Event err error } // Close closes the watcher func (w *watcherV2) Close() error { return nil } // Run the loop func (w *watcherV2) run(ctx context.Context) { lastOOMMap := make(map[string]uint64) // key: id, value: ev.OOM for { select { case <-ctx.Done(): w.Close() return case i := <-w.itemCh: if i.err != nil { logrus.WithError(i.err).Debugf("Error listening for OOM, id: %q", i.id) delete(lastOOMMap, i.id) continue } logrus.Debugf("Received OOM event, id: %q, event: %+v", i.id, i.ev) lastOOM := lastOOMMap[i.id] if i.ev.OOM > lastOOM { if err := w.publisher.Publish(ctx, runtime.TaskOOMEventTopic, &TaskOOM{ ContainerID: i.id, }); err != nil { logrus.WithError(err).Error("Publish OOM event") } } if i.ev.OOM > 0 { lastOOMMap[i.id] = i.ev.OOM } } } } // Add cgroups.Cgroup to the epoll monitor func (w *watcherV2) add(id string, cgx any) error { cg, ok := cgx.(*cgroupsv2.Manager) if !ok { return fmt.Errorf("expected *cgroupsv2.Manager, got: %T", cgx) } // NOTE: containerd/cgroups/v2 does not support closing eventCh routine // currently. The routine shuts down when an error happens, mostly when the // cgroup is deleted. eventCh, errCh := cg.EventChan() go func() { for { i := itemV2{id: id} select { case ev := <-eventCh: i.ev = ev w.itemCh <- i case err := <-errCh: i.err = err w.itemCh <- i // we no longer get any event/err when we got an err logrus.WithError(err).Warn("error from eventChan") return } } }() return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsc/options.go000066400000000000000000000034121465435605700235400ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package runsc const optionsType = "io.containerd.runsc.v1.options" // options is runtime options for io.containerd.runsc.v1. type options struct { // ShimCgroup is the cgroup the shim should be in. ShimCgroup string `toml:"shim_cgroup" json:"shimCgroup"` // IoUID is the I/O's pipes uid. IoUID uint32 `toml:"io_uid" json:"ioUid"` // IoGID is the I/O's pipes gid. IoGID uint32 `toml:"io_gid" json:"ioGid"` // BinaryName is the binary name of the runsc binary. BinaryName string `toml:"binary_name" json:"binaryName"` // Root is the runsc root directory. 
Root string `toml:"root" json:"root"` // LogLevel sets the logging level. Some of the possible values are: debug, // info, warning. // // This configuration only applies when the shim is running as a service. LogLevel string `toml:"log_level" json:"logLevel"` // LogPath is the path to log directory. %ID% tags inside the string are // replaced with the container ID. // // This configuration only applies when the shim is running as a service. LogPath string `toml:"log_path" json:"logPath"` // RunscConfig is a key/value map of all runsc flags. RunscConfig map[string]string `toml:"runsc_config" json:"runscConfig"` } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsc/runsc_linux_state_autogen.go000066400000000000000000000001311465435605700273330ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux // +build linux package runsc golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsc/runsc_state_autogen.go000066400000000000000000000001501465435605700261150ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux && linux // +build linux,linux package runsc golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsc/service.go000066400000000000000000000636361465435605700235230ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package runsc implements Containerd Shim v2 interface. package runsc import ( "context" "fmt" "io" "os" "path/filepath" "strings" "sync" "time" "github.com/BurntSushi/toml" "github.com/containerd/cgroups" cgroupsstats "github.com/containerd/cgroups/stats/v1" cgroupsv2 "github.com/containerd/cgroups/v2" "github.com/containerd/console" "github.com/containerd/containerd/api/events" "github.com/containerd/containerd/api/types/task" "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/log" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/namespaces" "github.com/containerd/containerd/pkg/process" "github.com/containerd/containerd/pkg/stdio" "github.com/containerd/containerd/runtime" "github.com/containerd/containerd/runtime/linux/runctypes" "github.com/containerd/containerd/runtime/v2/shim" taskAPI "github.com/containerd/containerd/runtime/v2/task" "github.com/containerd/containerd/sys/reaper" "github.com/containerd/typeurl" "github.com/gogo/protobuf/types" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/shim/runtimeoptions/v14" "gvisor.dev/gvisor/pkg/shim/extension" "gvisor.dev/gvisor/pkg/shim/proc" "gvisor.dev/gvisor/pkg/shim/runsccmd" "gvisor.dev/gvisor/pkg/shim/runtimeoptions" "gvisor.dev/gvisor/pkg/shim/utils" "gvisor.dev/gvisor/runsc/specutils" ) var ( empty = &types.Empty{} bufPool = sync.Pool{ New: func() any { buffer := make([]byte, 32<<10) return &buffer }, } ) const ( // configFile is the default config file name. For containerd 1.2, // we assume that a config.toml should exist in the runtime root. 
configFile = "config.toml" cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent" ) type oomPoller interface { io.Closer // add adds `cg` cgroup to oom poller. `cg` is cgroups.Cgroup in v1 and // `cgroupsv2.Manager` in v2 add(id string, cg any) error // run monitors oom event and notifies the shim about them run(ctx context.Context) } // runscService is the shim implementation of a remote shim over gRPC. It converts // shim calls into `runsc` commands. It runs in 2 different modes: // 1. Service: process runs for the life time of the container and receives // calls described in shimapi.TaskService interface. // 2. Tool: process is short lived and runs only to perform the requested // operations and then exits. It implements the direct functions in // shim.Shim interface. // // When the service is running, it saves a json file with state information so // that commands sent to the tool can load the state and perform the operation // with the required context. type runscService struct { mu sync.Mutex // id is the container ID. id string // bundle is a path provided by the caller on container creation. Store // because it's needed in commands that don't receive bundle in the request. bundle string // task is the main process that is running the container. task *proc.Init // processes maps ExecId to processes running through exec. processes map[string]extension.Process events chan any // platform handles operations related to the console. platform stdio.Platform // opts are configuration options specific for this shim. opts options // ex gets notified whenever the container init process or an exec'd process // exits from inside the sandbox. ec chan proc.Exit // oomPoller monitors the sandbox's cgroup for OOM notifications. oomPoller oomPoller } var _ extension.TaskServiceExt = (*runscService)(nil) // New returns a new shim service. func New(ctx context.Context, id string, publisher shim.Publisher) (extension.TaskServiceExt, error) { var ( ep oomPoller err error ) if cgroups.Mode() == cgroups.Unified { ep, err = newOOMv2Poller(publisher) } else { ep, err = newOOMEpoller(publisher) } if err != nil { return nil, err } go ep.run(ctx) s := &runscService{ id: id, processes: make(map[string]extension.Process), events: make(chan any, 128), ec: proc.ExitCh, oomPoller: ep, } go s.processExits(ctx) runsccmd.Monitor = &runsccmd.LogMonitor{Next: reaper.Default} if err := s.initPlatform(); err != nil { return nil, fmt.Errorf("failed to initialized platform behavior: %w", err) } go s.forward(ctx, publisher) return s, nil } // Cleanup is called from another process (need to reload state) to stop the // container and undo all operations done in Create(). func (s *runscService) Cleanup(ctx context.Context) (*taskAPI.DeleteResponse, error) { path, err := os.Getwd() if err != nil { return nil, err } ns, err := namespaces.NamespaceRequired(ctx) if err != nil { return nil, err } var st state if err := st.load(path); err != nil { return nil, err } r := proc.NewRunsc(s.opts.Root, path, ns, st.Options.BinaryName, nil, nil) if err := r.Delete(ctx, s.id, &runsccmd.DeleteOpts{ Force: true, }); err != nil { log.L.Infof("failed to remove runc container: %v", err) } if err := mount.UnmountAll(st.Rootfs, 0); err != nil { log.L.Infof("failed to cleanup rootfs mount: %v", err) } return &taskAPI.DeleteResponse{ ExitedAt: time.Now(), ExitStatus: 128 + uint32(unix.SIGKILL), }, nil } // Create creates a new initial process and container with the underlying OCI // runtime. 
func (s *runscService) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (*taskAPI.CreateTaskResponse, error) { s.mu.Lock() defer s.mu.Unlock() // Save the main task id and bundle to the shim for additional requests. s.id = r.ID s.bundle = r.Bundle ns, err := namespaces.NamespaceRequired(ctx) if err != nil { return nil, fmt.Errorf("create namespace: %w", err) } // Read from root for now. if r.Options != nil { v, err := typeurl.UnmarshalAny(r.Options) if err != nil { return nil, err } var path string switch o := v.(type) { case *runctypes.CreateOptions: // containerd 1.2.x s.opts.IoUID = o.IoUid s.opts.IoGID = o.IoGid s.opts.ShimCgroup = o.ShimCgroup case *runctypes.RuncOptions: // containerd 1.2.x root := proc.RunscRoot if o.RuntimeRoot != "" { root = o.RuntimeRoot } s.opts.BinaryName = o.Runtime path = filepath.Join(root, configFile) if _, err := os.Stat(path); err != nil { if !os.IsNotExist(err) { return nil, fmt.Errorf("stat config file %q: %w", path, err) } // A config file in runtime root is not required. path = "" } case *runtimeoptions.Options: // containerd 1.5+ if o.ConfigPath == "" { break } if o.TypeUrl != optionsType { return nil, fmt.Errorf("unsupported option type %q", o.TypeUrl) } path = o.ConfigPath case *v14.Options: // containerd 1.4- if o.ConfigPath == "" { break } if o.TypeUrl != optionsType { return nil, fmt.Errorf("unsupported option type %q", o.TypeUrl) } path = o.ConfigPath default: return nil, fmt.Errorf("unsupported option type %q", r.Options.TypeUrl) } if path != "" { if _, err = toml.DecodeFile(path, &s.opts); err != nil { return nil, fmt.Errorf("decode config file %q: %w", path, err) } } } if len(s.opts.LogLevel) != 0 { lvl, err := logrus.ParseLevel(s.opts.LogLevel) if err != nil { return nil, err } logrus.SetLevel(lvl) } for _, emittedPath := range runsccmd.EmittedPaths(s.id, s.opts.RunscConfig) { if err := os.MkdirAll(filepath.Dir(emittedPath), 0777); err != nil { return nil, fmt.Errorf("failed to create parent directories for file %v: %w", emittedPath, err) } } if len(s.opts.LogPath) != 0 { logPath := runsccmd.FormatShimLogPath(s.opts.LogPath, s.id) if err := os.MkdirAll(filepath.Dir(logPath), 0777); err != nil { return nil, fmt.Errorf("failed to create log dir: %w", err) } logFile, err := os.Create(logPath) if err != nil { return nil, fmt.Errorf("failed to create log file: %w", err) } log.L.Debugf("Starting mirror log at %q", logPath) std := logrus.StandardLogger() std.SetOutput(io.MultiWriter(std.Out, logFile)) log.L.Debugf("Create shim") log.L.Debugf("***************************") log.L.Debugf("Args: %s", os.Args) log.L.Debugf("PID: %d", os.Getpid()) log.L.Debugf("ID: %s", s.id) log.L.Debugf("Options: %+v", s.opts) log.L.Debugf("Bundle: %s", r.Bundle) log.L.Debugf("Terminal: %t", r.Terminal) log.L.Debugf("stdin: %s", r.Stdin) log.L.Debugf("stdout: %s", r.Stdout) log.L.Debugf("stderr: %s", r.Stderr) log.L.Debugf("***************************") if log.L.Logger.IsLevelEnabled(logrus.DebugLevel) { setDebugSigHandler() } } // Save state before any action is taken to ensure Cleanup() will have all // the information it needs to undo the operations. st := state{ Rootfs: filepath.Join(r.Bundle, "rootfs"), Options: s.opts, } if err := st.save(r.Bundle); err != nil { return nil, err } if err := os.Mkdir(st.Rootfs, 0711); err != nil && !os.IsExist(err) { return nil, err } // Convert from types.Mount to proc.Mount. 
var mounts []proc.Mount for _, m := range r.Rootfs { mounts = append(mounts, proc.Mount{ Type: m.Type, Source: m.Source, Target: m.Target, Options: m.Options, }) } // Cleans up all mounts in case of failure. cu := cleanup.Make(func() { if err := mount.UnmountAll(st.Rootfs, 0); err != nil { log.L.Infof("failed to cleanup rootfs mount: %v", err) } }) defer cu.Clean() for _, rm := range mounts { m := &mount.Mount{ Type: rm.Type, Source: rm.Source, Options: rm.Options, } if err := m.Mount(st.Rootfs); err != nil { return nil, fmt.Errorf("failed to mount rootfs component %v: %w", m, err) } } config := &proc.CreateConfig{ ID: r.ID, Bundle: r.Bundle, Runtime: s.opts.BinaryName, Rootfs: mounts, Terminal: r.Terminal, Stdin: r.Stdin, Stdout: r.Stdout, Stderr: r.Stderr, } process, err := newInit(r.Bundle, filepath.Join(r.Bundle, "work"), ns, s.platform, config, &s.opts, st.Rootfs) if err != nil { return nil, err } if err := process.Create(ctx, config); err != nil { return nil, err } // Set up OOM notification on the sandbox's cgroup. This is done on // sandbox create since the sandbox process will be created here. pid := process.Pid() if pid > 0 { var ( cg any err error ) if cgroups.Mode() == cgroups.Unified { var cgPath string cgPath, err = cgroupsv2.PidGroupPath(pid) if err == nil { cg, err = cgroupsv2.LoadManager("/sys/fs/cgroup", cgPath) } } else { cg, err = cgroups.Load(cgroups.V1, cgroups.PidPath(pid)) } if err != nil { return nil, fmt.Errorf("loading cgroup for %d: %w", pid, err) } if err := s.oomPoller.add(s.id, cg); err != nil { return nil, fmt.Errorf("add cg to OOM monitor: %w", err) } } // Success cu.Release() s.task = process return &taskAPI.CreateTaskResponse{ Pid: uint32(process.Pid()), }, nil } // Start starts the container. func (s *runscService) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.StartResponse, error) { p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if err := p.Start(ctx); err != nil { return nil, err } // TODO: Set the cgroup and oom notifications on restore. // https://github.com/google/gvisor-containerd-shim/issues/58 return &taskAPI.StartResponse{ Pid: uint32(p.Pid()), }, nil } // Delete deletes the initial process and container. func (s *runscService) Delete(ctx context.Context, r *taskAPI.DeleteRequest) (*taskAPI.DeleteResponse, error) { p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if err := p.Delete(ctx); err != nil { return nil, err } if len(r.ExecID) != 0 { s.mu.Lock() delete(s.processes, r.ExecID) s.mu.Unlock() } else if s.platform != nil { s.platform.Close() } return &taskAPI.DeleteResponse{ ExitStatus: uint32(p.ExitStatus()), ExitedAt: p.ExitedAt(), Pid: uint32(p.Pid()), }, nil } // Exec spawns an additional process inside the container. func (s *runscService) Exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*types.Empty, error) { s.mu.Lock() p := s.processes[r.ExecID] s.mu.Unlock() if p != nil { return nil, errdefs.ToGRPCf(errdefs.ErrAlreadyExists, "id %s", r.ExecID) } if s.task == nil { return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } process, err := s.task.Exec(ctx, s.bundle, &proc.ExecConfig{ ID: r.ExecID, Terminal: r.Terminal, Stdin: r.Stdin, Stdout: r.Stdout, Stderr: r.Stderr, Spec: r.Spec, }) if err != nil { return nil, err } s.mu.Lock() s.processes[r.ExecID] = process s.mu.Unlock() return empty, nil } // ResizePty resizes the terminal of a process. 
func (s *runscService) ResizePty(ctx context.Context, r *taskAPI.ResizePtyRequest) (*types.Empty, error) { p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } ws := console.WinSize{ Width: uint16(r.Width), Height: uint16(r.Height), } if err := p.Resize(ws); err != nil { return nil, err } return empty, nil } // State returns runtime state information for the container. func (s *runscService) State(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI.StateResponse, error) { p, err := s.getProcess(r.ExecID) if err != nil { log.L.Debugf("State failed to find process: %v", err) return nil, err } st, err := p.Status(ctx) if err != nil { log.L.Debugf("State failed: %v", err) return nil, err } status := task.StatusUnknown switch st { case "created": status = task.StatusCreated case "running": status = task.StatusRunning case "stopped": status = task.StatusStopped } sio := p.Stdio() res := &taskAPI.StateResponse{ ID: p.ID(), Bundle: s.bundle, Pid: uint32(p.Pid()), Status: status, Stdin: sio.Stdin, Stdout: sio.Stdout, Stderr: sio.Stderr, Terminal: sio.Terminal, ExitStatus: uint32(p.ExitStatus()), ExitedAt: p.ExitedAt(), } log.L.Debugf("State succeeded, response: %+v", res) return res, nil } // Pause the container. func (s *runscService) Pause(ctx context.Context, r *taskAPI.PauseRequest) (*types.Empty, error) { if s.task == nil { log.L.Debugf("Pause error, id: %s: container not created", r.ID) return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } err := s.task.Runtime().Pause(ctx, r.ID) if err != nil { return nil, err } return empty, nil } // Resume the container. func (s *runscService) Resume(ctx context.Context, r *taskAPI.ResumeRequest) (*types.Empty, error) { if s.task == nil { log.L.Debugf("Resume error, id: %s: container not created", r.ID) return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } err := s.task.Runtime().Resume(ctx, r.ID) if err != nil { return nil, err } return empty, nil } // Kill the container with the provided signal. func (s *runscService) Kill(ctx context.Context, r *taskAPI.KillRequest) (*types.Empty, error) { p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if err := p.Kill(ctx, r.Signal, r.All); err != nil { log.L.Debugf("Kill failed: %v", err) return nil, err } log.L.Debugf("Kill succeeded") return empty, nil } // Pids returns all pids inside the container. func (s *runscService) Pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.PidsResponse, error) { pids, err := s.getContainerPids(ctx, r.ID) if err != nil { return nil, err } var processes []*task.ProcessInfo for _, pid := range pids { pInfo := task.ProcessInfo{ Pid: pid, } for _, p := range s.processes { if p.Pid() == int(pid) { d := &runctypes.ProcessDetails{ ExecID: p.ID(), } a, err := typeurl.MarshalAny(d) if err != nil { return nil, fmt.Errorf("failed to marshal process %d info: %w", pid, err) } pInfo.Info = a break } } processes = append(processes, &pInfo) } return &taskAPI.PidsResponse{ Processes: processes, }, nil } // CloseIO closes the I/O context of the container. func (s *runscService) CloseIO(ctx context.Context, r *taskAPI.CloseIORequest) (*types.Empty, error) { p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if stdin := p.Stdin(); stdin != nil { if err := stdin.Close(); err != nil { return nil, fmt.Errorf("close stdin: %w", err) } } return empty, nil } // Checkpoint checkpoints the container. 
func (s *runscService) Checkpoint(ctx context.Context, r *taskAPI.CheckpointTaskRequest) (*types.Empty, error) { return empty, errdefs.ErrNotImplemented } // Restore restores the container. func (s *runscService) Restore(ctx context.Context, r *extension.RestoreRequest) (*taskAPI.StartResponse, error) { p, err := s.getProcess(r.Start.ExecID) if err != nil { return nil, err } if err := p.Restore(ctx, &r.Conf); err != nil { return nil, err } // TODO: Set the cgroup and oom notifications on restore. // https://github.com/google/gvisor-containerd-shim/issues/58 return &taskAPI.StartResponse{ Pid: uint32(p.Pid()), }, nil } // Connect returns shim information such as the shim's pid. func (s *runscService) Connect(ctx context.Context, r *taskAPI.ConnectRequest) (*taskAPI.ConnectResponse, error) { var pid int if s.task != nil { pid = s.task.Pid() } return &taskAPI.ConnectResponse{ ShimPid: uint32(os.Getpid()), TaskPid: uint32(pid), }, nil } func (s *runscService) Shutdown(ctx context.Context, r *taskAPI.ShutdownRequest) (*types.Empty, error) { return nil, nil } func (s *runscService) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) { if s.task == nil { log.L.Debugf("Stats error, id: %s: container not created", r.ID) return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } stats, err := s.task.Stats(ctx, s.id) if err != nil { log.L.Debugf("Stats error, id: %s: %v", r.ID, err) return nil, err } // gvisor currently (as of 2020-03-03) only returns the total memory // usage and current PID value[0]. However, we copy the common fields here // so that future updates will propagate correct information. We're // using the cgroups.Metrics structure so we're returning the same type // as runc. // // [0]: https://github.com/google/gvisor/blob/277a0d5a1fbe8272d4729c01ee4c6e374d047ebc/runsc/boot/events.go#L61-L81 metrics := &cgroupsstats.Metrics{ CPU: &cgroupsstats.CPUStat{ Usage: &cgroupsstats.CPUUsage{ Total: stats.Cpu.Usage.Total, Kernel: stats.Cpu.Usage.Kernel, User: stats.Cpu.Usage.User, PerCPU: stats.Cpu.Usage.Percpu, }, Throttling: &cgroupsstats.Throttle{ Periods: stats.Cpu.Throttling.Periods, ThrottledPeriods: stats.Cpu.Throttling.ThrottledPeriods, ThrottledTime: stats.Cpu.Throttling.ThrottledTime, }, }, Memory: &cgroupsstats.MemoryStat{ Cache: stats.Memory.Cache, Usage: &cgroupsstats.MemoryEntry{ Limit: stats.Memory.Usage.Limit, Usage: stats.Memory.Usage.Usage, Max: stats.Memory.Usage.Max, Failcnt: stats.Memory.Usage.Failcnt, }, Swap: &cgroupsstats.MemoryEntry{ Limit: stats.Memory.Swap.Limit, Usage: stats.Memory.Swap.Usage, Max: stats.Memory.Swap.Max, Failcnt: stats.Memory.Swap.Failcnt, }, Kernel: &cgroupsstats.MemoryEntry{ Limit: stats.Memory.Kernel.Limit, Usage: stats.Memory.Kernel.Usage, Max: stats.Memory.Kernel.Max, Failcnt: stats.Memory.Kernel.Failcnt, }, KernelTCP: &cgroupsstats.MemoryEntry{ Limit: stats.Memory.KernelTCP.Limit, Usage: stats.Memory.KernelTCP.Usage, Max: stats.Memory.KernelTCP.Max, Failcnt: stats.Memory.KernelTCP.Failcnt, }, }, Pids: &cgroupsstats.PidsStat{ Current: stats.Pids.Current, Limit: stats.Pids.Limit, }, } data, err := typeurl.MarshalAny(metrics) if err != nil { log.L.Debugf("Stats error, id: %s: %v", r.ID, err) return nil, err } log.L.Debugf("Stats success, id: %s: %+v", r.ID, data) return &taskAPI.StatsResponse{ Stats: data, }, nil } // Update updates a running container. 
func (s *runscService) Update(ctx context.Context, r *taskAPI.UpdateTaskRequest) (*types.Empty, error) { return empty, errdefs.ErrNotImplemented } // Wait waits for the container to exit. func (s *runscService) Wait(ctx context.Context, r *taskAPI.WaitRequest) (*taskAPI.WaitResponse, error) { p, err := s.getProcess(r.ExecID) if err != nil { log.L.Debugf("Wait failed to find process: %v", err) return nil, err } p.Wait() res := &taskAPI.WaitResponse{ ExitStatus: uint32(p.ExitStatus()), ExitedAt: p.ExitedAt(), } log.L.Debugf("Wait succeeded, response: %+v", res) return res, nil } func (s *runscService) processExits(ctx context.Context) { for e := range s.ec { s.checkProcesses(ctx, e) } } func (s *runscService) checkProcesses(ctx context.Context, e proc.Exit) { // TODO(random-liu): Add `shouldKillAll` logic if container pid // namespace is supported. for _, p := range s.allProcesses() { if p.ID() == e.ID { if ip, ok := p.(*proc.Init); ok { // Ensure all children are killed. log.L.Debugf("Container init process exited, killing all container processes") ip.KillAll(ctx) } p.SetExited(e.Status) s.events <- &events.TaskExit{ ContainerID: s.id, ID: p.ID(), Pid: uint32(p.Pid()), ExitStatus: uint32(e.Status), ExitedAt: p.ExitedAt(), } return } } } func (s *runscService) allProcesses() (o []process.Process) { s.mu.Lock() defer s.mu.Unlock() for _, p := range s.processes { o = append(o, p) } if s.task != nil { o = append(o, s.task) } return o } func (s *runscService) getContainerPids(ctx context.Context, id string) ([]uint32, error) { s.mu.Lock() p := s.task s.mu.Unlock() if p == nil { return nil, fmt.Errorf("container must be created: %w", errdefs.ErrFailedPrecondition) } ps, err := p.Runtime().Ps(ctx, id) if err != nil { return nil, err } pids := make([]uint32, 0, len(ps)) for _, pid := range ps { pids = append(pids, uint32(pid)) } return pids, nil } func (s *runscService) forward(ctx context.Context, publisher shim.Publisher) { for e := range s.events { err := publisher.Publish(ctx, getTopic(e), e) if err != nil { // Should not happen. 
panic(fmt.Errorf("post event: %w", err)) } } } func (s *runscService) getProcess(execID string) (extension.Process, error) { s.mu.Lock() defer s.mu.Unlock() if execID == "" { if s.task == nil { return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } return s.task, nil } p := s.processes[execID] if p == nil { return nil, errdefs.ToGRPCf(errdefs.ErrNotFound, "process does not exist %s", execID) } return p, nil } func getTopic(e any) string { switch e.(type) { case *events.TaskCreate: return runtime.TaskCreateEventTopic case *events.TaskStart: return runtime.TaskStartEventTopic case *events.TaskOOM: return runtime.TaskOOMEventTopic case *events.TaskExit: return runtime.TaskExitEventTopic case *events.TaskDelete: return runtime.TaskDeleteEventTopic case *events.TaskExecAdded: return runtime.TaskExecAddedEventTopic case *events.TaskExecStarted: return runtime.TaskExecStartedEventTopic default: log.L.Infof("no topic for type %#v", e) } return runtime.TaskUnknownTopic } func newInit(path, workDir, namespace string, platform stdio.Platform, r *proc.CreateConfig, options *options, rootfs string) (*proc.Init, error) { spec, err := utils.ReadSpec(r.Bundle) if err != nil { return nil, fmt.Errorf("read oci spec: %w", err) } updated, err := utils.UpdateVolumeAnnotations(spec) if err != nil { return nil, fmt.Errorf("update volume annotations: %w", err) } updated = setPodCgroup(spec) || updated if updated { if err := utils.WriteSpec(r.Bundle, spec); err != nil { return nil, err } } runsccmd.FormatRunscPaths(r.ID, options.RunscConfig) runtime := proc.NewRunsc(options.Root, path, namespace, options.BinaryName, options.RunscConfig, spec) p := proc.New(r.ID, runtime, stdio.Stdio{ Stdin: r.Stdin, Stdout: r.Stdout, Stderr: r.Stderr, Terminal: r.Terminal, }) p.Bundle = r.Bundle p.Platform = platform p.Rootfs = rootfs p.WorkDir = workDir p.IoUID = int(options.IoUID) p.IoGID = int(options.IoGID) p.Sandbox = specutils.SpecContainerType(spec) == specutils.ContainerTypeSandbox p.UserLog = utils.UserLogPath(spec) p.Monitor = reaper.Default return p, nil } // setPodCgroup searches for the pod cgroup path inside the container's cgroup // path. If found, it's set as an annotation in the spec. This is done so that // the sandbox joins the pod cgroup. Otherwise, the sandbox would join the pause // container cgroup. Returns true if the spec was modified. Ex.: // /kubepods/burstable/pod123/container123 => kubepods/burstable/pod123 func setPodCgroup(spec *specs.Spec) bool { if !utils.IsSandbox(spec) { return false } if spec.Linux == nil || len(spec.Linux.CgroupsPath) == 0 { return false } // Search backwards for the pod cgroup path to make the sandbox use it, // instead of the pause container's cgroup. parts := strings.Split(spec.Linux.CgroupsPath, string(filepath.Separator)) for i := len(parts) - 1; i >= 0; i-- { if strings.HasPrefix(parts[i], "pod") { var path string for j := 0; j <= i; j++ { path = filepath.Join(path, parts[j]) } // Add back the initial '/' that may have been lost above. if filepath.IsAbs(spec.Linux.CgroupsPath) { path = string(filepath.Separator) + path } if spec.Linux.CgroupsPath == path { return false } if spec.Annotations == nil { spec.Annotations = make(map[string]string) } spec.Annotations[cgroupParentAnnotation] = path return true } } return false } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsc/service_linux.go000066400000000000000000000051721465435605700247310ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. 
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package runsc import ( "context" "fmt" "io" "sync" "github.com/containerd/console" "github.com/containerd/fifo" "golang.org/x/sys/unix" ) type linuxPlatform struct { epoller *console.Epoller } func (p *linuxPlatform) CopyConsole(ctx context.Context, console console.Console, stdin, stdout, stderr string, wg *sync.WaitGroup) (console.Console, error) { if p.epoller == nil { return nil, fmt.Errorf("uninitialized epoller") } epollConsole, err := p.epoller.Add(console) if err != nil { return nil, err } if stdin != "" { in, err := fifo.OpenFifo(context.Background(), stdin, unix.O_RDONLY|unix.O_NONBLOCK, 0) if err != nil { return nil, err } go func() { p := bufPool.Get().(*[]byte) defer bufPool.Put(p) io.CopyBuffer(epollConsole, in, *p) }() } outw, err := fifo.OpenFifo(ctx, stdout, unix.O_WRONLY, 0) if err != nil { return nil, err } outr, err := fifo.OpenFifo(ctx, stdout, unix.O_RDONLY, 0) if err != nil { return nil, err } wg.Add(1) go func() { p := bufPool.Get().(*[]byte) defer bufPool.Put(p) io.CopyBuffer(outw, epollConsole, *p) epollConsole.Close() outr.Close() outw.Close() wg.Done() }() return epollConsole, nil } func (p *linuxPlatform) ShutdownConsole(ctx context.Context, cons console.Console) error { if p.epoller == nil { return fmt.Errorf("uninitialized epoller") } epollConsole, ok := cons.(*console.EpollConsole) if !ok { return fmt.Errorf("expected EpollConsole, got %#v", cons) } return epollConsole.Shutdown(p.epoller.CloseConsole) } func (p *linuxPlatform) Close() error { return p.epoller.Close() } // initialize a single epoll fd to manage our consoles. `initPlatform` should // only be called once. func (s *runscService) initPlatform() error { if s.platform != nil { return nil } epoller, err := console.NewEpoller() if err != nil { return fmt.Errorf("failed to initialize epoller: %w", err) } s.platform = &linuxPlatform{ epoller: epoller, } go epoller.Wait() return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsc/state.go000066400000000000000000000024501465435605700231660ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package runsc import ( "encoding/json" "io/ioutil" "path/filepath" ) const filename = "state.json" // state holds information needed between shim invocations. type state struct { // Rootfs is the full path to the location rootfs was mounted. 
Rootfs string `json:"rootfs"` // Options is the configuration loaded from config.toml. Options options `json:"options"` } func (s state) load(path string) error { data, err := ioutil.ReadFile(filepath.Join(path, filename)) if err != nil { return err } return json.Unmarshal(data, &s) } func (s state) save(path string) error { data, err := json.Marshal(&s) if err != nil { return err } return ioutil.WriteFile(filepath.Join(path, filename), data, 0644) } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsccmd/000077500000000000000000000000001465435605700222025ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsccmd/runsc.go000066400000000000000000000364321465435605700236730ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package runsccmd provides an API to interact with runsc command line. package runsccmd import ( "bytes" "context" "encoding/json" "fmt" "io" "io/ioutil" "os" "os/exec" "path/filepath" "strconv" "time" "github.com/containerd/containerd/log" runc "github.com/containerd/go-runc" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" ) // DefaultCommand is the default command for Runsc. const DefaultCommand = "runsc" // ProcessMonitor is a subset of runc.ProcessMonitor. It does not include // StartLocked(), which was added in containerd/runc v1.1.1. This is so that // we can continue using containerd/containerd v1.4.13 with newer // containerd/runc versions without breaking build. type ProcessMonitor interface { Start(cmd *exec.Cmd) (chan runc.Exit, error) Wait(cmd *exec.Cmd, ch chan runc.Exit) (int, error) } // Monitor is the default process monitor to be used by runsc. var Monitor ProcessMonitor = &LogMonitor{Next: runc.Monitor} // LogMonitor implements the runc.ProcessMonitor interface, logging the command // that is getting executed, and then forwarding the call to another // implementation. type LogMonitor struct { Next ProcessMonitor } // Start implements runc.ProcessMonitor. func (l *LogMonitor) Start(cmd *exec.Cmd) (chan runc.Exit, error) { log.L.Debugf("Executing: %s", cmd.Args) return l.Next.Start(cmd) } // Wait implements runc.ProcessMonitor. func (l *LogMonitor) Wait(cmd *exec.Cmd, ch chan runc.Exit) (int, error) { status, err := l.Next.Wait(cmd, ch) log.L.Debugf("Command exit code: %d, err: %v", status, err) return status, err } // Runsc is the client to the runsc cli. type Runsc struct { Command string PdeathSignal unix.Signal Setpgid bool Root string Log string LogFormat runc.Format PanicLog string Config map[string]string } // List returns all containers created inside the provided runsc root directory. 
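//
// A minimal usage sketch (the root directory and the logging in the loop body
// are illustrative, not part of this package):
//
//	r := &Runsc{Root: "/run/containerd/runsc/k8s.io"}
//	containers, err := r.List(ctx)
//	if err != nil {
//		return err
//	}
//	for _, c := range containers {
//		log.L.Debugf("container %s is %s", c.ID, c.Status)
//	}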
func (r *Runsc) List(context context.Context) ([]*runc.Container, error) { data, stderr, err := cmdOutput(r.command(context, "list", "--format=json"), false) if err != nil { return nil, fmt.Errorf("%w: %s", err, stderr) } var out []*runc.Container if err := json.Unmarshal(data, &out); err != nil { return nil, err } return out, nil } // State returns the state for the container provided by id. func (r *Runsc) State(context context.Context, id string) (*runc.Container, error) { data, stderr, err := cmdOutput(r.command(context, "state", id), false) if err != nil { return nil, fmt.Errorf("%w: %s", err, stderr) } var c runc.Container if err := json.Unmarshal(data, &c); err != nil { return nil, err } return &c, nil } // CreateOpts is a set of options to Runsc.Create(). type CreateOpts struct { runc.IO ConsoleSocket runc.ConsoleSocket // PidFile is a path to where a pid file should be created. PidFile string // UserLog is a path to where runsc user log should be generated. UserLog string } func (o *CreateOpts) args() (out []string, err error) { if o.PidFile != "" { abs, err := filepath.Abs(o.PidFile) if err != nil { return nil, err } out = append(out, "--pid-file", abs) } if o.ConsoleSocket != nil { out = append(out, "--console-socket", o.ConsoleSocket.Path()) } if o.UserLog != "" { out = append(out, "--user-log", o.UserLog) } return out, nil } // Create creates a new container and returns its pid if it was created successfully. func (r *Runsc) Create(context context.Context, id, bundle string, opts *CreateOpts) error { args := []string{"create", "--bundle", bundle} if opts != nil { oargs, err := opts.args() if err != nil { return err } args = append(args, oargs...) } cmd := r.command(context, append(args, id)...) if opts != nil && opts.IO != nil { opts.Set(cmd) } if cmd.Stdout == nil && cmd.Stderr == nil { out, _, err := cmdOutput(cmd, true) if err != nil { return fmt.Errorf("%w: %s", err, out) } return nil } ec, err := Monitor.Start(cmd) if err != nil { return err } if opts != nil && opts.IO != nil { if c, ok := opts.IO.(runc.StartCloser); ok { if err := c.CloseAfterStart(); err != nil { return err } } } status, err := Monitor.Wait(cmd, ec) if err == nil && status != 0 { err = fmt.Errorf("%s did not terminate successfully", cmd.Args[0]) } return err } func (r *Runsc) Pause(context context.Context, id string) error { if out, _, err := cmdOutput(r.command(context, "pause", id), true); err != nil { return fmt.Errorf("unable to pause: %w: %s", err, out) } return nil } func (r *Runsc) Resume(context context.Context, id string) error { if out, _, err := cmdOutput(r.command(context, "resume", id), true); err != nil { return fmt.Errorf("unable to resume: %w: %s", err, out) } return nil } // Start will start an already created container. 
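//
// A hedged usage sketch (the container ID is illustrative; passing a nil
// runc.IO is valid when no I/O forwarding is needed):
//
//	if err := r.Start(ctx, "my-container", nil); err != nil {
//		return fmt.Errorf("start failed: %w", err)
//	}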
func (r *Runsc) Start(context context.Context, id string, cio runc.IO) error { return r.start(context, cio, r.command(context, "start", id)) } func (r *Runsc) start(context context.Context, cio runc.IO, cmd *exec.Cmd) error { if cio != nil { cio.Set(cmd) } if cmd.Stdout == nil && cmd.Stderr == nil { out, _, err := cmdOutput(cmd, true) if err != nil { return fmt.Errorf("%w: %s", err, out) } return nil } ec, err := Monitor.Start(cmd) if err != nil { return err } if cio != nil { if c, ok := cio.(runc.StartCloser); ok { if err := c.CloseAfterStart(); err != nil { return err } } } status, err := Monitor.Wait(cmd, ec) if err == nil && status != 0 { err = fmt.Errorf("%s did not terminate successfully", cmd.Args[0]) } return err } // RestoreOpts is a set of options to runsc.Restore(). type RestoreOpts struct { ImagePath string Detach bool Direct bool } func (o *RestoreOpts) args() []string { var out []string if o.ImagePath != "" { out = append(out, fmt.Sprintf("--image-path=%s", o.ImagePath)) } if o.Detach { out = append(out, "--detach") } if o.Direct { out = append(out, "--direct") } return out } // Restore will restore an already created container. func (r *Runsc) Restore(context context.Context, id string, cio runc.IO, opts *RestoreOpts) error { args := []string{"restore"} if opts != nil { args = append(args, opts.args()...) } return r.start(context, cio, r.command(context, append(args, id)...)) } type waitResult struct { ID string `json:"id"` ExitStatus int `json:"exitStatus"` } // Wait will wait for a running container, and return its exit status. func (r *Runsc) Wait(context context.Context, id string) (int, error) { data, stderr, err := cmdOutput(r.command(context, "wait", id), false) if err != nil { return 0, fmt.Errorf("%w: %s", err, stderr) } var res waitResult if err := json.Unmarshal(data, &res); err != nil { return 0, err } return res.ExitStatus, nil } // ExecOpts is a set of options to runsc.Exec(). type ExecOpts struct { runc.IO PidFile string InternalPidFile string ConsoleSocket runc.ConsoleSocket Detach bool } func (o *ExecOpts) args() (out []string, err error) { if o.ConsoleSocket != nil { out = append(out, "--console-socket", o.ConsoleSocket.Path()) } if o.Detach { out = append(out, "--detach") } if o.PidFile != "" { abs, err := filepath.Abs(o.PidFile) if err != nil { return nil, err } out = append(out, "--pid-file", abs) } if o.InternalPidFile != "" { abs, err := filepath.Abs(o.InternalPidFile) if err != nil { return nil, err } out = append(out, "--internal-pid-file", abs) } return out, nil } // Exec executes an additional process inside the container based on a full OCI // Process specification. func (r *Runsc) Exec(context context.Context, id string, spec specs.Process, opts *ExecOpts) error { f, err := ioutil.TempFile(os.Getenv("XDG_RUNTIME_DIR"), "runsc-process") if err != nil { return err } defer os.Remove(f.Name()) err = json.NewEncoder(f).Encode(spec) f.Close() if err != nil { return err } args := []string{"exec", "--process", f.Name()} if opts != nil { oargs, err := opts.args() if err != nil { return err } args = append(args, oargs...) } cmd := r.command(context, append(args, id)...) 
if opts != nil && opts.IO != nil { opts.Set(cmd) } if cmd.Stdout == nil && cmd.Stderr == nil { out, _, err := cmdOutput(cmd, true) if err != nil { return fmt.Errorf("%w: %s", err, out) } return nil } ec, err := Monitor.Start(cmd) if err != nil { return err } if opts != nil && opts.IO != nil { if c, ok := opts.IO.(runc.StartCloser); ok { if err := c.CloseAfterStart(); err != nil { return err } } } status, err := Monitor.Wait(cmd, ec) if err == nil && status != 0 { err = fmt.Errorf("%s did not terminate successfully", cmd.Args[0]) } return err } // Run runs the create, start, delete lifecycle of the container and returns // its exit status after it has exited. func (r *Runsc) Run(context context.Context, id, bundle string, opts *CreateOpts) (int, error) { args := []string{"run", "--bundle", bundle} if opts != nil { oargs, err := opts.args() if err != nil { return -1, err } args = append(args, oargs...) } cmd := r.command(context, append(args, id)...) if opts != nil && opts.IO != nil { opts.Set(cmd) } ec, err := Monitor.Start(cmd) if err != nil { return -1, err } return Monitor.Wait(cmd, ec) } // DeleteOpts is a set of options to runsc.Delete(). type DeleteOpts struct { Force bool } func (o *DeleteOpts) args() (out []string) { if o.Force { out = append(out, "--force") } return out } // Delete deletes the container. func (r *Runsc) Delete(context context.Context, id string, opts *DeleteOpts) error { args := []string{"delete"} if opts != nil { args = append(args, opts.args()...) } return r.runOrError(r.command(context, append(args, id)...)) } // KillOpts specifies options for killing a container and its processes. type KillOpts struct { All bool Pid int } func (o *KillOpts) args() (out []string) { if o.All { out = append(out, "--all") } if o.Pid != 0 { out = append(out, "--pid", strconv.Itoa(o.Pid)) } return out } // Kill sends the specified signal to the container. func (r *Runsc) Kill(context context.Context, id string, sig int, opts *KillOpts) error { args := []string{ "kill", } if opts != nil { args = append(args, opts.args()...) } return r.runOrError(r.command(context, append(args, id, strconv.Itoa(sig))...)) } // Stats return the stats for a container like cpu, memory, and I/O. func (r *Runsc) Stats(context context.Context, id string) (*runc.Stats, error) { cmd := r.command(context, "events", "--stats", id) data, stderr, err := cmdOutput(cmd, false) if err != nil { return nil, fmt.Errorf("%w: %s", err, stderr) } var e runc.Event if err := json.Unmarshal(data, &e); err != nil { log.L.Debugf("Parsing events error: %v", err) return nil, err } log.L.Debugf("Stats returned, type: %s, stats: %+v", e.Type, e.Stats) if e.Type != "stats" { return nil, fmt.Errorf(`unexpected event type %q, wanted "stats"`, e.Type) } if e.Stats == nil { return nil, fmt.Errorf(`"runsc events -stat" succeeded but no stat was provided`) } return e.Stats, nil } // Events returns an event stream from runsc for a container with stats and OOM notifications. 
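//
// A minimal consumption sketch (the ID, interval, and "oom" type check are
// illustrative). The returned channel is closed once the underlying
// "runsc events" command exits:
//
//	ch, err := r.Events(ctx, "my-container", 5*time.Second)
//	if err != nil {
//		return err
//	}
//	for e := range ch {
//		if e.Type == "oom" {
//			// handle OOM notification
//		}
//	}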
func (r *Runsc) Events(context context.Context, id string, interval time.Duration) (chan *runc.Event, error) { cmd := r.command(context, "events", fmt.Sprintf("--interval=%ds", int(interval.Seconds())), id) rd, err := cmd.StdoutPipe() if err != nil { return nil, err } ec, err := Monitor.Start(cmd) if err != nil { rd.Close() return nil, err } var ( dec = json.NewDecoder(rd) c = make(chan *runc.Event, 128) ) go func() { defer func() { close(c) rd.Close() Monitor.Wait(cmd, ec) }() for { var e runc.Event if err := dec.Decode(&e); err != nil { if err == io.EOF { return } e = runc.Event{ Type: "error", Err: err, } } c <- &e } }() return c, nil } // Ps lists all the processes inside the container returning their pids. func (r *Runsc) Ps(context context.Context, id string) ([]int, error) { data, stderr, err := cmdOutput(r.command(context, "ps", "--format", "json", id), false) if err != nil { return nil, fmt.Errorf("%w: %s", err, stderr) } var pids []int if err := json.Unmarshal(data, &pids); err != nil { return nil, err } return pids, nil } // Top lists all the processes inside the container returning the full ps data. func (r *Runsc) Top(context context.Context, id string) (*runc.TopResults, error) { data, stderr, err := cmdOutput(r.command(context, "ps", "--format", "table", id), false) if err != nil { return nil, fmt.Errorf("%w: %s", err, stderr) } topResults, err := runc.ParsePSOutput(data) if err != nil { return nil, fmt.Errorf("%s: ", err) } return topResults, nil } func (r *Runsc) args() []string { var args []string if r.Root != "" { args = append(args, fmt.Sprintf("--root=%s", r.Root)) } if r.Log != "" { args = append(args, fmt.Sprintf("--log=%s", r.Log)) } if r.LogFormat != "" { args = append(args, fmt.Sprintf("--log-format=%s", r.LogFormat)) } if r.PanicLog != "" { args = append(args, fmt.Sprintf("--panic-log=%s", r.PanicLog)) } for k, v := range r.Config { args = append(args, fmt.Sprintf("--%s=%s", k, v)) } return args } // runOrError will run the provided command. // // If an error is encountered and neither Stdout or Stderr was set the error // will be returned in the format of : . func (r *Runsc) runOrError(cmd *exec.Cmd) error { if cmd.Stdout != nil || cmd.Stderr != nil { ec, err := Monitor.Start(cmd) if err != nil { return err } status, err := Monitor.Wait(cmd, ec) if err == nil && status != 0 { err = fmt.Errorf("%s did not terminate successfully", cmd.Args[0]) } return err } out, _, err := cmdOutput(cmd, true) if err != nil { return fmt.Errorf("%w: %s", err, out) } return nil } func (r *Runsc) command(context context.Context, args ...string) *exec.Cmd { command := r.Command if command == "" { command = DefaultCommand } cmd := exec.CommandContext(context, command, append(r.args(), args...)...) 
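	// At this point cmd.Args has the form:
	//   runsc [global flags from r.args()] <subcommand> [subcommand args...]
	// e.g. (illustrative): runsc --root=/run/runsc --log-format=json state <id>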
cmd.SysProcAttr = &unix.SysProcAttr{ Setpgid: r.Setpgid, } if r.PdeathSignal != 0 { cmd.SysProcAttr.Pdeathsig = r.PdeathSignal } return cmd } func cmdOutput(cmd *exec.Cmd, combined bool) ([]byte, []byte, error) { stdout := getBuf() defer putBuf(stdout) cmd.Stdout = stdout cmd.Stderr = stdout var stderr *bytes.Buffer if !combined { stderr = getBuf() defer putBuf(stderr) cmd.Stderr = stderr } ec, err := Monitor.Start(cmd) if err != nil { return nil, nil, err } status, err := Monitor.Wait(cmd, ec) if err == nil && status != 0 { err = fmt.Errorf("%q did not terminate successfully", cmd.Args[0]) } if stderr == nil { return stdout.Bytes(), nil, err } return stdout.Bytes(), stderr.Bytes(), err } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsccmd/runsccmd_state_autogen.go000066400000000000000000000000721465435605700272700ustar00rootroot00000000000000// automatically generated by stateify. package runsccmd golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runsccmd/utils.go000066400000000000000000000044501465435605700236740ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package runsccmd import ( "bytes" "strings" "sync" ) var bytesBufferPool = sync.Pool{ New: func() any { return bytes.NewBuffer(nil) }, } func getBuf() *bytes.Buffer { return bytesBufferPool.Get().(*bytes.Buffer) } func putBuf(b *bytes.Buffer) { b.Reset() bytesBufferPool.Put(b) } // pathLikeFlags are runsc flags which refer to paths to files. var pathLikeFlags = []string{ "log", "panic-log", "debug-log", "coverage-report", "profile-block", "profile-cpu", "profile-heap", "profile-mutex", "trace", } // replaceID replaces %ID% in `path` with the given sandbox ID. func replaceID(id string, path string) string { return strings.Replace(path, "%ID%", id, -1) } // EmittedPaths returns a list of file paths that the sandbox may need to // create using the given configuration. Useful to create parent directories. func EmittedPaths(id string, config map[string]string) []string { var paths []string for _, cfgFlag := range pathLikeFlags { if path, ok := config[cfgFlag]; ok { paths = append(paths, replaceID(id, path)) } } return paths } // FormatRunscPaths fills in %ID% in path-like flags. func FormatRunscPaths(id string, config map[string]string) { for _, cfgFlag := range pathLikeFlags { if path, ok := config[cfgFlag]; ok { config[cfgFlag] = replaceID(id, path) } } } // FormatShimLogPath creates the file path to the log file. It replaces %ID% // in the path with the provided "id". It also uses a default log name if the // path ends with '/'. 
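//
// Examples (paths and IDs are illustrative):
//
//	FormatShimLogPath("/var/log/pods/abc/", "123")
//	// -> "/var/log/pods/abc/runsc-shim-123.log"
//	FormatShimLogPath("/var/log/pods/abc/shim-%ID%.log", "123")
//	// -> "/var/log/pods/abc/shim-123.log"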
func FormatShimLogPath(path string, id string) string { if strings.HasSuffix(path, "/") { // Default format: /runsc-shim-.log path += "runsc-shim-%ID%.log" } return replaceID(id, path) } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runtimeoptions/000077500000000000000000000000001465435605700234635ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runtimeoptions/runtimeoptions.go000066400000000000000000000013641465435605700271150ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package runtimeoptions contains the runtimeoptions proto for containerd 1.5 // and above. package runtimeoptions golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runtimeoptions/runtimeoptions_cri.go000066400000000000000000000240001465435605700277420ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package runtimeoptions import ( "fmt" "io" math_bits "math/bits" "reflect" "strings" "github.com/gogo/protobuf/proto" ) // This is a compile-time assertion to ensure that this generated file // is compatible with the proto package it is being compiled against. // A compilation error at this line likely means your copy of the // proto package needs to be updated. const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package type Options struct { // TypeUrl specifies the type of the content inside the config file. TypeUrl string `protobuf:"bytes,1,opt,name=type_url,json=typeUrl,proto3" json:"type_url,omitempty"` // ConfigPath specifies the filesystem location of the config file // used by the runtime. 
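	//
	// For the runsc shim this field is commonly populated via the containerd
	// CRI runtime options, e.g. (illustrative snippet; the paths are
	// assumptions, not defaults shipped by this package):
	//
	//	[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runsc.options]
	//	  TypeUrl = "io.containerd.runsc.v1.options"
	//	  ConfigPath = "/etc/containerd/runsc.toml"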
ConfigPath string `protobuf:"bytes,2,opt,name=config_path,json=configPath,proto3" json:"config_path,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_sizecache int32 `json:"-"` } func (m *Options) Reset() { *m = Options{} } func (*Options) ProtoMessage() {} func (*Options) Descriptor() ([]byte, []int) { return fileDescriptor_7700dd27e3487aa6, []int{0} } func (m *Options) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) } func (m *Options) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { if deterministic { return xxx_messageInfo_Options.Marshal(b, m, deterministic) } else { b = b[:cap(b)] n, err := m.MarshalToSizedBuffer(b) if err != nil { return nil, err } return b[:n], nil } } func (m *Options) XXX_Merge(src proto.Message) { xxx_messageInfo_Options.Merge(m, src) } func (m *Options) XXX_Size() int { return m.Size() } func (m *Options) XXX_DiscardUnknown() { xxx_messageInfo_Options.DiscardUnknown(m) } var xxx_messageInfo_Options proto.InternalMessageInfo func (m *Options) GetTypeUrl() string { if m != nil { return m.TypeUrl } return "" } func (m *Options) GetConfigPath() string { if m != nil { return m.ConfigPath } return "" } func init() { proto.RegisterType((*Options)(nil), "runtimeoptions.v1.Options") } func init() { proto.RegisterFile("github.com/containerd/containerd/pkg/runtimeoptions/v1/api.proto", fileDescriptor_7700dd27e3487aa6) } var fileDescriptor_7700dd27e3487aa6 = []byte{ // 214 bytes of a gzipped FileDescriptorProto 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x72, 0x48, 0xcf, 0x2c, 0xc9, 0x28, 0x4d, 0xd2, 0x4b, 0xce, 0xcf, 0xd5, 0x4f, 0xce, 0xcf, 0x2b, 0x49, 0xcc, 0xcc, 0x4b, 0x2d, 0x4a, 0x41, 0x66, 0x16, 0x64, 0xa7, 0xeb, 0x17, 0x95, 0xe6, 0x95, 0x64, 0xe6, 0xa6, 0xe6, 0x17, 0x94, 0x64, 0xe6, 0xe7, 0x15, 0xeb, 0x97, 0x19, 0xea, 0x27, 0x16, 0x64, 0xea, 0x15, 0x14, 0xe5, 0x97, 0xe4, 0x0b, 0x09, 0xa2, 0x4a, 0xea, 0x95, 0x19, 0x4a, 0xe9, 0x22, 0x19, 0x9a, 0x9e, 0x9f, 0x9e, 0xaf, 0x0f, 0x56, 0x99, 0x54, 0x9a, 0x06, 0xe6, 0x81, 0x39, 0x60, 0x16, 0xc4, 0x04, 0x25, 0x57, 0x2e, 0x76, 0x7f, 0x88, 0x66, 0x21, 0x49, 0x2e, 0x8e, 0x92, 0xca, 0x82, 0xd4, 0xf8, 0xd2, 0xa2, 0x1c, 0x09, 0x46, 0x05, 0x46, 0x0d, 0xce, 0x20, 0x76, 0x10, 0x3f, 0xb4, 0x28, 0x47, 0x48, 0x9e, 0x8b, 0x3b, 0x39, 0x3f, 0x2f, 0x2d, 0x33, 0x3d, 0xbe, 0x20, 0xb1, 0x24, 0x43, 0x82, 0x09, 0x2c, 0xcb, 0x05, 0x11, 0x0a, 0x48, 0x2c, 0xc9, 0x70, 0x4a, 0x3b, 0xf1, 0x50, 0x8e, 0xf1, 0xc6, 0x43, 0x39, 0x86, 0x86, 0x47, 0x72, 0x8c, 0x27, 0x1e, 0xc9, 0x31, 0x5e, 0x78, 0x24, 0xc7, 0xf8, 0xe0, 0x91, 0x1c, 0xe3, 0x84, 0xc7, 0x72, 0x0c, 0x51, 0x1e, 0xe4, 0x79, 0xd4, 0x1a, 0x55, 0x24, 0xbe, 0xcc, 0x30, 0x89, 0x0d, 0xec, 0x6a, 0x63, 0x40, 0x00, 0x00, 0x00, 0xff, 0xff, 0x91, 0x3c, 0x3e, 0x79, 0x3b, 0x01, 0x00, 0x00, } func (m *Options) Marshal() (dAtA []byte, err error) { size := m.Size() dAtA = make([]byte, size) n, err := m.MarshalToSizedBuffer(dAtA[:size]) if err != nil { return nil, err } return dAtA[:n], nil } func (m *Options) MarshalTo(dAtA []byte) (int, error) { size := m.Size() return m.MarshalToSizedBuffer(dAtA[:size]) } func (m *Options) MarshalToSizedBuffer(dAtA []byte) (int, error) { i := len(dAtA) _ = i var l int _ = l if len(m.ConfigPath) > 0 { i -= len(m.ConfigPath) copy(dAtA[i:], m.ConfigPath) i = encodeVarintApi(dAtA, i, uint64(len(m.ConfigPath))) i-- dAtA[i] = 0x12 } if len(m.TypeUrl) > 0 { i -= len(m.TypeUrl) copy(dAtA[i:], m.TypeUrl) i = encodeVarintApi(dAtA, i, uint64(len(m.TypeUrl))) i-- dAtA[i] = 0xa } return len(dAtA) - i, nil } func encodeVarintApi(dAtA []byte, offset 
int, v uint64) int { offset -= sovApi(v) base := offset for v >= 1<<7 { dAtA[offset] = uint8(v&0x7f | 0x80) v >>= 7 offset++ } dAtA[offset] = uint8(v) return base } func (m *Options) Size() (n int) { if m == nil { return 0 } var l int _ = l l = len(m.TypeUrl) if l > 0 { n += 1 + l + sovApi(uint64(l)) } l = len(m.ConfigPath) if l > 0 { n += 1 + l + sovApi(uint64(l)) } return n } func sovApi(x uint64) (n int) { return (math_bits.Len64(x|1) + 6) / 7 } func sozApi(x uint64) (n int) { return sovApi(uint64((x << 1) ^ uint64((int64(x) >> 63)))) } func (this *Options) String() string { if this == nil { return "nil" } s := strings.Join([]string{`&Options{`, `TypeUrl:` + fmt.Sprintf("%v", this.TypeUrl) + `,`, `ConfigPath:` + fmt.Sprintf("%v", this.ConfigPath) + `,`, `}`, }, "") return s } func valueToStringApi(v interface{}) string { rv := reflect.ValueOf(v) if rv.IsNil() { return "nil" } pv := reflect.Indirect(rv).Interface() return fmt.Sprintf("*%v", pv) } func (m *Options) Unmarshal(dAtA []byte) error { l := len(dAtA) iNdEx := 0 for iNdEx < l { preIndex := iNdEx var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowApi } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= uint64(b&0x7F) << shift if b < 0x80 { break } } fieldNum := int32(wire >> 3) wireType := int(wire & 0x7) if wireType == 4 { return fmt.Errorf("proto: Options: wiretype end group for non-group") } if fieldNum <= 0 { return fmt.Errorf("proto: Options: illegal tag %d (wire type %d)", fieldNum, wire) } switch fieldNum { case 1: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field TypeUrl", wireType) } var stringLen uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowApi } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ stringLen |= uint64(b&0x7F) << shift if b < 0x80 { break } } intStringLen := int(stringLen) if intStringLen < 0 { return ErrInvalidLengthApi } postIndex := iNdEx + intStringLen if postIndex < 0 { return ErrInvalidLengthApi } if postIndex > l { return io.ErrUnexpectedEOF } m.TypeUrl = string(dAtA[iNdEx:postIndex]) iNdEx = postIndex case 2: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field ConfigPath", wireType) } var stringLen uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowApi } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ stringLen |= uint64(b&0x7F) << shift if b < 0x80 { break } } intStringLen := int(stringLen) if intStringLen < 0 { return ErrInvalidLengthApi } postIndex := iNdEx + intStringLen if postIndex < 0 { return ErrInvalidLengthApi } if postIndex > l { return io.ErrUnexpectedEOF } m.ConfigPath = string(dAtA[iNdEx:postIndex]) iNdEx = postIndex default: iNdEx = preIndex skippy, err := skipApi(dAtA[iNdEx:]) if err != nil { return err } if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthApi } if (iNdEx + skippy) > l { return io.ErrUnexpectedEOF } iNdEx += skippy } } if iNdEx > l { return io.ErrUnexpectedEOF } return nil } func skipApi(dAtA []byte) (n int, err error) { l := len(dAtA) iNdEx := 0 depth := 0 for iNdEx < l { var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return 0, ErrIntOverflowApi } if iNdEx >= l { return 0, io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= (uint64(b) & 0x7F) << shift if b < 0x80 { break } } wireType := int(wire & 0x7) switch wireType { case 0: for shift := uint(0); ; shift += 7 { if shift >= 64 { return 0, ErrIntOverflowApi } 
if iNdEx >= l { return 0, io.ErrUnexpectedEOF } iNdEx++ if dAtA[iNdEx-1] < 0x80 { break } } case 1: iNdEx += 8 case 2: var length int for shift := uint(0); ; shift += 7 { if shift >= 64 { return 0, ErrIntOverflowApi } if iNdEx >= l { return 0, io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ length |= (int(b) & 0x7F) << shift if b < 0x80 { break } } if length < 0 { return 0, ErrInvalidLengthApi } iNdEx += length case 3: depth++ case 4: if depth == 0 { return 0, ErrUnexpectedEndOfGroupApi } depth-- case 5: iNdEx += 4 default: return 0, fmt.Errorf("proto: illegal wireType %d", wireType) } if iNdEx < 0 { return 0, ErrInvalidLengthApi } if depth == 0 { return iNdEx, nil } } return 0, io.ErrUnexpectedEOF } var ( ErrInvalidLengthApi = fmt.Errorf("proto: negative length found during unmarshaling") ErrIntOverflowApi = fmt.Errorf("proto: integer overflow") ErrUnexpectedEndOfGroupApi = fmt.Errorf("proto: unexpected end of group") ) golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runtimeoptions/runtimeoptions_state_autogen.go000066400000000000000000000001441465435605700320320ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package runtimeoptions golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runtimeoptions/v14/000077500000000000000000000000001465435605700240755ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runtimeoptions/v14/runtimeoptions.go000066400000000000000000000016251465435605700275270ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package v14 contains the runtimeoptions proto for containerd 1.4 and earlier. // The package for runtimeoptions proto changed from "cri.runtimeoptions.v1" to // "runtimeoptions.v1" in 1.5, So keep both versions until 1.4 doesn't need to // be supported anymore. package v14 golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runtimeoptions/v14/runtimeoptions_cri.go000066400000000000000000000220221465435605700303560ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package v14 import ( "fmt" "io" "reflect" "strings" proto "github.com/gogo/protobuf/proto" ) // This is a compile-time assertion to ensure that this generated file // is compatible with the proto package it is being compiled against. 
// A compilation error at this line likely means your copy of the // proto package needs to be updated. const _ = proto.GoGoProtoPackageIsVersion2 // please upgrade the proto package type Options struct { // TypeUrl specifies the type of the content inside the config file. TypeUrl string `protobuf:"bytes,1,opt,name=type_url,json=typeUrl,proto3" json:"type_url,omitempty"` // ConfigPath specifies the filesystem location of the config file // used by the runtime. ConfigPath string `protobuf:"bytes,2,opt,name=config_path,json=configPath,proto3" json:"config_path,omitempty"` } func (m *Options) Reset() { *m = Options{} } func (*Options) ProtoMessage() {} func (*Options) Descriptor() ([]byte, []int) { return fileDescriptorApi, []int{0} } func (m *Options) GetTypeUrl() string { if m != nil { return m.TypeUrl } return "" } func (m *Options) GetConfigPath() string { if m != nil { return m.ConfigPath } return "" } func init() { proto.RegisterType((*Options)(nil), "cri.runtimeoptions.v1.Options") } func (m *Options) Marshal() (dAtA []byte, err error) { size := m.Size() dAtA = make([]byte, size) n, err := m.MarshalTo(dAtA) if err != nil { return nil, err } return dAtA[:n], nil } func (m *Options) MarshalTo(dAtA []byte) (int, error) { var i int _ = i var l int _ = l if len(m.TypeUrl) > 0 { dAtA[i] = 0xa i++ i = encodeVarintApi(dAtA, i, uint64(len(m.TypeUrl))) i += copy(dAtA[i:], m.TypeUrl) } if len(m.ConfigPath) > 0 { dAtA[i] = 0x12 i++ i = encodeVarintApi(dAtA, i, uint64(len(m.ConfigPath))) i += copy(dAtA[i:], m.ConfigPath) } return i, nil } func encodeVarintApi(dAtA []byte, offset int, v uint64) int { for v >= 1<<7 { dAtA[offset] = uint8(v&0x7f | 0x80) v >>= 7 offset++ } dAtA[offset] = uint8(v) return offset + 1 } func (m *Options) Size() (n int) { var l int _ = l l = len(m.TypeUrl) if l > 0 { n += 1 + l + sovApi(uint64(l)) } l = len(m.ConfigPath) if l > 0 { n += 1 + l + sovApi(uint64(l)) } return n } func sovApi(x uint64) (n int) { for { n++ x >>= 7 if x == 0 { break } } return n } func sozApi(x uint64) (n int) { return sovApi(uint64((x << 1) ^ uint64((int64(x) >> 63)))) } func (this *Options) String() string { if this == nil { return "nil" } s := strings.Join([]string{`&Options{`, `TypeUrl:` + fmt.Sprintf("%v", this.TypeUrl) + `,`, `ConfigPath:` + fmt.Sprintf("%v", this.ConfigPath) + `,`, `}`, }, "") return s } func valueToStringApi(v interface{}) string { rv := reflect.ValueOf(v) if rv.IsNil() { return "nil" } pv := reflect.Indirect(rv).Interface() return fmt.Sprintf("*%v", pv) } func (m *Options) Unmarshal(dAtA []byte) error { l := len(dAtA) iNdEx := 0 for iNdEx < l { preIndex := iNdEx var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowApi } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= (uint64(b) & 0x7F) << shift if b < 0x80 { break } } fieldNum := int32(wire >> 3) wireType := int(wire & 0x7) if wireType == 4 { return fmt.Errorf("proto: Options: wiretype end group for non-group") } if fieldNum <= 0 { return fmt.Errorf("proto: Options: illegal tag %d (wire type %d)", fieldNum, wire) } switch fieldNum { case 1: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field TypeUrl", wireType) } var stringLen uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowApi } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ stringLen |= (uint64(b) & 0x7F) << shift if b < 0x80 { break } } intStringLen := int(stringLen) if intStringLen < 0 { return ErrInvalidLengthApi } 
postIndex := iNdEx + intStringLen if postIndex > l { return io.ErrUnexpectedEOF } m.TypeUrl = string(dAtA[iNdEx:postIndex]) iNdEx = postIndex case 2: if wireType != 2 { return fmt.Errorf("proto: wrong wireType = %d for field ConfigPath", wireType) } var stringLen uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return ErrIntOverflowApi } if iNdEx >= l { return io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ stringLen |= (uint64(b) & 0x7F) << shift if b < 0x80 { break } } intStringLen := int(stringLen) if intStringLen < 0 { return ErrInvalidLengthApi } postIndex := iNdEx + intStringLen if postIndex > l { return io.ErrUnexpectedEOF } m.ConfigPath = string(dAtA[iNdEx:postIndex]) iNdEx = postIndex default: iNdEx = preIndex skippy, err := skipApi(dAtA[iNdEx:]) if err != nil { return err } if skippy < 0 { return ErrInvalidLengthApi } if (iNdEx + skippy) > l { return io.ErrUnexpectedEOF } iNdEx += skippy } } if iNdEx > l { return io.ErrUnexpectedEOF } return nil } func skipApi(dAtA []byte) (n int, err error) { l := len(dAtA) iNdEx := 0 for iNdEx < l { var wire uint64 for shift := uint(0); ; shift += 7 { if shift >= 64 { return 0, ErrIntOverflowApi } if iNdEx >= l { return 0, io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ wire |= (uint64(b) & 0x7F) << shift if b < 0x80 { break } } wireType := int(wire & 0x7) switch wireType { case 0: for shift := uint(0); ; shift += 7 { if shift >= 64 { return 0, ErrIntOverflowApi } if iNdEx >= l { return 0, io.ErrUnexpectedEOF } iNdEx++ if dAtA[iNdEx-1] < 0x80 { break } } return iNdEx, nil case 1: iNdEx += 8 return iNdEx, nil case 2: var length int for shift := uint(0); ; shift += 7 { if shift >= 64 { return 0, ErrIntOverflowApi } if iNdEx >= l { return 0, io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ length |= (int(b) & 0x7F) << shift if b < 0x80 { break } } iNdEx += length if length < 0 { return 0, ErrInvalidLengthApi } return iNdEx, nil case 3: for { var innerWire uint64 var start int = iNdEx for shift := uint(0); ; shift += 7 { if shift >= 64 { return 0, ErrIntOverflowApi } if iNdEx >= l { return 0, io.ErrUnexpectedEOF } b := dAtA[iNdEx] iNdEx++ innerWire |= (uint64(b) & 0x7F) << shift if b < 0x80 { break } } innerWireType := int(innerWire & 0x7) if innerWireType == 4 { break } next, err := skipApi(dAtA[start:]) if err != nil { return 0, err } iNdEx = start + next } return iNdEx, nil case 4: return iNdEx, nil case 5: iNdEx += 4 return iNdEx, nil default: return 0, fmt.Errorf("proto: illegal wireType %d", wireType) } } panic("unreachable") } var ( ErrInvalidLengthApi = fmt.Errorf("proto: negative length found during unmarshaling") ErrIntOverflowApi = fmt.Errorf("proto: integer overflow") ) func init() { proto.RegisterFile("api.proto", fileDescriptorApi) } var fileDescriptorApi = []byte{ // 183 bytes of a gzipped FileDescriptorProto 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0xe2, 0x4c, 0x2c, 0xc8, 0xd4, 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0x12, 0x4d, 0x2e, 0xca, 0xd4, 0x2b, 0x2a, 0xcd, 0x2b, 0xc9, 0xcc, 0x4d, 0xcd, 0x2f, 0x28, 0xc9, 0xcc, 0xcf, 0x2b, 0xd6, 0x2b, 0x33, 0x94, 0xd2, 0x4d, 0xcf, 0x2c, 0xc9, 0x28, 0x4d, 0xd2, 0x4b, 0xce, 0xcf, 0xd5, 0x4f, 0xcf, 0x4f, 0xcf, 0xd7, 0x07, 0xab, 0x4e, 0x2a, 0x4d, 0x03, 0xf3, 0xc0, 0x1c, 0x30, 0x0b, 0x62, 0x8a, 0x92, 0x2b, 0x17, 0xbb, 0x3f, 0x44, 0xb3, 0x90, 0x24, 0x17, 0x47, 0x49, 0x65, 0x41, 0x6a, 0x7c, 0x69, 0x51, 0x8e, 0x04, 0xa3, 0x02, 0xa3, 0x06, 0x67, 0x10, 0x3b, 0x88, 0x1f, 0x5a, 0x94, 0x23, 0x24, 0xcf, 0xc5, 0x9d, 0x9c, 0x9f, 0x97, 0x96, 0x99, 0x1e, 0x5f, 0x90, 0x58, 0x92, 
0x21, 0xc1, 0x04, 0x96, 0xe5, 0x82, 0x08, 0x05, 0x24, 0x96, 0x64, 0x38, 0xc9, 0x9c, 0x78, 0x28, 0xc7, 0x78, 0xe3, 0xa1, 0x1c, 0x43, 0xc3, 0x23, 0x39, 0xc6, 0x13, 0x8f, 0xe4, 0x18, 0x2f, 0x3c, 0x92, 0x63, 0x7c, 0xf0, 0x48, 0x8e, 0x71, 0xc2, 0x63, 0x39, 0x86, 0x24, 0x36, 0xb0, 0x5d, 0xc6, 0x80, 0x00, 0x00, 0x00, 0xff, 0xff, 0x07, 0x00, 0xf2, 0x18, 0xbe, 0x00, 0x00, 0x00, } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/runtimeoptions/v14/v14_state_autogen.go000066400000000000000000000001311465435605700277530ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package v14 golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/service.go000066400000000000000000000272531465435605700223640ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package shim implements Containerd Shim v2 interface. package shim import ( "context" "fmt" "os" "os/exec" "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/log" "github.com/containerd/containerd/namespaces" "github.com/containerd/containerd/runtime/v2/shim" taskapi "github.com/containerd/containerd/runtime/v2/task" "github.com/gogo/protobuf/types" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/shim/extension" "gvisor.dev/gvisor/pkg/shim/runsc" "gvisor.dev/gvisor/pkg/sync" ) const ( // shimAddressPath is the relative path to a file that contains the address // to the shim UDS. See service.shimAddress. shimAddressPath = "address" ) // New returns a new shim service that can be used via gRPC. func New(ctx context.Context, id string, publisher shim.Publisher, cancel func()) (shim.Shim, error) { var opts shim.Opts if ctxOpts := ctx.Value(shim.OptsKey{}); ctxOpts != nil { opts = ctxOpts.(shim.Opts) } runsc, err := runsc.New(ctx, id, publisher) if err != nil { cancel() return nil, err } s := &service{ genericOptions: opts, cancel: cancel, main: runsc, } if address, err := shim.ReadAddress(shimAddressPath); err == nil { s.shimAddress = address } return s, nil } // service is the shim implementation of a remote shim over gRPC. It runs in 2 // different modes: // 1. Service: process runs for the life time of the container and receives // calls described in shimapi.TaskService interface. // 2. Tool: process is short lived and runs only to perform the requested // operations and then exits. It implements the direct functions in // shim.Shim interface. // // It forwards all calls to extension.TaskServiceExt which actually implements the // service interface. This struct receives the RPC calls, forwards them to the // appropriate service implementation, and convert errors to gRPC errors. type service struct { mu sync.Mutex // genericOptions are options that come from the shim interface and are common // to all shims. genericOptions shim.Opts // cancel is a function that needs to be called before the shim stops. The // function is provided by the caller to New(). 
cancel func() // shimAddress is the location of the UDS used to communicate to containerd. shimAddress string // main is the extension.TaskServiceExt that is used for all calls to the // container's shim, except for the cases where `ext` is set. // // Protected by mu. main extension.TaskServiceExt // ext may intercept calls to the container's shim. During the call to create // container, the extension may be created and the shim will start using it // for all calls to the container's shim. // // Protected by mu. ext extension.TaskServiceExt } var _ shim.Shim = (*service)(nil) // get return the extension.TaskServiceExt that should be used for the next // call to the container's shim. func (s *service) get() extension.TaskServiceExt { s.mu.Lock() defer s.mu.Unlock() if s.ext == nil { return s.main } return s.ext } func (s *service) newCommand(ctx context.Context, containerdBinary, containerdAddress string) (*exec.Cmd, error) { ns, err := namespaces.NamespaceRequired(ctx) if err != nil { return nil, err } self, err := os.Executable() if err != nil { return nil, err } cwd, err := os.Getwd() if err != nil { return nil, err } args := []string{ "-namespace", ns, "-address", containerdAddress, "-publish-binary", containerdBinary, } if s.genericOptions.Debug { args = append(args, "-debug") } cmd := exec.Command(self, args...) cmd.Dir = cwd cmd.Env = append(os.Environ(), "GOMAXPROCS=2") cmd.SysProcAttr = &unix.SysProcAttr{ Setpgid: true, } return cmd, nil } func (s *service) StartShim(ctx context.Context, id, containerdBinary, containerdAddress, containerdTTRPCAddress string) (string, error) { log.L.Debugf("StartShim, id: %s, binary: %q, address: %q", id, containerdBinary, containerdAddress) cmd, err := s.newCommand(ctx, containerdBinary, containerdAddress) if err != nil { return "", err } address, err := shim.SocketAddress(ctx, containerdAddress, id) if err != nil { return "", err } socket, err := shim.NewSocket(address) if err != nil { // The only time where this would happen is if there is a bug and the socket // was not cleaned up in the cleanup method of the shim or we are using the // grouping functionality where the new process should be run with the same // shim as an existing container. 
if !shim.SocketEaddrinuse(err) { return "", fmt.Errorf("create new shim socket: %w", err) } if shim.CanConnect(address) { if err := shim.WriteAddress(shimAddressPath, address); err != nil { return "", fmt.Errorf("write existing socket for shim: %w", err) } return address, nil } if err := shim.RemoveSocket(address); err != nil { return "", fmt.Errorf("remove pre-existing socket: %w", err) } if socket, err = shim.NewSocket(address); err != nil { return "", fmt.Errorf("try create new shim socket 2x: %w", err) } } cu := cleanup.Make(func() { socket.Close() _ = shim.RemoveSocket(address) }) defer cu.Clean() f, err := socket.File() if err != nil { return "", err } cmd.ExtraFiles = append(cmd.ExtraFiles, f) log.L.Debugf("Executing: %q %s", cmd.Path, cmd.Args) if err := cmd.Start(); err != nil { f.Close() return "", err } cu.Add(func() { cmd.Process.Kill() }) // make sure to wait after start go cmd.Wait() if err := shim.WritePidFile("shim.pid", cmd.Process.Pid); err != nil { return "", err } if err := shim.WriteAddress(shimAddressPath, address); err != nil { return "", err } if err := shim.SetScore(cmd.Process.Pid); err != nil { return "", fmt.Errorf("failed to set OOM Score on shim: %w", err) } cu.Release() return address, nil } // Cleanup is called from another process to stop the container and undo all // operations done in Create(). func (s *service) Cleanup(ctx context.Context) (*taskapi.DeleteResponse, error) { log.L.Debugf("Cleanup") resp, err := s.get().Cleanup(ctx) return resp, errdefs.ToGRPC(err) } // Create creates a new initial process and container with the underlying OCI // runtime. func (s *service) Create(ctx context.Context, r *taskapi.CreateTaskRequest) (*taskapi.CreateTaskResponse, error) { log.L.Debugf("Create, id: %s, bundle: %q", r.ID, r.Bundle) // Check if we need to create an extension to intercept calls to the container's shim. if extension.NewExtension != nil { s.mu.Lock() var err error s.ext, err = extension.NewExtension(ctx, s.main, r) if err != nil { s.mu.Unlock() return nil, err } if s.ext == nil { log.L.Debugf("No extension created for container") } else { log.L.Infof("Extension created for container") } s.mu.Unlock() } resp, err := s.get().Create(ctx, r) return resp, errdefs.ToGRPC(err) } // Start starts the container. func (s *service) Start(ctx context.Context, r *taskapi.StartRequest) (*taskapi.StartResponse, error) { log.L.Debugf("Start, id: %s, execID: %s", r.ID, r.ExecID) resp, err := s.get().Start(ctx, r) return resp, errdefs.ToGRPC(err) } // Delete deletes container. func (s *service) Delete(ctx context.Context, r *taskapi.DeleteRequest) (*taskapi.DeleteResponse, error) { log.L.Debugf("Delete, id: %s, execID: %s", r.ID, r.ExecID) resp, err := s.get().Delete(ctx, r) return resp, errdefs.ToGRPC(err) } // Exec spawns a process inside the container. func (s *service) Exec(ctx context.Context, r *taskapi.ExecProcessRequest) (*types.Empty, error) { log.L.Debugf("Exec, id: %s, execID: %s", r.ID, r.ExecID) resp, err := s.get().Exec(ctx, r) return resp, errdefs.ToGRPC(err) } // ResizePty resizes the terminal of a process. func (s *service) ResizePty(ctx context.Context, r *taskapi.ResizePtyRequest) (*types.Empty, error) { log.L.Debugf("ResizePty, id: %s, execID: %s, dimension: %dx%d", r.ID, r.ExecID, r.Height, r.Width) resp, err := s.get().ResizePty(ctx, r) return resp, errdefs.ToGRPC(err) } // State returns runtime state information for the container. 
func (s *service) State(ctx context.Context, r *taskapi.StateRequest) (*taskapi.StateResponse, error) { log.L.Debugf("State, id: %s, execID: %s", r.ID, r.ExecID) resp, err := s.get().State(ctx, r) return resp, errdefs.ToGRPC(err) } // Pause the container. func (s *service) Pause(ctx context.Context, r *taskapi.PauseRequest) (*types.Empty, error) { log.L.Debugf("Pause, id: %s", r.ID) resp, err := s.get().Pause(ctx, r) return resp, errdefs.ToGRPC(err) } // Resume the container. func (s *service) Resume(ctx context.Context, r *taskapi.ResumeRequest) (*types.Empty, error) { log.L.Debugf("Resume, id: %s", r.ID) resp, err := s.get().Resume(ctx, r) return resp, errdefs.ToGRPC(err) } // Kill the container with the provided signal. func (s *service) Kill(ctx context.Context, r *taskapi.KillRequest) (*types.Empty, error) { log.L.Debugf("Kill, id: %s, execID: %s, signal: %d, all: %t", r.ID, r.ExecID, r.Signal, r.All) resp, err := s.get().Kill(ctx, r) return resp, errdefs.ToGRPC(err) } // Pids returns all pids inside the container. func (s *service) Pids(ctx context.Context, r *taskapi.PidsRequest) (*taskapi.PidsResponse, error) { log.L.Debugf("Pids, id: %s", r.ID) resp, err := s.get().Pids(ctx, r) return resp, errdefs.ToGRPC(err) } // CloseIO closes the I/O context of the container. func (s *service) CloseIO(ctx context.Context, r *taskapi.CloseIORequest) (*types.Empty, error) { log.L.Debugf("CloseIO, id: %s, execID: %s, stdin: %t", r.ID, r.ExecID, r.Stdin) resp, err := s.get().CloseIO(ctx, r) return resp, errdefs.ToGRPC(err) } // Checkpoint checkpoints the container. func (s *service) Checkpoint(ctx context.Context, r *taskapi.CheckpointTaskRequest) (*types.Empty, error) { log.L.Debugf("Checkpoint, id: %s", r.ID) resp, err := s.get().Checkpoint(ctx, r) return resp, errdefs.ToGRPC(err) } // Connect returns shim information such as the shim's pid. func (s *service) Connect(ctx context.Context, r *taskapi.ConnectRequest) (*taskapi.ConnectResponse, error) { log.L.Debugf("Connect, id: %s", r.ID) resp, err := s.get().Connect(ctx, r) return resp, errdefs.ToGRPC(err) } func (s *service) Shutdown(ctx context.Context, r *taskapi.ShutdownRequest) (*types.Empty, error) { log.L.Debugf("Shutdown, id: %s", r.ID) resp, err := s.get().Shutdown(ctx, r) if err != nil { return resp, errdefs.ToGRPC(err) } s.cancel() if len(s.shimAddress) != 0 { _ = shim.RemoveSocket(s.shimAddress) } os.Exit(0) panic("Should not get here") } func (s *service) Stats(ctx context.Context, r *taskapi.StatsRequest) (*taskapi.StatsResponse, error) { log.L.Debugf("Stats, id: %s", r.ID) resp, err := s.get().Stats(ctx, r) return resp, errdefs.ToGRPC(err) } // Update updates a running container. func (s *service) Update(ctx context.Context, r *taskapi.UpdateTaskRequest) (*types.Empty, error) { log.L.Debugf("Update, id: %s", r.ID) resp, err := s.get().Update(ctx, r) return resp, errdefs.ToGRPC(err) } // Wait waits for the container to exit. func (s *service) Wait(ctx context.Context, r *taskapi.WaitRequest) (*taskapi.WaitResponse, error) { log.L.Debugf("Wait, id: %s, execID: %s", r.ID, r.ExecID) resp, err := s.get().Wait(ctx, r) return resp, errdefs.ToGRPC(err) } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/shim_state_autogen.go000066400000000000000000000000661465435605700245770ustar00rootroot00000000000000// automatically generated by stateify. 
package shim golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/utils/000077500000000000000000000000001465435605700215245ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/utils/annotations.go000066400000000000000000000020401465435605700244060ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package utils // Annotations from the CRI annotations package. // // These are vendored due to import conflicts. const ( sandboxLogDirAnnotation = "io.kubernetes.cri.sandbox-log-directory" // ContainerTypeAnnotation is the key that defines sandbox or container. ContainerTypeAnnotation = "io.kubernetes.cri.container-type" containerTypeSandbox = "sandbox" // ContainerTypeContainer is the value for container. ContainerTypeContainer = "container" ) golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/utils/utils.go000066400000000000000000000041721465435605700232170ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package utils contains miscellaneous utility functions used by the shim. package utils import ( "encoding/json" "io/ioutil" "path/filepath" specs "github.com/opencontainers/runtime-spec/specs-go" ) const configFilename = "config.json" // ReadSpec reads OCI spec from the bundle directory. func ReadSpec(bundle string) (*specs.Spec, error) { b, err := ioutil.ReadFile(filepath.Join(bundle, configFilename)) if err != nil { return nil, err } var spec specs.Spec if err := json.Unmarshal(b, &spec); err != nil { return nil, err } return &spec, nil } // WriteSpec writes OCI spec to the bundle directory. func WriteSpec(bundle string, spec *specs.Spec) error { b, err := json.Marshal(spec) if err != nil { return err } return ioutil.WriteFile(filepath.Join(bundle, configFilename), b, 0666) } // IsSandbox checks whether a container is a sandbox container. func IsSandbox(spec *specs.Spec) bool { t, ok := spec.Annotations[ContainerTypeAnnotation] return !ok || t == containerTypeSandbox } // UserLogPath gets user log path from OCI annotation. func UserLogPath(spec *specs.Spec) string { sandboxLogDir := spec.Annotations[sandboxLogDirAnnotation] if sandboxLogDir == "" { return "" } return filepath.Join(sandboxLogDir, "gvisor.log") } // PanicLogPath gets the panic log path from OCI annotation.
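//
// A minimal illustration (the directory value below is made up for the
// example; only the annotation key and the "gvisor_panic.log" suffix come
// from this package):
//
//	spec.Annotations["io.kubernetes.cri.sandbox-log-directory"] = "/var/log/pods/example"
//	// PanicLogPath(spec) then returns "/var/log/pods/example/gvisor_panic.log".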
func PanicLogPath(spec *specs.Spec) string { if spec == nil { return "" } sandboxLogDir := spec.Annotations[sandboxLogDirAnnotation] if sandboxLogDir == "" { return "" } return filepath.Join(sandboxLogDir, "gvisor_panic.log") } golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/utils/utils_state_autogen.go000066400000000000000000000000671465435605700261400ustar00rootroot00000000000000// automatically generated by stateify. package utils golang-gvisor-gvisor-0.0~20240729.0/pkg/shim/utils/volumes.go000066400000000000000000000202161465435605700235460ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package utils import ( "fmt" "path/filepath" "strings" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/runsc/specutils" ) const ( volumeKeyPrefix = "dev.gvisor.spec.mount." // devshmName is the volume name used for /dev/shm. Pick a name that is // unlikely to be used. devshmName = "gvisorinternaldevshm" // emptyDirVolumesDir is the directory inside kubeletPodsDir/{uid}/volumes/ // that hosts all the EmptyDir volumes used by the pod. emptyDirVolumesDir = "kubernetes.io~empty-dir" ) // The directory structure for volumes is as follows: // /var/lib/kubelet/pods/{uid}/volumes/{type} where `uid` is the pod UID and // `type` is the volume type. var kubeletPodsDir = "/var/lib/kubelet/pods" // volumeName gets volume name from volume annotation key, example: // // dev.gvisor.spec.mount.NAME.share func volumeName(k string) string { return strings.SplitN(strings.TrimPrefix(k, volumeKeyPrefix), ".", 2)[0] } // volumeFieldName gets volume field name from volume annotation key, example: // // `type` is the field of dev.gvisor.spec.mount.NAME.type func volumeFieldName(k string) string { parts := strings.Split(strings.TrimPrefix(k, volumeKeyPrefix), ".") return parts[len(parts)-1] } // podUID gets pod UID from the pod log path. func podUID(s *specs.Spec) (string, error) { sandboxLogDir := s.Annotations[sandboxLogDirAnnotation] if sandboxLogDir == "" { return "", fmt.Errorf("no sandbox log path annotation") } fields := strings.Split(filepath.Base(sandboxLogDir), "_") switch len(fields) { case 1: // This is the old CRI logging path. return fields[0], nil case 3: // This is the new CRI logging path. return fields[2], nil } return "", fmt.Errorf("unexpected sandbox log path %q", sandboxLogDir) } // isVolumeKey checks whether an annotation key is for volume. func isVolumeKey(k string) bool { return strings.HasPrefix(k, volumeKeyPrefix) } // volumeSourceKey constructs the annotation key for volume source. func volumeSourceKey(volume string) string { return volumeKeyPrefix + volume + ".source" } // volumePath searches the volume path in the kubelet pod directory. func volumePath(volume, uid string) (string, error) { // TODO: Support subpath when gvisor supports pod volume bind mount. 
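// The kubelet lays out volumes as kubeletPodsDir/{uid}/volumes/{plugin}/{name}
// (see the comment on kubeletPodsDir above), so the glob below is expected to
// match exactly one directory for a given volume name.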
volumeSearchPath := fmt.Sprintf("%s/%s/volumes/*/%s", kubeletPodsDir, uid, volume) dirs, err := filepath.Glob(volumeSearchPath) if err != nil { return "", err } if len(dirs) != 1 { return "", fmt.Errorf("unexpected matched volume list %v", dirs) } return dirs[0], nil } // isVolumePath checks whether a string is the volume path. func isVolumePath(volume, path string) (bool, error) { // TODO: Support subpath when gvisor supports pod volume bind mount. volumeSearchPath := fmt.Sprintf("%s/*/volumes/*/%s", kubeletPodsDir, volume) return filepath.Match(volumeSearchPath, path) } // UpdateVolumeAnnotations add necessary OCI annotations for gvisor // volume optimization. Returns true if the spec was modified. // // Note about EmptyDir handling: // The admission controller sets mount annotations for EmptyDir as follows: // - For EmptyDir volumes with medium=Memory, the "type" field is set to tmpfs. // - For EmptyDir volumes with medium="", the "type" field is set to bind. // // The container spec has EmptyDir mount points as bind mounts. This method // modifies the spec as follows: // - The "type" mount annotation for all EmptyDirs is changed to tmpfs. // - The mount type in spec.Mounts[i].Type is changed as follows: // - For EmptyDir volumes with medium=Memory, we change it to tmpfs. // - For EmptyDir volumes with medium="", we leave it as a bind mount. // - (Essentially we set it to what the admission controller said.) // // runsc should use these two setting to infer EmptyDir medium: // - tmpfs annotation type + tmpfs mount type = memory-backed EmptyDir // - tmpfs annotation type + bind mount type = disk-backed EmptyDir func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) { var uid string if IsSandbox(s) { var err error uid, err = podUID(s) if err != nil { // Skip if we can't get pod UID, because this doesn't work // for containerd 1.1. return false, nil } } updated := false for k, v := range s.Annotations { if !isVolumeKey(k) { continue } if volumeFieldName(k) != "type" { continue } volume := volumeName(k) if uid != "" { // This is the root (first) container. Mount annotations are only // consumed from this container's spec. So fix mount annotations by: // 1. Adding source annotation. // 2. Fixing type annotation. path, err := volumePath(volume, uid) if err != nil { return false, fmt.Errorf("get volume path for %q: %w", volume, err) } s.Annotations[volumeSourceKey(volume)] = path if strings.Contains(path, emptyDirVolumesDir) { s.Annotations[k] = "tmpfs" // See note about EmptyDir. } updated = true } else { // This is a sub-container. Mount annotations are ignored. So no need to // bother fixing those. for i := range s.Mounts { // An error is returned for sandbox if source annotation is not // successfully applied, so it is guaranteed that the source annotation // for sandbox has already been successfully applied at this point. // // The volume name is unique inside a pod, so matching without podUID // is fine here. // // TODO: Pass podUID down to shim for containers to do more accurate // matching. if yes, _ := isVolumePath(volume, s.Mounts[i].Source); yes { // Container mount type must match the mount type specified by // admission controller. See note about EmptyDir. specutils.ChangeMountType(&s.Mounts[i], v) updated = true } } } } if ok, err := configureShm(s); err != nil { return false, err } else if ok { updated = true } return updated, nil } // configureShm sets up annotations to mount /dev/shm as a pod shared tmpfs // mount inside containers. 
// // Pods are configured to mount /dev/shm to a common path in the host, so it's // shared among containers in the same pod. In gVisor, /dev/shm must be // converted to a tmpfs mount inside the sandbox, otherwise shm_open(3) doesn't // use it (see where_is_shmfs() in glibc). Mount annotation hints are used to // instruct runsc to mount the same tmpfs volume in all containers inside the // pod. func configureShm(s *specs.Spec) (bool, error) { const ( shmPath = "/dev/shm" devshmType = "tmpfs" ) // Some containers contain a duplicate mount entry for /dev/shm using tmpfs. // If this is detected, remove the extraneous entry to ensure the correct one // is used. duplicate := -1 for i, m := range s.Mounts { if m.Destination == shmPath && m.Type == devshmType { duplicate = i break } } updated := false for i := range s.Mounts { m := &s.Mounts[i] if m.Destination == shmPath && m.Type == "bind" { if IsSandbox(s) { s.Annotations[volumeKeyPrefix+devshmName+".source"] = m.Source s.Annotations[volumeKeyPrefix+devshmName+".type"] = devshmType s.Annotations[volumeKeyPrefix+devshmName+".share"] = "pod" // Given that we don't have visibility into mount options for all // containers, assume broad access for the master mount (it's tmpfs // inside the sandbox anyways) and apply options to subcontainers as // they bind mount individually. s.Annotations[volumeKeyPrefix+devshmName+".options"] = "rw" } specutils.ChangeMountType(m, devshmType) updated = true // Remove the duplicate entry now that we found the shared /dev/shm mount. if duplicate >= 0 { s.Mounts = append(s.Mounts[:duplicate], s.Mounts[duplicate+1:]...) } break } } return updated, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sighandling/000077500000000000000000000000001465435605700217135ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sighandling/sighandling.go000066400000000000000000000071051465435605700245340ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package sighandling contains helpers for handling signals to applications. package sighandling import ( "os" "os/signal" "reflect" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) // numSignals is the number of normal (non-realtime) signals on Linux. const numSignals = 32 // handleSignals listens for incoming signals and calls the given handler // function. // // It stops when the stop channel is closed. The done channel is closed once it // will no longer deliver signals to k. func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), stop, done chan struct{}) { // Build a select case. sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)}} for _, sigchan := range sigchans { sc = append(sc, reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(sigchan)}) } for { // Wait for a notification. index, _, ok := reflect.Select(sc) // Was it the stop channel? if index == 0 { if !ok { // Stop forwarding and notify that it's done. 
close(done) return } continue } // How about a different close? if !ok { panic("signal channel closed unexpectedly") } // Otherwise, it was a signal on channel N. Index 0 represents the stop // channel, so index N represents the channel for signal N. handler(linux.Signal(index)) } } // StartSignalForwarding ensures that synchronous signals are passed to the // given handler function and returns a callback that stops signal delivery. // // Note that this function permanently takes over signal handling. After the // stop callback, signals revert to the default Go runtime behavior, which // cannot be overridden with external calls to signal.Notify. func StartSignalForwarding(handler func(linux.Signal)) func() { stop := make(chan struct{}) done := make(chan struct{}) // Register individual channels. One channel per standard signal is // required as os.Notify() is non-blocking and may drop signals. To avoid // this, standard signals have to be queued separately. Channel size 1 is // enough for standard signals as their semantics allow de-duplication. // // External real-time signals are not supported. We rely on the go-runtime // for their handling. // // We do not forward some signals that are likely induced by the behavior // of the forwarding process. var sigchans []chan os.Signal for sig := 1; sig <= numSignals+1; sig++ { sigchan := make(chan os.Signal, 1) sigchans = append(sigchans, sigchan) // SIGURG is used by Go's runtime scheduler. if sig == int(linux.SIGURG) { continue } // SIGPIPE is received when sending to disconnected host pipes/sockets. if sig == int(linux.SIGPIPE) { continue } // SIGCHLD is received when a child of the forwarding process exits. if sig == int(linux.SIGCHLD) { continue } signal.Notify(sigchan, unix.Signal(sig)) } // Start up our listener. go handleSignals(sigchans, handler, stop, done) // S/R-SAFE: synchronized by Kernel.extMu. return func() { close(stop) <-done } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sighandling/sighandling_darwin.go000066400000000000000000000026121465435605700260760ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build darwin // +build darwin package sighandling import ( "errors" "golang.org/x/sys/unix" ) // IgnoreChildStop sets the SA_NOCLDSTOP flag, causing child processes to not // generate SIGCHLD when they stop. func IgnoreChildStop() error { return errors.New("IgnoreChildStop not supported on Darwin") } // ReplaceSignalHandler replaces the existing signal handler for the provided // signal with the function pointer at `handler`. This bypasses the Go runtime // signal handlers, and should only be used for low-level signal handlers where // use of signal.Notify is not appropriate. // // It stores the value of the previously set handler in previous. 
func ReplaceSignalHandler(sig unix.Signal, handler uintptr, previous *uintptr) error { return errors.New("ReplaceSignalHandler not supported on Darwin") } golang-gvisor-gvisor-0.0~20240729.0/pkg/sighandling/sighandling_linux_unsafe.go000066400000000000000000000047601465435605700273200ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package sighandling import ( "fmt" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) // IgnoreChildStop sets the SA_NOCLDSTOP flag, causing child processes to not // generate SIGCHLD when they stop. func IgnoreChildStop() error { var sa linux.SigAction // Get the existing signal handler information, and set the flag. if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(unix.SIGCHLD), 0, uintptr(unsafe.Pointer(&sa)), linux.SignalSetSize, 0, 0); e != 0 { return e } sa.Flags |= linux.SA_NOCLDSTOP if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(unix.SIGCHLD), uintptr(unsafe.Pointer(&sa)), 0, linux.SignalSetSize, 0, 0); e != 0 { return e } return nil } // ReplaceSignalHandler replaces the existing signal handler for the provided // signal with the function pointer at `handler`. This bypasses the Go runtime // signal handlers, and should only be used for low-level signal handlers where // use of signal.Notify is not appropriate. // // It stores the value of the previously set handler in previous. func ReplaceSignalHandler(sig unix.Signal, handler uintptr, previous *uintptr) error { var sa linux.SigAction const maskLen = 8 // Get the existing signal handler information, and save the current // handler. Once we replace it, we will use this pointer to fall back to // it when we receive other signals. if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(sig), 0, uintptr(unsafe.Pointer(&sa)), maskLen, 0, 0); e != 0 { return e } // Fail if there isn't a previous handler. if sa.Handler == 0 { return fmt.Errorf("previous handler for signal %x isn't set", sig) } *previous = uintptr(sa.Handler) // Install our own handler. sa.Handler = uint64(handler) if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 { return e } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/sighandling/sighandling_linux_unsafe_state_autogen.go000066400000000000000000000001371465435605700322340ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux // +build linux package sighandling golang-gvisor-gvisor-0.0~20240729.0/pkg/sighandling/sighandling_state_autogen.go000066400000000000000000000001411465435605700274470ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build darwin // +build darwin package sighandling golang-gvisor-gvisor-0.0~20240729.0/pkg/sleep/000077500000000000000000000000001465435605700205345ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sleep/sleep_unsafe.go000066400000000000000000000372121465435605700235410ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package sleep allows goroutines to efficiently sleep on multiple sources of // notifications (wakers). It offers O(1) complexity, which is different from // multi-channel selects which have O(n) complexity (where n is the number of // channels) and a considerable constant factor. // // It is similar to edge-triggered epoll waits, where the user registers each // object of interest once, and then can repeatedly wait on all of them. // // A Waker object is used to wake a sleeping goroutine (G) up, or prevent it // from going to sleep next. A Sleeper object is used to receive notifications // from wakers, and if no notifications are available, to optionally sleep until // one becomes available. // // A Waker can be associated with at most one Sleeper, but a Sleeper can be // associated with multiple Wakers. A Sleeper has a list of asserted (ready) // wakers; when Fetch() is called repeatedly, elements from this list are // returned until the list becomes empty in which case the goroutine goes to // sleep. When Assert() is called on a Waker, it adds itself to the Sleeper's // asserted list and wakes the G up from its sleep if needed. // // Sleeper objects are expected to be used as follows, with just one goroutine // executing this code: // // // One time set-up. // s := sleep.Sleeper{} // s.AddWaker(&w1) // s.AddWaker(&w2) // // // Called repeatedly. // for { // switch s.Fetch(true) { // case &w1: // // Do work triggered by w1 being asserted. // case &w2: // // Do work triggered by w2 being asserted. // } // } // // And Waker objects are expected to call w.Assert() when they want the sleeper // to wake up and perform work. // // The notifications are edge-triggered, which means that if a Waker calls // Assert() several times before the sleeper has the chance to wake up, it will // only be notified once and should perform all pending work (alternatively, it // can also call Assert() on the waker, to ensure that it will wake up again). // // The "unsafeness" here is in the casts to/from unsafe.Pointer, which is safe // when only one type is used for each unsafe.Pointer (which is the case here), // we should just make sure that this remains the case in the future. The usage // of unsafe package could be confined to sharedWaker and sharedSleeper types // that would hold pointers in atomic.Pointers, but the go compiler currently // can't optimize these as well (it won't inline their method calls), which // reduces performance. 
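//
// For completeness, the producer side of the pattern above is simply a call to
// Assert on the relevant waker whenever new work is queued (an illustrative
// sketch; queue and item stand in for the caller's own data structures):
//
//	// Producer goroutine.
//	queue.Push(item)
//	w1.Assert() // wake the sleeping G, or prevent its next sleep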
package sleep import ( "context" "sync/atomic" "unsafe" "gvisor.dev/gvisor/pkg/sync" ) const ( // preparingG is stored in sleepers to indicate that they're preparing // to sleep. preparingG = 1 ) var ( // assertedSleeper is a sentinel sleeper. A pointer to it is stored in // wakers that are asserted. assertedSleeper Sleeper ) // Sleeper allows a goroutine to sleep and receive wake up notifications from // Wakers in an efficient way. // // This is similar to edge-triggered epoll in that wakers are added to the // sleeper once and the sleeper can then repeatedly sleep in O(1) time while // waiting on all wakers. // // None of the methods in a Sleeper can be called concurrently. Wakers that have // been added to a sleeper A can only be added to another sleeper after A.Done() // returns. These restrictions allow this to be implemented lock-free. // // This struct is thread-compatible. // // +stateify savable type Sleeper struct { _ sync.NoCopy // sharedList is a "stack" of asserted wakers. They atomically add // themselves to the front of this list as they become asserted. sharedList unsafe.Pointer `state:".(*Waker)"` // localList is a list of asserted wakers that is only accessible to the // waiter, and thus doesn't have to be accessed atomically. When // fetching more wakers, the waiter will first go through this list, and // only when it's empty will it atomically fetch wakers from // sharedList. localList *Waker // allWakers is a list with all wakers that have been added to this // sleeper. It is used during cleanup to remove associations. allWakers *Waker // waitingG holds the G that is sleeping, if any. It is used by wakers // to determine which G, if any, they should wake. waitingG uintptr `state:"zero"` } // saveSharedList is invoked by stateify. func (s *Sleeper) saveSharedList() *Waker { return (*Waker)(atomic.LoadPointer(&s.sharedList)) } // loadSharedList is invoked by stateify. func (s *Sleeper) loadSharedList(_ context.Context, w *Waker) { atomic.StorePointer(&s.sharedList, unsafe.Pointer(w)) } // AddWaker associates the given waker to the sleeper. func (s *Sleeper) AddWaker(w *Waker) { if w.allWakersNext != nil { panic("waker has non-nil allWakersNext; owned by another sleeper?") } if w.next != nil { panic("waker has non-nil next; queued in another sleeper?") } // Add the waker to the list of all wakers. w.allWakersNext = s.allWakers s.allWakers = w // Try to associate the waker with the sleeper. If it's already // asserted, we simply enqueue it in the "ready" list. for { p := (*Sleeper)(atomic.LoadPointer(&w.s)) if p == &assertedSleeper { s.enqueueAssertedWaker(w, true /* wakep */) return } if atomic.CompareAndSwapPointer(&w.s, usleeper(p), usleeper(s)) { return } } } // nextWaker returns the next waker in the notification list, blocking if // needed. The parameter wakepOrSleep indicates that if the operation does not // block, then we will need to explicitly wake a runtime P. // // Precondition: wakepOrSleep may be true iff block is true. // //go:nosplit func (s *Sleeper) nextWaker(block, wakepOrSleep bool) *Waker { // Attempt to replenish the local list if it's currently empty. if s.localList == nil { for atomic.LoadPointer(&s.sharedList) == nil { // Fail request if caller requested that we // don't block. if !block { return nil } // Indicate to wakers that we're about to sleep, // this allows them to abort the wait by setting // waitingG back to zero (which we'll notice // before committing the sleep). 
atomic.StoreUintptr(&s.waitingG, preparingG) // Check if something was queued while we were // preparing to sleep. We need this interleaving // to avoid missing wake ups. if atomic.LoadPointer(&s.sharedList) != nil { atomic.StoreUintptr(&s.waitingG, 0) break } // Since we are sleeping for sure, we no longer // need to wakep once we get a value. wakepOrSleep = false // Try to commit the sleep and report it to the // tracer as a select. // // gopark puts the caller to sleep and calls // commitSleep to decide whether to immediately // wake the caller up or to leave it sleeping. const traceEvGoBlockSelect = 24 // See:runtime2.go in the go runtime package for // the values to pass as the waitReason here. const waitReasonSelect = 9 sync.Gopark(commitSleep, unsafe.Pointer(&s.waitingG), sync.WaitReasonSelect, sync.TraceBlockSelect, 0) } // Pull the shared list out and reverse it in the local // list. Given that wakers push themselves in reverse // order, we fix things here. v := (*Waker)(atomic.SwapPointer(&s.sharedList, nil)) for v != nil { cur := v v = v.next cur.next = s.localList s.localList = cur } } // Remove the waker in the front of the list. w := s.localList s.localList = w.next // Do we need to wake a P? if wakepOrSleep { sync.Wakep() } return w } // commitSleep signals to wakers that the given g is now sleeping. Wakers can // then fetch it and wake it. // // The commit may fail if wakers have been asserted after our last check, in // which case they will have set s.waitingG to zero. // //go:norace //go:nosplit func commitSleep(g uintptr, waitingG unsafe.Pointer) bool { return sync.RaceUncheckedAtomicCompareAndSwapUintptr((*uintptr)(waitingG), preparingG, g) } // fetch is the backing implementation for Fetch and AssertAndFetch. // // Preconditions are the same as nextWaker. // //go:nosplit func (s *Sleeper) fetch(block, wakepOrSleep bool) *Waker { for { w := s.nextWaker(block, wakepOrSleep) if w == nil { return nil } // Reassociate the waker with the sleeper. If the waker was // still asserted we can return it, otherwise try the next one. old := (*Sleeper)(atomic.SwapPointer(&w.s, usleeper(s))) if old == &assertedSleeper { return w } } } // Fetch fetches the next wake-up notification. If a notification is // immediately available, the asserted waker is returned immediately. // Otherwise, the behavior depends on the value of 'block': if true, the // current goroutine blocks until a notification arrives and returns the // asserted waker; if false, nil will be returned. // // N.B. This method is *not* thread-safe. Only one goroutine at a time is // allowed to call this method. func (s *Sleeper) Fetch(block bool) *Waker { return s.fetch(block, false /* wakepOrSleep */) } // AssertAndFetch asserts the given waker and fetches the next wake-up notification. // Note that this will always be blocking, since there is no value in joining a // non-blocking operation. // // N.B. Like Fetch, this method is *not* thread-safe. This will also yield the current // P to the next goroutine, avoiding associated scheduled overhead. // // +checkescape:all // //go:nosplit func (s *Sleeper) AssertAndFetch(n *Waker) *Waker { n.assert(false /* wakep */) return s.fetch(true /* block */, true /* wakepOrSleep*/) } // Done is used to indicate that the caller won't use this Sleeper anymore. It // removes the association with all wakers so that they can be safely reused // by another sleeper after Done() returns. 
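//
// A sketch of the intended reuse pattern (s and s2 are assumed to be Sleepers
// owned by the caller, and w a Waker previously added to s):
//
//	s.Done()        // detaches all wakers previously added to s
//	s2.AddWaker(&w) // w may now be associated with a different sleeper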
func (s *Sleeper) Done() { // Remove all associations that we can, and build a list of the ones we // could not. An association can be removed right away from waker w if // w.s has a pointer to the sleeper, that is, the waker is not asserted // yet. By atomically switching w.s to nil, we guarantee that // subsequent calls to Assert() on the waker will not result in it // being queued. for w := s.allWakers; w != nil; w = s.allWakers { next := w.allWakersNext // Before zapping. if atomic.CompareAndSwapPointer(&w.s, usleeper(s), nil) { w.allWakersNext = nil w.next = nil s.allWakers = next // Move ahead. continue } // Dequeue exactly one waiter from the list, it may not be // this one but we know this one is in the process. We must // leave it in the asserted state but drop it from our lists. if w := s.nextWaker(true, false); w != nil { prev := &s.allWakers for *prev != w { prev = &((*prev).allWakersNext) } *prev = (*prev).allWakersNext w.allWakersNext = nil w.next = nil } } } // enqueueAssertedWaker enqueues an asserted waker to the "ready" circular list // of wakers that want to notify the sleeper. // //go:nosplit func (s *Sleeper) enqueueAssertedWaker(w *Waker, wakep bool) { // Add the new waker to the front of the list. for { v := (*Waker)(atomic.LoadPointer(&s.sharedList)) w.next = v if atomic.CompareAndSwapPointer(&s.sharedList, uwaker(v), uwaker(w)) { break } } // Nothing to do if there isn't a G waiting. if atomic.LoadUintptr(&s.waitingG) == 0 { return } // Signal to the sleeper that a waker has been asserted. switch g := atomic.SwapUintptr(&s.waitingG, 0); g { case 0, preparingG: default: // We managed to get a G. Wake it up. sync.Goready(g, 0, wakep) } } // Waker represents a source of wake-up notifications to be sent to sleepers. A // waker can be associated with at most one sleeper at a time, and at any given // time is either in asserted or non-asserted state. // // Once asserted, the waker remains so until it is manually cleared or a sleeper // consumes its assertion (i.e., a sleeper wakes up or is prevented from going // to sleep due to the waker). // // This struct is thread-safe, that is, its methods can be called concurrently // by multiple goroutines. // // Note, it is not safe to copy a Waker as its fields are modified by value // (the pointer fields are individually modified with atomic operations). // // +stateify savable type Waker struct { _ sync.NoCopy // s is the sleeper that this waker can wake up. Only one sleeper at a // time is allowed. This field can have three classes of values: // nil -- the waker is not asserted: it either is not associated with // a sleeper, or is queued to a sleeper due to being previously // asserted. This is the zero value. // &assertedSleeper -- the waker is asserted. // otherwise -- the waker is not asserted, and is associated with the // given sleeper. Once it transitions to asserted state, the // associated sleeper will be woken. s unsafe.Pointer `state:".(wakerState)"` // next is used to form a linked list of asserted wakers in a sleeper. next *Waker // allWakersNext is used to form a linked list of all wakers associated // to a given sleeper. allWakersNext *Waker } type wakerState struct { asserted bool other *Sleeper } // saveS is invoked by stateify. func (w *Waker) saveS() wakerState { s := (*Sleeper)(atomic.LoadPointer(&w.s)) if s == &assertedSleeper { return wakerState{asserted: true} } return wakerState{other: s} } // loadS is invoked by stateify. 
func (w *Waker) loadS(_ context.Context, ws wakerState) { if ws.asserted { atomic.StorePointer(&w.s, unsafe.Pointer(&assertedSleeper)) } else { atomic.StorePointer(&w.s, unsafe.Pointer(ws.other)) } } // assert is the implementation for Assert. // //go:nosplit func (w *Waker) assert(wakep bool) { // Nothing to do if the waker is already asserted. This check allows us // to complete this case (already asserted) without any interlocked // operations on x86. if atomic.LoadPointer(&w.s) == usleeper(&assertedSleeper) { return } // Mark the waker as asserted, and wake up a sleeper if there is one. switch s := (*Sleeper)(atomic.SwapPointer(&w.s, usleeper(&assertedSleeper))); s { case nil: case &assertedSleeper: default: s.enqueueAssertedWaker(w, wakep) } } // Assert moves the waker to an asserted state, if it isn't asserted yet. When // asserted, the waker will cause its matching sleeper to wake up. func (w *Waker) Assert() { w.assert(true /* wakep */) } // Clear moves the waker to then non-asserted state and returns whether it was // asserted before being cleared. // // N.B. The waker isn't removed from the "ready" list of a sleeper (if it // happens to be in one), but the sleeper will notice that it is not asserted // anymore and won't return it to the caller. func (w *Waker) Clear() bool { // Nothing to do if the waker is not asserted. This check allows us to // complete this case (already not asserted) without any interlocked // operations on x86. if atomic.LoadPointer(&w.s) != usleeper(&assertedSleeper) { return false } // Try to store nil in the sleeper, which indicates that the waker is // not asserted. return atomic.CompareAndSwapPointer(&w.s, usleeper(&assertedSleeper), nil) } // IsAsserted returns whether the waker is currently asserted (i.e., if it's // currently in a state that would cause its matching sleeper to wake up). func (w *Waker) IsAsserted() bool { return (*Sleeper)(atomic.LoadPointer(&w.s)) == &assertedSleeper } func usleeper(s *Sleeper) unsafe.Pointer { return unsafe.Pointer(s) } func uwaker(w *Waker) unsafe.Pointer { return unsafe.Pointer(w) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sleep/sleep_unsafe_state_autogen.go000066400000000000000000000034361465435605700264640ustar00rootroot00000000000000// automatically generated by stateify. 
package sleep import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (s *Sleeper) StateTypeName() string { return "pkg/sleep.Sleeper" } func (s *Sleeper) StateFields() []string { return []string{ "sharedList", "localList", "allWakers", } } func (s *Sleeper) beforeSave() {} // +checklocksignore func (s *Sleeper) StateSave(stateSinkObject state.Sink) { s.beforeSave() var sharedListValue *Waker sharedListValue = s.saveSharedList() stateSinkObject.SaveValue(0, sharedListValue) stateSinkObject.Save(1, &s.localList) stateSinkObject.Save(2, &s.allWakers) } func (s *Sleeper) afterLoad(context.Context) {} // +checklocksignore func (s *Sleeper) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(1, &s.localList) stateSourceObject.Load(2, &s.allWakers) stateSourceObject.LoadValue(0, new(*Waker), func(y any) { s.loadSharedList(ctx, y.(*Waker)) }) } func (w *Waker) StateTypeName() string { return "pkg/sleep.Waker" } func (w *Waker) StateFields() []string { return []string{ "s", "next", "allWakersNext", } } func (w *Waker) beforeSave() {} // +checklocksignore func (w *Waker) StateSave(stateSinkObject state.Sink) { w.beforeSave() var sValue wakerState sValue = w.saveS() stateSinkObject.SaveValue(0, sValue) stateSinkObject.Save(1, &w.next) stateSinkObject.Save(2, &w.allWakersNext) } func (w *Waker) afterLoad(context.Context) {} // +checklocksignore func (w *Waker) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(1, &w.next) stateSourceObject.Load(2, &w.allWakersNext) stateSourceObject.LoadValue(0, new(wakerState), func(y any) { w.loadS(ctx, y.(wakerState)) }) } func init() { state.Register((*Sleeper)(nil)) state.Register((*Waker)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/000077500000000000000000000000001465435605700205445ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/state/addr_range.go000066400000000000000000000033541465435605700231660ustar00rootroot00000000000000package state // A Range represents a contiguous range of T. // // +stateify savable type addrRange struct { // Start is the inclusive start of the range. Start uintptr // End is the exclusive end of the range. End uintptr } // WellFormed returns true if r.Start <= r.End. All other methods on a Range // require that the Range is well-formed. // //go:nosplit func (r addrRange) WellFormed() bool { return r.Start <= r.End } // Length returns the length of the range. // //go:nosplit func (r addrRange) Length() uintptr { return r.End - r.Start } // Contains returns true if r contains x. // //go:nosplit func (r addrRange) Contains(x uintptr) bool { return r.Start <= x && x < r.End } // Overlaps returns true if r and r2 overlap. // //go:nosplit func (r addrRange) Overlaps(r2 addrRange) bool { return r.Start < r2.End && r2.Start < r.End } // IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is // contained within r. // //go:nosplit func (r addrRange) IsSupersetOf(r2 addrRange) bool { return r.Start <= r2.Start && r.End >= r2.End } // Intersect returns a range consisting of the intersection between r and r2. // If r and r2 do not overlap, Intersect returns a range with unspecified // bounds, but for which Length() == 0. 
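//
// As a worked example of the clamping performed below:
// addrRange{Start: 0, End: 10}.Intersect(addrRange{Start: 5, End: 20}) yields
// addrRange{Start: 5, End: 10}, while disjoint ranges yield a zero-length range.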
// //go:nosplit func (r addrRange) Intersect(r2 addrRange) addrRange { if r.Start < r2.Start { r.Start = r2.Start } if r.End > r2.End { r.End = r2.End } if r.End < r.Start { r.End = r.Start } return r } // CanSplitAt returns true if it is legal to split a segment spanning the range // r at x; that is, splitting at x would produce two ranges, both of which have // non-zero length. // //go:nosplit func (r addrRange) CanSplitAt(x uintptr) bool { return r.Contains(x) && r.Start < x } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/addr_set.go000066400000000000000000002006231465435605700226630ustar00rootroot00000000000000package state import ( "bytes" "context" "fmt" ) // trackGaps is an optional parameter. // // If trackGaps is 1, the Set will track maximum gap size recursively, // enabling the GapIterator.{Prev,Next}LargeEnoughGap functions. In this // case, Key must be an unsigned integer. // // trackGaps must be 0 or 1. const addrtrackGaps = 0 var _ = uint8(addrtrackGaps << 7) // Will fail if not zero or one. // dynamicGap is a type that disappears if trackGaps is 0. type addrdynamicGap [addrtrackGaps]uintptr // Get returns the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *addrdynamicGap) Get() uintptr { return d[:][0] } // Set sets the value of the gap. // // Precondition: trackGaps must be non-zero. func (d *addrdynamicGap) Set(v uintptr) { d[:][0] = v } const ( // minDegree is the minimum degree of an internal node in a Set B-tree. // // - Any non-root node has at least minDegree-1 segments. // // - Any non-root internal (non-leaf) node has at least minDegree children. // // - The root node may have fewer than minDegree-1 segments, but it may // only have 0 segments if the tree is empty. // // Our implementation requires minDegree >= 3. Higher values of minDegree // usually improve performance, but increase memory usage for small sets. addrminDegree = 10 addrmaxDegree = 2 * addrminDegree ) // A Set is a mapping of segments with non-overlapping Range keys. The zero // value for a Set is an empty set. Set values are not safely movable nor // copyable. Set is thread-compatible. // // +stateify savable type addrSet struct { root addrnode `state:".([]addrFlatSegment)"` } // IsEmpty returns true if the set contains no segments. func (s *addrSet) IsEmpty() bool { return s.root.nrSegments == 0 } // IsEmptyRange returns true iff no segments in the set overlap the given // range. This is semantically equivalent to s.SpanRange(r) == 0, but may be // more efficient. func (s *addrSet) IsEmptyRange(r addrRange) bool { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return true } _, gap := s.Find(r.Start) if !gap.Ok() { return false } return r.End <= gap.End() } // Span returns the total size of all segments in the set. func (s *addrSet) Span() uintptr { var sz uintptr for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { sz += seg.Range().Length() } return sz } // SpanRange returns the total size of the intersection of segments in the set // with the given range. func (s *addrSet) SpanRange(r addrRange) uintptr { switch { case r.Length() < 0: panic(fmt.Sprintf("invalid range %v", r)) case r.Length() == 0: return 0 } var sz uintptr for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { sz += seg.Range().Intersect(r).Length() } return sz } // FirstSegment returns the first segment in the set. If the set is empty, // FirstSegment returns a terminal iterator. 
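//
// A typical whole-set traversal (the same pattern used by Span above):
//
//	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
//		// Use seg.Range() and seg.Value() here.
//	}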
func (s *addrSet) FirstSegment() addrIterator { if s.root.nrSegments == 0 { return addrIterator{} } return s.root.firstSegment() } // LastSegment returns the last segment in the set. If the set is empty, // LastSegment returns a terminal iterator. func (s *addrSet) LastSegment() addrIterator { if s.root.nrSegments == 0 { return addrIterator{} } return s.root.lastSegment() } // FirstGap returns the first gap in the set. func (s *addrSet) FirstGap() addrGapIterator { n := &s.root for n.hasChildren { n = n.children[0] } return addrGapIterator{n, 0} } // LastGap returns the last gap in the set. func (s *addrSet) LastGap() addrGapIterator { n := &s.root for n.hasChildren { n = n.children[n.nrSegments] } return addrGapIterator{n, n.nrSegments} } // Find returns the segment or gap whose range contains the given key. If a // segment is found, the returned Iterator is non-terminal and the // returned GapIterator is terminal. Otherwise, the returned Iterator is // terminal and the returned GapIterator is non-terminal. func (s *addrSet) Find(key uintptr) (addrIterator, addrGapIterator) { n := &s.root for { lower := 0 upper := n.nrSegments for lower < upper { i := lower + (upper-lower)/2 if r := n.keys[i]; key < r.End { if key >= r.Start { return addrIterator{n, i}, addrGapIterator{} } upper = i } else { lower = i + 1 } } i := lower if !n.hasChildren { return addrIterator{}, addrGapIterator{n, i} } n = n.children[i] } } // FindSegment returns the segment whose range contains the given key. If no // such segment exists, FindSegment returns a terminal iterator. func (s *addrSet) FindSegment(key uintptr) addrIterator { seg, _ := s.Find(key) return seg } // LowerBoundSegment returns the segment with the lowest range that contains a // key greater than or equal to min. If no such segment exists, // LowerBoundSegment returns a terminal iterator. func (s *addrSet) LowerBoundSegment(min uintptr) addrIterator { seg, gap := s.Find(min) if seg.Ok() { return seg } return gap.NextSegment() } // UpperBoundSegment returns the segment with the highest range that contains a // key less than or equal to max. If no such segment exists, UpperBoundSegment // returns a terminal iterator. func (s *addrSet) UpperBoundSegment(max uintptr) addrIterator { seg, gap := s.Find(max) if seg.Ok() { return seg } return gap.PrevSegment() } // FindGap returns the gap containing the given key. If no such gap exists // (i.e. the set contains a segment containing that key), FindGap returns a // terminal iterator. func (s *addrSet) FindGap(key uintptr) addrGapIterator { _, gap := s.Find(key) return gap } // LowerBoundGap returns the gap with the lowest range that is greater than or // equal to min. func (s *addrSet) LowerBoundGap(min uintptr) addrGapIterator { seg, gap := s.Find(min) if gap.Ok() { return gap } return seg.NextGap() } // UpperBoundGap returns the gap with the highest range that is less than or // equal to max. func (s *addrSet) UpperBoundGap(max uintptr) addrGapIterator { seg, gap := s.Find(max) if gap.Ok() { return gap } return seg.PrevGap() } // FirstLargeEnoughGap returns the first gap in the set with at least the given // length. If no such gap exists, FirstLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. 
func (s *addrSet) FirstLargeEnoughGap(minSize uintptr) addrGapIterator { if addrtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.FirstGap() if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // LastLargeEnoughGap returns the last gap in the set with at least the given // length. If no such gap exists, LastLargeEnoughGap returns a terminal // iterator. // // Precondition: trackGaps must be 1. func (s *addrSet) LastLargeEnoughGap(minSize uintptr) addrGapIterator { if addrtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LastGap() if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // LowerBoundLargeEnoughGap returns the first gap in the set with at least the // given length and whose range contains a key greater than or equal to min. If // no such gap exists, LowerBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *addrSet) LowerBoundLargeEnoughGap(min, minSize uintptr) addrGapIterator { if addrtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.LowerBoundGap(min) if gap.Range().Length() >= minSize { return gap } return gap.NextLargeEnoughGap(minSize) } // UpperBoundLargeEnoughGap returns the last gap in the set with at least the // given length and whose range contains a key less than or equal to max. If no // such gap exists, UpperBoundLargeEnoughGap returns a terminal iterator. // // Precondition: trackGaps must be 1. func (s *addrSet) UpperBoundLargeEnoughGap(max, minSize uintptr) addrGapIterator { if addrtrackGaps != 1 { panic("set is not tracking gaps") } gap := s.UpperBoundGap(max) if gap.Range().Length() >= minSize { return gap } return gap.PrevLargeEnoughGap(minSize) } // Insert inserts the given segment into the given gap. If the new segment can // be merged with adjacent segments, Insert will do so. Insert returns an // iterator to the segment containing the inserted value (which may have been // merged with other values). All existing iterators (including gap, but not // including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, Insert panics. // // Insert is semantically equivalent to a InsertWithoutMerging followed by a // Merge, but may be more efficient. Note that there is no unchecked variant of // Insert since Insert must retrieve and inspect gap's predecessor and // successor segments regardless. 
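//
// A sketch of the usual find-then-insert sequence (addr, size, and val are
// assumed to be caller-provided values):
//
//	if gap := s.FindGap(addr); gap.Ok() && gap.End() >= addr+size {
//		seg := s.Insert(gap, addrRange{addr, addr + size}, val)
//		// seg covers at least [addr, addr+size), possibly more after merging.
//	}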
func (s *addrSet) Insert(gap addrGapIterator, r addrRange, val *objectEncodeState) addrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } prev, next := gap.PrevSegment(), gap.NextSegment() if prev.Ok() && prev.End() > r.Start { panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) } if next.Ok() && next.Start() < r.End { panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) } if prev.Ok() && prev.End() == r.Start { if mval, ok := (addrSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { shrinkMaxGap := addrtrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() prev.SetEndUnchecked(r.End) prev.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } if next.Ok() && next.Start() == r.End { val = mval if mval, ok := (addrSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { prev.SetEndUnchecked(next.End()) prev.SetValue(mval) return s.Remove(next).PrevSegment() } } return prev } } if next.Ok() && next.Start() == r.End { if mval, ok := (addrSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { shrinkMaxGap := addrtrackGaps != 0 && gap.Range().Length() == gap.node.maxGap.Get() next.SetStartUnchecked(r.Start) next.SetValue(mval) if shrinkMaxGap { gap.node.updateMaxGapLeaf() } return next } } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMerging inserts the given segment into the given gap and // returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // If the gap cannot accommodate the segment, or if r is invalid, // InsertWithoutMerging panics. func (s *addrSet) InsertWithoutMerging(gap addrGapIterator, r addrRange, val *objectEncodeState) addrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if gr := gap.Range(); !gr.IsSupersetOf(r) { panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) } return s.InsertWithoutMergingUnchecked(gap, r, val) } // InsertWithoutMergingUnchecked inserts the given segment into the given gap // and returns an iterator to the inserted segment. All existing iterators // (including gap, but not including the returned iterator) are invalidated. // // Preconditions: // - r.Start >= gap.Start(). // - r.End <= gap.End(). func (s *addrSet) InsertWithoutMergingUnchecked(gap addrGapIterator, r addrRange, val *objectEncodeState) addrIterator { gap = gap.node.rebalanceBeforeInsert(gap) splitMaxGap := addrtrackGaps != 0 && (gap.node.nrSegments == 0 || gap.Range().Length() == gap.node.maxGap.Get()) copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) gap.node.keys[gap.index] = r gap.node.values[gap.index] = val gap.node.nrSegments++ if splitMaxGap { gap.node.updateMaxGapLeaf() } return addrIterator{gap.node, gap.index} } // InsertRange inserts the given segment into the set. If the new segment can // be merged with adjacent segments, InsertRange will do so. InsertRange // returns an iterator to the segment containing the inserted value (which may // have been merged with other values). All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertRange panics. // // InsertRange searches the set to find the gap to insert into. 
If the caller // already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *addrSet) InsertRange(r addrRange, val *objectEncodeState) addrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.Insert(gap, r, val) } // InsertWithoutMergingRange inserts the given segment into the set and returns // an iterator to the inserted segment. All existing iterators (excluding the // returned iterator) are invalidated. // // If the new segment would overlap an existing segment, or if r is invalid, // InsertWithoutMergingRange panics. // // InsertWithoutMergingRange searches the set to find the gap to insert into. // If the caller already has the appropriate GapIterator, or if the caller // needs to do additional work between finding the gap and insertion, use // InsertWithoutMerging instead. func (s *addrSet) InsertWithoutMergingRange(r addrRange, val *objectEncodeState) addrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, seg.Range())) } if gap.End() < r.End { panic(fmt.Sprintf("new segment %v overlaps existing segment %v", r, gap.NextSegment().Range())) } return s.InsertWithoutMerging(gap, r, val) } // TryInsertRange attempts to insert the given segment into the set. If the new // segment can be merged with adjacent segments, TryInsertRange will do so. // TryInsertRange returns an iterator to the segment containing the inserted // value (which may have been merged with other values). All existing iterators // (excluding the returned iterator) are invalidated. // // If the new segment would overlap an existing segment, TryInsertRange does // nothing and returns a terminal iterator. // // TryInsertRange searches the set to find the gap to insert into. If the // caller already has the appropriate GapIterator, or if the caller needs to do // additional work between finding the gap and insertion, use Insert instead. func (s *addrSet) TryInsertRange(r addrRange, val *objectEncodeState) addrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return addrIterator{} } if gap.End() < r.End { return addrIterator{} } return s.Insert(gap, r, val) } // TryInsertWithoutMergingRange attempts to insert the given segment into the // set. If successful, it returns an iterator to the inserted segment; all // existing iterators (excluding the returned iterator) are invalidated. If the // new segment would overlap an existing segment, TryInsertWithoutMergingRange // does nothing and returns a terminal iterator. // // TryInsertWithoutMergingRange searches the set to find the gap to insert // into. If the caller already has the appropriate GapIterator, or if the // caller needs to do additional work between finding the gap and insertion, // use InsertWithoutMerging instead. 
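//
// A caller would typically check the returned iterator, for example:
//
//	if seg := s.TryInsertWithoutMergingRange(r, val); !seg.Ok() {
//		// r overlapped an existing segment; nothing was inserted.
//	}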
func (s *addrSet) TryInsertWithoutMergingRange(r addrRange, val *objectEncodeState) addrIterator { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } seg, gap := s.Find(r.Start) if seg.Ok() { return addrIterator{} } if gap.End() < r.End { return addrIterator{} } return s.InsertWithoutMerging(gap, r, val) } // Remove removes the given segment and returns an iterator to the vacated gap. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. func (s *addrSet) Remove(seg addrIterator) addrGapIterator { if seg.node.hasChildren { victim := seg.PrevSegment() seg.SetRangeUnchecked(victim.Range()) seg.SetValue(victim.Value()) nextAdjacentNode := seg.NextSegment().node if addrtrackGaps != 0 { nextAdjacentNode.updateMaxGapLeaf() } return s.Remove(victim).NextGap() } copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) addrSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) seg.node.nrSegments-- if addrtrackGaps != 0 { seg.node.updateMaxGapLeaf() } return seg.node.rebalanceAfterRemove(addrGapIterator{seg.node, seg.index}) } // RemoveAll removes all segments from the set. All existing iterators are // invalidated. func (s *addrSet) RemoveAll() { s.root = addrnode{} } // RemoveRange removes all segments in the given range. An iterator to the // newly formed gap is returned, and all existing iterators are invalidated. // // RemoveRange searches the set to find segments to remove. If the caller // already has an iterator to either end of the range of segments to remove, or // if the caller needs to do additional work before removing each segment, // iterate segments and call Remove in a loop instead. func (s *addrSet) RemoveRange(r addrRange) addrGapIterator { seg, gap := s.Find(r.Start) if seg.Ok() { seg = s.Isolate(seg, r) gap = s.Remove(seg) } for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { seg = s.SplitAfter(seg, r.End) gap = s.Remove(seg) } return gap } // RemoveFullRange is equivalent to RemoveRange, except that if any key in the // given range does not correspond to a segment, RemoveFullRange panics. func (s *addrSet) RemoveFullRange(r addrRange) addrGapIterator { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) end := seg.End() gap := s.Remove(seg) if r.End <= end { return gap } seg = gap.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // Merge attempts to merge two neighboring segments. If successful, Merge // returns an iterator to the merged segment, and all existing iterators are // invalidated. Otherwise, Merge returns a terminal iterator. // // If first is not the predecessor of second, Merge panics. func (s *addrSet) Merge(first, second addrIterator) addrIterator { if first.NextSegment() != second { panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) } return s.MergeUnchecked(first, second) } // MergeUnchecked attempts to merge two neighboring segments. If successful, // MergeUnchecked returns an iterator to the merged segment, and all existing // iterators are invalidated. Otherwise, MergeUnchecked returns a terminal // iterator. 
// // Precondition: first is the predecessor of second: first.NextSegment() == // second, first == second.PrevSegment(). func (s *addrSet) MergeUnchecked(first, second addrIterator) addrIterator { if first.End() == second.Start() { if mval, ok := (addrSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { first.SetEndUnchecked(second.End()) first.SetValue(mval) return s.Remove(second).PrevSegment() } } return addrIterator{} } // MergePrev attempts to merge the given segment with its predecessor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergePrev is usually used when mutating segments while iterating them in // order of increasing keys, to attempt merging of each mutated segment with // its previously-mutated predecessor. In such cases, merging a mutated segment // with its unmutated successor would incorrectly cause the latter to be // skipped. func (s *addrSet) MergePrev(seg addrIterator) addrIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } return seg } // MergeNext attempts to merge the given segment with its successor if // possible, and returns an updated iterator to the extended segment. All // existing iterators (including seg, but not including the returned iterator) // are invalidated. // // MergeNext is usually used when mutating segments while iterating them in // order of decreasing keys, to attempt merging of each mutated segment with // its previously-mutated successor. In such cases, merging a mutated segment // with its unmutated predecessor would incorrectly cause the latter to be // skipped. func (s *addrSet) MergeNext(seg addrIterator) addrIterator { if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // Unisolate attempts to merge the given segment with its predecessor and // successor if possible, and returns an updated iterator to the extended // segment. All existing iterators (including seg, but not including the // returned iterator) are invalidated. // // Unisolate is usually used in conjunction with Isolate when mutating part of // a single segment in a way that may affect its mergeability. For the reasons // described by MergePrev and MergeNext, it is usually incorrect to use the // return value of Unisolate in a loop variable. func (s *addrSet) Unisolate(seg addrIterator) addrIterator { if prev := seg.PrevSegment(); prev.Ok() { if mseg := s.MergeUnchecked(prev, seg); mseg.Ok() { seg = mseg } } if next := seg.NextSegment(); next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg = mseg } } return seg } // MergeAll merges all mergeable adjacent segments in the set. All existing // iterators are invalidated. func (s *addrSet) MergeAll() { seg := s.FirstSegment() if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeInsideRange attempts to merge all adjacent segments that contain a key // in the specific range. All existing iterators are invalidated. 
// // MergeInsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid a redundant search. func (s *addrSet) MergeInsideRange(r addrRange) { seg := s.LowerBoundSegment(r.Start) if !seg.Ok() { return } next := seg.NextSegment() for next.Ok() && next.Start() < r.End { if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { seg, next = mseg, mseg.NextSegment() } else { seg, next = next, next.NextSegment() } } } // MergeOutsideRange attempts to merge the segment containing r.Start with its // predecessor, and the segment containing r.End-1 with its successor. // // MergeOutsideRange only makes sense after mutating the set in a way that may // change the mergeability of modified segments; callers should prefer to use // MergePrev or MergeNext during the mutating loop instead (depending on the // direction of iteration), in order to avoid two redundant searches. func (s *addrSet) MergeOutsideRange(r addrRange) { first := s.FindSegment(r.Start) if first.Ok() { if prev := first.PrevSegment(); prev.Ok() { s.Merge(prev, first) } } last := s.FindSegment(r.End - 1) if last.Ok() { if next := last.NextSegment(); next.Ok() { s.Merge(last, next) } } } // Split splits the given segment at the given key and returns iterators to the // two resulting segments. All existing iterators (including seg, but not // including the returned iterators) are invalidated. // // If the segment cannot be split at split (because split is at the start or // end of the segment's range, so splitting would produce a segment with zero // length, or because split falls outside the segment's range altogether), // Split panics. func (s *addrSet) Split(seg addrIterator, split uintptr) (addrIterator, addrIterator) { if !seg.Range().CanSplitAt(split) { panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) } return s.SplitUnchecked(seg, split) } // SplitUnchecked splits the given segment at the given key and returns // iterators to the two resulting segments. All existing iterators (including // seg, but not including the returned iterators) are invalidated. // // Preconditions: seg.Start() < key < seg.End(). func (s *addrSet) SplitUnchecked(seg addrIterator, split uintptr) (addrIterator, addrIterator) { val1, val2 := (addrSetFunctions{}).Split(seg.Range(), seg.Value(), split) end2 := seg.End() seg.SetEndUnchecked(split) seg.SetValue(val1) seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), addrRange{split, end2}, val2) return seg2.PrevSegment(), seg2 } // SplitBefore ensures that the given segment's start is at least start by // splitting at start if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterator) are invalidated. // // SplitBefore is usually when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, the first segment may // extend beyond the start of the range to be mutated, and needs to be // SplitBefore to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter; i.e. SplitBefore needs to be invoked on each segment, while // SplitAfter only needs to be invoked on the first. // // Preconditions: start < seg.End(). 
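// Illustrative usage sketch for Split (assumes an existing *addrSet s and a
// key addr that falls somewhere inside an existing segment):
//
//	if seg := s.FindSegment(addr); seg.Ok() && seg.Range().CanSplitAt(addr) {
//		left, right := s.Split(seg, addr)
//		// left ends at addr and right starts at addr; every other
//		// iterator into s is now invalid.
//		_, _ = left, right
//	}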
func (s *addrSet) SplitBefore(seg addrIterator, start uintptr) addrIterator { if seg.Range().CanSplitAt(start) { _, seg = s.SplitUnchecked(seg, start) } return seg } // SplitAfter ensures that the given segment's end is at most end by splitting // at end if necessary, and returns an updated iterator to the bounded segment. // All existing iterators (including seg, but not including the returned // iterator) are invalidated. // // SplitAfter is usually used when mutating segments in a range. In such cases, // when iterating segments in order of increasing keys, each iterated segment // may extend beyond the end of the range to be mutated, and needs to be // SplitAfter to ensure that only the part of the segment within the range is // mutated. When iterating segments in order of decreasing keys, SplitBefore // and SplitAfter exchange roles; i.e. SplitBefore needs to be invoked on each // segment, while SplitAfter only needs to be invoked on the first. // // Preconditions: seg.Start() < end. func (s *addrSet) SplitAfter(seg addrIterator, end uintptr) addrIterator { if seg.Range().CanSplitAt(end) { seg, _ = s.SplitUnchecked(seg, end) } return seg } // Isolate ensures that the given segment's range is a subset of r by splitting // at r.Start and r.End if necessary, and returns an updated iterator to the // bounded segment. All existing iterators (including seg, but not including // the returned iterators) are invalidated. // // Isolate is usually used when mutating part of a single segment, or when // mutating segments in a range where the first segment is not necessarily // split, making use of SplitBefore/SplitAfter complex. // // Preconditions: seg.Range().Overlaps(r). func (s *addrSet) Isolate(seg addrIterator, r addrRange) addrIterator { if seg.Range().CanSplitAt(r.Start) { _, seg = s.SplitUnchecked(seg, r.Start) } if seg.Range().CanSplitAt(r.End) { seg, _ = s.SplitUnchecked(seg, r.End) } return seg } // LowerBoundSegmentSplitBefore combines LowerBoundSegment and SplitBefore. // // LowerBoundSegmentSplitBefore is usually used when mutating segments in a // range while iterating them in order of increasing keys. In such cases, // LowerBoundSegmentSplitBefore provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *addrSet) LowerBoundSegmentSplitBefore(min uintptr) addrIterator { seg := s.LowerBoundSegment(min) if seg.Ok() { seg = s.SplitBefore(seg, min) } return seg } // UpperBoundSegmentSplitAfter combines UpperBoundSegment and SplitAfter. // // UpperBoundSegmentSplitAfter is usually used when mutating segments in a // range while iterating them in order of decreasing keys. In such cases, // UpperBoundSegmentSplitAfter provides an iterator to the first segment to be // mutated, suitable as the initial value for a loop variable. func (s *addrSet) UpperBoundSegmentSplitAfter(max uintptr) addrIterator { seg := s.UpperBoundSegment(max) if seg.Ok() { seg = s.SplitAfter(seg, max) } return seg } // VisitRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments will not be split, so f may be called // on segments lying partially outside r. Non-empty gaps between segments are // skipped. If a call to f returns false, VisitRange stops iteration // immediately. // // N.B. f must not invalidate iterators into s. 
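// Illustrative sketch of the increasing-key mutation idiom described above
// (this is essentially what MutateRange below does internally); s is assumed
// to be an existing *addrSet and r an addrRange:
//
//	seg := s.LowerBoundSegmentSplitBefore(r.Start)
//	for seg.Ok() && seg.Start() < r.End {
//		seg = s.SplitAfter(seg, r.End)
//		// ... mutate the value via seg.ValuePtr() or seg.SetValue() ...
//		seg = s.MergePrev(seg).NextSegment()
//	}
//	if seg.Ok() {
//		s.MergePrev(seg)
//	}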
func (s *addrSet) VisitRange(r addrRange, f func(seg addrIterator) bool) { for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { if !f(seg) { return } } } // VisitFullRange is equivalent to VisitRange, except that if any key in r that // is visited before f returns false does not correspond to a segment, // VisitFullRange panics. func (s *addrSet) VisitFullRange(r addrRange, f func(seg addrIterator) bool) { pos := r.Start seg := s.FindSegment(r.Start) for { if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", pos)) } if !f(seg) { return } pos = seg.End() if r.End <= pos { return } seg, _ = seg.NextNonEmpty() } } // MutateRange applies the function f to all segments intersecting the range r, // in order of ascending keys. Segments that lie partially outside r are split // before f is called, such that f only observes segments entirely within r. // Iterated segments are merged again after f is called. Non-empty gaps between // segments are skipped. If a call to f returns false, MutateRange stops // iteration immediately. // // MutateRange invalidates all existing iterators. // // N.B. f must not invalidate iterators into s. func (s *addrSet) MutateRange(r addrRange, f func(seg addrIterator) bool) { seg := s.LowerBoundSegmentSplitBefore(r.Start) for seg.Ok() && seg.Start() < r.End { seg = s.SplitAfter(seg, r.End) cont := f(seg) seg = s.MergePrev(seg) if !cont { s.MergeNext(seg) return } seg = seg.NextSegment() } if seg.Ok() { s.MergePrev(seg) } } // MutateFullRange is equivalent to MutateRange, except that if any key in r // that is visited before f returns false does not correspond to a segment, // MutateFullRange panics. func (s *addrSet) MutateFullRange(r addrRange, f func(seg addrIterator) bool) { seg := s.FindSegment(r.Start) if !seg.Ok() { panic(fmt.Sprintf("missing segment at %v", r.Start)) } seg = s.SplitBefore(seg, r.Start) for { seg = s.SplitAfter(seg, r.End) cont := f(seg) end := seg.End() seg = s.MergePrev(seg) if !cont || r.End <= end { s.MergeNext(seg) return } seg = seg.NextSegment() if !seg.Ok() || seg.Start() != end { panic(fmt.Sprintf("missing segment at %v", end)) } } } // +stateify savable type addrnode struct { // An internal binary tree node looks like: // // K // / \ // Cl Cr // // where all keys in the subtree rooted by Cl (the left subtree) are less // than K (the key of the parent node), and all keys in the subtree rooted // by Cr (the right subtree) are greater than K. // // An internal B-tree node's indexes work out to look like: // // K0 K1 K2 ... Kn-1 // / \/ \/ \ ... / \ // C0 C1 C2 C3 ... Cn-1 Cn // // where n is nrSegments. nrSegments int // parent is a pointer to this node's parent. If this node is root, parent // is nil. parent *addrnode // parentIndex is the index of this node in parent.children. parentIndex int // Flag for internal nodes that is technically redundant with "children[0] // != nil", but is stored in the first cache line. "hasChildren" rather // than "isLeaf" because false must be the correct value for an empty root. hasChildren bool // The longest gap within this node. If the node is a leaf, it's simply the // maximum gap among all the (nrSegments+1) gaps formed by its nrSegments keys // including the 0th and nrSegments-th gap possibly shared with its upper-level // nodes; if it's a non-leaf node, it's the max of all children's maxGap. maxGap addrdynamicGap // Nodes store keys and values in separate arrays to maximize locality in // the common case (scanning keys for lookup). 
keys [addrmaxDegree - 1]addrRange values [addrmaxDegree - 1]*objectEncodeState children [addrmaxDegree]*addrnode } // firstSegment returns the first segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *addrnode) firstSegment() addrIterator { for n.hasChildren { n = n.children[0] } return addrIterator{n, 0} } // lastSegment returns the last segment in the subtree rooted by n. // // Preconditions: n.nrSegments != 0. func (n *addrnode) lastSegment() addrIterator { for n.hasChildren { n = n.children[n.nrSegments] } return addrIterator{n, n.nrSegments - 1} } func (n *addrnode) prevSibling() *addrnode { if n.parent == nil || n.parentIndex == 0 { return nil } return n.parent.children[n.parentIndex-1] } func (n *addrnode) nextSibling() *addrnode { if n.parent == nil || n.parentIndex == n.parent.nrSegments { return nil } return n.parent.children[n.parentIndex+1] } // rebalanceBeforeInsert splits n and its ancestors if they are full, as // required for insertion, and returns an updated iterator to the position // represented by gap. func (n *addrnode) rebalanceBeforeInsert(gap addrGapIterator) addrGapIterator { if n.nrSegments < addrmaxDegree-1 { return gap } if n.parent != nil { gap = n.parent.rebalanceBeforeInsert(gap) } if n.parent == nil { left := &addrnode{ nrSegments: addrminDegree - 1, parent: n, parentIndex: 0, hasChildren: n.hasChildren, } right := &addrnode{ nrSegments: addrminDegree - 1, parent: n, parentIndex: 1, hasChildren: n.hasChildren, } copy(left.keys[:addrminDegree-1], n.keys[:addrminDegree-1]) copy(left.values[:addrminDegree-1], n.values[:addrminDegree-1]) copy(right.keys[:addrminDegree-1], n.keys[addrminDegree:]) copy(right.values[:addrminDegree-1], n.values[addrminDegree:]) n.keys[0], n.values[0] = n.keys[addrminDegree-1], n.values[addrminDegree-1] addrzeroValueSlice(n.values[1:]) if n.hasChildren { copy(left.children[:addrminDegree], n.children[:addrminDegree]) copy(right.children[:addrminDegree], n.children[addrminDegree:]) addrzeroNodeSlice(n.children[2:]) for i := 0; i < addrminDegree; i++ { left.children[i].parent = left left.children[i].parentIndex = i right.children[i].parent = right right.children[i].parentIndex = i } } n.nrSegments = 1 n.hasChildren = true n.children[0] = left n.children[1] = right if addrtrackGaps != 0 { left.updateMaxGapLocal() right.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < addrminDegree { return addrGapIterator{left, gap.index} } return addrGapIterator{right, gap.index - addrminDegree} } copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[addrminDegree-1], n.values[addrminDegree-1] copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { n.parent.children[i].parentIndex = i } sibling := &addrnode{ nrSegments: addrminDegree - 1, parent: n.parent, parentIndex: n.parentIndex + 1, hasChildren: n.hasChildren, } n.parent.children[n.parentIndex+1] = sibling n.parent.nrSegments++ copy(sibling.keys[:addrminDegree-1], n.keys[addrminDegree:]) copy(sibling.values[:addrminDegree-1], n.values[addrminDegree:]) addrzeroValueSlice(n.values[addrminDegree-1:]) if n.hasChildren { copy(sibling.children[:addrminDegree], n.children[addrminDegree:]) addrzeroNodeSlice(n.children[addrminDegree:]) for i := 0; i < 
addrminDegree; i++ { sibling.children[i].parent = sibling sibling.children[i].parentIndex = i } } n.nrSegments = addrminDegree - 1 if addrtrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node != n { return gap } if gap.index < addrminDegree { return gap } return addrGapIterator{sibling, gap.index - addrminDegree} } // rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient // (contain fewer segments than required by B-tree invariants), as required for // removal, and returns an updated iterator to the position represented by gap. // // Precondition: n is the only node in the tree that may currently violate a // B-tree invariant. func (n *addrnode) rebalanceAfterRemove(gap addrGapIterator) addrGapIterator { for { if n.nrSegments >= addrminDegree-1 { return gap } if n.parent == nil { return gap } if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= addrminDegree { copy(n.keys[1:], n.keys[:n.nrSegments]) copy(n.values[1:], n.values[:n.nrSegments]) n.keys[0] = n.parent.keys[n.parentIndex-1] n.values[0] = n.parent.values[n.parentIndex-1] n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] addrSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { copy(n.children[1:], n.children[:n.nrSegments+1]) n.children[0] = sibling.children[sibling.nrSegments] sibling.children[sibling.nrSegments] = nil n.children[0].parent = n n.children[0].parentIndex = 0 for i := 1; i < n.nrSegments+2; i++ { n.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if addrtrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling && gap.index == sibling.nrSegments { return addrGapIterator{n, 0} } if gap.node == n { return addrGapIterator{n, gap.index + 1} } return gap } if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= addrminDegree { n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] n.values[n.nrSegments] = n.parent.values[n.parentIndex] n.parent.keys[n.parentIndex] = sibling.keys[0] n.parent.values[n.parentIndex] = sibling.values[0] copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) addrSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) if n.hasChildren { n.children[n.nrSegments+1] = sibling.children[0] copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) sibling.children[sibling.nrSegments] = nil n.children[n.nrSegments+1].parent = n n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 for i := 0; i < sibling.nrSegments; i++ { sibling.children[i].parentIndex = i } } n.nrSegments++ sibling.nrSegments-- if addrtrackGaps != 0 { n.updateMaxGapLocal() sibling.updateMaxGapLocal() } if gap.node == sibling { if gap.index == 0 { return addrGapIterator{n, n.nrSegments} } return addrGapIterator{sibling, gap.index - 1} } return gap } p := n.parent if p.nrSegments == 1 { left, right := p.children[0], p.children[1] p.nrSegments = left.nrSegments + right.nrSegments + 1 p.hasChildren = left.hasChildren p.keys[left.nrSegments] = p.keys[0] p.values[left.nrSegments] = p.values[0] copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(p.children[:left.nrSegments+1], 
left.children[:left.nrSegments+1]) copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := 0; i < p.nrSegments+1; i++ { p.children[i].parent = p p.children[i].parentIndex = i } } else { p.children[0] = nil p.children[1] = nil } if gap.node == left { return addrGapIterator{p, gap.index} } if gap.node == right { return addrGapIterator{p, gap.index + left.nrSegments + 1} } return gap } // Merge n and either sibling, along with the segment separating the // two, into whichever of the two nodes comes first. This is the // reverse of the non-root splitting case in // node.rebalanceBeforeInsert. var left, right *addrnode if n.parentIndex > 0 { left = n.prevSibling() right = n } else { left = n right = n.nextSibling() } if gap.node == right { gap = addrGapIterator{left, gap.index + left.nrSegments + 1} } left.keys[left.nrSegments] = p.keys[left.parentIndex] left.values[left.nrSegments] = p.values[left.parentIndex] copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) if left.hasChildren { copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { left.children[i].parent = left left.children[i].parentIndex = i } } left.nrSegments += right.nrSegments + 1 copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) addrSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) for i := 0; i < p.nrSegments; i++ { p.children[i].parentIndex = i } p.children[p.nrSegments] = nil p.nrSegments-- if addrtrackGaps != 0 { left.updateMaxGapLocal() } n = p } } // updateMaxGapLeaf updates maxGap bottom-up from the calling leaf until no // necessary update. // // Preconditions: n must be a leaf node, trackGaps must be 1. func (n *addrnode) updateMaxGapLeaf() { if n.hasChildren { panic(fmt.Sprintf("updateMaxGapLeaf should always be called on leaf node: %v", n)) } max := n.calculateMaxGapLeaf() if max == n.maxGap.Get() { return } oldMax := n.maxGap.Get() n.maxGap.Set(max) if max > oldMax { for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() >= max { break } p.maxGap.Set(max) } return } for p := n.parent; p != nil; p = p.parent { if p.maxGap.Get() > oldMax { break } parentNewMax := p.calculateMaxGapInternal() if p.maxGap.Get() == parentNewMax { break } p.maxGap.Set(parentNewMax) } } // updateMaxGapLocal updates maxGap of the calling node solely with no // propagation to ancestor nodes. // // Precondition: trackGaps must be 1. func (n *addrnode) updateMaxGapLocal() { if !n.hasChildren { n.maxGap.Set(n.calculateMaxGapLeaf()) } else { n.maxGap.Set(n.calculateMaxGapInternal()) } } // calculateMaxGapLeaf iterates the gaps within a leaf node and calculate the // max. // // Preconditions: n must be a leaf node. func (n *addrnode) calculateMaxGapLeaf() uintptr { max := addrGapIterator{n, 0}.Range().Length() for i := 1; i <= n.nrSegments; i++ { if current := (addrGapIterator{n, i}).Range().Length(); current > max { max = current } } return max } // calculateMaxGapInternal iterates children's maxGap within an internal node n // and calculate the max. // // Preconditions: n must be a non-leaf node. 
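// Worked example for the maxGap bookkeeping above (illustrative values): a
// leaf node holding the segments [0x1000, 0x2000) and [0x3000, 0x4000), whose
// first gap starts at 0 and whose last gap ends at 0x10000, sees three gaps of
// lengths 0x1000, 0x1000, and 0xc000, so calculateMaxGapLeaf returns 0xc000.
// An internal node's maxGap, computed below, is simply the maximum of its
// children's maxGap values.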
func (n *addrnode) calculateMaxGapInternal() uintptr { max := n.children[0].maxGap.Get() for i := 1; i <= n.nrSegments; i++ { if current := n.children[i].maxGap.Get(); current > max { max = current } } return max } // searchFirstLargeEnoughGap returns the first gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *addrnode) searchFirstLargeEnoughGap(minSize uintptr) addrGapIterator { if n.maxGap.Get() < minSize { return addrGapIterator{} } if n.hasChildren { for i := 0; i <= n.nrSegments; i++ { if largeEnoughGap := n.children[i].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := 0; i <= n.nrSegments; i++ { currentGap := addrGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // searchLastLargeEnoughGap returns the last gap having at least minSize length // in the subtree rooted by n. If not found, return a terminal gap iterator. func (n *addrnode) searchLastLargeEnoughGap(minSize uintptr) addrGapIterator { if n.maxGap.Get() < minSize { return addrGapIterator{} } if n.hasChildren { for i := n.nrSegments; i >= 0; i-- { if largeEnoughGap := n.children[i].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } } else { for i := n.nrSegments; i >= 0; i-- { currentGap := addrGapIterator{n, i} if currentGap.Range().Length() >= minSize { return currentGap } } } panic(fmt.Sprintf("invalid maxGap in %v", n)) } // A Iterator is conceptually one of: // // - A pointer to a segment in a set; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Iterators are copyable values and are meaningfully equality-comparable. The // zero value of Iterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type addrIterator struct { // node is the node containing the iterated segment. If the iterator is // terminal, node is nil. node *addrnode // index is the index of the segment in node.keys/values. index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (seg addrIterator) Ok() bool { return seg.node != nil } // Range returns the iterated segment's range key. func (seg addrIterator) Range() addrRange { return seg.node.keys[seg.index] } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. func (seg addrIterator) Start() uintptr { return seg.node.keys[seg.index].Start } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (seg addrIterator) End() uintptr { return seg.node.keys[seg.index].End } // SetRangeUnchecked mutates the iterated segment's range key. This operation // does not invalidate any iterators. // // Preconditions: // - r.Length() > 0. // - The new range must not overlap an existing one: // - If seg.NextSegment().Ok(), then r.end <= seg.NextSegment().Start(). // - If seg.PrevSegment().Ok(), then r.start >= seg.PrevSegment().End(). func (seg addrIterator) SetRangeUnchecked(r addrRange) { seg.node.keys[seg.index] = r } // SetRange mutates the iterated segment's range key. If the new range would // cause the iterated segment to overlap another segment, or if the new range // is invalid, SetRange panics. This operation does not invalidate any // iterators. 
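// Illustrative sketch of basic iterator traversal (assumes an existing
// *addrSet s and an addrRange r); the first loop is the same pattern used by
// ExportSlice and segmentTestCheck below:
//
//	// Visit every segment in ascending key order.
//	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
//		_ = seg.Range()
//		_ = seg.Value()
//	}
//
//	// Visit only segments intersecting r.
//	for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() {
//		_ = seg
//	}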
func (seg addrIterator) SetRange(r addrRange) { if r.Length() <= 0 { panic(fmt.Sprintf("invalid segment range %v", r)) } if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) } if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) } seg.SetRangeUnchecked(r) } // SetStartUnchecked mutates the iterated segment's start. This operation does // not invalidate any iterators. // // Preconditions: The new start must be valid: // - start < seg.End() // - If seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). func (seg addrIterator) SetStartUnchecked(start uintptr) { seg.node.keys[seg.index].Start = start } // SetStart mutates the iterated segment's start. If the new start value would // cause the iterated segment to overlap another segment, or would result in an // invalid range, SetStart panics. This operation does not invalidate any // iterators. func (seg addrIterator) SetStart(start uintptr) { if start >= seg.End() { panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) } if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) } seg.SetStartUnchecked(start) } // SetEndUnchecked mutates the iterated segment's end. This operation does not // invalidate any iterators. // // Preconditions: The new end must be valid: // - end > seg.Start(). // - If seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). func (seg addrIterator) SetEndUnchecked(end uintptr) { seg.node.keys[seg.index].End = end } // SetEnd mutates the iterated segment's end. If the new end value would cause // the iterated segment to overlap another segment, or would result in an // invalid range, SetEnd panics. This operation does not invalidate any // iterators. func (seg addrIterator) SetEnd(end uintptr) { if end <= seg.Start() { panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) } if next := seg.NextSegment(); next.Ok() && end > next.Start() { panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) } seg.SetEndUnchecked(end) } // Value returns a copy of the iterated segment's value. func (seg addrIterator) Value() *objectEncodeState { return seg.node.values[seg.index] } // ValuePtr returns a pointer to the iterated segment's value. The pointer is // invalidated if the iterator is invalidated. This operation does not // invalidate any iterators. func (seg addrIterator) ValuePtr() **objectEncodeState { return &seg.node.values[seg.index] } // SetValue mutates the iterated segment's value. This operation does not // invalidate any iterators. func (seg addrIterator) SetValue(val *objectEncodeState) { seg.node.values[seg.index] = val } // PrevSegment returns the iterated segment's predecessor. If there is no // preceding segment, PrevSegment returns a terminal iterator. func (seg addrIterator) PrevSegment() addrIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment() } if seg.index > 0 { return addrIterator{seg.node, seg.index - 1} } if seg.node.parent == nil { return addrIterator{} } return addrsegmentBeforePosition(seg.node.parent, seg.node.parentIndex) } // NextSegment returns the iterated segment's successor. 
If there is no // succeeding segment, NextSegment returns a terminal iterator. func (seg addrIterator) NextSegment() addrIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment() } if seg.index < seg.node.nrSegments-1 { return addrIterator{seg.node, seg.index + 1} } if seg.node.parent == nil { return addrIterator{} } return addrsegmentAfterPosition(seg.node.parent, seg.node.parentIndex) } // PrevGap returns the gap immediately before the iterated segment. func (seg addrIterator) PrevGap() addrGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index].lastSegment().NextGap() } return addrGapIterator{seg.node, seg.index} } // NextGap returns the gap immediately after the iterated segment. func (seg addrIterator) NextGap() addrGapIterator { if seg.node.hasChildren { return seg.node.children[seg.index+1].firstSegment().PrevGap() } return addrGapIterator{seg.node, seg.index + 1} } // PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, // or the gap before the iterated segment otherwise. If seg.Start() == // Functions.MinKey(), PrevNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by PrevNonEmpty will be // non-terminal. func (seg addrIterator) PrevNonEmpty() (addrIterator, addrGapIterator) { if prev := seg.PrevSegment(); prev.Ok() && prev.End() == seg.Start() { return prev, addrGapIterator{} } return addrIterator{}, seg.PrevGap() } // NextNonEmpty returns the iterated segment's successor if it is adjacent, or // the gap after the iterated segment otherwise. If seg.End() == // Functions.MaxKey(), NextNonEmpty will return two terminal iterators. // Otherwise, exactly one of the iterators returned by NextNonEmpty will be // non-terminal. func (seg addrIterator) NextNonEmpty() (addrIterator, addrGapIterator) { if next := seg.NextSegment(); next.Ok() && next.Start() == seg.End() { return next, addrGapIterator{} } return addrIterator{}, seg.NextGap() } // A GapIterator is conceptually one of: // // - A pointer to a position between two segments, before the first segment, or // after the last segment in a set, called a *gap*; or // // - A terminal iterator, which is a sentinel indicating that the end of // iteration has been reached. // // Note that the gap between two adjacent segments exists (iterators to it are // non-terminal), but has a length of zero. GapIterator.IsEmpty returns true // for such gaps. An empty set contains a single gap, spanning the entire range // of the set's keys. // // GapIterators are copyable values and are meaningfully equality-comparable. // The zero value of GapIterator is a terminal iterator. // // Unless otherwise specified, any mutation of a set invalidates all existing // iterators into the set. type addrGapIterator struct { // The representation of a GapIterator is identical to that of an Iterator, // except that index corresponds to positions between segments in the same // way as for node.children (see comment for node.nrSegments). node *addrnode index int } // Ok returns true if the iterator is not terminal. All other methods are only // valid for non-terminal iterators. func (gap addrGapIterator) Ok() bool { return gap.node != nil } // Range returns the range spanned by the iterated gap. func (gap addrGapIterator) Range() addrRange { return addrRange{gap.Start(), gap.End()} } // Start is equivalent to Range().Start, but should be preferred if only the // start of the range is needed. 
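// Illustrative sketch of walking segments and the gaps between them using
// NextNonEmpty (the same pattern VisitFullRange uses above); s is assumed to
// be an existing *addrSet and start a key at which a segment exists:
//
//	seg := s.FindSegment(start)
//	for seg.Ok() {
//		// ... examine seg ...
//		var gap addrGapIterator
//		seg, gap = seg.NextNonEmpty()
//		if gap.Ok() {
//			// A hole of gap.Range().Length() bytes precedes the next
//			// segment (if any).
//			seg = gap.NextSegment()
//		}
//	}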
func (gap addrGapIterator) Start() uintptr { if ps := gap.PrevSegment(); ps.Ok() { return ps.End() } return addrSetFunctions{}.MinKey() } // End is equivalent to Range().End, but should be preferred if only the end of // the range is needed. func (gap addrGapIterator) End() uintptr { if ns := gap.NextSegment(); ns.Ok() { return ns.Start() } return addrSetFunctions{}.MaxKey() } // IsEmpty returns true if the iterated gap is empty (that is, the "gap" is // between two adjacent segments.) func (gap addrGapIterator) IsEmpty() bool { return gap.Range().Length() == 0 } // PrevSegment returns the segment immediately before the iterated gap. If no // such segment exists, PrevSegment returns a terminal iterator. func (gap addrGapIterator) PrevSegment() addrIterator { return addrsegmentBeforePosition(gap.node, gap.index) } // NextSegment returns the segment immediately after the iterated gap. If no // such segment exists, NextSegment returns a terminal iterator. func (gap addrGapIterator) NextSegment() addrIterator { return addrsegmentAfterPosition(gap.node, gap.index) } // PrevGap returns the iterated gap's predecessor. If no such gap exists, // PrevGap returns a terminal iterator. func (gap addrGapIterator) PrevGap() addrGapIterator { seg := gap.PrevSegment() if !seg.Ok() { return addrGapIterator{} } return seg.PrevGap() } // NextGap returns the iterated gap's successor. If no such gap exists, NextGap // returns a terminal iterator. func (gap addrGapIterator) NextGap() addrGapIterator { seg := gap.NextSegment() if !seg.Ok() { return addrGapIterator{} } return seg.NextGap() } // NextLargeEnoughGap returns the iterated gap's first next gap with larger // length than minSize. If not found, return a terminal gap iterator (does NOT // include this gap itself). // // Precondition: trackGaps must be 1. func (gap addrGapIterator) NextLargeEnoughGap(minSize uintptr) addrGapIterator { if addrtrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == gap.node.nrSegments { gap.node = gap.NextSegment().node gap.index = 0 return gap.nextLargeEnoughGapHelper(minSize) } return gap.nextLargeEnoughGapHelper(minSize) } // nextLargeEnoughGapHelper is the helper function used by NextLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the trailing gap of a non-leaf node. func (gap addrGapIterator) nextLargeEnoughGapHelper(minSize uintptr) addrGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == gap.node.nrSegments)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return addrGapIterator{} } gap.index++ for gap.index <= gap.node.nrSegments { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchFirstLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index++ } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == gap.node.nrSegments { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // PrevLargeEnoughGap returns the iterated gap's first prev gap with larger or // equal length than minSize. If not found, return a terminal gap iterator // (does NOT include this gap itself). // // Precondition: trackGaps must be 1. 
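// Illustrative first-fit search sketch using the gap queries above. This only
// applies if this instantiation of the template was generated with gap
// tracking enabled (addrtrackGaps == 1); otherwise NextLargeEnoughGap panics.
// s is assumed to be an existing *addrSet and size a uintptr:
//
//	gap := s.FirstGap()
//	if gap.Range().Length() < size {
//		gap = gap.NextLargeEnoughGap(size)
//	}
//	if gap.Ok() {
//		// [gap.Start(), gap.Start()+size) does not overlap any segment.
//	}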
func (gap addrGapIterator) PrevLargeEnoughGap(minSize uintptr) addrGapIterator { if addrtrackGaps != 1 { panic("set is not tracking gaps") } if gap.node != nil && gap.node.hasChildren && gap.index == 0 { gap.node = gap.PrevSegment().node gap.index = gap.node.nrSegments return gap.prevLargeEnoughGapHelper(minSize) } return gap.prevLargeEnoughGapHelper(minSize) } // prevLargeEnoughGapHelper is the helper function used by PrevLargeEnoughGap // to do the real recursions. // // Preconditions: gap is NOT the first gap of a non-leaf node. func (gap addrGapIterator) prevLargeEnoughGapHelper(minSize uintptr) addrGapIterator { for { for gap.node != nil && (gap.node.maxGap.Get() < minSize || (!gap.node.hasChildren && gap.index == 0)) { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } if gap.node == nil { return addrGapIterator{} } gap.index-- for gap.index >= 0 { if gap.node.hasChildren { if largeEnoughGap := gap.node.children[gap.index].searchLastLargeEnoughGap(minSize); largeEnoughGap.Ok() { return largeEnoughGap } } else { if gap.Range().Length() >= minSize { return gap } } gap.index-- } gap.node, gap.index = gap.node.parent, gap.node.parentIndex if gap.node != nil && gap.index == 0 { gap.node, gap.index = gap.node.parent, gap.node.parentIndex } } } // segmentBeforePosition returns the predecessor segment of the position given // by n.children[i], which may or may not contain a child. If no such segment // exists, segmentBeforePosition returns a terminal iterator. func addrsegmentBeforePosition(n *addrnode, i int) addrIterator { for i == 0 { if n.parent == nil { return addrIterator{} } n, i = n.parent, n.parentIndex } return addrIterator{n, i - 1} } // segmentAfterPosition returns the successor segment of the position given by // n.children[i], which may or may not contain a child. If no such segment // exists, segmentAfterPosition returns a terminal iterator. func addrsegmentAfterPosition(n *addrnode, i int) addrIterator { for i == n.nrSegments { if n.parent == nil { return addrIterator{} } n, i = n.parent, n.parentIndex } return addrIterator{n, i} } func addrzeroValueSlice(slice []*objectEncodeState) { for i := range slice { addrSetFunctions{}.ClearValue(&slice[i]) } } func addrzeroNodeSlice(slice []*addrnode) { for i := range slice { slice[i] = nil } } // String stringifies a Set for debugging. func (s *addrSet) String() string { return s.root.String() } // String stringifies a node (and all of its children) for debugging. 
func (n *addrnode) String() string { var buf bytes.Buffer n.writeDebugString(&buf, "") return buf.String() } func (n *addrnode) writeDebugString(buf *bytes.Buffer, prefix string) { if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { buf.WriteString(prefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) } for i := 0; i < n.nrSegments; i++ { if child := n.children[i]; child != nil { cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) if child.parent != n || child.parentIndex != i { buf.WriteString(cprefix) buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) } child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) } buf.WriteString(prefix) if n.hasChildren { if addrtrackGaps != 0 { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v, maxGap: %d\n", i, n.keys[i], n.values[i], n.maxGap.Get())) } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } else { buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) } } if child := n.children[n.nrSegments]; child != nil { child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) } } // FlatSegment represents a segment as a single object. FlatSegment is used as // an intermediate representation for save/restore and tests. // // +stateify savable type addrFlatSegment struct { Start uintptr End uintptr Value *objectEncodeState } // ExportSlice returns a copy of all segments in the given set, in ascending // key order. func (s *addrSet) ExportSlice() []addrFlatSegment { var fs []addrFlatSegment for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { fs = append(fs, addrFlatSegment{ Start: seg.Start(), End: seg.End(), Value: seg.Value(), }) } return fs } // ImportSlice initializes the given set from the given slice. // // Preconditions: // - s must be empty. // - fs must represent a valid set (the segments in fs must have valid // lengths that do not overlap). // - The segments in fs must be sorted in ascending key order. func (s *addrSet) ImportSlice(fs []addrFlatSegment) error { if !s.IsEmpty() { return fmt.Errorf("cannot import into non-empty set %v", s) } gap := s.FirstGap() for i := range fs { f := &fs[i] r := addrRange{f.Start, f.End} if !gap.Range().IsSupersetOf(r) { return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: %v => %v", r, f.Value) } gap = s.InsertWithoutMerging(gap, r, f.Value).NextGap() } return nil } // segmentTestCheck returns an error if s is incorrectly sorted, does not // contain exactly expectedSegments segments, or contains a segment which // fails the passed check. // // This should be used only for testing, and has been added to this package for // templating convenience. 
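// Illustrative sketch of the save/restore helpers above (src is assumed to be
// an existing, populated *addrSet):
//
//	fs := src.ExportSlice() // all segments, in ascending key order
//	var dst addrSet
//	if err := dst.ImportSlice(fs); err != nil {
//		// fs overlapped, was incorrectly sorted, or dst was not empty.
//	}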
func (s *addrSet) segmentTestCheck(expectedSegments int, segFunc func(int, addrRange, *objectEncodeState) error) error { havePrev := false prev := uintptr(0) nrSegments := 0 for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { next := seg.Start() if havePrev && prev >= next { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if segFunc != nil { if err := segFunc(nrSegments, seg.Range(), seg.Value()); err != nil { return err } } prev = next havePrev = true nrSegments++ } if nrSegments != expectedSegments { return fmt.Errorf("incorrect number of segments: got %d, wanted %d", nrSegments, expectedSegments) } return nil } // countSegments counts the number of segments in the set. // // Similar to Check, this should only be used for testing. func (s *addrSet) countSegments() (segments int) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { segments++ } return segments } func (s *addrSet) saveRoot() []addrFlatSegment { fs := s.ExportSlice() fs = fs[:len(fs):len(fs)] return fs } func (s *addrSet) loadRoot(_ context.Context, fs []addrFlatSegment) { if err := s.ImportSlice(fs); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/complete_list.go000066400000000000000000000124161465435605700237420ustar00rootroot00000000000000package state // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type completeElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (completeElementMapper) linkerFor(elem *objectDecodeState) *objectDecodeState { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type completeList struct { head *objectDecodeState tail *objectDecodeState } // Reset resets list l to the empty state. func (l *completeList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *completeList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *completeList) Front() *objectDecodeState { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *completeList) Back() *objectDecodeState { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *completeList) Len() (count int) { for e := l.Front(); e != nil; e = (completeElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *completeList) PushFront(e *objectDecodeState) { linker := completeElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { completeElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. 
// //go:nosplit func (l *completeList) PushFrontList(m *completeList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { completeElementMapper{}.linkerFor(l.head).SetPrev(m.tail) completeElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *completeList) PushBack(e *objectDecodeState) { linker := completeElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { completeElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *completeList) PushBackList(m *completeList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { completeElementMapper{}.linkerFor(l.tail).SetNext(m.head) completeElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *completeList) InsertAfter(b, e *objectDecodeState) { bLinker := completeElementMapper{}.linkerFor(b) eLinker := completeElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { completeElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *completeList) InsertBefore(a, e *objectDecodeState) { aLinker := completeElementMapper{}.linkerFor(a) eLinker := completeElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { completeElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *completeList) Remove(e *objectDecodeState) { linker := completeElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { completeElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { completeElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type completeEntry struct { next *objectDecodeState prev *objectDecodeState } // Next returns the entry that follows e in the list. // //go:nosplit func (e *completeEntry) Next() *objectDecodeState { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *completeEntry) Prev() *objectDecodeState { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *completeEntry) SetNext(elem *objectDecodeState) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *completeEntry) SetPrev(elem *objectDecodeState) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/decode.go000066400000000000000000000540371465435605700223270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package state import ( "bytes" "context" "fmt" "io" "math" "reflect" "gvisor.dev/gvisor/pkg/state/wire" ) // internalCallback is a interface called on object completion. // // There are two implementations: objectDecodeState & userCallback. type internalCallback interface { // source returns the dependent object. May be nil. source() *objectDecodeState // callbackRun executes the callback. callbackRun() } // userCallback is an implementation of internalCallback. type userCallback func() // source implements internalCallback.source. func (userCallback) source() *objectDecodeState { return nil } // callbackRun implements internalCallback.callbackRun. func (uc userCallback) callbackRun() { uc() } // objectDecodeState represents an object that may be in the process of being // decoded. Specifically, it represents either a decoded object, or an an // interest in a future object that will be decoded. When that interest is // registered (via register), the storage for the object will be created, but // it will not be decoded until the object is encountered in the stream. type objectDecodeState struct { // id is the id for this object. id objectID // typ is the id for this typeID. This may be zero if this is not a // type-registered structure. typ typeID // obj is the object. This may or may not be valid yet, depending on // whether complete returns true. However, regardless of whether the // object is valid, obj contains a final storage location for the // object. This is immutable. // // Note that this must be addressable (obj.Addr() must not panic). // // The obj passed to the decode methods below will equal this obj only // in the case of decoding the top-level object. However, the passed // obj may represent individual fields, elements of a slice, etc. that // are effectively embedded within the reflect.Value below but with // distinct types. obj reflect.Value // blockedBy is the number of dependencies this object has. blockedBy int // callbacksInline is inline storage for callbacks. callbacksInline [2]internalCallback // callbacks is a set of callbacks to execute on load. callbacks []internalCallback completeEntry } // addCallback adds a callback to the objectDecodeState. func (ods *objectDecodeState) addCallback(ic internalCallback) { if ods.callbacks == nil { ods.callbacks = ods.callbacksInline[:0] } ods.callbacks = append(ods.callbacks, ic) } // findCycleFor returns when the given object is found in the blocking set. func (ods *objectDecodeState) findCycleFor(target *objectDecodeState) []*objectDecodeState { for _, ic := range ods.callbacks { other := ic.source() if other != nil && other == target { return []*objectDecodeState{target} } else if childList := other.findCycleFor(target); childList != nil { return append(childList, other) } } // This should not occur. Failf("no deadlock found?") panic("unreachable") } // findCycle finds a dependency cycle. func (ods *objectDecodeState) findCycle() []*objectDecodeState { return append(ods.findCycleFor(ods), ods) } // source implements internalCallback.source. 
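// Illustrative sketch of how load-time callbacks are queued (this mirrors what
// objectDecoder.afterLoad does below): a plain func() is wrapped as a
// userCallback, whose source() is nil so it never blocks another object, and
// it runs once the owning object has no remaining dependencies (see
// decodeState.checkComplete below).
//
//	ods.addCallback(userCallback(func() {
//		// Runs only after ods.blockedBy has dropped to zero.
//	}))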
func (ods *objectDecodeState) source() *objectDecodeState { return ods } // callbackRun implements internalCallback.callbackRun. func (ods *objectDecodeState) callbackRun() { ods.blockedBy-- } // decodeState is a graph of objects in the process of being decoded. // // The decode process involves loading the breadth-first graph generated by // encode. This graph is read in it's entirety, ensuring that all object // storage is complete. // // As the graph is being serialized, a set of completion callbacks are // executed. These completion callbacks should form a set of acyclic subgraphs // over the original one. After decoding is complete, the objects are scanned // to ensure that all callbacks are executed, otherwise the callback graph was // not acyclic. type decodeState struct { // ctx is the decode context. ctx context.Context // r is the input stream. r io.Reader // types is the type database. types typeDecodeDatabase // objectByID is the set of objects in progress. objectsByID []*objectDecodeState // deferred are objects that have been read, by no interest has been // registered yet. These will be decoded once interest in registered. deferred map[objectID]wire.Object // pending is the set of objects that are not yet complete. pending completeList // stats tracks time data. stats Stats } // lookup looks up an object in decodeState or returns nil if no such object // has been previously registered. func (ds *decodeState) lookup(id objectID) *objectDecodeState { if len(ds.objectsByID) < int(id) { return nil } return ds.objectsByID[id-1] } // checkComplete checks for completion. func (ds *decodeState) checkComplete(ods *objectDecodeState) bool { // Still blocked? if ods.blockedBy > 0 { return false } // Track stats if relevant. if ods.callbacks != nil && ods.typ != 0 { ds.stats.start(ods.typ) defer ds.stats.done() } // Fire all callbacks. for _, ic := range ods.callbacks { ic.callbackRun() } // Mark completed. cbs := ods.callbacks ods.callbacks = nil ds.pending.Remove(ods) // Recursively check others. for _, ic := range cbs { if other := ic.source(); other != nil && other.blockedBy == 0 { ds.checkComplete(other) } } return true // All set. } // wait registers a dependency on an object. // // As a special case, we always allow _useable_ references back to the first // decoding object because it may have fields that are already decoded. We also // allow trivial self reference, since they can be handled internally. func (ds *decodeState) wait(waiter *objectDecodeState, id objectID, callback func()) { switch id { case waiter.id: // Trivial self reference. fallthrough case 1: // Root object; see above. if callback != nil { callback() } return } // Mark as blocked. waiter.blockedBy++ // No nil can be returned here. other := ds.lookup(id) if callback != nil { // Add the additional user callback. other.addCallback(userCallback(callback)) } // Mark waiter as unblocked. other.addCallback(waiter) } // waitObject notes a blocking relationship. func (ds *decodeState) waitObject(ods *objectDecodeState, encoded wire.Object, callback func()) { if rv, ok := encoded.(*wire.Ref); ok && rv.Root != 0 { // Refs can encode pointers and maps. ds.wait(ods, objectID(rv.Root), callback) } else if sv, ok := encoded.(*wire.Slice); ok && sv.Ref.Root != 0 { // See decodeObject; we need to wait for the array (if non-nil). ds.wait(ods, objectID(sv.Ref.Root), callback) } else if iv, ok := encoded.(*wire.Interface); ok { // It's an interface (wait recursively). 
ds.waitObject(ods, iv.Value, callback) } else if callback != nil { // Nothing to wait for: execute the callback immediately. callback() } } // walkChild returns a child object from obj, given an accessor path. This is // the decode-side equivalent to traverse in encode.go. // // For the purposes of this function, a child object is either a field within a // struct or an array element, with one such indirection per element in // path. The returned value may be an unexported field, so it may not be // directly assignable. See decode_unsafe.go. func walkChild(path []wire.Dot, obj reflect.Value) reflect.Value { // See wire.Ref.Dots. The path here is specified in reverse order. for i := len(path) - 1; i >= 0; i-- { switch pc := path[i].(type) { case *wire.FieldName: // Must be a pointer. if obj.Kind() != reflect.Struct { Failf("next component in child path is a field name, but the current object is not a struct. Path: %v, current obj: %#v", path, obj) } obj = obj.FieldByName(string(*pc)) case wire.Index: // Embedded. if obj.Kind() != reflect.Array { Failf("next component in child path is an array index, but the current object is not an array. Path: %v, current obj: %#v", path, obj) } obj = obj.Index(int(pc)) default: panic("unreachable: switch should be exhaustive") } } return obj } // register registers a decode with a type. // // This type is only used to instantiate a new object if it has not been // registered previously. This depends on the type provided if none is // available in the object itself. func (ds *decodeState) register(r *wire.Ref, typ reflect.Type) reflect.Value { // Grow the objectsByID slice. id := objectID(r.Root) if len(ds.objectsByID) < int(id) { ds.objectsByID = append(ds.objectsByID, make([]*objectDecodeState, int(id)-len(ds.objectsByID))...) } // Does this object already exist? ods := ds.objectsByID[id-1] if ods != nil { return walkChild(r.Dots, ods.obj) } // Create the object. if len(r.Dots) != 0 { typ = ds.findType(r.Type) } v := reflect.New(typ) ods = &objectDecodeState{ id: id, obj: v.Elem(), } ds.objectsByID[id-1] = ods ds.pending.PushBack(ods) // Process any deferred objects & callbacks. if encoded, ok := ds.deferred[id]; ok { delete(ds.deferred, id) ds.decodeObject(ods, ods.obj, encoded) } return walkChild(r.Dots, ods.obj) } // objectDecoder is for decoding structs. type objectDecoder struct { // ds is decodeState. ds *decodeState // ods is current object being decoded. ods *objectDecodeState // reconciledTypeEntry is the reconciled type information. rte *reconciledTypeEntry // encoded is the encoded object state. encoded *wire.Struct } // load is helper for the public methods on Source. func (od *objectDecoder) load(slot int, objPtr reflect.Value, wait bool, fn func()) { // Note that we have reconciled the type and may remap the fields here // to match what's expected by the decoder. The "slot" parameter here // is in terms of the local type, where the fields in the encoded // object are in terms of the wire object's type, which might be in a // different order (but will have the same fields). v := *od.encoded.Field(od.rte.FieldOrder[slot]) od.ds.decodeObject(od.ods, objPtr.Elem(), v) if wait { // Mark this individual object a blocker. od.ds.waitObject(od.ods, v, fn) } } // aterLoad implements Source.AfterLoad. func (od *objectDecoder) afterLoad(fn func()) { // Queue the local callback; this will execute when all of the above // data dependencies have been cleared. od.ods.addCallback(userCallback(fn)) } // decodeStruct decodes a struct value. 
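// An illustrative sketch of how load slots map to wire fields (hedged: the
// type X, its field names, and the FieldOrder values below are made up). If a
// struct was encoded with fields in wire order {"B", "A"} while the local
// StateFields() declares {"A", "B"}, the reconciled type entry carries
// FieldOrder = [1, 0], so each Source.Load call still reaches the right field:
//
//	func (x *X) StateLoad(ctx context.Context, m Source) {
//		m.Load(0, &x.A) // Slot 0 reads wire field FieldOrder[0] == 1.
//		m.Load(1, &x.B) // Slot 1 reads wire field FieldOrder[1] == 0.
//	}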
func (ds *decodeState) decodeStruct(ods *objectDecodeState, obj reflect.Value, encoded *wire.Struct) { if encoded.TypeID == 0 { // Allow anonymous empty structs, but only if the encoded // object also has no fields. if encoded.Fields() == 0 && obj.NumField() == 0 { return } // Propagate an error. Failf("empty struct on wire %#v has field mismatch with type %q", encoded, obj.Type().Name()) } // Lookup the object type. rte := ds.types.Lookup(typeID(encoded.TypeID), obj.Type()) ods.typ = typeID(encoded.TypeID) // Invoke the loader. od := objectDecoder{ ds: ds, ods: ods, rte: rte, encoded: encoded, } ds.stats.start(ods.typ) defer ds.stats.done() if sl, ok := obj.Addr().Interface().(SaverLoader); ok { // Note: may be a registered empty struct which does not // implement the saver/loader interfaces. sl.StateLoad(ds.ctx, Source{internal: od}) } } // decodeMap decodes a map value. func (ds *decodeState) decodeMap(ods *objectDecodeState, obj reflect.Value, encoded *wire.Map) { if obj.IsNil() { // See pointerTo. obj.Set(reflect.MakeMap(obj.Type())) } for i := 0; i < len(encoded.Keys); i++ { // Decode the objects. kv := reflect.New(obj.Type().Key()).Elem() vv := reflect.New(obj.Type().Elem()).Elem() ds.decodeObject(ods, kv, encoded.Keys[i]) ds.decodeObject(ods, vv, encoded.Values[i]) ds.waitObject(ods, encoded.Keys[i], nil) ds.waitObject(ods, encoded.Values[i], nil) // Set in the map. obj.SetMapIndex(kv, vv) } } // decodeArray decodes an array value. func (ds *decodeState) decodeArray(ods *objectDecodeState, obj reflect.Value, encoded *wire.Array) { if len(encoded.Contents) != obj.Len() { Failf("mismatching array length expect=%d, actual=%d", obj.Len(), len(encoded.Contents)) } // Decode the contents into the array. for i := 0; i < len(encoded.Contents); i++ { ds.decodeObject(ods, obj.Index(i), encoded.Contents[i]) ds.waitObject(ods, encoded.Contents[i], nil) } } // findType finds the type for the given wire.TypeSpecs. func (ds *decodeState) findType(t wire.TypeSpec) reflect.Type { switch x := t.(type) { case wire.TypeID: typ := ds.types.LookupType(typeID(x)) rte := ds.types.Lookup(typeID(x), typ) return rte.LocalType case *wire.TypeSpecPointer: return reflect.PtrTo(ds.findType(x.Type)) case *wire.TypeSpecArray: return reflect.ArrayOf(int(x.Count), ds.findType(x.Type)) case *wire.TypeSpecSlice: return reflect.SliceOf(ds.findType(x.Type)) case *wire.TypeSpecMap: return reflect.MapOf(ds.findType(x.Key), ds.findType(x.Value)) default: // Should not happen. Failf("unknown type %#v", t) } panic("unreachable") } // decodeInterface decodes an interface value. func (ds *decodeState) decodeInterface(ods *objectDecodeState, obj reflect.Value, encoded *wire.Interface) { if _, ok := encoded.Type.(wire.TypeSpecNil); ok { // Special case; the nil object. Just decode directly, which // will read nil from the wire (if encoded correctly). ds.decodeObject(ods, obj, encoded.Value) return } // We now need to resolve the actual type. typ := ds.findType(encoded.Type) // We need to imbue type information here, then we can proceed to // decode normally. In order to avoid issues with setting value-types, // we create a new non-interface version of this object. We will then // set the interface object to be equal to whatever we decode. origObj := obj obj = reflect.New(typ).Elem() defer origObj.Set(obj) // With the object now having sufficient type information to actually // have Set called on it, we can proceed to decode the value. 
ds.decodeObject(ods, obj, encoded.Value) } // isFloatEq determines if x and y represent the same value. func isFloatEq(x float64, y float64) bool { switch { case math.IsNaN(x): return math.IsNaN(y) case math.IsInf(x, 1): return math.IsInf(y, 1) case math.IsInf(x, -1): return math.IsInf(y, -1) default: return x == y } } // isComplexEq determines if x and y represent the same value. func isComplexEq(x complex128, y complex128) bool { return isFloatEq(real(x), real(y)) && isFloatEq(imag(x), imag(y)) } // decodeObject decodes a object value. func (ds *decodeState) decodeObject(ods *objectDecodeState, obj reflect.Value, encoded wire.Object) { switch x := encoded.(type) { case wire.Nil: // Fast path: first. // We leave obj alone here. That's because if obj represents an // interface, it may have been imbued with type information in // decodeInterface, and we don't want to destroy that. case *wire.Ref: // Nil pointers may be encoded in a "forceValue" context. For // those we just leave it alone as the value will already be // correct (nil). if id := objectID(x.Root); id == 0 { return } // Note that if this is a map type, we go through a level of // indirection to allow for map aliasing. if obj.Kind() == reflect.Map { v := ds.register(x, obj.Type()) if v.IsNil() { // Note that we don't want to clobber the map // if has already been decoded by decodeMap. We // just make it so that we have a consistent // reference when that eventually does happen. v.Set(reflect.MakeMap(v.Type())) } obj.Set(v) return } // Normal assignment: authoritative only if no dots. v := ds.register(x, obj.Type().Elem()) obj.Set(reflectValueRWAddr(v)) case wire.Bool: obj.SetBool(bool(x)) case wire.Int: obj.SetInt(int64(x)) if obj.Int() != int64(x) { Failf("signed integer truncated from %v to %v", int64(x), obj.Int()) } case wire.Uint: obj.SetUint(uint64(x)) if obj.Uint() != uint64(x) { Failf("unsigned integer truncated from %v to %v", uint64(x), obj.Uint()) } case wire.Float32: obj.SetFloat(float64(x)) case wire.Float64: obj.SetFloat(float64(x)) if !isFloatEq(obj.Float(), float64(x)) { Failf("floating point number truncated from %v to %v", float64(x), obj.Float()) } case *wire.Complex64: obj.SetComplex(complex128(*x)) case *wire.Complex128: obj.SetComplex(complex128(*x)) if !isComplexEq(obj.Complex(), complex128(*x)) { Failf("complex number truncated from %v to %v", complex128(*x), obj.Complex()) } case *wire.String: obj.SetString(string(*x)) case *wire.Slice: // See *wire.Ref above; same applies. if id := objectID(x.Ref.Root); id == 0 { return } // Note that it's fine to slice the array here and assume that // contents will still be filled in later on. typ := reflect.ArrayOf(int(x.Capacity), obj.Type().Elem()) // The object type. v := ds.register(&x.Ref, typ) obj.Set(reflectValueRWSlice3(v, 0, int(x.Length), int(x.Capacity))) case *wire.Array: ds.decodeArray(ods, obj, x) case *wire.Struct: ds.decodeStruct(ods, obj, x) case *wire.Map: ds.decodeMap(ods, obj, x) case *wire.Interface: ds.decodeInterface(ods, obj, x) default: // Should not happen, not propagated as an error. Failf("unknown object %#v for %q", encoded, obj.Type().Name()) } } // Load deserializes the object graph rooted at obj. // // This function may panic and should be run in safely(). func (ds *decodeState) Load(obj reflect.Value) { ds.stats.init() defer ds.stats.fini(func(id typeID) string { return ds.types.LookupName(id) }) // Create the root object. 
rootOds := &objectDecodeState{ id: 1, obj: obj, } ds.objectsByID = append(ds.objectsByID, rootOds) ds.pending.PushBack(rootOds) // Read the number of objects. numObjects, object, err := ReadHeader(ds.r) if err != nil { Failf("header error: %w", err) } if !object { Failf("object missing") } // Decode all objects. var ( encoded wire.Object ods *objectDecodeState id objectID tid = typeID(1) ) if err := safely(func() { // Decode all objects in the stream. // // Note that the structure of this decoding loop should match the raw // decoding loop in state/pretty/pretty.printer.printStream(). for i := uint64(0); i < numObjects; { // Unmarshal either a type object or object ID. encoded = wire.Load(ds.r) switch we := encoded.(type) { case *wire.Type: ds.types.Register(we) tid++ encoded = nil continue case wire.Uint: id = objectID(we) i++ // Unmarshal and resolve the actual object. encoded = wire.Load(ds.r) ods = ds.lookup(id) if ods != nil { // Decode the object. ds.decodeObject(ods, ods.obj, encoded) } else { // If an object hasn't had interest registered // previously or isn't yet valid, we deferred // decoding until interest is registered. ds.deferred[id] = encoded } // For error handling. ods = nil encoded = nil default: Failf("wanted type or object ID, got %T", encoded) } } }); err != nil { // Include as much information as we can, taking into account // the possible state transitions above. if ods != nil { Failf("error decoding object ID %d (%T) from %#v: %w", id, ods.obj.Interface(), encoded, err) } else if encoded != nil { Failf("error decoding from %#v: %w", encoded, err) } else { Failf("general decoding error: %w", err) } } // Check if we have any deferred objects. numDeferred := 0 for id, encoded := range ds.deferred { numDeferred++ if s, ok := encoded.(*wire.Struct); ok && s.TypeID != 0 { typ := ds.types.LookupType(typeID(s.TypeID)) Failf("unused deferred object: ID %d, type %v", id, typ) } else { Failf("unused deferred object: ID %d, %#v", id, encoded) } } if numDeferred != 0 { Failf("still had %d deferred objects", numDeferred) } // Scan and fire all callbacks. We iterate over the list of incomplete // objects until all have been finished. We stop iterating if no // objects become complete (there is a dependency cycle). // // Note that we iterate backwards here, because there will be a strong // tendendcy for blocking relationships to go from earlier objects to // later (deeper) objects in the graph. This will reduce the number of // iterations required to finish all objects. if err := safely(func() { for ds.pending.Back() != nil { thisCycle := false for ods = ds.pending.Back(); ods != nil; { if ds.checkComplete(ods) { thisCycle = true break } ods = ods.Prev() } if !thisCycle { break } } }); err != nil { Failf("error executing callbacks: %w\nfor object %#v", err, ods.obj.Interface()) } // Check if we have any remaining dependency cycles. If there are any // objects left in the pending list, then it must be due to a cycle. if ods := ds.pending.Front(); ods != nil { // This must be the result of a dependency cycle. cycle := ods.findCycle() var buf bytes.Buffer buf.WriteString("dependency cycle: {") for i, cycleOS := range cycle { if i > 0 { buf.WriteString(" => ") } fmt.Fprintf(&buf, "%q", cycleOS.obj.Type()) } buf.WriteString("}") Failf("incomplete graph: %s", string(buf.Bytes())) } } // ReadHeader reads an object header. // // Each object written to the statefile is prefixed with a header. 
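// The header is a single wire.Uint: its top bit (objectFlag, defined in
// encode.go) marks whether the length counts objects rather than raw bytes,
// and the remaining bits carry the length itself. A hedged sketch of skipping
// a raw (non-object) region from a client package, mirroring what the pretty
// printer does (r is assumed to be the statefile reader):
//
//	length, object, err := state.ReadHeader(r)
//	if err == nil && !object {
//		_, err = io.CopyN(io.Discard, r, int64(length)) // Skip raw bytes.
//	}
//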
See // WriteHeader for more information; these functions are exported to allow // non-state writes to the file to play nice with debugging tools. func ReadHeader(r io.Reader) (length uint64, object bool, err error) { // Read the header. err = safely(func() { length = wire.LoadUint(r) }) if err != nil { // On the header, pass raw I/O errors. if sErr, ok := err.(*ErrState); ok { return 0, false, sErr.Unwrap() } } // Decode whether the object is valid. object = length&objectFlag != 0 length &^= objectFlag return } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/decode_unsafe.go000066400000000000000000000047101465435605700236610ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package state import ( "fmt" "reflect" "runtime" "unsafe" ) // reflectValueRWAddr is equivalent to obj.Addr(), except that the returned // reflect.Value is usable in assignments even if obj was obtained by the use // of unexported struct fields. // // Preconditions: obj.CanAddr(). func reflectValueRWAddr(obj reflect.Value) reflect.Value { return reflect.NewAt(obj.Type(), unsafe.Pointer(obj.UnsafeAddr())) } // reflectValueRWSlice3 is equivalent to arr.Slice3(i, j, k), except that the // returned reflect.Value is usable in assignments even if obj was obtained by // the use of unexported struct fields. // // Preconditions: // - arr.Kind() == reflect.Array. // - i, j, k >= 0. // - i <= j <= k <= arr.Len(). func reflectValueRWSlice3(arr reflect.Value, i, j, k int) reflect.Value { if arr.Kind() != reflect.Array { panic(fmt.Sprintf("arr has kind %v, wanted %v", arr.Kind(), reflect.Array)) } if i < 0 || j < 0 || k < 0 { panic(fmt.Sprintf("negative subscripts (%d, %d, %d)", i, j, k)) } if i > j { panic(fmt.Sprintf("subscript i (%d) > j (%d)", i, j)) } if j > k { panic(fmt.Sprintf("subscript j (%d) > k (%d)", j, k)) } if k > arr.Len() { panic(fmt.Sprintf("subscript k (%d) > array length (%d)", k, arr.Len())) } sliceTyp := reflect.SliceOf(arr.Type().Elem()) if i == arr.Len() { // By precondition, i == j == k == arr.Len(). return reflect.MakeSlice(sliceTyp, 0, 0) } slh := reflect.SliceHeader{ // reflect.Value.CanAddr() == false for arrays, so we need to get the // address from the first element of the array. Data: arr.Index(i).UnsafeAddr(), Len: j - i, Cap: k - i, } slobj := reflect.NewAt(sliceTyp, unsafe.Pointer(&slh)).Elem() // Before slobj is constructed, arr holds the only pointer-typed pointer to // the array since reflect.SliceHeader.Data is a uintptr, so arr must be // kept alive. runtime.KeepAlive(arr) return slobj } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/deferred_list.go000066400000000000000000000124161465435605700237120ustar00rootroot00000000000000package state // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. 
An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type deferredElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (deferredElementMapper) linkerFor(elem *objectEncodeState) *objectEncodeState { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type deferredList struct { head *objectEncodeState tail *objectEncodeState } // Reset resets list l to the empty state. func (l *deferredList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *deferredList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *deferredList) Front() *objectEncodeState { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *deferredList) Back() *objectEncodeState { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *deferredList) Len() (count int) { for e := l.Front(); e != nil; e = (deferredElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *deferredList) PushFront(e *objectEncodeState) { linker := deferredElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { deferredElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *deferredList) PushFrontList(m *deferredList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { deferredElementMapper{}.linkerFor(l.head).SetPrev(m.tail) deferredElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *deferredList) PushBack(e *objectEncodeState) { linker := deferredElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { deferredElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *deferredList) PushBackList(m *deferredList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { deferredElementMapper{}.linkerFor(l.tail).SetNext(m.head) deferredElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *deferredList) InsertAfter(b, e *objectEncodeState) { bLinker := deferredElementMapper{}.linkerFor(b) eLinker := deferredElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { deferredElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. 
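// A hedged usage sketch of the intrusive list (first and second stand in for
// *objectEncodeState values, which embed deferredEntry and so already carry
// the links; no allocation happens on insertion or removal):
//
//	var l deferredList
//	l.PushBack(first)
//	l.InsertBefore(first, second) // second now precedes first.
//	l.Remove(second)              // O(1): unlinks second in place.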
// //go:nosplit func (l *deferredList) InsertBefore(a, e *objectEncodeState) { aLinker := deferredElementMapper{}.linkerFor(a) eLinker := deferredElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { deferredElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *deferredList) Remove(e *objectEncodeState) { linker := deferredElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { deferredElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { deferredElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type deferredEntry struct { next *objectEncodeState prev *objectEncodeState } // Next returns the entry that follows e in the list. // //go:nosplit func (e *deferredEntry) Next() *objectEncodeState { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *deferredEntry) Prev() *objectEncodeState { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *deferredEntry) SetNext(elem *objectEncodeState) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *deferredEntry) SetPrev(elem *objectEncodeState) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/encode.go000066400000000000000000000647341465435605700223460ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package state import ( "context" "io" "reflect" "sort" "gvisor.dev/gvisor/pkg/state/wire" ) // objectEncodeState the type and identity of an object occupying a memory // address range. This is the value type for addrSet, and the intrusive entry // for the deferred list. type objectEncodeState struct { // id is the assigned ID for this object. id objectID // obj is the object value. Note that this may be replaced if we // encounter an object that contains this object. When this happens (in // resolve), we will update existing references appropriately, below, // and defer a re-encoding of the object. obj reflect.Value // encoded is the encoded value of this object. Note that this may not // be up to date if this object is still in the deferred list. encoded wire.Object // how indicates whether this object should be encoded as a value. This // is used only for deferred encoding. how encodeStrategy // refs are the list of reference objects used by other objects // referring to this object. When the object is updated, these // references may be updated directly and automatically. 
refs []*wire.Ref deferredEntry } // encodeState is state used for encoding. // // The encoding process constructs a representation of the in-memory graph of // objects before a single object is serialized. This is done to ensure that // all references can be fully disambiguated. See resolve for more details. type encodeState struct { // ctx is the encode context. ctx context.Context // w is the output stream. w io.Writer // types is the type database. types typeEncodeDatabase // lastID is the last allocated object ID. lastID objectID // values tracks the address ranges occupied by objects, along with the // types of these objects. This is used to locate pointer targets, // including pointers to fields within another type. // // Multiple objects may overlap in memory iff the larger object fully // contains the smaller one, and the type of the smaller object matches // a field or array element's type at the appropriate offset. An // arbitrary number of objects may be nested in this manner. // // Note that this does not track zero-sized objects, those are tracked // by zeroValues below. values addrSet // zeroValues tracks zero-sized objects. zeroValues map[reflect.Type]*objectEncodeState // deferred is the list of objects to be encoded. deferred deferredList // pendingTypes is the list of types to be serialized. Serialization // will occur when all objects have been encoded, but before pending is // serialized. pendingTypes []wire.Type // pending maps object IDs to objects to be serialized. Serialization does // not actually occur until the full object graph is computed. pending map[objectID]*objectEncodeState // encodedStructs maps reflect.Values representing structs to previous // encodings of those structs. This is necessary to avoid duplicate calls // to SaverLoader.StateSave() that may result in multiple calls to // Sink.SaveValue() for a given field, resulting in object duplication. encodedStructs map[reflect.Value]*wire.Struct // stats tracks time data. stats Stats } // isSameSizeParent returns true if child is a field value or element within // parent. Only a struct or array can have a child value. // // isSameSizeParent deals with objects like this: // // struct child { // // fields.. // } // // struct parent { // c child // } // // var p parent // record(&p.c) // // Here, &p and &p.c occupy the exact same address range. // // Or like this: // // struct child { // // fields // } // // var arr [1]parent // record(&arr[0]) // // Similarly, &arr[0] and &arr[0].c have the exact same address range. // // Precondition: parent and child must occupy the same memory. func isSameSizeParent(parent reflect.Value, childType reflect.Type) bool { switch parent.Kind() { case reflect.Struct: for i := 0; i < parent.NumField(); i++ { field := parent.Field(i) if field.Type() == childType { return true } // Recurse through any intermediate types. if isSameSizeParent(field, childType) { return true } // Does it make sense to keep going if the first field // doesn't match? Yes, because there might be an // arbitrary number of zero-sized fields before we get // a match, and childType itself can be zero-sized. } return false case reflect.Array: // The only case where an array with more than one elements can // return true is if childType is zero-sized. In such cases, // it's ambiguous which element contains the match since a // zero-sized child object fully fits in any of the zero-sized // elements in an array... However since all elements are of // the same type, we only need to check one element. 
// // For non-zero-sized childTypes, parent.Len() must be 1, but a // combination of the precondition and an implicit comparison // between the array element size and childType ensures this. return parent.Len() > 0 && isSameSizeParent(parent.Index(0), childType) default: return false } } // nextID returns the next valid ID. func (es *encodeState) nextID() objectID { es.lastID++ return objectID(es.lastID) } // dummyAddr points to the dummy zero-sized address. var dummyAddr = reflect.ValueOf(new(struct{})).Pointer() // resolve records the address range occupied by an object. func (es *encodeState) resolve(obj reflect.Value, ref *wire.Ref) { addr := obj.Pointer() // Is this a map pointer? Just record the single address. It is not // possible to take any pointers into the map internals. if obj.Kind() == reflect.Map { if addr == 0 { // Just leave the nil reference alone. This is fine, we // may need to encode as a reference in this way. We // return nil for our objectEncodeState so that anyone // depending on this value knows there's nothing there. return } seg, gap := es.values.Find(addr) if seg.Ok() { // Ensure the map types match. existing := seg.Value() if existing.obj.Type() != obj.Type() { Failf("overlapping map objects at 0x%x: [new object] %#v [existing object type] %s", addr, obj, existing.obj) } // No sense recording refs, maps may not be replaced by // covering objects, they are maximal. ref.Root = wire.Uint(existing.id) return } // Record the map. r := addrRange{addr, addr + 1} oes := &objectEncodeState{ id: es.nextID(), obj: obj, how: encodeMapAsValue, } // Use Insert instead of InsertWithoutMergingUnchecked when race // detection is enabled to get additional sanity-checking from Merge. if !raceEnabled { es.values.InsertWithoutMergingUnchecked(gap, r, oes) } else { es.values.Insert(gap, r, oes) } es.pending[oes.id] = oes es.deferred.PushBack(oes) // See above: no ref recording. ref.Root = wire.Uint(oes.id) return } // If not a map, then the object must be a pointer. if obj.Kind() != reflect.Ptr { Failf("attempt to record non-map and non-pointer object %#v", obj) } obj = obj.Elem() // Value from here. // Is this a zero-sized type? typ := obj.Type() size := typ.Size() if size == 0 { if addr == dummyAddr { // Zero-sized objects point to a dummy byte within the // runtime. There's no sense recording this in the // address map. We add this to the dedicated // zeroValues. // // Note that zero-sized objects must be *true* // zero-sized objects. They cannot be part of some // larger object. In that case, they are assigned a // 1-byte address at the end of the object. oes, ok := es.zeroValues[typ] if !ok { oes = &objectEncodeState{ id: es.nextID(), obj: obj, } es.zeroValues[typ] = oes es.pending[oes.id] = oes es.deferred.PushBack(oes) } // There's also no sense tracking back references. We // know that this is a true zero-sized object, and not // part of a larger container, so it will not change. ref.Root = wire.Uint(oes.id) return } size = 1 // See above. } end := addr + size r := addrRange{addr, end} seg := es.values.LowerBoundSegment(addr) var ( oes *objectEncodeState gap addrGapIterator ) // Does at least one previously-registered object overlap this one? if seg.Ok() && seg.Start() < end { existing := seg.Value() if seg.Range() == r && typ == existing.obj.Type() { // This exact object is already registered. Avoid the traversal and // just return directly. We don't need to encode the type // information or any dots here. 
ref.Root = wire.Uint(existing.id) existing.refs = append(existing.refs, ref) return } if seg.Range().IsSupersetOf(r) && (seg.Range() != r || isSameSizeParent(existing.obj, typ)) { // This object is contained within a previously-registered object. // Perform traversal from the container to the new object. ref.Root = wire.Uint(existing.id) ref.Dots = traverse(existing.obj.Type(), typ, seg.Start(), addr) ref.Type = es.findType(existing.obj.Type()) existing.refs = append(existing.refs, ref) return } // This object contains one or more previously-registered objects. // Remove them and update existing references to use the new one. oes := &objectEncodeState{ // Reuse the root ID of the first contained element. id: existing.id, obj: obj, } type elementEncodeState struct { addr uintptr typ reflect.Type refs []*wire.Ref } var ( elems []elementEncodeState gap addrGapIterator ) for { // Each contained object should be completely contained within // this one. if raceEnabled && !r.IsSupersetOf(seg.Range()) { Failf("containing object %#v does not contain existing object %#v", obj, existing.obj) } elems = append(elems, elementEncodeState{ addr: seg.Start(), typ: existing.obj.Type(), refs: existing.refs, }) delete(es.pending, existing.id) es.deferred.Remove(existing) gap = es.values.Remove(seg) seg = gap.NextSegment() if !seg.Ok() || seg.Start() >= end { break } existing = seg.Value() } wt := es.findType(typ) for _, elem := range elems { dots := traverse(typ, elem.typ, addr, elem.addr) for _, ref := range elem.refs { ref.Root = wire.Uint(oes.id) ref.Dots = append(ref.Dots, dots...) ref.Type = wt } oes.refs = append(oes.refs, elem.refs...) } // Finally register the new containing object. if !raceEnabled { es.values.InsertWithoutMergingUnchecked(gap, r, oes) } else { es.values.Insert(gap, r, oes) } es.pending[oes.id] = oes es.deferred.PushBack(oes) ref.Root = wire.Uint(oes.id) oes.refs = append(oes.refs, ref) return } // No existing object overlaps this one. Register a new object. oes = &objectEncodeState{ id: es.nextID(), obj: obj, } if seg.Ok() { gap = seg.PrevGap() } else { gap = es.values.LastGap() } if !raceEnabled { es.values.InsertWithoutMergingUnchecked(gap, r, oes) } else { es.values.Insert(gap, r, oes) } es.pending[oes.id] = oes es.deferred.PushBack(oes) ref.Root = wire.Uint(oes.id) oes.refs = append(oes.refs, ref) } // traverse searches for a target object within a root object, where the target // object is a struct field or array element within root, with potentially // multiple intervening types. traverse returns the set of field or element // traversals required to reach the target. // // Note that for efficiency, traverse returns the dots in the reverse order. // That is, the first traversal required will be the last element of the list. // // Precondition: The target object must lie completely within the range defined // by [rootAddr, rootAddr + sizeof(rootType)]. func traverse(rootType, targetType reflect.Type, rootAddr, targetAddr uintptr) []wire.Dot { // Recursion base case: the types actually match. if targetType == rootType && targetAddr == rootAddr { return nil } switch rootType.Kind() { case reflect.Struct: offset := targetAddr - rootAddr for i := rootType.NumField(); i > 0; i-- { field := rootType.Field(i - 1) // The first field from the end with an offset that is // smaller than or equal to our address offset is where // the target is located. Traverse from there. 
if field.Offset <= offset { dots := traverse(field.Type, targetType, rootAddr+field.Offset, targetAddr) fieldName := wire.FieldName(field.Name) return append(dots, &fieldName) } } // Should never happen; the target should be reachable. Failf("no field in root type %v contains target type %v", rootType, targetType) case reflect.Array: // Since arrays have homogeneous types, all elements have the // same size and we can compute where the target lives. This // does not matter for the purpose of typing, but matters for // the purpose of computing the address of the given index. elemSize := int(rootType.Elem().Size()) n := int(targetAddr-rootAddr) / elemSize // Relies on integer division rounding down. if rootType.Len() < n { Failf("traversal target of type %v @%x is beyond the end of the array type %v @%x with %v elements", targetType, targetAddr, rootType, rootAddr, rootType.Len()) } dots := traverse(rootType.Elem(), targetType, rootAddr+uintptr(n*elemSize), targetAddr) return append(dots, wire.Index(n)) default: // For any other type, there's no possibility of aliasing so if // the types didn't match earlier then we have an address // collision which shouldn't be possible at this point. Failf("traverse failed for root type %v and target type %v", rootType, targetType) } panic("unreachable") } // encodeMap encodes a map. func (es *encodeState) encodeMap(obj reflect.Value, dest *wire.Object) { if obj.IsNil() { // Because there is a difference between a nil map and an empty // map, we need to not decode in the case of a truly nil map. *dest = wire.Nil{} return } l := obj.Len() m := &wire.Map{ Keys: make([]wire.Object, l), Values: make([]wire.Object, l), } *dest = m for i, k := range obj.MapKeys() { v := obj.MapIndex(k) // Map keys must be encoded using the full value because the // type will be omitted after the first key. es.encodeObject(k, encodeAsValue, &m.Keys[i]) es.encodeObject(v, encodeAsValue, &m.Values[i]) } } // objectEncoder is for encoding structs. type objectEncoder struct { // es is encodeState. es *encodeState // encoded is the encoded struct. encoded *wire.Struct } // save is called by the public methods on Sink. func (oe *objectEncoder) save(slot int, obj reflect.Value) { fieldValue := oe.encoded.Field(slot) oe.es.encodeObject(obj, encodeDefault, fieldValue) } // encodeStruct encodes a composite object. func (es *encodeState) encodeStruct(obj reflect.Value, dest *wire.Object) { if s, ok := es.encodedStructs[obj]; ok { *dest = s return } s := &wire.Struct{} *dest = s es.encodedStructs[obj] = s // Ensure that the obj is addressable. There are two cases when it is // not. First, is when this is dispatched via SaveValue. Second, when // this is a map key as a struct. Either way, we need to make a copy to // obtain an addressable value. if !obj.CanAddr() { localObj := reflect.New(obj.Type()) localObj.Elem().Set(obj) obj = localObj.Elem() } // Look the type up in the database. te, ok := es.types.Lookup(obj.Type()) if te == nil { if obj.NumField() == 0 { // Allow unregistered anonymous, empty structs. This // will just return success without ever invoking the // passed function. This uses the immutable EmptyStruct // variable to prevent an allocation in this case. // // Note that this mechanism does *not* work for // interfaces in general. So you can't dispatch // non-registered empty structs via interfaces because // then they can't be restored. s.Alloc(0) return } // We need a SaverLoader for struct types. 
Failf("struct %T does not implement SaverLoader", obj.Interface()) } if !ok { // Queue the type to be serialized. es.pendingTypes = append(es.pendingTypes, te.Type) } // Invoke the provided saver. s.TypeID = wire.TypeID(te.ID) s.Alloc(len(te.Fields)) oe := objectEncoder{ es: es, encoded: s, } es.stats.start(te.ID) defer es.stats.done() if sl, ok := obj.Addr().Interface().(SaverLoader); ok { // Note: may be a registered empty struct which does not // implement the saver/loader interfaces. sl.StateSave(Sink{internal: oe}) } } // encodeArray encodes an array. func (es *encodeState) encodeArray(obj reflect.Value, dest *wire.Object) { l := obj.Len() a := &wire.Array{ Contents: make([]wire.Object, l), } *dest = a for i := 0; i < l; i++ { // We need to encode the full value because arrays are encoded // using the type information from only the first element. es.encodeObject(obj.Index(i), encodeAsValue, &a.Contents[i]) } } // findType recursively finds type information. func (es *encodeState) findType(typ reflect.Type) wire.TypeSpec { // First: check if this is a proper type. It's possible for pointers, // slices, arrays, maps, etc to all have some different type. te, ok := es.types.Lookup(typ) if te != nil { if !ok { // See encodeStruct. es.pendingTypes = append(es.pendingTypes, te.Type) } return wire.TypeID(te.ID) } switch typ.Kind() { case reflect.Ptr: return &wire.TypeSpecPointer{ Type: es.findType(typ.Elem()), } case reflect.Slice: return &wire.TypeSpecSlice{ Type: es.findType(typ.Elem()), } case reflect.Array: return &wire.TypeSpecArray{ Count: wire.Uint(typ.Len()), Type: es.findType(typ.Elem()), } case reflect.Map: return &wire.TypeSpecMap{ Key: es.findType(typ.Key()), Value: es.findType(typ.Elem()), } default: // After potentially chasing many pointers, the // ultimate type of the object is not known. Failf("type %q is not known", typ) } panic("unreachable") } // encodeInterface encodes an interface. func (es *encodeState) encodeInterface(obj reflect.Value, dest *wire.Object) { // Dereference the object. obj = obj.Elem() if !obj.IsValid() { // Special case: the nil object. *dest = &wire.Interface{ Type: wire.TypeSpecNil{}, Value: wire.Nil{}, } return } // Encode underlying object. i := &wire.Interface{ Type: es.findType(obj.Type()), } *dest = i es.encodeObject(obj, encodeAsValue, &i.Value) } // isPrimitive returns true if this is a primitive object, or a composite // object composed entirely of primitives. func isPrimitiveZero(typ reflect.Type) bool { switch typ.Kind() { case reflect.Ptr: // Pointers are always treated as primitive types because we // won't encode directly from here. Returning true here won't // prevent the object from being encoded correctly. return true case reflect.Bool: return true case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: return true case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: return true case reflect.Float32, reflect.Float64: return true case reflect.Complex64, reflect.Complex128: return true case reflect.String: return true case reflect.Slice: // The slice itself a primitive, but not necessarily the array // that points to. This is similar to a pointer. return true case reflect.Array: // We cannot treat an array as a primitive, because it may be // composed of structures or other things with side-effects. return isPrimitiveZero(typ.Elem()) case reflect.Interface: // Since we now that this type is the zero type, the interface // value must be zero. Therefore this is primitive. 
return true case reflect.Struct: return false case reflect.Map: // The isPrimitiveZero function is called only on zero-types to // see if it's safe to serialize. Since a zero map has no // elements, it is safe to treat as a primitive. return true default: Failf("unknown type %q", typ.Name()) } panic("unreachable") } // encodeStrategy is the strategy used for encodeObject. type encodeStrategy int const ( // encodeDefault means types are encoded normally as references. encodeDefault encodeStrategy = iota // encodeAsValue means that types will never take short-circuited and // will always be encoded as a normal value. encodeAsValue // encodeMapAsValue means that even maps will be fully encoded. encodeMapAsValue ) // encodeObject encodes an object. func (es *encodeState) encodeObject(obj reflect.Value, how encodeStrategy, dest *wire.Object) { if how == encodeDefault && isPrimitiveZero(obj.Type()) && obj.IsZero() { *dest = wire.Nil{} return } switch obj.Kind() { case reflect.Ptr: // Fast path: first. r := new(wire.Ref) *dest = r if obj.IsNil() { // May be in an array or elsewhere such that a value is // required. So we encode as a reference to the zero // object, which does not exist. Note that this has to // be handled correctly in the decode path as well. return } es.resolve(obj, r) case reflect.Bool: *dest = wire.Bool(obj.Bool()) case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: *dest = wire.Int(obj.Int()) case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: *dest = wire.Uint(obj.Uint()) case reflect.Float32: *dest = wire.Float32(obj.Float()) case reflect.Float64: *dest = wire.Float64(obj.Float()) case reflect.Complex64: c := wire.Complex64(obj.Complex()) *dest = &c // Needs alloc. case reflect.Complex128: c := wire.Complex128(obj.Complex()) *dest = &c // Needs alloc. case reflect.String: s := wire.String(obj.String()) *dest = &s // Needs alloc. case reflect.Array: es.encodeArray(obj, dest) case reflect.Slice: s := &wire.Slice{ Capacity: wire.Uint(obj.Cap()), Length: wire.Uint(obj.Len()), } *dest = s // Note that we do need to provide a wire.Slice type here as // how is not encodeDefault. If this were the case, then it // would have been caught by the IsZero check above and we // would have just used wire.Nil{}. if obj.IsNil() { return } // Slices need pointer resolution. es.resolve(arrayFromSlice(obj), &s.Ref) case reflect.Interface: es.encodeInterface(obj, dest) case reflect.Struct: es.encodeStruct(obj, dest) case reflect.Map: if how == encodeMapAsValue { es.encodeMap(obj, dest) return } r := new(wire.Ref) *dest = r es.resolve(obj, r) default: Failf("unknown object %#v", obj.Interface()) panic("unreachable") } } // Save serializes the object graph rooted at obj. func (es *encodeState) Save(obj reflect.Value) { es.stats.init() defer es.stats.fini(func(id typeID) string { return es.pendingTypes[id-1].Name }) // Resolve the first object, which should queue a pile of additional // objects on the pending list. All queued objects should be fully // resolved, and we should be able to serialize after this call. var root wire.Ref es.resolve(obj.Addr(), &root) // Encode the graph. var oes *objectEncodeState if err := safely(func() { for oes = es.deferred.Front(); oes != nil; oes = es.deferred.Front() { // Remove and encode the object. Note that as a result // of this encoding, the object may be enqueued on the // deferred list yet again. That's expected, and why it // is removed first. 
es.deferred.Remove(oes) es.encodeObject(oes.obj, oes.how, &oes.encoded) } }); err != nil { // Include the object in the error message. Failf("encoding error: %w\nfor object %#v", err, oes.obj.Interface()) } // Check that we have objects to serialize. if len(es.pending) == 0 { Failf("pending is empty?") } // Write the header with the number of objects. if err := WriteHeader(es.w, uint64(len(es.pending)), true); err != nil { Failf("error writing header: %w", err) } // Serialize all pending types and pending objects. Note that we don't // bother removing from this list as we walk it because that just // wastes time. It will not change after this point. if err := safely(func() { for _, wt := range es.pendingTypes { // Encode the type. wire.Save(es.w, &wt) } // Emit objects in ID order. ids := make([]objectID, 0, len(es.pending)) for id := range es.pending { ids = append(ids, id) } sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] }) for _, id := range ids { // Encode the id. wire.Save(es.w, wire.Uint(id)) // Marshal the object. oes := es.pending[id] wire.Save(es.w, oes.encoded) } }); err != nil { // Include the object and the error. Failf("error serializing object %#v: %w", oes.encoded, err) } } // objectFlag indicates that the length is a # of objects, rather than a raw // byte length. When this is set on a length header in the stream, it may be // decoded appropriately. const objectFlag uint64 = 1 << 63 // WriteHeader writes a header. // // Each object written to the statefile should be prefixed with a header. In // order to generate statefiles that play nicely with debugging tools, raw // writes should be prefixed with a header with object set to false and the // appropriate length. This will allow tools to skip these regions. func WriteHeader(w io.Writer, length uint64, object bool) error { // Sanity check the length. if length&objectFlag != 0 { Failf("impossibly huge length: %d", length) } if object { length |= objectFlag } // Write a header. return safely(func() { wire.SaveUint(w, length) }) } // addrSetFunctions is used by addrSet. type addrSetFunctions struct{} func (addrSetFunctions) MinKey() uintptr { return 0 } func (addrSetFunctions) MaxKey() uintptr { return ^uintptr(0) } func (addrSetFunctions) ClearValue(val **objectEncodeState) { *val = nil } func (addrSetFunctions) Merge(r1 addrRange, val1 *objectEncodeState, r2 addrRange, val2 *objectEncodeState) (*objectEncodeState, bool) { if val1.obj == val2.obj { // This, should never happen. It would indicate that the same // object exists in two non-contiguous address ranges. Note // that this assertion can only be triggered if the race // detector is enabled. Failf("unexpected merge in addrSet @ %v and %v: %#v and %#v", r1, r2, val1.obj, val2.obj) } // Reject the merge. return val1, false } func (addrSetFunctions) Split(r addrRange, val *objectEncodeState, _ uintptr) (*objectEncodeState, *objectEncodeState) { // A split should never happen: we don't remove ranges. Failf("unexpected split in addrSet @ %v: %#v", r, val.obj) panic("unreachable") } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/encode_unsafe.go000066400000000000000000000017261465435605700236770ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package state import ( "reflect" "unsafe" ) // arrayFromSlice constructs a new pointer to the slice data. // // It would be similar to the following: // // x := make([]Foo, l, c) // a := ([l]Foo*)(unsafe.Pointer(x[0])) func arrayFromSlice(obj reflect.Value) reflect.Value { return reflect.NewAt( reflect.ArrayOf(obj.Cap(), obj.Type().Elem()), unsafe.Pointer(obj.Pointer())) } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/pretty/000077500000000000000000000000001465435605700220735ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/state/pretty/pretty.go000066400000000000000000000213101465435605700237460ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package pretty is a pretty-printer for state streams. package pretty import ( "fmt" "io" "io/ioutil" "reflect" "strings" "gvisor.dev/gvisor/pkg/state" "gvisor.dev/gvisor/pkg/state/wire" ) type printer struct { html bool typeSpecs map[string]*wire.Type } func (p *printer) formatRef(x *wire.Ref, graph uint64) string { baseRef := fmt.Sprintf("g%dr%d", graph, x.Root) fullRef := baseRef if len(x.Dots) > 0 { // See wire.Ref; Type valid if Dots non-zero. typ, _ := p.formatType(x.Type, graph) var buf strings.Builder buf.WriteString("(*") buf.WriteString(typ) buf.WriteString(")(") buf.WriteString(baseRef) buf.WriteString(")") for _, component := range x.Dots { switch v := component.(type) { case *wire.FieldName: buf.WriteString(".") buf.WriteString(string(*v)) case wire.Index: buf.WriteString(fmt.Sprintf("[%d]", v)) default: panic(fmt.Sprintf("unreachable: switch should be exhaustive, unhandled case %v", reflect.TypeOf(component))) } } fullRef = buf.String() } if p.html { return fmt.Sprintf("%s", baseRef, fullRef) } return fullRef } func (p *printer) formatType(t wire.TypeSpec, graph uint64) (string, bool) { switch x := t.(type) { case wire.TypeID: tag := fmt.Sprintf("g%dt%d", graph, x) desc := tag if spec, ok := p.typeSpecs[tag]; ok { desc += fmt.Sprintf("=%s", spec.Name) } else { desc += "!missing-type-spec" } if p.html { return fmt.Sprintf("%s", tag, desc), true } return desc, true case wire.TypeSpecNil: return "", false // Only nil type. 
case *wire.TypeSpecPointer: element, _ := p.formatType(x.Type, graph) return fmt.Sprintf("(*%s)", element), true case *wire.TypeSpecArray: element, _ := p.formatType(x.Type, graph) return fmt.Sprintf("[%d](%s)", x.Count, element), true case *wire.TypeSpecSlice: element, _ := p.formatType(x.Type, graph) return fmt.Sprintf("([]%s)", element), true case *wire.TypeSpecMap: key, _ := p.formatType(x.Key, graph) value, _ := p.formatType(x.Value, graph) return fmt.Sprintf("(map[%s]%s)", key, value), true default: panic(fmt.Sprintf("unreachable: unknown type %T", t)) } } // format formats a single object, for pretty-printing. It also returns whether // the value is a non-zero value. func (p *printer) format(graph uint64, depth int, encoded wire.Object) (string, bool) { switch x := encoded.(type) { case wire.Nil: return "nil", false case *wire.String: return fmt.Sprintf("%q", *x), *x != "" case *wire.Complex64: return fmt.Sprintf("%f+%fi", real(*x), imag(*x)), *x != 0.0 case *wire.Complex128: return fmt.Sprintf("%f+%fi", real(*x), imag(*x)), *x != 0.0 case *wire.Ref: return p.formatRef(x, graph), x.Root != 0 case *wire.Type: tabs := "\n" + strings.Repeat("\t", depth) items := make([]string, 0, len(x.Fields)+2) items = append(items, fmt.Sprintf("type %s {", x.Name)) for i := 0; i < len(x.Fields); i++ { items = append(items, fmt.Sprintf("\t%d: %s,", i, x.Fields[i])) } items = append(items, "}") return strings.Join(items, tabs), true // No zero value. case *wire.Slice: return fmt.Sprintf("%s{len:%d,cap:%d}", p.formatRef(&x.Ref, graph), x.Length, x.Capacity), x.Capacity != 0 case *wire.Array: if len(x.Contents) == 0 { return "[]", false } items := make([]string, 0, len(x.Contents)+2) zeros := make([]string, 0) // used to eliminate zero entries. items = append(items, "[") tabs := "\n" + strings.Repeat("\t", depth) for i := 0; i < len(x.Contents); i++ { item, ok := p.format(graph, depth+1, x.Contents[i]) if !ok { zeros = append(zeros, fmt.Sprintf("\t%s,", item)) continue } if len(zeros) > 0 { items = append(items, zeros...) zeros = nil } items = append(items, fmt.Sprintf("\t%s,", item)) } if len(zeros) > 0 { items = append(items, fmt.Sprintf("\t... 
(%d zeros),", len(zeros))) } items = append(items, "]") return strings.Join(items, tabs), len(zeros) < len(x.Contents) case *wire.Struct: tag := fmt.Sprintf("g%dt%d", graph, x.TypeID) spec, _ := p.typeSpecs[tag] typ, _ := p.formatType(x.TypeID, graph) if x.Fields() == 0 { return fmt.Sprintf("struct[%s]{}", typ), false } items := make([]string, 0, 2) items = append(items, fmt.Sprintf("struct[%s]{", typ)) tabs := "\n" + strings.Repeat("\t", depth) allZero := true for i := 0; i < x.Fields(); i++ { var name string if spec != nil && i < len(spec.Fields) { name = spec.Fields[i] } else { name = fmt.Sprintf("%d", i) } element, ok := p.format(graph, depth+1, *x.Field(i)) allZero = allZero && !ok items = append(items, fmt.Sprintf("\t%s: %s,", name, element)) } items = append(items, "}") return strings.Join(items, tabs), !allZero case *wire.Map: if len(x.Keys) == 0 { return "map{}", false } items := make([]string, 0, len(x.Keys)+2) items = append(items, "map{") tabs := "\n" + strings.Repeat("\t", depth) for i := 0; i < len(x.Keys); i++ { key, _ := p.format(graph, depth+1, x.Keys[i]) value, _ := p.format(graph, depth+1, x.Values[i]) items = append(items, fmt.Sprintf("\t%s: %s,", key, value)) } items = append(items, "}") return strings.Join(items, tabs), true case *wire.Interface: typ, typOk := p.formatType(x.Type, graph) element, elementOk := p.format(graph, depth+1, x.Value) return fmt.Sprintf("interface[%s]{%s}", typ, element), typOk || elementOk default: // Must be a primitive; use reflection. return fmt.Sprintf("%v", encoded), true } } // printStream is the basic print implementation. func (p *printer) printStream(w io.Writer, r io.Reader) (err error) { // current graph ID. var graph uint64 if p.html { fmt.Fprintf(w, "
")
		defer fmt.Fprintf(w, "
") } defer func() { if r := recover(); r != nil { if rErr, ok := r.(error); ok { err = rErr // Override return. return } panic(r) // Propagate. } }() p.typeSpecs = make(map[string]*wire.Type) for { // Find the first object to begin generation. length, object, err := state.ReadHeader(r) if err == io.EOF { // Nothing else to do. break } else if err != nil { return err } if !object { graph++ // Increment the graph. if length > 0 { fmt.Fprintf(w, "(%d bytes non-object data)\n", length) io.Copy(ioutil.Discard, &io.LimitedReader{ R: r, N: int64(length), }) } continue } // Read & unmarshal the object. // // Note that this loop must match the general structure of the // loop in decode.go. But we don't register type information, // etc. and just print the raw structures. type objectAndID struct { id uint64 obj wire.Object } var ( tid uint64 = 1 objects []objectAndID ) for i := uint64(0); i < length; { // Unmarshal either a type object or object ID. encoded := wire.Load(r) switch we := encoded.(type) { case *wire.Type: str, _ := p.format(graph, 0, encoded) tag := fmt.Sprintf("g%dt%d", graph, tid) p.typeSpecs[tag] = we if p.html { // See below. tag = fmt.Sprintf("%s", tag, tag, tag) } if _, err := fmt.Fprintf(w, "%s = %s\n", tag, str); err != nil { return err } tid++ case wire.Uint: // Unmarshal the actual object. objects = append(objects, objectAndID{ id: uint64(we), obj: wire.Load(r), }) i++ default: return fmt.Errorf("wanted type or object ID, got %#v", encoded) } } for _, objAndID := range objects { // Format the node. str, _ := p.format(graph, 0, objAndID.obj) tag := fmt.Sprintf("g%dr%d", graph, objAndID.id) if p.html { // Create a little tag with an anchor next to it for linking. tag = fmt.Sprintf("%s", tag, tag, tag) } if _, err := fmt.Fprintf(w, "%s = %s\n", tag, str); err != nil { return err } } } return nil } // PrintText reads the stream from r and prints text to w. func PrintText(w io.Writer, r io.Reader) error { return (&printer{}).printStream(w, r) } // PrintHTML reads the stream from r and prints html to w. func PrintHTML(w io.Writer, r io.Reader) error { return (&printer{html: true}).printStream(w, r) } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/pretty/pretty_state_autogen.go000066400000000000000000000000701465435605700266700ustar00rootroot00000000000000// automatically generated by stateify. package pretty golang-gvisor-gvisor-0.0~20240729.0/pkg/state/state.go000066400000000000000000000222211465435605700222120ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package state provides functionality related to saving and loading object // graphs. For most types, it provides a set of default saving / loading logic // that will be invoked automatically if custom logic is not defined. 
// // Kind Support // ---- ------- // Bool default // Int default // Int8 default // Int16 default // Int32 default // Int64 default // Uint default // Uint8 default // Uint16 default // Uint32 default // Uint64 default // Float32 default // Float64 default // Complex64 default // Complex128 default // Array default // Chan custom // Func custom // Interface default // Map default // Ptr default // Slice default // String default // Struct custom (*) Unless zero-sized. // UnsafePointer custom // // See README.md for an overview of how encoding and decoding works. package state import ( "context" "fmt" "io" "reflect" "runtime" "gvisor.dev/gvisor/pkg/state/wire" ) // objectID is a unique identifier assigned to each object to be serialized. // Each instance of an object is considered separately, i.e. if there are two // objects of the same type in the object graph being serialized, they'll be // assigned unique objectIDs. type objectID uint32 // typeID is the identifier for a type. Types are serialized and tracked // alongside objects in order to avoid the overhead of encoding field names in // all objects. type typeID uint32 // ErrState is returned when an error is encountered during encode/decode. type ErrState struct { // err is the underlying error. err error // trace is the stack trace. trace string } // Error returns a sensible description of the state error. func (e *ErrState) Error() string { return fmt.Sprintf("%v:\n%s", e.err, e.trace) } // Unwrap implements standard unwrapping. func (e *ErrState) Unwrap() error { return e.err } // Save saves the given object state. func Save(ctx context.Context, w io.Writer, rootPtr any) (Stats, error) { // Create the encoding state. es := encodeState{ ctx: ctx, w: w, types: makeTypeEncodeDatabase(), zeroValues: make(map[reflect.Type]*objectEncodeState), pending: make(map[objectID]*objectEncodeState), encodedStructs: make(map[reflect.Value]*wire.Struct), } // Perform the encoding. err := safely(func() { es.Save(reflect.ValueOf(rootPtr).Elem()) }) return es.stats, err } // Load loads a checkpoint. func Load(ctx context.Context, r io.Reader, rootPtr any) (Stats, error) { // Create the decoding state. ds := decodeState{ ctx: ctx, r: r, types: makeTypeDecodeDatabase(), deferred: make(map[objectID]wire.Object), } // Attempt our decode. err := safely(func() { ds.Load(reflect.ValueOf(rootPtr).Elem()) }) return ds.stats, err } // Sink is used for Type.StateSave. type Sink struct { internal objectEncoder } // Save adds the given object to the map. // // You should pass always pointers to the object you are saving. For example: // // type X struct { // A int // B *int // } // // func (x *X) StateTypeInfo(m Sink) state.TypeInfo { // return state.TypeInfo{ // Name: "pkg.X", // Fields: []string{ // "A", // "B", // }, // } // } // // func (x *X) StateSave(m Sink) { // m.Save(0, &x.A) // Field is A. // m.Save(1, &x.B) // Field is B. // } // // func (x *X) StateLoad(m Source) { // m.Load(0, &x.A) // Field is A. // m.Load(1, &x.B) // Field is B. // } func (s Sink) Save(slot int, objPtr any) { s.internal.save(slot, reflect.ValueOf(objPtr).Elem()) } // SaveValue adds the given object value to the map. // // This should be used for values where pointers are not available, or casts // are required during Save/Load. 
// // For example, if we want to cast external package type P.Foo to int64: // // func (x *X) StateSave(m Sink) { // m.SaveValue(0, "A", int64(x.A)) // } // // func (x *X) StateLoad(m Source) { // m.LoadValue(0, new(int64), func(x any) { // x.A = P.Foo(x.(int64)) // }) // } func (s Sink) SaveValue(slot int, obj any) { s.internal.save(slot, reflect.ValueOf(obj)) } // Context returns the context object provided at save time. func (s Sink) Context() context.Context { return s.internal.es.ctx } // Type is an interface that must be implemented by Struct objects. This allows // these objects to be serialized while minimizing runtime reflection required. // // All these methods can be automatically generated by the go_statify tool. type Type interface { // StateTypeName returns the type's name. // // This is used for matching type information during encoding and // decoding, as well as dynamic interface dispatch. This should be // globally unique. StateTypeName() string // StateFields returns information about the type. // // Fields is the set of fields for the object. Calls to Sink.Save and // Source.Load must be made in-order with respect to these fields. // // This will be called at most once per serialization. StateFields() []string } // SaverLoader must be implemented by struct types. type SaverLoader interface { // StateSave saves the state of the object to the given Map. StateSave(Sink) // StateLoad loads the state of the object. StateLoad(context.Context, Source) } // Source is used for Type.StateLoad. type Source struct { internal objectDecoder } // Load loads the given object passed as a pointer.. // // See Sink.Save for an example. func (s Source) Load(slot int, objPtr any) { s.internal.load(slot, reflect.ValueOf(objPtr), false, nil) } // LoadWait loads the given objects from the map, and marks it as requiring all // AfterLoad executions to complete prior to running this object's AfterLoad. // // See Sink.Save for an example. func (s Source) LoadWait(slot int, objPtr any) { s.internal.load(slot, reflect.ValueOf(objPtr), true, nil) } // LoadValue loads the given object value from the map. // // See Sink.SaveValue for an example. func (s Source) LoadValue(slot int, objPtr any, fn func(any)) { o := reflect.ValueOf(objPtr) s.internal.load(slot, o, true, func() { fn(o.Elem().Interface()) }) } // AfterLoad schedules a function execution when all objects have been // allocated and their automated loading and customized load logic have been // executed. fn will not be executed until all of current object's // dependencies' AfterLoad() logic, if exist, have been executed. func (s Source) AfterLoad(fn func()) { s.internal.afterLoad(fn) } // Context returns the context object provided at load time. func (s Source) Context() context.Context { return s.internal.ds.ctx } // IsZeroValue checks if the given value is the zero value. // // This function is used by the stateify tool. func IsZeroValue(val any) bool { return val == nil || reflect.ValueOf(val).Elem().IsZero() } // Failf is a wrapper around panic that should be used to generate errors that // can be caught during saving and loading. func Failf(fmtStr string, v ...any) { panic(fmt.Errorf(fmtStr, v...)) } // safely executes the given function, catching a panic and unpacking as an // error. // // The error flow through the state package uses panic and recover. There are // two important reasons for this: // // 1) Many of the reflection methods will already panic with invalid data or // violated assumptions. 
We would want to recover anyways here. // // 2) It allows us to eliminate boilerplate within Save() and Load() functions. // In nearly all cases, when the low-level serialization functions fail, you // will want the checkpoint to fail anyways. Plumbing errors through every // method doesn't add a lot of value. If there are specific error conditions // that you'd like to handle, you should add appropriate functionality to // objects themselves prior to calling Save() and Load(). func safely(fn func()) (err error) { defer func() { if r := recover(); r != nil { if es, ok := r.(*ErrState); ok { err = es // Propagate. return } // Build a new state error. es := new(ErrState) if e, ok := r.(error); ok { es.err = e } else { es.err = fmt.Errorf("%v", r) } // Make a stack. We don't know how big it will be ahead // of time, but want to make sure we get the whole // thing. So we just do a stupid brute force approach. var stack []byte for sz := 1024; ; sz *= 2 { stack = make([]byte, sz) n := runtime.Stack(stack, false) if n < sz { es.trace = string(stack[:n]) break } } // Set the error. err = es } }() // Execute the function. fn() return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/state_norace.go000066400000000000000000000012361465435605700235440ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !race // +build !race package state var raceEnabled = false golang-gvisor-gvisor-0.0~20240729.0/pkg/state/state_race.go000066400000000000000000000012331465435605700232040ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build race // +build race package state var raceEnabled = true golang-gvisor-gvisor-0.0~20240729.0/pkg/state/statefile/000077500000000000000000000000001465435605700225245ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/state/statefile/async_io.go000066400000000000000000000043201465435605700246560ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package statefile import ( "runtime" "sync/atomic" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sync" ) type chunk struct { dst []byte off int64 } // AsyncReader can be used to do reads asynchronously. It does not change the // underlying file's offset. type AsyncReader struct { // in is the backing file which contains all pages. in *fd.FD // off is the offset being read. off int64 // q is the work queue. q chan chunk // err stores the latest IO error that occured during async read. err atomic.Pointer[error] // wg tracks all in flight work. wg sync.WaitGroup } // NewAsyncReader initializes a new AsyncReader. func NewAsyncReader(in *fd.FD, off int64) *AsyncReader { workers := runtime.GOMAXPROCS(0) r := &AsyncReader{ in: in, off: off, q: make(chan chunk, workers), } for i := 0; i < workers; i++ { go r.work() } return r } // ReadAsync schedules a read of len(p) bytes from current offset into p. func (r *AsyncReader) ReadAsync(p []byte) { r.wg.Add(1) r.q <- chunk{off: r.off, dst: p} r.off += int64(len(p)) } // Wait blocks until all in flight work is complete and then returns any IO // errors that occurred since the last call to Wait(). func (r *AsyncReader) Wait() error { r.wg.Wait() if err := r.err.Swap(nil); err != nil { return *err } return nil } // Close calls Wait() and additionally cleans up all worker goroutines. func (r *AsyncReader) Close() error { err := r.Wait() close(r.q) return err } func (r *AsyncReader) work() { for { c := <-r.q if c.dst == nil { return } if _, err := r.in.ReadAt(c.dst, c.off); err != nil { r.err.Store(&err) } r.wg.Done() } } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/statefile/statefile.go000066400000000000000000000244441465435605700250430ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package statefile defines the state file data stream. // // This package currently does not include any details regarding the state // encoding itself, only details regarding state metadata and data layout. // // The file format is defined as follows. // // /------------------------------------------------------\ // | header (8-bytes) | // +------------------------------------------------------+ // | metadata length (8-bytes) | // +------------------------------------------------------+ // | metadata | // +------------------------------------------------------+ // | data | // \------------------------------------------------------/ // // First, it includes a 8-byte magic header which is the following // sequence of bytes [0x67, 0x56, 0x69, 0x73, 0x6f, 0x72, 0x53, 0x46] // // This header is followed by an 8-byte length N (big endian), and an // ASCII-encoded JSON map that is exactly N bytes long. // // This map includes only strings for keys and strings for values. Keys in the // map that begin with "_" are for internal use only. They may be read, but may // not be provided by the user. 
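// For example (the keys and values shown here are illustrative; the exact set
// depends on the writer), the metadata might decode to:
//
//	{"compression": "flate-best-speed", "_timestamp": "2024-07-29 00:00:00 +0000 UTC"}
//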
In the future, this metadata may contain some // information relating to the state encoding itself. // // After the map, the remainder of the file is the state data. package statefile import ( "bytes" "compress/flate" "crypto/hmac" "crypto/sha256" "encoding/binary" "encoding/json" "fmt" "hash" "io" "strings" "time" "gvisor.dev/gvisor/pkg/compressio" ) // keySize is the AES-256 key length. const keySize = 32 // compressionChunkSize is the chunk size for compression. const compressionChunkSize = 1024 * 1024 // maxMetadataSize is the size limit of metadata section. const maxMetadataSize = 16 * 1024 * 1024 // magicHeader is the byte sequence beginning each file. var magicHeader = []byte("\x67\x56\x69\x73\x6f\x72\x53\x46") // ErrBadMagic is returned if the header does not match. var ErrBadMagic = fmt.Errorf("bad magic header") // ErrMetadataMissing is returned if the state file is missing mandatory metadata. var ErrMetadataMissing = fmt.Errorf("missing metadata") // ErrInvalidMetadataLength is returned if the metadata length is too large. var ErrInvalidMetadataLength = fmt.Errorf("metadata length invalid, maximum size is %d", maxMetadataSize) // ErrMetadataInvalid is returned if passed metadata is invalid. var ErrMetadataInvalid = fmt.Errorf("metadata invalid, can't start with _") // ErrInvalidFlags is returned if passed flags set is invalid. var ErrInvalidFlags = fmt.Errorf("flags set is invalid") const ( // CompressionKey is the key for the compression level in the metadata. CompressionKey = "compression" ) // CompressionLevel is the image compression level. type CompressionLevel string const ( // CompressionLevelFlateBestSpeed represents flate algorithm in best-speed mode. CompressionLevelFlateBestSpeed = CompressionLevel("flate-best-speed") // CompressionLevelNone represents the absence of any compression on an image. CompressionLevelNone = CompressionLevel("none") // CompressionLevelDefault represents the default compression level. CompressionLevelDefault = CompressionLevelFlateBestSpeed ) func (c CompressionLevel) String() string { return string(c) } // Options is statefile options. type Options struct { // Compression is an image compression type/level. Compression CompressionLevel // Resume indicates if the sandbox process should continue running // after checkpointing. Resume bool } // WriteToMetadata save options to the metadata storage. Method returns the // reference to the original metadata map to allow to be used in the chain calls. func (o Options) WriteToMetadata(metadata map[string]string) map[string]string { metadata[CompressionKey] = string(o.Compression) return metadata } // CompressionLevelFromString parses a string into the CompressionLevel. func CompressionLevelFromString(val string) (CompressionLevel, error) { switch val { case string(CompressionLevelFlateBestSpeed): return CompressionLevelFlateBestSpeed, nil case string(CompressionLevelNone): return CompressionLevelNone, nil case "": return CompressionLevelDefault, nil default: return CompressionLevelNone, ErrInvalidFlags } } // CompressionLevelFromMetadata returns image compression type stored in the metadata. // If the metadata doesn't contain compression information the default behavior // is the "flate-best-speed" state because the default behavior used to be to always // compress. If the parameter is missing it will be set to default. 
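// A minimal usage sketch (the metadata literal below is illustrative):
//
//	md := map[string]string{CompressionKey: string(CompressionLevelFlateBestSpeed)}
//	level, err := CompressionLevelFromMetadata(md)
//	if err != nil {
//		// An unrecognized value yields ErrInvalidFlags.
//	}
//	_ = level // CompressionLevelFlateBestSpeed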
func CompressionLevelFromMetadata(metadata map[string]string) (CompressionLevel, error) { compression := CompressionLevelDefault if val, ok := metadata[CompressionKey]; ok { var err error if compression, err = CompressionLevelFromString(val); err != nil { return CompressionLevelNone, err } } else { metadata[CompressionKey] = string(compression) } return compression, nil } func writeMetadataLen(w io.Writer, val uint64) error { var buf [8]byte binary.BigEndian.PutUint64(buf[:], val) _, err := w.Write(buf[:]) return err } // NewWriter returns a state data writer for a statefile. // // Note that the returned WriteCloser must be closed. func NewWriter(w io.Writer, key []byte, metadata map[string]string) (io.WriteCloser, error) { if metadata == nil { metadata = make(map[string]string) } for k := range metadata { if strings.HasPrefix(k, "_") { return nil, ErrMetadataInvalid } } // Create our HMAC function. h := hmac.New(sha256.New, key) mw := io.MultiWriter(w, h) // First, write the header. if _, err := mw.Write(magicHeader); err != nil { return nil, err } // Generate a timestamp, for convenience only. metadata["_timestamp"] = time.Now().UTC().String() defer delete(metadata, "_timestamp") // Save compression state compression, err := CompressionLevelFromMetadata(metadata) if err != nil { return nil, err } // Write the metadata. b, err := json.Marshal(metadata) if err != nil { return nil, err } if len(b) > maxMetadataSize { return nil, ErrInvalidMetadataLength } // Metadata length. if err := writeMetadataLen(mw, uint64(len(b))); err != nil { return nil, err } // Metadata bytes; io.MultiWriter will return a short write error if // any of the writers returns < n. if _, err := mw.Write(b); err != nil { return nil, err } // Write the current hash. cur := h.Sum(nil) for done := 0; done < len(cur); { n, err := mw.Write(cur[done:]) done += n if err != nil { return nil, err } } // Wrap in compression. When using "best compression" mode, there is usually // only a little gain in file size reduction, which translate to even smaller // gain in restore latency reduction, while inccuring much more CPU usage at // save time. if compression == CompressionLevelFlateBestSpeed { return compressio.NewWriter(w, key, compressionChunkSize, flate.BestSpeed) } return compressio.NewSimpleWriter(w, key) } // MetadataUnsafe reads out the metadata from a state file without verifying any // HMAC. This function shouldn't be called for untrusted input files. func MetadataUnsafe(r io.Reader) (map[string]string, error) { return metadata(r, nil) } func readMetadataLen(r io.Reader) (uint64, error) { var buf [8]byte if _, err := io.ReadFull(r, buf[:]); err != nil { return 0, err } return binary.BigEndian.Uint64(buf[:]), nil } // metadata validates the magic header and reads out the metadata from a state // data stream. func metadata(r io.Reader, h hash.Hash) (map[string]string, error) { if h != nil { r = io.TeeReader(r, h) } // Read and validate magic header. b := make([]byte, len(magicHeader)) if _, err := r.Read(b); err != nil { return nil, err } if !bytes.Equal(b, magicHeader) { return nil, ErrBadMagic } // Read and validate metadata. 
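	// The closure below reads the 8-byte big-endian metadata length, rejects
	// anything larger than maxMetadataSize, and then reads exactly that many
	// bytes of JSON; any panic raised while doing so is converted into a
	// returned error by the deferred recover.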
b, err := func() (b []byte, err error) { defer func() { if r := recover(); r != nil { b = nil err = fmt.Errorf("%v", r) } }() metadataLen, err := readMetadataLen(r) if err != nil { return nil, err } if metadataLen > maxMetadataSize { return nil, ErrInvalidMetadataLength } b = make([]byte, int(metadataLen)) if _, err := io.ReadFull(r, b); err != nil { return nil, err } return b, nil }() if err != nil { return nil, err } if h != nil { // Check the hash prior to decoding. cur := h.Sum(nil) buf := make([]byte, len(cur)) if _, err := io.ReadFull(r, buf); err != nil { return nil, err } if !hmac.Equal(cur, buf) { return nil, compressio.ErrHashMismatch } } // Decode the metadata. metadata := make(map[string]string) if err := json.Unmarshal(b, &metadata); err != nil { return nil, err } return metadata, nil } // NewReader returns a reader for a statefile. func NewReader(r io.Reader, key []byte) (io.Reader, map[string]string, error) { // Read the metadata with the hash. h := hmac.New(sha256.New, key) metadata, err := metadata(r, h) if err != nil { return nil, nil, err } // Determine image compression state. If the metadata doesn't contain // compression information the default behavior is the "compressed" state // because the default behavior used to be to always compress. compression, err := CompressionLevelFromMetadata(metadata) if err != nil { return nil, nil, err } // Pick correct reader var cr io.Reader if compression == CompressionLevelFlateBestSpeed { cr, err = compressio.NewReader(r, key) } else if compression == CompressionLevelNone { cr, err = compressio.NewSimpleReader(r, key) } else { // Should never occur, as it has the default path. return nil, nil, fmt.Errorf("metadata contains invalid compression flag value: %v", compression) } if err != nil { return nil, nil, err } return cr, metadata, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/statefile/statefile_state_autogen.go000066400000000000000000000000731465435605700277550ustar00rootroot00000000000000// automatically generated by stateify. package statefile golang-gvisor-gvisor-0.0~20240729.0/pkg/state/stats.go000066400000000000000000000074131465435605700222360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package state import ( "bytes" "fmt" "sort" "time" ) type statEntry struct { count uint total time.Duration } // Stats tracks encode / decode timing. // // This currently provides a meaningful String function and no other way to // extract stats about individual types. // // All exported receivers accept nil. type Stats struct { // byType contains a breakdown of time spent by type. // // This is indexed *directly* by typeID, including zero. byType []statEntry // stack contains objects in progress. stack []typeID // names contains type names. // // This is also indexed *directly* by typeID, including zero, which we // hard-code as "state.default". This is only resolved by calling fini // on the stats object. names []string // last is the last start time. 
last time.Time } // init initializes statistics. func (s *Stats) init() { s.last = time.Now() s.stack = append(s.stack, 0) } // fini finalizes statistics. func (s *Stats) fini(resolve func(id typeID) string) { s.done() // Resolve all type names. s.names = make([]string, len(s.byType)) s.names[0] = "state.default" // See above. for id := typeID(1); int(id) < len(s.names); id++ { s.names[id] = resolve(id) } } // sample adds the samples to the given object. func (s *Stats) sample(id typeID) { now := time.Now() if len(s.byType) <= int(id) { // Allocate all the missing entries in one fell swoop. s.byType = append(s.byType, make([]statEntry, 1+int(id)-len(s.byType))...) } s.byType[id].total += now.Sub(s.last) s.last = now } // start starts a sample. func (s *Stats) start(id typeID) { last := s.stack[len(s.stack)-1] s.sample(last) s.stack = append(s.stack, id) } // done finishes the current sample. func (s *Stats) done() { last := s.stack[len(s.stack)-1] s.sample(last) s.byType[last].count++ s.stack = s.stack[:len(s.stack)-1] } type sliceEntry struct { name string entry *statEntry } // String returns a table representation of the stats. func (s *Stats) String() string { // Build a list of stat entries. ss := make([]sliceEntry, 0, len(s.byType)) for id := 0; id < len(s.names); id++ { ss = append(ss, sliceEntry{ name: s.names[id], entry: &s.byType[id], }) } // Sort by total time (descending). sort.Slice(ss, func(i, j int) bool { return ss[i].entry.total > ss[j].entry.total }) // Print the stat results. var ( buf bytes.Buffer count uint total time.Duration ) buf.WriteString("\n") buf.WriteString(fmt.Sprintf("% 16s | % 8s | % 16s | %s\n", "total", "count", "per", "type")) buf.WriteString("-----------------+----------+------------------+----------------\n") for _, se := range ss { if se.entry.count == 0 { // Since we store all types linearly, we are not // guaranteed that any entry actually has time. continue } count += se.entry.count total += se.entry.total per := se.entry.total / time.Duration(se.entry.count) buf.WriteString(fmt.Sprintf("% 16s | %8d | % 16s | %s\n", se.entry.total, se.entry.count, per, se.name)) } buf.WriteString("-----------------+----------+------------------+----------------\n") buf.WriteString(fmt.Sprintf("% 16s | % 8d | % 16s | [all]", total, count, total/time.Duration(count))) return string(buf.Bytes()) } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/types.go000066400000000000000000000266021465435605700222450ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package state import ( "reflect" "sort" "gvisor.dev/gvisor/pkg/state/wire" ) // assertValidType asserts that the type is valid. 
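// For example (illustrative values), a name of "pkg.X" with fields
// ["A", "B"] is valid, while an empty name, an empty field name, or a
// duplicate field name triggers a Failf panic.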
func assertValidType(name string, fields []string) { if name == "" { Failf("type has empty name") } fieldsCopy := make([]string, len(fields)) for i := 0; i < len(fields); i++ { if fields[i] == "" { Failf("field has empty name for type %q", name) } fieldsCopy[i] = fields[i] } sort.Slice(fieldsCopy, func(i, j int) bool { return fieldsCopy[i] < fieldsCopy[j] }) for i := range fieldsCopy { if i > 0 && fieldsCopy[i-1] == fieldsCopy[i] { Failf("duplicate field %q for type %s", fieldsCopy[i], name) } } } // typeEntry is an entry in the typeDatabase. type typeEntry struct { ID typeID wire.Type } // reconciledTypeEntry is a reconciled entry in the typeDatabase. type reconciledTypeEntry struct { wire.Type LocalType reflect.Type FieldOrder []int } // typeEncodeDatabase is an internal TypeInfo database for encoding. type typeEncodeDatabase struct { // byType maps by type to the typeEntry. byType map[reflect.Type]*typeEntry // lastID is the last used ID. lastID typeID } // makeTypeEncodeDatabase makes a typeDatabase. func makeTypeEncodeDatabase() typeEncodeDatabase { return typeEncodeDatabase{ byType: make(map[reflect.Type]*typeEntry), } } // typeDecodeDatabase is an internal TypeInfo database for decoding. type typeDecodeDatabase struct { // byID maps by ID to type. byID []*reconciledTypeEntry // pending are entries that are pending validation by Lookup. These // will be reconciled with actual objects. Note that these will also be // used to lookup types by name, since they may not be reconciled and // there's little value to deleting from this map. pending []*wire.Type } // makeTypeDecodeDatabase makes a typeDatabase. func makeTypeDecodeDatabase() typeDecodeDatabase { return typeDecodeDatabase{} } // lookupNameFields extracts the name and fields from an object. func lookupNameFields(typ reflect.Type) (string, []string, bool) { v := reflect.Zero(reflect.PtrTo(typ)).Interface() t, ok := v.(Type) if !ok { // Is this a primitive? if typ.Kind() == reflect.Interface { return interfaceType, nil, true } name := typ.Name() if _, ok := primitiveTypeDatabase[name]; !ok { // This is not a known type, and not a primitive. The // encoder may proceed for anonymous empty structs, or // it may deference the type pointer and try again. return "", nil, false } return name, nil, true } // Sanity check the type. if raceEnabled { if _, ok := reverseTypeDatabase[typ]; !ok { // The type was not registered? Must be an embedded // structure or something else. return "", nil, false } } // Extract the name from the object. name := t.StateTypeName() fields := t.StateFields() assertValidType(name, fields) return name, fields, true } // Lookup looks up or registers the given object. // // The bool indicates whether this is an existing entry: false means the entry // did not exist, and true means the entry did exist. If this bool is false and // the returned typeEntry are nil, then the obj did not implement the Type // interface. func (tdb *typeEncodeDatabase) Lookup(typ reflect.Type) (*typeEntry, bool) { te, ok := tdb.byType[typ] if !ok { // Lookup the type information. name, fields, ok := lookupNameFields(typ) if !ok { // Empty structs may still be encoded, so let the // caller decide what to do from here. return nil, false } // Register the new type. tdb.lastID++ te = &typeEntry{ ID: tdb.lastID, Type: wire.Type{ Name: name, Fields: fields, }, } // All done. tdb.byType[typ] = te return te, false } return te, true } // Register adds a typeID entry. 
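// IDs are assigned implicitly by registration order: the first registered
// type is ID 1, the next ID 2, and so on, which is why LookupName and Lookup
// below index pending by id-1.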
func (tbd *typeDecodeDatabase) Register(typ *wire.Type) { assertValidType(typ.Name, typ.Fields) tbd.pending = append(tbd.pending, typ) } // LookupName looks up the type name by ID. func (tbd *typeDecodeDatabase) LookupName(id typeID) string { if len(tbd.pending) < int(id) { // This is likely an encoder error? Failf("type ID %d not available", id) } return tbd.pending[id-1].Name } // LookupType looks up the type by ID. func (tbd *typeDecodeDatabase) LookupType(id typeID) reflect.Type { name := tbd.LookupName(id) typ, ok := globalTypeDatabase[name] if !ok { // If not available, see if it's primitive. typ, ok = primitiveTypeDatabase[name] if !ok && name == interfaceType { // Matches the built-in interface type. var i any return reflect.TypeOf(&i).Elem() } if !ok { // The type is perhaps not registered? Failf("type name %q is not available", name) } return typ // Primitive type. } return typ // Registered type. } // singleFieldOrder defines the field order for a single field. var singleFieldOrder = []int{0} // Lookup looks up or registers the given object. // // First, the typeID is searched to see if this has already been appropriately // reconciled. If no, then a reconciliation will take place that may result in a // field ordering. If a nil reconciledTypeEntry is returned from this method, // then the object does not support the Type interface. // // This method never returns nil. func (tbd *typeDecodeDatabase) Lookup(id typeID, typ reflect.Type) *reconciledTypeEntry { if len(tbd.byID) > int(id) && tbd.byID[id-1] != nil { // Already reconciled. return tbd.byID[id-1] } // The ID has not been reconciled yet. That's fine. We need to make // sure it aligns with the current provided object. if len(tbd.pending) < int(id) { // This id was never registered. Probably an encoder error? Failf("typeDatabase does not contain id %d", id) } // Extract the pending info. pending := tbd.pending[id-1] // Grow the byID list. if len(tbd.byID) < int(id) { tbd.byID = append(tbd.byID, make([]*reconciledTypeEntry, int(id)-len(tbd.byID))...) } // Reconcile the type. name, fields, ok := lookupNameFields(typ) if !ok { // Empty structs are decoded only when the type is nil. Since // this isn't the case, we fail here. Failf("unsupported type %q during decode; can't reconcile", pending.Name) } if name != pending.Name { // Are these the same type? Print a helpful message as this may // actually happen in practice if types change. Failf("typeDatabase contains conflicting definitions for id %d: %s->%v (current) and %s->%v (existing)", id, name, fields, pending.Name, pending.Fields) } rte := &reconciledTypeEntry{ Type: wire.Type{ Name: name, Fields: fields, }, LocalType: typ, } // If there are zero or one fields, then we skip allocating the field // slice. There is special handling for decoding in this case. If the // field name does not match, it will be caught in the general purpose // code below. if len(fields) != len(pending.Fields) { Failf("type %q contains different fields: %v (decode) and %v (encode)", name, fields, pending.Fields) } if len(fields) == 0 { tbd.byID[id-1] = rte // Save. return rte } if len(fields) == 1 && fields[0] == pending.Fields[0] { tbd.byID[id-1] = rte // Save. rte.FieldOrder = singleFieldOrder return rte } // For each field in the current object's information, match it to a // field in the destination object. We know from the assertion above // and the insertion on insertion to pending that neither field // contains any duplicates. 
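	// For example (hypothetical field lists): if the local, decode-side
	// fields are ["A", "B"] and the encoded fields were ["B", "A"], the loop
	// below produces fieldOrder = [1, 0], i.e. local field i corresponds to
	// encoded slot fieldOrder[i].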
fieldOrder := make([]int, len(fields)) for i, name := range fields { fieldOrder[i] = -1 // Sentinel. // Is it an exact match? if pending.Fields[i] == name { fieldOrder[i] = i continue } // Find the matching field. for j, otherName := range pending.Fields { if name == otherName { fieldOrder[i] = j break } } if fieldOrder[i] == -1 { // The type name matches but we are lacking some common fields. Failf("type %q has mismatched fields: %v (decode) and %v (encode)", name, fields, pending.Fields) } } // The type has been reeconciled. rte.FieldOrder = fieldOrder tbd.byID[id-1] = rte return rte } // interfaceType defines all interfaces. const interfaceType = "interface" // primitiveTypeDatabase is a set of fixed types. var primitiveTypeDatabase = func() map[string]reflect.Type { r := make(map[string]reflect.Type) for _, t := range []reflect.Type{ reflect.TypeOf(false), reflect.TypeOf(int(0)), reflect.TypeOf(int8(0)), reflect.TypeOf(int16(0)), reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)), reflect.TypeOf(uint(0)), reflect.TypeOf(uintptr(0)), reflect.TypeOf(uint8(0)), reflect.TypeOf(uint16(0)), reflect.TypeOf(uint32(0)), reflect.TypeOf(uint64(0)), reflect.TypeOf(""), reflect.TypeOf(float32(0.0)), reflect.TypeOf(float64(0.0)), reflect.TypeOf(complex64(0.0)), reflect.TypeOf(complex128(0.0)), } { r[t.Name()] = t } return r }() // globalTypeDatabase is used for dispatching interfaces on decode. var globalTypeDatabase = map[string]reflect.Type{} // reverseTypeDatabase is a reverse mapping. var reverseTypeDatabase = map[reflect.Type]string{} // Release releases references to global type databases. // Must only be called in contexts where they will definitely never be used, // in order to save memory. func Release() { globalTypeDatabase = nil reverseTypeDatabase = nil } // Register registers a type. // // This must be called on init and only done once. func Register(t Type) { name := t.StateTypeName() typ := reflect.TypeOf(t) if raceEnabled { assertValidType(name, t.StateFields()) // Register must always be called on pointers. if typ.Kind() != reflect.Ptr { Failf("Register must be called on pointers") } } typ = typ.Elem() if raceEnabled { if typ.Kind() == reflect.Struct { // All registered structs must implement SaverLoader. We allow // the registration is non-struct types with just the Type // interface, but we need to call StateSave/StateLoad methods // on aggregate types. if _, ok := t.(SaverLoader); !ok { Failf("struct %T does not implement SaverLoader", t) } } else { // Non-structs must not have any fields. We don't support // calling StateSave/StateLoad methods on any non-struct types. // If custom behavior is required, these types should be // wrapped in a structure of some kind. if fields := t.StateFields(); len(fields) != 0 { Failf("non-struct %T has non-zero fields %v", t, fields) } // We don't allow non-structs to implement StateSave/StateLoad // methods, because they won't be called and it's confusing. 
if _, ok := t.(SaverLoader); ok { Failf("non-struct %T implements SaverLoader", t) } } if _, ok := primitiveTypeDatabase[name]; ok { Failf("conflicting primitiveTypeDatabase entry for %T: used by primitive", t) } if _, ok := globalTypeDatabase[name]; ok { Failf("conflicting globalTypeDatabase entries for %T: name conflict", t) } if name == interfaceType { Failf("conflicting name for %T: matches interfaceType", t) } reverseTypeDatabase[typ] = name } globalTypeDatabase[name] = typ } golang-gvisor-gvisor-0.0~20240729.0/pkg/state/wire/000077500000000000000000000000001465435605700215125ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/state/wire/wire.go000066400000000000000000000501071465435605700230120ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package wire contains a few basic types that can be composed to serialize // graph information for the state package. This package defines the wire // protocol. // // Note that these types are careful about how they implement the relevant // interfaces (either value receiver or pointer receiver), so that native-sized // types, such as integers and simple pointers, can fit inside the interface // object. // // This package also uses panic as control flow, so called should be careful to // wrap calls in appropriate handlers. // // Testing for this package is driven by the state test package. package wire import ( "fmt" "io" "math" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sync" ) var oneByteArrayPool = sync.Pool{ New: func() any { return &[1]byte{} }, } // readFull is a utility. The equivalent is not needed for Write, but the API // contract dictates that it must always complete all bytes given or return an // error. func readFull(r io.Reader, p []byte) { for done := 0; done < len(p); { n, err := r.Read(p[done:]) done += n if n == 0 && err != nil { panic(err) } } } // Object is a generic object. type Object interface { // save saves the given object. // // Panic is used for error control flow. save(io.Writer) // load loads a new object of the given type. // // Panic is used for error control flow. load(io.Reader) Object } // Bool is a boolean. type Bool bool // loadBool loads an object of type Bool. func loadBool(r io.Reader) Bool { b := loadUint(r) return Bool(b == 1) } // save implements Object.save. func (b Bool) save(w io.Writer) { var v Uint if b { v = 1 } else { v = 0 } v.save(w) } // load implements Object.load. func (Bool) load(r io.Reader) Object { return loadBool(r) } // Int is a signed integer. // // This uses varint encoding. type Int int64 // loadInt loads an object of type Int. func loadInt(r io.Reader) Int { u := loadUint(r) x := Int(u >> 1) if u&1 != 0 { x = ^x } return x } // save implements Object.save. func (i Int) save(w io.Writer) { u := Uint(i) << 1 if i < 0 { u = ^u } u.save(w) } // load implements Object.load. func (Int) load(r io.Reader) Object { return loadInt(r) } // Uint is an unsigned integer. 
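// This uses base-128 varint encoding: seven payload bits per byte, with the
// high bit set on continuation bytes (see Uint.save below). For example
// (values chosen purely for illustration):
//
//	Uint(1)   encodes as 0x01
//	Uint(300) encodes as 0xAC 0x02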
type Uint uint64 func readByte(r io.Reader) byte { p := oneByteArrayPool.Get().(*[1]byte) defer oneByteArrayPool.Put(p) n, err := r.Read(p[:]) if n != 1 { panic(err) } return p[0] } // loadUint loads an object of type Uint. func loadUint(r io.Reader) Uint { var ( u Uint s uint ) for i := 0; i <= 9; i++ { b := readByte(r) if b < 0x80 { if i == 9 && b > 1 { panic("overflow") } u |= Uint(b) << s return u } u |= Uint(b&0x7f) << s s += 7 } panic("unreachable") } func writeByte(w io.Writer, b byte) { p := oneByteArrayPool.Get().(*[1]byte) defer oneByteArrayPool.Put(p) p[0] = b n, err := w.Write(p[:]) if n != 1 { panic(err) } } // save implements Object.save. func (u Uint) save(w io.Writer) { for u >= 0x80 { writeByte(w, byte(u)|0x80) u >>= 7 } writeByte(w, byte(u)) } // load implements Object.load. func (Uint) load(r io.Reader) Object { return loadUint(r) } // Float32 is a 32-bit floating point number. type Float32 float32 // loadFloat32 loads an object of type Float32. func loadFloat32(r io.Reader) Float32 { n := loadUint(r) return Float32(math.Float32frombits(uint32(n))) } // save implements Object.save. func (f Float32) save(w io.Writer) { n := Uint(math.Float32bits(float32(f))) n.save(w) } // load implements Object.load. func (Float32) load(r io.Reader) Object { return loadFloat32(r) } // Float64 is a 64-bit floating point number. type Float64 float64 // loadFloat64 loads an object of type Float64. func loadFloat64(r io.Reader) Float64 { n := loadUint(r) return Float64(math.Float64frombits(uint64(n))) } // save implements Object.save. func (f Float64) save(w io.Writer) { n := Uint(math.Float64bits(float64(f))) n.save(w) } // load implements Object.load. func (Float64) load(r io.Reader) Object { return loadFloat64(r) } // Complex64 is a 64-bit complex number. type Complex64 complex128 // loadComplex64 loads an object of type Complex64. func loadComplex64(r io.Reader) Complex64 { re := loadFloat32(r) im := loadFloat32(r) return Complex64(complex(float32(re), float32(im))) } // save implements Object.save. func (c *Complex64) save(w io.Writer) { re := Float32(real(*c)) im := Float32(imag(*c)) re.save(w) im.save(w) } // load implements Object.load. func (*Complex64) load(r io.Reader) Object { c := loadComplex64(r) return &c } // Complex128 is a 128-bit complex number. type Complex128 complex128 // loadComplex128 loads an object of type Complex128. func loadComplex128(r io.Reader) Complex128 { re := loadFloat64(r) im := loadFloat64(r) return Complex128(complex(float64(re), float64(im))) } // save implements Object.save. func (c *Complex128) save(w io.Writer) { re := Float64(real(*c)) im := Float64(imag(*c)) re.save(w) im.save(w) } // load implements Object.load. func (*Complex128) load(r io.Reader) Object { c := loadComplex128(r) return &c } // String is a string. type String string // loadString loads an object of type String. func loadString(r io.Reader) String { l := loadUint(r) p := make([]byte, l) readFull(r, p) return String(gohacks.StringFromImmutableBytes(p)) } // save implements Object.save. func (s *String) save(w io.Writer) { l := Uint(len(*s)) l.save(w) p := gohacks.ImmutableBytesFromString(string(*s)) _, err := w.Write(p) // Must write all bytes. if err != nil { panic(err) } } // load implements Object.load. func (*String) load(r io.Reader) Object { s := loadString(r) return &s } // Dot is a kind of reference: one of Index and FieldName. type Dot interface { isDot() } // Index is a reference resolution. type Index uint32 func (Index) isDot() {} // FieldName is a reference resolution. 
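// Within Ref.Dots a FieldName is encoded as a negative Int whose magnitude is
// the length of the name, followed by the raw name bytes; an Index is encoded
// as a non-negative Int. See loadRef and Ref.save below.
//
// For example (illustrative), the field name "x" is written as Int(-1)
// followed by the byte 'x', while Index(2) is written simply as Int(2).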
type FieldName string func (*FieldName) isDot() {} // Ref is a reference to an object. type Ref struct { // Root is the root object. Root Uint // Dots is the set of traversals required from the Root object above. // Note that this will be stored in reverse order for efficiency. Dots []Dot // Type is the base type for the root object. This is non-nil iff Dots // is non-zero length (that is, this is a complex reference). This is // not *strictly* necessary, but can be used to simplify decoding. Type TypeSpec } // loadRef loads an object of type Ref (abstract). func loadRef(r io.Reader) Ref { ref := Ref{ Root: loadUint(r), } l := loadUint(r) ref.Dots = make([]Dot, l) for i := 0; i < int(l); i++ { // Disambiguate between an Index (non-negative) and a field // name (negative). This does some space and avoids a dedicate // loadDot function. See Ref.save for the other side. d := loadInt(r) if d >= 0 { ref.Dots[i] = Index(d) continue } p := make([]byte, -d) readFull(r, p) fieldName := FieldName(gohacks.StringFromImmutableBytes(p)) ref.Dots[i] = &fieldName } if l != 0 { // Only if dots is non-zero. ref.Type = loadTypeSpec(r) } return ref } // save implements Object.save. func (r *Ref) save(w io.Writer) { r.Root.save(w) l := Uint(len(r.Dots)) l.save(w) for _, d := range r.Dots { // See LoadRef. We use non-negative numbers to encode Index // objects and negative numbers to encode field lengths. switch x := d.(type) { case Index: i := Int(x) i.save(w) case *FieldName: d := Int(-len(*x)) d.save(w) p := gohacks.ImmutableBytesFromString(string(*x)) if _, err := w.Write(p); err != nil { panic(err) } default: panic("unknown dot implementation") } } if l != 0 { // See above. saveTypeSpec(w, r.Type) } } // load implements Object.load. func (*Ref) load(r io.Reader) Object { ref := loadRef(r) return &ref } // Nil is a primitive zero value of any type. type Nil struct{} // loadNil loads an object of type Nil. func loadNil(r io.Reader) Nil { return Nil{} } // save implements Object.save. func (Nil) save(w io.Writer) {} // load implements Object.load. func (Nil) load(r io.Reader) Object { return loadNil(r) } // Slice is a slice value. type Slice struct { Length Uint Capacity Uint Ref Ref } // loadSlice loads an object of type Slice. func loadSlice(r io.Reader) Slice { return Slice{ Length: loadUint(r), Capacity: loadUint(r), Ref: loadRef(r), } } // save implements Object.save. func (s *Slice) save(w io.Writer) { s.Length.save(w) s.Capacity.save(w) s.Ref.save(w) } // load implements Object.load. func (*Slice) load(r io.Reader) Object { s := loadSlice(r) return &s } // Array is an array value. type Array struct { Contents []Object } // loadArray loads an object of type Array. func loadArray(r io.Reader) Array { l := loadUint(r) if l == 0 { // Note that there isn't a single object available to encode // the type of, so we need this additional branch. return Array{} } // All the objects here have the same type, so use dynamic dispatch // only once. All other objects will automatically take the same type // as the first object. contents := make([]Object, l) v := Load(r) contents[0] = v for i := 1; i < int(l); i++ { contents[i] = v.load(r) } return Array{ Contents: contents, } } // save implements Object.save. func (a *Array) save(w io.Writer) { l := Uint(len(a.Contents)) l.save(w) if l == 0 { // See LoadArray. return } // See above. Save(w, a.Contents[0]) for i := 1; i < int(l); i++ { a.Contents[i].save(w) } } // load implements Object.load. 
func (*Array) load(r io.Reader) Object { a := loadArray(r) return &a } // Map is a map value. type Map struct { Keys []Object Values []Object } // loadMap loads an object of type Map. func loadMap(r io.Reader) Map { l := loadUint(r) if l == 0 { // See LoadArray. return Map{} } // See type dispatch notes in Array. keys := make([]Object, l) values := make([]Object, l) k := Load(r) v := Load(r) keys[0] = k values[0] = v for i := 1; i < int(l); i++ { keys[i] = k.load(r) values[i] = v.load(r) } return Map{ Keys: keys, Values: values, } } // save implements Object.save. func (m *Map) save(w io.Writer) { l := Uint(len(m.Keys)) if int(l) != len(m.Values) { panic(fmt.Sprintf("mismatched keys (%d) Aand values (%d)", len(m.Keys), len(m.Values))) } l.save(w) if l == 0 { // See LoadArray. return } // See above. Save(w, m.Keys[0]) Save(w, m.Values[0]) for i := 1; i < int(l); i++ { m.Keys[i].save(w) m.Values[i].save(w) } } // load implements Object.load. func (*Map) load(r io.Reader) Object { m := loadMap(r) return &m } // TypeSpec is a type dereference. type TypeSpec interface { isTypeSpec() } // TypeID is a concrete type ID. type TypeID Uint func (TypeID) isTypeSpec() {} // TypeSpecPointer is a pointer type. type TypeSpecPointer struct { Type TypeSpec } func (*TypeSpecPointer) isTypeSpec() {} // TypeSpecArray is an array type. type TypeSpecArray struct { Count Uint Type TypeSpec } func (*TypeSpecArray) isTypeSpec() {} // TypeSpecSlice is a slice type. type TypeSpecSlice struct { Type TypeSpec } func (*TypeSpecSlice) isTypeSpec() {} // TypeSpecMap is a map type. type TypeSpecMap struct { Key TypeSpec Value TypeSpec } func (*TypeSpecMap) isTypeSpec() {} // TypeSpecNil is an empty type. type TypeSpecNil struct{} func (TypeSpecNil) isTypeSpec() {} // TypeSpec types. // // These use a distinct encoding on the wire, as they are used only in the // interface object. They are decoded through the dedicated loadTypeSpec and // saveTypeSpec functions. const ( typeSpecTypeID Uint = iota typeSpecPointer typeSpecArray typeSpecSlice typeSpecMap typeSpecNil ) // loadTypeSpec loads TypeSpec values. func loadTypeSpec(r io.Reader) TypeSpec { switch hdr := loadUint(r); hdr { case typeSpecTypeID: return TypeID(loadUint(r)) case typeSpecPointer: return &TypeSpecPointer{ Type: loadTypeSpec(r), } case typeSpecArray: return &TypeSpecArray{ Count: loadUint(r), Type: loadTypeSpec(r), } case typeSpecSlice: return &TypeSpecSlice{ Type: loadTypeSpec(r), } case typeSpecMap: return &TypeSpecMap{ Key: loadTypeSpec(r), Value: loadTypeSpec(r), } case typeSpecNil: return TypeSpecNil{} default: // This is not a valid stream? panic(fmt.Errorf("unknown header: %d", hdr)) } } // saveTypeSpec saves TypeSpec values. func saveTypeSpec(w io.Writer, t TypeSpec) { switch x := t.(type) { case TypeID: typeSpecTypeID.save(w) Uint(x).save(w) case *TypeSpecPointer: typeSpecPointer.save(w) saveTypeSpec(w, x.Type) case *TypeSpecArray: typeSpecArray.save(w) x.Count.save(w) saveTypeSpec(w, x.Type) case *TypeSpecSlice: typeSpecSlice.save(w) saveTypeSpec(w, x.Type) case *TypeSpecMap: typeSpecMap.save(w) saveTypeSpec(w, x.Key) saveTypeSpec(w, x.Value) case TypeSpecNil: typeSpecNil.save(w) default: // This should not happen? panic(fmt.Errorf("unknown type %T", t)) } } // Interface is an interface value. type Interface struct { Type TypeSpec Value Object } // loadInterface loads an object of type Interface. func loadInterface(r io.Reader) Interface { return Interface{ Type: loadTypeSpec(r), Value: Load(r), } } // save implements Object.save. 
func (i *Interface) save(w io.Writer) { saveTypeSpec(w, i.Type) Save(w, i.Value) } // load implements Object.load. func (*Interface) load(r io.Reader) Object { i := loadInterface(r) return &i } // Type is type information. type Type struct { Name string Fields []string } // loadType loads an object of type Type. func loadType(r io.Reader) Type { name := string(loadString(r)) l := loadUint(r) fields := make([]string, l) for i := 0; i < int(l); i++ { fields[i] = string(loadString(r)) } return Type{ Name: name, Fields: fields, } } // save implements Object.save. func (t *Type) save(w io.Writer) { s := String(t.Name) s.save(w) l := Uint(len(t.Fields)) l.save(w) for i := 0; i < int(l); i++ { s := String(t.Fields[i]) s.save(w) } } // load implements Object.load. func (*Type) load(r io.Reader) Object { t := loadType(r) return &t } // multipleObjects is a special type for serializing multiple objects. type multipleObjects []Object // loadMultipleObjects loads a series of objects. func loadMultipleObjects(r io.Reader) multipleObjects { l := loadUint(r) m := make(multipleObjects, l) for i := 0; i < int(l); i++ { m[i] = Load(r) } return m } // save implements Object.save. func (m *multipleObjects) save(w io.Writer) { l := Uint(len(*m)) l.save(w) for i := 0; i < int(l); i++ { Save(w, (*m)[i]) } } // load implements Object.load. func (*multipleObjects) load(r io.Reader) Object { m := loadMultipleObjects(r) return &m } // noObjects represents no objects. type noObjects struct{} // loadNoObjects loads a sentinel. func loadNoObjects(r io.Reader) noObjects { return noObjects{} } // save implements Object.save. func (noObjects) save(w io.Writer) {} // load implements Object.load. func (noObjects) load(r io.Reader) Object { return loadNoObjects(r) } // Struct is a basic composite value. type Struct struct { TypeID TypeID fields Object // Optionally noObjects or *multipleObjects. } // Field returns a pointer to the given field slot. // // This must be called after Alloc. func (s *Struct) Field(i int) *Object { if fields, ok := s.fields.(*multipleObjects); ok { return &((*fields)[i]) } if _, ok := s.fields.(noObjects); ok { // Alloc may be optionally called; can't call twice. panic("Field called inappropriately, wrong Alloc?") } return &s.fields } // Alloc allocates the given number of fields. // // This must be called before Add and Save. // // Precondition: slots must be positive. func (s *Struct) Alloc(slots int) { switch { case slots == 0: s.fields = noObjects{} case slots == 1: // Leave it alone. case slots > 1: fields := make(multipleObjects, slots) s.fields = &fields default: // Violates precondition. panic(fmt.Sprintf("Alloc called with negative slots %d?", slots)) } } // Fields returns the number of fields. func (s *Struct) Fields() int { switch x := s.fields.(type) { case *multipleObjects: return len(*x) case noObjects: return 0 default: return 1 } } // loadStruct loads an object of type Struct. func loadStruct(r io.Reader) Struct { return Struct{ TypeID: TypeID(loadUint(r)), fields: Load(r), } } // save implements Object.save. // // Precondition: Alloc must have been called, and the fields all filled in // appropriately. See Alloc and Add for more details. func (s *Struct) save(w io.Writer) { Uint(s.TypeID).save(w) Save(w, s.fields) } // load implements Object.load. func (*Struct) load(r io.Reader) Object { s := loadStruct(r) return &s } // Object types. // // N.B. Be careful about changing the order or introducing new elements in the // middle here. 
This is part of the wire format and shouldn't change. const ( typeBool Uint = iota typeInt typeUint typeFloat32 typeFloat64 typeNil typeRef typeString typeSlice typeArray typeMap typeStruct typeNoObjects typeMultipleObjects typeInterface typeComplex64 typeComplex128 typeType ) // Save saves the given object. // // +checkescape all // // N.B. This function will panic on error. func Save(w io.Writer, obj Object) { switch x := obj.(type) { case Bool: typeBool.save(w) x.save(w) case Int: typeInt.save(w) x.save(w) case Uint: typeUint.save(w) x.save(w) case Float32: typeFloat32.save(w) x.save(w) case Float64: typeFloat64.save(w) x.save(w) case Nil: typeNil.save(w) x.save(w) case *Ref: typeRef.save(w) x.save(w) case *String: typeString.save(w) x.save(w) case *Slice: typeSlice.save(w) x.save(w) case *Array: typeArray.save(w) x.save(w) case *Map: typeMap.save(w) x.save(w) case *Struct: typeStruct.save(w) x.save(w) case noObjects: typeNoObjects.save(w) x.save(w) case *multipleObjects: typeMultipleObjects.save(w) x.save(w) case *Interface: typeInterface.save(w) x.save(w) case *Type: typeType.save(w) x.save(w) case *Complex64: typeComplex64.save(w) x.save(w) case *Complex128: typeComplex128.save(w) x.save(w) default: panic(fmt.Errorf("unknown type: %#v", obj)) } } // Load loads a new object. // // +checkescape all // // N.B. This function will panic on error. func Load(r io.Reader) Object { switch hdr := loadUint(r); hdr { case typeBool: return loadBool(r) case typeInt: return loadInt(r) case typeUint: return loadUint(r) case typeFloat32: return loadFloat32(r) case typeFloat64: return loadFloat64(r) case typeNil: return loadNil(r) case typeRef: return ((*Ref)(nil)).load(r) // Escapes. case typeString: return ((*String)(nil)).load(r) // Escapes. case typeSlice: return ((*Slice)(nil)).load(r) // Escapes. case typeArray: return ((*Array)(nil)).load(r) // Escapes. case typeMap: return ((*Map)(nil)).load(r) // Escapes. case typeStruct: return ((*Struct)(nil)).load(r) // Escapes. case typeNoObjects: // Special for struct. return loadNoObjects(r) case typeMultipleObjects: // Special for struct. return ((*multipleObjects)(nil)).load(r) // Escapes. case typeInterface: return ((*Interface)(nil)).load(r) // Escapes. case typeComplex64: return ((*Complex64)(nil)).load(r) // Escapes. case typeComplex128: return ((*Complex128)(nil)).load(r) // Escapes. case typeType: return ((*Type)(nil)).load(r) // Escapes. default: // This is not a valid stream? panic(fmt.Errorf("unknown header: %d", hdr)) } } // LoadUint loads a single unsigned integer. // // N.B. This function will panic on error. func LoadUint(r io.Reader) uint64 { return uint64(loadUint(r)) } // SaveUint saves a single unsigned integer. // // N.B. This function will panic on error. func SaveUint(w io.Writer, v uint64) { Uint(v).save(w) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/000077500000000000000000000000001465435605700204005ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/aliases.go000066400000000000000000000012451465435605700223520ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package sync import ( "sync" ) // Aliases of standard library types. type ( // Cond is an alias of sync.Cond. Cond = sync.Cond // Locker is an alias of sync.Locker. Locker = sync.Locker // Once is an alias of sync.Once. Once = sync.Once // Pool is an alias of sync.Pool. 
Pool = sync.Pool // WaitGroup is an alias of sync.WaitGroup. WaitGroup = sync.WaitGroup // Map is an alias of sync.Map. Map = sync.Map ) // NewCond is a wrapper around sync.NewCond. func NewCond(l Locker) *Cond { return sync.NewCond(l) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/checklocks_off_unsafe.go000066400000000000000000000004571465435605700252410ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build !checklocks // +build !checklocks package sync import ( "unsafe" ) func noteLock(l unsafe.Pointer) { } func noteUnlock(l unsafe.Pointer) { } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/checklocks_on_unsafe.go000066400000000000000000000051471465435605700251040ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build checklocks // +build checklocks package sync import ( "fmt" "strings" "sync" "unsafe" "gvisor.dev/gvisor/pkg/goid" ) // gLocks contains metadata about the locks held by a goroutine. type gLocks struct { locksHeld []unsafe.Pointer } // map[goid int]*gLocks // // Each key may only be written by the G with the goid it refers to. // // Note that entries are not evicted when a G exit, causing unbounded growth // with new G creation / destruction. If this proves problematic, entries could // be evicted when no locks are held at the expense of more allocations when // taking top-level locks. var locksHeld sync.Map func getGLocks() *gLocks { id := goid.Get() var locks *gLocks if l, ok := locksHeld.Load(id); ok { locks = l.(*gLocks) } else { locks = &gLocks{ // Initialize space for a few locks. locksHeld: make([]unsafe.Pointer, 0, 8), } locksHeld.Store(id, locks) } return locks } func noteLock(l unsafe.Pointer) { locks := getGLocks() for _, lock := range locks.locksHeld { if lock == l { panic(fmt.Sprintf("Deadlock on goroutine %d! Double lock of %p: %+v", goid.Get(), l, locks)) } } // Commit only after checking for panic conditions so that this lock // isn't on the list if the above panic is recovered. locks.locksHeld = append(locks.locksHeld, l) } func noteUnlock(l unsafe.Pointer) { locks := getGLocks() if len(locks.locksHeld) == 0 { panic(fmt.Sprintf("Unlock of %p on goroutine %d without any locks held! All locks:\n%s", l, goid.Get(), dumpLocks())) } // Search backwards since callers are most likely to unlock in LIFO order. length := len(locks.locksHeld) for i := length - 1; i >= 0; i-- { if l == locks.locksHeld[i] { copy(locks.locksHeld[i:length-1], locks.locksHeld[i+1:length]) // Clear last entry to ensure addr can be GC'd. locks.locksHeld[length-1] = nil locks.locksHeld = locks.locksHeld[:length-1] return } } panic(fmt.Sprintf("Unlock of %p on goroutine %d without matching lock! All locks:\n%s", l, goid.Get(), dumpLocks())) } func dumpLocks() string { var s strings.Builder locksHeld.Range(func(key, value any) bool { goid := key.(int64) locks := value.(*gLocks) // N.B. accessing gLocks of another G is fundamentally racy. fmt.Fprintf(&s, "goroutine %d:\n", goid) if len(locks.locksHeld) == 0 { fmt.Fprintf(&s, "\t\n") } for _, lock := range locks.locksHeld { fmt.Fprintf(&s, "\t%p\n", lock) } fmt.Fprintf(&s, "\n") return true }) return s.String() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/fence.go000066400000000000000000000013401465435605700220050ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sync // MemoryFenceReads ensures that all preceding memory loads happen before // following memory loads. func MemoryFenceReads() golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/fence_amd64.s000066400000000000000000000017561465435605700226500ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 #include "textflag.h" // func MemoryFenceReads() TEXT ·MemoryFenceReads(SB),NOSPLIT|NOFRAME,$0-0 // No memory fence is required on x86. However, a compiler fence is // required to prevent the compiler from reordering memory accesses. The Go // compiler will not reorder memory accesses around a call to an assembly // function; compare runtime.publicationBarrier. RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/fence_arm64.s000066400000000000000000000013621465435605700226570ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 #include "textflag.h" // func MemoryFenceReads() TEXT ·MemoryFenceReads(SB),NOSPLIT|NOFRAME,$0-0 DMB $0x9 // ISHLD RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/gate_unsafe.go000066400000000000000000000107321465435605700232130ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
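// Illustrative sketch (not part of the original file): MemoryFenceReads is the
// load/load barrier a seqcount-style reader needs between its unsynchronized
// data loads and the re-read of the sequence word. Assuming this package's
// import path gvisor.dev/gvisor/pkg/sync, such a reader might look like:
//
//	var seq uint32 // kept even while stable; a writer makes it odd while mutating
//	var data int
//
//	func readConsistent() (int, bool) {
//		s1 := atomic.LoadUint32(&seq)
//		if s1%2 != 0 {
//			return 0, false // writer in progress
//		}
//		v := data // racy load, validated below
//		sync.MemoryFenceReads()
//		s2 := atomic.LoadUint32(&seq)
//		return v, s1 == s2
//	}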
package sync import ( "fmt" "math" "sync/atomic" "unsafe" "gvisor.dev/gvisor/pkg/gohacks" ) // Gate is a synchronization primitive that allows concurrent goroutines to // "enter" it as long as it hasn't been closed yet. Once it's been closed, // goroutines cannot enter it anymore, but are allowed to leave, and the closer // will be informed when all goroutines have left. // // Gate is similar to WaitGroup: // // - Gate.Enter() is analogous to WaitGroup.Add(1), but may be called even if // the Gate counter is 0 and fails if Gate.Close() has been called. // // - Gate.Leave() is equivalent to WaitGroup.Done(). // // - Gate.Close() is analogous to WaitGroup.Wait(), but also causes future // // calls to Gate.Enter() to fail and may only be called once, from a single // goroutine. // // This is useful, for example, in cases when a goroutine is trying to clean up // an object for which multiple goroutines have pointers. In such a case, users // would be required to enter and leave the Gate, and the cleaner would wait // until all users are gone (and no new ones are allowed) before proceeding. // // Users: // // if !g.Enter() { // // Gate is closed, we can't use the object. // return // } // // // Do something with object. // [...] // // g.Leave() // // Closer: // // // Prevent new users from using the object, and wait for the existing // // ones to complete. // g.Close() // // // Clean up the object. // [...] type Gate struct { userCount int32 closingG uintptr } const preparingG = 1 // Enter tries to enter the gate. It will succeed if it hasn't been closed yet, // in which case the caller must eventually call Leave(). // // This function is thread-safe. func (g *Gate) Enter() bool { if atomic.AddInt32(&g.userCount, 1) > 0 { return true } g.leaveAfterFailedEnter() return false } // leaveAfterFailedEnter is identical to Leave, but is marked noinline to // prevent it from being inlined into Enter, since as of this writing inlining // Leave into Enter prevents Enter from being inlined into its callers. // //go:noinline func (g *Gate) leaveAfterFailedEnter() { if atomic.AddInt32(&g.userCount, -1) == math.MinInt32 { g.leaveClosed() } } // Leave leaves the gate. This must only be called after a successful call to // Enter(). If the gate has been closed and this is the last one inside the // gate, it will notify the closer that the gate is done. // // This function is thread-safe. func (g *Gate) Leave() { if atomic.AddInt32(&g.userCount, -1) == math.MinInt32 { g.leaveClosed() } } func (g *Gate) leaveClosed() { if atomic.LoadUintptr(&g.closingG) == 0 { return } if g := atomic.SwapUintptr(&g.closingG, 0); g > preparingG { goready(g, 0) } } // Close closes the gate, causing future calls to Enter to fail, and waits // until all goroutines that are currently inside the gate leave before // returning. // // Only one goroutine can call this function. func (g *Gate) Close() { if atomic.LoadInt32(&g.userCount) == math.MinInt32 { // The gate is already closed, with no goroutines inside. For legacy // reasons, we have to allow Close to be called again in this case. return } if v := atomic.AddInt32(&g.userCount, math.MinInt32); v == math.MinInt32 { // userCount was already 0. return } else if v >= 0 { panic("concurrent Close of sync.Gate") } if g := atomic.SwapUintptr(&g.closingG, preparingG); g != 0 { panic(fmt.Sprintf("invalid sync.Gate.closingG during Close: %#x", g)) } if atomic.LoadInt32(&g.userCount) == math.MinInt32 { // The last call to Leave arrived while we were setting up closingG. 
return } // WaitReasonSemacquire/TraceBlockSync are consistent with WaitGroup. gopark(gateCommit, gohacks.Noescape(unsafe.Pointer(&g.closingG)), WaitReasonSemacquire, TraceBlockSync, 0) } //go:norace //go:nosplit func gateCommit(g uintptr, closingG unsafe.Pointer) bool { return RaceUncheckedAtomicCompareAndSwapUintptr((*uintptr)(closingG), preparingG, g) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/goyield_go113_unsafe.go000066400000000000000000000005001465435605700246310ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build go1.13 && !go1.14 // +build go1.13,!go1.14 package sync import ( "runtime" ) func goyield() { // goyield is not available until Go 1.14. runtime.Gosched() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/goyield_unsafe.go000066400000000000000000000007631465435605700237320ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build go1.14 // +build go1.14 // //go:linkname directives type-checked by checklinkname. Any other // non-linkname assumptions outside the Go 1 compatibility guarantee should // have an accompanied vet check or version guard build tag. package sync import ( _ "unsafe" // for go:linkname ) //go:linkname goyield runtime.goyield func goyield() golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/locking/000077500000000000000000000000001465435605700220265ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/locking/atomicptrmap_ancestors_unsafe.go000066400000000000000000000317341465435605700305070ustar00rootroot00000000000000package locking import ( "sync/atomic" "unsafe" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sync" ) const ( // ShardOrder is an optional parameter specifying the base-2 log of the // number of shards per AtomicPtrMap. Higher values of ShardOrder reduce // unnecessary synchronization between unrelated concurrent operations, // improving performance for write-heavy workloads, but increase memory // usage for small maps. ancestorsShardOrder = 0 ) // Hasher is an optional type parameter. If Hasher is provided, it must define // the Init and Hash methods. One Hasher will be shared by all AtomicPtrMaps. type ancestorsHasher struct { ancestorsdefaultHasher } // defaultHasher is the default Hasher. This indirection exists because // defaultHasher must exist even if a custom Hasher is provided, to prevent the // Go compiler from complaining about defaultHasher's unused imports. type ancestorsdefaultHasher struct { fn func(unsafe.Pointer, uintptr) uintptr seed uintptr } // Init initializes the Hasher. func (h *ancestorsdefaultHasher) Init() { h.fn = sync.MapKeyHasher(map[*MutexClass]*string(nil)) h.seed = sync.RandUintptr() } // Hash returns the hash value for the given Key. func (h *ancestorsdefaultHasher) Hash(key *MutexClass) uintptr { return h.fn(gohacks.Noescape(unsafe.Pointer(&key)), h.seed) } var ancestorshasher ancestorsHasher func init() { ancestorshasher.Init() } // An AtomicPtrMap maps Keys to non-nil pointers to Values. AtomicPtrMap are // safe for concurrent use from multiple goroutines without additional // synchronization. // // The zero value of AtomicPtrMap is empty (maps all Keys to nil) and ready for // use. AtomicPtrMaps must not be copied after first use. 
// // sync.Map may be faster than AtomicPtrMap if most operations on the map are // concurrent writes to a fixed set of keys. AtomicPtrMap is usually faster in // other circumstances. type ancestorsAtomicPtrMap struct { shards [1 << ancestorsShardOrder]ancestorsapmShard } func (m *ancestorsAtomicPtrMap) shard(hash uintptr) *ancestorsapmShard { // Go defines right shifts >= width of shifted unsigned operand as 0, so // this is correct even if ShardOrder is 0 (although nogo complains because // nogo is dumb). const indexLSB = unsafe.Sizeof(uintptr(0))*8 - ancestorsShardOrder index := hash >> indexLSB return (*ancestorsapmShard)(unsafe.Pointer(uintptr(unsafe.Pointer(&m.shards)) + (index * unsafe.Sizeof(ancestorsapmShard{})))) } type ancestorsapmShard struct { ancestorsapmShardMutationData _ [ancestorsapmShardMutationDataPadding]byte ancestorsapmShardLookupData _ [ancestorsapmShardLookupDataPadding]byte } type ancestorsapmShardMutationData struct { dirtyMu sync.Mutex // serializes slot transitions out of empty dirty uintptr // # slots with val != nil count uintptr // # slots with val != nil and val != tombstone() rehashMu sync.Mutex // serializes rehashing } type ancestorsapmShardLookupData struct { seq sync.SeqCount // allows atomic reads of slots+mask slots unsafe.Pointer // [mask+1]slot or nil; protected by rehashMu/seq mask uintptr // always (a power of 2) - 1; protected by rehashMu/seq } const ( ancestorscacheLineBytes = 64 // Cache line padding is enabled if sharding is. ancestorsapmEnablePadding = (ancestorsShardOrder + 63) >> 6 // 0 if ShardOrder == 0, 1 otherwise // The -1 and +1 below are required to ensure that if unsafe.Sizeof(T) % // cacheLineBytes == 0, then padding is 0 (rather than cacheLineBytes). ancestorsapmShardMutationDataRequiredPadding = ancestorscacheLineBytes - (((unsafe.Sizeof(ancestorsapmShardMutationData{}) - 1) % ancestorscacheLineBytes) + 1) ancestorsapmShardMutationDataPadding = ancestorsapmEnablePadding * ancestorsapmShardMutationDataRequiredPadding ancestorsapmShardLookupDataRequiredPadding = ancestorscacheLineBytes - (((unsafe.Sizeof(ancestorsapmShardLookupData{}) - 1) % ancestorscacheLineBytes) + 1) ancestorsapmShardLookupDataPadding = ancestorsapmEnablePadding * ancestorsapmShardLookupDataRequiredPadding // These define fractional thresholds for when apmShard.rehash() is called // (i.e. the load factor) and when it rehases to a larger table // respectively. They are chosen such that the rehash threshold = the // expansion threshold + 1/2, so that when reuse of deleted slots is rare // or non-existent, rehashing occurs after the insertion of at least 1/2 // the table's size in new entries, which is acceptably infrequent. ancestorsapmRehashThresholdNum = 2 ancestorsapmRehashThresholdDen = 3 ancestorsapmExpansionThresholdNum = 1 ancestorsapmExpansionThresholdDen = 6 ) type ancestorsapmSlot struct { // slot states are indicated by val: // // * Empty: val == nil; key is meaningless. May transition to full or // evacuated with dirtyMu locked. // // * Full: val != nil, tombstone(), or evacuated(); key is immutable. val // is the Value mapped to key. May transition to deleted or evacuated. // // * Deleted: val == tombstone(); key is still immutable. key is mapped to // no Value. May transition to full or evacuated. // // * Evacuated: val == evacuated(); key is immutable. Set by rehashing on // slots that have already been moved, requiring readers to wait for // rehashing to complete and use the new table. Terminal state. 
// // Note that once val is non-nil, it cannot become nil again. That is, the // transition from empty to non-empty is irreversible for a given slot; // the only way to create more empty slots is by rehashing. val unsafe.Pointer key *MutexClass } func ancestorsapmSlotAt(slots unsafe.Pointer, pos uintptr) *ancestorsapmSlot { return (*ancestorsapmSlot)(unsafe.Pointer(uintptr(slots) + pos*unsafe.Sizeof(ancestorsapmSlot{}))) } var ancestorstombstoneObj byte func ancestorstombstone() unsafe.Pointer { return unsafe.Pointer(&ancestorstombstoneObj) } var ancestorsevacuatedObj byte func ancestorsevacuated() unsafe.Pointer { return unsafe.Pointer(&ancestorsevacuatedObj) } // Load returns the Value stored in m for key. func (m *ancestorsAtomicPtrMap) Load(key *MutexClass) *string { hash := ancestorshasher.Hash(key) shard := m.shard(hash) retry: epoch := shard.seq.BeginRead() slots := atomic.LoadPointer(&shard.slots) mask := atomic.LoadUintptr(&shard.mask) if !shard.seq.ReadOk(epoch) { goto retry } if slots == nil { return nil } i := hash & mask inc := uintptr(1) for { slot := ancestorsapmSlotAt(slots, i) slotVal := atomic.LoadPointer(&slot.val) if slotVal == nil { return nil } if slotVal == ancestorsevacuated() { goto retry } if slot.key == key { if slotVal == ancestorstombstone() { return nil } return (*string)(slotVal) } i = (i + inc) & mask inc++ } } // Store stores the Value val for key. func (m *ancestorsAtomicPtrMap) Store(key *MutexClass, val *string) { m.maybeCompareAndSwap(key, false, nil, val) } // Swap stores the Value val for key and returns the previously-mapped Value. func (m *ancestorsAtomicPtrMap) Swap(key *MutexClass, val *string) *string { return m.maybeCompareAndSwap(key, false, nil, val) } // CompareAndSwap checks that the Value stored for key is oldVal; if it is, it // stores the Value newVal for key. CompareAndSwap returns the previous Value // stored for key, whether or not it stores newVal. 
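// Illustrative note (not part of the original file): the probe loops here
// advance with a growing step, i = (i + inc) & mask followed by inc++, so a
// lookup that hashes to slot h visits offsets 0, 1, 3, 6, 10, ... (triangular
// numbers). For a power-of-two table this sequence visits every slot exactly
// once before repeating.
//
//	// hypothetical trace for mask = 7 (8 slots), h = 2:
//	// probe order: 2, 3, 5, 0, 4, 1, 7, 6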
func (m *ancestorsAtomicPtrMap) CompareAndSwap(key *MutexClass, oldVal, newVal *string) *string { return m.maybeCompareAndSwap(key, true, oldVal, newVal) } func (m *ancestorsAtomicPtrMap) maybeCompareAndSwap(key *MutexClass, compare bool, typedOldVal, typedNewVal *string) *string { hash := ancestorshasher.Hash(key) shard := m.shard(hash) oldVal := ancestorstombstone() if typedOldVal != nil { oldVal = unsafe.Pointer(typedOldVal) } newVal := ancestorstombstone() if typedNewVal != nil { newVal = unsafe.Pointer(typedNewVal) } retry: epoch := shard.seq.BeginRead() slots := atomic.LoadPointer(&shard.slots) mask := atomic.LoadUintptr(&shard.mask) if !shard.seq.ReadOk(epoch) { goto retry } if slots == nil { if (compare && oldVal != ancestorstombstone()) || newVal == ancestorstombstone() { return nil } shard.rehash(nil) goto retry } i := hash & mask inc := uintptr(1) for { slot := ancestorsapmSlotAt(slots, i) slotVal := atomic.LoadPointer(&slot.val) if slotVal == nil { if (compare && oldVal != ancestorstombstone()) || newVal == ancestorstombstone() { return nil } shard.dirtyMu.Lock() slotVal = atomic.LoadPointer(&slot.val) if slotVal == nil { if dirty, capacity := shard.dirty+1, mask+1; dirty*ancestorsapmRehashThresholdDen >= capacity*ancestorsapmRehashThresholdNum { shard.dirtyMu.Unlock() shard.rehash(slots) goto retry } slot.key = key atomic.StorePointer(&slot.val, newVal) shard.dirty++ atomic.AddUintptr(&shard.count, 1) shard.dirtyMu.Unlock() return nil } shard.dirtyMu.Unlock() } if slotVal == ancestorsevacuated() { goto retry } if slot.key == key { for { if (compare && oldVal != slotVal) || newVal == slotVal { if slotVal == ancestorstombstone() { return nil } return (*string)(slotVal) } if atomic.CompareAndSwapPointer(&slot.val, slotVal, newVal) { if slotVal == ancestorstombstone() { atomic.AddUintptr(&shard.count, 1) return nil } if newVal == ancestorstombstone() { atomic.AddUintptr(&shard.count, ^uintptr(0)) } return (*string)(slotVal) } slotVal = atomic.LoadPointer(&slot.val) if slotVal == ancestorsevacuated() { goto retry } } } i = (i + inc) & mask inc++ } } // rehash is marked nosplit to avoid preemption during table copying. // //go:nosplit func (shard *ancestorsapmShard) rehash(oldSlots unsafe.Pointer) { shard.rehashMu.Lock() defer shard.rehashMu.Unlock() if shard.slots != oldSlots { return } newSize := uintptr(8) if oldSlots != nil { oldSize := shard.mask + 1 newSize = oldSize if count := atomic.LoadUintptr(&shard.count) + 1; count*ancestorsapmExpansionThresholdDen > oldSize*ancestorsapmExpansionThresholdNum { newSize *= 2 } } newSlotsSlice := make([]ancestorsapmSlot, newSize) newSlots := unsafe.Pointer(&newSlotsSlice[0]) newMask := newSize - 1 shard.dirtyMu.Lock() shard.seq.BeginWrite() if oldSlots != nil { realCount := uintptr(0) oldMask := shard.mask for i := uintptr(0); i <= oldMask; i++ { oldSlot := ancestorsapmSlotAt(oldSlots, i) val := atomic.SwapPointer(&oldSlot.val, ancestorsevacuated()) if val == nil || val == ancestorstombstone() { continue } hash := ancestorshasher.Hash(oldSlot.key) j := hash & newMask inc := uintptr(1) for { newSlot := ancestorsapmSlotAt(newSlots, j) if newSlot.val == nil { newSlot.val = val newSlot.key = oldSlot.key break } j = (j + inc) & newMask inc++ } realCount++ } shard.dirty = realCount } atomic.StorePointer(&shard.slots, newSlots) atomic.StoreUintptr(&shard.mask, newMask) shard.seq.EndWrite() shard.dirtyMu.Unlock() } // Range invokes f on each Key-Value pair stored in m. If any call to f returns // false, Range stops iteration and returns. 
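// Illustrative sketch (not part of the original file, written as if from
// inside package locking since this generated type is unexported): the map
// associates a stack trace with each ordering edge keyed by *MutexClass, and
// storing nil removes an entry.
//
//	var m ancestorsAtomicPtrMap
//	k := &MutexClass{}     // hypothetical key
//	stack := "acquired at ..."
//	if prev := m.CompareAndSwap(k, nil, &stack); prev == nil {
//		// stored: k previously had no value
//	}
//	if v := m.Load(k); v != nil {
//		_ = *v
//	}
//	m.Store(k, nil) // delete the entry for k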
// // Range does not necessarily correspond to any consistent snapshot of the // Map's contents: no Key will be visited more than once, but if the Value for // any Key is stored or deleted concurrently, Range may reflect any mapping for // that Key from any point during the Range call. // // f must not call other methods on m. func (m *ancestorsAtomicPtrMap) Range(f func(key *MutexClass, val *string) bool) { for si := 0; si < len(m.shards); si++ { shard := &m.shards[si] if !shard.doRange(f) { return } } } func (shard *ancestorsapmShard) doRange(f func(key *MutexClass, val *string) bool) bool { shard.rehashMu.Lock() defer shard.rehashMu.Unlock() slots := shard.slots if slots == nil { return true } mask := shard.mask for i := uintptr(0); i <= mask; i++ { slot := ancestorsapmSlotAt(slots, i) slotVal := atomic.LoadPointer(&slot.val) if slotVal == nil || slotVal == ancestorstombstone() { continue } if !f(slot.key, (*string)(slotVal)) { return false } } return true } // RangeRepeatable is like Range, but: // // - RangeRepeatable may visit the same Key multiple times in the presence of // concurrent mutators, possibly passing different Values to f in different // calls. // // - It is safe for f to call other methods on m. func (m *ancestorsAtomicPtrMap) RangeRepeatable(f func(key *MutexClass, val *string) bool) { for si := 0; si < len(m.shards); si++ { shard := &m.shards[si] retry: epoch := shard.seq.BeginRead() slots := atomic.LoadPointer(&shard.slots) mask := atomic.LoadUintptr(&shard.mask) if !shard.seq.ReadOk(epoch) { goto retry } if slots == nil { continue } for i := uintptr(0); i <= mask; i++ { slot := ancestorsapmSlotAt(slots, i) slotVal := atomic.LoadPointer(&slot.val) if slotVal == ancestorsevacuated() { goto retry } if slotVal == nil || slotVal == ancestorstombstone() { continue } if !f(slot.key, (*string)(slotVal)) { return } } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/locking/atomicptrmap_goroutine_unsafe.go000066400000000000000000000330071465435605700305140ustar00rootroot00000000000000package locking import ( "sync/atomic" "unsafe" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sync" ) const ( // ShardOrder is an optional parameter specifying the base-2 log of the // number of shards per AtomicPtrMap. Higher values of ShardOrder reduce // unnecessary synchronization between unrelated concurrent operations, // improving performance for write-heavy workloads, but increase memory // usage for small maps. goroutineLocksShardOrder = 0 ) // Hasher is an optional type parameter. If Hasher is provided, it must define // the Init and Hash methods. One Hasher will be shared by all AtomicPtrMaps. type goroutineLocksHasher struct { goroutineLocksdefaultHasher } // defaultHasher is the default Hasher. This indirection exists because // defaultHasher must exist even if a custom Hasher is provided, to prevent the // Go compiler from complaining about defaultHasher's unused imports. type goroutineLocksdefaultHasher struct { fn func(unsafe.Pointer, uintptr) uintptr seed uintptr } // Init initializes the Hasher. func (h *goroutineLocksdefaultHasher) Init() { h.fn = sync.MapKeyHasher(map[int64]*goroutineLocks(nil)) h.seed = sync.RandUintptr() } // Hash returns the hash value for the given Key. func (h *goroutineLocksdefaultHasher) Hash(key int64) uintptr { return h.fn(gohacks.Noescape(unsafe.Pointer(&key)), h.seed) } var goroutineLockshasher goroutineLocksHasher func init() { goroutineLockshasher.Init() } // An AtomicPtrMap maps Keys to non-nil pointers to Values. 
AtomicPtrMap are // safe for concurrent use from multiple goroutines without additional // synchronization. // // The zero value of AtomicPtrMap is empty (maps all Keys to nil) and ready for // use. AtomicPtrMaps must not be copied after first use. // // sync.Map may be faster than AtomicPtrMap if most operations on the map are // concurrent writes to a fixed set of keys. AtomicPtrMap is usually faster in // other circumstances. type goroutineLocksAtomicPtrMap struct { shards [1 << goroutineLocksShardOrder]goroutineLocksapmShard } func (m *goroutineLocksAtomicPtrMap) shard(hash uintptr) *goroutineLocksapmShard { // Go defines right shifts >= width of shifted unsigned operand as 0, so // this is correct even if ShardOrder is 0 (although nogo complains because // nogo is dumb). const indexLSB = unsafe.Sizeof(uintptr(0))*8 - goroutineLocksShardOrder index := hash >> indexLSB return (*goroutineLocksapmShard)(unsafe.Pointer(uintptr(unsafe.Pointer(&m.shards)) + (index * unsafe.Sizeof(goroutineLocksapmShard{})))) } type goroutineLocksapmShard struct { goroutineLocksapmShardMutationData _ [goroutineLocksapmShardMutationDataPadding]byte goroutineLocksapmShardLookupData _ [goroutineLocksapmShardLookupDataPadding]byte } type goroutineLocksapmShardMutationData struct { dirtyMu sync.Mutex // serializes slot transitions out of empty dirty uintptr // # slots with val != nil count uintptr // # slots with val != nil and val != tombstone() rehashMu sync.Mutex // serializes rehashing } type goroutineLocksapmShardLookupData struct { seq sync.SeqCount // allows atomic reads of slots+mask slots unsafe.Pointer // [mask+1]slot or nil; protected by rehashMu/seq mask uintptr // always (a power of 2) - 1; protected by rehashMu/seq } const ( goroutineLockscacheLineBytes = 64 // Cache line padding is enabled if sharding is. goroutineLocksapmEnablePadding = (goroutineLocksShardOrder + 63) >> 6 // 0 if ShardOrder == 0, 1 otherwise // The -1 and +1 below are required to ensure that if unsafe.Sizeof(T) % // cacheLineBytes == 0, then padding is 0 (rather than cacheLineBytes). goroutineLocksapmShardMutationDataRequiredPadding = goroutineLockscacheLineBytes - (((unsafe.Sizeof(goroutineLocksapmShardMutationData{}) - 1) % goroutineLockscacheLineBytes) + 1) goroutineLocksapmShardMutationDataPadding = goroutineLocksapmEnablePadding * goroutineLocksapmShardMutationDataRequiredPadding goroutineLocksapmShardLookupDataRequiredPadding = goroutineLockscacheLineBytes - (((unsafe.Sizeof(goroutineLocksapmShardLookupData{}) - 1) % goroutineLockscacheLineBytes) + 1) goroutineLocksapmShardLookupDataPadding = goroutineLocksapmEnablePadding * goroutineLocksapmShardLookupDataRequiredPadding // These define fractional thresholds for when apmShard.rehash() is called // (i.e. the load factor) and when it rehases to a larger table // respectively. They are chosen such that the rehash threshold = the // expansion threshold + 1/2, so that when reuse of deleted slots is rare // or non-existent, rehashing occurs after the insertion of at least 1/2 // the table's size in new entries, which is acceptably infrequent. goroutineLocksapmRehashThresholdNum = 2 goroutineLocksapmRehashThresholdDen = 3 goroutineLocksapmExpansionThresholdNum = 1 goroutineLocksapmExpansionThresholdDen = 6 ) type goroutineLocksapmSlot struct { // slot states are indicated by val: // // * Empty: val == nil; key is meaningless. May transition to full or // evacuated with dirtyMu locked. // // * Full: val != nil, tombstone(), or evacuated(); key is immutable. 
val // is the Value mapped to key. May transition to deleted or evacuated. // // * Deleted: val == tombstone(); key is still immutable. key is mapped to // no Value. May transition to full or evacuated. // // * Evacuated: val == evacuated(); key is immutable. Set by rehashing on // slots that have already been moved, requiring readers to wait for // rehashing to complete and use the new table. Terminal state. // // Note that once val is non-nil, it cannot become nil again. That is, the // transition from empty to non-empty is irreversible for a given slot; // the only way to create more empty slots is by rehashing. val unsafe.Pointer key int64 } func goroutineLocksapmSlotAt(slots unsafe.Pointer, pos uintptr) *goroutineLocksapmSlot { return (*goroutineLocksapmSlot)(unsafe.Pointer(uintptr(slots) + pos*unsafe.Sizeof(goroutineLocksapmSlot{}))) } var goroutineLockstombstoneObj byte func goroutineLockstombstone() unsafe.Pointer { return unsafe.Pointer(&goroutineLockstombstoneObj) } var goroutineLocksevacuatedObj byte func goroutineLocksevacuated() unsafe.Pointer { return unsafe.Pointer(&goroutineLocksevacuatedObj) } // Load returns the Value stored in m for key. func (m *goroutineLocksAtomicPtrMap) Load(key int64) *goroutineLocks { hash := goroutineLockshasher.Hash(key) shard := m.shard(hash) retry: epoch := shard.seq.BeginRead() slots := atomic.LoadPointer(&shard.slots) mask := atomic.LoadUintptr(&shard.mask) if !shard.seq.ReadOk(epoch) { goto retry } if slots == nil { return nil } i := hash & mask inc := uintptr(1) for { slot := goroutineLocksapmSlotAt(slots, i) slotVal := atomic.LoadPointer(&slot.val) if slotVal == nil { return nil } if slotVal == goroutineLocksevacuated() { goto retry } if slot.key == key { if slotVal == goroutineLockstombstone() { return nil } return (*goroutineLocks)(slotVal) } i = (i + inc) & mask inc++ } } // Store stores the Value val for key. func (m *goroutineLocksAtomicPtrMap) Store(key int64, val *goroutineLocks) { m.maybeCompareAndSwap(key, false, nil, val) } // Swap stores the Value val for key and returns the previously-mapped Value. func (m *goroutineLocksAtomicPtrMap) Swap(key int64, val *goroutineLocks) *goroutineLocks { return m.maybeCompareAndSwap(key, false, nil, val) } // CompareAndSwap checks that the Value stored for key is oldVal; if it is, it // stores the Value newVal for key. CompareAndSwap returns the previous Value // stored for key, whether or not it stores newVal. 
func (m *goroutineLocksAtomicPtrMap) CompareAndSwap(key int64, oldVal, newVal *goroutineLocks) *goroutineLocks { return m.maybeCompareAndSwap(key, true, oldVal, newVal) } func (m *goroutineLocksAtomicPtrMap) maybeCompareAndSwap(key int64, compare bool, typedOldVal, typedNewVal *goroutineLocks) *goroutineLocks { hash := goroutineLockshasher.Hash(key) shard := m.shard(hash) oldVal := goroutineLockstombstone() if typedOldVal != nil { oldVal = unsafe.Pointer(typedOldVal) } newVal := goroutineLockstombstone() if typedNewVal != nil { newVal = unsafe.Pointer(typedNewVal) } retry: epoch := shard.seq.BeginRead() slots := atomic.LoadPointer(&shard.slots) mask := atomic.LoadUintptr(&shard.mask) if !shard.seq.ReadOk(epoch) { goto retry } if slots == nil { if (compare && oldVal != goroutineLockstombstone()) || newVal == goroutineLockstombstone() { return nil } shard.rehash(nil) goto retry } i := hash & mask inc := uintptr(1) for { slot := goroutineLocksapmSlotAt(slots, i) slotVal := atomic.LoadPointer(&slot.val) if slotVal == nil { if (compare && oldVal != goroutineLockstombstone()) || newVal == goroutineLockstombstone() { return nil } shard.dirtyMu.Lock() slotVal = atomic.LoadPointer(&slot.val) if slotVal == nil { if dirty, capacity := shard.dirty+1, mask+1; dirty*goroutineLocksapmRehashThresholdDen >= capacity*goroutineLocksapmRehashThresholdNum { shard.dirtyMu.Unlock() shard.rehash(slots) goto retry } slot.key = key atomic.StorePointer(&slot.val, newVal) shard.dirty++ atomic.AddUintptr(&shard.count, 1) shard.dirtyMu.Unlock() return nil } shard.dirtyMu.Unlock() } if slotVal == goroutineLocksevacuated() { goto retry } if slot.key == key { for { if (compare && oldVal != slotVal) || newVal == slotVal { if slotVal == goroutineLockstombstone() { return nil } return (*goroutineLocks)(slotVal) } if atomic.CompareAndSwapPointer(&slot.val, slotVal, newVal) { if slotVal == goroutineLockstombstone() { atomic.AddUintptr(&shard.count, 1) return nil } if newVal == goroutineLockstombstone() { atomic.AddUintptr(&shard.count, ^uintptr(0)) } return (*goroutineLocks)(slotVal) } slotVal = atomic.LoadPointer(&slot.val) if slotVal == goroutineLocksevacuated() { goto retry } } } i = (i + inc) & mask inc++ } } // rehash is marked nosplit to avoid preemption during table copying. 
// //go:nosplit func (shard *goroutineLocksapmShard) rehash(oldSlots unsafe.Pointer) { shard.rehashMu.Lock() defer shard.rehashMu.Unlock() if shard.slots != oldSlots { return } newSize := uintptr(8) if oldSlots != nil { oldSize := shard.mask + 1 newSize = oldSize if count := atomic.LoadUintptr(&shard.count) + 1; count*goroutineLocksapmExpansionThresholdDen > oldSize*goroutineLocksapmExpansionThresholdNum { newSize *= 2 } } newSlotsSlice := make([]goroutineLocksapmSlot, newSize) newSlots := unsafe.Pointer(&newSlotsSlice[0]) newMask := newSize - 1 shard.dirtyMu.Lock() shard.seq.BeginWrite() if oldSlots != nil { realCount := uintptr(0) oldMask := shard.mask for i := uintptr(0); i <= oldMask; i++ { oldSlot := goroutineLocksapmSlotAt(oldSlots, i) val := atomic.SwapPointer(&oldSlot.val, goroutineLocksevacuated()) if val == nil || val == goroutineLockstombstone() { continue } hash := goroutineLockshasher.Hash(oldSlot.key) j := hash & newMask inc := uintptr(1) for { newSlot := goroutineLocksapmSlotAt(newSlots, j) if newSlot.val == nil { newSlot.val = val newSlot.key = oldSlot.key break } j = (j + inc) & newMask inc++ } realCount++ } shard.dirty = realCount } atomic.StorePointer(&shard.slots, newSlots) atomic.StoreUintptr(&shard.mask, newMask) shard.seq.EndWrite() shard.dirtyMu.Unlock() } // Range invokes f on each Key-Value pair stored in m. If any call to f returns // false, Range stops iteration and returns. // // Range does not necessarily correspond to any consistent snapshot of the // Map's contents: no Key will be visited more than once, but if the Value for // any Key is stored or deleted concurrently, Range may reflect any mapping for // that Key from any point during the Range call. // // f must not call other methods on m. func (m *goroutineLocksAtomicPtrMap) Range(f func(key int64, val *goroutineLocks) bool) { for si := 0; si < len(m.shards); si++ { shard := &m.shards[si] if !shard.doRange(f) { return } } } func (shard *goroutineLocksapmShard) doRange(f func(key int64, val *goroutineLocks) bool) bool { shard.rehashMu.Lock() defer shard.rehashMu.Unlock() slots := shard.slots if slots == nil { return true } mask := shard.mask for i := uintptr(0); i <= mask; i++ { slot := goroutineLocksapmSlotAt(slots, i) slotVal := atomic.LoadPointer(&slot.val) if slotVal == nil || slotVal == goroutineLockstombstone() { continue } if !f(slot.key, (*goroutineLocks)(slotVal)) { return false } } return true } // RangeRepeatable is like Range, but: // // - RangeRepeatable may visit the same Key multiple times in the presence of // concurrent mutators, possibly passing different Values to f in different // calls. // // - It is safe for f to call other methods on m. func (m *goroutineLocksAtomicPtrMap) RangeRepeatable(f func(key int64, val *goroutineLocks) bool) { for si := 0; si < len(m.shards); si++ { shard := &m.shards[si] retry: epoch := shard.seq.BeginRead() slots := atomic.LoadPointer(&shard.slots) mask := atomic.LoadUintptr(&shard.mask) if !shard.seq.ReadOk(epoch) { goto retry } if slots == nil { continue } for i := uintptr(0); i <= mask; i++ { slot := goroutineLocksapmSlotAt(slots, i) slotVal := atomic.LoadPointer(&slot.val) if slotVal == goroutineLocksevacuated() { goto retry } if slotVal == nil || slotVal == goroutineLockstombstone() { continue } if !f(slot.key, (*goroutineLocks)(slotVal)) { return } } } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/locking/lockdep.go000066400000000000000000000136671465435605700240130ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build lockdep // +build lockdep package locking import ( "fmt" "reflect" "strings" "gvisor.dev/gvisor/pkg/goid" "gvisor.dev/gvisor/pkg/log" ) // NewMutexClass allocates a new mutex class. func NewMutexClass(t reflect.Type, lockNames []string) *MutexClass { c := &MutexClass{ typ: t, nestedLockNames: lockNames, nestedLockClasses: make([]*MutexClass, len(lockNames)), } for i := range lockNames { c.nestedLockClasses[i] = NewMutexClass(t, nil) c.nestedLockClasses[i].lockName = lockNames[i] } return c } // MutexClass describes dependencies of a specific class. type MutexClass struct { // The type of the mutex. typ reflect.Type // Name of the nested lock of the above type. lockName string // ancestors are locks that are locked before the current class. ancestors ancestorsAtomicPtrMap // nestedLockNames is a list of names for nested locks which are considered difference instances // of the same lock class. nestedLockNames []string // namedLockClasses is a list of MutexClass instances of the same mutex class, but that are // considered OK to lock simultaneously with each other, as well as with this mutex class. // This is used for nested locking, where multiple instances of the same lock class are used // simultaneously. // Maps one-to-one with nestedLockNames. nestedLockClasses []*MutexClass } func (m *MutexClass) String() string { if m.lockName == "" { return m.typ.String() } return fmt.Sprintf("%s[%s]", m.typ.String(), m.lockName) } type goroutineLocks map[*MutexClass]bool var routineLocks goroutineLocksAtomicPtrMap // maxChainLen is the maximum length of a lock chain. const maxChainLen = 32 // checkLock checks that class isn't in the ancestors of prevClass. func checkLock(class *MutexClass, prevClass *MutexClass, chain []*MutexClass) { chain = append(chain, prevClass) if len(chain) >= maxChainLen { // It can be a race condition with another thread that added // the lock to the graph but don't complete the validation. var b strings.Builder fmt.Fprintf(&b, "WARNING: The maximum lock depth has been reached: %s", chain[0]) for i := 1; i < len(chain); i++ { fmt.Fprintf(&b, "-> %s", chain[i]) } log.Warningf("%s", b.String()) return } if c := prevClass.ancestors.Load(class); c != nil { var b strings.Builder fmt.Fprintf(&b, "WARNING: circular locking detected: %s -> %s:\n%s\n", chain[0], class, log.LocalStack(3)) fmt.Fprintf(&b, "known lock chain: ") c := class for i := len(chain) - 1; i >= 0; i-- { fmt.Fprintf(&b, "%s -> ", c) c = chain[i] } fmt.Fprintf(&b, "%s\n", chain[0]) c = class for i := len(chain) - 1; i >= 0; i-- { fmt.Fprintf(&b, "\n====== %s -> %s =====\n%s", c, chain[i], *chain[i].ancestors.Load(c)) c = chain[i] } panic(b.String()) } prevClass.ancestors.RangeRepeatable(func(parentClass *MutexClass, stacks *string) bool { // The recursion is fine here. If it fails, you need to reduce // a number of nested locks. checkLock(class, parentClass, chain) return true }) } // AddGLock records a lock to the current goroutine and updates dependencies. 
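// Illustrative scenario (class names A and B are for exposition only): a
// goroutine that locks A and then B makes AddGLock record A as an ancestor of
// B. If any goroutine later holds B and tries to lock A, checkLock finds A in
// B's ancestors and panics, printing both the current stack and the stack
// captured when the A -> B edge was first recorded. Because checkLock walks
// ancestors recursively, longer cycles (A -> B -> C -> A) are caught the same
// way.
//
//	// hypothetical ordering violation under the lockdep build tag:
//	aMu.Lock(); bMu.Lock(); bMu.Unlock(); aMu.Unlock() // records A -> B
//	bMu.Lock(); aMu.Lock()                             // panics: circular locking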
func AddGLock(class *MutexClass, lockNameIndex int) { gid := goid.Get() if lockNameIndex != -1 { class = class.nestedLockClasses[lockNameIndex] } currentLocks := routineLocks.Load(gid) if currentLocks == nil { locks := goroutineLocks(make(map[*MutexClass]bool)) locks[class] = true routineLocks.Store(gid, &locks) return } if (*currentLocks)[class] { panic(fmt.Sprintf("nested locking: %s:\n%s", class, log.LocalStack(2))) } (*currentLocks)[class] = true // Check dependencies and add locked mutexes to the ancestors list. for prevClass := range *currentLocks { if prevClass == class { continue } checkLock(class, prevClass, nil) if c := class.ancestors.Load(prevClass); c == nil { stacks := string(log.LocalStack(2)) class.ancestors.Store(prevClass, &stacks) } } } // DelGLock deletes a lock from the current goroutine. func DelGLock(class *MutexClass, lockNameIndex int) { if lockNameIndex != -1 { class = class.nestedLockClasses[lockNameIndex] } gid := goid.Get() currentLocks := routineLocks.Load(gid) if currentLocks == nil { panic("the current goroutine doesn't have locks") } if _, ok := (*currentLocks)[class]; !ok { var b strings.Builder fmt.Fprintf(&b, "Lock not held: %s:\n", class) fmt.Fprintf(&b, "Current stack:\n%s\n", string(log.LocalStack(2))) fmt.Fprintf(&b, "Current locks:\n") for c := range *currentLocks { heldToClass := class.ancestors.Load(c) classToHeld := c.ancestors.Load(class) if heldToClass == nil && classToHeld == nil { fmt.Fprintf(&b, "\t- Holding lock: %s (no dependency to/from %s found)\n", c, class) } else if heldToClass != nil && classToHeld != nil { fmt.Fprintf(&b, "\t- Holding lock: %s (mutual dependency with %s found, this should never happen)\n", c, class) } else if heldToClass != nil && classToHeld == nil { fmt.Fprintf(&b, "\t- Holding lock: %s (dependency: %s -> %s)\n", c, c, class) fmt.Fprintf(&b, "%s\n\n", *heldToClass) } else if heldToClass == nil && classToHeld != nil { fmt.Fprintf(&b, "\t- Holding lock: %s (dependency: %s -> %s)\n", c, class, c) fmt.Fprintf(&b, "%s\n\n", *classToHeld) } } fmt.Fprintf(&b, "** End of locks held **\n") panic(b.String()) } delete(*currentLocks, class) if len(*currentLocks) == 0 { routineLocks.Store(gid, nil) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/locking/lockdep_norace.go000066400000000000000000000021341465435605700253250ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !lockdep // +build !lockdep package locking import ( "reflect" ) type goroutineLocks map[*MutexClass]bool // MutexClass is a stub class without the lockdep tag. type MutexClass struct{} // NewMutexClass is no-op without the lockdep tag. func NewMutexClass(reflect.Type, []string) *MutexClass { return nil } // AddGLock is no-op without the lockdep tag. // //go:inline func AddGLock(*MutexClass, int) {} // DelGLock is no-op without the lockdep tag. 
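// Illustrative sketch (hypothetical wrapper, not generated from this file):
// instrumented mutex types pair AddGLock/DelGLock with the underlying lock so
// that, under the lockdep build tag, each acquisition is recorded against the
// calling goroutine. The class variable and the -1 index ("no nested lock
// name") are assumptions for illustration.
//
//	var fooClass = locking.NewMutexClass(reflect.TypeOf(fooMutex{}), nil)
//
//	type fooMutex struct{ mu sync.Mutex }
//
//	func (m *fooMutex) Lock() {
//		locking.AddGLock(fooClass, -1)
//		m.mu.Lock()
//	}
//
//	func (m *fooMutex) Unlock() {
//		m.mu.Unlock()
//		locking.DelGLock(fooClass, -1)
//	}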
// //go:inline func DelGLock(*MutexClass, int) {} golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/locking/locking.go000066400000000000000000000025221465435605700240040ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package locking implements lock primitives with the correctness validator. // // All mutexes are divided on classes and the validator check following conditions: // - Mutexes of the same class are not taken more than once except cases when // that is expected. // - Mutexes are never locked in a reverse order. Lock dependencies are tracked // on the class level. // // The validator is implemented in a very straightforward way. For each mutex // class, we maintain the ancestors list of all classes that have ever been // taken before the target one. For each goroutine, we have the list of // currently locked mutexes. And finally, all lock methods check that // ancestors of currently locked mutexes don't contain the target one. package locking golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/mutex_unsafe.go000066400000000000000000000035071465435605700234370ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package sync import ( "sync" "unsafe" ) // CrossGoroutineMutex is equivalent to Mutex, but it need not be unlocked by a // the same goroutine that locked the mutex. type CrossGoroutineMutex struct { m sync.Mutex } // Lock locks the underlying Mutex. // +checklocksignore func (m *CrossGoroutineMutex) Lock() { m.m.Lock() } // Unlock unlocks the underlying Mutex. // +checklocksignore func (m *CrossGoroutineMutex) Unlock() { m.m.Unlock() } // TryLock tries to acquire the mutex. It returns true if it succeeds and false // otherwise. TryLock does not block. func (m *CrossGoroutineMutex) TryLock() bool { return m.m.TryLock() } // Mutex is a mutual exclusion lock. The zero value for a Mutex is an unlocked // mutex. // // A Mutex must not be copied after first use. // // A Mutex must be unlocked by the same goroutine that locked it. This // invariant is enforced with the 'checklocks' build tag. type Mutex struct { m CrossGoroutineMutex } // Lock locks m. If the lock is already in use, the calling goroutine blocks // until the mutex is available. // +checklocksignore func (m *Mutex) Lock() { noteLock(unsafe.Pointer(m)) m.m.Lock() } // Unlock unlocks m. // // Preconditions: // - m is locked. // - m was locked by this goroutine. // // +checklocksignore func (m *Mutex) Unlock() { noteUnlock(unsafe.Pointer(m)) m.m.Unlock() } // TryLock tries to acquire the mutex. It returns true if it succeeds and false // otherwise. TryLock does not block. // +checklocksignore func (m *Mutex) TryLock() bool { // Note lock first to enforce proper locking even if unsuccessful. 
noteLock(unsafe.Pointer(m)) locked := m.m.TryLock() if !locked { noteUnlock(unsafe.Pointer(m)) } return locked } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/nocopy.go000066400000000000000000000017221465435605700222400ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sync // NoCopy may be embedded into structs which must not be copied // after the first use. // // See https://golang.org/issues/8005#issuecomment-190753527 // for details. type NoCopy struct{} // Lock is a no-op used by -copylocks checker from `go vet`. func (*NoCopy) Lock() {} // Unlock is a no-op used by -copylocks checker from `go vet`. func (*NoCopy) Unlock() {} golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/norace_unsafe.go000066400000000000000000000024251465435605700235420ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build !race // +build !race package sync import ( "sync/atomic" "unsafe" ) // RaceEnabled is true if the Go data race detector is enabled. const RaceEnabled = false // RaceDisable has the same semantics as runtime.RaceDisable. func RaceDisable() { } // RaceEnable has the same semantics as runtime.RaceEnable. func RaceEnable() { } // RaceAcquire has the same semantics as runtime.RaceAcquire. func RaceAcquire(addr unsafe.Pointer) { } // RaceRelease has the same semantics as runtime.RaceRelease. func RaceRelease(addr unsafe.Pointer) { } // RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. func RaceReleaseMerge(addr unsafe.Pointer) { } // RaceUncheckedAtomicCompareAndSwapUintptr is equivalent to // sync/atomic.CompareAndSwapUintptr, but is not checked by the race detector. // This is necessary when implementing gopark callbacks, since no race context // is available during their execution. func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool { // Use atomic.CompareAndSwapUintptr outside of race builds for // inlinability. return atomic.CompareAndSwapUintptr(ptr, old, new) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/race_amd64.s000066400000000000000000000016721465435605700224770ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
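// Illustrative sketch (assuming this package's import path
// gvisor.dev/gvisor/pkg/sync): a lock that is released by a different
// goroutine than the one that acquired it must be a CrossGoroutineMutex;
// under the checklocks build tag, the plain Mutex above panics on such a
// hand-off because the unlocking goroutine never recorded the lock.
//
//	var cgm sync.CrossGoroutineMutex
//
//	func handOff(done chan struct{}) {
//		cgm.Lock()
//		go func() {
//			// ... mutate the state protected by cgm ...
//			cgm.Unlock() // legal here; sync.Mutex requires the locking goroutine
//			close(done)
//		}()
//	}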
//go:build race && amd64 // +build race,amd64 #include "textflag.h" // func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool TEXT ·RaceUncheckedAtomicCompareAndSwapUintptr(SB),NOSPLIT|NOFRAME,$0-25 MOVQ ptr+0(FP), DI MOVQ old+8(FP), AX MOVQ new+16(FP), SI LOCK CMPXCHGQ SI, 0(DI) SETEQ AX MOVB AX, ret+24(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/race_arm64.s000066400000000000000000000017531465435605700225150ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build race && arm64 // +build race,arm64 #include "textflag.h" // func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool TEXT ·RaceUncheckedAtomicCompareAndSwapUintptr(SB),NOSPLIT,$0-25 MOVD ptr+0(FP), R0 MOVD old+8(FP), R1 MOVD new+16(FP), R1 again: LDAXR (R0), R3 CMP R1, R3 BNE ok STLXR R2, (R0), R3 CBNZ R3, again ok: CSET EQ, R0 MOVB R0, ret+24(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/race_unsafe.go000066400000000000000000000024071465435605700232050ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build race // +build race package sync import ( "runtime" "unsafe" ) // RaceEnabled is true if the Go data race detector is enabled. const RaceEnabled = true // RaceDisable has the same semantics as runtime.RaceDisable. func RaceDisable() { runtime.RaceDisable() } // RaceEnable has the same semantics as runtime.RaceEnable. func RaceEnable() { runtime.RaceEnable() } // RaceAcquire has the same semantics as runtime.RaceAcquire. func RaceAcquire(addr unsafe.Pointer) { runtime.RaceAcquire(addr) } // RaceRelease has the same semantics as runtime.RaceRelease. func RaceRelease(addr unsafe.Pointer) { runtime.RaceRelease(addr) } // RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. func RaceReleaseMerge(addr unsafe.Pointer) { runtime.RaceReleaseMerge(addr) } // RaceUncheckedAtomicCompareAndSwapUintptr is equivalent to // sync/atomic.CompareAndSwapUintptr, but is not checked by the race detector. // This is necessary when implementing gopark callbacks, since no race context // is available during their execution. func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/runtime.go000066400000000000000000000012631465435605700224140ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
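// Illustrative sketch (not part of the original file): RaceEnabled lets
// callers pay for extra validation only in -race builds; in non-race builds
// the constant is false and the branch is compiled away.
//
//	if sync.RaceEnabled {
//		expensiveConsistencyCheck() // hypothetical helper
//	}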
// See the License for the specific language governing permissions and // limitations under the License. package sync import ( "runtime" ) // Dummy reference for facts. const _ = runtime.Compiler golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/runtime_amd64.go000066400000000000000000000011031465435605700234000ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build amd64 package sync import ( "sync/atomic" ) const supportsWakeSuppression = true // addrOfSpinning returns the address of runtime.sched.nmspinning. func addrOfSpinning() *int32 // nmspinning caches addrOfSpinning. var nmspinning = addrOfSpinning() //go:nosplit func preGoReadyWakeSuppression() { atomic.AddInt32(nmspinning, 1) } //go:nosplit func postGoReadyWakeSuppression() { atomic.AddInt32(nmspinning, -1) } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/runtime_constants.go000066400000000000000000000016441465435605700245130ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sync // Values for the reason argument to gopark, from Go's src/runtime/runtime2.go. const ( WaitReasonSelect uint8 = 9 // +checkconst runtime waitReasonSelect WaitReasonChanReceive uint8 = 14 // +checkconst runtime waitReasonChanReceive WaitReasonSemacquire uint8 = 18 // +checkconst runtime waitReasonSemacquire ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/runtime_exectracer2.go000066400000000000000000000015201465435605700246770ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sync // TraceBlockReason constants, from Go's src/runtime/trace2runtime.go. const ( TraceBlockSelect TraceBlockReason = 3 // +checkconst runtime traceBlockSelect TraceBlockSync TraceBlockReason = 5 // +checkconst runtime traceBlockSync ) golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/runtime_go121_unsafe.go000066400000000000000000000006271465435605700246710ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build go1.21 package sync import ( "unsafe" ) // Use checkoffset to assert that maptype.hasher (the only field we use) has // the correct offset. 
const maptypeHasherOffset = unsafe.Offsetof(maptype{}.Hasher) // +checkoffset internal/abi MapType.Hasher golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/runtime_not_go121_unsafe.go000066400000000000000000000007261465435605700255510ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // runtime.maptype is moved to internal/abi.MapType in Go 1.21. // //go:build !go1.21 package sync import ( "unsafe" ) // Use checkoffset to assert that maptype.hasher (the only field we use) has // the correct offset. const maptypeHasherOffset = unsafe.Offsetof(maptype{}.Hasher) // +checkoffset runtime maptype.hasher golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/runtime_other.go000066400000000000000000000005271465435605700236170ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build !amd64 // +build !amd64 package sync const supportsWakeSuppression = false func preGoReadyWakeSuppression() {} // Never called. func postGoReadyWakeSuppression() {} // Never called. golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/runtime_spinning_amd64.s000066400000000000000000000015101465435605700251440ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 #include "textflag.h" #define NMSPINNING_OFFSET 92 // +checkoffset runtime schedt.nmspinning TEXT ·addrOfSpinning(SB),NOSPLIT|NOFRAME,$0-8 LEAQ runtime·sched(SB), AX ADDQ $NMSPINNING_OFFSET, AX MOVQ AX, ret+0(FP) RET golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/runtime_spinning_other.s000066400000000000000000000013621465435605700253570ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !amd64 // This file is intentionally left blank. Other arches don't use // addrOfSpinning, but we still need an input to the nogo template rule. golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/runtime_unsafe.go000066400000000000000000000074651465435605700237670ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // //go:linkname directives type-checked by checklinkname. // Runtime type copies checked by checkoffset. 
package sync import ( "fmt" "reflect" "unsafe" ) // Goyield is runtime.goyield, which is similar to runtime.Gosched but only // yields the processor to other goroutines already on the processor's // runqueue. // //go:nosplit func Goyield() { goyield() } // Gopark is runtime.gopark. Gopark calls unlockf(pointer to runtime.g, lock); // if unlockf returns true, Gopark blocks until Goready(pointer to runtime.g) // is called. unlockf and its callees must be nosplit and norace, since stack // splitting and race context are not available where it is called. // //go:nosplit func Gopark(unlockf func(uintptr, unsafe.Pointer) bool, lock unsafe.Pointer, reason uint8, traceReason TraceBlockReason, traceskip int) { gopark(unlockf, lock, reason, traceReason, traceskip) } //go:linkname gopark runtime.gopark func gopark(unlockf func(uintptr, unsafe.Pointer) bool, lock unsafe.Pointer, reason uint8, traceReason TraceBlockReason, traceskip int) // TraceBlockReason is equivalent to runtime.traceBlockReason. type TraceBlockReason uint8 //go:linkname wakep runtime.wakep func wakep() // Wakep is runtime.wakep. // //go:nosplit func Wakep() { // This is only supported if we can suppress the wakep called // from Goready below, which is in certain architectures only. if supportsWakeSuppression { wakep() } } //go:linkname goready runtime.goready func goready(gp uintptr, traceskip int) // Goready is runtime.goready. // // The additional wakep argument controls whether a new thread will be kicked to // execute the P. This should be true in most circumstances. However, if the // current thread is about to sleep, then this can be false for efficiency. // //go:nosplit func Goready(gp uintptr, traceskip int, wakep bool) { if supportsWakeSuppression && !wakep { preGoReadyWakeSuppression() } goready(gp, traceskip) if supportsWakeSuppression && !wakep { postGoReadyWakeSuppression() } } // Rand32 returns a non-cryptographically-secure random uint32. func Rand32() uint32 { return fastrand() } // Rand64 returns a non-cryptographically-secure random uint64. func Rand64() uint64 { return uint64(fastrand())<<32 | uint64(fastrand()) } //go:linkname fastrand runtime.fastrand func fastrand() uint32 // RandUintptr returns a non-cryptographically-secure random uintptr. func RandUintptr() uintptr { if unsafe.Sizeof(uintptr(0)) == 4 { return uintptr(Rand32()) } return uintptr(Rand64()) } // MapKeyHasher returns a hash function for pointers of m's key type. // // Preconditions: m must be a map. func MapKeyHasher(m any) func(unsafe.Pointer, uintptr) uintptr { if rtyp := reflect.TypeOf(m); rtyp.Kind() != reflect.Map { panic(fmt.Sprintf("sync.MapKeyHasher: m is %v, not map", rtyp)) } mtyp := *(**maptype)(unsafe.Pointer(&m)) return mtyp.Hasher } // maptype is equivalent to the beginning of internal/abi.MapType. type maptype struct { size uintptr ptrdata uintptr hash uint32 tflag uint8 align uint8 fieldAlign uint8 kind uint8 equal func(unsafe.Pointer, unsafe.Pointer) bool gcdata *byte str int32 ptrToThis int32 key unsafe.Pointer elem unsafe.Pointer bucket unsafe.Pointer Hasher func(unsafe.Pointer, uintptr) uintptr // more fields } // These functions are only used within the sync package. 
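// As a point of reference for how these runtime helpers fit together, canSpin
// and doSpin (declared below) are typically combined with goyield into a
// bounded spin-then-yield loop. A minimal sketch of that pattern, mirroring
// SeqCount.beginReadSlow in seqcount.go, with `done` standing in for the
// caller's wakeup predicate:
//
//	for i := 0; !done(); {
//		if canSpin(i) {
//			i++
//			doSpin()
//		} else {
//			goyield()
//		}
//	}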
//go:linkname semacquire sync.runtime_Semacquire func semacquire(addr *uint32) //go:linkname semrelease sync.runtime_Semrelease func semrelease(addr *uint32, handoff bool, skipframes int) //go:linkname canSpin sync.runtime_canSpin func canSpin(i int) bool //go:linkname doSpin sync.runtime_doSpin func doSpin() golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/rwmutex_unsafe.go000066400000000000000000000205071465435605700240070ustar00rootroot00000000000000// Copyright 2009 The Go Authors. All rights reserved. // Copyright 2019 The gVisor Authors. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // This is mostly copied from the standard library's sync/rwmutex.go. // // Happens-before relationships indicated to the race detector: // - Unlock -> Lock (via writerSem) // - Unlock -> RLock (via readerSem) // - RUnlock -> Lock (via writerSem) // - DowngradeLock -> RLock (via readerSem) package sync import ( "sync/atomic" "unsafe" ) // CrossGoroutineRWMutex is equivalent to RWMutex, but it need not be unlocked // by a the same goroutine that locked the mutex. type CrossGoroutineRWMutex struct { // w is held if there are pending writers // // We use CrossGoroutineMutex rather than Mutex because the lock // annotation instrumentation in Mutex will trigger false positives in // the race detector when called inside of RaceDisable. w CrossGoroutineMutex writerSem uint32 // semaphore for writers to wait for completing readers readerSem uint32 // semaphore for readers to wait for completing writers readerCount int32 // number of pending readers readerWait int32 // number of departing readers } const rwmutexMaxReaders = 1 << 30 // TryRLock locks rw for reading. It returns true if it succeeds and false // otherwise. It does not block. // +checklocksignore func (rw *CrossGoroutineRWMutex) TryRLock() bool { if RaceEnabled { RaceDisable() } for { rc := atomic.LoadInt32(&rw.readerCount) if rc < 0 { if RaceEnabled { RaceEnable() } return false } if !atomic.CompareAndSwapInt32(&rw.readerCount, rc, rc+1) { continue } if RaceEnabled { RaceEnable() RaceAcquire(unsafe.Pointer(&rw.readerSem)) } return true } } // RLock locks rw for reading. // // It should not be used for recursive read locking; a blocked Lock call // excludes new readers from acquiring the lock. See the documentation on the // RWMutex type. // +checklocksignore func (rw *CrossGoroutineRWMutex) RLock() { if RaceEnabled { RaceDisable() } if atomic.AddInt32(&rw.readerCount, 1) < 0 { // A writer is pending, wait for it. semacquire(&rw.readerSem) } if RaceEnabled { RaceEnable() RaceAcquire(unsafe.Pointer(&rw.readerSem)) } } // RUnlock undoes a single RLock call. // // Preconditions: // - rw is locked for reading. // // +checklocksignore func (rw *CrossGoroutineRWMutex) RUnlock() { if RaceEnabled { RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) RaceDisable() } if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 { if r+1 == 0 || r+1 == -rwmutexMaxReaders { panic("RUnlock of unlocked RWMutex") } // A writer is pending. if atomic.AddInt32(&rw.readerWait, -1) == 0 { // The last reader unblocks the writer. semrelease(&rw.writerSem, false, 0) } } if RaceEnabled { RaceEnable() } } // TryLock locks rw for writing. It returns true if it succeeds and false // otherwise. It does not block. // +checklocksignore func (rw *CrossGoroutineRWMutex) TryLock() bool { if RaceEnabled { RaceDisable() } // First, resolve competition with other writers. 
if !rw.w.TryLock() { if RaceEnabled { RaceEnable() } return false } // Only proceed if there are no readers. if !atomic.CompareAndSwapInt32(&rw.readerCount, 0, -rwmutexMaxReaders) { rw.w.Unlock() if RaceEnabled { RaceEnable() } return false } if RaceEnabled { RaceEnable() RaceAcquire(unsafe.Pointer(&rw.writerSem)) } return true } // Lock locks rw for writing. If the lock is already locked for reading or // writing, Lock blocks until the lock is available. // +checklocksignore func (rw *CrossGoroutineRWMutex) Lock() { if RaceEnabled { RaceDisable() } // First, resolve competition with other writers. rw.w.Lock() // Announce to readers there is a pending writer. r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders // Wait for active readers. if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 { semacquire(&rw.writerSem) } if RaceEnabled { RaceEnable() RaceAcquire(unsafe.Pointer(&rw.writerSem)) } } // Unlock unlocks rw for writing. // // Preconditions: // - rw is locked for writing. // // +checklocksignore func (rw *CrossGoroutineRWMutex) Unlock() { if RaceEnabled { RaceRelease(unsafe.Pointer(&rw.writerSem)) RaceRelease(unsafe.Pointer(&rw.readerSem)) RaceDisable() } // Announce to readers there is no active writer. r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders) if r >= rwmutexMaxReaders { panic("Unlock of unlocked RWMutex") } // Unblock blocked readers, if any. for i := 0; i < int(r); i++ { semrelease(&rw.readerSem, false, 0) } // Allow other writers to proceed. rw.w.Unlock() if RaceEnabled { RaceEnable() } } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // // Preconditions: // - rw is locked for writing. // // +checklocksignore func (rw *CrossGoroutineRWMutex) DowngradeLock() { if RaceEnabled { RaceRelease(unsafe.Pointer(&rw.readerSem)) RaceDisable() } // Announce to readers there is no active writer and one additional reader. r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1) if r >= rwmutexMaxReaders+1 { panic("DowngradeLock of unlocked RWMutex") } // Unblock blocked readers, if any. Note that this loop starts as 1 since r // includes this goroutine. for i := 1; i < int(r); i++ { semrelease(&rw.readerSem, false, 0) } // Allow other writers to proceed to rw.w.Lock(). Note that they will still // block on rw.writerSem since at least this reader exists, such that // DowngradeLock() is atomic with the previous write lock. rw.w.Unlock() if RaceEnabled { RaceEnable() } } // A RWMutex is a reader/writer mutual exclusion lock. The lock can be held by // an arbitrary number of readers or a single writer. The zero value for a // RWMutex is an unlocked mutex. // // A RWMutex must not be copied after first use. // // If a goroutine holds a RWMutex for reading and another goroutine might call // Lock, no goroutine should expect to be able to acquire a read lock until the // initial read lock is released. In particular, this prohibits recursive read // locking. This is to ensure that the lock eventually becomes available; a // blocked Lock call excludes new readers from acquiring the lock. // // A Mutex must be unlocked by the same goroutine that locked it. This // invariant is enforced with the 'checklocks' build tag. type RWMutex struct { m CrossGoroutineRWMutex } // TryRLock locks rw for reading. It returns true if it succeeds and false // otherwise. It does not block. // +checklocksignore func (rw *RWMutex) TryRLock() bool { // Note lock first to enforce proper locking even if unsuccessful. 
noteLock(unsafe.Pointer(rw)) locked := rw.m.TryRLock() if !locked { noteUnlock(unsafe.Pointer(rw)) } return locked } // RLock locks rw for reading. // // It should not be used for recursive read locking; a blocked Lock call // excludes new readers from acquiring the lock. See the documentation on the // RWMutex type. // +checklocksignore func (rw *RWMutex) RLock() { noteLock(unsafe.Pointer(rw)) rw.m.RLock() } // RUnlock undoes a single RLock call. // // Preconditions: // - rw is locked for reading. // - rw was locked by this goroutine. // // +checklocksignore func (rw *RWMutex) RUnlock() { rw.m.RUnlock() noteUnlock(unsafe.Pointer(rw)) } // TryLock locks rw for writing. It returns true if it succeeds and false // otherwise. It does not block. // +checklocksignore func (rw *RWMutex) TryLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(rw)) locked := rw.m.TryLock() if !locked { noteUnlock(unsafe.Pointer(rw)) } return locked } // Lock locks rw for writing. If the lock is already locked for reading or // writing, Lock blocks until the lock is available. // +checklocksignore func (rw *RWMutex) Lock() { noteLock(unsafe.Pointer(rw)) rw.m.Lock() } // Unlock unlocks rw for writing. // // Preconditions: // - rw is locked for writing. // - rw was locked by this goroutine. // // +checklocksignore func (rw *RWMutex) Unlock() { rw.m.Unlock() noteUnlock(unsafe.Pointer(rw)) } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // // Preconditions: // - rw is locked for writing. // // +checklocksignore func (rw *RWMutex) DowngradeLock() { // No note change for DowngradeLock. rw.m.DowngradeLock() } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/seqcount.go000066400000000000000000000102211465435605700225640ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package sync import ( "sync/atomic" ) // SeqCount is a synchronization primitive for optimistic reader/writer // synchronization in cases where readers can work with stale data and // therefore do not need to block writers. // // Compared to sync/atomic.Value: // // - Mutation of SeqCount-protected data does not require memory allocation, // whereas atomic.Value generally does. This is a significant advantage when // writes are common. // // - Atomic reads of SeqCount-protected data require copying. This is a // disadvantage when atomic reads are common. // // - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other // operations to be made atomic with reads of SeqCount-protected data. // // - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected // data require instantiating function templates using go_generics (see // seqatomic.go). type SeqCount struct { // epoch is incremented by BeginWrite and EndWrite, such that epoch is odd // if a writer critical section is active, and a read from data protected // by this SeqCount is atomic iff epoch is the same even value before and // after the read. epoch uint32 } // SeqCountEpoch tracks writer critical sections in a SeqCount. type SeqCountEpoch uint32 // BeginRead indicates the beginning of a reader critical section. Reader // critical sections DO NOT BLOCK writer critical sections, so operations in a // reader critical section MAY RACE with writer critical sections. Races are // detected by ReadOk at the end of the reader critical section. 
Thus, the // low-level structure of readers is generally: // // for { // epoch := seq.BeginRead() // // do something idempotent with seq-protected data // if seq.ReadOk(epoch) { // break // } // } // // However, since reader critical sections may race with writer critical // sections, the Go race detector will (accurately) flag data races in readers // using this pattern. Most users of SeqCount will need to use the // SeqAtomicLoad function template in seqatomic.go. func (s *SeqCount) BeginRead() SeqCountEpoch { if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 { return SeqCountEpoch(epoch) } return s.beginReadSlow() } func (s *SeqCount) beginReadSlow() SeqCountEpoch { i := 0 for { if canSpin(i) { i++ doSpin() } else { goyield() } if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 { return SeqCountEpoch(epoch) } } } // ReadOk returns true if the reader critical section initiated by a previous // call to BeginRead() that returned epoch did not race with any writer critical // sections. // // ReadOk may be called any number of times during a reader critical section. // Reader critical sections do not need to be explicitly terminated; the last // call to ReadOk is implicitly the end of the reader critical section. func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool { MemoryFenceReads() return atomic.LoadUint32(&s.epoch) == uint32(epoch) } // BeginWrite indicates the beginning of a writer critical section. // // SeqCount does not support concurrent writer critical sections; clients with // concurrent writers must synchronize them using e.g. sync.Mutex. func (s *SeqCount) BeginWrite() { if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 { panic("SeqCount.BeginWrite during writer critical section") } } // BeginWriteOk combines the semantics of ReadOk and BeginWrite. If the reader // critical section initiated by a previous call to BeginRead() that returned // epoch did not race with any writer critical sections, it begins a writer // critical section and returns true. Otherwise it does nothing and returns // false. func (s *SeqCount) BeginWriteOk(epoch SeqCountEpoch) bool { return atomic.CompareAndSwapUint32(&s.epoch, uint32(epoch), uint32(epoch)+1) } // EndWrite ends the effect of a preceding BeginWrite or successful // BeginWriteOk. func (s *SeqCount) EndWrite() { if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 { panic("SeqCount.EndWrite outside writer critical section") } } golang-gvisor-gvisor-0.0~20240729.0/pkg/sync/sync.go000066400000000000000000000003561465435605700217070ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Package sync provides synchronization primitives. // // +checkalignedignore package sync golang-gvisor-gvisor-0.0~20240729.0/pkg/syncevent/000077500000000000000000000000001465435605700214425ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/syncevent/broadcaster.go000066400000000000000000000151631465435605700242700ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package syncevent import ( "gvisor.dev/gvisor/pkg/sync" ) // Broadcaster is an implementation of Source that supports any number of // subscribed Receivers. // // The zero value of Broadcaster is valid and has no subscribed Receivers. // Broadcaster is not copyable by value. // // All Broadcaster methods may be called concurrently from multiple goroutines. type Broadcaster struct { // Broadcaster is implemented as a hash table where keys are assigned by // the Broadcaster and returned as SubscriptionIDs, making it safe to use // the identity function for hashing. The hash table resolves collisions // using linear probing and features Robin Hood insertion and backward // shift deletion in order to support a relatively high load factor // efficiently, which matters since the cost of Broadcast is linear in the // size of the table. // mu protects the following fields. mu sync.Mutex // Invariants: len(table) is 0 or a power of 2. table []broadcasterSlot // load is the number of entries in table with receiver != nil. load int lastID SubscriptionID } type broadcasterSlot struct { // Invariants: If receiver == nil, then filter == NoEvents and id == 0. // Otherwise, id != 0. receiver *Receiver filter Set id SubscriptionID } const ( broadcasterMinNonZeroTableSize = 2 // must be a power of 2 > 1 broadcasterMaxLoadNum = 13 broadcasterMaxLoadDen = 16 ) // SubscribeEvents implements Source.SubscribeEvents. func (b *Broadcaster) SubscribeEvents(r *Receiver, filter Set) SubscriptionID { b.mu.Lock() // Assign an ID for this subscription. b.lastID++ id := b.lastID // Expand the table if over the maximum load factor: // // load / len(b.table) > broadcasterMaxLoadNum / broadcasterMaxLoadDen // load * broadcasterMaxLoadDen > broadcasterMaxLoadNum * len(b.table) b.load++ if (b.load * broadcasterMaxLoadDen) > (broadcasterMaxLoadNum * len(b.table)) { // Double the number of slots in the new table. newlen := broadcasterMinNonZeroTableSize if len(b.table) != 0 { newlen = 2 * len(b.table) } if newlen <= cap(b.table) { // Reuse excess capacity in the current table, moving entries not // already in their first-probed positions to better ones. newtable := b.table[:newlen] newmask := uint64(newlen - 1) for i := range b.table { if b.table[i].receiver != nil && uint64(b.table[i].id)&newmask != uint64(i) { entry := b.table[i] b.table[i] = broadcasterSlot{} broadcasterTableInsert(newtable, entry.id, entry.receiver, entry.filter) } } b.table = newtable } else { newtable := make([]broadcasterSlot, newlen) // Copy existing entries to the new table. for i := range b.table { if b.table[i].receiver != nil { broadcasterTableInsert(newtable, b.table[i].id, b.table[i].receiver, b.table[i].filter) } } // Switch to the new table. b.table = newtable } } broadcasterTableInsert(b.table, id, r, filter) b.mu.Unlock() return id } // Preconditions: // - table must not be full. // - len(table) is a power of 2. 
func broadcasterTableInsert(table []broadcasterSlot, id SubscriptionID, r *Receiver, filter Set) { entry := broadcasterSlot{ receiver: r, filter: filter, id: id, } mask := uint64(len(table) - 1) i := uint64(id) & mask disp := uint64(0) for { if table[i].receiver == nil { table[i] = entry return } // If we've been displaced farther from our first-probed slot than the // element stored in this one, swap elements and switch to inserting // the replaced one. (This is Robin Hood insertion.) slotDisp := (i - uint64(table[i].id)) & mask if disp > slotDisp { table[i], entry = entry, table[i] disp = slotDisp } i = (i + 1) & mask disp++ } } // UnsubscribeEvents implements Source.UnsubscribeEvents. func (b *Broadcaster) UnsubscribeEvents(id SubscriptionID) { b.mu.Lock() mask := uint64(len(b.table) - 1) i := uint64(id) & mask for { if b.table[i].id == id { // Found the element to remove. Move all subsequent elements // backward until we either find an empty slot, or an element that // is already in its first-probed slot. (This is backward shift // deletion.) for { next := (i + 1) & mask if b.table[next].receiver == nil { break } if uint64(b.table[next].id)&mask == next { break } b.table[i] = b.table[next] i = next } b.table[i] = broadcasterSlot{} break } i = (i + 1) & mask } // If a table 1/4 of the current size would still be at or under the // maximum load factor (i.e. the current table size is at least two // expansions bigger than necessary), halve the size of the table to reduce // the cost of Broadcast. Since we are concerned with iteration time and // not memory usage, reuse the existing slice to reduce future allocations // from table re-expansion. b.load-- if len(b.table) > broadcasterMinNonZeroTableSize && (b.load*(4*broadcasterMaxLoadDen)) <= (broadcasterMaxLoadNum*len(b.table)) { newlen := len(b.table) / 2 newtable := b.table[:newlen] for i := newlen; i < len(b.table); i++ { if b.table[i].receiver != nil { broadcasterTableInsert(newtable, b.table[i].id, b.table[i].receiver, b.table[i].filter) b.table[i] = broadcasterSlot{} } } b.table = newtable } b.mu.Unlock() } // Broadcast notifies all Receivers subscribed to the Broadcaster of the subset // of events to which they subscribed. The order in which Receivers are // notified is unspecified. func (b *Broadcaster) Broadcast(events Set) { b.mu.Lock() for i := range b.table { if intersection := events & b.table[i].filter; intersection != 0 { // We don't need to check if broadcasterSlot.receiver is nil, since // if it is then broadcasterSlot.filter is 0. b.table[i].receiver.Notify(intersection) } } b.mu.Unlock() } // FilteredEvents returns the set of events for which Broadcast will notify at // least one Receiver, i.e. the union of filters for all subscribed Receivers. func (b *Broadcaster) FilteredEvents() Set { var es Set b.mu.Lock() for i := range b.table { es |= b.table[i].filter } b.mu.Unlock() return es } golang-gvisor-gvisor-0.0~20240729.0/pkg/syncevent/receiver.go000066400000000000000000000064771465435605700236130ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package syncevent import ( "gvisor.dev/gvisor/pkg/atomicbitops" ) // Receiver is an event sink that holds pending events and invokes a callback // whenever new events become pending. Receiver's methods may be called // concurrently from multiple goroutines. // // Receiver.Init() must be called before first use. type Receiver struct { // pending is the set of pending events. pending is accessed using atomic // memory operations. pending atomicbitops.Uint64 // cb is notified when new events become pending. cb is immutable after // Init(). cb ReceiverCallback } // ReceiverCallback receives callbacks from a Receiver. type ReceiverCallback interface { // NotifyPending is called when the corresponding Receiver has new pending // events. // // NotifyPending is called synchronously from Receiver.Notify(), so // implementations must not take locks that may be held by callers of // Receiver.Notify(). NotifyPending may be called concurrently from // multiple goroutines. NotifyPending() } // Init must be called before first use of r. func (r *Receiver) Init(cb ReceiverCallback) { r.cb = cb } // Pending returns the set of pending events. func (r *Receiver) Pending() Set { return Set(r.pending.Load()) } // Notify sets the given events as pending. func (r *Receiver) Notify(es Set) { p := Set(r.pending.Load()) // Optimization: Skip the atomic CAS on r.pending if all events are // already pending. if p&es == es { return } // When this is uncontended (the common case), CAS is faster than // atomic-OR because the former is inlined and the latter (which we // implement in assembly ourselves) is not. if !r.pending.CompareAndSwap(uint64(p), uint64(p|es)) { // If the CAS fails, fall back to atomic-OR. atomicbitops.OrUint64(&r.pending, uint64(es)) } r.cb.NotifyPending() } // Ack unsets the given events as pending. func (r *Receiver) Ack(es Set) { p := Set(r.pending.Load()) // Optimization: Skip the atomic CAS on r.pending if all events are // already not pending. if p&es == 0 { return } // When this is uncontended (the common case), CAS is faster than // atomic-AND because the former is inlined and the latter (which we // implement in assembly ourselves) is not. if !r.pending.CompareAndSwap(uint64(p), uint64(p&^es)) { // If the CAS fails, fall back to atomic-AND. atomicbitops.AndUint64(&r.pending, ^uint64(es)) } } // PendingAndAckAll unsets all events as pending and returns the set of // previously-pending events. // // PendingAndAckAll should only be used in preference to a call to Pending // followed by a conditional call to Ack when the caller expects events to be // pending (e.g. after a call to ReceiverCallback.NotifyPending()). func (r *Receiver) PendingAndAckAll() Set { return Set(r.pending.Swap(0)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/syncevent/source.go000066400000000000000000000040701465435605700232720ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package syncevent // Source represents an event source. type Source interface { // SubscribeEvents causes the Source to notify the given Receiver of the // given subset of events. // // Preconditions: // * r != nil. // * The ReceiverCallback for r must not take locks that are ordered // prior to the Source; for example, it cannot call any Source // methods. SubscribeEvents(r *Receiver, filter Set) SubscriptionID // UnsubscribeEvents causes the Source to stop notifying the Receiver // subscribed by a previous call to SubscribeEvents that returned the given // SubscriptionID. // // Preconditions: UnsubscribeEvents may be called at most once for any // given SubscriptionID. UnsubscribeEvents(id SubscriptionID) } // SubscriptionID identifies a call to Source.SubscribeEvents. type SubscriptionID uint64 // UnsubscribeAndAck is a convenience function that unsubscribes r from the // given events from src and also clears them from r. func UnsubscribeAndAck(src Source, r *Receiver, filter Set, id SubscriptionID) { src.UnsubscribeEvents(id) r.Ack(filter) } // NoopSource implements Source by never sending events to subscribed // Receivers. type NoopSource struct{} // SubscribeEvents implements Source.SubscribeEvents. func (NoopSource) SubscribeEvents(*Receiver, Set) SubscriptionID { return 0 } // UnsubscribeEvents implements Source.UnsubscribeEvents. func (NoopSource) UnsubscribeEvents(SubscriptionID) { } // See Broadcaster for a non-noop implementations of Source. golang-gvisor-gvisor-0.0~20240729.0/pkg/syncevent/syncevent.go000066400000000000000000000021521465435605700240070ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package syncevent provides efficient primitives for goroutine // synchronization based on event bitmasks. package syncevent // Set is a bitmask where each bit represents a distinct user-defined event. // The event package does not treat any bits in Set specially. type Set uint64 const ( // NoEvents is a Set containing no events. NoEvents = Set(0) // AllEvents is a Set containing all possible events. AllEvents = ^Set(0) // MaxEvents is the number of distinct events that can be represented by a Set. MaxEvents = 64 ) golang-gvisor-gvisor-0.0~20240729.0/pkg/syncevent/syncevent_state_autogen.go000066400000000000000000000000731465435605700267310ustar00rootroot00000000000000// automatically generated by stateify. package syncevent golang-gvisor-gvisor-0.0~20240729.0/pkg/syncevent/syncevent_unsafe_state_autogen.go000066400000000000000000000000731465435605700302720ustar00rootroot00000000000000// automatically generated by stateify. package syncevent golang-gvisor-gvisor-0.0~20240729.0/pkg/syncevent/waiter_unsafe.go000066400000000000000000000142031465435605700246250ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package syncevent import ( "sync/atomic" "unsafe" "gvisor.dev/gvisor/pkg/sync" ) // Waiter allows a goroutine to block on pending events received by a Receiver. // // Waiter.Init() must be called before first use. type Waiter struct { r Receiver // g is one of: // // - 0: No goroutine is blocking in Wait. // // - preparingG: A goroutine is in Wait preparing to sleep, but hasn't yet // completed waiterUnlock(). Thus the wait can only be interrupted by // replacing the value of g with 0 (the G may not be in state Gwaiting yet, // so we can't call goready.) // // - Otherwise: g is a pointer to the runtime.g in state Gwaiting for the // goroutine blocked in Wait, which can only be woken by calling goready. g uintptr `state:"zerovalue"` } const preparingG = 1 // Init must be called before first use of w. func (w *Waiter) Init() { w.r.Init(w) } // Receiver returns the Receiver that receives events that unblock calls to // w.Wait(). func (w *Waiter) Receiver() *Receiver { return &w.r } // Pending returns the set of pending events. func (w *Waiter) Pending() Set { return w.r.Pending() } // Wait blocks until at least one event is pending, then returns the set of // pending events. It does not affect the set of pending events; callers must // call w.Ack() to do so, or use w.WaitAndAck() instead. // // Precondition: Only one goroutine may call any Wait* method at a time. func (w *Waiter) Wait() Set { return w.WaitFor(AllEvents) } // WaitFor blocks until at least one event in es is pending, then returns the // set of pending events (including those not in es). It does not affect the // set of pending events; callers must call w.Ack() to do so. // // Precondition: Only one goroutine may call any Wait* method at a time. func (w *Waiter) WaitFor(es Set) Set { for { // Optimization: Skip the atomic store to w.g if an event is already // pending. if p := w.r.Pending(); p&es != NoEvents { return p } // Indicate that we're preparing to go to sleep. atomic.StoreUintptr(&w.g, preparingG) // If an event is pending, abort the sleep. if p := w.r.Pending(); p&es != NoEvents { atomic.StoreUintptr(&w.g, 0) return p } // If w.g is still preparingG (i.e. w.NotifyPending() has not been // called or has not reached atomic.SwapUintptr()), go to sleep until // w.NotifyPending() => goready(). sync.Gopark(waiterCommit, unsafe.Pointer(&w.g), sync.WaitReasonSelect, sync.TraceBlockSelect, 0) } } //go:norace //go:nosplit func waiterCommit(g uintptr, wg unsafe.Pointer) bool { // The only way this CAS can fail is if a call to Waiter.NotifyPending() // has replaced *wg with nil, in which case we should not sleep. return sync.RaceUncheckedAtomicCompareAndSwapUintptr((*uintptr)(wg), preparingG, g) } // Ack marks the given events as not pending. func (w *Waiter) Ack(es Set) { w.r.Ack(es) } // WaitAndAckAll blocks until at least one event is pending, then marks all // events as not pending and returns the set of previously-pending events. 
// // Precondition: Only one goroutine may call any Wait* method at a time. func (w *Waiter) WaitAndAckAll() Set { // Optimization: Skip the atomic store to w.g if an event is already // pending. Call Pending() first since, in the common case that events are // not yet pending, this skips an atomic swap on w.r.pending. if w.r.Pending() != NoEvents { if p := w.r.PendingAndAckAll(); p != NoEvents { return p } } for { // Indicate that we're preparing to go to sleep. atomic.StoreUintptr(&w.g, preparingG) // If an event is pending, abort the sleep. if w.r.Pending() != NoEvents { if p := w.r.PendingAndAckAll(); p != NoEvents { atomic.StoreUintptr(&w.g, 0) return p } } // If w.g is still preparingG (i.e. w.NotifyPending() has not been // called or has not reached atomic.SwapUintptr()), go to sleep until // w.NotifyPending() => goready(). sync.Gopark(waiterCommit, unsafe.Pointer(&w.g), sync.WaitReasonSelect, sync.TraceBlockSelect, 0) // Check for pending events. We call PendingAndAckAll() directly now since // we only expect to be woken after events become pending. if p := w.r.PendingAndAckAll(); p != NoEvents { return p } } } // Notify marks the given events as pending, possibly unblocking concurrent // calls to w.Wait() or w.WaitFor(). func (w *Waiter) Notify(es Set) { w.r.Notify(es) } // NotifyPending implements ReceiverCallback.NotifyPending. Users of Waiter // should not call NotifyPending. func (w *Waiter) NotifyPending() { // Optimization: Skip the atomic swap on w.g if there is no sleeping // goroutine. NotifyPending is called after w.r.Pending() is updated, so // concurrent and future calls to w.Wait() will observe pending events and // abort sleeping. if atomic.LoadUintptr(&w.g) == 0 { return } // Wake a sleeping G, or prevent a G that is preparing to sleep from doing // so. Swap is needed here to ensure that only one call to NotifyPending // calls goready. if g := atomic.SwapUintptr(&w.g, 0); g > preparingG { sync.Goready(g, 0, true /* wakep */) } } var waiterPool = sync.Pool{ New: func() any { w := &Waiter{} w.Init() return w }, } // GetWaiter returns an unused Waiter. PutWaiter should be called to release // the Waiter once it is no longer needed. // // Where possible, users should prefer to associate each goroutine that calls // Waiter.Wait() with a distinct pre-allocated Waiter to avoid allocation of // Waiters in hot paths. func GetWaiter() *Waiter { return waiterPool.Get().(*Waiter) } // PutWaiter releases an unused Waiter previously returned by GetWaiter. func PutWaiter(w *Waiter) { waiterPool.Put(w) } golang-gvisor-gvisor-0.0~20240729.0/pkg/syserr/000077500000000000000000000000001465435605700207535ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/syserr/host_darwin.go000066400000000000000000000026041465435605700236250ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
//go:build darwin // +build darwin package syserr import ( "fmt" "golang.org/x/sys/unix" ) const maxErrno = 107 var darwinHostTranslations [maxErrno]*Error // FromHost translates a unix.Errno to a corresponding Error value. func FromHost(err unix.Errno) *Error { if int(err) >= len(darwinHostTranslations) || darwinHostTranslations[err] == nil { panic(fmt.Sprintf("unknown host errno %q (%d)", err.Error(), err)) } return darwinHostTranslations[err] } // TODO(gvisor.dev/issue/1270): We currently only add translations for errors // that exist both on Darwin and Linux. func addHostTranslation(host unix.Errno, trans *Error) { if darwinHostTranslations[host] != nil { panic(fmt.Sprintf("duplicate translation for host errno %q (%d)", host.Error(), host)) } darwinHostTranslations[host] = trans } golang-gvisor-gvisor-0.0~20240729.0/pkg/syserr/host_linux.go000066400000000000000000000127661465435605700235120ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package syserr import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux/errno" ) const maxErrno = 134 var linuxHostTranslations [maxErrno]*Error // FromHost translates a unix.Errno to a corresponding Error value. func FromHost(err unix.Errno) *Error { if int(err) >= len(linuxHostTranslations) || linuxHostTranslations[err] == nil { panic(fmt.Sprintf("unknown host errno %q (%d)", err.Error(), err)) } return linuxHostTranslations[err] } func addHostTranslation(host unix.Errno, trans *Error) { if linuxHostTranslations[host] != nil { panic(fmt.Sprintf("duplicate translation for host errno %q (%d)", host.Error(), host)) } linuxHostTranslations[host] = trans } // TODO(b/34162363): Remove or replace most of these errors. // // Some of the errors should be replaced with package specific errors and // others should be removed entirely. 
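// To illustrate the Darwin translation path above: a caller holding a raw host
// errno converts it to a sentry-internal *Error via FromHost, which panics for
// any errno that was never registered with addHostTranslation. A minimal
// sketch (hostErr is a placeholder for an errno returned by a host call; ENOENT
// is used only because it has a registered translation):
//
//	var hostErr unix.Errno = unix.ENOENT
//	sentryErr := syserr.FromHost(hostErr) // only safe for registered errnos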
var ( ErrDeadlock = newWithHost("resource deadlock would occur", errno.EDEADLOCK, unix.EDEADLOCK) ErrChannelOutOfRange = newWithHost("channel number out of range", errno.ECHRNG, unix.ECHRNG) ErrLevelTwoNotSynced = newWithHost("level 2 not synchronized", errno.EL2NSYNC, unix.EL2NSYNC) ErrLevelThreeHalted = newWithHost("level 3 halted", errno.EL3HLT, unix.EL3HLT) ErrLevelThreeReset = newWithHost("level 3 reset", errno.EL3RST, unix.EL3RST) ErrLinkNumberOutOfRange = newWithHost("link number out of range", errno.ELNRNG, unix.ELNRNG) ErrProtocolDriverNotAttached = newWithHost("protocol driver not attached", errno.EUNATCH, unix.EUNATCH) ErrNoCSIAvailable = newWithHost("no CSI structure available", errno.ENOCSI, unix.ENOCSI) ErrLevelTwoHalted = newWithHost("level 2 halted", errno.EL2HLT, unix.EL2HLT) ErrInvalidExchange = newWithHost("invalid exchange", errno.EBADE, unix.EBADE) ErrInvalidRequestDescriptor = newWithHost("invalid request descriptor", errno.EBADR, unix.EBADR) ErrExchangeFull = newWithHost("exchange full", errno.EXFULL, unix.EXFULL) ErrNoAnode = newWithHost("no anode", errno.ENOANO, unix.ENOANO) ErrInvalidRequestCode = newWithHost("invalid request code", errno.EBADRQC, unix.EBADRQC) ErrInvalidSlot = newWithHost("invalid slot", errno.EBADSLT, unix.EBADSLT) ErrBadFontFile = newWithHost("bad font file format", errno.EBFONT, unix.EBFONT) ErrMachineNotOnNetwork = newWithHost("machine is not on the network", errno.ENONET, unix.ENONET) ErrPackageNotInstalled = newWithHost("package not installed", errno.ENOPKG, unix.ENOPKG) ErrAdvertise = newWithHost("advertise error", errno.EADV, unix.EADV) ErrSRMount = newWithHost("srmount error", errno.ESRMNT, unix.ESRMNT) ErrSendCommunication = newWithHost("communication error on send", errno.ECOMM, unix.ECOMM) ErrRFS = newWithHost("RFS specific error", errno.EDOTDOT, unix.EDOTDOT) ErrNetworkNameNotUnique = newWithHost("name not unique on network", errno.ENOTUNIQ, unix.ENOTUNIQ) ErrFDInBadState = newWithHost("file descriptor in bad state", errno.EBADFD, unix.EBADFD) ErrRemoteAddressChanged = newWithHost("remote address changed", errno.EREMCHG, unix.EREMCHG) ErrSharedLibraryInaccessible = newWithHost("can not access a needed shared library", errno.ELIBACC, unix.ELIBACC) ErrCorruptedSharedLibrary = newWithHost("accessing a corrupted shared library", errno.ELIBBAD, unix.ELIBBAD) ErrLibSectionCorrupted = newWithHost(".lib section in a.out corrupted", errno.ELIBSCN, unix.ELIBSCN) ErrTooManySharedLibraries = newWithHost("attempting to link in too many shared libraries", errno.ELIBMAX, unix.ELIBMAX) ErrSharedLibraryExeced = newWithHost("cannot exec a shared library directly", errno.ELIBEXEC, unix.ELIBEXEC) ErrShouldRestart = newWithHost("interrupted system call should be restarted", errno.ERESTART, unix.ERESTART) ErrStreamPipe = newWithHost("streams pipe error", errno.ESTRPIPE, unix.ESTRPIPE) ErrStructureNeedsCleaning = newWithHost("structure needs cleaning", errno.EUCLEAN, unix.EUCLEAN) ErrIsNamedFile = newWithHost("is a named type file", errno.ENOTNAM, unix.ENOTNAM) ErrRemoteIO = newWithHost("remote I/O error", errno.EREMOTEIO, unix.EREMOTEIO) ErrNoMedium = newWithHost("no medium found", errno.ENOMEDIUM, unix.ENOMEDIUM) ErrWrongMediumType = newWithHost("wrong medium type", errno.EMEDIUMTYPE, unix.EMEDIUMTYPE) ErrNoKey = newWithHost("required key not available", errno.ENOKEY, unix.ENOKEY) ErrKeyExpired = newWithHost("key has expired", errno.EKEYEXPIRED, unix.EKEYEXPIRED) ErrKeyRevoked = newWithHost("key has been revoked", errno.EKEYREVOKED, unix.EKEYREVOKED) 
ErrKeyRejected = newWithHost("key was rejected by service", errno.EKEYREJECTED, unix.EKEYREJECTED) ) golang-gvisor-gvisor-0.0~20240729.0/pkg/syserr/netstack.go000066400000000000000000000145071465435605700231250ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package syserr import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/tcpip" ) // LINT.IfChange // Mapping for tcpip.Error types. var ( ErrUnknownProtocol = New((&tcpip.ErrUnknownProtocol{}).String(), errno.EINVAL) ErrUnknownNICID = New((&tcpip.ErrUnknownNICID{}).String(), errno.ENODEV) ErrUnknownDevice = New((&tcpip.ErrUnknownDevice{}).String(), errno.ENODEV) ErrUnknownProtocolOption = New((&tcpip.ErrUnknownProtocolOption{}).String(), errno.ENOPROTOOPT) ErrDuplicateNICID = New((&tcpip.ErrDuplicateNICID{}).String(), errno.EEXIST) ErrDuplicateAddress = New((&tcpip.ErrDuplicateAddress{}).String(), errno.EEXIST) ErrAlreadyBound = New((&tcpip.ErrAlreadyBound{}).String(), errno.EINVAL) ErrInvalidEndpointState = New((&tcpip.ErrInvalidEndpointState{}).String(), errno.EINVAL) ErrAlreadyConnecting = New((&tcpip.ErrAlreadyConnecting{}).String(), errno.EALREADY) ErrNoPortAvailable = New((&tcpip.ErrNoPortAvailable{}).String(), errno.EAGAIN) ErrPortInUse = New((&tcpip.ErrPortInUse{}).String(), errno.EADDRINUSE) ErrBadLocalAddress = New((&tcpip.ErrBadLocalAddress{}).String(), errno.EADDRNOTAVAIL) ErrClosedForSend = New((&tcpip.ErrClosedForSend{}).String(), errno.EPIPE) ErrClosedForReceive = New((&tcpip.ErrClosedForReceive{}).String(), errno.NOERRNO) ErrTimeout = New((&tcpip.ErrTimeout{}).String(), errno.ETIMEDOUT) ErrAborted = New((&tcpip.ErrAborted{}).String(), errno.EPIPE) ErrConnectStarted = New((&tcpip.ErrConnectStarted{}).String(), errno.EINPROGRESS) ErrDestinationRequired = New((&tcpip.ErrDestinationRequired{}).String(), errno.EDESTADDRREQ) ErrNotSupported = New((&tcpip.ErrNotSupported{}).String(), errno.EOPNOTSUPP) ErrQueueSizeNotSupported = New((&tcpip.ErrQueueSizeNotSupported{}).String(), errno.ENOTTY) ErrNoSuchFile = New((&tcpip.ErrNoSuchFile{}).String(), errno.ENOENT) ErrInvalidOptionValue = New((&tcpip.ErrInvalidOptionValue{}).String(), errno.EINVAL) ErrBroadcastDisabled = New((&tcpip.ErrBroadcastDisabled{}).String(), errno.EACCES) ErrNotPermittedNet = New((&tcpip.ErrNotPermitted{}).String(), errno.EPERM) ErrBadBuffer = New((&tcpip.ErrBadBuffer{}).String(), errno.EFAULT) ErrMalformedHeader = New((&tcpip.ErrMalformedHeader{}).String(), errno.EINVAL) ErrInvalidPortRange = New((&tcpip.ErrInvalidPortRange{}).String(), errno.EINVAL) ErrMulticastInputCannotBeOutput = New((&tcpip.ErrMulticastInputCannotBeOutput{}).String(), errno.EINVAL) ErrMissingRequiredFields = New((&tcpip.ErrMissingRequiredFields{}).String(), errno.EINVAL) ErrNoNet = New((&tcpip.ErrNoNet{}).String(), errno.ENONET) ) // TranslateNetstackError converts an error from the tcpip package to a sentry // internal error. 
func TranslateNetstackError(err tcpip.Error) *Error { switch err.(type) { case nil: return nil case *tcpip.ErrUnknownProtocol: return ErrUnknownProtocol case *tcpip.ErrUnknownNICID: return ErrUnknownNICID case *tcpip.ErrUnknownDevice: return ErrUnknownDevice case *tcpip.ErrUnknownProtocolOption: return ErrUnknownProtocolOption case *tcpip.ErrDuplicateNICID: return ErrDuplicateNICID case *tcpip.ErrDuplicateAddress: return ErrDuplicateAddress case *tcpip.ErrHostUnreachable: return ErrHostUnreachable case *tcpip.ErrHostDown: return ErrHostDown case *tcpip.ErrNoNet: return ErrNoNet case *tcpip.ErrAlreadyBound: return ErrAlreadyBound case *tcpip.ErrInvalidEndpointState: return ErrInvalidEndpointState case *tcpip.ErrAlreadyConnecting: return ErrAlreadyConnecting case *tcpip.ErrAlreadyConnected: return ErrAlreadyConnected case *tcpip.ErrNoPortAvailable: return ErrNoPortAvailable case *tcpip.ErrPortInUse: return ErrPortInUse case *tcpip.ErrBadLocalAddress: return ErrBadLocalAddress case *tcpip.ErrClosedForSend: return ErrClosedForSend case *tcpip.ErrClosedForReceive: return ErrClosedForReceive case *tcpip.ErrWouldBlock: return ErrWouldBlock case *tcpip.ErrConnectionRefused: return ErrConnectionRefused case *tcpip.ErrTimeout: return ErrTimeout case *tcpip.ErrAborted: return ErrAborted case *tcpip.ErrConnectStarted: return ErrConnectStarted case *tcpip.ErrDestinationRequired: return ErrDestinationRequired case *tcpip.ErrNotSupported: return ErrNotSupported case *tcpip.ErrQueueSizeNotSupported: return ErrQueueSizeNotSupported case *tcpip.ErrNotConnected: return ErrNotConnected case *tcpip.ErrConnectionReset: return ErrConnectionReset case *tcpip.ErrConnectionAborted: return ErrConnectionAborted case *tcpip.ErrNoSuchFile: return ErrNoSuchFile case *tcpip.ErrInvalidOptionValue: return ErrInvalidOptionValue case *tcpip.ErrBadAddress: return ErrBadAddress case *tcpip.ErrNetworkUnreachable: return ErrNetworkUnreachable case *tcpip.ErrMessageTooLong: return ErrMessageTooLong case *tcpip.ErrNoBufferSpace: return ErrNoBufferSpace case *tcpip.ErrBroadcastDisabled: return ErrBroadcastDisabled case *tcpip.ErrNotPermitted: return ErrNotPermittedNet case *tcpip.ErrAddressFamilyNotSupported: return ErrAddressFamilyNotSupported case *tcpip.ErrBadBuffer: return ErrBadBuffer case *tcpip.ErrMalformedHeader: return ErrMalformedHeader case *tcpip.ErrInvalidPortRange: return ErrInvalidPortRange case *tcpip.ErrMulticastInputCannotBeOutput: return ErrMulticastInputCannotBeOutput case *tcpip.ErrMissingRequiredFields: return ErrMissingRequiredFields default: panic(fmt.Sprintf("unknown error %T", err)) } } // LINT.ThenChange(../tcpip/errors.go) golang-gvisor-gvisor-0.0~20240729.0/pkg/syserr/syserr.go000066400000000000000000000320371465435605700226360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package syserr contains sandbox-internal errors. 
These errors are distinct // from both the errors returned by host system calls and the errors returned // to sandboxed applications. package syserr import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/errors" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/safecopy" ) // Error represents an internal error. type Error struct { // message is the human readable form of this Error. message string // noTranslation indicates that this Error cannot be translated to a // errno.Errno. noTranslation bool // errno is the errno.Errno this Error should be translated to. errno errno.Errno } // New creates a new Error and adds a translation for it. // // New must only be called at init. func New(message string, linuxTranslation errno.Errno) *Error { err := &Error{message: message, errno: linuxTranslation} // TODO(b/34162363): Remove this. if int(err.errno) >= len(linuxBackwardsTranslations) { panic(fmt.Sprint("invalid errno: ", err.errno)) } e := error(unix.Errno(err.errno)) // linuxerr.ErrWouldBlock gets translated to linuxerr.EWOULDBLOCK and // enables proper blocking semantics. This should temporary address the // class of blocking bugs that keep popping up with the current state of // the error space. if err.errno == linuxerr.EWOULDBLOCK.Errno() { e = linuxerr.ErrWouldBlock } linuxBackwardsTranslations[err.errno] = linuxBackwardsTranslation{err: e, ok: true} return err } // NewDynamic creates a new error with a dynamic error message and an errno // translation. // // NewDynamic should only be used sparingly and not be used for static error // messages. Errors with static error messages should be declared with New as // global variables. func NewDynamic(message string, linuxTranslation errno.Errno) *Error { return &Error{message: message, errno: linuxTranslation} } func newWithHost(message string, linuxTranslation errno.Errno, hostErrno unix.Errno) *Error { e := New(message, linuxTranslation) addHostTranslation(hostErrno, e) return e } // String implements fmt.Stringer.String. func (e *Error) String() string { if e == nil { return "" } return e.message } type linuxBackwardsTranslation struct { err error ok bool } // TODO(b/34162363): Remove this. var linuxBackwardsTranslations [maxErrno]linuxBackwardsTranslation // ToError translates an Error to a corresponding error value. // // TODO(b/34162363): Remove this. func (e *Error) ToError() error { if e == nil { return nil } if e.noTranslation { panic(fmt.Sprintf("error %q does not support translation", e.message)) } err := int(e.errno) if err == errno.NOERRNO { return nil } if err >= len(linuxBackwardsTranslations) || !linuxBackwardsTranslations[err].ok { panic(fmt.Sprintf("unknown error %q (%d)", e.message, err)) } return linuxBackwardsTranslations[err].err } // ToLinux converts the Error to a Linux ABI error that can be returned to the // application. func (e *Error) ToLinux() errno.Errno { if e.noTranslation { panic(fmt.Sprintf("No Linux ABI translation available for %q", e.message)) } return e.errno } // TODO(b/34162363): Remove or replace most of these errors. // // Some of the errors should be replaced with package specific errors and // others should be removed entirely. // // Note that some errors are declared in platform-specific files. 
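// To illustrate how the values below are consumed: an internal operation
// returns one of these *Error values, and the syscall layer converts it to a
// Linux errno (ToLinux) or a Go error (ToError) at the boundary. A minimal
// sketch, where lookup is a hypothetical helper used only for illustration and
// is not part of this package:
//
//	// lookup is a hypothetical example helper.
//	func lookup(name string) *Error {
//		if name == "" {
//			return ErrNoFileOrDir
//		}
//		return nil
//	}
//
//	if e := lookup(""); e != nil {
//		linuxErrno := e.ToLinux() // errno.ENOENT
//		goErr := e.ToError()      // unix.ENOENT
//		_, _ = linuxErrno, goErr
//	}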
var ( ErrNotPermitted = newWithHost("operation not permitted", errno.EPERM, unix.EPERM) ErrNoFileOrDir = newWithHost("no such file or directory", errno.ENOENT, unix.ENOENT) ErrNoProcess = newWithHost("no such process", errno.ESRCH, unix.ESRCH) ErrInterrupted = newWithHost("interrupted system call", errno.EINTR, unix.EINTR) ErrIO = newWithHost("I/O error", errno.EIO, unix.EIO) ErrDeviceOrAddress = newWithHost("no such device or address", errno.ENXIO, unix.ENXIO) ErrTooManyArgs = newWithHost("argument list too long", errno.E2BIG, unix.E2BIG) ErrEcec = newWithHost("exec format error", errno.ENOEXEC, unix.ENOEXEC) ErrBadFD = newWithHost("bad file number", errno.EBADF, unix.EBADF) ErrNoChild = newWithHost("no child processes", errno.ECHILD, unix.ECHILD) ErrTryAgain = newWithHost("try again", errno.EAGAIN, unix.EAGAIN) ErrNoMemory = newWithHost("out of memory", errno.ENOMEM, unix.ENOMEM) ErrPermissionDenied = newWithHost("permission denied", errno.EACCES, unix.EACCES) ErrBadAddress = newWithHost("bad address", errno.EFAULT, unix.EFAULT) ErrNotBlockDevice = newWithHost("block device required", errno.ENOTBLK, unix.ENOTBLK) ErrBusy = newWithHost("device or resource busy", errno.EBUSY, unix.EBUSY) ErrExists = newWithHost("file exists", errno.EEXIST, unix.EEXIST) ErrCrossDeviceLink = newWithHost("cross-device link", errno.EXDEV, unix.EXDEV) ErrNoDevice = newWithHost("no such device", errno.ENODEV, unix.ENODEV) ErrNotDir = newWithHost("not a directory", errno.ENOTDIR, unix.ENOTDIR) ErrIsDir = newWithHost("is a directory", errno.EISDIR, unix.EISDIR) ErrInvalidArgument = newWithHost("invalid argument", errno.EINVAL, unix.EINVAL) ErrFileTableOverflow = newWithHost("file table overflow", errno.ENFILE, unix.ENFILE) ErrTooManyOpenFiles = newWithHost("too many open files", errno.EMFILE, unix.EMFILE) ErrNotTTY = newWithHost("not a typewriter", errno.ENOTTY, unix.ENOTTY) ErrTestFileBusy = newWithHost("text file busy", errno.ETXTBSY, unix.ETXTBSY) ErrFileTooBig = newWithHost("file too large", errno.EFBIG, unix.EFBIG) ErrNoSpace = newWithHost("no space left on device", errno.ENOSPC, unix.ENOSPC) ErrIllegalSeek = newWithHost("illegal seek", errno.ESPIPE, unix.ESPIPE) ErrReadOnlyFS = newWithHost("read-only file system", errno.EROFS, unix.EROFS) ErrTooManyLinks = newWithHost("too many links", errno.EMLINK, unix.EMLINK) ErrBrokenPipe = newWithHost("broken pipe", errno.EPIPE, unix.EPIPE) ErrDomain = newWithHost("math argument out of domain of func", errno.EDOM, unix.EDOM) ErrRange = newWithHost("math result not representable", errno.ERANGE, unix.ERANGE) ErrNameTooLong = newWithHost("file name too long", errno.ENAMETOOLONG, unix.ENAMETOOLONG) ErrNoLocksAvailable = newWithHost("no record locks available", errno.ENOLCK, unix.ENOLCK) ErrInvalidSyscall = newWithHost("invalid system call number", errno.ENOSYS, unix.ENOSYS) ErrDirNotEmpty = newWithHost("directory not empty", errno.ENOTEMPTY, unix.ENOTEMPTY) ErrLinkLoop = newWithHost("too many symbolic links encountered", errno.ELOOP, unix.ELOOP) ErrNoMessage = newWithHost("no message of desired type", errno.ENOMSG, unix.ENOMSG) ErrIdentifierRemoved = newWithHost("identifier removed", errno.EIDRM, unix.EIDRM) ErrNotStream = newWithHost("device not a stream", errno.ENOSTR, unix.ENOSTR) ErrNoDataAvailable = newWithHost("no data available", errno.ENODATA, unix.ENODATA) ErrTimerExpired = newWithHost("timer expired", errno.ETIME, unix.ETIME) ErrStreamsResourceDepleted = newWithHost("out of streams resources", errno.ENOSR, unix.ENOSR) ErrIsRemote = newWithHost("object is remote", 
errno.EREMOTE, unix.EREMOTE) ErrNoLink = newWithHost("link has been severed", errno.ENOLINK, unix.ENOLINK) ErrProtocol = newWithHost("protocol error", errno.EPROTO, unix.EPROTO) ErrMultihopAttempted = newWithHost("multihop attempted", errno.EMULTIHOP, unix.EMULTIHOP) ErrInvalidDataMessage = newWithHost("not a data message", errno.EBADMSG, unix.EBADMSG) ErrOverflow = newWithHost("value too large for defined data type", errno.EOVERFLOW, unix.EOVERFLOW) ErrIllegalByteSequence = newWithHost("illegal byte sequence", errno.EILSEQ, unix.EILSEQ) ErrTooManyUsers = newWithHost("too many users", errno.EUSERS, unix.EUSERS) ErrNotASocket = newWithHost("socket operation on non-socket", errno.ENOTSOCK, unix.ENOTSOCK) ErrDestinationAddressRequired = newWithHost("destination address required", errno.EDESTADDRREQ, unix.EDESTADDRREQ) ErrMessageTooLong = newWithHost("message too long", errno.EMSGSIZE, unix.EMSGSIZE) ErrWrongProtocolForSocket = newWithHost("protocol wrong type for socket", errno.EPROTOTYPE, unix.EPROTOTYPE) ErrProtocolNotAvailable = newWithHost("protocol not available", errno.ENOPROTOOPT, unix.ENOPROTOOPT) ErrProtocolNotSupported = newWithHost("protocol not supported", errno.EPROTONOSUPPORT, unix.EPROTONOSUPPORT) ErrSocketNotSupported = newWithHost("socket type not supported", errno.ESOCKTNOSUPPORT, unix.ESOCKTNOSUPPORT) ErrEndpointOperation = newWithHost("operation not supported on transport endpoint", errno.EOPNOTSUPP, unix.EOPNOTSUPP) ErrProtocolFamilyNotSupported = newWithHost("protocol family not supported", errno.EPFNOSUPPORT, unix.EPFNOSUPPORT) ErrAddressFamilyNotSupported = newWithHost("address family not supported by protocol", errno.EAFNOSUPPORT, unix.EAFNOSUPPORT) ErrAddressInUse = newWithHost("address already in use", errno.EADDRINUSE, unix.EADDRINUSE) ErrAddressNotAvailable = newWithHost("cannot assign requested address", errno.EADDRNOTAVAIL, unix.EADDRNOTAVAIL) ErrNetworkDown = newWithHost("network is down", errno.ENETDOWN, unix.ENETDOWN) ErrNetworkUnreachable = newWithHost("network is unreachable", errno.ENETUNREACH, unix.ENETUNREACH) ErrNetworkReset = newWithHost("network dropped connection because of reset", errno.ENETRESET, unix.ENETRESET) ErrConnectionAborted = newWithHost("software caused connection abort", errno.ECONNABORTED, unix.ECONNABORTED) ErrConnectionReset = newWithHost("connection reset by peer", errno.ECONNRESET, unix.ECONNRESET) ErrNoBufferSpace = newWithHost("no buffer space available", errno.ENOBUFS, unix.ENOBUFS) ErrAlreadyConnected = newWithHost("transport endpoint is already connected", errno.EISCONN, unix.EISCONN) ErrNotConnected = newWithHost("transport endpoint is not connected", errno.ENOTCONN, unix.ENOTCONN) ErrShutdown = newWithHost("cannot send after transport endpoint shutdown", errno.ESHUTDOWN, unix.ESHUTDOWN) ErrTooManyRefs = newWithHost("too many references: cannot splice", errno.ETOOMANYREFS, unix.ETOOMANYREFS) ErrTimedOut = newWithHost("connection timed out", errno.ETIMEDOUT, unix.ETIMEDOUT) ErrConnectionRefused = newWithHost("connection refused", errno.ECONNREFUSED, unix.ECONNREFUSED) ErrHostDown = newWithHost("host is down", errno.EHOSTDOWN, unix.EHOSTDOWN) ErrHostUnreachable = newWithHost("no route to host", errno.EHOSTUNREACH, unix.EHOSTUNREACH) ErrAlreadyInProgress = newWithHost("operation already in progress", errno.EALREADY, unix.EALREADY) ErrInProgress = newWithHost("operation now in progress", errno.EINPROGRESS, unix.EINPROGRESS) ErrStaleFileHandle = newWithHost("stale file handle", errno.ESTALE, unix.ESTALE) ErrQuotaExceeded = 
newWithHost("quota exceeded", errno.EDQUOT, unix.EDQUOT) ErrCanceled = newWithHost("operation canceled", errno.ECANCELED, unix.ECANCELED) ErrOwnerDied = newWithHost("owner died", errno.EOWNERDEAD, unix.EOWNERDEAD) ErrNotRecoverable = newWithHost("state not recoverable", errno.ENOTRECOVERABLE, unix.ENOTRECOVERABLE) // ErrWouldBlock translates to EWOULDBLOCK which is the same as EAGAIN // on Linux. ErrWouldBlock = New("operation would block", errno.EWOULDBLOCK) ) // FromError converts a generic error to an *Error. // // TODO(b/34162363): Remove this function. func FromError(err error) *Error { if err == nil { return nil } switch e := err.(type) { case unix.Errno: return FromHost(e) case *errors.Error: return FromHost(unix.Errno(e.Errno())) case safecopy.SegvError, safecopy.BusError, safecopy.AlignmentError: return FromHost(unix.EFAULT) } msg := fmt.Sprintf("err: %s type: %T", err.Error(), err) panic(msg) } golang-gvisor-gvisor-0.0~20240729.0/pkg/syserr/syserr_linux_state_autogen.go000066400000000000000000000001321465435605700267660ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux // +build linux package syserr golang-gvisor-gvisor-0.0~20240729.0/pkg/syserr/syserr_state_autogen.go000066400000000000000000000001341465435605700255510ustar00rootroot00000000000000// automatically generated by stateify. //go:build darwin // +build darwin package syserr golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/000077500000000000000000000000001465435605700205435ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/adapters/000077500000000000000000000000001465435605700223465ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/adapters/gonet/000077500000000000000000000000001465435605700234625ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/adapters/gonet/gonet.go000066400000000000000000000435361465435605700251400ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package gonet provides a Go net package compatible wrapper for a tcpip stack. package gonet import ( "bytes" "context" "errors" "fmt" "io" "net" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/waiter" ) var ( errCanceled = errors.New("operation canceled") errWouldBlock = errors.New("operation would block") ) // timeoutError is how the net package reports timeouts. type timeoutError struct{} func (e *timeoutError) Error() string { return "i/o timeout" } func (e *timeoutError) Timeout() bool { return true } func (e *timeoutError) Temporary() bool { return true } // A TCPListener is a wrapper around a TCP tcpip.Endpoint that implements // net.Listener. 
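//
// A hedged usage sketch (illustrative; assumes an already-configured
// *stack.Stack named s and NIC 1):
//
//	ln, err := gonet.ListenTCP(s, tcpip.FullAddress{NIC: 1, Port: 8080}, ipv4.ProtocolNumber)
//	if err != nil {
//		log.Fatal(err)
//	}
//	for {
//		conn, err := ln.Accept()
//		if err != nil {
//			break
//		}
//		go io.Copy(conn, conn) // trivial echo handler
//	}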
type TCPListener struct { stack *stack.Stack ep tcpip.Endpoint wq *waiter.Queue cancelOnce sync.Once cancel chan struct{} } // NewTCPListener creates a new TCPListener from a listening tcpip.Endpoint. func NewTCPListener(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *TCPListener { return &TCPListener{ stack: s, ep: ep, wq: wq, cancel: make(chan struct{}), } } // maxListenBacklog is set to be reasonably high for most uses of gonet. Go net // package uses the value in /proc/sys/net/core/somaxconn file in Linux as the // default listen backlog. The value below matches the default in common linux // distros. // // See: https://cs.opensource.google/go/go/+/refs/tags/go1.18.1:src/net/sock_linux.go;drc=refs%2Ftags%2Fgo1.18.1;l=66 const maxListenBacklog = 4096 // ListenTCP creates a new TCPListener. func ListenTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPListener, error) { // Create a TCP endpoint, bind it, then start listening. var wq waiter.Queue ep, err := s.NewEndpoint(tcp.ProtocolNumber, network, &wq) if err != nil { return nil, errors.New(err.String()) } if err := ep.Bind(addr); err != nil { ep.Close() return nil, &net.OpError{ Op: "bind", Net: "tcp", Addr: fullToTCPAddr(addr), Err: errors.New(err.String()), } } if err := ep.Listen(maxListenBacklog); err != nil { ep.Close() return nil, &net.OpError{ Op: "listen", Net: "tcp", Addr: fullToTCPAddr(addr), Err: errors.New(err.String()), } } return NewTCPListener(s, &wq, ep), nil } // Close implements net.Listener.Close. func (l *TCPListener) Close() error { l.ep.Close() return nil } // Shutdown stops the HTTP server. func (l *TCPListener) Shutdown() { l.ep.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead) l.cancelOnce.Do(func() { close(l.cancel) // broadcast cancellation }) } // Addr implements net.Listener.Addr. func (l *TCPListener) Addr() net.Addr { a, err := l.ep.GetLocalAddress() if err != nil { return nil } return fullToTCPAddr(a) } type deadlineTimer struct { // mu protects the fields below. mu sync.Mutex readTimer *time.Timer readCancelCh chan struct{} writeTimer *time.Timer writeCancelCh chan struct{} } func (d *deadlineTimer) init() { d.readCancelCh = make(chan struct{}) d.writeCancelCh = make(chan struct{}) } func (d *deadlineTimer) readCancel() <-chan struct{} { d.mu.Lock() c := d.readCancelCh d.mu.Unlock() return c } func (d *deadlineTimer) writeCancel() <-chan struct{} { d.mu.Lock() c := d.writeCancelCh d.mu.Unlock() return c } // setDeadline contains the shared logic for setting a deadline. // // cancelCh and timer must be pointers to deadlineTimer.readCancelCh and // deadlineTimer.readTimer or deadlineTimer.writeCancelCh and // deadlineTimer.writeTimer. // // setDeadline must only be called while holding d.mu. func (d *deadlineTimer) setDeadline(cancelCh *chan struct{}, timer **time.Timer, t time.Time) { if *timer != nil && !(*timer).Stop() { *cancelCh = make(chan struct{}) } // Create a new channel if we already closed it due to setting an already // expired time. We won't race with the timer because we already handled // that above. select { case <-*cancelCh: *cancelCh = make(chan struct{}) default: } // "A zero value for t means I/O operations will not time out." // - net.Conn.SetDeadline if t.IsZero() { *timer = nil return } timeout := t.Sub(time.Now()) if timeout <= 0 { close(*cancelCh) return } // Timer.Stop returns whether or not the AfterFunc has started, but // does not indicate whether or not it has completed. 
Make a copy of // the cancel channel to prevent this code from racing with the next // call of setDeadline replacing *cancelCh. ch := *cancelCh *timer = time.AfterFunc(timeout, func() { close(ch) }) } // SetReadDeadline implements net.Conn.SetReadDeadline and // net.PacketConn.SetReadDeadline. func (d *deadlineTimer) SetReadDeadline(t time.Time) error { d.mu.Lock() d.setDeadline(&d.readCancelCh, &d.readTimer, t) d.mu.Unlock() return nil } // SetWriteDeadline implements net.Conn.SetWriteDeadline and // net.PacketConn.SetWriteDeadline. func (d *deadlineTimer) SetWriteDeadline(t time.Time) error { d.mu.Lock() d.setDeadline(&d.writeCancelCh, &d.writeTimer, t) d.mu.Unlock() return nil } // SetDeadline implements net.Conn.SetDeadline and net.PacketConn.SetDeadline. func (d *deadlineTimer) SetDeadline(t time.Time) error { d.mu.Lock() d.setDeadline(&d.readCancelCh, &d.readTimer, t) d.setDeadline(&d.writeCancelCh, &d.writeTimer, t) d.mu.Unlock() return nil } // A TCPConn is a wrapper around a TCP tcpip.Endpoint that implements the net.Conn // interface. type TCPConn struct { deadlineTimer wq *waiter.Queue ep tcpip.Endpoint // readMu serializes reads and implicitly protects read. // // Lock ordering: // If both readMu and deadlineTimer.mu are to be used in a single // request, readMu must be acquired before deadlineTimer.mu. readMu sync.Mutex // read contains bytes that have been read from the endpoint, // but haven't yet been returned. read []byte } // NewTCPConn creates a new TCPConn. func NewTCPConn(wq *waiter.Queue, ep tcpip.Endpoint) *TCPConn { c := &TCPConn{ wq: wq, ep: ep, } c.deadlineTimer.init() return c } // Accept implements net.Conn.Accept. func (l *TCPListener) Accept() (net.Conn, error) { n, wq, err := l.ep.Accept(nil) if _, ok := err.(*tcpip.ErrWouldBlock); ok { // Create wait queue entry that notifies a channel. waitEntry, notifyCh := waiter.NewChannelEntry(waiter.ReadableEvents) l.wq.EventRegister(&waitEntry) defer l.wq.EventUnregister(&waitEntry) for { n, wq, err = l.ep.Accept(nil) if _, ok := err.(*tcpip.ErrWouldBlock); !ok { break } select { case <-l.cancel: return nil, errCanceled case <-notifyCh: } } } if err != nil { return nil, &net.OpError{ Op: "accept", Net: "tcp", Addr: l.Addr(), Err: errors.New(err.String()), } } return NewTCPConn(wq, n), nil } type opErrorer interface { newOpError(op string, err error) *net.OpError } // commonRead implements the common logic between net.Conn.Read and // net.PacketConn.ReadFrom. func commonRead(b []byte, ep tcpip.Endpoint, wq *waiter.Queue, deadline <-chan struct{}, addr *tcpip.FullAddress, errorer opErrorer) (int, error) { select { case <-deadline: return 0, errorer.newOpError("read", &timeoutError{}) default: } w := tcpip.SliceWriter(b) opts := tcpip.ReadOptions{NeedRemoteAddr: addr != nil} res, err := ep.Read(&w, opts) if _, ok := err.(*tcpip.ErrWouldBlock); ok { // Create wait queue entry that notifies a channel. waitEntry, notifyCh := waiter.NewChannelEntry(waiter.ReadableEvents) wq.EventRegister(&waitEntry) defer wq.EventUnregister(&waitEntry) for { res, err = ep.Read(&w, opts) if _, ok := err.(*tcpip.ErrWouldBlock); !ok { break } select { case <-deadline: return 0, errorer.newOpError("read", &timeoutError{}) case <-notifyCh: } } } if _, ok := err.(*tcpip.ErrClosedForReceive); ok { return 0, io.EOF } if err != nil { return 0, errorer.newOpError("read", errors.New(err.String())) } if addr != nil { *addr = res.RemoteAddr } return res.Count, nil } // Read implements net.Conn.Read. 
func (c *TCPConn) Read(b []byte) (int, error) { c.readMu.Lock() defer c.readMu.Unlock() deadline := c.readCancel() n, err := commonRead(b, c.ep, c.wq, deadline, nil, c) if n != 0 { c.ep.ModerateRecvBuf(n) } return n, err } // Write implements net.Conn.Write. func (c *TCPConn) Write(b []byte) (int, error) { deadline := c.writeCancel() // Check if deadlineTimer has already expired. select { case <-deadline: return 0, c.newOpError("write", &timeoutError{}) default: } // We must handle two soft failure conditions simultaneously: // 1. Write may write nothing and return *tcpip.ErrWouldBlock. // If this happens, we need to register for notifications if we have // not already and wait to try again. // 2. Write may write fewer than the full number of bytes and return // without error. In this case we need to try writing the remaining // bytes again. I do not need to register for notifications. // // What is more, these two soft failure conditions can be interspersed. // There is no guarantee that all of the condition #1s will occur before // all of the condition #2s or visa-versa. var ( r bytes.Reader nbytes int entry waiter.Entry ch <-chan struct{} ) for nbytes != len(b) { r.Reset(b[nbytes:]) n, err := c.ep.Write(&r, tcpip.WriteOptions{}) nbytes += int(n) switch err.(type) { case nil: case *tcpip.ErrWouldBlock: if ch == nil { entry, ch = waiter.NewChannelEntry(waiter.WritableEvents) c.wq.EventRegister(&entry) defer c.wq.EventUnregister(&entry) } else { // Don't wait immediately after registration in case more data // became available between when we last checked and when we setup // the notification. select { case <-deadline: return nbytes, c.newOpError("write", &timeoutError{}) case <-ch: continue } } default: return nbytes, c.newOpError("write", errors.New(err.String())) } } return nbytes, nil } // Close implements net.Conn.Close. func (c *TCPConn) Close() error { c.ep.Close() return nil } // CloseRead shuts down the reading side of the TCP connection. Most callers // should just use Close. // // A TCP Half-Close is performed the same as CloseRead for *net.TCPConn. func (c *TCPConn) CloseRead() error { if terr := c.ep.Shutdown(tcpip.ShutdownRead); terr != nil { return c.newOpError("close", errors.New(terr.String())) } return nil } // CloseWrite shuts down the writing side of the TCP connection. Most callers // should just use Close. // // A TCP Half-Close is performed the same as CloseWrite for *net.TCPConn. func (c *TCPConn) CloseWrite() error { if terr := c.ep.Shutdown(tcpip.ShutdownWrite); terr != nil { return c.newOpError("close", errors.New(terr.String())) } return nil } // LocalAddr implements net.Conn.LocalAddr. func (c *TCPConn) LocalAddr() net.Addr { a, err := c.ep.GetLocalAddress() if err != nil { return nil } return fullToTCPAddr(a) } // RemoteAddr implements net.Conn.RemoteAddr. func (c *TCPConn) RemoteAddr() net.Addr { a, err := c.ep.GetRemoteAddress() if err != nil { return nil } return fullToTCPAddr(a) } func (c *TCPConn) newOpError(op string, err error) *net.OpError { return &net.OpError{ Op: op, Net: "tcp", Source: c.LocalAddr(), Addr: c.RemoteAddr(), Err: err, } } func fullToTCPAddr(addr tcpip.FullAddress) *net.TCPAddr { return &net.TCPAddr{IP: net.IP(addr.Addr.AsSlice()), Port: int(addr.Port)} } func fullToUDPAddr(addr tcpip.FullAddress) *net.UDPAddr { return &net.UDPAddr{IP: net.IP(addr.Addr.AsSlice()), Port: int(addr.Port)} } // DialTCP creates a new TCPConn connected to the specified address. 
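//
// A hedged usage sketch (illustrative; s is an assumed, already-configured
// *stack.Stack and 10.0.0.2:80 an assumed reachable peer):
//
//	raddr := tcpip.FullAddress{NIC: 1, Addr: tcpip.AddrFromSlice([]byte{10, 0, 0, 2}), Port: 80}
//	conn, err := gonet.DialTCP(s, raddr, ipv4.ProtocolNumber)
//	if err != nil {
//		return err
//	}
//	defer conn.Close()
//	_, err = conn.Write([]byte("GET / HTTP/1.0\r\n\r\n"))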
func DialTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPConn, error) { return DialContextTCP(context.Background(), s, addr, network) } // DialTCPWithBind creates a new TCPConn connected to the specified // remoteAddress with its local address bound to localAddr. func DialTCPWithBind(ctx context.Context, s *stack.Stack, localAddr, remoteAddr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPConn, error) { // Create TCP endpoint, then connect. var wq waiter.Queue ep, err := s.NewEndpoint(tcp.ProtocolNumber, network, &wq) if err != nil { return nil, errors.New(err.String()) } // Create wait queue entry that notifies a channel. // // We do this unconditionally as Connect will always return an error. waitEntry, notifyCh := waiter.NewChannelEntry(waiter.WritableEvents) wq.EventRegister(&waitEntry) defer wq.EventUnregister(&waitEntry) select { case <-ctx.Done(): return nil, ctx.Err() default: } // Bind before connect if requested. if localAddr != (tcpip.FullAddress{}) { if err = ep.Bind(localAddr); err != nil { return nil, fmt.Errorf("ep.Bind(%+v) = %s", localAddr, err) } } err = ep.Connect(remoteAddr) if _, ok := err.(*tcpip.ErrConnectStarted); ok { select { case <-ctx.Done(): ep.Close() return nil, ctx.Err() case <-notifyCh: } err = ep.LastError() } if err != nil { ep.Close() return nil, &net.OpError{ Op: "connect", Net: "tcp", Addr: fullToTCPAddr(remoteAddr), Err: errors.New(err.String()), } } return NewTCPConn(&wq, ep), nil } // DialContextTCP creates a new TCPConn connected to the specified address // with the option of adding cancellation and timeouts. func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPConn, error) { return DialTCPWithBind(ctx, s, tcpip.FullAddress{} /* localAddr */, addr /* remoteAddr */, network) } // A UDPConn is a wrapper around a UDP tcpip.Endpoint that implements // net.Conn and net.PacketConn. type UDPConn struct { deadlineTimer ep tcpip.Endpoint wq *waiter.Queue } // NewUDPConn creates a new UDPConn. func NewUDPConn(wq *waiter.Queue, ep tcpip.Endpoint) *UDPConn { c := &UDPConn{ ep: ep, wq: wq, } c.deadlineTimer.init() return c } // DialUDP creates a new UDPConn. // // If laddr is nil, a local address is automatically chosen. // // If raddr is nil, the UDPConn is left unconnected. func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*UDPConn, error) { var wq waiter.Queue ep, err := s.NewEndpoint(udp.ProtocolNumber, network, &wq) if err != nil { return nil, errors.New(err.String()) } if laddr != nil { if err := ep.Bind(*laddr); err != nil { ep.Close() return nil, &net.OpError{ Op: "bind", Net: "udp", Addr: fullToUDPAddr(*laddr), Err: errors.New(err.String()), } } } c := NewUDPConn(&wq, ep) if raddr != nil { if err := c.ep.Connect(*raddr); err != nil { c.ep.Close() return nil, &net.OpError{ Op: "connect", Net: "udp", Addr: fullToUDPAddr(*raddr), Err: errors.New(err.String()), } } } return c, nil } func (c *UDPConn) newOpError(op string, err error) *net.OpError { return c.newRemoteOpError(op, nil, err) } func (c *UDPConn) newRemoteOpError(op string, remote net.Addr, err error) *net.OpError { return &net.OpError{ Op: op, Net: "udp", Source: c.LocalAddr(), Addr: remote, Err: err, } } // RemoteAddr implements net.Conn.RemoteAddr. 
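//
// A hedged usage sketch for DialUDP above (illustrative; s is an assumed,
// already-configured *stack.Stack). A nil laddr lets the stack pick the local
// address; a non-nil raddr connects the endpoint so plain Write can be used:
//
//	raddr := &tcpip.FullAddress{NIC: 1, Addr: tcpip.AddrFromSlice([]byte{10, 0, 0, 2}), Port: 53}
//	c, err := gonet.DialUDP(s, nil, raddr, ipv4.ProtocolNumber)
//	if err != nil {
//		return err
//	}
//	defer c.Close()
//	_, err = c.Write([]byte("ping"))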
func (c *UDPConn) RemoteAddr() net.Addr { a, err := c.ep.GetRemoteAddress() if err != nil { return nil } return fullToUDPAddr(a) } // Read implements net.Conn.Read func (c *UDPConn) Read(b []byte) (int, error) { bytesRead, _, err := c.ReadFrom(b) return bytesRead, err } // ReadFrom implements net.PacketConn.ReadFrom. func (c *UDPConn) ReadFrom(b []byte) (int, net.Addr, error) { deadline := c.readCancel() var addr tcpip.FullAddress n, err := commonRead(b, c.ep, c.wq, deadline, &addr, c) if err != nil { return 0, nil, err } return n, fullToUDPAddr(addr), nil } func (c *UDPConn) Write(b []byte) (int, error) { return c.WriteTo(b, nil) } // WriteTo implements net.PacketConn.WriteTo. func (c *UDPConn) WriteTo(b []byte, addr net.Addr) (int, error) { deadline := c.writeCancel() // Check if deadline has already expired. select { case <-deadline: return 0, c.newRemoteOpError("write", addr, &timeoutError{}) default: } // If we're being called by Write, there is no addr writeOptions := tcpip.WriteOptions{} if addr != nil { ua := addr.(*net.UDPAddr) writeOptions.To = &tcpip.FullAddress{ Addr: tcpip.AddrFromSlice(ua.IP), Port: uint16(ua.Port), } } var r bytes.Reader r.Reset(b) n, err := c.ep.Write(&r, writeOptions) if _, ok := err.(*tcpip.ErrWouldBlock); ok { // Create wait queue entry that notifies a channel. waitEntry, notifyCh := waiter.NewChannelEntry(waiter.WritableEvents) c.wq.EventRegister(&waitEntry) defer c.wq.EventUnregister(&waitEntry) for { select { case <-deadline: return int(n), c.newRemoteOpError("write", addr, &timeoutError{}) case <-notifyCh: } n, err = c.ep.Write(&r, writeOptions) if _, ok := err.(*tcpip.ErrWouldBlock); !ok { break } } } if err == nil { return int(n), nil } return int(n), c.newRemoteOpError("write", addr, errors.New(err.String())) } // Close implements net.PacketConn.Close. func (c *UDPConn) Close() error { c.ep.Close() return nil } // LocalAddr implements net.PacketConn.LocalAddr. func (c *UDPConn) LocalAddr() net.Addr { a, err := c.ep.GetLocalAddress() if err != nil { return nil } return fullToUDPAddr(a) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/adapters/gonet/gonet_state_autogen.go000066400000000000000000000000671465435605700300520ustar00rootroot00000000000000// automatically generated by stateify. package gonet golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/checksum/000077500000000000000000000000001465435605700223455ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/checksum/checksum.go000066400000000000000000000036711465435605700245050ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package checksum provides the implementation of the encoding and decoding of // network protocol headers. package checksum import ( "encoding/binary" ) // Size is the size of a checksum. // // The checksum is held in a uint16 which is 2 bytes. const Size = 2 // Put puts the checksum in the provided byte slice. 
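//
// A hedged usage sketch of this package (illustrative; hdr, payload,
// pseudoHeaderSum and xsumField are hypothetical caller-side names):
//
//	var c checksum.Checksumer
//	c.Add(hdr)     // incremental RFC 1071 sum over the header bytes
//	c.Add(payload) // ...and the payload bytes
//	xsum := checksum.Combine(c.Checksum(), pseudoHeaderSum)
//	checksum.Put(xsumField, ^xsum) // the one's complement goes on the wire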
func Put(b []byte, xsum uint16) { binary.BigEndian.PutUint16(b, xsum) } // Checksum calculates the checksum (as defined in RFC 1071) of the bytes in the // given byte array. This function uses an optimized version of the checksum // algorithm. // // The initial checksum must have been computed on an even number of bytes. func Checksum(buf []byte, initial uint16) uint16 { s, _ := calculateChecksum(buf, false, initial) return s } // Checksumer calculates checksum defined in RFC 1071. type Checksumer struct { sum uint16 odd bool } // Add adds b to checksum. func (c *Checksumer) Add(b []byte) { if len(b) > 0 { c.sum, c.odd = calculateChecksum(b, c.odd, c.sum) } } // Checksum returns the latest checksum value. func (c *Checksumer) Checksum() uint16 { return c.sum } // Combine combines the two uint16 to form their checksum. This is done // by adding them and the carry. // // Note that checksum a must have been computed on an even number of bytes. func Combine(a, b uint16) uint16 { v := uint32(a) + uint32(b) return uint16(v + v>>16) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/checksum/checksum_state_autogen.go000066400000000000000000000000721465435605700274170ustar00rootroot00000000000000// automatically generated by stateify. package checksum golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/checksum/checksum_unsafe.go000066400000000000000000000143121465435605700260400ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package checksum import ( "encoding/binary" "math/bits" "unsafe" ) // Note: odd indicates whether initial is a partial checksum over an odd number // of bytes. func calculateChecksum(buf []byte, odd bool, initial uint16) (uint16, bool) { // Use a larger-than-uint16 accumulator to benefit from parallel summation // as described in RFC 1071 1.2.C. acc := uint64(initial) // Handle an odd number of previously-summed bytes, and get the return // value for odd. if odd { acc += uint64(buf[0]) buf = buf[1:] } odd = len(buf)&1 != 0 // Aligning &buf[0] below is much simpler if len(buf) >= 8; special-case // smaller bufs. if len(buf) < 8 { if len(buf) >= 4 { acc += (uint64(buf[0]) << 8) + uint64(buf[1]) acc += (uint64(buf[2]) << 8) + uint64(buf[3]) buf = buf[4:] } if len(buf) >= 2 { acc += (uint64(buf[0]) << 8) + uint64(buf[1]) buf = buf[2:] } if len(buf) >= 1 { acc += uint64(buf[0]) << 8 // buf = buf[1:] is skipped because it's unused and nogo will // complain. } return reduce(acc), odd } // On little-endian architectures, multi-byte loads from buf will load // bytes in the wrong order. Rather than byte-swap after each load (slow), // we byte-swap the accumulator before summing any bytes and byte-swap it // back before returning, which still produces the correct result as // described in RFC 1071 1.2.B "Byte Order Independence". // // acc is at most a uint16 + a uint8, so its upper 32 bits must be 0s. 
We // preserve this property by byte-swapping only the lower 32 bits of acc, // so that additions to acc performed during alignment can't overflow. acc = uint64(bswapIfLittleEndian32(uint32(acc))) // Align &buf[0] to an 8-byte boundary. bswapped := false if sliceAddr(buf)&1 != 0 { // Compute the rest of the partial checksum with bytes swapped, and // swap back before returning; see the last paragraph of // RFC 1071 1.2.B. acc = uint64(bits.ReverseBytes32(uint32(acc))) bswapped = true // No `<< 8` here due to the byte swap we just did. acc += uint64(bswapIfLittleEndian16(uint16(buf[0]))) buf = buf[1:] } if sliceAddr(buf)&2 != 0 { acc += uint64(*(*uint16)(unsafe.Pointer(&buf[0]))) buf = buf[2:] } if sliceAddr(buf)&4 != 0 { acc += uint64(*(*uint32)(unsafe.Pointer(&buf[0]))) buf = buf[4:] } // Sum 64 bytes at a time. Beyond this point, additions to acc may // overflow, so we have to handle carrying. for len(buf) >= 64 { var carry uint64 acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[0])), 0) acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[8])), carry) acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[16])), carry) acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[24])), carry) acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[32])), carry) acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[40])), carry) acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[48])), carry) acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[56])), carry) acc, _ = bits.Add64(acc, 0, carry) buf = buf[64:] } // Sum the remaining 0-63 bytes. if len(buf) >= 32 { var carry uint64 acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[0])), 0) acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[8])), carry) acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[16])), carry) acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[24])), carry) acc, _ = bits.Add64(acc, 0, carry) buf = buf[32:] } if len(buf) >= 16 { var carry uint64 acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[0])), 0) acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[8])), carry) acc, _ = bits.Add64(acc, 0, carry) buf = buf[16:] } if len(buf) >= 8 { var carry uint64 acc, carry = bits.Add64(acc, *(*uint64)(unsafe.Pointer(&buf[0])), 0) acc, _ = bits.Add64(acc, 0, carry) buf = buf[8:] } if len(buf) >= 4 { var carry uint64 acc, carry = bits.Add64(acc, uint64(*(*uint32)(unsafe.Pointer(&buf[0]))), 0) acc, _ = bits.Add64(acc, 0, carry) buf = buf[4:] } if len(buf) >= 2 { var carry uint64 acc, carry = bits.Add64(acc, uint64(*(*uint16)(unsafe.Pointer(&buf[0]))), 0) acc, _ = bits.Add64(acc, 0, carry) buf = buf[2:] } if len(buf) >= 1 { // bswapIfBigEndian16(buf[0]) == bswapIfLittleEndian16(buf[0]<<8). var carry uint64 acc, carry = bits.Add64(acc, uint64(bswapIfBigEndian16(uint16(buf[0]))), 0) acc, _ = bits.Add64(acc, 0, carry) // buf = buf[1:] is skipped because it's unused and nogo will complain. } // Reduce the checksum to 16 bits and undo byte swaps before returning. acc16 := bswapIfLittleEndian16(reduce(acc)) if bswapped { acc16 = bits.ReverseBytes16(acc16) } return acc16, odd } func reduce(acc uint64) uint16 { // Ideally we would do: // return uint16(acc>>48) +' uint16(acc>>32) +' uint16(acc>>16) +' uint16(acc) // for more instruction-level parallelism; however, there is no // bits.Add16(). 
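	// A worked illustration added for clarity (not in the original source):
	// reducing acc = 0x1_2345_6789 folds as
	//	0x1 + 0x2345_6789 = 0x2345_678A   (fold the upper 32 bits)
	//	0x2345 + 0x678A   = 0x8ACF        (fold the upper 16 bits)
	// which equals the one's-complement sum 0x0001 + 0x2345 + 0x6789.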
acc = (acc >> 32) + (acc & 0xffff_ffff) // at most 0x1_ffff_fffe acc32 := uint32(acc>>32 + acc) // at most 0xffff_ffff acc32 = (acc32 >> 16) + (acc32 & 0xffff) // at most 0x1_fffe return uint16(acc32>>16 + acc32) // at most 0xffff } func bswapIfLittleEndian32(val uint32) uint32 { return binary.BigEndian.Uint32((*[4]byte)(unsafe.Pointer(&val))[:]) } func bswapIfLittleEndian16(val uint16) uint16 { return binary.BigEndian.Uint16((*[2]byte)(unsafe.Pointer(&val))[:]) } func bswapIfBigEndian16(val uint16) uint16 { return binary.LittleEndian.Uint16((*[2]byte)(unsafe.Pointer(&val))[:]) } func sliceAddr(buf []byte) uintptr { return uintptr(unsafe.Pointer(unsafe.SliceData(buf))) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/checksum/checksum_unsafe_state_autogen.go000066400000000000000000000000721465435605700307600ustar00rootroot00000000000000// automatically generated by stateify. package checksum golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/errors.go000066400000000000000000000377321465435605700224220ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcpip import ( "fmt" ) // Error represents an error in the netstack error space. // // The error interface is intentionally omitted to avoid loss of type // information that would occur if these errors were passed as error. type Error interface { isError() // IgnoreStats indicates whether this error should be included in failure // counts in tcpip.Stats structs. IgnoreStats() bool fmt.Stringer } const maxErrno = 134 // LINT.IfChange // ErrAborted indicates the operation was aborted. // // +stateify savable type ErrAborted struct{} func (*ErrAborted) isError() {} // IgnoreStats implements Error. func (*ErrAborted) IgnoreStats() bool { return false } func (*ErrAborted) String() string { return "operation aborted" } // ErrAddressFamilyNotSupported indicates the operation does not support the // given address family. // // +stateify savable type ErrAddressFamilyNotSupported struct{} func (*ErrAddressFamilyNotSupported) isError() {} // IgnoreStats implements Error. func (*ErrAddressFamilyNotSupported) IgnoreStats() bool { return false } func (*ErrAddressFamilyNotSupported) String() string { return "address family not supported by protocol" } // ErrAlreadyBound indicates the endpoint is already bound. // // +stateify savable type ErrAlreadyBound struct{} func (*ErrAlreadyBound) isError() {} // IgnoreStats implements Error. func (*ErrAlreadyBound) IgnoreStats() bool { return true } func (*ErrAlreadyBound) String() string { return "endpoint already bound" } // ErrAlreadyConnected indicates the endpoint is already connected. // // +stateify savable type ErrAlreadyConnected struct{} func (*ErrAlreadyConnected) isError() {} // IgnoreStats implements Error. func (*ErrAlreadyConnected) IgnoreStats() bool { return true } func (*ErrAlreadyConnected) String() string { return "endpoint is already connected" } // ErrAlreadyConnecting indicates the endpoint is already connecting. 
// // +stateify savable type ErrAlreadyConnecting struct{} func (*ErrAlreadyConnecting) isError() {} // IgnoreStats implements Error. func (*ErrAlreadyConnecting) IgnoreStats() bool { return true } func (*ErrAlreadyConnecting) String() string { return "endpoint is already connecting" } // ErrBadAddress indicates a bad address was provided. // // +stateify savable type ErrBadAddress struct{} func (*ErrBadAddress) isError() {} // IgnoreStats implements Error. func (*ErrBadAddress) IgnoreStats() bool { return false } func (*ErrBadAddress) String() string { return "bad address" } // ErrBadBuffer indicates a bad buffer was provided. // // +stateify savable type ErrBadBuffer struct{} func (*ErrBadBuffer) isError() {} // IgnoreStats implements Error. func (*ErrBadBuffer) IgnoreStats() bool { return false } func (*ErrBadBuffer) String() string { return "bad buffer" } // ErrBadLocalAddress indicates a bad local address was provided. // // +stateify savable type ErrBadLocalAddress struct{} func (*ErrBadLocalAddress) isError() {} // IgnoreStats implements Error. func (*ErrBadLocalAddress) IgnoreStats() bool { return false } func (*ErrBadLocalAddress) String() string { return "bad local address" } // ErrBroadcastDisabled indicates broadcast is not enabled on the endpoint. // // +stateify savable type ErrBroadcastDisabled struct{} func (*ErrBroadcastDisabled) isError() {} // IgnoreStats implements Error. func (*ErrBroadcastDisabled) IgnoreStats() bool { return false } func (*ErrBroadcastDisabled) String() string { return "broadcast socket option disabled" } // ErrClosedForReceive indicates the endpoint is closed for incoming data. // // +stateify savable type ErrClosedForReceive struct{} func (*ErrClosedForReceive) isError() {} // IgnoreStats implements Error. func (*ErrClosedForReceive) IgnoreStats() bool { return false } func (*ErrClosedForReceive) String() string { return "endpoint is closed for receive" } // ErrClosedForSend indicates the endpoint is closed for outgoing data. // // +stateify savable type ErrClosedForSend struct{} func (*ErrClosedForSend) isError() {} // IgnoreStats implements Error. func (*ErrClosedForSend) IgnoreStats() bool { return false } func (*ErrClosedForSend) String() string { return "endpoint is closed for send" } // ErrConnectStarted indicates the endpoint is connecting asynchronously. // // +stateify savable type ErrConnectStarted struct{} func (*ErrConnectStarted) isError() {} // IgnoreStats implements Error. func (*ErrConnectStarted) IgnoreStats() bool { return true } func (*ErrConnectStarted) String() string { return "connection attempt started" } // ErrConnectionAborted indicates the connection was aborted. // // +stateify savable type ErrConnectionAborted struct{} func (*ErrConnectionAborted) isError() {} // IgnoreStats implements Error. func (*ErrConnectionAborted) IgnoreStats() bool { return false } func (*ErrConnectionAborted) String() string { return "connection aborted" } // ErrConnectionRefused indicates the connection was refused. // // +stateify savable type ErrConnectionRefused struct{} func (*ErrConnectionRefused) isError() {} // IgnoreStats implements Error. func (*ErrConnectionRefused) IgnoreStats() bool { return false } func (*ErrConnectionRefused) String() string { return "connection was refused" } // ErrConnectionReset indicates the connection was reset. // // +stateify savable type ErrConnectionReset struct{} func (*ErrConnectionReset) isError() {} // IgnoreStats implements Error. 
func (*ErrConnectionReset) IgnoreStats() bool { return false } func (*ErrConnectionReset) String() string { return "connection reset by peer" } // ErrDestinationRequired indicates the operation requires a destination // address, and one was not provided. // // +stateify savable type ErrDestinationRequired struct{} func (*ErrDestinationRequired) isError() {} // IgnoreStats implements Error. func (*ErrDestinationRequired) IgnoreStats() bool { return false } func (*ErrDestinationRequired) String() string { return "destination address is required" } // ErrDuplicateAddress indicates the operation encountered a duplicate address. // // +stateify savable type ErrDuplicateAddress struct{} func (*ErrDuplicateAddress) isError() {} // IgnoreStats implements Error. func (*ErrDuplicateAddress) IgnoreStats() bool { return false } func (*ErrDuplicateAddress) String() string { return "duplicate address" } // ErrDuplicateNICID indicates the operation encountered a duplicate NIC ID. // // +stateify savable type ErrDuplicateNICID struct{} func (*ErrDuplicateNICID) isError() {} // IgnoreStats implements Error. func (*ErrDuplicateNICID) IgnoreStats() bool { return false } func (*ErrDuplicateNICID) String() string { return "duplicate nic id" } // ErrInvalidNICID indicates the operation used an invalid NIC ID. // // +stateify savable type ErrInvalidNICID struct{} func (*ErrInvalidNICID) isError() {} // IgnoreStats implements Error. func (*ErrInvalidNICID) IgnoreStats() bool { return false } func (*ErrInvalidNICID) String() string { return "invalid nic id" } // ErrInvalidEndpointState indicates the endpoint is in an invalid state. // // +stateify savable type ErrInvalidEndpointState struct{} func (*ErrInvalidEndpointState) isError() {} // IgnoreStats implements Error. func (*ErrInvalidEndpointState) IgnoreStats() bool { return false } func (*ErrInvalidEndpointState) String() string { return "endpoint is in invalid state" } // ErrInvalidOptionValue indicates an invalid option value was provided. // // +stateify savable type ErrInvalidOptionValue struct{} func (*ErrInvalidOptionValue) isError() {} // IgnoreStats implements Error. func (*ErrInvalidOptionValue) IgnoreStats() bool { return false } func (*ErrInvalidOptionValue) String() string { return "invalid option value specified" } // ErrInvalidPortRange indicates an attempt to set an invalid port range. // // +stateify savable type ErrInvalidPortRange struct{} func (*ErrInvalidPortRange) isError() {} // IgnoreStats implements Error. func (*ErrInvalidPortRange) IgnoreStats() bool { return true } func (*ErrInvalidPortRange) String() string { return "invalid port range" } // ErrMalformedHeader indicates the operation encountered a malformed header. // // +stateify savable type ErrMalformedHeader struct{} func (*ErrMalformedHeader) isError() {} // IgnoreStats implements Error. func (*ErrMalformedHeader) IgnoreStats() bool { return false } func (*ErrMalformedHeader) String() string { return "header is malformed" } // ErrMessageTooLong indicates the operation encountered a message whose length // exceeds the maximum permitted. // // +stateify savable type ErrMessageTooLong struct{} func (*ErrMessageTooLong) isError() {} // IgnoreStats implements Error. func (*ErrMessageTooLong) IgnoreStats() bool { return false } func (*ErrMessageTooLong) String() string { return "message too long" } // ErrNetworkUnreachable indicates the operation is not able to reach the // destination network. 
// // +stateify savable type ErrNetworkUnreachable struct{} func (*ErrNetworkUnreachable) isError() {} // IgnoreStats implements Error. func (*ErrNetworkUnreachable) IgnoreStats() bool { return false } func (*ErrNetworkUnreachable) String() string { return "network is unreachable" } // ErrNoBufferSpace indicates no buffer space is available. // // +stateify savable type ErrNoBufferSpace struct{} func (*ErrNoBufferSpace) isError() {} // IgnoreStats implements Error. func (*ErrNoBufferSpace) IgnoreStats() bool { return false } func (*ErrNoBufferSpace) String() string { return "no buffer space available" } // ErrNoPortAvailable indicates no port could be allocated for the operation. // // +stateify savable type ErrNoPortAvailable struct{} func (*ErrNoPortAvailable) isError() {} // IgnoreStats implements Error. func (*ErrNoPortAvailable) IgnoreStats() bool { return false } func (*ErrNoPortAvailable) String() string { return "no ports are available" } // ErrHostUnreachable indicates that a destination host could not be // reached. // // +stateify savable type ErrHostUnreachable struct{} func (*ErrHostUnreachable) isError() {} // IgnoreStats implements Error. func (*ErrHostUnreachable) IgnoreStats() bool { return false } func (*ErrHostUnreachable) String() string { return "no route to host" } // ErrHostDown indicates that a destination host is down. // // +stateify savable type ErrHostDown struct{} func (*ErrHostDown) isError() {} // IgnoreStats implements Error. func (*ErrHostDown) IgnoreStats() bool { return false } func (*ErrHostDown) String() string { return "host is down" } // ErrNoNet indicates that the host is not on the network. // // +stateify savable type ErrNoNet struct{} func (*ErrNoNet) isError() {} // IgnoreStats implements Error. func (*ErrNoNet) IgnoreStats() bool { return false } func (*ErrNoNet) String() string { return "machine is not on the network" } // ErrNoSuchFile is used to indicate that ENOENT should be returned the to // calling application. // // +stateify savable type ErrNoSuchFile struct{} func (*ErrNoSuchFile) isError() {} // IgnoreStats implements Error. func (*ErrNoSuchFile) IgnoreStats() bool { return false } func (*ErrNoSuchFile) String() string { return "no such file" } // ErrNotConnected indicates the endpoint is not connected. // // +stateify savable type ErrNotConnected struct{} func (*ErrNotConnected) isError() {} // IgnoreStats implements Error. func (*ErrNotConnected) IgnoreStats() bool { return false } func (*ErrNotConnected) String() string { return "endpoint not connected" } // ErrNotPermitted indicates the operation is not permitted. // // +stateify savable type ErrNotPermitted struct{} func (*ErrNotPermitted) isError() {} // IgnoreStats implements Error. func (*ErrNotPermitted) IgnoreStats() bool { return false } func (*ErrNotPermitted) String() string { return "operation not permitted" } // ErrNotSupported indicates the operation is not supported. // // +stateify savable type ErrNotSupported struct{} func (*ErrNotSupported) isError() {} // IgnoreStats implements Error. func (*ErrNotSupported) IgnoreStats() bool { return false } func (*ErrNotSupported) String() string { return "operation not supported" } // ErrPortInUse indicates the provided port is in use. // // +stateify savable type ErrPortInUse struct{} func (*ErrPortInUse) isError() {} // IgnoreStats implements Error. 
func (*ErrPortInUse) IgnoreStats() bool { return false } func (*ErrPortInUse) String() string { return "port is in use" } // ErrQueueSizeNotSupported indicates the endpoint does not allow queue size // operation. // // +stateify savable type ErrQueueSizeNotSupported struct{} func (*ErrQueueSizeNotSupported) isError() {} // IgnoreStats implements Error. func (*ErrQueueSizeNotSupported) IgnoreStats() bool { return false } func (*ErrQueueSizeNotSupported) String() string { return "queue size querying not supported" } // ErrTimeout indicates the operation timed out. // // +stateify savable type ErrTimeout struct{} func (*ErrTimeout) isError() {} // IgnoreStats implements Error. func (*ErrTimeout) IgnoreStats() bool { return false } func (*ErrTimeout) String() string { return "operation timed out" } // ErrUnknownDevice indicates an unknown device identifier was provided. // // +stateify savable type ErrUnknownDevice struct{} func (*ErrUnknownDevice) isError() {} // IgnoreStats implements Error. func (*ErrUnknownDevice) IgnoreStats() bool { return false } func (*ErrUnknownDevice) String() string { return "unknown device" } // ErrUnknownNICID indicates an unknown NIC ID was provided. // // +stateify savable type ErrUnknownNICID struct{} func (*ErrUnknownNICID) isError() {} // IgnoreStats implements Error. func (*ErrUnknownNICID) IgnoreStats() bool { return false } func (*ErrUnknownNICID) String() string { return "unknown nic id" } // ErrUnknownProtocol indicates an unknown protocol was requested. // // +stateify savable type ErrUnknownProtocol struct{} func (*ErrUnknownProtocol) isError() {} // IgnoreStats implements Error. func (*ErrUnknownProtocol) IgnoreStats() bool { return false } func (*ErrUnknownProtocol) String() string { return "unknown protocol" } // ErrUnknownProtocolOption indicates an unknown protocol option was provided. // // +stateify savable type ErrUnknownProtocolOption struct{} func (*ErrUnknownProtocolOption) isError() {} // IgnoreStats implements Error. func (*ErrUnknownProtocolOption) IgnoreStats() bool { return false } func (*ErrUnknownProtocolOption) String() string { return "unknown option for protocol" } // ErrWouldBlock indicates the operation would block. // // +stateify savable type ErrWouldBlock struct{} func (*ErrWouldBlock) isError() {} // IgnoreStats implements Error. func (*ErrWouldBlock) IgnoreStats() bool { return true } func (*ErrWouldBlock) String() string { return "operation would block" } // ErrMissingRequiredFields indicates that a required field is missing. // // +stateify savable type ErrMissingRequiredFields struct{} func (*ErrMissingRequiredFields) isError() {} // IgnoreStats implements Error. func (*ErrMissingRequiredFields) IgnoreStats() bool { return true } func (*ErrMissingRequiredFields) String() string { return "missing required fields" } // ErrMulticastInputCannotBeOutput indicates that an input interface matches an // output interface in the same multicast route. // // +stateify savable type ErrMulticastInputCannotBeOutput struct{} func (*ErrMulticastInputCannotBeOutput) isError() {} // IgnoreStats implements Error. func (*ErrMulticastInputCannotBeOutput) IgnoreStats() bool { return true } func (*ErrMulticastInputCannotBeOutput) String() string { return "output cannot contain input" } // LINT.ThenChange(../syserr/netstack.go) golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/errors_linux.go000066400000000000000000000040051465435605700236240ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package tcpip import ( "golang.org/x/sys/unix" ) // TranslateErrno translate an errno from the syscall package into a // tcpip Error. // // Valid, but unrecognized errnos will be translated to // *ErrInvalidEndpointState (EINVAL). This includes the "zero" value. func TranslateErrno(e unix.Errno) Error { switch e { case unix.EEXIST: return &ErrDuplicateAddress{} case unix.ENETUNREACH: return &ErrHostUnreachable{} case unix.EINVAL: return &ErrInvalidEndpointState{} case unix.EALREADY: return &ErrAlreadyConnecting{} case unix.EISCONN: return &ErrAlreadyConnected{} case unix.EADDRINUSE: return &ErrPortInUse{} case unix.EADDRNOTAVAIL: return &ErrBadLocalAddress{} case unix.EPIPE: return &ErrClosedForSend{} case unix.EWOULDBLOCK: return &ErrWouldBlock{} case unix.ECONNREFUSED: return &ErrConnectionRefused{} case unix.ETIMEDOUT: return &ErrTimeout{} case unix.EINPROGRESS: return &ErrConnectStarted{} case unix.EDESTADDRREQ: return &ErrDestinationRequired{} case unix.ENOTSUP: return &ErrNotSupported{} case unix.ENOTTY: return &ErrQueueSizeNotSupported{} case unix.ENOTCONN: return &ErrNotConnected{} case unix.ECONNRESET: return &ErrConnectionReset{} case unix.ECONNABORTED: return &ErrConnectionAborted{} case unix.EMSGSIZE: return &ErrMessageTooLong{} case unix.ENOBUFS: return &ErrNoBufferSpace{} default: return &ErrInvalidEndpointState{} } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/faketime/000077500000000000000000000000001465435605700223305ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/faketime/faketime.go000066400000000000000000000212771465435605700244550ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package faketime provides a fake clock that implements tcpip.Clock interface. package faketime import ( "container/heap" "fmt" "sync" "time" "gvisor.dev/gvisor/pkg/tcpip" ) // NullClock implements a clock that never advances. // // +stateify savable type NullClock struct{} var _ tcpip.Clock = (*NullClock)(nil) // Now implements tcpip.Clock.Now. func (*NullClock) Now() time.Time { return time.Time{} } // NowMonotonic implements tcpip.Clock.NowMonotonic. func (*NullClock) NowMonotonic() tcpip.MonotonicTime { return tcpip.MonotonicTime{} } // nullTimer implements a timer that never fires. // // +stateify savable type nullTimer struct{} var _ tcpip.Timer = (*nullTimer)(nil) // Stop implements tcpip.Timer. 
func (*nullTimer) Stop() bool { return true } // Reset implements tcpip.Timer. func (*nullTimer) Reset(time.Duration) {} // AfterFunc implements tcpip.Clock.AfterFunc. func (*NullClock) AfterFunc(time.Duration, func()) tcpip.Timer { return &nullTimer{} } type notificationChannels struct { mu struct { sync.Mutex ch []<-chan struct{} } } func (n *notificationChannels) add(ch <-chan struct{}) { n.mu.Lock() defer n.mu.Unlock() n.mu.ch = append(n.mu.ch, ch) } // wait returns once all the notification channels are readable. // // Channels that are added while waiting on existing channels will be waited on // as well. func (n *notificationChannels) wait() { for { n.mu.Lock() ch := n.mu.ch n.mu.ch = nil n.mu.Unlock() if len(ch) == 0 { break } for _, c := range ch { <-c } } } // +stateify savable type manualClockMutex struct { sync.RWMutex `state:"nosave"` // now is the current (fake) time of the clock. now time.Time // times is min-heap of times. times timeHeap // timers holds the timers scheduled for each time. timers map[time.Time]map[*manualTimer]struct{} } // ManualClock implements tcpip.Clock and only advances manually with Advance // method. // // +stateify savable type ManualClock struct { // runningTimers tracks the completion of timer callbacks that began running // immediately upon their scheduling. It is used to ensure the proper ordering // of timer callback dispatch. runningTimers notificationChannels mu manualClockMutex } // NewManualClock creates a new ManualClock instance. func NewManualClock() *ManualClock { c := &ManualClock{} c.mu.Lock() defer c.mu.Unlock() // Set the initial time to a non-zero value since the zero value is used to // detect inactive timers. c.mu.now = time.Unix(0, 0) c.mu.timers = make(map[time.Time]map[*manualTimer]struct{}) return c } var _ tcpip.Clock = (*ManualClock)(nil) // Now implements tcpip.Clock.Now. func (mc *ManualClock) Now() time.Time { mc.mu.RLock() defer mc.mu.RUnlock() return mc.mu.now } // NowMonotonic implements tcpip.Clock.NowMonotonic. func (mc *ManualClock) NowMonotonic() tcpip.MonotonicTime { var mt tcpip.MonotonicTime return mt.Add(mc.Now().Sub(time.Unix(0, 0))) } // AfterFunc implements tcpip.Clock.AfterFunc. func (mc *ManualClock) AfterFunc(d time.Duration, f func()) tcpip.Timer { mt := &manualTimer{ clock: mc, f: f, } mc.mu.Lock() defer mc.mu.Unlock() mt.mu.Lock() defer mt.mu.Unlock() mc.resetTimerLocked(mt, d) return mt } // resetTimerLocked schedules a timer to be fired after the given duration. // // Precondition: mc.mu and mt.mu must be locked. func (mc *ManualClock) resetTimerLocked(mt *manualTimer, d time.Duration) { if !mt.mu.firesAt.IsZero() { panic("tried to reset an active timer") } t := mc.mu.now.Add(d) if !mc.mu.now.Before(t) { // If the timer is scheduled to fire immediately, call its callback // in a new goroutine immediately. // // It needs to be called in its own goroutine to escape its current // execution context - like an actual timer. ch := make(chan struct{}) mc.runningTimers.add(ch) go func() { defer close(ch) mt.f() }() return } mt.mu.firesAt = t timers, ok := mc.mu.timers[t] if !ok { timers = make(map[*manualTimer]struct{}) mc.mu.timers[t] = timers heap.Push(&mc.mu.times, t) } timers[mt] = struct{}{} } // stopTimerLocked stops a timer from firing. // // Precondition: mc.mu and mt.mu must be locked. 
func (mc *ManualClock) stopTimerLocked(mt *manualTimer) { t := mt.mu.firesAt mt.mu.firesAt = time.Time{} if t.IsZero() { panic("tried to stop an inactive timer") } timers, ok := mc.mu.timers[t] if !ok { err := fmt.Sprintf("tried to stop an active timer but the clock does not have anything scheduled for the timer @ t = %s %p\nScheduled timers @:", t.UTC(), mt) for t := range mc.mu.timers { err += fmt.Sprintf("%s\n", t.UTC()) } panic(err) } if _, ok := timers[mt]; !ok { panic(fmt.Sprintf("did not have an entry in timers for an active timer @ t = %s", t.UTC())) } delete(timers, mt) if len(timers) == 0 { delete(mc.mu.timers, t) } } // RunImmediatelyScheduledJobs runs all jobs scheduled to run at the current // time. func (mc *ManualClock) RunImmediatelyScheduledJobs() { mc.Advance(0) } // Advance executes all work that have been scheduled to execute within d from // the current time. Blocks until all work has completed execution. func (mc *ManualClock) Advance(d time.Duration) { // We spawn goroutines for timers that were scheduled to fire at the time of // being reset. Wait for those goroutines to complete before proceeding so // that timer callbacks are called in the right order. mc.runningTimers.wait() mc.mu.Lock() defer mc.mu.Unlock() until := mc.mu.now.Add(d) for mc.mu.times.Len() > 0 { t := heap.Pop(&mc.mu.times).(time.Time) if t.After(until) { // No work to do heap.Push(&mc.mu.times, t) break } timers := mc.mu.timers[t] delete(mc.mu.timers, t) mc.mu.now = t // Mark the timers as inactive since they will be fired. // // This needs to be done while holding mc's lock because we remove the entry // in the map of timers for the current time. If an attempt to stop a // timer is made after mc's lock was dropped but before the timer is // marked inactive, we would panic since no entry exists for the time when // the timer was expected to fire. for mt := range timers { mt.mu.Lock() mt.mu.firesAt = time.Time{} mt.mu.Unlock() } // Release the lock before calling the timer's callback fn since the // callback fn might try to schedule a timer which requires obtaining // mc's lock. mc.mu.Unlock() for mt := range timers { mt.f() } // The timer callbacks may have scheduled a timer to fire immediately. // We spawn goroutines for these timers and need to wait for them to // finish before proceeding so that timer callbacks are called in the // right order. mc.runningTimers.wait() mc.mu.Lock() } mc.mu.now = until } func (mc *ManualClock) resetTimer(mt *manualTimer, d time.Duration) { mc.mu.Lock() defer mc.mu.Unlock() mt.mu.Lock() defer mt.mu.Unlock() if !mt.mu.firesAt.IsZero() { mc.stopTimerLocked(mt) } mc.resetTimerLocked(mt, d) } func (mc *ManualClock) stopTimer(mt *manualTimer) bool { mc.mu.Lock() defer mc.mu.Unlock() mt.mu.Lock() defer mt.mu.Unlock() if mt.mu.firesAt.IsZero() { return false } mc.stopTimerLocked(mt) return true } // +stateify savable type manualTimerMu struct { sync.Mutex `state:"nosave"` // firesAt is the time when the timer will fire. // // Zero only when the timer is not active. firesAt time.Time } // +stateify savable type manualTimer struct { clock *ManualClock // TODO(b/341946753): Restore when netstack is savable. f func() `state:"nosave"` mu manualTimerMu } var _ tcpip.Timer = (*manualTimer)(nil) // Reset implements tcpip.Timer.Reset. func (mt *manualTimer) Reset(d time.Duration) { mt.clock.resetTimer(mt, d) } // Stop implements tcpip.Timer.Stop. 
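//
// An illustrative test-style sketch of the ManualClock/manualTimer life
// cycle (variable names are hypothetical):
//
//	clock := NewManualClock()
//	fired := false
//	timer := clock.AfterFunc(time.Second, func() { fired = true })
//	clock.Advance(time.Second) // runs the callback; fired is now true
//	_ = timer.Stop()           // false: the timer has already fired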
func (mt *manualTimer) Stop() bool { return mt.clock.stopTimer(mt) } type timeHeap []time.Time var _ heap.Interface = (*timeHeap)(nil) func (h timeHeap) Len() int { return len(h) } func (h timeHeap) Less(i, j int) bool { return h[i].Before(h[j]) } func (h timeHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } func (h *timeHeap) Push(x any) { *h = append(*h, x.(time.Time)) } func (h *timeHeap) Pop() any { last := (*h)[len(*h)-1] *h = (*h)[:len(*h)-1] return last } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/faketime/faketime_state_autogen.go000066400000000000000000000074351465435605700273770ustar00rootroot00000000000000// automatically generated by stateify. package faketime import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (n *NullClock) StateTypeName() string { return "pkg/tcpip/faketime.NullClock" } func (n *NullClock) StateFields() []string { return []string{} } func (n *NullClock) beforeSave() {} // +checklocksignore func (n *NullClock) StateSave(stateSinkObject state.Sink) { n.beforeSave() } func (n *NullClock) afterLoad(context.Context) {} // +checklocksignore func (n *NullClock) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (n *nullTimer) StateTypeName() string { return "pkg/tcpip/faketime.nullTimer" } func (n *nullTimer) StateFields() []string { return []string{} } func (n *nullTimer) beforeSave() {} // +checklocksignore func (n *nullTimer) StateSave(stateSinkObject state.Sink) { n.beforeSave() } func (n *nullTimer) afterLoad(context.Context) {} // +checklocksignore func (n *nullTimer) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (m *manualClockMutex) StateTypeName() string { return "pkg/tcpip/faketime.manualClockMutex" } func (m *manualClockMutex) StateFields() []string { return []string{ "now", "times", "timers", } } func (m *manualClockMutex) beforeSave() {} // +checklocksignore func (m *manualClockMutex) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.now) stateSinkObject.Save(1, &m.times) stateSinkObject.Save(2, &m.timers) } func (m *manualClockMutex) afterLoad(context.Context) {} // +checklocksignore func (m *manualClockMutex) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.now) stateSourceObject.Load(1, &m.times) stateSourceObject.Load(2, &m.timers) } func (mc *ManualClock) StateTypeName() string { return "pkg/tcpip/faketime.ManualClock" } func (mc *ManualClock) StateFields() []string { return []string{ "runningTimers", "mu", } } func (mc *ManualClock) beforeSave() {} // +checklocksignore func (mc *ManualClock) StateSave(stateSinkObject state.Sink) { mc.beforeSave() stateSinkObject.Save(0, &mc.runningTimers) stateSinkObject.Save(1, &mc.mu) } func (mc *ManualClock) afterLoad(context.Context) {} // +checklocksignore func (mc *ManualClock) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mc.runningTimers) stateSourceObject.Load(1, &mc.mu) } func (m *manualTimerMu) StateTypeName() string { return "pkg/tcpip/faketime.manualTimerMu" } func (m *manualTimerMu) StateFields() []string { return []string{ "firesAt", } } func (m *manualTimerMu) beforeSave() {} // +checklocksignore func (m *manualTimerMu) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.firesAt) } func (m *manualTimerMu) afterLoad(context.Context) {} // +checklocksignore func (m *manualTimerMu) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.firesAt) } func (mt 
*manualTimer) StateTypeName() string { return "pkg/tcpip/faketime.manualTimer" } func (mt *manualTimer) StateFields() []string { return []string{ "clock", "mu", } } func (mt *manualTimer) beforeSave() {} // +checklocksignore func (mt *manualTimer) StateSave(stateSinkObject state.Sink) { mt.beforeSave() stateSinkObject.Save(0, &mt.clock) stateSinkObject.Save(1, &mt.mu) } func (mt *manualTimer) afterLoad(context.Context) {} // +checklocksignore func (mt *manualTimer) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mt.clock) stateSourceObject.Load(1, &mt.mu) } func init() { state.Register((*NullClock)(nil)) state.Register((*nullTimer)(nil)) state.Register((*manualClockMutex)(nil)) state.Register((*ManualClock)(nil)) state.Register((*manualTimerMu)(nil)) state.Register((*manualTimer)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/hash/000077500000000000000000000000001465435605700214665ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/hash/jenkins/000077500000000000000000000000001465435605700231275ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/hash/jenkins/jenkins.go000066400000000000000000000041421465435605700251200ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package jenkins implements Jenkins's one_at_a_time, non-cryptographic hash // functions created by by Bob Jenkins. // // See https://en.wikipedia.org/wiki/Jenkins_hash_function#cite_note-dobbsx-1 package jenkins import ( "hash" ) // Sum32 represents Jenkins's one_at_a_time hash. // // Use the Sum32 type directly (as opposed to New32 below) // to avoid allocations. type Sum32 uint32 // New32 returns a new 32-bit Jenkins's one_at_a_time hash.Hash. // // Its Sum method will lay the value out in big-endian byte order. func New32() hash.Hash32 { var s Sum32 return &s } // Reset resets the hash to its initial state. func (s *Sum32) Reset() { *s = 0 } // Sum32 returns the hash value func (s *Sum32) Sum32() uint32 { sCopy := *s sCopy += sCopy << 3 sCopy ^= sCopy >> 11 sCopy += sCopy << 15 return uint32(sCopy) } // Write adds more data to the running hash. // // It never returns an error. func (s *Sum32) Write(data []byte) (int, error) { sCopy := *s for _, b := range data { sCopy += Sum32(b) sCopy += sCopy << 10 sCopy ^= sCopy >> 6 } *s = sCopy return len(data), nil } // Size returns the number of bytes Sum will return. func (s *Sum32) Size() int { return 4 } // BlockSize returns the hash's underlying block size. func (s *Sum32) BlockSize() int { return 1 } // Sum appends the current hash to in and returns the resulting slice. // // It does not change the underlying hash state. 
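//
// A minimal usage sketch of the hash (illustrative only):
//
//	var h Sum32
//	h.Write([]byte("hello"))
//	digest := h.Sum(nil) // four bytes: the big-endian encoding of h.Sum32()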
func (s *Sum32) Sum(in []byte) []byte { v := s.Sum32() return append(in, byte(v>>24), byte(v>>16), byte(v>>8), byte(v)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/hash/jenkins/jenkins_state_autogen.go000066400000000000000000000000711465435605700300370ustar00rootroot00000000000000// automatically generated by stateify. package jenkins golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/000077500000000000000000000000001465435605700217735ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/arp.go000066400000000000000000000106601465435605700231070ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "encoding/binary" "gvisor.dev/gvisor/pkg/tcpip" ) const ( // ARPProtocolNumber is the ARP network protocol number. ARPProtocolNumber tcpip.NetworkProtocolNumber = 0x0806 // ARPSize is the size of an IPv4-over-Ethernet ARP packet. ARPSize = 28 ) // ARPHardwareType is the hardware type for LinkEndpoint in an ARP header. type ARPHardwareType uint16 // Typical ARP HardwareType values. Some of the constants have to be specific // values as they are egressed on the wire in the HTYPE field of an ARP header. const ( ARPHardwareNone ARPHardwareType = 0 // ARPHardwareEther specifically is the HTYPE for Ethernet as specified // in the IANA list here: // // https://www.iana.org/assignments/arp-parameters/arp-parameters.xhtml#arp-parameters-2 ARPHardwareEther ARPHardwareType = 1 ARPHardwareLoopback ARPHardwareType = 2 ) // ARPOp is an ARP opcode. type ARPOp uint16 // Typical ARP opcodes defined in RFC 826. const ( ARPRequest ARPOp = 1 ARPReply ARPOp = 2 ) // ARP is an ARP packet stored in a byte array as described in RFC 826. type ARP []byte const ( hTypeOffset = 0 protocolOffset = 2 haAddressSizeOffset = 4 protoAddressSizeOffset = 5 opCodeOffset = 6 senderHAAddressOffset = 8 senderProtocolAddressOffset = senderHAAddressOffset + EthernetAddressSize targetHAAddressOffset = senderProtocolAddressOffset + IPv4AddressSize targetProtocolAddressOffset = targetHAAddressOffset + EthernetAddressSize ) func (a ARP) hardwareAddressType() ARPHardwareType { return ARPHardwareType(binary.BigEndian.Uint16(a[hTypeOffset:])) } func (a ARP) protocolAddressSpace() uint16 { return binary.BigEndian.Uint16(a[protocolOffset:]) } func (a ARP) hardwareAddressSize() int { return int(a[haAddressSizeOffset]) } func (a ARP) protocolAddressSize() int { return int(a[protoAddressSizeOffset]) } // Op is the ARP opcode. func (a ARP) Op() ARPOp { return ARPOp(binary.BigEndian.Uint16(a[opCodeOffset:])) } // SetOp sets the ARP opcode. func (a ARP) SetOp(op ARPOp) { binary.BigEndian.PutUint16(a[opCodeOffset:], uint16(op)) } // SetIPv4OverEthernet configures the ARP packet for IPv4-over-Ethernet. 
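//
// A sketch of how a request packet might be assembled with these setters
// (senderMAC, senderIP and targetIP are hypothetical byte slices):
//
//	a := make(ARP, ARPSize)
//	a.SetIPv4OverEthernet()
//	a.SetOp(ARPRequest)
//	copy(a.HardwareAddressSender(), senderMAC)
//	copy(a.ProtocolAddressSender(), senderIP)
//	copy(a.ProtocolAddressTarget(), targetIP)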
func (a ARP) SetIPv4OverEthernet() { binary.BigEndian.PutUint16(a[hTypeOffset:], uint16(ARPHardwareEther)) binary.BigEndian.PutUint16(a[protocolOffset:], uint16(IPv4ProtocolNumber)) a[haAddressSizeOffset] = EthernetAddressSize a[protoAddressSizeOffset] = uint8(IPv4AddressSize) } // HardwareAddressSender is the link address of the sender. // It is a view on to the ARP packet so it can be used to set the value. func (a ARP) HardwareAddressSender() []byte { return a[senderHAAddressOffset : senderHAAddressOffset+EthernetAddressSize] } // ProtocolAddressSender is the protocol address of the sender. // It is a view on to the ARP packet so it can be used to set the value. func (a ARP) ProtocolAddressSender() []byte { return a[senderProtocolAddressOffset : senderProtocolAddressOffset+IPv4AddressSize] } // HardwareAddressTarget is the link address of the target. // It is a view on to the ARP packet so it can be used to set the value. func (a ARP) HardwareAddressTarget() []byte { return a[targetHAAddressOffset : targetHAAddressOffset+EthernetAddressSize] } // ProtocolAddressTarget is the protocol address of the target. // It is a view on to the ARP packet so it can be used to set the value. func (a ARP) ProtocolAddressTarget() []byte { return a[targetProtocolAddressOffset : targetProtocolAddressOffset+IPv4AddressSize] } // IsValid reports whether this is an ARP packet for IPv4 over Ethernet. func (a ARP) IsValid() bool { if len(a) < ARPSize { return false } return a.hardwareAddressType() == ARPHardwareEther && a.protocolAddressSpace() == uint16(IPv4ProtocolNumber) && a.hardwareAddressSize() == EthernetAddressSize && a.protocolAddressSize() == IPv4AddressSize } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/checksum.go000066400000000000000000000075451465435605700241370ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package header provides the implementation of the encoding and decoding of // network protocol headers. package header import ( "encoding/binary" "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checksum" ) // PseudoHeaderChecksum calculates the pseudo-header checksum for the given // destination protocol and network address. Pseudo-headers are needed by // transport layers when calculating their own checksum. func PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, srcAddr tcpip.Address, dstAddr tcpip.Address, totalLen uint16) uint16 { xsum := checksum.Checksum(srcAddr.AsSlice(), 0) xsum = checksum.Checksum(dstAddr.AsSlice(), xsum) // Add the length portion of the checksum to the pseudo-checksum. var tmp [2]byte binary.BigEndian.PutUint16(tmp[:], totalLen) xsum = checksum.Checksum(tmp[:], xsum) return checksum.Checksum([]byte{0, uint8(protocol)}, xsum) } // checksumUpdate2ByteAlignedUint16 updates a uint16 value in a calculated // checksum. // // The value MUST begin at a 2-byte boundary in the original buffer. 
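//
// In one's-complement terms this is the RFC 1071 incremental update: given
// the old checksum C, the old field value m and the new value m', the new
// checksum is C' = C + (m' - m), which the implementation below expresses as
// Combine(C, Combine(m', ^m)).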
func checksumUpdate2ByteAlignedUint16(xsum, old, new uint16) uint16 { // As per RFC 1071 page 4, // (4) Incremental Update // // ... // // To update the checksum, simply add the differences of the // sixteen bit integers that have been changed. To see why this // works, observe that every 16-bit integer has an additive inverse // and that addition is associative. From this it follows that // given the original value m, the new value m', and the old // checksum C, the new checksum C' is: // // C' = C + (-m) + m' = C + (m' - m) if old == new { return xsum } return checksum.Combine(xsum, checksum.Combine(new, ^old)) } // checksumUpdate2ByteAlignedAddress updates an address in a calculated // checksum. // // The addresses must have the same length and must contain an even number // of bytes. The address MUST begin at a 2-byte boundary in the original buffer. func checksumUpdate2ByteAlignedAddress(xsum uint16, old, new tcpip.Address) uint16 { const uint16Bytes = 2 if old.BitLen() != new.BitLen() { panic(fmt.Sprintf("buffer lengths are different; old = %d, new = %d", old.BitLen()/8, new.BitLen()/8)) } if oldBytes := old.BitLen() % 16; oldBytes != 0 { panic(fmt.Sprintf("buffer has an odd number of bytes; got = %d", oldBytes)) } oldAddr := old.AsSlice() newAddr := new.AsSlice() // As per RFC 1071 page 4, // (4) Incremental Update // // ... // // To update the checksum, simply add the differences of the // sixteen bit integers that have been changed. To see why this // works, observe that every 16-bit integer has an additive inverse // and that addition is associative. From this it follows that // given the original value m, the new value m', and the old // checksum C, the new checksum C' is: // // C' = C + (-m) + m' = C + (m' - m) for len(oldAddr) != 0 { // Convert the 2 byte sequences to uint16 values then apply the increment // update. xsum = checksumUpdate2ByteAlignedUint16(xsum, (uint16(oldAddr[0])<<8)+uint16(oldAddr[1]), (uint16(newAddr[0])<<8)+uint16(newAddr[1])) oldAddr = oldAddr[uint16Bytes:] newAddr = newAddr[uint16Bytes:] } return xsum } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/datagram.go000066400000000000000000000013341465435605700241030ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header // DatagramMaximumSize is the maximum supported size of a single datagram. const DatagramMaximumSize = 0xffff // 65KB. golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/eth.go000066400000000000000000000151161465435605700231060ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "encoding/binary" "gvisor.dev/gvisor/pkg/tcpip" ) const ( dstMAC = 0 srcMAC = 6 ethType = 12 ) // EthernetFields contains the fields of an ethernet frame header. It is used to // describe the fields of a frame that needs to be encoded. type EthernetFields struct { // SrcAddr is the "MAC source" field of an ethernet frame header. SrcAddr tcpip.LinkAddress // DstAddr is the "MAC destination" field of an ethernet frame header. DstAddr tcpip.LinkAddress // Type is the "ethertype" field of an ethernet frame header. Type tcpip.NetworkProtocolNumber } // Ethernet represents an ethernet frame header stored in a byte array. type Ethernet []byte const ( // EthernetMinimumSize is the minimum size of a valid ethernet frame. EthernetMinimumSize = 14 // EthernetMaximumSize is the maximum size of a valid ethernet frame. EthernetMaximumSize = 18 // EthernetAddressSize is the size, in bytes, of an ethernet address. EthernetAddressSize = 6 // UnspecifiedEthernetAddress is the unspecified ethernet address // (all bits set to 0). UnspecifiedEthernetAddress = tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00") // EthernetBroadcastAddress is an ethernet address that addresses every node // on a local link. EthernetBroadcastAddress = tcpip.LinkAddress("\xff\xff\xff\xff\xff\xff") // unicastMulticastFlagMask is the mask of the least significant bit in // the first octet (in network byte order) of an ethernet address that // determines whether the ethernet address is a unicast or multicast. If // the masked bit is a 1, then the address is a multicast, unicast // otherwise. // // See the IEEE Std 802-2001 document for more details. Specifically, // section 9.2.1 of http://ieee802.org/secmail/pdfocSP2xXA6d.pdf: // "A 48-bit universal address consists of two parts. The first 24 bits // correspond to the OUI as assigned by the IEEE, expect that the // assignee may set the LSB of the first octet to 1 for group addresses // or set it to 0 for individual addresses." unicastMulticastFlagMask = 1 // unicastMulticastFlagByteIdx is the byte that holds the // unicast/multicast flag. See unicastMulticastFlagMask. unicastMulticastFlagByteIdx = 0 ) const ( // EthernetProtocolAll is a catch-all for all protocols carried inside // an ethernet frame. It is mainly used to create packet sockets that // capture all traffic. EthernetProtocolAll tcpip.NetworkProtocolNumber = 0x0003 // EthernetProtocolPUP is the PARC Universal Packet protocol ethertype. EthernetProtocolPUP tcpip.NetworkProtocolNumber = 0x0200 ) // Ethertypes holds the protocol numbers describing the payload of an ethernet // frame. These types aren't necessarily supported by netstack, but can be used // to catch all traffic of a type via packet endpoints. var Ethertypes = []tcpip.NetworkProtocolNumber{ EthernetProtocolAll, EthernetProtocolPUP, } // SourceAddress returns the "MAC source" field of the ethernet frame header. func (b Ethernet) SourceAddress() tcpip.LinkAddress { return tcpip.LinkAddress(b[srcMAC:][:EthernetAddressSize]) } // DestinationAddress returns the "MAC destination" field of the ethernet frame // header. 
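//
// A minimal decode sketch (frame is a hypothetical byte slice holding at
// least EthernetMinimumSize bytes):
//
//	eth := Ethernet(frame)
//	dst := eth.DestinationAddress()
//	proto := eth.Type()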
func (b Ethernet) DestinationAddress() tcpip.LinkAddress { return tcpip.LinkAddress(b[dstMAC:][:EthernetAddressSize]) } // Type returns the "ethertype" field of the ethernet frame header. func (b Ethernet) Type() tcpip.NetworkProtocolNumber { return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(b[ethType:])) } // Encode encodes all the fields of the ethernet frame header. func (b Ethernet) Encode(e *EthernetFields) { binary.BigEndian.PutUint16(b[ethType:], uint16(e.Type)) copy(b[srcMAC:][:EthernetAddressSize], e.SrcAddr) copy(b[dstMAC:][:EthernetAddressSize], e.DstAddr) } // IsMulticastEthernetAddress returns true if the address is a multicast // ethernet address. func IsMulticastEthernetAddress(addr tcpip.LinkAddress) bool { if len(addr) != EthernetAddressSize { return false } return addr[unicastMulticastFlagByteIdx]&unicastMulticastFlagMask != 0 } // IsValidUnicastEthernetAddress returns true if the address is a unicast // ethernet address. func IsValidUnicastEthernetAddress(addr tcpip.LinkAddress) bool { if len(addr) != EthernetAddressSize { return false } if addr == UnspecifiedEthernetAddress { return false } if addr[unicastMulticastFlagByteIdx]&unicastMulticastFlagMask != 0 { return false } return true } // EthernetAddressFromMulticastIPv4Address returns a multicast Ethernet address // for a multicast IPv4 address. // // addr MUST be a multicast IPv4 address. func EthernetAddressFromMulticastIPv4Address(addr tcpip.Address) tcpip.LinkAddress { var linkAddrBytes [EthernetAddressSize]byte // RFC 1112 Host Extensions for IP Multicasting // // 6.4. Extensions to an Ethernet Local Network Module: // // An IP host group address is mapped to an Ethernet multicast // address by placing the low-order 23-bits of the IP address // into the low-order 23 bits of the Ethernet multicast address // 01-00-5E-00-00-00 (hex). addrBytes := addr.As4() linkAddrBytes[0] = 0x1 linkAddrBytes[2] = 0x5e linkAddrBytes[3] = addrBytes[1] & 0x7F copy(linkAddrBytes[4:], addrBytes[IPv4AddressSize-2:]) return tcpip.LinkAddress(linkAddrBytes[:]) } // EthernetAddressFromMulticastIPv6Address returns a multicast Ethernet address // for a multicast IPv6 address. // // addr MUST be a multicast IPv6 address. func EthernetAddressFromMulticastIPv6Address(addr tcpip.Address) tcpip.LinkAddress { // RFC 2464 Transmission of IPv6 Packets over Ethernet Networks // // 7. Address Mapping -- Multicast // // An IPv6 packet with a multicast destination address DST, // consisting of the sixteen octets DST[1] through DST[16], is // transmitted to the Ethernet multicast address whose first // two octets are the value 3333 hexadecimal and whose last // four octets are the last four octets of DST. addrBytes := addr.As16() linkAddrBytes := []byte(addrBytes[IPv6AddressSize-EthernetAddressSize:]) linkAddrBytes[0] = 0x33 linkAddrBytes[1] = 0x33 return tcpip.LinkAddress(linkAddrBytes[:]) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/gue.go000066400000000000000000000041071465435605700231040ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package header const ( typeHLen = 0 encapProto = 1 ) // GUEFields contains the fields of a GUE packet. It is used to describe the // fields of a packet that needs to be encoded. type GUEFields struct { // Type is the "type" field of the GUE header. Type uint8 // Control is the "control" field of the GUE header. Control bool // HeaderLength is the "header length" field of the GUE header. It must // be at least 4 octets, and a multiple of 4 as well. HeaderLength uint8 // Protocol is the "protocol" field of the GUE header. This is one of // the IPPROTO_* values. Protocol uint8 } // GUE represents a Generic UDP Encapsulation header stored in a byte array, the // fields are described in https://tools.ietf.org/html/draft-ietf-nvo3-gue-01. type GUE []byte const ( // GUEMinimumSize is the minimum size of a valid GUE packet. GUEMinimumSize = 4 ) // TypeAndControl returns the GUE packet type (top 3 bits of the first byte, // which includes the control bit). func (b GUE) TypeAndControl() uint8 { return b[typeHLen] >> 5 } // HeaderLength returns the total length of the GUE header. func (b GUE) HeaderLength() uint8 { return 4 + 4*(b[typeHLen]&0x1f) } // Protocol returns the protocol field of the GUE header. func (b GUE) Protocol() uint8 { return b[encapProto] } // Encode encodes all the fields of the GUE header. func (b GUE) Encode(i *GUEFields) { ctl := uint8(0) if i.Control { ctl = 1 << 5 } b[typeHLen] = ctl | i.Type<<6 | (i.HeaderLength-4)/4 b[encapProto] = i.Protocol } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/header_state_autogen.go000066400000000000000000000052211465435605700264740ustar00rootroot00000000000000// automatically generated by stateify. 
package header import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (t *TCPSynOptions) StateTypeName() string { return "pkg/tcpip/header.TCPSynOptions" } func (t *TCPSynOptions) StateFields() []string { return []string{ "MSS", "WS", "TS", "TSVal", "TSEcr", "SACKPermitted", "Flags", } } func (t *TCPSynOptions) beforeSave() {} // +checklocksignore func (t *TCPSynOptions) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.MSS) stateSinkObject.Save(1, &t.WS) stateSinkObject.Save(2, &t.TS) stateSinkObject.Save(3, &t.TSVal) stateSinkObject.Save(4, &t.TSEcr) stateSinkObject.Save(5, &t.SACKPermitted) stateSinkObject.Save(6, &t.Flags) } func (t *TCPSynOptions) afterLoad(context.Context) {} // +checklocksignore func (t *TCPSynOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.MSS) stateSourceObject.Load(1, &t.WS) stateSourceObject.Load(2, &t.TS) stateSourceObject.Load(3, &t.TSVal) stateSourceObject.Load(4, &t.TSEcr) stateSourceObject.Load(5, &t.SACKPermitted) stateSourceObject.Load(6, &t.Flags) } func (r *SACKBlock) StateTypeName() string { return "pkg/tcpip/header.SACKBlock" } func (r *SACKBlock) StateFields() []string { return []string{ "Start", "End", } } func (r *SACKBlock) beforeSave() {} // +checklocksignore func (r *SACKBlock) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Start) stateSinkObject.Save(1, &r.End) } func (r *SACKBlock) afterLoad(context.Context) {} // +checklocksignore func (r *SACKBlock) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Start) stateSourceObject.Load(1, &r.End) } func (t *TCPOptions) StateTypeName() string { return "pkg/tcpip/header.TCPOptions" } func (t *TCPOptions) StateFields() []string { return []string{ "TS", "TSVal", "TSEcr", "SACKBlocks", } } func (t *TCPOptions) beforeSave() {} // +checklocksignore func (t *TCPOptions) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.TS) stateSinkObject.Save(1, &t.TSVal) stateSinkObject.Save(2, &t.TSEcr) stateSinkObject.Save(3, &t.SACKBlocks) } func (t *TCPOptions) afterLoad(context.Context) {} // +checklocksignore func (t *TCPOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.TS) stateSourceObject.Load(1, &t.TSVal) stateSourceObject.Load(2, &t.TSEcr) stateSourceObject.Load(3, &t.SACKBlocks) } func init() { state.Register((*TCPSynOptions)(nil)) state.Register((*SACKBlock)(nil)) state.Register((*TCPOptions)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/icmpv4.go000066400000000000000000000171251465435605700235320ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "encoding/binary" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checksum" ) // ICMPv4 represents an ICMPv4 header stored in a byte array. 
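//
// An illustrative sketch of filling in an echo request header (buf is a
// hypothetical buffer; the payload checksum is assumed to be zero here):
//
//	h := ICMPv4(buf[:ICMPv4MinimumSize])
//	h.SetType(ICMPv4Echo)
//	h.SetCode(ICMPv4UnusedCode)
//	h.SetIdent(1)
//	h.SetSequence(1)
//	h.SetChecksum(ICMPv4Checksum(h, 0))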
type ICMPv4 []byte const ( // ICMPv4PayloadOffset defines the start of ICMP payload. ICMPv4PayloadOffset = 8 // ICMPv4MinimumSize is the minimum size of a valid ICMP packet. ICMPv4MinimumSize = 8 // ICMPv4MinimumErrorPayloadSize Is the smallest number of bytes of an // errant packet's transport layer that an ICMP error type packet should // attempt to send as per RFC 792 (see each type) and RFC 1122 // section 3.2.2 which states: // Every ICMP error message includes the Internet header and at // least the first 8 data octets of the datagram that triggered // the error; more than 8 octets MAY be sent; this header and data // MUST be unchanged from the received datagram. // // RFC 792 shows: // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Type | Code | Checksum | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | unused | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Internet Header + 64 bits of Original Data Datagram | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ICMPv4MinimumErrorPayloadSize = 8 // ICMPv4ProtocolNumber is the ICMP transport protocol number. ICMPv4ProtocolNumber tcpip.TransportProtocolNumber = 1 // icmpv4ChecksumOffset is the offset of the checksum field // in an ICMPv4 message. icmpv4ChecksumOffset = 2 // icmpv4MTUOffset is the offset of the MTU field // in an ICMPv4FragmentationNeeded message. icmpv4MTUOffset = 6 // icmpv4IdentOffset is the offset of the ident field // in an ICMPv4EchoRequest/Reply message. icmpv4IdentOffset = 4 // icmpv4PointerOffset is the offset of the pointer field // in an ICMPv4ParamProblem message. icmpv4PointerOffset = 4 // icmpv4SequenceOffset is the offset of the sequence field // in an ICMPv4EchoRequest/Reply message. icmpv4SequenceOffset = 6 ) // ICMPv4Type is the ICMP type field described in RFC 792. type ICMPv4Type byte // ICMPv4Code is the ICMP code field described in RFC 792. type ICMPv4Code byte // Typical values of ICMPv4Type defined in RFC 792. const ( ICMPv4EchoReply ICMPv4Type = 0 ICMPv4DstUnreachable ICMPv4Type = 3 ICMPv4SrcQuench ICMPv4Type = 4 ICMPv4Redirect ICMPv4Type = 5 ICMPv4Echo ICMPv4Type = 8 ICMPv4TimeExceeded ICMPv4Type = 11 ICMPv4ParamProblem ICMPv4Type = 12 ICMPv4Timestamp ICMPv4Type = 13 ICMPv4TimestampReply ICMPv4Type = 14 ICMPv4InfoRequest ICMPv4Type = 15 ICMPv4InfoReply ICMPv4Type = 16 ) // ICMP codes for ICMPv4 Time Exceeded messages as defined in RFC 792. const ( ICMPv4TTLExceeded ICMPv4Code = 0 ICMPv4ReassemblyTimeout ICMPv4Code = 1 ) // ICMP codes for ICMPv4 Destination Unreachable messages as defined in RFC 792, // RFC 1122 section 3.2.2.1 and RFC 1812 section 5.2.7.1. const ( ICMPv4NetUnreachable ICMPv4Code = 0 ICMPv4HostUnreachable ICMPv4Code = 1 ICMPv4ProtoUnreachable ICMPv4Code = 2 ICMPv4PortUnreachable ICMPv4Code = 3 ICMPv4FragmentationNeeded ICMPv4Code = 4 ICMPv4SourceRouteFailed ICMPv4Code = 5 ICMPv4DestinationNetworkUnknown ICMPv4Code = 6 ICMPv4DestinationHostUnknown ICMPv4Code = 7 ICMPv4SourceHostIsolated ICMPv4Code = 8 ICMPv4NetProhibited ICMPv4Code = 9 ICMPv4HostProhibited ICMPv4Code = 10 ICMPv4NetUnreachableForTos ICMPv4Code = 11 ICMPv4HostUnreachableForTos ICMPv4Code = 12 ICMPv4AdminProhibited ICMPv4Code = 13 ICMPv4HostPrecedenceViolation ICMPv4Code = 14 ICMPv4PrecedenceCutInEffect ICMPv4Code = 15 ) // ICMPv4UnusedCode is a code to use in ICMP messages where no code is needed. 
const ICMPv4UnusedCode ICMPv4Code = 0 // Type is the ICMP type field. func (b ICMPv4) Type() ICMPv4Type { return ICMPv4Type(b[0]) } // SetType sets the ICMP type field. func (b ICMPv4) SetType(t ICMPv4Type) { b[0] = byte(t) } // Code is the ICMP code field. Its meaning depends on the value of Type. func (b ICMPv4) Code() ICMPv4Code { return ICMPv4Code(b[1]) } // SetCode sets the ICMP code field. func (b ICMPv4) SetCode(c ICMPv4Code) { b[1] = byte(c) } // Pointer returns the pointer field in a Parameter Problem packet. func (b ICMPv4) Pointer() byte { return b[icmpv4PointerOffset] } // SetPointer sets the pointer field in a Parameter Problem packet. func (b ICMPv4) SetPointer(c byte) { b[icmpv4PointerOffset] = c } // Checksum is the ICMP checksum field. func (b ICMPv4) Checksum() uint16 { return binary.BigEndian.Uint16(b[icmpv4ChecksumOffset:]) } // SetChecksum sets the ICMP checksum field. func (b ICMPv4) SetChecksum(cs uint16) { checksum.Put(b[icmpv4ChecksumOffset:], cs) } // SourcePort implements Transport.SourcePort. func (ICMPv4) SourcePort() uint16 { return 0 } // DestinationPort implements Transport.DestinationPort. func (ICMPv4) DestinationPort() uint16 { return 0 } // SetSourcePort implements Transport.SetSourcePort. func (ICMPv4) SetSourcePort(uint16) { } // SetDestinationPort implements Transport.SetDestinationPort. func (ICMPv4) SetDestinationPort(uint16) { } // Payload implements Transport.Payload. func (b ICMPv4) Payload() []byte { return b[ICMPv4PayloadOffset:] } // MTU retrieves the MTU field from an ICMPv4 message. func (b ICMPv4) MTU() uint16 { return binary.BigEndian.Uint16(b[icmpv4MTUOffset:]) } // SetMTU sets the MTU field from an ICMPv4 message. func (b ICMPv4) SetMTU(mtu uint16) { binary.BigEndian.PutUint16(b[icmpv4MTUOffset:], mtu) } // Ident retrieves the Ident field from an ICMPv4 message. func (b ICMPv4) Ident() uint16 { return binary.BigEndian.Uint16(b[icmpv4IdentOffset:]) } // SetIdent sets the Ident field from an ICMPv4 message. func (b ICMPv4) SetIdent(ident uint16) { binary.BigEndian.PutUint16(b[icmpv4IdentOffset:], ident) } // SetIdentWithChecksumUpdate sets the Ident field and updates the checksum. func (b ICMPv4) SetIdentWithChecksumUpdate(new uint16) { old := b.Ident() b.SetIdent(new) b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) } // Sequence retrieves the Sequence field from an ICMPv4 message. func (b ICMPv4) Sequence() uint16 { return binary.BigEndian.Uint16(b[icmpv4SequenceOffset:]) } // SetSequence sets the Sequence field from an ICMPv4 message. func (b ICMPv4) SetSequence(sequence uint16) { binary.BigEndian.PutUint16(b[icmpv4SequenceOffset:], sequence) } // ICMPv4Checksum calculates the ICMP checksum over the provided ICMP header, // and payload. func ICMPv4Checksum(h ICMPv4, payloadCsum uint16) uint16 { xsum := payloadCsum // h[2:4] is the checksum itself, skip it to avoid checksumming the checksum. xsum = checksum.Checksum(h[:2], xsum) xsum = checksum.Checksum(h[4:], xsum) return ^xsum } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/icmpv6.go000066400000000000000000000236371465435605700235410ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "encoding/binary" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checksum" ) // ICMPv6 represents an ICMPv6 header stored in a byte array. type ICMPv6 []byte const ( // ICMPv6HeaderSize is the size of the ICMPv6 header. That is, the // sum of the size of the ICMPv6 Type, Code and Checksum fields, as // per RFC 4443 section 2.1. After the ICMPv6 header, the ICMPv6 // message body begins. ICMPv6HeaderSize = 4 // ICMPv6MinimumSize is the minimum size of a valid ICMP packet. ICMPv6MinimumSize = 8 // ICMPv6PayloadOffset is the offset of the payload in an // ICMP packet. ICMPv6PayloadOffset = 8 // ICMPv6ProtocolNumber is the ICMP transport protocol number. ICMPv6ProtocolNumber tcpip.TransportProtocolNumber = 58 // ICMPv6NeighborSolicitMinimumSize is the minimum size of a // neighbor solicitation packet. ICMPv6NeighborSolicitMinimumSize = ICMPv6HeaderSize + NDPNSMinimumSize // ICMPv6NeighborAdvertMinimumSize is the minimum size of a // neighbor advertisement packet. ICMPv6NeighborAdvertMinimumSize = ICMPv6HeaderSize + NDPNAMinimumSize // ICMPv6EchoMinimumSize is the minimum size of a valid echo packet. ICMPv6EchoMinimumSize = 8 // ICMPv6ErrorHeaderSize is the size of an ICMP error packet header, // as per RFC 4443, Appendix A, item 4 and the errata. // ... all ICMP error messages shall have exactly // 32 bits of type-specific data, so that receivers can reliably find // the embedded invoking packet even when they don't recognize the // ICMP message Type. ICMPv6ErrorHeaderSize = 8 // ICMPv6DstUnreachableMinimumSize is the minimum size of a valid ICMP // destination unreachable packet. ICMPv6DstUnreachableMinimumSize = ICMPv6MinimumSize // ICMPv6PacketTooBigMinimumSize is the minimum size of a valid ICMP // packet-too-big packet. ICMPv6PacketTooBigMinimumSize = ICMPv6MinimumSize // ICMPv6ChecksumOffset is the offset of the checksum field // in an ICMPv6 message. ICMPv6ChecksumOffset = 2 // icmpv6PointerOffset is the offset of the pointer // in an ICMPv6 Parameter problem message. icmpv6PointerOffset = 4 // icmpv6MTUOffset is the offset of the MTU field in an ICMPv6 // PacketTooBig message. icmpv6MTUOffset = 4 // icmpv6IdentOffset is the offset of the ident field // in a ICMPv6 Echo Request/Reply message. icmpv6IdentOffset = 4 // icmpv6SequenceOffset is the offset of the sequence field // in a ICMPv6 Echo Request/Reply message. icmpv6SequenceOffset = 6 // NDPHopLimit is the expected IP hop limit value of 255 for received // NDP packets, as per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1, // 7.1.2 and 8.1. If the hop limit value is not 255, nodes MUST silently // drop the NDP packet. All outgoing NDP packets must use this value for // its IP hop limit field. NDPHopLimit = 255 ) // ICMPv6Type is the ICMP type field described in RFC 4443. type ICMPv6Type byte // Values for use in the Type field of ICMPv6 packet from RFC 4433. 
const ( ICMPv6DstUnreachable ICMPv6Type = 1 ICMPv6PacketTooBig ICMPv6Type = 2 ICMPv6TimeExceeded ICMPv6Type = 3 ICMPv6ParamProblem ICMPv6Type = 4 ICMPv6EchoRequest ICMPv6Type = 128 ICMPv6EchoReply ICMPv6Type = 129 // Neighbor Discovery Protocol (NDP) messages, see RFC 4861. ICMPv6RouterSolicit ICMPv6Type = 133 ICMPv6RouterAdvert ICMPv6Type = 134 ICMPv6NeighborSolicit ICMPv6Type = 135 ICMPv6NeighborAdvert ICMPv6Type = 136 ICMPv6RedirectMsg ICMPv6Type = 137 // Multicast Listener Discovery (MLD) messages, see RFC 2710. ICMPv6MulticastListenerQuery ICMPv6Type = 130 ICMPv6MulticastListenerReport ICMPv6Type = 131 ICMPv6MulticastListenerDone ICMPv6Type = 132 // Multicast Listener Discovert Version 2 (MLDv2) messages, see RFC 3810. ICMPv6MulticastListenerV2Report ICMPv6Type = 143 ) // IsErrorType returns true if the receiver is an ICMP error type. func (typ ICMPv6Type) IsErrorType() bool { // Per RFC 4443 section 2.1: // ICMPv6 messages are grouped into two classes: error messages and // informational messages. Error messages are identified as such by a // zero in the high-order bit of their message Type field values. Thus, // error messages have message types from 0 to 127; informational // messages have message types from 128 to 255. return typ&0x80 == 0 } // ICMPv6Code is the ICMP Code field described in RFC 4443. type ICMPv6Code byte // ICMP codes used with Destination Unreachable (Type 1). As per RFC 4443 // section 3.1. const ( ICMPv6NetworkUnreachable ICMPv6Code = 0 ICMPv6Prohibited ICMPv6Code = 1 ICMPv6BeyondScope ICMPv6Code = 2 ICMPv6AddressUnreachable ICMPv6Code = 3 ICMPv6PortUnreachable ICMPv6Code = 4 ICMPv6Policy ICMPv6Code = 5 ICMPv6RejectRoute ICMPv6Code = 6 ) // ICMP codes used with Time Exceeded (Type 3). As per RFC 4443 section 3.3. const ( ICMPv6HopLimitExceeded ICMPv6Code = 0 ICMPv6ReassemblyTimeout ICMPv6Code = 1 ) // ICMP codes used with Parameter Problem (Type 4). As per RFC 4443 section 3.4. const ( // ICMPv6ErroneousHeader indicates an erroneous header field was encountered. ICMPv6ErroneousHeader ICMPv6Code = 0 // ICMPv6UnknownHeader indicates an unrecognized Next Header type encountered. ICMPv6UnknownHeader ICMPv6Code = 1 // ICMPv6UnknownOption indicates an unrecognized IPv6 option was encountered. ICMPv6UnknownOption ICMPv6Code = 2 ) // ICMPv6UnusedCode is the code value used with ICMPv6 messages which don't use // the code field. (Types not mentioned above.) const ICMPv6UnusedCode ICMPv6Code = 0 // Type is the ICMP type field. func (b ICMPv6) Type() ICMPv6Type { return ICMPv6Type(b[0]) } // SetType sets the ICMP type field. func (b ICMPv6) SetType(t ICMPv6Type) { b[0] = byte(t) } // Code is the ICMP code field. Its meaning depends on the value of Type. func (b ICMPv6) Code() ICMPv6Code { return ICMPv6Code(b[1]) } // SetCode sets the ICMP code field. func (b ICMPv6) SetCode(c ICMPv6Code) { b[1] = byte(c) } // TypeSpecific returns the type specific data field. func (b ICMPv6) TypeSpecific() uint32 { return binary.BigEndian.Uint32(b[icmpv6PointerOffset:]) } // SetTypeSpecific sets the type specific data field. func (b ICMPv6) SetTypeSpecific(val uint32) { binary.BigEndian.PutUint32(b[icmpv6PointerOffset:], val) } // Checksum is the ICMP checksum field. func (b ICMPv6) Checksum() uint16 { return binary.BigEndian.Uint16(b[ICMPv6ChecksumOffset:]) } // SetChecksum sets the ICMP checksum field. func (b ICMPv6) SetChecksum(cs uint16) { checksum.Put(b[ICMPv6ChecksumOffset:], cs) } // SourcePort implements Transport.SourcePort. 
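//
// ICMPv6 has no notion of ports; this and the other port accessors below are
// no-op stubs that exist only to satisfy the Transport interface.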
func (ICMPv6) SourcePort() uint16 { return 0 } // DestinationPort implements Transport.DestinationPort. func (ICMPv6) DestinationPort() uint16 { return 0 } // SetSourcePort implements Transport.SetSourcePort. func (ICMPv6) SetSourcePort(uint16) { } // SetDestinationPort implements Transport.SetDestinationPort. func (ICMPv6) SetDestinationPort(uint16) { } // MTU retrieves the MTU field from an ICMPv6 message. func (b ICMPv6) MTU() uint32 { return binary.BigEndian.Uint32(b[icmpv6MTUOffset:]) } // SetMTU sets the MTU field from an ICMPv6 message. func (b ICMPv6) SetMTU(mtu uint32) { binary.BigEndian.PutUint32(b[icmpv6MTUOffset:], mtu) } // Ident retrieves the Ident field from an ICMPv6 message. func (b ICMPv6) Ident() uint16 { return binary.BigEndian.Uint16(b[icmpv6IdentOffset:]) } // SetIdent sets the Ident field from an ICMPv6 message. func (b ICMPv6) SetIdent(ident uint16) { binary.BigEndian.PutUint16(b[icmpv6IdentOffset:], ident) } // SetIdentWithChecksumUpdate sets the Ident field and updates the checksum. func (b ICMPv6) SetIdentWithChecksumUpdate(new uint16) { old := b.Ident() b.SetIdent(new) b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) } // Sequence retrieves the Sequence field from an ICMPv6 message. func (b ICMPv6) Sequence() uint16 { return binary.BigEndian.Uint16(b[icmpv6SequenceOffset:]) } // SetSequence sets the Sequence field from an ICMPv6 message. func (b ICMPv6) SetSequence(sequence uint16) { binary.BigEndian.PutUint16(b[icmpv6SequenceOffset:], sequence) } // MessageBody returns the message body as defined by RFC 4443 section 2.1; the // portion of the ICMPv6 buffer after the first ICMPv6HeaderSize bytes. func (b ICMPv6) MessageBody() []byte { return b[ICMPv6HeaderSize:] } // Payload implements Transport.Payload. func (b ICMPv6) Payload() []byte { return b[ICMPv6PayloadOffset:] } // ICMPv6ChecksumParams contains parameters to calculate ICMPv6 checksum. type ICMPv6ChecksumParams struct { Header ICMPv6 Src tcpip.Address Dst tcpip.Address PayloadCsum uint16 PayloadLen int } // ICMPv6Checksum calculates the ICMP checksum over the provided ICMPv6 header, // IPv6 src/dst addresses and the payload. func ICMPv6Checksum(params ICMPv6ChecksumParams) uint16 { h := params.Header xsum := PseudoHeaderChecksum(ICMPv6ProtocolNumber, params.Src, params.Dst, uint16(len(h)+params.PayloadLen)) xsum = checksum.Combine(xsum, params.PayloadCsum) // h[2:4] is the checksum itself, skip it to avoid checksumming the checksum. xsum = checksum.Checksum(h[:2], xsum) xsum = checksum.Checksum(h[4:], xsum) return ^xsum } // UpdateChecksumPseudoHeaderAddress updates the checksum to reflect an // updated address in the pseudo header. func (b ICMPv6) UpdateChecksumPseudoHeaderAddress(old, new tcpip.Address) { b.SetChecksum(^checksumUpdate2ByteAlignedAddress(^b.Checksum(), old, new)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/igmp.go000066400000000000000000000141131465435605700232560ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package header import ( "encoding/binary" "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checksum" ) // IGMP represents an IGMP header stored in a byte array. type IGMP []byte // IGMP implements `Transport`. var _ Transport = (*IGMP)(nil) const ( // IGMPMinimumSize is the minimum size of a valid IGMP packet in bytes, // as per RFC 2236, Section 2, Page 2. IGMPMinimumSize = 8 // IGMPQueryMinimumSize is the minimum size of a valid Membership Query // Message in bytes, as per RFC 2236, Section 2, Page 2. IGMPQueryMinimumSize = 8 // IGMPReportMinimumSize is the minimum size of a valid Report Message in // bytes, as per RFC 2236, Section 2, Page 2. IGMPReportMinimumSize = 8 // IGMPLeaveMessageMinimumSize is the minimum size of a valid Leave Message // in bytes, as per RFC 2236, Section 2, Page 2. IGMPLeaveMessageMinimumSize = 8 // IGMPTTL is the TTL for all IGMP messages, as per RFC 2236, Section 3, Page // 3. IGMPTTL = 1 // igmpTypeOffset defines the offset of the type field in an IGMP message. igmpTypeOffset = 0 // igmpMaxRespTimeOffset defines the offset of the MaxRespTime field in an // IGMP message. igmpMaxRespTimeOffset = 1 // igmpChecksumOffset defines the offset of the checksum field in an IGMP // message. igmpChecksumOffset = 2 // igmpGroupAddressOffset defines the offset of the Group Address field in an // IGMP message. igmpGroupAddressOffset = 4 // IGMPProtocolNumber is IGMP's transport protocol number. IGMPProtocolNumber tcpip.TransportProtocolNumber = 2 ) // IGMPType is the IGMP type field as per RFC 2236. type IGMPType byte // Values for the IGMP Type described in RFC 2236 Section 2.1, Page 2. // Descriptions below come from there. const ( // IGMPMembershipQuery indicates that the message type is Membership Query. // "There are two sub-types of Membership Query messages: // - General Query, used to learn which groups have members on an // attached network. // - Group-Specific Query, used to learn if a particular group // has any members on an attached network. // These two messages are differentiated by the Group Address, as // described in section 1.4 ." IGMPMembershipQuery IGMPType = 0x11 // IGMPv1MembershipReport indicates that the message is a Membership Report // generated by a host using the IGMPv1 protocol: "an additional type of // message, for backwards-compatibility with IGMPv1" IGMPv1MembershipReport IGMPType = 0x12 // IGMPv2MembershipReport indicates that the Message type is a Membership // Report generated by a host using the IGMPv2 protocol. IGMPv2MembershipReport IGMPType = 0x16 // IGMPLeaveGroup indicates that the message type is a Leave Group // notification message. IGMPLeaveGroup IGMPType = 0x17 // IGMPv3MembershipReport indicates that the message type is a IGMPv3 report. IGMPv3MembershipReport IGMPType = 0x22 ) // Type is the IGMP type field. func (b IGMP) Type() IGMPType { return IGMPType(b[igmpTypeOffset]) } // SetType sets the IGMP type field. func (b IGMP) SetType(t IGMPType) { b[igmpTypeOffset] = byte(t) } // MaxRespTime gets the MaxRespTimeField. This is meaningful only in Membership // Query messages, in other cases it is set to 0 by the sender and ignored by // the receiver. 
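//
// For example, a raw field value of 100 (i.e. 100 tenths of a second)
// decodes to DecisecondToDuration(100) = 10 * time.Second.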
func (b IGMP) MaxRespTime() time.Duration { // As per RFC 2236 section 2.2, // // The Max Response Time field is meaningful only in Membership Query // messages, and specifies the maximum allowed time before sending a // responding report in units of 1/10 second. In all other messages, it // is set to zero by the sender and ignored by receivers. return DecisecondToDuration(uint16(b[igmpMaxRespTimeOffset])) } // SetMaxRespTime sets the MaxRespTimeField. func (b IGMP) SetMaxRespTime(m byte) { b[igmpMaxRespTimeOffset] = m } // Checksum is the IGMP checksum field. func (b IGMP) Checksum() uint16 { return binary.BigEndian.Uint16(b[igmpChecksumOffset:]) } // SetChecksum sets the IGMP checksum field. func (b IGMP) SetChecksum(checksum uint16) { binary.BigEndian.PutUint16(b[igmpChecksumOffset:], checksum) } // GroupAddress gets the Group Address field. func (b IGMP) GroupAddress() tcpip.Address { return tcpip.AddrFrom4([4]byte(b[igmpGroupAddressOffset:][:IPv4AddressSize])) } // SetGroupAddress sets the Group Address field. func (b IGMP) SetGroupAddress(address tcpip.Address) { addrBytes := address.As4() if n := copy(b[igmpGroupAddressOffset:], addrBytes[:]); n != IPv4AddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d", n, IPv4AddressSize)) } } // SourcePort implements Transport.SourcePort. func (IGMP) SourcePort() uint16 { return 0 } // DestinationPort implements Transport.DestinationPort. func (IGMP) DestinationPort() uint16 { return 0 } // SetSourcePort implements Transport.SetSourcePort. func (IGMP) SetSourcePort(uint16) { } // SetDestinationPort implements Transport.SetDestinationPort. func (IGMP) SetDestinationPort(uint16) { } // Payload implements Transport.Payload. func (IGMP) Payload() []byte { return nil } // IGMPCalculateChecksum calculates the IGMP checksum over the provided IGMP // header. func IGMPCalculateChecksum(h IGMP) uint16 { // The header contains a checksum itself, set it aside to avoid checksumming // the checksum and replace it afterwards. existingXsum := h.Checksum() h.SetChecksum(0) xsum := ^checksum.Checksum(h, 0) h.SetChecksum(existingXsum) return xsum } // DecisecondToDuration converts a value representing deci-seconds to a // time.Duration. func DecisecondToDuration(ds uint16) time.Duration { return time.Duration(ds) * time.Second / 10 } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/igmpv3.go000066400000000000000000000560711465435605700235400ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "bytes" "encoding/binary" "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" ) var ( // IGMPv3RoutersAddress is the address to send IGMPv3 reports to. // // As per RFC 3376 section 4.2.14, // // Version 3 Reports are sent with an IP destination address of // 224.0.0.22, to which all IGMPv3-capable multicast routers listen. 
IGMPv3RoutersAddress = tcpip.AddrFrom4([4]byte{0xe0, 0x00, 0x00, 0x16}) ) const ( // IGMPv3QueryMinimumSize is the mimum size of a valid IGMPv3 query, // as per RFC 3376 section 4.1. IGMPv3QueryMinimumSize = 12 igmpv3QueryMaxRespCodeOffset = 1 igmpv3QueryGroupAddressOffset = 4 igmpv3QueryResvSQRVOffset = 8 igmpv3QueryQRVMask = 0b111 igmpv3QueryQQICOffset = 9 igmpv3QueryNumberOfSourcesOffset = 10 igmpv3QuerySourcesOffset = 12 ) // IGMPv3Query is an IGMPv3 query message. // // As per RFC 3376 section 4.1, // // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Type = 0x11 | Max Resp Code | Checksum | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Group Address | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Resv |S| QRV | QQIC | Number of Sources (N) | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Source Address [1] | // +- -+ // | Source Address [2] | // +- . -+ // . . . // . . . // +- -+ // | Source Address [N] | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ type IGMPv3Query IGMP // MaximumResponseCode returns the Maximum Response Code. func (i IGMPv3Query) MaximumResponseCode() uint8 { return i[igmpv3QueryMaxRespCodeOffset] } // IGMPv3MaximumResponseDelay returns the Maximum Response Delay in an IGMPv3 // Maximum Response Code. // // As per RFC 3376 section 4.1.1, // // The Max Resp Code field specifies the maximum time allowed before // sending a responding report. The actual time allowed, called the Max // Resp Time, is represented in units of 1/10 second and is derived from // the Max Resp Code as follows: // // If Max Resp Code < 128, Max Resp Time = Max Resp Code // // If Max Resp Code >= 128, Max Resp Code represents a floating-point // value as follows: // // 0 1 2 3 4 5 6 7 // +-+-+-+-+-+-+-+-+ // |1| exp | mant | // +-+-+-+-+-+-+-+-+ // // Max Resp Time = (mant | 0x10) << (exp + 3) // // Small values of Max Resp Time allow IGMPv3 routers to tune the "leave // latency" (the time between the moment the last host leaves a group // and the moment the routing protocol is notified that there are no // more members). Larger values, especially in the exponential range, // allow tuning of the burstiness of IGMP traffic on a network. func IGMPv3MaximumResponseDelay(codeRaw uint8) time.Duration { code := uint16(codeRaw) if code < 128 { return DecisecondToDuration(code) } const mantBits = 4 const expMask = 0b111 exp := (code >> mantBits) & expMask mant := code & ((1 << mantBits) - 1) return DecisecondToDuration((mant | 0x10) << (exp + 3)) } // GroupAddress returns the group address. func (i IGMPv3Query) GroupAddress() tcpip.Address { return tcpip.AddrFrom4([4]byte(i[igmpv3QueryGroupAddressOffset:][:IPv4AddressSize])) } // QuerierRobustnessVariable returns the querier's robustness variable. func (i IGMPv3Query) QuerierRobustnessVariable() uint8 { return i[igmpv3QueryResvSQRVOffset] & igmpv3QueryQRVMask } // QuerierQueryInterval returns the querier's query interval. func (i IGMPv3Query) QuerierQueryInterval() time.Duration { return mldv2AndIGMPv3QuerierQueryCodeToInterval(i[igmpv3QueryQQICOffset]) } // Sources returns an iterator over source addresses in the query. // // Returns false if the message cannot hold the expected number of sources. 
func (i IGMPv3Query) Sources() (AddressIterator, bool) { return makeAddressIterator( i[igmpv3QuerySourcesOffset:], binary.BigEndian.Uint16(i[igmpv3QueryNumberOfSourcesOffset:]), IPv4AddressSize, ) } // IGMPv3ReportRecordType is the type of an IGMPv3 multicast address record // found in an IGMPv3 report, as per RFC 3810 section 5.2.12. type IGMPv3ReportRecordType int // IGMPv3 multicast address record types, as per RFC 3810 section 5.2.12. const ( IGMPv3ReportRecordModeIsInclude IGMPv3ReportRecordType = 1 IGMPv3ReportRecordModeIsExclude IGMPv3ReportRecordType = 2 IGMPv3ReportRecordChangeToIncludeMode IGMPv3ReportRecordType = 3 IGMPv3ReportRecordChangeToExcludeMode IGMPv3ReportRecordType = 4 IGMPv3ReportRecordAllowNewSources IGMPv3ReportRecordType = 5 IGMPv3ReportRecordBlockOldSources IGMPv3ReportRecordType = 6 ) const ( igmpv3ReportGroupAddressRecordMinimumSize = 8 igmpv3ReportGroupAddressRecordTypeOffset = 0 igmpv3ReportGroupAddressRecordAuxDataLenOffset = 1 igmpv3ReportGroupAddressRecordAuxDataLenUnits = 4 igmpv3ReportGroupAddressRecordNumberOfSourcesOffset = 2 igmpv3ReportGroupAddressRecordGroupAddressOffset = 4 igmpv3ReportGroupAddressRecordSourcesOffset = 8 ) // IGMPv3ReportGroupAddressRecordSerializer is an IGMPv3 Multicast Address // Record serializer. // // As per RFC 3810 section 5.2, a Multicast Address Record has the following // internal format: // // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Record Type | Aux Data Len | Number of Sources (N) | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // * * // | | // * Multicast Address * // | | // * * // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // * * // | | // * Source Address [1] * // | | // * * // | | // +- -+ // | | // * * // | | // * Source Address [2] * // | | // * * // | | // +- -+ // . . . // . . . // . . . // +- -+ // | | // * * // | | // * Source Address [N] * // | | // * * // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Auxiliary Data . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ type IGMPv3ReportGroupAddressRecordSerializer struct { RecordType IGMPv3ReportRecordType GroupAddress tcpip.Address Sources []tcpip.Address } // Length returns the number of bytes this serializer would occupy. func (s *IGMPv3ReportGroupAddressRecordSerializer) Length() int { return igmpv3ReportGroupAddressRecordSourcesOffset + len(s.Sources)*IPv4AddressSize } func copyIPv4Address(dst []byte, src tcpip.Address) { srcBytes := src.As4() if n := copy(dst, srcBytes[:]); n != IPv4AddressSize { panic(fmt.Sprintf("got copy(...) = %d, want = %d", n, IPv4AddressSize)) } } // SerializeInto serializes the record into the buffer. // // Panics if the buffer does not have enough space to fit the record. 
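// exampleIGMPv3ReportRoundTrip is an illustrative sketch (not from the
// original source): it serializes a single-record report with the serializers
// in this file and walks it back with the parsing types defined further below.
// The group and source addresses are arbitrary.
func exampleIGMPv3ReportRoundTrip() bool {
	s := IGMPv3ReportSerializer{
		Records: []IGMPv3ReportGroupAddressRecordSerializer{{
			RecordType:   IGMPv3ReportRecordChangeToExcludeMode,
			GroupAddress: tcpip.AddrFrom4([4]byte{224, 0, 0, 10}),
			Sources:      []tcpip.Address{tcpip.AddrFrom4([4]byte{192, 0, 2, 1})},
		}},
	}
	b := make([]byte, s.Length())
	s.SerializeInto(b)

	// Parse the bytes back and check the first (and only) record.
	it := IGMPv3Report(b).GroupAddressRecords()
	record, res := it.Next()
	if res != IGMPv3ReportGroupAddressRecordIteratorNextOk {
		return false
	}
	return record.GroupAddress() == tcpip.AddrFrom4([4]byte{224, 0, 0, 10})
}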
func (s *IGMPv3ReportGroupAddressRecordSerializer) SerializeInto(b []byte) { b[igmpv3ReportGroupAddressRecordTypeOffset] = byte(s.RecordType) b[igmpv3ReportGroupAddressRecordAuxDataLenOffset] = 0 binary.BigEndian.PutUint16(b[igmpv3ReportGroupAddressRecordNumberOfSourcesOffset:], uint16(len(s.Sources))) copyIPv4Address(b[igmpv3ReportGroupAddressRecordGroupAddressOffset:], s.GroupAddress) b = b[igmpv3ReportGroupAddressRecordSourcesOffset:] for _, source := range s.Sources { copyIPv4Address(b, source) b = b[IPv4AddressSize:] } } const ( igmpv3ReportTypeOffset = 0 igmpv3ReportReserved1Offset = 1 igmpv3ReportReserved2Offset = 4 igmpv3ReportNumberOfGroupAddressRecordsOffset = 6 igmpv3ReportGroupAddressRecordsOffset = 8 ) // IGMPv3ReportSerializer is an MLD Version 2 Report serializer. // // As per RFC 3810 section 5.2, // // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Type = 143 | Reserved | Checksum | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Reserved |Nr of Mcast Address Records (M)| // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Multicast Address Record [1] . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Multicast Address Record [2] . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | . | // . . . // | . | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Multicast Address Record [M] . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ type IGMPv3ReportSerializer struct { Records []IGMPv3ReportGroupAddressRecordSerializer } // Length returns the number of bytes this serializer would occupy. func (s *IGMPv3ReportSerializer) Length() int { ret := igmpv3ReportGroupAddressRecordsOffset for _, record := range s.Records { ret += record.Length() } return ret } // SerializeInto serializes the report into the buffer. // // Panics if the buffer does not have enough space to fit the report. func (s *IGMPv3ReportSerializer) SerializeInto(b []byte) { b[igmpv3ReportTypeOffset] = byte(IGMPv3MembershipReport) b[igmpv3ReportReserved1Offset] = 0 binary.BigEndian.PutUint16(b[igmpv3ReportReserved2Offset:], 0) binary.BigEndian.PutUint16(b[igmpv3ReportNumberOfGroupAddressRecordsOffset:], uint16(len(s.Records))) recordsBytes := b[igmpv3ReportGroupAddressRecordsOffset:] for _, record := range s.Records { len := record.Length() record.SerializeInto(recordsBytes[:len]) recordsBytes = recordsBytes[len:] } binary.BigEndian.PutUint16(b[igmpChecksumOffset:], IGMPCalculateChecksum(b)) } // IGMPv3ReportGroupAddressRecord is an IGMPv3 record. // // As per RFC 3810 section 5.2, a Multicast Address Record has the following // internal format: // // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Record Type | Aux Data Len | Number of Sources (N) | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // * * // | | // * Multicast Address * // | | // * * // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // * * // | | // * Source Address [1] * // | | // * * // | | // +- -+ // | | // * * // | | // * Source Address [2] * // | | // * * // | | // +- -+ // . . . // . . . // . . . 
// +- -+ // | | // * * // | | // * Source Address [N] * // | | // * * // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Auxiliary Data . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ type IGMPv3ReportGroupAddressRecord []byte // RecordType returns the type of this record. func (r IGMPv3ReportGroupAddressRecord) RecordType() IGMPv3ReportRecordType { return IGMPv3ReportRecordType(r[igmpv3ReportGroupAddressRecordTypeOffset]) } // AuxDataLen returns the length of the auxiliary data in this record. func (r IGMPv3ReportGroupAddressRecord) AuxDataLen() int { return int(r[igmpv3ReportGroupAddressRecordAuxDataLenOffset]) * igmpv3ReportGroupAddressRecordAuxDataLenUnits } // numberOfSources returns the number of sources in this record. func (r IGMPv3ReportGroupAddressRecord) numberOfSources() uint16 { return binary.BigEndian.Uint16(r[igmpv3ReportGroupAddressRecordNumberOfSourcesOffset:]) } // GroupAddress returns the multicast address this record targets. func (r IGMPv3ReportGroupAddressRecord) GroupAddress() tcpip.Address { return tcpip.AddrFrom4([4]byte(r[igmpv3ReportGroupAddressRecordGroupAddressOffset:][:IPv4AddressSize])) } // Sources returns an iterator over source addresses in the query. // // Returns false if the message cannot hold the expected number of sources. func (r IGMPv3ReportGroupAddressRecord) Sources() (AddressIterator, bool) { expectedLen := int(r.numberOfSources()) * IPv4AddressSize b := r[igmpv3ReportGroupAddressRecordSourcesOffset:] if len(b) < expectedLen { return AddressIterator{}, false } return AddressIterator{addressSize: IPv4AddressSize, buf: bytes.NewBuffer(b[:expectedLen])}, true } // IGMPv3Report is an IGMPv3 Report. // // As per RFC 3810 section 5.2, // // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Type = 143 | Reserved | Checksum | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Reserved |Nr of Mcast Address Records (M)| // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Multicast Address Record [1] . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Multicast Address Record [2] . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | . | // . . . // | . | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Multicast Address Record [M] . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ type IGMPv3Report []byte // Checksum returns the checksum. func (i IGMPv3Report) Checksum() uint16 { return binary.BigEndian.Uint16(i[igmpChecksumOffset:]) } // IGMPv3ReportGroupAddressRecordIterator is an iterator over IGMPv3 Multicast // Address Records. type IGMPv3ReportGroupAddressRecordIterator struct { recordsLeft uint16 buf *bytes.Buffer } // IGMPv3ReportGroupAddressRecordIteratorNextDisposition is the possible // return values from IGMPv3ReportGroupAddressRecordIterator.Next. type IGMPv3ReportGroupAddressRecordIteratorNextDisposition int const ( // IGMPv3ReportGroupAddressRecordIteratorNextOk indicates that a multicast // address record was yielded. IGMPv3ReportGroupAddressRecordIteratorNextOk IGMPv3ReportGroupAddressRecordIteratorNextDisposition = iota // IGMPv3ReportGroupAddressRecordIteratorNextDone indicates that the iterator // has been exhausted. 
IGMPv3ReportGroupAddressRecordIteratorNextDone // IGMPv3ReportGroupAddressRecordIteratorNextErrBufferTooShort indicates // that the iterator expected another record, but the buffer ended // prematurely. IGMPv3ReportGroupAddressRecordIteratorNextErrBufferTooShort ) // Next returns the next IGMPv3 Multicast Address Record. func (it *IGMPv3ReportGroupAddressRecordIterator) Next() (IGMPv3ReportGroupAddressRecord, IGMPv3ReportGroupAddressRecordIteratorNextDisposition) { if it.recordsLeft == 0 { return IGMPv3ReportGroupAddressRecord{}, IGMPv3ReportGroupAddressRecordIteratorNextDone } if it.buf.Len() < igmpv3ReportGroupAddressRecordMinimumSize { return IGMPv3ReportGroupAddressRecord{}, IGMPv3ReportGroupAddressRecordIteratorNextErrBufferTooShort } hdr := IGMPv3ReportGroupAddressRecord(it.buf.Bytes()) expectedLen := igmpv3ReportGroupAddressRecordMinimumSize + int(hdr.AuxDataLen()) + int(hdr.numberOfSources())*IPv4AddressSize bytes := it.buf.Next(expectedLen) if len(bytes) < expectedLen { return IGMPv3ReportGroupAddressRecord{}, IGMPv3ReportGroupAddressRecordIteratorNextErrBufferTooShort } it.recordsLeft-- return IGMPv3ReportGroupAddressRecord(bytes), IGMPv3ReportGroupAddressRecordIteratorNextOk } // GroupAddressRecords returns an iterator of IGMPv3 Multicast Address // Records. func (i IGMPv3Report) GroupAddressRecords() IGMPv3ReportGroupAddressRecordIterator { return IGMPv3ReportGroupAddressRecordIterator{ recordsLeft: binary.BigEndian.Uint16(i[igmpv3ReportNumberOfGroupAddressRecordsOffset:]), buf: bytes.NewBuffer(i[igmpv3ReportGroupAddressRecordsOffset:]), } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/interfaces.go000066400000000000000000000105411465435605700244460ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "gvisor.dev/gvisor/pkg/tcpip" ) const ( // MaxIPPacketSize is the maximum supported IP packet size, excluding // jumbograms. The maximum IPv4 packet size is 64k-1 (total size must fit // in 16 bits). For IPv6, the payload max size (excluding jumbograms) is // 64k-1 (also needs to fit in 16 bits). So we use 64k - 1 + 2 * m, where // m is the minimum IPv6 header size; we leave room for some potential // IP options. MaxIPPacketSize = 0xffff + 2*IPv6MinimumSize ) // Transport offers generic methods to query and/or update the fields of the // header of a transport protocol buffer. type Transport interface { // SourcePort returns the value of the "source port" field. SourcePort() uint16 // Destination returns the value of the "destination port" field. DestinationPort() uint16 // Checksum returns the value of the "checksum" field. Checksum() uint16 // SetSourcePort sets the value of the "source port" field. SetSourcePort(uint16) // SetDestinationPort sets the value of the "destination port" field. SetDestinationPort(uint16) // SetChecksum sets the value of the "checksum" field. SetChecksum(uint16) // Payload returns the data carried in the transport buffer. 
Payload() []byte } // ChecksummableTransport is a Transport that supports checksumming. type ChecksummableTransport interface { Transport // SetSourcePortWithChecksumUpdate sets the source port and updates // the checksum. // // The receiver's checksum must be a fully calculated checksum. SetSourcePortWithChecksumUpdate(port uint16) // SetDestinationPortWithChecksumUpdate sets the destination port and updates // the checksum. // // The receiver's checksum must be a fully calculated checksum. SetDestinationPortWithChecksumUpdate(port uint16) // UpdateChecksumPseudoHeaderAddress updates the checksum to reflect an // updated address in the pseudo header. // // If fullChecksum is true, the receiver's checksum field is assumed to hold a // fully calculated checksum. Otherwise, it is assumed to hold a partially // calculated checksum which only reflects the pseudo header. UpdateChecksumPseudoHeaderAddress(old, new tcpip.Address, fullChecksum bool) } // Network offers generic methods to query and/or update the fields of the // header of a network protocol buffer. type Network interface { // SourceAddress returns the value of the "source address" field. SourceAddress() tcpip.Address // DestinationAddress returns the value of the "destination address" // field. DestinationAddress() tcpip.Address // Checksum returns the value of the "checksum" field. Checksum() uint16 // SetSourceAddress sets the value of the "source address" field. SetSourceAddress(tcpip.Address) // SetDestinationAddress sets the value of the "destination address" // field. SetDestinationAddress(tcpip.Address) // SetChecksum sets the value of the "checksum" field. SetChecksum(uint16) // TransportProtocol returns the number of the transport protocol // stored in the payload. TransportProtocol() tcpip.TransportProtocolNumber // Payload returns a byte slice containing the payload of the network // packet. Payload() []byte // TOS returns the values of the "type of service" and "flow label" fields. TOS() (uint8, uint32) // SetTOS sets the values of the "type of service" and "flow label" fields. SetTOS(t uint8, l uint32) } // ChecksummableNetwork is a Network that supports checksumming. type ChecksummableNetwork interface { Network // SetSourceAddressAndChecksum sets the source address and updates the // checksum to reflect the new address. SetSourceAddressWithChecksumUpdate(tcpip.Address) // SetDestinationAddressAndChecksum sets the destination address and // updates the checksum to reflect the new address. SetDestinationAddressWithChecksumUpdate(tcpip.Address) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/ipv4.go000066400000000000000000001226251465435605700232140ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
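// An illustrative note (not from the original source): the Transport and
// Network interfaces above are satisfied by the concrete header types in this
// package, which can be checked at compile time with blank assignments such
// as:
//
//	var _ Transport = IGMP(nil)
//	var _ ChecksummableNetwork = IPv4(nil)
//
// IGMP has no ports, so its port accessors simply return or accept zeroes.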
package header import ( "encoding/binary" "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checksum" ) // RFC 971 defines the fields of the IPv4 header on page 11 using the following // diagram: ("Figure 4") // // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // |Version| IHL |Type of Service| Total Length | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Identification |Flags| Fragment Offset | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Time to Live | Protocol | Header Checksum | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Source Address | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Destination Address | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Options | Padding | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ const ( versIHL = 0 tos = 1 // IPv4TotalLenOffset is the offset of the total length field in the // IPv4 header. IPv4TotalLenOffset = 2 id = 4 flagsFO = 6 ttl = 8 protocol = 9 xsum = 10 srcAddr = 12 dstAddr = 16 options = 20 ) // IPv4Fields contains the fields of an IPv4 packet. It is used to describe the // fields of a packet that needs to be encoded. The IHL field is not here as // it is totally defined by the size of the options. type IPv4Fields struct { // TOS is the "type of service" field of an IPv4 packet. TOS uint8 // TotalLength is the "total length" field of an IPv4 packet. TotalLength uint16 // ID is the "identification" field of an IPv4 packet. ID uint16 // Flags is the "flags" field of an IPv4 packet. Flags uint8 // FragmentOffset is the "fragment offset" field of an IPv4 packet. FragmentOffset uint16 // TTL is the "time to live" field of an IPv4 packet. TTL uint8 // Protocol is the "protocol" field of an IPv4 packet. Protocol uint8 // Checksum is the "checksum" field of an IPv4 packet. Checksum uint16 // SrcAddr is the "source ip address" of an IPv4 packet. SrcAddr tcpip.Address // DstAddr is the "destination ip address" of an IPv4 packet. DstAddr tcpip.Address // Options must be 40 bytes or less as they must fit along with the // rest of the IPv4 header into the maximum size describable in the // IHL field. RFC 791 section 3.1 says: // IHL: 4 bits // // Internet Header Length is the length of the internet header in 32 // bit words, and thus points to the beginning of the data. Note that // the minimum value for a correct header is 5. // // That leaves ten 32 bit (4 byte) fields for options. An attempt to encode // more will fail. Options IPv4OptionsSerializer } // IPv4 is an IPv4 header. // Most of the methods of IPv4 access to the underlying slice without // checking the boundaries and could panic because of 'index out of range'. // Always call IsValid() to validate an instance of IPv4 before using other // methods. type IPv4 []byte const ( // IPv4MinimumSize is the minimum size of a valid IPv4 packet; // i.e. a packet header with no options. IPv4MinimumSize = 20 // IPv4MaximumHeaderSize is the maximum size of an IPv4 header. Given // that there are only 4 bits (max 0xF (15)) to represent the header length // in 32-bit (4 byte) units, the header cannot exceed 15*4 = 60 bytes. IPv4MaximumHeaderSize = 60 // IPv4MaximumOptionsSize is the largest size the IPv4 options can be. 
IPv4MaximumOptionsSize = IPv4MaximumHeaderSize - IPv4MinimumSize // IPv4MaximumPayloadSize is the maximum size of a valid IPv4 payload. // // Linux limits this to 65,515 octets (the max IP datagram size - the IPv4 // header size). But RFC 791 section 3.2 discusses the design of the IPv4 // fragment "allows 2**13 = 8192 fragments of 8 octets each for a total of // 65,536 octets. Note that this is consistent with the datagram total // length field (of course, the header is counted in the total length and not // in the fragments)." IPv4MaximumPayloadSize = 65536 // MinIPFragmentPayloadSize is the minimum number of payload bytes that // the first fragment must carry when an IPv4 packet is fragmented. MinIPFragmentPayloadSize = 8 // IPv4AddressSize is the size, in bytes, of an IPv4 address. IPv4AddressSize = 4 // IPv4AddressSizeBits is the size, in bits, of an IPv4 address. IPv4AddressSizeBits = 32 // IPv4ProtocolNumber is IPv4's network protocol number. IPv4ProtocolNumber tcpip.NetworkProtocolNumber = 0x0800 // IPv4Version is the version of the IPv4 protocol. IPv4Version = 4 // IPv4MinimumProcessableDatagramSize is the minimum size of an IP // packet that every IPv4 capable host must be able to // process/reassemble. IPv4MinimumProcessableDatagramSize = 576 // IPv4MinimumMTU is the minimum MTU required by IPv4, per RFC 791, // section 3.2: // Every internet module must be able to forward a datagram of 68 octets // without further fragmentation. This is because an internet header may be // up to 60 octets, and the minimum fragment is 8 octets. IPv4MinimumMTU = 68 ) var ( // IPv4AllSystems is the all systems IPv4 multicast address as per // IANA's IPv4 Multicast Address Space Registry. See // https://www.iana.org/assignments/multicast-addresses/multicast-addresses.xhtml. IPv4AllSystems = tcpip.AddrFrom4([4]byte{0xe0, 0x00, 0x00, 0x01}) // IPv4Broadcast is the broadcast address of the IPv4 procotol. IPv4Broadcast = tcpip.AddrFrom4([4]byte{0xff, 0xff, 0xff, 0xff}) // IPv4Any is the non-routable IPv4 "any" meta address. IPv4Any = tcpip.AddrFrom4([4]byte{0x00, 0x00, 0x00, 0x00}) // IPv4AllRoutersGroup is a multicast address for all routers. IPv4AllRoutersGroup = tcpip.AddrFrom4([4]byte{0xe0, 0x00, 0x00, 0x02}) // IPv4Loopback is the loopback IPv4 address. IPv4Loopback = tcpip.AddrFrom4([4]byte{0x7f, 0x00, 0x00, 0x01}) ) // Flags that may be set in an IPv4 packet. const ( IPv4FlagMoreFragments = 1 << iota IPv4FlagDontFragment ) // ipv4LinkLocalUnicastSubnet is the IPv4 link local unicast subnet as defined // by RFC 3927 section 1. var ipv4LinkLocalUnicastSubnet = func() tcpip.Subnet { subnet, err := tcpip.NewSubnet(tcpip.AddrFrom4([4]byte{0xa9, 0xfe, 0x00, 0x00}), tcpip.MaskFrom("\xff\xff\x00\x00")) if err != nil { panic(err) } return subnet }() // ipv4LinkLocalMulticastSubnet is the IPv4 link local multicast subnet as // defined by RFC 5771 section 4. var ipv4LinkLocalMulticastSubnet = func() tcpip.Subnet { subnet, err := tcpip.NewSubnet(tcpip.AddrFrom4([4]byte{0xe0, 0x00, 0x00, 0x00}), tcpip.MaskFrom("\xff\xff\xff\x00")) if err != nil { panic(err) } return subnet }() // IPv4EmptySubnet is the empty IPv4 subnet. 
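// exampleIPv4WellKnownAddresses is an illustrative sketch (not from the
// original source) of how the well-known addresses and subnets defined above
// are typically consulted; 169.254.0.0/16 is the link-local unicast range
// expressed by the "\xff\xff\x00\x00" mask, and 224.0.0.1 is IPv4AllSystems.
func exampleIPv4WellKnownAddresses() (bool, bool) {
	linkLocal := tcpip.AddrFrom4([4]byte{169, 254, 1, 1})
	allSystems := tcpip.AddrFrom4([4]byte{224, 0, 0, 1})
	return ipv4LinkLocalUnicastSubnet.Contains(linkLocal), allSystems == IPv4AllSystems
}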
var IPv4EmptySubnet = func() tcpip.Subnet { subnet, err := tcpip.NewSubnet(IPv4Any, tcpip.MaskFrom("\x00\x00\x00\x00")) if err != nil { panic(err) } return subnet }() // IPv4CurrentNetworkSubnet is the subnet of addresses for the current network, // per RFC 6890 section 2.2.2, // // +----------------------+----------------------------+ // | Attribute | Value | // +----------------------+----------------------------+ // | Address Block | 0.0.0.0/8 | // | Name | "This host on this network"| // | RFC | [RFC1122], Section 3.2.1.3 | // | Allocation Date | September 1981 | // | Termination Date | N/A | // | Source | True | // | Destination | False | // | Forwardable | False | // | Global | False | // | Reserved-by-Protocol | True | // +----------------------+----------------------------+ var IPv4CurrentNetworkSubnet = func() tcpip.Subnet { subnet, err := tcpip.NewSubnet(IPv4Any, tcpip.MaskFrom("\xff\x00\x00\x00")) if err != nil { panic(err) } return subnet }() // IPv4LoopbackSubnet is the loopback subnet for IPv4. var IPv4LoopbackSubnet = func() tcpip.Subnet { subnet, err := tcpip.NewSubnet(tcpip.AddrFrom4([4]byte{0x7f, 0x00, 0x00, 0x00}), tcpip.MaskFrom("\xff\x00\x00\x00")) if err != nil { panic(err) } return subnet }() // IPVersion returns the version of IP used in the given packet. It returns -1 // if the packet is not large enough to contain the version field. func IPVersion(b []byte) int { // Length must be at least offset+length of version field. if len(b) < versIHL+1 { return -1 } return int(b[versIHL] >> ipVersionShift) } // RFC 791 page 11 shows the header length (IHL) is in the lower 4 bits // of the first byte, and is counted in multiples of 4 bytes. // // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // |Version| IHL |Type of Service| Total Length | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // (...) // Version: 4 bits // The Version field indicates the format of the internet header. This // document describes version 4. // // IHL: 4 bits // Internet Header Length is the length of the internet header in 32 // bit words, and thus points to the beginning of the data. Note that // the minimum value for a correct header is 5. const ( ipVersionShift = 4 ipIHLMask = 0x0f IPv4IHLStride = 4 ) // HeaderLength returns the value of the "header length" field of the IPv4 // header. The length returned is in bytes. func (b IPv4) HeaderLength() uint8 { return (b[versIHL] & ipIHLMask) * IPv4IHLStride } // SetHeaderLength sets the value of the "Internet Header Length" field. func (b IPv4) SetHeaderLength(hdrLen uint8) { if hdrLen > IPv4MaximumHeaderSize { panic(fmt.Sprintf("got IPv4 Header size = %d, want <= %d", hdrLen, IPv4MaximumHeaderSize)) } b[versIHL] = (IPv4Version << ipVersionShift) | ((hdrLen / IPv4IHLStride) & ipIHLMask) } // ID returns the value of the identifier field of the IPv4 header. func (b IPv4) ID() uint16 { return binary.BigEndian.Uint16(b[id:]) } // Protocol returns the value of the protocol field of the IPv4 header. func (b IPv4) Protocol() uint8 { return b[protocol] } // Flags returns the "flags" field of the IPv4 header. func (b IPv4) Flags() uint8 { return uint8(binary.BigEndian.Uint16(b[flagsFO:]) >> 13) } // More returns whether the more fragments flag is set. func (b IPv4) More() bool { return b.Flags()&IPv4FlagMoreFragments != 0 } // TTL returns the "TTL" field of the IPv4 header. 
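// exampleIPv4VersionIHL is an illustrative sketch (not from the original
// source) of the version/IHL packing handled by IPVersion, HeaderLength and
// SetHeaderLength above: 0x45 is version 4 with a 20-byte (5 * 4) header, the
// most common first byte of an IPv4 packet.
func exampleIPv4VersionIHL() (int, uint8) {
	b := IPv4(make([]byte, IPv4MinimumSize))
	b.SetHeaderLength(IPv4MinimumSize)    // writes 0x45 into the first byte
	return IPVersion(b), b.HeaderLength() // 4 and 20
}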
func (b IPv4) TTL() uint8 { return b[ttl] } // FragmentOffset returns the "fragment offset" field of the IPv4 header. func (b IPv4) FragmentOffset() uint16 { return binary.BigEndian.Uint16(b[flagsFO:]) << 3 } // TotalLength returns the "total length" field of the IPv4 header. func (b IPv4) TotalLength() uint16 { return binary.BigEndian.Uint16(b[IPv4TotalLenOffset:]) } // Checksum returns the checksum field of the IPv4 header. func (b IPv4) Checksum() uint16 { return binary.BigEndian.Uint16(b[xsum:]) } // SourceAddress returns the "source address" field of the IPv4 header. func (b IPv4) SourceAddress() tcpip.Address { return tcpip.AddrFrom4([4]byte(b[srcAddr : srcAddr+IPv4AddressSize])) } // DestinationAddress returns the "destination address" field of the IPv4 // header. func (b IPv4) DestinationAddress() tcpip.Address { return tcpip.AddrFrom4([4]byte(b[dstAddr : dstAddr+IPv4AddressSize])) } // SourceAddressSlice returns the "source address" field of the IPv4 header as a // byte slice. func (b IPv4) SourceAddressSlice() []byte { return []byte(b[srcAddr : srcAddr+IPv4AddressSize]) } // DestinationAddressSlice returns the "destination address" field of the IPv4 // header as a byte slice. func (b IPv4) DestinationAddressSlice() []byte { return []byte(b[dstAddr : dstAddr+IPv4AddressSize]) } // SetSourceAddressWithChecksumUpdate implements ChecksummableNetwork. func (b IPv4) SetSourceAddressWithChecksumUpdate(new tcpip.Address) { b.SetChecksum(^checksumUpdate2ByteAlignedAddress(^b.Checksum(), b.SourceAddress(), new)) b.SetSourceAddress(new) } // SetDestinationAddressWithChecksumUpdate implements ChecksummableNetwork. func (b IPv4) SetDestinationAddressWithChecksumUpdate(new tcpip.Address) { b.SetChecksum(^checksumUpdate2ByteAlignedAddress(^b.Checksum(), b.DestinationAddress(), new)) b.SetDestinationAddress(new) } // padIPv4OptionsLength returns the total length for IPv4 options of length l // after applying padding according to RFC 791: // // The internet header padding is used to ensure that the internet // header ends on a 32 bit boundary. func padIPv4OptionsLength(length uint8) uint8 { return (length + IPv4IHLStride - 1) & ^uint8(IPv4IHLStride-1) } // IPv4Options is a buffer that holds all the raw IP options. type IPv4Options []byte // Options returns a buffer holding the options. func (b IPv4) Options() IPv4Options { hdrLen := b.HeaderLength() return IPv4Options(b[options:hdrLen:hdrLen]) } // TransportProtocol implements Network.TransportProtocol. func (b IPv4) TransportProtocol() tcpip.TransportProtocolNumber { return tcpip.TransportProtocolNumber(b.Protocol()) } // Payload implements Network.Payload. func (b IPv4) Payload() []byte { return b[b.HeaderLength():][:b.PayloadLength()] } // PayloadLength returns the length of the payload portion of the IPv4 packet. func (b IPv4) PayloadLength() uint16 { return b.TotalLength() - uint16(b.HeaderLength()) } // TOS returns the "type of service" field of the IPv4 header. func (b IPv4) TOS() (uint8, uint32) { return b[tos], 0 } // SetTOS sets the "type of service" field of the IPv4 header. func (b IPv4) SetTOS(v uint8, _ uint32) { b[tos] = v } // SetTTL sets the "Time to Live" field of the IPv4 header. func (b IPv4) SetTTL(v byte) { b[ttl] = v } // SetTotalLength sets the "total length" field of the IPv4 header. func (b IPv4) SetTotalLength(totalLength uint16) { binary.BigEndian.PutUint16(b[IPv4TotalLenOffset:], totalLength) } // SetChecksum sets the checksum field of the IPv4 header. 
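// exampleIPv4Fragmentation is an illustrative sketch (not from the original
// source) of the flags/fragment-offset packing used by FragmentOffset above
// and SetFlagsFragmentOffset below: the offset travels in 8-octet units, so
// only multiples of 8 survive the round trip. 1480 is a typical fragment
// offset for an Ethernet-sized MTU and is used here only as an example.
func exampleIPv4Fragmentation() (uint16, bool) {
	b := IPv4(make([]byte, IPv4MinimumSize))
	b.SetFlagsFragmentOffset(IPv4FlagMoreFragments, 1480)
	return b.FragmentOffset(), b.More() // 1480 and true
}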
func (b IPv4) SetChecksum(v uint16) { checksum.Put(b[xsum:], v) } // SetFlagsFragmentOffset sets the "flags" and "fragment offset" fields of the // IPv4 header. func (b IPv4) SetFlagsFragmentOffset(flags uint8, offset uint16) { v := (uint16(flags) << 13) | (offset >> 3) binary.BigEndian.PutUint16(b[flagsFO:], v) } // SetID sets the identification field. func (b IPv4) SetID(v uint16) { binary.BigEndian.PutUint16(b[id:], v) } // SetSourceAddress sets the "source address" field of the IPv4 header. func (b IPv4) SetSourceAddress(addr tcpip.Address) { copy(b[srcAddr:srcAddr+IPv4AddressSize], addr.AsSlice()) } // SetDestinationAddress sets the "destination address" field of the IPv4 // header. func (b IPv4) SetDestinationAddress(addr tcpip.Address) { copy(b[dstAddr:dstAddr+IPv4AddressSize], addr.AsSlice()) } // CalculateChecksum calculates the checksum of the IPv4 header. func (b IPv4) CalculateChecksum() uint16 { return checksum.Checksum(b[:b.HeaderLength()], 0) } // Encode encodes all the fields of the IPv4 header. func (b IPv4) Encode(i *IPv4Fields) { // The size of the options defines the size of the whole header and thus the // IHL field. Options are rare and this is a heavily used function so it is // worth a bit of optimisation here to keep the serializer out of the fast // path. hdrLen := uint8(IPv4MinimumSize) if len(i.Options) != 0 { hdrLen += i.Options.Serialize(b[options:]) } if hdrLen > IPv4MaximumHeaderSize { panic(fmt.Sprintf("%d is larger than maximum IPv4 header size of %d", hdrLen, IPv4MaximumHeaderSize)) } b.SetHeaderLength(hdrLen) b[tos] = i.TOS b.SetTotalLength(i.TotalLength) binary.BigEndian.PutUint16(b[id:], i.ID) b.SetFlagsFragmentOffset(i.Flags, i.FragmentOffset) b[ttl] = i.TTL b[protocol] = i.Protocol b.SetChecksum(i.Checksum) copy(b[srcAddr:srcAddr+IPv4AddressSize], i.SrcAddr.AsSlice()) copy(b[dstAddr:dstAddr+IPv4AddressSize], i.DstAddr.AsSlice()) } // EncodePartial updates the total length and checksum fields of IPv4 header, // taking in the partial checksum, which is the checksum of the header without // the total length and checksum fields. It is useful in cases when similar // packets are produced. func (b IPv4) EncodePartial(partialChecksum, totalLength uint16) { b.SetTotalLength(totalLength) xsum := checksum.Checksum(b[IPv4TotalLenOffset:IPv4TotalLenOffset+2], partialChecksum) b.SetChecksum(^xsum) } // IsValid performs basic validation on the packet. func (b IPv4) IsValid(pktSize int) bool { if len(b) < IPv4MinimumSize { return false } hlen := int(b.HeaderLength()) tlen := int(b.TotalLength()) if hlen < IPv4MinimumSize || hlen > tlen || tlen > pktSize { return false } if IPVersion(b) != IPv4Version { return false } return true } // IsV4LinkLocalUnicastAddress determines if the provided address is an IPv4 // link-local unicast address. func IsV4LinkLocalUnicastAddress(addr tcpip.Address) bool { return ipv4LinkLocalUnicastSubnet.Contains(addr) } // IsV4LinkLocalMulticastAddress determines if the provided address is an IPv4 // link-local multicast address. func IsV4LinkLocalMulticastAddress(addr tcpip.Address) bool { return ipv4LinkLocalMulticastSubnet.Contains(addr) } // IsChecksumValid returns true iff the IPv4 header's checksum is valid. func (b IPv4) IsChecksumValid() bool { // There has been some confusion regarding verifying checksums. We need // just look for negative 0 (0xffff) as the checksum, as it's not possible to // get positive 0 (0) for the checksum. 
Some bad implementations could get it // when doing entry replacement in the early days of the Internet, // however the lore that one needs to check for both persists. // // RFC 1624 section 1 describes the source of this confusion as: // [the partial recalculation method described in RFC 1071] computes a // result for certain cases that differs from the one obtained from // scratch (one's complement of one's complement sum of the original // fields). // // However RFC 1624 section 5 clarifies that if using the verification method // "recommended by RFC 1071, it does not matter if an intermediate system // generated a -0 instead of +0". // // RFC1071 page 1 specifies the verification method as: // (3) To check a checksum, the 1's complement sum is computed over the // same set of octets, including the checksum field. If the result // is all 1 bits (-0 in 1's complement arithmetic), the check // succeeds. return b.CalculateChecksum() == 0xffff } // IsV4MulticastAddress determines if the provided address is an IPv4 multicast // address (range 224.0.0.0 to 239.255.255.255). The four most significant bits // will be 1110 = 0xe0. func IsV4MulticastAddress(addr tcpip.Address) bool { if addr.BitLen() != IPv4AddressSizeBits { return false } addrBytes := addr.As4() return (addrBytes[0] & 0xf0) == 0xe0 } // IsV4LoopbackAddress determines if the provided address is an IPv4 loopback // address (belongs to 127.0.0.0/8 subnet). See RFC 1122 section 3.2.1.3. func IsV4LoopbackAddress(addr tcpip.Address) bool { if addr.BitLen() != IPv4AddressSizeBits { return false } addrBytes := addr.As4() return addrBytes[0] == 0x7f } // ========================= Options ========================== // An IPv4OptionType can hold the value for the Type in an IPv4 option. type IPv4OptionType byte // These constants are needed to identify individual options in the option list. // While RFC 791 (page 31) says "Every internet module must be able to act on // every option." This has not generally been adhered to and some options have // very low rates of support. We do not support options other than those shown // below. const ( // IPv4OptionListEndType is the option type for the End Of Option List // option. Anything following is ignored. IPv4OptionListEndType IPv4OptionType = 0 // IPv4OptionNOPType is the No-Operation option. May appear between other // options and may appear multiple times. IPv4OptionNOPType IPv4OptionType = 1 // IPv4OptionRouterAlertType is the option type for the Router Alert option, // defined in RFC 2113 Section 2.1. IPv4OptionRouterAlertType IPv4OptionType = 20 | 0x80 // IPv4OptionRecordRouteType is used by each router on the path of the packet // to record its path. It is carried over to an Echo Reply. IPv4OptionRecordRouteType IPv4OptionType = 7 // IPv4OptionTimestampType is the option type for the Timestamp option. IPv4OptionTimestampType IPv4OptionType = 68 // ipv4OptionTypeOffset is the offset in an option of its type field. ipv4OptionTypeOffset = 0 // IPv4OptionLengthOffset is the offset in an option of its length field. IPv4OptionLengthOffset = 1 ) // IPv4OptParameterProblem indicates that a Parameter Problem message // should be generated, and gives the offset in the current entity // that should be used in that packet. type IPv4OptParameterProblem struct { Pointer uint8 NeedICMP bool } // IPv4Option is an interface representing various option types. type IPv4Option interface { // Type returns the type identifier of the option. 
Type() IPv4OptionType // Size returns the size of the option in bytes. Size() uint8 // Contents returns a slice holding the contents of the option. Contents() []byte } var _ IPv4Option = (*IPv4OptionGeneric)(nil) // IPv4OptionGeneric is an IPv4 Option of unknown type. type IPv4OptionGeneric []byte // Type implements IPv4Option. func (o *IPv4OptionGeneric) Type() IPv4OptionType { return IPv4OptionType((*o)[ipv4OptionTypeOffset]) } // Size implements IPv4Option. func (o *IPv4OptionGeneric) Size() uint8 { return uint8(len(*o)) } // Contents implements IPv4Option. func (o *IPv4OptionGeneric) Contents() []byte { return *o } // IPv4OptionIterator is an iterator pointing to a specific IP option // at any point of time. It also holds information as to a new options buffer // that we are building up to hand back to the caller. // TODO(https://gvisor.dev/issues/5513): Add unit tests for IPv4OptionIterator. type IPv4OptionIterator struct { options IPv4Options // ErrCursor is where we are while parsing options. It is exported as any // resulting ICMP packet is supposed to have a pointer to the byte within // the IP packet where the error was detected. ErrCursor uint8 nextErrCursor uint8 newOptions [IPv4MaximumOptionsSize]byte writePoint int } // MakeIterator sets up and returns an iterator of options. It also sets up the // building of a new option set. func (o IPv4Options) MakeIterator() IPv4OptionIterator { return IPv4OptionIterator{ options: o, nextErrCursor: IPv4MinimumSize, } } // InitReplacement copies the option into the new option buffer. func (i *IPv4OptionIterator) InitReplacement(option IPv4Option) IPv4Options { replacementOption := i.RemainingBuffer()[:option.Size()] if copied := copy(replacementOption, option.Contents()); copied != len(replacementOption) { panic(fmt.Sprintf("copied %d bytes in the replacement option buffer, expected %d bytes", copied, len(replacementOption))) } return replacementOption } // RemainingBuffer returns the remaining (unused) part of the new option buffer, // into which a new option may be written. func (i *IPv4OptionIterator) RemainingBuffer() IPv4Options { return i.newOptions[i.writePoint:] } // ConsumeBuffer marks a portion of the new buffer as used. func (i *IPv4OptionIterator) ConsumeBuffer(size int) { i.writePoint += size } // PushNOPOrEnd puts one of the single byte options onto the new options. // Only values 0 or 1 (ListEnd or NOP) are valid input. func (i *IPv4OptionIterator) PushNOPOrEnd(val IPv4OptionType) { if val > IPv4OptionNOPType { panic(fmt.Sprintf("invalid option type %d pushed onto option build buffer", val)) } i.newOptions[i.writePoint] = byte(val) i.writePoint++ } // Finalize returns the completed replacement options buffer padded // as needed. func (i *IPv4OptionIterator) Finalize() IPv4Options { // RFC 791 page 31 says: // The options might not end on a 32-bit boundary. The internet header // must be filled out with octets of zeros. The first of these would // be interpreted as the end-of-options option, and the remainder as // internet header padding. // Since the buffer is already zero filled we just need to step the write // pointer up to the next multiple of 4. options := IPv4Options(i.newOptions[:(i.writePoint+0x3) & ^0x3]) // Poison the write pointer. i.writePoint = len(i.newOptions) return options } // Next returns the next IP option in the buffer/list of IP options. // It returns // - A slice of bytes holding the next option or nil if there is error. // - A boolean which is true if parsing of all the options is complete. 
// Undefined in the case of error. // - An error indication which is non-nil if an error condition was found. func (i *IPv4OptionIterator) Next() (IPv4Option, bool, *IPv4OptParameterProblem) { // The opts slice gets shorter as we process the options. When we have no // bytes left we are done. if len(i.options) == 0 { return nil, true, nil } i.ErrCursor = i.nextErrCursor optType := IPv4OptionType(i.options[ipv4OptionTypeOffset]) if optType == IPv4OptionNOPType || optType == IPv4OptionListEndType { optionBody := i.options[:1] i.options = i.options[1:] i.nextErrCursor = i.ErrCursor + 1 retval := IPv4OptionGeneric(optionBody) return &retval, false, nil } // There are no more single byte options defined. All the rest have a length // field so we need to sanity check it. if len(i.options) == 1 { return nil, false, &IPv4OptParameterProblem{ Pointer: i.ErrCursor, NeedICMP: true, } } optLen := i.options[IPv4OptionLengthOffset] if optLen <= IPv4OptionLengthOffset || optLen > uint8(len(i.options)) { // The actual error is in the length (2nd byte of the option) but we // return the start of the option for compatibility with Linux. return nil, false, &IPv4OptParameterProblem{ Pointer: i.ErrCursor, NeedICMP: true, } } optionBody := i.options[:optLen] i.nextErrCursor = i.ErrCursor + optLen i.options = i.options[optLen:] // Check the length of some option types that we know. switch optType { case IPv4OptionTimestampType: if optLen < IPv4OptionTimestampHdrLength { i.ErrCursor++ return nil, false, &IPv4OptParameterProblem{ Pointer: i.ErrCursor, NeedICMP: true, } } retval := IPv4OptionTimestamp(optionBody) return &retval, false, nil case IPv4OptionRecordRouteType: if optLen < IPv4OptionRecordRouteHdrLength { i.ErrCursor++ return nil, false, &IPv4OptParameterProblem{ Pointer: i.ErrCursor, NeedICMP: true, } } retval := IPv4OptionRecordRoute(optionBody) return &retval, false, nil case IPv4OptionRouterAlertType: if optLen != IPv4OptionRouterAlertLength { i.ErrCursor++ return nil, false, &IPv4OptParameterProblem{ Pointer: i.ErrCursor, NeedICMP: true, } } retval := IPv4OptionRouterAlert(optionBody) return &retval, false, nil } retval := IPv4OptionGeneric(optionBody) return &retval, false, nil } // // IP Timestamp option - RFC 791 page 22. // +--------+--------+--------+--------+ // |01000100| length | pointer|oflw|flg| // +--------+--------+--------+--------+ // | internet address | // +--------+--------+--------+--------+ // | timestamp | // +--------+--------+--------+--------+ // | ... | // // Type = 68 // // The Option Length is the number of octets in the option counting // the type, length, pointer, and overflow/flag octets (maximum // length 40). // // The Pointer is the number of octets from the beginning of this // option to the end of timestamps plus one (i.e., it points to the // octet beginning the space for next timestamp). The smallest // legal value is 5. The timestamp area is full when the pointer // is greater than the length. // // The Overflow (oflw) [4 bits] is the number of IP modules that // cannot register timestamps due to lack of space. // // The Flag (flg) [4 bits] values are // // 0 -- time stamps only, stored in consecutive 32-bit words, // // 1 -- each timestamp is preceded with internet address of the // registering entity, // // 3 -- the internet address fields are prespecified. An IP // module only registers its timestamp if it matches its own // address with the next specified internet address. // // Timestamps are defined in RFC 791 page 22 as milliseconds since midnight UTC. 
// // The Timestamp is a right-justified, 32-bit timestamp in // milliseconds since midnight UT. If the time is not available in // milliseconds or cannot be provided with respect to midnight UT // then any time may be inserted as a timestamp provided the high // order bit of the timestamp field is set to one to indicate the // use of a non-standard value. // IPv4OptTSFlags sefines the values expected in the Timestamp // option Flags field. type IPv4OptTSFlags uint8 // Timestamp option specific related constants. const ( // IPv4OptionTimestampHdrLength is the length of the timestamp option header. IPv4OptionTimestampHdrLength = 4 // IPv4OptionTimestampSize is the size of an IP timestamp. IPv4OptionTimestampSize = 4 // IPv4OptionTimestampWithAddrSize is the size of an IP timestamp + Address. IPv4OptionTimestampWithAddrSize = IPv4AddressSize + IPv4OptionTimestampSize // IPv4OptionTimestampMaxSize is limited by space for options IPv4OptionTimestampMaxSize = IPv4MaximumOptionsSize // IPv4OptionTimestampOnlyFlag is a flag indicating that only timestamp // is present. IPv4OptionTimestampOnlyFlag IPv4OptTSFlags = 0 // IPv4OptionTimestampWithIPFlag is a flag indicating that both timestamps and // IP are present. IPv4OptionTimestampWithIPFlag IPv4OptTSFlags = 1 // IPv4OptionTimestampWithPredefinedIPFlag is a flag indicating that // predefined IP is present. IPv4OptionTimestampWithPredefinedIPFlag IPv4OptTSFlags = 3 ) // ipv4TimestampTime provides the current time as specified in RFC 791. func ipv4TimestampTime(clock tcpip.Clock) uint32 { // Per RFC 791 page 21: // The Timestamp is a right-justified, 32-bit timestamp in // milliseconds since midnight UT. now := clock.Now().UTC() midnight := now.Truncate(24 * time.Hour) return uint32(now.Sub(midnight).Milliseconds()) } // IP Timestamp option fields. const ( // IPv4OptTSPointerOffset is the offset of the Timestamp pointer field. IPv4OptTSPointerOffset = 2 // IPv4OptTSPointerOffset is the offset of the combined Flag and Overflow // fields, (each being 4 bits). IPv4OptTSOFLWAndFLGOffset = 3 // These constants define the sub byte fields of the Flag and OverFlow field. ipv4OptionTimestampOverflowshift = 4 ipv4OptionTimestampFlagsMask byte = 0x0f ) var _ IPv4Option = (*IPv4OptionTimestamp)(nil) // IPv4OptionTimestamp is a Timestamp option from RFC 791. type IPv4OptionTimestamp []byte // Type implements IPv4Option.Type(). func (ts *IPv4OptionTimestamp) Type() IPv4OptionType { return IPv4OptionTimestampType } // Size implements IPv4Option. func (ts *IPv4OptionTimestamp) Size() uint8 { return uint8(len(*ts)) } // Contents implements IPv4Option. func (ts *IPv4OptionTimestamp) Contents() []byte { return *ts } // Pointer returns the pointer field in the IP Timestamp option. func (ts *IPv4OptionTimestamp) Pointer() uint8 { return (*ts)[IPv4OptTSPointerOffset] } // Flags returns the flags field in the IP Timestamp option. func (ts *IPv4OptionTimestamp) Flags() IPv4OptTSFlags { return IPv4OptTSFlags((*ts)[IPv4OptTSOFLWAndFLGOffset] & ipv4OptionTimestampFlagsMask) } // Overflow returns the Overflow field in the IP Timestamp option. func (ts *IPv4OptionTimestamp) Overflow() uint8 { return (*ts)[IPv4OptTSOFLWAndFLGOffset] >> ipv4OptionTimestampOverflowshift } // IncOverflow increments the Overflow field in the IP Timestamp option. It // returns the incremented value. If the return value is 0 then the field // overflowed. 
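// exampleIPv4TimestampOption is an illustrative sketch (not from the original
// source): a raw 12-byte timestamp option (header plus two empty 32-bit
// slots) with the "timestamps only" flag, a pointer of 5 and an overflow
// count of 2, decoded with the accessors above. The values are arbitrary.
func exampleIPv4TimestampOption() (uint8, IPv4OptTSFlags, uint8) {
	opt := IPv4OptionTimestamp{
		byte(IPv4OptionTimestampType), 12, 5, 0x20, // type, length, pointer, oflw/flg
		0, 0, 0, 0, // first timestamp slot
		0, 0, 0, 0, // second timestamp slot
	}
	return opt.Pointer(), opt.Flags(), opt.Overflow() // 5, IPv4OptionTimestampOnlyFlag, 2
}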
func (ts *IPv4OptionTimestamp) IncOverflow() uint8 { (*ts)[IPv4OptTSOFLWAndFLGOffset] += 1 << ipv4OptionTimestampOverflowshift return ts.Overflow() } // UpdateTimestamp updates the fields of the next free timestamp slot. func (ts *IPv4OptionTimestamp) UpdateTimestamp(addr tcpip.Address, clock tcpip.Clock) { slot := (*ts)[ts.Pointer()-1:] switch ts.Flags() { case IPv4OptionTimestampOnlyFlag: binary.BigEndian.PutUint32(slot, ipv4TimestampTime(clock)) (*ts)[IPv4OptTSPointerOffset] += IPv4OptionTimestampSize case IPv4OptionTimestampWithIPFlag: if n := copy(slot, addr.AsSlice()); n != IPv4AddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, IPv4AddressSize)) } binary.BigEndian.PutUint32(slot[IPv4AddressSize:], ipv4TimestampTime(clock)) (*ts)[IPv4OptTSPointerOffset] += IPv4OptionTimestampWithAddrSize case IPv4OptionTimestampWithPredefinedIPFlag: if tcpip.AddrFrom4([4]byte(slot[:IPv4AddressSize])) == addr { binary.BigEndian.PutUint32(slot[IPv4AddressSize:], ipv4TimestampTime(clock)) (*ts)[IPv4OptTSPointerOffset] += IPv4OptionTimestampWithAddrSize } } } // RecordRoute option specific related constants. // // from RFC 791 page 20: // // Record Route // // +--------+--------+--------+---------//--------+ // |00000111| length | pointer| route data | // +--------+--------+--------+---------//--------+ // Type=7 // // The record route option provides a means to record the route of // an internet datagram. // // The option begins with the option type code. The second octet // is the option length which includes the option type code and the // length octet, the pointer octet, and length-3 octets of route // data. The third octet is the pointer into the route data // indicating the octet which begins the next area to store a route // address. The pointer is relative to this option, and the // smallest legal value for the pointer is 4. const ( // IPv4OptionRecordRouteHdrLength is the length of the Record Route option // header. IPv4OptionRecordRouteHdrLength = 3 // IPv4OptRRPointerOffset is the offset to the pointer field in an RR // option, which points to the next free slot in the list of addresses. IPv4OptRRPointerOffset = 2 ) var _ IPv4Option = (*IPv4OptionRecordRoute)(nil) // IPv4OptionRecordRoute is an IPv4 RecordRoute option defined by RFC 791. type IPv4OptionRecordRoute []byte // Pointer returns the pointer field in the IP RecordRoute option. func (rr *IPv4OptionRecordRoute) Pointer() uint8 { return (*rr)[IPv4OptRRPointerOffset] } // StoreAddress stores the given IPv4 address into the next free slot. func (rr *IPv4OptionRecordRoute) StoreAddress(addr tcpip.Address) { start := rr.Pointer() - 1 // A one based number. // start and room checked by caller. if n := copy((*rr)[start:], addr.AsSlice()); n != IPv4AddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, IPv4AddressSize)) } (*rr)[IPv4OptRRPointerOffset] += IPv4AddressSize } // Type implements IPv4Option. func (rr *IPv4OptionRecordRoute) Type() IPv4OptionType { return IPv4OptionRecordRouteType } // Size implements IPv4Option. func (rr *IPv4OptionRecordRoute) Size() uint8 { return uint8(len(*rr)) } // Contents implements IPv4Option. func (rr *IPv4OptionRecordRoute) Contents() []byte { return *rr } // Router Alert option specific related constants. 
// // from RFC 2113 section 2.1: // // +--------+--------+--------+--------+ // |10010100|00000100| 2 octet value | // +--------+--------+--------+--------+ // // Type: // Copied flag: 1 (all fragments must carry the option) // Option class: 0 (control) // Option number: 20 (decimal) // // Length: 4 // // Value: A two octet code with the following values: // 0 - Router shall examine packet // 1-65535 - Reserved const ( // IPv4OptionRouterAlertLength is the length of a Router Alert option. IPv4OptionRouterAlertLength = 4 // IPv4OptionRouterAlertValue is the only permissible value of the 16 bit // payload of the router alert option. IPv4OptionRouterAlertValue = 0 // IPv4OptionRouterAlertValueOffset is the offset for the value of a // RouterAlert option. IPv4OptionRouterAlertValueOffset = 2 ) var _ IPv4Option = (*IPv4OptionRouterAlert)(nil) // IPv4OptionRouterAlert is an IPv4 RouterAlert option defined by RFC 2113. type IPv4OptionRouterAlert []byte // Type implements IPv4Option. func (*IPv4OptionRouterAlert) Type() IPv4OptionType { return IPv4OptionRouterAlertType } // Size implements IPv4Option. func (ra *IPv4OptionRouterAlert) Size() uint8 { return uint8(len(*ra)) } // Contents implements IPv4Option. func (ra *IPv4OptionRouterAlert) Contents() []byte { return *ra } // Value returns the value of the IPv4OptionRouterAlert. func (ra *IPv4OptionRouterAlert) Value() uint16 { return binary.BigEndian.Uint16(ra.Contents()[IPv4OptionRouterAlertValueOffset:]) } // IPv4SerializableOption is an interface to represent serializable IPv4 option // types. type IPv4SerializableOption interface { // optionType returns the type identifier of the option. optionType() IPv4OptionType } // IPv4SerializableOptionPayload is an interface providing serialization of the // payload of an IPv4 option. type IPv4SerializableOptionPayload interface { // length returns the size of the payload. length() uint8 // serializeInto serializes the payload into the provided byte buffer. // // Note, the caller MUST provide a byte buffer with size of at least // Length. Implementers of this function may assume that the byte buffer // is of sufficient size. serializeInto MUST panic if the provided byte // buffer is not of sufficient size. // // serializeInto will return the number of bytes that was used to // serialize the receiver. Implementers must only use the number of // bytes required to serialize the receiver. Callers MAY provide a // larger buffer than required to serialize into. serializeInto(buffer []byte) uint8 } // IPv4OptionsSerializer is a serializer for IPv4 options. type IPv4OptionsSerializer []IPv4SerializableOption // Length returns the total number of bytes required to serialize the options. func (s IPv4OptionsSerializer) Length() uint8 { var total uint8 for _, opt := range s { total++ if withPayload, ok := opt.(IPv4SerializableOptionPayload); ok { // Add 1 to reported length to account for the length byte. total += 1 + withPayload.length() } } return padIPv4OptionsLength(total) } // Serialize serializes the provided list of IPV4 options into b. // // Note, b must be of sufficient size to hold all the options in s. See // IPv4OptionsSerializer.Length for details on the getting the total size // of a serialized IPv4OptionsSerializer. // // Serialize panics if b is not of sufficient size to hold all the options in s. 
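// exampleIPv4RouterAlertEncode is an illustrative sketch (not from the
// original source): a Router Alert option serialized through
// IPv4OptionsSerializer occupies 4 bytes (type, length and a two-octet zero
// value), so a header carrying it encodes a 24-byte IHL. The addresses are
// arbitrary and the checksum is left for the caller (see CalculateChecksum).
func exampleIPv4RouterAlertEncode() (uint8, uint8) {
	opts := IPv4OptionsSerializer{&IPv4SerializableRouterAlertOption{}}
	b := IPv4(make([]byte, IPv4MinimumSize+int(opts.Length())))
	b.Encode(&IPv4Fields{
		TotalLength: uint16(len(b)),
		TTL:         64,
		Protocol:    2, // IANA protocol number for IGMP
		SrcAddr:     tcpip.AddrFrom4([4]byte{192, 0, 2, 1}),
		DstAddr:     IPv4AllSystems,
		Options:     opts,
	})
	return opts.Length(), b.HeaderLength() // 4 and 24
}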
func (s IPv4OptionsSerializer) Serialize(b []byte) uint8 { var total uint8 for _, opt := range s { ty := opt.optionType() if withPayload, ok := opt.(IPv4SerializableOptionPayload); ok { // Serialize first to reduce bounds checks. l := 2 + withPayload.serializeInto(b[2:]) b[0] = byte(ty) b[1] = l b = b[l:] total += l continue } // Options without payload consist only of the type field. // // NB: Repeating code from the branch above is intentional to minimize // bounds checks. b[0] = byte(ty) b = b[1:] total++ } // According to RFC 791: // // The internet header padding is used to ensure that the internet // header ends on a 32 bit boundary. The padding is zero. padded := padIPv4OptionsLength(total) b = b[:padded-total] clear(b) return padded } var _ IPv4SerializableOptionPayload = (*IPv4SerializableRouterAlertOption)(nil) var _ IPv4SerializableOption = (*IPv4SerializableRouterAlertOption)(nil) // IPv4SerializableRouterAlertOption provides serialization of the Router Alert // IPv4 option according to RFC 2113. type IPv4SerializableRouterAlertOption struct{} // Type implements IPv4SerializableOption. func (*IPv4SerializableRouterAlertOption) optionType() IPv4OptionType { return IPv4OptionRouterAlertType } // Length implements IPv4SerializableOption. func (*IPv4SerializableRouterAlertOption) length() uint8 { return IPv4OptionRouterAlertLength - IPv4OptionRouterAlertValueOffset } // SerializeInto implements IPv4SerializableOption. func (o *IPv4SerializableRouterAlertOption) serializeInto(buffer []byte) uint8 { binary.BigEndian.PutUint16(buffer, IPv4OptionRouterAlertValue) return o.length() } var _ IPv4SerializableOption = (*IPv4SerializableNOPOption)(nil) // IPv4SerializableNOPOption provides serialization for the IPv4 no-op option. type IPv4SerializableNOPOption struct{} // Type implements IPv4SerializableOption. func (*IPv4SerializableNOPOption) optionType() IPv4OptionType { return IPv4OptionNOPType } var _ IPv4SerializableOption = (*IPv4SerializableListEndOption)(nil) // IPv4SerializableListEndOption provides serialization for the IPv4 List End // option. type IPv4SerializableListEndOption struct{} // Type implements IPv4SerializableOption. func (*IPv4SerializableListEndOption) optionType() IPv4OptionType { return IPv4OptionListEndType } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/ipv6.go000066400000000000000000000525431465435605700232170ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "crypto/sha256" "encoding/binary" "fmt" "gvisor.dev/gvisor/pkg/tcpip" ) const ( versTCFL = 0 // IPv6PayloadLenOffset is the offset of the PayloadLength field in // IPv6 header. IPv6PayloadLenOffset = 4 // IPv6NextHeaderOffset is the offset of the NextHeader field in // IPv6 header. IPv6NextHeaderOffset = 6 hopLimit = 7 v6SrcAddr = 8 v6DstAddr = v6SrcAddr + IPv6AddressSize // IPv6FixedHeaderSize is the size of the fixed header. 
IPv6FixedHeaderSize = v6DstAddr + IPv6AddressSize ) // IPv6Fields contains the fields of an IPv6 packet. It is used to describe the // fields of a packet that needs to be encoded. type IPv6Fields struct { // TrafficClass is the "traffic class" field of an IPv6 packet. TrafficClass uint8 // FlowLabel is the "flow label" field of an IPv6 packet. FlowLabel uint32 // PayloadLength is the "payload length" field of an IPv6 packet, including // the length of all extension headers. PayloadLength uint16 // TransportProtocol is the transport layer protocol number. Serialized in the // last "next header" field of the IPv6 header + extension headers. TransportProtocol tcpip.TransportProtocolNumber // HopLimit is the "Hop Limit" field of an IPv6 packet. HopLimit uint8 // SrcAddr is the "source ip address" of an IPv6 packet. SrcAddr tcpip.Address // DstAddr is the "destination ip address" of an IPv6 packet. DstAddr tcpip.Address // ExtensionHeaders are the extension headers following the IPv6 header. ExtensionHeaders IPv6ExtHdrSerializer } // IPv6 represents an ipv6 header stored in a byte array. // Most of the methods of IPv6 access to the underlying slice without // checking the boundaries and could panic because of 'index out of range'. // Always call IsValid() to validate an instance of IPv6 before using other methods. type IPv6 []byte const ( // IPv6MinimumSize is the minimum size of a valid IPv6 packet. IPv6MinimumSize = IPv6FixedHeaderSize // IPv6AddressSize is the size, in bytes, of an IPv6 address. IPv6AddressSize = 16 // IPv6AddressSizeBits is the size, in bits, of an IPv6 address. IPv6AddressSizeBits = 128 // IPv6MaximumPayloadSize is the maximum size of a valid IPv6 payload per // RFC 8200 Section 4.5. IPv6MaximumPayloadSize = 65535 // IPv6ProtocolNumber is IPv6's network protocol number. IPv6ProtocolNumber tcpip.NetworkProtocolNumber = 0x86dd // IPv6Version is the version of the ipv6 protocol. IPv6Version = 6 // IIDSize is the size of an interface identifier (IID), in bytes, as // defined by RFC 4291 section 2.5.1. IIDSize = 8 // IPv6MinimumMTU is the minimum MTU required by IPv6, per RFC 8200, // section 5: // IPv6 requires that every link in the Internet have an MTU of 1280 octets // or greater. This is known as the IPv6 minimum link MTU. IPv6MinimumMTU = 1280 // IIDOffsetInIPv6Address is the offset, in bytes, from the start // of an IPv6 address to the beginning of the interface identifier // (IID) for auto-generated addresses. That is, all bytes before // the IIDOffsetInIPv6Address-th byte are the prefix bytes, and all // bytes including and after the IIDOffsetInIPv6Address-th byte are // for the IID. IIDOffsetInIPv6Address = 8 // OpaqueIIDSecretKeyMinBytes is the recommended minimum number of bytes // for the secret key used to generate an opaque interface identifier as // outlined by RFC 7217. OpaqueIIDSecretKeyMinBytes = 16 // ipv6MulticastAddressScopeByteIdx is the byte where the scope (scop) field // is located within a multicast IPv6 address, as per RFC 4291 section 2.7. ipv6MulticastAddressScopeByteIdx = 1 // ipv6MulticastAddressScopeMask is the mask for the scope (scop) field, // within the byte holding the field, as per RFC 4291 section 2.7. ipv6MulticastAddressScopeMask = 0xF ) var ( // IPv6AllNodesMulticastAddress is a link-local multicast group that // all IPv6 nodes MUST join, as per RFC 4291, section 2.8. Packets // destined to this address will reach all nodes on a link. // // The address is ff02::1. 
IPv6AllNodesMulticastAddress = tcpip.AddrFrom16([16]byte{0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) // IPv6AllRoutersInterfaceLocalMulticastAddress is an interface-local // multicast group that all IPv6 routers MUST join, as per RFC 4291, section // 2.8. Packets destined to this address will reach the router on an // interface. // // The address is ff01::2. IPv6AllRoutersInterfaceLocalMulticastAddress = tcpip.AddrFrom16([16]byte{0xff, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02}) // IPv6AllRoutersLinkLocalMulticastAddress is a link-local multicast group // that all IPv6 routers MUST join, as per RFC 4291, section 2.8. Packets // destined to this address will reach all routers on a link. // // The address is ff02::2. IPv6AllRoutersLinkLocalMulticastAddress = tcpip.AddrFrom16([16]byte{0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02}) // IPv6AllRoutersSiteLocalMulticastAddress is a site-local multicast group // that all IPv6 routers MUST join, as per RFC 4291, section 2.8. Packets // destined to this address will reach all routers in a site. // // The address is ff05::2. IPv6AllRoutersSiteLocalMulticastAddress = tcpip.AddrFrom16([16]byte{0xff, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02}) // IPv6Loopback is the IPv6 Loopback address. IPv6Loopback = tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) // IPv6Any is the non-routable IPv6 "any" meta address. It is also // known as the unspecified address. IPv6Any = tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}) ) // IPv6EmptySubnet is the empty IPv6 subnet. It may also be known as the // catch-all or wildcard subnet. That is, all IPv6 addresses are considered to // be contained within this subnet. var IPv6EmptySubnet = tcpip.AddressWithPrefix{ Address: IPv6Any, PrefixLen: 0, }.Subnet() // IPv4MappedIPv6Subnet is the prefix for an IPv4 mapped IPv6 address as defined // by RFC 4291 section 2.5.5. var IPv4MappedIPv6Subnet = tcpip.AddressWithPrefix{ Address: tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}), PrefixLen: 96, }.Subnet() // IPv6LinkLocalPrefix is the prefix for IPv6 link-local addresses, as defined // by RFC 4291 section 2.5.6. // // The prefix is fe80::/64 var IPv6LinkLocalPrefix = tcpip.AddressWithPrefix{ Address: tcpip.AddrFrom16([16]byte{0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), PrefixLen: 64, } // PayloadLength returns the value of the "payload length" field of the ipv6 // header. func (b IPv6) PayloadLength() uint16 { return binary.BigEndian.Uint16(b[IPv6PayloadLenOffset:]) } // HopLimit returns the value of the "Hop Limit" field of the ipv6 header. func (b IPv6) HopLimit() uint8 { return b[hopLimit] } // NextHeader returns the value of the "next header" field of the ipv6 header. func (b IPv6) NextHeader() uint8 { return b[IPv6NextHeaderOffset] } // TransportProtocol implements Network.TransportProtocol. func (b IPv6) TransportProtocol() tcpip.TransportProtocolNumber { return tcpip.TransportProtocolNumber(b.NextHeader()) } // Payload implements Network.Payload. 
func (b IPv6) Payload() []byte { return b[IPv6MinimumSize:][:b.PayloadLength()] } // SourceAddress returns the "source address" field of the ipv6 header. func (b IPv6) SourceAddress() tcpip.Address { return tcpip.AddrFrom16([16]byte(b[v6SrcAddr:][:IPv6AddressSize])) } // DestinationAddress returns the "destination address" field of the ipv6 // header. func (b IPv6) DestinationAddress() tcpip.Address { return tcpip.AddrFrom16([16]byte(b[v6DstAddr:][:IPv6AddressSize])) } // SourceAddressSlice returns the "source address" field of the ipv6 header as a // byte slice. func (b IPv6) SourceAddressSlice() []byte { return []byte(b[v6SrcAddr:][:IPv6AddressSize]) } // DestinationAddressSlice returns the "destination address" field of the ipv6 // header as a byte slice. func (b IPv6) DestinationAddressSlice() []byte { return []byte(b[v6DstAddr:][:IPv6AddressSize]) } // Checksum implements Network.Checksum. Given that IPv6 doesn't have a // checksum, it just returns 0. func (IPv6) Checksum() uint16 { return 0 } // TOS returns the "traffic class" and "flow label" fields of the ipv6 header. func (b IPv6) TOS() (uint8, uint32) { v := binary.BigEndian.Uint32(b[versTCFL:]) return uint8(v >> 20), v & 0xfffff } // SetTOS sets the "traffic class" and "flow label" fields of the ipv6 header. func (b IPv6) SetTOS(t uint8, l uint32) { vtf := (6 << 28) | (uint32(t) << 20) | (l & 0xfffff) binary.BigEndian.PutUint32(b[versTCFL:], vtf) } // SetPayloadLength sets the "payload length" field of the ipv6 header. func (b IPv6) SetPayloadLength(payloadLength uint16) { binary.BigEndian.PutUint16(b[IPv6PayloadLenOffset:], payloadLength) } // SetSourceAddress sets the "source address" field of the ipv6 header. func (b IPv6) SetSourceAddress(addr tcpip.Address) { copy(b[v6SrcAddr:][:IPv6AddressSize], addr.AsSlice()) } // SetDestinationAddress sets the "destination address" field of the ipv6 // header. func (b IPv6) SetDestinationAddress(addr tcpip.Address) { copy(b[v6DstAddr:][:IPv6AddressSize], addr.AsSlice()) } // SetHopLimit sets the value of the "Hop Limit" field. func (b IPv6) SetHopLimit(v uint8) { b[hopLimit] = v } // SetNextHeader sets the value of the "next header" field of the ipv6 header. func (b IPv6) SetNextHeader(v uint8) { b[IPv6NextHeaderOffset] = v } // SetChecksum implements Network.SetChecksum. Given that IPv6 doesn't have a // checksum, it is empty. func (IPv6) SetChecksum(uint16) { } // Encode encodes all the fields of the ipv6 header. func (b IPv6) Encode(i *IPv6Fields) { extHdr := b[IPv6MinimumSize:] b.SetTOS(i.TrafficClass, i.FlowLabel) b.SetPayloadLength(i.PayloadLength) b[hopLimit] = i.HopLimit b.SetSourceAddress(i.SrcAddr) b.SetDestinationAddress(i.DstAddr) nextHeader, _ := i.ExtensionHeaders.Serialize(i.TransportProtocol, extHdr) b[IPv6NextHeaderOffset] = nextHeader } // IsValid performs basic validation on the packet. func (b IPv6) IsValid(pktSize int) bool { if len(b) < IPv6MinimumSize { return false } dlen := int(b.PayloadLength()) if dlen > pktSize-IPv6MinimumSize { return false } if IPVersion(b) != IPv6Version { return false } return true } // IsV4MappedAddress determines if the provided address is an IPv4 mapped // address by checking if its prefix is 0:0:0:0:0:ffff::/96. func IsV4MappedAddress(addr tcpip.Address) bool { if addr.BitLen() != IPv6AddressSizeBits { return false } return IPv4MappedIPv6Subnet.Contains(addr) } // IsV6MulticastAddress determines if the provided address is an IPv6 // multicast address (anything starting with FF). 
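// The following example is an editorial sketch and not part of the upstream
// sources: it encodes a minimal, extension-header-free IPv6 header with
// Encode and validates it with IsValid. The transport protocol number 17
// (UDP) and the loopback/all-nodes addresses are used purely for
// illustration.
func exampleEncodeIPv6Header(payloadLen uint16) IPv6 {
	b := make([]byte, IPv6MinimumSize)
	ip := IPv6(b)
	ip.Encode(&IPv6Fields{
		TrafficClass:      0,
		FlowLabel:         0,
		PayloadLength:     payloadLen,
		TransportProtocol: 17, // UDP, assumed; any transport number works here.
		HopLimit:          64,
		SrcAddr:           IPv6Loopback,
		DstAddr:           IPv6AllNodesMulticastAddress,
	})
	if !ip.IsValid(IPv6MinimumSize + int(payloadLen)) {
		panic("illustrative header failed validation")
	}
	return ip
}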
func IsV6MulticastAddress(addr tcpip.Address) bool { if addr.BitLen() != IPv6AddressSizeBits { return false } return addr.As16()[0] == 0xff } // IsV6UnicastAddress determines if the provided address is a valid IPv6 // unicast (and specified) address. That is, IsV6UnicastAddress returns // true if addr contains IPv6AddressSize bytes, is not the unspecified // address and is not a multicast address. func IsV6UnicastAddress(addr tcpip.Address) bool { if addr.BitLen() != IPv6AddressSizeBits { return false } // Must not be unspecified if addr == IPv6Any { return false } // Return if not a multicast. return addr.As16()[0] != 0xff } var solicitedNodeMulticastPrefix = [13]byte{0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0xff} // SolicitedNodeAddr computes the solicited-node multicast address. This is // used for NDP. Described in RFC 4291. The argument must be a full-length IPv6 // address. func SolicitedNodeAddr(addr tcpip.Address) tcpip.Address { addrBytes := addr.As16() return tcpip.AddrFrom16([16]byte(append(solicitedNodeMulticastPrefix[:], addrBytes[len(addrBytes)-3:]...))) } // IsSolicitedNodeAddr determines whether the address is a solicited-node // multicast address. func IsSolicitedNodeAddr(addr tcpip.Address) bool { addrBytes := addr.As16() return solicitedNodeMulticastPrefix == [13]byte(addrBytes[:len(addrBytes)-3]) } // EthernetAdddressToModifiedEUI64IntoBuf populates buf with a modified EUI-64 // from a 48-bit Ethernet/MAC address, as per RFC 4291 section 2.5.1. // // buf MUST be at least 8 bytes. func EthernetAdddressToModifiedEUI64IntoBuf(linkAddr tcpip.LinkAddress, buf []byte) { buf[0] = linkAddr[0] ^ 2 buf[1] = linkAddr[1] buf[2] = linkAddr[2] buf[3] = 0xFF buf[4] = 0xFE buf[5] = linkAddr[3] buf[6] = linkAddr[4] buf[7] = linkAddr[5] } // EthernetAddressToModifiedEUI64 computes a modified EUI-64 from a 48-bit // Ethernet/MAC address, as per RFC 4291 section 2.5.1. func EthernetAddressToModifiedEUI64(linkAddr tcpip.LinkAddress) [IIDSize]byte { var buf [IIDSize]byte EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, buf[:]) return buf } // LinkLocalAddr computes the default IPv6 link-local address from a link-layer // (MAC) address. func LinkLocalAddr(linkAddr tcpip.LinkAddress) tcpip.Address { // Convert a 48-bit MAC to a modified EUI-64 and then prepend the // link-local header, FE80::. // // The conversion is very nearly: // aa:bb:cc:dd:ee:ff => FE80::Aabb:ccFF:FEdd:eeff // Note the capital A. The conversion aa->Aa involves a bit flip. lladdrb := [IPv6AddressSize]byte{ 0: 0xFE, 1: 0x80, } EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, lladdrb[IIDOffsetInIPv6Address:]) return tcpip.AddrFrom16(lladdrb) } // IsV6LinkLocalUnicastAddress returns true iff the provided address is an IPv6 // link-local unicast address, as defined by RFC 4291 section 2.5.6. func IsV6LinkLocalUnicastAddress(addr tcpip.Address) bool { if addr.BitLen() != IPv6AddressSizeBits { return false } addrBytes := addr.As16() return addrBytes[0] == 0xfe && (addrBytes[1]&0xc0) == 0x80 } // IsV6LoopbackAddress returns true iff the provided address is an IPv6 loopback // address, as defined by RFC 4291 section 2.5.3. func IsV6LoopbackAddress(addr tcpip.Address) bool { return addr == IPv6Loopback } // IsV6LinkLocalMulticastAddress returns true iff the provided address is an // IPv6 link-local multicast address, as defined by RFC 4291 section 2.7. 
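// The following example is an editorial sketch and not part of the upstream
// sources: it derives the modified EUI-64 link-local address for a MAC
// address and the solicited-node multicast group that NDP uses for it. The
// MAC value is arbitrary.
func exampleLinkLocalAndSolicitedNode() (tcpip.Address, tcpip.Address) {
	mac := tcpip.LinkAddress("\x02\x03\x04\x05\x06\x07")
	lladdr := LinkLocalAddr(mac)     // fe80::3:4ff:fe05:607 (u/l bit of 0x02 flipped).
	snm := SolicitedNodeAddr(lladdr) // ff02::1:ff05:607, per RFC 4291 section 2.7.1.
	return lladdr, snm
}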
func IsV6LinkLocalMulticastAddress(addr tcpip.Address) bool { return IsV6MulticastAddress(addr) && V6MulticastScope(addr) == IPv6LinkLocalMulticastScope } // AppendOpaqueInterfaceIdentifier appends a 64 bit opaque interface identifier // (IID) to buf as outlined by RFC 7217 and returns the extended buffer. // // The opaque IID is generated from the cryptographic hash of the concatenation // of the prefix, NIC's name, DAD counter (DAD retry counter) and the secret // key. The secret key SHOULD be at least OpaqueIIDSecretKeyMinBytes bytes and // MUST be generated to a pseudo-random number. See RFC 4086 for randomness // requirements for security. // // If buf has enough capacity for the IID (IIDSize bytes), a new underlying // array for the buffer will not be allocated. func AppendOpaqueInterfaceIdentifier(buf []byte, prefix tcpip.Subnet, nicName string, dadCounter uint8, secretKey []byte) []byte { // As per RFC 7217 section 5, the opaque identifier can be generated as a // cryptographic hash of the concatenation of each of the function parameters. // Note, we omit the optional Network_ID field. h := sha256.New() // h.Write never returns an error. prefixID := prefix.ID() h.Write([]byte(prefixID.AsSlice()[:IIDOffsetInIPv6Address])) h.Write([]byte(nicName)) h.Write([]byte{dadCounter}) h.Write(secretKey) var sumBuf [sha256.Size]byte sum := h.Sum(sumBuf[:0]) return append(buf, sum[:IIDSize]...) } // LinkLocalAddrWithOpaqueIID computes the default IPv6 link-local address with // an opaque IID. func LinkLocalAddrWithOpaqueIID(nicName string, dadCounter uint8, secretKey []byte) tcpip.Address { lladdrb := [IPv6AddressSize]byte{ 0: 0xFE, 1: 0x80, } return tcpip.AddrFrom16([16]byte(AppendOpaqueInterfaceIdentifier(lladdrb[:IIDOffsetInIPv6Address], IPv6LinkLocalPrefix.Subnet(), nicName, dadCounter, secretKey))) } // IPv6AddressScope is the scope of an IPv6 address. type IPv6AddressScope int const ( // LinkLocalScope indicates a link-local address. LinkLocalScope IPv6AddressScope = iota // GlobalScope indicates a global address. GlobalScope ) // ScopeForIPv6Address returns the scope for an IPv6 address. func ScopeForIPv6Address(addr tcpip.Address) (IPv6AddressScope, tcpip.Error) { if addr.BitLen() != IPv6AddressSizeBits { return GlobalScope, &tcpip.ErrBadAddress{} } switch { case IsV6LinkLocalMulticastAddress(addr): return LinkLocalScope, nil case IsV6LinkLocalUnicastAddress(addr): return LinkLocalScope, nil default: return GlobalScope, nil } } // InitialTempIID generates the initial temporary IID history value to generate // temporary SLAAC addresses with. // // Panics if initialTempIIDHistory is not at least IIDSize bytes. func InitialTempIID(initialTempIIDHistory []byte, seed []byte, nicID tcpip.NICID) { h := sha256.New() // h.Write never returns an error. h.Write(seed) var nicIDBuf [4]byte binary.BigEndian.PutUint32(nicIDBuf[:], uint32(nicID)) h.Write(nicIDBuf[:]) var sumBuf [sha256.Size]byte sum := h.Sum(sumBuf[:0]) if n := copy(initialTempIIDHistory, sum[sha256.Size-IIDSize:]); n != IIDSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, IIDSize)) } } // GenerateTempIPv6SLAACAddr generates a temporary SLAAC IPv6 address for an // associated stable/permanent SLAAC address. // // GenerateTempIPv6SLAACAddr will update the temporary IID history value to be // used when generating a new temporary IID. // // Panics if tempIIDHistory is not at least IIDSize bytes. 
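// The following example is an editorial sketch and not part of the upstream
// sources: it builds an RFC 7217 opaque-IID link-local address for a NIC.
// The NIC name is arbitrary and the all-zero key is a stand-in; a real key
// must be generated randomly (RFC 4086) and kept stable across reboots.
func exampleOpaqueLinkLocalAddr() tcpip.Address {
	secretKey := make([]byte, OpaqueIIDSecretKeyMinBytes)
	return LinkLocalAddrWithOpaqueIID("eth0" /* nicName */, 0 /* dadCounter */, secretKey)
}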
func GenerateTempIPv6SLAACAddr(tempIIDHistory []byte, stableAddr tcpip.Address) tcpip.AddressWithPrefix { addrBytes := stableAddr.As16() h := sha256.New() h.Write(tempIIDHistory) h.Write(addrBytes[IIDOffsetInIPv6Address:]) var sumBuf [sha256.Size]byte sum := h.Sum(sumBuf[:0]) // The rightmost 64 bits of sum are saved for the next iteration. if n := copy(tempIIDHistory, sum[sha256.Size-IIDSize:]); n != IIDSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, IIDSize)) } // The leftmost 64 bits of sum is used as the IID. if n := copy(addrBytes[IIDOffsetInIPv6Address:], sum); n != IIDSize { panic(fmt.Sprintf("copied %d IID bytes, expected %d bytes", n, IIDSize)) } return tcpip.AddressWithPrefix{ Address: tcpip.AddrFrom16(addrBytes), PrefixLen: IIDOffsetInIPv6Address * 8, } } // IPv6MulticastScope is the scope of a multicast IPv6 address, as defined by // RFC 7346 section 2. type IPv6MulticastScope uint8 // The various values for IPv6 multicast scopes, as per RFC 7346 section 2: // // +------+--------------------------+-------------------------+ // | scop | NAME | REFERENCE | // +------+--------------------------+-------------------------+ // | 0 | Reserved | [RFC4291], RFC 7346 | // | 1 | Interface-Local scope | [RFC4291], RFC 7346 | // | 2 | Link-Local scope | [RFC4291], RFC 7346 | // | 3 | Realm-Local scope | [RFC4291], RFC 7346 | // | 4 | Admin-Local scope | [RFC4291], RFC 7346 | // | 5 | Site-Local scope | [RFC4291], RFC 7346 | // | 6 | Unassigned | | // | 7 | Unassigned | | // | 8 | Organization-Local scope | [RFC4291], RFC 7346 | // | 9 | Unassigned | | // | A | Unassigned | | // | B | Unassigned | | // | C | Unassigned | | // | D | Unassigned | | // | E | Global scope | [RFC4291], RFC 7346 | // | F | Reserved | [RFC4291], RFC 7346 | // +------+--------------------------+-------------------------+ const ( IPv6Reserved0MulticastScope = IPv6MulticastScope(0x0) IPv6InterfaceLocalMulticastScope = IPv6MulticastScope(0x1) IPv6LinkLocalMulticastScope = IPv6MulticastScope(0x2) IPv6RealmLocalMulticastScope = IPv6MulticastScope(0x3) IPv6AdminLocalMulticastScope = IPv6MulticastScope(0x4) IPv6SiteLocalMulticastScope = IPv6MulticastScope(0x5) IPv6OrganizationLocalMulticastScope = IPv6MulticastScope(0x8) IPv6GlobalMulticastScope = IPv6MulticastScope(0xE) IPv6ReservedFMulticastScope = IPv6MulticastScope(0xF) ) // V6MulticastScope returns the scope of a multicast address. func V6MulticastScope(addr tcpip.Address) IPv6MulticastScope { addrBytes := addr.As16() return IPv6MulticastScope(addrBytes[ipv6MulticastAddressScopeByteIdx] & ipv6MulticastAddressScopeMask) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/ipv6_extension_headers.go000066400000000000000000001105131465435605700267760ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package header import ( "encoding/binary" "errors" "fmt" "io" "math" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/tcpip" ) // IPv6ExtensionHeaderIdentifier is an IPv6 extension header identifier. type IPv6ExtensionHeaderIdentifier uint8 const ( // IPv6HopByHopOptionsExtHdrIdentifier is the header identifier of a Hop by // Hop Options extension header, as per RFC 8200 section 4.3. IPv6HopByHopOptionsExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 0 // IPv6RoutingExtHdrIdentifier is the header identifier of a Routing extension // header, as per RFC 8200 section 4.4. IPv6RoutingExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 43 // IPv6FragmentExtHdrIdentifier is the header identifier of a Fragment // extension header, as per RFC 8200 section 4.5. IPv6FragmentExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 44 // IPv6DestinationOptionsExtHdrIdentifier is the header identifier of a // Destination Options extension header, as per RFC 8200 section 4.6. IPv6DestinationOptionsExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 60 // IPv6NoNextHeaderIdentifier is the header identifier used to signify the end // of an IPv6 payload, as per RFC 8200 section 4.7. IPv6NoNextHeaderIdentifier IPv6ExtensionHeaderIdentifier = 59 // IPv6UnknownExtHdrIdentifier is reserved by IANA. // https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml#extension-header // "254 Use for experimentation and testing [RFC3692][RFC4727]" IPv6UnknownExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 254 ) const ( // ipv6UnknownExtHdrOptionActionMask is the mask of the action to take when // a node encounters an unrecognized option. ipv6UnknownExtHdrOptionActionMask = 192 // ipv6UnknownExtHdrOptionActionShift is the least significant bits to discard // from the action value for an unrecognized option identifier. ipv6UnknownExtHdrOptionActionShift = 6 // ipv6RoutingExtHdrSegmentsLeftIdx is the index to the Segments Left field // within an IPv6RoutingExtHdr. ipv6RoutingExtHdrSegmentsLeftIdx = 1 // IPv6FragmentExtHdrLength is the length of an IPv6 extension header, in // bytes. IPv6FragmentExtHdrLength = 8 // ipv6FragmentExtHdrFragmentOffsetOffset is the offset to the start of the // Fragment Offset field within an IPv6FragmentExtHdr. ipv6FragmentExtHdrFragmentOffsetOffset = 0 // ipv6FragmentExtHdrFragmentOffsetShift is the bit offset of the Fragment // Offset field within an IPv6FragmentExtHdr. ipv6FragmentExtHdrFragmentOffsetShift = 3 // ipv6FragmentExtHdrFlagsIdx is the index to the flags field within an // IPv6FragmentExtHdr. ipv6FragmentExtHdrFlagsIdx = 1 // ipv6FragmentExtHdrMFlagMask is the mask of the More (M) flag within the // flags field of an IPv6FragmentExtHdr. ipv6FragmentExtHdrMFlagMask = 1 // ipv6FragmentExtHdrIdentificationOffset is the offset to the Identification // field within an IPv6FragmentExtHdr. ipv6FragmentExtHdrIdentificationOffset = 2 // ipv6ExtHdrLenBytesPerUnit is the unit size of an extension header's length // field. That is, given a Length field of 2, the extension header expects // 16 bytes following the first 8 bytes (see ipv6ExtHdrLenBytesExcluded for // details about the first 8 bytes' exclusion from the Length field). ipv6ExtHdrLenBytesPerUnit = 8 // ipv6ExtHdrLenBytesExcluded is the number of bytes excluded from an // extension header's Length field following the Length field. // // The Length field excludes the first 8 bytes, but the Next Header and Length // field take up the first 2 of the 8 bytes so we expect (at minimum) 6 bytes // after the Length field. 
// // This ensures that every extension header is at least 8 bytes. ipv6ExtHdrLenBytesExcluded = 6 // IPv6FragmentExtHdrFragmentOffsetBytesPerUnit is the unit size of a Fragment // extension header's Fragment Offset field. That is, given a Fragment Offset // of 2, the extension header is indicating that the fragment's payload // starts at the 16th byte in the reassembled packet. IPv6FragmentExtHdrFragmentOffsetBytesPerUnit = 8 ) // padIPv6OptionsLength returns the total length for IPv6 options of length l // considering the 8-octet alignment as stated in RFC 8200 Section 4.2. func padIPv6OptionsLength(length int) int { return (length + ipv6ExtHdrLenBytesPerUnit - 1) & ^(ipv6ExtHdrLenBytesPerUnit - 1) } // padIPv6Option fills b with the appropriate padding options depending on its // length. func padIPv6Option(b []byte) { switch len(b) { case 0: // No padding needed. case 1: // Pad with Pad1. b[ipv6ExtHdrOptionTypeOffset] = uint8(ipv6Pad1ExtHdrOptionIdentifier) default: // Pad with PadN. s := b[ipv6ExtHdrOptionPayloadOffset:] clear(s) b[ipv6ExtHdrOptionTypeOffset] = uint8(ipv6PadNExtHdrOptionIdentifier) b[ipv6ExtHdrOptionLengthOffset] = uint8(len(s)) } } // ipv6OptionsAlignmentPadding returns the number of padding bytes needed to // serialize an option at headerOffset with alignment requirements // [align]n + alignOffset. func ipv6OptionsAlignmentPadding(headerOffset int, align int, alignOffset int) int { padLen := headerOffset - alignOffset return ((padLen + align - 1) & ^(align - 1)) - padLen } // IPv6PayloadHeader is implemented by the various headers that can be found // in an IPv6 payload. // // These headers include IPv6 extension headers or upper layer data. type IPv6PayloadHeader interface { isIPv6PayloadHeader() // Release frees all resources held by the header. Release() } // IPv6RawPayloadHeader the remainder of an IPv6 payload after an iterator // encounters a Next Header field it does not recognize as an IPv6 extension // header. The caller is responsible for releasing the underlying buffer after // it's no longer needed. type IPv6RawPayloadHeader struct { Identifier IPv6ExtensionHeaderIdentifier Buf buffer.Buffer } // isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader. func (IPv6RawPayloadHeader) isIPv6PayloadHeader() {} // Release implements IPv6PayloadHeader.Release. func (i IPv6RawPayloadHeader) Release() { i.Buf.Release() } // ipv6OptionsExtHdr is an IPv6 extension header that holds options. type ipv6OptionsExtHdr struct { buf *buffer.View } // Release implements IPv6PayloadHeader.Release. func (i ipv6OptionsExtHdr) Release() { if i.buf != nil { i.buf.Release() } } // Iter returns an iterator over the IPv6 extension header options held in b. func (i ipv6OptionsExtHdr) Iter() IPv6OptionsExtHdrOptionsIterator { it := IPv6OptionsExtHdrOptionsIterator{} it.reader = i.buf return it } // IPv6OptionsExtHdrOptionsIterator is an iterator over IPv6 extension header // options. // // Note, between when an IPv6OptionsExtHdrOptionsIterator is obtained and last // used, no changes to the underlying buffer may happen. Doing so may cause // undefined and unexpected behaviour. It is fine to obtain an // IPv6OptionsExtHdrOptionsIterator, iterate over the first few options then // modify the backing payload so long as the IPv6OptionsExtHdrOptionsIterator // obtained before modification is no longer used. 
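// The following example is an editorial sketch and not part of the upstream
// sources: it spells out the arithmetic performed by the padding helpers
// above for the Router Alert option (alignment requirement 2n+0, 2-byte
// payload) placed directly after the 2-byte Hop-by-Hop header prefix.
func examplePaddingArithmetic() {
	// No padding is needed before the option: offset 2 already satisfies 2n+0.
	_ = ipv6OptionsAlignmentPadding(2 /* headerOffset */, 2 /* align */, 0 /* alignOffset */) // == 0
	// Header prefix (2) + option type/length (2) + payload (2) = 6 bytes,
	// which padIPv6OptionsLength rounds up to the next 8-octet boundary.
	_ = padIPv6OptionsLength(6) // == 8
}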
type IPv6OptionsExtHdrOptionsIterator struct { reader *buffer.View // optionOffset is the number of bytes from the first byte of the // options field to the beginning of the current option. optionOffset uint32 // nextOptionOffset is the offset of the next option. nextOptionOffset uint32 } // OptionOffset returns the number of bytes parsed while processing the // option field of the current Extension Header. func (i *IPv6OptionsExtHdrOptionsIterator) OptionOffset() uint32 { return i.optionOffset } // IPv6OptionUnknownAction is the action that must be taken if the processing // IPv6 node does not recognize the option, as outlined in RFC 8200 section 4.2. type IPv6OptionUnknownAction int const ( // IPv6OptionUnknownActionSkip indicates that the unrecognized option must // be skipped and the node should continue processing the header. IPv6OptionUnknownActionSkip IPv6OptionUnknownAction = 0 // IPv6OptionUnknownActionDiscard indicates that the packet must be silently // discarded. IPv6OptionUnknownActionDiscard IPv6OptionUnknownAction = 1 // IPv6OptionUnknownActionDiscardSendICMP indicates that the packet must be // discarded and the node must send an ICMP Parameter Problem, Code 2, message // to the packet's source, regardless of whether or not the packet's // Destination was a multicast address. IPv6OptionUnknownActionDiscardSendICMP IPv6OptionUnknownAction = 2 // IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest indicates that the // packet must be discarded and the node must send an ICMP Parameter Problem, // Code 2, message to the packet's source only if the packet's Destination was // not a multicast address. IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest IPv6OptionUnknownAction = 3 ) // IPv6ExtHdrOption is implemented by the various IPv6 extension header options. type IPv6ExtHdrOption interface { // UnknownAction returns the action to take in response to an unrecognized // option. UnknownAction() IPv6OptionUnknownAction // isIPv6ExtHdrOption is used to "lock" this interface so it is not // implemented by other packages. isIPv6ExtHdrOption() } // IPv6ExtHdrOptionIdentifier is an IPv6 extension header option identifier. type IPv6ExtHdrOptionIdentifier uint8 const ( // ipv6Pad1ExtHdrOptionIdentifier is the identifier for a padding option that // provides 1 byte padding, as outlined in RFC 8200 section 4.2. ipv6Pad1ExtHdrOptionIdentifier IPv6ExtHdrOptionIdentifier = 0 // ipv6PadNExtHdrOptionIdentifier is the identifier for a padding option that // provides variable length byte padding, as outlined in RFC 8200 section 4.2. ipv6PadNExtHdrOptionIdentifier IPv6ExtHdrOptionIdentifier = 1 // ipv6RouterAlertHopByHopOptionIdentifier is the identifier for the Router // Alert Hop by Hop option as defined in RFC 2711 section 2.1. ipv6RouterAlertHopByHopOptionIdentifier IPv6ExtHdrOptionIdentifier = 5 // ipv6ExtHdrOptionTypeOffset is the option type offset in an extension header // option as defined in RFC 8200 section 4.2. ipv6ExtHdrOptionTypeOffset = 0 // ipv6ExtHdrOptionLengthOffset is the option length offset in an extension // header option as defined in RFC 8200 section 4.2. ipv6ExtHdrOptionLengthOffset = 1 // ipv6ExtHdrOptionPayloadOffset is the option payload offset in an extension // header option as defined in RFC 8200 section 4.2. ipv6ExtHdrOptionPayloadOffset = 2 ) // ipv6UnknownActionFromIdentifier maps an extension header option's // identifier's high bits to the action to take when the identifier is unknown. 
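// The following example is an editorial sketch and not part of the upstream
// sources: it shows the dispatch a node is expected to perform on the action
// bits of an unrecognized option, as surfaced by
// IPv6ExtHdrOption.UnknownAction (RFC 8200 section 4.2).
func exampleHandleUnknownOption(opt IPv6ExtHdrOption, isMulticastDst bool) (drop bool, sendICMP bool) {
	switch opt.UnknownAction() {
	case IPv6OptionUnknownActionSkip:
		return false, false
	case IPv6OptionUnknownActionDiscard:
		return true, false
	case IPv6OptionUnknownActionDiscardSendICMP:
		return true, true
	case IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest:
		return true, !isMulticastDst
	default:
		return true, false
	}
}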
func ipv6UnknownActionFromIdentifier(id IPv6ExtHdrOptionIdentifier) IPv6OptionUnknownAction { return IPv6OptionUnknownAction((id & ipv6UnknownExtHdrOptionActionMask) >> ipv6UnknownExtHdrOptionActionShift) } // ErrMalformedIPv6ExtHdrOption indicates that an IPv6 extension header option // is malformed. var ErrMalformedIPv6ExtHdrOption = errors.New("malformed IPv6 extension header option") // IPv6UnknownExtHdrOption holds the identifier and data for an IPv6 extension // header option that is unknown by the parsing utilities. type IPv6UnknownExtHdrOption struct { Identifier IPv6ExtHdrOptionIdentifier Data *buffer.View } // UnknownAction implements IPv6OptionUnknownAction.UnknownAction. func (o *IPv6UnknownExtHdrOption) UnknownAction() IPv6OptionUnknownAction { return ipv6UnknownActionFromIdentifier(o.Identifier) } // isIPv6ExtHdrOption implements IPv6ExtHdrOption.isIPv6ExtHdrOption. func (*IPv6UnknownExtHdrOption) isIPv6ExtHdrOption() {} // Next returns the next option in the options data. // // If the next item is not a known extension header option, // IPv6UnknownExtHdrOption will be returned with the option identifier and data. // // The return is of the format (option, done, error). done will be true when // Next is unable to return anything because the iterator has reached the end of // the options data, or an error occurred. func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error) { for { i.optionOffset = i.nextOptionOffset temp, err := i.reader.ReadByte() if err != nil { // If we can't read the first byte of a new option, then we know the // options buffer has been exhausted and we are done iterating. return nil, true, nil } id := IPv6ExtHdrOptionIdentifier(temp) // If the option identifier indicates the option is a Pad1 option, then we // know the option does not have Length and Data fields. End processing of // the Pad1 option and continue processing the buffer as a new option. if id == ipv6Pad1ExtHdrOptionIdentifier { i.nextOptionOffset = i.optionOffset + 1 continue } length, err := i.reader.ReadByte() if err != nil { if err != io.EOF { // ReadByte should only ever return nil or io.EOF. panic(fmt.Sprintf("unexpected error when reading the option's Length field for option with id = %d: %s", id, err)) } // We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected once // we start parsing an option; we expect the reader to contain enough // bytes for the whole option. return nil, true, fmt.Errorf("error when reading the option's Length field for option with id = %d: %w", id, io.ErrUnexpectedEOF) } // Do we have enough bytes in the reader for the next option? if n := i.reader.Size(); n < int(length) { // Consume the remaining buffer. i.reader.TrimFront(i.reader.Size()) // We return the same error as if we failed to read a non-padding option // so consumers of this iterator don't need to differentiate between // padding and non-padding options. return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, io.ErrUnexpectedEOF) } i.nextOptionOffset = i.optionOffset + uint32(length) + 1 /* option ID */ + 1 /* length byte */ switch id { case ipv6PadNExtHdrOptionIdentifier: // Special-case the variable length padding option to avoid a copy. 
i.reader.TrimFront(int(length)) continue case ipv6RouterAlertHopByHopOptionIdentifier: var routerAlertValue [ipv6RouterAlertPayloadLength]byte if n, err := io.ReadFull(i.reader, routerAlertValue[:]); err != nil { switch err { case io.EOF, io.ErrUnexpectedEOF: return nil, true, fmt.Errorf("got invalid length (%d) for router alert option (want = %d): %w", length, ipv6RouterAlertPayloadLength, ErrMalformedIPv6ExtHdrOption) default: return nil, true, fmt.Errorf("read %d out of %d option data bytes for router alert option: %w", n, ipv6RouterAlertPayloadLength, err) } } else if n != int(length) { return nil, true, fmt.Errorf("got invalid length (%d) for router alert option (want = %d): %w", length, ipv6RouterAlertPayloadLength, ErrMalformedIPv6ExtHdrOption) } return &IPv6RouterAlertOption{Value: IPv6RouterAlertValue(binary.BigEndian.Uint16(routerAlertValue[:]))}, false, nil default: bytes := buffer.NewView(int(length)) if n, err := io.CopyN(bytes, i.reader, int64(length)); err != nil { if err == io.EOF { err = io.ErrUnexpectedEOF } return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, err) } return &IPv6UnknownExtHdrOption{Identifier: id, Data: bytes}, false, nil } } } // IPv6HopByHopOptionsExtHdr is a buffer holding the Hop By Hop Options // extension header. type IPv6HopByHopOptionsExtHdr struct { ipv6OptionsExtHdr } // isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader. func (IPv6HopByHopOptionsExtHdr) isIPv6PayloadHeader() {} // IPv6DestinationOptionsExtHdr is a buffer holding the Destination Options // extension header. type IPv6DestinationOptionsExtHdr struct { ipv6OptionsExtHdr } // isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader. func (IPv6DestinationOptionsExtHdr) isIPv6PayloadHeader() {} // IPv6RoutingExtHdr is a buffer holding the Routing extension header specific // data as outlined in RFC 8200 section 4.4. type IPv6RoutingExtHdr struct { Buf *buffer.View } // isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader. func (IPv6RoutingExtHdr) isIPv6PayloadHeader() {} // Release implements IPv6PayloadHeader.Release. func (b IPv6RoutingExtHdr) Release() { b.Buf.Release() } // SegmentsLeft returns the Segments Left field. func (b IPv6RoutingExtHdr) SegmentsLeft() uint8 { return b.Buf.AsSlice()[ipv6RoutingExtHdrSegmentsLeftIdx] } // IPv6FragmentExtHdr is a buffer holding the Fragment extension header specific // data as outlined in RFC 8200 section 4.5. // // Note, the buffer does not include the Next Header and Reserved fields. type IPv6FragmentExtHdr [6]byte // isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader. func (IPv6FragmentExtHdr) isIPv6PayloadHeader() {} // Release implements IPv6PayloadHeader.Release. func (IPv6FragmentExtHdr) Release() {} // FragmentOffset returns the Fragment Offset field. // // This value indicates where the buffer following the Fragment extension header // starts in the target (reassembled) packet. func (b IPv6FragmentExtHdr) FragmentOffset() uint16 { return binary.BigEndian.Uint16(b[ipv6FragmentExtHdrFragmentOffsetOffset:]) >> ipv6FragmentExtHdrFragmentOffsetShift } // More returns the More (M) flag. // // This indicates whether any fragments are expected to succeed b. func (b IPv6FragmentExtHdr) More() bool { return b[ipv6FragmentExtHdrFlagsIdx]&ipv6FragmentExtHdrMFlagMask != 0 } // ID returns the Identification field. // // This value is used to uniquely identify the packet, between a // source and destination. 
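// The following example is an editorial sketch and not part of the upstream
// sources: it converts the Fragment extension header fields into the byte
// offset of this fragment within the reassembled packet and decides whether
// reassembly state is needed at all.
func exampleFragmentPlacement(frag IPv6FragmentExtHdr) (payloadByteOffset int, needsReassembly bool) {
	payloadByteOffset = int(frag.FragmentOffset()) * IPv6FragmentExtHdrFragmentOffsetBytesPerUnit
	needsReassembly = !frag.IsAtomic() // atomic fragments carry the whole packet.
	return payloadByteOffset, needsReassembly
}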
func (b IPv6FragmentExtHdr) ID() uint32 { return binary.BigEndian.Uint32(b[ipv6FragmentExtHdrIdentificationOffset:]) } // IsAtomic returns whether the fragment header indicates an atomic fragment. An // atomic fragment is a fragment that contains all the data required to // reassemble a full packet. func (b IPv6FragmentExtHdr) IsAtomic() bool { return !b.More() && b.FragmentOffset() == 0 } // IPv6PayloadIterator is an iterator over the contents of an IPv6 payload. // // The IPv6 payload may contain IPv6 extension headers before any upper layer // data. // // Note, between when an IPv6PayloadIterator is obtained and last used, no // changes to the payload may happen. Doing so may cause undefined and // unexpected behaviour. It is fine to obtain an IPv6PayloadIterator, iterate // over the first few headers then modify the backing payload so long as the // IPv6PayloadIterator obtained before modification is no longer used. type IPv6PayloadIterator struct { // The identifier of the next header to parse. nextHdrIdentifier IPv6ExtensionHeaderIdentifier payload buffer.Buffer // Indicates to the iterator that it should return the remaining payload as a // raw payload on the next call to Next. forceRaw bool // headerOffset is the offset of the beginning of the current extension // header starting from the beginning of the fixed header. headerOffset uint32 // parseOffset is the byte offset into the current extension header of the // field we are currently examining. It can be added to the header offset // if the absolute offset within the packet is required. parseOffset uint32 // nextOffset is the offset of the next header. nextOffset uint32 } // HeaderOffset returns the offset to the start of the extension // header most recently processed. func (i IPv6PayloadIterator) HeaderOffset() uint32 { return i.headerOffset } // ParseOffset returns the number of bytes successfully parsed. func (i IPv6PayloadIterator) ParseOffset() uint32 { return i.headerOffset + i.parseOffset } // MakeIPv6PayloadIterator returns an iterator over the IPv6 payload containing // extension headers, or a raw payload if the payload cannot be parsed. The // iterator takes ownership of the payload. func MakeIPv6PayloadIterator(nextHdrIdentifier IPv6ExtensionHeaderIdentifier, payload buffer.Buffer) IPv6PayloadIterator { return IPv6PayloadIterator{ nextHdrIdentifier: nextHdrIdentifier, payload: payload, nextOffset: IPv6FixedHeaderSize, } } // Release frees the resources owned by the iterator. func (i *IPv6PayloadIterator) Release() { i.payload.Release() } // AsRawHeader returns the remaining payload of i as a raw header and // optionally consumes the iterator. // // If consume is true, calls to Next after calling AsRawHeader on i will // indicate that the iterator is done. The returned header takes ownership of // its payload. func (i *IPv6PayloadIterator) AsRawHeader(consume bool) IPv6RawPayloadHeader { identifier := i.nextHdrIdentifier var buf buffer.Buffer if consume { // Since we consume the iterator, we return the payload as is. buf = i.payload // Mark i as done, but keep track of where we were for error reporting. *i = IPv6PayloadIterator{ nextHdrIdentifier: IPv6NoNextHeaderIdentifier, headerOffset: i.headerOffset, nextOffset: i.nextOffset, } } else { buf = i.payload.Clone() } return IPv6RawPayloadHeader{Identifier: identifier, Buf: buf} } // Next returns the next item in the payload. 
// // If the next item is not a known IPv6 extension header, IPv6RawPayloadHeader // will be returned with the remaining bytes and next header identifier. // // The return is of the format (header, done, error). done will be true when // Next is unable to return anything because the iterator has reached the end of // the payload, or an error occurred. func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) { i.headerOffset = i.nextOffset i.parseOffset = 0 // We could be forced to return i as a raw header when the previous header was // a fragment extension header as the data following the fragment extension // header may not be complete. if i.forceRaw { return i.AsRawHeader(true /* consume */), false, nil } // Is the header we are parsing a known extension header? switch i.nextHdrIdentifier { case IPv6HopByHopOptionsExtHdrIdentifier: nextHdrIdentifier, view, err := i.nextHeaderData(false /* fragmentHdr */, nil) if err != nil { return nil, true, err } i.nextHdrIdentifier = nextHdrIdentifier return IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr{view}}, false, nil case IPv6RoutingExtHdrIdentifier: nextHdrIdentifier, view, err := i.nextHeaderData(false /* fragmentHdr */, nil) if err != nil { return nil, true, err } i.nextHdrIdentifier = nextHdrIdentifier return IPv6RoutingExtHdr{view}, false, nil case IPv6FragmentExtHdrIdentifier: var data [6]byte // We ignore the returned bytes because we know the fragment extension // header specific data will fit in data. nextHdrIdentifier, _, err := i.nextHeaderData(true /* fragmentHdr */, data[:]) if err != nil { return nil, true, err } fragmentExtHdr := IPv6FragmentExtHdr(data) // If the packet is not the first fragment, do not attempt to parse anything // after the fragment extension header as the payload following the fragment // extension header should not contain any headers; the first fragment must // hold all the headers up to and including any upper layer headers, as per // RFC 8200 section 4.5. if fragmentExtHdr.FragmentOffset() != 0 { i.forceRaw = true } i.nextHdrIdentifier = nextHdrIdentifier return fragmentExtHdr, false, nil case IPv6DestinationOptionsExtHdrIdentifier: nextHdrIdentifier, view, err := i.nextHeaderData(false /* fragmentHdr */, nil) if err != nil { return nil, true, err } i.nextHdrIdentifier = nextHdrIdentifier return IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr{view}}, false, nil case IPv6NoNextHeaderIdentifier: // This indicates the end of the IPv6 payload. return nil, true, nil default: // The header we are parsing is not a known extension header. Return the // raw payload. return i.AsRawHeader(true /* consume */), false, nil } } // NextHeaderIdentifier returns the identifier of the header next returned by // it.Next(). func (i *IPv6PayloadIterator) NextHeaderIdentifier() IPv6ExtensionHeaderIdentifier { return i.nextHdrIdentifier } // nextHeaderData returns the extension header's Next Header field and raw data. // // fragmentHdr indicates that the extension header being parsed is the Fragment // extension header so the Length field should be ignored as it is Reserved // for the Fragment extension header. // // If bytes is not nil, extension header specific data will be read into bytes // if it has enough capacity. If bytes is provided but does not have enough // capacity for the data, nextHeaderData will panic. 
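// The following example is an editorial sketch and not part of the upstream
// sources: it walks an IPv6 payload with the iterator defined above,
// counting Router Alert options found in Hop-by-Hop headers and stopping at
// the upper-layer data. The first Next Header value and the payload buffer
// are assumed to come from an already-parsed fixed header; the iterator
// takes ownership of the buffer.
func exampleWalkExtensionHeaders(firstNextHdr IPv6ExtensionHeaderIdentifier, payload buffer.Buffer) (routerAlerts int, err error) {
	it := MakeIPv6PayloadIterator(firstNextHdr, payload)
	defer it.Release()
	for {
		hdr, done, err := it.Next()
		if err != nil {
			return routerAlerts, err
		}
		if done {
			return routerAlerts, nil
		}
		if hbh, ok := hdr.(IPv6HopByHopOptionsExtHdr); ok {
			optIt := hbh.Iter()
			for {
				opt, optDone, err := optIt.Next()
				if err != nil {
					hdr.Release()
					return routerAlerts, err
				}
				if optDone {
					break
				}
				if _, isRouterAlert := opt.(*IPv6RouterAlertOption); isRouterAlert {
					routerAlerts++
				}
			}
		}
		hdr.Release()
	}
}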
func (i *IPv6PayloadIterator) nextHeaderData(fragmentHdr bool, bytes []byte) (IPv6ExtensionHeaderIdentifier, *buffer.View, error) { // We ignore the number of bytes read because we know we will only ever read // at max 1 bytes since rune has a length of 1. If we read 0 bytes, the Read // would return io.EOF to indicate that io.Reader has reached the end of the // payload. rdr := i.payload.AsBufferReader() nextHdrIdentifier, err := rdr.ReadByte() if err != nil { return 0, nil, fmt.Errorf("error when reading the Next Header field for extension header with id = %d: %w", i.nextHdrIdentifier, err) } i.parseOffset++ var length uint8 length, err = rdr.ReadByte() if err != nil { if fragmentHdr { return 0, nil, fmt.Errorf("error when reading the Length field for extension header with id = %d: %w", i.nextHdrIdentifier, err) } return 0, nil, fmt.Errorf("error when reading the Reserved field for extension header with id = %d: %w", i.nextHdrIdentifier, err) } if fragmentHdr { length = 0 } // Make parseOffset point to the first byte of the Extension Header // specific data. i.parseOffset++ // length is in 8 byte chunks but doesn't include the first one. // See RFC 8200 for each header type, sections 4.3-4.6 and the requirement // in section 4.8 for new extension headers at the top of page 24. // [ Hdr Ext Len ] ... Length of the Destination Options header in 8-octet // units, not including the first 8 octets. i.nextOffset += uint32((length + 1) * ipv6ExtHdrLenBytesPerUnit) bytesLen := int(length)*ipv6ExtHdrLenBytesPerUnit + ipv6ExtHdrLenBytesExcluded if fragmentHdr { if n := len(bytes); n < bytesLen { panic(fmt.Sprintf("bytes only has space for %d bytes but need space for %d bytes (length = %d) for extension header with id = %d", n, bytesLen, length, i.nextHdrIdentifier)) } if n, err := io.ReadFull(&rdr, bytes); err != nil { return 0, nil, fmt.Errorf("read %d out of %d extension header data bytes (length = %d) for header with id = %d: %w", n, bytesLen, length, i.nextHdrIdentifier, err) } return IPv6ExtensionHeaderIdentifier(nextHdrIdentifier), nil, nil } v := buffer.NewView(bytesLen) if n, err := io.CopyN(v, &rdr, int64(bytesLen)); err != nil { if err == io.EOF { err = io.ErrUnexpectedEOF } v.Release() return 0, nil, fmt.Errorf("read %d out of %d extension header data bytes (length = %d) for header with id = %d: %w", n, bytesLen, length, i.nextHdrIdentifier, err) } return IPv6ExtensionHeaderIdentifier(nextHdrIdentifier), v, nil } // IPv6SerializableExtHdr provides serialization for IPv6 extension // headers. type IPv6SerializableExtHdr interface { // identifier returns the assigned IPv6 header identifier for this extension // header. identifier() IPv6ExtensionHeaderIdentifier // length returns the total serialized length in bytes of this extension // header, including the common next header and length fields. length() int // serializeInto serializes the receiver into the provided byte // buffer and with the provided nextHeader value. // // Note, the caller MUST provide a byte buffer with size of at least // length. Implementers of this function may assume that the byte buffer // is of sufficient size. serializeInto MAY panic if the provided byte // buffer is not of sufficient size. // // serializeInto returns the number of bytes that was used to serialize the // receiver. Implementers must only use the number of bytes required to // serialize the receiver. Callers MAY provide a larger buffer than required // to serialize into. 
serializeInto(nextHeader uint8, b []byte) int } var _ IPv6SerializableExtHdr = (*IPv6SerializableHopByHopExtHdr)(nil) // IPv6SerializableHopByHopExtHdr implements serialization of the Hop by Hop // options extension header. type IPv6SerializableHopByHopExtHdr []IPv6SerializableHopByHopOption const ( // ipv6HopByHopExtHdrNextHeaderOffset is the offset of the next header field // in a hop by hop extension header as defined in RFC 8200 section 4.3. ipv6HopByHopExtHdrNextHeaderOffset = 0 // ipv6HopByHopExtHdrLengthOffset is the offset of the length field in a hop // by hop extension header as defined in RFC 8200 section 4.3. ipv6HopByHopExtHdrLengthOffset = 1 // ipv6HopByHopExtHdrPayloadOffset is the offset of the options in a hop by // hop extension header as defined in RFC 8200 section 4.3. ipv6HopByHopExtHdrOptionsOffset = 2 // ipv6HopByHopExtHdrUnaccountedLenWords is the implicit number of 8-octet // words in a hop by hop extension header's length field, as stated in RFC // 8200 section 4.3: // Length of the Hop-by-Hop Options header in 8-octet units, // not including the first 8 octets. ipv6HopByHopExtHdrUnaccountedLenWords = 1 ) // identifier implements IPv6SerializableExtHdr. func (IPv6SerializableHopByHopExtHdr) identifier() IPv6ExtensionHeaderIdentifier { return IPv6HopByHopOptionsExtHdrIdentifier } // length implements IPv6SerializableExtHdr. func (h IPv6SerializableHopByHopExtHdr) length() int { var total int for _, opt := range h { align, alignOffset := opt.alignment() total += ipv6OptionsAlignmentPadding(total, align, alignOffset) total += ipv6ExtHdrOptionPayloadOffset + int(opt.length()) } // Account for next header and total length fields and add padding. return padIPv6OptionsLength(ipv6HopByHopExtHdrOptionsOffset + total) } // serializeInto implements IPv6SerializableExtHdr. func (h IPv6SerializableHopByHopExtHdr) serializeInto(nextHeader uint8, b []byte) int { optBuffer := b[ipv6HopByHopExtHdrOptionsOffset:] totalLength := ipv6HopByHopExtHdrOptionsOffset for _, opt := range h { // Calculate alignment requirements and pad buffer if necessary. align, alignOffset := opt.alignment() padLen := ipv6OptionsAlignmentPadding(totalLength, align, alignOffset) if padLen != 0 { padIPv6Option(optBuffer[:padLen]) totalLength += padLen optBuffer = optBuffer[padLen:] } l := opt.serializeInto(optBuffer[ipv6ExtHdrOptionPayloadOffset:]) optBuffer[ipv6ExtHdrOptionTypeOffset] = uint8(opt.identifier()) optBuffer[ipv6ExtHdrOptionLengthOffset] = l l += ipv6ExtHdrOptionPayloadOffset totalLength += int(l) optBuffer = optBuffer[l:] } padded := padIPv6OptionsLength(totalLength) if padded != totalLength { padIPv6Option(optBuffer[:padded-totalLength]) totalLength = padded } wordsLen := totalLength/ipv6ExtHdrLenBytesPerUnit - ipv6HopByHopExtHdrUnaccountedLenWords if wordsLen > math.MaxUint8 { panic(fmt.Sprintf("IPv6 hop by hop options too large: %d+1 64-bit words", wordsLen)) } b[ipv6HopByHopExtHdrNextHeaderOffset] = nextHeader b[ipv6HopByHopExtHdrLengthOffset] = uint8(wordsLen) return totalLength } // IPv6SerializableHopByHopOption provides serialization for hop by hop options. type IPv6SerializableHopByHopOption interface { // identifier returns the option identifier of this Hop by Hop option. identifier() IPv6ExtHdrOptionIdentifier // length returns the *payload* size of the option (not considering the type // and length fields). length() uint8 // alignment returns the alignment requirements from this option. 
// // Alignment requirements take the form [align]n + offset as specified in // RFC 8200 section 4.2. The alignment requirement is on the offset between // the option type byte and the start of the hop by hop header. // // align must be a power of 2. alignment() (align int, offset int) // serializeInto serializes the receiver into the provided byte // buffer. // // Note, the caller MUST provide a byte buffer with size of at least // length. Implementers of this function may assume that the byte buffer // is of sufficient size. serializeInto MAY panic if the provided byte // buffer is not of sufficient size. // // serializeInto will return the number of bytes that was used to // serialize the receiver. Implementers must only use the number of // bytes required to serialize the receiver. Callers MAY provide a // larger buffer than required to serialize into. serializeInto([]byte) uint8 } var _ IPv6SerializableHopByHopOption = (*IPv6RouterAlertOption)(nil) // IPv6RouterAlertOption is the IPv6 Router alert Hop by Hop option defined in // RFC 2711 section 2.1. type IPv6RouterAlertOption struct { Value IPv6RouterAlertValue } // IPv6RouterAlertValue is the payload of an IPv6 Router Alert option. type IPv6RouterAlertValue uint16 const ( // IPv6RouterAlertMLD indicates a datagram containing a Multicast Listener // Discovery message as defined in RFC 2711 section 2.1. IPv6RouterAlertMLD IPv6RouterAlertValue = 0 // IPv6RouterAlertRSVP indicates a datagram containing an RSVP message as // defined in RFC 2711 section 2.1. IPv6RouterAlertRSVP IPv6RouterAlertValue = 1 // IPv6RouterAlertActiveNetworks indicates a datagram containing an Active // Networks message as defined in RFC 2711 section 2.1. IPv6RouterAlertActiveNetworks IPv6RouterAlertValue = 2 // ipv6RouterAlertPayloadLength is the length of the Router Alert payload // as defined in RFC 2711. ipv6RouterAlertPayloadLength = 2 // ipv6RouterAlertAlignmentRequirement is the alignment requirement for the // Router Alert option defined as 2n+0 in RFC 2711. ipv6RouterAlertAlignmentRequirement = 2 // ipv6RouterAlertAlignmentOffsetRequirement is the alignment offset // requirement for the Router Alert option defined as 2n+0 in RFC 2711 section // 2.1. ipv6RouterAlertAlignmentOffsetRequirement = 0 ) // UnknownAction implements IPv6ExtHdrOption. func (*IPv6RouterAlertOption) UnknownAction() IPv6OptionUnknownAction { return ipv6UnknownActionFromIdentifier(ipv6RouterAlertHopByHopOptionIdentifier) } // isIPv6ExtHdrOption implements IPv6ExtHdrOption. func (*IPv6RouterAlertOption) isIPv6ExtHdrOption() {} // identifier implements IPv6SerializableHopByHopOption. func (*IPv6RouterAlertOption) identifier() IPv6ExtHdrOptionIdentifier { return ipv6RouterAlertHopByHopOptionIdentifier } // length implements IPv6SerializableHopByHopOption. func (*IPv6RouterAlertOption) length() uint8 { return ipv6RouterAlertPayloadLength } // alignment implements IPv6SerializableHopByHopOption. func (*IPv6RouterAlertOption) alignment() (int, int) { // From RFC 2711 section 2.1: // Alignment requirement: 2n+0. return ipv6RouterAlertAlignmentRequirement, ipv6RouterAlertAlignmentOffsetRequirement } // serializeInto implements IPv6SerializableHopByHopOption. func (o *IPv6RouterAlertOption) serializeInto(b []byte) uint8 { binary.BigEndian.PutUint16(b, uint16(o.Value)) return ipv6RouterAlertPayloadLength } // IPv6ExtHdrSerializer provides serialization of IPv6 extension headers. 
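// The following example is an editorial sketch and not part of the upstream
// sources: it serializes a Hop-by-Hop header carrying the Router Alert
// option in front of an ICMPv6 payload, the layout used by MLD. The ICMPv6
// protocol number 58 is written out literally here for illustration.
func exampleSerializeHopByHopRouterAlert() (firstNextHdr uint8, extHdrs []byte) {
	s := IPv6ExtHdrSerializer{
		IPv6SerializableHopByHopExtHdr{
			&IPv6RouterAlertOption{Value: IPv6RouterAlertMLD},
		},
	}
	extHdrs = make([]byte, s.Length())
	firstNextHdr, _ = s.Serialize(58 /* ICMPv6 */, extHdrs)
	// extHdrs is {58, 0, 5, 2, 0, 0, 1, 0}: Next Header = ICMPv6, length 0
	// (one 8-octet unit), Router Alert (type 5, len 2, value 0 = MLD), PadN.
	// firstNextHdr (0, Hop-by-Hop) belongs in the fixed header's Next Header
	// field.
	return firstNextHdr, extHdrs
}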
type IPv6ExtHdrSerializer []IPv6SerializableExtHdr // Serialize serializes the provided list of IPv6 extension headers into b. // // Note, b must be of sufficient size to hold all the headers in s. See // IPv6ExtHdrSerializer.Length for details on the getting the total size of a // serialized IPv6ExtHdrSerializer. // // Serialize may panic if b is not of sufficient size to hold all the options // in s. // // Serialize takes the transportProtocol value to be used as the last extension // header's Next Header value and returns the header identifier of the first // serialized extension header and the total serialized length. func (s IPv6ExtHdrSerializer) Serialize(transportProtocol tcpip.TransportProtocolNumber, b []byte) (uint8, int) { nextHeader := uint8(transportProtocol) if len(s) == 0 { return nextHeader, 0 } var totalLength int for i, h := range s[:len(s)-1] { length := h.serializeInto(uint8(s[i+1].identifier()), b) b = b[length:] totalLength += length } totalLength += s[len(s)-1].serializeInto(nextHeader, b) return uint8(s[0].identifier()), totalLength } // Length returns the total number of bytes required to serialize the extension // headers. func (s IPv6ExtHdrSerializer) Length() int { var totalLength int for _, h := range s { totalLength += h.length() } return totalLength } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/ipv6_fragment.go000066400000000000000000000112771465435605700251010ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "encoding/binary" "gvisor.dev/gvisor/pkg/tcpip" ) const ( nextHdrFrag = 0 fragOff = 2 more = 3 idV6 = 4 ) var _ IPv6SerializableExtHdr = (*IPv6SerializableFragmentExtHdr)(nil) // IPv6SerializableFragmentExtHdr is used to serialize an IPv6 fragment // extension header as defined in RFC 8200 section 4.5. type IPv6SerializableFragmentExtHdr struct { // FragmentOffset is the "fragment offset" field of an IPv6 fragment. FragmentOffset uint16 // M is the "more" field of an IPv6 fragment. M bool // Identification is the "identification" field of an IPv6 fragment. Identification uint32 } // identifier implements IPv6SerializableFragmentExtHdr. func (h *IPv6SerializableFragmentExtHdr) identifier() IPv6ExtensionHeaderIdentifier { return IPv6FragmentHeader } // length implements IPv6SerializableFragmentExtHdr. func (h *IPv6SerializableFragmentExtHdr) length() int { return IPv6FragmentHeaderSize } // serializeInto implements IPv6SerializableFragmentExtHdr. func (h *IPv6SerializableFragmentExtHdr) serializeInto(nextHeader uint8, b []byte) int { // Prevent too many bounds checks. _ = b[IPv6FragmentHeaderSize:] binary.BigEndian.PutUint32(b[idV6:], h.Identification) binary.BigEndian.PutUint16(b[fragOff:], h.FragmentOffset<= IPv6FragmentHeaderSize } // NextHeader returns the value of the "next header" field of the ipv6 fragment. func (b IPv6Fragment) NextHeader() uint8 { return b[nextHdrFrag] } // FragmentOffset returns the "fragment offset" field of the ipv6 fragment. 
func (b IPv6Fragment) FragmentOffset() uint16 { return binary.BigEndian.Uint16(b[fragOff:]) >> 3 } // More returns the "more" field of the ipv6 fragment. func (b IPv6Fragment) More() bool { return b[more]&1 > 0 } // Payload implements Network.Payload. func (b IPv6Fragment) Payload() []byte { return b[IPv6FragmentHeaderSize:] } // ID returns the value of the identifier field of the ipv6 fragment. func (b IPv6Fragment) ID() uint32 { return binary.BigEndian.Uint32(b[idV6:]) } // TransportProtocol implements Network.TransportProtocol. func (b IPv6Fragment) TransportProtocol() tcpip.TransportProtocolNumber { return tcpip.TransportProtocolNumber(b.NextHeader()) } // The functions below have been added only to satisfy the Network interface. // Checksum is not supported by IPv6Fragment. func (b IPv6Fragment) Checksum() uint16 { panic("not supported") } // SourceAddress is not supported by IPv6Fragment. func (b IPv6Fragment) SourceAddress() tcpip.Address { panic("not supported") } // DestinationAddress is not supported by IPv6Fragment. func (b IPv6Fragment) DestinationAddress() tcpip.Address { panic("not supported") } // SetSourceAddress is not supported by IPv6Fragment. func (b IPv6Fragment) SetSourceAddress(tcpip.Address) { panic("not supported") } // SetDestinationAddress is not supported by IPv6Fragment. func (b IPv6Fragment) SetDestinationAddress(tcpip.Address) { panic("not supported") } // SetChecksum is not supported by IPv6Fragment. func (b IPv6Fragment) SetChecksum(uint16) { panic("not supported") } // TOS is not supported by IPv6Fragment. func (b IPv6Fragment) TOS() (uint8, uint32) { panic("not supported") } // SetTOS is not supported by IPv6Fragment. func (b IPv6Fragment) SetTOS(t uint8, l uint32) { panic("not supported") } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/mld.go000066400000000000000000000102031465435605700230720ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "encoding/binary" "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" ) const ( // MLDMinimumSize is the minimum size for an MLD message. MLDMinimumSize = 20 // MLDHopLimit is the Hop Limit for all IPv6 packets with an MLD message, as // per RFC 2710 section 3. MLDHopLimit = 1 // mldMaximumResponseDelayOffset is the offset to the Maximum Response Delay // field within MLD. mldMaximumResponseDelayOffset = 0 // mldMulticastAddressOffset is the offset to the Multicast Address field // within MLD. mldMulticastAddressOffset = 4 ) // MLD is a Multicast Listener Discovery message in an ICMPv6 packet. // // MLD will only contain the body of an ICMPv6 packet. 
// // As per RFC 2710 section 3, MLD messages have the following format (MLD only // holds the bytes after the first four bytes in the diagram below): // // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Type | Code | Checksum | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Maximum Response Delay | Reserved | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // + + // | | // + Multicast Address + // | | // + + // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ type MLD []byte // MaximumResponseDelay returns the Maximum Response Delay. func (m MLD) MaximumResponseDelay() time.Duration { // As per RFC 2710 section 3.4: // // The Maximum Response Delay field is meaningful only in Query // messages, and specifies the maximum allowed delay before sending a // responding Report, in units of milliseconds. In all other messages, // it is set to zero by the sender and ignored by receivers. return time.Duration(binary.BigEndian.Uint16(m[mldMaximumResponseDelayOffset:])) * time.Millisecond } // SetMaximumResponseDelay sets the Maximum Response Delay field. // // maxRespDelayMS is the value in milliseconds. func (m MLD) SetMaximumResponseDelay(maxRespDelayMS uint16) { binary.BigEndian.PutUint16(m[mldMaximumResponseDelayOffset:], maxRespDelayMS) } // MulticastAddress returns the Multicast Address. func (m MLD) MulticastAddress() tcpip.Address { // As per RFC 2710 section 3.5: // // In a Query message, the Multicast Address field is set to zero when // sending a General Query, and set to a specific IPv6 multicast address // when sending a Multicast-Address-Specific Query. // // In a Report or Done message, the Multicast Address field holds a // specific IPv6 multicast address to which the message sender is // listening or is ceasing to listen, respectively. return tcpip.AddrFrom16([16]byte(m[mldMulticastAddressOffset:][:IPv6AddressSize])) } // SetMulticastAddress sets the Multicast Address field. func (m MLD) SetMulticastAddress(multicastAddress tcpip.Address) { if n := copy(m[mldMulticastAddressOffset:], multicastAddress.AsSlice()); n != IPv6AddressSize { panic(fmt.Sprintf("copied %d bytes, expected to copy %d bytes", n, IPv6AddressSize)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/mldv2.go000066400000000000000000000636211465435605700233560ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "bytes" "encoding/binary" "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" ) const ( // MLDv2QueryMinimumSize is the minimum size for an MLDv2 message. MLDv2QueryMinimumSize = 24 mldv2QueryMaximumResponseCodeOffset = 0 mldv2QueryResvSQRVOffset = 20 mldv2QueryQRVMask = 0b111 mldv2QueryQQICOffset = 21 // mldv2QueryNumberOfSourcesOffset is the offset to the Number of Sources // field within MLDv2Query. 
mldv2QueryNumberOfSourcesOffset = 22 // MLDv2ReportMinimumSize is the minimum size of an MLDv2 report. MLDv2ReportMinimumSize = 24 // mldv2QuerySourcesOffset is the offset to the Sources field within // MLDv2Query. mldv2QuerySourcesOffset = 24 ) var ( // MLDv2RoutersAddress is the address to send MLDv2 reports to. // // As per RFC 3810 section 5.2.14, // // Version 2 Multicast Listener Reports are sent with an IP destination // address of FF02:0:0:0:0:0:0:16, to which all MLDv2-capable multicast // routers listen (see section 11 for IANA considerations related to // this special destination address). MLDv2RoutersAddress = tcpip.AddrFrom16([16]byte{0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16}) ) // MLDv2Query is a Multicast Listener Discovery Version 2 Query message in an // ICMPv6 packet. // // MLDv2Query will only contain the body of an ICMPv6 packet. // // As per RFC 3810 section 5.1, MLDv2 Query messages have the following format // (MLDv2Query only holds the bytes after the first four bytes in the diagram // below): // // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Type = 130 | Code | Checksum | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Maximum Response Code | Reserved | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // * * // | | // * Multicast Address * // | | // * * // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Resv |S| QRV | QQIC | Number of Sources (N) | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // * * // | | // * Source Address [1] * // | | // * * // | | // +- -+ // | | // * * // | | // * Source Address [2] * // | | // * * // | | // +- . -+ // . . . // . . . // +- -+ // | | // * * // | | // * Source Address [N] * // | | // * * // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ type MLDv2Query MLD // MaximumResponseCode returns the Maximum Response Code func (m MLDv2Query) MaximumResponseCode() uint16 { return binary.BigEndian.Uint16(m[mldv2QueryMaximumResponseCodeOffset:]) } // MLDv2MaximumResponseDelay returns the Maximum Response Delay in an MLDv2 // Maximum Response Code. // // As per RFC 3810 section 5.1.3, // // The Maximum Response Code field specifies the maximum time allowed // before sending a responding Report. The actual time allowed, called // the Maximum Response Delay, is represented in units of milliseconds, // and is derived from the Maximum Response Code as follows: // // If Maximum Response Code < 32768, // Maximum Response Delay = Maximum Response Code // // If Maximum Response Code >=32768, Maximum Response Code represents a // floating-point value as follows: // // 0 1 2 3 4 5 6 7 8 9 A B C D E F // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // |1| exp | mant | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // // Maximum Response Delay = (mant | 0x1000) << (exp+3) // // Small values of Maximum Response Delay allow MLDv2 routers to tune // the "leave latency" (the time between the moment the last node on a // link ceases to listen to a specific multicast address and the moment // the routing protocol is notified that there are no more listeners for // that address). Larger values, especially in the exponential range, // allow the tuning of the burstiness of MLD traffic on a link. 
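//
// As a worked example (illustrative only): a Maximum Response Code of 0x8000
// has exp = 0 and mant = 0, giving a Maximum Response Delay of
// (0 | 0x1000) << (0 + 3) = 32768 milliseconds, while any code below 32768
// (e.g. 1000) is used directly as a delay in milliseconds.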
func MLDv2MaximumResponseDelay(codeRaw uint16) time.Duration { code := time.Duration(codeRaw) if code < 32768 { return code * time.Millisecond } const mantBits = 12 const expMask = 0b111 exp := (code >> mantBits) & expMask mant := code & ((1 << mantBits) - 1) return (mant | 0x1000) << (exp + 3) * time.Millisecond } // MulticastAddress returns the Multicast Address. func (m MLDv2Query) MulticastAddress() tcpip.Address { // As per RFC 2710 section 3.5: // // In a Query message, the Multicast Address field is set to zero when // sending a General Query, and set to a specific IPv6 multicast address // when sending a Multicast-Address-Specific Query. // // In a Report or Done message, the Multicast Address field holds a // specific IPv6 multicast address to which the message sender is // listening or is ceasing to listen, respectively. return tcpip.AddrFrom16([16]byte(m[mldMulticastAddressOffset:][:IPv6AddressSize])) } // QuerierRobustnessVariable returns the querier's robustness variable. func (m MLDv2Query) QuerierRobustnessVariable() uint8 { return m[mldv2QueryResvSQRVOffset] & mldv2QueryQRVMask } // QuerierQueryInterval returns the querier's query interval. func (m MLDv2Query) QuerierQueryInterval() time.Duration { return mldv2AndIGMPv3QuerierQueryCodeToInterval(m[mldv2QueryQQICOffset]) } // Sources returns an iterator over source addresses in the query. // // Returns false if the message cannot hold the expected number of sources. func (m MLDv2Query) Sources() (AddressIterator, bool) { return makeAddressIterator( m[mldv2QuerySourcesOffset:], binary.BigEndian.Uint16(m[mldv2QueryNumberOfSourcesOffset:]), IPv6AddressSize, ) } // MLDv2ReportRecordType is the type of an MLDv2 multicast address record // found in an MLDv2 report, as per RFC 3810 section 5.2.12. type MLDv2ReportRecordType int // MLDv2 multicast address record types, as per RFC 3810 section 5.2.12. const ( MLDv2ReportRecordModeIsInclude MLDv2ReportRecordType = 1 MLDv2ReportRecordModeIsExclude MLDv2ReportRecordType = 2 MLDv2ReportRecordChangeToIncludeMode MLDv2ReportRecordType = 3 MLDv2ReportRecordChangeToExcludeMode MLDv2ReportRecordType = 4 MLDv2ReportRecordAllowNewSources MLDv2ReportRecordType = 5 MLDv2ReportRecordBlockOldSources MLDv2ReportRecordType = 6 ) const ( mldv2ReportMulticastAddressRecordMinimumSize = 20 mldv2ReportMulticastAddressRecordTypeOffset = 0 mldv2ReportMulticastAddressRecordAuxDataLenOffset = 1 mldv2ReportMulticastAddressRecordAuxDataLenUnits = 4 mldv2ReportMulticastAddressRecordNumberOfSourcesOffset = 2 mldv2ReportMulticastAddressRecordMulticastAddressOffset = 4 mldv2ReportMulticastAddressRecordSourcesOffset = 20 ) // MLDv2ReportMulticastAddressRecordSerializer is an MLDv2 Multicast Address // Record serializer. // // As per RFC 3810 section 5.2, a Multicast Address Record has the following // internal format: // // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Record Type | Aux Data Len | Number of Sources (N) | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // * * // | | // * Multicast Address * // | | // * * // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // * * // | | // * Source Address [1] * // | | // * * // | | // +- -+ // | | // * * // | | // * Source Address [2] * // | | // * * // | | // +- -+ // . . . // . . . // . . . // +- -+ // | | // * * // | | // * Source Address [N] * // | | // * * // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Auxiliary Data . // . . 
// | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ type MLDv2ReportMulticastAddressRecordSerializer struct { RecordType MLDv2ReportRecordType MulticastAddress tcpip.Address Sources []tcpip.Address } // Length returns the number of bytes this serializer would occupy. func (s *MLDv2ReportMulticastAddressRecordSerializer) Length() int { return mldv2ReportMulticastAddressRecordSourcesOffset + len(s.Sources)*IPv6AddressSize } func copyIPv6Address(dst []byte, src tcpip.Address) { if n := copy(dst, src.AsSlice()); n != IPv6AddressSize { panic(fmt.Sprintf("got copy(...) = %d, want = %d", n, IPv6AddressSize)) } } // SerializeInto serializes the record into the buffer. // // Panics if the buffer does not have enough space to fit the record. func (s *MLDv2ReportMulticastAddressRecordSerializer) SerializeInto(b []byte) { b[mldv2ReportMulticastAddressRecordTypeOffset] = byte(s.RecordType) b[mldv2ReportMulticastAddressRecordAuxDataLenOffset] = 0 binary.BigEndian.PutUint16(b[mldv2ReportMulticastAddressRecordNumberOfSourcesOffset:], uint16(len(s.Sources))) copyIPv6Address(b[mldv2ReportMulticastAddressRecordMulticastAddressOffset:], s.MulticastAddress) b = b[mldv2ReportMulticastAddressRecordSourcesOffset:] for _, source := range s.Sources { copyIPv6Address(b, source) b = b[IPv6AddressSize:] } } const ( mldv2ReportReservedOffset = 0 mldv2ReportNumberOfMulticastAddressRecordsOffset = 2 mldv2ReportMulticastAddressRecordsOffset = 4 ) // MLDv2ReportSerializer is an MLD Version 2 Report serializer. // // As per RFC 3810 section 5.2, // // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Type = 143 | Reserved | Checksum | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Reserved |Nr of Mcast Address Records (M)| // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Multicast Address Record [1] . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Multicast Address Record [2] . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | . | // . . . // | . | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Multicast Address Record [M] . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ type MLDv2ReportSerializer struct { Records []MLDv2ReportMulticastAddressRecordSerializer } // Length returns the number of bytes this serializer would occupy. func (s *MLDv2ReportSerializer) Length() int { ret := mldv2ReportMulticastAddressRecordsOffset for _, record := range s.Records { ret += record.Length() } return ret } // SerializeInto serializes the report into the buffer. // // Panics if the buffer does not have enough space to fit the report. func (s *MLDv2ReportSerializer) SerializeInto(b []byte) { binary.BigEndian.PutUint16(b[mldv2ReportReservedOffset:], 0) binary.BigEndian.PutUint16(b[mldv2ReportNumberOfMulticastAddressRecordsOffset:], uint16(len(s.Records))) b = b[mldv2ReportMulticastAddressRecordsOffset:] for _, record := range s.Records { len := record.Length() record.SerializeInto(b[:len]) b = b[len:] } } // MLDv2ReportMulticastAddressRecord is an MLDv2 record. 
// // As per RFC 3810 section 5.2, a Multicast Address Record has the following // internal format: // // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Record Type | Aux Data Len | Number of Sources (N) | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // * * // | | // * Multicast Address * // | | // * * // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // * * // | | // * Source Address [1] * // | | // * * // | | // +- -+ // | | // * * // | | // * Source Address [2] * // | | // * * // | | // +- -+ // . . . // . . . // . . . // +- -+ // | | // * * // | | // * Source Address [N] * // | | // * * // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Auxiliary Data . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ type MLDv2ReportMulticastAddressRecord []byte // RecordType returns the type of this record. func (r MLDv2ReportMulticastAddressRecord) RecordType() MLDv2ReportRecordType { return MLDv2ReportRecordType(r[mldv2ReportMulticastAddressRecordTypeOffset]) } // AuxDataLen returns the length of the auxiliary data in this record. func (r MLDv2ReportMulticastAddressRecord) AuxDataLen() int { return int(r[mldv2ReportMulticastAddressRecordAuxDataLenOffset]) * mldv2ReportMulticastAddressRecordAuxDataLenUnits } // numberOfSources returns the number of sources in this record. func (r MLDv2ReportMulticastAddressRecord) numberOfSources() uint16 { return binary.BigEndian.Uint16(r[mldv2ReportMulticastAddressRecordNumberOfSourcesOffset:]) } // MulticastAddress returns the multicast address this record targets. func (r MLDv2ReportMulticastAddressRecord) MulticastAddress() tcpip.Address { return tcpip.AddrFrom16([16]byte(r[mldv2ReportMulticastAddressRecordMulticastAddressOffset:][:IPv6AddressSize])) } // Sources returns an iterator over source addresses in the query. // // Returns false if the message cannot hold the expected number of sources. func (r MLDv2ReportMulticastAddressRecord) Sources() (AddressIterator, bool) { expectedLen := int(r.numberOfSources()) * IPv6AddressSize b := r[mldv2ReportMulticastAddressRecordSourcesOffset:] if len(b) < expectedLen { return AddressIterator{}, false } return AddressIterator{addressSize: IPv6AddressSize, buf: bytes.NewBuffer(b[:expectedLen])}, true } // MLDv2Report is an MLDv2 Report. // // As per RFC 3810 section 5.2, // // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Type = 143 | Reserved | Checksum | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Reserved |Nr of Mcast Address Records (M)| // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Multicast Address Record [1] . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Multicast Address Record [2] . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | . | // . . . // | . | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | | // . . // . Multicast Address Record [M] . // . . // | | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ type MLDv2Report []byte // MLDv2ReportMulticastAddressRecordIterator is an iterator over MLDv2 Multicast // Address Records. 
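//
// An illustrative sketch of walking a report's records, where report is an
// MLDv2Report (error handling abbreviated):
//
//	it := report.MulticastAddressRecords()
//	for {
//		record, res := it.Next()
//		switch res {
//		case MLDv2ReportMulticastAddressRecordIteratorNextOk:
//			_ = record.MulticastAddress() // inspect the record
//		case MLDv2ReportMulticastAddressRecordIteratorNextDone:
//			return
//		case MLDv2ReportMulticastAddressRecordIteratorNextErrBufferTooShort:
//			return // malformed report
//		}
//	}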
type MLDv2ReportMulticastAddressRecordIterator struct { recordsLeft uint16 buf *bytes.Buffer } // MLDv2ReportMulticastAddressRecordIteratorNextDisposition is the possible // return values from MLDv2ReportMulticastAddressRecordIterator.Next. type MLDv2ReportMulticastAddressRecordIteratorNextDisposition int const ( // MLDv2ReportMulticastAddressRecordIteratorNextOk indicates that a multicast // address record was yielded. MLDv2ReportMulticastAddressRecordIteratorNextOk MLDv2ReportMulticastAddressRecordIteratorNextDisposition = iota // MLDv2ReportMulticastAddressRecordIteratorNextDone indicates that the iterator // has been exhausted. MLDv2ReportMulticastAddressRecordIteratorNextDone // MLDv2ReportMulticastAddressRecordIteratorNextErrBufferTooShort indicates // that the iterator expected another record, but the buffer ended // prematurely. MLDv2ReportMulticastAddressRecordIteratorNextErrBufferTooShort ) // Next returns the next MLDv2 Multicast Address Record. func (it *MLDv2ReportMulticastAddressRecordIterator) Next() (MLDv2ReportMulticastAddressRecord, MLDv2ReportMulticastAddressRecordIteratorNextDisposition) { if it.recordsLeft == 0 { return MLDv2ReportMulticastAddressRecord{}, MLDv2ReportMulticastAddressRecordIteratorNextDone } if it.buf.Len() < mldv2ReportMulticastAddressRecordMinimumSize { return MLDv2ReportMulticastAddressRecord{}, MLDv2ReportMulticastAddressRecordIteratorNextErrBufferTooShort } hdr := MLDv2ReportMulticastAddressRecord(it.buf.Bytes()) expectedLen := mldv2ReportMulticastAddressRecordMinimumSize + int(hdr.AuxDataLen()) + int(hdr.numberOfSources())*IPv6AddressSize bytes := it.buf.Next(expectedLen) if len(bytes) < expectedLen { return MLDv2ReportMulticastAddressRecord{}, MLDv2ReportMulticastAddressRecordIteratorNextErrBufferTooShort } it.recordsLeft-- return MLDv2ReportMulticastAddressRecord(bytes), MLDv2ReportMulticastAddressRecordIteratorNextOk } // MulticastAddressRecords returns an iterator of MLDv2 Multicast Address // Records. func (m MLDv2Report) MulticastAddressRecords() MLDv2ReportMulticastAddressRecordIterator { return MLDv2ReportMulticastAddressRecordIterator{ recordsLeft: binary.BigEndian.Uint16(m[mldv2ReportNumberOfMulticastAddressRecordsOffset:]), buf: bytes.NewBuffer(m[mldv2ReportMulticastAddressRecordsOffset:]), } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/mldv2_igmpv3_common.go000066400000000000000000000100211465435605700261750ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "bytes" "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" ) func mldv2AndIGMPv3QuerierQueryCodeToInterval(code uint8) time.Duration { // MLDv2: As per RFC 3810 section 5.1.19, // // The Querier's Query Interval Code field specifies the [Query // Interval] used by the Querier. 
The actual interval, called the // Querier's Query Interval (QQI), is represented in units of seconds, // and is derived from the Querier's Query Interval Code as follows: // // If QQIC < 128, QQI = QQIC // // If QQIC >= 128, QQIC represents a floating-point value as follows: // // 0 1 2 3 4 5 6 7 // +-+-+-+-+-+-+-+-+ // |1| exp | mant | // +-+-+-+-+-+-+-+-+ // // QQI = (mant | 0x10) << (exp + 3) // // Multicast routers that are not the current Querier adopt the QQI // value from the most recently received Query as their own [Query // Interval] value, unless that most recently received QQI was zero, in // which case the receiving routers use the default [Query Interval] // value specified in section 9.2. // // IGMPv3: As per RFC 3376 section 4.1.7, // // The Querier's Query Interval Code field specifies the [Query // Interval] used by the querier. The actual interval, called the // Querier's Query Interval (QQI), is represented in units of seconds // and is derived from the Querier's Query Interval Code as follows: // // If QQIC < 128, QQI = QQIC // // If QQIC >= 128, QQIC represents a floating-point value as follows: // // 0 1 2 3 4 5 6 7 // +-+-+-+-+-+-+-+-+ // |1| exp | mant | // +-+-+-+-+-+-+-+-+ // // QQI = (mant | 0x10) << (exp + 3) // // Multicast routers that are not the current querier adopt the QQI // value from the most recently received Query as their own [Query // Interval] value, unless that most recently received QQI was zero, in // which case the receiving routers use the default [Query Interval] // value specified in section 8.2. interval := time.Duration(code) if interval < 128 { return interval * time.Second } const expMask = 0b111 const mantBits = 4 mant := interval & ((1 << mantBits) - 1) exp := (interval >> mantBits) & expMask return (mant | 0x10) << (exp + 3) * time.Second } // MakeAddressIterator returns an AddressIterator. func MakeAddressIterator(addressSize int, buf *bytes.Buffer) AddressIterator { return AddressIterator{addressSize: addressSize, buf: buf} } // AddressIterator is an iterator over IPv6 addresses. type AddressIterator struct { addressSize int buf *bytes.Buffer } // Done indicates that the iterator has been exhausted/has no more elements. func (it *AddressIterator) Done() bool { return it.buf.Len() == 0 } // Next returns the next address in the iterator. // // Returns false if the iterator has been exhausted. func (it *AddressIterator) Next() (tcpip.Address, bool) { if it.Done() { var emptyAddress tcpip.Address return emptyAddress, false } b := it.buf.Next(it.addressSize) if len(b) != it.addressSize { panic(fmt.Sprintf("got len(buf.Next(%d)) = %d, want = %d", it.addressSize, len(b), it.addressSize)) } return tcpip.AddrFromSlice(b), true } func makeAddressIterator(b []byte, expectedAddresses uint16, addressSize int) (AddressIterator, bool) { expectedLen := int(expectedAddresses) * addressSize if len(b) < expectedLen { return AddressIterator{}, false } return MakeAddressIterator(addressSize, bytes.NewBuffer(b[:expectedLen])), true } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/ndp_neighbor_advert.go000066400000000000000000000070331465435605700263300ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import "gvisor.dev/gvisor/pkg/tcpip" // NDPNeighborAdvert is an NDP Neighbor Advertisement message. It will // only contain the body of an ICMPv6 packet. // // See RFC 4861 section 4.4 for more details. type NDPNeighborAdvert []byte const ( // NDPNAMinimumSize is the minimum size of a valid NDP Neighbor // Advertisement message (body of an ICMPv6 packet). NDPNAMinimumSize = 20 // ndpNATargetAddressOffset is the start of the Target Address // field within an NDPNeighborAdvert. ndpNATargetAddressOffset = 4 // ndpNAOptionsOffset is the start of the NDP options in an // NDPNeighborAdvert. ndpNAOptionsOffset = ndpNATargetAddressOffset + IPv6AddressSize // ndpNAFlagsOffset is the offset of the flags within an // NDPNeighborAdvert. ndpNAFlagsOffset = 0 // ndpNARouterFlagMask is the mask of the Router Flag field in // the flags byte within an NDPNeighborAdvert. ndpNARouterFlagMask = (1 << 7) // ndpNASolicitedFlagMask is the mask of the Solicited Flag field in // the flags byte within an NDPNeighborAdvert. ndpNASolicitedFlagMask = (1 << 6) // ndpNAOverrideFlagMask is the mask of the Override Flag field in // the flags byte within an NDPNeighborAdvert. ndpNAOverrideFlagMask = (1 << 5) ) // TargetAddress returns the value within the Target Address field. func (b NDPNeighborAdvert) TargetAddress() tcpip.Address { return tcpip.AddrFrom16Slice(b[ndpNATargetAddressOffset:][:IPv6AddressSize]) } // SetTargetAddress sets the value within the Target Address field. func (b NDPNeighborAdvert) SetTargetAddress(addr tcpip.Address) { copy(b[ndpNATargetAddressOffset:][:IPv6AddressSize], addr.AsSlice()) } // RouterFlag returns the value of the Router Flag field. func (b NDPNeighborAdvert) RouterFlag() bool { return b[ndpNAFlagsOffset]&ndpNARouterFlagMask != 0 } // SetRouterFlag sets the value in the Router Flag field. func (b NDPNeighborAdvert) SetRouterFlag(f bool) { if f { b[ndpNAFlagsOffset] |= ndpNARouterFlagMask } else { b[ndpNAFlagsOffset] &^= ndpNARouterFlagMask } } // SolicitedFlag returns the value of the Solicited Flag field. func (b NDPNeighborAdvert) SolicitedFlag() bool { return b[ndpNAFlagsOffset]&ndpNASolicitedFlagMask != 0 } // SetSolicitedFlag sets the value in the Solicited Flag field. func (b NDPNeighborAdvert) SetSolicitedFlag(f bool) { if f { b[ndpNAFlagsOffset] |= ndpNASolicitedFlagMask } else { b[ndpNAFlagsOffset] &^= ndpNASolicitedFlagMask } } // OverrideFlag returns the value of the Override Flag field. func (b NDPNeighborAdvert) OverrideFlag() bool { return b[ndpNAFlagsOffset]&ndpNAOverrideFlagMask != 0 } // SetOverrideFlag sets the value in the Override Flag field. func (b NDPNeighborAdvert) SetOverrideFlag(f bool) { if f { b[ndpNAFlagsOffset] |= ndpNAOverrideFlagMask } else { b[ndpNAFlagsOffset] &^= ndpNAOverrideFlagMask } } // Options returns an NDPOptions of the options body. 
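//
// The returned options are typically walked with NDPOptions.Iter (defined in
// ndp_options.go); an illustrative sketch, where na is an NDPNeighborAdvert:
//
//	it, err := na.Options().Iter(true /* check */)
//	if err != nil {
//		return // malformed options
//	}
//	for {
//		opt, done, err := it.Next()
//		if done || err != nil {
//			break
//		}
//		_ = opt
//	}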
func (b NDPNeighborAdvert) Options() NDPOptions { return NDPOptions(b[ndpNAOptionsOffset:]) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/ndp_neighbor_solicit.go000066400000000000000000000034711465435605700265130ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import "gvisor.dev/gvisor/pkg/tcpip" // NDPNeighborSolicit is an NDP Neighbor Solicitation message. It will only // contain the body of an ICMPv6 packet. // // See RFC 4861 section 4.3 for more details. type NDPNeighborSolicit []byte const ( // NDPNSMinimumSize is the minimum size of a valid NDP Neighbor // Solicitation message (body of an ICMPv6 packet). NDPNSMinimumSize = 20 // ndpNSTargetAddessOffset is the start of the Target Address // field within an NDPNeighborSolicit. ndpNSTargetAddessOffset = 4 // ndpNSOptionsOffset is the start of the NDP options in an // NDPNeighborSolicit. ndpNSOptionsOffset = ndpNSTargetAddessOffset + IPv6AddressSize ) // TargetAddress returns the value within the Target Address field. func (b NDPNeighborSolicit) TargetAddress() tcpip.Address { return tcpip.AddrFrom16Slice(b[ndpNSTargetAddessOffset:][:IPv6AddressSize]) } // SetTargetAddress sets the value within the Target Address field. func (b NDPNeighborSolicit) SetTargetAddress(addr tcpip.Address) { copy(b[ndpNSTargetAddessOffset:][:IPv6AddressSize], addr.AsSlice()) } // Options returns an NDPOptions of the options body. func (b NDPNeighborSolicit) Options() NDPOptions { return NDPOptions(b[ndpNSOptionsOffset:]) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/ndp_options.go000066400000000000000000001120231465435605700246550ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "bytes" "encoding/binary" "errors" "fmt" "io" "math" "time" "gvisor.dev/gvisor/pkg/tcpip" ) // ndpOptionIdentifier is an NDP option type identifier. type ndpOptionIdentifier uint8 const ( // ndpSourceLinkLayerAddressOptionType is the type of the Source Link Layer // Address option, as per RFC 4861 section 4.6.1. ndpSourceLinkLayerAddressOptionType ndpOptionIdentifier = 1 // ndpTargetLinkLayerAddressOptionType is the type of the Target Link Layer // Address option, as per RFC 4861 section 4.6.1. ndpTargetLinkLayerAddressOptionType ndpOptionIdentifier = 2 // ndpPrefixInformationType is the type of the Prefix Information // option, as per RFC 4861 section 4.6.2. 
ndpPrefixInformationType ndpOptionIdentifier = 3 // ndpNonceOptionType is the type of the Nonce option, as per // RFC 3971 section 5.3.2. ndpNonceOptionType ndpOptionIdentifier = 14 // ndpRecursiveDNSServerOptionType is the type of the Recursive DNS // Server option, as per RFC 8106 section 5.1. ndpRecursiveDNSServerOptionType ndpOptionIdentifier = 25 // ndpDNSSearchListOptionType is the type of the DNS Search List option, // as per RFC 8106 section 5.2. ndpDNSSearchListOptionType ndpOptionIdentifier = 31 ) const ( // NDPLinkLayerAddressSize is the size of a Source or Target Link Layer // Address option for an Ethernet address. NDPLinkLayerAddressSize = 8 // ndpPrefixInformationLength is the expected length, in bytes, of the // body of an NDP Prefix Information option, as per RFC 4861 section // 4.6.2 which specifies that the Length field is 4. Given this, the // expected length, in bytes, is 30 because 4 * lengthByteUnits (8) - 2 // (Type & Length) = 30. ndpPrefixInformationLength = 30 // ndpPrefixInformationPrefixLengthOffset is the offset of the Prefix // Length field within an NDPPrefixInformation. ndpPrefixInformationPrefixLengthOffset = 0 // ndpPrefixInformationFlagsOffset is the offset of the flags byte // within an NDPPrefixInformation. ndpPrefixInformationFlagsOffset = 1 // ndpPrefixInformationOnLinkFlagMask is the mask of the On-Link Flag // field in the flags byte within an NDPPrefixInformation. ndpPrefixInformationOnLinkFlagMask = 1 << 7 // ndpPrefixInformationAutoAddrConfFlagMask is the mask of the // Autonomous Address-Configuration flag field in the flags byte within // an NDPPrefixInformation. ndpPrefixInformationAutoAddrConfFlagMask = 1 << 6 // ndpPrefixInformationReserved1FlagsMask is the mask of the Reserved1 // field in the flags byte within an NDPPrefixInformation. ndpPrefixInformationReserved1FlagsMask = 63 // ndpPrefixInformationValidLifetimeOffset is the start of the 4-byte // Valid Lifetime field within an NDPPrefixInformation. ndpPrefixInformationValidLifetimeOffset = 2 // ndpPrefixInformationPreferredLifetimeOffset is the start of the // 4-byte Preferred Lifetime field within an NDPPrefixInformation. ndpPrefixInformationPreferredLifetimeOffset = 6 // ndpPrefixInformationReserved2Offset is the start of the 4-byte // Reserved2 field within an NDPPrefixInformation. ndpPrefixInformationReserved2Offset = 10 // ndpPrefixInformationReserved2Length is the length of the Reserved2 // field. // // It is 4 bytes. ndpPrefixInformationReserved2Length = 4 // ndpPrefixInformationPrefixOffset is the start of the Prefix field // within an NDPPrefixInformation. ndpPrefixInformationPrefixOffset = 14 // ndpRecursiveDNSServerLifetimeOffset is the start of the 4-byte // Lifetime field within an NDPRecursiveDNSServer. ndpRecursiveDNSServerLifetimeOffset = 2 // ndpRecursiveDNSServerAddressesOffset is the start of the addresses // for IPv6 Recursive DNS Servers within an NDPRecursiveDNSServer. ndpRecursiveDNSServerAddressesOffset = 6 // minNDPRecursiveDNSServerLength is the minimum NDP Recursive DNS Server // option's body size when it contains at least one IPv6 address, as per // RFC 8106 section 5.3.1. minNDPRecursiveDNSServerBodySize = 22 // ndpDNSSearchListLifetimeOffset is the start of the 4-byte // Lifetime field within an NDPDNSSearchList. ndpDNSSearchListLifetimeOffset = 2 // ndpDNSSearchListDomainNamesOffset is the start of the DNS search list // domain names within an NDPDNSSearchList. 
ndpDNSSearchListDomainNamesOffset = 6 // minNDPDNSSearchListBodySize is the minimum NDP DNS Search List option's // body size when it contains at least one domain name, as per RFC 8106 // section 5.3.1. minNDPDNSSearchListBodySize = 14 // maxDomainNameLabelLength is the maximum length of a domain name // label, as per RFC 1035 section 3.1. maxDomainNameLabelLength = 63 // maxDomainNameLength is the maximum length of a domain name, including // label AND label length octet, as per RFC 1035 section 3.1. maxDomainNameLength = 255 // lengthByteUnits is the multiplier factor for the Length field of an // NDP option. That is, the length field for NDP options is in units of // 8 octets, as per RFC 4861 section 4.6. lengthByteUnits = 8 // NDPInfiniteLifetime is a value that represents infinity for the // 4-byte lifetime fields found in various NDP options. Its value is // (2^32 - 1)s = 4294967295s. NDPInfiniteLifetime = time.Second * math.MaxUint32 ) // NDPOptionIterator is an iterator of NDPOption. // // Note, between when an NDPOptionIterator is obtained and last used, no changes // to the NDPOptions may happen. Doing so may cause undefined and unexpected // behaviour. It is fine to obtain an NDPOptionIterator, iterate over the first // few NDPOption then modify the backing NDPOptions so long as the // NDPOptionIterator obtained before modification is no longer used. type NDPOptionIterator struct { opts *bytes.Buffer } // Potential errors when iterating over an NDPOptions. var ( ErrNDPOptMalformedBody = errors.New("NDP option has a malformed body") ErrNDPOptMalformedHeader = errors.New("NDP option has a malformed header") ) // Next returns the next element in the backing NDPOptions, or true if we are // done, or false if an error occurred. // // The return can be read as option, done, error. Note, option should only be // used if done is false and error is nil. func (i *NDPOptionIterator) Next() (NDPOption, bool, error) { for { // Do we still have elements to look at? if i.opts.Len() == 0 { return nil, true, nil } // Get the Type field. temp, err := i.opts.ReadByte() if err != nil { if err != io.EOF { // ReadByte should only ever return nil or io.EOF. panic(fmt.Sprintf("unexpected error when reading the option's Type field: %s", err)) } // We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected once // we start parsing an option; we expect the buffer to contain enough // bytes for the whole option. return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Type field: %w", io.ErrUnexpectedEOF) } kind := ndpOptionIdentifier(temp) // Get the Length field. length, err := i.opts.ReadByte() if err != nil { if err != io.EOF { panic(fmt.Sprintf("unexpected error when reading the option's Length field for %s: %s", kind, err)) } return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Length field for %s: %w", kind, io.ErrUnexpectedEOF) } // This would indicate an erroneous NDP option as the Length field should // never be 0. if length == 0 { return nil, true, fmt.Errorf("zero valued Length field for %s: %w", kind, ErrNDPOptMalformedHeader) } // Get the body. 
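// The Length field is in units of 8 octets and covers the whole option,
// including the Type and Length bytes already consumed above, so the
// remaining body is length*8 - 2 bytes (RFC 4861 section 4.6).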
numBytes := int(length) * lengthByteUnits numBodyBytes := numBytes - 2 body := i.opts.Next(numBodyBytes) if len(body) < numBodyBytes { return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Body for %s: %w", kind, io.ErrUnexpectedEOF) } switch kind { case ndpSourceLinkLayerAddressOptionType: return NDPSourceLinkLayerAddressOption(body), false, nil case ndpTargetLinkLayerAddressOptionType: return NDPTargetLinkLayerAddressOption(body), false, nil case ndpNonceOptionType: return NDPNonceOption(body), false, nil case ndpRouteInformationType: if numBodyBytes > ndpRouteInformationMaxLength { return nil, true, fmt.Errorf("got %d bytes for NDP Route Information option's body, expected at max %d bytes: %w", numBodyBytes, ndpRouteInformationMaxLength, ErrNDPOptMalformedBody) } opt := NDPRouteInformation(body) if err := opt.hasError(); err != nil { return nil, true, err } return opt, false, nil case ndpPrefixInformationType: // Make sure the length of a Prefix Information option // body is ndpPrefixInformationLength, as per RFC 4861 // section 4.6.2. if numBodyBytes != ndpPrefixInformationLength { return nil, true, fmt.Errorf("got %d bytes for NDP Prefix Information option's body, expected %d bytes: %w", numBodyBytes, ndpPrefixInformationLength, ErrNDPOptMalformedBody) } return NDPPrefixInformation(body), false, nil case ndpRecursiveDNSServerOptionType: opt := NDPRecursiveDNSServer(body) if err := opt.checkAddresses(); err != nil { return nil, true, err } return opt, false, nil case ndpDNSSearchListOptionType: opt := NDPDNSSearchList(body) if err := opt.checkDomainNames(); err != nil { return nil, true, err } return opt, false, nil default: // We do not yet recognize the option, just skip for // now. This is okay because RFC 4861 allows us to // skip/ignore any unrecognized options. However, // we MUST recognized all the options in RFC 4861. // // TODO(b/141487990): Handle all NDP options as defined // by RFC 4861. } } } // NDPOptions is a buffer of NDP options as defined by RFC 4861 section 4.6. type NDPOptions []byte // Iter returns an iterator of NDPOption. // // If check is true, Iter will do an integrity check on the options by iterating // over it and returning an error if detected. // // See NDPOptionIterator for more information. func (b NDPOptions) Iter(check bool) (NDPOptionIterator, error) { it := NDPOptionIterator{ opts: bytes.NewBuffer(b), } if check { it2 := NDPOptionIterator{ opts: bytes.NewBuffer(b), } for { if _, done, err := it2.Next(); err != nil || done { return it, err } } } return it, nil } // Serialize serializes the provided list of NDP options into b. // // Note, b must be of sufficient size to hold all the options in s. See // NDPOptionsSerializer.Length for details on the getting the total size // of a serialized NDPOptionsSerializer. // // Serialize may panic if b is not of sufficient size to hold all the options // in s. func (b NDPOptions) Serialize(s NDPOptionsSerializer) int { done := 0 for _, o := range s { l := paddedLength(o) if l == 0 { continue } b[0] = byte(o.kind()) // We know this safe because paddedLength would have returned // 0 if o had an invalid length (> 255 * lengthByteUnits). b[1] = uint8(l / lengthByteUnits) // Serialize NDP option body. used := o.serializeInto(b[2:]) // Zero out remaining (padding) bytes, if any exists. if used+2 < l { clear(b[used+2 : l]) } b = b[l:] done += l } return done } // NDPOption is the set of functions to be implemented by all NDP option types. 
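//
// For example (illustrative): an option whose length() is 6 occupies
// 6 + 2 (Type and Length) = 8 bytes on the wire, exactly one 8-octet unit, so
// paddedLength returns 8 and the wire Length field is serialized as 1; a
// 7-byte body would instead be padded up to 16 bytes (Length field = 2).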
type NDPOption interface { fmt.Stringer // kind returns the type of the receiver. kind() ndpOptionIdentifier // length returns the length of the body of the receiver, in bytes. length() int // serializeInto serializes the receiver into the provided byte // buffer. // // Note, the caller MUST provide a byte buffer with size of at least // Length. Implementers of this function may assume that the byte buffer // is of sufficient size. serializeInto MAY panic if the provided byte // buffer is not of sufficient size. // // serializeInto will return the number of bytes that was used to // serialize the receiver. Implementers must only use the number of // bytes required to serialize the receiver. Callers MAY provide a // larger buffer than required to serialize into. serializeInto([]byte) int } // paddedLength returns the length of o, in bytes, with any padding bytes, if // required. func paddedLength(o NDPOption) int { l := o.length() if l == 0 { return 0 } // Length excludes the 2 Type and Length bytes. l += 2 // Add extra bytes if needed to make sure the option is // lengthByteUnits-byte aligned. We do this by adding lengthByteUnits-1 // to l and then stripping off the last few LSBits from l. This will // make sure that l is rounded up to the nearest unit of // lengthByteUnits. This works since lengthByteUnits is a power of 2 // (= 8). mask := lengthByteUnits - 1 l += mask l &^= mask if l/lengthByteUnits > 255 { // Should never happen because an option can only have a max // value of 255 for its Length field, so just return 0 so this // option does not get serialized. // // Returning 0 here will make sure that this option does not get // serialized when NDPOptions.Serialize is called with the // NDPOptionsSerializer that holds this option, effectively // skipping this option during serialization. Also note that // a value of zero for the Length field in an NDP option is // invalid so this is another sign to the caller that this NDP // option is malformed, as per RFC 4861 section 4.6. return 0 } return l } // NDPOptionsSerializer is a serializer for NDP options. type NDPOptionsSerializer []NDPOption // Length returns the total number of bytes required to serialize. func (b NDPOptionsSerializer) Length() int { l := 0 for _, o := range b { l += paddedLength(o) } return l } // NDPNonceOption is the NDP Nonce Option as defined by RFC 3971 section 5.3.2. // // It is the first X bytes following the NDP option's Type and Length field // where X is the value in Length multiplied by lengthByteUnits - 2 bytes. type NDPNonceOption []byte // kind implements NDPOption. func (o NDPNonceOption) kind() ndpOptionIdentifier { return ndpNonceOptionType } // length implements NDPOption. func (o NDPNonceOption) length() int { return len(o) } // serializeInto implements NDPOption. func (o NDPNonceOption) serializeInto(b []byte) int { return copy(b, o) } // String implements fmt.Stringer. func (o NDPNonceOption) String() string { return fmt.Sprintf("%T(%x)", o, []byte(o)) } // Nonce returns the nonce value this option holds. func (o NDPNonceOption) Nonce() []byte { return o } // NDPSourceLinkLayerAddressOption is the NDP Source Link Layer Option // as defined by RFC 4861 section 4.6.1. // // It is the first X bytes following the NDP option's Type and Length field // where X is the value in Length multiplied by lengthByteUnits - 2 bytes. type NDPSourceLinkLayerAddressOption tcpip.LinkAddress // kind implements NDPOption. 
func (o NDPSourceLinkLayerAddressOption) kind() ndpOptionIdentifier { return ndpSourceLinkLayerAddressOptionType } // length implements NDPOption. func (o NDPSourceLinkLayerAddressOption) length() int { return len(o) } // serializeInto implements NDPOption. func (o NDPSourceLinkLayerAddressOption) serializeInto(b []byte) int { return copy(b, o) } // String implements fmt.Stringer. func (o NDPSourceLinkLayerAddressOption) String() string { return fmt.Sprintf("%T(%s)", o, tcpip.LinkAddress(o)) } // EthernetAddress will return an ethernet (MAC) address if the // NDPSourceLinkLayerAddressOption's body has at minimum EthernetAddressSize // bytes. If the body has more than EthernetAddressSize bytes, only the first // EthernetAddressSize bytes are returned as that is all that is needed for an // Ethernet address. func (o NDPSourceLinkLayerAddressOption) EthernetAddress() tcpip.LinkAddress { if len(o) >= EthernetAddressSize { return tcpip.LinkAddress(o[:EthernetAddressSize]) } return tcpip.LinkAddress([]byte(nil)) } // NDPTargetLinkLayerAddressOption is the NDP Target Link Layer Option // as defined by RFC 4861 section 4.6.1. // // It is the first X bytes following the NDP option's Type and Length field // where X is the value in Length multiplied by lengthByteUnits - 2 bytes. type NDPTargetLinkLayerAddressOption tcpip.LinkAddress // kind implements NDPOption. func (o NDPTargetLinkLayerAddressOption) kind() ndpOptionIdentifier { return ndpTargetLinkLayerAddressOptionType } // length implements NDPOption. func (o NDPTargetLinkLayerAddressOption) length() int { return len(o) } // serializeInto implements NDPOption. func (o NDPTargetLinkLayerAddressOption) serializeInto(b []byte) int { return copy(b, o) } // String implements fmt.Stringer. func (o NDPTargetLinkLayerAddressOption) String() string { return fmt.Sprintf("%T(%s)", o, tcpip.LinkAddress(o)) } // EthernetAddress will return an ethernet (MAC) address if the // NDPTargetLinkLayerAddressOption's body has at minimum EthernetAddressSize // bytes. If the body has more than EthernetAddressSize bytes, only the first // EthernetAddressSize bytes are returned as that is all that is needed for an // Ethernet address. func (o NDPTargetLinkLayerAddressOption) EthernetAddress() tcpip.LinkAddress { if len(o) >= EthernetAddressSize { return tcpip.LinkAddress(o[:EthernetAddressSize]) } return tcpip.LinkAddress([]byte(nil)) } // NDPPrefixInformation is the NDP Prefix Information option as defined by // RFC 4861 section 4.6.2. // // The length, in bytes, of a valid NDP Prefix Information option body MUST be // ndpPrefixInformationLength bytes. type NDPPrefixInformation []byte // kind implements NDPOption. func (o NDPPrefixInformation) kind() ndpOptionIdentifier { return ndpPrefixInformationType } // length implements NDPOption. func (o NDPPrefixInformation) length() int { return ndpPrefixInformationLength } // serializeInto implements NDPOption. func (o NDPPrefixInformation) serializeInto(b []byte) int { used := copy(b, o) // Zero out the Reserved1 field. b[ndpPrefixInformationFlagsOffset] &^= ndpPrefixInformationReserved1FlagsMask // Zero out the Reserved2 field. reserved2 := b[ndpPrefixInformationReserved2Offset:][:ndpPrefixInformationReserved2Length] clear(reserved2) return used } // String implements fmt.Stringer. 
func (o NDPPrefixInformation) String() string { return fmt.Sprintf("%T(O=%t, A=%t, PL=%s, VL=%s, Prefix=%s)", o, o.OnLinkFlag(), o.AutonomousAddressConfigurationFlag(), o.PreferredLifetime(), o.ValidLifetime(), o.Subnet()) } // PrefixLength returns the number of leading bits in the Prefix // that are valid. // // Valid values are in the range [0, 128], but o may not always contain valid // values. It is up to the caller to validate the Prefix Information option. func (o NDPPrefixInformation) PrefixLength() uint8 { return o[ndpPrefixInformationPrefixLengthOffset] } // OnLinkFlag returns true if the prefix is considered on-link. On-link means // that a forwarding node is not needed to send packets to other nodes on the // same prefix. // // Note, when this function returns false, no statement is made about the // on-link property of a prefix. That is, if OnLinkFlag returns false, the // caller MUST NOT conclude that the prefix is off-link and MUST NOT update any // previously stored state for this prefix about its on-link status. func (o NDPPrefixInformation) OnLinkFlag() bool { return o[ndpPrefixInformationFlagsOffset]&ndpPrefixInformationOnLinkFlagMask != 0 } // AutonomousAddressConfigurationFlag returns true if the prefix can be used for // Stateless Address Auto-Configuration (as specified in RFC 4862). func (o NDPPrefixInformation) AutonomousAddressConfigurationFlag() bool { return o[ndpPrefixInformationFlagsOffset]&ndpPrefixInformationAutoAddrConfFlagMask != 0 } // ValidLifetime returns the length of time that the prefix is valid for the // purpose of on-link determination. This value is relative to the send time of // the packet that the Prefix Information option was present in. // // Note, a value of 0 implies the prefix should not be considered as on-link, // and a value of infinity/forever is represented by // NDPInfiniteLifetime. func (o NDPPrefixInformation) ValidLifetime() time.Duration { // The field is the time in seconds, as per RFC 4861 section 4.6.2. return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpPrefixInformationValidLifetimeOffset:])) } // PreferredLifetime returns the length of time that an address generated from // the prefix via Stateless Address Auto-Configuration remains preferred. This // value is relative to the send time of the packet that the Prefix Information // option was present in. // // Note, a value of 0 implies that addresses generated from the prefix should // no longer remain preferred, and a value of infinity is represented by // NDPInfiniteLifetime. // // Also note that the value of this field MUST NOT exceed the Valid Lifetime // field to avoid preferring addresses that are no longer valid, for the // purpose of Stateless Address Auto-Configuration. func (o NDPPrefixInformation) PreferredLifetime() time.Duration { // The field is the time in seconds, as per RFC 4861 section 4.6.2. return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpPrefixInformationPreferredLifetimeOffset:])) } // Prefix returns an IPv6 address or a prefix of an IPv6 address. The Prefix // Length field (see NDPPrefixInformation.PrefixLength) contains the number // of valid leading bits in the prefix. // // Hosts SHOULD ignore an NDP Prefix Information option where the Prefix field // holds the link-local prefix (fe80::). 
func (o NDPPrefixInformation) Prefix() tcpip.Address { return tcpip.AddrFrom16Slice(o[ndpPrefixInformationPrefixOffset:][:IPv6AddressSize]) } // Subnet returns the Prefix field and Prefix Length field represented in a // tcpip.Subnet. func (o NDPPrefixInformation) Subnet() tcpip.Subnet { addrWithPrefix := tcpip.AddressWithPrefix{ Address: o.Prefix(), PrefixLen: int(o.PrefixLength()), } return addrWithPrefix.Subnet() } // NDPRecursiveDNSServer is the NDP Recursive DNS Server option, as defined by // RFC 8106 section 5.1. // // To make sure that the option meets its minimum length and does not end in the // middle of a DNS server's IPv6 address, the length of a valid // NDPRecursiveDNSServer must meet the following constraint: // // (Length - ndpRecursiveDNSServerAddressesOffset) % IPv6AddressSize == 0 type NDPRecursiveDNSServer []byte // Type returns the type of an NDP Recursive DNS Server option. // // kind implements NDPOption. func (NDPRecursiveDNSServer) kind() ndpOptionIdentifier { return ndpRecursiveDNSServerOptionType } // length implements NDPOption. func (o NDPRecursiveDNSServer) length() int { return len(o) } // serializeInto implements NDPOption. func (o NDPRecursiveDNSServer) serializeInto(b []byte) int { used := copy(b, o) // Zero out the reserved bytes that are before the Lifetime field. clear(b[0:ndpRecursiveDNSServerLifetimeOffset]) return used } // String implements fmt.Stringer. func (o NDPRecursiveDNSServer) String() string { lt := o.Lifetime() addrs, err := o.Addresses() if err != nil { return fmt.Sprintf("%T([] valid for %s; err = %s)", o, lt, err) } return fmt.Sprintf("%T(%s valid for %s)", o, addrs, lt) } // Lifetime returns the length of time that the DNS server addresses // in this option may be used for name resolution. // // Note, a value of 0 implies the addresses should no longer be used, // and a value of infinity/forever is represented by NDPInfiniteLifetime. // // Lifetime may panic if o does not have enough bytes to hold the Lifetime // field. func (o NDPRecursiveDNSServer) Lifetime() time.Duration { // The field is the time in seconds, as per RFC 8106 section 5.1. return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpRecursiveDNSServerLifetimeOffset:])) } // Addresses returns the recursive DNS server IPv6 addresses that may be // used for name resolution. // // Note, the addresses MAY be link-local addresses. func (o NDPRecursiveDNSServer) Addresses() ([]tcpip.Address, error) { var addrs []tcpip.Address return addrs, o.iterAddresses(func(addr tcpip.Address) { addrs = append(addrs, addr) }) } // checkAddresses iterates over the addresses in an NDP Recursive DNS Server // option and returns any error it encounters. func (o NDPRecursiveDNSServer) checkAddresses() error { return o.iterAddresses(nil) } // iterAddresses iterates over the addresses in an NDP Recursive DNS Server // option and calls a function with each valid unicast IPv6 address. // // Note, the addresses MAY be link-local addresses. 
func (o NDPRecursiveDNSServer) iterAddresses(fn func(tcpip.Address)) error { if l := len(o); l < minNDPRecursiveDNSServerBodySize { return fmt.Errorf("got %d bytes for NDP Recursive DNS Server option's body, expected at least %d bytes: %w", l, minNDPRecursiveDNSServerBodySize, io.ErrUnexpectedEOF) } o = o[ndpRecursiveDNSServerAddressesOffset:] l := len(o) if l%IPv6AddressSize != 0 { return fmt.Errorf("NDP Recursive DNS Server option's body ends in the middle of an IPv6 address (addresses body size = %d bytes): %w", l, ErrNDPOptMalformedBody) } for i := 0; len(o) != 0; i++ { addr := tcpip.AddrFrom16Slice(o[:IPv6AddressSize]) if !IsV6UnicastAddress(addr) { return fmt.Errorf("%d-th address (%s) in NDP Recursive DNS Server option is not a valid unicast IPv6 address: %w", i, addr, ErrNDPOptMalformedBody) } if fn != nil { fn(addr) } o = o[IPv6AddressSize:] } return nil } // NDPDNSSearchList is the NDP DNS Search List option, as defined by // RFC 8106 section 5.2. type NDPDNSSearchList []byte // kind implements NDPOption. func (o NDPDNSSearchList) kind() ndpOptionIdentifier { return ndpDNSSearchListOptionType } // length implements NDPOption. func (o NDPDNSSearchList) length() int { return len(o) } // serializeInto implements NDPOption. func (o NDPDNSSearchList) serializeInto(b []byte) int { used := copy(b, o) // Zero out the reserved bytes that are before the Lifetime field. clear(b[0:ndpDNSSearchListLifetimeOffset]) return used } // String implements fmt.Stringer. func (o NDPDNSSearchList) String() string { lt := o.Lifetime() domainNames, err := o.DomainNames() if err != nil { return fmt.Sprintf("%T([] valid for %s; err = %s)", o, lt, err) } return fmt.Sprintf("%T(%s valid for %s)", o, domainNames, lt) } // Lifetime returns the length of time that the DNS search list of domain names // in this option may be used for name resolution. // // Note, a value of 0 implies the domain names should no longer be used, // and a value of infinity/forever is represented by NDPInfiniteLifetime. func (o NDPDNSSearchList) Lifetime() time.Duration { // The field is the time in seconds, as per RFC 8106 section 5.1. return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpDNSSearchListLifetimeOffset:])) } // DomainNames returns a DNS search list of domain names. // // DomainNames will parse the backing buffer as outlined by RFC 1035 section // 3.1 and return a list of strings, with all domain names in lower case. func (o NDPDNSSearchList) DomainNames() ([]string, error) { var domainNames []string return domainNames, o.iterDomainNames(func(domainName string) { domainNames = append(domainNames, domainName) }) } // checkDomainNames iterates over the domain names in an NDP DNS Search List // option and returns any error it encounters. func (o NDPDNSSearchList) checkDomainNames() error { return o.iterDomainNames(nil) } // iterDomainNames iterates over the domain names in an NDP DNS Search List // option and calls a function with each valid domain name. func (o NDPDNSSearchList) iterDomainNames(fn func(string)) error { if l := len(o); l < minNDPDNSSearchListBodySize { return fmt.Errorf("got %d bytes for NDP DNS Search List option's body, expected at least %d bytes: %w", l, minNDPDNSSearchListBodySize, io.ErrUnexpectedEOF) } var searchList bytes.Reader searchList.Reset(o[ndpDNSSearchListDomainNamesOffset:]) var scratch [maxDomainNameLength]byte domainName := bytes.NewBuffer(scratch[:]) // Parse the domain names, as per RFC 1035 section 3.1. 
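// For example, a search list containing just "example.com" is encoded as the
// label-length-prefixed byte sequence
//
//	7 'e' 'x' 'a' 'm' 'p' 'l' 'e' 3 'c' 'o' 'm' 0
//
// where each label is preceded by its length and a zero-length label
// terminates the domain name.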
for searchList.Len() != 0 { domainName.Reset() // Parse a label within a domain name, as per RFC 1035 section 3.1. for { // The first byte is the label length. labelLenByte, err := searchList.ReadByte() if err != nil { if err != io.EOF { // ReadByte should only ever return nil or io.EOF. panic(fmt.Sprintf("unexpected error when reading a label's length: %s", err)) } // We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected // once we start parsing a domain name; we expect the buffer to contain // enough bytes for the whole domain name. return fmt.Errorf("unexpected exhausted buffer while parsing a new label for a domain from NDP Search List option: %w", io.ErrUnexpectedEOF) } labelLen := int(labelLenByte) // A zero-length label implies the end of a domain name. if labelLen == 0 { // If the domain name is empty or we have no callback function, do // nothing further with the current domain name. if domainName.Len() == 0 || fn == nil { break } // Ignore the trailing period in the parsed domain name. domainName.Truncate(domainName.Len() - 1) fn(domainName.String()) break } // The label's length must not exceed the maximum length for a label. if labelLen > maxDomainNameLabelLength { return fmt.Errorf("label length of %d bytes is greater than the max label length of %d bytes for an NDP Search List option: %w", labelLen, maxDomainNameLabelLength, ErrNDPOptMalformedBody) } // The label (and trailing period) must not make the domain name too long. if labelLen+1 > domainName.Cap()-domainName.Len() { return fmt.Errorf("label would make an NDP Search List option's domain name longer than the max domain name length of %d bytes: %w", maxDomainNameLength, ErrNDPOptMalformedBody) } // Copy the label and add a trailing period. for i := 0; i < labelLen; i++ { b, err := searchList.ReadByte() if err != nil { if err != io.EOF { panic(fmt.Sprintf("unexpected error when reading domain name's label: %s", err)) } return fmt.Errorf("read %d out of %d bytes for a domain name's label from NDP Search List option: %w", i, labelLen, io.ErrUnexpectedEOF) } // As per RFC 1035 section 2.3.1: // 1) the label must only contain ASCII include letters, digits and // hyphens // 2) the first character in a label must be a letter // 3) the last letter in a label must be a letter or digit if !isLetter(b) { if i == 0 { return fmt.Errorf("first character of a domain name's label in an NDP Search List option must be a letter, got character code = %d: %w", b, ErrNDPOptMalformedBody) } if b == '-' { if i == labelLen-1 { return fmt.Errorf("last character of a domain name's label in an NDP Search List option must not be a hyphen (-): %w", ErrNDPOptMalformedBody) } } else if !isDigit(b) { return fmt.Errorf("domain name's label in an NDP Search List option may only contain letters, digits and hyphens, got character code = %d: %w", b, ErrNDPOptMalformedBody) } } // If b is an upper case character, make it lower case. if isUpperLetter(b) { b = b - 'A' + 'a' } if err := domainName.WriteByte(b); err != nil { panic(fmt.Sprintf("unexpected error writing label to domain name buffer: %s", err)) } } if err := domainName.WriteByte('.'); err != nil { panic(fmt.Sprintf("unexpected error writing trailing period to domain name buffer: %s", err)) } } } return nil } func isLetter(b byte) bool { return b >= 'a' && b <= 'z' || isUpperLetter(b) } func isUpperLetter(b byte) bool { return b >= 'A' && b <= 'Z' } func isDigit(b byte) bool { return b >= '0' && b <= '9' } // As per RFC 4191 section 2.3, // // 2.3. 
Route Information Option // // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Type | Length | Prefix Length |Resvd|Prf|Resvd| // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Route Lifetime | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Prefix (Variable Length) | // . . // . . // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // // Fields: // // Type 24 // // // Length 8-bit unsigned integer. The length of the option // (including the Type and Length fields) in units of 8 // octets. The Length field is 1, 2, or 3 depending on the // Prefix Length. If Prefix Length is greater than 64, then // Length must be 3. If Prefix Length is greater than 0, // then Length must be 2 or 3. If Prefix Length is zero, // then Length must be 1, 2, or 3. const ( ndpRouteInformationType = ndpOptionIdentifier(24) ndpRouteInformationMaxLength = 22 ndpRouteInformationPrefixLengthIdx = 0 ndpRouteInformationFlagsIdx = 1 ndpRouteInformationPrfShift = 3 ndpRouteInformationPrfMask = 3 << ndpRouteInformationPrfShift ndpRouteInformationRouteLifetimeIdx = 2 ndpRouteInformationRoutePrefixIdx = 6 ) // NDPRouteInformation is the NDP Router Information option, as defined by // RFC 4191 section 2.3. type NDPRouteInformation []byte func (NDPRouteInformation) kind() ndpOptionIdentifier { return ndpRouteInformationType } func (o NDPRouteInformation) length() int { return len(o) } func (o NDPRouteInformation) serializeInto(b []byte) int { return copy(b, o) } // String implements fmt.Stringer. func (o NDPRouteInformation) String() string { return fmt.Sprintf("%T", o) } // PrefixLength returns the length of the prefix. func (o NDPRouteInformation) PrefixLength() uint8 { return o[ndpRouteInformationPrefixLengthIdx] } // RoutePreference returns the preference of the route over other routes to the // same destination but through a different router. func (o NDPRouteInformation) RoutePreference() NDPRoutePreference { return NDPRoutePreference((o[ndpRouteInformationFlagsIdx] & ndpRouteInformationPrfMask) >> ndpRouteInformationPrfShift) } // RouteLifetime returns the lifetime of the route. // // Note, a value of 0 implies the route is now invalid and a value of // infinity/forever is represented by NDPInfiniteLifetime. func (o NDPRouteInformation) RouteLifetime() time.Duration { return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpRouteInformationRouteLifetimeIdx:])) } // Prefix returns the prefix of the destination subnet this route is for. 
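// As an illustrative sketch, a hypothetical helper (not part of this package)
// could turn a validated Route Information option into a route entry using the
// accessors on this type:
//
//	func routeFromOption(opt NDPRouteInformation) (tcpip.Subnet, NDPRoutePreference, time.Duration, error) {
//		subnet, err := opt.Prefix()
//		if err != nil {
//			return tcpip.Subnet{}, 0, 0, err
//		}
//		return subnet, opt.RoutePreference(), opt.RouteLifetime(), nil
//	}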
func (o NDPRouteInformation) Prefix() (tcpip.Subnet, error) { prefixLength := int(o.PrefixLength()) if max := IPv6AddressSize * 8; prefixLength > max { return tcpip.Subnet{}, fmt.Errorf("got prefix length = %d, want <= %d", prefixLength, max) } prefix := o[ndpRouteInformationRoutePrefixIdx:] var addrBytes [IPv6AddressSize]byte if n := copy(addrBytes[:], prefix); n != len(prefix) { panic(fmt.Sprintf("got copy(addrBytes, prefix) = %d, want = %d", n, len(prefix))) } return tcpip.AddressWithPrefix{ Address: tcpip.AddrFrom16(addrBytes), PrefixLen: prefixLength, }.Subnet(), nil } func (o NDPRouteInformation) hasError() error { l := len(o) if l < ndpRouteInformationRoutePrefixIdx { return fmt.Errorf("%T too small, got = %d bytes: %w", o, l, ErrNDPOptMalformedBody) } prefixLength := int(o.PrefixLength()) if max := IPv6AddressSize * 8; prefixLength > max { return fmt.Errorf("got prefix length = %d, want <= %d: %w", prefixLength, max, ErrNDPOptMalformedBody) } // Length 8-bit unsigned integer. The length of the option // (including the Type and Length fields) in units of 8 // octets. The Length field is 1, 2, or 3 depending on the // Prefix Length. If Prefix Length is greater than 64, then // Length must be 3. If Prefix Length is greater than 0, // then Length must be 2 or 3. If Prefix Length is zero, // then Length must be 1, 2, or 3. l += 2 // Add 2 bytes for the type and length bytes. lengthField := l / lengthByteUnits if prefixLength > 64 { if lengthField != 3 { return fmt.Errorf("Length field must be 3 when Prefix Length (%d) is > 64 (got = %d): %w", prefixLength, lengthField, ErrNDPOptMalformedBody) } } else if prefixLength > 0 { if lengthField != 2 && lengthField != 3 { return fmt.Errorf("Length field must be 2 or 3 when Prefix Length (%d) is between 0 and 64 (got = %d): %w", prefixLength, lengthField, ErrNDPOptMalformedBody) } } else if lengthField == 0 || lengthField > 3 { return fmt.Errorf("Length field must be 1, 2, or 3 when Prefix Length is zero (got = %d): %w", lengthField, ErrNDPOptMalformedBody) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/ndp_router_advert.go000066400000000000000000000165321465435605700260570ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "encoding/binary" "fmt" "time" ) var _ fmt.Stringer = NDPRoutePreference(0) // NDPRoutePreference is the preference values for default routers or // more-specific routes. // // As per RFC 4191 section 2.1, // // Default router preferences and preferences for more-specific routes // are encoded the same way. // // Preference values are encoded as a two-bit signed integer, as // follows: // // 01 High // 00 Medium (default) // 11 Low // 10 Reserved - MUST NOT be sent // // Note that implementations can treat the value as a two-bit signed // integer. // // Having just three values reinforces that they are not metrics and // more values do not appear to be necessary for reasonable scenarios. 
type NDPRoutePreference uint8 const ( // HighRoutePreference indicates a high preference, as per // RFC 4191 section 2.1. HighRoutePreference NDPRoutePreference = 0b01 // MediumRoutePreference indicates a medium preference, as per // RFC 4191 section 2.1. // // This is the default preference value. MediumRoutePreference = 0b00 // LowRoutePreference indicates a low preference, as per // RFC 4191 section 2.1. LowRoutePreference = 0b11 // ReservedRoutePreference is a reserved preference value, as per // RFC 4191 section 2.1. // // It MUST NOT be sent. ReservedRoutePreference = 0b10 ) // String implements fmt.Stringer. func (p NDPRoutePreference) String() string { switch p { case HighRoutePreference: return "HighRoutePreference" case MediumRoutePreference: return "MediumRoutePreference" case LowRoutePreference: return "LowRoutePreference" case ReservedRoutePreference: return "ReservedRoutePreference" default: return fmt.Sprintf("NDPRoutePreference(%d)", p) } } // NDPRouterAdvert is an NDP Router Advertisement message. It will only contain // the body of an ICMPv6 packet. // // See RFC 4861 section 4.2 and RFC 4191 section 2.2 for more details. type NDPRouterAdvert []byte // As per RFC 4191 section 2.2, // // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Type | Code | Checksum | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Cur Hop Limit |M|O|H|Prf|Resvd| Router Lifetime | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Reachable Time | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Retrans Timer | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Options ... // +-+-+-+-+-+-+-+-+-+-+-+- const ( // NDPRAMinimumSize is the minimum size of a valid NDP Router // Advertisement message (body of an ICMPv6 packet). NDPRAMinimumSize = 12 // ndpRACurrHopLimitOffset is the byte of the Curr Hop Limit field // within an NDPRouterAdvert. ndpRACurrHopLimitOffset = 0 // ndpRAFlagsOffset is the byte with the NDP RA bit-fields/flags // within an NDPRouterAdvert. ndpRAFlagsOffset = 1 // ndpRAManagedAddrConfFlagMask is the mask of the Managed Address // Configuration flag within the bit-field/flags byte of an // NDPRouterAdvert. ndpRAManagedAddrConfFlagMask = (1 << 7) // ndpRAOtherConfFlagMask is the mask of the Other Configuration flag // within the bit-field/flags byte of an NDPRouterAdvert. ndpRAOtherConfFlagMask = (1 << 6) // ndpDefaultRouterPreferenceShift is the shift of the Prf (Default Router // Preference) field within the flags byte of an NDPRouterAdvert. ndpDefaultRouterPreferenceShift = 3 // ndpDefaultRouterPreferenceMask is the mask of the Prf (Default Router // Preference) field within the flags byte of an NDPRouterAdvert. ndpDefaultRouterPreferenceMask = (0b11 << ndpDefaultRouterPreferenceShift) // ndpRARouterLifetimeOffset is the start of the 2-byte Router Lifetime // field within an NDPRouterAdvert. ndpRARouterLifetimeOffset = 2 // ndpRAReachableTimeOffset is the start of the 4-byte Reachable Time // field within an NDPRouterAdvert. ndpRAReachableTimeOffset = 4 // ndpRARetransTimerOffset is the start of the 4-byte Retrans Timer // field within an NDPRouterAdvert. ndpRARetransTimerOffset = 8 // ndpRAOptionsOffset is the start of the NDP options in an // NDPRouterAdvert. ndpRAOptionsOffset = 12 ) // CurrHopLimit returns the value of the Curr Hop Limit field. 
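// As a worked example of the masks and shifts above, a flags byte of
// 0b01001000 decodes as: Managed Address Configuration (M, bit 7) clear,
// Other Configuration (O, bit 6) set, and a Default Router Preference
// (Prf, bits 4-3) of 0b01, i.e. HighRoutePreference.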
func (b NDPRouterAdvert) CurrHopLimit() uint8 { return b[ndpRACurrHopLimitOffset] } // ManagedAddrConfFlag returns the value of the Managed Address Configuration // flag. func (b NDPRouterAdvert) ManagedAddrConfFlag() bool { return b[ndpRAFlagsOffset]&ndpRAManagedAddrConfFlagMask != 0 } // OtherConfFlag returns the value of the Other Configuration flag. func (b NDPRouterAdvert) OtherConfFlag() bool { return b[ndpRAFlagsOffset]&ndpRAOtherConfFlagMask != 0 } // DefaultRouterPreference returns the Default Router Preference field. func (b NDPRouterAdvert) DefaultRouterPreference() NDPRoutePreference { return NDPRoutePreference((b[ndpRAFlagsOffset] & ndpDefaultRouterPreferenceMask) >> ndpDefaultRouterPreferenceShift) } // RouterLifetime returns the lifetime associated with the default router. A // value of 0 means the source of the Router Advertisement is not a default // router and SHOULD NOT appear on the default router list. Note, a value of 0 // only means that the router should not be used as a default router, it does // not apply to other information contained in the Router Advertisement. func (b NDPRouterAdvert) RouterLifetime() time.Duration { // The field is the time in seconds, as per RFC 4861 section 4.2. return time.Second * time.Duration(binary.BigEndian.Uint16(b[ndpRARouterLifetimeOffset:])) } // ReachableTime returns the time that a node assumes a neighbor is reachable // after having received a reachability confirmation. A value of 0 means // that it is unspecified by the source of the Router Advertisement message. func (b NDPRouterAdvert) ReachableTime() time.Duration { // The field is the time in milliseconds, as per RFC 4861 section 4.2. return time.Millisecond * time.Duration(binary.BigEndian.Uint32(b[ndpRAReachableTimeOffset:])) } // RetransTimer returns the time between retransmitted Neighbor Solicitation // messages. A value of 0 means that it is unspecified by the source of the // Router Advertisement message. func (b NDPRouterAdvert) RetransTimer() time.Duration { // The field is the time in milliseconds, as per RFC 4861 section 4.2. return time.Millisecond * time.Duration(binary.BigEndian.Uint32(b[ndpRARetransTimerOffset:])) } // Options returns an NDPOptions of the options body. func (b NDPRouterAdvert) Options() NDPOptions { return NDPOptions(b[ndpRAOptionsOffset:]) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/ndp_router_solicit.go000066400000000000000000000022721465435605700262340ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header // NDPRouterSolicit is an NDP Router Solicitation message. It will only contain // the body of an ICMPv6 packet. // // See RFC 4861 section 4.1 for more details. type NDPRouterSolicit []byte const ( // NDPRSMinimumSize is the minimum size of a valid NDP Router // Solicitation message (body of an ICMPv6 packet). NDPRSMinimumSize = 4 // ndpRSOptionsOffset is the start of the NDP options in an // NDPRouterSolicit. 
ndpRSOptionsOffset = 4 ) // Options returns an NDPOptions of the options body. func (b NDPRouterSolicit) Options() NDPOptions { return NDPOptions(b[ndpRSOptionsOffset:]) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/ndpoptionidentifier_string.go000066400000000000000000000036301465435605700277670ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by "stringer -type ndpOptionIdentifier"; DO NOT EDIT. package header import "strconv" func _() { // An "invalid array index" compiler error signifies that the constant values have changed. // Re-run the stringer command to generate them again. var x [1]struct{} _ = x[ndpSourceLinkLayerAddressOptionType-1] _ = x[ndpTargetLinkLayerAddressOptionType-2] _ = x[ndpPrefixInformationType-3] _ = x[ndpNonceOptionType-14] _ = x[ndpRecursiveDNSServerOptionType-25] _ = x[ndpDNSSearchListOptionType-31] } const ( _ndpOptionIdentifier_name_0 = "ndpSourceLinkLayerAddressOptionTypendpTargetLinkLayerAddressOptionTypendpPrefixInformationType" _ndpOptionIdentifier_name_1 = "ndpNonceOptionType" _ndpOptionIdentifier_name_2 = "ndpRecursiveDNSServerOptionType" _ndpOptionIdentifier_name_3 = "ndpDNSSearchListOptionType" ) var ( _ndpOptionIdentifier_index_0 = [...]uint8{0, 35, 70, 94} ) func (i ndpOptionIdentifier) String() string { switch { case 1 <= i && i <= 3: i -= 1 return _ndpOptionIdentifier_name_0[_ndpOptionIdentifier_index_0[i]:_ndpOptionIdentifier_index_0[i+1]] case i == 14: return _ndpOptionIdentifier_name_1 case i == 25: return _ndpOptionIdentifier_name_2 case i == 31: return _ndpOptionIdentifier_name_3 default: return "ndpOptionIdentifier(" + strconv.FormatInt(int64(i), 10) + ")" } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/parse/000077500000000000000000000000001465435605700231055ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/parse/parse.go000066400000000000000000000172001465435605700245460ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package parse provides utilities to parse packets. package parse import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // ARP populates pkt's network header with an ARP header found in // pkt.Data. // // Returns true if the header was successfully parsed. 
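// As an illustrative sketch (not part of this package's API), the helpers in
// this package compose naturally when parsing an inbound packet; for example,
// an IPv4/TCP packet held in a stack.PacketBuffer could be parsed as:
//
//	func parseIPv4TCP(pkt *stack.PacketBuffer) bool {
//		if !IPv4(pkt) {
//			return false
//		}
//		return TCP(pkt)
//	}
//
// A real caller would normally consult the parsed IPv4 header's protocol
// field before deciding which transport parser to invoke.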
func ARP(pkt *stack.PacketBuffer) bool { _, ok := pkt.NetworkHeader().Consume(header.ARPSize) if ok { pkt.NetworkProtocolNumber = header.ARPProtocolNumber } return ok } // IPv4 parses an IPv4 packet found in pkt.Data and populates pkt's network // header with the IPv4 header. // // Returns true if the header was successfully parsed. func IPv4(pkt *stack.PacketBuffer) bool { hdr, ok := pkt.Data().PullUp(header.IPv4MinimumSize) if !ok { return false } ipHdr := header.IPv4(hdr) // Header may have options, determine the true header length. headerLen := int(ipHdr.HeaderLength()) if headerLen < header.IPv4MinimumSize { // TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in // order for the packet to be valid. Figure out if we want to reject this // case. headerLen = header.IPv4MinimumSize } hdr, ok = pkt.NetworkHeader().Consume(headerLen) if !ok { return false } ipHdr = header.IPv4(hdr) length := int(ipHdr.TotalLength()) - len(hdr) if length < 0 { return false } pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber pkt.Data().CapLength(length) return true } // IPv6 parses an IPv6 packet found in pkt.Data and populates pkt's network // header with the IPv6 header. func IPv6(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, fragID uint32, fragOffset uint16, fragMore bool, ok bool) { hdr, ok := pkt.Data().PullUp(header.IPv6MinimumSize) if !ok { return 0, 0, 0, false, false } ipHdr := header.IPv6(hdr) // Create a VV to parse the packet. We don't plan to modify anything here. // dataVV consists of: // - Any IPv6 header bytes after the first 40 (i.e. extensions). // - The transport header, if present. // - Any other payload data. dataBuf := pkt.Data().ToBuffer() dataBuf.TrimFront(header.IPv6MinimumSize) it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataBuf) defer it.Release() // Iterate over the IPv6 extensions to find their length. var nextHdr tcpip.TransportProtocolNumber var extensionsSize int64 traverseExtensions: for { extHdr, done, err := it.Next() if err != nil { break } // If we exhaust the extension list, the entire packet is the IPv6 header // and (possibly) extensions. if done { extensionsSize = dataBuf.Size() break } switch extHdr := extHdr.(type) { case header.IPv6FragmentExtHdr: if extHdr.IsAtomic() { // This fragment extension header indicates that this packet is an // atomic fragment. An atomic fragment is a fragment that contains // all the data required to reassemble a full packet. As per RFC 6946, // atomic fragments must not interfere with "normal" fragmented traffic // so we skip processing the fragment instead of feeding it through the // reassembly process below. continue } if fragID == 0 && fragOffset == 0 && !fragMore { fragID = extHdr.ID() fragOffset = extHdr.FragmentOffset() fragMore = extHdr.More() } rawPayload := it.AsRawHeader(true /* consume */) extensionsSize = dataBuf.Size() - rawPayload.Buf.Size() rawPayload.Release() extHdr.Release() break traverseExtensions case header.IPv6RawPayloadHeader: // We've found the payload after any extensions. extensionsSize = dataBuf.Size() - extHdr.Buf.Size() nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier) extHdr.Release() break traverseExtensions default: extHdr.Release() // Any other extension is a no-op, keep looping until we find the payload. } } // Put the IPv6 header with extensions in pkt.NetworkHeader(). 
hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + int(extensionsSize)) if !ok { panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data().Size())) } ipHdr = header.IPv6(hdr) pkt.Data().CapLength(int(ipHdr.PayloadLength())) pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber return nextHdr, fragID, fragOffset, fragMore, true } // UDP parses a UDP packet found in pkt.Data and populates pkt's transport // header with the UDP header. // // Returns true if the header was successfully parsed. func UDP(pkt *stack.PacketBuffer) bool { _, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize) pkt.TransportProtocolNumber = header.UDPProtocolNumber return ok } // TCP parses a TCP packet found in pkt.Data and populates pkt's transport // header with the TCP header. // // Returns true if the header was successfully parsed. func TCP(pkt *stack.PacketBuffer) bool { // TCP header is variable length, peek at it first. hdrLen := header.TCPMinimumSize hdr, ok := pkt.Data().PullUp(hdrLen) if !ok { return false } // If the header has options, pull those up as well. if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data().Size() { // TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of // packets. hdrLen = offset } _, ok = pkt.TransportHeader().Consume(hdrLen) pkt.TransportProtocolNumber = header.TCPProtocolNumber return ok } // ICMPv4 populates the packet buffer's transport header with an ICMPv4 header, // if present. // // Returns true if an ICMPv4 header was successfully parsed. func ICMPv4(pkt *stack.PacketBuffer) bool { if _, ok := pkt.TransportHeader().Consume(header.ICMPv4MinimumSize); ok { pkt.TransportProtocolNumber = header.ICMPv4ProtocolNumber return true } return false } // ICMPv6 populates the packet buffer's transport header with an ICMPv4 header, // if present. // // Returns true if an ICMPv6 header was successfully parsed. func ICMPv6(pkt *stack.PacketBuffer) bool { hdr, ok := pkt.Data().PullUp(header.ICMPv6MinimumSize) if !ok { return false } h := header.ICMPv6(hdr) switch h.Type() { case header.ICMPv6RouterSolicit, header.ICMPv6RouterAdvert, header.ICMPv6NeighborSolicit, header.ICMPv6NeighborAdvert, header.ICMPv6RedirectMsg, header.ICMPv6MulticastListenerQuery, header.ICMPv6MulticastListenerReport, header.ICMPv6MulticastListenerV2Report, header.ICMPv6MulticastListenerDone: size := pkt.Data().Size() if _, ok := pkt.TransportHeader().Consume(size); !ok { panic(fmt.Sprintf("expected to consume the full data of size = %d bytes into transport header", size)) } case header.ICMPv6DstUnreachable, header.ICMPv6PacketTooBig, header.ICMPv6TimeExceeded, header.ICMPv6ParamProblem, header.ICMPv6EchoRequest, header.ICMPv6EchoReply: fallthrough default: if _, ok := pkt.TransportHeader().Consume(header.ICMPv6MinimumSize); !ok { // Checked above if the packet buffer holds at least the minimum size for // an ICMPv6 packet. panic(fmt.Sprintf("expected to consume %d bytes", header.ICMPv6MinimumSize)) } } pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber return true } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/parse/parse_state_autogen.go000066400000000000000000000000671465435605700274730ustar00rootroot00000000000000// automatically generated by stateify. package parse golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/tcp.go000066400000000000000000000535331465435605700231210ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "encoding/binary" "github.com/google/btree" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checksum" "gvisor.dev/gvisor/pkg/tcpip/seqnum" ) // These constants are the offsets of the respective fields in the TCP header. const ( TCPSrcPortOffset = 0 TCPDstPortOffset = 2 TCPSeqNumOffset = 4 TCPAckNumOffset = 8 TCPDataOffset = 12 TCPFlagsOffset = 13 TCPWinSizeOffset = 14 TCPChecksumOffset = 16 TCPUrgentPtrOffset = 18 ) const ( // MaxWndScale is maximum allowed window scaling, as described in // RFC 1323, section 2.3, page 11. MaxWndScale = 14 // TCPMaxSACKBlocks is the maximum number of SACK blocks that can // be encoded in a TCP option field. TCPMaxSACKBlocks = 4 ) // TCPFlags is the dedicated type for TCP flags. type TCPFlags uint8 // Intersects returns true iff there are flags common to both f and o. func (f TCPFlags) Intersects(o TCPFlags) bool { return f&o != 0 } // Contains returns true iff all the flags in o are contained within f. func (f TCPFlags) Contains(o TCPFlags) bool { return f&o == o } // String implements Stringer.String. func (f TCPFlags) String() string { flagsStr := []byte("FSRPAUEC") for i := range flagsStr { if f&(1<> 4) * 4 } // Payload returns the data in the TCP packet. func (b TCP) Payload() []byte { return b[b.DataOffset():] } // Flags returns the flags field of the TCP header. func (b TCP) Flags() TCPFlags { return TCPFlags(b[TCPFlagsOffset]) } // WindowSize returns the "window size" field of the TCP header. func (b TCP) WindowSize() uint16 { return binary.BigEndian.Uint16(b[TCPWinSizeOffset:]) } // Checksum returns the "checksum" field of the TCP header. func (b TCP) Checksum() uint16 { return binary.BigEndian.Uint16(b[TCPChecksumOffset:]) } // UrgentPointer returns the "urgent pointer" field of the TCP header. func (b TCP) UrgentPointer() uint16 { return binary.BigEndian.Uint16(b[TCPUrgentPtrOffset:]) } // SetSourcePort sets the "source port" field of the TCP header. func (b TCP) SetSourcePort(port uint16) { binary.BigEndian.PutUint16(b[TCPSrcPortOffset:], port) } // SetDestinationPort sets the "destination port" field of the TCP header. func (b TCP) SetDestinationPort(port uint16) { binary.BigEndian.PutUint16(b[TCPDstPortOffset:], port) } // SetChecksum sets the checksum field of the TCP header. func (b TCP) SetChecksum(xsum uint16) { checksum.Put(b[TCPChecksumOffset:], xsum) } // SetDataOffset sets the data offset field of the TCP header. headerLen should // be the length of the TCP header in bytes. func (b TCP) SetDataOffset(headerLen uint8) { b[TCPDataOffset] = (headerLen / 4) << 4 } // SetSequenceNumber sets the sequence number field of the TCP header. func (b TCP) SetSequenceNumber(seqNum uint32) { binary.BigEndian.PutUint32(b[TCPSeqNumOffset:], seqNum) } // SetAckNumber sets the ack number field of the TCP header. func (b TCP) SetAckNumber(ackNum uint32) { binary.BigEndian.PutUint32(b[TCPAckNumOffset:], ackNum) } // SetFlags sets the flags field of the TCP header. 
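// As an illustrative sketch, assuming the TCPFlag* constants defined elsewhere
// in this package (e.g. TCPFlagSyn, TCPFlagAck, TCPFlagFin, TCPFlagRst), the
// helpers on TCPFlags behave as follows for a SYN-ACK segment:
//
//	flags := TCPFlagSyn | TCPFlagAck
//	flags.Contains(TCPFlagAck)                // true: the ACK bit is set
//	flags.Contains(TCPFlagFin)                // false
//	flags.Intersects(TCPFlagSyn | TCPFlagRst) // true: the SYN bit is shared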
func (b TCP) SetFlags(flags uint8) { b[TCPFlagsOffset] = flags } // SetWindowSize sets the window size field of the TCP header. func (b TCP) SetWindowSize(rcvwnd uint16) { binary.BigEndian.PutUint16(b[TCPWinSizeOffset:], rcvwnd) } // SetUrgentPointer sets the window size field of the TCP header. func (b TCP) SetUrgentPointer(urgentPointer uint16) { binary.BigEndian.PutUint16(b[TCPUrgentPtrOffset:], urgentPointer) } // CalculateChecksum calculates the checksum of the TCP segment. // partialChecksum is the checksum of the network-layer pseudo-header // and the checksum of the segment data. func (b TCP) CalculateChecksum(partialChecksum uint16) uint16 { // Calculate the rest of the checksum. return checksum.Checksum(b[:b.DataOffset()], partialChecksum) } // IsChecksumValid returns true iff the TCP header's checksum is valid. func (b TCP) IsChecksumValid(src, dst tcpip.Address, payloadChecksum, payloadLength uint16) bool { xsum := PseudoHeaderChecksum(TCPProtocolNumber, src, dst, uint16(b.DataOffset())+payloadLength) xsum = checksum.Combine(xsum, payloadChecksum) return b.CalculateChecksum(xsum) == 0xffff } // Options returns a slice that holds the unparsed TCP options in the segment. func (b TCP) Options() []byte { return b[TCPMinimumSize:b.DataOffset()] } // ParsedOptions returns a TCPOptions structure which parses and caches the TCP // option values in the TCP segment. NOTE: Invoking this function repeatedly is // expensive as it reparses the options on each invocation. func (b TCP) ParsedOptions() TCPOptions { return ParseTCPOptions(b.Options()) } func (b TCP) encodeSubset(seq, ack uint32, flags TCPFlags, rcvwnd uint16) { binary.BigEndian.PutUint32(b[TCPSeqNumOffset:], seq) binary.BigEndian.PutUint32(b[TCPAckNumOffset:], ack) b[TCPFlagsOffset] = uint8(flags) binary.BigEndian.PutUint16(b[TCPWinSizeOffset:], rcvwnd) } // Encode encodes all the fields of the TCP header. func (b TCP) Encode(t *TCPFields) { b.encodeSubset(t.SeqNum, t.AckNum, t.Flags, t.WindowSize) b.SetSourcePort(t.SrcPort) b.SetDestinationPort(t.DstPort) b.SetDataOffset(t.DataOffset) b.SetChecksum(t.Checksum) b.SetUrgentPointer(t.UrgentPointer) } // EncodePartial updates a subset of the fields of the TCP header. It is useful // in cases when similar segments are produced. func (b TCP) EncodePartial(partialChecksum, length uint16, seqnum, acknum uint32, flags TCPFlags, rcvwnd uint16) { // Add the total length and "flags" field contributions to the checksum. // We don't use the flags field directly from the header because it's a // one-byte field with an odd offset, so it would be accounted for // incorrectly by the Checksum routine. tmp := make([]byte, 4) binary.BigEndian.PutUint16(tmp, length) binary.BigEndian.PutUint16(tmp[2:], uint16(flags)) xsum := checksum.Checksum(tmp, partialChecksum) // Encode the passed-in fields. b.encodeSubset(seqnum, acknum, flags, rcvwnd) // Add the contributions of the passed-in fields to the checksum. xsum = checksum.Checksum(b[TCPSeqNumOffset:TCPSeqNumOffset+8], xsum) xsum = checksum.Checksum(b[TCPWinSizeOffset:TCPWinSizeOffset+2], xsum) // Encode the checksum. b.SetChecksum(^xsum) } // SetSourcePortWithChecksumUpdate implements ChecksummableTransport. func (b TCP) SetSourcePortWithChecksumUpdate(new uint16) { old := b.SourcePort() b.SetSourcePort(new) b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) } // SetDestinationPortWithChecksumUpdate implements ChecksummableTransport. 
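// As an illustrative sketch, encoding a minimal SYN header into a fresh
// TCPMinimumSize-byte buffer with Encode might look like the following
// (TCPFlagSyn is assumed to be the SYN flag constant from this package):
//
//	b := make([]byte, TCPMinimumSize)
//	TCP(b).Encode(&TCPFields{
//		SrcPort:    1234,
//		DstPort:    80,
//		SeqNum:     1,
//		DataOffset: TCPMinimumSize,
//		Flags:      TCPFlagSyn,
//		WindowSize: 65535,
//	})
//
// The Checksum field is typically filled in afterwards via CalculateChecksum
// and SetChecksum once the pseudo-header and payload checksums are known.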
func (b TCP) SetDestinationPortWithChecksumUpdate(new uint16) { old := b.DestinationPort() b.SetDestinationPort(new) b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) } // UpdateChecksumPseudoHeaderAddress implements ChecksummableTransport. func (b TCP) UpdateChecksumPseudoHeaderAddress(old, new tcpip.Address, fullChecksum bool) { xsum := b.Checksum() if fullChecksum { xsum = ^xsum } xsum = checksumUpdate2ByteAlignedAddress(xsum, old, new) if fullChecksum { xsum = ^xsum } b.SetChecksum(xsum) } // ParseSynOptions parses the options received in a SYN segment and returns the // relevant ones. opts should point to the option part of the TCP header. func ParseSynOptions(opts []byte, isAck bool) TCPSynOptions { limit := len(opts) synOpts := TCPSynOptions{ // Per RFC 1122, page 85: "If an MSS option is not received at // connection setup, TCP MUST assume a default send MSS of 536." MSS: TCPDefaultMSS, // If no window scale option is specified, WS in options is // returned as -1; this is because the absence of the option // indicates that the we cannot use window scaling on the // receive end either. WS: -1, } for i := 0; i < limit; { switch opts[i] { case TCPOptionEOL: i = limit case TCPOptionNOP: i++ case TCPOptionMSS: if i+4 > limit || opts[i+1] != 4 { return synOpts } mss := uint16(opts[i+2])<<8 | uint16(opts[i+3]) if mss == 0 { return synOpts } synOpts.MSS = mss if mss < TCPMinimumSendMSS { synOpts.MSS = TCPMinimumSendMSS } i += 4 case TCPOptionWS: if i+3 > limit || opts[i+1] != 3 { return synOpts } ws := int(opts[i+2]) if ws > MaxWndScale { ws = MaxWndScale } synOpts.WS = ws i += 3 case TCPOptionTS: if i+10 > limit || opts[i+1] != 10 { return synOpts } synOpts.TSVal = binary.BigEndian.Uint32(opts[i+2:]) if isAck { // If the segment is a SYN-ACK then store the Timestamp Echo Reply // in the segment. synOpts.TSEcr = binary.BigEndian.Uint32(opts[i+6:]) } synOpts.TS = true i += 10 case TCPOptionSACKPermitted: if i+2 > limit || opts[i+1] != 2 { return synOpts } synOpts.SACKPermitted = true i += 2 default: // We don't recognize this option, just skip over it. if i+2 > limit { return synOpts } l := int(opts[i+1]) // If the length is incorrect or if l+i overflows the // total options length then return false. if l < 2 || i+l > limit { return synOpts } i += l } } return synOpts } // ParseTCPOptions extracts and stores all known options in the provided byte // slice in a TCPOptions structure. func ParseTCPOptions(b []byte) TCPOptions { opts := TCPOptions{} limit := len(b) for i := 0; i < limit; { switch b[i] { case TCPOptionEOL: i = limit case TCPOptionNOP: i++ case TCPOptionTS: if i+10 > limit || (b[i+1] != 10) { return opts } opts.TS = true opts.TSVal = binary.BigEndian.Uint32(b[i+2:]) opts.TSEcr = binary.BigEndian.Uint32(b[i+6:]) i += 10 case TCPOptionSACK: if i+2 > limit { // Malformed SACK block, just return and stop parsing. return opts } sackOptionLen := int(b[i+1]) if i+sackOptionLen > limit || (sackOptionLen-2)%8 != 0 { // Malformed SACK block, just return and stop parsing. return opts } numBlocks := (sackOptionLen - 2) / 8 opts.SACKBlocks = []SACKBlock{} for j := 0; j < numBlocks; j++ { start := binary.BigEndian.Uint32(b[i+2+j*8:]) end := binary.BigEndian.Uint32(b[i+2+j*8+4:]) opts.SACKBlocks = append(opts.SACKBlocks, SACKBlock{ Start: seqnum.Value(start), End: seqnum.Value(end), }) } i += sackOptionLen default: // We don't recognize this option, just skip over it. 
if i+2 > limit { return opts } l := int(b[i+1]) // If the length is incorrect or if l+i overflows the // total options length then return false. if l < 2 || i+l > limit { return opts } i += l } } return opts } // EncodeMSSOption encodes the MSS TCP option with the provided MSS values in // the supplied buffer. If the provided buffer is not large enough then it just // returns without encoding anything. It returns the number of bytes written to // the provided buffer. func EncodeMSSOption(mss uint32, b []byte) int { if len(b) < TCPOptionMSSLength { return 0 } b[0], b[1], b[2], b[3] = TCPOptionMSS, TCPOptionMSSLength, byte(mss>>8), byte(mss) return TCPOptionMSSLength } // EncodeWSOption encodes the WS TCP option with the WS value in the // provided buffer. If the provided buffer is not large enough then it just // returns without encoding anything. It returns the number of bytes written to // the provided buffer. func EncodeWSOption(ws int, b []byte) int { if len(b) < TCPOptionWSLength { return 0 } b[0], b[1], b[2] = TCPOptionWS, TCPOptionWSLength, uint8(ws) return int(b[1]) } // EncodeTSOption encodes the provided tsVal and tsEcr values as a TCP timestamp // option into the provided buffer. If the buffer is smaller than expected it // just returns without encoding anything. It returns the number of bytes // written to the provided buffer. func EncodeTSOption(tsVal, tsEcr uint32, b []byte) int { if len(b) < TCPOptionTSLength { return 0 } b[0], b[1] = TCPOptionTS, TCPOptionTSLength binary.BigEndian.PutUint32(b[2:], tsVal) binary.BigEndian.PutUint32(b[6:], tsEcr) return int(b[1]) } // EncodeSACKPermittedOption encodes a SACKPermitted option into the provided // buffer. If the buffer is smaller than required it just returns without // encoding anything. It returns the number of bytes written to the provided // buffer. func EncodeSACKPermittedOption(b []byte) int { if len(b) < TCPOptionSackPermittedLength { return 0 } b[0], b[1] = TCPOptionSACKPermitted, TCPOptionSackPermittedLength return int(b[1]) } // EncodeSACKBlocks encodes the provided SACK blocks as a TCP SACK option block // in the provided slice. It tries to fit in as many blocks as possible based on // number of bytes available in the provided buffer. It returns the number of // bytes written to the provided buffer. func EncodeSACKBlocks(sackBlocks []SACKBlock, b []byte) int { if len(sackBlocks) == 0 { return 0 } l := len(sackBlocks) if l > TCPMaxSACKBlocks { l = TCPMaxSACKBlocks } if ll := (len(b) - 2) / 8; ll < l { l = ll } if l == 0 { // There is not enough space in the provided buffer to add // any SACK blocks. return 0 } b[0] = TCPOptionSACK b[1] = byte(l*8 + 2) for i := 0; i < l; i++ { binary.BigEndian.PutUint32(b[i*8+2:], uint32(sackBlocks[i].Start)) binary.BigEndian.PutUint32(b[i*8+6:], uint32(sackBlocks[i].End)) } return int(b[1]) } // EncodeNOP adds an explicit NOP to the option list. func EncodeNOP(b []byte) int { if len(b) == 0 { return 0 } b[0] = TCPOptionNOP return 1 } // AddTCPOptionPadding adds the required number of TCPOptionNOP to quad align // the option buffer. It adds padding bytes after the offset specified and // returns the number of padding bytes added. The passed in options slice // must have space for the padding bytes. func AddTCPOptionPadding(options []byte, offset int) int { paddingToAdd := -offset & 3 // Now add any padding bytes that might be required to quad align the // options. 
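// For example, with offset = 10, -offset & 3 = 2, so two TCPOptionNOP bytes
// are appended and the options region grows to 12 bytes, a multiple of 4.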
for i := offset; i < offset+paddingToAdd; i++ { options[i] = TCPOptionNOP } return paddingToAdd } // Acceptable checks if a segment that starts at segSeq and has length segLen is // "acceptable" for arriving in a receive window that starts at rcvNxt and ends // before rcvAcc, according to the table on page 26 and 69 of RFC 793. func Acceptable(segSeq seqnum.Value, segLen seqnum.Size, rcvNxt, rcvAcc seqnum.Value) bool { if rcvNxt == rcvAcc { return segLen == 0 && segSeq == rcvNxt } if segLen == 0 { // rcvWnd is incremented by 1 because that is Linux's behavior despite the // RFC. return segSeq.InRange(rcvNxt, rcvAcc.Add(1)) } // Page 70 of RFC 793 allows packets that can be made "acceptable" by trimming // the payload, so we'll accept any payload that overlaps the receive window. // segSeq < rcvAcc is more correct according to RFC, however, Linux does it // differently, it uses segSeq <= rcvAcc, we'd want to keep the same behavior // as Linux. return rcvNxt.LessThan(segSeq.Add(segLen)) && segSeq.LessThanEq(rcvAcc) } // TCPValid returns true if the pkt has a valid TCP header. It checks whether: // - The data offset is too small. // - The data offset is too large. // - The checksum is invalid. // // TCPValid corresponds to net/netfilter/nf_conntrack_proto_tcp.c:tcp_error. func TCPValid(hdr TCP, payloadChecksum func() uint16, payloadSize uint16, srcAddr, dstAddr tcpip.Address, skipChecksumValidation bool) (csum uint16, csumValid, ok bool) { if offset := int(hdr.DataOffset()); offset < TCPMinimumSize || offset > len(hdr) { return } if skipChecksumValidation { csumValid = true } else { csum = hdr.Checksum() csumValid = hdr.IsChecksumValid(srcAddr, dstAddr, payloadChecksum(), payloadSize) } return csum, csumValid, true } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/udp.go000066400000000000000000000137761465435605700231300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "encoding/binary" "math" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checksum" ) const ( udpSrcPort = 0 udpDstPort = 2 udpLength = 4 udpChecksum = 6 ) const ( // UDPMaximumPacketSize is the largest possible UDP packet. UDPMaximumPacketSize = 0xffff ) // UDPFields contains the fields of a UDP packet. It is used to describe the // fields of a packet that needs to be encoded. type UDPFields struct { // SrcPort is the "source port" field of a UDP packet. SrcPort uint16 // DstPort is the "destination port" field of a UDP packet. DstPort uint16 // Length is the "length" field of a UDP packet. Length uint16 // Checksum is the "checksum" field of a UDP packet. Checksum uint16 } // UDP represents a UDP header stored in a byte array. type UDP []byte const ( // UDPMinimumSize is the minimum size of a valid UDP packet. UDPMinimumSize = 8 // UDPMaximumSize is the maximum size of a valid UDP packet. The length field // in the UDP header is 16 bits as per RFC 768. 
UDPMaximumSize = math.MaxUint16 // UDPProtocolNumber is UDP's transport protocol number. UDPProtocolNumber tcpip.TransportProtocolNumber = 17 ) // SourcePort returns the "source port" field of the UDP header. func (b UDP) SourcePort() uint16 { return binary.BigEndian.Uint16(b[udpSrcPort:]) } // DestinationPort returns the "destination port" field of the UDP header. func (b UDP) DestinationPort() uint16 { return binary.BigEndian.Uint16(b[udpDstPort:]) } // Length returns the "length" field of the UDP header. func (b UDP) Length() uint16 { return binary.BigEndian.Uint16(b[udpLength:]) } // Payload returns the data contained in the UDP datagram. func (b UDP) Payload() []byte { return b[UDPMinimumSize:] } // Checksum returns the "checksum" field of the UDP header. func (b UDP) Checksum() uint16 { return binary.BigEndian.Uint16(b[udpChecksum:]) } // SetSourcePort sets the "source port" field of the UDP header. func (b UDP) SetSourcePort(port uint16) { binary.BigEndian.PutUint16(b[udpSrcPort:], port) } // SetDestinationPort sets the "destination port" field of the UDP header. func (b UDP) SetDestinationPort(port uint16) { binary.BigEndian.PutUint16(b[udpDstPort:], port) } // SetChecksum sets the "checksum" field of the UDP header. func (b UDP) SetChecksum(xsum uint16) { checksum.Put(b[udpChecksum:], xsum) } // SetLength sets the "length" field of the UDP header. func (b UDP) SetLength(length uint16) { binary.BigEndian.PutUint16(b[udpLength:], length) } // CalculateChecksum calculates the checksum of the UDP packet, given the // checksum of the network-layer pseudo-header and the checksum of the payload. func (b UDP) CalculateChecksum(partialChecksum uint16) uint16 { // Calculate the rest of the checksum. return checksum.Checksum(b[:UDPMinimumSize], partialChecksum) } // IsChecksumValid returns true iff the UDP header's checksum is valid. func (b UDP) IsChecksumValid(src, dst tcpip.Address, payloadChecksum uint16) bool { xsum := PseudoHeaderChecksum(UDPProtocolNumber, dst, src, b.Length()) xsum = checksum.Combine(xsum, payloadChecksum) return b.CalculateChecksum(xsum) == 0xffff } // Encode encodes all the fields of the UDP header. func (b UDP) Encode(u *UDPFields) { b.SetSourcePort(u.SrcPort) b.SetDestinationPort(u.DstPort) b.SetLength(u.Length) b.SetChecksum(u.Checksum) } // SetSourcePortWithChecksumUpdate implements ChecksummableTransport. func (b UDP) SetSourcePortWithChecksumUpdate(new uint16) { old := b.SourcePort() b.SetSourcePort(new) b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) } // SetDestinationPortWithChecksumUpdate implements ChecksummableTransport. func (b UDP) SetDestinationPortWithChecksumUpdate(new uint16) { old := b.DestinationPort() b.SetDestinationPort(new) b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) } // UpdateChecksumPseudoHeaderAddress implements ChecksummableTransport. func (b UDP) UpdateChecksumPseudoHeaderAddress(old, new tcpip.Address, fullChecksum bool) { xsum := b.Checksum() if fullChecksum { xsum = ^xsum } xsum = checksumUpdate2ByteAlignedAddress(xsum, old, new) if fullChecksum { xsum = ^xsum } b.SetChecksum(xsum) } // UDPValid returns true if the pkt has a valid UDP header. It checks whether: // - The length field is too small. // - The length field is too large. // - The checksum is invalid. // // UDPValid corresponds to net/netfilter/nf_conntrack_proto_udp.c:udp_error. 
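// As an illustrative sketch, building the header for a datagram carrying
// payloadLen bytes of data (payloadLen is a placeholder) might look like:
//
//	b := make([]byte, UDPMinimumSize)
//	UDP(b).Encode(&UDPFields{
//		SrcPort: 53,
//		DstPort: 1234,
//		Length:  UDPMinimumSize + payloadLen,
//	})
//
// with the Checksum field filled in afterwards from the pseudo-header and
// payload checksums via CalculateChecksum and SetChecksum.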
func UDPValid(hdr UDP, payloadChecksum func() uint16, payloadSize uint16, netProto tcpip.NetworkProtocolNumber, srcAddr, dstAddr tcpip.Address, skipChecksumValidation bool) (lengthValid, csumValid bool) { if length := hdr.Length(); length > payloadSize+UDPMinimumSize || length < UDPMinimumSize { return false, false } if skipChecksumValidation { return true, true } // On IPv4, UDP checksum is optional, and a zero value means the transmitter // omitted the checksum generation, as per RFC 768: // // An all zero transmitted checksum value means that the transmitter // generated no checksum (for debugging or for higher level protocols that // don't care). // // On IPv6, UDP checksum is not optional, as per RFC 2460 Section 8.1: // // Unlike IPv4, when UDP packets are originated by an IPv6 node, the UDP // checksum is not optional. if netProto == IPv4ProtocolNumber && hdr.Checksum() == 0 { return true, true } return true, hdr.IsChecksumValid(srcAddr, dstAddr, payloadChecksum()) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/header/virtionet.go000066400000000000000000000053001465435605700243430ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import "encoding/binary" // These constants are declared in linux/virtio_net.h. const ( _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 _VIRTIO_NET_HDR_GSO_NONE = 0 _VIRTIO_NET_HDR_GSO_TCPV4 = 1 _VIRTIO_NET_HDR_GSO_TCPV6 = 4 ) const ( // VirtioNetHeaderSize is the size of VirtioNetHeader in bytes. VirtioNetHeaderSize = 10 ) // Offsets for fields in the virtio net header. const ( flags = 0 gsoType = 1 hdrLen = 2 gsoSize = 4 csumStart = 6 csumOffset = 8 ) // VirtioNetHeaderFields is the Go equivalent of the struct declared in // linux/virtio_net.h. type VirtioNetHeaderFields struct { Flags uint8 GSOType uint8 HdrLen uint16 GSOSize uint16 CSumStart uint16 CSumOffset uint16 } // VirtioNetHeader represents a virtio net header stored in a byte array. type VirtioNetHeader []byte // Flags returns the "flags" field of the virtio net header. func (v VirtioNetHeader) Flags() uint8 { return uint8(v[flags]) } // GSOType returns the "gsoType" field of the virtio net header. func (v VirtioNetHeader) GSOType() uint8 { return uint8(v[gsoType]) } // HdrLen returns the "hdrLen" field of the virtio net header. func (v VirtioNetHeader) HdrLen() uint16 { return binary.BigEndian.Uint16(v[hdrLen:]) } // GSOSize returns the "gsoSize" field of the virtio net header. func (v VirtioNetHeader) GSOSize() uint16 { return binary.BigEndian.Uint16(v[gsoSize:]) } // CSumStart returns the "csumStart" field of the virtio net header. func (v VirtioNetHeader) CSumStart() uint16 { return binary.BigEndian.Uint16(v[csumStart:]) } // CSumOffset returns the "csumOffset" field of the virtio net header. func (v VirtioNetHeader) CSumOffset() uint16 { return binary.BigEndian.Uint16(v[csumOffset:]) } // Encode encodes all the fields of the virtio net header. 
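// As an illustrative sketch, a header requesting checksum offload for a TCP
// segment that starts ethHdrLen+ipHdrLen bytes into the frame (both are
// placeholders for the real header lengths) might be encoded as:
//
//	var b [VirtioNetHeaderSize]byte
//	VirtioNetHeader(b[:]).Encode(&VirtioNetHeaderFields{
//		Flags:      _VIRTIO_NET_HDR_F_NEEDS_CSUM,
//		GSOType:    _VIRTIO_NET_HDR_GSO_NONE,
//		CSumStart:  ethHdrLen + ipHdrLen,
//		CSumOffset: TCPChecksumOffset,
//	})
//
// CSumOffset is relative to CSumStart, so TCPChecksumOffset points the device
// at the TCP header's checksum field.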
func (v VirtioNetHeader) Encode(f *VirtioNetHeaderFields) { v[flags] = uint8(f.Flags) v[gsoType] = uint8(f.GSOType) binary.BigEndian.PutUint16(v[hdrLen:], f.HdrLen) binary.BigEndian.PutUint16(v[gsoSize:], f.GSOSize) binary.BigEndian.PutUint16(v[csumStart:], f.CSumStart) binary.BigEndian.PutUint16(v[csumOffset:], f.CSumOffset) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/internal/000077500000000000000000000000001465435605700223575ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/internal/tcp/000077500000000000000000000000001465435605700231455ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/internal/tcp/tcp.go000066400000000000000000000030441465435605700242630ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tcp contains internal type definitions that are not expected to be // used by anyone else outside pkg/tcpip. package tcp import ( "time" "gvisor.dev/gvisor/pkg/tcpip" ) // TSOffset is an offset applied to the value of the TSVal field in the TCP // Timestamp option. // // +stateify savable type TSOffset struct { milliseconds uint32 } // NewTSOffset creates a new TSOffset from milliseconds. func NewTSOffset(milliseconds uint32) TSOffset { return TSOffset{ milliseconds: milliseconds, } } // TSVal applies the offset to now and returns the timestamp in milliseconds. func (offset TSOffset) TSVal(now tcpip.MonotonicTime) uint32 { return uint32(now.Sub(tcpip.MonotonicTime{}).Milliseconds()) + offset.milliseconds } // Elapsed calculates the elapsed time given now and the echoed back timestamp. func (offset TSOffset) Elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration { return time.Duration(offset.TSVal(now)-tsEcr) * time.Millisecond } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/internal/tcp/tcp_state_autogen.go000066400000000000000000000013661465435605700272120ustar00rootroot00000000000000// automatically generated by stateify. 
package tcp import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (offset *TSOffset) StateTypeName() string { return "pkg/tcpip/internal/tcp.TSOffset" } func (offset *TSOffset) StateFields() []string { return []string{ "milliseconds", } } func (offset *TSOffset) beforeSave() {} // +checklocksignore func (offset *TSOffset) StateSave(stateSinkObject state.Sink) { offset.beforeSave() stateSinkObject.Save(0, &offset.milliseconds) } func (offset *TSOffset) afterLoad(context.Context) {} // +checklocksignore func (offset *TSOffset) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &offset.milliseconds) } func init() { state.Register((*TSOffset)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/000077500000000000000000000000001465435605700215005ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/channel/000077500000000000000000000000001465435605700231105ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/channel/channel.go000066400000000000000000000173651465435605700250630ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package channel provides the implementation of channel-based data-link layer // endpoints. Such endpoints allow injection of inbound packets and store // outbound packets in a channel. package channel import ( "context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // Notification is the interface for receiving notification from the packet // queue. type Notification interface { // WriteNotify will be called when a write happens to the queue. WriteNotify() } // NotificationHandle is an opaque handle to the registered notification target. // It can be used to unregister the notification when no longer interested. // // +stateify savable type NotificationHandle struct { n Notification } type queue struct { // c is the outbound packet channel. c chan *stack.PacketBuffer mu sync.RWMutex // +checklocks:mu notify []*NotificationHandle // +checklocks:mu closed bool } func (q *queue) Close() { q.mu.Lock() defer q.mu.Unlock() if !q.closed { close(q.c) } q.closed = true } func (q *queue) Read() *stack.PacketBuffer { select { case p := <-q.c: return p default: return nil } } func (q *queue) ReadContext(ctx context.Context) *stack.PacketBuffer { select { case pkt := <-q.c: return pkt case <-ctx.Done(): return nil } } func (q *queue) Write(pkt *stack.PacketBuffer) tcpip.Error { // q holds the PacketBuffer. q.mu.RLock() if q.closed { q.mu.RUnlock() return &tcpip.ErrClosedForSend{} } wrote := false select { case q.c <- pkt.IncRef(): wrote = true default: pkt.DecRef() } notify := q.notify q.mu.RUnlock() if wrote { // Send notification outside of lock. 
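	// Invoking WriteNotify after dropping q.mu means a notification target
	// can safely call back into the queue (for example AddNotify or
	// RemoveNotify, which take q.mu for writing) without risk of deadlock;
	// the handle slice was copied above while the read lock was held.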
for _, h := range notify { h.n.WriteNotify() } return nil } return &tcpip.ErrNoBufferSpace{} } func (q *queue) Num() int { return len(q.c) } func (q *queue) AddNotify(notify Notification) *NotificationHandle { q.mu.Lock() defer q.mu.Unlock() h := &NotificationHandle{n: notify} q.notify = append(q.notify, h) return h } func (q *queue) RemoveNotify(handle *NotificationHandle) { q.mu.Lock() defer q.mu.Unlock() // Make a copy, since we reads the array outside of lock when notifying. notify := make([]*NotificationHandle, 0, len(q.notify)) for _, h := range q.notify { if h != handle { notify = append(notify, h) } } q.notify = notify } var _ stack.LinkEndpoint = (*Endpoint)(nil) var _ stack.GSOEndpoint = (*Endpoint)(nil) // Endpoint is link layer endpoint that stores outbound packets in a channel // and allows injection of inbound packets. // // +stateify savable type Endpoint struct { LinkEPCapabilities stack.LinkEndpointCapabilities SupportedGSOKind stack.SupportedGSO mu sync.RWMutex `state:"nosave"` // +checklocks:mu dispatcher stack.NetworkDispatcher // +checklocks:mu linkAddr tcpip.LinkAddress // +checklocks:mu mtu uint32 // Outbound packet queue. q *queue } // New creates a new channel endpoint. func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint { return &Endpoint{ q: &queue{ c: make(chan *stack.PacketBuffer, size), }, mtu: mtu, linkAddr: linkAddr, } } // Close closes e. Further packet injections will return an error, and all pending // packets are discarded. Close may be called concurrently with WritePackets. func (e *Endpoint) Close() { e.q.Close() e.Drain() } // Read does non-blocking read one packet from the outbound packet queue. func (e *Endpoint) Read() *stack.PacketBuffer { return e.q.Read() } // ReadContext does blocking read for one packet from the outbound packet queue. // It can be cancelled by ctx, and in this case, it returns nil. func (e *Endpoint) ReadContext(ctx context.Context) *stack.PacketBuffer { return e.q.ReadContext(ctx) } // Drain removes all outbound packets from the channel and counts them. func (e *Endpoint) Drain() int { c := 0 for pkt := e.Read(); pkt != nil; pkt = e.Read() { pkt.DecRef() c++ } return c } // NumQueued returns the number of packet queued for outbound. func (e *Endpoint) NumQueued() int { return e.q.Num() } // InjectInbound injects an inbound packet. If the endpoint is not attached, the // packet is not delivered. func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.mu.RLock() d := e.dispatcher e.mu.RUnlock() if d != nil { d.DeliverNetworkPacket(protocol, pkt) } } // Attach saves the stack network-layer dispatcher for use later when packets // are injected. func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.mu.Lock() defer e.mu.Unlock() e.dispatcher = dispatcher } // IsAttached implements stack.LinkEndpoint.IsAttached. func (e *Endpoint) IsAttached() bool { e.mu.RLock() defer e.mu.RUnlock() return e.dispatcher != nil } // MTU implements stack.LinkEndpoint.MTU. func (e *Endpoint) MTU() uint32 { e.mu.RLock() defer e.mu.RUnlock() return e.mtu } // SetMTU implements stack.LinkEndpoint.SetMTU. func (e *Endpoint) SetMTU(mtu uint32) { e.mu.Lock() defer e.mu.Unlock() e.mtu = mtu } // Capabilities implements stack.LinkEndpoint.Capabilities. func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { return e.LinkEPCapabilities } // GSOMaxSize implements stack.GSOEndpoint. 
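// For this endpoint the reported limit is a fixed 32 KiB (1 << 15).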
func (*Endpoint) GSOMaxSize() uint32 { return 1 << 15 } // SupportedGSO implements stack.GSOEndpoint. func (e *Endpoint) SupportedGSO() stack.SupportedGSO { return e.SupportedGSOKind } // MaxHeaderLength returns the maximum size of the link layer header. Given it // doesn't have a header, it just returns 0. func (*Endpoint) MaxHeaderLength() uint16 { return 0 } // LinkAddress returns the link address of this endpoint. func (e *Endpoint) LinkAddress() tcpip.LinkAddress { e.mu.RLock() defer e.mu.RUnlock() return e.linkAddr } // SetLinkAddress implements stack.LinkEndpoint.SetLinkAddress. func (e *Endpoint) SetLinkAddress(addr tcpip.LinkAddress) { e.mu.Lock() defer e.mu.Unlock() e.linkAddr = addr } // WritePackets stores outbound packets into the channel. // Multiple concurrent calls are permitted. func (e *Endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { n := 0 for _, pkt := range pkts.AsSlice() { if err := e.q.Write(pkt); err != nil { if _, ok := err.(*tcpip.ErrNoBufferSpace); !ok && n == 0 { return 0, err } break } n++ } return n, nil } // Wait implements stack.LinkEndpoint.Wait. func (*Endpoint) Wait() {} // AddNotify adds a notification target for receiving event about outgoing // packets. func (e *Endpoint) AddNotify(notify Notification) *NotificationHandle { return e.q.AddNotify(notify) } // RemoveNotify removes handle from the list of notification targets. func (e *Endpoint) RemoveNotify(handle *NotificationHandle) { e.q.RemoveNotify(handle) } // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (*Endpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareNone } // AddHeader implements stack.LinkEndpoint.AddHeader. func (*Endpoint) AddHeader(*stack.PacketBuffer) {} // ParseHeader implements stack.LinkEndpoint.ParseHeader. func (*Endpoint) ParseHeader(*stack.PacketBuffer) bool { return true } // SetOnCloseAction implements stack.LinkEndpoint. func (*Endpoint) SetOnCloseAction(func()) {} golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/channel/channel_state_autogen.go000066400000000000000000000034401465435605700277720ustar00rootroot00000000000000// automatically generated by stateify. 
package channel import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (n *NotificationHandle) StateTypeName() string { return "pkg/tcpip/link/channel.NotificationHandle" } func (n *NotificationHandle) StateFields() []string { return []string{ "n", } } func (n *NotificationHandle) beforeSave() {} // +checklocksignore func (n *NotificationHandle) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.n) } func (n *NotificationHandle) afterLoad(context.Context) {} // +checklocksignore func (n *NotificationHandle) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.n) } func (e *Endpoint) StateTypeName() string { return "pkg/tcpip/link/channel.Endpoint" } func (e *Endpoint) StateFields() []string { return []string{ "LinkEPCapabilities", "SupportedGSOKind", "dispatcher", "linkAddr", "mtu", "q", } } func (e *Endpoint) beforeSave() {} // +checklocksignore func (e *Endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.LinkEPCapabilities) stateSinkObject.Save(1, &e.SupportedGSOKind) stateSinkObject.Save(2, &e.dispatcher) stateSinkObject.Save(3, &e.linkAddr) stateSinkObject.Save(4, &e.mtu) stateSinkObject.Save(5, &e.q) } func (e *Endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *Endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.LinkEPCapabilities) stateSourceObject.Load(1, &e.SupportedGSOKind) stateSourceObject.Load(2, &e.dispatcher) stateSourceObject.Load(3, &e.linkAddr) stateSourceObject.Load(4, &e.mtu) stateSourceObject.Load(5, &e.q) } func init() { state.Register((*NotificationHandle)(nil)) state.Register((*Endpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/ethernet/000077500000000000000000000000001465435605700233165ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/ethernet/ethernet.go000066400000000000000000000073551465435605700254750ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package ethernet provides an implementation of an ethernet link endpoint that // wraps an inner link endpoint. package ethernet import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/nested" "gvisor.dev/gvisor/pkg/tcpip/stack" ) var _ stack.NetworkDispatcher = (*Endpoint)(nil) var _ stack.LinkEndpoint = (*Endpoint)(nil) // New returns an ethernet link endpoint that wraps an inner link endpoint. func New(ep stack.LinkEndpoint) *Endpoint { var e Endpoint e.Endpoint.Init(ep, &e) return &e } // Endpoint is an ethernet endpoint. // // It adds an ethernet header to packets before sending them out through its // inner link endpoint and consumes an ethernet header before sending the // packet to the stack. // // +stateify savable type Endpoint struct { nested.Endpoint } // LinkAddress implements stack.LinkEndpoint. 
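// If the inner endpoint reports an empty link address, the unspecified
// (all-zeroes) ethernet address is returned instead.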
func (e *Endpoint) LinkAddress() tcpip.LinkAddress { if l := e.Endpoint.LinkAddress(); len(l) != 0 { return l } return header.UnspecifiedEthernetAddress } // MTU implements stack.LinkEndpoint. func (e *Endpoint) MTU() uint32 { if mtu := e.Endpoint.MTU(); mtu > header.EthernetMinimumSize { return mtu - header.EthernetMinimumSize } return 0 } // DeliverNetworkPacket implements stack.NetworkDispatcher. func (e *Endpoint) DeliverNetworkPacket(_ tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { if !e.ParseHeader(pkt) { return } eth := header.Ethernet(pkt.LinkHeader().Slice()) dst := eth.DestinationAddress() if dst == header.EthernetBroadcastAddress { pkt.PktType = tcpip.PacketBroadcast } else if header.IsMulticastEthernetAddress(dst) { pkt.PktType = tcpip.PacketMulticast } else if dst == e.LinkAddress() { pkt.PktType = tcpip.PacketHost } else { pkt.PktType = tcpip.PacketOtherHost } // Note, there is no need to check the destination link address here since // the ethernet hardware filters frames based on their destination addresses. e.Endpoint.DeliverNetworkPacket(eth.Type() /* protocol */, pkt) } // Capabilities implements stack.LinkEndpoint. func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { c := e.Endpoint.Capabilities() if c&stack.CapabilityLoopback == 0 { c |= stack.CapabilityResolutionRequired } return c } // MaxHeaderLength implements stack.LinkEndpoint. func (e *Endpoint) MaxHeaderLength() uint16 { return header.EthernetMinimumSize + e.Endpoint.MaxHeaderLength() } // ARPHardwareType implements stack.LinkEndpoint. func (e *Endpoint) ARPHardwareType() header.ARPHardwareType { if a := e.Endpoint.ARPHardwareType(); a != header.ARPHardwareNone { return a } return header.ARPHardwareEther } // AddHeader implements stack.LinkEndpoint. func (*Endpoint) AddHeader(pkt *stack.PacketBuffer) { eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) fields := header.EthernetFields{ SrcAddr: pkt.EgressRoute.LocalLinkAddress, DstAddr: pkt.EgressRoute.RemoteLinkAddress, Type: pkt.NetworkProtocolNumber, } eth.Encode(&fields) } // ParseHeader implements stack.LinkEndpoint. func (*Endpoint) ParseHeader(pkt *stack.PacketBuffer) bool { _, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) return ok } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/ethernet/ethernet_state_autogen.go000066400000000000000000000013031465435605700304020ustar00rootroot00000000000000// automatically generated by stateify. package ethernet import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *Endpoint) StateTypeName() string { return "pkg/tcpip/link/ethernet.Endpoint" } func (e *Endpoint) StateFields() []string { return []string{ "Endpoint", } } func (e *Endpoint) beforeSave() {} // +checklocksignore func (e *Endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.Endpoint) } func (e *Endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *Endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.Endpoint) } func init() { state.Register((*Endpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/fdbased/000077500000000000000000000000001465435605700230705ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/fdbased/endpoint.go000066400000000000000000000653141465435605700252500ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux // Package fdbased provides the implementation of data-link layer endpoints // backed by boundary-preserving file descriptors (e.g., TUN devices, // seqpacket/datagram sockets). // // FD based endpoints can be used in the networking stack by calling New() to // create a new endpoint, and then passing it as an argument to // Stack.CreateNIC(). // // FD based endpoints can use more than one file descriptor to read incoming // packets. If there are more than one FDs specified and the underlying FD is an // AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the // host kernel will consistently hash the packets to the sockets. This ensures // that packets for the same TCP streams are not reordered. // // Similarly if more than one FD's are specified where the underlying FD is not // AF_PACKET then it's the caller's responsibility to ensure that all inbound // packets on the descriptors are consistently 5 tuple hashed to one of the // descriptors to prevent TCP reordering. // // Since netstack today does not compute 5 tuple hashes for outgoing packets we // only use the first FD to write outbound packets. Once 5 tuple hashes for // all outbound packets are available we will make use of all underlying FD's to // write outbound packets. package fdbased import ( "fmt" "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/rawfile" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // linkDispatcher reads packets from the link FD and dispatches them to the // NetworkDispatcher. type linkDispatcher interface { Stop() dispatch() (bool, tcpip.Error) release() } // PacketDispatchMode are the various supported methods of receiving and // dispatching packets from the underlying FD. type PacketDispatchMode int // BatchSize is the number of packets to write in each syscall. It is 47 // because when GVisorGSO is in use then a single 65KB TCP segment can get // split into 46 segments of 1420 bytes and a single 216 byte segment. const BatchSize = 47 const ( // Readv is the default dispatch mode and is the least performant of the // dispatch options but the one that is supported by all underlying FD // types. Readv PacketDispatchMode = iota // RecvMMsg enables use of recvmmsg() syscall instead of readv() to // read inbound packets. This reduces # of syscalls needed to process // packets. // // NOTE: recvmmsg() is only supported for sockets, so if the underlying // FD is not a socket then the code will still fall back to the readv() // path. RecvMMsg // PacketMMap enables use of PACKET_RX_RING to receive packets from the // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The // primary use-case for this is runsc which uses an AF_PACKET FD to // receive packets from the veth device. 
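	// The shared ring buffer itself is set up via the PACKET_RX_RING socket
	// option in mmap_unsafe.go and consumed in mmap.go.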
PacketMMap ) func (p PacketDispatchMode) String() string { switch p { case Readv: return "Readv" case RecvMMsg: return "RecvMMsg" case PacketMMap: return "PacketMMap" default: return fmt.Sprintf("unknown packet dispatch mode '%d'", p) } } var _ stack.LinkEndpoint = (*endpoint)(nil) var _ stack.GSOEndpoint = (*endpoint)(nil) // +stateify savable type fdInfo struct { fd int isSocket bool } // +stateify savable type endpoint struct { // fds is the set of file descriptors each identifying one inbound/outbound // channel. The endpoint will dispatch from all inbound channels as well as // hash outbound packets to specific channels based on the packet hash. fds []fdInfo // hdrSize specifies the link-layer header size. If set to 0, no header // is added/removed; otherwise an ethernet header is used. hdrSize int // caps holds the endpoint capabilities. caps stack.LinkEndpointCapabilities // closed is a function to be called when the FD's peer (if any) closes // its end of the communication pipe. closed func(tcpip.Error) inboundDispatchers []linkDispatcher mu sync.RWMutex `state:"nosave"` // +checklocks:mu dispatcher stack.NetworkDispatcher // packetDispatchMode controls the packet dispatcher used by this // endpoint. packetDispatchMode PacketDispatchMode // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is // disabled. gsoMaxSize uint32 // wg keeps track of running goroutines. wg sync.WaitGroup // gsoKind is the supported kind of GSO. gsoKind stack.SupportedGSO // maxSyscallHeaderBytes has the same meaning as // Options.MaxSyscallHeaderBytes. maxSyscallHeaderBytes uintptr // writevMaxIovs is the maximum number of iovecs that may be passed to // rawfile.NonBlockingWriteIovec, as possibly limited by // maxSyscallHeaderBytes. (No analogous limit is defined for // rawfile.NonBlockingSendMMsg, since in that case the maximum number of // iovecs also depends on the number of mmsghdrs. Instead, if sendBatch // encounters a packet whose iovec count is limited by // maxSyscallHeaderBytes, it falls back to writing the packet using writev // via WritePacket.) writevMaxIovs int // addr is the address of the endpoint. // // +checklocks:mu addr tcpip.LinkAddress // mtu (maximum transmission unit) is the maximum size of a packet. // +checklocks:mu mtu uint32 } // Options specify the details about the fd-based endpoint to be created. // // +stateify savable type Options struct { // FDs is a set of FDs used to read/write packets. FDs []int // MTU is the mtu to use for this endpoint. MTU uint32 // EthernetHeader if true, indicates that the endpoint should read/write // ethernet frames instead of IP packets. EthernetHeader bool // ClosedFunc is a function to be called when an endpoint's peer (if // any) closes its end of the communication pipe. ClosedFunc func(tcpip.Error) // Address is the link address for this endpoint. Only used if // EthernetHeader is true. Address tcpip.LinkAddress // SaveRestore if true, indicates that this NIC capability set should // include CapabilitySaveRestore SaveRestore bool // DisconnectOk if true, indicates that this NIC capability set should // include CapabilityDisconnectOk. DisconnectOk bool // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is // disabled. GSOMaxSize uint32 // GVisorGSOEnabled indicates whether Gvisor GSO is enabled or not. GVisorGSOEnabled bool // PacketDispatchMode specifies the type of inbound dispatcher to be // used for this endpoint. 
PacketDispatchMode PacketDispatchMode // TXChecksumOffload if true, indicates that this endpoints capability // set should include CapabilityTXChecksumOffload. TXChecksumOffload bool // RXChecksumOffload if true, indicates that this endpoints capability // set should include CapabilityRXChecksumOffload. RXChecksumOffload bool // If MaxSyscallHeaderBytes is non-zero, it is the maximum number of bytes // of struct iovec, msghdr, and mmsghdr that may be passed by each host // system call. MaxSyscallHeaderBytes int // InterfaceIndex is the interface index of the underlying device. InterfaceIndex int // GRO enables generic receive offload. GRO bool // ProcessorsPerChannel is the number of goroutines used to handle packets // from each FD. ProcessorsPerChannel int } // fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT // support in the host kernel. This allows us to use multiple FD's to receive // from the same underlying NIC. The fanoutID needs to be the same for a given // set of FD's that point to the same NIC. Trying to set the PACKET_FANOUT // option for an FD with a fanoutID already in use by another FD for a different // NIC will return an EINVAL. // // Since fanoutID must be unique within the network namespace, we start with // the PID to avoid collisions. The only way to be sure of avoiding collisions // is to run in a new network namespace. var fanoutID atomicbitops.Int32 = atomicbitops.FromInt32(int32(unix.Getpid())) // New creates a new fd-based endpoint. // // Makes fd non-blocking, but does not take ownership of fd, which must remain // open for the lifetime of the returned endpoint (until after the endpoint has // stopped being using and Wait returns). func New(opts *Options) (stack.LinkEndpoint, error) { caps := stack.LinkEndpointCapabilities(0) if opts.RXChecksumOffload { caps |= stack.CapabilityRXChecksumOffload } if opts.TXChecksumOffload { caps |= stack.CapabilityTXChecksumOffload } hdrSize := 0 if opts.EthernetHeader { hdrSize = header.EthernetMinimumSize caps |= stack.CapabilityResolutionRequired } if opts.SaveRestore { caps |= stack.CapabilitySaveRestore } if opts.DisconnectOk { caps |= stack.CapabilityDisconnectOk } if len(opts.FDs) == 0 { return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified") } if opts.MaxSyscallHeaderBytes < 0 { return nil, fmt.Errorf("opts.MaxSyscallHeaderBytes is negative") } e := &endpoint{ mtu: opts.MTU, caps: caps, closed: opts.ClosedFunc, addr: opts.Address, hdrSize: hdrSize, packetDispatchMode: opts.PacketDispatchMode, maxSyscallHeaderBytes: uintptr(opts.MaxSyscallHeaderBytes), writevMaxIovs: rawfile.MaxIovs, } if e.maxSyscallHeaderBytes != 0 { if max := int(e.maxSyscallHeaderBytes / rawfile.SizeofIovec); max < e.writevMaxIovs { e.writevMaxIovs = max } } // Increment fanoutID to ensure that we don't re-use the same fanoutID // for the next endpoint. fid := fanoutID.Add(1) // Create per channel dispatchers. 
for _, fd := range opts.FDs { if err := unix.SetNonblock(fd, true); err != nil { return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", fd, err) } isSocket, err := isSocketFD(fd) if err != nil { return nil, err } e.fds = append(e.fds, fdInfo{fd: fd, isSocket: isSocket}) if isSocket { if opts.GSOMaxSize != 0 { if opts.GVisorGSOEnabled { e.gsoKind = stack.GVisorGSOSupported } else { e.gsoKind = stack.HostGSOSupported } e.gsoMaxSize = opts.GSOMaxSize } } if opts.ProcessorsPerChannel == 0 { opts.ProcessorsPerChannel = max(1, runtime.GOMAXPROCS(0)/len(opts.FDs)) } inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket, fid, opts) if err != nil { return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err) } e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher) } return e, nil } func createInboundDispatcher(e *endpoint, fd int, isSocket bool, fID int32, opts *Options) (linkDispatcher, error) { // By default use the readv() dispatcher as it works with all kinds of // FDs (tap/tun/unix domain sockets and af_packet). inboundDispatcher, err := newReadVDispatcher(fd, e, opts) if err != nil { return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err) } if isSocket { sa, err := unix.Getsockname(fd) if err != nil { return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err) } switch sa.(type) { case *unix.SockaddrLinklayer: // Enable PACKET_FANOUT mode if the underlying socket is of type // AF_PACKET. We do not enable PACKET_FANOUT_FLAG_DEFRAG as that will // prevent gvisor from receiving fragmented packets and the host does the // reassembly on our behalf before delivering the fragments. This makes it // hard to test fragmentation reassembly code in Netstack. // // See: include/uapi/linux/if_packet.h (struct fanout_args). // // NOTE: We are using SetSockOptInt here even though the underlying // option is actually a struct. The code follows the example in the // kernel documentation as described at the link below: // // See: https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt // // This works out because the actual implementation for the option zero // initializes the structure and will initialize the max_members field // to a proper value if zero. // // See: https://github.com/torvalds/linux/blob/7acac4b3196caee5e21fb5ea53f8bc124e6a16fc/net/packet/af_packet.c#L3881 const fanoutType = unix.PACKET_FANOUT_HASH fanoutArg := (int(fID) & 0xffff) | fanoutType<<16 if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil { return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err) } } switch e.packetDispatchMode { case PacketMMap: inboundDispatcher, err = newPacketMMapDispatcher(fd, e, opts) if err != nil { return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err) } case RecvMMsg: // If the provided FD is a socket then we optimize // packet reads by using recvmmsg() instead of read() to // read packets in a batch. inboundDispatcher, err = newRecvMMsgDispatcher(fd, e, opts) if err != nil { return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err) } case Readv: default: return nil, fmt.Errorf("unknown dispatch mode %d", e.packetDispatchMode) } } return inboundDispatcher, nil } func isSocketFD(fd int) (bool, error) { var stat unix.Stat_t if err := unix.Fstat(fd, &stat); err != nil { return false, fmt.Errorf("unix.Fstat(%v,...) 
failed: %v", fd, err) } return (stat.Mode & unix.S_IFSOCK) == unix.S_IFSOCK, nil } // Attach launches the goroutine that reads packets from the file descriptor and // dispatches them via the provided dispatcher. If one is already attached, // then nothing happens. // // Attach implements stack.LinkEndpoint.Attach. func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.mu.Lock() defer e.mu.Unlock() // nil means the NIC is being removed. if dispatcher == nil && e.dispatcher != nil { for _, dispatcher := range e.inboundDispatchers { dispatcher.Stop() } e.Wait() e.dispatcher = nil return } if dispatcher != nil && e.dispatcher == nil { e.dispatcher = dispatcher // Link endpoints are not savable. When transportation endpoints are // saved, they stop sending outgoing packets and all incoming packets // are rejected. for i := range e.inboundDispatchers { e.wg.Add(1) go func(i int) { // S/R-SAFE: See above. e.dispatchLoop(e.inboundDispatchers[i]) e.wg.Done() }(i) } } } // IsAttached implements stack.LinkEndpoint.IsAttached. func (e *endpoint) IsAttached() bool { e.mu.RLock() defer e.mu.RUnlock() return e.dispatcher != nil } // MTU implements stack.LinkEndpoint.MTU. func (e *endpoint) MTU() uint32 { e.mu.RLock() defer e.mu.RUnlock() return e.mtu } // SetMTU implements stack.LinkEndpoint.SetMTU. func (e *endpoint) SetMTU(mtu uint32) { e.mu.Lock() defer e.mu.Unlock() e.mtu = mtu } // Capabilities implements stack.LinkEndpoint.Capabilities. func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { return e.caps } // MaxHeaderLength returns the maximum size of the link-layer header. func (e *endpoint) MaxHeaderLength() uint16 { return uint16(e.hdrSize) } // LinkAddress returns the link address of this endpoint. func (e *endpoint) LinkAddress() tcpip.LinkAddress { e.mu.RLock() defer e.mu.RUnlock() return e.addr } // SetLinkAddress implements stack.LinkEndpoint.SetLinkAddress. func (e *endpoint) SetLinkAddress(addr tcpip.LinkAddress) { e.mu.Lock() defer e.mu.Unlock() e.addr = addr } // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop // reading from its FD. func (e *endpoint) Wait() { e.wg.Wait() } // virtioNetHdr is declared in linux/virtio_net.h. type virtioNetHdr struct { flags uint8 gsoType uint8 hdrLen uint16 gsoSize uint16 csumStart uint16 csumOffset uint16 } // marshal serializes h to a newly-allocated byte slice, in little-endian byte // order. // // Note: Virtio v1.0 onwards specifies little-endian as the byte ordering used // for general serialization. This makes it difficult to use go-marshal for // virtio types, as go-marshal implicitly uses the native byte ordering. func (h *virtioNetHdr) marshal() []byte { buf := [virtioNetHdrSize]byte{ 0: byte(h.flags), 1: byte(h.gsoType), // Manually lay out the fields in little-endian byte order. Little endian => // least significant bit goes to the lower address. 2: byte(h.hdrLen), 3: byte(h.hdrLen >> 8), 4: byte(h.gsoSize), 5: byte(h.gsoSize >> 8), 6: byte(h.csumStart), 7: byte(h.csumStart >> 8), 8: byte(h.csumOffset), 9: byte(h.csumOffset >> 8), } return buf[:] } // These constants are declared in linux/virtio_net.h. const ( _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 _VIRTIO_NET_HDR_GSO_TCPV4 = 1 _VIRTIO_NET_HDR_GSO_TCPV6 = 4 ) // AddHeader implements stack.LinkEndpoint.AddHeader. func (e *endpoint) AddHeader(pkt *stack.PacketBuffer) { if e.hdrSize > 0 { // Add ethernet header if needed. 
eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) eth.Encode(&header.EthernetFields{ SrcAddr: pkt.EgressRoute.LocalLinkAddress, DstAddr: pkt.EgressRoute.RemoteLinkAddress, Type: pkt.NetworkProtocolNumber, }) } } func (e *endpoint) parseHeader(pkt *stack.PacketBuffer) bool { _, ok := pkt.LinkHeader().Consume(e.hdrSize) return ok } // ParseHeader implements stack.LinkEndpoint.ParseHeader. func (e *endpoint) ParseHeader(pkt *stack.PacketBuffer) bool { if e.hdrSize > 0 { return e.parseHeader(pkt) } return true } // writePacket writes outbound packets to the file descriptor. If it is not // currently writable, the packet is dropped. func (e *endpoint) writePacket(pkt *stack.PacketBuffer) tcpip.Error { fdInfo := e.fds[pkt.Hash%uint32(len(e.fds))] fd := fdInfo.fd var vnetHdrBuf []byte if e.gsoKind == stack.HostGSOSupported { vnetHdr := virtioNetHdr{} if pkt.GSOOptions.Type != stack.GSONone { vnetHdr.hdrLen = uint16(pkt.HeaderSize()) if pkt.GSOOptions.NeedsCsum { vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset } if uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { switch pkt.GSOOptions.Type { case stack.GSOTCPv4: vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 case stack.GSOTCPv6: vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 default: panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) } vnetHdr.gsoSize = pkt.GSOOptions.MSS } } vnetHdrBuf = vnetHdr.marshal() } views := pkt.AsSlices() numIovecs := len(views) if len(vnetHdrBuf) != 0 { numIovecs++ } if numIovecs > e.writevMaxIovs { numIovecs = e.writevMaxIovs } // Allocate small iovec arrays on the stack. var iovecsArr [8]unix.Iovec iovecs := iovecsArr[:0] if numIovecs > len(iovecsArr) { iovecs = make([]unix.Iovec, 0, numIovecs) } iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) for _, v := range views { iovecs = rawfile.AppendIovecFromBytes(iovecs, v, numIovecs) } if errno := rawfile.NonBlockingWriteIovec(fd, iovecs); errno != 0 { return tcpip.TranslateErrno(errno) } return nil } func (e *endpoint) sendBatch(batchFDInfo fdInfo, pkts []*stack.PacketBuffer) (int, tcpip.Error) { // Degrade to writePacket if underlying fd is not a socket. if !batchFDInfo.isSocket { var written int var err tcpip.Error for written < len(pkts) { if err = e.writePacket(pkts[written]); err != nil { break } written++ } return written, err } // Send a batch of packets through batchFD. 
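	// Each pass of the loop below packs as many packets as possible into one
	// sendmmsg() call, stopping early if maxSyscallHeaderBytes would be
	// exceeded; a packet that cannot fit at all is written individually via
	// writePacket (plain writev) instead.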
batchFD := batchFDInfo.fd mmsgHdrsStorage := make([]rawfile.MMsgHdr, 0, len(pkts)) packets := 0 for packets < len(pkts) { mmsgHdrs := mmsgHdrsStorage batch := pkts[packets:] syscallHeaderBytes := uintptr(0) for _, pkt := range batch { var vnetHdrBuf []byte if e.gsoKind == stack.HostGSOSupported { vnetHdr := virtioNetHdr{} if pkt.GSOOptions.Type != stack.GSONone { vnetHdr.hdrLen = uint16(pkt.HeaderSize()) if pkt.GSOOptions.NeedsCsum { vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset } if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data().Size()) > pkt.GSOOptions.MSS { switch pkt.GSOOptions.Type { case stack.GSOTCPv4: vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 case stack.GSOTCPv6: vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 default: panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type)) } vnetHdr.gsoSize = pkt.GSOOptions.MSS } } vnetHdrBuf = vnetHdr.marshal() } views, offset := pkt.AsViewList() var skipped int var view *buffer.View for view = views.Front(); view != nil && offset >= view.Size(); view = view.Next() { offset -= view.Size() skipped++ } // We've made it to the usable views. numIovecs := views.Len() - skipped if len(vnetHdrBuf) != 0 { numIovecs++ } if numIovecs > rawfile.MaxIovs { numIovecs = rawfile.MaxIovs } if e.maxSyscallHeaderBytes != 0 { syscallHeaderBytes += rawfile.SizeofMMsgHdr + uintptr(numIovecs)*rawfile.SizeofIovec if syscallHeaderBytes > e.maxSyscallHeaderBytes { // We can't fit this packet into this call to sendmmsg(). // We could potentially do so if we reduced numIovecs // further, but this might incur considerable extra // copying. Leave it to the next batch instead. break } } // We can't easily allocate iovec arrays on the stack here since // they will escape this loop iteration via mmsgHdrs. iovecs := make([]unix.Iovec, 0, numIovecs) iovecs = rawfile.AppendIovecFromBytes(iovecs, vnetHdrBuf, numIovecs) // At most one slice has a non-zero offset. iovecs = rawfile.AppendIovecFromBytes(iovecs, view.AsSlice()[offset:], numIovecs) for view = view.Next(); view != nil; view = view.Next() { iovecs = rawfile.AppendIovecFromBytes(iovecs, view.AsSlice(), numIovecs) } var mmsgHdr rawfile.MMsgHdr mmsgHdr.Msg.Iov = &iovecs[0] mmsgHdr.Msg.SetIovlen(len(iovecs)) mmsgHdrs = append(mmsgHdrs, mmsgHdr) } if len(mmsgHdrs) == 0 { // We can't fit batch[0] into a mmsghdr while staying under // e.maxSyscallHeaderBytes. Use WritePacket, which will avoid the // mmsghdr (by using writev) and re-buffer iovecs more aggressively // if necessary (by using e.writevMaxIovs instead of // rawfile.MaxIovs). pkt := batch[0] if err := e.writePacket(pkt); err != nil { return packets, err } packets++ } else { for len(mmsgHdrs) > 0 { sent, errno := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs) if errno != 0 { return packets, tcpip.TranslateErrno(errno) } packets += sent mmsgHdrs = mmsgHdrs[sent:] } } } return packets, nil } // WritePackets writes outbound packets to the underlying file descriptors. If // one is not currently writable, the packet is dropped. // // Being a batch API, each packet in pkts should have the following // fields populated: // - pkt.EgressRoute // - pkt.GSOOptions // - pkt.NetworkProtocolNumber func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { // Preallocate to avoid repeated reallocation as we append to batch. 
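	// Consecutive packets that hash to the same FD are accumulated into
	// batch and flushed together via sendBatch whenever the target FD
	// changes, with a final flush after the loop.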
batch := make([]*stack.PacketBuffer, 0, BatchSize) batchFDInfo := fdInfo{fd: -1, isSocket: false} sentPackets := 0 for _, pkt := range pkts.AsSlice() { if len(batch) == 0 { batchFDInfo = e.fds[pkt.Hash%uint32(len(e.fds))] } pktFDInfo := e.fds[pkt.Hash%uint32(len(e.fds))] if sendNow := pktFDInfo != batchFDInfo; !sendNow { batch = append(batch, pkt) continue } n, err := e.sendBatch(batchFDInfo, batch) sentPackets += n if err != nil { return sentPackets, err } batch = batch[:0] batch = append(batch, pkt) batchFDInfo = pktFDInfo } if len(batch) != 0 { n, err := e.sendBatch(batchFDInfo, batch) sentPackets += n if err != nil { return sentPackets, err } } return sentPackets, nil } // InjectOutbound implements stack.InjectableEndpoint.InjectOutbound. func (e *endpoint) InjectOutbound(dest tcpip.Address, packet *buffer.View) tcpip.Error { if errno := rawfile.NonBlockingWrite(e.fds[0].fd, packet.AsSlice()); errno != 0 { return tcpip.TranslateErrno(errno) } return nil } // dispatchLoop reads packets from the file descriptor in a loop and dispatches // them to the network stack. func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) tcpip.Error { for { cont, err := inboundDispatcher.dispatch() if err != nil || !cont { if e.closed != nil { e.closed(err) } inboundDispatcher.release() return err } } } // GSOMaxSize implements stack.GSOEndpoint. func (e *endpoint) GSOMaxSize() uint32 { return e.gsoMaxSize } // SupportedGSO implements stack.GSOEndpoint. func (e *endpoint) SupportedGSO() stack.SupportedGSO { return e.gsoKind } // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (e *endpoint) ARPHardwareType() header.ARPHardwareType { if e.hdrSize > 0 { return header.ARPHardwareEther } return header.ARPHardwareNone } // Close implements stack.LinkEndpoint. func (e *endpoint) Close() {} // SetOnCloseAction implements stack.LinkEndpoint. func (*endpoint) SetOnCloseAction(func()) {} // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes // to the FD, but does not read from it. All reads come from injected packets. // // +satetify savable type InjectableEndpoint struct { endpoint mu sync.RWMutex `state:"nosave"` // +checklocks:mu dispatcher stack.NetworkDispatcher } // Attach saves the stack network-layer dispatcher for use later when packets // are injected. func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { e.mu.Lock() defer e.mu.Unlock() e.dispatcher = dispatcher } // InjectInbound injects an inbound packet. If the endpoint is not attached, the // packet is not delivered. func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.mu.RLock() d := e.dispatcher e.mu.RUnlock() if d != nil { d.DeliverNetworkPacket(protocol, pkt) } } // NewInjectable creates a new fd-based InjectableEndpoint. func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (*InjectableEndpoint, error) { unix.SetNonblock(fd, true) isSocket, err := isSocketFD(fd) if err != nil { return nil, err } return &InjectableEndpoint{endpoint: endpoint{ fds: []fdInfo{{fd: fd, isSocket: isSocket}}, mtu: mtu, caps: capabilities, writevMaxIovs: rawfile.MaxIovs, }}, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/fdbased/endpoint_unsafe.go000066400000000000000000000013321465435605700265770ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package fdbased import ( "unsafe" ) const virtioNetHdrSize = int(unsafe.Sizeof(virtioNetHdr{})) golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/fdbased/fdbased_state_autogen.go000066400000000000000000000115561465435605700277410ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux && ((linux && amd64) || (linux && arm64)) && (!linux || (!amd64 && !arm64)) && linux && linux // +build linux // +build linux,amd64 linux,arm64 // +build !linux !amd64,!arm64 // +build linux // +build linux package fdbased import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (f *fdInfo) StateTypeName() string { return "pkg/tcpip/link/fdbased.fdInfo" } func (f *fdInfo) StateFields() []string { return []string{ "fd", "isSocket", } } func (f *fdInfo) beforeSave() {} // +checklocksignore func (f *fdInfo) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.fd) stateSinkObject.Save(1, &f.isSocket) } func (f *fdInfo) afterLoad(context.Context) {} // +checklocksignore func (f *fdInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.fd) stateSourceObject.Load(1, &f.isSocket) } func (e *endpoint) StateTypeName() string { return "pkg/tcpip/link/fdbased.endpoint" } func (e *endpoint) StateFields() []string { return []string{ "fds", "hdrSize", "caps", "closed", "inboundDispatchers", "dispatcher", "packetDispatchMode", "gsoMaxSize", "wg", "gsoKind", "maxSyscallHeaderBytes", "writevMaxIovs", "addr", "mtu", } } func (e *endpoint) beforeSave() {} // +checklocksignore func (e *endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.fds) stateSinkObject.Save(1, &e.hdrSize) stateSinkObject.Save(2, &e.caps) stateSinkObject.Save(3, &e.closed) stateSinkObject.Save(4, &e.inboundDispatchers) stateSinkObject.Save(5, &e.dispatcher) stateSinkObject.Save(6, &e.packetDispatchMode) stateSinkObject.Save(7, &e.gsoMaxSize) stateSinkObject.Save(8, &e.wg) stateSinkObject.Save(9, &e.gsoKind) stateSinkObject.Save(10, &e.maxSyscallHeaderBytes) stateSinkObject.Save(11, &e.writevMaxIovs) stateSinkObject.Save(12, &e.addr) stateSinkObject.Save(13, &e.mtu) } func (e *endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.fds) stateSourceObject.Load(1, &e.hdrSize) stateSourceObject.Load(2, &e.caps) stateSourceObject.Load(3, &e.closed) stateSourceObject.Load(4, &e.inboundDispatchers) stateSourceObject.Load(5, &e.dispatcher) stateSourceObject.Load(6, &e.packetDispatchMode) stateSourceObject.Load(7, &e.gsoMaxSize) stateSourceObject.Load(8, &e.wg) stateSourceObject.Load(9, &e.gsoKind) stateSourceObject.Load(10, &e.maxSyscallHeaderBytes) stateSourceObject.Load(11, &e.writevMaxIovs) stateSourceObject.Load(12, &e.addr) stateSourceObject.Load(13, &e.mtu) } func (o *Options) StateTypeName() string { return "pkg/tcpip/link/fdbased.Options" } func (o *Options) StateFields() []string { return []string{ "FDs", "MTU", "EthernetHeader", "ClosedFunc", "Address", 
"SaveRestore", "DisconnectOk", "GSOMaxSize", "GVisorGSOEnabled", "PacketDispatchMode", "TXChecksumOffload", "RXChecksumOffload", "MaxSyscallHeaderBytes", "InterfaceIndex", "GRO", "ProcessorsPerChannel", } } func (o *Options) beforeSave() {} // +checklocksignore func (o *Options) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.FDs) stateSinkObject.Save(1, &o.MTU) stateSinkObject.Save(2, &o.EthernetHeader) stateSinkObject.Save(3, &o.ClosedFunc) stateSinkObject.Save(4, &o.Address) stateSinkObject.Save(5, &o.SaveRestore) stateSinkObject.Save(6, &o.DisconnectOk) stateSinkObject.Save(7, &o.GSOMaxSize) stateSinkObject.Save(8, &o.GVisorGSOEnabled) stateSinkObject.Save(9, &o.PacketDispatchMode) stateSinkObject.Save(10, &o.TXChecksumOffload) stateSinkObject.Save(11, &o.RXChecksumOffload) stateSinkObject.Save(12, &o.MaxSyscallHeaderBytes) stateSinkObject.Save(13, &o.InterfaceIndex) stateSinkObject.Save(14, &o.GRO) stateSinkObject.Save(15, &o.ProcessorsPerChannel) } func (o *Options) afterLoad(context.Context) {} // +checklocksignore func (o *Options) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.FDs) stateSourceObject.Load(1, &o.MTU) stateSourceObject.Load(2, &o.EthernetHeader) stateSourceObject.Load(3, &o.ClosedFunc) stateSourceObject.Load(4, &o.Address) stateSourceObject.Load(5, &o.SaveRestore) stateSourceObject.Load(6, &o.DisconnectOk) stateSourceObject.Load(7, &o.GSOMaxSize) stateSourceObject.Load(8, &o.GVisorGSOEnabled) stateSourceObject.Load(9, &o.PacketDispatchMode) stateSourceObject.Load(10, &o.TXChecksumOffload) stateSourceObject.Load(11, &o.RXChecksumOffload) stateSourceObject.Load(12, &o.MaxSyscallHeaderBytes) stateSourceObject.Load(13, &o.InterfaceIndex) stateSourceObject.Load(14, &o.GRO) stateSourceObject.Load(15, &o.ProcessorsPerChannel) } func init() { state.Register((*fdInfo)(nil)) state.Register((*endpoint)(nil)) state.Register((*Options)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/fdbased/fdbased_unsafe_state_autogen.go000066400000000000000000000002471465435605700312750ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux && ((linux && amd64) || (linux && arm64)) // +build linux // +build linux,amd64 linux,arm64 package fdbased golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/fdbased/mmap.go000066400000000000000000000131741465435605700243570ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
//go:build (linux && amd64) || (linux && arm64) // +build linux,amd64 linux,arm64 package fdbased import ( "encoding/binary" "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/rawfile" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/stopfd" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( tPacketAlignment = uintptr(16) tpStatusKernel = 0 tpStatusUser = 1 tpStatusCopy = 2 tpStatusLosing = 4 ) // We overallocate the frame size to accommodate space for the // TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding. // // Memory allocated for the ring buffer: tpBlockSize * tpBlockNR = 2 MiB // // NOTE: // // Frames need to be aligned at 16 byte boundaries. // BlockSize needs to be page aligned. // // For details see PACKET_MMAP setting constraints in // https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt const ( tpFrameSize = 65536 + 128 tpBlockSize = tpFrameSize * 32 tpBlockNR = 1 tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize ) // tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct // translation of the TPACKET_ALIGN macro in . func tPacketAlign(v uintptr) uintptr { return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1)) } // tPacketReq is the tpacket_req structure as described in // https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt type tPacketReq struct { tpBlockSize uint32 tpBlockNR uint32 tpFrameSize uint32 tpFrameNR uint32 } // tPacketHdr is tpacket_hdr structure as described in type tPacketHdr []byte const ( tpStatusOffset = 0 tpLenOffset = 8 tpSnapLenOffset = 12 tpMacOffset = 16 tpNetOffset = 18 tpSecOffset = 20 tpUSecOffset = 24 ) func (t tPacketHdr) tpLen() uint32 { return binary.LittleEndian.Uint32(t[tpLenOffset:]) } func (t tPacketHdr) tpSnapLen() uint32 { return binary.LittleEndian.Uint32(t[tpSnapLenOffset:]) } func (t tPacketHdr) tpMac() uint16 { return binary.LittleEndian.Uint16(t[tpMacOffset:]) } func (t tPacketHdr) tpNet() uint16 { return binary.LittleEndian.Uint16(t[tpNetOffset:]) } func (t tPacketHdr) tpSec() uint32 { return binary.LittleEndian.Uint32(t[tpSecOffset:]) } func (t tPacketHdr) tpUSec() uint32 { return binary.LittleEndian.Uint32(t[tpUSecOffset:]) } func (t tPacketHdr) Payload() []byte { return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()] } // packetMMapDispatcher uses PACKET_RX_RING's to read/dispatch inbound packets. // See: mmap_amd64_unsafe.go for implementation details. type packetMMapDispatcher struct { stopfd.StopFD // fd is the file descriptor used to send and receive packets. fd int // e is the endpoint this dispatcher is attached to. e *endpoint // ringBuffer is only used when PacketMMap dispatcher is used and points // to the start of the mmapped PACKET_RX_RING buffer. ringBuffer []byte // ringOffset is the current offset into the ring buffer where the next // inbound packet will be placed by the kernel. ringOffset int // mgr is the processor goroutine manager. 
mgr *processorManager } func (d *packetMMapDispatcher) release() { d.mgr.close() } func (d *packetMMapDispatcher) readMMappedPackets() (stack.PacketBufferList, bool, tcpip.Error) { var pkts stack.PacketBufferList hdr := tPacketHdr(d.ringBuffer[d.ringOffset*tpFrameSize:]) for hdr.tpStatus()&tpStatusUser == 0 { stopped, errno := rawfile.BlockingPollUntilStopped(d.EFD, d.fd, unix.POLLIN|unix.POLLERR) if errno != 0 { if errno == unix.EINTR { continue } return pkts, stopped, tcpip.TranslateErrno(errno) } if stopped { return pkts, true, nil } if hdr.tpStatus()&tpStatusCopy != 0 { // This frame is truncated so skip it after flipping the // buffer to the kernel. hdr.setTPStatus(tpStatusKernel) d.ringOffset = (d.ringOffset + 1) % tpFrameNR hdr = (tPacketHdr)(d.ringBuffer[d.ringOffset*tpFrameSize:]) continue } } for hdr.tpStatus()&tpStatusUser == 1 { // Copy out the packet from the mmapped frame to a locally owned buffer. pkts.PushBack(stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: buffer.MakeWithView(buffer.NewViewWithData(hdr.Payload())), })) // Release packet to kernel. hdr.setTPStatus(tpStatusKernel) d.ringOffset = (d.ringOffset + 1) % tpFrameNR hdr = tPacketHdr(d.ringBuffer[d.ringOffset*tpFrameSize:]) } return pkts, false, nil } // dispatch reads packets from an mmaped ring buffer and dispatches them to the // network stack. func (d *packetMMapDispatcher) dispatch() (bool, tcpip.Error) { pkts, stopped, err := d.readMMappedPackets() defer pkts.Reset() if err != nil || stopped { return false, err } for _, pkt := range pkts.AsSlice() { if d.e.hdrSize > 0 { hdr, ok := pkt.LinkHeader().Consume(d.e.hdrSize) if !ok { panic(fmt.Sprintf("LinkHeader().Consume(%d) must succeed", d.e.hdrSize)) } pkt.NetworkProtocolNumber = header.Ethernet(hdr).Type() } d.mgr.queuePacket(pkt, d.e.hdrSize > 0) } if pkts.Len() > 0 { d.mgr.wakeReady() } return true, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/fdbased/mmap_stub.go000066400000000000000000000015411465435605700254070ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !linux || (!amd64 && !arm64) // +build !linux !amd64,!arm64 package fdbased // Stubbed out version for non-linux/non-amd64/non-arm64 platforms. func newPacketMMapDispatcher(fd int, e *endpoint, opts *Options) (linkDispatcher, error) { return nil, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/fdbased/mmap_unsafe.go000066400000000000000000000060641465435605700257200ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. //go:build (linux && amd64) || (linux && arm64) // +build linux,amd64 linux,arm64 package fdbased import ( "fmt" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/tcpip/link/stopfd" ) // tPacketHdrlen is the TPACKET_HDRLEN variable defined in . var tPacketHdrlen = tPacketAlign(unsafe.Sizeof(tPacketHdr{}) + unsafe.Sizeof(unix.RawSockaddrLinklayer{})) // tpStatus returns the frame status field. // The status is concurrently updated by the kernel as a result we must // use atomic operations to prevent races. func (t tPacketHdr) tpStatus() uint32 { hdr := unsafe.Pointer(&t[0]) statusPtr := unsafe.Pointer(uintptr(hdr) + uintptr(tpStatusOffset)) return (*atomicbitops.Uint32)(statusPtr).Load() } // setTPStatus set's the frame status to the provided status. // The status is concurrently updated by the kernel as a result we must // use atomic operations to prevent races. func (t tPacketHdr) setTPStatus(status uint32) { hdr := unsafe.Pointer(&t[0]) statusPtr := unsafe.Pointer(uintptr(hdr) + uintptr(tpStatusOffset)) (*atomicbitops.Uint32)(statusPtr).Store(status) } func newPacketMMapDispatcher(fd int, e *endpoint, opts *Options) (linkDispatcher, error) { stopFD, err := stopfd.New() if err != nil { return nil, err } d := &packetMMapDispatcher{ StopFD: stopFD, fd: fd, e: e, } pageSize := unix.Getpagesize() if tpBlockSize%pageSize != 0 { return nil, fmt.Errorf("tpBlockSize: %d is not page aligned, pagesize: %d", tpBlockSize, pageSize) } tReq := tPacketReq{ tpBlockSize: uint32(tpBlockSize), tpBlockNR: uint32(tpBlockNR), tpFrameSize: uint32(tpFrameSize), tpFrameNR: uint32(tpFrameNR), } // Setup PACKET_RX_RING. if err := setsockopt(d.fd, unix.SOL_PACKET, unix.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil { return nil, fmt.Errorf("failed to enable PACKET_RX_RING: %v", err) } // Let's mmap the blocks. sz := tpBlockSize * tpBlockNR buf, err := unix.Mmap(d.fd, 0, sz, unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED) if err != nil { return nil, fmt.Errorf("unix.Mmap(...,0, %v, ...) failed = %v", sz, err) } d.mgr = newProcessorManager(opts, e) d.mgr.start() d.ringBuffer = buf return d, nil } func setsockopt(fd, level, name int, val unsafe.Pointer, vallen uintptr) error { if _, _, errno := unix.Syscall6(unix.SYS_SETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(val), vallen, 0); errno != 0 { return error(errno) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/fdbased/packet_dispatchers.go000066400000000000000000000207321465435605700272630ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
//go:build linux // +build linux package fdbased import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/rawfile" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/stopfd" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/stack/gro" ) // BufConfig defines the shape of the buffer used to read packets from the NIC. var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} type iovecBuffer struct { // buffer is the actual buffer that holds the packet contents. Some contents // are reused across calls to pullBuffer if number of requested bytes is // smaller than the number of bytes allocated in the buffer. views []*buffer.View // iovecs are initialized with base pointers/len of the corresponding // entries in the views defined above, except when GSO is enabled // (skipsVnetHdr) then the first iovec points to a buffer for the vnet header // which is stripped before the views are passed up the stack for further // processing. iovecs []unix.Iovec // sizes is an array of buffer sizes for the underlying views. sizes is // immutable. sizes []int // skipsVnetHdr is true if virtioNetHdr is to skipped. skipsVnetHdr bool // pulledIndex is the index of the last []byte buffer pulled from the // underlying buffer storage during a call to pullBuffers. It is -1 // if no buffer is pulled. pulledIndex int } func newIovecBuffer(sizes []int, skipsVnetHdr bool) *iovecBuffer { b := &iovecBuffer{ views: make([]*buffer.View, len(sizes)), sizes: sizes, skipsVnetHdr: skipsVnetHdr, } niov := len(b.views) if b.skipsVnetHdr { niov++ } b.iovecs = make([]unix.Iovec, niov) return b } func (b *iovecBuffer) nextIovecs() []unix.Iovec { vnetHdrOff := 0 if b.skipsVnetHdr { var vnetHdr [virtioNetHdrSize]byte // The kernel adds virtioNetHdr before each packet, but // we don't use it, so we allocate a buffer for it, // add it in iovecs but don't add it in a view. b.iovecs[0] = unix.Iovec{Base: &vnetHdr[0]} b.iovecs[0].SetLen(virtioNetHdrSize) vnetHdrOff++ } for i := range b.views { if b.views[i] != nil { break } v := buffer.NewViewSize(b.sizes[i]) b.views[i] = v b.iovecs[i+vnetHdrOff] = unix.Iovec{Base: v.BasePtr()} b.iovecs[i+vnetHdrOff].SetLen(v.Size()) } return b.iovecs } // pullBuffer extracts the enough underlying storage from b.buffer to hold n // bytes. It removes this storage from b.buffer, returns a new buffer // that holds the storage, and updates pulledIndex to indicate which part // of b.buffer's storage must be reallocated during the next call to // nextIovecs. func (b *iovecBuffer) pullBuffer(n int) buffer.Buffer { var views []*buffer.View c := 0 if b.skipsVnetHdr { c += virtioNetHdrSize if c >= n { // Nothing in the packet. return buffer.Buffer{} } } // Remove the used views from the buffer. for i, v := range b.views { c += v.Size() if c >= n { b.views[i].CapLength(v.Size() - (c - n)) views = append(views, b.views[:i+1]...) break } } for i := range views { b.views[i] = nil } if b.skipsVnetHdr { // Exclude the size of the vnet header. n -= virtioNetHdrSize } pulled := buffer.Buffer{} for _, v := range views { pulled.Append(v) } pulled.Truncate(int64(n)) return pulled } func (b *iovecBuffer) release() { for _, v := range b.views { if v != nil { v.Release() v = nil } } } // readVDispatcher uses readv() system call to read inbound packets and // dispatches them. type readVDispatcher struct { stopfd.StopFD // fd is the file descriptor used to send and receive packets. 
fd int // e is the endpoint this dispatcher is attached to. e *endpoint // buf is the iovec buffer that contains the packet contents. buf *iovecBuffer // mgr is the processor goroutine manager. mgr *processorManager } func newReadVDispatcher(fd int, e *endpoint, opts *Options) (linkDispatcher, error) { stopFD, err := stopfd.New() if err != nil { return nil, err } d := &readVDispatcher{ StopFD: stopFD, fd: fd, e: e, } skipsVnetHdr := d.e.gsoKind == stack.HostGSOSupported d.buf = newIovecBuffer(BufConfig, skipsVnetHdr) d.mgr = newProcessorManager(opts, e) d.mgr.start() return d, nil } func (d *readVDispatcher) release() { d.buf.release() d.mgr.close() } // dispatch reads one packet from the file descriptor and dispatches it. func (d *readVDispatcher) dispatch() (bool, tcpip.Error) { n, errno := rawfile.BlockingReadvUntilStopped(d.EFD, d.fd, d.buf.nextIovecs()) if n <= 0 || errno != 0 { return false, tcpip.TranslateErrno(errno) } pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: d.buf.pullBuffer(n), }) defer pkt.DecRef() if d.e.hdrSize > 0 { if !d.e.parseHeader(pkt) { return false, nil } pkt.NetworkProtocolNumber = header.Ethernet(pkt.LinkHeader().Slice()).Type() } d.mgr.queuePacket(pkt, d.e.hdrSize > 0) d.mgr.wakeReady() return true, nil } // recvMMsgDispatcher uses the recvmmsg system call to read inbound packets and // dispatches them. type recvMMsgDispatcher struct { stopfd.StopFD // fd is the file descriptor used to send and receive packets. fd int // e is the endpoint this dispatcher is attached to. e *endpoint // bufs is an array of iovec buffers that contain packet contents. bufs []*iovecBuffer // msgHdrs is an array of MMsgHdr objects where each MMsghdr is used to // reference an array of iovecs in the iovecs field defined above. This // array is passed as the parameter to recvmmsg call to retrieve // potentially more than 1 packet per unix. msgHdrs []rawfile.MMsgHdr // pkts is reused to avoid allocations. pkts stack.PacketBufferList // gro coalesces incoming packets to increase throughput. gro gro.GRO // mgr is the processor goroutine manager. mgr *processorManager } const ( // MaxMsgsPerRecv is the maximum number of packets we want to retrieve // in a single RecvMMsg call. MaxMsgsPerRecv = 8 ) func newRecvMMsgDispatcher(fd int, e *endpoint, opts *Options) (linkDispatcher, error) { stopFD, err := stopfd.New() if err != nil { return nil, err } d := &recvMMsgDispatcher{ StopFD: stopFD, fd: fd, e: e, bufs: make([]*iovecBuffer, MaxMsgsPerRecv), msgHdrs: make([]rawfile.MMsgHdr, MaxMsgsPerRecv), } skipsVnetHdr := d.e.gsoKind == stack.HostGSOSupported for i := range d.bufs { d.bufs[i] = newIovecBuffer(BufConfig, skipsVnetHdr) } d.gro.Init(opts.GRO) d.mgr = newProcessorManager(opts, e) d.mgr.start() return d, nil } func (d *recvMMsgDispatcher) release() { for _, iov := range d.bufs { iov.release() } d.mgr.close() } // recvMMsgDispatch reads more than one packet at a time from the file // descriptor and dispatches it. func (d *recvMMsgDispatcher) dispatch() (bool, tcpip.Error) { // Fill message headers. for k := range d.msgHdrs { if d.msgHdrs[k].Msg.Iovlen > 0 { break } iovecs := d.bufs[k].nextIovecs() iovLen := len(iovecs) d.msgHdrs[k].Len = 0 d.msgHdrs[k].Msg.Iov = &iovecs[0] d.msgHdrs[k].Msg.SetIovlen(iovLen) } nMsgs, errno := rawfile.BlockingRecvMMsgUntilStopped(d.EFD, d.fd, d.msgHdrs) if errno != 0 { return false, tcpip.TranslateErrno(errno) } if nMsgs == -1 { return false, nil } // Process each of received packets. 
d.e.mu.RLock() dsp := d.e.dispatcher d.e.mu.RUnlock() d.gro.Dispatcher = dsp defer d.pkts.Reset() for k := 0; k < nMsgs; k++ { n := int(d.msgHdrs[k].Len) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: d.bufs[k].pullBuffer(n), }) d.pkts.PushBack(pkt) // Mark that this iovec has been processed. d.msgHdrs[k].Msg.Iovlen = 0 if d.e.hdrSize > 0 { hdr, ok := pkt.LinkHeader().Consume(d.e.hdrSize) if !ok { return false, nil } pkt.NetworkProtocolNumber = header.Ethernet(hdr).Type() } pkt.RXChecksumValidated = d.e.caps&stack.CapabilityRXChecksumOffload != 0 d.mgr.queuePacket(pkt, d.e.hdrSize > 0) } d.mgr.wakeReady() return true, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/fdbased/processors.go000066400000000000000000000157171465435605700256340ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package fdbased import ( "encoding/binary" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/stack/gro" ) type processor struct { mu sync.Mutex // +checklocks:mu pkts stack.PacketBufferList e *endpoint gro gro.GRO sleeper sleep.Sleeper packetWaker sleep.Waker closeWaker sleep.Waker } func (p *processor) start(wg *sync.WaitGroup) { defer wg.Done() defer p.sleeper.Done() for { switch w := p.sleeper.Fetch(true); { case w == &p.packetWaker: p.deliverPackets() case w == &p.closeWaker: p.mu.Lock() p.pkts.Reset() p.mu.Unlock() return } } } func (p *processor) deliverPackets() { p.e.mu.RLock() p.gro.Dispatcher = p.e.dispatcher p.e.mu.RUnlock() if p.gro.Dispatcher == nil { p.mu.Lock() p.pkts.Reset() p.mu.Unlock() return } p.mu.Lock() for p.pkts.Len() > 0 { pkt := p.pkts.PopFront() p.mu.Unlock() p.gro.Enqueue(pkt) pkt.DecRef() p.mu.Lock() } p.mu.Unlock() p.gro.Flush() } // processorManager handles starting, closing, and queuing packets on processor // goroutines. type processorManager struct { processors []processor seed uint32 wg sync.WaitGroup e *endpoint ready []bool } // newProcessorManager creates a new processor manager. func newProcessorManager(opts *Options, e *endpoint) *processorManager { m := &processorManager{} m.seed = rand.Uint32() m.ready = make([]bool, opts.ProcessorsPerChannel) m.processors = make([]processor, opts.ProcessorsPerChannel) m.e = e m.wg.Add(opts.ProcessorsPerChannel) for i := range m.processors { p := &m.processors[i] p.sleeper.AddWaker(&p.packetWaker) p.sleeper.AddWaker(&p.closeWaker) p.gro.Init(opts.GRO) p.e = e } return m } // start starts the processor goroutines if the processor manager is configured // with more than one processor. func (m *processorManager) start() { for i := range m.processors { p := &m.processors[i] // Only start processor in a separate goroutine if we have multiple of them. 
if len(m.processors) > 1 { go p.start(&m.wg) } } } func (m *processorManager) connectionHash(cid *connectionID) uint32 { var payload [4]byte binary.LittleEndian.PutUint16(payload[0:], cid.srcPort) binary.LittleEndian.PutUint16(payload[2:], cid.dstPort) h := jenkins.Sum32(m.seed) h.Write(payload[:]) h.Write(cid.srcAddr) h.Write(cid.dstAddr) return h.Sum32() } // queuePacket queues a packet to be delivered to the appropriate processor. func (m *processorManager) queuePacket(pkt *stack.PacketBuffer, hasEthHeader bool) { var pIdx uint32 cid, nonConnectionPkt := tcpipConnectionID(pkt) if !hasEthHeader { if nonConnectionPkt { // If there's no eth header this should be a standard tcpip packet. If // it isn't the packet is invalid so drop it. return } pkt.NetworkProtocolNumber = cid.proto } if len(m.processors) == 1 || nonConnectionPkt { // If the packet is not associated with an active connection, use the // first processor. pIdx = 0 } else { pIdx = m.connectionHash(&cid) % uint32(len(m.processors)) } p := &m.processors[pIdx] p.mu.Lock() defer p.mu.Unlock() pkt.IncRef() p.pkts.PushBack(pkt) m.ready[pIdx] = true } type connectionID struct { srcAddr, dstAddr []byte srcPort, dstPort uint16 proto tcpip.NetworkProtocolNumber } // tcpipConnectionID returns a tcpip connection id tuple based on the data found // in the packet. It returns true if the packet is not associated with an active // connection (e.g ARP, NDP, etc). The method assumes link headers have already // been processed if they were present. func tcpipConnectionID(pkt *stack.PacketBuffer) (connectionID, bool) { var cid connectionID h, ok := pkt.Data().PullUp(1) if !ok { // Skip this packet. return cid, true } const tcpSrcDstPortLen = 4 switch header.IPVersion(h) { case header.IPv4Version: hdrLen := header.IPv4(h).HeaderLength() h, ok = pkt.Data().PullUp(int(hdrLen) + tcpSrcDstPortLen) if !ok { return cid, true } ipHdr := header.IPv4(h[:hdrLen]) tcpHdr := header.TCP(h[hdrLen:][:tcpSrcDstPortLen]) cid.srcAddr = ipHdr.SourceAddressSlice() cid.dstAddr = ipHdr.DestinationAddressSlice() // All fragment packets need to be processed by the same goroutine, so // only record the TCP ports if this is not a fragment packet. if ipHdr.IsValid(pkt.Data().Size()) && !ipHdr.More() && ipHdr.FragmentOffset() == 0 { cid.srcPort = tcpHdr.SourcePort() cid.dstPort = tcpHdr.DestinationPort() } cid.proto = header.IPv4ProtocolNumber case header.IPv6Version: h, ok = pkt.Data().PullUp(header.IPv6FixedHeaderSize + tcpSrcDstPortLen) if !ok { return cid, true } ipHdr := header.IPv6(h) var tcpHdr header.TCP if tcpip.TransportProtocolNumber(ipHdr.NextHeader()) == header.TCPProtocolNumber { tcpHdr = header.TCP(h[header.IPv6FixedHeaderSize:][:tcpSrcDstPortLen]) } else { // Slow path for IPv6 extension headers :(. 
dataBuf := pkt.Data().ToBuffer() dataBuf.TrimFront(header.IPv6MinimumSize) it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataBuf) defer it.Release() for { hdr, done, err := it.Next() if done || err != nil { break } hdr.Release() } h, ok = pkt.Data().PullUp(int(it.HeaderOffset()) + tcpSrcDstPortLen) if !ok { return cid, true } tcpHdr = header.TCP(h[it.HeaderOffset():][:tcpSrcDstPortLen]) } cid.srcAddr = ipHdr.SourceAddressSlice() cid.dstAddr = ipHdr.DestinationAddressSlice() cid.srcPort = tcpHdr.SourcePort() cid.dstPort = tcpHdr.DestinationPort() cid.proto = header.IPv6ProtocolNumber default: return cid, true } return cid, false } func (m *processorManager) close() { if len(m.processors) < 2 { return } for i := range m.processors { p := &m.processors[i] p.closeWaker.Assert() } } // wakeReady wakes up all processors that have a packet queued. If there is only // one processor, the method delivers the packet inline without waking a // goroutine. func (m *processorManager) wakeReady() { for i, ready := range m.ready { if !ready { continue } p := &m.processors[i] if len(m.processors) > 1 { p.packetWaker.Assert() } else { p.deliverPackets() } m.ready[i] = false } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/loopback/000077500000000000000000000000001465435605700232725ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/loopback/loopback.go000066400000000000000000000105231465435605700254140ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package loopback provides the implementation of loopback data-link layer // endpoints. Such endpoints just turn outbound packets into inbound ones. // // Loopback endpoints can be used in the networking stack by calling New() to // create a new endpoint, and then passing it as an argument to // Stack.CreateNIC(). package loopback import ( "sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( loopbackMTU = 65536 ) // +stateify savable type endpoint struct { mu sync.RWMutex `state:"nosave"` // +checklocks:mu dispatcher stack.NetworkDispatcher // +checklocks:mu addr tcpip.LinkAddress // +checklocks:mu mtu uint32 } // New creates a new loopback endpoint. This link-layer endpoint just turns // outbound packets into inbound packets. func New() stack.LinkEndpoint { return &endpoint{ mtu: loopbackMTU, } } // Attach implements stack.LinkEndpoint.Attach. It just saves the stack network- // layer dispatcher for later use when packets need to be dispatched. func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.mu.Lock() defer e.mu.Unlock() e.dispatcher = dispatcher } // IsAttached implements stack.LinkEndpoint.IsAttached. func (e *endpoint) IsAttached() bool { e.mu.RLock() defer e.mu.RUnlock() return e.dispatcher != nil } // MTU implements stack.LinkEndpoint.MTU. 
func (e *endpoint) MTU() uint32 { e.mu.RLock() defer e.mu.RUnlock() return e.mtu } // SetMTU implements stack.LinkEndpoint.SetMTU. It has no impact. func (e *endpoint) SetMTU(mtu uint32) { e.mu.Lock() defer e.mu.Unlock() e.mtu = mtu } // Capabilities implements stack.LinkEndpoint.Capabilities. Loopback advertises // itself as supporting checksum offload, but in reality it's just omitted. func (*endpoint) Capabilities() stack.LinkEndpointCapabilities { return stack.CapabilityRXChecksumOffload | stack.CapabilityTXChecksumOffload | stack.CapabilitySaveRestore | stack.CapabilityLoopback } // MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. Given that the // loopback interface doesn't have a header, it just returns 0. func (*endpoint) MaxHeaderLength() uint16 { return 0 } // LinkAddress returns the link address of this endpoint. func (e *endpoint) LinkAddress() tcpip.LinkAddress { e.mu.RLock() defer e.mu.RUnlock() return e.addr } // SetLinkAddress implements stack.LinkEndpoint.SetLinkAddress. func (e *endpoint) SetLinkAddress(addr tcpip.LinkAddress) { e.mu.Lock() defer e.mu.Unlock() e.addr = addr } // Wait implements stack.LinkEndpoint.Wait. func (*endpoint) Wait() {} // WritePackets implements stack.LinkEndpoint.WritePackets. If the endpoint is // not attached, the packets are not delivered. func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { e.mu.RLock() d := e.dispatcher e.mu.RUnlock() for _, pkt := range pkts.AsSlice() { // In order to properly loop back to the inbound side we must create a // fresh packet that only contains the underlying payload with no headers // or struct fields set. newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: pkt.ToBuffer(), }) if d != nil { d.DeliverNetworkPacket(pkt.NetworkProtocolNumber, newPkt) } newPkt.DecRef() } return pkts.Len(), nil } // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (*endpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareLoopback } // AddHeader implements stack.LinkEndpoint. func (*endpoint) AddHeader(*stack.PacketBuffer) {} // ParseHeader implements stack.LinkEndpoint. func (*endpoint) ParseHeader(*stack.PacketBuffer) bool { return true } // Close implements stack.LinkEndpoint. func (*endpoint) Close() {} // SetOnCloseAction implements stack.LinkEndpoint. func (*endpoint) SetOnCloseAction(func()) {} golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/loopback/loopback_state_autogen.go000066400000000000000000000015461465435605700303430ustar00rootroot00000000000000// automatically generated by stateify. 
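As the loopback package comment earlier notes, the endpoint is consumed by handing it to Stack.CreateNIC. The snippet below, placed here ahead of the generated state-saving code, is a minimal sketch of that wiring; it assumes the current stack.New/CreateNIC/AddProtocolAddress signatures, which have shifted between gVisor releases, and the address choice is only an example.

package main

import (
	"log"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
)

func main() {
	s := stack.New(stack.Options{
		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol},
	})
	// Attach the loopback endpoint as NIC 1, then give it 127.0.0.1/8.
	if err := s.CreateNIC(1, loopback.New()); err != nil {
		log.Fatalf("CreateNIC: %v", err)
	}
	protoAddr := tcpip.ProtocolAddress{
		Protocol: ipv4.ProtocolNumber,
		AddressWithPrefix: tcpip.AddressWithPrefix{
			Address:   tcpip.AddrFrom4([4]byte{127, 0, 0, 1}),
			PrefixLen: 8,
		},
	}
	if err := s.AddProtocolAddress(1, protoAddr, stack.AddressProperties{}); err != nil {
		log.Fatalf("AddProtocolAddress: %v", err)
	}
}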
package loopback import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *endpoint) StateTypeName() string { return "pkg/tcpip/link/loopback.endpoint" } func (e *endpoint) StateFields() []string { return []string{ "dispatcher", "addr", "mtu", } } func (e *endpoint) beforeSave() {} // +checklocksignore func (e *endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.dispatcher) stateSinkObject.Save(1, &e.addr) stateSinkObject.Save(2, &e.mtu) } func (e *endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.dispatcher) stateSourceObject.Load(1, &e.addr) stateSourceObject.Load(2, &e.mtu) } func init() { state.Register((*endpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/muxed/000077500000000000000000000000001465435605700226225ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/muxed/injectable.go000066400000000000000000000120741465435605700252550ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package muxed provides a muxed link endpoints. package muxed import ( "sync" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // InjectableEndpoint is an injectable multi endpoint. The endpoint has // trivial routing rules that determine which InjectableEndpoint a given packet // will be written to. Note that HandleLocal works differently for this // endpoint (see WritePacket). // // +stateify savable type InjectableEndpoint struct { routes map[tcpip.Address]stack.InjectableLinkEndpoint mu sync.RWMutex `state:"nosave"` // +checklocks:mu dispatcher stack.NetworkDispatcher } // MTU implements stack.LinkEndpoint. func (m *InjectableEndpoint) MTU() uint32 { minMTU := ^uint32(0) for _, endpoint := range m.routes { if endpointMTU := endpoint.MTU(); endpointMTU < minMTU { minMTU = endpointMTU } } return minMTU } // SetMTU implements stack.LinkEndpoint. func (m *InjectableEndpoint) SetMTU(mtu uint32) { for _, endpoint := range m.routes { endpoint.SetMTU(mtu) } } // Capabilities implements stack.LinkEndpoint. func (m *InjectableEndpoint) Capabilities() stack.LinkEndpointCapabilities { minCapabilities := stack.LinkEndpointCapabilities(^uint(0)) for _, endpoint := range m.routes { minCapabilities &= endpoint.Capabilities() } return minCapabilities } // MaxHeaderLength implements stack.LinkEndpoint. func (m *InjectableEndpoint) MaxHeaderLength() uint16 { minHeaderLen := ^uint16(0) for _, endpoint := range m.routes { if headerLen := endpoint.MaxHeaderLength(); headerLen < minHeaderLen { minHeaderLen = headerLen } } return minHeaderLen } // LinkAddress implements stack.LinkEndpoint. func (m *InjectableEndpoint) LinkAddress() tcpip.LinkAddress { return "" } // SetLinkAddress implements stack.LinkEndpoint.SetLinkAddress. 
func (m *InjectableEndpoint) SetLinkAddress(tcpip.LinkAddress) {} // Attach implements stack.LinkEndpoint. func (m *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { for _, endpoint := range m.routes { endpoint.Attach(dispatcher) } m.mu.Lock() m.dispatcher = dispatcher m.mu.Unlock() } // IsAttached implements stack.LinkEndpoint. func (m *InjectableEndpoint) IsAttached() bool { m.mu.RLock() defer m.mu.RUnlock() return m.dispatcher != nil } // InjectInbound implements stack.InjectableLinkEndpoint. func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { m.mu.RLock() d := m.dispatcher m.mu.RUnlock() d.DeliverNetworkPacket(protocol, pkt) } // WritePackets writes outbound packets to the appropriate // LinkInjectableEndpoint based on the RemoteAddress. HandleLocal only works if // pkt.EgressRoute.RemoteAddress has a route registered in this endpoint. func (m *InjectableEndpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { i := 0 for _, pkt := range pkts.AsSlice() { endpoint, ok := m.routes[pkt.EgressRoute.RemoteAddress] if !ok { return i, &tcpip.ErrHostUnreachable{} } var tmpPkts stack.PacketBufferList tmpPkts.PushBack(pkt) n, err := endpoint.WritePackets(tmpPkts) if err != nil { return i, err } i += n } return i, nil } // InjectOutbound writes outbound packets to the appropriate // LinkInjectableEndpoint based on the dest address. func (m *InjectableEndpoint) InjectOutbound(dest tcpip.Address, packet *buffer.View) tcpip.Error { endpoint, ok := m.routes[dest] if !ok { return &tcpip.ErrHostUnreachable{} } return endpoint.InjectOutbound(dest, packet) } // Wait implements stack.LinkEndpoint.Wait. func (m *InjectableEndpoint) Wait() { for _, ep := range m.routes { ep.Wait() } } // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (*InjectableEndpoint) ARPHardwareType() header.ARPHardwareType { panic("unsupported operation") } // AddHeader implements stack.LinkEndpoint.AddHeader. func (*InjectableEndpoint) AddHeader(*stack.PacketBuffer) {} // ParseHeader implements stack.LinkEndpoint.ParseHeader. func (*InjectableEndpoint) ParseHeader(*stack.PacketBuffer) bool { return true } // Close implements stack.LinkEndpoint. func (*InjectableEndpoint) Close() {} // SetOnCloseAction implements stack.LinkEndpoint.SetOnCloseAction. func (*InjectableEndpoint) SetOnCloseAction(func()) {} // NewInjectableEndpoint creates a new multi-endpoint injectable endpoint. func NewInjectableEndpoint(routes map[tcpip.Address]stack.InjectableLinkEndpoint) *InjectableEndpoint { return &InjectableEndpoint{ routes: routes, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/muxed/muxed_state_autogen.go000066400000000000000000000015511465435605700272170ustar00rootroot00000000000000// automatically generated by stateify. 
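A short sketch of how the routing table above is typically populated: each remote address maps to the injectable endpoint that should carry its traffic, and the muxed endpoint is then registered as a single NIC. Packets whose EgressRoute.RemoteAddress is not in the map fail with ErrHostUnreachable, as WritePackets shows. The helper name and the two-peer shape are illustrative assumptions, not part of this package.

package muxedsketch // illustrative only

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/link/muxed"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// newTwoPeerMux wires two injectable endpoints behind one muxed endpoint:
// packets destined to addrA are written to epA, packets to addrB to epB,
// and anything else is rejected by the muxed endpoint.
func newTwoPeerMux(addrA, addrB tcpip.Address, epA, epB stack.InjectableLinkEndpoint) *muxed.InjectableEndpoint {
	return muxed.NewInjectableEndpoint(map[tcpip.Address]stack.InjectableLinkEndpoint{
		addrA: epA,
		addrB: epB,
	})
}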
package muxed import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (m *InjectableEndpoint) StateTypeName() string { return "pkg/tcpip/link/muxed.InjectableEndpoint" } func (m *InjectableEndpoint) StateFields() []string { return []string{ "routes", "dispatcher", } } func (m *InjectableEndpoint) beforeSave() {} // +checklocksignore func (m *InjectableEndpoint) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.routes) stateSinkObject.Save(1, &m.dispatcher) } func (m *InjectableEndpoint) afterLoad(context.Context) {} // +checklocksignore func (m *InjectableEndpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.routes) stateSourceObject.Load(1, &m.dispatcher) } func init() { state.Register((*InjectableEndpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/nested/000077500000000000000000000000001465435605700227625ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/nested/nested.go000066400000000000000000000115671465435605700246050ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package nested provides helpers to implement the pattern of nested // stack.LinkEndpoints. package nested import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // Endpoint is a wrapper around stack.LinkEndpoint and stack.NetworkDispatcher // that can be used to implement nesting safely by providing lifecycle // concurrency guards. // // See the tests in this package for example usage. // // +stateify savable type Endpoint struct { child stack.LinkEndpoint embedder stack.NetworkDispatcher // mu protects dispatcher. mu sync.RWMutex `state:"nosave"` dispatcher stack.NetworkDispatcher } var _ stack.GSOEndpoint = (*Endpoint)(nil) var _ stack.LinkEndpoint = (*Endpoint)(nil) var _ stack.NetworkDispatcher = (*Endpoint)(nil) // Init initializes a nested.Endpoint that uses embedder as the dispatcher for // child on Attach. // // See the tests in this package for example usage. func (e *Endpoint) Init(child stack.LinkEndpoint, embedder stack.NetworkDispatcher) { e.child = child e.embedder = embedder } // DeliverNetworkPacket implements stack.NetworkDispatcher. func (e *Endpoint) DeliverNetworkPacket(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.mu.RLock() d := e.dispatcher e.mu.RUnlock() if d != nil { d.DeliverNetworkPacket(protocol, pkt) } } // DeliverLinkPacket implements stack.NetworkDispatcher. func (e *Endpoint) DeliverLinkPacket(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.mu.RLock() d := e.dispatcher e.mu.RUnlock() if d != nil { d.DeliverLinkPacket(protocol, pkt) } } // Attach implements stack.LinkEndpoint. 
func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.mu.Lock() e.dispatcher = dispatcher e.mu.Unlock() // If we're attaching to a valid dispatcher, pass embedder as the dispatcher // to our child, otherwise detach the child by giving it a nil dispatcher. var pass stack.NetworkDispatcher if dispatcher != nil { pass = e.embedder } e.child.Attach(pass) } // IsAttached implements stack.LinkEndpoint. func (e *Endpoint) IsAttached() bool { e.mu.RLock() isAttached := e.dispatcher != nil e.mu.RUnlock() return isAttached } // MTU implements stack.LinkEndpoint. func (e *Endpoint) MTU() uint32 { return e.child.MTU() } // SetMTU implements stack.LinkEndpoint. func (e *Endpoint) SetMTU(mtu uint32) { e.child.SetMTU(mtu) } // Capabilities implements stack.LinkEndpoint. func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { return e.child.Capabilities() } // MaxHeaderLength implements stack.LinkEndpoint. func (e *Endpoint) MaxHeaderLength() uint16 { return e.child.MaxHeaderLength() } // LinkAddress implements stack.LinkEndpoint. func (e *Endpoint) LinkAddress() tcpip.LinkAddress { return e.child.LinkAddress() } // SetLinkAddress implements stack.LinkEndpoint.SetLinkAddress. func (e *Endpoint) SetLinkAddress(addr tcpip.LinkAddress) { e.mu.Lock() defer e.mu.Unlock() e.child.SetLinkAddress(addr) } // WritePackets implements stack.LinkEndpoint. func (e *Endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { return e.child.WritePackets(pkts) } // Wait implements stack.LinkEndpoint. func (e *Endpoint) Wait() { e.child.Wait() } // GSOMaxSize implements stack.GSOEndpoint. func (e *Endpoint) GSOMaxSize() uint32 { if e, ok := e.child.(stack.GSOEndpoint); ok { return e.GSOMaxSize() } return 0 } // SupportedGSO implements stack.GSOEndpoint. func (e *Endpoint) SupportedGSO() stack.SupportedGSO { if e, ok := e.child.(stack.GSOEndpoint); ok { return e.SupportedGSO() } return stack.GSONotSupported } // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType func (e *Endpoint) ARPHardwareType() header.ARPHardwareType { return e.child.ARPHardwareType() } // AddHeader implements stack.LinkEndpoint.AddHeader. func (e *Endpoint) AddHeader(pkt *stack.PacketBuffer) { e.child.AddHeader(pkt) } // ParseHeader implements stack.LinkEndpoint.ParseHeader. func (e *Endpoint) ParseHeader(pkt *stack.PacketBuffer) bool { return e.child.ParseHeader(pkt) } // Close implements stack.LinkEndpoint. func (e *Endpoint) Close() { e.child.Close() } // SetOnCloseAction implement stack.LinkEndpoints. func (e *Endpoint) SetOnCloseAction(action func()) { e.child.SetOnCloseAction(action) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/nested/nested_state_autogen.go000066400000000000000000000015641465435605700275230ustar00rootroot00000000000000// automatically generated by stateify. 
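Since the nested package comment above defers to its tests for example usage, here is a small illustrative wrapper (not part of gVisor) showing the intended pattern: embed nested.Endpoint, call Init with the lower endpoint and the wrapper itself as the embedder, and override only the hooks you care about. In this hedged sketch the wrapper merely counts inbound packets before passing them up the stack.

package wrapsketch // illustrative only

import (
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/link/nested"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// countingEndpoint wraps a lower LinkEndpoint and counts every inbound packet
// before delivering it to the network stack.
type countingEndpoint struct {
	nested.Endpoint
	rx atomicbitops.Uint64
}

// newCountingEndpoint places the wrapper between the lower endpoint and the
// network dispatcher: the child delivers to the wrapper, and the wrapper
// delivers to whatever dispatcher is attached.
func newCountingEndpoint(lower stack.LinkEndpoint) *countingEndpoint {
	e := &countingEndpoint{}
	e.Endpoint.Init(lower, e)
	return e
}

// DeliverNetworkPacket intercepts inbound packets from the child endpoint.
func (e *countingEndpoint) DeliverNetworkPacket(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
	e.rx.Add(1)
	e.Endpoint.DeliverNetworkPacket(protocol, pkt)
}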
package nested import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *Endpoint) StateTypeName() string { return "pkg/tcpip/link/nested.Endpoint" } func (e *Endpoint) StateFields() []string { return []string{ "child", "embedder", "dispatcher", } } func (e *Endpoint) beforeSave() {} // +checklocksignore func (e *Endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.child) stateSinkObject.Save(1, &e.embedder) stateSinkObject.Save(2, &e.dispatcher) } func (e *Endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *Endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.child) stateSourceObject.Load(1, &e.embedder) stateSourceObject.Load(2, &e.dispatcher) } func init() { state.Register((*Endpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/packetsocket/000077500000000000000000000000001465435605700241605ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/packetsocket/packetsocket.go000066400000000000000000000035651465435605700272000ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package packetsocket provides a link endpoint that enables delivery of // incoming and outgoing packets to any interested packet sockets. package packetsocket import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/nested" "gvisor.dev/gvisor/pkg/tcpip/stack" ) var _ stack.NetworkDispatcher = (*endpoint)(nil) var _ stack.LinkEndpoint = (*endpoint)(nil) // +stateify savable type endpoint struct { nested.Endpoint } // New creates a new packetsocket link endpoint wrapping a lower link endpoint. // // On ingress, the lower link endpoint must only deliver packets that have // a link-layer header set if one is required for the link. func New(lower stack.LinkEndpoint) stack.LinkEndpoint { e := &endpoint{} e.Endpoint.Init(lower, e) return e } // DeliverNetworkPacket implements stack.NetworkDispatcher. func (e *endpoint) DeliverNetworkPacket(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.Endpoint.DeliverLinkPacket(protocol, pkt) e.Endpoint.DeliverNetworkPacket(protocol, pkt) } // WritePackets implements stack.LinkEndpoint. func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { for _, pkt := range pkts.AsSlice() { e.Endpoint.DeliverLinkPacket(pkt.NetworkProtocolNumber, pkt) } return e.Endpoint.WritePackets(pkts) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/packetsocket/packetsocket_state_autogen.go000066400000000000000000000013131465435605700321070ustar00rootroot00000000000000// automatically generated by stateify. 
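To make the wrapping step above concrete, a one-function sketch (names and NIC ID are illustrative) of inserting the packetsocket layer when a NIC is created, so packet-socket observers see both inbound and outbound traffic on that NIC:

package sniffsketch // illustrative only

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/link/packetsocket"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// createNICWithPacketSockets wraps lower before registering it, so packet
// sockets receive a copy of every packet the NIC sends or receives.
func createNICWithPacketSockets(s *stack.Stack, id tcpip.NICID, lower stack.LinkEndpoint) tcpip.Error {
	return s.CreateNIC(id, packetsocket.New(lower))
}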
package packetsocket import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *endpoint) StateTypeName() string { return "pkg/tcpip/link/packetsocket.endpoint" } func (e *endpoint) StateFields() []string { return []string{ "Endpoint", } } func (e *endpoint) beforeSave() {} // +checklocksignore func (e *endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.Endpoint) } func (e *endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.Endpoint) } func init() { state.Register((*endpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/pipe/000077500000000000000000000000001465435605700224355ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/pipe/pipe.go000066400000000000000000000076261465435605700237340ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package pipe provides the implementation of pipe-like data-link layer // endpoints. Such endpoints allow packets to be sent between two interfaces. package pipe import ( "sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) var _ stack.LinkEndpoint = (*Endpoint)(nil) // New returns both ends of a new pipe. func New(linkAddr1, linkAddr2 tcpip.LinkAddress, mtu uint32) (*Endpoint, *Endpoint) { ep1 := &Endpoint{ linkAddr: linkAddr1, mtu: mtu, } ep2 := &Endpoint{ linkAddr: linkAddr2, mtu: mtu, } ep1.linked = ep2 ep2.linked = ep1 return ep1, ep2 } // Endpoint is one end of a pipe. // // +stateify savable type Endpoint struct { linked *Endpoint mu sync.RWMutex `state:"nosave"` // +checklocks:mu dispatcher stack.NetworkDispatcher // +checklocks:mu linkAddr tcpip.LinkAddress // +checklocks:mu mtu uint32 } func (e *Endpoint) deliverPackets(pkts stack.PacketBufferList) { if !e.linked.IsAttached() { return } for _, pkt := range pkts.AsSlice() { // Create a fresh packet with pkt's payload but without struct fields // or headers set so the next link protocol can properly set the link // header. newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: pkt.ToBuffer(), }) e.linked.mu.RLock() d := e.linked.dispatcher e.linked.mu.RUnlock() d.DeliverNetworkPacket(pkt.NetworkProtocolNumber, newPkt) newPkt.DecRef() } } // WritePackets implements stack.LinkEndpoint. func (e *Endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { n := pkts.Len() e.deliverPackets(pkts) return n, nil } // Attach implements stack.LinkEndpoint. func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.mu.Lock() defer e.mu.Unlock() e.dispatcher = dispatcher } // IsAttached implements stack.LinkEndpoint. func (e *Endpoint) IsAttached() bool { e.mu.RLock() defer e.mu.RUnlock() return e.dispatcher != nil } // Wait implements stack.LinkEndpoint. func (*Endpoint) Wait() {} // MTU implements stack.LinkEndpoint. 
func (e *Endpoint) MTU() uint32 { e.mu.RLock() defer e.mu.RUnlock() return e.mtu } // SetMTU implements stack.LinkEndpoint. func (e *Endpoint) SetMTU(mtu uint32) { e.mu.Lock() defer e.mu.Unlock() e.mtu = mtu } // Capabilities implements stack.LinkEndpoint. func (*Endpoint) Capabilities() stack.LinkEndpointCapabilities { return 0 } // MaxHeaderLength implements stack.LinkEndpoint. func (*Endpoint) MaxHeaderLength() uint16 { return 0 } // LinkAddress implements stack.LinkEndpoint. func (e *Endpoint) LinkAddress() tcpip.LinkAddress { e.mu.RLock() defer e.mu.RUnlock() return e.linkAddr } // SetLinkAddress implements stack.LinkEndpoint. func (e *Endpoint) SetLinkAddress(addr tcpip.LinkAddress) { e.mu.Lock() defer e.mu.Unlock() e.linkAddr = addr } // ARPHardwareType implements stack.LinkEndpoint. func (*Endpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareNone } // AddHeader implements stack.LinkEndpoint. func (*Endpoint) AddHeader(*stack.PacketBuffer) {} // ParseHeader implements stack.LinkEndpoint. func (*Endpoint) ParseHeader(*stack.PacketBuffer) bool { return true } // Close implements stack.LinkEndpoint. func (e *Endpoint) Close() {} // SetOnCloseAction implements stack.LinkEndpoint.SetOnCloseAction. func (*Endpoint) SetOnCloseAction(func()) {} golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/pipe/pipe_state_autogen.go000066400000000000000000000017001465435605700266410ustar00rootroot00000000000000// automatically generated by stateify. package pipe import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *Endpoint) StateTypeName() string { return "pkg/tcpip/link/pipe.Endpoint" } func (e *Endpoint) StateFields() []string { return []string{ "linked", "dispatcher", "linkAddr", "mtu", } } func (e *Endpoint) beforeSave() {} // +checklocksignore func (e *Endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.linked) stateSinkObject.Save(1, &e.dispatcher) stateSinkObject.Save(2, &e.linkAddr) stateSinkObject.Save(3, &e.mtu) } func (e *Endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *Endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.linked) stateSourceObject.Load(1, &e.dispatcher) stateSourceObject.Load(2, &e.linkAddr) stateSourceObject.Load(3, &e.mtu) } func init() { state.Register((*Endpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/qdisc/000077500000000000000000000000001465435605700226035ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/qdisc/fifo/000077500000000000000000000000001465435605700235265ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/qdisc/fifo/fifo.go000066400000000000000000000102661465435605700250050ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
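Before the FIFO queueing discipline below, a short sketch of how the pipe endpoints defined earlier are typically used: New returns the two ends, and each end is installed as a NIC in a different stack so that packets written by one stack are delivered to the other. The helper, the NIC IDs, and the assumption that both stacks use NIC 1 are illustrative only.

package pipesketch // illustrative only

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/link/pipe"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// connectStacks creates the two ends of a pipe and installs one end as NIC 1
// in each stack, linking the two stacks at the data-link layer.
func connectStacks(s1, s2 *stack.Stack, mac1, mac2 tcpip.LinkAddress, mtu uint32) tcpip.Error {
	ep1, ep2 := pipe.New(mac1, mac2, mtu)
	if err := s1.CreateNIC(1, ep1); err != nil {
		return err
	}
	return s2.CreateNIC(1, ep2)
}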
// Package fifo provides the implementation of FIFO queuing discipline that // queues all outbound packets and asynchronously dispatches them to the // lower link endpoint in the order that they were queued. package fifo import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) var _ stack.QueueingDiscipline = (*discipline)(nil) const ( // BatchSize is the number of packets to write in each syscall. It is 47 // because when GVisorGSO is in use then a single 65KB TCP segment can get // split into 46 segments of 1420 bytes and a single 216 byte segment. BatchSize = 47 qDiscClosed = 1 ) // discipline represents a QueueingDiscipline which implements a FIFO queue for // all outgoing packets. discipline can have 1 or more underlying // queueDispatchers. All outgoing packets are consistently hashed to a single // underlying queue using the PacketBuffer.Hash if set, otherwise all packets // are queued to the first queue to avoid reordering in case of missing hash. type discipline struct { wg sync.WaitGroup dispatchers []queueDispatcher closed atomicbitops.Int32 } // queueDispatcher is responsible for dispatching all outbound packets in its // queue. It will also smartly batch packets when possible and write them // through the lower LinkWriter. type queueDispatcher struct { lower stack.LinkWriter mu sync.Mutex // +checklocks:mu queue packetBufferCircularList newPacketWaker sleep.Waker closeWaker sleep.Waker } // New creates a new fifo queuing discipline with the n queues with maximum // capacity of queueLen. // // +checklocksignore: we don't have to hold locks during initialization. func New(lower stack.LinkWriter, n int, queueLen int) stack.QueueingDiscipline { d := &discipline{ dispatchers: make([]queueDispatcher, n), } // Create the required dispatchers for i := range d.dispatchers { qd := &d.dispatchers[i] qd.lower = lower qd.queue.init(queueLen) d.wg.Add(1) go func() { defer d.wg.Done() qd.dispatchLoop() }() } return d } func (qd *queueDispatcher) dispatchLoop() { s := sleep.Sleeper{} s.AddWaker(&qd.newPacketWaker) s.AddWaker(&qd.closeWaker) defer s.Done() var batch stack.PacketBufferList for { switch w := s.Fetch(true); w { case &qd.newPacketWaker: case &qd.closeWaker: qd.mu.Lock() for p := qd.queue.removeFront(); p != nil; p = qd.queue.removeFront() { p.DecRef() } qd.queue.decRef() qd.mu.Unlock() return default: panic("unknown waker") } qd.mu.Lock() for pkt := qd.queue.removeFront(); pkt != nil; pkt = qd.queue.removeFront() { batch.PushBack(pkt) if batch.Len() < BatchSize && !qd.queue.isEmpty() { continue } qd.mu.Unlock() _, _ = qd.lower.WritePackets(batch) batch.Reset() qd.mu.Lock() } qd.mu.Unlock() } } // WritePacket implements stack.QueueingDiscipline.WritePacket. 
// // The packet must have the following fields populated: // - pkt.EgressRoute // - pkt.GSOOptions // - pkt.NetworkProtocolNumber func (d *discipline) WritePacket(pkt *stack.PacketBuffer) tcpip.Error { if d.closed.Load() == qDiscClosed { return &tcpip.ErrClosedForSend{} } qd := &d.dispatchers[int(pkt.Hash)%len(d.dispatchers)] qd.mu.Lock() haveSpace := qd.queue.hasSpace() if haveSpace { qd.queue.pushBack(pkt.IncRef()) } qd.mu.Unlock() if !haveSpace { return &tcpip.ErrNoBufferSpace{} } qd.newPacketWaker.Assert() return nil } func (d *discipline) Close() { d.closed.Store(qDiscClosed) for i := range d.dispatchers { d.dispatchers[i].closeWaker.Assert() } d.wg.Wait() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/qdisc/fifo/fifo_state_autogen.go000066400000000000000000000017371465435605700277320ustar00rootroot00000000000000// automatically generated by stateify. package fifo import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (pl *packetBufferCircularList) StateTypeName() string { return "pkg/tcpip/link/qdisc/fifo.packetBufferCircularList" } func (pl *packetBufferCircularList) StateFields() []string { return []string{ "pbs", "head", "size", } } func (pl *packetBufferCircularList) beforeSave() {} // +checklocksignore func (pl *packetBufferCircularList) StateSave(stateSinkObject state.Sink) { pl.beforeSave() stateSinkObject.Save(0, &pl.pbs) stateSinkObject.Save(1, &pl.head) stateSinkObject.Save(2, &pl.size) } func (pl *packetBufferCircularList) afterLoad(context.Context) {} // +checklocksignore func (pl *packetBufferCircularList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &pl.pbs) stateSourceObject.Load(1, &pl.head) stateSourceObject.Load(2, &pl.size) } func init() { state.Register((*packetBufferCircularList)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/qdisc/fifo/packet_buffer_circular_list.go000066400000000000000000000047201465435605700315770ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fifo import "gvisor.dev/gvisor/pkg/tcpip/stack" // packetBufferCircularList is a slice-backed circular list. All operations are // O(1) unless otherwise noted. It only allocates once, during the call to // init(). // // Users should call init() before using packetBufferCircularList. // // +stateify savable type packetBufferCircularList struct { pbs []*stack.PacketBuffer head int size int } // init initializes the list with the given size. func (pl *packetBufferCircularList) init(size int) { pl.pbs = make([]*stack.PacketBuffer, size) } // length returns the number of elements in the list. // //go:nosplit func (pl *packetBufferCircularList) length() int { return pl.size } // hasSpace returns whether there is space left in the list. // //go:nosplit func (pl *packetBufferCircularList) hasSpace() bool { return pl.size < len(pl.pbs) } // isEmpty returns whether the list is empty. 
// //go:nosplit func (pl *packetBufferCircularList) isEmpty() bool { return pl.size == 0 } // pushBack inserts the PacketBuffer at the end of the list. // // Users must check beforehand that there is space via a call to hasSpace(). // Failing to do so may clobber existing entries. // //go:nosplit func (pl *packetBufferCircularList) pushBack(pb *stack.PacketBuffer) { next := (pl.head + pl.size) % len(pl.pbs) pl.pbs[next] = pb pl.size++ } // removeFront returns the first element of the list or nil. // //go:nosplit func (pl *packetBufferCircularList) removeFront() *stack.PacketBuffer { if pl.isEmpty() { return nil } ret := pl.pbs[pl.head] pl.pbs[pl.head] = nil pl.head = (pl.head + 1) % len(pl.pbs) pl.size-- return ret } // decRef decreases the reference count on each stack.PacketBuffer stored in // the list. // // NOTE: runs in O(n) time. // //go:nosplit func (pl *packetBufferCircularList) decRef() { for i := 0; i < pl.size; i++ { pl.pbs[(pl.head+i)%len(pl.pbs)].DecRef() } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/000077500000000000000000000000001465435605700234455ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/pipe/000077500000000000000000000000001465435605700244025ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/pipe/pipe.go000066400000000000000000000053221465435605700256700ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package pipe implements a shared memory ring buffer on which a single reader // and a single writer can operate (read/write) concurrently. The ring buffer // allows for data of different sizes to be written, and preserves the boundary // of the written data. // // Example usage is as follows: // // wb := t.Push(20) // // Write data to wb. // t.Flush() // // rb := r.Pull() // // Do something with data in rb. // r.Flush() package pipe import ( "math" ) const ( jump uint64 = math.MaxUint32 + 1 offsetMask uint64 = math.MaxUint32 revolutionMask uint64 = ^offsetMask sizeOfSlotHeader = 8 // sizeof(uint64) slotFree uint64 = 1 << 63 slotSizeMask uint64 = math.MaxUint32 ) // payloadToSlotSize calculates the total size of a slot based on its payload // size. The total size is the header size, plus the payload size, plus padding // if necessary to make the total size a multiple of sizeOfSlotHeader. func payloadToSlotSize(payloadSize uint64) uint64 { s := sizeOfSlotHeader + payloadSize return (s + sizeOfSlotHeader - 1) &^ (sizeOfSlotHeader - 1) } // slotToPayloadSize calculates the payload size of a slot based on the total // size of the slot. This is only meant to be used when creating slots that // don't carry information (e.g., free slots or wrap slots). func slotToPayloadSize(offset uint64) uint64 { return offset - sizeOfSlotHeader } // pipe is a basic data structure used by both (transmit & receive) ends of a // pipe.
Indices into this pipe are split into two fields: offset, which counts // the number of bytes from the beginning of the buffer, and revolution, which // counts the number of times the index has wrapped around. type pipe struct { buffer []byte } // init initializes the pipe buffer such that its size is a multiple of the size // of the slot header. func (p *pipe) init(b []byte) { p.buffer = b[:len(b)&^(sizeOfSlotHeader-1)] } // data returns a section of the buffer starting at the given index (which may // include revolution information) and with the given size. func (p *pipe) data(idx uint64, size uint64) []byte { return p.buffer[(idx&offsetMask)+sizeOfSlotHeader:][:size] } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/pipe/pipe_state_autogen.go000066400000000000000000000000661465435605700306120ustar00rootroot00000000000000// automatically generated by stateify. package pipe golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go000066400000000000000000000021131465435605700272240ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pipe import ( "unsafe" "gvisor.dev/gvisor/pkg/atomicbitops" ) func (p *pipe) write(idx uint64, v uint64) { ptr := (*uint64)(unsafe.Pointer(&p.buffer[idx&offsetMask:][:8][0])) *ptr = v } func (p *pipe) writeAtomic(idx uint64, v uint64) { ptr := (*atomicbitops.Uint64)(unsafe.Pointer(&p.buffer[idx&offsetMask:][:8][0])) ptr.Store(v) } func (p *pipe) readAtomic(idx uint64) uint64 { ptr := (*atomicbitops.Uint64)(unsafe.Pointer(&p.buffer[idx&offsetMask:][:8][0])) return ptr.Load() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe_state_autogen.go000066400000000000000000000000661465435605700321530ustar00rootroot00000000000000// automatically generated by stateify. package pipe golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/pipe/rx.go000066400000000000000000000056571465435605700253770ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pipe // Rx is the receive side of the shared memory ring buffer. type Rx struct { p pipe tail uint64 head uint64 } // Init initializes the receive end of the pipe. In the initial state, the next // slot to be inspected is the very first one. func (r *Rx) Init(b []byte) { r.p.init(b) r.tail = 0xfffffffe * jump r.head = r.tail } // Pull reads the next buffer from the pipe, returning nil if there isn't one // currently available. 
// // The returned slice is available until Flush() is next called. After that, it // must not be touched. func (r *Rx) Pull() []byte { if r.head == r.tail+jump { // We've already pulled the whole pipe. return nil } header := r.p.readAtomic(r.head) if header&slotFree != 0 { // The next slot is free, we can't pull it yet. return nil } payloadSize := header & slotSizeMask newHead := r.head + payloadToSlotSize(payloadSize) headWrap := (r.head & revolutionMask) | uint64(len(r.p.buffer)) // Check if this is a wrapping slot. If that's the case, it carries no // data, so we just skip it and try again from the first slot. if int64(newHead-headWrap) >= 0 { // If newHead passes the tail, the pipe is either damaged or the // RX view of the pipe has completely wrapped without an // intervening flush. if int64(newHead-(r.tail+jump)) > 0 { return nil } // The pipe is damaged if newHead doesn't point to the start of // the ring. if newHead&offsetMask != 0 { return nil } if r.tail == r.head { // If this is the first pull since the last Flush() // call, we flush the state so that the sender can use // this space if it needs to. r.p.writeAtomic(r.head, slotFree|slotToPayloadSize(newHead-r.head)) r.tail = newHead } r.head = newHead return r.Pull() } // Grab the buffer before updating r.head. b := r.p.data(r.head, payloadSize) r.head = newHead return b } // Flush tells the transmitter that all buffers pulled since the last Flush() // have been used, so the transmitter is free to used their slots for further // transmission. func (r *Rx) Flush() { if r.head == r.tail { return } r.p.writeAtomic(r.tail, slotFree|slotToPayloadSize(r.head-r.tail)) r.tail = r.head } // Abort unpulls any pulled buffers. func (r *Rx) Abort() { r.head = r.tail } // Bytes returns the byte slice on which the pipe operates. func (r *Rx) Bytes() []byte { return r.p.buffer } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/pipe/tx.go000066400000000000000000000114001465435605700253600ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pipe // Tx is the transmit side of the shared memory ring buffer. type Tx struct { p pipe maxPayloadSize uint64 head uint64 tail uint64 next uint64 tailHeader uint64 } // Init initializes the transmit end of the pipe. In the initial state, the next // slot to be written is the very first one, and the transmitter has the whole // ring buffer available to it. func (t *Tx) Init(b []byte) { t.p.init(b) // maxPayloadSize excludes the header of the payload, and the header // of the wrapping message. t.maxPayloadSize = uint64(len(t.p.buffer)) - 2*sizeOfSlotHeader t.tail = 0xfffffffe * jump t.next = t.tail t.head = t.tail + jump t.p.write(t.tail, slotFree) } // Capacity determines how many records of the given size can be written to the // pipe before it fills up. 
func (t *Tx) Capacity(recordSize uint64) uint64 { available := uint64(len(t.p.buffer)) - sizeOfSlotHeader entryLen := payloadToSlotSize(recordSize) return available / entryLen } // Push reserves "payloadSize" bytes for transmission in the pipe. The caller // populates the returned slice with the data to be transferred and eventually // calls Flush() to make the data visible to the reader, or Abort() to make the // pipe forget all Push() calls since the last Flush(). // // The returned slice is available until Flush() or Abort() is next called. // After that, it must not be touched. func (t *Tx) Push(payloadSize uint64) []byte { // Fail request if we know we will never have enough room. if payloadSize > t.maxPayloadSize { return nil } // True if TxPipe currently has a pushed message, i.e., it is not // Flush()'ed. messageAhead := t.next != t.tail totalLen := payloadToSlotSize(payloadSize) newNext := t.next + totalLen nextWrap := (t.next & revolutionMask) | uint64(len(t.p.buffer)) if int64(newNext-nextWrap) >= 0 { // The new buffer would overflow the pipe, so we push a wrapping // slot, then try to add the actual slot to the front of the // pipe. newNext = (newNext & revolutionMask) + jump if !t.reclaim(newNext) { return nil } wrappingPayloadSize := slotToPayloadSize(newNext - t.next) oldNext := t.next t.next = newNext if messageAhead { t.p.write(oldNext, wrappingPayloadSize) } else { t.tailHeader = wrappingPayloadSize t.Flush() } return t.Push(payloadSize) } // Check that we have enough room for the buffer. if !t.reclaim(newNext) { return nil } if messageAhead { t.p.write(t.next, payloadSize) } else { t.tailHeader = payloadSize } // Grab the buffer before updating t.next. b := t.p.data(t.next, payloadSize) t.next = newNext return b } // reclaim attempts to advance the head until at least newNext. If the head is // already at or beyond newNext, nothing happens and true is returned; otherwise // it tries to reclaim slots that have already been consumed by the receive end // of the pipe (they will be marked as free) and returns a boolean indicating // whether it was successful in reclaiming enough slots. func (t *Tx) reclaim(newNext uint64) bool { for int64(newNext-t.head) > 0 { // Can't reclaim if slot is not free. header := t.p.readAtomic(t.head) if header&slotFree == 0 { return false } payloadSize := header & slotSizeMask newHead := t.head + payloadToSlotSize(payloadSize) // Check newHead is within bounds and valid. if int64(newHead-t.tail) > int64(jump) || newHead&offsetMask >= uint64(len(t.p.buffer)) { return false } t.head = newHead } return true } // Abort causes all Push() calls since the last Flush() to be forgotten and // therefore they will not be made visible to the receiver. func (t *Tx) Abort() { t.next = t.tail } // Flush causes all buffers pushed since the last Flush() [or Abort(), whichever // is the most recent] to be made visible to the receiver. func (t *Tx) Flush() { if t.next == t.tail { // Nothing to do if there are no pushed buffers. return } if t.next != t.head { // The receiver will spin on t.next, so we must make sure that // the slotFree bit is set. t.p.write(t.next, slotFree) } t.p.writeAtomic(t.tail, t.tailHeader) t.tail = t.next } // Bytes returns the byte slice on which the pipe operates. 
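//
// Editor-added usage sketch for the Tx side (illustrative only; sharedMem and
// payload are assumed to be provided by the caller):
//
//	var t Tx
//	t.Init(sharedMem)
//	if b := t.Push(uint64(len(payload))); b != nil {
//		copy(b, payload)
//		t.Flush() // make the pushed slot visible to the Rx end
//	} // a nil return means the receiver has not yet freed enough slots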
func (t *Tx) Bytes() []byte { return t.p.buffer } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/queue/000077500000000000000000000000001465435605700245715ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/queue/queue_state_autogen.go000066400000000000000000000000671465435605700311710ustar00rootroot00000000000000// automatically generated by stateify. package queue golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/queue/rx.go000066400000000000000000000161461465435605700255610ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package queue provides the implementation of transmit and receive queues // based on shared memory ring buffers. package queue import ( "encoding/binary" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe" ) const ( // Offsets within a posted buffer. postedOffset = 0 postedSize = 8 postedRemainingInGroup = 12 postedUserData = 16 postedID = 24 sizeOfPostedBuffer = 32 // Offsets within a received packet header. consumedPacketSize = 0 consumedPacketReserved = 4 sizeOfConsumedPacketHeader = 8 // Offsets within a consumed buffer. consumedOffset = 0 consumedSize = 8 consumedUserData = 12 consumedID = 20 sizeOfConsumedBuffer = 28 // The following are the allowed states of the shared data area. // EventFDUinitialized is the value stored at the start of the shared data // region when it hasn't been initialized. EventFDUninitialized = 0 // EventFDDisabled is the value stored at the start of the shared data region // when notifications using eventFD has been disabled. EventFDDisabled = 1 // EventFDEnabled is the value stored at the start of the shared data region // when eventFD should be notified as the peer might be blocked waiting on // notifications. EventFDEnabled = 2 ) // RxBuffer is the descriptor of a receive buffer. type RxBuffer struct { Offset uint64 Size uint32 ID uint64 UserData uint64 } // Rx is a receive queue. It is implemented with one tx and one rx pipe: the tx // pipe is used to "post" buffers, while the rx pipe is used to receive packets // whose contents have been written to previously posted buffers. // // This struct is thread-compatible. type Rx struct { tx pipe.Tx rx pipe.Rx sharedEventFDState *atomicbitops.Uint32 } // Init initializes the receive queue with the given pipes, and shared state // pointer -- the latter is used to enable/disable eventfd notifications. func (r *Rx) Init(tx, rx []byte, sharedEventFDState *atomicbitops.Uint32) { r.sharedEventFDState = sharedEventFDState r.tx.Init(tx) r.rx.Init(rx) } // EnableNotification updates the shared state such that the peer will notify // the eventfd when there are packets to be dequeued. func (r *Rx) EnableNotification() { r.sharedEventFDState.Store(EventFDEnabled) } // DisableNotification updates the shared state such that the peer will not // notify the eventfd. 
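//
// EnableNotification/DisableNotification are typically used in a
// check-enable-recheck pattern before blocking on the eventfd; an editor-added
// sketch, modelled on rx.postAndReceive in the parent sharedmem package
// (q is a *Rx, waitOnEventFD is assumed):
//
//	bufs, n := q.Dequeue(nil)
//	if len(bufs) == 0 {
//		q.EnableNotification()
//		for {
//			if bufs, n = q.Dequeue(bufs); len(bufs) != 0 {
//				break
//			}
//			waitOnEventFD()
//		}
//		q.DisableNotification()
//	}
//	// process bufs / the n bytes of packet data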
func (r *Rx) DisableNotification() { r.sharedEventFDState.Store(EventFDDisabled) } // PostedBuffersLimit returns the maximum number of buffers that can be posted // before the tx queue fills up. func (r *Rx) PostedBuffersLimit() uint64 { return r.tx.Capacity(sizeOfPostedBuffer) } // PostBuffers makes the given buffers available for receiving data from the // peer. Once they are posted, the peer is free to write to them and will // eventually post them back for consumption. func (r *Rx) PostBuffers(buffers []RxBuffer) bool { for i := range buffers { b := r.tx.Push(sizeOfPostedBuffer) if b == nil { r.tx.Abort() return false } pb := &buffers[i] binary.LittleEndian.PutUint64(b[postedOffset:], pb.Offset) binary.LittleEndian.PutUint32(b[postedSize:], pb.Size) binary.LittleEndian.PutUint32(b[postedRemainingInGroup:], 0) binary.LittleEndian.PutUint64(b[postedUserData:], pb.UserData) binary.LittleEndian.PutUint64(b[postedID:], pb.ID) } r.tx.Flush() return true } // Dequeue receives buffers that have been previously posted by PostBuffers() // and that have been filled by the peer and posted back. // // This is similar to append() in that new buffers are appended to "bufs", with // reallocation only if "bufs" doesn't have enough capacity. func (r *Rx) Dequeue(bufs []RxBuffer) ([]RxBuffer, uint32) { for { outBufs := bufs // Pull the next descriptor from the rx pipe. b := r.rx.Pull() if b == nil { return bufs, 0 } if len(b) < sizeOfConsumedPacketHeader { log.Warningf("Ignoring packet header: size (%v) is less than header size (%v)", len(b), sizeOfConsumedPacketHeader) r.rx.Flush() continue } totalDataSize := binary.LittleEndian.Uint32(b[consumedPacketSize:]) // Calculate the number of buffer descriptors and copy them // over to the output. count := (len(b) - sizeOfConsumedPacketHeader) / sizeOfConsumedBuffer offset := sizeOfConsumedPacketHeader buffersSize := uint32(0) for i := count; i > 0; i-- { s := binary.LittleEndian.Uint32(b[offset+consumedSize:]) buffersSize += s if buffersSize < s { // The buffer size overflows an unsigned 32-bit // integer, so break out and force it to be // ignored. totalDataSize = 1 buffersSize = 0 break } outBufs = append(outBufs, RxBuffer{ Offset: binary.LittleEndian.Uint64(b[offset+consumedOffset:]), Size: s, ID: binary.LittleEndian.Uint64(b[offset+consumedID:]), }) offset += sizeOfConsumedBuffer } r.rx.Flush() if buffersSize < totalDataSize { // The descriptor is corrupted, ignore it. log.Warningf("Ignoring packet: actual data size (%v) less than expected size (%v)", buffersSize, totalDataSize) continue } return outBufs, totalDataSize } } // Bytes returns the byte slices on which the queue operates. func (r *Rx) Bytes() (tx, rx []byte) { return r.tx.Bytes(), r.rx.Bytes() } // DecodeRxBufferHeader decodes the header of a buffer posted on an rx queue. func DecodeRxBufferHeader(b []byte) RxBuffer { return RxBuffer{ Offset: binary.LittleEndian.Uint64(b[postedOffset:]), Size: binary.LittleEndian.Uint32(b[postedSize:]), ID: binary.LittleEndian.Uint64(b[postedID:]), UserData: binary.LittleEndian.Uint64(b[postedUserData:]), } } // RxCompletionSize returns the number of bytes needed to encode an rx // completion containing "count" buffers. func RxCompletionSize(count int) uint64 { return sizeOfConsumedPacketHeader + uint64(count)*sizeOfConsumedBuffer } // EncodeRxCompletion encodes an rx completion header. 
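//
// A completion for n buffers is the 8-byte packet header written by
// EncodeRxCompletion followed by n buffer headers written by
// EncodeRxCompletionBuffer. Editor-added sketch of a server-side encoder,
// mirroring serverTx.transmit in the parent package (completionPipe, bufs and
// totalCopied are assumed):
//
//	b := completionPipe.Push(RxCompletionSize(len(bufs)))
//	EncodeRxCompletion(b, totalCopied, 0 /* reserved */)
//	for i, buf := range bufs {
//		EncodeRxCompletionBuffer(b, i, buf)
//	}
//	completionPipe.Flush()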
func EncodeRxCompletion(b []byte, size, reserved uint32) { binary.LittleEndian.PutUint32(b[consumedPacketSize:], size) binary.LittleEndian.PutUint32(b[consumedPacketReserved:], reserved) } // EncodeRxCompletionBuffer encodes the i-th rx completion buffer header. func EncodeRxCompletionBuffer(b []byte, i int, rxb RxBuffer) { b = b[RxCompletionSize(i):] binary.LittleEndian.PutUint64(b[consumedOffset:], rxb.Offset) binary.LittleEndian.PutUint32(b[consumedSize:], rxb.Size) binary.LittleEndian.PutUint64(b[consumedUserData:], rxb.UserData) binary.LittleEndian.PutUint64(b[consumedID:], rxb.ID) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/queue/tx.go000066400000000000000000000110151465435605700255510ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package queue import ( "encoding/binary" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe" ) const ( // Offsets within a packet header. packetID = 0 packetSize = 8 packetReserved = 12 sizeOfPacketHeader = 16 // Offsets with a buffer descriptor bufferOffset = 0 bufferSize = 8 sizeOfBufferDescriptor = 12 ) // TxBuffer is the descriptor of a transmit buffer. type TxBuffer struct { Next *TxBuffer Offset uint64 Size uint32 } // Tx is a transmit queue. It is implemented with one tx and one rx pipe: the // tx pipe is used to request the transmission of packets, while the rx pipe // is used to receive which transmissions have completed. // // This struct is thread-compatible. type Tx struct { tx pipe.Tx rx pipe.Rx sharedEventFDState *atomicbitops.Uint32 } // Init initializes the transmit queue with the given pipes. func (t *Tx) Init(tx, rx []byte, sharedEventFDState *atomicbitops.Uint32) { t.tx.Init(tx) t.rx.Init(rx) t.sharedEventFDState = sharedEventFDState } // NotificationsEnabled returns true if eventFD should be used to notify the // peer of events (eg. packet transmit etc). func (t *Tx) NotificationsEnabled() bool { // Notifications are considered enabled unless explicitly disabled. return t.sharedEventFDState.Load() != EventFDDisabled } // Enqueue queues the given linked list of buffers for transmission as one // packet. While it is queued, the caller must not modify them. func (t *Tx) Enqueue(id uint64, totalDataLen, bufferCount uint32, buffer *TxBuffer) bool { // Reserve room in the tx pipe. totalLen := sizeOfPacketHeader + uint64(bufferCount)*sizeOfBufferDescriptor b := t.tx.Push(totalLen) if b == nil { return false } // Initialize the packet and buffer descriptors. 
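	// The layout written below is a 16-byte packet header (8-byte id,
	// 4-byte total size, 4-byte reserved) followed by bufferCount 12-byte
	// buffer descriptors (8-byte offset, 4-byte size), matching the
	// packetID/packetSize/packetReserved and bufferOffset/bufferSize
	// constants above.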
binary.LittleEndian.PutUint64(b[packetID:], id) binary.LittleEndian.PutUint32(b[packetSize:], totalDataLen) binary.LittleEndian.PutUint32(b[packetReserved:], 0) offset := sizeOfPacketHeader for i := bufferCount; i != 0; i-- { binary.LittleEndian.PutUint64(b[offset+bufferOffset:], buffer.Offset) binary.LittleEndian.PutUint32(b[offset+bufferSize:], buffer.Size) offset += sizeOfBufferDescriptor buffer = buffer.Next } t.tx.Flush() return true } // CompletedPacket returns the id of the last completed transmission. The // returned id, if any, refers to a value passed on a previous call to // Enqueue(). func (t *Tx) CompletedPacket() (id uint64, ok bool) { for { b := t.rx.Pull() if b == nil { return 0, false } if len(b) != 8 { t.rx.Flush() log.Warningf("Ignoring completed packet: size (%v) is less than expected (%v)", len(b), 8) continue } v := binary.LittleEndian.Uint64(b) t.rx.Flush() return v, true } } // Bytes returns the byte slices on which the queue operates. func (t *Tx) Bytes() (tx, rx []byte) { return t.tx.Bytes(), t.rx.Bytes() } // TxPacketInfo holds information about a packet sent on a tx queue. type TxPacketInfo struct { ID uint64 Size uint32 Reserved uint32 BufferCount int } // DecodeTxPacketHeader decodes the header of a packet sent over a tx queue. func DecodeTxPacketHeader(b []byte) TxPacketInfo { return TxPacketInfo{ ID: binary.LittleEndian.Uint64(b[packetID:]), Size: binary.LittleEndian.Uint32(b[packetSize:]), Reserved: binary.LittleEndian.Uint32(b[packetReserved:]), BufferCount: (len(b) - sizeOfPacketHeader) / sizeOfBufferDescriptor, } } // DecodeTxBufferHeader decodes the header of the i-th buffer of a packet sent // over a tx queue. func DecodeTxBufferHeader(b []byte, i int) TxBuffer { b = b[sizeOfPacketHeader+i*sizeOfBufferDescriptor:] return TxBuffer{ Offset: binary.LittleEndian.Uint64(b[bufferOffset:]), Size: binary.LittleEndian.Uint32(b[bufferSize:]), } } // EncodeTxCompletion encodes a tx completion header. func EncodeTxCompletion(b []byte, id uint64) { binary.LittleEndian.PutUint64(b, id) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/queuepair.go000066400000000000000000000144541465435605700260040ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package sharedmem import ( "fmt" "io/ioutil" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/eventfd" ) const ( // DefaultQueueDataSize is the size of the shared memory data region that // holds the scatter/gather buffers. DefaultQueueDataSize = 1 << 20 // 1MiB // DefaultQueuePipeSize is the size of the pipe that holds the packet descriptors. // // Assuming each packet data is approximately 1280 bytes (IPv6 Minimum MTU) // then we can hold approximately 1024*1024/1280 ~ 819 packets in the data // area. Which means the pipe needs to be big enough to hold 819 // descriptors. 
// // Each descriptor is approximately 8 (slot descriptor in pipe) + // 16 (packet descriptor) + 12 (for buffer descriptor) assuming each packet is // stored in exactly 1 buffer descriptor (see queue/tx.go and pipe/tx.go.) // // Which means we need approximately 36*819 ~ 29 KiB to store all packet // descriptors. We could go with a 32 KiB pipe but to give it some slack in // how the upper layer may make use of the scatter gather buffers we double // this to hold enough descriptors. DefaultQueuePipeSize = 64 << 10 // 64KiB // DefaultSharedDataSize is the size of the sharedData region used to // enable/disable notifications. DefaultSharedDataSize = 4 << 10 // 4KiB // DefaultBufferSize is the size of each individual buffer that the data // region is broken down into to hold packet data. Should be larger than // 1500 + 14 (Ethernet header) + 10 (VirtIO header) to fit each packet // in a single buffer. DefaultBufferSize = 2048 // DefaultTmpDir is the path used to create the memory files if a path // is not provided. DefaultTmpDir = "/dev/shm" ) // A QueuePair represents a pair of TX/RX queues. type QueuePair struct { // txCfg is the QueueConfig to be used for transmit queue. txCfg QueueConfig // rxCfg is the QueueConfig to be used for receive queue. rxCfg QueueConfig } // QueueOptions allows queue-specific configuration to be specified when // creating a QueuePair. type QueueOptions struct { // SharedMemPath is the path to use to create the shared memory backing // files for the queue. // // If unspecified it defaults to "/dev/shm". SharedMemPath string } // NewQueuePair creates a shared memory QueuePair. func NewQueuePair(opts QueueOptions) (*QueuePair, error) { txCfg, err := createQueueFDs(opts.SharedMemPath, queueSizes{ dataSize: DefaultQueueDataSize, txPipeSize: DefaultQueuePipeSize, rxPipeSize: DefaultQueuePipeSize, sharedDataSize: DefaultSharedDataSize, }) if err != nil { return nil, fmt.Errorf("failed to create tx queue: %s", err) } rxCfg, err := createQueueFDs(opts.SharedMemPath, queueSizes{ dataSize: DefaultQueueDataSize, txPipeSize: DefaultQueuePipeSize, rxPipeSize: DefaultQueuePipeSize, sharedDataSize: DefaultSharedDataSize, }) if err != nil { closeFDs(txCfg) return nil, fmt.Errorf("failed to create rx queue: %s", err) } return &QueuePair{ txCfg: txCfg, rxCfg: rxCfg, }, nil } // Close closes the underlying tx/rx queue fds. func (q *QueuePair) Close() { closeFDs(q.txCfg) closeFDs(q.rxCfg) } // TXQueueConfig returns the QueueConfig for the transmit queue. func (q *QueuePair) TXQueueConfig() QueueConfig { return q.txCfg } // RXQueueConfig returns the QueueConfig for the receive queue.
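//
// Editor-added wiring sketch (illustrative only; in practice the two endpoints
// usually live in different processes and the QueueConfig fds are passed over a
// control channel, and the remaining Options fields are elided here):
//
//	qp, err := NewQueuePair(QueueOptions{})
//	if err != nil {
//		// handle error
//	}
//	opts := Options{TX: qp.TXQueueConfig(), RX: qp.RXQueueConfig(), BufferSize: DefaultBufferSize}
//	clientEP, _ := New(opts)               // client-side endpoint
//	serverEP, _ := NewServerEndpoint(opts) // server side swaps TX/RX internally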
func (q *QueuePair) RXQueueConfig() QueueConfig { return q.rxCfg } type queueSizes struct { dataSize int64 txPipeSize int64 rxPipeSize int64 sharedDataSize int64 } func createQueueFDs(sharedMemPath string, s queueSizes) (QueueConfig, error) { success := false var eventFD eventfd.Eventfd var dataFD, txPipeFD, rxPipeFD, sharedDataFD int defer func() { if success { return } closeFDs(QueueConfig{ EventFD: eventFD, DataFD: dataFD, TxPipeFD: txPipeFD, RxPipeFD: rxPipeFD, SharedDataFD: sharedDataFD, }) }() eventFD, err := eventfd.Create() if err != nil { return QueueConfig{}, fmt.Errorf("eventfd failed: %v", err) } dataFD, err = createFile(sharedMemPath, s.dataSize, false) if err != nil { return QueueConfig{}, fmt.Errorf("failed to create dataFD: %s", err) } txPipeFD, err = createFile(sharedMemPath, s.txPipeSize, true) if err != nil { return QueueConfig{}, fmt.Errorf("failed to create txPipeFD: %s", err) } rxPipeFD, err = createFile(sharedMemPath, s.rxPipeSize, true) if err != nil { return QueueConfig{}, fmt.Errorf("failed to create rxPipeFD: %s", err) } sharedDataFD, err = createFile(sharedMemPath, s.sharedDataSize, false) if err != nil { return QueueConfig{}, fmt.Errorf("failed to create sharedDataFD: %s", err) } success = true return QueueConfig{ EventFD: eventFD, DataFD: dataFD, TxPipeFD: txPipeFD, RxPipeFD: rxPipeFD, SharedDataFD: sharedDataFD, }, nil } func createFile(sharedMemPath string, size int64, initQueue bool) (fd int, err error) { var tmpDir = DefaultTmpDir if sharedMemPath != "" { tmpDir = sharedMemPath } f, err := ioutil.TempFile(tmpDir, "sharedmem_test") if err != nil { return -1, fmt.Errorf("TempFile failed: %v", err) } defer f.Close() unix.Unlink(f.Name()) if initQueue { // Write the "slot-free" flag in the initial queue. if _, err := f.WriteAt([]byte{0, 0, 0, 0, 0, 0, 0, 0x80}, 0); err != nil { return -1, fmt.Errorf("WriteAt failed: %v", err) } } fd, err = unix.Dup(int(f.Fd())) if err != nil { return -1, fmt.Errorf("unix.Dup(%d) failed: %v", f.Fd(), err) } if err := unix.Ftruncate(fd, size); err != nil { unix.Close(fd) return -1, fmt.Errorf("ftruncate(%d, %d) failed: %v", fd, size, err) } return fd, nil } func closeFDs(c QueueConfig) { unix.Close(c.DataFD) c.EventFD.Close() unix.Close(c.TxPipeFD) unix.Close(c.RxPipeFD) unix.Close(c.SharedDataFD) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/rx.go000066400000000000000000000073011465435605700244260ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package sharedmem import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue" ) // rx holds all state associated with an rx queue. type rx struct { data []byte sharedData []byte q queue.Rx eventFD eventfd.Eventfd } // init initializes all state needed by the rx queue based on the information // provided. // // The caller always retains ownership of all file descriptors passed in. 
The // queue implementation will duplicate any that it may need in the future. func (r *rx) init(mtu uint32, c *QueueConfig) error { // Map in all buffers. txPipe, err := getBuffer(c.TxPipeFD) if err != nil { return err } rxPipe, err := getBuffer(c.RxPipeFD) if err != nil { unix.Munmap(txPipe) return err } data, err := getBuffer(c.DataFD) if err != nil { unix.Munmap(txPipe) unix.Munmap(rxPipe) return err } sharedData, err := getBuffer(c.SharedDataFD) if err != nil { unix.Munmap(txPipe) unix.Munmap(rxPipe) unix.Munmap(data) return err } // Duplicate the eventFD so that caller can close it but we can still // use it. efd, err := c.EventFD.Dup() if err != nil { unix.Munmap(txPipe) unix.Munmap(rxPipe) unix.Munmap(data) unix.Munmap(sharedData) return err } // Initialize state based on buffers. r.q.Init(txPipe, rxPipe, sharedDataPointer(sharedData)) r.data = data r.eventFD = efd r.sharedData = sharedData return nil } // cleanup releases all resources allocated during init() except r.eventFD. It // must only be called if init() has previously succeeded. func (r *rx) cleanup() { a, b := r.q.Bytes() unix.Munmap(a) unix.Munmap(b) unix.Munmap(r.data) unix.Munmap(r.sharedData) } // notify writes to the tx.eventFD to indicate to the peer that there is data to // be read. func (r *rx) notify() { r.eventFD.Notify() } // postAndReceive posts the provided buffers (if any), and then tries to read // from the receive queue. // // Capacity permitting, it reuses the posted buffer slice to store the buffers // that were read as well. // // This function will block if there aren't any available packets. func (r *rx) postAndReceive(b []queue.RxBuffer, stopRequested *atomicbitops.Uint32) ([]queue.RxBuffer, uint32) { // Post the buffers first. If we cannot post, sleep until we can. We // never post more than will fit concurrently, so it's safe to wait // until enough room is available. if len(b) != 0 && !r.q.PostBuffers(b) { r.q.EnableNotification() for !r.q.PostBuffers(b) { r.eventFD.Wait() if stopRequested.Load() != 0 { r.q.DisableNotification() return nil, 0 } } r.q.DisableNotification() } // Read the next set of descriptors. b, n := r.q.Dequeue(b[:0]) if len(b) != 0 { return b, n } // Data isn't immediately available. Enable eventfd notifications. r.q.EnableNotification() for { b, n = r.q.Dequeue(b) if len(b) != 0 { break } // Wait for notification. r.eventFD.Wait() if stopRequested.Load() != 0 { r.q.DisableNotification() return nil, 0 } } r.q.DisableNotification() return b, n } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/server_rx.go000066400000000000000000000110251465435605700260120ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
//go:build linux // +build linux package sharedmem import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue" ) type serverRx struct { // packetPipe represents the receive end of the pipe that carries the packet // descriptors sent by the client. packetPipe pipe.Rx // completionPipe represents the transmit end of the pipe that will carry // completion notifications from the server to the client. completionPipe pipe.Tx // data represents the buffer area where the packet payload is held. data []byte // eventFD is used to notify the peer when transmission is completed. eventFD eventfd.Eventfd // sharedData the memory region to use to enable/disable notifications. sharedData []byte // sharedEventFDState is the memory region in sharedData used to enable // disable notifications on eventFD. sharedEventFDState *atomicbitops.Uint32 } // init initializes all state needed by the serverTx queue based on the // information provided. // // The caller always retains ownership of all file descriptors passed in. The // queue implementation will duplicate any that it may need in the future. func (s *serverRx) init(c *QueueConfig) error { // Map in all buffers. packetPipeMem, err := getBuffer(c.TxPipeFD) if err != nil { return err } cu := cleanup.Make(func() { unix.Munmap(packetPipeMem) }) defer cu.Clean() completionPipeMem, err := getBuffer(c.RxPipeFD) if err != nil { return err } cu.Add(func() { unix.Munmap(completionPipeMem) }) data, err := getBuffer(c.DataFD) if err != nil { return err } cu.Add(func() { unix.Munmap(data) }) sharedData, err := getBuffer(c.SharedDataFD) if err != nil { return err } cu.Add(func() { unix.Munmap(sharedData) }) // Duplicate the eventFD so that caller can close it but we can still // use it. efd, err := c.EventFD.Dup() if err != nil { return err } cu.Add(func() { efd.Close() }) s.packetPipe.Init(packetPipeMem) s.completionPipe.Init(completionPipeMem) s.data = data s.eventFD = efd s.sharedData = sharedData s.sharedEventFDState = sharedDataPointer(sharedData) cu.Release() return nil } func (s *serverRx) cleanup() { unix.Munmap(s.packetPipe.Bytes()) unix.Munmap(s.completionPipe.Bytes()) unix.Munmap(s.data) unix.Munmap(s.sharedData) s.eventFD.Close() } // EnableNotification updates the shared state such that the peer will notify // the eventfd when there are packets to be dequeued. func (s *serverRx) EnableNotification() { s.sharedEventFDState.Store(queue.EventFDEnabled) } // DisableNotification updates the shared state such that the peer will not // notify the eventfd. func (s *serverRx) DisableNotification() { s.sharedEventFDState.Store(queue.EventFDDisabled) } // completionNotificationSize is size in bytes of a completion notification sent // on the completion queue after a transmitted packet has been handled. const completionNotificationSize = 8 // receive receives a single packet from the packetPipe. 
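//
// Callers are expected to drive it in a loop together with the notification
// helpers; an editor-added sketch of such a loop, modelled on
// serverEndpoint.dispatchLoop later in this package:
//
//	for {
//		view := s.receive()
//		for view == nil {
//			s.EnableNotification()
//			// Recheck before blocking so a packet queued in between is not missed.
//			if view = s.receive(); view != nil {
//				break
//			}
//			s.waitForPackets()
//		}
//		s.DisableNotification()
//		// hand view off for processing...
//	}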
func (s *serverRx) receive() *buffer.View { desc := s.packetPipe.Pull() if desc == nil { return nil } pktInfo := queue.DecodeTxPacketHeader(desc) contents := buffer.NewView(int(pktInfo.Size)) toCopy := pktInfo.Size for i := 0; i < pktInfo.BufferCount; i++ { txBuf := queue.DecodeTxBufferHeader(desc, i) if txBuf.Size <= toCopy { contents.Write(s.data[txBuf.Offset:][:txBuf.Size]) toCopy -= txBuf.Size continue } contents.Write(s.data[txBuf.Offset:][:toCopy]) break } // Flush to let peer know that slots queued for transmission have been handled // and it's free to reuse the slots. s.packetPipe.Flush() // Encode packet completion. b := s.completionPipe.Push(completionNotificationSize) queue.EncodeTxCompletion(b, pktInfo.ID) s.completionPipe.Flush() return contents } func (s *serverRx) waitForPackets() { s.eventFD.Wait() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/server_tx.go000066400000000000000000000130561465435605700260220ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package sharedmem import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // serverTx represents the server end of the sharedmem queue and is used to send // packets to the peer in the buffers posted by the peer in the fillPipe. type serverTx struct { // fillPipe represents the receive end of the pipe that carries the RxBuffers // posted by the peer. fillPipe pipe.Rx // completionPipe represents the transmit end of the pipe that carries the // descriptors for filled RxBuffers. completionPipe pipe.Tx // data represents the buffer area where the packet payload is held. data []byte // eventFD is used to notify the peer when fill requests are fulfilled. eventFD eventfd.Eventfd // sharedData is the memory region to use to enable/disable notifications. sharedData []byte // sharedEventFDState is the memory region in sharedData used to enable/disable // notifications on eventFD. sharedEventFDState *atomicbitops.Uint32 } // init initializes all state needed by the serverTx queue based on the // information provided. // // The caller always retains ownership of all file descriptors passed in. The // queue implementation will duplicate any that it may need in the future. func (s *serverTx) init(c *QueueConfig) error { // Map in all buffers. 
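	// Note that c is the peer's QueueConfig for this queue (NewServerEndpoint
	// passes the client's RX config to the server's tx side), so the peer's
	// TxPipeFD carries the buffers it posts (our fill pipe) and its RxPipeFD
	// carries the completions we write back.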
fillPipeMem, err := getBuffer(c.TxPipeFD) if err != nil { return err } cu := cleanup.Make(func() { unix.Munmap(fillPipeMem) }) defer cu.Clean() completionPipeMem, err := getBuffer(c.RxPipeFD) if err != nil { return err } cu.Add(func() { unix.Munmap(completionPipeMem) }) data, err := getBuffer(c.DataFD) if err != nil { return err } cu.Add(func() { unix.Munmap(data) }) sharedData, err := getBuffer(c.SharedDataFD) if err != nil { return err } cu.Add(func() { unix.Munmap(sharedData) }) // Duplicate the eventFD so that caller can close it but we can still // use it. efd, err := c.EventFD.Dup() if err != nil { return err } cu.Add(func() { efd.Close() }) cu.Release() s.fillPipe.Init(fillPipeMem) s.completionPipe.Init(completionPipeMem) s.data = data s.eventFD = efd s.sharedData = sharedData s.sharedEventFDState = sharedDataPointer(sharedData) return nil } func (s *serverTx) cleanup() { unix.Munmap(s.fillPipe.Bytes()) unix.Munmap(s.completionPipe.Bytes()) unix.Munmap(s.data) unix.Munmap(s.sharedData) s.eventFD.Close() } // acquireBuffers acquires enough buffers to hold all the data in views or // returns nil if not enough buffers are currently available. func (s *serverTx) acquireBuffers(pktBuffer buffer.Buffer, buffers []queue.RxBuffer) (acquiredBuffers []queue.RxBuffer) { acquiredBuffers = buffers[:0] wantBytes := int(pktBuffer.Size()) for wantBytes > 0 { var b []byte if b = s.fillPipe.Pull(); b == nil { s.fillPipe.Abort() return nil } rxBuffer := queue.DecodeRxBufferHeader(b) acquiredBuffers = append(acquiredBuffers, rxBuffer) wantBytes -= int(rxBuffer.Size) } return acquiredBuffers } // fillPacket copies the data in the provided views into buffers pulled from the // fillPipe and returns a slice of RxBuffers that contain the copied data as // well as the total number of bytes copied. // // To avoid allocations the filledBuffers are appended to the buffers slice // which will be grown as required. This method takes ownership of pktBuffer. func (s *serverTx) fillPacket(pktBuffer buffer.Buffer, buffers []queue.RxBuffer) (filledBuffers []queue.RxBuffer, totalCopied uint32) { bufs := s.acquireBuffers(pktBuffer, buffers) if bufs == nil { pktBuffer.Release() return nil, 0 } br := pktBuffer.AsBufferReader() defer br.Close() for i := 0; br.Len() > 0 && i < len(bufs); i++ { buf := bufs[i] copied, err := br.Read(s.data[buf.Offset:][:buf.Size]) buf.Size = uint32(copied) // Copy the packet into the posted buffer. totalCopied += bufs[i].Size if err != nil { return bufs, totalCopied } } return bufs, totalCopied } func (s *serverTx) transmit(pkt *stack.PacketBuffer) bool { buffers := make([]queue.RxBuffer, 8) buffers, totalCopied := s.fillPacket(pkt.ToBuffer(), buffers) if totalCopied == 0 { // drop the packet as not enough buffers were probably available // to send. return false } b := s.completionPipe.Push(queue.RxCompletionSize(len(buffers))) if b == nil { return false } queue.EncodeRxCompletion(b, totalCopied, 0 /* reserved */) for i := 0; i < len(buffers); i++ { queue.EncodeRxCompletionBuffer(b, i, buffers[i]) } s.completionPipe.Flush() s.fillPipe.Flush() return true } func (s *serverTx) notificationsEnabled() bool { // notifications are considered to be enabled unless explicitly disabled. 
return s.sharedEventFDState.Load() != queue.EventFDDisabled } func (s *serverTx) notify() { if s.notificationsEnabled() { s.eventFD.Notify() } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/sharedmem.go000066400000000000000000000367121465435605700257500ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux // Package sharedmem provides the implementation of data-link layer endpoints // backed by shared memory. // // Shared memory endpoints can be used in the networking stack by calling New() // to create a new endpoint, and then passing it as an argument to // Stack.CreateNIC(). package sharedmem import ( "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/rawfile" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // QueueConfig holds all the file descriptors needed to describe a tx or rx // queue over shared memory. It is used when creating new shared memory // endpoints to describe tx and rx queues. // // +stateify savable type QueueConfig struct { // DataFD is a file descriptor for the file that contains the data to // be transmitted via this queue. Descriptors contain offsets within // this file. DataFD int // EventFD is a file descriptor for the event that is signaled when // data becomes available in this queue. EventFD eventfd.Eventfd // TxPipeFD is a file descriptor for the tx pipe associated with the // queue. TxPipeFD int // RxPipeFD is a file descriptor for the rx pipe associated with the // queue. RxPipeFD int // SharedDataFD is a file descriptor for the file that contains shared // state between the two ends of the queue. This data specifies, for // example, whether EventFD signaling is enabled or disabled. SharedDataFD int } // FDs returns the FDs in the QueueConfig as a slice of ints. This must // be used in conjunction with QueueConfigFromFDs to ensure the order // of FDs matches when reconstructing the config when serialized or sent // as part of control messages. func (q *QueueConfig) FDs() []int { return []int{q.DataFD, q.EventFD.FD(), q.TxPipeFD, q.RxPipeFD, q.SharedDataFD} } // QueueConfigFromFDs constructs a QueueConfig out of a slice of ints where each // entry represents a file descriptor. The order of FDs in the slice must be in // the order specified below for the config to be valid. QueueConfig.FDs() // should be used when the config needs to be serialized or sent as part of a // control message to ensure the correct order.
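//
// Editor-added round-trip sketch (illustrative only; the transport of the fds,
// e.g. a unix-domain socket SCM_RIGHTS message, is an assumption and not part
// of this package):
//
//	fds := cfg.FDs() // serialize on one side
//	// ... send fds to the peer process ...
//	peerCfg, err := QueueConfigFromFDs(fds) // reconstruct on the other side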
func QueueConfigFromFDs(fds []int) (QueueConfig, error) { if len(fds) != 5 { return QueueConfig{}, fmt.Errorf("insufficient number of fds: len(fds): %d, want: 5", len(fds)) } return QueueConfig{ DataFD: fds[0], EventFD: eventfd.Wrap(fds[1]), TxPipeFD: fds[2], RxPipeFD: fds[3], SharedDataFD: fds[4], }, nil } // Options specify the details about the sharedmem endpoint to be created. // // +stateify savable type Options struct { // MTU is the mtu to use for this endpoint. MTU uint32 // BufferSize is the size of each scatter/gather buffer that will hold packet // data. // // NOTE: This directly determines number of packets that can be held in // the ring buffer at any time. This does not have to be sized to the MTU as // the shared memory queue design allows usage of more than one buffer to be // used to make up a given packet. BufferSize uint32 // LinkAddress is the link address for this endpoint (required). LinkAddress tcpip.LinkAddress // TX is the transmit queue configuration for this shared memory endpoint. TX QueueConfig // RX is the receive queue configuration for this shared memory endpoint. RX QueueConfig // PeerFD is the fd for the connected peer which can be used to detect // peer disconnects. PeerFD int // OnClosed is a function that is called when the endpoint is being closed // (probably due to peer going away) OnClosed func(err tcpip.Error) // TXChecksumOffload if true, indicates that this endpoints capability // set should include CapabilityTXChecksumOffload. TXChecksumOffload bool // RXChecksumOffload if true, indicates that this endpoints capability // set should include CapabilityRXChecksumOffload. RXChecksumOffload bool // VirtioNetHeaderRequired if true, indicates that all outbound packets should have // a virtio header and inbound packets should have a virtio header as well. VirtioNetHeaderRequired bool // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is // disabled. Note that only gVisor GSO is supported, not host GSO. GSOMaxSize uint32 } var _ stack.LinkEndpoint = (*endpoint)(nil) var _ stack.GSOEndpoint = (*endpoint)(nil) // +stateify savable type endpoint struct { // bufferSize is the size of each individual buffer. // bufferSize is immutable. bufferSize uint32 // peerFD is an fd to the peer that can be used to detect when the // peer is gone. // peerFD is immutable. peerFD int // caps holds the endpoint capabilities. caps stack.LinkEndpointCapabilities // hdrSize is the size of the link layer header if any. // hdrSize is immutable. hdrSize uint32 // gSOMaxSize is the maximum GSO packet size. It is zero if GSO is // disabled. Note that only gVisor GSO is supported, not host GSO. // gsoMaxSize is immutable. gsoMaxSize uint32 // virtioNetHeaderRequired if true indicates that a virtio header is expected // in all inbound/outbound packets. virtioNetHeaderRequired bool // rx is the receive queue. rx rx // stopRequested determines whether the worker goroutines should stop. stopRequested atomicbitops.Uint32 // Wait group used to indicate that all workers have stopped. completed sync.WaitGroup // onClosed is a function to be called when the FD's peer (if any) closes // its end of the communication pipe. // TODO(b/341946753): Restore when netstack is savable. onClosed func(tcpip.Error) `state:"nosave"` // mu protects the following fields. mu sync.RWMutex `state:"nosave"` // tx is the transmit queue. // +checklocks:mu tx tx // workerStarted specifies whether the worker goroutine was started. 
// +checklocks:mu workerStarted bool // addr is the local address of this endpoint. // // +checklocks:mu addr tcpip.LinkAddress // mtu (maximum transmission unit) is the maximum size of a packet. // +checklocks:mu mtu uint32 } // New creates a new shared-memory-based endpoint. Buffers will be broken up // into buffers of "bufferSize" bytes. // // In order to release all resources held by the returned endpoint, Close() // must be called followed by Wait(). func New(opts Options) (stack.LinkEndpoint, error) { e := &endpoint{ mtu: opts.MTU, bufferSize: opts.BufferSize, addr: opts.LinkAddress, peerFD: opts.PeerFD, onClosed: opts.OnClosed, virtioNetHeaderRequired: opts.VirtioNetHeaderRequired, gsoMaxSize: opts.GSOMaxSize, } if err := e.tx.init(opts.BufferSize, &opts.TX); err != nil { return nil, err } if err := e.rx.init(opts.BufferSize, &opts.RX); err != nil { e.tx.cleanup() return nil, err } e.caps = stack.LinkEndpointCapabilities(0) if opts.RXChecksumOffload { e.caps |= stack.CapabilityRXChecksumOffload } if opts.TXChecksumOffload { e.caps |= stack.CapabilityTXChecksumOffload } if opts.LinkAddress != "" { e.hdrSize = header.EthernetMinimumSize e.caps |= stack.CapabilityResolutionRequired } if opts.VirtioNetHeaderRequired { e.hdrSize += header.VirtioNetHeaderSize } return e, nil } // SetOnCloseAction implements stack.LinkEndpoint.SetOnCloseAction. func (e *endpoint) SetOnCloseAction(func()) {} // Close frees most resources associated with the endpoint. Wait() must be // called after Close() in order to free the rest. func (e *endpoint) Close() { // Tell dispatch goroutine to stop, then write to the eventfd so that // it wakes up in case it's sleeping. if e.stopRequested.Swap(1) == 1 { // It is already closed. return } e.rx.eventFD.Notify() // Cleanup the queues inline if the worker hasn't started yet; we also // know it won't start from now on because stopRequested is set to 1. e.mu.Lock() defer e.mu.Unlock() workerPresent := e.workerStarted if !workerPresent { e.tx.cleanup() e.rx.cleanup() } } // Wait implements stack.LinkEndpoint.Wait. It waits until all workers have // stopped after a Close() call. func (e *endpoint) Wait() { e.completed.Wait() e.rx.eventFD.Close() } // Attach implements stack.LinkEndpoint.Attach. It launches the goroutine that // reads packets from the rx queue. func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { if dispatcher == nil { e.Close() return } e.mu.Lock() if !e.workerStarted && e.stopRequested.Load() == 0 { e.workerStarted = true e.completed.Add(1) // Spin up a goroutine to monitor for peer shutdown. if e.peerFD >= 0 { e.completed.Add(1) go func() { defer e.completed.Done() b := make([]byte, 1) // When sharedmem endpoint is in use the peerFD is never used for any data // transfer and this Read should only return if the peer is shutting down. _, errno := rawfile.BlockingRead(e.peerFD, b) if e.onClosed != nil { if errno == 0 { e.onClosed(nil) } else { e.onClosed(tcpip.TranslateErrno(errno)) } } }() } // Link endpoints are not savable. When transportation endpoints // are saved, they stop sending outgoing packets and all // incoming packets are rejected. go e.dispatchLoop(dispatcher) // S/R-SAFE: see above. } e.mu.Unlock() } // IsAttached implements stack.LinkEndpoint.IsAttached. func (e *endpoint) IsAttached() bool { e.mu.Lock() defer e.mu.Unlock() return e.workerStarted } // MTU implements stack.LinkEndpoint.MTU. 
func (e *endpoint) MTU() uint32 { e.mu.RLock() defer e.mu.RUnlock() return e.mtu } func (e *endpoint) SetMTU(mtu uint32) { e.mu.Lock() defer e.mu.Unlock() e.mtu = mtu } // Capabilities implements stack.LinkEndpoint.Capabilities. func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { return e.caps } // MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. It returns the // ethernet frame header size. func (e *endpoint) MaxHeaderLength() uint16 { return uint16(e.hdrSize) } // LinkAddress implements stack.LinkEndpoint.LinkAddress. It returns the local // link address. func (e *endpoint) LinkAddress() tcpip.LinkAddress { e.mu.RLock() defer e.mu.RUnlock() return e.addr } // SetLinkAddress implements stack.LinkEndpoint.SetLinkAddress. func (e *endpoint) SetLinkAddress(addr tcpip.LinkAddress) { e.mu.Lock() defer e.mu.Unlock() e.addr = addr } // AddHeader implements stack.LinkEndpoint.AddHeader. func (e *endpoint) AddHeader(pkt *stack.PacketBuffer) { e.mu.RLock() defer e.mu.RUnlock() // Add ethernet header if needed. if len(e.addr) == 0 { return } eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) eth.Encode(&header.EthernetFields{ SrcAddr: pkt.EgressRoute.LocalLinkAddress, DstAddr: pkt.EgressRoute.RemoteLinkAddress, Type: pkt.NetworkProtocolNumber, }) } func (e *endpoint) parseHeader(pkt *stack.PacketBuffer) bool { _, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) return ok } // ParseHeader implements stack.LinkEndpoint.ParseHeader. func (e *endpoint) ParseHeader(pkt *stack.PacketBuffer) bool { e.mu.RLock() defer e.mu.RUnlock() // Add ethernet header if needed. if len(e.addr) == 0 { return true } return e.parseHeader(pkt) } func (e *endpoint) AddVirtioNetHeader(pkt *stack.PacketBuffer) { virtio := header.VirtioNetHeader(pkt.VirtioNetHeader().Push(header.VirtioNetHeaderSize)) virtio.Encode(&header.VirtioNetHeaderFields{}) } // +checklocks:e.mu func (e *endpoint) writePacketLocked(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { if e.virtioNetHeaderRequired { e.AddVirtioNetHeader(pkt) } // Transmit the packet. b := pkt.ToBuffer() defer b.Release() ok := e.tx.transmit(b) if !ok { return &tcpip.ErrWouldBlock{} } return nil } // WritePackets implements stack.LinkEndpoint.WritePackets. func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { n := 0 var err tcpip.Error e.mu.Lock() defer e.mu.Unlock() for _, pkt := range pkts.AsSlice() { if err = e.writePacketLocked(pkt.EgressRoute, pkt.NetworkProtocolNumber, pkt); err != nil { break } n++ } // WritePackets never returns an error if it successfully transmitted at least // one packet. if err != nil && n == 0 { return 0, err } e.tx.notify() return n, nil } // dispatchLoop reads packets from the rx queue in a loop and dispatches them // to the network stack. func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) { // Post initial set of buffers. limit := e.rx.q.PostedBuffersLimit() if l := uint64(len(e.rx.data)) / uint64(e.bufferSize); limit > l { limit = l } for i := uint64(0); i < limit; i++ { b := queue.RxBuffer{ Offset: i * uint64(e.bufferSize), Size: e.bufferSize, ID: i, } if !e.rx.q.PostBuffers([]queue.RxBuffer{b}) { log.Warningf("Unable to post %v-th buffer", i) } } // Read in a loop until a stop is requested. var rxb []queue.RxBuffer for e.stopRequested.Load() == 0 { var n uint32 rxb, n = e.rx.postAndReceive(rxb, &e.stopRequested) // Copy data from the shared area to its own buffer, then // prepare to repost the buffer. 
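		// The copy is required because the posted buffers remain part of the
		// shared data region and are made available to the peer again on the
		// next postAndReceive call (rxb[i].Size is reset below), so the packet
		// must not keep referencing e.rx.data after this iteration.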
v := buffer.NewView(int(n)) v.Grow(int(n)) offset := uint32(0) for i := range rxb { v.WriteAt(e.rx.data[rxb[i].Offset:][:rxb[i].Size], int(offset)) offset += rxb[i].Size rxb[i].Size = e.bufferSize } pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: buffer.MakeWithView(v), }) if e.virtioNetHeaderRequired { _, ok := pkt.VirtioNetHeader().Consume(header.VirtioNetHeaderSize) if !ok { pkt.DecRef() continue } } var proto tcpip.NetworkProtocolNumber e.mu.RLock() addrLen := len(e.addr) e.mu.RUnlock() if addrLen != 0 { if !e.parseHeader(pkt) { pkt.DecRef() continue } proto = header.Ethernet(pkt.LinkHeader().Slice()).Type() } else { // We don't get any indication of what the packet is, so try to guess // if it's an IPv4 or IPv6 packet. // IP version information is at the first octet, so pulling up 1 byte. h, ok := pkt.Data().PullUp(1) if !ok { pkt.DecRef() continue } switch header.IPVersion(h) { case header.IPv4Version: proto = header.IPv4ProtocolNumber case header.IPv6Version: proto = header.IPv6ProtocolNumber default: pkt.DecRef() continue } } // Send packet up the stack. d.DeliverNetworkPacket(proto, pkt) pkt.DecRef() } e.mu.Lock() defer e.mu.Unlock() // Clean state. e.tx.cleanup() e.rx.cleanup() e.completed.Done() } // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType func (*endpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareEther } // GSOMaxSize implements stack.GSOEndpoint. func (e *endpoint) GSOMaxSize() uint32 { return e.gsoMaxSize } // SupportsGSO implements stack.GSOEndpoint. func (e *endpoint) SupportedGSO() stack.SupportedGSO { return stack.GVisorGSOSupported } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/sharedmem_server.go000066400000000000000000000250201465435605700273260ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package sharedmem import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/rawfile" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) type serverEndpoint struct { // bufferSize is the size of each individual buffer. // bufferSize is immutable. bufferSize uint32 // rx is the receive queue. rx serverRx // stopRequested determines whether the worker goroutines should stop. stopRequested atomicbitops.Uint32 // Wait group used to indicate that all workers have stopped. completed sync.WaitGroup // peerFD is an fd to the peer that can be used to detect when the peer is // gone. // peerFD is immutable. peerFD int // caps holds the endpoint capabilities. caps stack.LinkEndpointCapabilities // hdrSize is the size of the link layer header if any. // hdrSize is immutable. hdrSize uint32 // virtioNetHeaderRequired if true indicates that a virtio header is expected // in all inbound/outbound packets. 
virtioNetHeaderRequired bool // onClosed is a function to be called when the FD's peer (if any) closes its // end of the communication pipe. onClosed func(tcpip.Error) // mu protects the following fields. mu sync.RWMutex // tx is the transmit queue. // +checklocks:mu tx serverTx // workerStarted specifies whether the worker goroutine was started. // +checklocks:mu workerStarted bool // addr is the local address of this endpoint. // // +checklocks:mu addr tcpip.LinkAddress // mtu (maximum transmission unit) is the maximum size of a packet. // +checklocks:mu mtu uint32 } // NewServerEndpoint creates a new shared-memory-based endpoint. Buffers will be // broken up into buffers of "bufferSize" bytes. func NewServerEndpoint(opts Options) (stack.LinkEndpoint, error) { e := &serverEndpoint{ mtu: opts.MTU, bufferSize: opts.BufferSize, addr: opts.LinkAddress, peerFD: opts.PeerFD, onClosed: opts.OnClosed, } if err := e.tx.init(&opts.RX); err != nil { return nil, err } if err := e.rx.init(&opts.TX); err != nil { e.tx.cleanup() return nil, err } e.caps = stack.LinkEndpointCapabilities(0) if opts.RXChecksumOffload { e.caps |= stack.CapabilityRXChecksumOffload } if opts.TXChecksumOffload { e.caps |= stack.CapabilityTXChecksumOffload } if opts.LinkAddress != "" { e.hdrSize = header.EthernetMinimumSize e.caps |= stack.CapabilityResolutionRequired } return e, nil } // SetOnCloseAction implements stack.LinkEndpoint.SetOnCloseAction. func (*serverEndpoint) SetOnCloseAction(func()) {} // Close frees all resources associated with the endpoint. func (e *serverEndpoint) Close() { // Tell dispatch goroutine to stop, then write to the eventfd so that it wakes // up in case it's sleeping. e.stopRequested.Store(1) e.rx.eventFD.Notify() // Cleanup the queues inline if the worker hasn't started yet; we also know it // won't start from now on because stopRequested is set to 1. e.mu.Lock() defer e.mu.Unlock() workerPresent := e.workerStarted if !workerPresent { e.tx.cleanup() e.rx.cleanup() } } // Wait implements stack.LinkEndpoint.Wait. It waits until all workers have // stopped after a Close() call. func (e *serverEndpoint) Wait() { e.completed.Wait() } // Attach implements stack.LinkEndpoint.Attach. It launches the goroutine that // reads packets from the rx queue. func (e *serverEndpoint) Attach(dispatcher stack.NetworkDispatcher) { e.mu.Lock() if !e.workerStarted && e.stopRequested.Load() == 0 { e.workerStarted = true e.completed.Add(1) if e.peerFD >= 0 { e.completed.Add(1) // Spin up a goroutine to monitor for peer shutdown. go func() { b := make([]byte, 1) // When sharedmem endpoint is in use the peerFD is never used for any // data transfer and this Read should only return if the peer is // shutting down. _, errno := rawfile.BlockingRead(e.peerFD, b) if e.onClosed != nil { if errno == 0 { e.onClosed(nil) } else { e.onClosed(tcpip.TranslateErrno(errno)) } } e.completed.Done() }() } // Link endpoints are not savable. When transportation endpoints are saved, // they stop sending outgoing packets and all incoming packets are rejected. go e.dispatchLoop(dispatcher) // S/R-SAFE: see above. } e.mu.Unlock() } // IsAttached implements stack.LinkEndpoint.IsAttached. func (e *serverEndpoint) IsAttached() bool { e.mu.Lock() defer e.mu.Unlock() return e.workerStarted } // MTU implements stack.LinkEndpoint.MTU. 
func (e *serverEndpoint) MTU() uint32 { e.mu.RLock() defer e.mu.RUnlock() return e.mtu } func (e *serverEndpoint) SetMTU(mtu uint32) { e.mu.Lock() defer e.mu.Unlock() e.mtu = mtu } // Capabilities implements stack.LinkEndpoint.Capabilities. func (e *serverEndpoint) Capabilities() stack.LinkEndpointCapabilities { return e.caps } // MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. It returns the // ethernet frame header size. func (e *serverEndpoint) MaxHeaderLength() uint16 { return uint16(e.hdrSize) } // LinkAddress implements stack.LinkEndpoint.LinkAddress. It returns the local // link address. func (e *serverEndpoint) LinkAddress() tcpip.LinkAddress { e.mu.RLock() defer e.mu.RUnlock() return e.addr } // SetLinkAddress implements stack.LinkEndpoint.SetLinkAddress. func (e *serverEndpoint) SetLinkAddress(addr tcpip.LinkAddress) { e.mu.Lock() defer e.mu.Unlock() e.addr = addr } // AddHeader implements stack.LinkEndpoint.AddHeader. func (e *serverEndpoint) AddHeader(pkt *stack.PacketBuffer) { e.mu.RLock() defer e.mu.RUnlock() // Add ethernet header if needed. if len(e.addr) == 0 { return } eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) eth.Encode(&header.EthernetFields{ SrcAddr: pkt.EgressRoute.LocalLinkAddress, DstAddr: pkt.EgressRoute.RemoteLinkAddress, Type: pkt.NetworkProtocolNumber, }) } func (e *serverEndpoint) parseHeader(pkt *stack.PacketBuffer) bool { _, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) return ok } // ParseHeader implements stack.LinkEndpoint.ParseHeader. func (e *serverEndpoint) ParseHeader(pkt *stack.PacketBuffer) bool { e.mu.RLock() defer e.mu.RUnlock() // Add ethernet header if needed. if len(e.addr) == 0 { return true } return e.parseHeader(pkt) } func (e *serverEndpoint) AddVirtioNetHeader(pkt *stack.PacketBuffer) { virtio := header.VirtioNetHeader(pkt.VirtioNetHeader().Push(header.VirtioNetHeaderSize)) virtio.Encode(&header.VirtioNetHeaderFields{}) } // +checklocks:e.mu func (e *serverEndpoint) writePacketLocked(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { if e.virtioNetHeaderRequired { e.AddVirtioNetHeader(pkt) } ok := e.tx.transmit(pkt) if !ok { return &tcpip.ErrWouldBlock{} } return nil } // WritePacket writes outbound packets to the file descriptor. If it is not // currently writable, the packet is dropped. // WritePacket implements stack.LinkEndpoint.WritePacket. func (e *serverEndpoint) WritePacket(_ stack.RouteInfo, _ tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { // Transmit the packet. e.mu.Lock() defer e.mu.Unlock() if err := e.writePacketLocked(pkt.EgressRoute, pkt.NetworkProtocolNumber, pkt); err != nil { return err } e.tx.notify() return nil } // WritePackets implements stack.LinkEndpoint.WritePackets. func (e *serverEndpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { n := 0 var err tcpip.Error e.mu.Lock() defer e.mu.Unlock() for _, pkt := range pkts.AsSlice() { if err = e.writePacketLocked(pkt.EgressRoute, pkt.NetworkProtocolNumber, pkt); err != nil { break } n++ } // WritePackets never returns an error if it successfully transmitted at least // one packet. if err != nil && n == 0 { return 0, err } e.tx.notify() return n, nil } // dispatchLoop reads packets from the rx queue in a loop and dispatches them // to the network stack. 
func (e *serverEndpoint) dispatchLoop(d stack.NetworkDispatcher) { for e.stopRequested.Load() == 0 { b := e.rx.receive() if b == nil { e.rx.EnableNotification() // Now pull again to make sure we didn't receive any packets // while notifications were not enabled. for { b = e.rx.receive() if b != nil { // Disable notifications as we only need to be notified when we are going // to block on eventFD. This should prevent the peer from needlessly // writing to eventFD when this end is already awake and processing // packets. e.rx.DisableNotification() break } e.rx.waitForPackets() } } pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: buffer.MakeWithView(b), }) if e.virtioNetHeaderRequired { _, ok := pkt.VirtioNetHeader().Consume(header.VirtioNetHeaderSize) if !ok { pkt.DecRef() continue } } var proto tcpip.NetworkProtocolNumber e.mu.RLock() addrLen := len(e.addr) e.mu.RUnlock() if addrLen != 0 { if !e.parseHeader(pkt) { pkt.DecRef() continue } proto = header.Ethernet(pkt.LinkHeader().Slice()).Type() } else { // We don't get any indication of what the packet is, so try to guess // if it's an IPv4 or IPv6 packet. // IP version information is at the first octet, so pulling up 1 byte. h, ok := pkt.Data().PullUp(1) if !ok { pkt.DecRef() continue } switch header.IPVersion(h) { case header.IPv4Version: proto = header.IPv4ProtocolNumber case header.IPv6Version: proto = header.IPv6ProtocolNumber default: pkt.DecRef() continue } } // Send packet up the stack. d.DeliverNetworkPacket(proto, pkt) pkt.DecRef() } e.mu.Lock() defer e.mu.Unlock() // Clean state. e.tx.cleanup() e.rx.cleanup() e.completed.Done() } // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType func (e *serverEndpoint) ARPHardwareType() header.ARPHardwareType { if e.hdrSize > 0 { return header.ARPHardwareEther } return header.ARPHardwareNone } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/sharedmem_state_autogen.go000066400000000000000000000105561465435605700306720ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build linux && linux && linux && linux && linux && linux // +build linux,linux,linux,linux,linux,linux package sharedmem import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (q *QueueConfig) StateTypeName() string { return "pkg/tcpip/link/sharedmem.QueueConfig" } func (q *QueueConfig) StateFields() []string { return []string{ "DataFD", "EventFD", "TxPipeFD", "RxPipeFD", "SharedDataFD", } } func (q *QueueConfig) beforeSave() {} // +checklocksignore func (q *QueueConfig) StateSave(stateSinkObject state.Sink) { q.beforeSave() stateSinkObject.Save(0, &q.DataFD) stateSinkObject.Save(1, &q.EventFD) stateSinkObject.Save(2, &q.TxPipeFD) stateSinkObject.Save(3, &q.RxPipeFD) stateSinkObject.Save(4, &q.SharedDataFD) } func (q *QueueConfig) afterLoad(context.Context) {} // +checklocksignore func (q *QueueConfig) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &q.DataFD) stateSourceObject.Load(1, &q.EventFD) stateSourceObject.Load(2, &q.TxPipeFD) stateSourceObject.Load(3, &q.RxPipeFD) stateSourceObject.Load(4, &q.SharedDataFD) } func (o *Options) StateTypeName() string { return "pkg/tcpip/link/sharedmem.Options" } func (o *Options) StateFields() []string { return []string{ "MTU", "BufferSize", "LinkAddress", "TX", "RX", "PeerFD", "OnClosed", "TXChecksumOffload", "RXChecksumOffload", "VirtioNetHeaderRequired", "GSOMaxSize", } } func (o *Options) beforeSave() {} // +checklocksignore func (o *Options) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.MTU) stateSinkObject.Save(1, &o.BufferSize) stateSinkObject.Save(2, &o.LinkAddress) stateSinkObject.Save(3, &o.TX) stateSinkObject.Save(4, &o.RX) stateSinkObject.Save(5, &o.PeerFD) stateSinkObject.Save(6, &o.OnClosed) stateSinkObject.Save(7, &o.TXChecksumOffload) stateSinkObject.Save(8, &o.RXChecksumOffload) stateSinkObject.Save(9, &o.VirtioNetHeaderRequired) stateSinkObject.Save(10, &o.GSOMaxSize) } func (o *Options) afterLoad(context.Context) {} // +checklocksignore func (o *Options) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.MTU) stateSourceObject.Load(1, &o.BufferSize) stateSourceObject.Load(2, &o.LinkAddress) stateSourceObject.Load(3, &o.TX) stateSourceObject.Load(4, &o.RX) stateSourceObject.Load(5, &o.PeerFD) stateSourceObject.Load(6, &o.OnClosed) stateSourceObject.Load(7, &o.TXChecksumOffload) stateSourceObject.Load(8, &o.RXChecksumOffload) stateSourceObject.Load(9, &o.VirtioNetHeaderRequired) stateSourceObject.Load(10, &o.GSOMaxSize) } func (e *endpoint) StateTypeName() string { return "pkg/tcpip/link/sharedmem.endpoint" } func (e *endpoint) StateFields() []string { return []string{ "bufferSize", "peerFD", "caps", "hdrSize", "gsoMaxSize", "virtioNetHeaderRequired", "rx", "stopRequested", "completed", "tx", "workerStarted", "addr", "mtu", } } func (e *endpoint) beforeSave() {} // +checklocksignore func (e *endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.bufferSize) stateSinkObject.Save(1, &e.peerFD) stateSinkObject.Save(2, &e.caps) stateSinkObject.Save(3, &e.hdrSize) stateSinkObject.Save(4, &e.gsoMaxSize) stateSinkObject.Save(5, &e.virtioNetHeaderRequired) stateSinkObject.Save(6, &e.rx) stateSinkObject.Save(7, &e.stopRequested) stateSinkObject.Save(8, &e.completed) stateSinkObject.Save(9, &e.tx) stateSinkObject.Save(10, &e.workerStarted) stateSinkObject.Save(11, &e.addr) stateSinkObject.Save(12, &e.mtu) } func (e *endpoint) afterLoad(context.Context) {} // +checklocksignore 
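// The following sketch is illustrative only and is not code generated by
// stateify. It shows how the Options and QueueConfig types whose state methods
// appear in this file might be used, from client code, to construct a
// server-side shared-memory endpoint with NewServerEndpoint. Every identifier
// below is a placeholder: the *FD values and eventfds are assumed to have been
// exchanged with the peer beforehand (for example over a Unix domain socket).
//
//	txCfg := sharedmem.QueueConfig{DataFD: txDataFD, EventFD: txEventFD, TxPipeFD: txTxPipeFD, RxPipeFD: txRxPipeFD, SharedDataFD: txSharedFD}
//	rxCfg := sharedmem.QueueConfig{DataFD: rxDataFD, EventFD: rxEventFD, TxPipeFD: rxTxPipeFD, RxPipeFD: rxRxPipeFD, SharedDataFD: rxSharedFD}
//	ep, err := sharedmem.NewServerEndpoint(sharedmem.Options{
//		MTU:         1500,
//		BufferSize:  4096,
//		LinkAddress: tcpip.LinkAddress("\x02\x03\x04\x05\x06\x07"),
//		TX:          txCfg,
//		RX:          rxCfg,
//		PeerFD:      peerFD,
//	})
//	if err != nil {
//		// handle the error
//	}
//	// The returned stack.LinkEndpoint is then passed to Stack.CreateNIC.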
func (e *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.bufferSize) stateSourceObject.Load(1, &e.peerFD) stateSourceObject.Load(2, &e.caps) stateSourceObject.Load(3, &e.hdrSize) stateSourceObject.Load(4, &e.gsoMaxSize) stateSourceObject.Load(5, &e.virtioNetHeaderRequired) stateSourceObject.Load(6, &e.rx) stateSourceObject.Load(7, &e.stopRequested) stateSourceObject.Load(8, &e.completed) stateSourceObject.Load(9, &e.tx) stateSourceObject.Load(10, &e.workerStarted) stateSourceObject.Load(11, &e.addr) stateSourceObject.Load(12, &e.mtu) } func init() { state.Register((*QueueConfig)(nil)) state.Register((*Options)(nil)) state.Register((*endpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go000066400000000000000000000033721465435605700273070ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sharedmem import ( "fmt" "reflect" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/memutil" ) // sharedDataPointer converts the shared data slice into a pointer so that it // can be used in atomic operations. func sharedDataPointer(sharedData []byte) *atomicbitops.Uint32 { return (*atomicbitops.Uint32)(unsafe.Pointer(&sharedData[0:4][0])) } // getBuffer returns a memory region mapped to the full contents of the given // file descriptor. func getBuffer(fd int) ([]byte, error) { var s unix.Stat_t if err := unix.Fstat(fd, &s); err != nil { return nil, err } // Check that size doesn't overflow an int. if s.Size > int64(^uint(0)>>1) { return nil, unix.EDOM } addr, err := memutil.MapFile(0 /* addr */, uintptr(s.Size), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED|unix.MAP_FILE, uintptr(fd), 0 /*offset*/) if err != nil { return nil, fmt.Errorf("failed to map memory for buffer fd: %d, error: %s", fd, err) } // Use unsafe to convert addr into a []byte. var b []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&b)) hdr.Data = addr hdr.Len = int(s.Size) hdr.Cap = int(s.Size) return b, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/sharedmem_unsafe_state_autogen.go000066400000000000000000000000731465435605700322240ustar00rootroot00000000000000// automatically generated by stateify. package sharedmem golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sharedmem/tx.go000066400000000000000000000151751465435605700244400ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package sharedmem import ( "math" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue" ) const ( nilID = math.MaxUint64 ) // tx holds all state associated with a tx queue. type tx struct { data []byte q queue.Tx ids idManager bufs bufferManager eventFD eventfd.Eventfd sharedData []byte sharedDataFD int } // init initializes all state needed by the tx queue based on the information // provided. // // The caller always retains ownership of all file descriptors passed in. The // queue implementation will duplicate any that it may need in the future. func (t *tx) init(bufferSize uint32, c *QueueConfig) error { // Map in all buffers. txPipe, err := getBuffer(c.TxPipeFD) if err != nil { return err } rxPipe, err := getBuffer(c.RxPipeFD) if err != nil { unix.Munmap(txPipe) return err } data, err := getBuffer(c.DataFD) if err != nil { unix.Munmap(txPipe) unix.Munmap(rxPipe) return err } sharedData, err := getBuffer(c.SharedDataFD) if err != nil { unix.Munmap(txPipe) unix.Munmap(rxPipe) unix.Munmap(data) return err } // Initialize state based on buffers. t.q.Init(txPipe, rxPipe, sharedDataPointer(sharedData)) t.ids.init() t.bufs.init(0, len(data), int(bufferSize)) t.data = data t.eventFD = c.EventFD t.sharedDataFD = c.SharedDataFD t.sharedData = sharedData return nil } // cleanup releases all resources allocated during init(). It must only be // called if init() has previously succeeded. func (t *tx) cleanup() { a, b := t.q.Bytes() unix.Munmap(a) unix.Munmap(b) unix.Munmap(t.data) } // transmit sends a packet made up of the data in transmitBuf. Returns a boolean // that specifies whether the packet was successfully transmitted. func (t *tx) transmit(transmitBuf buffer.Buffer) bool { // Pull completions from the tx queue and add their buffers back to the // pool so that we can reuse them. for { id, ok := t.q.CompletedPacket() if !ok { break } if buf := t.ids.remove(id); buf != nil { t.bufs.free(buf) } } bSize := t.bufs.entrySize total := uint32(transmitBuf.Size()) bufCount := (total + bSize - 1) / bSize // Allocate enough buffers to hold all the data. var buf *queue.TxBuffer for i := bufCount; i != 0; i-- { b := t.bufs.alloc() if b == nil { // Failed to get all buffers. Return to the pool // whatever we had managed to get. if buf != nil { t.bufs.free(buf) } return false } b.Next = buf buf = b } // Copy data into allocated buffers. nBuf := buf var dBuf []byte transmitBuf.Apply(func(v *buffer.View) { for v.Size() > 0 { if len(dBuf) == 0 { dBuf = t.data[nBuf.Offset:][:nBuf.Size] nBuf = nBuf.Next } n := copy(dBuf, v.AsSlice()) v.TrimFront(n) dBuf = dBuf[n:] } }) // Get an id for this packet and send it out. id := t.ids.add(buf) if !t.q.Enqueue(id, total, bufCount, buf) { t.ids.remove(id) t.bufs.free(buf) return false } return true } // notify writes to the tx.eventFD to indicate to the peer that there is data to // be read. func (t *tx) notify() { if t.q.NotificationsEnabled() { t.eventFD.Notify() } } // idDescriptor is used by idManager to either point to a tx buffer (in case // the ID is assigned) or to the next free element (if the id is not assigned). type idDescriptor struct { buf *queue.TxBuffer nextFree uint64 } // idManager is a manager of tx buffer identifiers. It assigns unique IDs to // tx buffers that are added to it; the IDs can only be reused after they have // been removed. 
// // The ID assignments are stored so that the tx buffers can be retrieved from // the IDs previously assigned to them. type idManager struct { // ids is a slice containing all tx buffers. The ID is the index into // this slice. ids []idDescriptor // freeList a list of free IDs. freeList uint64 } // init initializes the id manager. func (m *idManager) init() { m.freeList = nilID } // add assigns an ID to the given tx buffer. func (m *idManager) add(b *queue.TxBuffer) uint64 { if i := m.freeList; i != nilID { // There is an id available in the free list, just use it. m.ids[i].buf = b m.freeList = m.ids[i].nextFree return i } // We need to expand the id descriptor. m.ids = append(m.ids, idDescriptor{buf: b}) return uint64(len(m.ids) - 1) } // remove retrieves the tx buffer associated with the given ID, and removes the // ID from the assigned table so that it can be reused in the future. func (m *idManager) remove(i uint64) *queue.TxBuffer { if i >= uint64(len(m.ids)) { return nil } desc := &m.ids[i] b := desc.buf if b == nil { // The provided id is not currently assigned. return nil } desc.buf = nil desc.nextFree = m.freeList m.freeList = i return b } // bufferManager manages a buffer region broken up into smaller, equally sized // buffers. Smaller buffers can be allocated and freed. type bufferManager struct { freeList *queue.TxBuffer curOffset uint64 limit uint64 entrySize uint32 } // init initializes the buffer manager. func (b *bufferManager) init(initialOffset, size, entrySize int) { b.freeList = nil b.curOffset = uint64(initialOffset) b.limit = uint64(initialOffset + size/entrySize*entrySize) b.entrySize = uint32(entrySize) } // alloc allocates a buffer from the manager, if one is available. func (b *bufferManager) alloc() *queue.TxBuffer { if b.freeList != nil { // There is a descriptor ready for reuse in the free list. d := b.freeList b.freeList = d.Next d.Next = nil return d } if b.curOffset < b.limit { // There is room available in the never-used range, so create // a new descriptor for it. d := &queue.TxBuffer{ Offset: b.curOffset, Size: b.entrySize, } b.curOffset += uint64(b.entrySize) return d } return nil } // free returns all buffers in the list to the buffer manager so that they can // be reused. func (b *bufferManager) free(d *queue.TxBuffer) { // Find the last buffer in the list. last := d for last.Next != nil { last = last.Next } // Push list onto free list. last.Next = b.freeList b.freeList = d } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sniffer/000077500000000000000000000000001465435605700231345ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sniffer/pcap.go000066400000000000000000000041451465435605700244120ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sniffer import ( "encoding" "encoding/binary" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) type pcapHeader struct { // MagicNumber is the file magic number. 
MagicNumber uint32 // VersionMajor is the major version number. VersionMajor uint16 // VersionMinor is the minor version number. VersionMinor uint16 // Thiszone is the GMT to local correction. Thiszone int32 // Sigfigs is the accuracy of timestamps. Sigfigs uint32 // Snaplen is the max length of captured packets, in octets. Snaplen uint32 // Network is the data link type. Network uint32 } var _ encoding.BinaryMarshaler = (*pcapPacket)(nil) type pcapPacket struct { timestamp time.Time packet *stack.PacketBuffer maxCaptureLen int } func (p *pcapPacket) MarshalBinary() ([]byte, error) { pkt := trimmedClone(p.packet) defer pkt.DecRef() packetSize := pkt.Size() captureLen := p.maxCaptureLen if packetSize < captureLen { captureLen = packetSize } b := make([]byte, 16+captureLen) binary.LittleEndian.PutUint32(b[0:4], uint32(p.timestamp.Unix())) binary.LittleEndian.PutUint32(b[4:8], uint32(p.timestamp.Nanosecond()/1000)) binary.LittleEndian.PutUint32(b[8:12], uint32(captureLen)) binary.LittleEndian.PutUint32(b[12:16], uint32(packetSize)) w := tcpip.SliceWriter(b[16:]) for _, v := range pkt.AsSlices() { if captureLen == 0 { break } if len(v) > captureLen { v = v[:captureLen] } n, err := w.Write(v) if err != nil { panic(err) } captureLen -= n } return b, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sniffer/sniffer.go000066400000000000000000000303721465435605700251240ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package sniffer provides the implementation of data-link layer endpoints that // wrap another endpoint and logs inbound and outbound packets. // // Sniffer endpoints can be used in the networking stack by calling New(eID) to // create a new endpoint, where eID is the ID of the endpoint being wrapped, // and then passing it as an argument to Stack.CreateNIC(). package sniffer import ( "encoding/binary" "fmt" "io" "time" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/link/nested" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // LogPackets is a flag used to enable or disable packet logging via the log // package. Valid values are 0 or 1. var LogPackets atomicbitops.Uint32 = atomicbitops.FromUint32(1) // LogPacketsToPCAP is a flag used to enable or disable logging packets to a // pcap writer. Valid values are 0 or 1. A writer must have been specified when the // sniffer was created for this flag to have effect. var LogPacketsToPCAP atomicbitops.Uint32 = atomicbitops.FromUint32(1) // +stateify savable type endpoint struct { nested.Endpoint writer io.Writer maxPCAPLen uint32 logPrefix string } var _ stack.GSOEndpoint = (*endpoint)(nil) var _ stack.LinkEndpoint = (*endpoint)(nil) var _ stack.NetworkDispatcher = (*endpoint)(nil) // A Direction indicates whether the packing is being sent or received. type Direction int const ( // DirectionSend indicates a sent packet. 
DirectionSend = iota // DirectionRecv indicates a received packet. DirectionRecv ) // New creates a new sniffer link-layer endpoint. It wraps around another // endpoint and logs packets and they traverse the endpoint. func New(lower stack.LinkEndpoint) stack.LinkEndpoint { return NewWithPrefix(lower, "") } // NewWithPrefix creates a new sniffer link-layer endpoint. It wraps around // another endpoint and logs packets prefixed with logPrefix as they traverse // the endpoint. // // logPrefix is prepended to the log line without any separators. // E.g. logPrefix = "NIC:en0/" will produce log lines like // "NIC:en0/send udp [...]". func NewWithPrefix(lower stack.LinkEndpoint, logPrefix string) stack.LinkEndpoint { sniffer := &endpoint{logPrefix: logPrefix} sniffer.Endpoint.Init(lower, sniffer) return sniffer } func zoneOffset() (int32, error) { date := time.Date(0, 0, 0, 0, 0, 0, 0, time.Local) _, offset := date.Zone() return int32(offset), nil } func writePCAPHeader(w io.Writer, maxLen uint32) error { offset, err := zoneOffset() if err != nil { return err } return binary.Write(w, binary.LittleEndian, pcapHeader{ // From https://wiki.wireshark.org/Development/LibpcapFileFormat MagicNumber: 0xa1b2c3d4, VersionMajor: 2, VersionMinor: 4, Thiszone: offset, Sigfigs: 0, Snaplen: maxLen, Network: 101, // LINKTYPE_RAW }) } // NewWithWriter creates a new sniffer link-layer endpoint. It wraps around // another endpoint and logs packets as they traverse the endpoint. // // Each packet is written to writer in the pcap format in a single Write call // without synchronization. A sniffer created with this function will not emit // packets using the standard log package. // // snapLen is the maximum amount of a packet to be saved. Packets with a length // less than or equal to snapLen will be saved in their entirety. Longer // packets will be truncated to snapLen. func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) (stack.LinkEndpoint, error) { if err := writePCAPHeader(writer, snapLen); err != nil { return nil, err } sniffer := &endpoint{ writer: writer, maxPCAPLen: snapLen, } sniffer.Endpoint.Init(lower, sniffer) return sniffer, nil } // DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is // called by the link-layer endpoint being wrapped when a packet arrives, and // logs the packet before forwarding to the actual dispatcher. func (e *endpoint) DeliverNetworkPacket(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.dumpPacket(DirectionRecv, protocol, pkt) e.Endpoint.DeliverNetworkPacket(protocol, pkt) } func (e *endpoint) dumpPacket(dir Direction, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { writer := e.writer if LogPackets.Load() == 1 { LogPacket(e.logPrefix, dir, protocol, pkt) } if writer != nil && LogPacketsToPCAP.Load() == 1 { packet := pcapPacket{ timestamp: time.Now(), packet: pkt, maxCaptureLen: int(e.maxPCAPLen), } b, err := packet.MarshalBinary() if err != nil { panic(err) } if _, err := writer.Write(b); err != nil { panic(err) } } } // WritePackets implements the stack.LinkEndpoint interface. It is called by // higher-level protocols to write packets; it just logs the packet and // forwards the request to the lower endpoint. func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { for _, pkt := range pkts.AsSlice() { e.dumpPacket(DirectionSend, pkt.NetworkProtocolNumber, pkt) } return e.Endpoint.WritePackets(pkts) } // LogPacket logs a packet to stdout. 
func LogPacket(prefix string, dir Direction, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { // Figure out the network layer info. var transProto uint8 var src tcpip.Address var dst tcpip.Address var size uint16 var id uint32 var fragmentOffset uint16 var moreFragments bool var directionPrefix string switch dir { case DirectionSend: directionPrefix = "send" case DirectionRecv: directionPrefix = "recv" default: panic(fmt.Sprintf("unrecognized direction: %d", dir)) } clone := trimmedClone(pkt) defer clone.DecRef() switch protocol { case header.IPv4ProtocolNumber: if ok := parse.IPv4(clone); !ok { return } ipv4 := header.IPv4(clone.NetworkHeader().Slice()) fragmentOffset = ipv4.FragmentOffset() moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments src = ipv4.SourceAddress() dst = ipv4.DestinationAddress() transProto = ipv4.Protocol() size = ipv4.TotalLength() - uint16(ipv4.HeaderLength()) id = uint32(ipv4.ID()) case header.IPv6ProtocolNumber: proto, fragID, fragOffset, fragMore, ok := parse.IPv6(clone) if !ok { return } ipv6 := header.IPv6(clone.NetworkHeader().Slice()) src = ipv6.SourceAddress() dst = ipv6.DestinationAddress() transProto = uint8(proto) size = ipv6.PayloadLength() id = fragID moreFragments = fragMore fragmentOffset = fragOffset case header.ARPProtocolNumber: if !parse.ARP(clone) { return } arp := header.ARP(clone.NetworkHeader().Slice()) log.Infof( "%s%s arp %s (%s) -> %s (%s) valid:%t", prefix, directionPrefix, tcpip.AddrFromSlice(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()), tcpip.AddrFromSlice(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()), arp.IsValid(), ) return default: log.Infof("%s%s unknown network protocol: %d", prefix, directionPrefix, protocol) return } // Figure out the transport layer info. 
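	// As an illustration (all values below are made up), once the
	// transport-layer information has been extracted by the switch that
	// follows, a TCP segment ends up being logged by the final log.Infof
	// call in this function as a line shaped roughly like
	//
	//	recv tcp 192.168.1.2:443 -> 192.168.1.10:51234 len:1448 id:1f2c flags: ... seqnum: ... ack: ... win: ... xsum:0x...
	//
	// while ICMPv4 and ICMPv6 packets are logged and returned from their
	// own cases below.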
transName := "unknown" srcPort := uint16(0) dstPort := uint16(0) details := "" switch tcpip.TransportProtocolNumber(transProto) { case header.ICMPv4ProtocolNumber: transName = "icmp" hdr, ok := clone.Data().PullUp(header.ICMPv4MinimumSize) if !ok { break } icmp := header.ICMPv4(hdr) icmpType := "unknown" if fragmentOffset == 0 { switch icmp.Type() { case header.ICMPv4EchoReply: icmpType = "echo reply" case header.ICMPv4DstUnreachable: icmpType = "destination unreachable" case header.ICMPv4SrcQuench: icmpType = "source quench" case header.ICMPv4Redirect: icmpType = "redirect" case header.ICMPv4Echo: icmpType = "echo" case header.ICMPv4TimeExceeded: icmpType = "time exceeded" case header.ICMPv4ParamProblem: icmpType = "param problem" case header.ICMPv4Timestamp: icmpType = "timestamp" case header.ICMPv4TimestampReply: icmpType = "timestamp reply" case header.ICMPv4InfoRequest: icmpType = "info request" case header.ICMPv4InfoReply: icmpType = "info reply" } } log.Infof("%s%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, directionPrefix, transName, src, dst, icmpType, size, id, icmp.Code()) return case header.ICMPv6ProtocolNumber: transName = "icmp" hdr, ok := clone.Data().PullUp(header.ICMPv6MinimumSize) if !ok { break } icmp := header.ICMPv6(hdr) icmpType := "unknown" switch icmp.Type() { case header.ICMPv6DstUnreachable: icmpType = "destination unreachable" case header.ICMPv6PacketTooBig: icmpType = "packet too big" case header.ICMPv6TimeExceeded: icmpType = "time exceeded" case header.ICMPv6ParamProblem: icmpType = "param problem" case header.ICMPv6EchoRequest: icmpType = "echo request" case header.ICMPv6EchoReply: icmpType = "echo reply" case header.ICMPv6RouterSolicit: icmpType = "router solicit" case header.ICMPv6RouterAdvert: icmpType = "router advert" case header.ICMPv6NeighborSolicit: icmpType = "neighbor solicit" case header.ICMPv6NeighborAdvert: icmpType = "neighbor advert" case header.ICMPv6RedirectMsg: icmpType = "redirect message" } log.Infof("%s%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, directionPrefix, transName, src, dst, icmpType, size, id, icmp.Code()) return case header.UDPProtocolNumber: transName = "udp" if ok := parse.UDP(clone); !ok { break } udp := header.UDP(clone.TransportHeader().Slice()) if fragmentOffset == 0 { srcPort = udp.SourcePort() dstPort = udp.DestinationPort() details = fmt.Sprintf("xsum: 0x%x", udp.Checksum()) size -= header.UDPMinimumSize } case header.TCPProtocolNumber: transName = "tcp" if ok := parse.TCP(clone); !ok { break } tcp := header.TCP(clone.TransportHeader().Slice()) if fragmentOffset == 0 { offset := int(tcp.DataOffset()) if offset < header.TCPMinimumSize { details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset) break } if size := clone.Data().Size() + len(tcp); offset > size && !moreFragments { details += fmt.Sprintf("invalid packet: tcp data offset %d larger than tcp packet length %d", offset, size) break } srcPort = tcp.SourcePort() dstPort = tcp.DestinationPort() size -= uint16(offset) // Initialize the TCP flags. 
flags := tcp.Flags() details = fmt.Sprintf("flags: %s seqnum: %d ack: %d win: %d xsum:0x%x", flags, tcp.SequenceNumber(), tcp.AckNumber(), tcp.WindowSize(), tcp.Checksum()) if flags&header.TCPFlagSyn != 0 { details += fmt.Sprintf(" options: %+v", header.ParseSynOptions(tcp.Options(), flags&header.TCPFlagAck != 0)) } else { details += fmt.Sprintf(" options: %+v", tcp.ParsedOptions()) } } default: log.Infof("%s%s %s -> %s unknown transport protocol: %d", prefix, directionPrefix, src, dst, transProto) return } if pkt.GSOOptions.Type != stack.GSONone { details += fmt.Sprintf(" gso: %#v", pkt.GSOOptions) } log.Infof("%s%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, directionPrefix, transName, src, srcPort, dst, dstPort, size, id, details) } // trimmedClone clones the packet buffer to not modify the original. It trims // anything before the network header. func trimmedClone(pkt *stack.PacketBuffer) *stack.PacketBuffer { // We don't clone the original packet buffer so that the new packet buffer // does not have any of its headers set. // // We trim the link headers from the cloned buffer as the sniffer doesn't // handle link headers. buf := pkt.ToBuffer() buf.TrimFront(int64(len(pkt.VirtioNetHeader().Slice()))) buf.TrimFront(int64(len(pkt.LinkHeader().Slice()))) return stack.NewPacketBuffer(stack.PacketBufferOptions{Payload: buf}) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/sniffer/sniffer_state_autogen.go000066400000000000000000000017301465435605700300420ustar00rootroot00000000000000// automatically generated by stateify. package sniffer import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *endpoint) StateTypeName() string { return "pkg/tcpip/link/sniffer.endpoint" } func (e *endpoint) StateFields() []string { return []string{ "Endpoint", "writer", "maxPCAPLen", "logPrefix", } } func (e *endpoint) beforeSave() {} // +checklocksignore func (e *endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.Endpoint) stateSinkObject.Save(1, &e.writer) stateSinkObject.Save(2, &e.maxPCAPLen) stateSinkObject.Save(3, &e.logPrefix) } func (e *endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.Endpoint) stateSourceObject.Load(1, &e.writer) stateSourceObject.Load(2, &e.maxPCAPLen) stateSourceObject.Load(3, &e.logPrefix) } func init() { state.Register((*endpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/stopfd/000077500000000000000000000000001465435605700227775ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/stopfd/stopfd.go000066400000000000000000000032431465435605700246270ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux // Package stopfd provides an type that can be used to signal the stop of a dispatcher. 
package stopfd import ( "fmt" "golang.org/x/sys/unix" ) // StopFD is an eventfd used to signal the stop of a dispatcher. type StopFD struct { EFD int } // New returns a new, initialized StopFD. func New() (StopFD, error) { efd, err := unix.Eventfd(0, unix.EFD_NONBLOCK) if err != nil { return StopFD{EFD: -1}, fmt.Errorf("failed to create eventfd: %w", err) } return StopFD{EFD: efd}, nil } // Stop writes to the eventfd and notifies the dispatcher to stop. It does not // block. func (sf *StopFD) Stop() { increment := []byte{1, 0, 0, 0, 0, 0, 0, 0} if n, err := unix.Write(sf.EFD, increment); n != len(increment) || err != nil { // There are two possible errors documented in eventfd(2) for writing: // 1. We are writing 8 bytes and not 0xffffffffffffff, thus no EINVAL. // 2. stop is only supposed to be called once, it can't reach the limit, // thus no EAGAIN. panic(fmt.Sprintf("write(EFD) = (%d, %s), want (%d, nil)", n, err, len(increment))) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/stopfd/stopfd_state_autogen.go000066400000000000000000000001321465435605700275430ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux // +build linux package stopfd golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/tun/000077500000000000000000000000001465435605700223065ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/tun/device.go000066400000000000000000000224411465435605700240770ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tun import ( "fmt" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/channel" "gvisor.dev/gvisor/pkg/tcpip/link/packetsocket" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) const ( // drivers/net/tun.c:tun_net_init() defaultDevMtu = 1500 // Queue length for outbound packet, arriving at fd side for read. Overflow // causes packet drops. gVisor implementation-specific. defaultDevOutQueueLen = 1024 ) var zeroMAC [6]byte // Device is an opened /dev/net/tun device. // // +stateify savable type Device struct { waiter.Queue mu sync.RWMutex `state:"nosave"` endpoint *tunEndpoint notifyHandle *channel.NotificationHandle flags Flags } // Flags set properties of a Device // // +stateify savable type Flags struct { TUN bool TAP bool NoPacketInfo bool } // beforeSave is invoked by stateify. func (d *Device) beforeSave() { d.mu.Lock() defer d.mu.Unlock() // TODO(b/110961832): Restore the device to stack. At this moment, the stack // is not savable. if d.endpoint != nil { panic("/dev/net/tun does not support save/restore when a device is associated with it.") } } // Release implements fs.FileOperations.Release. func (d *Device) Release(ctx context.Context) { d.mu.Lock() defer d.mu.Unlock() // Decrease refcount if there is an endpoint associated with this file. 
if d.endpoint != nil { d.endpoint.Drain() d.endpoint.RemoveNotify(d.notifyHandle) d.endpoint.DecRef(ctx) d.endpoint = nil } } // SetIff services TUNSETIFF ioctl(2) request. func (d *Device) SetIff(s *stack.Stack, name string, flags Flags) error { d.mu.Lock() defer d.mu.Unlock() if d.endpoint != nil { return linuxerr.EINVAL } // Input validation. if flags.TAP && flags.TUN || !flags.TAP && !flags.TUN { return linuxerr.EINVAL } prefix := "tun" if flags.TAP { prefix = "tap" } linkCaps := stack.CapabilityNone if flags.TAP { linkCaps |= stack.CapabilityResolutionRequired } endpoint, err := attachOrCreateNIC(s, name, prefix, linkCaps) if err != nil { return linuxerr.EINVAL } d.endpoint = endpoint d.notifyHandle = d.endpoint.AddNotify(d) d.flags = flags return nil } func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkEndpointCapabilities) (*tunEndpoint, error) { for { // 1. Try to attach to an existing NIC. if name != "" { if linkEP := s.GetLinkEndpointByName(name); linkEP != nil { endpoint, ok := linkEP.(*tunEndpoint) if !ok { // Not a NIC created by tun device. return nil, linuxerr.EOPNOTSUPP } if !endpoint.TryIncRef() { // Race detected: NIC got deleted in between. continue } return endpoint, nil } } // 2. Creating a new NIC. id := s.NextNICID() endpoint := &tunEndpoint{ Endpoint: channel.New(defaultDevOutQueueLen, defaultDevMtu, ""), stack: s, nicID: id, name: name, isTap: prefix == "tap", } endpoint.InitRefs() endpoint.Endpoint.LinkEPCapabilities = linkCaps if endpoint.name == "" { endpoint.name = fmt.Sprintf("%s%d", prefix, id) } err := s.CreateNICWithOptions(endpoint.nicID, packetsocket.New(endpoint), stack.NICOptions{ Name: endpoint.name, }) switch err.(type) { case nil: return endpoint, nil case *tcpip.ErrDuplicateNICID: // Race detected: A NIC has been created in between. continue default: return nil, linuxerr.EINVAL } } } // MTU returns the tun endpoint MTU (maximum transmission unit). func (d *Device) MTU() (uint32, error) { d.mu.RLock() endpoint := d.endpoint d.mu.RUnlock() if endpoint == nil { return 0, linuxerr.EBADFD } if !endpoint.IsAttached() { return 0, linuxerr.EIO } return endpoint.MTU(), nil } // Write inject one inbound packet to the network interface. func (d *Device) Write(data *buffer.View) (int64, error) { d.mu.RLock() endpoint := d.endpoint d.mu.RUnlock() if endpoint == nil { return 0, linuxerr.EBADFD } if !endpoint.IsAttached() { return 0, linuxerr.EIO } dataLen := int64(data.Size()) // Packet information. var pktInfoHdr PacketInfoHeader if !d.flags.NoPacketInfo { if dataLen < PacketInfoHeaderSize { // Ignore bad packet. return dataLen, nil } pktInfoHdrView := data.Clone() defer pktInfoHdrView.Release() pktInfoHdrView.CapLength(PacketInfoHeaderSize) pktInfoHdr = PacketInfoHeader(pktInfoHdrView.AsSlice()) data.TrimFront(PacketInfoHeaderSize) } // Ethernet header (TAP only). var ethHdr header.Ethernet if d.flags.TAP { if data.Size() < header.EthernetMinimumSize { // Ignore bad packet. return dataLen, nil } ethHdrView := data.Clone() defer ethHdrView.Release() ethHdrView.CapLength(header.EthernetMinimumSize) ethHdr = header.Ethernet(ethHdrView.AsSlice()) data.TrimFront(header.EthernetMinimumSize) } // Try to determine network protocol number, default zero. 
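	// For a TUN device opened with IFF_NO_PI there is neither a
	// packet-information header nor an ethernet header, so the switch below
	// falls back to the IP version nibble in the first payload byte: a first
	// byte of 0x45 (version 4, IHL 5), for example, selects IPv4, and a first
	// byte of 0x60 selects IPv6; any other leading nibble leaves protocol at
	// its zero value, as noted above.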
var protocol tcpip.NetworkProtocolNumber switch { case pktInfoHdr != nil: protocol = pktInfoHdr.Protocol() case ethHdr != nil: protocol = ethHdr.Type() case d.flags.TUN: // TUN interface with IFF_NO_PI enabled, thus // we need to determine protocol from version field version := data.AsSlice()[0] >> 4 if version == 4 { protocol = header.IPv4ProtocolNumber } else if version == 6 { protocol = header.IPv6ProtocolNumber } } pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: len(ethHdr), Payload: buffer.MakeWithView(data.Clone()), }) defer pkt.DecRef() copy(pkt.LinkHeader().Push(len(ethHdr)), ethHdr) endpoint.InjectInbound(protocol, pkt) return dataLen, nil } // Read reads one outgoing packet from the network interface. func (d *Device) Read() (*buffer.View, error) { d.mu.RLock() endpoint := d.endpoint d.mu.RUnlock() if endpoint == nil { return nil, linuxerr.EBADFD } pkt := endpoint.Read() if pkt == nil { return nil, linuxerr.ErrWouldBlock } v := d.encodePkt(pkt) pkt.DecRef() return v, nil } // encodePkt encodes packet for fd side. func (d *Device) encodePkt(pkt *stack.PacketBuffer) *buffer.View { var view *buffer.View // Packet information. if !d.flags.NoPacketInfo { view = buffer.NewView(PacketInfoHeaderSize + pkt.Size()) view.Grow(PacketInfoHeaderSize) hdr := PacketInfoHeader(view.AsSlice()) hdr.Encode(&PacketInfoFields{ Protocol: pkt.NetworkProtocolNumber, }) pktView := pkt.ToView() view.Write(pktView.AsSlice()) pktView.Release() } else { view = pkt.ToView() } return view } // Name returns the name of the attached network interface. Empty string if // unattached. func (d *Device) Name() string { d.mu.RLock() defer d.mu.RUnlock() if d.endpoint != nil { return d.endpoint.name } return "" } // Flags returns the flags set for d. Zero value if unset. func (d *Device) Flags() Flags { d.mu.RLock() defer d.mu.RUnlock() return d.flags } // Readiness implements watier.Waitable.Readiness. func (d *Device) Readiness(mask waiter.EventMask) waiter.EventMask { if mask&waiter.ReadableEvents != 0 { d.mu.RLock() endpoint := d.endpoint d.mu.RUnlock() if endpoint != nil && endpoint.NumQueued() == 0 { mask &= ^waiter.ReadableEvents } } return mask & (waiter.ReadableEvents | waiter.WritableEvents) } // WriteNotify implements channel.Notification.WriteNotify. func (d *Device) WriteNotify() { d.Notify(waiter.ReadableEvents) } // tunEndpoint is the link endpoint for the NIC created by the tun device. // // It is ref-counted as multiple opening files can attach to the same NIC. // The last owner is responsible for deleting the NIC. type tunEndpoint struct { tunEndpointRefs *channel.Endpoint stack *stack.Stack nicID tcpip.NICID name string isTap bool } // DecRef decrements refcount of e, removing NIC if it reaches 0. func (e *tunEndpoint) DecRef(ctx context.Context) { e.tunEndpointRefs.DecRef(func() { e.Close() e.stack.RemoveNIC(e.nicID) }) } // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (e *tunEndpoint) ARPHardwareType() header.ARPHardwareType { if e.isTap { return header.ARPHardwareEther } return header.ARPHardwareNone } // AddHeader implements stack.LinkEndpoint.AddHeader. func (e *tunEndpoint) AddHeader(pkt *stack.PacketBuffer) { if !e.isTap { return } eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) eth.Encode(&header.EthernetFields{ SrcAddr: pkt.EgressRoute.LocalLinkAddress, DstAddr: pkt.EgressRoute.RemoteLinkAddress, Type: pkt.NetworkProtocolNumber, }) } // MaxHeaderLength returns the maximum size of the link layer header. 
func (e *tunEndpoint) MaxHeaderLength() uint16 { if e.isTap { return header.EthernetMinimumSize } return 0 } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/tun/protocol.go000066400000000000000000000032301465435605700244740ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tun import ( "encoding/binary" "gvisor.dev/gvisor/pkg/tcpip" ) const ( // PacketInfoHeaderSize is the size of the packet information header. PacketInfoHeaderSize = 4 offsetFlags = 0 offsetProtocol = 2 ) // PacketInfoFields contains fields sent through the wire if IFF_NO_PI flag is // not set. type PacketInfoFields struct { Flags uint16 Protocol tcpip.NetworkProtocolNumber } // PacketInfoHeader is the wire representation of the packet information sent if // IFF_NO_PI flag is not set. type PacketInfoHeader []byte // Encode encodes f into h. func (h PacketInfoHeader) Encode(f *PacketInfoFields) { binary.BigEndian.PutUint16(h[offsetFlags:][:2], f.Flags) binary.BigEndian.PutUint16(h[offsetProtocol:][:2], uint16(f.Protocol)) } // Flags returns the flag field in h. func (h PacketInfoHeader) Flags() uint16 { return binary.BigEndian.Uint16(h[offsetFlags:]) } // Protocol returns the protocol field in h. func (h PacketInfoHeader) Protocol() tcpip.NetworkProtocolNumber { return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(h[offsetProtocol:])) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/tun/tun_endpoint_refs.go000066400000000000000000000102401465435605700263570ustar00rootroot00000000000000package tun import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const tunEndpointenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var tunEndpointobj *tunEndpoint // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type tunEndpointRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. 
See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *tunEndpointRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *tunEndpointRefs) RefType() string { return fmt.Sprintf("%T", tunEndpointobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *tunEndpointRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *tunEndpointRefs) LogRefs() bool { return tunEndpointenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *tunEndpointRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *tunEndpointRefs) IncRef() { v := r.refCount.Add(1) if tunEndpointenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *tunEndpointRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if tunEndpointenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *tunEndpointRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if tunEndpointenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *tunEndpointRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/tun/tun_state_autogen.go000066400000000000000000000043241465435605700263700ustar00rootroot00000000000000// automatically generated by stateify. 
package tun import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (d *Device) StateTypeName() string { return "pkg/tcpip/link/tun.Device" } func (d *Device) StateFields() []string { return []string{ "Queue", "endpoint", "notifyHandle", "flags", } } // +checklocksignore func (d *Device) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.Queue) stateSinkObject.Save(1, &d.endpoint) stateSinkObject.Save(2, &d.notifyHandle) stateSinkObject.Save(3, &d.flags) } func (d *Device) afterLoad(context.Context) {} // +checklocksignore func (d *Device) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.Queue) stateSourceObject.Load(1, &d.endpoint) stateSourceObject.Load(2, &d.notifyHandle) stateSourceObject.Load(3, &d.flags) } func (f *Flags) StateTypeName() string { return "pkg/tcpip/link/tun.Flags" } func (f *Flags) StateFields() []string { return []string{ "TUN", "TAP", "NoPacketInfo", } } func (f *Flags) beforeSave() {} // +checklocksignore func (f *Flags) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.TUN) stateSinkObject.Save(1, &f.TAP) stateSinkObject.Save(2, &f.NoPacketInfo) } func (f *Flags) afterLoad(context.Context) {} // +checklocksignore func (f *Flags) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.TUN) stateSourceObject.Load(1, &f.TAP) stateSourceObject.Load(2, &f.NoPacketInfo) } func (r *tunEndpointRefs) StateTypeName() string { return "pkg/tcpip/link/tun.tunEndpointRefs" } func (r *tunEndpointRefs) StateFields() []string { return []string{ "refCount", } } func (r *tunEndpointRefs) beforeSave() {} // +checklocksignore func (r *tunEndpointRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *tunEndpointRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func init() { state.Register((*Device)(nil)) state.Register((*Flags)(nil)) state.Register((*tunEndpointRefs)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/tun/tun_unsafe.go000066400000000000000000000032151465435605700250050ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux // Package tun contains methods to open TAP and TUN devices. package tun import ( "unsafe" "golang.org/x/sys/unix" ) // Open opens the specified TUN device, sets it to non-blocking mode, and // returns its file descriptor. func Open(name string) (int, error) { return open(name, unix.IFF_TUN|unix.IFF_NO_PI) } // OpenTAP opens the specified TAP device, sets it to non-blocking mode, and // returns its file descriptor. 
func OpenTAP(name string) (int, error) { return open(name, unix.IFF_TAP|unix.IFF_NO_PI) } func open(name string, flags uint16) (int, error) { fd, err := unix.Open("/dev/net/tun", unix.O_RDWR, 0) if err != nil { return -1, err } var ifr struct { name [16]byte flags uint16 _ [22]byte } copy(ifr.name[:], name) ifr.flags = flags _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), unix.TUNSETIFF, uintptr(unsafe.Pointer(&ifr))) if errno != 0 { unix.Close(fd) return -1, errno } if err = unix.SetNonblock(fd, true); err != nil { unix.Close(fd) return -1, err } return fd, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/tun/tun_unsafe_state_autogen.go000066400000000000000000000001271465435605700277260ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux // +build linux package tun golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/veth/000077500000000000000000000000001465435605700224465ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/veth/veth.go000066400000000000000000000144151465435605700237500ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package veth provides the implementation of virtual ethernet device pair. package veth import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) var _ stack.LinkEndpoint = (*Endpoint)(nil) var _ stack.GSOEndpoint = (*Endpoint)(nil) type veth struct { mu sync.RWMutex closed bool backlogQueue chan vethPacket mtu uint32 endpoints [2]Endpoint } func (v *veth) close() { v.mu.Lock() closed := v.closed v.closed = true v.mu.Unlock() if closed { return } for i := range v.endpoints { e := &v.endpoints[i] e.mu.Lock() action := e.onCloseAction e.onCloseAction = nil e.mu.Unlock() if action != nil { action() } } close(v.backlogQueue) } // +stateify savable type vethPacket struct { e *Endpoint protocol tcpip.NetworkProtocolNumber pkt *stack.PacketBuffer } const backlogQueueSize = 64 // Endpoint is link layer endpoint that redirects packets to a pair veth endpoint. // // +stateify savable type Endpoint struct { peer *Endpoint veth *veth mu sync.RWMutex `state:"nosave"` // +checklocks:mu dispatcher stack.NetworkDispatcher // linkAddr is the local address of this endpoint. // // +checklocks:mu linkAddr tcpip.LinkAddress // +checklocks:mu onCloseAction func() } // NewPair creates a new veth pair. func NewPair(mtu uint32) (*Endpoint, *Endpoint) { veth := veth{ backlogQueue: make(chan vethPacket, backlogQueueSize), mtu: mtu, endpoints: [2]Endpoint{ Endpoint{ linkAddr: tcpip.GetRandMacAddr(), }, Endpoint{ linkAddr: tcpip.GetRandMacAddr(), }, }, } a := &veth.endpoints[0] b := &veth.endpoints[1] a.peer = b b.peer = a a.veth = &veth b.veth = &veth go func() { for t := range veth.backlogQueue { t.e.InjectInbound(t.protocol, t.pkt) t.pkt.DecRef() } }() return a, b } // Close closes e. 
Further packet injections will return an error, and all pending // packets are discarded. Close may be called concurrently with WritePackets. func (e *Endpoint) Close() { e.veth.close() } // InjectInbound injects an inbound packet. If the endpoint is not attached, the // packet is not delivered. func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.mu.RLock() d := e.dispatcher e.mu.RUnlock() if d != nil { d.DeliverNetworkPacket(protocol, pkt) } } // Attach saves the stack network-layer dispatcher for use later when packets // are injected. func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.mu.Lock() defer e.mu.Unlock() e.dispatcher = dispatcher } // IsAttached implements stack.LinkEndpoint.IsAttached. func (e *Endpoint) IsAttached() bool { e.mu.RLock() defer e.mu.RUnlock() return e.dispatcher != nil } // MTU implements stack.LinkEndpoint.MTU. func (e *Endpoint) MTU() uint32 { e.veth.mu.RLock() defer e.veth.mu.RUnlock() return e.veth.mtu } // SetMTU implements stack.LinkEndpoint.SetMTU. func (e *Endpoint) SetMTU(mtu uint32) { e.veth.mu.Lock() defer e.veth.mu.Unlock() e.veth.mtu = mtu } // Capabilities implements stack.LinkEndpoint.Capabilities. func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { // TODO(b/352384218): Enable CapabilityTXChecksumOffload. return stack.CapabilityRXChecksumOffload | stack.CapabilitySaveRestore } // GSOMaxSize implements stack.GSOEndpoint. func (*Endpoint) GSOMaxSize() uint32 { return stack.GVisorGSOMaxSize } // SupportedGSO implements stack.GSOEndpoint. func (e *Endpoint) SupportedGSO() stack.SupportedGSO { return stack.GVisorGSOSupported } // MaxHeaderLength returns the maximum size of the link layer header. Given it // doesn't have a header, it just returns 0. func (*Endpoint) MaxHeaderLength() uint16 { return 0 } // LinkAddress returns the link address of this endpoint. func (e *Endpoint) LinkAddress() tcpip.LinkAddress { e.mu.RLock() defer e.mu.RUnlock() return e.linkAddr } // SetLinkAddress implements stack.LinkEndpoint.SetLinkAddress. func (e *Endpoint) SetLinkAddress(addr tcpip.LinkAddress) { e.mu.Lock() defer e.mu.Unlock() e.linkAddr = addr } // WritePackets stores outbound packets into the channel. // Multiple concurrent calls are permitted. func (e *Endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { e.veth.mu.RLock() defer e.veth.mu.RUnlock() if e.veth.closed { return 0, nil } n := 0 for _, pkt := range pkts.AsSlice() { // In order to properly loop back to the inbound side we must create a // fresh packet that only contains the underlying payload with no headers // or struct fields set. We must deep clone the payload to avoid // two goroutines writing to the same buffer. // // TODO(b/240580913): Remove this once IP headers use reference counted // views instead of raw byte slices. payload := pkt.ToBuffer() newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: payload.DeepClone(), }) payload.Release() (e.veth.backlogQueue) <- vethPacket{ e: e.peer, protocol: pkt.NetworkProtocolNumber, pkt: newPkt, } n++ } return n, nil } // Wait implements stack.LinkEndpoint.Wait. func (*Endpoint) Wait() {} // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (*Endpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareNone } // AddHeader implements stack.LinkEndpoint.AddHeader. func (e *Endpoint) AddHeader(pkt *stack.PacketBuffer) {} // ParseHeader implements stack.LinkEndpoint.ParseHeader. 
func (e *Endpoint) ParseHeader(pkt *stack.PacketBuffer) bool { return true } // SetOnCloseAction implements stack.LinkEndpoint. func (e *Endpoint) SetOnCloseAction(action func()) { e.mu.Lock() defer e.mu.Unlock() e.onCloseAction = action } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/veth/veth_state_autogen.go000066400000000000000000000034111465435605700266640ustar00rootroot00000000000000// automatically generated by stateify. package veth import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (v *vethPacket) StateTypeName() string { return "pkg/tcpip/link/veth.vethPacket" } func (v *vethPacket) StateFields() []string { return []string{ "e", "protocol", "pkt", } } func (v *vethPacket) beforeSave() {} // +checklocksignore func (v *vethPacket) StateSave(stateSinkObject state.Sink) { v.beforeSave() stateSinkObject.Save(0, &v.e) stateSinkObject.Save(1, &v.protocol) stateSinkObject.Save(2, &v.pkt) } func (v *vethPacket) afterLoad(context.Context) {} // +checklocksignore func (v *vethPacket) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &v.e) stateSourceObject.Load(1, &v.protocol) stateSourceObject.Load(2, &v.pkt) } func (e *Endpoint) StateTypeName() string { return "pkg/tcpip/link/veth.Endpoint" } func (e *Endpoint) StateFields() []string { return []string{ "peer", "veth", "dispatcher", "linkAddr", "onCloseAction", } } func (e *Endpoint) beforeSave() {} // +checklocksignore func (e *Endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.peer) stateSinkObject.Save(1, &e.veth) stateSinkObject.Save(2, &e.dispatcher) stateSinkObject.Save(3, &e.linkAddr) stateSinkObject.Save(4, &e.onCloseAction) } func (e *Endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *Endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.peer) stateSourceObject.Load(1, &e.veth) stateSourceObject.Load(2, &e.dispatcher) stateSourceObject.Load(3, &e.linkAddr) stateSourceObject.Load(4, &e.onCloseAction) } func init() { state.Register((*vethPacket)(nil)) state.Register((*Endpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/waitable/000077500000000000000000000000001465435605700232705ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/waitable/waitable.go000066400000000000000000000135711465435605700254160ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package waitable provides the implementation of data-link layer endpoints // that wrap other endpoints, and can wait for inflight calls to WritePacket or // DeliverNetworkPacket to finish (and new ones to be prevented). // // Waitable endpoints can be used in the networking stack by calling New(eID) to // create a new endpoint, where eID is the ID of the endpoint being wrapped, // and then passing it as an argument to Stack.CreateNIC(). 
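//
// A minimal usage sketch, with placeholder names (s is an existing
// *stack.Stack and lower is the stack.LinkEndpoint being wrapped):
//
//	ep := New(lower)
//	if err := s.CreateNIC(1, ep); err != nil {
//		// handle the error
//	}
//	// ... later, before tearing the NIC down:
//	ep.WaitWrite()    // stop new writes and wait for inflight ones
//	ep.WaitDispatch() // stop new inbound dispatches and wait for inflight ones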
package waitable import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) var _ stack.NetworkDispatcher = (*Endpoint)(nil) var _ stack.LinkEndpoint = (*Endpoint)(nil) // Endpoint is a waitable link-layer endpoint. // // +stateify savable type Endpoint struct { dispatchGate sync.Gate mu sync.RWMutex `state:"nosave"` // +checklocks:mu dispatcher stack.NetworkDispatcher writeGate sync.Gate lower stack.LinkEndpoint } // New creates a new waitable link-layer endpoint. It wraps around another // endpoint and allows the caller to block new write/dispatch calls and wait for // the inflight ones to finish before returning. func New(lower stack.LinkEndpoint) *Endpoint { return &Endpoint{ lower: lower, } } // DeliverNetworkPacket implements stack.NetworkDispatcher.DeliverNetworkPacket. // It is called by the link-layer endpoint being wrapped when a packet arrives, // and only forwards to the actual dispatcher if Wait or WaitDispatch haven't // been called. func (e *Endpoint) DeliverNetworkPacket(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { if !e.dispatchGate.Enter() { return } e.mu.RLock() d := e.dispatcher e.mu.RUnlock() if d != nil { d.DeliverNetworkPacket(protocol, pkt) } e.dispatchGate.Leave() } // DeliverLinkPacket implements stack.NetworkDispatcher. func (e *Endpoint) DeliverLinkPacket(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { if !e.dispatchGate.Enter() { return } e.mu.RLock() d := e.dispatcher e.mu.RUnlock() if d != nil { d.DeliverLinkPacket(protocol, pkt) } e.dispatchGate.Leave() } // Attach implements stack.LinkEndpoint.Attach. It saves the dispatcher and // registers with the lower endpoint as its dispatcher so that "e" is called // for inbound packets. func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { e.mu.Lock() e.dispatcher = dispatcher e.mu.Unlock() e.lower.Attach(e) } // IsAttached implements stack.LinkEndpoint.IsAttached. func (e *Endpoint) IsAttached() bool { e.mu.RLock() defer e.mu.RUnlock() return e.dispatcher != nil } // MTU implements stack.LinkEndpoint.MTU. It just forwards the request to the // lower endpoint. func (e *Endpoint) MTU() uint32 { return e.lower.MTU() } // SetMTU implements stack.LinkEndpoint.SetMTU. It just forwards the request to // the lower endpoint. func (e *Endpoint) SetMTU(mtu uint32) { e.lower.SetMTU(mtu) } // Capabilities implements stack.LinkEndpoint.Capabilities. It just forwards the // request to the lower endpoint. func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { return e.lower.Capabilities() } // MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. It just // forwards the request to the lower endpoint. func (e *Endpoint) MaxHeaderLength() uint16 { return e.lower.MaxHeaderLength() } // LinkAddress implements stack.LinkEndpoint.LinkAddress. It just forwards the // request to the lower endpoint. func (e *Endpoint) LinkAddress() tcpip.LinkAddress { return e.lower.LinkAddress() } // SetLinkAddress implements stack.LinkEndpoint.SetLinkAddress. It forwards the // request to the lower endpoint. func (e *Endpoint) SetLinkAddress(addr tcpip.LinkAddress) { e.mu.Lock() defer e.mu.Unlock() e.lower.SetLinkAddress(addr) } // WritePackets implements stack.LinkEndpoint.WritePackets. It is called by // higher-level protocols to write packets. It only forwards packets to the // lower endpoint if Wait or WaitWrite haven't been called. 
func (e *Endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { if !e.writeGate.Enter() { return pkts.Len(), nil } n, err := e.lower.WritePackets(pkts) e.writeGate.Leave() return n, err } // WaitWrite prevents new calls to WritePacket from reaching the lower endpoint, // and waits for inflight ones to finish before returning. func (e *Endpoint) WaitWrite() { e.writeGate.Close() } // WaitDispatch prevents new calls to DeliverNetworkPacket from reaching the // actual dispatcher, and waits for inflight ones to finish before returning. func (e *Endpoint) WaitDispatch() { e.dispatchGate.Close() } // Wait implements stack.LinkEndpoint.Wait. func (e *Endpoint) Wait() {} // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (e *Endpoint) ARPHardwareType() header.ARPHardwareType { return e.lower.ARPHardwareType() } // AddHeader implements stack.LinkEndpoint.AddHeader. func (e *Endpoint) AddHeader(pkt *stack.PacketBuffer) { e.lower.AddHeader(pkt) } // ParseHeader implements stack.LinkEndpoint.ParseHeader. func (e *Endpoint) ParseHeader(pkt *stack.PacketBuffer) bool { return e.lower.ParseHeader(pkt) } // SetOnCloseAction implements stack.LinkEndpoint.SetOnCloseAction. func (e *Endpoint) SetOnCloseAction(action func()) { e.lower.SetOnCloseAction(action) } // Close implements stack.LinkEndpoint. func (e *Endpoint) Close() { e.lower.Close() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/waitable/waitable_state_autogen.go000066400000000000000000000017431465435605700303360ustar00rootroot00000000000000// automatically generated by stateify. package waitable import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *Endpoint) StateTypeName() string { return "pkg/tcpip/link/waitable.Endpoint" } func (e *Endpoint) StateFields() []string { return []string{ "dispatchGate", "dispatcher", "writeGate", "lower", } } func (e *Endpoint) beforeSave() {} // +checklocksignore func (e *Endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.dispatchGate) stateSinkObject.Save(1, &e.dispatcher) stateSinkObject.Save(2, &e.writeGate) stateSinkObject.Save(3, &e.lower) } func (e *Endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *Endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.dispatchGate) stateSourceObject.Load(1, &e.dispatcher) stateSourceObject.Load(2, &e.writeGate) stateSourceObject.Load(3, &e.lower) } func init() { state.Register((*Endpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/xdp/000077500000000000000000000000001465435605700222735ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/xdp/endpoint.go000066400000000000000000000273011465435605700244450ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux // Package xdp provides link layer endpoints backed by AF_XDP sockets. 
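//
// A rough construction sketch, with placeholder names (xdpFD is an already
// created AF_XDP socket and ifaceIdx is the index of the underlying device):
//
//	ep, err := New(&Options{
//		FD:             xdpFD,
//		Address:        tcpip.LinkAddress("\x02\x00\x00\x00\x00\x01"),
//		InterfaceIndex: ifaceIdx,
//		Bind:           true,
//	})
//	if err != nil {
//		// handle the error
//	}
//	// ep can then be registered with a stack via Stack.CreateNIC.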
package xdp import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/rawfile" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" "gvisor.dev/gvisor/pkg/tcpip/link/stopfd" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/xdp" ) // TODO(b/240191988): Turn off GSO, GRO, and LRO. Limit veth MTU to 1500. // MTU is sized to ensure packets fit inside a 2048 byte XDP frame. const MTU = 1500 var _ stack.LinkEndpoint = (*endpoint)(nil) // +stateify savable type endpoint struct { // fd is the underlying AF_XDP socket. fd int // caps holds the endpoint capabilities. caps stack.LinkEndpointCapabilities // closed is a function to be called when the FD's peer (if any) closes // its end of the communication pipe. // TODO(b/341946753): Restore when netstack is savable. closed func(tcpip.Error) `state:"nosave"` mu sync.RWMutex `state:"nosave"` // +checkloks:mu networkDispatcher stack.NetworkDispatcher // wg keeps track of running goroutines. wg sync.WaitGroup `state:"nosave"` // control is used to control the AF_XDP socket. control *xdp.ControlBlock // stopFD is used to stop the dispatch loop. stopFD stopfd.StopFD // addr is the address of the endpoint. // // +checklocks:mu addr tcpip.LinkAddress } // Options specify the details about the fd-based endpoint to be created. type Options struct { // FD is used to read/write packets. FD int // ClosedFunc is a function to be called when an endpoint's peer (if // any) closes its end of the communication pipe. ClosedFunc func(tcpip.Error) // Address is the link address for this endpoint. Address tcpip.LinkAddress // SaveRestore if true, indicates that this NIC capability set should // include CapabilitySaveRestore SaveRestore bool // DisconnectOk if true, indicates that this NIC capability set should // include CapabilityDisconnectOk. DisconnectOk bool // TXChecksumOffload if true, indicates that this endpoints capability // set should include CapabilityTXChecksumOffload. TXChecksumOffload bool // RXChecksumOffload if true, indicates that this endpoints capability // set should include CapabilityRXChecksumOffload. RXChecksumOffload bool // InterfaceIndex is the interface index of the underlying device. InterfaceIndex int // Bind is true when we're responsible for binding the AF_XDP socket to // a device. When false, another process is expected to bind for us. Bind bool // GRO enables generic receive offload. GRO bool } // New creates a new endpoint from an AF_XDP socket. func New(opts *Options) (stack.LinkEndpoint, error) { caps := stack.CapabilityResolutionRequired if opts.RXChecksumOffload { caps |= stack.CapabilityRXChecksumOffload } if opts.TXChecksumOffload { caps |= stack.CapabilityTXChecksumOffload } if opts.SaveRestore { caps |= stack.CapabilitySaveRestore } if opts.DisconnectOk { caps |= stack.CapabilityDisconnectOk } if err := unix.SetNonblock(opts.FD, true); err != nil { return nil, fmt.Errorf("unix.SetNonblock(%v) failed: %v", opts.FD, err) } ep := &endpoint{ fd: opts.FD, caps: caps, closed: opts.ClosedFunc, addr: opts.Address, } stopFD, err := stopfd.New() if err != nil { return nil, err } ep.stopFD = stopFD // Use a 2MB UMEM to match the PACKET_MMAP dispatcher. There will be // 1024 UMEM frames, and each queue will have 512 descriptors. Having // fewer descriptors than frames prevents RX and TX from starving each // other. // TODO(b/240191988): Consider different numbers of descriptors for // different queues. 
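	// Concretely, with the constants below: umemSize = 1 << 21 = 2 MiB and
	// frameSize = 2048 bytes, so nFrames = 2097152 / 2048 = 1024 frames, and
	// each of the RX and TX queues gets nFrames / 2 = 512 descriptors.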
const ( frameSize = 2048 umemSize = 1 << 21 nFrames = umemSize / frameSize ) xdpOpts := xdp.Opts{ NFrames: nFrames, FrameSize: frameSize, NDescriptors: nFrames / 2, Bind: opts.Bind, } ep.control, err = xdp.NewFromSocket(opts.FD, uint32(opts.InterfaceIndex), 0 /* queueID */, xdpOpts) if err != nil { return nil, fmt.Errorf("failed to create AF_XDP dispatcher: %v", err) } ep.control.UMEM.Lock() defer ep.control.UMEM.Unlock() ep.control.Fill.FillAll(&ep.control.UMEM) return ep, nil } // Attach launches the goroutine that reads packets from the file descriptor and // dispatches them via the provided dispatcher. If one is already attached, // then nothing happens. // // Attach implements stack.LinkEndpoint.Attach. func (ep *endpoint) Attach(networkDispatcher stack.NetworkDispatcher) { ep.mu.Lock() defer ep.mu.Unlock() // nil means the NIC is being removed. if networkDispatcher == nil && ep.IsAttached() { ep.stopFD.Stop() ep.Wait() ep.networkDispatcher = nil return } if networkDispatcher != nil && ep.networkDispatcher == nil { ep.networkDispatcher = networkDispatcher // Link endpoints are not savable. When transportation endpoints are // saved, they stop sending outgoing packets and all incoming packets // are rejected. ep.wg.Add(1) go func() { // S/R-SAFE: See above. defer ep.wg.Done() for { cont, err := ep.dispatch() if err != nil || !cont { if ep.closed != nil { ep.closed(err) } return } } }() } } // IsAttached implements stack.LinkEndpoint.IsAttached. func (ep *endpoint) IsAttached() bool { ep.mu.RLock() defer ep.mu.RUnlock() return ep.networkDispatcher != nil } // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized // during construction. func (ep *endpoint) MTU() uint32 { return MTU } // SetMTU implements stack.LinkEndpoint.SetMTU. It has no impact. func (*endpoint) SetMTU(uint32) {} // Capabilities implements stack.LinkEndpoint.Capabilities. func (ep *endpoint) Capabilities() stack.LinkEndpointCapabilities { return ep.caps } // MaxHeaderLength returns the maximum size of the link-layer header. func (ep *endpoint) MaxHeaderLength() uint16 { return uint16(header.EthernetMinimumSize) } // LinkAddress returns the link address of this endpoint. func (ep *endpoint) LinkAddress() tcpip.LinkAddress { ep.mu.RLock() defer ep.mu.RUnlock() return ep.addr } // SetLinkAddress implemens stack.LinkEndpoint.SetLinkAddress func (ep *endpoint) SetLinkAddress(addr tcpip.LinkAddress) { ep.mu.Lock() defer ep.mu.Unlock() ep.addr = addr } // Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop // reading from its FD. func (ep *endpoint) Wait() { ep.wg.Wait() } // AddHeader implements stack.LinkEndpoint.AddHeader. func (ep *endpoint) AddHeader(pkt *stack.PacketBuffer) { // Add ethernet header if needed. eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) eth.Encode(&header.EthernetFields{ SrcAddr: pkt.EgressRoute.LocalLinkAddress, DstAddr: pkt.EgressRoute.RemoteLinkAddress, Type: pkt.NetworkProtocolNumber, }) } // ParseHeader implements stack.LinkEndpoint.ParseHeader. func (ep *endpoint) ParseHeader(pkt *stack.PacketBuffer) bool { _, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) return ok } // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (ep *endpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareEther } // WritePackets writes outbound packets to the underlying file descriptors. If // one is not currently writable, the packet is dropped. 
// // Each packet in pkts should have the following fields populated: // - pkt.EgressRoute // - pkt.NetworkProtocolNumber // // The following should not be populated, as GSO is not supported with XDP. // - pkt.GSOOptions func (ep *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) { // We expect to be called via fifo, which imposes a limit of // fifo.BatchSize. var preallocatedBatch [fifo.BatchSize]unix.XDPDesc batch := preallocatedBatch[:0] ep.control.UMEM.Lock() ep.control.Completion.FreeAll(&ep.control.UMEM) // Reserve TX queue descriptors and umem buffers nReserved, index := ep.control.TX.Reserve(&ep.control.UMEM, uint32(pkts.Len())) if nReserved == 0 { ep.control.UMEM.Unlock() return 0, &tcpip.ErrNoBufferSpace{} } // Allocate UMEM space. In order to release the UMEM lock as soon as // possible we allocate up-front. for _, pkt := range pkts.AsSlice() { batch = append(batch, unix.XDPDesc{ Addr: ep.control.UMEM.AllocFrame(), Len: uint32(pkt.Size()), }) } for i, pkt := range pkts.AsSlice() { // Copy packets into UMEM frame. frame := ep.control.UMEM.Get(batch[i]) offset := 0 var view *buffer.View views, pktOffset := pkt.AsViewList() for view = views.Front(); view != nil && pktOffset >= view.Size(); view = view.Next() { pktOffset -= view.Size() } offset += copy(frame[offset:], view.AsSlice()[pktOffset:]) for view = view.Next(); view != nil; view = view.Next() { offset += copy(frame[offset:], view.AsSlice()) } ep.control.TX.Set(index+uint32(i), batch[i]) } // Notify the kernel that there're packets to write. ep.control.TX.Notify() // TODO(b/240191988): Explore more fine-grained locking. We shouldn't // need to hold the UMEM lock for the whole duration of packet copying. ep.control.UMEM.Unlock() return pkts.Len(), nil } func (ep *endpoint) dispatch() (bool, tcpip.Error) { var views []*buffer.View for { stopped, errno := rawfile.BlockingPollUntilStopped(ep.stopFD.EFD, ep.fd, unix.POLLIN|unix.POLLERR) if errno != 0 { if errno == unix.EINTR { continue } return !stopped, tcpip.TranslateErrno(errno) } if stopped { return true, nil } // Avoid the cost of the poll syscall if possible by peeking // until there are no packets left. for { // We can receive multiple packets at once. nReceived, rxIndex := ep.control.RX.Peek() if nReceived == 0 { break } // Reuse views to avoid allocating. views = views[:0] // Populate views quickly so that we can release frames // back to the kernel. ep.control.UMEM.Lock() for i := uint32(0); i < nReceived; i++ { // Copy packet bytes into a view and free up the // buffer. descriptor := ep.control.RX.Get(rxIndex + i) data := ep.control.UMEM.Get(descriptor) view := buffer.NewView(len(data)) view.Write(data) views = append(views, view) ep.control.UMEM.FreeFrame(descriptor.Addr) } ep.control.Fill.FillAll(&ep.control.UMEM) ep.control.UMEM.Unlock() // Process each packet. ep.mu.RLock() d := ep.networkDispatcher ep.mu.RUnlock() for i := uint32(0); i < nReceived; i++ { view := views[i] data := view.AsSlice() netProto := header.Ethernet(data).Type() // Wrap the packet in a PacketBuffer and send it up the stack. pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: buffer.MakeWithView(view), }) // AF_XDP packets always have a link header. if !ep.ParseHeader(pkt) { panic("ParseHeader(_) must succeed") } d.DeliverNetworkPacket(netProto, pkt) pkt.DecRef() } // Tell the kernel that we're done with these // descriptors in the RX queue. ep.control.RX.Release(nReceived) } } } // Close implements stack.LinkEndpoint. 
func (*endpoint) Close() {} // SetOnCloseAction implements stack.LinkEndpoint. func (*endpoint) SetOnCloseAction(func()) {} golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/link/xdp/xdp_state_autogen.go000066400000000000000000000022421465435605700263370ustar00rootroot00000000000000// automatically generated by stateify. //go:build linux // +build linux package xdp import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (ep *endpoint) StateTypeName() string { return "pkg/tcpip/link/xdp.endpoint" } func (ep *endpoint) StateFields() []string { return []string{ "fd", "caps", "networkDispatcher", "control", "stopFD", "addr", } } func (ep *endpoint) beforeSave() {} // +checklocksignore func (ep *endpoint) StateSave(stateSinkObject state.Sink) { ep.beforeSave() stateSinkObject.Save(0, &ep.fd) stateSinkObject.Save(1, &ep.caps) stateSinkObject.Save(2, &ep.networkDispatcher) stateSinkObject.Save(3, &ep.control) stateSinkObject.Save(4, &ep.stopFD) stateSinkObject.Save(5, &ep.addr) } func (ep *endpoint) afterLoad(context.Context) {} // +checklocksignore func (ep *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ep.fd) stateSourceObject.Load(1, &ep.caps) stateSourceObject.Load(2, &ep.networkDispatcher) stateSourceObject.Load(3, &ep.control) stateSourceObject.Load(4, &ep.stopFD) stateSourceObject.Load(5, &ep.addr) } func init() { state.Register((*endpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/000077500000000000000000000000001465435605700222345ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/arp/000077500000000000000000000000001465435605700230165ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/arp/arp.go000066400000000000000000000320651465435605700241350ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package arp implements the ARP network protocol. It is used to resolve // IPv4 addresses into link-local MAC addresses, and advertises IPv4 // addresses of its stack with the local network. package arp import ( "fmt" "reflect" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/network/internal/ip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // ProtocolNumber is the ARP protocol number. ProtocolNumber = header.ARPProtocolNumber ) var _ stack.DuplicateAddressDetector = (*endpoint)(nil) var _ stack.LinkAddressResolver = (*endpoint)(nil) var _ ip.DADProtocol = (*endpoint)(nil) // ARP endpoints need to implement stack.NetworkEndpoint because the stack // considers the layer above the link-layer a network layer; the only // facility provided by the stack to deliver packets to a layer above // the link-layer is via stack.NetworkEndpoint.HandlePacket. 
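//
// The blank-identifier assertion below, like the ones above it, is a
// compile-time check only: the build fails if *endpoint stops satisfying the
// interface, and the assertion has no runtime cost.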
var _ stack.NetworkEndpoint = (*endpoint)(nil) // +stateify savable type endpoint struct { protocol *protocol // enabled is set to 1 when the NIC is enabled and 0 when it is disabled. enabled atomicbitops.Uint32 nic stack.NetworkInterface stats sharedStats // mu protects annotated fields below. mu sync.Mutex `state:"nosave"` // +checklocks:mu dad ip.DAD } // CheckDuplicateAddress implements stack.DuplicateAddressDetector. func (e *endpoint) CheckDuplicateAddress(addr tcpip.Address, h stack.DADCompletionHandler) stack.DADCheckAddressDisposition { e.mu.Lock() defer e.mu.Unlock() return e.dad.CheckDuplicateAddressLocked(addr, h) } // SetDADConfigurations implements stack.DuplicateAddressDetector. func (e *endpoint) SetDADConfigurations(c stack.DADConfigurations) { e.mu.Lock() defer e.mu.Unlock() e.dad.SetConfigsLocked(c) } // DuplicateAddressProtocol implements stack.DuplicateAddressDetector. func (*endpoint) DuplicateAddressProtocol() tcpip.NetworkProtocolNumber { return header.IPv4ProtocolNumber } // SendDADMessage implements ip.DADProtocol. func (e *endpoint) SendDADMessage(addr tcpip.Address, _ []byte) tcpip.Error { return e.sendARPRequest(header.IPv4Any, addr, header.EthernetBroadcastAddress) } func (e *endpoint) Enable() tcpip.Error { if !e.nic.Enabled() { return &tcpip.ErrNotPermitted{} } e.setEnabled(true) return nil } func (e *endpoint) Enabled() bool { return e.nic.Enabled() && e.isEnabled() } // isEnabled returns true if the endpoint is enabled, regardless of the // enabled status of the NIC. func (e *endpoint) isEnabled() bool { return e.enabled.Load() == 1 } // setEnabled sets the enabled status for the endpoint. func (e *endpoint) setEnabled(v bool) { if v { e.enabled.Store(1) } else { e.enabled.Store(0) } } func (e *endpoint) Disable() { e.setEnabled(false) } // DefaultTTL is unused for ARP. It implements stack.NetworkEndpoint. func (*endpoint) DefaultTTL() uint8 { return 0 } func (e *endpoint) MTU() uint32 { lmtu := e.nic.MTU() return lmtu - uint32(e.MaxHeaderLength()) } func (e *endpoint) MaxHeaderLength() uint16 { return e.nic.MaxHeaderLength() + header.ARPSize } func (*endpoint) Close() {} func (*endpoint) WritePacket(*stack.Route, stack.NetworkHeaderParams, *stack.PacketBuffer) tcpip.Error { return &tcpip.ErrNotSupported{} } // NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber. 
func (*endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { return ProtocolNumber } func (*endpoint) WriteHeaderIncludedPacket(*stack.Route, *stack.PacketBuffer) tcpip.Error { return &tcpip.ErrNotSupported{} } func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { stats := e.stats.arp stats.packetsReceived.Increment() if !e.isEnabled() { stats.disabledPacketsReceived.Increment() return } if _, _, ok := e.protocol.Parse(pkt); !ok { stats.malformedPacketsReceived.Increment() return } h := header.ARP(pkt.NetworkHeader().Slice()) if !h.IsValid() { stats.malformedPacketsReceived.Increment() return } switch h.Op() { case header.ARPRequest: stats.requestsReceived.Increment() localAddr := tcpip.AddrFrom4Slice(h.ProtocolAddressTarget()) if !e.nic.CheckLocalAddress(header.IPv4ProtocolNumber, localAddr) { stats.requestsReceivedUnknownTargetAddress.Increment() return // we have no useful answer, ignore the request } remoteAddr := tcpip.AddrFrom4Slice(h.ProtocolAddressSender()) remoteLinkAddr := tcpip.LinkAddress(h.HardwareAddressSender()) switch err := e.nic.HandleNeighborProbe(header.IPv4ProtocolNumber, remoteAddr, remoteLinkAddr); err.(type) { case nil: case *tcpip.ErrNotSupported: // The stack may support ARP but the NIC may not need link resolution. default: panic(fmt.Sprintf("unexpected error when informing NIC of neighbor probe message: %s", err)) } respPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(e.nic.MaxHeaderLength()) + header.ARPSize, }) defer respPkt.DecRef() packet := header.ARP(respPkt.NetworkHeader().Push(header.ARPSize)) respPkt.NetworkProtocolNumber = ProtocolNumber packet.SetIPv4OverEthernet() packet.SetOp(header.ARPReply) // TODO(gvisor.dev/issue/4582): check copied length once TAP devices have a // link address. _ = copy(packet.HardwareAddressSender(), e.nic.LinkAddress()) if n := copy(packet.ProtocolAddressSender(), h.ProtocolAddressTarget()); n != header.IPv4AddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize)) } origSender := h.HardwareAddressSender() if n := copy(packet.HardwareAddressTarget(), origSender); n != header.EthernetAddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.EthernetAddressSize)) } if n := copy(packet.ProtocolAddressTarget(), h.ProtocolAddressSender()); n != header.IPv4AddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize)) } // As per RFC 826, under Packet Reception: // Swap hardware and protocol fields, putting the local hardware and // protocol addresses in the sender fields. // // Send the packet to the (new) target hardware address on the same // hardware on which the request was received. if err := e.nic.WritePacketToRemote(tcpip.LinkAddress(origSender), respPkt); err != nil { stats.outgoingRepliesDropped.Increment() } else { stats.outgoingRepliesSent.Increment() } case header.ARPReply: stats.repliesReceived.Increment() addr := tcpip.AddrFrom4Slice(h.ProtocolAddressSender()) linkAddr := tcpip.LinkAddress(h.HardwareAddressSender()) e.mu.Lock() e.dad.StopLocked(addr, &stack.DADDupAddrDetected{HolderLinkAddress: linkAddr}) e.mu.Unlock() switch err := e.nic.HandleNeighborConfirmation(header.IPv4ProtocolNumber, addr, linkAddr, stack.ReachabilityConfirmationFlags{ // Only unicast ARP replies are considered solicited. Broadcast replies // are gratuitous ARP replies and should not move neighbor entries to the // reachable state. 
Solicited: pkt.PktType == tcpip.PacketHost, // If a different link address is received than the one cached, the entry // should always go to Stale. Override: false, // ARP does not distinguish between router and non-router hosts. IsRouter: false, }); err.(type) { case nil: case *tcpip.ErrNotSupported: // The stack may support ARP but the NIC may not need link resolution. default: panic(fmt.Sprintf("unexpected error when informing NIC of neighbor confirmation message: %s", err)) } } } // Stats implements stack.NetworkEndpoint. func (e *endpoint) Stats() stack.NetworkEndpointStats { return &e.stats.localStats } var _ stack.NetworkProtocol = (*protocol)(nil) // +stateify savable type protocol struct { stack *stack.Stack options Options } func (p *protocol) Number() tcpip.NetworkProtocolNumber { return ProtocolNumber } func (p *protocol) MinimumPacketSize() int { return header.ARPSize } func (*protocol) ParseAddresses([]byte) (src, dst tcpip.Address) { return tcpip.Address{}, tcpip.Address{} } func (p *protocol) NewEndpoint(nic stack.NetworkInterface, _ stack.TransportDispatcher) stack.NetworkEndpoint { e := &endpoint{ protocol: p, nic: nic, } e.mu.Lock() e.dad.Init(&e.mu, p.options.DADConfigs, ip.DADOptions{ Clock: p.stack.Clock(), SecureRNG: p.stack.SecureRNG().Reader, // ARP does not support sending nonce values. NonceSize: 0, Protocol: e, NICID: nic.ID(), }) e.mu.Unlock() tcpip.InitStatCounters(reflect.ValueOf(&e.stats.localStats).Elem()) stackStats := p.stack.Stats() e.stats.arp.init(&e.stats.localStats.ARP, &stackStats.ARP) return e } // LinkAddressProtocol implements stack.LinkAddressResolver.LinkAddressProtocol. func (*endpoint) LinkAddressProtocol() tcpip.NetworkProtocolNumber { return header.IPv4ProtocolNumber } // LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest. func (e *endpoint) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) tcpip.Error { stats := e.stats.arp if len(remoteLinkAddr) == 0 { remoteLinkAddr = header.EthernetBroadcastAddress } if localAddr.BitLen() == 0 { addr, err := e.nic.PrimaryAddress(header.IPv4ProtocolNumber) if err != nil { return err } if addr.Address.BitLen() == 0 { stats.outgoingRequestInterfaceHasNoLocalAddressErrors.Increment() return &tcpip.ErrNetworkUnreachable{} } localAddr = addr.Address } else if !e.nic.CheckLocalAddress(header.IPv4ProtocolNumber, localAddr) { stats.outgoingRequestBadLocalAddressErrors.Increment() return &tcpip.ErrBadLocalAddress{} } return e.sendARPRequest(localAddr, targetAddr, remoteLinkAddr) } func (e *endpoint) sendARPRequest(localAddr, targetAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) tcpip.Error { pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(e.MaxHeaderLength()), }) defer pkt.DecRef() h := header.ARP(pkt.NetworkHeader().Push(header.ARPSize)) pkt.NetworkProtocolNumber = ProtocolNumber h.SetIPv4OverEthernet() h.SetOp(header.ARPRequest) // TODO(gvisor.dev/issue/4582): check copied length once TAP devices have a // link address. 
_ = copy(h.HardwareAddressSender(), e.nic.LinkAddress()) if n := copy(h.ProtocolAddressSender(), localAddr.AsSlice()); n != header.IPv4AddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize)) } if n := copy(h.ProtocolAddressTarget(), targetAddr.AsSlice()); n != header.IPv4AddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize)) } stats := e.stats.arp if err := e.nic.WritePacketToRemote(remoteLinkAddr, pkt); err != nil { stats.outgoingRequestsDropped.Increment() return err } stats.outgoingRequestsSent.Increment() return nil } // ResolveStaticAddress implements stack.LinkAddressResolver.ResolveStaticAddress. func (*endpoint) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { if addr == header.IPv4Broadcast { return header.EthernetBroadcastAddress, true } if header.IsV4MulticastAddress(addr) { return header.EthernetAddressFromMulticastIPv4Address(addr), true } return tcpip.LinkAddress([]byte(nil)), false } // SetOption implements stack.NetworkProtocol.SetOption. func (*protocol) SetOption(tcpip.SettableNetworkProtocolOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // Option implements stack.NetworkProtocol.Option. func (*protocol) Option(tcpip.GettableNetworkProtocolOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // Close implements stack.TransportProtocol.Close. func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} // Parse implements stack.NetworkProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { return 0, false, parse.ARP(pkt) } // Options holds options to configure a protocol. // // +stateify savable type Options struct { // DADConfigs is the default DAD configurations used by ARP endpoints. DADConfigs stack.DADConfigurations } // NewProtocolWithOptions returns an ARP network protocol factory that // will return an ARP network protocol with the provided options. func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory { return func(s *stack.Stack) stack.NetworkProtocol { return &protocol{ stack: s, options: opts, } } } // NewProtocol returns an ARP network protocol. func NewProtocol(s *stack.Stack) stack.NetworkProtocol { return NewProtocolWithOptions(Options{})(s) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/arp/arp_state_autogen.go000066400000000000000000000131771465435605700270620ustar00rootroot00000000000000// automatically generated by stateify. 
package arp import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *endpoint) StateTypeName() string { return "pkg/tcpip/network/arp.endpoint" } func (e *endpoint) StateFields() []string { return []string{ "protocol", "enabled", "nic", "stats", "dad", } } func (e *endpoint) beforeSave() {} // +checklocksignore func (e *endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.protocol) stateSinkObject.Save(1, &e.enabled) stateSinkObject.Save(2, &e.nic) stateSinkObject.Save(3, &e.stats) stateSinkObject.Save(4, &e.dad) } func (e *endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.protocol) stateSourceObject.Load(1, &e.enabled) stateSourceObject.Load(2, &e.nic) stateSourceObject.Load(3, &e.stats) stateSourceObject.Load(4, &e.dad) } func (p *protocol) StateTypeName() string { return "pkg/tcpip/network/arp.protocol" } func (p *protocol) StateFields() []string { return []string{ "stack", "options", } } func (p *protocol) beforeSave() {} // +checklocksignore func (p *protocol) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.stack) stateSinkObject.Save(1, &p.options) } func (p *protocol) afterLoad(context.Context) {} // +checklocksignore func (p *protocol) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.stack) stateSourceObject.Load(1, &p.options) } func (o *Options) StateTypeName() string { return "pkg/tcpip/network/arp.Options" } func (o *Options) StateFields() []string { return []string{ "DADConfigs", } } func (o *Options) beforeSave() {} // +checklocksignore func (o *Options) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.DADConfigs) } func (o *Options) afterLoad(context.Context) {} // +checklocksignore func (o *Options) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.DADConfigs) } func (s *Stats) StateTypeName() string { return "pkg/tcpip/network/arp.Stats" } func (s *Stats) StateFields() []string { return []string{ "ARP", } } func (s *Stats) beforeSave() {} // +checklocksignore func (s *Stats) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.ARP) } func (s *Stats) afterLoad(context.Context) {} // +checklocksignore func (s *Stats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.ARP) } func (s *sharedStats) StateTypeName() string { return "pkg/tcpip/network/arp.sharedStats" } func (s *sharedStats) StateFields() []string { return []string{ "localStats", "arp", } } func (s *sharedStats) beforeSave() {} // +checklocksignore func (s *sharedStats) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.localStats) stateSinkObject.Save(1, &s.arp) } func (s *sharedStats) afterLoad(context.Context) {} // +checklocksignore func (s *sharedStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.localStats) stateSourceObject.Load(1, &s.arp) } func (m *multiCounterARPStats) StateTypeName() string { return "pkg/tcpip/network/arp.multiCounterARPStats" } func (m *multiCounterARPStats) StateFields() []string { return []string{ "packetsReceived", "disabledPacketsReceived", "malformedPacketsReceived", "requestsReceived", "requestsReceivedUnknownTargetAddress", "outgoingRequestInterfaceHasNoLocalAddressErrors", "outgoingRequestBadLocalAddressErrors", 
"outgoingRequestsDropped", "outgoingRequestsSent", "repliesReceived", "outgoingRepliesDropped", "outgoingRepliesSent", } } func (m *multiCounterARPStats) beforeSave() {} // +checklocksignore func (m *multiCounterARPStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.packetsReceived) stateSinkObject.Save(1, &m.disabledPacketsReceived) stateSinkObject.Save(2, &m.malformedPacketsReceived) stateSinkObject.Save(3, &m.requestsReceived) stateSinkObject.Save(4, &m.requestsReceivedUnknownTargetAddress) stateSinkObject.Save(5, &m.outgoingRequestInterfaceHasNoLocalAddressErrors) stateSinkObject.Save(6, &m.outgoingRequestBadLocalAddressErrors) stateSinkObject.Save(7, &m.outgoingRequestsDropped) stateSinkObject.Save(8, &m.outgoingRequestsSent) stateSinkObject.Save(9, &m.repliesReceived) stateSinkObject.Save(10, &m.outgoingRepliesDropped) stateSinkObject.Save(11, &m.outgoingRepliesSent) } func (m *multiCounterARPStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterARPStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.packetsReceived) stateSourceObject.Load(1, &m.disabledPacketsReceived) stateSourceObject.Load(2, &m.malformedPacketsReceived) stateSourceObject.Load(3, &m.requestsReceived) stateSourceObject.Load(4, &m.requestsReceivedUnknownTargetAddress) stateSourceObject.Load(5, &m.outgoingRequestInterfaceHasNoLocalAddressErrors) stateSourceObject.Load(6, &m.outgoingRequestBadLocalAddressErrors) stateSourceObject.Load(7, &m.outgoingRequestsDropped) stateSourceObject.Load(8, &m.outgoingRequestsSent) stateSourceObject.Load(9, &m.repliesReceived) stateSourceObject.Load(10, &m.outgoingRepliesDropped) stateSourceObject.Load(11, &m.outgoingRepliesSent) } func init() { state.Register((*endpoint)(nil)) state.Register((*protocol)(nil)) state.Register((*Options)(nil)) state.Register((*Stats)(nil)) state.Register((*sharedStats)(nil)) state.Register((*multiCounterARPStats)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/arp/stats.go000066400000000000000000000062621465435605700245110ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package arp import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) var _ stack.NetworkEndpointStats = (*Stats)(nil) // Stats holds statistics related to ARP. // // +stateify savable type Stats struct { // ARP holds ARP statistics. ARP tcpip.ARPStats } // IsNetworkEndpointStats implements stack.NetworkEndpointStats. 
func (*Stats) IsNetworkEndpointStats() {} // +stateify savable type sharedStats struct { localStats Stats arp multiCounterARPStats } // LINT.IfChange(multiCounterARPStats) // +stateify savable type multiCounterARPStats struct { packetsReceived tcpip.MultiCounterStat disabledPacketsReceived tcpip.MultiCounterStat malformedPacketsReceived tcpip.MultiCounterStat requestsReceived tcpip.MultiCounterStat requestsReceivedUnknownTargetAddress tcpip.MultiCounterStat outgoingRequestInterfaceHasNoLocalAddressErrors tcpip.MultiCounterStat outgoingRequestBadLocalAddressErrors tcpip.MultiCounterStat outgoingRequestsDropped tcpip.MultiCounterStat outgoingRequestsSent tcpip.MultiCounterStat repliesReceived tcpip.MultiCounterStat outgoingRepliesDropped tcpip.MultiCounterStat outgoingRepliesSent tcpip.MultiCounterStat } func (m *multiCounterARPStats) init(a, b *tcpip.ARPStats) { m.packetsReceived.Init(a.PacketsReceived, b.PacketsReceived) m.disabledPacketsReceived.Init(a.DisabledPacketsReceived, b.DisabledPacketsReceived) m.malformedPacketsReceived.Init(a.MalformedPacketsReceived, b.MalformedPacketsReceived) m.requestsReceived.Init(a.RequestsReceived, b.RequestsReceived) m.requestsReceivedUnknownTargetAddress.Init(a.RequestsReceivedUnknownTargetAddress, b.RequestsReceivedUnknownTargetAddress) m.outgoingRequestInterfaceHasNoLocalAddressErrors.Init(a.OutgoingRequestInterfaceHasNoLocalAddressErrors, b.OutgoingRequestInterfaceHasNoLocalAddressErrors) m.outgoingRequestBadLocalAddressErrors.Init(a.OutgoingRequestBadLocalAddressErrors, b.OutgoingRequestBadLocalAddressErrors) m.outgoingRequestsDropped.Init(a.OutgoingRequestsDropped, b.OutgoingRequestsDropped) m.outgoingRequestsSent.Init(a.OutgoingRequestsSent, b.OutgoingRequestsSent) m.repliesReceived.Init(a.RepliesReceived, b.RepliesReceived) m.outgoingRepliesDropped.Init(a.OutgoingRepliesDropped, b.OutgoingRepliesDropped) m.outgoingRepliesSent.Init(a.OutgoingRepliesSent, b.OutgoingRepliesSent) } // LINT.ThenChange(../../tcpip.go:ARPStats) golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/hash/000077500000000000000000000000001465435605700231575ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/hash/hash.go000066400000000000000000000051621465435605700244350ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package hash contains utility functions for hashing. package hash import ( "encoding/binary" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/tcpip/header" ) var hashIV = RandN32(1)[0] // RandN32 generates a slice of n cryptographic random 32-bit numbers. func RandN32(n int) []uint32 { b := make([]byte, 4*n) if _, err := rand.Read(b); err != nil { panic("unable to get random numbers: " + err.Error()) } r := make([]uint32, n) for i := range r { r[i] = binary.LittleEndian.Uint32(b[4*i : (4*i + 4)]) } return r } // Hash3Words calculates the Jenkins hash of 3 32-bit words. This is adapted // from linux. 
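// initval seeds the hash; the fragment hash helpers below pass hashIV, which
// is drawn from the cryptographic RNG at package initialization.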
func Hash3Words(a, b, c, initval uint32) uint32 { const iv = 0xdeadbeef + (3 << 2) initval += iv a += initval b += initval c += initval c ^= b c -= rol32(b, 14) a ^= c a -= rol32(c, 11) b ^= a b -= rol32(a, 25) c ^= b c -= rol32(b, 16) a ^= c a -= rol32(c, 4) b ^= a b -= rol32(a, 14) c ^= b c -= rol32(b, 24) return c } // IPv4FragmentHash computes the hash of the IPv4 fragment as suggested in RFC 791. func IPv4FragmentHash(h header.IPv4) uint32 { x := uint32(h.ID())<<16 | uint32(h.Protocol()) t := h.SourceAddress().As4() y := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 t = h.DestinationAddress().As4() z := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 return Hash3Words(x, y, z, hashIV) } // IPv6FragmentHash computes the hash of the ipv6 fragment. // Unlike IPv4, the protocol is not used to compute the hash. // RFC 2640 (sec 4.5) is not very sharp on this aspect. // As a reference, also Linux ignores the protocol to compute // the hash (inet6_hash_frag). func IPv6FragmentHash(h header.IPv6, id uint32) uint32 { t := h.SourceAddress().As16() y := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 t = h.DestinationAddress().As16() z := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 return Hash3Words(id, y, z, hashIV) } func rol32(v, shift uint32) uint32 { return (v << shift) | (v >> ((-shift) & 31)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/hash/hash_state_autogen.go000066400000000000000000000000661465435605700273550ustar00rootroot00000000000000// automatically generated by stateify. package hash golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/000077500000000000000000000000001465435605700240505ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/fragmentation/000077500000000000000000000000001465435605700267065ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/fragmentation/fragmentation.go000066400000000000000000000301521465435605700320740ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package fragmentation contains the implementation of IP fragmentation. // It is based on RFC 791, RFC 815 and RFC 8200. package fragmentation import ( "errors" "fmt" "time" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // HighFragThreshold is the threshold at which we start trimming old // fragmented packets. Linux uses a default value of 4 MB. See // net.ipv4.ipfrag_high_thresh for more information. HighFragThreshold = 4 << 20 // 4MB // LowFragThreshold is the threshold we reach to when we start dropping // older fragmented packets. It's important that we keep enough room for newer // packets to be re-assembled. Hence, this needs to be lower than // HighFragThreshold enough. Linux uses a default value of 3 MB. 
See // net.ipv4.ipfrag_low_thresh for more information. LowFragThreshold = 3 << 20 // 3MB // minBlockSize is the minimum block size for fragments. minBlockSize = 1 ) var ( // ErrInvalidArgs indicates to the caller that an invalid argument was // provided. ErrInvalidArgs = errors.New("invalid args") // ErrFragmentOverlap indicates that, during reassembly, a fragment overlaps // with another one. ErrFragmentOverlap = errors.New("overlapping fragments") // ErrFragmentConflict indicates that, during reassembly, some fragments are // in conflict with one another. ErrFragmentConflict = errors.New("conflicting fragments") ) // FragmentID is the identifier for a fragment. // // +stateify savable type FragmentID struct { // Source is the source address of the fragment. Source tcpip.Address // Destination is the destination address of the fragment. Destination tcpip.Address // ID is the identification value of the fragment. // // This is a uint32 because IPv6 uses a 32-bit identification value. ID uint32 // The protocol for the packet. Protocol uint8 } // Fragmentation is the main structure that other modules // of the stack should use to implement IP Fragmentation. // // +stateify savable type Fragmentation struct { mu sync.Mutex `state:"nosave"` highLimit int lowLimit int reassemblers map[FragmentID]*reassembler rList reassemblerList memSize int timeout time.Duration blockSize uint16 clock tcpip.Clock releaseJob *tcpip.Job timeoutHandler TimeoutHandler } // TimeoutHandler is consulted if a packet reassembly has timed out. type TimeoutHandler interface { // OnReassemblyTimeout will be called with the first fragment (or nil, if the // first fragment has not been received) of a packet whose reassembly has // timed out. OnReassemblyTimeout(pkt *stack.PacketBuffer) } // NewFragmentation creates a new Fragmentation. // // blockSize specifies the fragment block size, in bytes. // // highMemoryLimit specifies the limit on the memory consumed // by the fragments stored by Fragmentation (overhead of internal data-structures // is not accounted). Fragments are dropped when the limit is reached. // // lowMemoryLimit specifies the limit on which we will reach by dropping // fragments after reaching highMemoryLimit. // // reassemblingTimeout specifies the maximum time allowed to reassemble a packet. // Fragments are lazily evicted only when a new a packet with an // already existing fragmentation-id arrives after the timeout. func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration, clock tcpip.Clock, timeoutHandler TimeoutHandler) *Fragmentation { if lowMemoryLimit >= highMemoryLimit { lowMemoryLimit = highMemoryLimit } if lowMemoryLimit < 0 { lowMemoryLimit = 0 } if blockSize < minBlockSize { blockSize = minBlockSize } f := &Fragmentation{ reassemblers: make(map[FragmentID]*reassembler), highLimit: highMemoryLimit, lowLimit: lowMemoryLimit, timeout: reassemblingTimeout, blockSize: blockSize, clock: clock, timeoutHandler: timeoutHandler, } f.releaseJob = tcpip.NewJob(f.clock, &f.mu, f.releaseReassemblersLocked) return f } // Process processes an incoming fragment belonging to an ID and returns a // complete packet and its protocol number when all the packets belonging to // that ID have been received. // // [first, last] is the range of the fragment bytes. // // first must be a multiple of the block size f is configured with. 
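// (For example, with a block size of 8, the unit used for IPv4 and IPv6
// fragment offsets, a non-final fragment carrying bytes 0 through 1479 of the
// fragmentable payload is passed as first = 0, last = 1479 and more = true.)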
The size // of the fragment data must be a multiple of the block size, unless there are // no fragments following this fragment (more set to false). // // proto is the protocol number marked in the fragment being processed. It has // to be given here outside of the FragmentID struct because IPv6 should not use // the protocol to identify a fragment. func (f *Fragmentation) Process( id FragmentID, first, last uint16, more bool, proto uint8, pkt *stack.PacketBuffer) ( *stack.PacketBuffer, uint8, bool, error) { if first > last { return nil, 0, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs) } if first%f.blockSize != 0 { return nil, 0, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs) } fragmentSize := last - first + 1 if more && fragmentSize%f.blockSize != 0 { return nil, 0, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs) } if l := pkt.Data().Size(); l != int(fragmentSize) { return nil, 0, false, fmt.Errorf("got fragment size=%d bytes not equal to the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs) } f.mu.Lock() if f.reassemblers == nil { return nil, 0, false, fmt.Errorf("Release() called before fragmentation processing could finish") } r, ok := f.reassemblers[id] if !ok { r = newReassembler(id, f.clock) f.reassemblers[id] = r wasEmpty := f.rList.Empty() f.rList.PushFront(r) if wasEmpty { // If we have just pushed a first reassembler into an empty list, we // should kickstart the release job. The release job will keep // rescheduling itself until the list becomes empty. f.releaseReassemblersLocked() } } f.mu.Unlock() resPkt, firstFragmentProto, done, memConsumed, err := r.process(first, last, more, proto, pkt) if err != nil { // We probably got an invalid sequence of fragments. Just // discard the reassembler and move on. f.mu.Lock() f.release(r, false /* timedOut */) f.mu.Unlock() return nil, 0, false, fmt.Errorf("fragmentation processing error: %w", err) } f.mu.Lock() f.memSize += memConsumed if done { f.release(r, false /* timedOut */) } // Evict reassemblers if we are consuming more memory than highLimit until // we reach lowLimit. if f.memSize > f.highLimit { for f.memSize > f.lowLimit { tail := f.rList.Back() if tail == nil { break } f.release(tail, false /* timedOut */) } } f.mu.Unlock() return resPkt, firstFragmentProto, done, nil } // Release releases all underlying resources. func (f *Fragmentation) Release() { f.mu.Lock() defer f.mu.Unlock() for _, r := range f.reassemblers { f.release(r, false /* timedOut */) } f.reassemblers = nil } func (f *Fragmentation) release(r *reassembler, timedOut bool) { // Before releasing a fragment we need to check if r is already marked as done. // Otherwise, we would delete it twice. if r.checkDoneOrMark() { return } delete(f.reassemblers, r.id) f.rList.Remove(r) f.memSize -= r.memSize if f.memSize < 0 { log.Warningf("memory counter < 0 (%d), this is an accounting bug that requires investigation", f.memSize) f.memSize = 0 } if h := f.timeoutHandler; timedOut && h != nil { h.OnReassemblyTimeout(r.pkt) } if r.pkt != nil { r.pkt.DecRef() r.pkt = nil } for _, h := range r.holes { if h.pkt != nil { h.pkt.DecRef() h.pkt = nil } } r.holes = nil } // releaseReassemblersLocked releases already-expired reassemblers, then // schedules the job to call back itself for the remaining reassemblers if // any. 
This function must be called with f.mu locked. func (f *Fragmentation) releaseReassemblersLocked() { now := f.clock.NowMonotonic() for { // The reassembler at the end of the list is the oldest. r := f.rList.Back() if r == nil { // The list is empty. break } elapsed := now.Sub(r.createdAt) if f.timeout > elapsed { // If the oldest reassembler has not expired, schedule the release // job so that this function is called back when it has expired. f.releaseJob.Schedule(f.timeout - elapsed) break } // If the oldest reassembler has already expired, release it. f.release(r, true /* timedOut*/) } } // PacketFragmenter is the book-keeping struct for packet fragmentation. type PacketFragmenter struct { transportHeader []byte data buffer.Buffer reserve int fragmentPayloadLen int fragmentCount int currentFragment int fragmentOffset int } // MakePacketFragmenter prepares the struct needed for packet fragmentation. // // pkt is the packet to be fragmented. // // fragmentPayloadLen is the maximum number of bytes of fragmentable data a fragment can // have. // // reserve is the number of bytes that should be reserved for the headers in // each generated fragment. func MakePacketFragmenter(pkt *stack.PacketBuffer, fragmentPayloadLen uint32, reserve int) PacketFragmenter { // As per RFC 8200 Section 4.5, some IPv6 extension headers should not be // repeated in each fragment. However we do not currently support any header // of that kind yet, so the following computation is valid for both IPv4 and // IPv6. // TODO(gvisor.dev/issue/3912): Once Authentication or ESP Headers are // supported for outbound packets, the fragmentable data should not include // these headers. var fragmentableData buffer.Buffer fragmentableData.Append(pkt.TransportHeader().View()) pktBuf := pkt.Data().ToBuffer() fragmentableData.Merge(&pktBuf) fragmentCount := (uint32(fragmentableData.Size()) + fragmentPayloadLen - 1) / fragmentPayloadLen return PacketFragmenter{ data: fragmentableData, reserve: reserve, fragmentPayloadLen: int(fragmentPayloadLen), fragmentCount: int(fragmentCount), } } // BuildNextFragment returns a packet with the payload of the next fragment, // along with the fragment's offset, the number of bytes copied and a boolean // indicating if there are more fragments left or not. If this function is // called again after it indicated that no more fragments were left, it will // panic. // // Note that the returned packet will not have its network and link headers // populated, but space for them will be reserved. The transport header will be // stored in the packet's data. func (pf *PacketFragmenter) BuildNextFragment() (*stack.PacketBuffer, int, int, bool) { if pf.currentFragment >= pf.fragmentCount { panic("BuildNextFragment should not be called again after the last fragment was returned") } fragPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: pf.reserve, }) // Copy data for the fragment. copied := fragPkt.Data().ReadFrom(&pf.data, pf.fragmentPayloadLen) offset := pf.fragmentOffset pf.fragmentOffset += copied pf.currentFragment++ more := pf.currentFragment != pf.fragmentCount return fragPkt, offset, copied, more } // RemainingFragmentCount returns the number of fragments left to be built. func (pf *PacketFragmenter) RemainingFragmentCount() int { return pf.fragmentCount - pf.currentFragment } // Release frees resources owned by the packet fragmenter. 
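//
// A typical fragmentation loop defers Release right after constructing the
// fragmenter (a sketch; payloadLen, reserve and send are placeholders):
//
//	pf := MakePacketFragmenter(pkt, payloadLen, reserve)
//	defer pf.Release()
//	for {
//		fragPkt, offset, copied, more := pf.BuildNextFragment()
//		send(fragPkt, offset, copied)
//		fragPkt.DecRef() // packet buffers are reference counted
//		if !more {
//			break
//		}
//	}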
func (pf *PacketFragmenter) Release() { pf.data.Release() } fragmentation_state_autogen.go000066400000000000000000000137461465435605700347510ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/fragmentation// automatically generated by stateify. package fragmentation import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (f *FragmentID) StateTypeName() string { return "pkg/tcpip/network/internal/fragmentation.FragmentID" } func (f *FragmentID) StateFields() []string { return []string{ "Source", "Destination", "ID", "Protocol", } } func (f *FragmentID) beforeSave() {} // +checklocksignore func (f *FragmentID) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.Source) stateSinkObject.Save(1, &f.Destination) stateSinkObject.Save(2, &f.ID) stateSinkObject.Save(3, &f.Protocol) } func (f *FragmentID) afterLoad(context.Context) {} // +checklocksignore func (f *FragmentID) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.Source) stateSourceObject.Load(1, &f.Destination) stateSourceObject.Load(2, &f.ID) stateSourceObject.Load(3, &f.Protocol) } func (f *Fragmentation) StateTypeName() string { return "pkg/tcpip/network/internal/fragmentation.Fragmentation" } func (f *Fragmentation) StateFields() []string { return []string{ "highLimit", "lowLimit", "reassemblers", "rList", "memSize", "timeout", "blockSize", "clock", "releaseJob", "timeoutHandler", } } func (f *Fragmentation) beforeSave() {} // +checklocksignore func (f *Fragmentation) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.highLimit) stateSinkObject.Save(1, &f.lowLimit) stateSinkObject.Save(2, &f.reassemblers) stateSinkObject.Save(3, &f.rList) stateSinkObject.Save(4, &f.memSize) stateSinkObject.Save(5, &f.timeout) stateSinkObject.Save(6, &f.blockSize) stateSinkObject.Save(7, &f.clock) stateSinkObject.Save(8, &f.releaseJob) stateSinkObject.Save(9, &f.timeoutHandler) } func (f *Fragmentation) afterLoad(context.Context) {} // +checklocksignore func (f *Fragmentation) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.highLimit) stateSourceObject.Load(1, &f.lowLimit) stateSourceObject.Load(2, &f.reassemblers) stateSourceObject.Load(3, &f.rList) stateSourceObject.Load(4, &f.memSize) stateSourceObject.Load(5, &f.timeout) stateSourceObject.Load(6, &f.blockSize) stateSourceObject.Load(7, &f.clock) stateSourceObject.Load(8, &f.releaseJob) stateSourceObject.Load(9, &f.timeoutHandler) } func (h *hole) StateTypeName() string { return "pkg/tcpip/network/internal/fragmentation.hole" } func (h *hole) StateFields() []string { return []string{ "first", "last", "filled", "final", "pkt", } } func (h *hole) beforeSave() {} // +checklocksignore func (h *hole) StateSave(stateSinkObject state.Sink) { h.beforeSave() stateSinkObject.Save(0, &h.first) stateSinkObject.Save(1, &h.last) stateSinkObject.Save(2, &h.filled) stateSinkObject.Save(3, &h.final) stateSinkObject.Save(4, &h.pkt) } func (h *hole) afterLoad(context.Context) {} // +checklocksignore func (h *hole) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &h.first) stateSourceObject.Load(1, &h.last) stateSourceObject.Load(2, &h.filled) stateSourceObject.Load(3, &h.final) stateSourceObject.Load(4, &h.pkt) } func (r *reassembler) StateTypeName() string { return "pkg/tcpip/network/internal/fragmentation.reassembler" } func (r *reassembler) StateFields() []string { return []string{ 
"reassemblerEntry", "id", "memSize", "proto", "holes", "filled", "done", "createdAt", "pkt", } } func (r *reassembler) beforeSave() {} // +checklocksignore func (r *reassembler) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.reassemblerEntry) stateSinkObject.Save(1, &r.id) stateSinkObject.Save(2, &r.memSize) stateSinkObject.Save(3, &r.proto) stateSinkObject.Save(4, &r.holes) stateSinkObject.Save(5, &r.filled) stateSinkObject.Save(6, &r.done) stateSinkObject.Save(7, &r.createdAt) stateSinkObject.Save(8, &r.pkt) } func (r *reassembler) afterLoad(context.Context) {} // +checklocksignore func (r *reassembler) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.reassemblerEntry) stateSourceObject.Load(1, &r.id) stateSourceObject.Load(2, &r.memSize) stateSourceObject.Load(3, &r.proto) stateSourceObject.Load(4, &r.holes) stateSourceObject.Load(5, &r.filled) stateSourceObject.Load(6, &r.done) stateSourceObject.Load(7, &r.createdAt) stateSourceObject.Load(8, &r.pkt) } func (l *reassemblerList) StateTypeName() string { return "pkg/tcpip/network/internal/fragmentation.reassemblerList" } func (l *reassemblerList) StateFields() []string { return []string{ "head", "tail", } } func (l *reassemblerList) beforeSave() {} // +checklocksignore func (l *reassemblerList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *reassemblerList) afterLoad(context.Context) {} // +checklocksignore func (l *reassemblerList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *reassemblerEntry) StateTypeName() string { return "pkg/tcpip/network/internal/fragmentation.reassemblerEntry" } func (e *reassemblerEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *reassemblerEntry) beforeSave() {} // +checklocksignore func (e *reassemblerEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *reassemblerEntry) afterLoad(context.Context) {} // +checklocksignore func (e *reassemblerEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func init() { state.Register((*FragmentID)(nil)) state.Register((*Fragmentation)(nil)) state.Register((*hole)(nil)) state.Register((*reassembler)(nil)) state.Register((*reassemblerList)(nil)) state.Register((*reassemblerEntry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/fragmentation/reassembler.go000066400000000000000000000122751465435605700315500ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package fragmentation import ( "math" "sort" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // +stateify savable type hole struct { first uint16 last uint16 filled bool final bool // pkt is the fragment packet if hole is filled. We keep the whole pkt rather // than the fragmented payload to prevent binding to specific buffer types. pkt *stack.PacketBuffer } // +stateify savable type reassembler struct { reassemblerEntry id FragmentID memSize int proto uint8 mu sync.Mutex `state:"nosave"` holes []hole filled int done bool createdAt tcpip.MonotonicTime pkt *stack.PacketBuffer } func newReassembler(id FragmentID, clock tcpip.Clock) *reassembler { r := &reassembler{ id: id, createdAt: clock.NowMonotonic(), } r.holes = append(r.holes, hole{ first: 0, last: math.MaxUint16, filled: false, final: true, }) return r } func (r *reassembler) process(first, last uint16, more bool, proto uint8, pkt *stack.PacketBuffer) (*stack.PacketBuffer, uint8, bool, int, error) { r.mu.Lock() defer r.mu.Unlock() if r.done { // A concurrent goroutine might have already reassembled // the packet and emptied the heap while this goroutine // was waiting on the mutex. We don't have to do anything in this case. return nil, 0, false, 0, nil } var holeFound bool var memConsumed int for i := range r.holes { currentHole := &r.holes[i] if last < currentHole.first || currentHole.last < first { continue } // For IPv6, overlaps with an existing fragment are explicitly forbidden by // RFC 8200 section 4.5: // If any of the fragments being reassembled overlap with any other // fragments being reassembled for the same packet, reassembly of that // packet must be abandoned and all the fragments that have been received // for that packet must be discarded, and no ICMP error messages should be // sent. // // It is not explicitly forbidden for IPv4, but to keep parity with Linux we // disallow it as well: // https://github.com/torvalds/linux/blob/38525c6/net/ipv4/inet_fragment.c#L349 if first < currentHole.first || currentHole.last < last { // Incoming fragment only partially fits in the free hole. return nil, 0, false, 0, ErrFragmentOverlap } if !more { if !currentHole.final || currentHole.filled && currentHole.last != last { // We have another final fragment, which does not perfectly overlap. return nil, 0, false, 0, ErrFragmentConflict } } holeFound = true if currentHole.filled { // Incoming fragment is a duplicate. continue } // We are populating the current hole with the payload and creating a new // hole for any unfilled ranges on either end. if first > currentHole.first { r.holes = append(r.holes, hole{ first: currentHole.first, last: first - 1, filled: false, final: false, }) } if last < currentHole.last && more { r.holes = append(r.holes, hole{ first: last + 1, last: currentHole.last, filled: false, final: currentHole.final, }) currentHole.final = false } memConsumed = pkt.MemSize() r.memSize += memConsumed // Update the current hole to precisely match the incoming fragment. r.holes[i] = hole{ first: first, last: last, filled: true, final: currentHole.final, pkt: pkt.IncRef(), } r.filled++ // For IPv6, it is possible to have different Protocol values between // fragments of a packet (because, unlike IPv4, the Protocol is not used to // identify a fragment). In this case, only the Protocol of the first // fragment must be used as per RFC 8200 Section 4.5. 
// // TODO(gvisor.dev/issue/3648): During reassembly of an IPv6 packet, IP // options received in the first fragment should be used - and they should // override options from following fragments. if first == 0 { if r.pkt != nil { r.pkt.DecRef() } r.pkt = pkt.IncRef() r.proto = proto } break } if !holeFound { // Incoming fragment is beyond end. return nil, 0, false, 0, ErrFragmentConflict } // Check if all the holes have been filled and we are ready to reassemble. if r.filled < len(r.holes) { return nil, 0, false, memConsumed, nil } sort.Slice(r.holes, func(i, j int) bool { return r.holes[i].first < r.holes[j].first }) resPkt := r.holes[0].pkt.Clone() for i := 1; i < len(r.holes); i++ { stack.MergeFragment(resPkt, r.holes[i].pkt) } return resPkt, r.proto, true /* done */, memConsumed, nil } func (r *reassembler) checkDoneOrMark() bool { r.mu.Lock() prev := r.done r.done = true r.mu.Unlock() return prev } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/fragmentation/reassembler_list.go000066400000000000000000000124501465435605700325760ustar00rootroot00000000000000package fragmentation // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type reassemblerElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (reassemblerElementMapper) linkerFor(elem *reassembler) *reassembler { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type reassemblerList struct { head *reassembler tail *reassembler } // Reset resets list l to the empty state. func (l *reassemblerList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *reassemblerList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *reassemblerList) Front() *reassembler { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *reassemblerList) Back() *reassembler { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *reassemblerList) Len() (count int) { for e := l.Front(); e != nil; e = (reassemblerElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *reassemblerList) PushFront(e *reassembler) { linker := reassemblerElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { reassemblerElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. 
// //go:nosplit func (l *reassemblerList) PushFrontList(m *reassemblerList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { reassemblerElementMapper{}.linkerFor(l.head).SetPrev(m.tail) reassemblerElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *reassemblerList) PushBack(e *reassembler) { linker := reassemblerElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { reassemblerElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *reassemblerList) PushBackList(m *reassemblerList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { reassemblerElementMapper{}.linkerFor(l.tail).SetNext(m.head) reassemblerElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *reassemblerList) InsertAfter(b, e *reassembler) { bLinker := reassemblerElementMapper{}.linkerFor(b) eLinker := reassemblerElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { reassemblerElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *reassemblerList) InsertBefore(a, e *reassembler) { aLinker := reassemblerElementMapper{}.linkerFor(a) eLinker := reassemblerElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { reassemblerElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *reassemblerList) Remove(e *reassembler) { linker := reassemblerElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { reassemblerElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { reassemblerElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type reassemblerEntry struct { next *reassembler prev *reassembler } // Next returns the entry that follows e in the list. // //go:nosplit func (e *reassemblerEntry) Next() *reassembler { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *reassemblerEntry) Prev() *reassembler { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *reassemblerEntry) SetNext(elem *reassembler) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *reassemblerEntry) SetPrev(elem *reassembler) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/ip/000077500000000000000000000000001465435605700244605ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/ip/duplicate_address_detection.go000066400000000000000000000177231465435605700325360ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package ip holds IPv4/IPv6 common utilities. package ip import ( "bytes" "fmt" "io" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) type extendRequest int const ( notRequested extendRequest = iota requested extended ) // +stateify savable type dadState struct { nonce []byte extendRequest extendRequest done *bool timer tcpip.Timer completionHandlers []stack.DADCompletionHandler } // DADProtocol is a protocol whose core state machine can be represented by DAD. type DADProtocol interface { // SendDADMessage attempts to send a DAD probe message. SendDADMessage(tcpip.Address, []byte) tcpip.Error } // DADOptions holds options for DAD. // // +stateify savable type DADOptions struct { Clock tcpip.Clock // TODO(b/341946753): Restore when netstack is savable. SecureRNG io.Reader `state:"nosave"` NonceSize uint8 ExtendDADTransmits uint8 Protocol DADProtocol NICID tcpip.NICID } // DAD performs duplicate address detection for addresses. // // +stateify savable type DAD struct { opts DADOptions configs stack.DADConfigurations protocolMU sync.Locker `state:"nosave"` addresses map[tcpip.Address]dadState } // Init initializes the DAD state. // // Must only be called once for the lifetime of d; Init will panic if it is // called twice. // // The lock will only be taken when timers fire. func (d *DAD) Init(protocolMU sync.Locker, configs stack.DADConfigurations, opts DADOptions) { if d.addresses != nil { panic("attempted to initialize DAD state twice") } if opts.NonceSize != 0 && opts.ExtendDADTransmits == 0 { panic(fmt.Sprintf("given a non-zero value for NonceSize (%d) but zero for ExtendDADTransmits", opts.NonceSize)) } configs.Validate() *d = DAD{ opts: opts, configs: configs, protocolMU: protocolMU, addresses: make(map[tcpip.Address]dadState), } } // CheckDuplicateAddressLocked performs DAD for an address, calling the // completion handler once DAD resolves. // // If DAD is already performing for the provided address, h will be called when // the currently running process completes. // // Precondition: d.protocolMU must be locked. func (d *DAD) CheckDuplicateAddressLocked(addr tcpip.Address, h stack.DADCompletionHandler) stack.DADCheckAddressDisposition { if d.configs.DupAddrDetectTransmits == 0 { return stack.DADDisabled } ret := stack.DADAlreadyRunning s, ok := d.addresses[addr] if !ok { ret = stack.DADStarting remaining := d.configs.DupAddrDetectTransmits // Protected by d.protocolMU. 
done := false s = dadState{ done: &done, timer: d.opts.Clock.AfterFunc(0, func() { dadDone := remaining == 0 nonce, earlyReturn := func() ([]byte, bool) { d.protocolMU.Lock() defer d.protocolMU.Unlock() if done { return nil, true } s, ok := d.addresses[addr] if !ok { panic(fmt.Sprintf("dad: timer fired but missing state for %s on NIC(%d)", addr, d.opts.NICID)) } // As per RFC 7527 section 4 // // If any probe is looped back within RetransTimer milliseconds // after having sent DupAddrDetectTransmits NS(DAD) messages, the // interface continues with another MAX_MULTICAST_SOLICIT number of // NS(DAD) messages transmitted RetransTimer milliseconds apart. if dadDone && s.extendRequest == requested { dadDone = false remaining = d.opts.ExtendDADTransmits s.extendRequest = extended } if !dadDone && d.opts.NonceSize != 0 { if s.nonce == nil { s.nonce = make([]byte, d.opts.NonceSize) } if n, err := io.ReadFull(d.opts.SecureRNG, s.nonce); err != nil { panic(fmt.Sprintf("SecureRNG.Read(...): %s", err)) } else if n != len(s.nonce) { panic(fmt.Sprintf("expected to read %d bytes from secure RNG, only read %d bytes", len(s.nonce), n)) } } d.addresses[addr] = s return s.nonce, false }() if earlyReturn { return } var err tcpip.Error if !dadDone { err = d.opts.Protocol.SendDADMessage(addr, nonce) } d.protocolMU.Lock() defer d.protocolMU.Unlock() if done { return } s, ok := d.addresses[addr] if !ok { panic(fmt.Sprintf("dad: timer fired but missing state for %s on NIC(%d)", addr, d.opts.NICID)) } if !dadDone && err == nil { remaining-- s.timer.Reset(d.configs.RetransmitTimer) return } // At this point we know that either DAD has resolved or we hit an error // sending the last DAD message. Either way, clear the DAD state. done = false s.timer.Stop() delete(d.addresses, addr) var res stack.DADResult = &stack.DADSucceeded{} if err != nil { res = &stack.DADError{Err: err} } for _, h := range s.completionHandlers { h(res) } }), } } s.completionHandlers = append(s.completionHandlers, h) d.addresses[addr] = s return ret } // ExtendIfNonceEqualLockedDisposition enumerates the possible results from // ExtendIfNonceEqualLocked. type ExtendIfNonceEqualLockedDisposition int const ( // Extended indicates that the DAD process was extended. Extended ExtendIfNonceEqualLockedDisposition = iota // AlreadyExtended indicates that the DAD process was already extended. AlreadyExtended // NoDADStateFound indicates that DAD state was not found for the address. NoDADStateFound // NonceDisabled indicates that nonce values are not sent with DAD messages. NonceDisabled // NonceNotEqual indicates that the nonce value passed and the nonce in the // last send DAD message are not equal. NonceNotEqual ) // ExtendIfNonceEqualLocked extends the DAD process if the provided nonce is the // same as the nonce sent in the last DAD message. // // Precondition: d.protocolMU must be locked. func (d *DAD) ExtendIfNonceEqualLocked(addr tcpip.Address, nonce []byte) ExtendIfNonceEqualLockedDisposition { s, ok := d.addresses[addr] if !ok { return NoDADStateFound } if d.opts.NonceSize == 0 { return NonceDisabled } if s.extendRequest != notRequested { return AlreadyExtended } // As per RFC 7527 section 4 // // If any probe is looped back within RetransTimer milliseconds after having // sent DupAddrDetectTransmits NS(DAD) messages, the interface continues // with another MAX_MULTICAST_SOLICIT number of NS(DAD) messages transmitted // RetransTimer milliseconds apart. 
// // If a DAD message has already been sent and the nonce value we observed is // the same as the nonce value we last sent, then we assume our probe was // looped back and request an extension to the DAD process. // // Note, the first DAD message is sent asynchronously so we need to make sure // that we sent a DAD message by checking if we have a nonce value set. if s.nonce != nil && bytes.Equal(s.nonce, nonce) { s.extendRequest = requested d.addresses[addr] = s return Extended } return NonceNotEqual } // StopLocked stops a currently running DAD process. // // Precondition: d.protocolMU must be locked. func (d *DAD) StopLocked(addr tcpip.Address, reason stack.DADResult) { s, ok := d.addresses[addr] if !ok { return } *s.done = true s.timer.Stop() delete(d.addresses, addr) for _, h := range s.completionHandlers { h(reason) } } // SetConfigsLocked sets the DAD configurations. // // Precondition: d.protocolMU must be locked. func (d *DAD) SetConfigsLocked(c stack.DADConfigurations) { c.Validate() d.configs = c } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/ip/errors.go000066400000000000000000000104751465435605700263320ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ip import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" ) // ForwardingError represents an error that occurred while trying to forward // a packet. type ForwardingError interface { isForwardingError() fmt.Stringer } // ErrTTLExceeded indicates that the received packet's TTL has been exceeded. type ErrTTLExceeded struct{} func (*ErrTTLExceeded) isForwardingError() {} func (*ErrTTLExceeded) String() string { return "ttl exceeded" } // ErrOutgoingDeviceNoBufferSpace indicates that the outgoing device does not // have enough space to hold a buffer. type ErrOutgoingDeviceNoBufferSpace struct{} func (*ErrOutgoingDeviceNoBufferSpace) isForwardingError() {} func (*ErrOutgoingDeviceNoBufferSpace) String() string { return "no device buffer space" } // ErrParameterProblem indicates the received packet had a problem with an IP // parameter. type ErrParameterProblem struct{} func (*ErrParameterProblem) isForwardingError() {} func (*ErrParameterProblem) String() string { return "parameter problem" } // ErrInitializingSourceAddress indicates the received packet had a source // address that may only be used on the local network as part of initialization // work. type ErrInitializingSourceAddress struct{} func (*ErrInitializingSourceAddress) isForwardingError() {} func (*ErrInitializingSourceAddress) String() string { return "initializing source address" } // ErrLinkLocalSourceAddress indicates the received packet had a link-local // source address. type ErrLinkLocalSourceAddress struct{} func (*ErrLinkLocalSourceAddress) isForwardingError() {} func (*ErrLinkLocalSourceAddress) String() string { return "link local source address" } // ErrLinkLocalDestinationAddress indicates the received packet had a link-local // destination address. 
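// Illustrative sketch (not part of this package): forwarding code typically
// switches on the concrete ForwardingError type to pick which counter to
// bump. The counter parameters below are hypothetical stand-ins for
// tcpip.StatCounter values kept by the caller.
func exampleCountForwardingError(err ForwardingError, exhaustedTTL, linkLocalSource, otherErrors *tcpip.StatCounter) {
	switch err.(type) {
	case *ErrTTLExceeded:
		exhaustedTTL.Increment()
	case *ErrLinkLocalSourceAddress:
		linkLocalSource.Increment()
	default:
		otherErrors.Increment()
	}
}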
type ErrLinkLocalDestinationAddress struct{} func (*ErrLinkLocalDestinationAddress) isForwardingError() {} func (*ErrLinkLocalDestinationAddress) String() string { return "link local destination address" } // ErrHostUnreachable indicates that the destination host could not be reached. type ErrHostUnreachable struct{} func (*ErrHostUnreachable) isForwardingError() {} func (*ErrHostUnreachable) String() string { return "no route to host" } // ErrMessageTooLong indicates the packet was too big for the outgoing MTU. // // +stateify savable type ErrMessageTooLong struct{} func (*ErrMessageTooLong) isForwardingError() {} func (*ErrMessageTooLong) String() string { return "message too long" } // ErrNoMulticastPendingQueueBufferSpace indicates that a multicast packet // could not be added to the pending packet queue due to insufficient buffer // space. // // +stateify savable type ErrNoMulticastPendingQueueBufferSpace struct{} func (*ErrNoMulticastPendingQueueBufferSpace) isForwardingError() {} func (*ErrNoMulticastPendingQueueBufferSpace) String() string { return "no buffer space" } // ErrUnexpectedMulticastInputInterface indicates that the interface that the // packet arrived on did not match the routes expected input interface. type ErrUnexpectedMulticastInputInterface struct{} func (*ErrUnexpectedMulticastInputInterface) isForwardingError() {} func (*ErrUnexpectedMulticastInputInterface) String() string { return "unexpected input interface" } // ErrUnknownOutputEndpoint indicates that the output endpoint associated with // a route could not be found. type ErrUnknownOutputEndpoint struct{} func (*ErrUnknownOutputEndpoint) isForwardingError() {} func (*ErrUnknownOutputEndpoint) String() string { return "unknown endpoint" } // ErrOther indicates the packet coould not be forwarded for a reason // captured by the contained error. type ErrOther struct { Err tcpip.Error } func (*ErrOther) isForwardingError() {} func (e *ErrOther) String() string { return fmt.Sprintf("other tcpip error: %s", e.Err) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/ip/generic_multicast_protocol.go000066400000000000000000001250171465435605700324370ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ip import ( "fmt" "math/rand" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) const ( // As per RFC 2236 section 3, // // When a host joins a multicast group, it should immediately transmit // an unsolicited Version 2 Membership Report for that group, in case it // is the first member of that group on the network. To cover the // possibility of the initial Membership Report being lost or damaged, // it is recommended that it be repeated once or twice after short // delays [Unsolicited Report Interval]. 
(A simple way to accomplish // this is to send the initial Version 2 Membership Report and then act // as if a Group-Specific Query was received for that group, and set a // timer appropriately). // // As per RFC 2710 section 4, // // When a node starts listening to a multicast address on an interface, // it should immediately transmit an unsolicited Report for that address // on that interface, in case it is the first listener on the link. To // cover the possibility of the initial Report being lost or damaged, it // is recommended that it be repeated once or twice after short delays // [Unsolicited Report Interval]. (A simple way to accomplish this is // to send the initial Report and then act as if a Multicast-Address- // Specific Query was received for that address, and set a timer // appropriately). unsolicitedTransmissionCount = 2 // Responses to queries may be delayed, but we only send a response to a // query once. A response to a query can be handled by any pending // unsolicited transmission count, but we should send at least one report // after sending a query. // // As per RFC 2236 section 3, // // When a host receives a General Query, it sets delay timers for each // group (excluding the all-systems group) of which it is a member on // the interface from which it received the query. // // As per RFC 2710 section 4, // // When a node receives a General Query, it sets a delay timer for each // multicast address to which it is listening on the interface from // which it received the Query, EXCLUDING the link-scope all-nodes // address and any multicast addresses of scope 0 (reserved) or 1 // (node-local). minQueryResponseTransmissionCount = 1 // DefaultRobustnessVariable is the default robustness variable // // As per RFC 3810 section 9.1 (for MLDv2), // // The Robustness Variable allows tuning for the expected packet loss on // a link. If a link is expected to be lossy, the value of the // Robustness Variable may be increased. MLD is robust to [Robustness // Variable] - 1 packet losses. The value of the Robustness Variable // MUST NOT be zero, and SHOULD NOT be one. Default value: 2. // // As per RFC 3376 section 8.1 (for IGMPv3), // // The Robustness Variable allows tuning for the expected packet loss on // a network. If a network is expected to be lossy, the Robustness // Variable may be increased. IGMP is robust to (Robustness Variable - // 1) packet losses. The Robustness Variable MUST NOT be zero, and // SHOULD NOT be one. Default: 2 DefaultRobustnessVariable = 2 // DefaultQueryInterval is the default query interval. // // As per RFC 3810 section 9.2 (for MLDv2), // // The Query Interval variable denotes the interval between General // Queries sent by the Querier. Default value: 125 seconds. // // As per RFC 3376 section 8.2 (for IGMPv3), // // The Query Interval is the interval between General Queries sent by // the Querier. Default: 125 seconds. DefaultQueryInterval = 125 * time.Second ) // multicastGroupState holds the Generic Multicast Protocol state for a // multicast group. // // +stateify savable type multicastGroupState struct { // joins is the number of times the group has been joined. joins uint64 // transmissionLeft is the number of transmissions left to send. transmissionLeft uint8 // lastToSendReport is true if we sent the last report for the group. It is // used to track whether there are other hosts on the subnet that are also // members of the group. // // Defined in RFC 2236 section 6 page 9 for IGMPv2 and RFC 2710 section 5 page // 8 for MLDv1. 
lastToSendReport bool // delayedReportJob is used to delay sending responses to membership report // messages in order to reduce duplicate reports from multiple hosts on the // interface. // // Must not be nil. delayedReportJob *tcpip.Job // delyedReportJobFiresAt is the time when the delayed report job will fire. // // A zero value indicates that the job is not scheduled. // TODO(b/341946753): Restore when netstack is savable. delayedReportJobFiresAt time.Time `state:"nosave"` // queriedIncludeSources holds sources that were queried for. // // Indicates that there is a pending source-specific query response for the // multicast address. queriedIncludeSources map[tcpip.Address]struct{} deleteScheduled bool } func (m *multicastGroupState) cancelDelayedReportJob() { m.delayedReportJob.Cancel() m.delayedReportJobFiresAt = time.Time{} m.transmissionLeft = 0 } func (m *multicastGroupState) clearQueriedIncludeSources() { for source := range m.queriedIncludeSources { delete(m.queriedIncludeSources, source) } } // GenericMulticastProtocolOptions holds options for the generic multicast // protocol. // // +stateify savable type GenericMulticastProtocolOptions struct { // Rand is the source of random numbers. // TODO(b/341946753): Restore when netstack is savable. Rand *rand.Rand `state:"nosave"` // Clock is the clock used to create timers. Clock tcpip.Clock // Protocol is the implementation of the variant of multicast group protocol // in use. Protocol MulticastGroupProtocol // MaxUnsolicitedReportDelay is the maximum amount of time to wait between // transmitting unsolicited reports. // // Unsolicited reports are transmitted when a group is newly joined. MaxUnsolicitedReportDelay time.Duration } // MulticastGroupProtocolV2ReportRecordType is the type of a // MulticastGroupProtocolv2 multicast address record. type MulticastGroupProtocolV2ReportRecordType int // MulticastGroupProtocolv2 multicast address record types. const ( _ MulticastGroupProtocolV2ReportRecordType = iota MulticastGroupProtocolV2ReportRecordModeIsInclude MulticastGroupProtocolV2ReportRecordModeIsExclude MulticastGroupProtocolV2ReportRecordChangeToIncludeMode MulticastGroupProtocolV2ReportRecordChangeToExcludeMode MulticastGroupProtocolV2ReportRecordAllowNewSources MulticastGroupProtocolV2ReportRecordBlockOldSources ) // MulticastGroupProtocolV2ReportBuilder is a builder for a V2 report. type MulticastGroupProtocolV2ReportBuilder interface { // AddRecord adds a record to the report. AddRecord(recordType MulticastGroupProtocolV2ReportRecordType, groupAddress tcpip.Address) // Send sends the report. // // Does nothing if no records were added. // // It is invalid to use this builder after this method is called. Send() (sent bool, err tcpip.Error) } // MulticastGroupProtocol is a multicast group protocol whose core state machine // can be represented by GenericMulticastProtocolState. type MulticastGroupProtocol interface { // Enabled indicates whether the generic multicast protocol will be // performed. // // When enabled, the protocol may transmit report and leave messages when // joining and leaving multicast groups respectively, and handle incoming // packets. // // When disabled, the protocol will still keep track of locally joined groups, // it just won't transmit and handle packets, or update groups' state. Enabled() bool // SendReport sends a multicast report for the specified group address. // // Returns false if the caller should queue the report to be sent later. 
Note, // returning false does not mean that the receiver hit an error. SendReport(groupAddress tcpip.Address) (sent bool, err tcpip.Error) // SendLeave sends a multicast leave for the specified group address. SendLeave(groupAddress tcpip.Address) tcpip.Error // ShouldPerformProtocol returns true iff the protocol should be performed for // the specified group. ShouldPerformProtocol(tcpip.Address) bool // NewReportV2Builder creates a new V2 builder. NewReportV2Builder() MulticastGroupProtocolV2ReportBuilder // V2QueryMaxRespCodeToV2Delay takes a V2 query's maximum response code and // returns the V2 delay. V2QueryMaxRespCodeToV2Delay(code uint16) time.Duration // V2QueryMaxRespCodeToV1Delay takes a V2 query's maximum response code and // returns the V1 delay. V2QueryMaxRespCodeToV1Delay(code uint16) time.Duration } type protocolMode int const ( protocolModeV2 protocolMode = iota protocolModeV1 protocolModeV1Compatibility ) // GenericMulticastProtocolState is the per interface generic multicast protocol // state. // // There is actually no protocol named "Generic Multicast Protocol". Instead, // the term used to refer to a generic multicast protocol that applies to both // IPv4 and IPv6. Specifically, Generic Multicast Protocol is the core state // machine of IGMPv2 as defined by RFC 2236 and MLDv1 as defined by RFC 2710. // // Callers must synchronize accesses to the generic multicast protocol state; // GenericMulticastProtocolState obtains no locks in any of its methods. The // only exception to this is GenericMulticastProtocolState's timer/job callbacks // which will obtain the lock provided to the GenericMulticastProtocolState when // it is initialized. // // GenericMulticastProtocolState.Init MUST be called before calling any of // the methods on GenericMulticastProtocolState. // // GenericMulticastProtocolState.MakeAllNonMemberLocked MUST be called when the // multicast group protocol is disabled so that leave messages may be sent. // // +stateify savable type GenericMulticastProtocolState struct { // Do not allow overwriting this state. _ sync.NoCopy `state:"nosave"` opts GenericMulticastProtocolOptions // memberships holds group addresses and their associated state. memberships map[tcpip.Address]multicastGroupState // protocolMU is the mutex used to protect the protocol. protocolMU *sync.RWMutex `state:"nosave"` // V2 state. robustnessVariable uint8 queryInterval time.Duration mode protocolMode modeTimer tcpip.Timer generalQueryV2Timer tcpip.Timer // TODO(b/341946753): Restore when netstack is savable. generalQueryV2TimerFiresAt time.Time `state:"nosave"` stateChangedReportV2Timer tcpip.Timer stateChangedReportV2TimerSet bool } // GetV1ModeLocked returns the V1 configuration. // // Precondition: g.protocolMU must be read locked. func (g *GenericMulticastProtocolState) GetV1ModeLocked() bool { switch g.mode { case protocolModeV2, protocolModeV1Compatibility: return false case protocolModeV1: return true default: panic(fmt.Sprintf("unrecognized mode = %d", g.mode)) } } func (g *GenericMulticastProtocolState) stopModeTimer() { if g.modeTimer != nil { g.modeTimer.Stop() } } // SetV1ModeLocked sets the V1 configuration. // // Returns the previous configuration. // // Precondition: g.protocolMU must be locked. 
func (g *GenericMulticastProtocolState) SetV1ModeLocked(v bool) bool { if g.GetV1ModeLocked() == v { return v } if v { g.stopModeTimer() g.cancelV2ReportTimers() g.mode = protocolModeV1 return false } g.mode = protocolModeV2 return true } func (g *GenericMulticastProtocolState) cancelV2ReportTimers() { if g.generalQueryV2Timer != nil { g.generalQueryV2Timer.Stop() g.generalQueryV2TimerFiresAt = time.Time{} } if g.stateChangedReportV2Timer != nil { g.stateChangedReportV2Timer.Stop() g.stateChangedReportV2TimerSet = false } } // Init initializes the Generic Multicast Protocol state. // // Must only be called once for the lifetime of g; Init will panic if it is // called twice. // // The GenericMulticastProtocolState will only grab the lock when timers/jobs // fire. // // Note: the methods on opts.Protocol will always be called while protocolMU is // held. func (g *GenericMulticastProtocolState) Init(protocolMU *sync.RWMutex, opts GenericMulticastProtocolOptions) { if g.memberships != nil { panic("attempted to initialize generic membership protocol state twice") } *g = GenericMulticastProtocolState{ opts: opts, memberships: make(map[tcpip.Address]multicastGroupState), protocolMU: protocolMU, robustnessVariable: DefaultRobustnessVariable, queryInterval: DefaultQueryInterval, mode: protocolModeV2, } } // MakeAllNonMemberLocked transitions all groups to the non-member state. // // The groups will still be considered joined locally. // // MUST be called when the multicast group protocol is disabled. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) MakeAllNonMemberLocked() { if !g.opts.Protocol.Enabled() { return } g.stopModeTimer() g.cancelV2ReportTimers() var v2ReportBuilder MulticastGroupProtocolV2ReportBuilder var handler func(tcpip.Address, *multicastGroupState) switch g.mode { case protocolModeV2: v2ReportBuilder = g.opts.Protocol.NewReportV2Builder() handler = func(groupAddress tcpip.Address, info *multicastGroupState) { info.cancelDelayedReportJob() // Send a report immediately to announce us leaving the group. v2ReportBuilder.AddRecord( MulticastGroupProtocolV2ReportRecordChangeToIncludeMode, groupAddress, ) } case protocolModeV1Compatibility: g.mode = protocolModeV2 fallthrough case protocolModeV1: handler = g.transitionToNonMemberLocked default: panic(fmt.Sprintf("unrecognized mode = %d", g.mode)) } for groupAddress, info := range g.memberships { if !g.shouldPerformForGroup(groupAddress) { continue } handler(groupAddress, &info) if info.deleteScheduled { delete(g.memberships, groupAddress) } else { info.transmissionLeft = 0 g.memberships[groupAddress] = info } } if v2ReportBuilder != nil { // Nothing meaningful we can do with the error here - this method may be // called when an interface is being disabled when we expect sends to // fail. _, _ = v2ReportBuilder.Send() } } // InitializeGroupsLocked initializes each group, as if they were newly joined // but without affecting the groups' join count. // // Must only be called after calling MakeAllNonMember as a group should not be // initialized while it is not in the non-member state. // // Precondition: g.protocolMU must be locked. 
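// Illustrative sketch (not part of this package): a hypothetical endpoint
// embeds the state next to the mutex it hands to Init, and Init is called
// exactly once (it panics on a second call). The RNG, clock, protocol
// implementation and report delay are assumptions supplied by the caller.
type exampleEndpoint struct {
	mu  sync.RWMutex
	mgp GenericMulticastProtocolState
}

func (ep *exampleEndpoint) init(clock tcpip.Clock, rng *rand.Rand, proto MulticastGroupProtocol) {
	ep.mgp.Init(&ep.mu, GenericMulticastProtocolOptions{
		Rand:                      rng,
		Clock:                     clock,
		Protocol:                  proto,
		MaxUnsolicitedReportDelay: time.Second, // value chosen by the wrapping protocol (e.g. IGMP/MLD)
	})
}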
func (g *GenericMulticastProtocolState) InitializeGroupsLocked() { if !g.opts.Protocol.Enabled() { return } var v2ReportBuilder MulticastGroupProtocolV2ReportBuilder switch g.mode { case protocolModeV2: v2ReportBuilder = g.opts.Protocol.NewReportV2Builder() case protocolModeV1Compatibility, protocolModeV1: default: panic(fmt.Sprintf("unrecognized mode = %d", g.mode)) } for groupAddress, info := range g.memberships { g.initializeNewMemberLocked(groupAddress, &info, v2ReportBuilder) g.memberships[groupAddress] = info } if v2ReportBuilder == nil { return } if sent, err := v2ReportBuilder.Send(); sent && err == nil { g.scheduleStateChangedTimer() } else { // Nothing meaningful we could do with the error here - the interface may // not yet have an address. This is okay because we would either schedule a // report to be sent later or we will be notified when an address is added, // at which point we will try to send messages again. for groupAddress, info := range g.memberships { if !g.shouldPerformForGroup(groupAddress) { continue } // Revert the transmissions count since we did not successfully send. info.transmissionLeft++ g.memberships[groupAddress] = info } } } // SendQueuedReportsLocked attempts to send reports for groups that failed to // send reports during their last attempt. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) SendQueuedReportsLocked() { if g.stateChangedReportV2TimerSet { return } for groupAddress, info := range g.memberships { if info.delayedReportJobFiresAt.IsZero() { switch g.mode { case protocolModeV2: g.sendV2ReportAndMaybeScheduleChangedTimer(groupAddress, &info, MulticastGroupProtocolV2ReportRecordChangeToExcludeMode) case protocolModeV1Compatibility, protocolModeV1: g.maybeSendReportLocked(groupAddress, &info) default: panic(fmt.Sprintf("unrecognized mode = %d", g.mode)) } g.memberships[groupAddress] = info } } } // JoinGroupLocked handles joining a new group. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) JoinGroupLocked(groupAddress tcpip.Address) { info, ok := g.memberships[groupAddress] if ok { info.joins++ if info.joins > 1 { // The group has already been joined. g.memberships[groupAddress] = info return } } else { info = multicastGroupState{ // Since we just joined the group, its count is 1. joins: 1, lastToSendReport: false, delayedReportJob: tcpip.NewJob(g.opts.Clock, g.protocolMU, func() { if !g.opts.Protocol.Enabled() { panic(fmt.Sprintf("delayed report job fired for group %s while the multicast group protocol is disabled", groupAddress)) } info, ok := g.memberships[groupAddress] if !ok { panic(fmt.Sprintf("expected to find group state for group = %s", groupAddress)) } info.delayedReportJobFiresAt = time.Time{} switch g.mode { case protocolModeV2: reportBuilder := g.opts.Protocol.NewReportV2Builder() reportBuilder.AddRecord(MulticastGroupProtocolV2ReportRecordModeIsExclude, groupAddress) // Nothing meaningful we can do with the error here - we only try to // send a delayed report once. 
_, _ = reportBuilder.Send() case protocolModeV1Compatibility, protocolModeV1: g.maybeSendReportLocked(groupAddress, &info) default: panic(fmt.Sprintf("unrecognized mode = %d", g.mode)) } info.clearQueriedIncludeSources() g.memberships[groupAddress] = info }), queriedIncludeSources: make(map[tcpip.Address]struct{}), } } info.deleteScheduled = false info.clearQueriedIncludeSources() info.delayedReportJobFiresAt = time.Time{} info.lastToSendReport = false g.initializeNewMemberLocked(groupAddress, &info, nil /* callersV2ReportBuilder */) g.memberships[groupAddress] = info } // IsLocallyJoinedRLocked returns true if the group is locally joined. // // Precondition: g.protocolMU must be read locked. func (g *GenericMulticastProtocolState) IsLocallyJoinedRLocked(groupAddress tcpip.Address) bool { info, ok := g.memberships[groupAddress] return ok && !info.deleteScheduled } func (g *GenericMulticastProtocolState) sendV2ReportAndMaybeScheduleChangedTimer( groupAddress tcpip.Address, info *multicastGroupState, recordType MulticastGroupProtocolV2ReportRecordType, ) bool { if info.transmissionLeft == 0 { return false } successfullySentAndHasMore := false // Send a report immediately to announce us leaving the group. reportBuilder := g.opts.Protocol.NewReportV2Builder() reportBuilder.AddRecord(recordType, groupAddress) if sent, err := reportBuilder.Send(); sent && err == nil { info.transmissionLeft-- successfullySentAndHasMore = info.transmissionLeft != 0 // Use the interface-wide state changed report for further transmissions. if successfullySentAndHasMore { g.scheduleStateChangedTimer() } } return successfullySentAndHasMore } func (g *GenericMulticastProtocolState) scheduleStateChangedTimer() { if g.stateChangedReportV2TimerSet { return } delay := g.calculateDelayTimerDuration(g.opts.MaxUnsolicitedReportDelay) if g.stateChangedReportV2Timer == nil { // TODO(https://issuetracker.google.com/264799098): Create timer on // initialization instead of lazily creating the timer since the timer // does not change after being created. g.stateChangedReportV2Timer = g.opts.Clock.AfterFunc(delay, func() { g.protocolMU.Lock() defer g.protocolMU.Unlock() reportBuilder := g.opts.Protocol.NewReportV2Builder() nonEmptyReport := false for groupAddress, info := range g.memberships { if info.transmissionLeft == 0 || !g.shouldPerformForGroup(groupAddress) { continue } info.transmissionLeft-- nonEmptyReport = true mode := MulticastGroupProtocolV2ReportRecordChangeToExcludeMode if info.deleteScheduled { mode = MulticastGroupProtocolV2ReportRecordChangeToIncludeMode } reportBuilder.AddRecord(mode, groupAddress) if info.deleteScheduled && info.transmissionLeft == 0 { // No more transmissions left so we can actually delete the // membership. delete(g.memberships, groupAddress) } else { g.memberships[groupAddress] = info } } // Nothing meaningful we can do with the error here. We will retry // sending a state changed report again anyways. _, _ = reportBuilder.Send() if nonEmptyReport { g.stateChangedReportV2Timer.Reset(g.calculateDelayTimerDuration(g.opts.MaxUnsolicitedReportDelay)) } else { g.stateChangedReportV2TimerSet = false } }) } else { g.stateChangedReportV2Timer.Reset(delay) } g.stateChangedReportV2TimerSet = true } // LeaveGroupLocked handles leaving the group. // // Returns false if the group is not currently joined. // // Precondition: g.protocolMU must be locked. 
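// Illustrative sketch (not part of this package): joins are reference
// counted per group, so leave/changed reports are only emitted once every
// join has been matched by a leave. g is assumed to be initialized and its
// mutex held by the caller, as the *Locked suffixes require.
func exampleJoinLeaveCounting(g *GenericMulticastProtocolState, addr tcpip.Address) {
	g.JoinGroupLocked(addr)
	g.JoinGroupLocked(addr)            // second join only bumps the join count
	_ = g.LeaveGroupLocked(addr)       // true: one join remains, nothing is sent
	_ = g.LeaveGroupLocked(addr)       // true: last leave, reports go out if the protocol is performed for addr
	_ = g.LeaveGroupLocked(addr)       // false: the group is not joined anymore
	_ = g.IsLocallyJoinedRLocked(addr) // false once the last leave is processed
}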
func (g *GenericMulticastProtocolState) LeaveGroupLocked(groupAddress tcpip.Address) bool { info, ok := g.memberships[groupAddress] if !ok || info.joins == 0 { return false } info.joins-- if info.joins != 0 { // If we still have outstanding joins, then do nothing further. g.memberships[groupAddress] = info return true } info.deleteScheduled = true info.cancelDelayedReportJob() if !g.shouldPerformForGroup(groupAddress) { delete(g.memberships, groupAddress) return true } switch g.mode { case protocolModeV2: info.transmissionLeft = g.robustnessVariable if g.sendV2ReportAndMaybeScheduleChangedTimer(groupAddress, &info, MulticastGroupProtocolV2ReportRecordChangeToIncludeMode) { g.memberships[groupAddress] = info } else { delete(g.memberships, groupAddress) } case protocolModeV1Compatibility, protocolModeV1: g.transitionToNonMemberLocked(groupAddress, &info) delete(g.memberships, groupAddress) default: panic(fmt.Sprintf("unrecognized mode = %d", g.mode)) } return true } // HandleQueryV2Locked handles a V2 query. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) HandleQueryV2Locked(groupAddress tcpip.Address, maxResponseCode uint16, sources header.AddressIterator, robustnessVariable uint8, queryInterval time.Duration) { if !g.opts.Protocol.Enabled() { return } switch g.mode { case protocolModeV1Compatibility, protocolModeV1: g.handleQueryInnerLocked(groupAddress, g.opts.Protocol.V2QueryMaxRespCodeToV1Delay(maxResponseCode)) return case protocolModeV2: default: panic(fmt.Sprintf("unrecognized mode = %d", g.mode)) } if robustnessVariable != 0 { g.robustnessVariable = robustnessVariable } if queryInterval != 0 { g.queryInterval = queryInterval } maxResponseTime := g.calculateDelayTimerDuration(g.opts.Protocol.V2QueryMaxRespCodeToV2Delay(maxResponseCode)) // As per RFC 3376 section 5.2, // // 1. If there is a pending response to a previous General Query // scheduled sooner than the selected delay, no additional response // needs to be scheduled. // // 2. If the received Query is a General Query, the interface timer is // used to schedule a response to the General Query after the // selected delay. Any previously pending response to a General // Query is canceled. // // 3. If the received Query is a Group-Specific Query or a Group-and- // Source-Specific Query and there is no pending response to a // previous Query for this group, then the group timer is used to // schedule a report. If the received Query is a Group-and-Source- // Specific Query, the list of queried sources is recorded to be used // when generating a response. // // 4. If there already is a pending response to a previous Query // scheduled for this group, and either the new Query is a Group- // Specific Query or the recorded source-list associated with the // group is empty, then the group source-list is cleared and a single // response is scheduled using the group timer. The new response is // scheduled to be sent at the earliest of the remaining time for the // pending report and the selected delay. // // 5. If the received Query is a Group-and-Source-Specific Query and // there is a pending response for this group with a non-empty // source-list, then the group source list is augmented to contain // the list of sources in the new Query and a single response is // scheduled using the group timer. The new response is scheduled to // be sent at the earliest of the remaining time for the pending // report and the selected delay. // // As per RFC 3810 section 6.2, // // 1. 
If there is a pending response to a previous General Query // scheduled sooner than the selected delay, no additional response // needs to be scheduled. // // 2. If the received Query is a General Query, the Interface Timer is // used to schedule a response to the General Query after the // selected delay. Any previously pending response to a General // Query is canceled. // // 3. If the received Query is a Multicast Address Specific Query or a // Multicast Address and Source Specific Query and there is no // pending response to a previous Query for this multicast address, // then the Multicast Address Timer is used to schedule a report. If // the received Query is a Multicast Address and Source Specific // Query, the list of queried sources is recorded to be used when // generating a response. // // 4. If there is already a pending response to a previous Query // scheduled for this multicast address, and either the new Query is // a Multicast Address Specific Query or the recorded source list // associated with the multicast address is empty, then the multicast // address source list is cleared and a single response is scheduled, // using the Multicast Address Timer. The new response is scheduled // to be sent at the earliest of the remaining time for the pending // report and the selected delay. // // 5. If the received Query is a Multicast Address and Source Specific // Query and there is a pending response for this multicast address // with a non-empty source list, then the multicast address source // list is augmented to contain the list of sources in the new Query, // and a single response is scheduled using the Multicast Address // Timer. The new response is scheduled to be sent at the earliest // of the remaining time for the pending report and the selected // delay. now := g.opts.Clock.Now() if !g.generalQueryV2TimerFiresAt.IsZero() && g.generalQueryV2TimerFiresAt.Sub(now) <= maxResponseTime { return } if groupAddress.Unspecified() { if g.generalQueryV2Timer == nil { // TODO(https://issuetracker.google.com/264799098): Create timer on // initialization instead of lazily creating the timer since the timer // does not change after being created. g.generalQueryV2Timer = g.opts.Clock.AfterFunc(maxResponseTime, func() { g.protocolMU.Lock() defer g.protocolMU.Unlock() g.generalQueryV2TimerFiresAt = time.Time{} // As per RFC 3810 section 6.3, // // If the expired timer is the Interface Timer (i.e., there is a // pending response to a General Query), then one Current State // Record is sent for each multicast address for which the specified // interface has listening state, as described in section 4.2. The // Current State Record carries the multicast address and its // associated filter mode (MODE_IS_INCLUDE or MODE_IS_EXCLUDE) and // Source list. Multiple Current State Records are packed into // individual Report messages, to the extent possible. // // As per RFC 3376 section 5.2, // // If the expired timer is the interface timer (i.e., it is a pending // response to a General Query), then one Current-State Record is // sent for each multicast address for which the specified interface // has reception state, as described in section 3.2. The Current- // State Record carries the multicast address and its associated // filter mode (MODE_IS_INCLUDE or MODE_IS_EXCLUDE) and source list. // Multiple Current-State Records are packed into individual Report // messages, to the extent possible. 
reportBuilder := g.opts.Protocol.NewReportV2Builder() for groupAddress, info := range g.memberships { if info.deleteScheduled || !g.shouldPerformForGroup(groupAddress) { continue } // A MODE_IS_EXCLUDE record without any sources indicates that we are // interested in traffic from all sources for the group. // // We currently only hold groups if we have an active interest in the // group. reportBuilder.AddRecord( MulticastGroupProtocolV2ReportRecordModeIsExclude, groupAddress, ) } _, _ = reportBuilder.Send() }) } else { g.generalQueryV2Timer.Reset(maxResponseTime) } g.generalQueryV2TimerFiresAt = now.Add(maxResponseTime) return } if info, ok := g.memberships[groupAddress]; ok && !info.deleteScheduled && g.shouldPerformForGroup(groupAddress) { if info.delayedReportJobFiresAt.IsZero() || (!sources.Done() && len(info.queriedIncludeSources) != 0) { for { source, ok := sources.Next() if !ok { break } info.queriedIncludeSources[source] = struct{}{} } } else { info.clearQueriedIncludeSources() } g.setDelayTimerForAddressLocked(groupAddress, &info, maxResponseTime) g.memberships[groupAddress] = info } } // HandleQueryLocked handles a query message with the specified maximum response // time. // // If the group address is unspecified, then reports will be scheduled for all // joined groups. // // Report(s) will be scheduled to be sent after a random duration between 0 and // the maximum response time. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) HandleQueryLocked(groupAddress tcpip.Address, maxResponseTime time.Duration) { if !g.opts.Protocol.Enabled() { return } switch g.mode { case protocolModeV2, protocolModeV1Compatibility: // As per 3376 section 8.12 (for IGMPv3), // // The Older Version Querier Interval is the time-out for transitioning // a host back to IGMPv3 mode once an older version query is heard. // When an older version query is received, hosts set their Older // Version Querier Present Timer to Older Version Querier Interval. // // This value MUST be ((the Robustness Variable) times (the Query // Interval in the last Query received)) plus (one Query Response // Interval). // // As per RFC 3810 section 9.12 (for MLDv2), // // The Older Version Querier Present Timeout is the time-out for // transitioning a host back to MLDv2 Host Compatibility Mode. When an // MLDv1 query is received, MLDv2 hosts set their Older Version Querier // Present Timer to [Older Version Querier Present Timeout]. // // This value MUST be ([Robustness Variable] times (the [Query Interval] // in the last Query received)) plus ([Query Response Interval]). modeRevertDelay := time.Duration(g.robustnessVariable) * g.queryInterval if g.modeTimer == nil { // TODO(https://issuetracker.google.com/264799098): Create timer on // initialization instead of lazily creating the timer since the timer // does not change after being created. 
g.modeTimer = g.opts.Clock.AfterFunc(modeRevertDelay, func() { g.protocolMU.Lock() defer g.protocolMU.Unlock() g.mode = protocolModeV2 }) } else { g.modeTimer.Reset(modeRevertDelay) } g.mode = protocolModeV1Compatibility g.cancelV2ReportTimers() case protocolModeV1: default: panic(fmt.Sprintf("unrecognized mode = %d", g.mode)) } g.handleQueryInnerLocked(groupAddress, maxResponseTime) } func (g *GenericMulticastProtocolState) handleQueryInnerLocked(groupAddress tcpip.Address, maxResponseTime time.Duration) { maxResponseTime = g.calculateDelayTimerDuration(maxResponseTime) // As per RFC 2236 section 2.4 (for IGMPv2), // // In a Membership Query message, the group address field is set to zero // when sending a General Query, and set to the group address being // queried when sending a Group-Specific Query. // // As per RFC 2710 section 3.6 (for MLDv1), // // In a Query message, the Multicast Address field is set to zero when // sending a General Query, and set to a specific IPv6 multicast address // when sending a Multicast-Address-Specific Query. if groupAddress.Unspecified() { // This is a general query as the group address is unspecified. for groupAddress, info := range g.memberships { g.setDelayTimerForAddressLocked(groupAddress, &info, maxResponseTime) g.memberships[groupAddress] = info } } else if info, ok := g.memberships[groupAddress]; ok && !info.deleteScheduled { g.setDelayTimerForAddressLocked(groupAddress, &info, maxResponseTime) g.memberships[groupAddress] = info } } // HandleReportLocked handles a report message. // // If the report is for a joined group, any active delayed report will be // cancelled and the host state for the group transitions to idle. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) HandleReportLocked(groupAddress tcpip.Address) { if !g.opts.Protocol.Enabled() { return } // As per RFC 2236 section 3 pages 3-4 (for IGMPv2), // // If the host receives another host's Report (version 1 or 2) while it has // a timer running, it stops its timer for the specified group and does not // send a Report // // As per RFC 2710 section 4 page 6 (for MLDv1), // // If a node receives another node's Report from an interface for a // multicast address while it has a timer running for that same address // on that interface, it stops its timer and does not send a Report for // that address, thus suppressing duplicate reports on the link. if info, ok := g.memberships[groupAddress]; ok { info.cancelDelayedReportJob() info.lastToSendReport = false g.memberships[groupAddress] = info } } // initializeNewMemberLocked initializes a new group membership. // // Precondition: g.protocolMU must be locked. 
func (g *GenericMulticastProtocolState) initializeNewMemberLocked(groupAddress tcpip.Address, info *multicastGroupState, callersV2ReportBuilder MulticastGroupProtocolV2ReportBuilder) { if !g.shouldPerformForGroup(groupAddress) { return } info.lastToSendReport = false switch g.mode { case protocolModeV2: info.transmissionLeft = g.robustnessVariable if callersV2ReportBuilder == nil { g.sendV2ReportAndMaybeScheduleChangedTimer(groupAddress, info, MulticastGroupProtocolV2ReportRecordChangeToExcludeMode) } else { callersV2ReportBuilder.AddRecord(MulticastGroupProtocolV2ReportRecordChangeToExcludeMode, groupAddress) info.transmissionLeft-- } case protocolModeV1Compatibility, protocolModeV1: info.transmissionLeft = unsolicitedTransmissionCount g.maybeSendReportLocked(groupAddress, info) default: panic(fmt.Sprintf("unrecognized mode = %d", g.mode)) } } func (g *GenericMulticastProtocolState) shouldPerformForGroup(groupAddress tcpip.Address) bool { return g.opts.Protocol.ShouldPerformProtocol(groupAddress) && g.opts.Protocol.Enabled() } // maybeSendReportLocked attempts to send a report for a group. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) maybeSendReportLocked(groupAddress tcpip.Address, info *multicastGroupState) { if info.transmissionLeft == 0 { return } // As per RFC 2236 section 3 page 5 (for IGMPv2), // // When a host joins a multicast group, it should immediately transmit an // unsolicited Version 2 Membership Report for that group" ... "it is // recommended that it be repeated". // // As per RFC 2710 section 4 page 6 (for MLDv1), // // When a node starts listening to a multicast address on an interface, // it should immediately transmit an unsolicited Report for that address // on that interface, in case it is the first listener on the link. To // cover the possibility of the initial Report being lost or damaged, it // is recommended that it be repeated once or twice after short delays // [Unsolicited Report Interval]. // // TODO(gvisor.dev/issue/4901): Support a configurable number of initial // unsolicited reports. sent, err := g.opts.Protocol.SendReport(groupAddress) if err == nil && sent { info.lastToSendReport = true info.transmissionLeft-- if info.transmissionLeft > 0 { g.setDelayTimerForAddressLocked( groupAddress, info, g.calculateDelayTimerDuration(g.opts.MaxUnsolicitedReportDelay), ) } } } // maybeSendLeave attempts to send a leave message. func (g *GenericMulticastProtocolState) maybeSendLeave(groupAddress tcpip.Address, lastToSendReport bool) { if !g.shouldPerformForGroup(groupAddress) || !lastToSendReport { return } // Okay to ignore the error here as if packet write failed, the multicast // routers will eventually drop our membership anyways. If the interface is // being disabled or removed, the generic multicast protocol's should be // cleared eventually. // // As per RFC 2236 section 3 page 5 (for IGMPv2), // // When a router receives a Report, it adds the group being reported to // the list of multicast group memberships on the network on which it // received the Report and sets the timer for the membership to the // [Group Membership Interval]. Repeated Reports refresh the timer. If // no Reports are received for a particular group before this timer has // expired, the router assumes that the group has no local members and // that it need not forward remotely-originated multicasts for that // group onto the attached network. 
	//
	// As per RFC 2710 section 4 page 5 (for MLDv1),
	//
	// When a router receives a Report from a link, if the reported address
	// is not already present in the router's list of multicast address
	// having listeners on that link, the reported address is added to the
	// list, its timer is set to [Multicast Listener Interval], and its
	// appearance is made known to the router's multicast routing component.
	// If a Report is received for a multicast address that is already
	// present in the router's list, the timer for that address is reset to
	// [Multicast Listener Interval]. If an address's timer expires, it is
	// assumed that there are no longer any listeners for that address
	// present on the link, so it is deleted from the list and its
	// disappearance is made known to the multicast routing component.
	//
	// The requirement to send a leave message is also optional (it MAY be
	// skipped):
	//
	// As per RFC 2236 section 6 page 8 (for IGMPv2),
	//
	// "send leave" for the group on the interface. If the interface
	// state says the Querier is running IGMPv1, this action SHOULD be
	// skipped. If the flag saying we were the last host to report is
	// cleared, this action MAY be skipped. The Leave Message is sent to
	// the ALL-ROUTERS group (224.0.0.2).
	//
	// As per RFC 2710 section 5 page 8 (for MLDv1),
	//
	// "send done" for the address on the interface. If the flag saying
	// we were the last node to report is cleared, this action MAY be
	// skipped. The Done message is sent to the link-scope all-routers
	// address (FF02::2).
	_ = g.opts.Protocol.SendLeave(groupAddress)
}

// transitionToNonMemberLocked transitions the given multicast group to the
// non-member/listener state.
//
// Precondition: g.protocolMU must be locked.
func (g *GenericMulticastProtocolState) transitionToNonMemberLocked(groupAddress tcpip.Address, info *multicastGroupState) {
	info.cancelDelayedReportJob()
	g.maybeSendLeave(groupAddress, info.lastToSendReport)
	info.lastToSendReport = false
}

// setDelayTimerForAddressLocked sets a timer to send a delayed report.
//
// Precondition: g.protocolMU MUST be locked.
func (g *GenericMulticastProtocolState) setDelayTimerForAddressLocked(groupAddress tcpip.Address, info *multicastGroupState, maxResponseTime time.Duration) {
	if !g.shouldPerformForGroup(groupAddress) {
		return
	}

	if info.transmissionLeft < minQueryResponseTransmissionCount {
		info.transmissionLeft = minQueryResponseTransmissionCount
	}

	// As per RFC 2236 section 3 page 3 (for IGMPv2),
	//
	// If a timer for the group is already running, it is reset to the random
	// value only if the requested Max Response Time is less than the remaining
	// value of the running timer.
	//
	// As per RFC 2710 section 4 page 5 (for MLDv1),
	//
	// If a timer for any address is already running, it is reset to the new
	// random value only if the requested Maximum Response Delay is less than
	// the remaining value of the running timer.
	now := g.opts.Clock.Now()
	if !info.delayedReportJobFiresAt.IsZero() && info.delayedReportJobFiresAt.Sub(now) <= maxResponseTime {
		// The timer is scheduled to fire before the maximum response time so we
		// leave our timer as is.
		return
	}

	info.delayedReportJob.Cancel()
	info.delayedReportJob.Schedule(maxResponseTime)
	info.delayedReportJobFiresAt = now.Add(maxResponseTime)
}

// calculateDelayTimerDuration returns a random time between (0, maxRespTime].
func (g *GenericMulticastProtocolState) calculateDelayTimerDuration(maxRespTime time.Duration) time.Duration { // As per RFC 2236 section 3 page 3 (for IGMPv2), // // When a host receives a Group-Specific Query, it sets a delay timer to a // random value selected from the range (0, Max Response Time]... // // As per RFC 2710 section 4 page 6 (for MLDv1), // // When a node receives a Multicast-Address-Specific Query, if it is // listening to the queried Multicast Address on the interface from // which the Query was received, it sets a delay timer for that address // to a random value selected from the range [0, Maximum Response Delay], // as above. if maxRespTime == 0 { return 0 } return time.Duration(g.opts.Rand.Int63n(int64(maxRespTime))) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/ip/ip_state_autogen.go000066400000000000000000000323441465435605700303470ustar00rootroot00000000000000// automatically generated by stateify. package ip import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (d *dadState) StateTypeName() string { return "pkg/tcpip/network/internal/ip.dadState" } func (d *dadState) StateFields() []string { return []string{ "nonce", "extendRequest", "done", "timer", "completionHandlers", } } func (d *dadState) beforeSave() {} // +checklocksignore func (d *dadState) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.nonce) stateSinkObject.Save(1, &d.extendRequest) stateSinkObject.Save(2, &d.done) stateSinkObject.Save(3, &d.timer) stateSinkObject.Save(4, &d.completionHandlers) } func (d *dadState) afterLoad(context.Context) {} // +checklocksignore func (d *dadState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.nonce) stateSourceObject.Load(1, &d.extendRequest) stateSourceObject.Load(2, &d.done) stateSourceObject.Load(3, &d.timer) stateSourceObject.Load(4, &d.completionHandlers) } func (d *DADOptions) StateTypeName() string { return "pkg/tcpip/network/internal/ip.DADOptions" } func (d *DADOptions) StateFields() []string { return []string{ "Clock", "NonceSize", "ExtendDADTransmits", "Protocol", "NICID", } } func (d *DADOptions) beforeSave() {} // +checklocksignore func (d *DADOptions) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.Clock) stateSinkObject.Save(1, &d.NonceSize) stateSinkObject.Save(2, &d.ExtendDADTransmits) stateSinkObject.Save(3, &d.Protocol) stateSinkObject.Save(4, &d.NICID) } func (d *DADOptions) afterLoad(context.Context) {} // +checklocksignore func (d *DADOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.Clock) stateSourceObject.Load(1, &d.NonceSize) stateSourceObject.Load(2, &d.ExtendDADTransmits) stateSourceObject.Load(3, &d.Protocol) stateSourceObject.Load(4, &d.NICID) } func (d *DAD) StateTypeName() string { return "pkg/tcpip/network/internal/ip.DAD" } func (d *DAD) StateFields() []string { return []string{ "opts", "configs", "addresses", } } func (d *DAD) beforeSave() {} // +checklocksignore func (d *DAD) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.opts) stateSinkObject.Save(1, &d.configs) stateSinkObject.Save(2, &d.addresses) } func (d *DAD) afterLoad(context.Context) {} // +checklocksignore func (d *DAD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.opts) stateSourceObject.Load(1, &d.configs) stateSourceObject.Load(2, &d.addresses) } func (e *ErrMessageTooLong) StateTypeName() string { 
return "pkg/tcpip/network/internal/ip.ErrMessageTooLong" } func (e *ErrMessageTooLong) StateFields() []string { return []string{} } func (e *ErrMessageTooLong) beforeSave() {} // +checklocksignore func (e *ErrMessageTooLong) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrMessageTooLong) afterLoad(context.Context) {} // +checklocksignore func (e *ErrMessageTooLong) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrNoMulticastPendingQueueBufferSpace) StateTypeName() string { return "pkg/tcpip/network/internal/ip.ErrNoMulticastPendingQueueBufferSpace" } func (e *ErrNoMulticastPendingQueueBufferSpace) StateFields() []string { return []string{} } func (e *ErrNoMulticastPendingQueueBufferSpace) beforeSave() {} // +checklocksignore func (e *ErrNoMulticastPendingQueueBufferSpace) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrNoMulticastPendingQueueBufferSpace) afterLoad(context.Context) {} // +checklocksignore func (e *ErrNoMulticastPendingQueueBufferSpace) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (m *multicastGroupState) StateTypeName() string { return "pkg/tcpip/network/internal/ip.multicastGroupState" } func (m *multicastGroupState) StateFields() []string { return []string{ "joins", "transmissionLeft", "lastToSendReport", "delayedReportJob", "queriedIncludeSources", "deleteScheduled", } } func (m *multicastGroupState) beforeSave() {} // +checklocksignore func (m *multicastGroupState) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.joins) stateSinkObject.Save(1, &m.transmissionLeft) stateSinkObject.Save(2, &m.lastToSendReport) stateSinkObject.Save(3, &m.delayedReportJob) stateSinkObject.Save(4, &m.queriedIncludeSources) stateSinkObject.Save(5, &m.deleteScheduled) } func (m *multicastGroupState) afterLoad(context.Context) {} // +checklocksignore func (m *multicastGroupState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.joins) stateSourceObject.Load(1, &m.transmissionLeft) stateSourceObject.Load(2, &m.lastToSendReport) stateSourceObject.Load(3, &m.delayedReportJob) stateSourceObject.Load(4, &m.queriedIncludeSources) stateSourceObject.Load(5, &m.deleteScheduled) } func (g *GenericMulticastProtocolOptions) StateTypeName() string { return "pkg/tcpip/network/internal/ip.GenericMulticastProtocolOptions" } func (g *GenericMulticastProtocolOptions) StateFields() []string { return []string{ "Clock", "Protocol", "MaxUnsolicitedReportDelay", } } func (g *GenericMulticastProtocolOptions) beforeSave() {} // +checklocksignore func (g *GenericMulticastProtocolOptions) StateSave(stateSinkObject state.Sink) { g.beforeSave() stateSinkObject.Save(0, &g.Clock) stateSinkObject.Save(1, &g.Protocol) stateSinkObject.Save(2, &g.MaxUnsolicitedReportDelay) } func (g *GenericMulticastProtocolOptions) afterLoad(context.Context) {} // +checklocksignore func (g *GenericMulticastProtocolOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &g.Clock) stateSourceObject.Load(1, &g.Protocol) stateSourceObject.Load(2, &g.MaxUnsolicitedReportDelay) } func (g *GenericMulticastProtocolState) StateTypeName() string { return "pkg/tcpip/network/internal/ip.GenericMulticastProtocolState" } func (g *GenericMulticastProtocolState) StateFields() []string { return []string{ "opts", "memberships", "robustnessVariable", "queryInterval", "mode", "modeTimer", "generalQueryV2Timer", "stateChangedReportV2Timer", 
"stateChangedReportV2TimerSet", } } func (g *GenericMulticastProtocolState) beforeSave() {} // +checklocksignore func (g *GenericMulticastProtocolState) StateSave(stateSinkObject state.Sink) { g.beforeSave() stateSinkObject.Save(0, &g.opts) stateSinkObject.Save(1, &g.memberships) stateSinkObject.Save(2, &g.robustnessVariable) stateSinkObject.Save(3, &g.queryInterval) stateSinkObject.Save(4, &g.mode) stateSinkObject.Save(5, &g.modeTimer) stateSinkObject.Save(6, &g.generalQueryV2Timer) stateSinkObject.Save(7, &g.stateChangedReportV2Timer) stateSinkObject.Save(8, &g.stateChangedReportV2TimerSet) } func (g *GenericMulticastProtocolState) afterLoad(context.Context) {} // +checklocksignore func (g *GenericMulticastProtocolState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &g.opts) stateSourceObject.Load(1, &g.memberships) stateSourceObject.Load(2, &g.robustnessVariable) stateSourceObject.Load(3, &g.queryInterval) stateSourceObject.Load(4, &g.mode) stateSourceObject.Load(5, &g.modeTimer) stateSourceObject.Load(6, &g.generalQueryV2Timer) stateSourceObject.Load(7, &g.stateChangedReportV2Timer) stateSourceObject.Load(8, &g.stateChangedReportV2TimerSet) } func (m *MultiCounterIPForwardingStats) StateTypeName() string { return "pkg/tcpip/network/internal/ip.MultiCounterIPForwardingStats" } func (m *MultiCounterIPForwardingStats) StateFields() []string { return []string{ "Unrouteable", "ExhaustedTTL", "InitializingSource", "LinkLocalSource", "LinkLocalDestination", "PacketTooBig", "HostUnreachable", "ExtensionHeaderProblem", "UnexpectedMulticastInputInterface", "UnknownOutputEndpoint", "NoMulticastPendingQueueBufferSpace", "OutgoingDeviceNoBufferSpace", "Errors", } } func (m *MultiCounterIPForwardingStats) beforeSave() {} // +checklocksignore func (m *MultiCounterIPForwardingStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.Unrouteable) stateSinkObject.Save(1, &m.ExhaustedTTL) stateSinkObject.Save(2, &m.InitializingSource) stateSinkObject.Save(3, &m.LinkLocalSource) stateSinkObject.Save(4, &m.LinkLocalDestination) stateSinkObject.Save(5, &m.PacketTooBig) stateSinkObject.Save(6, &m.HostUnreachable) stateSinkObject.Save(7, &m.ExtensionHeaderProblem) stateSinkObject.Save(8, &m.UnexpectedMulticastInputInterface) stateSinkObject.Save(9, &m.UnknownOutputEndpoint) stateSinkObject.Save(10, &m.NoMulticastPendingQueueBufferSpace) stateSinkObject.Save(11, &m.OutgoingDeviceNoBufferSpace) stateSinkObject.Save(12, &m.Errors) } func (m *MultiCounterIPForwardingStats) afterLoad(context.Context) {} // +checklocksignore func (m *MultiCounterIPForwardingStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.Unrouteable) stateSourceObject.Load(1, &m.ExhaustedTTL) stateSourceObject.Load(2, &m.InitializingSource) stateSourceObject.Load(3, &m.LinkLocalSource) stateSourceObject.Load(4, &m.LinkLocalDestination) stateSourceObject.Load(5, &m.PacketTooBig) stateSourceObject.Load(6, &m.HostUnreachable) stateSourceObject.Load(7, &m.ExtensionHeaderProblem) stateSourceObject.Load(8, &m.UnexpectedMulticastInputInterface) stateSourceObject.Load(9, &m.UnknownOutputEndpoint) stateSourceObject.Load(10, &m.NoMulticastPendingQueueBufferSpace) stateSourceObject.Load(11, &m.OutgoingDeviceNoBufferSpace) stateSourceObject.Load(12, &m.Errors) } func (m *MultiCounterIPStats) StateTypeName() string { return "pkg/tcpip/network/internal/ip.MultiCounterIPStats" } func (m *MultiCounterIPStats) StateFields() []string { 
return []string{ "PacketsReceived", "ValidPacketsReceived", "DisabledPacketsReceived", "InvalidDestinationAddressesReceived", "InvalidSourceAddressesReceived", "PacketsDelivered", "PacketsSent", "OutgoingPacketErrors", "MalformedPacketsReceived", "MalformedFragmentsReceived", "IPTablesPreroutingDropped", "IPTablesInputDropped", "IPTablesForwardDropped", "IPTablesOutputDropped", "IPTablesPostroutingDropped", "OptionTimestampReceived", "OptionRecordRouteReceived", "OptionRouterAlertReceived", "OptionUnknownReceived", "Forwarding", } } func (m *MultiCounterIPStats) beforeSave() {} // +checklocksignore func (m *MultiCounterIPStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.PacketsReceived) stateSinkObject.Save(1, &m.ValidPacketsReceived) stateSinkObject.Save(2, &m.DisabledPacketsReceived) stateSinkObject.Save(3, &m.InvalidDestinationAddressesReceived) stateSinkObject.Save(4, &m.InvalidSourceAddressesReceived) stateSinkObject.Save(5, &m.PacketsDelivered) stateSinkObject.Save(6, &m.PacketsSent) stateSinkObject.Save(7, &m.OutgoingPacketErrors) stateSinkObject.Save(8, &m.MalformedPacketsReceived) stateSinkObject.Save(9, &m.MalformedFragmentsReceived) stateSinkObject.Save(10, &m.IPTablesPreroutingDropped) stateSinkObject.Save(11, &m.IPTablesInputDropped) stateSinkObject.Save(12, &m.IPTablesForwardDropped) stateSinkObject.Save(13, &m.IPTablesOutputDropped) stateSinkObject.Save(14, &m.IPTablesPostroutingDropped) stateSinkObject.Save(15, &m.OptionTimestampReceived) stateSinkObject.Save(16, &m.OptionRecordRouteReceived) stateSinkObject.Save(17, &m.OptionRouterAlertReceived) stateSinkObject.Save(18, &m.OptionUnknownReceived) stateSinkObject.Save(19, &m.Forwarding) } func (m *MultiCounterIPStats) afterLoad(context.Context) {} // +checklocksignore func (m *MultiCounterIPStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.PacketsReceived) stateSourceObject.Load(1, &m.ValidPacketsReceived) stateSourceObject.Load(2, &m.DisabledPacketsReceived) stateSourceObject.Load(3, &m.InvalidDestinationAddressesReceived) stateSourceObject.Load(4, &m.InvalidSourceAddressesReceived) stateSourceObject.Load(5, &m.PacketsDelivered) stateSourceObject.Load(6, &m.PacketsSent) stateSourceObject.Load(7, &m.OutgoingPacketErrors) stateSourceObject.Load(8, &m.MalformedPacketsReceived) stateSourceObject.Load(9, &m.MalformedFragmentsReceived) stateSourceObject.Load(10, &m.IPTablesPreroutingDropped) stateSourceObject.Load(11, &m.IPTablesInputDropped) stateSourceObject.Load(12, &m.IPTablesForwardDropped) stateSourceObject.Load(13, &m.IPTablesOutputDropped) stateSourceObject.Load(14, &m.IPTablesPostroutingDropped) stateSourceObject.Load(15, &m.OptionTimestampReceived) stateSourceObject.Load(16, &m.OptionRecordRouteReceived) stateSourceObject.Load(17, &m.OptionRouterAlertReceived) stateSourceObject.Load(18, &m.OptionUnknownReceived) stateSourceObject.Load(19, &m.Forwarding) } func init() { state.Register((*dadState)(nil)) state.Register((*DADOptions)(nil)) state.Register((*DAD)(nil)) state.Register((*ErrMessageTooLong)(nil)) state.Register((*ErrNoMulticastPendingQueueBufferSpace)(nil)) state.Register((*multicastGroupState)(nil)) state.Register((*GenericMulticastProtocolOptions)(nil)) state.Register((*GenericMulticastProtocolState)(nil)) state.Register((*MultiCounterIPForwardingStats)(nil)) state.Register((*MultiCounterIPStats)(nil)) } 
golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/ip/stats.go000066400000000000000000000224001465435605700261430ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ip import "gvisor.dev/gvisor/pkg/tcpip" // LINT.IfChange(MultiCounterIPForwardingStats) // MultiCounterIPForwardingStats holds IP forwarding statistics. Each counter // may have several versions. // // +stateify savable type MultiCounterIPForwardingStats struct { // Unrouteable is the number of IP packets received which were dropped // because the netstack could not construct a route to their // destination. Unrouteable tcpip.MultiCounterStat // ExhaustedTTL is the number of IP packets received which were dropped // because their TTL was exhausted. ExhaustedTTL tcpip.MultiCounterStat // InitializingSource is the number of IP packets which were dropped // because they contained a source address that may only be used on the local // network as part of initialization work. InitializingSource tcpip.MultiCounterStat // LinkLocalSource is the number of IP packets which were dropped // because they contained a link-local source address. LinkLocalSource tcpip.MultiCounterStat // LinkLocalDestination is the number of IP packets which were dropped // because they contained a link-local destination address. LinkLocalDestination tcpip.MultiCounterStat // PacketTooBig is the number of IP packets which were dropped because they // were too big for the outgoing MTU. PacketTooBig tcpip.MultiCounterStat // HostUnreachable is the number of IP packets received which could not be // successfully forwarded due to an unresolvable next hop. HostUnreachable tcpip.MultiCounterStat // ExtensionHeaderProblem is the number of IP packets which were dropped // because of a problem encountered when processing an IPv6 extension // header. ExtensionHeaderProblem tcpip.MultiCounterStat // UnexpectedMulticastInputInterface is the number of multicast packets that // were received on an interface that did not match the corresponding route's // expected input interface. UnexpectedMulticastInputInterface tcpip.MultiCounterStat // UnknownOutputEndpoint is the number of packets that could not be forwarded // because the output endpoint could not be found. UnknownOutputEndpoint tcpip.MultiCounterStat // NoMulticastPendingQueueBufferSpace is the number of multicast packets that // were dropped due to insufficient buffer space in the pending packet queue. NoMulticastPendingQueueBufferSpace tcpip.MultiCounterStat // OutgoingDeviceNoBufferSpace is the number of packets that were dropped due // to insufficient space in the outgoing device. OutgoingDeviceNoBufferSpace tcpip.MultiCounterStat // Errors is the number of IP packets received which could not be // successfully forwarded. Errors tcpip.MultiCounterStat } // Init sets internal counters to track a and b counters. 
func (m *MultiCounterIPForwardingStats) Init(a, b *tcpip.IPForwardingStats) { m.Unrouteable.Init(a.Unrouteable, b.Unrouteable) m.Errors.Init(a.Errors, b.Errors) m.InitializingSource.Init(a.InitializingSource, b.InitializingSource) m.LinkLocalSource.Init(a.LinkLocalSource, b.LinkLocalSource) m.LinkLocalDestination.Init(a.LinkLocalDestination, b.LinkLocalDestination) m.ExtensionHeaderProblem.Init(a.ExtensionHeaderProblem, b.ExtensionHeaderProblem) m.PacketTooBig.Init(a.PacketTooBig, b.PacketTooBig) m.ExhaustedTTL.Init(a.ExhaustedTTL, b.ExhaustedTTL) m.HostUnreachable.Init(a.HostUnreachable, b.HostUnreachable) m.UnexpectedMulticastInputInterface.Init(a.UnexpectedMulticastInputInterface, b.UnexpectedMulticastInputInterface) m.UnknownOutputEndpoint.Init(a.UnknownOutputEndpoint, b.UnknownOutputEndpoint) m.NoMulticastPendingQueueBufferSpace.Init(a.NoMulticastPendingQueueBufferSpace, b.NoMulticastPendingQueueBufferSpace) m.OutgoingDeviceNoBufferSpace.Init(a.OutgoingDeviceNoBufferSpace, b.OutgoingDeviceNoBufferSpace) } // LINT.ThenChange(:MultiCounterIPForwardingStats, ../../../tcpip.go:IPForwardingStats) // LINT.IfChange(MultiCounterIPStats) // MultiCounterIPStats holds IP statistics, each counter may have several // versions. // // +stateify savable type MultiCounterIPStats struct { // PacketsReceived is the number of IP packets received from the link // layer. PacketsReceived tcpip.MultiCounterStat // ValidPacketsReceived is the number of valid IP packets that reached the IP // layer. ValidPacketsReceived tcpip.MultiCounterStat // DisabledPacketsReceived is the number of IP packets received from // the link layer when the IP layer is disabled. DisabledPacketsReceived tcpip.MultiCounterStat // InvalidDestinationAddressesReceived is the number of IP packets // received with an unknown or invalid destination address. InvalidDestinationAddressesReceived tcpip.MultiCounterStat // InvalidSourceAddressesReceived is the number of IP packets received // with a source address that should never have been received on the // wire. InvalidSourceAddressesReceived tcpip.MultiCounterStat // PacketsDelivered is the number of incoming IP packets successfully // delivered to the transport layer. PacketsDelivered tcpip.MultiCounterStat // PacketsSent is the number of IP packets sent via WritePacket. PacketsSent tcpip.MultiCounterStat // OutgoingPacketErrors is the number of IP packets which failed to // write to a link-layer endpoint. OutgoingPacketErrors tcpip.MultiCounterStat // MalformedPacketsReceived is the number of IP Packets that were // dropped due to the IP packet header failing validation checks. MalformedPacketsReceived tcpip.MultiCounterStat // MalformedFragmentsReceived is the number of IP Fragments that were // dropped due to the fragment failing validation checks. MalformedFragmentsReceived tcpip.MultiCounterStat // IPTablesPreroutingDropped is the number of IP packets dropped in the // Prerouting chain. IPTablesPreroutingDropped tcpip.MultiCounterStat // IPTablesInputDropped is the number of IP packets dropped in the // Input chain. IPTablesInputDropped tcpip.MultiCounterStat // IPTablesForwardDropped is the number of IP packets dropped in the // Forward chain. IPTablesForwardDropped tcpip.MultiCounterStat // IPTablesOutputDropped is the number of IP packets dropped in the // Output chain. IPTablesOutputDropped tcpip.MultiCounterStat // IPTablesPostroutingDropped is the number of IP packets dropped in // the Postrouting chain. 
IPTablesPostroutingDropped tcpip.MultiCounterStat // TODO(https://gvisor.dev/issues/5529): Move the IPv4-only option // stats out of IPStats. // OptionTimestampReceived is the number of Timestamp options seen. OptionTimestampReceived tcpip.MultiCounterStat // OptionRecordRouteReceived is the number of Record Route options // seen. OptionRecordRouteReceived tcpip.MultiCounterStat // OptionRouterAlertReceived is the number of Router Alert options // seen. OptionRouterAlertReceived tcpip.MultiCounterStat // OptionUnknownReceived is the number of unknown IP options seen. OptionUnknownReceived tcpip.MultiCounterStat // Forwarding collects stats related to IP forwarding. Forwarding MultiCounterIPForwardingStats } // Init sets internal counters to track a and b counters. func (m *MultiCounterIPStats) Init(a, b *tcpip.IPStats) { m.PacketsReceived.Init(a.PacketsReceived, b.PacketsReceived) m.ValidPacketsReceived.Init(a.ValidPacketsReceived, b.ValidPacketsReceived) m.DisabledPacketsReceived.Init(a.DisabledPacketsReceived, b.DisabledPacketsReceived) m.InvalidDestinationAddressesReceived.Init(a.InvalidDestinationAddressesReceived, b.InvalidDestinationAddressesReceived) m.InvalidSourceAddressesReceived.Init(a.InvalidSourceAddressesReceived, b.InvalidSourceAddressesReceived) m.PacketsDelivered.Init(a.PacketsDelivered, b.PacketsDelivered) m.PacketsSent.Init(a.PacketsSent, b.PacketsSent) m.OutgoingPacketErrors.Init(a.OutgoingPacketErrors, b.OutgoingPacketErrors) m.MalformedPacketsReceived.Init(a.MalformedPacketsReceived, b.MalformedPacketsReceived) m.MalformedFragmentsReceived.Init(a.MalformedFragmentsReceived, b.MalformedFragmentsReceived) m.IPTablesPreroutingDropped.Init(a.IPTablesPreroutingDropped, b.IPTablesPreroutingDropped) m.IPTablesInputDropped.Init(a.IPTablesInputDropped, b.IPTablesInputDropped) m.IPTablesForwardDropped.Init(a.IPTablesForwardDropped, b.IPTablesForwardDropped) m.IPTablesOutputDropped.Init(a.IPTablesOutputDropped, b.IPTablesOutputDropped) m.IPTablesPostroutingDropped.Init(a.IPTablesPostroutingDropped, b.IPTablesPostroutingDropped) m.OptionTimestampReceived.Init(a.OptionTimestampReceived, b.OptionTimestampReceived) m.OptionRecordRouteReceived.Init(a.OptionRecordRouteReceived, b.OptionRecordRouteReceived) m.OptionRouterAlertReceived.Init(a.OptionRouterAlertReceived, b.OptionRouterAlertReceived) m.OptionUnknownReceived.Init(a.OptionUnknownReceived, b.OptionUnknownReceived) m.Forwarding.Init(&a.Forwarding, &b.Forwarding) } // LINT.ThenChange(:MultiCounterIPStats, ../../../tcpip.go:IPStats) golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/multicast/000077500000000000000000000000001465435605700260555ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/multicast/multicast_state_autogen.go000066400000000000000000000065251465435605700333430ustar00rootroot00000000000000// automatically generated by stateify. 
package multicast import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *RouteTable) StateTypeName() string { return "pkg/tcpip/network/internal/multicast.RouteTable" } func (r *RouteTable) StateFields() []string { return []string{ "installedRoutes", "pendingRoutes", "cleanupPendingRoutesTimer", "isCleanupRoutineRunning", "config", } } func (r *RouteTable) beforeSave() {} // +checklocksignore func (r *RouteTable) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.installedRoutes) stateSinkObject.Save(1, &r.pendingRoutes) stateSinkObject.Save(2, &r.cleanupPendingRoutesTimer) stateSinkObject.Save(3, &r.isCleanupRoutineRunning) stateSinkObject.Save(4, &r.config) } func (r *RouteTable) afterLoad(context.Context) {} // +checklocksignore func (r *RouteTable) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.installedRoutes) stateSourceObject.Load(1, &r.pendingRoutes) stateSourceObject.Load(2, &r.cleanupPendingRoutesTimer) stateSourceObject.Load(3, &r.isCleanupRoutineRunning) stateSourceObject.Load(4, &r.config) } func (r *InstalledRoute) StateTypeName() string { return "pkg/tcpip/network/internal/multicast.InstalledRoute" } func (r *InstalledRoute) StateFields() []string { return []string{ "MulticastRoute", "lastUsedTimestamp", } } func (r *InstalledRoute) beforeSave() {} // +checklocksignore func (r *InstalledRoute) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.MulticastRoute) stateSinkObject.Save(1, &r.lastUsedTimestamp) } func (r *InstalledRoute) afterLoad(context.Context) {} // +checklocksignore func (r *InstalledRoute) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.MulticastRoute) stateSourceObject.Load(1, &r.lastUsedTimestamp) } func (p *PendingRoute) StateTypeName() string { return "pkg/tcpip/network/internal/multicast.PendingRoute" } func (p *PendingRoute) StateFields() []string { return []string{ "packets", "expiration", } } func (p *PendingRoute) beforeSave() {} // +checklocksignore func (p *PendingRoute) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.packets) stateSinkObject.Save(1, &p.expiration) } func (p *PendingRoute) afterLoad(context.Context) {} // +checklocksignore func (p *PendingRoute) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.packets) stateSourceObject.Load(1, &p.expiration) } func (c *Config) StateTypeName() string { return "pkg/tcpip/network/internal/multicast.Config" } func (c *Config) StateFields() []string { return []string{ "MaxPendingQueueSize", "Clock", } } func (c *Config) beforeSave() {} // +checklocksignore func (c *Config) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.MaxPendingQueueSize) stateSinkObject.Save(1, &c.Clock) } func (c *Config) afterLoad(context.Context) {} // +checklocksignore func (c *Config) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.MaxPendingQueueSize) stateSourceObject.Load(1, &c.Clock) } func init() { state.Register((*RouteTable)(nil)) state.Register((*InstalledRoute)(nil)) state.Register((*PendingRoute)(nil)) state.Register((*Config)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/internal/multicast/route_table.go000066400000000000000000000343201465435605700307130ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package multicast contains utilities for supporting multicast routing. package multicast import ( "errors" "fmt" "sync" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // RouteTable represents a multicast routing table. // // +stateify savable type RouteTable struct { // Internally, installed and pending routes are stored and locked separately // A couple of reasons for structuring the table this way: // // 1. We can avoid write locking installed routes when pending packets are // being queued. In other words, the happy path of reading installed // routes doesn't require an exclusive lock. // 2. The cleanup process for expired routes only needs to operate on pending // routes. Like above, a write lock on the installed routes can be // avoided. // 3. This structure is similar to the Linux implementation: // https://github.com/torvalds/linux/blob/cffb2b72d3e/include/linux/mroute_base.h#L250 // The installedMu lock should typically be acquired before the pendingMu // lock. This ensures that installed routes can continue to be read even when // the pending routes are write locked. installedMu sync.RWMutex `state:"nosave"` // Maintaining pointers ensures that the installed routes are exclusively // locked only when a route is being installed. // +checklocks:installedMu installedRoutes map[stack.UnicastSourceAndMulticastDestination]*InstalledRoute pendingMu sync.RWMutex `state:"nosave"` // +checklocks:pendingMu pendingRoutes map[stack.UnicastSourceAndMulticastDestination]PendingRoute // cleanupPendingRoutesTimer is a timer that triggers a routine to remove // pending routes that are expired. // +checklocks:pendingMu cleanupPendingRoutesTimer tcpip.Timer // +checklocks:pendingMu isCleanupRoutineRunning bool config Config } var ( // ErrNoBufferSpace indicates that no buffer space is available in the // pending route packet queue. ErrNoBufferSpace = errors.New("unable to queue packet, no buffer space available") // ErrMissingClock indicates that a clock was not provided as part of the // Config, but is required. ErrMissingClock = errors.New("clock must not be nil") // ErrAlreadyInitialized indicates that RouteTable.Init was already invoked. ErrAlreadyInitialized = errors.New("table is already initialized") ) // InstalledRoute represents a route that is in the installed state. // // If a route is in the installed state, then it may be used to forward // multicast packets. // // +stateify savable type InstalledRoute struct { stack.MulticastRoute lastUsedTimestampMu sync.RWMutex `state:"nosave"` // +checklocks:lastUsedTimestampMu lastUsedTimestamp tcpip.MonotonicTime } // LastUsedTimestamp returns a monotonic timestamp that corresponds to the last // time the route was used or updated. func (r *InstalledRoute) LastUsedTimestamp() tcpip.MonotonicTime { r.lastUsedTimestampMu.RLock() defer r.lastUsedTimestampMu.RUnlock() return r.lastUsedTimestamp } // SetLastUsedTimestamp sets the time that the route was last used. 
// // The timestamp is only updated if it occurs after the currently set // timestamp. Callers should invoke this anytime the route is used to forward a // packet. func (r *InstalledRoute) SetLastUsedTimestamp(monotonicTime tcpip.MonotonicTime) { r.lastUsedTimestampMu.Lock() defer r.lastUsedTimestampMu.Unlock() if monotonicTime.After(r.lastUsedTimestamp) { r.lastUsedTimestamp = monotonicTime } } // PendingRoute represents a route that is in the "pending" state. // // A route is in the pending state if an installed route does not yet exist // for the entry. For such routes, packets are added to an expiring queue until // a route is installed. // // +stateify savable type PendingRoute struct { packets []*stack.PacketBuffer // expiration is the timestamp at which the pending route should be expired. // // If this value is before the current time, then this pending route will // be dropped. expiration tcpip.MonotonicTime } func (p *PendingRoute) releasePackets() { for _, pkt := range p.packets { pkt.DecRef() } } func (p *PendingRoute) isExpired(currentTime tcpip.MonotonicTime) bool { return currentTime.After(p.expiration) } const ( // DefaultMaxPendingQueueSize corresponds to the number of elements that can // be in the packet queue for a pending route. // // Matches the Linux default queue size: // https://github.com/torvalds/linux/blob/26291c54e11/net/ipv6/ip6mr.c#L1186 DefaultMaxPendingQueueSize uint8 = 3 // DefaultPendingRouteExpiration is the default maximum lifetime of a pending // route. // // Matches the Linux default: // https://github.com/torvalds/linux/blob/26291c54e11/net/ipv6/ip6mr.c#L991 DefaultPendingRouteExpiration time.Duration = 10 * time.Second // DefaultCleanupInterval is the default frequency of the routine that // expires pending routes. // // Matches the Linux default: // https://github.com/torvalds/linux/blob/26291c54e11/net/ipv6/ip6mr.c#L793 DefaultCleanupInterval time.Duration = 10 * time.Second ) // Config represents the options for configuring a RouteTable. // // +stateify savable type Config struct { // MaxPendingQueueSize corresponds to the maximum number of queued packets // for a pending route. // // If the caller attempts to queue a packet and the queue already contains // MaxPendingQueueSize elements, then the packet will be rejected and should // not be forwarded. MaxPendingQueueSize uint8 // Clock represents the clock that should be used to obtain the current time. // // This field is required and must have a non-nil value. Clock tcpip.Clock } // DefaultConfig returns the default configuration for the table. func DefaultConfig(clock tcpip.Clock) Config { return Config{ MaxPendingQueueSize: DefaultMaxPendingQueueSize, Clock: clock, } } // Init initializes the RouteTable with the provided config. // // An error is returned if the config is not valid. // // Must be called before any other function on the table. func (r *RouteTable) Init(config Config) error { r.installedMu.Lock() defer r.installedMu.Unlock() r.pendingMu.Lock() defer r.pendingMu.Unlock() if r.installedRoutes != nil { return ErrAlreadyInitialized } if config.Clock == nil { return ErrMissingClock } r.config = config r.installedRoutes = make(map[stack.UnicastSourceAndMulticastDestination]*InstalledRoute) r.pendingRoutes = make(map[stack.UnicastSourceAndMulticastDestination]PendingRoute) return nil } // Close cleans up resources held by the table. // // Calling this will stop the cleanup routine and release any packets owned by // the table. 
func (r *RouteTable) Close() { r.pendingMu.Lock() defer r.pendingMu.Unlock() if r.cleanupPendingRoutesTimer != nil { r.cleanupPendingRoutesTimer.Stop() } for key, route := range r.pendingRoutes { delete(r.pendingRoutes, key) route.releasePackets() } } // maybeStopCleanupRoutine stops the pending routes cleanup routine if no // pending routes exist. // // Returns true if the timer is not running. Otherwise, returns false. // // +checklocks:r.pendingMu func (r *RouteTable) maybeStopCleanupRoutineLocked() bool { if !r.isCleanupRoutineRunning { return true } if len(r.pendingRoutes) == 0 { r.cleanupPendingRoutesTimer.Stop() r.isCleanupRoutineRunning = false return true } return false } func (r *RouteTable) cleanupPendingRoutes() { currentTime := r.config.Clock.NowMonotonic() r.pendingMu.Lock() defer r.pendingMu.Unlock() for key, route := range r.pendingRoutes { if route.isExpired(currentTime) { delete(r.pendingRoutes, key) route.releasePackets() } } if stopped := r.maybeStopCleanupRoutineLocked(); !stopped { r.cleanupPendingRoutesTimer.Reset(DefaultCleanupInterval) } } func (r *RouteTable) newPendingRoute() PendingRoute { return PendingRoute{ packets: make([]*stack.PacketBuffer, 0, r.config.MaxPendingQueueSize), expiration: r.config.Clock.NowMonotonic().Add(DefaultPendingRouteExpiration), } } // NewInstalledRoute instantiates an installed route for the table. func (r *RouteTable) NewInstalledRoute(route stack.MulticastRoute) *InstalledRoute { return &InstalledRoute{ MulticastRoute: route, lastUsedTimestamp: r.config.Clock.NowMonotonic(), } } // GetRouteResult represents the result of calling GetRouteOrInsertPending. type GetRouteResult struct { // GetRouteResultState signals the result of calling GetRouteOrInsertPending. GetRouteResultState GetRouteResultState // InstalledRoute represents the existing installed route. This field will // only be populated if the GetRouteResultState is InstalledRouteFound. InstalledRoute *InstalledRoute } // GetRouteResultState signals the result of calling GetRouteOrInsertPending. type GetRouteResultState uint8 const ( // InstalledRouteFound indicates that an InstalledRoute was found. InstalledRouteFound GetRouteResultState = iota // PacketQueuedInPendingRoute indicates that the packet was queued in an // existing pending route. PacketQueuedInPendingRoute // NoRouteFoundAndPendingInserted indicates that no route was found and that // a pending route was newly inserted into the RouteTable. NoRouteFoundAndPendingInserted ) func (e GetRouteResultState) String() string { switch e { case InstalledRouteFound: return "InstalledRouteFound" case PacketQueuedInPendingRoute: return "PacketQueuedInPendingRoute" case NoRouteFoundAndPendingInserted: return "NoRouteFoundAndPendingInserted" default: return fmt.Sprintf("%d", uint8(e)) } } // GetRouteOrInsertPending attempts to fetch the installed route that matches // the provided key. // // If no matching installed route is found, then the pkt is cloned and queued // in a pending route. The GetRouteResult.GetRouteResultState will indicate // whether the pkt was queued in a new pending route or an existing one. // // If the relevant pending route queue is at max capacity, then returns false. // Otherwise, returns true. 
func (r *RouteTable) GetRouteOrInsertPending(key stack.UnicastSourceAndMulticastDestination, pkt *stack.PacketBuffer) (GetRouteResult, bool) { r.installedMu.RLock() defer r.installedMu.RUnlock() if route, ok := r.installedRoutes[key]; ok { return GetRouteResult{GetRouteResultState: InstalledRouteFound, InstalledRoute: route}, true } r.pendingMu.Lock() defer r.pendingMu.Unlock() pendingRoute, getRouteResultState := r.getOrCreatePendingRouteRLocked(key) if len(pendingRoute.packets) >= int(r.config.MaxPendingQueueSize) { // The incoming packet is rejected if the pending queue is already at max // capacity. This behavior matches the Linux implementation: // https://github.com/torvalds/linux/blob/ae085d7f936/net/ipv4/ipmr.c#L1147 return GetRouteResult{}, false } pendingRoute.packets = append(pendingRoute.packets, pkt.Clone()) r.pendingRoutes[key] = pendingRoute if !r.isCleanupRoutineRunning { // The cleanup routine isn't running, but should be. Start it. if r.cleanupPendingRoutesTimer == nil { r.cleanupPendingRoutesTimer = r.config.Clock.AfterFunc(DefaultCleanupInterval, r.cleanupPendingRoutes) } else { r.cleanupPendingRoutesTimer.Reset(DefaultCleanupInterval) } r.isCleanupRoutineRunning = true } return GetRouteResult{GetRouteResultState: getRouteResultState, InstalledRoute: nil}, true } // +checklocks:r.pendingMu func (r *RouteTable) getOrCreatePendingRouteRLocked(key stack.UnicastSourceAndMulticastDestination) (PendingRoute, GetRouteResultState) { if pendingRoute, ok := r.pendingRoutes[key]; ok { return pendingRoute, PacketQueuedInPendingRoute } return r.newPendingRoute(), NoRouteFoundAndPendingInserted } // AddInstalledRoute adds the provided route to the table. // // Packets that were queued while the route was in the pending state are // returned. The caller assumes ownership of these packets and is responsible // for forwarding and releasing them. If an installed route already exists for // the provided key, then it is overwritten. func (r *RouteTable) AddInstalledRoute(key stack.UnicastSourceAndMulticastDestination, route *InstalledRoute) []*stack.PacketBuffer { r.installedMu.Lock() defer r.installedMu.Unlock() r.installedRoutes[key] = route r.pendingMu.Lock() pendingRoute, ok := r.pendingRoutes[key] delete(r.pendingRoutes, key) // No need to reset the timer here. The cleanup routine is responsible for // doing so. _ = r.maybeStopCleanupRoutineLocked() r.pendingMu.Unlock() // Ignore the pending route if it is expired. It may be in this state since // the cleanup process is only run periodically. if !ok || pendingRoute.isExpired(r.config.Clock.NowMonotonic()) { pendingRoute.releasePackets() return nil } return pendingRoute.packets } // RemoveInstalledRoute deletes any installed route that matches the provided // key. // // Returns true if a route was removed. Otherwise returns false. func (r *RouteTable) RemoveInstalledRoute(key stack.UnicastSourceAndMulticastDestination) bool { r.installedMu.Lock() defer r.installedMu.Unlock() if _, ok := r.installedRoutes[key]; ok { delete(r.installedRoutes, key) return true } return false } // RemoveAllInstalledRoutes removes all installed routes from the table. func (r *RouteTable) RemoveAllInstalledRoutes() { r.installedMu.Lock() defer r.installedMu.Unlock() for key := range r.installedRoutes { delete(r.installedRoutes, key) } } // GetLastUsedTimestamp returns a monotonic timestamp that represents the last // time the route that matches the provided key was used or updated. // // Returns true if a matching route was found. Otherwise returns false. 
func (r *RouteTable) GetLastUsedTimestamp(key stack.UnicastSourceAndMulticastDestination) (tcpip.MonotonicTime, bool) { r.installedMu.RLock() defer r.installedMu.RUnlock() if route, ok := r.installedRoutes[key]; ok { return route.LastUsedTimestamp(), true } return tcpip.MonotonicTime{}, false } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv4/000077500000000000000000000000001465435605700231165ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv4/icmp.go000066400000000000000000000717121465435605700244050ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ipv4 import ( "fmt" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checksum" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // icmpv4DestinationUnreachableSockError is a general ICMPv4 Destination // Unreachable error. // // +stateify savable type icmpv4DestinationUnreachableSockError struct{} // Origin implements tcpip.SockErrorCause. func (*icmpv4DestinationUnreachableSockError) Origin() tcpip.SockErrOrigin { return tcpip.SockExtErrorOriginICMP } // Type implements tcpip.SockErrorCause. func (*icmpv4DestinationUnreachableSockError) Type() uint8 { return uint8(header.ICMPv4DstUnreachable) } // Info implements tcpip.SockErrorCause. func (*icmpv4DestinationUnreachableSockError) Info() uint32 { return 0 } var _ stack.TransportError = (*icmpv4DestinationHostUnreachableSockError)(nil) // icmpv4DestinationHostUnreachableSockError is an ICMPv4 Destination Host // Unreachable error. // // It indicates that a packet was not able to reach the destination host. // // +stateify savable type icmpv4DestinationHostUnreachableSockError struct { icmpv4DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv4DestinationHostUnreachableSockError) Code() uint8 { return uint8(header.ICMPv4HostUnreachable) } // Kind implements stack.TransportError. func (*icmpv4DestinationHostUnreachableSockError) Kind() stack.TransportErrorKind { return stack.DestinationHostUnreachableTransportError } var _ stack.TransportError = (*icmpv4DestinationNetUnreachableSockError)(nil) // icmpv4DestinationNetUnreachableSockError is an ICMPv4 Destination Net // Unreachable error. // // It indicates that a packet was not able to reach the destination network. // // +stateify savable type icmpv4DestinationNetUnreachableSockError struct { icmpv4DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv4DestinationNetUnreachableSockError) Code() uint8 { return uint8(header.ICMPv4NetUnreachable) } // Kind implements stack.TransportError. 
func (*icmpv4DestinationNetUnreachableSockError) Kind() stack.TransportErrorKind { return stack.DestinationNetworkUnreachableTransportError } var _ stack.TransportError = (*icmpv4DestinationPortUnreachableSockError)(nil) // icmpv4DestinationPortUnreachableSockError is an ICMPv4 Destination Port // Unreachable error. // // It indicates that a packet reached the destination host, but the transport // protocol was not active on the destination port. // // +stateify savable type icmpv4DestinationPortUnreachableSockError struct { icmpv4DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv4DestinationPortUnreachableSockError) Code() uint8 { return uint8(header.ICMPv4PortUnreachable) } // Kind implements stack.TransportError. func (*icmpv4DestinationPortUnreachableSockError) Kind() stack.TransportErrorKind { return stack.DestinationPortUnreachableTransportError } var _ stack.TransportError = (*icmpv4DestinationProtoUnreachableSockError)(nil) // icmpv4DestinationProtoUnreachableSockError is an ICMPv4 Destination Protocol // Unreachable error. // // It indicates that a packet reached the destination host, but the transport // protocol was not reachable // // +stateify savable type icmpv4DestinationProtoUnreachableSockError struct { icmpv4DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv4DestinationProtoUnreachableSockError) Code() uint8 { return uint8(header.ICMPv4ProtoUnreachable) } // Kind implements stack.TransportError. func (*icmpv4DestinationProtoUnreachableSockError) Kind() stack.TransportErrorKind { return stack.DestinationProtoUnreachableTransportError } var _ stack.TransportError = (*icmpv4SourceRouteFailedSockError)(nil) // icmpv4SourceRouteFailedSockError is an ICMPv4 Destination Unreachable error // due to source route failed. // // +stateify savable type icmpv4SourceRouteFailedSockError struct { icmpv4DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv4SourceRouteFailedSockError) Code() uint8 { return uint8(header.ICMPv4SourceRouteFailed) } // Kind implements stack.TransportError. func (*icmpv4SourceRouteFailedSockError) Kind() stack.TransportErrorKind { return stack.SourceRouteFailedTransportError } var _ stack.TransportError = (*icmpv4SourceHostIsolatedSockError)(nil) // icmpv4SourceHostIsolatedSockError is an ICMPv4 Destination Unreachable error // due to source host isolated (not on the network). // // +stateify savable type icmpv4SourceHostIsolatedSockError struct { icmpv4DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv4SourceHostIsolatedSockError) Code() uint8 { return uint8(header.ICMPv4SourceHostIsolated) } // Kind implements stack.TransportError. func (*icmpv4SourceHostIsolatedSockError) Kind() stack.TransportErrorKind { return stack.SourceHostIsolatedTransportError } var _ stack.TransportError = (*icmpv4DestinationHostUnknownSockError)(nil) // icmpv4DestinationHostUnknownSockError is an ICMPv4 Destination Unreachable // error due to destination host unknown/down. // // +stateify savable type icmpv4DestinationHostUnknownSockError struct { icmpv4DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv4DestinationHostUnknownSockError) Code() uint8 { return uint8(header.ICMPv4DestinationHostUnknown) } // Kind implements stack.TransportError. 
func (*icmpv4DestinationHostUnknownSockError) Kind() stack.TransportErrorKind { return stack.DestinationHostDownTransportError } var _ stack.TransportError = (*icmpv4FragmentationNeededSockError)(nil) // icmpv4FragmentationNeededSockError is an ICMPv4 Destination Unreachable error // due to fragmentation being required but the packet was set to not be // fragmented. // // It indicates that a link exists on the path to the destination with an MTU // that is too small to carry the packet. // // +stateify savable type icmpv4FragmentationNeededSockError struct { icmpv4DestinationUnreachableSockError mtu uint32 } // Code implements tcpip.SockErrorCause. func (*icmpv4FragmentationNeededSockError) Code() uint8 { return uint8(header.ICMPv4FragmentationNeeded) } // Info implements tcpip.SockErrorCause. func (e *icmpv4FragmentationNeededSockError) Info() uint32 { return e.mtu } // Kind implements stack.TransportError. func (*icmpv4FragmentationNeededSockError) Kind() stack.TransportErrorKind { return stack.PacketTooBigTransportError } func (e *endpoint) checkLocalAddress(addr tcpip.Address) bool { if e.nic.Spoofing() { return true } if addressEndpoint := e.AcquireAssignedAddress(addr, false, stack.NeverPrimaryEndpoint, true /* readOnly */); addressEndpoint != nil { return true } return false } // handleControl handles the case when an ICMP error packet contains the headers // of the original packet that caused the ICMP one to be sent. This information // is used to find out which transport endpoint must be notified about the ICMP // packet. We only expect the payload, not the enclosing ICMP packet. func (e *endpoint) handleControl(errInfo stack.TransportError, pkt *stack.PacketBuffer) { h, ok := pkt.Data().PullUp(header.IPv4MinimumSize) if !ok { return } hdr := header.IPv4(h) // We don't use IsValid() here because ICMP only requires that the IP // header plus 8 bytes of the transport header be included. So it's // likely that it is truncated, which would cause IsValid to return // false. // // Drop packet if it doesn't have the basic IPv4 header or if the // original source address doesn't match an address we own. srcAddr := hdr.SourceAddress() if !e.checkLocalAddress(srcAddr) { return } hlen := int(hdr.HeaderLength()) if pkt.Data().Size() < hlen || hdr.FragmentOffset() != 0 { // We won't be able to handle this if it doesn't contain the // full IPv4 header, or if it's a fragment not at offset 0 // (because it won't have the transport header). return } // Keep needed information before trimming header. p := hdr.TransportProtocol() dstAddr := hdr.DestinationAddress() // Skip the ip header, then deliver the error. if _, ok := pkt.Data().Consume(hlen); !ok { panic(fmt.Sprintf("could not consume the IP header of %d bytes", hlen)) } e.dispatcher.DeliverTransportError(srcAddr, dstAddr, ProtocolNumber, p, errInfo, pkt) } func (e *endpoint) handleICMP(pkt *stack.PacketBuffer) { received := e.stats.icmp.packetsReceived h := header.ICMPv4(pkt.TransportHeader().Slice()) if len(h) < header.ICMPv4MinimumSize { received.invalid.Increment() return } // Only do in-stack processing if the checksum is correct. if checksum.Checksum(h, pkt.Data().Checksum()) != 0xffff { received.invalid.Increment() // It's possible that a raw socket expects to receive this regardless // of checksum errors. If it's an echo request we know it's safe because // we are the only handler, however other types do not cope well with // packets with checksum errors. 
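// The validation just above relies on the RFC 1071 internet checksum: summing
// the whole ICMP message, including its checksum field, in ones'-complement
// arithmetic must yield 0xffff. A self-contained, stdlib-only sketch of that
// property (gVisor's checksum package performs the equivalent folding):
package main

import "fmt"

// onesComplementSum folds data into a 16-bit ones'-complement sum.
func onesComplementSum(data []byte) uint16 {
	var sum uint32
	for i := 0; i+1 < len(data); i += 2 {
		sum += uint32(data[i])<<8 | uint32(data[i+1])
	}
	if len(data)%2 == 1 {
		sum += uint32(data[len(data)-1]) << 8 // pad the trailing odd byte with zero
	}
	for sum>>16 != 0 {
		sum = sum&0xffff + sum>>16 // fold carries back into the low 16 bits
	}
	return uint16(sum)
}

func main() {
	// ICMPv4 echo request header: type=8, code=0, checksum=0, id=1, seq=1.
	msg := []byte{8, 0, 0, 0, 0, 1, 0, 1}
	csum := ^onesComplementSum(msg) // the checksum is the complement of the sum
	msg[2], msg[3] = byte(csum>>8), byte(csum)
	// Receivers re-sum the message with the checksum in place and expect 0xffff.
	fmt.Printf("%#x\n", onesComplementSum(msg)) // 0xffff
}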
switch h.Type() { case header.ICMPv4Echo: e.dispatcher.DeliverTransportPacket(header.ICMPv4ProtocolNumber, pkt) } return } iph := header.IPv4(pkt.NetworkHeader().Slice()) var newOptions header.IPv4Options if opts := iph.Options(); len(opts) != 0 { // RFC 1122 section 3.2.2.6 (page 43) (and similar for other round trip // type ICMP packets): // If a Record Route and/or Time Stamp option is received in an // ICMP Echo Request, this option (these options) SHOULD be // updated to include the current host and included in the IP // header of the Echo Reply message, without "truncation". // Thus, the recorded route will be for the entire round trip. // // So we need to let the option processor know how it should handle them. var op optionsUsage if h.Type() == header.ICMPv4Echo { op = &optionUsageEcho{} } else { op = &optionUsageReceive{} } var optProblem *header.IPv4OptParameterProblem newOptions, _, optProblem = e.processIPOptions(pkt, opts, op) if optProblem != nil { if optProblem.NeedICMP { _ = e.protocol.returnError(&icmpReasonParamProblem{ pointer: optProblem.Pointer, }, pkt, true /* deliveredLocally */) e.stats.ip.MalformedPacketsReceived.Increment() } return } copied := copy(opts, newOptions) if copied != len(newOptions) { panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOptions))) } for i := copied; i < len(opts); i++ { // Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero". opts[i] = byte(header.IPv4OptionListEndType) } } // TODO(b/112892170): Meaningfully handle all ICMP types. switch h.Type() { case header.ICMPv4Echo: received.echoRequest.Increment() // DeliverTransportPacket may modify pkt so don't use it beyond // this point. Make a deep copy of the data before pkt gets sent as we will // be modifying fields. Both the ICMP header (with its type modified to // EchoReply) and payload are reused in the reply packet. // // TODO(gvisor.dev/issue/4399): The copy may not be needed if there are no // waiting endpoints. Consider moving responsibility for doing the copy to // DeliverTransportPacket so that is is only done when needed. replyData := stack.PayloadSince(pkt.TransportHeader()) defer replyData.Release() ipHdr := header.IPv4(pkt.NetworkHeader().Slice()) localAddressBroadcast := pkt.NetworkPacketInfo.LocalAddressBroadcast // It's possible that a raw socket expects to receive this. e.dispatcher.DeliverTransportPacket(header.ICMPv4ProtocolNumber, pkt) pkt = nil sent := e.stats.icmp.packetsSent if !e.protocol.allowICMPReply(header.ICMPv4EchoReply, header.ICMPv4UnusedCode) { sent.rateLimited.Increment() return } // As per RFC 1122 section 3.2.1.3, when a host sends any datagram, the IP // source address MUST be one of its own IP addresses (but not a broadcast // or multicast address). localAddr := ipHdr.DestinationAddress() if localAddressBroadcast || header.IsV4MulticastAddress(localAddr) { localAddr = tcpip.Address{} } r, err := e.protocol.stack.FindRoute(e.nic.ID(), localAddr, ipHdr.SourceAddress(), ProtocolNumber, false /* multicastLoop */) if err != nil { // If we cannot find a route to the destination, silently drop the packet. return } defer r.Release() outgoingEP, ok := e.protocol.getEndpointForNIC(r.NICID()) if !ok { // The outgoing NIC went away. sent.dropped.Increment() return } // Because IP and ICMP are so closely intertwined, we need to handcraft our // IP header to be able to follow RFC 792. 
The wording on page 13 is as // follows: // IP Fields: // Addresses // The address of the source in an echo message will be the // destination of the echo reply message. To form an echo reply // message, the source and destination addresses are simply reversed, // the type code changed to 0, and the checksum recomputed. // // This was interpreted by early implementors to mean that all options must // be copied from the echo request IP header to the echo reply IP header // and this behaviour is still relied upon by some applications. // // Create a copy of the IP header we received, options and all, and change // The fields we need to alter. // // We need to produce the entire packet in the data segment in order to // use WriteHeaderIncludedPacket(). WriteHeaderIncludedPacket sets the // total length and the header checksum so we don't need to set those here. // // Take the base of the incoming request IP header but replace the options. replyHeaderLength := uint8(header.IPv4MinimumSize + len(newOptions)) replyIPHdrView := buffer.NewView(int(replyHeaderLength)) replyIPHdrView.Write(iph[:header.IPv4MinimumSize]) replyIPHdrView.Write(newOptions) replyIPHdr := header.IPv4(replyIPHdrView.AsSlice()) replyIPHdr.SetHeaderLength(replyHeaderLength) replyIPHdr.SetSourceAddress(r.LocalAddress()) replyIPHdr.SetDestinationAddress(r.RemoteAddress()) replyIPHdr.SetTTL(r.DefaultTTL()) replyIPHdr.SetTotalLength(uint16(len(replyIPHdr) + len(replyData.AsSlice()))) replyIPHdr.SetChecksum(0) replyIPHdr.SetChecksum(^replyIPHdr.CalculateChecksum()) replyICMPHdr := header.ICMPv4(replyData.AsSlice()) replyICMPHdr.SetType(header.ICMPv4EchoReply) replyICMPHdr.SetChecksum(0) replyICMPHdr.SetChecksum(^checksum.Checksum(replyData.AsSlice(), 0)) replyBuf := buffer.MakeWithView(replyIPHdrView) replyBuf.Append(replyData.Clone()) replyPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()), Payload: replyBuf, }) defer replyPkt.DecRef() // Populate the network/transport headers in the packet buffer so the // ICMP packet goes through IPTables. if ok := parse.IPv4(replyPkt); !ok { panic("expected to parse IPv4 header we just created") } if ok := parse.ICMPv4(replyPkt); !ok { panic("expected to parse ICMPv4 header we just created") } if err := outgoingEP.writePacket(r, replyPkt); err != nil { sent.dropped.Increment() return } sent.echoReply.Increment() case header.ICMPv4EchoReply: received.echoReply.Increment() // ICMP sockets expect the ICMP header to be present, so we don't consume // the ICMP header. 
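// The reply construction above copies the request's IP header (options and
// all), points it back at the sender, and recomputes both checksums. A
// condensed sketch of just the IP header rewrite, using only accessors that
// appear in this file; buffer, option and route handling are omitted and the
// function name is illustrative.
package example

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

// buildEchoReplyIPHeader clones the request header, reverses its addresses,
// and refreshes the length and header checksum fields.
func buildEchoReplyIPHeader(req header.IPv4, local, remote tcpip.Address, ttl uint8, payloadLen int) header.IPv4 {
	reply := header.IPv4(append([]byte(nil), req[:req.HeaderLength()]...))
	reply.SetSourceAddress(local)       // our address: the request's destination
	reply.SetDestinationAddress(remote) // back to the request's source
	reply.SetTTL(ttl)
	reply.SetTotalLength(uint16(len(reply) + payloadLen))
	reply.SetChecksum(0)
	reply.SetChecksum(^reply.CalculateChecksum())
	return reply
}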
e.dispatcher.DeliverTransportPacket(header.ICMPv4ProtocolNumber, pkt) case header.ICMPv4DstUnreachable: received.dstUnreachable.Increment() mtu := h.MTU() code := h.Code() switch code { case header.ICMPv4NetUnreachable, header.ICMPv4DestinationNetworkUnknown, header.ICMPv4NetUnreachableForTos, header.ICMPv4NetProhibited: e.handleControl(&icmpv4DestinationNetUnreachableSockError{}, pkt) case header.ICMPv4HostUnreachable, header.ICMPv4HostProhibited, header.ICMPv4AdminProhibited, header.ICMPv4HostUnreachableForTos, header.ICMPv4HostPrecedenceViolation, header.ICMPv4PrecedenceCutInEffect: e.handleControl(&icmpv4DestinationHostUnreachableSockError{}, pkt) case header.ICMPv4PortUnreachable: e.handleControl(&icmpv4DestinationPortUnreachableSockError{}, pkt) case header.ICMPv4FragmentationNeeded: networkMTU, err := calculateNetworkMTU(uint32(mtu), header.IPv4MinimumSize) if err != nil { networkMTU = 0 } e.handleControl(&icmpv4FragmentationNeededSockError{mtu: networkMTU}, pkt) case header.ICMPv4ProtoUnreachable: e.handleControl(&icmpv4DestinationProtoUnreachableSockError{}, pkt) case header.ICMPv4SourceRouteFailed: e.handleControl(&icmpv4SourceRouteFailedSockError{}, pkt) case header.ICMPv4SourceHostIsolated: e.handleControl(&icmpv4SourceHostIsolatedSockError{}, pkt) case header.ICMPv4DestinationHostUnknown: e.handleControl(&icmpv4DestinationHostUnknownSockError{}, pkt) } case header.ICMPv4SrcQuench: received.srcQuench.Increment() case header.ICMPv4Redirect: received.redirect.Increment() case header.ICMPv4TimeExceeded: received.timeExceeded.Increment() case header.ICMPv4ParamProblem: received.paramProblem.Increment() case header.ICMPv4Timestamp: received.timestamp.Increment() case header.ICMPv4TimestampReply: received.timestampReply.Increment() case header.ICMPv4InfoRequest: received.infoRequest.Increment() case header.ICMPv4InfoReply: received.infoReply.Increment() default: received.invalid.Increment() } } // ======= ICMP Error packet generation ========= // icmpReason is a marker interface for IPv4 specific ICMP errors. type icmpReason interface { isICMPReason() } // icmpReasonNetworkProhibited is an error where the destination network is // prohibited. type icmpReasonNetworkProhibited struct{} func (*icmpReasonNetworkProhibited) isICMPReason() {} // icmpReasonHostProhibited is an error where the destination host is // prohibited. type icmpReasonHostProhibited struct{} func (*icmpReasonHostProhibited) isICMPReason() {} // icmpReasonAdministrativelyProhibited is an error where the destination is // administratively prohibited. type icmpReasonAdministrativelyProhibited struct{} func (*icmpReasonAdministrativelyProhibited) isICMPReason() {} // icmpReasonPortUnreachable is an error where the transport protocol has no // listener and no alternative means to inform the sender. type icmpReasonPortUnreachable struct{} func (*icmpReasonPortUnreachable) isICMPReason() {} // icmpReasonProtoUnreachable is an error where the transport protocol is // not supported. type icmpReasonProtoUnreachable struct{} func (*icmpReasonProtoUnreachable) isICMPReason() {} // icmpReasonTTLExceeded is an error where a packet's time to live exceeded in // transit to its final destination, as per RFC 792 page 6, Time Exceeded // Message. 
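// The error path below keys off these marker types with a type switch to pick
// the ICMP type/code pair to emit. A toy, stdlib-only version of that pattern;
// the numeric values are the standard ICMPv4 assignments, while the local type
// names are illustrative rather than the ones used in this file:
package main

import "fmt"

type reason interface{ isReason() }

type reasonPortUnreachable struct{}
type reasonTTLExceeded struct{}

func (reasonPortUnreachable) isReason() {}
func (reasonTTLExceeded) isReason()     {}

// classify maps a reason to an (ICMP type, ICMP code) pair.
func classify(r reason) (uint8, uint8) {
	switch r.(type) {
	case reasonPortUnreachable:
		return 3, 3 // Destination Unreachable / Port Unreachable
	case reasonTTLExceeded:
		return 11, 0 // Time Exceeded / TTL exceeded in transit
	default:
		panic(fmt.Sprintf("unsupported reason %T", r))
	}
}

func main() {
	t, c := classify(reasonTTLExceeded{})
	fmt.Println(t, c) // 11 0
}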
type icmpReasonTTLExceeded struct{} func (*icmpReasonTTLExceeded) isICMPReason() {} // icmpReasonReassemblyTimeout is an error where insufficient fragments are // received to complete reassembly of a packet within a configured time after // the reception of the first-arriving fragment of that packet. type icmpReasonReassemblyTimeout struct{} func (*icmpReasonReassemblyTimeout) isICMPReason() {} // icmpReasonParamProblem is an error to use to request a Parameter Problem // message to be sent. type icmpReasonParamProblem struct { pointer byte } func (*icmpReasonParamProblem) isICMPReason() {} // icmpReasonNetworkUnreachable is an error in which the network specified in // the internet destination field of the datagram is unreachable. type icmpReasonNetworkUnreachable struct{} func (*icmpReasonNetworkUnreachable) isICMPReason() {} // icmpReasonFragmentationNeeded is an error where a packet requires // fragmentation while also having the Don't Fragment flag set, as per RFC 792 // page 3, Destination Unreachable Message. type icmpReasonFragmentationNeeded struct{} func (*icmpReasonFragmentationNeeded) isICMPReason() {} // icmpReasonHostUnreachable is an error in which the host specified in the // internet destination field of the datagram is unreachable. type icmpReasonHostUnreachable struct{} func (*icmpReasonHostUnreachable) isICMPReason() {} // returnError takes an error descriptor and generates the appropriate ICMP // error packet for IPv4 and sends it back to the remote device that sent // the problematic packet. It incorporates as much of that packet as // possible as well as any error metadata as is available. returnError // expects pkt to hold a valid IPv4 packet as per the wire format. func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer, deliveredLocally bool) tcpip.Error { origIPHdr := header.IPv4(pkt.NetworkHeader().Slice()) origIPHdrSrc := origIPHdr.SourceAddress() origIPHdrDst := origIPHdr.DestinationAddress() // We check we are responding only when we are allowed to. // See RFC 1812 section 4.3.2.7 (shown below). // // ========= // 4.3.2.7 When Not to Send ICMP Errors // // An ICMP error message MUST NOT be sent as the result of receiving: // // o An ICMP error message, or // // o A packet which fails the IP header validation tests described in // Section [5.2.2] (except where that section specifically permits // the sending of an ICMP error message), or // // o A packet destined to an IP broadcast or IP multicast address, or // // o A packet sent as a Link Layer broadcast or multicast, or // // o Any fragment of a datagram other then the first fragment (i.e., a // packet for which the fragment offset in the IP header is nonzero). // // TODO(gvisor.dev/issues/4058): Make sure we don't send ICMP errors in // response to a non-initial fragment, but it currently can not happen. if pkt.NetworkPacketInfo.LocalAddressBroadcast || header.IsV4MulticastAddress(origIPHdrDst) || origIPHdrSrc == header.IPv4Any { return nil } // If the packet wasn't delivered locally, do not use the packet's destination // address as the response's source address as we should not not own the // destination address of a packet we are forwarding. localAddr := origIPHdrDst if !deliveredLocally { localAddr = tcpip.Address{} } // Even if we were able to receive a packet from some remote, we may not have // a route to it - the remote may be blocked via routing rules. We must always // consult our routing table and find a route to the remote before sending any // packet. 
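// A compact predicate capturing the RFC 1812 "when not to send" rules enforced
// above, with the don't-answer-ICMP-errors check applied a little further down
// folded in as a boolean. The parameter names are illustrative; this is a
// stdlib-only sketch, not the stack's own check.
package main

import "fmt"

func icmpErrorAllowed(dstIsBroadcast, dstIsMulticast, srcIsUnspecified, triggerIsICMPError bool, fragmentOffset uint16) bool {
	if dstIsBroadcast || dstIsMulticast || srcIsUnspecified {
		return false // never answer broadcast/multicast destinations or unspecified sources
	}
	if fragmentOffset != 0 {
		return false // only the first fragment may elicit an error
	}
	return !triggerIsICMPError // never answer another ICMP error
}

func main() {
	fmt.Println(icmpErrorAllowed(false, false, false, false, 0)) // true
	fmt.Println(icmpErrorAllowed(false, true, false, false, 0))  // false: multicast destination
}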
route, err := p.stack.FindRoute(pkt.NICID, localAddr, origIPHdrSrc, ProtocolNumber, false /* multicastLoop */) if err != nil { return err } defer route.Release() p.mu.Lock() // We retrieve an endpoint using the newly constructed route's NICID rather // than the packet's NICID. The packet's NICID corresponds to the NIC on // which it arrived, which isn't necessarily the same as the NIC on which it // will be transmitted. On the other hand, the route's NIC *is* guaranteed // to be the NIC on which the packet will be transmitted. netEP, ok := p.eps[route.NICID()] p.mu.Unlock() if !ok { return &tcpip.ErrNotConnected{} } transportHeader := pkt.TransportHeader().Slice() // Don't respond to icmp error packets. if origIPHdr.Protocol() == uint8(header.ICMPv4ProtocolNumber) { // We need to decide to explicitly name the packets we can respond to or // the ones we can not respond to. The decision is somewhat arbitrary and // if problems arise this could be reversed. It was judged less of a breach // of protocol to not respond to unknown non-error packets than to respond // to unknown error packets so we take the first approach. if len(transportHeader) < header.ICMPv4MinimumSize { // The packet is malformed. return nil } switch header.ICMPv4(transportHeader).Type() { case header.ICMPv4EchoReply, header.ICMPv4Echo, header.ICMPv4Timestamp, header.ICMPv4TimestampReply, header.ICMPv4InfoRequest, header.ICMPv4InfoReply: default: // Assume any type we don't know about may be an error type. return nil } } sent := netEP.stats.icmp.packetsSent icmpType, icmpCode, counter, pointer := func() (header.ICMPv4Type, header.ICMPv4Code, tcpip.MultiCounterStat, byte) { switch reason := reason.(type) { case *icmpReasonNetworkProhibited: return header.ICMPv4DstUnreachable, header.ICMPv4NetProhibited, sent.dstUnreachable, 0 case *icmpReasonHostProhibited: return header.ICMPv4DstUnreachable, header.ICMPv4HostProhibited, sent.dstUnreachable, 0 case *icmpReasonAdministrativelyProhibited: return header.ICMPv4DstUnreachable, header.ICMPv4AdminProhibited, sent.dstUnreachable, 0 case *icmpReasonPortUnreachable: return header.ICMPv4DstUnreachable, header.ICMPv4PortUnreachable, sent.dstUnreachable, 0 case *icmpReasonProtoUnreachable: return header.ICMPv4DstUnreachable, header.ICMPv4ProtoUnreachable, sent.dstUnreachable, 0 case *icmpReasonNetworkUnreachable: return header.ICMPv4DstUnreachable, header.ICMPv4NetUnreachable, sent.dstUnreachable, 0 case *icmpReasonHostUnreachable: return header.ICMPv4DstUnreachable, header.ICMPv4HostUnreachable, sent.dstUnreachable, 0 case *icmpReasonFragmentationNeeded: return header.ICMPv4DstUnreachable, header.ICMPv4FragmentationNeeded, sent.dstUnreachable, 0 case *icmpReasonTTLExceeded: return header.ICMPv4TimeExceeded, header.ICMPv4TTLExceeded, sent.timeExceeded, 0 case *icmpReasonReassemblyTimeout: return header.ICMPv4TimeExceeded, header.ICMPv4ReassemblyTimeout, sent.timeExceeded, 0 case *icmpReasonParamProblem: return header.ICMPv4ParamProblem, header.ICMPv4UnusedCode, sent.paramProblem, reason.pointer default: panic(fmt.Sprintf("unsupported ICMP type %T", reason)) } }() if !p.allowICMPReply(icmpType, icmpCode) { sent.rateLimited.Increment() return nil } // Now work out how much of the triggering packet we should return. // As per RFC 1812 Section 4.3.2.3 // // ICMP datagram SHOULD contain as much of the original // datagram as possible without the length of the ICMP // datagram exceeding 576 bytes. 
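// The clamping that follows keeps the generated error inside the 576-byte
// minimum reassembly size: IP header (20) + ICMP header (8) + quoted payload
// must not exceed 576 even when the route MTU would allow more. A stdlib-only
// sketch of the same arithmetic, assuming ICMPv4MinimumErrorPayloadSize is the
// 8-byte minimum quoted below:
package main

import "fmt"

const (
	ipv4MinimumSize            = 20  // bytes in a minimal IPv4 header
	icmpv4MinimumSize          = 8   // bytes in the ICMPv4 error header
	minimumProcessableDatagram = 576 // ceiling for the whole ICMP datagram
	minimumErrorPayload        = 8   // original header plus at least 8 payload bytes
)

// quotedLen returns how many bytes of the offending datagram (header included)
// the error should quote, or 0 if even the minimum does not fit.
func quotedLen(routeMTU, origHeaderLen, origDataLen int) int {
	mtu := routeMTU
	if limit := minimumProcessableDatagram - ipv4MinimumSize; mtu > limit {
		mtu = limit
	}
	available := mtu - icmpv4MinimumSize
	if available < origHeaderLen+minimumErrorPayload {
		return 0
	}
	if want := origHeaderLen + origDataLen; want < available {
		return want
	}
	return available
}

func main() {
	fmt.Println(quotedLen(1500, 20, 1400)) // 548: clamped by the 576-byte rule
	fmt.Println(quotedLen(1500, 20, 100))  // 120: the whole datagram fits
}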
// // NOTE: The above RFC referenced is different from the original // recommendation in RFC 1122 and RFC 792 where it mentioned that at // least 8 bytes of the payload must be included. Today linux and other // systems implement the RFC 1812 definition and not the original // requirement. We treat 8 bytes as the minimum but will try send more. mtu := int(route.MTU()) const maxIPData = header.IPv4MinimumProcessableDatagramSize - header.IPv4MinimumSize if mtu > maxIPData { mtu = maxIPData } available := mtu - header.ICMPv4MinimumSize if available < len(origIPHdr)+header.ICMPv4MinimumErrorPayloadSize { return nil } payloadLen := len(origIPHdr) + len(transportHeader) + pkt.Data().Size() if payloadLen > available { payloadLen = available } // The buffers used by pkt may be used elsewhere in the system. // For example, an AF_RAW or AF_PACKET socket may use what the transport // protocol considers an unreachable destination. Thus we deep copy pkt to // prevent multiple ownership and SR errors. The new copy is a vectorized // view with the entire incoming IP packet reassembled and truncated as // required. This is now the payload of the new ICMP packet and no longer // considered a packet in its own right. payload := buffer.MakeWithView(pkt.NetworkHeader().View()) payload.Append(pkt.TransportHeader().View()) if dataCap := payloadLen - int(payload.Size()); dataCap > 0 { buf := pkt.Data().ToBuffer() buf.Truncate(int64(dataCap)) payload.Merge(&buf) } else { payload.Truncate(int64(payloadLen)) } icmpPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(route.MaxHeaderLength()) + header.ICMPv4MinimumSize, Payload: payload, }) defer icmpPkt.DecRef() icmpPkt.TransportProtocolNumber = header.ICMPv4ProtocolNumber icmpHdr := header.ICMPv4(icmpPkt.TransportHeader().Push(header.ICMPv4MinimumSize)) icmpHdr.SetCode(icmpCode) icmpHdr.SetType(icmpType) icmpHdr.SetPointer(pointer) icmpHdr.SetChecksum(header.ICMPv4Checksum(icmpHdr, icmpPkt.Data().Checksum())) if err := route.WritePacket( stack.NetworkHeaderParams{ Protocol: header.ICMPv4ProtocolNumber, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS, }, icmpPkt, ); err != nil { sent.dropped.Increment() return err } counter.Increment() return nil } // OnReassemblyTimeout implements fragmentation.TimeoutHandler. func (p *protocol) OnReassemblyTimeout(pkt *stack.PacketBuffer) { // OnReassemblyTimeout sends a Time Exceeded Message, as per RFC 792: // // If a host reassembling a fragmented datagram cannot complete the // reassembly due to missing fragments within its time limit it discards the // datagram, and it may send a time exceeded message. // // If fragment zero is not available then no time exceeded need be sent at // all. if pkt != nil { p.returnError(&icmpReasonReassemblyTimeout{}, pkt, true /* deliveredLocally */) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv4/igmp.go000066400000000000000000000514421465435605700244070ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package ipv4 import ( "fmt" "math" "time" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/network/internal/ip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // v1RouterPresentTimeout from RFC 2236 Section 8.11, Page 18 // See note on igmpState.igmpV1Present for more detail. v1RouterPresentTimeout = 400 * time.Second // v1MaxRespTime from RFC 2236 Section 4, Page 5. "The IGMPv1 router // will send General Queries with the Max Response Time set to 0. This MUST // be interpreted as a value of 100 (10 seconds)." // // Note that the Max Response Time field is a value in units of deciseconds. v1MaxRespTime = 10 * time.Second // UnsolicitedReportIntervalMax is the maximum delay between sending // unsolicited IGMP reports. // // Obtained from RFC 2236 Section 8.10, Page 19. UnsolicitedReportIntervalMax = 10 * time.Second ) type protocolMode int const ( protocolModeV2OrV3 protocolMode = iota protocolModeV1 // protocolModeV1Compatibility is for maintaining compatibility with IGMPv1 // Routers. // // Per RFC 2236 Section 4 Page 6: "The IGMPv1 router expects Version 1 // Membership Reports in response to its Queries, and will not pay // attention to Version 2 Membership Reports. Therefore, a state variable // MUST be kept for each interface, describing whether the multicast // Querier on that interface is running IGMPv1 or IGMPv2. This variable // MUST be based upon whether or not an IGMPv1 query was heard in the last // [Version 1 Router Present Timeout] seconds". protocolModeV1Compatibility ) // IGMPVersion is the forced version of IGMP. type IGMPVersion int const ( _ IGMPVersion = iota // IGMPVersion1 indicates IGMPv1. IGMPVersion1 // IGMPVersion2 indicates IGMPv2. Note that IGMP may still fallback to V1 // compatibility mode as required by IGMPv2. IGMPVersion2 // IGMPVersion3 indicates IGMPv3. Note that IGMP may still fallback to V2 // compatibility mode as required by IGMPv3. IGMPVersion3 ) // IGMPEndpoint is a network endpoint that supports IGMP. type IGMPEndpoint interface { // SetIGMPVersion sets the IGMP version. // // Returns the previous IGMP version. SetIGMPVersion(IGMPVersion) IGMPVersion // GetIGMPVersion returns the IGMP version. GetIGMPVersion() IGMPVersion } // IGMPOptions holds options for IGMP. // // +stateify savable type IGMPOptions struct { // Enabled indicates whether IGMP will be performed. // // When enabled, IGMP may transmit IGMP report and leave messages when // joining and leaving multicast groups respectively, and handle incoming // IGMP packets. // // This field is ignored and is always assumed to be false for interfaces // without neighbouring nodes (e.g. loopback). Enabled bool } var _ ip.MulticastGroupProtocol = (*igmpState)(nil) // igmpState is the per-interface IGMP state. // // igmpState.init() MUST be called after creating an IGMP state. // // +stateify savable type igmpState struct { // The IPv4 endpoint this igmpState is for. ep *endpoint genericMulticastProtocol ip.GenericMulticastProtocolState // mode is used to configure the version of IGMP to perform. mode protocolMode // igmpV1Job is scheduled when this interface receives an IGMPv1 style // message, upon expiration the igmpV1Present flag is cleared. // igmpV1Job may not be nil once igmpState is initialized. igmpV1Job *tcpip.Job } // Enabled implements ip.MulticastGroupProtocol. 
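// The IGMPv1 compatibility fallback described above is timer-driven: every
// IGMPv1-style query re-arms a one-shot countdown, and its expiry drops the
// interface back to v2/v3 behaviour. gVisor uses a tcpip.Job bound to the
// endpoint lock for this; the stdlib sketch below only mirrors the timing and
// uses illustrative names.
package main

import (
	"sync"
	"time"
)

const v1RouterPresentTimeout = 400 * time.Second // RFC 2236 section 8.11

type v1Compat struct {
	mu      sync.Mutex
	present bool
	timer   *time.Timer
}

// sawV1Query records that an IGMPv1 querier is present and (re)starts the
// 400 second countdown back to normal operation.
func (c *v1Compat) sawV1Query() {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.present = true
	if c.timer == nil {
		c.timer = time.AfterFunc(v1RouterPresentTimeout, func() {
			c.mu.Lock()
			defer c.mu.Unlock()
			c.present = false
		})
		return
	}
	c.timer.Reset(v1RouterPresentTimeout)
}

func main() {
	var c v1Compat
	c.sawV1Query()
}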
func (igmp *igmpState) Enabled() bool { // No need to perform IGMP on loopback interfaces since they don't have // neighbouring nodes. return igmp.ep.protocol.options.IGMP.Enabled && !igmp.ep.nic.IsLoopback() && igmp.ep.Enabled() } // SendReport implements ip.MulticastGroupProtocol. // // +checklocksread:igmp.ep.mu func (igmp *igmpState) SendReport(groupAddress tcpip.Address) (bool, tcpip.Error) { igmpType := header.IGMPv2MembershipReport switch igmp.mode { case protocolModeV2OrV3: case protocolModeV1, protocolModeV1Compatibility: igmpType = header.IGMPv1MembershipReport default: panic(fmt.Sprintf("unrecognized mode = %d", igmp.mode)) } return igmp.writePacket(groupAddress, groupAddress, igmpType) } // SendLeave implements ip.MulticastGroupProtocol. // // +checklocksread:igmp.ep.mu func (igmp *igmpState) SendLeave(groupAddress tcpip.Address) tcpip.Error { // As per RFC 2236 Section 6, Page 8: "If the interface state says the // Querier is running IGMPv1, this action SHOULD be skipped. If the flag // saying we were the last host to report is cleared, this action MAY be // skipped." switch igmp.mode { case protocolModeV2OrV3: _, err := igmp.writePacket(header.IPv4AllRoutersGroup, groupAddress, header.IGMPLeaveGroup) return err case protocolModeV1, protocolModeV1Compatibility: return nil default: panic(fmt.Sprintf("unrecognized mode = %d", igmp.mode)) } } // ShouldPerformProtocol implements ip.MulticastGroupProtocol. func (igmp *igmpState) ShouldPerformProtocol(groupAddress tcpip.Address) bool { // As per RFC 2236 section 6 page 10, // // The all-systems group (address 224.0.0.1) is handled as a special // case. The host starts in Idle Member state for that group on every // interface, never transitions to another state, and never sends a // report for that group. return groupAddress != header.IPv4AllSystems } type igmpv3ReportBuilder struct { igmp *igmpState records []header.IGMPv3ReportGroupAddressRecordSerializer } // AddRecord implements ip.MulticastGroupProtocolV2ReportBuilder. func (b *igmpv3ReportBuilder) AddRecord(genericRecordType ip.MulticastGroupProtocolV2ReportRecordType, groupAddress tcpip.Address) { var recordType header.IGMPv3ReportRecordType switch genericRecordType { case ip.MulticastGroupProtocolV2ReportRecordModeIsInclude: recordType = header.IGMPv3ReportRecordModeIsInclude case ip.MulticastGroupProtocolV2ReportRecordModeIsExclude: recordType = header.IGMPv3ReportRecordModeIsExclude case ip.MulticastGroupProtocolV2ReportRecordChangeToIncludeMode: recordType = header.IGMPv3ReportRecordChangeToIncludeMode case ip.MulticastGroupProtocolV2ReportRecordChangeToExcludeMode: recordType = header.IGMPv3ReportRecordChangeToExcludeMode case ip.MulticastGroupProtocolV2ReportRecordAllowNewSources: recordType = header.IGMPv3ReportRecordAllowNewSources case ip.MulticastGroupProtocolV2ReportRecordBlockOldSources: recordType = header.IGMPv3ReportRecordBlockOldSources default: panic(fmt.Sprintf("unrecognied genericRecordType = %d", genericRecordType)) } b.records = append(b.records, header.IGMPv3ReportGroupAddressRecordSerializer{ RecordType: recordType, GroupAddress: groupAddress, Sources: nil, }) } // Send implements ip.MulticastGroupProtocolV2ReportBuilder. 
// // +checklocksread:b.igmp.ep.mu func (b *igmpv3ReportBuilder) Send() (sent bool, err tcpip.Error) { if len(b.records) == 0 { return false, err } options := header.IPv4OptionsSerializer{ &header.IPv4SerializableRouterAlertOption{}, } mtu := int(b.igmp.ep.MTU()) - int(options.Length()) allSentWithSpecifiedAddress := true var firstErr tcpip.Error for records := b.records; len(records) != 0; { spaceLeft := mtu maxRecords := 0 for ; maxRecords < len(records); maxRecords++ { tmp := spaceLeft - records[maxRecords].Length() if tmp > 0 { spaceLeft = tmp } else { break } } serializer := header.IGMPv3ReportSerializer{Records: records[:maxRecords]} records = records[maxRecords:] icmpView := buffer.NewViewSize(serializer.Length()) serializer.SerializeInto(icmpView.AsSlice()) if sentWithSpecifiedAddress, err := b.igmp.writePacketInner( icmpView, b.igmp.ep.stats.igmp.packetsSent.v3MembershipReport, options, header.IGMPv3RoutersAddress, ); err != nil { if firstErr != nil { firstErr = nil } allSentWithSpecifiedAddress = false } else if !sentWithSpecifiedAddress { allSentWithSpecifiedAddress = false } } return allSentWithSpecifiedAddress, firstErr } // NewReportV2Builder implements ip.MulticastGroupProtocol. func (igmp *igmpState) NewReportV2Builder() ip.MulticastGroupProtocolV2ReportBuilder { return &igmpv3ReportBuilder{igmp: igmp} } // V2QueryMaxRespCodeToV2Delay implements ip.MulticastGroupProtocol. func (*igmpState) V2QueryMaxRespCodeToV2Delay(code uint16) time.Duration { if code > math.MaxUint8 { panic(fmt.Sprintf("got IGMPv3 MaxRespCode = %d, want <= %d", code, math.MaxUint8)) } return header.IGMPv3MaximumResponseDelay(uint8(code)) } // V2QueryMaxRespCodeToV1Delay implements ip.MulticastGroupProtocol. func (*igmpState) V2QueryMaxRespCodeToV1Delay(code uint16) time.Duration { return time.Duration(code) * time.Millisecond } // init sets up an igmpState struct, and is required to be called before using // a new igmpState. // // Must only be called once for the lifetime of igmp. func (igmp *igmpState) init(ep *endpoint) { igmp.ep = ep igmp.genericMulticastProtocol.Init(&ep.mu, ip.GenericMulticastProtocolOptions{ Rand: ep.protocol.stack.InsecureRNG(), Clock: ep.protocol.stack.Clock(), Protocol: igmp, MaxUnsolicitedReportDelay: UnsolicitedReportIntervalMax, }) // As per RFC 2236 Page 9 says "No IGMPv1 Router Present ... is // the initial state. igmp.mode = protocolModeV2OrV3 igmp.igmpV1Job = tcpip.NewJob(ep.protocol.stack.Clock(), &ep.mu, func() { igmp.mode = protocolModeV2OrV3 }) } // +checklocks:igmp.ep.mu func (igmp *igmpState) isSourceIPValidLocked(src tcpip.Address, messageType header.IGMPType) bool { if messageType == header.IGMPMembershipQuery { // RFC 2236 does not require the IGMP implementation to check the source IP // for Membership Query messages. return true } // As per RFC 2236 section 10, // // Ignore the Report if you cannot identify the source address of the // packet as belonging to a subnet assigned to the interface on which the // packet was received. // // Ignore the Leave message if you cannot identify the source address of // the packet as belonging to a subnet assigned to the interface on which // the packet was received. // // Note: this rule applies to both V1 and V2 Membership Reports. 
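// Send above splits the queued group records greedily across as many IGMPv3
// reports as the MTU budget requires. A generic, stdlib-only sketch of that
// split; unlike the loop above it forces progress if a single record ever
// exceeds the budget, which the real builder never expects to happen.
package main

import "fmt"

// packBySize groups record lengths into chunks whose total stays under budget.
func packBySize(lengths []int, budget int) [][]int {
	var chunks [][]int
	for len(lengths) != 0 {
		left := budget
		n := 0
		for ; n < len(lengths); n++ {
			rest := left - lengths[n]
			if rest <= 0 {
				break
			}
			left = rest
		}
		if n == 0 {
			n = 1 // oversized record: emit it alone rather than loop forever
		}
		chunks = append(chunks, lengths[:n])
		lengths = lengths[n:]
	}
	return chunks
}

func main() {
	fmt.Println(packBySize([]int{100, 200, 300, 400}, 500)) // [[100 200] [300] [400]]
}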
var isSourceIPValid bool igmp.ep.addressableEndpointState.ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { if subnet := addressEndpoint.Subnet(); subnet.Contains(src) { isSourceIPValid = true return false } return true }) return isSourceIPValid } // +checklocks:igmp.ep.mu func (igmp *igmpState) isPacketValidLocked(pkt *stack.PacketBuffer, messageType header.IGMPType, hasRouterAlertOption bool) bool { // We can safely assume that the IP header is valid if we got this far. iph := header.IPv4(pkt.NetworkHeader().Slice()) // As per RFC 2236 section 2, // // All IGMP messages described in this document are sent with IP TTL 1, and // contain the IP Router Alert option [RFC 2113] in their IP header. if !hasRouterAlertOption || iph.TTL() != header.IGMPTTL { return false } return igmp.isSourceIPValidLocked(iph.SourceAddress(), messageType) } // handleIGMP handles an IGMP packet. // // +checklocks:igmp.ep.mu func (igmp *igmpState) handleIGMP(pkt *stack.PacketBuffer, hasRouterAlertOption bool) { received := igmp.ep.stats.igmp.packetsReceived hdr, ok := pkt.Data().PullUp(pkt.Data().Size()) if !ok { received.invalid.Increment() return } h := header.IGMP(hdr) if len(h) < header.IGMPMinimumSize { received.invalid.Increment() return } // As per RFC 1071 section 1.3, // // To check a checksum, the 1's complement sum is computed over the // same set of octets, including the checksum field. If the result // is all 1 bits (-0 in 1's complement arithmetic), the check // succeeds. if pkt.Data().Checksum() != 0xFFFF { received.checksumErrors.Increment() return } isValid := func(minimumSize int) bool { return len(hdr) >= minimumSize && igmp.isPacketValidLocked(pkt, h.Type(), hasRouterAlertOption) } switch h.Type() { case header.IGMPMembershipQuery: received.membershipQuery.Increment() if len(h) >= header.IGMPv3QueryMinimumSize { if isValid(header.IGMPv3QueryMinimumSize) { igmp.handleMembershipQueryV3(header.IGMPv3Query(h)) } else { received.invalid.Increment() } return } else if !isValid(header.IGMPQueryMinimumSize) { received.invalid.Increment() return } igmp.handleMembershipQuery(h.GroupAddress(), h.MaxRespTime()) case header.IGMPv1MembershipReport: received.v1MembershipReport.Increment() if !isValid(header.IGMPReportMinimumSize) { received.invalid.Increment() return } igmp.handleMembershipReport(h.GroupAddress()) case header.IGMPv2MembershipReport: received.v2MembershipReport.Increment() if !isValid(header.IGMPReportMinimumSize) { received.invalid.Increment() return } igmp.handleMembershipReport(h.GroupAddress()) case header.IGMPLeaveGroup: received.leaveGroup.Increment() if !isValid(header.IGMPLeaveMessageMinimumSize) { received.invalid.Increment() return } // As per RFC 2236 Section 6, Page 7: "IGMP messages other than Query or // Report, are ignored in all states" default: // As per RFC 2236 Section 2.1 Page 3: "Unrecognized message types should // be silently ignored. New message types may be used by newer versions of // IGMP, by multicast routing protocols, or other uses." received.unrecognized.Increment() } } func (igmp *igmpState) resetV1Present() { igmp.igmpV1Job.Cancel() switch igmp.mode { case protocolModeV2OrV3, protocolModeV1: case protocolModeV1Compatibility: igmp.mode = protocolModeV2OrV3 default: panic(fmt.Sprintf("unrecognized mode = %d", igmp.mode)) } } // handleMembershipQuery handles a membership query. 
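// isSourceIPValidLocked above accepts Reports and Leaves only when the source
// lies in a subnet assigned to the receiving interface (RFC 2236 section 10).
// The same check expressed with the standard library's net/netip, with
// illustrative names; the stack itself iterates its own address endpoints
// instead of a prefix slice.
package main

import (
	"fmt"
	"net/netip"
)

func sourceInAssignedSubnet(src netip.Addr, assigned []netip.Prefix) bool {
	for _, prefix := range assigned {
		if prefix.Contains(src) {
			return true
		}
	}
	return false
}

func main() {
	assigned := []netip.Prefix{netip.MustParsePrefix("192.168.1.0/24")}
	fmt.Println(sourceInAssignedSubnet(netip.MustParseAddr("192.168.1.42"), assigned)) // true
	fmt.Println(sourceInAssignedSubnet(netip.MustParseAddr("10.0.0.1"), assigned))     // false
}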
// // +checklocks:igmp.ep.mu func (igmp *igmpState) handleMembershipQuery(groupAddress tcpip.Address, maxRespTime time.Duration) { // As per RFC 2236 Section 6, Page 10: If the maximum response time is zero // then change the state to note that an IGMPv1 router is present and // schedule the query received Job. if maxRespTime == 0 && igmp.Enabled() { switch igmp.mode { case protocolModeV2OrV3, protocolModeV1Compatibility: igmp.igmpV1Job.Cancel() igmp.igmpV1Job.Schedule(v1RouterPresentTimeout) igmp.mode = protocolModeV1Compatibility case protocolModeV1: default: panic(fmt.Sprintf("unrecognized mode = %d", igmp.mode)) } maxRespTime = v1MaxRespTime } igmp.genericMulticastProtocol.HandleQueryLocked(groupAddress, maxRespTime) } // handleMembershipQueryV3 handles a membership query. // // +checklocks:igmp.ep.mu func (igmp *igmpState) handleMembershipQueryV3(igmpHdr header.IGMPv3Query) { sources, ok := igmpHdr.Sources() if !ok { return } igmp.genericMulticastProtocol.HandleQueryV2Locked( igmpHdr.GroupAddress(), uint16(igmpHdr.MaximumResponseCode()), sources, igmpHdr.QuerierRobustnessVariable(), igmpHdr.QuerierQueryInterval(), ) } // handleMembershipReport handles a membership report. // // +checklocks:igmp.ep.mu func (igmp *igmpState) handleMembershipReport(groupAddress tcpip.Address) { igmp.genericMulticastProtocol.HandleReportLocked(groupAddress) } // writePacket assembles and sends an IGMP packet. // // +checklocksread:igmp.ep.mu func (igmp *igmpState) writePacket(destAddress tcpip.Address, groupAddress tcpip.Address, igmpType header.IGMPType) (bool, tcpip.Error) { igmpView := buffer.NewViewSize(header.IGMPReportMinimumSize) igmpData := header.IGMP(igmpView.AsSlice()) igmpData.SetType(igmpType) igmpData.SetGroupAddress(groupAddress) igmpData.SetChecksum(header.IGMPCalculateChecksum(igmpData)) var reportType tcpip.MultiCounterStat sentStats := igmp.ep.stats.igmp.packetsSent switch igmpType { case header.IGMPv1MembershipReport: reportType = sentStats.v1MembershipReport case header.IGMPv2MembershipReport: reportType = sentStats.v2MembershipReport case header.IGMPLeaveGroup: reportType = sentStats.leaveGroup default: panic(fmt.Sprintf("unrecognized igmp type = %d", igmpType)) } return igmp.writePacketInner( igmpView, reportType, header.IPv4OptionsSerializer{ &header.IPv4SerializableRouterAlertOption{}, }, destAddress, ) } // +checklocksread:igmp.ep.mu func (igmp *igmpState) writePacketInner(buf *buffer.View, reportStat tcpip.MultiCounterStat, options header.IPv4OptionsSerializer, destAddress tcpip.Address) (bool, tcpip.Error) { pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(igmp.ep.MaxHeaderLength()), Payload: buffer.MakeWithView(buf), }) defer pkt.DecRef() addressEndpoint := igmp.ep.acquireOutgoingPrimaryAddressRLocked(destAddress, tcpip.Address{} /* srcHint */, false /* allowExpired */) if addressEndpoint == nil { return false, nil } localAddr := addressEndpoint.AddressWithPrefix().Address addressEndpoint.DecRef() addressEndpoint = nil if err := igmp.ep.addIPHeader(localAddr, destAddress, pkt, stack.NetworkHeaderParams{ Protocol: header.IGMPProtocolNumber, TTL: header.IGMPTTL, TOS: stack.DefaultTOS, }, options); err != nil { panic(fmt.Sprintf("failed to add IP header: %s", err)) } sentStats := igmp.ep.stats.igmp.packetsSent if err := igmp.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv4Address(destAddress), pkt); err != nil { sentStats.dropped.Increment() return false, err } reportStat.Increment() return true, nil } // joinGroup handles adding a 
new group to the membership map, setting up the // IGMP state for the group, and sending and scheduling the required // messages. // // If the group already exists in the membership map, returns // *tcpip.ErrDuplicateAddress. // // +checklocks:igmp.ep.mu func (igmp *igmpState) joinGroup(groupAddress tcpip.Address) { igmp.genericMulticastProtocol.JoinGroupLocked(groupAddress) } // isInGroup returns true if the specified group has been joined locally. // // +checklocksread:igmp.ep.mu func (igmp *igmpState) isInGroup(groupAddress tcpip.Address) bool { return igmp.genericMulticastProtocol.IsLocallyJoinedRLocked(groupAddress) } // leaveGroup handles removing the group from the membership map, cancels any // delay timers associated with that group, and sends the Leave Group message // if required. // // +checklocks:igmp.ep.mu func (igmp *igmpState) leaveGroup(groupAddress tcpip.Address) tcpip.Error { // LeaveGroup returns false only if the group was not joined. if igmp.genericMulticastProtocol.LeaveGroupLocked(groupAddress) { return nil } return &tcpip.ErrBadLocalAddress{} } // softLeaveAll leaves all groups from the perspective of IGMP, but remains // joined locally. // // +checklocks:igmp.ep.mu func (igmp *igmpState) softLeaveAll() { igmp.genericMulticastProtocol.MakeAllNonMemberLocked() } // initializeAll attempts to initialize the IGMP state for each group that has // been joined locally. // // +checklocks:igmp.ep.mu func (igmp *igmpState) initializeAll() { igmp.genericMulticastProtocol.InitializeGroupsLocked() } // sendQueuedReports attempts to send any reports that are queued for sending. // // +checklocks:igmp.ep.mu func (igmp *igmpState) sendQueuedReports() { igmp.genericMulticastProtocol.SendQueuedReportsLocked() } // setVersion sets the IGMP version. // // +checklocks:igmp.ep.mu func (igmp *igmpState) setVersion(v IGMPVersion) IGMPVersion { prev := igmp.mode igmp.igmpV1Job.Cancel() var prevGenericModeV1 bool switch v { case IGMPVersion3: prevGenericModeV1 = igmp.genericMulticastProtocol.SetV1ModeLocked(false) igmp.mode = protocolModeV2OrV3 case IGMPVersion2: // IGMPv1 and IGMPv2 map to V1 of the generic multicast protocol. prevGenericModeV1 = igmp.genericMulticastProtocol.SetV1ModeLocked(true) igmp.mode = protocolModeV2OrV3 case IGMPVersion1: // IGMPv1 and IGMPv2 map to V1 of the generic multicast protocol. prevGenericModeV1 = igmp.genericMulticastProtocol.SetV1ModeLocked(true) igmp.mode = protocolModeV1 default: panic(fmt.Sprintf("unrecognized version = %d", v)) } return toIGMPVersion(prev, prevGenericModeV1) } func toIGMPVersion(mode protocolMode, genericV1 bool) IGMPVersion { switch mode { case protocolModeV2OrV3, protocolModeV1Compatibility: if genericV1 { return IGMPVersion2 } return IGMPVersion3 case protocolModeV1: return IGMPVersion1 default: panic(fmt.Sprintf("unrecognized mode = %d", mode)) } } // getVersion returns the IGMP version. // // +checklocksread:igmp.ep.mu func (igmp *igmpState) getVersion() IGMPVersion { return toIGMPVersion(igmp.mode, igmp.genericMulticastProtocol.GetV1ModeLocked()) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv4/ipv4.go000066400000000000000000002427241465435605700243420ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package ipv4 contains the implementation of the ipv4 network protocol. package ipv4 import ( "fmt" "math" "reflect" "time" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/network/hash" "gvisor.dev/gvisor/pkg/tcpip/network/internal/fragmentation" "gvisor.dev/gvisor/pkg/tcpip/network/internal/ip" "gvisor.dev/gvisor/pkg/tcpip/network/internal/multicast" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // ReassembleTimeout is the time a packet stays in the reassembly // system before being evicted. // As per RFC 791 section 3.2: // The current recommendation for the initial timer setting is 15 seconds. // This may be changed as experience with this protocol accumulates. // // Considering that it is an old recommendation, we use the same reassembly // timeout that linux defines, which is 30 seconds: // https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ip.h#L138 ReassembleTimeout = 30 * time.Second // ProtocolNumber is the ipv4 protocol number. ProtocolNumber = header.IPv4ProtocolNumber // MaxTotalSize is maximum size that can be encoded in the 16-bit // TotalLength field of the ipv4 header. MaxTotalSize = 0xffff // DefaultTTL is the default time-to-live value for this endpoint. DefaultTTL = 64 // buckets is the number of identifier buckets. buckets = 2048 // The size of a fragment block, in bytes, as per RFC 791 section 3.1, // page 14. fragmentblockSize = 8 ) const ( forwardingDisabled = 0 forwardingEnabled = 1 ) var ipv4BroadcastAddr = header.IPv4Broadcast.WithPrefix() var _ stack.LinkResolvableNetworkEndpoint = (*endpoint)(nil) var _ stack.ForwardingNetworkEndpoint = (*endpoint)(nil) var _ stack.MulticastForwardingNetworkEndpoint = (*endpoint)(nil) var _ stack.GroupAddressableEndpoint = (*endpoint)(nil) var _ stack.AddressableEndpoint = (*endpoint)(nil) var _ stack.NetworkEndpoint = (*endpoint)(nil) var _ IGMPEndpoint = (*endpoint)(nil) // +stateify savable type endpoint struct { nic stack.NetworkInterface dispatcher stack.TransportDispatcher protocol *protocol stats sharedStats // enabled is set to 1 when the endpoint is enabled and 0 when it is // disabled. enabled atomicbitops.Uint32 // forwarding is set to forwardingEnabled when the endpoint has forwarding // enabled and forwardingDisabled when it is disabled. forwarding atomicbitops.Uint32 // multicastForwarding is set to forwardingEnabled when the endpoint has // forwarding enabled and forwardingDisabled when it is disabled. multicastForwarding atomicbitops.Uint32 // mu protects below. mu sync.RWMutex `state:"nosave"` // +checklocks:mu addressableEndpointState stack.AddressableEndpointState // +checklocks:mu igmp igmpState } // SetIGMPVersion implements IGMPEndpoint. func (e *endpoint) SetIGMPVersion(v IGMPVersion) IGMPVersion { e.mu.Lock() defer e.mu.Unlock() return e.setIGMPVersionLocked(v) } // GetIGMPVersion implements IGMPEndpoint. 
func (e *endpoint) GetIGMPVersion() IGMPVersion { e.mu.RLock() defer e.mu.RUnlock() return e.getIGMPVersionLocked() } // +checklocks:e.mu // +checklocksalias:e.igmp.ep.mu=e.mu func (e *endpoint) setIGMPVersionLocked(v IGMPVersion) IGMPVersion { return e.igmp.setVersion(v) } // +checklocksread:e.mu // +checklocksalias:e.igmp.ep.mu=e.mu func (e *endpoint) getIGMPVersionLocked() IGMPVersion { return e.igmp.getVersion() } // HandleLinkResolutionFailure implements stack.LinkResolvableNetworkEndpoint. func (e *endpoint) HandleLinkResolutionFailure(pkt *stack.PacketBuffer) { // If we are operating as a router, return an ICMP error to the original // packet's sender. if pkt.NetworkPacketInfo.IsForwardedPacket { // TODO(gvisor.dev/issue/6005): Propagate asynchronously generated ICMP // errors to local endpoints. e.protocol.returnError(&icmpReasonHostUnreachable{}, pkt, false /* deliveredLocally */) e.stats.ip.Forwarding.Errors.Increment() e.stats.ip.Forwarding.HostUnreachable.Increment() return } // handleControl expects the entire offending packet to be in the packet // buffer's data field. pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: pkt.ToBuffer(), }) defer pkt.DecRef() pkt.NICID = e.nic.ID() pkt.NetworkProtocolNumber = ProtocolNumber // Use the same control type as an ICMPv4 destination host unreachable error // since the host is considered unreachable if we cannot resolve the link // address to the next hop. e.handleControl(&icmpv4DestinationHostUnreachableSockError{}, pkt) } // NewEndpoint creates a new ipv4 endpoint. func (p *protocol) NewEndpoint(nic stack.NetworkInterface, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint { e := &endpoint{ nic: nic, dispatcher: dispatcher, protocol: p, } e.mu.Lock() e.addressableEndpointState.Init(e, stack.AddressableEndpointStateOptions{HiddenWhileDisabled: false}) e.igmp.init(e) e.mu.Unlock() tcpip.InitStatCounters(reflect.ValueOf(&e.stats.localStats).Elem()) stackStats := p.stack.Stats() e.stats.ip.Init(&e.stats.localStats.IP, &stackStats.IP) e.stats.icmp.init(&e.stats.localStats.ICMP, &stackStats.ICMP.V4) e.stats.igmp.init(&e.stats.localStats.IGMP, &stackStats.IGMP) p.mu.Lock() p.eps[nic.ID()] = e p.mu.Unlock() return e } func (p *protocol) findEndpointWithAddress(addr tcpip.Address) *endpoint { p.mu.RLock() defer p.mu.RUnlock() for _, e := range p.eps { if addressEndpoint := e.AcquireAssignedAddress(addr, false /* allowTemp */, stack.NeverPrimaryEndpoint, true /* readOnly */); addressEndpoint != nil { return e } } return nil } func (p *protocol) getEndpointForNIC(id tcpip.NICID) (*endpoint, bool) { p.mu.RLock() defer p.mu.RUnlock() ep, ok := p.eps[id] return ep, ok } func (p *protocol) forgetEndpoint(nicID tcpip.NICID) { p.mu.Lock() defer p.mu.Unlock() delete(p.eps, nicID) } // Forwarding implements stack.ForwardingNetworkEndpoint. func (e *endpoint) Forwarding() bool { return e.forwarding.Load() == forwardingEnabled } // setForwarding sets the forwarding status for the endpoint. // // Returns the previous forwarding status. func (e *endpoint) setForwarding(v bool) bool { forwarding := uint32(forwardingDisabled) if v { forwarding = forwardingEnabled } return e.forwarding.Swap(forwarding) != forwardingDisabled } // SetForwarding implements stack.ForwardingNetworkEndpoint. 
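// setForwarding above stores the forwarding state in a single atomic word and
// reports the previous value via Swap; SetForwarding below still takes the
// endpoint mutex, but only for the group join/leave side effects. The same
// swap-and-compare idiom with the standard library's sync/atomic, using
// illustrative names:
package main

import (
	"fmt"
	"sync/atomic"
)

type boolFlag struct{ v atomic.Uint32 }

// set stores the new state and reports whether it was previously enabled.
func (f *boolFlag) set(enabled bool) (wasEnabled bool) {
	var word uint32
	if enabled {
		word = 1
	}
	return f.v.Swap(word) == 1
}

func (f *boolFlag) get() bool { return f.v.Load() == 1 }

func main() {
	var f boolFlag
	fmt.Println(f.set(true), f.get(), f.set(false)) // false true true
}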
func (e *endpoint) SetForwarding(forwarding bool) bool { e.mu.Lock() defer e.mu.Unlock() prevForwarding := e.setForwarding(forwarding) if prevForwarding == forwarding { return prevForwarding } if forwarding { // There does not seem to be an RFC requirement for a node to join the all // routers multicast address but // https://www.iana.org/assignments/multicast-addresses/multicast-addresses.xhtml // specifies the address as a group for all routers on a subnet so we join // the group here. if err := e.joinGroupLocked(header.IPv4AllRoutersGroup); err != nil { // joinGroupLocked only returns an error if the group address is not a // valid IPv4 multicast address. panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllRoutersGroup, err)) } return prevForwarding } switch err := e.leaveGroupLocked(header.IPv4AllRoutersGroup).(type) { case nil: case *tcpip.ErrBadLocalAddress: // The endpoint may have already left the multicast group. default: panic(fmt.Sprintf("e.leaveGroupLocked(%s): %s", header.IPv4AllRoutersGroup, err)) } return prevForwarding } // MulticastForwarding implements stack.MulticastForwardingNetworkEndpoint. func (e *endpoint) MulticastForwarding() bool { return e.multicastForwarding.Load() == forwardingEnabled } // SetMulticastForwarding implements stack.MulticastForwardingNetworkEndpoint. func (e *endpoint) SetMulticastForwarding(forwarding bool) bool { updatedForwarding := uint32(forwardingDisabled) if forwarding { updatedForwarding = forwardingEnabled } return e.multicastForwarding.Swap(updatedForwarding) != forwardingDisabled } // Enable implements stack.NetworkEndpoint. func (e *endpoint) Enable() tcpip.Error { e.mu.Lock() defer e.mu.Unlock() return e.enableLocked() } // +checklocks:e.mu // +checklocksalias:e.igmp.ep.mu=e.mu func (e *endpoint) enableLocked() tcpip.Error { // If the NIC is not enabled, the endpoint can't do anything meaningful so // don't enable the endpoint. if !e.nic.Enabled() { return &tcpip.ErrNotPermitted{} } // If the endpoint is already enabled, there is nothing for it to do. if !e.setEnabled(true) { return nil } // Must be called after Enabled has already been set. e.addressableEndpointState.OnNetworkEndpointEnabledChanged() // Create an endpoint to receive broadcast packets on this interface. ep, err := e.addressableEndpointState.AddAndAcquirePermanentAddress(ipv4BroadcastAddr, stack.AddressProperties{PEB: stack.NeverPrimaryEndpoint}) if err != nil { return err } // We have no need for the address endpoint. ep.DecRef() // Groups may have been joined while the endpoint was disabled, or the // endpoint may have left groups from the perspective of IGMP when the // endpoint was disabled. Either way, we need to let routers know to // send us multicast traffic. e.igmp.initializeAll() // As per RFC 1122 section 3.3.7, all hosts should join the all-hosts // multicast group. Note, the IANA calls the all-hosts multicast group the // all-systems multicast group. if err := e.joinGroupLocked(header.IPv4AllSystems); err != nil { // joinGroupLocked only returns an error if the group address is not a valid // IPv4 multicast address. panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllSystems, err)) } return nil } // Enabled implements stack.NetworkEndpoint. func (e *endpoint) Enabled() bool { return e.nic.Enabled() && e.isEnabled() } // isEnabled returns true if the endpoint is enabled, regardless of the // enabled status of the NIC. 
func (e *endpoint) isEnabled() bool { return e.enabled.Load() == 1 } // setEnabled sets the enabled status for the endpoint. // // Returns true if the enabled status was updated. func (e *endpoint) setEnabled(v bool) bool { if v { return e.enabled.Swap(1) == 0 } return e.enabled.Swap(0) == 1 } // Disable implements stack.NetworkEndpoint. func (e *endpoint) Disable() { e.mu.Lock() defer e.mu.Unlock() e.disableLocked() } // +checklocks:e.mu // +checklocksalias:e.igmp.ep.mu=e.mu func (e *endpoint) disableLocked() { if !e.isEnabled() { return } // The endpoint may have already left the multicast group. switch err := e.leaveGroupLocked(header.IPv4AllSystems).(type) { case nil, *tcpip.ErrBadLocalAddress: default: panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv4AllSystems, err)) } // Leave groups from the perspective of IGMP so that routers know that // we are no longer interested in the group. e.igmp.softLeaveAll() // The address may have already been removed. switch err := e.addressableEndpointState.RemovePermanentAddress(ipv4BroadcastAddr.Address); err.(type) { case nil, *tcpip.ErrBadLocalAddress: default: panic(fmt.Sprintf("unexpected error when removing address = %s: %s", ipv4BroadcastAddr.Address, err)) } // Reset the IGMP V1 present flag. // // If the node comes back up on the same network, it will re-learn that it // needs to perform IGMPv1. e.igmp.resetV1Present() if !e.setEnabled(false) { panic("should have only done work to disable the endpoint if it was enabled") } // Must be called after Enabled has been set. e.addressableEndpointState.OnNetworkEndpointEnabledChanged() } // emitMulticastEvent emits a multicast forwarding event using the provided // generator if a valid event dispatcher exists. func (e *endpoint) emitMulticastEvent(eventGenerator func(stack.MulticastForwardingEventDispatcher)) { e.protocol.mu.RLock() defer e.protocol.mu.RUnlock() if mcastDisp := e.protocol.multicastForwardingDisp; mcastDisp != nil { eventGenerator(mcastDisp) } } // DefaultTTL is the default time-to-live value for this endpoint. func (e *endpoint) DefaultTTL() uint8 { return e.protocol.DefaultTTL() } // MTU implements stack.NetworkEndpoint. It returns the link-layer MTU minus the // network layer max header length. func (e *endpoint) MTU() uint32 { networkMTU, err := calculateNetworkMTU(e.nic.MTU(), header.IPv4MinimumSize) if err != nil { return 0 } return networkMTU } // MaxHeaderLength returns the maximum length needed by ipv4 headers (and // underlying protocols). func (e *endpoint) MaxHeaderLength() uint16 { return e.nic.MaxHeaderLength() + header.IPv4MaximumHeaderSize } // NetworkProtocolNumber implements stack.NetworkEndpoint. func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { return e.protocol.Number() } // getID returns a random uint16 number (other than zero) to be used as ID in // the IPv4 header. 
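// Editor's note: illustrative sketch, not part of the original gVisor source.
// getID, documented immediately above, draws a random non-zero 16-bit
// Identification value, and the header-construction code that follows only
// assigns an ID to datagrams that are not "atomic" in the RFC 6864 sense
// (atomic: DF set, MF clear, fragment offset zero). A standalone version of
// both rules; the original uses the stack's secure RNG, math/rand is used
// here only for brevity.
package sketch

import "math/rand"

// nonZeroID returns a random 16-bit IP ID, retrying on zero.
func nonZeroID(r *rand.Rand) uint16 {
	id := uint16(r.Uint32())
	for id == 0 {
		id = uint16(r.Uint32())
	}
	return id
}

// needsUniqueID reports whether a datagram is non-atomic per RFC 6864
// section 4, i.e. whether it requires a unique Identification value.
func needsUniqueID(dontFragment, moreFragments bool, fragmentOffset uint16) bool {
	return !dontFragment || moreFragments || fragmentOffset > 0
}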
func (e *endpoint) getID() uint16 { rng := e.protocol.stack.SecureRNG() id := rng.Uint16() for id == 0 { id = rng.Uint16() } return id } func (e *endpoint) addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, options header.IPv4OptionsSerializer) tcpip.Error { hdrLen := header.IPv4MinimumSize var optLen int if options != nil { optLen = int(options.Length()) } hdrLen += optLen if hdrLen > header.IPv4MaximumHeaderSize { return &tcpip.ErrMessageTooLong{} } ipH := header.IPv4(pkt.NetworkHeader().Push(hdrLen)) length := pkt.Size() if length > math.MaxUint16 { return &tcpip.ErrMessageTooLong{} } fields := header.IPv4Fields{ TotalLength: uint16(length), TTL: params.TTL, TOS: params.TOS, Protocol: uint8(params.Protocol), SrcAddr: srcAddr, DstAddr: dstAddr, Options: options, } if params.DF { // Treat want and do the same. fields.Flags = header.IPv4FlagDontFragment } else { // RFC 6864 section 4.3 mandates uniqueness of ID values for // non-atomic datagrams. fields.ID = e.getID() } ipH.Encode(&fields) ipH.SetChecksum(^ipH.CalculateChecksum()) pkt.NetworkProtocolNumber = ProtocolNumber return nil } // handleFragments fragments pkt and calls the handler function on each // fragment. It returns the number of fragments handled and the number of // fragments left to be processed. The IP header must already be present in the // original packet. func (e *endpoint) handleFragments(_ *stack.Route, networkMTU uint32, pkt *stack.PacketBuffer, handler func(*stack.PacketBuffer) tcpip.Error) (int, int, tcpip.Error) { // Round the MTU down to align to 8 bytes. fragmentPayloadSize := networkMTU &^ 7 networkHeader := header.IPv4(pkt.NetworkHeader().Slice()) pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadSize, pkt.AvailableHeaderBytes()+len(networkHeader)) defer pf.Release() var n int for { fragPkt, more := buildNextFragment(&pf, networkHeader) err := handler(fragPkt) fragPkt.DecRef() if err != nil { return n, pf.RemainingFragmentCount() + 1, err } n++ if !more { return n, pf.RemainingFragmentCount(), nil } } } // WritePacket writes a packet to the given destination address and protocol. func (e *endpoint) WritePacket(r *stack.Route, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) tcpip.Error { if err := e.addIPHeader(r.LocalAddress(), r.RemoteAddress(), pkt, params, nil /* options */); err != nil { return err } return e.writePacket(r, pkt) } func (e *endpoint) writePacket(r *stack.Route, pkt *stack.PacketBuffer) tcpip.Error { netHeader := header.IPv4(pkt.NetworkHeader().Slice()) dstAddr := netHeader.DestinationAddress() // iptables filtering. All packets that reach here are locally // generated. outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().CheckOutput(pkt, r, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesOutputDropped.Increment() return nil } // If the packet is manipulated as per DNAT Output rules, handle packet // based on destination address and do not send the packet to link // layer. // // We should do this for every packet, rather than only DNATted packets, but // removing this check short circuits broadcasts before they are sent out to // other hosts. if newDstAddr := netHeader.DestinationAddress(); dstAddr != newDstAddr { if ep := e.protocol.findEndpointWithAddress(newDstAddr); ep != nil { // Since we rewrote the packet but it is being routed back to us, we // can safely assume the checksum is valid. 
ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */) return nil } } return e.writePacketPostRouting(r, pkt, false /* headerIncluded */) } func (e *endpoint) writePacketPostRouting(r *stack.Route, pkt *stack.PacketBuffer, headerIncluded bool) tcpip.Error { if r.Loop()&stack.PacketLoop != 0 { // If the packet was generated by the stack (not a raw/packet endpoint // where a packet may be written with the header included), then we can // safely assume the checksum is valid. e.handleLocalPacket(pkt, !headerIncluded /* canSkipRXChecksum */) } if r.Loop()&stack.PacketOut == 0 { return nil } // Postrouting NAT can only change the source address, and does not alter the // route or outgoing interface of the packet. outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().CheckPostrouting(pkt, r, e, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesPostroutingDropped.Increment() return nil } stats := e.stats.ip networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(len(pkt.NetworkHeader().Slice()))) if err != nil { stats.OutgoingPacketErrors.Increment() return err } if packetMustBeFragmented(pkt, networkMTU) { h := header.IPv4(pkt.NetworkHeader().Slice()) if h.Flags()&header.IPv4FlagDontFragment != 0 && pkt.NetworkPacketInfo.IsForwardedPacket { // TODO(gvisor.dev/issue/5919): Handle error condition in which DontFragment // is set but the packet must be fragmented for the non-forwarding case. return &tcpip.ErrMessageTooLong{} } sent, remain, err := e.handleFragments(r, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) tcpip.Error { // TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each // fragment one by one using WritePacket() (current strategy) or if we // want to create a PacketBufferList from the fragments and feed it to // WritePackets(). It'll be faster but cost more memory. return e.nic.WritePacket(r, fragPkt) }) stats.PacketsSent.IncrementBy(uint64(sent)) stats.OutgoingPacketErrors.IncrementBy(uint64(remain)) return err } if err := e.nic.WritePacket(r, pkt); err != nil { stats.OutgoingPacketErrors.Increment() return err } stats.PacketsSent.Increment() return nil } // WriteHeaderIncludedPacket implements stack.NetworkEndpoint. func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) tcpip.Error { // The packet already has an IP header, but there are a few required // checks. h, ok := pkt.Data().PullUp(header.IPv4MinimumSize) if !ok { return &tcpip.ErrMalformedHeader{} } hdrLen := header.IPv4(h).HeaderLength() if hdrLen < header.IPv4MinimumSize { return &tcpip.ErrMalformedHeader{} } h, ok = pkt.Data().PullUp(int(hdrLen)) if !ok { return &tcpip.ErrMalformedHeader{} } ipH := header.IPv4(h) // Always set the total length. pktSize := pkt.Data().Size() ipH.SetTotalLength(uint16(pktSize)) // Set the source address when zero. if ipH.SourceAddress() == header.IPv4Any { ipH.SetSourceAddress(r.LocalAddress()) } // Set the packet ID when zero. if ipH.ID() == 0 { // RFC 6864 section 4.3 mandates uniqueness of ID values for // non-atomic datagrams, so assign an ID to all such datagrams // according to the definition given in RFC 6864 section 4. if ipH.Flags()&header.IPv4FlagDontFragment == 0 || ipH.Flags()&header.IPv4FlagMoreFragments != 0 || ipH.FragmentOffset() > 0 { ipH.SetID(e.getID()) } } // Always set the checksum. ipH.SetChecksum(0) ipH.SetChecksum(^ipH.CalculateChecksum()) // Populate the packet buffer's network header and don't allow an invalid // packet to be sent. 
// // Note that parsing only makes sure that the packet is well formed as per the // wire format. We also want to check if the header's fields are valid before // sending the packet. if !parse.IPv4(pkt) || !header.IPv4(pkt.NetworkHeader().Slice()).IsValid(pktSize) { return &tcpip.ErrMalformedHeader{} } return e.writePacketPostRouting(r, pkt, true /* headerIncluded */) } // forwardPacketWithRoute emits the pkt using the provided route. // // If updateOptions is true, then the IP options will be updated in the copied // pkt using the outgoing endpoint. Otherwise, the caller is responsible for // updating the options. // // This method should be invoked by the endpoint that received the pkt. func (e *endpoint) forwardPacketWithRoute(route *stack.Route, pkt *stack.PacketBuffer, updateOptions bool) ip.ForwardingError { h := header.IPv4(pkt.NetworkHeader().Slice()) stk := e.protocol.stack inNicName := stk.FindNICNameFromID(e.nic.ID()) outNicName := stk.FindNICNameFromID(route.NICID()) if ok := stk.IPTables().CheckForward(pkt, inNicName, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesForwardDropped.Increment() return nil } // We need to do a deep copy of the IP packet because // WriteHeaderIncludedPacket may modify the packet buffer, but we do // not own it. // // TODO(https://gvisor.dev/issue/7473): For multicast, only create one deep // copy and then clone. newPkt := pkt.DeepCopyForForwarding(int(route.MaxHeaderLength())) newHdr := header.IPv4(newPkt.NetworkHeader().Slice()) defer newPkt.DecRef() forwardToEp, ok := e.protocol.getEndpointForNIC(route.NICID()) if !ok { return &ip.ErrUnknownOutputEndpoint{} } if updateOptions { if err := forwardToEp.updateOptionsForForwarding(newPkt); err != nil { return err } } ttl := h.TTL() // As per RFC 791 page 30, Time to Live, // // This field must be decreased at each point that the internet header // is processed to reflect the time spent processing the datagram. // Even if no local information is available on the time actually // spent, the field must be decremented by 1. newHdr.SetTTL(ttl - 1) // We perform a full checksum as we may have updated options above. The IP // header is relatively small so this is not expected to be an expensive // operation. newHdr.SetChecksum(0) newHdr.SetChecksum(^newHdr.CalculateChecksum()) switch err := forwardToEp.writePacketPostRouting(route, newPkt, true /* headerIncluded */); err.(type) { case nil: return nil case *tcpip.ErrMessageTooLong: // As per RFC 792, page 4, Destination Unreachable: // // Another case is when a datagram must be fragmented to be forwarded by a // gateway yet the Don't Fragment flag is on. In this case the gateway must // discard the datagram and may return a destination unreachable message. // // WriteHeaderIncludedPacket checks for the presence of the Don't Fragment bit // while sending the packet and returns this error iff fragmentation is // necessary and the bit is also set. _ = e.protocol.returnError(&icmpReasonFragmentationNeeded{}, pkt, false /* deliveredLocally */) return &ip.ErrMessageTooLong{} case *tcpip.ErrNoBufferSpace: return &ip.ErrOutgoingDeviceNoBufferSpace{} default: return &ip.ErrOther{Err: err} } } // forwardUnicastPacket attempts to forward a packet to its final destination. 
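// Editor's note: illustrative sketch, not part of the original gVisor source.
// forwardPacketWithRoute above decrements the TTL and then recomputes the
// full header checksum rather than updating it incrementally, on the grounds
// that the IPv4 header is small. A minimal standalone version of that
// recompute-after-mutation step over a raw header, assuming a well-formed
// header whose length is a multiple of two bytes and whose TTL is non-zero:
package sketch

import "encoding/binary"

// headerChecksum computes the RFC 1071 ones'-complement checksum of an IPv4
// header. The caller must zero the checksum field (bytes 10-11) first.
func headerChecksum(hdr []byte) uint16 {
	var sum uint32
	for i := 0; i+1 < len(hdr); i += 2 {
		sum += uint32(binary.BigEndian.Uint16(hdr[i : i+2]))
	}
	for sum>>16 != 0 {
		sum = (sum & 0xffff) + (sum >> 16)
	}
	return ^uint16(sum)
}

// decrementTTL lowers the TTL by one and refreshes the checksum, mirroring
// the forwarding path above.
func decrementTTL(hdr []byte) {
	hdr[8]--            // TTL is byte 8 of the IPv4 header.
	hdr[10], hdr[11] = 0, 0 // Zero the checksum field...
	binary.BigEndian.PutUint16(hdr[10:12], headerChecksum(hdr)) // ...then recompute.
}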
func (e *endpoint) forwardUnicastPacket(pkt *stack.PacketBuffer) ip.ForwardingError { hView := pkt.NetworkHeader().View() defer hView.Release() h := header.IPv4(hView.AsSlice()) dstAddr := h.DestinationAddress() if err := validateAddressesForForwarding(h); err != nil { return err } ttl := h.TTL() if ttl == 0 { // As per RFC 792 page 6, Time Exceeded Message, // // If the gateway processing a datagram finds the time to live field // is zero it must discard the datagram. The gateway may also notify // the source host via the time exceeded message. // // We return the original error rather than the result of returning // the ICMP packet because the original error is more relevant to // the caller. _ = e.protocol.returnError(&icmpReasonTTLExceeded{}, pkt, false /* deliveredLocally */) return &ip.ErrTTLExceeded{} } if err := e.updateOptionsForForwarding(pkt); err != nil { return err } stk := e.protocol.stack // Check if the destination is owned by the stack. if ep := e.protocol.findEndpointWithAddress(dstAddr); ep != nil { inNicName := stk.FindNICNameFromID(e.nic.ID()) outNicName := stk.FindNICNameFromID(ep.nic.ID()) if ok := stk.IPTables().CheckForward(pkt, inNicName, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesForwardDropped.Increment() return nil } // The packet originally arrived on e so provide its NIC as the input NIC. ep.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) return nil } r, err := stk.FindRoute(0, tcpip.Address{}, dstAddr, ProtocolNumber, false /* multicastLoop */) switch err.(type) { case nil: // TODO(https://gvisor.dev/issues/8105): We should not observe ErrHostUnreachable from route // lookups. case *tcpip.ErrHostUnreachable, *tcpip.ErrNetworkUnreachable: // We return the original error rather than the result of returning // the ICMP packet because the original error is more relevant to // the caller. _ = e.protocol.returnError(&icmpReasonNetworkUnreachable{}, pkt, false /* deliveredLocally */) return &ip.ErrHostUnreachable{} default: return &ip.ErrOther{Err: err} } defer r.Release() // TODO(https://gvisor.dev/issue/7472): Unicast IP options should be updated // using the output endpoint (instead of the input endpoint). In particular, // RFC 1812 section 5.2.1 states the following: // // Processing of certain IP options requires that the router insert its IP // address into the option. As noted in Section [5.2.4], the address // inserted MUST be the address of the logical interface on which the // packet is sent or the router's router-id if the packet is sent over an // unnumbered interface. Thus, processing of these options cannot be // completed until after the output interface is chosen. return e.forwardPacketWithRoute(r, pkt, false /* updateOptions */) } // HandlePacket is called by the link layer when new ipv4 packets arrive for // this endpoint. 
func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { stats := e.stats.ip stats.PacketsReceived.Increment() if !e.isEnabled() { stats.DisabledPacketsReceived.Increment() return } hView, ok := e.protocol.parseAndValidate(pkt) if !ok { stats.MalformedPacketsReceived.Increment() return } h := header.IPv4(hView.AsSlice()) defer hView.Release() if !e.nic.IsLoopback() { if !e.protocol.options.AllowExternalLoopbackTraffic { if header.IsV4LoopbackAddress(h.SourceAddress()) { stats.InvalidSourceAddressesReceived.Increment() return } if header.IsV4LoopbackAddress(h.DestinationAddress()) { stats.InvalidDestinationAddressesReceived.Increment() return } } if e.protocol.stack.HandleLocal() { addressEndpoint := e.AcquireAssignedAddress(header.IPv4(pkt.NetworkHeader().Slice()).SourceAddress(), e.nic.Promiscuous(), stack.CanBePrimaryEndpoint, true /* readOnly */) if addressEndpoint != nil { // The source address is one of our own, so we never should have gotten // a packet like this unless HandleLocal is false or our NIC is the // loopback interface. stats.InvalidSourceAddressesReceived.Increment() return } } // Loopback traffic skips the prerouting chain. inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().CheckPrerouting(pkt, e, inNicName); !ok { // iptables is telling us to drop the packet. stats.IPTablesPreroutingDropped.Increment() return } } e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) } // handleLocalPacket is like HandlePacket except it does not perform the // prerouting iptables hook or check for loopback traffic that originated from // outside of the netstack (i.e. martian loopback packets). func (e *endpoint) handleLocalPacket(pkt *stack.PacketBuffer, canSkipRXChecksum bool) { stats := e.stats.ip stats.PacketsReceived.Increment() pkt = pkt.CloneToInbound() defer pkt.DecRef() pkt.RXChecksumValidated = canSkipRXChecksum hView, ok := e.protocol.parseAndValidate(pkt) if !ok { stats.MalformedPacketsReceived.Increment() return } h := header.IPv4(hView.AsSlice()) defer hView.Release() e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) } func validateAddressesForForwarding(h header.IPv4) ip.ForwardingError { srcAddr := h.SourceAddress() // As per RFC 5735 section 3, // // 0.0.0.0/8 - Addresses in this block refer to source hosts on "this" // network. Address 0.0.0.0/32 may be used as a source address for this // host on this network; other addresses within 0.0.0.0/8 may be used to // refer to specified hosts on this network ([RFC1122], Section 3.2.1.3). // // And RFC 6890 section 2.2.2, // // +----------------------+----------------------------+ // | Attribute | Value | // +----------------------+----------------------------+ // | Address Block | 0.0.0.0/8 | // | Name | "This host on this network"| // | RFC | [RFC1122], Section 3.2.1.3 | // | Allocation Date | September 1981 | // | Termination Date | N/A | // | Source | True | // | Destination | False | // | Forwardable | False | // | Global | False | // | Reserved-by-Protocol | True | // +----------------------+----------------------------+ if header.IPv4CurrentNetworkSubnet.Contains(srcAddr) { return &ip.ErrInitializingSourceAddress{} } // As per RFC 3927 section 7, // // A router MUST NOT forward a packet with an IPv4 Link-Local source or // destination address, irrespective of the router's default route // configuration or routes obtained from dynamic routing protocols. 
// // A router which receives a packet with an IPv4 Link-Local source or // destination address MUST NOT forward the packet. This prevents // forwarding of packets back onto the network segment from which they // originated, or to any other segment. if header.IsV4LinkLocalUnicastAddress(srcAddr) { return &ip.ErrLinkLocalSourceAddress{} } if dstAddr := h.DestinationAddress(); header.IsV4LinkLocalUnicastAddress(dstAddr) || header.IsV4LinkLocalMulticastAddress(dstAddr) { return &ip.ErrLinkLocalDestinationAddress{} } return nil } // forwardMulticastPacket validates a multicast pkt and attempts to forward it. // // This method should be invoked for incoming multicast packets using the // endpoint that received the packet. func (e *endpoint) forwardMulticastPacket(h header.IPv4, pkt *stack.PacketBuffer) ip.ForwardingError { if err := validateAddressesForForwarding(h); err != nil { return err } if opts := h.Options(); len(opts) != 0 { // Check if the options are valid, but don't mutate them. This corresponds // to step 3 of RFC 1812 section 5.2.1.1. if _, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageVerify{}); optProblem != nil { // Per RFC 1812 section 4.3.2.7, an ICMP error message should not be // sent for: // // A packet destined to an IP broadcast or IP multicast address. // // Note that protocol.returnError also enforces this requirement. // However, we intentionally omit it here since this path is multicast // only. return &ip.ErrParameterProblem{} } } routeKey := stack.UnicastSourceAndMulticastDestination{ Source: h.SourceAddress(), Destination: h.DestinationAddress(), } // The pkt has been validated. Consequently, if a route is not found, then // the pkt can safely be queued. result, hasBufferSpace := e.protocol.multicastRouteTable.GetRouteOrInsertPending(routeKey, pkt) if !hasBufferSpace { // Unable to queue the pkt. Silently drop it. return &ip.ErrNoMulticastPendingQueueBufferSpace{} } switch result.GetRouteResultState { case multicast.InstalledRouteFound: // Attempt to forward the pkt using an existing route. return e.forwardValidatedMulticastPacket(pkt, result.InstalledRoute) case multicast.NoRouteFoundAndPendingInserted: e.emitMulticastEvent(func(disp stack.MulticastForwardingEventDispatcher) { disp.OnMissingRoute(stack.MulticastPacketContext{ stack.UnicastSourceAndMulticastDestination{h.SourceAddress(), h.DestinationAddress()}, e.nic.ID(), }) }) case multicast.PacketQueuedInPendingRoute: default: panic(fmt.Sprintf("unexpected GetRouteResultState: %s", result.GetRouteResultState)) } return &ip.ErrHostUnreachable{} } func (e *endpoint) updateOptionsForForwarding(pkt *stack.PacketBuffer) ip.ForwardingError { h := header.IPv4(pkt.NetworkHeader().Slice()) if opts := h.Options(); len(opts) != 0 { newOpts, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageForward{}) if optProblem != nil { if optProblem.NeedICMP { // Note that this will not emit an ICMP error if the destination is // multicast. _ = e.protocol.returnError(&icmpReasonParamProblem{ pointer: optProblem.Pointer, }, pkt, false /* deliveredLocally */) } return &ip.ErrParameterProblem{} } copied := copy(opts, newOpts) if copied != len(newOpts) { panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts))) } // Since in forwarding we handle all options, including copying those we // do not recognise, the options region should remain the same size which // simplifies processing. 
As we MAY receive a packet with a lot of padded // bytes after the "end of options list" byte, make sure we copy // them as the legal padding value (0). for i := copied; i < len(opts); i++ { // Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero". opts[i] = byte(header.IPv4OptionListEndType) } } return nil } // forwardValidatedMulticastPacket attempts to forward the pkt using the // provided installedRoute. // // This method should be invoked by the endpoint that received the pkt. func (e *endpoint) forwardValidatedMulticastPacket(pkt *stack.PacketBuffer, installedRoute *multicast.InstalledRoute) ip.ForwardingError { // Per RFC 1812 section 5.2.1.3, // // Based on the IP source and destination addresses found in the datagram // header, the router determines whether the datagram has been received // on the proper interface for forwarding. If not, the datagram is // dropped silently. if e.nic.ID() != installedRoute.ExpectedInputInterface { h := header.IPv4(pkt.NetworkHeader().Slice()) e.emitMulticastEvent(func(disp stack.MulticastForwardingEventDispatcher) { disp.OnUnexpectedInputInterface(stack.MulticastPacketContext{ stack.UnicastSourceAndMulticastDestination{h.SourceAddress(), h.DestinationAddress()}, e.nic.ID(), }, installedRoute.ExpectedInputInterface) }) return &ip.ErrUnexpectedMulticastInputInterface{} } for _, outgoingInterface := range installedRoute.OutgoingInterfaces { if err := e.forwardMulticastPacketForOutgoingInterface(pkt, outgoingInterface); err != nil { e.handleForwardingError(err) continue } // The pkt was successfully forwarded. Mark the route as used. installedRoute.SetLastUsedTimestamp(e.protocol.stack.Clock().NowMonotonic()) } return nil } // forwardMulticastPacketForOutgoingInterface attempts to forward the pkt out // of the provided outgoingInterface. // // This method should be invoked by the endpoint that received the pkt. func (e *endpoint) forwardMulticastPacketForOutgoingInterface(pkt *stack.PacketBuffer, outgoingInterface stack.MulticastRouteOutgoingInterface) ip.ForwardingError { h := header.IPv4(pkt.NetworkHeader().Slice()) // Per RFC 1812 section 5.2.1.3, // // A copy of the multicast datagram is forwarded out each outgoing // interface whose minimum TTL value is less than or equal to the TTL // value in the datagram header. // // Copying of the packet is deferred to forwardPacketWithRoute since unicast // and multicast both require a copy. if outgoingInterface.MinTTL > h.TTL() { return &ip.ErrTTLExceeded{} } route := e.protocol.stack.NewRouteForMulticast(outgoingInterface.ID, h.DestinationAddress(), e.NetworkProtocolNumber()) if route == nil { // Failed to convert to a stack.Route. This likely means that the outgoing // endpoint no longer exists. return &ip.ErrHostUnreachable{} } defer route.Release() return e.forwardPacketWithRoute(route, pkt, true /* updateOptions */) } func (e *endpoint) handleValidatedPacket(h header.IPv4, pkt *stack.PacketBuffer, inNICName string) { pkt.NICID = e.nic.ID() // Raw socket packets are delivered based solely on the transport protocol // number. We only require that the packet be valid IPv4, and that they not // be fragmented. 
if !h.More() && h.FragmentOffset() == 0 { e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt) } stats := e.stats stats.ip.ValidPacketsReceived.Increment() srcAddr := h.SourceAddress() dstAddr := h.DestinationAddress() // As per RFC 1122 section 3.2.1.3: // When a host sends any datagram, the IP source address MUST // be one of its own IP addresses (but not a broadcast or // multicast address). if srcAddr == header.IPv4Broadcast || header.IsV4MulticastAddress(srcAddr) { stats.ip.InvalidSourceAddressesReceived.Increment() return } // Make sure the source address is not a subnet-local broadcast address. if addressEndpoint := e.AcquireAssignedAddress(srcAddr, false /* createTemp */, stack.NeverPrimaryEndpoint, true /* readOnly */); addressEndpoint != nil { subnet := addressEndpoint.Subnet() if subnet.IsBroadcast(srcAddr) { stats.ip.InvalidSourceAddressesReceived.Increment() return } } if header.IsV4MulticastAddress(dstAddr) { // Handle all packets destined to a multicast address separately. Unlike // unicast, these packets can be both delivered locally and forwarded. See // RFC 1812 section 5.2.3 for details regarding the forwarding/local // delivery decision. multicastForwarding := e.MulticastForwarding() && e.protocol.multicastForwarding() if multicastForwarding { e.handleForwardingError(e.forwardMulticastPacket(h, pkt)) } if e.IsInGroup(dstAddr) { e.deliverPacketLocally(h, pkt, inNICName) return } if !multicastForwarding { // Only consider the destination address invalid if we didn't attempt to // forward the pkt and it was not delivered locally. stats.ip.InvalidDestinationAddressesReceived.Increment() } return } // Before we do any processing, check if the packet was received as some // sort of broadcast. // // If the packet is destined for this device, then it should be delivered // locally. Otherwise, if forwarding is enabled, it should be forwarded. if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint, true /* readOnly */); addressEndpoint != nil { subnet := addressEndpoint.AddressWithPrefix().Subnet() pkt.NetworkPacketInfo.LocalAddressBroadcast = subnet.IsBroadcast(dstAddr) || dstAddr == header.IPv4Broadcast e.deliverPacketLocally(h, pkt, inNICName) } else if e.Forwarding() { e.handleForwardingError(e.forwardUnicastPacket(pkt)) } else { stats.ip.InvalidDestinationAddressesReceived.Increment() } } // handleForwardingError processes the provided err and increments any relevant // counters. 
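// Editor's note: illustrative sketch, not part of the original gVisor source.
// handleValidatedPacket above drops packets whose source address could never
// legitimately identify a sender: the limited broadcast address, any
// multicast address, or the directed-broadcast address of a locally assigned
// subnet (RFC 1122 section 3.2.1.3). A hypothetical standalone predicate for
// the first two checks, using net/netip:
package sketch

import "net/netip"

var limitedBroadcast = netip.MustParseAddr("255.255.255.255")

// invalidSource reports whether addr may never appear as an IPv4 source.
// Directed (subnet-local) broadcast detection additionally needs the local
// prefix assignments and is omitted here.
func invalidSource(addr netip.Addr) bool {
	return addr == limitedBroadcast || (addr.Is4() && addr.IsMulticast())
}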
func (e *endpoint) handleForwardingError(err ip.ForwardingError) { stats := e.stats.ip switch err := err.(type) { case nil: return case *ip.ErrInitializingSourceAddress: stats.Forwarding.InitializingSource.Increment() case *ip.ErrLinkLocalSourceAddress: stats.Forwarding.LinkLocalSource.Increment() case *ip.ErrLinkLocalDestinationAddress: stats.Forwarding.LinkLocalDestination.Increment() case *ip.ErrTTLExceeded: stats.Forwarding.ExhaustedTTL.Increment() case *ip.ErrHostUnreachable: stats.Forwarding.Unrouteable.Increment() case *ip.ErrParameterProblem: stats.MalformedPacketsReceived.Increment() case *ip.ErrMessageTooLong: stats.Forwarding.PacketTooBig.Increment() case *ip.ErrNoMulticastPendingQueueBufferSpace: stats.Forwarding.NoMulticastPendingQueueBufferSpace.Increment() case *ip.ErrUnexpectedMulticastInputInterface: stats.Forwarding.UnexpectedMulticastInputInterface.Increment() case *ip.ErrUnknownOutputEndpoint: stats.Forwarding.UnknownOutputEndpoint.Increment() case *ip.ErrOutgoingDeviceNoBufferSpace: stats.Forwarding.OutgoingDeviceNoBufferSpace.Increment() default: panic(fmt.Sprintf("unrecognized forwarding error: %s", err)) } stats.Forwarding.Errors.Increment() } func (e *endpoint) deliverPacketLocally(h header.IPv4, pkt *stack.PacketBuffer, inNICName string) { stats := e.stats // iptables filtering. All packets that reach here are intended for // this machine and will not be forwarded. if ok := e.protocol.stack.IPTables().CheckInput(pkt, inNICName); !ok { // iptables is telling us to drop the packet. stats.ip.IPTablesInputDropped.Increment() return } if h.More() || h.FragmentOffset() != 0 { if pkt.Data().Size()+len(pkt.TransportHeader().Slice()) == 0 { // Drop the packet as it's marked as a fragment but has // no payload. stats.ip.MalformedPacketsReceived.Increment() stats.ip.MalformedFragmentsReceived.Increment() return } if opts := h.Options(); len(opts) != 0 { // If there are options we need to check them before we do assembly // or we could be assembling errant packets. However we do not change the // options as that could lead to double processing later. if _, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageVerify{}); optProblem != nil { if optProblem.NeedICMP { _ = e.protocol.returnError(&icmpReasonParamProblem{ pointer: optProblem.Pointer, }, pkt, true /* deliveredLocally */) e.stats.ip.MalformedPacketsReceived.Increment() } return } } // The packet is a fragment, let's try to reassemble it. start := h.FragmentOffset() // Drop the fragment if the size of the reassembled payload would exceed the // maximum payload size. // // Note that this addition doesn't overflow even on 32bit architecture // because pkt.Data().Size() should not exceed 65535 (the max IP datagram // size). Otherwise the packet would've been rejected as invalid before // reaching here. if int(start)+pkt.Data().Size() > header.IPv4MaximumPayloadSize { stats.ip.MalformedPacketsReceived.Increment() stats.ip.MalformedFragmentsReceived.Increment() return } proto := h.Protocol() resPkt, transProtoNum, ready, err := e.protocol.fragmentation.Process( // As per RFC 791 section 2.3, the identification value is unique // for a source-destination pair and protocol. 
fragmentation.FragmentID{ Source: h.SourceAddress(), Destination: h.DestinationAddress(), ID: uint32(h.ID()), Protocol: proto, }, start, start+uint16(pkt.Data().Size())-1, h.More(), proto, pkt, ) if err != nil { stats.ip.MalformedPacketsReceived.Increment() stats.ip.MalformedFragmentsReceived.Increment() return } if !ready { return } defer resPkt.DecRef() pkt = resPkt h = header.IPv4(pkt.NetworkHeader().Slice()) // The reassembler doesn't take care of fixing up the header, so we need // to do it here. h.SetTotalLength(uint16(pkt.Data().Size() + len(h))) h.SetFlagsFragmentOffset(0, 0) e.protocol.parseTransport(pkt, tcpip.TransportProtocolNumber(transProtoNum)) // Now that the packet is reassembled, it can be sent to raw sockets. e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt) } stats.ip.PacketsDelivered.Increment() p := h.TransportProtocol() if p == header.ICMPv4ProtocolNumber { // TODO(gvisor.dev/issues/3810): when we sort out ICMP and transport // headers, the setting of the transport number here should be // unnecessary and removed. pkt.TransportProtocolNumber = p e.handleICMP(pkt) return } // ICMP handles options itself but do it here for all remaining destinations. var hasRouterAlertOption bool if opts := h.Options(); len(opts) != 0 { newOpts, processedOpts, optProblem := e.processIPOptions(pkt, opts, &optionUsageReceive{}) if optProblem != nil { if optProblem.NeedICMP { _ = e.protocol.returnError(&icmpReasonParamProblem{ pointer: optProblem.Pointer, }, pkt, true /* deliveredLocally */) stats.ip.MalformedPacketsReceived.Increment() } return } hasRouterAlertOption = processedOpts.routerAlert copied := copy(opts, newOpts) if copied != len(newOpts) { panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts))) } for i := copied; i < len(opts); i++ { // Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero". opts[i] = byte(header.IPv4OptionListEndType) } } if p == header.IGMPProtocolNumber { e.mu.Lock() e.igmp.handleIGMP(pkt, hasRouterAlertOption) // +checklocksforce: e == e.igmp.ep. e.mu.Unlock() return } switch res := e.dispatcher.DeliverTransportPacket(p, pkt); res { case stack.TransportPacketHandled: case stack.TransportPacketDestinationPortUnreachable: // As per RFC: 1122 Section 3.2.2.1 A host SHOULD generate Destination // Unreachable messages with code: // 3 (Port Unreachable), when the designated transport protocol // (e.g., UDP) is unable to demultiplex the datagram but has no // protocol mechanism to inform the sender. _ = e.protocol.returnError(&icmpReasonPortUnreachable{}, pkt, true /* deliveredLocally */) case stack.TransportPacketProtocolUnreachable: // As per RFC: 1122 Section 3.2.2.1 // A host SHOULD generate Destination Unreachable messages with code: // 2 (Protocol Unreachable), when the designated transport protocol // is not supported _ = e.protocol.returnError(&icmpReasonProtoUnreachable{}, pkt, true /* deliveredLocally */) default: panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res)) } } // Close cleans up resources associated with the endpoint. func (e *endpoint) Close() { e.mu.Lock() e.disableLocked() e.addressableEndpointState.Cleanup() e.mu.Unlock() e.protocol.forgetEndpoint(e.nic.ID()) } // AddAndAcquirePermanentAddress implements stack.AddressableEndpoint. 
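// Editor's note: illustrative sketch, not part of the original gVisor source.
// Reassembly above is keyed by the (source, destination, identification,
// protocol) tuple, since RFC 791 section 2.3 only guarantees that the
// Identification field is unique within that tuple. A hypothetical
// reassembly table keyed the same way:
package sketch

import "net/netip"

// fragmentKey identifies one datagram being reassembled.
type fragmentKey struct {
	src, dst netip.Addr
	id       uint16
	proto    uint8
}

// reassembly accumulates fragment payloads by byte offset for each key.
type reassembly map[fragmentKey]map[uint16][]byte

// add records a fragment's payload at its byte offset.
func (r reassembly) add(k fragmentKey, offset uint16, payload []byte) {
	frags, ok := r[k]
	if !ok {
		frags = make(map[uint16][]byte)
		r[k] = frags
	}
	frags[offset] = payload
}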
func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, properties stack.AddressProperties) (stack.AddressEndpoint, tcpip.Error) { e.mu.Lock() defer e.mu.Unlock() ep, err := e.addressableEndpointState.AddAndAcquireAddress(addr, properties, stack.Permanent) if err == nil { e.sendQueuedReports() } return ep, err } // sendQueuedReports sends queued igmp reports. // // +checklocks:e.mu // +checklocksalias:e.igmp.ep.mu=e.mu func (e *endpoint) sendQueuedReports() { e.igmp.sendQueuedReports() } // RemovePermanentAddress implements stack.AddressableEndpoint. func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) tcpip.Error { e.mu.RLock() defer e.mu.RUnlock() return e.addressableEndpointState.RemovePermanentAddress(addr) } // SetDeprecated implements stack.AddressableEndpoint. func (e *endpoint) SetDeprecated(addr tcpip.Address, deprecated bool) tcpip.Error { e.mu.RLock() defer e.mu.RUnlock() return e.addressableEndpointState.SetDeprecated(addr, deprecated) } // SetLifetimes implements stack.AddressableEndpoint. func (e *endpoint) SetLifetimes(addr tcpip.Address, lifetimes stack.AddressLifetimes) tcpip.Error { e.mu.RLock() defer e.mu.RUnlock() return e.addressableEndpointState.SetLifetimes(addr, lifetimes) } // MainAddress implements stack.AddressableEndpoint. func (e *endpoint) MainAddress() tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.addressableEndpointState.MainAddress() } // AcquireAssignedAddress implements stack.AddressableEndpoint. func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior, readOnly bool) stack.AddressEndpoint { e.mu.RLock() defer e.mu.RUnlock() loopback := e.nic.IsLoopback() return e.addressableEndpointState.AcquireAssignedAddressOrMatching(localAddr, func(addressEndpoint stack.AddressEndpoint) bool { subnet := addressEndpoint.Subnet() // IPv4 has a notion of a subnet broadcast address and considers the // loopback interface bound to an address's whole subnet (on linux). return subnet.IsBroadcast(localAddr) || (loopback && subnet.Contains(localAddr)) }, allowTemp, tempPEB, readOnly) } // AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint. func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr, srcHint tcpip.Address, allowExpired bool) stack.AddressEndpoint { e.mu.RLock() defer e.mu.RUnlock() return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, srcHint, allowExpired) } // acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress // but with locking requirements // // +checklocksread:e.mu func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr, srcHint tcpip.Address, allowExpired bool) stack.AddressEndpoint { return e.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, srcHint, allowExpired) } // PrimaryAddresses implements stack.AddressableEndpoint. func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.addressableEndpointState.PrimaryAddresses() } // PermanentAddresses implements stack.AddressableEndpoint. func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.addressableEndpointState.PermanentAddresses() } // JoinGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) JoinGroup(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() return e.joinGroupLocked(addr) } // joinGroupLocked is like JoinGroup but with locking requirements. 
// // +checklocks:e.mu // +checklocksalias:e.igmp.ep.mu=e.mu func (e *endpoint) joinGroupLocked(addr tcpip.Address) tcpip.Error { if !header.IsV4MulticastAddress(addr) { return &tcpip.ErrBadAddress{} } e.igmp.joinGroup(addr) return nil } // LeaveGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) LeaveGroup(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() return e.leaveGroupLocked(addr) } // leaveGroupLocked is like LeaveGroup but with locking requirements. // // +checklocks:e.mu // +checklocksalias:e.igmp.ep.mu=e.mu func (e *endpoint) leaveGroupLocked(addr tcpip.Address) tcpip.Error { return e.igmp.leaveGroup(addr) } // IsInGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) IsInGroup(addr tcpip.Address) bool { e.mu.RLock() defer e.mu.RUnlock() return e.igmp.isInGroup(addr) // +checklocksforce: e.mu==e.igmp.ep.mu. } // Stats implements stack.NetworkEndpoint. func (e *endpoint) Stats() stack.NetworkEndpointStats { return &e.stats.localStats } var _ stack.NetworkProtocol = (*protocol)(nil) var _ stack.MulticastForwardingNetworkProtocol = (*protocol)(nil) var _ stack.RejectIPv4WithHandler = (*protocol)(nil) var _ fragmentation.TimeoutHandler = (*protocol)(nil) // +stateify savable type protocol struct { stack *stack.Stack // mu protects annotated fields below. mu sync.RWMutex `state:"nosave"` // eps is keyed by NICID to allow protocol methods to retrieve an endpoint // when handling a packet, by looking at which NIC handled the packet. // +checklocks:mu eps map[tcpip.NICID]*endpoint // ICMP types for which the stack's global rate limiting must apply. // +checklocks:mu icmpRateLimitedTypes map[header.ICMPv4Type]struct{} // defaultTTL is the current default TTL for the protocol. Only the // uint8 portion of it is meaningful. defaultTTL atomicbitops.Uint32 ids []atomicbitops.Uint32 hashIV uint32 // idTS is the unix timestamp in milliseconds 'ids' was last accessed. idTS atomicbitops.Int64 fragmentation *fragmentation.Fragmentation options Options multicastRouteTable multicast.RouteTable // multicastForwardingDisp is the multicast forwarding event dispatcher that // an integrator can provide to receive multicast forwarding events. Note // that multicast packets will only be forwarded if this is non-nil. // +checklocks:mu multicastForwardingDisp stack.MulticastForwardingEventDispatcher } // Number returns the ipv4 protocol number. func (p *protocol) Number() tcpip.NetworkProtocolNumber { return ProtocolNumber } // MinimumPacketSize returns the minimum valid ipv4 packet size. func (p *protocol) MinimumPacketSize() int { return header.IPv4MinimumSize } // ParseAddresses implements stack.NetworkProtocol. func (*protocol) ParseAddresses(v []byte) (src, dst tcpip.Address) { h := header.IPv4(v) return h.SourceAddress(), h.DestinationAddress() } // SetOption implements stack.NetworkProtocol. func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.DefaultTTLOption: p.SetDefaultTTL(uint8(*v)) return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // Option implements stack.NetworkProtocol. func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.DefaultTTLOption: *v = tcpip.DefaultTTLOption(p.DefaultTTL()) return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // SetDefaultTTL sets the default TTL for endpoints created with this protocol. 
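// Editor's note: illustrative sketch, not part of the original gVisor source.
// The protocol's default TTL above is kept in an atomic 32-bit word of which
// only the low 8 bits are meaningful, so SetDefaultTTL (documented above,
// defined just below) and DefaultTTL never need the protocol mutex. The same
// pattern with the standard library's sync/atomic:
package sketch

import "sync/atomic"

type ttlHolder struct {
	defaultTTL atomic.Uint32 // only the low 8 bits are meaningful
}

func (h *ttlHolder) SetDefaultTTL(ttl uint8) { h.defaultTTL.Store(uint32(ttl)) }

func (h *ttlHolder) DefaultTTL() uint8 { return uint8(h.defaultTTL.Load()) }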
func (p *protocol) SetDefaultTTL(ttl uint8) { p.defaultTTL.Store(uint32(ttl)) } // DefaultTTL returns the default TTL for endpoints created with this protocol. func (p *protocol) DefaultTTL() uint8 { return uint8(p.defaultTTL.Load()) } // Close implements stack.TransportProtocol. func (p *protocol) Close() { p.fragmentation.Release() p.multicastRouteTable.Close() } // Wait implements stack.TransportProtocol. func (*protocol) Wait() {} func (p *protocol) validateUnicastSourceAndMulticastDestination(addresses stack.UnicastSourceAndMulticastDestination) tcpip.Error { if !p.isUnicastAddress(addresses.Source) || header.IsV4LinkLocalUnicastAddress(addresses.Source) { return &tcpip.ErrBadAddress{} } if !header.IsV4MulticastAddress(addresses.Destination) || header.IsV4LinkLocalMulticastAddress(addresses.Destination) { return &tcpip.ErrBadAddress{} } return nil } func (p *protocol) multicastForwarding() bool { p.mu.RLock() defer p.mu.RUnlock() return p.multicastForwardingDisp != nil } func (p *protocol) newInstalledRoute(route stack.MulticastRoute) (*multicast.InstalledRoute, tcpip.Error) { if len(route.OutgoingInterfaces) == 0 { return nil, &tcpip.ErrMissingRequiredFields{} } if !p.stack.HasNIC(route.ExpectedInputInterface) { return nil, &tcpip.ErrUnknownNICID{} } for _, outgoingInterface := range route.OutgoingInterfaces { if route.ExpectedInputInterface == outgoingInterface.ID { return nil, &tcpip.ErrMulticastInputCannotBeOutput{} } if !p.stack.HasNIC(outgoingInterface.ID) { return nil, &tcpip.ErrUnknownNICID{} } } return p.multicastRouteTable.NewInstalledRoute(route), nil } // AddMulticastRoute implements stack.MulticastForwardingNetworkProtocol. func (p *protocol) AddMulticastRoute(addresses stack.UnicastSourceAndMulticastDestination, route stack.MulticastRoute) tcpip.Error { if !p.multicastForwarding() { return &tcpip.ErrNotPermitted{} } if err := p.validateUnicastSourceAndMulticastDestination(addresses); err != nil { return err } installedRoute, err := p.newInstalledRoute(route) if err != nil { return err } pendingPackets := p.multicastRouteTable.AddInstalledRoute(addresses, installedRoute) for _, pkt := range pendingPackets { p.forwardPendingMulticastPacket(pkt, installedRoute) } return nil } // RemoveMulticastRoute implements // stack.MulticastForwardingNetworkProtocol.RemoveMulticastRoute. func (p *protocol) RemoveMulticastRoute(addresses stack.UnicastSourceAndMulticastDestination) tcpip.Error { if err := p.validateUnicastSourceAndMulticastDestination(addresses); err != nil { return err } if removed := p.multicastRouteTable.RemoveInstalledRoute(addresses); !removed { return &tcpip.ErrHostUnreachable{} } return nil } // EnableMulticastForwarding implements // stack.MulticastForwardingNetworkProtocol.EnableMulticastForwarding. func (p *protocol) EnableMulticastForwarding(disp stack.MulticastForwardingEventDispatcher) (bool, tcpip.Error) { p.mu.Lock() defer p.mu.Unlock() if p.multicastForwardingDisp != nil { return true, nil } if disp == nil { return false, &tcpip.ErrInvalidOptionValue{} } p.multicastForwardingDisp = disp return false, nil } // DisableMulticastForwarding implements // stack.MulticastForwardingNetworkProtocol.DisableMulticastForwarding. func (p *protocol) DisableMulticastForwarding() { p.mu.Lock() defer p.mu.Unlock() p.multicastForwardingDisp = nil p.multicastRouteTable.RemoveAllInstalledRoutes() } // MulticastRouteLastUsedTime implements // stack.MulticastForwardingNetworkProtocol. 
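// Editor's note: illustrative sketch, not part of the original gVisor source.
// Multicast route management above insists that a route's source be a usable
// unicast address (not link-local, not multicast, not broadcast or
// unspecified) and that its destination be a non-link-local multicast group.
// A hypothetical standalone check with net/netip; the original additionally
// rejects subnet-local broadcast sources, which requires the local prefix
// assignments and is omitted here.
package sketch

import "net/netip"

var limitedBroadcast = netip.MustParseAddr("255.255.255.255")

// validRouteAddresses reports whether (src, dst) may key a multicast route.
func validRouteAddresses(src, dst netip.Addr) bool {
	srcOK := src.Is4() && !src.IsMulticast() && !src.IsLinkLocalUnicast() &&
		!src.IsUnspecified() && src != limitedBroadcast
	dstOK := dst.Is4() && dst.IsMulticast() && !dst.IsLinkLocalMulticast()
	return srcOK && dstOK
}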
func (p *protocol) MulticastRouteLastUsedTime(addresses stack.UnicastSourceAndMulticastDestination) (tcpip.MonotonicTime, tcpip.Error) { if err := p.validateUnicastSourceAndMulticastDestination(addresses); err != nil { return tcpip.MonotonicTime{}, err } timestamp, found := p.multicastRouteTable.GetLastUsedTimestamp(addresses) if !found { return tcpip.MonotonicTime{}, &tcpip.ErrHostUnreachable{} } return timestamp, nil } func (p *protocol) forwardPendingMulticastPacket(pkt *stack.PacketBuffer, installedRoute *multicast.InstalledRoute) { defer pkt.DecRef() // Attempt to forward the packet using the endpoint that it originally // arrived on. This ensures that the packet is only forwarded if it // matches the route's expected input interface (see 5a of RFC 1812 section // 5.2.1.3). ep, ok := p.getEndpointForNIC(pkt.NICID) if !ok { // The endpoint that the packet arrived on no longer exists. Silently // drop the pkt. return } if !ep.MulticastForwarding() { return } ep.handleForwardingError(ep.forwardValidatedMulticastPacket(pkt, installedRoute)) } func (p *protocol) isUnicastAddress(addr tcpip.Address) bool { if addr.BitLen() != header.IPv4AddressSizeBits { return false } if addr == header.IPv4Any || addr == header.IPv4Broadcast { return false } if p.isSubnetLocalBroadcastAddress(addr) { return false } return !header.IsV4MulticastAddress(addr) } func (p *protocol) isSubnetLocalBroadcastAddress(addr tcpip.Address) bool { p.mu.RLock() defer p.mu.RUnlock() for _, e := range p.eps { if addressEndpoint := e.AcquireAssignedAddress(addr, false /* createTemp */, stack.NeverPrimaryEndpoint, true /* readOnly */); addressEndpoint != nil { subnet := addressEndpoint.Subnet() if subnet.IsBroadcast(addr) { return true } } } return false } // parseAndValidate parses the packet (including its transport layer header) and // returns the parsed IP header. // // Returns true if the IP header was successfully parsed. func (p *protocol) parseAndValidate(pkt *stack.PacketBuffer) (*buffer.View, bool) { transProtoNum, hasTransportHdr, ok := p.Parse(pkt) if !ok { return nil, false } h := header.IPv4(pkt.NetworkHeader().Slice()) // Do not include the link header's size when calculating the size of the IP // packet. if !h.IsValid(pkt.Size() - len(pkt.LinkHeader().Slice())) { return nil, false } if !pkt.RXChecksumValidated && !h.IsChecksumValid() { return nil, false } if hasTransportHdr { p.parseTransport(pkt, transProtoNum) } return pkt.NetworkHeader().View(), true } func (p *protocol) parseTransport(pkt *stack.PacketBuffer, transProtoNum tcpip.TransportProtocolNumber) { if transProtoNum == header.ICMPv4ProtocolNumber { // The transport layer will handle transport layer parsing errors. _ = parse.ICMPv4(pkt) return } switch err := p.stack.ParsePacketBufferTransport(transProtoNum, pkt); err { case stack.ParsedOK: case stack.UnknownTransportProtocol, stack.TransportLayerParseError: // The transport layer will handle unknown protocols and transport layer // parsing errors. default: panic(fmt.Sprintf("unexpected error parsing transport header = %d", err)) } } // Parse implements stack.NetworkProtocol. 
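// Editor's note: illustrative sketch, not part of the original gVisor source.
// isSubnetLocalBroadcastAddress above asks each endpoint whether the address
// is the directed-broadcast address of one of its assigned subnets, i.e. the
// address with all host bits set. A hypothetical helper that computes that
// address for an IPv4 prefix, e.g. 192.0.2.255 for 192.0.2.0/24 (a /32 has
// no broadcast address and is returned unchanged):
package sketch

import "net/netip"

// directedBroadcast returns the all-host-bits-set address of a valid IPv4
// prefix.
func directedBroadcast(p netip.Prefix) netip.Addr {
	a := p.Masked().Addr().As4()
	hostBits := 32 - p.Bits()
	for i := 3; i >= 0 && hostBits > 0; i-- {
		if hostBits >= 8 {
			a[i] = 0xff
			hostBits -= 8
		} else {
			a[i] |= byte(0xff) >> (8 - hostBits)
			hostBits = 0
		}
	}
	return netip.AddrFrom4(a)
}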
func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { if ok := parse.IPv4(pkt); !ok { return 0, false, false } ipHdr := header.IPv4(pkt.NetworkHeader().Slice()) return ipHdr.TransportProtocol(), !ipHdr.More() && ipHdr.FragmentOffset() == 0, true } // allowICMPReply reports whether an ICMP reply with provided type and code may // be sent following the rate mask options and global ICMP rate limiter. func (p *protocol) allowICMPReply(icmpType header.ICMPv4Type, code header.ICMPv4Code) bool { // Mimic linux and never rate limit for PMTU discovery. // https://github.com/torvalds/linux/blob/9e9fb7655ed585da8f468e29221f0ba194a5f613/net/ipv4/icmp.c#L288 if icmpType == header.ICMPv4DstUnreachable && code == header.ICMPv4FragmentationNeeded { return true } p.mu.RLock() defer p.mu.RUnlock() if _, ok := p.icmpRateLimitedTypes[icmpType]; ok { return p.stack.AllowICMPMessage() } return true } // SendRejectionError implements stack.RejectIPv4WithHandler. func (p *protocol) SendRejectionError(pkt *stack.PacketBuffer, rejectWith stack.RejectIPv4WithICMPType, inputHook bool) tcpip.Error { switch rejectWith { case stack.RejectIPv4WithICMPNetUnreachable: return p.returnError(&icmpReasonNetworkUnreachable{}, pkt, inputHook) case stack.RejectIPv4WithICMPHostUnreachable: return p.returnError(&icmpReasonHostUnreachable{}, pkt, inputHook) case stack.RejectIPv4WithICMPPortUnreachable: return p.returnError(&icmpReasonPortUnreachable{}, pkt, inputHook) case stack.RejectIPv4WithICMPNetProhibited: return p.returnError(&icmpReasonNetworkProhibited{}, pkt, inputHook) case stack.RejectIPv4WithICMPHostProhibited: return p.returnError(&icmpReasonHostProhibited{}, pkt, inputHook) case stack.RejectIPv4WithICMPAdminProhibited: return p.returnError(&icmpReasonAdministrativelyProhibited{}, pkt, inputHook) default: panic(fmt.Sprintf("unhandled %[1]T = %[1]d", rejectWith)) } } // calculateNetworkMTU calculates the network-layer payload MTU based on the // link-layer payload mtu. func calculateNetworkMTU(linkMTU, networkHeaderSize uint32) (uint32, tcpip.Error) { if linkMTU < header.IPv4MinimumMTU { return 0, &tcpip.ErrInvalidEndpointState{} } // As per RFC 791 section 3.1, an IPv4 header cannot exceed 60 bytes in // length: // The maximal internet header is 60 octets, and a typical internet header // is 20 octets, allowing a margin for headers of higher level protocols. if networkHeaderSize > header.IPv4MaximumHeaderSize { return 0, &tcpip.ErrMalformedHeader{} } networkMTU := linkMTU if networkMTU > MaxTotalSize { networkMTU = MaxTotalSize } return networkMTU - networkHeaderSize, nil } func packetMustBeFragmented(pkt *stack.PacketBuffer, networkMTU uint32) bool { payload := len(pkt.TransportHeader().Slice()) + pkt.Data().Size() return pkt.GSOOptions.Type == stack.GSONone && uint32(payload) > networkMTU } // addressToUint32 translates an IPv4 address into its little endian uint32 // representation. // // This function does the same thing as binary.LittleEndian.Uint32 but operates // on a tcpip.Address (a string) without the need to convert it to a byte slice, // which would cause an allocation. 
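// Editor's note: illustrative sketch, not part of the original gVisor source.
// addressToUint32 (documented above, defined below) mirrors
// binary.LittleEndian.Uint32 over the four address bytes while avoiding a
// slice allocation. The equivalence can be checked directly:
package main

import (
	"encoding/binary"
	"fmt"
)

// manualLE is the same little-endian combination used above.
func manualLE(b [4]byte) uint32 {
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}

func main() {
	addr := [4]byte{192, 0, 2, 1}
	fmt.Println(manualLE(addr) == binary.LittleEndian.Uint32(addr[:])) // true
}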
func addressToUint32(addr tcpip.Address) uint32 { addrBytes := addr.As4() _ = addrBytes[3] // bounds check hint to compiler return uint32(addrBytes[0]) | uint32(addrBytes[1])<<8 | uint32(addrBytes[2])<<16 | uint32(addrBytes[3])<<24 } // hashRoute calculates a hash value for the given source/destination pair using // the addresses, transport protocol number and a 32-bit number to generate the // hash. func hashRoute(srcAddr, dstAddr tcpip.Address, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 { a := addressToUint32(srcAddr) b := addressToUint32(dstAddr) return hash.Hash3Words(a, b, uint32(protocol), hashIV) } // Options holds options to configure a new protocol. // // +stateify savable type Options struct { // IGMP holds options for IGMP. IGMP IGMPOptions // AllowExternalLoopbackTraffic indicates that inbound loopback packets (i.e. // martian loopback packets) should be accepted. AllowExternalLoopbackTraffic bool } // NewProtocolWithOptions returns an IPv4 network protocol. func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory { ids := make([]atomicbitops.Uint32, buckets) // Randomly initialize hashIV and the ids. r := hash.RandN32(1 + buckets) for i := range ids { ids[i] = atomicbitops.FromUint32(r[i]) } hashIV := r[buckets] return func(s *stack.Stack) stack.NetworkProtocol { p := &protocol{ stack: s, ids: ids, hashIV: hashIV, defaultTTL: atomicbitops.FromUint32(DefaultTTL), options: opts, } p.fragmentation = fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p) p.eps = make(map[tcpip.NICID]*endpoint) // Set ICMP rate limiting to Linux defaults. // See https://man7.org/linux/man-pages/man7/icmp.7.html. p.icmpRateLimitedTypes = map[header.ICMPv4Type]struct{}{ header.ICMPv4DstUnreachable: {}, header.ICMPv4SrcQuench: {}, header.ICMPv4TimeExceeded: {}, header.ICMPv4ParamProblem: {}, } if err := p.multicastRouteTable.Init(multicast.DefaultConfig(s.Clock())); err != nil { panic(fmt.Sprintf("p.multicastRouteTable.Init(_): %s", err)) } return p } } // NewProtocol is equivalent to NewProtocolWithOptions with an empty Options. func NewProtocol(s *stack.Stack) stack.NetworkProtocol { return NewProtocolWithOptions(Options{})(s) } func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeader header.IPv4) (*stack.PacketBuffer, bool) { fragPkt, offset, copied, more := pf.BuildNextFragment() fragPkt.NetworkProtocolNumber = ProtocolNumber originalIPHeaderLength := len(originalIPHeader) nextFragIPHeader := header.IPv4(fragPkt.NetworkHeader().Push(originalIPHeaderLength)) fragPkt.NetworkProtocolNumber = ProtocolNumber if copied := copy(nextFragIPHeader, originalIPHeader); copied != len(originalIPHeader) { panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got = %d, want = %d", copied, originalIPHeaderLength)) } flags := originalIPHeader.Flags() if more { flags |= header.IPv4FlagMoreFragments } nextFragIPHeader.SetFlagsFragmentOffset(flags, uint16(offset)) nextFragIPHeader.SetTotalLength(uint16(nextFragIPHeader.HeaderLength()) + uint16(copied)) nextFragIPHeader.SetChecksum(0) nextFragIPHeader.SetChecksum(^nextFragIPHeader.CalculateChecksum()) return fragPkt, more } // optionAction describes possible actions that may be taken on an option // while processing it. type optionAction uint8 const ( // optionRemove says that the option should not be in the output option set. 
optionRemove optionAction = iota // optionProcess says that the option should be fully processed. optionProcess // optionVerify says the option should be checked and passed unchanged. optionVerify // optionPass says to pass the output set without checking. optionPass ) // optionActions list what to do for each option in a given scenario. type optionActions struct { // timestamp controls what to do with a Timestamp option. timestamp optionAction // recordRoute controls what to do with a Record Route option. recordRoute optionAction // routerAlert controls what to do with a Router Alert option. routerAlert optionAction // unknown controls what to do with an unknown option. unknown optionAction } // optionsUsage specifies the ways options may be operated upon for a given // scenario during packet processing. type optionsUsage interface { actions() optionActions } // optionUsageVerify implements optionsUsage for when we just want to check // fragments. Don't change anything, just check and reject if bad. No // replacement options are generated. type optionUsageVerify struct{} // actions implements optionsUsage. func (*optionUsageVerify) actions() optionActions { return optionActions{ timestamp: optionVerify, recordRoute: optionVerify, routerAlert: optionVerify, unknown: optionRemove, } } // optionUsageReceive implements optionsUsage for packets we will pass // to the transport layer (with the exception of Echo requests). type optionUsageReceive struct{} // actions implements optionsUsage. func (*optionUsageReceive) actions() optionActions { return optionActions{ timestamp: optionProcess, recordRoute: optionProcess, routerAlert: optionVerify, unknown: optionPass, } } // optionUsageForward implements optionsUsage for packets about to be forwarded. // All options are passed on regardless of whether we recognise them, however // we do process the Timestamp and Record Route options. type optionUsageForward struct{} // actions implements optionsUsage. func (*optionUsageForward) actions() optionActions { return optionActions{ timestamp: optionProcess, recordRoute: optionProcess, routerAlert: optionVerify, unknown: optionPass, } } // optionUsageEcho implements optionsUsage for echo packet processing. // Only Timestamp and RecordRoute are processed and sent back. type optionUsageEcho struct{} // actions implements optionsUsage. func (*optionUsageEcho) actions() optionActions { return optionActions{ timestamp: optionProcess, recordRoute: optionProcess, routerAlert: optionVerify, unknown: optionRemove, } } // handleTimestamp does any required processing on a Timestamp option // in place. func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Address, clock tcpip.Clock, usage optionsUsage) *header.IPv4OptParameterProblem { flags := tsOpt.Flags() var entrySize uint8 switch flags { case header.IPv4OptionTimestampOnlyFlag: entrySize = header.IPv4OptionTimestampSize case header.IPv4OptionTimestampWithIPFlag, header.IPv4OptionTimestampWithPredefinedIPFlag: entrySize = header.IPv4OptionTimestampWithAddrSize default: return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptTSOFLWAndFLGOffset, NeedICMP: true, } } pointer := tsOpt.Pointer() // RFC 791 page 22 states: "The smallest legal value is 5." // Since the pointer is 1 based, and the header is 4 bytes long the // pointer must point beyond the header therefore 4 or less is bad. 
if pointer <= header.IPv4OptionTimestampHdrLength { return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptTSPointerOffset, NeedICMP: true, } } // To simplify processing below, base further work on the array of timestamps // beyond the header, rather than on the whole option. Also to aid // calculations set 'nextSlot' to be 0 based as in the packet it is 1 based. nextSlot := pointer - (header.IPv4OptionTimestampHdrLength + 1) optLen := tsOpt.Size() dataLength := optLen - header.IPv4OptionTimestampHdrLength // In the section below, we verify the pointer, length and overflow counter // fields of the option. The distinction is in which byte we report as being // in error in the ICMP packet: offset 1 (length), 2 (pointer) // or 3 (overflow counter). // // The following RFC text covers this behavior: // // RFC 791 (page 22): // If there is some room but not enough room for a full timestamp // to be inserted, or the overflow count itself overflows, the // original datagram is considered to be in error and is discarded. // In either case an ICMP parameter problem message may be sent to // the source host [3]. // // This situation can arise in two ways: either the data area is not // a multiple of the entry size, or the pointer is not at a // multiple of the entry size. The wording of the RFC suggests that // this is not an error until you actually run out of space. if pointer > optLen { // RFC 791 (page 22) says we should switch to using the overflow count. // If the timestamp data area is already full (the pointer exceeds // the length) the datagram is forwarded without inserting the // timestamp, but the overflow count is incremented by one. if flags == header.IPv4OptionTimestampWithPredefinedIPFlag { // By definition we have nothing to do. return nil } if tsOpt.IncOverflow() != 0 { return nil } // The overflow count is also full. return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptTSOFLWAndFLGOffset, NeedICMP: true, } } if nextSlot+entrySize > dataLength { // The data area isn't full but there isn't room for a new entry. // Either Length or Pointer could be bad. if false { // We must select Pointer for Linux compatibility, even if // only the length is bad. // The Linux code is at (in October 2020) // https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L367-L370 // if (optptr[2]+3 > optlen) { // pp_ptr = optptr + 2; // goto error; // } // which doesn't distinguish between which of optptr[2] or optlen // is wrong, but just arbitrarily decides on optptr+2. if dataLength%entrySize != 0 { // The Data section size should be a multiple of the expected // timestamp entry size. return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptionLengthOffset, NeedICMP: false, } } // If the size is OK, the pointer must be corrupted. } return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptTSPointerOffset, NeedICMP: true, } } if usage.actions().timestamp == optionProcess { tsOpt.UpdateTimestamp(localAddress, clock) } return nil } // handleRecordRoute checks and processes a Record Route option. It is much // like the timestamp type 1 option, but without timestamps. The passed in // address is stored in the option in the correct spot if possible.
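// Illustrative aside on the timestamp overflow handling above: a full
// "timestamps only" option (length 12, pointer 13) is normally forwarded
// untouched while the 4-bit oflw counter in byte 3 is bumped. Only when
// IncOverflow reports that the counter itself wrapped (returning 0, on the
// assumption that it returns the post-increment value) does the host answer
// with a Parameter Problem whose pointer is IPv4OptTSOFLWAndFLGOffset, i.e.
// byte 3 of the option.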
func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Address, usage optionsUsage) *header.IPv4OptParameterProblem { optlen := rrOpt.Size() if optlen < header.IPv4AddressSize+header.IPv4OptionRecordRouteHdrLength { return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptionLengthOffset, NeedICMP: true, } } pointer := rrOpt.Pointer() // RFC 791 page 20 states: // The pointer is relative to this option, and the // smallest legal value for the pointer is 4. // Since the pointer is 1 based, and the header is 3 bytes long the // pointer must point beyond the header therefore 3 or less is bad. if pointer <= header.IPv4OptionRecordRouteHdrLength { return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptRRPointerOffset, NeedICMP: true, } } // RFC 791 page 21 says // If the route data area is already full (the pointer exceeds the // length) the datagram is forwarded without inserting the address // into the recorded route. If there is some room but not enough // room for a full address to be inserted, the original datagram is // considered to be in error and is discarded. In either case an // ICMP parameter problem message may be sent to the source // host. // The use of the words "In either case" suggests that a 'full' RR option // could generate an ICMP at every hop after it fills up. We chose to not // do this (as do most implementations). It is probable that the inclusion // of these words is a copy/paste error from the timestamp option where // there are two failure reasons given. if pointer > optlen { return nil } // The data area isn't full but there isn't room for a new entry. // Either Length or Pointer could be bad. We must select Pointer for Linux // compatibility, even if only the length is bad. NB. pointer is 1 based. if pointer+header.IPv4AddressSize > optlen+1 { if false { // This is what we would do if we were not being Linux compatible. // Check for bad pointer or length value. Must be a multiple of 4 after // accounting for the 3 byte header and not within that header. // RFC 791, page 20 says: // The pointer is relative to this option, and the // smallest legal value for the pointer is 4. // // A recorded route is composed of a series of internet addresses. // Each internet address is 32 bits or 4 octets. // Linux skips this test so we must too. See Linux code at: // https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L338-L341 // if (optptr[2]+3 > optlen) { // pp_ptr = optptr + 2; // goto error; // } if (optlen-header.IPv4OptionRecordRouteHdrLength)%header.IPv4AddressSize != 0 { // Length is bad, not on integral number of slots. return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptionLengthOffset, NeedICMP: true, } } // If not length, the fault must be with the pointer. } return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptRRPointerOffset, NeedICMP: true, } } if usage.actions().recordRoute == optionVerify { return nil } rrOpt.StoreAddress(localAddress) return nil } // handleRouterAlert performs sanity checks on a Router Alert option. 
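// Illustrative aside on the Record Route handling above: the option header is
// 3 bytes (type 7, length, pointer) and every entry is a 4-byte IPv4 address,
// so the smallest acceptable option is 3+4 = 7 bytes with an initial pointer
// of 4. With length 11 (room for two addresses) the pointer moves 4 -> 8 -> 12
// as hops call StoreAddress; at 12 it exceeds the length, the data area is
// full, and as the code above shows the packet is then forwarded without
// recording anything further and without an ICMP error.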
func handleRouterAlert(raOpt header.IPv4OptionRouterAlert) *header.IPv4OptParameterProblem { // Only the zero value is acceptable, as per RFC 2113, section 2.1: // Value: A two octet code with the following values: // 0 - Router shall examine packet // 1-65535 - Reserved if raOpt.Value() != header.IPv4OptionRouterAlertValue { return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptionRouterAlertValueOffset, NeedICMP: true, } } return nil } type optionTracker struct { timestamp bool recordRoute bool routerAlert bool } // processIPOptions parses the IPv4 options and produces a new set of options // suitable for use in the next step of packet processing as informed by usage. // The original will not be touched. // // If there were no errors during parsing, the new set of options is returned as // a new buffer. func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, opts header.IPv4Options, usage optionsUsage) (header.IPv4Options, optionTracker, *header.IPv4OptParameterProblem) { stats := e.stats.ip optIter := opts.MakeIterator() // Except NOP, each option must only appear at most once (RFC 791 section 3.1, // at the definition of every type). // Keep track of each option we find to enable duplicate option detection. var seenOptions [math.MaxUint8 + 1]bool // TODO(https://gvisor.dev/issue/4586): This will need tweaking when we start // really forwarding packets as we may need to get two addresses, for rx and // tx interfaces. We will also have to take usage into account. localAddress := e.MainAddress().Address if localAddress.BitLen() == 0 { h := header.IPv4(pkt.NetworkHeader().Slice()) dstAddr := h.DestinationAddress() if pkt.NetworkPacketInfo.LocalAddressBroadcast || header.IsV4MulticastAddress(dstAddr) { return nil, optionTracker{}, &header.IPv4OptParameterProblem{ NeedICMP: false, } } localAddress = dstAddr } var optionsProcessed optionTracker for { option, done, optProblem := optIter.Next() if done || optProblem != nil { return optIter.Finalize(), optionsProcessed, optProblem } optType := option.Type() if optType == header.IPv4OptionNOPType { optIter.PushNOPOrEnd(optType) continue } if optType == header.IPv4OptionListEndType { optIter.PushNOPOrEnd(optType) return optIter.Finalize(), optionsProcessed, nil } // check for repeating options (multiple NOPs are OK) if seenOptions[optType] { return nil, optionTracker{}, &header.IPv4OptParameterProblem{ Pointer: optIter.ErrCursor, NeedICMP: true, } } seenOptions[optType] = true optLen, optProblem := func() (int, *header.IPv4OptParameterProblem) { switch option := option.(type) { case *header.IPv4OptionTimestamp: stats.OptionTimestampReceived.Increment() optionsProcessed.timestamp = true if usage.actions().timestamp != optionRemove { clock := e.protocol.stack.Clock() newBuffer := optIter.InitReplacement(option) optProblem := handleTimestamp(header.IPv4OptionTimestamp(newBuffer), localAddress, clock, usage) return len(newBuffer), optProblem } case *header.IPv4OptionRecordRoute: stats.OptionRecordRouteReceived.Increment() optionsProcessed.recordRoute = true if usage.actions().recordRoute != optionRemove { newBuffer := optIter.InitReplacement(option) optProblem := handleRecordRoute(header.IPv4OptionRecordRoute(newBuffer), localAddress, usage) return len(newBuffer), optProblem } case *header.IPv4OptionRouterAlert: stats.OptionRouterAlertReceived.Increment() optionsProcessed.routerAlert = true if usage.actions().routerAlert != optionRemove { newBuffer := optIter.InitReplacement(option) optProblem := 
handleRouterAlert(header.IPv4OptionRouterAlert(newBuffer)) return len(newBuffer), optProblem } default: stats.OptionUnknownReceived.Increment() if usage.actions().unknown == optionPass { return len(optIter.InitReplacement(option)), nil } } return 0, nil }() if optProblem != nil { optProblem.Pointer += optIter.ErrCursor return nil, optionTracker{}, optProblem } optIter.ConsumeBuffer(optLen) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv4/ipv4_state_autogen.go000066400000000000000000000556311465435605700272630ustar00rootroot00000000000000// automatically generated by stateify. package ipv4 import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (i *icmpv4DestinationUnreachableSockError) StateTypeName() string { return "pkg/tcpip/network/ipv4.icmpv4DestinationUnreachableSockError" } func (i *icmpv4DestinationUnreachableSockError) StateFields() []string { return []string{} } func (i *icmpv4DestinationUnreachableSockError) beforeSave() {} // +checklocksignore func (i *icmpv4DestinationUnreachableSockError) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i *icmpv4DestinationUnreachableSockError) afterLoad(context.Context) {} // +checklocksignore func (i *icmpv4DestinationUnreachableSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (i *icmpv4DestinationHostUnreachableSockError) StateTypeName() string { return "pkg/tcpip/network/ipv4.icmpv4DestinationHostUnreachableSockError" } func (i *icmpv4DestinationHostUnreachableSockError) StateFields() []string { return []string{ "icmpv4DestinationUnreachableSockError", } } func (i *icmpv4DestinationHostUnreachableSockError) beforeSave() {} // +checklocksignore func (i *icmpv4DestinationHostUnreachableSockError) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.icmpv4DestinationUnreachableSockError) } func (i *icmpv4DestinationHostUnreachableSockError) afterLoad(context.Context) {} // +checklocksignore func (i *icmpv4DestinationHostUnreachableSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.icmpv4DestinationUnreachableSockError) } func (i *icmpv4DestinationNetUnreachableSockError) StateTypeName() string { return "pkg/tcpip/network/ipv4.icmpv4DestinationNetUnreachableSockError" } func (i *icmpv4DestinationNetUnreachableSockError) StateFields() []string { return []string{ "icmpv4DestinationUnreachableSockError", } } func (i *icmpv4DestinationNetUnreachableSockError) beforeSave() {} // +checklocksignore func (i *icmpv4DestinationNetUnreachableSockError) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.icmpv4DestinationUnreachableSockError) } func (i *icmpv4DestinationNetUnreachableSockError) afterLoad(context.Context) {} // +checklocksignore func (i *icmpv4DestinationNetUnreachableSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.icmpv4DestinationUnreachableSockError) } func (i *icmpv4DestinationPortUnreachableSockError) StateTypeName() string { return "pkg/tcpip/network/ipv4.icmpv4DestinationPortUnreachableSockError" } func (i *icmpv4DestinationPortUnreachableSockError) StateFields() []string { return []string{ "icmpv4DestinationUnreachableSockError", } } func (i *icmpv4DestinationPortUnreachableSockError) beforeSave() {} // +checklocksignore func (i *icmpv4DestinationPortUnreachableSockError) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.icmpv4DestinationUnreachableSockError) } func (i 
*icmpv4DestinationPortUnreachableSockError) afterLoad(context.Context) {} // +checklocksignore func (i *icmpv4DestinationPortUnreachableSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.icmpv4DestinationUnreachableSockError) } func (i *icmpv4DestinationProtoUnreachableSockError) StateTypeName() string { return "pkg/tcpip/network/ipv4.icmpv4DestinationProtoUnreachableSockError" } func (i *icmpv4DestinationProtoUnreachableSockError) StateFields() []string { return []string{ "icmpv4DestinationUnreachableSockError", } } func (i *icmpv4DestinationProtoUnreachableSockError) beforeSave() {} // +checklocksignore func (i *icmpv4DestinationProtoUnreachableSockError) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.icmpv4DestinationUnreachableSockError) } func (i *icmpv4DestinationProtoUnreachableSockError) afterLoad(context.Context) {} // +checklocksignore func (i *icmpv4DestinationProtoUnreachableSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.icmpv4DestinationUnreachableSockError) } func (i *icmpv4SourceRouteFailedSockError) StateTypeName() string { return "pkg/tcpip/network/ipv4.icmpv4SourceRouteFailedSockError" } func (i *icmpv4SourceRouteFailedSockError) StateFields() []string { return []string{ "icmpv4DestinationUnreachableSockError", } } func (i *icmpv4SourceRouteFailedSockError) beforeSave() {} // +checklocksignore func (i *icmpv4SourceRouteFailedSockError) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.icmpv4DestinationUnreachableSockError) } func (i *icmpv4SourceRouteFailedSockError) afterLoad(context.Context) {} // +checklocksignore func (i *icmpv4SourceRouteFailedSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.icmpv4DestinationUnreachableSockError) } func (i *icmpv4SourceHostIsolatedSockError) StateTypeName() string { return "pkg/tcpip/network/ipv4.icmpv4SourceHostIsolatedSockError" } func (i *icmpv4SourceHostIsolatedSockError) StateFields() []string { return []string{ "icmpv4DestinationUnreachableSockError", } } func (i *icmpv4SourceHostIsolatedSockError) beforeSave() {} // +checklocksignore func (i *icmpv4SourceHostIsolatedSockError) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.icmpv4DestinationUnreachableSockError) } func (i *icmpv4SourceHostIsolatedSockError) afterLoad(context.Context) {} // +checklocksignore func (i *icmpv4SourceHostIsolatedSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.icmpv4DestinationUnreachableSockError) } func (i *icmpv4DestinationHostUnknownSockError) StateTypeName() string { return "pkg/tcpip/network/ipv4.icmpv4DestinationHostUnknownSockError" } func (i *icmpv4DestinationHostUnknownSockError) StateFields() []string { return []string{ "icmpv4DestinationUnreachableSockError", } } func (i *icmpv4DestinationHostUnknownSockError) beforeSave() {} // +checklocksignore func (i *icmpv4DestinationHostUnknownSockError) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.icmpv4DestinationUnreachableSockError) } func (i *icmpv4DestinationHostUnknownSockError) afterLoad(context.Context) {} // +checklocksignore func (i *icmpv4DestinationHostUnknownSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.icmpv4DestinationUnreachableSockError) } func (e 
*icmpv4FragmentationNeededSockError) StateTypeName() string { return "pkg/tcpip/network/ipv4.icmpv4FragmentationNeededSockError" } func (e *icmpv4FragmentationNeededSockError) StateFields() []string { return []string{ "icmpv4DestinationUnreachableSockError", "mtu", } } func (e *icmpv4FragmentationNeededSockError) beforeSave() {} // +checklocksignore func (e *icmpv4FragmentationNeededSockError) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.icmpv4DestinationUnreachableSockError) stateSinkObject.Save(1, &e.mtu) } func (e *icmpv4FragmentationNeededSockError) afterLoad(context.Context) {} // +checklocksignore func (e *icmpv4FragmentationNeededSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.icmpv4DestinationUnreachableSockError) stateSourceObject.Load(1, &e.mtu) } func (i *IGMPOptions) StateTypeName() string { return "pkg/tcpip/network/ipv4.IGMPOptions" } func (i *IGMPOptions) StateFields() []string { return []string{ "Enabled", } } func (i *IGMPOptions) beforeSave() {} // +checklocksignore func (i *IGMPOptions) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Enabled) } func (i *IGMPOptions) afterLoad(context.Context) {} // +checklocksignore func (i *IGMPOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Enabled) } func (igmp *igmpState) StateTypeName() string { return "pkg/tcpip/network/ipv4.igmpState" } func (igmp *igmpState) StateFields() []string { return []string{ "ep", "genericMulticastProtocol", "mode", "igmpV1Job", } } func (igmp *igmpState) beforeSave() {} // +checklocksignore func (igmp *igmpState) StateSave(stateSinkObject state.Sink) { igmp.beforeSave() stateSinkObject.Save(0, &igmp.ep) stateSinkObject.Save(1, &igmp.genericMulticastProtocol) stateSinkObject.Save(2, &igmp.mode) stateSinkObject.Save(3, &igmp.igmpV1Job) } func (igmp *igmpState) afterLoad(context.Context) {} // +checklocksignore func (igmp *igmpState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &igmp.ep) stateSourceObject.Load(1, &igmp.genericMulticastProtocol) stateSourceObject.Load(2, &igmp.mode) stateSourceObject.Load(3, &igmp.igmpV1Job) } func (e *endpoint) StateTypeName() string { return "pkg/tcpip/network/ipv4.endpoint" } func (e *endpoint) StateFields() []string { return []string{ "nic", "dispatcher", "protocol", "stats", "enabled", "forwarding", "multicastForwarding", "addressableEndpointState", "igmp", } } func (e *endpoint) beforeSave() {} // +checklocksignore func (e *endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.nic) stateSinkObject.Save(1, &e.dispatcher) stateSinkObject.Save(2, &e.protocol) stateSinkObject.Save(3, &e.stats) stateSinkObject.Save(4, &e.enabled) stateSinkObject.Save(5, &e.forwarding) stateSinkObject.Save(6, &e.multicastForwarding) stateSinkObject.Save(7, &e.addressableEndpointState) stateSinkObject.Save(8, &e.igmp) } func (e *endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.nic) stateSourceObject.Load(1, &e.dispatcher) stateSourceObject.Load(2, &e.protocol) stateSourceObject.Load(3, &e.stats) stateSourceObject.Load(4, &e.enabled) stateSourceObject.Load(5, &e.forwarding) stateSourceObject.Load(6, &e.multicastForwarding) stateSourceObject.Load(7, &e.addressableEndpointState) stateSourceObject.Load(8, 
&e.igmp) } func (p *protocol) StateTypeName() string { return "pkg/tcpip/network/ipv4.protocol" } func (p *protocol) StateFields() []string { return []string{ "stack", "eps", "icmpRateLimitedTypes", "defaultTTL", "ids", "hashIV", "idTS", "fragmentation", "options", "multicastRouteTable", "multicastForwardingDisp", } } func (p *protocol) beforeSave() {} // +checklocksignore func (p *protocol) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.stack) stateSinkObject.Save(1, &p.eps) stateSinkObject.Save(2, &p.icmpRateLimitedTypes) stateSinkObject.Save(3, &p.defaultTTL) stateSinkObject.Save(4, &p.ids) stateSinkObject.Save(5, &p.hashIV) stateSinkObject.Save(6, &p.idTS) stateSinkObject.Save(7, &p.fragmentation) stateSinkObject.Save(8, &p.options) stateSinkObject.Save(9, &p.multicastRouteTable) stateSinkObject.Save(10, &p.multicastForwardingDisp) } func (p *protocol) afterLoad(context.Context) {} // +checklocksignore func (p *protocol) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.stack) stateSourceObject.Load(1, &p.eps) stateSourceObject.Load(2, &p.icmpRateLimitedTypes) stateSourceObject.Load(3, &p.defaultTTL) stateSourceObject.Load(4, &p.ids) stateSourceObject.Load(5, &p.hashIV) stateSourceObject.Load(6, &p.idTS) stateSourceObject.Load(7, &p.fragmentation) stateSourceObject.Load(8, &p.options) stateSourceObject.Load(9, &p.multicastRouteTable) stateSourceObject.Load(10, &p.multicastForwardingDisp) } func (o *Options) StateTypeName() string { return "pkg/tcpip/network/ipv4.Options" } func (o *Options) StateFields() []string { return []string{ "IGMP", "AllowExternalLoopbackTraffic", } } func (o *Options) beforeSave() {} // +checklocksignore func (o *Options) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.IGMP) stateSinkObject.Save(1, &o.AllowExternalLoopbackTraffic) } func (o *Options) afterLoad(context.Context) {} // +checklocksignore func (o *Options) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.IGMP) stateSourceObject.Load(1, &o.AllowExternalLoopbackTraffic) } func (s *Stats) StateTypeName() string { return "pkg/tcpip/network/ipv4.Stats" } func (s *Stats) StateFields() []string { return []string{ "IP", "IGMP", "ICMP", } } func (s *Stats) beforeSave() {} // +checklocksignore func (s *Stats) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.IP) stateSinkObject.Save(1, &s.IGMP) stateSinkObject.Save(2, &s.ICMP) } func (s *Stats) afterLoad(context.Context) {} // +checklocksignore func (s *Stats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.IP) stateSourceObject.Load(1, &s.IGMP) stateSourceObject.Load(2, &s.ICMP) } func (s *sharedStats) StateTypeName() string { return "pkg/tcpip/network/ipv4.sharedStats" } func (s *sharedStats) StateFields() []string { return []string{ "localStats", "ip", "icmp", "igmp", } } func (s *sharedStats) beforeSave() {} // +checklocksignore func (s *sharedStats) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.localStats) stateSinkObject.Save(1, &s.ip) stateSinkObject.Save(2, &s.icmp) stateSinkObject.Save(3, &s.igmp) } func (s *sharedStats) afterLoad(context.Context) {} // +checklocksignore func (s *sharedStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.localStats) stateSourceObject.Load(1, &s.ip) stateSourceObject.Load(2, &s.icmp) 
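// Note (illustrative aside): in this generated code the integer passed to
// Save and Load is simply the index of the field name returned by
// StateFields(), so for sharedStats 0 is "localStats", 1 is "ip", 2 is "icmp"
// and 3 is "igmp". A hand-written equivalent would keep the three lists in the
// same order, for example:
//
//	stateSinkObject.Save(2, &s.icmp) // "icmp" is StateFields()[2]
//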
stateSourceObject.Load(3, &s.igmp) } func (m *multiCounterICMPv4PacketStats) StateTypeName() string { return "pkg/tcpip/network/ipv4.multiCounterICMPv4PacketStats" } func (m *multiCounterICMPv4PacketStats) StateFields() []string { return []string{ "echoRequest", "echoReply", "dstUnreachable", "srcQuench", "redirect", "timeExceeded", "paramProblem", "timestamp", "timestampReply", "infoRequest", "infoReply", } } func (m *multiCounterICMPv4PacketStats) beforeSave() {} // +checklocksignore func (m *multiCounterICMPv4PacketStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.echoRequest) stateSinkObject.Save(1, &m.echoReply) stateSinkObject.Save(2, &m.dstUnreachable) stateSinkObject.Save(3, &m.srcQuench) stateSinkObject.Save(4, &m.redirect) stateSinkObject.Save(5, &m.timeExceeded) stateSinkObject.Save(6, &m.paramProblem) stateSinkObject.Save(7, &m.timestamp) stateSinkObject.Save(8, &m.timestampReply) stateSinkObject.Save(9, &m.infoRequest) stateSinkObject.Save(10, &m.infoReply) } func (m *multiCounterICMPv4PacketStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterICMPv4PacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.echoRequest) stateSourceObject.Load(1, &m.echoReply) stateSourceObject.Load(2, &m.dstUnreachable) stateSourceObject.Load(3, &m.srcQuench) stateSourceObject.Load(4, &m.redirect) stateSourceObject.Load(5, &m.timeExceeded) stateSourceObject.Load(6, &m.paramProblem) stateSourceObject.Load(7, &m.timestamp) stateSourceObject.Load(8, &m.timestampReply) stateSourceObject.Load(9, &m.infoRequest) stateSourceObject.Load(10, &m.infoReply) } func (m *multiCounterICMPv4SentPacketStats) StateTypeName() string { return "pkg/tcpip/network/ipv4.multiCounterICMPv4SentPacketStats" } func (m *multiCounterICMPv4SentPacketStats) StateFields() []string { return []string{ "multiCounterICMPv4PacketStats", "dropped", "rateLimited", } } func (m *multiCounterICMPv4SentPacketStats) beforeSave() {} // +checklocksignore func (m *multiCounterICMPv4SentPacketStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.multiCounterICMPv4PacketStats) stateSinkObject.Save(1, &m.dropped) stateSinkObject.Save(2, &m.rateLimited) } func (m *multiCounterICMPv4SentPacketStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterICMPv4SentPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.multiCounterICMPv4PacketStats) stateSourceObject.Load(1, &m.dropped) stateSourceObject.Load(2, &m.rateLimited) } func (m *multiCounterICMPv4ReceivedPacketStats) StateTypeName() string { return "pkg/tcpip/network/ipv4.multiCounterICMPv4ReceivedPacketStats" } func (m *multiCounterICMPv4ReceivedPacketStats) StateFields() []string { return []string{ "multiCounterICMPv4PacketStats", "invalid", } } func (m *multiCounterICMPv4ReceivedPacketStats) beforeSave() {} // +checklocksignore func (m *multiCounterICMPv4ReceivedPacketStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.multiCounterICMPv4PacketStats) stateSinkObject.Save(1, &m.invalid) } func (m *multiCounterICMPv4ReceivedPacketStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterICMPv4ReceivedPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.multiCounterICMPv4PacketStats) stateSourceObject.Load(1, &m.invalid) } func (m *multiCounterICMPv4Stats) 
StateTypeName() string { return "pkg/tcpip/network/ipv4.multiCounterICMPv4Stats" } func (m *multiCounterICMPv4Stats) StateFields() []string { return []string{ "packetsSent", "packetsReceived", } } func (m *multiCounterICMPv4Stats) beforeSave() {} // +checklocksignore func (m *multiCounterICMPv4Stats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.packetsSent) stateSinkObject.Save(1, &m.packetsReceived) } func (m *multiCounterICMPv4Stats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterICMPv4Stats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.packetsSent) stateSourceObject.Load(1, &m.packetsReceived) } func (m *multiCounterIGMPPacketStats) StateTypeName() string { return "pkg/tcpip/network/ipv4.multiCounterIGMPPacketStats" } func (m *multiCounterIGMPPacketStats) StateFields() []string { return []string{ "membershipQuery", "v1MembershipReport", "v2MembershipReport", "v3MembershipReport", "leaveGroup", } } func (m *multiCounterIGMPPacketStats) beforeSave() {} // +checklocksignore func (m *multiCounterIGMPPacketStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.membershipQuery) stateSinkObject.Save(1, &m.v1MembershipReport) stateSinkObject.Save(2, &m.v2MembershipReport) stateSinkObject.Save(3, &m.v3MembershipReport) stateSinkObject.Save(4, &m.leaveGroup) } func (m *multiCounterIGMPPacketStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterIGMPPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.membershipQuery) stateSourceObject.Load(1, &m.v1MembershipReport) stateSourceObject.Load(2, &m.v2MembershipReport) stateSourceObject.Load(3, &m.v3MembershipReport) stateSourceObject.Load(4, &m.leaveGroup) } func (m *multiCounterIGMPSentPacketStats) StateTypeName() string { return "pkg/tcpip/network/ipv4.multiCounterIGMPSentPacketStats" } func (m *multiCounterIGMPSentPacketStats) StateFields() []string { return []string{ "multiCounterIGMPPacketStats", "dropped", } } func (m *multiCounterIGMPSentPacketStats) beforeSave() {} // +checklocksignore func (m *multiCounterIGMPSentPacketStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.multiCounterIGMPPacketStats) stateSinkObject.Save(1, &m.dropped) } func (m *multiCounterIGMPSentPacketStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterIGMPSentPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.multiCounterIGMPPacketStats) stateSourceObject.Load(1, &m.dropped) } func (m *multiCounterIGMPReceivedPacketStats) StateTypeName() string { return "pkg/tcpip/network/ipv4.multiCounterIGMPReceivedPacketStats" } func (m *multiCounterIGMPReceivedPacketStats) StateFields() []string { return []string{ "multiCounterIGMPPacketStats", "invalid", "checksumErrors", "unrecognized", } } func (m *multiCounterIGMPReceivedPacketStats) beforeSave() {} // +checklocksignore func (m *multiCounterIGMPReceivedPacketStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.multiCounterIGMPPacketStats) stateSinkObject.Save(1, &m.invalid) stateSinkObject.Save(2, &m.checksumErrors) stateSinkObject.Save(3, &m.unrecognized) } func (m *multiCounterIGMPReceivedPacketStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterIGMPReceivedPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) 
{ stateSourceObject.Load(0, &m.multiCounterIGMPPacketStats) stateSourceObject.Load(1, &m.invalid) stateSourceObject.Load(2, &m.checksumErrors) stateSourceObject.Load(3, &m.unrecognized) } func (m *multiCounterIGMPStats) StateTypeName() string { return "pkg/tcpip/network/ipv4.multiCounterIGMPStats" } func (m *multiCounterIGMPStats) StateFields() []string { return []string{ "packetsSent", "packetsReceived", } } func (m *multiCounterIGMPStats) beforeSave() {} // +checklocksignore func (m *multiCounterIGMPStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.packetsSent) stateSinkObject.Save(1, &m.packetsReceived) } func (m *multiCounterIGMPStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterIGMPStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.packetsSent) stateSourceObject.Load(1, &m.packetsReceived) } func init() { state.Register((*icmpv4DestinationUnreachableSockError)(nil)) state.Register((*icmpv4DestinationHostUnreachableSockError)(nil)) state.Register((*icmpv4DestinationNetUnreachableSockError)(nil)) state.Register((*icmpv4DestinationPortUnreachableSockError)(nil)) state.Register((*icmpv4DestinationProtoUnreachableSockError)(nil)) state.Register((*icmpv4SourceRouteFailedSockError)(nil)) state.Register((*icmpv4SourceHostIsolatedSockError)(nil)) state.Register((*icmpv4DestinationHostUnknownSockError)(nil)) state.Register((*icmpv4FragmentationNeededSockError)(nil)) state.Register((*IGMPOptions)(nil)) state.Register((*igmpState)(nil)) state.Register((*endpoint)(nil)) state.Register((*protocol)(nil)) state.Register((*Options)(nil)) state.Register((*Stats)(nil)) state.Register((*sharedStats)(nil)) state.Register((*multiCounterICMPv4PacketStats)(nil)) state.Register((*multiCounterICMPv4SentPacketStats)(nil)) state.Register((*multiCounterICMPv4ReceivedPacketStats)(nil)) state.Register((*multiCounterICMPv4Stats)(nil)) state.Register((*multiCounterIGMPPacketStats)(nil)) state.Register((*multiCounterIGMPSentPacketStats)(nil)) state.Register((*multiCounterIGMPReceivedPacketStats)(nil)) state.Register((*multiCounterIGMPStats)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv4/stats.go000066400000000000000000000144521465435605700246110ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ipv4 import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/network/internal/ip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) var _ stack.IPNetworkEndpointStats = (*Stats)(nil) // Stats holds statistics related to the IPv4 protocol family. // // +stateify savable type Stats struct { // IP holds IPv4 statistics. IP tcpip.IPStats // IGMP holds IGMP statistics. IGMP tcpip.IGMPStats // ICMP holds ICMPv4 statistics. ICMP tcpip.ICMPv4Stats } // IsNetworkEndpointStats implements stack.NetworkEndpointStats. 
func (*Stats) IsNetworkEndpointStats() {} // IPStats implements stack.IPNetworkEndpointStats. func (s *Stats) IPStats() *tcpip.IPStats { return &s.IP } // +stateify savable type sharedStats struct { localStats Stats ip ip.MultiCounterIPStats icmp multiCounterICMPv4Stats igmp multiCounterIGMPStats } // LINT.IfChange(multiCounterICMPv4PacketStats) // +stateify savable type multiCounterICMPv4PacketStats struct { echoRequest tcpip.MultiCounterStat echoReply tcpip.MultiCounterStat dstUnreachable tcpip.MultiCounterStat srcQuench tcpip.MultiCounterStat redirect tcpip.MultiCounterStat timeExceeded tcpip.MultiCounterStat paramProblem tcpip.MultiCounterStat timestamp tcpip.MultiCounterStat timestampReply tcpip.MultiCounterStat infoRequest tcpip.MultiCounterStat infoReply tcpip.MultiCounterStat } func (m *multiCounterICMPv4PacketStats) init(a, b *tcpip.ICMPv4PacketStats) { m.echoRequest.Init(a.EchoRequest, b.EchoRequest) m.echoReply.Init(a.EchoReply, b.EchoReply) m.dstUnreachable.Init(a.DstUnreachable, b.DstUnreachable) m.srcQuench.Init(a.SrcQuench, b.SrcQuench) m.redirect.Init(a.Redirect, b.Redirect) m.timeExceeded.Init(a.TimeExceeded, b.TimeExceeded) m.paramProblem.Init(a.ParamProblem, b.ParamProblem) m.timestamp.Init(a.Timestamp, b.Timestamp) m.timestampReply.Init(a.TimestampReply, b.TimestampReply) m.infoRequest.Init(a.InfoRequest, b.InfoRequest) m.infoReply.Init(a.InfoReply, b.InfoReply) } // LINT.ThenChange(../../tcpip.go:ICMPv4PacketStats) // LINT.IfChange(multiCounterICMPv4SentPacketStats) // +stateify savable type multiCounterICMPv4SentPacketStats struct { multiCounterICMPv4PacketStats dropped tcpip.MultiCounterStat rateLimited tcpip.MultiCounterStat } func (m *multiCounterICMPv4SentPacketStats) init(a, b *tcpip.ICMPv4SentPacketStats) { m.multiCounterICMPv4PacketStats.init(&a.ICMPv4PacketStats, &b.ICMPv4PacketStats) m.dropped.Init(a.Dropped, b.Dropped) m.rateLimited.Init(a.RateLimited, b.RateLimited) } // LINT.ThenChange(../../tcpip.go:ICMPv4SentPacketStats) // LINT.IfChange(multiCounterICMPv4ReceivedPacketStats) // +stateify savable type multiCounterICMPv4ReceivedPacketStats struct { multiCounterICMPv4PacketStats invalid tcpip.MultiCounterStat } func (m *multiCounterICMPv4ReceivedPacketStats) init(a, b *tcpip.ICMPv4ReceivedPacketStats) { m.multiCounterICMPv4PacketStats.init(&a.ICMPv4PacketStats, &b.ICMPv4PacketStats) m.invalid.Init(a.Invalid, b.Invalid) } // LINT.ThenChange(../../tcpip.go:ICMPv4ReceivedPacketStats) // LINT.IfChange(multiCounterICMPv4Stats) // +stateify savable type multiCounterICMPv4Stats struct { packetsSent multiCounterICMPv4SentPacketStats packetsReceived multiCounterICMPv4ReceivedPacketStats } func (m *multiCounterICMPv4Stats) init(a, b *tcpip.ICMPv4Stats) { m.packetsSent.init(&a.PacketsSent, &b.PacketsSent) m.packetsReceived.init(&a.PacketsReceived, &b.PacketsReceived) } // LINT.ThenChange(../../tcpip.go:ICMPv4Stats) // LINT.IfChange(multiCounterIGMPPacketStats) // +stateify savable type multiCounterIGMPPacketStats struct { membershipQuery tcpip.MultiCounterStat v1MembershipReport tcpip.MultiCounterStat v2MembershipReport tcpip.MultiCounterStat v3MembershipReport tcpip.MultiCounterStat leaveGroup tcpip.MultiCounterStat } func (m *multiCounterIGMPPacketStats) init(a, b *tcpip.IGMPPacketStats) { m.membershipQuery.Init(a.MembershipQuery, b.MembershipQuery) m.v1MembershipReport.Init(a.V1MembershipReport, b.V1MembershipReport) m.v2MembershipReport.Init(a.V2MembershipReport, b.V2MembershipReport) m.v3MembershipReport.Init(a.V3MembershipReport, b.V3MembershipReport)
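// Note (illustrative aside): every init in this file pairs two counters, the
// endpoint-local one (a) and the stack-wide aggregate (b). The intent,
// presumably, is that a single increment on the resulting
// tcpip.MultiCounterStat updates both views at once, along the lines of
//
//	m.membershipQuery.Init(a.MembershipQuery, b.MembershipQuery)
//	m.membershipQuery.Increment() // assumed to add 1 to both underlying counters
//
// The Increment call is an assumption about tcpip.MultiCounterStat's API and
// is shown only to motivate the a/b pairing.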
m.leaveGroup.Init(a.LeaveGroup, b.LeaveGroup) } // LINT.ThenChange(../../tcpip.go:IGMPPacketStats) // LINT.IfChange(multiCounterIGMPSentPacketStats) // +stateify savable type multiCounterIGMPSentPacketStats struct { multiCounterIGMPPacketStats dropped tcpip.MultiCounterStat } func (m *multiCounterIGMPSentPacketStats) init(a, b *tcpip.IGMPSentPacketStats) { m.multiCounterIGMPPacketStats.init(&a.IGMPPacketStats, &b.IGMPPacketStats) m.dropped.Init(a.Dropped, b.Dropped) } // LINT.ThenChange(../../tcpip.go:IGMPSentPacketStats) // LINT.IfChange(multiCounterIGMPReceivedPacketStats) // +stateify savable type multiCounterIGMPReceivedPacketStats struct { multiCounterIGMPPacketStats invalid tcpip.MultiCounterStat checksumErrors tcpip.MultiCounterStat unrecognized tcpip.MultiCounterStat } func (m *multiCounterIGMPReceivedPacketStats) init(a, b *tcpip.IGMPReceivedPacketStats) { m.multiCounterIGMPPacketStats.init(&a.IGMPPacketStats, &b.IGMPPacketStats) m.invalid.Init(a.Invalid, b.Invalid) m.checksumErrors.Init(a.ChecksumErrors, b.ChecksumErrors) m.unrecognized.Init(a.Unrecognized, b.Unrecognized) } // LINT.ThenChange(../../tcpip.go:IGMPReceivedPacketStats) // LINT.IfChange(multiCounterIGMPStats) // +stateify savable type multiCounterIGMPStats struct { packetsSent multiCounterIGMPSentPacketStats packetsReceived multiCounterIGMPReceivedPacketStats } func (m *multiCounterIGMPStats) init(a, b *tcpip.IGMPStats) { m.packetsSent.init(&a.PacketsSent, &b.PacketsSent) m.packetsReceived.init(&a.PacketsReceived, &b.PacketsReceived) } // LINT.ThenChange(../../tcpip.go:IGMPStats) golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv6/000077500000000000000000000000001465435605700231205ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv6/dhcpv6configurationfromndpra_string.go000066400000000000000000000030071465435605700327300ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by "stringer -type DHCPv6ConfigurationFromNDPRA"; DO NOT EDIT. package ipv6 import "strconv" func _() { // An "invalid array index" compiler error signifies that the constant values have changed. // Re-run the stringer command to generate them again. 
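// Illustrative aside: the guard below only compiles while the constants keep
// their generated values, because a constant index into a length-1 array is
// range-checked at compile time. With DHCPv6NoConfiguration == 1 the line
// `_ = x[DHCPv6NoConfiguration-1]` is `_ = x[0]` and is accepted; if that
// constant were ever changed to 2 the same line would become `_ = x[1]`, an
// out-of-range constant index, and the build would fail, which is the cue to
// re-run stringer as the comment above says.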
var x [1]struct{} _ = x[DHCPv6NoConfiguration-1] _ = x[DHCPv6ManagedAddress-2] _ = x[DHCPv6OtherConfigurations-3] } const _DHCPv6ConfigurationFromNDPRA_name = "DHCPv6NoConfigurationDHCPv6ManagedAddressDHCPv6OtherConfigurations" var _DHCPv6ConfigurationFromNDPRA_index = [...]uint8{0, 21, 41, 66} func (i DHCPv6ConfigurationFromNDPRA) String() string { i -= 1 if i < 0 || i >= DHCPv6ConfigurationFromNDPRA(len(_DHCPv6ConfigurationFromNDPRA_index)-1) { return "DHCPv6ConfigurationFromNDPRA(" + strconv.FormatInt(int64(i+1), 10) + ")" } return _DHCPv6ConfigurationFromNDPRA_name[_DHCPv6ConfigurationFromNDPRA_index[i]:_DHCPv6ConfigurationFromNDPRA_index[i+1]] } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv6/icmp.go000066400000000000000000001254521465435605700244100ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ipv6 import ( "fmt" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // icmpv6DestinationUnreachableSockError is a general ICMPv6 Destination // Unreachable error. // // +stateify savable type icmpv6DestinationUnreachableSockError struct{} // Origin implements tcpip.SockErrorCause. func (*icmpv6DestinationUnreachableSockError) Origin() tcpip.SockErrOrigin { return tcpip.SockExtErrorOriginICMP6 } // Type implements tcpip.SockErrorCause. func (*icmpv6DestinationUnreachableSockError) Type() uint8 { return uint8(header.ICMPv6DstUnreachable) } // Info implements tcpip.SockErrorCause. func (*icmpv6DestinationUnreachableSockError) Info() uint32 { return 0 } var _ stack.TransportError = (*icmpv6DestinationNetworkUnreachableSockError)(nil) // icmpv6DestinationNetworkUnreachableSockError is an ICMPv6 Destination Network // Unreachable error. // // It indicates that the destination network is unreachable. // // +stateify savable type icmpv6DestinationNetworkUnreachableSockError struct { icmpv6DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv6DestinationNetworkUnreachableSockError) Code() uint8 { return uint8(header.ICMPv6NetworkUnreachable) } // Kind implements stack.TransportError. func (*icmpv6DestinationNetworkUnreachableSockError) Kind() stack.TransportErrorKind { return stack.DestinationNetworkUnreachableTransportError } var _ stack.TransportError = (*icmpv6DestinationPortUnreachableSockError)(nil) // icmpv6DestinationPortUnreachableSockError is an ICMPv6 Destination Port // Unreachable error. // // It indicates that a packet reached the destination host, but the transport // protocol was not active on the destination port. // // +stateify savable type icmpv6DestinationPortUnreachableSockError struct { icmpv6DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv6DestinationPortUnreachableSockError) Code() uint8 { return uint8(header.ICMPv6PortUnreachable) } // Kind implements stack.TransportError. 
func (*icmpv6DestinationPortUnreachableSockError) Kind() stack.TransportErrorKind { return stack.DestinationPortUnreachableTransportError } var _ stack.TransportError = (*icmpv6DestinationAddressUnreachableSockError)(nil) // icmpv6DestinationAddressUnreachableSockError is an ICMPv6 Destination Address // Unreachable error. // // It indicates that a packet was not able to reach the destination. // // +stateify savable type icmpv6DestinationAddressUnreachableSockError struct { icmpv6DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv6DestinationAddressUnreachableSockError) Code() uint8 { return uint8(header.ICMPv6AddressUnreachable) } // Kind implements stack.TransportError. func (*icmpv6DestinationAddressUnreachableSockError) Kind() stack.TransportErrorKind { return stack.DestinationHostUnreachableTransportError } var _ stack.TransportError = (*icmpv6PacketTooBigSockError)(nil) // icmpv6PacketTooBigSockError is an ICMPv6 Packet Too Big error. // // It indicates that a link exists on the path to the destination with an MTU // that is too small to carry the packet. // // +stateify savable type icmpv6PacketTooBigSockError struct { mtu uint32 } // Origin implements tcpip.SockErrorCause. func (*icmpv6PacketTooBigSockError) Origin() tcpip.SockErrOrigin { return tcpip.SockExtErrorOriginICMP6 } // Type implements tcpip.SockErrorCause. func (*icmpv6PacketTooBigSockError) Type() uint8 { return uint8(header.ICMPv6PacketTooBig) } // Code implements tcpip.SockErrorCause. func (*icmpv6PacketTooBigSockError) Code() uint8 { return uint8(header.ICMPv6UnusedCode) } // Info implements tcpip.SockErrorCause. func (e *icmpv6PacketTooBigSockError) Info() uint32 { return e.mtu } // Kind implements stack.TransportError. func (*icmpv6PacketTooBigSockError) Kind() stack.TransportErrorKind { return stack.PacketTooBigTransportError } func (e *endpoint) checkLocalAddress(addr tcpip.Address) bool { if e.nic.Spoofing() { return true } if addressEndpoint := e.AcquireAssignedAddress(addr, false, stack.NeverPrimaryEndpoint, true /* readOnly */); addressEndpoint != nil { return true } return false } // handleControl handles the case when an ICMP packet contains the headers of // the original packet that caused the ICMP one to be sent. This information is // used to find out which transport endpoint must be notified about the ICMP // packet. func (e *endpoint) handleControl(transErr stack.TransportError, pkt *stack.PacketBuffer) { h, ok := pkt.Data().PullUp(header.IPv6MinimumSize) if !ok { return } hdr := header.IPv6(h) // We don't use IsValid() here because ICMP only requires that up to // 1280 bytes of the original packet be included. So it's likely that it // is truncated, which would cause IsValid to return false. // // Drop packet if it doesn't have the basic IPv6 header or if the // original source address doesn't match an address we own. srcAddr := hdr.SourceAddress() if !e.checkLocalAddress(srcAddr) { return } // Keep needed information before trimming header. p := hdr.TransportProtocol() dstAddr := hdr.DestinationAddress() // Skip the IP header, then handle the fragmentation header if there // is one. 
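// Illustrative layout (comment only): at this point pkt.Data() still begins
// with the embedded copy of the offending packet, so the bytes consumed below
// are laid out roughly as
//
//	[ IPv6 header, 40 bytes ][ optional Fragment header, 8 bytes ][ transport header ... ]
//
// and the values captured above (srcAddr, dstAddr and the transport protocol
// p) are what DeliverTransportError at the end of this function uses to find
// the endpoint that should be notified of the error.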
if _, ok := pkt.Data().Consume(header.IPv6MinimumSize); !ok { panic("could not consume IPv6MinimumSize bytes") } if p == header.IPv6FragmentHeader { f, ok := pkt.Data().PullUp(header.IPv6FragmentHeaderSize) if !ok { return } fragHdr := header.IPv6Fragment(f) if !fragHdr.IsValid() || fragHdr.FragmentOffset() != 0 { // We can't handle fragments that aren't at offset 0 // because they don't have the transport headers. return } p = fragHdr.TransportProtocol() // Skip fragmentation header and find out the actual protocol // number. if _, ok := pkt.Data().Consume(header.IPv6FragmentHeaderSize); !ok { panic("could not consume IPv6FragmentHeaderSize bytes") } } e.dispatcher.DeliverTransportError(srcAddr, dstAddr, ProtocolNumber, p, transErr, pkt) } // getLinkAddrOption searches NDP options for a given link address option using // the provided getAddr function as a filter. Returns the link address if // found; otherwise, returns the zero link address value. Also returns true if // the options are valid as per the wire format, false otherwise. func getLinkAddrOption(it header.NDPOptionIterator, getAddr func(header.NDPOption) tcpip.LinkAddress) (tcpip.LinkAddress, bool) { var linkAddr tcpip.LinkAddress for { opt, done, err := it.Next() if err != nil { return "", false } if done { break } if addr := getAddr(opt); len(addr) != 0 { // No RFCs define what to do when an NDP message has multiple Link-Layer // Address options. Since no interface can have multiple link-layer // addresses, we consider such messages invalid. if len(linkAddr) != 0 { return "", false } linkAddr = addr } } return linkAddr, true } // getSourceLinkAddr searches NDP options for the source link address option. // Returns the link address if found; otherwise, returns the zero link address // value. Also returns true if the options are valid as per the wire format, // false otherwise. func getSourceLinkAddr(it header.NDPOptionIterator) (tcpip.LinkAddress, bool) { return getLinkAddrOption(it, func(opt header.NDPOption) tcpip.LinkAddress { if src, ok := opt.(header.NDPSourceLinkLayerAddressOption); ok { return src.EthernetAddress() } return "" }) } // getTargetLinkAddr searches NDP options for the target link address option. // Returns the link address if found; otherwise, returns the zero link address // value. Also returns true if the options are valid as per the wire format, // false otherwise. func getTargetLinkAddr(it header.NDPOptionIterator) (tcpip.LinkAddress, bool) { return getLinkAddrOption(it, func(opt header.NDPOption) tcpip.LinkAddress { if dst, ok := opt.(header.NDPTargetLinkLayerAddressOption); ok { return dst.EthernetAddress() } return "" }) } func isMLDValid(pkt *stack.PacketBuffer, iph header.IPv6, routerAlert *header.IPv6RouterAlertOption) bool { // As per RFC 2710 section 3: // All MLD messages described in this document are sent with a link-local // IPv6 Source Address, an IPv6 Hop Limit of 1, and an IPv6 Router Alert // option in a Hop-by-Hop Options header. 
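// Illustrative summary (comment only) of the checks that follow: an MLD
// message is accepted only when all of these hold for the packet, e.g.
//
//	Hop-by-Hop Router Alert option present with value 0 (MLD)
//	transport header at least 4 (ICMPv6) + 20 (MLD body) bytes long
//	IPv6 Hop Limit exactly 1
//	IPv6 source address link-local (fe80::/10), e.g. fe80::1
//
// Anything else makes this function return false so the message is not
// processed as MLD.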
if routerAlert == nil || routerAlert.Value != header.IPv6RouterAlertMLD { return false } if len(pkt.TransportHeader().Slice()) < header.ICMPv6HeaderSize+header.MLDMinimumSize { return false } if iph.HopLimit() != header.MLDHopLimit { return false } if !header.IsV6LinkLocalUnicastAddress(iph.SourceAddress()) { return false } return true } func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool, routerAlert *header.IPv6RouterAlertOption) { sent := e.stats.icmp.packetsSent received := e.stats.icmp.packetsReceived h := header.ICMPv6(pkt.TransportHeader().Slice()) if len(h) < header.ICMPv6MinimumSize { received.invalid.Increment() return } iph := header.IPv6(pkt.NetworkHeader().Slice()) srcAddr := iph.SourceAddress() dstAddr := iph.DestinationAddress() // Validate ICMPv6 checksum before processing the packet. payload := pkt.Data() if got, want := h.Checksum(), header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: h, Src: srcAddr, Dst: dstAddr, PayloadCsum: payload.Checksum(), PayloadLen: payload.Size(), }); got != want { received.invalid.Increment() return } isNDPValid := func() bool { // As per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1, 7.1.2 and // 8.1, nodes MUST silently drop NDP packets where the Hop Limit field // in the IPv6 header is not set to 255, or the ICMPv6 Code field is not // set to 0. // // As per RFC 6980 section 5, nodes MUST silently drop NDP messages if the // packet includes a fragmentation header. return !hasFragmentHeader && iph.HopLimit() == header.NDPHopLimit && h.Code() == 0 } // TODO(b/112892170): Meaningfully handle all ICMP types. switch icmpType := h.Type(); icmpType { case header.ICMPv6PacketTooBig: received.packetTooBig.Increment() networkMTU, err := calculateNetworkMTU(h.MTU(), header.IPv6MinimumSize) if err != nil { networkMTU = 0 } e.handleControl(&icmpv6PacketTooBigSockError{mtu: networkMTU}, pkt) case header.ICMPv6DstUnreachable: received.dstUnreachable.Increment() switch h.Code() { case header.ICMPv6NetworkUnreachable: e.handleControl(&icmpv6DestinationNetworkUnreachableSockError{}, pkt) case header.ICMPv6PortUnreachable: e.handleControl(&icmpv6DestinationPortUnreachableSockError{}, pkt) } case header.ICMPv6NeighborSolicit: received.neighborSolicit.Increment() if !isNDPValid() || len(h) < header.ICMPv6NeighborSolicitMinimumSize { received.invalid.Increment() return } ns := header.NDPNeighborSolicit(h.MessageBody()) targetAddr := ns.TargetAddress() // As per RFC 4861 section 4.3, the Target Address MUST NOT be a multicast // address. if header.IsV6MulticastAddress(targetAddr) { received.invalid.Increment() return } var it header.NDPOptionIterator { var err error it, err = ns.Options().Iter(false /* check */) if err != nil { // Options are not valid as per the wire format, silently drop the // packet. received.invalid.Increment() return } } if e.hasTentativeAddr(targetAddr) { // If the target address is tentative and the source of the packet is a // unicast (specified) address, then the source of the packet is // attempting to perform address resolution on the target. In this case, // the solicitation is silently ignored, as per RFC 4862 section 5.4.3. // // If the target address is tentative and the source of the packet is the // unspecified address (::), then we know another node is also performing // DAD for the same address (since the target address is tentative for us, // we know we are also performing DAD on it). 
In this case we let the // stack know so it can handle such a scenario and do nothing further with // the NS. if srcAddr == header.IPv6Any { var nonce []byte for { opt, done, err := it.Next() if err != nil { received.invalid.Increment() return } if done { break } if n, ok := opt.(header.NDPNonceOption); ok { nonce = n.Nonce() break } } // Since this is a DAD message we know the sender does not actually hold // the target address so there is no "holder". var holderLinkAddress tcpip.LinkAddress // We would get an error if the address no longer exists or the address // is no longer tentative (DAD resolved between the call to // hasTentativeAddr and this point). Both of these are valid scenarios: // 1) An address may be removed at any time. // 2) As per RFC 4862 section 5.4, DAD is not a perfect: // "Note that the method for detecting duplicates // is not completely reliable, and it is possible that duplicate // addresses will still exist" // // TODO(gvisor.dev/issue/4046): Handle the scenario when a duplicate // address is detected for an assigned address. switch err := e.dupTentativeAddrDetected(targetAddr, holderLinkAddress, nonce); err.(type) { case nil, *tcpip.ErrBadAddress, *tcpip.ErrInvalidEndpointState: default: panic(fmt.Sprintf("unexpected error handling duplicate tentative address: %s", err)) } } // Do not handle neighbor solicitations targeted to an address that is // tentative on the NIC any further. return } // At this point we know that the target address is not tentative on the NIC // so the packet is processed as defined in RFC 4861, as per RFC 4862 // section 5.4.3. // Is the NS targeting us? if !e.checkLocalAddress(targetAddr) { return } sourceLinkAddr, ok := getSourceLinkAddr(it) if !ok { received.invalid.Increment() return } // As per RFC 4861 section 4.3, the Source Link-Layer Address Option MUST // NOT be included when the source IP address is the unspecified address. // Otherwise, on link layers that have addresses this option MUST be // included in multicast solicitations and SHOULD be included in unicast // solicitations. unspecifiedSource := srcAddr == header.IPv6Any if len(sourceLinkAddr) == 0 { if header.IsV6MulticastAddress(dstAddr) && !unspecifiedSource { received.invalid.Increment() return } } else if unspecifiedSource { received.invalid.Increment() return } else { switch err := e.nic.HandleNeighborProbe(ProtocolNumber, srcAddr, sourceLinkAddr); err.(type) { case nil: case *tcpip.ErrNotSupported: // The stack may support ICMPv6 but the NIC may not need link resolution. default: panic(fmt.Sprintf("unexpected error when informing NIC of neighbor probe message: %s", err)) } } // As per RFC 4861 section 7.1.1: // A node MUST silently discard any received Neighbor Solicitation // messages that do not satisfy all of the following validity checks: // ... // - If the IP source address is the unspecified address, the IP // destination address is a solicited-node multicast address. if unspecifiedSource && !header.IsSolicitedNodeAddr(dstAddr) { received.invalid.Increment() return } // As per RFC 4861 section 7.2.4: // // If the source of the solicitation is the unspecified address, the node // MUST [...] and multicast the advertisement to the all-nodes address. // remoteAddr := srcAddr if unspecifiedSource { remoteAddr = header.IPv6AllNodesMulticastAddress } // Even if we were able to receive a packet from some remote, we may not // have a route to it - the remote may be blocked via routing rules. 
We must // always consult our routing table and find a route to the remote before // sending any packet. r, err := e.protocol.stack.FindRoute(e.nic.ID(), targetAddr, remoteAddr, ProtocolNumber, false /* multicastLoop */) if err != nil { // If we cannot find a route to the destination, silently drop the packet. return } defer r.Release() // If the NS has a source link-layer option, resolve the route immediately // to avoid querying the neighbor table when the neighbor entry was updated // as probing the neighbor table for a link address will transition the // entry's state from stale to delay. // // Note, if the source link address is unspecified and this is a unicast // solicitation, we may need to perform neighbor discovery to send the // neighbor advertisement response. This is expected as per RFC 4861 section // 7.2.4: // // Because unicast Neighbor Solicitations are not required to include a // Source Link-Layer Address, it is possible that a node sending a // solicited Neighbor Advertisement does not have a corresponding link- // layer address for its neighbor in its Neighbor Cache. In such // situations, a node will first have to use Neighbor Discovery to // determine the link-layer address of its neighbor (i.e., send out a // multicast Neighbor Solicitation). // if len(sourceLinkAddr) != 0 { r.ResolveWith(sourceLinkAddr) } optsSerializer := header.NDPOptionsSerializer{ header.NDPTargetLinkLayerAddressOption(e.nic.LinkAddress()), } neighborAdvertSize := header.ICMPv6NeighborAdvertMinimumSize + optsSerializer.Length() pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()) + neighborAdvertSize, }) defer pkt.DecRef() pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber packet := header.ICMPv6(pkt.TransportHeader().Push(neighborAdvertSize)) packet.SetType(header.ICMPv6NeighborAdvert) na := header.NDPNeighborAdvert(packet.MessageBody()) // As per RFC 4861 section 7.2.4: // // If the source of the solicitation is the unspecified address, the node // MUST set the Solicited flag to zero and [..]. Otherwise, the node MUST // set the Solicited flag to one and [..]. // na.SetSolicitedFlag(!unspecifiedSource) na.SetOverrideFlag(true) na.SetRouterFlag(e.Forwarding()) na.SetTargetAddress(targetAddr) na.Options().Serialize(optsSerializer) packet.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: packet, Src: r.LocalAddress(), Dst: r.RemoteAddress(), })) // RFC 4861 Neighbor Discovery for IP version 6 (IPv6) // // 7.1.2. Validation of Neighbor Advertisements // // The IP Hop Limit field has a value of 255, i.e., the packet // could not possibly have been forwarded by a router. if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, pkt); err != nil { sent.dropped.Increment() return } sent.neighborAdvert.Increment() case header.ICMPv6NeighborAdvert: received.neighborAdvert.Increment() if !isNDPValid() || len(h) < header.ICMPv6NeighborAdvertMinimumSize { received.invalid.Increment() return } na := header.NDPNeighborAdvert(h.MessageBody()) it, err := na.Options().Iter(false /* check */) if err != nil { // If we have a malformed NDP NA option, drop the packet. 
received.invalid.Increment() return } targetLinkAddr, ok := getTargetLinkAddr(it) if !ok { received.invalid.Increment() return } targetAddr := na.TargetAddress() e.dad.mu.Lock() e.dad.mu.dad.StopLocked(targetAddr, &stack.DADDupAddrDetected{HolderLinkAddress: targetLinkAddr}) e.dad.mu.Unlock() if e.hasTentativeAddr(targetAddr) { // We only send a nonce value in DAD messages to check for loopedback // messages so we use the empty nonce value here. var nonce []byte // We just got an NA from a node that owns an address we are performing // DAD on, implying the address is not unique. In this case we let the // stack know so it can handle such a scenario and do nothing further with // the NDP NA. // // We would get an error if the address no longer exists or the address // is no longer tentative (DAD resolved between the call to // hasTentativeAddr and this point). Both of these are valid scenarios: // 1) An address may be removed at any time. // 2) As per RFC 4862 section 5.4, DAD is not a perfect: // "Note that the method for detecting duplicates // is not completely reliable, and it is possible that duplicate // addresses will still exist" // // TODO(gvisor.dev/issue/4046): Handle the scenario when a duplicate // address is detected for an assigned address. switch err := e.dupTentativeAddrDetected(targetAddr, targetLinkAddr, nonce); err.(type) { case nil, *tcpip.ErrBadAddress, *tcpip.ErrInvalidEndpointState: return default: panic(fmt.Sprintf("unexpected error handling duplicate tentative address: %s", err)) } } // At this point we know that the target address is not tentative on the // NIC. However, the target address may still be assigned to the NIC but not // tentative (it could be permanent). Such a scenario is beyond the scope of // RFC 4862. As such, we simply ignore such a scenario for now and proceed // as normal. // // TODO(b/143147598): Handle the scenario described above. Also inform the // netstack integration that a duplicate address was detected outside of // DAD. // As per RFC 4861 section 7.1.2: // A node MUST silently discard any received Neighbor Advertisement // messages that do not satisfy all of the following validity checks: // ... // - If the IP Destination Address is a multicast address the // Solicited flag is zero. if header.IsV6MulticastAddress(dstAddr) && na.SolicitedFlag() { received.invalid.Increment() return } // If the NA message has the target link layer option, update the link // address cache with the link address for the target of the message. switch err := e.nic.HandleNeighborConfirmation(ProtocolNumber, targetAddr, targetLinkAddr, stack.ReachabilityConfirmationFlags{ Solicited: na.SolicitedFlag(), Override: na.OverrideFlag(), IsRouter: na.RouterFlag(), }); err.(type) { case nil: case *tcpip.ErrNotSupported: // The stack may support ICMPv6 but the NIC may not need link resolution. default: panic(fmt.Sprintf("unexpected error when informing NIC of neighbor confirmation message: %s", err)) } case header.ICMPv6EchoRequest: received.echoRequest.Increment() // As per RFC 4291 section 2.7, multicast addresses must not be used as // source addresses in IPv6 packets. localAddr := dstAddr if header.IsV6MulticastAddress(dstAddr) { localAddr = tcpip.Address{} } r, err := e.protocol.stack.FindRoute(e.nic.ID(), localAddr, srcAddr, ProtocolNumber, false /* multicastLoop */) if err != nil { // If we cannot find a route to the destination, silently drop the packet. 
return } defer r.Release() if !e.protocol.allowICMPReply(header.ICMPv6EchoReply) { sent.rateLimited.Increment() return } replyPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize, Payload: pkt.Data().ToBuffer(), }) defer replyPkt.DecRef() icmp := header.ICMPv6(replyPkt.TransportHeader().Push(header.ICMPv6EchoMinimumSize)) replyPkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber copy(icmp, h) icmp.SetType(header.ICMPv6EchoReply) replyData := replyPkt.Data() icmp.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmp, Src: r.LocalAddress(), Dst: r.RemoteAddress(), PayloadCsum: replyData.Checksum(), PayloadLen: replyData.Size(), })) replyTClass, _ := iph.TOS() if err := r.WritePacket(stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), // Even though RFC 4443 does not mention anything about it, Linux uses the // TrafficClass of the received echo request when replying. // https://github.com/torvalds/linux/blob/0280e3c58f9/net/ipv6/icmp.c#L797 TOS: replyTClass, }, replyPkt); err != nil { sent.dropped.Increment() return } sent.echoReply.Increment() case header.ICMPv6EchoReply: received.echoReply.Increment() if len(h) < header.ICMPv6EchoMinimumSize { received.invalid.Increment() return } e.dispatcher.DeliverTransportPacket(header.ICMPv6ProtocolNumber, pkt) case header.ICMPv6TimeExceeded: received.timeExceeded.Increment() case header.ICMPv6ParamProblem: received.paramProblem.Increment() case header.ICMPv6RouterSolicit: received.routerSolicit.Increment() // // Validate the RS as per RFC 4861 section 6.1.1. // // Is the NDP payload of sufficient size to hold a Router Solictation? if !isNDPValid() || len(h)-header.ICMPv6HeaderSize < header.NDPRSMinimumSize { received.invalid.Increment() return } if !e.Forwarding() { received.routerOnlyPacketsDroppedByHost.Increment() return } rs := header.NDPRouterSolicit(h.MessageBody()) it, err := rs.Options().Iter(false /* check */) if err != nil { // Options are not valid as per the wire format, silently drop the packet. received.invalid.Increment() return } sourceLinkAddr, ok := getSourceLinkAddr(it) if !ok { received.invalid.Increment() return } // If the RS message has the source link layer option, update the link // address cache with the link address for the source of the message. if len(sourceLinkAddr) != 0 { // As per RFC 4861 section 4.1, the Source Link-Layer Address Option MUST // NOT be included when the source IP address is the unspecified address. // Otherwise, it SHOULD be included on link layers that have addresses. if srcAddr == header.IPv6Any { received.invalid.Increment() return } // A RS with a specified source IP address modifies the neighbor table // in the same way a regular probe would. switch err := e.nic.HandleNeighborProbe(ProtocolNumber, srcAddr, sourceLinkAddr); err.(type) { case nil: case *tcpip.ErrNotSupported: // The stack may support ICMPv6 but the NIC may not need link resolution. default: panic(fmt.Sprintf("unexpected error when informing NIC of neighbor probe message: %s", err)) } } case header.ICMPv6RouterAdvert: received.routerAdvert.Increment() // // Validate the RA as per RFC 4861 section 6.1.2. // // Is the NDP payload of sufficient size to hold a Router Advertisement? if !isNDPValid() || len(h)-header.ICMPv6HeaderSize < header.NDPRAMinimumSize { received.invalid.Increment() return } routerAddr := srcAddr // Is the IP Source Address a link-local address? 
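//
// Illustrative sketch (not part of the original source): RFC 4861 section
// 6.1.2 requires that a Router Advertisement's IP source address be
// link-local. Link-local unicast addresses are the fe80::/10 block
// (RFC 4291 section 2.5.6), so the check performed by
// header.IsV6LinkLocalUnicastAddress can be approximated over raw bytes as
// (the helper name is illustrative only):
//
//	func isLinkLocalUnicast(addr [16]byte) bool {
//		// fe80::/10: first byte 0xfe, top two bits of the second byte are 10.
//		return addr[0] == 0xfe && addr[1]&0xc0 == 0x80
//	}
//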
		if !header.IsV6LinkLocalUnicastAddress(routerAddr) {
			// ...No, silently drop the packet.
			received.invalid.Increment()
			return
		}

		ra := header.NDPRouterAdvert(h.MessageBody())
		it, err := ra.Options().Iter(false /* check */)
		if err != nil {
			// Options are not valid as per the wire format, silently drop the packet.
			received.invalid.Increment()
			return
		}

		sourceLinkAddr, ok := getSourceLinkAddr(it)
		if !ok {
			received.invalid.Increment()
			return
		}

		//
		// At this point, we have a valid Router Advertisement, as far
		// as RFC 4861 section 6.1.2 is concerned.
		//

		// If the RA has the source link layer option, update the link address
		// cache with the link address for the advertised router.
		if len(sourceLinkAddr) != 0 {
			switch err := e.nic.HandleNeighborProbe(ProtocolNumber, routerAddr, sourceLinkAddr); err.(type) {
			case nil:
			case *tcpip.ErrNotSupported:
				// The stack may support ICMPv6 but the NIC may not need link resolution.
			default:
				panic(fmt.Sprintf("unexpected error when informing NIC of neighbor probe message: %s", err))
			}
		}

		e.mu.Lock()
		e.mu.ndp.handleRA(routerAddr, ra)
		e.mu.Unlock()

	case header.ICMPv6RedirectMsg:
		// TODO(gvisor.dev/issue/2285): Call `e.nud.HandleProbe` after validating
		// this redirect message, as per RFC 4861 section 7.3.3:
		//
		//   "A Neighbor Cache entry enters the STALE state when created as a
		//   result of receiving packets other than solicited Neighbor
		//   Advertisements (i.e., Router Solicitations, Router Advertisements,
		//   Redirects, and Neighbor Solicitations).  These packets contain the
		//   link-layer address of either the sender or, in the case of Redirect,
		//   the redirection target.  However, receipt of these link-layer
		//   addresses does not confirm reachability of the forward-direction path
		//   to that node.  Placing a newly created Neighbor Cache entry for which
		//   the link-layer address is known in the STALE state provides assurance
		//   that path failures are detected quickly.  In addition, should a cached
		//   link-layer address be modified due to receiving one of the above
		//   messages, the state SHOULD also be set to STALE to provide prompt
		//   verification that the path to the new link-layer address is working."
received.redirectMsg.Increment() if !isNDPValid() { received.invalid.Increment() return } case header.ICMPv6MulticastListenerQuery, header.ICMPv6MulticastListenerReport, header.ICMPv6MulticastListenerV2Report, header.ICMPv6MulticastListenerDone: icmpBody := h.MessageBody() switch icmpType { case header.ICMPv6MulticastListenerQuery: received.multicastListenerQuery.Increment() case header.ICMPv6MulticastListenerReport: received.multicastListenerReport.Increment() case header.ICMPv6MulticastListenerV2Report: received.multicastListenerReportV2.Increment() case header.ICMPv6MulticastListenerDone: received.multicastListenerDone.Increment() default: panic(fmt.Sprintf("unrecognized MLD message = %d", icmpType)) } if !isMLDValid(pkt, iph, routerAlert) { received.invalid.Increment() return } switch icmpType { case header.ICMPv6MulticastListenerQuery: e.mu.Lock() if len(icmpBody) >= header.MLDv2QueryMinimumSize { e.mu.mld.handleMulticastListenerQueryV2(header.MLDv2Query(icmpBody)) } else { e.mu.mld.handleMulticastListenerQuery(header.MLD(icmpBody)) } e.mu.Unlock() case header.ICMPv6MulticastListenerReport: e.mu.Lock() e.mu.mld.handleMulticastListenerReport(header.MLD(icmpBody)) e.mu.Unlock() case header.ICMPv6MulticastListenerDone, header.ICMPv6MulticastListenerV2Report: default: panic(fmt.Sprintf("unrecognized MLD message = %d", icmpType)) } default: received.unrecognized.Increment() } } // LinkAddressProtocol implements stack.LinkAddressResolver. func (*endpoint) LinkAddressProtocol() tcpip.NetworkProtocolNumber { return header.IPv6ProtocolNumber } // LinkAddressRequest implements stack.LinkAddressResolver. func (e *endpoint) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) tcpip.Error { remoteAddr := targetAddr if len(remoteLinkAddr) == 0 { remoteAddr = header.SolicitedNodeAddr(targetAddr) remoteLinkAddr = header.EthernetAddressFromMulticastIPv6Address(remoteAddr) } if localAddr.BitLen() == 0 { // Find an address that we can use as our source address. addressEndpoint := e.AcquireOutgoingPrimaryAddress(remoteAddr, tcpip.Address{} /* srcHint */, false /* allowExpired */) if addressEndpoint == nil { return &tcpip.ErrNetworkUnreachable{} } localAddr = addressEndpoint.AddressWithPrefix().Address addressEndpoint.DecRef() } else if !e.checkLocalAddress(localAddr) { // The provided local address is not assigned to us. return &tcpip.ErrBadLocalAddress{} } return e.sendNDPNS(localAddr, remoteAddr, targetAddr, remoteLinkAddr, header.NDPOptionsSerializer{ header.NDPSourceLinkLayerAddressOption(e.nic.LinkAddress()), }) } // ResolveStaticAddress implements stack.LinkAddressResolver. func (*endpoint) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { if header.IsV6MulticastAddress(addr) { return header.EthernetAddressFromMulticastIPv6Address(addr), true } return tcpip.LinkAddress([]byte(nil)), false } // ======= ICMP Error packet generation ========= // icmpReason is a marker interface for IPv6 specific ICMP errors. type icmpReason interface { isICMPReason() // respondToMulticast indicates whether this error falls under the exception // outlined by RFC 4443 section 2.4 point e.3 exception 2: // // (e.3) A packet destined to an IPv6 multicast address. 
(There are two
	// exceptions to this rule: (1) the Packet Too Big Message (Section 3.2) to
	// allow Path MTU discovery to work for IPv6 multicast, and (2) the Parameter
	// Problem Message, Code 2 (Section 3.4) reporting an unrecognized IPv6
	// option (see Section 4.2 of [IPv6]) that has the Option Type highest-
	// order two bits set to 10).
	respondsToMulticast() bool
}

// icmpReasonParameterProblem is an error during processing of extension headers
// or the fixed header defined in RFC 4443 section 3.4.
type icmpReasonParameterProblem struct {
	code header.ICMPv6Code

	// pointer is defined in the RFC 4443 section 3.4 which reads:
	//
	//   Pointer  Identifies the octet offset within the invoking packet
	//            where the error was detected.
	//
	//            The pointer will point beyond the end of the ICMPv6
	//            packet if the field in error is beyond what can fit
	//            in the maximum size of an ICMPv6 error message.
	pointer uint32

	respondToMulticast bool
}

func (*icmpReasonParameterProblem) isICMPReason() {}

func (p *icmpReasonParameterProblem) respondsToMulticast() bool {
	return p.respondToMulticast
}

// icmpReasonAdministrativelyProhibited is an error where the destination is
// administratively prohibited.
type icmpReasonAdministrativelyProhibited struct{}

func (*icmpReasonAdministrativelyProhibited) isICMPReason() {}

func (*icmpReasonAdministrativelyProhibited) respondsToMulticast() bool {
	return false
}

// icmpReasonPortUnreachable is an error where the transport protocol has no
// listener and no alternative means to inform the sender.
type icmpReasonPortUnreachable struct{}

func (*icmpReasonPortUnreachable) isICMPReason() {}

func (*icmpReasonPortUnreachable) respondsToMulticast() bool {
	return false
}

// icmpReasonNetUnreachable is an error where no route can be found to the
// network of the final destination.
type icmpReasonNetUnreachable struct{}

func (*icmpReasonNetUnreachable) isICMPReason() {}

func (*icmpReasonNetUnreachable) respondsToMulticast() bool {
	return false
}

// icmpReasonHostUnreachable is an error in which the host specified in the
// internet destination field of the datagram is unreachable.
type icmpReasonHostUnreachable struct{}

func (*icmpReasonHostUnreachable) isICMPReason() {}

func (*icmpReasonHostUnreachable) respondsToMulticast() bool {
	return false
}

// icmpReasonPacketTooBig is an error where a packet is too big to be sent
// out through the outgoing MTU, as per RFC 4443 page 9, Packet Too Big Message.
type icmpReasonPacketTooBig struct{}

func (*icmpReasonPacketTooBig) isICMPReason() {}

func (*icmpReasonPacketTooBig) respondsToMulticast() bool {
	return true
}

// icmpReasonHopLimitExceeded is an error where a packet's hop limit was
// exceeded in transit to its final destination, as per RFC 4443 section 3.3.
type icmpReasonHopLimitExceeded struct{}

func (*icmpReasonHopLimitExceeded) isICMPReason() {}

func (*icmpReasonHopLimitExceeded) respondsToMulticast() bool {
	return false
}

// icmpReasonReassemblyTimeout is an error where insufficient fragments are
// received to complete reassembly of a packet within a configured time after
// the reception of the first-arriving fragment of that packet.
type icmpReasonReassemblyTimeout struct{}

func (*icmpReasonReassemblyTimeout) isICMPReason() {}

func (*icmpReasonReassemblyTimeout) respondsToMulticast() bool {
	return false
}

// returnError takes an error descriptor and generates the appropriate ICMP
// error packet for IPv6 and sends it.
func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer, deliveredLocally bool) tcpip.Error { origIPHdr := header.IPv6(pkt.NetworkHeader().Slice()) origIPHdrSrc := origIPHdr.SourceAddress() origIPHdrDst := origIPHdr.DestinationAddress() // Only send ICMP error if the address is not a multicast v6 // address and the source is not the unspecified address. // // There are exceptions to this rule. // See: point e.3) RFC 4443 section-2.4 // // (e) An ICMPv6 error message MUST NOT be originated as a result of // receiving the following: // // (e.1) An ICMPv6 error message. // // (e.2) An ICMPv6 redirect message [IPv6-DISC]. // // (e.3) A packet destined to an IPv6 multicast address. (There are // two exceptions to this rule: (1) the Packet Too Big Message // (Section 3.2) to allow Path MTU discovery to work for IPv6 // multicast, and (2) the Parameter Problem Message, Code 2 // (Section 3.4) reporting an unrecognized IPv6 option (see // Section 4.2 of [IPv6]) that has the Option Type highest- // order two bits set to 10). // allowResponseToMulticast := reason.respondsToMulticast() isOrigDstMulticast := header.IsV6MulticastAddress(origIPHdrDst) if (!allowResponseToMulticast && isOrigDstMulticast) || origIPHdrSrc == header.IPv6Any { return nil } // If the packet wasn't delivered locally, do not use the packet's destination // address as the response's source address as we should not own the // destination address of a packet we are forwarding. // // If the packet was originally destined to a multicast address, then do not // use the packet's destination address as the source for the response ICMP // packet as "multicast addresses must not be used as source addresses in IPv6 // packets", as per RFC 4291 section 2.7. localAddr := origIPHdrDst if !deliveredLocally || isOrigDstMulticast { localAddr = tcpip.Address{} } // Even if we were able to receive a packet from some remote, we may not have // a route to it - the remote may be blocked via routing rules. We must always // consult our routing table and find a route to the remote before sending any // packet. route, err := p.stack.FindRoute(pkt.NICID, localAddr, origIPHdrSrc, ProtocolNumber, false /* multicastLoop */) if err != nil { return err } defer route.Release() p.mu.Lock() // We retrieve an endpoint using the newly constructed route's NICID rather // than the packet's NICID. The packet's NICID corresponds to the NIC on // which it arrived, which isn't necessarily the same as the NIC on which it // will be transmitted. On the other hand, the route's NIC *is* guaranteed // to be the NIC on which the packet will be transmitted. 
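//
// Illustrative sketch (not part of the original source): the guard above
// applies RFC 4443 section 2.4 (e.3). Only two error types may be generated in
// response to a packet whose destination was a multicast address, which is why
// respondsToMulticast() returns true only for Packet Too Big and for
// Parameter Problem with Code 2. Summarized as plain values (the helper name
// and raw constants are illustrative only):
//
//	// mayRespondToMulticast reports whether an ICMPv6 error may be sent for a
//	// packet that was destined to a multicast address.
//	func mayRespondToMulticast(icmpType, icmpCode uint8) bool {
//		const packetTooBig = 2       // ICMPv6 type 2: Packet Too Big.
//		const paramProblem = 4       // ICMPv6 type 4: Parameter Problem.
//		const unrecognizedOption = 2 // Parameter Problem code 2.
//		return icmpType == packetTooBig ||
//			(icmpType == paramProblem && icmpCode == unrecognizedOption)
//	}
//
// Further below, the invoking packet is also truncated so that the error
// message never exceeds the IPv6 minimum MTU of 1280 bytes, per RFC 4443
// section 2.4 (c).
//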
netEP, ok := p.mu.eps[route.NICID()] p.mu.Unlock() if !ok { return &tcpip.ErrNotConnected{} } if pkt.TransportProtocolNumber == header.ICMPv6ProtocolNumber { if typ := header.ICMPv6(pkt.TransportHeader().Slice()).Type(); typ.IsErrorType() || typ == header.ICMPv6RedirectMsg { return nil } } sent := netEP.stats.icmp.packetsSent icmpType, icmpCode, counter, typeSpecific := func() (header.ICMPv6Type, header.ICMPv6Code, tcpip.MultiCounterStat, uint32) { switch reason := reason.(type) { case *icmpReasonParameterProblem: return header.ICMPv6ParamProblem, reason.code, sent.paramProblem, reason.pointer case *icmpReasonAdministrativelyProhibited: return header.ICMPv6DstUnreachable, header.ICMPv6Prohibited, sent.dstUnreachable, 0 case *icmpReasonPortUnreachable: return header.ICMPv6DstUnreachable, header.ICMPv6PortUnreachable, sent.dstUnreachable, 0 case *icmpReasonNetUnreachable: return header.ICMPv6DstUnreachable, header.ICMPv6NetworkUnreachable, sent.dstUnreachable, 0 case *icmpReasonHostUnreachable: return header.ICMPv6DstUnreachable, header.ICMPv6AddressUnreachable, sent.dstUnreachable, 0 case *icmpReasonPacketTooBig: return header.ICMPv6PacketTooBig, header.ICMPv6UnusedCode, sent.packetTooBig, 0 case *icmpReasonHopLimitExceeded: return header.ICMPv6TimeExceeded, header.ICMPv6HopLimitExceeded, sent.timeExceeded, 0 case *icmpReasonReassemblyTimeout: return header.ICMPv6TimeExceeded, header.ICMPv6ReassemblyTimeout, sent.timeExceeded, 0 default: panic(fmt.Sprintf("unsupported ICMP type %T", reason)) } }() if !p.allowICMPReply(icmpType) { sent.rateLimited.Increment() return nil } network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View() // As per RFC 4443 section 2.4 // // (c) Every ICMPv6 error message (type < 128) MUST include // as much of the IPv6 offending (invoking) packet (the // packet that caused the error) as possible without making // the error message packet exceed the minimum IPv6 MTU // [IPv6]. mtu := int(route.MTU()) const maxIPv6Data = header.IPv6MinimumMTU - header.IPv6FixedHeaderSize if mtu > maxIPv6Data { mtu = maxIPv6Data } available := mtu - header.ICMPv6ErrorHeaderSize if available < header.IPv6MinimumSize { return nil } payloadLen := network.Size() + transport.Size() + pkt.Data().Size() if payloadLen > available { payloadLen = available } payload := buffer.MakeWithView(network) payload.Append(transport) dataBuf := pkt.Data().ToBuffer() payload.Merge(&dataBuf) payload.Truncate(int64(payloadLen)) newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(route.MaxHeaderLength()) + header.ICMPv6ErrorHeaderSize, Payload: payload, }) defer newPkt.DecRef() newPkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber icmpHdr := header.ICMPv6(newPkt.TransportHeader().Push(header.ICMPv6DstUnreachableMinimumSize)) icmpHdr.SetType(icmpType) icmpHdr.SetCode(icmpCode) icmpHdr.SetTypeSpecific(typeSpecific) pktData := newPkt.Data() icmpHdr.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmpHdr, Src: route.LocalAddress(), Dst: route.RemoteAddress(), PayloadCsum: pktData.Checksum(), PayloadLen: pktData.Size(), })) if err := route.WritePacket( stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS, }, newPkt, ); err != nil { sent.dropped.Increment() return err } counter.Increment() return nil } // OnReassemblyTimeout implements fragmentation.TimeoutHandler. 
func (p *protocol) OnReassemblyTimeout(pkt *stack.PacketBuffer) { // OnReassemblyTimeout sends a Time Exceeded Message as per RFC 2460 Section // 4.5: // // If the first fragment (i.e., the one with a Fragment Offset of zero) has // been received, an ICMP Time Exceeded -- Fragment Reassembly Time Exceeded // message should be sent to the source of that fragment. if pkt != nil { p.returnError(&icmpReasonReassemblyTimeout{}, pkt, true /* deliveredLocally */) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv6/ipv6.go000066400000000000000000003054611465435605700243440ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package ipv6 contains the implementation of the ipv6 network protocol. package ipv6 import ( "fmt" "math" "reflect" "sort" "time" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/network/internal/fragmentation" "gvisor.dev/gvisor/pkg/tcpip/network/internal/ip" "gvisor.dev/gvisor/pkg/tcpip/network/internal/multicast" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // ReassembleTimeout controls how long a fragment will be held. // As per RFC 8200 section 4.5: // // If insufficient fragments are received to complete reassembly of a packet // within 60 seconds of the reception of the first-arriving fragment of that // packet, reassembly of that packet must be abandoned. // // Linux also uses 60 seconds for reassembly timeout: // https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ipv6.h#L456 ReassembleTimeout = 60 * time.Second // ProtocolNumber is the ipv6 protocol number. ProtocolNumber = header.IPv6ProtocolNumber // maxPayloadSize is the maximum size that can be encoded in the 16-bit // PayloadLength field of the ipv6 header. maxPayloadSize = 0xffff // DefaultTTL is the default hop limit for IPv6 Packets egressed by // Netstack. DefaultTTL = 64 // buckets for fragment identifiers buckets = 2048 ) const ( forwardingDisabled = 0 forwardingEnabled = 1 ) // policyTable is the default policy table defined in RFC 6724 section 2.1. // // A more human-readable version: // // Prefix Precedence Label // ::1/128 50 0 // ::/0 40 1 // ::ffff:0:0/96 35 4 // 2002::/16 30 2 // 2001::/32 5 5 // fc00::/7 3 13 // ::/96 1 3 // fec0::/10 1 11 // 3ffe::/16 1 12 // // The table is sorted by prefix length so longest-prefix match can be easily // achieved. // // We willingly left out ::/96, fec0::/10 and 3ffe::/16 since those prefix // assignments are deprecated. // // As per RFC 4291 section 2.5.5.1 (for ::/96), // // The "IPv4-Compatible IPv6 address" is now deprecated because the // current IPv6 transition mechanisms no longer use these addresses. // New or updated implementations are not required to support this // address type. 
// // As per RFC 3879 section 4 (for fec0::/10), // // This document formally deprecates the IPv6 site-local unicast prefix // defined in [RFC3513], i.e., 1111111011 binary or FEC0::/10. // // As per RFC 3701 section 1 (for 3ffe::/16), // // As clearly stated in [TEST-NEW], the addresses for the 6bone are // temporary and will be reclaimed in the future. It further states // that all users of these addresses (within the 3FFE::/16 prefix) will // be required to renumber at some time in the future. // // and section 2, // // Thus after the pTLA allocation cutoff date January 1, 2004, it is // REQUIRED that no new 6bone 3FFE pTLAs be allocated. // // MUST NOT BE MODIFIED. var policyTable = [...]struct { subnet tcpip.Subnet label uint8 }{ // ::1/128 { subnet: header.IPv6Loopback.WithPrefix().Subnet(), label: 0, }, // ::ffff:0:0/96 { subnet: header.IPv4MappedIPv6Subnet, label: 4, }, // 2001::/32 (Teredo prefix as per RFC 4380 section 2.6). { subnet: tcpip.AddressWithPrefix{ Address: tcpip.AddrFrom16([16]byte{0x20, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), PrefixLen: 32, }.Subnet(), label: 5, }, // 2002::/16 (6to4 prefix as per RFC 3056 section 2). { subnet: tcpip.AddressWithPrefix{ Address: tcpip.AddrFrom16([16]byte{0x20, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), PrefixLen: 16, }.Subnet(), label: 2, }, // fc00::/7 (Unique local addresses as per RFC 4193 section 3.1). { subnet: tcpip.AddressWithPrefix{ Address: tcpip.AddrFrom16([16]byte{0xfc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), PrefixLen: 7, }.Subnet(), label: 13, }, // ::/0 { subnet: header.IPv6EmptySubnet, label: 1, }, } func getLabel(addr tcpip.Address) uint8 { for _, p := range policyTable { if p.subnet.Contains(addr) { return p.label } } panic(fmt.Sprintf("should have a label for address = %s", addr)) } var _ stack.DuplicateAddressDetector = (*endpoint)(nil) var _ stack.LinkAddressResolver = (*endpoint)(nil) var _ stack.LinkResolvableNetworkEndpoint = (*endpoint)(nil) var _ stack.ForwardingNetworkEndpoint = (*endpoint)(nil) var _ stack.MulticastForwardingNetworkEndpoint = (*endpoint)(nil) var _ stack.GroupAddressableEndpoint = (*endpoint)(nil) var _ stack.AddressableEndpoint = (*endpoint)(nil) var _ stack.NetworkEndpoint = (*endpoint)(nil) var _ stack.NDPEndpoint = (*endpoint)(nil) var _ MLDEndpoint = (*endpoint)(nil) var _ NDPEndpoint = (*endpoint)(nil) // +stateify savable type endpointMu struct { sync.RWMutex `state:"nosave"` addressableEndpointState stack.AddressableEndpointState ndp ndpState mld mldState } // +stateify savable type dadMu struct { sync.Mutex `state:"nosave"` dad ip.DAD } // +stateify savable type endpointDAD struct { mu dadMu } // +stateify savable type endpoint struct { nic stack.NetworkInterface dispatcher stack.TransportDispatcher protocol *protocol stats sharedStats // enabled is set to 1 when the endpoint is enabled and 0 when it is // disabled. enabled atomicbitops.Uint32 // forwarding is set to forwardingEnabled when the endpoint has forwarding // enabled and forwardingDisabled when it is disabled. forwarding atomicbitops.Uint32 // multicastForwarding is set to forwardingEnabled when the endpoint has // forwarding enabled and forwardingDisabled when it is disabled. multicastForwarding atomicbitops.Uint32 mu endpointMu // dad is used to check if an arbitrary address is already assigned to some // neighbor. 
// // Note: this is different from mu.ndp.dad which is used to perform DAD for // addresses that are assigned to the interface. Removing an address aborts // DAD; if we had used the same state, handlers for a removed address would // not be called with the actual DAD result. // // LOCK ORDERING: mu > dad.mu. dad endpointDAD } // NICNameFromID is a function that returns a stable name for the specified NIC, // even if different NIC IDs are used to refer to the same NIC in different // program runs. It is used when generating opaque interface identifiers (IIDs). // If the NIC was created with a name, it is passed to NICNameFromID. // // NICNameFromID SHOULD return unique NIC names so unique opaque IIDs are // generated for the same prefix on different NICs. type NICNameFromID func(tcpip.NICID, string) string // OpaqueInterfaceIdentifierOptions holds the options related to the generation // of opaque interface identifiers (IIDs) as defined by RFC 7217. // // +stateify savable type OpaqueInterfaceIdentifierOptions struct { // NICNameFromID is a function that returns a stable name for a specified NIC, // even if the NIC ID changes over time. // // Must be specified to generate the opaque IID. NICNameFromID NICNameFromID `state:"nosave"` // SecretKey is a pseudo-random number used as the secret key when generating // opaque IIDs as defined by RFC 7217. The key SHOULD be at least // header.OpaqueIIDSecretKeyMinBytes bytes and MUST follow minimum randomness // requirements for security as outlined by RFC 4086. SecretKey MUST NOT // change between program runs, unless explicitly changed. // // OpaqueInterfaceIdentifierOptions takes ownership of SecretKey. SecretKey // MUST NOT be modified after Stack is created. // // May be nil, but a nil value is highly discouraged to maintain // some level of randomness between nodes. SecretKey []byte } // CheckDuplicateAddress implements stack.DuplicateAddressDetector. func (e *endpoint) CheckDuplicateAddress(addr tcpip.Address, h stack.DADCompletionHandler) stack.DADCheckAddressDisposition { e.dad.mu.Lock() defer e.dad.mu.Unlock() return e.dad.mu.dad.CheckDuplicateAddressLocked(addr, h) } // SetDADConfigurations implements stack.DuplicateAddressDetector. func (e *endpoint) SetDADConfigurations(c stack.DADConfigurations) { e.mu.Lock() defer e.mu.Unlock() e.dad.mu.Lock() defer e.dad.mu.Unlock() e.mu.ndp.dad.SetConfigsLocked(c) e.dad.mu.dad.SetConfigsLocked(c) } // DuplicateAddressProtocol implements stack.DuplicateAddressDetector. func (*endpoint) DuplicateAddressProtocol() tcpip.NetworkProtocolNumber { return ProtocolNumber } // HandleLinkResolutionFailure implements stack.LinkResolvableNetworkEndpoint. func (e *endpoint) HandleLinkResolutionFailure(pkt *stack.PacketBuffer) { // If we are operating as a router, we should return an ICMP error to the // original packet's sender. if pkt.NetworkPacketInfo.IsForwardedPacket { // TODO(gvisor.dev/issue/6005): Propagate asynchronously generated ICMP // errors to local endpoints. e.protocol.returnError(&icmpReasonHostUnreachable{}, pkt, false /* deliveredLocally */) e.stats.ip.Forwarding.Errors.Increment() e.stats.ip.Forwarding.HostUnreachable.Increment() return } // handleControl expects the entire offending packet to be in the packet // buffer's data field. 
pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: pkt.ToBuffer(), }) defer pkt.DecRef() pkt.NICID = e.nic.ID() pkt.NetworkProtocolNumber = ProtocolNumber e.handleControl(&icmpv6DestinationAddressUnreachableSockError{}, pkt) } // onAddressAssignedLocked handles an address being assigned. // // Precondition: e.mu must be exclusively locked. func (e *endpoint) onAddressAssignedLocked(addr tcpip.Address) { // As per RFC 2710 section 3, // // All MLD messages described in this document are sent with a link-local // IPv6 Source Address, ... // // If we just completed DAD for a link-local address, then attempt to send any // queued MLD reports. Note, we may have sent reports already for some of the // groups before we had a valid link-local address to use as the source for // the MLD messages, but that was only so that MLD snooping switches are aware // of our membership to groups - routers would not have handled those reports. // // As per RFC 3590 section 4, // // MLD Report and Done messages are sent with a link-local address as // the IPv6 source address, if a valid address is available on the // interface. If a valid link-local address is not available (e.g., one // has not been configured), the message is sent with the unspecified // address (::) as the IPv6 source address. // // Once a valid link-local address is available, a node SHOULD generate // new MLD Report messages for all multicast addresses joined on the // interface. // // Routers receiving an MLD Report or Done message with the unspecified // address as the IPv6 source address MUST silently discard the packet // without taking any action on the packets contents. // // Snooping switches MUST manage multicast forwarding state based on MLD // Report and Done messages sent with the unspecified address as the // IPv6 source address. if header.IsV6LinkLocalUnicastAddress(addr) { e.mu.mld.sendQueuedReports() } } // InvalidateDefaultRouter implements stack.NDPEndpoint. func (e *endpoint) InvalidateDefaultRouter(rtr tcpip.Address) { e.mu.Lock() defer e.mu.Unlock() // We represent default routers with a default (off-link) route through the // router. e.mu.ndp.invalidateOffLinkRoute(offLinkRoute{dest: header.IPv6EmptySubnet, router: rtr}) } // SetMLDVersion implements MLDEndpoint. func (e *endpoint) SetMLDVersion(v MLDVersion) MLDVersion { e.mu.Lock() defer e.mu.Unlock() return e.mu.mld.setVersion(v) } // GetMLDVersion implements MLDEndpoint. func (e *endpoint) GetMLDVersion() MLDVersion { e.mu.RLock() defer e.mu.RUnlock() return e.mu.mld.getVersion() } // SetNDPConfigurations implements NDPEndpoint. func (e *endpoint) SetNDPConfigurations(c NDPConfigurations) { c.validate() e.mu.Lock() defer e.mu.Unlock() e.mu.ndp.configs = c } // hasTentativeAddr returns true if addr is tentative on e. func (e *endpoint) hasTentativeAddr(addr tcpip.Address) bool { e.mu.RLock() addressEndpoint := e.getAddressRLocked(addr) e.mu.RUnlock() return addressEndpoint != nil && addressEndpoint.GetKind() == stack.PermanentTentative } // dupTentativeAddrDetected attempts to inform e that a tentative addr is a // duplicate on a link. // // dupTentativeAddrDetected removes the tentative address if it exists. If the // address was generated via SLAAC, an attempt is made to generate a new // address. 
func (e *endpoint) dupTentativeAddrDetected(addr tcpip.Address, holderLinkAddr tcpip.LinkAddress, nonce []byte) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() addressEndpoint := e.getAddressRLocked(addr) if addressEndpoint == nil { return &tcpip.ErrBadAddress{} } if addressEndpoint.GetKind() != stack.PermanentTentative { return &tcpip.ErrInvalidEndpointState{} } switch result := e.mu.ndp.dad.ExtendIfNonceEqualLocked(addr, nonce); result { case ip.Extended: // The nonce we got back was the same we sent so we know the message // indicating a duplicate address was likely ours so do not consider // the address duplicate here. return nil case ip.AlreadyExtended: // See Extended. // // Our DAD message was looped back already. return nil case ip.NoDADStateFound: panic(fmt.Sprintf("expected DAD state for tentative address %s", addr)) case ip.NonceDisabled: // If nonce is disabled then we have no way to know if the packet was // looped-back so we have to assume it indicates a duplicate address. fallthrough case ip.NonceNotEqual: // If the address is a SLAAC address, do not invalidate its SLAAC prefix as an // attempt will be made to generate a new address for it. if err := e.removePermanentEndpointLocked(addressEndpoint, false /* allowSLAACInvalidation */, stack.AddressRemovalDADFailed, &stack.DADDupAddrDetected{HolderLinkAddress: holderLinkAddr}); err != nil { return err } prefix := addressEndpoint.Subnet() switch t := addressEndpoint.ConfigType(); t { case stack.AddressConfigStatic: case stack.AddressConfigSlaac: if addressEndpoint.Temporary() { // Do not reset the generation attempts counter for the prefix as the // temporary address is being regenerated in response to a DAD conflict. e.mu.ndp.regenerateTempSLAACAddr(prefix, false /* resetGenAttempts */) } else { e.mu.ndp.regenerateSLAACAddr(prefix) } default: panic(fmt.Sprintf("unrecognized address config type = %d", t)) } return nil default: panic(fmt.Sprintf("unhandled result = %d", result)) } } // Forwarding implements stack.ForwardingNetworkEndpoint. func (e *endpoint) Forwarding() bool { return e.forwarding.Load() == forwardingEnabled } // setForwarding sets the forwarding status for the endpoint. // // Returns the previous forwarding status. func (e *endpoint) setForwarding(v bool) bool { forwarding := uint32(forwardingDisabled) if v { forwarding = forwardingEnabled } return e.forwarding.Swap(forwarding) != forwardingDisabled } // SetForwarding implements stack.ForwardingNetworkEndpoint. func (e *endpoint) SetForwarding(forwarding bool) bool { e.mu.Lock() defer e.mu.Unlock() prevForwarding := e.setForwarding(forwarding) if prevForwarding == forwarding { return prevForwarding } allRoutersGroups := [...]tcpip.Address{ header.IPv6AllRoutersInterfaceLocalMulticastAddress, header.IPv6AllRoutersLinkLocalMulticastAddress, header.IPv6AllRoutersSiteLocalMulticastAddress, } if forwarding { // As per RFC 4291 section 2.8: // // A router is required to recognize all addresses that a host is // required to recognize, plus the following addresses as identifying // itself: // // o The All-Routers multicast addresses defined in Section 2.7.1. // // As per RFC 4291 section 2.7.1, // // All Routers Addresses: FF01:0:0:0:0:0:0:2 // FF02:0:0:0:0:0:0:2 // FF05:0:0:0:0:0:0:2 // // The above multicast addresses identify the group of all IPv6 routers, // within scope 1 (interface-local), 2 (link-local), or 5 (site-local). 
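//
// Illustrative sketch (not part of the original source): the scope of an IPv6
// multicast address is carried in the low nibble of its second byte (RFC 4291
// section 2.7), which is why the three All-Routers groups above differ only in
// that nibble (ff01::2, ff02::2, ff05::2). Extracting it from raw bytes (the
// helper name is illustrative only):
//
//	// multicastScope returns the 4-bit scope field of a multicast address
//	// (1 = interface-local, 2 = link-local, 5 = site-local, ...).
//	func multicastScope(addr [16]byte) uint8 {
//		return addr[1] & 0x0f
//	}
//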
for _, g := range allRoutersGroups { if err := e.joinGroupLocked(g); err != nil { // joinGroupLocked only returns an error if the group address is not a // valid IPv6 multicast address. panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", g, err)) } } } else { for _, g := range allRoutersGroups { switch err := e.leaveGroupLocked(g).(type) { case nil: case *tcpip.ErrBadLocalAddress: // The endpoint may have already left the multicast group. default: panic(fmt.Sprintf("e.leaveGroupLocked(%s): %s", g, err)) } } } e.mu.ndp.forwardingChanged(forwarding) return prevForwarding } // MulticastForwarding implements stack.MulticastForwardingNetworkEndpoint. func (e *endpoint) MulticastForwarding() bool { return e.multicastForwarding.Load() == forwardingEnabled } // SetMulticastForwarding implements stack.MulticastForwardingNetworkEndpoint. func (e *endpoint) SetMulticastForwarding(forwarding bool) bool { updatedForwarding := uint32(forwardingDisabled) if forwarding { updatedForwarding = forwardingEnabled } return e.multicastForwarding.Swap(updatedForwarding) != forwardingDisabled } // Enable implements stack.NetworkEndpoint. func (e *endpoint) Enable() tcpip.Error { e.mu.Lock() defer e.mu.Unlock() // If the NIC is not enabled, the endpoint can't do anything meaningful so // don't enable the endpoint. if !e.nic.Enabled() { return &tcpip.ErrNotPermitted{} } // If the endpoint is already enabled, there is nothing for it to do. if !e.setEnabled(true) { return nil } // Perform DAD on the all the unicast IPv6 endpoints that are in the permanent // state. // // Addresses may have already completed DAD but in the time since the endpoint // was last enabled, other devices may have acquired the same addresses. var err tcpip.Error e.mu.addressableEndpointState.ForEachEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { addr := addressEndpoint.AddressWithPrefix().Address if !header.IsV6UnicastAddress(addr) { return true } switch kind := addressEndpoint.GetKind(); kind { case stack.Permanent: addressEndpoint.SetKind(stack.PermanentTentative) fallthrough case stack.PermanentTentative: err = e.mu.ndp.startDuplicateAddressDetection(addr, addressEndpoint) return err == nil case stack.Temporary, stack.PermanentExpired: return true default: panic(fmt.Sprintf("address %s has unknown kind %d", addressEndpoint.AddressWithPrefix(), kind)) } }) // It is important to enable after starting DAD on all the addresses so that // if DAD is disabled, the Tentative state is not observed. // // Must be called after Enabled has been set. e.mu.addressableEndpointState.OnNetworkEndpointEnabledChanged() if err != nil { return err } // Groups may have been joined when the endpoint was disabled, or the // endpoint may have left groups from the perspective of MLD when the // endpoint was disabled. Either way, we need to let routers know to // send us multicast traffic. e.mu.mld.initializeAll() // Join the IPv6 All-Nodes Multicast group if the stack is configured to // use IPv6. This is required to ensure that this node properly receives // and responds to the various NDP messages that are destined to the // all-nodes multicast address. An example is the Neighbor Advertisement // when we perform Duplicate Address Detection, or Router Advertisement // when we do Router Discovery. See RFC 4862, section 5.4.2 and RFC 4861 // section 4.2 for more information. // // Also auto-generate an IPv6 link-local address based on the endpoint's // link address if it is configured to do so. 
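//
// Illustrative sketch (not part of the original source): one common way to
// derive the interface identifier for the auto-generated link-local address is
// the modified EUI-64 scheme of RFC 4291 appendix A: split the 48-bit MAC,
// insert ff:fe in the middle, and flip the universal/local bit. (This stack
// can instead use opaque IIDs per RFC 7217 via OpaqueInterfaceIdentifierOptions,
// in which case the identifier is derived from a hash rather than the MAC.)
// A minimal sketch over raw bytes (the function name is illustrative only):
//
//	// linkLocalFromMAC builds fe80::/64 plus a modified EUI-64 identifier.
//	func linkLocalFromMAC(mac [6]byte) [16]byte {
//		var addr [16]byte
//		addr[0], addr[1] = 0xfe, 0x80
//		addr[8] = mac[0] ^ 0x02 // flip the universal/local bit
//		addr[9], addr[10] = mac[1], mac[2]
//		addr[11], addr[12] = 0xff, 0xfe
//		addr[13], addr[14], addr[15] = mac[3], mac[4], mac[5]
//		return addr
//	}
//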
Note, each interface is // required to have IPv6 link-local unicast address, as per RFC 4291 // section 2.1. // Join the All-Nodes multicast group before starting DAD as responses to DAD // (NDP NS) messages may be sent to the All-Nodes multicast group if the // source address of the NDP NS is the unspecified address, as per RFC 4861 // section 7.2.4. if err := e.joinGroupLocked(header.IPv6AllNodesMulticastAddress); err != nil { // joinGroupLocked only returns an error if the group address is not a valid // IPv6 multicast address. panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv6AllNodesMulticastAddress, err)) } // Do not auto-generate an IPv6 link-local address for loopback devices. if e.protocol.options.AutoGenLinkLocal && !e.nic.IsLoopback() { // The valid and preferred lifetime is infinite for the auto-generated // link-local address. e.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime) } e.mu.ndp.startSolicitingRouters() return nil } // Enabled implements stack.NetworkEndpoint. func (e *endpoint) Enabled() bool { return e.nic.Enabled() && e.isEnabled() } // isEnabled returns true if the endpoint is enabled, regardless of the // enabled status of the NIC. func (e *endpoint) isEnabled() bool { return e.enabled.Load() == 1 } // setEnabled sets the enabled status for the endpoint. // // Returns true if the enabled status was updated. func (e *endpoint) setEnabled(v bool) bool { if v { return e.enabled.Swap(1) == 0 } return e.enabled.Swap(0) == 1 } // Disable implements stack.NetworkEndpoint. func (e *endpoint) Disable() { e.mu.Lock() defer e.mu.Unlock() e.disableLocked() } func (e *endpoint) disableLocked() { if !e.Enabled() { return } e.mu.ndp.stopSolicitingRouters() e.mu.ndp.cleanupState() // The endpoint may have already left the multicast group. switch err := e.leaveGroupLocked(header.IPv6AllNodesMulticastAddress).(type) { case nil, *tcpip.ErrBadLocalAddress: default: panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv6AllNodesMulticastAddress, err)) } // Leave groups from the perspective of MLD so that routers know that // we are no longer interested in the group. e.mu.mld.softLeaveAll() // Stop DAD for all the tentative unicast addresses. e.mu.addressableEndpointState.ForEachEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { addrWithPrefix := addressEndpoint.AddressWithPrefix() switch kind := addressEndpoint.GetKind(); kind { case stack.Permanent, stack.PermanentTentative: if header.IsV6UnicastAddress(addrWithPrefix.Address) { e.mu.ndp.stopDuplicateAddressDetection(addrWithPrefix.Address, &stack.DADAborted{}) } case stack.Temporary, stack.PermanentExpired: default: panic(fmt.Sprintf("address %s has unknown address kind %d", addrWithPrefix, kind)) } return true }) if !e.setEnabled(false) { panic("should have only done work to disable the endpoint if it was enabled") } // Must be called after Enabled has been set. e.mu.addressableEndpointState.OnNetworkEndpointEnabledChanged() } // DefaultTTL is the default hop limit for this endpoint. func (e *endpoint) DefaultTTL() uint8 { return e.protocol.DefaultTTL() } // MTU implements stack.NetworkEndpoint. It returns the link-layer MTU minus the // network layer max header length. func (e *endpoint) MTU() uint32 { networkMTU, err := calculateNetworkMTU(e.nic.MTU(), header.IPv6MinimumSize) if err != nil { return 0 } return networkMTU } // MaxHeaderLength returns the maximum length needed by ipv6 headers (and // underlying protocols). 
func (e *endpoint) MaxHeaderLength() uint16 { // TODO(gvisor.dev/issues/5035): The maximum header length returned here does // not open the possibility for the caller to know about size required for // extension headers. return e.nic.MaxHeaderLength() + header.IPv6MinimumSize } func addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, extensionHeaders header.IPv6ExtHdrSerializer) tcpip.Error { extHdrsLen := extensionHeaders.Length() length := pkt.Size() + extensionHeaders.Length() if length > math.MaxUint16 { return &tcpip.ErrMessageTooLong{} } header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + extHdrsLen)).Encode(&header.IPv6Fields{ PayloadLength: uint16(length), TransportProtocol: params.Protocol, HopLimit: params.TTL, TrafficClass: params.TOS, SrcAddr: srcAddr, DstAddr: dstAddr, ExtensionHeaders: extensionHeaders, }) pkt.NetworkProtocolNumber = ProtocolNumber return nil } func packetMustBeFragmented(pkt *stack.PacketBuffer, networkMTU uint32) bool { payload := len(pkt.TransportHeader().Slice()) + pkt.Data().Size() return pkt.GSOOptions.Type == stack.GSONone && uint32(payload) > networkMTU } // handleFragments fragments pkt and calls the handler function on each // fragment. It returns the number of fragments handled and the number of // fragments left to be processed. The IP header must already be present in the // original packet. The transport header protocol number is required to avoid // parsing the IPv6 extension headers. func (e *endpoint) handleFragments(r *stack.Route, networkMTU uint32, pkt *stack.PacketBuffer, transProto tcpip.TransportProtocolNumber, handler func(*stack.PacketBuffer) tcpip.Error) (int, int, tcpip.Error) { networkHeader := header.IPv6(pkt.NetworkHeader().Slice()) // TODO(gvisor.dev/issue/3912): Once the Authentication or ESP Headers are // supported for outbound packets, their length should not affect the fragment // maximum payload length because they should only be transmitted once. fragmentPayloadLen := (networkMTU - header.IPv6FragmentHeaderSize) &^ 7 if fragmentPayloadLen < header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit { // We need at least 8 bytes of space left for the fragmentable part because // the fragment payload must obviously be non-zero and must be a multiple // of 8 as per RFC 8200 section 4.5: // Each complete fragment, except possibly the last ("rightmost") one, is // an integer multiple of 8 octets long. return 0, 1, &tcpip.ErrMessageTooLong{} } if fragmentPayloadLen < uint32(len(pkt.TransportHeader().Slice())) { // As per RFC 8200 Section 4.5, the Transport Header is expected to be small // enough to fit in the first fragment. return 0, 1, &tcpip.ErrMessageTooLong{} } pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadLen, calculateFragmentReserve(pkt)) defer pf.Release() id := e.getFragmentID() var n int for { fragPkt, more := buildNextFragment(&pf, networkHeader, transProto, id) err := handler(fragPkt) fragPkt.DecRef() if err != nil { return n, pf.RemainingFragmentCount() + 1, err } n++ if !more { return n, pf.RemainingFragmentCount(), nil } } } // WritePacket writes a packet to the given destination address and protocol. func (e *endpoint) WritePacket(r *stack.Route, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) tcpip.Error { dstAddr := r.RemoteAddress() if err := addIPHeader(r.LocalAddress(), dstAddr, pkt, params, nil /* extensionHeaders */); err != nil { return err } // iptables filtering. All packets that reach here are locally // generated. 
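//
// Illustrative sketch (not part of the original source): handleFragments above
// sizes each fragment so that the fragmentable part is a multiple of 8 octets
// (RFC 8200 section 4.5). For a standard Ethernet link the arithmetic works
// out as follows (a worked example, not taken from the source):
//
//	linkMTU := uint32(1500)
//	networkMTU := linkMTU - 40                  // minus the fixed IPv6 header => 1460
//	fragmentPayloadLen := (networkMTU - 8) &^ 7 // minus the Fragment header, rounded down to 8 => 1448
//
// So every fragment except the last carries 1448 bytes of the fragmentable
// part, and the Fragment Offset field advances in units of 8 octets.
//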
outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().CheckOutput(pkt, r, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesOutputDropped.Increment() return nil } // If the packet is manipulated as per DNAT Output rules, handle packet // based on destination address and do not send the packet to link // layer. // // We should do this for every packet, rather than only DNATted packets, but // removing this check short circuits broadcasts before they are sent out to // other hosts. if netHeader := header.IPv6(pkt.NetworkHeader().Slice()); dstAddr != netHeader.DestinationAddress() { if ep := e.protocol.findEndpointWithAddress(netHeader.DestinationAddress()); ep != nil { // Since we rewrote the packet but it is being routed back to us, we // can safely assume the checksum is valid. ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */) return nil } } return e.writePacket(r, pkt, params.Protocol, false /* headerIncluded */) } func (e *endpoint) writePacket(r *stack.Route, pkt *stack.PacketBuffer, protocol tcpip.TransportProtocolNumber, headerIncluded bool) tcpip.Error { if r.Loop()&stack.PacketLoop != 0 { // If the packet was generated by the stack (not a raw/packet endpoint // where a packet may be written with the header included), then we can // safely assume the checksum is valid. e.handleLocalPacket(pkt, !headerIncluded /* canSkipRXChecksum */) } if r.Loop()&stack.PacketOut == 0 { return nil } // Postrouting NAT can only change the source address, and does not alter the // route or outgoing interface of the packet. outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().CheckPostrouting(pkt, r, e, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesPostroutingDropped.Increment() return nil } stats := e.stats.ip networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(len(pkt.NetworkHeader().Slice()))) if err != nil { stats.OutgoingPacketErrors.Increment() return err } if packetMustBeFragmented(pkt, networkMTU) { if pkt.NetworkPacketInfo.IsForwardedPacket { // As per RFC 2460, section 4.5: // Unlike IPv4, fragmentation in IPv6 is performed only by source nodes, // not by routers along a packet's delivery path. return &tcpip.ErrMessageTooLong{} } sent, remain, err := e.handleFragments(r, networkMTU, pkt, protocol, func(fragPkt *stack.PacketBuffer) tcpip.Error { // TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each // fragment one by one using WritePacket() (current strategy) or if we // want to create a PacketBufferList from the fragments and feed it to // WritePackets(). It'll be faster but cost more memory. return e.nic.WritePacket(r, fragPkt) }) stats.PacketsSent.IncrementBy(uint64(sent)) stats.OutgoingPacketErrors.IncrementBy(uint64(remain)) return err } if err := e.nic.WritePacket(r, pkt); err != nil { stats.OutgoingPacketErrors.Increment() return err } stats.PacketsSent.Increment() return nil } // WriteHeaderIncludedPacket implements stack.NetworkEndpoint. func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) tcpip.Error { // The packet already has an IP header, but there are a few required checks. h, ok := pkt.Data().PullUp(header.IPv6MinimumSize) if !ok { return &tcpip.ErrMalformedHeader{} } ipH := header.IPv6(h) // Always set the payload length. pktSize := pkt.Data().Size() ipH.SetPayloadLength(uint16(pktSize - header.IPv6MinimumSize)) // Set the source address when zero. 
if ipH.SourceAddress() == header.IPv6Any { ipH.SetSourceAddress(r.LocalAddress()) } // Populate the packet buffer's network header and don't allow an invalid // packet to be sent. // // Note that parsing only makes sure that the packet is well formed as per the // wire format. We also want to check if the header's fields are valid before // sending the packet. proto, _, _, _, ok := parse.IPv6(pkt) if !ok || !header.IPv6(pkt.NetworkHeader().Slice()).IsValid(pktSize) { return &tcpip.ErrMalformedHeader{} } return e.writePacket(r, pkt, proto, true /* headerIncluded */) } func validateAddressesForForwarding(h header.IPv6) ip.ForwardingError { srcAddr := h.SourceAddress() // As per RFC 4291 section 2.5.2, // // The address 0:0:0:0:0:0:0:0 is called the unspecified address. It // must never be assigned to any node. It indicates the absence of an // address. One example of its use is in the Source Address field of // any IPv6 packets sent by an initializing host before it has learned // its own address. // // The unspecified address must not be used as the destination address // of IPv6 packets or in IPv6 Routing headers. An IPv6 packet with a // source address of unspecified must never be forwarded by an IPv6 // router. if srcAddr.Unspecified() { return &ip.ErrInitializingSourceAddress{} } // As per RFC 4291 section 2.5.6, // // Routers must not forward any packets with Link-Local source or // destination addresses to other links. if header.IsV6LinkLocalUnicastAddress(srcAddr) { return &ip.ErrLinkLocalSourceAddress{} } if dstAddr := h.DestinationAddress(); header.IsV6LinkLocalUnicastAddress(dstAddr) || header.IsV6LinkLocalMulticastAddress(dstAddr) { return &ip.ErrLinkLocalDestinationAddress{} } return nil } // forwardUnicastPacket attempts to forward a unicast packet to its final // destination. func (e *endpoint) forwardUnicastPacket(pkt *stack.PacketBuffer) ip.ForwardingError { h := header.IPv6(pkt.NetworkHeader().Slice()) if err := validateAddressesForForwarding(h); err != nil { return err } hopLimit := h.HopLimit() if hopLimit <= 1 { // As per RFC 4443 section 3.3, // // If a router receives a packet with a Hop Limit of zero, or if a // router decrements a packet's Hop Limit to zero, it MUST discard the // packet and originate an ICMPv6 Time Exceeded message with Code 0 to // the source of the packet. This indicates either a routing loop or // too small an initial Hop Limit value. // // We return the original error rather than the result of returning // the ICMP packet because the original error is more relevant to // the caller. _ = e.protocol.returnError(&icmpReasonHopLimitExceeded{}, pkt, false /* deliveredLocally */) return &ip.ErrTTLExceeded{} } stk := e.protocol.stack dstAddr := h.DestinationAddress() // Check if the destination is owned by the stack. if ep := e.protocol.findEndpointWithAddress(dstAddr); ep != nil { inNicName := stk.FindNICNameFromID(e.nic.ID()) outNicName := stk.FindNICNameFromID(ep.nic.ID()) if ok := stk.IPTables().CheckForward(pkt, inNicName, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesForwardDropped.Increment() return nil } // The packet originally arrived on e so provide its NIC as the input NIC. ep.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) return nil } // Check extension headers for any errors requiring action during forwarding. 
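//
// Illustrative sketch (not part of the original source): when an unrecognized
// option is met while processing Hop-by-Hop or Destination options, the two
// highest-order bits of the Option Type dictate the action (RFC 8200 section
// 4.2). This is also what decides whether a Parameter Problem may be sent to a
// multicast destination (see icmpReasonParameterProblem above). With a
// hypothetical optionType byte:
//
//	// 00: skip the option and continue;
//	// 01: discard the packet;
//	// 10: discard and send Parameter Problem, Code 2;
//	// 11: discard and send Parameter Problem, Code 2, only if the
//	//     destination was not a multicast address.
//	action := optionType >> 6
//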
if err := e.processExtensionHeaders(h, pkt, true /* forwarding */); err != nil { return &ip.ErrParameterProblem{} } r, err := stk.FindRoute(0, tcpip.Address{}, dstAddr, ProtocolNumber, false /* multicastLoop */) switch err.(type) { case nil: // TODO(https://gvisor.dev/issues/8105): We should not observe ErrHostUnreachable from route // lookups. case *tcpip.ErrHostUnreachable, *tcpip.ErrNetworkUnreachable: // We return the original error rather than the result of returning the // ICMP packet because the original error is more relevant to the caller. _ = e.protocol.returnError(&icmpReasonNetUnreachable{}, pkt, false /* deliveredLocally */) return &ip.ErrHostUnreachable{} default: return &ip.ErrOther{Err: err} } defer r.Release() return e.forwardPacketWithRoute(r, pkt) } // forwardPacketWithRoute emits the pkt using the provided route. // // This method should be invoked by the endpoint that received the pkt. func (e *endpoint) forwardPacketWithRoute(route *stack.Route, pkt *stack.PacketBuffer) ip.ForwardingError { h := header.IPv6(pkt.NetworkHeader().Slice()) stk := e.protocol.stack inNicName := stk.FindNICNameFromID(e.nic.ID()) outNicName := stk.FindNICNameFromID(route.NICID()) if ok := stk.IPTables().CheckForward(pkt, inNicName, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesForwardDropped.Increment() return nil } hopLimit := h.HopLimit() // We need to do a deep copy of the IP packet because // WriteHeaderIncludedPacket takes ownership of the packet buffer, but we do // not own it. newPkt := pkt.DeepCopyForForwarding(int(route.MaxHeaderLength())) defer newPkt.DecRef() newHdr := header.IPv6(newPkt.NetworkHeader().Slice()) // As per RFC 8200 section 3, // // Hop Limit 8-bit unsigned integer. Decremented by 1 by // each node that forwards the packet. newHdr.SetHopLimit(hopLimit - 1) forwardToEp, ok := e.protocol.getEndpointForNIC(route.NICID()) if !ok { // The interface was removed after we obtained the route. return &ip.ErrUnknownOutputEndpoint{} } switch err := forwardToEp.writePacket(route, newPkt, newPkt.TransportProtocolNumber, true /* headerIncluded */); err.(type) { case nil: return nil case *tcpip.ErrMessageTooLong: // As per RFC 4443, section 3.2: // A Packet Too Big MUST be sent by a router in response to a packet that // it cannot forward because the packet is larger than the MTU of the // outgoing link. _ = e.protocol.returnError(&icmpReasonPacketTooBig{}, pkt, false /* deliveredLocally */) return &ip.ErrMessageTooLong{} case *tcpip.ErrNoBufferSpace: return &ip.ErrOutgoingDeviceNoBufferSpace{} default: return &ip.ErrOther{Err: err} } } // HandlePacket is called by the link layer when new ipv6 packets arrive for // this endpoint. 
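//
// The packet is parsed and validated, checked against the martian-loopback
// and HandleLocal rules, run through the iptables Prerouting hook (unless it
// arrived on a loopback interface), and then passed to handleValidatedPacket
// for local delivery and/or forwarding.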
func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { stats := e.stats.ip stats.PacketsReceived.Increment() if !e.isEnabled() { stats.DisabledPacketsReceived.Increment() return } hView, ok := e.protocol.parseAndValidate(pkt) if !ok { stats.MalformedPacketsReceived.Increment() return } defer hView.Release() h := header.IPv6(hView.AsSlice()) if !checkV4Mapped(h, stats) { return } if !e.nic.IsLoopback() { if !e.protocol.options.AllowExternalLoopbackTraffic { if header.IsV6LoopbackAddress(h.SourceAddress()) { stats.InvalidSourceAddressesReceived.Increment() return } if header.IsV6LoopbackAddress(h.DestinationAddress()) { stats.InvalidDestinationAddressesReceived.Increment() return } } if e.protocol.stack.HandleLocal() { addressEndpoint := e.AcquireAssignedAddress(header.IPv6(pkt.NetworkHeader().Slice()).SourceAddress(), e.nic.Promiscuous(), stack.CanBePrimaryEndpoint, true /* readOnly */) if addressEndpoint != nil { // The source address is one of our own, so we never should have gotten // a packet like this unless HandleLocal is false or our NIC is the // loopback interface. stats.InvalidSourceAddressesReceived.Increment() return } } // Loopback traffic skips the prerouting chain. inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().CheckPrerouting(pkt, e, inNicName); !ok { // iptables is telling us to drop the packet. stats.IPTablesPreroutingDropped.Increment() return } } e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) } // handleLocalPacket is like HandlePacket except it does not perform the // prerouting iptables hook or check for loopback traffic that originated from // outside of the netstack (i.e. martian loopback packets). func (e *endpoint) handleLocalPacket(pkt *stack.PacketBuffer, canSkipRXChecksum bool) { stats := e.stats.ip stats.PacketsReceived.Increment() pkt = pkt.CloneToInbound() defer pkt.DecRef() pkt.RXChecksumValidated = canSkipRXChecksum hView, ok := e.protocol.parseAndValidate(pkt) if !ok { stats.MalformedPacketsReceived.Increment() return } defer hView.Release() h := header.IPv6(hView.AsSlice()) if !checkV4Mapped(h, stats) { return } e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) } // forwardMulticastPacket validates a multicast pkt and attempts to forward it. // // This method should be invoked for incoming multicast packets using the // endpoint that received the packet. func (e *endpoint) forwardMulticastPacket(h header.IPv6, pkt *stack.PacketBuffer) ip.ForwardingError { if err := validateAddressesForForwarding(h); err != nil { return err } // Check extension headers for any errors. if err := e.processExtensionHeaders(h, pkt, true /* forwarding */); err != nil { return &ip.ErrParameterProblem{} } routeKey := stack.UnicastSourceAndMulticastDestination{ Source: h.SourceAddress(), Destination: h.DestinationAddress(), } // The pkt has been validated. Consequently, if a route is not found, then // the pkt can safely be queued. result, hasBufferSpace := e.protocol.multicastRouteTable.GetRouteOrInsertPending(routeKey, pkt) if !hasBufferSpace { // Unable to queue the pkt. Silently drop it. return &ip.ErrNoMulticastPendingQueueBufferSpace{} } switch result.GetRouteResultState { case multicast.InstalledRouteFound: // Attempt to forward the pkt using an existing route. 
return e.forwardValidatedMulticastPacket(pkt, result.InstalledRoute) case multicast.NoRouteFoundAndPendingInserted: e.emitMulticastEvent(func(disp stack.MulticastForwardingEventDispatcher) { disp.OnMissingRoute(stack.MulticastPacketContext{ stack.UnicastSourceAndMulticastDestination{h.SourceAddress(), h.DestinationAddress()}, e.nic.ID(), }) }) case multicast.PacketQueuedInPendingRoute: default: panic(fmt.Sprintf("unexpected GetRouteResultState: %s", result.GetRouteResultState)) } return &ip.ErrHostUnreachable{} } // forwardValidatedMulticastPacket attempts to forward the pkt using the // provided installedRoute. // // This method should be invoked by the endpoint that received the pkt. func (e *endpoint) forwardValidatedMulticastPacket(pkt *stack.PacketBuffer, installedRoute *multicast.InstalledRoute) ip.ForwardingError { // Per RFC 1812 section 5.2.1.3, // // Based on the IP source and destination addresses found in the datagram // header, the router determines whether the datagram has been received // on the proper interface for forwarding. If not, the datagram is // dropped silently. if e.nic.ID() != installedRoute.ExpectedInputInterface { h := header.IPv6(pkt.NetworkHeader().Slice()) e.emitMulticastEvent(func(disp stack.MulticastForwardingEventDispatcher) { disp.OnUnexpectedInputInterface(stack.MulticastPacketContext{ stack.UnicastSourceAndMulticastDestination{h.SourceAddress(), h.DestinationAddress()}, e.nic.ID(), }, installedRoute.ExpectedInputInterface) }) return &ip.ErrUnexpectedMulticastInputInterface{} } for _, outgoingInterface := range installedRoute.OutgoingInterfaces { if err := e.forwardMulticastPacketForOutgoingInterface(pkt, outgoingInterface); err != nil { e.handleForwardingError(err) continue } // The pkt was successfully forwarded. Mark the route as used. installedRoute.SetLastUsedTimestamp(e.protocol.stack.Clock().NowMonotonic()) } return nil } // forwardMulticastPacketForOutgoingInterface attempts to forward the pkt out // of the provided outgoing interface. // // This method should be invoked by the endpoint that received the pkt. func (e *endpoint) forwardMulticastPacketForOutgoingInterface(pkt *stack.PacketBuffer, outgoingInterface stack.MulticastRouteOutgoingInterface) ip.ForwardingError { h := header.IPv6(pkt.NetworkHeader().Slice()) // Per RFC 1812 section 5.2.1.3, // // A copy of the multicast datagram is forwarded out each outgoing // interface whose minimum TTL value is less than or equal to the TTL // value in the datagram header. // // Copying of the packet is deferred to forwardPacketWithRoute since unicast // and multicast both require a copy. if outgoingInterface.MinTTL > h.HopLimit() { return &ip.ErrTTLExceeded{} } route := e.protocol.stack.NewRouteForMulticast(outgoingInterface.ID, h.DestinationAddress(), e.NetworkProtocolNumber()) if route == nil { // Failed to convert to a stack.Route. This likely means that the outgoing // endpoint no longer exists. return &ip.ErrHostUnreachable{} } defer route.Release() return e.forwardPacketWithRoute(route, pkt) } // handleForwardingError processes the provided err and increments any relevant // counters. 
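//
// Each recognized ip.ForwardingError maps to exactly one Forwarding.* stat
// counter, and every counted error also increments Forwarding.Errors; an
// unrecognized error type indicates a programming bug and panics.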
func (e *endpoint) handleForwardingError(err ip.ForwardingError) { stats := e.stats.ip switch err := err.(type) { case nil: return case *ip.ErrInitializingSourceAddress: stats.Forwarding.InitializingSource.Increment() case *ip.ErrLinkLocalSourceAddress: stats.Forwarding.LinkLocalSource.Increment() case *ip.ErrLinkLocalDestinationAddress: stats.Forwarding.LinkLocalDestination.Increment() case *ip.ErrTTLExceeded: stats.Forwarding.ExhaustedTTL.Increment() case *ip.ErrHostUnreachable: stats.Forwarding.Unrouteable.Increment() case *ip.ErrParameterProblem: stats.Forwarding.ExtensionHeaderProblem.Increment() case *ip.ErrMessageTooLong: stats.Forwarding.PacketTooBig.Increment() case *ip.ErrNoMulticastPendingQueueBufferSpace: stats.Forwarding.NoMulticastPendingQueueBufferSpace.Increment() case *ip.ErrUnexpectedMulticastInputInterface: stats.Forwarding.UnexpectedMulticastInputInterface.Increment() case *ip.ErrUnknownOutputEndpoint: stats.Forwarding.UnknownOutputEndpoint.Increment() case *ip.ErrOutgoingDeviceNoBufferSpace: stats.Forwarding.OutgoingDeviceNoBufferSpace.Increment() default: panic(fmt.Sprintf("unrecognized forwarding error: %s", err)) } stats.Forwarding.Errors.Increment() } func (e *endpoint) handleValidatedPacket(h header.IPv6, pkt *stack.PacketBuffer, inNICName string) { pkt.NICID = e.nic.ID() // Raw socket packets are delivered based solely on the transport protocol // number. We only require that the packet be valid IPv6. e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt) stats := e.stats.ip stats.ValidPacketsReceived.Increment() srcAddr := h.SourceAddress() dstAddr := h.DestinationAddress() // As per RFC 4291 section 2.7: // Multicast addresses must not be used as source addresses in IPv6 // packets or appear in any Routing header. if header.IsV6MulticastAddress(srcAddr) { stats.InvalidSourceAddressesReceived.Increment() return } if header.IsV6MulticastAddress(dstAddr) { // Handle all packets destined to a multicast address separately. Unlike // unicast, these packets can be both delivered locally and forwarded. See // RFC 1812 section 5.2.3 for details regarding the forwarding/local // delivery decision. multicastForwading := e.MulticastForwarding() && e.protocol.multicastForwarding() if multicastForwading { e.handleForwardingError(e.forwardMulticastPacket(h, pkt)) } if e.IsInGroup(dstAddr) { e.deliverPacketLocally(h, pkt, inNICName) return } if !multicastForwading { // Only consider the destination address invalid if we didn't attempt to // forward the pkt and it was not delivered locally. stats.InvalidDestinationAddressesReceived.Increment() } return } // The destination address should be an address we own for us to receive the // packet. Otherwise, attempt to forward the packet. if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint, true /* readOnly */); addressEndpoint != nil { e.deliverPacketLocally(h, pkt, inNICName) } else if e.Forwarding() { e.handleForwardingError(e.forwardUnicastPacket(pkt)) } else { stats.InvalidDestinationAddressesReceived.Increment() } } func (e *endpoint) deliverPacketLocally(h header.IPv6, pkt *stack.PacketBuffer, inNICName string) { stats := e.stats.ip // iptables filtering. All packets that reach here are intended for // this machine and need not be forwarded. if ok := e.protocol.stack.IPTables().CheckInput(pkt, inNICName); !ok { // iptables is telling us to drop the packet. 
stats.IPTablesInputDropped.Increment() return } // Any returned error is only useful for terminating execution early, but // we have nothing left to do, so we can drop it. _ = e.processExtensionHeaders(h, pkt, false /* forwarding */) } func (e *endpoint) processExtensionHeader(it *header.IPv6PayloadIterator, pkt **stack.PacketBuffer, h header.IPv6, routerAlert **header.IPv6RouterAlertOption, hasFragmentHeader *bool, forwarding bool) (bool, error) { stats := e.stats.ip dstAddr := h.DestinationAddress() // Keep track of the start of the previous header so we can report the // special case of a Hop by Hop at a location other than at the start. previousHeaderStart := it.HeaderOffset() extHdr, done, err := it.Next() if err != nil { stats.MalformedPacketsReceived.Increment() return true, err } if done { return true, nil } defer extHdr.Release() // As per RFC 8200, section 4: // // Extension headers (except for the Hop-by-Hop Options header) are // not processed, inserted, or deleted by any node along a packet's // delivery path until the packet reaches the node identified in the // Destination Address field of the IPv6 header. // // Furthermore, as per RFC 8200 section 4.1, the Hop By Hop extension // header is restricted to appear first in the list of extension headers. // // Therefore, we can immediately return once we hit any header other // than the Hop-by-Hop header while forwarding a packet. if forwarding { if _, ok := extHdr.(header.IPv6HopByHopOptionsExtHdr); !ok { return true, nil } } switch extHdr := extHdr.(type) { case header.IPv6HopByHopOptionsExtHdr: if err := e.processIPv6HopByHopOptionsExtHdr(&extHdr, it, *pkt, dstAddr, routerAlert, previousHeaderStart, forwarding); err != nil { return true, err } case header.IPv6RoutingExtHdr: if err := e.processIPv6RoutingExtHeader(&extHdr, it, *pkt); err != nil { return true, err } case header.IPv6FragmentExtHdr: *hasFragmentHeader = true if extHdr.IsAtomic() { // This fragment extension header indicates that this packet is an // atomic fragment. An atomic fragment is a fragment that contains // all the data required to reassemble a full packet. As per RFC 6946, // atomic fragments must not interfere with "normal" fragmented traffic // so we skip processing the fragment instead of feeding it through the // reassembly process below. return false, nil } if err := e.processFragmentExtHdr(&extHdr, it, pkt, h); err != nil { return true, err } case header.IPv6DestinationOptionsExtHdr: if err := e.processIPv6DestinationOptionsExtHdr(&extHdr, it, *pkt, dstAddr); err != nil { return true, err } case header.IPv6RawPayloadHeader: if err := e.processIPv6RawPayloadHeader(&extHdr, it, *pkt, *routerAlert, previousHeaderStart, *hasFragmentHeader); err != nil { return true, err } default: // Since the iterator returns IPv6RawPayloadHeader for unknown Extension // Header IDs this should never happen unless we missed a supported type // here. panic(fmt.Sprintf("unrecognized type from it.Next() = %T", extHdr)) } return false, nil } // processExtensionHeaders processes the extension headers in the given packet. // Returns an error if the processing of a header failed or if the packet should // be discarded. func (e *endpoint) processExtensionHeaders(h header.IPv6, pkt *stack.PacketBuffer, forwarding bool) error { // Create a VV to parse the packet. We don't plan to modify anything here. // vv consists of: // - Any IPv6 header bytes after the first 40 (i.e. extensions). // - The transport header, if present. // - Any other payload data. 
v := pkt.NetworkHeader().View() if v != nil { v.TrimFront(header.IPv6MinimumSize) } buf := buffer.MakeWithView(v) buf.Append(pkt.TransportHeader().View()) dataBuf := pkt.Data().ToBuffer() buf.Merge(&dataBuf) it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), buf) // Add a reference to pkt because fragment header processing can replace this // packet with a new one that has an extra reference. Adding a reference here // keeps the two in parity so they can both be DecRef'd the same way. pkt.IncRef() defer func() { pkt.DecRef() it.Release() }() var ( hasFragmentHeader bool routerAlert *header.IPv6RouterAlertOption ) for { if done, err := e.processExtensionHeader(&it, &pkt, h, &routerAlert, &hasFragmentHeader, forwarding); err != nil || done { return err } } } func (e *endpoint) processIPv6RawPayloadHeader(extHdr *header.IPv6RawPayloadHeader, it *header.IPv6PayloadIterator, pkt *stack.PacketBuffer, routerAlert *header.IPv6RouterAlertOption, previousHeaderStart uint32, hasFragmentHeader bool) error { stats := e.stats.ip // If the last header in the payload isn't a known IPv6 extension header, // handle it as if it is transport layer data. // Calculate the number of octets parsed from data. We want to consume all // the data except the unparsed portion located at the end, whose size is // extHdr.Buf.Size(). trim := pkt.Data().Size() - int(extHdr.Buf.Size()) // For unfragmented packets, extHdr still contains the transport header. // Consume that too. // // For reassembled fragments, pkt.TransportHeader is unset, so this is a // no-op and pkt.Data begins with the transport header. trim += len(pkt.TransportHeader().Slice()) if _, ok := pkt.Data().Consume(trim); !ok { stats.MalformedPacketsReceived.Increment() return fmt.Errorf("could not consume %d bytes", trim) } proto := tcpip.TransportProtocolNumber(extHdr.Identifier) // If the packet was reassembled from a fragment, it will not have a // transport header set yet. if len(pkt.TransportHeader().Slice()) == 0 { e.protocol.parseTransport(pkt, proto) } stats.PacketsDelivered.Increment() if proto == header.ICMPv6ProtocolNumber { e.handleICMP(pkt, hasFragmentHeader, routerAlert) return nil } switch res := e.dispatcher.DeliverTransportPacket(proto, pkt); res { case stack.TransportPacketHandled: return nil case stack.TransportPacketDestinationPortUnreachable: // As per RFC 4443 section 3.1: // A destination node SHOULD originate a Destination Unreachable // message with Code 4 in response to a packet for which the // transport protocol (e.g., UDP) has no listener, if that transport // protocol has no alternative means to inform the sender. _ = e.protocol.returnError(&icmpReasonPortUnreachable{}, pkt, true /* deliveredLocally */) return fmt.Errorf("destination port unreachable") case stack.TransportPacketProtocolUnreachable: // As per RFC 8200 section 4. (page 7): // Extension headers are numbered from IANA IP Protocol Numbers // [IANA-PN], the same values used for IPv4 and IPv6. When // processing a sequence of Next Header values in a packet, the // first one that is not an extension header [IANA-EH] indicates // that the next item in the packet is the corresponding upper-layer // header. 
// With more related information on page 8: // If, as a result of processing a header, the destination node is // required to proceed to the next header but the Next Header value // in the current header is unrecognized by the node, it should // discard the packet and send an ICMP Parameter Problem message to // the source of the packet, with an ICMP Code value of 1 // ("unrecognized Next Header type encountered") and the ICMP // Pointer field containing the offset of the unrecognized value // within the original packet. // // Which when taken together indicate that an unknown protocol should // be treated as an unrecognized next header value. // The location of the Next Header field is in a different place in // the initial IPv6 header than it is in the extension headers so // treat it specially. prevHdrIDOffset := uint32(header.IPv6NextHeaderOffset) if previousHeaderStart != 0 { prevHdrIDOffset = previousHeaderStart } _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownHeader, pointer: prevHdrIDOffset, }, pkt, true /* deliveredLocally */) return fmt.Errorf("transport protocol unreachable") default: panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res)) } } func (e *endpoint) processIPv6RoutingExtHeader(extHdr *header.IPv6RoutingExtHdr, it *header.IPv6PayloadIterator, pkt *stack.PacketBuffer) error { // As per RFC 8200 section 4.4, if a node encounters a routing header with // an unrecognized routing type value, with a non-zero Segments Left // value, the node must discard the packet and send an ICMP Parameter // Problem, Code 0 to the packet's Source Address, pointing to the // unrecognized Routing Type. // // If the Segments Left is 0, the node must ignore the Routing extension // header and process the next header in the packet. // // Note, the stack does not yet handle any type of routing extension // header, so we just make sure Segments Left is zero before processing // the next extension header. if extHdr.SegmentsLeft() == 0 { return nil } _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6ErroneousHeader, pointer: it.ParseOffset(), }, pkt, true /* deliveredLocally */) return fmt.Errorf("found unrecognized routing type with non-zero segments left in header = %#v", extHdr) } func (e *endpoint) processIPv6DestinationOptionsExtHdr(extHdr *header.IPv6DestinationOptionsExtHdr, it *header.IPv6PayloadIterator, pkt *stack.PacketBuffer, dstAddr tcpip.Address) error { stats := e.stats.ip optsIt := extHdr.Iter() var uopt *header.IPv6UnknownExtHdrOption defer func() { if uopt != nil { uopt.Data.Release() } }() for { opt, done, err := optsIt.Next() if err != nil { stats.MalformedPacketsReceived.Increment() return err } if uo, ok := opt.(*header.IPv6UnknownExtHdrOption); ok { uopt = uo } if done { break } // We currently do not support any IPv6 Destination extension header // options. 
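// As per RFC 8200 section 4.2, the two highest-order bits of an option's
// Option Type encode what to do with an unrecognized option:
//
//	00 - skip over the option and continue processing the header
//	01 - discard the packet
//	10 - discard the packet and send an ICMP Parameter Problem, Code 2,
//	     message, even if the destination was a multicast address
//	11 - discard the packet and send an ICMP Parameter Problem, Code 2,
//	     message only if the destination was not a multicast address
//
// opt.UnknownAction() exposes these bits as the
// header.IPv6OptionUnknownAction* values handled below.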
switch opt.UnknownAction() { case header.IPv6OptionUnknownActionSkip: case header.IPv6OptionUnknownActionDiscard: return fmt.Errorf("found unknown destination header option = %#v with discard action", opt) case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest: if header.IsV6MulticastAddress(dstAddr) { if uo, ok := opt.(*header.IPv6UnknownExtHdrOption); ok { uopt = uo } return fmt.Errorf("found unknown destination header option %#v with discard action", opt) } fallthrough case header.IPv6OptionUnknownActionDiscardSendICMP: // This case satisfies a requirement of RFC 8200 section 4.2 // which states that an unknown option starting with bits [10] should: // // discard the packet and, regardless of whether or not the // packet's Destination Address was a multicast address, send an // ICMP Parameter Problem, Code 2, message to the packet's // Source Address, pointing to the unrecognized Option Type. // _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownOption, pointer: it.ParseOffset() + optsIt.OptionOffset(), respondToMulticast: true, }, pkt, true /* deliveredLocally */) return fmt.Errorf("found unknown destination header option %#v with discard action", opt) default: panic(fmt.Sprintf("unrecognized action for an unrecognized Destination extension header option = %#v", opt)) } if uopt != nil { uopt.Data.Release() uopt = nil } } return nil } func (e *endpoint) processIPv6HopByHopOptionsExtHdr(extHdr *header.IPv6HopByHopOptionsExtHdr, it *header.IPv6PayloadIterator, pkt *stack.PacketBuffer, dstAddr tcpip.Address, routerAlert **header.IPv6RouterAlertOption, previousHeaderStart uint32, forwarding bool) error { stats := e.stats.ip // As per RFC 8200 section 4.1, the Hop By Hop extension header is // restricted to appear immediately after an IPv6 fixed header. if previousHeaderStart != 0 { _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownHeader, pointer: previousHeaderStart, }, pkt, !forwarding /* deliveredLocally */) return fmt.Errorf("found Hop-by-Hop header = %#v with non-zero previous header offset = %d", extHdr, previousHeaderStart) } optsIt := extHdr.Iter() var uopt *header.IPv6UnknownExtHdrOption defer func() { if uopt != nil { uopt.Data.Release() } }() for { opt, done, err := optsIt.Next() if err != nil { stats.MalformedPacketsReceived.Increment() return err } if uo, ok := opt.(*header.IPv6UnknownExtHdrOption); ok { uopt = uo } if done { break } switch opt := opt.(type) { case *header.IPv6RouterAlertOption: if *routerAlert != nil { // As per RFC 2711 section 3, there should be at most one Router // Alert option per packet. // // There MUST only be one option of this type, regardless of // value, per Hop-by-Hop header. 
stats.MalformedPacketsReceived.Increment() return fmt.Errorf("found multiple Router Alert options (%#v, %#v)", opt, *routerAlert) } *routerAlert = opt stats.OptionRouterAlertReceived.Increment() default: switch opt.UnknownAction() { case header.IPv6OptionUnknownActionSkip: case header.IPv6OptionUnknownActionDiscard: return fmt.Errorf("found unknown Hop-by-Hop header option = %#v with discard action", opt) case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest: if header.IsV6MulticastAddress(dstAddr) { return fmt.Errorf("found unknown hop-by-hop header option = %#v with discard action", opt) } fallthrough case header.IPv6OptionUnknownActionDiscardSendICMP: // This case satisfies a requirement of RFC 8200 section 4.2 which // states that an unknown option starting with bits [10] should: // // discard the packet and, regardless of whether or not the // packet's Destination Address was a multicast address, send an // ICMP Parameter Problem, Code 2, message to the packet's // Source Address, pointing to the unrecognized Option Type. _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownOption, pointer: it.ParseOffset() + optsIt.OptionOffset(), respondToMulticast: true, }, pkt, !forwarding /* deliveredLocally */) return fmt.Errorf("found unknown hop-by-hop header option = %#v with discard action", opt) default: panic(fmt.Sprintf("unrecognized action for an unrecognized Hop By Hop extension header option = %#v", opt)) } } if uopt != nil { uopt.Data.Release() uopt = nil } } return nil } func (e *endpoint) processFragmentExtHdr(extHdr *header.IPv6FragmentExtHdr, it *header.IPv6PayloadIterator, pkt **stack.PacketBuffer, h header.IPv6) error { stats := e.stats.ip fragmentFieldOffset := it.ParseOffset() // Don't consume the iterator if we have the first fragment because we // will use it to validate that the first fragment holds the upper layer // header. rawPayload := it.AsRawHeader(extHdr.FragmentOffset() != 0 /* consume */) defer rawPayload.Release() if extHdr.FragmentOffset() == 0 { // Check that the iterator ends with a raw payload as the first fragment // should include all headers up to and including any upper layer // headers, as per RFC 8200 section 4.5; only upper layer data // (non-headers) should follow the fragment extension header. var lastHdr header.IPv6PayloadHeader for { it, done, err := it.Next() if err != nil { stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() return err } if done { break } it.Release() lastHdr = it } // If the last header is a raw header, then the last portion of the IPv6 // payload is not a known IPv6 extension header. Note, this does not // mean that the last portion is an upper layer header or not an // extension header because: // 1) we do not yet support all extension headers // 2) we do not validate the upper layer header before reassembling. // // This check makes sure that a known IPv6 extension header is not // present after the Fragment extension header in a non-initial // fragment. // // TODO(#2196): Support IPv6 Authentication and Encapsulated // Security Payload extension headers. // TODO(#2333): Validate that the upper layer header is valid. 
switch lastHdr.(type) { case header.IPv6RawPayloadHeader: default: stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() return fmt.Errorf("known extension header = %#v present after fragment header in a non-initial fragment", lastHdr) } } fragmentPayloadLen := rawPayload.Buf.Size() if fragmentPayloadLen == 0 { // Drop the packet as it's marked as a fragment but has no payload. stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() return fmt.Errorf("fragment has no payload") } // As per RFC 2460 Section 4.5: // // If the length of a fragment, as derived from the fragment packet's // Payload Length field, is not a multiple of 8 octets and the M flag // of that fragment is 1, then that fragment must be discarded and an // ICMP Parameter Problem, Code 0, message should be sent to the source // of the fragment, pointing to the Payload Length field of the // fragment packet. if extHdr.More() && fragmentPayloadLen%header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit != 0 { stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6ErroneousHeader, pointer: header.IPv6PayloadLenOffset, }, *pkt, true /* deliveredLocally */) return fmt.Errorf("found fragment length = %d that is not a multiple of 8 octets", fragmentPayloadLen) } // The packet is a fragment, let's try to reassemble it. start := extHdr.FragmentOffset() * header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit // As per RFC 2460 Section 4.5: // // If the length and offset of a fragment are such that the Payload // Length of the packet reassembled from that fragment would exceed // 65,535 octets, then that fragment must be discarded and an ICMP // Parameter Problem, Code 0, message should be sent to the source of // the fragment, pointing to the Fragment Offset field of the fragment // packet. lengthAfterReassembly := int(start) + int(fragmentPayloadLen) if lengthAfterReassembly > header.IPv6MaximumPayloadSize { stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6ErroneousHeader, pointer: fragmentFieldOffset, }, *pkt, true /* deliveredLocally */) return fmt.Errorf("determined that reassembled packet length = %d would exceed allowed length = %d", lengthAfterReassembly, header.IPv6MaximumPayloadSize) } // Note that pkt doesn't have its transport header set after reassembly, // and won't until DeliverNetworkPacket sets it. resPkt, proto, ready, err := e.protocol.fragmentation.Process( // IPv6 ignores the Protocol field since the ID only needs to be unique // across source-destination pairs, as per RFC 8200 section 4.5. fragmentation.FragmentID{ Source: h.SourceAddress(), Destination: h.DestinationAddress(), ID: extHdr.ID(), }, start, start+uint16(fragmentPayloadLen)-1, extHdr.More(), uint8(rawPayload.Identifier), *pkt, ) if err != nil { stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() return err } if ready { // We create a new iterator with the reassembled packet because we could // have more extension headers in the reassembled payload, as per RFC // 8200 section 4.5. We also use the NextHeader value from the first // fragment. 
it.Release() *it = header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(proto), resPkt.Data().ToBuffer()) (*pkt).DecRef() *pkt = resPkt } return nil } // Close cleans up resources associated with the endpoint. func (e *endpoint) Close() { e.mu.Lock() e.disableLocked() e.mu.addressableEndpointState.Cleanup() e.mu.Unlock() e.protocol.forgetEndpoint(e.nic.ID()) } // NetworkProtocolNumber implements stack.NetworkEndpoint. func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { return e.protocol.Number() } // AddAndAcquirePermanentAddress implements stack.AddressableEndpoint. func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, properties stack.AddressProperties) (stack.AddressEndpoint, tcpip.Error) { // TODO(b/169350103): add checks here after making sure we no longer receive // an empty address. e.mu.Lock() defer e.mu.Unlock() // The dance of registering the dispatcher after adding the address makes it // so that the tentative state is skipped if DAD is disabled. addrDisp := properties.Disp properties.Disp = nil addressEndpoint, err := e.addAndAcquirePermanentAddressLocked(addr, properties) if addrDisp != nil && err == nil { addressEndpoint.RegisterDispatcher(addrDisp) } return addressEndpoint, err } // addAndAcquirePermanentAddressLocked is like AddAndAcquirePermanentAddress but // with locking requirements. // // addAndAcquirePermanentAddressLocked also joins the passed address's // solicited-node multicast group and start duplicate address detection. // // Precondition: e.mu must be write locked. func (e *endpoint) addAndAcquirePermanentAddressLocked(addr tcpip.AddressWithPrefix, properties stack.AddressProperties) (stack.AddressEndpoint, tcpip.Error) { addressEndpoint, err := e.mu.addressableEndpointState.AddAndAcquireAddress(addr, properties, stack.PermanentTentative) if err != nil { return nil, err } if !header.IsV6UnicastAddress(addr.Address) { return addressEndpoint, nil } if e.Enabled() { if err := e.mu.ndp.startDuplicateAddressDetection(addr.Address, addressEndpoint); err != nil { return nil, err } } snmc := header.SolicitedNodeAddr(addr.Address) if err := e.joinGroupLocked(snmc); err != nil { // joinGroupLocked only returns an error if the group address is not a valid // IPv6 multicast address. panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", snmc, err)) } return addressEndpoint, nil } // RemovePermanentAddress implements stack.AddressableEndpoint. func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() addressEndpoint := e.getAddressRLocked(addr) if addressEndpoint == nil || !addressEndpoint.GetKind().IsPermanent() { return &tcpip.ErrBadLocalAddress{} } return e.removePermanentEndpointLocked(addressEndpoint, true /* allowSLAACInvalidation */, stack.AddressRemovalManualAction, &stack.DADAborted{}) } // removePermanentEndpointLocked is like removePermanentAddressLocked except // it works with a stack.AddressEndpoint. // // Precondition: e.mu must be write locked. func (e *endpoint) removePermanentEndpointLocked(addressEndpoint stack.AddressEndpoint, allowSLAACInvalidation bool, reason stack.AddressRemovalReason, dadResult stack.DADResult) tcpip.Error { addr := addressEndpoint.AddressWithPrefix() // If we are removing an address generated via SLAAC, cleanup // its SLAAC resources and notify the integrator. 
if addressEndpoint.ConfigType() == stack.AddressConfigSlaac { if addressEndpoint.Temporary() { e.mu.ndp.cleanupTempSLAACAddrResourcesAndNotify(addr) } else { e.mu.ndp.cleanupSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation) } } return e.removePermanentEndpointInnerLocked(addressEndpoint, reason, dadResult) } // removePermanentEndpointInnerLocked is like removePermanentEndpointLocked // except it does not cleanup SLAAC address state. // // Precondition: e.mu must be write locked. func (e *endpoint) removePermanentEndpointInnerLocked(addressEndpoint stack.AddressEndpoint, reason stack.AddressRemovalReason, dadResult stack.DADResult) tcpip.Error { addr := addressEndpoint.AddressWithPrefix() e.mu.ndp.stopDuplicateAddressDetection(addr.Address, dadResult) if err := e.mu.addressableEndpointState.RemovePermanentEndpoint(addressEndpoint, reason); err != nil { return err } snmc := header.SolicitedNodeAddr(addr.Address) err := e.leaveGroupLocked(snmc) // The endpoint may have already left the multicast group. if _, ok := err.(*tcpip.ErrBadLocalAddress); ok { err = nil } return err } // hasPermanentAddressLocked returns true if the endpoint has a permanent // address equal to the passed address. // // Precondition: e.mu must be read or write locked. func (e *endpoint) hasPermanentAddressRLocked(addr tcpip.Address) bool { addressEndpoint := e.getAddressRLocked(addr) if addressEndpoint == nil { return false } return addressEndpoint.GetKind().IsPermanent() } // getAddressRLocked returns the endpoint for the passed address. // // Precondition: e.mu must be read or write locked. func (e *endpoint) getAddressRLocked(localAddr tcpip.Address) stack.AddressEndpoint { return e.mu.addressableEndpointState.GetAddress(localAddr) } // SetDeprecated implements stack.AddressableEndpoint. func (e *endpoint) SetDeprecated(addr tcpip.Address, deprecated bool) tcpip.Error { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.SetDeprecated(addr, deprecated) } // SetLifetimes implements stack.AddressableEndpoint. func (e *endpoint) SetLifetimes(addr tcpip.Address, lifetimes stack.AddressLifetimes) tcpip.Error { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.SetLifetimes(addr, lifetimes) } // MainAddress implements stack.AddressableEndpoint. func (e *endpoint) MainAddress() tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.MainAddress() } // AcquireAssignedAddress implements stack.AddressableEndpoint. func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior, readOnly bool) stack.AddressEndpoint { e.mu.RLock() defer e.mu.RUnlock() return e.acquireAddressOrCreateTempLocked(localAddr, allowTemp, tempPEB, readOnly) } // acquireAddressOrCreateTempLocked is like AcquireAssignedAddress but with // locking requirements. // // Precondition: e.mu must be write locked. func (e *endpoint) acquireAddressOrCreateTempLocked(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior, readOnly bool) stack.AddressEndpoint { return e.mu.addressableEndpointState.AcquireAssignedAddress(localAddr, allowTemp, tempPEB, readOnly) } // AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint. 
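//
// Source address selection below follows RFC 6724 section 5 rules 1-3 and
// 6-8 (same address, appropriate scope, avoid deprecated, matching label,
// prefer temporary, longest matching prefix); rules 4 and 5 are not yet
// implemented.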
func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr, srcHint tcpip.Address, allowExpired bool) stack.AddressEndpoint { e.mu.RLock() defer e.mu.RUnlock() return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, srcHint, allowExpired) } // getLinkLocalAddressRLocked returns a link-local address from the primary list // of addresses, if one is available. // // See stack.PrimaryEndpointBehavior for more details about the primary list. // // Precondition: e.mu must be read locked. func (e *endpoint) getLinkLocalAddressRLocked() tcpip.Address { var linkLocalAddr tcpip.Address e.mu.addressableEndpointState.ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { if addressEndpoint.IsAssigned(false /* allowExpired */) { if addr := addressEndpoint.AddressWithPrefix().Address; header.IsV6LinkLocalUnicastAddress(addr) { linkLocalAddr = addr return false } } return true }) return linkLocalAddr } // acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress // but with locking requirements. // // Precondition: e.mu must be read locked. func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr, srcHint tcpip.Address, allowExpired bool) stack.AddressEndpoint { // TODO(b/309216156): Support IPv6 hints. // addrCandidate is a candidate for Source Address Selection, as per // RFC 6724 section 5. type addrCandidate struct { addressEndpoint stack.AddressEndpoint addr tcpip.Address scope header.IPv6AddressScope label uint8 matchingPrefix uint8 } if remoteAddr.BitLen() == 0 { return e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, srcHint, allowExpired) } // Create a candidate set of available addresses we can potentially use as a // source address. var cs []addrCandidate e.mu.addressableEndpointState.ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { // If r is not valid for outgoing connections, it is not a valid endpoint. if !addressEndpoint.IsAssigned(allowExpired) { return true } addr := addressEndpoint.AddressWithPrefix().Address scope, err := header.ScopeForIPv6Address(addr) if err != nil { // Should never happen as we got r from the primary IPv6 endpoint list and // ScopeForIPv6Address only returns an error if addr is not an IPv6 // address. panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", addr, err)) } cs = append(cs, addrCandidate{ addressEndpoint: addressEndpoint, addr: addr, scope: scope, label: getLabel(addr), matchingPrefix: remoteAddr.MatchingPrefix(addr), }) return true }) remoteScope, err := header.ScopeForIPv6Address(remoteAddr) if err != nil { // primaryIPv6Endpoint should never be called with an invalid IPv6 address. panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err)) } remoteLabel := getLabel(remoteAddr) // Sort the addresses as per RFC 6724 section 5 rules 1-3. // // TODO(b/146021396): Implement rules 4, 5 of RFC 6724 section 5. sort.Slice(cs, func(i, j int) bool { sa := cs[i] sb := cs[j] // Prefer same address as per RFC 6724 section 5 rule 1. if sa.addr == remoteAddr { return true } if sb.addr == remoteAddr { return false } // Prefer appropriate scope as per RFC 6724 section 5 rule 2. if sa.scope < sb.scope { return sa.scope >= remoteScope } else if sb.scope < sa.scope { return sb.scope < remoteScope } // Avoid deprecated addresses as per RFC 6724 section 5 rule 3. if saDep, sbDep := sa.addressEndpoint.Deprecated(), sb.addressEndpoint.Deprecated(); saDep != sbDep { // If sa is not deprecated, it is preferred over sb. 
return sbDep } // Prefer matching label as per RFC 6724 section 5 rule 6. if sa, sb := sa.label == remoteLabel, sb.label == remoteLabel; sa != sb { if sa { return true } if sb { return false } } // Prefer temporary addresses as per RFC 6724 section 5 rule 7. if saTemp, sbTemp := sa.addressEndpoint.Temporary(), sb.addressEndpoint.Temporary(); saTemp != sbTemp { return saTemp } // Use longest matching prefix as per RFC 6724 section 5 rule 8. if sa.matchingPrefix > sb.matchingPrefix { return true } if sb.matchingPrefix > sa.matchingPrefix { return false } // sa and sb are equal, return the endpoint that is closest to the front of // the primary endpoint list. return i < j }) // Return the most preferred address that can have its reference count // incremented. for _, c := range cs { if c.addressEndpoint.TryIncRef() { return c.addressEndpoint } } return nil } // PrimaryAddresses implements stack.AddressableEndpoint. func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.PrimaryAddresses() } // PermanentAddresses implements stack.AddressableEndpoint. func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.PermanentAddresses() } // JoinGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) JoinGroup(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() return e.joinGroupLocked(addr) } // joinGroupLocked is like JoinGroup but with locking requirements. // // Precondition: e.mu must be locked. func (e *endpoint) joinGroupLocked(addr tcpip.Address) tcpip.Error { if !header.IsV6MulticastAddress(addr) { return &tcpip.ErrBadAddress{} } e.mu.mld.joinGroup(addr) return nil } // LeaveGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) LeaveGroup(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() return e.leaveGroupLocked(addr) } // leaveGroupLocked is like LeaveGroup but with locking requirements. // // Precondition: e.mu must be locked. func (e *endpoint) leaveGroupLocked(addr tcpip.Address) tcpip.Error { return e.mu.mld.leaveGroup(addr) } // IsInGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) IsInGroup(addr tcpip.Address) bool { e.mu.RLock() defer e.mu.RUnlock() return e.mu.mld.isInGroup(addr) } // Stats implements stack.NetworkEndpoint. func (e *endpoint) Stats() stack.NetworkEndpointStats { return &e.stats.localStats } var _ stack.NetworkProtocol = (*protocol)(nil) var _ stack.MulticastForwardingNetworkProtocol = (*protocol)(nil) var _ stack.RejectIPv6WithHandler = (*protocol)(nil) var _ fragmentation.TimeoutHandler = (*protocol)(nil) // +stateify savable type protocolMu struct { sync.RWMutex `state:"nosave"` // eps is keyed by NICID to allow protocol methods to retrieve an endpoint // when handling a packet, by looking at which NIC handled the packet. eps map[tcpip.NICID]*endpoint // ICMP types for which the stack's global rate limiting must apply. icmpRateLimitedTypes map[header.ICMPv6Type]struct{} // multicastForwardingDisp is the multicast forwarding event dispatcher that // an integrator can provide to receive multicast forwarding events. Note // that multicast packets will only be forwarded if this is non-nil. multicastForwardingDisp stack.MulticastForwardingEventDispatcher } // +stateify savable type protocol struct { stack *stack.Stack options Options mu protocolMu // defaultTTL is the current default TTL for the protocol. 
Only the // uint8 portion of it is meaningful. defaultTTL atomicbitops.Uint32 fragmentation *fragmentation.Fragmentation icmpRateLimiter *stack.ICMPRateLimiter multicastRouteTable multicast.RouteTable } // Number returns the ipv6 protocol number. func (p *protocol) Number() tcpip.NetworkProtocolNumber { return ProtocolNumber } // MinimumPacketSize returns the minimum valid ipv6 packet size. func (p *protocol) MinimumPacketSize() int { return header.IPv6MinimumSize } // ParseAddresses implements stack.NetworkProtocol. func (*protocol) ParseAddresses(b []byte) (src, dst tcpip.Address) { h := header.IPv6(b) return h.SourceAddress(), h.DestinationAddress() } // NewEndpoint creates a new ipv6 endpoint. func (p *protocol) NewEndpoint(nic stack.NetworkInterface, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint { e := &endpoint{ nic: nic, dispatcher: dispatcher, protocol: p, } // NDP options must be 8 octet aligned and the first 2 bytes are used for // the type and length fields leaving 6 octets as the minimum size for a // nonce option without padding. const nonceSize = 6 // As per RFC 7527 section 4.1, // // If any probe is looped back within RetransTimer milliseconds after // having sent DupAddrDetectTransmits NS(DAD) messages, the interface // continues with another MAX_MULTICAST_SOLICIT number of NS(DAD) // messages transmitted RetransTimer milliseconds apart. // // Value taken from RFC 4861 section 10. const maxMulticastSolicit = 3 dadOptions := ip.DADOptions{ Clock: p.stack.Clock(), SecureRNG: p.stack.SecureRNG().Reader, NonceSize: nonceSize, ExtendDADTransmits: maxMulticastSolicit, Protocol: &e.mu.ndp, NICID: nic.ID(), } e.mu.Lock() e.mu.addressableEndpointState.Init(e, stack.AddressableEndpointStateOptions{HiddenWhileDisabled: true}) e.mu.ndp.init(e, dadOptions) e.mu.mld.init(e) e.dad.mu.Lock() e.dad.mu.dad.Init(&e.dad.mu, p.options.DADConfigs, dadOptions) e.dad.mu.Unlock() e.mu.Unlock() stackStats := p.stack.Stats() tcpip.InitStatCounters(reflect.ValueOf(&e.stats.localStats).Elem()) e.stats.ip.Init(&e.stats.localStats.IP, &stackStats.IP) e.stats.icmp.init(&e.stats.localStats.ICMP, &stackStats.ICMP.V6) p.mu.Lock() defer p.mu.Unlock() p.mu.eps[nic.ID()] = e return e } func (p *protocol) findEndpointWithAddress(addr tcpip.Address) *endpoint { p.mu.RLock() defer p.mu.RUnlock() for _, e := range p.mu.eps { if addressEndpoint := e.AcquireAssignedAddress(addr, false /* allowTemp */, stack.NeverPrimaryEndpoint, true /* readOnly */); addressEndpoint != nil { return e } } return nil } func (p *protocol) getEndpointForNIC(id tcpip.NICID) (*endpoint, bool) { p.mu.RLock() defer p.mu.RUnlock() ep, ok := p.mu.eps[id] return ep, ok } func (p *protocol) forgetEndpoint(nicID tcpip.NICID) { p.mu.Lock() defer p.mu.Unlock() delete(p.mu.eps, nicID) } // SetOption implements stack.NetworkProtocol. func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.DefaultTTLOption: p.SetDefaultTTL(uint8(*v)) return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // Option implements stack.NetworkProtocol. func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.DefaultTTLOption: *v = tcpip.DefaultTTLOption(p.DefaultTTL()) return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // SetDefaultTTL sets the default TTL for endpoints created with this protocol. 
func (p *protocol) SetDefaultTTL(ttl uint8) { p.defaultTTL.Store(uint32(ttl)) } // DefaultTTL returns the default TTL for endpoints created with this protocol. func (p *protocol) DefaultTTL() uint8 { return uint8(p.defaultTTL.Load()) } // emitMulticastEvent emits a multicast forwarding event using the provided // generator if a valid event dispatcher exists. func (e *endpoint) emitMulticastEvent(eventGenerator func(stack.MulticastForwardingEventDispatcher)) { e.protocol.mu.RLock() defer e.protocol.mu.RUnlock() if mcastDisp := e.protocol.mu.multicastForwardingDisp; mcastDisp != nil { eventGenerator(mcastDisp) } } // Close implements stack.TransportProtocol. func (p *protocol) Close() { p.fragmentation.Release() p.multicastRouteTable.Close() } func validateUnicastSourceAndMulticastDestination(addresses stack.UnicastSourceAndMulticastDestination) tcpip.Error { if !header.IsV6UnicastAddress(addresses.Source) || header.IsV6LinkLocalUnicastAddress(addresses.Source) { return &tcpip.ErrBadAddress{} } if !header.IsV6MulticastAddress(addresses.Destination) || header.IsV6LinkLocalMulticastAddress(addresses.Destination) { return &tcpip.ErrBadAddress{} } return nil } func (p *protocol) multicastForwarding() bool { p.mu.RLock() defer p.mu.RUnlock() return p.mu.multicastForwardingDisp != nil } func (p *protocol) newInstalledRoute(route stack.MulticastRoute) (*multicast.InstalledRoute, tcpip.Error) { if len(route.OutgoingInterfaces) == 0 { return nil, &tcpip.ErrMissingRequiredFields{} } if !p.stack.HasNIC(route.ExpectedInputInterface) { return nil, &tcpip.ErrUnknownNICID{} } for _, outgoingInterface := range route.OutgoingInterfaces { if route.ExpectedInputInterface == outgoingInterface.ID { return nil, &tcpip.ErrMulticastInputCannotBeOutput{} } if !p.stack.HasNIC(outgoingInterface.ID) { return nil, &tcpip.ErrUnknownNICID{} } } return p.multicastRouteTable.NewInstalledRoute(route), nil } // AddMulticastRoute implements stack.MulticastForwardingNetworkProtocol. func (p *protocol) AddMulticastRoute(addresses stack.UnicastSourceAndMulticastDestination, route stack.MulticastRoute) tcpip.Error { if !p.multicastForwarding() { return &tcpip.ErrNotPermitted{} } if err := validateUnicastSourceAndMulticastDestination(addresses); err != nil { return err } installedRoute, err := p.newInstalledRoute(route) if err != nil { return err } pendingPackets := p.multicastRouteTable.AddInstalledRoute(addresses, installedRoute) for _, pkt := range pendingPackets { p.forwardPendingMulticastPacket(pkt, installedRoute) } return nil } // RemoveMulticastRoute implements // stack.MulticastForwardingNetworkProtocol.RemoveMulticastRoute. func (p *protocol) RemoveMulticastRoute(addresses stack.UnicastSourceAndMulticastDestination) tcpip.Error { if err := validateUnicastSourceAndMulticastDestination(addresses); err != nil { return err } if removed := p.multicastRouteTable.RemoveInstalledRoute(addresses); !removed { return &tcpip.ErrHostUnreachable{} } return nil } // MulticastRouteLastUsedTime implements // stack.MulticastForwardingNetworkProtocol. 
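//
// The returned timestamp is the one most recently recorded for the installed
// route matching the given (source, destination) pair; it is refreshed each
// time the route successfully forwards a packet (see
// forwardValidatedMulticastPacket). ErrHostUnreachable is returned when no
// such route is installed.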
func (p *protocol) MulticastRouteLastUsedTime(addresses stack.UnicastSourceAndMulticastDestination) (tcpip.MonotonicTime, tcpip.Error) { if err := validateUnicastSourceAndMulticastDestination(addresses); err != nil { return tcpip.MonotonicTime{}, err } timestamp, found := p.multicastRouteTable.GetLastUsedTimestamp(addresses) if !found { return tcpip.MonotonicTime{}, &tcpip.ErrHostUnreachable{} } return timestamp, nil } // EnableMulticastForwarding implements // stack.MulticastForwardingNetworkProtocol.EnableMulticastForwarding. func (p *protocol) EnableMulticastForwarding(disp stack.MulticastForwardingEventDispatcher) (bool, tcpip.Error) { p.mu.Lock() defer p.mu.Unlock() if p.mu.multicastForwardingDisp != nil { return true, nil } if disp == nil { return false, &tcpip.ErrInvalidOptionValue{} } p.mu.multicastForwardingDisp = disp return false, nil } // DisableMulticastForwarding implements // stack.MulticastForwardingNetworkProtocol.DisableMulticastForwarding. func (p *protocol) DisableMulticastForwarding() { p.mu.Lock() defer p.mu.Unlock() p.mu.multicastForwardingDisp = nil p.multicastRouteTable.RemoveAllInstalledRoutes() } func (p *protocol) forwardPendingMulticastPacket(pkt *stack.PacketBuffer, installedRoute *multicast.InstalledRoute) { defer pkt.DecRef() // Attempt to forward the packet using the endpoint that it originally // arrived on. This ensures that the packet is only forwarded if it // matches the route's expected input interface (see 5a of RFC 1812 section // 5.2.1.3). ep, ok := p.getEndpointForNIC(pkt.NICID) if !ok { // The endpoint that the packet arrived on no longer exists. Silently // drop the pkt. return } if !ep.MulticastForwarding() { return } ep.handleForwardingError(ep.forwardValidatedMulticastPacket(pkt, installedRoute)) } // Wait implements stack.TransportProtocol. func (*protocol) Wait() {} // parseAndValidate parses the packet (including its transport layer header) and // returns a view containing the parsed IP header. The caller is responsible // for releasing the returned View. // // Returns true if the IP header was successfully parsed. func (p *protocol) parseAndValidate(pkt *stack.PacketBuffer) (*buffer.View, bool) { transProtoNum, hasTransportHdr, ok := p.Parse(pkt) if !ok { return nil, false } h := header.IPv6(pkt.NetworkHeader().Slice()) // Do not include the link header's size when calculating the size of the IP // packet. if !h.IsValid(pkt.Size() - len(pkt.LinkHeader().Slice())) { return nil, false } if hasTransportHdr { p.parseTransport(pkt, transProtoNum) } return pkt.NetworkHeader().View(), true } func (p *protocol) parseTransport(pkt *stack.PacketBuffer, transProtoNum tcpip.TransportProtocolNumber) { if transProtoNum == header.ICMPv6ProtocolNumber { // The transport layer will handle transport layer parsing errors. _ = parse.ICMPv6(pkt) return } switch err := p.stack.ParsePacketBufferTransport(transProtoNum, pkt); err { case stack.ParsedOK: case stack.UnknownTransportProtocol, stack.TransportLayerParseError: // The transport layer will handle unknown protocols and transport layer // parsing errors. default: panic(fmt.Sprintf("unexpected error parsing transport header = %d", err)) } } // Parse implements stack.NetworkProtocol. 
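//
// hasTransportHdr is only true for unfragmented packets (fragment offset of
// zero and no more-fragments flag), since a fragment does not necessarily
// carry a complete transport header.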
func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { proto, _, fragOffset, fragMore, ok := parse.IPv6(pkt) if !ok { return 0, false, false } return proto, !fragMore && fragOffset == 0, true } // allowICMPReply reports whether an ICMP reply with provided type may // be sent following the rate mask options and global ICMP rate limiter. func (p *protocol) allowICMPReply(icmpType header.ICMPv6Type) bool { p.mu.RLock() defer p.mu.RUnlock() if _, ok := p.mu.icmpRateLimitedTypes[icmpType]; ok { return p.stack.AllowICMPMessage() } return true } // SendRejectionError implements stack.RejectIPv6WithHandler. func (p *protocol) SendRejectionError(pkt *stack.PacketBuffer, rejectWith stack.RejectIPv6WithICMPType, inputHook bool) tcpip.Error { switch rejectWith { case stack.RejectIPv6WithICMPNoRoute: return p.returnError(&icmpReasonNetUnreachable{}, pkt, inputHook) case stack.RejectIPv6WithICMPAddrUnreachable: return p.returnError(&icmpReasonHostUnreachable{}, pkt, inputHook) case stack.RejectIPv6WithICMPPortUnreachable: return p.returnError(&icmpReasonPortUnreachable{}, pkt, inputHook) case stack.RejectIPv6WithICMPAdminProhibited: return p.returnError(&icmpReasonAdministrativelyProhibited{}, pkt, inputHook) default: panic(fmt.Sprintf("unhandled %[1]T = %[1]d", rejectWith)) } } // calculateNetworkMTU calculates the network-layer payload MTU based on the // link-layer payload MTU and the length of every IPv6 header. // Note that this is different than the Payload Length field of the IPv6 header, // which includes the length of the extension headers. func calculateNetworkMTU(linkMTU, networkHeadersLen uint32) (uint32, tcpip.Error) { if linkMTU < header.IPv6MinimumMTU { return 0, &tcpip.ErrInvalidEndpointState{} } // As per RFC 7112 section 5, we should discard packets if their IPv6 header // is bigger than 1280 bytes (ie, the minimum link MTU) since we do not // support PMTU discovery: // Hosts that do not discover the Path MTU MUST limit the IPv6 Header Chain // length to 1280 bytes. Limiting the IPv6 Header Chain length to 1280 // bytes ensures that the header chain length does not exceed the IPv6 // minimum MTU. if networkHeadersLen > header.IPv6MinimumMTU { return 0, &tcpip.ErrMalformedHeader{} } networkMTU := linkMTU - networkHeadersLen if networkMTU > maxPayloadSize { networkMTU = maxPayloadSize } return networkMTU, nil } // Options holds options to configure a new protocol. // // +stateify savable type Options struct { // NDPConfigs is the default NDP configurations used by interfaces. NDPConfigs NDPConfigurations // AutoGenLinkLocal determines whether or not the stack attempts to // auto-generate a link-local address for newly enabled non-loopback // NICs. // // Note, setting this to true does not mean that a link-local address is // assigned right away, or at all. If Duplicate Address Detection is enabled, // an address is only assigned if it successfully resolves. If it fails, no // further attempts are made to auto-generate a link-local address. // // The generated link-local address follows RFC 4291 Appendix A guidelines. AutoGenLinkLocal bool // NDPDisp is the NDP event dispatcher that an integrator can provide to // receive NDP related events. NDPDisp NDPDispatcher // OpaqueIIDOpts hold the options for generating opaque interface // identifiers (IIDs) as outlined by RFC 7217. 
OpaqueIIDOpts OpaqueInterfaceIdentifierOptions // TempIIDSeed is used to seed the initial temporary interface identifier // history value used to generate IIDs for temporary SLAAC addresses. // // Temporary SLAAC addresses are short-lived addresses which are unpredictable // and random from the perspective of other nodes on the network. It is // recommended that the seed be a random byte buffer of at least // header.IIDSize bytes to make sure that temporary SLAAC addresses are // sufficiently random. It should follow minimum randomness requirements for // security as outlined by RFC 4086. // // Note: using a nil value, the same seed across netstack program runs, or a // seed that is too small would reduce randomness and increase predictability, // defeating the purpose of temporary SLAAC addresses. TempIIDSeed []byte // MLD holds options for MLD. MLD MLDOptions // DADConfigs holds the default DAD configurations used by IPv6 endpoints. DADConfigs stack.DADConfigurations // AllowExternalLoopbackTraffic indicates that inbound loopback packets (i.e. // martian loopback packets) should be accepted. AllowExternalLoopbackTraffic bool } // NewProtocolWithOptions returns an IPv6 network protocol. func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory { opts.NDPConfigs.validate() return func(s *stack.Stack) stack.NetworkProtocol { p := &protocol{ stack: s, options: opts, } p.fragmentation = fragmentation.NewFragmentation(header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p) p.mu.eps = make(map[tcpip.NICID]*endpoint) p.SetDefaultTTL(DefaultTTL) // Set default ICMP rate limiting to Linux defaults. // // Default: 0-1,3-127 (rate limit ICMPv6 errors except Packet Too Big) // See https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt. defaultIcmpTypes := make(map[header.ICMPv6Type]struct{}) for i := header.ICMPv6Type(0); i < header.ICMPv6EchoRequest; i++ { switch i { case header.ICMPv6PacketTooBig: // Do not rate limit packet too big by default. default: defaultIcmpTypes[i] = struct{}{} } } p.mu.icmpRateLimitedTypes = defaultIcmpTypes if err := p.multicastRouteTable.Init(multicast.DefaultConfig(s.Clock())); err != nil { panic(fmt.Sprintf("p.multicastRouteTable.Init(_): %s", err)) } return p } } // NewProtocol is equivalent to NewProtocolWithOptions with an empty Options. func NewProtocol(s *stack.Stack) stack.NetworkProtocol { return NewProtocolWithOptions(Options{})(s) } func calculateFragmentReserve(pkt *stack.PacketBuffer) int { return pkt.AvailableHeaderBytes() + len(pkt.NetworkHeader().Slice()) + header.IPv6FragmentHeaderSize } // getFragmentID returns a random uint32 number (other than zero) to be used as // fragment ID in the IPv6 header. 
func (e *endpoint) getFragmentID() uint32 { rng := e.protocol.stack.SecureRNG() id := rng.Uint32() for id == 0 { id = rng.Uint32() } return id } func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeaders header.IPv6, transportProto tcpip.TransportProtocolNumber, id uint32) (*stack.PacketBuffer, bool) { fragPkt, offset, copied, more := pf.BuildNextFragment() fragPkt.NetworkProtocolNumber = ProtocolNumber originalIPHeadersLength := len(originalIPHeaders) s := header.IPv6ExtHdrSerializer{&header.IPv6SerializableFragmentExtHdr{ FragmentOffset: uint16(offset / header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit), M: more, Identification: id, }} fragmentIPHeadersLength := originalIPHeadersLength + s.Length() fragmentIPHeaders := header.IPv6(fragPkt.NetworkHeader().Push(fragmentIPHeadersLength)) // Copy the IPv6 header and any extension headers already populated. if copied := copy(fragmentIPHeaders, originalIPHeaders); copied != originalIPHeadersLength { panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got %d, want %d", copied, originalIPHeadersLength)) } nextHeader, _ := s.Serialize(transportProto, fragmentIPHeaders[originalIPHeadersLength:]) fragmentIPHeaders.SetNextHeader(nextHeader) fragmentIPHeaders.SetPayloadLength(uint16(copied + fragmentIPHeadersLength - header.IPv6MinimumSize)) return fragPkt, more } func checkV4Mapped(h header.IPv6, stats ip.MultiCounterIPStats) bool { // Disallow IPv4-mapped addresses per RFC 6890 section 2.2.3. ret := true if header.IsV4MappedAddress(h.SourceAddress()) { stats.InvalidSourceAddressesReceived.Increment() ret = false } if header.IsV4MappedAddress(h.DestinationAddress()) { stats.InvalidDestinationAddressesReceived.Increment() ret = false } return ret } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv6/ipv6_state_autogen.go000066400000000000000000000725101465435605700272620ustar00rootroot00000000000000// automatically generated by stateify. 
package ipv6 import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (i *icmpv6DestinationUnreachableSockError) StateTypeName() string { return "pkg/tcpip/network/ipv6.icmpv6DestinationUnreachableSockError" } func (i *icmpv6DestinationUnreachableSockError) StateFields() []string { return []string{} } func (i *icmpv6DestinationUnreachableSockError) beforeSave() {} // +checklocksignore func (i *icmpv6DestinationUnreachableSockError) StateSave(stateSinkObject state.Sink) { i.beforeSave() } func (i *icmpv6DestinationUnreachableSockError) afterLoad(context.Context) {} // +checklocksignore func (i *icmpv6DestinationUnreachableSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (i *icmpv6DestinationNetworkUnreachableSockError) StateTypeName() string { return "pkg/tcpip/network/ipv6.icmpv6DestinationNetworkUnreachableSockError" } func (i *icmpv6DestinationNetworkUnreachableSockError) StateFields() []string { return []string{ "icmpv6DestinationUnreachableSockError", } } func (i *icmpv6DestinationNetworkUnreachableSockError) beforeSave() {} // +checklocksignore func (i *icmpv6DestinationNetworkUnreachableSockError) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.icmpv6DestinationUnreachableSockError) } func (i *icmpv6DestinationNetworkUnreachableSockError) afterLoad(context.Context) {} // +checklocksignore func (i *icmpv6DestinationNetworkUnreachableSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.icmpv6DestinationUnreachableSockError) } func (i *icmpv6DestinationPortUnreachableSockError) StateTypeName() string { return "pkg/tcpip/network/ipv6.icmpv6DestinationPortUnreachableSockError" } func (i *icmpv6DestinationPortUnreachableSockError) StateFields() []string { return []string{ "icmpv6DestinationUnreachableSockError", } } func (i *icmpv6DestinationPortUnreachableSockError) beforeSave() {} // +checklocksignore func (i *icmpv6DestinationPortUnreachableSockError) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.icmpv6DestinationUnreachableSockError) } func (i *icmpv6DestinationPortUnreachableSockError) afterLoad(context.Context) {} // +checklocksignore func (i *icmpv6DestinationPortUnreachableSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.icmpv6DestinationUnreachableSockError) } func (i *icmpv6DestinationAddressUnreachableSockError) StateTypeName() string { return "pkg/tcpip/network/ipv6.icmpv6DestinationAddressUnreachableSockError" } func (i *icmpv6DestinationAddressUnreachableSockError) StateFields() []string { return []string{ "icmpv6DestinationUnreachableSockError", } } func (i *icmpv6DestinationAddressUnreachableSockError) beforeSave() {} // +checklocksignore func (i *icmpv6DestinationAddressUnreachableSockError) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.icmpv6DestinationUnreachableSockError) } func (i *icmpv6DestinationAddressUnreachableSockError) afterLoad(context.Context) {} // +checklocksignore func (i *icmpv6DestinationAddressUnreachableSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.icmpv6DestinationUnreachableSockError) } func (e *icmpv6PacketTooBigSockError) StateTypeName() string { return "pkg/tcpip/network/ipv6.icmpv6PacketTooBigSockError" } func (e *icmpv6PacketTooBigSockError) StateFields() []string { return []string{ "mtu", } } func (e *icmpv6PacketTooBigSockError) beforeSave() 
{} // +checklocksignore func (e *icmpv6PacketTooBigSockError) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.mtu) } func (e *icmpv6PacketTooBigSockError) afterLoad(context.Context) {} // +checklocksignore func (e *icmpv6PacketTooBigSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.mtu) } func (e *endpointMu) StateTypeName() string { return "pkg/tcpip/network/ipv6.endpointMu" } func (e *endpointMu) StateFields() []string { return []string{ "addressableEndpointState", "ndp", "mld", } } func (e *endpointMu) beforeSave() {} // +checklocksignore func (e *endpointMu) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.addressableEndpointState) stateSinkObject.Save(1, &e.ndp) stateSinkObject.Save(2, &e.mld) } func (e *endpointMu) afterLoad(context.Context) {} // +checklocksignore func (e *endpointMu) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.addressableEndpointState) stateSourceObject.Load(1, &e.ndp) stateSourceObject.Load(2, &e.mld) } func (d *dadMu) StateTypeName() string { return "pkg/tcpip/network/ipv6.dadMu" } func (d *dadMu) StateFields() []string { return []string{ "dad", } } func (d *dadMu) beforeSave() {} // +checklocksignore func (d *dadMu) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.dad) } func (d *dadMu) afterLoad(context.Context) {} // +checklocksignore func (d *dadMu) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.dad) } func (e *endpointDAD) StateTypeName() string { return "pkg/tcpip/network/ipv6.endpointDAD" } func (e *endpointDAD) StateFields() []string { return []string{ "mu", } } func (e *endpointDAD) beforeSave() {} // +checklocksignore func (e *endpointDAD) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.mu) } func (e *endpointDAD) afterLoad(context.Context) {} // +checklocksignore func (e *endpointDAD) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.mu) } func (e *endpoint) StateTypeName() string { return "pkg/tcpip/network/ipv6.endpoint" } func (e *endpoint) StateFields() []string { return []string{ "nic", "dispatcher", "protocol", "stats", "enabled", "forwarding", "multicastForwarding", "mu", "dad", } } func (e *endpoint) beforeSave() {} // +checklocksignore func (e *endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.nic) stateSinkObject.Save(1, &e.dispatcher) stateSinkObject.Save(2, &e.protocol) stateSinkObject.Save(3, &e.stats) stateSinkObject.Save(4, &e.enabled) stateSinkObject.Save(5, &e.forwarding) stateSinkObject.Save(6, &e.multicastForwarding) stateSinkObject.Save(7, &e.mu) stateSinkObject.Save(8, &e.dad) } func (e *endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.nic) stateSourceObject.Load(1, &e.dispatcher) stateSourceObject.Load(2, &e.protocol) stateSourceObject.Load(3, &e.stats) stateSourceObject.Load(4, &e.enabled) stateSourceObject.Load(5, &e.forwarding) stateSourceObject.Load(6, &e.multicastForwarding) stateSourceObject.Load(7, &e.mu) stateSourceObject.Load(8, &e.dad) } func (o *OpaqueInterfaceIdentifierOptions) StateTypeName() string { return "pkg/tcpip/network/ipv6.OpaqueInterfaceIdentifierOptions" } func (o *OpaqueInterfaceIdentifierOptions) StateFields() 
[]string { return []string{ "SecretKey", } } func (o *OpaqueInterfaceIdentifierOptions) beforeSave() {} // +checklocksignore func (o *OpaqueInterfaceIdentifierOptions) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.SecretKey) } func (o *OpaqueInterfaceIdentifierOptions) afterLoad(context.Context) {} // +checklocksignore func (o *OpaqueInterfaceIdentifierOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.SecretKey) } func (p *protocolMu) StateTypeName() string { return "pkg/tcpip/network/ipv6.protocolMu" } func (p *protocolMu) StateFields() []string { return []string{ "eps", "icmpRateLimitedTypes", "multicastForwardingDisp", } } func (p *protocolMu) beforeSave() {} // +checklocksignore func (p *protocolMu) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.eps) stateSinkObject.Save(1, &p.icmpRateLimitedTypes) stateSinkObject.Save(2, &p.multicastForwardingDisp) } func (p *protocolMu) afterLoad(context.Context) {} // +checklocksignore func (p *protocolMu) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.eps) stateSourceObject.Load(1, &p.icmpRateLimitedTypes) stateSourceObject.Load(2, &p.multicastForwardingDisp) } func (p *protocol) StateTypeName() string { return "pkg/tcpip/network/ipv6.protocol" } func (p *protocol) StateFields() []string { return []string{ "stack", "options", "mu", "defaultTTL", "fragmentation", "icmpRateLimiter", "multicastRouteTable", } } func (p *protocol) beforeSave() {} // +checklocksignore func (p *protocol) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.stack) stateSinkObject.Save(1, &p.options) stateSinkObject.Save(2, &p.mu) stateSinkObject.Save(3, &p.defaultTTL) stateSinkObject.Save(4, &p.fragmentation) stateSinkObject.Save(5, &p.icmpRateLimiter) stateSinkObject.Save(6, &p.multicastRouteTable) } func (p *protocol) afterLoad(context.Context) {} // +checklocksignore func (p *protocol) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.stack) stateSourceObject.Load(1, &p.options) stateSourceObject.Load(2, &p.mu) stateSourceObject.Load(3, &p.defaultTTL) stateSourceObject.Load(4, &p.fragmentation) stateSourceObject.Load(5, &p.icmpRateLimiter) stateSourceObject.Load(6, &p.multicastRouteTable) } func (o *Options) StateTypeName() string { return "pkg/tcpip/network/ipv6.Options" } func (o *Options) StateFields() []string { return []string{ "NDPConfigs", "AutoGenLinkLocal", "NDPDisp", "OpaqueIIDOpts", "TempIIDSeed", "MLD", "DADConfigs", "AllowExternalLoopbackTraffic", } } func (o *Options) beforeSave() {} // +checklocksignore func (o *Options) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.NDPConfigs) stateSinkObject.Save(1, &o.AutoGenLinkLocal) stateSinkObject.Save(2, &o.NDPDisp) stateSinkObject.Save(3, &o.OpaqueIIDOpts) stateSinkObject.Save(4, &o.TempIIDSeed) stateSinkObject.Save(5, &o.MLD) stateSinkObject.Save(6, &o.DADConfigs) stateSinkObject.Save(7, &o.AllowExternalLoopbackTraffic) } func (o *Options) afterLoad(context.Context) {} // +checklocksignore func (o *Options) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.NDPConfigs) stateSourceObject.Load(1, &o.AutoGenLinkLocal) stateSourceObject.Load(2, &o.NDPDisp) stateSourceObject.Load(3, &o.OpaqueIIDOpts) stateSourceObject.Load(4, &o.TempIIDSeed) stateSourceObject.Load(5, &o.MLD) stateSourceObject.Load(6, 
&o.DADConfigs) stateSourceObject.Load(7, &o.AllowExternalLoopbackTraffic) } func (m *MLDOptions) StateTypeName() string { return "pkg/tcpip/network/ipv6.MLDOptions" } func (m *MLDOptions) StateFields() []string { return []string{ "Enabled", } } func (m *MLDOptions) beforeSave() {} // +checklocksignore func (m *MLDOptions) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.Enabled) } func (m *MLDOptions) afterLoad(context.Context) {} // +checklocksignore func (m *MLDOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.Enabled) } func (mld *mldState) StateTypeName() string { return "pkg/tcpip/network/ipv6.mldState" } func (mld *mldState) StateFields() []string { return []string{ "ep", "genericMulticastProtocol", } } func (mld *mldState) beforeSave() {} // +checklocksignore func (mld *mldState) StateSave(stateSinkObject state.Sink) { mld.beforeSave() stateSinkObject.Save(0, &mld.ep) stateSinkObject.Save(1, &mld.genericMulticastProtocol) } func (mld *mldState) afterLoad(context.Context) {} // +checklocksignore func (mld *mldState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mld.ep) stateSourceObject.Load(1, &mld.genericMulticastProtocol) } func (c *NDPConfigurations) StateTypeName() string { return "pkg/tcpip/network/ipv6.NDPConfigurations" } func (c *NDPConfigurations) StateFields() []string { return []string{ "MaxRtrSolicitations", "RtrSolicitationInterval", "MaxRtrSolicitationDelay", "HandleRAs", "DiscoverDefaultRouters", "DiscoverMoreSpecificRoutes", "DiscoverOnLinkPrefixes", "AutoGenGlobalAddresses", "AutoGenAddressConflictRetries", "AutoGenTempGlobalAddresses", "MaxTempAddrValidLifetime", "MaxTempAddrPreferredLifetime", "RegenAdvanceDuration", } } func (c *NDPConfigurations) beforeSave() {} // +checklocksignore func (c *NDPConfigurations) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.MaxRtrSolicitations) stateSinkObject.Save(1, &c.RtrSolicitationInterval) stateSinkObject.Save(2, &c.MaxRtrSolicitationDelay) stateSinkObject.Save(3, &c.HandleRAs) stateSinkObject.Save(4, &c.DiscoverDefaultRouters) stateSinkObject.Save(5, &c.DiscoverMoreSpecificRoutes) stateSinkObject.Save(6, &c.DiscoverOnLinkPrefixes) stateSinkObject.Save(7, &c.AutoGenGlobalAddresses) stateSinkObject.Save(8, &c.AutoGenAddressConflictRetries) stateSinkObject.Save(9, &c.AutoGenTempGlobalAddresses) stateSinkObject.Save(10, &c.MaxTempAddrValidLifetime) stateSinkObject.Save(11, &c.MaxTempAddrPreferredLifetime) stateSinkObject.Save(12, &c.RegenAdvanceDuration) } func (c *NDPConfigurations) afterLoad(context.Context) {} // +checklocksignore func (c *NDPConfigurations) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.MaxRtrSolicitations) stateSourceObject.Load(1, &c.RtrSolicitationInterval) stateSourceObject.Load(2, &c.MaxRtrSolicitationDelay) stateSourceObject.Load(3, &c.HandleRAs) stateSourceObject.Load(4, &c.DiscoverDefaultRouters) stateSourceObject.Load(5, &c.DiscoverMoreSpecificRoutes) stateSourceObject.Load(6, &c.DiscoverOnLinkPrefixes) stateSourceObject.Load(7, &c.AutoGenGlobalAddresses) stateSourceObject.Load(8, &c.AutoGenAddressConflictRetries) stateSourceObject.Load(9, &c.AutoGenTempGlobalAddresses) stateSourceObject.Load(10, &c.MaxTempAddrValidLifetime) stateSourceObject.Load(11, &c.MaxTempAddrPreferredLifetime) stateSourceObject.Load(12, &c.RegenAdvanceDuration) } func (t *timer) StateTypeName() string { return 
"pkg/tcpip/network/ipv6.timer" } func (t *timer) StateFields() []string { return []string{ "done", "timer", } } func (t *timer) beforeSave() {} // +checklocksignore func (t *timer) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.done) stateSinkObject.Save(1, &t.timer) } func (t *timer) afterLoad(context.Context) {} // +checklocksignore func (t *timer) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.done) stateSourceObject.Load(1, &t.timer) } func (o *offLinkRoute) StateTypeName() string { return "pkg/tcpip/network/ipv6.offLinkRoute" } func (o *offLinkRoute) StateFields() []string { return []string{ "dest", "router", } } func (o *offLinkRoute) beforeSave() {} // +checklocksignore func (o *offLinkRoute) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.dest) stateSinkObject.Save(1, &o.router) } func (o *offLinkRoute) afterLoad(context.Context) {} // +checklocksignore func (o *offLinkRoute) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.dest) stateSourceObject.Load(1, &o.router) } func (ndp *ndpState) StateTypeName() string { return "pkg/tcpip/network/ipv6.ndpState" } func (ndp *ndpState) StateFields() []string { return []string{ "ep", "configs", "dad", "offLinkRoutes", "rtrSolicitTimer", "onLinkPrefixes", "slaacPrefixes", "dhcpv6Configuration", "temporaryIIDHistory", "temporaryAddressDesyncFactor", } } func (ndp *ndpState) beforeSave() {} // +checklocksignore func (ndp *ndpState) StateSave(stateSinkObject state.Sink) { ndp.beforeSave() stateSinkObject.Save(0, &ndp.ep) stateSinkObject.Save(1, &ndp.configs) stateSinkObject.Save(2, &ndp.dad) stateSinkObject.Save(3, &ndp.offLinkRoutes) stateSinkObject.Save(4, &ndp.rtrSolicitTimer) stateSinkObject.Save(5, &ndp.onLinkPrefixes) stateSinkObject.Save(6, &ndp.slaacPrefixes) stateSinkObject.Save(7, &ndp.dhcpv6Configuration) stateSinkObject.Save(8, &ndp.temporaryIIDHistory) stateSinkObject.Save(9, &ndp.temporaryAddressDesyncFactor) } func (ndp *ndpState) afterLoad(context.Context) {} // +checklocksignore func (ndp *ndpState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ndp.ep) stateSourceObject.Load(1, &ndp.configs) stateSourceObject.Load(2, &ndp.dad) stateSourceObject.Load(3, &ndp.offLinkRoutes) stateSourceObject.Load(4, &ndp.rtrSolicitTimer) stateSourceObject.Load(5, &ndp.onLinkPrefixes) stateSourceObject.Load(6, &ndp.slaacPrefixes) stateSourceObject.Load(7, &ndp.dhcpv6Configuration) stateSourceObject.Load(8, &ndp.temporaryIIDHistory) stateSourceObject.Load(9, &ndp.temporaryAddressDesyncFactor) } func (o *offLinkRouteState) StateTypeName() string { return "pkg/tcpip/network/ipv6.offLinkRouteState" } func (o *offLinkRouteState) StateFields() []string { return []string{ "prf", "invalidationJob", } } func (o *offLinkRouteState) beforeSave() {} // +checklocksignore func (o *offLinkRouteState) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.prf) stateSinkObject.Save(1, &o.invalidationJob) } func (o *offLinkRouteState) afterLoad(context.Context) {} // +checklocksignore func (o *offLinkRouteState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.prf) stateSourceObject.Load(1, &o.invalidationJob) } func (o *onLinkPrefixState) StateTypeName() string { return "pkg/tcpip/network/ipv6.onLinkPrefixState" } func (o *onLinkPrefixState) StateFields() []string { return []string{ 
"invalidationJob", } } func (o *onLinkPrefixState) beforeSave() {} // +checklocksignore func (o *onLinkPrefixState) StateSave(stateSinkObject state.Sink) { o.beforeSave() stateSinkObject.Save(0, &o.invalidationJob) } func (o *onLinkPrefixState) afterLoad(context.Context) {} // +checklocksignore func (o *onLinkPrefixState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &o.invalidationJob) } func (t *tempSLAACAddrState) StateTypeName() string { return "pkg/tcpip/network/ipv6.tempSLAACAddrState" } func (t *tempSLAACAddrState) StateFields() []string { return []string{ "deprecationJob", "invalidationJob", "regenJob", "createdAt", "addressEndpoint", "regenerated", } } func (t *tempSLAACAddrState) beforeSave() {} // +checklocksignore func (t *tempSLAACAddrState) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.deprecationJob) stateSinkObject.Save(1, &t.invalidationJob) stateSinkObject.Save(2, &t.regenJob) stateSinkObject.Save(3, &t.createdAt) stateSinkObject.Save(4, &t.addressEndpoint) stateSinkObject.Save(5, &t.regenerated) } func (t *tempSLAACAddrState) afterLoad(context.Context) {} // +checklocksignore func (t *tempSLAACAddrState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.deprecationJob) stateSourceObject.Load(1, &t.invalidationJob) stateSourceObject.Load(2, &t.regenJob) stateSourceObject.Load(3, &t.createdAt) stateSourceObject.Load(4, &t.addressEndpoint) stateSourceObject.Load(5, &t.regenerated) } func (s *stableAddrState) StateTypeName() string { return "pkg/tcpip/network/ipv6.stableAddrState" } func (s *stableAddrState) StateFields() []string { return []string{ "addressEndpoint", "localGenerationFailures", } } func (s *stableAddrState) beforeSave() {} // +checklocksignore func (s *stableAddrState) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.addressEndpoint) stateSinkObject.Save(1, &s.localGenerationFailures) } func (s *stableAddrState) afterLoad(context.Context) {} // +checklocksignore func (s *stableAddrState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.addressEndpoint) stateSourceObject.Load(1, &s.localGenerationFailures) } func (s *slaacPrefixState) StateTypeName() string { return "pkg/tcpip/network/ipv6.slaacPrefixState" } func (s *slaacPrefixState) StateFields() []string { return []string{ "deprecationJob", "invalidationJob", "validUntil", "preferredUntil", "stableAddr", "tempAddrs", "generationAttempts", "maxGenerationAttempts", } } func (s *slaacPrefixState) beforeSave() {} // +checklocksignore func (s *slaacPrefixState) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.deprecationJob) stateSinkObject.Save(1, &s.invalidationJob) stateSinkObject.Save(2, &s.validUntil) stateSinkObject.Save(3, &s.preferredUntil) stateSinkObject.Save(4, &s.stableAddr) stateSinkObject.Save(5, &s.tempAddrs) stateSinkObject.Save(6, &s.generationAttempts) stateSinkObject.Save(7, &s.maxGenerationAttempts) } func (s *slaacPrefixState) afterLoad(context.Context) {} // +checklocksignore func (s *slaacPrefixState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.deprecationJob) stateSourceObject.Load(1, &s.invalidationJob) stateSourceObject.Load(2, &s.validUntil) stateSourceObject.Load(3, &s.preferredUntil) stateSourceObject.Load(4, &s.stableAddr) stateSourceObject.Load(5, &s.tempAddrs) stateSourceObject.Load(6, 
&s.generationAttempts) stateSourceObject.Load(7, &s.maxGenerationAttempts) } func (s *Stats) StateTypeName() string { return "pkg/tcpip/network/ipv6.Stats" } func (s *Stats) StateFields() []string { return []string{ "IP", "ICMP", "UnhandledRouterAdvertisements", } } func (s *Stats) beforeSave() {} // +checklocksignore func (s *Stats) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.IP) stateSinkObject.Save(1, &s.ICMP) stateSinkObject.Save(2, &s.UnhandledRouterAdvertisements) } func (s *Stats) afterLoad(context.Context) {} // +checklocksignore func (s *Stats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.IP) stateSourceObject.Load(1, &s.ICMP) stateSourceObject.Load(2, &s.UnhandledRouterAdvertisements) } func (s *sharedStats) StateTypeName() string { return "pkg/tcpip/network/ipv6.sharedStats" } func (s *sharedStats) StateFields() []string { return []string{ "localStats", "ip", "icmp", } } func (s *sharedStats) beforeSave() {} // +checklocksignore func (s *sharedStats) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.localStats) stateSinkObject.Save(1, &s.ip) stateSinkObject.Save(2, &s.icmp) } func (s *sharedStats) afterLoad(context.Context) {} // +checklocksignore func (s *sharedStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.localStats) stateSourceObject.Load(1, &s.ip) stateSourceObject.Load(2, &s.icmp) } func (m *multiCounterICMPv6PacketStats) StateTypeName() string { return "pkg/tcpip/network/ipv6.multiCounterICMPv6PacketStats" } func (m *multiCounterICMPv6PacketStats) StateFields() []string { return []string{ "echoRequest", "echoReply", "dstUnreachable", "packetTooBig", "timeExceeded", "paramProblem", "routerSolicit", "routerAdvert", "neighborSolicit", "neighborAdvert", "redirectMsg", "multicastListenerQuery", "multicastListenerReport", "multicastListenerReportV2", "multicastListenerDone", } } func (m *multiCounterICMPv6PacketStats) beforeSave() {} // +checklocksignore func (m *multiCounterICMPv6PacketStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.echoRequest) stateSinkObject.Save(1, &m.echoReply) stateSinkObject.Save(2, &m.dstUnreachable) stateSinkObject.Save(3, &m.packetTooBig) stateSinkObject.Save(4, &m.timeExceeded) stateSinkObject.Save(5, &m.paramProblem) stateSinkObject.Save(6, &m.routerSolicit) stateSinkObject.Save(7, &m.routerAdvert) stateSinkObject.Save(8, &m.neighborSolicit) stateSinkObject.Save(9, &m.neighborAdvert) stateSinkObject.Save(10, &m.redirectMsg) stateSinkObject.Save(11, &m.multicastListenerQuery) stateSinkObject.Save(12, &m.multicastListenerReport) stateSinkObject.Save(13, &m.multicastListenerReportV2) stateSinkObject.Save(14, &m.multicastListenerDone) } func (m *multiCounterICMPv6PacketStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterICMPv6PacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.echoRequest) stateSourceObject.Load(1, &m.echoReply) stateSourceObject.Load(2, &m.dstUnreachable) stateSourceObject.Load(3, &m.packetTooBig) stateSourceObject.Load(4, &m.timeExceeded) stateSourceObject.Load(5, &m.paramProblem) stateSourceObject.Load(6, &m.routerSolicit) stateSourceObject.Load(7, &m.routerAdvert) stateSourceObject.Load(8, &m.neighborSolicit) stateSourceObject.Load(9, &m.neighborAdvert) stateSourceObject.Load(10, &m.redirectMsg) stateSourceObject.Load(11, 
&m.multicastListenerQuery) stateSourceObject.Load(12, &m.multicastListenerReport) stateSourceObject.Load(13, &m.multicastListenerReportV2) stateSourceObject.Load(14, &m.multicastListenerDone) } func (m *multiCounterICMPv6SentPacketStats) StateTypeName() string { return "pkg/tcpip/network/ipv6.multiCounterICMPv6SentPacketStats" } func (m *multiCounterICMPv6SentPacketStats) StateFields() []string { return []string{ "multiCounterICMPv6PacketStats", "dropped", "rateLimited", } } func (m *multiCounterICMPv6SentPacketStats) beforeSave() {} // +checklocksignore func (m *multiCounterICMPv6SentPacketStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.multiCounterICMPv6PacketStats) stateSinkObject.Save(1, &m.dropped) stateSinkObject.Save(2, &m.rateLimited) } func (m *multiCounterICMPv6SentPacketStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterICMPv6SentPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.multiCounterICMPv6PacketStats) stateSourceObject.Load(1, &m.dropped) stateSourceObject.Load(2, &m.rateLimited) } func (m *multiCounterICMPv6ReceivedPacketStats) StateTypeName() string { return "pkg/tcpip/network/ipv6.multiCounterICMPv6ReceivedPacketStats" } func (m *multiCounterICMPv6ReceivedPacketStats) StateFields() []string { return []string{ "multiCounterICMPv6PacketStats", "unrecognized", "invalid", "routerOnlyPacketsDroppedByHost", } } func (m *multiCounterICMPv6ReceivedPacketStats) beforeSave() {} // +checklocksignore func (m *multiCounterICMPv6ReceivedPacketStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.multiCounterICMPv6PacketStats) stateSinkObject.Save(1, &m.unrecognized) stateSinkObject.Save(2, &m.invalid) stateSinkObject.Save(3, &m.routerOnlyPacketsDroppedByHost) } func (m *multiCounterICMPv6ReceivedPacketStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterICMPv6ReceivedPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.multiCounterICMPv6PacketStats) stateSourceObject.Load(1, &m.unrecognized) stateSourceObject.Load(2, &m.invalid) stateSourceObject.Load(3, &m.routerOnlyPacketsDroppedByHost) } func (m *multiCounterICMPv6Stats) StateTypeName() string { return "pkg/tcpip/network/ipv6.multiCounterICMPv6Stats" } func (m *multiCounterICMPv6Stats) StateFields() []string { return []string{ "packetsSent", "packetsReceived", } } func (m *multiCounterICMPv6Stats) beforeSave() {} // +checklocksignore func (m *multiCounterICMPv6Stats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.packetsSent) stateSinkObject.Save(1, &m.packetsReceived) } func (m *multiCounterICMPv6Stats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterICMPv6Stats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.packetsSent) stateSourceObject.Load(1, &m.packetsReceived) } func init() { state.Register((*icmpv6DestinationUnreachableSockError)(nil)) state.Register((*icmpv6DestinationNetworkUnreachableSockError)(nil)) state.Register((*icmpv6DestinationPortUnreachableSockError)(nil)) state.Register((*icmpv6DestinationAddressUnreachableSockError)(nil)) state.Register((*icmpv6PacketTooBigSockError)(nil)) state.Register((*endpointMu)(nil)) state.Register((*dadMu)(nil)) state.Register((*endpointDAD)(nil)) state.Register((*endpoint)(nil)) state.Register((*OpaqueInterfaceIdentifierOptions)(nil)) 
state.Register((*protocolMu)(nil)) state.Register((*protocol)(nil)) state.Register((*Options)(nil)) state.Register((*MLDOptions)(nil)) state.Register((*mldState)(nil)) state.Register((*NDPConfigurations)(nil)) state.Register((*timer)(nil)) state.Register((*offLinkRoute)(nil)) state.Register((*ndpState)(nil)) state.Register((*offLinkRouteState)(nil)) state.Register((*onLinkPrefixState)(nil)) state.Register((*tempSLAACAddrState)(nil)) state.Register((*stableAddrState)(nil)) state.Register((*slaacPrefixState)(nil)) state.Register((*Stats)(nil)) state.Register((*sharedStats)(nil)) state.Register((*multiCounterICMPv6PacketStats)(nil)) state.Register((*multiCounterICMPv6SentPacketStats)(nil)) state.Register((*multiCounterICMPv6ReceivedPacketStats)(nil)) state.Register((*multiCounterICMPv6Stats)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv6/mld.go000066400000000000000000000403361465435605700242310ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ipv6 import ( "fmt" "time" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/network/internal/ip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // UnsolicitedReportIntervalMax is the maximum delay between sending // unsolicited MLD reports. // // Obtained from RFC 2710 Section 7.10. UnsolicitedReportIntervalMax = 10 * time.Second ) // MLDVersion is the forced version of MLD. type MLDVersion int const ( _ MLDVersion = iota // MLDVersion1 indicates MLDv1. MLDVersion1 // MLDVersion2 indicates MLDv2. Note that MLD may still fallback to V1 // compatibility mode as required by MLDv2. MLDVersion2 ) // MLDEndpoint is a network endpoint that supports MLD. type MLDEndpoint interface { // SetMLDVersions sets the MLD version. // // Returns the previous MLD version. SetMLDVersion(MLDVersion) MLDVersion // GetMLDVersion returns the MLD version. GetMLDVersion() MLDVersion } // MLDOptions holds options for MLD. // // +stateify savable type MLDOptions struct { // Enabled indicates whether MLD will be performed. // // When enabled, MLD may transmit MLD report and done messages when // joining and leaving multicast groups respectively, and handle incoming // MLD packets. // // This field is ignored and is always assumed to be false for interfaces // without neighbouring nodes (e.g. loopback). Enabled bool } var _ ip.MulticastGroupProtocol = (*mldState)(nil) // mldState is the per-interface MLD state. // // mldState.init MUST be called to initialize the MLD state. // // +stateify savable type mldState struct { // The IPv6 endpoint this mldState is for. ep *endpoint genericMulticastProtocol ip.GenericMulticastProtocolState } // Enabled implements ip.MulticastGroupProtocol. func (mld *mldState) Enabled() bool { // No need to perform MLD on loopback interfaces since they don't have // neighbouring nodes. 
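// MLD must also have been enabled in the protocol options. A minimal sketch
// of doing so, using only names defined in this package:
//
//	NewProtocolWithOptions(Options{
//		MLD: MLDOptions{Enabled: true},
//	})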
return mld.ep.protocol.options.MLD.Enabled && !mld.ep.nic.IsLoopback() && mld.ep.Enabled() } // SendReport implements ip.MulticastGroupProtocol. // // Precondition: mld.ep.mu must be read locked. func (mld *mldState) SendReport(groupAddress tcpip.Address) (bool, tcpip.Error) { return mld.writePacket(groupAddress, groupAddress, header.ICMPv6MulticastListenerReport) } // SendLeave implements ip.MulticastGroupProtocol. // // Precondition: mld.ep.mu must be read locked. func (mld *mldState) SendLeave(groupAddress tcpip.Address) tcpip.Error { _, err := mld.writePacket(header.IPv6AllRoutersLinkLocalMulticastAddress, groupAddress, header.ICMPv6MulticastListenerDone) return err } // ShouldPerformProtocol implements ip.MulticastGroupProtocol. func (mld *mldState) ShouldPerformProtocol(groupAddress tcpip.Address) bool { // As per RFC 2710 section 5 page 10, // // The link-scope all-nodes address (FF02::1) is handled as a special // case. The node starts in Idle Listener state for that address on // every interface, never transitions to another state, and never sends // a Report or Done for that address. // // MLD messages are never sent for multicast addresses whose scope is 0 // (reserved) or 1 (node-local). if groupAddress == header.IPv6AllNodesMulticastAddress { return false } scope := header.V6MulticastScope(groupAddress) return scope != header.IPv6Reserved0MulticastScope && scope != header.IPv6InterfaceLocalMulticastScope } type mldv2ReportBuilder struct { mld *mldState records []header.MLDv2ReportMulticastAddressRecordSerializer } // AddRecord implements ip.MulticastGroupProtocolV2ReportBuilder. func (b *mldv2ReportBuilder) AddRecord(genericRecordType ip.MulticastGroupProtocolV2ReportRecordType, groupAddress tcpip.Address) { var recordType header.MLDv2ReportRecordType switch genericRecordType { case ip.MulticastGroupProtocolV2ReportRecordModeIsInclude: recordType = header.MLDv2ReportRecordModeIsInclude case ip.MulticastGroupProtocolV2ReportRecordModeIsExclude: recordType = header.MLDv2ReportRecordModeIsExclude case ip.MulticastGroupProtocolV2ReportRecordChangeToIncludeMode: recordType = header.MLDv2ReportRecordChangeToIncludeMode case ip.MulticastGroupProtocolV2ReportRecordChangeToExcludeMode: recordType = header.MLDv2ReportRecordChangeToExcludeMode case ip.MulticastGroupProtocolV2ReportRecordAllowNewSources: recordType = header.MLDv2ReportRecordAllowNewSources case ip.MulticastGroupProtocolV2ReportRecordBlockOldSources: recordType = header.MLDv2ReportRecordBlockOldSources default: panic(fmt.Sprintf("unrecognized genericRecordType = %d", genericRecordType)) } b.records = append(b.records, header.MLDv2ReportMulticastAddressRecordSerializer{ RecordType: recordType, MulticastAddress: groupAddress, Sources: nil, }) } // Send implements ip.MulticastGroupProtocolV2ReportBuilder.
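// Send packs the accumulated records greedily: each emitted report carries
// as many multicast address records as fit within the endpoint MTU (less
// the hop-by-hop Router Alert extension header), and additional reports are
// sent for any remaining records. It reports whether every message was sent
// with a specified (link-local) source address and returns the first send
// error encountered, if any.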
func (b *mldv2ReportBuilder) Send() (sent bool, err tcpip.Error) { if len(b.records) == 0 { return false, err } extensionHeaders := header.IPv6ExtHdrSerializer{ header.IPv6SerializableHopByHopExtHdr{ &header.IPv6RouterAlertOption{Value: header.IPv6RouterAlertMLD}, }, } mtu := int(b.mld.ep.MTU()) - extensionHeaders.Length() allSentWithSpecifiedAddress := true var firstErr tcpip.Error for records := b.records; len(records) != 0; { spaceLeft := mtu maxRecords := 0 for ; maxRecords < len(records); maxRecords++ { tmp := spaceLeft - records[maxRecords].Length() if tmp > 0 { spaceLeft = tmp } else { break } } serializer := header.MLDv2ReportSerializer{Records: records[:maxRecords]} records = records[maxRecords:] icmpView := buffer.NewViewSize(header.ICMPv6HeaderSize + serializer.Length()) icmp := header.ICMPv6(icmpView.AsSlice()) serializer.SerializeInto(icmp.MessageBody()) if sentWithSpecifiedAddress, err := b.mld.writePacketInner( icmpView, header.ICMPv6MulticastListenerV2Report, b.mld.ep.stats.icmp.packetsSent.multicastListenerReportV2, extensionHeaders, header.MLDv2RoutersAddress, ); err != nil { if firstErr == nil { firstErr = err } allSentWithSpecifiedAddress = false } else if !sentWithSpecifiedAddress { allSentWithSpecifiedAddress = false } } return allSentWithSpecifiedAddress, firstErr } // NewReportV2Builder implements ip.MulticastGroupProtocol. func (mld *mldState) NewReportV2Builder() ip.MulticastGroupProtocolV2ReportBuilder { return &mldv2ReportBuilder{mld: mld} } // V2QueryMaxRespCodeToV2Delay implements ip.MulticastGroupProtocol. func (*mldState) V2QueryMaxRespCodeToV2Delay(code uint16) time.Duration { return header.MLDv2MaximumResponseDelay(code) } // V2QueryMaxRespCodeToV1Delay implements ip.MulticastGroupProtocol. func (*mldState) V2QueryMaxRespCodeToV1Delay(code uint16) time.Duration { return time.Duration(code) * time.Millisecond } // init sets up an mldState struct, and is required to be called before using // a new mldState. // // Must only be called once for the lifetime of mld. func (mld *mldState) init(ep *endpoint) { mld.ep = ep mld.genericMulticastProtocol.Init(&ep.mu.RWMutex, ip.GenericMulticastProtocolOptions{ Rand: ep.protocol.stack.InsecureRNG(), Clock: ep.protocol.stack.Clock(), Protocol: mld, MaxUnsolicitedReportDelay: UnsolicitedReportIntervalMax, }) } // handleMulticastListenerQuery handles a query message. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) handleMulticastListenerQuery(mldHdr header.MLD) { mld.genericMulticastProtocol.HandleQueryLocked(mldHdr.MulticastAddress(), mldHdr.MaximumResponseDelay()) } // handleMulticastListenerQueryV2 handles a V2 query message. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) handleMulticastListenerQueryV2(mldHdr header.MLDv2Query) { sources, ok := mldHdr.Sources() if !ok { return } mld.genericMulticastProtocol.HandleQueryV2Locked( mldHdr.MulticastAddress(), mldHdr.MaximumResponseCode(), sources, mldHdr.QuerierRobustnessVariable(), mldHdr.QuerierQueryInterval(), ) } // handleMulticastListenerReport handles a report message. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) handleMulticastListenerReport(mldHdr header.MLD) { mld.genericMulticastProtocol.HandleReportLocked(mldHdr.MulticastAddress()) } // joinGroup handles joining a new group and sending and scheduling the required // messages. // // Precondition: mld.ep.mu must be locked.
func (mld *mldState) joinGroup(groupAddress tcpip.Address) { mld.genericMulticastProtocol.JoinGroupLocked(groupAddress) } // isInGroup returns true if the specified group has been joined locally. // // Precondition: mld.ep.mu must be read locked. func (mld *mldState) isInGroup(groupAddress tcpip.Address) bool { return mld.genericMulticastProtocol.IsLocallyJoinedRLocked(groupAddress) } // leaveGroup handles removing the group from the membership map, cancels any // delay timers associated with that group, and sends the Done message, if // required. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) leaveGroup(groupAddress tcpip.Address) tcpip.Error { // LeaveGroup returns false only if the group was not joined. if mld.genericMulticastProtocol.LeaveGroupLocked(groupAddress) { return nil } return &tcpip.ErrBadLocalAddress{} } // softLeaveAll leaves all groups from the perspective of MLD, but remains // joined locally. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) softLeaveAll() { mld.genericMulticastProtocol.MakeAllNonMemberLocked() } // initializeAll attempts to initialize the MLD state for each group that has // been joined locally. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) initializeAll() { mld.genericMulticastProtocol.InitializeGroupsLocked() } // sendQueuedReports attempts to send any reports that are queued for sending. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) sendQueuedReports() { mld.genericMulticastProtocol.SendQueuedReportsLocked() } // setVersion sets the MLD version. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) setVersion(v MLDVersion) MLDVersion { var prev bool switch v { case MLDVersion2: prev = mld.genericMulticastProtocol.SetV1ModeLocked(false) case MLDVersion1: prev = mld.genericMulticastProtocol.SetV1ModeLocked(true) default: panic(fmt.Sprintf("unrecognized version = %d", v)) } return toMLDVersion(prev) } func toMLDVersion(v1Generic bool) MLDVersion { if v1Generic { return MLDVersion1 } return MLDVersion2 } // getVersion returns the MLD version. // // Precondition: mld.ep.mu must be read locked. func (mld *mldState) getVersion() MLDVersion { return toMLDVersion(mld.genericMulticastProtocol.GetV1ModeLocked()) } // writePacket assembles and sends an MLD packet. // // Precondition: mld.ep.mu must be read locked. 
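// The message is an ICMPv6 MLD message of the given type carrying
// groupAddress, sent to destAddress with the MLD hop limit and a hop-by-hop
// Router Alert option; the source is the endpoint's link-local address when
// one is assigned and the unspecified address otherwise (RFC 3590).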
func (mld *mldState) writePacket(destAddress, groupAddress tcpip.Address, mldType header.ICMPv6Type) (bool, tcpip.Error) { sentStats := mld.ep.stats.icmp.packetsSent var mldStat tcpip.MultiCounterStat switch mldType { case header.ICMPv6MulticastListenerReport: mldStat = sentStats.multicastListenerReport case header.ICMPv6MulticastListenerDone: mldStat = sentStats.multicastListenerDone default: panic(fmt.Sprintf("unrecognized mld type = %d", mldType)) } icmpView := buffer.NewViewSize(header.ICMPv6HeaderSize + header.MLDMinimumSize) icmp := header.ICMPv6(icmpView.AsSlice()) header.MLD(icmp.MessageBody()).SetMulticastAddress(groupAddress) extensionHeaders := header.IPv6ExtHdrSerializer{ header.IPv6SerializableHopByHopExtHdr{ &header.IPv6RouterAlertOption{Value: header.IPv6RouterAlertMLD}, }, } return mld.writePacketInner( icmpView, mldType, mldStat, extensionHeaders, destAddress, ) } func (mld *mldState) writePacketInner(buf *buffer.View, mldType header.ICMPv6Type, reportStat tcpip.MultiCounterStat, extensionHeaders header.IPv6ExtHdrSerializer, destAddress tcpip.Address) (bool, tcpip.Error) { icmp := header.ICMPv6(buf.AsSlice()) icmp.SetType(mldType) // As per RFC 2710 section 3, // // All MLD messages described in this document are sent with a link-local // IPv6 Source Address, an IPv6 Hop Limit of 1, and an IPv6 Router Alert // option in a Hop-by-Hop Options header. // // However, this would cause problems with Duplicate Address Detection with // the first address as MLD snooping switches may not send multicast traffic // that DAD depends on to the node performing DAD without the MLD report, as // documented in RFC 4816: // // Note that when a node joins a multicast address, it typically sends a // Multicast Listener Discovery (MLD) report message [RFC2710] [RFC3810] // for the multicast address. In the case of Duplicate Address // Detection, the MLD report message is required in order to inform MLD- // snooping switches, rather than routers, to forward multicast packets. // In the above description, the delay for joining the multicast address // thus means delaying transmission of the corresponding MLD report // message. Since the MLD specifications do not request a random delay // to avoid race conditions, just delaying Neighbor Solicitation would // cause congestion by the MLD report messages. The congestion would // then prevent the MLD-snooping switches from working correctly and, as // a result, prevent Duplicate Address Detection from working. The // requirement to include the delay for the MLD report in this case // avoids this scenario. [RFC3590] also talks about some interaction // issues between Duplicate Address Detection and MLD, and specifies // which source address should be used for the MLD report in this case. // // As per RFC 3590 section 4, we should still send out MLD reports with an // unspecified source address if we do not have an assigned link-local // address to use as the source address to ensure DAD works as expected on // networks with MLD snooping switches: // // MLD Report and Done messages are sent with a link-local address as // the IPv6 source address, if a valid address is available on the // interface. If a valid link-local address is not available (e.g., one // has not been configured), the message is sent with the unspecified // address (::) as the IPv6 source address. // // Once a valid link-local address is available, a node SHOULD generate // new MLD Report messages for all multicast addresses joined on the // interface. 
// // Routers receiving an MLD Report or Done message with the unspecified // address as the IPv6 source address MUST silently discard the packet // without taking any action on the packets contents. // // Snooping switches MUST manage multicast forwarding state based on MLD // Report and Done messages sent with the unspecified address as the // IPv6 source address. localAddress := mld.ep.getLinkLocalAddressRLocked() if localAddress.BitLen() == 0 { localAddress = header.IPv6Any } icmp.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmp, Src: localAddress, Dst: destAddress, })) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(mld.ep.MaxHeaderLength()) + extensionHeaders.Length(), Payload: buffer.MakeWithView(buf), }) defer pkt.DecRef() if err := addIPHeader(localAddress, destAddress, pkt, stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: header.MLDHopLimit, }, extensionHeaders); err != nil { panic(fmt.Sprintf("failed to add IP header: %s", err)) } if err := mld.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv6Address(destAddress), pkt); err != nil { mld.ep.stats.icmp.packetsSent.dropped.Increment() return false, err } reportStat.Increment() return localAddress != header.IPv6Any, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv6/ndp.go000066400000000000000000002150251465435605700242350ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ipv6 import ( "fmt" "time" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/network/internal/ip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // defaultMaxRtrSolicitations is the default number of Router // Solicitation messages to send when an IPv6 endpoint becomes enabled. // // Default = 3 (from RFC 4861 section 10). defaultMaxRtrSolicitations = 3 // defaultRtrSolicitationInterval is the default amount of time between // sending Router Solicitation messages. // // Default = 4s (from 4861 section 10). defaultRtrSolicitationInterval = 4 * time.Second // defaultMaxRtrSolicitationDelay is the default maximum amount of time // to wait before sending the first Router Solicitation message. // // Default = 1s (from 4861 section 10). defaultMaxRtrSolicitationDelay = time.Second // defaultHandleRAs is the default configuration for whether or not to // handle incoming Router Advertisements as a host. defaultHandleRAs = HandlingRAsEnabledWhenForwardingDisabled // defaultDiscoverDefaultRouters is the default configuration for // whether or not to discover default routers from incoming Router // Advertisements, as a host. defaultDiscoverDefaultRouters = true // defaultDiscoverMoreSpecificRoutes is the default configuration for // whether or not to discover more-specific routes from incoming Router // Advertisements, as a host. 
defaultDiscoverMoreSpecificRoutes = true // defaultDiscoverOnLinkPrefixes is the default configuration for // whether or not to discover on-link prefixes from incoming Router // Advertisements' Prefix Information option, as a host. defaultDiscoverOnLinkPrefixes = true // defaultAutoGenGlobalAddresses is the default configuration for // whether or not to generate global IPv6 addresses in response to // receiving a new Prefix Information option with its Autonomous // Address AutoConfiguration flag set, as a host. // // Default = true. defaultAutoGenGlobalAddresses = true // minimumRtrSolicitationInterval is the minimum amount of time to wait // between sending Router Solicitation messages. This limit is imposed // to make sure that Router Solicitation messages are not sent all at // once, defeating the purpose of sending the initial few messages. minimumRtrSolicitationInterval = 500 * time.Millisecond // minimumMaxRtrSolicitationDelay is the minimum amount of time to wait // before sending the first Router Solicitation message. It is 0 because // we cannot have a negative delay. minimumMaxRtrSolicitationDelay = 0 // MaxDiscoveredOffLinkRoutes is the maximum number of discovered off-link // routes. The stack should stop discovering new off-link routes after // this limit is reached. // // This value MUST be at minimum 2 as per RFC 4861 section 6.3.4, and // SHOULD be more. MaxDiscoveredOffLinkRoutes = 10 // MaxDiscoveredOnLinkPrefixes is the maximum number of discovered // on-link prefixes. The stack should stop discovering new on-link // prefixes after discovering MaxDiscoveredOnLinkPrefixes on-link // prefixes. MaxDiscoveredOnLinkPrefixes = 10 // MaxDiscoveredSLAACPrefixes is the maximum number of discovered // SLAAC prefixes. The stack will stop discovering new SLAAC // prefixes after discovering MaxDiscoveredSLAACPrefixes SLAAC prefixes. MaxDiscoveredSLAACPrefixes = 10 // validPrefixLenForAutoGen is the expected prefix length that an // address can be generated for. Must be 64 bits as the interface // identifier (IID) is 64 bits and an IPv6 address is 128 bits, so // 128 - 64 = 64. validPrefixLenForAutoGen = 64 // defaultAutoGenTempGlobalAddresses is the default configuration for whether // or not to generate temporary SLAAC addresses. defaultAutoGenTempGlobalAddresses = true // defaultMaxTempAddrValidLifetime is the default maximum valid lifetime // for temporary SLAAC addresses generated as part of RFC 4941. // // Default = 7 days (from RFC 4941 section 5). defaultMaxTempAddrValidLifetime = 7 * 24 * time.Hour // defaultMaxTempAddrPreferredLifetime is the default preferred lifetime // for temporary SLAAC addresses generated as part of RFC 4941. // // Default = 1 day (from RFC 4941 section 5). defaultMaxTempAddrPreferredLifetime = 24 * time.Hour // defaultRegenAdvanceDuration is the default duration before the deprecation // of a temporary address when a new address will be generated. // // Default = 5s (from RFC 4941 section 5). defaultRegenAdvanceDuration = 5 * time.Second // minRegenAdvanceDuration is the minimum duration before the deprecation // of a temporary address when a new address will be generated. minRegenAdvanceDuration = time.Duration(0) // maxSLAACAddrLocalRegenAttempts is the maximum number of times to attempt // SLAAC address regenerations in response to an IPv6 endpoint-local conflict. maxSLAACAddrLocalRegenAttempts = 10 // MinPrefixInformationValidLifetimeForUpdate is the minimum Valid // Lifetime to update the valid lifetime of a generated address by // SLAAC. 
// // Min = 2hrs. MinPrefixInformationValidLifetimeForUpdate = 2 * time.Hour // MaxDesyncFactor is the upper bound for the preferred lifetime's desync // factor for temporary SLAAC addresses. // // Must be greater than 0. // // Max = 10m (from RFC 4941 section 5). MaxDesyncFactor = 10 * time.Minute // MinMaxTempAddrPreferredLifetime is the minimum value allowed for the // maximum preferred lifetime for temporary SLAAC addresses. // // This value guarantees that a temporary address is preferred for at // least 1hr if the SLAAC prefix is valid for at least that time. MinMaxTempAddrPreferredLifetime = defaultRegenAdvanceDuration + MaxDesyncFactor + time.Hour // MinMaxTempAddrValidLifetime is the minimum value allowed for the // maximum valid lifetime for temporary SLAAC addresses. // // This value guarantees that a temporary address is valid for at least // 2hrs if the SLAAC prefix is valid for at least that time. MinMaxTempAddrValidLifetime = 2 * time.Hour ) // NDPEndpoint is an endpoint that supports NDP. type NDPEndpoint interface { // SetNDPConfigurations sets the NDP configurations. SetNDPConfigurations(NDPConfigurations) } // DHCPv6ConfigurationFromNDPRA is a configuration available via DHCPv6 that an // NDP Router Advertisement informed the Stack about. type DHCPv6ConfigurationFromNDPRA int const ( _ DHCPv6ConfigurationFromNDPRA = iota // DHCPv6NoConfiguration indicates that no configurations are available via // DHCPv6. DHCPv6NoConfiguration // DHCPv6ManagedAddress indicates that addresses are available via DHCPv6. // // DHCPv6ManagedAddress also implies DHCPv6OtherConfigurations because DHCPv6 // returns all available configuration information when serving addresses. DHCPv6ManagedAddress // DHCPv6OtherConfigurations indicates that other configuration information is // available via DHCPv6. // // Other configurations are configurations other than addresses. Examples of // other configurations are recursive DNS server list, DNS search lists and // default gateway. DHCPv6OtherConfigurations ) // NDPDispatcher is the interface integrators of netstack must implement to // receive and handle NDP related events. type NDPDispatcher interface { // OnDuplicateAddressDetectionResult is called when the DAD process for an // address on a NIC completes. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. OnDuplicateAddressDetectionResult(tcpip.NICID, tcpip.Address, stack.DADResult) // OnOffLinkRouteUpdated is called when an off-link route is updated. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. OnOffLinkRouteUpdated(tcpip.NICID, tcpip.Subnet, tcpip.Address, header.NDPRoutePreference) // OnOffLinkRouteInvalidated is called when an off-link route is invalidated. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. OnOffLinkRouteInvalidated(tcpip.NICID, tcpip.Subnet, tcpip.Address) // OnOnLinkPrefixDiscovered is called when a new on-link prefix is discovered. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) // OnOnLinkPrefixInvalidated is called when a discovered on-link prefix that // was remembered is invalidated. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. 
OnOnLinkPrefixInvalidated(tcpip.NICID, tcpip.Subnet) // OnAutoGenAddress is called when a new prefix with its autonomous address- // configuration flag set is received and SLAAC was performed. // // This function is not permitted to block indefinitely. It must not // call functions on the stack itself. // // If a non-nil AddressDispatcher is returned, events related to the address // will be sent to the dispatcher. OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) stack.AddressDispatcher // OnAutoGenAddressDeprecated is called when an auto-generated address (SLAAC) // is deprecated, but is still considered valid. Note, if an address is // invalidated at the same time it is deprecated, the deprecation event may // not be received. // // This function is not permitted to block indefinitely. It must not // call functions on the stack itself. OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix) // OnAutoGenAddressInvalidated is called when an auto-generated address // (SLAAC) is invalidated. // // This function is not permitted to block indefinitely. It must not // call functions on the stack itself. OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix) // OnRecursiveDNSServerOption is called when the stack learns of DNS servers // through NDP. Note, the addresses may contain link-local addresses. // // It is up to the caller to use the DNS Servers only for their valid // lifetime. OnRecursiveDNSServerOption may be called for new or // already known DNS servers. If called with known DNS servers, their // valid lifetimes must be refreshed to the lifetime (it may be increased, // decreased, or completely invalidated when the lifetime = 0). // // This function is not permitted to block indefinitely. It must not // call functions on the stack itself. OnRecursiveDNSServerOption(tcpip.NICID, []tcpip.Address, time.Duration) // OnDNSSearchListOption is called when the stack learns of DNS search lists // through NDP. // // It is up to the caller to use the domain names in the search list // for only their valid lifetime. OnDNSSearchListOption may be called // with new or already known domain names. If called with known domain // names, their valid lifetimes must be refreshed to the lifetime (it may // be increased, decreased or completely invalidated when the lifetime = 0. OnDNSSearchListOption(tcpip.NICID, []string, time.Duration) // OnDHCPv6Configuration is called with an updated configuration that is // available via DHCPv6 for the passed NIC. // // This function is not permitted to block indefinitely. It must not // call functions on the stack itself. OnDHCPv6Configuration(tcpip.NICID, DHCPv6ConfigurationFromNDPRA) } var _ fmt.Stringer = HandleRAsConfiguration(0) // HandleRAsConfiguration enumerates when RAs may be handled. type HandleRAsConfiguration int const ( // HandlingRAsDisabled indicates that Router Advertisements will not be // handled. HandlingRAsDisabled HandleRAsConfiguration = iota // HandlingRAsEnabledWhenForwardingDisabled indicates that router // advertisements will only be handled when forwarding is disabled. HandlingRAsEnabledWhenForwardingDisabled // HandlingRAsAlwaysEnabled indicates that Router Advertisements will always // be handled, even when forwarding is enabled. HandlingRAsAlwaysEnabled ) // String implements fmt.Stringer. 
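// For reference, the effective behavior of each value given the interface's
// forwarding state (this mirrors the enabled method defined after String
// below):
//
//	                                          forwarding=false  forwarding=true
//	HandlingRAsDisabled                       ignore RAs        ignore RAs
//	HandlingRAsEnabledWhenForwardingDisabled  handle RAs        ignore RAs
//	HandlingRAsAlwaysEnabled                  handle RAs        handle RAs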
func (c HandleRAsConfiguration) String() string { switch c { case HandlingRAsDisabled: return "HandlingRAsDisabled" case HandlingRAsEnabledWhenForwardingDisabled: return "HandlingRAsEnabledWhenForwardingDisabled" case HandlingRAsAlwaysEnabled: return "HandlingRAsAlwaysEnabled" default: return fmt.Sprintf("HandleRAsConfiguration(%d)", c) } } // enabled returns true iff Router Advertisements may be handled given the // specified forwarding status. func (c HandleRAsConfiguration) enabled(forwarding bool) bool { switch c { case HandlingRAsDisabled: return false case HandlingRAsEnabledWhenForwardingDisabled: return !forwarding case HandlingRAsAlwaysEnabled: return true default: panic(fmt.Sprintf("unhandled HandleRAsConfiguration = %d", c)) } } // NDPConfigurations is the NDP configurations for the netstack. // // +stateify savable type NDPConfigurations struct { // The number of Router Solicitation messages to send when the IPv6 endpoint // becomes enabled. // // Ignored unless configured to handle Router Advertisements. MaxRtrSolicitations uint8 // The amount of time between transmitting Router Solicitation messages. // // Must be greater than or equal to 0.5s. RtrSolicitationInterval time.Duration // The maximum amount of time before transmitting the first Router // Solicitation message. // // Must be greater than or equal to 0s. MaxRtrSolicitationDelay time.Duration // HandleRAs is the configuration for when Router Advertisements should be // handled. HandleRAs HandleRAsConfiguration // DiscoverDefaultRouters determines whether or not default routers are // discovered from Router Advertisements, as per RFC 4861 section 6. This // configuration is ignored if RAs will not be processed (see HandleRAs). DiscoverDefaultRouters bool // DiscoverMoreSpecificRoutes determines whether or not more specific routes // are discovered from Router Advertisements, as per RFC 4191. This // configuration is ignored if RAs will not be processed (see HandleRAs). DiscoverMoreSpecificRoutes bool // DiscoverOnLinkPrefixes determines whether or not on-link prefixes are // discovered from Router Advertisements' Prefix Information option, as per // RFC 4861 section 6. This configuration is ignored if RAs will not be // processed (see HandleRAs). DiscoverOnLinkPrefixes bool // AutoGenGlobalAddresses determines whether or not an IPv6 endpoint performs // SLAAC to auto-generate global SLAAC addresses in response to Prefix // Information options, as per RFC 4862. // // Note, if an address was already generated for some unique prefix, as // part of SLAAC, this option does not affect whether or not the // lifetime(s) of the generated address changes; this option only // affects the generation of new addresses as part of SLAAC. AutoGenGlobalAddresses bool // AutoGenAddressConflictRetries determines how many times to attempt to retry // generation of a permanent auto-generated address in response to DAD // conflicts. // // If the method used to generate the address does not support creating // alternative addresses (e.g. IIDs based on the modified EUI64 of a NIC's // MAC address), then no attempt is made to resolve the conflict. AutoGenAddressConflictRetries uint8 // AutoGenTempGlobalAddresses determines whether or not temporary SLAAC // addresses are generated for an IPv6 endpoint as part of SLAAC privacy // extensions, as per RFC 4941. // // Ignored if AutoGenGlobalAddresses is false. AutoGenTempGlobalAddresses bool // MaxTempAddrValidLifetime is the maximum valid lifetime for temporary // SLAAC addresses. 
MaxTempAddrValidLifetime time.Duration // MaxTempAddrPreferredLifetime is the maximum preferred lifetime for // temporary SLAAC addresses. MaxTempAddrPreferredLifetime time.Duration // RegenAdvanceDuration is the duration before the deprecation of a temporary // address when a new address will be generated. RegenAdvanceDuration time.Duration } // DefaultNDPConfigurations returns an NDPConfigurations populated with // default values. func DefaultNDPConfigurations() NDPConfigurations { return NDPConfigurations{ MaxRtrSolicitations: defaultMaxRtrSolicitations, RtrSolicitationInterval: defaultRtrSolicitationInterval, MaxRtrSolicitationDelay: defaultMaxRtrSolicitationDelay, HandleRAs: defaultHandleRAs, DiscoverDefaultRouters: defaultDiscoverDefaultRouters, DiscoverMoreSpecificRoutes: defaultDiscoverMoreSpecificRoutes, DiscoverOnLinkPrefixes: defaultDiscoverOnLinkPrefixes, AutoGenGlobalAddresses: defaultAutoGenGlobalAddresses, AutoGenTempGlobalAddresses: defaultAutoGenTempGlobalAddresses, MaxTempAddrValidLifetime: defaultMaxTempAddrValidLifetime, MaxTempAddrPreferredLifetime: defaultMaxTempAddrPreferredLifetime, RegenAdvanceDuration: defaultRegenAdvanceDuration, } } // validate modifies an NDPConfigurations with valid values. If invalid values // are present in c, the corresponding default values are used instead. func (c *NDPConfigurations) validate() { if c.RtrSolicitationInterval < minimumRtrSolicitationInterval { c.RtrSolicitationInterval = defaultRtrSolicitationInterval } if c.MaxRtrSolicitationDelay < minimumMaxRtrSolicitationDelay { c.MaxRtrSolicitationDelay = defaultMaxRtrSolicitationDelay } if c.MaxTempAddrValidLifetime < MinMaxTempAddrValidLifetime { c.MaxTempAddrValidLifetime = MinMaxTempAddrValidLifetime } if c.MaxTempAddrPreferredLifetime < MinMaxTempAddrPreferredLifetime || c.MaxTempAddrPreferredLifetime > c.MaxTempAddrValidLifetime { c.MaxTempAddrPreferredLifetime = MinMaxTempAddrPreferredLifetime } if c.RegenAdvanceDuration < minRegenAdvanceDuration { c.RegenAdvanceDuration = minRegenAdvanceDuration } } // +stateify savable type timer struct { // done indicates to the timer that the timer was stopped. done *bool timer tcpip.Timer } // +stateify savable type offLinkRoute struct { dest tcpip.Subnet router tcpip.Address } // ndpState is the per-Interface NDP state. // // +stateify savable type ndpState struct { // Do not allow overwriting this state. _ sync.NoCopy `state:"nosave"` // The IPv6 endpoint this ndpState is for. ep *endpoint // configs is the per-interface NDP configurations. configs NDPConfigurations // The DAD timers to send the next NS message, or resolve the address. dad ip.DAD // The off-link routes discovered through Router Advertisements. offLinkRoutes map[offLinkRoute]offLinkRouteState // rtrSolicitTimer is the timer used to send the next router solicitation // message. // // rtrSolicitTimer is the zero value when NDP is not soliciting routers. rtrSolicitTimer timer // The on-link prefixes discovered through Router Advertisements' Prefix // Information option. onLinkPrefixes map[tcpip.Subnet]onLinkPrefixState // The SLAAC prefixes discovered through Router Advertisements' Prefix // Information option. slaacPrefixes map[tcpip.Subnet]slaacPrefixState // The last learned DHCPv6 configuration from an NDP RA. dhcpv6Configuration DHCPv6ConfigurationFromNDPRA // temporaryIIDHistory is the history value used to generate a new temporary // IID. 
temporaryIIDHistory [header.IIDSize]byte // temporaryAddressDesyncFactor is the preferred lifetime's desync factor for // temporary SLAAC addresses. temporaryAddressDesyncFactor time.Duration } // offLinkRouteState holds data associated with an off-link route discovered by // a Router Advertisement (RA). // // +stateify savable type offLinkRouteState struct { prf header.NDPRoutePreference // Job to invalidate the route. // // Must not be nil. invalidationJob *tcpip.Job } // onLinkPrefixState holds data associated with an on-link prefix discovered by // a Router Advertisement's Prefix Information option (PI) when the NDP // configurations was configured to do so. // // +stateify savable type onLinkPrefixState struct { // Job to invalidate the on-link prefix. // // Must not be nil. invalidationJob *tcpip.Job } // tempSLAACAddrState holds state associated with a temporary SLAAC address. // // +stateify savable type tempSLAACAddrState struct { // Job to deprecate the temporary SLAAC address. // // Must not be nil. deprecationJob *tcpip.Job // Job to invalidate the temporary SLAAC address. // // Must not be nil. invalidationJob *tcpip.Job // Job to regenerate the temporary SLAAC address. // // Must not be nil. regenJob *tcpip.Job createdAt tcpip.MonotonicTime // The address's endpoint. // // Must not be nil. addressEndpoint stack.AddressEndpoint // Has a new temporary SLAAC address already been regenerated? regenerated bool } // +stateify savable type stableAddrState struct { // The address's endpoint. // // May only be nil when the address is being (re-)generated. Otherwise, // must not be nil as all SLAAC prefixes must have a stable address. addressEndpoint stack.AddressEndpoint // The number of times an address has been generated locally where the IPv6 // endpoint already had the generated address. localGenerationFailures uint8 } // slaacPrefixState holds state associated with a SLAAC prefix. // // +stateify savable type slaacPrefixState struct { // Job to deprecate the prefix. // // Must not be nil. deprecationJob *tcpip.Job // Job to invalidate the prefix. // // Must not be nil. invalidationJob *tcpip.Job // nil iff the address is valid forever. validUntil *tcpip.MonotonicTime // nil iff the address is preferred forever. preferredUntil *tcpip.MonotonicTime // State associated with the stable address generated for the prefix. stableAddr stableAddrState // The temporary (short-lived) addresses generated for the SLAAC prefix. tempAddrs map[tcpip.Address]tempSLAACAddrState // The next two fields are used by both stable and temporary addresses // generated for a SLAAC prefix. This is safe as only 1 address is in the // generation and DAD process at any time. That is, no two addresses are // generated at the same time for a given SLAAC prefix. // The number of times an address has been generated and added to the IPv6 // endpoint. // // Addresses may be regenerated in reseponse to a DAD conflicts. generationAttempts uint8 // The maximum number of times to attempt regeneration of a SLAAC address // in response to DAD conflicts. maxGenerationAttempts uint8 } // startDuplicateAddressDetection performs Duplicate Address Detection. // // This function must only be called by IPv6 addresses that are currently // tentative. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, addressEndpoint stack.AddressEndpoint) tcpip.Error { // addr must be a valid unicast IPv6 address. 
if !header.IsV6UnicastAddress(addr) { return &tcpip.ErrAddressFamilyNotSupported{} } if addressEndpoint.GetKind() != stack.PermanentTentative { // The endpoint should be marked as tentative since we are starting DAD. panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.ep.nic.ID())) } ret := ndp.dad.CheckDuplicateAddressLocked(addr, func(r stack.DADResult) { if addressEndpoint.GetKind() != stack.PermanentTentative { // The endpoint should still be marked as tentative since we are still // performing DAD on it. panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.ep.nic.ID())) } var dadSucceeded bool switch r.(type) { case *stack.DADAborted, *stack.DADError, *stack.DADDupAddrDetected: dadSucceeded = false case *stack.DADSucceeded: dadSucceeded = true default: panic(fmt.Sprintf("unrecognized DAD result = %T", r)) } if dadSucceeded { addressEndpoint.SetKind(stack.Permanent) } if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnDuplicateAddressDetectionResult(ndp.ep.nic.ID(), addr, r) } if dadSucceeded { if addressEndpoint.ConfigType() == stack.AddressConfigSlaac && !addressEndpoint.Temporary() { // Reset the generation attempts counter as we are starting the // generation of a new address for the SLAAC prefix. ndp.regenerateTempSLAACAddr(addressEndpoint.AddressWithPrefix().Subnet(), true /* resetGenAttempts */) } ndp.ep.onAddressAssignedLocked(addr) } }) switch ret { case stack.DADStarting: case stack.DADAlreadyRunning: panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.ep.nic.ID())) case stack.DADDisabled: addressEndpoint.SetKind(stack.Permanent) // Consider DAD to have resolved even if no DAD messages were actually // transmitted. if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnDuplicateAddressDetectionResult(ndp.ep.nic.ID(), addr, &stack.DADSucceeded{}) } ndp.ep.onAddressAssignedLocked(addr) } return nil } // stopDuplicateAddressDetection ends a running Duplicate Address Detection // process. Note, this may leave the DAD process for a tentative address in // such a state forever, unless some other external event resolves the DAD // process (receiving an NA from the true owner of addr, or an NS for addr // (implying another node is attempting to use addr)). It is up to the caller // of this function to handle such a scenario. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address, reason stack.DADResult) { ndp.dad.StopLocked(addr, reason) } // handleRA handles a Router Advertisement message that arrived on the NIC // this ndp is for. Does nothing if the NIC is configured to not handle RAs. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { // Is the IPv6 endpoint configured to handle RAs at all? // // Currently, the stack does not determine router interface status on a // per-interface basis; it is a protocol-wide configuration, so we check the // protocol's forwarding flag to determine if the IPv6 endpoint is forwarding // packets. if !ndp.configs.HandleRAs.enabled(ndp.ep.Forwarding()) { ndp.ep.stats.localStats.UnhandledRouterAdvertisements.Increment() return } // Only worry about the DHCPv6 configuration if we have an NDPDispatcher as we // only inform the dispatcher on configuration changes. We do nothing else // with the information. 
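// The switch below maps the RA's Managed (M) and Other (O) flags to a
// DHCPv6ConfigurationFromNDPRA value. The M flag is checked first because,
// when DHCPv6 serves addresses, it also serves the other configuration
// information, so DHCPv6ManagedAddress subsumes DHCPv6OtherConfigurations.
// Roughly:
//
//	M=1      -> DHCPv6ManagedAddress
//	M=0, O=1 -> DHCPv6OtherConfigurations
//	M=0, O=0 -> DHCPv6NoConfiguration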
if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { var configuration DHCPv6ConfigurationFromNDPRA switch { case ra.ManagedAddrConfFlag(): configuration = DHCPv6ManagedAddress case ra.OtherConfFlag(): configuration = DHCPv6OtherConfigurations default: configuration = DHCPv6NoConfiguration } if ndp.dhcpv6Configuration != configuration { ndp.dhcpv6Configuration = configuration ndpDisp.OnDHCPv6Configuration(ndp.ep.nic.ID(), configuration) } } // Is the IPv6 endpoint configured to discover default routers? if ndp.configs.DiscoverDefaultRouters { prf := ra.DefaultRouterPreference() if prf == header.ReservedRoutePreference { // As per RFC 4191 section 2.2, // // Prf (Default Router Preference) // // If the Reserved (10) value is received, the receiver MUST treat the // value as if it were (00). // // Note that the value 00 is the medium (default) router preference value. prf = header.MediumRoutePreference } // We represent default routers with a default (off-link) route through the // router. ndp.handleOffLinkRouteDiscovery(offLinkRoute{dest: header.IPv6EmptySubnet, router: ip}, ra.RouterLifetime(), prf) } // TODO(b/141556115): Do (RetransTimer, ReachableTime)) Parameter // Discovery. // We know the options is valid as far as wire format is concerned since // we got the Router Advertisement, as documented by this fn. Given this // we do not check the iterator for errors on calls to Next. it, _ := ra.Options().Iter(false) for opt, done, _ := it.Next(); !done; opt, done, _ = it.Next() { switch opt := opt.(type) { case header.NDPRecursiveDNSServer: if ndp.ep.protocol.options.NDPDisp == nil { continue } addrs, _ := opt.Addresses() ndp.ep.protocol.options.NDPDisp.OnRecursiveDNSServerOption(ndp.ep.nic.ID(), addrs, opt.Lifetime()) case header.NDPDNSSearchList: if ndp.ep.protocol.options.NDPDisp == nil { continue } domainNames, _ := opt.DomainNames() ndp.ep.protocol.options.NDPDisp.OnDNSSearchListOption(ndp.ep.nic.ID(), domainNames, opt.Lifetime()) case header.NDPPrefixInformation: prefix := opt.Subnet() // Is the prefix a link-local? if header.IsV6LinkLocalUnicastAddress(prefix.ID()) { // ...Yes, skip as per RFC 4861 section 6.3.4, // and RFC 4862 section 5.5.3.b (for SLAAC). continue } // Is the Prefix Length 0? if prefix.Prefix() == 0 { // ...Yes, skip as this is an invalid prefix // as all IPv6 addresses cannot be on-link. continue } if opt.OnLinkFlag() { ndp.handleOnLinkPrefixInformation(opt) } if opt.AutonomousAddressConfigurationFlag() { ndp.handleAutonomousPrefixInformation(opt) } case header.NDPRouteInformation: if !ndp.configs.DiscoverMoreSpecificRoutes { continue } dest, err := opt.Prefix() if err != nil { panic(fmt.Sprintf("%T.Prefix(): %s", opt, err)) } prf := opt.RoutePreference() if prf == header.ReservedRoutePreference { // As per RFC 4191 section 2.3, // // Prf (Route Preference) // 2-bit signed integer. The Route Preference indicates // whether to prefer the router associated with this prefix // over others, when multiple identical prefixes (for // different routers) have been received. If the Reserved // (10) value is received, the Route Information Option MUST // be ignored. continue } ndp.handleOffLinkRouteDiscovery(offLinkRoute{dest: dest, router: ip}, opt.RouteLifetime(), prf) } // TODO(b/141556115): Do (MTU) Parameter Discovery. } } // invalidateOffLinkRoute invalidates a discovered off-link route. // // The IPv6 endpoint that ndp belongs to MUST be locked. 
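// For orientation, a discovered off-link route follows a simple lifecycle
// driven by the route lifetime carried in Router Advertisements (see
// handleOffLinkRouteDiscovery below):
//
//	unknown route, lifetime > 0:  remember it (subject to
//	                              MaxDiscoveredOffLinkRoutes) and schedule
//	                              invalidation after the lifetime.
//	known route,   lifetime > 0:  reschedule invalidation and report any
//	                              preference change to the dispatcher.
//	known route,   lifetime == 0: invalidate immediately.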
func (ndp *ndpState) invalidateOffLinkRoute(route offLinkRoute) { state, ok := ndp.offLinkRoutes[route] if !ok { return } state.invalidationJob.Cancel() delete(ndp.offLinkRoutes, route) // Let the integrator know a discovered off-link route is invalidated. if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnOffLinkRouteInvalidated(ndp.ep.nic.ID(), route.dest, route.router) } } // handleOffLinkRouteDiscovery handles the discovery of an off-link route. // // Precondition: ndp.ep.mu must be locked. func (ndp *ndpState) handleOffLinkRouteDiscovery(route offLinkRoute, lifetime time.Duration, prf header.NDPRoutePreference) { ndpDisp := ndp.ep.protocol.options.NDPDisp if ndpDisp == nil { return } state, ok := ndp.offLinkRoutes[route] switch { case !ok && lifetime != 0: // This is a new route we are discovering. // // Only remember it if we currently know about less than // MaxDiscoveredOffLinkRoutes routers. if len(ndp.offLinkRoutes) < MaxDiscoveredOffLinkRoutes { // Inform the integrator when we discovered an off-link route. ndpDisp.OnOffLinkRouteUpdated(ndp.ep.nic.ID(), route.dest, route.router, prf) state := offLinkRouteState{ prf: prf, invalidationJob: tcpip.NewJob(ndp.ep.protocol.stack.Clock(), &ndp.ep.mu, func() { ndp.invalidateOffLinkRoute(route) }), } state.invalidationJob.Schedule(lifetime) ndp.offLinkRoutes[route] = state } case ok && lifetime != 0: // This is an already discovered off-link route. Update the lifetime. state.invalidationJob.Cancel() state.invalidationJob.Schedule(lifetime) if prf != state.prf { state.prf = prf // Inform the integrator about route preference updates. ndpDisp.OnOffLinkRouteUpdated(ndp.ep.nic.ID(), route.dest, route.router, prf) } ndp.offLinkRoutes[route] = state case ok && lifetime == 0: // The already discovered off-link route is no longer considered valid so we // invalidate it immediately. ndp.invalidateOffLinkRoute(route) } } // rememberOnLinkPrefix remembers a newly discovered on-link prefix with IPv6 // address with prefix prefix with lifetime l. // // The prefix identified by prefix MUST NOT already be known. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) { ndpDisp := ndp.ep.protocol.options.NDPDisp if ndpDisp == nil { return } // Inform the integrator when we discovered an on-link prefix. ndpDisp.OnOnLinkPrefixDiscovered(ndp.ep.nic.ID(), prefix) state := onLinkPrefixState{ invalidationJob: tcpip.NewJob(ndp.ep.protocol.stack.Clock(), &ndp.ep.mu, func() { ndp.invalidateOnLinkPrefix(prefix) }), } if l < header.NDPInfiniteLifetime { state.invalidationJob.Schedule(l) } ndp.onLinkPrefixes[prefix] = state } // invalidateOnLinkPrefix invalidates a discovered on-link prefix. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) { s, ok := ndp.onLinkPrefixes[prefix] // Is the on-link prefix still discovered? if !ok { // ...Nope, do nothing further. return } s.invalidationJob.Cancel() delete(ndp.onLinkPrefixes, prefix) // Let the integrator know a discovered on-link prefix is invalidated. if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnOnLinkPrefixInvalidated(ndp.ep.nic.ID(), prefix) } } // handleOnLinkPrefixInformation handles a Prefix Information option with // its on-link flag set, as per RFC 4861 section 6.3.4. // // handleOnLinkPrefixInformation assumes that the prefix this pi is for is // not the link-local prefix and the on-link flag is set. 
// // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformation) { prefix := pi.Subnet() prefixState, ok := ndp.onLinkPrefixes[prefix] vl := pi.ValidLifetime() if !ok && vl == 0 { // Don't know about this prefix but it has a zero valid // lifetime, so just ignore. return } if !ok && vl != 0 { // This is a new on-link prefix we are discovering // // Only remember it if we currently know about less than // MaxDiscoveredOnLinkPrefixes on-link prefixes. if ndp.configs.DiscoverOnLinkPrefixes && len(ndp.onLinkPrefixes) < MaxDiscoveredOnLinkPrefixes { ndp.rememberOnLinkPrefix(prefix, vl) } return } if ok && vl == 0 { // We know about the on-link prefix, but it is // no longer to be considered on-link, so // invalidate it. ndp.invalidateOnLinkPrefix(prefix) return } // This is an already discovered on-link prefix with a // new non-zero valid lifetime. // // Update the invalidation job. prefixState.invalidationJob.Cancel() if vl < header.NDPInfiniteLifetime { // Prefix is valid for a finite lifetime, schedule the job to execute after // the new valid lifetime. prefixState.invalidationJob.Schedule(vl) } ndp.onLinkPrefixes[prefix] = prefixState } // handleAutonomousPrefixInformation handles a Prefix Information option with // its autonomous flag set, as per RFC 4862 section 5.5.3. // // handleAutonomousPrefixInformation assumes that the prefix this pi is for is // not the link-local prefix and the autonomous flag is set. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInformation) { vl := pi.ValidLifetime() pl := pi.PreferredLifetime() // If the preferred lifetime is greater than the valid lifetime, // silently ignore the Prefix Information option, as per RFC 4862 // section 5.5.3.c. if pl > vl { return } prefix := pi.Subnet() // Check if we already maintain SLAAC state for prefix. if state, ok := ndp.slaacPrefixes[prefix]; ok { // As per RFC 4862 section 5.5.3.e, refresh prefix's SLAAC lifetimes. ndp.refreshSLAACPrefixLifetimes(prefix, &state, pl, vl) ndp.slaacPrefixes[prefix] = state return } // prefix is a new SLAAC prefix. Do the work as outlined by RFC 4862 section // 5.5.3.d if ndp is configured to auto-generate new addresses via SLAAC. if !ndp.configs.AutoGenGlobalAddresses { return } // Limit the number of discovered SLAAC prefixes. if len(ndp.slaacPrefixes) == MaxDiscoveredSLAACPrefixes { return } ndp.doSLAAC(prefix, pl, vl) } // doSLAAC generates a new SLAAC address with the provided lifetimes // for prefix. // // pl is the new preferred lifetime. vl is the new valid lifetime. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { // If we do not already have an address for this prefix and the valid // lifetime is 0, no need to do anything further, as per RFC 4862 // section 5.5.3.d. if vl == 0 { return } // Make sure the prefix is valid (as far as its length is concerned) to // generate a valid IPv6 address from an interface identifier (IID), as // per RFC 4862 sectiion 5.5.3.d. 
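// For example, for an advertised prefix 2001:db8:a:b::/64 the generated
// address is the 64-bit prefix followed by a 64-bit IID, e.g.
// 2001:db8:a:b:<IID>/64. A /56 or /72 prefix cannot be combined with a
// 64-bit IID to form a 128-bit address, so such prefixes are skipped here.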
if prefix.Prefix() != validPrefixLenForAutoGen { return } state := slaacPrefixState{ deprecationJob: tcpip.NewJob(ndp.ep.protocol.stack.Clock(), &ndp.ep.mu, func() { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the deprecated SLAAC prefix %s", prefix)) } ndp.deprecateSLAACAddress(state.stableAddr.addressEndpoint) }), invalidationJob: tcpip.NewJob(ndp.ep.protocol.stack.Clock(), &ndp.ep.mu, func() { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the invalidated SLAAC prefix %s", prefix)) } ndp.invalidateSLAACPrefix(prefix, state) }), tempAddrs: make(map[tcpip.Address]tempSLAACAddrState), maxGenerationAttempts: ndp.configs.AutoGenAddressConflictRetries + 1, } now := ndp.ep.protocol.stack.Clock().NowMonotonic() // The time an address is preferred until is needed to properly generate the // address. if pl < header.NDPInfiniteLifetime { t := now.Add(pl) state.preferredUntil = &t } // The time at which an address is invalidated is exposed as a property of the // address. if vl < header.NDPInfiniteLifetime { t := now.Add(vl) state.validUntil = &t } if !ndp.generateSLAACAddr(prefix, &state) { // We were unable to generate an address for the prefix, we do not nothing // further as there is no reason to maintain state or jobs for a prefix we // do not have an address for. return } // Setup the initial jobs to deprecate and invalidate prefix. if pl < header.NDPInfiniteLifetime && pl != 0 { state.deprecationJob.Schedule(pl) } if vl < header.NDPInfiniteLifetime { state.invalidationJob.Schedule(vl) } // If the address is assigned (DAD resolved), generate a temporary address. if state.stableAddr.addressEndpoint.GetKind() == stack.Permanent { // Reset the generation attempts counter as we are starting the generation // of a new address for the SLAAC prefix. ndp.generateTempSLAACAddr(prefix, &state, true /* resetGenAttempts */) } ndp.slaacPrefixes[prefix] = state } // addAndAcquireSLAACAddr adds a SLAAC address to the IPv6 endpoint. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) addAndAcquireSLAACAddr(addr tcpip.AddressWithPrefix, temporary bool, lifetimes stack.AddressLifetimes) stack.AddressEndpoint { addressEndpoint, err := ndp.ep.addAndAcquirePermanentAddressLocked(addr, stack.AddressProperties{ PEB: stack.FirstPrimaryEndpoint, ConfigType: stack.AddressConfigSlaac, Lifetimes: lifetimes, Temporary: temporary, }) if err != nil { panic(fmt.Sprintf("ndp: error when adding SLAAC address %+v: %s", addr, err)) } // Inform the integrator that we have a new SLAAC address. if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { if disp := ndpDisp.OnAutoGenAddress(ndp.ep.nic.ID(), addr); disp != nil { addressEndpoint.RegisterDispatcher(disp) } } return addressEndpoint } // generateSLAACAddr generates a SLAAC address for prefix. // // Returns true if an address was successfully generated. // // Panics if the prefix is not a SLAAC prefix or it already has an address. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixState) bool { if addressEndpoint := state.stableAddr.addressEndpoint; addressEndpoint != nil { panic(fmt.Sprintf("ndp: SLAAC prefix %s already has a permanent address %s", prefix, addressEndpoint.AddressWithPrefix())) } // If we have already reached the maximum address generation attempts for the // prefix, do not generate another address. 
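// As a point of reference for the modified EUI-64 path further below
// (example values only): a NIC with MAC address 02:aa:bb:cc:dd:ee yields the
// IID 00aa:bbff:fecc:ddee (ff:fe is inserted in the middle and the
// universal/local bit of the first octet is flipped), so with prefix
// 2001:db8::/64 the stable address would be 2001:db8::aa:bbff:fecc:ddee.
// When OpaqueIIDOpts.NICNameFromID is set, opaque IIDs are used instead,
// which also makes it possible to generate a different address after a DAD
// conflict.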
if state.generationAttempts == state.maxGenerationAttempts { return false } var generatedAddr tcpip.AddressWithPrefix prefixID := prefix.ID() addrBytes := prefixID.AsSlice() for i := 0; ; i++ { // If we were unable to generate an address after the maximum SLAAC address // local regeneration attempts, do nothing further. if i == maxSLAACAddrLocalRegenAttempts { return false } dadCounter := state.generationAttempts + state.stableAddr.localGenerationFailures if oIID := ndp.ep.protocol.options.OpaqueIIDOpts; oIID.NICNameFromID != nil { addrBytes = header.AppendOpaqueInterfaceIdentifier( addrBytes[:header.IIDOffsetInIPv6Address], prefix, oIID.NICNameFromID(ndp.ep.nic.ID(), ndp.ep.nic.Name()), dadCounter, oIID.SecretKey, ) } else if dadCounter == 0 { // Modified-EUI64 based IIDs have no way to resolve DAD conflicts, so if // the DAD counter is non-zero, we cannot use this method. // // Only attempt to generate an interface-specific IID if we have a valid // link address. // // TODO(b/141011931): Validate a LinkEndpoint's link address (provided by // LinkEndpoint.LinkAddress) before reaching this point. linkAddr := ndp.ep.nic.LinkAddress() if !header.IsValidUnicastEthernetAddress(linkAddr) { return false } // Generate an address within prefix from the modified EUI-64 of ndp's // NIC's Ethernet MAC address. header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:]) } else { // We have no way to regenerate an address in response to an address // conflict when addresses are not generated with opaque IIDs. return false } generatedAddr = tcpip.AddressWithPrefix{ Address: tcpip.AddrFrom16Slice(addrBytes), PrefixLen: validPrefixLenForAutoGen, } if !ndp.ep.hasPermanentAddressRLocked(generatedAddr.Address) { break } state.stableAddr.localGenerationFailures++ } deprecated := state.preferredUntil != nil && !state.preferredUntil.After(ndp.ep.protocol.stack.Clock().NowMonotonic()) var preferredUntil tcpip.MonotonicTime if !deprecated { if state.preferredUntil != nil { preferredUntil = *state.preferredUntil } else { preferredUntil = tcpip.MonotonicTimeInfinite() } } validUntil := tcpip.MonotonicTimeInfinite() if state.validUntil != nil { validUntil = *state.validUntil } if addressEndpoint := ndp.addAndAcquireSLAACAddr(generatedAddr, false /* temporary */, stack.AddressLifetimes{ Deprecated: deprecated, PreferredUntil: preferredUntil, ValidUntil: validUntil, }); addressEndpoint != nil { state.stableAddr.addressEndpoint = addressEndpoint state.generationAttempts++ return true } return false } // regenerateSLAACAddr regenerates an address for a SLAAC prefix. // // If generating a new address for the prefix fails, the prefix is invalidated. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) regenerateSLAACAddr(prefix tcpip.Subnet) { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: SLAAC prefix state not found to regenerate address for %s", prefix)) } if ndp.generateSLAACAddr(prefix, &state) { ndp.slaacPrefixes[prefix] = state return } // We were unable to generate a permanent address for the SLAAC prefix so // invalidate the prefix as there is no reason to maintain state for a // SLAAC prefix we do not have an address for. ndp.invalidateSLAACPrefix(prefix, state) } // generateTempSLAACAddr generates a new temporary SLAAC address. // // If resetGenAttempts is true, the prefix's generation counter is reset. // // Returns true if a new address was generated. 
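// With the default configuration, a temporary address created while the
// prefix itself remains valid and preferred for a long time ends up with
// roughly (illustrative numbers using the defaults defined above):
//
//	valid lifetime     = min(prefix valid lifetime, MaxTempAddrValidLifetime)      ~ 7 days
//	preferred lifetime = min(prefix preferred lifetime,
//	                         MaxTempAddrPreferredLifetime - desync factor)         ~ 24h - (0..10m)
//
// and no temporary address is created at all if the resulting preferred
// lifetime would not exceed RegenAdvanceDuration (5s by default).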
func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *slaacPrefixState, resetGenAttempts bool) bool { // Are we configured to auto-generate new temporary global addresses for the // prefix? if !ndp.configs.AutoGenTempGlobalAddresses || prefix == header.IPv6LinkLocalPrefix.Subnet() { return false } if resetGenAttempts { prefixState.generationAttempts = 0 prefixState.maxGenerationAttempts = ndp.configs.AutoGenAddressConflictRetries + 1 } // If we have already reached the maximum address generation attempts for the // prefix, do not generate another address. if prefixState.generationAttempts == prefixState.maxGenerationAttempts { return false } stableAddr := prefixState.stableAddr.addressEndpoint.AddressWithPrefix().Address now := ndp.ep.protocol.stack.Clock().NowMonotonic() // As per RFC 4941 section 3.3 step 4, the valid lifetime of a temporary // address is the lower of the valid lifetime of the stable address or the // maximum temporary address valid lifetime. vl := ndp.configs.MaxTempAddrValidLifetime if prefixState.validUntil != nil { if prefixVL := prefixState.validUntil.Sub(now); vl > prefixVL { vl = prefixVL } } if vl <= 0 { // Cannot create an address without a valid lifetime. return false } // As per RFC 4941 section 3.3 step 4, the preferred lifetime of a temporary // address is the lower of the preferred lifetime of the stable address or the // maximum temporary address preferred lifetime - the temporary address desync // factor. pl := ndp.configs.MaxTempAddrPreferredLifetime - ndp.temporaryAddressDesyncFactor if prefixState.preferredUntil != nil { if prefixPL := prefixState.preferredUntil.Sub(now); pl > prefixPL { // Respect the preferred lifetime of the prefix, as per RFC 4941 section // 3.3 step 4. pl = prefixPL } } // As per RFC 4941 section 3.3 step 5, a temporary address is created only if // the calculated preferred lifetime is greater than the advance regeneration // duration. In particular, we MUST NOT create a temporary address with a zero // Preferred Lifetime. if pl <= ndp.configs.RegenAdvanceDuration { return false } // Attempt to generate a new address that is not already assigned to the IPv6 // endpoint. var generatedAddr tcpip.AddressWithPrefix for i := 0; ; i++ { // If we were unable to generate an address after the maximum SLAAC address // local regeneration attempts, do nothing further. if i == maxSLAACAddrLocalRegenAttempts { return false } generatedAddr = header.GenerateTempIPv6SLAACAddr(ndp.temporaryIIDHistory[:], stableAddr) if !ndp.ep.hasPermanentAddressRLocked(generatedAddr.Address) { break } } // As per RFC RFC 4941 section 3.3 step 5, we MUST NOT create a temporary // address with a zero preferred lifetime. The checks above ensure this // so we know the address is not deprecated. 
addressEndpoint := ndp.addAndAcquireSLAACAddr(generatedAddr, true /* temporary */, stack.AddressLifetimes{ Deprecated: false, PreferredUntil: now.Add(pl), ValidUntil: now.Add(vl), }) if addressEndpoint == nil { return false } state := tempSLAACAddrState{ deprecationJob: tcpip.NewJob(ndp.ep.protocol.stack.Clock(), &ndp.ep.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to deprecate temporary address %s", prefix, generatedAddr)) } tempAddrState, ok := prefixState.tempAddrs[generatedAddr.Address] if !ok { panic(fmt.Sprintf("ndp: must have a tempAddr entry to deprecate temporary address %s", generatedAddr)) } ndp.deprecateSLAACAddress(tempAddrState.addressEndpoint) }), invalidationJob: tcpip.NewJob(ndp.ep.protocol.stack.Clock(), &ndp.ep.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to invalidate temporary address %s", prefix, generatedAddr)) } tempAddrState, ok := prefixState.tempAddrs[generatedAddr.Address] if !ok { panic(fmt.Sprintf("ndp: must have a tempAddr entry to invalidate temporary address %s", generatedAddr)) } ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, generatedAddr.Address, tempAddrState) }), regenJob: tcpip.NewJob(ndp.ep.protocol.stack.Clock(), &ndp.ep.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to regenerate temporary address after %s", prefix, generatedAddr)) } tempAddrState, ok := prefixState.tempAddrs[generatedAddr.Address] if !ok { panic(fmt.Sprintf("ndp: must have a tempAddr entry to regenerate temporary address after %s", generatedAddr)) } // If an address has already been regenerated for this address, don't // regenerate another address. if tempAddrState.regenerated { return } // Reset the generation attempts counter as we are starting the generation // of a new address for the SLAAC prefix. tempAddrState.regenerated = ndp.generateTempSLAACAddr(prefix, &prefixState, true /* resetGenAttempts */) prefixState.tempAddrs[generatedAddr.Address] = tempAddrState ndp.slaacPrefixes[prefix] = prefixState }), createdAt: now, addressEndpoint: addressEndpoint, } state.deprecationJob.Schedule(pl) state.invalidationJob.Schedule(vl) state.regenJob.Schedule(pl - ndp.configs.RegenAdvanceDuration) prefixState.generationAttempts++ prefixState.tempAddrs[generatedAddr.Address] = state return true } // regenerateTempSLAACAddr regenerates a temporary address for a SLAAC prefix. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) regenerateTempSLAACAddr(prefix tcpip.Subnet, resetGenAttempts bool) { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: SLAAC prefix state not found to regenerate temporary address for %s", prefix)) } ndp.generateTempSLAACAddr(prefix, &state, resetGenAttempts) ndp.slaacPrefixes[prefix] = state } // refreshSLAACPrefixLifetimes refreshes the lifetimes of a SLAAC prefix. // // pl is the new preferred lifetime. vl is the new valid lifetime. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixState *slaacPrefixState, pl, vl time.Duration) { // If prefix was preferred for some finite lifetime before, cancel the // deprecation job so it can be reset. 
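// Worked example for the valid lifetime update applied further below (RFC
// 4862 section 5.5.3.e with MinPrefixInformationValidLifetimeForUpdate = 2h):
//
//	remaining 90m, advertised 3h  -> valid lifetime becomes 3h (advertised > 2h)
//	remaining 90m, advertised 30m -> ignored; 90m remain (remaining <= 2h)
//	remaining 10d, advertised 30m -> clamped down to 2h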
prefixState.deprecationJob.Cancel() now := ndp.ep.protocol.stack.Clock().NowMonotonic() // Schedule the deprecation job if prefix has a finite preferred lifetime. deprecated := pl == 0 if pl < header.NDPInfiniteLifetime { if !deprecated { prefixState.deprecationJob.Schedule(pl) } t := now.Add(pl) prefixState.preferredUntil = &t } else { prefixState.preferredUntil = nil } // As per RFC 4862 section 5.5.3.e, update the valid lifetime for prefix: // // 1) If the received Valid Lifetime is greater than 2 hours or greater than // RemainingLifetime, set the valid lifetime of the prefix to the // advertised Valid Lifetime. // // 2) If RemainingLifetime is less than or equal to 2 hours, ignore the // advertised Valid Lifetime. // // 3) Otherwise, reset the valid lifetime of the prefix to 2 hours. if vl >= header.NDPInfiniteLifetime { // Handle the infinite valid lifetime separately as we do not schedule a // job in this case. prefixState.invalidationJob.Cancel() prefixState.validUntil = nil } else { var effectiveVl time.Duration var rl time.Duration // If the prefix was originally set to be valid forever, assume the // remaining time to be the maximum possible value. if prefixState.validUntil == nil { rl = header.NDPInfiniteLifetime } else { rl = prefixState.validUntil.Sub(now) } if vl > MinPrefixInformationValidLifetimeForUpdate || vl > rl { effectiveVl = vl } else if rl > MinPrefixInformationValidLifetimeForUpdate { effectiveVl = MinPrefixInformationValidLifetimeForUpdate } if effectiveVl != 0 { prefixState.invalidationJob.Cancel() prefixState.invalidationJob.Schedule(effectiveVl) t := now.Add(effectiveVl) prefixState.validUntil = &t } } // If the preferred lifetime is zero, then the prefix should be deprecated. { var preferredUntil tcpip.MonotonicTime if !deprecated { if prefixState.preferredUntil == nil { preferredUntil = tcpip.MonotonicTimeInfinite() } else { preferredUntil = *prefixState.preferredUntil } } validUntil := tcpip.MonotonicTimeInfinite() if prefixState.validUntil != nil { validUntil = *prefixState.validUntil } if addressEndpoint := prefixState.stableAddr.addressEndpoint; !addressEndpoint.Deprecated() && deprecated { if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnAutoGenAddressDeprecated(ndp.ep.nic.ID(), addressEndpoint.AddressWithPrefix()) } } prefixState.stableAddr.addressEndpoint.SetLifetimes(stack.AddressLifetimes{ Deprecated: deprecated, PreferredUntil: preferredUntil, ValidUntil: validUntil, }) } // If DAD is not yet complete on the stable address, there is no need to do // work with temporary addresses. if prefixState.stableAddr.addressEndpoint.GetKind() != stack.Permanent { return } // Note, we do not need to update the entries in the temporary address map // after updating the jobs because the jobs are held as pointers. var regenForAddr tcpip.Address allAddressesRegenerated := true for tempAddr, tempAddrState := range prefixState.tempAddrs { // As per RFC 4941 section 3.3 step 4, the valid lifetime of a temporary // address is the lower of the valid lifetime of the stable address or the // maximum temporary address valid lifetime. Note, the valid lifetime of a // temporary address is relative to the address's creation time. validUntil := tempAddrState.createdAt.Add(ndp.configs.MaxTempAddrValidLifetime) if prefixState.validUntil != nil && prefixState.validUntil.Before(validUntil) { validUntil = *prefixState.validUntil } // If the address is no longer valid, invalidate it immediately. Otherwise, // reset the invalidation job. 
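// Note that, unlike the stable address, a temporary address's lifetimes are
// anchored to its creation time. For example, with the default configuration
// and a desync factor of 5m, a temporary address created 20h ago can be
// preferred for at most another 24h - 5m - 20h = 3h55m, however long the
// prefix itself remains preferred.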
newValidLifetime := validUntil.Sub(now) if newValidLifetime <= 0 { ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, tempAddr, tempAddrState) continue } tempAddrState.invalidationJob.Cancel() tempAddrState.invalidationJob.Schedule(newValidLifetime) // As per RFC 4941 section 3.3 step 4, the preferred lifetime of a temporary // address is the lower of the preferred lifetime of the stable address or // the maximum temporary address preferred lifetime - the temporary address // desync factor. Note, the preferred lifetime of a temporary address is // relative to the address's creation time. preferredUntil := tempAddrState.createdAt.Add(ndp.configs.MaxTempAddrPreferredLifetime - ndp.temporaryAddressDesyncFactor) if prefixState.preferredUntil != nil && prefixState.preferredUntil.Before(preferredUntil) { preferredUntil = *prefixState.preferredUntil } // If the address is no longer preferred, deprecate it immediately. // Otherwise, schedule the deprecation job again. newPreferredLifetime := preferredUntil.Sub(now) tempAddrState.deprecationJob.Cancel() deprecated := newPreferredLifetime <= 0 if !deprecated { tempAddrState.deprecationJob.Schedule(newPreferredLifetime) } if addressEndpoint := tempAddrState.addressEndpoint; !addressEndpoint.Deprecated() && deprecated { if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnAutoGenAddressDeprecated(ndp.ep.nic.ID(), addressEndpoint.AddressWithPrefix()) } } tempAddrState.addressEndpoint.SetLifetimes(stack.AddressLifetimes{ Deprecated: deprecated, ValidUntil: validUntil, PreferredUntil: preferredUntil, }) tempAddrState.regenJob.Cancel() if tempAddrState.regenerated { } else { allAddressesRegenerated = false if newPreferredLifetime <= ndp.configs.RegenAdvanceDuration { // The new preferred lifetime is less than the advance regeneration // duration so regenerate an address for this temporary address // immediately after we finish iterating over the temporary addresses. regenForAddr = tempAddr } else { tempAddrState.regenJob.Schedule(newPreferredLifetime - ndp.configs.RegenAdvanceDuration) } } } // Generate a new temporary address if all of the existing temporary addresses // have been regenerated, or we need to immediately regenerate an address // due to an update in preferred lifetime. // // If each temporary address has already been regenerated, no new temporary // address is generated. To ensure continuation of temporary SLAAC addresses, // we manually try to regenerate an address here. if regenForAddr.BitLen() != 0 || allAddressesRegenerated { // Reset the generation attempts counter as we are starting the generation // of a new address for the SLAAC prefix. if state, ok := prefixState.tempAddrs[regenForAddr]; ndp.generateTempSLAACAddr(prefix, prefixState, true /* resetGenAttempts */) && ok { state.regenerated = true prefixState.tempAddrs[regenForAddr] = state } } } // deprecateSLAACAddress marks the address as deprecated and notifies the NDP // dispatcher that address has been deprecated. // // deprecateSLAACAddress does nothing if the address is already deprecated. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) deprecateSLAACAddress(addressEndpoint stack.AddressEndpoint) { if addressEndpoint.Deprecated() { return } addressEndpoint.SetDeprecated(true) if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnAutoGenAddressDeprecated(ndp.ep.nic.ID(), addressEndpoint.AddressWithPrefix()) } } // invalidateSLAACPrefix invalidates a SLAAC prefix. 
// // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) invalidateSLAACPrefix(prefix tcpip.Subnet, state slaacPrefixState) { ndp.cleanupSLAACPrefixResources(prefix, state) if addressEndpoint := state.stableAddr.addressEndpoint; addressEndpoint != nil { if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), addressEndpoint.AddressWithPrefix()) } if err := ndp.ep.removePermanentEndpointInnerLocked(addressEndpoint, stack.AddressRemovalInvalidated, &stack.DADAborted{}); err != nil { panic(fmt.Sprintf("ndp: error removing stable SLAAC address %s: %s", addressEndpoint.AddressWithPrefix(), err)) } } } // cleanupSLAACAddrResourcesAndNotify cleans up an invalidated SLAAC address's // resources. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidatePrefix bool) { if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), addr) } prefix := addr.Subnet() state, ok := ndp.slaacPrefixes[prefix] if !ok || state.stableAddr.addressEndpoint == nil || addr.Address != state.stableAddr.addressEndpoint.AddressWithPrefix().Address { return } if !invalidatePrefix { // If the prefix is not being invalidated, disassociate the address from the // prefix and do nothing further. state.stableAddr.addressEndpoint.DecRef() state.stableAddr.addressEndpoint = nil ndp.slaacPrefixes[prefix] = state return } ndp.cleanupSLAACPrefixResources(prefix, state) } // cleanupSLAACPrefixResources cleans up a SLAAC prefix's jobs and entry. // // Panics if the SLAAC prefix is not known. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupSLAACPrefixResources(prefix tcpip.Subnet, state slaacPrefixState) { // Invalidate all temporary addresses. for tempAddr, tempAddrState := range state.tempAddrs { ndp.invalidateTempSLAACAddr(state.tempAddrs, tempAddr, tempAddrState) } if state.stableAddr.addressEndpoint != nil { state.stableAddr.addressEndpoint.DecRef() state.stableAddr.addressEndpoint = nil } state.deprecationJob.Cancel() state.invalidationJob.Cancel() delete(ndp.slaacPrefixes, prefix) } // invalidateTempSLAACAddr invalidates a temporary SLAAC address. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) invalidateTempSLAACAddr(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) { ndp.cleanupTempSLAACAddrResourcesAndNotifyInner(tempAddrs, tempAddr, tempAddrState) if err := ndp.ep.removePermanentEndpointInnerLocked(tempAddrState.addressEndpoint, stack.AddressRemovalInvalidated, &stack.DADAborted{}); err != nil { panic(fmt.Sprintf("error removing temporary SLAAC address %s: %s", tempAddrState.addressEndpoint.AddressWithPrefix(), err)) } } // cleanupTempSLAACAddrResourcesAndNotify cleans up an invalidated temporary // SLAAC address's resources from ndp and notifies the NDP dispatcher that the // address was invalidated. // // The IPv6 endpoint that ndp belongs to MUST be locked. 
func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix) { prefix := addr.Subnet() state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry to clean up temp addr %s resources", addr)) } tempAddrState, ok := state.tempAddrs[addr.Address] if !ok { panic(fmt.Sprintf("ndp: must have a tempAddr entry to clean up temp addr %s resources", addr)) } ndp.cleanupTempSLAACAddrResourcesAndNotifyInner(state.tempAddrs, addr.Address, tempAddrState) } // cleanupTempSLAACAddrResourcesAndNotifyInner is like // cleanupTempSLAACAddrResourcesAndNotify except it does not lookup the // temporary address's state in ndp - it assumes the passed state is valid. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotifyInner(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) { if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), tempAddrState.addressEndpoint.AddressWithPrefix()) } tempAddrState.addressEndpoint.DecRef() tempAddrState.addressEndpoint = nil tempAddrState.deprecationJob.Cancel() tempAddrState.invalidationJob.Cancel() tempAddrState.regenJob.Cancel() delete(tempAddrs, tempAddr) } // cleanupState cleans up ndp's state. // // This function invalidates all discovered on-link prefixes, discovered // routers, and auto-generated addresses. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupState() { for prefix, state := range ndp.slaacPrefixes { ndp.invalidateSLAACPrefix(prefix, state) } for prefix := range ndp.onLinkPrefixes { ndp.invalidateOnLinkPrefix(prefix) } if got := len(ndp.onLinkPrefixes); got != 0 { panic(fmt.Sprintf("ndp: still have discovered on-link prefixes after cleaning up; found = %d", got)) } for route := range ndp.offLinkRoutes { ndp.invalidateOffLinkRoute(route) } if got := len(ndp.offLinkRoutes); got != 0 { panic(fmt.Sprintf("ndp: still have discovered off-link routes after cleaning up; found = %d", got)) } ndp.dhcpv6Configuration = 0 } // startSolicitingRouters starts soliciting routers, as per RFC 4861 section // 6.3.7. If routers are already being solicited, this function does nothing. // // If ndp is not configured to handle Router Advertisements, routers will not // be solicited as there is no point soliciting routers if we don't handle their // advertisements. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) startSolicitingRouters() { if ndp.rtrSolicitTimer.timer != nil { // We are already soliciting routers. return } remaining := ndp.configs.MaxRtrSolicitations if remaining == 0 { return } if !ndp.configs.HandleRAs.enabled(ndp.ep.Forwarding()) { return } // Calculate the random delay before sending our first RS, as per RFC // 4861 section 6.3.7. var delay time.Duration if ndp.configs.MaxRtrSolicitationDelay > 0 { delay = time.Duration(ndp.ep.protocol.stack.InsecureRNG().Int63n(int64(ndp.configs.MaxRtrSolicitationDelay))) } // Protected by ndp.ep.mu. done := false ndp.rtrSolicitTimer = timer{ done: &done, timer: ndp.ep.protocol.stack.Clock().AfterFunc(delay, func() { // As per RFC 4861 section 4.1: // // IP Fields: // Source Address // An IP address assigned to the sending interface, or // the unspecified address if no address is assigned // to the sending interface. 
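// In practice this means the Router Solicitation is typically sent from the
// NIC's link-local address once one is assigned and usable, and from the
// unspecified address otherwise; in the latter case the source link-layer
// address option is also omitted, as required below. For example:
//
//	fe80::1 assigned and usable -> src = fe80::1, SLL option included
//	no usable address yet       -> src = ::,      SLL option omitted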
localAddr := header.IPv6Any if addressEndpoint := ndp.ep.AcquireOutgoingPrimaryAddress(header.IPv6AllRoutersLinkLocalMulticastAddress, tcpip.Address{} /* srcHint */, false); addressEndpoint != nil { localAddr = addressEndpoint.AddressWithPrefix().Address addressEndpoint.DecRef() } // As per RFC 4861 section 4.1, an NDP RS SHOULD include the source // link-layer address option if the source address of the NDP RS is // specified. This option MUST NOT be included if the source address is // unspecified. // // TODO(b/141011931): Validate a LinkEndpoint's link address (provided by // LinkEndpoint.LinkAddress) before reaching this point. var optsSerializer header.NDPOptionsSerializer linkAddress := ndp.ep.nic.LinkAddress() if localAddr != header.IPv6Any && header.IsValidUnicastEthernetAddress(linkAddress) { optsSerializer = header.NDPOptionsSerializer{ header.NDPSourceLinkLayerAddressOption(linkAddress), } } payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize + optsSerializer.Length() icmpView := buffer.NewView(payloadSize) icmpView.Grow(payloadSize) icmpData := header.ICMPv6(icmpView.AsSlice()) icmpData.SetType(header.ICMPv6RouterSolicit) rs := header.NDPRouterSolicit(icmpData.MessageBody()) rs.Options().Serialize(optsSerializer) icmpData.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmpData, Src: localAddr, Dst: header.IPv6AllRoutersLinkLocalMulticastAddress, })) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(ndp.ep.MaxHeaderLength()), Payload: buffer.MakeWithView(icmpView), }) defer pkt.DecRef() sent := ndp.ep.stats.icmp.packetsSent if err := addIPHeader(localAddr, header.IPv6AllRoutersLinkLocalMulticastAddress, pkt, stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, }, nil /* extensionHeaders */); err != nil { panic(fmt.Sprintf("failed to add IP header: %s", err)) } if err := ndp.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersLinkLocalMulticastAddress), pkt); err != nil { sent.dropped.Increment() // Don't send any more messages if we had an error. remaining = 0 } else { sent.routerSolicit.Increment() remaining-- } ndp.ep.mu.Lock() defer ndp.ep.mu.Unlock() if done { // Router solicitation was stopped. return } if remaining == 0 { // We are done soliciting routers. ndp.stopSolicitingRouters() return } ndp.rtrSolicitTimer.timer.Reset(ndp.configs.RtrSolicitationInterval) }), } } // forwardingChanged handles a change in forwarding configuration. // // If transitioning to a host, router solicitation will be started. Otherwise, // router solicitation will be stopped if NDP is not configured to handle RAs // as a router. // // Precondition: ndp.ep.mu must be locked. func (ndp *ndpState) forwardingChanged(forwarding bool) { if forwarding { if ndp.configs.HandleRAs.enabled(forwarding) { return } ndp.stopSolicitingRouters() return } // Solicit routers when transitioning to a host. // // If the endpoint is not currently enabled, routers will be solicited when // the endpoint becomes enabled (if it is still a host). if ndp.ep.Enabled() { ndp.startSolicitingRouters() } } // stopSolicitingRouters stops soliciting routers. If routers are not currently // being solicited, this function does nothing. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) stopSolicitingRouters() { if ndp.rtrSolicitTimer.timer == nil { // Nothing to do. 
return } ndp.rtrSolicitTimer.timer.Stop() *ndp.rtrSolicitTimer.done = true ndp.rtrSolicitTimer = timer{} } func (ndp *ndpState) init(ep *endpoint, dadOptions ip.DADOptions) { if ndp.offLinkRoutes != nil { panic("attempted to initialize NDP state twice") } ndp.ep = ep ndp.configs = ep.protocol.options.NDPConfigs ndp.dad.Init(&ndp.ep.mu, ep.protocol.options.DADConfigs, dadOptions) ndp.offLinkRoutes = make(map[offLinkRoute]offLinkRouteState) ndp.onLinkPrefixes = make(map[tcpip.Subnet]onLinkPrefixState) ndp.slaacPrefixes = make(map[tcpip.Subnet]slaacPrefixState) header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.ep.protocol.options.TempIIDSeed, ndp.ep.nic.ID()) ndp.temporaryAddressDesyncFactor = time.Duration(ep.protocol.stack.InsecureRNG().Int63n(int64(MaxDesyncFactor))) } func (ndp *ndpState) SendDADMessage(addr tcpip.Address, nonce []byte) tcpip.Error { snmc := header.SolicitedNodeAddr(addr) return ndp.ep.sendNDPNS(header.IPv6Any, snmc, addr, header.EthernetAddressFromMulticastIPv6Address(snmc), header.NDPOptionsSerializer{ header.NDPNonceOption(nonce), }) } func (e *endpoint) sendNDPNS(srcAddr, dstAddr, targetAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, opts header.NDPOptionsSerializer) tcpip.Error { icmpView := buffer.NewView(header.ICMPv6NeighborSolicitMinimumSize + opts.Length()) icmpView.Grow(header.ICMPv6NeighborSolicitMinimumSize + opts.Length()) icmp := header.ICMPv6(icmpView.AsSlice()) icmp.SetType(header.ICMPv6NeighborSolicit) ns := header.NDPNeighborSolicit(icmp.MessageBody()) ns.SetTargetAddress(targetAddr) ns.Options().Serialize(opts) icmp.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmp, Src: srcAddr, Dst: dstAddr, })) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(e.MaxHeaderLength()), Payload: buffer.MakeWithView(icmpView), }) defer pkt.DecRef() if err := addIPHeader(srcAddr, dstAddr, pkt, stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, }, nil /* extensionHeaders */); err != nil { panic(fmt.Sprintf("failed to add IP header: %s", err)) } sent := e.stats.icmp.packetsSent err := e.nic.WritePacketToRemote(remoteLinkAddr, pkt) if err != nil { sent.dropped.Increment() } else { sent.neighborSolicit.Increment() } return err } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/network/ipv6/stats.go000066400000000000000000000121471465435605700246120ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ipv6 import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/network/internal/ip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) var _ stack.IPNetworkEndpointStats = (*Stats)(nil) // Stats holds statistics related to the IPv6 protocol family. // // +stateify savable type Stats struct { // IP holds IPv6 statistics. IP tcpip.IPStats // ICMP holds ICMPv6 statistics. ICMP tcpip.ICMPv6Stats // UnhandledRouterAdvertisements is the number of Router Advertisements that // were observed but not handled. 
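	// This typically counts RAs that arrived while the endpoint's NDP
	// configuration said not to process them (for example, when IP
	// forwarding is enabled and RAs are only handled as a host).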
UnhandledRouterAdvertisements *tcpip.StatCounter } // IsNetworkEndpointStats implements stack.NetworkEndpointStats. func (*Stats) IsNetworkEndpointStats() {} // IPStats implements stack.IPNetworkEndointStats func (s *Stats) IPStats() *tcpip.IPStats { return &s.IP } // +stateify savable type sharedStats struct { localStats Stats ip ip.MultiCounterIPStats icmp multiCounterICMPv6Stats } // LINT.IfChange(multiCounterICMPv6PacketStats) // +stateify savable type multiCounterICMPv6PacketStats struct { echoRequest tcpip.MultiCounterStat echoReply tcpip.MultiCounterStat dstUnreachable tcpip.MultiCounterStat packetTooBig tcpip.MultiCounterStat timeExceeded tcpip.MultiCounterStat paramProblem tcpip.MultiCounterStat routerSolicit tcpip.MultiCounterStat routerAdvert tcpip.MultiCounterStat neighborSolicit tcpip.MultiCounterStat neighborAdvert tcpip.MultiCounterStat redirectMsg tcpip.MultiCounterStat multicastListenerQuery tcpip.MultiCounterStat multicastListenerReport tcpip.MultiCounterStat multicastListenerReportV2 tcpip.MultiCounterStat multicastListenerDone tcpip.MultiCounterStat } func (m *multiCounterICMPv6PacketStats) init(a, b *tcpip.ICMPv6PacketStats) { m.echoRequest.Init(a.EchoRequest, b.EchoRequest) m.echoReply.Init(a.EchoReply, b.EchoReply) m.dstUnreachable.Init(a.DstUnreachable, b.DstUnreachable) m.packetTooBig.Init(a.PacketTooBig, b.PacketTooBig) m.timeExceeded.Init(a.TimeExceeded, b.TimeExceeded) m.paramProblem.Init(a.ParamProblem, b.ParamProblem) m.routerSolicit.Init(a.RouterSolicit, b.RouterSolicit) m.routerAdvert.Init(a.RouterAdvert, b.RouterAdvert) m.neighborSolicit.Init(a.NeighborSolicit, b.NeighborSolicit) m.neighborAdvert.Init(a.NeighborAdvert, b.NeighborAdvert) m.redirectMsg.Init(a.RedirectMsg, b.RedirectMsg) m.multicastListenerQuery.Init(a.MulticastListenerQuery, b.MulticastListenerQuery) m.multicastListenerReport.Init(a.MulticastListenerReport, b.MulticastListenerReport) m.multicastListenerReportV2.Init(a.MulticastListenerReportV2, b.MulticastListenerReportV2) m.multicastListenerDone.Init(a.MulticastListenerDone, b.MulticastListenerDone) } // LINT.ThenChange(../../tcpip.go:ICMPv6PacketStats) // LINT.IfChange(multiCounterICMPv6SentPacketStats) // +stateify savable type multiCounterICMPv6SentPacketStats struct { multiCounterICMPv6PacketStats dropped tcpip.MultiCounterStat rateLimited tcpip.MultiCounterStat } func (m *multiCounterICMPv6SentPacketStats) init(a, b *tcpip.ICMPv6SentPacketStats) { m.multiCounterICMPv6PacketStats.init(&a.ICMPv6PacketStats, &b.ICMPv6PacketStats) m.dropped.Init(a.Dropped, b.Dropped) m.rateLimited.Init(a.RateLimited, b.RateLimited) } // LINT.ThenChange(../../tcpip.go:ICMPv6SentPacketStats) // LINT.IfChange(multiCounterICMPv6ReceivedPacketStats) // +stateify savable type multiCounterICMPv6ReceivedPacketStats struct { multiCounterICMPv6PacketStats unrecognized tcpip.MultiCounterStat invalid tcpip.MultiCounterStat routerOnlyPacketsDroppedByHost tcpip.MultiCounterStat } func (m *multiCounterICMPv6ReceivedPacketStats) init(a, b *tcpip.ICMPv6ReceivedPacketStats) { m.multiCounterICMPv6PacketStats.init(&a.ICMPv6PacketStats, &b.ICMPv6PacketStats) m.unrecognized.Init(a.Unrecognized, b.Unrecognized) m.invalid.Init(a.Invalid, b.Invalid) m.routerOnlyPacketsDroppedByHost.Init(a.RouterOnlyPacketsDroppedByHost, b.RouterOnlyPacketsDroppedByHost) } // LINT.ThenChange(../../tcpip.go:ICMPv6ReceivedPacketStats) // LINT.IfChange(multiCounterICMPv6Stats) // +stateify savable type multiCounterICMPv6Stats struct { packetsSent multiCounterICMPv6SentPacketStats packetsReceived 
multiCounterICMPv6ReceivedPacketStats } func (m *multiCounterICMPv6Stats) init(a, b *tcpip.ICMPv6Stats) { m.packetsSent.init(&a.PacketsSent, &b.PacketsSent) m.packetsReceived.init(&a.PacketsReceived, &b.PacketsReceived) } // LINT.ThenChange(../../tcpip.go:ICMPv6Stats) golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/ports/000077500000000000000000000000001465435605700217125ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/ports/flags.go000066400000000000000000000072011465435605700233350ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ports // Flags represents the type of port reservation. // // +stateify savable type Flags struct { // MostRecent represents UDP SO_REUSEADDR. MostRecent bool // LoadBalanced indicates SO_REUSEPORT. // // LoadBalanced takes precedence over MostRecent. LoadBalanced bool // TupleOnly represents TCP SO_REUSEADDR. TupleOnly bool } // Bits converts the Flags to their bitset form. func (f Flags) Bits() BitFlags { var rf BitFlags if f.MostRecent { rf |= MostRecentFlag } if f.LoadBalanced { rf |= LoadBalancedFlag } if f.TupleOnly { rf |= TupleOnlyFlag } return rf } // Effective returns the effective behavior of a flag config. func (f Flags) Effective() Flags { e := f if e.LoadBalanced && e.MostRecent { e.MostRecent = false } return e } // BitFlags is a bitset representation of Flags. type BitFlags uint32 const ( // MostRecentFlag represents Flags.MostRecent. MostRecentFlag BitFlags = 1 << iota // LoadBalancedFlag represents Flags.LoadBalanced. LoadBalancedFlag // TupleOnlyFlag represents Flags.TupleOnly. TupleOnlyFlag // nextFlag is the value that the next added flag will have. // // It is used to calculate FlagMask below. It is also the number of // valid flag states. nextFlag // FlagMask is a bit mask for BitFlags. FlagMask = nextFlag - 1 // MultiBindFlagMask contains the flags that allow binding the same // tuple multiple times. MultiBindFlagMask = MostRecentFlag | LoadBalancedFlag ) // ToFlags converts the bitset into a Flags struct. func (f BitFlags) ToFlags() Flags { return Flags{ MostRecent: f&MostRecentFlag != 0, LoadBalanced: f&LoadBalancedFlag != 0, TupleOnly: f&TupleOnlyFlag != 0, } } // FlagCounter counts how many references each flag combination has. // // +stateify savable type FlagCounter struct { // refs stores the count for each possible flag combination, (0 though // FlagMask). refs [nextFlag]int } // AddRef increases the reference count for a specific flag combination. func (c *FlagCounter) AddRef(flags BitFlags) { c.refs[flags]++ } // DropRef decreases the reference count for a specific flag combination. func (c *FlagCounter) DropRef(flags BitFlags) { c.refs[flags]-- } // TotalRefs calculates the total number of references for all flag // combinations. func (c FlagCounter) TotalRefs() int { var total int for _, r := range c.refs { total += r } return total } // FlagRefs returns the number of references with all specified flags. 
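//
// For example, after AddRef(MostRecentFlag|LoadBalancedFlag) and
// AddRef(LoadBalancedFlag), FlagRefs(LoadBalancedFlag) is 2 (both references
// carry LoadBalanced) while FlagRefs(MostRecentFlag) is 1.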
func (c FlagCounter) FlagRefs(flags BitFlags) int { var total int for i, r := range c.refs { if BitFlags(i)&flags == flags { total += r } } return total } // AllRefsHave returns if all references have all specified flags. func (c FlagCounter) AllRefsHave(flags BitFlags) bool { for i, r := range c.refs { if BitFlags(i)&flags != flags && r > 0 { return false } } return true } // SharedFlags returns the set of flags shared by all references. func (c FlagCounter) SharedFlags() BitFlags { intersection := FlagMask for i, r := range c.refs { if r > 0 { intersection &= BitFlags(i) } } return intersection } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/ports/ports.go000066400000000000000000000331621465435605700234150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package ports provides PortManager that manages allocating, reserving and // releasing ports. package ports import ( "math" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) const ( firstEphemeral = 16000 ) var ( anyIPAddress = tcpip.Address{} ) // Reservation describes a port reservation. type Reservation struct { // Networks is a list of network protocols to which the reservation // applies. Can be IPv4, IPv6, or both. Networks []tcpip.NetworkProtocolNumber // Transport is the transport protocol to which the reservation applies. Transport tcpip.TransportProtocolNumber // Addr is the address of the local endpoint. Addr tcpip.Address // Port is the local port number. Port uint16 // Flags describe features of the reservation. Flags Flags // BindToDevice is the NIC to which the reservation applies. BindToDevice tcpip.NICID // Dest is the destination address. Dest tcpip.FullAddress } func (rs Reservation) dst() destination { return destination{ rs.Dest.Addr, rs.Dest.Port, } } // +stateify savable type portDescriptor struct { network tcpip.NetworkProtocolNumber transport tcpip.TransportProtocolNumber port uint16 } // +stateify savable type destination struct { addr tcpip.Address port uint16 } // destToCounter maps each destination to the FlagCounter that represents // endpoints to that destination. // // destToCounter is never empty. When it has no elements, it is removed from // the map that references it. type destToCounter map[destination]FlagCounter // intersectionFlags calculates the intersection of flag bit values which affect // the specified destination. // // If no destinations are present, all flag values are returned as there are no // entries to limit possible flag values of a new entry. // // In addition to the intersection, the number of intersecting refs is // returned. func (dc destToCounter) intersectionFlags(res Reservation) (BitFlags, int) { intersection := FlagMask var count int for dest, counter := range dc { if dest == res.dst() { intersection &= counter.SharedFlags() count++ continue } // Wildcard destinations affect all destinations for TupleOnly. 
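		// In that case only the TupleOnly bit of the existing counter can
		// constrain the intersection; the MostRecent and LoadBalanced bits are
		// left set, so a wildcard overlap alone does not forbid reuse via
		// those flags.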
if dest.addr == anyIPAddress || res.Dest.Addr == anyIPAddress { // Only bitwise and the TupleOnlyFlag. intersection &= (^TupleOnlyFlag) | counter.SharedFlags() count++ } } return intersection, count } // deviceToDest maps NICs to destinations for which there are port reservations. // // deviceToDest is never empty. When it has no elements, it is removed from the // map that references it. type deviceToDest map[tcpip.NICID]destToCounter // isAvailable checks whether binding is possible by device. If not binding to // a device, check against all FlagCounters. If binding to a specific device, // check against the unspecified device and the provided device. // // If either of the port reuse flags is enabled on any of the nodes, all nodes // sharing a port must share at least one reuse flag. This matches Linux's // behavior. func (dd deviceToDest) isAvailable(res Reservation, portSpecified bool) bool { flagBits := res.Flags.Bits() if res.BindToDevice == 0 { intersection := FlagMask for _, dest := range dd { flags, count := dest.intersectionFlags(res) if count == 0 { continue } intersection &= flags if intersection&flagBits == 0 { // Can't bind because the (addr,port) was // previously bound without reuse. return false } } if !portSpecified && res.Transport == header.TCPProtocolNumber { return false } return true } intersection := FlagMask if dests, ok := dd[0]; ok { var count int intersection, count = dests.intersectionFlags(res) if count > 0 { if intersection&flagBits == 0 { return false } if !portSpecified && res.Transport == header.TCPProtocolNumber { return false } } } if dests, ok := dd[res.BindToDevice]; ok { flags, count := dests.intersectionFlags(res) intersection &= flags if count > 0 { if intersection&flagBits == 0 { return false } if !portSpecified && res.Transport == header.TCPProtocolNumber { return false } } } return true } // addrToDevice maps IP addresses to NICs that have port reservations. type addrToDevice map[tcpip.Address]deviceToDest // isAvailable checks whether an IP address is available to bind to. If the // address is the "any" address, check all other addresses. Otherwise, just // check against the "any" address and the provided address. func (ad addrToDevice) isAvailable(res Reservation, portSpecified bool) bool { if res.Addr == anyIPAddress { // If binding to the "any" address then check that there are no // conflicts with all addresses. for _, devices := range ad { if !devices.isAvailable(res, portSpecified) { return false } } return true } // Check that there is no conflict with the "any" address. if devices, ok := ad[anyIPAddress]; ok { if !devices.isAvailable(res, portSpecified) { return false } } // Check that this is no conflict with the provided address. if devices, ok := ad[res.Addr]; ok { if !devices.isAvailable(res, portSpecified) { return false } } return true } // PortManager manages allocating, reserving and releasing ports. // // +stateify savable type PortManager struct { // mu protects allocatedPorts. // LOCK ORDERING: mu > ephemeralMu. mu sync.RWMutex `state:"nosave"` // allocatedPorts is a nesting of maps that ultimately map Reservations // to FlagCounters describing whether the Reservation is valid and can // be reused. allocatedPorts map[portDescriptor]addrToDevice // ephemeralMu protects firstEphemeral and numEphemeral. ephemeralMu sync.RWMutex `state:"nosave"` firstEphemeral uint16 numEphemeral uint16 } // NewPortManager creates new PortManager. 
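//
// A minimal usage sketch (rng stands for whatever rand.RNG the caller already
// has; the tester here simply accepts the first candidate port):
//
//	pm := NewPortManager()
//	port, err := pm.PickEphemeralPort(rng, func(p uint16) (bool, tcpip.Error) {
//		return true, nil
//	})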
func NewPortManager() *PortManager { return &PortManager{ allocatedPorts: make(map[portDescriptor]addrToDevice), firstEphemeral: firstEphemeral, numEphemeral: math.MaxUint16 - firstEphemeral + 1, } } // PortTester indicates whether the passed in port is suitable. Returning an // error causes the function to which the PortTester is passed to return that // error. type PortTester func(port uint16) (good bool, err tcpip.Error) // PickEphemeralPort randomly chooses a starting point and iterates over all // possible ephemeral ports, allowing the caller to decide whether a given port // is suitable for its needs, and stopping when a port is found or an error // occurs. func (pm *PortManager) PickEphemeralPort(rng rand.RNG, testPort PortTester) (port uint16, err tcpip.Error) { pm.ephemeralMu.RLock() firstEphemeral := pm.firstEphemeral numEphemeral := pm.numEphemeral pm.ephemeralMu.RUnlock() return pickEphemeralPort(rng.Uint32(), firstEphemeral, numEphemeral, testPort) } // pickEphemeralPort starts at the offset specified from the FirstEphemeral port // and iterates over the number of ports specified by count and allows the // caller to decide whether a given port is suitable for its needs, and stopping // when a port is found or an error occurs. func pickEphemeralPort(offset uint32, first, count uint16, testPort PortTester) (port uint16, err tcpip.Error) { // This implements Algorithm 1 as per RFC 6056 Section 3.3.1. for i := uint32(0); i < uint32(count); i++ { port := uint16(uint32(first) + (offset+i)%uint32(count)) ok, err := testPort(port) if err != nil { return 0, err } if ok { return port, nil } } return 0, &tcpip.ErrNoPortAvailable{} } // ReservePort marks a port/IP combination as reserved so that it cannot be // reserved by another endpoint. If port is zero, ReservePort will search for // an unreserved ephemeral port and reserve it, returning its value in the // "port" return value. // // An optional PortTester can be passed in which if provided will be used to // test if the picked port can be used. The function should return true if the // port is safe to use, false otherwise. func (pm *PortManager) ReservePort(rng rand.RNG, res Reservation, testPort PortTester) (reservedPort uint16, err tcpip.Error) { pm.mu.Lock() defer pm.mu.Unlock() // If a port is specified, just try to reserve it for all network // protocols. if res.Port != 0 { if !pm.reserveSpecificPortLocked(res, true /* portSpecified */) { return 0, &tcpip.ErrPortInUse{} } if testPort != nil { ok, err := testPort(res.Port) if err != nil { pm.releasePortLocked(res) return 0, err } if !ok { pm.releasePortLocked(res) return 0, &tcpip.ErrPortInUse{} } } return res.Port, nil } // A port wasn't specified, so try to find one. return pm.PickEphemeralPort(rng, func(p uint16) (bool, tcpip.Error) { res.Port = p if !pm.reserveSpecificPortLocked(res, false /* portSpecified */) { return false, nil } if testPort != nil { ok, err := testPort(p) if err != nil { pm.releasePortLocked(res) return false, err } if !ok { pm.releasePortLocked(res) return false, nil } } return true, nil }) } // reserveSpecificPortLocked tries to reserve the given port on all given // protocols. func (pm *PortManager) reserveSpecificPortLocked(res Reservation, portSpecified bool) bool { // Make sure the port is available. for _, network := range res.Networks { desc := portDescriptor{network, res.Transport, res.Port} if addrs, ok := pm.allocatedPorts[desc]; ok { if !addrs.isAvailable(res, portSpecified) { return false } } } // Reserve port on all network protocols. 
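	// The reservation is recorded in the nested maps as
	// portDescriptor -> address -> bound device -> destination -> FlagCounter,
	// creating the intermediate maps as needed.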
flagBits := res.Flags.Bits() dst := res.dst() for _, network := range res.Networks { desc := portDescriptor{network, res.Transport, res.Port} addrToDev, ok := pm.allocatedPorts[desc] if !ok { addrToDev = make(addrToDevice) pm.allocatedPorts[desc] = addrToDev } devToDest, ok := addrToDev[res.Addr] if !ok { devToDest = make(deviceToDest) addrToDev[res.Addr] = devToDest } destToCntr := devToDest[res.BindToDevice] if destToCntr == nil { destToCntr = make(destToCounter) } counter := destToCntr[dst] counter.AddRef(flagBits) destToCntr[dst] = counter devToDest[res.BindToDevice] = destToCntr } return true } // ReserveTuple adds a port reservation for the tuple on all given protocol. func (pm *PortManager) ReserveTuple(res Reservation) bool { flagBits := res.Flags.Bits() dst := res.dst() pm.mu.Lock() defer pm.mu.Unlock() // It is easier to undo the entire reservation, so if we find that the // tuple can't be fully added, finish and undo the whole thing. undo := false // Reserve port on all network protocols. for _, network := range res.Networks { desc := portDescriptor{network, res.Transport, res.Port} addrToDev, ok := pm.allocatedPorts[desc] if !ok { addrToDev = make(addrToDevice) pm.allocatedPorts[desc] = addrToDev } devToDest, ok := addrToDev[res.Addr] if !ok { devToDest = make(deviceToDest) addrToDev[res.Addr] = devToDest } destToCntr := devToDest[res.BindToDevice] if destToCntr == nil { destToCntr = make(destToCounter) } counter := destToCntr[dst] if counter.TotalRefs() != 0 && counter.SharedFlags()&flagBits == 0 { // Tuple already exists. undo = true } counter.AddRef(flagBits) destToCntr[dst] = counter devToDest[res.BindToDevice] = destToCntr } if undo { // releasePortLocked decrements the counts (rather than setting // them to zero), so it will undo the incorrect incrementing // above. pm.releasePortLocked(res) return false } return true } // ReleasePort releases the reservation on a port/IP combination so that it can // be reserved by other endpoints. func (pm *PortManager) ReleasePort(res Reservation) { pm.mu.Lock() defer pm.mu.Unlock() pm.releasePortLocked(res) } func (pm *PortManager) releasePortLocked(res Reservation) { dst := res.dst() for _, network := range res.Networks { desc := portDescriptor{network, res.Transport, res.Port} addrToDev, ok := pm.allocatedPorts[desc] if !ok { continue } devToDest, ok := addrToDev[res.Addr] if !ok { continue } destToCounter, ok := devToDest[res.BindToDevice] if !ok { continue } counter, ok := destToCounter[dst] if !ok { continue } counter.DropRef(res.Flags.Bits()) if counter.TotalRefs() > 0 { destToCounter[dst] = counter continue } delete(destToCounter, dst) if len(destToCounter) > 0 { continue } delete(devToDest, res.BindToDevice) if len(devToDest) > 0 { continue } delete(addrToDev, res.Addr) if len(addrToDev) > 0 { continue } delete(pm.allocatedPorts, desc) } } // PortRange returns the UDP and TCP inclusive range of ephemeral ports used in // both IPv4 and IPv6. func (pm *PortManager) PortRange() (uint16, uint16) { pm.ephemeralMu.RLock() defer pm.ephemeralMu.RUnlock() return pm.firstEphemeral, pm.firstEphemeral + pm.numEphemeral - 1 } // SetPortRange sets the UDP and TCP IPv4 and IPv6 ephemeral port range // (inclusive). 
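//
// For example, SetPortRange(32768, 60999) mirrors the common Linux default
// range, after which PortRange() reports (32768, 60999). ErrInvalidPortRange
// is returned if start > end.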
func (pm *PortManager) SetPortRange(start uint16, end uint16) tcpip.Error { if start > end { return &tcpip.ErrInvalidPortRange{} } pm.ephemeralMu.Lock() defer pm.ephemeralMu.Unlock() pm.firstEphemeral = start pm.numEphemeral = end - start + 1 return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/ports/ports_state_autogen.go000066400000000000000000000072421465435605700263370ustar00rootroot00000000000000// automatically generated by stateify. package ports import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (f *Flags) StateTypeName() string { return "pkg/tcpip/ports.Flags" } func (f *Flags) StateFields() []string { return []string{ "MostRecent", "LoadBalanced", "TupleOnly", } } func (f *Flags) beforeSave() {} // +checklocksignore func (f *Flags) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.MostRecent) stateSinkObject.Save(1, &f.LoadBalanced) stateSinkObject.Save(2, &f.TupleOnly) } func (f *Flags) afterLoad(context.Context) {} // +checklocksignore func (f *Flags) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.MostRecent) stateSourceObject.Load(1, &f.LoadBalanced) stateSourceObject.Load(2, &f.TupleOnly) } func (c *FlagCounter) StateTypeName() string { return "pkg/tcpip/ports.FlagCounter" } func (c *FlagCounter) StateFields() []string { return []string{ "refs", } } func (c *FlagCounter) beforeSave() {} // +checklocksignore func (c *FlagCounter) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.refs) } func (c *FlagCounter) afterLoad(context.Context) {} // +checklocksignore func (c *FlagCounter) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.refs) } func (p *portDescriptor) StateTypeName() string { return "pkg/tcpip/ports.portDescriptor" } func (p *portDescriptor) StateFields() []string { return []string{ "network", "transport", "port", } } func (p *portDescriptor) beforeSave() {} // +checklocksignore func (p *portDescriptor) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.network) stateSinkObject.Save(1, &p.transport) stateSinkObject.Save(2, &p.port) } func (p *portDescriptor) afterLoad(context.Context) {} // +checklocksignore func (p *portDescriptor) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.network) stateSourceObject.Load(1, &p.transport) stateSourceObject.Load(2, &p.port) } func (d *destination) StateTypeName() string { return "pkg/tcpip/ports.destination" } func (d *destination) StateFields() []string { return []string{ "addr", "port", } } func (d *destination) beforeSave() {} // +checklocksignore func (d *destination) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.addr) stateSinkObject.Save(1, &d.port) } func (d *destination) afterLoad(context.Context) {} // +checklocksignore func (d *destination) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.addr) stateSourceObject.Load(1, &d.port) } func (pm *PortManager) StateTypeName() string { return "pkg/tcpip/ports.PortManager" } func (pm *PortManager) StateFields() []string { return []string{ "allocatedPorts", "firstEphemeral", "numEphemeral", } } func (pm *PortManager) beforeSave() {} // +checklocksignore func (pm *PortManager) StateSave(stateSinkObject state.Sink) { pm.beforeSave() stateSinkObject.Save(0, &pm.allocatedPorts) stateSinkObject.Save(1, &pm.firstEphemeral) stateSinkObject.Save(2, &pm.numEphemeral) } 
func (pm *PortManager) afterLoad(context.Context) {} // +checklocksignore func (pm *PortManager) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &pm.allocatedPorts) stateSourceObject.Load(1, &pm.firstEphemeral) stateSourceObject.Load(2, &pm.numEphemeral) } func init() { state.Register((*Flags)(nil)) state.Register((*FlagCounter)(nil)) state.Register((*portDescriptor)(nil)) state.Register((*destination)(nil)) state.Register((*PortManager)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/route_list.go000066400000000000000000000117121465435605700232650ustar00rootroot00000000000000package tcpip // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type RouteElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (RouteElementMapper) linkerFor(elem *Route) *Route { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type RouteList struct { head *Route tail *Route } // Reset resets list l to the empty state. func (l *RouteList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *RouteList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *RouteList) Front() *Route { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *RouteList) Back() *Route { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *RouteList) Len() (count int) { for e := l.Front(); e != nil; e = (RouteElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *RouteList) PushFront(e *Route) { linker := RouteElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { RouteElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *RouteList) PushFrontList(m *RouteList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { RouteElementMapper{}.linkerFor(l.head).SetPrev(m.tail) RouteElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *RouteList) PushBack(e *Route) { linker := RouteElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { RouteElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. 
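//
// For example (illustrative): if l holds r1 and m holds r2, then after
// l.PushBackList(&m), l holds r1 followed by r2 and m is empty.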
// //go:nosplit func (l *RouteList) PushBackList(m *RouteList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { RouteElementMapper{}.linkerFor(l.tail).SetNext(m.head) RouteElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *RouteList) InsertAfter(b, e *Route) { bLinker := RouteElementMapper{}.linkerFor(b) eLinker := RouteElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { RouteElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *RouteList) InsertBefore(a, e *Route) { aLinker := RouteElementMapper{}.linkerFor(a) eLinker := RouteElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { RouteElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *RouteList) Remove(e *Route) { linker := RouteElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { RouteElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { RouteElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type RouteEntry struct { next *Route prev *Route } // Next returns the entry that follows e in the list. // //go:nosplit func (e *RouteEntry) Next() *Route { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *RouteEntry) Prev() *Route { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *RouteEntry) SetNext(elem *Route) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *RouteEntry) SetPrev(elem *Route) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/seqnum/000077500000000000000000000000001465435605700220535ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/seqnum/seqnum.go000066400000000000000000000035351465435605700237200ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package seqnum defines the types and methods for TCP sequence numbers such // that they fit in 32-bit words and work properly when overflows occur. package seqnum // Value represents the value of a sequence number. type Value uint32 // Size represents the size (length) of a sequence number window. type Size uint32 // LessThan checks if v is before w, i.e., v < w. 
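//
// The comparison is wraparound-aware: it looks at the sign of the 32-bit
// difference rather than the raw values. For example, Value(10).LessThan(5)
// is false, while Value(0xfffffff0).LessThan(5) is true because the sequence
// space has wrapped past zero.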
func (v Value) LessThan(w Value) bool { return int32(v-w) < 0 } // LessThanEq returns true if v==w or v is before i.e., v < w. func (v Value) LessThanEq(w Value) bool { if v == w { return true } return v.LessThan(w) } // InRange checks if v is in the range [a,b), i.e., a <= v < b. func (v Value) InRange(a, b Value) bool { return v-a < b-a } // InWindow checks if v is in the window that starts at 'first' and spans 'size' // sequence numbers. func (v Value) InWindow(first Value, size Size) bool { return v.InRange(first, first.Add(size)) } // Add calculates the sequence number following the [v, v+s) window. func (v Value) Add(s Size) Value { return v + Value(s) } // Size calculates the size of the window defined by [v, w). func (v Value) Size(w Value) Size { return Size(w - v) } // UpdateForward updates v such that it becomes v + s. func (v *Value) UpdateForward(s Size) { *v += Value(s) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/seqnum/seqnum_state_autogen.go000066400000000000000000000000701465435605700266310ustar00rootroot00000000000000// automatically generated by stateify. package seqnum golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/sock_err_list.go000066400000000000000000000122561465435605700237420ustar00rootroot00000000000000package tcpip // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type sockErrorElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (sockErrorElementMapper) linkerFor(elem *SockError) *SockError { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type sockErrorList struct { head *SockError tail *SockError } // Reset resets list l to the empty state. func (l *sockErrorList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *sockErrorList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *sockErrorList) Front() *SockError { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *sockErrorList) Back() *SockError { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *sockErrorList) Len() (count int) { for e := l.Front(); e != nil; e = (sockErrorElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *sockErrorList) PushFront(e *SockError) { linker := sockErrorElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { sockErrorElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. 
// //go:nosplit func (l *sockErrorList) PushFrontList(m *sockErrorList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { sockErrorElementMapper{}.linkerFor(l.head).SetPrev(m.tail) sockErrorElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *sockErrorList) PushBack(e *SockError) { linker := sockErrorElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { sockErrorElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *sockErrorList) PushBackList(m *sockErrorList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { sockErrorElementMapper{}.linkerFor(l.tail).SetNext(m.head) sockErrorElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *sockErrorList) InsertAfter(b, e *SockError) { bLinker := sockErrorElementMapper{}.linkerFor(b) eLinker := sockErrorElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { sockErrorElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *sockErrorList) InsertBefore(a, e *SockError) { aLinker := sockErrorElementMapper{}.linkerFor(a) eLinker := sockErrorElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { sockErrorElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *sockErrorList) Remove(e *SockError) { linker := sockErrorElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { sockErrorElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { sockErrorElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type sockErrorEntry struct { next *SockError prev *SockError } // Next returns the entry that follows e in the list. // //go:nosplit func (e *sockErrorEntry) Next() *SockError { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *sockErrorEntry) Prev() *SockError { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *sockErrorEntry) SetNext(elem *SockError) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *sockErrorEntry) SetPrev(elem *SockError) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/socketops.go000066400000000000000000000574221465435605700231160ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcpip import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/sync" ) // SocketOptionsHandler holds methods that help define endpoint specific // behavior for socket level socket options. These must be implemented by // endpoints to get notified when socket level options are set. type SocketOptionsHandler interface { // OnReuseAddressSet is invoked when SO_REUSEADDR is set for an endpoint. OnReuseAddressSet(v bool) // OnReusePortSet is invoked when SO_REUSEPORT is set for an endpoint. OnReusePortSet(v bool) // OnKeepAliveSet is invoked when SO_KEEPALIVE is set for an endpoint. OnKeepAliveSet(v bool) // OnDelayOptionSet is invoked when TCP_NODELAY is set for an endpoint. // Note that v will be the inverse of TCP_NODELAY option. OnDelayOptionSet(v bool) // OnCorkOptionSet is invoked when TCP_CORK is set for an endpoint. OnCorkOptionSet(v bool) // LastError is invoked when SO_ERROR is read for an endpoint. LastError() Error // UpdateLastError updates the endpoint specific last error field. UpdateLastError(err Error) // HasNIC is invoked to check if the NIC is valid for SO_BINDTODEVICE. HasNIC(v int32) bool // OnSetSendBufferSize is invoked when the send buffer size for an endpoint is // changed. The handler is invoked with the new value for the socket send // buffer size. It also returns the newly set value. OnSetSendBufferSize(v int64) (newSz int64) // OnSetReceiveBufferSize is invoked by SO_RCVBUF and SO_RCVBUFFORCE. The // handler can optionally return a callback which will be called after // the buffer size is updated to newSz. OnSetReceiveBufferSize(v, oldSz int64) (newSz int64, postSet func()) // WakeupWriters is invoked when the send buffer size for an endpoint is // changed. The handler notifies the writers if the send buffer size is // increased with setsockopt(2) for TCP endpoints. WakeupWriters() // GetAcceptConn returns true if the socket is a TCP socket and is in // listening state. GetAcceptConn() bool } // DefaultSocketOptionsHandler is an embeddable type that implements no-op // implementations for SocketOptionsHandler methods. type DefaultSocketOptionsHandler struct{} var _ SocketOptionsHandler = (*DefaultSocketOptionsHandler)(nil) // OnReuseAddressSet implements SocketOptionsHandler.OnReuseAddressSet. func (*DefaultSocketOptionsHandler) OnReuseAddressSet(bool) {} // OnReusePortSet implements SocketOptionsHandler.OnReusePortSet. func (*DefaultSocketOptionsHandler) OnReusePortSet(bool) {} // OnKeepAliveSet implements SocketOptionsHandler.OnKeepAliveSet. func (*DefaultSocketOptionsHandler) OnKeepAliveSet(bool) {} // OnDelayOptionSet implements SocketOptionsHandler.OnDelayOptionSet. func (*DefaultSocketOptionsHandler) OnDelayOptionSet(bool) {} // OnCorkOptionSet implements SocketOptionsHandler.OnCorkOptionSet. func (*DefaultSocketOptionsHandler) OnCorkOptionSet(bool) {} // LastError implements SocketOptionsHandler.LastError. func (*DefaultSocketOptionsHandler) LastError() Error { return nil } // UpdateLastError implements SocketOptionsHandler.UpdateLastError. 
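//
// Like the other methods on DefaultSocketOptionsHandler, this is a no-op so
// that endpoints can embed the type and override only the hooks they need.
// An illustrative sketch (myEndpoint is a hypothetical type):
//
//	type myEndpoint struct {
//		tcpip.DefaultSocketOptionsHandler
//	}
//
//	// Override just the hook this endpoint cares about.
//	func (e *myEndpoint) OnKeepAliveSet(v bool) { /* react to SO_KEEPALIVE */ }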
func (*DefaultSocketOptionsHandler) UpdateLastError(Error) {} // HasNIC implements SocketOptionsHandler.HasNIC. func (*DefaultSocketOptionsHandler) HasNIC(int32) bool { return false } // OnSetSendBufferSize implements SocketOptionsHandler.OnSetSendBufferSize. func (*DefaultSocketOptionsHandler) OnSetSendBufferSize(v int64) (newSz int64) { return v } // WakeupWriters implements SocketOptionsHandler.WakeupWriters. func (*DefaultSocketOptionsHandler) WakeupWriters() {} // OnSetReceiveBufferSize implements SocketOptionsHandler.OnSetReceiveBufferSize. func (*DefaultSocketOptionsHandler) OnSetReceiveBufferSize(v, oldSz int64) (newSz int64, postSet func()) { return v, nil } // GetAcceptConn implements SocketOptionsHandler.GetAcceptConn. func (*DefaultSocketOptionsHandler) GetAcceptConn() bool { return false } // StackHandler holds methods to access the stack options. These must be // implemented by the stack. type StackHandler interface { // Option allows retrieving stack wide options. Option(option any) Error // TransportProtocolOption allows retrieving individual protocol level // option values. TransportProtocolOption(proto TransportProtocolNumber, option GettableTransportProtocolOption) Error } // SocketOptions contains all the variables which store values for SOL_SOCKET, // SOL_IP, SOL_IPV6 and SOL_TCP level options. // // +stateify savable type SocketOptions struct { handler SocketOptionsHandler // StackHandler is initialized at the creation time and will not change. stackHandler StackHandler `state:"manual"` // These fields are accessed and modified using atomic operations. // broadcastEnabled determines whether datagram sockets are allowed to // send packets to a broadcast address. broadcastEnabled atomicbitops.Uint32 // passCredEnabled determines whether SCM_CREDENTIALS socket control // messages are enabled. passCredEnabled atomicbitops.Uint32 // noChecksumEnabled determines whether UDP checksum is disabled while // transmitting for this socket. noChecksumEnabled atomicbitops.Uint32 // reuseAddressEnabled determines whether Bind() should allow reuse of // local address. reuseAddressEnabled atomicbitops.Uint32 // reusePortEnabled determines whether to permit multiple sockets to be // bound to an identical socket address. reusePortEnabled atomicbitops.Uint32 // keepAliveEnabled determines whether TCP keepalive is enabled for this // socket. keepAliveEnabled atomicbitops.Uint32 // multicastLoopEnabled determines whether multicast packets sent over a // non-loopback interface will be looped back. multicastLoopEnabled atomicbitops.Uint32 // receiveTOSEnabled is used to specify if the TOS ancillary message is // passed with incoming packets. receiveTOSEnabled atomicbitops.Uint32 // receiveTTLEnabled is used to specify if the TTL ancillary message is passed // with incoming packets. receiveTTLEnabled atomicbitops.Uint32 // receiveHopLimitEnabled is used to specify if the HopLimit ancillary message // is passed with incoming packets. receiveHopLimitEnabled atomicbitops.Uint32 // receiveTClassEnabled is used to specify if the IPV6_TCLASS ancillary // message is passed with incoming packets. receiveTClassEnabled atomicbitops.Uint32 // receivePacketInfoEnabled is used to specify if more information is // provided with incoming IPv4 packets. receivePacketInfoEnabled atomicbitops.Uint32 // receivePacketInfoEnabled is used to specify if more information is // provided with incoming IPv6 packets. 
receiveIPv6PacketInfoEnabled atomicbitops.Uint32 // hdrIncludeEnabled is used to indicate for a raw endpoint that all packets // being written have an IP header and the endpoint should not attach an IP // header. hdrIncludedEnabled atomicbitops.Uint32 // v6OnlyEnabled is used to determine whether an IPv6 socket is to be // restricted to sending and receiving IPv6 packets only. v6OnlyEnabled atomicbitops.Uint32 // quickAckEnabled is used to represent the value of TCP_QUICKACK option. // It currently does not have any effect on the TCP endpoint. quickAckEnabled atomicbitops.Uint32 // delayOptionEnabled is used to specify if data should be sent out immediately // by the transport protocol. For TCP, it determines if the Nagle algorithm // is on or off. delayOptionEnabled atomicbitops.Uint32 // corkOptionEnabled is used to specify if data should be held until segments // are full by the TCP transport protocol. corkOptionEnabled atomicbitops.Uint32 // receiveOriginalDstAddress is used to specify if the original destination of // the incoming packet should be returned as an ancillary message. receiveOriginalDstAddress atomicbitops.Uint32 // ipv4RecvErrEnabled determines whether extended reliable error message // passing is enabled for IPv4. ipv4RecvErrEnabled atomicbitops.Uint32 // ipv6RecvErrEnabled determines whether extended reliable error message // passing is enabled for IPv6. ipv6RecvErrEnabled atomicbitops.Uint32 // errQueue is the per-socket error queue. It is protected by errQueueMu. errQueueMu sync.Mutex `state:"nosave"` errQueue sockErrorList // bindToDevice determines the device to which the socket is bound. bindToDevice atomicbitops.Int32 // getSendBufferLimits provides the handler to get the min, default and max // size for send buffer. It is initialized at the creation time and will not // change. getSendBufferLimits GetSendBufferLimits `state:"manual"` // sendBufferSize determines the send buffer size for this socket. sendBufferSize atomicbitops.Int64 // getReceiveBufferLimits provides the handler to get the min, default and // max size for receive buffer. It is initialized at the creation time and // will not change. getReceiveBufferLimits GetReceiveBufferLimits `state:"manual"` // receiveBufferSize determines the receive buffer size for this socket. receiveBufferSize atomicbitops.Int64 // mu protects the access to the below fields. mu sync.Mutex `state:"nosave"` // linger determines the amount of time the socket should linger before // close. We currently implement this option for TCP socket only. linger LingerOption // rcvlowat specifies the minimum number of bytes which should be // received to indicate the socket as readable. rcvlowat atomicbitops.Int32 } // InitHandler initializes the handler. This must be called before using the // socket options utility. func (so *SocketOptions) InitHandler(handler SocketOptionsHandler, stack StackHandler, getSendBufferLimits GetSendBufferLimits, getReceiveBufferLimits GetReceiveBufferLimits) { so.handler = handler so.stackHandler = stack so.getSendBufferLimits = getSendBufferLimits so.getReceiveBufferLimits = getReceiveBufferLimits } func storeAtomicBool(addr *atomicbitops.Uint32, v bool) { var val uint32 if v { val = 1 } addr.Store(val) } // SetLastError sets the last error for a socket. func (so *SocketOptions) SetLastError(err Error) { so.handler.UpdateLastError(err) } // GetBroadcast gets value for SO_BROADCAST option. 
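//
// The Get*/Set* pairs below share one pattern: each boolean option is stored
// in an atomicbitops.Uint32 (0 or 1) via storeAtomicBool, so reads do not
// need to take so.mu. For example:
//
//	so.SetBroadcast(true)
//	enabled := so.GetBroadcast() // true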
func (so *SocketOptions) GetBroadcast() bool { return so.broadcastEnabled.Load() != 0 } // SetBroadcast sets value for SO_BROADCAST option. func (so *SocketOptions) SetBroadcast(v bool) { storeAtomicBool(&so.broadcastEnabled, v) } // GetPassCred gets value for SO_PASSCRED option. func (so *SocketOptions) GetPassCred() bool { return so.passCredEnabled.Load() != 0 } // SetPassCred sets value for SO_PASSCRED option. func (so *SocketOptions) SetPassCred(v bool) { storeAtomicBool(&so.passCredEnabled, v) } // GetNoChecksum gets value for SO_NO_CHECK option. func (so *SocketOptions) GetNoChecksum() bool { return so.noChecksumEnabled.Load() != 0 } // SetNoChecksum sets value for SO_NO_CHECK option. func (so *SocketOptions) SetNoChecksum(v bool) { storeAtomicBool(&so.noChecksumEnabled, v) } // GetReuseAddress gets value for SO_REUSEADDR option. func (so *SocketOptions) GetReuseAddress() bool { return so.reuseAddressEnabled.Load() != 0 } // SetReuseAddress sets value for SO_REUSEADDR option. func (so *SocketOptions) SetReuseAddress(v bool) { storeAtomicBool(&so.reuseAddressEnabled, v) so.handler.OnReuseAddressSet(v) } // GetReusePort gets value for SO_REUSEPORT option. func (so *SocketOptions) GetReusePort() bool { return so.reusePortEnabled.Load() != 0 } // SetReusePort sets value for SO_REUSEPORT option. func (so *SocketOptions) SetReusePort(v bool) { storeAtomicBool(&so.reusePortEnabled, v) so.handler.OnReusePortSet(v) } // GetKeepAlive gets value for SO_KEEPALIVE option. func (so *SocketOptions) GetKeepAlive() bool { return so.keepAliveEnabled.Load() != 0 } // SetKeepAlive sets value for SO_KEEPALIVE option. func (so *SocketOptions) SetKeepAlive(v bool) { storeAtomicBool(&so.keepAliveEnabled, v) so.handler.OnKeepAliveSet(v) } // GetMulticastLoop gets value for IP_MULTICAST_LOOP option. func (so *SocketOptions) GetMulticastLoop() bool { return so.multicastLoopEnabled.Load() != 0 } // SetMulticastLoop sets value for IP_MULTICAST_LOOP option. func (so *SocketOptions) SetMulticastLoop(v bool) { storeAtomicBool(&so.multicastLoopEnabled, v) } // GetReceiveTOS gets value for IP_RECVTOS option. func (so *SocketOptions) GetReceiveTOS() bool { return so.receiveTOSEnabled.Load() != 0 } // SetReceiveTOS sets value for IP_RECVTOS option. func (so *SocketOptions) SetReceiveTOS(v bool) { storeAtomicBool(&so.receiveTOSEnabled, v) } // GetReceiveTTL gets value for IP_RECVTTL option. func (so *SocketOptions) GetReceiveTTL() bool { return so.receiveTTLEnabled.Load() != 0 } // SetReceiveTTL sets value for IP_RECVTTL option. func (so *SocketOptions) SetReceiveTTL(v bool) { storeAtomicBool(&so.receiveTTLEnabled, v) } // GetReceiveHopLimit gets value for IP_RECVHOPLIMIT option. func (so *SocketOptions) GetReceiveHopLimit() bool { return so.receiveHopLimitEnabled.Load() != 0 } // SetReceiveHopLimit sets value for IP_RECVHOPLIMIT option. func (so *SocketOptions) SetReceiveHopLimit(v bool) { storeAtomicBool(&so.receiveHopLimitEnabled, v) } // GetReceiveTClass gets value for IPV6_RECVTCLASS option. func (so *SocketOptions) GetReceiveTClass() bool { return so.receiveTClassEnabled.Load() != 0 } // SetReceiveTClass sets value for IPV6_RECVTCLASS option. func (so *SocketOptions) SetReceiveTClass(v bool) { storeAtomicBool(&so.receiveTClassEnabled, v) } // GetReceivePacketInfo gets value for IP_PKTINFO option. func (so *SocketOptions) GetReceivePacketInfo() bool { return so.receivePacketInfoEnabled.Load() != 0 } // SetReceivePacketInfo sets value for IP_PKTINFO option. 
func (so *SocketOptions) SetReceivePacketInfo(v bool) { storeAtomicBool(&so.receivePacketInfoEnabled, v) } // GetIPv6ReceivePacketInfo gets value for IPV6_RECVPKTINFO option. func (so *SocketOptions) GetIPv6ReceivePacketInfo() bool { return so.receiveIPv6PacketInfoEnabled.Load() != 0 } // SetIPv6ReceivePacketInfo sets value for IPV6_RECVPKTINFO option. func (so *SocketOptions) SetIPv6ReceivePacketInfo(v bool) { storeAtomicBool(&so.receiveIPv6PacketInfoEnabled, v) } // GetHeaderIncluded gets value for IP_HDRINCL option. func (so *SocketOptions) GetHeaderIncluded() bool { return so.hdrIncludedEnabled.Load() != 0 } // SetHeaderIncluded sets value for IP_HDRINCL option. func (so *SocketOptions) SetHeaderIncluded(v bool) { storeAtomicBool(&so.hdrIncludedEnabled, v) } // GetV6Only gets value for IPV6_V6ONLY option. func (so *SocketOptions) GetV6Only() bool { return so.v6OnlyEnabled.Load() != 0 } // SetV6Only sets value for IPV6_V6ONLY option. // // Preconditions: the backing TCP or UDP endpoint must be in initial state. func (so *SocketOptions) SetV6Only(v bool) { storeAtomicBool(&so.v6OnlyEnabled, v) } // GetQuickAck gets value for TCP_QUICKACK option. func (so *SocketOptions) GetQuickAck() bool { return so.quickAckEnabled.Load() != 0 } // SetQuickAck sets value for TCP_QUICKACK option. func (so *SocketOptions) SetQuickAck(v bool) { storeAtomicBool(&so.quickAckEnabled, v) } // GetDelayOption gets inverted value for TCP_NODELAY option. func (so *SocketOptions) GetDelayOption() bool { return so.delayOptionEnabled.Load() != 0 } // SetDelayOption sets inverted value for TCP_NODELAY option. func (so *SocketOptions) SetDelayOption(v bool) { storeAtomicBool(&so.delayOptionEnabled, v) so.handler.OnDelayOptionSet(v) } // GetCorkOption gets value for TCP_CORK option. func (so *SocketOptions) GetCorkOption() bool { return so.corkOptionEnabled.Load() != 0 } // SetCorkOption sets value for TCP_CORK option. func (so *SocketOptions) SetCorkOption(v bool) { storeAtomicBool(&so.corkOptionEnabled, v) so.handler.OnCorkOptionSet(v) } // GetReceiveOriginalDstAddress gets value for IP(V6)_RECVORIGDSTADDR option. func (so *SocketOptions) GetReceiveOriginalDstAddress() bool { return so.receiveOriginalDstAddress.Load() != 0 } // SetReceiveOriginalDstAddress sets value for IP(V6)_RECVORIGDSTADDR option. func (so *SocketOptions) SetReceiveOriginalDstAddress(v bool) { storeAtomicBool(&so.receiveOriginalDstAddress, v) } // GetIPv4RecvError gets value for IP_RECVERR option. func (so *SocketOptions) GetIPv4RecvError() bool { return so.ipv4RecvErrEnabled.Load() != 0 } // SetIPv4RecvError sets value for IP_RECVERR option. func (so *SocketOptions) SetIPv4RecvError(v bool) { storeAtomicBool(&so.ipv4RecvErrEnabled, v) if !v { so.pruneErrQueue() } } // GetIPv6RecvError gets value for IPV6_RECVERR option. func (so *SocketOptions) GetIPv6RecvError() bool { return so.ipv6RecvErrEnabled.Load() != 0 } // SetIPv6RecvError sets value for IPV6_RECVERR option. func (so *SocketOptions) SetIPv6RecvError(v bool) { storeAtomicBool(&so.ipv6RecvErrEnabled, v) if !v { so.pruneErrQueue() } } // GetLastError gets value for SO_ERROR option. func (so *SocketOptions) GetLastError() Error { return so.handler.LastError() } // GetOutOfBandInline gets value for SO_OOBINLINE option. func (*SocketOptions) GetOutOfBandInline() bool { return true } // SetOutOfBandInline sets value for SO_OOBINLINE option. We currently do not // support disabling this option. func (*SocketOptions) SetOutOfBandInline(bool) {} // GetLinger gets value for SO_LINGER option. 
func (so *SocketOptions) GetLinger() LingerOption { so.mu.Lock() linger := so.linger so.mu.Unlock() return linger } // SetLinger sets value for SO_LINGER option. func (so *SocketOptions) SetLinger(linger LingerOption) { so.mu.Lock() so.linger = linger so.mu.Unlock() } // SockErrOrigin represents the constants for error origin. type SockErrOrigin uint8 const ( // SockExtErrorOriginNone represents an unknown error origin. SockExtErrorOriginNone SockErrOrigin = iota // SockExtErrorOriginLocal indicates a local error. SockExtErrorOriginLocal // SockExtErrorOriginICMP indicates an IPv4 ICMP error. SockExtErrorOriginICMP // SockExtErrorOriginICMP6 indicates an IPv6 ICMP error. SockExtErrorOriginICMP6 ) // IsICMPErr indicates if the error originated from an ICMP error. func (origin SockErrOrigin) IsICMPErr() bool { return origin == SockExtErrorOriginICMP || origin == SockExtErrorOriginICMP6 } // SockErrorCause is the cause of a socket error. type SockErrorCause interface { // Origin is the source of the error. Origin() SockErrOrigin // Type is the origin specific type of error. Type() uint8 // Code is the origin and type specific error code. Code() uint8 // Info is any extra information about the error. Info() uint32 } // LocalSockError is a socket error that originated from the local host. // // +stateify savable type LocalSockError struct { info uint32 } // Origin implements SockErrorCause. func (*LocalSockError) Origin() SockErrOrigin { return SockExtErrorOriginLocal } // Type implements SockErrorCause. func (*LocalSockError) Type() uint8 { return 0 } // Code implements SockErrorCause. func (*LocalSockError) Code() uint8 { return 0 } // Info implements SockErrorCause. func (l *LocalSockError) Info() uint32 { return l.info } // SockError represents a queue entry in the per-socket error queue. // // +stateify savable type SockError struct { sockErrorEntry // Err is the error caused by the errant packet. Err Error // Cause is the detailed cause of the error. Cause SockErrorCause // Payload is the errant packet's payload. Payload *buffer.View // Dst is the original destination address of the errant packet. Dst FullAddress // Offender is the original sender address of the errant packet. Offender FullAddress // NetProto is the network protocol being used to transmit the packet. NetProto NetworkProtocolNumber } // pruneErrQueue resets the queue. func (so *SocketOptions) pruneErrQueue() { so.errQueueMu.Lock() so.errQueue.Reset() so.errQueueMu.Unlock() } // DequeueErr dequeues a socket extended error from the error queue and returns // it. Returns nil if queue is empty. func (so *SocketOptions) DequeueErr() *SockError { so.errQueueMu.Lock() defer so.errQueueMu.Unlock() err := so.errQueue.Front() if err != nil { so.errQueue.Remove(err) } return err } // PeekErr returns the error in the front of the error queue. Returns nil if // the error queue is empty. func (so *SocketOptions) PeekErr() *SockError { so.errQueueMu.Lock() defer so.errQueueMu.Unlock() return so.errQueue.Front() } // QueueErr inserts the error at the back of the error queue. // // Preconditions: so.GetIPv4RecvError() or so.GetIPv6RecvError() is true. func (so *SocketOptions) QueueErr(err *SockError) { so.errQueueMu.Lock() defer so.errQueueMu.Unlock() so.errQueue.PushBack(err) } // QueueLocalErr queues a local error onto the local queue. 
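// An illustrative drain loop for the per-socket error queue (drainErrQueue
// is an assumption for illustration, not part of this file): DequeueErr
// removes and returns the front entry, or nil once the queue is empty, so a
// caller can loop until nil to consume every queued SockError.
func drainErrQueue(so *SocketOptions) []*SockError {
	var errs []*SockError
	for {
		sockErr := so.DequeueErr()
		if sockErr == nil {
			return errs
		}
		errs = append(errs, sockErr)
	}
}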
func (so *SocketOptions) QueueLocalErr(err Error, net NetworkProtocolNumber, info uint32, dst FullAddress, payload *buffer.View) { so.QueueErr(&SockError{ Err: err, Cause: &LocalSockError{info: info}, Payload: payload, Dst: dst, NetProto: net, }) } // GetBindToDevice gets value for SO_BINDTODEVICE option. func (so *SocketOptions) GetBindToDevice() int32 { return so.bindToDevice.Load() } // SetBindToDevice sets value for SO_BINDTODEVICE option. If bindToDevice is // zero, the socket device binding is removed. func (so *SocketOptions) SetBindToDevice(bindToDevice int32) Error { if bindToDevice != 0 && !so.handler.HasNIC(bindToDevice) { return &ErrUnknownDevice{} } so.bindToDevice.Store(bindToDevice) return nil } // GetSendBufferSize gets value for SO_SNDBUF option. func (so *SocketOptions) GetSendBufferSize() int64 { return so.sendBufferSize.Load() } // SendBufferLimits returns the [min, max) range of allowable send buffer // sizes. func (so *SocketOptions) SendBufferLimits() (min, max int64) { limits := so.getSendBufferLimits(so.stackHandler) return int64(limits.Min), int64(limits.Max) } // SetSendBufferSize sets value for SO_SNDBUF option. notify indicates if the // stack handler should be invoked to set the send buffer size. func (so *SocketOptions) SetSendBufferSize(sendBufferSize int64, notify bool) { if notify { sendBufferSize = so.handler.OnSetSendBufferSize(sendBufferSize) } so.sendBufferSize.Store(sendBufferSize) if notify { so.handler.WakeupWriters() } } // GetReceiveBufferSize gets value for SO_RCVBUF option. func (so *SocketOptions) GetReceiveBufferSize() int64 { return so.receiveBufferSize.Load() } // ReceiveBufferLimits returns the [min, max) range of allowable receive buffer // sizes. func (so *SocketOptions) ReceiveBufferLimits() (min, max int64) { limits := so.getReceiveBufferLimits(so.stackHandler) return int64(limits.Min), int64(limits.Max) } // SetReceiveBufferSize sets the value of the SO_RCVBUF option, optionally // notifying the owning endpoint. func (so *SocketOptions) SetReceiveBufferSize(receiveBufferSize int64, notify bool) { var postSet func() if notify { oldSz := so.receiveBufferSize.Load() receiveBufferSize, postSet = so.handler.OnSetReceiveBufferSize(receiveBufferSize, oldSz) } so.receiveBufferSize.Store(receiveBufferSize) if postSet != nil { postSet() } } // GetRcvlowat gets value for SO_RCVLOWAT option. func (so *SocketOptions) GetRcvlowat() int32 { // TODO(b/226603727): Return so.rcvlowat after adding complete support // for SO_RCVLOWAT option. For now, return the default value of 1. defaultRcvlowat := int32(1) return defaultRcvlowat } // SetRcvlowat sets value for SO_RCVLOWAT option. func (so *SocketOptions) SetRcvlowat(rcvlowat int32) Error { so.rcvlowat.Store(rcvlowat) return nil } // GetAcceptConn gets value for SO_ACCEPTCONN option. func (so *SocketOptions) GetAcceptConn() bool { return so.handler.GetAcceptConn() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/000077500000000000000000000000001465435605700216505ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/address_state_mutex.go000066400000000000000000000047641465435605700262610ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type addressStateRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. 
var addressStatelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type addressStatelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *addressStateRWMutex) Lock() { locking.AddGLock(addressStateprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *addressStateRWMutex) NestedLock(i addressStatelockNameIndex) { locking.AddGLock(addressStateprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *addressStateRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(addressStateprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *addressStateRWMutex) NestedUnlock(i addressStatelockNameIndex) { m.mu.Unlock() locking.DelGLock(addressStateprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *addressStateRWMutex) RLock() { locking.AddGLock(addressStateprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *addressStateRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(addressStateprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *addressStateRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *addressStateRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *addressStateRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var addressStateprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func addressStateinitLockNames() {} func init() { addressStateinitLockNames() addressStateprefixIndex = locking.NewMutexClass(reflect.TypeOf(addressStateRWMutex{}), addressStatelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/address_state_refs.go000066400000000000000000000102641465435605700260460ustar00rootroot00000000000000package stack import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const addressStateenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var addressStateobj *addressState // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. 
If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type addressStateRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *addressStateRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *addressStateRefs) RefType() string { return fmt.Sprintf("%T", addressStateobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *addressStateRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *addressStateRefs) LogRefs() bool { return addressStateenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *addressStateRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *addressStateRefs) IncRef() { v := r.refCount.Add(1) if addressStateenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *addressStateRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if addressStateenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *addressStateRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if addressStateenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *addressStateRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/addressable_endpoint_state.go000066400000000000000000000707041465435605700275600ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) func (lifetimes *AddressLifetimes) sanitize() { if lifetimes.Deprecated { lifetimes.PreferredUntil = tcpip.MonotonicTime{} } } var _ AddressableEndpoint = (*AddressableEndpointState)(nil) // AddressableEndpointState is an implementation of an AddressableEndpoint. // // +stateify savable type AddressableEndpointState struct { networkEndpoint NetworkEndpoint options AddressableEndpointStateOptions // Lock ordering (from outer to inner lock ordering): // // AddressableEndpointState.mu // addressState.mu mu addressableEndpointStateRWMutex `state:"nosave"` // +checklocks:mu endpoints map[tcpip.Address]*addressState // +checklocks:mu primary []*addressState } // AddressableEndpointStateOptions contains options used to configure an // AddressableEndpointState. // // +stateify savable type AddressableEndpointStateOptions struct { // HiddenWhileDisabled determines whether addresses should be returned to // callers while the NetworkEndpoint this AddressableEndpointState belongs // to is disabled. HiddenWhileDisabled bool } // Init initializes the AddressableEndpointState with networkEndpoint. // // Must be called before calling any other function on m. func (a *AddressableEndpointState) Init(networkEndpoint NetworkEndpoint, options AddressableEndpointStateOptions) { a.networkEndpoint = networkEndpoint a.options = options a.mu.Lock() defer a.mu.Unlock() a.endpoints = make(map[tcpip.Address]*addressState) } // OnNetworkEndpointEnabledChanged must be called every time the // NetworkEndpoint this AddressableEndpointState belongs to is enabled or // disabled so that any AddressDispatchers can be notified of the NIC enabled // change. func (a *AddressableEndpointState) OnNetworkEndpointEnabledChanged() { a.mu.RLock() defer a.mu.RUnlock() for _, ep := range a.endpoints { ep.mu.Lock() ep.notifyChangedLocked() ep.mu.Unlock() } } // GetAddress returns the AddressEndpoint for the passed address. // // GetAddress does not increment the address's reference count or check if the // address is considered bound to the endpoint. // // Returns nil if the passed address is not associated with the endpoint. func (a *AddressableEndpointState) GetAddress(addr tcpip.Address) AddressEndpoint { a.mu.RLock() defer a.mu.RUnlock() ep, ok := a.endpoints[addr] if !ok { return nil } return ep } // ForEachEndpoint calls f for each address. // // Once f returns false, f will no longer be called. func (a *AddressableEndpointState) ForEachEndpoint(f func(AddressEndpoint) bool) { a.mu.RLock() defer a.mu.RUnlock() for _, ep := range a.endpoints { if !f(ep) { return } } } // ForEachPrimaryEndpoint calls f for each primary address. // // Once f returns false, f will no longer be called. 
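// A short sketch of the iteration contract described above
// (countPermanentEndpoints is an assumption for illustration, not part of
// this file): the callback runs once per known address and iteration stops
// as soon as it returns false.
func countPermanentEndpoints(a *AddressableEndpointState) int {
	n := 0
	a.ForEachEndpoint(func(ep AddressEndpoint) bool {
		if ep.GetKind().IsPermanent() {
			n++
		}
		return true // keep iterating over the remaining addresses
	})
	return n
}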
func (a *AddressableEndpointState) ForEachPrimaryEndpoint(f func(AddressEndpoint) bool) { a.mu.RLock() defer a.mu.RUnlock() for _, ep := range a.primary { if !f(ep) { return } } } func (a *AddressableEndpointState) releaseAddressState(addrState *addressState) { a.mu.Lock() defer a.mu.Unlock() a.releaseAddressStateLocked(addrState) } // releaseAddressStateLocked removes addrState from a's address state // (primary and endpoints list). // // +checklocks:a.mu func (a *AddressableEndpointState) releaseAddressStateLocked(addrState *addressState) { oldPrimary := a.primary for i, s := range a.primary { if s == addrState { a.primary = append(a.primary[:i], a.primary[i+1:]...) oldPrimary[len(oldPrimary)-1] = nil break } } delete(a.endpoints, addrState.addr.Address) } // AddAndAcquirePermanentAddress implements AddressableEndpoint. func (a *AddressableEndpointState) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, properties AddressProperties) (AddressEndpoint, tcpip.Error) { return a.AddAndAcquireAddress(addr, properties, Permanent) } // AddAndAcquireTemporaryAddress adds a temporary address. // // Returns *tcpip.ErrDuplicateAddress if the address exists. // // The temporary address's endpoint is acquired and returned. func (a *AddressableEndpointState) AddAndAcquireTemporaryAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior) (AddressEndpoint, tcpip.Error) { return a.AddAndAcquireAddress(addr, AddressProperties{PEB: peb}, Temporary) } // AddAndAcquireAddress adds an address with the specified kind. // // Returns *tcpip.ErrDuplicateAddress if the address exists. func (a *AddressableEndpointState) AddAndAcquireAddress(addr tcpip.AddressWithPrefix, properties AddressProperties, kind AddressKind) (AddressEndpoint, tcpip.Error) { a.mu.Lock() defer a.mu.Unlock() ep, err := a.addAndAcquireAddressLocked(addr, properties, kind) // From https://golang.org/doc/faq#nil_error: // // Under the covers, interfaces are implemented as two elements, a type T and // a value V. // // An interface value is nil only if the V and T are both unset, (T=nil, V is // not set), In particular, a nil interface will always hold a nil type. If we // store a nil pointer of type *int inside an interface value, the inner type // will be *int regardless of the value of the pointer: (T=*int, V=nil). Such // an interface value will therefore be non-nil even when the pointer value V // inside is nil. // // Since addAndAcquireAddressLocked returns a nil value with a non-nil type, // we need to explicitly return nil below if ep is (a typed) nil. if ep == nil { return nil, err } return ep, err } // addAndAcquireAddressLocked adds, acquires and returns a permanent or // temporary address. // // If the addressable endpoint already has the address in a non-permanent state, // and addAndAcquireAddressLocked is adding a permanent address, that address is // promoted in place and its properties set to the properties provided. If the // address already exists in any other state, then *tcpip.ErrDuplicateAddress is // returned, regardless the kind of address that is being added. 
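//
// For illustration, this promotion path is normally reached through the
// exported wrappers; assuming an address value addr (an assumption for
// illustration, not part of this file), adding it first as temporary and
// then as permanent promotes the existing entry instead of failing:
//
//	tmpEP, _ := a.AddAndAcquireTemporaryAddress(addr, CanBePrimaryEndpoint)
//	permEP, err := a.AddAndAcquirePermanentAddress(addr, AddressProperties{})
//	// err is nil and permEP refers to the same, now permanent, entry;
//	// adding the same permanent address again would return
//	// *tcpip.ErrDuplicateAddress.
//	_ = tmpEP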
// // +checklocks:a.mu func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.AddressWithPrefix, properties AddressProperties, kind AddressKind) (*addressState, tcpip.Error) { var permanent bool switch kind { case PermanentExpired: panic(fmt.Sprintf("cannot add address %s in PermanentExpired state", addr)) case Permanent, PermanentTentative: permanent = true case Temporary: default: panic(fmt.Sprintf("unknown address kind: %d", kind)) } // attemptAddToPrimary is false when the address is already in the primary // address list. attemptAddToPrimary := true addrState, ok := a.endpoints[addr.Address] if ok { if !permanent { // We are adding a non-permanent address but the address exists. No need // to go any further since we can only promote existing temporary/expired // addresses to permanent. return nil, &tcpip.ErrDuplicateAddress{} } addrState.mu.RLock() if addrState.refs.ReadRefs() == 0 { panic(fmt.Sprintf("found an address that should have been released (ref count == 0); address = %s", addrState.addr)) } isPermanent := addrState.kind.IsPermanent() addrState.mu.RUnlock() if isPermanent { // We are adding a permanent address but a permanent address already // exists. return nil, &tcpip.ErrDuplicateAddress{} } // We now promote the address. for i, s := range a.primary { if s == addrState { switch properties.PEB { case CanBePrimaryEndpoint: // The address is already in the primary address list. attemptAddToPrimary = false case FirstPrimaryEndpoint: if i == 0 { // The address is already first in the primary address list. attemptAddToPrimary = false } else { a.primary = append(a.primary[:i], a.primary[i+1:]...) } case NeverPrimaryEndpoint: a.primary = append(a.primary[:i], a.primary[i+1:]...) default: panic(fmt.Sprintf("unrecognized primary endpoint behaviour = %d", properties.PEB)) } break } } addrState.refs.IncRef() } else { addrState = &addressState{ addressableEndpointState: a, addr: addr, temporary: properties.Temporary, // Cache the subnet in addrState to avoid calls to addr.Subnet() as that // results in allocations on every call. subnet: addr.Subnet(), } addrState.refs.InitRefs() a.endpoints[addr.Address] = addrState // We never promote an address to temporary - it can only be added as such. // If we are actually adding a permanent address, it is promoted below. addrState.kind = Temporary } // At this point we have an address we are either promoting from an expired or // temporary address to permanent, promoting an expired address to temporary, // or we are adding a new temporary or permanent address. // // The address MUST be write locked at this point. addrState.mu.Lock() defer addrState.mu.Unlock() if permanent { if addrState.kind.IsPermanent() { panic(fmt.Sprintf("only non-permanent addresses should be promoted to permanent; address = %s", addrState.addr)) } // Primary addresses are biased by 1. addrState.refs.IncRef() addrState.kind = kind } addrState.configType = properties.ConfigType lifetimes := properties.Lifetimes lifetimes.sanitize() addrState.lifetimes = lifetimes addrState.disp = properties.Disp if attemptAddToPrimary { switch properties.PEB { case NeverPrimaryEndpoint: case CanBePrimaryEndpoint: a.primary = append(a.primary, addrState) case FirstPrimaryEndpoint: if cap(a.primary) == len(a.primary) { a.primary = append([]*addressState{addrState}, a.primary...) } else { // Shift all the endpoints by 1 to make room for the new address at the // front. 
We could have just created a new slice but this saves // allocations when the slice has capacity for the new address. primaryCount := len(a.primary) a.primary = append(a.primary, nil) if n := copy(a.primary[1:], a.primary); n != primaryCount { panic(fmt.Sprintf("copied %d elements; expected = %d elements", n, primaryCount)) } a.primary[0] = addrState } default: panic(fmt.Sprintf("unrecognized primary endpoint behaviour = %d", properties.PEB)) } } addrState.notifyChangedLocked() return addrState, nil } // RemovePermanentAddress implements AddressableEndpoint. func (a *AddressableEndpointState) RemovePermanentAddress(addr tcpip.Address) tcpip.Error { a.mu.Lock() defer a.mu.Unlock() return a.removePermanentAddressLocked(addr) } // removePermanentAddressLocked is like RemovePermanentAddress but with locking // requirements. // // +checklocks:a.mu func (a *AddressableEndpointState) removePermanentAddressLocked(addr tcpip.Address) tcpip.Error { addrState, ok := a.endpoints[addr] if !ok { return &tcpip.ErrBadLocalAddress{} } return a.removePermanentEndpointLocked(addrState, AddressRemovalManualAction) } // RemovePermanentEndpoint removes the passed endpoint if it is associated with // a and permanent. func (a *AddressableEndpointState) RemovePermanentEndpoint(ep AddressEndpoint, reason AddressRemovalReason) tcpip.Error { addrState, ok := ep.(*addressState) if !ok || addrState.addressableEndpointState != a { return &tcpip.ErrInvalidEndpointState{} } a.mu.Lock() defer a.mu.Unlock() return a.removePermanentEndpointLocked(addrState, reason) } // removePermanentAddressLocked is like RemovePermanentAddress but with locking // requirements. // // +checklocks:a.mu func (a *AddressableEndpointState) removePermanentEndpointLocked(addrState *addressState, reason AddressRemovalReason) tcpip.Error { if !addrState.GetKind().IsPermanent() { return &tcpip.ErrBadLocalAddress{} } addrState.remove(reason) a.decAddressRefLocked(addrState) return nil } // decAddressRef decrements the address's reference count and releases it once // the reference count hits 0. func (a *AddressableEndpointState) decAddressRef(addrState *addressState) { a.mu.Lock() defer a.mu.Unlock() a.decAddressRefLocked(addrState) } // decAddressRefLocked is like decAddressRef but with locking requirements. // // +checklocks:a.mu func (a *AddressableEndpointState) decAddressRefLocked(addrState *addressState) { destroy := false addrState.refs.DecRef(func() { destroy = true }) if !destroy { return } addrState.mu.Lock() defer addrState.mu.Unlock() // A non-expired permanent address must not have its reference count dropped // to 0. if addrState.kind.IsPermanent() { panic(fmt.Sprintf("permanent addresses should be removed through the AddressableEndpoint: addr = %s, kind = %d", addrState.addr, addrState.kind)) } a.releaseAddressStateLocked(addrState) } // SetDeprecated implements stack.AddressableEndpoint. func (a *AddressableEndpointState) SetDeprecated(addr tcpip.Address, deprecated bool) tcpip.Error { a.mu.RLock() defer a.mu.RUnlock() addrState, ok := a.endpoints[addr] if !ok { return &tcpip.ErrBadLocalAddress{} } addrState.SetDeprecated(deprecated) return nil } // SetLifetimes implements stack.AddressableEndpoint. func (a *AddressableEndpointState) SetLifetimes(addr tcpip.Address, lifetimes AddressLifetimes) tcpip.Error { a.mu.RLock() defer a.mu.RUnlock() addrState, ok := a.endpoints[addr] if !ok { return &tcpip.ErrBadLocalAddress{} } addrState.SetLifetimes(lifetimes) return nil } // MainAddress implements AddressableEndpoint. 
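// A brief sketch of the lifetime plumbing above (markDeprecated is an
// assumption for illustration, not part of this file): SetLifetimes
// sanitizes the passed lifetimes, so marking an address deprecated also
// clears its PreferredUntil, and *tcpip.ErrBadLocalAddress is returned when
// the address is not assigned to this endpoint.
func markDeprecated(a *AddressableEndpointState, addr tcpip.Address) tcpip.Error {
	return a.SetLifetimes(addr, AddressLifetimes{Deprecated: true})
}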
func (a *AddressableEndpointState) MainAddress() tcpip.AddressWithPrefix { a.mu.RLock() defer a.mu.RUnlock() ep := a.acquirePrimaryAddressRLocked(tcpip.Address{}, tcpip.Address{} /* srcHint */, func(ep *addressState) bool { switch kind := ep.GetKind(); kind { case Permanent: return a.networkEndpoint.Enabled() || !a.options.HiddenWhileDisabled case PermanentTentative, PermanentExpired, Temporary: return false default: panic(fmt.Sprintf("unknown address kind: %d", kind)) } }) if ep == nil { return tcpip.AddressWithPrefix{} } addr := ep.AddressWithPrefix() // Note that when ep must have a ref count >=2, because its ref count // must be >=1 in order to be found and the ref count was incremented // when a reference was acquired. The only way for the ref count to // drop below 2 is for the endpoint to be removed, which requires a // write lock; so we're guaranteed to be able to decrement the ref // count and not need to remove the endpoint from a.primary. ep.decRefMustNotFree() return addr } // acquirePrimaryAddressRLocked returns an acquired primary address that is // valid according to isValid. // // +checklocksread:a.mu func (a *AddressableEndpointState) acquirePrimaryAddressRLocked(remoteAddr, srcHint tcpip.Address, isValid func(*addressState) bool) *addressState { // TODO: Move this out into IPv4-specific code. // IPv6 handles source IP selection elsewhere. We have to do source // selection only for IPv4, in which case ep is never deprecated. Thus // we don't have to worry about refcounts. if remoteAddr.Len() == header.IPv4AddressSize && remoteAddr != (tcpip.Address{}) { var best *addressState var bestLen uint8 for _, state := range a.primary { if !isValid(state) { continue } // Source hint takes precedent over prefix matching. if state.addr.Address == srcHint && srcHint != (tcpip.Address{}) { best = state break } stateLen := state.addr.Address.MatchingPrefix(remoteAddr) if best == nil || bestLen < stateLen { best = state bestLen = stateLen } } if best != nil && best.TryIncRef() { return best } } var deprecatedEndpoint *addressState for _, ep := range a.primary { if !isValid(ep) { continue } if !ep.Deprecated() { if ep.TryIncRef() { // ep is not deprecated, so return it immediately. // // If we kept track of a deprecated endpoint, decrement its reference // count since it was incremented when we decided to keep track of it. if deprecatedEndpoint != nil { // Note that when deprecatedEndpoint was found, its ref count // must have necessarily been >=1, and after incrementing it // must be >=2. The only way for the ref count to drop below 2 is // for the endpoint to be removed, which requires a write lock; // so we're guaranteed to be able to decrement the ref count // and not need to remove the endpoint from a.primary. deprecatedEndpoint.decRefMustNotFree() } return ep } } else if deprecatedEndpoint == nil && ep.TryIncRef() { // We prefer an endpoint that is not deprecated, but we keep track of // ep in case a doesn't have any non-deprecated endpoints. // // If we end up finding a more preferred endpoint, ep's reference count // will be decremented. deprecatedEndpoint = ep } } return deprecatedEndpoint } // AcquireAssignedAddressOrMatching returns an address endpoint that is // considered assigned to the addressable endpoint. // // If the address is an exact match with an existing address, that address is // returned. Otherwise, if f is provided, f is called with each address and // the address that f returns true for is returned. 
// // If there is no matching address, a temporary address will be returned if // allowTemp is true. // // If readOnly is true, the address will be returned without an extra reference. // In this case it is not safe to modify the endpoint, only read attributes like // subnet. // // Regardless how the address was obtained, it will be acquired before it is // returned. func (a *AddressableEndpointState) AcquireAssignedAddressOrMatching(localAddr tcpip.Address, f func(AddressEndpoint) bool, allowTemp bool, tempPEB PrimaryEndpointBehavior, readOnly bool) AddressEndpoint { lookup := func() *addressState { if addrState, ok := a.endpoints[localAddr]; ok { if !addrState.IsAssigned(allowTemp) { return nil } if !readOnly && !addrState.TryIncRef() { panic(fmt.Sprintf("failed to increase the reference count for address = %s", addrState.addr)) } return addrState } if f != nil { for _, addrState := range a.endpoints { if addrState.IsAssigned(allowTemp) && f(addrState) { if !readOnly && !addrState.TryIncRef() { continue } return addrState } } } return nil } // Avoid exclusive lock on mu unless we need to add a new address. a.mu.RLock() ep := lookup() a.mu.RUnlock() if ep != nil { return ep } if !allowTemp { return nil } // Acquire state lock in exclusive mode as we need to add a new temporary // endpoint. a.mu.Lock() defer a.mu.Unlock() // Do the lookup again in case another goroutine added the address in the time // we released and acquired the lock. ep = lookup() if ep != nil { return ep } // Proceed to add a new temporary endpoint. addr := localAddr.WithPrefix() ep, err := a.addAndAcquireAddressLocked(addr, AddressProperties{PEB: tempPEB}, Temporary) if err != nil { // addAndAcquireAddressLocked only returns an error if the address is // already assigned but we just checked above if the address exists so we // expect no error. panic(fmt.Sprintf("a.addAndAcquireAddressLocked(%s, AddressProperties{PEB: %s}, false): %s", addr, tempPEB, err)) } // From https://golang.org/doc/faq#nil_error: // // Under the covers, interfaces are implemented as two elements, a type T and // a value V. // // An interface value is nil only if the V and T are both unset, (T=nil, V is // not set), In particular, a nil interface will always hold a nil type. If we // store a nil pointer of type *int inside an interface value, the inner type // will be *int regardless of the value of the pointer: (T=*int, V=nil). Such // an interface value will therefore be non-nil even when the pointer value V // inside is nil. // // Since addAndAcquireAddressLocked returns a nil value with a non-nil type, // we need to explicitly return nil below if ep is (a typed) nil. if ep == nil { return nil } if readOnly { if ep.addressableEndpointState == a { // Checklocks doesn't understand that we are logically guaranteed to have // ep.mu locked already. We need to use checklocksignore to appease the // analyzer. ep.addressableEndpointState.decAddressRefLocked(ep) // +checklocksignore } else { ep.DecRef() } } return ep } // AcquireAssignedAddress implements AddressableEndpoint. func (a *AddressableEndpointState) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB PrimaryEndpointBehavior, readOnly bool) AddressEndpoint { return a.AcquireAssignedAddressOrMatching(localAddr, nil, allowTemp, tempPEB, readOnly) } // AcquireOutgoingPrimaryAddress implements AddressableEndpoint. 
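// A minimal sketch of read-only address lookup (hasAssignedAddress is an
// assumption for illustration, not part of this file): with readOnly set,
// the returned endpoint carries no extra reference, so nothing needs to be
// released; with allowTemp false, no temporary address is created on a miss.
func hasAssignedAddress(a *AddressableEndpointState, localAddr tcpip.Address) bool {
	return a.AcquireAssignedAddress(localAddr, false /* allowTemp */, NeverPrimaryEndpoint, true /* readOnly */) != nil
}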
func (a *AddressableEndpointState) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, srcHint tcpip.Address, allowExpired bool) AddressEndpoint { a.mu.Lock() defer a.mu.Unlock() ep := a.acquirePrimaryAddressRLocked(remoteAddr, srcHint, func(ep *addressState) bool { return ep.IsAssigned(allowExpired) }) // From https://golang.org/doc/faq#nil_error: // // Under the covers, interfaces are implemented as two elements, a type T and // a value V. // // An interface value is nil only if the V and T are both unset, (T=nil, V is // not set), In particular, a nil interface will always hold a nil type. If we // store a nil pointer of type *int inside an interface value, the inner type // will be *int regardless of the value of the pointer: (T=*int, V=nil). Such // an interface value will therefore be non-nil even when the pointer value V // inside is nil. // // Since acquirePrimaryAddressLocked returns a nil value with a non-nil type, // we need to explicitly return nil below if ep is (a typed) nil. if ep == nil { return nil } return ep } // PrimaryAddresses implements AddressableEndpoint. func (a *AddressableEndpointState) PrimaryAddresses() []tcpip.AddressWithPrefix { a.mu.RLock() defer a.mu.RUnlock() var addrs []tcpip.AddressWithPrefix if a.options.HiddenWhileDisabled && !a.networkEndpoint.Enabled() { return addrs } for _, ep := range a.primary { switch kind := ep.GetKind(); kind { // Don't include tentative, expired or temporary endpoints // to avoid confusion and prevent the caller from using // those. case PermanentTentative, PermanentExpired, Temporary: continue case Permanent: default: panic(fmt.Sprintf("address %s has unknown kind %d", ep.AddressWithPrefix(), kind)) } addrs = append(addrs, ep.AddressWithPrefix()) } return addrs } // PermanentAddresses implements AddressableEndpoint. func (a *AddressableEndpointState) PermanentAddresses() []tcpip.AddressWithPrefix { a.mu.RLock() defer a.mu.RUnlock() var addrs []tcpip.AddressWithPrefix for _, ep := range a.endpoints { if !ep.GetKind().IsPermanent() { continue } addrs = append(addrs, ep.AddressWithPrefix()) } return addrs } // Cleanup forcefully leaves all groups and removes all permanent addresses. func (a *AddressableEndpointState) Cleanup() { a.mu.Lock() defer a.mu.Unlock() for _, ep := range a.endpoints { // removePermanentEndpointLocked returns *tcpip.ErrBadLocalAddress if ep is // not a permanent address. switch err := a.removePermanentEndpointLocked(ep, AddressRemovalInterfaceRemoved); err.(type) { case nil, *tcpip.ErrBadLocalAddress: default: panic(fmt.Sprintf("unexpected error from removePermanentEndpointLocked(%s): %s", ep.addr, err)) } } } var _ AddressEndpoint = (*addressState)(nil) // addressState holds state for an address. // // +stateify savable type addressState struct { addressableEndpointState *AddressableEndpointState addr tcpip.AddressWithPrefix subnet tcpip.Subnet temporary bool // Lock ordering (from outer to inner lock ordering): // // AddressableEndpointState.mu // addressState.mu mu addressStateRWMutex `state:"nosave"` refs addressStateRefs // checklocks:mu kind AddressKind // checklocks:mu configType AddressConfigType // lifetimes holds this address' lifetimes. // // Invariant: if lifetimes.deprecated is true, then lifetimes.PreferredUntil // must be the zero value. Note that the converse does not need to be // upheld! // // checklocks:mu lifetimes AddressLifetimes // The enclosing mutex must be write-locked before calling methods on the // dispatcher. 
// // checklocks:mu disp AddressDispatcher } // AddressWithPrefix implements AddressEndpoint. func (a *addressState) AddressWithPrefix() tcpip.AddressWithPrefix { return a.addr } // Subnet implements AddressEndpoint. func (a *addressState) Subnet() tcpip.Subnet { return a.subnet } // GetKind implements AddressEndpoint. func (a *addressState) GetKind() AddressKind { a.mu.RLock() defer a.mu.RUnlock() return a.kind } // SetKind implements AddressEndpoint. func (a *addressState) SetKind(kind AddressKind) { a.mu.Lock() defer a.mu.Unlock() prevKind := a.kind a.kind = kind if kind == PermanentExpired { a.notifyRemovedLocked(AddressRemovalManualAction) } else if prevKind != kind && a.addressableEndpointState.networkEndpoint.Enabled() { a.notifyChangedLocked() } } // notifyRemovedLocked notifies integrators of address removal. // // +checklocks:a.mu func (a *addressState) notifyRemovedLocked(reason AddressRemovalReason) { if disp := a.disp; disp != nil { a.disp.OnRemoved(reason) a.disp = nil } } func (a *addressState) remove(reason AddressRemovalReason) { a.mu.Lock() defer a.mu.Unlock() a.kind = PermanentExpired a.notifyRemovedLocked(reason) } // IsAssigned implements AddressEndpoint. func (a *addressState) IsAssigned(allowExpired bool) bool { switch kind := a.GetKind(); kind { case PermanentTentative: return false case PermanentExpired: return allowExpired case Permanent, Temporary: return true default: panic(fmt.Sprintf("address %s has unknown kind %d", a.AddressWithPrefix(), kind)) } } // IncRef implements AddressEndpoint. func (a *addressState) TryIncRef() bool { return a.refs.TryIncRef() } // DecRef implements AddressEndpoint. func (a *addressState) DecRef() { a.addressableEndpointState.decAddressRef(a) } // decRefMustNotFree decreases the reference count with the guarantee that the // reference count will be greater than 0 after the decrement. // // Panics if the ref count is less than 2 after acquiring the lock in this // function. func (a *addressState) decRefMustNotFree() { a.refs.DecRef(func() { panic(fmt.Sprintf("cannot decrease addressState %s without freeing the endpoint", a.addr)) }) } // ConfigType implements AddressEndpoint. func (a *addressState) ConfigType() AddressConfigType { a.mu.RLock() defer a.mu.RUnlock() return a.configType } // notifyChangedLocked notifies integrators of address property changes. // // +checklocks:a.mu func (a *addressState) notifyChangedLocked() { if a.disp == nil { return } state := AddressDisabled if a.addressableEndpointState.networkEndpoint.Enabled() { switch a.kind { case Permanent: state = AddressAssigned case PermanentTentative: state = AddressTentative case Temporary, PermanentExpired: return default: panic(fmt.Sprintf("unrecognized address kind = %d", a.kind)) } } a.disp.OnChanged(a.lifetimes, state) } // SetDeprecated implements AddressEndpoint. func (a *addressState) SetDeprecated(d bool) { a.mu.Lock() defer a.mu.Unlock() var changed bool if a.lifetimes.Deprecated != d { a.lifetimes.Deprecated = d changed = true } if d { a.lifetimes.PreferredUntil = tcpip.MonotonicTime{} } if changed { a.notifyChangedLocked() } } // Deprecated implements AddressEndpoint. func (a *addressState) Deprecated() bool { a.mu.RLock() defer a.mu.RUnlock() return a.lifetimes.Deprecated } // SetLifetimes implements AddressEndpoint. 
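// A short sketch of the reference-counting contract on AddressEndpoint
// (withAddressEndpoint is an assumption for illustration, not part of this
// file): a successful TryIncRef must be balanced by exactly one DecRef once
// the caller is finished with the endpoint.
func withAddressEndpoint(ep AddressEndpoint, f func(AddressEndpoint)) bool {
	if !ep.TryIncRef() {
		// The address is being removed; it must not be used.
		return false
	}
	defer ep.DecRef()
	f(ep)
	return true
}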
func (a *addressState) SetLifetimes(lifetimes AddressLifetimes) { a.mu.Lock() defer a.mu.Unlock() lifetimes.sanitize() var changed bool if a.lifetimes != lifetimes { changed = true } a.lifetimes = lifetimes if changed { a.notifyChangedLocked() } } // Lifetimes implements AddressEndpoint. func (a *addressState) Lifetimes() AddressLifetimes { a.mu.RLock() defer a.mu.RUnlock() return a.lifetimes } // Temporary implements AddressEndpoint. func (a *addressState) Temporary() bool { return a.temporary } // RegisterDispatcher implements AddressEndpoint. func (a *addressState) RegisterDispatcher(disp AddressDispatcher) { a.mu.Lock() defer a.mu.Unlock() if disp != nil { a.disp = disp a.notifyChangedLocked() } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/addressable_endpoint_state_mutex.go000066400000000000000000000054541465435605700310020ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type addressableEndpointStateRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var addressableEndpointStatelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type addressableEndpointStatelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *addressableEndpointStateRWMutex) Lock() { locking.AddGLock(addressableEndpointStateprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *addressableEndpointStateRWMutex) NestedLock(i addressableEndpointStatelockNameIndex) { locking.AddGLock(addressableEndpointStateprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *addressableEndpointStateRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(addressableEndpointStateprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *addressableEndpointStateRWMutex) NestedUnlock(i addressableEndpointStatelockNameIndex) { m.mu.Unlock() locking.DelGLock(addressableEndpointStateprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *addressableEndpointStateRWMutex) RLock() { locking.AddGLock(addressableEndpointStateprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *addressableEndpointStateRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(addressableEndpointStateprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *addressableEndpointStateRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *addressableEndpointStateRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *addressableEndpointStateRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var addressableEndpointStateprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. 
func addressableEndpointStateinitLockNames() {} func init() { addressableEndpointStateinitLockNames() addressableEndpointStateprefixIndex = locking.NewMutexClass(reflect.TypeOf(addressableEndpointStateRWMutex{}), addressableEndpointStatelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/bridge.go000066400000000000000000000137771465435605700234520ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) var _ NetworkLinkEndpoint = (*BridgeEndpoint)(nil) type bridgePort struct { bridge *BridgeEndpoint nic *nic } // ParseHeader implements stack.LinkEndpoint. func (p *bridgePort) ParseHeader(pkt *PacketBuffer) bool { _, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) return ok } // DeliverNetworkPacket implements stack.NetworkDispatcher. func (p *bridgePort) DeliverNetworkPacket(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { bridge := p.bridge bridge.mu.RLock() // Send the packet to all other ports. for _, port := range bridge.ports { if p == port { continue } newPkt := NewPacketBuffer(PacketBufferOptions{ ReserveHeaderBytes: int(port.nic.MaxHeaderLength()), Payload: pkt.ToBuffer(), }) port.nic.writeRawPacket(newPkt) newPkt.DecRef() } d := bridge.dispatcher bridge.mu.RUnlock() if d != nil { // The dispatcher may acquire Stack.mu in DeliverNetworkPacket(), which is // ordered above bridge.mu. So call DeliverNetworkPacket() without holding // bridge.mu to avoid circular locking. d.DeliverNetworkPacket(protocol, pkt) } } func (p *bridgePort) DeliverLinkPacket(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { } // NewBridgeEndpoint creates a new bridge endpoint. func NewBridgeEndpoint(mtu uint32) *BridgeEndpoint { b := &BridgeEndpoint{ mtu: mtu, addr: tcpip.GetRandMacAddr(), } b.ports = make(map[tcpip.NICID]*bridgePort) return b } // BridgeEndpoint is a bridge endpoint. type BridgeEndpoint struct { mu bridgeRWMutex // +checklocks:mu ports map[tcpip.NICID]*bridgePort // +checklocks:mu dispatcher NetworkDispatcher // +checklocks:mu addr tcpip.LinkAddress // +checklocks:mu attached bool // +checklocks:mu mtu uint32 maxHeaderLength atomicbitops.Uint32 } // WritePackets implements stack.LinkEndpoint.WritePackets. func (b *BridgeEndpoint) WritePackets(pkts PacketBufferList) (int, tcpip.Error) { b.mu.RLock() defer b.mu.RUnlock() pktsSlice := pkts.AsSlice() n := len(pktsSlice) for _, p := range b.ports { for _, pkt := range pktsSlice { // In order to properly loop back to the inbound side we must create a // fresh packet that only contains the underlying payload with no headers // or struct fields set. 
newPkt := NewPacketBuffer(PacketBufferOptions{ Payload: pkt.ToBuffer(), ReserveHeaderBytes: int(p.nic.MaxHeaderLength()), }) newPkt.EgressRoute = pkt.EgressRoute newPkt.NetworkProtocolNumber = pkt.NetworkProtocolNumber p.nic.writePacket(newPkt) newPkt.DecRef() } } return n, nil } // AddNIC adds the specified NIC to the bridge. func (b *BridgeEndpoint) AddNIC(n *nic) tcpip.Error { b.mu.Lock() defer b.mu.Unlock() port := &bridgePort{ nic: n, bridge: b, } n.NetworkLinkEndpoint.Attach(port) b.ports[n.id] = port if b.maxHeaderLength.Load() < uint32(n.MaxHeaderLength()) { b.maxHeaderLength.Store(uint32(n.MaxHeaderLength())) } return nil } // DelNIC remove the specified NIC from the bridge. func (b *BridgeEndpoint) DelNIC(nic *nic) tcpip.Error { b.mu.Lock() defer b.mu.Unlock() delete(b.ports, nic.id) nic.NetworkLinkEndpoint.Attach(nic) return nil } // MTU implements stack.LinkEndpoint.MTU. func (b *BridgeEndpoint) MTU() uint32 { b.mu.RLock() defer b.mu.RUnlock() if b.mtu > header.EthernetMinimumSize { return b.mtu - header.EthernetMinimumSize } return 0 } // SetMTU implements stack.LinkEndpoint.SetMTU. func (b *BridgeEndpoint) SetMTU(mtu uint32) { b.mu.Lock() defer b.mu.Unlock() b.mtu = mtu } // MaxHeaderLength implements stack.LinkEndpoint. func (b *BridgeEndpoint) MaxHeaderLength() uint16 { return uint16(b.maxHeaderLength.Load()) } // LinkAddress implements stack.LinkEndpoint.LinkAddress. func (b *BridgeEndpoint) LinkAddress() tcpip.LinkAddress { b.mu.Lock() defer b.mu.Unlock() return b.addr } // SetLinkAddress implements stack.LinkEndpoint.SetLinkAddress. func (b *BridgeEndpoint) SetLinkAddress(addr tcpip.LinkAddress) { b.mu.Lock() defer b.mu.Unlock() b.addr = addr } // Capabilities implements stack.LinkEndpoint.Capabilities. func (b *BridgeEndpoint) Capabilities() LinkEndpointCapabilities { return CapabilityRXChecksumOffload | CapabilitySaveRestore | CapabilityResolutionRequired } // Attach implements stack.LinkEndpoint.Attach. func (b *BridgeEndpoint) Attach(dispatcher NetworkDispatcher) { b.mu.Lock() defer b.mu.Unlock() for _, p := range b.ports { p.nic.Primary = nil } b.dispatcher = dispatcher b.ports = make(map[tcpip.NICID]*bridgePort) } // IsAttached implements stack.LinkEndpoint.IsAttached. func (b *BridgeEndpoint) IsAttached() bool { b.mu.RLock() defer b.mu.RUnlock() return b.dispatcher != nil } // Wait implements stack.LinkEndpoint.Wait. func (b *BridgeEndpoint) Wait() { } // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (b *BridgeEndpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareEther } // AddHeader implements stack.LinkEndpoint.AddHeader. func (b *BridgeEndpoint) AddHeader(pkt *PacketBuffer) { } // ParseHeader implements stack.LinkEndpoint.ParseHeader. func (b *BridgeEndpoint) ParseHeader(*PacketBuffer) bool { return true } // Close implements stack.LinkEndpoint.Close. func (b *BridgeEndpoint) Close() {} // SetOnCloseAction implements stack.LinkEndpoint.Close. func (b *BridgeEndpoint) SetOnCloseAction(func()) {} golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/bridge_mutex.go000066400000000000000000000045301465435605700246570ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type bridgeRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. 
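//
// These generated wrappers are drop-in replacements for sync.RWMutex whose
// Lock/Unlock and RLock/RUnlock calls also register the acquisition with the
// lock-order validator in pkg/sync/locking. A minimal usage sketch (the
// names below are assumptions for illustration, not part of this file):
//
//	type guardedCounter struct {
//		mu bridgeRWMutex
//		// +checklocks:mu
//		n int
//	}
//
//	func (c *guardedCounter) bump() {
//		c.mu.Lock()
//		defer c.mu.Unlock()
//		c.n++
//	}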
var bridgelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type bridgelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *bridgeRWMutex) Lock() { locking.AddGLock(bridgeprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *bridgeRWMutex) NestedLock(i bridgelockNameIndex) { locking.AddGLock(bridgeprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *bridgeRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(bridgeprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *bridgeRWMutex) NestedUnlock(i bridgelockNameIndex) { m.mu.Unlock() locking.DelGLock(bridgeprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *bridgeRWMutex) RLock() { locking.AddGLock(bridgeprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *bridgeRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(bridgeprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *bridgeRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *bridgeRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *bridgeRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var bridgeprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func bridgeinitLockNames() {} func init() { bridgeinitLockNames() bridgeprefixIndex = locking.NewMutexClass(reflect.TypeOf(bridgeRWMutex{}), bridgelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/bucket_mutex.go000066400000000000000000000046361465435605700247070ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type bucketRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var bucketlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type bucketlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. const ( bucketLockOthertuple = bucketlockNameIndex(0) ) const () // Lock locks m. // +checklocksignore func (m *bucketRWMutex) Lock() { locking.AddGLock(bucketprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *bucketRWMutex) NestedLock(i bucketlockNameIndex) { locking.AddGLock(bucketprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *bucketRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(bucketprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. 
// +checklocksignore func (m *bucketRWMutex) NestedUnlock(i bucketlockNameIndex) { m.mu.Unlock() locking.DelGLock(bucketprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *bucketRWMutex) RLock() { locking.AddGLock(bucketprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *bucketRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(bucketprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *bucketRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *bucketRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *bucketRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var bucketprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func bucketinitLockNames() { bucketlockNames = []string{"otherTuple"} } func init() { bucketinitLockNames() bucketprefixIndex = locking.NewMutexClass(reflect.TypeOf(bucketRWMutex{}), bucketlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/cleanup_endpoints_mutex.go000066400000000000000000000034351465435605700271400ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type cleanupEndpointsMutex struct { mu sync.Mutex } var cleanupEndpointsprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var cleanupEndpointslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type cleanupEndpointslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *cleanupEndpointsMutex) Lock() { locking.AddGLock(cleanupEndpointsprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *cleanupEndpointsMutex) NestedLock(i cleanupEndpointslockNameIndex) { locking.AddGLock(cleanupEndpointsprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *cleanupEndpointsMutex) Unlock() { locking.DelGLock(cleanupEndpointsprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *cleanupEndpointsMutex) NestedUnlock(i cleanupEndpointslockNameIndex) { locking.DelGLock(cleanupEndpointsprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func cleanupEndpointsinitLockNames() {} func init() { cleanupEndpointsinitLockNames() cleanupEndpointsprefixIndex = locking.NewMutexClass(reflect.TypeOf(cleanupEndpointsMutex{}), cleanupEndpointslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/conn_mutex.go000066400000000000000000000044441465435605700243640ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type connRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. 
// Populated in init. var connlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type connlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *connRWMutex) Lock() { locking.AddGLock(connprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *connRWMutex) NestedLock(i connlockNameIndex) { locking.AddGLock(connprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *connRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(connprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *connRWMutex) NestedUnlock(i connlockNameIndex) { m.mu.Unlock() locking.DelGLock(connprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *connRWMutex) RLock() { locking.AddGLock(connprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *connRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(connprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *connRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *connRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *connRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var connprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func conninitLockNames() {} func init() { conninitLockNames() connprefixIndex = locking.NewMutexClass(reflect.TypeOf(connRWMutex{}), connlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/conn_track_mutex.go000066400000000000000000000046461465435605700255540ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type connTrackRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var connTracklockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type connTracklockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *connTrackRWMutex) Lock() { locking.AddGLock(connTrackprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *connTrackRWMutex) NestedLock(i connTracklockNameIndex) { locking.AddGLock(connTrackprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *connTrackRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(connTrackprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. 
// +checklocksignore func (m *connTrackRWMutex) NestedUnlock(i connTracklockNameIndex) { m.mu.Unlock() locking.DelGLock(connTrackprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *connTrackRWMutex) RLock() { locking.AddGLock(connTrackprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *connTrackRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(connTrackprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *connTrackRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *connTrackRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *connTrackRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var connTrackprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func connTrackinitLockNames() {} func init() { connTrackinitLockNames() connTrackprefixIndex = locking.NewMutexClass(reflect.TypeOf(connTrackRWMutex{}), connTracklockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/conntrack.go000066400000000000000000001124771465435605700241750ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "encoding/binary" "fmt" "math" "math/rand" "sync" "time" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack" ) // Connection tracking is used to track and manipulate packets for NAT rules. // The connection is created for a packet if it does not exist. Every // connection contains two tuples (original and reply). The tuples are // manipulated if there is a matching NAT rule. The packet is modified by // looking at the tuples in each hook. // // Currently, only TCP tracking is supported. // Our hash table has 16K buckets. const numBuckets = 1 << 14 const ( establishedTimeout time.Duration = 5 * 24 * time.Hour unestablishedTimeout time.Duration = 120 * time.Second ) // tuple holds a connection's identifying and manipulating data in one // direction. It is immutable. // // +stateify savable type tuple struct { // tupleEntry is used to build an intrusive list of tuples. tupleEntry // conn is the connection tracking entry this tuple belongs to. conn *conn // reply is true iff the tuple's direction is opposite that of the first // packet seen on the connection. reply bool // tupleID is set at initialization and is immutable. tupleID tupleID } // tupleID uniquely identifies a trackable connection in one direction. // // +stateify savable type tupleID struct { srcAddr tcpip.Address // The source port of a packet in the original direction is overloaded with // the ident of an Echo Request packet. 
// // This also matches the behaviour of sending packets on Linux where the // socket's source port value is used for the source port of outgoing packets // for TCP/UDP and the ident field for outgoing Echo Requests on Ping sockets: // // IPv4: https://github.com/torvalds/linux/blob/c5c17547b778975b3d83a73c8d84e8fb5ecf3ba5/net/ipv4/ping.c#L810 // IPv6: https://github.com/torvalds/linux/blob/c5c17547b778975b3d83a73c8d84e8fb5ecf3ba5/net/ipv6/ping.c#L133 srcPortOrEchoRequestIdent uint16 dstAddr tcpip.Address // The opposite of srcPortOrEchoRequestIdent; the destination port of a packet // in the reply direction is overloaded with the ident of an Echo Reply. dstPortOrEchoReplyIdent uint16 transProto tcpip.TransportProtocolNumber netProto tcpip.NetworkProtocolNumber } // reply creates the reply tupleID. func (ti tupleID) reply() tupleID { return tupleID{ srcAddr: ti.dstAddr, srcPortOrEchoRequestIdent: ti.dstPortOrEchoReplyIdent, dstAddr: ti.srcAddr, dstPortOrEchoReplyIdent: ti.srcPortOrEchoRequestIdent, transProto: ti.transProto, netProto: ti.netProto, } } type manipType int const ( // manipNotPerformed indicates that NAT has not been performed. manipNotPerformed manipType = iota // manipPerformed indicates that NAT was performed. manipPerformed // manipPerformedNoop indicates that NAT was performed but it was a no-op. manipPerformedNoop ) type finalizeResult uint32 const ( // A finalizeResult must be explicitly set so we don't make use of the zero // value. _ finalizeResult = iota finalizeResultSuccess finalizeResultConflict ) // conn is a tracked connection. // // +stateify savable type conn struct { ct *ConnTrack // original is the tuple in original direction. It is immutable. original tuple // reply is the tuple in reply direction. reply tuple // TODO(b/341946753): Restore when netstack is savable. finalizeOnce sync.Once `state:"nosave"` // Holds a finalizeResult. finalizeResult atomicbitops.Uint32 mu connRWMutex `state:"nosave"` // sourceManip indicates the source manipulation type. // // +checklocks:mu sourceManip manipType // destinationManip indicates the destination's manipulation type. // // +checklocks:mu destinationManip manipType stateMu stateConnRWMutex `state:"nosave"` // tcb is TCB control block. It is used to keep track of states // of tcp connection. // // +checklocks:stateMu tcb tcpconntrack.TCB // lastUsed is the last time the connection saw a relevant packet, and // is updated by each packet on the connection. // // +checklocks:stateMu lastUsed tcpip.MonotonicTime } // timedOut returns whether the connection timed out based on its state. func (cn *conn) timedOut(now tcpip.MonotonicTime) bool { cn.stateMu.RLock() defer cn.stateMu.RUnlock() if cn.tcb.State() == tcpconntrack.ResultAlive { // Use the same default as Linux, which doesn't delete // established connections for 5(!) days. return now.Sub(cn.lastUsed) > establishedTimeout } // Use the same default as Linux, which lets connections in most states // other than established remain for <= 120 seconds. return now.Sub(cn.lastUsed) > unestablishedTimeout } // update the connection tracking state. func (cn *conn) update(pkt *PacketBuffer, reply bool) { cn.stateMu.Lock() defer cn.stateMu.Unlock() // Mark the connection as having been used recently so it isn't reaped. cn.lastUsed = cn.ct.clock.NowMonotonic() if pkt.TransportProtocolNumber != header.TCPProtocolNumber { return } tcpHeader := header.TCP(pkt.TransportHeader().Slice()) // Update the state of tcb. tcb assumes it's always initialized on the // client. 
However, we only need to know whether the connection is // established or not, so the client/server distinction isn't important. if cn.tcb.IsEmpty() { cn.tcb.Init(tcpHeader, pkt.Data().Size()) return } if reply { cn.tcb.UpdateStateReply(tcpHeader, pkt.Data().Size()) } else { cn.tcb.UpdateStateOriginal(tcpHeader, pkt.Data().Size()) } } // ConnTrack tracks all connections created for NAT rules. Most users are // expected to only call handlePacket, insertRedirectConn, and maybeInsertNoop. // // ConnTrack keeps all connections in a slice of buckets, each of which holds a // linked list of tuples. This gives us some desirable properties: // - Each bucket has its own lock, lessening lock contention. // - The slice is large enough that lists stay short (<10 elements on average). // Thus traversal is fast. // - During linked list traversal we reap expired connections. This amortizes // the cost of reaping them and makes reapUnused faster. // // Locks are ordered by their location in the buckets slice. That is, a // goroutine that locks buckets[i] can only lock buckets[j] s.t. i < j. // // +stateify savable type ConnTrack struct { // seed is a one-time random value initialized at stack startup // and is used in the calculation of hash keys for the list of buckets. // It is immutable. seed uint32 // clock provides timing used to determine conntrack reapings. clock tcpip.Clock // TODO(b/341946753): Restore when netstack is savable. rand *rand.Rand `state:"nosave"` mu connTrackRWMutex `state:"nosave"` // mu protects the buckets slice, but not buckets' contents. Only take // the write lock if you are modifying the slice or saving for S/R. // // +checklocks:mu buckets []bucket } // +stateify savable type bucket struct { mu bucketRWMutex `state:"nosave"` // +checklocks:mu tuples tupleList } // A netAndTransHeadersFunc returns the network and transport headers found // in an ICMP payload. The transport layer's payload will not be returned. // // May panic if the packet does not hold the transport header. type netAndTransHeadersFunc func(icmpPayload []byte, minTransHdrLen int) (netHdr header.Network, transHdrBytes []byte) func v4NetAndTransHdr(icmpPayload []byte, minTransHdrLen int) (header.Network, []byte) { netHdr := header.IPv4(icmpPayload) // Do not use netHdr.Payload() as we might not hold the full packet // in the ICMP error; Payload() panics if the buffer is smaller than // the total length specified in the IPv4 header. transHdr := icmpPayload[netHdr.HeaderLength():] return netHdr, transHdr[:minTransHdrLen] } func v6NetAndTransHdr(icmpPayload []byte, minTransHdrLen int) (header.Network, []byte) { netHdr := header.IPv6(icmpPayload) // Do not use netHdr.Payload() as we might not hold the full packet // in the ICMP error; Payload() panics if the IP payload is smaller than // the payload length specified in the IPv6 header. 
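// Illustrative values (assumed): for a TCP segment quoted in an ICMPv6
// error, getEmbeddedNetAndTransHeaders passes
// minTransHdrLen = header.TCPMinimumSize, so the slice below only exposes
// the fixed 20-byte TCP header and never reaches into the quoted TCP
// payload.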
transHdr := icmpPayload[header.IPv6MinimumSize:] return netHdr, transHdr[:minTransHdrLen] } func getEmbeddedNetAndTransHeaders(pkt *PacketBuffer, netHdrLength int, getNetAndTransHdr netAndTransHeadersFunc, transProto tcpip.TransportProtocolNumber) (header.Network, header.ChecksummableTransport, bool) { switch transProto { case header.TCPProtocolNumber: if netAndTransHeader, ok := pkt.Data().PullUp(netHdrLength + header.TCPMinimumSize); ok { netHeader, transHeaderBytes := getNetAndTransHdr(netAndTransHeader, header.TCPMinimumSize) return netHeader, header.TCP(transHeaderBytes), true } case header.UDPProtocolNumber: if netAndTransHeader, ok := pkt.Data().PullUp(netHdrLength + header.UDPMinimumSize); ok { netHeader, transHeaderBytes := getNetAndTransHdr(netAndTransHeader, header.UDPMinimumSize) return netHeader, header.UDP(transHeaderBytes), true } } return nil, nil, false } func getHeaders(pkt *PacketBuffer) (netHdr header.Network, transHdr header.Transport, isICMPError bool, ok bool) { switch pkt.TransportProtocolNumber { case header.TCPProtocolNumber: if tcpHeader := header.TCP(pkt.TransportHeader().Slice()); len(tcpHeader) >= header.TCPMinimumSize { return pkt.Network(), tcpHeader, false, true } return nil, nil, false, false case header.UDPProtocolNumber: if udpHeader := header.UDP(pkt.TransportHeader().Slice()); len(udpHeader) >= header.UDPMinimumSize { return pkt.Network(), udpHeader, false, true } return nil, nil, false, false case header.ICMPv4ProtocolNumber: icmpHeader := header.ICMPv4(pkt.TransportHeader().Slice()) if len(icmpHeader) < header.ICMPv4MinimumSize { return nil, nil, false, false } switch icmpType := icmpHeader.Type(); icmpType { case header.ICMPv4Echo, header.ICMPv4EchoReply: return pkt.Network(), icmpHeader, false, true case header.ICMPv4DstUnreachable, header.ICMPv4TimeExceeded, header.ICMPv4ParamProblem: default: panic(fmt.Sprintf("unexpected ICMPv4 type = %d", icmpType)) } h, ok := pkt.Data().PullUp(header.IPv4MinimumSize) if !ok { panic(fmt.Sprintf("should have a valid IPv4 packet; only have %d bytes, want at least %d bytes", pkt.Data().Size(), header.IPv4MinimumSize)) } if header.IPv4(h).HeaderLength() > header.IPv4MinimumSize { // TODO(https://gvisor.dev/issue/6765): Handle IPv4 options. panic("should have dropped packets with IPv4 options") } if netHdr, transHdr, ok := getEmbeddedNetAndTransHeaders(pkt, header.IPv4MinimumSize, v4NetAndTransHdr, pkt.tuple.tupleID.transProto); ok { return netHdr, transHdr, true, true } return nil, nil, false, false case header.ICMPv6ProtocolNumber: icmpHeader := header.ICMPv6(pkt.TransportHeader().Slice()) if len(icmpHeader) < header.ICMPv6MinimumSize { return nil, nil, false, false } switch icmpType := icmpHeader.Type(); icmpType { case header.ICMPv6EchoRequest, header.ICMPv6EchoReply: return pkt.Network(), icmpHeader, false, true case header.ICMPv6DstUnreachable, header.ICMPv6PacketTooBig, header.ICMPv6TimeExceeded, header.ICMPv6ParamProblem: default: panic(fmt.Sprintf("unexpected ICMPv6 type = %d", icmpType)) } h, ok := pkt.Data().PullUp(header.IPv6MinimumSize) if !ok { panic(fmt.Sprintf("should have a valid IPv6 packet; only have %d bytes, want at least %d bytes", pkt.Data().Size(), header.IPv6MinimumSize)) } // We do not support extension headers in ICMP errors so the next header // in the IPv6 packet should be a tracked protocol if we reach this point. // // TODO(https://gvisor.dev/issue/6789): Support extension headers. 
transProto := pkt.tuple.tupleID.transProto if got := header.IPv6(h).TransportProtocol(); got != transProto { panic(fmt.Sprintf("got TransportProtocol() = %d, want = %d", got, transProto)) } if netHdr, transHdr, ok := getEmbeddedNetAndTransHeaders(pkt, header.IPv6MinimumSize, v6NetAndTransHdr, transProto); ok { return netHdr, transHdr, true, true } return nil, nil, false, false default: panic(fmt.Sprintf("unexpected transport protocol = %d", pkt.TransportProtocolNumber)) } } func getTupleIDForRegularPacket(netHdr header.Network, netProto tcpip.NetworkProtocolNumber, transHdr header.Transport, transProto tcpip.TransportProtocolNumber) tupleID { return tupleID{ srcAddr: netHdr.SourceAddress(), srcPortOrEchoRequestIdent: transHdr.SourcePort(), dstAddr: netHdr.DestinationAddress(), dstPortOrEchoReplyIdent: transHdr.DestinationPort(), transProto: transProto, netProto: netProto, } } func getTupleIDForPacketInICMPError(pkt *PacketBuffer, getNetAndTransHdr netAndTransHeadersFunc, netProto tcpip.NetworkProtocolNumber, netLen int, transProto tcpip.TransportProtocolNumber) (tupleID, bool) { if netHdr, transHdr, ok := getEmbeddedNetAndTransHeaders(pkt, netLen, getNetAndTransHdr, transProto); ok { return tupleID{ srcAddr: netHdr.DestinationAddress(), srcPortOrEchoRequestIdent: transHdr.DestinationPort(), dstAddr: netHdr.SourceAddress(), dstPortOrEchoReplyIdent: transHdr.SourcePort(), transProto: transProto, netProto: netProto, }, true } return tupleID{}, false } type getTupleIDDisposition int const ( getTupleIDNotOK getTupleIDDisposition = iota getTupleIDOKAndAllowNewConn getTupleIDOKAndDontAllowNewConn ) func getTupleIDForEchoPacket(pkt *PacketBuffer, ident uint16, request bool) tupleID { netHdr := pkt.Network() tid := tupleID{ srcAddr: netHdr.SourceAddress(), dstAddr: netHdr.DestinationAddress(), transProto: pkt.TransportProtocolNumber, netProto: pkt.NetworkProtocolNumber, } if request { tid.srcPortOrEchoRequestIdent = ident } else { tid.dstPortOrEchoReplyIdent = ident } return tid } func getTupleID(pkt *PacketBuffer) (tupleID, getTupleIDDisposition) { switch pkt.TransportProtocolNumber { case header.TCPProtocolNumber: if transHeader := header.TCP(pkt.TransportHeader().Slice()); len(transHeader) >= header.TCPMinimumSize { return getTupleIDForRegularPacket(pkt.Network(), pkt.NetworkProtocolNumber, transHeader, pkt.TransportProtocolNumber), getTupleIDOKAndAllowNewConn } case header.UDPProtocolNumber: if transHeader := header.UDP(pkt.TransportHeader().Slice()); len(transHeader) >= header.UDPMinimumSize { return getTupleIDForRegularPacket(pkt.Network(), pkt.NetworkProtocolNumber, transHeader, pkt.TransportProtocolNumber), getTupleIDOKAndAllowNewConn } case header.ICMPv4ProtocolNumber: icmp := header.ICMPv4(pkt.TransportHeader().Slice()) if len(icmp) < header.ICMPv4MinimumSize { return tupleID{}, getTupleIDNotOK } switch icmp.Type() { case header.ICMPv4Echo: return getTupleIDForEchoPacket(pkt, icmp.Ident(), true /* request */), getTupleIDOKAndAllowNewConn case header.ICMPv4EchoReply: // Do not create a new connection in response to a reply packet as only // the first packet of a connection should create a conntrack entry but // a reply is never the first packet sent for a connection. return getTupleIDForEchoPacket(pkt, icmp.Ident(), false /* request */), getTupleIDOKAndDontAllowNewConn case header.ICMPv4DstUnreachable, header.ICMPv4TimeExceeded, header.ICMPv4ParamProblem: default: // Unsupported ICMP type for NAT-ing. 
return tupleID{}, getTupleIDNotOK } h, ok := pkt.Data().PullUp(header.IPv4MinimumSize) if !ok { return tupleID{}, getTupleIDNotOK } ipv4 := header.IPv4(h) if ipv4.HeaderLength() > header.IPv4MinimumSize { // TODO(https://gvisor.dev/issue/6765): Handle IPv4 options. return tupleID{}, getTupleIDNotOK } if tid, ok := getTupleIDForPacketInICMPError(pkt, v4NetAndTransHdr, header.IPv4ProtocolNumber, header.IPv4MinimumSize, ipv4.TransportProtocol()); ok { // Do not create a new connection in response to an ICMP error. return tid, getTupleIDOKAndDontAllowNewConn } case header.ICMPv6ProtocolNumber: icmp := header.ICMPv6(pkt.TransportHeader().Slice()) if len(icmp) < header.ICMPv6MinimumSize { return tupleID{}, getTupleIDNotOK } switch icmp.Type() { case header.ICMPv6EchoRequest: return getTupleIDForEchoPacket(pkt, icmp.Ident(), true /* request */), getTupleIDOKAndAllowNewConn case header.ICMPv6EchoReply: // Do not create a new connection in response to a reply packet as only // the first packet of a connection should create a conntrack entry but // a reply is never the first packet sent for a connection. return getTupleIDForEchoPacket(pkt, icmp.Ident(), false /* request */), getTupleIDOKAndDontAllowNewConn case header.ICMPv6DstUnreachable, header.ICMPv6PacketTooBig, header.ICMPv6TimeExceeded, header.ICMPv6ParamProblem: default: return tupleID{}, getTupleIDNotOK } h, ok := pkt.Data().PullUp(header.IPv6MinimumSize) if !ok { return tupleID{}, getTupleIDNotOK } // TODO(https://gvisor.dev/issue/6789): Handle extension headers. if tid, ok := getTupleIDForPacketInICMPError(pkt, v6NetAndTransHdr, header.IPv6ProtocolNumber, header.IPv6MinimumSize, header.IPv6(h).TransportProtocol()); ok { // Do not create a new connection in response to an ICMP error. return tid, getTupleIDOKAndDontAllowNewConn } } return tupleID{}, getTupleIDNotOK } func (ct *ConnTrack) init() { ct.mu.Lock() defer ct.mu.Unlock() ct.buckets = make([]bucket, numBuckets) } // getConnAndUpdate attempts to get a connection or creates one if no // connection exists for the packet and packet's protocol is trackable. // // If the packet's protocol is trackable, the connection's state is updated to // match the contents of the packet. func (ct *ConnTrack) getConnAndUpdate(pkt *PacketBuffer, skipChecksumValidation bool) *tuple { // Get or (maybe) create a connection. t := func() *tuple { var allowNewConn bool tid, res := getTupleID(pkt) switch res { case getTupleIDNotOK: return nil case getTupleIDOKAndAllowNewConn: allowNewConn = true case getTupleIDOKAndDontAllowNewConn: allowNewConn = false default: panic(fmt.Sprintf("unhandled %[1]T = %[1]d", res)) } // Just skip bad packets. They'll be rejected later by the appropriate // protocol package. 
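// Skipping means the closure returns nil below: no conntrack entry is
// created or updated for the packet, and it is left to the transport layer
// to reject it.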
switch pkt.TransportProtocolNumber { case header.TCPProtocolNumber: _, csumValid, ok := header.TCPValid( header.TCP(pkt.TransportHeader().Slice()), func() uint16 { return pkt.Data().Checksum() }, uint16(pkt.Data().Size()), tid.srcAddr, tid.dstAddr, pkt.RXChecksumValidated || skipChecksumValidation) if !csumValid || !ok { return nil } case header.UDPProtocolNumber: lengthValid, csumValid := header.UDPValid( header.UDP(pkt.TransportHeader().Slice()), func() uint16 { return pkt.Data().Checksum() }, uint16(pkt.Data().Size()), pkt.NetworkProtocolNumber, tid.srcAddr, tid.dstAddr, pkt.RXChecksumValidated || skipChecksumValidation) if !lengthValid || !csumValid { return nil } } ct.mu.RLock() bkt := &ct.buckets[ct.bucket(tid)] ct.mu.RUnlock() now := ct.clock.NowMonotonic() if t := bkt.connForTID(tid, now); t != nil { return t } if !allowNewConn { return nil } bkt.mu.Lock() defer bkt.mu.Unlock() // Make sure a connection wasn't added between when we last checked the // bucket and acquired the bucket's write lock. if t := bkt.connForTIDRLocked(tid, now); t != nil { return t } // This is the first packet we're seeing for the connection. Create an entry // for this new connection. conn := &conn{ ct: ct, original: tuple{tupleID: tid}, reply: tuple{tupleID: tid.reply(), reply: true}, lastUsed: now, } conn.original.conn = conn conn.reply.conn = conn // For now, we only map an entry for the packet's original tuple as NAT may be // performed on this connection. Until the packet goes through all the hooks // and its final address/port is known, we cannot know what the response // packet's addresses/ports will look like. // // This is okay because the destination cannot send its response until it // receives the packet; the packet will only be received once all the hooks // have been performed. // // See (*conn).finalize. bkt.tuples.PushFront(&conn.original) return &conn.original }() if t != nil { t.conn.update(pkt, t.reply) } return t } func (ct *ConnTrack) connForTID(tid tupleID) *tuple { ct.mu.RLock() bkt := &ct.buckets[ct.bucket(tid)] ct.mu.RUnlock() return bkt.connForTID(tid, ct.clock.NowMonotonic()) } func (bkt *bucket) connForTID(tid tupleID, now tcpip.MonotonicTime) *tuple { bkt.mu.RLock() defer bkt.mu.RUnlock() return bkt.connForTIDRLocked(tid, now) } // +checklocksread:bkt.mu func (bkt *bucket) connForTIDRLocked(tid tupleID, now tcpip.MonotonicTime) *tuple { for other := bkt.tuples.Front(); other != nil; other = other.Next() { if tid == other.tupleID && !other.conn.timedOut(now) { return other } } return nil } func (ct *ConnTrack) finalize(cn *conn) finalizeResult { ct.mu.RLock() buckets := ct.buckets ct.mu.RUnlock() { tid := cn.reply.tupleID id := ct.bucketWithTableLength(tid, len(buckets)) bkt := &buckets[id] bkt.mu.Lock() t := bkt.connForTIDRLocked(tid, ct.clock.NowMonotonic()) if t == nil { bkt.tuples.PushFront(&cn.reply) bkt.mu.Unlock() return finalizeResultSuccess } bkt.mu.Unlock() if t.conn == cn { // We already have an entry for the reply tuple. // // This can occur when the source address/port is the same as the // destination address/port. In this scenario, tid == tid.reply(). return finalizeResultSuccess } } // Another connection for the reply already exists. Remove the original and // let the caller know we failed. // // TODO(https://gvisor.dev/issue/6850): Investigate handling this clash // better. 
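// A hypothetical clash: two distinct original tuples are SNATed such that
// both would claim the same reply tuple (for example, both rewritten to the
// same source address and port towards the same destination). The first
// connection to finalize keeps the reply entry; for the loser we fall
// through to the removal below and its packet is dropped by the caller (see
// (*conn).finalize).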
tid := cn.original.tupleID id := ct.bucketWithTableLength(tid, len(buckets)) bkt := &buckets[id] bkt.mu.Lock() defer bkt.mu.Unlock() bkt.tuples.Remove(&cn.original) return finalizeResultConflict } func (cn *conn) getFinalizeResult() finalizeResult { return finalizeResult(cn.finalizeResult.Load()) } // finalize attempts to finalize the connection and returns true iff the // connection was successfully finalized. // // If the connection failed to finalize, the caller should drop the packet // associated with the connection. // // If multiple goroutines attempt to finalize at the same time, only one // goroutine will perform the work to finalize the connection, but all // goroutines will block until the finalizing goroutine finishes finalizing. func (cn *conn) finalize() bool { cn.finalizeOnce.Do(func() { cn.finalizeResult.Store(uint32(cn.ct.finalize(cn))) }) switch res := cn.getFinalizeResult(); res { case finalizeResultSuccess: return true case finalizeResultConflict: return false default: panic(fmt.Sprintf("unhandled result = %d", res)) } } // If NAT has not been configured for this connection, either mark the // connection as configured for "no-op NAT", in the case of DNAT, or, in the // case of SNAT, perform source port remapping so that source ports used by // locally-generated traffic do not conflict with ports occupied by existing NAT // bindings. // // Note that in the typical case this is also a no-op, because `snatAction` // will do nothing if the original tuple is already unique. func (cn *conn) maybePerformNoopNAT(pkt *PacketBuffer, hook Hook, r *Route, dnat bool) { cn.mu.Lock() var manip *manipType if dnat { manip = &cn.destinationManip } else { manip = &cn.sourceManip } if *manip != manipNotPerformed { cn.mu.Unlock() _ = cn.handlePacket(pkt, hook, r) return } if dnat { *manip = manipPerformedNoop cn.mu.Unlock() _ = cn.handlePacket(pkt, hook, r) return } cn.mu.Unlock() // At this point, we know that NAT has not yet been performed on this // connection, and the DNAT case has been handled with a no-op. For SNAT, we // simply perform source port remapping to ensure that source ports for // locally generated traffic do not clash with ports used by existing NAT // bindings. _, _ = snatAction(pkt, hook, r, 0, tcpip.Address{}, true /* changePort */, false /* changeAddress */) } type portOrIdentRange struct { start uint16 size uint32 } // performNAT setups up the connection for the specified NAT and rewrites the // packet. // // If NAT has already been performed on the connection, then the packet will // be rewritten with the NAT performed on the connection, ignoring the passed // address and port range. // // Generally, only the first packet of a connection reaches this method; other // packets will be manipulated without needing to modify the connection. func (cn *conn) performNAT(pkt *PacketBuffer, hook Hook, r *Route, portsOrIdents portOrIdentRange, natAddress tcpip.Address, dnat, changePort, changeAddress bool) { lastPortOrIdent := func() uint16 { lastPortOrIdent := uint32(portsOrIdents.start) + portsOrIdents.size - 1 if lastPortOrIdent > math.MaxUint16 { panic(fmt.Sprintf("got lastPortOrIdent = %d, want <= MaxUint16(=%d); portsOrIdents=%#v", lastPortOrIdent, math.MaxUint16, portsOrIdents)) } return uint16(lastPortOrIdent) }() // Make sure the packet is re-written after performing NAT. 
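// The rewrite happens in this deferred handlePacket call, so it still runs
// when the logic below returns early (for example when NAT has already been
// configured for this connection).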
defer func() { // handlePacket returns true if the packet may skip the NAT table as the // connection is already NATed, but if we reach this point we must be in the // NAT table, so the return value is useless for us. _ = cn.handlePacket(pkt, hook, r) }() cn.mu.Lock() defer cn.mu.Unlock() var manip *manipType var address *tcpip.Address var portOrIdent *uint16 if dnat { manip = &cn.destinationManip address = &cn.reply.tupleID.srcAddr portOrIdent = &cn.reply.tupleID.srcPortOrEchoRequestIdent } else { manip = &cn.sourceManip address = &cn.reply.tupleID.dstAddr portOrIdent = &cn.reply.tupleID.dstPortOrEchoReplyIdent } if *manip != manipNotPerformed { return } *manip = manipPerformed if changeAddress { *address = natAddress } // Everything below here is port-fiddling. if !changePort { return } // Does the current port/ident fit in the range? if portsOrIdents.start <= *portOrIdent && *portOrIdent <= lastPortOrIdent { // Yes, is the current reply tuple unique? // // Or, does the reply tuple refer to the same connection as the current one that // we are NATing? This would apply, for example, to a self-connected socket, // where the original and reply tuples are identical. other := cn.ct.connForTID(cn.reply.tupleID) if other == nil || other.conn == cn { // Yes! No need to change the port. return } } // Try our best to find a port/ident that results in a unique reply tuple. // // We limit the number of attempts to find a unique tuple to not waste a lot // of time looking for a unique tuple. // // Matches linux behaviour introduced in // https://github.com/torvalds/linux/commit/a504b703bb1da526a01593da0e4be2af9d9f5fa8. const maxAttemptsForInitialRound uint32 = 128 const minAttemptsToContinue = 16 allowedInitialAttempts := maxAttemptsForInitialRound if allowedInitialAttempts > portsOrIdents.size { allowedInitialAttempts = portsOrIdents.size } for maxAttempts := allowedInitialAttempts; ; maxAttempts /= 2 { // Start reach round with a random initial port/ident offset. randOffset := cn.ct.rand.Uint32() for i := uint32(0); i < maxAttempts; i++ { newPortOrIdentU32 := uint32(portsOrIdents.start) + (randOffset+i)%portsOrIdents.size if newPortOrIdentU32 > math.MaxUint16 { panic(fmt.Sprintf("got newPortOrIdentU32 = %d, want <= MaxUint16(=%d); portsOrIdents=%#v, randOffset=%d", newPortOrIdentU32, math.MaxUint16, portsOrIdents, randOffset)) } *portOrIdent = uint16(newPortOrIdentU32) if other := cn.ct.connForTID(cn.reply.tupleID); other == nil { // We found a unique tuple! return } } if maxAttempts == portsOrIdents.size { // We already tried all the ports/idents in the range so no need to keep // trying. return } if maxAttempts < minAttemptsToContinue { return } } // We did not find a unique tuple, use the last used port anyways. // TODO(https://gvisor.dev/issue/6850): Handle not finding a unique tuple // better (e.g. remove the connection and drop the packet). } // handlePacket attempts to handle a packet and perform NAT if the connection // has had NAT performed on it. // // Returns true if the packet can skip the NAT table. func (cn *conn) handlePacket(pkt *PacketBuffer, hook Hook, rt *Route) bool { netHdr, transHdr, isICMPError, ok := getHeaders(pkt) if !ok { return false } fullChecksum := false updatePseudoHeader := false natDone := &pkt.snatDone dnat := false switch hook { case Prerouting: // Packet came from outside the stack so it must have a checksum set // already. 
fullChecksum = true updatePseudoHeader = true natDone = &pkt.dnatDone dnat = true case Input: case Forward: panic("should not handle packet in the forwarding hook") case Output: natDone = &pkt.dnatDone dnat = true fallthrough case Postrouting: if pkt.TransportProtocolNumber == header.TCPProtocolNumber && pkt.GSOOptions.Type != GSONone && pkt.GSOOptions.NeedsCsum { updatePseudoHeader = true } else if rt.RequiresTXTransportChecksum() { fullChecksum = true updatePseudoHeader = true } default: panic(fmt.Sprintf("unrecognized hook = %d", hook)) } if *natDone { panic(fmt.Sprintf("packet already had NAT(dnat=%t) performed at hook=%s; pkt=%#v", dnat, hook, pkt)) } // TODO(gvisor.dev/issue/5748): TCP checksums on inbound packets should be // validated if checksum offloading is off. It may require IP defrag if the // packets are fragmented. reply := pkt.tuple.reply tid, manip := func() (tupleID, manipType) { cn.mu.RLock() defer cn.mu.RUnlock() if reply { tid := cn.original.tupleID if dnat { return tid, cn.sourceManip } return tid, cn.destinationManip } tid := cn.reply.tupleID if dnat { return tid, cn.destinationManip } return tid, cn.sourceManip }() switch manip { case manipNotPerformed: return false case manipPerformedNoop: *natDone = true return true case manipPerformed: default: panic(fmt.Sprintf("unhandled manip = %d", manip)) } newPort := tid.dstPortOrEchoReplyIdent newAddr := tid.dstAddr if dnat { newPort = tid.srcPortOrEchoRequestIdent newAddr = tid.srcAddr } rewritePacket( netHdr, transHdr, !dnat != isICMPError, fullChecksum, updatePseudoHeader, newPort, newAddr, ) *natDone = true if !isICMPError { return true } // We performed NAT on (erroneous) packet that triggered an ICMP response, but // not the ICMP packet itself. switch pkt.TransportProtocolNumber { case header.ICMPv4ProtocolNumber: icmp := header.ICMPv4(pkt.TransportHeader().Slice()) // TODO(https://gvisor.dev/issue/6788): Incrementally update ICMP checksum. icmp.SetChecksum(0) icmp.SetChecksum(header.ICMPv4Checksum(icmp, pkt.Data().Checksum())) network := header.IPv4(pkt.NetworkHeader().Slice()) if dnat { network.SetDestinationAddressWithChecksumUpdate(tid.srcAddr) } else { network.SetSourceAddressWithChecksumUpdate(tid.dstAddr) } case header.ICMPv6ProtocolNumber: network := header.IPv6(pkt.NetworkHeader().Slice()) srcAddr := network.SourceAddress() dstAddr := network.DestinationAddress() if dnat { dstAddr = tid.srcAddr } else { srcAddr = tid.dstAddr } icmp := header.ICMPv6(pkt.TransportHeader().Slice()) // TODO(https://gvisor.dev/issue/6788): Incrementally update ICMP checksum. icmp.SetChecksum(0) payload := pkt.Data() icmp.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmp, Src: srcAddr, Dst: dstAddr, PayloadCsum: payload.Checksum(), PayloadLen: payload.Size(), })) if dnat { network.SetDestinationAddress(dstAddr) } else { network.SetSourceAddress(srcAddr) } } return true } // bucket gets the conntrack bucket for a tupleID. 
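// The index is a seeded Jenkins hash of the tuple's addresses, ports/idents,
// and protocol numbers, reduced modulo the table length (see
// bucketWithTableLength below). A rough lookup sketch, with tid being a
// hypothetical tupleID (this mirrors (*ConnTrack).connForTID above):
//
//	ct.mu.RLock()
//	bkt := &ct.buckets[ct.bucket(tid)]
//	ct.mu.RUnlock()
//	t := bkt.connForTID(tid, ct.clock.NowMonotonic())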
// +checklocksread:ct.mu func (ct *ConnTrack) bucket(id tupleID) int { return ct.bucketWithTableLength(id, len(ct.buckets)) } func (ct *ConnTrack) bucketWithTableLength(id tupleID, tableLength int) int { h := jenkins.Sum32(ct.seed) h.Write(id.srcAddr.AsSlice()) h.Write(id.dstAddr.AsSlice()) shortBuf := make([]byte, 2) binary.LittleEndian.PutUint16(shortBuf, id.srcPortOrEchoRequestIdent) h.Write([]byte(shortBuf)) binary.LittleEndian.PutUint16(shortBuf, id.dstPortOrEchoReplyIdent) h.Write([]byte(shortBuf)) binary.LittleEndian.PutUint16(shortBuf, uint16(id.transProto)) h.Write([]byte(shortBuf)) binary.LittleEndian.PutUint16(shortBuf, uint16(id.netProto)) h.Write([]byte(shortBuf)) return int(h.Sum32()) % tableLength } // reapUnused deletes timed out entries from the conntrack map. The rules for // reaping are: // - Each call to reapUnused traverses a fraction of the conntrack table. // Specifically, it traverses len(ct.buckets)/fractionPerReaping. // - After reaping, reapUnused decides when it should next run based on the // ratio of expired connections to examined connections. If the ratio is // greater than maxExpiredPct, it schedules the next run quickly. Otherwise it // slightly increases the interval between runs. // - maxFullTraversal caps the time it takes to traverse the entire table. // // reapUnused returns the next bucket that should be checked and the time after // which it should be called again. func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, time.Duration) { const fractionPerReaping = 128 const maxExpiredPct = 50 const maxFullTraversal = 60 * time.Second const minInterval = 10 * time.Millisecond const maxInterval = maxFullTraversal / fractionPerReaping now := ct.clock.NowMonotonic() checked := 0 expired := 0 var idx int ct.mu.RLock() defer ct.mu.RUnlock() for i := 0; i < len(ct.buckets)/fractionPerReaping; i++ { idx = (i + start) % len(ct.buckets) bkt := &ct.buckets[idx] bkt.mu.Lock() for tuple := bkt.tuples.Front(); tuple != nil; { // reapTupleLocked updates tuple's next pointer so we grab it here. nextTuple := tuple.Next() checked++ if ct.reapTupleLocked(tuple, idx, bkt, now) { expired++ } tuple = nextTuple } bkt.mu.Unlock() } // We already checked buckets[idx]. idx++ // If half or more of the connections are expired, the table has gotten // stale. Reschedule quickly. expiredPct := 0 if checked != 0 { expiredPct = expired * 100 / checked } if expiredPct > maxExpiredPct { return idx, minInterval } if interval := prevInterval + minInterval; interval <= maxInterval { // Increment the interval between runs. return idx, interval } // We've hit the maximum interval. return idx, maxInterval } // reapTupleLocked tries to remove tuple and its reply from the table. It // returns whether the tuple's connection has timed out. // // Precondition: ct.mu is read locked and bkt.mu is write locked. // +checklocksread:ct.mu // +checklocks:bkt.mu func (ct *ConnTrack) reapTupleLocked(reapingTuple *tuple, bktID int, bkt *bucket, now tcpip.MonotonicTime) bool { if !reapingTuple.conn.timedOut(now) { return false } var otherTuple *tuple if reapingTuple.reply { otherTuple = &reapingTuple.conn.original } else { otherTuple = &reapingTuple.conn.reply } otherTupleBktID := ct.bucket(otherTuple.tupleID) replyTupleInserted := reapingTuple.conn.getFinalizeResult() == finalizeResultSuccess // To maintain lock order, we can only reap both tuples if the tuple for the // other direction appears later in the table. 
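// Bucket locks are always acquired in increasing index order, so the reap of
// the other tuple below is only safe when its bucket does not come earlier
// in the table. Illustrative indices (assumed): with bktID = 7 and
// otherTupleBktID = 3, we leave both tuples alone for now; they are reaped
// once the sweep reaches bucket 3, where the comparison goes the other way.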
if bktID > otherTupleBktID && replyTupleInserted { return true } bkt.tuples.Remove(reapingTuple) if !replyTupleInserted { // The other tuple is the reply which has not yet been inserted. return true } // Reap the other connection. if bktID == otherTupleBktID { // Don't re-lock if both tuples are in the same bucket. bkt.tuples.Remove(otherTuple) } else { otherTupleBkt := &ct.buckets[otherTupleBktID] otherTupleBkt.mu.NestedLock(bucketLockOthertuple) otherTupleBkt.tuples.Remove(otherTuple) otherTupleBkt.mu.NestedUnlock(bucketLockOthertuple) } return true } func (ct *ConnTrack) originalDst(epID TransportEndpointID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber) (tcpip.Address, uint16, tcpip.Error) { // Lookup the connection. The reply's original destination // describes the original address. tid := tupleID{ srcAddr: epID.LocalAddress, srcPortOrEchoRequestIdent: epID.LocalPort, dstAddr: epID.RemoteAddress, dstPortOrEchoReplyIdent: epID.RemotePort, transProto: transProto, netProto: netProto, } t := ct.connForTID(tid) if t == nil { // Not a tracked connection. return tcpip.Address{}, 0, &tcpip.ErrNotConnected{} } t.conn.mu.RLock() defer t.conn.mu.RUnlock() if t.conn.destinationManip == manipNotPerformed { // Unmanipulated destination. return tcpip.Address{}, 0, &tcpip.ErrInvalidOptionValue{} } id := t.conn.original.tupleID return id.dstAddr, id.dstPortOrEchoReplyIdent, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/endpoints_by_nic_mutex.go000066400000000000000000000050501465435605700267470ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type endpointsByNICRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var endpointsByNIClockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type endpointsByNIClockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *endpointsByNICRWMutex) Lock() { locking.AddGLock(endpointsByNICprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *endpointsByNICRWMutex) NestedLock(i endpointsByNIClockNameIndex) { locking.AddGLock(endpointsByNICprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *endpointsByNICRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(endpointsByNICprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *endpointsByNICRWMutex) NestedUnlock(i endpointsByNIClockNameIndex) { m.mu.Unlock() locking.DelGLock(endpointsByNICprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *endpointsByNICRWMutex) RLock() { locking.AddGLock(endpointsByNICprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *endpointsByNICRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(endpointsByNICprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. 
// +checklocksignore func (m *endpointsByNICRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *endpointsByNICRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *endpointsByNICRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var endpointsByNICprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func endpointsByNICinitLockNames() {} func init() { endpointsByNICinitLockNames() endpointsByNICprefixIndex = locking.NewMutexClass(reflect.TypeOf(endpointsByNICRWMutex{}), endpointsByNIClockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/gro/000077500000000000000000000000001465435605700224375ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/gro/gro.go000066400000000000000000000452431465435605700235650ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package gro implements generic receive offload. package gro import ( "bytes" "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // TODO(b/256037250): Enable by default. // TODO(b/256037250): We parse headers here. We should save those headers in // PacketBuffers so they don't have to be re-parsed later. // TODO(b/256037250): I still see the occasional SACK block in the zero-loss // benchmark, which should not happen. // TODO(b/256037250): Some dispatchers, e.g. XDP and RecvMmsg, can receive // multiple packets at a time. Even if the GRO interval is 0, there is an // opportunity for coalescing. // TODO(b/256037250): We're doing some header parsing here, which presents the // opportunity to skip it later. // TODO(b/256037250): Can we pass a packet list up the stack too? const ( // groNBuckets is the number of GRO buckets. groNBuckets = 8 groNBucketsMask = groNBuckets - 1 // groBucketSize is the size of each GRO bucket. groBucketSize = 8 // groMaxPacketSize is the maximum size of a GRO'd packet. groMaxPacketSize = 1 << 16 // 65KB. ) // A groBucket holds packets that are undergoing GRO. type groBucket struct { // count is the number of packets in the bucket. count int // packets is the linked list of packets. packets groPacketList // packetsPrealloc and allocIdxs are used to preallocate and reuse // groPacket structs and avoid allocation. packetsPrealloc [groBucketSize]groPacket allocIdxs [groBucketSize]int } func (gb *groBucket) full() bool { return gb.count == groBucketSize } // insert inserts pkt into the bucket. func (gb *groBucket) insert(pkt *stack.PacketBuffer, ipHdr []byte, tcpHdr header.TCP) { groPkt := &gb.packetsPrealloc[gb.allocIdxs[gb.count]] *groPkt = groPacket{ pkt: pkt, ipHdr: ipHdr, tcpHdr: tcpHdr, initialLength: pkt.Data().Size(), // pkt.Data() contains network header. 
idx: groPkt.idx, } gb.count++ gb.packets.PushBack(groPkt) } // removeOldest removes the oldest packet from gb and returns the contained // PacketBuffer. gb must not be empty. func (gb *groBucket) removeOldest() *stack.PacketBuffer { pkt := gb.packets.Front() gb.packets.Remove(pkt) gb.count-- gb.allocIdxs[gb.count] = pkt.idx ret := pkt.pkt pkt.reset() return ret } // removeOne removes a packet from gb. It also resets pkt to its zero value. func (gb *groBucket) removeOne(pkt *groPacket) { gb.packets.Remove(pkt) gb.count-- gb.allocIdxs[gb.count] = pkt.idx pkt.reset() } // findGROPacket4 returns the groPkt that matches ipHdr and tcpHdr, or nil if // none exists. It also returns whether the groPkt should be flushed based on // differences between the two headers. func (gb *groBucket) findGROPacket4(pkt *stack.PacketBuffer, ipHdr header.IPv4, tcpHdr header.TCP) (*groPacket, bool) { for groPkt := gb.packets.Front(); groPkt != nil; groPkt = groPkt.Next() { // Do the addresses match? groIPHdr := header.IPv4(groPkt.ipHdr) if ipHdr.SourceAddress() != groIPHdr.SourceAddress() || ipHdr.DestinationAddress() != groIPHdr.DestinationAddress() { continue } // Do the ports match? if tcpHdr.SourcePort() != groPkt.tcpHdr.SourcePort() || tcpHdr.DestinationPort() != groPkt.tcpHdr.DestinationPort() { continue } // We've found a packet of the same flow. // IP checks. TOS, _ := ipHdr.TOS() groTOS, _ := groIPHdr.TOS() if ipHdr.TTL() != groIPHdr.TTL() || TOS != groTOS { return groPkt, true } // TCP checks. if shouldFlushTCP(groPkt, tcpHdr) { return groPkt, true } // There's an upper limit on coalesced packet size. if pkt.Data().Size()-header.IPv4MinimumSize-int(tcpHdr.DataOffset())+groPkt.pkt.Data().Size() >= groMaxPacketSize { return groPkt, true } return groPkt, false } return nil, false } // findGROPacket6 returns the groPkt that matches ipHdr and tcpHdr, or nil if // none exists. It also returns whether the groPkt should be flushed based on // differences between the two headers. func (gb *groBucket) findGROPacket6(pkt *stack.PacketBuffer, ipHdr header.IPv6, tcpHdr header.TCP) (*groPacket, bool) { for groPkt := gb.packets.Front(); groPkt != nil; groPkt = groPkt.Next() { // Do the addresses match? groIPHdr := header.IPv6(groPkt.ipHdr) if ipHdr.SourceAddress() != groIPHdr.SourceAddress() || ipHdr.DestinationAddress() != groIPHdr.DestinationAddress() { continue } // Need to check that headers are the same except: // - Traffic class, a difference of which causes a flush. // - Hop limit, a difference of which causes a flush. // - Length, which is checked later. // - Version, which is checked by an earlier call to IsValid(). trafficClass, flowLabel := ipHdr.TOS() groTrafficClass, groFlowLabel := groIPHdr.TOS() if flowLabel != groFlowLabel || ipHdr.NextHeader() != groIPHdr.NextHeader() { continue } // Unlike IPv4, IPv6 packets with extension headers can be coalesced. if !bytes.Equal(ipHdr[header.IPv6MinimumSize:], groIPHdr[header.IPv6MinimumSize:]) { continue } // Do the ports match? if tcpHdr.SourcePort() != groPkt.tcpHdr.SourcePort() || tcpHdr.DestinationPort() != groPkt.tcpHdr.DestinationPort() { continue } // We've found a packet of the same flow. // TCP checks. if shouldFlushTCP(groPkt, tcpHdr) { return groPkt, true } // Do the traffic class and hop limit match? if trafficClass != groTrafficClass || ipHdr.HopLimit() != groIPHdr.HopLimit() { return groPkt, true } // This limit is artificial for IPv6 -- we could allow even // larger packets via jumbograms. 
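// As an illustrative data point (assumed sizes): with a 40-byte IPv6 header,
// a 32-byte TCP header, and 1448-byte segments, every merge grows the
// coalesced packet by 1448 bytes of payload, and the check below flushes the
// flow before it crosses groMaxPacketSize (1 << 16 bytes).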
if pkt.Data().Size()-len(ipHdr)-int(tcpHdr.DataOffset())+groPkt.pkt.Data().Size() >= groMaxPacketSize { return groPkt, true } return groPkt, false } return nil, false } func (gb *groBucket) found(gd *GRO, groPkt *groPacket, flushGROPkt bool, pkt *stack.PacketBuffer, ipHdr []byte, tcpHdr header.TCP, updateIPHdr func([]byte, int)) { // Flush groPkt or merge the packets. pktSize := pkt.Data().Size() flags := tcpHdr.Flags() dataOff := tcpHdr.DataOffset() tcpPayloadSize := pkt.Data().Size() - len(ipHdr) - int(dataOff) if flushGROPkt { // Flush the existing GRO packet. pkt := groPkt.pkt gb.removeOne(groPkt) gd.handlePacket(pkt) pkt.DecRef() groPkt = nil } else if groPkt != nil { // Merge pkt in to GRO packet. pkt.Data().TrimFront(len(ipHdr) + int(dataOff)) groPkt.pkt.Data().Merge(pkt.Data()) // Update the IP total length. updateIPHdr(groPkt.ipHdr, tcpPayloadSize) // Add flags from the packet to the GRO packet. groPkt.tcpHdr.SetFlags(uint8(groPkt.tcpHdr.Flags() | (flags & (header.TCPFlagFin | header.TCPFlagPsh)))) pkt = nil } // Flush if the packet isn't the same size as the previous packets or // if certain flags are set. The reason for checking size equality is: // - If the packet is smaller than the others, this is likely the end // of some message. Peers will send MSS-sized packets until they have // insufficient data to do so. // - If the packet is larger than the others, this packet is either // malformed, a local GSO packet, or has already been handled by host // GRO. flush := header.TCPFlags(flags)&(header.TCPFlagUrg|header.TCPFlagPsh|header.TCPFlagRst|header.TCPFlagSyn|header.TCPFlagFin) != 0 flush = flush || tcpPayloadSize == 0 if groPkt != nil { flush = flush || pktSize != groPkt.initialLength } switch { case flush && groPkt != nil: // A merge occurred and we need to flush groPkt. pkt := groPkt.pkt gb.removeOne(groPkt) gd.handlePacket(pkt) pkt.DecRef() case flush && groPkt == nil: // No merge occurred and the incoming packet needs to be flushed. gd.handlePacket(pkt) case !flush && groPkt == nil: // New flow and we don't need to flush. Insert pkt into GRO. if gb.full() { // Head is always the oldest packet toFlush := gb.removeOldest() gb.insert(pkt.IncRef(), ipHdr, tcpHdr) gd.handlePacket(toFlush) toFlush.DecRef() } else { gb.insert(pkt.IncRef(), ipHdr, tcpHdr) } default: // A merge occurred and we don't need to flush anything. } } // A groPacket is packet undergoing GRO. It may be several packets coalesced // together. type groPacket struct { // groPacketEntry is an intrusive list. groPacketEntry // pkt is the coalesced packet. pkt *stack.PacketBuffer // ipHdr is the IP (v4 or v6) header for the coalesced packet. ipHdr []byte // tcpHdr is the TCP header for the coalesced packet. tcpHdr header.TCP // initialLength is the length of the first packet in the flow. It is // used as a best-effort guess at MSS: senders will send MSS-sized // packets until they run out of data, so we coalesce as long as // packets are the same size. initialLength int // idx is the groPacket's index in its bucket packetsPrealloc. It is // immutable. idx int } // reset resets all mutable fields of the groPacket. func (pk *groPacket) reset() { *pk = groPacket{ idx: pk.idx, } } // payloadSize is the payload size of the coalesced packet, which does not // include the network or transport headers. func (pk *groPacket) payloadSize() int { return pk.pkt.Data().Size() - len(pk.ipHdr) - int(pk.tcpHdr.DataOffset()) } // GRO coalesces incoming packets to increase throughput. 
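// A minimal usage sketch, assuming ep is a stack.NetworkDispatcher and pkts
// is a batch of received *stack.PacketBuffer values with
// NetworkProtocolNumber and RXChecksumValidated already set:
//
//	var g GRO
//	g.Init(true /* enabled */)
//	g.Dispatcher = ep
//	for _, pkt := range pkts {
//		g.Enqueue(pkt)
//	}
//	g.Flush()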
type GRO struct { enabled bool buckets [groNBuckets]groBucket Dispatcher stack.NetworkDispatcher } // Init initializes GRO. func (gd *GRO) Init(enabled bool) { gd.enabled = enabled for i := range gd.buckets { bucket := &gd.buckets[i] for j := range bucket.packetsPrealloc { bucket.allocIdxs[j] = j bucket.packetsPrealloc[j].idx = j } } } // Enqueue the packet in GRO. This does not flush packets; Flush() must be // called explicitly for that. // // pkt.NetworkProtocolNumber and pkt.RXChecksumValidated must be set. func (gd *GRO) Enqueue(pkt *stack.PacketBuffer) { if !gd.enabled { gd.handlePacket(pkt) return } switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: gd.dispatch4(pkt) case header.IPv6ProtocolNumber: gd.dispatch6(pkt) default: gd.handlePacket(pkt) } } func (gd *GRO) dispatch4(pkt *stack.PacketBuffer) { // Immediately get the IPv4 and TCP headers. We need a way to hash the // packet into its bucket, which requires addresses and ports. Linux // simply gets a hash passed by hardware, but we're not so lucky. // We only GRO TCP packets. The check for the transport protocol number // is done below so that we can PullUp both the IP and TCP headers // together. hdrBytes, ok := pkt.Data().PullUp(header.IPv4MinimumSize + header.TCPMinimumSize) if !ok { gd.handlePacket(pkt) return } ipHdr := header.IPv4(hdrBytes) // We don't handle fragments. That should be the vast majority of // traffic, and simplifies handling. if ipHdr.FragmentOffset() != 0 || ipHdr.Flags()&header.IPv4FlagMoreFragments != 0 { gd.handlePacket(pkt) return } // We only handle TCP packets without IP options. if ipHdr.HeaderLength() != header.IPv4MinimumSize || tcpip.TransportProtocolNumber(ipHdr.Protocol()) != header.TCPProtocolNumber { gd.handlePacket(pkt) return } tcpHdr := header.TCP(hdrBytes[header.IPv4MinimumSize:]) ipHdr = ipHdr[:header.IPv4MinimumSize] dataOff := tcpHdr.DataOffset() if dataOff < header.TCPMinimumSize { // Malformed packet: will be handled further up the stack. gd.handlePacket(pkt) return } hdrBytes, ok = pkt.Data().PullUp(header.IPv4MinimumSize + int(dataOff)) if !ok { // Malformed packet: will be handled further up the stack. gd.handlePacket(pkt) return } tcpHdr = header.TCP(hdrBytes[header.IPv4MinimumSize:]) // If either checksum is bad, flush the packet. Since we don't know // what bits were flipped, we can't identify this packet with a flow. if !pkt.RXChecksumValidated { if !ipHdr.IsValid(pkt.Data().Size()) || !ipHdr.IsChecksumValid() { gd.handlePacket(pkt) return } payloadChecksum := pkt.Data().ChecksumAtOffset(header.IPv4MinimumSize + int(dataOff)) tcpPayloadSize := pkt.Data().Size() - header.IPv4MinimumSize - int(dataOff) if !tcpHdr.IsChecksumValid(ipHdr.SourceAddress(), ipHdr.DestinationAddress(), payloadChecksum, uint16(tcpPayloadSize)) { gd.handlePacket(pkt) return } // We've validated the checksum, no reason for others to do it // again. pkt.RXChecksumValidated = true } // Now we can get the bucket for the packet. bucket := &gd.buckets[gd.bucketForPacket4(ipHdr, tcpHdr)&groNBucketsMask] groPkt, flushGROPkt := bucket.findGROPacket4(pkt, ipHdr, tcpHdr) bucket.found(gd, groPkt, flushGROPkt, pkt, ipHdr, tcpHdr, updateIPv4Hdr) } func (gd *GRO) dispatch6(pkt *stack.PacketBuffer) { // Immediately get the IPv6 and TCP headers. We need a way to hash the // packet into its bucket, which requires addresses and ports. Linux // simply gets a hash passed by hardware, but we're not so lucky. 
hdrBytes, ok := pkt.Data().PullUp(header.IPv6MinimumSize) if !ok { gd.handlePacket(pkt) return } ipHdr := header.IPv6(hdrBytes) // Getting the IP header (+ extension headers) size is a bit of a pain // on IPv6. transProto := tcpip.TransportProtocolNumber(ipHdr.NextHeader()) buf := pkt.Data().ToBuffer() buf.TrimFront(header.IPv6MinimumSize) it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(transProto), buf) ipHdrSize := int(header.IPv6MinimumSize) for { transProto = tcpip.TransportProtocolNumber(it.NextHeaderIdentifier()) extHdr, done, err := it.Next() if err != nil { gd.handlePacket(pkt) return } if done { break } switch extHdr.(type) { // We can GRO these, so just skip over them. case header.IPv6HopByHopOptionsExtHdr: case header.IPv6RoutingExtHdr: case header.IPv6DestinationOptionsExtHdr: default: // This is either a TCP header or something we can't handle. ipHdrSize = int(it.HeaderOffset()) done = true } extHdr.Release() if done { break } } hdrBytes, ok = pkt.Data().PullUp(ipHdrSize + header.TCPMinimumSize) if !ok { gd.handlePacket(pkt) return } ipHdr = header.IPv6(hdrBytes[:ipHdrSize]) // We only handle TCP packets. if transProto != header.TCPProtocolNumber { gd.handlePacket(pkt) return } tcpHdr := header.TCP(hdrBytes[ipHdrSize:]) dataOff := tcpHdr.DataOffset() if dataOff < header.TCPMinimumSize { // Malformed packet: will be handled further up the stack. gd.handlePacket(pkt) return } hdrBytes, ok = pkt.Data().PullUp(ipHdrSize + int(dataOff)) if !ok { // Malformed packet: will be handled further up the stack. gd.handlePacket(pkt) return } tcpHdr = header.TCP(hdrBytes[ipHdrSize:]) // If either checksum is bad, flush the packet. Since we don't know // what bits were flipped, we can't identify this packet with a flow. if !pkt.RXChecksumValidated { if !ipHdr.IsValid(pkt.Data().Size()) { gd.handlePacket(pkt) return } payloadChecksum := pkt.Data().ChecksumAtOffset(ipHdrSize + int(dataOff)) tcpPayloadSize := pkt.Data().Size() - ipHdrSize - int(dataOff) if !tcpHdr.IsChecksumValid(ipHdr.SourceAddress(), ipHdr.DestinationAddress(), payloadChecksum, uint16(tcpPayloadSize)) { gd.handlePacket(pkt) return } // We've validated the checksum, no reason for others to do it // again. pkt.RXChecksumValidated = true } // Now we can get the bucket for the packet. bucket := &gd.buckets[gd.bucketForPacket6(ipHdr, tcpHdr)&groNBucketsMask] groPkt, flushGROPkt := bucket.findGROPacket6(pkt, ipHdr, tcpHdr) bucket.found(gd, groPkt, flushGROPkt, pkt, ipHdr, tcpHdr, updateIPv6Hdr) } func (gd *GRO) bucketForPacket4(ipHdr header.IPv4, tcpHdr header.TCP) int { // TODO(b/256037250): Use jenkins or checksum. Write a test to print // distribution. var sum int srcAddr := ipHdr.SourceAddress() for _, val := range srcAddr.AsSlice() { sum += int(val) } dstAddr := ipHdr.DestinationAddress() for _, val := range dstAddr.AsSlice() { sum += int(val) } sum += int(tcpHdr.SourcePort()) sum += int(tcpHdr.DestinationPort()) return sum } func (gd *GRO) bucketForPacket6(ipHdr header.IPv6, tcpHdr header.TCP) int { // TODO(b/256037250): Use jenkins or checksum. Write a test to print // distribution. var sum int srcAddr := ipHdr.SourceAddress() for _, val := range srcAddr.AsSlice() { sum += int(val) } dstAddr := ipHdr.DestinationAddress() for _, val := range dstAddr.AsSlice() { sum += int(val) } sum += int(tcpHdr.SourcePort()) sum += int(tcpHdr.DestinationPort()) return sum } // Flush sends all packets up the stack. 
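// A minimal, hypothetical usage sketch (not part of the upstream source) of the
// GRO type defined above, roughly as a link endpoint might drive it. The names
// dispatcher, pkts, and hwChecksumOffload are assumptions standing in for the
// caller's environment; Init, Enqueue, Flush, and the Dispatcher field are the
// APIs defined in this file. Enqueue only queues or coalesces; Flush must be
// called explicitly to deliver whatever is still buffered.
//
//	var gro GRO
//	gro.Init(true /* enabled */)
//	gro.Dispatcher = dispatcher // a stack.NetworkDispatcher
//
//	for _, pkt := range pkts { // pkts is a batch of received *stack.PacketBuffer
//		// Both fields below must be set before Enqueue (see Enqueue's doc).
//		pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber
//		pkt.RXChecksumValidated = hwChecksumOffload
//		gro.Enqueue(pkt)
//	}
//	gro.Flush()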
func (gd *GRO) Flush() { for i := range gd.buckets { for groPkt := gd.buckets[i].packets.Front(); groPkt != nil; groPkt = groPkt.Next() { pkt := groPkt.pkt gd.buckets[i].removeOne(groPkt) gd.handlePacket(pkt) pkt.DecRef() } } } func (gd *GRO) handlePacket(pkt *stack.PacketBuffer) { gd.Dispatcher.DeliverNetworkPacket(pkt.NetworkProtocolNumber, pkt) } // String implements fmt.Stringer. func (gd *GRO) String() string { ret := "GRO state: \n" for i := range gd.buckets { bucket := &gd.buckets[i] ret += fmt.Sprintf("bucket %d: %d packets: ", i, bucket.count) for groPkt := bucket.packets.Front(); groPkt != nil; groPkt = groPkt.Next() { ret += fmt.Sprintf("%d, ", groPkt.pkt.Data().Size()) } ret += "\n" } return ret } // shouldFlushTCP returns whether the TCP headers indicate that groPkt should // be flushed func shouldFlushTCP(groPkt *groPacket, tcpHdr header.TCP) bool { flags := tcpHdr.Flags() groPktFlags := groPkt.tcpHdr.Flags() dataOff := tcpHdr.DataOffset() if flags&header.TCPFlagCwr != 0 || // Is congestion control occurring? (flags^groPktFlags)&^(header.TCPFlagCwr|header.TCPFlagFin|header.TCPFlagPsh) != 0 || // Do the flags differ besides CRW, FIN, and PSH? tcpHdr.AckNumber() != groPkt.tcpHdr.AckNumber() || // Do the ACKs match? dataOff != groPkt.tcpHdr.DataOffset() || // Are the TCP headers the same length? groPkt.tcpHdr.SequenceNumber()+uint32(groPkt.payloadSize()) != tcpHdr.SequenceNumber() { // Does the incoming packet match the expected sequence number? return true } // The options, including timestamps, must be identical. return !bytes.Equal(tcpHdr[header.TCPMinimumSize:], groPkt.tcpHdr[header.TCPMinimumSize:]) } func updateIPv4Hdr(ipHdrBytes []byte, newBytes int) { ipHdr := header.IPv4(ipHdrBytes) ipHdr.SetTotalLength(ipHdr.TotalLength() + uint16(newBytes)) } func updateIPv6Hdr(ipHdrBytes []byte, newBytes int) { ipHdr := header.IPv6(ipHdrBytes) ipHdr.SetPayloadLength(ipHdr.PayloadLength() + uint16(newBytes)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/gro/gro_packet_list.go000066400000000000000000000122541465435605700261430ustar00rootroot00000000000000package gro // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type groPacketElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (groPacketElementMapper) linkerFor(elem *groPacket) *groPacket { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type groPacketList struct { head *groPacket tail *groPacket } // Reset resets list l to the empty state. func (l *groPacketList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *groPacketList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *groPacketList) Front() *groPacket { return l.head } // Back returns the last element of list l or nil. 
// //go:nosplit func (l *groPacketList) Back() *groPacket { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *groPacketList) Len() (count int) { for e := l.Front(); e != nil; e = (groPacketElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *groPacketList) PushFront(e *groPacket) { linker := groPacketElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { groPacketElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *groPacketList) PushFrontList(m *groPacketList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { groPacketElementMapper{}.linkerFor(l.head).SetPrev(m.tail) groPacketElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *groPacketList) PushBack(e *groPacket) { linker := groPacketElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { groPacketElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *groPacketList) PushBackList(m *groPacketList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { groPacketElementMapper{}.linkerFor(l.tail).SetNext(m.head) groPacketElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *groPacketList) InsertAfter(b, e *groPacket) { bLinker := groPacketElementMapper{}.linkerFor(b) eLinker := groPacketElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { groPacketElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *groPacketList) InsertBefore(a, e *groPacket) { aLinker := groPacketElementMapper{}.linkerFor(a) eLinker := groPacketElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { groPacketElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *groPacketList) Remove(e *groPacket) { linker := groPacketElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { groPacketElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { groPacketElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type groPacketEntry struct { next *groPacket prev *groPacket } // Next returns the entry that follows e in the list. // //go:nosplit func (e *groPacketEntry) Next() *groPacket { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *groPacketEntry) Prev() *groPacket { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. 
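// A minimal, hypothetical usage sketch (not part of the upstream source) of the
// generated intrusive list. Because groPacket embeds groPacketEntry (defined
// below), linking and unlinking never allocates. The names a and b are
// assumed *groPacket values that are not currently on any list.
//
//	var l groPacketList
//	l.PushBack(a)
//	l.PushBack(b)
//	for e := l.Front(); e != nil; e = e.Next() {
//		// Visit packets in insertion order.
//	}
//	l.Remove(a) // O(1); a's next/prev pointers are cleared by Remove.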
// //go:nosplit func (e *groPacketEntry) SetNext(elem *groPacket) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *groPacketEntry) SetPrev(elem *groPacket) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/gro/gro_state_autogen.go000066400000000000000000000027271465435605700265070ustar00rootroot00000000000000// automatically generated by stateify. package gro import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (l *groPacketList) StateTypeName() string { return "pkg/tcpip/stack/gro.groPacketList" } func (l *groPacketList) StateFields() []string { return []string{ "head", "tail", } } func (l *groPacketList) beforeSave() {} // +checklocksignore func (l *groPacketList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *groPacketList) afterLoad(context.Context) {} // +checklocksignore func (l *groPacketList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *groPacketEntry) StateTypeName() string { return "pkg/tcpip/stack/gro.groPacketEntry" } func (e *groPacketEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *groPacketEntry) beforeSave() {} // +checklocksignore func (e *groPacketEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *groPacketEntry) afterLoad(context.Context) {} // +checklocksignore func (e *groPacketEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func init() { state.Register((*groPacketList)(nil)) state.Register((*groPacketEntry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/headertype_string.go000066400000000000000000000025461465435605700257260ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by "stringer -type headerType ."; DO NOT EDIT. package stack import "strconv" func _() { // An "invalid array index" compiler error signifies that the constant values have changed. // Re-run the stringer command to generate them again. var x [1]struct{} _ = x[virtioNetHeader-0] _ = x[linkHeader-1] _ = x[networkHeader-2] _ = x[transportHeader-3] _ = x[numHeaderType-4] } const _headerType_name = "virtioNetHeaderlinkHeadernetworkHeadertransportHeadernumHeaderType" var _headerType_index = [...]uint8{0, 10, 23, 38, 51} func (i headerType) String() string { if i < 0 || i >= headerType(len(_headerType_index)-1) { return "headerType(" + strconv.FormatInt(int64(i), 10) + ")" } return _headerType_name[_headerType_index[i]:_headerType_index[i+1]] } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/hook_string.go000066400000000000000000000024161465435605700245300ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by "stringer -type Hook ."; DO NOT EDIT. package stack import "strconv" func _() { // An "invalid array index" compiler error signifies that the constant values have changed. // Re-run the stringer command to generate them again. var x [1]struct{} _ = x[Prerouting-0] _ = x[Input-1] _ = x[Forward-2] _ = x[Output-3] _ = x[Postrouting-4] _ = x[NumHooks-5] } const _Hook_name = "PreroutingInputForwardOutputPostroutingNumHooks" var _Hook_index = [...]uint8{0, 10, 15, 22, 28, 39, 47} func (i Hook) String() string { if i >= Hook(len(_Hook_index)-1) { return "Hook(" + strconv.FormatInt(int64(i), 10) + ")" } return _Hook_name[_Hook_index[i]:_Hook_index[i+1]] } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/icmp_rate_limit.go000066400000000000000000000043141465435605700253420ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "golang.org/x/time/rate" "gvisor.dev/gvisor/pkg/tcpip" ) const ( // icmpLimit is the default maximum number of ICMP messages permitted by this // rate limiter. icmpLimit = 1000 // icmpBurst is the default number of ICMP messages that can be sent in a single // burst. icmpBurst = 50 ) // ICMPRateLimiter is a global rate limiter that controls the generation of // ICMP messages generated by the stack. // // +stateify savable type ICMPRateLimiter struct { // TODO(b/341946753): Restore when netstack is savable. limiter *rate.Limiter `state:"nosave"` clock tcpip.Clock } // NewICMPRateLimiter returns a global rate limiter for controlling the rate // at which ICMP messages are generated by the stack. The returned limiter // does not apply limits to any ICMP types by default. func NewICMPRateLimiter(clock tcpip.Clock) *ICMPRateLimiter { return &ICMPRateLimiter{ clock: clock, limiter: rate.NewLimiter(icmpLimit, icmpBurst), } } // SetLimit sets a new Limit for the limiter. func (l *ICMPRateLimiter) SetLimit(limit rate.Limit) { l.limiter.SetLimitAt(l.clock.Now(), limit) } // Limit returns the maximum overall event rate. func (l *ICMPRateLimiter) Limit() rate.Limit { return l.limiter.Limit() } // SetBurst sets a new burst size for the limiter. func (l *ICMPRateLimiter) SetBurst(burst int) { l.limiter.SetBurstAt(l.clock.Now(), burst) } // Burst returns the maximum burst size. func (l *ICMPRateLimiter) Burst() int { return l.limiter.Burst() } // Allow reports whether one ICMP message may be sent now. 
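// A minimal, hypothetical sketch (not part of the upstream source) of gating
// ICMP error generation with the limiter defined above. The names clock (a
// tcpip.Clock), pkt, and sendICMPError are assumptions standing in for the
// caller's environment; NewICMPRateLimiter, SetLimit, SetBurst, and Allow are
// the APIs in this file.
//
//	limiter := NewICMPRateLimiter(clock)
//	limiter.SetLimit(100) // at most 100 ICMP messages per second on average
//	limiter.SetBurst(10)  // and bursts of at most 10
//	if limiter.Allow() {
//		sendICMPError(pkt)
//	} // otherwise the error is not sent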
func (l *ICMPRateLimiter) Allow() bool { return l.limiter.AllowN(l.clock.Now(), 1) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/iptables.go000066400000000000000000000517521465435605700240140ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "context" "fmt" "math/rand" "reflect" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) // TableID identifies a specific table. type TableID int // Each value identifies a specific table. const ( NATID TableID = iota MangleID FilterID NumTables ) // HookUnset indicates that there is no hook set for an entrypoint or // underflow. const HookUnset = -1 // reaperDelay is how long to wait before starting to reap connections. const reaperDelay = 5 * time.Second // DefaultTables returns a default set of tables. Each chain is set to accept // all packets. func DefaultTables(clock tcpip.Clock, rand *rand.Rand) *IPTables { return &IPTables{ v4Tables: [NumTables]Table{ NATID: { Rules: []Rule{ {Filter: EmptyFilter4(), Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Filter: EmptyFilter4(), Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Filter: EmptyFilter4(), Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Filter: EmptyFilter4(), Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Filter: EmptyFilter4(), Target: &ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: 0, Input: 1, Forward: HookUnset, Output: 2, Postrouting: 3, }, Underflows: [NumHooks]int{ Prerouting: 0, Input: 1, Forward: HookUnset, Output: 2, Postrouting: 3, }, }, MangleID: { Rules: []Rule{ {Filter: EmptyFilter4(), Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Filter: EmptyFilter4(), Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Filter: EmptyFilter4(), Target: &ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: 0, Output: 1, }, Underflows: [NumHooks]int{ Prerouting: 0, Input: HookUnset, Forward: HookUnset, Output: 1, Postrouting: HookUnset, }, }, FilterID: { Rules: []Rule{ {Filter: EmptyFilter4(), Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Filter: EmptyFilter4(), Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Filter: EmptyFilter4(), Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Filter: EmptyFilter4(), Target: &ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: HookUnset, Input: 0, Forward: 1, Output: 2, Postrouting: HookUnset, }, Underflows: [NumHooks]int{ Prerouting: HookUnset, Input: 0, Forward: 1, Output: 2, Postrouting: HookUnset, }, }, }, v6Tables: [NumTables]Table{ NATID: { Rules: []Rule{ {Filter: EmptyFilter6(), Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Filter: EmptyFilter6(), Target: 
&AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Filter: EmptyFilter6(), Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Filter: EmptyFilter6(), Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Filter: EmptyFilter6(), Target: &ErrorTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: 0, Input: 1, Forward: HookUnset, Output: 2, Postrouting: 3, }, Underflows: [NumHooks]int{ Prerouting: 0, Input: 1, Forward: HookUnset, Output: 2, Postrouting: 3, }, }, MangleID: { Rules: []Rule{ {Filter: EmptyFilter6(), Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Filter: EmptyFilter6(), Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Filter: EmptyFilter6(), Target: &ErrorTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: 0, Output: 1, }, Underflows: [NumHooks]int{ Prerouting: 0, Input: HookUnset, Forward: HookUnset, Output: 1, Postrouting: HookUnset, }, }, FilterID: { Rules: []Rule{ {Filter: EmptyFilter6(), Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Filter: EmptyFilter6(), Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Filter: EmptyFilter6(), Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Filter: EmptyFilter6(), Target: &ErrorTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: HookUnset, Input: 0, Forward: 1, Output: 2, Postrouting: HookUnset, }, Underflows: [NumHooks]int{ Prerouting: HookUnset, Input: 0, Forward: 1, Output: 2, Postrouting: HookUnset, }, }, }, connections: ConnTrack{ seed: rand.Uint32(), clock: clock, rand: rand, }, } } // EmptyFilterTable returns a Table with no rules and the filter table chains // mapped to HookUnset. func EmptyFilterTable() Table { return Table{ Rules: []Rule{}, BuiltinChains: [NumHooks]int{ Prerouting: HookUnset, Postrouting: HookUnset, }, Underflows: [NumHooks]int{ Prerouting: HookUnset, Postrouting: HookUnset, }, } } // EmptyNATTable returns a Table with no rules and the filter table chains // mapped to HookUnset. func EmptyNATTable() Table { return Table{ Rules: []Rule{}, BuiltinChains: [NumHooks]int{ Forward: HookUnset, }, Underflows: [NumHooks]int{ Forward: HookUnset, }, } } // GetTable returns a table with the given id and IP version. It panics when an // invalid id is provided. func (it *IPTables) GetTable(id TableID, ipv6 bool) Table { it.mu.RLock() defer it.mu.RUnlock() return it.getTableRLocked(id, ipv6) } // +checklocksread:it.mu func (it *IPTables) getTableRLocked(id TableID, ipv6 bool) Table { if ipv6 { return it.v6Tables[id] } return it.v4Tables[id] } // ReplaceTable replaces or inserts table by name. It panics when an invalid id // is provided. func (it *IPTables) ReplaceTable(id TableID, table Table, ipv6 bool) { it.replaceTable(id, table, ipv6, false /* force */) } // ForceReplaceTable replaces or inserts table by name. It panics when an invalid id // is provided. It enables iptables even when the inserted table is all // conditionless ACCEPT, skipping our optimization that disables iptables until // they're modified. func (it *IPTables) ForceReplaceTable(id TableID, table Table, ipv6 bool) { it.replaceTable(id, table, ipv6, true /* force */) } func (it *IPTables) replaceTable(id TableID, table Table, ipv6, force bool) { it.mu.Lock() defer it.mu.Unlock() // If iptables is being enabled, initialize the conntrack table and // reaper. 
if !it.modified { // Don't do anything if the table is identical. if ((ipv6 && reflect.DeepEqual(table, it.v6Tables[id])) || (!ipv6 && reflect.DeepEqual(table, it.v4Tables[id]))) && !force { return } it.connections.init() it.startReaper(reaperDelay) } it.modified = true if ipv6 { it.v6Tables[id] = table } else { it.v4Tables[id] = table } } // A chainVerdict is what a table decides should be done with a packet. type chainVerdict int const ( // chainAccept indicates the packet should continue through netstack. chainAccept chainVerdict = iota // chainDrop indicates the packet should be dropped. chainDrop // chainReturn indicates the packet should return to the calling chain // or the underflow rule of a builtin chain. chainReturn ) type checkTable struct { fn checkTableFn tableID TableID table Table } // shouldSkipOrPopulateTables returns true iff IPTables should be skipped. // // If IPTables should not be skipped, tables will be updated with the // specified table. // // This is called in the hot path even when iptables are disabled, so we ensure // it does not allocate. We check recursively for heap allocations, but not for: // - Stack splitting, which can allocate. // - Calls to interfaces, which can allocate. // - Calls to dynamic functions, which can allocate. // // +checkescape:hard func (it *IPTables) shouldSkipOrPopulateTables(tables []checkTable, pkt *PacketBuffer) bool { switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber, header.IPv6ProtocolNumber: default: // IPTables only supports IPv4/IPv6. return true } it.mu.RLock() defer it.mu.RUnlock() if !it.modified { // Many users never configure iptables. Spare them the cost of rule // traversal if rules have never been set. return true } for i := range tables { table := &tables[i] table.table = it.getTableRLocked(table.tableID, pkt.NetworkProtocolNumber == header.IPv6ProtocolNumber) } return false } // CheckPrerouting performs the prerouting hook on the packet. // // Returns true iff the packet may continue traversing the stack; the packet // must be dropped if false is returned. // // Precondition: The packet's network and transport header must be set. // // This is called in the hot path even when iptables are disabled, so we ensure // that it does not allocate. Note that called functions (e.g. // getConnAndUpdate) can allocate. // TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add. func (it *IPTables) CheckPrerouting(pkt *PacketBuffer, addressEP AddressableEndpoint, inNicName string) bool { tables := [...]checkTable{ { fn: check, tableID: MangleID, }, { fn: checkNAT, tableID: NATID, }, } if it.shouldSkipOrPopulateTables(tables[:], pkt) { return true } pkt.tuple = it.connections.getConnAndUpdate(pkt, false /* skipChecksumValidation */) for _, table := range tables { if !table.fn(it, table.table, Prerouting, pkt, nil /* route */, addressEP, inNicName, "" /* outNicName */) { return false } } return true } // CheckInput performs the input hook on the packet. // // Returns true iff the packet may continue traversing the stack; the packet // must be dropped if false is returned. // // Precondition: The packet's network and transport header must be set. // // This is called in the hot path even when iptables are disabled, so we ensure // that it does not allocate. Note that called functions (e.g. // getConnAndUpdate) can allocate. // TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add. 
func (it *IPTables) CheckInput(pkt *PacketBuffer, inNicName string) bool { tables := [...]checkTable{ { fn: checkNAT, tableID: NATID, }, { fn: check, tableID: FilterID, }, } if it.shouldSkipOrPopulateTables(tables[:], pkt) { return true } for _, table := range tables { if !table.fn(it, table.table, Input, pkt, nil /* route */, nil /* addressEP */, inNicName, "" /* outNicName */) { return false } } if t := pkt.tuple; t != nil { pkt.tuple = nil return t.conn.finalize() } return true } // CheckForward performs the forward hook on the packet. // // Returns true iff the packet may continue traversing the stack; the packet // must be dropped if false is returned. // // Precondition: The packet's network and transport header must be set. // // This is called in the hot path even when iptables are disabled, so we ensure // that it does not allocate. Note that called functions (e.g. // getConnAndUpdate) can allocate. // TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add. func (it *IPTables) CheckForward(pkt *PacketBuffer, inNicName, outNicName string) bool { tables := [...]checkTable{ { fn: check, tableID: FilterID, }, } if it.shouldSkipOrPopulateTables(tables[:], pkt) { return true } for _, table := range tables { if !table.fn(it, table.table, Forward, pkt, nil /* route */, nil /* addressEP */, inNicName, outNicName) { return false } } return true } // CheckOutput performs the output hook on the packet. // // Returns true iff the packet may continue traversing the stack; the packet // must be dropped if false is returned. // // Precondition: The packet's network and transport header must be set. // // This is called in the hot path even when iptables are disabled, so we ensure // that it does not allocate. Note that called functions (e.g. // getConnAndUpdate) can allocate. // TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add. func (it *IPTables) CheckOutput(pkt *PacketBuffer, r *Route, outNicName string) bool { tables := [...]checkTable{ { fn: check, tableID: MangleID, }, { fn: checkNAT, tableID: NATID, }, { fn: check, tableID: FilterID, }, } if it.shouldSkipOrPopulateTables(tables[:], pkt) { return true } // We don't need to validate the checksum in the Output path: we can assume // we calculate it correctly, plus checksumming may be deferred due to GSO. pkt.tuple = it.connections.getConnAndUpdate(pkt, true /* skipChecksumValidation */) for _, table := range tables { if !table.fn(it, table.table, Output, pkt, r, nil /* addressEP */, "" /* inNicName */, outNicName) { return false } } return true } // CheckPostrouting performs the postrouting hook on the packet. // // Returns true iff the packet may continue traversing the stack; the packet // must be dropped if false is returned. // // Precondition: The packet's network and transport header must be set. // // This is called in the hot path even when iptables are disabled, so we ensure // that it does not allocate. Note that called functions (e.g. // getConnAndUpdate) can allocate. // TODO(b/233951539): checkescape fails on arm sometimes. Fix and re-add. 
func (it *IPTables) CheckPostrouting(pkt *PacketBuffer, r *Route, addressEP AddressableEndpoint, outNicName string) bool { tables := [...]checkTable{ { fn: check, tableID: MangleID, }, { fn: checkNAT, tableID: NATID, }, } if it.shouldSkipOrPopulateTables(tables[:], pkt) { return true } for _, table := range tables { if !table.fn(it, table.table, Postrouting, pkt, r, addressEP, "" /* inNicName */, outNicName) { return false } } if t := pkt.tuple; t != nil { pkt.tuple = nil return t.conn.finalize() } return true } // Note: this used to omit the *IPTables parameter, but doing so caused // unnecessary allocations. type checkTableFn func(it *IPTables, table Table, hook Hook, pkt *PacketBuffer, r *Route, addressEP AddressableEndpoint, inNicName, outNicName string) bool func checkNAT(it *IPTables, table Table, hook Hook, pkt *PacketBuffer, r *Route, addressEP AddressableEndpoint, inNicName, outNicName string) bool { return it.checkNAT(table, hook, pkt, r, addressEP, inNicName, outNicName) } // checkNAT runs the packet through the NAT table. // // See check. func (it *IPTables) checkNAT(table Table, hook Hook, pkt *PacketBuffer, r *Route, addressEP AddressableEndpoint, inNicName, outNicName string) bool { t := pkt.tuple if t != nil && t.conn.handlePacket(pkt, hook, r) { return true } if !it.check(table, hook, pkt, r, addressEP, inNicName, outNicName) { return false } if t == nil { return true } dnat, natDone := func() (bool, bool) { switch hook { case Prerouting, Output: return true, pkt.dnatDone case Input, Postrouting: return false, pkt.snatDone case Forward: panic("should not attempt NAT in forwarding") default: panic(fmt.Sprintf("unhandled hook = %d", hook)) } }() // Make sure the connection is NATed. // // If the packet was already NATed, the connection must be NATed. if !natDone { t.conn.maybePerformNoopNAT(pkt, hook, r, dnat) } return true } func check(it *IPTables, table Table, hook Hook, pkt *PacketBuffer, r *Route, addressEP AddressableEndpoint, inNicName, outNicName string) bool { return it.check(table, hook, pkt, r, addressEP, inNicName, outNicName) } // check runs the packet through the rules in the specified table for the // hook. It returns true if the packet should continue to traverse through the // network stack or tables, or false when it must be dropped. // // Precondition: The packet's network and transport header must be set. func (it *IPTables) check(table Table, hook Hook, pkt *PacketBuffer, r *Route, addressEP AddressableEndpoint, inNicName, outNicName string) bool { ruleIdx := table.BuiltinChains[hook] switch verdict := it.checkChain(hook, pkt, table, ruleIdx, r, addressEP, inNicName, outNicName); verdict { // If the table returns Accept, move on to the next table. case chainAccept: return true // The Drop verdict is final. case chainDrop: return false case chainReturn: // Any Return from a built-in chain means we have to // call the underflow. underflow := table.Rules[table.Underflows[hook]] switch v, _ := underflow.Target.Action(pkt, hook, r, addressEP); v { case RuleAccept: return true case RuleDrop: return false case RuleJump, RuleReturn: panic("Underflows should only return RuleAccept or RuleDrop.") default: panic(fmt.Sprintf("Unknown verdict: %d", v)) } default: panic(fmt.Sprintf("Unknown verdict %v.", verdict)) } } // beforeSave is invoked by stateify. func (it *IPTables) beforeSave() { // Ensure the reaper exits cleanly. it.reaper.Stop() // Prevent others from modifying the connection table. it.connections.mu.Lock() } // afterLoad is invoked by stateify. 
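// Summary (added for clarity; it restates what the Check* hooks above already
// do). Each hook consults the builtin tables in the following order, and all
// of them return immediately when iptables has never been modified:
//
//	Prerouting:  mangle, nat   (also creates/looks up the conntrack tuple)
//	Input:       nat, filter   (finalizes the connection on success)
//	Forward:     filter
//	Output:      mangle, nat, filter (conntrack lookup skips checksum validation)
//	Postrouting: mangle, nat   (finalizes the connection on success)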
func (it *IPTables) afterLoad(context.Context) { it.startReaper(reaperDelay) } // startReaper periodically reaps timed out connections. func (it *IPTables) startReaper(interval time.Duration) { bucket := 0 it.reaper = it.connections.clock.AfterFunc(interval, func() { bucket, interval = it.connections.reapUnused(bucket, interval) it.reaper.Reset(interval) }) } // Preconditions: // - pkt is a IPv4 packet of at least length header.IPv4MinimumSize. // - pkt.NetworkHeader is not nil. func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, r *Route, addressEP AddressableEndpoint, inNicName, outNicName string) chainVerdict { // Start from ruleIdx and walk the list of rules until a rule gives us // a verdict. for ruleIdx < len(table.Rules) { switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, r, addressEP, inNicName, outNicName); verdict { case RuleAccept: return chainAccept case RuleDrop: return chainDrop case RuleReturn: return chainReturn case RuleJump: // "Jumping" to the next rule just means we're // continuing on down the list. if jumpTo == ruleIdx+1 { ruleIdx++ continue } switch verdict := it.checkChain(hook, pkt, table, jumpTo, r, addressEP, inNicName, outNicName); verdict { case chainAccept: return chainAccept case chainDrop: return chainDrop case chainReturn: ruleIdx++ continue default: panic(fmt.Sprintf("Unknown verdict: %d", verdict)) } default: panic(fmt.Sprintf("Unknown verdict: %d", verdict)) } } // We got through the entire table without a decision. Default to DROP // for safety. return chainDrop } // Preconditions: // - pkt is a IPv4 packet of at least length header.IPv4MinimumSize. // - pkt.NetworkHeader is not nil. // // * pkt is a IPv4 packet of at least length header.IPv4MinimumSize. // * pkt.NetworkHeader is not nil. func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, r *Route, addressEP AddressableEndpoint, inNicName, outNicName string) (RuleVerdict, int) { rule := table.Rules[ruleIdx] // Check whether the packet matches the IP header filter. if !rule.Filter.match(pkt, hook, inNicName, outNicName) { // Continue on to the next rule. return RuleJump, ruleIdx + 1 } // Go through each rule matcher. If they all match, run // the rule target. for _, matcher := range rule.Matchers { matches, hotdrop := matcher.Match(hook, pkt, inNicName, outNicName) if hotdrop { return RuleDrop, 0 } if !matches { // Continue on to the next rule. return RuleJump, ruleIdx + 1 } } // All the matchers matched, so run the target. return rule.Target.Action(pkt, hook, r, addressEP) } // OriginalDst returns the original destination of redirected connections. It // returns an error if the connection doesn't exist or isn't redirected. func (it *IPTables) OriginalDst(epID TransportEndpointID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber) (tcpip.Address, uint16, tcpip.Error) { it.mu.RLock() defer it.mu.RUnlock() if !it.modified { return tcpip.Address{}, 0, &tcpip.ErrNotConnected{} } return it.connections.originalDst(epID, netProto, transProto) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/iptables_mutex.go000066400000000000000000000046141465435605700252310ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type ipTablesRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. 
var ipTableslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type ipTableslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *ipTablesRWMutex) Lock() { locking.AddGLock(ipTablesprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *ipTablesRWMutex) NestedLock(i ipTableslockNameIndex) { locking.AddGLock(ipTablesprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *ipTablesRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(ipTablesprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *ipTablesRWMutex) NestedUnlock(i ipTableslockNameIndex) { m.mu.Unlock() locking.DelGLock(ipTablesprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *ipTablesRWMutex) RLock() { locking.AddGLock(ipTablesprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *ipTablesRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(ipTablesprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *ipTablesRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *ipTablesRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *ipTablesRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var ipTablesprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func ipTablesinitLockNames() {} func init() { ipTablesinitLockNames() ipTablesprefixIndex = locking.NewMutexClass(reflect.TypeOf(ipTablesRWMutex{}), ipTableslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/iptables_targets.go000066400000000000000000000364541465435605700255470ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "math" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) // AcceptTarget accepts packets. // // +stateify savable type AcceptTarget struct { // NetworkProtocol is the network protocol the target is used with. NetworkProtocol tcpip.NetworkProtocolNumber } // Action implements Target.Action. func (*AcceptTarget) Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { return RuleAccept, 0 } // DropTarget drops packets. // // +stateify savable type DropTarget struct { // NetworkProtocol is the network protocol the target is used with. 
NetworkProtocol tcpip.NetworkProtocolNumber } // Action implements Target.Action. func (*DropTarget) Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { return RuleDrop, 0 } // RejectIPv4WithHandler handles rejecting a packet. type RejectIPv4WithHandler interface { // SendRejectionError sends an error packet in response to the packet. SendRejectionError(pkt *PacketBuffer, rejectWith RejectIPv4WithICMPType, inputHook bool) tcpip.Error } // RejectIPv4WithICMPType indicates the type of ICMP error that should be sent. type RejectIPv4WithICMPType int // The types of errors that may be returned when rejecting IPv4 packets. const ( _ RejectIPv4WithICMPType = iota RejectIPv4WithICMPNetUnreachable RejectIPv4WithICMPHostUnreachable RejectIPv4WithICMPPortUnreachable RejectIPv4WithICMPNetProhibited RejectIPv4WithICMPHostProhibited RejectIPv4WithICMPAdminProhibited ) // RejectIPv4Target drops packets and sends back an error packet in response to the // matched packet. // // +stateify savable type RejectIPv4Target struct { Handler RejectIPv4WithHandler RejectWith RejectIPv4WithICMPType } // Action implements Target.Action. func (rt *RejectIPv4Target) Action(pkt *PacketBuffer, hook Hook, _ *Route, _ AddressableEndpoint) (RuleVerdict, int) { switch hook { case Input, Forward, Output: // There is nothing reasonable for us to do in response to an error here; // we already drop the packet. _ = rt.Handler.SendRejectionError(pkt, rt.RejectWith, hook == Input) return RuleDrop, 0 case Prerouting, Postrouting: panic(fmt.Sprintf("%s hook not supported for REDIRECT", hook)) default: panic(fmt.Sprintf("unhandled hook = %s", hook)) } } // RejectIPv6WithHandler handles rejecting a packet. type RejectIPv6WithHandler interface { // SendRejectionError sends an error packet in response to the packet. SendRejectionError(pkt *PacketBuffer, rejectWith RejectIPv6WithICMPType, forwardingHook bool) tcpip.Error } // RejectIPv6WithICMPType indicates the type of ICMP error that should be sent. type RejectIPv6WithICMPType int // The types of errors that may be returned when rejecting IPv6 packets. const ( _ RejectIPv6WithICMPType = iota RejectIPv6WithICMPNoRoute RejectIPv6WithICMPAddrUnreachable RejectIPv6WithICMPPortUnreachable RejectIPv6WithICMPAdminProhibited ) // RejectIPv6Target drops packets and sends back an error packet in response to the // matched packet. // // +stateify savable type RejectIPv6Target struct { Handler RejectIPv6WithHandler RejectWith RejectIPv6WithICMPType } // Action implements Target.Action. func (rt *RejectIPv6Target) Action(pkt *PacketBuffer, hook Hook, _ *Route, _ AddressableEndpoint) (RuleVerdict, int) { switch hook { case Input, Forward, Output: // There is nothing reasonable for us to do in response to an error here; // we already drop the packet. _ = rt.Handler.SendRejectionError(pkt, rt.RejectWith, hook == Input) return RuleDrop, 0 case Prerouting, Postrouting: panic(fmt.Sprintf("%s hook not supported for REDIRECT", hook)) default: panic(fmt.Sprintf("unhandled hook = %s", hook)) } } // ErrorTarget logs an error and drops the packet. It represents a target that // should be unreachable. // // +stateify savable type ErrorTarget struct { // NetworkProtocol is the network protocol the target is used with. NetworkProtocol tcpip.NetworkProtocolNumber } // Action implements Target.Action. 
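// A hypothetical sketch (not part of the upstream source): every target in
// this file exposes the same Action signature, so a custom target that counts
// the packets it sees and lets them through could look like the following.
// countTarget is an invented name, the Target interface itself is declared
// elsewhere in this package, and the counter assumes sync/atomic is imported.
//
//	type countTarget struct {
//		hits atomic.Uint64
//	}
//
//	// Action implements Target.Action.
//	func (ct *countTarget) Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) {
//		ct.hits.Add(1)
//		return RuleAccept, 0
//	}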
func (*ErrorTarget) Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { log.Debugf("ErrorTarget triggered.") return RuleDrop, 0 } // UserChainTarget marks a rule as the beginning of a user chain. // // +stateify savable type UserChainTarget struct { // Name is the chain name. Name string // NetworkProtocol is the network protocol the target is used with. NetworkProtocol tcpip.NetworkProtocolNumber } // Action implements Target.Action. func (*UserChainTarget) Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { panic("UserChainTarget should never be called.") } // ReturnTarget returns from the current chain. If the chain is a built-in, the // hook's underflow should be called. // // +stateify savable type ReturnTarget struct { // NetworkProtocol is the network protocol the target is used with. NetworkProtocol tcpip.NetworkProtocolNumber } // Action implements Target.Action. func (*ReturnTarget) Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) { return RuleReturn, 0 } // DNATTarget modifies the destination port/IP of packets. // // +stateify savable type DNATTarget struct { // The new destination address for packets. // // Immutable. Addr tcpip.Address // The new destination port for packets. // // Immutable. Port uint16 // NetworkProtocol is the network protocol the target is used with. // // Immutable. NetworkProtocol tcpip.NetworkProtocolNumber // ChangeAddress indicates whether we should check addresses. // // Immutable. ChangeAddress bool // ChangePort indicates whether we should check ports. // // Immutable. ChangePort bool } // Action implements Target.Action. func (rt *DNATTarget) Action(pkt *PacketBuffer, hook Hook, r *Route, addressEP AddressableEndpoint) (RuleVerdict, int) { // Sanity check. if rt.NetworkProtocol != pkt.NetworkProtocolNumber { panic(fmt.Sprintf( "DNATTarget.Action with NetworkProtocol %d called on packet with NetworkProtocolNumber %d", rt.NetworkProtocol, pkt.NetworkProtocolNumber)) } switch hook { case Prerouting, Output: case Input, Forward, Postrouting: panic(fmt.Sprintf("%s not supported for DNAT", hook)) default: panic(fmt.Sprintf("%s unrecognized", hook)) } return dnatAction(pkt, hook, r, rt.Port, rt.Addr, rt.ChangePort, rt.ChangeAddress) } // RedirectTarget redirects the packet to this machine by modifying the // destination port/IP. Outgoing packets are redirected to the loopback device, // and incoming packets are redirected to the incoming interface (rather than // forwarded). // // +stateify savable type RedirectTarget struct { // Port indicates port used to redirect. It is immutable. Port uint16 // NetworkProtocol is the network protocol the target is used with. It // is immutable. NetworkProtocol tcpip.NetworkProtocolNumber } // Action implements Target.Action. func (rt *RedirectTarget) Action(pkt *PacketBuffer, hook Hook, r *Route, addressEP AddressableEndpoint) (RuleVerdict, int) { // Sanity check. if rt.NetworkProtocol != pkt.NetworkProtocolNumber { panic(fmt.Sprintf( "RedirectTarget.Action with NetworkProtocol %d called on packet with NetworkProtocolNumber %d", rt.NetworkProtocol, pkt.NetworkProtocolNumber)) } // Change the address to loopback (127.0.0.1 or ::1) in Output and to // the primary address of the incoming interface in Prerouting. 
var address tcpip.Address switch hook { case Output: if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber { address = tcpip.AddrFrom4([4]byte{127, 0, 0, 1}) } else { address = header.IPv6Loopback } case Prerouting: // addressEP is expected to be set for the prerouting hook. address = addressEP.MainAddress().Address default: panic("redirect target is supported only on output and prerouting hooks") } return dnatAction(pkt, hook, r, rt.Port, address, true /* changePort */, true /* changeAddress */) } // SNATTarget modifies the source port/IP in the outgoing packets. // // +stateify savable type SNATTarget struct { Addr tcpip.Address Port uint16 // NetworkProtocol is the network protocol the target is used with. It // is immutable. NetworkProtocol tcpip.NetworkProtocolNumber // ChangeAddress indicates whether we should check addresses. // // Immutable. ChangeAddress bool // ChangePort indicates whether we should check ports. // // Immutable. ChangePort bool } func dnatAction(pkt *PacketBuffer, hook Hook, r *Route, port uint16, address tcpip.Address, changePort, changeAddress bool) (RuleVerdict, int) { return natAction(pkt, hook, r, portOrIdentRange{start: port, size: 1}, address, true /* dnat */, changePort, changeAddress) } func targetPortRangeForTCPAndUDP(originalSrcPort uint16) portOrIdentRange { // As per iptables(8), // // If no port range is specified, then source ports below 512 will be // mapped to other ports below 512: those between 512 and 1023 inclusive // will be mapped to ports below 1024, and other ports will be mapped to // 1024 or above. switch { case originalSrcPort < 512: return portOrIdentRange{start: 1, size: 511} case originalSrcPort < 1024: return portOrIdentRange{start: 1, size: 1023} default: return portOrIdentRange{start: 1024, size: math.MaxUint16 - 1023} } } func snatAction(pkt *PacketBuffer, hook Hook, r *Route, port uint16, address tcpip.Address, changePort, changeAddress bool) (RuleVerdict, int) { portsOrIdents := portOrIdentRange{start: port, size: 1} switch pkt.TransportProtocolNumber { case header.UDPProtocolNumber: if port == 0 { portsOrIdents = targetPortRangeForTCPAndUDP(header.UDP(pkt.TransportHeader().Slice()).SourcePort()) } case header.TCPProtocolNumber: if port == 0 { portsOrIdents = targetPortRangeForTCPAndUDP(header.TCP(pkt.TransportHeader().Slice()).SourcePort()) } case header.ICMPv4ProtocolNumber, header.ICMPv6ProtocolNumber: // Allow NAT-ing to any 16-bit value for ICMP's Ident field to match Linux // behaviour. // // https://github.com/torvalds/linux/blob/58e1100fdc5990b0cc0d4beaf2562a92e621ac7d/net/netfilter/nf_nat_core.c#L391 portsOrIdents = portOrIdentRange{start: 0, size: math.MaxUint16 + 1} } return natAction(pkt, hook, r, portsOrIdents, address, false /* dnat */, changePort, changeAddress) } func natAction(pkt *PacketBuffer, hook Hook, r *Route, portsOrIdents portOrIdentRange, address tcpip.Address, dnat, changePort, changeAddress bool) (RuleVerdict, int) { // Drop the packet if network and transport header are not set. if len(pkt.NetworkHeader().Slice()) == 0 || len(pkt.TransportHeader().Slice()) == 0 { return RuleDrop, 0 } if t := pkt.tuple; t != nil { t.conn.performNAT(pkt, hook, r, portsOrIdents, address, dnat, changePort, changeAddress) return RuleAccept, 0 } return RuleDrop, 0 } // Action implements Target.Action. func (st *SNATTarget) Action(pkt *PacketBuffer, hook Hook, r *Route, _ AddressableEndpoint) (RuleVerdict, int) { // Sanity check. 
if st.NetworkProtocol != pkt.NetworkProtocolNumber { panic(fmt.Sprintf( "SNATTarget.Action with NetworkProtocol %d called on packet with NetworkProtocolNumber %d", st.NetworkProtocol, pkt.NetworkProtocolNumber)) } switch hook { case Postrouting, Input: case Prerouting, Output, Forward: panic(fmt.Sprintf("%s not supported", hook)) default: panic(fmt.Sprintf("%s unrecognized", hook)) } return snatAction(pkt, hook, r, st.Port, st.Addr, st.ChangePort, st.ChangeAddress) } // MasqueradeTarget modifies the source port/IP in the outgoing packets. // // +stateify savable type MasqueradeTarget struct { // NetworkProtocol is the network protocol the target is used with. It // is immutable. NetworkProtocol tcpip.NetworkProtocolNumber } // Action implements Target.Action. func (mt *MasqueradeTarget) Action(pkt *PacketBuffer, hook Hook, r *Route, addressEP AddressableEndpoint) (RuleVerdict, int) { // Sanity check. if mt.NetworkProtocol != pkt.NetworkProtocolNumber { panic(fmt.Sprintf( "MasqueradeTarget.Action with NetworkProtocol %d called on packet with NetworkProtocolNumber %d", mt.NetworkProtocol, pkt.NetworkProtocolNumber)) } switch hook { case Postrouting: case Prerouting, Input, Forward, Output: panic(fmt.Sprintf("masquerade target is supported only on postrouting hook; hook = %d", hook)) default: panic(fmt.Sprintf("%s unrecognized", hook)) } // addressEP is expected to be set for the postrouting hook. ep := addressEP.AcquireOutgoingPrimaryAddress(pkt.Network().DestinationAddress(), tcpip.Address{} /* srcHint */, false /* allowExpired */) if ep == nil { // No address exists that we can use as a source address. return RuleDrop, 0 } address := ep.AddressWithPrefix().Address ep.DecRef() return snatAction(pkt, hook, r, 0 /* port */, address, true /* changePort */, true /* changeAddress */) } func rewritePacket(n header.Network, t header.Transport, updateSRCFields, fullChecksum, updatePseudoHeader bool, newPortOrIdent uint16, newAddr tcpip.Address) { switch t := t.(type) { case header.ChecksummableTransport: if updateSRCFields { if fullChecksum { t.SetSourcePortWithChecksumUpdate(newPortOrIdent) } else { t.SetSourcePort(newPortOrIdent) } } else { if fullChecksum { t.SetDestinationPortWithChecksumUpdate(newPortOrIdent) } else { t.SetDestinationPort(newPortOrIdent) } } if updatePseudoHeader { var oldAddr tcpip.Address if updateSRCFields { oldAddr = n.SourceAddress() } else { oldAddr = n.DestinationAddress() } t.UpdateChecksumPseudoHeaderAddress(oldAddr, newAddr, fullChecksum) } case header.ICMPv4: switch icmpType := t.Type(); icmpType { case header.ICMPv4Echo: if updateSRCFields { t.SetIdentWithChecksumUpdate(newPortOrIdent) } case header.ICMPv4EchoReply: if !updateSRCFields { t.SetIdentWithChecksumUpdate(newPortOrIdent) } default: panic(fmt.Sprintf("unexpected ICMPv4 type = %d", icmpType)) } case header.ICMPv6: switch icmpType := t.Type(); icmpType { case header.ICMPv6EchoRequest: if updateSRCFields { t.SetIdentWithChecksumUpdate(newPortOrIdent) } case header.ICMPv6EchoReply: if !updateSRCFields { t.SetIdentWithChecksumUpdate(newPortOrIdent) } default: panic(fmt.Sprintf("unexpected ICMPv4 type = %d", icmpType)) } var oldAddr tcpip.Address if updateSRCFields { oldAddr = n.SourceAddress() } else { oldAddr = n.DestinationAddress() } t.UpdateChecksumPseudoHeaderAddress(oldAddr, newAddr) default: panic(fmt.Sprintf("unhandled transport = %#v", t)) } if checksummableNetHeader, ok := n.(header.ChecksummableNetwork); ok { if updateSRCFields { checksummableNetHeader.SetSourceAddressWithChecksumUpdate(newAddr) } 
else { checksummableNetHeader.SetDestinationAddressWithChecksumUpdate(newAddr) } } else if updateSRCFields { n.SetSourceAddress(newAddr) } else { n.SetDestinationAddress(newAddr) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/iptables_types.go000066400000000000000000000273571465435605700252440ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "strings" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) // A Hook specifies one of the hooks built into the network stack. // // Userspace app Userspace app // ^ | // | v // [Input] [Output] // ^ | // | v // | routing // | | // | v // ----->[Prerouting]----->routing----->[Forward]---------[Postrouting]-----> type Hook uint const ( // Prerouting happens before a packet is routed to applications or to // be forwarded. Prerouting Hook = iota // Input happens before a packet reaches an application. Input // Forward happens once it's decided that a packet should be forwarded // to another host. Forward // Output happens after a packet is written by an application to be // sent out. Output // Postrouting happens just before a packet goes out on the wire. Postrouting // NumHooks is the total number of hooks. NumHooks ) // A RuleVerdict is what a rule decides should be done with a packet. type RuleVerdict int const ( // RuleAccept indicates the packet should continue through netstack. RuleAccept RuleVerdict = iota // RuleDrop indicates the packet should be dropped. RuleDrop // RuleJump indicates the packet should jump to another chain. RuleJump // RuleReturn indicates the packet should return to the previous chain. RuleReturn ) // IPTables holds all the tables for a netstack. // // +stateify savable type IPTables struct { connections ConnTrack reaper tcpip.Timer mu ipTablesRWMutex `state:"nosave"` // v4Tables and v6tables map tableIDs to tables. They hold builtin // tables only, not user tables. // // mu protects the array of tables, but not the tables themselves. // +checklocks:mu v4Tables [NumTables]Table // // mu protects the array of tables, but not the tables themselves. // +checklocks:mu v6Tables [NumTables]Table // modified is whether tables have been modified at least once. It is // used to elide the iptables performance overhead for workloads that // don't utilize iptables. // // +checklocks:mu modified bool } // Modified returns whether iptables has been modified. It is inherently racy // and intended for use only in tests. func (it *IPTables) Modified() bool { it.mu.Lock() defer it.mu.Unlock() return it.modified } // VisitTargets traverses all the targets of all tables and replaces each with // transform(target). 
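// A minimal, hypothetical sketch (not part of the upstream source) of the
// VisitTargets method defined below: the callback receives every installed
// target and returns the target to keep, so it can rewrite targets in place.
// Here it hypothetically downgrades every DropTarget to an AcceptTarget; ipt
// stands in for the stack's *IPTables.
//
//	ipt.VisitTargets(func(t Target) Target {
//		if d, ok := t.(*DropTarget); ok {
//			return &AcceptTarget{NetworkProtocol: d.NetworkProtocol}
//		}
//		return t
//	})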
func (it *IPTables) VisitTargets(transform func(Target) Target) { it.mu.Lock() defer it.mu.Unlock() for tid := range it.v4Tables { for i, rule := range it.v4Tables[tid].Rules { it.v4Tables[tid].Rules[i].Target = transform(rule.Target) } } for tid := range it.v6Tables { for i, rule := range it.v6Tables[tid].Rules { it.v6Tables[tid].Rules[i].Target = transform(rule.Target) } } } // A Table defines a set of chains and hooks into the network stack. // // It is a list of Rules, entry points (BuiltinChains), and error handlers // (Underflows). As packets traverse netstack, they hit hooks. When a packet // hits a hook, iptables compares it to Rules starting from that hook's entry // point. So if a packet hits the Input hook, we look up the corresponding // entry point in BuiltinChains and jump to that point. // // If the Rule doesn't match the packet, iptables continues to the next Rule. // If a Rule does match, it can issue a verdict on the packet (e.g. RuleAccept // or RuleDrop) that causes the packet to stop traversing iptables. It can also // jump to other rules or perform custom actions based on Rule.Target. // // Underflow Rules are invoked when a chain returns without reaching a verdict. // // +stateify savable type Table struct { // Rules holds the rules that make up the table. Rules []Rule // BuiltinChains maps builtin chains to their entrypoint rule in Rules. BuiltinChains [NumHooks]int // Underflows maps builtin chains to their underflow rule in Rules // (i.e. the rule to execute if the chain returns without a verdict). Underflows [NumHooks]int } // ValidHooks returns a bitmap of the builtin hooks for the given table. func (table *Table) ValidHooks() uint32 { hooks := uint32(0) for hook, ruleIdx := range table.BuiltinChains { if ruleIdx != HookUnset { hooks |= 1 << hook } } return hooks } // A Rule is a packet processing rule. It consists of two pieces. First it // contains zero or more matchers, each of which is a specification of which // packets this rule applies to. If there are no matchers in the rule, it // applies to any packet. // // +stateify savable type Rule struct { // Filter holds basic IP filtering fields common to every rule. Filter IPHeaderFilter // Matchers is the list of matchers for this rule. Matchers []Matcher // Target is the action to invoke if all the matchers match the packet. Target Target } // IPHeaderFilter performs basic IP header matching common to every rule. // // +stateify savable type IPHeaderFilter struct { // Protocol matches the transport protocol. Protocol tcpip.TransportProtocolNumber // CheckProtocol determines whether the Protocol field should be // checked during matching. CheckProtocol bool // Dst matches the destination IP address. Dst tcpip.Address // DstMask masks bits of the destination IP address when comparing with // Dst. DstMask tcpip.Address // DstInvert inverts the meaning of the destination IP check, i.e. when // true the filter will match packets that fail the destination // comparison. DstInvert bool // Src matches the source IP address. Src tcpip.Address // SrcMask masks bits of the source IP address when comparing with Src. SrcMask tcpip.Address // SrcInvert inverts the meaning of the source IP check, i.e. when true the // filter will match packets that fail the source comparison. SrcInvert bool // InputInterface matches the name of the incoming interface for the packet. InputInterface string // InputInterfaceMask masks the characters of the interface name when // comparing with InputInterface. 
InputInterfaceMask string // InputInterfaceInvert inverts the meaning of incoming interface check, // i.e. when true the filter will match packets that fail the incoming // interface comparison. InputInterfaceInvert bool // OutputInterface matches the name of the outgoing interface for the packet. OutputInterface string // OutputInterfaceMask masks the characters of the interface name when // comparing with OutputInterface. OutputInterfaceMask string // OutputInterfaceInvert inverts the meaning of outgoing interface check, // i.e. when true the filter will match packets that fail the outgoing // interface comparison. OutputInterfaceInvert bool } // EmptyFilter4 returns an initialized IPv4 header filter. func EmptyFilter4() IPHeaderFilter { return IPHeaderFilter{ Dst: tcpip.AddrFrom4([4]byte{}), DstMask: tcpip.AddrFrom4([4]byte{}), Src: tcpip.AddrFrom4([4]byte{}), SrcMask: tcpip.AddrFrom4([4]byte{}), } } // EmptyFilter6 returns an initialized IPv6 header filter. func EmptyFilter6() IPHeaderFilter { return IPHeaderFilter{ Dst: tcpip.AddrFrom16([16]byte{}), DstMask: tcpip.AddrFrom16([16]byte{}), Src: tcpip.AddrFrom16([16]byte{}), SrcMask: tcpip.AddrFrom16([16]byte{}), } } // match returns whether pkt matches the filter. // // Preconditions: pkt.NetworkHeader is set and is at least of the minimal IPv4 // or IPv6 header length. func (fl IPHeaderFilter) match(pkt *PacketBuffer, hook Hook, inNicName, outNicName string) bool { // Extract header fields. var ( transProto tcpip.TransportProtocolNumber dstAddr tcpip.Address srcAddr tcpip.Address ) switch proto := pkt.NetworkProtocolNumber; proto { case header.IPv4ProtocolNumber: hdr := header.IPv4(pkt.NetworkHeader().Slice()) transProto = hdr.TransportProtocol() dstAddr = hdr.DestinationAddress() srcAddr = hdr.SourceAddress() case header.IPv6ProtocolNumber: hdr := header.IPv6(pkt.NetworkHeader().Slice()) transProto = hdr.TransportProtocol() dstAddr = hdr.DestinationAddress() srcAddr = hdr.SourceAddress() default: panic(fmt.Sprintf("unknown network protocol with EtherType: %d", proto)) } // Check the transport protocol. if fl.CheckProtocol && fl.Protocol != transProto { return false } // Check the addresses. if !filterAddress(dstAddr, fl.DstMask, fl.Dst, fl.DstInvert) || !filterAddress(srcAddr, fl.SrcMask, fl.Src, fl.SrcInvert) { return false } switch hook { case Prerouting, Input: return matchIfName(inNicName, fl.InputInterface, fl.InputInterfaceInvert) case Output: return matchIfName(outNicName, fl.OutputInterface, fl.OutputInterfaceInvert) case Forward: if !matchIfName(inNicName, fl.InputInterface, fl.InputInterfaceInvert) { return false } if !matchIfName(outNicName, fl.OutputInterface, fl.OutputInterfaceInvert) { return false } return true case Postrouting: return true default: panic(fmt.Sprintf("unknown hook: %d", hook)) } } func matchIfName(nicName string, ifName string, invert bool) bool { n := len(ifName) if n == 0 { // If the interface name is omitted in the filter, any interface will match. return true } // If the interface name ends with '+', any interface which begins with the // name should be matched. var matches bool if strings.HasSuffix(ifName, "+") { matches = strings.HasPrefix(nicName, ifName[:n-1]) } else { matches = nicName == ifName } return matches != invert } // NetworkProtocol returns the protocol (IPv4 or IPv6) on to which the header // applies. 
func (fl IPHeaderFilter) NetworkProtocol() tcpip.NetworkProtocolNumber { switch fl.Src.BitLen() { case header.IPv4AddressSizeBits: return header.IPv4ProtocolNumber case header.IPv6AddressSizeBits: return header.IPv6ProtocolNumber } panic(fmt.Sprintf("invalid address in IPHeaderFilter: %s", fl.Src)) } // filterAddress returns whether addr matches the filter. func filterAddress(addr, mask, filterAddr tcpip.Address, invert bool) bool { matches := true addrBytes := addr.AsSlice() maskBytes := mask.AsSlice() filterBytes := filterAddr.AsSlice() for i := range filterAddr.AsSlice() { if addrBytes[i]&maskBytes[i] != filterBytes[i] { matches = false break } } return matches != invert } // A Matcher is the interface for matching packets. type Matcher interface { // Match returns whether the packet matches and whether the packet // should be "hotdropped", i.e. dropped immediately. This is usually // used for suspicious packets. // // Precondition: packet.NetworkHeader is set. Match(hook Hook, packet *PacketBuffer, inputInterfaceName, outputInterfaceName string) (matches bool, hotdrop bool) } // A Target is the interface for taking an action for a packet. type Target interface { // Action takes an action on the packet and returns a verdict on how // traversal should (or should not) continue. If the return value is // Jump, it also returns the index of the rule to jump to. Action(*PacketBuffer, Hook, *Route, AddressableEndpoint) (RuleVerdict, int) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/multi_port_endpoint_mutex.go000066400000000000000000000051661465435605700275270ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type multiPortEndpointRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var multiPortEndpointlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type multiPortEndpointlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *multiPortEndpointRWMutex) Lock() { locking.AddGLock(multiPortEndpointprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *multiPortEndpointRWMutex) NestedLock(i multiPortEndpointlockNameIndex) { locking.AddGLock(multiPortEndpointprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *multiPortEndpointRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(multiPortEndpointprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *multiPortEndpointRWMutex) NestedUnlock(i multiPortEndpointlockNameIndex) { m.mu.Unlock() locking.DelGLock(multiPortEndpointprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *multiPortEndpointRWMutex) RLock() { locking.AddGLock(multiPortEndpointprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *multiPortEndpointRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(multiPortEndpointprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. 
// +checklocksignore func (m *multiPortEndpointRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *multiPortEndpointRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *multiPortEndpointRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var multiPortEndpointprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func multiPortEndpointinitLockNames() {} func init() { multiPortEndpointinitLockNames() multiPortEndpointprefixIndex = locking.NewMutexClass(reflect.TypeOf(multiPortEndpointRWMutex{}), multiPortEndpointlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/neighbor_cache.go000066400000000000000000000237671465435605700251360ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" ) // NeighborCacheSize is the size of the neighborCache. Exceeding this size will // result in the least recently used entry being evicted. const NeighborCacheSize = 512 // max entries per interface // NeighborStats holds metrics for the neighbor table. type NeighborStats struct { // UnreachableEntryLookups counts the number of lookups performed on an // entry in Unreachable state. UnreachableEntryLookups *tcpip.StatCounter } // +stateify savable type dynamicCacheEntry struct { lru neighborEntryList // count tracks the amount of dynamic entries in the cache. This is // needed since static entries do not count towards the LRU cache // eviction strategy. count uint16 } // +stateify savable type neighborCacheMu struct { neighborCacheRWMutex `state:"nosave"` cache map[tcpip.Address]*neighborEntry dynamic dynamicCacheEntry } // neighborCache maps IP addresses to link addresses. It uses the Least // Recently Used (LRU) eviction strategy to implement a bounded cache for // dynamically acquired entries. It contains the state machine and configuration // for running Neighbor Unreachability Detection (NUD). // // There are two types of entries in the neighbor cache: // 1. Dynamic entries are discovered automatically by neighbor discovery // protocols (e.g. ARP, NDP). These protocols will attempt to reconfirm // reachability with the device once the entry's state becomes Stale. // 2. Static entries are explicitly added by a user and have no expiration. // Their state is always Static. The amount of static entries stored in the // cache is unbounded. // // +stateify savable type neighborCache struct { nic *nic state *NUDState linkRes LinkAddressResolver mu neighborCacheMu } // getOrCreateEntry retrieves a cache entry associated with addr. The // returned entry is always refreshed in the cache (it is reachable via the // map, and its place is bumped in LRU). // // If a matching entry exists in the cache, it is returned. 
If no matching // entry exists and the cache is full, an existing entry is evicted via LRU, // reset to state incomplete, and returned. If no matching entry exists and the // cache is not full, a new entry with state incomplete is allocated and // returned. func (n *neighborCache) getOrCreateEntry(remoteAddr tcpip.Address) *neighborEntry { n.mu.Lock() defer n.mu.Unlock() if entry, ok := n.mu.cache[remoteAddr]; ok { entry.mu.RLock() if entry.mu.neigh.State != Static { n.mu.dynamic.lru.Remove(entry) n.mu.dynamic.lru.PushFront(entry) } entry.mu.RUnlock() return entry } // The entry that needs to be created must be dynamic since all static // entries are directly added to the cache via addStaticEntry. entry := newNeighborEntry(n, remoteAddr, n.state) if n.mu.dynamic.count == NeighborCacheSize { e := n.mu.dynamic.lru.Back() e.mu.Lock() delete(n.mu.cache, e.mu.neigh.Addr) n.mu.dynamic.lru.Remove(e) n.mu.dynamic.count-- e.removeLocked() e.mu.Unlock() } n.mu.cache[remoteAddr] = entry n.mu.dynamic.lru.PushFront(entry) n.mu.dynamic.count++ return entry } // entry looks up neighbor information matching the remote address, and returns // it if readily available. // // Returns ErrWouldBlock if the link address is not readily available, along // with a notification channel for the caller to block on. Triggers address // resolution asynchronously. // // If onResolve is provided, it will be called either immediately, if resolution // is not required, or when address resolution is complete, with the resolved // link address and whether resolution succeeded. After any callbacks have been // called, the returned notification channel is closed. // // NB: if a callback is provided, it should not call into the neighbor cache. // // If specified, the local address must be an address local to the interface the // neighbor cache belongs to. The local address is the source address of a // packet prompting NUD/link address resolution. func (n *neighborCache) entry(remoteAddr, localAddr tcpip.Address, onResolve func(LinkResolutionResult)) (*neighborEntry, <-chan struct{}, tcpip.Error) { entry := n.getOrCreateEntry(remoteAddr) entry.mu.Lock() defer entry.mu.Unlock() switch s := entry.mu.neigh.State; s { case Stale: entry.handlePacketQueuedLocked(localAddr) fallthrough case Reachable, Static, Delay, Probe: // As per RFC 4861 section 7.3.3: // "Neighbor Unreachability Detection operates in parallel with the sending // of packets to a neighbor. While reasserting a neighbor's reachability, // a node continues sending packets to that neighbor using the cached // link-layer address." if onResolve != nil { onResolve(LinkResolutionResult{LinkAddress: entry.mu.neigh.LinkAddr, Err: nil}) } return entry, nil, nil case Unknown, Incomplete, Unreachable: if onResolve != nil { entry.mu.onResolve = append(entry.mu.onResolve, onResolve) } if entry.mu.done == nil { // Address resolution needs to be initiated. entry.mu.done = make(chan struct{}) } entry.handlePacketQueuedLocked(localAddr) return entry, entry.mu.done, &tcpip.ErrWouldBlock{} default: panic(fmt.Sprintf("Invalid cache entry state: %s", s)) } } // entries returns all entries in the neighbor cache. func (n *neighborCache) entries() []NeighborEntry { n.mu.RLock() defer n.mu.RUnlock() entries := make([]NeighborEntry, 0, len(n.mu.cache)) for _, entry := range n.mu.cache { entry.mu.RLock() entries = append(entries, entry.mu.neigh) entry.mu.RUnlock() } return entries } // addStaticEntry adds a static entry to the neighbor cache, mapping an IP // address to a link address. 
If a dynamic entry exists in the neighbor cache // with the same address, it will be replaced with this static entry. If a // static entry exists with the same address but different link address, it // will be updated with the new link address. If a static entry exists with the // same address and link address, nothing will happen. func (n *neighborCache) addStaticEntry(addr tcpip.Address, linkAddr tcpip.LinkAddress) { n.mu.Lock() defer n.mu.Unlock() if entry, ok := n.mu.cache[addr]; ok { entry.mu.Lock() if entry.mu.neigh.State != Static { // Dynamic entry found with the same address. n.mu.dynamic.lru.Remove(entry) n.mu.dynamic.count-- } else if entry.mu.neigh.LinkAddr == linkAddr { // Static entry found with the same address and link address. entry.mu.Unlock() return } else { // Static entry found with the same address but different link address. entry.mu.neigh.LinkAddr = linkAddr entry.dispatchChangeEventLocked() entry.mu.Unlock() return } entry.removeLocked() entry.mu.Unlock() } entry := newStaticNeighborEntry(n, addr, linkAddr, n.state) n.mu.cache[addr] = entry entry.mu.Lock() defer entry.mu.Unlock() entry.dispatchAddEventLocked() } // removeEntry removes a dynamic or static entry by address from the neighbor // cache. Returns true if the entry was found and deleted. func (n *neighborCache) removeEntry(addr tcpip.Address) bool { n.mu.Lock() defer n.mu.Unlock() entry, ok := n.mu.cache[addr] if !ok { return false } entry.mu.Lock() defer entry.mu.Unlock() if entry.mu.neigh.State != Static { n.mu.dynamic.lru.Remove(entry) n.mu.dynamic.count-- } entry.removeLocked() delete(n.mu.cache, entry.mu.neigh.Addr) return true } // clear removes all dynamic and static entries from the neighbor cache. func (n *neighborCache) clear() { n.mu.Lock() defer n.mu.Unlock() for _, entry := range n.mu.cache { entry.mu.Lock() entry.removeLocked() entry.mu.Unlock() } n.mu.dynamic.lru = neighborEntryList{} clear(n.mu.cache) n.mu.dynamic.count = 0 } // config returns the NUD configuration. func (n *neighborCache) config() NUDConfigurations { return n.state.Config() } // setConfig changes the NUD configuration. // // If config contains invalid NUD configuration values, it will be fixed to // use default values for the erroneous values. func (n *neighborCache) setConfig(config NUDConfigurations) { config.resetInvalidFields() n.state.SetConfig(config) } // handleProbe handles a neighbor probe as defined by RFC 4861 section 7.2.3. // // Validation of the probe is expected to be handled by the caller. func (n *neighborCache) handleProbe(remoteAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) { entry := n.getOrCreateEntry(remoteAddr) entry.mu.Lock() entry.handleProbeLocked(remoteLinkAddr) entry.mu.Unlock() } // handleConfirmation handles a neighbor confirmation as defined by // RFC 4861 section 7.2.5. // // Validation of the confirmation is expected to be handled by the caller. func (n *neighborCache) handleConfirmation(addr tcpip.Address, linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) { n.mu.RLock() entry, ok := n.mu.cache[addr] n.mu.RUnlock() if ok { entry.mu.Lock() entry.handleConfirmationLocked(linkAddr, flags) entry.mu.Unlock() } else { // The confirmation SHOULD be silently discarded if the recipient did not // initiate any communication with the target. This is indicated if there is // no matching entry for the remote address. 
n.nic.stats.neighbor.droppedConfirmationForNoninitiatedNeighbor.Increment() } } func (n *neighborCache) init(nic *nic, r LinkAddressResolver) { *n = neighborCache{ nic: nic, state: NewNUDState(nic.stack.nudConfigs, nic.stack.clock, nic.stack.insecureRNG), linkRes: r, } n.mu.Lock() n.mu.cache = make(map[tcpip.Address]*neighborEntry, NeighborCacheSize) n.mu.Unlock() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/neighbor_cache_mutex.go000066400000000000000000000050161465435605700263430ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type neighborCacheRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var neighborCachelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type neighborCachelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *neighborCacheRWMutex) Lock() { locking.AddGLock(neighborCacheprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *neighborCacheRWMutex) NestedLock(i neighborCachelockNameIndex) { locking.AddGLock(neighborCacheprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *neighborCacheRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(neighborCacheprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *neighborCacheRWMutex) NestedUnlock(i neighborCachelockNameIndex) { m.mu.Unlock() locking.DelGLock(neighborCacheprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *neighborCacheRWMutex) RLock() { locking.AddGLock(neighborCacheprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *neighborCacheRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(neighborCacheprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *neighborCacheRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *neighborCacheRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *neighborCacheRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var neighborCacheprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func neighborCacheinitLockNames() {} func init() { neighborCacheinitLockNames() neighborCacheprefixIndex = locking.NewMutexClass(reflect.TypeOf(neighborCacheRWMutex{}), neighborCachelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/neighbor_entry.go000066400000000000000000000470201465435605700252200ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) const ( // immediateDuration is a duration of zero for scheduling work that needs to // be done immediately but asynchronously to avoid deadlock. immediateDuration time.Duration = 0 ) // NeighborEntry describes a neighboring device in the local network. type NeighborEntry struct { Addr tcpip.Address LinkAddr tcpip.LinkAddress State NeighborState UpdatedAt tcpip.MonotonicTime } // NeighborState defines the state of a NeighborEntry within the Neighbor // Unreachability Detection state machine, as per RFC 4861 section 7.3.2 and // RFC 7048. type NeighborState uint8 const ( // Unknown means reachability has not been verified yet. This is the initial // state of entries that have been created automatically by the Neighbor // Unreachability Detection state machine. Unknown NeighborState = iota // Incomplete means that there is an outstanding request to resolve the // address. Incomplete // Reachable means the path to the neighbor is functioning properly for both // receive and transmit paths. Reachable // Stale means reachability to the neighbor is unknown, but packets are still // able to be transmitted to the possibly stale link address. Stale // Delay means reachability to the neighbor is unknown and pending // confirmation from an upper-level protocol like TCP, but packets are still // able to be transmitted to the possibly stale link address. Delay // Probe means a reachability confirmation is actively being sought by // periodically retransmitting reachability probes until a reachability // confirmation is received, or until the maximum number of probes has been // sent. Probe // Static describes entries that have been explicitly added by the user. They // do not expire and are not deleted until explicitly removed. Static // Unreachable means reachability confirmation failed; the maximum number of // reachability probes has been sent and no replies have been received. // // TODO(gvisor.dev/issue/5472): Add the following sentence when we implement // RFC 7048: "Packets continue to be sent to the neighbor while // re-attempting to resolve the address." Unreachable ) type timer struct { // done indicates to the timer that the timer was stopped. done *bool timer tcpip.Timer } // neighborEntry implements a neighbor entry's individual node behavior, as per // RFC 4861 section 7.3.3. Neighbor Unreachability Detection operates in // parallel with the sending of packets to a neighbor, necessitating the // entry's lock to be acquired for all operations. type neighborEntry struct { neighborEntryEntry cache *neighborCache // nudState points to the Neighbor Unreachability Detection configuration. nudState *NUDState mu struct { neighborEntryRWMutex neigh NeighborEntry // done is closed when address resolution is complete. It is nil iff s is // incomplete and resolution is not yet in progress. done chan struct{} // onResolve is called with the result of address resolution. 
onResolve []func(LinkResolutionResult) isRouter bool timer timer } } // newNeighborEntry creates a neighbor cache entry starting at the default // state, Unknown. Transition out of Unknown by calling either // `handlePacketQueuedLocked` or `handleProbeLocked` on the newly created // neighborEntry. func newNeighborEntry(cache *neighborCache, remoteAddr tcpip.Address, nudState *NUDState) *neighborEntry { n := &neighborEntry{ cache: cache, nudState: nudState, } n.mu.Lock() n.mu.neigh = NeighborEntry{ Addr: remoteAddr, State: Unknown, } n.mu.Unlock() return n } // newStaticNeighborEntry creates a neighbor cache entry starting at the // Static state. The entry can only transition out of Static by directly // calling `setStateLocked`. func newStaticNeighborEntry(cache *neighborCache, addr tcpip.Address, linkAddr tcpip.LinkAddress, state *NUDState) *neighborEntry { entry := NeighborEntry{ Addr: addr, LinkAddr: linkAddr, State: Static, UpdatedAt: cache.nic.stack.clock.NowMonotonic(), } n := &neighborEntry{ cache: cache, nudState: state, } n.mu.Lock() n.mu.neigh = entry n.mu.Unlock() return n } // notifyCompletionLocked notifies those waiting for address resolution, with // the link address if resolution completed successfully. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) notifyCompletionLocked(err tcpip.Error) { res := LinkResolutionResult{LinkAddress: e.mu.neigh.LinkAddr, Err: err} for _, callback := range e.mu.onResolve { callback(res) } e.mu.onResolve = nil if ch := e.mu.done; ch != nil { close(ch) e.mu.done = nil // Dequeue the pending packets asynchronously to not hold up the current // goroutine as writing packets may be a costly operation. // // At the time of writing, when writing packets, a neighbor's link address // is resolved (which ends up obtaining the entry's lock) while holding the // link resolution queue's lock. Dequeuing packets asynchronously avoids a // lock ordering violation. // // NB: this is equivalent to spawning a goroutine directly using the go // keyword but allows tests that use manual clocks to deterministically // wait for this work to complete. e.cache.nic.stack.clock.AfterFunc(0, func() { e.cache.nic.linkResQueue.dequeue(ch, e.mu.neigh.LinkAddr, err) }) } } // dispatchAddEventLocked signals to stack's NUD Dispatcher that the entry has // been added. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) dispatchAddEventLocked() { if nudDisp := e.cache.nic.stack.nudDisp; nudDisp != nil { nudDisp.OnNeighborAdded(e.cache.nic.id, e.mu.neigh) } } // dispatchChangeEventLocked signals to stack's NUD Dispatcher that the entry // has changed state or link-layer address. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) dispatchChangeEventLocked() { if nudDisp := e.cache.nic.stack.nudDisp; nudDisp != nil { nudDisp.OnNeighborChanged(e.cache.nic.id, e.mu.neigh) } } // dispatchRemoveEventLocked signals to stack's NUD Dispatcher that the entry // has been removed. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) dispatchRemoveEventLocked() { if nudDisp := e.cache.nic.stack.nudDisp; nudDisp != nil { nudDisp.OnNeighborRemoved(e.cache.nic.id, e.mu.neigh) } } // cancelTimerLocked cancels the currently scheduled action, if there is one. // Entries in Unknown, Stale, or Static state do not have a scheduled action. // // Precondition: e.mu MUST be locked. 
func (e *neighborEntry) cancelTimerLocked() { if e.mu.timer.timer != nil { e.mu.timer.timer.Stop() *e.mu.timer.done = true e.mu.timer = timer{} } } // removeLocked prepares the entry for removal. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) removeLocked() { e.mu.neigh.UpdatedAt = e.cache.nic.stack.clock.NowMonotonic() e.dispatchRemoveEventLocked() // Set state to unknown to invalidate this entry if it's cached in a Route. e.setStateLocked(Unknown) e.cancelTimerLocked() // TODO(https://gvisor.dev/issues/5583): test the case where this function is // called during resolution; that can happen in at least these scenarios: // // - manual address removal during resolution // // - neighbor cache eviction during resolution e.notifyCompletionLocked(&tcpip.ErrAborted{}) } // setStateLocked transitions the entry to the specified state immediately. // // Follows the logic defined in RFC 4861 section 7.3.3. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) setStateLocked(next NeighborState) { e.cancelTimerLocked() prev := e.mu.neigh.State e.mu.neigh.State = next e.mu.neigh.UpdatedAt = e.cache.nic.stack.clock.NowMonotonic() config := e.nudState.Config() switch next { case Incomplete: panic(fmt.Sprintf("should never transition to Incomplete with setStateLocked; neigh = %#v, prev state = %s", e.mu.neigh, prev)) case Reachable: // Protected by e.mu. done := false e.mu.timer = timer{ done: &done, timer: e.cache.nic.stack.Clock().AfterFunc(e.nudState.ReachableTime(), func() { e.mu.Lock() defer e.mu.Unlock() if done { // The timer was stopped because the entry changed state. return } e.setStateLocked(Stale) e.dispatchChangeEventLocked() }), } case Delay: // Protected by e.mu. done := false e.mu.timer = timer{ done: &done, timer: e.cache.nic.stack.Clock().AfterFunc(config.DelayFirstProbeTime, func() { e.mu.Lock() defer e.mu.Unlock() if done { // The timer was stopped because the entry changed state. return } e.setStateLocked(Probe) e.dispatchChangeEventLocked() }), } case Probe: // Protected by e.mu. done := false remaining := config.MaxUnicastProbes addr := e.mu.neigh.Addr linkAddr := e.mu.neigh.LinkAddr // Send a probe in another goroutine to free this thread of execution // for finishing the state transition. This is necessary to escape the // currently held lock so we can send the probe message without holding // a shared lock. e.mu.timer = timer{ done: &done, timer: e.cache.nic.stack.Clock().AfterFunc(immediateDuration, func() { var err tcpip.Error = &tcpip.ErrTimeout{} if remaining != 0 { err = e.cache.linkRes.LinkAddressRequest(addr, tcpip.Address{} /* localAddr */, linkAddr) } e.mu.Lock() defer e.mu.Unlock() if done { // The timer was stopped because the entry changed state. return } if err != nil { e.setStateLocked(Unreachable) e.notifyCompletionLocked(err) e.dispatchChangeEventLocked() return } remaining-- e.mu.timer.timer.Reset(config.RetransmitTimer) }), } case Unreachable: case Unknown, Stale, Static: // Do nothing default: panic(fmt.Sprintf("Invalid state transition from %q to %q", prev, next)) } } // handlePacketQueuedLocked advances the state machine according to a packet // being queued for outgoing transmission. // // Follows the logic defined in RFC 4861 section 7.3.3. // // Precondition: e.mu MUST be locked. 
func (e *neighborEntry) handlePacketQueuedLocked(localAddr tcpip.Address) { switch e.mu.neigh.State { case Unknown, Unreachable: prev := e.mu.neigh.State e.mu.neigh.State = Incomplete e.mu.neigh.UpdatedAt = e.cache.nic.stack.clock.NowMonotonic() switch prev { case Unknown: e.dispatchAddEventLocked() case Unreachable: e.dispatchChangeEventLocked() e.cache.nic.stats.neighbor.unreachableEntryLookups.Increment() } config := e.nudState.Config() // Protected by e.mu. done := false remaining := config.MaxMulticastProbes addr := e.mu.neigh.Addr // Send a probe in another goroutine to free this thread of execution // for finishing the state transition. This is necessary to escape the // currently held lock so we can send the probe message without holding // a shared lock. e.mu.timer = timer{ done: &done, timer: e.cache.nic.stack.Clock().AfterFunc(immediateDuration, func() { var err tcpip.Error = &tcpip.ErrTimeout{} if remaining != 0 { // As per RFC 4861 section 7.2.2: // // If the source address of the packet prompting the solicitation is // the same as one of the addresses assigned to the outgoing interface, // that address SHOULD be placed in the IP Source Address of the // outgoing solicitation. // err = e.cache.linkRes.LinkAddressRequest(addr, localAddr, "" /* linkAddr */) } e.mu.Lock() defer e.mu.Unlock() if done { // The timer was stopped because the entry changed state. return } if err != nil { e.setStateLocked(Unreachable) e.notifyCompletionLocked(err) e.dispatchChangeEventLocked() return } remaining-- e.mu.timer.timer.Reset(config.RetransmitTimer) }), } case Stale: e.setStateLocked(Delay) e.dispatchChangeEventLocked() case Incomplete, Reachable, Delay, Probe, Static: // Do nothing default: panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State)) } } // handleProbeLocked processes an incoming neighbor probe (e.g. ARP request or // Neighbor Solicitation for ARP or NDP, respectively). // // Follows the logic defined in RFC 4861 section 7.2.3. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) handleProbeLocked(remoteLinkAddr tcpip.LinkAddress) { // Probes MUST be silently discarded if the target address is tentative, does // not exist, or not bound to the NIC as per RFC 4861 section 7.2.3. These // checks MUST be done by the NetworkEndpoint. switch e.mu.neigh.State { case Unknown: e.mu.neigh.LinkAddr = remoteLinkAddr e.setStateLocked(Stale) e.dispatchAddEventLocked() case Incomplete: // "If an entry already exists, and the cached link-layer address // differs from the one in the received Source Link-Layer option, the // cached address should be replaced by the received address, and the // entry's reachability state MUST be set to STALE." // - RFC 4861 section 7.2.3 e.mu.neigh.LinkAddr = remoteLinkAddr e.setStateLocked(Stale) e.notifyCompletionLocked(nil) e.dispatchChangeEventLocked() case Reachable, Delay, Probe: if e.mu.neigh.LinkAddr != remoteLinkAddr { e.mu.neigh.LinkAddr = remoteLinkAddr e.setStateLocked(Stale) e.dispatchChangeEventLocked() } case Stale: if e.mu.neigh.LinkAddr != remoteLinkAddr { e.mu.neigh.LinkAddr = remoteLinkAddr e.dispatchChangeEventLocked() } case Unreachable: // TODO(gvisor.dev/issue/5472): Do not change the entry if the link // address is the same, as per RFC 7048. 
e.mu.neigh.LinkAddr = remoteLinkAddr e.setStateLocked(Stale) e.dispatchChangeEventLocked() case Static: // Do nothing default: panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State)) } } // handleConfirmationLocked processes an incoming neighbor confirmation // (e.g. ARP reply or Neighbor Advertisement for ARP or NDP, respectively). // // Follows the state machine defined by RFC 4861 section 7.2.5. // // TODO(gvisor.dev/issue/2277): To protect against ARP poisoning and other // attacks against NDP functions, Secure Neighbor Discovery (SEND) Protocol // should be deployed where preventing access to the broadcast segment might // not be possible. SEND uses RSA key pairs to produce Cryptographically // Generated Addresses (CGA), as defined in RFC 3972. This ensures that the // claimed source of an NDP message is the owner of the claimed address. // // Precondition: e.mu MUST be locked. func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) { switch e.mu.neigh.State { case Incomplete: if len(linkAddr) == 0 { // "If the link layer has addresses and no Target Link-Layer Address // option is included, the receiving node SHOULD silently discard the // received advertisement." - RFC 4861 section 7.2.5 e.cache.nic.stats.neighbor.droppedInvalidLinkAddressConfirmations.Increment() break } e.mu.neigh.LinkAddr = linkAddr if flags.Solicited { e.setStateLocked(Reachable) } else { e.setStateLocked(Stale) } e.dispatchChangeEventLocked() e.mu.isRouter = flags.IsRouter e.notifyCompletionLocked(nil) // "Note that the Override flag is ignored if the entry is in the // INCOMPLETE state." - RFC 4861 section 7.2.5 case Reachable, Stale, Delay, Probe: isLinkAddrDifferent := len(linkAddr) != 0 && e.mu.neigh.LinkAddr != linkAddr if isLinkAddrDifferent { if !flags.Override { if e.mu.neigh.State == Reachable { e.setStateLocked(Stale) e.dispatchChangeEventLocked() } break } e.mu.neigh.LinkAddr = linkAddr if !flags.Solicited { if e.mu.neigh.State != Stale { e.setStateLocked(Stale) e.dispatchChangeEventLocked() } else { // Notify the LinkAddr change, even though NUD state hasn't changed. e.dispatchChangeEventLocked() } break } } if flags.Solicited && (flags.Override || !isLinkAddrDifferent) { wasReachable := e.mu.neigh.State == Reachable // Set state to Reachable again to refresh timers. e.setStateLocked(Reachable) e.notifyCompletionLocked(nil) if !wasReachable { e.dispatchChangeEventLocked() } } if e.mu.isRouter && !flags.IsRouter && header.IsV6UnicastAddress(e.mu.neigh.Addr) { // "In those cases where the IsRouter flag changes from TRUE to FALSE as // a result of this update, the node MUST remove that router from the // Default Router List and update the Destination Cache entries for all // destinations using that neighbor as a router as specified in Section // 7.3.3. This is needed to detect when a node that is used as a router // stops forwarding packets due to being configured as a host." // - RFC 4861 section 7.2.5 // // TODO(gvisor.dev/issue/4085): Remove the special casing we do for IPv6 // here. 
ep := e.cache.nic.getNetworkEndpoint(header.IPv6ProtocolNumber) if ep == nil { panic(fmt.Sprintf("have a neighbor entry for an IPv6 router but no IPv6 network endpoint")) } if ndpEP, ok := ep.(NDPEndpoint); ok { ndpEP.InvalidateDefaultRouter(e.mu.neigh.Addr) } } e.mu.isRouter = flags.IsRouter case Unknown, Unreachable, Static: // Do nothing default: panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State)) } } // handleUpperLevelConfirmation processes an incoming upper-level protocol // (e.g. TCP acknowledgements) reachability confirmation. func (e *neighborEntry) handleUpperLevelConfirmation() { tryHandleConfirmation := func() bool { switch e.mu.neigh.State { case Stale, Delay, Probe: return true case Reachable: // Avoid setStateLocked; Timer.Reset is cheaper. // // Note that setting the timer does not need to be protected by the // entry's write lock since we do not modify the timer pointer, but the // time the timer should fire. The timer should have internal locks to // synchronize timer resets changes with the clock. e.mu.timer.timer.Reset(e.nudState.ReachableTime()) return false case Unknown, Incomplete, Unreachable, Static: // Do nothing return false default: panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State)) } } e.mu.RLock() needsTransition := tryHandleConfirmation() e.mu.RUnlock() if !needsTransition { return } // We need to transition the neighbor to Reachable so take the write lock and // perform the transition, but only if we still need the transition since the // state could have changed since we dropped the read lock above. e.mu.Lock() defer e.mu.Unlock() if needsTransition := tryHandleConfirmation(); needsTransition { e.setStateLocked(Reachable) e.dispatchChangeEventLocked() } } // getRemoteLinkAddress returns the entry's link address and whether that link // address is valid. func (e *neighborEntry) getRemoteLinkAddress() (tcpip.LinkAddress, bool) { e.mu.RLock() defer e.mu.RUnlock() switch e.mu.neigh.State { case Reachable, Static, Delay, Probe: return e.mu.neigh.LinkAddr, true case Unknown, Incomplete, Unreachable, Stale: return "", false default: panic(fmt.Sprintf("invalid state for neighbor entry %v: %v", e.mu.neigh, e.mu.neigh.State)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/neighbor_entry_list.go000066400000000000000000000126221465435605700262530ustar00rootroot00000000000000package stack // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type neighborEntryElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (neighborEntryElementMapper) linkerFor(elem *neighborEntry) *neighborEntry { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type neighborEntryList struct { head *neighborEntry tail *neighborEntry } // Reset resets list l to the empty state. func (l *neighborEntryList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. 
// //go:nosplit func (l *neighborEntryList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *neighborEntryList) Front() *neighborEntry { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *neighborEntryList) Back() *neighborEntry { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *neighborEntryList) Len() (count int) { for e := l.Front(); e != nil; e = (neighborEntryElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *neighborEntryList) PushFront(e *neighborEntry) { linker := neighborEntryElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { neighborEntryElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *neighborEntryList) PushFrontList(m *neighborEntryList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { neighborEntryElementMapper{}.linkerFor(l.head).SetPrev(m.tail) neighborEntryElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *neighborEntryList) PushBack(e *neighborEntry) { linker := neighborEntryElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { neighborEntryElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *neighborEntryList) PushBackList(m *neighborEntryList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { neighborEntryElementMapper{}.linkerFor(l.tail).SetNext(m.head) neighborEntryElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *neighborEntryList) InsertAfter(b, e *neighborEntry) { bLinker := neighborEntryElementMapper{}.linkerFor(b) eLinker := neighborEntryElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { neighborEntryElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *neighborEntryList) InsertBefore(a, e *neighborEntry) { aLinker := neighborEntryElementMapper{}.linkerFor(a) eLinker := neighborEntryElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { neighborEntryElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *neighborEntryList) Remove(e *neighborEntry) { linker := neighborEntryElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { neighborEntryElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { neighborEntryElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. 
// // +stateify savable type neighborEntryEntry struct { next *neighborEntry prev *neighborEntry } // Next returns the entry that follows e in the list. // //go:nosplit func (e *neighborEntryEntry) Next() *neighborEntry { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *neighborEntryEntry) Prev() *neighborEntry { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *neighborEntryEntry) SetNext(elem *neighborEntry) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *neighborEntryEntry) SetPrev(elem *neighborEntry) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/neighbor_entry_mutex.go000066400000000000000000000050161465435605700264410ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type neighborEntryRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var neighborEntrylockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type neighborEntrylockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *neighborEntryRWMutex) Lock() { locking.AddGLock(neighborEntryprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *neighborEntryRWMutex) NestedLock(i neighborEntrylockNameIndex) { locking.AddGLock(neighborEntryprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *neighborEntryRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(neighborEntryprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *neighborEntryRWMutex) NestedUnlock(i neighborEntrylockNameIndex) { m.mu.Unlock() locking.DelGLock(neighborEntryprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *neighborEntryRWMutex) RLock() { locking.AddGLock(neighborEntryprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *neighborEntryRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(neighborEntryprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *neighborEntryRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *neighborEntryRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *neighborEntryRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var neighborEntryprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. 
func neighborEntryinitLockNames() {} func init() { neighborEntryinitLockNames() neighborEntryprefixIndex = locking.NewMutexClass(reflect.TypeOf(neighborEntryRWMutex{}), neighborEntrylockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/neighborstate_string.go000066400000000000000000000026311465435605700264250ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by "stringer -type NeighborState"; DO NOT EDIT. package stack import "strconv" func _() { // An "invalid array index" compiler error signifies that the constant values have changed. // Re-run the stringer command to generate them again. var x [1]struct{} _ = x[Unknown-0] _ = x[Incomplete-1] _ = x[Reachable-2] _ = x[Stale-3] _ = x[Delay-4] _ = x[Probe-5] _ = x[Static-6] _ = x[Unreachable-7] } const _NeighborState_name = "UnknownIncompleteReachableStaleDelayProbeStaticUnreachable" var _NeighborState_index = [...]uint8{0, 7, 17, 26, 31, 36, 41, 47, 58} func (i NeighborState) String() string { if i >= NeighborState(len(_NeighborState_index)-1) { return "NeighborState(" + strconv.FormatInt(int64(i), 10) + ")" } return _NeighborState_name[_NeighborState_index[i]:_NeighborState_index[i+1]] } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/nic.go000066400000000000000000001011601465435605700227470ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "reflect" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) // +stateify savable type linkResolver struct { resolver LinkAddressResolver neigh neighborCache } var _ NetworkInterface = (*nic)(nil) var _ NetworkDispatcher = (*nic)(nil) // nic represents a "network interface card" to which the networking stack is // attached. // // +stateify savable type nic struct { NetworkLinkEndpoint stack *Stack id tcpip.NICID name string context NICContext stats sharedStats // enableDisableMu is used to synchronize attempts to enable/disable the NIC. // Without this mutex, calls to enable/disable the NIC may interleave and // leave the NIC in an inconsistent state. enableDisableMu nicRWMutex `state:"nosave"` // The network endpoints themselves may be modified by calling the interface's // methods, but the map reference and entries must be constant. 
networkEndpoints map[tcpip.NetworkProtocolNumber]NetworkEndpoint linkAddrResolvers map[tcpip.NetworkProtocolNumber]*linkResolver duplicateAddressDetectors map[tcpip.NetworkProtocolNumber]DuplicateAddressDetector // enabled indicates whether the NIC is enabled. enabled atomicbitops.Bool // spoofing indicates whether the NIC is spoofing. spoofing atomicbitops.Bool // promiscuous indicates whether the NIC is promiscuous. promiscuous atomicbitops.Bool // linkResQueue holds packets that are waiting for link resolution to // complete. linkResQueue packetsPendingLinkResolution // packetEPsMu protects annotated fields below. packetEPsMu packetEPsRWMutex `state:"nosave"` // eps is protected by the mutex, but the values contained in it are not. // // +checklocks:packetEPsMu packetEPs map[tcpip.NetworkProtocolNumber]*packetEndpointList qDisc QueueingDiscipline // deliverLinkPackets specifies whether this NIC delivers packets to // packet sockets. It is immutable. // // deliverLinkPackets is off by default because some users already // deliver link packets by explicitly calling nic.DeliverLinkPackets. deliverLinkPackets bool // Primary is the main controlling interface in a bonded setup. Primary *nic } // makeNICStats initializes the NIC statistics and associates them to the global // NIC statistics. func makeNICStats(global tcpip.NICStats) sharedStats { var stats sharedStats tcpip.InitStatCounters(reflect.ValueOf(&stats.local).Elem()) stats.init(&stats.local, &global) return stats } // +stateify savable type packetEndpointList struct { mu packetEndpointListRWMutex // eps is protected by mu, but the contained PacketEndpoint values are not. // // +checklocks:mu eps []PacketEndpoint } func (p *packetEndpointList) add(ep PacketEndpoint) { p.mu.Lock() defer p.mu.Unlock() p.eps = append(p.eps, ep) } func (p *packetEndpointList) remove(ep PacketEndpoint) { p.mu.Lock() defer p.mu.Unlock() for i, epOther := range p.eps { if epOther == ep { p.eps = append(p.eps[:i], p.eps[i+1:]...) break } } } func (p *packetEndpointList) len() int { p.mu.RLock() defer p.mu.RUnlock() return len(p.eps) } // forEach calls fn with each endpoint in p while holding the read lock on p. func (p *packetEndpointList) forEach(fn func(PacketEndpoint)) { p.mu.RLock() defer p.mu.RUnlock() for _, ep := range p.eps { fn(ep) } } var _ QueueingDiscipline = (*delegatingQueueingDiscipline)(nil) // +stateify savable type delegatingQueueingDiscipline struct { LinkWriter } func (*delegatingQueueingDiscipline) Close() {} // WritePacket passes the packet through to the underlying LinkWriter's WritePackets. func (qDisc *delegatingQueueingDiscipline) WritePacket(pkt *PacketBuffer) tcpip.Error { var pkts PacketBufferList pkts.PushBack(pkt) _, err := qDisc.LinkWriter.WritePackets(pkts) return err } // newNIC returns a new NIC using the default NDP configurations from stack. func newNIC(stack *Stack, id tcpip.NICID, ep LinkEndpoint, opts NICOptions) *nic { // TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For // example, make sure that the link address it provides is a valid // unicast ethernet address. // If no queueing discipline was specified provide a stub implementation that // just delegates to the lower link endpoint. qDisc := opts.QDisc if qDisc == nil { qDisc = &delegatingQueueingDiscipline{LinkWriter: ep} } // TODO(b/143357959): RFC 8200 section 5 requires that IPv6 endpoints // observe an MTU of at least 1280 bytes. Ensure that this requirement // of IPv6 is supported on this endpoint's LinkEndpoint. 
nic := &nic{ NetworkLinkEndpoint: ep, stack: stack, id: id, name: opts.Name, context: opts.Context, stats: makeNICStats(stack.Stats().NICs), networkEndpoints: make(map[tcpip.NetworkProtocolNumber]NetworkEndpoint), linkAddrResolvers: make(map[tcpip.NetworkProtocolNumber]*linkResolver), duplicateAddressDetectors: make(map[tcpip.NetworkProtocolNumber]DuplicateAddressDetector), qDisc: qDisc, deliverLinkPackets: opts.DeliverLinkPackets, } nic.linkResQueue.init(nic) nic.packetEPsMu.Lock() defer nic.packetEPsMu.Unlock() nic.packetEPs = make(map[tcpip.NetworkProtocolNumber]*packetEndpointList) resolutionRequired := ep.Capabilities()&CapabilityResolutionRequired != 0 for _, netProto := range stack.networkProtocols { netNum := netProto.Number() netEP := netProto.NewEndpoint(nic, nic) nic.networkEndpoints[netNum] = netEP if resolutionRequired { if r, ok := netEP.(LinkAddressResolver); ok { l := &linkResolver{resolver: r} l.neigh.init(nic, r) nic.linkAddrResolvers[r.LinkAddressProtocol()] = l } } if d, ok := netEP.(DuplicateAddressDetector); ok { nic.duplicateAddressDetectors[d.DuplicateAddressProtocol()] = d } } nic.NetworkLinkEndpoint.Attach(nic) return nic } func (n *nic) getNetworkEndpoint(proto tcpip.NetworkProtocolNumber) NetworkEndpoint { return n.networkEndpoints[proto] } // Enabled implements NetworkInterface. func (n *nic) Enabled() bool { return n.enabled.Load() } // setEnabled sets the enabled status for the NIC. // // Returns true if the enabled status was updated. // // +checklocks:n.enableDisableMu func (n *nic) setEnabled(v bool) bool { return n.enabled.Swap(v) != v } // disable disables n. // // It undoes the work done by enable. func (n *nic) disable() { n.enableDisableMu.Lock() defer n.enableDisableMu.Unlock() n.disableLocked() } // disableLocked disables n. // // It undoes the work done by enable. // // +checklocks:n.enableDisableMu func (n *nic) disableLocked() { if !n.Enabled() { return } // TODO(gvisor.dev/issue/1491): Should Routes that are currently bound to n be // invalidated? Currently, Routes will continue to work when a NIC is enabled // again, and applications may not know that the underlying NIC was ever // disabled. for _, ep := range n.networkEndpoints { ep.Disable() // Clear the neighbour table (including static entries) as we cannot // guarantee that the current neighbour table will be valid when the NIC is // enabled again. // // This matches linux's behaviour at the time of writing: // https://github.com/torvalds/linux/blob/71c061d2443814de15e177489d5cc00a4a253ef3/net/core/neighbour.c#L371 netProto := ep.NetworkProtocolNumber() switch err := n.clearNeighbors(netProto); err.(type) { case nil, *tcpip.ErrNotSupported: default: panic(fmt.Sprintf("n.clearNeighbors(%d): %s", netProto, err)) } } if !n.setEnabled(false) { panic("should have only done work to disable the NIC if it was enabled") } } // enable enables n. // // If the stack has IPv6 enabled, enable will join the IPv6 All-Nodes Multicast // address (ff02::1), start DAD for permanent addresses, and start soliciting // routers if the stack is not operating as a router. If the stack is also // configured to auto-generate a link-local address, one will be generated. func (n *nic) enable() tcpip.Error { n.enableDisableMu.Lock() defer n.enableDisableMu.Unlock() if !n.setEnabled(true) { return nil } for _, ep := range n.networkEndpoints { if err := ep.Enable(); err != nil { return err } } return nil } // remove detaches NIC from the link endpoint and releases network endpoint // resources. 
This guarantees no packets between this NIC and the network // stack. // // It returns an action that has to be excuted after releasing the Stack lock // and any error encountered. func (n *nic) remove(closeLinkEndpoint bool) (func(), tcpip.Error) { n.enableDisableMu.Lock() n.disableLocked() for _, ep := range n.networkEndpoints { ep.Close() } n.enableDisableMu.Unlock() // Drain and drop any packets pending link resolution. // We must not hold n.enableDisableMu here. n.linkResQueue.cancel() var deferAct func() // Prevent packets from going down to the link before shutting the link down. n.qDisc.Close() n.NetworkLinkEndpoint.Attach(nil) if closeLinkEndpoint { ep := n.NetworkLinkEndpoint ep.SetOnCloseAction(nil) // The link endpoint has to be closed without holding a // netstack lock, because it can trigger other netstack // operations. deferAct = ep.Close } return deferAct, nil } // setPromiscuousMode enables or disables promiscuous mode. func (n *nic) setPromiscuousMode(enable bool) { n.promiscuous.Store(enable) } // Promiscuous implements NetworkInterface. func (n *nic) Promiscuous() bool { return n.promiscuous.Load() } // IsLoopback implements NetworkInterface. func (n *nic) IsLoopback() bool { return n.NetworkLinkEndpoint.Capabilities()&CapabilityLoopback != 0 } // WritePacket implements NetworkEndpoint. func (n *nic) WritePacket(r *Route, pkt *PacketBuffer) tcpip.Error { routeInfo, _, err := r.resolvedFields(nil) switch err.(type) { case nil: pkt.EgressRoute = routeInfo return n.writePacket(pkt) case *tcpip.ErrWouldBlock: // As per relevant RFCs, we should queue packets while we wait for link // resolution to complete. // // RFC 1122 section 2.3.2.2 (for IPv4): // The link layer SHOULD save (rather than discard) at least // one (the latest) packet of each set of packets destined to // the same unresolved IP address, and transmit the saved // packet when the address has been resolved. // // RFC 4861 section 7.2.2 (for IPv6): // While waiting for address resolution to complete, the sender MUST, for // each neighbor, retain a small queue of packets waiting for address // resolution to complete. The queue MUST hold at least one packet, and // MAY contain more. However, the number of queued packets per neighbor // SHOULD be limited to some small value. When a queue overflows, the new // arrival SHOULD replace the oldest entry. Once address resolution // completes, the node transmits any queued packets. return n.linkResQueue.enqueue(r, pkt) default: return err } } // WritePacketToRemote implements NetworkInterface. func (n *nic) WritePacketToRemote(remoteLinkAddr tcpip.LinkAddress, pkt *PacketBuffer) tcpip.Error { pkt.EgressRoute = RouteInfo{ routeInfo: routeInfo{ NetProto: pkt.NetworkProtocolNumber, LocalLinkAddress: n.LinkAddress(), }, RemoteLinkAddress: remoteLinkAddr, } return n.writePacket(pkt) } func (n *nic) writePacket(pkt *PacketBuffer) tcpip.Error { n.NetworkLinkEndpoint.AddHeader(pkt) return n.writeRawPacket(pkt) } func (n *nic) writeRawPacketWithLinkHeaderInPayload(pkt *PacketBuffer) tcpip.Error { if !n.NetworkLinkEndpoint.ParseHeader(pkt) { return &tcpip.ErrMalformedHeader{} } return n.writeRawPacket(pkt) } func (n *nic) writeRawPacket(pkt *PacketBuffer) tcpip.Error { // Always an outgoing packet. 
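	// The sequence below: mark the buffer as outgoing, mirror it to packet
	// sockets when deliverLinkPackets is enabled, then hand it to the queueing
	// discipline (and ultimately the LinkWriter). TX stats are only incremented
	// once the qdisc has accepted the packet.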
pkt.PktType = tcpip.PacketOutgoing if n.deliverLinkPackets { n.DeliverLinkPacket(pkt.NetworkProtocolNumber, pkt) } if err := n.qDisc.WritePacket(pkt); err != nil { if _, ok := err.(*tcpip.ErrNoBufferSpace); ok { n.stats.txPacketsDroppedNoBufferSpace.Increment() } return err } n.stats.tx.packets.Increment() n.stats.tx.bytes.IncrementBy(uint64(pkt.Size())) return nil } // setSpoofing enables or disables address spoofing. func (n *nic) setSpoofing(enable bool) { n.spoofing.Store(enable) } // Spoofing implements NetworkInterface. func (n *nic) Spoofing() bool { return n.spoofing.Load() } // primaryAddress returns an address that can be used to communicate with // remoteAddr. func (n *nic) primaryEndpoint(protocol tcpip.NetworkProtocolNumber, remoteAddr, srcHint tcpip.Address) AssignableAddressEndpoint { ep := n.getNetworkEndpoint(protocol) if ep == nil { return nil } addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { return nil } return addressableEndpoint.AcquireOutgoingPrimaryAddress(remoteAddr, srcHint, n.Spoofing()) } type getAddressBehaviour int const ( // spoofing indicates that the NIC's spoofing flag should be observed when // getting a NIC's address endpoint. spoofing getAddressBehaviour = iota // promiscuous indicates that the NIC's promiscuous flag should be observed // when getting a NIC's address endpoint. promiscuous ) func (n *nic) getAddress(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) AssignableAddressEndpoint { return n.getAddressOrCreateTemp(protocol, dst, CanBePrimaryEndpoint, promiscuous) } func (n *nic) hasAddress(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { ep := n.getAddressOrCreateTempInner(protocol, addr, false, NeverPrimaryEndpoint) if ep != nil { ep.DecRef() return true } return false } // findEndpoint finds the endpoint, if any, with the given address. func (n *nic) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) AssignableAddressEndpoint { return n.getAddressOrCreateTemp(protocol, address, peb, spoofing) } // getAddressEpOrCreateTemp returns the address endpoint for the given protocol // and address. // // If none exists a temporary one may be created if we are in promiscuous mode // or spoofing. Promiscuous mode will only be checked if promiscuous is true. // Similarly, spoofing will only be checked if spoofing is true. // // If the address is the IPv4 broadcast address for an endpoint's network, that // endpoint will be returned. func (n *nic) getAddressOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior, tempRef getAddressBehaviour) AssignableAddressEndpoint { var spoofingOrPromiscuous bool switch tempRef { case spoofing: spoofingOrPromiscuous = n.Spoofing() case promiscuous: spoofingOrPromiscuous = n.Promiscuous() } return n.getAddressOrCreateTempInner(protocol, address, spoofingOrPromiscuous, peb) } // getAddressOrCreateTempInner is like getAddressEpOrCreateTemp except a boolean // is passed to indicate whether or not we should generate temporary endpoints. 
func (n *nic) getAddressOrCreateTempInner(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, createTemp bool, peb PrimaryEndpointBehavior) AssignableAddressEndpoint { ep := n.getNetworkEndpoint(protocol) if ep == nil { return nil } addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { return nil } return addressableEndpoint.AcquireAssignedAddress(address, createTemp, peb, false) } // addAddress adds a new address to n, so that it starts accepting packets // targeted at the given address (and network protocol). func (n *nic) addAddress(protocolAddress tcpip.ProtocolAddress, properties AddressProperties) tcpip.Error { ep := n.getNetworkEndpoint(protocolAddress.Protocol) if ep == nil { return &tcpip.ErrUnknownProtocol{} } addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { return &tcpip.ErrNotSupported{} } addressEndpoint, err := addressableEndpoint.AddAndAcquirePermanentAddress(protocolAddress.AddressWithPrefix, properties) if err == nil { // We have no need for the address endpoint. addressEndpoint.DecRef() } return err } // allPermanentAddresses returns all permanent addresses associated with // this NIC. func (n *nic) allPermanentAddresses() []tcpip.ProtocolAddress { var addrs []tcpip.ProtocolAddress for p, ep := range n.networkEndpoints { addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { continue } for _, a := range addressableEndpoint.PermanentAddresses() { addrs = append(addrs, tcpip.ProtocolAddress{Protocol: p, AddressWithPrefix: a}) } } return addrs } // primaryAddresses returns the primary addresses associated with this NIC. func (n *nic) primaryAddresses() []tcpip.ProtocolAddress { var addrs []tcpip.ProtocolAddress for p, ep := range n.networkEndpoints { addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { continue } for _, a := range addressableEndpoint.PrimaryAddresses() { addrs = append(addrs, tcpip.ProtocolAddress{Protocol: p, AddressWithPrefix: a}) } } return addrs } // PrimaryAddress implements NetworkInterface. func (n *nic) PrimaryAddress(proto tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, tcpip.Error) { ep := n.getNetworkEndpoint(proto) if ep == nil { return tcpip.AddressWithPrefix{}, &tcpip.ErrUnknownProtocol{} } addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { return tcpip.AddressWithPrefix{}, &tcpip.ErrNotSupported{} } return addressableEndpoint.MainAddress(), nil } // removeAddress removes an address from n. 
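// Each network endpoint is tried in turn: *tcpip.ErrBadLocalAddress from an
// endpoint means "this endpoint does not own addr, keep looking", and is only
// surfaced to the caller if no endpoint owned the address.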
func (n *nic) removeAddress(addr tcpip.Address) tcpip.Error { for _, ep := range n.networkEndpoints { addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { continue } switch err := addressableEndpoint.RemovePermanentAddress(addr); err.(type) { case *tcpip.ErrBadLocalAddress: continue default: return err } } return &tcpip.ErrBadLocalAddress{} } func (n *nic) setAddressLifetimes(addr tcpip.Address, lifetimes AddressLifetimes) tcpip.Error { for _, ep := range n.networkEndpoints { ep, ok := ep.(AddressableEndpoint) if !ok { continue } switch err := ep.SetLifetimes(addr, lifetimes); err.(type) { case *tcpip.ErrBadLocalAddress: continue default: return err } } return &tcpip.ErrBadLocalAddress{} } func (n *nic) getLinkAddress(addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, onResolve func(LinkResolutionResult)) tcpip.Error { linkRes, ok := n.linkAddrResolvers[protocol] if !ok { return &tcpip.ErrNotSupported{} } if linkAddr, ok := linkRes.resolver.ResolveStaticAddress(addr); ok { onResolve(LinkResolutionResult{LinkAddress: linkAddr, Err: nil}) return nil } _, _, err := linkRes.neigh.entry(addr, localAddr, onResolve) return err } func (n *nic) neighbors(protocol tcpip.NetworkProtocolNumber) ([]NeighborEntry, tcpip.Error) { if linkRes, ok := n.linkAddrResolvers[protocol]; ok { return linkRes.neigh.entries(), nil } return nil, &tcpip.ErrNotSupported{} } func (n *nic) addStaticNeighbor(addr tcpip.Address, protocol tcpip.NetworkProtocolNumber, linkAddress tcpip.LinkAddress) tcpip.Error { if linkRes, ok := n.linkAddrResolvers[protocol]; ok { linkRes.neigh.addStaticEntry(addr, linkAddress) return nil } return &tcpip.ErrNotSupported{} } func (n *nic) removeNeighbor(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.Error { if linkRes, ok := n.linkAddrResolvers[protocol]; ok { if !linkRes.neigh.removeEntry(addr) { return &tcpip.ErrBadAddress{} } return nil } return &tcpip.ErrNotSupported{} } func (n *nic) clearNeighbors(protocol tcpip.NetworkProtocolNumber) tcpip.Error { if linkRes, ok := n.linkAddrResolvers[protocol]; ok { linkRes.neigh.clear() return nil } return &tcpip.ErrNotSupported{} } // joinGroup adds a new endpoint for the given multicast address, if none // exists yet. Otherwise it just increments its count. func (n *nic) joinGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.Error { // TODO(b/143102137): When implementing MLD, make sure MLD packets are // not sent unless a valid link-local address is available for use on n // as an MLD packet's source address must be a link-local address as // outlined in RFC 3810 section 5. ep := n.getNetworkEndpoint(protocol) if ep == nil { return &tcpip.ErrNotSupported{} } gep, ok := ep.(GroupAddressableEndpoint) if !ok { return &tcpip.ErrNotSupported{} } return gep.JoinGroup(addr) } // leaveGroup decrements the count for the given multicast address, and when it // reaches zero removes the endpoint for this address. func (n *nic) leaveGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.Error { ep := n.getNetworkEndpoint(protocol) if ep == nil { return &tcpip.ErrNotSupported{} } gep, ok := ep.(GroupAddressableEndpoint) if !ok { return &tcpip.ErrNotSupported{} } return gep.LeaveGroup(addr) } // isInGroup returns true if n has joined the multicast group addr. 
func (n *nic) isInGroup(addr tcpip.Address) bool { for _, ep := range n.networkEndpoints { gep, ok := ep.(GroupAddressableEndpoint) if !ok { continue } if gep.IsInGroup(addr) { return true } } return false } // DeliverNetworkPacket finds the appropriate network protocol endpoint and // hands the packet over for further processing. This function is called when // the NIC receives a packet from the link endpoint. func (n *nic) DeliverNetworkPacket(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { enabled := n.Enabled() // If the NIC is not yet enabled, don't receive any packets. if !enabled { n.stats.disabledRx.packets.Increment() n.stats.disabledRx.bytes.IncrementBy(uint64(pkt.Data().Size())) return } n.stats.rx.packets.Increment() n.stats.rx.bytes.IncrementBy(uint64(pkt.Data().Size())) networkEndpoint := n.getNetworkEndpoint(protocol) if networkEndpoint == nil { n.stats.unknownL3ProtocolRcvdPacketCounts.Increment(uint64(protocol)) return } pkt.RXChecksumValidated = n.NetworkLinkEndpoint.Capabilities()&CapabilityRXChecksumOffload != 0 if n.deliverLinkPackets { n.DeliverLinkPacket(protocol, pkt) } networkEndpoint.HandlePacket(pkt) } func (n *nic) DeliverLinkPacket(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { // Deliver to interested packet endpoints without holding NIC lock. var packetEPPkt *PacketBuffer defer func() { if packetEPPkt != nil { packetEPPkt.DecRef() } }() deliverPacketEPs := func(ep PacketEndpoint) { if packetEPPkt == nil { // Packet endpoints hold the full packet. // // We perform a deep copy because higher-level endpoints may point to // the middle of a view that is held by a packet endpoint. Save/Restore // does not support overlapping slices and will panic in this case. // // TODO(https://gvisor.dev/issue/6517): Avoid this copy once S/R supports // overlapping slices (e.g. by passing a shallow copy of pkt to the packet // endpoint). packetEPPkt = NewPacketBuffer(PacketBufferOptions{ Payload: BufferSince(pkt.LinkHeader()), }) // If a link header was populated in the original packet buffer, then // populate it in the packet buffer we provide to packet endpoints as // packet endpoints inspect link headers. packetEPPkt.LinkHeader().Consume(len(pkt.LinkHeader().Slice())) packetEPPkt.PktType = pkt.PktType // Assume the packet is for us if the packet type is unset. // The packet type is set to PacketOutgoing when sending packets so // this may only be unset for incoming packets where link endpoints // have not set it. if packetEPPkt.PktType == 0 { packetEPPkt.PktType = tcpip.PacketHost } } clone := packetEPPkt.Clone() defer clone.DecRef() ep.HandlePacket(n.id, protocol, clone) } n.packetEPsMu.Lock() // Are any packet type sockets listening for this network protocol? protoEPs, protoEPsOK := n.packetEPs[protocol] // Other packet type sockets that are listening for all protocols. anyEPs, anyEPsOK := n.packetEPs[header.EthernetProtocolAll] n.packetEPsMu.Unlock() // On Linux, only ETH_P_ALL endpoints get outbound packets. if pkt.PktType != tcpip.PacketOutgoing && protoEPsOK { protoEPs.forEach(deliverPacketEPs) } if anyEPsOK { anyEPs.forEach(deliverPacketEPs) } } // DeliverTransportPacket delivers the packets to the appropriate transport // protocol endpoint. 
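// Dispatch proceeds in order: first the transport demuxer (connected and bound
// endpoints), then the protocol's stack-wide default handler if one is
// registered, and finally the protocol's HandleUnknownDestinationPacket, whose
// result decides whether a destination-unreachable condition is reported.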
func (n *nic) DeliverTransportPacket(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) TransportPacketDisposition { state, ok := n.stack.transportProtocols[protocol] if !ok { n.stats.unknownL4ProtocolRcvdPacketCounts.Increment(uint64(protocol)) return TransportPacketProtocolUnreachable } transProto := state.proto if len(pkt.TransportHeader().Slice()) == 0 { n.stats.malformedL4RcvdPackets.Increment() return TransportPacketHandled } srcPort, dstPort, err := transProto.ParsePorts(pkt.TransportHeader().Slice()) if err != nil { n.stats.malformedL4RcvdPackets.Increment() return TransportPacketHandled } netProto, ok := n.stack.networkProtocols[pkt.NetworkProtocolNumber] if !ok { panic(fmt.Sprintf("expected network protocol = %d, have = %#v", pkt.NetworkProtocolNumber, n.stack.networkProtocolNumbers())) } src, dst := netProto.ParseAddresses(pkt.NetworkHeader().Slice()) id := TransportEndpointID{ LocalPort: dstPort, LocalAddress: dst, RemotePort: srcPort, RemoteAddress: src, } if n.stack.demux.deliverPacket(protocol, pkt, id) { return TransportPacketHandled } // Try to deliver to per-stack default handler. if state.defaultHandler != nil { if state.defaultHandler(id, pkt) { return TransportPacketHandled } } // We could not find an appropriate destination for this packet so // give the protocol specific error handler a chance to handle it. // If it doesn't handle it then we should do so. switch res := transProto.HandleUnknownDestinationPacket(id, pkt); res { case UnknownDestinationPacketMalformed: n.stats.malformedL4RcvdPackets.Increment() return TransportPacketHandled case UnknownDestinationPacketUnhandled: return TransportPacketDestinationPortUnreachable case UnknownDestinationPacketHandled: return TransportPacketHandled default: panic(fmt.Sprintf("unrecognized result from HandleUnknownDestinationPacket = %d", res)) } } // DeliverTransportError implements TransportDispatcher. func (n *nic) DeliverTransportError(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, transErr TransportError, pkt *PacketBuffer) { state, ok := n.stack.transportProtocols[trans] if !ok { return } transProto := state.proto // ICMPv4 only guarantees that 8 bytes of the transport protocol will // be present in the payload. We know that the ports are within the // first 8 bytes for all known transport protocols. transHeader, ok := pkt.Data().PullUp(8) if !ok { return } srcPort, dstPort, err := transProto.ParsePorts(transHeader) if err != nil { return } id := TransportEndpointID{srcPort, local, dstPort, remote} if n.stack.demux.deliverError(n, net, trans, transErr, pkt, id) { return } } // DeliverRawPacket implements TransportDispatcher. func (n *nic) DeliverRawPacket(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) { // For ICMPv4 only we validate the header length for compatibility with // raw(7) ICMP_FILTER. The same check is made in Linux here: // https://github.com/torvalds/linux/blob/70585216/net/ipv4/raw.c#L189. if protocol == header.ICMPv4ProtocolNumber && len(pkt.TransportHeader().Slice())+pkt.Data().Size() < header.ICMPv4MinimumSize { return } n.stack.demux.deliverRawPacket(protocol, pkt) } // ID implements NetworkInterface. func (n *nic) ID() tcpip.NICID { return n.id } // Name implements NetworkInterface. func (n *nic) Name() string { return n.name } // nudConfigs gets the NUD configurations for n. 
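// As a rough sketch (assuming the usual Stack wrappers), callers outside this
// package normally read and update these through the Stack rather than the NIC
// directly:
//
//	cfg, err := s.NUDConfigurations(nicID, ipv6.ProtocolNumber)
//	if err == nil {
//		cfg.BaseReachableTime = 10 * time.Second
//		_ = s.SetNUDConfigurations(nicID, ipv6.ProtocolNumber, cfg)
//	}
//
// where s is a *Stack and nicID identifies this NIC.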
func (n *nic) nudConfigs(protocol tcpip.NetworkProtocolNumber) (NUDConfigurations, tcpip.Error) { if linkRes, ok := n.linkAddrResolvers[protocol]; ok { return linkRes.neigh.config(), nil } return NUDConfigurations{}, &tcpip.ErrNotSupported{} } // setNUDConfigs sets the NUD configurations for n. // // Note, if c contains invalid NUD configuration values, it will be fixed to // use default values for the erroneous values. func (n *nic) setNUDConfigs(protocol tcpip.NetworkProtocolNumber, c NUDConfigurations) tcpip.Error { if linkRes, ok := n.linkAddrResolvers[protocol]; ok { c.resetInvalidFields() linkRes.neigh.setConfig(c) return nil } return &tcpip.ErrNotSupported{} } func (n *nic) registerPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) { n.packetEPsMu.Lock() defer n.packetEPsMu.Unlock() eps, ok := n.packetEPs[netProto] if !ok { eps = new(packetEndpointList) n.packetEPs[netProto] = eps } eps.add(ep) } func (n *nic) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) { n.packetEPsMu.Lock() defer n.packetEPsMu.Unlock() eps, ok := n.packetEPs[netProto] if !ok { return } eps.remove(ep) if eps.len() == 0 { delete(n.packetEPs, netProto) } } // isValidForOutgoing returns true if the endpoint can be used to send out a // packet. It requires the endpoint to not be marked expired (i.e., its address // has been removed) unless the NIC is in spoofing mode, or temporary. func (n *nic) isValidForOutgoing(ep AssignableAddressEndpoint) bool { return n.Enabled() && ep.IsAssigned(n.Spoofing()) } // HandleNeighborProbe implements NetworkInterface. func (n *nic) HandleNeighborProbe(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, linkAddr tcpip.LinkAddress) tcpip.Error { if l, ok := n.linkAddrResolvers[protocol]; ok { l.neigh.handleProbe(addr, linkAddr) return nil } return &tcpip.ErrNotSupported{} } // HandleNeighborConfirmation implements NetworkInterface. func (n *nic) HandleNeighborConfirmation(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) tcpip.Error { if l, ok := n.linkAddrResolvers[protocol]; ok { l.neigh.handleConfirmation(addr, linkAddr, flags) return nil } return &tcpip.ErrNotSupported{} } // CheckLocalAddress implements NetworkInterface. 
func (n *nic) CheckLocalAddress(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { if n.Spoofing() { return true } if addressEndpoint := n.getAddressOrCreateTempInner(protocol, addr, false /* createTemp */, NeverPrimaryEndpoint); addressEndpoint != nil { addressEndpoint.DecRef() return true } return false } func (n *nic) checkDuplicateAddress(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, h DADCompletionHandler) (DADCheckAddressDisposition, tcpip.Error) { d, ok := n.duplicateAddressDetectors[protocol] if !ok { return 0, &tcpip.ErrNotSupported{} } return d.CheckDuplicateAddress(addr, h), nil } func (n *nic) setForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) (bool, tcpip.Error) { ep := n.getNetworkEndpoint(protocol) if ep == nil { return false, &tcpip.ErrUnknownProtocol{} } forwardingEP, ok := ep.(ForwardingNetworkEndpoint) if !ok { return false, &tcpip.ErrNotSupported{} } return forwardingEP.SetForwarding(enable), nil } func (n *nic) forwarding(protocol tcpip.NetworkProtocolNumber) (bool, tcpip.Error) { ep := n.getNetworkEndpoint(protocol) if ep == nil { return false, &tcpip.ErrUnknownProtocol{} } forwardingEP, ok := ep.(ForwardingNetworkEndpoint) if !ok { return false, &tcpip.ErrNotSupported{} } return forwardingEP.Forwarding(), nil } func (n *nic) multicastForwardingEndpoint(protocol tcpip.NetworkProtocolNumber) (MulticastForwardingNetworkEndpoint, tcpip.Error) { ep := n.getNetworkEndpoint(protocol) if ep == nil { return nil, &tcpip.ErrUnknownProtocol{} } forwardingEP, ok := ep.(MulticastForwardingNetworkEndpoint) if !ok { return nil, &tcpip.ErrNotSupported{} } return forwardingEP, nil } func (n *nic) setMulticastForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) (bool, tcpip.Error) { ep, err := n.multicastForwardingEndpoint(protocol) if err != nil { return false, err } return ep.SetMulticastForwarding(enable), nil } func (n *nic) multicastForwarding(protocol tcpip.NetworkProtocolNumber) (bool, tcpip.Error) { ep, err := n.multicastForwardingEndpoint(protocol) if err != nil { return false, err } return ep.MulticastForwarding(), nil } // CoordinatorNIC represents NetworkLinkEndpoint that can join multiple network devices. type CoordinatorNIC interface { // AddNIC adds the specified NIC device. AddNIC(n *nic) tcpip.Error // DelNIC deletes the specified NIC device. DelNIC(n *nic) tcpip.Error } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/nic_mutex.go000066400000000000000000000044121465435605700241730ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type nicRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var niclockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type niclockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *nicRWMutex) Lock() { locking.AddGLock(nicprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *nicRWMutex) NestedLock(i niclockNameIndex) { locking.AddGLock(nicprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. 
// +checklocksignore func (m *nicRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(nicprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *nicRWMutex) NestedUnlock(i niclockNameIndex) { m.mu.Unlock() locking.DelGLock(nicprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *nicRWMutex) RLock() { locking.AddGLock(nicprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *nicRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(nicprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *nicRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *nicRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *nicRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var nicprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func nicinitLockNames() {} func init() { nicinitLockNames() nicprefixIndex = locking.NewMutexClass(reflect.TypeOf(nicRWMutex{}), niclockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/nic_stats.go000066400000000000000000000061661465435605700241770ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package stack import ( "gvisor.dev/gvisor/pkg/tcpip" ) // +stateify savable type sharedStats struct { local tcpip.NICStats multiCounterNICStats } // LINT.IfChange(multiCounterNICPacketStats) // +stateify savable type multiCounterNICPacketStats struct { packets tcpip.MultiCounterStat bytes tcpip.MultiCounterStat } func (m *multiCounterNICPacketStats) init(a, b *tcpip.NICPacketStats) { m.packets.Init(a.Packets, b.Packets) m.bytes.Init(a.Bytes, b.Bytes) } // LINT.ThenChange(../tcpip.go:NICPacketStats) // LINT.IfChange(multiCounterNICNeighborStats) // +stateify savable type multiCounterNICNeighborStats struct { unreachableEntryLookups tcpip.MultiCounterStat droppedConfirmationForNoninitiatedNeighbor tcpip.MultiCounterStat droppedInvalidLinkAddressConfirmations tcpip.MultiCounterStat } func (m *multiCounterNICNeighborStats) init(a, b *tcpip.NICNeighborStats) { m.unreachableEntryLookups.Init(a.UnreachableEntryLookups, b.UnreachableEntryLookups) m.droppedConfirmationForNoninitiatedNeighbor.Init(a.DroppedConfirmationForNoninitiatedNeighbor, b.DroppedConfirmationForNoninitiatedNeighbor) m.droppedInvalidLinkAddressConfirmations.Init(a.DroppedInvalidLinkAddressConfirmations, b.DroppedInvalidLinkAddressConfirmations) } // LINT.ThenChange(../tcpip.go:NICNeighborStats) // LINT.IfChange(multiCounterNICStats) // +stateify savable type multiCounterNICStats struct { unknownL3ProtocolRcvdPacketCounts tcpip.MultiIntegralStatCounterMap unknownL4ProtocolRcvdPacketCounts tcpip.MultiIntegralStatCounterMap malformedL4RcvdPackets tcpip.MultiCounterStat tx multiCounterNICPacketStats txPacketsDroppedNoBufferSpace tcpip.MultiCounterStat rx multiCounterNICPacketStats disabledRx multiCounterNICPacketStats neighbor multiCounterNICNeighborStats } func (m *multiCounterNICStats) init(a, b *tcpip.NICStats) { m.unknownL3ProtocolRcvdPacketCounts.Init(a.UnknownL3ProtocolRcvdPacketCounts, b.UnknownL3ProtocolRcvdPacketCounts) m.unknownL4ProtocolRcvdPacketCounts.Init(a.UnknownL4ProtocolRcvdPacketCounts, b.UnknownL4ProtocolRcvdPacketCounts) m.malformedL4RcvdPackets.Init(a.MalformedL4RcvdPackets, b.MalformedL4RcvdPackets) m.tx.init(&a.Tx, &b.Tx) m.txPacketsDroppedNoBufferSpace.Init(a.TxPacketsDroppedNoBufferSpace, b.TxPacketsDroppedNoBufferSpace) m.rx.init(&a.Rx, &b.Rx) m.disabledRx.init(&a.DisabledRx, &b.DisabledRx) m.neighbor.init(&a.Neighbor, &b.Neighbor) } // LINT.ThenChange(../tcpip.go:NICStats) golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/nud.go000066400000000000000000000377251465435605700230030ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "math" "math/rand" "sync" "time" "gvisor.dev/gvisor/pkg/tcpip" ) const ( // defaultBaseReachableTime is the default base duration for computing the // random reachable time. // // Reachable time is the duration for which a neighbor is considered // reachable after a positive reachability confirmation is received. 
It is a // function of a uniformly distributed random value between the minimum and // maximum random factors, multiplied by the base reachable time. Using a // random component eliminates the possibility that Neighbor Unreachability // Detection messages will synchronize with each other. // // Default taken from REACHABLE_TIME of RFC 4861 section 10. defaultBaseReachableTime = 30 * time.Second // minimumBaseReachableTime is the minimum base duration for computing the // random reachable time. // // Minimum = 1ms minimumBaseReachableTime = time.Millisecond // defaultMinRandomFactor is the default minimum value of the random factor // used for computing reachable time. // // Default taken from MIN_RANDOM_FACTOR of RFC 4861 section 10. defaultMinRandomFactor = 0.5 // defaultMaxRandomFactor is the default maximum value of the random factor // used for computing reachable time. // // The default value depends on the value of MinRandomFactor. // If MinRandomFactor is less than MAX_RANDOM_FACTOR of RFC 4861 section 10, // the value from the RFC will be used; otherwise, the default is // MinRandomFactor multiplied by three. defaultMaxRandomFactor = 1.5 // defaultRetransmitTimer is the default amount of time to wait between // sending reachability probes. // // Default taken from RETRANS_TIMER of RFC 4861 section 10. defaultRetransmitTimer = time.Second // minimumRetransmitTimer is the minimum amount of time to wait between // sending reachability probes. // // Note, RFC 4861 does not impose a minimum Retransmit Timer, but we do here // to make sure the messages are not sent all at once. We also come to this // value because in the RetransmitTimer field of a Router Advertisement, a // value of 0 means unspecified, so the smallest valid value is 1. Note, the // unit of the RetransmitTimer field in the Router Advertisement is // milliseconds. minimumRetransmitTimer = time.Millisecond // defaultDelayFirstProbeTime is the default duration to wait for a // non-Neighbor-Discovery related protocol to reconfirm reachability after // entering the DELAY state. After this time, a reachability probe will be // sent and the entry will transition to the PROBE state. // // Default taken from DELAY_FIRST_PROBE_TIME of RFC 4861 section 10. defaultDelayFirstProbeTime = 5 * time.Second // defaultMaxMulticastProbes is the default number of reachabililty probes // to send before concluding negative reachability and deleting the neighbor // entry from the INCOMPLETE state. // // Default taken from MAX_MULTICAST_SOLICIT of RFC 4861 section 10. defaultMaxMulticastProbes = 3 // defaultMaxUnicastProbes is the default number of reachability probes to // send before concluding retransmission from within the PROBE state should // cease and the entry SHOULD be deleted. // // Default taken from MAX_UNICASE_SOLICIT of RFC 4861 section 10. defaultMaxUnicastProbes = 3 // defaultMaxAnycastDelayTime is the default time in which the stack SHOULD // delay sending a response for a random time between 0 and this time, if the // target address is an anycast address. // // Default taken from MAX_ANYCAST_DELAY_TIME of RFC 4861 section 10. defaultMaxAnycastDelayTime = time.Second // defaultMaxReachbilityConfirmations is the default amount of unsolicited // reachability confirmation messages a node MAY send to all-node multicast // address when it determines its link-layer address has changed. // // Default taken from MAX_NEIGHBOR_ADVERTISEMENT of RFC 4861 section 10. 
defaultMaxReachbilityConfirmations = 3 ) // NUDDispatcher is the interface integrators of netstack must implement to // receive and handle NUD related events. type NUDDispatcher interface { // OnNeighborAdded will be called when a new entry is added to a NIC's (with // ID nicID) neighbor table. // // This function is permitted to block indefinitely without interfering with // the stack's operation. // // May be called concurrently. OnNeighborAdded(tcpip.NICID, NeighborEntry) // OnNeighborChanged will be called when an entry in a NIC's (with ID nicID) // neighbor table changes state and/or link address. // // This function is permitted to block indefinitely without interfering with // the stack's operation. // // May be called concurrently. OnNeighborChanged(tcpip.NICID, NeighborEntry) // OnNeighborRemoved will be called when an entry is removed from a NIC's // (with ID nicID) neighbor table. // // This function is permitted to block indefinitely without interfering with // the stack's operation. // // May be called concurrently. OnNeighborRemoved(tcpip.NICID, NeighborEntry) } // ReachabilityConfirmationFlags describes the flags used within a reachability // confirmation (e.g. ARP reply or Neighbor Advertisement for ARP or NDP, // respectively). type ReachabilityConfirmationFlags struct { // Solicited indicates that the advertisement was sent in response to a // reachability probe. Solicited bool // Override indicates that the reachability confirmation should override an // existing neighbor cache entry and update the cached link-layer address. // When Override is not set the confirmation will not update a cached // link-layer address, but will update an existing neighbor cache entry for // which no link-layer address is known. Override bool // IsRouter indicates that the sender is a router. IsRouter bool } // NUDConfigurations is the NUD configurations for the netstack. This is used // by the neighbor cache to operate the NUD state machine on each device in the // local network. // // +stateify savable type NUDConfigurations struct { // BaseReachableTime is the base duration for computing the random reachable // time. // // Reachable time is the duration for which a neighbor is considered // reachable after a positive reachability confirmation is received. It is a // function of uniformly distributed random value between minRandomFactor and // maxRandomFactor multiplied by baseReachableTime. Using a random component // eliminates the possibility that Neighbor Unreachability Detection messages // will synchronize with each other. // // After this time, a neighbor entry will transition from REACHABLE to STALE // state. // // Must be greater than 0. BaseReachableTime time.Duration // LearnBaseReachableTime enables learning BaseReachableTime during runtime // from the neighbor discovery protocol, if supported. // // TODO(gvisor.dev/issue/2240): Implement this NUD configuration option. LearnBaseReachableTime bool // MinRandomFactor is the minimum value of the random factor used for // computing reachable time. // // See BaseReachbleTime for more information on computing the reachable time. // // Must be greater than 0. MinRandomFactor float32 // MaxRandomFactor is the maximum value of the random factor used for // computing reachabile time. // // See BaseReachbleTime for more information on computing the reachable time. // // Must be great than or equal to MinRandomFactor. 
MaxRandomFactor float32 // RetransmitTimer is the duration between retransmission of reachability // probes in the PROBE state. RetransmitTimer time.Duration // LearnRetransmitTimer enables learning RetransmitTimer during runtime from // the neighbor discovery protocol, if supported. // // TODO(gvisor.dev/issue/2241): Implement this NUD configuration option. LearnRetransmitTimer bool // DelayFirstProbeTime is the duration to wait for a non-Neighbor-Discovery // related protocol to reconfirm reachability after entering the DELAY state. // After this time, a reachability probe will be sent and the entry will // transition to the PROBE state. // // Must be greater than 0. DelayFirstProbeTime time.Duration // MaxMulticastProbes is the number of reachability probes to send before // concluding negative reachability and deleting the neighbor entry from the // INCOMPLETE state. // // Must be greater than 0. MaxMulticastProbes uint32 // MaxUnicastProbes is the number of reachability probes to send before // concluding retransmission from within the PROBE state should cease and // entry SHOULD be deleted. // // Must be greater than 0. MaxUnicastProbes uint32 // MaxAnycastDelayTime is the time in which the stack SHOULD delay sending a // response for a random time between 0 and this time, if the target address // is an anycast address. // // TODO(gvisor.dev/issue/2242): Use this option when sending solicited // neighbor confirmations to anycast addresses and proxying neighbor // confirmations. MaxAnycastDelayTime time.Duration // MaxReachabilityConfirmations is the number of unsolicited reachability // confirmation messages a node MAY send to all-node multicast address when // it determines its link-layer address has changed. // // TODO(gvisor.dev/issue/2246): Discuss if implementation of this NUD // configuration option is necessary. MaxReachabilityConfirmations uint32 } // DefaultNUDConfigurations returns a NUDConfigurations populated with default // values defined by RFC 4861 section 10. func DefaultNUDConfigurations() NUDConfigurations { return NUDConfigurations{ BaseReachableTime: defaultBaseReachableTime, LearnBaseReachableTime: true, MinRandomFactor: defaultMinRandomFactor, MaxRandomFactor: defaultMaxRandomFactor, RetransmitTimer: defaultRetransmitTimer, LearnRetransmitTimer: true, DelayFirstProbeTime: defaultDelayFirstProbeTime, MaxMulticastProbes: defaultMaxMulticastProbes, MaxUnicastProbes: defaultMaxUnicastProbes, MaxAnycastDelayTime: defaultMaxAnycastDelayTime, MaxReachabilityConfirmations: defaultMaxReachbilityConfirmations, } } // resetInvalidFields modifies an invalid NDPConfigurations with valid values. // If invalid values are present in c, the corresponding default values will be // used instead. This is needed to check, and conditionally fix, user-specified // NUDConfigurations. 
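// For example (illustrative values), a configuration supplied as
//
//	c := NUDConfigurations{MinRandomFactor: 2, MaxRandomFactor: 1}
//
// is repaired so that MaxRandomFactor becomes calcMaxRandomFactor(2) == 6,
// while zero-valued durations and probe counts fall back to the RFC 4861
// defaults defined at the top of this file.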
func (c *NUDConfigurations) resetInvalidFields() { if c.BaseReachableTime < minimumBaseReachableTime { c.BaseReachableTime = defaultBaseReachableTime } if c.MinRandomFactor <= 0 { c.MinRandomFactor = defaultMinRandomFactor } if c.MaxRandomFactor < c.MinRandomFactor { c.MaxRandomFactor = calcMaxRandomFactor(c.MinRandomFactor) } if c.RetransmitTimer < minimumRetransmitTimer { c.RetransmitTimer = defaultRetransmitTimer } if c.DelayFirstProbeTime == 0 { c.DelayFirstProbeTime = defaultDelayFirstProbeTime } if c.MaxMulticastProbes == 0 { c.MaxMulticastProbes = defaultMaxMulticastProbes } if c.MaxUnicastProbes == 0 { c.MaxUnicastProbes = defaultMaxUnicastProbes } } // calcMaxRandomFactor calculates the maximum value of the random factor used // for computing reachable time. This function is necessary for when the // default specified in RFC 4861 section 10 is less than the current // MinRandomFactor. // // Assumes minRandomFactor is positive since validation of the minimum value // should come before the validation of the maximum. func calcMaxRandomFactor(minRandomFactor float32) float32 { if minRandomFactor > defaultMaxRandomFactor { return minRandomFactor * 3 } return defaultMaxRandomFactor } // +stateify savable type nudStateMu struct { sync.RWMutex `state:"nosave"` config NUDConfigurations // reachableTime is the duration to wait for a REACHABLE entry to // transition into STALE after inactivity. This value is calculated with // the algorithm defined in RFC 4861 section 6.3.2. reachableTime time.Duration expiration tcpip.MonotonicTime prevBaseReachableTime time.Duration prevMinRandomFactor float32 prevMaxRandomFactor float32 } // NUDState stores states needed for calculating reachable time. // // +stateify savable type NUDState struct { clock tcpip.Clock // TODO(b/341946753): Restore when netstack is savable. rng *rand.Rand `state:"nosave"` mu nudStateMu } // NewNUDState returns new NUDState using c as configuration and the specified // random number generator for use in recomputing ReachableTime. func NewNUDState(c NUDConfigurations, clock tcpip.Clock, rng *rand.Rand) *NUDState { s := &NUDState{ clock: clock, rng: rng, } s.mu.config = c return s } // Config returns the NUD configuration. func (s *NUDState) Config() NUDConfigurations { s.mu.RLock() defer s.mu.RUnlock() return s.mu.config } // SetConfig replaces the existing NUD configurations with c. func (s *NUDState) SetConfig(c NUDConfigurations) { s.mu.Lock() defer s.mu.Unlock() s.mu.config = c } // ReachableTime returns the duration to wait for a REACHABLE entry to // transition into STALE after inactivity. This value is recalculated for new // values of BaseReachableTime, MinRandomFactor, and MaxRandomFactor using the // algorithm defined in RFC 4861 section 6.3.2. func (s *NUDState) ReachableTime() time.Duration { s.mu.Lock() defer s.mu.Unlock() if s.clock.NowMonotonic().After(s.mu.expiration) || s.mu.config.BaseReachableTime != s.mu.prevBaseReachableTime || s.mu.config.MinRandomFactor != s.mu.prevMinRandomFactor || s.mu.config.MaxRandomFactor != s.mu.prevMaxRandomFactor { s.recomputeReachableTimeLocked() } return s.mu.reachableTime } // recomputeReachableTimeLocked forces a recalculation of ReachableTime using // the algorithm defined in RFC 4861 section 6.3.2. // // This SHOULD automatically be invoked during certain situations, as per // RFC 4861 section 6.3.4: // // If the received Reachable Time value is non-zero, the host SHOULD set its // BaseReachableTime variable to the received value. 
If the new value // differs from the previous value, the host SHOULD re-compute a new random // ReachableTime value. ReachableTime is computed as a uniformly // distributed random value between MIN_RANDOM_FACTOR and MAX_RANDOM_FACTOR // times the BaseReachableTime. Using a random component eliminates the // possibility that Neighbor Unreachability Detection messages will // synchronize with each other. // // In most cases, the advertised Reachable Time value will be the same in // consecutive Router Advertisements, and a host's BaseReachableTime rarely // changes. In such cases, an implementation SHOULD ensure that a new // random value gets re-computed at least once every few hours. // // s.mu MUST be locked for writing. func (s *NUDState) recomputeReachableTimeLocked() { s.mu.prevBaseReachableTime = s.mu.config.BaseReachableTime s.mu.prevMinRandomFactor = s.mu.config.MinRandomFactor s.mu.prevMaxRandomFactor = s.mu.config.MaxRandomFactor randomFactor := s.mu.config.MinRandomFactor + s.rng.Float32()*(s.mu.config.MaxRandomFactor-s.mu.config.MinRandomFactor) // Check for overflow, given that minRandomFactor and maxRandomFactor are // guaranteed to be positive numbers. if math.MaxInt64/randomFactor < float32(s.mu.config.BaseReachableTime) { s.mu.reachableTime = time.Duration(math.MaxInt64) } else if randomFactor == 1 { // Avoid loss of precision when a large base reachable time is used. s.mu.reachableTime = s.mu.config.BaseReachableTime } else { reachableTime := int64(float32(s.mu.config.BaseReachableTime) * randomFactor) s.mu.reachableTime = time.Duration(reachableTime) } s.mu.expiration = s.clock.NowMonotonic().Add(2 * time.Hour) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/packet_buffer.go000066400000000000000000000537201465435605700250060ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "io" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) type headerType int const ( virtioNetHeader headerType = iota linkHeader networkHeader transportHeader numHeaderType ) var pkPool = sync.Pool{ New: func() any { return &PacketBuffer{} }, } // PacketBufferOptions specifies options for PacketBuffer creation. type PacketBufferOptions struct { // ReserveHeaderBytes is the number of bytes to reserve for headers. Total // number of bytes pushed onto the headers must not exceed this value. ReserveHeaderBytes int // Payload is the initial unparsed data for the new packet. If set, it will // be owned by the new packet. Payload buffer.Buffer // IsForwardedPacket identifies that the PacketBuffer being created is for a // forwarded packet. IsForwardedPacket bool // OnRelease is a function to be run when the packet buffer is no longer // referenced (released back to the pool). OnRelease func() } // A PacketBuffer contains all the data of a network packet. 
// // As a PacketBuffer traverses up the stack, it may be necessary to pass it to // multiple endpoints. // // The whole packet is expected to be a series of bytes in the following order: // LinkHeader, NetworkHeader, TransportHeader, and Data. Any of them can be // empty. Use of PacketBuffer in any other order is unsupported. // // PacketBuffer must be created with NewPacketBuffer, which sets the initial // reference count to 1. Owners should call `DecRef()` when they are finished // with the buffer to return it to the pool. // // Internal structure: A PacketBuffer holds a pointer to buffer.Buffer, which // exposes a logically-contiguous byte storage. The underlying storage structure // is abstracted out, and should not be a concern here for most of the time. // // |- reserved ->| // |--->| consumed (incoming) // 0 V V // +--------+----+----+--------------------+ // | | | | current data ... | (buf) // +--------+----+----+--------------------+ // ^ | // |<---| pushed (outgoing) // // When a PacketBuffer is created, a `reserved` header region can be specified, // which stack pushes headers in this region for an outgoing packet. There could // be no such region for an incoming packet, and `reserved` is 0. The value of // `reserved` never changes in the entire lifetime of the packet. // // Outgoing Packet: When a header is pushed, `pushed` gets incremented by the // pushed length, and the current value is stored for each header. PacketBuffer // subtracts this value from `reserved` to compute the starting offset of each // header in `buf`. // // Incoming Packet: When a header is consumed (a.k.a. parsed), the current // `consumed` value is stored for each header, and it gets incremented by the // consumed length. PacketBuffer adds this value to `reserved` to compute the // starting offset of each header in `buf`. // // +stateify savable type PacketBuffer struct { _ sync.NoCopy packetBufferRefs // buf is the underlying buffer for the packet. See struct level docs for // details. buf buffer.Buffer reserved int pushed int consumed int // headers stores metadata about each header. headers [numHeaderType]headerInfo // NetworkProtocolNumber is only valid when NetworkHeader().View().IsEmpty() // returns false. // TODO(gvisor.dev/issue/3574): Remove the separately passed protocol // numbers in registration APIs that take a PacketBuffer. NetworkProtocolNumber tcpip.NetworkProtocolNumber // TransportProtocol is only valid if it is non zero. // TODO(gvisor.dev/issue/3810): This and the network protocol number should // be moved into the headerinfo. This should resolve the validity issue. TransportProtocolNumber tcpip.TransportProtocolNumber // Hash is the transport layer hash of this packet. A value of zero // indicates no valid hash has been set. Hash uint32 // Owner is implemented by task to get the uid and gid. // Only set for locally generated packets. Owner tcpip.PacketOwner // The following fields are only set by the qdisc layer when the packet // is added to a queue. EgressRoute RouteInfo GSOOptions GSO // snatDone indicates if the packet's source has been manipulated as per // iptables NAT table. snatDone bool // dnatDone indicates if the packet's destination has been manipulated as per // iptables NAT table. dnatDone bool // PktType indicates the SockAddrLink.PacketType of the packet as defined in // https://www.man7.org/linux/man-pages/man7/packet.7.html. PktType tcpip.PacketType // NICID is the ID of the last interface the network packet was handled at. 
NICID tcpip.NICID // RXChecksumValidated indicates that checksum verification may be // safely skipped. RXChecksumValidated bool // NetworkPacketInfo holds an incoming packet's network-layer information. NetworkPacketInfo NetworkPacketInfo tuple *tuple // onRelease is a function to be run when the packet buffer is no longer // referenced (released back to the pool). onRelease func() `state:"nosave"` } // NewPacketBuffer creates a new PacketBuffer with opts. func NewPacketBuffer(opts PacketBufferOptions) *PacketBuffer { pk := pkPool.Get().(*PacketBuffer) pk.reset() if opts.ReserveHeaderBytes != 0 { v := buffer.NewViewSize(opts.ReserveHeaderBytes) pk.buf.Append(v) pk.reserved = opts.ReserveHeaderBytes } if opts.Payload.Size() > 0 { pk.buf.Merge(&opts.Payload) } pk.NetworkPacketInfo.IsForwardedPacket = opts.IsForwardedPacket pk.onRelease = opts.OnRelease pk.InitRefs() return pk } // IncRef increments the PacketBuffer's refcount. func (pk *PacketBuffer) IncRef() *PacketBuffer { pk.packetBufferRefs.IncRef() return pk } // DecRef decrements the PacketBuffer's refcount. If the refcount is // decremented to zero, the PacketBuffer is returned to the PacketBuffer // pool. func (pk *PacketBuffer) DecRef() { pk.packetBufferRefs.DecRef(func() { if pk.onRelease != nil { pk.onRelease() } pk.buf.Release() pkPool.Put(pk) }) } func (pk *PacketBuffer) reset() { *pk = PacketBuffer{} } // ReservedHeaderBytes returns the number of bytes initially reserved for // headers. func (pk *PacketBuffer) ReservedHeaderBytes() int { return pk.reserved } // AvailableHeaderBytes returns the number of bytes currently available for // headers. This is relevant to PacketHeader.Push method only. func (pk *PacketBuffer) AvailableHeaderBytes() int { return pk.reserved - pk.pushed } // VirtioNetHeader returns the handle to virtio-layer header. func (pk *PacketBuffer) VirtioNetHeader() PacketHeader { return PacketHeader{ pk: pk, typ: virtioNetHeader, } } // LinkHeader returns the handle to link-layer header. func (pk *PacketBuffer) LinkHeader() PacketHeader { return PacketHeader{ pk: pk, typ: linkHeader, } } // NetworkHeader returns the handle to network-layer header. func (pk *PacketBuffer) NetworkHeader() PacketHeader { return PacketHeader{ pk: pk, typ: networkHeader, } } // TransportHeader returns the handle to transport-layer header. func (pk *PacketBuffer) TransportHeader() PacketHeader { return PacketHeader{ pk: pk, typ: transportHeader, } } // HeaderSize returns the total size of all headers in bytes. func (pk *PacketBuffer) HeaderSize() int { return pk.pushed + pk.consumed } // Size returns the size of packet in bytes. func (pk *PacketBuffer) Size() int { return int(pk.buf.Size()) - pk.headerOffset() } // MemSize returns the estimation size of the pk in memory, including backing // buffer data. func (pk *PacketBuffer) MemSize() int { return int(pk.buf.Size()) + PacketBufferStructSize } // Data returns the handle to data portion of pk. func (pk *PacketBuffer) Data() PacketData { return PacketData{pk: pk} } // AsSlices returns the underlying storage of the whole packet. // // Note that AsSlices can allocate a lot. In hot paths it may be preferable to // iterate over a PacketBuffer's data via AsViewList. 
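// A minimal sketch of that iteration, assuming the ViewList is walked from its
// front and the header offset returned alongside it is skipped first:
//
//	vl, offset := pkt.AsViewList()
//	for v := vl.Front(); v != nil; v = v.Next() {
//		s := v.AsSlice()
//		if offset >= len(s) {
//			offset -= len(s)
//			continue
//		}
//		handle(s[offset:]) // handle is a placeholder for caller logic
//		offset = 0
//	}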
func (pk *PacketBuffer) AsSlices() [][]byte { vl := pk.buf.AsViewList() views := make([][]byte, 0, vl.Len()) offset := pk.headerOffset() pk.buf.SubApply(offset, int(pk.buf.Size())-offset, func(v *buffer.View) { views = append(views, v.AsSlice()) }) return views } // AsViewList returns the list of Views backing the PacketBuffer along with the // header offset into them. Users may not save or modify the ViewList returned. func (pk *PacketBuffer) AsViewList() (buffer.ViewList, int) { return pk.buf.AsViewList(), pk.headerOffset() } // ToBuffer returns a caller-owned copy of the underlying storage of the whole // packet. func (pk *PacketBuffer) ToBuffer() buffer.Buffer { b := pk.buf.Clone() b.TrimFront(int64(pk.headerOffset())) return b } // ToView returns a caller-owned copy of the underlying storage of the whole // packet as a view. func (pk *PacketBuffer) ToView() *buffer.View { p := buffer.NewView(int(pk.buf.Size())) offset := pk.headerOffset() pk.buf.SubApply(offset, int(pk.buf.Size())-offset, func(v *buffer.View) { p.Write(v.AsSlice()) }) return p } func (pk *PacketBuffer) headerOffset() int { return pk.reserved - pk.pushed } func (pk *PacketBuffer) headerOffsetOf(typ headerType) int { return pk.reserved + pk.headers[typ].offset } func (pk *PacketBuffer) dataOffset() int { return pk.reserved + pk.consumed } func (pk *PacketBuffer) push(typ headerType, size int) []byte { h := &pk.headers[typ] if h.length > 0 { panic(fmt.Sprintf("push(%s, %d) called after previous push", typ, size)) } if pk.pushed+size > pk.reserved { panic(fmt.Sprintf("push(%s, %d) overflows; pushed=%d reserved=%d", typ, size, pk.pushed, pk.reserved)) } pk.pushed += size h.offset = -pk.pushed h.length = size view := pk.headerView(typ) return view.AsSlice() } func (pk *PacketBuffer) consume(typ headerType, size int) (v []byte, consumed bool) { h := &pk.headers[typ] if h.length > 0 { panic(fmt.Sprintf("consume must not be called twice: type %s", typ)) } if pk.reserved+pk.consumed+size > int(pk.buf.Size()) { return nil, false } h.offset = pk.consumed h.length = size pk.consumed += size view := pk.headerView(typ) return view.AsSlice(), true } func (pk *PacketBuffer) headerView(typ headerType) buffer.View { h := &pk.headers[typ] if h.length == 0 { return buffer.View{} } v, ok := pk.buf.PullUp(pk.headerOffsetOf(typ), h.length) if !ok { panic("PullUp failed") } return v } // Clone makes a semi-deep copy of pk. The underlying packet payload is // shared. Hence, no modifications is done to underlying packet payload. func (pk *PacketBuffer) Clone() *PacketBuffer { newPk := pkPool.Get().(*PacketBuffer) newPk.reset() newPk.buf = pk.buf.Clone() newPk.reserved = pk.reserved newPk.pushed = pk.pushed newPk.consumed = pk.consumed newPk.headers = pk.headers newPk.Hash = pk.Hash newPk.Owner = pk.Owner newPk.GSOOptions = pk.GSOOptions newPk.NetworkProtocolNumber = pk.NetworkProtocolNumber newPk.dnatDone = pk.dnatDone newPk.snatDone = pk.snatDone newPk.TransportProtocolNumber = pk.TransportProtocolNumber newPk.PktType = pk.PktType newPk.NICID = pk.NICID newPk.RXChecksumValidated = pk.RXChecksumValidated newPk.NetworkPacketInfo = pk.NetworkPacketInfo newPk.tuple = pk.tuple newPk.InitRefs() return newPk } // ReserveHeaderBytes prepends reserved space for headers at the front // of the underlying buf. Can only be called once per packet. func (pk *PacketBuffer) ReserveHeaderBytes(reserved int) { if pk.reserved != 0 { panic(fmt.Sprintf("ReserveHeaderBytes(...) 
called on packet with reserved=%d, want reserved=0", pk.reserved)) } pk.reserved = reserved pk.buf.Prepend(buffer.NewViewSize(reserved)) } // Network returns the network header as a header.Network. // // Network should only be called when NetworkHeader has been set. func (pk *PacketBuffer) Network() header.Network { switch netProto := pk.NetworkProtocolNumber; netProto { case header.IPv4ProtocolNumber: return header.IPv4(pk.NetworkHeader().Slice()) case header.IPv6ProtocolNumber: return header.IPv6(pk.NetworkHeader().Slice()) default: panic(fmt.Sprintf("unknown network protocol number %d", netProto)) } } // CloneToInbound makes a semi-deep copy of the packet buffer (similar to // Clone) to be used as an inbound packet. // // See PacketBuffer.Data for details about how a packet buffer holds an inbound // packet. func (pk *PacketBuffer) CloneToInbound() *PacketBuffer { newPk := pkPool.Get().(*PacketBuffer) newPk.reset() newPk.buf = pk.buf.Clone() newPk.InitRefs() // Treat unfilled header portion as reserved. newPk.reserved = pk.AvailableHeaderBytes() newPk.tuple = pk.tuple return newPk } // DeepCopyForForwarding creates a deep copy of the packet buffer for // forwarding. // // The returned packet buffer will have the network and transport headers // set if the original packet buffer did. func (pk *PacketBuffer) DeepCopyForForwarding(reservedHeaderBytes int) *PacketBuffer { payload := BufferSince(pk.NetworkHeader()) defer payload.Release() newPk := NewPacketBuffer(PacketBufferOptions{ ReserveHeaderBytes: reservedHeaderBytes, Payload: payload.DeepClone(), IsForwardedPacket: true, }) { consumeBytes := len(pk.NetworkHeader().Slice()) if _, consumed := newPk.NetworkHeader().Consume(consumeBytes); !consumed { panic(fmt.Sprintf("expected to consume network header %d bytes from new packet", consumeBytes)) } newPk.NetworkProtocolNumber = pk.NetworkProtocolNumber } { consumeBytes := len(pk.TransportHeader().Slice()) if _, consumed := newPk.TransportHeader().Consume(consumeBytes); !consumed { panic(fmt.Sprintf("expected to consume transport header %d bytes from new packet", consumeBytes)) } newPk.TransportProtocolNumber = pk.TransportProtocolNumber } newPk.tuple = pk.tuple return newPk } // headerInfo stores metadata about a header in a packet. // // +stateify savable type headerInfo struct { // offset is the offset of the header in pk.buf relative to // pk.buf[pk.reserved]. See the PacketBuffer struct for details. offset int // length is the length of this header. length int } // PacketHeader is a handle object to a header in the underlying packet. type PacketHeader struct { pk *PacketBuffer typ headerType } // View returns an caller-owned copy of the underlying storage of h as a // *buffer.View. func (h PacketHeader) View() *buffer.View { view := h.pk.headerView(h.typ) if view.Size() == 0 { return nil } return view.Clone() } // Slice returns the underlying storage of h as a []byte. The returned slice // should not be modified if the underlying packet could be shared, cloned, or // borrowed. func (h PacketHeader) Slice() []byte { view := h.pk.headerView(h.typ) return view.AsSlice() } // Push pushes size bytes in the front of its residing packet, and returns the // backing storage. Callers may only call one of Push or Consume once on each // header in the lifetime of the underlying packet. func (h PacketHeader) Push(size int) []byte { return h.pk.push(h.typ, size) } // Consume moves the first size bytes of the unparsed data portion in the packet // to h, and returns the backing storage. 
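// Illustrative sketch (not part of the original file): delivering one inbound
// packet to several endpoints, as motivated by the package comment above.
// Clone shares the underlying payload, so each recipient gets an independent
// refcount and metadata view without copying data; eps, nicID and proto are
// assumed to be supplied by the caller.
func deliverToAllSketch(eps []stack.PacketEndpoint, nicID tcpip.NICID, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
	for _, ep := range eps {
		clone := pkt.Clone()
		ep.HandlePacket(nicID, proto, clone)
		clone.DecRef()
	}
}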
In the case of data is shorter than // size, consumed will be false, and the state of h will not be affected. // Callers may only call one of Push or Consume once on each header in the // lifetime of the underlying packet. func (h PacketHeader) Consume(size int) (v []byte, consumed bool) { return h.pk.consume(h.typ, size) } // PacketData represents the data portion of a PacketBuffer. // // +stateify savable type PacketData struct { pk *PacketBuffer } // PullUp returns a contiguous slice of size bytes from the beginning of d. // Callers should not keep the view for later use. Callers can write to the // returned slice if they have singular ownership over the underlying // Buffer. func (d PacketData) PullUp(size int) (b []byte, ok bool) { view, ok := d.pk.buf.PullUp(d.pk.dataOffset(), size) return view.AsSlice(), ok } // Consume is the same as PullUp except that is additionally consumes the // returned bytes. Subsequent PullUp or Consume will not return these bytes. func (d PacketData) Consume(size int) ([]byte, bool) { v, ok := d.PullUp(size) if ok { d.pk.consumed += size } return v, ok } // ReadTo reads bytes from d to dst. It also removes these bytes from d // unless peek is true. func (d PacketData) ReadTo(dst io.Writer, peek bool) (int, error) { var ( err error done int ) offset := d.pk.dataOffset() d.pk.buf.SubApply(offset, int(d.pk.buf.Size())-offset, func(v *buffer.View) { if err != nil { return } var n int n, err = dst.Write(v.AsSlice()) done += n if err != nil { return } if n != v.Size() { panic(fmt.Sprintf("io.Writer.Write succeeded with incomplete write: %d != %d", n, v.Size())) } }) if !peek { d.pk.buf.TrimFront(int64(done)) } return done, err } // CapLength reduces d to at most length bytes. func (d PacketData) CapLength(length int) { if length < 0 { panic("length < 0") } d.pk.buf.Truncate(int64(length + d.pk.dataOffset())) } // ToBuffer returns the underlying storage of d in a buffer.Buffer. func (d PacketData) ToBuffer() buffer.Buffer { buf := d.pk.buf.Clone() offset := d.pk.dataOffset() buf.TrimFront(int64(offset)) return buf } // AppendView appends v into d, taking the ownership of v. func (d PacketData) AppendView(v *buffer.View) { d.pk.buf.Append(v) } // MergeBuffer merges b into d and clears b. func (d PacketData) MergeBuffer(b *buffer.Buffer) { d.pk.buf.Merge(b) } // MergeFragment appends the data portion of frag to dst. It modifies // frag and frag should not be used again. func MergeFragment(dst, frag *PacketBuffer) { frag.buf.TrimFront(int64(frag.dataOffset())) dst.buf.Merge(&frag.buf) } // ReadFrom moves at most count bytes from the beginning of src to the end // of d and returns the number of bytes moved. func (d PacketData) ReadFrom(src *buffer.Buffer, count int) int { toRead := int64(count) if toRead > src.Size() { toRead = src.Size() } clone := src.Clone() clone.Truncate(toRead) d.pk.buf.Merge(&clone) src.TrimFront(toRead) return int(toRead) } // ReadFromPacketData moves count bytes from the beginning of oth to the end of // d. func (d PacketData) ReadFromPacketData(oth PacketData, count int) { buf := oth.ToBuffer() buf.Truncate(int64(count)) d.MergeBuffer(&buf) oth.TrimFront(count) buf.Release() } // Merge clears headers in oth and merges its data with d. func (d PacketData) Merge(oth PacketData) { oth.pk.buf.TrimFront(int64(oth.pk.dataOffset())) d.pk.buf.Merge(&oth.pk.buf) } // TrimFront removes up to count bytes from the front of d's payload. 
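// Illustrative sketch (not part of the original file): parsing an inbound IPv4
// frame received as a raw byte slice, assuming the buffer, header and stack
// packages are imported. Consume records each header's offset and length and
// advances the data portion; PayloadSince can then recover everything from a
// given header onwards (for example, when generating an ICMP error). Real
// parsing code would honor the IPv4 header length (IHL) rather than the
// minimum size used here.
func parseInboundIPv4Sketch(frame []byte) {
	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
		Payload: buffer.MakeWithData(frame),
	})
	defer pkt.DecRef()
	ipHdr, ok := pkt.NetworkHeader().Consume(header.IPv4MinimumSize)
	if !ok {
		return // Truncated packet.
	}
	if header.IPv4(ipHdr).TransportProtocol() == header.UDPProtocolNumber {
		if _, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize); ok {
			// pkt.Data() now covers only the UDP payload.
			_ = pkt.Data().Size()
		}
	}
	// A caller-owned copy of the packet from the network header onwards.
	view := stack.PayloadSince(pkt.NetworkHeader())
	view.Release()
}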
func (d PacketData) TrimFront(count int) { if count > d.Size() { count = d.Size() } buf := d.pk.Data().ToBuffer() buf.TrimFront(int64(count)) d.pk.buf.Truncate(int64(d.pk.dataOffset())) d.pk.buf.Merge(&buf) } // Size returns the number of bytes in the data payload of the packet. func (d PacketData) Size() int { return int(d.pk.buf.Size()) - d.pk.dataOffset() } // AsRange returns a Range representing the current data payload of the packet. func (d PacketData) AsRange() Range { return Range{ pk: d.pk, offset: d.pk.dataOffset(), length: d.Size(), } } // Checksum returns a checksum over the data payload of the packet. func (d PacketData) Checksum() uint16 { return d.pk.buf.Checksum(d.pk.dataOffset()) } // ChecksumAtOffset returns a checksum over the data payload of the packet // starting from offset. func (d PacketData) ChecksumAtOffset(offset int) uint16 { return d.pk.buf.Checksum(offset) } // Range represents a contiguous subportion of a PacketBuffer. type Range struct { pk *PacketBuffer offset int length int } // Size returns the number of bytes in r. func (r Range) Size() int { return r.length } // SubRange returns a new Range starting at off bytes of r. It returns an empty // range if off is out-of-bounds. func (r Range) SubRange(off int) Range { if off > r.length { return Range{pk: r.pk} } return Range{ pk: r.pk, offset: r.offset + off, length: r.length - off, } } // Capped returns a new Range with the same starting point of r and length // capped at max. func (r Range) Capped(max int) Range { if r.length <= max { return r } return Range{ pk: r.pk, offset: r.offset, length: max, } } // ToSlice returns a caller-owned copy of data in r. func (r Range) ToSlice() []byte { if r.length == 0 { return nil } all := make([]byte, 0, r.length) r.iterate(func(v *buffer.View) { all = append(all, v.AsSlice()...) }) return all } // ToView returns a caller-owned copy of data in r. func (r Range) ToView() *buffer.View { if r.length == 0 { return nil } newV := buffer.NewView(r.length) r.iterate(func(v *buffer.View) { newV.Write(v.AsSlice()) }) return newV } // iterate calls fn for each piece in r. fn is always called with a non-empty // slice. func (r Range) iterate(fn func(*buffer.View)) { r.pk.buf.SubApply(r.offset, r.length, fn) } // PayloadSince returns a caller-owned view containing the payload starting from // and including a particular header. func PayloadSince(h PacketHeader) *buffer.View { offset := h.pk.headerOffset() for i := headerType(0); i < h.typ; i++ { offset += h.pk.headers[i].length } return Range{ pk: h.pk, offset: offset, length: int(h.pk.buf.Size()) - offset, }.ToView() } // BufferSince returns a caller-owned view containing the packet payload // starting from and including a particular header. func BufferSince(h PacketHeader) buffer.Buffer { offset := h.pk.headerOffset() for i := headerType(0); i < h.typ; i++ { offset += h.pk.headers[i].length } clone := h.pk.buf.Clone() clone.TrimFront(int64(offset)) return clone } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/packet_buffer_list.go000066400000000000000000000045141465435605700260360ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack // PacketBufferList is a slice-backed list. All operations are O(1) unless // otherwise noted. // // Note: this is intentionally backed by a slice, not an intrusive list. We've // switched PacketBufferList back-and-forth between intrusive list and // slice-backed implementations, and the latter has proven to be preferable: // // - Intrusive lists are a refcounting nightmare, as modifying the list // sometimes-but-not-always modifies the list for others. // - The slice-backed implementation has been benchmarked and is slightly more // performant. // // +stateify savable type PacketBufferList struct { pbs []*PacketBuffer } // AsSlice returns a slice containing the packets in the list. // //go:nosplit func (pl *PacketBufferList) AsSlice() []*PacketBuffer { return pl.pbs } // Reset decrements all elements and resets the list to the empty state. // //go:nosplit func (pl *PacketBufferList) Reset() { for i, pb := range pl.pbs { pb.DecRef() pl.pbs[i] = nil } pl.pbs = pl.pbs[:0] } // Len returns the number of elements in the list. // //go:nosplit func (pl *PacketBufferList) Len() int { return len(pl.pbs) } // PushBack inserts the PacketBuffer at the back of the list. // //go:nosplit func (pl *PacketBufferList) PushBack(pb *PacketBuffer) { pl.pbs = append(pl.pbs, pb) } // PopFront removes the first element in the list if it exists and returns it. // //go:nosplit func (pl *PacketBufferList) PopFront() *PacketBuffer { if len(pl.pbs) == 0 { return nil } pkt := pl.pbs[0] pl.pbs = pl.pbs[1:] return pkt } // DecRef decreases the reference count on each PacketBuffer // stored in the list. // // NOTE: runs in O(n) time. // //go:nosplit func (pl PacketBufferList) DecRef() { for _, pb := range pl.pbs { pb.DecRef() } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/packet_buffer_refs.go000066400000000000000000000102641465435605700260210ustar00rootroot00000000000000package stack import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const packetBufferenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var packetBufferobj *PacketBuffer // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. 
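// Illustrative sketch (not part of the original file): batching packets with
// PacketBufferList, roughly as a link endpoint's write path might. The caller
// is assumed to own the packets passed in, so the list takes its own
// references; Reset releases anything still queued.
func drainBatchSketch(pkts []*stack.PacketBuffer) {
	var list stack.PacketBufferList
	for _, pkt := range pkts {
		list.PushBack(pkt.IncRef())
	}
	for pkt := list.PopFront(); pkt != nil; pkt = list.PopFront() {
		// ... write pkt to the wire ...
		pkt.DecRef()
	}
	list.Reset() // No-op here since the list is empty, but would release leftovers.
}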
// // +stateify savable type packetBufferRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *packetBufferRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *packetBufferRefs) RefType() string { return fmt.Sprintf("%T", packetBufferobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *packetBufferRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *packetBufferRefs) LogRefs() bool { return packetBufferenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *packetBufferRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *packetBufferRefs) IncRef() { v := r.refCount.Add(1) if packetBufferenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *packetBufferRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if packetBufferenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *packetBufferRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if packetBufferenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *packetBufferRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/packet_buffer_unsafe.go000066400000000000000000000017761465435605700263530ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import "unsafe" // PacketBufferStructSize is the minimal size of the packet buffer overhead. const PacketBufferStructSize = int(unsafe.Sizeof(PacketBuffer{})) // ID returns a unique ID for the underlying storage of the packet. // // Two *PacketBuffers have the same IDs if and only if they point to the same // location in memory. func (pk *PacketBuffer) ID() uintptr { return uintptr(unsafe.Pointer(pk)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/packet_endpoint_list_mutex.go000066400000000000000000000052201465435605700276220ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type packetEndpointListRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var packetEndpointListlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type packetEndpointListlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *packetEndpointListRWMutex) Lock() { locking.AddGLock(packetEndpointListprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *packetEndpointListRWMutex) NestedLock(i packetEndpointListlockNameIndex) { locking.AddGLock(packetEndpointListprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *packetEndpointListRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(packetEndpointListprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *packetEndpointListRWMutex) NestedUnlock(i packetEndpointListlockNameIndex) { m.mu.Unlock() locking.DelGLock(packetEndpointListprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *packetEndpointListRWMutex) RLock() { locking.AddGLock(packetEndpointListprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *packetEndpointListRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(packetEndpointListprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *packetEndpointListRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *packetEndpointListRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *packetEndpointListRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var packetEndpointListprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. 
func packetEndpointListinitLockNames() {} func init() { packetEndpointListinitLockNames() packetEndpointListprefixIndex = locking.NewMutexClass(reflect.TypeOf(packetEndpointListRWMutex{}), packetEndpointListlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/packet_eps_mutex.go000066400000000000000000000046461465435605700255510ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type packetEPsRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var packetEPslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type packetEPslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *packetEPsRWMutex) Lock() { locking.AddGLock(packetEPsprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *packetEPsRWMutex) NestedLock(i packetEPslockNameIndex) { locking.AddGLock(packetEPsprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *packetEPsRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(packetEPsprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *packetEPsRWMutex) NestedUnlock(i packetEPslockNameIndex) { m.mu.Unlock() locking.DelGLock(packetEPsprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *packetEPsRWMutex) RLock() { locking.AddGLock(packetEPsprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *packetEPsRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(packetEPsprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *packetEPsRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *packetEPsRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *packetEPsRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var packetEPsprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func packetEPsinitLockNames() {} func init() { packetEPsinitLockNames() packetEPsprefixIndex = locking.NewMutexClass(reflect.TypeOf(packetEPsRWMutex{}), packetEPslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/packets_pending_link_resolution_mutex.go000066400000000000000000000040011465435605700320520ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // Mutex is sync.Mutex with the correctness validator. type packetsPendingLinkResolutionMutex struct { mu sync.Mutex } var packetsPendingLinkResolutionprefixIndex *locking.MutexClass // lockNames is a list of user-friendly lock names. // Populated in init. var packetsPendingLinkResolutionlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. 
// Values are specified using the "consts" field of go_template_instance. type packetsPendingLinkResolutionlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *packetsPendingLinkResolutionMutex) Lock() { locking.AddGLock(packetsPendingLinkResolutionprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *packetsPendingLinkResolutionMutex) NestedLock(i packetsPendingLinkResolutionlockNameIndex) { locking.AddGLock(packetsPendingLinkResolutionprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *packetsPendingLinkResolutionMutex) Unlock() { locking.DelGLock(packetsPendingLinkResolutionprefixIndex, -1) m.mu.Unlock() } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *packetsPendingLinkResolutionMutex) NestedUnlock(i packetsPendingLinkResolutionlockNameIndex) { locking.DelGLock(packetsPendingLinkResolutionprefixIndex, int(i)) m.mu.Unlock() } // DO NOT REMOVE: The following function is automatically replaced. func packetsPendingLinkResolutioninitLockNames() {} func init() { packetsPendingLinkResolutioninitLockNames() packetsPendingLinkResolutionprefixIndex = locking.NewMutexClass(reflect.TypeOf(packetsPendingLinkResolutionMutex{}), packetsPendingLinkResolutionlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/pending_packets.go000066400000000000000000000151141465435605700253370ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" ) const ( // maxPendingResolutions is the maximum number of pending link-address // resolutions. maxPendingResolutions = 64 maxPendingPacketsPerResolution = 256 ) // +stateify savable type pendingPacket struct { routeInfo RouteInfo pkt *PacketBuffer } // +stateify savable type packetsPendingLinkResolutionMu struct { packetsPendingLinkResolutionMutex `state:"nosave"` // The packets to send once the resolver completes. // // The link resolution channel is used as the key for this map. packets map[<-chan struct{}][]pendingPacket // FIFO of channels used to cancel the oldest goroutine waiting for // link-address resolution. // // cancelChans holds the same channels that are used as keys to packets. cancelChans []<-chan struct{} } // packetsPendingLinkResolution is a queue of packets pending link resolution. // // Once link resolution completes successfully, the packets will be written. 
// // +stateify savable type packetsPendingLinkResolution struct { nic *nic mu packetsPendingLinkResolutionMu } func (f *packetsPendingLinkResolution) incrementOutgoingPacketErrors(pkt *PacketBuffer) { f.nic.stack.stats.IP.OutgoingPacketErrors.Increment() if ipEndpointStats, ok := f.nic.getNetworkEndpoint(pkt.NetworkProtocolNumber).Stats().(IPNetworkEndpointStats); ok { ipEndpointStats.IPStats().OutgoingPacketErrors.Increment() } } func (f *packetsPendingLinkResolution) init(nic *nic) { f.mu.Lock() defer f.mu.Unlock() f.nic = nic f.mu.packets = make(map[<-chan struct{}][]pendingPacket) } // cancel drains all pending packet queues and release all packet // references. func (f *packetsPendingLinkResolution) cancel() { f.mu.Lock() defer f.mu.Unlock() for ch, pendingPackets := range f.mu.packets { for _, p := range pendingPackets { p.pkt.DecRef() } delete(f.mu.packets, ch) } f.mu.cancelChans = nil } // dequeue any pending packets associated with ch. // // If err is nil, packets will be written and sent to the given remote link // address. func (f *packetsPendingLinkResolution) dequeue(ch <-chan struct{}, linkAddr tcpip.LinkAddress, err tcpip.Error) { f.mu.Lock() packets, ok := f.mu.packets[ch] delete(f.mu.packets, ch) if ok { for i, cancelChan := range f.mu.cancelChans { if cancelChan == ch { f.mu.cancelChans = append(f.mu.cancelChans[:i], f.mu.cancelChans[i+1:]...) break } } } f.mu.Unlock() if ok { f.dequeuePackets(packets, linkAddr, err) } } // enqueue a packet to be sent once link resolution completes. // // If the maximum number of pending resolutions is reached, the packets // associated with the oldest link resolution will be dequeued as if they failed // link resolution. func (f *packetsPendingLinkResolution) enqueue(r *Route, pkt *PacketBuffer) tcpip.Error { f.mu.Lock() // Make sure we attempt resolution while holding f's lock so that we avoid // a race where link resolution completes before we enqueue the packets. // // A @ T1: Call ResolvedFields (get link resolution channel) // B @ T2: Complete link resolution, dequeue pending packets // C @ T1: Enqueue packet that already completed link resolution (which will // never dequeue) // // To make sure B does not interleave with A and C, we make sure A and C are // done while holding the lock. routeInfo, ch, err := r.resolvedFields(nil) switch err.(type) { case nil: // The route resolved immediately, so we don't need to wait for link // resolution to send the packet. f.mu.Unlock() pkt.EgressRoute = routeInfo return f.nic.writePacket(pkt) case *tcpip.ErrWouldBlock: // We need to wait for link resolution to complete. default: f.mu.Unlock() return err } defer f.mu.Unlock() packets, ok := f.mu.packets[ch] packets = append(packets, pendingPacket{ routeInfo: routeInfo, pkt: pkt.IncRef(), }) if len(packets) > maxPendingPacketsPerResolution { f.incrementOutgoingPacketErrors(packets[0].pkt) packets[0].pkt.DecRef() packets[0] = pendingPacket{} packets = packets[1:] if numPackets := len(packets); numPackets != maxPendingPacketsPerResolution { panic(fmt.Sprintf("holding more queued packets than expected; got = %d, want <= %d", numPackets, maxPendingPacketsPerResolution)) } } f.mu.packets[ch] = packets if ok { return nil } cancelledPackets := f.newCancelChannelLocked(ch) if len(cancelledPackets) != 0 { // Dequeue the pending packets in a new goroutine to not hold up the current // goroutine as handing link resolution failures may be a costly operation. 
go f.dequeuePackets(cancelledPackets, "" /* linkAddr */, &tcpip.ErrAborted{}) } return nil } // newCancelChannelLocked appends the link resolution channel to a FIFO. If the // maximum number of pending resolutions is reached, the oldest channel will be // removed and its associated pending packets will be returned. func (f *packetsPendingLinkResolution) newCancelChannelLocked(newCH <-chan struct{}) []pendingPacket { f.mu.cancelChans = append(f.mu.cancelChans, newCH) if len(f.mu.cancelChans) <= maxPendingResolutions { return nil } ch := f.mu.cancelChans[0] f.mu.cancelChans[0] = nil f.mu.cancelChans = f.mu.cancelChans[1:] if l := len(f.mu.cancelChans); l > maxPendingResolutions { panic(fmt.Sprintf("max pending resolutions reached; got %d active resolutions, max = %d", l, maxPendingResolutions)) } packets, ok := f.mu.packets[ch] if !ok { panic("must have a packet queue for an uncancelled channel") } delete(f.mu.packets, ch) return packets } func (f *packetsPendingLinkResolution) dequeuePackets(packets []pendingPacket, linkAddr tcpip.LinkAddress, err tcpip.Error) { for _, p := range packets { if err == nil { p.routeInfo.RemoteLinkAddress = linkAddr p.pkt.EgressRoute = p.routeInfo _ = f.nic.writePacket(p.pkt) } else { f.incrementOutgoingPacketErrors(p.pkt) if linkResolvableEP, ok := f.nic.getNetworkEndpoint(p.pkt.NetworkProtocolNumber).(LinkResolvableNetworkEndpoint); ok { linkResolvableEP.HandleLinkResolutionFailure(p.pkt) } } p.pkt.DecRef() } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/rand.go000066400000000000000000000017471465435605700231340ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "math/rand" "gvisor.dev/gvisor/pkg/sync" ) // lockedRandomSource provides a threadsafe rand.Source. type lockedRandomSource struct { mu sync.Mutex src rand.Source } func (r *lockedRandomSource) Int63() (n int64) { r.mu.Lock() n = r.src.Int63() r.mu.Unlock() return n } func (r *lockedRandomSource) Seed(seed int64) { r.mu.Lock() r.src.Seed(seed) r.mu.Unlock() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/registration.go000066400000000000000000001420471465435605700247210ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
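// Illustrative sketch (not part of the original file): lockedRandomSource
// satisfies math/rand's Source interface, so it can back a *rand.Rand that is
// safe to share across goroutines; the seed value is a placeholder.
func newSharedRandSketch(seed int64) *rand.Rand {
	return rand.New(&lockedRandomSource{src: rand.NewSource(seed)})
}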
package stack import ( "fmt" "time" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/waiter" ) // NetworkEndpointID is the identifier of a network layer protocol endpoint. // Currently the local address is sufficient because all supported protocols // (i.e., IPv4 and IPv6) have different sizes for their addresses. type NetworkEndpointID struct { LocalAddress tcpip.Address } // TransportEndpointID is the identifier of a transport layer protocol endpoint. // // +stateify savable type TransportEndpointID struct { // LocalPort is the local port associated with the endpoint. LocalPort uint16 // LocalAddress is the local [network layer] address associated with // the endpoint. LocalAddress tcpip.Address // RemotePort is the remote port associated with the endpoint. RemotePort uint16 // RemoteAddress it the remote [network layer] address associated with // the endpoint. RemoteAddress tcpip.Address } // NetworkPacketInfo holds information about a network layer packet. // // +stateify savable type NetworkPacketInfo struct { // LocalAddressBroadcast is true if the packet's local address is a broadcast // address. LocalAddressBroadcast bool // IsForwardedPacket is true if the packet is being forwarded. IsForwardedPacket bool } // TransportErrorKind enumerates error types that are handled by the transport // layer. type TransportErrorKind int const ( // PacketTooBigTransportError indicates that a packet did not reach its // destination because a link on the path to the destination had an MTU that // was too small to carry the packet. PacketTooBigTransportError TransportErrorKind = iota // DestinationHostUnreachableTransportError indicates that the destination // host was unreachable. DestinationHostUnreachableTransportError // DestinationPortUnreachableTransportError indicates that a packet reached // the destination host, but the transport protocol was not active on the // destination port. DestinationPortUnreachableTransportError // DestinationNetworkUnreachableTransportError indicates that the destination // network was unreachable. DestinationNetworkUnreachableTransportError // DestinationProtoUnreachableTransportError indicates that the destination // protocol was unreachable. DestinationProtoUnreachableTransportError // SourceRouteFailedTransportError indicates that the source route failed. SourceRouteFailedTransportError // SourceHostIsolatedTransportError indicates that the source machine is not // on the network. SourceHostIsolatedTransportError // DestinationHostDownTransportError indicates that the destination host is // down. DestinationHostDownTransportError ) // TransportError is a marker interface for errors that may be handled by the // transport layer. type TransportError interface { tcpip.SockErrorCause // Kind returns the type of the transport error. Kind() TransportErrorKind } // TransportEndpoint is the interface that needs to be implemented by transport // protocol (e.g., tcp, udp) endpoints that can handle packets. type TransportEndpoint interface { // HandlePacket is called by the stack when new packets arrive to this // transport endpoint. It sets the packet buffer's transport header. // // HandlePacket may modify the packet. HandlePacket(TransportEndpointID, *PacketBuffer) // HandleError is called when the transport endpoint receives an error. // // HandleError takes may modify the packet buffer. HandleError(TransportError, *PacketBuffer) // Abort initiates an expedited endpoint teardown. 
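// Illustrative sketch (not part of the original file): the 4-tuple used to
// demultiplex a transport flow to an endpoint; the addresses and ports are
// placeholders.
var exampleEndpointIDSketch = stack.TransportEndpointID{
	LocalPort:     53,
	LocalAddress:  tcpip.AddrFrom4([4]byte{192, 0, 2, 1}),
	RemotePort:    40000,
	RemoteAddress: tcpip.AddrFrom4([4]byte{198, 51, 100, 7}),
}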
It puts the endpoint // in a closed state and frees all resources associated with it. This // cleanup may happen asynchronously. Wait can be used to block on this // asynchronous cleanup. Abort() // Wait waits for any worker goroutines owned by the endpoint to stop. // // An endpoint can be requested to stop its worker goroutines by calling // its Close method. // // Wait will not block if the endpoint hasn't started any goroutines // yet, even if it might later. Wait() } // RawTransportEndpoint is the interface that needs to be implemented by raw // transport protocol endpoints. RawTransportEndpoints receive the entire // packet - including the network and transport headers - as delivered to // netstack. type RawTransportEndpoint interface { // HandlePacket is called by the stack when new packets arrive to // this transport endpoint. The packet contains all data from the link // layer up. // // HandlePacket may modify the packet. HandlePacket(*PacketBuffer) } // PacketEndpoint is the interface that needs to be implemented by packet // transport protocol endpoints. These endpoints receive link layer headers in // addition to whatever they contain (usually network and transport layer // headers and a payload). type PacketEndpoint interface { // HandlePacket is called by the stack when new packets arrive that // match the endpoint. // // Implementers should treat packet as immutable and should copy it // before before modification. // // linkHeader may have a length of 0, in which case the PacketEndpoint // should construct its own ethernet header for applications. // // HandlePacket may modify pkt. HandlePacket(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer) } // UnknownDestinationPacketDisposition enumerates the possible return values from // HandleUnknownDestinationPacket(). type UnknownDestinationPacketDisposition int const ( // UnknownDestinationPacketMalformed denotes that the packet was malformed // and no further processing should be attempted other than updating // statistics. UnknownDestinationPacketMalformed UnknownDestinationPacketDisposition = iota // UnknownDestinationPacketUnhandled tells the caller that the packet was // well formed but that the issue was not handled and the stack should take // the default action. UnknownDestinationPacketUnhandled // UnknownDestinationPacketHandled tells the caller that it should do // no further processing. UnknownDestinationPacketHandled ) // TransportProtocol is the interface that needs to be implemented by transport // protocols (e.g., tcp, udp) that want to be part of the networking stack. type TransportProtocol interface { // Number returns the transport protocol number. Number() tcpip.TransportProtocolNumber // NewEndpoint creates a new endpoint of the transport protocol. NewEndpoint(netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) // NewRawEndpoint creates a new raw endpoint of the transport protocol. NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) // MinimumPacketSize returns the minimum valid packet size of this // transport protocol. The stack automatically drops any packets smaller // than this targeted at this protocol. MinimumPacketSize() int // ParsePorts returns the source and destination ports stored in a // packet of this protocol. 
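// Illustrative sketch (not part of the original file): how a simple transport
// protocol with a fixed 8-byte header might implement MinimumPacketSize,
// ParsePorts and Parse. fakeProtocolSketch is hypothetical; real
// implementations live under pkg/tcpip/transport. Like the existing
// protocols, ParsePorts relies on the stack dropping packets smaller than
// MinimumPacketSize rather than re-checking lengths.
type fakeProtocolSketch struct{}

func (fakeProtocolSketch) MinimumPacketSize() int { return 8 }

func (fakeProtocolSketch) ParsePorts(b []byte) (src, dst uint16, _ tcpip.Error) {
	return uint16(b[0])<<8 | uint16(b[1]), uint16(b[2])<<8 | uint16(b[3]), nil
}

func (fakeProtocolSketch) Parse(pkt *PacketBuffer) bool {
	// Record the fixed-size transport header; the remaining bytes stay in
	// pkt.Data().
	_, ok := pkt.TransportHeader().Consume(8)
	return ok
}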
ParsePorts(b []byte) (src, dst uint16, err tcpip.Error) // HandleUnknownDestinationPacket handles packets targeted at this // protocol that don't match any existing endpoint. For example, // it is targeted at a port that has no listeners. // // HandleUnknownDestinationPacket may modify the packet if it handles // the issue. HandleUnknownDestinationPacket(TransportEndpointID, *PacketBuffer) UnknownDestinationPacketDisposition // SetOption allows enabling/disabling protocol specific features. // SetOption returns an error if the option is not supported or the // provided option value is invalid. SetOption(option tcpip.SettableTransportProtocolOption) tcpip.Error // Option allows retrieving protocol specific option values. // Option returns an error if the option is not supported or the // provided option value is invalid. Option(option tcpip.GettableTransportProtocolOption) tcpip.Error // Close requests that any worker goroutines owned by the protocol // stop. Close() // Wait waits for any worker goroutines owned by the protocol to stop. Wait() // Pause requests that any protocol level background workers pause. Pause() // Resume resumes any protocol level background workers that were // previously paused by Pause. Resume() // Parse sets pkt.TransportHeader and trims pkt.Data appropriately. It does // neither and returns false if pkt.Data is too small, i.e. pkt.Data.Size() < // MinimumPacketSize() Parse(pkt *PacketBuffer) (ok bool) } // TransportPacketDisposition is the result from attempting to deliver a packet // to the transport layer. type TransportPacketDisposition int const ( // TransportPacketHandled indicates that a transport packet was handled by the // transport layer and callers need not take any further action. TransportPacketHandled TransportPacketDisposition = iota // TransportPacketProtocolUnreachable indicates that the transport // protocol requested in the packet is not supported. TransportPacketProtocolUnreachable // TransportPacketDestinationPortUnreachable indicates that there weren't any // listeners interested in the packet and the transport protocol has no means // to notify the sender. TransportPacketDestinationPortUnreachable ) // TransportDispatcher contains the methods used by the network stack to deliver // packets to the appropriate transport endpoint after it has been handled by // the network layer. type TransportDispatcher interface { // DeliverTransportPacket delivers packets to the appropriate // transport protocol endpoint. // // pkt.NetworkHeader must be set before calling DeliverTransportPacket. // // DeliverTransportPacket may modify the packet. DeliverTransportPacket(tcpip.TransportProtocolNumber, *PacketBuffer) TransportPacketDisposition // DeliverTransportError delivers an error to the appropriate transport // endpoint. // // DeliverTransportError may modify the packet buffer. DeliverTransportError(local, remote tcpip.Address, _ tcpip.NetworkProtocolNumber, _ tcpip.TransportProtocolNumber, _ TransportError, _ *PacketBuffer) // DeliverRawPacket delivers a packet to any subscribed raw sockets. // // DeliverRawPacket does NOT take ownership of the packet buffer. DeliverRawPacket(tcpip.TransportProtocolNumber, *PacketBuffer) } // PacketLooping specifies where an outbound packet should be sent. type PacketLooping byte const ( // PacketOut indicates that the packet should be passed to the link // endpoint. PacketOut PacketLooping = 1 << iota // PacketLoop indicates that the packet should be handled locally. 
PacketLoop ) // NetworkHeaderParams are the header parameters given as input by the // transport endpoint to the network. type NetworkHeaderParams struct { // Protocol refers to the transport protocol number. Protocol tcpip.TransportProtocolNumber // TTL refers to Time To Live field of the IP-header. TTL uint8 // TOS refers to TypeOfService or TrafficClass field of the IP-header. TOS uint8 // DF indicates whether the DF bit should be set. DF bool } // GroupAddressableEndpoint is an endpoint that supports group addressing. // // An endpoint is considered to support group addressing when one or more // endpoints may associate themselves with the same identifier (group address). type GroupAddressableEndpoint interface { // JoinGroup joins the specified group. JoinGroup(group tcpip.Address) tcpip.Error // LeaveGroup attempts to leave the specified group. LeaveGroup(group tcpip.Address) tcpip.Error // IsInGroup returns true if the endpoint is a member of the specified group. IsInGroup(group tcpip.Address) bool } // PrimaryEndpointBehavior is an enumeration of an AddressEndpoint's primary // behavior. type PrimaryEndpointBehavior int const ( // CanBePrimaryEndpoint indicates the endpoint can be used as a primary // endpoint for new connections with no local address. CanBePrimaryEndpoint PrimaryEndpointBehavior = iota // FirstPrimaryEndpoint indicates the endpoint should be the first // primary endpoint considered. If there are multiple endpoints with // this behavior, they are ordered by recency. FirstPrimaryEndpoint // NeverPrimaryEndpoint indicates the endpoint should never be a // primary endpoint. NeverPrimaryEndpoint ) func (peb PrimaryEndpointBehavior) String() string { switch peb { case CanBePrimaryEndpoint: return "CanBePrimaryEndpoint" case FirstPrimaryEndpoint: return "FirstPrimaryEndpoint" case NeverPrimaryEndpoint: return "NeverPrimaryEndpoint" default: panic(fmt.Sprintf("unknown primary endpoint behavior: %d", peb)) } } // AddressConfigType is the method used to add an address. type AddressConfigType int const ( // AddressConfigStatic is a statically configured address endpoint that was // added by some user-specified action (adding an explicit address, joining a // multicast group). AddressConfigStatic AddressConfigType = iota // AddressConfigSlaac is an address endpoint added by SLAAC, as per RFC 4862 // section 5.5.3. AddressConfigSlaac ) // AddressLifetimes encodes an address' preferred and valid lifetimes, as well // as if the address is deprecated. // // +stateify savable type AddressLifetimes struct { // Deprecated is whether the address is deprecated. Deprecated bool // PreferredUntil is the time at which the address will be deprecated. // // Note that for certain addresses, deprecating the address at the // PreferredUntil time is not handled as a scheduled job by the stack, but // is information provided by the owner as an indication of when it will // deprecate the address. // // PreferredUntil should be ignored if Deprecated is true. If Deprecated // is false, and PreferredUntil is the zero value, no information about // the preferred lifetime can be inferred. PreferredUntil tcpip.MonotonicTime // ValidUntil is the time at which the address will be invalidated. // // Note that for certain addresses, invalidating the address at the // ValidUntil time is not handled as a scheduled job by the stack, but // is information provided by the owner as an indication of when it will // invalidate the address. 
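// Illustrative sketch (not part of the original file): how a transport
// endpoint typically fills NetworkHeaderParams when handing a packet to the
// network layer through a Route. The Route.WritePacket call shape and the
// header import are assumptions here; r, pkt, ttl and tos are placeholders
// supplied by the caller.
func writeWithParamsSketch(r *stack.Route, pkt *stack.PacketBuffer, ttl, tos uint8) tcpip.Error {
	return r.WritePacket(stack.NetworkHeaderParams{
		Protocol: header.UDPProtocolNumber,
		TTL:      ttl,
		TOS:      tos,
	}, pkt)
}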
// // If ValidUntil is the zero value, no information about the valid lifetime // can be inferred. ValidUntil tcpip.MonotonicTime } // AddressProperties contains additional properties that can be configured when // adding an address. type AddressProperties struct { PEB PrimaryEndpointBehavior ConfigType AddressConfigType // Lifetimes encodes the address' lifetimes. // // Lifetimes.PreferredUntil and Lifetimes.ValidUntil are informational, i.e. // the stack will not deprecated nor invalidate the address upon reaching // these timestamps. // // If Lifetimes.Deprecated is true, the address will be added as deprecated. Lifetimes AddressLifetimes // Temporary is as defined in RFC 4941, but applies not only to addresses // added via SLAAC, e.g. DHCPv6 can also add temporary addresses. Temporary // addresses are short-lived and are not to be valid (or preferred) // forever; hence the term temporary. Temporary bool Disp AddressDispatcher } // AddressAssignmentState is an address' assignment state. type AddressAssignmentState int const ( _ AddressAssignmentState = iota // AddressDisabled indicates the NIC the address is assigned to is disabled. AddressDisabled // AddressTentative indicates an address is yet to pass DAD (IPv4 addresses // are never tentative). AddressTentative // AddressAssigned indicates an address is assigned. AddressAssigned ) func (state AddressAssignmentState) String() string { switch state { case AddressDisabled: return "Disabled" case AddressTentative: return "Tentative" case AddressAssigned: return "Assigned" default: panic(fmt.Sprintf("unknown address assignment state: %d", state)) } } // AddressRemovalReason is the reason an address was removed. type AddressRemovalReason int const ( _ AddressRemovalReason = iota // AddressRemovalManualAction indicates the address was removed explicitly // using the stack API. AddressRemovalManualAction // AddressRemovalInterfaceRemoved indicates the address was removed because // the NIC it is assigned to was removed. AddressRemovalInterfaceRemoved // AddressRemovalDADFailed indicates the address was removed because DAD // failed. AddressRemovalDADFailed // AddressRemovalInvalidated indicates the address was removed because it // was invalidated. AddressRemovalInvalidated ) func (reason AddressRemovalReason) String() string { switch reason { case AddressRemovalManualAction: return "ManualAction" case AddressRemovalInterfaceRemoved: return "InterfaceRemoved" case AddressRemovalDADFailed: return "DADFailed" case AddressRemovalInvalidated: return "Invalidated" default: panic(fmt.Sprintf("unknown address removal reason: %d", reason)) } } // AddressDispatcher is the interface integrators can implement to receive // address-related events. type AddressDispatcher interface { // OnChanged is called with an address' properties when they change. // // OnChanged is called once when the address is added with the initial state, // and every time a property changes. // // The PreferredUntil and ValidUntil fields in AddressLifetimes must be // considered informational, i.e. one must not consider an address to be // deprecated/invalid even if the monotonic clock timestamp is past these // deadlines. The Deprecated field indicates whether an address is // preferred or not; and OnRemoved will be called when an address is // removed due to invalidation. OnChanged(AddressLifetimes, AddressAssignmentState) // OnRemoved is called when an address is removed with the removal reason. 
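// Illustrative sketch (not part of the original file): adding an IPv6 address
// with informational lifetimes and SLAAC-style properties through
// Stack.AddProtocolAddress, assuming the header and time packages are
// imported; s, nicID and clock are assumed to exist, and the address and
// durations are placeholders.
func addTemporarySlaacAddressSketch(s *stack.Stack, nicID tcpip.NICID, clock tcpip.Clock) tcpip.Error {
	addr := tcpip.ProtocolAddress{
		Protocol: header.IPv6ProtocolNumber,
		AddressWithPrefix: tcpip.AddressWithPrefix{
			Address:   tcpip.AddrFrom16([16]byte{0xfe, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}),
			PrefixLen: 64,
		},
	}
	return s.AddProtocolAddress(nicID, addr, stack.AddressProperties{
		PEB:        stack.FirstPrimaryEndpoint,
		ConfigType: stack.AddressConfigSlaac,
		Lifetimes: stack.AddressLifetimes{
			// Informational only: the stack does not schedule deprecation or
			// invalidation from these timestamps.
			PreferredUntil: clock.NowMonotonic().Add(30 * time.Minute),
			ValidUntil:     clock.NowMonotonic().Add(2 * time.Hour),
		},
		Temporary: true,
	})
}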
OnRemoved(AddressRemovalReason) } // AssignableAddressEndpoint is a reference counted address endpoint that may be // assigned to a NetworkEndpoint. type AssignableAddressEndpoint interface { // AddressWithPrefix returns the endpoint's address. AddressWithPrefix() tcpip.AddressWithPrefix // Subnet returns the subnet of the endpoint's address. Subnet() tcpip.Subnet // IsAssigned returns whether or not the endpoint is considered bound // to its NetworkEndpoint. IsAssigned(allowExpired bool) bool // TryIncRef tries to increment this endpoint's reference count. // // Returns true if it was successfully incremented. If it returns false, then // the endpoint is considered expired and should no longer be used. TryIncRef() bool // DecRef decrements this endpoint's reference count. DecRef() } // AddressEndpoint is an endpoint representing an address assigned to an // AddressableEndpoint. type AddressEndpoint interface { AssignableAddressEndpoint // GetKind returns the address kind for this endpoint. GetKind() AddressKind // SetKind sets the address kind for this endpoint. SetKind(AddressKind) // ConfigType returns the method used to add the address. ConfigType() AddressConfigType // Deprecated returns whether or not this endpoint is deprecated. Deprecated() bool // SetDeprecated sets this endpoint's deprecated status. SetDeprecated(bool) // Lifetimes returns this endpoint's lifetimes. Lifetimes() AddressLifetimes // SetLifetimes sets this endpoint's lifetimes. // // Note that setting preferred-until and valid-until times do not result in // deprecation/invalidation jobs to be scheduled by the stack. SetLifetimes(AddressLifetimes) // Temporary returns whether or not this endpoint is temporary. Temporary() bool // RegisterDispatcher registers an address dispatcher. // // OnChanged will be called immediately on the provided address dispatcher // with this endpoint's current state. RegisterDispatcher(AddressDispatcher) } // AddressKind is the kind of an address. // // See the values of AddressKind for more details. type AddressKind int const ( // PermanentTentative is a permanent address endpoint that is not yet // considered to be fully bound to an interface in the traditional // sense. That is, the address is associated with a NIC, but packets // destined to the address MUST NOT be accepted and MUST be silently // dropped, and the address MUST NOT be used as a source address for // outgoing packets. For IPv6, addresses are of this kind until NDP's // Duplicate Address Detection (DAD) resolves. If DAD fails, the address // is removed. PermanentTentative AddressKind = iota // Permanent is a permanent endpoint (vs. a temporary one) assigned to the // NIC. Its reference count is biased by 1 to avoid removal when no route // holds a reference to it. It is removed by explicitly removing the address // from the NIC. Permanent // PermanentExpired is a permanent endpoint that had its address removed from // the NIC, and it is waiting to be removed once no references to it are held. // // If the address is re-added before the endpoint is removed, its type // changes back to Permanent. PermanentExpired // Temporary is an endpoint, created on a one-off basis to temporarily // consider the NIC bound an an address that it is not explicitly bound to // (such as a permanent address). Its reference count must not be biased by 1 // so that the address is removed immediately when references to it are no // longer held. // // A temporary endpoint may be promoted to permanent if the address is added // permanently. 
Temporary ) // IsPermanent returns true if the AddressKind represents a permanent address. func (k AddressKind) IsPermanent() bool { switch k { case Permanent, PermanentTentative: return true case Temporary, PermanentExpired: return false default: panic(fmt.Sprintf("unrecognized address kind = %d", k)) } } // AddressableEndpoint is an endpoint that supports addressing. // // An endpoint is considered to support addressing when the endpoint may // associate itself with an identifier (address). type AddressableEndpoint interface { // AddAndAcquirePermanentAddress adds the passed permanent address. // // Returns *tcpip.ErrDuplicateAddress if the address exists. // // Acquires and returns the AddressEndpoint for the added address. AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, properties AddressProperties) (AddressEndpoint, tcpip.Error) // RemovePermanentAddress removes the passed address if it is a permanent // address. // // Returns *tcpip.ErrBadLocalAddress if the endpoint does not have the passed // permanent address. RemovePermanentAddress(addr tcpip.Address) tcpip.Error // SetLifetimes sets an address' lifetimes (strictly informational) and // whether it should be deprecated or preferred. // // Returns *tcpip.ErrBadLocalAddress if the endpoint does not have the passed // address. SetLifetimes(addr tcpip.Address, lifetimes AddressLifetimes) tcpip.Error // MainAddress returns the endpoint's primary permanent address. MainAddress() tcpip.AddressWithPrefix // AcquireAssignedAddress returns an address endpoint for the passed address // that is considered bound to the endpoint, optionally creating a temporary // endpoint if requested and no existing address exists. // // The returned endpoint's reference count is incremented if readOnly is // false. // // Returns nil if the specified address is not local to this endpoint. AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB PrimaryEndpointBehavior, readOnly bool) AddressEndpoint // AcquireOutgoingPrimaryAddress returns a primary address that may be used as // a source address when sending packets to the passed remote address. // // If allowExpired is true, expired addresses may be returned. // // The returned endpoint's reference count is incremented. // // Returns nil if a primary address is not available. AcquireOutgoingPrimaryAddress(remoteAddr, srcHint tcpip.Address, allowExpired bool) AddressEndpoint // PrimaryAddresses returns the primary addresses. PrimaryAddresses() []tcpip.AddressWithPrefix // PermanentAddresses returns all the permanent addresses. PermanentAddresses() []tcpip.AddressWithPrefix } // NDPEndpoint is a network endpoint that supports NDP. type NDPEndpoint interface { NetworkEndpoint // InvalidateDefaultRouter invalidates a default router discovered through // NDP. InvalidateDefaultRouter(tcpip.Address) } // NetworkInterface is a network interface. type NetworkInterface interface { NetworkLinkEndpoint // ID returns the interface's ID. ID() tcpip.NICID // IsLoopback returns true if the interface is a loopback interface. IsLoopback() bool // Name returns the name of the interface. // // May return an empty string if the interface is not configured with a name. Name() string // Enabled returns true if the interface is enabled. Enabled() bool // Promiscuous returns true if the interface is in promiscuous mode. // // When in promiscuous mode, the interface should accept all packets. Promiscuous() bool // Spoofing returns true if the interface is in spoofing mode. 
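//
// A typical use of the AddressableEndpoint interface above is sketched here
// (illustrative only; `addressable` is an assumed AddressableEndpoint and
// the address is arbitrary):
//
//	addr := tcpip.AddressWithPrefix{
//		Address:   tcpip.AddrFrom4Slice([]byte{192, 168, 1, 2}),
//		PrefixLen: 24,
//	}
//	ep, err := addressable.AddAndAcquirePermanentAddress(addr, AddressProperties{})
//	if err != nil {
//		return err // e.g. *tcpip.ErrDuplicateAddress
//	}
//	// The returned endpoint is acquired; release it when done.
//	defer ep.DecRef()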
// // When in spoofing mode, the interface should consider all addresses as // assigned to it. Spoofing() bool // PrimaryAddress returns the primary address associated with the interface. // // PrimaryAddress will return the first non-deprecated address if such an // address exists. If no non-deprecated addresses exist, the first deprecated // address will be returned. If no deprecated addresses exist, the zero value // will be returned. PrimaryAddress(tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, tcpip.Error) // CheckLocalAddress returns true if the address exists on the interface. CheckLocalAddress(tcpip.NetworkProtocolNumber, tcpip.Address) bool // WritePacketToRemote writes the packet to the given remote link address. WritePacketToRemote(tcpip.LinkAddress, *PacketBuffer) tcpip.Error // WritePacket writes a packet through the given route. // // WritePacket may modify the packet buffer. The packet buffer's // network and transport header must be set. WritePacket(*Route, *PacketBuffer) tcpip.Error // HandleNeighborProbe processes an incoming neighbor probe (e.g. ARP // request or NDP Neighbor Solicitation). // // HandleNeighborProbe assumes that the probe is valid for the network // interface the probe was received on. HandleNeighborProbe(tcpip.NetworkProtocolNumber, tcpip.Address, tcpip.LinkAddress) tcpip.Error // HandleNeighborConfirmation processes an incoming neighbor confirmation // (e.g. ARP reply or NDP Neighbor Advertisement). HandleNeighborConfirmation(tcpip.NetworkProtocolNumber, tcpip.Address, tcpip.LinkAddress, ReachabilityConfirmationFlags) tcpip.Error } // LinkResolvableNetworkEndpoint handles link resolution events. type LinkResolvableNetworkEndpoint interface { // HandleLinkResolutionFailure is called when link resolution prevents the // argument from having been sent. HandleLinkResolutionFailure(*PacketBuffer) } // NetworkEndpoint is the interface that needs to be implemented by endpoints // of network layer protocols (e.g., ipv4, ipv6). type NetworkEndpoint interface { // Enable enables the endpoint. // // Must only be called when the stack is in a state that allows the endpoint // to send and receive packets. // // Returns *tcpip.ErrNotPermitted if the endpoint cannot be enabled. Enable() tcpip.Error // Enabled returns true if the endpoint is enabled. Enabled() bool // Disable disables the endpoint. Disable() // DefaultTTL is the default time-to-live value (or hop limit, in ipv6) // for this endpoint. DefaultTTL() uint8 // MTU is the maximum transmission unit for this endpoint. This is // generally calculated as the MTU of the underlying data link endpoint // minus the network endpoint max header length. MTU() uint32 // MaxHeaderLength returns the maximum size the network (and lower // level layers combined) headers can have. Higher levels use this // information to reserve space in the front of the packets they're // building. MaxHeaderLength() uint16 // WritePacket writes a packet to the given destination address and // protocol. It may modify pkt. pkt.TransportHeader must have // already been set. WritePacket(r *Route, params NetworkHeaderParams, pkt *PacketBuffer) tcpip.Error // WriteHeaderIncludedPacket writes a packet that includes a network // header to the given destination address. It may modify pkt. WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) tcpip.Error // HandlePacket is called by the link layer when new packets arrive to // this network endpoint. It sets pkt.NetworkHeader. // // HandlePacket may modify pkt. 
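//
// For example, an integrator holding a NetworkInterface might query it as in
// this sketch (`nic` is an assumed NetworkInterface):
//
//	if nic.Enabled() && !nic.IsLoopback() {
//		if addr, err := nic.PrimaryAddress(header.IPv4ProtocolNumber); err == nil {
//			_ = addr.Address // first non-deprecated IPv4 address, if any
//		}
//	}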
HandlePacket(pkt *PacketBuffer) // Close is called when the endpoint is removed from a stack. Close() // NetworkProtocolNumber returns the tcpip.NetworkProtocolNumber for // this endpoint. NetworkProtocolNumber() tcpip.NetworkProtocolNumber // Stats returns a reference to the network endpoint stats. Stats() NetworkEndpointStats } // NetworkEndpointStats is the interface implemented by each network endpoint // stats struct. type NetworkEndpointStats interface { // IsNetworkEndpointStats is an empty method to implement the // NetworkEndpointStats marker interface. IsNetworkEndpointStats() } // IPNetworkEndpointStats is a NetworkEndpointStats that tracks IP-related // statistics. type IPNetworkEndpointStats interface { NetworkEndpointStats // IPStats returns the IP statistics of a network endpoint. IPStats() *tcpip.IPStats } // ForwardingNetworkEndpoint is a network endpoint that may forward packets. type ForwardingNetworkEndpoint interface { NetworkEndpoint // Forwarding returns the forwarding configuration. Forwarding() bool // SetForwarding sets the forwarding configuration. // // Returns the previous forwarding configuration. SetForwarding(bool) bool } // MulticastForwardingNetworkEndpoint is a network endpoint that may forward // multicast packets. type MulticastForwardingNetworkEndpoint interface { ForwardingNetworkEndpoint // MulticastForwarding returns true if multicast forwarding is enabled. // Otherwise, returns false. MulticastForwarding() bool // SetMulticastForwarding sets the multicast forwarding configuration. // // Returns the previous forwarding configuration. SetMulticastForwarding(bool) bool } // NetworkProtocol is the interface that needs to be implemented by network // protocols (e.g., ipv4, ipv6) that want to be part of the networking stack. type NetworkProtocol interface { // Number returns the network protocol number. Number() tcpip.NetworkProtocolNumber // MinimumPacketSize returns the minimum valid packet size of this // network protocol. The stack automatically drops any packets smaller // than this targeted at this protocol. MinimumPacketSize() int // ParseAddresses returns the source and destination addresses stored in a // packet of this protocol. ParseAddresses(b []byte) (src, dst tcpip.Address) // NewEndpoint creates a new endpoint of this protocol. NewEndpoint(nic NetworkInterface, dispatcher TransportDispatcher) NetworkEndpoint // SetOption allows enabling/disabling protocol specific features. // SetOption returns an error if the option is not supported or the // provided option value is invalid. SetOption(option tcpip.SettableNetworkProtocolOption) tcpip.Error // Option allows retrieving protocol specific option values. // Option returns an error if the option is not supported or the // provided option value is invalid. Option(option tcpip.GettableNetworkProtocolOption) tcpip.Error // Close requests that any worker goroutines owned by the protocol // stop. Close() // Wait waits for any worker goroutines owned by the protocol to stop. Wait() // Parse sets pkt.NetworkHeader and trims pkt.Data appropriately. It // returns: // - The encapsulated protocol, if present. // - Whether there is an encapsulated transport protocol payload (e.g. ARP // does not encapsulate anything). // - Whether pkt.Data was large enough to parse and set pkt.NetworkHeader. 
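//
// For example, a caller of Parse might proceed as in this sketch (`netProto`
// and `pkt` are assumed to be a NetworkProtocol and a *PacketBuffer):
//
//	proto, hasTransportHdr, ok := netProto.Parse(pkt)
//	if !ok {
//		return // pkt.Data was too small to hold a network header; drop it.
//	}
//	if hasTransportHdr {
//		_ = proto // the encapsulated transport protocol, e.g. TCP or UDP.
//	}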
Parse(pkt *PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) } // UnicastSourceAndMulticastDestination is a tuple that represents a unicast // source address and a multicast destination address. // // +stateify savable type UnicastSourceAndMulticastDestination struct { // Source represents a unicast source address. Source tcpip.Address // Destination represents a multicast destination address. Destination tcpip.Address } // MulticastRouteOutgoingInterface represents an outgoing interface in a // multicast route. type MulticastRouteOutgoingInterface struct { // ID corresponds to the outgoing NIC. ID tcpip.NICID // MinTTL represents the minimum TTL/HopLimit a multicast packet must have to // be sent through the outgoing interface. // // Note: a value of 0 allows all packets to be forwarded. MinTTL uint8 } // MulticastRoute is a multicast route. type MulticastRoute struct { // ExpectedInputInterface is the interface on which packets using this route // are expected to ingress. ExpectedInputInterface tcpip.NICID // OutgoingInterfaces is the set of interfaces that a multicast packet should // be forwarded out of. // // This field should not be empty. OutgoingInterfaces []MulticastRouteOutgoingInterface } // MulticastForwardingNetworkProtocol is the interface that needs to be // implemented by the network protocols that support multicast forwarding. type MulticastForwardingNetworkProtocol interface { NetworkProtocol // AddMulticastRoute adds a route to the multicast routing table such that // packets matching the addresses will be forwarded using the provided route. // // Returns an error if the addresses or route is invalid. AddMulticastRoute(UnicastSourceAndMulticastDestination, MulticastRoute) tcpip.Error // RemoveMulticastRoute removes the route matching the provided addresses // from the multicast routing table. // // Returns an error if the addresses are invalid or a matching route is not // found. RemoveMulticastRoute(UnicastSourceAndMulticastDestination) tcpip.Error // MulticastRouteLastUsedTime returns a monotonic timestamp that // represents the last time that the route matching the provided addresses // was used or updated. // // Returns an error if the addresses are invalid or a matching route was not // found. MulticastRouteLastUsedTime(UnicastSourceAndMulticastDestination) (tcpip.MonotonicTime, tcpip.Error) // EnableMulticastForwarding enables multicast forwarding for the protocol. // // Returns an error if the provided multicast forwarding event dispatcher is // nil. Otherwise, returns true if the multicast forwarding was already // enabled. EnableMulticastForwarding(MulticastForwardingEventDispatcher) (bool, tcpip.Error) // DisableMulticastForwarding disables multicast forwarding for the protocol. DisableMulticastForwarding() } // MulticastPacketContext is the context in which a multicast packet triggered // a multicast forwarding event. type MulticastPacketContext struct { // SourceAndDestination contains the unicast source address and the multicast // destination address found in the relevant multicast packet. SourceAndDestination UnicastSourceAndMulticastDestination // InputInterface is the interface on which the relevant multicast packet // arrived. InputInterface tcpip.NICID } // MulticastForwardingEventDispatcher is the interface that integrators should // implement to handle multicast routing events. 
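//
// Installing a multicast route via the MulticastForwardingNetworkProtocol
// interface above might look like this sketch (`mcastProto`, `src`, `dst`,
// `inNIC` and `outNIC` are assumptions for the example):
//
//	key := UnicastSourceAndMulticastDestination{Source: src, Destination: dst}
//	route := MulticastRoute{
//		ExpectedInputInterface: inNIC,
//		OutgoingInterfaces: []MulticastRouteOutgoingInterface{
//			{ID: outNIC, MinTTL: 1},
//		},
//	}
//	if err := mcastProto.AddMulticastRoute(key, route); err != nil {
//		// The addresses or the route were rejected as invalid.
//	}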
type MulticastForwardingEventDispatcher interface { // OnMissingRoute is called when an incoming multicast packet does not match // any installed route. // // The packet that triggered this event may be queued so that it can be // transmitted once a route is installed. Even then, it may still be dropped // as per the routing table's GC/eviction policy. OnMissingRoute(MulticastPacketContext) // OnUnexpectedInputInterface is called when a multicast packet arrives at an // interface that does not match the installed route's expected input // interface. // // This may be an indication of a routing loop. The packet that triggered // this event is dropped without being forwarded. OnUnexpectedInputInterface(context MulticastPacketContext, expectedInputInterface tcpip.NICID) } // NetworkDispatcher contains the methods used by the network stack to deliver // inbound/outbound packets to the appropriate network/packet(if any) endpoints. type NetworkDispatcher interface { // DeliverNetworkPacket finds the appropriate network protocol endpoint // and hands the packet over for further processing. // // // If the link-layer has a header, the packet's link header must be populated. // // DeliverNetworkPacket may modify pkt. DeliverNetworkPacket(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) // DeliverLinkPacket delivers a packet to any interested packet endpoints. // // This method should be called with both incoming and outgoing packets. // // If the link-layer has a header, the packet's link header must be populated. DeliverLinkPacket(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) } // LinkEndpointCapabilities is the type associated with the capabilities // supported by a link-layer endpoint. It is a set of bitfields. type LinkEndpointCapabilities uint // The following are the supported link endpoint capabilities. const ( CapabilityNone LinkEndpointCapabilities = 0 // CapabilityTXChecksumOffload indicates that the link endpoint supports // checksum computation for outgoing packets and the stack can skip // computing checksums when sending packets. CapabilityTXChecksumOffload LinkEndpointCapabilities = 1 << iota // CapabilityRXChecksumOffload indicates that the link endpoint supports // checksum verification on received packets and that it's safe for the // stack to skip checksum verification. CapabilityRXChecksumOffload CapabilityResolutionRequired CapabilitySaveRestore CapabilityDisconnectOk CapabilityLoopback ) // LinkWriter is an interface that supports sending packets via a data-link // layer endpoint. It is used with QueueingDiscipline to batch writes from // upper layer endpoints. type LinkWriter interface { // WritePackets writes packets. Must not be called with an empty list of // packet buffers. // // Each packet must have the link-layer header set, if the link requires // one. // // WritePackets may modify the packet buffers, and takes ownership of the PacketBufferList. // it is not safe to use the PacketBufferList after a call to WritePackets. WritePackets(PacketBufferList) (int, tcpip.Error) } // NetworkLinkEndpoint is a data-link layer that supports sending network // layer packets. type NetworkLinkEndpoint interface { // MTU is the maximum transmission unit for this endpoint. This is // usually dictated by the backing physical network; when such a // physical network doesn't exist, the limit is generally 64k, which // includes the maximum size of an IP packet. MTU() uint32 // SetMTU update the maximum transmission unit for the endpoint. 
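//
// LinkEndpointCapabilities is queried as a bitfield, for example (sketch;
// `linkEP` is an assumed endpoint exposing Capabilities):
//
//	caps := linkEP.Capabilities()
//	if caps&CapabilityTXChecksumOffload != 0 {
//		// The link computes transport checksums on transmit, so the stack
//		// may skip filling them in.
//	}
//	if caps&CapabilityResolutionRequired != 0 {
//		// Link address (neighbor) resolution is required before writing.
//	}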
SetMTU(mtu uint32) // MaxHeaderLength returns the maximum size the data link (and // lower level layers combined) headers can have. Higher levels use this // information to reserve space in the front of the packets they're // building. MaxHeaderLength() uint16 // LinkAddress returns the link address (typically a MAC) of the // endpoint. LinkAddress() tcpip.LinkAddress // SetLinkAddress updated the endpoint's link address (typically a MAC). SetLinkAddress(addr tcpip.LinkAddress) // Capabilities returns the set of capabilities supported by the // endpoint. Capabilities() LinkEndpointCapabilities // Attach attaches the data link layer endpoint to the network-layer // dispatcher of the stack. // // Attach is called with a nil dispatcher when the endpoint's NIC is being // removed. Attach(dispatcher NetworkDispatcher) // IsAttached returns whether a NetworkDispatcher is attached to the // endpoint. IsAttached() bool // Wait waits for any worker goroutines owned by the endpoint to stop. // // For now, requesting that an endpoint's worker goroutine(s) stop is // implementation specific. // // Wait will not block if the endpoint hasn't started any goroutines // yet, even if it might later. Wait() // ARPHardwareType returns the ARPHRD_TYPE of the link endpoint. // // See: // https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/include/uapi/linux/if_arp.h#L30 ARPHardwareType() header.ARPHardwareType // AddHeader adds a link layer header to the packet if required. AddHeader(*PacketBuffer) // ParseHeader parses the link layer header to the packet. ParseHeader(*PacketBuffer) bool // Close is called when the endpoint is removed from a stack. Close() // SetOnCloseAction sets the action that will be exected before closing the // endpoint. It is used to destroy a network device when its endpoint // is closed. Endpoints that are closed only after destroying their // network devices can implement this method as no-op. SetOnCloseAction(func()) } // QueueingDiscipline provides a queueing strategy for outgoing packets (e.g // FIFO, LIFO, Random Early Drop etc). type QueueingDiscipline interface { // WritePacket writes a packet. // // WritePacket may modify the packet buffer. The packet buffer's // network and transport header must be set. // // To participate in transparent bridging, a LinkEndpoint implementation // should call eth.Encode with header.EthernetFields.SrcAddr set to // pkg.EgressRoute.LocalLinkAddress if it is provided. WritePacket(*PacketBuffer) tcpip.Error Close() } // LinkEndpoint is the interface implemented by data link layer protocols (e.g., // ethernet, loopback, raw) and used by network layer protocols to send packets // out through the implementer's data link endpoint. When a link header exists, // it sets each PacketBuffer's LinkHeader field before passing it up the // stack. type LinkEndpoint interface { NetworkLinkEndpoint LinkWriter } // InjectableLinkEndpoint is a LinkEndpoint where inbound packets are // delivered via the Inject method. type InjectableLinkEndpoint interface { LinkEndpoint // InjectInbound injects an inbound packet. InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) // InjectOutbound writes a fully formed outbound packet directly to the // link. // // dest is used by endpoints with multiple raw destinations. InjectOutbound(dest tcpip.Address, packet *buffer.View) tcpip.Error } // DADResult is a marker interface for the result of a duplicate address // detection process. 
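//
// A trivial QueueingDiscipline that performs no queueing could be sketched
// as follows (illustrative only; the type is hypothetical and reference
// counting concerns are ignored):
//
//	type passthroughQDisc struct {
//		lower LinkWriter
//	}
//
//	func (q *passthroughQDisc) WritePacket(pkt *PacketBuffer) tcpip.Error {
//		var pkts PacketBufferList
//		pkts.PushBack(pkt)
//		_, err := q.lower.WritePackets(pkts) // takes ownership of pkts
//		return err
//	}
//
//	func (q *passthroughQDisc) Close() {}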
type DADResult interface { isDADResult() } var _ DADResult = (*DADSucceeded)(nil) // DADSucceeded indicates DAD completed without finding any duplicate addresses. type DADSucceeded struct{} func (*DADSucceeded) isDADResult() {} var _ DADResult = (*DADError)(nil) // DADError indicates DAD hit an error. type DADError struct { Err tcpip.Error } func (*DADError) isDADResult() {} var _ DADResult = (*DADAborted)(nil) // DADAborted indicates DAD was aborted. type DADAborted struct{} func (*DADAborted) isDADResult() {} var _ DADResult = (*DADDupAddrDetected)(nil) // DADDupAddrDetected indicates DAD detected a duplicate address. type DADDupAddrDetected struct { // HolderLinkAddress is the link address of the node that holds the duplicate // address. HolderLinkAddress tcpip.LinkAddress } func (*DADDupAddrDetected) isDADResult() {} // DADCompletionHandler is a handler for DAD completion. type DADCompletionHandler func(DADResult) // DADCheckAddressDisposition enumerates the possible return values from // DAD.CheckDuplicateAddress. type DADCheckAddressDisposition int const ( _ DADCheckAddressDisposition = iota // DADDisabled indicates that DAD is disabled. DADDisabled // DADStarting indicates that DAD is starting for an address. DADStarting // DADAlreadyRunning indicates that DAD was already started for an address. DADAlreadyRunning ) const ( // defaultDupAddrDetectTransmits is the default number of NDP Neighbor // Solicitation messages to send when doing Duplicate Address Detection // for a tentative address. // // Default = 1 (from RFC 4862 section 5.1) defaultDupAddrDetectTransmits = 1 ) // DADConfigurations holds configurations for duplicate address detection. // // +stateify savable type DADConfigurations struct { // The number of Neighbor Solicitation messages to send when doing // Duplicate Address Detection for a tentative address. // // Note, a value of zero effectively disables DAD. DupAddrDetectTransmits uint8 // The amount of time to wait between sending Neighbor Solicitation // messages. // // Must be greater than or equal to 1ms. RetransmitTimer time.Duration } // DefaultDADConfigurations returns the default DAD configurations. func DefaultDADConfigurations() DADConfigurations { return DADConfigurations{ DupAddrDetectTransmits: defaultDupAddrDetectTransmits, RetransmitTimer: defaultRetransmitTimer, } } // Validate modifies the configuration with valid values. If invalid values are // present in the configurations, the corresponding default values are used // instead. func (c *DADConfigurations) Validate() { if c.RetransmitTimer < minimumRetransmitTimer { c.RetransmitTimer = defaultRetransmitTimer } } // DuplicateAddressDetector handles checking if an address is already assigned // to some neighboring node on the link. type DuplicateAddressDetector interface { // CheckDuplicateAddress checks if an address is assigned to a neighbor. // // If DAD is already being performed for the address, the handler will be // called with the result of the original DAD request. CheckDuplicateAddress(tcpip.Address, DADCompletionHandler) DADCheckAddressDisposition // SetDADConfigurations sets the configurations for DAD. SetDADConfigurations(c DADConfigurations) // DuplicateAddressProtocol returns the network protocol the receiver can // perform duplicate address detection for. DuplicateAddressProtocol() tcpip.NetworkProtocolNumber } // LinkAddressResolver handles link address resolution for a network protocol. 
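//
// A DADCompletionHandler usually switches on the concrete DADResult type,
// as in this sketch (`detector` and `addr` are assumptions for the example):
//
//	handler := func(res DADResult) {
//		switch r := res.(type) {
//		case *DADSucceeded:
//			// The address is not a duplicate and may be assigned.
//		case *DADDupAddrDetected:
//			_ = r.HolderLinkAddress // the neighbor already holding the address
//		case *DADAborted:
//			// DAD was cancelled before completing.
//		case *DADError:
//			_ = r.Err
//		}
//	}
//	_ = detector.CheckDuplicateAddress(addr, handler)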
type LinkAddressResolver interface { // LinkAddressRequest sends a request for the link address of the target // address. The request is broadcast on the local network if a remote link // address is not provided. LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) tcpip.Error // ResolveStaticAddress attempts to resolve address without sending // requests. It either resolves the name immediately or returns the // empty LinkAddress. // // It can be used to resolve broadcast addresses for example. ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) // LinkAddressProtocol returns the network protocol of the // addresses this resolver can resolve. LinkAddressProtocol() tcpip.NetworkProtocolNumber } // RawFactory produces endpoints for writing various types of raw packets. type RawFactory interface { // NewUnassociatedEndpoint produces endpoints for writing packets not // associated with a particular transport protocol. Such endpoints can // be used to write arbitrary packets that include the network header. NewUnassociatedEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) // NewPacketEndpoint produces endpoints for reading and writing packets // that include network and (when cooked is false) link layer headers. NewPacketEndpoint(stack *Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) } // GSOType is the type of GSO segments. // // +stateify savable type GSOType int // Types of gso segments. const ( GSONone GSOType = iota // Hardware GSO types: GSOTCPv4 GSOTCPv6 // GSOGvisor is used for gVisor GSO segments which have to be sent by // endpoint.WritePackets. GSOGvisor ) // GSO contains generic segmentation offload properties. // // +stateify savable type GSO struct { // Type is one of GSONone, GSOTCPv4, etc. Type GSOType // NeedsCsum is set if the checksum offload is enabled. NeedsCsum bool // CsumOffset is offset after that to place checksum. CsumOffset uint16 // Mss is maximum segment size. MSS uint16 // L3Len is L3 (IP) header length. L3HdrLen uint16 // MaxSize is maximum GSO packet size. MaxSize uint32 } // SupportedGSO is the type of segmentation offloading supported. type SupportedGSO int const ( // GSONotSupported indicates that segmentation offloading is not supported. GSONotSupported SupportedGSO = iota // HostGSOSupported indicates that segmentation offloading may be performed // by the host. This is typically true when netstack is attached to a host // AF_PACKET socket, and not true when attached to a unix socket or other // non-networking data layer. HostGSOSupported // GVisorGSOSupported indicates that segmentation offloading may be performed // in gVisor. GVisorGSOSupported ) // GSOEndpoint provides access to GSO properties. type GSOEndpoint interface { // GSOMaxSize returns the maximum GSO packet size. GSOMaxSize() uint32 // SupportedGSO returns the supported segmentation offloading. SupportedGSO() SupportedGSO } // GVisorGSOMaxSize is a maximum allowed size of a software GSO segment. // This isn't a hard limit, because it is never set into packet headers. const GVisorGSOMaxSize = 1 << 16 golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/route.go000066400000000000000000000432351465435605700233440ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) // Route represents a route through the networking stack to a given destination. // // It is safe to call Route's methods from multiple goroutines. type Route struct { routeInfo routeInfo // localAddressNIC is the interface the address is associated with. // TODO(gvisor.dev/issue/4548): Remove this field once we can query the // address's assigned status without the NIC. localAddressNIC *nic // mu protects annotated fields below. mu routeRWMutex // localAddressEndpoint is the local address this route is associated with. // +checklocks:mu localAddressEndpoint AssignableAddressEndpoint // remoteLinkAddress is the link-layer (MAC) address of the next hop. // +checklocks:mu remoteLinkAddress tcpip.LinkAddress // outgoingNIC is the interface this route uses to write packets. outgoingNIC *nic // linkRes is set if link address resolution is enabled for this protocol on // the route's NIC. linkRes *linkResolver // neighborEntry is the cached result of fetching a neighbor entry from the // neighbor cache. // +checklocks:mu neighborEntry *neighborEntry // mtu is the maximum transmission unit to use for this route. // If mtu is 0, this field is ignored and the MTU of the outgoing NIC // is used for egress packets. mtu uint32 } // +stateify savable type routeInfo struct { RemoteAddress tcpip.Address LocalAddress tcpip.Address LocalLinkAddress tcpip.LinkAddress NextHop tcpip.Address NetProto tcpip.NetworkProtocolNumber Loop PacketLooping } // RemoteAddress returns the route's destination. func (r *Route) RemoteAddress() tcpip.Address { return r.routeInfo.RemoteAddress } // LocalAddress returns the route's local address. func (r *Route) LocalAddress() tcpip.Address { return r.routeInfo.LocalAddress } // LocalLinkAddress returns the route's local link-layer address. func (r *Route) LocalLinkAddress() tcpip.LinkAddress { return r.routeInfo.LocalLinkAddress } // NextHop returns the next node in the route's path to the destination. func (r *Route) NextHop() tcpip.Address { return r.routeInfo.NextHop } // NetProto returns the route's network-layer protocol number. func (r *Route) NetProto() tcpip.NetworkProtocolNumber { return r.routeInfo.NetProto } // Loop returns the route's required packet looping. func (r *Route) Loop() PacketLooping { return r.routeInfo.Loop } // OutgoingNIC returns the route's outgoing NIC. func (r *Route) OutgoingNIC() tcpip.NICID { return r.outgoingNIC.id } // RouteInfo contains all of Route's exported fields. // // +stateify savable type RouteInfo struct { routeInfo // RemoteLinkAddress is the link-layer (MAC) address of the next hop in the // route. RemoteLinkAddress tcpip.LinkAddress } // Fields returns a RouteInfo with all of the known values for the route's // fields. // // If any fields are unknown (e.g. remote link address when it is waiting for // link address resolution), they will be unset. 
func (r *Route) Fields() RouteInfo { r.mu.RLock() defer r.mu.RUnlock() return r.fieldsLocked() } // +checklocksread:r.mu func (r *Route) fieldsLocked() RouteInfo { return RouteInfo{ routeInfo: r.routeInfo, RemoteLinkAddress: r.remoteLinkAddress, } } // constructAndValidateRoute validates and initializes a route. It takes // ownership of the provided local address. // // Returns an empty route if validation fails. func constructAndValidateRoute(netProto tcpip.NetworkProtocolNumber, addressEndpoint AssignableAddressEndpoint, localAddressNIC, outgoingNIC *nic, gateway, localAddr, remoteAddr tcpip.Address, handleLocal, multicastLoop bool, mtu uint32) *Route { if localAddr.BitLen() == 0 { localAddr = addressEndpoint.AddressWithPrefix().Address } if localAddressNIC != outgoingNIC && header.IsV6LinkLocalUnicastAddress(localAddr) { addressEndpoint.DecRef() return nil } // If no remote address is provided, use the local address. if remoteAddr.BitLen() == 0 { remoteAddr = localAddr } r := makeRoute( netProto, gateway, localAddr, remoteAddr, outgoingNIC, localAddressNIC, addressEndpoint, handleLocal, multicastLoop, mtu, ) return r } // makeRoute initializes a new route. It takes ownership of the provided // AssignableAddressEndpoint. func makeRoute(netProto tcpip.NetworkProtocolNumber, gateway, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *nic, localAddressEndpoint AssignableAddressEndpoint, handleLocal, multicastLoop bool, mtu uint32) *Route { if localAddressNIC.stack != outgoingNIC.stack { panic(fmt.Sprintf("cannot create a route with NICs from different stacks")) } if localAddr.BitLen() == 0 { localAddr = localAddressEndpoint.AddressWithPrefix().Address } loop := PacketOut // Loopback interface loops back packets at the link endpoint level. We // could remove this check if loopback interfaces looped back packets // at the network layer. if !outgoingNIC.IsLoopback() { if handleLocal && localAddr != (tcpip.Address{}) && remoteAddr == localAddr { loop = PacketLoop } else if multicastLoop && (header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)) { loop |= PacketLoop } else if remoteAddr == header.IPv4Broadcast { loop |= PacketLoop } else if subnet := localAddressEndpoint.AddressWithPrefix().Subnet(); subnet.IsBroadcast(remoteAddr) { loop |= PacketLoop } } r := makeRouteInner(netProto, localAddr, remoteAddr, outgoingNIC, localAddressNIC, localAddressEndpoint, loop, mtu) if r.Loop()&PacketOut == 0 { // Packet will not leave the stack, no need for a gateway or a remote link // address. return r } if r.outgoingNIC.NetworkLinkEndpoint.Capabilities()&CapabilityResolutionRequired != 0 { if linkRes, ok := r.outgoingNIC.linkAddrResolvers[r.NetProto()]; ok { r.linkRes = linkRes } } if gateway.BitLen() > 0 { r.routeInfo.NextHop = gateway return r } if r.linkRes == nil { return r } if linkAddr, ok := r.linkRes.resolver.ResolveStaticAddress(r.RemoteAddress()); ok { r.ResolveWith(linkAddr) return r } if subnet := localAddressEndpoint.Subnet(); subnet.IsBroadcast(remoteAddr) { r.ResolveWith(header.EthernetBroadcastAddress) return r } if r.RemoteAddress() == r.LocalAddress() { // Local link address is already known. 
r.ResolveWith(r.LocalLinkAddress()) } return r } func makeRouteInner(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *nic, localAddressEndpoint AssignableAddressEndpoint, loop PacketLooping, mtu uint32) *Route { r := &Route{ routeInfo: routeInfo{ NetProto: netProto, LocalAddress: localAddr, LocalLinkAddress: outgoingNIC.NetworkLinkEndpoint.LinkAddress(), RemoteAddress: remoteAddr, Loop: loop, }, localAddressNIC: localAddressNIC, outgoingNIC: outgoingNIC, mtu: mtu, } r.mu.Lock() r.localAddressEndpoint = localAddressEndpoint r.mu.Unlock() return r } // makeLocalRoute initializes a new local route. It takes ownership of the // provided AssignableAddressEndpoint. // // A local route is a route to a destination that is local to the stack. func makeLocalRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *nic, localAddressEndpoint AssignableAddressEndpoint) *Route { loop := PacketLoop // Loopback interface loops back packets at the link endpoint level. We // could remove this check if loopback interfaces looped back packets // at the network layer. if outgoingNIC.IsLoopback() { loop = PacketOut } return makeRouteInner(netProto, localAddr, remoteAddr, outgoingNIC, localAddressNIC, localAddressEndpoint, loop, 0 /* mtu */) } // RemoteLinkAddress returns the link-layer (MAC) address of the next hop in // the route. func (r *Route) RemoteLinkAddress() tcpip.LinkAddress { r.mu.RLock() defer r.mu.RUnlock() return r.remoteLinkAddress } // NICID returns the id of the NIC from which this route originates. func (r *Route) NICID() tcpip.NICID { return r.outgoingNIC.ID() } // MaxHeaderLength forwards the call to the network endpoint's implementation. func (r *Route) MaxHeaderLength() uint16 { return r.outgoingNIC.getNetworkEndpoint(r.NetProto()).MaxHeaderLength() } // Stats returns a mutable copy of current stats. func (r *Route) Stats() tcpip.Stats { return r.outgoingNIC.stack.Stats() } // PseudoHeaderChecksum forwards the call to the network endpoint's // implementation. func (r *Route) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, totalLen uint16) uint16 { return header.PseudoHeaderChecksum(protocol, r.LocalAddress(), r.RemoteAddress(), totalLen) } // RequiresTXTransportChecksum returns false if the route does not require // transport checksums to be populated. func (r *Route) RequiresTXTransportChecksum() bool { if r.local() { return false } return r.outgoingNIC.NetworkLinkEndpoint.Capabilities()&CapabilityTXChecksumOffload == 0 } // HasGVisorGSOCapability returns true if the route supports gVisor GSO. func (r *Route) HasGVisorGSOCapability() bool { if gso, ok := r.outgoingNIC.NetworkLinkEndpoint.(GSOEndpoint); ok { return gso.SupportedGSO() == GVisorGSOSupported } return false } // HasHostGSOCapability returns true if the route supports host GSO. func (r *Route) HasHostGSOCapability() bool { if gso, ok := r.outgoingNIC.NetworkLinkEndpoint.(GSOEndpoint); ok { return gso.SupportedGSO() == HostGSOSupported } return false } // HasSaveRestoreCapability returns true if the route supports save/restore. func (r *Route) HasSaveRestoreCapability() bool { return r.outgoingNIC.NetworkLinkEndpoint.Capabilities()&CapabilitySaveRestore != 0 } // HasDisconnectOkCapability returns true if the route supports disconnecting. 
func (r *Route) HasDisconnectOkCapability() bool { return r.outgoingNIC.NetworkLinkEndpoint.Capabilities()&CapabilityDisconnectOk != 0 } // GSOMaxSize returns the maximum GSO packet size. func (r *Route) GSOMaxSize() uint32 { if gso, ok := r.outgoingNIC.NetworkLinkEndpoint.(GSOEndpoint); ok { return gso.GSOMaxSize() } return 0 } // ResolveWith immediately resolves a route with the specified remote link // address. func (r *Route) ResolveWith(addr tcpip.LinkAddress) { r.mu.Lock() defer r.mu.Unlock() r.remoteLinkAddress = addr } // ResolvedFieldsResult is the result of a route resolution attempt. type ResolvedFieldsResult struct { RouteInfo RouteInfo Err tcpip.Error } // ResolvedFields attempts to resolve the remote link address if it is not // known. // // If a callback is provided, it will be called before ResolvedFields returns // when address resolution is not required. If address resolution is required, // the callback will be called once address resolution is complete, regardless // of success or failure. // // Note, the route will not cache the remote link address when address // resolution completes. func (r *Route) ResolvedFields(afterResolve func(ResolvedFieldsResult)) tcpip.Error { _, _, err := r.resolvedFields(afterResolve) return err } // resolvedFields is like ResolvedFields but also returns a notification channel // when address resolution is required. This channel will become readable once // address resolution is complete. // // The route's fields will also be returned, regardless of whether address // resolution is required or not. func (r *Route) resolvedFields(afterResolve func(ResolvedFieldsResult)) (RouteInfo, <-chan struct{}, tcpip.Error) { r.mu.RLock() fields := r.fieldsLocked() resolutionRequired := r.isResolutionRequiredRLocked() r.mu.RUnlock() if !resolutionRequired { if afterResolve != nil { afterResolve(ResolvedFieldsResult{RouteInfo: fields, Err: nil}) } return fields, nil, nil } // If specified, the local address used for link address resolution must be an // address on the outgoing interface. var linkAddressResolutionRequestLocalAddr tcpip.Address if r.localAddressNIC == r.outgoingNIC { linkAddressResolutionRequestLocalAddr = r.LocalAddress() } nEntry := r.getCachedNeighborEntry() if nEntry != nil { if addr, ok := nEntry.getRemoteLinkAddress(); ok { fields.RemoteLinkAddress = addr if afterResolve != nil { afterResolve(ResolvedFieldsResult{RouteInfo: fields, Err: nil}) } return fields, nil, nil } } afterResolveFields := fields entry, ch, err := r.linkRes.neigh.entry(r.nextHop(), linkAddressResolutionRequestLocalAddr, func(lrr LinkResolutionResult) { if afterResolve != nil { if lrr.Err == nil { afterResolveFields.RemoteLinkAddress = lrr.LinkAddress } afterResolve(ResolvedFieldsResult{RouteInfo: afterResolveFields, Err: lrr.Err}) } }) if err == nil { fields.RemoteLinkAddress, _ = entry.getRemoteLinkAddress() } r.setCachedNeighborEntry(entry) return fields, ch, err } func (r *Route) getCachedNeighborEntry() *neighborEntry { r.mu.RLock() defer r.mu.RUnlock() return r.neighborEntry } func (r *Route) setCachedNeighborEntry(entry *neighborEntry) { r.mu.Lock() defer r.mu.Unlock() r.neighborEntry = entry } func (r *Route) nextHop() tcpip.Address { if r.NextHop().BitLen() == 0 { return r.RemoteAddress() } return r.NextHop() } // local returns true if the route is a local route. 
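//
// ResolvedFields (above) is typically used as in this sketch (illustrative
// only; whether the callback runs synchronously depends on whether link
// address resolution is required):
//
//	err := r.ResolvedFields(func(res ResolvedFieldsResult) {
//		if res.Err == nil {
//			// res.RouteInfo.RemoteLinkAddress is now known.
//		}
//	})
//	if err != nil {
//		// Resolution did not complete synchronously; if it was started, the
//		// callback runs when it finishes, successfully or not.
//	}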
func (r *Route) local() bool { return r.Loop() == PacketLoop || r.outgoingNIC.IsLoopback() } // IsResolutionRequired returns true if Resolve() must be called to resolve // the link address before the route can be written to. // // The NICs the route is associated with must not be locked. func (r *Route) IsResolutionRequired() bool { r.mu.RLock() defer r.mu.RUnlock() return r.isResolutionRequiredRLocked() } // +checklocksread:r.mu func (r *Route) isResolutionRequiredRLocked() bool { return len(r.remoteLinkAddress) == 0 && r.linkRes != nil && r.isValidForOutgoingRLocked() && !r.local() } func (r *Route) isValidForOutgoing() bool { r.mu.RLock() defer r.mu.RUnlock() return r.isValidForOutgoingRLocked() } // +checklocksread:r.mu func (r *Route) isValidForOutgoingRLocked() bool { if !r.outgoingNIC.Enabled() { return false } localAddressEndpoint := r.localAddressEndpoint if localAddressEndpoint == nil || !r.localAddressNIC.isValidForOutgoing(localAddressEndpoint) { return false } // If the source NIC and outgoing NIC are different, make sure the stack has // forwarding enabled, or the packet will be handled locally. if r.outgoingNIC != r.localAddressNIC && !isNICForwarding(r.localAddressNIC, r.NetProto()) && (!r.outgoingNIC.stack.handleLocal || !r.outgoingNIC.hasAddress(r.NetProto(), r.RemoteAddress())) { return false } return true } // WritePacket writes the packet through the given route. func (r *Route) WritePacket(params NetworkHeaderParams, pkt *PacketBuffer) tcpip.Error { if !r.isValidForOutgoing() { return &tcpip.ErrInvalidEndpointState{} } return r.outgoingNIC.getNetworkEndpoint(r.NetProto()).WritePacket(r, params, pkt) } // WriteHeaderIncludedPacket writes a packet already containing a network // header through the given route. func (r *Route) WriteHeaderIncludedPacket(pkt *PacketBuffer) tcpip.Error { if !r.isValidForOutgoing() { return &tcpip.ErrInvalidEndpointState{} } return r.outgoingNIC.getNetworkEndpoint(r.NetProto()).WriteHeaderIncludedPacket(r, pkt) } // DefaultTTL returns the default TTL of the underlying network endpoint. func (r *Route) DefaultTTL() uint8 { return r.outgoingNIC.getNetworkEndpoint(r.NetProto()).DefaultTTL() } // MTU returns the MTU of the route if present, otherwise the MTU of the underlying network endpoint. func (r *Route) MTU() uint32 { if r.mtu > 0 { return r.mtu } return r.outgoingNIC.getNetworkEndpoint(r.NetProto()).MTU() } // Release decrements the reference counter of the resources associated with the // route. func (r *Route) Release() { r.mu.Lock() defer r.mu.Unlock() if ep := r.localAddressEndpoint; ep != nil { ep.DecRef() } } // Acquire increments the reference counter of the resources associated with the // route. func (r *Route) Acquire() { r.mu.RLock() defer r.mu.RUnlock() r.acquireLocked() } // +checklocksread:r.mu func (r *Route) acquireLocked() { if ep := r.localAddressEndpoint; ep != nil { if !ep.TryIncRef() { panic(fmt.Sprintf("failed to increment reference count for local address endpoint = %s", r.LocalAddress())) } } } // Stack returns the instance of the Stack that owns this route. func (r *Route) Stack() *Stack { return r.outgoingNIC.stack } func (r *Route) isV4Broadcast(addr tcpip.Address) bool { if addr == header.IPv4Broadcast { return true } r.mu.RLock() localAddressEndpoint := r.localAddressEndpoint r.mu.RUnlock() if localAddressEndpoint == nil { return false } subnet := localAddressEndpoint.Subnet() return subnet.IsBroadcast(addr) } // IsOutboundBroadcast returns true if the route is for an outbound broadcast // packet. 
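//
// Writing through a Route commonly looks like this sketch (`r` is a *Route
// and `pkt` an assumed, fully prepared *PacketBuffer):
//
//	params := NetworkHeaderParams{
//		Protocol: header.UDPProtocolNumber,
//		TTL:      r.DefaultTTL(),
//		TOS:      DefaultTOS,
//	}
//	if err := r.WritePacket(params, pkt); err != nil {
//		// e.g. *tcpip.ErrInvalidEndpointState if the route is no longer
//		// valid for outgoing packets.
//	}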
func (r *Route) IsOutboundBroadcast() bool { // Only IPv4 has a notion of broadcast. return r.isV4Broadcast(r.RemoteAddress()) } // ConfirmReachable informs the network/link layer that the neighbour used for // the route is reachable. // // "Reachable" is defined as having full-duplex communication between the // local and remote ends of the route. func (r *Route) ConfirmReachable() { if entry := r.getCachedNeighborEntry(); entry != nil { entry.handleUpperLevelConfirmation() } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/route_mutex.go000066400000000000000000000044761465435605700245720ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type routeRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var routelockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type routelockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *routeRWMutex) Lock() { locking.AddGLock(routeprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *routeRWMutex) NestedLock(i routelockNameIndex) { locking.AddGLock(routeprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *routeRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(routeprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *routeRWMutex) NestedUnlock(i routelockNameIndex) { m.mu.Unlock() locking.DelGLock(routeprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *routeRWMutex) RLock() { locking.AddGLock(routeprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *routeRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(routeprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *routeRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *routeRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *routeRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var routeprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func routeinitLockNames() {} func init() { routeinitLockNames() routeprefixIndex = locking.NewMutexClass(reflect.TypeOf(routeRWMutex{}), routelockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/route_stack_mutex.go000066400000000000000000000047001465435605700257450ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type routeStackRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var routeStacklockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. 
// Values are specified using the "consts" field of go_template_instance. type routeStacklockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *routeStackRWMutex) Lock() { locking.AddGLock(routeStackprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *routeStackRWMutex) NestedLock(i routeStacklockNameIndex) { locking.AddGLock(routeStackprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *routeStackRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(routeStackprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *routeStackRWMutex) NestedUnlock(i routeStacklockNameIndex) { m.mu.Unlock() locking.DelGLock(routeStackprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *routeStackRWMutex) RLock() { locking.AddGLock(routeStackprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *routeStackRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(routeStackprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *routeStackRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *routeStackRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *routeStackRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var routeStackprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func routeStackinitLockNames() {} func init() { routeStackinitLockNames() routeStackprefixIndex = locking.NewMutexClass(reflect.TypeOf(routeStackRWMutex{}), routeStacklockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/stack.go000066400000000000000000002233171465435605700233140ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package stack provides the glue between networking protocols and the // consumers of the networking stack. // // For consumers, the only function of interest is New(), everything else is // provided by the tcpip/public package. package stack import ( "encoding/binary" "fmt" "io" "math/rand" "sync/atomic" "time" "golang.org/x/time/rate" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/log" cryptorand "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/waiter" ) const ( // DefaultTOS is the default type of service value for network endpoints. 
DefaultTOS = 0 ) // +stateify savable type transportProtocolState struct { proto TransportProtocol defaultHandler func(id TransportEndpointID, pkt *PacketBuffer) bool `state:"nosave"` } // RestoredEndpoint is an endpoint that needs to be restored. type RestoredEndpoint interface { // Restore restores an endpoint. This can be used to restart background // workers such as protocol goroutines. This must be called after all // indirect dependencies of the endpoint has been restored, which // generally implies at the end of the restore process. Restore(*Stack) } // ResumableEndpoint is an endpoint that needs to be resumed after save. type ResumableEndpoint interface { // Resume resumes an endpoint. Resume() } var netRawMissingLogger = log.BasicRateLimitedLogger(time.Minute) // Stack is a networking stack, with all supported protocols, NICs, and route // table. // // LOCK ORDERING: mu > routeMu. // // +stateify savable type Stack struct { transportProtocols map[tcpip.TransportProtocolNumber]*transportProtocolState networkProtocols map[tcpip.NetworkProtocolNumber]NetworkProtocol // rawFactory creates raw endpoints. If nil, raw endpoints are // disabled. It is set during Stack creation and is immutable. rawFactory RawFactory packetEndpointWriteSupported bool demux *transportDemuxer stats tcpip.Stats // routeMu protects annotated fields below. routeMu routeStackRWMutex `state:"nosave"` // routeTable is a list of routes sorted by prefix length, longest (most specific) first. // +checklocks:routeMu routeTable tcpip.RouteList mu stackRWMutex `state:"nosave"` // +checklocks:mu nics map[tcpip.NICID]*nic // +checklocks:mu defaultForwardingEnabled map[tcpip.NetworkProtocolNumber]struct{} // nicIDGen is used to generate NIC IDs. nicIDGen atomicbitops.Int32 // cleanupEndpointsMu protects cleanupEndpoints. cleanupEndpointsMu cleanupEndpointsMutex `state:"nosave"` // +checklocks:cleanupEndpointsMu cleanupEndpoints map[TransportEndpoint]struct{} *ports.PortManager // If not nil, then any new endpoints will have this probe function // invoked everytime they receive a TCP segment. // TODO(b/341946753): Restore them when netstack is savable. tcpProbeFunc atomic.Value `state:"nosave"` // TCPProbeFunc // clock is used to generate user-visible times. clock tcpip.Clock // handleLocal allows non-loopback interfaces to loop packets. handleLocal bool // tables are the iptables packet filtering and manipulation rules. // TODO(gvisor.dev/issue/4595): S/R this field. tables *IPTables `state:"nosave"` // restoredEndpoints is a list of endpoints that need to be restored if the // stack is being restored. restoredEndpoints []RestoredEndpoint // resumableEndpoints is a list of endpoints that need to be resumed // after save. resumableEndpoints []ResumableEndpoint // icmpRateLimiter is a global rate limiter for all ICMP messages generated // by the stack. icmpRateLimiter *ICMPRateLimiter // seed is a one-time random value initialized at stack startup. // // TODO(gvisor.dev/issue/940): S/R this field. seed uint32 // nudConfigs is the default NUD configurations used by interfaces. nudConfigs NUDConfigurations // nudDisp is the NUD event dispatcher that is used to send the netstack // integrator NUD related events. nudDisp NUDDispatcher // randomGenerator is an injectable pseudo random generator that can be // used when a random number is required. It must not be used in // security-sensitive contexts. // TODO(b/341946753): Restore them when netstack is savable. 
insecureRNG *rand.Rand `state:"nosave"` // secureRNG is a cryptographically secure random number generator. // TODO(b/341946753): Restore them when netstack is savable. secureRNG cryptorand.RNG `state:"nosave"` // sendBufferSize holds the min/default/max send buffer sizes for // endpoints other than TCP. sendBufferSize tcpip.SendBufferSizeOption // receiveBufferSize holds the min/default/max receive buffer sizes for // endpoints other than TCP. receiveBufferSize tcpip.ReceiveBufferSizeOption // tcpInvalidRateLimit is the maximal rate for sending duplicate // acknowledgements in response to incoming TCP packets that are for an existing // connection but that are invalid due to any of the following reasons: // // a) out-of-window sequence number. // b) out-of-window acknowledgement number. // c) PAWS check failure (when implemented). // // This is required to prevent potential ACK loops. // Setting this to 0 will disable all rate limiting. tcpInvalidRateLimit time.Duration // tsOffsetSecret is the secret key for generating timestamp offsets // initialized at stack startup. tsOffsetSecret uint32 } // NetworkProtocolFactory instantiates a network protocol. // // NetworkProtocolFactory must not attempt to modify the stack, it may only // query the stack. type NetworkProtocolFactory func(*Stack) NetworkProtocol // TransportProtocolFactory instantiates a transport protocol. // // TransportProtocolFactory must not attempt to modify the stack, it may only // query the stack. type TransportProtocolFactory func(*Stack) TransportProtocol // Options contains optional Stack configuration. type Options struct { // NetworkProtocols lists the network protocols to enable. NetworkProtocols []NetworkProtocolFactory // TransportProtocols lists the transport protocols to enable. TransportProtocols []TransportProtocolFactory // Clock is an optional clock used for timekeeping. // // If Clock is nil, tcpip.NewStdClock() will be used. Clock tcpip.Clock // Stats are optional statistic counters. Stats tcpip.Stats // HandleLocal indicates whether packets destined to their source // should be handled by the stack internally (true) or outside the // stack (false). HandleLocal bool // NUDConfigs is the default NUD configurations used by interfaces. NUDConfigs NUDConfigurations // NUDDisp is the NUD event dispatcher that an integrator can provide to // receive NUD related events. NUDDisp NUDDispatcher // RawFactory produces raw endpoints. Raw endpoints are enabled only if // this is non-nil. RawFactory RawFactory // AllowPacketEndpointWrite determines if packet endpoints support write // operations. AllowPacketEndpointWrite bool // RandSource is an optional source to use to generate random // numbers. If omitted it defaults to a Source seeded by the data // returned by the stack secure RNG. // // RandSource must be thread-safe. RandSource rand.Source // IPTables are the initial iptables rules. If nil, DefaultIPTables will be // used to construct the initial iptables rules. // all traffic. IPTables *IPTables // DefaultIPTables is an optional iptables rules constructor that is called // if IPTables is nil. If both fields are nil, iptables will allow all // traffic. DefaultIPTables func(clock tcpip.Clock, rand *rand.Rand) *IPTables // SecureRNG is a cryptographically secure random number generator. SecureRNG io.Reader } // TransportEndpointInfo holds useful information about a transport endpoint // which can be queried by monitoring tools. 
// // +stateify savable type TransportEndpointInfo struct { // The following fields are initialized at creation time and are // immutable. NetProto tcpip.NetworkProtocolNumber TransProto tcpip.TransportProtocolNumber
// The following fields are protected by endpoint mu. ID TransportEndpointID
// BindNICID and BindAddr are set via calls to Bind(). They are used to // reject attempts to send data or connect via a different NIC or // address. BindNICID tcpip.NICID BindAddr tcpip.Address
// RegisterNICID is the default NICID registered as a side-effect of // connect or datagram write. RegisterNICID tcpip.NICID }
// AddrNetProtoLocked unwraps the specified address if it is a V4-mapped V6 // address and returns the network protocol number to be used to communicate // with the specified address. It returns an error if the passed address is // incompatible with the receiver. // // Precondition: the parent endpoint mu must be held while calling this method. func (t *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6only bool, bind bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { netProto := t.NetProto switch addr.Addr.BitLen() { case header.IPv4AddressSizeBits: netProto = header.IPv4ProtocolNumber case header.IPv6AddressSizeBits: if header.IsV4MappedAddress(addr.Addr) { netProto = header.IPv4ProtocolNumber addr.Addr = tcpip.AddrFrom4Slice(addr.Addr.AsSlice()[header.IPv6AddressSize-header.IPv4AddressSize:]) if addr.Addr == header.IPv4Any { addr.Addr = tcpip.Address{} } } } switch t.ID.LocalAddress.BitLen() { case header.IPv4AddressSizeBits: if addr.Addr.BitLen() == header.IPv6AddressSizeBits { return tcpip.FullAddress{}, 0, &tcpip.ErrInvalidEndpointState{} } case header.IPv6AddressSizeBits: if addr.Addr.BitLen() == header.IPv4AddressSizeBits { return tcpip.FullAddress{}, 0, &tcpip.ErrNetworkUnreachable{} } } if !bind && addr.Addr.Unspecified() { // If the destination address isn't set, Linux sets it to the // source address. If a source address isn't set either, it // sets both to the loopback address. if t.ID.LocalAddress.Unspecified() { switch netProto { case header.IPv4ProtocolNumber: addr.Addr = header.IPv4Loopback case header.IPv6ProtocolNumber: addr.Addr = header.IPv6Loopback } } else { addr.Addr = t.ID.LocalAddress } } switch { case netProto == t.NetProto: case netProto == header.IPv4ProtocolNumber && t.NetProto == header.IPv6ProtocolNumber: if v6only { return tcpip.FullAddress{}, 0, &tcpip.ErrHostUnreachable{} } default: return tcpip.FullAddress{}, 0, &tcpip.ErrInvalidEndpointState{} } return addr, netProto, nil }
// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo // marker interface. func (*TransportEndpointInfo) IsEndpointInfo() {}
// New allocates a new networking stack with only the requested networking and // transport protocols configured with default options. // // Note, NDPConfigurations will be fixed before being used by the Stack. That // is, if an invalid value was provided, it will be reset to the default value. // // Protocol options can be changed by calling the // SetNetworkProtocolOption/SetTransportProtocolOption methods provided by the // stack. Please refer to individual protocol implementations as to what options // are supported. 
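//
// A minimal construction sketch (illustrative only, not part of the original
// source; it assumes the ipv4 and tcp protocol packages from this repository
// and a LinkEndpoint "ep" obtained elsewhere):
//
//	s := stack.New(stack.Options{
//		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
//		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol},
//	})
//	defer s.Destroy()
//	_ = s.CreateNIC(1, ep)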
func New(opts Options) *Stack { clock := opts.Clock if clock == nil { clock = tcpip.NewStdClock() } if opts.SecureRNG == nil { opts.SecureRNG = cryptorand.Reader } secureRNG := cryptorand.RNGFrom(opts.SecureRNG) randSrc := opts.RandSource if randSrc == nil { var v int64 if err := binary.Read(opts.SecureRNG, binary.LittleEndian, &v); err != nil { panic(err) } // Source provided by rand.NewSource is not thread-safe so // we wrap it in a simple thread-safe version. randSrc = &lockedRandomSource{src: rand.NewSource(v)} } insecureRNG := rand.New(randSrc) if opts.IPTables == nil { if opts.DefaultIPTables == nil { opts.DefaultIPTables = DefaultTables } opts.IPTables = opts.DefaultIPTables(clock, insecureRNG) } opts.NUDConfigs.resetInvalidFields() s := &Stack{ transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), nics: make(map[tcpip.NICID]*nic), packetEndpointWriteSupported: opts.AllowPacketEndpointWrite, defaultForwardingEnabled: make(map[tcpip.NetworkProtocolNumber]struct{}), cleanupEndpoints: make(map[TransportEndpoint]struct{}), PortManager: ports.NewPortManager(), clock: clock, stats: opts.Stats.FillIn(), handleLocal: opts.HandleLocal, tables: opts.IPTables, icmpRateLimiter: NewICMPRateLimiter(clock), seed: secureRNG.Uint32(), nudConfigs: opts.NUDConfigs, nudDisp: opts.NUDDisp, insecureRNG: insecureRNG, secureRNG: secureRNG, sendBufferSize: tcpip.SendBufferSizeOption{ Min: MinBufferSize, Default: DefaultBufferSize, Max: DefaultMaxBufferSize, }, receiveBufferSize: tcpip.ReceiveBufferSizeOption{ Min: MinBufferSize, Default: DefaultBufferSize, Max: DefaultMaxBufferSize, }, tcpInvalidRateLimit: defaultTCPInvalidRateLimit, tsOffsetSecret: secureRNG.Uint32(), } // Add specified network protocols. for _, netProtoFactory := range opts.NetworkProtocols { netProto := netProtoFactory(s) s.networkProtocols[netProto.Number()] = netProto } // Add specified transport protocols. for _, transProtoFactory := range opts.TransportProtocols { transProto := transProtoFactory(s) s.transportProtocols[transProto.Number()] = &transportProtocolState{ proto: transProto, } } // Add the factory for raw endpoints, if present. s.rawFactory = opts.RawFactory // Create the global transport demuxer. s.demux = newTransportDemuxer(s) return s } // NextNICID allocates the next available NIC ID and returns it. func (s *Stack) NextNICID() tcpip.NICID { next := s.nicIDGen.Add(1) if next < 0 { panic("NICID overflow") } return tcpip.NICID(next) } // SetNetworkProtocolOption allows configuring individual protocol level // options. This method returns an error if the protocol is not supported or // option is not supported by the protocol implementation or the provided value // is incorrect. func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option tcpip.SettableNetworkProtocolOption) tcpip.Error { netProto, ok := s.networkProtocols[network] if !ok { return &tcpip.ErrUnknownProtocol{} } return netProto.SetOption(option) } // NetworkProtocolOption allows retrieving individual protocol level option // values. This method returns an error if the protocol is not supported or // option is not supported by the protocol implementation. E.g.: // // var v ipv4.MyOption // err := s.NetworkProtocolOption(tcpip.IPv4ProtocolNumber, &v) // if err != nil { // ... 
// } func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option tcpip.GettableNetworkProtocolOption) tcpip.Error { netProto, ok := s.networkProtocols[network] if !ok { return &tcpip.ErrUnknownProtocol{} } return netProto.Option(option) } // SetTransportProtocolOption allows configuring individual protocol level // options. This method returns an error if the protocol is not supported or // option is not supported by the protocol implementation or the provided value // is incorrect. func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option tcpip.SettableTransportProtocolOption) tcpip.Error { transProtoState, ok := s.transportProtocols[transport] if !ok { return &tcpip.ErrUnknownProtocol{} } return transProtoState.proto.SetOption(option) } // TransportProtocolOption allows retrieving individual protocol level option // values. This method returns an error if the protocol is not supported or // option is not supported by the protocol implementation. // // var v tcp.SACKEnabled // if err := s.TransportProtocolOption(tcpip.TCPProtocolNumber, &v); err != nil { // ... // } func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) tcpip.Error { transProtoState, ok := s.transportProtocols[transport] if !ok { return &tcpip.ErrUnknownProtocol{} } return transProtoState.proto.Option(option) } // SendBufSizeProto is a protocol that can return its send buffer size. type SendBufSizeProto interface { SendBufferSize() tcpip.TCPSendBufferSizeRangeOption } // TCPSendBufferLimits returns the TCP send buffer size limit. func (s *Stack) TCPSendBufferLimits() tcpip.TCPSendBufferSizeRangeOption { return s.transportProtocols[header.TCPProtocolNumber].proto.(SendBufSizeProto).SendBufferSize() } // SetTransportProtocolHandler sets the per-stack default handler for the given // protocol. // // It must be called only during initialization of the stack. Changing it as the // stack is operating is not supported. func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(TransportEndpointID, *PacketBuffer) bool) { state := s.transportProtocols[p] if state != nil { state.defaultHandler = h } } // Clock returns the Stack's clock for retrieving the current time and // scheduling work. func (s *Stack) Clock() tcpip.Clock { return s.clock } // Stats returns a mutable copy of the current stats. // // This is not generally exported via the public interface, but is available // internally. func (s *Stack) Stats() tcpip.Stats { return s.stats } // SetNICForwarding enables or disables packet forwarding on the specified NIC // for the passed protocol. // // Returns the previous configuration on the NIC. func (s *Stack) SetNICForwarding(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, enable bool) (bool, tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return false, &tcpip.ErrUnknownNICID{} } return nic.setForwarding(protocol, enable) } // NICForwarding returns the forwarding configuration for the specified NIC. func (s *Stack) NICForwarding(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (bool, tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return false, &tcpip.ErrUnknownNICID{} } return nic.forwarding(protocol) } // SetForwardingDefaultAndAllNICs sets packet forwarding for all NICs for the // passed protocol and sets the default setting for newly created NICs. 
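//
// A hedged usage sketch (assumes the stack was built with the ipv4 network
// protocol; the call below is illustrative, not part of the original source):
//
//	if err := s.SetForwardingDefaultAndAllNICs(ipv4.ProtocolNumber, true); err != nil {
//		// The protocol is unknown or forwarding could not be enabled.
//	}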
func (s *Stack) SetForwardingDefaultAndAllNICs(protocol tcpip.NetworkProtocolNumber, enable bool) tcpip.Error { s.mu.Lock() defer s.mu.Unlock() doneOnce := false for id, nic := range s.nics { if _, err := nic.setForwarding(protocol, enable); err != nil { // Expect forwarding to be settable on all interfaces if it was set on // one. if doneOnce { panic(fmt.Sprintf("nic(id=%d).setForwarding(%d, %t): %s", id, protocol, enable, err)) } return err } doneOnce = true } if enable { s.defaultForwardingEnabled[protocol] = struct{}{} } else { delete(s.defaultForwardingEnabled, protocol) } return nil } // AddMulticastRoute adds a multicast route to be used for the specified // addresses and protocol. func (s *Stack) AddMulticastRoute(protocol tcpip.NetworkProtocolNumber, addresses UnicastSourceAndMulticastDestination, route MulticastRoute) tcpip.Error { netProto, ok := s.networkProtocols[protocol] if !ok { return &tcpip.ErrUnknownProtocol{} } forwardingNetProto, ok := netProto.(MulticastForwardingNetworkProtocol) if !ok { return &tcpip.ErrNotSupported{} } return forwardingNetProto.AddMulticastRoute(addresses, route) } // RemoveMulticastRoute removes a multicast route that matches the specified // addresses and protocol. func (s *Stack) RemoveMulticastRoute(protocol tcpip.NetworkProtocolNumber, addresses UnicastSourceAndMulticastDestination) tcpip.Error { netProto, ok := s.networkProtocols[protocol] if !ok { return &tcpip.ErrUnknownProtocol{} } forwardingNetProto, ok := netProto.(MulticastForwardingNetworkProtocol) if !ok { return &tcpip.ErrNotSupported{} } return forwardingNetProto.RemoveMulticastRoute(addresses) } // MulticastRouteLastUsedTime returns a monotonic timestamp that represents the // last time that the route that matches the provided addresses and protocol // was used or updated. func (s *Stack) MulticastRouteLastUsedTime(protocol tcpip.NetworkProtocolNumber, addresses UnicastSourceAndMulticastDestination) (tcpip.MonotonicTime, tcpip.Error) { netProto, ok := s.networkProtocols[protocol] if !ok { return tcpip.MonotonicTime{}, &tcpip.ErrUnknownProtocol{} } forwardingNetProto, ok := netProto.(MulticastForwardingNetworkProtocol) if !ok { return tcpip.MonotonicTime{}, &tcpip.ErrNotSupported{} } return forwardingNetProto.MulticastRouteLastUsedTime(addresses) } // EnableMulticastForwardingForProtocol enables multicast forwarding for the // provided protocol. // // Returns true if forwarding was already enabled on the protocol. // Additionally, returns an error if: // // - The protocol is not found. // - The protocol doesn't support multicast forwarding. // - The multicast forwarding event dispatcher is nil. // // If successful, future multicast forwarding events will be sent to the // provided event dispatcher. func (s *Stack) EnableMulticastForwardingForProtocol(protocol tcpip.NetworkProtocolNumber, disp MulticastForwardingEventDispatcher) (bool, tcpip.Error) { netProto, ok := s.networkProtocols[protocol] if !ok { return false, &tcpip.ErrUnknownProtocol{} } forwardingNetProto, ok := netProto.(MulticastForwardingNetworkProtocol) if !ok { return false, &tcpip.ErrNotSupported{} } return forwardingNetProto.EnableMulticastForwarding(disp) } // DisableMulticastForwardingForProtocol disables multicast forwarding for the // provided protocol. // // Returns an error if the provided protocol is not found or if it does not // support multicast forwarding. 
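//
// Sketch of the enable/disable pair (illustrative only; "disp" stands in for
// a MulticastForwardingEventDispatcher implementation supplied by the
// integrator):
//
//	if _, err := s.EnableMulticastForwardingForProtocol(ipv4.ProtocolNumber, disp); err != nil {
//		// handle err
//	}
//	// ... later ...
//	_ = s.DisableMulticastForwardingForProtocol(ipv4.ProtocolNumber)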
func (s *Stack) DisableMulticastForwardingForProtocol(protocol tcpip.NetworkProtocolNumber) tcpip.Error { netProto, ok := s.networkProtocols[protocol] if !ok { return &tcpip.ErrUnknownProtocol{} } forwardingNetProto, ok := netProto.(MulticastForwardingNetworkProtocol) if !ok { return &tcpip.ErrNotSupported{} } forwardingNetProto.DisableMulticastForwarding() return nil }
// SetNICMulticastForwarding enables or disables multicast packet forwarding on // the specified NIC for the passed protocol. // // Returns the previous configuration on the NIC. func (s *Stack) SetNICMulticastForwarding(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, enable bool) (bool, tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return false, &tcpip.ErrUnknownNICID{} } return nic.setMulticastForwarding(protocol, enable) }
// NICMulticastForwarding returns the multicast forwarding configuration for // the specified NIC. func (s *Stack) NICMulticastForwarding(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (bool, tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return false, &tcpip.ErrUnknownNICID{} } return nic.multicastForwarding(protocol) }
// PortRange returns the UDP and TCP inclusive range of ephemeral ports used in // both IPv4 and IPv6. func (s *Stack) PortRange() (uint16, uint16) { return s.PortManager.PortRange() }
// SetPortRange sets the UDP and TCP IPv4 and IPv6 ephemeral port range // (inclusive). func (s *Stack) SetPortRange(start uint16, end uint16) tcpip.Error { return s.PortManager.SetPortRange(start, end) }
// SetRouteTable assigns the route table to be used by this stack. It // specifies which NIC to use for given destination address ranges. // // This method takes ownership of the table. func (s *Stack) SetRouteTable(table []tcpip.Route) { s.routeMu.Lock() defer s.routeMu.Unlock() s.routeTable.Reset() for _, r := range table { s.addRouteLocked(&r) } }
// GetRouteTable returns the route table which is currently in use. func (s *Stack) GetRouteTable() []tcpip.Route { s.routeMu.RLock() defer s.routeMu.RUnlock() table := make([]tcpip.Route, 0) for r := s.routeTable.Front(); r != nil; r = r.Next() { table = append(table, *r) } return table }
// AddRoute adds a route to the route table, keeping the table sorted by // prefix length (longest first). func (s *Stack) AddRoute(route tcpip.Route) { s.routeMu.Lock() defer s.routeMu.Unlock() s.addRouteLocked(&route) }
// +checklocks:s.routeMu func (s *Stack) addRouteLocked(route *tcpip.Route) { routePrefix := route.Destination.Prefix() n := s.routeTable.Front() for ; n != nil; n = n.Next() { if n.Destination.Prefix() < routePrefix { s.routeTable.InsertBefore(n, route) return } } s.routeTable.PushBack(route) }
// RemoveRoutes removes matching routes from the route table. func (s *Stack) RemoveRoutes(match func(tcpip.Route) bool) { s.routeMu.Lock() defer s.routeMu.Unlock() s.removeRoutesLocked(match) }
// +checklocks:s.routeMu func (s *Stack) removeRoutesLocked(match func(tcpip.Route) bool) { for route := s.routeTable.Front(); route != nil; { next := route.Next() if match(*route) { s.routeTable.Remove(route) } route = next } }
// ReplaceRoute replaces the route in the routing table which matches // the lookup key for the routing table. If there is no match, the given // route will still be added to the routing table. // The lookup key consists of destination, ToS, scope and output interface. 
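//
// Route-table manipulation sketch using the methods above (illustrative only;
// "nicID", "gateway" and "newGateway" are placeholders):
//
//	s.SetRouteTable([]tcpip.Route{
//		{Destination: header.IPv4EmptySubnet, Gateway: gateway, NIC: nicID},
//	})
//	// Swap the default route for one that uses a different gateway.
//	s.ReplaceRoute(tcpip.Route{Destination: header.IPv4EmptySubnet, Gateway: newGateway, NIC: nicID})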
func (s *Stack) ReplaceRoute(route tcpip.Route) { s.routeMu.Lock() defer s.routeMu.Unlock() s.removeRoutesLocked(func(rt tcpip.Route) bool { return rt.Equal(route) }) s.addRouteLocked(&route) } // NewEndpoint creates a new transport layer endpoint of the given protocol. func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { t, ok := s.transportProtocols[transport] if !ok { return nil, &tcpip.ErrUnknownProtocol{} } return t.proto.NewEndpoint(network, waiterQueue) } // NewRawEndpoint creates a new raw transport layer endpoint of the given // protocol. Raw endpoints receive all traffic for a given protocol regardless // of address. func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, tcpip.Error) { if s.rawFactory == nil { netRawMissingLogger.Infof("A process tried to create a raw socket, but --net-raw was not specified. Should runsc be run with --net-raw?") return nil, &tcpip.ErrNotPermitted{} } if !associated { return s.rawFactory.NewUnassociatedEndpoint(s, network, transport, waiterQueue) } t, ok := s.transportProtocols[transport] if !ok { return nil, &tcpip.ErrUnknownProtocol{} } return t.proto.NewRawEndpoint(network, waiterQueue) } // NewPacketEndpoint creates a new packet endpoint listening for the given // netProto. func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { if s.rawFactory == nil { return nil, &tcpip.ErrNotPermitted{} } return s.rawFactory.NewPacketEndpoint(s, cooked, netProto, waiterQueue) } // NICContext is an opaque pointer used to store client-supplied NIC metadata. type NICContext any // NICOptions specifies the configuration of a NIC as it is being created. // The zero value creates an enabled, unnamed NIC. type NICOptions struct { // Name specifies the name of the NIC. Name string // Disabled specifies whether to avoid calling Attach on the passed // LinkEndpoint. Disabled bool // Context specifies user-defined data that will be returned in stack.NICInfo // for the NIC. Clients of this library can use it to add metadata that // should be tracked alongside a NIC, to avoid having to keep a // map[tcpip.NICID]metadata mirroring stack.Stack's nic map. Context NICContext // QDisc is the queue discipline to use for this NIC. QDisc QueueingDiscipline // DeliverLinkPackets specifies whether the NIC is responsible for // delivering raw packets to packet sockets. DeliverLinkPackets bool } // GetNICByID return a network device associated with the specified ID. func (s *Stack) GetNICByID(id tcpip.NICID) (*nic, tcpip.Error) { s.mu.Lock() defer s.mu.Unlock() n, ok := s.nics[id] if !ok { return nil, &tcpip.ErrNoSuchFile{} } return n, nil } // CreateNICWithOptions creates a NIC with the provided id, LinkEndpoint, and // NICOptions. See the documentation on type NICOptions for details on how // NICs can be configured. // // LinkEndpoint.Attach will be called to bind ep with a NetworkDispatcher. func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOptions) tcpip.Error { s.mu.Lock() defer s.mu.Unlock() if id == 0 { return &tcpip.ErrInvalidNICID{} } // Make sure id is unique. if _, ok := s.nics[id]; ok { return &tcpip.ErrDuplicateNICID{} } // Make sure name is unique, unless unnamed. 
if opts.Name != "" { for _, n := range s.nics { if n.Name() == opts.Name { return &tcpip.ErrDuplicateNICID{} } } } n := newNIC(s, id, ep, opts) for proto := range s.defaultForwardingEnabled { if _, err := n.setForwarding(proto, true); err != nil { panic(fmt.Sprintf("newNIC(%d, ...).setForwarding(%d, true): %s", id, proto, err)) } } s.nics[id] = n ep.SetOnCloseAction(func() { s.RemoveNIC(id) }) if !opts.Disabled { return n.enable() } return nil }
// CreateNIC creates a NIC with the provided id and LinkEndpoint and calls // LinkEndpoint.Attach to bind ep with a NetworkDispatcher. func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) tcpip.Error { return s.CreateNICWithOptions(id, ep, NICOptions{}) }
// GetLinkEndpointByName gets the link endpoint specified by name. func (s *Stack) GetLinkEndpointByName(name string) LinkEndpoint { s.mu.RLock() defer s.mu.RUnlock() for _, nic := range s.nics { if nic.Name() == name { linkEP, ok := nic.NetworkLinkEndpoint.(LinkEndpoint) if !ok { panic(fmt.Sprintf("unexpected NetworkLinkEndpoint(%#v) is not a LinkEndpoint", nic.NetworkLinkEndpoint)) } return linkEP } } return nil }
// EnableNIC enables the given NIC so that the link-layer endpoint can start // delivering packets to it. func (s *Stack) EnableNIC(id tcpip.NICID) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return &tcpip.ErrUnknownNICID{} } return nic.enable() }
// DisableNIC disables the given NIC. func (s *Stack) DisableNIC(id tcpip.NICID) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return &tcpip.ErrUnknownNICID{} } nic.disable() return nil }
// CheckNIC checks if a NIC is usable. func (s *Stack) CheckNIC(id tcpip.NICID) bool { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return false } return nic.Enabled() }
// RemoveNIC removes NIC and all related routes from the network stack. func (s *Stack) RemoveNIC(id tcpip.NICID) tcpip.Error { s.mu.Lock() deferAct, err := s.removeNICLocked(id) s.mu.Unlock() if deferAct != nil { deferAct() } return err }
// removeNICLocked removes NIC and all related routes from the network stack. // // +checklocks:s.mu func (s *Stack) removeNICLocked(id tcpip.NICID) (func(), tcpip.Error) { nic, ok := s.nics[id] if !ok { return nil, &tcpip.ErrUnknownNICID{} } delete(s.nics, id) if nic.Primary != nil { b := nic.Primary.NetworkLinkEndpoint.(CoordinatorNIC) if err := b.DelNIC(nic); err != nil { return nil, err } } // Remove any routes that use the removed NIC from the route table. s.routeMu.Lock() for r := s.routeTable.Front(); r != nil; { next := r.Next() if r.NIC == id { s.routeTable.Remove(r) } r = next } s.routeMu.Unlock() return nic.remove(true /* closeLinkEndpoint */) }
// SetNICCoordinator sets a coordinator device. func (s *Stack) SetNICCoordinator(id tcpip.NICID, mid tcpip.NICID) tcpip.Error { s.mu.Lock() defer s.mu.Unlock() nic, ok := s.nics[id] if !ok { return &tcpip.ErrUnknownNICID{} } m, ok := s.nics[mid] if !ok { return &tcpip.ErrUnknownNICID{} } b, ok := m.NetworkLinkEndpoint.(CoordinatorNIC) if !ok { return &tcpip.ErrNotSupported{} } if err := b.AddNIC(nic); err != nil { return err } nic.Primary = m return nil }
// SetNICAddress sets the hardware address which is identified by the nic ID. func (s *Stack) SetNICAddress(id tcpip.NICID, addr tcpip.LinkAddress) tcpip.Error { s.mu.Lock() defer s.mu.Unlock() nic, ok := s.nics[id] if !ok { return &tcpip.ErrUnknownNICID{} } nic.NetworkLinkEndpoint.SetLinkAddress(addr) return nil }
// SetNICName sets a NIC's name. 
func (s *Stack) SetNICName(id tcpip.NICID, name string) tcpip.Error { s.mu.Lock() defer s.mu.Unlock() nic, ok := s.nics[id] if !ok { return &tcpip.ErrUnknownNICID{} } nic.name = name return nil } // SetNICMTU sets a NIC's MTU. func (s *Stack) SetNICMTU(id tcpip.NICID, mtu uint32) tcpip.Error { s.mu.Lock() defer s.mu.Unlock() nic, ok := s.nics[id] if !ok { return &tcpip.ErrUnknownNICID{} } nic.NetworkLinkEndpoint.SetMTU(mtu) return nil } // NICInfo captures the name and addresses assigned to a NIC. type NICInfo struct { Name string LinkAddress tcpip.LinkAddress ProtocolAddresses []tcpip.ProtocolAddress // Flags indicate the state of the NIC. Flags NICStateFlags // MTU is the maximum transmission unit. MTU uint32 Stats tcpip.NICStats // NetworkStats holds the stats of each NetworkEndpoint bound to the NIC. NetworkStats map[tcpip.NetworkProtocolNumber]NetworkEndpointStats // Context is user-supplied data optionally supplied in CreateNICWithOptions. // See type NICOptions for more details. Context NICContext // ARPHardwareType holds the ARP Hardware type of the NIC. This is the // value sent in haType field of an ARP Request sent by this NIC and the // value expected in the haType field of an ARP response. ARPHardwareType header.ARPHardwareType // Forwarding holds the forwarding status for each network endpoint that // supports forwarding. Forwarding map[tcpip.NetworkProtocolNumber]bool // MulticastForwarding holds the forwarding status for each network endpoint // that supports multicast forwarding. MulticastForwarding map[tcpip.NetworkProtocolNumber]bool } // HasNIC returns true if the NICID is defined in the stack. func (s *Stack) HasNIC(id tcpip.NICID) bool { s.mu.RLock() _, ok := s.nics[id] s.mu.RUnlock() return ok } // NICInfo returns a map of NICIDs to their associated information. func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo { s.mu.RLock() defer s.mu.RUnlock() type forwardingFn func(tcpip.NetworkProtocolNumber) (bool, tcpip.Error) forwardingValue := func(forwardingFn forwardingFn, proto tcpip.NetworkProtocolNumber, nicID tcpip.NICID, fnName string) (forward bool, ok bool) { switch forwarding, err := forwardingFn(proto); err.(type) { case nil: return forwarding, true case *tcpip.ErrUnknownProtocol: panic(fmt.Sprintf("expected network protocol %d to be available on NIC %d", proto, nicID)) case *tcpip.ErrNotSupported: // Not all network protocols support forwarding. default: panic(fmt.Sprintf("nic(id=%d).%s(%d): %s", nicID, fnName, proto, err)) } return false, false } nics := make(map[tcpip.NICID]NICInfo) for id, nic := range s.nics { flags := NICStateFlags{ Up: true, // Netstack interfaces are always up. 
Running: nic.Enabled(), Promiscuous: nic.Promiscuous(), Loopback: nic.IsLoopback(), } netStats := make(map[tcpip.NetworkProtocolNumber]NetworkEndpointStats) for proto, netEP := range nic.networkEndpoints { netStats[proto] = netEP.Stats() } info := NICInfo{ Name: nic.name, LinkAddress: nic.NetworkLinkEndpoint.LinkAddress(), ProtocolAddresses: nic.primaryAddresses(), Flags: flags, MTU: nic.NetworkLinkEndpoint.MTU(), Stats: nic.stats.local, NetworkStats: netStats, Context: nic.context, ARPHardwareType: nic.NetworkLinkEndpoint.ARPHardwareType(), Forwarding: make(map[tcpip.NetworkProtocolNumber]bool), MulticastForwarding: make(map[tcpip.NetworkProtocolNumber]bool), } for proto := range s.networkProtocols { if forwarding, ok := forwardingValue(nic.forwarding, proto, id, "forwarding"); ok { info.Forwarding[proto] = forwarding } if multicastForwarding, ok := forwardingValue(nic.multicastForwarding, proto, id, "multicastForwarding"); ok { info.MulticastForwarding[proto] = multicastForwarding } } nics[id] = info } return nics } // NICStateFlags holds information about the state of an NIC. type NICStateFlags struct { // Up indicates whether the interface is running. Up bool // Running indicates whether resources are allocated. Running bool // Promiscuous indicates whether the interface is in promiscuous mode. Promiscuous bool // Loopback indicates whether the interface is a loopback. Loopback bool } // AddProtocolAddress adds an address to the specified NIC, possibly with extra // properties. func (s *Stack) AddProtocolAddress(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress, properties AddressProperties) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return &tcpip.ErrUnknownNICID{} } return nic.addAddress(protocolAddress, properties) } // RemoveAddress removes an existing network-layer address from the specified // NIC. func (s *Stack) RemoveAddress(id tcpip.NICID, addr tcpip.Address) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() if nic, ok := s.nics[id]; ok { return nic.removeAddress(addr) } return &tcpip.ErrUnknownNICID{} } // SetAddressLifetimes sets informational preferred and valid lifetimes, and // whether the address should be preferred or deprecated. func (s *Stack) SetAddressLifetimes(id tcpip.NICID, addr tcpip.Address, lifetimes AddressLifetimes) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() if nic, ok := s.nics[id]; ok { return nic.setAddressLifetimes(addr, lifetimes) } return &tcpip.ErrUnknownNICID{} } // AllAddresses returns a map of NICIDs to their protocol addresses (primary // and non-primary). func (s *Stack) AllAddresses() map[tcpip.NICID][]tcpip.ProtocolAddress { s.mu.RLock() defer s.mu.RUnlock() nics := make(map[tcpip.NICID][]tcpip.ProtocolAddress) for id, nic := range s.nics { nics[id] = nic.allPermanentAddresses() } return nics } // GetMainNICAddress returns the first non-deprecated primary address and prefix // for the given NIC and protocol. If no non-deprecated primary addresses exist, // a deprecated address will be returned. If no deprecated addresses exist, the // zero value will be returned. 
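//
// Address management sketch (illustrative only; "nicID" and "addrWithPrefix"
// are placeholders obtained elsewhere):
//
//	protocolAddr := tcpip.ProtocolAddress{
//		Protocol:          ipv4.ProtocolNumber,
//		AddressWithPrefix: addrWithPrefix,
//	}
//	if err := s.AddProtocolAddress(nicID, protocolAddr, stack.AddressProperties{}); err != nil {
//		// handle err
//	}
//	mainAddr, _ := s.GetMainNICAddress(nicID, ipv4.ProtocolNumber)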
func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return tcpip.AddressWithPrefix{}, &tcpip.ErrUnknownNICID{} } return nic.PrimaryAddress(protocol) } func (s *Stack) getAddressEP(nic *nic, localAddr, remoteAddr, srcHint tcpip.Address, netProto tcpip.NetworkProtocolNumber) AssignableAddressEndpoint { if localAddr.BitLen() == 0 { return nic.primaryEndpoint(netProto, remoteAddr, srcHint) } return nic.findEndpoint(netProto, localAddr, CanBePrimaryEndpoint) } // NewRouteForMulticast returns a Route that may be used to forward multicast // packets. // // Returns nil if validation fails. func (s *Stack) NewRouteForMulticast(nicID tcpip.NICID, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) *Route { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[nicID] if !ok || !nic.Enabled() { return nil } if addressEndpoint := s.getAddressEP(nic, tcpip.Address{} /* localAddr */, remoteAddr, tcpip.Address{} /* srcHint */, netProto); addressEndpoint != nil { return constructAndValidateRoute(netProto, addressEndpoint, nic, nic, tcpip.Address{} /* gateway */, tcpip.Address{} /* localAddr */, remoteAddr, s.handleLocal, false /* multicastLoop */, 0 /* mtu */) } return nil } // findLocalRouteFromNICRLocked is like findLocalRouteRLocked but finds a route // from the specified NIC. // // +checklocksread:s.mu func (s *Stack) findLocalRouteFromNICRLocked(localAddressNIC *nic, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) *Route { localAddressEndpoint := localAddressNIC.getAddressOrCreateTempInner(netProto, localAddr, false /* createTemp */, NeverPrimaryEndpoint) if localAddressEndpoint == nil { return nil } var outgoingNIC *nic // Prefer a local route to the same interface as the local address. if localAddressNIC.hasAddress(netProto, remoteAddr) { outgoingNIC = localAddressNIC } // If the remote address isn't owned by the local address's NIC, check all // NICs. if outgoingNIC == nil { for _, nic := range s.nics { if nic.hasAddress(netProto, remoteAddr) { outgoingNIC = nic break } } } // If the remote address is not owned by the stack, we can't return a local // route. if outgoingNIC == nil { localAddressEndpoint.DecRef() return nil } r := makeLocalRoute( netProto, localAddr, remoteAddr, outgoingNIC, localAddressNIC, localAddressEndpoint, ) if r.IsOutboundBroadcast() { r.Release() return nil } return r } // findLocalRouteRLocked returns a local route. // // A local route is a route to some remote address which the stack owns. That // is, a local route is a route where packets never have to leave the stack. // // +checklocksread:s.mu func (s *Stack) findLocalRouteRLocked(localAddressNICID tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) *Route { if localAddr.BitLen() == 0 { localAddr = remoteAddr } if localAddressNICID == 0 { for _, localAddressNIC := range s.nics { if r := s.findLocalRouteFromNICRLocked(localAddressNIC, localAddr, remoteAddr, netProto); r != nil { return r } } return nil } if localAddressNIC, ok := s.nics[localAddressNICID]; ok { return s.findLocalRouteFromNICRLocked(localAddressNIC, localAddr, remoteAddr, netProto) } return nil } // HandleLocal returns true if non-loopback interfaces are allowed to loop packets. 
func (s *Stack) HandleLocal() bool { return s.handleLocal } func isNICForwarding(nic *nic, proto tcpip.NetworkProtocolNumber) bool { switch forwarding, err := nic.forwarding(proto); err.(type) { case nil: return forwarding case *tcpip.ErrUnknownProtocol: panic(fmt.Sprintf("expected network protocol %d to be available on NIC %d", proto, nic.ID())) case *tcpip.ErrNotSupported: // Not all network protocols support forwarding. return false default: panic(fmt.Sprintf("nic(id=%d).forwarding(%d): %s", nic.ID(), proto, err)) } } // findRouteWithLocalAddrFromAnyInterfaceRLocked returns a route to the given // destination address, leaving through the given NIC. // // Rather than preferring to find a route that uses a local address assigned to // the outgoing interface, it finds any NIC that holds a matching local address // endpoint. // // +checklocksread:s.mu func (s *Stack) findRouteWithLocalAddrFromAnyInterfaceRLocked(outgoingNIC *nic, localAddr, remoteAddr, srcHint, gateway tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool, mtu uint32) *Route { for _, aNIC := range s.nics { addressEndpoint := s.getAddressEP(aNIC, localAddr, remoteAddr, srcHint, netProto) if addressEndpoint == nil { continue } if r := constructAndValidateRoute(netProto, addressEndpoint, aNIC /* localAddressNIC */, outgoingNIC, gateway, localAddr, remoteAddr, s.handleLocal, multicastLoop, mtu); r != nil { return r } } return nil } // FindRoute creates a route to the given destination address, leaving through // the given NIC and local address (if provided). // // If a NIC is not specified, the returned route will leave through the same // NIC as the NIC that has the local address assigned when forwarding is // disabled. If forwarding is enabled and the NIC is unspecified, the route may // leave through any interface unless the route is link-local. // // If no local address is provided, the stack will select a local address. If no // remote address is provided, the stack will use a remote address equal to the // local address. func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (*Route, tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() // Reject attempts to use unsupported protocols. if !s.CheckNetworkProtocol(netProto) { return nil, &tcpip.ErrUnknownProtocol{} } isLinkLocal := header.IsV6LinkLocalUnicastAddress(remoteAddr) || header.IsV6LinkLocalMulticastAddress(remoteAddr) isLocalBroadcast := remoteAddr == header.IPv4Broadcast isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr) isLoopback := header.IsV4LoopbackAddress(remoteAddr) || header.IsV6LoopbackAddress(remoteAddr) needRoute := !(isLocalBroadcast || isMulticast || isLinkLocal || isLoopback) if s.handleLocal && !isMulticast && !isLocalBroadcast { if r := s.findLocalRouteRLocked(id, localAddr, remoteAddr, netProto); r != nil { return r, nil } } // If the interface is specified and we do not need a route, return a route // through the interface if the interface is valid and enabled. 
if id != 0 && !needRoute { if nic, ok := s.nics[id]; ok && nic.Enabled() { if addressEndpoint := s.getAddressEP(nic, localAddr, remoteAddr, tcpip.Address{} /* srcHint */, netProto); addressEndpoint != nil { return makeRoute( netProto, tcpip.Address{}, /* gateway */ localAddr, remoteAddr, nic, /* outgoingNIC */ nic, /* localAddressNIC*/ addressEndpoint, s.handleLocal, multicastLoop, 0, /* mtu */ ), nil } } if isLoopback { return nil, &tcpip.ErrBadLocalAddress{} } return nil, &tcpip.ErrNetworkUnreachable{} } onlyGlobalAddresses := !header.IsV6LinkLocalUnicastAddress(localAddr) && !isLinkLocal // Find a route to the remote with the route table. var chosenRoute tcpip.Route if r := func() *Route { s.routeMu.RLock() defer s.routeMu.RUnlock() for route := s.routeTable.Front(); route != nil; route = route.Next() { if remoteAddr.BitLen() != 0 && !route.Destination.Contains(remoteAddr) { continue } nic, ok := s.nics[route.NIC] if !ok || !nic.Enabled() { continue } if id == 0 || id == route.NIC { if addressEndpoint := s.getAddressEP(nic, localAddr, remoteAddr, route.SourceHint, netProto); addressEndpoint != nil { var gateway tcpip.Address if needRoute { gateway = route.Gateway } r := constructAndValidateRoute(netProto, addressEndpoint, nic /* outgoingNIC */, nic /* outgoingNIC */, gateway, localAddr, remoteAddr, s.handleLocal, multicastLoop, route.MTU) if r == nil { panic(fmt.Sprintf("non-forwarding route validation failed with route table entry = %#v, id = %d, localAddr = %s, remoteAddr = %s", route, id, localAddr, remoteAddr)) } return r } } // If the stack has forwarding enabled, we haven't found a valid route to // the remote address yet, and we are routing locally generated traffic, // keep track of the first valid route. We keep iterating because we // prefer routes that let us use a local address that is assigned to the // outgoing interface. There is no requirement to do this from any RFC // but simply a choice made to better follow a strong host model which // the netstack follows at the time of writing. // // Note that for incoming traffic that we are forwarding (for which the // NIC and local address are unspecified), we do not keep iterating, as // there is no reason to prefer routes that let us use a local address // when routing forwarded (as opposed to locally-generated) traffic. locallyGenerated := (id != 0 || localAddr != tcpip.Address{}) if onlyGlobalAddresses && chosenRoute.Equal(tcpip.Route{}) && isNICForwarding(nic, netProto) { if locallyGenerated { chosenRoute = *route continue } if r := s.findRouteWithLocalAddrFromAnyInterfaceRLocked(nic, localAddr, remoteAddr, route.SourceHint, route.Gateway, netProto, multicastLoop, route.MTU); r != nil { return r } } } return nil }(); r != nil { return r, nil } if !chosenRoute.Equal(tcpip.Route{}) { // At this point we know the stack has forwarding enabled since chosenRoute is // only set when forwarding is enabled. nic, ok := s.nics[chosenRoute.NIC] if !ok { // If the route's NIC was invalid, we should not have chosen the route. panic(fmt.Sprintf("chosen route must have a valid NIC with ID = %d", chosenRoute.NIC)) } var gateway tcpip.Address if needRoute { gateway = chosenRoute.Gateway } // Use the specified NIC to get the local address endpoint. 
if id != 0 { if aNIC, ok := s.nics[id]; ok { if addressEndpoint := s.getAddressEP(aNIC, localAddr, remoteAddr, chosenRoute.SourceHint, netProto); addressEndpoint != nil { if r := constructAndValidateRoute(netProto, addressEndpoint, aNIC /* localAddressNIC */, nic /* outgoingNIC */, gateway, localAddr, remoteAddr, s.handleLocal, multicastLoop, chosenRoute.MTU); r != nil { return r, nil } } } // TODO(https://gvisor.dev/issues/8105): This should be ErrNetworkUnreachable. return nil, &tcpip.ErrHostUnreachable{} } if id == 0 { // If an interface is not specified, try to find a NIC that holds the local // address endpoint to construct a route. if r := s.findRouteWithLocalAddrFromAnyInterfaceRLocked(nic, localAddr, remoteAddr, chosenRoute.SourceHint, gateway, netProto, multicastLoop, chosenRoute.MTU); r != nil { return r, nil } } } if needRoute { // TODO(https://gvisor.dev/issues/8105): This should be ErrNetworkUnreachable. return nil, &tcpip.ErrHostUnreachable{} } if header.IsV6LoopbackAddress(remoteAddr) { return nil, &tcpip.ErrBadLocalAddress{} } // TODO(https://gvisor.dev/issues/8105): This should be ErrNetworkUnreachable. return nil, &tcpip.ErrNetworkUnreachable{} } // CheckNetworkProtocol checks if a given network protocol is enabled in the // stack. func (s *Stack) CheckNetworkProtocol(protocol tcpip.NetworkProtocolNumber) bool { _, ok := s.networkProtocols[protocol] return ok } // CheckDuplicateAddress performs duplicate address detection for the address on // the specified interface. func (s *Stack) CheckDuplicateAddress(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, h DADCompletionHandler) (DADCheckAddressDisposition, tcpip.Error) { s.mu.RLock() nic, ok := s.nics[nicID] s.mu.RUnlock() if !ok { return 0, &tcpip.ErrUnknownNICID{} } return nic.checkDuplicateAddress(protocol, addr, h) } // CheckLocalAddress determines if the given local address exists, and if it // does, returns the id of the NIC it's bound to. Returns 0 if the address // does not exist. func (s *Stack) CheckLocalAddress(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID { s.mu.RLock() defer s.mu.RUnlock() // If a NIC is specified, use its NIC id. if nicID != 0 { nic, ok := s.nics[nicID] if !ok { return 0 } // In IPv4, linux only checks the interface. If it matches, then it does // not bother with the address. // https://github.com/torvalds/linux/blob/15205c2829ca2cbb5ece5ceaafe1171a8470e62b/net/ipv4/igmp.c#L1829-L1837 if protocol == header.IPv4ProtocolNumber { return nic.id } if nic.CheckLocalAddress(protocol, addr) { return nic.id } return 0 } // Go through all the NICs. for _, nic := range s.nics { if nic.CheckLocalAddress(protocol, addr) { return nic.id } } return 0 } // SetPromiscuousMode enables or disables promiscuous mode in the given NIC. func (s *Stack) SetPromiscuousMode(nicID tcpip.NICID, enable bool) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[nicID] if !ok { return &tcpip.ErrUnknownNICID{} } nic.setPromiscuousMode(enable) return nil } // SetSpoofing enables or disables address spoofing in the given NIC, allowing // endpoints to bind to any address in the NIC. func (s *Stack) SetSpoofing(nicID tcpip.NICID, enable bool) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[nicID] if !ok { return &tcpip.ErrUnknownNICID{} } nic.setSpoofing(enable) return nil } // LinkResolutionResult is the result of a link address resolution attempt. 
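//
// A hedged sketch of asynchronous link resolution with GetLinkAddress below
// (placeholder "nicID" and "remote" values; the callback may run immediately
// if the address is already resolved):
//
//	err := s.GetLinkAddress(nicID, remote, tcpip.Address{}, ipv4.ProtocolNumber, func(res stack.LinkResolutionResult) {
//		if res.Err == nil {
//			// res.LinkAddress holds the neighbor's link-layer address.
//		}
//	})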
type LinkResolutionResult struct { LinkAddress tcpip.LinkAddress Err tcpip.Error } // GetLinkAddress finds the link address corresponding to a network address. // // Returns ErrNotSupported if the stack is not configured with a link address // resolver for the specified network protocol. // // Returns ErrWouldBlock if the link address is not readily available, along // with a notification channel for the caller to block on. Triggers address // resolution asynchronously. // // onResolve will be called either immediately, if resolution is not required, // or when address resolution is complete, with the resolved link address and // whether resolution succeeded. // // If specified, the local address must be an address local to the interface // the neighbor cache belongs to. The local address is the source address of // a packet prompting NUD/link address resolution. func (s *Stack) GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, onResolve func(LinkResolutionResult)) tcpip.Error { s.mu.RLock() nic, ok := s.nics[nicID] s.mu.RUnlock() if !ok { return &tcpip.ErrUnknownNICID{} } return nic.getLinkAddress(addr, localAddr, protocol, onResolve) } // Neighbors returns all IP to MAC address associations. func (s *Stack) Neighbors(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber) ([]NeighborEntry, tcpip.Error) { s.mu.RLock() nic, ok := s.nics[nicID] s.mu.RUnlock() if !ok { return nil, &tcpip.ErrUnknownNICID{} } return nic.neighbors(protocol) } // AddStaticNeighbor statically associates an IP address to a MAC address. func (s *Stack) AddStaticNeighbor(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, linkAddr tcpip.LinkAddress) tcpip.Error { s.mu.RLock() nic, ok := s.nics[nicID] s.mu.RUnlock() if !ok { return &tcpip.ErrUnknownNICID{} } return nic.addStaticNeighbor(addr, protocol, linkAddr) } // RemoveNeighbor removes an IP to MAC address association previously created // either automatically or by AddStaticNeighbor. Returns ErrBadAddress if there // is no association with the provided address. func (s *Stack) RemoveNeighbor(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.Error { s.mu.RLock() nic, ok := s.nics[nicID] s.mu.RUnlock() if !ok { return &tcpip.ErrUnknownNICID{} } return nic.removeNeighbor(protocol, addr) } // ClearNeighbors removes all IP to MAC address associations. func (s *Stack) ClearNeighbors(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber) tcpip.Error { s.mu.RLock() nic, ok := s.nics[nicID] s.mu.RUnlock() if !ok { return &tcpip.ErrUnknownNICID{} } return nic.clearNeighbors(protocol) } // RegisterTransportEndpoint registers the given endpoint with the stack // transport dispatcher. Received packets that match the provided id will be // delivered to the given endpoint; specifying a nic is optional, but // nic-specific IDs have precedence over global ones. func (s *Stack) RegisterTransportEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { return s.demux.registerEndpoint(netProtos, protocol, id, ep, flags, bindToDevice) } // CheckRegisterTransportEndpoint checks if an endpoint can be registered with // the stack transport dispatcher. 
func (s *Stack) CheckRegisterTransportEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { return s.demux.checkEndpoint(netProtos, protocol, id, flags, bindToDevice) } // UnregisterTransportEndpoint removes the endpoint with the given id from the // stack transport dispatcher. func (s *Stack) UnregisterTransportEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice) } // StartTransportEndpointCleanup removes the endpoint with the given id from // the stack transport dispatcher. It also transitions it to the cleanup stage. func (s *Stack) StartTransportEndpointCleanup(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { s.cleanupEndpointsMu.Lock() s.cleanupEndpoints[ep] = struct{}{} s.cleanupEndpointsMu.Unlock() s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice) } // CompleteTransportEndpointCleanup removes the endpoint from the cleanup // stage. func (s *Stack) CompleteTransportEndpointCleanup(ep TransportEndpoint) { s.cleanupEndpointsMu.Lock() delete(s.cleanupEndpoints, ep) s.cleanupEndpointsMu.Unlock() } // FindTransportEndpoint finds an endpoint that most closely matches the provided // id. If no endpoint is found it returns nil. func (s *Stack) FindTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, nicID tcpip.NICID) TransportEndpoint { return s.demux.findTransportEndpoint(netProto, transProto, id, nicID) } // RegisterRawTransportEndpoint registers the given endpoint with the stack // transport dispatcher. Received packets that match the provided transport // protocol will be delivered to the given endpoint. func (s *Stack) RegisterRawTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) tcpip.Error { return s.demux.registerRawEndpoint(netProto, transProto, ep) } // UnregisterRawTransportEndpoint removes the endpoint for the transport // protocol from the stack transport dispatcher. func (s *Stack) UnregisterRawTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) { s.demux.unregisterRawEndpoint(netProto, transProto, ep) } // RegisterRestoredEndpoint records e as an endpoint that has been restored on // this stack. func (s *Stack) RegisterRestoredEndpoint(e RestoredEndpoint) { s.mu.Lock() defer s.mu.Unlock() s.restoredEndpoints = append(s.restoredEndpoints, e) } // RegisterResumableEndpoint records e as an endpoint that has to be resumed. func (s *Stack) RegisterResumableEndpoint(e ResumableEndpoint) { s.mu.Lock() defer s.mu.Unlock() s.resumableEndpoints = append(s.resumableEndpoints, e) } // RegisteredEndpoints returns all endpoints which are currently registered. func (s *Stack) RegisteredEndpoints() []TransportEndpoint { s.mu.Lock() defer s.mu.Unlock() var es []TransportEndpoint for _, e := range s.demux.protocol { es = append(es, e.transportEndpoints()...) } return es } // CleanupEndpoints returns endpoints currently in the cleanup state. 
func (s *Stack) CleanupEndpoints() []TransportEndpoint { s.cleanupEndpointsMu.Lock() defer s.cleanupEndpointsMu.Unlock() es := make([]TransportEndpoint, 0, len(s.cleanupEndpoints)) for e := range s.cleanupEndpoints { es = append(es, e) } return es } // RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful // for restoring a stack after a save. func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) { s.cleanupEndpointsMu.Lock() defer s.cleanupEndpointsMu.Unlock() for _, e := range es { s.cleanupEndpoints[e] = struct{}{} } } // Close closes all currently registered transport endpoints. // // Endpoints created or modified during this call may not get closed. func (s *Stack) Close() { for _, e := range s.RegisteredEndpoints() { e.Abort() } for _, p := range s.transportProtocols { p.proto.Close() } for _, p := range s.networkProtocols { p.Close() } } // Wait waits for all transport and link endpoints to halt their worker // goroutines. // // Endpoints created or modified during this call may not get waited on. // // Note that link endpoints must be stopped via an implementation specific // mechanism. func (s *Stack) Wait() { for _, e := range s.RegisteredEndpoints() { e.Wait() } for _, e := range s.CleanupEndpoints() { e.Wait() } for _, p := range s.transportProtocols { p.proto.Wait() } for _, p := range s.networkProtocols { p.Wait() } deferActs := make([]func(), 0) s.mu.Lock() for id, n := range s.nics { // Remove NIC to ensure that qDisc goroutines are correctly // terminated on stack teardown. act, _ := s.removeNICLocked(id) n.NetworkLinkEndpoint.Wait() if act != nil { deferActs = append(deferActs, act) } } s.mu.Unlock() for _, act := range deferActs { act() } } // Destroy destroys the stack with all endpoints. func (s *Stack) Destroy() { s.Close() s.Wait() } // Pause pauses any protocol level background workers. func (s *Stack) Pause() { for _, p := range s.transportProtocols { p.proto.Pause() } } // Restore restarts the stack after a restore. This must be called after the // entire system has been restored. func (s *Stack) Restore() { // RestoredEndpoint.Restore() may call other methods on s, so we can't hold // s.mu while restoring the endpoints. s.mu.Lock() eps := s.restoredEndpoints s.restoredEndpoints = nil s.mu.Unlock() for _, e := range eps { e.Restore(s) } // Now resume any protocol level background workers. for _, p := range s.transportProtocols { p.proto.Resume() } } // Resume resumes the stack after a save. func (s *Stack) Resume() { s.mu.Lock() eps := s.resumableEndpoints s.resumableEndpoints = nil s.mu.Unlock() for _, e := range eps { e.Resume() } // Now resume any protocol level background workers. for _, p := range s.transportProtocols { p.proto.Resume() } } // RegisterPacketEndpoint registers ep with the stack, causing it to receive // all traffic of the specified netProto on the given NIC. If nicID is 0, it // receives traffic from every NIC. func (s *Stack) RegisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) tcpip.Error { s.mu.Lock() defer s.mu.Unlock() // If no NIC is specified, capture on all devices. if nicID == 0 { // Register with each NIC. for _, nic := range s.nics { nic.registerPacketEndpoint(netProto, ep) } return nil } // Capture on a specific device. nic, ok := s.nics[nicID] if !ok { return &tcpip.ErrUnknownNICID{} } nic.registerPacketEndpoint(netProto, ep) return nil } // UnregisterPacketEndpoint unregisters ep for packets of the specified // netProto from the specified NIC. 
If nicID is 0, ep is unregistered from all // NICs. func (s *Stack) UnregisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) { s.mu.Lock() defer s.mu.Unlock() s.unregisterPacketEndpointLocked(nicID, netProto, ep) } // +checklocks:s.mu func (s *Stack) unregisterPacketEndpointLocked(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) { // If no NIC is specified, unregister on all devices. if nicID == 0 { // Unregister with each NIC. for _, nic := range s.nics { nic.unregisterPacketEndpoint(netProto, ep) } return } // Unregister in a single device. nic, ok := s.nics[nicID] if !ok { return } nic.unregisterPacketEndpoint(netProto, ep) } // WritePacketToRemote writes a payload on the specified NIC using the provided // network protocol and remote link address. func (s *Stack) WritePacketToRemote(nicID tcpip.NICID, remote tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.Buffer) tcpip.Error { s.mu.Lock() nic, ok := s.nics[nicID] s.mu.Unlock() if !ok { return &tcpip.ErrUnknownDevice{} } pkt := NewPacketBuffer(PacketBufferOptions{ ReserveHeaderBytes: int(nic.MaxHeaderLength()), Payload: payload, }) defer pkt.DecRef() pkt.NetworkProtocolNumber = netProto return nic.WritePacketToRemote(remote, pkt) } // WriteRawPacket writes data directly to the specified NIC without adding any // headers. func (s *Stack) WriteRawPacket(nicID tcpip.NICID, proto tcpip.NetworkProtocolNumber, payload buffer.Buffer) tcpip.Error { s.mu.RLock() nic, ok := s.nics[nicID] s.mu.RUnlock() if !ok { return &tcpip.ErrUnknownNICID{} } pkt := NewPacketBuffer(PacketBufferOptions{ Payload: payload, }) defer pkt.DecRef() pkt.NetworkProtocolNumber = proto return nic.writeRawPacketWithLinkHeaderInPayload(pkt) } // NetworkProtocolInstance returns the protocol instance in the stack for the // specified network protocol. This method is public for protocol implementers // and tests to use. func (s *Stack) NetworkProtocolInstance(num tcpip.NetworkProtocolNumber) NetworkProtocol { if p, ok := s.networkProtocols[num]; ok { return p } return nil } // TransportProtocolInstance returns the protocol instance in the stack for the // specified transport protocol. This method is public for protocol implementers // and tests to use. func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) TransportProtocol { if pState, ok := s.transportProtocols[num]; ok { return pState.proto } return nil } // AddTCPProbe installs a probe function that will be invoked on every segment // received by a given TCP endpoint. The probe function is passed a copy of the // TCP endpoint state before and after processing of the segment. // // NOTE: TCPProbe is added only to endpoints created after this call. Endpoints // created prior to this call will not call the probe function. // // Further, installing two different probes back to back can result in some // endpoints calling the first one and some the second one. There is no // guarantee provided on which probe will be invoked. Ideally this should only // be called once per stack. func (s *Stack) AddTCPProbe(probe TCPProbeFunc) { s.tcpProbeFunc.Store(probe) } // GetTCPProbe returns the TCPProbeFunc if installed with AddTCPProbe, nil // otherwise. func (s *Stack) GetTCPProbe() TCPProbeFunc { p := s.tcpProbeFunc.Load() if p == nil { return nil } return p.(TCPProbeFunc) } // RemoveTCPProbe removes an installed TCP probe. 
// // NOTE: This only ensures that endpoints created after this call do not // have a probe attached. Endpoints already created will continue to invoke // the TCP probe. func (s *Stack) RemoveTCPProbe() { // This must be TCPProbeFunc(nil) because atomic.Value.Store(nil) panics. s.tcpProbeFunc.Store(TCPProbeFunc(nil)) }
// JoinGroup joins the given multicast group on the given NIC. func (s *Stack) JoinGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() if nic, ok := s.nics[nicID]; ok { return nic.joinGroup(protocol, multicastAddr) } return &tcpip.ErrUnknownNICID{} }
// LeaveGroup leaves the given multicast group on the given NIC. func (s *Stack) LeaveGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() if nic, ok := s.nics[nicID]; ok { return nic.leaveGroup(protocol, multicastAddr) } return &tcpip.ErrUnknownNICID{} }
// IsInGroup returns true if the NIC with ID nicID has joined the multicast // group multicastAddr. func (s *Stack) IsInGroup(nicID tcpip.NICID, multicastAddr tcpip.Address) (bool, tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() if nic, ok := s.nics[nicID]; ok { return nic.isInGroup(multicastAddr), nil } return false, &tcpip.ErrUnknownNICID{} }
// IPTables returns the stack's iptables. func (s *Stack) IPTables() *IPTables { return s.tables }
// ICMPLimit returns the maximum number of ICMP messages that can be sent // in one second. func (s *Stack) ICMPLimit() rate.Limit { return s.icmpRateLimiter.Limit() }
// SetICMPLimit sets the maximum number of ICMP messages that can be sent // in one second. func (s *Stack) SetICMPLimit(newLimit rate.Limit) { s.icmpRateLimiter.SetLimit(newLimit) }
// ICMPBurst returns the maximum number of ICMP messages that can be sent // in a single burst. func (s *Stack) ICMPBurst() int { return s.icmpRateLimiter.Burst() }
// SetICMPBurst sets the maximum number of ICMP messages that can be sent // in a single burst. func (s *Stack) SetICMPBurst(burst int) { s.icmpRateLimiter.SetBurst(burst) }
// AllowICMPMessage returns true if the rate limiter allows at least one // ICMP message to be sent at this instant. func (s *Stack) AllowICMPMessage() bool { return s.icmpRateLimiter.Allow() }
// GetNetworkEndpoint returns the NetworkEndpoint with the specified protocol // number installed on the specified NIC. func (s *Stack) GetNetworkEndpoint(nicID tcpip.NICID, proto tcpip.NetworkProtocolNumber) (NetworkEndpoint, tcpip.Error) { s.mu.Lock() defer s.mu.Unlock() nic, ok := s.nics[nicID] if !ok { return nil, &tcpip.ErrUnknownNICID{} } return nic.getNetworkEndpoint(proto), nil }
// NUDConfigurations gets the per-interface NUD configurations. func (s *Stack) NUDConfigurations(id tcpip.NICID, proto tcpip.NetworkProtocolNumber) (NUDConfigurations, tcpip.Error) { s.mu.RLock() nic, ok := s.nics[id] s.mu.RUnlock() if !ok { return NUDConfigurations{}, &tcpip.ErrUnknownNICID{} } return nic.nudConfigs(proto) }
// SetNUDConfigurations sets the per-interface NUD configurations. // // Note, if c contains invalid NUD configuration values, it will be fixed to // use default values for the erroneous values. 
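//
// Read-modify-write sketch (illustrative only; "nicID" is a placeholder and
// BaseReachableTime is one of the NUDConfigurations fields):
//
//	cfg, err := s.NUDConfigurations(nicID, ipv4.ProtocolNumber)
//	if err == nil {
//		cfg.BaseReachableTime = 30 * time.Second
//		_ = s.SetNUDConfigurations(nicID, ipv4.ProtocolNumber, cfg)
//	}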
func (s *Stack) SetNUDConfigurations(id tcpip.NICID, proto tcpip.NetworkProtocolNumber, c NUDConfigurations) tcpip.Error { s.mu.RLock() nic, ok := s.nics[id] s.mu.RUnlock() if !ok { return &tcpip.ErrUnknownNICID{} } return nic.setNUDConfigs(proto, c) } // Seed returns a 32 bit value that can be used as a seed value. // // NOTE: The seed is generated once during stack initialization only. func (s *Stack) Seed() uint32 { return s.seed } // InsecureRNG returns a reference to a pseudo random generator that can be used // to generate random numbers as required. It is not cryptographically secure // and should not be used for security sensitive work. func (s *Stack) InsecureRNG() *rand.Rand { return s.insecureRNG } // SecureRNG returns the stack's cryptographically secure random number // generator. func (s *Stack) SecureRNG() cryptorand.RNG { return s.secureRNG } // FindNICNameFromID returns the name of the NIC for the given NICID. func (s *Stack) FindNICNameFromID(id tcpip.NICID) string { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return "" } return nic.Name() } // ParseResult indicates the result of a parsing attempt. type ParseResult int const ( // ParsedOK indicates that a packet was successfully parsed. ParsedOK ParseResult = iota // UnknownTransportProtocol indicates that the transport protocol is unknown. UnknownTransportProtocol // TransportLayerParseError indicates that the transport packet was not // successfully parsed. TransportLayerParseError ) // ParsePacketBufferTransport parses the provided packet buffer's transport // header. func (s *Stack) ParsePacketBufferTransport(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) ParseResult { pkt.TransportProtocolNumber = protocol // Parse the transport header if present. state, ok := s.transportProtocols[protocol] if !ok { return UnknownTransportProtocol } if !state.proto.Parse(pkt) { return TransportLayerParseError } return ParsedOK } // networkProtocolNumbers returns the network protocol numbers the stack is // configured with. func (s *Stack) networkProtocolNumbers() []tcpip.NetworkProtocolNumber { protos := make([]tcpip.NetworkProtocolNumber, 0, len(s.networkProtocols)) for p := range s.networkProtocols { protos = append(protos, p) } return protos } func isSubnetBroadcastOnNIC(nic *nic, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { addressEndpoint := nic.getAddressOrCreateTempInner(protocol, addr, false /* createTemp */, NeverPrimaryEndpoint) if addressEndpoint == nil { return false } subnet := addressEndpoint.Subnet() addressEndpoint.DecRef() return subnet.IsBroadcast(addr) } // IsSubnetBroadcast returns true if the provided address is a subnet-local // broadcast address on the specified NIC and protocol. // // Returns false if the NIC is unknown or if the protocol is unknown or does // not support addressing. // // If the NIC is not specified, the stack will check all NICs. func (s *Stack) IsSubnetBroadcast(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { s.mu.RLock() defer s.mu.RUnlock() if nicID != 0 { nic, ok := s.nics[nicID] if !ok { return false } return isSubnetBroadcastOnNIC(nic, protocol, addr) } for _, nic := range s.nics { if isSubnetBroadcastOnNIC(nic, protocol, addr) { return true } } return false } // PacketEndpointWriteSupported returns true iff packet endpoints support write // operations. 
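// Illustrative sketch (hypothetical helper, not part of the API): checking the
// ParseResult returned by ParsePacketBufferTransport defined above.

// exampleParseTransport parses pkt's transport header for proto and reports
// whether it was parsed successfully.
func exampleParseTransport(s *Stack, proto tcpip.TransportProtocolNumber, pkt *PacketBuffer) bool {
	switch s.ParsePacketBufferTransport(proto, pkt) {
	case ParsedOK:
		return true
	default:
		// UnknownTransportProtocol or TransportLayerParseError.
		return false
	}
}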
func (s *Stack) PacketEndpointWriteSupported() bool { return s.packetEndpointWriteSupported } // SetNICStack moves the network device to the specified network namespace. func (s *Stack) SetNICStack(id tcpip.NICID, peer *Stack) (tcpip.NICID, tcpip.Error) { s.mu.Lock() nic, ok := s.nics[id] if !ok { s.mu.Unlock() return 0, &tcpip.ErrUnknownNICID{} } if s == peer { s.mu.Unlock() return id, nil } delete(s.nics, id) // Remove routes in-place. n tracks the number of routes written. s.RemoveRoutes(func(r tcpip.Route) bool { return r.NIC == id }) ne := nic.NetworkLinkEndpoint.(LinkEndpoint) deferAct, err := nic.remove(false /* closeLinkEndpoint */) s.mu.Unlock() if deferAct != nil { deferAct() } if err != nil { return 0, err } id = tcpip.NICID(peer.NextNICID()) return id, peer.CreateNICWithOptions(id, ne, NICOptions{Name: nic.Name()}) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/stack_mutex.go000066400000000000000000000044761465435605700245410ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type stackRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var stacklockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type stacklockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *stackRWMutex) Lock() { locking.AddGLock(stackprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *stackRWMutex) NestedLock(i stacklockNameIndex) { locking.AddGLock(stackprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *stackRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(stackprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *stackRWMutex) NestedUnlock(i stacklockNameIndex) { m.mu.Unlock() locking.DelGLock(stackprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *stackRWMutex) RLock() { locking.AddGLock(stackprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *stackRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(stackprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *stackRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *stackRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *stackRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var stackprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func stackinitLockNames() {} func init() { stackinitLockNames() stackprefixIndex = locking.NewMutexClass(reflect.TypeOf(stackRWMutex{}), stacklockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/stack_options.go000066400000000000000000000062001465435605700250550ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "time" "gvisor.dev/gvisor/pkg/tcpip" ) const ( // MinBufferSize is the smallest size of a receive or send buffer. MinBufferSize = 4 << 10 // 4 KiB // DefaultBufferSize is the default size of the send/recv buffer for a // transport endpoint. DefaultBufferSize = 212 << 10 // 212 KiB // DefaultMaxBufferSize is the default maximum permitted size of a // send/receive buffer. DefaultMaxBufferSize = 4 << 20 // 4 MiB // defaultTCPInvalidRateLimit is the default value for // stack.TCPInvalidRateLimit. defaultTCPInvalidRateLimit = 500 * time.Millisecond ) // ReceiveBufferSizeOption is used by stack.(Stack*).Option/SetOption to // get/set the default, min and max receive buffer sizes. type ReceiveBufferSizeOption struct { Min int Default int Max int } // TCPInvalidRateLimitOption is used by stack.(Stack*).Option/SetOption to get/set // stack.tcpInvalidRateLimit. type TCPInvalidRateLimitOption time.Duration // SetOption allows setting stack wide options. func (s *Stack) SetOption(option any) tcpip.Error { switch v := option.(type) { case tcpip.SendBufferSizeOption: // Make sure we don't allow lowering the buffer below minimum // required for stack to work. if v.Min < MinBufferSize { return &tcpip.ErrInvalidOptionValue{} } if v.Default < v.Min || v.Default > v.Max { return &tcpip.ErrInvalidOptionValue{} } s.mu.Lock() s.sendBufferSize = v s.mu.Unlock() return nil case tcpip.ReceiveBufferSizeOption: // Make sure we don't allow lowering the buffer below minimum // required for stack to work. if v.Min < MinBufferSize { return &tcpip.ErrInvalidOptionValue{} } if v.Default < v.Min || v.Default > v.Max { return &tcpip.ErrInvalidOptionValue{} } s.mu.Lock() s.receiveBufferSize = v s.mu.Unlock() return nil case TCPInvalidRateLimitOption: if v < 0 { return &tcpip.ErrInvalidOptionValue{} } s.mu.Lock() s.tcpInvalidRateLimit = time.Duration(v) s.mu.Unlock() return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // Option allows retrieving stack wide options. func (s *Stack) Option(option any) tcpip.Error { switch v := option.(type) { case *tcpip.SendBufferSizeOption: s.mu.RLock() *v = s.sendBufferSize s.mu.RUnlock() return nil case *tcpip.ReceiveBufferSizeOption: s.mu.RLock() *v = s.receiveBufferSize s.mu.RUnlock() return nil case *TCPInvalidRateLimitOption: s.mu.RLock() *v = TCPInvalidRateLimitOption(s.tcpInvalidRateLimit) s.mu.RUnlock() return nil default: return &tcpip.ErrUnknownProtocolOption{} } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/stack_state_autogen.go000066400000000000000000002310371465435605700262340ustar00rootroot00000000000000// automatically generated by stateify. 
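// Illustrative usage sketch for the SetOption/Option pair defined in
// stack_options.go above (the values chosen are hypothetical, not defaults):
//
//	var opt tcpip.ReceiveBufferSizeOption
//	if err := s.Option(&opt); err == nil {
//		opt.Default = opt.Max // e.g. raise the default receive buffer to the maximum
//		_ = s.SetOption(opt)
//	}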
package stack import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (r *addressStateRefs) StateTypeName() string { return "pkg/tcpip/stack.addressStateRefs" } func (r *addressStateRefs) StateFields() []string { return []string{ "refCount", } } func (r *addressStateRefs) beforeSave() {} // +checklocksignore func (r *addressStateRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *addressStateRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (a *AddressableEndpointState) StateTypeName() string { return "pkg/tcpip/stack.AddressableEndpointState" } func (a *AddressableEndpointState) StateFields() []string { return []string{ "networkEndpoint", "options", "endpoints", "primary", } } func (a *AddressableEndpointState) beforeSave() {} // +checklocksignore func (a *AddressableEndpointState) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.networkEndpoint) stateSinkObject.Save(1, &a.options) stateSinkObject.Save(2, &a.endpoints) stateSinkObject.Save(3, &a.primary) } func (a *AddressableEndpointState) afterLoad(context.Context) {} // +checklocksignore func (a *AddressableEndpointState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.networkEndpoint) stateSourceObject.Load(1, &a.options) stateSourceObject.Load(2, &a.endpoints) stateSourceObject.Load(3, &a.primary) } func (a *AddressableEndpointStateOptions) StateTypeName() string { return "pkg/tcpip/stack.AddressableEndpointStateOptions" } func (a *AddressableEndpointStateOptions) StateFields() []string { return []string{ "HiddenWhileDisabled", } } func (a *AddressableEndpointStateOptions) beforeSave() {} // +checklocksignore func (a *AddressableEndpointStateOptions) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.HiddenWhileDisabled) } func (a *AddressableEndpointStateOptions) afterLoad(context.Context) {} // +checklocksignore func (a *AddressableEndpointStateOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.HiddenWhileDisabled) } func (a *addressState) StateTypeName() string { return "pkg/tcpip/stack.addressState" } func (a *addressState) StateFields() []string { return []string{ "addressableEndpointState", "addr", "subnet", "temporary", "refs", "kind", "configType", "lifetimes", "disp", } } func (a *addressState) beforeSave() {} // +checklocksignore func (a *addressState) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.addressableEndpointState) stateSinkObject.Save(1, &a.addr) stateSinkObject.Save(2, &a.subnet) stateSinkObject.Save(3, &a.temporary) stateSinkObject.Save(4, &a.refs) stateSinkObject.Save(5, &a.kind) stateSinkObject.Save(6, &a.configType) stateSinkObject.Save(7, &a.lifetimes) stateSinkObject.Save(8, &a.disp) } func (a *addressState) afterLoad(context.Context) {} // +checklocksignore func (a *addressState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.addressableEndpointState) stateSourceObject.Load(1, &a.addr) stateSourceObject.Load(2, &a.subnet) stateSourceObject.Load(3, &a.temporary) stateSourceObject.Load(4, &a.refs) stateSourceObject.Load(5, &a.kind) stateSourceObject.Load(6, &a.configType) stateSourceObject.Load(7, &a.lifetimes) stateSourceObject.Load(8, &a.disp) } func (t *tuple) 
StateTypeName() string { return "pkg/tcpip/stack.tuple" } func (t *tuple) StateFields() []string { return []string{ "tupleEntry", "conn", "reply", "tupleID", } } func (t *tuple) beforeSave() {} // +checklocksignore func (t *tuple) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.tupleEntry) stateSinkObject.Save(1, &t.conn) stateSinkObject.Save(2, &t.reply) stateSinkObject.Save(3, &t.tupleID) } func (t *tuple) afterLoad(context.Context) {} // +checklocksignore func (t *tuple) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.tupleEntry) stateSourceObject.Load(1, &t.conn) stateSourceObject.Load(2, &t.reply) stateSourceObject.Load(3, &t.tupleID) } func (ti *tupleID) StateTypeName() string { return "pkg/tcpip/stack.tupleID" } func (ti *tupleID) StateFields() []string { return []string{ "srcAddr", "srcPortOrEchoRequestIdent", "dstAddr", "dstPortOrEchoReplyIdent", "transProto", "netProto", } } func (ti *tupleID) beforeSave() {} // +checklocksignore func (ti *tupleID) StateSave(stateSinkObject state.Sink) { ti.beforeSave() stateSinkObject.Save(0, &ti.srcAddr) stateSinkObject.Save(1, &ti.srcPortOrEchoRequestIdent) stateSinkObject.Save(2, &ti.dstAddr) stateSinkObject.Save(3, &ti.dstPortOrEchoReplyIdent) stateSinkObject.Save(4, &ti.transProto) stateSinkObject.Save(5, &ti.netProto) } func (ti *tupleID) afterLoad(context.Context) {} // +checklocksignore func (ti *tupleID) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ti.srcAddr) stateSourceObject.Load(1, &ti.srcPortOrEchoRequestIdent) stateSourceObject.Load(2, &ti.dstAddr) stateSourceObject.Load(3, &ti.dstPortOrEchoReplyIdent) stateSourceObject.Load(4, &ti.transProto) stateSourceObject.Load(5, &ti.netProto) } func (cn *conn) StateTypeName() string { return "pkg/tcpip/stack.conn" } func (cn *conn) StateFields() []string { return []string{ "ct", "original", "reply", "finalizeResult", "sourceManip", "destinationManip", "tcb", "lastUsed", } } func (cn *conn) beforeSave() {} // +checklocksignore func (cn *conn) StateSave(stateSinkObject state.Sink) { cn.beforeSave() stateSinkObject.Save(0, &cn.ct) stateSinkObject.Save(1, &cn.original) stateSinkObject.Save(2, &cn.reply) stateSinkObject.Save(3, &cn.finalizeResult) stateSinkObject.Save(4, &cn.sourceManip) stateSinkObject.Save(5, &cn.destinationManip) stateSinkObject.Save(6, &cn.tcb) stateSinkObject.Save(7, &cn.lastUsed) } func (cn *conn) afterLoad(context.Context) {} // +checklocksignore func (cn *conn) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &cn.ct) stateSourceObject.Load(1, &cn.original) stateSourceObject.Load(2, &cn.reply) stateSourceObject.Load(3, &cn.finalizeResult) stateSourceObject.Load(4, &cn.sourceManip) stateSourceObject.Load(5, &cn.destinationManip) stateSourceObject.Load(6, &cn.tcb) stateSourceObject.Load(7, &cn.lastUsed) } func (ct *ConnTrack) StateTypeName() string { return "pkg/tcpip/stack.ConnTrack" } func (ct *ConnTrack) StateFields() []string { return []string{ "seed", "clock", "buckets", } } func (ct *ConnTrack) beforeSave() {} // +checklocksignore func (ct *ConnTrack) StateSave(stateSinkObject state.Sink) { ct.beforeSave() stateSinkObject.Save(0, &ct.seed) stateSinkObject.Save(1, &ct.clock) stateSinkObject.Save(2, &ct.buckets) } func (ct *ConnTrack) afterLoad(context.Context) {} // +checklocksignore func (ct *ConnTrack) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ct.seed) 
stateSourceObject.Load(1, &ct.clock) stateSourceObject.Load(2, &ct.buckets) } func (bkt *bucket) StateTypeName() string { return "pkg/tcpip/stack.bucket" } func (bkt *bucket) StateFields() []string { return []string{ "tuples", } } func (bkt *bucket) beforeSave() {} // +checklocksignore func (bkt *bucket) StateSave(stateSinkObject state.Sink) { bkt.beforeSave() stateSinkObject.Save(0, &bkt.tuples) } func (bkt *bucket) afterLoad(context.Context) {} // +checklocksignore func (bkt *bucket) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &bkt.tuples) } func (l *ICMPRateLimiter) StateTypeName() string { return "pkg/tcpip/stack.ICMPRateLimiter" } func (l *ICMPRateLimiter) StateFields() []string { return []string{ "clock", } } func (l *ICMPRateLimiter) beforeSave() {} // +checklocksignore func (l *ICMPRateLimiter) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.clock) } func (l *ICMPRateLimiter) afterLoad(context.Context) {} // +checklocksignore func (l *ICMPRateLimiter) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.clock) } func (a *AcceptTarget) StateTypeName() string { return "pkg/tcpip/stack.AcceptTarget" } func (a *AcceptTarget) StateFields() []string { return []string{ "NetworkProtocol", } } func (a *AcceptTarget) beforeSave() {} // +checklocksignore func (a *AcceptTarget) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.NetworkProtocol) } func (a *AcceptTarget) afterLoad(context.Context) {} // +checklocksignore func (a *AcceptTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.NetworkProtocol) } func (d *DropTarget) StateTypeName() string { return "pkg/tcpip/stack.DropTarget" } func (d *DropTarget) StateFields() []string { return []string{ "NetworkProtocol", } } func (d *DropTarget) beforeSave() {} // +checklocksignore func (d *DropTarget) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.NetworkProtocol) } func (d *DropTarget) afterLoad(context.Context) {} // +checklocksignore func (d *DropTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.NetworkProtocol) } func (rt *RejectIPv4Target) StateTypeName() string { return "pkg/tcpip/stack.RejectIPv4Target" } func (rt *RejectIPv4Target) StateFields() []string { return []string{ "Handler", "RejectWith", } } func (rt *RejectIPv4Target) beforeSave() {} // +checklocksignore func (rt *RejectIPv4Target) StateSave(stateSinkObject state.Sink) { rt.beforeSave() stateSinkObject.Save(0, &rt.Handler) stateSinkObject.Save(1, &rt.RejectWith) } func (rt *RejectIPv4Target) afterLoad(context.Context) {} // +checklocksignore func (rt *RejectIPv4Target) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rt.Handler) stateSourceObject.Load(1, &rt.RejectWith) } func (rt *RejectIPv6Target) StateTypeName() string { return "pkg/tcpip/stack.RejectIPv6Target" } func (rt *RejectIPv6Target) StateFields() []string { return []string{ "Handler", "RejectWith", } } func (rt *RejectIPv6Target) beforeSave() {} // +checklocksignore func (rt *RejectIPv6Target) StateSave(stateSinkObject state.Sink) { rt.beforeSave() stateSinkObject.Save(0, &rt.Handler) stateSinkObject.Save(1, &rt.RejectWith) } func (rt *RejectIPv6Target) afterLoad(context.Context) {} // +checklocksignore func (rt *RejectIPv6Target) StateLoad(ctx context.Context, stateSourceObject 
state.Source) { stateSourceObject.Load(0, &rt.Handler) stateSourceObject.Load(1, &rt.RejectWith) } func (e *ErrorTarget) StateTypeName() string { return "pkg/tcpip/stack.ErrorTarget" } func (e *ErrorTarget) StateFields() []string { return []string{ "NetworkProtocol", } } func (e *ErrorTarget) beforeSave() {} // +checklocksignore func (e *ErrorTarget) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.NetworkProtocol) } func (e *ErrorTarget) afterLoad(context.Context) {} // +checklocksignore func (e *ErrorTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.NetworkProtocol) } func (u *UserChainTarget) StateTypeName() string { return "pkg/tcpip/stack.UserChainTarget" } func (u *UserChainTarget) StateFields() []string { return []string{ "Name", "NetworkProtocol", } } func (u *UserChainTarget) beforeSave() {} // +checklocksignore func (u *UserChainTarget) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.Name) stateSinkObject.Save(1, &u.NetworkProtocol) } func (u *UserChainTarget) afterLoad(context.Context) {} // +checklocksignore func (u *UserChainTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.Name) stateSourceObject.Load(1, &u.NetworkProtocol) } func (r *ReturnTarget) StateTypeName() string { return "pkg/tcpip/stack.ReturnTarget" } func (r *ReturnTarget) StateFields() []string { return []string{ "NetworkProtocol", } } func (r *ReturnTarget) beforeSave() {} // +checklocksignore func (r *ReturnTarget) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.NetworkProtocol) } func (r *ReturnTarget) afterLoad(context.Context) {} // +checklocksignore func (r *ReturnTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.NetworkProtocol) } func (rt *DNATTarget) StateTypeName() string { return "pkg/tcpip/stack.DNATTarget" } func (rt *DNATTarget) StateFields() []string { return []string{ "Addr", "Port", "NetworkProtocol", "ChangeAddress", "ChangePort", } } func (rt *DNATTarget) beforeSave() {} // +checklocksignore func (rt *DNATTarget) StateSave(stateSinkObject state.Sink) { rt.beforeSave() stateSinkObject.Save(0, &rt.Addr) stateSinkObject.Save(1, &rt.Port) stateSinkObject.Save(2, &rt.NetworkProtocol) stateSinkObject.Save(3, &rt.ChangeAddress) stateSinkObject.Save(4, &rt.ChangePort) } func (rt *DNATTarget) afterLoad(context.Context) {} // +checklocksignore func (rt *DNATTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rt.Addr) stateSourceObject.Load(1, &rt.Port) stateSourceObject.Load(2, &rt.NetworkProtocol) stateSourceObject.Load(3, &rt.ChangeAddress) stateSourceObject.Load(4, &rt.ChangePort) } func (rt *RedirectTarget) StateTypeName() string { return "pkg/tcpip/stack.RedirectTarget" } func (rt *RedirectTarget) StateFields() []string { return []string{ "Port", "NetworkProtocol", } } func (rt *RedirectTarget) beforeSave() {} // +checklocksignore func (rt *RedirectTarget) StateSave(stateSinkObject state.Sink) { rt.beforeSave() stateSinkObject.Save(0, &rt.Port) stateSinkObject.Save(1, &rt.NetworkProtocol) } func (rt *RedirectTarget) afterLoad(context.Context) {} // +checklocksignore func (rt *RedirectTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rt.Port) stateSourceObject.Load(1, &rt.NetworkProtocol) } func (st *SNATTarget) StateTypeName() string { return 
"pkg/tcpip/stack.SNATTarget" } func (st *SNATTarget) StateFields() []string { return []string{ "Addr", "Port", "NetworkProtocol", "ChangeAddress", "ChangePort", } } func (st *SNATTarget) beforeSave() {} // +checklocksignore func (st *SNATTarget) StateSave(stateSinkObject state.Sink) { st.beforeSave() stateSinkObject.Save(0, &st.Addr) stateSinkObject.Save(1, &st.Port) stateSinkObject.Save(2, &st.NetworkProtocol) stateSinkObject.Save(3, &st.ChangeAddress) stateSinkObject.Save(4, &st.ChangePort) } func (st *SNATTarget) afterLoad(context.Context) {} // +checklocksignore func (st *SNATTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &st.Addr) stateSourceObject.Load(1, &st.Port) stateSourceObject.Load(2, &st.NetworkProtocol) stateSourceObject.Load(3, &st.ChangeAddress) stateSourceObject.Load(4, &st.ChangePort) } func (mt *MasqueradeTarget) StateTypeName() string { return "pkg/tcpip/stack.MasqueradeTarget" } func (mt *MasqueradeTarget) StateFields() []string { return []string{ "NetworkProtocol", } } func (mt *MasqueradeTarget) beforeSave() {} // +checklocksignore func (mt *MasqueradeTarget) StateSave(stateSinkObject state.Sink) { mt.beforeSave() stateSinkObject.Save(0, &mt.NetworkProtocol) } func (mt *MasqueradeTarget) afterLoad(context.Context) {} // +checklocksignore func (mt *MasqueradeTarget) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mt.NetworkProtocol) } func (it *IPTables) StateTypeName() string { return "pkg/tcpip/stack.IPTables" } func (it *IPTables) StateFields() []string { return []string{ "connections", "reaper", "v4Tables", "v6Tables", "modified", } } // +checklocksignore func (it *IPTables) StateSave(stateSinkObject state.Sink) { it.beforeSave() stateSinkObject.Save(0, &it.connections) stateSinkObject.Save(1, &it.reaper) stateSinkObject.Save(2, &it.v4Tables) stateSinkObject.Save(3, &it.v6Tables) stateSinkObject.Save(4, &it.modified) } // +checklocksignore func (it *IPTables) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &it.connections) stateSourceObject.Load(1, &it.reaper) stateSourceObject.Load(2, &it.v4Tables) stateSourceObject.Load(3, &it.v6Tables) stateSourceObject.Load(4, &it.modified) stateSourceObject.AfterLoad(func() { it.afterLoad(ctx) }) } func (table *Table) StateTypeName() string { return "pkg/tcpip/stack.Table" } func (table *Table) StateFields() []string { return []string{ "Rules", "BuiltinChains", "Underflows", } } func (table *Table) beforeSave() {} // +checklocksignore func (table *Table) StateSave(stateSinkObject state.Sink) { table.beforeSave() stateSinkObject.Save(0, &table.Rules) stateSinkObject.Save(1, &table.BuiltinChains) stateSinkObject.Save(2, &table.Underflows) } func (table *Table) afterLoad(context.Context) {} // +checklocksignore func (table *Table) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &table.Rules) stateSourceObject.Load(1, &table.BuiltinChains) stateSourceObject.Load(2, &table.Underflows) } func (r *Rule) StateTypeName() string { return "pkg/tcpip/stack.Rule" } func (r *Rule) StateFields() []string { return []string{ "Filter", "Matchers", "Target", } } func (r *Rule) beforeSave() {} // +checklocksignore func (r *Rule) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Filter) stateSinkObject.Save(1, &r.Matchers) stateSinkObject.Save(2, &r.Target) } func (r *Rule) afterLoad(context.Context) {} // +checklocksignore func (r 
*Rule) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Filter) stateSourceObject.Load(1, &r.Matchers) stateSourceObject.Load(2, &r.Target) } func (fl *IPHeaderFilter) StateTypeName() string { return "pkg/tcpip/stack.IPHeaderFilter" } func (fl *IPHeaderFilter) StateFields() []string { return []string{ "Protocol", "CheckProtocol", "Dst", "DstMask", "DstInvert", "Src", "SrcMask", "SrcInvert", "InputInterface", "InputInterfaceMask", "InputInterfaceInvert", "OutputInterface", "OutputInterfaceMask", "OutputInterfaceInvert", } } func (fl *IPHeaderFilter) beforeSave() {} // +checklocksignore func (fl *IPHeaderFilter) StateSave(stateSinkObject state.Sink) { fl.beforeSave() stateSinkObject.Save(0, &fl.Protocol) stateSinkObject.Save(1, &fl.CheckProtocol) stateSinkObject.Save(2, &fl.Dst) stateSinkObject.Save(3, &fl.DstMask) stateSinkObject.Save(4, &fl.DstInvert) stateSinkObject.Save(5, &fl.Src) stateSinkObject.Save(6, &fl.SrcMask) stateSinkObject.Save(7, &fl.SrcInvert) stateSinkObject.Save(8, &fl.InputInterface) stateSinkObject.Save(9, &fl.InputInterfaceMask) stateSinkObject.Save(10, &fl.InputInterfaceInvert) stateSinkObject.Save(11, &fl.OutputInterface) stateSinkObject.Save(12, &fl.OutputInterfaceMask) stateSinkObject.Save(13, &fl.OutputInterfaceInvert) } func (fl *IPHeaderFilter) afterLoad(context.Context) {} // +checklocksignore func (fl *IPHeaderFilter) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &fl.Protocol) stateSourceObject.Load(1, &fl.CheckProtocol) stateSourceObject.Load(2, &fl.Dst) stateSourceObject.Load(3, &fl.DstMask) stateSourceObject.Load(4, &fl.DstInvert) stateSourceObject.Load(5, &fl.Src) stateSourceObject.Load(6, &fl.SrcMask) stateSourceObject.Load(7, &fl.SrcInvert) stateSourceObject.Load(8, &fl.InputInterface) stateSourceObject.Load(9, &fl.InputInterfaceMask) stateSourceObject.Load(10, &fl.InputInterfaceInvert) stateSourceObject.Load(11, &fl.OutputInterface) stateSourceObject.Load(12, &fl.OutputInterfaceMask) stateSourceObject.Load(13, &fl.OutputInterfaceInvert) } func (d *dynamicCacheEntry) StateTypeName() string { return "pkg/tcpip/stack.dynamicCacheEntry" } func (d *dynamicCacheEntry) StateFields() []string { return []string{ "lru", "count", } } func (d *dynamicCacheEntry) beforeSave() {} // +checklocksignore func (d *dynamicCacheEntry) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.lru) stateSinkObject.Save(1, &d.count) } func (d *dynamicCacheEntry) afterLoad(context.Context) {} // +checklocksignore func (d *dynamicCacheEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.lru) stateSourceObject.Load(1, &d.count) } func (n *neighborCacheMu) StateTypeName() string { return "pkg/tcpip/stack.neighborCacheMu" } func (n *neighborCacheMu) StateFields() []string { return []string{ "cache", "dynamic", } } func (n *neighborCacheMu) beforeSave() {} // +checklocksignore func (n *neighborCacheMu) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.cache) stateSinkObject.Save(1, &n.dynamic) } func (n *neighborCacheMu) afterLoad(context.Context) {} // +checklocksignore func (n *neighborCacheMu) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.cache) stateSourceObject.Load(1, &n.dynamic) } func (n *neighborCache) StateTypeName() string { return "pkg/tcpip/stack.neighborCache" } func (n *neighborCache) StateFields() []string { return []string{ 
"nic", "state", "linkRes", "mu", } } func (n *neighborCache) beforeSave() {} // +checklocksignore func (n *neighborCache) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.nic) stateSinkObject.Save(1, &n.state) stateSinkObject.Save(2, &n.linkRes) stateSinkObject.Save(3, &n.mu) } func (n *neighborCache) afterLoad(context.Context) {} // +checklocksignore func (n *neighborCache) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.nic) stateSourceObject.Load(1, &n.state) stateSourceObject.Load(2, &n.linkRes) stateSourceObject.Load(3, &n.mu) } func (l *neighborEntryList) StateTypeName() string { return "pkg/tcpip/stack.neighborEntryList" } func (l *neighborEntryList) StateFields() []string { return []string{ "head", "tail", } } func (l *neighborEntryList) beforeSave() {} // +checklocksignore func (l *neighborEntryList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *neighborEntryList) afterLoad(context.Context) {} // +checklocksignore func (l *neighborEntryList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *neighborEntryEntry) StateTypeName() string { return "pkg/tcpip/stack.neighborEntryEntry" } func (e *neighborEntryEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *neighborEntryEntry) beforeSave() {} // +checklocksignore func (e *neighborEntryEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *neighborEntryEntry) afterLoad(context.Context) {} // +checklocksignore func (e *neighborEntryEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (l *linkResolver) StateTypeName() string { return "pkg/tcpip/stack.linkResolver" } func (l *linkResolver) StateFields() []string { return []string{ "resolver", "neigh", } } func (l *linkResolver) beforeSave() {} // +checklocksignore func (l *linkResolver) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.resolver) stateSinkObject.Save(1, &l.neigh) } func (l *linkResolver) afterLoad(context.Context) {} // +checklocksignore func (l *linkResolver) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.resolver) stateSourceObject.Load(1, &l.neigh) } func (n *nic) StateTypeName() string { return "pkg/tcpip/stack.nic" } func (n *nic) StateFields() []string { return []string{ "NetworkLinkEndpoint", "stack", "id", "name", "context", "stats", "networkEndpoints", "linkAddrResolvers", "duplicateAddressDetectors", "enabled", "spoofing", "promiscuous", "linkResQueue", "packetEPs", "qDisc", "deliverLinkPackets", "Primary", } } func (n *nic) beforeSave() {} // +checklocksignore func (n *nic) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.NetworkLinkEndpoint) stateSinkObject.Save(1, &n.stack) stateSinkObject.Save(2, &n.id) stateSinkObject.Save(3, &n.name) stateSinkObject.Save(4, &n.context) stateSinkObject.Save(5, &n.stats) stateSinkObject.Save(6, &n.networkEndpoints) stateSinkObject.Save(7, &n.linkAddrResolvers) stateSinkObject.Save(8, &n.duplicateAddressDetectors) stateSinkObject.Save(9, &n.enabled) stateSinkObject.Save(10, &n.spoofing) stateSinkObject.Save(11, &n.promiscuous) stateSinkObject.Save(12, 
&n.linkResQueue) stateSinkObject.Save(13, &n.packetEPs) stateSinkObject.Save(14, &n.qDisc) stateSinkObject.Save(15, &n.deliverLinkPackets) stateSinkObject.Save(16, &n.Primary) } func (n *nic) afterLoad(context.Context) {} // +checklocksignore func (n *nic) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.NetworkLinkEndpoint) stateSourceObject.Load(1, &n.stack) stateSourceObject.Load(2, &n.id) stateSourceObject.Load(3, &n.name) stateSourceObject.Load(4, &n.context) stateSourceObject.Load(5, &n.stats) stateSourceObject.Load(6, &n.networkEndpoints) stateSourceObject.Load(7, &n.linkAddrResolvers) stateSourceObject.Load(8, &n.duplicateAddressDetectors) stateSourceObject.Load(9, &n.enabled) stateSourceObject.Load(10, &n.spoofing) stateSourceObject.Load(11, &n.promiscuous) stateSourceObject.Load(12, &n.linkResQueue) stateSourceObject.Load(13, &n.packetEPs) stateSourceObject.Load(14, &n.qDisc) stateSourceObject.Load(15, &n.deliverLinkPackets) stateSourceObject.Load(16, &n.Primary) } func (p *packetEndpointList) StateTypeName() string { return "pkg/tcpip/stack.packetEndpointList" } func (p *packetEndpointList) StateFields() []string { return []string{ "mu", "eps", } } func (p *packetEndpointList) beforeSave() {} // +checklocksignore func (p *packetEndpointList) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.mu) stateSinkObject.Save(1, &p.eps) } func (p *packetEndpointList) afterLoad(context.Context) {} // +checklocksignore func (p *packetEndpointList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.mu) stateSourceObject.Load(1, &p.eps) } func (qDisc *delegatingQueueingDiscipline) StateTypeName() string { return "pkg/tcpip/stack.delegatingQueueingDiscipline" } func (qDisc *delegatingQueueingDiscipline) StateFields() []string { return []string{ "LinkWriter", } } func (qDisc *delegatingQueueingDiscipline) beforeSave() {} // +checklocksignore func (qDisc *delegatingQueueingDiscipline) StateSave(stateSinkObject state.Sink) { qDisc.beforeSave() stateSinkObject.Save(0, &qDisc.LinkWriter) } func (qDisc *delegatingQueueingDiscipline) afterLoad(context.Context) {} // +checklocksignore func (qDisc *delegatingQueueingDiscipline) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &qDisc.LinkWriter) } func (s *sharedStats) StateTypeName() string { return "pkg/tcpip/stack.sharedStats" } func (s *sharedStats) StateFields() []string { return []string{ "local", "multiCounterNICStats", } } func (s *sharedStats) beforeSave() {} // +checklocksignore func (s *sharedStats) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.local) stateSinkObject.Save(1, &s.multiCounterNICStats) } func (s *sharedStats) afterLoad(context.Context) {} // +checklocksignore func (s *sharedStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.local) stateSourceObject.Load(1, &s.multiCounterNICStats) } func (m *multiCounterNICPacketStats) StateTypeName() string { return "pkg/tcpip/stack.multiCounterNICPacketStats" } func (m *multiCounterNICPacketStats) StateFields() []string { return []string{ "packets", "bytes", } } func (m *multiCounterNICPacketStats) beforeSave() {} // +checklocksignore func (m *multiCounterNICPacketStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.packets) stateSinkObject.Save(1, &m.bytes) } func (m *multiCounterNICPacketStats) 
afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterNICPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.packets) stateSourceObject.Load(1, &m.bytes) } func (m *multiCounterNICNeighborStats) StateTypeName() string { return "pkg/tcpip/stack.multiCounterNICNeighborStats" } func (m *multiCounterNICNeighborStats) StateFields() []string { return []string{ "unreachableEntryLookups", "droppedConfirmationForNoninitiatedNeighbor", "droppedInvalidLinkAddressConfirmations", } } func (m *multiCounterNICNeighborStats) beforeSave() {} // +checklocksignore func (m *multiCounterNICNeighborStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.unreachableEntryLookups) stateSinkObject.Save(1, &m.droppedConfirmationForNoninitiatedNeighbor) stateSinkObject.Save(2, &m.droppedInvalidLinkAddressConfirmations) } func (m *multiCounterNICNeighborStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterNICNeighborStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.unreachableEntryLookups) stateSourceObject.Load(1, &m.droppedConfirmationForNoninitiatedNeighbor) stateSourceObject.Load(2, &m.droppedInvalidLinkAddressConfirmations) } func (m *multiCounterNICStats) StateTypeName() string { return "pkg/tcpip/stack.multiCounterNICStats" } func (m *multiCounterNICStats) StateFields() []string { return []string{ "unknownL3ProtocolRcvdPacketCounts", "unknownL4ProtocolRcvdPacketCounts", "malformedL4RcvdPackets", "tx", "txPacketsDroppedNoBufferSpace", "rx", "disabledRx", "neighbor", } } func (m *multiCounterNICStats) beforeSave() {} // +checklocksignore func (m *multiCounterNICStats) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.unknownL3ProtocolRcvdPacketCounts) stateSinkObject.Save(1, &m.unknownL4ProtocolRcvdPacketCounts) stateSinkObject.Save(2, &m.malformedL4RcvdPackets) stateSinkObject.Save(3, &m.tx) stateSinkObject.Save(4, &m.txPacketsDroppedNoBufferSpace) stateSinkObject.Save(5, &m.rx) stateSinkObject.Save(6, &m.disabledRx) stateSinkObject.Save(7, &m.neighbor) } func (m *multiCounterNICStats) afterLoad(context.Context) {} // +checklocksignore func (m *multiCounterNICStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.unknownL3ProtocolRcvdPacketCounts) stateSourceObject.Load(1, &m.unknownL4ProtocolRcvdPacketCounts) stateSourceObject.Load(2, &m.malformedL4RcvdPackets) stateSourceObject.Load(3, &m.tx) stateSourceObject.Load(4, &m.txPacketsDroppedNoBufferSpace) stateSourceObject.Load(5, &m.rx) stateSourceObject.Load(6, &m.disabledRx) stateSourceObject.Load(7, &m.neighbor) } func (c *NUDConfigurations) StateTypeName() string { return "pkg/tcpip/stack.NUDConfigurations" } func (c *NUDConfigurations) StateFields() []string { return []string{ "BaseReachableTime", "LearnBaseReachableTime", "MinRandomFactor", "MaxRandomFactor", "RetransmitTimer", "LearnRetransmitTimer", "DelayFirstProbeTime", "MaxMulticastProbes", "MaxUnicastProbes", "MaxAnycastDelayTime", "MaxReachabilityConfirmations", } } func (c *NUDConfigurations) beforeSave() {} // +checklocksignore func (c *NUDConfigurations) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.BaseReachableTime) stateSinkObject.Save(1, &c.LearnBaseReachableTime) stateSinkObject.Save(2, &c.MinRandomFactor) stateSinkObject.Save(3, &c.MaxRandomFactor) stateSinkObject.Save(4, &c.RetransmitTimer) 
stateSinkObject.Save(5, &c.LearnRetransmitTimer) stateSinkObject.Save(6, &c.DelayFirstProbeTime) stateSinkObject.Save(7, &c.MaxMulticastProbes) stateSinkObject.Save(8, &c.MaxUnicastProbes) stateSinkObject.Save(9, &c.MaxAnycastDelayTime) stateSinkObject.Save(10, &c.MaxReachabilityConfirmations) } func (c *NUDConfigurations) afterLoad(context.Context) {} // +checklocksignore func (c *NUDConfigurations) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.BaseReachableTime) stateSourceObject.Load(1, &c.LearnBaseReachableTime) stateSourceObject.Load(2, &c.MinRandomFactor) stateSourceObject.Load(3, &c.MaxRandomFactor) stateSourceObject.Load(4, &c.RetransmitTimer) stateSourceObject.Load(5, &c.LearnRetransmitTimer) stateSourceObject.Load(6, &c.DelayFirstProbeTime) stateSourceObject.Load(7, &c.MaxMulticastProbes) stateSourceObject.Load(8, &c.MaxUnicastProbes) stateSourceObject.Load(9, &c.MaxAnycastDelayTime) stateSourceObject.Load(10, &c.MaxReachabilityConfirmations) } func (n *nudStateMu) StateTypeName() string { return "pkg/tcpip/stack.nudStateMu" } func (n *nudStateMu) StateFields() []string { return []string{ "config", "reachableTime", "expiration", "prevBaseReachableTime", "prevMinRandomFactor", "prevMaxRandomFactor", } } func (n *nudStateMu) beforeSave() {} // +checklocksignore func (n *nudStateMu) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.config) stateSinkObject.Save(1, &n.reachableTime) stateSinkObject.Save(2, &n.expiration) stateSinkObject.Save(3, &n.prevBaseReachableTime) stateSinkObject.Save(4, &n.prevMinRandomFactor) stateSinkObject.Save(5, &n.prevMaxRandomFactor) } func (n *nudStateMu) afterLoad(context.Context) {} // +checklocksignore func (n *nudStateMu) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.config) stateSourceObject.Load(1, &n.reachableTime) stateSourceObject.Load(2, &n.expiration) stateSourceObject.Load(3, &n.prevBaseReachableTime) stateSourceObject.Load(4, &n.prevMinRandomFactor) stateSourceObject.Load(5, &n.prevMaxRandomFactor) } func (s *NUDState) StateTypeName() string { return "pkg/tcpip/stack.NUDState" } func (s *NUDState) StateFields() []string { return []string{ "clock", "mu", } } func (s *NUDState) beforeSave() {} // +checklocksignore func (s *NUDState) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.clock) stateSinkObject.Save(1, &s.mu) } func (s *NUDState) afterLoad(context.Context) {} // +checklocksignore func (s *NUDState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.clock) stateSourceObject.Load(1, &s.mu) } func (pk *PacketBuffer) StateTypeName() string { return "pkg/tcpip/stack.PacketBuffer" } func (pk *PacketBuffer) StateFields() []string { return []string{ "packetBufferRefs", "buf", "reserved", "pushed", "consumed", "headers", "NetworkProtocolNumber", "TransportProtocolNumber", "Hash", "Owner", "EgressRoute", "GSOOptions", "snatDone", "dnatDone", "PktType", "NICID", "RXChecksumValidated", "NetworkPacketInfo", "tuple", } } func (pk *PacketBuffer) beforeSave() {} // +checklocksignore func (pk *PacketBuffer) StateSave(stateSinkObject state.Sink) { pk.beforeSave() stateSinkObject.Save(0, &pk.packetBufferRefs) stateSinkObject.Save(1, &pk.buf) stateSinkObject.Save(2, &pk.reserved) stateSinkObject.Save(3, &pk.pushed) stateSinkObject.Save(4, &pk.consumed) stateSinkObject.Save(5, &pk.headers) stateSinkObject.Save(6, &pk.NetworkProtocolNumber) 
stateSinkObject.Save(7, &pk.TransportProtocolNumber) stateSinkObject.Save(8, &pk.Hash) stateSinkObject.Save(9, &pk.Owner) stateSinkObject.Save(10, &pk.EgressRoute) stateSinkObject.Save(11, &pk.GSOOptions) stateSinkObject.Save(12, &pk.snatDone) stateSinkObject.Save(13, &pk.dnatDone) stateSinkObject.Save(14, &pk.PktType) stateSinkObject.Save(15, &pk.NICID) stateSinkObject.Save(16, &pk.RXChecksumValidated) stateSinkObject.Save(17, &pk.NetworkPacketInfo) stateSinkObject.Save(18, &pk.tuple) } func (pk *PacketBuffer) afterLoad(context.Context) {} // +checklocksignore func (pk *PacketBuffer) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &pk.packetBufferRefs) stateSourceObject.Load(1, &pk.buf) stateSourceObject.Load(2, &pk.reserved) stateSourceObject.Load(3, &pk.pushed) stateSourceObject.Load(4, &pk.consumed) stateSourceObject.Load(5, &pk.headers) stateSourceObject.Load(6, &pk.NetworkProtocolNumber) stateSourceObject.Load(7, &pk.TransportProtocolNumber) stateSourceObject.Load(8, &pk.Hash) stateSourceObject.Load(9, &pk.Owner) stateSourceObject.Load(10, &pk.EgressRoute) stateSourceObject.Load(11, &pk.GSOOptions) stateSourceObject.Load(12, &pk.snatDone) stateSourceObject.Load(13, &pk.dnatDone) stateSourceObject.Load(14, &pk.PktType) stateSourceObject.Load(15, &pk.NICID) stateSourceObject.Load(16, &pk.RXChecksumValidated) stateSourceObject.Load(17, &pk.NetworkPacketInfo) stateSourceObject.Load(18, &pk.tuple) } func (h *headerInfo) StateTypeName() string { return "pkg/tcpip/stack.headerInfo" } func (h *headerInfo) StateFields() []string { return []string{ "offset", "length", } } func (h *headerInfo) beforeSave() {} // +checklocksignore func (h *headerInfo) StateSave(stateSinkObject state.Sink) { h.beforeSave() stateSinkObject.Save(0, &h.offset) stateSinkObject.Save(1, &h.length) } func (h *headerInfo) afterLoad(context.Context) {} // +checklocksignore func (h *headerInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &h.offset) stateSourceObject.Load(1, &h.length) } func (d *PacketData) StateTypeName() string { return "pkg/tcpip/stack.PacketData" } func (d *PacketData) StateFields() []string { return []string{ "pk", } } func (d *PacketData) beforeSave() {} // +checklocksignore func (d *PacketData) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.pk) } func (d *PacketData) afterLoad(context.Context) {} // +checklocksignore func (d *PacketData) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.pk) } func (pl *PacketBufferList) StateTypeName() string { return "pkg/tcpip/stack.PacketBufferList" } func (pl *PacketBufferList) StateFields() []string { return []string{ "pbs", } } func (pl *PacketBufferList) beforeSave() {} // +checklocksignore func (pl *PacketBufferList) StateSave(stateSinkObject state.Sink) { pl.beforeSave() stateSinkObject.Save(0, &pl.pbs) } func (pl *PacketBufferList) afterLoad(context.Context) {} // +checklocksignore func (pl *PacketBufferList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &pl.pbs) } func (r *packetBufferRefs) StateTypeName() string { return "pkg/tcpip/stack.packetBufferRefs" } func (r *packetBufferRefs) StateFields() []string { return []string{ "refCount", } } func (r *packetBufferRefs) beforeSave() {} // +checklocksignore func (r *packetBufferRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore 
func (r *packetBufferRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func (p *pendingPacket) StateTypeName() string { return "pkg/tcpip/stack.pendingPacket" } func (p *pendingPacket) StateFields() []string { return []string{ "routeInfo", "pkt", } } func (p *pendingPacket) beforeSave() {} // +checklocksignore func (p *pendingPacket) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.routeInfo) stateSinkObject.Save(1, &p.pkt) } func (p *pendingPacket) afterLoad(context.Context) {} // +checklocksignore func (p *pendingPacket) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.routeInfo) stateSourceObject.Load(1, &p.pkt) } func (p *packetsPendingLinkResolutionMu) StateTypeName() string { return "pkg/tcpip/stack.packetsPendingLinkResolutionMu" } func (p *packetsPendingLinkResolutionMu) StateFields() []string { return []string{ "packets", "cancelChans", } } func (p *packetsPendingLinkResolutionMu) beforeSave() {} // +checklocksignore func (p *packetsPendingLinkResolutionMu) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.packets) stateSinkObject.Save(1, &p.cancelChans) } func (p *packetsPendingLinkResolutionMu) afterLoad(context.Context) {} // +checklocksignore func (p *packetsPendingLinkResolutionMu) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.packets) stateSourceObject.Load(1, &p.cancelChans) } func (f *packetsPendingLinkResolution) StateTypeName() string { return "pkg/tcpip/stack.packetsPendingLinkResolution" } func (f *packetsPendingLinkResolution) StateFields() []string { return []string{ "nic", "mu", } } func (f *packetsPendingLinkResolution) beforeSave() {} // +checklocksignore func (f *packetsPendingLinkResolution) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.nic) stateSinkObject.Save(1, &f.mu) } func (f *packetsPendingLinkResolution) afterLoad(context.Context) {} // +checklocksignore func (f *packetsPendingLinkResolution) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.nic) stateSourceObject.Load(1, &f.mu) } func (t *TransportEndpointID) StateTypeName() string { return "pkg/tcpip/stack.TransportEndpointID" } func (t *TransportEndpointID) StateFields() []string { return []string{ "LocalPort", "LocalAddress", "RemotePort", "RemoteAddress", } } func (t *TransportEndpointID) beforeSave() {} // +checklocksignore func (t *TransportEndpointID) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.LocalPort) stateSinkObject.Save(1, &t.LocalAddress) stateSinkObject.Save(2, &t.RemotePort) stateSinkObject.Save(3, &t.RemoteAddress) } func (t *TransportEndpointID) afterLoad(context.Context) {} // +checklocksignore func (t *TransportEndpointID) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.LocalPort) stateSourceObject.Load(1, &t.LocalAddress) stateSourceObject.Load(2, &t.RemotePort) stateSourceObject.Load(3, &t.RemoteAddress) } func (n *NetworkPacketInfo) StateTypeName() string { return "pkg/tcpip/stack.NetworkPacketInfo" } func (n *NetworkPacketInfo) StateFields() []string { return []string{ "LocalAddressBroadcast", "IsForwardedPacket", } } func (n *NetworkPacketInfo) beforeSave() {} // +checklocksignore func (n *NetworkPacketInfo) StateSave(stateSinkObject state.Sink) 
{ n.beforeSave() stateSinkObject.Save(0, &n.LocalAddressBroadcast) stateSinkObject.Save(1, &n.IsForwardedPacket) } func (n *NetworkPacketInfo) afterLoad(context.Context) {} // +checklocksignore func (n *NetworkPacketInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.LocalAddressBroadcast) stateSourceObject.Load(1, &n.IsForwardedPacket) } func (lifetimes *AddressLifetimes) StateTypeName() string { return "pkg/tcpip/stack.AddressLifetimes" } func (lifetimes *AddressLifetimes) StateFields() []string { return []string{ "Deprecated", "PreferredUntil", "ValidUntil", } } func (lifetimes *AddressLifetimes) beforeSave() {} // +checklocksignore func (lifetimes *AddressLifetimes) StateSave(stateSinkObject state.Sink) { lifetimes.beforeSave() stateSinkObject.Save(0, &lifetimes.Deprecated) stateSinkObject.Save(1, &lifetimes.PreferredUntil) stateSinkObject.Save(2, &lifetimes.ValidUntil) } func (lifetimes *AddressLifetimes) afterLoad(context.Context) {} // +checklocksignore func (lifetimes *AddressLifetimes) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &lifetimes.Deprecated) stateSourceObject.Load(1, &lifetimes.PreferredUntil) stateSourceObject.Load(2, &lifetimes.ValidUntil) } func (u *UnicastSourceAndMulticastDestination) StateTypeName() string { return "pkg/tcpip/stack.UnicastSourceAndMulticastDestination" } func (u *UnicastSourceAndMulticastDestination) StateFields() []string { return []string{ "Source", "Destination", } } func (u *UnicastSourceAndMulticastDestination) beforeSave() {} // +checklocksignore func (u *UnicastSourceAndMulticastDestination) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.Source) stateSinkObject.Save(1, &u.Destination) } func (u *UnicastSourceAndMulticastDestination) afterLoad(context.Context) {} // +checklocksignore func (u *UnicastSourceAndMulticastDestination) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.Source) stateSourceObject.Load(1, &u.Destination) } func (c *DADConfigurations) StateTypeName() string { return "pkg/tcpip/stack.DADConfigurations" } func (c *DADConfigurations) StateFields() []string { return []string{ "DupAddrDetectTransmits", "RetransmitTimer", } } func (c *DADConfigurations) beforeSave() {} // +checklocksignore func (c *DADConfigurations) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.DupAddrDetectTransmits) stateSinkObject.Save(1, &c.RetransmitTimer) } func (c *DADConfigurations) afterLoad(context.Context) {} // +checklocksignore func (c *DADConfigurations) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.DupAddrDetectTransmits) stateSourceObject.Load(1, &c.RetransmitTimer) } func (g *GSOType) StateTypeName() string { return "pkg/tcpip/stack.GSOType" } func (g *GSOType) StateFields() []string { return nil } func (g *GSO) StateTypeName() string { return "pkg/tcpip/stack.GSO" } func (g *GSO) StateFields() []string { return []string{ "Type", "NeedsCsum", "CsumOffset", "MSS", "L3HdrLen", "MaxSize", } } func (g *GSO) beforeSave() {} // +checklocksignore func (g *GSO) StateSave(stateSinkObject state.Sink) { g.beforeSave() stateSinkObject.Save(0, &g.Type) stateSinkObject.Save(1, &g.NeedsCsum) stateSinkObject.Save(2, &g.CsumOffset) stateSinkObject.Save(3, &g.MSS) stateSinkObject.Save(4, &g.L3HdrLen) stateSinkObject.Save(5, &g.MaxSize) } func (g *GSO) afterLoad(context.Context) {} // +checklocksignore func 
(g *GSO) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &g.Type) stateSourceObject.Load(1, &g.NeedsCsum) stateSourceObject.Load(2, &g.CsumOffset) stateSourceObject.Load(3, &g.MSS) stateSourceObject.Load(4, &g.L3HdrLen) stateSourceObject.Load(5, &g.MaxSize) } func (r *routeInfo) StateTypeName() string { return "pkg/tcpip/stack.routeInfo" } func (r *routeInfo) StateFields() []string { return []string{ "RemoteAddress", "LocalAddress", "LocalLinkAddress", "NextHop", "NetProto", "Loop", } } func (r *routeInfo) beforeSave() {} // +checklocksignore func (r *routeInfo) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.RemoteAddress) stateSinkObject.Save(1, &r.LocalAddress) stateSinkObject.Save(2, &r.LocalLinkAddress) stateSinkObject.Save(3, &r.NextHop) stateSinkObject.Save(4, &r.NetProto) stateSinkObject.Save(5, &r.Loop) } func (r *routeInfo) afterLoad(context.Context) {} // +checklocksignore func (r *routeInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.RemoteAddress) stateSourceObject.Load(1, &r.LocalAddress) stateSourceObject.Load(2, &r.LocalLinkAddress) stateSourceObject.Load(3, &r.NextHop) stateSourceObject.Load(4, &r.NetProto) stateSourceObject.Load(5, &r.Loop) } func (r *RouteInfo) StateTypeName() string { return "pkg/tcpip/stack.RouteInfo" } func (r *RouteInfo) StateFields() []string { return []string{ "routeInfo", "RemoteLinkAddress", } } func (r *RouteInfo) beforeSave() {} // +checklocksignore func (r *RouteInfo) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.routeInfo) stateSinkObject.Save(1, &r.RemoteLinkAddress) } func (r *RouteInfo) afterLoad(context.Context) {} // +checklocksignore func (r *RouteInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.routeInfo) stateSourceObject.Load(1, &r.RemoteLinkAddress) } func (t *transportProtocolState) StateTypeName() string { return "pkg/tcpip/stack.transportProtocolState" } func (t *transportProtocolState) StateFields() []string { return []string{ "proto", } } func (t *transportProtocolState) beforeSave() {} // +checklocksignore func (t *transportProtocolState) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.proto) } func (t *transportProtocolState) afterLoad(context.Context) {} // +checklocksignore func (t *transportProtocolState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.proto) } func (s *Stack) StateTypeName() string { return "pkg/tcpip/stack.Stack" } func (s *Stack) StateFields() []string { return []string{ "transportProtocols", "networkProtocols", "rawFactory", "packetEndpointWriteSupported", "demux", "stats", "routeTable", "nics", "defaultForwardingEnabled", "nicIDGen", "cleanupEndpoints", "PortManager", "clock", "handleLocal", "restoredEndpoints", "resumableEndpoints", "icmpRateLimiter", "seed", "nudConfigs", "nudDisp", "sendBufferSize", "receiveBufferSize", "tcpInvalidRateLimit", "tsOffsetSecret", } } func (s *Stack) beforeSave() {} // +checklocksignore func (s *Stack) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.transportProtocols) stateSinkObject.Save(1, &s.networkProtocols) stateSinkObject.Save(2, &s.rawFactory) stateSinkObject.Save(3, &s.packetEndpointWriteSupported) stateSinkObject.Save(4, &s.demux) stateSinkObject.Save(5, &s.stats) stateSinkObject.Save(6, &s.routeTable) stateSinkObject.Save(7, &s.nics) 
stateSinkObject.Save(8, &s.defaultForwardingEnabled) stateSinkObject.Save(9, &s.nicIDGen) stateSinkObject.Save(10, &s.cleanupEndpoints) stateSinkObject.Save(11, &s.PortManager) stateSinkObject.Save(12, &s.clock) stateSinkObject.Save(13, &s.handleLocal) stateSinkObject.Save(14, &s.restoredEndpoints) stateSinkObject.Save(15, &s.resumableEndpoints) stateSinkObject.Save(16, &s.icmpRateLimiter) stateSinkObject.Save(17, &s.seed) stateSinkObject.Save(18, &s.nudConfigs) stateSinkObject.Save(19, &s.nudDisp) stateSinkObject.Save(20, &s.sendBufferSize) stateSinkObject.Save(21, &s.receiveBufferSize) stateSinkObject.Save(22, &s.tcpInvalidRateLimit) stateSinkObject.Save(23, &s.tsOffsetSecret) } func (s *Stack) afterLoad(context.Context) {} // +checklocksignore func (s *Stack) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.transportProtocols) stateSourceObject.Load(1, &s.networkProtocols) stateSourceObject.Load(2, &s.rawFactory) stateSourceObject.Load(3, &s.packetEndpointWriteSupported) stateSourceObject.Load(4, &s.demux) stateSourceObject.Load(5, &s.stats) stateSourceObject.Load(6, &s.routeTable) stateSourceObject.Load(7, &s.nics) stateSourceObject.Load(8, &s.defaultForwardingEnabled) stateSourceObject.Load(9, &s.nicIDGen) stateSourceObject.Load(10, &s.cleanupEndpoints) stateSourceObject.Load(11, &s.PortManager) stateSourceObject.Load(12, &s.clock) stateSourceObject.Load(13, &s.handleLocal) stateSourceObject.Load(14, &s.restoredEndpoints) stateSourceObject.Load(15, &s.resumableEndpoints) stateSourceObject.Load(16, &s.icmpRateLimiter) stateSourceObject.Load(17, &s.seed) stateSourceObject.Load(18, &s.nudConfigs) stateSourceObject.Load(19, &s.nudDisp) stateSourceObject.Load(20, &s.sendBufferSize) stateSourceObject.Load(21, &s.receiveBufferSize) stateSourceObject.Load(22, &s.tcpInvalidRateLimit) stateSourceObject.Load(23, &s.tsOffsetSecret) } func (t *TransportEndpointInfo) StateTypeName() string { return "pkg/tcpip/stack.TransportEndpointInfo" } func (t *TransportEndpointInfo) StateFields() []string { return []string{ "NetProto", "TransProto", "ID", "BindNICID", "BindAddr", "RegisterNICID", } } func (t *TransportEndpointInfo) beforeSave() {} // +checklocksignore func (t *TransportEndpointInfo) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.NetProto) stateSinkObject.Save(1, &t.TransProto) stateSinkObject.Save(2, &t.ID) stateSinkObject.Save(3, &t.BindNICID) stateSinkObject.Save(4, &t.BindAddr) stateSinkObject.Save(5, &t.RegisterNICID) } func (t *TransportEndpointInfo) afterLoad(context.Context) {} // +checklocksignore func (t *TransportEndpointInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.NetProto) stateSourceObject.Load(1, &t.TransProto) stateSourceObject.Load(2, &t.ID) stateSourceObject.Load(3, &t.BindNICID) stateSourceObject.Load(4, &t.BindAddr) stateSourceObject.Load(5, &t.RegisterNICID) } func (t *TCPCubicState) StateTypeName() string { return "pkg/tcpip/stack.TCPCubicState" } func (t *TCPCubicState) StateFields() []string { return []string{ "WLastMax", "WMax", "T", "TimeSinceLastCongestion", "C", "K", "Beta", "WC", "WEst", "EndSeq", "CurrRTT", "LastRTT", "SampleCount", "LastAck", "RoundStart", } } func (t *TCPCubicState) beforeSave() {} // +checklocksignore func (t *TCPCubicState) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.WLastMax) stateSinkObject.Save(1, &t.WMax) stateSinkObject.Save(2, &t.T) stateSinkObject.Save(3, 
&t.TimeSinceLastCongestion) stateSinkObject.Save(4, &t.C) stateSinkObject.Save(5, &t.K) stateSinkObject.Save(6, &t.Beta) stateSinkObject.Save(7, &t.WC) stateSinkObject.Save(8, &t.WEst) stateSinkObject.Save(9, &t.EndSeq) stateSinkObject.Save(10, &t.CurrRTT) stateSinkObject.Save(11, &t.LastRTT) stateSinkObject.Save(12, &t.SampleCount) stateSinkObject.Save(13, &t.LastAck) stateSinkObject.Save(14, &t.RoundStart) } func (t *TCPCubicState) afterLoad(context.Context) {} // +checklocksignore func (t *TCPCubicState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.WLastMax) stateSourceObject.Load(1, &t.WMax) stateSourceObject.Load(2, &t.T) stateSourceObject.Load(3, &t.TimeSinceLastCongestion) stateSourceObject.Load(4, &t.C) stateSourceObject.Load(5, &t.K) stateSourceObject.Load(6, &t.Beta) stateSourceObject.Load(7, &t.WC) stateSourceObject.Load(8, &t.WEst) stateSourceObject.Load(9, &t.EndSeq) stateSourceObject.Load(10, &t.CurrRTT) stateSourceObject.Load(11, &t.LastRTT) stateSourceObject.Load(12, &t.SampleCount) stateSourceObject.Load(13, &t.LastAck) stateSourceObject.Load(14, &t.RoundStart) } func (t *TCPRACKState) StateTypeName() string { return "pkg/tcpip/stack.TCPRACKState" } func (t *TCPRACKState) StateFields() []string { return []string{ "XmitTime", "EndSequence", "FACK", "RTT", "Reord", "DSACKSeen", "ReoWnd", "ReoWndIncr", "ReoWndPersist", "RTTSeq", } } func (t *TCPRACKState) beforeSave() {} // +checklocksignore func (t *TCPRACKState) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.XmitTime) stateSinkObject.Save(1, &t.EndSequence) stateSinkObject.Save(2, &t.FACK) stateSinkObject.Save(3, &t.RTT) stateSinkObject.Save(4, &t.Reord) stateSinkObject.Save(5, &t.DSACKSeen) stateSinkObject.Save(6, &t.ReoWnd) stateSinkObject.Save(7, &t.ReoWndIncr) stateSinkObject.Save(8, &t.ReoWndPersist) stateSinkObject.Save(9, &t.RTTSeq) } func (t *TCPRACKState) afterLoad(context.Context) {} // +checklocksignore func (t *TCPRACKState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.XmitTime) stateSourceObject.Load(1, &t.EndSequence) stateSourceObject.Load(2, &t.FACK) stateSourceObject.Load(3, &t.RTT) stateSourceObject.Load(4, &t.Reord) stateSourceObject.Load(5, &t.DSACKSeen) stateSourceObject.Load(6, &t.ReoWnd) stateSourceObject.Load(7, &t.ReoWndIncr) stateSourceObject.Load(8, &t.ReoWndPersist) stateSourceObject.Load(9, &t.RTTSeq) } func (t *TCPEndpointID) StateTypeName() string { return "pkg/tcpip/stack.TCPEndpointID" } func (t *TCPEndpointID) StateFields() []string { return []string{ "LocalPort", "LocalAddress", "RemotePort", "RemoteAddress", } } func (t *TCPEndpointID) beforeSave() {} // +checklocksignore func (t *TCPEndpointID) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.LocalPort) stateSinkObject.Save(1, &t.LocalAddress) stateSinkObject.Save(2, &t.RemotePort) stateSinkObject.Save(3, &t.RemoteAddress) } func (t *TCPEndpointID) afterLoad(context.Context) {} // +checklocksignore func (t *TCPEndpointID) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.LocalPort) stateSourceObject.Load(1, &t.LocalAddress) stateSourceObject.Load(2, &t.RemotePort) stateSourceObject.Load(3, &t.RemoteAddress) } func (t *TCPFastRecoveryState) StateTypeName() string { return "pkg/tcpip/stack.TCPFastRecoveryState" } func (t *TCPFastRecoveryState) StateFields() []string { return []string{ "Active", "First", "Last", "MaxCwnd", "HighRxt", 
"RescueRxt", } } func (t *TCPFastRecoveryState) beforeSave() {} // +checklocksignore func (t *TCPFastRecoveryState) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.Active) stateSinkObject.Save(1, &t.First) stateSinkObject.Save(2, &t.Last) stateSinkObject.Save(3, &t.MaxCwnd) stateSinkObject.Save(4, &t.HighRxt) stateSinkObject.Save(5, &t.RescueRxt) } func (t *TCPFastRecoveryState) afterLoad(context.Context) {} // +checklocksignore func (t *TCPFastRecoveryState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.Active) stateSourceObject.Load(1, &t.First) stateSourceObject.Load(2, &t.Last) stateSourceObject.Load(3, &t.MaxCwnd) stateSourceObject.Load(4, &t.HighRxt) stateSourceObject.Load(5, &t.RescueRxt) } func (t *TCPReceiverState) StateTypeName() string { return "pkg/tcpip/stack.TCPReceiverState" } func (t *TCPReceiverState) StateFields() []string { return []string{ "RcvNxt", "RcvAcc", "RcvWndScale", "PendingBufUsed", } } func (t *TCPReceiverState) beforeSave() {} // +checklocksignore func (t *TCPReceiverState) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.RcvNxt) stateSinkObject.Save(1, &t.RcvAcc) stateSinkObject.Save(2, &t.RcvWndScale) stateSinkObject.Save(3, &t.PendingBufUsed) } func (t *TCPReceiverState) afterLoad(context.Context) {} // +checklocksignore func (t *TCPReceiverState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.RcvNxt) stateSourceObject.Load(1, &t.RcvAcc) stateSourceObject.Load(2, &t.RcvWndScale) stateSourceObject.Load(3, &t.PendingBufUsed) } func (t *TCPRTTState) StateTypeName() string { return "pkg/tcpip/stack.TCPRTTState" } func (t *TCPRTTState) StateFields() []string { return []string{ "SRTT", "RTTVar", "SRTTInited", } } func (t *TCPRTTState) beforeSave() {} // +checklocksignore func (t *TCPRTTState) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.SRTT) stateSinkObject.Save(1, &t.RTTVar) stateSinkObject.Save(2, &t.SRTTInited) } func (t *TCPRTTState) afterLoad(context.Context) {} // +checklocksignore func (t *TCPRTTState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.SRTT) stateSourceObject.Load(1, &t.RTTVar) stateSourceObject.Load(2, &t.SRTTInited) } func (t *TCPSenderState) StateTypeName() string { return "pkg/tcpip/stack.TCPSenderState" } func (t *TCPSenderState) StateFields() []string { return []string{ "LastSendTime", "DupAckCount", "SndCwnd", "Ssthresh", "SndCAAckCount", "Outstanding", "SackedOut", "SndWnd", "SndUna", "SndNxt", "RTTMeasureSeqNum", "RTTMeasureTime", "Closed", "RTO", "RTTState", "MaxPayloadSize", "SndWndScale", "MaxSentAck", "FastRecovery", "Cubic", "RACKState", "RetransmitTS", "SpuriousRecovery", } } func (t *TCPSenderState) beforeSave() {} // +checklocksignore func (t *TCPSenderState) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.LastSendTime) stateSinkObject.Save(1, &t.DupAckCount) stateSinkObject.Save(2, &t.SndCwnd) stateSinkObject.Save(3, &t.Ssthresh) stateSinkObject.Save(4, &t.SndCAAckCount) stateSinkObject.Save(5, &t.Outstanding) stateSinkObject.Save(6, &t.SackedOut) stateSinkObject.Save(7, &t.SndWnd) stateSinkObject.Save(8, &t.SndUna) stateSinkObject.Save(9, &t.SndNxt) stateSinkObject.Save(10, &t.RTTMeasureSeqNum) stateSinkObject.Save(11, &t.RTTMeasureTime) stateSinkObject.Save(12, &t.Closed) stateSinkObject.Save(13, &t.RTO) stateSinkObject.Save(14, &t.RTTState) 
stateSinkObject.Save(15, &t.MaxPayloadSize) stateSinkObject.Save(16, &t.SndWndScale) stateSinkObject.Save(17, &t.MaxSentAck) stateSinkObject.Save(18, &t.FastRecovery) stateSinkObject.Save(19, &t.Cubic) stateSinkObject.Save(20, &t.RACKState) stateSinkObject.Save(21, &t.RetransmitTS) stateSinkObject.Save(22, &t.SpuriousRecovery) } func (t *TCPSenderState) afterLoad(context.Context) {} // +checklocksignore func (t *TCPSenderState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.LastSendTime) stateSourceObject.Load(1, &t.DupAckCount) stateSourceObject.Load(2, &t.SndCwnd) stateSourceObject.Load(3, &t.Ssthresh) stateSourceObject.Load(4, &t.SndCAAckCount) stateSourceObject.Load(5, &t.Outstanding) stateSourceObject.Load(6, &t.SackedOut) stateSourceObject.Load(7, &t.SndWnd) stateSourceObject.Load(8, &t.SndUna) stateSourceObject.Load(9, &t.SndNxt) stateSourceObject.Load(10, &t.RTTMeasureSeqNum) stateSourceObject.Load(11, &t.RTTMeasureTime) stateSourceObject.Load(12, &t.Closed) stateSourceObject.Load(13, &t.RTO) stateSourceObject.Load(14, &t.RTTState) stateSourceObject.Load(15, &t.MaxPayloadSize) stateSourceObject.Load(16, &t.SndWndScale) stateSourceObject.Load(17, &t.MaxSentAck) stateSourceObject.Load(18, &t.FastRecovery) stateSourceObject.Load(19, &t.Cubic) stateSourceObject.Load(20, &t.RACKState) stateSourceObject.Load(21, &t.RetransmitTS) stateSourceObject.Load(22, &t.SpuriousRecovery) } func (t *TCPSACKInfo) StateTypeName() string { return "pkg/tcpip/stack.TCPSACKInfo" } func (t *TCPSACKInfo) StateFields() []string { return []string{ "Blocks", "ReceivedBlocks", "MaxSACKED", } } func (t *TCPSACKInfo) beforeSave() {} // +checklocksignore func (t *TCPSACKInfo) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.Blocks) stateSinkObject.Save(1, &t.ReceivedBlocks) stateSinkObject.Save(2, &t.MaxSACKED) } func (t *TCPSACKInfo) afterLoad(context.Context) {} // +checklocksignore func (t *TCPSACKInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.Blocks) stateSourceObject.Load(1, &t.ReceivedBlocks) stateSourceObject.Load(2, &t.MaxSACKED) } func (r *RcvBufAutoTuneParams) StateTypeName() string { return "pkg/tcpip/stack.RcvBufAutoTuneParams" } func (r *RcvBufAutoTuneParams) StateFields() []string { return []string{ "MeasureTime", "CopiedBytes", "PrevCopiedBytes", "RcvBufSize", "RTT", "RTTVar", "RTTMeasureSeqNumber", "RTTMeasureTime", "Disabled", } } func (r *RcvBufAutoTuneParams) beforeSave() {} // +checklocksignore func (r *RcvBufAutoTuneParams) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.MeasureTime) stateSinkObject.Save(1, &r.CopiedBytes) stateSinkObject.Save(2, &r.PrevCopiedBytes) stateSinkObject.Save(3, &r.RcvBufSize) stateSinkObject.Save(4, &r.RTT) stateSinkObject.Save(5, &r.RTTVar) stateSinkObject.Save(6, &r.RTTMeasureSeqNumber) stateSinkObject.Save(7, &r.RTTMeasureTime) stateSinkObject.Save(8, &r.Disabled) } func (r *RcvBufAutoTuneParams) afterLoad(context.Context) {} // +checklocksignore func (r *RcvBufAutoTuneParams) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.MeasureTime) stateSourceObject.Load(1, &r.CopiedBytes) stateSourceObject.Load(2, &r.PrevCopiedBytes) stateSourceObject.Load(3, &r.RcvBufSize) stateSourceObject.Load(4, &r.RTT) stateSourceObject.Load(5, &r.RTTVar) stateSourceObject.Load(6, &r.RTTMeasureSeqNumber) stateSourceObject.Load(7, &r.RTTMeasureTime) stateSourceObject.Load(8, 
&r.Disabled) } func (t *TCPRcvBufState) StateTypeName() string { return "pkg/tcpip/stack.TCPRcvBufState" } func (t *TCPRcvBufState) StateFields() []string { return []string{ "RcvBufUsed", "RcvAutoParams", "RcvClosed", } } func (t *TCPRcvBufState) beforeSave() {} // +checklocksignore func (t *TCPRcvBufState) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.RcvBufUsed) stateSinkObject.Save(1, &t.RcvAutoParams) stateSinkObject.Save(2, &t.RcvClosed) } func (t *TCPRcvBufState) afterLoad(context.Context) {} // +checklocksignore func (t *TCPRcvBufState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.RcvBufUsed) stateSourceObject.Load(1, &t.RcvAutoParams) stateSourceObject.Load(2, &t.RcvClosed) } func (t *TCPSndBufState) StateTypeName() string { return "pkg/tcpip/stack.TCPSndBufState" } func (t *TCPSndBufState) StateFields() []string { return []string{ "SndBufSize", "SndBufUsed", "SndClosed", "PacketTooBigCount", "SndMTU", "AutoTuneSndBufDisabled", } } func (t *TCPSndBufState) beforeSave() {} // +checklocksignore func (t *TCPSndBufState) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.SndBufSize) stateSinkObject.Save(1, &t.SndBufUsed) stateSinkObject.Save(2, &t.SndClosed) stateSinkObject.Save(3, &t.PacketTooBigCount) stateSinkObject.Save(4, &t.SndMTU) stateSinkObject.Save(5, &t.AutoTuneSndBufDisabled) } func (t *TCPSndBufState) afterLoad(context.Context) {} // +checklocksignore func (t *TCPSndBufState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.SndBufSize) stateSourceObject.Load(1, &t.SndBufUsed) stateSourceObject.Load(2, &t.SndClosed) stateSourceObject.Load(3, &t.PacketTooBigCount) stateSourceObject.Load(4, &t.SndMTU) stateSourceObject.Load(5, &t.AutoTuneSndBufDisabled) } func (t *TCPEndpointStateInner) StateTypeName() string { return "pkg/tcpip/stack.TCPEndpointStateInner" } func (t *TCPEndpointStateInner) StateFields() []string { return []string{ "TSOffset", "SACKPermitted", "SendTSOk", "RecentTS", } } func (t *TCPEndpointStateInner) beforeSave() {} // +checklocksignore func (t *TCPEndpointStateInner) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.TSOffset) stateSinkObject.Save(1, &t.SACKPermitted) stateSinkObject.Save(2, &t.SendTSOk) stateSinkObject.Save(3, &t.RecentTS) } func (t *TCPEndpointStateInner) afterLoad(context.Context) {} // +checklocksignore func (t *TCPEndpointStateInner) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.TSOffset) stateSourceObject.Load(1, &t.SACKPermitted) stateSourceObject.Load(2, &t.SendTSOk) stateSourceObject.Load(3, &t.RecentTS) } func (t *TCPEndpointState) StateTypeName() string { return "pkg/tcpip/stack.TCPEndpointState" } func (t *TCPEndpointState) StateFields() []string { return []string{ "TCPEndpointStateInner", "ID", "SegTime", "RcvBufState", "SndBufState", "SACK", "Receiver", "Sender", } } func (t *TCPEndpointState) beforeSave() {} // +checklocksignore func (t *TCPEndpointState) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.TCPEndpointStateInner) stateSinkObject.Save(1, &t.ID) stateSinkObject.Save(2, &t.SegTime) stateSinkObject.Save(3, &t.RcvBufState) stateSinkObject.Save(4, &t.SndBufState) stateSinkObject.Save(5, &t.SACK) stateSinkObject.Save(6, &t.Receiver) stateSinkObject.Save(7, &t.Sender) } func (t *TCPEndpointState) afterLoad(context.Context) {} // +checklocksignore func 
(t *TCPEndpointState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.TCPEndpointStateInner) stateSourceObject.Load(1, &t.ID) stateSourceObject.Load(2, &t.SegTime) stateSourceObject.Load(3, &t.RcvBufState) stateSourceObject.Load(4, &t.SndBufState) stateSourceObject.Load(5, &t.SACK) stateSourceObject.Load(6, &t.Receiver) stateSourceObject.Load(7, &t.Sender) } func (p *protocolIDs) StateTypeName() string { return "pkg/tcpip/stack.protocolIDs" } func (p *protocolIDs) StateFields() []string { return []string{ "network", "transport", } } func (p *protocolIDs) beforeSave() {} // +checklocksignore func (p *protocolIDs) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.network) stateSinkObject.Save(1, &p.transport) } func (p *protocolIDs) afterLoad(context.Context) {} // +checklocksignore func (p *protocolIDs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.network) stateSourceObject.Load(1, &p.transport) } func (eps *transportEndpoints) StateTypeName() string { return "pkg/tcpip/stack.transportEndpoints" } func (eps *transportEndpoints) StateFields() []string { return []string{ "endpoints", "rawEndpoints", } } func (eps *transportEndpoints) beforeSave() {} // +checklocksignore func (eps *transportEndpoints) StateSave(stateSinkObject state.Sink) { eps.beforeSave() stateSinkObject.Save(0, &eps.endpoints) stateSinkObject.Save(1, &eps.rawEndpoints) } func (eps *transportEndpoints) afterLoad(context.Context) {} // +checklocksignore func (eps *transportEndpoints) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &eps.endpoints) stateSourceObject.Load(1, &eps.rawEndpoints) } func (epsByNIC *endpointsByNIC) StateTypeName() string { return "pkg/tcpip/stack.endpointsByNIC" } func (epsByNIC *endpointsByNIC) StateFields() []string { return []string{ "seed", "endpoints", } } func (epsByNIC *endpointsByNIC) beforeSave() {} // +checklocksignore func (epsByNIC *endpointsByNIC) StateSave(stateSinkObject state.Sink) { epsByNIC.beforeSave() stateSinkObject.Save(0, &epsByNIC.seed) stateSinkObject.Save(1, &epsByNIC.endpoints) } func (epsByNIC *endpointsByNIC) afterLoad(context.Context) {} // +checklocksignore func (epsByNIC *endpointsByNIC) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &epsByNIC.seed) stateSourceObject.Load(1, &epsByNIC.endpoints) } func (d *transportDemuxer) StateTypeName() string { return "pkg/tcpip/stack.transportDemuxer" } func (d *transportDemuxer) StateFields() []string { return []string{ "stack", "protocol", "queuedProtocols", } } func (d *transportDemuxer) beforeSave() {} // +checklocksignore func (d *transportDemuxer) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.stack) stateSinkObject.Save(1, &d.protocol) stateSinkObject.Save(2, &d.queuedProtocols) } func (d *transportDemuxer) afterLoad(context.Context) {} // +checklocksignore func (d *transportDemuxer) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.stack) stateSourceObject.Load(1, &d.protocol) stateSourceObject.Load(2, &d.queuedProtocols) } func (ep *multiPortEndpoint) StateTypeName() string { return "pkg/tcpip/stack.multiPortEndpoint" } func (ep *multiPortEndpoint) StateFields() []string { return []string{ "demux", "netProto", "transProto", "flags", "endpoints", } } func (ep *multiPortEndpoint) beforeSave() {} // +checklocksignore func (ep 
*multiPortEndpoint) StateSave(stateSinkObject state.Sink) { ep.beforeSave() stateSinkObject.Save(0, &ep.demux) stateSinkObject.Save(1, &ep.netProto) stateSinkObject.Save(2, &ep.transProto) stateSinkObject.Save(3, &ep.flags) stateSinkObject.Save(4, &ep.endpoints) } func (ep *multiPortEndpoint) afterLoad(context.Context) {} // +checklocksignore func (ep *multiPortEndpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ep.demux) stateSourceObject.Load(1, &ep.netProto) stateSourceObject.Load(2, &ep.transProto) stateSourceObject.Load(3, &ep.flags) stateSourceObject.Load(4, &ep.endpoints) } func (l *tupleList) StateTypeName() string { return "pkg/tcpip/stack.tupleList" } func (l *tupleList) StateFields() []string { return []string{ "head", "tail", } } func (l *tupleList) beforeSave() {} // +checklocksignore func (l *tupleList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *tupleList) afterLoad(context.Context) {} // +checklocksignore func (l *tupleList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *tupleEntry) StateTypeName() string { return "pkg/tcpip/stack.tupleEntry" } func (e *tupleEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *tupleEntry) beforeSave() {} // +checklocksignore func (e *tupleEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *tupleEntry) afterLoad(context.Context) {} // +checklocksignore func (e *tupleEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func init() { state.Register((*addressStateRefs)(nil)) state.Register((*AddressableEndpointState)(nil)) state.Register((*AddressableEndpointStateOptions)(nil)) state.Register((*addressState)(nil)) state.Register((*tuple)(nil)) state.Register((*tupleID)(nil)) state.Register((*conn)(nil)) state.Register((*ConnTrack)(nil)) state.Register((*bucket)(nil)) state.Register((*ICMPRateLimiter)(nil)) state.Register((*AcceptTarget)(nil)) state.Register((*DropTarget)(nil)) state.Register((*RejectIPv4Target)(nil)) state.Register((*RejectIPv6Target)(nil)) state.Register((*ErrorTarget)(nil)) state.Register((*UserChainTarget)(nil)) state.Register((*ReturnTarget)(nil)) state.Register((*DNATTarget)(nil)) state.Register((*RedirectTarget)(nil)) state.Register((*SNATTarget)(nil)) state.Register((*MasqueradeTarget)(nil)) state.Register((*IPTables)(nil)) state.Register((*Table)(nil)) state.Register((*Rule)(nil)) state.Register((*IPHeaderFilter)(nil)) state.Register((*dynamicCacheEntry)(nil)) state.Register((*neighborCacheMu)(nil)) state.Register((*neighborCache)(nil)) state.Register((*neighborEntryList)(nil)) state.Register((*neighborEntryEntry)(nil)) state.Register((*linkResolver)(nil)) state.Register((*nic)(nil)) state.Register((*packetEndpointList)(nil)) state.Register((*delegatingQueueingDiscipline)(nil)) state.Register((*sharedStats)(nil)) state.Register((*multiCounterNICPacketStats)(nil)) state.Register((*multiCounterNICNeighborStats)(nil)) state.Register((*multiCounterNICStats)(nil)) state.Register((*NUDConfigurations)(nil)) state.Register((*nudStateMu)(nil)) state.Register((*NUDState)(nil)) state.Register((*PacketBuffer)(nil)) state.Register((*headerInfo)(nil)) state.Register((*PacketData)(nil)) 
state.Register((*PacketBufferList)(nil)) state.Register((*packetBufferRefs)(nil)) state.Register((*pendingPacket)(nil)) state.Register((*packetsPendingLinkResolutionMu)(nil)) state.Register((*packetsPendingLinkResolution)(nil)) state.Register((*TransportEndpointID)(nil)) state.Register((*NetworkPacketInfo)(nil)) state.Register((*AddressLifetimes)(nil)) state.Register((*UnicastSourceAndMulticastDestination)(nil)) state.Register((*DADConfigurations)(nil)) state.Register((*GSOType)(nil)) state.Register((*GSO)(nil)) state.Register((*routeInfo)(nil)) state.Register((*RouteInfo)(nil)) state.Register((*transportProtocolState)(nil)) state.Register((*Stack)(nil)) state.Register((*TransportEndpointInfo)(nil)) state.Register((*TCPCubicState)(nil)) state.Register((*TCPRACKState)(nil)) state.Register((*TCPEndpointID)(nil)) state.Register((*TCPFastRecoveryState)(nil)) state.Register((*TCPReceiverState)(nil)) state.Register((*TCPRTTState)(nil)) state.Register((*TCPSenderState)(nil)) state.Register((*TCPSACKInfo)(nil)) state.Register((*RcvBufAutoTuneParams)(nil)) state.Register((*TCPRcvBufState)(nil)) state.Register((*TCPSndBufState)(nil)) state.Register((*TCPEndpointStateInner)(nil)) state.Register((*TCPEndpointState)(nil)) state.Register((*protocolIDs)(nil)) state.Register((*transportEndpoints)(nil)) state.Register((*endpointsByNIC)(nil)) state.Register((*transportDemuxer)(nil)) state.Register((*multiPortEndpoint)(nil)) state.Register((*tupleList)(nil)) state.Register((*tupleEntry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/stack_unsafe_state_autogen.go000066400000000000000000000000671465435605700275720ustar00rootroot00000000000000// automatically generated by stateify. package stack golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/state_conn_mutex.go000066400000000000000000000046461465435605700255700ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type stateConnRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var stateConnlockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type stateConnlockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *stateConnRWMutex) Lock() { locking.AddGLock(stateConnprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *stateConnRWMutex) NestedLock(i stateConnlockNameIndex) { locking.AddGLock(stateConnprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *stateConnRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(stateConnprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *stateConnRWMutex) NestedUnlock(i stateConnlockNameIndex) { m.mu.Unlock() locking.DelGLock(stateConnprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *stateConnRWMutex) RLock() { locking.AddGLock(stateConnprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. 
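// Like RLock, it keeps the lock-order validator in sync: the underlying
// RWMutex is released first, and the lock is then removed from the
// validator's per-goroutine set.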
// +checklocksignore func (m *stateConnRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(stateConnprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. // +checklocksignore func (m *stateConnRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *stateConnRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *stateConnRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var stateConnprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func stateConninitLockNames() {} func init() { stateConninitLockNames() stateConnprefixIndex = locking.NewMutexClass(reflect.TypeOf(stateConnRWMutex{}), stateConnlockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/tcp.go000066400000000000000000000357431465435605700230010ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "context" "time" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/internal/tcp" "gvisor.dev/gvisor/pkg/tcpip/seqnum" ) // contextID is this package's type for context.Context.Value keys. type contextID int const ( // CtxRestoreStack is a Context.Value key for the stack to be used in restore. CtxRestoreStack contextID = iota ) // RestoreStackFromContext returns the stack to be used during restore. func RestoreStackFromContext(ctx context.Context) *Stack { return ctx.Value(CtxRestoreStack).(*Stack) } // TCPProbeFunc is the expected function type for a TCP probe function to be // passed to stack.AddTCPProbe. type TCPProbeFunc func(s *TCPEndpointState) // TCPCubicState is used to hold a copy of the internal cubic state when the // TCPProbeFunc is invoked. // // +stateify savable type TCPCubicState struct { // WLastMax is the previous wMax value. WLastMax float64 // WMax is the value of the congestion window at the time of the last // congestion event. WMax float64 // T is the time when the current congestion avoidance was entered. T tcpip.MonotonicTime // TimeSinceLastCongestion denotes the time since the current // congestion avoidance was entered. TimeSinceLastCongestion time.Duration // C is the cubic constant as specified in RFC8312, page 11. C float64 // K is the time period (in seconds) that the above function takes to // increase the current window size to WMax if there are no further // congestion events and is calculated using the following equation: // // K = cubic_root(WMax*(1-beta_cubic)/C) (Eq. 2, page 5) K float64 // Beta is the CUBIC multiplication decrease factor. That is, when a // congestion event is detected, CUBIC reduces its cwnd to // WC(0)=WMax*beta_cubic. Beta float64 // WC is window computed by CUBIC at time TimeSinceLastCongestion. 
It's // calculated using the formula: // // WC(TimeSinceLastCongestion) = C*(t-K)^3 + WMax (Eq. 1) WC float64 // WEst is the window computed by CUBIC at time // TimeSinceLastCongestion+RTT i.e WC(TimeSinceLastCongestion+RTT). WEst float64 // EndSeq is the sequence number that, when cumulatively ACK'd, ends the // HyStart round. EndSeq seqnum.Value // CurrRTT is the minimum round-trip time from the current round. CurrRTT time.Duration // LastRTT is the minimum round-trip time from the previous round. LastRTT time.Duration // SampleCount is the number of samples from the current round. SampleCount uint // LastAck is the time we received the most recent ACK (or start of round if // more recent). LastAck tcpip.MonotonicTime // RoundStart is the time we started the most recent HyStart round. RoundStart tcpip.MonotonicTime } // TCPRACKState is used to hold a copy of the internal RACK state when the // TCPProbeFunc is invoked. // // +stateify savable type TCPRACKState struct { // XmitTime is the transmission timestamp of the most recent // acknowledged segment. XmitTime tcpip.MonotonicTime // EndSequence is the ending TCP sequence number of the most recent // acknowledged segment. EndSequence seqnum.Value // FACK is the highest selectively or cumulatively acknowledged // sequence. FACK seqnum.Value // RTT is the round trip time of the most recently delivered packet on // the connection (either cumulatively acknowledged or selectively // acknowledged) that was not marked invalid as a possible spurious // retransmission. RTT time.Duration // Reord is true iff reordering has been detected on this connection. Reord bool // DSACKSeen is true iff the connection has seen a DSACK. DSACKSeen bool // ReoWnd is the reordering window time used for recording packet // transmission times. It is used to defer the moment at which RACK // marks a packet lost. ReoWnd time.Duration // ReoWndIncr is the multiplier applied to adjust reorder window. ReoWndIncr uint8 // ReoWndPersist is the number of loss recoveries before resetting // reorder window. ReoWndPersist int8 // RTTSeq is the SND.NXT when RTT is updated. RTTSeq seqnum.Value } // TCPEndpointID is the unique 4 tuple that identifies a given endpoint. // // +stateify savable type TCPEndpointID struct { // LocalPort is the local port associated with the endpoint. LocalPort uint16 // LocalAddress is the local [network layer] address associated with // the endpoint. LocalAddress tcpip.Address // RemotePort is the remote port associated with the endpoint. RemotePort uint16 // RemoteAddress it the remote [network layer] address associated with // the endpoint. RemoteAddress tcpip.Address } // TCPFastRecoveryState holds a copy of the internal fast recovery state of a // TCP endpoint. // // +stateify savable type TCPFastRecoveryState struct { // Active if true indicates the endpoint is in fast recovery. The // following fields are only meaningful when Active is true. Active bool // First is the first unacknowledged sequence number being recovered. First seqnum.Value // Last is the 'recover' sequence number that indicates the point at // which we should exit recovery barring any timeouts etc. Last seqnum.Value // MaxCwnd is the maximum value we are permitted to grow the congestion // window during recovery. This is set at the time we enter recovery. // It exists to avoid attacks where the receiver intentionally sends // duplicate acks to artificially inflate the sender's cwnd. 
MaxCwnd int // HighRxt is the highest sequence number which has been retransmitted // during the current loss recovery phase. See: RFC 6675 Section 2 for // details. HighRxt seqnum.Value // RescueRxt is the highest sequence number which has been // optimistically retransmitted to prevent stalling of the ACK clock // when there is loss at the end of the window and no new data is // available for transmission. See: RFC 6675 Section 2 for details. RescueRxt seqnum.Value } // TCPReceiverState holds a copy of the internal state of the receiver for a // given TCP endpoint. // // +stateify savable type TCPReceiverState struct { // RcvNxt is the TCP variable RCV.NXT. RcvNxt seqnum.Value // RcvAcc is one beyond the last acceptable sequence number. That is, // the "largest" sequence value that the receiver has announced to its // peer that it's willing to accept. This may be different than RcvNxt // + (last advertised receive window) if the receive window is reduced; // in that case we have to reduce the window as we receive more data // instead of shrinking it. RcvAcc seqnum.Value // RcvWndScale is the window scaling to use for inbound segments. RcvWndScale uint8 // PendingBufUsed is the number of bytes pending in the receive queue. PendingBufUsed int } // TCPRTTState holds a copy of information about the endpoint's round trip // time. // // +stateify savable type TCPRTTState struct { // SRTT is the smoothed round trip time defined in section 2 of RFC // 6298. SRTT time.Duration // RTTVar is the round-trip time variation as defined in section 2 of // RFC 6298. RTTVar time.Duration // SRTTInited if true indicates that a valid RTT measurement has been // completed. SRTTInited bool } // TCPSenderState holds a copy of the internal state of the sender for a given // TCP Endpoint. // // +stateify savable type TCPSenderState struct { // LastSendTime is the timestamp at which we sent the last segment. LastSendTime tcpip.MonotonicTime // DupAckCount is the number of Duplicate ACKs received. It is used for // fast retransmit. DupAckCount int // SndCwnd is the size of the sending congestion window in packets. SndCwnd int // Ssthresh is the threshold between slow start and congestion // avoidance. Ssthresh int // SndCAAckCount is the number of packets acknowledged during // congestion avoidance. When enough packets have been ack'd (typically // cwnd packets), the congestion window is incremented by one. SndCAAckCount int // Outstanding is the number of packets that have been sent but not yet // acknowledged. Outstanding int // SackedOut is the number of packets which have been selectively // acked. SackedOut int // SndWnd is the send window size in bytes. SndWnd seqnum.Size // SndUna is the next unacknowledged sequence number. SndUna seqnum.Value // SndNxt is the sequence number of the next segment to be sent. SndNxt seqnum.Value // RTTMeasureSeqNum is the sequence number being used for the latest // RTT measurement. RTTMeasureSeqNum seqnum.Value // RTTMeasureTime is the time when the RTTMeasureSeqNum was sent. RTTMeasureTime tcpip.MonotonicTime // Closed indicates that the caller has closed the endpoint for // sending. Closed bool // RTO is the retransmit timeout as defined in section of 2 of RFC // 6298. RTO time.Duration // RTTState holds information about the endpoint's round trip time. RTTState TCPRTTState // MaxPayloadSize is the maximum size of the payload of a given // segment. It is initialized on demand. 
MaxPayloadSize int // SndWndScale is the number of bits to shift left when reading the // send window size from a segment. SndWndScale uint8 // MaxSentAck is the highest acknowledgement number sent till now. MaxSentAck seqnum.Value // FastRecovery holds the fast recovery state for the endpoint. FastRecovery TCPFastRecoveryState // Cubic holds the state related to CUBIC congestion control. Cubic TCPCubicState // RACKState holds the state related to RACK loss detection algorithm. RACKState TCPRACKState // RetransmitTS records the timestamp used to detect spurious recovery. RetransmitTS uint32 // SpuriousRecovery indicates if the sender entered recovery spuriously. SpuriousRecovery bool } // TCPSACKInfo holds TCP SACK related information for a given TCP endpoint. // // +stateify savable type TCPSACKInfo struct { // Blocks is the list of SACK Blocks that identify the out of order // segments held by a given TCP endpoint. Blocks []header.SACKBlock // ReceivedBlocks are the SACK blocks received by this endpoint from // the peer endpoint. ReceivedBlocks []header.SACKBlock // MaxSACKED is the highest sequence number that has been SACKED by the // peer. MaxSACKED seqnum.Value } // RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning. // // +stateify savable type RcvBufAutoTuneParams struct { // MeasureTime is the time at which the current measurement was // started. MeasureTime tcpip.MonotonicTime // CopiedBytes is the number of bytes copied to user space since this // measure began. CopiedBytes int // PrevCopiedBytes is the number of bytes copied to userspace in the // previous RTT period. PrevCopiedBytes int // RcvBufSize is the auto tuned receive buffer size. RcvBufSize int // RTT is the smoothed RTT as measured by observing the time between // when a byte is first acknowledged and the receipt of data that is at // least one window beyond the sequence number that was acknowledged. RTT time.Duration // RTTVar is the "round-trip time variation" as defined in section 2 of // RFC6298. RTTVar time.Duration // RTTMeasureSeqNumber is the highest acceptable sequence number at the // time this RTT measurement period began. RTTMeasureSeqNumber seqnum.Value // RTTMeasureTime is the absolute time at which the current RTT // measurement period began. RTTMeasureTime tcpip.MonotonicTime // Disabled is true if an explicit receive buffer is set for the // endpoint. Disabled bool } // TCPRcvBufState contains information about the state of an endpoint's receive // socket buffer. // // +stateify savable type TCPRcvBufState struct { // RcvBufUsed is the amount of bytes actually held in the receive // socket buffer for the endpoint. RcvBufUsed int // RcvBufAutoTuneParams is used to hold state variables to compute the // auto tuned receive buffer size. RcvAutoParams RcvBufAutoTuneParams // RcvClosed if true, indicates the endpoint has been closed for // reading. RcvClosed bool } // TCPSndBufState contains information about the state of an endpoint's send // socket buffer. // // +stateify savable type TCPSndBufState struct { // SndBufSize is the size of the socket send buffer. SndBufSize int // SndBufUsed is the number of bytes held in the socket send buffer. SndBufUsed int // SndClosed indicates that the endpoint has been closed for sends. SndClosed bool // PacketTooBigCount is used to notify the main protocol routine how // many times a "packet too big" control packet is received. PacketTooBigCount int // SndMTU is the smallest MTU seen in the control packets received. 
SndMTU int // AutoTuneSndBufDisabled indicates that the auto tuning of send buffer // is disabled. AutoTuneSndBufDisabled atomicbitops.Uint32 } // TCPEndpointStateInner contains the members of TCPEndpointState used directly // (that is, not within another containing struct) within the endpoint's // internal implementation. // // +stateify savable type TCPEndpointStateInner struct { // TSOffset is a randomized offset added to the value of the TSVal // field in the timestamp option. TSOffset tcp.TSOffset // SACKPermitted is set to true if the peer sends the TCPSACKPermitted // option in the SYN/SYN-ACK. SACKPermitted bool // SendTSOk is used to indicate when the TS Option has been negotiated. // When sendTSOk is true every non-RST segment should carry a TS as per // RFC7323#section-1.1. SendTSOk bool // RecentTS is the timestamp that should be sent in the TSEcr field of // the timestamp for future segments sent by the endpoint. This field // is updated if required when a new segment is received by this // endpoint. RecentTS uint32 } // TCPEndpointState is a copy of the internal state of a TCP endpoint. // // +stateify savable type TCPEndpointState struct { // TCPEndpointStateInner contains the members of TCPEndpointState used // by the endpoint's internal implementation. TCPEndpointStateInner // ID is a copy of the TransportEndpointID for the endpoint. ID TCPEndpointID // SegTime denotes the absolute time when this segment was received. SegTime tcpip.MonotonicTime // RcvBufState contains information about the state of the endpoint's // receive socket buffer. RcvBufState TCPRcvBufState // SndBufState contains information about the state of the endpoint's // send socket buffer. SndBufState TCPSndBufState // SACK holds TCP SACK related information for this endpoint. SACK TCPSACKInfo // Receiver holds variables related to the TCP receiver for the // endpoint. Receiver TCPReceiverState // Sender holds state related to the TCP Sender for the endpoint. Sender TCPSenderState } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/transport_demuxer.go000066400000000000000000000551241465435605700257730ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/ports" ) // +stateify savable type protocolIDs struct { network tcpip.NetworkProtocolNumber transport tcpip.TransportProtocolNumber } // transportEndpoints manages all endpoints of a given protocol. It has its own // mutex so as to reduce interference between protocols. // // +stateify savable type transportEndpoints struct { mu transportEndpointsRWMutex `state:"nosave"` // +checklocks:mu endpoints map[TransportEndpointID]*endpointsByNIC // rawEndpoints contains endpoints for raw sockets, which receive all // traffic of a given protocol regardless of port. 
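// Raw endpoints receive these packets in addition to, not instead of, any
// matching non-raw endpoints; see registerRawEndpoint below.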
// // +checklocks:mu rawEndpoints []RawTransportEndpoint } // unregisterEndpoint unregisters the endpoint with the given id such that it // won't receive any more packets. func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { eps.mu.Lock() defer eps.mu.Unlock() epsByNIC, ok := eps.endpoints[id] if !ok { return } if !epsByNIC.unregisterEndpoint(bindToDevice, ep, flags) { return } delete(eps.endpoints, id) } func (eps *transportEndpoints) transportEndpoints() []TransportEndpoint { eps.mu.RLock() defer eps.mu.RUnlock() es := make([]TransportEndpoint, 0, len(eps.endpoints)) for _, e := range eps.endpoints { es = append(es, e.transportEndpoints()...) } return es } // iterEndpointsLocked yields all endpointsByNIC in eps that match id, in // descending order of match quality. If a call to yield returns false, // iterEndpointsLocked stops iteration and returns immediately. // // +checklocksread:eps.mu func (eps *transportEndpoints) iterEndpointsLocked(id TransportEndpointID, yield func(*endpointsByNIC) bool) { // Try to find a match with the id as provided. if ep, ok := eps.endpoints[id]; ok { if !yield(ep) { return } } // Try to find a match with the id minus the local address. nid := id nid.LocalAddress = tcpip.Address{} if ep, ok := eps.endpoints[nid]; ok { if !yield(ep) { return } } // Try to find a match with the id minus the remote part. nid.LocalAddress = id.LocalAddress nid.RemoteAddress = tcpip.Address{} nid.RemotePort = 0 if ep, ok := eps.endpoints[nid]; ok { if !yield(ep) { return } } // Try to find a match with only the local port. nid.LocalAddress = tcpip.Address{} if ep, ok := eps.endpoints[nid]; ok { if !yield(ep) { return } } } // findAllEndpointsLocked returns all endpointsByNIC in eps that match id, in // descending order of match quality. // // +checklocksread:eps.mu func (eps *transportEndpoints) findAllEndpointsLocked(id TransportEndpointID) []*endpointsByNIC { var matchedEPs []*endpointsByNIC eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool { matchedEPs = append(matchedEPs, ep) return true }) return matchedEPs } // findEndpointLocked returns the endpoint that most closely matches the given id. // // +checklocksread:eps.mu func (eps *transportEndpoints) findEndpointLocked(id TransportEndpointID) *endpointsByNIC { var matchedEP *endpointsByNIC eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool { matchedEP = ep return false }) return matchedEP } // +stateify savable type endpointsByNIC struct { // seed is a random secret for a jenkins hash. seed uint32 mu endpointsByNICRWMutex `state:"nosave"` // +checklocks:mu endpoints map[tcpip.NICID]*multiPortEndpoint } func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint { epsByNIC.mu.RLock() defer epsByNIC.mu.RUnlock() var eps []TransportEndpoint for _, ep := range epsByNIC.endpoints { eps = append(eps, ep.transportEndpoints()...) } return eps } // handlePacket is called by the stack when new packets arrive to this transport // endpoint. It returns false if the packet could not be matched to any // transport endpoint, true otherwise. func (epsByNIC *endpointsByNIC) handlePacket(id TransportEndpointID, pkt *PacketBuffer) bool { epsByNIC.mu.RLock() mpep, ok := epsByNIC.endpoints[pkt.NICID] if !ok { if mpep, ok = epsByNIC.endpoints[0]; !ok { epsByNIC.mu.RUnlock() // Don't use defer for performance reasons. 
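// Neither a NIC-specific nor a wildcard (NICID 0) endpoint is registered,
// so the packet cannot be delivered by this endpointsByNIC.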
return false } } // If this is a broadcast or multicast datagram, deliver the datagram to all // endpoints bound to the right device. if isInboundMulticastOrBroadcast(pkt, id.LocalAddress) { mpep.handlePacketAll(id, pkt) epsByNIC.mu.RUnlock() // Don't use defer for performance reasons. return true } // multiPortEndpoints are guaranteed to have at least one element. transEP := mpep.selectEndpoint(id, epsByNIC.seed) if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue { queuedProtocol.QueuePacket(transEP, id, pkt) epsByNIC.mu.RUnlock() return true } epsByNIC.mu.RUnlock() transEP.HandlePacket(id, pkt) return true } // handleError delivers an error to the transport endpoint identified by id. func (epsByNIC *endpointsByNIC) handleError(n *nic, id TransportEndpointID, transErr TransportError, pkt *PacketBuffer) { epsByNIC.mu.RLock() mpep, ok := epsByNIC.endpoints[n.ID()] if !ok { mpep, ok = epsByNIC.endpoints[0] } if !ok { epsByNIC.mu.RUnlock() return } // TODO(eyalsoha): Why don't we look at id to see if this packet needs to // broadcast like we are doing with handlePacket above? // multiPortEndpoints are guaranteed to have at least one element. transEP := mpep.selectEndpoint(id, epsByNIC.seed) epsByNIC.mu.RUnlock() transEP.HandleError(transErr, pkt) } // registerEndpoint returns true if it succeeds. It fails and returns // false if ep already has an element with the same key. func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { epsByNIC.mu.Lock() defer epsByNIC.mu.Unlock() multiPortEp, ok := epsByNIC.endpoints[bindToDevice] if !ok { multiPortEp = &multiPortEndpoint{ demux: d, netProto: netProto, transProto: transProto, } } if err := multiPortEp.singleRegisterEndpoint(t, flags); err != nil { return err } // Only add this newly created multiportEndpoint if the singleRegisterEndpoint // succeeded. if !ok { epsByNIC.endpoints[bindToDevice] = multiPortEp } return nil } func (epsByNIC *endpointsByNIC) checkEndpoint(flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { epsByNIC.mu.RLock() defer epsByNIC.mu.RUnlock() multiPortEp, ok := epsByNIC.endpoints[bindToDevice] if !ok { return nil } return multiPortEp.singleCheckEndpoint(flags) } // unregisterEndpoint returns true if endpointsByNIC has to be unregistered. func (epsByNIC *endpointsByNIC) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint, flags ports.Flags) bool { epsByNIC.mu.Lock() defer epsByNIC.mu.Unlock() multiPortEp, ok := epsByNIC.endpoints[bindToDevice] if !ok { return false } if multiPortEp.unregisterEndpoint(t, flags) { delete(epsByNIC.endpoints, bindToDevice) } return len(epsByNIC.endpoints) == 0 } // transportDemuxer demultiplexes packets targeted at a transport endpoint // (i.e., after they've been parsed by the network layer). It does two levels // of demultiplexing: first based on the network and transport protocols, then // based on endpoints IDs. It should only be instantiated via // newTransportDemuxer. // // +stateify savable type transportDemuxer struct { stack *Stack // protocol is immutable. 
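// It maps each (network protocol, transport protocol) pair to the
// endpoints registered for that pair and is populated once in
// newTransportDemuxer.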
protocol map[protocolIDs]*transportEndpoints queuedProtocols map[protocolIDs]queuedTransportProtocol } // queuedTransportProtocol if supported by a protocol implementation will cause // the dispatcher to deliver packets to the QueuePacket method instead of // calling HandlePacket directly on the endpoint. type queuedTransportProtocol interface { QueuePacket(ep TransportEndpoint, id TransportEndpointID, pkt *PacketBuffer) } func newTransportDemuxer(stack *Stack) *transportDemuxer { d := &transportDemuxer{ stack: stack, protocol: make(map[protocolIDs]*transportEndpoints), queuedProtocols: make(map[protocolIDs]queuedTransportProtocol), } // Add each network and transport pair to the demuxer. for netProto := range stack.networkProtocols { for proto := range stack.transportProtocols { protoIDs := protocolIDs{netProto, proto} d.protocol[protoIDs] = &transportEndpoints{ endpoints: make(map[TransportEndpointID]*endpointsByNIC), } qTransProto, isQueued := (stack.transportProtocols[proto].proto).(queuedTransportProtocol) if isQueued { d.queuedProtocols[protoIDs] = qTransProto } } } return d } // registerEndpoint registers the given endpoint with the dispatcher such that // packets that match the endpoint ID are delivered to it. func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { for i, n := range netProtos { if err := d.singleRegisterEndpoint(n, protocol, id, ep, flags, bindToDevice); err != nil { d.unregisterEndpoint(netProtos[:i], protocol, id, ep, flags, bindToDevice) return err } } return nil } // checkEndpoint checks if an endpoint can be registered with the dispatcher. func (d *transportDemuxer) checkEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { for _, n := range netProtos { if err := d.singleCheckEndpoint(n, protocol, id, flags, bindToDevice); err != nil { return err } } return nil } // multiPortEndpoint is a container for TransportEndpoints which are bound to // the same pair of address and port. endpoints always has at least one // element. // // FIXME(gvisor.dev/issue/873): Restore this properly. Currently, we just save // this to ensure that the underlying endpoints get saved/restored, but do not // use the restored copy. // // +stateify savable type multiPortEndpoint struct { demux *transportDemuxer netProto tcpip.NetworkProtocolNumber transProto tcpip.TransportProtocolNumber flags ports.FlagCounter mu multiPortEndpointRWMutex `state:"nosave"` // endpoints stores the transport endpoints in the order in which they // were bound. This is required for UDP SO_REUSEADDR. // // +checklocks:mu endpoints []TransportEndpoint } func (ep *multiPortEndpoint) transportEndpoints() []TransportEndpoint { ep.mu.RLock() eps := append([]TransportEndpoint(nil), ep.endpoints...) ep.mu.RUnlock() return eps } // reciprocalScale scales a value into range [0, n). // // This is similar to val % n, but faster. // See http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ func reciprocalScale(val, n uint32) uint32 { return uint32((uint64(val) * uint64(n)) >> 32) } // selectEndpoint calculates a hash of destination and source addresses and // ports, then uses it to select a socket. In this case, all packets from one // address will be sent to the same endpoint.
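// For example, when several endpoints share a port for load balancing
// (SO_REUSEPORT), every packet of a given 4-tuple hashes to the same index,
// so a flow stays pinned to one endpoint while distinct flows are spread
// across the group. The 32-bit hash is mapped into [0, len(endpoints)) with
// reciprocalScale; e.g. a hash of 0x80000000 over 6 endpoints selects
// index 3.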
func (ep *multiPortEndpoint) selectEndpoint(id TransportEndpointID, seed uint32) TransportEndpoint { ep.mu.RLock() defer ep.mu.RUnlock() if len(ep.endpoints) == 1 { return ep.endpoints[0] } if ep.flags.SharedFlags().ToFlags().Effective().MostRecent { return ep.endpoints[len(ep.endpoints)-1] } payload := []byte{ byte(id.LocalPort), byte(id.LocalPort >> 8), byte(id.RemotePort), byte(id.RemotePort >> 8), } h := jenkins.Sum32(seed) h.Write(payload) h.Write(id.LocalAddress.AsSlice()) h.Write(id.RemoteAddress.AsSlice()) hash := h.Sum32() idx := reciprocalScale(hash, uint32(len(ep.endpoints))) return ep.endpoints[idx] } func (ep *multiPortEndpoint) handlePacketAll(id TransportEndpointID, pkt *PacketBuffer) { ep.mu.RLock() queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}] // HandlePacket may modify pkt, so each endpoint needs // its own copy except for the final one. for _, endpoint := range ep.endpoints[:len(ep.endpoints)-1] { clone := pkt.Clone() if mustQueue { queuedProtocol.QueuePacket(endpoint, id, clone) } else { endpoint.HandlePacket(id, clone) } clone.DecRef() } if endpoint := ep.endpoints[len(ep.endpoints)-1]; mustQueue { queuedProtocol.QueuePacket(endpoint, id, pkt) } else { endpoint.HandlePacket(id, pkt) } ep.mu.RUnlock() // Don't use defer for performance reasons. } // singleRegisterEndpoint tries to add an endpoint to the multiPortEndpoint // list. The list might be empty already. func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, flags ports.Flags) tcpip.Error { ep.mu.Lock() defer ep.mu.Unlock() bits := flags.Bits() & ports.MultiBindFlagMask if len(ep.endpoints) != 0 { // If it was previously bound, we need to check if we can bind again. if ep.flags.TotalRefs() > 0 && bits&ep.flags.SharedFlags() == 0 { return &tcpip.ErrPortInUse{} } } ep.endpoints = append(ep.endpoints, t) ep.flags.AddRef(bits) return nil } func (ep *multiPortEndpoint) singleCheckEndpoint(flags ports.Flags) tcpip.Error { ep.mu.RLock() defer ep.mu.RUnlock() bits := flags.Bits() & ports.MultiBindFlagMask if len(ep.endpoints) != 0 { // If it was previously bound, we need to check if we can bind again. if ep.flags.TotalRefs() > 0 && bits&ep.flags.SharedFlags() == 0 { return &tcpip.ErrPortInUse{} } } return nil } // unregisterEndpoint returns true if multiPortEndpoint has to be unregistered. func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint, flags ports.Flags) bool { ep.mu.Lock() defer ep.mu.Unlock() for i, endpoint := range ep.endpoints { if endpoint == t { copy(ep.endpoints[i:], ep.endpoints[i+1:]) ep.endpoints[len(ep.endpoints)-1] = nil ep.endpoints = ep.endpoints[:len(ep.endpoints)-1] ep.flags.DropRef(flags.Bits() & ports.MultiBindFlagMask) break } } return len(ep.endpoints) == 0 } func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { if id.RemotePort != 0 { // SO_REUSEPORT only applies to bound/listening endpoints. 
flags.LoadBalanced = false } eps, ok := d.protocol[protocolIDs{netProto, protocol}] if !ok { return &tcpip.ErrUnknownProtocol{} } eps.mu.Lock() defer eps.mu.Unlock() epsByNIC, ok := eps.endpoints[id] if !ok { epsByNIC = &endpointsByNIC{ endpoints: make(map[tcpip.NICID]*multiPortEndpoint), seed: d.stack.seed, } } if err := epsByNIC.registerEndpoint(d, netProto, protocol, ep, flags, bindToDevice); err != nil { return err } // Only add this newly created epsByNIC if registerEndpoint succeeded. if !ok { eps.endpoints[id] = epsByNIC } return nil } func (d *transportDemuxer) singleCheckEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { if id.RemotePort != 0 { // SO_REUSEPORT only applies to bound/listening endpoints. flags.LoadBalanced = false } eps, ok := d.protocol[protocolIDs{netProto, protocol}] if !ok { return &tcpip.ErrUnknownProtocol{} } eps.mu.RLock() defer eps.mu.RUnlock() epsByNIC, ok := eps.endpoints[id] if !ok { return nil } return epsByNIC.checkEndpoint(flags, bindToDevice) } // unregisterEndpoint unregisters the endpoint with the given id such that it // won't receive any more packets. func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { if id.RemotePort != 0 { // SO_REUSEPORT only applies to bound/listening endpoints. flags.LoadBalanced = false } for _, n := range netProtos { if eps, ok := d.protocol[protocolIDs{n, protocol}]; ok { eps.unregisterEndpoint(id, ep, flags, bindToDevice) } } } // deliverPacket attempts to find one or more matching transport endpoints, and // then, if matches are found, delivers the packet to them. Returns true if // the packet no longer needs to be handled. func (d *transportDemuxer) deliverPacket(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer, id TransportEndpointID) bool { eps, ok := d.protocol[protocolIDs{pkt.NetworkProtocolNumber, protocol}] if !ok { return false } // If the packet is a UDP broadcast or multicast, then find all matching // transport endpoints. if protocol == header.UDPProtocolNumber && isInboundMulticastOrBroadcast(pkt, id.LocalAddress) { eps.mu.RLock() destEPs := eps.findAllEndpointsLocked(id) eps.mu.RUnlock() // Fail if we didn't find at least one matching transport endpoint. if len(destEPs) == 0 { d.stack.stats.UDP.UnknownPortErrors.Increment() return false } // handlePacket takes may modify pkt, so each endpoint needs its own // copy except for the final one. for _, ep := range destEPs[:len(destEPs)-1] { clone := pkt.Clone() ep.handlePacket(id, clone) clone.DecRef() } destEPs[len(destEPs)-1].handlePacket(id, pkt) return true } // If the packet is a TCP packet with a unspecified source or non-unicast // destination address, then do nothing further and instruct the caller to do // the same. The network layer handles address validation for specified source // addresses. 
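// For instance (hedged sketch, addresses illustrative), an unspecified source
// fails the check below while an ordinary unicast address passes it:
//
//	_ = isSpecified(tcpip.AddrFrom4([4]byte{0, 0, 0, 0}))   // false: the IPv4 "any" address
//	_ = isSpecified(tcpip.AddrFrom4([4]byte{192, 0, 2, 1})) // true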
if protocol == header.TCPProtocolNumber && (!isSpecified(id.LocalAddress) || !isSpecified(id.RemoteAddress) || isInboundMulticastOrBroadcast(pkt, id.LocalAddress)) { // TCP can only be used to communicate between a single source and a // single destination; the addresses must be unicast.e d.stack.stats.TCP.InvalidSegmentsReceived.Increment() return true } eps.mu.RLock() ep := eps.findEndpointLocked(id) eps.mu.RUnlock() if ep == nil { if protocol == header.UDPProtocolNumber { d.stack.stats.UDP.UnknownPortErrors.Increment() } return false } return ep.handlePacket(id, pkt) } // deliverRawPacket attempts to deliver the given packet and returns whether it // was delivered successfully. func (d *transportDemuxer) deliverRawPacket(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) bool { eps, ok := d.protocol[protocolIDs{pkt.NetworkProtocolNumber, protocol}] if !ok { return false } // As in net/ipv4/ip_input.c:ip_local_deliver, attempt to deliver via // raw endpoint first. If there are multiple raw endpoints, they all // receive the packet. eps.mu.RLock() // Copy the list of raw endpoints to avoid packet handling under lock. var rawEPs []RawTransportEndpoint if n := len(eps.rawEndpoints); n != 0 { rawEPs = make([]RawTransportEndpoint, n) if m := copy(rawEPs, eps.rawEndpoints); m != n { panic(fmt.Sprintf("unexpected copy = %d, want %d", m, n)) } } eps.mu.RUnlock() for _, rawEP := range rawEPs { // Each endpoint gets its own copy of the packet for the sake // of save/restore. clone := pkt.Clone() rawEP.HandlePacket(clone) clone.DecRef() } return len(rawEPs) != 0 } // deliverError attempts to deliver the given error to the appropriate transport // endpoint. // // Returns true if the error was delivered. func (d *transportDemuxer) deliverError(n *nic, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, transErr TransportError, pkt *PacketBuffer, id TransportEndpointID) bool { eps, ok := d.protocol[protocolIDs{net, trans}] if !ok { return false } eps.mu.RLock() ep := eps.findEndpointLocked(id) eps.mu.RUnlock() if ep == nil { return false } ep.handleError(n, id, transErr, pkt) return true } // findTransportEndpoint find a single endpoint that most closely matches the provided id. func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, nicID tcpip.NICID) TransportEndpoint { eps, ok := d.protocol[protocolIDs{netProto, transProto}] if !ok { return nil } eps.mu.RLock() epsByNIC := eps.findEndpointLocked(id) if epsByNIC == nil { eps.mu.RUnlock() return nil } epsByNIC.mu.RLock() eps.mu.RUnlock() mpep, ok := epsByNIC.endpoints[nicID] if !ok { if mpep, ok = epsByNIC.endpoints[0]; !ok { epsByNIC.mu.RUnlock() // Don't use defer for performance reasons. return nil } } ep := mpep.selectEndpoint(id, epsByNIC.seed) epsByNIC.mu.RUnlock() return ep } // registerRawEndpoint registers the given endpoint with the dispatcher such // that packets of the appropriate protocol are delivered to it. A single // packet can be sent to one or more raw endpoints along with a non-raw // endpoint. 
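//
// A hedged registration sketch (rawEP stands in for a RawTransportEndpoint;
// the protocol pair is illustrative):
//
//	if err := d.registerRawEndpoint(header.IPv4ProtocolNumber, header.ICMPv4ProtocolNumber, rawEP); err != nil {
//		return err
//	}
//	// Every matching inbound packet is now cloned to rawEP in addition to
//	// any non-raw endpoint (see deliverRawPacket above).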
func (d *transportDemuxer) registerRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) tcpip.Error { eps, ok := d.protocol[protocolIDs{netProto, transProto}] if !ok { return &tcpip.ErrNotSupported{} } eps.mu.Lock() eps.rawEndpoints = append(eps.rawEndpoints, ep) eps.mu.Unlock() return nil } // unregisterRawEndpoint unregisters the raw endpoint for the given transport // protocol such that it won't receive any more packets. func (d *transportDemuxer) unregisterRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) { eps, ok := d.protocol[protocolIDs{netProto, transProto}] if !ok { panic(fmt.Errorf("tried to unregister endpoint with unsupported network and transport protocol pair: %d, %d", netProto, transProto)) } eps.mu.Lock() for i, rawEP := range eps.rawEndpoints { if rawEP == ep { lastIdx := len(eps.rawEndpoints) - 1 eps.rawEndpoints[i] = eps.rawEndpoints[lastIdx] eps.rawEndpoints[lastIdx] = nil eps.rawEndpoints = eps.rawEndpoints[:lastIdx] break } } eps.mu.Unlock() } func isInboundMulticastOrBroadcast(pkt *PacketBuffer, localAddr tcpip.Address) bool { return pkt.NetworkPacketInfo.LocalAddressBroadcast || header.IsV4MulticastAddress(localAddr) || header.IsV6MulticastAddress(localAddr) } func isSpecified(addr tcpip.Address) bool { return addr != header.IPv4Any && addr != header.IPv6Any } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/transport_endpoints_mutex.go000066400000000000000000000052201465435605700275370ustar00rootroot00000000000000package stack import ( "reflect" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/sync/locking" ) // RWMutex is sync.RWMutex with the correctness validator. type transportEndpointsRWMutex struct { mu sync.RWMutex } // lockNames is a list of user-friendly lock names. // Populated in init. var transportEndpointslockNames []string // lockNameIndex is used as an index passed to NestedLock and NestedUnlock, // referring to an index within lockNames. // Values are specified using the "consts" field of go_template_instance. type transportEndpointslockNameIndex int // DO NOT REMOVE: The following function automatically replaced with lock index constants. // LOCK_NAME_INDEX_CONSTANTS const () // Lock locks m. // +checklocksignore func (m *transportEndpointsRWMutex) Lock() { locking.AddGLock(transportEndpointsprefixIndex, -1) m.mu.Lock() } // NestedLock locks m knowing that another lock of the same type is held. // +checklocksignore func (m *transportEndpointsRWMutex) NestedLock(i transportEndpointslockNameIndex) { locking.AddGLock(transportEndpointsprefixIndex, int(i)) m.mu.Lock() } // Unlock unlocks m. // +checklocksignore func (m *transportEndpointsRWMutex) Unlock() { m.mu.Unlock() locking.DelGLock(transportEndpointsprefixIndex, -1) } // NestedUnlock unlocks m knowing that another lock of the same type is held. // +checklocksignore func (m *transportEndpointsRWMutex) NestedUnlock(i transportEndpointslockNameIndex) { m.mu.Unlock() locking.DelGLock(transportEndpointsprefixIndex, int(i)) } // RLock locks m for reading. // +checklocksignore func (m *transportEndpointsRWMutex) RLock() { locking.AddGLock(transportEndpointsprefixIndex, -1) m.mu.RLock() } // RUnlock undoes a single RLock call. // +checklocksignore func (m *transportEndpointsRWMutex) RUnlock() { m.mu.RUnlock() locking.DelGLock(transportEndpointsprefixIndex, -1) } // RLockBypass locks m for reading without executing the validator. 
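// A hedged sketch of how this wrapper is typically used elsewhere in the
// package (eps and id are illustrative):
//
//	eps.mu.RLock()
//	ep := eps.findEndpointLocked(id)
//	eps.mu.RUnlock()
//
// RLock/RUnlock feed the lock-order validator via locking.AddGLock and
// locking.DelGLock, while the *Bypass variants below skip it.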
// +checklocksignore func (m *transportEndpointsRWMutex) RLockBypass() { m.mu.RLock() } // RUnlockBypass undoes a single RLockBypass call. // +checklocksignore func (m *transportEndpointsRWMutex) RUnlockBypass() { m.mu.RUnlock() } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // +checklocksignore func (m *transportEndpointsRWMutex) DowngradeLock() { m.mu.DowngradeLock() } var transportEndpointsprefixIndex *locking.MutexClass // DO NOT REMOVE: The following function is automatically replaced. func transportEndpointsinitLockNames() {} func init() { transportEndpointsinitLockNames() transportEndpointsprefixIndex = locking.NewMutexClass(reflect.TypeOf(transportEndpointsRWMutex{}), transportEndpointslockNames) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stack/tuple_list.go000066400000000000000000000117121465435605700243650ustar00rootroot00000000000000package stack // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type tupleElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (tupleElementMapper) linkerFor(elem *tuple) *tuple { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type tupleList struct { head *tuple tail *tuple } // Reset resets list l to the empty state. func (l *tupleList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *tupleList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *tupleList) Front() *tuple { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *tupleList) Back() *tuple { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *tupleList) Len() (count int) { for e := l.Front(); e != nil; e = (tupleElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *tupleList) PushFront(e *tuple) { linker := tupleElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { tupleElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *tupleList) PushFrontList(m *tupleList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { tupleElementMapper{}.linkerFor(l.head).SetPrev(m.tail) tupleElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. 
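// A hedged usage sketch (t1 and t2 stand in for *tuple values; the iteration
// pattern is the one given in the tupleList doc comment above):
//
//	var l tupleList
//	l.PushBack(t1)
//	l.PushBack(t2)
//	for e := l.Front(); e != nil; e = e.Next() {
//		// visit e; the list itself makes no allocations.
//	}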
// //go:nosplit func (l *tupleList) PushBack(e *tuple) { linker := tupleElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { tupleElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *tupleList) PushBackList(m *tupleList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { tupleElementMapper{}.linkerFor(l.tail).SetNext(m.head) tupleElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *tupleList) InsertAfter(b, e *tuple) { bLinker := tupleElementMapper{}.linkerFor(b) eLinker := tupleElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { tupleElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *tupleList) InsertBefore(a, e *tuple) { aLinker := tupleElementMapper{}.linkerFor(a) eLinker := tupleElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { tupleElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *tupleList) Remove(e *tuple) { linker := tupleElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { tupleElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { tupleElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type tupleEntry struct { next *tuple prev *tuple } // Next returns the entry that follows e in the list. // //go:nosplit func (e *tupleEntry) Next() *tuple { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *tupleEntry) Prev() *tuple { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *tupleEntry) SetNext(elem *tuple) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *tupleEntry) SetPrev(elem *tuple) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stdclock.go000066400000000000000000000071351465435605700227060ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcpip import ( "fmt" "time" ) // stdClock implements Clock with the time package. // // +stateify savable type stdClock struct { // baseTime holds the time when the clock was constructed. // // This value is used to calculate the monotonic time from the time package. 
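// A rough sketch of the derivation performed by NowMonotonic below:
//
//	sinceBase := time.Since(s.baseTime)     // duration from the monotonic clock reading
//	return s.monotonicOffset.Add(sinceBase) // continues from any restored offset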
// As per https://golang.org/pkg/time/#hdr-Monotonic_Clocks, // // Operating systems provide both a “wall clock,” which is subject to // changes for clock synchronization, and a “monotonic clock,” which is not. // The general rule is that the wall clock is for telling time and the // monotonic clock is for measuring time. Rather than split the API, in this // package the Time returned by time.Now contains both a wall clock reading // and a monotonic clock reading; later time-telling operations use the wall // clock reading, but later time-measuring operations, specifically // comparisons and subtractions, use the monotonic clock reading. // // ... // // If Times t and u both contain monotonic clock readings, the operations // t.After(u), t.Before(u), t.Equal(u), and t.Sub(u) are carried out using // the monotonic clock readings alone, ignoring the wall clock readings. If // either t or u contains no monotonic clock reading, these operations fall // back to using the wall clock readings. // // Given the above, we can safely conclude that time.Since(baseTime) will // return monotonically increasing values if we use time.Now() to set baseTime // at the time of clock construction. // // Note that time.Since(t) is shorthand for time.Now().Sub(t), as per // https://golang.org/pkg/time/#Since. baseTime time.Time `state:"nosave"` // monotonicOffset is the offset applied to the calculated monotonic time. // // monotonicOffset is assigned after restore so that the monotonic time // will continue from where it "left off" before saving as part of S/R. monotonicOffset MonotonicTime } // NewStdClock returns an instance of a clock that uses the time package. func NewStdClock() Clock { return &stdClock{ baseTime: time.Now(), } } var _ Clock = (*stdClock)(nil) // Now implements Clock.Now. func (*stdClock) Now() time.Time { return time.Now() } // NowMonotonic implements Clock.NowMonotonic. func (s *stdClock) NowMonotonic() MonotonicTime { sinceBase := time.Since(s.baseTime) if sinceBase < 0 { panic(fmt.Sprintf("got negative duration = %s since base time = %s", sinceBase, s.baseTime)) } return s.monotonicOffset.Add(sinceBase) } // AfterFunc implements Clock.AfterFunc. func (*stdClock) AfterFunc(d time.Duration, f func()) Timer { return &stdTimer{ t: time.AfterFunc(d, f), } } // +stateify savable type stdTimer struct { t *time.Timer } var _ Timer = (*stdTimer)(nil) // Stop implements Timer.Stop. func (st *stdTimer) Stop() bool { return st.t.Stop() } // Reset implements Timer.Reset. func (st *stdTimer) Reset(d time.Duration) { st.t.Reset(d) } // NewStdTimer returns a Timer implemented with the time package. func NewStdTimer(t *time.Timer) Timer { return &stdTimer{t: t} } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/stdclock_state.go000066400000000000000000000015441465435605700241040ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcpip import ( "context" "time" ) // beforeSave is invoked by stateify. 
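//
// A hedged sketch of the save/restore round trip (steps illustrative):
//
//	s.monotonicOffset = s.NowMonotonic() // beforeSave: capture the elapsed monotonic time
//	// ... checkpoint, then restore ...
//	s.baseTime = time.Now()              // afterLoad: restart the base reading
//
// Restored NowMonotonic readings therefore continue from where they left off.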
func (s *stdClock) beforeSave() { s.monotonicOffset = s.NowMonotonic() } // afterLoad is invoked by stateify. func (s *stdClock) afterLoad(context.Context) { s.baseTime = time.Now() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/tcpip.go000066400000000000000000002503411465435605700222160ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tcpip provides the interfaces and related types that users of the // tcpip stack will use in order to create endpoints used to send and receive // data over the network stack. // // The starting point is the creation and configuration of a stack. A stack can // be created by calling the New() function of the tcpip/stack/stack package; // configuring a stack involves creating NICs (via calls to Stack.CreateNIC()), // adding network addresses (via calls to Stack.AddProtocolAddress()), and // setting a route table (via a call to Stack.SetRouteTable()). // // Once a stack is configured, endpoints can be created by calling // Stack.NewEndpoint(). Such endpoints can be used to send/receive data, connect // to peers, listen for connections, accept connections, etc., depending on the // transport protocol selected. package tcpip import ( "bytes" "errors" "fmt" "io" "math" "math/bits" "math/rand" "net" "reflect" "strconv" "strings" "time" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // Using the header package here would cause an import cycle. const ( ipv4AddressSize = 4 ipv4ProtocolNumber = 0x0800 ipv6AddressSize = 16 ipv6ProtocolNumber = 0x86dd ) const ( // LinkAddressSize is the size of a MAC address. LinkAddressSize = 6 ) // Known IP address. var ( IPv4Zero = []byte{0, 0, 0, 0} IPv6Zero = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ) // Errors related to Subnet var ( errSubnetLengthMismatch = errors.New("subnet length of address and mask differ") errSubnetAddressMasked = errors.New("subnet address has bits set outside the mask") ) // ErrSaveRejection indicates a failed save due to unsupported networking state. // This type of errors is only used for save logic. type ErrSaveRejection struct { Err error } // Error returns a sensible description of the save rejection error. func (e *ErrSaveRejection) Error() string { return "save rejected due to unsupported networking state: " + e.Err.Error() } // MonotonicTime is a monotonic clock reading. // // +stateify savable type MonotonicTime struct { nanoseconds int64 } // String implements Stringer. func (mt MonotonicTime) String() string { return strconv.FormatInt(mt.nanoseconds, 10) } // MonotonicTimeInfinite returns the monotonic timestamp as far away in the // future as possible. func MonotonicTimeInfinite() MonotonicTime { return MonotonicTime{nanoseconds: math.MaxInt64} } // Before reports whether the monotonic clock reading mt is before u. 
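//
// A hedged sketch of MonotonicTime arithmetic (values illustrative):
//
//	var t0 MonotonicTime               // the zero reading
//	t1 := t0.Add(5 * time.Millisecond)
//	_ = t0.Before(t1)                  // true
//	_ = t1.Sub(t0)                     // 5ms
//	_ = t1.Milliseconds()              // 5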
func (mt MonotonicTime) Before(u MonotonicTime) bool { return mt.nanoseconds < u.nanoseconds } // After reports whether the monotonic clock reading mt is after u. func (mt MonotonicTime) After(u MonotonicTime) bool { return mt.nanoseconds > u.nanoseconds } // Add returns the monotonic clock reading mt+d. func (mt MonotonicTime) Add(d time.Duration) MonotonicTime { return MonotonicTime{ nanoseconds: time.Unix(0, mt.nanoseconds).Add(d).Sub(time.Unix(0, 0)).Nanoseconds(), } } // Sub returns the duration mt-u. If the result exceeds the maximum (or minimum) // value that can be stored in a Duration, the maximum (or minimum) duration // will be returned. To compute t-d for a duration d, use t.Add(-d). func (mt MonotonicTime) Sub(u MonotonicTime) time.Duration { return time.Unix(0, mt.nanoseconds).Sub(time.Unix(0, u.nanoseconds)) } // Milliseconds returns the time in milliseconds. func (mt MonotonicTime) Milliseconds() int64 { return mt.nanoseconds / 1e6 } // A Clock provides the current time and schedules work for execution. // // Times returned by a Clock should always be used for application-visible // time. Only monotonic times should be used for netstack internal timekeeping. type Clock interface { // Now returns the current local time. Now() time.Time // NowMonotonic returns the current monotonic clock reading. NowMonotonic() MonotonicTime // AfterFunc waits for the duration to elapse and then calls f in its own // goroutine. It returns a Timer that can be used to cancel the call using // its Stop method. AfterFunc(d time.Duration, f func()) Timer } // Timer represents a single event. A Timer must be created with // Clock.AfterFunc. type Timer interface { // Stop prevents the Timer from firing. It returns true if the call stops the // timer, false if the timer has already expired or been stopped. // // If Stop returns false, then the timer has already expired and the function // f of Clock.AfterFunc(d, f) has been started in its own goroutine; Stop // does not wait for f to complete before returning. If the caller needs to // know whether f is completed, it must coordinate with f explicitly. Stop() bool // Reset changes the timer to expire after duration d. // // Reset should be invoked only on stopped or expired timers. If the timer is // known to have expired, Reset can be used directly. Otherwise, the caller // must coordinate with the function f of Clock.AfterFunc(d, f). Reset(d time.Duration) } // Address is a byte slice cast as a string that represents the address of a // network node. Or, in the case of unix endpoints, it may represent a path. // // +stateify savable type Address struct { addr [16]byte length int } // AddrFrom4 converts addr to an Address. func AddrFrom4(addr [4]byte) Address { ret := Address{ length: 4, } // It's guaranteed that copy will return 4. copy(ret.addr[:], addr[:]) return ret } // AddrFrom4Slice converts addr to an Address. It panics if len(addr) != 4. func AddrFrom4Slice(addr []byte) Address { if len(addr) != 4 { panic(fmt.Sprintf("bad address length for address %v", addr)) } ret := Address{ length: 4, } // It's guaranteed that copy will return 4. copy(ret.addr[:], addr) return ret } // AddrFrom16 converts addr to an Address. func AddrFrom16(addr [16]byte) Address { ret := Address{ length: 16, } // It's guaranteed that copy will return 16. copy(ret.addr[:], addr[:]) return ret } // AddrFrom16Slice converts addr to an Address. It panics if len(addr) != 16. 
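//
// A hedged construction sketch (addresses illustrative):
//
//	v4 := AddrFrom4([4]byte{192, 0, 2, 1})
//	_ = v4.Len()    // 4
//	_ = v4.BitLen() // 32
//	v6 := AddrFrom16([16]byte{15: 1}) // ::1
//	_ = v6.Unspecified()              // false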
func AddrFrom16Slice(addr []byte) Address { if len(addr) != 16 { panic(fmt.Sprintf("bad address length for address %v", addr)) } ret := Address{ length: 16, } // It's guaranteed that copy will return 16. copy(ret.addr[:], addr) return ret } // AddrFromSlice converts addr to an Address. It returns the Address zero value // if len(addr) != 4 or 16. func AddrFromSlice(addr []byte) Address { switch len(addr) { case ipv4AddressSize: return AddrFrom4Slice(addr) case ipv6AddressSize: return AddrFrom16Slice(addr) } return Address{} } // As4 returns a as a 4 byte array. It panics if the address length is not 4. func (a Address) As4() [4]byte { if a.Len() != 4 { panic(fmt.Sprintf("bad address length for address %v", a.addr)) } return [4]byte(a.addr[:4]) } // As16 returns a as a 16 byte array. It panics if the address length is not 16. func (a Address) As16() [16]byte { if a.Len() != 16 { panic(fmt.Sprintf("bad address length for address %v", a.addr)) } return [16]byte(a.addr[:16]) } // AsSlice returns a as a byte slice. Callers should be careful as it can // return a window into existing memory. // // +checkescape func (a *Address) AsSlice() []byte { return a.addr[:a.length] } // BitLen returns the length in bits of a. func (a Address) BitLen() int { return a.Len() * 8 } // Len returns the length in bytes of a. func (a Address) Len() int { return a.length } // WithPrefix returns the address with a prefix that represents a point subnet. func (a Address) WithPrefix() AddressWithPrefix { return AddressWithPrefix{ Address: a, PrefixLen: a.BitLen(), } } // Unspecified returns true if the address is unspecified. func (a Address) Unspecified() bool { for _, b := range a.addr { if b != 0 { return false } } return true } // Equal returns whether a and other are equal. It exists for use by the cmp // library. func (a Address) Equal(other Address) bool { return a == other } // MatchingPrefix returns the matching prefix length in bits. // // Panics if b and a have different lengths. func (a Address) MatchingPrefix(b Address) uint8 { const bitsInAByte = 8 if a.Len() != b.Len() { panic(fmt.Sprintf("addresses %s and %s do not have the same length", a, b)) } var prefix uint8 for i := 0; i < a.length; i++ { aByte := a.addr[i] bByte := b.addr[i] if aByte == bByte { prefix += bitsInAByte continue } // Count the remaining matching bits in the byte from MSbit to LSBbit. mask := uint8(1) << (bitsInAByte - 1) for { if aByte&mask == bByte&mask { prefix++ mask >>= 1 continue } break } break } return prefix } // AddressMask is a bitmask for an address. // // +stateify savable type AddressMask struct { mask [16]byte length int } // MaskFrom returns a Mask based on str. // // MaskFrom may allocate, and so should not be in hot paths. func MaskFrom(str string) AddressMask { mask := AddressMask{length: len(str)} copy(mask.mask[:], str) return mask } // MaskFromBytes returns a Mask based on bs. func MaskFromBytes(bs []byte) AddressMask { mask := AddressMask{length: len(bs)} copy(mask.mask[:], bs) return mask } // String implements Stringer. func (m AddressMask) String() string { return fmt.Sprintf("%x", m.mask) } // AsSlice returns a as a byte slice. Callers should be careful as it can // return a window into existing memory. func (m *AddressMask) AsSlice() []byte { return []byte(m.mask[:m.length]) } // BitLen returns the length of the mask in bits. func (m AddressMask) BitLen() int { return m.length * 8 } // Len returns the length of the mask in bytes. 
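//
// A hedged sketch for a /24 IPv4 mask (values illustrative):
//
//	m := MaskFromBytes([]byte{255, 255, 255, 0})
//	_ = m.Prefix() // 24
//	_ = m.BitLen() // 32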
func (m AddressMask) Len() int { return m.length } // Prefix returns the number of bits before the first host bit. func (m AddressMask) Prefix() int { p := 0 for _, b := range m.mask[:m.length] { p += bits.LeadingZeros8(^b) } return p } // Equal returns whether m and other are equal. It exists for use by the cmp // library. func (m AddressMask) Equal(other AddressMask) bool { return m == other } // Subnet is a subnet defined by its address and mask. // // +stateify savable type Subnet struct { address Address mask AddressMask } // NewSubnet creates a new Subnet, checking that the address and mask are the same length. func NewSubnet(a Address, m AddressMask) (Subnet, error) { if a.Len() != m.Len() { return Subnet{}, errSubnetLengthMismatch } for i := 0; i < a.Len(); i++ { if a.addr[i]&^m.mask[i] != 0 { return Subnet{}, errSubnetAddressMasked } } return Subnet{a, m}, nil } // String implements Stringer. func (s Subnet) String() string { return fmt.Sprintf("%s/%d", s.ID(), s.Prefix()) } // Contains returns true iff the address is of the same length and matches the // subnet address and mask. func (s *Subnet) Contains(a Address) bool { if a.Len() != s.address.Len() { return false } for i := 0; i < a.Len(); i++ { if a.addr[i]&s.mask.mask[i] != s.address.addr[i] { return false } } return true } // ID returns the subnet ID. func (s *Subnet) ID() Address { return s.address } // Bits returns the number of ones (network bits) and zeros (host bits) in the // subnet mask. func (s *Subnet) Bits() (ones int, zeros int) { ones = s.mask.Prefix() return ones, s.mask.BitLen() - ones } // Prefix returns the number of bits before the first host bit. func (s *Subnet) Prefix() int { return s.mask.Prefix() } // Mask returns the subnet mask. func (s *Subnet) Mask() AddressMask { return s.mask } // Broadcast returns the subnet's broadcast address. func (s *Subnet) Broadcast() Address { addrCopy := s.address for i := 0; i < addrCopy.Len(); i++ { addrCopy.addr[i] |= ^s.mask.mask[i] } return addrCopy } // IsBroadcast returns true if the address is considered a broadcast address. func (s *Subnet) IsBroadcast(address Address) bool { // Only IPv4 supports the notion of a broadcast address. if address.Len() != ipv4AddressSize { return false } // Normally, we would just compare address with the subnet's broadcast // address but there is an exception where a simple comparison is not // correct. This exception is for /31 and /32 IPv4 subnets where all // addresses are considered valid host addresses. // // For /31 subnets, the case is easy. RFC 3021 Section 2.1 states that // both addresses in a /31 subnet "MUST be interpreted as host addresses." // // For /32, the case is a bit more vague. RFC 3021 makes no mention of /32 // subnets. However, the same reasoning applies - if an exception is not // made, then there do not exist any host addresses in a /32 subnet. RFC // 4632 Section 3.1 also vaguely implies this interpretation by referring // to addresses in /32 subnets as "host routes." return s.Prefix() <= 30 && s.Broadcast() == address } // Equal returns true if this Subnet is equal to the given Subnet. func (s Subnet) Equal(o Subnet) bool { // If this changes, update Route.Equal accordingly. return s == o } // NICID is a number that uniquely identifies a NIC. type NICID int32 // ShutdownFlags represents flags that can be passed to the Shutdown() method // of the Endpoint interface. type ShutdownFlags int // Values of the flags that can be passed to the Shutdown() method. They can // be OR'ed together. 
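//
// A hedged sketch (ep stands in for any Endpoint):
//
//	_ = ep.Shutdown(ShutdownRead | ShutdownWrite) // close both directions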
const ( ShutdownRead ShutdownFlags = 1 << iota ShutdownWrite ) // PacketType is used to indicate the destination of the packet. type PacketType uint8 const ( // PacketHost indicates a packet addressed to the local host. PacketHost PacketType = iota // PacketOtherHost indicates an outgoing packet addressed to // another host caught by a NIC in promiscuous mode. PacketOtherHost // PacketOutgoing for a packet originating from the local host // that is looped back to a packet socket. PacketOutgoing // PacketBroadcast indicates a link layer broadcast packet. PacketBroadcast // PacketMulticast indicates a link layer multicast packet. PacketMulticast ) // FullAddress represents a full transport node address, as required by the // Connect() and Bind() methods. // // +stateify savable type FullAddress struct { // NIC is the ID of the NIC this address refers to. // // This may not be used by all endpoint types. NIC NICID // Addr is the network address. Addr Address // Port is the transport port. // // This may not be used by all endpoint types. Port uint16 // LinkAddr is the link layer address. LinkAddr LinkAddress } // Payloader is an interface that provides data. // // This interface allows the endpoint to request the amount of data it needs // based on internal buffers without exposing them. type Payloader interface { io.Reader // Len returns the number of bytes of the unread portion of the // Reader. Len() int } var _ Payloader = (*bytes.Buffer)(nil) var _ Payloader = (*bytes.Reader)(nil) var _ io.Writer = (*SliceWriter)(nil) // SliceWriter implements io.Writer for slices. type SliceWriter []byte // Write implements io.Writer.Write. func (s *SliceWriter) Write(b []byte) (int, error) { n := copy(*s, b) *s = (*s)[n:] var err error if n != len(b) { err = io.ErrShortWrite } return n, err } var _ io.Writer = (*LimitedWriter)(nil) // A LimitedWriter writes to W but limits the amount of data copied to just N // bytes. Each call to Write updates N to reflect the new amount remaining. type LimitedWriter struct { W io.Writer N int64 } func (l *LimitedWriter) Write(p []byte) (int, error) { pLen := int64(len(p)) if pLen > l.N { p = p[:l.N] } n, err := l.W.Write(p) n64 := int64(n) if err == nil && n64 != pLen { err = io.ErrShortWrite } l.N -= n64 return n, err } // SendableControlMessages contains socket control messages that can be written. // // +stateify savable type SendableControlMessages struct { // HasTTL indicates whether TTL is valid/set. HasTTL bool // TTL is the IPv4 Time To Live of the associated packet. TTL uint8 // HasHopLimit indicates whether HopLimit is valid/set. HasHopLimit bool // HopLimit is the IPv6 Hop Limit of the associated packet. HopLimit uint8 // HasIPv6PacketInfo indicates whether IPv6PacketInfo is set. HasIPv6PacketInfo bool // IPv6PacketInfo holds interface and address data on an incoming packet. IPv6PacketInfo IPv6PacketInfo } // ReceivableControlMessages contains socket control messages that can be // received. // // +stateify savable type ReceivableControlMessages struct { // Timestamp is the time that the last packet used to create the read data // was received. Timestamp time.Time `state:".(int64)"` // HasInq indicates whether Inq is valid/set. HasInq bool // Inq is the number of bytes ready to be received. Inq int32 // HasTOS indicates whether TOS is valid/set. HasTOS bool // TOS is the IPv4 type of service of the associated packet. TOS uint8 // HasTTL indicates whether TTL is valid/set. HasTTL bool // TTL is the IPv4 Time To Live of the associated packet. 
TTL uint8 // HasHopLimit indicates whether HopLimit is valid/set. HasHopLimit bool // HopLimit is the IPv6 Hop Limit of the associated packet. HopLimit uint8 // HasTimestamp indicates whether Timestamp is valid/set. HasTimestamp bool // HasTClass indicates whether TClass is valid/set. HasTClass bool // TClass is the IPv6 traffic class of the associated packet. TClass uint32 // HasIPPacketInfo indicates whether PacketInfo is set. HasIPPacketInfo bool // PacketInfo holds interface and address data on an incoming packet. PacketInfo IPPacketInfo // HasIPv6PacketInfo indicates whether IPv6PacketInfo is set. HasIPv6PacketInfo bool // IPv6PacketInfo holds interface and address data on an incoming packet. IPv6PacketInfo IPv6PacketInfo // HasOriginalDestinationAddress indicates whether OriginalDstAddress is // set. HasOriginalDstAddress bool // OriginalDestinationAddress holds the original destination address // and port of the incoming packet. OriginalDstAddress FullAddress // SockErr is the dequeued socket error on recvmsg(MSG_ERRQUEUE). SockErr *SockError } // PacketOwner is used to get UID and GID of the packet. type PacketOwner interface { // KUID returns KUID of the packet. KUID() uint32 // KGID returns KGID of the packet. KGID() uint32 } // ReadOptions contains options for Endpoint.Read. type ReadOptions struct { // Peek indicates whether this read is a peek. Peek bool // NeedRemoteAddr indicates whether to return the remote address, if // supported. NeedRemoteAddr bool // NeedLinkPacketInfo indicates whether to return the link-layer information, // if supported. NeedLinkPacketInfo bool } // ReadResult represents result for a successful Endpoint.Read. type ReadResult struct { // Count is the number of bytes received and written to the buffer. Count int // Total is the number of bytes of the received packet. This can be used to // determine whether the read is truncated. Total int // ControlMessages is the control messages received. ControlMessages ReceivableControlMessages // RemoteAddr is the remote address if ReadOptions.NeedAddr is true. RemoteAddr FullAddress // LinkPacketInfo is the link-layer information of the received packet if // ReadOptions.NeedLinkPacketInfo is true. LinkPacketInfo LinkPacketInfo } // Endpoint is the interface implemented by transport protocols (e.g., tcp, udp) // that exposes functionality like read, write, connect, etc. to users of the // networking stack. type Endpoint interface { // Close puts the endpoint in a closed state and frees all resources // associated with it. Close initiates the teardown process, the // Endpoint may not be fully closed when Close returns. Close() // Abort initiates an expedited endpoint teardown. As compared to // Close, Abort prioritizes closing the Endpoint quickly over cleanly. // Abort is best effort; implementing Abort with Close is acceptable. Abort() // Read reads data from the endpoint and optionally writes to dst. // // This method does not block if there is no data pending; in this case, // ErrWouldBlock is returned. // // If non-zero number of bytes are successfully read and written to dst, err // must be nil. Otherwise, if dst failed to write anything, ErrBadBuffer // should be returned. Read(io.Writer, ReadOptions) (ReadResult, Error) // Write writes data to the endpoint's peer. This method does not block if // the data cannot be written. // // Unlike io.Writer.Write, Endpoint.Write transfers ownership of any bytes // successfully written to the Endpoint. 
That is, if a call to // Write(SlicePayload{data}) returns (n, err), it may retain data[:n], and // the caller should not use data[:n] after Write returns. // // Note that unlike io.Writer.Write, it is not an error for Write to // perform a partial write (if n > 0, no error may be returned). Only // stream (TCP) Endpoints may return partial writes, and even then only // in the case where writing additional data would block. Other Endpoints // will either write the entire message or return an error. Write(Payloader, WriteOptions) (int64, Error) // Connect connects the endpoint to its peer. Specifying a NIC is // optional. // // There are three classes of return values: // nil -- the attempt to connect succeeded. // ErrConnectStarted/ErrAlreadyConnecting -- the connect attempt started // but hasn't completed yet. In this case, the caller must call Connect // or GetSockOpt(ErrorOption) when the endpoint becomes writable to // get the actual result. The first call to Connect after the socket has // connected returns nil. Calling connect again results in ErrAlreadyConnected. // Anything else -- the attempt to connect failed. // // If address.Addr is empty, this means that Endpoint has to be // disconnected if this is supported, otherwise // ErrAddressFamilyNotSupported must be returned. Connect(address FullAddress) Error // Disconnect disconnects the endpoint from its peer. Disconnect() Error // Shutdown closes the read and/or write end of the endpoint connection // to its peer. Shutdown(flags ShutdownFlags) Error // Listen puts the endpoint in "listen" mode, which allows it to accept // new connections. Listen(backlog int) Error // Accept returns a new endpoint if a peer has established a connection // to an endpoint previously set to listen mode. This method does not // block if no new connections are available. // // The returned Queue is the wait queue for the newly created endpoint. // // If peerAddr is not nil then it is populated with the peer address of the // returned endpoint. Accept(peerAddr *FullAddress) (Endpoint, *waiter.Queue, Error) // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. Bind(address FullAddress) Error // GetLocalAddress returns the address to which the endpoint is bound. GetLocalAddress() (FullAddress, Error) // GetRemoteAddress returns the address to which the endpoint is // connected. GetRemoteAddress() (FullAddress, Error) // Readiness returns the current readiness of the endpoint. For example, // if waiter.EventIn is set, the endpoint is immediately readable. Readiness(mask waiter.EventMask) waiter.EventMask // SetSockOpt sets a socket option. SetSockOpt(opt SettableSocketOption) Error // SetSockOptInt sets a socket option, for simple cases where a value // has the int type. SetSockOptInt(opt SockOptInt, v int) Error // GetSockOpt gets a socket option. GetSockOpt(opt GettableSocketOption) Error // GetSockOptInt gets a socket option for simple cases where a return // value has the int type. GetSockOptInt(SockOptInt) (int, Error) // State returns a socket's lifecycle state. The returned value is // protocol-specific and is primarily used for diagnostics. State() uint32 // ModerateRecvBuf should be called everytime data is copied to the user // space. This allows for dynamic tuning of recv buffer space for a // given socket. // // NOTE: This method is a no-op for sockets other than TCP. ModerateRecvBuf(copied int) // Info returns a copy to the transport endpoint info. 
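// A hedged lifecycle sketch pulling together the methods above (error
// handling elided; ep, newEP and wq are illustrative):
//
//	_ = ep.Bind(FullAddress{Port: 80})
//	_ = ep.Listen(10)
//	newEP, wq, err := ep.Accept(nil) // nil: peer address not needed
//	_, _, _ = newEP, wq, err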
Info() EndpointInfo // Stats returns a reference to the endpoint stats. Stats() EndpointStats // SetOwner sets the task owner to the endpoint owner. SetOwner(owner PacketOwner) // LastError clears and returns the last error reported by the endpoint. LastError() Error // SocketOptions returns the structure which contains all the socket // level options. SocketOptions() *SocketOptions } // EndpointWithPreflight is the interface implemented by endpoints that need // to expose the `Preflight` method for preparing the endpoint prior to // calling `Write`. type EndpointWithPreflight interface { // Prepares the endpoint for writes using the provided WriteOptions, // returning an error if the options were incompatible with the endpoint's // current state. Preflight(WriteOptions) Error } // LinkPacketInfo holds Link layer information for a received packet. // // +stateify savable type LinkPacketInfo struct { // Protocol is the NetworkProtocolNumber for the packet. Protocol NetworkProtocolNumber // PktType is used to indicate the destination of the packet. PktType PacketType } // EndpointInfo is the interface implemented by each endpoint info struct. type EndpointInfo interface { // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo // marker interface. IsEndpointInfo() } // EndpointStats is the interface implemented by each endpoint stats struct. type EndpointStats interface { // IsEndpointStats is an empty method to implement the tcpip.EndpointStats // marker interface. IsEndpointStats() } // WriteOptions contains options for Endpoint.Write. type WriteOptions struct { // If To is not nil, write to the given address instead of the endpoint's // peer. To *FullAddress // More has the same semantics as Linux's MSG_MORE. More bool // EndOfRecord has the same semantics as Linux's MSG_EOR. EndOfRecord bool // Atomic means that all data fetched from Payloader must be written to the // endpoint. If Atomic is false, then data fetched from the Payloader may be // discarded if available endpoint buffer space is insufficient. Atomic bool // ControlMessages contains optional overrides used when writing a packet. ControlMessages SendableControlMessages } // SockOptInt represents socket options which values have the int type. type SockOptInt int const ( // KeepaliveCountOption is used by SetSockOptInt/GetSockOptInt to // specify the number of un-ACKed TCP keepalives that will be sent // before the connection is closed. KeepaliveCountOption SockOptInt = iota // IPv4TOSOption is used by SetSockOptInt/GetSockOptInt to specify TOS // for all subsequent outgoing IPv4 packets from the endpoint. IPv4TOSOption // IPv6TrafficClassOption is used by SetSockOptInt/GetSockOptInt to // specify TOS for all subsequent outgoing IPv6 packets from the // endpoint. IPv6TrafficClassOption // MaxSegOption is used by SetSockOptInt/GetSockOptInt to set/get the // current Maximum Segment Size(MSS) value as specified using the // TCP_MAXSEG option. MaxSegOption // MTUDiscoverOption is used to set/get the path MTU discovery setting. // // NOTE: Setting this option to any other value than PMTUDiscoveryDont // is not supported and will fail as such, and getting this option will // always return PMTUDiscoveryDont. MTUDiscoverOption // MulticastTTLOption is used by SetSockOptInt/GetSockOptInt to control // the default TTL value for multicast messages. The default is 1. MulticastTTLOption // ReceiveQueueSizeOption is used in GetSockOptInt to specify that the // number of unread bytes in the input buffer should be returned. 
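// A hedged sketch (ep stands in for any Endpoint):
//
//	unread, err := ep.GetSockOptInt(ReceiveQueueSizeOption) // bytes ready to read
//	_, _ = unread, err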
ReceiveQueueSizeOption // SendQueueSizeOption is used in GetSockOptInt to specify that the // number of unread bytes in the output buffer should be returned. SendQueueSizeOption // IPv4TTLOption is used by SetSockOptInt/GetSockOptInt to control the default // TTL value for unicast messages. // // The default is configured by DefaultTTLOption. A UseDefaultIPv4TTL value // configures the endpoint to use the default. IPv4TTLOption // IPv6HopLimitOption is used by SetSockOptInt/GetSockOptInt to control the // default hop limit value for unicast messages. // // The default is configured by DefaultTTLOption. A UseDefaultIPv6HopLimit // value configures the endpoint to use the default. IPv6HopLimitOption // TCPSynCountOption is used by SetSockOptInt/GetSockOptInt to specify // the number of SYN retransmits that TCP should send before aborting // the attempt to connect. It cannot exceed 255. // // NOTE: This option is currently only stubbed out and is no-op. TCPSynCountOption // TCPWindowClampOption is used by SetSockOptInt/GetSockOptInt to bound // the size of the advertised window to this value. // // NOTE: This option is currently only stubed out and is a no-op TCPWindowClampOption // IPv6Checksum is used to request the stack to populate and validate the IPv6 // checksum for transport level headers. IPv6Checksum ) const ( // UseDefaultIPv4TTL is the IPv4TTLOption value that configures an endpoint to // use the default ttl currently configured by the IPv4 protocol (see // DefaultTTLOption). UseDefaultIPv4TTL = 0 // UseDefaultIPv6HopLimit is the IPv6HopLimitOption value that configures an // endpoint to use the default hop limit currently configured by the IPv6 // protocol (see DefaultTTLOption). UseDefaultIPv6HopLimit = -1 ) // PMTUDStrategy is the kind of PMTUD to perform. type PMTUDStrategy int const ( // PMTUDiscoveryWant is a setting of the MTUDiscoverOption to use // per-route settings. PMTUDiscoveryWant PMTUDStrategy = iota // PMTUDiscoveryDont is a setting of the MTUDiscoverOption to disable // path MTU discovery. PMTUDiscoveryDont // PMTUDiscoveryDo is a setting of the MTUDiscoverOption to always do // path MTU discovery. PMTUDiscoveryDo // PMTUDiscoveryProbe is a setting of the MTUDiscoverOption to set DF // but ignore path MTU. PMTUDiscoveryProbe ) // GettableNetworkProtocolOption is a marker interface for network protocol // options that may be queried. type GettableNetworkProtocolOption interface { isGettableNetworkProtocolOption() } // SettableNetworkProtocolOption is a marker interface for network protocol // options that may be set. type SettableNetworkProtocolOption interface { isSettableNetworkProtocolOption() } // DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify // a default TTL. type DefaultTTLOption uint8 func (*DefaultTTLOption) isGettableNetworkProtocolOption() {} func (*DefaultTTLOption) isSettableNetworkProtocolOption() {} // GettableTransportProtocolOption is a marker interface for transport protocol // options that may be queried. type GettableTransportProtocolOption interface { isGettableTransportProtocolOption() } // SettableTransportProtocolOption is a marker interface for transport protocol // options that may be set. type SettableTransportProtocolOption interface { isSettableTransportProtocolOption() } // TCPSACKEnabled the SACK option for TCP. // // See: https://tools.ietf.org/html/rfc2018. 
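//
// A hedged sketch of enabling SACK stack-wide (assumes the stack package's
// SetTransportProtocolOption and the tcp package's ProtocolNumber; s is an
// illustrative *stack.Stack):
//
//	opt := TCPSACKEnabled(true)
//	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
//		// handle err
//	}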
type TCPSACKEnabled bool func (*TCPSACKEnabled) isGettableTransportProtocolOption() {} func (*TCPSACKEnabled) isSettableTransportProtocolOption() {} // TCPRecovery is the loss deteoction algorithm used by TCP. type TCPRecovery int32 func (*TCPRecovery) isGettableTransportProtocolOption() {} func (*TCPRecovery) isSettableTransportProtocolOption() {} // TCPAlwaysUseSynCookies indicates unconditional usage of syncookies. type TCPAlwaysUseSynCookies bool func (*TCPAlwaysUseSynCookies) isGettableTransportProtocolOption() {} func (*TCPAlwaysUseSynCookies) isSettableTransportProtocolOption() {} const ( // TCPRACKLossDetection indicates RACK is used for loss detection and // recovery. TCPRACKLossDetection TCPRecovery = 1 << iota // TCPRACKStaticReoWnd indicates the reordering window should not be // adjusted when DSACK is received. TCPRACKStaticReoWnd // TCPRACKNoDupTh indicates RACK should not consider the classic three // duplicate acknowledgements rule to mark the segments as lost. This // is used when reordering is not detected. TCPRACKNoDupTh ) // TCPDelayEnabled enables/disables Nagle's algorithm in TCP. type TCPDelayEnabled bool func (*TCPDelayEnabled) isGettableTransportProtocolOption() {} func (*TCPDelayEnabled) isSettableTransportProtocolOption() {} // TCPSendBufferSizeRangeOption is the send buffer size range for TCP. // // +stateify savable type TCPSendBufferSizeRangeOption struct { Min int Default int Max int } func (*TCPSendBufferSizeRangeOption) isGettableTransportProtocolOption() {} func (*TCPSendBufferSizeRangeOption) isSettableTransportProtocolOption() {} // TCPReceiveBufferSizeRangeOption is the receive buffer size range for TCP. // // +stateify savable type TCPReceiveBufferSizeRangeOption struct { Min int Default int Max int } func (*TCPReceiveBufferSizeRangeOption) isGettableTransportProtocolOption() {} func (*TCPReceiveBufferSizeRangeOption) isSettableTransportProtocolOption() {} // TCPAvailableCongestionControlOption is the supported congestion control // algorithms for TCP type TCPAvailableCongestionControlOption string func (*TCPAvailableCongestionControlOption) isGettableTransportProtocolOption() {} func (*TCPAvailableCongestionControlOption) isSettableTransportProtocolOption() {} // TCPModerateReceiveBufferOption enables/disables receive buffer moderation // for TCP. type TCPModerateReceiveBufferOption bool func (*TCPModerateReceiveBufferOption) isGettableTransportProtocolOption() {} func (*TCPModerateReceiveBufferOption) isSettableTransportProtocolOption() {} // GettableSocketOption is a marker interface for socket options that may be // queried. type GettableSocketOption interface { isGettableSocketOption() } // SettableSocketOption is a marker interface for socket options that may be // configured. type SettableSocketOption interface { isSettableSocketOption() } // ICMPv6Filter specifies a filter for ICMPv6 types. // // +stateify savable type ICMPv6Filter struct { // DenyType indicates if an ICMP type should be blocked. // // The ICMPv6 type field is 8 bits so there are up to 256 different ICMPv6 // types. DenyType [8]uint32 } // ShouldDeny returns true iff the ICMPv6 Type should be denied. func (f *ICMPv6Filter) ShouldDeny(icmpType uint8) bool { const bitsInUint32 = 32 i := icmpType / bitsInUint32 b := icmpType % bitsInUint32 return f.DenyType[i]&(1< 0 { _, _ = fmt.Fprintf(&out, " via %s", r.Gateway) } _, _ = fmt.Fprintf(&out, " nic %d", r.NIC) return out.String() } // Equal returns true if the given Route is equal to this Route. 
func (r Route) Equal(to Route) bool { // NOTE: This relies on the fact that r.Destination == to.Destination return r.Destination.Equal(to.Destination) && r.NIC == to.NIC } // TransportProtocolNumber is the number of a transport protocol. type TransportProtocolNumber uint32 // NetworkProtocolNumber is the EtherType of a network protocol in an Ethernet // frame. // // See: https://www.iana.org/assignments/ieee-802-numbers/ieee-802-numbers.xhtml type NetworkProtocolNumber uint32 // A StatCounter keeps track of a statistic. // // +stateify savable type StatCounter struct { count atomicbitops.Uint64 } // Increment adds one to the counter. func (s *StatCounter) Increment() { s.IncrementBy(1) } // Decrement minuses one to the counter. func (s *StatCounter) Decrement() { s.IncrementBy(^uint64(0)) } // Value returns the current value of the counter. func (s *StatCounter) Value() uint64 { return s.count.Load() } // IncrementBy increments the counter by v. func (s *StatCounter) IncrementBy(v uint64) { s.count.Add(v) } func (s *StatCounter) String() string { return strconv.FormatUint(s.Value(), 10) } // A MultiCounterStat keeps track of two counters at once. // // +stateify savable type MultiCounterStat struct { a *StatCounter b *StatCounter } // Init sets both internal counters to point to a and b. func (m *MultiCounterStat) Init(a, b *StatCounter) { m.a = a m.b = b } // Increment adds one to the counters. func (m *MultiCounterStat) Increment() { m.a.Increment() m.b.Increment() } // IncrementBy increments the counters by v. func (m *MultiCounterStat) IncrementBy(v uint64) { m.a.IncrementBy(v) m.b.IncrementBy(v) } // ICMPv4PacketStats enumerates counts for all ICMPv4 packet types. // // +stateify savable type ICMPv4PacketStats struct { // LINT.IfChange(ICMPv4PacketStats) // EchoRequest is the number of ICMPv4 echo packets counted. EchoRequest *StatCounter // EchoReply is the number of ICMPv4 echo reply packets counted. EchoReply *StatCounter // DstUnreachable is the number of ICMPv4 destination unreachable packets // counted. DstUnreachable *StatCounter // SrcQuench is the number of ICMPv4 source quench packets counted. SrcQuench *StatCounter // Redirect is the number of ICMPv4 redirect packets counted. Redirect *StatCounter // TimeExceeded is the number of ICMPv4 time exceeded packets counted. TimeExceeded *StatCounter // ParamProblem is the number of ICMPv4 parameter problem packets counted. ParamProblem *StatCounter // Timestamp is the number of ICMPv4 timestamp packets counted. Timestamp *StatCounter // TimestampReply is the number of ICMPv4 timestamp reply packets counted. TimestampReply *StatCounter // InfoRequest is the number of ICMPv4 information request packets counted. InfoRequest *StatCounter // InfoReply is the number of ICMPv4 information reply packets counted. InfoReply *StatCounter // LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4PacketStats) } // ICMPv4SentPacketStats collects outbound ICMPv4-specific stats. // // +stateify savable type ICMPv4SentPacketStats struct { // LINT.IfChange(ICMPv4SentPacketStats) ICMPv4PacketStats // Dropped is the number of ICMPv4 packets dropped due to link layer errors. Dropped *StatCounter // RateLimited is the number of ICMPv4 packets dropped due to rate limit being // exceeded. RateLimited *StatCounter // LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4SentPacketStats) } // ICMPv4ReceivedPacketStats collects inbound ICMPv4-specific stats. 
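// A hedged sketch of MultiCounterStat (the two StatCounters are illustrative,
// e.g. a per-protocol counter mirrored into an aggregate one):
//
//	var perProtocol, aggregate StatCounter
//	var m MultiCounterStat
//	m.Init(&perProtocol, &aggregate)
//	m.IncrementBy(3) // both counters now read 3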
// // +stateify savable type ICMPv4ReceivedPacketStats struct { // LINT.IfChange(ICMPv4ReceivedPacketStats) ICMPv4PacketStats // Invalid is the number of invalid ICMPv4 packets received. Invalid *StatCounter // LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4ReceivedPacketStats) } // ICMPv4Stats collects ICMPv4-specific stats. // // +stateify savable type ICMPv4Stats struct { // LINT.IfChange(ICMPv4Stats) // PacketsSent contains statistics about sent packets. PacketsSent ICMPv4SentPacketStats // PacketsReceived contains statistics about received packets. PacketsReceived ICMPv4ReceivedPacketStats // LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4Stats) } // ICMPv6PacketStats enumerates counts for all ICMPv6 packet types. // // +stateify savable type ICMPv6PacketStats struct { // LINT.IfChange(ICMPv6PacketStats) // EchoRequest is the number of ICMPv6 echo request packets counted. EchoRequest *StatCounter // EchoReply is the number of ICMPv6 echo reply packets counted. EchoReply *StatCounter // DstUnreachable is the number of ICMPv6 destination unreachable packets // counted. DstUnreachable *StatCounter // PacketTooBig is the number of ICMPv6 packet too big packets counted. PacketTooBig *StatCounter // TimeExceeded is the number of ICMPv6 time exceeded packets counted. TimeExceeded *StatCounter // ParamProblem is the number of ICMPv6 parameter problem packets counted. ParamProblem *StatCounter // RouterSolicit is the number of ICMPv6 router solicit packets counted. RouterSolicit *StatCounter // RouterAdvert is the number of ICMPv6 router advert packets counted. RouterAdvert *StatCounter // NeighborSolicit is the number of ICMPv6 neighbor solicit packets counted. NeighborSolicit *StatCounter // NeighborAdvert is the number of ICMPv6 neighbor advert packets counted. NeighborAdvert *StatCounter // RedirectMsg is the number of ICMPv6 redirect message packets counted. RedirectMsg *StatCounter // MulticastListenerQuery is the number of Multicast Listener Query messages // counted. MulticastListenerQuery *StatCounter // MulticastListenerReport is the number of Multicast Listener Report messages // counted. MulticastListenerReport *StatCounter // MulticastListenerReportV2 is the number of Multicast Listener Report // messages counted. MulticastListenerReportV2 *StatCounter // MulticastListenerDone is the number of Multicast Listener Done messages // counted. MulticastListenerDone *StatCounter // LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6PacketStats) } // ICMPv6SentPacketStats collects outbound ICMPv6-specific stats. // // +stateify savable type ICMPv6SentPacketStats struct { // LINT.IfChange(ICMPv6SentPacketStats) ICMPv6PacketStats // Dropped is the number of ICMPv6 packets dropped due to link layer errors. Dropped *StatCounter // RateLimited is the number of ICMPv6 packets dropped due to rate limit being // exceeded. RateLimited *StatCounter // LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6SentPacketStats) } // ICMPv6ReceivedPacketStats collects inbound ICMPv6-specific stats. // // +stateify savable type ICMPv6ReceivedPacketStats struct { // LINT.IfChange(ICMPv6ReceivedPacketStats) ICMPv6PacketStats // Unrecognized is the number of ICMPv6 packets received that the transport // layer does not know how to parse. Unrecognized *StatCounter // Invalid is the number of invalid ICMPv6 packets received. Invalid *StatCounter // RouterOnlyPacketsDroppedByHost is the number of ICMPv6 packets dropped due // to being router-specific packets. 
RouterOnlyPacketsDroppedByHost *StatCounter // LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6ReceivedPacketStats) } // ICMPv6Stats collects ICMPv6-specific stats. // // +stateify savable type ICMPv6Stats struct { // LINT.IfChange(ICMPv6Stats) // PacketsSent contains statistics about sent packets. PacketsSent ICMPv6SentPacketStats // PacketsReceived contains statistics about received packets. PacketsReceived ICMPv6ReceivedPacketStats // LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6Stats) } // ICMPStats collects ICMP-specific stats (both v4 and v6). // // +stateify savable type ICMPStats struct { // V4 contains the ICMPv4-specific stats. V4 ICMPv4Stats // V6 contains the ICMPv6-specific stats. V6 ICMPv6Stats } // IGMPPacketStats enumerates counts for all IGMP packet types. // // +stateify savable type IGMPPacketStats struct { // LINT.IfChange(IGMPPacketStats) // MembershipQuery is the number of Membership Query messages counted. MembershipQuery *StatCounter // V1MembershipReport is the number of Version 1 Membership Report messages // counted. V1MembershipReport *StatCounter // V2MembershipReport is the number of Version 2 Membership Report messages // counted. V2MembershipReport *StatCounter // V3MembershipReport is the number of Version 3 Membership Report messages // counted. V3MembershipReport *StatCounter // LeaveGroup is the number of Leave Group messages counted. LeaveGroup *StatCounter // LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPPacketStats) } // IGMPSentPacketStats collects outbound IGMP-specific stats. // // +stateify savable type IGMPSentPacketStats struct { // LINT.IfChange(IGMPSentPacketStats) IGMPPacketStats // Dropped is the number of IGMP packets dropped. Dropped *StatCounter // LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPSentPacketStats) } // IGMPReceivedPacketStats collects inbound IGMP-specific stats. // // +stateify savable type IGMPReceivedPacketStats struct { // LINT.IfChange(IGMPReceivedPacketStats) IGMPPacketStats // Invalid is the number of invalid IGMP packets received. Invalid *StatCounter // ChecksumErrors is the number of IGMP packets dropped due to bad checksums. ChecksumErrors *StatCounter // Unrecognized is the number of unrecognized messages counted; these are // silently ignored for forward-compatibility. Unrecognized *StatCounter // LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPReceivedPacketStats) } // IGMPStats collects IGMP-specific stats. // // +stateify savable type IGMPStats struct { // LINT.IfChange(IGMPStats) // PacketsSent contains statistics about sent packets. PacketsSent IGMPSentPacketStats // PacketsReceived contains statistics about received packets. PacketsReceived IGMPReceivedPacketStats // LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPStats) } // IPForwardingStats collects stats related to IP forwarding (both v4 and v6). // // +stateify savable type IPForwardingStats struct { // LINT.IfChange(IPForwardingStats) // Unrouteable is the number of IP packets received which were dropped // because a route to their destination could not be constructed. Unrouteable *StatCounter // ExhaustedTTL is the number of IP packets received which were dropped // because their TTL was exhausted. ExhaustedTTL *StatCounter // InitializingSource is the number of IP packets which were dropped // because they contained a source address that may only be used on the local // network as part of initialization work.
InitializingSource *StatCounter // LinkLocalSource is the number of IP packets which were dropped // because they contained a link-local source address. LinkLocalSource *StatCounter // LinkLocalDestination is the number of IP packets which were dropped // because they contained a link-local destination address. LinkLocalDestination *StatCounter // PacketTooBig is the number of IP packets which were dropped because they // were too big for the outgoing MTU. PacketTooBig *StatCounter // HostUnreachable is the number of IP packets received which could not be // successfully forwarded due to an unresolvable next hop. HostUnreachable *StatCounter // ExtensionHeaderProblem is the number of IP packets which were dropped // because of a problem encountered when processing an IPv6 extension // header. ExtensionHeaderProblem *StatCounter // UnexpectedMulticastInputInterface is the number of multicast packets that // were received on an interface that did not match the corresponding route's // expected input interface. UnexpectedMulticastInputInterface *StatCounter // UnknownOutputEndpoint is the number of packets that could not be forwarded // because the output endpoint could not be found. UnknownOutputEndpoint *StatCounter // NoMulticastPendingQueueBufferSpace is the number of multicast packets that // were dropped due to insufficient buffer space in the pending packet queue. NoMulticastPendingQueueBufferSpace *StatCounter // OutgoingDeviceNoBufferSpace is the number of packets that were dropped due // to insufficient space in the outgoing device. OutgoingDeviceNoBufferSpace *StatCounter // Errors is the number of IP packets received which could not be // successfully forwarded. Errors *StatCounter // LINT.ThenChange(network/internal/ip/stats.go:MultiCounterIPForwardingStats) } // IPStats collects IP-specific stats (both v4 and v6). // // +stateify savable type IPStats struct { // LINT.IfChange(IPStats) // PacketsReceived is the number of IP packets received from the link layer. PacketsReceived *StatCounter // ValidPacketsReceived is the number of valid IP packets that reached the IP // layer. ValidPacketsReceived *StatCounter // DisabledPacketsReceived is the number of IP packets received from the link // layer when the IP layer is disabled. DisabledPacketsReceived *StatCounter // InvalidDestinationAddressesReceived is the number of IP packets received // with an unknown or invalid destination address. InvalidDestinationAddressesReceived *StatCounter // InvalidSourceAddressesReceived is the number of IP packets received with a // source address that should never have been received on the wire. InvalidSourceAddressesReceived *StatCounter // PacketsDelivered is the number of incoming IP packets that are successfully // delivered to the transport layer. PacketsDelivered *StatCounter // PacketsSent is the number of IP packets sent via WritePacket. PacketsSent *StatCounter // OutgoingPacketErrors is the number of IP packets which failed to write to a // link-layer endpoint. OutgoingPacketErrors *StatCounter // MalformedPacketsReceived is the number of IP Packets that were dropped due // to the IP packet header failing validation checks. MalformedPacketsReceived *StatCounter // MalformedFragmentsReceived is the number of IP Fragments that were dropped // due to the fragment failing validation checks. MalformedFragmentsReceived *StatCounter // IPTablesPreroutingDropped is the number of IP packets dropped in the // Prerouting chain. 
IPTablesPreroutingDropped *StatCounter // IPTablesInputDropped is the number of IP packets dropped in the Input // chain. IPTablesInputDropped *StatCounter // IPTablesForwardDropped is the number of IP packets dropped in the Forward // chain. IPTablesForwardDropped *StatCounter // IPTablesOutputDropped is the number of IP packets dropped in the Output // chain. IPTablesOutputDropped *StatCounter // IPTablesPostroutingDropped is the number of IP packets dropped in the // Postrouting chain. IPTablesPostroutingDropped *StatCounter // TODO(https://gvisor.dev/issues/5529): Move the IPv4-only option stats out // of IPStats. // OptionTimestampReceived is the number of Timestamp options seen. OptionTimestampReceived *StatCounter // OptionRecordRouteReceived is the number of Record Route options seen. OptionRecordRouteReceived *StatCounter // OptionRouterAlertReceived is the number of Router Alert options seen. OptionRouterAlertReceived *StatCounter // OptionUnknownReceived is the number of unknown IP options seen. OptionUnknownReceived *StatCounter // Forwarding collects stats related to IP forwarding. Forwarding IPForwardingStats // LINT.ThenChange(network/internal/ip/stats.go:MultiCounterIPStats) } // ARPStats collects ARP-specific stats. // // +stateify savable type ARPStats struct { // LINT.IfChange(ARPStats) // PacketsReceived is the number of ARP packets received from the link layer. PacketsReceived *StatCounter // DisabledPacketsReceived is the number of ARP packets received from the link // layer when the ARP layer is disabled. DisabledPacketsReceived *StatCounter // MalformedPacketsReceived is the number of ARP packets that were dropped due // to being malformed. MalformedPacketsReceived *StatCounter // RequestsReceived is the number of ARP requests received. RequestsReceived *StatCounter // RequestsReceivedUnknownTargetAddress is the number of ARP requests that // were targeted to an interface different from the one it was received on. RequestsReceivedUnknownTargetAddress *StatCounter // OutgoingRequestInterfaceHasNoLocalAddressErrors is the number of failures // to send an ARP request because the interface has no network address // assigned to it. OutgoingRequestInterfaceHasNoLocalAddressErrors *StatCounter // OutgoingRequestBadLocalAddressErrors is the number of failures to send an // ARP request with a bad local address. OutgoingRequestBadLocalAddressErrors *StatCounter // OutgoingRequestsDropped is the number of ARP requests which failed to write // to a link-layer endpoint. OutgoingRequestsDropped *StatCounter // OutgoingRequestSent is the number of ARP requests successfully written to a // link-layer endpoint. OutgoingRequestsSent *StatCounter // RepliesReceived is the number of ARP replies received. RepliesReceived *StatCounter // OutgoingRepliesDropped is the number of ARP replies which failed to write // to a link-layer endpoint. OutgoingRepliesDropped *StatCounter // OutgoingRepliesSent is the number of ARP replies successfully written to a // link-layer endpoint. OutgoingRepliesSent *StatCounter // LINT.ThenChange(network/arp/stats.go:multiCounterARPStats) } // TCPStats collects TCP-specific stats. // // +stateify savable type TCPStats struct { // ActiveConnectionOpenings is the number of connections opened // successfully via Connect. ActiveConnectionOpenings *StatCounter // PassiveConnectionOpenings is the number of connections opened // successfully via Listen. 
PassiveConnectionOpenings *StatCounter // CurrentEstablished is the number of TCP connections for which the // current state is ESTABLISHED. CurrentEstablished *StatCounter // CurrentConnected is the number of TCP connections that // are in connected state. CurrentConnected *StatCounter // EstablishedResets is the number of times TCP connections have made // a direct transition to the CLOSED state from either the // ESTABLISHED state or the CLOSE-WAIT state. EstablishedResets *StatCounter // EstablishedClosed is the number of times established TCP connections // made a transition to CLOSED state. EstablishedClosed *StatCounter // EstablishedTimedout is the number of times an established connection // was reset because of keep-alive time out. EstablishedTimedout *StatCounter // ListenOverflowSynDrop is the number of times the listen queue overflowed // and a SYN was dropped. ListenOverflowSynDrop *StatCounter // ListenOverflowAckDrop is the number of times the final ACK // in the handshake was dropped due to overflow. ListenOverflowAckDrop *StatCounter // ListenOverflowCookieSent is the number of times a SYN cookie was sent. ListenOverflowSynCookieSent *StatCounter // ListenOverflowSynCookieRcvd is the number of times a valid SYN // cookie was received. ListenOverflowSynCookieRcvd *StatCounter // ListenOverflowInvalidSynCookieRcvd is the number of times an invalid SYN cookie // was received. ListenOverflowInvalidSynCookieRcvd *StatCounter // FailedConnectionAttempts is the number of calls to Connect or Listen // (active and passive openings, respectively) that end in an error. FailedConnectionAttempts *StatCounter // ValidSegmentsReceived is the number of TCP segments received that // the transport layer successfully parsed. ValidSegmentsReceived *StatCounter // InvalidSegmentsReceived is the number of TCP segments received that // the transport layer could not parse. InvalidSegmentsReceived *StatCounter // SegmentsSent is the number of TCP segments sent. SegmentsSent *StatCounter // SegmentSendErrors is the number of TCP segments failed to be sent. SegmentSendErrors *StatCounter // ResetsSent is the number of TCP resets sent. ResetsSent *StatCounter // ResetsReceived is the number of TCP resets received. ResetsReceived *StatCounter // Retransmits is the number of TCP segments retransmitted. Retransmits *StatCounter // FastRecovery is the number of times Fast Recovery was used to // recover from packet loss. FastRecovery *StatCounter // SACKRecovery is the number of times SACK Recovery was used to // recover from packet loss. SACKRecovery *StatCounter // TLPRecovery is the number of times recovery was accomplished by the tail // loss probe. TLPRecovery *StatCounter // SlowStartRetransmits is the number of segments retransmitted in slow // start. SlowStartRetransmits *StatCounter // FastRetransmit is the number of segments retransmitted in fast // recovery. FastRetransmit *StatCounter // Timeouts is the number of times the RTO expired. Timeouts *StatCounter // ChecksumErrors is the number of segments dropped due to bad checksums. ChecksumErrors *StatCounter // FailedPortReservations is the number of times TCP failed to reserve // a port. FailedPortReservations *StatCounter // SegmentsAckedWithDSACK is the number of segments acknowledged with // DSACK. SegmentsAckedWithDSACK *StatCounter // SpuriousRecovery is the number of times the connection entered loss // recovery spuriously. SpuriousRecovery *StatCounter // SpuriousRTORecovery is the number of spurious RTOs. 
SpuriousRTORecovery *StatCounter // ForwardMaxInFlightDrop is the number of connection requests that are // dropped due to exceeding the maximum number of in-flight connection // requests. ForwardMaxInFlightDrop *StatCounter } // UDPStats collects UDP-specific stats. // // +stateify savable type UDPStats struct { // PacketsReceived is the number of UDP datagrams received via // HandlePacket. PacketsReceived *StatCounter // UnknownPortErrors is the number of incoming UDP datagrams dropped // because they did not have a known destination port. UnknownPortErrors *StatCounter // ReceiveBufferErrors is the number of incoming UDP datagrams dropped // due to the receiving buffer being in an invalid state. ReceiveBufferErrors *StatCounter // MalformedPacketsReceived is the number of incoming UDP datagrams // dropped due to the UDP header being in a malformed state. MalformedPacketsReceived *StatCounter // PacketsSent is the number of UDP datagrams sent via sendUDP. PacketsSent *StatCounter // PacketSendErrors is the number of datagrams failed to be sent. PacketSendErrors *StatCounter // ChecksumErrors is the number of datagrams dropped due to bad checksums. ChecksumErrors *StatCounter } // NICNeighborStats holds metrics for the neighbor table. // // +stateify savable type NICNeighborStats struct { // LINT.IfChange(NICNeighborStats) // UnreachableEntryLookups counts the number of lookups performed on an // entry in Unreachable state. UnreachableEntryLookups *StatCounter // DroppedConfirmationForNoninitiatedNeighbor counts the number of neighbor // responses that were dropped because they didn't match an entry in the // cache. DroppedConfirmationForNoninitiatedNeighbor *StatCounter // DroppedInvalidLinkAddressConfirmations counts the number of neighbor // responses that were ignored because they had an invalid source link-layer // address. DroppedInvalidLinkAddressConfirmations *StatCounter // LINT.ThenChange(stack/nic_stats.go:multiCounterNICNeighborStats) } // NICPacketStats holds basic packet statistics. // // +stateify savable type NICPacketStats struct { // LINT.IfChange(NICPacketStats) // Packets is the number of packets counted. Packets *StatCounter // Bytes is the number of bytes counted. Bytes *StatCounter // LINT.ThenChange(stack/nic_stats.go:multiCounterNICPacketStats) } // IntegralStatCounterMap holds a map associating integral keys with // StatCounters. // // +stateify savable type IntegralStatCounterMap struct { mu sync.RWMutex `state:"nosave"` // +checklocks:mu counterMap map[uint64]*StatCounter } // Keys returns all keys present in the map. func (m *IntegralStatCounterMap) Keys() []uint64 { m.mu.RLock() defer m.mu.RUnlock() var keys []uint64 for k := range m.counterMap { keys = append(keys, k) } return keys } // Get returns the counter mapped by the provided key. func (m *IntegralStatCounterMap) Get(key uint64) (*StatCounter, bool) { m.mu.RLock() defer m.mu.RUnlock() counter, ok := m.counterMap[key] return counter, ok } // Init initializes the map. func (m *IntegralStatCounterMap) Init() { m.mu.Lock() defer m.mu.Unlock() m.counterMap = make(map[uint64]*StatCounter) } // Increment increments the counter associated with the provided key. 
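// It takes the read lock on the common path of an already-known key and only
// takes the write lock to allocate a counter the first time a key is seen.
// A minimal usage sketch (hypothetical caller code; the key value is
// arbitrary and not part of this package's API):
//
//	var m IntegralStatCounterMap
//	m.Init()
//	m.Increment(17) // e.g. an unknown transport protocol number
//	if c, ok := m.Get(17); ok {
//		_ = c.Value() // 1
//	}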
func (m *IntegralStatCounterMap) Increment(key uint64) { m.mu.RLock() counter, ok := m.counterMap[key] m.mu.RUnlock() if !ok { m.mu.Lock() counter, ok = m.counterMap[key] if !ok { counter = new(StatCounter) m.counterMap[key] = counter } m.mu.Unlock() } counter.Increment() } // A MultiIntegralStatCounterMap keeps track of two integral counter maps at // once. // // +stateify savable type MultiIntegralStatCounterMap struct { a *IntegralStatCounterMap b *IntegralStatCounterMap } // Init sets the internal integral counter maps to point to a and b. func (m *MultiIntegralStatCounterMap) Init(a, b *IntegralStatCounterMap) { m.a = a m.b = b } // Increment increments the counter in each map corresponding to the // provided key. func (m *MultiIntegralStatCounterMap) Increment(key uint64) { m.a.Increment(key) m.b.Increment(key) } // NICStats holds NIC statistics. // // +stateify savable type NICStats struct { // LINT.IfChange(NICStats) // UnknownL3ProtocolRcvdPacketCounts records the number of packets received // for each unknown or unsupported network protocol number. UnknownL3ProtocolRcvdPacketCounts *IntegralStatCounterMap // UnknownL4ProtocolRcvdPacketCounts records the number of packets received // for each unknown or unsupported transport protocol number. UnknownL4ProtocolRcvdPacketCounts *IntegralStatCounterMap // MalformedL4RcvdPackets is the number of packets received by a NIC that // could not be delivered to a transport endpoint because the L4 header could // not be parsed. MalformedL4RcvdPackets *StatCounter // Tx contains statistics about transmitted packets. Tx NICPacketStats // TxPacketsDroppedNoBufferSpace is the number of packets dropped due to the // NIC not having enough buffer space to send the packet. // // Packets may be dropped with a no buffer space error when the device TX // queue is full. TxPacketsDroppedNoBufferSpace *StatCounter // Rx contains statistics about received packets. Rx NICPacketStats // DisabledRx contains statistics about received packets on disabled NICs. DisabledRx NICPacketStats // Neighbor contains statistics about neighbor entries. Neighbor NICNeighborStats // LINT.ThenChange(stack/nic_stats.go:multiCounterNICStats) } // FillIn returns a copy of s with nil fields initialized to new StatCounters. func (s NICStats) FillIn() NICStats { InitStatCounters(reflect.ValueOf(&s).Elem()) return s } // Stats holds statistics about the networking stack. // // +stateify savable type Stats struct { // TODO(https://gvisor.dev/issues/5986): Make the DroppedPackets stat less // ambiguous. // DroppedPackets is the number of packets dropped at the transport layer. DroppedPackets *StatCounter // NICs is an aggregation of every NIC's statistics. These should not be // incremented using this field, but using the relevant NIC multicounters. NICs NICStats // ICMP is an aggregation of every NetworkEndpoint's ICMP statistics (both v4 // and v6). These should not be incremented using this field, but using the // relevant NetworkEndpoint ICMP multicounters. ICMP ICMPStats // IGMP is an aggregation of every NetworkEndpoint's IGMP statistics. These // should not be incremented using this field, but using the relevant // NetworkEndpoint IGMP multicounters. IGMP IGMPStats // IP is an aggregation of every NetworkEndpoint's IP statistics. These should // not be incremented using this field, but using the relevant NetworkEndpoint // IP multicounters. IP IPStats // ARP is an aggregation of every NetworkEndpoint's ARP statistics.
These // should not be incremented using this field, but using the relevant // NetworkEndpoint ARP multicounters. ARP ARPStats // TCP holds TCP-specific stats. TCP TCPStats // UDP holds UDP-specific stats. UDP UDPStats } // ReceiveErrors collects packet receive errors within transport endpoint. // // +stateify savable type ReceiveErrors struct { // ReceiveBufferOverflow is the number of received packets dropped // due to the receive buffer being full. ReceiveBufferOverflow StatCounter // MalformedPacketsReceived is the number of incoming packets // dropped due to the packet header being in a malformed state. MalformedPacketsReceived StatCounter // ClosedReceiver is the number of received packets dropped because // of receiving endpoint state being closed. ClosedReceiver StatCounter // ChecksumErrors is the number of packets dropped due to bad checksums. ChecksumErrors StatCounter } // SendErrors collects packet send errors within the transport layer for an // endpoint. // // +stateify savable type SendErrors struct { // SendToNetworkFailed is the number of packets failed to be written to // the network endpoint. SendToNetworkFailed StatCounter // NoRoute is the number of times we failed to resolve IP route. NoRoute StatCounter } // ReadErrors collects segment read errors from an endpoint read call. // // +stateify savable type ReadErrors struct { // ReadClosed is the number of received packet drops because the endpoint // was shutdown for read. ReadClosed StatCounter // InvalidEndpointState is the number of times we found the endpoint state // to be unexpected. InvalidEndpointState StatCounter // NotConnected is the number of times we tried to read but found that the // endpoint was not connected. NotConnected StatCounter } // WriteErrors collects packet write errors from an endpoint write call. // // +stateify savable type WriteErrors struct { // WriteClosed is the number of packet drops because the endpoint // was shutdown for write. WriteClosed StatCounter // InvalidEndpointState is the number of times we found the endpoint state // to be unexpected. InvalidEndpointState StatCounter // InvalidArgs is the number of times invalid input arguments were // provided for endpoint Write call. InvalidArgs StatCounter } // TransportEndpointStats collects statistics about the endpoint. // // +stateify savable type TransportEndpointStats struct { // PacketsReceived is the number of successful packet receives. PacketsReceived StatCounter // PacketsSent is the number of successful packet sends. PacketsSent StatCounter // ReceiveErrors collects packet receive errors within transport layer. ReceiveErrors ReceiveErrors // ReadErrors collects packet read errors from an endpoint read call. ReadErrors ReadErrors // SendErrors collects packet send errors within the transport layer. SendErrors SendErrors // WriteErrors collects packet write errors from an endpoint write call. WriteErrors WriteErrors } // IsEndpointStats is an empty method to implement the tcpip.EndpointStats // marker interface. func (*TransportEndpointStats) IsEndpointStats() {} // InitStatCounters initializes v's fields with nil StatCounter fields to new // StatCounters. 
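// It walks v recursively with reflection, allocating every *StatCounter and
// *IntegralStatCounterMap field that is still nil, so callers can increment
// counters without nil checks. A minimal usage sketch (hypothetical caller
// code) via the FillIn helpers defined in this file:
//
//	s := Stats{}.FillIn()
//	s.TCP.SegmentsSent.Increment() // safe: the counter was allocated by FillIn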
func InitStatCounters(v reflect.Value) { for i := 0; i < v.NumField(); i++ { v := v.Field(i) if s, ok := v.Addr().Interface().(**StatCounter); ok { if *s == nil { *s = new(StatCounter) } } else if s, ok := v.Addr().Interface().(**IntegralStatCounterMap); ok { if *s == nil { *s = new(IntegralStatCounterMap) (*s).Init() } } else { InitStatCounters(v) } } } // FillIn returns a copy of s with nil fields initialized to new StatCounters. func (s Stats) FillIn() Stats { InitStatCounters(reflect.ValueOf(&s).Elem()) return s } // Clone clones a copy of the TransportEndpointStats into dst by atomically // reading each field. func (src *TransportEndpointStats) Clone(dst *TransportEndpointStats) { clone(reflect.ValueOf(dst).Elem(), reflect.ValueOf(src).Elem()) } func clone(dst reflect.Value, src reflect.Value) { for i := 0; i < dst.NumField(); i++ { d := dst.Field(i) s := src.Field(i) if c, ok := s.Addr().Interface().(*StatCounter); ok { d.Addr().Interface().(*StatCounter).IncrementBy(c.Value()) } else { clone(d, s) } } } // String implements the fmt.Stringer interface. func (a Address) String() string { switch l := a.Len(); l { case 4: return fmt.Sprintf("%d.%d.%d.%d", int(a.addr[0]), int(a.addr[1]), int(a.addr[2]), int(a.addr[3])) case 16: // Find the longest subsequence of hexadecimal zeros. start, end := -1, -1 for i := 0; i < a.Len(); i += 2 { j := i for j < a.Len() && a.addr[j] == 0 && a.addr[j+1] == 0 { j += 2 } if j > i+2 && j-i > end-start { start, end = i, j } } var b strings.Builder for i := 0; i < a.Len(); i += 2 { if i == start { b.WriteString("::") i = end if end >= a.Len() { break } } else if i > 0 { b.WriteByte(':') } v := uint16(a.addr[i+0])<<8 | uint16(a.addr[i+1]) if v == 0 { b.WriteByte('0') } else { const digits = "0123456789abcdef" for i := uint(3); i < 4; i-- { if v := v >> (i * 4); v != 0 { b.WriteByte(digits[v&0xf]) } } } } return b.String() default: return fmt.Sprintf("%x", a.addr[:l]) } } // To4 converts the IPv4 address to a 4-byte representation. // If the address is not an IPv4 address, To4 returns the empty Address. func (a Address) To4() Address { const ( ipv4len = 4 ipv6len = 16 ) if a.Len() == ipv4len { return a } if a.Len() == ipv6len && isZeros(a.addr[:10]) && a.addr[10] == 0xff && a.addr[11] == 0xff { return AddrFrom4Slice(a.addr[12:16]) } return Address{} } // isZeros reports whether addr is all zeros. func isZeros(addr []byte) bool { for _, b := range addr { if b != 0 { return false } } return true } // LinkAddress is a byte slice cast as a string that represents a link address. // It is typically a 6-byte MAC address. type LinkAddress string // String implements the fmt.Stringer interface. func (a LinkAddress) String() string { switch len(a) { case 6: return fmt.Sprintf("%02x:%02x:%02x:%02x:%02x:%02x", a[0], a[1], a[2], a[3], a[4], a[5]) default: return fmt.Sprintf("%x", []byte(a)) } } // ParseMACAddress parses an IEEE 802 address. // // It must be in the format aa:bb:cc:dd:ee:ff or aa-bb-cc-dd-ee-ff. func ParseMACAddress(s string) (LinkAddress, error) { parts := strings.FieldsFunc(s, func(c rune) bool { return c == ':' || c == '-' }) if len(parts) != LinkAddressSize { return "", fmt.Errorf("inconsistent parts: %s", s) } addr := make([]byte, 0, len(parts)) for _, part := range parts { u, err := strconv.ParseUint(part, 16, 8) if err != nil { return "", fmt.Errorf("invalid hex digits: %s", s) } addr = append(addr, byte(u)) } return LinkAddress(addr), nil } // GetRandMacAddr returns a mac address that can be used for local virtual devices. 
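// The first octet always has the IEEE 802 locally-administered bit set and the
// multicast bit cleared, so the result is a unicast address that cannot clash
// with vendor-assigned MACs. A minimal usage sketch (hypothetical caller code):
//
//	addr := GetRandMacAddr()
//	fmt.Println(addr) // e.g. "7a:3c:91:02:5e:d4"; the second hex digit is always 2, 6, a or e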
func GetRandMacAddr() LinkAddress { mac := make(net.HardwareAddr, LinkAddressSize) rand.Read(mac) // Fill with random data. mac[0] &^= 0x1 // Clear multicast bit. mac[0] |= 0x2 // Set local assignment bit (IEEE802). return LinkAddress(mac) } // AddressWithPrefix is an address with its subnet prefix length. // // +stateify savable type AddressWithPrefix struct { // Address is a network address. Address Address // PrefixLen is the subnet prefix length. PrefixLen int } // String implements the fmt.Stringer interface. func (a AddressWithPrefix) String() string { return fmt.Sprintf("%s/%d", a.Address, a.PrefixLen) } // Subnet converts the address and prefix into a Subnet value and returns it. func (a AddressWithPrefix) Subnet() Subnet { addrLen := a.Address.length if a.PrefixLen <= 0 { return Subnet{ address: Address{length: addrLen}, mask: AddressMask{length: addrLen}, } } if a.PrefixLen >= addrLen*8 { sub := Subnet{ address: a.Address, mask: AddressMask{length: addrLen}, } for i := 0; i < addrLen; i++ { sub.mask.mask[i] = 0xff } return sub } sa := Address{length: addrLen} sm := AddressMask{length: addrLen} n := uint(a.PrefixLen) for i := 0; i < addrLen; i++ { if n >= 8 { sa.addr[i] = a.Address.addr[i] sm.mask[i] = 0xff n -= 8 continue } sm.mask[i] = ^byte(0xff >> n) sa.addr[i] = a.Address.addr[i] & sm.mask[i] n = 0 } // For extra caution, call NewSubnet rather than directly creating the Subnet // value. If that fails it indicates a serious bug in this code, so panic is // in order. s, err := NewSubnet(sa, sm) if err != nil { panic("invalid subnet: " + err.Error()) } return s } // ProtocolAddress is an address and the network protocol it is associated // with. // // +stateify savable type ProtocolAddress struct { // Protocol is the protocol of the address. Protocol NetworkProtocolNumber // AddressWithPrefix is a network address with its subnet prefix length. AddressWithPrefix AddressWithPrefix } var ( // danglingEndpointsMu protects access to danglingEndpoints. danglingEndpointsMu sync.Mutex // danglingEndpoints tracks all dangling endpoints no longer owned by the app. danglingEndpoints = make(map[Endpoint]struct{}) ) // GetDanglingEndpoints returns all dangling endpoints. func GetDanglingEndpoints() []Endpoint { danglingEndpointsMu.Lock() es := make([]Endpoint, 0, len(danglingEndpoints)) for e := range danglingEndpoints { es = append(es, e) } danglingEndpointsMu.Unlock() return es } // ReleaseDanglingEndpoints clears out all reference counted objects held by // dangling endpoints. func ReleaseDanglingEndpoints() { // Get the dangling endpoints first to avoid locking around Release(), which // can cause a lock inversion with endpoint.mu and danglingEndpointsMu. // Calling Release on a dangling endpoint that has been deleted is a noop. eps := GetDanglingEndpoints() for _, ep := range eps { ep.Abort() } } // AddDanglingEndpoint adds a dangling endpoint. func AddDanglingEndpoint(e Endpoint) { danglingEndpointsMu.Lock() danglingEndpoints[e] = struct{}{} danglingEndpointsMu.Unlock() } // DeleteDanglingEndpoint removes a dangling endpoint. func DeleteDanglingEndpoint(e Endpoint) { danglingEndpointsMu.Lock() delete(danglingEndpoints, e) danglingEndpointsMu.Unlock() } // AsyncLoading is the global barrier for asynchronous endpoint loading // activities. var AsyncLoading sync.WaitGroup golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/tcpip_linux_state_autogen.go000066400000000000000000000001311465435605700263500ustar00rootroot00000000000000// automatically generated by stateify.
//go:build linux // +build linux package tcpip golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/tcpip_state.go000066400000000000000000000015301465435605700234100ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcpip import ( "context" "time" ) func (c *ReceivableControlMessages) saveTimestamp() int64 { return c.Timestamp.UnixNano() } func (c *ReceivableControlMessages) loadTimestamp(_ context.Context, nsec int64) { c.Timestamp = time.Unix(0, nsec) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/tcpip_state_autogen.go000066400000000000000000002561301465435605700251420ustar00rootroot00000000000000// automatically generated by stateify. package tcpip import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *ErrAborted) StateTypeName() string { return "pkg/tcpip.ErrAborted" } func (e *ErrAborted) StateFields() []string { return []string{} } func (e *ErrAborted) beforeSave() {} // +checklocksignore func (e *ErrAborted) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrAborted) afterLoad(context.Context) {} // +checklocksignore func (e *ErrAborted) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrAddressFamilyNotSupported) StateTypeName() string { return "pkg/tcpip.ErrAddressFamilyNotSupported" } func (e *ErrAddressFamilyNotSupported) StateFields() []string { return []string{} } func (e *ErrAddressFamilyNotSupported) beforeSave() {} // +checklocksignore func (e *ErrAddressFamilyNotSupported) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrAddressFamilyNotSupported) afterLoad(context.Context) {} // +checklocksignore func (e *ErrAddressFamilyNotSupported) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrAlreadyBound) StateTypeName() string { return "pkg/tcpip.ErrAlreadyBound" } func (e *ErrAlreadyBound) StateFields() []string { return []string{} } func (e *ErrAlreadyBound) beforeSave() {} // +checklocksignore func (e *ErrAlreadyBound) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrAlreadyBound) afterLoad(context.Context) {} // +checklocksignore func (e *ErrAlreadyBound) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrAlreadyConnected) StateTypeName() string { return "pkg/tcpip.ErrAlreadyConnected" } func (e *ErrAlreadyConnected) StateFields() []string { return []string{} } func (e *ErrAlreadyConnected) beforeSave() {} // +checklocksignore func (e *ErrAlreadyConnected) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrAlreadyConnected) afterLoad(context.Context) {} // +checklocksignore func (e *ErrAlreadyConnected) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrAlreadyConnecting) StateTypeName() string { return "pkg/tcpip.ErrAlreadyConnecting" } func (e *ErrAlreadyConnecting) StateFields() []string { return []string{} } func (e *ErrAlreadyConnecting) beforeSave() {} // +checklocksignore func (e *ErrAlreadyConnecting) 
StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrAlreadyConnecting) afterLoad(context.Context) {} // +checklocksignore func (e *ErrAlreadyConnecting) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrBadAddress) StateTypeName() string { return "pkg/tcpip.ErrBadAddress" } func (e *ErrBadAddress) StateFields() []string { return []string{} } func (e *ErrBadAddress) beforeSave() {} // +checklocksignore func (e *ErrBadAddress) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrBadAddress) afterLoad(context.Context) {} // +checklocksignore func (e *ErrBadAddress) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrBadBuffer) StateTypeName() string { return "pkg/tcpip.ErrBadBuffer" } func (e *ErrBadBuffer) StateFields() []string { return []string{} } func (e *ErrBadBuffer) beforeSave() {} // +checklocksignore func (e *ErrBadBuffer) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrBadBuffer) afterLoad(context.Context) {} // +checklocksignore func (e *ErrBadBuffer) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrBadLocalAddress) StateTypeName() string { return "pkg/tcpip.ErrBadLocalAddress" } func (e *ErrBadLocalAddress) StateFields() []string { return []string{} } func (e *ErrBadLocalAddress) beforeSave() {} // +checklocksignore func (e *ErrBadLocalAddress) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrBadLocalAddress) afterLoad(context.Context) {} // +checklocksignore func (e *ErrBadLocalAddress) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrBroadcastDisabled) StateTypeName() string { return "pkg/tcpip.ErrBroadcastDisabled" } func (e *ErrBroadcastDisabled) StateFields() []string { return []string{} } func (e *ErrBroadcastDisabled) beforeSave() {} // +checklocksignore func (e *ErrBroadcastDisabled) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrBroadcastDisabled) afterLoad(context.Context) {} // +checklocksignore func (e *ErrBroadcastDisabled) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrClosedForReceive) StateTypeName() string { return "pkg/tcpip.ErrClosedForReceive" } func (e *ErrClosedForReceive) StateFields() []string { return []string{} } func (e *ErrClosedForReceive) beforeSave() {} // +checklocksignore func (e *ErrClosedForReceive) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrClosedForReceive) afterLoad(context.Context) {} // +checklocksignore func (e *ErrClosedForReceive) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrClosedForSend) StateTypeName() string { return "pkg/tcpip.ErrClosedForSend" } func (e *ErrClosedForSend) StateFields() []string { return []string{} } func (e *ErrClosedForSend) beforeSave() {} // +checklocksignore func (e *ErrClosedForSend) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrClosedForSend) afterLoad(context.Context) {} // +checklocksignore func (e *ErrClosedForSend) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrConnectStarted) StateTypeName() string { return "pkg/tcpip.ErrConnectStarted" } func (e *ErrConnectStarted) StateFields() []string { return []string{} } func (e *ErrConnectStarted) beforeSave() {} // +checklocksignore func (e *ErrConnectStarted) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrConnectStarted) afterLoad(context.Context) {} // +checklocksignore func (e 
*ErrConnectStarted) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrConnectionAborted) StateTypeName() string { return "pkg/tcpip.ErrConnectionAborted" } func (e *ErrConnectionAborted) StateFields() []string { return []string{} } func (e *ErrConnectionAborted) beforeSave() {} // +checklocksignore func (e *ErrConnectionAborted) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrConnectionAborted) afterLoad(context.Context) {} // +checklocksignore func (e *ErrConnectionAborted) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrConnectionRefused) StateTypeName() string { return "pkg/tcpip.ErrConnectionRefused" } func (e *ErrConnectionRefused) StateFields() []string { return []string{} } func (e *ErrConnectionRefused) beforeSave() {} // +checklocksignore func (e *ErrConnectionRefused) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrConnectionRefused) afterLoad(context.Context) {} // +checklocksignore func (e *ErrConnectionRefused) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrConnectionReset) StateTypeName() string { return "pkg/tcpip.ErrConnectionReset" } func (e *ErrConnectionReset) StateFields() []string { return []string{} } func (e *ErrConnectionReset) beforeSave() {} // +checklocksignore func (e *ErrConnectionReset) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrConnectionReset) afterLoad(context.Context) {} // +checklocksignore func (e *ErrConnectionReset) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrDestinationRequired) StateTypeName() string { return "pkg/tcpip.ErrDestinationRequired" } func (e *ErrDestinationRequired) StateFields() []string { return []string{} } func (e *ErrDestinationRequired) beforeSave() {} // +checklocksignore func (e *ErrDestinationRequired) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrDestinationRequired) afterLoad(context.Context) {} // +checklocksignore func (e *ErrDestinationRequired) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrDuplicateAddress) StateTypeName() string { return "pkg/tcpip.ErrDuplicateAddress" } func (e *ErrDuplicateAddress) StateFields() []string { return []string{} } func (e *ErrDuplicateAddress) beforeSave() {} // +checklocksignore func (e *ErrDuplicateAddress) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrDuplicateAddress) afterLoad(context.Context) {} // +checklocksignore func (e *ErrDuplicateAddress) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrDuplicateNICID) StateTypeName() string { return "pkg/tcpip.ErrDuplicateNICID" } func (e *ErrDuplicateNICID) StateFields() []string { return []string{} } func (e *ErrDuplicateNICID) beforeSave() {} // +checklocksignore func (e *ErrDuplicateNICID) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrDuplicateNICID) afterLoad(context.Context) {} // +checklocksignore func (e *ErrDuplicateNICID) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrInvalidNICID) StateTypeName() string { return "pkg/tcpip.ErrInvalidNICID" } func (e *ErrInvalidNICID) StateFields() []string { return []string{} } func (e *ErrInvalidNICID) beforeSave() {} // +checklocksignore func (e *ErrInvalidNICID) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrInvalidNICID) afterLoad(context.Context) {} // +checklocksignore func (e *ErrInvalidNICID) StateLoad(ctx 
context.Context, stateSourceObject state.Source) { } func (e *ErrInvalidEndpointState) StateTypeName() string { return "pkg/tcpip.ErrInvalidEndpointState" } func (e *ErrInvalidEndpointState) StateFields() []string { return []string{} } func (e *ErrInvalidEndpointState) beforeSave() {} // +checklocksignore func (e *ErrInvalidEndpointState) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrInvalidEndpointState) afterLoad(context.Context) {} // +checklocksignore func (e *ErrInvalidEndpointState) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrInvalidOptionValue) StateTypeName() string { return "pkg/tcpip.ErrInvalidOptionValue" } func (e *ErrInvalidOptionValue) StateFields() []string { return []string{} } func (e *ErrInvalidOptionValue) beforeSave() {} // +checklocksignore func (e *ErrInvalidOptionValue) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrInvalidOptionValue) afterLoad(context.Context) {} // +checklocksignore func (e *ErrInvalidOptionValue) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrInvalidPortRange) StateTypeName() string { return "pkg/tcpip.ErrInvalidPortRange" } func (e *ErrInvalidPortRange) StateFields() []string { return []string{} } func (e *ErrInvalidPortRange) beforeSave() {} // +checklocksignore func (e *ErrInvalidPortRange) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrInvalidPortRange) afterLoad(context.Context) {} // +checklocksignore func (e *ErrInvalidPortRange) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrMalformedHeader) StateTypeName() string { return "pkg/tcpip.ErrMalformedHeader" } func (e *ErrMalformedHeader) StateFields() []string { return []string{} } func (e *ErrMalformedHeader) beforeSave() {} // +checklocksignore func (e *ErrMalformedHeader) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrMalformedHeader) afterLoad(context.Context) {} // +checklocksignore func (e *ErrMalformedHeader) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrMessageTooLong) StateTypeName() string { return "pkg/tcpip.ErrMessageTooLong" } func (e *ErrMessageTooLong) StateFields() []string { return []string{} } func (e *ErrMessageTooLong) beforeSave() {} // +checklocksignore func (e *ErrMessageTooLong) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrMessageTooLong) afterLoad(context.Context) {} // +checklocksignore func (e *ErrMessageTooLong) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrNetworkUnreachable) StateTypeName() string { return "pkg/tcpip.ErrNetworkUnreachable" } func (e *ErrNetworkUnreachable) StateFields() []string { return []string{} } func (e *ErrNetworkUnreachable) beforeSave() {} // +checklocksignore func (e *ErrNetworkUnreachable) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrNetworkUnreachable) afterLoad(context.Context) {} // +checklocksignore func (e *ErrNetworkUnreachable) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrNoBufferSpace) StateTypeName() string { return "pkg/tcpip.ErrNoBufferSpace" } func (e *ErrNoBufferSpace) StateFields() []string { return []string{} } func (e *ErrNoBufferSpace) beforeSave() {} // +checklocksignore func (e *ErrNoBufferSpace) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrNoBufferSpace) afterLoad(context.Context) {} // +checklocksignore func (e *ErrNoBufferSpace) StateLoad(ctx context.Context, 
stateSourceObject state.Source) { } func (e *ErrNoPortAvailable) StateTypeName() string { return "pkg/tcpip.ErrNoPortAvailable" } func (e *ErrNoPortAvailable) StateFields() []string { return []string{} } func (e *ErrNoPortAvailable) beforeSave() {} // +checklocksignore func (e *ErrNoPortAvailable) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrNoPortAvailable) afterLoad(context.Context) {} // +checklocksignore func (e *ErrNoPortAvailable) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrHostUnreachable) StateTypeName() string { return "pkg/tcpip.ErrHostUnreachable" } func (e *ErrHostUnreachable) StateFields() []string { return []string{} } func (e *ErrHostUnreachable) beforeSave() {} // +checklocksignore func (e *ErrHostUnreachable) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrHostUnreachable) afterLoad(context.Context) {} // +checklocksignore func (e *ErrHostUnreachable) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrHostDown) StateTypeName() string { return "pkg/tcpip.ErrHostDown" } func (e *ErrHostDown) StateFields() []string { return []string{} } func (e *ErrHostDown) beforeSave() {} // +checklocksignore func (e *ErrHostDown) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrHostDown) afterLoad(context.Context) {} // +checklocksignore func (e *ErrHostDown) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrNoNet) StateTypeName() string { return "pkg/tcpip.ErrNoNet" } func (e *ErrNoNet) StateFields() []string { return []string{} } func (e *ErrNoNet) beforeSave() {} // +checklocksignore func (e *ErrNoNet) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrNoNet) afterLoad(context.Context) {} // +checklocksignore func (e *ErrNoNet) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrNoSuchFile) StateTypeName() string { return "pkg/tcpip.ErrNoSuchFile" } func (e *ErrNoSuchFile) StateFields() []string { return []string{} } func (e *ErrNoSuchFile) beforeSave() {} // +checklocksignore func (e *ErrNoSuchFile) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrNoSuchFile) afterLoad(context.Context) {} // +checklocksignore func (e *ErrNoSuchFile) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrNotConnected) StateTypeName() string { return "pkg/tcpip.ErrNotConnected" } func (e *ErrNotConnected) StateFields() []string { return []string{} } func (e *ErrNotConnected) beforeSave() {} // +checklocksignore func (e *ErrNotConnected) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrNotConnected) afterLoad(context.Context) {} // +checklocksignore func (e *ErrNotConnected) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrNotPermitted) StateTypeName() string { return "pkg/tcpip.ErrNotPermitted" } func (e *ErrNotPermitted) StateFields() []string { return []string{} } func (e *ErrNotPermitted) beforeSave() {} // +checklocksignore func (e *ErrNotPermitted) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrNotPermitted) afterLoad(context.Context) {} // +checklocksignore func (e *ErrNotPermitted) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrNotSupported) StateTypeName() string { return "pkg/tcpip.ErrNotSupported" } func (e *ErrNotSupported) StateFields() []string { return []string{} } func (e *ErrNotSupported) beforeSave() {} // +checklocksignore func (e 
*ErrNotSupported) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrNotSupported) afterLoad(context.Context) {} // +checklocksignore func (e *ErrNotSupported) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrPortInUse) StateTypeName() string { return "pkg/tcpip.ErrPortInUse" } func (e *ErrPortInUse) StateFields() []string { return []string{} } func (e *ErrPortInUse) beforeSave() {} // +checklocksignore func (e *ErrPortInUse) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrPortInUse) afterLoad(context.Context) {} // +checklocksignore func (e *ErrPortInUse) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrQueueSizeNotSupported) StateTypeName() string { return "pkg/tcpip.ErrQueueSizeNotSupported" } func (e *ErrQueueSizeNotSupported) StateFields() []string { return []string{} } func (e *ErrQueueSizeNotSupported) beforeSave() {} // +checklocksignore func (e *ErrQueueSizeNotSupported) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrQueueSizeNotSupported) afterLoad(context.Context) {} // +checklocksignore func (e *ErrQueueSizeNotSupported) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrTimeout) StateTypeName() string { return "pkg/tcpip.ErrTimeout" } func (e *ErrTimeout) StateFields() []string { return []string{} } func (e *ErrTimeout) beforeSave() {} // +checklocksignore func (e *ErrTimeout) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrTimeout) afterLoad(context.Context) {} // +checklocksignore func (e *ErrTimeout) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrUnknownDevice) StateTypeName() string { return "pkg/tcpip.ErrUnknownDevice" } func (e *ErrUnknownDevice) StateFields() []string { return []string{} } func (e *ErrUnknownDevice) beforeSave() {} // +checklocksignore func (e *ErrUnknownDevice) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrUnknownDevice) afterLoad(context.Context) {} // +checklocksignore func (e *ErrUnknownDevice) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrUnknownNICID) StateTypeName() string { return "pkg/tcpip.ErrUnknownNICID" } func (e *ErrUnknownNICID) StateFields() []string { return []string{} } func (e *ErrUnknownNICID) beforeSave() {} // +checklocksignore func (e *ErrUnknownNICID) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrUnknownNICID) afterLoad(context.Context) {} // +checklocksignore func (e *ErrUnknownNICID) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrUnknownProtocol) StateTypeName() string { return "pkg/tcpip.ErrUnknownProtocol" } func (e *ErrUnknownProtocol) StateFields() []string { return []string{} } func (e *ErrUnknownProtocol) beforeSave() {} // +checklocksignore func (e *ErrUnknownProtocol) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrUnknownProtocol) afterLoad(context.Context) {} // +checklocksignore func (e *ErrUnknownProtocol) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrUnknownProtocolOption) StateTypeName() string { return "pkg/tcpip.ErrUnknownProtocolOption" } func (e *ErrUnknownProtocolOption) StateFields() []string { return []string{} } func (e *ErrUnknownProtocolOption) beforeSave() {} // +checklocksignore func (e *ErrUnknownProtocolOption) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrUnknownProtocolOption) afterLoad(context.Context) {} // 
+checklocksignore func (e *ErrUnknownProtocolOption) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrWouldBlock) StateTypeName() string { return "pkg/tcpip.ErrWouldBlock" } func (e *ErrWouldBlock) StateFields() []string { return []string{} } func (e *ErrWouldBlock) beforeSave() {} // +checklocksignore func (e *ErrWouldBlock) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrWouldBlock) afterLoad(context.Context) {} // +checklocksignore func (e *ErrWouldBlock) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrMissingRequiredFields) StateTypeName() string { return "pkg/tcpip.ErrMissingRequiredFields" } func (e *ErrMissingRequiredFields) StateFields() []string { return []string{} } func (e *ErrMissingRequiredFields) beforeSave() {} // +checklocksignore func (e *ErrMissingRequiredFields) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrMissingRequiredFields) afterLoad(context.Context) {} // +checklocksignore func (e *ErrMissingRequiredFields) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (e *ErrMulticastInputCannotBeOutput) StateTypeName() string { return "pkg/tcpip.ErrMulticastInputCannotBeOutput" } func (e *ErrMulticastInputCannotBeOutput) StateFields() []string { return []string{} } func (e *ErrMulticastInputCannotBeOutput) beforeSave() {} // +checklocksignore func (e *ErrMulticastInputCannotBeOutput) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *ErrMulticastInputCannotBeOutput) afterLoad(context.Context) {} // +checklocksignore func (e *ErrMulticastInputCannotBeOutput) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (l *RouteList) StateTypeName() string { return "pkg/tcpip.RouteList" } func (l *RouteList) StateFields() []string { return []string{ "head", "tail", } } func (l *RouteList) beforeSave() {} // +checklocksignore func (l *RouteList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *RouteList) afterLoad(context.Context) {} // +checklocksignore func (l *RouteList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *RouteEntry) StateTypeName() string { return "pkg/tcpip.RouteEntry" } func (e *RouteEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *RouteEntry) beforeSave() {} // +checklocksignore func (e *RouteEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *RouteEntry) afterLoad(context.Context) {} // +checklocksignore func (e *RouteEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (l *sockErrorList) StateTypeName() string { return "pkg/tcpip.sockErrorList" } func (l *sockErrorList) StateFields() []string { return []string{ "head", "tail", } } func (l *sockErrorList) beforeSave() {} // +checklocksignore func (l *sockErrorList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *sockErrorList) afterLoad(context.Context) {} // +checklocksignore func (l *sockErrorList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *sockErrorEntry) StateTypeName() string { 
return "pkg/tcpip.sockErrorEntry" } func (e *sockErrorEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *sockErrorEntry) beforeSave() {} // +checklocksignore func (e *sockErrorEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *sockErrorEntry) afterLoad(context.Context) {} // +checklocksignore func (e *sockErrorEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (so *SocketOptions) StateTypeName() string { return "pkg/tcpip.SocketOptions" } func (so *SocketOptions) StateFields() []string { return []string{ "handler", "broadcastEnabled", "passCredEnabled", "noChecksumEnabled", "reuseAddressEnabled", "reusePortEnabled", "keepAliveEnabled", "multicastLoopEnabled", "receiveTOSEnabled", "receiveTTLEnabled", "receiveHopLimitEnabled", "receiveTClassEnabled", "receivePacketInfoEnabled", "receiveIPv6PacketInfoEnabled", "hdrIncludedEnabled", "v6OnlyEnabled", "quickAckEnabled", "delayOptionEnabled", "corkOptionEnabled", "receiveOriginalDstAddress", "ipv4RecvErrEnabled", "ipv6RecvErrEnabled", "errQueue", "bindToDevice", "sendBufferSize", "receiveBufferSize", "linger", "rcvlowat", } } func (so *SocketOptions) beforeSave() {} // +checklocksignore func (so *SocketOptions) StateSave(stateSinkObject state.Sink) { so.beforeSave() stateSinkObject.Save(0, &so.handler) stateSinkObject.Save(1, &so.broadcastEnabled) stateSinkObject.Save(2, &so.passCredEnabled) stateSinkObject.Save(3, &so.noChecksumEnabled) stateSinkObject.Save(4, &so.reuseAddressEnabled) stateSinkObject.Save(5, &so.reusePortEnabled) stateSinkObject.Save(6, &so.keepAliveEnabled) stateSinkObject.Save(7, &so.multicastLoopEnabled) stateSinkObject.Save(8, &so.receiveTOSEnabled) stateSinkObject.Save(9, &so.receiveTTLEnabled) stateSinkObject.Save(10, &so.receiveHopLimitEnabled) stateSinkObject.Save(11, &so.receiveTClassEnabled) stateSinkObject.Save(12, &so.receivePacketInfoEnabled) stateSinkObject.Save(13, &so.receiveIPv6PacketInfoEnabled) stateSinkObject.Save(14, &so.hdrIncludedEnabled) stateSinkObject.Save(15, &so.v6OnlyEnabled) stateSinkObject.Save(16, &so.quickAckEnabled) stateSinkObject.Save(17, &so.delayOptionEnabled) stateSinkObject.Save(18, &so.corkOptionEnabled) stateSinkObject.Save(19, &so.receiveOriginalDstAddress) stateSinkObject.Save(20, &so.ipv4RecvErrEnabled) stateSinkObject.Save(21, &so.ipv6RecvErrEnabled) stateSinkObject.Save(22, &so.errQueue) stateSinkObject.Save(23, &so.bindToDevice) stateSinkObject.Save(24, &so.sendBufferSize) stateSinkObject.Save(25, &so.receiveBufferSize) stateSinkObject.Save(26, &so.linger) stateSinkObject.Save(27, &so.rcvlowat) } func (so *SocketOptions) afterLoad(context.Context) {} // +checklocksignore func (so *SocketOptions) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &so.handler) stateSourceObject.Load(1, &so.broadcastEnabled) stateSourceObject.Load(2, &so.passCredEnabled) stateSourceObject.Load(3, &so.noChecksumEnabled) stateSourceObject.Load(4, &so.reuseAddressEnabled) stateSourceObject.Load(5, &so.reusePortEnabled) stateSourceObject.Load(6, &so.keepAliveEnabled) stateSourceObject.Load(7, &so.multicastLoopEnabled) stateSourceObject.Load(8, &so.receiveTOSEnabled) stateSourceObject.Load(9, &so.receiveTTLEnabled) stateSourceObject.Load(10, &so.receiveHopLimitEnabled) stateSourceObject.Load(11, &so.receiveTClassEnabled) stateSourceObject.Load(12, 
&so.receivePacketInfoEnabled) stateSourceObject.Load(13, &so.receiveIPv6PacketInfoEnabled) stateSourceObject.Load(14, &so.hdrIncludedEnabled) stateSourceObject.Load(15, &so.v6OnlyEnabled) stateSourceObject.Load(16, &so.quickAckEnabled) stateSourceObject.Load(17, &so.delayOptionEnabled) stateSourceObject.Load(18, &so.corkOptionEnabled) stateSourceObject.Load(19, &so.receiveOriginalDstAddress) stateSourceObject.Load(20, &so.ipv4RecvErrEnabled) stateSourceObject.Load(21, &so.ipv6RecvErrEnabled) stateSourceObject.Load(22, &so.errQueue) stateSourceObject.Load(23, &so.bindToDevice) stateSourceObject.Load(24, &so.sendBufferSize) stateSourceObject.Load(25, &so.receiveBufferSize) stateSourceObject.Load(26, &so.linger) stateSourceObject.Load(27, &so.rcvlowat) } func (l *LocalSockError) StateTypeName() string { return "pkg/tcpip.LocalSockError" } func (l *LocalSockError) StateFields() []string { return []string{ "info", } } func (l *LocalSockError) beforeSave() {} // +checklocksignore func (l *LocalSockError) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.info) } func (l *LocalSockError) afterLoad(context.Context) {} // +checklocksignore func (l *LocalSockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.info) } func (s *SockError) StateTypeName() string { return "pkg/tcpip.SockError" } func (s *SockError) StateFields() []string { return []string{ "sockErrorEntry", "Err", "Cause", "Payload", "Dst", "Offender", "NetProto", } } func (s *SockError) beforeSave() {} // +checklocksignore func (s *SockError) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.sockErrorEntry) stateSinkObject.Save(1, &s.Err) stateSinkObject.Save(2, &s.Cause) stateSinkObject.Save(3, &s.Payload) stateSinkObject.Save(4, &s.Dst) stateSinkObject.Save(5, &s.Offender) stateSinkObject.Save(6, &s.NetProto) } func (s *SockError) afterLoad(context.Context) {} // +checklocksignore func (s *SockError) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.sockErrorEntry) stateSourceObject.Load(1, &s.Err) stateSourceObject.Load(2, &s.Cause) stateSourceObject.Load(3, &s.Payload) stateSourceObject.Load(4, &s.Dst) stateSourceObject.Load(5, &s.Offender) stateSourceObject.Load(6, &s.NetProto) } func (s *stdClock) StateTypeName() string { return "pkg/tcpip.stdClock" } func (s *stdClock) StateFields() []string { return []string{ "monotonicOffset", } } // +checklocksignore func (s *stdClock) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.monotonicOffset) } // +checklocksignore func (s *stdClock) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.monotonicOffset) stateSourceObject.AfterLoad(func() { s.afterLoad(ctx) }) } func (st *stdTimer) StateTypeName() string { return "pkg/tcpip.stdTimer" } func (st *stdTimer) StateFields() []string { return []string{ "t", } } func (st *stdTimer) beforeSave() {} // +checklocksignore func (st *stdTimer) StateSave(stateSinkObject state.Sink) { st.beforeSave() stateSinkObject.Save(0, &st.t) } func (st *stdTimer) afterLoad(context.Context) {} // +checklocksignore func (st *stdTimer) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &st.t) } func (mt *MonotonicTime) StateTypeName() string { return "pkg/tcpip.MonotonicTime" } func (mt *MonotonicTime) StateFields() []string { return []string{ "nanoseconds", } } func (mt *MonotonicTime) beforeSave() 
{} // +checklocksignore func (mt *MonotonicTime) StateSave(stateSinkObject state.Sink) { mt.beforeSave() stateSinkObject.Save(0, &mt.nanoseconds) } func (mt *MonotonicTime) afterLoad(context.Context) {} // +checklocksignore func (mt *MonotonicTime) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &mt.nanoseconds) } func (a *Address) StateTypeName() string { return "pkg/tcpip.Address" } func (a *Address) StateFields() []string { return []string{ "addr", "length", } } func (a *Address) beforeSave() {} // +checklocksignore func (a *Address) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.addr) stateSinkObject.Save(1, &a.length) } func (a *Address) afterLoad(context.Context) {} // +checklocksignore func (a *Address) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.addr) stateSourceObject.Load(1, &a.length) } func (m *AddressMask) StateTypeName() string { return "pkg/tcpip.AddressMask" } func (m *AddressMask) StateFields() []string { return []string{ "mask", "length", } } func (m *AddressMask) beforeSave() {} // +checklocksignore func (m *AddressMask) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.mask) stateSinkObject.Save(1, &m.length) } func (m *AddressMask) afterLoad(context.Context) {} // +checklocksignore func (m *AddressMask) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.mask) stateSourceObject.Load(1, &m.length) } func (s *Subnet) StateTypeName() string { return "pkg/tcpip.Subnet" } func (s *Subnet) StateFields() []string { return []string{ "address", "mask", } } func (s *Subnet) beforeSave() {} // +checklocksignore func (s *Subnet) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.address) stateSinkObject.Save(1, &s.mask) } func (s *Subnet) afterLoad(context.Context) {} // +checklocksignore func (s *Subnet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.address) stateSourceObject.Load(1, &s.mask) } func (f *FullAddress) StateTypeName() string { return "pkg/tcpip.FullAddress" } func (f *FullAddress) StateFields() []string { return []string{ "NIC", "Addr", "Port", "LinkAddr", } } func (f *FullAddress) beforeSave() {} // +checklocksignore func (f *FullAddress) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.NIC) stateSinkObject.Save(1, &f.Addr) stateSinkObject.Save(2, &f.Port) stateSinkObject.Save(3, &f.LinkAddr) } func (f *FullAddress) afterLoad(context.Context) {} // +checklocksignore func (f *FullAddress) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.NIC) stateSourceObject.Load(1, &f.Addr) stateSourceObject.Load(2, &f.Port) stateSourceObject.Load(3, &f.LinkAddr) } func (s *SendableControlMessages) StateTypeName() string { return "pkg/tcpip.SendableControlMessages" } func (s *SendableControlMessages) StateFields() []string { return []string{ "HasTTL", "TTL", "HasHopLimit", "HopLimit", "HasIPv6PacketInfo", "IPv6PacketInfo", } } func (s *SendableControlMessages) beforeSave() {} // +checklocksignore func (s *SendableControlMessages) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.HasTTL) stateSinkObject.Save(1, &s.TTL) stateSinkObject.Save(2, &s.HasHopLimit) stateSinkObject.Save(3, &s.HopLimit) stateSinkObject.Save(4, &s.HasIPv6PacketInfo) stateSinkObject.Save(5, &s.IPv6PacketInfo) } func (s 
*SendableControlMessages) afterLoad(context.Context) {} // +checklocksignore func (s *SendableControlMessages) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.HasTTL) stateSourceObject.Load(1, &s.TTL) stateSourceObject.Load(2, &s.HasHopLimit) stateSourceObject.Load(3, &s.HopLimit) stateSourceObject.Load(4, &s.HasIPv6PacketInfo) stateSourceObject.Load(5, &s.IPv6PacketInfo) } func (c *ReceivableControlMessages) StateTypeName() string { return "pkg/tcpip.ReceivableControlMessages" } func (c *ReceivableControlMessages) StateFields() []string { return []string{ "Timestamp", "HasInq", "Inq", "HasTOS", "TOS", "HasTTL", "TTL", "HasHopLimit", "HopLimit", "HasTimestamp", "HasTClass", "TClass", "HasIPPacketInfo", "PacketInfo", "HasIPv6PacketInfo", "IPv6PacketInfo", "HasOriginalDstAddress", "OriginalDstAddress", "SockErr", } } func (c *ReceivableControlMessages) beforeSave() {} // +checklocksignore func (c *ReceivableControlMessages) StateSave(stateSinkObject state.Sink) { c.beforeSave() var TimestampValue int64 TimestampValue = c.saveTimestamp() stateSinkObject.SaveValue(0, TimestampValue) stateSinkObject.Save(1, &c.HasInq) stateSinkObject.Save(2, &c.Inq) stateSinkObject.Save(3, &c.HasTOS) stateSinkObject.Save(4, &c.TOS) stateSinkObject.Save(5, &c.HasTTL) stateSinkObject.Save(6, &c.TTL) stateSinkObject.Save(7, &c.HasHopLimit) stateSinkObject.Save(8, &c.HopLimit) stateSinkObject.Save(9, &c.HasTimestamp) stateSinkObject.Save(10, &c.HasTClass) stateSinkObject.Save(11, &c.TClass) stateSinkObject.Save(12, &c.HasIPPacketInfo) stateSinkObject.Save(13, &c.PacketInfo) stateSinkObject.Save(14, &c.HasIPv6PacketInfo) stateSinkObject.Save(15, &c.IPv6PacketInfo) stateSinkObject.Save(16, &c.HasOriginalDstAddress) stateSinkObject.Save(17, &c.OriginalDstAddress) stateSinkObject.Save(18, &c.SockErr) } func (c *ReceivableControlMessages) afterLoad(context.Context) {} // +checklocksignore func (c *ReceivableControlMessages) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(1, &c.HasInq) stateSourceObject.Load(2, &c.Inq) stateSourceObject.Load(3, &c.HasTOS) stateSourceObject.Load(4, &c.TOS) stateSourceObject.Load(5, &c.HasTTL) stateSourceObject.Load(6, &c.TTL) stateSourceObject.Load(7, &c.HasHopLimit) stateSourceObject.Load(8, &c.HopLimit) stateSourceObject.Load(9, &c.HasTimestamp) stateSourceObject.Load(10, &c.HasTClass) stateSourceObject.Load(11, &c.TClass) stateSourceObject.Load(12, &c.HasIPPacketInfo) stateSourceObject.Load(13, &c.PacketInfo) stateSourceObject.Load(14, &c.HasIPv6PacketInfo) stateSourceObject.Load(15, &c.IPv6PacketInfo) stateSourceObject.Load(16, &c.HasOriginalDstAddress) stateSourceObject.Load(17, &c.OriginalDstAddress) stateSourceObject.Load(18, &c.SockErr) stateSourceObject.LoadValue(0, new(int64), func(y any) { c.loadTimestamp(ctx, y.(int64)) }) } func (l *LinkPacketInfo) StateTypeName() string { return "pkg/tcpip.LinkPacketInfo" } func (l *LinkPacketInfo) StateFields() []string { return []string{ "Protocol", "PktType", } } func (l *LinkPacketInfo) beforeSave() {} // +checklocksignore func (l *LinkPacketInfo) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.Protocol) stateSinkObject.Save(1, &l.PktType) } func (l *LinkPacketInfo) afterLoad(context.Context) {} // +checklocksignore func (l *LinkPacketInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.Protocol) stateSourceObject.Load(1, &l.PktType) } func (t *TCPSendBufferSizeRangeOption) 
StateTypeName() string { return "pkg/tcpip.TCPSendBufferSizeRangeOption" } func (t *TCPSendBufferSizeRangeOption) StateFields() []string { return []string{ "Min", "Default", "Max", } } func (t *TCPSendBufferSizeRangeOption) beforeSave() {} // +checklocksignore func (t *TCPSendBufferSizeRangeOption) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.Min) stateSinkObject.Save(1, &t.Default) stateSinkObject.Save(2, &t.Max) } func (t *TCPSendBufferSizeRangeOption) afterLoad(context.Context) {} // +checklocksignore func (t *TCPSendBufferSizeRangeOption) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.Min) stateSourceObject.Load(1, &t.Default) stateSourceObject.Load(2, &t.Max) } func (t *TCPReceiveBufferSizeRangeOption) StateTypeName() string { return "pkg/tcpip.TCPReceiveBufferSizeRangeOption" } func (t *TCPReceiveBufferSizeRangeOption) StateFields() []string { return []string{ "Min", "Default", "Max", } } func (t *TCPReceiveBufferSizeRangeOption) beforeSave() {} // +checklocksignore func (t *TCPReceiveBufferSizeRangeOption) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.Min) stateSinkObject.Save(1, &t.Default) stateSinkObject.Save(2, &t.Max) } func (t *TCPReceiveBufferSizeRangeOption) afterLoad(context.Context) {} // +checklocksignore func (t *TCPReceiveBufferSizeRangeOption) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.Min) stateSourceObject.Load(1, &t.Default) stateSourceObject.Load(2, &t.Max) } func (f *ICMPv6Filter) StateTypeName() string { return "pkg/tcpip.ICMPv6Filter" } func (f *ICMPv6Filter) StateFields() []string { return []string{ "DenyType", } } func (f *ICMPv6Filter) beforeSave() {} // +checklocksignore func (f *ICMPv6Filter) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.DenyType) } func (f *ICMPv6Filter) afterLoad(context.Context) {} // +checklocksignore func (f *ICMPv6Filter) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.DenyType) } func (l *LingerOption) StateTypeName() string { return "pkg/tcpip.LingerOption" } func (l *LingerOption) StateFields() []string { return []string{ "Enabled", "Timeout", } } func (l *LingerOption) beforeSave() {} // +checklocksignore func (l *LingerOption) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.Enabled) stateSinkObject.Save(1, &l.Timeout) } func (l *LingerOption) afterLoad(context.Context) {} // +checklocksignore func (l *LingerOption) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.Enabled) stateSourceObject.Load(1, &l.Timeout) } func (i *IPPacketInfo) StateTypeName() string { return "pkg/tcpip.IPPacketInfo" } func (i *IPPacketInfo) StateFields() []string { return []string{ "NIC", "LocalAddr", "DestinationAddr", } } func (i *IPPacketInfo) beforeSave() {} // +checklocksignore func (i *IPPacketInfo) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.NIC) stateSinkObject.Save(1, &i.LocalAddr) stateSinkObject.Save(2, &i.DestinationAddr) } func (i *IPPacketInfo) afterLoad(context.Context) {} // +checklocksignore func (i *IPPacketInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.NIC) stateSourceObject.Load(1, &i.LocalAddr) stateSourceObject.Load(2, &i.DestinationAddr) } func (i *IPv6PacketInfo) StateTypeName() string { return 
"pkg/tcpip.IPv6PacketInfo" } func (i *IPv6PacketInfo) StateFields() []string { return []string{ "Addr", "NIC", } } func (i *IPv6PacketInfo) beforeSave() {} // +checklocksignore func (i *IPv6PacketInfo) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Addr) stateSinkObject.Save(1, &i.NIC) } func (i *IPv6PacketInfo) afterLoad(context.Context) {} // +checklocksignore func (i *IPv6PacketInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Addr) stateSourceObject.Load(1, &i.NIC) } func (s *SendBufferSizeOption) StateTypeName() string { return "pkg/tcpip.SendBufferSizeOption" } func (s *SendBufferSizeOption) StateFields() []string { return []string{ "Min", "Default", "Max", } } func (s *SendBufferSizeOption) beforeSave() {} // +checklocksignore func (s *SendBufferSizeOption) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.Min) stateSinkObject.Save(1, &s.Default) stateSinkObject.Save(2, &s.Max) } func (s *SendBufferSizeOption) afterLoad(context.Context) {} // +checklocksignore func (s *SendBufferSizeOption) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.Min) stateSourceObject.Load(1, &s.Default) stateSourceObject.Load(2, &s.Max) } func (r *ReceiveBufferSizeOption) StateTypeName() string { return "pkg/tcpip.ReceiveBufferSizeOption" } func (r *ReceiveBufferSizeOption) StateFields() []string { return []string{ "Min", "Default", "Max", } } func (r *ReceiveBufferSizeOption) beforeSave() {} // +checklocksignore func (r *ReceiveBufferSizeOption) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.Min) stateSinkObject.Save(1, &r.Default) stateSinkObject.Save(2, &r.Max) } func (r *ReceiveBufferSizeOption) afterLoad(context.Context) {} // +checklocksignore func (r *ReceiveBufferSizeOption) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.Min) stateSourceObject.Load(1, &r.Default) stateSourceObject.Load(2, &r.Max) } func (r *Route) StateTypeName() string { return "pkg/tcpip.Route" } func (r *Route) StateFields() []string { return []string{ "RouteEntry", "Destination", "Gateway", "NIC", "SourceHint", "MTU", } } func (r *Route) beforeSave() {} // +checklocksignore func (r *Route) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.RouteEntry) stateSinkObject.Save(1, &r.Destination) stateSinkObject.Save(2, &r.Gateway) stateSinkObject.Save(3, &r.NIC) stateSinkObject.Save(4, &r.SourceHint) stateSinkObject.Save(5, &r.MTU) } func (r *Route) afterLoad(context.Context) {} // +checklocksignore func (r *Route) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.RouteEntry) stateSourceObject.Load(1, &r.Destination) stateSourceObject.Load(2, &r.Gateway) stateSourceObject.Load(3, &r.NIC) stateSourceObject.Load(4, &r.SourceHint) stateSourceObject.Load(5, &r.MTU) } func (s *StatCounter) StateTypeName() string { return "pkg/tcpip.StatCounter" } func (s *StatCounter) StateFields() []string { return []string{ "count", } } func (s *StatCounter) beforeSave() {} // +checklocksignore func (s *StatCounter) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.count) } func (s *StatCounter) afterLoad(context.Context) {} // +checklocksignore func (s *StatCounter) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.count) } func (m *MultiCounterStat) 
StateTypeName() string { return "pkg/tcpip.MultiCounterStat" } func (m *MultiCounterStat) StateFields() []string { return []string{ "a", "b", } } func (m *MultiCounterStat) beforeSave() {} // +checklocksignore func (m *MultiCounterStat) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.a) stateSinkObject.Save(1, &m.b) } func (m *MultiCounterStat) afterLoad(context.Context) {} // +checklocksignore func (m *MultiCounterStat) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.a) stateSourceObject.Load(1, &m.b) } func (i *ICMPv4PacketStats) StateTypeName() string { return "pkg/tcpip.ICMPv4PacketStats" } func (i *ICMPv4PacketStats) StateFields() []string { return []string{ "EchoRequest", "EchoReply", "DstUnreachable", "SrcQuench", "Redirect", "TimeExceeded", "ParamProblem", "Timestamp", "TimestampReply", "InfoRequest", "InfoReply", } } func (i *ICMPv4PacketStats) beforeSave() {} // +checklocksignore func (i *ICMPv4PacketStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.EchoRequest) stateSinkObject.Save(1, &i.EchoReply) stateSinkObject.Save(2, &i.DstUnreachable) stateSinkObject.Save(3, &i.SrcQuench) stateSinkObject.Save(4, &i.Redirect) stateSinkObject.Save(5, &i.TimeExceeded) stateSinkObject.Save(6, &i.ParamProblem) stateSinkObject.Save(7, &i.Timestamp) stateSinkObject.Save(8, &i.TimestampReply) stateSinkObject.Save(9, &i.InfoRequest) stateSinkObject.Save(10, &i.InfoReply) } func (i *ICMPv4PacketStats) afterLoad(context.Context) {} // +checklocksignore func (i *ICMPv4PacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.EchoRequest) stateSourceObject.Load(1, &i.EchoReply) stateSourceObject.Load(2, &i.DstUnreachable) stateSourceObject.Load(3, &i.SrcQuench) stateSourceObject.Load(4, &i.Redirect) stateSourceObject.Load(5, &i.TimeExceeded) stateSourceObject.Load(6, &i.ParamProblem) stateSourceObject.Load(7, &i.Timestamp) stateSourceObject.Load(8, &i.TimestampReply) stateSourceObject.Load(9, &i.InfoRequest) stateSourceObject.Load(10, &i.InfoReply) } func (i *ICMPv4SentPacketStats) StateTypeName() string { return "pkg/tcpip.ICMPv4SentPacketStats" } func (i *ICMPv4SentPacketStats) StateFields() []string { return []string{ "ICMPv4PacketStats", "Dropped", "RateLimited", } } func (i *ICMPv4SentPacketStats) beforeSave() {} // +checklocksignore func (i *ICMPv4SentPacketStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.ICMPv4PacketStats) stateSinkObject.Save(1, &i.Dropped) stateSinkObject.Save(2, &i.RateLimited) } func (i *ICMPv4SentPacketStats) afterLoad(context.Context) {} // +checklocksignore func (i *ICMPv4SentPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.ICMPv4PacketStats) stateSourceObject.Load(1, &i.Dropped) stateSourceObject.Load(2, &i.RateLimited) } func (i *ICMPv4ReceivedPacketStats) StateTypeName() string { return "pkg/tcpip.ICMPv4ReceivedPacketStats" } func (i *ICMPv4ReceivedPacketStats) StateFields() []string { return []string{ "ICMPv4PacketStats", "Invalid", } } func (i *ICMPv4ReceivedPacketStats) beforeSave() {} // +checklocksignore func (i *ICMPv4ReceivedPacketStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.ICMPv4PacketStats) stateSinkObject.Save(1, &i.Invalid) } func (i *ICMPv4ReceivedPacketStats) afterLoad(context.Context) {} // +checklocksignore func (i *ICMPv4ReceivedPacketStats) 
StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.ICMPv4PacketStats) stateSourceObject.Load(1, &i.Invalid) } func (i *ICMPv4Stats) StateTypeName() string { return "pkg/tcpip.ICMPv4Stats" } func (i *ICMPv4Stats) StateFields() []string { return []string{ "PacketsSent", "PacketsReceived", } } func (i *ICMPv4Stats) beforeSave() {} // +checklocksignore func (i *ICMPv4Stats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.PacketsSent) stateSinkObject.Save(1, &i.PacketsReceived) } func (i *ICMPv4Stats) afterLoad(context.Context) {} // +checklocksignore func (i *ICMPv4Stats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.PacketsSent) stateSourceObject.Load(1, &i.PacketsReceived) } func (i *ICMPv6PacketStats) StateTypeName() string { return "pkg/tcpip.ICMPv6PacketStats" } func (i *ICMPv6PacketStats) StateFields() []string { return []string{ "EchoRequest", "EchoReply", "DstUnreachable", "PacketTooBig", "TimeExceeded", "ParamProblem", "RouterSolicit", "RouterAdvert", "NeighborSolicit", "NeighborAdvert", "RedirectMsg", "MulticastListenerQuery", "MulticastListenerReport", "MulticastListenerReportV2", "MulticastListenerDone", } } func (i *ICMPv6PacketStats) beforeSave() {} // +checklocksignore func (i *ICMPv6PacketStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.EchoRequest) stateSinkObject.Save(1, &i.EchoReply) stateSinkObject.Save(2, &i.DstUnreachable) stateSinkObject.Save(3, &i.PacketTooBig) stateSinkObject.Save(4, &i.TimeExceeded) stateSinkObject.Save(5, &i.ParamProblem) stateSinkObject.Save(6, &i.RouterSolicit) stateSinkObject.Save(7, &i.RouterAdvert) stateSinkObject.Save(8, &i.NeighborSolicit) stateSinkObject.Save(9, &i.NeighborAdvert) stateSinkObject.Save(10, &i.RedirectMsg) stateSinkObject.Save(11, &i.MulticastListenerQuery) stateSinkObject.Save(12, &i.MulticastListenerReport) stateSinkObject.Save(13, &i.MulticastListenerReportV2) stateSinkObject.Save(14, &i.MulticastListenerDone) } func (i *ICMPv6PacketStats) afterLoad(context.Context) {} // +checklocksignore func (i *ICMPv6PacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.EchoRequest) stateSourceObject.Load(1, &i.EchoReply) stateSourceObject.Load(2, &i.DstUnreachable) stateSourceObject.Load(3, &i.PacketTooBig) stateSourceObject.Load(4, &i.TimeExceeded) stateSourceObject.Load(5, &i.ParamProblem) stateSourceObject.Load(6, &i.RouterSolicit) stateSourceObject.Load(7, &i.RouterAdvert) stateSourceObject.Load(8, &i.NeighborSolicit) stateSourceObject.Load(9, &i.NeighborAdvert) stateSourceObject.Load(10, &i.RedirectMsg) stateSourceObject.Load(11, &i.MulticastListenerQuery) stateSourceObject.Load(12, &i.MulticastListenerReport) stateSourceObject.Load(13, &i.MulticastListenerReportV2) stateSourceObject.Load(14, &i.MulticastListenerDone) } func (i *ICMPv6SentPacketStats) StateTypeName() string { return "pkg/tcpip.ICMPv6SentPacketStats" } func (i *ICMPv6SentPacketStats) StateFields() []string { return []string{ "ICMPv6PacketStats", "Dropped", "RateLimited", } } func (i *ICMPv6SentPacketStats) beforeSave() {} // +checklocksignore func (i *ICMPv6SentPacketStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.ICMPv6PacketStats) stateSinkObject.Save(1, &i.Dropped) stateSinkObject.Save(2, &i.RateLimited) } func (i *ICMPv6SentPacketStats) afterLoad(context.Context) {} // +checklocksignore func (i 
*ICMPv6SentPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.ICMPv6PacketStats) stateSourceObject.Load(1, &i.Dropped) stateSourceObject.Load(2, &i.RateLimited) } func (i *ICMPv6ReceivedPacketStats) StateTypeName() string { return "pkg/tcpip.ICMPv6ReceivedPacketStats" } func (i *ICMPv6ReceivedPacketStats) StateFields() []string { return []string{ "ICMPv6PacketStats", "Unrecognized", "Invalid", "RouterOnlyPacketsDroppedByHost", } } func (i *ICMPv6ReceivedPacketStats) beforeSave() {} // +checklocksignore func (i *ICMPv6ReceivedPacketStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.ICMPv6PacketStats) stateSinkObject.Save(1, &i.Unrecognized) stateSinkObject.Save(2, &i.Invalid) stateSinkObject.Save(3, &i.RouterOnlyPacketsDroppedByHost) } func (i *ICMPv6ReceivedPacketStats) afterLoad(context.Context) {} // +checklocksignore func (i *ICMPv6ReceivedPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.ICMPv6PacketStats) stateSourceObject.Load(1, &i.Unrecognized) stateSourceObject.Load(2, &i.Invalid) stateSourceObject.Load(3, &i.RouterOnlyPacketsDroppedByHost) } func (i *ICMPv6Stats) StateTypeName() string { return "pkg/tcpip.ICMPv6Stats" } func (i *ICMPv6Stats) StateFields() []string { return []string{ "PacketsSent", "PacketsReceived", } } func (i *ICMPv6Stats) beforeSave() {} // +checklocksignore func (i *ICMPv6Stats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.PacketsSent) stateSinkObject.Save(1, &i.PacketsReceived) } func (i *ICMPv6Stats) afterLoad(context.Context) {} // +checklocksignore func (i *ICMPv6Stats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.PacketsSent) stateSourceObject.Load(1, &i.PacketsReceived) } func (i *ICMPStats) StateTypeName() string { return "pkg/tcpip.ICMPStats" } func (i *ICMPStats) StateFields() []string { return []string{ "V4", "V6", } } func (i *ICMPStats) beforeSave() {} // +checklocksignore func (i *ICMPStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.V4) stateSinkObject.Save(1, &i.V6) } func (i *ICMPStats) afterLoad(context.Context) {} // +checklocksignore func (i *ICMPStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.V4) stateSourceObject.Load(1, &i.V6) } func (i *IGMPPacketStats) StateTypeName() string { return "pkg/tcpip.IGMPPacketStats" } func (i *IGMPPacketStats) StateFields() []string { return []string{ "MembershipQuery", "V1MembershipReport", "V2MembershipReport", "V3MembershipReport", "LeaveGroup", } } func (i *IGMPPacketStats) beforeSave() {} // +checklocksignore func (i *IGMPPacketStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.MembershipQuery) stateSinkObject.Save(1, &i.V1MembershipReport) stateSinkObject.Save(2, &i.V2MembershipReport) stateSinkObject.Save(3, &i.V3MembershipReport) stateSinkObject.Save(4, &i.LeaveGroup) } func (i *IGMPPacketStats) afterLoad(context.Context) {} // +checklocksignore func (i *IGMPPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.MembershipQuery) stateSourceObject.Load(1, &i.V1MembershipReport) stateSourceObject.Load(2, &i.V2MembershipReport) stateSourceObject.Load(3, &i.V3MembershipReport) stateSourceObject.Load(4, &i.LeaveGroup) } func (i *IGMPSentPacketStats) StateTypeName() string { return 
"pkg/tcpip.IGMPSentPacketStats" } func (i *IGMPSentPacketStats) StateFields() []string { return []string{ "IGMPPacketStats", "Dropped", } } func (i *IGMPSentPacketStats) beforeSave() {} // +checklocksignore func (i *IGMPSentPacketStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.IGMPPacketStats) stateSinkObject.Save(1, &i.Dropped) } func (i *IGMPSentPacketStats) afterLoad(context.Context) {} // +checklocksignore func (i *IGMPSentPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.IGMPPacketStats) stateSourceObject.Load(1, &i.Dropped) } func (i *IGMPReceivedPacketStats) StateTypeName() string { return "pkg/tcpip.IGMPReceivedPacketStats" } func (i *IGMPReceivedPacketStats) StateFields() []string { return []string{ "IGMPPacketStats", "Invalid", "ChecksumErrors", "Unrecognized", } } func (i *IGMPReceivedPacketStats) beforeSave() {} // +checklocksignore func (i *IGMPReceivedPacketStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.IGMPPacketStats) stateSinkObject.Save(1, &i.Invalid) stateSinkObject.Save(2, &i.ChecksumErrors) stateSinkObject.Save(3, &i.Unrecognized) } func (i *IGMPReceivedPacketStats) afterLoad(context.Context) {} // +checklocksignore func (i *IGMPReceivedPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.IGMPPacketStats) stateSourceObject.Load(1, &i.Invalid) stateSourceObject.Load(2, &i.ChecksumErrors) stateSourceObject.Load(3, &i.Unrecognized) } func (i *IGMPStats) StateTypeName() string { return "pkg/tcpip.IGMPStats" } func (i *IGMPStats) StateFields() []string { return []string{ "PacketsSent", "PacketsReceived", } } func (i *IGMPStats) beforeSave() {} // +checklocksignore func (i *IGMPStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.PacketsSent) stateSinkObject.Save(1, &i.PacketsReceived) } func (i *IGMPStats) afterLoad(context.Context) {} // +checklocksignore func (i *IGMPStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.PacketsSent) stateSourceObject.Load(1, &i.PacketsReceived) } func (i *IPForwardingStats) StateTypeName() string { return "pkg/tcpip.IPForwardingStats" } func (i *IPForwardingStats) StateFields() []string { return []string{ "Unrouteable", "ExhaustedTTL", "InitializingSource", "LinkLocalSource", "LinkLocalDestination", "PacketTooBig", "HostUnreachable", "ExtensionHeaderProblem", "UnexpectedMulticastInputInterface", "UnknownOutputEndpoint", "NoMulticastPendingQueueBufferSpace", "OutgoingDeviceNoBufferSpace", "Errors", } } func (i *IPForwardingStats) beforeSave() {} // +checklocksignore func (i *IPForwardingStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.Unrouteable) stateSinkObject.Save(1, &i.ExhaustedTTL) stateSinkObject.Save(2, &i.InitializingSource) stateSinkObject.Save(3, &i.LinkLocalSource) stateSinkObject.Save(4, &i.LinkLocalDestination) stateSinkObject.Save(5, &i.PacketTooBig) stateSinkObject.Save(6, &i.HostUnreachable) stateSinkObject.Save(7, &i.ExtensionHeaderProblem) stateSinkObject.Save(8, &i.UnexpectedMulticastInputInterface) stateSinkObject.Save(9, &i.UnknownOutputEndpoint) stateSinkObject.Save(10, &i.NoMulticastPendingQueueBufferSpace) stateSinkObject.Save(11, &i.OutgoingDeviceNoBufferSpace) stateSinkObject.Save(12, &i.Errors) } func (i *IPForwardingStats) afterLoad(context.Context) {} // +checklocksignore func (i *IPForwardingStats) 
StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.Unrouteable) stateSourceObject.Load(1, &i.ExhaustedTTL) stateSourceObject.Load(2, &i.InitializingSource) stateSourceObject.Load(3, &i.LinkLocalSource) stateSourceObject.Load(4, &i.LinkLocalDestination) stateSourceObject.Load(5, &i.PacketTooBig) stateSourceObject.Load(6, &i.HostUnreachable) stateSourceObject.Load(7, &i.ExtensionHeaderProblem) stateSourceObject.Load(8, &i.UnexpectedMulticastInputInterface) stateSourceObject.Load(9, &i.UnknownOutputEndpoint) stateSourceObject.Load(10, &i.NoMulticastPendingQueueBufferSpace) stateSourceObject.Load(11, &i.OutgoingDeviceNoBufferSpace) stateSourceObject.Load(12, &i.Errors) } func (i *IPStats) StateTypeName() string { return "pkg/tcpip.IPStats" } func (i *IPStats) StateFields() []string { return []string{ "PacketsReceived", "ValidPacketsReceived", "DisabledPacketsReceived", "InvalidDestinationAddressesReceived", "InvalidSourceAddressesReceived", "PacketsDelivered", "PacketsSent", "OutgoingPacketErrors", "MalformedPacketsReceived", "MalformedFragmentsReceived", "IPTablesPreroutingDropped", "IPTablesInputDropped", "IPTablesForwardDropped", "IPTablesOutputDropped", "IPTablesPostroutingDropped", "OptionTimestampReceived", "OptionRecordRouteReceived", "OptionRouterAlertReceived", "OptionUnknownReceived", "Forwarding", } } func (i *IPStats) beforeSave() {} // +checklocksignore func (i *IPStats) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.PacketsReceived) stateSinkObject.Save(1, &i.ValidPacketsReceived) stateSinkObject.Save(2, &i.DisabledPacketsReceived) stateSinkObject.Save(3, &i.InvalidDestinationAddressesReceived) stateSinkObject.Save(4, &i.InvalidSourceAddressesReceived) stateSinkObject.Save(5, &i.PacketsDelivered) stateSinkObject.Save(6, &i.PacketsSent) stateSinkObject.Save(7, &i.OutgoingPacketErrors) stateSinkObject.Save(8, &i.MalformedPacketsReceived) stateSinkObject.Save(9, &i.MalformedFragmentsReceived) stateSinkObject.Save(10, &i.IPTablesPreroutingDropped) stateSinkObject.Save(11, &i.IPTablesInputDropped) stateSinkObject.Save(12, &i.IPTablesForwardDropped) stateSinkObject.Save(13, &i.IPTablesOutputDropped) stateSinkObject.Save(14, &i.IPTablesPostroutingDropped) stateSinkObject.Save(15, &i.OptionTimestampReceived) stateSinkObject.Save(16, &i.OptionRecordRouteReceived) stateSinkObject.Save(17, &i.OptionRouterAlertReceived) stateSinkObject.Save(18, &i.OptionUnknownReceived) stateSinkObject.Save(19, &i.Forwarding) } func (i *IPStats) afterLoad(context.Context) {} // +checklocksignore func (i *IPStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &i.PacketsReceived) stateSourceObject.Load(1, &i.ValidPacketsReceived) stateSourceObject.Load(2, &i.DisabledPacketsReceived) stateSourceObject.Load(3, &i.InvalidDestinationAddressesReceived) stateSourceObject.Load(4, &i.InvalidSourceAddressesReceived) stateSourceObject.Load(5, &i.PacketsDelivered) stateSourceObject.Load(6, &i.PacketsSent) stateSourceObject.Load(7, &i.OutgoingPacketErrors) stateSourceObject.Load(8, &i.MalformedPacketsReceived) stateSourceObject.Load(9, &i.MalformedFragmentsReceived) stateSourceObject.Load(10, &i.IPTablesPreroutingDropped) stateSourceObject.Load(11, &i.IPTablesInputDropped) stateSourceObject.Load(12, &i.IPTablesForwardDropped) stateSourceObject.Load(13, &i.IPTablesOutputDropped) stateSourceObject.Load(14, &i.IPTablesPostroutingDropped) stateSourceObject.Load(15, &i.OptionTimestampReceived) 
stateSourceObject.Load(16, &i.OptionRecordRouteReceived) stateSourceObject.Load(17, &i.OptionRouterAlertReceived) stateSourceObject.Load(18, &i.OptionUnknownReceived) stateSourceObject.Load(19, &i.Forwarding) } func (a *ARPStats) StateTypeName() string { return "pkg/tcpip.ARPStats" } func (a *ARPStats) StateFields() []string { return []string{ "PacketsReceived", "DisabledPacketsReceived", "MalformedPacketsReceived", "RequestsReceived", "RequestsReceivedUnknownTargetAddress", "OutgoingRequestInterfaceHasNoLocalAddressErrors", "OutgoingRequestBadLocalAddressErrors", "OutgoingRequestsDropped", "OutgoingRequestsSent", "RepliesReceived", "OutgoingRepliesDropped", "OutgoingRepliesSent", } } func (a *ARPStats) beforeSave() {} // +checklocksignore func (a *ARPStats) StateSave(stateSinkObject state.Sink) { a.beforeSave() stateSinkObject.Save(0, &a.PacketsReceived) stateSinkObject.Save(1, &a.DisabledPacketsReceived) stateSinkObject.Save(2, &a.MalformedPacketsReceived) stateSinkObject.Save(3, &a.RequestsReceived) stateSinkObject.Save(4, &a.RequestsReceivedUnknownTargetAddress) stateSinkObject.Save(5, &a.OutgoingRequestInterfaceHasNoLocalAddressErrors) stateSinkObject.Save(6, &a.OutgoingRequestBadLocalAddressErrors) stateSinkObject.Save(7, &a.OutgoingRequestsDropped) stateSinkObject.Save(8, &a.OutgoingRequestsSent) stateSinkObject.Save(9, &a.RepliesReceived) stateSinkObject.Save(10, &a.OutgoingRepliesDropped) stateSinkObject.Save(11, &a.OutgoingRepliesSent) } func (a *ARPStats) afterLoad(context.Context) {} // +checklocksignore func (a *ARPStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.PacketsReceived) stateSourceObject.Load(1, &a.DisabledPacketsReceived) stateSourceObject.Load(2, &a.MalformedPacketsReceived) stateSourceObject.Load(3, &a.RequestsReceived) stateSourceObject.Load(4, &a.RequestsReceivedUnknownTargetAddress) stateSourceObject.Load(5, &a.OutgoingRequestInterfaceHasNoLocalAddressErrors) stateSourceObject.Load(6, &a.OutgoingRequestBadLocalAddressErrors) stateSourceObject.Load(7, &a.OutgoingRequestsDropped) stateSourceObject.Load(8, &a.OutgoingRequestsSent) stateSourceObject.Load(9, &a.RepliesReceived) stateSourceObject.Load(10, &a.OutgoingRepliesDropped) stateSourceObject.Load(11, &a.OutgoingRepliesSent) } func (t *TCPStats) StateTypeName() string { return "pkg/tcpip.TCPStats" } func (t *TCPStats) StateFields() []string { return []string{ "ActiveConnectionOpenings", "PassiveConnectionOpenings", "CurrentEstablished", "CurrentConnected", "EstablishedResets", "EstablishedClosed", "EstablishedTimedout", "ListenOverflowSynDrop", "ListenOverflowAckDrop", "ListenOverflowSynCookieSent", "ListenOverflowSynCookieRcvd", "ListenOverflowInvalidSynCookieRcvd", "FailedConnectionAttempts", "ValidSegmentsReceived", "InvalidSegmentsReceived", "SegmentsSent", "SegmentSendErrors", "ResetsSent", "ResetsReceived", "Retransmits", "FastRecovery", "SACKRecovery", "TLPRecovery", "SlowStartRetransmits", "FastRetransmit", "Timeouts", "ChecksumErrors", "FailedPortReservations", "SegmentsAckedWithDSACK", "SpuriousRecovery", "SpuriousRTORecovery", "ForwardMaxInFlightDrop", } } func (t *TCPStats) beforeSave() {} // +checklocksignore func (t *TCPStats) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.ActiveConnectionOpenings) stateSinkObject.Save(1, &t.PassiveConnectionOpenings) stateSinkObject.Save(2, &t.CurrentEstablished) stateSinkObject.Save(3, &t.CurrentConnected) stateSinkObject.Save(4, &t.EstablishedResets) stateSinkObject.Save(5, 
&t.EstablishedClosed) stateSinkObject.Save(6, &t.EstablishedTimedout) stateSinkObject.Save(7, &t.ListenOverflowSynDrop) stateSinkObject.Save(8, &t.ListenOverflowAckDrop) stateSinkObject.Save(9, &t.ListenOverflowSynCookieSent) stateSinkObject.Save(10, &t.ListenOverflowSynCookieRcvd) stateSinkObject.Save(11, &t.ListenOverflowInvalidSynCookieRcvd) stateSinkObject.Save(12, &t.FailedConnectionAttempts) stateSinkObject.Save(13, &t.ValidSegmentsReceived) stateSinkObject.Save(14, &t.InvalidSegmentsReceived) stateSinkObject.Save(15, &t.SegmentsSent) stateSinkObject.Save(16, &t.SegmentSendErrors) stateSinkObject.Save(17, &t.ResetsSent) stateSinkObject.Save(18, &t.ResetsReceived) stateSinkObject.Save(19, &t.Retransmits) stateSinkObject.Save(20, &t.FastRecovery) stateSinkObject.Save(21, &t.SACKRecovery) stateSinkObject.Save(22, &t.TLPRecovery) stateSinkObject.Save(23, &t.SlowStartRetransmits) stateSinkObject.Save(24, &t.FastRetransmit) stateSinkObject.Save(25, &t.Timeouts) stateSinkObject.Save(26, &t.ChecksumErrors) stateSinkObject.Save(27, &t.FailedPortReservations) stateSinkObject.Save(28, &t.SegmentsAckedWithDSACK) stateSinkObject.Save(29, &t.SpuriousRecovery) stateSinkObject.Save(30, &t.SpuriousRTORecovery) stateSinkObject.Save(31, &t.ForwardMaxInFlightDrop) } func (t *TCPStats) afterLoad(context.Context) {} // +checklocksignore func (t *TCPStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.ActiveConnectionOpenings) stateSourceObject.Load(1, &t.PassiveConnectionOpenings) stateSourceObject.Load(2, &t.CurrentEstablished) stateSourceObject.Load(3, &t.CurrentConnected) stateSourceObject.Load(4, &t.EstablishedResets) stateSourceObject.Load(5, &t.EstablishedClosed) stateSourceObject.Load(6, &t.EstablishedTimedout) stateSourceObject.Load(7, &t.ListenOverflowSynDrop) stateSourceObject.Load(8, &t.ListenOverflowAckDrop) stateSourceObject.Load(9, &t.ListenOverflowSynCookieSent) stateSourceObject.Load(10, &t.ListenOverflowSynCookieRcvd) stateSourceObject.Load(11, &t.ListenOverflowInvalidSynCookieRcvd) stateSourceObject.Load(12, &t.FailedConnectionAttempts) stateSourceObject.Load(13, &t.ValidSegmentsReceived) stateSourceObject.Load(14, &t.InvalidSegmentsReceived) stateSourceObject.Load(15, &t.SegmentsSent) stateSourceObject.Load(16, &t.SegmentSendErrors) stateSourceObject.Load(17, &t.ResetsSent) stateSourceObject.Load(18, &t.ResetsReceived) stateSourceObject.Load(19, &t.Retransmits) stateSourceObject.Load(20, &t.FastRecovery) stateSourceObject.Load(21, &t.SACKRecovery) stateSourceObject.Load(22, &t.TLPRecovery) stateSourceObject.Load(23, &t.SlowStartRetransmits) stateSourceObject.Load(24, &t.FastRetransmit) stateSourceObject.Load(25, &t.Timeouts) stateSourceObject.Load(26, &t.ChecksumErrors) stateSourceObject.Load(27, &t.FailedPortReservations) stateSourceObject.Load(28, &t.SegmentsAckedWithDSACK) stateSourceObject.Load(29, &t.SpuriousRecovery) stateSourceObject.Load(30, &t.SpuriousRTORecovery) stateSourceObject.Load(31, &t.ForwardMaxInFlightDrop) } func (u *UDPStats) StateTypeName() string { return "pkg/tcpip.UDPStats" } func (u *UDPStats) StateFields() []string { return []string{ "PacketsReceived", "UnknownPortErrors", "ReceiveBufferErrors", "MalformedPacketsReceived", "PacketsSent", "PacketSendErrors", "ChecksumErrors", } } func (u *UDPStats) beforeSave() {} // +checklocksignore func (u *UDPStats) StateSave(stateSinkObject state.Sink) { u.beforeSave() stateSinkObject.Save(0, &u.PacketsReceived) stateSinkObject.Save(1, &u.UnknownPortErrors) 
stateSinkObject.Save(2, &u.ReceiveBufferErrors) stateSinkObject.Save(3, &u.MalformedPacketsReceived) stateSinkObject.Save(4, &u.PacketsSent) stateSinkObject.Save(5, &u.PacketSendErrors) stateSinkObject.Save(6, &u.ChecksumErrors) } func (u *UDPStats) afterLoad(context.Context) {} // +checklocksignore func (u *UDPStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &u.PacketsReceived) stateSourceObject.Load(1, &u.UnknownPortErrors) stateSourceObject.Load(2, &u.ReceiveBufferErrors) stateSourceObject.Load(3, &u.MalformedPacketsReceived) stateSourceObject.Load(4, &u.PacketsSent) stateSourceObject.Load(5, &u.PacketSendErrors) stateSourceObject.Load(6, &u.ChecksumErrors) } func (n *NICNeighborStats) StateTypeName() string { return "pkg/tcpip.NICNeighborStats" } func (n *NICNeighborStats) StateFields() []string { return []string{ "UnreachableEntryLookups", "DroppedConfirmationForNoninitiatedNeighbor", "DroppedInvalidLinkAddressConfirmations", } } func (n *NICNeighborStats) beforeSave() {} // +checklocksignore func (n *NICNeighborStats) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.UnreachableEntryLookups) stateSinkObject.Save(1, &n.DroppedConfirmationForNoninitiatedNeighbor) stateSinkObject.Save(2, &n.DroppedInvalidLinkAddressConfirmations) } func (n *NICNeighborStats) afterLoad(context.Context) {} // +checklocksignore func (n *NICNeighborStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.UnreachableEntryLookups) stateSourceObject.Load(1, &n.DroppedConfirmationForNoninitiatedNeighbor) stateSourceObject.Load(2, &n.DroppedInvalidLinkAddressConfirmations) } func (n *NICPacketStats) StateTypeName() string { return "pkg/tcpip.NICPacketStats" } func (n *NICPacketStats) StateFields() []string { return []string{ "Packets", "Bytes", } } func (n *NICPacketStats) beforeSave() {} // +checklocksignore func (n *NICPacketStats) StateSave(stateSinkObject state.Sink) { n.beforeSave() stateSinkObject.Save(0, &n.Packets) stateSinkObject.Save(1, &n.Bytes) } func (n *NICPacketStats) afterLoad(context.Context) {} // +checklocksignore func (n *NICPacketStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &n.Packets) stateSourceObject.Load(1, &n.Bytes) } func (m *IntegralStatCounterMap) StateTypeName() string { return "pkg/tcpip.IntegralStatCounterMap" } func (m *IntegralStatCounterMap) StateFields() []string { return []string{ "counterMap", } } func (m *IntegralStatCounterMap) beforeSave() {} // +checklocksignore func (m *IntegralStatCounterMap) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.counterMap) } func (m *IntegralStatCounterMap) afterLoad(context.Context) {} // +checklocksignore func (m *IntegralStatCounterMap) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.counterMap) } func (m *MultiIntegralStatCounterMap) StateTypeName() string { return "pkg/tcpip.MultiIntegralStatCounterMap" } func (m *MultiIntegralStatCounterMap) StateFields() []string { return []string{ "a", "b", } } func (m *MultiIntegralStatCounterMap) beforeSave() {} // +checklocksignore func (m *MultiIntegralStatCounterMap) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.a) stateSinkObject.Save(1, &m.b) } func (m *MultiIntegralStatCounterMap) afterLoad(context.Context) {} // +checklocksignore func (m *MultiIntegralStatCounterMap) StateLoad(ctx context.Context, 
stateSourceObject state.Source) { stateSourceObject.Load(0, &m.a) stateSourceObject.Load(1, &m.b) } func (s *NICStats) StateTypeName() string { return "pkg/tcpip.NICStats" } func (s *NICStats) StateFields() []string { return []string{ "UnknownL3ProtocolRcvdPacketCounts", "UnknownL4ProtocolRcvdPacketCounts", "MalformedL4RcvdPackets", "Tx", "TxPacketsDroppedNoBufferSpace", "Rx", "DisabledRx", "Neighbor", } } func (s *NICStats) beforeSave() {} // +checklocksignore func (s *NICStats) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.UnknownL3ProtocolRcvdPacketCounts) stateSinkObject.Save(1, &s.UnknownL4ProtocolRcvdPacketCounts) stateSinkObject.Save(2, &s.MalformedL4RcvdPackets) stateSinkObject.Save(3, &s.Tx) stateSinkObject.Save(4, &s.TxPacketsDroppedNoBufferSpace) stateSinkObject.Save(5, &s.Rx) stateSinkObject.Save(6, &s.DisabledRx) stateSinkObject.Save(7, &s.Neighbor) } func (s *NICStats) afterLoad(context.Context) {} // +checklocksignore func (s *NICStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.UnknownL3ProtocolRcvdPacketCounts) stateSourceObject.Load(1, &s.UnknownL4ProtocolRcvdPacketCounts) stateSourceObject.Load(2, &s.MalformedL4RcvdPackets) stateSourceObject.Load(3, &s.Tx) stateSourceObject.Load(4, &s.TxPacketsDroppedNoBufferSpace) stateSourceObject.Load(5, &s.Rx) stateSourceObject.Load(6, &s.DisabledRx) stateSourceObject.Load(7, &s.Neighbor) } func (s *Stats) StateTypeName() string { return "pkg/tcpip.Stats" } func (s *Stats) StateFields() []string { return []string{ "DroppedPackets", "NICs", "ICMP", "IGMP", "IP", "ARP", "TCP", "UDP", } } func (s *Stats) beforeSave() {} // +checklocksignore func (s *Stats) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.DroppedPackets) stateSinkObject.Save(1, &s.NICs) stateSinkObject.Save(2, &s.ICMP) stateSinkObject.Save(3, &s.IGMP) stateSinkObject.Save(4, &s.IP) stateSinkObject.Save(5, &s.ARP) stateSinkObject.Save(6, &s.TCP) stateSinkObject.Save(7, &s.UDP) } func (s *Stats) afterLoad(context.Context) {} // +checklocksignore func (s *Stats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.DroppedPackets) stateSourceObject.Load(1, &s.NICs) stateSourceObject.Load(2, &s.ICMP) stateSourceObject.Load(3, &s.IGMP) stateSourceObject.Load(4, &s.IP) stateSourceObject.Load(5, &s.ARP) stateSourceObject.Load(6, &s.TCP) stateSourceObject.Load(7, &s.UDP) } func (r *ReceiveErrors) StateTypeName() string { return "pkg/tcpip.ReceiveErrors" } func (r *ReceiveErrors) StateFields() []string { return []string{ "ReceiveBufferOverflow", "MalformedPacketsReceived", "ClosedReceiver", "ChecksumErrors", } } func (r *ReceiveErrors) beforeSave() {} // +checklocksignore func (r *ReceiveErrors) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.ReceiveBufferOverflow) stateSinkObject.Save(1, &r.MalformedPacketsReceived) stateSinkObject.Save(2, &r.ClosedReceiver) stateSinkObject.Save(3, &r.ChecksumErrors) } func (r *ReceiveErrors) afterLoad(context.Context) {} // +checklocksignore func (r *ReceiveErrors) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.ReceiveBufferOverflow) stateSourceObject.Load(1, &r.MalformedPacketsReceived) stateSourceObject.Load(2, &r.ClosedReceiver) stateSourceObject.Load(3, &r.ChecksumErrors) } func (s *SendErrors) StateTypeName() string { return "pkg/tcpip.SendErrors" } func (s *SendErrors) StateFields() []string { return 
[]string{ "SendToNetworkFailed", "NoRoute", } } func (s *SendErrors) beforeSave() {} // +checklocksignore func (s *SendErrors) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.SendToNetworkFailed) stateSinkObject.Save(1, &s.NoRoute) } func (s *SendErrors) afterLoad(context.Context) {} // +checklocksignore func (s *SendErrors) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.SendToNetworkFailed) stateSourceObject.Load(1, &s.NoRoute) } func (r *ReadErrors) StateTypeName() string { return "pkg/tcpip.ReadErrors" } func (r *ReadErrors) StateFields() []string { return []string{ "ReadClosed", "InvalidEndpointState", "NotConnected", } } func (r *ReadErrors) beforeSave() {} // +checklocksignore func (r *ReadErrors) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.ReadClosed) stateSinkObject.Save(1, &r.InvalidEndpointState) stateSinkObject.Save(2, &r.NotConnected) } func (r *ReadErrors) afterLoad(context.Context) {} // +checklocksignore func (r *ReadErrors) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.ReadClosed) stateSourceObject.Load(1, &r.InvalidEndpointState) stateSourceObject.Load(2, &r.NotConnected) } func (w *WriteErrors) StateTypeName() string { return "pkg/tcpip.WriteErrors" } func (w *WriteErrors) StateFields() []string { return []string{ "WriteClosed", "InvalidEndpointState", "InvalidArgs", } } func (w *WriteErrors) beforeSave() {} // +checklocksignore func (w *WriteErrors) StateSave(stateSinkObject state.Sink) { w.beforeSave() stateSinkObject.Save(0, &w.WriteClosed) stateSinkObject.Save(1, &w.InvalidEndpointState) stateSinkObject.Save(2, &w.InvalidArgs) } func (w *WriteErrors) afterLoad(context.Context) {} // +checklocksignore func (w *WriteErrors) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &w.WriteClosed) stateSourceObject.Load(1, &w.InvalidEndpointState) stateSourceObject.Load(2, &w.InvalidArgs) } func (src *TransportEndpointStats) StateTypeName() string { return "pkg/tcpip.TransportEndpointStats" } func (src *TransportEndpointStats) StateFields() []string { return []string{ "PacketsReceived", "PacketsSent", "ReceiveErrors", "ReadErrors", "SendErrors", "WriteErrors", } } func (src *TransportEndpointStats) beforeSave() {} // +checklocksignore func (src *TransportEndpointStats) StateSave(stateSinkObject state.Sink) { src.beforeSave() stateSinkObject.Save(0, &src.PacketsReceived) stateSinkObject.Save(1, &src.PacketsSent) stateSinkObject.Save(2, &src.ReceiveErrors) stateSinkObject.Save(3, &src.ReadErrors) stateSinkObject.Save(4, &src.SendErrors) stateSinkObject.Save(5, &src.WriteErrors) } func (src *TransportEndpointStats) afterLoad(context.Context) {} // +checklocksignore func (src *TransportEndpointStats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &src.PacketsReceived) stateSourceObject.Load(1, &src.PacketsSent) stateSourceObject.Load(2, &src.ReceiveErrors) stateSourceObject.Load(3, &src.ReadErrors) stateSourceObject.Load(4, &src.SendErrors) stateSourceObject.Load(5, &src.WriteErrors) } func (a *AddressWithPrefix) StateTypeName() string { return "pkg/tcpip.AddressWithPrefix" } func (a *AddressWithPrefix) StateFields() []string { return []string{ "Address", "PrefixLen", } } func (a *AddressWithPrefix) beforeSave() {} // +checklocksignore func (a *AddressWithPrefix) StateSave(stateSinkObject state.Sink) { a.beforeSave() 
stateSinkObject.Save(0, &a.Address) stateSinkObject.Save(1, &a.PrefixLen) } func (a *AddressWithPrefix) afterLoad(context.Context) {} // +checklocksignore func (a *AddressWithPrefix) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &a.Address) stateSourceObject.Load(1, &a.PrefixLen) } func (p *ProtocolAddress) StateTypeName() string { return "pkg/tcpip.ProtocolAddress" } func (p *ProtocolAddress) StateFields() []string { return []string{ "Protocol", "AddressWithPrefix", } } func (p *ProtocolAddress) beforeSave() {} // +checklocksignore func (p *ProtocolAddress) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.Protocol) stateSinkObject.Save(1, &p.AddressWithPrefix) } func (p *ProtocolAddress) afterLoad(context.Context) {} // +checklocksignore func (p *ProtocolAddress) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.Protocol) stateSourceObject.Load(1, &p.AddressWithPrefix) } func (j *jobInstance) StateTypeName() string { return "pkg/tcpip.jobInstance" } func (j *jobInstance) StateFields() []string { return []string{ "timer", "earlyReturn", } } func (j *jobInstance) beforeSave() {} // +checklocksignore func (j *jobInstance) StateSave(stateSinkObject state.Sink) { j.beforeSave() stateSinkObject.Save(0, &j.timer) stateSinkObject.Save(1, &j.earlyReturn) } func (j *jobInstance) afterLoad(context.Context) {} // +checklocksignore func (j *jobInstance) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &j.timer) stateSourceObject.Load(1, &j.earlyReturn) } func (j *Job) StateTypeName() string { return "pkg/tcpip.Job" } func (j *Job) StateFields() []string { return []string{ "clock", "instance", } } func (j *Job) beforeSave() {} // +checklocksignore func (j *Job) StateSave(stateSinkObject state.Sink) { j.beforeSave() stateSinkObject.Save(0, &j.clock) stateSinkObject.Save(1, &j.instance) } func (j *Job) afterLoad(context.Context) {} // +checklocksignore func (j *Job) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &j.clock) stateSourceObject.Load(1, &j.instance) } func init() { state.Register((*ErrAborted)(nil)) state.Register((*ErrAddressFamilyNotSupported)(nil)) state.Register((*ErrAlreadyBound)(nil)) state.Register((*ErrAlreadyConnected)(nil)) state.Register((*ErrAlreadyConnecting)(nil)) state.Register((*ErrBadAddress)(nil)) state.Register((*ErrBadBuffer)(nil)) state.Register((*ErrBadLocalAddress)(nil)) state.Register((*ErrBroadcastDisabled)(nil)) state.Register((*ErrClosedForReceive)(nil)) state.Register((*ErrClosedForSend)(nil)) state.Register((*ErrConnectStarted)(nil)) state.Register((*ErrConnectionAborted)(nil)) state.Register((*ErrConnectionRefused)(nil)) state.Register((*ErrConnectionReset)(nil)) state.Register((*ErrDestinationRequired)(nil)) state.Register((*ErrDuplicateAddress)(nil)) state.Register((*ErrDuplicateNICID)(nil)) state.Register((*ErrInvalidNICID)(nil)) state.Register((*ErrInvalidEndpointState)(nil)) state.Register((*ErrInvalidOptionValue)(nil)) state.Register((*ErrInvalidPortRange)(nil)) state.Register((*ErrMalformedHeader)(nil)) state.Register((*ErrMessageTooLong)(nil)) state.Register((*ErrNetworkUnreachable)(nil)) state.Register((*ErrNoBufferSpace)(nil)) state.Register((*ErrNoPortAvailable)(nil)) state.Register((*ErrHostUnreachable)(nil)) state.Register((*ErrHostDown)(nil)) state.Register((*ErrNoNet)(nil)) state.Register((*ErrNoSuchFile)(nil)) 
state.Register((*ErrNotConnected)(nil)) state.Register((*ErrNotPermitted)(nil)) state.Register((*ErrNotSupported)(nil)) state.Register((*ErrPortInUse)(nil)) state.Register((*ErrQueueSizeNotSupported)(nil)) state.Register((*ErrTimeout)(nil)) state.Register((*ErrUnknownDevice)(nil)) state.Register((*ErrUnknownNICID)(nil)) state.Register((*ErrUnknownProtocol)(nil)) state.Register((*ErrUnknownProtocolOption)(nil)) state.Register((*ErrWouldBlock)(nil)) state.Register((*ErrMissingRequiredFields)(nil)) state.Register((*ErrMulticastInputCannotBeOutput)(nil)) state.Register((*RouteList)(nil)) state.Register((*RouteEntry)(nil)) state.Register((*sockErrorList)(nil)) state.Register((*sockErrorEntry)(nil)) state.Register((*SocketOptions)(nil)) state.Register((*LocalSockError)(nil)) state.Register((*SockError)(nil)) state.Register((*stdClock)(nil)) state.Register((*stdTimer)(nil)) state.Register((*MonotonicTime)(nil)) state.Register((*Address)(nil)) state.Register((*AddressMask)(nil)) state.Register((*Subnet)(nil)) state.Register((*FullAddress)(nil)) state.Register((*SendableControlMessages)(nil)) state.Register((*ReceivableControlMessages)(nil)) state.Register((*LinkPacketInfo)(nil)) state.Register((*TCPSendBufferSizeRangeOption)(nil)) state.Register((*TCPReceiveBufferSizeRangeOption)(nil)) state.Register((*ICMPv6Filter)(nil)) state.Register((*LingerOption)(nil)) state.Register((*IPPacketInfo)(nil)) state.Register((*IPv6PacketInfo)(nil)) state.Register((*SendBufferSizeOption)(nil)) state.Register((*ReceiveBufferSizeOption)(nil)) state.Register((*Route)(nil)) state.Register((*StatCounter)(nil)) state.Register((*MultiCounterStat)(nil)) state.Register((*ICMPv4PacketStats)(nil)) state.Register((*ICMPv4SentPacketStats)(nil)) state.Register((*ICMPv4ReceivedPacketStats)(nil)) state.Register((*ICMPv4Stats)(nil)) state.Register((*ICMPv6PacketStats)(nil)) state.Register((*ICMPv6SentPacketStats)(nil)) state.Register((*ICMPv6ReceivedPacketStats)(nil)) state.Register((*ICMPv6Stats)(nil)) state.Register((*ICMPStats)(nil)) state.Register((*IGMPPacketStats)(nil)) state.Register((*IGMPSentPacketStats)(nil)) state.Register((*IGMPReceivedPacketStats)(nil)) state.Register((*IGMPStats)(nil)) state.Register((*IPForwardingStats)(nil)) state.Register((*IPStats)(nil)) state.Register((*ARPStats)(nil)) state.Register((*TCPStats)(nil)) state.Register((*UDPStats)(nil)) state.Register((*NICNeighborStats)(nil)) state.Register((*NICPacketStats)(nil)) state.Register((*IntegralStatCounterMap)(nil)) state.Register((*MultiIntegralStatCounterMap)(nil)) state.Register((*NICStats)(nil)) state.Register((*Stats)(nil)) state.Register((*ReceiveErrors)(nil)) state.Register((*SendErrors)(nil)) state.Register((*ReadErrors)(nil)) state.Register((*WriteErrors)(nil)) state.Register((*TransportEndpointStats)(nil)) state.Register((*AddressWithPrefix)(nil)) state.Register((*ProtocolAddress)(nil)) state.Register((*jobInstance)(nil)) state.Register((*Job)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/timer.go000066400000000000000000000155671465435605700222300ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcpip import ( "time" "gvisor.dev/gvisor/pkg/sync" ) // jobInstance is a specific instance of Job. // // Different instances are created each time Job is scheduled so each timer has // its own earlyReturn signal. This is to address a bug when a Job is stopped // and reset in quick succession resulting in a timer instance's earlyReturn // signal being affected or seen by another timer instance. // // Consider the following scenario where timer instances share a common // earlyReturn signal (T1 creates, stops and resets a Cancellable timer under a // lock L; T2, T3, T4 and T5 are goroutines that handle the first (A), second // (B), third (C), and fourth (D) instance of the timer firing, respectively): // // T1: Obtain L // T1: Create a new Job w/ lock L (create instance A) // T2: instance A fires, blocked trying to obtain L. // T1: Attempt to stop instance A (set earlyReturn = true) // T1: Schedule timer (create instance B) // T3: instance B fires, blocked trying to obtain L. // T1: Attempt to stop instance B (set earlyReturn = true) // T1: Schedule timer (create instance C) // T4: instance C fires, blocked trying to obtain L. // T1: Attempt to stop instance C (set earlyReturn = true) // T1: Schedule timer (create instance D) // T5: instance D fires, blocked trying to obtain L. // T1: Release L // // Now that T1 has released L, any of the 4 timer instances can take L and // check earlyReturn. If the timers simply check earlyReturn and then do // nothing further, then instance D will never early return even though it was // not requested to stop. If the timers reset earlyReturn before early // returning, then all but one of the timers will do work when only one was // expected to. If Job resets earlyReturn when resetting, then all the timers // will fire (again, when only one was expected to). // // To address the above concerns, the simplest solution was to give each timer // its own earlyReturn signal. // // +stateify savable type jobInstance struct { timer Timer // Used to inform the timer to early return when it gets stopped while the // lock the timer tries to obtain when fired is held (T1 is a goroutine that // tries to cancel the timer and T2 is the goroutine that handles the timer // firing): // T1: Obtain the lock, then call Cancel() // T2: timer fires, and gets blocked on obtaining the lock // T1: Releases lock // T2: Obtains lock, does unintended work // // To resolve this, T1 will check to see if the timer already fired, and // inform the timer using earlyReturn to return early so that once T2 obtains // the lock, it will see that it is set to true and do nothing further. earlyReturn *bool } // stop stops the job instance j from firing if it hasn't fired already. If it // has fired and is blocked at obtaining the lock, earlyReturn will be set to // true so that it will early return when it obtains the lock. func (j *jobInstance) stop() { if j.timer != nil { j.timer.Stop() *j.earlyReturn = true } } // Job represents some work that can be scheduled for execution. The work can // be safely cancelled when it fires at the same time some "related work" is // being done.
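//
// For example (an illustrative sketch, not part of the original source; the
// retransmit function is hypothetical and tcpip.NewStdClock is assumed to be
// the package's standard-clock constructor):
//
//	clock := tcpip.NewStdClock()
//	var mu sync.Mutex
//	job := tcpip.NewJob(clock, &mu, func() {
//		// The Job locks mu before invoking this function, so the function
//		// must not lock mu itself.
//		retransmit()
//	})
//
//	mu.Lock()
//	job.Cancel()              // Any in-flight instance early returns.
//	job.Schedule(time.Second) // A fresh instance gets its own earlyReturn.
//	mu.Unlock()
//
// This mirrors the Cancel/Schedule contract documented below: always Cancel
// under the lock before scheduling again.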
// // The term "related work" is defined as some work that needs to be done while // holding some lock that the timer must also hold while doing some work. // // Note, it is not safe to copy a Job as its timer instance creates // a closure over the address of the Job. // // +stateify savable type Job struct { _ sync.NoCopy // The clock used to schedule the backing timer. clock Clock // The active instance of a cancellable timer. instance jobInstance // locker is the lock taken by the timer immediately after it fires and must // be held when attempting to stop the timer. // // Must never change after being assigned. locker sync.Locker `state:"nosave"` // fn is the function that will be called when a timer fires and has not been // signaled to early return. // // fn MUST NOT attempt to lock locker. // // Must never change after being assigned. // TODO(b/341946753): Restore when netstack is savable. fn func() `state:"nosave"` } // Cancel prevents the Job from executing if it has not executed already. // // Cancel requires appropriate locking to be in place for any resources managed // by the Job. If the Job is blocked on obtaining the lock when Cancel is // called, it will early return. // // Note, j will be modified. // // j.locker MUST be locked. func (j *Job) Cancel() { j.instance.stop() // Nothing to do with the stopped instance anymore. j.instance = jobInstance{} } // Schedule schedules the Job for execution after duration d. This can be // called on cancelled or completed Jobs to schedule them again. // // Schedule should be invoked only on unscheduled, cancelled, or completed // Jobs. To be safe, callers should always call Cancel before calling Schedule. // // Note, j will be modified. func (j *Job) Schedule(d time.Duration) { // Create a new instance. earlyReturn := false // Capture the locker so that updating the timer does not cause a data race // when a timer fires and tries to obtain the lock (read the timer's locker). locker := j.locker j.instance = jobInstance{ timer: j.clock.AfterFunc(d, func() { locker.Lock() defer locker.Unlock() if earlyReturn { // If we reach this point, it means that the timer fired while another // goroutine called Cancel while it had the lock. Simply return here // and do nothing further. earlyReturn = false return } j.fn() }), earlyReturn: &earlyReturn, } } // NewJob returns a new Job that can be used to schedule f to run in its own // goroutine. l will be locked before calling f then unlocked after f returns. // // var clock tcpip.StdClock // var mu sync.Mutex // message := "foo" // job := tcpip.NewJob(&clock, &mu, func() { // fmt.Println(message) // }) // job.Schedule(time.Second) // // mu.Lock() // message = "bar" // mu.Unlock() // // // Output: bar // // f MUST NOT attempt to lock l. // // l MUST be locked prior to calling the returned job's Cancel(). // // var clock tcpip.StdClock // var mu sync.Mutex // message := "foo" // job := tcpip.NewJob(&clock, &mu, func() { // fmt.Println(message) // }) // job.Schedule(time.Second) // // mu.Lock() // job.Cancel() // mu.Unlock() func NewJob(c Clock, l sync.Locker, f func()) *Job { return &Job{ clock: c, locker: l, fn: f, } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/000077500000000000000000000000001465435605700225775ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/datagram.go000066400000000000000000000025501465435605700247100ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors.
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package transport import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" ) // DatagramEndpointState is the state of a datagram-based endpoint. type DatagramEndpointState tcpip.EndpointState // The states a datagram-based endpoint may be in. const ( _ DatagramEndpointState = iota DatagramEndpointStateInitial DatagramEndpointStateBound DatagramEndpointStateConnected DatagramEndpointStateClosed ) // String implements fmt.Stringer. func (s DatagramEndpointState) String() string { switch s { case DatagramEndpointStateInitial: return "INITIAL" case DatagramEndpointStateBound: return "BOUND" case DatagramEndpointStateConnected: return "CONNECTED" case DatagramEndpointStateClosed: return "CLOSED" default: panic(fmt.Sprintf("unhandled %[1]T variant = %[1]d", s)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/icmp/000077500000000000000000000000001465435605700235275ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/icmp/endpoint.go000066400000000000000000000565441465435605700257140ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package icmp import ( "fmt" "io" "time" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checksum" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport" "gvisor.dev/gvisor/pkg/tcpip/transport/internal/network" "gvisor.dev/gvisor/pkg/waiter" ) // +stateify savable type icmpPacket struct { icmpPacketEntry senderAddress tcpip.FullAddress packetInfo tcpip.IPPacketInfo data *stack.PacketBuffer receivedAt time.Time `state:".(int64)"` // tosOrTClass stores either the Type of Service for IPv4 or the Traffic Class // for IPv6. tosOrTClass uint8 // ttlOrHopLimit stores either the TTL for IPv4 or the HopLimit for IPv6 ttlOrHopLimit uint8 } // endpoint represents an ICMP endpoint. This struct serves as the interface // between users of the endpoint and the protocol implementation; it is legal to // have concurrent goroutines make calls into the endpoint, they are properly // synchronized. // // +stateify savable type endpoint struct { tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and are // immutable. 
stack *stack.Stack `state:"manual"` transProto tcpip.TransportProtocolNumber waiterQueue *waiter.Queue net network.Endpoint stats tcpip.TransportEndpointStats ops tcpip.SocketOptions // The following fields are used to manage the receive queue, and are // protected by rcvMu. rcvMu sync.Mutex `state:"nosave"` rcvReady bool rcvList icmpPacketList rcvBufSize int rcvClosed bool // The following fields are protected by the mu mutex. mu sync.RWMutex `state:"nosave"` // frozen indicates if the packets should be delivered to the endpoint // during restore. frozen bool ident uint16 } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { ep := &endpoint{ stack: s, transProto: transProto, waiterQueue: waiterQueue, } ep.ops.InitHandler(ep, ep.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) ep.ops.SetSendBufferSize(32*1024, false /* notify */) ep.ops.SetReceiveBufferSize(32*1024, false /* notify */) ep.net.Init(s, netProto, transProto, &ep.ops, waiterQueue) // Override with stack defaults. var ss tcpip.SendBufferSizeOption if err := s.Option(&ss); err == nil { ep.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs tcpip.ReceiveBufferSizeOption if err := s.Option(&rs); err == nil { ep.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) } return ep, nil } // WakeupWriters implements tcpip.SocketOptionsHandler. func (e *endpoint) WakeupWriters() { e.net.MaybeSignalWritable() } // Abort implements stack.TransportEndpoint.Abort. func (e *endpoint) Abort() { e.Close() } // Close puts the endpoint in a closed state and frees all resources // associated with it. func (e *endpoint) Close() { notify := func() bool { e.mu.Lock() defer e.mu.Unlock() switch state := e.net.State(); state { case transport.DatagramEndpointStateInitial: case transport.DatagramEndpointStateClosed: return false case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: info := e.net.Info() info.ID.LocalPort = e.ident e.stack.UnregisterTransportEndpoint([]tcpip.NetworkProtocolNumber{info.NetProto}, e.transProto, info.ID, e, ports.Flags{}, tcpip.NICID(e.ops.GetBindToDevice())) default: panic(fmt.Sprintf("unhandled state = %s", state)) } e.net.Shutdown() e.net.Close() e.rcvMu.Lock() defer e.rcvMu.Unlock() e.rcvClosed = true e.rcvBufSize = 0 for !e.rcvList.Empty() { p := e.rcvList.Front() e.rcvList.Remove(p) p.data.DecRef() } return true }() if notify { e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } } // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. func (*endpoint) ModerateRecvBuf(int) {} // SetOwner implements tcpip.Endpoint.SetOwner. func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.net.SetOwner(owner) } // Read implements tcpip.Endpoint.Read. func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { e.rcvMu.Lock() if e.rcvList.Empty() { var err tcpip.Error = &tcpip.ErrWouldBlock{} if e.rcvClosed { e.stats.ReadErrors.ReadClosed.Increment() err = &tcpip.ErrClosedForReceive{} } e.rcvMu.Unlock() return tcpip.ReadResult{}, err } p := e.rcvList.Front() if !opts.Peek { e.rcvList.Remove(p) defer p.data.DecRef() e.rcvBufSize -= p.data.Data().Size() } e.rcvMu.Unlock() // Control Messages // TODO(https://gvisor.dev/issue/7012): Share control message code with other // network endpoints. 
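// Illustrative sketch (not part of the original source): how a caller drains
// one datagram and its ancillary data from this endpoint. ep is a hypothetical
// tcpip.Endpoint obtained from the icmp protocol's NewEndpoint.
//
//	var buf bytes.Buffer
//	res, err := ep.Read(&buf, tcpip.ReadOptions{NeedRemoteAddr: true})
//	switch err.(type) {
//	case nil:
//		// res.Count bytes were copied into buf; res.RemoteAddr identifies the
//		// peer and res.ControlMessages carries TTL/TOS/packet info when the
//		// matching socket options are enabled.
//	case *tcpip.ErrWouldBlock:
//		// Nothing is queued; wait for waiter.ReadableEvents and retry.
//	case *tcpip.ErrClosedForReceive:
//		// The receive side has been shut down.
//	}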
cm := tcpip.ReceivableControlMessages{ HasTimestamp: true, Timestamp: p.receivedAt, } switch netProto := e.net.NetProto(); netProto { case header.IPv4ProtocolNumber: if e.ops.GetReceiveTOS() { cm.HasTOS = true cm.TOS = p.tosOrTClass } if e.ops.GetReceivePacketInfo() { cm.HasIPPacketInfo = true cm.PacketInfo = p.packetInfo } if e.ops.GetReceiveTTL() { cm.HasTTL = true cm.TTL = p.ttlOrHopLimit } case header.IPv6ProtocolNumber: if e.ops.GetReceiveTClass() { cm.HasTClass = true // Although TClass is an 8-bit value it's read in the CMsg as a uint32. cm.TClass = uint32(p.tosOrTClass) } if e.ops.GetIPv6ReceivePacketInfo() { cm.HasIPv6PacketInfo = true cm.IPv6PacketInfo = tcpip.IPv6PacketInfo{ NIC: p.packetInfo.NIC, Addr: p.packetInfo.DestinationAddr, } } if e.ops.GetReceiveHopLimit() { cm.HasHopLimit = true cm.HopLimit = p.ttlOrHopLimit } default: panic(fmt.Sprintf("unrecognized network protocol = %d", netProto)) } res := tcpip.ReadResult{ Total: p.data.Data().Size(), ControlMessages: cm, } if opts.NeedRemoteAddr { res.RemoteAddr = p.senderAddress } n, err := p.data.Data().ReadTo(dst, opts.Peek) if n == 0 && err != nil { return res, &tcpip.ErrBadBuffer{} } res.Count = n return res, nil } // prepareForWrite prepares the endpoint for sending data. In particular, it // binds it if it's still in the initial state. To do so, it must first // reacquire the mutex in exclusive mode. // // Returns true for retry if preparation should be retried. // +checklocksread:e.mu func (e *endpoint) prepareForWriteInner(to *tcpip.FullAddress) (retry bool, err tcpip.Error) { switch e.net.State() { case transport.DatagramEndpointStateInitial: case transport.DatagramEndpointStateConnected: return false, nil case transport.DatagramEndpointStateBound: if to == nil { return false, &tcpip.ErrDestinationRequired{} } return false, nil default: return false, &tcpip.ErrInvalidEndpointState{} } e.mu.RUnlock() e.mu.Lock() defer e.mu.DowngradeLock() // The state changed when we released the shared locked and re-acquired // it in exclusive mode. Try again. if e.net.State() != transport.DatagramEndpointStateInitial { return true, nil } // The state is still 'initial', so try to bind the endpoint. if err := e.bindLocked(tcpip.FullAddress{}); err != nil { return false, err } return true, nil } // Write writes data to the endpoint's peer. This method does not block // if the data cannot be written. func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { n, err := e.write(p, opts) switch err.(type) { case nil: e.stats.PacketsSent.Increment() case *tcpip.ErrMessageTooLong, *tcpip.ErrInvalidOptionValue: e.stats.WriteErrors.InvalidArgs.Increment() case *tcpip.ErrClosedForSend: e.stats.WriteErrors.WriteClosed.Increment() case *tcpip.ErrInvalidEndpointState: e.stats.WriteErrors.InvalidEndpointState.Increment() case *tcpip.ErrHostUnreachable, *tcpip.ErrBroadcastDisabled, *tcpip.ErrNetworkUnreachable: // Errors indicating any problem with IP routing of the packet. e.stats.SendErrors.NoRoute.Increment() default: // For all other errors when writing to the network layer. e.stats.SendErrors.SendToNetworkFailed.Increment() } return n, err } func (e *endpoint) prepareForWrite(opts tcpip.WriteOptions) (network.WriteContext, uint16, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() // Prepare for write. 
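// Illustrative sketch (not part of the original source): a sendto-style write
// on an endpoint that was never explicitly bound; prepareForWriteInner above
// binds such an endpoint to an ephemeral ident on first use. dst, remoteAddr
// and r (a bytes.Reader holding the ICMP payload) are hypothetical.
//
//	dst := tcpip.FullAddress{NIC: 1, Addr: remoteAddr}
//	if _, err := ep.Write(&r, tcpip.WriteOptions{To: &dst}); err != nil {
//		// Handle the error, e.g. *tcpip.ErrHostUnreachable if no route exists.
//	}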
for { retry, err := e.prepareForWriteInner(opts.To) if err != nil { return network.WriteContext{}, 0, err } if !retry { break } } ctx, err := e.net.AcquireContextForWrite(opts) return ctx, e.ident, err } func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { ctx, ident, err := e.prepareForWrite(opts) if err != nil { return 0, err } defer ctx.Release() // Prevents giant buffer allocations. if p.Len() > header.DatagramMaximumSize { return 0, &tcpip.ErrMessageTooLong{} } v := buffer.NewView(p.Len()) defer v.Release() if _, err := io.CopyN(v, p, int64(p.Len())); err != nil { return 0, &tcpip.ErrBadBuffer{} } n := v.Size() switch netProto, pktInfo := e.net.NetProto(), ctx.PacketInfo(); netProto { case header.IPv4ProtocolNumber: if err := send4(e.stack, &ctx, ident, v, pktInfo.MaxHeaderLength); err != nil { return 0, err } case header.IPv6ProtocolNumber: if err := send6(e.stack, &ctx, ident, v, pktInfo.LocalAddress, pktInfo.RemoteAddress, pktInfo.MaxHeaderLength); err != nil { return 0, err } default: panic(fmt.Sprintf("unhandled network protocol = %d", netProto)) } return int64(n), nil } var _ tcpip.SocketOptionsHandler = (*endpoint)(nil) // HasNIC implements tcpip.SocketOptionsHandler. func (e *endpoint) HasNIC(id int32) bool { return e.stack.HasNIC(tcpip.NICID(id)) } // SetSockOpt implements tcpip.Endpoint. func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { return e.net.SetSockOpt(opt) } // SetSockOptInt implements tcpip.Endpoint. func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { return e.net.SetSockOptInt(opt, v) } // GetSockOptInt implements tcpip.Endpoint. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: v := 0 e.rcvMu.Lock() if !e.rcvList.Empty() { p := e.rcvList.Front() v = p.data.Data().Size() } e.rcvMu.Unlock() return v, nil default: return e.net.GetSockOptInt(opt) } } // GetSockOpt implements tcpip.Endpoint. func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { return e.net.GetSockOpt(opt) } func send4(s *stack.Stack, ctx *network.WriteContext, ident uint16, data *buffer.View, maxHeaderLength uint16) tcpip.Error { if data.Size() < header.ICMPv4MinimumSize { return &tcpip.ErrInvalidEndpointState{} } pkt := ctx.TryNewPacketBuffer(header.ICMPv4MinimumSize+int(maxHeaderLength), buffer.Buffer{}) if pkt == nil { return &tcpip.ErrWouldBlock{} } defer pkt.DecRef() icmpv4 := header.ICMPv4(pkt.TransportHeader().Push(header.ICMPv4MinimumSize)) pkt.TransportProtocolNumber = header.ICMPv4ProtocolNumber copy(icmpv4, data.AsSlice()) // Set the ident to the user-specified port. Sequence number should // already be set by the user. icmpv4.SetIdent(ident) data.TrimFront(header.ICMPv4MinimumSize) // Linux performs these basic checks. if icmpv4.Type() != header.ICMPv4Echo || icmpv4.Code() != 0 { return &tcpip.ErrInvalidEndpointState{} } icmpv4.SetChecksum(0) icmpv4.SetChecksum(^checksum.Checksum(icmpv4, checksum.Checksum(data.AsSlice(), 0))) pkt.Data().AppendView(data.Clone()) // Because this icmp endpoint is implemented in the transport layer, we can // only increment the 'stack-wide' stats but we can't increment the // 'per-NetworkEndpoint' stats. 
stats := s.Stats().ICMP.V4.PacketsSent if err := ctx.WritePacket(pkt, false /* headerIncluded */); err != nil { stats.Dropped.Increment() return err } stats.EchoRequest.Increment() return nil } func send6(s *stack.Stack, ctx *network.WriteContext, ident uint16, data *buffer.View, src, dst tcpip.Address, maxHeaderLength uint16) tcpip.Error { if data.Size() < header.ICMPv6EchoMinimumSize { return &tcpip.ErrInvalidEndpointState{} } pkt := ctx.TryNewPacketBuffer(header.ICMPv6MinimumSize+int(maxHeaderLength), buffer.Buffer{}) if pkt == nil { return &tcpip.ErrWouldBlock{} } defer pkt.DecRef() icmpv6 := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6MinimumSize)) pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber copy(icmpv6, data.AsSlice()) // Set the ident. Sequence number is provided by the user. icmpv6.SetIdent(ident) data.TrimFront(header.ICMPv6MinimumSize) if icmpv6.Type() != header.ICMPv6EchoRequest || icmpv6.Code() != 0 { return &tcpip.ErrInvalidEndpointState{} } pkt.Data().AppendView(data.Clone()) pktData := pkt.Data() icmpv6.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmpv6, Src: src, Dst: dst, PayloadCsum: pktData.Checksum(), PayloadLen: pktData.Size(), })) // Because this icmp endpoint is implemented in the transport layer, we can // only increment the 'stack-wide' stats but we can't increment the // 'per-NetworkEndpoint' stats. stats := s.Stats().ICMP.V6.PacketsSent if err := ctx.WritePacket(pkt, false /* headerIncluded */); err != nil { stats.Dropped.Increment() return err } stats.EchoRequest.Increment() return nil } // Disconnect implements tcpip.Endpoint.Disconnect. func (*endpoint) Disconnect() tcpip.Error { return &tcpip.ErrNotSupported{} } // Connect connects the endpoint to its peer. Specifying a NIC is optional. func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() err := e.net.ConnectAndThen(addr, func(netProto tcpip.NetworkProtocolNumber, previousID, nextID stack.TransportEndpointID) tcpip.Error { nextID.LocalPort = e.ident nextID, err := e.registerWithStack(netProto, nextID) if err != nil { return err } e.ident = nextID.LocalPort return nil }) if err != nil { return err } e.rcvMu.Lock() e.rcvReady = true e.rcvMu.Unlock() return nil } // ConnectEndpoint is not supported. func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error { return &tcpip.ErrInvalidEndpointState{} } // Shutdown closes the read and/or write end of the endpoint connection // to its peer. func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() switch state := e.net.State(); state { case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateClosed: return &tcpip.ErrNotConnected{} case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: default: panic(fmt.Sprintf("unhandled state = %s", state)) } if flags&tcpip.ShutdownWrite != 0 { if err := e.net.Shutdown(); err != nil { return err } } if flags&tcpip.ShutdownRead != 0 { e.rcvMu.Lock() wasClosed := e.rcvClosed e.rcvClosed = true e.rcvMu.Unlock() if !wasClosed { e.waiterQueue.Notify(waiter.ReadableEvents) } } return nil } // Listen is not supported by UDP, it just fails. func (*endpoint) Listen(int) tcpip.Error { return &tcpip.ErrNotSupported{} } // Accept is not supported by UDP, it just fails. 
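// Illustrative sketch (not part of the original source): building an echo
// request for a connected ICMPv4 endpoint. send4 above requires Type=Echo with
// Code=0 and overwrites the ident and checksum itself, so the caller only
// fills in the type, code, sequence number and payload. ep and payload are
// hypothetical.
//
//	hdr := make([]byte, header.ICMPv4MinimumSize)
//	echo := header.ICMPv4(hdr)
//	echo.SetType(header.ICMPv4Echo)
//	echo.SetCode(0)
//	echo.SetSequence(1)
//	var r bytes.Reader
//	r.Reset(append(hdr, payload...))
//	if _, err := ep.Write(&r, tcpip.WriteOptions{}); err != nil {
//		// *tcpip.ErrWouldBlock means the send buffer is full; wait for
//		// waiter.WritableEvents and retry.
//	}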
func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { return nil, nil, &tcpip.ErrNotSupported{} } func (e *endpoint) registerWithStack(netProto tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.Error) { bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) if id.LocalPort != 0 { // The endpoint already has a local port, just attempt to // register it. return id, e.stack.RegisterTransportEndpoint([]tcpip.NetworkProtocolNumber{netProto}, e.transProto, id, e, ports.Flags{}, bindToDevice) } // We need to find a port for the endpoint. _, err := e.stack.PickEphemeralPort(e.stack.SecureRNG(), func(p uint16) (bool, tcpip.Error) { id.LocalPort = p err := e.stack.RegisterTransportEndpoint([]tcpip.NetworkProtocolNumber{netProto}, e.transProto, id, e, ports.Flags{}, bindToDevice) switch err.(type) { case nil: return true, nil case *tcpip.ErrPortInUse: return false, nil default: return false, err } }) return id, err } func (e *endpoint) bindLocked(addr tcpip.FullAddress) tcpip.Error { // Don't allow binding once endpoint is not in the initial state // anymore. if e.net.State() != transport.DatagramEndpointStateInitial { return &tcpip.ErrInvalidEndpointState{} } err := e.net.BindAndThen(addr, func(boundNetProto tcpip.NetworkProtocolNumber, boundAddr tcpip.Address) tcpip.Error { id := stack.TransportEndpointID{ LocalPort: addr.Port, LocalAddress: addr.Addr, } id, err := e.registerWithStack(boundNetProto, id) if err != nil { return err } e.ident = id.LocalPort return nil }) if err != nil { return err } e.rcvMu.Lock() e.rcvReady = true e.rcvMu.Unlock() return nil } func (e *endpoint) isBroadcastOrMulticast(nicID tcpip.NICID, addr tcpip.Address) bool { return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr) || e.stack.IsSubnetBroadcast(nicID, e.net.NetProto(), addr) } // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. func (e *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { if addr.Addr.BitLen() != 0 && e.isBroadcastOrMulticast(addr.NIC, addr.Addr) { return &tcpip.ErrBadLocalAddress{} } e.mu.Lock() defer e.mu.Unlock() return e.bindLocked(addr) } // GetLocalAddress returns the address to which the endpoint is bound. func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() addr := e.net.GetLocalAddress() addr.Port = e.ident return addr, nil } // GetRemoteAddress returns the address to which the endpoint is connected. func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() if addr, connected := e.net.GetRemoteAddress(); connected { return addr, nil } return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } // Readiness returns the current readiness of the endpoint. For example, if // waiter.EventIn is set, the endpoint is immediately readable. func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { var result waiter.EventMask if e.net.HasSendSpace() { result |= waiter.WritableEvents & mask } // Determine if the endpoint is readable if requested. if (mask & waiter.ReadableEvents) != 0 { e.rcvMu.Lock() if !e.rcvList.Empty() || e.rcvClosed { result |= waiter.ReadableEvents } e.rcvMu.Unlock() } return result } // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) { // Only accept echo replies. 
switch e.net.NetProto() { case header.IPv4ProtocolNumber: h := header.ICMPv4(pkt.TransportHeader().Slice()) if len(h) < header.ICMPv4MinimumSize || h.Type() != header.ICMPv4EchoReply { e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.MalformedPacketsReceived.Increment() return } case header.IPv6ProtocolNumber: h := header.ICMPv6(pkt.TransportHeader().Slice()) if len(h) < header.ICMPv6MinimumSize || h.Type() != header.ICMPv6EchoReply { e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.MalformedPacketsReceived.Increment() return } } e.rcvMu.Lock() // Drop the packet if our buffer is currently full. if !e.rcvReady || e.rcvClosed { e.rcvMu.Unlock() e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.ClosedReceiver.Increment() return } rcvBufSize := e.ops.GetReceiveBufferSize() if e.frozen || e.rcvBufSize >= int(rcvBufSize) { e.rcvMu.Unlock() e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() return } wasEmpty := e.rcvBufSize == 0 net := pkt.Network() dstAddr := net.DestinationAddress() // Push new packet into receive list and increment the buffer size. packet := &icmpPacket{ senderAddress: tcpip.FullAddress{ NIC: pkt.NICID, Addr: id.RemoteAddress, }, packetInfo: tcpip.IPPacketInfo{ // Linux does not 'prepare' [1] in_pktinfo on socket buffers destined to // ping sockets (unlike UDP/RAW sockets). However the interface index [2] // and the Header Destination Address [3] are always filled. // [1] https://github.com/torvalds/linux/blob/dcb85f85fa6/net/ipv4/ip_sockglue.c#L1392 // [2] https://github.com/torvalds/linux/blob/dcb85f85fa6/net/ipv4/ip_input.c#L510 // [3] https://github.com/torvalds/linux/blob/dcb85f85fa6/net/ipv4/ip_sockglue.c#L60 NIC: pkt.NICID, DestinationAddr: dstAddr, }, } // Save any useful information from the network header to the packet. packet.tosOrTClass, _ = net.TOS() switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: packet.ttlOrHopLimit = header.IPv4(pkt.NetworkHeader().Slice()).TTL() case header.IPv6ProtocolNumber: packet.ttlOrHopLimit = header.IPv6(pkt.NetworkHeader().Slice()).HopLimit() } // ICMP socket's data includes ICMP header but no others. Trim all other // headers from the front of the packet. pktBuf := pkt.ToBuffer() pktBuf.TrimFront(int64(pkt.HeaderSize() - len(pkt.TransportHeader().Slice()))) packet.data = stack.NewPacketBuffer(stack.PacketBufferOptions{Payload: pktBuf}) e.rcvList.PushBack(packet) e.rcvBufSize += packet.data.Data().Size() packet.receivedAt = e.stack.Clock().Now() e.rcvMu.Unlock() e.stats.PacketsReceived.Increment() // Notify any waiters that there's data to be read now. if wasEmpty { e.waiterQueue.Notify(waiter.ReadableEvents) } } // HandleError implements stack.TransportEndpoint. func (*endpoint) HandleError(stack.TransportError, *stack.PacketBuffer) {} // State implements tcpip.Endpoint.State. The ICMP endpoint currently doesn't // expose internal socket state. func (e *endpoint) State() uint32 { return uint32(e.net.State()) } // Info returns a copy of the endpoint info. func (e *endpoint) Info() tcpip.EndpointInfo { e.mu.RLock() defer e.mu.RUnlock() ret := e.net.Info() ret.ID.LocalPort = e.ident return &ret } // Stats returns a pointer to the endpoint stats. func (e *endpoint) Stats() tcpip.EndpointStats { return &e.stats } // Wait implements stack.TransportEndpoint.Wait. func (*endpoint) Wait() {} // LastError implements tcpip.Endpoint.LastError. 
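// Illustrative sketch (not part of the original source): HandlePacket above
// drops datagrams once rcvBufSize reaches the configured receive buffer size,
// so callers expecting bursts of replies may raise the 32 KiB default that
// newEndpoint sets. ep is a hypothetical tcpip.Endpoint from this package.
//
//	ep.SocketOptions().SetReceiveBufferSize(256<<10, true /* notify */)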
func (*endpoint) LastError() tcpip.Error { return nil } // SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } // freeze prevents any more packets from being delivered to the endpoint. func (e *endpoint) freeze() { e.mu.Lock() e.frozen = true e.mu.Unlock() } // thaw unfreezes a previously frozen endpoint using endpoint.freeze() allows // new packets to be delivered again. func (e *endpoint) thaw() { e.mu.Lock() e.frozen = false e.mu.Unlock() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/icmp/endpoint_state.go000066400000000000000000000042311465435605700270760ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package icmp import ( "context" "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport" ) // saveReceivedAt is invoked by stateify. func (p *icmpPacket) saveReceivedAt() int64 { return p.receivedAt.UnixNano() } // loadReceivedAt is invoked by stateify. func (p *icmpPacket) loadReceivedAt(_ context.Context, nsec int64) { p.receivedAt = time.Unix(0, nsec) } // afterLoad is invoked by stateify. func (e *endpoint) afterLoad(ctx context.Context) { stack.RestoreStackFromContext(ctx).RegisterRestoredEndpoint(e) } // beforeSave is invoked by stateify. func (e *endpoint) beforeSave() { e.freeze() e.stack.RegisterResumableEndpoint(e) } // Restore implements tcpip.RestoredEndpoint.Restore. func (e *endpoint) Restore(s *stack.Stack) { e.thaw() e.net.Resume(s) e.stack = s e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) switch state := e.net.State(); state { case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateClosed: case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: var err tcpip.Error info := e.net.Info() info.ID.LocalPort = e.ident info.ID, err = e.registerWithStack(info.NetProto, info.ID) if err != nil { panic(fmt.Sprintf("e.registerWithStack(%d, %#v): %s", info.NetProto, info.ID, err)) } e.ident = info.ID.LocalPort default: panic(fmt.Sprintf("unhandled state = %s", state)) } } // Resume implements tcpip.ResumableEndpoint.Resume. func (e *endpoint) Resume() { e.thaw() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/icmp/icmp_packet_list.go000066400000000000000000000123461465435605700273760ustar00rootroot00000000000000package icmp // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type icmpPacketElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. 
// //go:nosplit func (icmpPacketElementMapper) linkerFor(elem *icmpPacket) *icmpPacket { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type icmpPacketList struct { head *icmpPacket tail *icmpPacket } // Reset resets list l to the empty state. func (l *icmpPacketList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *icmpPacketList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *icmpPacketList) Front() *icmpPacket { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *icmpPacketList) Back() *icmpPacket { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *icmpPacketList) Len() (count int) { for e := l.Front(); e != nil; e = (icmpPacketElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *icmpPacketList) PushFront(e *icmpPacket) { linker := icmpPacketElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { icmpPacketElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *icmpPacketList) PushFrontList(m *icmpPacketList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { icmpPacketElementMapper{}.linkerFor(l.head).SetPrev(m.tail) icmpPacketElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *icmpPacketList) PushBack(e *icmpPacket) { linker := icmpPacketElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { icmpPacketElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *icmpPacketList) PushBackList(m *icmpPacketList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { icmpPacketElementMapper{}.linkerFor(l.tail).SetNext(m.head) icmpPacketElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *icmpPacketList) InsertAfter(b, e *icmpPacket) { bLinker := icmpPacketElementMapper{}.linkerFor(b) eLinker := icmpPacketElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { icmpPacketElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *icmpPacketList) InsertBefore(a, e *icmpPacket) { aLinker := icmpPacketElementMapper{}.linkerFor(a) eLinker := icmpPacketElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { icmpPacketElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. 
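// Illustrative sketch (not part of the original source): draining an
// icmpPacketList, mirroring the loop in endpoint.Close earlier in the archive.
// Because the list is intrusive (the links live in the embedded
// icmpPacketEntry), Front and Remove are O(1) and allocate nothing.
//
//	for !l.Empty() {
//		p := l.Front()
//		l.Remove(p)
//		p.data.DecRef()
//	}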
// //go:nosplit func (l *icmpPacketList) Remove(e *icmpPacket) { linker := icmpPacketElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { icmpPacketElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { icmpPacketElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type icmpPacketEntry struct { next *icmpPacket prev *icmpPacket } // Next returns the entry that follows e in the list. // //go:nosplit func (e *icmpPacketEntry) Next() *icmpPacket { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *icmpPacketEntry) Prev() *icmpPacket { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *icmpPacketEntry) SetNext(elem *icmpPacket) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *icmpPacketEntry) SetPrev(elem *icmpPacket) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/icmp/icmp_state_autogen.go000066400000000000000000000117511465435605700277350ustar00rootroot00000000000000// automatically generated by stateify. package icmp import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *icmpPacket) StateTypeName() string { return "pkg/tcpip/transport/icmp.icmpPacket" } func (p *icmpPacket) StateFields() []string { return []string{ "icmpPacketEntry", "senderAddress", "packetInfo", "data", "receivedAt", "tosOrTClass", "ttlOrHopLimit", } } func (p *icmpPacket) beforeSave() {} // +checklocksignore func (p *icmpPacket) StateSave(stateSinkObject state.Sink) { p.beforeSave() var receivedAtValue int64 receivedAtValue = p.saveReceivedAt() stateSinkObject.SaveValue(4, receivedAtValue) stateSinkObject.Save(0, &p.icmpPacketEntry) stateSinkObject.Save(1, &p.senderAddress) stateSinkObject.Save(2, &p.packetInfo) stateSinkObject.Save(3, &p.data) stateSinkObject.Save(5, &p.tosOrTClass) stateSinkObject.Save(6, &p.ttlOrHopLimit) } func (p *icmpPacket) afterLoad(context.Context) {} // +checklocksignore func (p *icmpPacket) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.icmpPacketEntry) stateSourceObject.Load(1, &p.senderAddress) stateSourceObject.Load(2, &p.packetInfo) stateSourceObject.Load(3, &p.data) stateSourceObject.Load(5, &p.tosOrTClass) stateSourceObject.Load(6, &p.ttlOrHopLimit) stateSourceObject.LoadValue(4, new(int64), func(y any) { p.loadReceivedAt(ctx, y.(int64)) }) } func (e *endpoint) StateTypeName() string { return "pkg/tcpip/transport/icmp.endpoint" } func (e *endpoint) StateFields() []string { return []string{ "DefaultSocketOptionsHandler", "transProto", "waiterQueue", "net", "stats", "ops", "rcvReady", "rcvList", "rcvBufSize", "rcvClosed", "frozen", "ident", } } // +checklocksignore func (e *endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.DefaultSocketOptionsHandler) stateSinkObject.Save(1, &e.transProto) stateSinkObject.Save(2, &e.waiterQueue) stateSinkObject.Save(3, &e.net) stateSinkObject.Save(4, &e.stats) stateSinkObject.Save(5, &e.ops) stateSinkObject.Save(6, &e.rcvReady) stateSinkObject.Save(7, &e.rcvList) stateSinkObject.Save(8, &e.rcvBufSize) 
stateSinkObject.Save(9, &e.rcvClosed) stateSinkObject.Save(10, &e.frozen) stateSinkObject.Save(11, &e.ident) } // +checklocksignore func (e *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.DefaultSocketOptionsHandler) stateSourceObject.Load(1, &e.transProto) stateSourceObject.Load(2, &e.waiterQueue) stateSourceObject.Load(3, &e.net) stateSourceObject.Load(4, &e.stats) stateSourceObject.Load(5, &e.ops) stateSourceObject.Load(6, &e.rcvReady) stateSourceObject.Load(7, &e.rcvList) stateSourceObject.Load(8, &e.rcvBufSize) stateSourceObject.Load(9, &e.rcvClosed) stateSourceObject.Load(10, &e.frozen) stateSourceObject.Load(11, &e.ident) stateSourceObject.AfterLoad(func() { e.afterLoad(ctx) }) } func (l *icmpPacketList) StateTypeName() string { return "pkg/tcpip/transport/icmp.icmpPacketList" } func (l *icmpPacketList) StateFields() []string { return []string{ "head", "tail", } } func (l *icmpPacketList) beforeSave() {} // +checklocksignore func (l *icmpPacketList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *icmpPacketList) afterLoad(context.Context) {} // +checklocksignore func (l *icmpPacketList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *icmpPacketEntry) StateTypeName() string { return "pkg/tcpip/transport/icmp.icmpPacketEntry" } func (e *icmpPacketEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *icmpPacketEntry) beforeSave() {} // +checklocksignore func (e *icmpPacketEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *icmpPacketEntry) afterLoad(context.Context) {} // +checklocksignore func (e *icmpPacketEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (p *protocol) StateTypeName() string { return "pkg/tcpip/transport/icmp.protocol" } func (p *protocol) StateFields() []string { return []string{ "stack", "number", } } func (p *protocol) beforeSave() {} // +checklocksignore func (p *protocol) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.stack) stateSinkObject.Save(1, &p.number) } func (p *protocol) afterLoad(context.Context) {} // +checklocksignore func (p *protocol) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.stack) stateSourceObject.Load(1, &p.number) } func init() { state.Register((*icmpPacket)(nil)) state.Register((*endpoint)(nil)) state.Register((*icmpPacketList)(nil)) state.Register((*icmpPacketEntry)(nil)) state.Register((*protocol)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/icmp/protocol.go000066400000000000000000000112701465435605700257200ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. // Package icmp contains the implementation of the ICMP and IPv6-ICMP transport // protocols for use in ping. package icmp import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/waiter" ) const ( // ProtocolNumber4 is the ICMP protocol number. ProtocolNumber4 = header.ICMPv4ProtocolNumber // ProtocolNumber6 is the IPv6-ICMP protocol number. ProtocolNumber6 = header.ICMPv6ProtocolNumber ) // protocol implements stack.TransportProtocol. // // +stateify savable type protocol struct { stack *stack.Stack number tcpip.TransportProtocolNumber } // Number returns the ICMP protocol number. func (p *protocol) Number() tcpip.TransportProtocolNumber { return p.number } func (p *protocol) netProto() tcpip.NetworkProtocolNumber { switch p.number { case ProtocolNumber4: return header.IPv4ProtocolNumber case ProtocolNumber6: return header.IPv6ProtocolNumber } panic(fmt.Sprint("unknown protocol number: ", p.number)) } // NewEndpoint creates a new icmp endpoint. It implements // stack.TransportProtocol.NewEndpoint. func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { if netProto != p.netProto() { return nil, &tcpip.ErrUnknownProtocol{} } return newEndpoint(p.stack, netProto, p.number, waiterQueue) } // NewRawEndpoint creates a new raw icmp endpoint. It implements // stack.TransportProtocol.NewRawEndpoint. func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { if netProto != p.netProto() { return nil, &tcpip.ErrUnknownProtocol{} } return raw.NewEndpoint(p.stack, netProto, p.number, waiterQueue) } // MinimumPacketSize returns the minimum valid icmp packet size. func (p *protocol) MinimumPacketSize() int { switch p.number { case ProtocolNumber4: return header.ICMPv4MinimumSize case ProtocolNumber6: return header.ICMPv6MinimumSize } panic(fmt.Sprint("unknown protocol number: ", p.number)) } // ParsePorts in case of ICMP sets src to 0, dst to ICMP ID, and err to nil. func (p *protocol) ParsePorts(v []byte) (src, dst uint16, err tcpip.Error) { switch p.number { case ProtocolNumber4: hdr := header.ICMPv4(v) return 0, hdr.Ident(), nil case ProtocolNumber6: hdr := header.ICMPv6(v) return 0, hdr.Ident(), nil } panic(fmt.Sprint("unknown protocol number: ", p.number)) } // HandleUnknownDestinationPacket handles packets targeted at this protocol but // that don't match any existing endpoint. func (*protocol) HandleUnknownDestinationPacket(stack.TransportEndpointID, *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { return stack.UnknownDestinationPacketHandled } // SetOption implements stack.TransportProtocol.SetOption. func (*protocol) SetOption(tcpip.SettableTransportProtocolOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // Option implements stack.TransportProtocol.Option. func (*protocol) Option(tcpip.GettableTransportProtocolOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // Close implements stack.TransportProtocol.Close. func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} // Pause implements stack.TransportProtocol.Pause. func (*protocol) Pause() {} // Resume implements stack.TransportProtocol.Resume. 
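// Illustrative sketch (not part of the original source): wiring this package's
// ICMPv4 support into a stack and opening a ping-style endpoint, using the
// NewProtocol4 factory defined below. ipv4 refers to
// gvisor.dev/gvisor/pkg/tcpip/network/ipv4, and the stack.New/NewEndpoint
// signatures are assumed from the stack package.
//
//	s := stack.New(stack.Options{
//		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
//		TransportProtocols: []stack.TransportProtocolFactory{icmp.NewProtocol4},
//	})
//	var wq waiter.Queue
//	ep, err := s.NewEndpoint(icmp.ProtocolNumber4, header.IPv4ProtocolNumber, &wq)
//	if err != nil {
//		// Handle the *tcpip.Error.
//	}
//	_ = ep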
func (*protocol) Resume() {} // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { // Right now, the Parse() method is tied to enabled protocols passed into // stack.New. This works for UDP and TCP, but we handle ICMP traffic even // when netstack users don't pass ICMP as a supported protocol. return false } // NewProtocol4 returns an ICMPv4 transport protocol. func NewProtocol4(s *stack.Stack) stack.TransportProtocol { return &protocol{stack: s, number: ProtocolNumber4} } // NewProtocol6 returns an ICMPv6 transport protocol. func NewProtocol6(s *stack.Stack) stack.TransportProtocol { return &protocol{stack: s, number: ProtocolNumber6} } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/internal/000077500000000000000000000000001465435605700244135ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/internal/network/000077500000000000000000000000001465435605700261045ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/internal/network/endpoint.go000066400000000000000000000730131465435605700302570ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package network provides facilities to support tcpip.Endpoints that operate // at the network layer or above. package network import ( "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport" "gvisor.dev/gvisor/pkg/waiter" ) // Endpoint is a datagram-based endpoint. It only supports sending datagrams to // a peer. // // +stateify savable type Endpoint struct { // The following fields must only be set once then never changed. stack *stack.Stack `state:"manual"` ops *tcpip.SocketOptions netProto tcpip.NetworkProtocolNumber transProto tcpip.TransportProtocolNumber waiterQueue *waiter.Queue mu sync.RWMutex `state:"nosave"` // +checklocks:mu wasBound bool // owner is the owner of transmitted packets. // // +checklocks:mu owner tcpip.PacketOwner // +checklocks:mu writeShutdown bool // +checklocks:mu effectiveNetProto tcpip.NetworkProtocolNumber // +checklocks:mu connectedRoute *stack.Route `state:"manual"` // +checklocks:mu multicastMemberships map[multicastMembership]struct{} // +checklocks:mu ipv4TTL uint8 // +checklocks:mu ipv6HopLimit int16 // TODO(https://gvisor.dev/issue/6389): Use different fields for IPv4/IPv6. // +checklocks:mu multicastTTL uint8 // TODO(https://gvisor.dev/issue/6389): Use different fields for IPv4/IPv6. // +checklocks:mu multicastAddr tcpip.Address // TODO(https://gvisor.dev/issue/6389): Use different fields for IPv4/IPv6. // +checklocks:mu multicastNICID tcpip.NICID // +checklocks:mu ipv4TOS uint8 // +checklocks:mu ipv6TClass uint8 // Lock ordering: mu > infoMu. 
infoMu sync.RWMutex `state:"nosave"` // info has a dedicated mutex so that we can avoid lock ordering violations // when reading the endpoint's info. If we used mu, we need to guarantee // that any lock taken while mu is held is not held when calling Info() // which is not true as of writing (we hold mu while registering transport // endpoints (taking the transport demuxer lock but we also hold the demuxer // lock when delivering packets/errors to endpoints). // // Writes must be performed through setInfo. // // +checklocks:infoMu info stack.TransportEndpointInfo // state holds a transport.DatagramBasedEndpointState. // // state must be accessed with atomics so that we can avoid lock ordering // violations when reading the state. If we used mu, we need to guarantee // that any lock taken while mu is held is not held when calling State() // which is not true as of writing (we hold mu while registering transport // endpoints (taking the transport demuxer lock but we also hold the demuxer // lock when delivering packets/errors to endpoints). // // Writes must be performed through setEndpointState. state atomicbitops.Uint32 // Callers should not attempt to obtain sendBufferSizeInUseMu while holding // another lock on Endpoint. sendBufferSizeInUseMu sync.RWMutex `state:"nosave"` // sendBufferSizeInUse keeps track of the bytes in use by in-flight packets. // // +checklocks:sendBufferSizeInUseMu sendBufferSizeInUse int64 `state:"nosave"` } // +stateify savable type multicastMembership struct { nicID tcpip.NICID multicastAddr tcpip.Address } // Init initializes the endpoint. func (e *Endpoint) Init(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ops *tcpip.SocketOptions, waiterQueue *waiter.Queue) { e.mu.Lock() defer e.mu.Unlock() if e.multicastMemberships != nil { panic(fmt.Sprintf("endpoint is already initialized; got e.multicastMemberships = %#v, want = nil", e.multicastMemberships)) } switch netProto { case header.IPv4ProtocolNumber, header.IPv6ProtocolNumber: default: panic(fmt.Sprintf("invalid protocol number = %d", netProto)) } e.stack = s e.ops = ops e.netProto = netProto e.transProto = transProto e.waiterQueue = waiterQueue e.infoMu.Lock() e.info = stack.TransportEndpointInfo{ NetProto: netProto, TransProto: transProto, } e.infoMu.Unlock() e.effectiveNetProto = netProto e.ipv4TTL = tcpip.UseDefaultIPv4TTL e.ipv6HopLimit = tcpip.UseDefaultIPv6HopLimit // Linux defaults to TTL=1. e.multicastTTL = 1 e.multicastMemberships = make(map[multicastMembership]struct{}) e.setEndpointState(transport.DatagramEndpointStateInitial) } // NetProto returns the network protocol the endpoint was initialized with. func (e *Endpoint) NetProto() tcpip.NetworkProtocolNumber { return e.netProto } // setEndpointState sets the state of the endpoint. // // e.mu must be held to synchronize changes to state with the rest of the // endpoint. // // +checklocks:e.mu func (e *Endpoint) setEndpointState(state transport.DatagramEndpointState) { e.state.Store(uint32(state)) } // State returns the state of the endpoint. func (e *Endpoint) State() transport.DatagramEndpointState { return transport.DatagramEndpointState(e.state.Load()) } // Close cleans the endpoint's resources and leaves the endpoint in a closed // state. 
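// Illustrative sketch (not part of the original source): how a transport
// endpoint typically embeds and initializes this network.Endpoint, mirroring
// the icmp endpoint's newEndpoint earlier in the archive. myEndpoint and
// myTransportNumber are hypothetical.
//
//	type myEndpoint struct {
//		tcpip.DefaultSocketOptionsHandler
//		ops tcpip.SocketOptions
//		net network.Endpoint
//	}
//
//	func newMyEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, wq *waiter.Queue) *myEndpoint {
//		e := &myEndpoint{}
//		e.ops.InitHandler(e, s, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits)
//		e.net.Init(s, netProto, myTransportNumber, &e.ops, wq)
//		return e
//	}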
func (e *Endpoint) Close() { e.mu.Lock() defer e.mu.Unlock() if e.State() == transport.DatagramEndpointStateClosed { return } for mem := range e.multicastMemberships { e.stack.LeaveGroup(e.netProto, mem.nicID, mem.multicastAddr) } e.multicastMemberships = nil if e.connectedRoute != nil { e.connectedRoute.Release() e.connectedRoute = nil } e.setEndpointState(transport.DatagramEndpointStateClosed) } // SetOwner sets the owner of transmitted packets. func (e *Endpoint) SetOwner(owner tcpip.PacketOwner) { e.mu.Lock() defer e.mu.Unlock() e.owner = owner } // +checklocksread:e.mu func (e *Endpoint) calculateTTL(route *stack.Route) uint8 { remoteAddress := route.RemoteAddress() if header.IsV4MulticastAddress(remoteAddress) || header.IsV6MulticastAddress(remoteAddress) { return e.multicastTTL } switch netProto := route.NetProto(); netProto { case header.IPv4ProtocolNumber: if e.ipv4TTL == 0 { return route.DefaultTTL() } return e.ipv4TTL case header.IPv6ProtocolNumber: if e.ipv6HopLimit == -1 { return route.DefaultTTL() } return uint8(e.ipv6HopLimit) default: panic(fmt.Sprintf("invalid protocol number = %d", netProto)) } } // WriteContext holds the context for a write. type WriteContext struct { e *Endpoint route *stack.Route ttl uint8 tos uint8 } func (c *WriteContext) MTU() uint32 { return c.route.MTU() } // Release releases held resources. func (c *WriteContext) Release() { c.route.Release() *c = WriteContext{} } // WritePacketInfo is the properties of a packet that may be written. type WritePacketInfo struct { NetProto tcpip.NetworkProtocolNumber LocalAddress, RemoteAddress tcpip.Address MaxHeaderLength uint16 RequiresTXTransportChecksum bool } // PacketInfo returns the properties of a packet that will be written. func (c *WriteContext) PacketInfo() WritePacketInfo { return WritePacketInfo{ NetProto: c.route.NetProto(), LocalAddress: c.route.LocalAddress(), RemoteAddress: c.route.RemoteAddress(), MaxHeaderLength: c.route.MaxHeaderLength(), RequiresTXTransportChecksum: c.route.RequiresTXTransportChecksum(), } } // TryNewPacketBuffer returns a new packet buffer iff the endpoint's send buffer // is not full. // // If this method returns nil, the caller should wait for the endpoint to become // writable. func (c *WriteContext) TryNewPacketBuffer(reserveHdrBytes int, data buffer.Buffer) *stack.PacketBuffer { e := c.e e.sendBufferSizeInUseMu.Lock() defer e.sendBufferSizeInUseMu.Unlock() if !e.hasSendSpaceRLocked() { return nil } return c.newPacketBufferLocked(reserveHdrBytes, data) } // TryNewPacketBufferFromPayloader returns a new packet buffer iff the endpoint's send buffer // is not full. Otherwise, data from `payloader` isn't read. // // If this method returns nil, the caller should wait for the endpoint to become // writable. func (c *WriteContext) TryNewPacketBufferFromPayloader(reserveHdrBytes int, payloader tcpip.Payloader) *stack.PacketBuffer { e := c.e e.sendBufferSizeInUseMu.Lock() defer e.sendBufferSizeInUseMu.Unlock() if !e.hasSendSpaceRLocked() { return nil } var data buffer.Buffer if _, err := data.WriteFromReader(payloader, int64(payloader.Len())); err != nil { data.Release() return nil } return c.newPacketBufferLocked(reserveHdrBytes, data) } // +checklocks:c.e.sendBufferSizeInUseMu func (c *WriteContext) newPacketBufferLocked(reserveHdrBytes int, data buffer.Buffer) *stack.PacketBuffer { e := c.e // Note that we allow oversubscription - if there is any space at all in the // send buffer, we accept the full packet which may be larger than the space // available. 
This is because if the endpoint reports that it is writable, // a write operation should succeed. // // This matches Linux behaviour: // https://github.com/torvalds/linux/blob/38d741cb70b/include/net/sock.h#L2519 // https://github.com/torvalds/linux/blob/38d741cb70b/net/core/sock.c#L2588 pktSize := int64(reserveHdrBytes) + int64(data.Size()) e.sendBufferSizeInUse += pktSize return stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: reserveHdrBytes, Payload: data, OnRelease: func() { e.sendBufferSizeInUseMu.Lock() if got := e.sendBufferSizeInUse; got < pktSize { e.sendBufferSizeInUseMu.Unlock() panic(fmt.Sprintf("e.sendBufferSizeInUse=(%d) < pktSize(=%d)", got, pktSize)) } e.sendBufferSizeInUse -= pktSize signal := e.hasSendSpaceRLocked() e.sendBufferSizeInUseMu.Unlock() // Let waiters know if we now have space in the send buffer. if signal { e.waiterQueue.Notify(waiter.WritableEvents) } }, }) } // WritePacket attempts to write the packet. func (c *WriteContext) WritePacket(pkt *stack.PacketBuffer, headerIncluded bool) tcpip.Error { c.e.mu.RLock() pkt.Owner = c.e.owner c.e.mu.RUnlock() if headerIncluded { return c.route.WriteHeaderIncludedPacket(pkt) } err := c.route.WritePacket(stack.NetworkHeaderParams{ Protocol: c.e.transProto, TTL: c.ttl, TOS: c.tos, }, pkt) if _, ok := err.(*tcpip.ErrNoBufferSpace); ok { var recvErr bool switch netProto := c.route.NetProto(); netProto { case header.IPv4ProtocolNumber: recvErr = c.e.ops.GetIPv4RecvError() case header.IPv6ProtocolNumber: recvErr = c.e.ops.GetIPv6RecvError() default: panic(fmt.Sprintf("unhandled network protocol number = %d", netProto)) } // Linux only returns ENOBUFS to the caller if IP{,V6}_RECVERR is set. // // https://github.com/torvalds/linux/blob/3e71713c9e75c/net/ipv4/udp.c#L969 // https://github.com/torvalds/linux/blob/3e71713c9e75c/net/ipv6/udp.c#L1260 if !recvErr { err = nil } } return err } // MaybeSignalWritable signals waiters with writable events if the send buffer // has space. func (e *Endpoint) MaybeSignalWritable() { e.sendBufferSizeInUseMu.RLock() signal := e.hasSendSpaceRLocked() e.sendBufferSizeInUseMu.RUnlock() if signal { e.waiterQueue.Notify(waiter.WritableEvents) } } // HasSendSpace returns whether or not the send buffer has space. func (e *Endpoint) HasSendSpace() bool { e.sendBufferSizeInUseMu.RLock() defer e.sendBufferSizeInUseMu.RUnlock() return e.hasSendSpaceRLocked() } // +checklocksread:e.sendBufferSizeInUseMu func (e *Endpoint) hasSendSpaceRLocked() bool { return e.ops.GetSendBufferSize() > e.sendBufferSizeInUse } // AcquireContextForWrite acquires a WriteContext. func (e *Endpoint) AcquireContextForWrite(opts tcpip.WriteOptions) (WriteContext, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() // MSG_MORE is unimplemented. This also means that MSG_EOR is a no-op. if opts.More { return WriteContext{}, &tcpip.ErrInvalidOptionValue{} } if e.State() == transport.DatagramEndpointStateClosed { return WriteContext{}, &tcpip.ErrInvalidEndpointState{} } if e.writeShutdown { return WriteContext{}, &tcpip.ErrClosedForSend{} } ipv6PktInfoValid := e.effectiveNetProto == header.IPv6ProtocolNumber && opts.ControlMessages.HasIPv6PacketInfo route := e.connectedRoute to := opts.To info := e.Info() switch { case to == nil: // If the user doesn't specify a destination, they should have // connected to another address. 
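		// (On Linux this is the EDESTADDRREQ case: sending without a prior
		// connect and without a destination address.)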
if e.State() != transport.DatagramEndpointStateConnected { return WriteContext{}, &tcpip.ErrDestinationRequired{} } if !ipv6PktInfoValid { route.Acquire() break } // We are connected and the caller did not specify the destination but // we have an IPv6 packet info structure which may change our local // interface/address used to send the packet so we need to construct // a new route instead of using the connected route. // // Construct a destination matching the remote the endpoint is connected // to. to = &tcpip.FullAddress{ // RegisterNICID is set when the endpoint is connected. It is usually // only set for link-local addresses or multicast addresses if the // multicast interface was specified (see e.multicastNICID, // e.connectRouteRLocked and e.ConnectAndThen). NIC: info.RegisterNICID, Addr: info.ID.RemoteAddress, } fallthrough default: // Reject destination address if it goes through a different // NIC than the endpoint was bound to. nicID := to.NIC if nicID == 0 { nicID = tcpip.NICID(e.ops.GetBindToDevice()) } var localAddr tcpip.Address if ipv6PktInfoValid { // Uphold strong-host semantics since (as of writing) the stack follows // the strong host model. pktInfoNICID := opts.ControlMessages.IPv6PacketInfo.NIC pktInfoAddr := opts.ControlMessages.IPv6PacketInfo.Addr if pktInfoNICID != 0 { // If we are bound to an interface or specified the destination // interface (usually when using link-local addresses), make sure the // interface matches the specified local interface. if nicID != 0 && nicID != pktInfoNICID { return WriteContext{}, &tcpip.ErrHostUnreachable{} } // If a local address is not specified, then we need to make sure the // bound address belongs to the specified local interface. if pktInfoAddr.BitLen() == 0 { // If the bound interface is different from the specified local // interface, the bound address obviously does not belong to the // specified local interface. // // The bound interface is usually only set for link-local addresses. if info.BindNICID != 0 && info.BindNICID != pktInfoNICID { return WriteContext{}, &tcpip.ErrHostUnreachable{} } if info.ID.LocalAddress.BitLen() != 0 && e.stack.CheckLocalAddress(pktInfoNICID, header.IPv6ProtocolNumber, info.ID.LocalAddress) == 0 { return WriteContext{}, &tcpip.ErrBadLocalAddress{} } } nicID = pktInfoNICID } if pktInfoAddr.BitLen() != 0 { // The local address must belong to the stack. If an outgoing interface // is specified as a result of binding the endpoint to a device, or // specifying the outgoing interface in the destination address/pkt info // structure, the address must belong to that interface. 
if e.stack.CheckLocalAddress(nicID, header.IPv6ProtocolNumber, pktInfoAddr) == 0 { return WriteContext{}, &tcpip.ErrBadLocalAddress{} } localAddr = pktInfoAddr } } else { if info.BindNICID != 0 { if nicID != 0 && nicID != info.BindNICID { return WriteContext{}, &tcpip.ErrHostUnreachable{} } nicID = info.BindNICID } if nicID == 0 { nicID = info.RegisterNICID } } dst, netProto, err := e.checkV4Mapped(*to, false /* bind */) if err != nil { return WriteContext{}, err } route, _, err = e.connectRouteRLocked(nicID, localAddr, dst, netProto) if err != nil { return WriteContext{}, err } } if !e.ops.GetBroadcast() && route.IsOutboundBroadcast() { route.Release() return WriteContext{}, &tcpip.ErrBroadcastDisabled{} } var tos uint8 var ttl uint8 switch netProto := route.NetProto(); netProto { case header.IPv4ProtocolNumber: tos = e.ipv4TOS if opts.ControlMessages.HasTTL { ttl = opts.ControlMessages.TTL } else { ttl = e.calculateTTL(route) } case header.IPv6ProtocolNumber: tos = e.ipv6TClass if opts.ControlMessages.HasHopLimit { ttl = opts.ControlMessages.HopLimit } else { ttl = e.calculateTTL(route) } default: panic(fmt.Sprintf("invalid protocol number = %d", netProto)) } return WriteContext{ e: e, route: route, ttl: ttl, tos: tos, }, nil } // Disconnect disconnects the endpoint from its peer. func (e *Endpoint) Disconnect() { e.mu.Lock() defer e.mu.Unlock() if e.State() != transport.DatagramEndpointStateConnected { return } info := e.Info() // Exclude ephemerally bound endpoints. if e.wasBound { info.ID = stack.TransportEndpointID{ LocalAddress: info.BindAddr, } e.setEndpointState(transport.DatagramEndpointStateBound) } else { info.ID = stack.TransportEndpointID{} e.setEndpointState(transport.DatagramEndpointStateInitial) } e.setInfo(info) e.connectedRoute.Release() e.connectedRoute = nil } // connectRouteRLocked establishes a route to the specified interface or the // configured multicast interface if no interface is specified and the // specified address is a multicast address. // // +checklocksread:e.mu func (e *Endpoint) connectRouteRLocked(nicID tcpip.NICID, localAddr tcpip.Address, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (*stack.Route, tcpip.NICID, tcpip.Error) { if localAddr.BitLen() == 0 { localAddr = e.Info().ID.LocalAddress if e.isBroadcastOrMulticast(nicID, netProto, localAddr) { // A packet can only originate from a unicast address (i.e., an interface). localAddr = tcpip.Address{} } if header.IsV4MulticastAddress(addr.Addr) || header.IsV6MulticastAddress(addr.Addr) { if nicID == 0 { nicID = e.multicastNICID } if localAddr == (tcpip.Address{}) && nicID == 0 { localAddr = e.multicastAddr } } } // Find a route to the desired destination. r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.ops.GetMulticastLoop()) if err != nil { return nil, 0, err } return r, nicID, nil } // Connect connects the endpoint to the address. func (e *Endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { return e.ConnectAndThen(addr, func(_ tcpip.NetworkProtocolNumber, _, _ stack.TransportEndpointID) tcpip.Error { return nil }) } // ConnectAndThen connects the endpoint to the address and then calls the // provided function. // // If the function returns an error, the endpoint's state does not change. The // function will be called with the network protocol used to connect to the peer // and the source and destination addresses that will be used to send traffic to // the peer. 
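//
// Sketch of a typical caller: a transport endpoint built on top of this type
// can swap its demultiplexer registration from previousID to nextID inside
// the callback so that the re-registration commits atomically with the
// connect:
//
//	err := e.net.ConnectAndThen(addr, func(netProto tcpip.NetworkProtocolNumber, previousID, nextID stack.TransportEndpointID) tcpip.Error {
//		// Re-register the endpoint under nextID here; returning an error
//		// leaves the connection state untouched.
//		return nil
//	})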
func (e *Endpoint) ConnectAndThen(addr tcpip.FullAddress, f func(netProto tcpip.NetworkProtocolNumber, previousID, nextID stack.TransportEndpointID) tcpip.Error) tcpip.Error { addr.Port = 0 e.mu.Lock() defer e.mu.Unlock() info := e.Info() nicID := addr.NIC switch e.State() { case transport.DatagramEndpointStateInitial: case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: if info.BindNICID == 0 { break } if nicID != 0 && nicID != info.BindNICID { return &tcpip.ErrInvalidEndpointState{} } nicID = info.BindNICID default: return &tcpip.ErrInvalidEndpointState{} } addr, netProto, err := e.checkV4Mapped(addr, false /* bind */) if err != nil { return err } r, nicID, err := e.connectRouteRLocked(nicID, tcpip.Address{}, addr, netProto) if err != nil { return err } id := stack.TransportEndpointID{ LocalAddress: info.ID.LocalAddress, RemoteAddress: r.RemoteAddress(), } if e.State() == transport.DatagramEndpointStateInitial { id.LocalAddress = r.LocalAddress() } if err := f(r.NetProto(), info.ID, id); err != nil { r.Release() return err } if e.connectedRoute != nil { // If the endpoint was previously connected then release any previous route. e.connectedRoute.Release() } e.connectedRoute = r info.ID = id info.RegisterNICID = nicID e.setInfo(info) e.effectiveNetProto = netProto e.setEndpointState(transport.DatagramEndpointStateConnected) return nil } // Shutdown shutsdown the endpoint. func (e *Endpoint) Shutdown() tcpip.Error { e.mu.Lock() defer e.mu.Unlock() switch state := e.State(); state { case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateClosed: return &tcpip.ErrNotConnected{} case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: e.writeShutdown = true return nil default: panic(fmt.Sprintf("unhandled state = %s", state)) } } // checkV4MappedRLocked determines the effective network protocol and converts // addr to its canonical form. func (e *Endpoint) checkV4Mapped(addr tcpip.FullAddress, bind bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { info := e.Info() unwrapped, netProto, err := info.AddrNetProtoLocked(addr, e.ops.GetV6Only(), bind) if err != nil { return tcpip.FullAddress{}, 0, err } return unwrapped, netProto, nil } func (e *Endpoint) isBroadcastOrMulticast(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr) || e.stack.IsSubnetBroadcast(nicID, netProto, addr) } // Bind binds the endpoint to the address. func (e *Endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { return e.BindAndThen(addr, func(tcpip.NetworkProtocolNumber, tcpip.Address) tcpip.Error { return nil }) } // BindAndThen binds the endpoint to the address and then calls the provided // function. // // If the function returns an error, the endpoint's state does not change. The // function will be called with the bound network protocol and address. func (e *Endpoint) BindAndThen(addr tcpip.FullAddress, f func(tcpip.NetworkProtocolNumber, tcpip.Address) tcpip.Error) tcpip.Error { addr.Port = 0 e.mu.Lock() defer e.mu.Unlock() // Don't allow binding once endpoint is not in the initial state // anymore. 
if e.State() != transport.DatagramEndpointStateInitial { return &tcpip.ErrInvalidEndpointState{} } addr, netProto, err := e.checkV4Mapped(addr, true /* bind */) if err != nil { return err } nicID := addr.NIC if addr.Addr.BitLen() != 0 && !e.isBroadcastOrMulticast(addr.NIC, netProto, addr.Addr) { nicID = e.stack.CheckLocalAddress(nicID, netProto, addr.Addr) if nicID == 0 { return &tcpip.ErrBadLocalAddress{} } } if err := f(netProto, addr.Addr); err != nil { return err } e.wasBound = true info := e.Info() info.ID = stack.TransportEndpointID{ LocalAddress: addr.Addr, } info.BindNICID = addr.NIC info.RegisterNICID = nicID info.BindAddr = addr.Addr e.setInfo(info) e.effectiveNetProto = netProto e.setEndpointState(transport.DatagramEndpointStateBound) return nil } // WasBound returns true iff the endpoint was ever bound. func (e *Endpoint) WasBound() bool { e.mu.RLock() defer e.mu.RUnlock() return e.wasBound } // GetLocalAddress returns the address that the endpoint is bound to. func (e *Endpoint) GetLocalAddress() tcpip.FullAddress { e.mu.RLock() defer e.mu.RUnlock() info := e.Info() addr := info.BindAddr if e.State() == transport.DatagramEndpointStateConnected { addr = e.connectedRoute.LocalAddress() } return tcpip.FullAddress{ NIC: info.RegisterNICID, Addr: addr, } } // GetRemoteAddress returns the address that the endpoint is connected to. func (e *Endpoint) GetRemoteAddress() (tcpip.FullAddress, bool) { e.mu.RLock() defer e.mu.RUnlock() if e.State() != transport.DatagramEndpointStateConnected { return tcpip.FullAddress{}, false } return tcpip.FullAddress{ Addr: e.connectedRoute.RemoteAddress(), NIC: e.Info().RegisterNICID, }, true } // SetSockOptInt sets the socket option. func (e *Endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { switch opt { case tcpip.MTUDiscoverOption: // Return not supported if the value is not disabling path // MTU discovery. if tcpip.PMTUDStrategy(v) != tcpip.PMTUDiscoveryDont { return &tcpip.ErrNotSupported{} } case tcpip.MulticastTTLOption: e.mu.Lock() e.multicastTTL = uint8(v) e.mu.Unlock() case tcpip.IPv4TTLOption: e.mu.Lock() e.ipv4TTL = uint8(v) e.mu.Unlock() case tcpip.IPv6HopLimitOption: e.mu.Lock() e.ipv6HopLimit = int16(v) e.mu.Unlock() case tcpip.IPv4TOSOption: e.mu.Lock() e.ipv4TOS = uint8(v) e.mu.Unlock() case tcpip.IPv6TrafficClassOption: e.mu.Lock() e.ipv6TClass = uint8(v) e.mu.Unlock() } return nil } // GetSockOptInt returns the socket option. func (e *Endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.MTUDiscoverOption: // The only supported setting is path MTU discovery disabled. return int(tcpip.PMTUDiscoveryDont), nil case tcpip.MulticastTTLOption: e.mu.Lock() v := int(e.multicastTTL) e.mu.Unlock() return v, nil case tcpip.IPv4TTLOption: e.mu.Lock() v := int(e.ipv4TTL) e.mu.Unlock() return v, nil case tcpip.IPv6HopLimitOption: e.mu.Lock() v := int(e.ipv6HopLimit) e.mu.Unlock() return v, nil case tcpip.IPv4TOSOption: e.mu.RLock() v := int(e.ipv4TOS) e.mu.RUnlock() return v, nil case tcpip.IPv6TrafficClassOption: e.mu.RLock() v := int(e.ipv6TClass) e.mu.RUnlock() return v, nil default: return -1, &tcpip.ErrUnknownProtocolOption{} } } // SetSockOpt sets the socket option. 
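//
// For example, joining a multicast group (sketch; nicID, ifAddr and
// groupAddr are placeholders):
//
//	err := e.SetSockOpt(&tcpip.AddMembershipOption{
//		NIC:           nicID,
//		InterfaceAddr: ifAddr,
//		MulticastAddr: groupAddr,
//	})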
func (e *Endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { switch v := opt.(type) { case *tcpip.MulticastInterfaceOption: e.mu.Lock() defer e.mu.Unlock() fa := tcpip.FullAddress{Addr: v.InterfaceAddr} fa, netProto, err := e.checkV4Mapped(fa, true /* bind */) if err != nil { return err } nic := v.NIC addr := fa.Addr if nic == 0 && addr == (tcpip.Address{}) { e.multicastAddr = tcpip.Address{} e.multicastNICID = 0 break } if nic != 0 { if !e.stack.CheckNIC(nic) { return &tcpip.ErrBadLocalAddress{} } } else { nic = e.stack.CheckLocalAddress(0, netProto, addr) if nic == 0 { return &tcpip.ErrBadLocalAddress{} } } if info := e.Info(); info.BindNICID != 0 && info.BindNICID != nic { return &tcpip.ErrInvalidEndpointState{} } e.multicastNICID = nic e.multicastAddr = addr case *tcpip.AddMembershipOption: if !(header.IsV4MulticastAddress(v.MulticastAddr) && e.netProto == header.IPv4ProtocolNumber) && !(header.IsV6MulticastAddress(v.MulticastAddr) && e.netProto == header.IPv6ProtocolNumber) { return &tcpip.ErrInvalidOptionValue{} } nicID := v.NIC if v.InterfaceAddr.Unspecified() { if nicID == 0 { if r, err := e.stack.FindRoute(0, tcpip.Address{}, v.MulticastAddr, e.netProto, false /* multicastLoop */); err == nil { nicID = r.NICID() r.Release() } } } else { nicID = e.stack.CheckLocalAddress(nicID, e.netProto, v.InterfaceAddr) } if nicID == 0 { return &tcpip.ErrUnknownDevice{} } memToInsert := multicastMembership{nicID: nicID, multicastAddr: v.MulticastAddr} e.mu.Lock() defer e.mu.Unlock() if _, ok := e.multicastMemberships[memToInsert]; ok { return &tcpip.ErrPortInUse{} } if err := e.stack.JoinGroup(e.netProto, nicID, v.MulticastAddr); err != nil { return err } e.multicastMemberships[memToInsert] = struct{}{} case *tcpip.RemoveMembershipOption: if !(header.IsV4MulticastAddress(v.MulticastAddr) && e.netProto == header.IPv4ProtocolNumber) && !(header.IsV6MulticastAddress(v.MulticastAddr) && e.netProto == header.IPv6ProtocolNumber) { return &tcpip.ErrInvalidOptionValue{} } nicID := v.NIC if v.InterfaceAddr.Unspecified() { if nicID == 0 { if r, err := e.stack.FindRoute(0, tcpip.Address{}, v.MulticastAddr, e.netProto, false /* multicastLoop */); err == nil { nicID = r.NICID() r.Release() } } } else { nicID = e.stack.CheckLocalAddress(nicID, e.netProto, v.InterfaceAddr) } if nicID == 0 { return &tcpip.ErrUnknownDevice{} } memToRemove := multicastMembership{nicID: nicID, multicastAddr: v.MulticastAddr} e.mu.Lock() defer e.mu.Unlock() if _, ok := e.multicastMemberships[memToRemove]; !ok { return &tcpip.ErrBadLocalAddress{} } if err := e.stack.LeaveGroup(e.netProto, nicID, v.MulticastAddr); err != nil { return err } delete(e.multicastMemberships, memToRemove) case *tcpip.SocketDetachFilterOption: return nil } return nil } // GetSockOpt returns the socket option. func (e *Endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { switch o := opt.(type) { case *tcpip.MulticastInterfaceOption: e.mu.Lock() *o = tcpip.MulticastInterfaceOption{ NIC: e.multicastNICID, InterfaceAddr: e.multicastAddr, } e.mu.Unlock() default: return &tcpip.ErrUnknownProtocolOption{} } return nil } // Info returns a copy of the endpoint info. func (e *Endpoint) Info() stack.TransportEndpointInfo { e.infoMu.RLock() defer e.infoMu.RUnlock() return e.info } // setInfo sets the endpoint's info. // // e.mu must be held to synchronize changes to info with the rest of the // endpoint. 
// // +checklocks:e.mu func (e *Endpoint) setInfo(info stack.TransportEndpointInfo) { e.infoMu.Lock() defer e.infoMu.Unlock() e.info = info } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/internal/network/endpoint_state.go000066400000000000000000000042521465435605700314560ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package network import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport" ) // Resume implements tcpip.ResumableEndpoint.Resume. func (e *Endpoint) Resume(s *stack.Stack) { e.mu.Lock() defer e.mu.Unlock() e.stack = s for m := range e.multicastMemberships { if err := e.stack.JoinGroup(e.netProto, m.nicID, m.multicastAddr); err != nil { panic(fmt.Sprintf("e.stack.JoinGroup(%d, %d, %s): %s", e.netProto, m.nicID, m.multicastAddr, err)) } } info := e.Info() switch state := e.State(); state { case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateClosed: case transport.DatagramEndpointStateBound: if info.ID.LocalAddress.BitLen() != 0 && !e.isBroadcastOrMulticast(info.RegisterNICID, e.effectiveNetProto, info.ID.LocalAddress) { if e.stack.CheckLocalAddress(info.RegisterNICID, e.effectiveNetProto, info.ID.LocalAddress) == 0 { panic(fmt.Sprintf("got e.stack.CheckLocalAddress(%d, %d, %s) = 0, want != 0", info.RegisterNICID, e.effectiveNetProto, info.ID.LocalAddress)) } } case transport.DatagramEndpointStateConnected: var err tcpip.Error multicastLoop := e.ops.GetMulticastLoop() e.connectedRoute, err = e.stack.FindRoute(info.RegisterNICID, info.ID.LocalAddress, info.ID.RemoteAddress, e.effectiveNetProto, multicastLoop) if err != nil { panic(fmt.Sprintf("e.stack.FindRoute(%d, %s, %s, %d, %t): %s", info.RegisterNICID, info.ID.LocalAddress, info.ID.RemoteAddress, e.effectiveNetProto, multicastLoop, err)) } default: panic(fmt.Sprintf("unhandled state = %s", state)) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/internal/network/network_state_autogen.go000066400000000000000000000061541465435605700330540ustar00rootroot00000000000000// automatically generated by stateify. 
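//
// It registers the save/restore metadata for Endpoint and
// multicastMembership with the state package so both types can be
// checkpointed and restored.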
package network import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *Endpoint) StateTypeName() string { return "pkg/tcpip/transport/internal/network.Endpoint" } func (e *Endpoint) StateFields() []string { return []string{ "ops", "netProto", "transProto", "waiterQueue", "wasBound", "owner", "writeShutdown", "effectiveNetProto", "multicastMemberships", "ipv4TTL", "ipv6HopLimit", "multicastTTL", "multicastAddr", "multicastNICID", "ipv4TOS", "ipv6TClass", "info", "state", } } func (e *Endpoint) beforeSave() {} // +checklocksignore func (e *Endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.ops) stateSinkObject.Save(1, &e.netProto) stateSinkObject.Save(2, &e.transProto) stateSinkObject.Save(3, &e.waiterQueue) stateSinkObject.Save(4, &e.wasBound) stateSinkObject.Save(5, &e.owner) stateSinkObject.Save(6, &e.writeShutdown) stateSinkObject.Save(7, &e.effectiveNetProto) stateSinkObject.Save(8, &e.multicastMemberships) stateSinkObject.Save(9, &e.ipv4TTL) stateSinkObject.Save(10, &e.ipv6HopLimit) stateSinkObject.Save(11, &e.multicastTTL) stateSinkObject.Save(12, &e.multicastAddr) stateSinkObject.Save(13, &e.multicastNICID) stateSinkObject.Save(14, &e.ipv4TOS) stateSinkObject.Save(15, &e.ipv6TClass) stateSinkObject.Save(16, &e.info) stateSinkObject.Save(17, &e.state) } func (e *Endpoint) afterLoad(context.Context) {} // +checklocksignore func (e *Endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.ops) stateSourceObject.Load(1, &e.netProto) stateSourceObject.Load(2, &e.transProto) stateSourceObject.Load(3, &e.waiterQueue) stateSourceObject.Load(4, &e.wasBound) stateSourceObject.Load(5, &e.owner) stateSourceObject.Load(6, &e.writeShutdown) stateSourceObject.Load(7, &e.effectiveNetProto) stateSourceObject.Load(8, &e.multicastMemberships) stateSourceObject.Load(9, &e.ipv4TTL) stateSourceObject.Load(10, &e.ipv6HopLimit) stateSourceObject.Load(11, &e.multicastTTL) stateSourceObject.Load(12, &e.multicastAddr) stateSourceObject.Load(13, &e.multicastNICID) stateSourceObject.Load(14, &e.ipv4TOS) stateSourceObject.Load(15, &e.ipv6TClass) stateSourceObject.Load(16, &e.info) stateSourceObject.Load(17, &e.state) } func (m *multicastMembership) StateTypeName() string { return "pkg/tcpip/transport/internal/network.multicastMembership" } func (m *multicastMembership) StateFields() []string { return []string{ "nicID", "multicastAddr", } } func (m *multicastMembership) beforeSave() {} // +checklocksignore func (m *multicastMembership) StateSave(stateSinkObject state.Sink) { m.beforeSave() stateSinkObject.Save(0, &m.nicID) stateSinkObject.Save(1, &m.multicastAddr) } func (m *multicastMembership) afterLoad(context.Context) {} // +checklocksignore func (m *multicastMembership) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &m.nicID) stateSourceObject.Load(1, &m.multicastAddr) } func init() { state.Register((*Endpoint)(nil)) state.Register((*multicastMembership)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/internal/noop/000077500000000000000000000000001465435605700253665ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/internal/noop/endpoint.go000066400000000000000000000116441465435605700275430ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package noop contains an endpoint that implements all tcpip.Endpoint // functions as noops. package noop import ( "fmt" "io" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) // endpoint can be created, but all interactions have no effect or // return errors. // // +stateify savable type endpoint struct { tcpip.DefaultSocketOptionsHandler ops tcpip.SocketOptions } // New returns an initialized noop endpoint. func New(stk *stack.Stack) tcpip.Endpoint { // ep.ops must be in a valid, initialized state for callers of // ep.SocketOptions. var ep endpoint ep.ops.InitHandler(&ep, stk, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) return &ep } // Abort implements stack.TransportEndpoint.Abort. func (*endpoint) Abort() { // No-op. } // Close implements tcpip.Endpoint.Close. func (*endpoint) Close() { // No-op. } // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. func (*endpoint) ModerateRecvBuf(int) { // No-op. } func (*endpoint) SetOwner(tcpip.PacketOwner) { // No-op. } // Read implements tcpip.Endpoint.Read. func (*endpoint) Read(io.Writer, tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { return tcpip.ReadResult{}, &tcpip.ErrNotPermitted{} } // Write implements tcpip.Endpoint.Write. func (*endpoint) Write(tcpip.Payloader, tcpip.WriteOptions) (int64, tcpip.Error) { return 0, &tcpip.ErrNotPermitted{} } // Disconnect implements tcpip.Endpoint.Disconnect. func (*endpoint) Disconnect() tcpip.Error { return &tcpip.ErrNotSupported{} } // Connect implements tcpip.Endpoint.Connect. func (*endpoint) Connect(tcpip.FullAddress) tcpip.Error { return &tcpip.ErrNotPermitted{} } // Shutdown implements tcpip.Endpoint.Shutdown. func (*endpoint) Shutdown(tcpip.ShutdownFlags) tcpip.Error { return &tcpip.ErrNotPermitted{} } // Listen implements tcpip.Endpoint.Listen. func (*endpoint) Listen(int) tcpip.Error { return &tcpip.ErrNotSupported{} } // Accept implements tcpip.Endpoint.Accept. func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { return nil, nil, &tcpip.ErrNotSupported{} } // Bind implements tcpip.Endpoint.Bind. func (*endpoint) Bind(tcpip.FullAddress) tcpip.Error { return &tcpip.ErrNotPermitted{} } // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. func (*endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { return tcpip.FullAddress{}, &tcpip.ErrNotSupported{} } // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } // Readiness implements tcpip.Endpoint.Readiness. func (*endpoint) Readiness(waiter.EventMask) waiter.EventMask { return 0 } // SetSockOpt implements tcpip.Endpoint.SetSockOpt. func (*endpoint) SetSockOpt(tcpip.SettableSocketOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } func (*endpoint) SetSockOptInt(tcpip.SockOptInt, int) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 
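// Like SetSockOpt and the *Int variants on this endpoint, it unconditionally
// returns *tcpip.ErrUnknownProtocolOption.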
func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (*endpoint) GetSockOptInt(tcpip.SockOptInt) (int, tcpip.Error) { return 0, &tcpip.ErrUnknownProtocolOption{} } // HandlePacket implements stack.RawTransportEndpoint.HandlePacket. func (*endpoint) HandlePacket(pkt *stack.PacketBuffer) { panic(fmt.Sprintf("unreachable: noop.endpoint should never be registered, but got packet: %+v", pkt)) } // State implements socket.Socket.State. func (*endpoint) State() uint32 { return 0 } // Wait implements stack.TransportEndpoint.Wait. func (*endpoint) Wait() { // No-op. } // Release implements stack.TransportEndpoint.Release. func (*endpoint) Release() { // No-op. } // LastError implements tcpip.Endpoint.LastError. func (*endpoint) LastError() tcpip.Error { return nil } // SocketOptions implements tcpip.Endpoint.SocketOptions. func (ep *endpoint) SocketOptions() *tcpip.SocketOptions { return &ep.ops } // Info implements tcpip.Endpoint.Info. func (*endpoint) Info() tcpip.EndpointInfo { return &stack.TransportEndpointInfo{} } // Stats returns a pointer to the endpoint stats. func (*endpoint) Stats() tcpip.EndpointStats { return &tcpip.TransportEndpointStats{} } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/internal/noop/noop_state_autogen.go000066400000000000000000000015321465435605700316130ustar00rootroot00000000000000// automatically generated by stateify. package noop import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (ep *endpoint) StateTypeName() string { return "pkg/tcpip/transport/internal/noop.endpoint" } func (ep *endpoint) StateFields() []string { return []string{ "DefaultSocketOptionsHandler", "ops", } } func (ep *endpoint) beforeSave() {} // +checklocksignore func (ep *endpoint) StateSave(stateSinkObject state.Sink) { ep.beforeSave() stateSinkObject.Save(0, &ep.DefaultSocketOptionsHandler) stateSinkObject.Save(1, &ep.ops) } func (ep *endpoint) afterLoad(context.Context) {} // +checklocksignore func (ep *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ep.DefaultSocketOptionsHandler) stateSourceObject.Load(1, &ep.ops) } func init() { state.Register((*endpoint)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/packet/000077500000000000000000000000001465435605700240465ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/packet/endpoint.go000066400000000000000000000327251465435605700262260ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package packet provides the implementation of packet sockets (see // packet(7)). 
Packet sockets allow applications to: // // - manually write and inspect link, network, and transport headers // - receive all traffic of a given network protocol, or all protocols // // Packet sockets are similar to raw sockets, but provide even more power to // users, letting them effectively talk directly to the network device. // // Packet sockets skip the input and output iptables chains. package packet import ( "io" "time" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) // +stateify savable type packet struct { packetEntry // data holds the actual packet data, including any headers and payload. data *stack.PacketBuffer receivedAt time.Time `state:".(int64)"` // senderAddr is the network address of the sender. senderAddr tcpip.FullAddress // packetInfo holds additional information like the protocol // of the packet etc. packetInfo tcpip.LinkPacketInfo } // endpoint is the packet socket implementation of tcpip.Endpoint. It is legal // to have goroutines make concurrent calls into the endpoint. // // Lock order: // // endpoint.mu // endpoint.rcvMu // // +stateify savable type endpoint struct { tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and are // immutable. stack *stack.Stack `state:"manual"` waiterQueue *waiter.Queue cooked bool ops tcpip.SocketOptions stats tcpip.TransportEndpointStats // The following fields are used to manage the receive queue. rcvMu sync.Mutex `state:"nosave"` // +checklocks:rcvMu rcvList packetList // +checklocks:rcvMu rcvBufSize int // +checklocks:rcvMu rcvClosed bool // +checklocks:rcvMu rcvDisabled bool mu sync.RWMutex `state:"nosave"` // +checklocks:mu closed bool // +checklocks:mu boundNetProto tcpip.NetworkProtocolNumber // +checklocks:mu boundNIC tcpip.NICID lastErrorMu sync.Mutex `state:"nosave"` // +checklocks:lastErrorMu lastError tcpip.Error } // NewEndpoint returns a new packet endpoint. func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) tcpip.Endpoint { ep := &endpoint{ stack: s, cooked: cooked, boundNetProto: netProto, waiterQueue: waiterQueue, } ep.ops.InitHandler(ep, ep.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) ep.ops.SetReceiveBufferSize(32*1024, false /* notify */) // Override with stack defaults. var ss tcpip.SendBufferSizeOption if err := s.Option(&ss); err == nil { ep.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs tcpip.ReceiveBufferSizeOption if err := s.Option(&rs); err == nil { ep.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) } s.RegisterPacketEndpoint(0, netProto, ep) return ep } // Abort implements stack.TransportEndpoint.Abort. func (ep *endpoint) Abort() { ep.Close() } // Close implements tcpip.Endpoint.Close. func (ep *endpoint) Close() { ep.mu.Lock() defer ep.mu.Unlock() if ep.closed { return } ep.stack.UnregisterPacketEndpoint(ep.boundNIC, ep.boundNetProto, ep) ep.rcvMu.Lock() defer ep.rcvMu.Unlock() // Clear the receive list. ep.rcvClosed = true ep.rcvBufSize = 0 for !ep.rcvList.Empty() { p := ep.rcvList.Front() ep.rcvList.Remove(p) p.data.DecRef() } ep.closed = true ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. 
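// Packet sockets do not auto-tune their receive buffer, so this is a no-op.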
func (*endpoint) ModerateRecvBuf(int) {} // Read implements tcpip.Endpoint.Read. func (ep *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { ep.rcvMu.Lock() // If there's no data to read, return that read would block or that the // endpoint is closed. if ep.rcvList.Empty() { var err tcpip.Error = &tcpip.ErrWouldBlock{} if ep.rcvClosed { ep.stats.ReadErrors.ReadClosed.Increment() err = &tcpip.ErrClosedForReceive{} } ep.rcvMu.Unlock() return tcpip.ReadResult{}, err } packet := ep.rcvList.Front() if !opts.Peek { ep.rcvList.Remove(packet) defer packet.data.DecRef() ep.rcvBufSize -= packet.data.Size() } ep.rcvMu.Unlock() res := tcpip.ReadResult{ Total: packet.data.Size(), ControlMessages: tcpip.ReceivableControlMessages{ HasTimestamp: true, Timestamp: packet.receivedAt, }, } if opts.NeedRemoteAddr { res.RemoteAddr = packet.senderAddr } if opts.NeedLinkPacketInfo { res.LinkPacketInfo = packet.packetInfo } n, err := packet.data.Data().ReadTo(dst, opts.Peek) if n == 0 && err != nil { return res, &tcpip.ErrBadBuffer{} } res.Count = n return res, nil } func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { if !ep.stack.PacketEndpointWriteSupported() { return 0, &tcpip.ErrNotSupported{} } ep.mu.Lock() closed := ep.closed nicID := ep.boundNIC proto := ep.boundNetProto ep.mu.Unlock() if closed { return 0, &tcpip.ErrClosedForSend{} } var remote tcpip.LinkAddress if to := opts.To; to != nil { remote = to.LinkAddr if n := to.NIC; n != 0 { nicID = n } if p := to.Port; p != 0 { proto = tcpip.NetworkProtocolNumber(p) } } if nicID == 0 { return 0, &tcpip.ErrInvalidOptionValue{} } // Prevents giant buffer allocations. if p.Len() > header.DatagramMaximumSize { return 0, &tcpip.ErrMessageTooLong{} } var payload buffer.Buffer if _, err := payload.WriteFromReader(p, int64(p.Len())); err != nil { return 0, &tcpip.ErrBadBuffer{} } payloadSz := payload.Size() if err := func() tcpip.Error { if ep.cooked { return ep.stack.WritePacketToRemote(nicID, remote, proto, payload) } return ep.stack.WriteRawPacket(nicID, proto, payload) }(); err != nil { return 0, err } return payloadSz, nil } // Disconnect implements tcpip.Endpoint.Disconnect. Packet sockets cannot be // disconnected, and this function always returns tpcip.ErrNotSupported. func (*endpoint) Disconnect() tcpip.Error { return &tcpip.ErrNotSupported{} } // Connect implements tcpip.Endpoint.Connect. Packet sockets cannot be // connected, and this function always returns *tcpip.ErrNotSupported. func (*endpoint) Connect(tcpip.FullAddress) tcpip.Error { return &tcpip.ErrNotSupported{} } // Shutdown implements tcpip.Endpoint.Shutdown. Packet sockets cannot be used // with Shutdown, and this function always returns *tcpip.ErrNotSupported. func (*endpoint) Shutdown(tcpip.ShutdownFlags) tcpip.Error { return &tcpip.ErrNotSupported{} } // Listen implements tcpip.Endpoint.Listen. Packet sockets cannot be used with // Listen, and this function always returns *tcpip.ErrNotSupported. func (*endpoint) Listen(int) tcpip.Error { return &tcpip.ErrNotSupported{} } // Accept implements tcpip.Endpoint.Accept. Packet sockets cannot be used with // Accept, and this function always returns *tcpip.ErrNotSupported. func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { return nil, nil, &tcpip.ErrNotSupported{} } // Bind implements tcpip.Endpoint.Bind. 
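//
// For example, binding to receive only IPv4 frames arriving on NIC 1
// (sketch; as with sockaddr_ll, the network protocol travels in the Port
// field):
//
//	err := ep.Bind(tcpip.FullAddress{
//		NIC:  1,
//		Port: uint16(header.IPv4ProtocolNumber),
//	})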
func (ep *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { // "By default, all packets of the specified protocol type are passed // to a packet socket. To get packets only from a specific interface // use bind(2) specifying an address in a struct sockaddr_ll to bind // the packet socket to an interface. Fields used for binding are // sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex." // - packet(7). ep.mu.Lock() defer ep.mu.Unlock() netProto := tcpip.NetworkProtocolNumber(addr.Port) if netProto == 0 { // Do not allow unbinding the network protocol. netProto = ep.boundNetProto } if ep.boundNIC == addr.NIC && ep.boundNetProto == netProto { // Already bound to the requested NIC and network protocol. return nil } // TODO(https://gvisor.dev/issue/6618): Unregister after registering the new // binding. ep.stack.UnregisterPacketEndpoint(ep.boundNIC, ep.boundNetProto, ep) ep.boundNIC = 0 ep.boundNetProto = 0 // Bind endpoint to receive packets from specific interface. if err := ep.stack.RegisterPacketEndpoint(addr.NIC, netProto, ep); err != nil { return err } ep.boundNIC = addr.NIC ep.boundNetProto = netProto return nil } // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. func (ep *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { ep.mu.RLock() defer ep.mu.RUnlock() return tcpip.FullAddress{ NIC: ep.boundNIC, Port: uint16(ep.boundNetProto), }, nil } // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { // Even a connected socket doesn't return a remote address. return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } // Readiness implements tcpip.Endpoint.Readiness. func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // The endpoint is always writable. result := waiter.WritableEvents & mask // Determine whether the endpoint is readable. if (mask & waiter.ReadableEvents) != 0 { ep.rcvMu.Lock() if !ep.rcvList.Empty() || ep.rcvClosed { result |= waiter.ReadableEvents } ep.rcvMu.Unlock() } return result } // SetSockOpt implements tcpip.Endpoint.SetSockOpt. Packet sockets cannot be // used with SetSockOpt, and this function always returns // *tcpip.ErrNotSupported. func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { switch opt.(type) { case *tcpip.SocketDetachFilterOption: return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. func (*endpoint) SetSockOptInt(tcpip.SockOptInt, int) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } func (ep *endpoint) LastError() tcpip.Error { ep.lastErrorMu.Lock() defer ep.lastErrorMu.Unlock() err := ep.lastError ep.lastError = nil return err } // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError. func (ep *endpoint) UpdateLastError(err tcpip.Error) { ep.lastErrorMu.Lock() ep.lastError = err ep.lastErrorMu.Unlock() } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) tcpip.Error { return &tcpip.ErrNotSupported{} } // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: v := 0 ep.rcvMu.Lock() if !ep.rcvList.Empty() { p := ep.rcvList.Front() v = p.data.Size() } ep.rcvMu.Unlock() return v, nil default: return -1, &tcpip.ErrUnknownProtocolOption{} } } // HandlePacket implements stack.PacketEndpoint.HandlePacket. 
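//
// The packet is queued on the receive list and readers are notified via the
// waiter queue; it is dropped (and the relevant stat incremented) if the
// endpoint is closed or its receive buffer is already full.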
func (ep *endpoint) HandlePacket(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { ep.rcvMu.Lock() // Drop the packet if our buffer is currently full. if ep.rcvClosed { ep.rcvMu.Unlock() ep.stack.Stats().DroppedPackets.Increment() ep.stats.ReceiveErrors.ClosedReceiver.Increment() return } rcvBufSize := ep.ops.GetReceiveBufferSize() if ep.rcvDisabled || ep.rcvBufSize >= int(rcvBufSize) { ep.rcvMu.Unlock() ep.stack.Stats().DroppedPackets.Increment() ep.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() return } wasEmpty := ep.rcvBufSize == 0 rcvdPkt := packet{ packetInfo: tcpip.LinkPacketInfo{ Protocol: netProto, PktType: pkt.PktType, }, senderAddr: tcpip.FullAddress{ NIC: nicID, }, receivedAt: ep.stack.Clock().Now(), } if len(pkt.LinkHeader().Slice()) != 0 { hdr := header.Ethernet(pkt.LinkHeader().Slice()) rcvdPkt.senderAddr.LinkAddr = hdr.SourceAddress() } // Raw packet endpoints include link-headers in received packets. pktBuf := pkt.ToBuffer() if ep.cooked { // Cooked packet endpoints don't include the link-headers in received // packets. pktBuf.TrimFront(int64(len(pkt.LinkHeader().Slice()) + len(pkt.VirtioNetHeader().Slice()))) } rcvdPkt.data = stack.NewPacketBuffer(stack.PacketBufferOptions{Payload: pktBuf}) ep.rcvList.PushBack(&rcvdPkt) ep.rcvBufSize += rcvdPkt.data.Size() ep.rcvMu.Unlock() ep.stats.PacketsReceived.Increment() // Notify waiters that there's data to be read. if wasEmpty { ep.waiterQueue.Notify(waiter.ReadableEvents) } } // State implements socket.Socket.State. func (*endpoint) State() uint32 { return 0 } // Info returns a copy of the endpoint info. func (ep *endpoint) Info() tcpip.EndpointInfo { ep.mu.RLock() defer ep.mu.RUnlock() return &stack.TransportEndpointInfo{NetProto: ep.boundNetProto} } // Stats returns a pointer to the endpoint stats. func (ep *endpoint) Stats() tcpip.EndpointStats { return &ep.stats } // SetOwner implements tcpip.Endpoint.SetOwner. func (*endpoint) SetOwner(tcpip.PacketOwner) {} // SocketOptions implements tcpip.Endpoint.SocketOptions. func (ep *endpoint) SocketOptions() *tcpip.SocketOptions { return &ep.ops } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/packet/endpoint_state.go000066400000000000000000000034721465435605700274230ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package packet import ( "context" "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // saveReceivedAt is invoked by stateify. func (p *packet) saveReceivedAt() int64 { return p.receivedAt.UnixNano() } // loadReceivedAt is invoked by stateify. func (p *packet) loadReceivedAt(_ context.Context, nsec int64) { p.receivedAt = time.Unix(0, nsec) } // beforeSave is invoked by stateify. func (ep *endpoint) beforeSave() { ep.rcvMu.Lock() defer ep.rcvMu.Unlock() ep.rcvDisabled = true ep.stack.RegisterResumableEndpoint(ep) } // afterLoad is invoked by stateify. 
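// It reattaches the endpoint to the restored stack and re-registers it as a
// packet endpoint for its bound NIC and network protocol.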
func (ep *endpoint) afterLoad(ctx context.Context) { ep.mu.Lock() defer ep.mu.Unlock() ep.stack = stack.RestoreStackFromContext(ctx) ep.ops.InitHandler(ep, ep.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) if err := ep.stack.RegisterPacketEndpoint(ep.boundNIC, ep.boundNetProto, ep); err != nil { panic(fmt.Sprintf("RegisterPacketEndpoint(%d, %d, _): %s", ep.boundNIC, ep.boundNetProto, err)) } ep.rcvMu.Lock() ep.rcvDisabled = false ep.rcvMu.Unlock() } // Resume implements tcpip.ResumableEndpoint.Resume. func (ep *endpoint) Resume() { ep.rcvMu.Lock() defer ep.rcvMu.Unlock() ep.rcvDisabled = false } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/packet/packet_list.go000066400000000000000000000120041465435605700266740ustar00rootroot00000000000000package packet // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type packetElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (packetElementMapper) linkerFor(elem *packet) *packet { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type packetList struct { head *packet tail *packet } // Reset resets list l to the empty state. func (l *packetList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *packetList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *packetList) Front() *packet { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *packetList) Back() *packet { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *packetList) Len() (count int) { for e := l.Front(); e != nil; e = (packetElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *packetList) PushFront(e *packet) { linker := packetElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { packetElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *packetList) PushFrontList(m *packetList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { packetElementMapper{}.linkerFor(l.head).SetPrev(m.tail) packetElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *packetList) PushBack(e *packet) { linker := packetElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { packetElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. 
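//
// Both lists are modified in place, e.g. (sketch; p1 and p2 are *packet
// values):
//
//	var a, b packetList
//	a.PushBack(p1)
//	b.PushBack(p2)
//	a.PushBackList(&b) // a now holds p1, p2; b is empty.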
// //go:nosplit func (l *packetList) PushBackList(m *packetList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { packetElementMapper{}.linkerFor(l.tail).SetNext(m.head) packetElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *packetList) InsertAfter(b, e *packet) { bLinker := packetElementMapper{}.linkerFor(b) eLinker := packetElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { packetElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *packetList) InsertBefore(a, e *packet) { aLinker := packetElementMapper{}.linkerFor(a) eLinker := packetElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { packetElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *packetList) Remove(e *packet) { linker := packetElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { packetElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { packetElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type packetEntry struct { next *packet prev *packet } // Next returns the entry that follows e in the list. // //go:nosplit func (e *packetEntry) Next() *packet { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *packetEntry) Prev() *packet { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *packetEntry) SetNext(elem *packet) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *packetEntry) SetPrev(elem *packet) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/packet/packet_state_autogen.go000066400000000000000000000103041465435605700305640ustar00rootroot00000000000000// automatically generated by stateify. 
package packet import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *packet) StateTypeName() string { return "pkg/tcpip/transport/packet.packet" } func (p *packet) StateFields() []string { return []string{ "packetEntry", "data", "receivedAt", "senderAddr", "packetInfo", } } func (p *packet) beforeSave() {} // +checklocksignore func (p *packet) StateSave(stateSinkObject state.Sink) { p.beforeSave() var receivedAtValue int64 receivedAtValue = p.saveReceivedAt() stateSinkObject.SaveValue(2, receivedAtValue) stateSinkObject.Save(0, &p.packetEntry) stateSinkObject.Save(1, &p.data) stateSinkObject.Save(3, &p.senderAddr) stateSinkObject.Save(4, &p.packetInfo) } func (p *packet) afterLoad(context.Context) {} // +checklocksignore func (p *packet) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.packetEntry) stateSourceObject.Load(1, &p.data) stateSourceObject.Load(3, &p.senderAddr) stateSourceObject.Load(4, &p.packetInfo) stateSourceObject.LoadValue(2, new(int64), func(y any) { p.loadReceivedAt(ctx, y.(int64)) }) } func (ep *endpoint) StateTypeName() string { return "pkg/tcpip/transport/packet.endpoint" } func (ep *endpoint) StateFields() []string { return []string{ "DefaultSocketOptionsHandler", "waiterQueue", "cooked", "ops", "stats", "rcvList", "rcvBufSize", "rcvClosed", "rcvDisabled", "closed", "boundNetProto", "boundNIC", "lastError", } } // +checklocksignore func (ep *endpoint) StateSave(stateSinkObject state.Sink) { ep.beforeSave() stateSinkObject.Save(0, &ep.DefaultSocketOptionsHandler) stateSinkObject.Save(1, &ep.waiterQueue) stateSinkObject.Save(2, &ep.cooked) stateSinkObject.Save(3, &ep.ops) stateSinkObject.Save(4, &ep.stats) stateSinkObject.Save(5, &ep.rcvList) stateSinkObject.Save(6, &ep.rcvBufSize) stateSinkObject.Save(7, &ep.rcvClosed) stateSinkObject.Save(8, &ep.rcvDisabled) stateSinkObject.Save(9, &ep.closed) stateSinkObject.Save(10, &ep.boundNetProto) stateSinkObject.Save(11, &ep.boundNIC) stateSinkObject.Save(12, &ep.lastError) } // +checklocksignore func (ep *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &ep.DefaultSocketOptionsHandler) stateSourceObject.Load(1, &ep.waiterQueue) stateSourceObject.Load(2, &ep.cooked) stateSourceObject.Load(3, &ep.ops) stateSourceObject.Load(4, &ep.stats) stateSourceObject.Load(5, &ep.rcvList) stateSourceObject.Load(6, &ep.rcvBufSize) stateSourceObject.Load(7, &ep.rcvClosed) stateSourceObject.Load(8, &ep.rcvDisabled) stateSourceObject.Load(9, &ep.closed) stateSourceObject.Load(10, &ep.boundNetProto) stateSourceObject.Load(11, &ep.boundNIC) stateSourceObject.Load(12, &ep.lastError) stateSourceObject.AfterLoad(func() { ep.afterLoad(ctx) }) } func (l *packetList) StateTypeName() string { return "pkg/tcpip/transport/packet.packetList" } func (l *packetList) StateFields() []string { return []string{ "head", "tail", } } func (l *packetList) beforeSave() {} // +checklocksignore func (l *packetList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *packetList) afterLoad(context.Context) {} // +checklocksignore func (l *packetList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *packetEntry) StateTypeName() string { return "pkg/tcpip/transport/packet.packetEntry" } func (e *packetEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *packetEntry) 
beforeSave() {} // +checklocksignore func (e *packetEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *packetEntry) afterLoad(context.Context) {} // +checklocksignore func (e *packetEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func init() { state.Register((*packet)(nil)) state.Register((*endpoint)(nil)) state.Register((*packetList)(nil)) state.Register((*packetEntry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/raw/000077500000000000000000000000001465435605700233705ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/raw/endpoint.go000066400000000000000000000545261465435605700255530ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package raw provides the implementation of raw sockets (see raw(7)). Raw // sockets allow applications to: // // - manually write and inspect transport layer headers and payloads // - receive all traffic of a given transport protocol (e.g. ICMP or UDP) // - optionally write and inspect network layer headers of packets // // Raw sockets don't have any notion of ports, and incoming packets are // demultiplexed solely by protocol number. Thus, a raw UDP endpoint will // receive every UDP packet received by netstack. bind(2) and connect(2) can be // used to filter incoming packets by source and destination. package raw import ( "fmt" "io" "time" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checksum" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport" "gvisor.dev/gvisor/pkg/tcpip/transport/internal/network" "gvisor.dev/gvisor/pkg/waiter" ) // +stateify savable type rawPacket struct { rawPacketEntry // data holds the actual packet data, including any headers and // payload. data *stack.PacketBuffer receivedAt time.Time `state:".(int64)"` // senderAddr is the network address of the sender. senderAddr tcpip.FullAddress packetInfo tcpip.IPPacketInfo // tosOrTClass stores either the Type of Service for IPv4 or the Traffic Class // for IPv6. tosOrTClass uint8 // ttlOrHopLimit stores either the TTL for IPv4 or the HopLimit for IPv6 ttlOrHopLimit uint8 } // endpoint is the raw socket implementation of tcpip.Endpoint. It is legal to // have goroutines make concurrent calls into the endpoint. // // Lock order: // // endpoint.mu // endpoint.rcvMu // // +stateify savable type endpoint struct { tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and are // immutable. 
stack *stack.Stack `state:"manual"` transProto tcpip.TransportProtocolNumber waiterQueue *waiter.Queue associated bool net network.Endpoint stats tcpip.TransportEndpointStats ops tcpip.SocketOptions rcvMu sync.Mutex `state:"nosave"` // +checklocks:rcvMu rcvList rawPacketList // +checklocks:rcvMu rcvBufSize int // +checklocks:rcvMu rcvClosed bool // +checklocks:rcvMu rcvDisabled bool mu sync.RWMutex `state:"nosave"` // ipv6ChecksumOffset indicates the offset to populate the IPv6 checksum at. // // A negative value indicates no checksum should be calculated. // // +checklocks:mu ipv6ChecksumOffset int // icmp6Filter holds the filter for ICMPv6 packets. // // +checklocks:mu icmpv6Filter tcpip.ICMPv6Filter } // NewEndpoint returns a raw endpoint for the given protocols. func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return newEndpoint(stack, netProto, transProto, waiterQueue, true /* associated */) } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, tcpip.Error) { // Calculating the upper-layer checksum is disabled by default for raw IPv6 // endpoints, unless the upper-layer protocol is ICMPv6. // // As per RFC 3542 section 3.1, // // The kernel will calculate and insert the ICMPv6 checksum for ICMPv6 // raw sockets, since this checksum is mandatory. ipv6ChecksumOffset := -1 if netProto == header.IPv6ProtocolNumber && transProto == header.ICMPv6ProtocolNumber { ipv6ChecksumOffset = header.ICMPv6ChecksumOffset } e := &endpoint{ stack: s, transProto: transProto, waiterQueue: waiterQueue, associated: associated, ipv6ChecksumOffset: ipv6ChecksumOffset, } e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) e.ops.SetMulticastLoop(true) e.ops.SetHeaderIncluded(!associated) e.ops.SetSendBufferSize(32*1024, false /* notify */) e.ops.SetReceiveBufferSize(32*1024, false /* notify */) e.net.Init(s, netProto, transProto, &e.ops, waiterQueue) // Override with stack defaults. var ss tcpip.SendBufferSizeOption if err := s.Option(&ss); err == nil { e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs tcpip.ReceiveBufferSizeOption if err := s.Option(&rs); err == nil { e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) } // Unassociated endpoints are write-only and users call Write() with IP // headers included. Because they're write-only, We don't need to // register with the stack. if !associated { e.ops.SetReceiveBufferSize(0, false /* notify */) e.waiterQueue = nil return e, nil } if err := e.stack.RegisterRawTransportEndpoint(netProto, e.transProto, e); err != nil { return nil, err } return e, nil } // WakeupWriters implements tcpip.SocketOptionsHandler. func (e *endpoint) WakeupWriters() { e.net.MaybeSignalWritable() } // HasNIC implements tcpip.SocketOptionsHandler. func (e *endpoint) HasNIC(id int32) bool { return e.stack.HasNIC(tcpip.NICID(id)) } // Abort implements stack.TransportEndpoint.Abort. func (e *endpoint) Abort() { e.Close() } // Close implements tcpip.Endpoint.Close. 
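// Host-level analogue (illustrative only, not used by this package): the
// associated/unassociated split handled in newEndpoint above mirrors Linux
// raw(7) sockets. On a Linux host, with CAP_NET_RAW, the two flavors would be
// created roughly as follows:
//
//	// Associated: protocol-filtered receive and send (here, all ICMP).
//	fd, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_RAW, syscall.IPPROTO_ICMP)
//
//	// Unassociated: IPPROTO_RAW is send-only and the caller supplies the
//	// IPv4 header on every write; for other protocols header inclusion can
//	// be requested explicitly.
//	fd2, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_RAW, syscall.IPPROTO_RAW)
//	err = syscall.SetsockoptInt(fd2, syscall.IPPROTO_IP, syscall.IP_HDRINCL, 1)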
func (e *endpoint) Close() { e.mu.Lock() defer e.mu.Unlock() if e.net.State() == transport.DatagramEndpointStateClosed { return } e.net.Close() if !e.associated { return } e.stack.UnregisterRawTransportEndpoint(e.net.NetProto(), e.transProto, e) e.rcvMu.Lock() defer e.rcvMu.Unlock() // Clear the receive list. e.rcvClosed = true e.rcvBufSize = 0 for !e.rcvList.Empty() { p := e.rcvList.Front() e.rcvList.Remove(p) p.data.DecRef() } e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. func (*endpoint) ModerateRecvBuf(int) {} func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.net.SetOwner(owner) } // Read implements tcpip.Endpoint.Read. func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { e.rcvMu.Lock() // If there's no data to read, return that read would block or that the // endpoint is closed. if e.rcvList.Empty() { var err tcpip.Error = &tcpip.ErrWouldBlock{} if e.rcvClosed { e.stats.ReadErrors.ReadClosed.Increment() err = &tcpip.ErrClosedForReceive{} } e.rcvMu.Unlock() return tcpip.ReadResult{}, err } pkt := e.rcvList.Front() if !opts.Peek { e.rcvList.Remove(pkt) defer pkt.data.DecRef() e.rcvBufSize -= pkt.data.Data().Size() } e.rcvMu.Unlock() // Control Messages // TODO(https://gvisor.dev/issue/7012): Share control message code with other // network endpoints. cm := tcpip.ReceivableControlMessages{ HasTimestamp: true, Timestamp: pkt.receivedAt, } switch netProto := e.net.NetProto(); netProto { case header.IPv4ProtocolNumber: if e.ops.GetReceiveTOS() { cm.HasTOS = true cm.TOS = pkt.tosOrTClass } if e.ops.GetReceiveTTL() { cm.HasTTL = true cm.TTL = pkt.ttlOrHopLimit } if e.ops.GetReceivePacketInfo() { cm.HasIPPacketInfo = true cm.PacketInfo = pkt.packetInfo } case header.IPv6ProtocolNumber: if e.ops.GetReceiveTClass() { cm.HasTClass = true // Although TClass is an 8-bit value it's read in the CMsg as a uint32. cm.TClass = uint32(pkt.tosOrTClass) } if e.ops.GetReceiveHopLimit() { cm.HasHopLimit = true cm.HopLimit = pkt.ttlOrHopLimit } if e.ops.GetIPv6ReceivePacketInfo() { cm.HasIPv6PacketInfo = true cm.IPv6PacketInfo = tcpip.IPv6PacketInfo{ NIC: pkt.packetInfo.NIC, Addr: pkt.packetInfo.DestinationAddr, } } default: panic(fmt.Sprintf("unrecognized network protocol = %d", netProto)) } res := tcpip.ReadResult{ Total: pkt.data.Data().Size(), ControlMessages: cm, } if opts.NeedRemoteAddr { res.RemoteAddr = pkt.senderAddr } n, err := pkt.data.Data().ReadTo(dst, opts.Peek) if n == 0 && err != nil { return res, &tcpip.ErrBadBuffer{} } res.Count = n return res, nil } // Write implements tcpip.Endpoint.Write. func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { netProto := e.net.NetProto() // We can create, but not write to, unassociated IPv6 endpoints. if !e.associated && netProto == header.IPv6ProtocolNumber { return 0, &tcpip.ErrInvalidOptionValue{} } if opts.To != nil { // Raw sockets do not support sending to a IPv4 address on a IPv6 endpoint. 
if netProto == header.IPv6ProtocolNumber && opts.To.Addr.BitLen() != header.IPv6AddressSizeBits { return 0, &tcpip.ErrInvalidOptionValue{} } } n, err := e.write(p, opts) switch err.(type) { case nil: e.stats.PacketsSent.Increment() case *tcpip.ErrMessageTooLong, *tcpip.ErrInvalidOptionValue: e.stats.WriteErrors.InvalidArgs.Increment() case *tcpip.ErrClosedForSend: e.stats.WriteErrors.WriteClosed.Increment() case *tcpip.ErrInvalidEndpointState: e.stats.WriteErrors.InvalidEndpointState.Increment() case *tcpip.ErrHostUnreachable, *tcpip.ErrBroadcastDisabled, *tcpip.ErrNetworkUnreachable: // Errors indicating any problem with IP routing of the packet. e.stats.SendErrors.NoRoute.Increment() default: // For all other errors when writing to the network layer. e.stats.SendErrors.SendToNetworkFailed.Increment() } return n, err } func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { e.mu.Lock() ctx, err := e.net.AcquireContextForWrite(opts) ipv6ChecksumOffset := e.ipv6ChecksumOffset e.mu.Unlock() if err != nil { return 0, err } defer ctx.Release() if p.Len() > int(ctx.MTU()) { return 0, &tcpip.ErrMessageTooLong{} } // Prevents giant buffer allocations. if p.Len() > header.DatagramMaximumSize { return 0, &tcpip.ErrMessageTooLong{} } var payload buffer.Buffer defer payload.Release() if _, err := payload.WriteFromReader(p, int64(p.Len())); err != nil { return 0, &tcpip.ErrBadBuffer{} } payloadSz := payload.Size() if packetInfo := ctx.PacketInfo(); packetInfo.NetProto == header.IPv6ProtocolNumber && ipv6ChecksumOffset >= 0 { // Make sure we can fit the checksum. if payload.Size() < int64(ipv6ChecksumOffset+checksum.Size) { return 0, &tcpip.ErrInvalidOptionValue{} } payloadView, _ := payload.PullUp(ipv6ChecksumOffset, int(payload.Size())-ipv6ChecksumOffset) xsum := header.PseudoHeaderChecksum(e.transProto, packetInfo.LocalAddress, packetInfo.RemoteAddress, uint16(payload.Size())) checksum.Put(payloadView.AsSlice(), 0) xsum = checksum.Combine(payload.Checksum(0), xsum) checksum.Put(payloadView.AsSlice(), ^xsum) } pkt := ctx.TryNewPacketBuffer(int(ctx.PacketInfo().MaxHeaderLength), payload.Clone()) if pkt == nil { return 0, &tcpip.ErrWouldBlock{} } defer pkt.DecRef() if err := ctx.WritePacket(pkt, e.ops.GetHeaderIncluded()); err != nil { return 0, err } return payloadSz, nil } // Disconnect implements tcpip.Endpoint.Disconnect. func (*endpoint) Disconnect() tcpip.Error { return &tcpip.ErrNotSupported{} } // Connect implements tcpip.Endpoint.Connect. func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { netProto := e.net.NetProto() // Raw sockets do not support connecting to a IPv4 address on a IPv6 endpoint. if netProto == header.IPv6ProtocolNumber && addr.Addr.BitLen() != header.IPv6AddressSizeBits { return &tcpip.ErrAddressFamilyNotSupported{} } return e.net.ConnectAndThen(addr, func(_ tcpip.NetworkProtocolNumber, _, _ stack.TransportEndpointID) tcpip.Error { if e.associated { // Re-register the endpoint with the appropriate NIC. if err := e.stack.RegisterRawTransportEndpoint(netProto, e.transProto, e); err != nil { return err } e.stack.UnregisterRawTransportEndpoint(netProto, e.transProto, e) } return nil }) } // Shutdown implements tcpip.Endpoint.Shutdown. It's a noop for raw sockets. func (e *endpoint) Shutdown(tcpip.ShutdownFlags) tcpip.Error { if e.net.State() != transport.DatagramEndpointStateConnected { return &tcpip.ErrNotConnected{} } return nil } // Listen implements tcpip.Endpoint.Listen. 
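// The IPv6 checksum handling in write() above zeroes the 16-bit field at
// ipv6ChecksumOffset, sums the pseudo-header and the payload, and stores the
// one's complement of the result. A standalone sketch of that Internet
// checksum (RFC 1071); the real helpers live in pkg/tcpip/checksum:
//
//	func internetChecksum(b []byte) uint16 {
//		var sum uint32
//		for ; len(b) >= 2; b = b[2:] {
//			sum += uint32(b[0])<<8 | uint32(b[1])
//		}
//		if len(b) == 1 {
//			sum += uint32(b[0]) << 8 // pad the odd trailing byte with zero
//		}
//		for sum>>16 != 0 {
//			sum = sum>>16 + sum&0xffff // fold the carries back in
//		}
//		return ^uint16(sum)
//	}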
func (*endpoint) Listen(int) tcpip.Error { return &tcpip.ErrNotSupported{} } // Accept implements tcpip.Endpoint.Accept. func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { return nil, nil, &tcpip.ErrNotSupported{} } // Bind implements tcpip.Endpoint.Bind. func (e *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { return e.net.BindAndThen(addr, func(netProto tcpip.NetworkProtocolNumber, _ tcpip.Address) tcpip.Error { if !e.associated { return nil } // Re-register the endpoint with the appropriate NIC. if err := e.stack.RegisterRawTransportEndpoint(netProto, e.transProto, e); err != nil { return err } e.stack.UnregisterRawTransportEndpoint(netProto, e.transProto, e) return nil }) } // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { a := e.net.GetLocalAddress() // Linux returns the protocol in the port field. a.Port = uint16(e.transProto) return a, nil } // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { // Even a connected socket doesn't return a remote address. return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } // Readiness implements tcpip.Endpoint.Readiness. func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { var result waiter.EventMask if e.net.HasSendSpace() { result |= waiter.WritableEvents & mask } // Determine whether the endpoint is readable. if (mask & waiter.ReadableEvents) != 0 { e.rcvMu.Lock() if !e.rcvList.Empty() || e.rcvClosed { result |= waiter.ReadableEvents } e.rcvMu.Unlock() } return result } // SetSockOpt implements tcpip.Endpoint.SetSockOpt. func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { switch opt := opt.(type) { case *tcpip.SocketDetachFilterOption: return nil case *tcpip.ICMPv6Filter: if e.net.NetProto() != header.IPv6ProtocolNumber { return &tcpip.ErrUnknownProtocolOption{} } if e.transProto != header.ICMPv6ProtocolNumber { return &tcpip.ErrInvalidOptionValue{} } e.mu.Lock() defer e.mu.Unlock() e.icmpv6Filter = *opt return nil default: return e.net.SetSockOpt(opt) } } func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { switch opt { case tcpip.IPv6Checksum: if e.net.NetProto() != header.IPv6ProtocolNumber { return &tcpip.ErrUnknownProtocolOption{} } if e.transProto == header.ICMPv6ProtocolNumber { // As per RFC 3542 section 3.1, // // An attempt to set IPV6_CHECKSUM for an ICMPv6 socket will fail. return &tcpip.ErrInvalidOptionValue{} } // Make sure the offset is aligned properly if checksum is requested. if v > 0 && v%checksum.Size != 0 { return &tcpip.ErrInvalidOptionValue{} } e.mu.Lock() defer e.mu.Unlock() e.ipv6ChecksumOffset = v return nil default: return e.net.SetSockOptInt(opt, v) } } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { switch opt := opt.(type) { case *tcpip.ICMPv6Filter: if e.net.NetProto() != header.IPv6ProtocolNumber { return &tcpip.ErrUnknownProtocolOption{} } if e.transProto != header.ICMPv6ProtocolNumber { return &tcpip.ErrInvalidOptionValue{} } e.mu.RLock() defer e.mu.RUnlock() *opt = e.icmpv6Filter return nil default: return e.net.GetSockOpt(opt) } } // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. 
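// Background for the IPv6Checksum option handled above and below (RFC 3542
// sections 3.1 and 3.2): the IPV6_CHECKSUM offset tells the stack where the
// 16-bit checksum sits inside the payload the application writes, so the
// stack can fill it in on send and verify it on receive. The offset must be
// aligned to the 2-byte checksum, and the option is rejected for ICMPv6
// sockets because their checksum is always handled by the stack at payload
// offset 2:
//
//	offset:  0      1      2      3
//	        +------+------+-------------+
//	        | type | code |  checksum   |   (ICMPv6 header: checksum at offset 2)
//	        +------+------+-------------+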
func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: v := 0 e.rcvMu.Lock() if !e.rcvList.Empty() { p := e.rcvList.Front() v = p.data.Data().Size() } e.rcvMu.Unlock() return v, nil case tcpip.IPv6Checksum: if e.net.NetProto() != header.IPv6ProtocolNumber { return 0, &tcpip.ErrUnknownProtocolOption{} } e.mu.Lock() defer e.mu.Unlock() return e.ipv6ChecksumOffset, nil default: return e.net.GetSockOptInt(opt) } } // HandlePacket implements stack.RawTransportEndpoint.HandlePacket. func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { notifyReadableEvents := func() bool { e.mu.RLock() defer e.mu.RUnlock() e.rcvMu.Lock() defer e.rcvMu.Unlock() // Drop the packet if our buffer is currently full or if this is an unassociated // endpoint (i.e endpoint created w/ IPPROTO_RAW). Such endpoints are send only // See: https://man7.org/linux/man-pages/man7/raw.7.html // // An IPPROTO_RAW socket is send only. If you really want to receive // all IP packets, use a packet(7) socket with the ETH_P_IP protocol. // Note that packet sockets don't reassemble IP fragments, unlike raw // sockets. if e.rcvClosed || !e.associated { e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.ClosedReceiver.Increment() return false } rcvBufSize := e.ops.GetReceiveBufferSize() if e.rcvDisabled || e.rcvBufSize >= int(rcvBufSize) { e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() return false } net := pkt.Network() dstAddr := net.DestinationAddress() srcAddr := net.SourceAddress() info := e.net.Info() switch state := e.net.State(); state { case transport.DatagramEndpointStateInitial: case transport.DatagramEndpointStateConnected: // If connected, only accept packets from the remote address we // connected to. if info.ID.RemoteAddress != srcAddr { return false } // Connected sockets may also have been bound to a specific // address/NIC. fallthrough case transport.DatagramEndpointStateBound: // If bound to a NIC, only accept data for that NIC. if info.BindNICID != 0 && info.BindNICID != pkt.NICID { return false } // If bound to an address, only accept data for that address. if info.BindAddr != (tcpip.Address{}) && info.BindAddr != dstAddr { return false } default: panic(fmt.Sprintf("unhandled state = %s", state)) } wasEmpty := e.rcvBufSize == 0 // Push new packet into receive list and increment the buffer size. packet := &rawPacket{ senderAddr: tcpip.FullAddress{ NIC: pkt.NICID, Addr: srcAddr, }, packetInfo: tcpip.IPPacketInfo{ // TODO(gvisor.dev/issue/3556): dstAddr may be a multicast or broadcast // address. LocalAddr should hold a unicast address that can be // used to respond to the incoming packet. LocalAddr: dstAddr, DestinationAddr: dstAddr, NIC: pkt.NICID, }, } // Save any useful information from the network header to the packet. packet.tosOrTClass, _ = pkt.Network().TOS() switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: packet.ttlOrHopLimit = header.IPv4(pkt.NetworkHeader().Slice()).TTL() case header.IPv6ProtocolNumber: packet.ttlOrHopLimit = header.IPv6(pkt.NetworkHeader().Slice()).HopLimit() } // Raw IPv4 endpoints return the IP header, but IPv6 endpoints do not. // We copy headers' underlying bytes because pkt.*Header may point to // the middle of a slice, and another struct may point to the "outer" // slice. Save/restore doesn't support overlapping slices and will fail. // // TODO(https://gvisor.dev/issue/6517): Avoid the copy once S/R supports // overlapping slices. 
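// The ICMPv6 type filter consulted below (icmpv6Filter.ShouldDeny) behaves
// like Linux's struct icmp6_filter: one bit per possible ICMPv6 type value.
// A rough sketch of that kind of 256-bit bitmap (illustrative only; the exact
// layout of tcpip.ICMPv6Filter may differ):
//
//	type icmpTypeFilter struct {
//		deny [8]uint32 // 8 * 32 = 256 bits, one per ICMPv6 type
//	}
//
//	func (f *icmpTypeFilter) block(typ uint8) {
//		f.deny[typ/32] |= 1 << (typ % 32)
//	}
//
//	func (f *icmpTypeFilter) shouldDeny(typ uint8) bool {
//		return f.deny[typ/32]&(1<<(typ%32)) != 0
//	}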
transportHeader := pkt.TransportHeader().Slice() var combinedBuf buffer.Buffer defer combinedBuf.Release() switch info.NetProto { case header.IPv4ProtocolNumber: networkHeader := pkt.NetworkHeader().Slice() headers := buffer.NewView(len(networkHeader) + len(transportHeader)) headers.Write(networkHeader) headers.Write(transportHeader) combinedBuf = buffer.MakeWithView(headers) pktBuf := pkt.Data().ToBuffer() combinedBuf.Merge(&pktBuf) case header.IPv6ProtocolNumber: if e.transProto == header.ICMPv6ProtocolNumber { if len(transportHeader) < header.ICMPv6MinimumSize { return false } if e.icmpv6Filter.ShouldDeny(uint8(header.ICMPv6(transportHeader).Type())) { return false } } combinedBuf = buffer.MakeWithView(pkt.TransportHeader().View()) pktBuf := pkt.Data().ToBuffer() combinedBuf.Merge(&pktBuf) if checksumOffset := e.ipv6ChecksumOffset; checksumOffset >= 0 { bufSize := int(combinedBuf.Size()) if bufSize < checksumOffset+checksum.Size { // Message too small to fit checksum. return false } xsum := header.PseudoHeaderChecksum(e.transProto, srcAddr, dstAddr, uint16(bufSize)) xsum = checksum.Combine(combinedBuf.Checksum(0), xsum) if xsum != 0xFFFF { // Invalid checksum. return false } } default: panic(fmt.Sprintf("unrecognized protocol number = %d", info.NetProto)) } packet.data = stack.NewPacketBuffer(stack.PacketBufferOptions{Payload: combinedBuf.Clone()}) packet.receivedAt = e.stack.Clock().Now() e.rcvList.PushBack(packet) e.rcvBufSize += packet.data.Data().Size() e.stats.PacketsReceived.Increment() // Notify waiters that there is data to be read now. return wasEmpty }() if notifyReadableEvents { e.waiterQueue.Notify(waiter.ReadableEvents) } } // State implements socket.Socket.State. func (e *endpoint) State() uint32 { return uint32(e.net.State()) } // Info returns a copy of the endpoint info. func (e *endpoint) Info() tcpip.EndpointInfo { ret := e.net.Info() return &ret } // Stats returns a pointer to the endpoint stats. func (e *endpoint) Stats() tcpip.EndpointStats { return &e.stats } // Wait implements stack.TransportEndpoint.Wait. func (*endpoint) Wait() {} // LastError implements tcpip.Endpoint.LastError. func (*endpoint) LastError() tcpip.Error { return nil } // SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } func (e *endpoint) setReceiveDisabled(v bool) { e.rcvMu.Lock() defer e.rcvMu.Unlock() e.rcvDisabled = v } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/raw/endpoint_state.go000066400000000000000000000035701465435605700267440ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package raw import ( "context" "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // saveReceivedAt is invoked by stateify. func (p *rawPacket) saveReceivedAt() int64 { return p.receivedAt.UnixNano() } // loadReceivedAt is invoked by stateify. 
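// saveReceivedAt/loadReceivedAt exist because time.Time is not saved
// directly; it is flattened to nanoseconds and rebuilt on restore. The round
// trip, as a minimal sketch:
//
//	nsec := receivedAt.UnixNano()  // save: flatten to a portable integer
//	restored := time.Unix(0, nsec) // load: rebuild the time.Time
//	// restored.Equal(receivedAt) is true (any monotonic clock reading is dropped)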
func (p *rawPacket) loadReceivedAt(_ context.Context, nsec int64) { p.receivedAt = time.Unix(0, nsec) } // afterLoad is invoked by stateify. func (e *endpoint) afterLoad(ctx context.Context) { stack.RestoreStackFromContext(ctx).RegisterRestoredEndpoint(e) } // beforeSave is invoked by stateify. func (e *endpoint) beforeSave() { e.setReceiveDisabled(true) e.stack.RegisterResumableEndpoint(e) } // Restore implements tcpip.RestoredEndpoint.Restore. func (e *endpoint) Restore(s *stack.Stack) { e.net.Resume(s) e.setReceiveDisabled(false) e.stack = s e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) if e.associated { netProto := e.net.NetProto() if err := e.stack.RegisterRawTransportEndpoint(netProto, e.transProto, e); err != nil { panic(fmt.Sprintf("e.stack.RegisterRawTransportEndpoint(%d, %d, _): %s", netProto, e.transProto, err)) } } } // Resume implements tcpip.ResumableEndpoint.Resume. func (e *endpoint) Resume() { e.setReceiveDisabled(false) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/raw/protocol.go000066400000000000000000000045521465435605700255660ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package raw import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/internal/noop" "gvisor.dev/gvisor/pkg/tcpip/transport/packet" "gvisor.dev/gvisor/pkg/waiter" ) // EndpointFactory implements stack.RawFactory. // // +stateify savable type EndpointFactory struct{} // NewUnassociatedEndpoint implements stack.RawFactory.NewUnassociatedEndpoint. func (EndpointFactory) NewUnassociatedEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return newEndpoint(stack, netProto, transProto, waiterQueue, false /* associated */) } // NewPacketEndpoint implements stack.RawFactory.NewPacketEndpoint. func (EndpointFactory) NewPacketEndpoint(stack *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return packet.NewEndpoint(stack, cooked, netProto, waiterQueue), nil } // CreateOnlyFactory implements stack.RawFactory. It allows creation of raw // endpoints that do not support reading, writing, binding, etc. // // +stateify savable type CreateOnlyFactory struct{} // NewUnassociatedEndpoint implements stack.RawFactory.NewUnassociatedEndpoint. func (CreateOnlyFactory) NewUnassociatedEndpoint(stk *stack.Stack, _ tcpip.NetworkProtocolNumber, _ tcpip.TransportProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return noop.New(stk), nil } // NewPacketEndpoint implements stack.RawFactory.NewPacketEndpoint. func (CreateOnlyFactory) NewPacketEndpoint(*stack.Stack, bool, tcpip.NetworkProtocolNumber, *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { // This isn't needed by anything, so it isn't implemented. 
return nil, &tcpip.ErrNotPermitted{} } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/raw/raw_packet_list.go000066400000000000000000000122541465435605700270760ustar00rootroot00000000000000package raw // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type rawPacketElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (rawPacketElementMapper) linkerFor(elem *rawPacket) *rawPacket { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type rawPacketList struct { head *rawPacket tail *rawPacket } // Reset resets list l to the empty state. func (l *rawPacketList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *rawPacketList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *rawPacketList) Front() *rawPacket { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *rawPacketList) Back() *rawPacket { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *rawPacketList) Len() (count int) { for e := l.Front(); e != nil; e = (rawPacketElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *rawPacketList) PushFront(e *rawPacket) { linker := rawPacketElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { rawPacketElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *rawPacketList) PushFrontList(m *rawPacketList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { rawPacketElementMapper{}.linkerFor(l.head).SetPrev(m.tail) rawPacketElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *rawPacketList) PushBack(e *rawPacket) { linker := rawPacketElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { rawPacketElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *rawPacketList) PushBackList(m *rawPacketList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { rawPacketElementMapper{}.linkerFor(l.tail).SetNext(m.head) rawPacketElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. 
// //go:nosplit func (l *rawPacketList) InsertAfter(b, e *rawPacket) { bLinker := rawPacketElementMapper{}.linkerFor(b) eLinker := rawPacketElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { rawPacketElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *rawPacketList) InsertBefore(a, e *rawPacket) { aLinker := rawPacketElementMapper{}.linkerFor(a) eLinker := rawPacketElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { rawPacketElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *rawPacketList) Remove(e *rawPacket) { linker := rawPacketElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { rawPacketElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { rawPacketElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type rawPacketEntry struct { next *rawPacket prev *rawPacket } // Next returns the entry that follows e in the list. // //go:nosplit func (e *rawPacketEntry) Next() *rawPacket { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *rawPacketEntry) Prev() *rawPacket { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *rawPacketEntry) SetNext(elem *rawPacket) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *rawPacketEntry) SetPrev(elem *rawPacket) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/raw/raw_state_autogen.go000066400000000000000000000130351465435605700274340ustar00rootroot00000000000000// automatically generated by stateify. 
package raw import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *rawPacket) StateTypeName() string { return "pkg/tcpip/transport/raw.rawPacket" } func (p *rawPacket) StateFields() []string { return []string{ "rawPacketEntry", "data", "receivedAt", "senderAddr", "packetInfo", "tosOrTClass", "ttlOrHopLimit", } } func (p *rawPacket) beforeSave() {} // +checklocksignore func (p *rawPacket) StateSave(stateSinkObject state.Sink) { p.beforeSave() var receivedAtValue int64 receivedAtValue = p.saveReceivedAt() stateSinkObject.SaveValue(2, receivedAtValue) stateSinkObject.Save(0, &p.rawPacketEntry) stateSinkObject.Save(1, &p.data) stateSinkObject.Save(3, &p.senderAddr) stateSinkObject.Save(4, &p.packetInfo) stateSinkObject.Save(5, &p.tosOrTClass) stateSinkObject.Save(6, &p.ttlOrHopLimit) } func (p *rawPacket) afterLoad(context.Context) {} // +checklocksignore func (p *rawPacket) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.rawPacketEntry) stateSourceObject.Load(1, &p.data) stateSourceObject.Load(3, &p.senderAddr) stateSourceObject.Load(4, &p.packetInfo) stateSourceObject.Load(5, &p.tosOrTClass) stateSourceObject.Load(6, &p.ttlOrHopLimit) stateSourceObject.LoadValue(2, new(int64), func(y any) { p.loadReceivedAt(ctx, y.(int64)) }) } func (e *endpoint) StateTypeName() string { return "pkg/tcpip/transport/raw.endpoint" } func (e *endpoint) StateFields() []string { return []string{ "DefaultSocketOptionsHandler", "transProto", "waiterQueue", "associated", "net", "stats", "ops", "rcvList", "rcvBufSize", "rcvClosed", "rcvDisabled", "ipv6ChecksumOffset", "icmpv6Filter", } } // +checklocksignore func (e *endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.DefaultSocketOptionsHandler) stateSinkObject.Save(1, &e.transProto) stateSinkObject.Save(2, &e.waiterQueue) stateSinkObject.Save(3, &e.associated) stateSinkObject.Save(4, &e.net) stateSinkObject.Save(5, &e.stats) stateSinkObject.Save(6, &e.ops) stateSinkObject.Save(7, &e.rcvList) stateSinkObject.Save(8, &e.rcvBufSize) stateSinkObject.Save(9, &e.rcvClosed) stateSinkObject.Save(10, &e.rcvDisabled) stateSinkObject.Save(11, &e.ipv6ChecksumOffset) stateSinkObject.Save(12, &e.icmpv6Filter) } // +checklocksignore func (e *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.DefaultSocketOptionsHandler) stateSourceObject.Load(1, &e.transProto) stateSourceObject.Load(2, &e.waiterQueue) stateSourceObject.Load(3, &e.associated) stateSourceObject.Load(4, &e.net) stateSourceObject.Load(5, &e.stats) stateSourceObject.Load(6, &e.ops) stateSourceObject.Load(7, &e.rcvList) stateSourceObject.Load(8, &e.rcvBufSize) stateSourceObject.Load(9, &e.rcvClosed) stateSourceObject.Load(10, &e.rcvDisabled) stateSourceObject.Load(11, &e.ipv6ChecksumOffset) stateSourceObject.Load(12, &e.icmpv6Filter) stateSourceObject.AfterLoad(func() { e.afterLoad(ctx) }) } func (e *EndpointFactory) StateTypeName() string { return "pkg/tcpip/transport/raw.EndpointFactory" } func (e *EndpointFactory) StateFields() []string { return []string{} } func (e *EndpointFactory) beforeSave() {} // +checklocksignore func (e *EndpointFactory) StateSave(stateSinkObject state.Sink) { e.beforeSave() } func (e *EndpointFactory) afterLoad(context.Context) {} // +checklocksignore func (e *EndpointFactory) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (c *CreateOnlyFactory) StateTypeName() string { return "pkg/tcpip/transport/raw.CreateOnlyFactory" } 
func (c *CreateOnlyFactory) StateFields() []string { return []string{} } func (c *CreateOnlyFactory) beforeSave() {} // +checklocksignore func (c *CreateOnlyFactory) StateSave(stateSinkObject state.Sink) { c.beforeSave() } func (c *CreateOnlyFactory) afterLoad(context.Context) {} // +checklocksignore func (c *CreateOnlyFactory) StateLoad(ctx context.Context, stateSourceObject state.Source) { } func (l *rawPacketList) StateTypeName() string { return "pkg/tcpip/transport/raw.rawPacketList" } func (l *rawPacketList) StateFields() []string { return []string{ "head", "tail", } } func (l *rawPacketList) beforeSave() {} // +checklocksignore func (l *rawPacketList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *rawPacketList) afterLoad(context.Context) {} // +checklocksignore func (l *rawPacketList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *rawPacketEntry) StateTypeName() string { return "pkg/tcpip/transport/raw.rawPacketEntry" } func (e *rawPacketEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *rawPacketEntry) beforeSave() {} // +checklocksignore func (e *rawPacketEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *rawPacketEntry) afterLoad(context.Context) {} // +checklocksignore func (e *rawPacketEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func init() { state.Register((*rawPacket)(nil)) state.Register((*endpoint)(nil)) state.Register((*EndpointFactory)(nil)) state.Register((*CreateOnlyFactory)(nil)) state.Register((*rawPacketList)(nil)) state.Register((*rawPacketEntry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/000077500000000000000000000000001465435605700233655ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/accept.go000066400000000000000000000555231465435605700251650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "container/list" "crypto/sha1" "encoding/binary" "fmt" "hash" "io" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) const ( // tsLen is the length, in bits, of the timestamp in the SYN cookie. tsLen = 8 // tsMask is a mask for timestamp values (i.e., tsLen bits). tsMask = (1 << tsLen) - 1 // tsOffset is the offset, in bits, of the timestamp in the SYN cookie. tsOffset = 24 // hashMask is the mask for hash values (i.e., tsOffset bits). 
hashMask = (1 << tsOffset) - 1 // maxTSDiff is the maximum allowed difference between a received cookie // timestamp and the current timestamp. If the difference is greater // than maxTSDiff, the cookie is expired. maxTSDiff = 2 ) var ( // mssTable is a slice containing the possible MSS values that we // encode in the SYN cookie with two bits. mssTable = []uint16{536, 1300, 1440, 1460} ) func encodeMSS(mss uint16) uint32 { for i := len(mssTable) - 1; i > 0; i-- { if mss >= mssTable[i] { return uint32(i) } } return 0 } // listenContext is used by a listening endpoint to store state used while // listening for connections. This struct is allocated by the listen goroutine // and must not be accessed or have its methods called concurrently as they // may mutate the stored objects. type listenContext struct { stack *stack.Stack protocol *protocol // rcvWnd is the receive window that is sent by this listening context // in the initial SYN-ACK. rcvWnd seqnum.Size // nonce are random bytes that are initialized once when the context // is created and used to seed the hash function when generating // the SYN cookie. nonce [2][sha1.BlockSize]byte // listenEP is a reference to the listening endpoint associated with // this context. Can be nil if the context is created by the forwarder. listenEP *Endpoint // hasherMu protects hasher. hasherMu sync.Mutex // hasher is the hash function used to generate a SYN cookie. hasher hash.Hash // v6Only is true if listenEP is a dual stack socket and has the // IPV6_V6ONLY option set. v6Only bool // netProto indicates the network protocol(IPv4/v6) for the listening // endpoint. netProto tcpip.NetworkProtocolNumber } // timeStamp returns an 8-bit timestamp with a granularity of 64 seconds. func timeStamp(clock tcpip.Clock) uint32 { return uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Seconds()) >> 6 & tsMask } // newListenContext creates a new listen context. func newListenContext(stk *stack.Stack, protocol *protocol, listenEP *Endpoint, rcvWnd seqnum.Size, v6Only bool, netProto tcpip.NetworkProtocolNumber) *listenContext { l := &listenContext{ stack: stk, protocol: protocol, rcvWnd: rcvWnd, hasher: sha1.New(), v6Only: v6Only, netProto: netProto, listenEP: listenEP, } for i := range l.nonce { if _, err := io.ReadFull(stk.SecureRNG().Reader, l.nonce[i][:]); err != nil { panic(err) } } return l } // cookieHash calculates the cookieHash for the given id, timestamp and nonce // index. The hash is used to create and validate cookies. func (l *listenContext) cookieHash(id stack.TransportEndpointID, ts uint32, nonceIndex int) uint32 { // Initialize block with fixed-size data: local ports and v. var payload [8]byte binary.BigEndian.PutUint16(payload[0:], id.LocalPort) binary.BigEndian.PutUint16(payload[2:], id.RemotePort) binary.BigEndian.PutUint32(payload[4:], ts) // Feed everything to the hasher. l.hasherMu.Lock() l.hasher.Reset() // Per hash.Hash.Writer: // // It never returns an error. l.hasher.Write(payload[:]) l.hasher.Write(l.nonce[nonceIndex][:]) l.hasher.Write(id.LocalAddress.AsSlice()) l.hasher.Write(id.RemoteAddress.AsSlice()) // Finalize the calculation of the hash and return the first 4 bytes. h := l.hasher.Sum(nil) l.hasherMu.Unlock() return binary.BigEndian.Uint32(h[:]) } // createCookie creates a SYN cookie for the given id and incoming sequence // number. 
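// Cookie arithmetic used by createCookie/isCookieValid below: createCookie
// folds a keyed hash of the connection ID and the client's sequence number
// into the cookie, places an 8-bit coarse timestamp (64-second granularity,
// from timeStamp above) in the top tsLen bits at tsOffset, and hides the
// extra data in the low 24 bits (hashMask); isCookieValid reverses the
// arithmetic to recover and bound-check both. The only data currently encoded
// is the 2-bit mssTable index, so the peer's MSS can be recovered from the
// returning ACK without keeping per-connection state. Worked examples for
// encodeMSS above, given mssTable = {536, 1300, 1440, 1460}:
//
//	encodeMSS(1500) == 3 // 1460 is the largest table entry <= 1500
//	encodeMSS(1400) == 1 // 1300 is the largest table entry <= 1400
//	encodeMSS(500)  == 0 // smaller than every entry; falls back to mssTable[0] == 536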
func (l *listenContext) createCookie(id stack.TransportEndpointID, seq seqnum.Value, data uint32) seqnum.Value { ts := timeStamp(l.stack.Clock()) v := l.cookieHash(id, 0, 0) + uint32(seq) + (ts << tsOffset) v += (l.cookieHash(id, ts, 1) + data) & hashMask return seqnum.Value(v) } // isCookieValid checks if the supplied cookie is valid for the given id and // sequence number. If it is, it also returns the data originally encoded in the // cookie when createCookie was called. func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnum.Value, seq seqnum.Value) (uint32, bool) { ts := timeStamp(l.stack.Clock()) v := uint32(cookie) - l.cookieHash(id, 0, 0) - uint32(seq) cookieTS := v >> tsOffset if ((ts - cookieTS) & tsMask) > maxTSDiff { return 0, false } return (v - l.cookieHash(id, cookieTS, 1)) & hashMask, true } // createConnectingEndpoint creates a new endpoint in a connecting state, with // the connection parameters given by the arguments. The newly created endpoint // will be locked. // +checklocksacquire:n.mu func (l *listenContext) createConnectingEndpoint(s *segment, rcvdSynOpts header.TCPSynOptions, queue *waiter.Queue) (n *Endpoint, _ tcpip.Error) { // Create a new endpoint. netProto := l.netProto if netProto == 0 { netProto = s.pkt.NetworkProtocolNumber } route, err := l.stack.FindRoute(s.pkt.NICID, s.pkt.Network().DestinationAddress(), s.pkt.Network().SourceAddress(), s.pkt.NetworkProtocolNumber, false /* multicastLoop */) if err != nil { return nil, err // +checklocksignore } n = newEndpoint(l.stack, l.protocol, netProto, queue) n.mu.Lock() n.ops.SetV6Only(l.v6Only) n.TransportEndpointInfo.ID = s.id n.boundNICID = s.pkt.NICID n.route = route n.effectiveNetProtos = []tcpip.NetworkProtocolNumber{s.pkt.NetworkProtocolNumber} n.ops.SetReceiveBufferSize(int64(l.rcvWnd), false /* notify */) n.amss = calculateAdvertisedMSS(n.userMSS, n.route) n.setEndpointState(StateConnecting) n.maybeEnableTimestamp(rcvdSynOpts) n.maybeEnableSACKPermitted(rcvdSynOpts) n.initGSO() // Bootstrap the auto tuning algorithm. Starting at zero will result in // a large step function on the first window adjustment causing the // window to grow to a really large value. initWnd := n.initialReceiveWindow() n.rcvQueueMu.Lock() n.RcvAutoParams.PrevCopiedBytes = initWnd n.rcvQueueMu.Unlock() return n, nil } // startHandshake creates a new endpoint in connecting state and then sends // the SYN-ACK for the TCP 3-way handshake. It returns the state of the // handshake in progress, which includes the new endpoint in the SYN-RCVD // state. // // On success, a handshake h is returned. // // NOTE: h.ep.mu is not held and must be acquired if any state needs to be // modified. // // Precondition: if l.listenEP != nil, l.listenEP.mu must be locked. func (l *listenContext) startHandshake(s *segment, opts header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (h *handshake, _ tcpip.Error) { // Create new endpoint. irs := s.sequenceNumber isn := generateSecureISN(s.id, l.stack.Clock(), l.protocol.seqnumSecret) ep, err := l.createConnectingEndpoint(s, opts, queue) if err != nil { return nil, err // +checklocksignore } ep.owner = owner // listenEP is nil when listenContext is used by tcp.Forwarder. deferAccept := time.Duration(0) if l.listenEP != nil { if l.listenEP.EndpointState() != StateListen { // Ensure we release any registrations done by the newly // created endpoint. 
ep.mu.Unlock() ep.Close() return nil, &tcpip.ErrConnectionAborted{} // +checklocksignore } // Propagate any inheritable options from the listening endpoint // to the newly created endpoint. l.listenEP.propagateInheritableOptionsLocked(ep) // +checklocksforce if !ep.reserveTupleLocked() { ep.mu.Unlock() ep.Close() return nil, &tcpip.ErrConnectionAborted{} // +checklocksignore } deferAccept = l.listenEP.deferAccept } // Register new endpoint so that packets are routed to it. if err := ep.stack.RegisterTransportEndpoint( ep.effectiveNetProtos, ProtocolNumber, ep.TransportEndpointInfo.ID, ep, ep.boundPortFlags, ep.boundBindToDevice, ); err != nil { ep.mu.Unlock() ep.Close() ep.drainClosingSegmentQueue() return nil, err // +checklocksignore } ep.isRegistered = true // Initialize and start the handshake. h = ep.newPassiveHandshake(isn, irs, opts, deferAccept) h.listenEP = l.listenEP h.start() h.ep.mu.Unlock() return h, nil } // performHandshake performs a TCP 3-way handshake. On success, the new // established endpoint is returned. // // Precondition: if l.listenEP != nil, l.listenEP.mu must be locked. func (l *listenContext) performHandshake(s *segment, opts header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*Endpoint, tcpip.Error) { waitEntry, notifyCh := waiter.NewChannelEntry(waiter.WritableEvents) queue.EventRegister(&waitEntry) defer queue.EventUnregister(&waitEntry) h, err := l.startHandshake(s, opts, queue, owner) if err != nil { return nil, err } // performHandshake is used by the Forwarder which will block till the // handshake either succeeds or fails. We do this by registering for // events above and block on the notification channel. <-notifyCh ep := h.ep ep.mu.Lock() if !ep.EndpointState().connected() { ep.stack.Stats().TCP.FailedConnectionAttempts.Increment() ep.stats.FailedConnectionAttempts.Increment() ep.h = nil ep.mu.Unlock() ep.Close() ep.notifyAborted() ep.drainClosingSegmentQueue() err := ep.LastError() if err == nil { // If err was nil then return the best error we can to indicate // a connection failure. err = &tcpip.ErrConnectionAborted{} } return nil, err } ep.isConnectNotified = true // Transfer any state from the completed handshake to the endpoint. // // Update the receive window scaling. We can't do it before the // handshake because it's possible that the peer doesn't support window // scaling. ep.rcv.RcvWndScale = ep.h.effectiveRcvWndScale() // Clean up handshake state stored in the endpoint so that it can be // GCed. ep.h = nil ep.mu.Unlock() return ep, nil } // propagateInheritableOptionsLocked propagates any options set on the listening // endpoint to the newly created endpoint. // // +checklocks:e.mu // +checklocks:n.mu func (e *Endpoint) propagateInheritableOptionsLocked(n *Endpoint) { n.userTimeout = e.userTimeout n.portFlags = e.portFlags n.boundBindToDevice = e.boundBindToDevice n.boundPortFlags = e.boundPortFlags n.userMSS = e.userMSS } // reserveTupleLocked reserves an accepted endpoint's tuple. // // Precondition: e.propagateInheritableOptionsLocked has been called. 
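// performHandshake above blocks by registering a waiter entry and reading
// from its notification channel until the handshake either completes or
// fails. A stripped-down sketch of that register/notify/wait pattern
// (illustrative only; the real implementation is pkg/waiter):
//
//	type waitEntry struct{ ch chan struct{} }
//
//	type waitQueue struct {
//		mu      sync.Mutex
//		entries map[*waitEntry]struct{}
//	}
//
//	func (q *waitQueue) register(e *waitEntry)   { q.mu.Lock(); q.entries[e] = struct{}{}; q.mu.Unlock() }
//	func (q *waitQueue) unregister(e *waitEntry) { q.mu.Lock(); delete(q.entries, e); q.mu.Unlock() }
//
//	func (q *waitQueue) notify() {
//		q.mu.Lock()
//		defer q.mu.Unlock()
//		for e := range q.entries {
//			select {
//			case e.ch <- struct{}{}: // wake the waiter
//			default: // already signalled; never block the notifier
//			}
//		}
//	}
//
//	// Waiter side:
//	e := &waitEntry{ch: make(chan struct{}, 1)}
//	q.register(e)
//	<-e.ch // block until notify()
//	q.unregister(e)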
// // +checklocks:e.mu func (e *Endpoint) reserveTupleLocked() bool { dest := tcpip.FullAddress{ Addr: e.TransportEndpointInfo.ID.RemoteAddress, Port: e.TransportEndpointInfo.ID.RemotePort, } portRes := ports.Reservation{ Networks: e.effectiveNetProtos, Transport: ProtocolNumber, Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: e.TransportEndpointInfo.ID.LocalPort, Flags: e.boundPortFlags, BindToDevice: e.boundBindToDevice, Dest: dest, } if !e.stack.ReserveTuple(portRes) { e.stack.Stats().TCP.FailedPortReservations.Increment() return false } e.isPortReserved = true e.boundDest = dest return true } // notifyAborted wakes up any waiters on registered, but not accepted // endpoints. // // This is strictly not required normally as a socket that was never accepted // can't really have any registered waiters except when stack.Wait() is called // which waits for all registered endpoints to stop and expects an EventHUp. func (e *Endpoint) notifyAborted() { e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } func (e *Endpoint) acceptQueueIsFull() bool { e.acceptMu.Lock() full := e.acceptQueue.isFull() e.acceptMu.Unlock() return full } // +stateify savable type acceptQueue struct { // NB: this could be an endpointList, but ilist only permits endpoints to // belong to one list at a time, and endpoints are already stored in the // dispatcher's list. endpoints list.List `state:".([]*Endpoint)"` // pendingEndpoints is a set of all endpoints for which a handshake is // in progress. pendingEndpoints map[*Endpoint]struct{} // capacity is the maximum number of endpoints that can be in endpoints. capacity int } func (a *acceptQueue) isFull() bool { return a.endpoints.Len() >= a.capacity } // handleListenSegment is called when a listening endpoint receives a segment // and needs to handle it. // // +checklocks:e.mu func (e *Endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Error { e.rcvQueueMu.Lock() rcvClosed := e.RcvClosed e.rcvQueueMu.Unlock() if rcvClosed || s.flags.Contains(header.TCPFlagSyn|header.TCPFlagAck) { // If the endpoint is shutdown, reply with reset. // // RFC 793 section 3.4 page 35 (figure 12) outlines that a RST // must be sent in response to a SYN-ACK while in the listen // state to prevent completing a handshake from an old SYN. return replyWithReset(e.stack, s, e.sendTOS, e.ipv4TTL, e.ipv6HopLimit) } switch { case s.flags.Contains(header.TCPFlagRst): e.stack.Stats().DroppedPackets.Increment() return nil case s.flags.Contains(header.TCPFlagSyn): if e.acceptQueueIsFull() { e.stack.Stats().TCP.ListenOverflowSynDrop.Increment() e.stats.ReceiveErrors.ListenOverflowSynDrop.Increment() e.stack.Stats().DroppedPackets.Increment() return nil } opts := parseSynSegmentOptions(s) useSynCookies, err := func() (bool, tcpip.Error) { var alwaysUseSynCookies tcpip.TCPAlwaysUseSynCookies if err := e.stack.TransportProtocolOption(header.TCPProtocolNumber, &alwaysUseSynCookies); err != nil { panic(fmt.Sprintf("TransportProtocolOption(%d, %T) = %s", header.TCPProtocolNumber, alwaysUseSynCookies, err)) } if alwaysUseSynCookies { return true, nil } e.acceptMu.Lock() defer e.acceptMu.Unlock() // The capacity of the accepted queue would always be one greater than the // listen backlog. But, the SYNRCVD connections count is always checked // against the listen backlog value for Linux parity reason. 
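// Concretely, for the check just below: listen(fd, 128) gives
// acceptQueue.capacity == 129, and new SYNs fall back to SYN cookies once
// len(acceptQueue.pendingEndpoints) reaches 128 (capacity - 1), i.e. once a
// backlog's worth of handshakes is already in flight (the Linux reference
// below covers the same check).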
// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/inet_connection_sock.h#L280 if len(e.acceptQueue.pendingEndpoints) == e.acceptQueue.capacity-1 { return true, nil } h, err := ctx.startHandshake(s, opts, &waiter.Queue{}, e.owner) if err != nil { e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() return false, err } e.acceptQueue.pendingEndpoints[h.ep] = struct{}{} return false, nil }() if err != nil { return err } if !useSynCookies { return nil } net := s.pkt.Network() route, err := e.stack.FindRoute(s.pkt.NICID, net.DestinationAddress(), net.SourceAddress(), s.pkt.NetworkProtocolNumber, false /* multicastLoop */) if err != nil { return err } defer route.Release() // Send SYN without window scaling because we currently // don't encode this information in the cookie. // // Enable Timestamp option if the original syn did have // the timestamp option specified. // // Use the user supplied MSS on the listening socket for // new connections, if available. synOpts := header.TCPSynOptions{ WS: -1, TS: opts.TS, TSEcr: opts.TSVal, MSS: calculateAdvertisedMSS(e.userMSS, route), } if opts.TS { offset := e.protocol.tsOffset(net.DestinationAddress(), net.SourceAddress()) now := e.stack.Clock().NowMonotonic() synOpts.TSVal = offset.TSVal(now) } cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS)) fields := tcpFields{ id: s.id, ttl: calculateTTL(route, e.ipv4TTL, e.ipv6HopLimit), tos: e.sendTOS, flags: header.TCPFlagSyn | header.TCPFlagAck, seq: cookie, ack: s.sequenceNumber + 1, rcvWnd: ctx.rcvWnd, } if err := e.sendSynTCP(route, fields, synOpts); err != nil { return err } e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment() return nil case s.flags.Contains(header.TCPFlagAck): iss := s.ackNumber - 1 irs := s.sequenceNumber - 1 // As an edge case when SYN-COOKIES are in use and we receive a // segment that has data and is valid we should check if it // already matches a created endpoint and redirect the segment // rather than try and create a new endpoint. This can happen // where the final ACK for the handshake and other data packets // arrive at the same time and are queued to the listening // endpoint before the listening endpoint has had time to // process the first ACK and create the endpoint that matches // the incoming packet's full 5 tuple. netProtos := []tcpip.NetworkProtocolNumber{s.pkt.NetworkProtocolNumber} // If the local address is an IPv4 Address then also look for IPv6 // dual stack endpoints. if s.id.LocalAddress.To4() != (tcpip.Address{}) { netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber} } for _, netProto := range netProtos { if newEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, s.id, s.pkt.NICID); newEP != nil && newEP != e { tcpEP := newEP.(*Endpoint) if !tcpEP.EndpointState().connected() { continue } if !tcpEP.enqueueSegment(s) { // Just silently drop the segment as we failed // to queue, we don't want to generate a RST // further below or try and create a new // endpoint etc. return nil } tcpEP.notifyProcessor() return nil } } // Since SYN cookies are in use this is potentially an ACK to a // SYN-ACK we sent but don't have a half open connection state // as cookies are being used to protect against a potential SYN // flood. In such cases validate the cookie and if valid create // a fully connected endpoint and deliver to the accept queue. 
// // If not, silently drop the ACK to avoid leaking information // when under a potential syn flood attack. // // Validate the cookie. data, ok := ctx.isCookieValid(s.id, iss, irs) if !ok || int(data) >= len(mssTable) { e.stack.Stats().TCP.ListenOverflowInvalidSynCookieRcvd.Increment() e.stack.Stats().DroppedPackets.Increment() // When not using SYN cookies, as per RFC 793, section 3.9, page 64: // Any acknowledgment is bad if it arrives on a connection still in // the LISTEN state. An acceptable reset segment should be formed // for any arriving ACK-bearing segment. The RST should be // formatted as follows: // // // // Send a reset as this is an ACK for which there is no // half open connections and we are not using cookies // yet. // // The only time we should reach here when a connection // was opened and closed really quickly and a delayed // ACK was received from the sender. return replyWithReset(e.stack, s, e.sendTOS, e.ipv4TTL, e.ipv6HopLimit) } // Keep hold of acceptMu until the new endpoint is in the accept queue (or // if there is an error), to guarantee that we will keep our spot in the // queue even if another handshake from the syn queue completes. e.acceptMu.Lock() if e.acceptQueue.isFull() { // Silently drop the ack as the application can't accept // the connection at this point. The ack will be // retransmitted by the sender anyway and we can // complete the connection at the time of retransmit if // the backlog has space. e.acceptMu.Unlock() e.stack.Stats().TCP.ListenOverflowAckDrop.Increment() e.stats.ReceiveErrors.ListenOverflowAckDrop.Increment() e.stack.Stats().DroppedPackets.Increment() return nil } e.stack.Stats().TCP.ListenOverflowSynCookieRcvd.Increment() // Create newly accepted endpoint and deliver it. rcvdSynOptions := header.TCPSynOptions{ MSS: mssTable[data], // Disable Window scaling as original SYN is // lost. WS: -1, } // When syn cookies are in use we enable timestamp only // if the ack specifies the timestamp option assuming // that the other end did in fact negotiate the // timestamp option in the original SYN. if s.parsedOptions.TS { rcvdSynOptions.TS = true rcvdSynOptions.TSVal = s.parsedOptions.TSVal rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr } n, err := ctx.createConnectingEndpoint(s, rcvdSynOptions, &waiter.Queue{}) if err != nil { e.acceptMu.Unlock() return err } // Propagate any inheritable options from the listening endpoint // to the newly created endpoint. e.propagateInheritableOptionsLocked(n) if !n.reserveTupleLocked() { n.mu.Unlock() e.acceptMu.Unlock() n.Close() e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() return nil } // Register new endpoint so that packets are routed to it. if err := n.stack.RegisterTransportEndpoint( n.effectiveNetProtos, ProtocolNumber, n.TransportEndpointInfo.ID, n, n.boundPortFlags, n.boundBindToDevice, ); err != nil { n.mu.Unlock() e.acceptMu.Unlock() n.Close() e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() return err } n.isRegistered = true net := s.pkt.Network() n.TSOffset = n.protocol.tsOffset(net.DestinationAddress(), net.SourceAddress()) // Switch state to connected. 
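// The handshake seeded just below reconstructs both initial sequence numbers
// from the final ACK of the 3-way handshake: our ISN is the cookie we sent,
// and the peer's ISN is one less than the ACK's sequence number. A worked
// example with made-up numbers:
//
//	peer ISN = 1000, cookie (our ISN) = 5000
//	SYN-ACK we sent:    seq = 5000, ack = 1001
//	final ACK received: seq = 1001, ack = 5001
//	irs = 1001 - 1 = 1000 (peer ISN), iss = 5001 - 1 = 5000 (our cookie)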
n.isConnectNotified = true h := handshake{ ep: n, iss: iss, ackNum: irs + 1, rcvWnd: seqnum.Size(n.initialReceiveWindow()), sndWnd: s.window, rcvWndScale: e.rcvWndScaleForHandshake(), sndWndScale: rcvdSynOptions.WS, mss: rcvdSynOptions.MSS, sampleRTTWithTSOnly: true, } h.ep.AssertLockHeld(n) h.transitionToStateEstablishedLocked(s) n.mu.Unlock() // Requeue the segment if the ACK completing the handshake has more info // to be processed by the newly established endpoint. if (s.flags.Contains(header.TCPFlagFin) || s.payloadSize() > 0) && n.enqueueSegment(s) { n.notifyProcessor() } e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() // Deliver the endpoint to the accept queue. e.acceptQueue.endpoints.PushBack(n) e.acceptMu.Unlock() e.waiterQueue.Notify(waiter.ReadableEvents) return nil default: e.stack.Stats().DroppedPackets.Increment() return nil } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/connect.go000066400000000000000000001402061465435605700253500ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "crypto/sha256" "encoding/binary" "fmt" "math" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checksum" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) // InitialRTO is the initial retransmission timeout. // https://github.com/torvalds/linux/blob/7c636d4d20f/include/net/tcp.h#L142 const InitialRTO = time.Second // maxSegmentsPerWake is the maximum number of segments to process in the main // protocol goroutine per wake-up. Yielding [after this number of segments are // processed] allows other events to be processed as well (e.g., timeouts, // resets, etc.). const maxSegmentsPerWake = 100 type handshakeState int // The following are the possible states of the TCP connection during a 3-way // handshake. A depiction of the states and transitions can be found in RFC 793, // page 23. const ( handshakeSynSent handshakeState = iota handshakeSynRcvd handshakeCompleted ) const ( // Maximum space available for options. maxOptionSize = 40 ) // handshake holds the state used during a TCP 3-way handshake. // // NOTE: handshake.ep.mu is held during handshake processing. It is released if // we are going to block and reacquired when we start processing an event. // // +stateify savable type handshake struct { ep *Endpoint listenEP *Endpoint state handshakeState active bool flags header.TCPFlags ackNum seqnum.Value // iss is the initial send sequence number, as defined in RFC 793. iss seqnum.Value // rcvWnd is the receive window, as defined in RFC 793. rcvWnd seqnum.Size // sndWnd is the send window, as defined in RFC 793. sndWnd seqnum.Size // mss is the maximum segment size received from the peer. mss uint16 // sndWndScale is the send window scale, as defined in RFC 1323. A // negative value means no scaling is supported by the peer. 
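// For example, when the peer's SYN carries no window scale option this field
// stays negative, effectiveRcvWndScale() then reports 0, and every window
// this endpoint subsequently advertises is capped at the unscaled 16-bit
// maximum of 65535 bytes.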
sndWndScale int // rcvWndScale is the receive window scale, as defined in RFC 1323. rcvWndScale int // startTime is the time at which the first SYN/SYN-ACK was sent. startTime tcpip.MonotonicTime // deferAccept if non-zero will drop the final ACK for a passive // handshake till an ACK segment with data is received or the timeout is // hit. deferAccept time.Duration // acked is true if the final ACK for a 3-way handshake has // been received. This is required to stop retransmitting the // original SYN-ACK when deferAccept is enabled. acked bool // sendSYNOpts is the cached values for the SYN options to be sent. sendSYNOpts header.TCPSynOptions // sampleRTTWithTSOnly is true when the segment was retransmitted or we can't // tell; then RTT can only be sampled when the incoming segment has timestamp // options enabled. sampleRTTWithTSOnly bool // retransmitTimer is used to retransmit SYN/SYN-ACK with exponential backoff // till handshake is either completed or timesout. retransmitTimer *backoffTimer `state:"nosave"` } // timerHandler takes a handler function for a timer and returns a function that // will invoke the provided handler with the endpoint mutex held. In addition // the returned function will perform any cleanup that may be required if the // timer handler returns an error. In the case of no errors it will notify the // processor if there are pending segments that need to be processed. // // NOTE: e.mu is held for the duration of the call to f(). func timerHandler(e *Endpoint, f func() tcpip.Error) func() { return func() { e.mu.Lock() if err := f(); err != nil { e.lastErrorMu.Lock() // If the handler timed out and we have a lastError recorded (maybe due // to an ICMP message received), promote it to be the hard error. if _, isTimeout := err.(*tcpip.ErrTimeout); e.lastError != nil && isTimeout { e.hardError = e.lastError } else { e.hardError = err } e.lastError = err e.lastErrorMu.Unlock() e.cleanupLocked() e.setEndpointState(StateError) e.mu.Unlock() e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) return } processor := e.protocol.dispatcher.selectProcessor(e.ID) e.mu.Unlock() // notify processor if there are pending segments to be // processed. if !e.segmentQueue.empty() { processor.queueEndpoint(e) } } } // +checklocks:e.mu // +checklocksacquire:h.ep.mu func (e *Endpoint) newHandshake() (h *handshake) { h = &handshake{ ep: e, active: true, rcvWnd: seqnum.Size(e.initialReceiveWindow()), rcvWndScale: e.rcvWndScaleForHandshake(), } h.ep.AssertLockHeld(e) h.resetState() // Store reference to handshake state in endpoint. e.h = h // By the time handshake is created, e.ID is already initialized. e.TSOffset = e.protocol.tsOffset(e.ID.LocalAddress, e.ID.RemoteAddress) timer, err := newBackoffTimer(h.ep.stack.Clock(), InitialRTO, MaxRTO, timerHandler(e, h.retransmitHandlerLocked)) if err != nil { panic(fmt.Sprintf("newBackOffTimer(_, %s, %s, _) failed: %s", InitialRTO, MaxRTO, err)) } h.retransmitTimer = timer return h } // +checklocks:e.mu // +checklocksacquire:h.ep.mu func (e *Endpoint) newPassiveHandshake(isn, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) (h *handshake) { h = e.newHandshake() h.resetToSynRcvd(isn, irs, opts, deferAccept) return h } // FindWndScale determines the window scale to use for the given maximum window // size. 
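// A couple of worked examples, using the 16-bit unscaled window limit of
// 65535 bytes:
//
//	FindWndScale(65000)   // 0: the window already fits without scaling.
//	FindWndScale(1 << 20) // 5: 65535<<4 (1048560) < 1 MiB <= 65535<<5.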
func FindWndScale(wnd seqnum.Size) int { if wnd < 0x10000 { return 0 } max := seqnum.Size(math.MaxUint16) s := 0 for wnd > max && s < header.MaxWndScale { s++ max <<= 1 } return s } // resetState resets the state of the handshake object such that it becomes // ready for a new 3-way handshake. func (h *handshake) resetState() { h.state = handshakeSynSent h.flags = header.TCPFlagSyn h.ackNum = 0 h.mss = 0 h.iss = generateSecureISN(h.ep.TransportEndpointInfo.ID, h.ep.stack.Clock(), h.ep.protocol.seqnumSecret) } // generateSecureISN generates a secure Initial Sequence number based on the // recommendation here https://tools.ietf.org/html/rfc6528#page-3. func generateSecureISN(id stack.TransportEndpointID, clock tcpip.Clock, seed [16]byte) seqnum.Value { isnHasher := sha256.New() // Per hash.Hash.Writer: // // It never returns an error. _, _ = isnHasher.Write(seed[:]) _, _ = isnHasher.Write(id.LocalAddress.AsSlice()) _, _ = isnHasher.Write(id.RemoteAddress.AsSlice()) portBuf := make([]byte, 2) binary.LittleEndian.PutUint16(portBuf, id.LocalPort) _, _ = isnHasher.Write(portBuf) binary.LittleEndian.PutUint16(portBuf, id.RemotePort) _, _ = isnHasher.Write(portBuf) // The time period here is 64ns. This is similar to what linux uses // generate a sequence number that overlaps less than one // time per MSL (2 minutes). // // A 64ns clock ticks 10^9/64 = 15625000) times in a second. // To wrap the whole 32 bit space would require // 2^32/1562500 ~ 274 seconds. // // Which sort of guarantees that we won't reuse the ISN for a new // connection for the same tuple for at least 274s. hash := binary.LittleEndian.Uint32(isnHasher.Sum(nil)[:4]) isn := hash + uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Nanoseconds()>>6) return seqnum.Value(isn) } // effectiveRcvWndScale returns the effective receive window scale to be used. // If the peer doesn't support window scaling, the effective rcv wnd scale is // zero; otherwise it's the value calculated based on the initial rcv wnd. func (h *handshake) effectiveRcvWndScale() uint8 { if h.sndWndScale < 0 { return 0 } return uint8(h.rcvWndScale) } // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD // state. // +checklocks:h.ep.mu func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts header.TCPSynOptions, deferAccept time.Duration) { h.active = false h.state = handshakeSynRcvd h.flags = header.TCPFlagSyn | header.TCPFlagAck h.iss = iss h.ackNum = irs + 1 h.mss = opts.MSS h.sndWndScale = opts.WS h.deferAccept = deferAccept h.ep.setEndpointState(StateSynRecv) } // checkAck checks if the ACK number, if present, of a segment received during // a TCP 3-way handshake is valid. func (h *handshake) checkAck(s *segment) bool { return !(s.flags.Contains(header.TCPFlagAck) && s.ackNumber != h.iss+1) } // synSentState handles a segment received when the TCP 3-way handshake is in // the SYN-SENT state. // +checklocks:h.ep.mu func (h *handshake) synSentState(s *segment) tcpip.Error { // RFC 793, page 37, states that in the SYN-SENT state, a reset is // acceptable if the ack field acknowledges the SYN. if s.flags.Contains(header.TCPFlagRst) { if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == h.iss+1 { // RFC 793, page 67, states that "If the RST bit is set [and] If the ACK // was acceptable then signal the user "error: connection reset", drop // the segment, enter CLOSED state, delete TCB, and return." // Although the RFC above calls out ECONNRESET, Linux actually returns // ECONNREFUSED here so we do as well. 
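// In practice this is the error an application sees when it dials a port
// with no listener: Connect fails with tcpip.ErrConnectionRefused, which
// the syscall layer surfaces as ECONNREFUSED from connect(2).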
return &tcpip.ErrConnectionRefused{} } return nil } if !h.checkAck(s) { // RFC 793, page 72 (https://datatracker.ietf.org/doc/html/rfc793#page-72): // If the segment acknowledgment is not acceptable, form a reset segment, // // and send it. h.ep.sendEmptyRaw(header.TCPFlagRst, s.ackNumber, 0, 0) return nil } // We are in the SYN-SENT state. We only care about segments that have // the SYN flag. if !s.flags.Contains(header.TCPFlagSyn) { return nil } // Parse the SYN options. rcvSynOpts := parseSynSegmentOptions(s) // Remember if the Timestamp option was negotiated. h.ep.maybeEnableTimestamp(rcvSynOpts) // Remember if the SACKPermitted option was negotiated. h.ep.maybeEnableSACKPermitted(rcvSynOpts) // Remember the sequence we'll ack from now on. h.ackNum = s.sequenceNumber + 1 h.flags |= header.TCPFlagAck h.mss = rcvSynOpts.MSS h.sndWndScale = rcvSynOpts.WS // If this is a SYN ACK response, we only need to acknowledge the SYN // and the handshake is completed. if s.flags.Contains(header.TCPFlagAck) { h.state = handshakeCompleted h.transitionToStateEstablishedLocked(s) h.ep.sendEmptyRaw(header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale()) return nil } // A SYN segment was received, but no ACK in it. We acknowledge the SYN // but resend our own SYN and wait for it to be acknowledged in the // SYN-RCVD state. h.state = handshakeSynRcvd ttl := calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit) amss := h.ep.amss h.ep.setEndpointState(StateSynRecv) synOpts := header.TCPSynOptions{ WS: int(h.effectiveRcvWndScale()), TS: rcvSynOpts.TS, TSVal: h.ep.tsValNow(), TSEcr: h.ep.recentTimestamp(), // We only send SACKPermitted if the other side indicated it // permits SACK. This is not explicitly defined in the RFC but // this is the behaviour implemented by Linux. SACKPermitted: rcvSynOpts.SACKPermitted, MSS: amss, } if ttl == 0 { ttl = h.ep.route.DefaultTTL() } h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.TransportEndpointInfo.ID, ttl: ttl, tos: h.ep.sendTOS, flags: h.flags, seq: h.iss, ack: h.ackNum, rcvWnd: h.rcvWnd, }, synOpts) return nil } // synRcvdState handles a segment received when the TCP 3-way handshake is in // the SYN-RCVD state. // +checklocks:h.ep.mu func (h *handshake) synRcvdState(s *segment) tcpip.Error { if s.flags.Contains(header.TCPFlagRst) { // RFC 793, page 37, states that in the SYN-RCVD state, a reset // is acceptable if the sequence number is in the window. if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { return &tcpip.ErrConnectionRefused{} } return nil } // It's possible that s is an ACK of a SYN cookie. This can happen if: // // - We receive a SYN while under load and issue a SYN/ACK with // cookie S. // - We receive a retransmitted SYN while space exists in the SYN // queue, and issue a SYN/ACK with seqnum S'. // - We receive the ACK based on S. // // If we receive a SYN cookie ACK, just use the cookie seqnum. if !h.checkAck(s) && h.listenEP != nil { iss := s.ackNumber - 1 data, ok := h.listenEP.listenCtx.isCookieValid(s.id, iss, s.sequenceNumber-1) if !ok || int(data) >= len(mssTable) { // This isn't a valid cookie. // RFC 793, page 72 (https://datatracker.ietf.org/doc/html/rfc793#page-72): // If the segment acknowledgment is not acceptable, form a reset segment, // // and send it. h.ep.sendEmptyRaw(header.TCPFlagRst, s.ackNumber, 0, 0) return nil } // This is a cookie that snuck its way in after we stopped using them. 
h.mss = mssTable[data] h.iss = iss } // RFC 793, Section 3.9, page 69, states that in the SYN-RCVD state, a // sequence number outside of the window causes an ACK with the proper seq // number and "After sending the acknowledgment, drop the unacceptable // segment and return." if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { if h.ep.allowOutOfWindowAck() { h.ep.sendEmptyRaw(header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd) } return nil } if s.flags.Contains(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 { // We received two SYN segments with different sequence // numbers, so we reset this and restart the whole // process, except that we don't reset the timer. ack := s.sequenceNumber.Add(s.logicalLen()) seq := seqnum.Value(0) if s.flags.Contains(header.TCPFlagAck) { seq = s.ackNumber } h.ep.sendEmptyRaw(header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0) if !h.active { return &tcpip.ErrInvalidEndpointState{} } h.resetState() synOpts := header.TCPSynOptions{ WS: h.rcvWndScale, TS: h.ep.SendTSOk, TSVal: h.ep.tsValNow(), TSEcr: h.ep.recentTimestamp(), SACKPermitted: h.ep.SACKPermitted, MSS: h.ep.amss, } h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.TransportEndpointInfo.ID, ttl: calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit), tos: h.ep.sendTOS, flags: h.flags, seq: h.iss, ack: h.ackNum, rcvWnd: h.rcvWnd, }, synOpts) return nil } // We have previously received (and acknowledged) the peer's SYN. If the // peer acknowledges our SYN, the handshake is completed. if s.flags.Contains(header.TCPFlagAck) { // If deferAccept is not zero and this is a bare ACK and the // timeout is not hit then drop the ACK. if h.deferAccept != 0 && s.payloadSize() == 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) < h.deferAccept { h.acked = true h.ep.stack.Stats().DroppedPackets.Increment() return nil } // If the timestamp option is negotiated and the segment does // not carry a timestamp option then the segment must be dropped // as per https://tools.ietf.org/html/rfc7323#section-3.2. if h.ep.SendTSOk && !s.parsedOptions.TS { h.ep.stack.Stats().DroppedPackets.Increment() return nil } // Drop the ACK if the accept queue is full. // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523 // We could abort the connection as well with a tunable as in // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788 if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() { listenEP.stack.Stats().DroppedPackets.Increment() return nil } // Update timestamp if required. See RFC7323, section-4.3. if h.ep.SendTSOk && s.parsedOptions.TS { h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber) } h.state = handshakeCompleted h.transitionToStateEstablishedLocked(s) // Requeue the segment if the ACK completing the handshake has more info // to be processed by the newly established endpoint. if (s.flags.Contains(header.TCPFlagFin) || s.payloadSize() > 0) && h.ep.enqueueSegment(s) { h.ep.protocol.dispatcher.selectProcessor(h.ep.ID).queueEndpoint(h.ep) } return nil } return nil } // +checklocks:h.ep.mu func (h *handshake) handleSegment(s *segment) tcpip.Error { h.sndWnd = s.window if !s.flags.Contains(header.TCPFlagSyn) && h.sndWndScale > 0 { h.sndWnd <<= uint8(h.sndWndScale) } switch h.state { case handshakeSynRcvd: return h.synRcvdState(s) case handshakeSynSent: return h.synSentState(s) } return nil } // processSegments goes through the segment queue and processes up to // maxSegmentsPerWake (if they're available). 
// +checklocks:h.ep.mu func (h *handshake) processSegments() tcpip.Error { for i := 0; i < maxSegmentsPerWake; i++ { s := h.ep.segmentQueue.dequeue() if s == nil { return nil } err := h.handleSegment(s) s.DecRef() if err != nil { return err } // We stop processing packets once the handshake is completed, // otherwise we may process packets meant to be processed by // the main protocol goroutine. if h.state == handshakeCompleted { break } } return nil } // start sends the first SYN/SYN-ACK. It does not block, even if link address // resolution is required. func (h *handshake) start() { h.startTime = h.ep.stack.Clock().NowMonotonic() h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route) var sackEnabled tcpip.TCPSACKEnabled if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil { // If stack returned an error when checking for SACKEnabled // status then just default to switching off SACK negotiation. sackEnabled = false } synOpts := header.TCPSynOptions{ WS: h.rcvWndScale, TS: true, TSVal: h.ep.tsValNow(), TSEcr: h.ep.recentTimestamp(), SACKPermitted: bool(sackEnabled), MSS: h.ep.amss, } // start() is also called in a listen context so we want to make sure we only // send the TS/SACK option when we received the TS/SACK in the initial SYN. if h.state == handshakeSynRcvd { synOpts.TS = h.ep.SendTSOk synOpts.SACKPermitted = h.ep.SACKPermitted && bool(sackEnabled) if h.sndWndScale < 0 { // Disable window scaling if the peer did not send us // the window scaling option. synOpts.WS = -1 } } h.sendSYNOpts = synOpts h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.TransportEndpointInfo.ID, ttl: calculateTTL(h.ep.route, h.ep.ipv4TTL, h.ep.ipv6HopLimit), tos: h.ep.sendTOS, flags: h.flags, seq: h.iss, ack: h.ackNum, rcvWnd: h.rcvWnd, }, synOpts) } // retransmitHandler handles retransmissions of un-acked SYNs. // +checklocks:h.ep.mu func (h *handshake) retransmitHandlerLocked() tcpip.Error { e := h.ep // If the endpoint has already transition out of a connecting state due // to say an error (e.g) peer send RST or an ICMP error. Then just // return. Any required cleanup should have been done when the RST/error // was handled. if !e.EndpointState().connecting() { return nil } if err := h.retransmitTimer.reset(); err != nil { return err } // Resend the SYN/SYN-ACK only if the following conditions hold. // - It's an active handshake (deferAccept does not apply) // - It's a passive handshake and we have not yet got the final-ACK. // - It's a passive handshake and we got an ACK but deferAccept is // enabled and we are now past the deferAccept duration. // The last is required to provide a way for the peer to complete // the connection with another ACK or data (as ACKs are never // retransmitted on their own). if h.active || !h.acked || h.deferAccept != 0 && e.stack.Clock().NowMonotonic().Sub(h.startTime) > h.deferAccept { e.sendSynTCP(e.route, tcpFields{ id: e.TransportEndpointInfo.ID, ttl: calculateTTL(e.route, e.ipv4TTL, e.ipv6HopLimit), tos: e.sendTOS, flags: h.flags, seq: h.iss, ack: h.ackNum, rcvWnd: h.rcvWnd, }, h.sendSYNOpts) // If we have ever retransmitted the SYN-ACK or // SYN segment, we should only measure RTT if // TS option is present. h.sampleRTTWithTSOnly = true } return nil } // transitionToStateEstablisedLocked transitions the endpoint of the handshake // to an established state given the last segment received from peer. It also // initializes sender/receiver. 
// +checklocks:h.ep.mu func (h *handshake) transitionToStateEstablishedLocked(s *segment) { // Stop the SYN retransmissions now that handshake is complete. if h.retransmitTimer != nil { h.retransmitTimer.stop() } // Transfer handshake state to TCP connection. We disable // receive window scaling if the peer doesn't support it // (indicated by a negative send window scale). h.ep.snd = newSender(h.ep, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale) now := h.ep.stack.Clock().NowMonotonic() var rtt time.Duration if h.ep.SendTSOk && s.parsedOptions.TSEcr != 0 { rtt = h.ep.elapsed(now, s.parsedOptions.TSEcr) } if !h.sampleRTTWithTSOnly && rtt == 0 { rtt = now.Sub(h.startTime) } if rtt > 0 { h.ep.snd.updateRTO(rtt) } h.ep.rcvQueueMu.Lock() h.ep.rcv = newReceiver(h.ep, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale()) // Bootstrap the auto tuning algorithm. Starting at zero will // result in a really large receive window after the first auto // tuning adjustment. h.ep.RcvAutoParams.PrevCopiedBytes = int(h.rcvWnd) h.ep.rcvQueueMu.Unlock() h.ep.setEndpointState(StateEstablished) // Completing the 3-way handshake is an indication that the route is valid // and the remote is reachable as the only way we can complete a handshake // is if our SYN reached the remote and their ACK reached us. h.ep.route.ConfirmReachable() // Tell waiters that the endpoint is connected and writable. h.ep.waiterQueue.Notify(waiter.WritableEvents) } type backoffTimer struct { timeout time.Duration maxTimeout time.Duration t tcpip.Timer } func newBackoffTimer(clock tcpip.Clock, timeout, maxTimeout time.Duration, f func()) (*backoffTimer, tcpip.Error) { if timeout > maxTimeout { return nil, &tcpip.ErrTimeout{} } bt := &backoffTimer{timeout: timeout, maxTimeout: maxTimeout} bt.t = clock.AfterFunc(timeout, f) return bt, nil } func (bt *backoffTimer) reset() tcpip.Error { bt.timeout *= 2 if bt.timeout > bt.maxTimeout { return &tcpip.ErrTimeout{} } bt.t.Reset(bt.timeout) return nil } func (bt *backoffTimer) stop() { bt.t.Stop() } func parseSynSegmentOptions(s *segment) header.TCPSynOptions { synOpts := header.ParseSynOptions(s.options, s.flags.Contains(header.TCPFlagAck)) if synOpts.TS { s.parsedOptions.TSVal = synOpts.TSVal s.parsedOptions.TSEcr = synOpts.TSEcr } return synOpts } var optionPool = sync.Pool{ New: func() any { return &[maxOptionSize]byte{} }, } func getOptions() []byte { return (*optionPool.Get().(*[maxOptionSize]byte))[:] } func putOptions(options []byte) { // Reslice to full capacity. optionPool.Put(optionsToArray(options)) } func makeSynOptions(opts header.TCPSynOptions) []byte { // Emulate linux option order. This is as follows: // // if md5: NOP NOP MD5SIG 18 md5sig(16) // if mss: MSS 4 mss(2) // if ts and sack_advertise: // SACK 2 TIMESTAMP 2 timestamp(8) // elif ts: NOP NOP TIMESTAMP 10 timestamp(8) // elif sack: NOP NOP SACK 2 // if wscale: NOP WINDOW 3 ws(1) // if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8)) // [for each block] start_seq(4) end_seq(4) // if fastopen_cookie: // if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2) // else: FASTOPEN (2 + len(cookie)) // cookie(variable) [padding to four bytes] // options := getOptions() // Always encode the mss. offset := header.EncodeMSSOption(uint32(opts.MSS), options) // Special ordering is required here. If both TS and SACK are enabled, // then the SACK option precedes TS, with no padding. If they are // enabled individually, then we see padding before the option. 
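// As a worked example of the resulting layout for a typical SYN carrying MSS,
// SACK-Permitted, Timestamps and Window Scale (standard option lengths):
//
//	MSS             kind=2, len=4           -> 4 bytes
//	SACK-Permitted  kind=4, len=2           -> 2 bytes (no leading NOPs, TS follows)
//	Timestamps      kind=8, len=10          -> 10 bytes
//	NOP + WS        kind=1; kind=3, len=3   -> 4 bytes
//
// for a total of 20 option bytes, already a multiple of 4, so no trailing
// padding is needed.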
if opts.TS && opts.SACKPermitted { offset += header.EncodeSACKPermittedOption(options[offset:]) offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) } else if opts.TS { offset += header.EncodeNOP(options[offset:]) offset += header.EncodeNOP(options[offset:]) offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) } else if opts.SACKPermitted { offset += header.EncodeNOP(options[offset:]) offset += header.EncodeNOP(options[offset:]) offset += header.EncodeSACKPermittedOption(options[offset:]) } // Initialize the WS option. if opts.WS >= 0 { offset += header.EncodeNOP(options[offset:]) offset += header.EncodeWSOption(opts.WS, options[offset:]) } // Padding to the end; note that this never apply unless we add a // fastopen option, we always expect the offset to remain the same. if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { panic("unexpected option encoding") } return options[:offset] } // tcpFields is a struct to carry different parameters required by the // send*TCP variant functions below. type tcpFields struct { id stack.TransportEndpointID ttl uint8 tos uint8 flags header.TCPFlags seq seqnum.Value ack seqnum.Value rcvWnd seqnum.Size opts []byte txHash uint32 df bool } func (e *Endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) tcpip.Error { tf.opts = makeSynOptions(opts) // We ignore SYN send errors and let the callers re-attempt send. p := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: header.TCPMinimumSize + int(r.MaxHeaderLength()) + len(tf.opts)}) defer p.DecRef() if err := e.sendTCP(r, tf, p, stack.GSO{}); err != nil { e.stats.SendErrors.SynSendToNetworkFailed.Increment() } putOptions(tf.opts) return nil } // This method takes ownership of pkt. func (e *Endpoint) sendTCP(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO) tcpip.Error { tf.txHash = e.txHash if err := sendTCP(r, tf, pkt, gso, e.owner); err != nil { e.stats.SendErrors.SegmentSendToNetworkFailed.Increment() return err } e.stats.SegmentsSent.Increment() return nil } func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO) { optLen := len(tf.opts) tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen)) pkt.TransportProtocolNumber = header.TCPProtocolNumber tcp.Encode(&header.TCPFields{ SrcPort: tf.id.LocalPort, DstPort: tf.id.RemotePort, SeqNum: uint32(tf.seq), AckNum: uint32(tf.ack), DataOffset: uint8(header.TCPMinimumSize + optLen), Flags: tf.flags, WindowSize: uint16(tf.rcvWnd), }) copy(tcp[header.TCPMinimumSize:], tf.opts) xsum := r.PseudoHeaderChecksum(ProtocolNumber, uint16(pkt.Size())) // Only calculate the checksum if offloading isn't supported. if gso.Type != stack.GSONone && gso.NeedsCsum { // This is called CHECKSUM_PARTIAL in the Linux kernel. We // calculate a checksum of the pseudo-header and save it in the // TCP header, then the kernel calculate a checksum of the // header and data and get the right sum of the TCP packet. 
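// Without offload the checksum has to be finished in software instead: the
// payload checksum is folded into the pseudo-header sum and the header
// checksum is computed over the result, giving the usual one's-complement
// checksum over pseudo-header, TCP header and payload. That is the
// RequiresTXTransportChecksum branch below.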
tcp.SetChecksum(xsum) } else if r.RequiresTXTransportChecksum() { xsum = checksum.Combine(xsum, pkt.Data().Checksum()) tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) } } func sendTCPBatch(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error { optLen := len(tf.opts) if tf.rcvWnd > math.MaxUint16 { tf.rcvWnd = math.MaxUint16 } mss := int(gso.MSS) n := (pkt.Data().Size() + mss - 1) / mss size := pkt.Data().Size() hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen for i := 0; i < n; i++ { packetSize := mss if packetSize > size { packetSize = size } size -= packetSize pkt := pkt // No need to split the packet in the final iteration. The original // packet already has the truncated data. shouldSplitPacket := i != n-1 if shouldSplitPacket { splitPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: hdrSize}) splitPkt.Data().ReadFromPacketData(pkt.Data(), packetSize) pkt = splitPkt } pkt.Hash = tf.txHash pkt.Owner = owner buildTCPHdr(r, tf, pkt, gso) tf.seq = tf.seq.Add(seqnum.Size(packetSize)) pkt.GSOOptions = gso if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos, DF: tf.df}, pkt); err != nil { r.Stats().TCP.SegmentSendErrors.Increment() if shouldSplitPacket { pkt.DecRef() } return err } r.Stats().TCP.SegmentsSent.Increment() if shouldSplitPacket { pkt.DecRef() } } return nil } // sendTCP sends a TCP segment with the provided options via the provided // network endpoint and under the provided identity. This method takes // ownership of pkt. func sendTCP(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error { if tf.rcvWnd > math.MaxUint16 { tf.rcvWnd = math.MaxUint16 } if r.Loop()&stack.PacketLoop == 0 && gso.Type == stack.GSOGvisor && int(gso.MSS) < pkt.Data().Size() { return sendTCPBatch(r, tf, pkt, gso, owner) } pkt.GSOOptions = gso pkt.Hash = tf.txHash pkt.Owner = owner buildTCPHdr(r, tf, pkt, gso) if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos, DF: tf.df}, pkt); err != nil { r.Stats().TCP.SegmentSendErrors.Increment() return err } r.Stats().TCP.SegmentsSent.Increment() if (tf.flags & header.TCPFlagRst) != 0 { r.Stats().TCP.ResetsSent.Increment() } return nil } // makeOptions makes an options slice. func (e *Endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte { options := getOptions() offset := 0 // N.B. the ordering here matches the ordering used by Linux internally // and described in the raw makeOptions function. We don't include // unnecessary cases here (post connection.) if e.SendTSOk { // Embed the timestamp if timestamp has been enabled. // // We only use the lower 32 bits of the unix time in // milliseconds. This is similar to what Linux does where it // uses the lower 32 bits of the jiffies value in the tsVal // field of the timestamp option. // // Further, RFC7323 section-5.4 recommends millisecond // resolution as the lowest recommended resolution for the // timestamp clock. // // Ref: https://tools.ietf.org/html/rfc7323#section-5.4. 
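// With a 1 ms granularity the 32-bit TSval wraps roughly every 2^32 ms,
// about 49.7 days, comfortably within RFC 7323's guidance that the
// timestamp clock tick somewhere between 1 ms and 1 second.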
offset += header.EncodeNOP(options[offset:]) offset += header.EncodeNOP(options[offset:]) offset += header.EncodeTSOption(e.tsValNow(), e.recentTimestamp(), options[offset:]) } if e.SACKPermitted && len(sackBlocks) > 0 { offset += header.EncodeNOP(options[offset:]) offset += header.EncodeNOP(options[offset:]) offset += header.EncodeSACKBlocks(sackBlocks, options[offset:]) } // We expect the above to produce an aligned offset. if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { panic("unexpected option encoding") } return options[:offset] } // sendEmptyRaw sends a TCP segment with no payload to the endpoint's peer. // // +checklocks:e.mu // +checklocksalias:e.snd.ep.mu=e.mu func (e *Endpoint) sendEmptyRaw(flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error { pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{}) defer pkt.DecRef() return e.sendRaw(pkt, flags, seq, ack, rcvWnd) } // sendRaw sends a TCP segment to the endpoint's peer. This method takes // ownership of pkt. pkt must not have any headers set. // // +checklocks:e.mu // +checklocksalias:e.snd.ep.mu=e.mu func (e *Endpoint) sendRaw(pkt *stack.PacketBuffer, flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error { var sackBlocks []header.SACKBlock if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) { sackBlocks = e.sack.Blocks[:e.sack.NumBlocks] } options := e.makeOptions(sackBlocks) defer putOptions(options) pkt.ReserveHeaderBytes(header.TCPMinimumSize + int(e.route.MaxHeaderLength()) + len(options)) return e.sendTCP(e.route, tcpFields{ id: e.TransportEndpointInfo.ID, ttl: calculateTTL(e.route, e.ipv4TTL, e.ipv6HopLimit), tos: e.sendTOS, flags: flags, seq: seq, ack: ack, rcvWnd: rcvWnd, opts: options, df: e.pmtud == tcpip.PMTUDiscoveryWant || e.pmtud == tcpip.PMTUDiscoveryDo, }, pkt, e.gso) } // +checklocks:e.mu // +checklocksalias:e.snd.ep.mu=e.mu func (e *Endpoint) sendData(next *segment) { // Initialize the next segment to write if it's currently nil. if e.snd.writeNext == nil { if next == nil { return } e.snd.updateWriteNext(next) } // Push out any new packets. e.snd.sendData() } // resetConnectionLocked puts the endpoint in an error state with the given // error code and sends a RST if and only if the error is not ErrConnectionReset // indicating that the connection is being reset due to receiving a RST. This // method must only be called from the protocol goroutine. // +checklocks:e.mu func (e *Endpoint) resetConnectionLocked(err tcpip.Error) { // Only send a reset if the connection is being aborted for a reason // other than receiving a reset. e.hardError = err switch err.(type) { case *tcpip.ErrConnectionReset, *tcpip.ErrTimeout: default: // The exact sequence number to be used for the RST is the same as the // one used by Linux. We need to handle the case of window being shrunk // which can cause sndNxt to be outside the acceptable window on the // receiver. // // See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more // information. sndWndEnd := e.snd.SndUna.Add(e.snd.SndWnd) resetSeqNum := sndWndEnd if !sndWndEnd.LessThan(e.snd.SndNxt) || e.snd.SndNxt.Size(sndWndEnd) < (1< // // After sending the acknowledgment, TCP MUST drop the unacceptable // segment and stop processing further. // // By sending an ACK, the remote peer is challenged to confirm the loss // of the previous connection and the request to start a new connection. 
// A legitimate peer, after restart, would not have a TCB in the // synchronized state. Thus, when the ACK arrives, the peer should send // a RST segment back with the sequence number derived from the ACK // field that caused the RST. // This RST will confirm that the remote peer has indeed closed the // previous connection. Upon receipt of a valid RST, the local TCP // endpoint MUST terminate its connection. The local TCP endpoint // should then rely on SYN retransmission from the remote end to // re-establish the connection. e.snd.maybeSendOutOfWindowAck(s) } else if s.flags.Contains(header.TCPFlagAck) { // Patch the window size in the segment according to the // send window scale. s.window <<= e.snd.SndWndScale // RFC 793, page 41 states that "once in the ESTABLISHED // state all segments must carry current acknowledgment // information." drop, err := e.rcv.handleRcvdSegment(s) if err != nil { return false, err } if drop { return true, nil } // Now check if the received segment has caused us to transition // to a CLOSED state, if yes then terminate processing and do // not invoke the sender. state := e.EndpointState() if state == StateClose { // When we get into StateClose while processing from the queue, // return immediately and let the protocolMainloop handle it. // // We can reach StateClose only while processing a previous segment // or a notification from the protocolMainLoop (caller goroutine). // This means that with this return, the segment dequeue below can // never occur on a closed endpoint. return false, nil } e.snd.handleRcvdSegment(s) } return true, nil } // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP // keepalive packets periodically when the connection is idle. If we don't hear // from the other side after a number of tries, we terminate the connection. // +checklocks:e.mu // +checklocksalias:e.snd.ep.mu=e.mu func (e *Endpoint) keepaliveTimerExpired() tcpip.Error { userTimeout := e.userTimeout // If the route is not ready or already cleaned up, then we don't need to // send keepalives. if e.route == nil { return nil } e.keepalive.Lock() if !e.SocketOptions().GetKeepAlive() || e.keepalive.timer.isUninitialized() || !e.keepalive.timer.checkExpiration() { e.keepalive.Unlock() return nil } // If a userTimeout is set then abort the connection if it is // exceeded. if userTimeout != 0 && e.stack.Clock().NowMonotonic().Sub(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 { e.keepalive.Unlock() e.stack.Stats().TCP.EstablishedTimedout.Increment() return &tcpip.ErrTimeout{} } if e.keepalive.unacked >= e.keepalive.count { e.keepalive.Unlock() e.stack.Stats().TCP.EstablishedTimedout.Increment() return &tcpip.ErrTimeout{} } // RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with // seg.seq = snd.nxt-1. e.keepalive.unacked++ e.keepalive.Unlock() e.snd.sendEmptySegment(header.TCPFlagAck, e.snd.SndNxt-1) e.resetKeepaliveTimer(false) return nil } // resetKeepaliveTimer restarts or stops the keepalive timer, depending on // whether it is enabled for this endpoint. func (e *Endpoint) resetKeepaliveTimer(receivedData bool) { e.keepalive.Lock() defer e.keepalive.Unlock() if e.keepalive.timer.isUninitialized() { if state := e.EndpointState(); !state.closed() { panic(fmt.Sprintf("Unexpected state when the keepalive time is cleaned up, got %s, want %s or %s", state, StateClose, StateError)) } return } if receivedData { e.keepalive.unacked = 0 } // Start the keepalive timer IFF it's enabled and there is no pending // data to send. 
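// As a worked example with the common Linux-style defaults of a 2 hour idle
// time, 75 second probe interval and 9 probes: an idle connection sends its
// first keepalive after 2 hours of silence, then one probe every 75 seconds,
// and is aborted with a timeout roughly 2h + 9*75s (about 2h11m) after the
// last data if none of the probes is acknowledged.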
if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.SndUna != e.snd.SndNxt { e.keepalive.timer.disable() return } if e.keepalive.unacked > 0 { e.keepalive.timer.enable(e.keepalive.interval) } else { e.keepalive.timer.enable(e.keepalive.idle) } } // disableKeepaliveTimer stops the keepalive timer. func (e *Endpoint) disableKeepaliveTimer() { e.keepalive.Lock() e.keepalive.timer.disable() e.keepalive.Unlock() } // finWait2TimerExpired is called when the FIN-WAIT-2 timeout is hit // and the peer hasn't sent us a FIN. func (e *Endpoint) finWait2TimerExpired() { e.mu.Lock() e.transitionToStateCloseLocked() e.mu.Unlock() e.drainClosingSegmentQueue() e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } // +checklocks:e.mu func (e *Endpoint) handshakeFailed(err tcpip.Error) { e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() // handshakeFailed is also called from startHandshake when a listener // transitions out of Listen state by the time the SYN is processed. In // such cases the handshake is never initialized and the newly created // endpoint is closed right away. if e.h != nil && e.h.retransmitTimer != nil { e.h.retransmitTimer.stop() } e.hardError = err e.cleanupLocked() e.setEndpointState(StateError) } // handleTimeWaitSegments processes segments received during TIME_WAIT // state. // +checklocks:e.mu // +checklocksalias:e.rcv.ep.mu=e.mu func (e *Endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) { for i := 0; i < maxSegmentsPerWake; i++ { s := e.segmentQueue.dequeue() if s == nil { break } extTW, newSyn := e.rcv.handleTimeWaitSegment(s) if newSyn { info := e.TransportEndpointInfo newID := info.ID newID.RemoteAddress = tcpip.Address{} newID.RemotePort = 0 netProtos := []tcpip.NetworkProtocolNumber{info.NetProto} // If the local address is an IPv4 address then also // look for IPv6 dual stack endpoints that might be // listening on the local address. if newID.LocalAddress.To4() != (tcpip.Address{}) { netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber} } for _, netProto := range netProtos { if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, s.pkt.NICID); listenEP != nil { tcpEP := listenEP.(*Endpoint) if EndpointState(tcpEP.State()) == StateListen { reuseTW = func() { if !tcpEP.enqueueSegment(s) { return } tcpEP.notifyProcessor() s.DecRef() } // We explicitly do not DecRef the segment as it's still valid and // being reflected to a listening endpoint. return false, reuseTW } } } } if extTW { extendTimeWait = true } s.DecRef() } return extendTimeWait, nil } // +checklocks:e.mu func (e *Endpoint) getTimeWaitDuration() time.Duration { timeWaitDuration := DefaultTCPTimeWaitTimeout // Get the stack wide configuration. var tcpTW tcpip.TCPTimeWaitTimeoutOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil { timeWaitDuration = time.Duration(tcpTW) } return timeWaitDuration } // timeWaitTimerExpired is called when an endpoint completes the required time // (typically 2 * MSL unless configured to something else at a stack level) in // TIME-WAIT state. 
func (e *Endpoint) timeWaitTimerExpired() { e.mu.Lock() if e.EndpointState() != StateTimeWait { e.mu.Unlock() return } e.transitionToStateCloseLocked() e.mu.Unlock() e.drainClosingSegmentQueue() e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } // notifyProcessor queues this endpoint for processing to its TCP processor. func (e *Endpoint) notifyProcessor() { // We use TryLock here to avoid deadlocks in cases where a listening endpoint that is being // closed tries to abort half completed connections which in turn try to queue any segments // queued to that endpoint back to the same listening endpoint (because it may have got // segments that matched its id but were either a RST or a new SYN which must be handled // by a listening endpoint). In such cases the Close() on the listening endpoint will handle // any queued segments after it releases the lock. if !e.mu.TryLock() { return } processor := e.protocol.dispatcher.selectProcessor(e.ID) e.mu.Unlock() processor.queueEndpoint(e) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/connect_unsafe.go000066400000000000000000000020171465435605700267060ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "reflect" "unsafe" ) // optionsToArray converts a slice of capacity >-= maxOptionSize to an array. // // optionsToArray panics if the capacity of options is smaller than // maxOptionSize. func optionsToArray(options []byte) *[maxOptionSize]byte { // Reslice to full capacity. options = options[0:maxOptionSize] return (*[maxOptionSize]byte)(unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&options)).Data)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/cubic.go000066400000000000000000000231751465435605700250110ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "math" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // effectivelyInfinity is an initialization value used for round-trip times // that are then set using min. It is equal to approximately 100 years: large // enough that it will always be greater than a real TCP round-trip time, and // small enough that it fits in time.Duration. const effectivelyInfinity = time.Duration(math.MaxInt64) const ( // RTT = round-trip time. // The delay increase sensitivity is determined by minRTTThresh and // maxRTTThresh. 
Smaller values of minRTTThresh may cause spurious exits // from slow start. Larger values of maxRTTThresh may result in slow start // not exiting until loss is encountered for connections on large RTT paths. minRTTThresh = 4 * time.Millisecond maxRTTThresh = 16 * time.Millisecond // minRTTDivisor is a fraction of RTT to compute the delay threshold. A // smaller value would mean a larger threshold and thus less sensitivity to // delay increase, and vice versa. minRTTDivisor = 8 // nRTTSample is the minimum number of RTT samples in the round before // considering whether to exit the round due to increased RTT. nRTTSample = 8 // ackDelta is the maximum time between ACKs for them to be considered part // of the same ACK Train during HyStart ackDelta = 2 * time.Millisecond ) // cubicState stores the variables related to TCP CUBIC congestion // control algorithm state. // // See: https://tools.ietf.org/html/rfc8312. // +stateify savable type cubicState struct { stack.TCPCubicState // numCongestionEvents tracks the number of congestion events since last // RTO. numCongestionEvents int s *sender } // newCubicCC returns a partially initialized cubic state with the constants // beta and c set and t set to current time. func newCubicCC(s *sender) *cubicState { now := s.ep.stack.Clock().NowMonotonic() return &cubicState{ TCPCubicState: stack.TCPCubicState{ T: now, Beta: 0.7, C: 0.4, // By this point, the sender has initialized it's initial sequence // number. EndSeq: s.SndNxt, LastRTT: effectivelyInfinity, CurrRTT: effectivelyInfinity, LastAck: now, RoundStart: now, }, s: s, } } // enterCongestionAvoidance is used to initialize cubic in cases where we exit // SlowStart without a real congestion event taking place. This can happen when // a connection goes back to slow start due to a retransmit and we exceed the // previously lowered ssThresh without experiencing packet loss. // // Refer: https://tools.ietf.org/html/rfc8312#section-4.8 func (c *cubicState) enterCongestionAvoidance() { // See: https://tools.ietf.org/html/rfc8312#section-4.7 & // https://tools.ietf.org/html/rfc8312#section-4.8 if c.numCongestionEvents == 0 { c.K = 0 c.T = c.s.ep.stack.Clock().NowMonotonic() c.WLastMax = c.WMax c.WMax = float64(c.s.SndCwnd) } } // updateHyStart tracks packet round-trip time (rtt) to find a safe threshold // to exit slow start without triggering packet loss. It updates the SSThresh // when it does. // // Implementation of HyStart follows the algorithm from the Linux kernel, rather // than RFC 9406 (https://www.rfc-editor.org/rfc/rfc9406.html). Briefly, the // Linux kernel algorithm is based directly on the original HyStart paper // (https://doi.org/10.1016/j.comnet.2011.01.014), and differs from the RFC in // that two detection algorithms run in parallel ('ACK train' and 'Delay // increase'). The RFC version includes only the latter algorithm and adds an // intermediate phase called Conservative Slow Start, which is not implemented // here. 
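// As a worked example of the delay-increase check: with a baseline round RTT
// of 40 ms the exit threshold is clamp(40ms/8, minRTTThresh, maxRTTThresh) =
// 5 ms, so once at least nRTTSample (8) RTT samples have been taken in the
// round, slow start exits (Ssthresh is set to the current cwnd) as soon as
// the round's minimum RTT reaches 45 ms or more.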
func (c *cubicState) updateHyStart(rtt time.Duration) { if rtt < 0 { // negative indicates unknown return } now := c.s.ep.stack.Clock().NowMonotonic() if c.EndSeq.LessThan(c.s.SndUna) { c.beginHyStartRound(now) } // ACK train if now.Sub(c.LastAck) < ackDelta && // ensures acks are part of the same "train" c.LastRTT < effectivelyInfinity { c.LastAck = now if thresh := c.LastRTT / 2; now.Sub(c.RoundStart) > thresh { c.s.Ssthresh = c.s.SndCwnd } } // Delay increase c.CurrRTT = min(c.CurrRTT, rtt) c.SampleCount++ if c.SampleCount >= nRTTSample && c.LastRTT < effectivelyInfinity { // i.e. LastRTT/minRTTDivisor, but clamped to minRTTThresh & maxRTTThresh thresh := max( minRTTThresh, min(maxRTTThresh, c.LastRTT/minRTTDivisor), ) if c.CurrRTT >= (c.LastRTT + thresh) { // Triggered HyStart safe exit threshold c.s.Ssthresh = c.s.SndCwnd } } } func (c *cubicState) beginHyStartRound(now tcpip.MonotonicTime) { c.EndSeq = c.s.SndNxt c.SampleCount = 0 c.LastRTT = c.CurrRTT c.CurrRTT = effectivelyInfinity c.LastAck = now c.RoundStart = now } // updateSlowStart will update the congestion window as per the slow-start // algorithm used by NewReno. If after adjusting the congestion window we cross // the ssThresh then it will return the number of packets that must be consumed // in congestion avoidance mode. func (c *cubicState) updateSlowStart(packetsAcked int) int { // Don't let the congestion window cross into the congestion // avoidance range. newcwnd := c.s.SndCwnd + packetsAcked enterCA := false if newcwnd >= c.s.Ssthresh { newcwnd = c.s.Ssthresh c.s.SndCAAckCount = 0 enterCA = true } packetsAcked -= newcwnd - c.s.SndCwnd c.s.SndCwnd = newcwnd if enterCA { c.enterCongestionAvoidance() } return packetsAcked } // Update updates cubic's internal state variables. It must be called on every // ACK received. // Refer: https://tools.ietf.org/html/rfc8312#section-4 func (c *cubicState) Update(packetsAcked int, rtt time.Duration) { if c.s.Ssthresh == InitialSsthresh && c.s.SndCwnd < c.s.Ssthresh { c.updateHyStart(rtt) } if c.s.SndCwnd < c.s.Ssthresh { packetsAcked = c.updateSlowStart(packetsAcked) if packetsAcked == 0 { return } } else { c.s.rtt.Lock() srtt := c.s.rtt.TCPRTTState.SRTT c.s.rtt.Unlock() c.s.SndCwnd = c.getCwnd(packetsAcked, c.s.SndCwnd, srtt) } } // cubicCwnd computes the CUBIC congestion window after t seconds from last // congestion event. func (c *cubicState) cubicCwnd(t float64) float64 { return c.C*math.Pow(t, 3.0) + c.WMax } // getCwnd returns the current congestion window as computed by CUBIC. // Refer: https://tools.ietf.org/html/rfc8312#section-4 func (c *cubicState) getCwnd(packetsAcked, sndCwnd int, srtt time.Duration) int { elapsed := c.s.ep.stack.Clock().NowMonotonic().Sub(c.T) elapsedSeconds := elapsed.Seconds() // Compute the window as per Cubic after 'elapsed' time // since last congestion event. c.WC = c.cubicCwnd(elapsedSeconds - c.K) // Compute the TCP friendly estimate of the congestion window. c.WEst = c.WMax*c.Beta + (3.0*((1.0-c.Beta)/(1.0+c.Beta)))*(elapsedSeconds/srtt.Seconds()) // Make sure in the TCP friendly region CUBIC performs at least // as well as Reno. if c.WC < c.WEst && float64(sndCwnd) < c.WEst { // TCP Friendly region of cubic. return int(c.WEst) } // In Concave/Convex region of CUBIC, calculate what CUBIC window // will be after 1 RTT and use that to grow congestion window // for every ack. tEst := (elapsed + srtt).Seconds() wtRtt := c.cubicCwnd(tEst - c.K) // As per 4.3 for each received ACK cwnd must be incremented // by (w_cubic(t+RTT) - cwnd/cwnd. 
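// That is, with balanced parentheses:
//
//	cwnd += (W_cubic(t+RTT) - cwnd) / cwnd
//
// where W_cubic(t) = C*(t-K)^3 + W_max per RFC 8312 section 4.1, which is
// exactly what the loop below applies once per ACKed packet.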
cwnd := float64(sndCwnd) for i := 0; i < packetsAcked; i++ { // Concave/Convex regions of cubic have the same formulas. // See: https://tools.ietf.org/html/rfc8312#section-4.3 cwnd += (wtRtt - cwnd) / cwnd } return int(cwnd) } // HandleLossDetected implements congestionControl.HandleLossDetected. func (c *cubicState) HandleLossDetected() { // See: https://tools.ietf.org/html/rfc8312#section-4.5 c.numCongestionEvents++ c.T = c.s.ep.stack.Clock().NowMonotonic() c.WLastMax = c.WMax c.WMax = float64(c.s.SndCwnd) c.fastConvergence() c.reduceSlowStartThreshold() } // HandleRTOExpired implements congestionContrl.HandleRTOExpired. func (c *cubicState) HandleRTOExpired() { // See: https://tools.ietf.org/html/rfc8312#section-4.6 c.T = c.s.ep.stack.Clock().NowMonotonic() c.numCongestionEvents = 0 c.WLastMax = c.WMax c.WMax = float64(c.s.SndCwnd) c.fastConvergence() // We lost a packet, so reduce ssthresh. c.reduceSlowStartThreshold() // Reduce the congestion window to 1, i.e., enter slow-start. Per // RFC 5681, page 7, we must use 1 regardless of the value of the // initial congestion window. c.s.SndCwnd = 1 } // fastConvergence implements the logic for Fast Convergence algorithm as // described in https://tools.ietf.org/html/rfc8312#section-4.6. func (c *cubicState) fastConvergence() { if c.WMax < c.WLastMax { c.WLastMax = c.WMax c.WMax = c.WMax * (1.0 + c.Beta) / 2.0 } else { c.WLastMax = c.WMax } // Recompute k as wMax may have changed. c.K = math.Cbrt(c.WMax * (1 - c.Beta) / c.C) } // PostRecovery implements congestionControl.PostRecovery. func (c *cubicState) PostRecovery() { c.T = c.s.ep.stack.Clock().NowMonotonic() } // reduceSlowStartThreshold returns new SsThresh as described in // https://tools.ietf.org/html/rfc8312#section-4.7. func (c *cubicState) reduceSlowStartThreshold() { c.s.Ssthresh = int(math.Max(float64(c.s.SndCwnd)*c.Beta, 2.0)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/dispatcher.go000066400000000000000000000347121465435605700260510ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "encoding/binary" "fmt" "math/rand" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) // epQueue is a queue of endpoints. // // +stateify savable type epQueue struct { mu sync.Mutex `state:"nosave"` list endpointList } // enqueue adds e to the queue if the endpoint is not already on the queue. func (q *epQueue) enqueue(e *Endpoint) { q.mu.Lock() defer q.mu.Unlock() e.pendingProcessingMu.Lock() defer e.pendingProcessingMu.Unlock() if e.pendingProcessing { return } q.list.PushBack(e) e.pendingProcessing = true } // dequeue removes and returns the first element from the queue if available, // returns nil otherwise. 
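// A typical draining loop over the queue therefore looks like:
//
//	for ep := q.dequeue(); ep != nil; ep = q.dequeue() {
//		// ... process ep ...
//	}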
func (q *epQueue) dequeue() *Endpoint { q.mu.Lock() if e := q.list.Front(); e != nil { q.list.Remove(e) e.pendingProcessingMu.Lock() e.pendingProcessing = false e.pendingProcessingMu.Unlock() q.mu.Unlock() return e } q.mu.Unlock() return nil } // empty returns true if the queue is empty, false otherwise. func (q *epQueue) empty() bool { q.mu.Lock() v := q.list.Empty() q.mu.Unlock() return v } // processor is responsible for processing packets queued to a tcp endpoint. // // +stateify savable type processor struct { epQ epQueue sleeper sleep.Sleeper // TODO(b/341946753): Restore them when netstack is savable. newEndpointWaker sleep.Waker `state:"nosave"` closeWaker sleep.Waker `state:"nosave"` pauseWaker sleep.Waker `state:"nosave"` pauseChan chan struct{} `state:"nosave"` resumeChan chan struct{} `state:"nosave"` } func (p *processor) close() { p.closeWaker.Assert() } func (p *processor) queueEndpoint(ep *Endpoint) { // Queue an endpoint for processing by the processor goroutine. p.epQ.enqueue(ep) p.newEndpointWaker.Assert() } // deliverAccepted delivers a passively connected endpoint to the accept queue // of its associated listening endpoint. // // +checklocks:ep.mu func deliverAccepted(ep *Endpoint) bool { lEP := ep.h.listenEP lEP.acceptMu.Lock() // Remove endpoint from list of pendingEndpoints as the handshake is now // complete. delete(lEP.acceptQueue.pendingEndpoints, ep) // Deliver this endpoint to the listening socket's accept queue. if lEP.acceptQueue.capacity == 0 { lEP.acceptMu.Unlock() return false } // NOTE: We always queue the endpoint and on purpose do not check if // accept queue is full at this point. This is similar to linux because // two racing incoming ACK's can both pass the acceptQueue.isFull check // and proceed to ESTABLISHED state. In such a case its better to // deliver both even if it temporarily exceeds the queue limit rather // than drop a connection that is fully connected. // // For reference see: // https://github.com/torvalds/linux/blob/169e77764adc041b1dacba84ea90516a895d43b2/net/ipv4/tcp_minisocks.c#L764 // https://github.com/torvalds/linux/blob/169e77764adc041b1dacba84ea90516a895d43b2/net/ipv4/tcp_ipv4.c#L1500 lEP.acceptQueue.endpoints.PushBack(ep) lEP.acceptMu.Unlock() ep.h.listenEP.waiterQueue.Notify(waiter.ReadableEvents) return true } // handleConnecting is responsible for TCP processing for an endpoint in one of // the connecting states. func handleConnecting(ep *Endpoint) { if !ep.TryLock() { return } cleanup := func() { ep.mu.Unlock() ep.drainClosingSegmentQueue() ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } if !ep.EndpointState().connecting() { // If the endpoint has already transitioned out of a connecting // stage then just return (only possible if it was closed or // timed out by the time we got around to processing the wakeup. ep.mu.Unlock() return } if err := ep.h.processSegments(); err != nil { // +checklocksforce:ep.h.ep.mu // handshake failed. clean up the tcp endpoint and handshake // state. 
if lEP := ep.h.listenEP; lEP != nil { lEP.acceptMu.Lock() delete(lEP.acceptQueue.pendingEndpoints, ep) lEP.acceptMu.Unlock() } ep.handshakeFailed(err) cleanup() return } if ep.EndpointState() == StateEstablished && ep.h.listenEP != nil { ep.isConnectNotified = true ep.stack.Stats().TCP.PassiveConnectionOpenings.Increment() if !deliverAccepted(ep) { ep.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) cleanup() return } } ep.mu.Unlock() } // handleConnected is responsible for TCP processing for an endpoint in one of // the connected states(StateEstablished, StateFinWait1 etc.) func handleConnected(ep *Endpoint) { if !ep.TryLock() { return } if !ep.EndpointState().connected() { // If the endpoint has already transitioned out of a connected // state then just return (only possible if it was closed or // timed out by the time we got around to processing the wakeup. ep.mu.Unlock() return } // NOTE: We read this outside of e.mu lock which means that by the time // we get to handleSegments the endpoint may not be in ESTABLISHED. But // this should be fine as all normal shutdown states are handled by // handleSegmentsLocked. switch err := ep.handleSegmentsLocked(); { case err != nil: // Send any active resets if required. ep.resetConnectionLocked(err) fallthrough case ep.EndpointState() == StateClose: ep.mu.Unlock() ep.drainClosingSegmentQueue() ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) return case ep.EndpointState() == StateTimeWait: startTimeWait(ep) } ep.mu.Unlock() } // startTimeWait starts a new goroutine to handle TIME-WAIT. // // +checklocks:ep.mu func startTimeWait(ep *Endpoint) { // Disable close timer as we are now entering real TIME_WAIT. if ep.finWait2Timer != nil { ep.finWait2Timer.Stop() } // Wake up any waiters before we start TIME-WAIT. ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) timeWaitDuration := ep.getTimeWaitDuration() ep.timeWaitTimer = ep.stack.Clock().AfterFunc(timeWaitDuration, ep.timeWaitTimerExpired) } // handleTimeWait is responsible for TCP processing for an endpoint in TIME-WAIT // state. func handleTimeWait(ep *Endpoint) { if !ep.TryLock() { return } if ep.EndpointState() != StateTimeWait { // If the endpoint has already transitioned out of a TIME-WAIT // state then just return (only possible if it was closed or // timed out by the time we got around to processing the wakeup. ep.mu.Unlock() return } extendTimeWait, reuseTW := ep.handleTimeWaitSegments() if reuseTW != nil { ep.transitionToStateCloseLocked() ep.mu.Unlock() ep.drainClosingSegmentQueue() ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) reuseTW() return } if extendTimeWait { ep.timeWaitTimer.Reset(ep.getTimeWaitDuration()) } ep.mu.Unlock() } // handleListen is responsible for TCP processing for an endpoint in LISTEN // state. func handleListen(ep *Endpoint) { if !ep.TryLock() { return } defer ep.mu.Unlock() if ep.EndpointState() != StateListen { // If the endpoint has already transitioned out of a LISTEN // state then just return (only possible if it was closed or // shutdown). return } for i := 0; i < maxSegmentsPerWake; i++ { s := ep.segmentQueue.dequeue() if s == nil { break } // TODO(gvisor.dev/issue/4690): Better handle errors instead of // silently dropping. 
_ = ep.handleListenSegment(ep.listenCtx, s) s.DecRef() } } // start runs the main loop for a processor which is responsible for all TCP // processing for TCP endpoints. func (p *processor) start(wg *sync.WaitGroup) { defer wg.Done() defer p.sleeper.Done() for { switch w := p.sleeper.Fetch(true); { case w == &p.closeWaker: return case w == &p.pauseWaker: if !p.epQ.empty() { p.newEndpointWaker.Assert() p.pauseWaker.Assert() continue } else { p.pauseChan <- struct{}{} <-p.resumeChan } case w == &p.newEndpointWaker: for { ep := p.epQ.dequeue() if ep == nil { break } if ep.segmentQueue.empty() { continue } switch state := ep.EndpointState(); { case state.connecting(): handleConnecting(ep) case state.connected() && state != StateTimeWait: handleConnected(ep) case state == StateTimeWait: handleTimeWait(ep) case state == StateListen: handleListen(ep) case state == StateError || state == StateClose: // Try to redeliver any still queued // packets to another endpoint or send a // RST if it can't be delivered. ep.mu.Lock() if st := ep.EndpointState(); st == StateError || st == StateClose { ep.drainClosingSegmentQueue() } ep.mu.Unlock() default: panic(fmt.Sprintf("unexpected tcp state in processor: %v", state)) } // If there are more segments to process and the // endpoint lock is not held by user then // requeue this endpoint for processing. if !ep.segmentQueue.empty() && !ep.isOwnedByUser() { p.epQ.enqueue(ep) } } } } } // pause pauses the processor loop. func (p *processor) pause() chan struct{} { p.pauseWaker.Assert() return p.pauseChan } // resume resumes a previously paused loop. // // Precondition: Pause must have been called previously. func (p *processor) resume() { p.resumeChan <- struct{}{} } // dispatcher manages a pool of TCP endpoint processors which are responsible // for the processing of inbound segments. This fixed pool of processor // goroutines do full tcp processing. The processor is selected based on the // hash of the endpoint id to ensure that delivery for the same endpoint happens // in-order. // // +stateify savable type dispatcher struct { processors []processor wg sync.WaitGroup `state:"nosave"` hasher jenkinsHasher mu sync.Mutex `state:"nosave"` // +checklocks:mu paused bool // +checklocks:mu closed bool } // init initializes a dispatcher and starts the main loop for all the processors // owned by this dispatcher. func (d *dispatcher) init(rng *rand.Rand, nProcessors int) { d.close() d.wait() d.mu.Lock() defer d.mu.Unlock() d.closed = false d.processors = make([]processor, nProcessors) d.hasher = jenkinsHasher{seed: rng.Uint32()} for i := range d.processors { p := &d.processors[i] p.sleeper.AddWaker(&p.newEndpointWaker) p.sleeper.AddWaker(&p.closeWaker) p.sleeper.AddWaker(&p.pauseWaker) p.pauseChan = make(chan struct{}) p.resumeChan = make(chan struct{}) d.wg.Add(1) // NB: sleeper-waker registration must happen synchronously to avoid races // with `close`. It's possible to pull all this logic into `start`, but // that results in a heap-allocated function literal. go p.start(&d.wg) } } // close closes a dispatcher and its processors. func (d *dispatcher) close() { d.mu.Lock() d.closed = true d.mu.Unlock() for i := range d.processors { d.processors[i].close() } } // wait waits for all processor goroutines to end. func (d *dispatcher) wait() { d.wg.Wait() } // queuePacket queues an incoming packet to the matching tcp endpoint and // also queues the endpoint to a processor queue for processing. 
func (d *dispatcher) queuePacket(stackEP stack.TransportEndpoint, id stack.TransportEndpointID, clock tcpip.Clock, pkt *stack.PacketBuffer) { d.mu.Lock() closed := d.closed d.mu.Unlock() if closed { return } ep := stackEP.(*Endpoint) s, err := newIncomingSegment(id, clock, pkt) if err != nil { ep.stack.Stats().TCP.InvalidSegmentsReceived.Increment() ep.stats.ReceiveErrors.MalformedPacketsReceived.Increment() return } defer s.DecRef() if !s.csumValid { ep.stack.Stats().TCP.ChecksumErrors.Increment() ep.stats.ReceiveErrors.ChecksumErrors.Increment() return } ep.stack.Stats().TCP.ValidSegmentsReceived.Increment() ep.stats.SegmentsReceived.Increment() if (s.flags & header.TCPFlagRst) != 0 { ep.stack.Stats().TCP.ResetsReceived.Increment() } if !ep.enqueueSegment(s) { return } // Only wakeup the processor if endpoint lock is not held by a user // goroutine as endpoint.UnlockUser will wake up the processor if the // segment queue is not empty. if !ep.isOwnedByUser() { d.selectProcessor(id).queueEndpoint(ep) } } // selectProcessor uses a hash of the transport endpoint ID to queue the // endpoint to a specific processor. This is required to main TCP ordering as // queueing the same endpoint to multiple processors can *potentially* result in // out of order processing of incoming segments. It also ensures that a dispatcher // evenly loads the processor goroutines. func (d *dispatcher) selectProcessor(id stack.TransportEndpointID) *processor { return &d.processors[d.hasher.hash(id)%uint32(len(d.processors))] } // pause pauses a dispatcher and all its processor goroutines. func (d *dispatcher) pause() { d.mu.Lock() d.paused = true d.mu.Unlock() for i := range d.processors { <-d.processors[i].pause() } } // resume resumes a previously paused dispatcher and its processor goroutines. // Calling resume on a dispatcher that was never paused is a no-op. func (d *dispatcher) resume() { d.mu.Lock() if !d.paused { // If this was a restore run the stack is a new instance and // it was never paused, so just return as there is nothing to // resume. d.mu.Unlock() return } d.paused = false d.mu.Unlock() for i := range d.processors { d.processors[i].resume() } } // jenkinsHasher contains state needed to for a jenkins hash. // // +stateify savable type jenkinsHasher struct { seed uint32 } // hash hashes the provided TransportEndpointID using the jenkins hash // algorithm. func (j jenkinsHasher) hash(id stack.TransportEndpointID) uint32 { var payload [4]byte binary.LittleEndian.PutUint16(payload[0:], id.LocalPort) binary.LittleEndian.PutUint16(payload[2:], id.RemotePort) h := jenkins.Sum32(j.seed) h.Write(payload[:]) h.Write(id.LocalAddress.AsSlice()) h.Write(id.RemoteAddress.AsSlice()) return h.Sum32() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/endpoint.go000066400000000000000000003122621465435605700255420ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package tcp import ( "container/heap" "fmt" "io" "math" "runtime" "strings" "time" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) // EndpointState represents the state of a TCP endpoint. type EndpointState tcpip.EndpointState // Endpoint states. Note that are represented in a netstack-specific manner and // may not be meaningful externally. Specifically, they need to be translated to // Linux's representation for these states if presented to userspace. const ( _ EndpointState = iota // TCP protocol states in sync with the definitions in // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13 StateEstablished StateSynSent StateSynRecv StateFinWait1 StateFinWait2 StateTimeWait StateClose StateCloseWait StateLastAck StateListen StateClosing // Endpoint states internal to netstack. StateInitial StateBound StateConnecting // Connect() called, but the initial SYN hasn't been sent. StateError ) const ( // rcvAdvWndScale is used to split the available socket buffer into // application buffer and the window to be advertised to the peer. This is // currently hard coded to split the available space equally. rcvAdvWndScale = 1 // SegOverheadFactor is used to multiply the value provided by the // user on a SetSockOpt for setting the socket send/receive buffer sizes. SegOverheadFactor = 2 ) type connDirectionState uint32 // Connection direction states used for directionState checks in endpoint struct // to detect half-closed connection and deliver POLLRDHUP const ( connDirectionStateOpen connDirectionState = 0 connDirectionStateRcvClosed connDirectionState = 1 connDirectionStateSndClosed connDirectionState = 2 connDirectionStateAll connDirectionState = connDirectionStateOpen | connDirectionStateRcvClosed | connDirectionStateSndClosed ) // connected returns true when s is one of the states representing an // endpoint connected to a peer. func (s EndpointState) connected() bool { switch s { case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: return true default: return false } } // connecting returns true when s is one of the states representing a // connection in progress, but not yet fully established. func (s EndpointState) connecting() bool { switch s { case StateConnecting, StateSynSent, StateSynRecv: return true default: return false } } // internal returns true when the state is netstack internal. func (s EndpointState) internal() bool { switch s { case StateInitial, StateBound, StateConnecting, StateError: return true default: return false } } // handshake returns true when s is one of the states representing an endpoint // in the middle of a TCP handshake. func (s EndpointState) handshake() bool { switch s { case StateSynSent, StateSynRecv: return true default: return false } } // closed returns true when s is one of the states an endpoint transitions to // when closed or when it encounters an error. This is distinct from a newly // initialized endpoint that was never connected. func (s EndpointState) closed() bool { switch s { case StateClose, StateError: return true default: return false } } // String implements fmt.Stringer.String. 
func (s EndpointState) String() string { switch s { case StateInitial: return "INITIAL" case StateBound: return "BOUND" case StateConnecting: return "CONNECTING" case StateError: return "ERROR" case StateEstablished: return "ESTABLISHED" case StateSynSent: return "SYN-SENT" case StateSynRecv: return "SYN-RCVD" case StateFinWait1: return "FIN-WAIT1" case StateFinWait2: return "FIN-WAIT2" case StateTimeWait: return "TIME-WAIT" case StateClose: return "CLOSED" case StateCloseWait: return "CLOSE-WAIT" case StateLastAck: return "LAST-ACK" case StateListen: return "LISTEN" case StateClosing: return "CLOSING" default: panic("unreachable") } } // SACKInfo holds TCP SACK related information for a given endpoint. // // +stateify savable type SACKInfo struct { // Blocks is the maximum number of SACK blocks we track // per endpoint. Blocks [MaxSACKBlocks]header.SACKBlock // NumBlocks is the number of valid SACK blocks stored in the // blocks array above. NumBlocks int } // ReceiveErrors collect segment receive errors within transport layer. // // +stateify savable type ReceiveErrors struct { tcpip.ReceiveErrors // SegmentQueueDropped is the number of segments dropped due to // a full segment queue. SegmentQueueDropped tcpip.StatCounter // ChecksumErrors is the number of segments dropped due to bad checksums. ChecksumErrors tcpip.StatCounter // ListenOverflowSynDrop is the number of times the listen queue overflowed // and a SYN was dropped. ListenOverflowSynDrop tcpip.StatCounter // ListenOverflowAckDrop is the number of times the final ACK // in the handshake was dropped due to overflow. ListenOverflowAckDrop tcpip.StatCounter // ZeroRcvWindowState is the number of times we advertised // a zero receive window when rcvQueue is full. ZeroRcvWindowState tcpip.StatCounter // WantZeroWindow is the number of times we wanted to advertise a // zero receive window but couldn't because it would have caused // the receive window's right edge to shrink. WantZeroRcvWindow tcpip.StatCounter } // SendErrors collect segment send errors within the transport layer. // // +stateify savable type SendErrors struct { tcpip.SendErrors // SegmentSendToNetworkFailed is the number of TCP segments failed to be sent // to the network endpoint. SegmentSendToNetworkFailed tcpip.StatCounter // SynSendToNetworkFailed is the number of TCP SYNs failed to be sent // to the network endpoint. SynSendToNetworkFailed tcpip.StatCounter // Retransmits is the number of TCP segments retransmitted. Retransmits tcpip.StatCounter // FastRetransmit is the number of segments retransmitted in fast // recovery. FastRetransmit tcpip.StatCounter // Timeouts is the number of times the RTO expired. Timeouts tcpip.StatCounter } // Stats holds statistics about the endpoint. // // +stateify savable type Stats struct { // SegmentsReceived is the number of TCP segments received that // the transport layer successfully parsed. SegmentsReceived tcpip.StatCounter // SegmentsSent is the number of TCP segments sent. SegmentsSent tcpip.StatCounter // FailedConnectionAttempts is the number of times we saw Connect and // Accept errors. FailedConnectionAttempts tcpip.StatCounter // ReceiveErrors collects segment receive errors within the // transport layer. ReceiveErrors ReceiveErrors // ReadErrors collects segment read errors from an endpoint read call. ReadErrors tcpip.ReadErrors // SendErrors collects segment send errors within the transport layer. SendErrors SendErrors // WriteErrors collects segment write errors from an endpoint write call. 
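	// These are distinct from SendErrors, which count transport-layer send
	// failures rather than errors reported to the Write caller.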
WriteErrors tcpip.WriteErrors } // IsEndpointStats is an empty method to implement the tcpip.EndpointStats // marker interface. func (*Stats) IsEndpointStats() {} // sndQueueInfo implements a send queue. // // +stateify savable type sndQueueInfo struct { sndQueueMu sync.Mutex `state:"nosave"` stack.TCPSndBufState // sndWaker is used to signal the protocol goroutine when there may be // segments that need to be sent. sndWaker sleep.Waker `state:"manual"` } // CloneState clones sq into other. It is not thread safe func (sq *sndQueueInfo) CloneState(other *stack.TCPSndBufState) { other.SndBufSize = sq.SndBufSize other.SndBufUsed = sq.SndBufUsed other.SndClosed = sq.SndClosed other.PacketTooBigCount = sq.PacketTooBigCount other.SndMTU = sq.SndMTU other.AutoTuneSndBufDisabled = atomicbitops.FromUint32(sq.AutoTuneSndBufDisabled.RacyLoad()) } // Endpoint represents a TCP endpoint. This struct serves as the interface // between users of the endpoint and the protocol implementation; it is legal to // have concurrent goroutines make calls into the endpoint, they are properly // synchronized. The protocol implementation, however, runs in a single // goroutine. // // Each endpoint has a few mutexes: // // e.mu -> Primary mutex for an endpoint must be held for all operations except // in e.Readiness where acquiring it will result in a deadlock in epoll // implementation. // // The following three mutexes can be acquired independent of e.mu but if // acquired with e.mu then e.mu must be acquired first. // // e.acceptMu -> Protects e.acceptQueue. // e.rcvQueueMu -> Protects e.rcvQueue's associated fields but not e.rcvQueue // itself. // e.sndQueueMu -> Protects the e.sndQueue and associated fields. // e.lastErrorMu -> Protects the lastError field. // // LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different // based on the context in which the lock is acquired. In the syscall context // e.LockUser/e.UnlockUser should be used and when doing background processing // e.mu.Lock/e.mu.Unlock should be used. The distinction is described below // in brief. // // The reason for this locking behaviour is to avoid wakeups to handle packets. // In cases where the endpoint is already locked the background processor can // queue the packet up and go its merry way and the lock owner will eventually // process the backlog when releasing the lock. Similarly when acquiring the // lock from say a syscall goroutine we can implement a bit of spinning if we // know that the lock is not held by another syscall goroutine. Background // processors should never hold the lock for long and we can avoid an expensive // sleep/wakeup by spinning for a shortwhile. // // For more details please see the detailed documentation on // e.LockUser/e.UnlockUser methods. // // +stateify savable type Endpoint struct { stack.TCPEndpointStateInner stack.TransportEndpointInfo tcpip.DefaultSocketOptionsHandler // EndpointEntry is used to queue endpoints for processing to the // a given tcp processor goroutine. // // Precondition: epQueue.mu must be held to read/write this field.. endpointEntry `state:"nosave"` // pendingProcessingMu protects pendingProcessing. pendingProcessingMu sync.Mutex `state:"nosave"` // pendingProcessing is true if this endpoint is queued for processing // to a TCP processor. // +checklocks:pendingProcessingMu pendingProcessing bool `state:"nosave"` // The following fields are initialized at creation time and do not // change throughout the lifetime of the endpoint. 
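	// Examples are stack, protocol and waiterQueue below, which are therefore
	// safe to read without holding e.mu.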
stack *stack.Stack `state:"manual"` protocol *protocol `state:"manual"` waiterQueue *waiter.Queue `state:"wait"` // hardError is meaningful only when state is stateError. It stores the // error to be returned when read/write syscalls are called and the // endpoint is in this state. hardError is protected by endpoint mu. hardError tcpip.Error // lastError represents the last error that the endpoint reported; // access to it is protected by the following mutex. lastErrorMu sync.Mutex `state:"nosave"` lastError tcpip.Error rcvQueueMu sync.Mutex `state:"nosave"` // +checklocks:rcvQueueMu stack.TCPRcvBufState // rcvMemUsed tracks the total amount of memory in use by received segments // held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to // compute the window and the actual available buffer space. This is distinct // from rcvBufUsed above which is the actual number of payload bytes held in // the buffer not including any segment overheads. rcvMemUsed atomicbitops.Int32 // mu protects all endpoint fields unless documented otherwise. mu must // be acquired before interacting with the endpoint fields. // // During handshake, mu is locked by the protocol listen goroutine and // released by the handshake completion goroutine. mu sync.CrossGoroutineMutex `state:"nosave"` ownedByUser atomicbitops.Uint32 // rcvQueue is the queue for ready-for-delivery segments. // // +checklocks:mu rcvQueue segmentList `state:"wait"` // state must be read/set using the EndpointState()/setEndpointState() // methods. state atomicbitops.Uint32 `state:".(EndpointState)"` // connectionDirectionState holds current state of send and receive, // accessed atomically connectionDirectionState atomicbitops.Uint32 // origEndpointState is only used during a restore phase to save the // endpoint state at restore time as the socket is moved to it's correct // state. origEndpointState uint32 `state:"nosave"` isPortReserved bool `state:"manual"` isRegistered bool `state:"manual"` boundNICID tcpip.NICID route *stack.Route `state:"manual"` ipv4TTL uint8 ipv6HopLimit int16 isConnectNotified bool // h stores a reference to the current handshake state if the endpoint is in // the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep. // nil otherwise. // +checklocks:mu h *handshake // portFlags stores the current values of port related flags. portFlags ports.Flags // Values used to reserve a port or register a transport endpoint // (which ever happens first). boundBindToDevice tcpip.NICID boundPortFlags ports.Flags boundDest tcpip.FullAddress // effectiveNetProtos contains the network protocols actually in use. In // most cases it will only contain "netProto", but in cases like IPv6 // endpoints with v6only set to false, this could include multiple // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped // address). effectiveNetProtos []tcpip.NetworkProtocolNumber // recentTSTime is the unix time when we last updated // TCPEndpointStateInner.RecentTS. recentTSTime tcpip.MonotonicTime // shutdownFlags represent the current shutdown state of the endpoint. shutdownFlags tcpip.ShutdownFlags // tcpRecovery is the loss recovery algorithm used by TCP. tcpRecovery tcpip.TCPRecovery // sack holds TCP SACK related information for this endpoint. sack SACKInfo // delay enables Nagle's algorithm. // // delay is a boolean (0 is false) and must be accessed atomically. 
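	// Clearing the delay option flushes any held-back data; see
	// OnDelayOptionSet.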
delay uint32 // scoreboard holds TCP SACK Scoreboard information for this endpoint. scoreboard *SACKScoreboard // segmentQueue is used to hand received segments to the protocol // goroutine. Segments are queued as long as the queue is not full, // and dropped when it is. segmentQueue segmentQueue `state:"wait"` // userMSS if non-zero is the MSS value explicitly set by the user // for this endpoint using the TCP_MAXSEG setsockopt. userMSS uint16 // maxSynRetries is the maximum number of SYN retransmits that TCP should // send before aborting the attempt to connect. It cannot exceed 255. // // NOTE: This is currently a no-op and does not change the SYN // retransmissions. maxSynRetries uint8 // windowClamp is used to bound the size of the advertised window to // this value. windowClamp uint32 // sndQueueInfo contains the implementation of the endpoint's send queue. sndQueueInfo sndQueueInfo // cc stores the name of the Congestion Control algorithm to use for // this endpoint. cc tcpip.CongestionControlOption // keepalive manages TCP keepalive state. When the connection is idle // (no data sent or received) for keepaliveIdle, we start sending // keepalives every keepalive.interval. If we send keepalive.count // without hearing a response, the connection is closed. keepalive keepalive // userTimeout if non-zero specifies a user specified timeout for // a connection w/ pending data to send. A connection that has pending // unacked data will be forcibily aborted if the timeout is reached // without any data being acked. userTimeout time.Duration // deferAccept if non-zero specifies a user specified time during // which the final ACK of a handshake will be dropped provided the // ACK is a bare ACK and carries no data. If the timeout is crossed then // the bare ACK is accepted and the connection is delivered to the // listener. deferAccept time.Duration // acceptMu protects accepQueue acceptMu sync.Mutex `state:"nosave"` // acceptQueue is used by a listening endpoint to send newly accepted // connections to the endpoint so that they can be read by Accept() // calls. // // +checklocks:acceptMu acceptQueue acceptQueue // The following are only used from the protocol goroutine, and // therefore don't need locks to protect them. rcv *receiver `state:"wait"` snd *sender `state:"wait"` // The goroutine drain completion notification channel. drainDone chan struct{} `state:"nosave"` // The goroutine undrain notification channel. This is currently used as // a way to block the worker goroutines. Today nothing closes/writes // this channel and this causes any goroutines waiting on this to just // block. This is used during save/restore to prevent worker goroutines // from mutating state as it's being saved. undrain chan struct{} `state:"nosave"` // probe if not nil is invoked on every received segment. It is passed // a copy of the current state of the endpoint. probe stack.TCPProbeFunc `state:"nosave"` // The following are only used to assist the restore run to re-connect. connectingAddress tcpip.Address // amss is the advertised MSS to the peer by this endpoint. amss uint16 // sendTOS represents IPv4 TOS or IPv6 TrafficClass, // applied while sending packets. Defaults to 0 as on Linux. sendTOS uint8 gso stack.GSO stats Stats // tcpLingerTimeout is the maximum amount of a time a socket // a socket stays in TIME_WAIT state before being marked // closed. 
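	// The same duration is used for the FIN-WAIT-2 timer started in
	// closeNoShutdownLocked once the socket has been fully closed.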
tcpLingerTimeout time.Duration // closed indicates that the user has called closed on the // endpoint and at this point the endpoint is only around // to complete the TCP shutdown. closed bool // txHash is the transport layer hash to be set on outbound packets // emitted by this endpoint. txHash uint32 // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner // ops is used to get socket level options. ops tcpip.SocketOptions // lastOutOfWindowAckTime is the time at which the an ACK was sent in response // to an out of window segment being received by this endpoint. lastOutOfWindowAckTime tcpip.MonotonicTime // finWait2Timer is used to reap orphaned sockets in FIN-WAIT-2 where the peer // is yet to send a FIN but on our end the socket is fully closed i.e. endpoint.Close() // has been called on the socket. This timer is not started for sockets that // are waiting for a peer FIN but are not closed. finWait2Timer tcpip.Timer `state:"nosave"` // timeWaitTimer is used to reap a socket once a socket has been in TIME-WAIT state // for tcp.DefaultTCPTimeWaitTimeout seconds. timeWaitTimer tcpip.Timer `state:"nosave"` // listenCtx is used by listening endpoints to store state used while listening for // connections. Nil otherwise. listenCtx *listenContext `state:"nosave"` // limRdr is reused to avoid allocations. // // +checklocks:mu limRdr *io.LimitedReader `state:"nosave"` // pmtud is the PMTUD strategy to use. // // +checklocks:mu pmtud tcpip.PMTUDStrategy } // calculateAdvertisedMSS calculates the MSS to advertise. // // If userMSS is non-zero and is not greater than the maximum possible MSS for // r, it will be used; otherwise, the maximum possible MSS will be used. func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { // The maximum possible MSS is dependent on the route. // TODO(b/143359391): Respect TCP Min and Max size. maxMSS := uint16(r.MTU() - header.TCPMinimumSize) if userMSS != 0 && userMSS < maxMSS { return userMSS } return maxMSS } // isOwnedByUser() returns true if the endpoint lock is currently // held by a user(syscall) goroutine. func (e *Endpoint) isOwnedByUser() bool { return e.ownedByUser.Load() == 1 } // LockUser tries to lock e.mu and if it fails it will check if the lock is held // by another syscall goroutine. If yes, then it will goto sleep waiting for the // lock to be released, if not then it will spin till it acquires the lock or // another syscall goroutine acquires it in which case it will goto sleep as // described above. // // The assumption behind spinning here being that background packet processing // should not be holding the lock for long and spinning reduces latency as we // avoid an expensive sleep/wakeup of the syscall goroutine). // +checklocksacquire:e.mu func (e *Endpoint) LockUser() { const iterations = 5 for i := 0; i < iterations; i++ { // Try first if the sock is locked then check if it's owned // by another user goroutine if not then we spin, otherwise // we just go to sleep on the Lock() and wait. if !e.TryLock() { // If socket is owned by the user then just go to sleep // as the lock could be held for a reasonably long time. if e.ownedByUser.Load() == 1 { e.mu.Lock() e.ownedByUser.Store(1) return } // Spin but don't yield the processor since the lower half // should yield the lock soon. 
continue } e.ownedByUser.Store(1) return } for i := 0; i < iterations; i++ { // Try first if the sock is locked then check if it's owned // by another user goroutine if not then we spin, otherwise // we just go to sleep on the Lock() and wait. if !e.TryLock() { // If socket is owned by the user then just go to sleep // as the lock could be held for a reasonably long time. if e.ownedByUser.Load() == 1 { e.mu.Lock() e.ownedByUser.Store(1) return } // Spin but yield the processor since the lower half // should yield the lock soon. runtime.Gosched() continue } e.ownedByUser.Store(1) return } // Finally just give up and wait for the Lock. e.mu.Lock() e.ownedByUser.Store(1) } // UnlockUser will check if there are any segments already queued for processing // and wake up a processor goroutine to process them before unlocking e.mu. // This is required because we when packets arrive and endpoint lock is already // held then such packets are queued up to be processed. // // Precondition: e.LockUser() must have been called before calling e.UnlockUser() // +checklocksrelease:e.mu func (e *Endpoint) UnlockUser() { // Lock segment queue before checking so that we avoid a race where // segments can be queued between the time we check if queue is empty // and actually unlock the endpoint mutex. e.segmentQueue.mu.Lock() if e.segmentQueue.emptyLocked() { if e.ownedByUser.Swap(0) != 1 { panic("e.UnlockUser() called without calling e.LockUser()") } e.mu.Unlock() e.segmentQueue.mu.Unlock() return } e.segmentQueue.mu.Unlock() // Since we are waking the processor goroutine here just unlock // and let it process the queued segments. if e.ownedByUser.Swap(0) != 1 { panic("e.UnlockUser() called without calling e.LockUser()") } processor := e.protocol.dispatcher.selectProcessor(e.ID) e.mu.Unlock() // Wake up the processor for this endpoint to process any queued // segments after releasing the lock to avoid the case where if the // processor goroutine starts running before we release the lock here // then it will fail to process as TryLock() will fail. processor.queueEndpoint(e) return } // StopWork halts packet processing. Only to be used in tests. // +checklocksacquire:e.mu func (e *Endpoint) StopWork() { e.mu.Lock() } // ResumeWork resumes packet processing. Only to be used in tests. // +checklocksrelease:e.mu func (e *Endpoint) ResumeWork() { e.mu.Unlock() } // AssertLockHeld forces the checklocks analyzer to consider e.mu held. This is // used in places where we know that e.mu is held, but checklocks does not, // which can happen when creating new locked objects. You must pass the known // locked endpoint to this function and it must be the same as the caller // endpoint. // TODO(b/226403629): Remove this function once checklocks understands local // variable locks. // +checklocks:locked.mu // +checklocksacquire:e.mu func (e *Endpoint) AssertLockHeld(locked *Endpoint) { if e != locked { panic("AssertLockHeld failed: locked endpoint != asserting endpoint") } } // TryLock is a helper that calls TryLock on the endpoint's mutex and // adds the necessary checklocks annotations. // TODO(b/226403629): Remove this once checklocks understands TryLock. // +checklocksacquire:e.mu func (e *Endpoint) TryLock() bool { if e.mu.TryLock() { return true // +checklocksforce } return false // +checklocksignore } // setEndpointState updates the state of the endpoint to state atomically. 
This // method is unexported as the only place we should update the state is in this // package but we allow the state to be read freely without holding e.mu. // // +checklocks:e.mu func (e *Endpoint) setEndpointState(state EndpointState) { oldstate := EndpointState(e.state.Swap(uint32(state))) switch state { case StateEstablished: e.stack.Stats().TCP.CurrentEstablished.Increment() e.stack.Stats().TCP.CurrentConnected.Increment() case StateError: fallthrough case StateClose: if oldstate == StateCloseWait || oldstate == StateEstablished { e.stack.Stats().TCP.EstablishedResets.Increment() } if oldstate.connected() { e.stack.Stats().TCP.CurrentConnected.Decrement() } fallthrough default: if oldstate == StateEstablished { e.stack.Stats().TCP.CurrentEstablished.Decrement() } } } // EndpointState returns the current state of the endpoint. func (e *Endpoint) EndpointState() EndpointState { return EndpointState(e.state.Load()) } // setRecentTimestamp sets the recentTS field to the provided value. func (e *Endpoint) setRecentTimestamp(recentTS uint32) { e.RecentTS = recentTS e.recentTSTime = e.stack.Clock().NowMonotonic() } // recentTimestamp returns the value of the recentTS field. func (e *Endpoint) recentTimestamp() uint32 { return e.RecentTS } // TODO(gvisor.dev/issue/6974): Remove once tcp endpoints are composed with a // network.Endpoint, which also defines this function. func calculateTTL(route *stack.Route, ipv4TTL uint8, ipv6HopLimit int16) uint8 { switch netProto := route.NetProto(); netProto { case header.IPv4ProtocolNumber: if ipv4TTL == tcpip.UseDefaultIPv4TTL { return route.DefaultTTL() } return ipv4TTL case header.IPv6ProtocolNumber: if ipv6HopLimit == tcpip.UseDefaultIPv6HopLimit { return route.DefaultTTL() } return uint8(ipv6HopLimit) default: panic(fmt.Sprintf("invalid protocol number = %d", netProto)) } } // keepalive is a synchronization wrapper used to appease stateify. See the // comment in endpoint, where it is used. // // +stateify savable type keepalive struct { sync.Mutex `state:"nosave"` idle time.Duration interval time.Duration count int unacked int // should never be a zero timer if the endpoint is not closed. timer timer `state:"nosave"` waker sleep.Waker `state:"nosave"` } func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *Endpoint { e := &Endpoint{ stack: s, protocol: protocol, TransportEndpointInfo: stack.TransportEndpointInfo{ NetProto: netProto, TransProto: header.TCPProtocolNumber, }, sndQueueInfo: sndQueueInfo{ TCPSndBufState: stack.TCPSndBufState{ SndMTU: math.MaxInt32, }, }, waiterQueue: waiterQueue, state: atomicbitops.FromUint32(uint32(StateInitial)), keepalive: keepalive{ idle: DefaultKeepaliveIdle, interval: DefaultKeepaliveInterval, count: DefaultKeepaliveCount, }, ipv4TTL: tcpip.UseDefaultIPv4TTL, ipv6HopLimit: tcpip.UseDefaultIPv6HopLimit, // txHash only determines which outgoing queue to use, so // InsecureRNG is fine. 
txHash: s.InsecureRNG().Uint32(), windowClamp: DefaultReceiveBufferSize, maxSynRetries: DefaultSynRetries, limRdr: &io.LimitedReader{}, } e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits) e.ops.SetMulticastLoop(true) e.ops.SetQuickAck(true) e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */) e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */) var ss tcpip.TCPSendBufferSizeRangeOption if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil { e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs tcpip.TCPReceiveBufferSizeRangeOption if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil { e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) } var cs tcpip.CongestionControlOption if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil { e.cc = cs } var mrb tcpip.TCPModerateReceiveBufferOption if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil { e.RcvAutoParams.Disabled = !bool(mrb) } var de tcpip.TCPDelayEnabled if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de { e.ops.SetDelayOption(true) } var tcpLT tcpip.TCPLingerTimeoutOption if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil { e.tcpLingerTimeout = time.Duration(tcpLT) } var synRetries tcpip.TCPSynRetriesOption if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil { e.maxSynRetries = uint8(synRetries) } if p := s.GetTCPProbe(); p != nil { e.probe = p } e.segmentQueue.ep = e // TODO(https://gvisor.dev/issues/7493): Defer creating the timer until TCP connection becomes // established. e.keepalive.timer.init(e.stack.Clock(), timerHandler(e, e.keepaliveTimerExpired)) return e } // Readiness returns the current readiness of the endpoint. For example, if // waiter.EventIn is set, the endpoint is immediately readable. func (e *Endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { result := waiter.EventMask(0) switch e.EndpointState() { case StateInitial, StateBound: // This prevents blocking of new sockets which are not // connected when SO_LINGER is set. result |= waiter.EventHUp case StateConnecting, StateSynSent, StateSynRecv: // Ready for nothing. case StateClose, StateError, StateTimeWait: // Ready for anything. result = mask case StateListen: // Check if there's anything in the accepted queue. if (mask & waiter.ReadableEvents) != 0 { e.acceptMu.Lock() if e.acceptQueue.endpoints.Len() != 0 { result |= waiter.ReadableEvents } e.acceptMu.Unlock() } } if e.EndpointState().connected() { // Determine if the endpoint is writable if requested. if (mask & waiter.WritableEvents) != 0 { e.sndQueueInfo.sndQueueMu.Lock() sndBufSize := e.getSendBufferSize() if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize { result |= waiter.WritableEvents } if e.sndQueueInfo.SndClosed { e.updateConnDirectionState(connDirectionStateSndClosed) } e.sndQueueInfo.sndQueueMu.Unlock() } // Determine if the endpoint is readable if requested. if (mask & waiter.ReadableEvents) != 0 { e.rcvQueueMu.Lock() if e.RcvBufUsed > 0 || e.RcvClosed { result |= waiter.ReadableEvents } if e.RcvClosed { e.updateConnDirectionState(connDirectionStateRcvClosed) } e.rcvQueueMu.Unlock() } } // Determine whether endpoint is half-closed with rcv shutdown if e.connDirectionState() == connDirectionStateRcvClosed { result |= waiter.EventRdHUp } return result } // Purging pending rcv segments is only necessary on RST. 
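// purgePendingRcvQueue drops any out-of-order segments still held in the
// receiver's pendingRcvdSegments heap.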
func (e *Endpoint) purgePendingRcvQueue() { if e.rcv != nil { for e.rcv.pendingRcvdSegments.Len() > 0 { s := heap.Pop(&e.rcv.pendingRcvdSegments).(*segment) s.DecRef() } } } // +checklocks:e.mu func (e *Endpoint) purgeReadQueue() { if e.rcv != nil { e.rcvQueueMu.Lock() defer e.rcvQueueMu.Unlock() for { s := e.rcvQueue.Front() if s == nil { break } e.rcvQueue.Remove(s) s.DecRef() } e.RcvBufUsed = 0 } } // +checklocks:e.mu func (e *Endpoint) purgeWriteQueue() { if e.snd != nil { e.sndQueueInfo.sndQueueMu.Lock() defer e.sndQueueInfo.sndQueueMu.Unlock() e.snd.updateWriteNext(nil) for { s := e.snd.writeList.Front() if s == nil { break } e.snd.writeList.Remove(s) s.DecRef() } e.sndQueueInfo.SndBufUsed = 0 e.sndQueueInfo.SndClosed = true } } // Abort implements stack.TransportEndpoint.Abort. func (e *Endpoint) Abort() { defer e.drainClosingSegmentQueue() e.LockUser() defer e.UnlockUser() defer e.purgeReadQueue() // Reset all connected endpoints. switch state := e.EndpointState(); { case state.connected(): e.resetConnectionLocked(&tcpip.ErrAborted{}) e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) return } e.closeLocked() } // Close puts the endpoint in a closed state and frees all resources associated // with it. It must be called only once and with no other concurrent calls to // the endpoint. func (e *Endpoint) Close() { e.LockUser() if e.closed { e.UnlockUser() return } // We always want to purge the read queue, but do so after the checks in // shutdownLocked. e.closeLocked() e.purgeReadQueue() if e.EndpointState() == StateClose || e.EndpointState() == StateError { // It should be safe to purge the read queue now as the endpoint // is now closed or in an error state and further reads are not // permitted. e.UnlockUser() e.drainClosingSegmentQueue() e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) return } e.UnlockUser() } // +checklocks:e.mu func (e *Endpoint) closeLocked() { linger := e.SocketOptions().GetLinger() if linger.Enabled && linger.Timeout == 0 { s := e.EndpointState() isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv if isResetState { // Close the endpoint without doing full shutdown and // send a RST. e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) return } } // Issue a shutdown so that the peer knows we won't send any more data // if we're connected, or stop accepting if we're listening. e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) e.closeNoShutdownLocked() } // closeNoShutdown closes the endpoint without doing a full shutdown. // +checklocks:e.mu func (e *Endpoint) closeNoShutdownLocked() { // For listening sockets, we always release ports inline so that they // are immediately available for reuse after Close() is called. If also // registered, we unregister as well otherwise the next user would fail // in Listen() when trying to register. 
if e.EndpointState() == StateListen && e.isPortReserved { if e.isRegistered { e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) e.isRegistered = false } portRes := ports.Reservation{ Networks: e.effectiveNetProtos, Transport: ProtocolNumber, Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: e.TransportEndpointInfo.ID.LocalPort, Flags: e.boundPortFlags, BindToDevice: e.boundBindToDevice, Dest: e.boundDest, } e.stack.ReleasePort(portRes) e.isPortReserved = false e.boundBindToDevice = 0 e.boundPortFlags = ports.Flags{} e.boundDest = tcpip.FullAddress{} } // Mark endpoint as closed. e.closed = true tcpip.AddDanglingEndpoint(e) eventMask := waiter.ReadableEvents | waiter.WritableEvents switch e.EndpointState() { case StateInitial, StateBound, StateListen: e.setEndpointState(StateClose) fallthrough case StateClose, StateError: eventMask |= waiter.EventHUp e.cleanupLocked() case StateConnecting, StateSynSent, StateSynRecv: // Abort the handshake and set the error. // Notify that the endpoint is closed. eventMask |= waiter.EventHUp e.handshakeFailed(&tcpip.ErrAborted{}) // Notify that the endpoint is closed. eventMask |= waiter.EventHUp case StateFinWait2: // The socket has been closed and we are in FIN-WAIT-2 so start // the FIN-WAIT-2 timer. if e.finWait2Timer == nil { e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired) } } e.waiterQueue.Notify(eventMask) } // closePendingAcceptableConnections closes all connections that have completed // handshake but not yet been delivered to the application. func (e *Endpoint) closePendingAcceptableConnectionsLocked() { e.acceptMu.Lock() pendingEndpoints := e.acceptQueue.pendingEndpoints e.acceptQueue.pendingEndpoints = nil completedEndpoints := make([]*Endpoint, 0, e.acceptQueue.endpoints.Len()) for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() { completedEndpoints = append(completedEndpoints, n.Value.(*Endpoint)) } e.acceptQueue.endpoints.Init() e.acceptQueue.capacity = 0 e.acceptMu.Unlock() // Close any endpoints in SYN-RCVD state. for n := range pendingEndpoints { n.Abort() } // Reset all connections that are waiting to be accepted. for _, n := range completedEndpoints { n.Abort() } } // cleanupLocked frees all resources associated with the endpoint. // +checklocks:e.mu func (e *Endpoint) cleanupLocked() { if e.snd != nil { e.snd.resendTimer.cleanup() e.snd.probeTimer.cleanup() e.snd.reorderTimer.cleanup() e.snd.corkTimer.cleanup() } if e.finWait2Timer != nil { e.finWait2Timer.Stop() } if e.timeWaitTimer != nil { e.timeWaitTimer.Stop() } // Close all endpoints that might have been accepted by TCP but not by // the client. 
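	// This aborts both connections still in SYN-RCVD and connections already
	// sitting in the accept queue.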
e.closePendingAcceptableConnectionsLocked() e.keepalive.timer.cleanup() if e.isRegistered { e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) e.isRegistered = false } if e.isPortReserved { portRes := ports.Reservation{ Networks: e.effectiveNetProtos, Transport: ProtocolNumber, Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: e.TransportEndpointInfo.ID.LocalPort, Flags: e.boundPortFlags, BindToDevice: e.boundBindToDevice, Dest: e.boundDest, } e.stack.ReleasePort(portRes) e.isPortReserved = false } e.boundBindToDevice = 0 e.boundPortFlags = ports.Flags{} e.boundDest = tcpip.FullAddress{} if e.route != nil { e.route.Release() e.route = nil } e.purgeWriteQueue() // Only purge the read queue here if the socket is fully closed by the // user. if e.closed { e.purgeReadQueue() } e.stack.CompleteTransportEndpointCleanup(e) tcpip.DeleteDanglingEndpoint(e) } // wndFromSpace returns the window that we can advertise based on the available // receive buffer space. func wndFromSpace(space int) int { return space >> rcvAdvWndScale } // initialReceiveWindow returns the initial receive window to advertise in the // SYN/SYN-ACK. func (e *Endpoint) initialReceiveWindow() int { rcvWnd := wndFromSpace(e.receiveBufferAvailable()) if rcvWnd > math.MaxUint16 { rcvWnd = math.MaxUint16 } // Use the user supplied MSS, if available. routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2 if rcvWnd > routeWnd { rcvWnd = routeWnd } rcvWndScale := e.rcvWndScaleForHandshake() // Round-down the rcvWnd to a multiple of wndScale. This ensures that the // window offered in SYN won't be reduced due to the loss of precision if // window scaling is enabled after the handshake. rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale) // Ensure we can always accept at least 1 byte if the scale specified // was too high for the provided rcvWnd. if rcvWnd == 0 { rcvWnd = 1 } return rcvWnd } // ModerateRecvBuf adjusts the receive buffer and the advertised window // based on the number of bytes copied to userspace. func (e *Endpoint) ModerateRecvBuf(copied int) { e.LockUser() defer e.UnlockUser() sendNonZeroWindowUpdate := false e.rcvQueueMu.Lock() if e.RcvAutoParams.Disabled { e.rcvQueueMu.Unlock() return } now := e.stack.Clock().NowMonotonic() if rtt := e.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.RcvAutoParams.MeasureTime) < rtt { e.RcvAutoParams.CopiedBytes += copied e.rcvQueueMu.Unlock() return } prevRTTCopied := e.RcvAutoParams.CopiedBytes + copied prevCopied := e.RcvAutoParams.PrevCopiedBytes rcvWnd := 0 if prevRTTCopied > prevCopied { // The minimal receive window based on what was copied by the app // in the immediate preceding RTT and some extra buffer for 16 // segments to account for variations. // We multiply by 2 to account for packet losses. rcvWnd = prevRTTCopied*2 + 16*int(e.amss) // Scale for slow start based on bytes copied in this RTT vs previous. grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied // Multiply growth factor by 2 again to account for sender being // in slow-start where the sender grows it's congestion window // by 100% per RTT. rcvWnd += grow * 2 // Make sure auto tuned buffer size can always receive upto 2x // the initial window of 10 segments. if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd { rcvWnd = minRcvWnd } // Cap the auto tuned buffer size by the maximum permissible // receive buffer size. 
if max := e.maxReceiveBufferSize(); rcvWnd > max { rcvWnd = max } // We do not adjust downwards as that can cause the receiver to // reject valid data that might already be in flight as the // acceptable window will shrink. rcvBufSize := int(e.ops.GetReceiveBufferSize()) if rcvWnd > rcvBufSize { availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize)) e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */) availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd)) if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above { sendNonZeroWindowUpdate = true } } // We only update PrevCopiedBytes when we grow the buffer because in cases // where PrevCopiedBytes > prevRTTCopied the existing buffer is already big // enough to handle the current rate and we don't need to do any // adjustments. e.RcvAutoParams.PrevCopiedBytes = prevRTTCopied } e.RcvAutoParams.MeasureTime = now e.RcvAutoParams.CopiedBytes = 0 e.rcvQueueMu.Unlock() // Send the update after unlocking rcvQueueMu as sending a segment acquires // the lock to calculate the window to be sent. if e.EndpointState().connected() && sendNonZeroWindowUpdate { e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu } } // SetOwner implements tcpip.Endpoint.SetOwner. func (e *Endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } // +checklocks:e.mu func (e *Endpoint) hardErrorLocked() tcpip.Error { err := e.hardError e.hardError = nil return err } // +checklocks:e.mu func (e *Endpoint) lastErrorLocked() tcpip.Error { e.lastErrorMu.Lock() defer e.lastErrorMu.Unlock() err := e.lastError e.lastError = nil return err } // LastError implements tcpip.Endpoint.LastError. func (e *Endpoint) LastError() tcpip.Error { e.LockUser() defer e.UnlockUser() if err := e.hardErrorLocked(); err != nil { return err } return e.lastErrorLocked() } // LastErrorLocked reads and clears lastError. // Only to be used in tests. // +checklocks:e.mu func (e *Endpoint) LastErrorLocked() tcpip.Error { return e.lastErrorLocked() } // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError. func (e *Endpoint) UpdateLastError(err tcpip.Error) { e.LockUser() e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() e.UnlockUser() } // Read implements tcpip.Endpoint.Read. func (e *Endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { e.LockUser() defer e.UnlockUser() if err := e.checkReadLocked(); err != nil { if _, ok := err.(*tcpip.ErrClosedForReceive); ok { e.stats.ReadErrors.ReadClosed.Increment() } return tcpip.ReadResult{}, err } var err error done := 0 // N.B. Here we get the first segment to be processed. It is safe to not // hold rcvQueueMu when processing, since we hold e.mu to ensure we only // remove segments from the list through Read() and that new segments // cannot be appended. s := e.rcvQueue.Front() for s != nil { var n int n, err = s.ReadTo(dst, opts.Peek) // Book keeping first then error handling. done += n if opts.Peek { s = s.Next() } else { sendNonZeroWindowUpdate := false memDelta := 0 for { seg := e.rcvQueue.Front() if seg == nil || seg.payloadSize() != 0 { break } e.rcvQueue.Remove(seg) // Memory is only considered released when the whole segment has been // read. 
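				// Only segments whose payload has been fully consumed
				// (payloadSize() == 0) reach this point; a partially read
				// segment remains at the front of rcvQueue.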
memDelta += seg.segMemSize() seg.DecRef() } e.rcvQueueMu.Lock() e.RcvBufUsed -= n s = e.rcvQueue.Front() if memDelta > 0 { // If the window was small before this read and if the read freed up // enough buffer space, to either fit an aMSS or half a receive buffer // (whichever smaller), then notify the protocol goroutine to send a // window update. if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above { sendNonZeroWindowUpdate = true } } e.rcvQueueMu.Unlock() if e.EndpointState().connected() && sendNonZeroWindowUpdate { e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu } } if err != nil { break } } // If something is read, we must report it. Report error when nothing is read. if done == 0 && err != nil { return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{} } return tcpip.ReadResult{ Count: done, Total: done, }, nil } // checkRead checks that endpoint is in a readable state. // // +checklocks:e.mu func (e *Endpoint) checkReadLocked() tcpip.Error { e.rcvQueueMu.Lock() defer e.rcvQueueMu.Unlock() // When in SYN-SENT state, let the caller block on the receive. // An application can initiate a non-blocking connect and then block // on a receive. It can expect to read any data after the handshake // is complete. RFC793, section 3.9, p58. if e.EndpointState() == StateSynSent { return &tcpip.ErrWouldBlock{} } // The endpoint can be read if it's connected, or if it's already closed // but has some pending unread data. Also note that a RST being received // would cause the state to become StateError so we should allow the // reads to proceed before returning a ECONNRESET. bufUsed := e.RcvBufUsed if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 { if s == StateError { if err := e.hardErrorLocked(); err != nil { return err } return &tcpip.ErrClosedForReceive{} } e.stats.ReadErrors.NotConnected.Increment() return &tcpip.ErrNotConnected{} } if e.RcvBufUsed == 0 { if e.RcvClosed || !e.EndpointState().connected() { return &tcpip.ErrClosedForReceive{} } return &tcpip.ErrWouldBlock{} } return nil } // isEndpointWritableLocked checks if a given endpoint is writable // and also returns the number of bytes that can be written at this // moment. If the endpoint is not writable then it returns an error // indicating the reason why it's not writable. // +checklocks:e.mu // +checklocks:e.sndQueueInfo.sndQueueMu func (e *Endpoint) isEndpointWritableLocked() (int, tcpip.Error) { // The endpoint cannot be written to if it's not connected. switch s := e.EndpointState(); { case s == StateError: if err := e.hardErrorLocked(); err != nil { return 0, err } return 0, &tcpip.ErrClosedForSend{} case !s.connecting() && !s.connected(): return 0, &tcpip.ErrClosedForSend{} case s.connecting(): // As per RFC793, page 56, a send request arriving when in connecting // state, can be queued to be completed after the state becomes // connected. Return an error code for the caller of endpoint Write to // try again, until the connection handshake is complete. return 0, &tcpip.ErrWouldBlock{} } // Check if the connection has already been closed for sends. if e.sndQueueInfo.SndClosed { return 0, &tcpip.ErrClosedForSend{} } sndBufSize := e.getSendBufferSize() avail := sndBufSize - e.sndQueueInfo.SndBufUsed if avail <= 0 { return 0, &tcpip.ErrWouldBlock{} } return avail, nil } // readFromPayloader reads a slice from the Payloader. 
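// When opts.Atomic is false, both e.mu and sndQueueMu are released while the
// payload is copied in; callers must re-check writability afterwards (see
// queueSegment).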
// +checklocks:e.mu // +checklocks:e.sndQueueInfo.sndQueueMu func (e *Endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) (buffer.Buffer, tcpip.Error) { // We can release locks while copying data. // // This is not possible if atomic is set, because we can't allow the // available buffer space to be consumed by some other caller while we // are copying data in. limRdr := e.limRdr if !opts.Atomic { defer func() { e.limRdr = limRdr }() e.limRdr = nil e.sndQueueInfo.sndQueueMu.Unlock() defer e.sndQueueInfo.sndQueueMu.Lock() e.UnlockUser() defer e.LockUser() } // Fetch data. var payload buffer.Buffer if l := p.Len(); l < avail { avail = l } if avail == 0 { return payload, nil } if _, err := payload.WriteFromReaderAndLimitedReader(p, int64(avail), limRdr); err != nil { payload.Release() return buffer.Buffer{}, &tcpip.ErrBadBuffer{} } return payload, nil } // queueSegment reads data from the payloader and returns a segment to be sent. // +checklocks:e.mu func (e *Endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) { e.sndQueueInfo.sndQueueMu.Lock() defer e.sndQueueInfo.sndQueueMu.Unlock() avail, err := e.isEndpointWritableLocked() if err != nil { e.stats.WriteErrors.WriteClosed.Increment() return nil, 0, err } buf, err := e.readFromPayloader(p, opts, avail) if err != nil { return nil, 0, err } // Do not queue zero length segments. if buf.Size() == 0 { return nil, 0, nil } if !opts.Atomic { // Since we released locks in between it's possible that the // endpoint transitioned to a CLOSED/ERROR states so make // sure endpoint is still writable before trying to write. avail, err := e.isEndpointWritableLocked() if err != nil { e.stats.WriteErrors.WriteClosed.Increment() buf.Release() return nil, 0, err } // A simultaneous call to write on the socket can reduce avail. Discard // excess data copied if this is the case. if int64(avail) < buf.Size() { buf.Truncate(int64(avail)) } } // Add data to the send queue. size := int(buf.Size()) s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buf) e.sndQueueInfo.SndBufUsed += size e.snd.writeList.PushBack(s) return s, size, nil } // Write writes data to the endpoint's peer. func (e *Endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { // Linux completely ignores any address passed to sendto(2) for TCP sockets // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More // and opts.EndOfRecord are also ignored. e.LockUser() defer e.UnlockUser() // Return if either we didn't queue anything or if an error occurred while // attempting to queue data. nextSeg, n, err := e.queueSegment(p, opts) if n == 0 || err != nil { return 0, err } e.sendData(nextSeg) return int64(n), nil } // selectWindowLocked returns the new window without checking for shrinking or scaling // applied. // +checklocks:e.mu // +checklocks:e.rcvQueueMu func (e *Endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) { wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize)) maxWindow := wndFromSpace(rcvBufSize) wndFromUsedBytes := maxWindow - e.RcvBufUsed // We take the lesser of the wndFromAvailable and wndFromUsedBytes because in // cases where we receive a lot of small segments the segment overhead is a // lot higher and we can run out socket buffer space before we can fill the // previous window we advertised. 
In cases where we receive MSS sized or close // MSS sized segments we will probably run out of window space before we // exhaust receive buffer. newWnd := wndFromAvailable if newWnd > wndFromUsedBytes { newWnd = wndFromUsedBytes } if newWnd < 0 { newWnd = 0 } return seqnum.Size(newWnd) } // selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu. // +checklocks:e.mu func (e *Endpoint) selectWindow() (wnd seqnum.Size) { e.rcvQueueMu.Lock() wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize())) e.rcvQueueMu.Unlock() return wnd } // windowCrossedACKThresholdLocked checks if the receive window to be announced // would be under aMSS or under the window derived from half receive buffer, // whichever smaller. This is useful as a receive side silly window syndrome // prevention mechanism. If window grows to reasonable value, we should send ACK // to the sender to inform the rx space is now large. We also want ensure a // series of small read()'s won't trigger a flood of spurious tiny ACK's. // // For large receive buffers, the threshold is aMSS - once reader reads more // than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of // receive buffer size. This is chosen arbitrarily. // crossed will be true if the window size crossed the ACK threshold. // above will be true if the new window is >= ACK threshold and false // otherwise. // // +checklocks:e.mu // +checklocks:e.rcvQueueMu func (e *Endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) { newAvail := int(e.selectWindowLocked(rcvBufSize)) oldAvail := newAvail - deltaBefore if oldAvail < 0 { oldAvail = 0 } threshold := int(e.amss) // rcvBufFraction is the inverse of the fraction of receive buffer size that // is used to decide if the available buffer space is now above it. const rcvBufFraction = 2 if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold { threshold = wndThreshold } switch { case oldAvail < threshold && newAvail >= threshold: return true, true case oldAvail >= threshold && newAvail < threshold: return true, false } return false, false } // OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet. func (e *Endpoint) OnReuseAddressSet(v bool) { e.LockUser() e.portFlags.TupleOnly = v e.UnlockUser() } // OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet. func (e *Endpoint) OnReusePortSet(v bool) { e.LockUser() e.portFlags.LoadBalanced = v e.UnlockUser() } // OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet. func (e *Endpoint) OnKeepAliveSet(bool) { e.LockUser() e.resetKeepaliveTimer(true /* receivedData */) e.UnlockUser() } // OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet. func (e *Endpoint) OnDelayOptionSet(v bool) { if !v { e.LockUser() defer e.UnlockUser() // Handle delayed data. if e.EndpointState().connected() { e.sendData(nil /* next */) } } } // OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet. func (e *Endpoint) OnCorkOptionSet(v bool) { if !v { e.LockUser() defer e.UnlockUser() if e.snd != nil { e.snd.corkTimer.disable() } // Handle the corked data. if e.EndpointState().connected() { e.sendData(nil /* next */) } } } func (e *Endpoint) getSendBufferSize() int { return int(e.ops.GetSendBufferSize()) } // OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize. 
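// It clamps the requested size so that a non-zero window can still be
// advertised under the current window scale, disables receive buffer
// auto-tuning, and returns a postSet callback that sends a window update if
// the newly available space crosses the ACK threshold.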
func (e *Endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64, postSet func()) { e.LockUser() sendNonZeroWindowUpdate := false e.rcvQueueMu.Lock() // Make sure the receive buffer size allows us to send a // non-zero window size. scale := uint8(0) if e.rcv != nil { scale = e.rcv.RcvWndScale } if rcvBufSz>>scale == 0 { rcvBufSz = 1 << scale } availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz))) availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz))) e.RcvAutoParams.Disabled = true // Immediately send an ACK to uncork the sender silly window // syndrome prevetion, when our available space grows above aMSS // or half receive buffer, whichever smaller. if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above { sendNonZeroWindowUpdate = true } e.rcvQueueMu.Unlock() postSet = func() { e.LockUser() defer e.UnlockUser() if e.EndpointState().connected() && sendNonZeroWindowUpdate { e.rcv.nonZeroWindow() // +checklocksforce:e.rcv.ep.mu } } e.UnlockUser() return rcvBufSz, postSet } // OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize. func (e *Endpoint) OnSetSendBufferSize(sz int64) int64 { e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Store(1) return sz } // WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters. func (e *Endpoint) WakeupWriters() { e.LockUser() defer e.UnlockUser() sendBufferSize := e.getSendBufferSize() e.sndQueueInfo.sndQueueMu.Lock() notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1 e.sndQueueInfo.sndQueueMu.Unlock() if notify { e.waiterQueue.Notify(waiter.WritableEvents) } } // SetSockOptInt sets a socket option. func (e *Endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { // Lower 2 bits represents ECN bits. RFC 3168, section 23.1 const inetECNMask = 3 switch opt { case tcpip.KeepaliveCountOption: e.LockUser() e.keepalive.Lock() e.keepalive.count = v e.keepalive.Unlock() e.resetKeepaliveTimer(true /* receivedData */) e.UnlockUser() case tcpip.IPv4TOSOption: e.LockUser() // TODO(gvisor.dev/issue/995): ECN is not currently supported, // ignore the bits for now. e.sendTOS = uint8(v) & ^uint8(inetECNMask) e.UnlockUser() case tcpip.IPv6TrafficClassOption: e.LockUser() // TODO(gvisor.dev/issue/995): ECN is not currently supported, // ignore the bits for now. e.sendTOS = uint8(v) & ^uint8(inetECNMask) e.UnlockUser() case tcpip.MaxSegOption: userMSS := v if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS { return &tcpip.ErrInvalidOptionValue{} } e.LockUser() e.userMSS = uint16(userMSS) e.UnlockUser() case tcpip.MTUDiscoverOption: switch v := tcpip.PMTUDStrategy(v); v { case tcpip.PMTUDiscoveryWant, tcpip.PMTUDiscoveryDont, tcpip.PMTUDiscoveryDo: e.LockUser() e.pmtud = v e.UnlockUser() case tcpip.PMTUDiscoveryProbe: // We don't support a way to ignore MTU updates; it's // either on or it's off. 
return &tcpip.ErrNotSupported{} default: return &tcpip.ErrNotSupported{} } case tcpip.IPv4TTLOption: e.LockUser() e.ipv4TTL = uint8(v) e.UnlockUser() case tcpip.IPv6HopLimitOption: e.LockUser() e.ipv6HopLimit = int16(v) e.UnlockUser() case tcpip.TCPSynCountOption: if v < 1 || v > 255 { return &tcpip.ErrInvalidOptionValue{} } e.LockUser() e.maxSynRetries = uint8(v) e.UnlockUser() case tcpip.TCPWindowClampOption: if v == 0 { e.LockUser() switch e.EndpointState() { case StateClose, StateInitial: e.windowClamp = 0 e.UnlockUser() return nil default: e.UnlockUser() return &tcpip.ErrInvalidOptionValue{} } } var rs tcpip.TCPReceiveBufferSizeRangeOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil { if v < rs.Min/2 { v = rs.Min / 2 } } e.LockUser() e.windowClamp = uint32(v) e.UnlockUser() } return nil } // HasNIC returns true if the NICID is defined in the stack or id is 0. func (e *Endpoint) HasNIC(id int32) bool { return id == 0 || e.stack.HasNIC(tcpip.NICID(id)) } // SetSockOpt sets a socket option. func (e *Endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { switch v := opt.(type) { case *tcpip.KeepaliveIdleOption: e.LockUser() e.keepalive.Lock() e.keepalive.idle = time.Duration(*v) e.keepalive.Unlock() e.resetKeepaliveTimer(true /* receivedData */) e.UnlockUser() case *tcpip.KeepaliveIntervalOption: e.LockUser() e.keepalive.Lock() e.keepalive.interval = time.Duration(*v) e.keepalive.Unlock() e.resetKeepaliveTimer(true /* receivedData */) e.UnlockUser() case *tcpip.TCPUserTimeoutOption: e.LockUser() e.userTimeout = time.Duration(*v) e.UnlockUser() case *tcpip.CongestionControlOption: // Query the available cc algorithms in the stack and // validate that the specified algorithm is actually // supported in the stack. var avail tcpip.TCPAvailableCongestionControlOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil { return err } availCC := strings.Split(string(avail), " ") for _, cc := range availCC { if *v == tcpip.CongestionControlOption(cc) { e.LockUser() state := e.EndpointState() e.cc = *v switch state { case StateEstablished: if e.EndpointState() == state { e.snd.cc = e.snd.initCongestionControl(e.cc) } } e.UnlockUser() return nil } } // Linux returns ENOENT when an invalid congestion // control algorithm is specified. return &tcpip.ErrNoSuchFile{} case *tcpip.TCPLingerTimeoutOption: e.LockUser() switch { case *v < 0: // Same as effectively disabling TCPLinger timeout. *v = -1 case *v == 0: // Same as the stack default. var stackLingerTimeout tcpip.TCPLingerTimeoutOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil { panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err)) } *v = stackLingerTimeout case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout): // Cap it to Stack's default TCP_LINGER2 timeout. *v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout) default: } e.tcpLingerTimeout = time.Duration(*v) e.UnlockUser() case *tcpip.TCPDeferAcceptOption: e.LockUser() if time.Duration(*v) > MaxRTO { *v = tcpip.TCPDeferAcceptOption(MaxRTO) } e.deferAccept = time.Duration(*v) e.UnlockUser() case *tcpip.SocketDetachFilterOption: return nil default: return nil } return nil } // readyReceiveSize returns the number of bytes ready to be received. func (e *Endpoint) readyReceiveSize() (int, tcpip.Error) { e.LockUser() defer e.UnlockUser() // The endpoint cannot be in listen state. 
if e.EndpointState() == StateListen { return 0, &tcpip.ErrInvalidEndpointState{} } e.rcvQueueMu.Lock() defer e.rcvQueueMu.Unlock() return e.RcvBufUsed, nil } // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (e *Endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.KeepaliveCountOption: e.keepalive.Lock() v := e.keepalive.count e.keepalive.Unlock() return v, nil case tcpip.IPv4TOSOption: e.LockUser() v := int(e.sendTOS) e.UnlockUser() return v, nil case tcpip.IPv6TrafficClassOption: e.LockUser() v := int(e.sendTOS) e.UnlockUser() return v, nil case tcpip.MaxSegOption: // Linux only returns user_mss value if user_mss is set and the socket is // unconnected. Otherwise Linux returns the actual current MSS. Netstack // mimics the user_mss behavior, but otherwise just returns the defaultMSS // for now. v := header.TCPDefaultMSS e.LockUser() if state := e.EndpointState(); e.userMSS > 0 && (state.internal() || state == StateClose || state == StateListen) { v = int(e.userMSS) } e.UnlockUser() return v, nil case tcpip.MTUDiscoverOption: e.LockUser() v := e.pmtud e.UnlockUser() return int(v), nil case tcpip.ReceiveQueueSizeOption: return e.readyReceiveSize() case tcpip.IPv4TTLOption: e.LockUser() v := int(e.ipv4TTL) e.UnlockUser() return v, nil case tcpip.IPv6HopLimitOption: e.LockUser() v := int(e.ipv6HopLimit) e.UnlockUser() return v, nil case tcpip.TCPSynCountOption: e.LockUser() v := int(e.maxSynRetries) e.UnlockUser() return v, nil case tcpip.TCPWindowClampOption: e.LockUser() v := int(e.windowClamp) e.UnlockUser() return v, nil case tcpip.MulticastTTLOption: return 1, nil default: return -1, &tcpip.ErrUnknownProtocolOption{} } } func (e *Endpoint) getTCPInfo() tcpip.TCPInfoOption { info := tcpip.TCPInfoOption{} e.LockUser() if state := e.EndpointState(); state.internal() { info.State = tcpip.EndpointState(StateClose) } else { info.State = tcpip.EndpointState(state) } snd := e.snd if snd != nil { // We do not calculate RTT before sending the data packets. If // the connection did not send and receive data, then RTT will // be zero. snd.rtt.Lock() info.RTT = snd.rtt.TCPRTTState.SRTT info.RTTVar = snd.rtt.TCPRTTState.RTTVar snd.rtt.Unlock() info.RTO = snd.RTO info.CcState = snd.state info.SndSsthresh = uint32(snd.Ssthresh) info.SndCwnd = uint32(snd.SndCwnd) info.ReorderSeen = snd.rc.Reord } e.UnlockUser() return info } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 
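// Most options are read while holding the user lock; the keepalive settings
// are guarded by their own keepalive mutex instead.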
func (e *Endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { switch o := opt.(type) { case *tcpip.TCPInfoOption: *o = e.getTCPInfo() case *tcpip.KeepaliveIdleOption: e.keepalive.Lock() *o = tcpip.KeepaliveIdleOption(e.keepalive.idle) e.keepalive.Unlock() case *tcpip.KeepaliveIntervalOption: e.keepalive.Lock() *o = tcpip.KeepaliveIntervalOption(e.keepalive.interval) e.keepalive.Unlock() case *tcpip.TCPUserTimeoutOption: e.LockUser() *o = tcpip.TCPUserTimeoutOption(e.userTimeout) e.UnlockUser() case *tcpip.CongestionControlOption: e.LockUser() *o = e.cc e.UnlockUser() case *tcpip.TCPLingerTimeoutOption: e.LockUser() *o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout) e.UnlockUser() case *tcpip.TCPDeferAcceptOption: e.LockUser() *o = tcpip.TCPDeferAcceptOption(e.deferAccept) e.UnlockUser() case *tcpip.OriginalDestinationOption: e.LockUser() ipt := e.stack.IPTables() addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto, ProtocolNumber) e.UnlockUser() if err != nil { return err } *o = tcpip.OriginalDestinationOption{ Addr: addr, Port: port, } default: return &tcpip.ErrUnknownProtocolOption{} } return nil } // checkV4MappedLocked determines the effective network protocol and converts // addr to its canonical form. // +checklocks:e.mu func (e *Endpoint) checkV4MappedLocked(addr tcpip.FullAddress, bind bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only(), bind) if err != nil { return tcpip.FullAddress{}, 0, err } return unwrapped, netProto, nil } // Disconnect implements tcpip.Endpoint.Disconnect. func (*Endpoint) Disconnect() tcpip.Error { return &tcpip.ErrNotSupported{} } // Connect connects the endpoint to its peer. func (e *Endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { e.LockUser() defer e.UnlockUser() err := e.connect(addr, true) if err != nil { if !err.IgnoreStats() { // Connect failed. Let's wake up any waiters. e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() } } return err } // registerEndpoint registers the endpoint with the provided address. // // +checklocks:e.mu func (e *Endpoint) registerEndpoint(addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber, nicID tcpip.NICID) tcpip.Error { netProtos := []tcpip.NetworkProtocolNumber{netProto} if e.TransportEndpointInfo.ID.LocalPort != 0 { // The endpoint is bound to a port, attempt to register it. err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) if err != nil { return err } } else { // The endpoint doesn't have a local port yet, so try to get // one. Make sure that it isn't one that will result in the same // address/port for both local and remote (otherwise this // endpoint would be trying to connect to itself). 
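		// The port picker below may also reclaim a port from an endpoint that
		// has been in TIME-WAIT for at least one second, depending on the
		// stack's TCPTimeWaitReuseOption (globally or for loopback traffic
		// only).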
sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress var twReuse tcpip.TCPTimeWaitReuseOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil { panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err)) } reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly { switch netProto { case header.IPv4ProtocolNumber: reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress) case header.IPv6ProtocolNumber: reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback } } bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) if _, err := e.stack.PickEphemeralPort(e.stack.SecureRNG(), func(p uint16) (bool, tcpip.Error) { if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort { return false, nil } portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: p, Flags: e.portFlags, BindToDevice: bindToDevice, Dest: addr, } if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil { if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse { return false, nil } transEPID := e.TransportEndpointInfo.ID transEPID.LocalPort = p // Check if an endpoint is registered with demuxer in TIME-WAIT and if // we can reuse it. If we can't find a transport endpoint then we just // skip using this port as it's possible that either an endpoint has // bound the port but not registered with demuxer yet (no listen/connect // done yet) or the reservation was freed between the check above and // the FindTransportEndpoint below. But rather than retry the same port // we just skip it and move on. transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, nicID) if transEP == nil { // ReservePort failed but there is no registered endpoint with // demuxer. Which indicates there is at least some endpoint that has // bound the port. return false, nil } tcpEP := transEP.(*Endpoint) tcpEP.LockUser() // If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but // less than 1 second has elapsed since its recentTS was updated then // we cannot reuse the port. if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second { tcpEP.UnlockUser() return false, nil } // Since the endpoint is in TIME-WAIT it should be safe to acquire its // Lock while holding the lock for this endpoint as endpoints in // TIME-WAIT do not acquire locks on other endpoints. tcpEP.transitionToStateCloseLocked() tcpEP.drainClosingSegmentQueue() tcpEP.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) tcpEP.UnlockUser() // Now try and Reserve again if it fails then we skip. 
portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: p, Flags: e.portFlags, BindToDevice: bindToDevice, Dest: addr, } if _, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */); err != nil { return false, nil } } id := e.TransportEndpointInfo.ID id.LocalPort = p if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil { portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: p, Flags: e.portFlags, BindToDevice: bindToDevice, Dest: addr, } e.stack.ReleasePort(portRes) if _, ok := err.(*tcpip.ErrPortInUse); ok { return false, nil } return false, err } // Port picking successful. Save the details of // the selected port. e.TransportEndpointInfo.ID = id e.isPortReserved = true e.boundBindToDevice = bindToDevice e.boundPortFlags = e.portFlags e.boundDest = addr return true, nil }); err != nil { e.stack.Stats().TCP.FailedPortReservations.Increment() return err } } return nil } // connect connects the endpoint to its peer. // +checklocks:e.mu func (e *Endpoint) connect(addr tcpip.FullAddress, handshake bool) tcpip.Error { connectingAddr := addr.Addr addr, netProto, err := e.checkV4MappedLocked(addr, false /* bind */) if err != nil { return err } if e.EndpointState().connected() { // The endpoint is already connected. If caller hasn't been // notified yet, return success. if !e.isConnectNotified { e.isConnectNotified = true return nil } // Otherwise return that it's already connected. return &tcpip.ErrAlreadyConnected{} } nicID := addr.NIC switch e.EndpointState() { case StateBound: // If we're already bound to a NIC but the caller is requesting // that we use a different one now, we cannot proceed. if e.boundNICID == 0 { break } if nicID != 0 && nicID != e.boundNICID { return &tcpip.ErrHostUnreachable{} } nicID = e.boundNICID case StateInitial: // Nothing to do. We'll eventually fill-in the gaps in the ID (if any) // when we find a route. case StateConnecting, StateSynSent, StateSynRecv: // A connection request has already been issued but hasn't completed // yet. return &tcpip.ErrAlreadyConnecting{} case StateError: if err := e.hardErrorLocked(); err != nil { return err } return &tcpip.ErrConnectionAborted{} default: return &tcpip.ErrInvalidEndpointState{} } // Find a route to the desired destination. r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */) if err != nil { return err } defer r.Release() e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress() e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress() e.TransportEndpointInfo.ID.RemotePort = addr.Port oldState := e.EndpointState() e.setEndpointState(StateConnecting) if err := e.registerEndpoint(addr, netProto, r.NICID()); err != nil { e.setEndpointState(oldState) if _, ok := err.(*tcpip.ErrPortInUse); ok { return &tcpip.ErrBadLocalAddress{} } return err } e.isRegistered = true r.Acquire() e.route = r e.boundNICID = nicID e.effectiveNetProtos = []tcpip.NetworkProtocolNumber{netProto} e.connectingAddress = connectingAddr e.initGSO() // Connect in the restore phase does not perform handshake. Restore its // connection setting here. 
if !handshake { e.segmentQueue.mu.Lock() for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} { for s := l.Front(); s != nil; s = s.Next() { s.id = e.TransportEndpointInfo.ID e.sndQueueInfo.sndWaker.Assert() } } e.segmentQueue.mu.Unlock() e.snd.ep.AssertLockHeld(e) e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0) e.setEndpointState(StateEstablished) // Set the new auto tuned send buffer size after entering // established state. e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */) return &tcpip.ErrConnectStarted{} } // Start a new handshake. h := e.newHandshake() e.setEndpointState(StateSynSent) h.start() e.stack.Stats().TCP.ActiveConnectionOpenings.Increment() return &tcpip.ErrConnectStarted{} } // ConnectEndpoint is not supported. func (*Endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error { return &tcpip.ErrInvalidEndpointState{} } // Shutdown closes the read and/or write end of the endpoint connection to its // peer. func (e *Endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error { e.LockUser() defer e.UnlockUser() if e.EndpointState().connecting() { // When calling shutdown(2) on a connecting socket, the endpoint must // enter the error state. But this logic cannot belong to the shutdownLocked // method because that method is called during a close(2) (and closing a // connecting socket is not an error). e.handshakeFailed(&tcpip.ErrConnectionReset{}) e.waiterQueue.Notify(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr) return nil } return e.shutdownLocked(flags) } // +checklocks:e.mu func (e *Endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error { e.shutdownFlags |= flags switch { case e.EndpointState().connected(): // Close for read. if e.shutdownFlags&tcpip.ShutdownRead != 0 { // Mark read side as closed. e.rcvQueueMu.Lock() e.RcvClosed = true rcvBufUsed := e.RcvBufUsed e.rcvQueueMu.Unlock() // If we're fully closed and we have unread data we need to abort // the connection with a RST. if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 { e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) return nil } // Wake up any readers that maybe waiting for the stream to become // readable. events := waiter.ReadableEvents if e.shutdownFlags&tcpip.ShutdownWrite == 0 { // If ShutdownWrite is not set, write end won't close and // we end up with a half-closed connection events |= waiter.EventRdHUp } e.waiterQueue.Notify(events) } // Close for write. if e.shutdownFlags&tcpip.ShutdownWrite != 0 { e.sndQueueInfo.sndQueueMu.Lock() if e.sndQueueInfo.SndClosed { // Already closed. e.sndQueueInfo.sndQueueMu.Unlock() if e.EndpointState() == StateTimeWait { return &tcpip.ErrNotConnected{} } return nil } // Queue fin segment. s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), buffer.Buffer{}) e.snd.writeList.PushBack(s) // Mark endpoint as closed. e.sndQueueInfo.SndClosed = true e.sndQueueInfo.sndQueueMu.Unlock() // Drain the send queue. e.sendData(s) // Mark send side as closed. e.snd.Closed = true // Wake up any writers that maybe waiting for the stream to become // writable. e.waiterQueue.Notify(waiter.WritableEvents) } return nil case e.EndpointState() == StateListen: if e.shutdownFlags&tcpip.ShutdownRead != 0 { // Reset all connections from the accept queue and keep the // worker running so that it can continue handling incoming // segments by replying with RST. // // By not removing this endpoint from the demuxer mapping, we // ensure that any other bind to the same port fails, as on Linux. 
e.rcvQueueMu.Lock() e.RcvClosed = true e.rcvQueueMu.Unlock() e.closePendingAcceptableConnectionsLocked() // Notify waiters that the endpoint is shutdown. e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr) } return nil default: return &tcpip.ErrNotConnected{} } } // Listen puts the endpoint in "listen" mode, which allows it to accept // new connections. func (e *Endpoint) Listen(backlog int) tcpip.Error { if err := e.listen(backlog); err != nil { if !err.IgnoreStats() { e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() } return err } return nil } func (e *Endpoint) listen(backlog int) tcpip.Error { e.LockUser() defer e.UnlockUser() if e.EndpointState() == StateListen && !e.closed { e.acceptMu.Lock() defer e.acceptMu.Unlock() // Adjust the size of the backlog iff we can fit // existing pending connections into the new one. if e.acceptQueue.endpoints.Len() > backlog { return &tcpip.ErrInvalidEndpointState{} } e.acceptQueue.capacity = backlog if e.acceptQueue.pendingEndpoints == nil { e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{}) } e.shutdownFlags = 0 e.updateConnDirectionState(connDirectionStateOpen) e.rcvQueueMu.Lock() e.RcvClosed = false e.rcvQueueMu.Unlock() return nil } if e.EndpointState() == StateInitial { // The listen is called on an unbound socket, the socket is // automatically bound to a random free port with the local // address set to INADDR_ANY. if err := e.bindLocked(tcpip.FullAddress{}); err != nil { return err } } // Endpoint must be bound before it can transition to listen mode. if e.EndpointState() != StateBound { e.stats.ReadErrors.InvalidEndpointState.Increment() return &tcpip.ErrInvalidEndpointState{} } // Setting this state after RegisterTransportEndpoint will result in a // race where the endpoint is in Bound but reachable via the demuxer. Instead // we set it to listen so that incoming packets will just be queued to the // inbound segment queue by the TCP processor. e.setEndpointState(StateListen) // Register the endpoint. if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil { e.transitionToStateCloseLocked() return err } e.isRegistered = true // The queue may be non-zero when we're restoring the endpoint, and it // may be pre-populated with some previously accepted (but not Accepted) // endpoints. e.acceptMu.Lock() if e.acceptQueue.pendingEndpoints == nil { e.acceptQueue.pendingEndpoints = make(map[*Endpoint]struct{}) } if e.acceptQueue.capacity == 0 { e.acceptQueue.capacity = backlog } e.acceptMu.Unlock() // Initialize the listening context. rcvWnd := seqnum.Size(e.receiveBufferAvailable()) e.listenCtx = newListenContext(e.stack, e.protocol, e, rcvWnd, e.ops.GetV6Only(), e.NetProto) return nil } // Accept returns a new endpoint if a peer has established a connection // to an endpoint previously set to listen mode. // // addr if not-nil will contain the peer address of the returned endpoint. func (e *Endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { e.LockUser() defer e.UnlockUser() e.rcvQueueMu.Lock() rcvClosed := e.RcvClosed e.rcvQueueMu.Unlock() // Endpoint must be in listen state before it can accept connections. if rcvClosed || e.EndpointState() != StateListen { return nil, nil, &tcpip.ErrInvalidEndpointState{} } // Get the new accepted endpoint. 
var n *Endpoint e.acceptMu.Lock() if element := e.acceptQueue.endpoints.Front(); element != nil { n = e.acceptQueue.endpoints.Remove(element).(*Endpoint) } e.acceptMu.Unlock() if n == nil { return nil, nil, &tcpip.ErrWouldBlock{} } if peerAddr != nil { *peerAddr = n.getRemoteAddress() } return n, n.waiterQueue, nil } // Bind binds the endpoint to a specific local port and optionally address. func (e *Endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) { e.LockUser() defer e.UnlockUser() return e.bindLocked(addr) } // +checklocks:e.mu func (e *Endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) { // Don't allow binding once endpoint is not in the initial state // anymore. This is because once the endpoint goes into a connected or // listen state, it is already bound. if e.EndpointState() != StateInitial { return &tcpip.ErrAlreadyBound{} } e.BindAddr = addr.Addr addr, netProto, err := e.checkV4MappedLocked(addr, true /* bind */) if err != nil { return err } netProtos := []tcpip.NetworkProtocolNumber{netProto} // Expand netProtos to include v4 and v6 under dual-stack if the caller is // binding to a wildcard (empty) address, and this is an IPv6 endpoint with // v6only set to false. if netProto == header.IPv6ProtocolNumber { stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber) alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == tcpip.Address{} && stackHasV4 if alsoBindToV4 { netProtos = append(netProtos, header.IPv4ProtocolNumber) } } var nic tcpip.NICID // If an address is specified, we must ensure that it's one of our // local addresses. if addr.Addr.Len() != 0 { nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) if nic == 0 { return &tcpip.ErrBadLocalAddress{} } e.TransportEndpointInfo.ID.LocalAddress = addr.Addr } bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, Addr: addr.Addr, Port: addr.Port, Flags: e.portFlags, BindToDevice: bindToDevice, Dest: tcpip.FullAddress{}, } port, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, func(p uint16) (bool, tcpip.Error) { id := e.TransportEndpointInfo.ID id.LocalPort = p // CheckRegisterTransportEndpoint should only return an error if there is a // listening endpoint bound with the same id and portFlags and bindToDevice // options. // // NOTE: Only listening and connected endpoint register with // demuxer. Further connected endpoints always have a remote // address/port. Hence this will only return an error if there is a matching // listening endpoint. if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil { return false, nil } return true, nil }) if err != nil { e.stack.Stats().TCP.FailedPortReservations.Increment() return err } e.boundBindToDevice = bindToDevice e.boundPortFlags = e.portFlags // TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct. e.boundNICID = nic e.isPortReserved = true e.effectiveNetProtos = netProtos e.TransportEndpointInfo.ID.LocalPort = port // Mark endpoint as bound. e.setEndpointState(StateBound) return nil } // GetLocalAddress returns the address to which the endpoint is bound. func (e *Endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { e.LockUser() defer e.UnlockUser() return tcpip.FullAddress{ Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: e.TransportEndpointInfo.ID.LocalPort, NIC: e.boundNICID, }, nil } // GetRemoteAddress returns the address to which the endpoint is connected. 
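// It fails with *tcpip.ErrNotConnected if the endpoint is not currently in a
// connected state.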
func (e *Endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { e.LockUser() defer e.UnlockUser() if !e.EndpointState().connected() { return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } return e.getRemoteAddress(), nil } func (e *Endpoint) getRemoteAddress() tcpip.FullAddress { return tcpip.FullAddress{ Addr: e.TransportEndpointInfo.ID.RemoteAddress, Port: e.TransportEndpointInfo.ID.RemotePort, NIC: e.boundNICID, } } // HandlePacket implements stack.TransportEndpoint.HandlePacket. func (*Endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) { // TCP HandlePacket is not required anymore as inbound packets first // land at the Dispatcher which then can either deliver using the // worker go routine or directly do the invoke the tcp processing inline // based on the state of the endpoint. } func (e *Endpoint) enqueueSegment(s *segment) bool { // Send packet to worker goroutine. if !e.segmentQueue.enqueue(s) { // The queue is full, so we drop the segment. e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.SegmentQueueDropped.Increment() return false } return true } func (e *Endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) { // Update last error first. e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() var recvErr bool switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: recvErr = e.SocketOptions().GetIPv4RecvError() case header.IPv6ProtocolNumber: recvErr = e.SocketOptions().GetIPv6RecvError() default: panic(fmt.Sprintf("unhandled network protocol number = %d", pkt.NetworkProtocolNumber)) } if recvErr { e.SocketOptions().QueueErr(&tcpip.SockError{ Err: err, Cause: transErr, // Linux passes the payload with the TCP header. We don't know if the TCP // header even exists, it may not for fragmented packets. Payload: pkt.Data().AsRange().ToView(), Dst: tcpip.FullAddress{ NIC: pkt.NICID, Addr: e.TransportEndpointInfo.ID.RemoteAddress, Port: e.TransportEndpointInfo.ID.RemotePort, }, Offender: tcpip.FullAddress{ NIC: pkt.NICID, Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: e.TransportEndpointInfo.ID.LocalPort, }, NetProto: pkt.NetworkProtocolNumber, }) } if e.EndpointState().connecting() { e.mu.Lock() if lEP := e.h.listenEP; lEP != nil { // Remove from listening endpoints pending list. lEP.acceptMu.Lock() delete(lEP.acceptQueue.pendingEndpoints, e) lEP.acceptMu.Unlock() lEP.stats.FailedConnectionAttempts.Increment() } e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.cleanupLocked() e.hardError = err e.setEndpointState(StateError) e.mu.Unlock() e.drainClosingSegmentQueue() e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } } // HandleError implements stack.TransportEndpoint. func (e *Endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) { handlePacketTooBig := func(mtu uint32) { e.sndQueueInfo.sndQueueMu.Lock() update := false if v := int(mtu); v < e.sndQueueInfo.SndMTU { e.sndQueueInfo.SndMTU = v update = true } newMTU := e.sndQueueInfo.SndMTU e.sndQueueInfo.sndQueueMu.Unlock() if update { e.mu.Lock() defer e.mu.Unlock() if e.snd != nil { e.snd.updateMaxPayloadSize(newMTU, 1 /* count */) // +checklocksforce:e.snd.ep.mu } } } // TODO(gvisor.dev/issues/5270): Handle all transport errors. 
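	// PacketTooBig errors shrink the cached sender MTU via handlePacketTooBig;
	// the destination/source errors below are surfaced to the application as
	// the closest matching tcpip error via onICMPError.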
switch transErr.Kind() { case stack.PacketTooBigTransportError: handlePacketTooBig(transErr.Info()) case stack.DestinationHostUnreachableTransportError: e.onICMPError(&tcpip.ErrHostUnreachable{}, transErr, pkt) case stack.DestinationNetworkUnreachableTransportError: e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt) case stack.DestinationPortUnreachableTransportError: e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt) case stack.DestinationProtoUnreachableTransportError: e.onICMPError(&tcpip.ErrUnknownProtocolOption{}, transErr, pkt) case stack.SourceRouteFailedTransportError: e.onICMPError(&tcpip.ErrNotSupported{}, transErr, pkt) case stack.SourceHostIsolatedTransportError: e.onICMPError(&tcpip.ErrNoNet{}, transErr, pkt) case stack.DestinationHostDownTransportError: e.onICMPError(&tcpip.ErrHostDown{}, transErr, pkt) } } // updateSndBufferUsage is called by the protocol goroutine when room opens up // in the send buffer. The number of newly available bytes is v. func (e *Endpoint) updateSndBufferUsage(v int) { sendBufferSize := e.getSendBufferSize() e.sndQueueInfo.sndQueueMu.Lock() notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1 e.sndQueueInfo.SndBufUsed -= v // Get the new send buffer size with auto tuning, but do not set it // unless we decide to notify the writers. newSndBufSz := e.computeTCPSendBufferSize() // We only notify when there is half the sendBufferSize available after // a full buffer event occurs. This ensures that we don't wake up // writers to queue just 1-2 segments and go back to sleep. notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1 e.sndQueueInfo.sndQueueMu.Unlock() if notify { // Set the new send buffer size calculated from auto tuning. e.ops.SetSendBufferSize(newSndBufSz, false /* notify */) e.waiterQueue.Notify(waiter.WritableEvents) } } // readyToRead is called by the protocol goroutine when a new segment is ready // to be read, or when the connection is closed for receiving (in which case // s will be nil). // // +checklocks:e.mu func (e *Endpoint) readyToRead(s *segment) { e.rcvQueueMu.Lock() if s != nil { e.RcvBufUsed += s.payloadSize() s.IncRef() e.rcvQueue.PushBack(s) } else { e.RcvClosed = true } e.rcvQueueMu.Unlock() e.waiterQueue.Notify(waiter.ReadableEvents) } // receiveBufferAvailableLocked calculates how many bytes are still available // in the receive buffer. // +checklocks:e.rcvQueueMu func (e *Endpoint) receiveBufferAvailableLocked(rcvBufSize int) int { // We may use more bytes than the buffer size when the receive buffer // shrinks. memUsed := e.receiveMemUsed() if memUsed >= rcvBufSize { return 0 } return rcvBufSize - memUsed } // receiveBufferAvailable calculates how many bytes are still available in the // receive buffer based on the actual memory used by all segments held in // receive buffer/pending and segment queue. func (e *Endpoint) receiveBufferAvailable() int { e.rcvQueueMu.Lock() available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize())) e.rcvQueueMu.Unlock() return available } // receiveBufferUsed returns the amount of in-use receive buffer. func (e *Endpoint) receiveBufferUsed() int { e.rcvQueueMu.Lock() used := e.RcvBufUsed e.rcvQueueMu.Unlock() return used } // receiveMemUsed returns the total memory in use by segments held by this // endpoint. func (e *Endpoint) receiveMemUsed() int { return int(e.rcvMemUsed.Load()) } // updateReceiveMemUsed adds the provided delta to e.rcvMemUsed. 
func (e *Endpoint) updateReceiveMemUsed(delta int) { e.rcvMemUsed.Add(int32(delta)) } // maxReceiveBufferSize returns the stack wide maximum receive buffer size for // an endpoint. func (e *Endpoint) maxReceiveBufferSize() int { var rs tcpip.TCPReceiveBufferSizeRangeOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil { // As a fallback return the hardcoded max buffer size. return MaxBufferSize } return rs.Max } // directionState returns the close state of send and receive part of the endpoint func (e *Endpoint) connDirectionState() connDirectionState { return connDirectionState(e.connectionDirectionState.Load()) } // updateDirectionState updates the close state of send and receive part of the endpoint func (e *Endpoint) updateConnDirectionState(state connDirectionState) connDirectionState { return connDirectionState(e.connectionDirectionState.Swap(uint32(e.connDirectionState() | state))) } // rcvWndScaleForHandshake computes the receive window scale to offer to the // peer when window scaling is enabled (true by default). If auto-tuning is // disabled then the window scaling factor is based on the size of the // receiveBuffer otherwise we use the max permissible receive buffer size to // compute the scale. func (e *Endpoint) rcvWndScaleForHandshake() int { bufSizeForScale := e.ops.GetReceiveBufferSize() e.rcvQueueMu.Lock() autoTuningDisabled := e.RcvAutoParams.Disabled e.rcvQueueMu.Unlock() if autoTuningDisabled { return FindWndScale(seqnum.Size(bufSizeForScale)) } return FindWndScale(seqnum.Size(e.maxReceiveBufferSize())) } // updateRecentTimestamp updates the recent timestamp using the algorithm // described in https://tools.ietf.org/html/rfc7323#section-4.3 func (e *Endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) { if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) { e.setRecentTimestamp(tsVal) } } // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if // the SYN options indicate that timestamp option was negotiated. It also // initializes the recentTS with the value provided in synOpts.TSval. func (e *Endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) { if synOpts.TS { e.SendTSOk = true e.setRecentTimestamp(synOpts.TSVal) } } func (e *Endpoint) tsVal(now tcpip.MonotonicTime) uint32 { return e.TSOffset.TSVal(now) } func (e *Endpoint) tsValNow() uint32 { return e.tsVal(e.stack.Clock().NowMonotonic()) } func (e *Endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration { return e.TSOffset.Elapsed(now, tsEcr) } // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint // if the SYN options indicate that the SACK option was negotiated and the TCP // stack is configured to enable TCP SACK option. func (e *Endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) { var v tcpip.TCPSACKEnabled if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { // Stack doesn't support SACK. So just return. return } if bool(v) && synOpts.SACKPermitted { e.SACKPermitted = true e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery) } } // maxOptionSize return the maximum size of TCP options. func (e *Endpoint) maxOptionSize() (size int) { var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock options := e.makeOptions(maxSackBlocks[:]) size = len(options) putOptions(options) return size } // completeStateLocked makes a full copy of the endpoint and returns it. 
This is // used before invoking the probe. // // +checklocks:e.mu func (e *Endpoint) completeStateLocked(s *stack.TCPEndpointState) { s.TCPEndpointStateInner = e.TCPEndpointStateInner s.ID = stack.TCPEndpointID(e.TransportEndpointInfo.ID) s.SegTime = e.stack.Clock().NowMonotonic() s.Receiver = e.rcv.TCPReceiverState s.Sender = e.snd.TCPSenderState sndBufSize := e.getSendBufferSize() // Copy the send buffer atomically. e.sndQueueInfo.sndQueueMu.Lock() e.sndQueueInfo.CloneState(&s.SndBufState) s.SndBufState.SndBufSize = sndBufSize e.sndQueueInfo.sndQueueMu.Unlock() // Copy the receive buffer atomically. e.rcvQueueMu.Lock() s.RcvBufState = e.TCPRcvBufState e.rcvQueueMu.Unlock() // Copy the endpoint TCP Option state. s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks) copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks]) s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy() e.snd.rtt.Lock() s.Sender.RTTState = e.snd.rtt.TCPRTTState e.snd.rtt.Unlock() if cubic, ok := e.snd.cc.(*cubicState); ok { s.Sender.Cubic = cubic.TCPCubicState s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T) } s.Sender.RACKState = e.snd.rc.TCPRACKState s.Sender.RetransmitTS = e.snd.retransmitTS s.Sender.SpuriousRecovery = e.snd.spuriousRecovery } func (e *Endpoint) initHostGSO() { switch e.route.NetProto() { case header.IPv4ProtocolNumber: e.gso.Type = stack.GSOTCPv4 e.gso.L3HdrLen = header.IPv4MinimumSize case header.IPv6ProtocolNumber: e.gso.Type = stack.GSOTCPv6 e.gso.L3HdrLen = header.IPv6MinimumSize default: panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto)) } e.gso.NeedsCsum = true e.gso.CsumOffset = header.TCPChecksumOffset e.gso.MaxSize = e.route.GSOMaxSize() } func (e *Endpoint) initGSO() { if e.route.HasHostGSOCapability() { e.initHostGSO() } else if e.route.HasGVisorGSOCapability() { e.gso = stack.GSO{ MaxSize: e.route.GSOMaxSize(), Type: stack.GSOGvisor, NeedsCsum: false, } } } // State implements tcpip.Endpoint.State. It exports the endpoint's protocol // state for diagnostics. func (e *Endpoint) State() uint32 { return uint32(e.EndpointState()) } // Info returns a copy of the endpoint info. func (e *Endpoint) Info() tcpip.EndpointInfo { e.LockUser() // Make a copy of the endpoint info. ret := e.TransportEndpointInfo e.UnlockUser() return &ret } // Stats returns a pointer to the endpoint stats. func (e *Endpoint) Stats() tcpip.EndpointStats { return &e.stats } // Wait implements stack.TransportEndpoint.Wait. func (e *Endpoint) Wait() { waitEntry, notifyCh := waiter.NewChannelEntry(waiter.EventHUp) e.waiterQueue.EventRegister(&waitEntry) defer e.waiterQueue.EventUnregister(&waitEntry) switch e.EndpointState() { case StateClose, StateError: return } <-notifyCh } // SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *Endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } // GetTCPSendBufferLimits is used to get send buffer size limits for TCP. func GetTCPSendBufferLimits(sh tcpip.StackHandler) tcpip.SendBufferSizeOption { // This type assertion is safe because only the TCP stack calls this // function. ss := sh.(*stack.Stack).TCPSendBufferLimits() return tcpip.SendBufferSizeOption{ Min: ss.Min, Default: ss.Default, Max: ss.Max, } } // allowOutOfWindowAck returns true if an out-of-window ACK can be sent now. 
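// Such ACKs are rate limited using the stack-wide TCPInvalidRateLimitOption,
// which bounds how often they are generated in response to out-of-window
// segments.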
func (e *Endpoint) allowOutOfWindowAck() bool { now := e.stack.Clock().NowMonotonic() if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) { var limit stack.TCPInvalidRateLimitOption if err := e.stack.Option(&limit); err != nil { panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err)) } if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) { return false } } e.lastOutOfWindowAckTime = now return true } // GetTCPReceiveBufferLimits is used to get send buffer size limits for TCP. func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption { var ss tcpip.TCPReceiveBufferSizeRangeOption if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil { panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err)) } return tcpip.ReceiveBufferSizeOption{ Min: ss.Min, Default: ss.Default, Max: ss.Max, } } // computeTCPSendBufferSize implements auto tuning of send buffer size and // returns the new send buffer size. func (e *Endpoint) computeTCPSendBufferSize() int64 { curSndBufSz := int64(e.getSendBufferSize()) // Auto tuning is disabled when the user explicitly sets the send // buffer size with SO_SNDBUF option. if disabled := e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled.Load(); disabled == 1 { return curSndBufSz } const packetOverheadFactor = 2 curMSS := e.snd.MaxPayloadSize numSeg := InitialCwnd if numSeg < e.snd.SndCwnd { numSeg = e.snd.SndCwnd } // SndCwnd indicates the number of segments that can be sent. This means // that the sender can send upto #SndCwnd segments and the send buffer // size should be set to SndCwnd*MSS to accommodate sending of all the // segments. newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor) if newSndBufSz < curSndBufSz { return curSndBufSz } if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz { newSndBufSz = int64(ss.Max) } return newSndBufSz } // GetAcceptConn implements tcpip.SocketOptionsHandler. func (e *Endpoint) GetAcceptConn() bool { return EndpointState(e.State()) == StateListen } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/endpoint_state.go000066400000000000000000000232161465435605700267400ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // beforeSave is invoked by stateify. func (e *Endpoint) beforeSave() { // Stop incoming packets. 
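	// Freezing the segment queue ensures no new inbound segments are processed
	// while the endpoint's state is being serialized.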
e.segmentQueue.freeze() e.mu.Lock() defer e.mu.Unlock() epState := e.EndpointState() switch { case epState == StateInitial || epState == StateBound: case epState.connected() || epState.handshake(): if !e.route.HasSaveRestoreCapability() { if !e.route.HasDisconnectOkCapability() { panic(&tcpip.ErrSaveRejection{ Err: fmt.Errorf("endpoint cannot be saved in connected state: local %s:%d, remote %s:%d", e.TransportEndpointInfo.ID.LocalAddress, e.TransportEndpointInfo.ID.LocalPort, e.TransportEndpointInfo.ID.RemoteAddress, e.TransportEndpointInfo.ID.RemotePort), }) } e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) e.mu.Unlock() e.Close() e.mu.Lock() } fallthrough case epState == StateListen: // Nothing to do. case epState.closed(): // Nothing to do. default: panic(fmt.Sprintf("endpoint in unknown state %v", e.EndpointState())) } e.stack.RegisterResumableEndpoint(e) } // saveEndpoints is invoked by stateify. func (a *acceptQueue) saveEndpoints() []*Endpoint { acceptedEndpoints := make([]*Endpoint, a.endpoints.Len()) for i, e := 0, a.endpoints.Front(); e != nil; i, e = i+1, e.Next() { acceptedEndpoints[i] = e.Value.(*Endpoint) } return acceptedEndpoints } // loadEndpoints is invoked by stateify. func (a *acceptQueue) loadEndpoints(_ context.Context, acceptedEndpoints []*Endpoint) { for _, ep := range acceptedEndpoints { a.endpoints.PushBack(ep) } } // saveState is invoked by stateify. func (e *Endpoint) saveState() EndpointState { return e.EndpointState() } // Endpoint loading must be done in the following ordering by their state, to // avoid dangling connecting w/o listening peer, and to avoid conflicts in port // reservation. var connectedLoading sync.WaitGroup var listenLoading sync.WaitGroup var connectingLoading sync.WaitGroup // Bound endpoint loading happens last. // loadState is invoked by stateify. func (e *Endpoint) loadState(_ context.Context, epState EndpointState) { // This is to ensure that the loading wait groups include all applicable // endpoints before any asynchronous calls to the Wait() methods. // For restore purposes we treat TimeWait like a connected endpoint. if epState.connected() || epState == StateTimeWait { connectedLoading.Add(1) } switch { case epState == StateListen: listenLoading.Add(1) case epState.connecting(): connectingLoading.Add(1) } // Directly update the state here rather than using e.setEndpointState // as the endpoint is still being loaded and the stack reference is not // yet initialized. e.state.Store(uint32(epState)) } // afterLoad is invoked by stateify. func (e *Endpoint) afterLoad(ctx context.Context) { // RacyLoad() can be used because we are initializing e. e.origEndpointState = e.state.RacyLoad() // Restore the endpoint to InitialState as it will be moved to // its origEndpointState during Restore. e.state = atomicbitops.FromUint32(uint32(StateInitial)) stack.RestoreStackFromContext(ctx).RegisterRestoredEndpoint(e) } // Restore implements tcpip.RestoredEndpoint.Restore. 
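// It reattaches the endpoint to the new stack, reinitializes its timers, and
// then drives the endpoint back toward its pre-save state: bound endpoints
// re-reserve their ports, listening endpoints listen again, and connected or
// connecting endpoints reconnect (connections that were already established
// are restored without performing a new handshake).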
func (e *Endpoint) Restore(s *stack.Stack) { if !e.EndpointState().closed() { e.keepalive.timer.init(s.Clock(), timerHandler(e, e.keepaliveTimerExpired)) } if snd := e.snd; snd != nil { snd.resendTimer.init(s.Clock(), timerHandler(e, e.snd.retransmitTimerExpired)) snd.reorderTimer.init(s.Clock(), timerHandler(e, e.snd.rc.reorderTimerExpired)) snd.probeTimer.init(s.Clock(), timerHandler(e, e.snd.probeTimerExpired)) snd.corkTimer.init(s.Clock(), timerHandler(e, e.snd.corkTimerExpired)) } e.stack = s e.protocol = protocolFromStack(s) e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits) e.segmentQueue.thaw() bind := func() { e.mu.Lock() defer e.mu.Unlock() addr, _, err := e.checkV4MappedLocked(tcpip.FullAddress{Addr: e.BindAddr, Port: e.TransportEndpointInfo.ID.LocalPort}, true /* bind */) if err != nil { panic("unable to parse BindAddr: " + err.String()) } portRes := ports.Reservation{ Networks: e.effectiveNetProtos, Transport: ProtocolNumber, Addr: addr.Addr, Port: addr.Port, Flags: e.boundPortFlags, BindToDevice: e.boundBindToDevice, Dest: e.boundDest, } if ok := e.stack.ReserveTuple(portRes); !ok { panic(fmt.Sprintf("unable to re-reserve tuple (%v, %q, %d, %+v, %d, %v)", e.effectiveNetProtos, addr.Addr, addr.Port, e.boundPortFlags, e.boundBindToDevice, e.boundDest)) } e.isPortReserved = true // Mark endpoint as bound. e.setEndpointState(StateBound) } epState := EndpointState(e.origEndpointState) switch { case epState.connected(): bind() if e.connectingAddress.BitLen() == 0 { e.connectingAddress = e.TransportEndpointInfo.ID.RemoteAddress // This endpoint is accepted by netstack but not yet by // the app. If the endpoint is IPv6 but the remote // address is IPv4, we need to connect as IPv6 so that // dual-stack mode can be properly activated. if e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.RemoteAddress.BitLen() != header.IPv6AddressSizeBits { e.connectingAddress = tcpip.AddrFrom16Slice(append( []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff}, e.TransportEndpointInfo.ID.RemoteAddress.AsSlice()..., )) } } // Reset the scoreboard to reinitialize the sack information as // we do not restore SACK information. e.scoreboard.Reset() e.mu.Lock() err := e.connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.TransportEndpointInfo.ID.RemotePort}, false /* handshake */) if _, ok := err.(*tcpip.ErrConnectStarted); !ok { panic("endpoint connecting failed: " + err.String()) } e.state.Store(e.origEndpointState) // For FIN-WAIT-2 and TIME-WAIT we need to start the appropriate timers so // that the socket is closed correctly. switch epState { case StateFinWait2: e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired) case StateTimeWait: e.timeWaitTimer = e.stack.Clock().AfterFunc(e.getTimeWaitDuration(), e.timeWaitTimerExpired) } if e.ops.GetCorkOption() { // Rearm the timer if TCP_CORK is enabled which will // drain all the segments in the queue after restore. 
e.snd.corkTimer.enable(MinRTO) } e.mu.Unlock() connectedLoading.Done() case epState == StateListen: tcpip.AsyncLoading.Add(1) go func() { connectedLoading.Wait() bind() e.acceptMu.Lock() backlog := e.acceptQueue.capacity e.acceptMu.Unlock() if err := e.Listen(backlog); err != nil { panic("endpoint listening failed: " + err.String()) } e.LockUser() if e.shutdownFlags != 0 { e.shutdownLocked(e.shutdownFlags) } e.UnlockUser() listenLoading.Done() tcpip.AsyncLoading.Done() }() case epState == StateConnecting: // Initial SYN hasn't been sent yet so initiate a connect. tcpip.AsyncLoading.Add(1) go func() { connectedLoading.Wait() listenLoading.Wait() bind() err := e.Connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.TransportEndpointInfo.ID.RemotePort}) if _, ok := err.(*tcpip.ErrConnectStarted); !ok { panic("endpoint connecting failed: " + err.String()) } connectingLoading.Done() tcpip.AsyncLoading.Done() }() case epState == StateSynSent || epState == StateSynRecv: connectedLoading.Wait() listenLoading.Wait() // Initial SYN has been sent/received so we should bind the // ports start the retransmit timer for the SYNs and let it // naturally complete the connection. bind() e.mu.Lock() defer e.mu.Unlock() e.setEndpointState(epState) r, err := e.stack.FindRoute(e.boundNICID, e.TransportEndpointInfo.ID.LocalAddress, e.TransportEndpointInfo.ID.RemoteAddress, e.effectiveNetProtos[0], false /* multicastLoop */) if err != nil { panic(fmt.Sprintf("FindRoute failed when restoring endpoint w/ ID: %+v", e.ID)) } e.route = r timer, err := newBackoffTimer(e.stack.Clock(), InitialRTO, MaxRTO, timerHandler(e, e.h.retransmitHandlerLocked)) if err != nil { panic(fmt.Sprintf("newBackOffTimer(_, %s, %s, _) failed: %s", InitialRTO, MaxRTO, err)) } e.h.retransmitTimer = timer connectingLoading.Done() case epState == StateBound: tcpip.AsyncLoading.Add(1) go func() { connectedLoading.Wait() listenLoading.Wait() connectingLoading.Wait() bind() tcpip.AsyncLoading.Done() }() case epState == StateClose: e.isPortReserved = false e.state.Store(uint32(StateClose)) e.stack.CompleteTransportEndpointCleanup(e) tcpip.DeleteDanglingEndpoint(e) case epState == StateError: e.state.Store(uint32(StateError)) e.stack.CompleteTransportEndpointCleanup(e) tcpip.DeleteDanglingEndpoint(e) } } // Resume implements tcpip.ResumableEndpoint.Resume. func (e *Endpoint) Resume() { e.segmentQueue.thaw() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/forwarder.go000066400000000000000000000122021465435605700257040ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package tcp import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) // Forwarder is a connection request forwarder, which allows clients to decide // what to do with a connection request, for example: ignore it, send a RST, or // attempt to complete the 3-way handshake. // // The canonical way of using it is to pass the Forwarder.HandlePacket function // to stack.SetTransportProtocolHandler. type Forwarder struct { stack *stack.Stack maxInFlight int handler func(*ForwarderRequest) mu sync.Mutex inFlight map[stack.TransportEndpointID]struct{} listen *listenContext } // NewForwarder allocates and initializes a new forwarder with the given // maximum number of in-flight connection attempts. Once the maximum is reached // new incoming connection requests will be ignored. // // If rcvWnd is set to zero, the default buffer size is used instead. func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*ForwarderRequest)) *Forwarder { if rcvWnd == 0 { rcvWnd = DefaultReceiveBufferSize } return &Forwarder{ stack: s, maxInFlight: maxInFlight, handler: handler, inFlight: make(map[stack.TransportEndpointID]struct{}), listen: newListenContext(s, protocolFromStack(s), nil /* listenEP */, seqnum.Size(rcvWnd), true, 0), } } // HandlePacket handles a packet if it is of interest to the forwarder (i.e., if // it's a SYN packet), returning true if it's the case. Otherwise the packet // is not handled and false is returned. // // This function is expected to be passed as an argument to the // stack.SetTransportProtocolHandler function. func (f *Forwarder) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool { s, err := newIncomingSegment(id, f.stack.Clock(), pkt) if err != nil { return false } defer s.DecRef() // We only care about well-formed SYN packets (not SYN-ACK) packets. if !s.csumValid || !s.flags.Contains(header.TCPFlagSyn) || s.flags.Contains(header.TCPFlagAck) { return false } opts := parseSynSegmentOptions(s) f.mu.Lock() defer f.mu.Unlock() // We have an inflight request for this id, ignore this one for now. if _, ok := f.inFlight[id]; ok { return true } // Ignore the segment if we're beyond the limit. if len(f.inFlight) >= f.maxInFlight { f.stack.Stats().TCP.ForwardMaxInFlightDrop.Increment() return true } // Launch a new goroutine to handle the request. f.inFlight[id] = struct{}{} s.IncRef() go f.handler(&ForwarderRequest{ // S/R-SAFE: not used by Sentry. forwarder: f, segment: s, synOptions: opts, }) return true } // ForwarderRequest represents a connection request received by the forwarder // and passed to the client. Clients must eventually call Complete() on it, and // may optionally create an endpoint to represent it via CreateEndpoint. type ForwarderRequest struct { mu sync.Mutex forwarder *Forwarder segment *segment synOptions header.TCPSynOptions } // ID returns the 4-tuple (src address, src port, dst address, dst port) that // represents the connection request. func (r *ForwarderRequest) ID() stack.TransportEndpointID { return r.segment.id } // Complete completes the request, and optionally sends a RST segment back to the // sender. func (r *ForwarderRequest) Complete(sendReset bool) { r.mu.Lock() defer r.mu.Unlock() if r.segment == nil { panic("Completing already completed forwarder request") } // Remove request from the forwarder. 
r.forwarder.mu.Lock() delete(r.forwarder.inFlight, r.segment.id) r.forwarder.mu.Unlock() if sendReset { replyWithReset(r.forwarder.stack, r.segment, stack.DefaultTOS, tcpip.UseDefaultIPv4TTL, tcpip.UseDefaultIPv6HopLimit) } // Release all resources. r.segment.DecRef() r.segment = nil r.forwarder = nil } // CreateEndpoint creates a TCP endpoint for the connection request, performing // the 3-way handshake in the process. func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { r.mu.Lock() defer r.mu.Unlock() if r.segment == nil { return nil, &tcpip.ErrInvalidEndpointState{} } f := r.forwarder ep, err := f.listen.performHandshake(r.segment, header.TCPSynOptions{ MSS: r.synOptions.MSS, WS: r.synOptions.WS, TS: r.synOptions.TS, TSVal: r.synOptions.TSVal, TSEcr: r.synOptions.TSEcr, SACKPermitted: r.synOptions.SACKPermitted, }, queue, nil) if err != nil { return nil, err } return ep, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/protocol.go000066400000000000000000000404071465435605700255620ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tcp contains the implementation of the TCP transport protocol. package tcp import ( "crypto/sha256" "encoding/binary" "fmt" "runtime" "strings" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/internal/tcp" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/waiter" ) const ( // ProtocolNumber is the tcp protocol number. ProtocolNumber = header.TCPProtocolNumber // MinBufferSize is the smallest size of a receive or send buffer. MinBufferSize = 4 << 10 // 4096 bytes. // DefaultSendBufferSize is the default size of the send buffer for // an endpoint. DefaultSendBufferSize = 1 << 20 // 1MB // DefaultReceiveBufferSize is the default size of the receive buffer // for an endpoint. DefaultReceiveBufferSize = 1 << 20 // 1MB // MaxBufferSize is the largest size a receive/send buffer can grow to. MaxBufferSize = 4 << 20 // 4MB // DefaultTCPLingerTimeout is the amount of time that sockets linger in // FIN_WAIT_2 state before being marked closed. DefaultTCPLingerTimeout = 60 * time.Second // MaxTCPLingerTimeout is the maximum amount of time that sockets // linger in FIN_WAIT_2 state before being marked closed. MaxTCPLingerTimeout = 120 * time.Second // DefaultTCPTimeWaitTimeout is the amount of time that sockets linger // in TIME_WAIT state before being marked closed. DefaultTCPTimeWaitTimeout = 60 * time.Second // DefaultSynRetries is the default value for the number of SYN retransmits // before a connect is aborted. DefaultSynRetries = 6 // DefaultKeepaliveIdle is the idle time for a connection before keep-alive // probes are sent. 
DefaultKeepaliveIdle = 2 * time.Hour // DefaultKeepaliveInterval is the time between two successive keep-alive // probes. DefaultKeepaliveInterval = 75 * time.Second // DefaultKeepaliveCount is the number of keep-alive probes that are sent // before declaring the connection dead. DefaultKeepaliveCount = 9 ) const ( ccReno = "reno" ccCubic = "cubic" ) // +stateify savable type protocol struct { stack *stack.Stack mu sync.RWMutex `state:"nosave"` sackEnabled bool recovery tcpip.TCPRecovery delayEnabled bool alwaysUseSynCookies bool sendBufferSize tcpip.TCPSendBufferSizeRangeOption recvBufferSize tcpip.TCPReceiveBufferSizeRangeOption congestionControl string availableCongestionControl []string moderateReceiveBuffer bool lingerTimeout time.Duration timeWaitTimeout time.Duration timeWaitReuse tcpip.TCPTimeWaitReuseOption minRTO time.Duration maxRTO time.Duration maxRetries uint32 synRetries uint8 dispatcher dispatcher // The following secrets are initialized once and stay unchanged after. seqnumSecret [16]byte tsOffsetSecret [16]byte } // Number returns the tcp protocol number. func (*protocol) Number() tcpip.TransportProtocolNumber { return ProtocolNumber } // NewEndpoint creates a new tcp endpoint. func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return newEndpoint(p.stack, p, netProto, waiterQueue), nil } // NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently // unsupported. It implements stack.TransportProtocol.NewRawEndpoint. func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return raw.NewEndpoint(p.stack, netProto, header.TCPProtocolNumber, waiterQueue) } // MinimumPacketSize returns the minimum valid tcp packet size. func (*protocol) MinimumPacketSize() int { return header.TCPMinimumSize } // ParsePorts returns the source and destination ports stored in the given tcp // packet. func (*protocol) ParsePorts(v []byte) (src, dst uint16, err tcpip.Error) { h := header.TCP(v) return h.SourcePort(), h.DestinationPort(), nil } // QueuePacket queues packets targeted at an endpoint after hashing the packet // to a specific processing queue. Each queue is serviced by its own processor // goroutine which is responsible for dequeuing and doing full TCP dispatch of // the packet. func (p *protocol) QueuePacket(ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { p.dispatcher.queuePacket(ep, id, p.stack.Clock(), pkt) } // HandleUnknownDestinationPacket handles packets targeted at this protocol but // that don't match any existing endpoint. // // RFC 793, page 36, states that "If the connection does not exist (CLOSED) then // a reset is sent in response to any incoming segment except another reset. In // particular, SYNs addressed to a non-existent connection are rejected by this // means." 
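//
// For example (illustrative): a SYN arriving for a port with no matching
// endpoint is answered via replyWithReset with RST|ACK set, SEG.SEQ = 0 and
// SEG.ACK equal to the incoming sequence number plus the segment's logical
// length (the SYN itself counts as one), while an incoming RST is dropped
// without a reply.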
func (p *protocol) HandleUnknownDestinationPacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { s, err := newIncomingSegment(id, p.stack.Clock(), pkt) if err != nil { return stack.UnknownDestinationPacketMalformed } defer s.DecRef() if !s.csumValid { return stack.UnknownDestinationPacketMalformed } if !s.flags.Contains(header.TCPFlagRst) { replyWithReset(p.stack, s, stack.DefaultTOS, tcpip.UseDefaultIPv4TTL, tcpip.UseDefaultIPv6HopLimit) } return stack.UnknownDestinationPacketHandled } func (p *protocol) tsOffset(src, dst tcpip.Address) tcp.TSOffset { // Initialize a random tsOffset that will be added to the recentTS // everytime the timestamp is sent when the Timestamp option is enabled. // // See https://tools.ietf.org/html/rfc7323#section-5.4 for details on // why this is required. h := sha256.New() // Per hash.Hash.Writer: // // It never returns an error. _, _ = h.Write(p.tsOffsetSecret[:]) _, _ = h.Write(src.AsSlice()) _, _ = h.Write(dst.AsSlice()) return tcp.NewTSOffset(binary.LittleEndian.Uint32(h.Sum(nil)[:4])) } // replyWithReset replies to the given segment with a reset segment. // // If the relevant TTL has its reset value (0 for ipv4TTL, -1 for ipv6HopLimit), // then the route's default TTL will be used. func replyWithReset(st *stack.Stack, s *segment, tos, ipv4TTL uint8, ipv6HopLimit int16) tcpip.Error { net := s.pkt.Network() route, err := st.FindRoute(s.pkt.NICID, net.DestinationAddress(), net.SourceAddress(), s.pkt.NetworkProtocolNumber, false /* multicastLoop */) if err != nil { return err } defer route.Release() ttl := calculateTTL(route, ipv4TTL, ipv6HopLimit) // Get the seqnum from the packet if the ack flag is set. seq := seqnum.Value(0) ack := seqnum.Value(0) flags := header.TCPFlagRst // As per RFC 793 page 35 (Reset Generation) // 1. If the connection does not exist (CLOSED) then a reset is sent // in response to any incoming segment except another reset. In // particular, SYNs addressed to a non-existent connection are rejected // by this means. // If the incoming segment has an ACK field, the reset takes its // sequence number from the ACK field of the segment, otherwise the // reset has sequence number zero and the ACK field is set to the sum // of the sequence number and segment length of the incoming segment. // The connection remains in the CLOSED state. if s.flags.Contains(header.TCPFlagAck) { seq = s.ackNumber } else { flags |= header.TCPFlagAck ack = s.sequenceNumber.Add(s.logicalLen()) } p := stack.NewPacketBuffer(stack.PacketBufferOptions{ReserveHeaderBytes: header.TCPMinimumSize + int(route.MaxHeaderLength())}) defer p.DecRef() return sendTCP(route, tcpFields{ id: s.id, ttl: ttl, tos: tos, flags: flags, seq: seq, ack: ack, rcvWnd: 0, }, p, stack.GSO{}, nil /* PacketOwner */) } // SetOption implements stack.TransportProtocol.SetOption. 
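//
// Protocol-wide options are normally applied through the stack rather than by
// calling this method directly. A minimal sketch, where s is assumed to be an
// already constructed *stack.Stack:
//
//	opt := tcpip.TCPSACKEnabled(true)
//	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
//		// handle the tcpip.Error
//	}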
func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.TCPSACKEnabled: p.mu.Lock() p.sackEnabled = bool(*v) p.mu.Unlock() return nil case *tcpip.TCPRecovery: p.mu.Lock() p.recovery = *v p.mu.Unlock() return nil case *tcpip.TCPDelayEnabled: p.mu.Lock() p.delayEnabled = bool(*v) p.mu.Unlock() return nil case *tcpip.TCPSendBufferSizeRangeOption: if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { return &tcpip.ErrInvalidOptionValue{} } p.mu.Lock() p.sendBufferSize = *v p.mu.Unlock() return nil case *tcpip.TCPReceiveBufferSizeRangeOption: if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { return &tcpip.ErrInvalidOptionValue{} } p.mu.Lock() p.recvBufferSize = *v p.mu.Unlock() return nil case *tcpip.CongestionControlOption: for _, c := range p.availableCongestionControl { if string(*v) == c { p.mu.Lock() p.congestionControl = string(*v) p.mu.Unlock() return nil } } // linux returns ENOENT when an invalid congestion control // is specified. return &tcpip.ErrNoSuchFile{} case *tcpip.TCPModerateReceiveBufferOption: p.mu.Lock() p.moderateReceiveBuffer = bool(*v) p.mu.Unlock() return nil case *tcpip.TCPLingerTimeoutOption: p.mu.Lock() if *v < 0 { p.lingerTimeout = 0 } else { p.lingerTimeout = time.Duration(*v) } p.mu.Unlock() return nil case *tcpip.TCPTimeWaitTimeoutOption: p.mu.Lock() if *v < 0 { p.timeWaitTimeout = 0 } else { p.timeWaitTimeout = time.Duration(*v) } p.mu.Unlock() return nil case *tcpip.TCPTimeWaitReuseOption: if *v < tcpip.TCPTimeWaitReuseDisabled || *v > tcpip.TCPTimeWaitReuseLoopbackOnly { return &tcpip.ErrInvalidOptionValue{} } p.mu.Lock() p.timeWaitReuse = *v p.mu.Unlock() return nil case *tcpip.TCPMinRTOOption: p.mu.Lock() defer p.mu.Unlock() if *v < 0 { p.minRTO = MinRTO } else if minRTO := time.Duration(*v); minRTO <= p.maxRTO { p.minRTO = minRTO } else { return &tcpip.ErrInvalidOptionValue{} } return nil case *tcpip.TCPMaxRTOOption: p.mu.Lock() defer p.mu.Unlock() if *v < 0 { p.maxRTO = MaxRTO } else if maxRTO := time.Duration(*v); maxRTO >= p.minRTO { p.maxRTO = maxRTO } else { return &tcpip.ErrInvalidOptionValue{} } return nil case *tcpip.TCPMaxRetriesOption: p.mu.Lock() p.maxRetries = uint32(*v) p.mu.Unlock() return nil case *tcpip.TCPAlwaysUseSynCookies: p.mu.Lock() p.alwaysUseSynCookies = bool(*v) p.mu.Unlock() return nil case *tcpip.TCPSynRetriesOption: if *v < 1 || *v > 255 { return &tcpip.ErrInvalidOptionValue{} } p.mu.Lock() p.synRetries = uint8(*v) p.mu.Unlock() return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // Option implements stack.TransportProtocol.Option. 
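//
// As with SetOption, reads usually go through the stack. A minimal sketch,
// where s is assumed to be an already constructed *stack.Stack:
//
//	var sack tcpip.TCPSACKEnabled
//	if err := s.TransportProtocolOption(tcp.ProtocolNumber, &sack); err != nil {
//		// handle the tcpip.Error
//	}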
func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.TCPSACKEnabled: p.mu.RLock() *v = tcpip.TCPSACKEnabled(p.sackEnabled) p.mu.RUnlock() return nil case *tcpip.TCPRecovery: p.mu.RLock() *v = p.recovery p.mu.RUnlock() return nil case *tcpip.TCPDelayEnabled: p.mu.RLock() *v = tcpip.TCPDelayEnabled(p.delayEnabled) p.mu.RUnlock() return nil case *tcpip.TCPSendBufferSizeRangeOption: p.mu.RLock() *v = p.sendBufferSize p.mu.RUnlock() return nil case *tcpip.TCPReceiveBufferSizeRangeOption: p.mu.RLock() *v = p.recvBufferSize p.mu.RUnlock() return nil case *tcpip.CongestionControlOption: p.mu.RLock() *v = tcpip.CongestionControlOption(p.congestionControl) p.mu.RUnlock() return nil case *tcpip.TCPAvailableCongestionControlOption: p.mu.RLock() *v = tcpip.TCPAvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " ")) p.mu.RUnlock() return nil case *tcpip.TCPModerateReceiveBufferOption: p.mu.RLock() *v = tcpip.TCPModerateReceiveBufferOption(p.moderateReceiveBuffer) p.mu.RUnlock() return nil case *tcpip.TCPLingerTimeoutOption: p.mu.RLock() *v = tcpip.TCPLingerTimeoutOption(p.lingerTimeout) p.mu.RUnlock() return nil case *tcpip.TCPTimeWaitTimeoutOption: p.mu.RLock() *v = tcpip.TCPTimeWaitTimeoutOption(p.timeWaitTimeout) p.mu.RUnlock() return nil case *tcpip.TCPTimeWaitReuseOption: p.mu.RLock() *v = p.timeWaitReuse p.mu.RUnlock() return nil case *tcpip.TCPMinRTOOption: p.mu.RLock() *v = tcpip.TCPMinRTOOption(p.minRTO) p.mu.RUnlock() return nil case *tcpip.TCPMaxRTOOption: p.mu.RLock() *v = tcpip.TCPMaxRTOOption(p.maxRTO) p.mu.RUnlock() return nil case *tcpip.TCPMaxRetriesOption: p.mu.RLock() *v = tcpip.TCPMaxRetriesOption(p.maxRetries) p.mu.RUnlock() return nil case *tcpip.TCPAlwaysUseSynCookies: p.mu.RLock() *v = tcpip.TCPAlwaysUseSynCookies(p.alwaysUseSynCookies) p.mu.RUnlock() return nil case *tcpip.TCPSynRetriesOption: p.mu.RLock() *v = tcpip.TCPSynRetriesOption(p.synRetries) p.mu.RUnlock() return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // SendBufferSize implements stack.SendBufSizeProto. func (p *protocol) SendBufferSize() tcpip.TCPSendBufferSizeRangeOption { p.mu.RLock() defer p.mu.RUnlock() return p.sendBufferSize } // Close implements stack.TransportProtocol.Close. func (p *protocol) Close() { p.dispatcher.close() } // Wait implements stack.TransportProtocol.Wait. func (p *protocol) Wait() { p.dispatcher.wait() } // Pause implements stack.TransportProtocol.Pause. func (p *protocol) Pause() { p.dispatcher.pause() } // Resume implements stack.TransportProtocol.Resume. func (p *protocol) Resume() { p.dispatcher.resume() } // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { return parse.TCP(pkt) } // NewProtocol returns a TCP transport protocol with Reno congestion control. func NewProtocol(s *stack.Stack) stack.TransportProtocol { return newProtocol(s, ccReno) } // NewProtocolCUBIC returns a TCP transport protocol with CUBIC congestion // control. // // TODO(b/345835636): Remove this and make CUBIC the default across the board. 
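//
// Both NewProtocol and NewProtocolCUBIC satisfy stack.TransportProtocolFactory
// and are passed to stack.New when building a stack. A minimal sketch,
// assuming the ipv4 network protocol package is also imported:
//
//	s := stack.New(stack.Options{
//		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
//		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocolCUBIC},
//	})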
func NewProtocolCUBIC(s *stack.Stack) stack.TransportProtocol { return newProtocol(s, ccCubic) } func newProtocol(s *stack.Stack, cc string) stack.TransportProtocol { rng := s.SecureRNG() var seqnumSecret [16]byte var tsOffsetSecret [16]byte if n, err := rng.Reader.Read(seqnumSecret[:]); err != nil || n != len(seqnumSecret) { panic(fmt.Sprintf("Read() failed: %v", err)) } if n, err := rng.Reader.Read(tsOffsetSecret[:]); err != nil || n != len(tsOffsetSecret) { panic(fmt.Sprintf("Read() failed: %v", err)) } p := protocol{ stack: s, sendBufferSize: tcpip.TCPSendBufferSizeRangeOption{ Min: MinBufferSize, Default: DefaultSendBufferSize, Max: MaxBufferSize, }, recvBufferSize: tcpip.TCPReceiveBufferSizeRangeOption{ Min: MinBufferSize, Default: DefaultReceiveBufferSize, Max: MaxBufferSize, }, sackEnabled: true, congestionControl: cc, availableCongestionControl: []string{ccReno, ccCubic}, moderateReceiveBuffer: true, lingerTimeout: DefaultTCPLingerTimeout, timeWaitTimeout: DefaultTCPTimeWaitTimeout, timeWaitReuse: tcpip.TCPTimeWaitReuseLoopbackOnly, synRetries: DefaultSynRetries, minRTO: MinRTO, maxRTO: MaxRTO, maxRetries: MaxRetries, recovery: tcpip.TCPRACKLossDetection, seqnumSecret: seqnumSecret, tsOffsetSecret: tsOffsetSecret, } p.dispatcher.init(s.InsecureRNG(), runtime.GOMAXPROCS(0)) return &p } // protocolFromStack retrieves the tcp.protocol instance from stack s. func protocolFromStack(s *stack.Stack) *protocol { return s.TransportProtocolInstance(ProtocolNumber).(*protocol) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/rack.go000066400000000000000000000337161465435605700246460ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // wcDelayedACKTimeout is the recommended maximum delayed ACK timer // value as defined in the RFC. It stands for worst case delayed ACK // timer (WCDelAckT). When FlightSize is 1, PTO is inflated by // WCDelAckT time to compensate for a potential long delayed ACK timer // at the receiver. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5. wcDelayedACKTimeout = 200 * time.Millisecond // tcpRACKRecoveryThreshold is the number of loss recoveries for which // the reorder window is inflated and after that the reorder window is // reset to its initial value of minRTT/4. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2. tcpRACKRecoveryThreshold = 16 ) // RACK is a loss detection algorithm used in TCP to detect packet loss and // reordering using transmission timestamp of the packets instead of packet or // sequence counts. To use RACK, SACK should be enabled on the connection. // rackControl stores the rack related fields. 
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-6.1 // // +stateify savable type rackControl struct { stack.TCPRACKState // exitedRecovery indicates if the connection is exiting loss recovery. // This flag is set if the sender is leaving the recovery after // receiving an ACK and is reset during updating of reorder window. exitedRecovery bool // minRTT is the estimated minimum RTT of the connection. minRTT time.Duration // tlpRxtOut indicates whether there is an unacknowledged // TLP retransmission. tlpRxtOut bool // tlpHighRxt the value of sender.sndNxt at the time of sending // a TLP retransmission. tlpHighRxt seqnum.Value // snd is a reference to the sender. snd *sender } // init initializes RACK specific fields. func (rc *rackControl) init(snd *sender, iss seqnum.Value) { rc.FACK = iss rc.ReoWndIncr = 1 rc.snd = snd } // update will update the RACK related fields when an ACK has been received. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2 func (rc *rackControl) update(seg *segment, ackSeg *segment) { rtt := rc.snd.ep.stack.Clock().NowMonotonic().Sub(seg.xmitTime) // If the ACK is for a retransmitted packet, do not update if it is a // spurious inference which is determined by below checks: // 1. When Timestamping option is available, if the TSVal is less than // the transmit time of the most recent retransmitted packet. // 2. When RTT calculated for the packet is less than the smoothed RTT // for the connection. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // step 2 if seg.xmitCount > 1 { if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 { if ackSeg.parsedOptions.TSEcr < rc.snd.ep.tsVal(seg.xmitTime) { return } } if rtt < rc.minRTT { return } } rc.RTT = rtt // The sender can either track a simple global minimum of all RTT // measurements from the connection, or a windowed min-filtered value // of recent RTT measurements. This implementation keeps track of the // simple global minimum of all RTTs for the connection. if rtt < rc.minRTT || rc.minRTT == 0 { rc.minRTT = rtt } // Update rc.xmitTime and rc.endSequence to the transmit time and // ending sequence number of the packet which has been acknowledged // most recently. endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) if rc.XmitTime.Before(seg.xmitTime) || (seg.xmitTime == rc.XmitTime && rc.EndSequence.LessThan(endSeq)) { rc.XmitTime = seg.xmitTime rc.EndSequence = endSeq } } // detectReorder detects if packet reordering has been observed. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // - Step 3: Detect data segment reordering. // To detect reordering, the sender looks for original data segments being // delivered out of order. To detect such cases, the sender tracks the // highest sequence selectively or cumulatively acknowledged in the RACK.fack // variable. The name "fack" stands for the most "Forward ACK" (this term is // adopted from [FACK]). If a never retransmitted segment that's below // RACK.fack is (selectively or cumulatively) acknowledged, it has been // delivered out of order. The sender sets RACK.reord to TRUE if such segment // is identified. 
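//
// Worked example (illustrative numbers): with RACK.fack at 3000, an ACK that
// newly covers a segment ending at 2000 which was transmitted exactly once
// indicates out-of-order delivery and sets RACK.Reord, whereas an ACK covering
// a segment ending at 3500 simply advances RACK.fack to 3500.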
func (rc *rackControl) detectReorder(seg *segment) { endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) if rc.FACK.LessThan(endSeq) { rc.FACK = endSeq return } if endSeq.LessThan(rc.FACK) && seg.xmitCount == 1 { rc.Reord = true } } func (rc *rackControl) setDSACKSeen(dsackSeen bool) { rc.DSACKSeen = dsackSeen } // shouldSchedulePTO dictates whether we should schedule a PTO or not. // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. func (s *sender) shouldSchedulePTO() bool { // Schedule PTO only if RACK loss detection is enabled. return s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 && // The connection supports SACK. s.ep.SACKPermitted && // The connection is not in loss recovery. (s.state != tcpip.RTORecovery && s.state != tcpip.SACKRecovery) && // The connection has no SACKed sequences in the SACK scoreboard. s.ep.scoreboard.Sacked() == 0 } // schedulePTO schedules the probe timeout as defined in // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. func (s *sender) schedulePTO() { pto := time.Second s.rtt.Lock() if s.rtt.TCPRTTState.SRTTInited && s.rtt.TCPRTTState.SRTT > 0 { pto = s.rtt.TCPRTTState.SRTT * 2 if s.Outstanding == 1 { pto += wcDelayedACKTimeout } } s.rtt.Unlock() now := s.ep.stack.Clock().NowMonotonic() if s.resendTimer.enabled() { if now.Add(pto).After(s.resendTimer.target) { pto = s.resendTimer.target.Sub(now) } s.resendTimer.disable() } s.probeTimer.enable(pto) } // probeTimerExpired is the same as TLP_send_probe() as defined in // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.2. // // +checklocks:s.ep.mu func (s *sender) probeTimerExpired() tcpip.Error { if s.probeTimer.isUninitialized() || !s.probeTimer.checkExpiration() { return nil } var dataSent bool if s.writeNext != nil && s.writeNext.xmitCount == 0 && s.Outstanding < s.SndCwnd { dataSent = s.maybeSendSegment(s.writeNext, int(s.ep.scoreboard.SMSS()), s.SndUna.Add(s.SndWnd)) if dataSent { s.Outstanding += s.pCount(s.writeNext, s.MaxPayloadSize) s.updateWriteNext(s.writeNext.Next()) } } if !dataSent && !s.rc.tlpRxtOut { var highestSeqXmit *segment for highestSeqXmit = s.writeList.Front(); highestSeqXmit != nil; highestSeqXmit = highestSeqXmit.Next() { if highestSeqXmit.xmitCount == 0 { // Nothing in writeList is transmitted, no need to send a probe. highestSeqXmit = nil break } if highestSeqXmit.Next() == nil || highestSeqXmit.Next().xmitCount == 0 { // Either everything in writeList has been transmitted or the next // sequence has not been transmitted. Either way this is the highest // sequence segment that was transmitted. break } } if highestSeqXmit != nil { dataSent = s.maybeSendSegment(highestSeqXmit, int(s.ep.scoreboard.SMSS()), s.SndUna.Add(s.SndWnd)) if dataSent { s.rc.tlpRxtOut = true s.rc.tlpHighRxt = s.SndNxt } } } // Whether or not the probe was sent, the sender must arm the resend timer, // not the probe timer. This ensures that the sender does not send repeated, // back-to-back tail loss probes. s.postXmit(dataSent, false /* shouldScheduleProbe */) return nil } // detectTLPRecovery detects if recovery was accomplished by the loss probes // and updates TLP state accordingly. // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.3. func (s *sender) detectTLPRecovery(ack seqnum.Value, rcvdSeg *segment) { if !(s.ep.SACKPermitted && s.rc.tlpRxtOut) { return } // Step 1. 
if s.isDupAck(rcvdSeg) && ack == s.rc.tlpHighRxt { var sbAboveTLPHighRxt bool for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { if s.rc.tlpHighRxt.LessThan(sb.End) { sbAboveTLPHighRxt = true break } } if !sbAboveTLPHighRxt { // TLP episode is complete. s.rc.tlpRxtOut = false } } if s.rc.tlpRxtOut && s.rc.tlpHighRxt.LessThanEq(ack) { // TLP episode is complete. s.rc.tlpRxtOut = false if !checkDSACK(rcvdSeg) { // Step 2. Either the original packet or the retransmission (in the // form of a probe) was lost. Invoke a congestion control response // equivalent to fast recovery. s.cc.HandleLossDetected() s.enterRecovery() s.leaveRecovery() } } } // updateRACKReorderWindow updates the reorder window. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // - Step 4: Update RACK reordering window // To handle the prevalent small degree of reordering, RACK.reo_wnd serves as // an allowance for settling time before marking a packet lost. RACK starts // initially with a conservative window of min_RTT/4. If no reordering has // been observed RACK uses reo_wnd of zero during loss recovery, in order to // retransmit quickly, or when the number of DUPACKs exceeds the classic // DUPACKthreshold. func (rc *rackControl) updateRACKReorderWindow() { dsackSeen := rc.DSACKSeen snd := rc.snd // React to DSACK once per round trip. // If SND.UNA < RACK.rtt_seq: // RACK.dsack = false if snd.SndUna.LessThan(rc.RTTSeq) { dsackSeen = false } // If RACK.dsack: // RACK.reo_wnd_incr += 1 // RACK.dsack = false // RACK.rtt_seq = SND.NXT // RACK.reo_wnd_persist = 16 if dsackSeen { rc.ReoWndIncr++ dsackSeen = false rc.RTTSeq = snd.SndNxt rc.ReoWndPersist = tcpRACKRecoveryThreshold } else if rc.exitedRecovery { // Else if exiting loss recovery: // RACK.reo_wnd_persist -= 1 // If RACK.reo_wnd_persist <= 0: // RACK.reo_wnd_incr = 1 rc.ReoWndPersist-- if rc.ReoWndPersist <= 0 { rc.ReoWndIncr = 1 } rc.exitedRecovery = false } // Reorder window is zero during loss recovery, or when the number of // DUPACKs exceeds the classic DUPACKthreshold. // If RACK.reord is FALSE: // If in loss recovery: (If in fast or timeout recovery) // RACK.reo_wnd = 0 // Return // Else if RACK.pkts_sacked >= RACK.dupthresh: // RACK.reo_wnd = 0 // return if !rc.Reord { if snd.state == tcpip.RTORecovery || snd.state == tcpip.SACKRecovery { rc.ReoWnd = 0 return } if snd.SackedOut >= nDupAckThreshold { rc.ReoWnd = 0 return } } // Calculate reorder window. // RACK.reo_wnd = RACK.min_RTT / 4 * RACK.reo_wnd_incr // RACK.reo_wnd = min(RACK.reo_wnd, SRTT) snd.rtt.Lock() srtt := snd.rtt.TCPRTTState.SRTT snd.rtt.Unlock() rc.ReoWnd = time.Duration((int64(rc.minRTT) / 4) * int64(rc.ReoWndIncr)) if srtt < rc.ReoWnd { rc.ReoWnd = srtt } } func (rc *rackControl) exitRecovery() { rc.exitedRecovery = true } // detectLoss marks the segment as lost if the reordering window has elapsed // and the ACK is not received. It will also arm the reorder timer. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 Step 5. 
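//
// Worked example (illustrative numbers): with RACK.RTT = 200ms and
// RACK.reo_wnd = 50ms, an unacknowledged, un-SACKed segment transmitted 300ms
// before the most recent ACK has timeRemaining = -300ms + 200ms + 50ms = -50ms
// and is marked lost, while one transmitted 100ms ago has 150ms remaining and
// only contributes to arming the reorder timer.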
func (rc *rackControl) detectLoss(rcvTime tcpip.MonotonicTime) int { var timeout time.Duration numLost := 0 for seg := rc.snd.writeList.Front(); seg != nil && seg.xmitCount != 0; seg = seg.Next() { if rc.snd.ep.scoreboard.IsSACKED(seg.sackBlock()) { continue } if seg.lost && seg.xmitCount == 1 { numLost++ continue } endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) if seg.xmitTime.Before(rc.XmitTime) || (seg.xmitTime == rc.XmitTime && rc.EndSequence.LessThan(endSeq)) { timeRemaining := seg.xmitTime.Sub(rcvTime) + rc.RTT + rc.ReoWnd if timeRemaining <= 0 { seg.lost = true numLost++ } else if timeRemaining > timeout { timeout = timeRemaining } } } if timeout != 0 && !rc.snd.reorderTimer.enabled() { rc.snd.reorderTimer.enable(timeout) } return numLost } // reorderTimerExpired will retransmit the segments which have not been acked // before the reorder timer expired. // // +checklocks:rc.snd.ep.mu func (rc *rackControl) reorderTimerExpired() tcpip.Error { if rc.snd.reorderTimer.isUninitialized() || !rc.snd.reorderTimer.checkExpiration() { return nil } numLost := rc.detectLoss(rc.snd.ep.stack.Clock().NowMonotonic()) if numLost == 0 { return nil } fastRetransmit := false if !rc.snd.FastRecovery.Active { rc.snd.cc.HandleLossDetected() rc.snd.enterRecovery() fastRetransmit = true } rc.DoRecovery(nil, fastRetransmit) return nil } // DoRecovery implements lossRecovery.DoRecovery. // // +checklocks:rc.snd.ep.mu func (rc *rackControl) DoRecovery(_ *segment, fastRetransmit bool) { snd := rc.snd if fastRetransmit { snd.resendSegment() } var dataSent bool // Iterate the writeList and retransmit the segments which are marked // as lost by RACK. for seg := snd.writeList.Front(); seg != nil && seg.xmitCount > 0; seg = seg.Next() { if seg == snd.writeNext { break } if !seg.lost { continue } // Reset seg.lost as it is already SACKed. if snd.ep.scoreboard.IsSACKED(seg.sackBlock()) { seg.lost = false continue } // Check the congestion window after entering recovery. if snd.Outstanding >= snd.SndCwnd { break } if sent := snd.maybeSendSegment(seg, int(snd.ep.scoreboard.SMSS()), snd.SndUna.Add(snd.SndWnd)); !sent { break } dataSent = true snd.Outstanding += snd.pCount(seg, snd.MaxPayloadSize) } snd.postXmit(dataSent, true /* shouldScheduleProbe */) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/rcv.go000066400000000000000000000534051465435605700245150ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "container/heap" "math" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // receiver holds the state necessary to receive TCP segments and turn them // into a stream of bytes. // // +stateify savable type receiver struct { stack.TCPReceiverState ep *Endpoint // rcvWnd is the non-scaled receive window last advertised to the peer. rcvWnd seqnum.Size // rcvWUP is the RcvNxt value at the last window update sent. 
rcvWUP seqnum.Value // prevBufused is the snapshot of endpoint rcvBufUsed taken when we // advertise a receive window. prevBufUsed int closed bool // pendingRcvdSegments is bounded by the receive buffer size of the // endpoint. pendingRcvdSegments segmentHeap // Time when the last ack was received. lastRcvdAckTime tcpip.MonotonicTime } func newReceiver(ep *Endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8) *receiver { return &receiver{ ep: ep, TCPReceiverState: stack.TCPReceiverState{ RcvNxt: irs + 1, RcvAcc: irs.Add(rcvWnd + 1), RcvWndScale: rcvWndScale, }, rcvWnd: rcvWnd, rcvWUP: irs + 1, lastRcvdAckTime: ep.stack.Clock().NowMonotonic(), } } // acceptable checks if the segment sequence number range is acceptable // according to the table on page 26 of RFC 793. func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool { // r.rcvWnd could be much larger than the window size we advertised in our // outgoing packets, we should use what we have advertised for acceptability // test. scaledWindowSize := r.rcvWnd >> r.RcvWndScale if scaledWindowSize > math.MaxUint16 { // This is what we actually put in the Window field. scaledWindowSize = math.MaxUint16 } advertisedWindowSize := scaledWindowSize << r.RcvWndScale return header.Acceptable(segSeq, segLen, r.RcvNxt, r.RcvNxt.Add(advertisedWindowSize)) } // currentWindow returns the available space in the window that was advertised // last to our peer. func (r *receiver) currentWindow() (curWnd seqnum.Size) { endOfWnd := r.rcvWUP.Add(r.rcvWnd) if endOfWnd.LessThan(r.RcvNxt) { // return 0 if r.RcvNxt is past the end of the previously advertised window. // This can happen because we accept a large segment completely even if // accepting it causes it to partially exceed the advertised window. return 0 } return r.RcvNxt.Size(endOfWnd) } // getSendParams returns the parameters needed by the sender when building // segments to send. // +checklocks:r.ep.mu func (r *receiver) getSendParams() (RcvNxt seqnum.Value, rcvWnd seqnum.Size) { newWnd := r.ep.selectWindow() curWnd := r.currentWindow() unackLen := int(r.ep.snd.MaxSentAck.Size(r.RcvNxt)) bufUsed := r.ep.receiveBufferUsed() // Grow the right edge of the window only for payloads larger than the // the segment overhead OR if the application is actively consuming data. // // Avoiding growing the right edge otherwise, addresses a situation below: // An application has been slow in reading data and we have burst of // incoming segments lengths < segment overhead. Here, our available free // memory would reduce drastically when compared to the advertised receive // window. // // For example: With incoming 512 bytes segments, segment overhead of // 552 bytes (at the time of writing this comment), with receive window // starting from 1MB and with rcvAdvWndScale being 1, buffer would reach 0 // when the curWnd is still 19436 bytes, because for every incoming segment // newWnd would reduce by (552+512) >> rcvAdvWndScale (current value 1), // while curWnd would reduce by 512 bytes. // Such a situation causes us to keep tail dropping the incoming segments // and never advertise zero receive window to the peer. // // Linux does a similar check for minimal sk_buff size (128): // https://github.com/torvalds/linux/blob/d5beb3140f91b1c8a3d41b14d729aefa4dcc58bc/net/ipv4/tcp_input.c#L783 // // Also, if the application is reading the data, we keep growing the right // edge, as we are still advertising a window that we think can be serviced. 
toGrow := unackLen >= SegOverheadSize || bufUsed <= r.prevBufUsed // Update RcvAcc only if new window is > previously advertised window. We // should never shrink the acceptable sequence space once it has been // advertised the peer. If we shrink the acceptable sequence space then we // would end up dropping bytes that might already be in flight. // ==================================================== sequence space. // ^ ^ ^ ^ // rcvWUP RcvNxt RcvAcc new RcvAcc // <=====curWnd ===> // <========= newWnd > curWnd ========= > if r.RcvNxt.Add(curWnd).LessThan(r.RcvNxt.Add(newWnd)) && toGrow { // If the new window moves the right edge, then update RcvAcc. r.RcvAcc = r.RcvNxt.Add(newWnd) } else { if newWnd == 0 { // newWnd is zero but we can't advertise a zero as it would cause window // to shrink so just increment a metric to record this event. r.ep.stats.ReceiveErrors.WantZeroRcvWindow.Increment() } newWnd = curWnd } // Apply silly-window avoidance when recovering from zero-window situation. // Keep advertising zero receive window up until the new window reaches a // threshold. if r.rcvWnd == 0 && newWnd != 0 { r.ep.rcvQueueMu.Lock() if crossed, above := r.ep.windowCrossedACKThresholdLocked(int(newWnd), int(r.ep.ops.GetReceiveBufferSize())); !crossed && !above { newWnd = 0 } r.ep.rcvQueueMu.Unlock() } // Stash away the non-scaled receive window as we use it for measuring // receiver's estimated RTT. r.rcvWnd = newWnd r.rcvWUP = r.RcvNxt r.prevBufUsed = bufUsed scaledWnd := r.rcvWnd >> r.RcvWndScale if scaledWnd == 0 { // Increment a metric if we are advertising an actual zero window. r.ep.stats.ReceiveErrors.ZeroRcvWindowState.Increment() } // If we started off with a window larger than what can he held in // the 16bit window field, we ceil the value to the max value. if scaledWnd > math.MaxUint16 { scaledWnd = seqnum.Size(math.MaxUint16) // Ensure that the stashed receive window always reflects what // is being advertised. r.rcvWnd = scaledWnd << r.RcvWndScale } return r.RcvNxt, scaledWnd } // nonZeroWindow is called when the receive window grows from zero to nonzero; // in such cases we may need to send an ack to indicate to our peer that it can // resume sending data. // +checklocks:r.ep.mu // +checklocksalias:r.ep.snd.ep.mu=r.ep.mu func (r *receiver) nonZeroWindow() { // Immediately send an ack. r.ep.snd.sendAck() } // consumeSegment attempts to consume a segment that was received by r. The // segment may have just been received or may have been received earlier but // wasn't ready to be consumed then. // // Returns true if the segment was consumed, false if it cannot be consumed // yet because of a missing segment. // +checklocks:r.ep.mu // +checklocksalias:r.ep.snd.ep.mu=r.ep.mu func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum.Size) bool { if segLen > 0 { // If the segment doesn't include the seqnum we're expecting to // consume now, we're missing a segment. We cannot proceed until // we receive that segment though. if !r.RcvNxt.InWindow(segSeq, segLen) { return false } // Trim segment to eliminate already acknowledged data. if segSeq.LessThan(r.RcvNxt) { diff := segSeq.Size(r.RcvNxt) segLen -= diff segSeq.UpdateForward(diff) s.sequenceNumber.UpdateForward(diff) s.TrimFront(diff) } // Move segment to ready-to-deliver list. Wakeup any waiters. r.ep.readyToRead(s) } else if segSeq != r.RcvNxt { return false } // Update the segment that we're expecting to consume. 
r.RcvNxt = segSeq.Add(segLen) // In cases of a misbehaving sender which could send more than the // advertised window, we could end up in a situation where we get a // segment that exceeds the window advertised. Instead of partially // accepting the segment and discarding bytes beyond the advertised // window, we accept the whole segment and make sure r.RcvAcc is moved // forward to match r.RcvNxt to indicate that the window is now closed. // // In absence of this check the r.acceptable() check fails and accepts // segments that should be dropped because rcvWnd is calculated as // the size of the interval (RcvNxt, RcvAcc] which becomes extremely // large if RcvAcc is ever less than RcvNxt. if r.RcvAcc.LessThan(r.RcvNxt) { r.RcvAcc = r.RcvNxt } // Trim SACK Blocks to remove any SACK information that covers // sequence numbers that have been consumed. TrimSACKBlockList(&r.ep.sack, r.RcvNxt) // Handle FIN or FIN-ACK. if s.flags.Contains(header.TCPFlagFin) { r.RcvNxt++ // Send ACK immediately. r.ep.snd.sendAck() // Tell any readers that no more data will come. r.closed = true r.ep.readyToRead(nil) // We just received a FIN, our next state depends on whether we sent a // FIN already or not. switch r.ep.EndpointState() { case StateEstablished: r.ep.setEndpointState(StateCloseWait) case StateFinWait1: if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == r.ep.snd.SndNxt { // FIN-ACK, transition to TIME-WAIT. r.ep.setEndpointState(StateTimeWait) } else { // Simultaneous close, expecting a final ACK. r.ep.setEndpointState(StateClosing) } case StateFinWait2: r.ep.setEndpointState(StateTimeWait) } // Flush out any pending segments, except the very first one if // it happens to be the one we're handling now because the // caller is using it. first := 0 if len(r.pendingRcvdSegments) != 0 && r.pendingRcvdSegments[0] == s { first = 1 } for i := first; i < len(r.pendingRcvdSegments); i++ { r.PendingBufUsed -= r.pendingRcvdSegments[i].segMemSize() r.pendingRcvdSegments[i].DecRef() // Note that slice truncation does not allow garbage // collection of truncated items, thus truncated items // must be set to nil to avoid memory leaks. r.pendingRcvdSegments[i] = nil } r.pendingRcvdSegments = r.pendingRcvdSegments[:first] r.ep.updateConnDirectionState(connDirectionStateRcvClosed) return true } // Handle ACK (not FIN-ACK, which we handled above) during one of the // shutdown states. if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == r.ep.snd.SndNxt { switch r.ep.EndpointState() { case StateFinWait1: r.ep.setEndpointState(StateFinWait2) if e := r.ep; e.closed { // The socket has been closed and we are in // FIN-WAIT-2 so start the FIN-WAIT-2 timer. e.finWait2Timer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, e.finWait2TimerExpired) } case StateClosing: r.ep.setEndpointState(StateTimeWait) case StateLastAck: r.ep.transitionToStateCloseLocked() } } return true } // updateRTT updates the receiver RTT measurement based on the sequence number // of the received segment. func (r *receiver) updateRTT() { // From: https://public.lanl.gov/radiant/pubs/drs/sc2001-poster.pdf // // A system that is only transmitting acknowledgements can still // estimate the round-trip time by observing the time between when a byte // is first acknowledged and the receipt of data that is at least one // window beyond the sequence number that was acknowledged. r.ep.rcvQueueMu.Lock() if r.ep.RcvAutoParams.RTTMeasureTime == (tcpip.MonotonicTime{}) { // New measurement. 
r.ep.RcvAutoParams.RTTMeasureTime = r.ep.stack.Clock().NowMonotonic() r.ep.RcvAutoParams.RTTMeasureSeqNumber = r.RcvNxt.Add(r.rcvWnd) r.ep.rcvQueueMu.Unlock() return } if r.RcvNxt.LessThan(r.ep.RcvAutoParams.RTTMeasureSeqNumber) { r.ep.rcvQueueMu.Unlock() return } rtt := r.ep.stack.Clock().NowMonotonic().Sub(r.ep.RcvAutoParams.RTTMeasureTime) // We only store the minimum observed RTT here as this is only used in // absence of a SRTT available from either timestamps or a sender // measurement of RTT. if r.ep.RcvAutoParams.RTT == 0 || rtt < r.ep.RcvAutoParams.RTT { r.ep.RcvAutoParams.RTT = rtt } r.ep.RcvAutoParams.RTTMeasureTime = r.ep.stack.Clock().NowMonotonic() r.ep.RcvAutoParams.RTTMeasureSeqNumber = r.RcvNxt.Add(r.rcvWnd) r.ep.rcvQueueMu.Unlock() } // +checklocks:r.ep.mu // +checklocksalias:r.ep.snd.ep.mu=r.ep.mu func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, closed bool) (drop bool, err tcpip.Error) { r.ep.rcvQueueMu.Lock() rcvClosed := r.ep.RcvClosed || r.closed r.ep.rcvQueueMu.Unlock() // If we are in one of the shutdown states then we need to do // additional checks before we try and process the segment. switch state { case StateCloseWait, StateClosing, StateLastAck: if !s.sequenceNumber.LessThanEq(r.RcvNxt) { // Just drop the segment as we have // already received a FIN and this // segment is after the sequence number // for the FIN. return true, nil } fallthrough case StateFinWait1, StateFinWait2: // If the ACK acks something not yet sent then we send an ACK. // // RFC793, page 37: If the connection is in a synchronized state, // (ESTABLISHED, FIN-WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, // TIME-WAIT), any unacceptable segment (out of window sequence number // or unacceptable acknowledgment number) must elicit only an empty // acknowledgment segment containing the current send-sequence number // and an acknowledgment indicating the next sequence number expected // to be received, and the connection remains in the same state. // // Just as on Linux, we do not apply this behavior when state is // ESTABLISHED. // Linux receive processing for all states except ESTABLISHED and // TIME_WAIT is here where if the ACK check fails, we attempt to // reply back with an ACK with correct seq/ack numbers. // https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L6186 // The ESTABLISHED state processing is here where if the ACK check // fails, we ignore the packet: // https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L5591 if r.ep.snd.SndNxt.LessThan(s.ackNumber) { r.ep.snd.maybeSendOutOfWindowAck(s) return true, nil } // If we are closed for reads (either due to an // incoming FIN or the user calling shutdown(.., // SHUT_RD) then any data past the RcvNxt should // trigger a RST. endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.payloadSize())) if state != StateCloseWait && rcvClosed && r.RcvNxt.LessThan(endDataSeq) { return true, &tcpip.ErrConnectionAborted{} } if state == StateFinWait1 { break } // If it's a retransmission of an old data segment // or a pure ACK then allow it. if s.sequenceNumber.Add(s.logicalLen()).LessThanEq(r.RcvNxt) || s.logicalLen() == 0 { break } // In FIN-WAIT2 if the socket is fully // closed(not owned by application on our end // then the only acceptable segment is a // FIN. Since FIN can technically also carry // data we verify that the segment carrying a // FIN ends at exactly e.RcvNxt+1. // // From RFC793 page 25. 
// // For sequence number purposes, the SYN is // considered to occur before the first actual // data octet of the segment in which it occurs, // while the FIN is considered to occur after // the last actual data octet in a segment in // which it occurs. if closed && (!s.flags.Contains(header.TCPFlagFin) || s.sequenceNumber.Add(s.logicalLen()) != r.RcvNxt+1) { return true, &tcpip.ErrConnectionAborted{} } } // We don't care about receive processing anymore if the receive side // is closed. // // NOTE: We still want to permit a FIN as it's possible only our // end has closed and the peer is yet to send a FIN. Hence we // compare only the payload. segEnd := s.sequenceNumber.Add(seqnum.Size(s.payloadSize())) if rcvClosed && !segEnd.LessThanEq(r.RcvNxt) { return true, nil } return false, nil } // handleRcvdSegment handles TCP segments directed at the connection managed by // r as they arrive. It is called by the protocol main loop. // +checklocks:r.ep.mu // +checklocksalias:r.ep.snd.ep.mu=r.ep.mu func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err tcpip.Error) { state := r.ep.EndpointState() closed := r.ep.closed segLen := seqnum.Size(s.payloadSize()) segSeq := s.sequenceNumber // If the sequence number range is outside the acceptable range, just // send an ACK and stop further processing of the segment. // This is according to RFC 793, page 68. if !r.acceptable(segSeq, segLen) { r.ep.snd.maybeSendOutOfWindowAck(s) return true, nil } if state != StateEstablished { drop, err := r.handleRcvdSegmentClosing(s, state, closed) if drop || err != nil { return drop, err } } // Store the time of the last ack. r.lastRcvdAckTime = r.ep.stack.Clock().NowMonotonic() // Defer segment processing if it can't be consumed now. if !r.consumeSegment(s, segSeq, segLen) { if segLen > 0 || s.flags.Contains(header.TCPFlagFin) { // We only store the segment if it's within our buffer // size limit. // // Only use 75% of the receive buffer queue for // out-of-order segments. This ensures that we always // leave some space for the inorder segments to arrive // allowing pending segments to be processed and // delivered to the user. // // The ratio must be at least 50% (the size of rwnd) to // leave space for retransmitted dropped packets. 51% // would make recovery slow when there are multiple // drops by necessitating multiple round trips. 100% // would enable the buffer to be totally full of // out-of-order data and stall the connection. // // An ideal solution is to ensure that there are at // least N bytes free when N bytes are missing, but we // don't have that computed at this point in the stack. if rcvBufSize := r.ep.ops.GetReceiveBufferSize(); rcvBufSize > 0 && (r.PendingBufUsed+int(segLen)) < int(rcvBufSize-rcvBufSize/4) { r.ep.rcvQueueMu.Lock() r.PendingBufUsed += s.segMemSize() r.ep.rcvQueueMu.Unlock() s.IncRef() heap.Push(&r.pendingRcvdSegments, s) UpdateSACKBlocks(&r.ep.sack, segSeq, segSeq.Add(segLen), r.RcvNxt) } // Immediately send an ack so that the peer knows it may // have to retransmit. r.ep.snd.sendAck() } return false, nil } // Since we consumed a segment update the receiver's RTT estimate // if required. if segLen > 0 { r.updateRTT() } // By consuming the current segment, we may have filled a gap in the // sequence number domain that allows pending segments to be consumed // now. So try to do it. 
for !r.closed && r.pendingRcvdSegments.Len() > 0 { s := r.pendingRcvdSegments[0] segLen := seqnum.Size(s.payloadSize()) segSeq := s.sequenceNumber // Skip segment altogether if it has already been acknowledged. if !segSeq.Add(segLen-1).LessThan(r.RcvNxt) && !r.consumeSegment(s, segSeq, segLen) { break } heap.Pop(&r.pendingRcvdSegments) r.ep.rcvQueueMu.Lock() r.PendingBufUsed -= s.segMemSize() r.ep.rcvQueueMu.Unlock() s.DecRef() } return false, nil } // handleTimeWaitSegment handles inbound segments received when the endpoint // has entered the TIME_WAIT state. // +checklocks:r.ep.mu // +checklocksalias:r.ep.snd.ep.mu=r.ep.mu func (r *receiver) handleTimeWaitSegment(s *segment) (resetTimeWait bool, newSyn bool) { segSeq := s.sequenceNumber segLen := seqnum.Size(s.payloadSize()) // Just silently drop any RST packets in TIME_WAIT. We do not support // TIME_WAIT assassination as a result we confirm w/ fix 1 as described // in https://tools.ietf.org/html/rfc1337#section-3. // // This behavior overrides RFC793 page 70 where we transition to CLOSED // on receiving RST, which is also default Linux behavior. // On Linux the RST can be ignored by setting sysctl net.ipv4.tcp_rfc1337. // // As we do not yet support PAWS, we are being conservative in ignoring // RSTs by default. if s.flags.Contains(header.TCPFlagRst) { return false, false } // If it's a SYN and the sequence number is higher than any seen before // for this connection then try and redirect it to a listening endpoint // if available. // // RFC 1122: // "When a connection is [...] on TIME-WAIT state [...] // [a TCP] MAY accept a new SYN from the remote TCP to // reopen the connection directly, if it: // (1) assigns its initial sequence number for the new // connection to be larger than the largest sequence // number it used on the previous connection incarnation, // and // (2) returns to TIME-WAIT state if the SYN turns out // to be an old duplicate". if s.flags.Contains(header.TCPFlagSyn) && r.RcvNxt.LessThan(segSeq) { return false, true } // Drop the segment if it does not contain an ACK. if !s.flags.Contains(header.TCPFlagAck) { return false, false } // Update Timestamp if required. See RFC7323, section-4.3. if r.ep.SendTSOk && s.parsedOptions.TS { r.ep.updateRecentTimestamp(s.parsedOptions.TSVal, r.ep.snd.MaxSentAck, segSeq) } if segSeq.Add(1) == r.RcvNxt && s.flags.Contains(header.TCPFlagFin) { // If it's a FIN-ACK then resetTimeWait and send an ACK, as it // indicates our final ACK could have been lost. r.ep.snd.sendAck() return true, false } // If the sequence number range is outside the acceptable range or // carries data then just send an ACK. This is according to RFC 793, // page 37. // // NOTE: In TIME_WAIT the only acceptable sequence number is RcvNxt. if segSeq != r.RcvNxt || segLen != 0 { r.ep.snd.sendAck() } return false, false } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/reno.go000066400000000000000000000064701465435605700246660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "time" ) // renoState stores the variables related to TCP New Reno congestion // control algorithm. // // +stateify savable type renoState struct { s *sender } // newRenoCC initializes the state for the NewReno congestion control algorithm. func newRenoCC(s *sender) *renoState { return &renoState{s: s} } // updateSlowStart will update the congestion window as per the slow-start // algorithm used by NewReno. If after adjusting the congestion window // we cross the SSthreshold then it will return the number of packets that // must be consumed in congestion avoidance mode. func (r *renoState) updateSlowStart(packetsAcked int) int { // Don't let the congestion window cross into the congestion // avoidance range. newcwnd := r.s.SndCwnd + packetsAcked if newcwnd >= r.s.Ssthresh { newcwnd = r.s.Ssthresh r.s.SndCAAckCount = 0 } packetsAcked -= newcwnd - r.s.SndCwnd r.s.SndCwnd = newcwnd return packetsAcked } // updateCongestionAvoidance will update congestion window in congestion // avoidance mode as described in RFC5681 section 3.1 func (r *renoState) updateCongestionAvoidance(packetsAcked int) { // Consume the packets in congestion avoidance mode. r.s.SndCAAckCount += packetsAcked if r.s.SndCAAckCount >= r.s.SndCwnd { r.s.SndCwnd += r.s.SndCAAckCount / r.s.SndCwnd r.s.SndCAAckCount = r.s.SndCAAckCount % r.s.SndCwnd } } // reduceSlowStartThreshold reduces the slow-start threshold per RFC 5681, // page 6, eq. 4. It is called when we detect congestion in the network. func (r *renoState) reduceSlowStartThreshold() { r.s.Ssthresh = r.s.Outstanding / 2 if r.s.Ssthresh < 2 { r.s.Ssthresh = 2 } } // Update updates the congestion state based on the number of packets that // were acknowledged. // Update implements congestionControl.Update. func (r *renoState) Update(packetsAcked int, _ time.Duration) { if r.s.SndCwnd < r.s.Ssthresh { packetsAcked = r.updateSlowStart(packetsAcked) if packetsAcked == 0 { return } } r.updateCongestionAvoidance(packetsAcked) } // HandleLossDetected implements congestionControl.HandleLossDetected. func (r *renoState) HandleLossDetected() { // A retransmit was triggered due to nDupAckThreshold or when RACK // detected loss. Reduce our slow start threshold. r.reduceSlowStartThreshold() } // HandleRTOExpired implements congestionControl.HandleRTOExpired. func (r *renoState) HandleRTOExpired() { // We lost a packet, so reduce ssthresh. r.reduceSlowStartThreshold() // Reduce the congestion window to 1, i.e., enter slow-start. Per // RFC 5681, page 7, we must use 1 regardless of the value of the // initial congestion window. r.s.SndCwnd = 1 } // PostRecovery implements congestionControl.PostRecovery. func (r *renoState) PostRecovery() { // noop. } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/reno_recovery.go000066400000000000000000000040501465435605700265740ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package tcp // renoRecovery stores the variables related to TCP Reno loss recovery // algorithm. // // +stateify savable type renoRecovery struct { s *sender } func newRenoRecovery(s *sender) *renoRecovery { return &renoRecovery{s: s} } // +checklocks:rr.s.ep.mu func (rr *renoRecovery) DoRecovery(rcvdSeg *segment, fastRetransmit bool) { ack := rcvdSeg.ackNumber snd := rr.s // We are in fast recovery mode. Ignore the ack if it's out of range. if !ack.InRange(snd.SndUna, snd.SndNxt+1) { return } // Don't count this as a duplicate if it is carrying data or // updating the window. if rcvdSeg.logicalLen() != 0 || snd.SndWnd != rcvdSeg.window { return } // Inflate the congestion window if we're getting duplicate acks // for the packet we retransmitted. if !fastRetransmit && ack == snd.FastRecovery.First { // We received a dup, inflate the congestion window by 1 packet // if we're not at the max yet. Only inflate the window if // regular FastRecovery is in use, RFC6675 does not require // inflating cwnd on duplicate ACKs. if snd.SndCwnd < snd.FastRecovery.MaxCwnd { snd.SndCwnd++ } return } // A partial ack was received. Retransmit this packet and remember it // so that we don't retransmit it again. // // We don't inflate the window because we're putting the same packet // back onto the wire. // // N.B. The retransmit timer will be reset by the caller. snd.FastRecovery.First = ack snd.DupAckCount = 0 snd.resendSegment() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/sack.go000066400000000000000000000060211465435605700246340ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" ) const ( // MaxSACKBlocks is the maximum number of SACK blocks stored // at receiver side. MaxSACKBlocks = 6 ) // UpdateSACKBlocks updates the list of SACK blocks to include the segment // specified by segStart->segEnd. If the segment happens to be an out of order // delivery then the first block in the sack.blocks always includes the // segment identified by segStart->segEnd. func UpdateSACKBlocks(sack *SACKInfo, segStart seqnum.Value, segEnd seqnum.Value, rcvNxt seqnum.Value) { newSB := header.SACKBlock{Start: segStart, End: segEnd} // Ignore any invalid SACK blocks or blocks that are before rcvNxt as // those bytes have already been acked. if newSB.End.LessThanEq(newSB.Start) || newSB.End.LessThan(rcvNxt) { return } if sack.NumBlocks == 0 { sack.Blocks[0] = newSB sack.NumBlocks = 1 return } var n = 0 for i := 0; i < sack.NumBlocks; i++ { start, end := sack.Blocks[i].Start, sack.Blocks[i].End if end.LessThanEq(rcvNxt) { // Discard any sack blocks that are before rcvNxt as // those have already been acked. continue } if newSB.Start.LessThanEq(end) && start.LessThanEq(newSB.End) { // Merge this SACK block into newSB and discard this SACK // block. 
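// Illustrative sketch (not from the original source): SACK blocks here are
// half-open ranges [Start, End), so two blocks overlap or are adjacent
// exactly when aStart <= bEnd && bStart <= aEnd, and merging keeps the
// outermost bounds. Using plain integers, hypothetical names, and the Go 1.21
// min/max builtins:
//
//	type block struct{ start, end int }
//	a, b := block{5, 10}, block{10, 15}       // adjacent blocks
//	if a.start <= b.end && b.start <= a.end { // same test as the code above
//		merged := block{min(a.start, b.start), max(a.end, b.end)} // {5, 15}
//		_ = merged
//	}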
if start.LessThan(newSB.Start) { newSB.Start = start } if newSB.End.LessThan(end) { newSB.End = end } } else { // Save this block. sack.Blocks[n] = sack.Blocks[i] n++ } } if rcvNxt.LessThan(newSB.Start) { // If this was an out of order segment then make sure that the // first SACK block is the one that includes the segment. // // See the first bullet point in // https://tools.ietf.org/html/rfc2018#section-4 if n == MaxSACKBlocks { // If the number of SACK blocks is equal to // MaxSACKBlocks then discard the last SACK block. n-- } for i := n - 1; i >= 0; i-- { sack.Blocks[i+1] = sack.Blocks[i] } sack.Blocks[0] = newSB n++ } sack.NumBlocks = n } // TrimSACKBlockList updates the sack block list by removing/modifying any block // where start is < rcvNxt. func TrimSACKBlockList(sack *SACKInfo, rcvNxt seqnum.Value) { n := 0 for i := 0; i < sack.NumBlocks; i++ { if sack.Blocks[i].End.LessThanEq(rcvNxt) { continue } if sack.Blocks[i].Start.LessThan(rcvNxt) { // Shrink this SACK block. sack.Blocks[i].Start = rcvNxt } sack.Blocks[n] = sack.Blocks[i] n++ } sack.NumBlocks = n } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/sack_recovery.go000066400000000000000000000074101465435605700265550ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import "gvisor.dev/gvisor/pkg/tcpip/seqnum" // sackRecovery stores the variables related to TCP SACK loss recovery // algorithm. // // +stateify savable type sackRecovery struct { s *sender } func newSACKRecovery(s *sender) *sackRecovery { return &sackRecovery{s: s} } // handleSACKRecovery implements the loss recovery phase as described in RFC6675 // section 5, step C. // +checklocks:sr.s.ep.mu func (sr *sackRecovery) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) { snd := sr.s snd.SetPipe() if smss := int(snd.ep.scoreboard.SMSS()); limit > smss { // Cap segment size limit to s.smss as SACK recovery requires // that all retransmissions or new segments send during recovery // be of <= SMSS. limit = smss } nextSegHint := snd.writeList.Front() for snd.Outstanding < snd.SndCwnd { var nextSeg *segment var rescueRtx bool nextSeg, nextSegHint, rescueRtx = snd.NextSeg(nextSegHint) if nextSeg == nil { return dataSent } if !snd.isAssignedSequenceNumber(nextSeg) || snd.SndNxt.LessThanEq(nextSeg.sequenceNumber) { // New data being sent. // Step C.3 described below is handled by // maybeSendSegment which increments sndNxt when // a segment is transmitted. // // Step C.3 "If any of the data octets sent in // (C.1) are above HighData, HighData must be // updated to reflect the transmission of // previously unsent data." // // We pass s.smss as the limit as the Step 2) requires that // new data sent should be of size s.smss or less. 
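// Illustrative sketch (not part of the original source): the enclosing
// recovery loop is essentially "while pipe < cwnd, pick the next segment per
// RFC 6675 NextSeg() and transmit it", with the hint avoiding repeated scans
// of the write list. With hypothetical helpers:
//
//	for outstanding < cwnd {
//		seg, hint, rescue := nextSeg(hint) // RFC 6675 rules (1)-(4)
//		if seg == nil {
//			break // nothing is eligible to send
//		}
//		transmit(seg, rescue)
//		outstanding++ // RFC 6675 step (C.4): account for the new in-flight data
//	}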
if sent := snd.maybeSendSegment(nextSeg, limit, end); !sent { return dataSent } dataSent = true snd.Outstanding++ snd.updateWriteNext(nextSeg.Next()) continue } // Now handle the retransmission case where we matched either step 1,3 or 4 // of the NextSeg algorithm. // RFC 6675, Step C.4. // // "The estimate of the amount of data outstanding in the network // must be updated by incrementing pipe by the number of octets // transmitted in (C.1)." snd.Outstanding++ dataSent = true snd.sendSegment(nextSeg) segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen()) if rescueRtx { // We do the last part of rule (4) of NextSeg here to update // RescueRxt as until this point we don't know if we are going // to use the rescue transmission. snd.FastRecovery.RescueRxt = snd.FastRecovery.Last } else { // RFC 6675, Step C.2 // // "If any of the data octets sent in (C.1) are below // HighData, HighRxt MUST be set to the highest sequence // number of the retransmitted segment unless NextSeg () // rule (4) was invoked for this retransmission." snd.FastRecovery.HighRxt = segEnd - 1 } } return dataSent } // +checklocks:sr.s.ep.mu func (sr *sackRecovery) DoRecovery(rcvdSeg *segment, fastRetransmit bool) { snd := sr.s if fastRetransmit { snd.resendSegment() } // We are in fast recovery mode. Ignore the ack if it's out of range. if ack := rcvdSeg.ackNumber; !ack.InRange(snd.SndUna, snd.SndNxt+1) { return } // RFC 6675 recovery algorithm step C 1-5. end := snd.SndUna.Add(snd.SndWnd) dataSent := sr.handleSACKRecovery(snd.MaxPayloadSize, end) snd.postXmit(dataSent, true /* shouldScheduleProbe */) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/sack_scoreboard.go000066400000000000000000000206361465435605700270470ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "fmt" "strings" "github.com/google/btree" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" ) const ( // maxSACKBlocks is the maximum number of distinct SACKBlocks the // scoreboard will track. Once there are 100 distinct blocks, new // insertions will fail. maxSACKBlocks = 100 // defaultBtreeDegree is set to 2 as btree.New(2) results in a 2-3-4 // tree. defaultBtreeDegree = 2 ) // SACKScoreboard stores a set of disjoint SACK ranges. // // +stateify savable type SACKScoreboard struct { // smss is defined in RFC5681 as following: // // The SMSS is the size of the largest segment that the sender can // transmit. This value can be based on the maximum transmission unit // of the network, the path MTU discovery [RFC1191, RFC4821] algorithm, // RMSS (see next item), or other factors. The size does not include // the TCP/IP headers and options. smss uint16 maxSACKED seqnum.Value sacked seqnum.Size `state:"nosave"` ranges *btree.BTree `state:"nosave"` } // NewSACKScoreboard returns a new SACK Scoreboard. 
func NewSACKScoreboard(smss uint16, iss seqnum.Value) *SACKScoreboard { return &SACKScoreboard{ smss: smss, ranges: btree.New(defaultBtreeDegree), maxSACKED: iss, } } // Reset erases all known range information from the SACK scoreboard. func (s *SACKScoreboard) Reset() { s.ranges = btree.New(defaultBtreeDegree) s.sacked = 0 } // Insert inserts/merges the provided SACKBlock into the scoreboard. func (s *SACKScoreboard) Insert(r header.SACKBlock) { if s.ranges.Len() >= maxSACKBlocks { return } // Check if we can merge the new range with a range before or after it. var toDelete []btree.Item if s.maxSACKED.LessThan(r.End - 1) { s.maxSACKED = r.End - 1 } s.ranges.AscendGreaterOrEqual(r, func(i btree.Item) bool { if i == r { return true } sacked := i.(header.SACKBlock) // There is a hole between these two SACK blocks, so we can't // merge anymore. if r.End.LessThan(sacked.Start) { return false } // There is some overlap at this point, merge the blocks and // delete the other one. // // ----sS--------sE // r.S---------------rE // -------sE if sacked.End.LessThan(r.End) { // sacked is contained in the newly inserted range. // Delete this block. toDelete = append(toDelete, i) return true } // sacked covers a range past end of the newly inserted // block. r.End = sacked.End toDelete = append(toDelete, i) return true }) s.ranges.DescendLessOrEqual(r, func(i btree.Item) bool { if i == r { return true } sacked := i.(header.SACKBlock) // sA------sE // rA----rE if sacked.End.LessThan(r.Start) { return false } // The previous range extends into the current block. Merge it // into the newly inserted range and delete the other one. // // <-rA---rE----<---rE---> // sA--------------sE r.Start = sacked.Start // Extend r to cover sacked if sacked extends past r. if r.End.LessThan(sacked.End) { r.End = sacked.End } toDelete = append(toDelete, i) return true }) for _, i := range toDelete { if sb := s.ranges.Delete(i); sb != nil { sb := i.(header.SACKBlock) s.sacked -= sb.Start.Size(sb.End) } } replaced := s.ranges.ReplaceOrInsert(r) if replaced == nil { s.sacked += r.Start.Size(r.End) } } // IsSACKED returns true if the a given range of sequence numbers denoted by r // are already covered by SACK information in the scoreboard. func (s *SACKScoreboard) IsSACKED(r header.SACKBlock) bool { if s.Empty() { return false } found := false s.ranges.DescendLessOrEqual(r, func(i btree.Item) bool { sacked := i.(header.SACKBlock) if sacked.End.LessThan(r.Start) { return false } if sacked.Contains(r) { found = true return false } return true }) return found } // String returns human-readable state of the scoreboard structure. func (s *SACKScoreboard) String() string { var str strings.Builder str.WriteString("SACKScoreboard: {") s.ranges.Ascend(func(i btree.Item) bool { str.WriteString(fmt.Sprintf("%v,", i)) return true }) str.WriteString("}\n") return str.String() } // Delete removes all SACK information prior to seq. 
func (s *SACKScoreboard) Delete(seq seqnum.Value) { if s.Empty() { return } toDelete := []btree.Item{} toInsert := []btree.Item{} r := header.SACKBlock{seq, seq.Add(1)} s.ranges.DescendLessOrEqual(r, func(i btree.Item) bool { if i == r { return true } sb := i.(header.SACKBlock) toDelete = append(toDelete, i) if sb.End.LessThanEq(seq) { s.sacked -= sb.Start.Size(sb.End) } else { newSB := header.SACKBlock{seq, sb.End} toInsert = append(toInsert, newSB) s.sacked -= sb.Start.Size(seq) } return true }) for _, sb := range toDelete { s.ranges.Delete(sb) } for _, sb := range toInsert { s.ranges.ReplaceOrInsert(sb) } } // Copy provides a copy of the SACK scoreboard. func (s *SACKScoreboard) Copy() (sackBlocks []header.SACKBlock, maxSACKED seqnum.Value) { s.ranges.Ascend(func(i btree.Item) bool { sackBlocks = append(sackBlocks, i.(header.SACKBlock)) return true }) return sackBlocks, s.maxSACKED } // IsRangeLost implements the IsLost(SeqNum) operation defined in RFC 6675 // section 4 but operates on a range of sequence numbers and returns true if // there are at least nDupAckThreshold SACK blocks greater than the range being // checked or if at least (nDupAckThreshold-1)*s.smss bytes have been SACKED // with sequence numbers greater than the block being checked. func (s *SACKScoreboard) IsRangeLost(r header.SACKBlock) bool { if s.Empty() { return false } nDupSACK := 0 nDupSACKBytes := seqnum.Size(0) isLost := false // We need to check if the immediate lower (if any) sacked // range contains or partially overlaps with r. searchMore := true s.ranges.DescendLessOrEqual(r, func(i btree.Item) bool { sacked := i.(header.SACKBlock) if sacked.Contains(r) { searchMore = false return false } if sacked.End.LessThanEq(r.Start) { // all sequence numbers covered by sacked are below // r so we continue searching. return false } // There is a partial overlap. In this case we r.Start is // between sacked.Start & sacked.End and r.End extends beyond // sacked.End. // Move r.Start to sacked.End and continuing searching blocks // above r.Start. r.Start = sacked.End return false }) if !searchMore { return isLost } s.ranges.AscendGreaterOrEqual(r, func(i btree.Item) bool { sacked := i.(header.SACKBlock) if sacked.Contains(r) { return false } nDupSACKBytes += sacked.Start.Size(sacked.End) nDupSACK++ if nDupSACK >= nDupAckThreshold || nDupSACKBytes >= seqnum.Size((nDupAckThreshold-1)*s.smss) { isLost = true return false } return true }) return isLost } // IsLost implements the IsLost(SeqNum) operation defined in RFC3517 section // 4. // // This routine returns whether the given sequence number is considered to be // lost. The routine returns true when either nDupAckThreshold discontiguous // SACKed sequences have arrived above 'SeqNum' or (nDupAckThreshold * SMSS) // bytes with sequence numbers greater than 'SeqNum' have been SACKed. // Otherwise, the routine returns false. func (s *SACKScoreboard) IsLost(seq seqnum.Value) bool { return s.IsRangeLost(header.SACKBlock{seq, seq.Add(1)}) } // Empty returns true if the SACK scoreboard has no entries, false otherwise. func (s *SACKScoreboard) Empty() bool { return s.ranges.Len() == 0 } // Sacked returns the current number of bytes held in the SACK scoreboard. func (s *SACKScoreboard) Sacked() seqnum.Size { return s.sacked } // MaxSACKED returns the highest sequence number ever inserted in the SACK // scoreboard. func (s *SACKScoreboard) MaxSACKED() seqnum.Value { return s.maxSACKED } // SMSS returns the sender's MSS as held by the SACK scoreboard. 
func (s *SACKScoreboard) SMSS() uint16 { return s.smss } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/segment.go000066400000000000000000000160471465435605700253660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "fmt" "io" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // queueFlags are used to indicate which queue of an endpoint a particular segment // belongs to. This is used to track memory accounting correctly. type queueFlags uint8 const ( // SegOverheadSize is the size of an empty seg in memory including packet // buffer overhead. It is advised to use SegOverheadSize instead of segSize // in all cases where accounting for segment memory overhead is important. SegOverheadSize = segSize + stack.PacketBufferStructSize + header.IPv4MaximumHeaderSize recvQ queueFlags = 1 << iota sendQ ) var segmentPool = sync.Pool{ New: func() any { return &segment{} }, } // segment represents a TCP segment. It holds the payload and parsed TCP segment // information, and can be added to intrusive lists. // segment is mostly immutable, the only field allowed to change is data. // // +stateify savable type segment struct { segmentEntry segmentRefs ep *Endpoint qFlags queueFlags id stack.TransportEndpointID `state:"manual"` pkt *stack.PacketBuffer sequenceNumber seqnum.Value ackNumber seqnum.Value flags header.TCPFlags window seqnum.Size // csum is only populated for received segments. csum uint16 // csumValid is true if the csum in the received segment is valid. csumValid bool // parsedOptions stores the parsed values from the options in the segment. parsedOptions header.TCPOptions options []byte `state:".([]byte)"` hasNewSACKInfo bool rcvdTime tcpip.MonotonicTime // xmitTime is the last transmit time of this segment. xmitTime tcpip.MonotonicTime xmitCount uint32 // acked indicates if the segment has already been SACKed. acked bool // dataMemSize is the memory used by pkt initially. The value is used for // memory accounting in the receive buffer instead of pkt.MemSize() because // packet contents can be modified, so relying on the computed memory size // to "free" reserved bytes could leak memory in the receiver. dataMemSize int // lost indicates if the segment is marked as lost by RACK. 
lost bool } func newIncomingSegment(id stack.TransportEndpointID, clock tcpip.Clock, pkt *stack.PacketBuffer) (*segment, error) { hdr := header.TCP(pkt.TransportHeader().Slice()) var srcAddr tcpip.Address var dstAddr tcpip.Address switch netProto := pkt.NetworkProtocolNumber; netProto { case header.IPv4ProtocolNumber: hdr := header.IPv4(pkt.NetworkHeader().Slice()) srcAddr = hdr.SourceAddress() dstAddr = hdr.DestinationAddress() case header.IPv6ProtocolNumber: hdr := header.IPv6(pkt.NetworkHeader().Slice()) srcAddr = hdr.SourceAddress() dstAddr = hdr.DestinationAddress() default: panic(fmt.Sprintf("unknown network protocol number %d", netProto)) } csum, csumValid, ok := header.TCPValid( hdr, func() uint16 { return pkt.Data().Checksum() }, uint16(pkt.Data().Size()), srcAddr, dstAddr, pkt.RXChecksumValidated) if !ok { return nil, fmt.Errorf("header data offset does not respect size constraints: %d < offset < %d, got offset=%d", header.TCPMinimumSize, len(hdr), hdr.DataOffset()) } s := newSegment() s.id = id s.options = hdr[header.TCPMinimumSize:] s.parsedOptions = header.ParseTCPOptions(hdr[header.TCPMinimumSize:]) s.sequenceNumber = seqnum.Value(hdr.SequenceNumber()) s.ackNumber = seqnum.Value(hdr.AckNumber()) s.flags = hdr.Flags() s.window = seqnum.Size(hdr.WindowSize()) s.rcvdTime = clock.NowMonotonic() s.dataMemSize = pkt.MemSize() s.pkt = pkt.IncRef() s.csumValid = csumValid if !s.pkt.RXChecksumValidated { s.csum = csum } return s, nil } func newOutgoingSegment(id stack.TransportEndpointID, clock tcpip.Clock, buf buffer.Buffer) *segment { s := newSegment() s.id = id s.rcvdTime = clock.NowMonotonic() s.pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{Payload: buf}) s.dataMemSize = s.pkt.MemSize() return s } func (s *segment) clone() *segment { t := newSegment() t.id = s.id t.sequenceNumber = s.sequenceNumber t.ackNumber = s.ackNumber t.flags = s.flags t.window = s.window t.rcvdTime = s.rcvdTime t.xmitTime = s.xmitTime t.xmitCount = s.xmitCount t.ep = s.ep t.qFlags = s.qFlags t.dataMemSize = s.dataMemSize t.pkt = s.pkt.Clone() return t } func newSegment() *segment { s := segmentPool.Get().(*segment) *s = segment{} s.InitRefs() return s } // merge merges data in oth and clears oth. func (s *segment) merge(oth *segment) { s.pkt.Data().Merge(oth.pkt.Data()) s.dataMemSize = s.pkt.MemSize() oth.dataMemSize = oth.pkt.MemSize() } // setOwner sets the owning endpoint for this segment. Its required // to be called to ensure memory accounting for receive/send buffer // queues is done properly. func (s *segment) setOwner(ep *Endpoint, qFlags queueFlags) { switch qFlags { case recvQ: ep.updateReceiveMemUsed(s.segMemSize()) case sendQ: // no memory account for sendQ yet. default: panic(fmt.Sprintf("unexpected queue flag %b", qFlags)) } s.ep = ep s.qFlags = qFlags } func (s *segment) DecRef() { s.segmentRefs.DecRef(func() { if s.ep != nil { switch s.qFlags { case recvQ: s.ep.updateReceiveMemUsed(-s.segMemSize()) case sendQ: // no memory accounting for sendQ yet. default: panic(fmt.Sprintf("unexpected queue flag %b set for segment", s.qFlags)) } } s.pkt.DecRef() s.pkt = nil segmentPool.Put(s) }) } // logicalLen is the segment length in the sequence number space. It's defined // as the data length plus one for each of the SYN and FIN bits set. func (s *segment) logicalLen() seqnum.Size { l := seqnum.Size(s.payloadSize()) if s.flags.Contains(header.TCPFlagSyn) { l++ } if s.flags.Contains(header.TCPFlagFin) { l++ } return l } // payloadSize is the size of s.data. 
func (s *segment) payloadSize() int { return s.pkt.Data().Size() } // segMemSize is the amount of memory used to hold the segment data and // the associated metadata. func (s *segment) segMemSize() int { return segSize + s.dataMemSize } // sackBlock returns a header.SACKBlock that represents this segment. func (s *segment) sackBlock() header.SACKBlock { return header.SACKBlock{Start: s.sequenceNumber, End: s.sequenceNumber.Add(s.logicalLen())} } func (s *segment) TrimFront(ackLeft seqnum.Size) { s.pkt.Data().TrimFront(int(ackLeft)) } func (s *segment) ReadTo(dst io.Writer, peek bool) (int, error) { return s.pkt.Data().ReadTo(dst, peek) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/segment_heap.go000066400000000000000000000025611465435605700263570ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import "container/heap" type segmentHeap []*segment var _ heap.Interface = (*segmentHeap)(nil) // Len returns the length of h. func (h *segmentHeap) Len() int { return len(*h) } // Less determines whether the i-th element of h is less than the j-th element. func (h *segmentHeap) Less(i, j int) bool { return (*h)[i].sequenceNumber.LessThan((*h)[j].sequenceNumber) } // Swap swaps the i-th and j-th elements of h. func (h *segmentHeap) Swap(i, j int) { (*h)[i], (*h)[j] = (*h)[j], (*h)[i] } // Push adds x as the last element of h. func (h *segmentHeap) Push(x any) { *h = append(*h, x.(*segment)) } // Pop removes the last element of h and returns it. func (h *segmentHeap) Pop() any { old := *h n := len(old) x := old[n-1] old[n-1] = nil *h = old[:n-1] return x } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/segment_queue.go000066400000000000000000000055301465435605700265650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "gvisor.dev/gvisor/pkg/sync" ) // segmentQueue is a bounded, thread-safe queue of TCP segments. // // +stateify savable type segmentQueue struct { mu sync.Mutex `state:"nosave"` list segmentList `state:"wait"` ep *Endpoint frozen bool } // emptyLocked determines if the queue is empty. // Preconditions: q.mu must be held. func (q *segmentQueue) emptyLocked() bool { return q.list.Empty() } // empty determines if the queue is empty. func (q *segmentQueue) empty() bool { q.mu.Lock() defer q.mu.Unlock() return q.emptyLocked() } // enqueue adds the given segment to the queue. 
// // Returns true when the segment is successfully added to the queue, in which // case ownership of the reference is transferred to the queue. And returns // false if the queue is full, in which case ownership is retained by the // caller. func (q *segmentQueue) enqueue(s *segment) bool { // q.ep.receiveBufferParams() must be called without holding q.mu to // avoid lock order inversion. bufSz := q.ep.ops.GetReceiveBufferSize() used := q.ep.receiveMemUsed() q.mu.Lock() defer q.mu.Unlock() // Allow zero sized segments (ACK/FIN/RSTs etc even if the segment queue // is currently full). allow := (used <= int(bufSz) || s.payloadSize() == 0) && !q.frozen if allow { s.IncRef() q.list.PushBack(s) // Set the owner now that the endpoint owns the segment. s.setOwner(q.ep, recvQ) } return allow } // dequeue removes and returns the next segment from queue, if one exists. // Ownership is transferred to the caller, who is responsible for decrementing // the ref count when done. func (q *segmentQueue) dequeue() *segment { q.mu.Lock() defer q.mu.Unlock() s := q.list.Front() if s != nil { q.list.Remove(s) } return s } // freeze prevents any more segments from being added to the queue. i.e all // future segmentQueue.enqueue will return false and not add the segment to the // queue till the queue is unfroze with a corresponding segmentQueue.thaw call. func (q *segmentQueue) freeze() { q.mu.Lock() defer q.mu.Unlock() q.frozen = true } // thaw unfreezes a previously frozen queue using segmentQueue.freeze() and // allows new segments to be queued again. func (q *segmentQueue) thaw() { q.mu.Lock() defer q.mu.Unlock() q.frozen = false } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/segment_state.go000066400000000000000000000023331465435605700265570ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "context" ) // saveOptions is invoked by stateify. func (s *segment) saveOptions() []byte { // We cannot save s.options directly as it may point to s.data's trimmed // tail, which is not allowed by state framework (in-struct pointer). b := make([]byte, 0, cap(s.options)) return append(b, s.options...) } // loadOptions is invoked by stateify. func (s *segment) loadOptions(_ context.Context, options []byte) { // NOTE: We cannot point s.options back into s.data's trimmed tail. But // it is OK as they do not need to aliased. Plus, options is already // allocated so there is no cost here. s.options = options } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/segment_unsafe.go000066400000000000000000000012531465435605700267200ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "unsafe" ) const ( segSize = int(unsafe.Sizeof(segment{})) ) golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/snd.go000066400000000000000000001671571465435605700245210ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "fmt" "math" "sort" "time" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // MinRTO is the minimum allowed value for the retransmit timeout. MinRTO = 200 * time.Millisecond // MaxRTO is the maximum allowed value for the retransmit timeout. MaxRTO = 120 * time.Second // MinSRTT is the minimum allowed value for smoothed RTT. MinSRTT = 1 * time.Millisecond // InitialCwnd is the initial congestion window. InitialCwnd = 10 // nDupAckThreshold is the number of duplicate ACK's required // before fast-retransmit is entered. nDupAckThreshold = 3 // MaxRetries is the maximum number of probe retries sender does // before timing out the connection. // Linux default TCP_RETR2, net.ipv4.tcp_retries2. MaxRetries = 15 // InitialSsthresh is the the maximum int value, which depends on the // platform. InitialSsthresh = math.MaxInt // unknownRTT is used to indicate to congestion control algorithms that we // were unable to measure the round-trip time when processing ACKs. // Algorithms (such as HyStart) that use the round-trip time should ignore // such Updates. unknownRTT = time.Duration(-1) ) // congestionControl is an interface that must be implemented by any supported // congestion control algorithm. type congestionControl interface { // HandleLossDetected is invoked when the loss is detected by RACK or // sender.dupAckCount >= nDupAckThreshold just before entering fast // retransmit. HandleLossDetected() // HandleRTOExpired is invoked when the retransmit timer expires. HandleRTOExpired() // Update is invoked when processing inbound acks. It's passed the // number of packet's that were acked by the most recent cumulative // acknowledgement. rtt is the round-trip time, or is set to unknownRTT // (above) to indicate the time is unknown. Update(packetsAcked int, rtt time.Duration) // PostRecovery is invoked when the sender is exiting a fast retransmit/ // recovery phase. This provides congestion control algorithms a way // to adjust their state when exiting recovery. PostRecovery() } // lossRecovery is an interface that must be implemented by any supported // loss recovery algorithm. 
type lossRecovery interface { // DoRecovery is invoked when loss is detected and segments need // to be retransmitted. The cumulative or selective ACK is passed along // with the flag which identifies whether the connection entered fast // retransmit with this ACK and to retransmit the first unacknowledged // segment. DoRecovery(rcvdSeg *segment, fastRetransmit bool) } // sender holds the state necessary to send TCP segments. // // +stateify savable type sender struct { stack.TCPSenderState ep *Endpoint // lr is the loss recovery algorithm used by the sender. lr lossRecovery // firstRetransmittedSegXmitTime is the original transmit time of // the first segment that was retransmitted due to RTO expiration. firstRetransmittedSegXmitTime tcpip.MonotonicTime // zeroWindowProbing is set if the sender is currently probing // for zero receive window. zeroWindowProbing bool `state:"nosave"` // unackZeroWindowProbes is the number of unacknowledged zero // window probes. unackZeroWindowProbes uint32 `state:"nosave"` // writeNext is the next segment to write that hasn't already been // written, i.e. the first payload starting at SND.NXT. writeNext *segment // writeList holds all writable data: both unsent data and // sent-but-unacknowledged data. Alternatively: it holds all bytes // starting from SND.UNA. writeList segmentList // resendTimer is used for RTOs. resendTimer timer `state:"nosave"` // rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed // round-trip time", and "round-trip time variation", as defined in // section 2 of RFC 6298. rtt rtt // minRTO is the minimum permitted value for sender.rto. minRTO time.Duration // maxRTO is the maximum permitted value for sender.rto. maxRTO time.Duration // maxRetries is the maximum permitted retransmissions. maxRetries uint32 // gso is set if generic segmentation offload is enabled. gso bool // state is the current state of congestion control for this endpoint. state tcpip.CongestionControlState // cc is the congestion control algorithm in use for this sender. cc congestionControl // rc has the fields needed for implementing RACK loss detection // algorithm. rc rackControl // reorderTimer is the timer used to retransmit the segments after RACK // detects them as lost. reorderTimer timer `state:"nosave"` // probeTimer is used to schedule PTO for RACK TLP algorithm. probeTimer timer `state:"nosave"` // spuriousRecovery indicates whether the sender entered recovery // spuriously as described in RFC3522 Section 3.2. spuriousRecovery bool // retransmitTS is the timestamp at which the sender sends retransmitted // segment after entering an RTO for the first time as described in // RFC3522 Section 3.2. retransmitTS uint32 // startCork start corking the segments. startCork bool // corkTimer is used to drain the segments which are held when TCP_CORK // option is enabled. corkTimer timer `state:"nosave"` } // rtt is a synchronization wrapper used to appease stateify. See the comment // in sender, where it is used. // // +stateify savable type rtt struct { sync.Mutex `state:"nosave"` stack.TCPRTTState } // +checklocks:ep.mu func newSender(ep *Endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender { // The sender MUST reduce the TCP data length to account for any IP or // TCP options that it is including in the packets that it sends. 
// See: https://tools.ietf.org/html/rfc6691#section-2 maxPayloadSize := int(mss) - ep.maxOptionSize() s := &sender{ ep: ep, TCPSenderState: stack.TCPSenderState{ SndWnd: sndWnd, SndUna: iss + 1, SndNxt: iss + 1, RTTMeasureSeqNum: iss + 1, LastSendTime: ep.stack.Clock().NowMonotonic(), MaxPayloadSize: maxPayloadSize, MaxSentAck: irs + 1, FastRecovery: stack.TCPFastRecoveryState{ // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. Last: iss, HighRxt: iss, RescueRxt: iss, }, RTO: 1 * time.Second, }, gso: ep.gso.Type != stack.GSONone, } if s.gso { s.ep.gso.MSS = uint16(maxPayloadSize) } s.cc = s.initCongestionControl(ep.cc) s.lr = s.initLossRecovery() s.rc.init(s, iss) // A negative sndWndScale means that no scaling is in use, otherwise we // store the scaling value. if sndWndScale > 0 { s.SndWndScale = uint8(sndWndScale) } s.resendTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.retransmitTimerExpired)) s.reorderTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.rc.reorderTimerExpired)) s.probeTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.probeTimerExpired)) s.corkTimer.init(s.ep.stack.Clock(), timerHandler(s.ep, s.corkTimerExpired)) s.ep.AssertLockHeld(ep) s.updateMaxPayloadSize(int(ep.route.MTU()), 0) // Initialize SACK Scoreboard after updating max payload size as we use // the maxPayloadSize as the smss when determining if a segment is lost // etc. s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss) // Get Stack wide config. var minRTO tcpip.TCPMinRTOOption if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil { panic(fmt.Sprintf("unable to get minRTO from stack: %s", err)) } s.minRTO = time.Duration(minRTO) var maxRTO tcpip.TCPMaxRTOOption if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil { panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err)) } s.maxRTO = time.Duration(maxRTO) var maxRetries tcpip.TCPMaxRetriesOption if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil { panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err)) } s.maxRetries = uint32(maxRetries) return s } // initCongestionControl initializes the specified congestion control module and // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to // their initial values. func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl { s.SndCwnd = InitialCwnd s.Ssthresh = InitialSsthresh switch congestionControlName { case ccCubic: return newCubicCC(s) case ccReno: fallthrough default: return newRenoCC(s) } } // initLossRecovery initiates the loss recovery algorithm for the sender. func (s *sender) initLossRecovery() lossRecovery { if s.ep.SACKPermitted { return newSACKRecovery(s) } return newRenoRecovery(s) } // updateMaxPayloadSize updates the maximum payload size based on the given // MTU. If this is in response to "packet too big" control packets (indicated // by the count argument), it also reduces the number of outstanding packets and // attempts to retransmit the first packet above the MTU size. // +checklocks:s.ep.mu func (s *sender) updateMaxPayloadSize(mtu, count int) { m := mtu - header.TCPMinimumSize m -= s.ep.maxOptionSize() // We don't adjust up for now. if m >= s.MaxPayloadSize { return } // Make sure we can transmit at least one byte. 
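// Illustrative sketch (not from the original source): the usable payload per
// segment is whatever the network layer leaves us minus the fixed TCP header
// and any TCP options in use. Assuming an Ethernet path with IPv4 and the
// timestamp option enabled (all numbers hypothetical for this example):
//
//	tcpSpace := 1500 - 20 // 1480: IP payload available to TCP after the IPv4 header
//	m := tcpSpace - 20    // 1460: subtract the minimum TCP header
//	m -= 12               // 1448: subtract the padded timestamp option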
if m <= 0 { m = 1 } oldMSS := s.MaxPayloadSize s.MaxPayloadSize = m if s.gso { s.ep.gso.MSS = uint16(m) } if count == 0 { // updateMaxPayloadSize is also called when the sender is created. // and there is no data to send in such cases. Return immediately. return } // Update the scoreboard's smss to reflect the new lowered // maxPayloadSize. s.ep.scoreboard.smss = uint16(m) s.Outstanding -= count if s.Outstanding < 0 { s.Outstanding = 0 } // Rewind writeNext to the first segment exceeding the MTU. Do nothing // if it is already before such a packet. nextSeg := s.writeNext for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { if seg == s.writeNext { // We got to writeNext before we could find a segment // exceeding the MTU. break } if nextSeg == s.writeNext && seg.payloadSize() > m { // We found a segment exceeding the MTU. Rewind // writeNext and try to retransmit it. nextSeg = seg } if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { // Update sackedOut for new maximum payload size. s.SackedOut -= s.pCount(seg, oldMSS) s.SackedOut += s.pCount(seg, s.MaxPayloadSize) } } // Since we likely reduced the number of outstanding packets, we may be // ready to send some more. s.updateWriteNext(nextSeg) s.sendData() } // sendAck sends an ACK segment. // +checklocks:s.ep.mu func (s *sender) sendAck() { s.sendEmptySegment(header.TCPFlagAck, s.SndNxt) } // updateRTO updates the retransmit timeout when a new roud-trip time is // available. This is done in accordance with section 2 of RFC 6298. func (s *sender) updateRTO(rtt time.Duration) { s.rtt.Lock() if !s.rtt.TCPRTTState.SRTTInited { s.rtt.TCPRTTState.RTTVar = rtt / 2 s.rtt.TCPRTTState.SRTT = rtt s.rtt.TCPRTTState.SRTTInited = true } else { diff := s.rtt.TCPRTTState.SRTT - rtt if diff < 0 { diff = -diff } // Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when // no timestamps are available. if !s.ep.SendTSOk { s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4 s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8 } else { // When we are taking RTT measurements of every ACK then // we need to use a modified method as specified in // https://tools.ietf.org/html/rfc7323#appendix-G if s.Outstanding == 0 { s.rtt.Unlock() return } // Netstack measures congestion window/inflight all in // terms of packets and not bytes. This is similar to // how linux also does cwnd and inflight. In practice // this approximation works as expected. expectedSamples := math.Ceil(float64(s.Outstanding) / 2) // alpha & beta values are the original values as recommended in // https://tools.ietf.org/html/rfc6298#section-2.3. const alpha = 0.125 const beta = 0.25 alphaPrime := alpha / expectedSamples betaPrime := beta / expectedSamples rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds() srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds() s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second)) s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second)) } } if s.rtt.TCPRTTState.SRTT < MinSRTT { s.rtt.TCPRTTState.SRTT = MinSRTT } s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar s.rtt.Unlock() if s.RTO < s.minRTO { s.RTO = s.minRTO } if s.RTO > s.maxRTO { s.RTO = s.maxRTO } } // resendSegment resends the first unacknowledged segment. // +checklocks:s.ep.mu func (s *sender) resendSegment() { // Don't use any segments we already sent to measure RTT as they may // have been affected by packets being lost. 
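// Illustrative sketch (not part of the original source): updateRTO above
// follows RFC 6298 section 2. For the non-timestamp path, each new sample
// rtt updates the estimators as below, after which RTO is clamped to
// [minRTO, maxRTO]; alpha and beta are the standard 1/8 and 1/4:
//
//	// First sample: srtt = rtt, rttvar = rtt/2. Afterwards:
//	rttvar = (1-beta)*rttvar + beta*abs(srtt-rtt)
//	srtt = (1-alpha)*srtt + alpha*rtt
//	rto = srtt + 4*rttvar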
s.RTTMeasureSeqNum = s.SndNxt // Resend the segment. if seg := s.writeList.Front(); seg != nil { if seg.payloadSize() > s.MaxPayloadSize { s.splitSeg(seg, s.MaxPayloadSize) } // See: RFC 6675 section 5 Step 4.3 // // To prevent retransmission, set both the HighRXT and RescueRXT // to the highest sequence number in the retransmitted segment. s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1 s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) - 1 s.sendSegment(seg) s.ep.stack.Stats().TCP.FastRetransmit.Increment() s.ep.stats.SendErrors.FastRetransmit.Increment() // Run SetPipe() as per RFC 6675 section 5 Step 4.4 s.SetPipe() } } // retransmitTimerExpired is called when the retransmit timer expires, and // unacknowledged segments are assumed lost, and thus need to be resent. // Returns true if the connection is still usable, or false if the connection // is deemed lost. // +checklocks:s.ep.mu func (s *sender) retransmitTimerExpired() tcpip.Error { // Check if the timer actually expired or if it's a spurious wake due // to a previously orphaned runtime timer. if s.resendTimer.isUninitialized() || !s.resendTimer.checkExpiration() { return nil } // Initialize the variables used to detect spurious recovery after // entering RTO. // // See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1. s.spuriousRecovery = false s.retransmitTS = 0 // TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases // when writeList is empty. Remove this once we have a proper fix for this // issue. if s.writeList.Front() == nil { return nil } s.ep.stack.Stats().TCP.Timeouts.Increment() s.ep.stats.SendErrors.Timeouts.Increment() // Set TLPRxtOut to false according to // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1. s.rc.tlpRxtOut = false // Give up if we've waited more than a minute since the last resend or // if a user time out is set and we have exceeded the user specified // timeout since the first retransmission. uto := s.ep.userTimeout if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) { // We store the original xmitTime of the segment that we are // about to retransmit as the retransmission time. This is // required as by the time the retransmitTimer has expired the // segment has already been sent and unacked for the RTO at the // time the segment was sent. s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime } elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime) remaining := s.maxRTO if uto != 0 { // Cap to the user specified timeout if one is specified. remaining = uto - elapsed } // Always honor the user-timeout irrespective of whether the zero // window probes were acknowledged. // net/ipv4/tcp_timer.c::tcp_probe_timer() if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries { s.ep.stack.Stats().TCP.EstablishedTimedout.Increment() return &tcpip.ErrTimeout{} } // Set new timeout. The timer will be restarted by the call to sendData // below. s.RTO *= 2 // Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5 if s.RTO > s.maxRTO { s.RTO = s.maxRTO } // Cap RTO to remaining time. if s.RTO > remaining { s.RTO = remaining } // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4. // // Retransmit timeouts: // After a retransmit timeout, record the highest sequence number // transmitted in the variable recover, and exit the fast recovery // procedure if applicable. 
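// Illustrative sketch (not from the original source): the timeout handling
// above is exponential backoff bounded both by the stack-wide maximum RTO
// and by whatever remains of any user timeout. With hypothetical durations:
//
//	rto *= 2 // back off after every expiration
//	if rto > maxRTO {
//		rto = maxRTO // never exceed the configured ceiling (see MaxRTO above)
//	}
//	if rto > remaining {
//		rto = remaining // never outlive the user-specified timeout
//	}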
s.FastRecovery.Last = s.SndNxt - 1 if s.FastRecovery.Active { // We were attempting fast recovery but were not successful. // Leave the state. We don't need to update ssthresh because it // has already been updated when entered fast-recovery. s.leaveRecovery() } // Record retransmitTS if the sender is not in recovery as per: // https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2 s.recordRetransmitTS() s.state = tcpip.RTORecovery s.cc.HandleRTOExpired() // Mark the next segment to be sent as the first unacknowledged one and // start sending again. Set the number of outstanding packets to 0 so // that we'll be able to retransmit. // // We'll keep on transmitting (or retransmitting) as we get acks for // the data we transmit. s.Outstanding = 0 // Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1 // // In order to avoid memory deadlocks, the TCP receiver is allowed to // discard data that has already been selectively acknowledged. As a // result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK // information gathered from a receiver upon a retransmission timeout // (RTO) "since the timeout might indicate that the data receiver has // reneged." Additionally, a TCP sender MUST "ignore prior SACK // information in determining which data to retransmit." // // NOTE: We take the stricter interpretation and just expunge all // information as we lack more rigorous checks to validate if the SACK // information is usable after an RTO. s.ep.scoreboard.Reset() s.updateWriteNext(s.writeList.Front()) // RFC 1122 4.2.2.17: Start sending zero window probes when we still see a // zero receive window after retransmission interval and we have data to // send. if s.zeroWindowProbing { s.sendZeroWindowProbe() // RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed // indefinitely. As long as the receiving TCP continues to send // acknowledgments in response to the probe segments, the sending TCP // MUST allow the connection to stay open. return nil } seg := s.writeNext // RFC 1122 4.2.3.5: Close the connection when the number of // retransmissions for this segment is beyond a limit. if seg != nil && seg.xmitCount > s.maxRetries { s.ep.stack.Stats().TCP.EstablishedTimedout.Increment() return &tcpip.ErrTimeout{} } s.sendData() return nil } // pCount returns the number of packets in the segment. Due to GSO, a segment // can be composed of multiple packets. func (s *sender) pCount(seg *segment, maxPayloadSize int) int { size := seg.payloadSize() if size == 0 { return 1 } return (size-1)/maxPayloadSize + 1 } // splitSeg splits a given segment at the size specified and inserts the // remainder as a new segment after the current one in the write list. func (s *sender) splitSeg(seg *segment, size int) { if seg.payloadSize() <= size { return } // Split this segment up. nSeg := seg.clone() nSeg.pkt.Data().TrimFront(size) nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) s.writeList.InsertAfter(seg, nSeg) // The segment being split does not carry PUSH flag because it is // followed by the newly split segment. // RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered // segment (i.e., when there is no more queued data to be sent). // Linux removes PSH flag only when the segment is being split over MSS // and retains it when we are splitting the segment over lack of sender // window space. 
// ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point() // ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test() if seg.payloadSize() > s.MaxPayloadSize { seg.flags ^= header.TCPFlagPsh } seg.pkt.Data().CapLength(size) } // NextSeg implements the RFC6675 NextSeg() operation. // // NextSeg starts scanning the writeList starting from nextSegHint and returns // the hint to be passed on the next call to NextSeg. This is required to avoid // iterating the write list repeatedly when NextSeg is invoked in a loop during // recovery. The returned hint will be nil if there are no more segments that // can match rules defined by NextSeg operation in RFC6675. // // rescueRtx will be true only if nextSeg is a rescue retransmission as // described by Step 4) of the NextSeg algorithm. func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) { var s3 *segment var s4 *segment // Step 1. for seg := nextSegHint; seg != nil; seg = seg.Next() { // Stop iteration if we hit a segment that has never been // transmitted (i.e. either it has no assigned sequence number // or if it does have one, it's >= the next sequence number // to be sent [i.e. >= s.sndNxt]). if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) { hint = nil break } segSeq := seg.sequenceNumber if smss := s.ep.scoreboard.SMSS(); seg.payloadSize() > int(smss) { s.splitSeg(seg, int(smss)) } // See RFC 6675 Section 4 // // 1. If there exists a smallest unSACKED sequence number // 'S2' that meets the following 3 criteria for determinig // loss, the sequence range of one segment of up to SMSS // octets starting with S2 MUST be returned. if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) { // NextSeg(): // // (1.a) S2 is greater than HighRxt // (1.b) S2 is less than highest octet covered by // any received SACK. if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) { // NextSeg(): // (1.c) IsLost(S2) returns true. if s.ep.scoreboard.IsLost(segSeq) { return seg, seg.Next(), false } // NextSeg(): // // (3): If the conditions for rules (1) and (2) // fail, but there exists an unSACKed sequence // number S3 that meets the criteria for // detecting loss given in steps 1.a and 1.b // above (specifically excluding (1.c)) then one // segment of upto SMSS octets starting with S3 // SHOULD be returned. if s3 == nil { s3 = seg hint = seg.Next() } } // NextSeg(): // // (4) If the conditions for (1), (2) and (3) fail, // but there exists outstanding unSACKED data, we // provide the opportunity for a single "rescue" // retransmission per entry into loss recovery. If // HighACK is greater than RescueRxt (or RescueRxt // is undefined), then one segment of upto SMSS // octets that MUST include the highest outstanding // unSACKed sequence number SHOULD be returned, and // RescueRxt set to RecoveryPoint. HighRxt MUST NOT // be updated. if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) { if s4 != nil { if s4.sequenceNumber.LessThan(segSeq) { s4 = seg } } else { s4 = seg } } } } // If we got here then no segment matched step (1). // Step (2): "If no sequence number 'S2' per rule (1) // exists but there exists available unsent data and the // receiver's advertised window allows, the sequence // range of one segment of up to SMSS octets of // previously unsent data starting with sequence number // HighData+1 MUST be returned." 
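// Illustrative sketch (not part of the original source): the four NextSeg()
// rules implemented above and below boil down to a priority order over the
// write list. With hypothetical helper names:
//
//	if seg := firstUnsackedLost(); seg != nil { // rule (1): unSACKed, above HighRxt, below the highest SACK, and IsLost()
//		return seg
//	}
//	if seg := firstUnsentInWindow(); seg != nil { // rule (2): previously unsent data the peer's window allows
//		return seg
//	}
//	if seg := firstUnsackedNotLost(); seg != nil { // rule (3): as rule (1) but without the IsLost() requirement
//		return seg
//	}
//	return rescueCandidate() // rule (4): at most one rescue retransmission per recovery episode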
for seg := s.writeNext; seg != nil; seg = seg.Next() { if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) { continue } // We do not split the segment here to <= smss as it has // potentially not been assigned a sequence number yet. return seg, nil, false } if s3 != nil { return s3, hint, false } return s4, nil, true } // maybeSendSegment tries to send the specified segment and either coalesces // other segments into this one or splits the specified segment based on the // lower of the specified limit value or the receivers window size specified by // end. // +checklocks:s.ep.mu func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) { // We abuse the flags field to determine if we have already // assigned a sequence number to this segment. if !s.isAssignedSequenceNumber(seg) { // Merge segments if allowed. if seg.payloadSize() != 0 { available := int(s.SndNxt.Size(end)) if available > limit { available = limit } // nextTooBig indicates that the next segment was too // large to entirely fit in the current segment. It // would be possible to split the next segment and merge // the portion that fits, but unexpectedly splitting // segments can have user visible side-effects which can // break applications. For example, RFC 7766 section 8 // says that the length and data of a DNS response // should be sent in the same TCP segment to avoid // triggering bugs in poorly written DNS // implementations. var nextTooBig bool for nSeg := seg.Next(); nSeg != nil && nSeg.payloadSize() != 0; nSeg = seg.Next() { if seg.payloadSize()+nSeg.payloadSize() > available { nextTooBig = true break } seg.merge(nSeg) s.writeList.Remove(nSeg) nSeg.DecRef() } if !nextTooBig && seg.payloadSize() < available { // Segment is not full. if s.Outstanding > 0 && s.ep.ops.GetDelayOption() { // Nagle's algorithm. From Wikipedia: // Nagle's algorithm works by // combining a number of small // outgoing messages and sending them // all at once. Specifically, as long // as there is a sent packet for which // the sender has received no // acknowledgment, the sender should // keep buffering its output until it // has a full packet's worth of // output, thus allowing output to be // sent all at once. return false } // With TCP_CORK, hold back until minimum of the available // send space and MSS. if s.ep.ops.GetCorkOption() { if seg.payloadSize() < s.MaxPayloadSize { if !s.startCork { s.startCork = true // Enable the timer for // 200ms, after which // the segments are drained. s.corkTimer.enable(MinRTO) } return false } // Disable the TCP_CORK timer. s.startCork = false s.corkTimer.disable() } } } // Assign flags. We don't do it above so that we can merge // additional data if Nagle holds the segment. seg.sequenceNumber = s.SndNxt seg.flags = header.TCPFlagAck | header.TCPFlagPsh } var segEnd seqnum.Value if seg.payloadSize() == 0 { if s.writeList.Back() != seg { panic("FIN segments must be the final segment in the write list.") } seg.flags = header.TCPFlagAck | header.TCPFlagFin segEnd = seg.sequenceNumber.Add(1) // Update the state to reflect that we have now // queued a FIN. s.ep.updateConnDirectionState(connDirectionStateSndClosed) switch s.ep.EndpointState() { case StateCloseWait: s.ep.setEndpointState(StateLastAck) default: s.ep.setEndpointState(StateFinWait1) } } else { // We're sending a non-FIN segment. 
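// To make the window checks below concrete (numbers are hypothetical): with a
// MaxPayloadSize of 1460 and a 2000-byte segment, and with unacknowledged data
// already in flight, the segment is sent whole if at least 2000 bytes of the
// receiver's window remain, split and partially sent if at least one full MSS
// (1460 bytes) fits, and otherwise held back; deferred transmits are picked up
// later by the retransmit timer handler.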
if seg.flags&header.TCPFlagFin != 0 { panic("Netstack queues FIN segments without data.") } if !seg.sequenceNumber.LessThan(end) { return false } available := int(seg.sequenceNumber.Size(end)) if available == 0 { return false } // If the whole segment or at least 1 MSS sized segment cannot // be accommodated in the receiver advertised window, skip // splitting and sending of the segment. ref: // net/ipv4/tcp_output.c::tcp_snd_wnd_test() // // Linux checks this for all segment transmits not triggered by // a probe timer. On this condition, it defers the segment split // and transmit to a short probe timer. // // ref: include/net/tcp.h::tcp_check_probe_timer() // ref: net/ipv4/tcp_output.c::tcp_write_wakeup() // // Instead of defining a new transmit timer, we attempt to split // the segment right here if there are no pending segments. If // there are pending segments, segment transmits are deferred to // the retransmit timer handler. if s.SndUna != s.SndNxt { switch { case available >= seg.payloadSize(): // OK to send, the whole segment fits in the // receiver's advertised window. case available >= s.MaxPayloadSize: // OK to send, at least 1 MSS sized segment fits // in the receiver's advertised window. default: return false } } // The segment size limit is computed as a function of sender // congestion window and MSS. When sender congestion window is > // 1, this limit can be larger than MSS. Ensure that the // currently available send space is not greater than minimum of // this limit and MSS. if available > limit { available = limit } // If GSO is not in use then cap available to // maxPayloadSize. When GSO is in use the gVisor GSO logic or // the host GSO logic will cap the segment to the correct size. if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize { available = s.MaxPayloadSize } if seg.payloadSize() > available { // A negative value causes splitSeg to panic anyway, so just panic // earlier to get more information about the cause. s.splitSeg(seg, available) } segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.payloadSize())) } s.sendSegment(seg) // Update sndNxt if we actually sent new data (as opposed to // retransmitting some previously sent data). if s.SndNxt.LessThan(segEnd) { s.SndNxt = segEnd } return true } // zeroProbeJunk is data sent during zero window probes. Its value is // irrelevant; since the sequence number has already been acknowledged it will // be discarded. It's only here to avoid allocating. var zeroProbeJunk = []byte{0} // +checklocks:s.ep.mu func (s *sender) sendZeroWindowProbe() { s.unackZeroWindowProbes++ // Send a zero window probe with sequence number pointing to the last // acknowledged byte. Note that, like Linux, this isn't quite what RFC // 9293 3.8.6.1 describes: we don't send the next byte in the stream, // we re-send an ACKed byte to goad the receiver into responding. pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: buffer.MakeWithData(zeroProbeJunk), }) defer pkt.DecRef() s.sendSegmentFromPacketBuffer(pkt, header.TCPFlagAck, s.SndUna-1) // Rearm the timer to continue probing. s.resendTimer.enable(s.RTO) } func (s *sender) enableZeroWindowProbing() { s.zeroWindowProbing = true // We piggyback the probing on the retransmit timer with the // current retransmission interval, as we may start probing while // segment retransmissions are still in progress.
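// An illustrative timeline: if the peer advertises a zero window while data is
// still queued, the retransmit timer fires after the current RTO,
// retransmitTimerExpired calls sendZeroWindowProbe, which sends a single
// already-acknowledged byte at SndUna-1 and re-arms the timer, so probes
// repeat until an incoming ACK advertises a non-zero window and
// disableZeroWindowProbing stops the cycle.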
if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) { s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic() } s.resendTimer.enable(s.RTO) } func (s *sender) disableZeroWindowProbing() { s.zeroWindowProbing = false s.unackZeroWindowProbes = 0 s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{} s.resendTimer.disable() } func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) { if dataSent { // We sent data, so we should stop the keepalive timer to ensure // that no keepalives are sent while there is pending data. s.ep.disableKeepaliveTimer() } // If the sender has advertised zero receive window and we have // data to be sent out, start zero window probing to query the // remote for its receive window size. if s.writeNext != nil && s.SndWnd == 0 { s.enableZeroWindowProbing() } // If we have no more pending data, start the keepalive timer. if s.SndUna == s.SndNxt { s.ep.resetKeepaliveTimer(false) } else { // Enable timers if we have pending data. if shouldScheduleProbe && s.shouldSchedulePTO() { // Schedule PTO after transmitting new data that wasn't itself a TLP probe. s.schedulePTO() } else if !s.resendTimer.enabled() { s.probeTimer.disable() if s.Outstanding > 0 { // Enable the resend timer if it's not enabled yet and there is // outstanding data. s.resendTimer.enable(s.RTO) } } } } // sendData sends new data segments. It is called when data becomes available or // when the send window opens up. // +checklocks:s.ep.mu func (s *sender) sendData() { limit := s.MaxPayloadSize if s.gso { limit = int(s.ep.gso.MaxSize - header.TCPTotalHeaderMaximumSize - 1) } end := s.SndUna.Add(s.SndWnd) // Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10. // "A TCP SHOULD set cwnd to no more than RW before beginning // transmission if the TCP has not sent data in the interval exceeding // the retransmission timeout." if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO { if s.SndCwnd > InitialCwnd { s.SndCwnd = InitialCwnd } } var dataSent bool for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() { cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize if cwndLimit < limit { limit = cwndLimit } if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { // Move writeNext along so that we don't try and scan data that // has already been SACKED. s.updateWriteNext(seg.Next()) continue } if sent := s.maybeSendSegment(seg, limit, end); !sent { break } dataSent = true s.Outstanding += s.pCount(seg, s.MaxPayloadSize) s.updateWriteNext(seg.Next()) } s.postXmit(dataSent, true /* shouldScheduleProbe */) } func (s *sender) enterRecovery() { // Initialize the variables used to detect spurious recovery after // entering recovery. // // See: https://www.rfc-editor.org/rfc/rfc3522.html#section-3.2 Step 1. s.spuriousRecovery = false s.retransmitTS = 0 s.FastRecovery.Active = true // Save state to reflect we're now in fast recovery. // // See: https://tools.ietf.org/html/rfc5681#section-3.2 Step 3. // We inflate the cwnd by 3 to account for the 3 packets which triggered // the 3 duplicate ACKs and are now not in flight.
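// A worked example (values are illustrative): if the congestion window was 10
// segments when the third duplicate ACK arrived and the congestion control
// algorithm set ssthresh to 5, recovery starts with SndCwnd = 5 + 3 = 8, and
// FastRecovery.MaxCwnd caps any further inflation at SndCwnd + Outstanding.
// When recovery is left, leaveRecovery deflates SndCwnd back to Ssthresh.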
s.SndCwnd = s.Ssthresh + 3 s.SackedOut = 0 s.DupAckCount = 0 s.FastRecovery.First = s.SndUna s.FastRecovery.Last = s.SndNxt - 1 s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding s.FastRecovery.HighRxt = s.SndUna s.FastRecovery.RescueRxt = s.SndUna // Record retransmitTS if the sender is not in recovery as per: // https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2 s.recordRetransmitTS() if s.ep.SACKPermitted { s.state = tcpip.SACKRecovery s.ep.stack.Stats().TCP.SACKRecovery.Increment() // Set TLPRxtOut to false according to // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1. if s.rc.tlpRxtOut { // The tail loss probe triggered recovery. s.ep.stack.Stats().TCP.TLPRecovery.Increment() } s.rc.tlpRxtOut = false return } s.state = tcpip.FastRecovery s.ep.stack.Stats().TCP.FastRecovery.Increment() } func (s *sender) leaveRecovery() { s.FastRecovery.Active = false s.FastRecovery.MaxCwnd = 0 s.DupAckCount = 0 // Deflate cwnd. It had been artificially inflated when new dups arrived. s.SndCwnd = s.Ssthresh s.cc.PostRecovery() } // isAssignedSequenceNumber relies on the fact that we only set flags once a // sequencenumber is assigned and that is only done right before we send the // segment. As a result any segment that has a non-zero flag has a valid // sequence number assigned to it. func (s *sender) isAssignedSequenceNumber(seg *segment) bool { return seg.flags != 0 } // SetPipe implements the SetPipe() function described in RFC6675. Netstack // maintains the congestion window in number of packets and not bytes, so // SetPipe() here measures number of outstanding packets rather than actual // outstanding bytes in the network. func (s *sender) SetPipe() { // If SACK isn't permitted or it is permitted but recovery is not active // then ignore pipe calculations. if !s.ep.SACKPermitted || !s.FastRecovery.Active { return } pipe := 0 smss := seqnum.Size(s.ep.scoreboard.SMSS()) for s1 := s.writeList.Front(); s1 != nil && s1.payloadSize() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() { // With GSO each segment can be much larger than SMSS. So check the segment // in SMSS sized ranges. segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.payloadSize())) for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) { endSeq := startSeq.Add(smss) if segEnd.LessThan(endSeq) { endSeq = segEnd } sb := header.SACKBlock{Start: startSeq, End: endSeq} // SetPipe(): // // After initializing pipe to zero, the following steps are // taken for each octet 'S1' in the sequence space between // HighACK and HighData that has not been SACKed: if !s1.sequenceNumber.LessThan(s.SndNxt) { break } if s.ep.scoreboard.IsSACKED(sb) { continue } // SetPipe(): // // (a) If IsLost(S1) returns false, Pipe is incremened by 1. // // NOTE: here we mark the whole segment as lost. We do not try // and test every byte in our write buffer as we maintain our // pipe in terms of outstanding packets and not bytes. if !s.ep.scoreboard.IsRangeLost(sb) { pipe++ } // SetPipe(): // (b) If S1 <= HighRxt, Pipe is incremented by 1. if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) { pipe++ } } } s.Outstanding = pipe } // shouldEnterRecovery returns true if the sender should enter fast recovery // based on dupAck count and sack scoreboard. // See RFC 6675 section 5. 
func (s *sender) shouldEnterRecovery() bool { return s.DupAckCount >= nDupAckThreshold || (s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna)) } // detectLoss is called when an ack is received and returns whether a loss is // detected. It manages the state related to duplicate acks and determines if // a retransmit is needed according to the rules in RFC 6582 (NewReno). func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) { // We're not in fast recovery yet. // If RACK is enabled and there is no reordering we should honor the // three duplicate ACK rule to enter recovery. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4 if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { if s.rc.Reord { return false } } if !s.isDupAck(seg) { s.DupAckCount = 0 return false } s.DupAckCount++ // Do not enter fast recovery until we reach nDupAckThreshold or the // first unacknowledged byte is considered lost as per SACK scoreboard. if !s.shouldEnterRecovery() { // RFC 6675 Step 3. s.FastRecovery.HighRxt = s.SndUna - 1 // Do run SetPipe() to calculate the outstanding segments. s.SetPipe() s.state = tcpip.Disorder return false } // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2 // // We only do the check here, the incrementing of last to the highest // sequence number transmitted till now is done when enterRecovery // is invoked. // // Note that we only enter recovery when at least one more byte of data // beyond s.fr.last (the highest byte that was outstanding when fast // retransmit was last entered) is acked. if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) { s.DupAckCount = 0 return false } s.cc.HandleLossDetected() s.enterRecovery() return true } // isDupAck determines if seg is a duplicate ack as defined in // https://tools.ietf.org/html/rfc5681#section-2. func (s *sender) isDupAck(seg *segment) bool { // A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883] // can leverage the SACK information to determine when an incoming ACK is a // "duplicate" (e.g., if the ACK contains previously unknown SACK // information). if s.ep.SACKPermitted && !seg.hasNewSACKInfo { return false } // (a) The receiver of the ACK has outstanding data. return s.SndUna != s.SndNxt && // (b) The incoming acknowledgment carries no data. seg.logicalLen() == 0 && // (c) The SYN and FIN bits are both off. !seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) && // (d) the ACK number is equal to the greatest acknowledgment received on // the given connection (TCP.UNA from RFC793). seg.ackNumber == s.SndUna && // (e) the advertised window in the incoming acknowledgment equals the // advertised window in the last incoming acknowledgment. s.SndWnd == seg.window } // Iterate the writeList and update RACK for each segment which is newly acked // either cumulatively or selectively. Loop through the segments which are // sacked, and update the RACK related variables and check for reordering. // Returns true when the DSACK block has been detected in the received ACK. // // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // steps 2 and 3. func (s *sender) walkSACK(rcvdSeg *segment) bool { s.rc.setDSACKSeen(false) // Look for DSACK block. 
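// For example (sizes are illustrative): with a MaxPayloadSize of 1460, a DSACK
// block covering [1000, 3920) spans 2920 bytes, so SegmentsAckedWithDSACK is
// incremented by 2920/1460 = 2 below; a DSACK that covers only part of a
// single segment still counts as 1.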
hasDSACK := false idx := 0 n := len(rcvdSeg.parsedOptions.SACKBlocks) if checkDSACK(rcvdSeg) { dsackBlock := rcvdSeg.parsedOptions.SACKBlocks[0] numDSACK := uint64(dsackBlock.End-dsackBlock.Start) / uint64(s.MaxPayloadSize) // numDSACK can be zero when DSACK is sent for subsegments. if numDSACK < 1 { numDSACK = 1 } s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.IncrementBy(numDSACK) s.rc.setDSACKSeen(true) idx = 1 n-- hasDSACK = true } if n == 0 { return hasDSACK } // Sort the SACK blocks. The first block is the most recent unacked // block. The following blocks can be in arbitrary order. sackBlocks := make([]header.SACKBlock, n) copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:]) sort.Slice(sackBlocks, func(i, j int) bool { return sackBlocks[j].Start.LessThan(sackBlocks[i].Start) }) seg := s.writeList.Front() for _, sb := range sackBlocks { for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 { if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked { s.rc.update(seg, rcvdSeg) s.rc.detectReorder(seg) seg.acked = true s.SackedOut += s.pCount(seg, s.MaxPayloadSize) } seg = seg.Next() } } return hasDSACK } // checkDSACK checks if a DSACK is reported. func checkDSACK(rcvdSeg *segment) bool { n := len(rcvdSeg.parsedOptions.SACKBlocks) if n == 0 { return false } sb := rcvdSeg.parsedOptions.SACKBlocks[0] // Check if SACK block is invalid. if sb.End.LessThan(sb.Start) { return false } // See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in // at most one SACK block. DSACK is detected in the below two cases: // * If the SACK sequence space is less than this cumulative ACK, it is // an indication that the segment identified by the SACK block has // been received more than once by the receiver. // * If the sequence space in the first SACK block is greater than the // cumulative ACK, then the sender next compares the sequence space // in the first SACK block with the sequence space in the second SACK // block, if there is one. This comparison can determine if the first // SACK block is reporting duplicate data that lies above the // cumulative ACK. if sb.Start.LessThan(rcvdSeg.ackNumber) { return true } if n > 1 { sb1 := rcvdSeg.parsedOptions.SACKBlocks[1] if sb1.End.LessThan(sb1.Start) { return false } // If the first SACK block is fully covered by second SACK // block, then the first block is a DSACK block. if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) { return true } } return false } func (s *sender) recordRetransmitTS() { // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 // // The Eifel detection algorithm is used, only upon initiation of loss // recovery, i.e., when either the timeout-based retransmit or the fast // retransmit is sent. The Eifel detection algorithm MUST NOT be // reinitiated after loss recovery has already started. In particular, // it must not be reinitiated upon subsequent timeouts for the same // segment, and not upon retransmitting segments other than the oldest // outstanding segment, e.g., during selective loss recovery. if s.inRecovery() { return } // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 2 // // Set a "RetransmitTS" variable to the value of the Timestamp Value // field of the Timestamps option included in the retransmit sent when // loss recovery is initiated. A TCP sender must ensure that // RetransmitTS does not get overwritten as loss recovery progresses, // e.g., in case of a second timeout and subsequent second retransmit of // the same octet. 
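// An illustrative trace of how this is used: retransmitTS is recorded (say,
// with timestamp value 100) when the first retransmit of a recovery episode is
// sent. If an acceptable ACK later echoes TSecr=97, the echo predates the
// retransmit, meaning the original transmission was the one acknowledged, and
// detectSpuriousRecovery marks the recovery spurious, unless the ACK carries a
// DSACK, or the connection has never seen a DSACK and the ACK covers all
// outstanding data.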
s.retransmitTS = s.ep.tsValNow() } func (s *sender) detectSpuriousRecovery(hasDSACK bool, tsEchoReply uint32) { // Return if the sender has already detected spurious recovery. if s.spuriousRecovery { return } // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 4 // // If the value of the Timestamp Echo Reply field of the acceptable ACK's // Timestamps option is smaller than the value of RetransmitTS, then // proceed to next step, else return. if tsEchoReply >= s.retransmitTS { return } // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5 // // If the acceptable ACK carries a DSACK option [RFC2883], then return. if hasDSACK { return } // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 5 // // If during the lifetime of the TCP connection the TCP sender has // previously received an ACK with a DSACK option, or the acceptable ACK // does not acknowledge all outstanding data, then proceed to next step, // else return. numDSACK := s.ep.stack.Stats().TCP.SegmentsAckedWithDSACK.Value() if numDSACK == 0 && s.SndUna == s.SndNxt { return } // See: https://datatracker.ietf.org/doc/html/rfc3522#section-3.2 Step 6 // // If the loss recovery has been initiated with a timeout-based // retransmit, then set // SpuriousRecovery <- SPUR_TO (equal 1), // else set // SpuriousRecovery <- dupacks+1 // Set the spurious recovery variable to true as we do not differentiate // between fast, SACK or RTO recovery. s.spuriousRecovery = true s.ep.stack.Stats().TCP.SpuriousRecovery.Increment() // RFC 3522 will detect all kinds of spurious recoveries (fast, SACK and // timeout). Increment the metric for RTO only as we want to track the // number of timeout recoveries. if s.state == tcpip.RTORecovery { s.ep.stack.Stats().TCP.SpuriousRTORecovery.Increment() } } // Check if the sender is in RTORecovery, FastRecovery or SACKRecovery state. func (s *sender) inRecovery() bool { if s.state == tcpip.RTORecovery || s.state == tcpip.FastRecovery || s.state == tcpip.SACKRecovery { return true } return false } // handleRcvdSegment is called when a segment is received; it is responsible for // updating the send-related state. // +checklocks:s.ep.mu // +checklocksalias:s.rc.snd.ep.mu=s.ep.mu func (s *sender) handleRcvdSegment(rcvdSeg *segment) { bestRTT := unknownRTT // Check if we can extract an RTT measurement from this ack. if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) { bestRTT = s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime) s.updateRTO(bestRTT) s.RTTMeasureSeqNum = s.SndNxt } // Update Timestamp if required. See RFC7323, section-4.3. if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS { s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber) } // Insert SACKBlock information into our scoreboard. hasDSACK := false if s.ep.SACKPermitted { for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { // Only insert the SACK block if the following holds // true: // * SACK block acks data after the ack number in the // current segment. // * SACK block represents a sequence // between sndUna and sndNxt (i.e. data that is // currently unacked and in-flight). // * SACK block that has not been SACKed already. // // NOTE: This check specifically excludes DSACK blocks // which have start/end before sndUna and are used to // indicate spurious retransmissions. 
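// For example (sequence numbers are hypothetical): with SndUna=5000 and
// SndNxt=9000, an ACK of 5000 carrying SACK blocks [6000, 7000) and
// [3000, 4000) inserts only the first: it starts above both the ACK number and
// SndUna and ends at or below SndNxt. The second block lies below SndUna, so
// it is not inserted into the scoreboard; blocks like it are what the DSACK
// handling in walkSACK/checkDSACK looks for.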
if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) { s.ep.scoreboard.Insert(sb) rcvdSeg.hasNewSACKInfo = true } } // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08 // section-7.2 // * Step 2: Update RACK stats. // If the ACK is not ignored as invalid, update the RACK.rtt // to be the RTT sample calculated using this ACK, and // continue. If this ACK or SACK was for the most recently // sent packet, then record the RACK.xmit_ts timestamp and // RACK.end_seq sequence implied by this ACK. // * Step 3: Detect packet reordering. // If the ACK selectively or cumulatively acknowledges an // unacknowledged and also never retransmitted sequence below // RACK.fack, then the corresponding packet has been // reordered and RACK.reord is set to TRUE. if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { hasDSACK = s.walkSACK(rcvdSeg) } s.SetPipe() } ack := rcvdSeg.ackNumber fastRetransmit := false // Do not leave fast recovery, if the ACK is out of range. if s.FastRecovery.Active { // Leave fast recovery if it acknowledges all the data covered by // this fast recovery session. if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) { s.leaveRecovery() } } else { // Detect loss by counting the duplicates and enter recovery. fastRetransmit = s.detectLoss(rcvdSeg) } // See if TLP based recovery was successful. if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { s.detectTLPRecovery(ack, rcvdSeg) } // Stash away the current window size. s.SndWnd = rcvdSeg.window // Disable zero window probing if remote advertises a non-zero receive // window. This can be with an ACK to the zero window probe (where the // acknumber refers to the already acknowledged byte) OR to any previously // unacknowledged segment. if s.zeroWindowProbing && rcvdSeg.window > 0 && (ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) { s.disableZeroWindowProbing() } // On receiving the ACK for the zero window probe, account for it and // skip trying to send any segment as we are still probing for // receive window to become non-zero. if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna { s.unackZeroWindowProbes-- return } // Ignore ack if it doesn't acknowledge any new data. if (ack - 1).InRange(s.SndUna, s.SndNxt) { s.DupAckCount = 0 // See : https://tools.ietf.org/html/rfc1323#section-3.3. // Specifically we should only update the RTO using TSEcr if the // following condition holds: // // A TSecr value received in a segment is used to update the // averaged RTT measurement only if the segment acknowledges // some new data, i.e., only if it advances the left edge of // the send window. if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 { tsRTT := s.ep.elapsed(s.ep.stack.Clock().NowMonotonic(), rcvdSeg.parsedOptions.TSEcr) s.updateRTO(tsRTT) // Following Linux, prefer RTT computed from ACKs to TSEcr because, // "broken middle-boxes or peers may corrupt TS-ECR fields" // https://github.com/torvalds/linux/blob/39cd87c4eb2b893354f3b850f916353f2658ae6f/net/ipv4/tcp_input.c#L3141C1-L3144C24 if bestRTT == unknownRTT { bestRTT = tsRTT } } if s.shouldSchedulePTO() { // Schedule PTO upon receiving an ACK that cumulatively acknowledges data. // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. s.schedulePTO() } else { // When an ack is received we must rearm the timer. // RFC 6298 5.3 s.probeTimer.disable() s.resendTimer.enable(s.RTO) } // Remove all acknowledged data from the write list. 
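// For example (sizes are illustrative): if the write list holds two 1460-byte
// segments and this ACK advances SndUna by 2000 bytes, the loop below removes
// the first segment outright, trims 540 bytes from the front of the second,
// advances its sequence number by the trimmed amount, and adjusts Outstanding
// by however many packets are no longer in flight.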
acked := s.SndUna.Size(ack) s.SndUna = ack ackLeft := acked originalOutstanding := s.Outstanding for ackLeft > 0 { // We use logicalLen here because we can have FIN // segments (which are always at the end of list) that // have no data, but do consume a sequence number. seg := s.writeList.Front() if seg == nil { panic(fmt.Sprintf("invalid state: there are %d unacknowledged bytes left, but the write list is empty:\n%+v", ackLeft, s.TCPSenderState)) } datalen := seg.logicalLen() if datalen > ackLeft { prevCount := s.pCount(seg, s.MaxPayloadSize) seg.TrimFront(ackLeft) seg.sequenceNumber.UpdateForward(ackLeft) s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize) break } if s.writeNext == seg { s.updateWriteNext(seg.Next()) } // Update the RACK fields if SACK is enabled. if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { s.rc.update(seg, rcvdSeg) s.rc.detectReorder(seg) } s.writeList.Remove(seg) // If SACK is enabled then only reduce outstanding if // the segment was not previously SACKED as these have // already been accounted for in SetPipe(). if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) { s.Outstanding -= s.pCount(seg, s.MaxPayloadSize) } else { s.SackedOut -= s.pCount(seg, s.MaxPayloadSize) } seg.DecRef() ackLeft -= datalen } // Clear SACK information for all acked data. s.ep.scoreboard.Delete(s.SndUna) // Detect if the sender entered recovery spuriously. if s.inRecovery() { s.detectSpuriousRecovery(hasDSACK, rcvdSeg.parsedOptions.TSEcr) } // If we are not in fast recovery then update the congestion // window based on the number of acknowledged packets. if !s.FastRecovery.Active { s.cc.Update(originalOutstanding-s.Outstanding, bestRTT) if s.FastRecovery.Last.LessThan(s.SndUna) { s.state = tcpip.Open // Update RACK when we are exiting fast or RTO // recovery as described in the RFC // draft-ietf-tcpm-rack-08 Section-7.2 Step 4. if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { s.rc.exitRecovery() } s.reorderTimer.disable() } } // Update the send buffer usage and notify potential waiters. s.ep.updateSndBufferUsage(int(acked)) // It is possible for s.Outstanding to drop below zero if we get // a retransmit timeout that resets Outstanding to zero, but later // get an ACK that covers previously sent data. if s.Outstanding < 0 { s.Outstanding = 0 } s.SetPipe() // If all outstanding data was acknowledged then disable the timer. // RFC 6298 Rule 5.3 if s.SndUna == s.SndNxt { s.Outstanding = 0 // Reset firstRetransmittedSegXmitTime to the zero value. s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{} s.resendTimer.disable() s.probeTimer.disable() } } if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { // Update RACK reorder window. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // * Upon receiving an ACK: // * Step 4: Update RACK reordering window s.rc.updateRACKReorderWindow() // After the reorder window is calculated, detect any loss by checking // if the time elapsed after the segments are sent is greater than the // reorder window. if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active { // If any segment is marked as lost by // RACK, enter recovery and retransmit // the lost segments. s.cc.HandleLossDetected() s.enterRecovery() fastRetransmit = true } if s.FastRecovery.Active { s.rc.DoRecovery(nil, fastRetransmit) } } // Now that we've popped all acknowledged data from the retransmit // queue, retransmit if needed.
if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 { s.lr.DoRecovery(rcvdSeg, fastRetransmit) // When SACK is enabled data sending is governed by steps in // RFC 6675 Section 5 recovery steps A-C. // See: https://tools.ietf.org/html/rfc6675#section-5. if s.ep.SACKPermitted { return } } // Send more data now that some of the pending data has been ack'd, or // that the window opened up, or the congestion window was inflated due // to a duplicate ack during fast recovery. This will also re-enable // the retransmit timer if needed. s.sendData() } // sendSegment sends the specified segment. // +checklocks:s.ep.mu func (s *sender) sendSegment(seg *segment) tcpip.Error { if seg.xmitCount > 0 { s.ep.stack.Stats().TCP.Retransmits.Increment() s.ep.stats.SendErrors.Retransmits.Increment() if s.SndCwnd < s.Ssthresh { s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment() } } seg.xmitTime = s.ep.stack.Clock().NowMonotonic() seg.xmitCount++ seg.lost = false err := s.sendSegmentFromPacketBuffer(seg.pkt, seg.flags, seg.sequenceNumber) // Every time a packet containing data is sent (including a // retransmission), if SACK is enabled and we are retransmitting data // then use the conservative timer described in RFC6675 Section 6.0, // otherwise follow the standard time described in RFC6298 Section 5.1. if err != nil && seg.payloadSize() != 0 { if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted { s.resendTimer.enable(s.RTO) } else { if !s.resendTimer.enabled() { s.resendTimer.enable(s.RTO) } } } return err } // sendSegmentFromPacketBuffer sends a new segment containing the given payload, // flags and sequence number. // +checklocks:s.ep.mu // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu func (s *sender) sendSegmentFromPacketBuffer(pkt *stack.PacketBuffer, flags header.TCPFlags, seq seqnum.Value) tcpip.Error { s.LastSendTime = s.ep.stack.Clock().NowMonotonic() if seq == s.RTTMeasureSeqNum { s.RTTMeasureTime = s.LastSendTime } rcvNxt, rcvWnd := s.ep.rcv.getSendParams() // Remember the max sent ack. s.MaxSentAck = rcvNxt // We need to clone the packet because sendRaw takes ownership of pkt, // and pkt could be reprocessed later on (i.e retrasmission). pkt = pkt.Clone() defer pkt.DecRef() return s.ep.sendRaw(pkt, flags, seq, rcvNxt, rcvWnd) } // sendEmptySegment sends a new empty segment, flags and sequence number. // +checklocks:s.ep.mu // +checklocksalias:s.ep.rcv.ep.mu=s.ep.mu func (s *sender) sendEmptySegment(flags header.TCPFlags, seq seqnum.Value) tcpip.Error { s.LastSendTime = s.ep.stack.Clock().NowMonotonic() if seq == s.RTTMeasureSeqNum { s.RTTMeasureTime = s.LastSendTime } rcvNxt, rcvWnd := s.ep.rcv.getSendParams() // Remember the max sent ack. s.MaxSentAck = rcvNxt return s.ep.sendEmptyRaw(flags, seq, rcvNxt, rcvWnd) } // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited // currently. // +checklocks:s.ep.mu func (s *sender) maybeSendOutOfWindowAck(seg *segment) { // Data packets are unlikely to be part of an ACK loop. So always send // an ACK for a packet w/ data. if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() { s.sendAck() } } func (s *sender) updateWriteNext(seg *segment) { if s.writeNext != nil { s.writeNext.DecRef() } if seg != nil { seg.IncRef() } s.writeNext = seg } // corkTimerExpired drains all the segments when TCP_CORK is enabled. // +checklocks:s.ep.mu func (s *sender) corkTimerExpired() tcpip.Error { // Check if the timer actually expired or if it's a spurious wake due // to a previously orphaned runtime timer. 
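// For context, the corking behavior is driven by the application. A minimal
// sketch of a caller (hypothetical; socket setup and error handling omitted,
// using golang.org/x/sys/unix on Linux, with header and body as placeholder
// byte slices):
//
//	unix.SetsockoptInt(fd, unix.IPPROTO_TCP, unix.TCP_CORK, 1) // hold partial segments
//	unix.Write(fd, header)                                     // buffered until a full MSS accumulates
//	unix.Write(fd, body)
//	unix.SetsockoptInt(fd, unix.IPPROTO_TCP, unix.TCP_CORK, 0) // flush whatever remains
//
// Inside netstack the hold-back itself happens in maybeSendSegment above; this
// timer only guarantees that corked data is drained after MinRTO even if the
// application never uncorks, provided the expiration below is genuine.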
if s.corkTimer.isUninitialized() || !s.corkTimer.checkExpiration() { return nil } // Assign sequence number and flags to the segment. seg := s.writeNext if seg == nil { return nil } seg.sequenceNumber = s.SndNxt seg.flags = header.TCPFlagAck | header.TCPFlagPsh // Drain all the segments. s.sendData() return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/tcp_endpoint_list.go000066400000000000000000000121631465435605700274400ustar00rootroot00000000000000package tcp // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type endpointElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (endpointElementMapper) linkerFor(elem *Endpoint) *Endpoint { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type endpointList struct { head *Endpoint tail *Endpoint } // Reset resets list l to the empty state. func (l *endpointList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *endpointList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *endpointList) Front() *Endpoint { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *endpointList) Back() *Endpoint { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *endpointList) Len() (count int) { for e := l.Front(); e != nil; e = (endpointElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *endpointList) PushFront(e *Endpoint) { linker := endpointElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { endpointElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *endpointList) PushFrontList(m *endpointList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { endpointElementMapper{}.linkerFor(l.head).SetPrev(m.tail) endpointElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *endpointList) PushBack(e *Endpoint) { linker := endpointElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { endpointElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. 
// //go:nosplit func (l *endpointList) PushBackList(m *endpointList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { endpointElementMapper{}.linkerFor(l.tail).SetNext(m.head) endpointElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *endpointList) InsertAfter(b, e *Endpoint) { bLinker := endpointElementMapper{}.linkerFor(b) eLinker := endpointElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { endpointElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *endpointList) InsertBefore(a, e *Endpoint) { aLinker := endpointElementMapper{}.linkerFor(a) eLinker := endpointElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { endpointElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *endpointList) Remove(e *Endpoint) { linker := endpointElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { endpointElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { endpointElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type endpointEntry struct { next *Endpoint prev *Endpoint } // Next returns the entry that follows e in the list. // //go:nosplit func (e *endpointEntry) Next() *Endpoint { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *endpointEntry) Prev() *Endpoint { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *endpointEntry) SetNext(elem *Endpoint) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *endpointEntry) SetPrev(elem *Endpoint) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/tcp_segment_list.go000066400000000000000000000120721465435605700272610ustar00rootroot00000000000000package tcp // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type segmentElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (segmentElementMapper) linkerFor(elem *segment) *segment { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type segmentList struct { head *segment tail *segment } // Reset resets list l to the empty state. 
func (l *segmentList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *segmentList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *segmentList) Front() *segment { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *segmentList) Back() *segment { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *segmentList) Len() (count int) { for e := l.Front(); e != nil; e = (segmentElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *segmentList) PushFront(e *segment) { linker := segmentElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { segmentElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *segmentList) PushFrontList(m *segmentList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { segmentElementMapper{}.linkerFor(l.head).SetPrev(m.tail) segmentElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *segmentList) PushBack(e *segment) { linker := segmentElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { segmentElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *segmentList) PushBackList(m *segmentList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { segmentElementMapper{}.linkerFor(l.tail).SetNext(m.head) segmentElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *segmentList) InsertAfter(b, e *segment) { bLinker := segmentElementMapper{}.linkerFor(b) eLinker := segmentElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { segmentElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *segmentList) InsertBefore(a, e *segment) { aLinker := segmentElementMapper{}.linkerFor(a) eLinker := segmentElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { segmentElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *segmentList) Remove(e *segment) { linker := segmentElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { segmentElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { segmentElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type segmentEntry struct { next *segment prev *segment } // Next returns the entry that follows e in the list. 
// //go:nosplit func (e *segmentEntry) Next() *segment { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *segmentEntry) Prev() *segment { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *segmentEntry) SetNext(elem *segment) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *segmentEntry) SetPrev(elem *segment) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/tcp_segment_refs.go000066400000000000000000000101301465435605700272360ustar00rootroot00000000000000package tcp import ( "context" "fmt" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/refs" ) // enableLogging indicates whether reference-related events should be logged (with // stack traces). This is false by default and should only be set to true for // debugging purposes, as it can generate an extremely large amount of output // and drastically degrade performance. const segmentenableLogging = false // obj is used to customize logging. Note that we use a pointer to T so that // we do not copy the entire object when passed as a format parameter. var segmentobj *segment // Refs implements refs.RefCounter. It keeps a reference count using atomic // operations and calls the destructor when the count reaches zero. // // NOTE: Do not introduce additional fields to the Refs struct. It is used by // many filesystem objects, and we want to keep it as small as possible (i.e., // the same size as using an int64 directly) to avoid taking up extra cache // space. In general, this template should not be extended at the cost of // performance. If it does not offer enough flexibility for a particular object // (example: b/187877947), we should implement the RefCounter/CheckedObject // interfaces manually. // // +stateify savable type segmentRefs struct { // refCount is composed of two fields: // // [32-bit speculative references]:[32-bit real references] // // Speculative references are used for TryIncRef, to avoid a CompareAndSwap // loop. See IncRef, DecRef and TryIncRef for details of how these fields are // used. refCount atomicbitops.Int64 } // InitRefs initializes r with one reference and, if enabled, activates leak // checking. func (r *segmentRefs) InitRefs() { r.refCount.RacyStore(1) refs.Register(r) } // RefType implements refs.CheckedObject.RefType. func (r *segmentRefs) RefType() string { return fmt.Sprintf("%T", segmentobj)[1:] } // LeakMessage implements refs.CheckedObject.LeakMessage. func (r *segmentRefs) LeakMessage() string { return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) } // LogRefs implements refs.CheckedObject.LogRefs. func (r *segmentRefs) LogRefs() bool { return segmentenableLogging } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *segmentRefs) ReadRefs() int64 { return r.refCount.Load() } // IncRef implements refs.RefCounter.IncRef. // //go:nosplit func (r *segmentRefs) IncRef() { v := r.refCount.Add(1) if segmentenableLogging { refs.LogIncRef(r, v) } if v <= 1 { panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) } } // TryIncRef implements refs.TryRefCounter.TryIncRef. // // To do this safely without a loop, a speculative reference is first acquired // on the object. 
This allows multiple concurrent TryIncRef calls to distinguish // other TryIncRef calls from genuine references held. // //go:nosplit func (r *segmentRefs) TryIncRef() bool { const speculativeRef = 1 << 32 if v := r.refCount.Add(speculativeRef); int32(v) == 0 { r.refCount.Add(-speculativeRef) return false } v := r.refCount.Add(-speculativeRef + 1) if segmentenableLogging { refs.LogTryIncRef(r, v) } return true } // DecRef implements refs.RefCounter.DecRef. // // Note that speculative references are counted here. Since they were added // prior to real references reaching zero, they will successfully convert to // real references. In other words, we see speculative references only in the // following case: // // A: TryIncRef [speculative increase => sees non-negative references] // B: DecRef [real decrease] // A: TryIncRef [transform speculative to real] // //go:nosplit func (r *segmentRefs) DecRef(destroy func()) { v := r.refCount.Add(-1) if segmentenableLogging { refs.LogDecRef(r, v) } switch { case v < 0: panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) case v == 0: refs.Unregister(r) if destroy != nil { destroy() } } } func (r *segmentRefs) afterLoad(context.Context) { if r.ReadRefs() > 0 { refs.Register(r) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/tcp_state_autogen.go000066400000000000000000001042421465435605700274270ustar00rootroot00000000000000// automatically generated by stateify. package tcp import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (a *acceptQueue) StateTypeName() string { return "pkg/tcpip/transport/tcp.acceptQueue" } func (a *acceptQueue) StateFields() []string { return []string{ "endpoints", "pendingEndpoints", "capacity", } } func (a *acceptQueue) beforeSave() {} // +checklocksignore func (a *acceptQueue) StateSave(stateSinkObject state.Sink) { a.beforeSave() var endpointsValue []*Endpoint endpointsValue = a.saveEndpoints() stateSinkObject.SaveValue(0, endpointsValue) stateSinkObject.Save(1, &a.pendingEndpoints) stateSinkObject.Save(2, &a.capacity) } func (a *acceptQueue) afterLoad(context.Context) {} // +checklocksignore func (a *acceptQueue) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(1, &a.pendingEndpoints) stateSourceObject.Load(2, &a.capacity) stateSourceObject.LoadValue(0, new([]*Endpoint), func(y any) { a.loadEndpoints(ctx, y.([]*Endpoint)) }) } func (h *handshake) StateTypeName() string { return "pkg/tcpip/transport/tcp.handshake" } func (h *handshake) StateFields() []string { return []string{ "ep", "listenEP", "state", "active", "flags", "ackNum", "iss", "rcvWnd", "sndWnd", "mss", "sndWndScale", "rcvWndScale", "startTime", "deferAccept", "acked", "sendSYNOpts", "sampleRTTWithTSOnly", } } func (h *handshake) beforeSave() {} // +checklocksignore func (h *handshake) StateSave(stateSinkObject state.Sink) { h.beforeSave() stateSinkObject.Save(0, &h.ep) stateSinkObject.Save(1, &h.listenEP) stateSinkObject.Save(2, &h.state) stateSinkObject.Save(3, &h.active) stateSinkObject.Save(4, &h.flags) stateSinkObject.Save(5, &h.ackNum) stateSinkObject.Save(6, &h.iss) stateSinkObject.Save(7, &h.rcvWnd) stateSinkObject.Save(8, &h.sndWnd) stateSinkObject.Save(9, &h.mss) stateSinkObject.Save(10, &h.sndWndScale) stateSinkObject.Save(11, &h.rcvWndScale) stateSinkObject.Save(12, &h.startTime) stateSinkObject.Save(13, &h.deferAccept) stateSinkObject.Save(14, &h.acked) stateSinkObject.Save(15, &h.sendSYNOpts) stateSinkObject.Save(16, &h.sampleRTTWithTSOnly) } func (h 
*handshake) afterLoad(context.Context) {} // +checklocksignore func (h *handshake) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &h.ep) stateSourceObject.Load(1, &h.listenEP) stateSourceObject.Load(2, &h.state) stateSourceObject.Load(3, &h.active) stateSourceObject.Load(4, &h.flags) stateSourceObject.Load(5, &h.ackNum) stateSourceObject.Load(6, &h.iss) stateSourceObject.Load(7, &h.rcvWnd) stateSourceObject.Load(8, &h.sndWnd) stateSourceObject.Load(9, &h.mss) stateSourceObject.Load(10, &h.sndWndScale) stateSourceObject.Load(11, &h.rcvWndScale) stateSourceObject.Load(12, &h.startTime) stateSourceObject.Load(13, &h.deferAccept) stateSourceObject.Load(14, &h.acked) stateSourceObject.Load(15, &h.sendSYNOpts) stateSourceObject.Load(16, &h.sampleRTTWithTSOnly) } func (c *cubicState) StateTypeName() string { return "pkg/tcpip/transport/tcp.cubicState" } func (c *cubicState) StateFields() []string { return []string{ "TCPCubicState", "numCongestionEvents", "s", } } func (c *cubicState) beforeSave() {} // +checklocksignore func (c *cubicState) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.TCPCubicState) stateSinkObject.Save(1, &c.numCongestionEvents) stateSinkObject.Save(2, &c.s) } func (c *cubicState) afterLoad(context.Context) {} // +checklocksignore func (c *cubicState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &c.TCPCubicState) stateSourceObject.Load(1, &c.numCongestionEvents) stateSourceObject.Load(2, &c.s) } func (q *epQueue) StateTypeName() string { return "pkg/tcpip/transport/tcp.epQueue" } func (q *epQueue) StateFields() []string { return []string{ "list", } } func (q *epQueue) beforeSave() {} // +checklocksignore func (q *epQueue) StateSave(stateSinkObject state.Sink) { q.beforeSave() stateSinkObject.Save(0, &q.list) } func (q *epQueue) afterLoad(context.Context) {} // +checklocksignore func (q *epQueue) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &q.list) } func (p *processor) StateTypeName() string { return "pkg/tcpip/transport/tcp.processor" } func (p *processor) StateFields() []string { return []string{ "epQ", "sleeper", } } func (p *processor) beforeSave() {} // +checklocksignore func (p *processor) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.epQ) stateSinkObject.Save(1, &p.sleeper) } func (p *processor) afterLoad(context.Context) {} // +checklocksignore func (p *processor) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.epQ) stateSourceObject.Load(1, &p.sleeper) } func (d *dispatcher) StateTypeName() string { return "pkg/tcpip/transport/tcp.dispatcher" } func (d *dispatcher) StateFields() []string { return []string{ "processors", "hasher", "paused", "closed", } } func (d *dispatcher) beforeSave() {} // +checklocksignore func (d *dispatcher) StateSave(stateSinkObject state.Sink) { d.beforeSave() stateSinkObject.Save(0, &d.processors) stateSinkObject.Save(1, &d.hasher) stateSinkObject.Save(2, &d.paused) stateSinkObject.Save(3, &d.closed) } func (d *dispatcher) afterLoad(context.Context) {} // +checklocksignore func (d *dispatcher) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &d.processors) stateSourceObject.Load(1, &d.hasher) stateSourceObject.Load(2, &d.paused) stateSourceObject.Load(3, &d.closed) } func (j *jenkinsHasher) StateTypeName() string { return 
"pkg/tcpip/transport/tcp.jenkinsHasher" } func (j *jenkinsHasher) StateFields() []string { return []string{ "seed", } } func (j *jenkinsHasher) beforeSave() {} // +checklocksignore func (j *jenkinsHasher) StateSave(stateSinkObject state.Sink) { j.beforeSave() stateSinkObject.Save(0, &j.seed) } func (j *jenkinsHasher) afterLoad(context.Context) {} // +checklocksignore func (j *jenkinsHasher) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &j.seed) } func (s *SACKInfo) StateTypeName() string { return "pkg/tcpip/transport/tcp.SACKInfo" } func (s *SACKInfo) StateFields() []string { return []string{ "Blocks", "NumBlocks", } } func (s *SACKInfo) beforeSave() {} // +checklocksignore func (s *SACKInfo) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.Blocks) stateSinkObject.Save(1, &s.NumBlocks) } func (s *SACKInfo) afterLoad(context.Context) {} // +checklocksignore func (s *SACKInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.Blocks) stateSourceObject.Load(1, &s.NumBlocks) } func (r *ReceiveErrors) StateTypeName() string { return "pkg/tcpip/transport/tcp.ReceiveErrors" } func (r *ReceiveErrors) StateFields() []string { return []string{ "ReceiveErrors", "SegmentQueueDropped", "ChecksumErrors", "ListenOverflowSynDrop", "ListenOverflowAckDrop", "ZeroRcvWindowState", "WantZeroRcvWindow", } } func (r *ReceiveErrors) beforeSave() {} // +checklocksignore func (r *ReceiveErrors) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.ReceiveErrors) stateSinkObject.Save(1, &r.SegmentQueueDropped) stateSinkObject.Save(2, &r.ChecksumErrors) stateSinkObject.Save(3, &r.ListenOverflowSynDrop) stateSinkObject.Save(4, &r.ListenOverflowAckDrop) stateSinkObject.Save(5, &r.ZeroRcvWindowState) stateSinkObject.Save(6, &r.WantZeroRcvWindow) } func (r *ReceiveErrors) afterLoad(context.Context) {} // +checklocksignore func (r *ReceiveErrors) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.ReceiveErrors) stateSourceObject.Load(1, &r.SegmentQueueDropped) stateSourceObject.Load(2, &r.ChecksumErrors) stateSourceObject.Load(3, &r.ListenOverflowSynDrop) stateSourceObject.Load(4, &r.ListenOverflowAckDrop) stateSourceObject.Load(5, &r.ZeroRcvWindowState) stateSourceObject.Load(6, &r.WantZeroRcvWindow) } func (s *SendErrors) StateTypeName() string { return "pkg/tcpip/transport/tcp.SendErrors" } func (s *SendErrors) StateFields() []string { return []string{ "SendErrors", "SegmentSendToNetworkFailed", "SynSendToNetworkFailed", "Retransmits", "FastRetransmit", "Timeouts", } } func (s *SendErrors) beforeSave() {} // +checklocksignore func (s *SendErrors) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.SendErrors) stateSinkObject.Save(1, &s.SegmentSendToNetworkFailed) stateSinkObject.Save(2, &s.SynSendToNetworkFailed) stateSinkObject.Save(3, &s.Retransmits) stateSinkObject.Save(4, &s.FastRetransmit) stateSinkObject.Save(5, &s.Timeouts) } func (s *SendErrors) afterLoad(context.Context) {} // +checklocksignore func (s *SendErrors) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.SendErrors) stateSourceObject.Load(1, &s.SegmentSendToNetworkFailed) stateSourceObject.Load(2, &s.SynSendToNetworkFailed) stateSourceObject.Load(3, &s.Retransmits) stateSourceObject.Load(4, &s.FastRetransmit) stateSourceObject.Load(5, &s.Timeouts) } func (s *Stats) StateTypeName() string 
{ return "pkg/tcpip/transport/tcp.Stats" } func (s *Stats) StateFields() []string { return []string{ "SegmentsReceived", "SegmentsSent", "FailedConnectionAttempts", "ReceiveErrors", "ReadErrors", "SendErrors", "WriteErrors", } } func (s *Stats) beforeSave() {} // +checklocksignore func (s *Stats) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.SegmentsReceived) stateSinkObject.Save(1, &s.SegmentsSent) stateSinkObject.Save(2, &s.FailedConnectionAttempts) stateSinkObject.Save(3, &s.ReceiveErrors) stateSinkObject.Save(4, &s.ReadErrors) stateSinkObject.Save(5, &s.SendErrors) stateSinkObject.Save(6, &s.WriteErrors) } func (s *Stats) afterLoad(context.Context) {} // +checklocksignore func (s *Stats) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.SegmentsReceived) stateSourceObject.Load(1, &s.SegmentsSent) stateSourceObject.Load(2, &s.FailedConnectionAttempts) stateSourceObject.Load(3, &s.ReceiveErrors) stateSourceObject.Load(4, &s.ReadErrors) stateSourceObject.Load(5, &s.SendErrors) stateSourceObject.Load(6, &s.WriteErrors) } func (sq *sndQueueInfo) StateTypeName() string { return "pkg/tcpip/transport/tcp.sndQueueInfo" } func (sq *sndQueueInfo) StateFields() []string { return []string{ "TCPSndBufState", } } func (sq *sndQueueInfo) beforeSave() {} // +checklocksignore func (sq *sndQueueInfo) StateSave(stateSinkObject state.Sink) { sq.beforeSave() stateSinkObject.Save(0, &sq.TCPSndBufState) } func (sq *sndQueueInfo) afterLoad(context.Context) {} // +checklocksignore func (sq *sndQueueInfo) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &sq.TCPSndBufState) } func (e *Endpoint) StateTypeName() string { return "pkg/tcpip/transport/tcp.Endpoint" } func (e *Endpoint) StateFields() []string { return []string{ "TCPEndpointStateInner", "TransportEndpointInfo", "DefaultSocketOptionsHandler", "waiterQueue", "hardError", "lastError", "TCPRcvBufState", "rcvMemUsed", "ownedByUser", "rcvQueue", "state", "connectionDirectionState", "boundNICID", "ipv4TTL", "ipv6HopLimit", "isConnectNotified", "h", "portFlags", "boundBindToDevice", "boundPortFlags", "boundDest", "effectiveNetProtos", "recentTSTime", "shutdownFlags", "tcpRecovery", "sack", "delay", "scoreboard", "segmentQueue", "userMSS", "maxSynRetries", "windowClamp", "sndQueueInfo", "cc", "keepalive", "userTimeout", "deferAccept", "acceptQueue", "rcv", "snd", "connectingAddress", "amss", "sendTOS", "gso", "stats", "tcpLingerTimeout", "closed", "txHash", "owner", "ops", "lastOutOfWindowAckTime", "pmtud", } } // +checklocksignore func (e *Endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() var stateValue EndpointState stateValue = e.saveState() stateSinkObject.SaveValue(10, stateValue) stateSinkObject.Save(0, &e.TCPEndpointStateInner) stateSinkObject.Save(1, &e.TransportEndpointInfo) stateSinkObject.Save(2, &e.DefaultSocketOptionsHandler) stateSinkObject.Save(3, &e.waiterQueue) stateSinkObject.Save(4, &e.hardError) stateSinkObject.Save(5, &e.lastError) stateSinkObject.Save(6, &e.TCPRcvBufState) stateSinkObject.Save(7, &e.rcvMemUsed) stateSinkObject.Save(8, &e.ownedByUser) stateSinkObject.Save(9, &e.rcvQueue) stateSinkObject.Save(11, &e.connectionDirectionState) stateSinkObject.Save(12, &e.boundNICID) stateSinkObject.Save(13, &e.ipv4TTL) stateSinkObject.Save(14, &e.ipv6HopLimit) stateSinkObject.Save(15, &e.isConnectNotified) stateSinkObject.Save(16, &e.h) stateSinkObject.Save(17, &e.portFlags) stateSinkObject.Save(18, 
&e.boundBindToDevice) stateSinkObject.Save(19, &e.boundPortFlags) stateSinkObject.Save(20, &e.boundDest) stateSinkObject.Save(21, &e.effectiveNetProtos) stateSinkObject.Save(22, &e.recentTSTime) stateSinkObject.Save(23, &e.shutdownFlags) stateSinkObject.Save(24, &e.tcpRecovery) stateSinkObject.Save(25, &e.sack) stateSinkObject.Save(26, &e.delay) stateSinkObject.Save(27, &e.scoreboard) stateSinkObject.Save(28, &e.segmentQueue) stateSinkObject.Save(29, &e.userMSS) stateSinkObject.Save(30, &e.maxSynRetries) stateSinkObject.Save(31, &e.windowClamp) stateSinkObject.Save(32, &e.sndQueueInfo) stateSinkObject.Save(33, &e.cc) stateSinkObject.Save(34, &e.keepalive) stateSinkObject.Save(35, &e.userTimeout) stateSinkObject.Save(36, &e.deferAccept) stateSinkObject.Save(37, &e.acceptQueue) stateSinkObject.Save(38, &e.rcv) stateSinkObject.Save(39, &e.snd) stateSinkObject.Save(40, &e.connectingAddress) stateSinkObject.Save(41, &e.amss) stateSinkObject.Save(42, &e.sendTOS) stateSinkObject.Save(43, &e.gso) stateSinkObject.Save(44, &e.stats) stateSinkObject.Save(45, &e.tcpLingerTimeout) stateSinkObject.Save(46, &e.closed) stateSinkObject.Save(47, &e.txHash) stateSinkObject.Save(48, &e.owner) stateSinkObject.Save(49, &e.ops) stateSinkObject.Save(50, &e.lastOutOfWindowAckTime) stateSinkObject.Save(51, &e.pmtud) } // +checklocksignore func (e *Endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.TCPEndpointStateInner) stateSourceObject.Load(1, &e.TransportEndpointInfo) stateSourceObject.Load(2, &e.DefaultSocketOptionsHandler) stateSourceObject.LoadWait(3, &e.waiterQueue) stateSourceObject.Load(4, &e.hardError) stateSourceObject.Load(5, &e.lastError) stateSourceObject.Load(6, &e.TCPRcvBufState) stateSourceObject.Load(7, &e.rcvMemUsed) stateSourceObject.Load(8, &e.ownedByUser) stateSourceObject.LoadWait(9, &e.rcvQueue) stateSourceObject.Load(11, &e.connectionDirectionState) stateSourceObject.Load(12, &e.boundNICID) stateSourceObject.Load(13, &e.ipv4TTL) stateSourceObject.Load(14, &e.ipv6HopLimit) stateSourceObject.Load(15, &e.isConnectNotified) stateSourceObject.Load(16, &e.h) stateSourceObject.Load(17, &e.portFlags) stateSourceObject.Load(18, &e.boundBindToDevice) stateSourceObject.Load(19, &e.boundPortFlags) stateSourceObject.Load(20, &e.boundDest) stateSourceObject.Load(21, &e.effectiveNetProtos) stateSourceObject.Load(22, &e.recentTSTime) stateSourceObject.Load(23, &e.shutdownFlags) stateSourceObject.Load(24, &e.tcpRecovery) stateSourceObject.Load(25, &e.sack) stateSourceObject.Load(26, &e.delay) stateSourceObject.Load(27, &e.scoreboard) stateSourceObject.LoadWait(28, &e.segmentQueue) stateSourceObject.Load(29, &e.userMSS) stateSourceObject.Load(30, &e.maxSynRetries) stateSourceObject.Load(31, &e.windowClamp) stateSourceObject.Load(32, &e.sndQueueInfo) stateSourceObject.Load(33, &e.cc) stateSourceObject.Load(34, &e.keepalive) stateSourceObject.Load(35, &e.userTimeout) stateSourceObject.Load(36, &e.deferAccept) stateSourceObject.Load(37, &e.acceptQueue) stateSourceObject.LoadWait(38, &e.rcv) stateSourceObject.LoadWait(39, &e.snd) stateSourceObject.Load(40, &e.connectingAddress) stateSourceObject.Load(41, &e.amss) stateSourceObject.Load(42, &e.sendTOS) stateSourceObject.Load(43, &e.gso) stateSourceObject.Load(44, &e.stats) stateSourceObject.Load(45, &e.tcpLingerTimeout) stateSourceObject.Load(46, &e.closed) stateSourceObject.Load(47, &e.txHash) stateSourceObject.Load(48, &e.owner) stateSourceObject.Load(49, &e.ops) stateSourceObject.Load(50, 
&e.lastOutOfWindowAckTime) stateSourceObject.Load(51, &e.pmtud) stateSourceObject.LoadValue(10, new(EndpointState), func(y any) { e.loadState(ctx, y.(EndpointState)) }) stateSourceObject.AfterLoad(func() { e.afterLoad(ctx) }) } func (k *keepalive) StateTypeName() string { return "pkg/tcpip/transport/tcp.keepalive" } func (k *keepalive) StateFields() []string { return []string{ "idle", "interval", "count", "unacked", } } func (k *keepalive) beforeSave() {} // +checklocksignore func (k *keepalive) StateSave(stateSinkObject state.Sink) { k.beforeSave() stateSinkObject.Save(0, &k.idle) stateSinkObject.Save(1, &k.interval) stateSinkObject.Save(2, &k.count) stateSinkObject.Save(3, &k.unacked) } func (k *keepalive) afterLoad(context.Context) {} // +checklocksignore func (k *keepalive) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &k.idle) stateSourceObject.Load(1, &k.interval) stateSourceObject.Load(2, &k.count) stateSourceObject.Load(3, &k.unacked) } func (p *protocol) StateTypeName() string { return "pkg/tcpip/transport/tcp.protocol" } func (p *protocol) StateFields() []string { return []string{ "stack", "sackEnabled", "recovery", "delayEnabled", "alwaysUseSynCookies", "sendBufferSize", "recvBufferSize", "congestionControl", "availableCongestionControl", "moderateReceiveBuffer", "lingerTimeout", "timeWaitTimeout", "timeWaitReuse", "minRTO", "maxRTO", "maxRetries", "synRetries", "dispatcher", "seqnumSecret", "tsOffsetSecret", } } func (p *protocol) beforeSave() {} // +checklocksignore func (p *protocol) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.stack) stateSinkObject.Save(1, &p.sackEnabled) stateSinkObject.Save(2, &p.recovery) stateSinkObject.Save(3, &p.delayEnabled) stateSinkObject.Save(4, &p.alwaysUseSynCookies) stateSinkObject.Save(5, &p.sendBufferSize) stateSinkObject.Save(6, &p.recvBufferSize) stateSinkObject.Save(7, &p.congestionControl) stateSinkObject.Save(8, &p.availableCongestionControl) stateSinkObject.Save(9, &p.moderateReceiveBuffer) stateSinkObject.Save(10, &p.lingerTimeout) stateSinkObject.Save(11, &p.timeWaitTimeout) stateSinkObject.Save(12, &p.timeWaitReuse) stateSinkObject.Save(13, &p.minRTO) stateSinkObject.Save(14, &p.maxRTO) stateSinkObject.Save(15, &p.maxRetries) stateSinkObject.Save(16, &p.synRetries) stateSinkObject.Save(17, &p.dispatcher) stateSinkObject.Save(18, &p.seqnumSecret) stateSinkObject.Save(19, &p.tsOffsetSecret) } func (p *protocol) afterLoad(context.Context) {} // +checklocksignore func (p *protocol) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.stack) stateSourceObject.Load(1, &p.sackEnabled) stateSourceObject.Load(2, &p.recovery) stateSourceObject.Load(3, &p.delayEnabled) stateSourceObject.Load(4, &p.alwaysUseSynCookies) stateSourceObject.Load(5, &p.sendBufferSize) stateSourceObject.Load(6, &p.recvBufferSize) stateSourceObject.Load(7, &p.congestionControl) stateSourceObject.Load(8, &p.availableCongestionControl) stateSourceObject.Load(9, &p.moderateReceiveBuffer) stateSourceObject.Load(10, &p.lingerTimeout) stateSourceObject.Load(11, &p.timeWaitTimeout) stateSourceObject.Load(12, &p.timeWaitReuse) stateSourceObject.Load(13, &p.minRTO) stateSourceObject.Load(14, &p.maxRTO) stateSourceObject.Load(15, &p.maxRetries) stateSourceObject.Load(16, &p.synRetries) stateSourceObject.Load(17, &p.dispatcher) stateSourceObject.Load(18, &p.seqnumSecret) stateSourceObject.Load(19, &p.tsOffsetSecret) } func (rc *rackControl) StateTypeName() 
string { return "pkg/tcpip/transport/tcp.rackControl" } func (rc *rackControl) StateFields() []string { return []string{ "TCPRACKState", "exitedRecovery", "minRTT", "tlpRxtOut", "tlpHighRxt", "snd", } } func (rc *rackControl) beforeSave() {} // +checklocksignore func (rc *rackControl) StateSave(stateSinkObject state.Sink) { rc.beforeSave() stateSinkObject.Save(0, &rc.TCPRACKState) stateSinkObject.Save(1, &rc.exitedRecovery) stateSinkObject.Save(2, &rc.minRTT) stateSinkObject.Save(3, &rc.tlpRxtOut) stateSinkObject.Save(4, &rc.tlpHighRxt) stateSinkObject.Save(5, &rc.snd) } func (rc *rackControl) afterLoad(context.Context) {} // +checklocksignore func (rc *rackControl) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rc.TCPRACKState) stateSourceObject.Load(1, &rc.exitedRecovery) stateSourceObject.Load(2, &rc.minRTT) stateSourceObject.Load(3, &rc.tlpRxtOut) stateSourceObject.Load(4, &rc.tlpHighRxt) stateSourceObject.Load(5, &rc.snd) } func (r *receiver) StateTypeName() string { return "pkg/tcpip/transport/tcp.receiver" } func (r *receiver) StateFields() []string { return []string{ "TCPReceiverState", "ep", "rcvWnd", "rcvWUP", "prevBufUsed", "closed", "pendingRcvdSegments", "lastRcvdAckTime", } } func (r *receiver) beforeSave() {} // +checklocksignore func (r *receiver) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.TCPReceiverState) stateSinkObject.Save(1, &r.ep) stateSinkObject.Save(2, &r.rcvWnd) stateSinkObject.Save(3, &r.rcvWUP) stateSinkObject.Save(4, &r.prevBufUsed) stateSinkObject.Save(5, &r.closed) stateSinkObject.Save(6, &r.pendingRcvdSegments) stateSinkObject.Save(7, &r.lastRcvdAckTime) } func (r *receiver) afterLoad(context.Context) {} // +checklocksignore func (r *receiver) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.TCPReceiverState) stateSourceObject.Load(1, &r.ep) stateSourceObject.Load(2, &r.rcvWnd) stateSourceObject.Load(3, &r.rcvWUP) stateSourceObject.Load(4, &r.prevBufUsed) stateSourceObject.Load(5, &r.closed) stateSourceObject.Load(6, &r.pendingRcvdSegments) stateSourceObject.Load(7, &r.lastRcvdAckTime) } func (r *renoState) StateTypeName() string { return "pkg/tcpip/transport/tcp.renoState" } func (r *renoState) StateFields() []string { return []string{ "s", } } func (r *renoState) beforeSave() {} // +checklocksignore func (r *renoState) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.s) } func (r *renoState) afterLoad(context.Context) {} // +checklocksignore func (r *renoState) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.s) } func (rr *renoRecovery) StateTypeName() string { return "pkg/tcpip/transport/tcp.renoRecovery" } func (rr *renoRecovery) StateFields() []string { return []string{ "s", } } func (rr *renoRecovery) beforeSave() {} // +checklocksignore func (rr *renoRecovery) StateSave(stateSinkObject state.Sink) { rr.beforeSave() stateSinkObject.Save(0, &rr.s) } func (rr *renoRecovery) afterLoad(context.Context) {} // +checklocksignore func (rr *renoRecovery) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &rr.s) } func (sr *sackRecovery) StateTypeName() string { return "pkg/tcpip/transport/tcp.sackRecovery" } func (sr *sackRecovery) StateFields() []string { return []string{ "s", } } func (sr *sackRecovery) beforeSave() {} // +checklocksignore func (sr *sackRecovery) StateSave(stateSinkObject state.Sink) { 
sr.beforeSave() stateSinkObject.Save(0, &sr.s) } func (sr *sackRecovery) afterLoad(context.Context) {} // +checklocksignore func (sr *sackRecovery) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &sr.s) } func (s *SACKScoreboard) StateTypeName() string { return "pkg/tcpip/transport/tcp.SACKScoreboard" } func (s *SACKScoreboard) StateFields() []string { return []string{ "smss", "maxSACKED", } } func (s *SACKScoreboard) beforeSave() {} // +checklocksignore func (s *SACKScoreboard) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.smss) stateSinkObject.Save(1, &s.maxSACKED) } func (s *SACKScoreboard) afterLoad(context.Context) {} // +checklocksignore func (s *SACKScoreboard) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.smss) stateSourceObject.Load(1, &s.maxSACKED) } func (s *segment) StateTypeName() string { return "pkg/tcpip/transport/tcp.segment" } func (s *segment) StateFields() []string { return []string{ "segmentEntry", "segmentRefs", "ep", "qFlags", "pkt", "sequenceNumber", "ackNumber", "flags", "window", "csum", "csumValid", "parsedOptions", "options", "hasNewSACKInfo", "rcvdTime", "xmitTime", "xmitCount", "acked", "dataMemSize", "lost", } } func (s *segment) beforeSave() {} // +checklocksignore func (s *segment) StateSave(stateSinkObject state.Sink) { s.beforeSave() var optionsValue []byte optionsValue = s.saveOptions() stateSinkObject.SaveValue(12, optionsValue) stateSinkObject.Save(0, &s.segmentEntry) stateSinkObject.Save(1, &s.segmentRefs) stateSinkObject.Save(2, &s.ep) stateSinkObject.Save(3, &s.qFlags) stateSinkObject.Save(4, &s.pkt) stateSinkObject.Save(5, &s.sequenceNumber) stateSinkObject.Save(6, &s.ackNumber) stateSinkObject.Save(7, &s.flags) stateSinkObject.Save(8, &s.window) stateSinkObject.Save(9, &s.csum) stateSinkObject.Save(10, &s.csumValid) stateSinkObject.Save(11, &s.parsedOptions) stateSinkObject.Save(13, &s.hasNewSACKInfo) stateSinkObject.Save(14, &s.rcvdTime) stateSinkObject.Save(15, &s.xmitTime) stateSinkObject.Save(16, &s.xmitCount) stateSinkObject.Save(17, &s.acked) stateSinkObject.Save(18, &s.dataMemSize) stateSinkObject.Save(19, &s.lost) } func (s *segment) afterLoad(context.Context) {} // +checklocksignore func (s *segment) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.segmentEntry) stateSourceObject.Load(1, &s.segmentRefs) stateSourceObject.Load(2, &s.ep) stateSourceObject.Load(3, &s.qFlags) stateSourceObject.Load(4, &s.pkt) stateSourceObject.Load(5, &s.sequenceNumber) stateSourceObject.Load(6, &s.ackNumber) stateSourceObject.Load(7, &s.flags) stateSourceObject.Load(8, &s.window) stateSourceObject.Load(9, &s.csum) stateSourceObject.Load(10, &s.csumValid) stateSourceObject.Load(11, &s.parsedOptions) stateSourceObject.Load(13, &s.hasNewSACKInfo) stateSourceObject.Load(14, &s.rcvdTime) stateSourceObject.Load(15, &s.xmitTime) stateSourceObject.Load(16, &s.xmitCount) stateSourceObject.Load(17, &s.acked) stateSourceObject.Load(18, &s.dataMemSize) stateSourceObject.Load(19, &s.lost) stateSourceObject.LoadValue(12, new([]byte), func(y any) { s.loadOptions(ctx, y.([]byte)) }) } func (q *segmentQueue) StateTypeName() string { return "pkg/tcpip/transport/tcp.segmentQueue" } func (q *segmentQueue) StateFields() []string { return []string{ "list", "ep", "frozen", } } func (q *segmentQueue) beforeSave() {} // +checklocksignore func (q *segmentQueue) StateSave(stateSinkObject state.Sink) { q.beforeSave() 
stateSinkObject.Save(0, &q.list) stateSinkObject.Save(1, &q.ep) stateSinkObject.Save(2, &q.frozen) } func (q *segmentQueue) afterLoad(context.Context) {} // +checklocksignore func (q *segmentQueue) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.LoadWait(0, &q.list) stateSourceObject.Load(1, &q.ep) stateSourceObject.Load(2, &q.frozen) } func (s *sender) StateTypeName() string { return "pkg/tcpip/transport/tcp.sender" } func (s *sender) StateFields() []string { return []string{ "TCPSenderState", "ep", "lr", "firstRetransmittedSegXmitTime", "writeNext", "writeList", "rtt", "minRTO", "maxRTO", "maxRetries", "gso", "state", "cc", "rc", "spuriousRecovery", "retransmitTS", "startCork", } } func (s *sender) beforeSave() {} // +checklocksignore func (s *sender) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.TCPSenderState) stateSinkObject.Save(1, &s.ep) stateSinkObject.Save(2, &s.lr) stateSinkObject.Save(3, &s.firstRetransmittedSegXmitTime) stateSinkObject.Save(4, &s.writeNext) stateSinkObject.Save(5, &s.writeList) stateSinkObject.Save(6, &s.rtt) stateSinkObject.Save(7, &s.minRTO) stateSinkObject.Save(8, &s.maxRTO) stateSinkObject.Save(9, &s.maxRetries) stateSinkObject.Save(10, &s.gso) stateSinkObject.Save(11, &s.state) stateSinkObject.Save(12, &s.cc) stateSinkObject.Save(13, &s.rc) stateSinkObject.Save(14, &s.spuriousRecovery) stateSinkObject.Save(15, &s.retransmitTS) stateSinkObject.Save(16, &s.startCork) } func (s *sender) afterLoad(context.Context) {} // +checklocksignore func (s *sender) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.TCPSenderState) stateSourceObject.Load(1, &s.ep) stateSourceObject.Load(2, &s.lr) stateSourceObject.Load(3, &s.firstRetransmittedSegXmitTime) stateSourceObject.Load(4, &s.writeNext) stateSourceObject.Load(5, &s.writeList) stateSourceObject.Load(6, &s.rtt) stateSourceObject.Load(7, &s.minRTO) stateSourceObject.Load(8, &s.maxRTO) stateSourceObject.Load(9, &s.maxRetries) stateSourceObject.Load(10, &s.gso) stateSourceObject.Load(11, &s.state) stateSourceObject.Load(12, &s.cc) stateSourceObject.Load(13, &s.rc) stateSourceObject.Load(14, &s.spuriousRecovery) stateSourceObject.Load(15, &s.retransmitTS) stateSourceObject.Load(16, &s.startCork) } func (r *rtt) StateTypeName() string { return "pkg/tcpip/transport/tcp.rtt" } func (r *rtt) StateFields() []string { return []string{ "TCPRTTState", } } func (r *rtt) beforeSave() {} // +checklocksignore func (r *rtt) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.TCPRTTState) } func (r *rtt) afterLoad(context.Context) {} // +checklocksignore func (r *rtt) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.TCPRTTState) } func (l *endpointList) StateTypeName() string { return "pkg/tcpip/transport/tcp.endpointList" } func (l *endpointList) StateFields() []string { return []string{ "head", "tail", } } func (l *endpointList) beforeSave() {} // +checklocksignore func (l *endpointList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *endpointList) afterLoad(context.Context) {} // +checklocksignore func (l *endpointList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *endpointEntry) StateTypeName() string { return "pkg/tcpip/transport/tcp.endpointEntry" } func (e 
*endpointEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *endpointEntry) beforeSave() {} // +checklocksignore func (e *endpointEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *endpointEntry) afterLoad(context.Context) {} // +checklocksignore func (e *endpointEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (l *segmentList) StateTypeName() string { return "pkg/tcpip/transport/tcp.segmentList" } func (l *segmentList) StateFields() []string { return []string{ "head", "tail", } } func (l *segmentList) beforeSave() {} // +checklocksignore func (l *segmentList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *segmentList) afterLoad(context.Context) {} // +checklocksignore func (l *segmentList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *segmentEntry) StateTypeName() string { return "pkg/tcpip/transport/tcp.segmentEntry" } func (e *segmentEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *segmentEntry) beforeSave() {} // +checklocksignore func (e *segmentEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *segmentEntry) afterLoad(context.Context) {} // +checklocksignore func (e *segmentEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func (r *segmentRefs) StateTypeName() string { return "pkg/tcpip/transport/tcp.segmentRefs" } func (r *segmentRefs) StateFields() []string { return []string{ "refCount", } } func (r *segmentRefs) beforeSave() {} // +checklocksignore func (r *segmentRefs) StateSave(stateSinkObject state.Sink) { r.beforeSave() stateSinkObject.Save(0, &r.refCount) } // +checklocksignore func (r *segmentRefs) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &r.refCount) stateSourceObject.AfterLoad(func() { r.afterLoad(ctx) }) } func init() { state.Register((*acceptQueue)(nil)) state.Register((*handshake)(nil)) state.Register((*cubicState)(nil)) state.Register((*epQueue)(nil)) state.Register((*processor)(nil)) state.Register((*dispatcher)(nil)) state.Register((*jenkinsHasher)(nil)) state.Register((*SACKInfo)(nil)) state.Register((*ReceiveErrors)(nil)) state.Register((*SendErrors)(nil)) state.Register((*Stats)(nil)) state.Register((*sndQueueInfo)(nil)) state.Register((*Endpoint)(nil)) state.Register((*keepalive)(nil)) state.Register((*protocol)(nil)) state.Register((*rackControl)(nil)) state.Register((*receiver)(nil)) state.Register((*renoState)(nil)) state.Register((*renoRecovery)(nil)) state.Register((*sackRecovery)(nil)) state.Register((*SACKScoreboard)(nil)) state.Register((*segment)(nil)) state.Register((*segmentQueue)(nil)) state.Register((*sender)(nil)) state.Register((*rtt)(nil)) state.Register((*endpointList)(nil)) state.Register((*endpointEntry)(nil)) state.Register((*segmentList)(nil)) state.Register((*segmentEntry)(nil)) state.Register((*segmentRefs)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/tcp_unsafe_state_autogen.go000066400000000000000000000000651465435605700307660ustar00rootroot00000000000000// 
automatically generated by stateify. package tcp golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcp/timer.go000066400000000000000000000116761465435605700250470ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "time" "gvisor.dev/gvisor/pkg/tcpip" ) type timerState int const ( // The timer has not been initialized yet or has been cleaned up. timerUninitialized timerState = iota // The timer is disabled. timerStateDisabled // The timer is enabled, but the clock timer may be set to an earlier // expiration time due to a previous orphaned state. timerStateEnabled // The timer is disabled, but the clock timer is enabled, which means that // it will cause a spurious wakeup unless the timer is enabled before the // clock timer fires. timerStateOrphaned ) // timer is a timer implementation that reduces the interactions with the // clock timer infrastructure by letting timers run (and potentially // eventually expire) even if they are stopped. It makes it cheaper to // disable/reenable timers at the expense of spurious wakes. This is useful for // cases when the same timer is disabled/reenabled repeatedly with relatively // long timeouts farther into the future. // // TCP retransmit timers benefit from this because the timeouts are long // (currently at least 200ms), and get disabled when acks are received, and // reenabled when new pending segments are sent. // // It is advantageous to avoid interacting with the clock because it acquires // a global mutex and performs O(log n) operations, where n is the global number // of timers, whenever a timer is enabled or disabled, and may make a syscall. // // This struct is thread-compatible. type timer struct { state timerState clock tcpip.Clock // target is the expiration time of the current timer. It is only // meaningful in the enabled state. target tcpip.MonotonicTime // clockTarget is the expiration time of the clock timer. It is // meaningful in the enabled and orphaned states. clockTarget tcpip.MonotonicTime // timer is the clock timer used to wait on. timer tcpip.Timer // callback is the function that's called when the timer expires. callback func() } // init initializes the timer. Once it expires the function callback // passed will be called. func (t *timer) init(clock tcpip.Clock, f func()) { t.state = timerStateDisabled t.clock = clock t.callback = f } // cleanup frees all resources associated with the timer. func (t *timer) cleanup() { if t.timer == nil { // No cleanup needed. return } t.timer.Stop() *t = timer{} } // isUninitialized returns true if the timer is in the uninitialized state. This // is only true if init() has never been called or if cleanup has been called.
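//
// A minimal usage sketch (illustrative only, not part of this file's API):
// assuming the caller already has a tcpip.Clock named clock and a callback
// named retransmitExpired, the timer above is typically driven as follows:
//
//	var rt timer
//	rt.init(clock, retransmitExpired) // register the expiration callback once
//	rt.enable(200 * time.Millisecond) // arm (or re-arm) the timeout
//	// ...later, when all outstanding data has been acknowledged:
//	rt.disable() // cheap: may leave the underlying clock timer orphaned
//
//	// Inside retransmitExpired:
//	if !rt.checkExpiration() {
//		return // spurious wake from an orphaned or early clock timer
//	}
//	// ...handle a genuine retransmit timeout...
//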
func (t *timer) isUninitialized() bool { return t.state == timerUninitialized } // checkExpiration checks if the given timer has actually expired, it should be // called whenever the callback function is called, and is used to check if it's // a spurious timer expiration (due to a previously orphaned timer) or a // legitimate one. func (t *timer) checkExpiration() bool { // Transition to fully disabled state if we're just consuming an // orphaned timer. if t.state == timerStateOrphaned { t.state = timerStateDisabled return false } // The timer is enabled, but it may have expired early. Check if that's // the case, and if so, reset the runtime timer to the correct time. now := t.clock.NowMonotonic() if now.Before(t.target) { t.clockTarget = t.target t.timer.Reset(t.target.Sub(now)) return false } // The timer has actually expired, disable it for now and inform the // caller. t.state = timerStateDisabled return true } // disable disables the timer, leaving it in an orphaned state if it wasn't // already disabled. func (t *timer) disable() { if t.state != timerStateDisabled { t.state = timerStateOrphaned } } // enabled returns true if the timer is currently enabled, false otherwise. func (t *timer) enabled() bool { return t.state == timerStateEnabled } // enable enables the timer, programming the runtime timer if necessary. func (t *timer) enable(d time.Duration) { t.target = t.clock.NowMonotonic().Add(d) // Check if we need to set the runtime timer. if t.state == timerStateDisabled || t.target.Before(t.clockTarget) { t.clockTarget = t.target t.resetOrStart(d) } t.state = timerStateEnabled } // resetOrStart creates the timer if it doesn't already exist or resets it with // the given duration if it does. func (t *timer) resetOrStart(d time.Duration) { if t.timer == nil { t.timer = t.clock.AfterFunc(d, t.callback) } else { t.timer.Reset(d) } } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcpconntrack/000077500000000000000000000000001465435605700252705ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go000066400000000000000000000322421465435605700304520ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tcpconntrack implements a TCP connection tracking object. It allows // users with access to a segment stream to figure out when a connection is // established, reset, and closed (and in the last case, who closed first). package tcpconntrack import ( "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" ) // Result is returned when the state of a TCB is updated in response to a // segment. type Result int const ( // ResultDrop indicates that the segment should be dropped. ResultDrop Result = iota // ResultConnecting indicates that the connection remains in a // connecting state. ResultConnecting // ResultAlive indicates that the connection remains alive (connected). ResultAlive // ResultReset indicates that the connection was reset. 
ResultReset // ResultClosedByResponder indicates that the connection was gracefully // closed, and the reply stream was closed first. ResultClosedByResponder // ResultClosedByOriginator indicates that the connection was gracefully // closed, and the original stream was closed first. ResultClosedByOriginator ) // maxWindowShift is the maximum shift value of the window scale // option defined by RFC 1323. const maxWindowShift = 14 // TCB is a TCP Control Block. It holds state necessary to keep track of a TCP // connection and inform the caller when the connection has been closed. // // +stateify savable type TCB struct { reply stream original stream // State handlers. hdr is not guaranteed to contain bytes beyond the TCP // header itself, i.e. it may not contain the payload. // TODO(b/341946753): Restore them when netstack is savable. handlerReply func(tcb *TCB, hdr header.TCP, dataLen int) Result `state:"nosave"` handlerOriginal func(tcb *TCB, hdr header.TCP, dataLen int) Result `state:"nosave"` // firstFin holds a pointer to the first stream to send a FIN. firstFin *stream // state is the current state of the stream. state Result } // Init initializes the state of the TCB according to the initial SYN. func (t *TCB) Init(initialSyn header.TCP, dataLen int) Result { t.handlerReply = synSentStateReply t.handlerOriginal = synSentStateOriginal iss := seqnum.Value(initialSyn.SequenceNumber()) t.original.una = iss t.original.nxt = iss.Add(logicalLenSyn(initialSyn, dataLen)) t.original.end = t.original.nxt // TODO(gvisor.dev/issue/6734): Cache TCP options instead of re-parsing them. // Because original and reply are streams, scale applies to the reply; it is // the receive window in the reply direction. t.reply.shiftCnt = header.ParseSynOptions(initialSyn.Options(), false /* isAck */).WS // Even though "end" is a sequence number, we don't know the initial // receive sequence number yet, so we store the window size until we get // a SYN from the server. t.reply.una = 0 t.reply.nxt = 0 t.reply.end = seqnum.Value(initialSyn.WindowSize()) t.state = ResultConnecting return t.state } // UpdateStateReply updates the state of the TCB based on the supplied reply // segment. func (t *TCB) UpdateStateReply(tcp header.TCP, dataLen int) Result { st := t.handlerReply(t, tcp, dataLen) if st != ResultDrop { t.state = st } return st } // UpdateStateOriginal updates the state of the TCB based on the supplied // original segment. func (t *TCB) UpdateStateOriginal(tcp header.TCP, dataLen int) Result { st := t.handlerOriginal(t, tcp, dataLen) if st != ResultDrop { t.state = st } return st } // State returns the current state of the TCB. func (t *TCB) State() Result { return t.state } // IsAlive returns true as long as the connection is in the established (Alive) // or connecting state. func (t *TCB) IsAlive() bool { return !t.reply.rstSeen && !t.original.rstSeen && (!t.reply.closed() || !t.original.closed()) } // OriginalSendSequenceNumber returns the snd.NXT for the original stream. func (t *TCB) OriginalSendSequenceNumber() seqnum.Value { return t.original.nxt } // ReplySendSequenceNumber returns the snd.NXT for the reply stream. func (t *TCB) ReplySendSequenceNumber() seqnum.Value { return t.reply.nxt } // adaptResult modifies the supplied "Result" according to the state of the TCB; // if r is anything other than "Alive", or if one of the streams isn't closed // yet, it is returned unmodified.
Otherwise it's converted to either // ClosedByOriginator or ClosedByResponder depending on which stream was closed // first. func (t *TCB) adaptResult(r Result) Result { // Check the unmodified case. if r != ResultAlive || !t.reply.closed() || !t.original.closed() { return r } // Find out which was closed first. if t.firstFin == &t.original { return ResultClosedByOriginator } return ResultClosedByResponder } // synSentStateReply is the state handler for reply segments when the // connection is in SYN-SENT state. func synSentStateReply(t *TCB, tcp header.TCP, dataLen int) Result { flags := tcp.Flags() ackPresent := flags&header.TCPFlagAck != 0 ack := seqnum.Value(tcp.AckNumber()) // Ignore segment if ack is present but not acceptable. if ackPresent && !(ack-1).InRange(t.original.una, t.original.nxt) { return ResultConnecting } // If reset is specified, we will let the packet through no matter what // but we will also destroy the connection if the ACK is present (and // implicitly acceptable). if flags&header.TCPFlagRst != 0 { if ackPresent { t.reply.rstSeen = true return ResultReset } return ResultConnecting } // Ignore segment if SYN is not set. if flags&header.TCPFlagSyn == 0 { return ResultConnecting } // TODO(gvisor.dev/issue/6734): Cache TCP options instead of re-parsing them. // Because original and reply are streams, scale applies to the reply; it is // the receive window in the original direction. t.original.shiftCnt = header.ParseSynOptions(tcp.Options(), ackPresent).WS // Window scaling works only when both ends use the scale option. if t.original.shiftCnt != -1 && t.reply.shiftCnt != -1 { // Per RFC 1323 section 2.3: // // "If a Window Scale option is received with a shift.cnt value exceeding // 14, the TCP should log the error but use 14 instead of the specified // value." if t.original.shiftCnt > maxWindowShift { t.original.shiftCnt = maxWindowShift } if t.reply.shiftCnt > maxWindowShift { t.reply.shiftCnt = maxWindowShift } } else { t.original.shiftCnt = 0 t.reply.shiftCnt = 0 } // Update state informed by this SYN. irs := seqnum.Value(tcp.SequenceNumber()) t.reply.una = irs t.reply.nxt = irs.Add(logicalLen(tcp, dataLen, seqnum.Size(t.reply.end) /* end currently holds the receive window size */)) t.reply.end <<= t.reply.shiftCnt t.reply.end.UpdateForward(seqnum.Size(irs)) windowSize := t.original.windowSize(tcp) t.original.end = t.original.una.Add(windowSize) // If the ACK was set (it is acceptable), update our unacknowledgement // tracking. if ackPresent { // Advance the "una" and "end" indices of the original stream. if t.original.una.LessThan(ack) { t.original.una = ack } if end := ack.Add(seqnum.Size(windowSize)); t.original.end.LessThan(end) { t.original.end = end } } // Update handlers so that new calls will be handled by new state. t.handlerReply = allOtherReply t.handlerOriginal = allOtherOriginal return ResultAlive } // synSentStateOriginal is the state handler for original segments when the // connection is in SYN-SENT state. func synSentStateOriginal(t *TCB, tcp header.TCP, _ int) Result { // Drop original segments that aren't retransmits of the original one. if tcp.Flags() != header.TCPFlagSyn || tcp.SequenceNumber() != uint32(t.original.una) { return ResultDrop } // Update the receive window. We only remember the largest value seen. if wnd := seqnum.Value(tcp.WindowSize()); wnd > t.reply.end { t.reply.end = wnd } return ResultConnecting } // update updates the state of reply and original streams, given the supplied // reply segment.
For original segments, this same function can be called with // swapped reply/original streams. func update(tcp header.TCP, reply, original *stream, firstFin **stream, dataLen int) Result { // Ignore segments out of the window. s := seqnum.Value(tcp.SequenceNumber()) if !reply.acceptable(s, seqnum.Size(dataLen)) { return ResultAlive } flags := tcp.Flags() if flags&header.TCPFlagRst != 0 { reply.rstSeen = true return ResultReset } // Ignore segments that don't have the ACK flag, and those with the SYN // flag. if flags&header.TCPFlagAck == 0 || flags&header.TCPFlagSyn != 0 { return ResultAlive } // Ignore segments that acknowledge not yet sent data. ack := seqnum.Value(tcp.AckNumber()) if original.nxt.LessThan(ack) { return ResultAlive } // Advance the "una" and "end" indices of the original stream. if original.una.LessThan(ack) { original.una = ack } if end := ack.Add(original.windowSize(tcp)); original.end.LessThan(end) { original.end = end } // Advance the "nxt" index of the reply stream. end := s.Add(logicalLen(tcp, dataLen, reply.rwndSize())) if reply.nxt.LessThan(end) { reply.nxt = end } // Note the index of the FIN segment. And stash away a pointer to the // first stream to see a FIN. if flags&header.TCPFlagFin != 0 && !reply.finSeen { reply.finSeen = true reply.fin = end - 1 if *firstFin == nil { *firstFin = reply } } return ResultAlive } // allOtherReply is the state handler for reply segments in all states // except SYN-SENT. func allOtherReply(t *TCB, tcp header.TCP, dataLen int) Result { return t.adaptResult(update(tcp, &t.reply, &t.original, &t.firstFin, dataLen)) } // allOtherOriginal is the state handler for original segments in all states // except SYN-SENT. func allOtherOriginal(t *TCB, tcp header.TCP, dataLen int) Result { return t.adaptResult(update(tcp, &t.original, &t.reply, &t.firstFin, dataLen)) } // streams holds the state of a TCP unidirectional stream. // // +stateify savable type stream struct { // The interval [una, end) is the allowed interval as defined by the // receiver, i.e., anything less than una has already been acknowledged // and anything greater than or equal to end is beyond the receiver // window. The interval [una, nxt) is the acknowledgable range, whose // right edge indicates the sequence number of the next byte to be sent // by the sender, i.e., anything greater than or equal to nxt hasn't // been sent yet. una seqnum.Value nxt seqnum.Value end seqnum.Value // finSeen indicates if a FIN has already been sent on this stream. finSeen bool // fin is the sequence number of the FIN. It is only valid after finSeen // is set to true. fin seqnum.Value // rstSeen indicates if a RST has already been sent on this stream. rstSeen bool // shiftCnt is the shift of the window scale of the receiver of the stream, // i.e. in a stream from A to B it is B's receive window scale. It cannot be // greater than maxWindowScale. shiftCnt int } // acceptable determines if the segment with the given sequence number and data // length is acceptable, i.e., if it's within the [una, end) window or, in case // the window is zero, if it's a packet with no payload and sequence number // equal to una. func (s *stream) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool { return header.Acceptable(segSeq, segLen, s.una, s.end) } // closed determines if the stream has already been closed. This happens when // a FIN has been set by the sender and acknowledged by the receiver. 
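// For example (illustrative values): if the FIN consumed sequence number 1000
// (fin = 1000) and the peer has acknowledged everything below 1001
// (una = 1001), then fin.LessThan(una) holds and the stream is considered
// closed.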
func (s *stream) closed() bool { return s.finSeen && s.fin.LessThan(s.una) } // rwndSize returns the stream's receive window size. func (s *stream) rwndSize() seqnum.Size { return s.una.Size(s.end) } // windowSize returns the stream's window size accounting for scale. func (s *stream) windowSize(tcp header.TCP) seqnum.Size { return seqnum.Size(tcp.WindowSize()) << s.shiftCnt } // logicalLenSyn calculates the logical length of a SYN (without ACK) segment. // It is similar to logicalLen, but does not impose a window size requirement // because of the SYN. func logicalLenSyn(tcp header.TCP, dataLen int) seqnum.Size { length := seqnum.Size(dataLen) flags := tcp.Flags() if flags&header.TCPFlagSyn != 0 { length++ } if flags&header.TCPFlagFin != 0 { length++ } return length } // logicalLen calculates the logical length of the TCP segment. func logicalLen(tcp header.TCP, dataLen int, windowSize seqnum.Size) seqnum.Size { // If the segment is too large, TCP trims the payload per RFC 793 page 70. length := logicalLenSyn(tcp, dataLen) if length > windowSize { length = windowSize } return length } // IsEmpty returns true if tcb is not initialized. func (t *TCB) IsEmpty() bool { if t.reply != (stream{}) || t.original != (stream{}) { return false } if t.firstFin != nil || t.state != ResultDrop { return false } return true } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/tcpconntrack/tcpconntrack_state_autogen.go000066400000000000000000000037121465435605700332350ustar00rootroot00000000000000// automatically generated by stateify. package tcpconntrack import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (t *TCB) StateTypeName() string { return "pkg/tcpip/transport/tcpconntrack.TCB" } func (t *TCB) StateFields() []string { return []string{ "reply", "original", "firstFin", "state", } } func (t *TCB) beforeSave() {} // +checklocksignore func (t *TCB) StateSave(stateSinkObject state.Sink) { t.beforeSave() stateSinkObject.Save(0, &t.reply) stateSinkObject.Save(1, &t.original) stateSinkObject.Save(2, &t.firstFin) stateSinkObject.Save(3, &t.state) } func (t *TCB) afterLoad(context.Context) {} // +checklocksignore func (t *TCB) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &t.reply) stateSourceObject.Load(1, &t.original) stateSourceObject.Load(2, &t.firstFin) stateSourceObject.Load(3, &t.state) } func (s *stream) StateTypeName() string { return "pkg/tcpip/transport/tcpconntrack.stream" } func (s *stream) StateFields() []string { return []string{ "una", "nxt", "end", "finSeen", "fin", "rstSeen", "shiftCnt", } } func (s *stream) beforeSave() {} // +checklocksignore func (s *stream) StateSave(stateSinkObject state.Sink) { s.beforeSave() stateSinkObject.Save(0, &s.una) stateSinkObject.Save(1, &s.nxt) stateSinkObject.Save(2, &s.end) stateSinkObject.Save(3, &s.finSeen) stateSinkObject.Save(4, &s.fin) stateSinkObject.Save(5, &s.rstSeen) stateSinkObject.Save(6, &s.shiftCnt) } func (s *stream) afterLoad(context.Context) {} // +checklocksignore func (s *stream) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &s.una) stateSourceObject.Load(1, &s.nxt) stateSourceObject.Load(2, &s.end) stateSourceObject.Load(3, &s.finSeen) stateSourceObject.Load(4, &s.fin) stateSourceObject.Load(5, &s.rstSeen) stateSourceObject.Load(6, &s.shiftCnt) } func init() { state.Register((*TCB)(nil)) state.Register((*stream)(nil)) } 
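// A rough sketch of how a caller (for example, a connection-tracking table in
// a firewall) might drive the TCB type defined in tcp_conntrack.go above. The
// packet headers synHdr, replyHdr, origHdr and their payload lengths are
// assumed to come from already-parsed segments and are illustrative only:
//
//	var tcb tcpconntrack.TCB
//	tcb.Init(synHdr, synLen) // returns ResultConnecting
//
//	switch tcb.UpdateStateReply(replyHdr, replyLen) {
//	case tcpconntrack.ResultAlive:
//		// SYN-ACK accepted; the connection is now considered established.
//	case tcpconntrack.ResultReset:
//		// The reply carried an acceptable RST; stop tracking.
//	}
//
//	tcb.UpdateStateOriginal(origHdr, origLen)
//	if !tcb.IsAlive() {
//		// A RST was seen, or FINs were acknowledged in both directions;
//		// drop the tracking entry.
//	}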
golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/transport.go000066400000000000000000000012321465435605700251600ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package transport supports transport protocols. package transport golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/transport_state_autogen.go000066400000000000000000000000731465435605700301040ustar00rootroot00000000000000// automatically generated by stateify. package transport golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/udp/000077500000000000000000000000001465435605700233675ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/udp/endpoint.go000066400000000000000000000747651465435605700255610ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package udp import ( "bytes" "fmt" "io" "math" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/checksum" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport" "gvisor.dev/gvisor/pkg/tcpip/transport/internal/network" "gvisor.dev/gvisor/pkg/waiter" ) // +stateify savable type udpPacket struct { udpPacketEntry netProto tcpip.NetworkProtocolNumber senderAddress tcpip.FullAddress destinationAddress tcpip.FullAddress packetInfo tcpip.IPPacketInfo pkt *stack.PacketBuffer receivedAt time.Time `state:".(int64)"` // tosOrTClass stores either the Type of Service for IPv4 or the Traffic Class // for IPv6. tosOrTClass uint8 // ttlOrHopLimit stores either the TTL for IPv4 or the HopLimit for IPv6 ttlOrHopLimit uint8 } // endpoint represents a UDP endpoint. This struct serves as the interface // between users of the endpoint and the protocol implementation; it is legal to // have concurrent goroutines make calls into the endpoint, they are properly // synchronized. // // It implements tcpip.Endpoint. // // +stateify savable type endpoint struct { tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and do not // change throughout the lifetime of the endpoint. stack *stack.Stack `state:"manual"` waiterQueue *waiter.Queue net network.Endpoint stats tcpip.TransportEndpointStats ops tcpip.SocketOptions // The following fields are used to manage the receive queue, and are // protected by rcvMu. 
rcvMu sync.Mutex `state:"nosave"` rcvReady bool rcvList udpPacketList rcvBufSize int rcvClosed bool lastErrorMu sync.Mutex `state:"nosave"` lastError tcpip.Error // The following fields are protected by the mu mutex. mu sync.RWMutex `state:"nosave"` portFlags ports.Flags // Values used to reserve a port or register a transport endpoint. // (which ever happens first). boundBindToDevice tcpip.NICID boundPortFlags ports.Flags readShutdown bool // effectiveNetProtos contains the network protocols actually in use. In // most cases it will only contain "netProto", but in cases like IPv6 // endpoints with v6only set to false, this could include multiple // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped // address). effectiveNetProtos []tcpip.NetworkProtocolNumber // frozen indicates if the packets should be delivered to the endpoint // during restore. frozen bool localPort uint16 remotePort uint16 } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { e := &endpoint{ stack: s, waiterQueue: waiterQueue, } e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) e.ops.SetMulticastLoop(true) e.ops.SetSendBufferSize(32*1024, false /* notify */) e.ops.SetReceiveBufferSize(32*1024, false /* notify */) e.net.Init(s, netProto, header.UDPProtocolNumber, &e.ops, waiterQueue) // Override with stack defaults. var ss tcpip.SendBufferSizeOption if err := s.Option(&ss); err == nil { e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs tcpip.ReceiveBufferSizeOption if err := s.Option(&rs); err == nil { e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) } return e } // WakeupWriters implements tcpip.SocketOptionsHandler. func (e *endpoint) WakeupWriters() { e.net.MaybeSignalWritable() } func (e *endpoint) LastError() tcpip.Error { e.lastErrorMu.Lock() defer e.lastErrorMu.Unlock() err := e.lastError e.lastError = nil return err } // UpdateLastError implements tcpip.SocketOptionsHandler. func (e *endpoint) UpdateLastError(err tcpip.Error) { e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() } // Abort implements stack.TransportEndpoint. func (e *endpoint) Abort() { e.Close() } // Close puts the endpoint in a closed state and frees all resources // associated with it. func (e *endpoint) Close() { e.mu.Lock() switch state := e.net.State(); state { case transport.DatagramEndpointStateInitial: case transport.DatagramEndpointStateClosed: e.mu.Unlock() return case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: id := e.net.Info().ID id.LocalPort = e.localPort id.RemotePort = e.remotePort e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, id, e, e.boundPortFlags, e.boundBindToDevice) portRes := ports.Reservation{ Networks: e.effectiveNetProtos, Transport: ProtocolNumber, Addr: id.LocalAddress, Port: id.LocalPort, Flags: e.boundPortFlags, BindToDevice: e.boundBindToDevice, Dest: tcpip.FullAddress{}, } e.stack.ReleasePort(portRes) e.boundBindToDevice = 0 e.boundPortFlags = ports.Flags{} default: panic(fmt.Sprintf("unhandled state = %s", state)) } // Close the receive list and drain it. 
e.rcvMu.Lock() e.rcvClosed = true e.rcvBufSize = 0 for !e.rcvList.Empty() { p := e.rcvList.Front() e.rcvList.Remove(p) p.pkt.DecRef() } e.rcvMu.Unlock() e.net.Shutdown() e.net.Close() e.readShutdown = true e.mu.Unlock() e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } // ModerateRecvBuf implements tcpip.Endpoint. func (*endpoint) ModerateRecvBuf(int) {} // Read implements tcpip.Endpoint. func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { if err := e.LastError(); err != nil { return tcpip.ReadResult{}, err } e.rcvMu.Lock() if e.rcvList.Empty() { var err tcpip.Error = &tcpip.ErrWouldBlock{} if e.rcvClosed { e.stats.ReadErrors.ReadClosed.Increment() err = &tcpip.ErrClosedForReceive{} } e.rcvMu.Unlock() return tcpip.ReadResult{}, err } p := e.rcvList.Front() if !opts.Peek { e.rcvList.Remove(p) defer p.pkt.DecRef() e.rcvBufSize -= p.pkt.Data().Size() } e.rcvMu.Unlock() // Control Messages // TODO(https://gvisor.dev/issue/7012): Share control message code with other // network endpoints. cm := tcpip.ReceivableControlMessages{ HasTimestamp: true, Timestamp: p.receivedAt, } switch p.netProto { case header.IPv4ProtocolNumber: if e.ops.GetReceiveTOS() { cm.HasTOS = true cm.TOS = p.tosOrTClass } if e.ops.GetReceiveTTL() { cm.HasTTL = true cm.TTL = p.ttlOrHopLimit } if e.ops.GetReceivePacketInfo() { cm.HasIPPacketInfo = true cm.PacketInfo = p.packetInfo } case header.IPv6ProtocolNumber: if e.ops.GetReceiveTClass() { cm.HasTClass = true // Although TClass is an 8-bit value it's read in the CMsg as a uint32. cm.TClass = uint32(p.tosOrTClass) } if e.ops.GetReceiveHopLimit() { cm.HasHopLimit = true cm.HopLimit = p.ttlOrHopLimit } if e.ops.GetIPv6ReceivePacketInfo() { cm.HasIPv6PacketInfo = true cm.IPv6PacketInfo = tcpip.IPv6PacketInfo{ NIC: p.packetInfo.NIC, Addr: p.packetInfo.DestinationAddr, } } default: panic(fmt.Sprintf("unrecognized network protocol = %d", p.netProto)) } if e.ops.GetReceiveOriginalDstAddress() { cm.HasOriginalDstAddress = true cm.OriginalDstAddress = p.destinationAddress } // Read Result res := tcpip.ReadResult{ Total: p.pkt.Data().Size(), ControlMessages: cm, } if opts.NeedRemoteAddr { res.RemoteAddr = p.senderAddress } n, err := p.pkt.Data().ReadTo(dst, opts.Peek) if n == 0 && err != nil { return res, &tcpip.ErrBadBuffer{} } res.Count = n return res, nil } // prepareForWriteInner prepares the endpoint for sending data. In particular, // it binds it if it's still in the initial state. To do so, it must first // reacquire the mutex in exclusive mode. // // Returns true for retry if preparation should be retried. // +checklocksread:e.mu func (e *endpoint) prepareForWriteInner(to *tcpip.FullAddress) (retry bool, err tcpip.Error) { switch e.net.State() { case transport.DatagramEndpointStateInitial: case transport.DatagramEndpointStateConnected: return false, nil case transport.DatagramEndpointStateBound: if to == nil { return false, &tcpip.ErrDestinationRequired{} } return false, nil default: return false, &tcpip.ErrInvalidEndpointState{} } e.mu.RUnlock() e.mu.Lock() defer e.mu.DowngradeLock() // The state changed when we released the shared locked and re-acquired // it in exclusive mode. Try again. if e.net.State() != transport.DatagramEndpointStateInitial { return true, nil } // The state is still 'initial', so try to bind the endpoint. 
if err := e.bindLocked(tcpip.FullAddress{}); err != nil { return false, err } return true, nil } var _ tcpip.EndpointWithPreflight = (*endpoint)(nil) // Validates the passed WriteOptions and prepares the endpoint for writes // using those options. If the endpoint is unbound and the `To` address // is specified, binds the endpoint to that address. func (e *endpoint) Preflight(opts tcpip.WriteOptions) tcpip.Error { var r bytes.Reader udpInfo, err := e.prepareForWrite(&r, opts) if err == nil { udpInfo.ctx.Release() } return err } // Write writes data to the endpoint's peer. This method does not block // if the data cannot be written. func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { n, err := e.write(p, opts) switch err.(type) { case nil: e.stats.PacketsSent.Increment() case *tcpip.ErrMessageTooLong, *tcpip.ErrInvalidOptionValue: e.stats.WriteErrors.InvalidArgs.Increment() case *tcpip.ErrClosedForSend: e.stats.WriteErrors.WriteClosed.Increment() case *tcpip.ErrInvalidEndpointState: e.stats.WriteErrors.InvalidEndpointState.Increment() case *tcpip.ErrHostUnreachable, *tcpip.ErrBroadcastDisabled, *tcpip.ErrNetworkUnreachable: // Errors indicating any problem with IP routing of the packet. e.stats.SendErrors.NoRoute.Increment() default: // For all other errors when writing to the network layer. e.stats.SendErrors.SendToNetworkFailed.Increment() } return n, err } func (e *endpoint) prepareForWrite(p tcpip.Payloader, opts tcpip.WriteOptions) (udpPacketInfo, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() // Prepare for write. for { retry, err := e.prepareForWriteInner(opts.To) if err != nil { return udpPacketInfo{}, err } if !retry { break } } dst, connected := e.net.GetRemoteAddress() dst.Port = e.remotePort if opts.To != nil { if opts.To.Port == 0 { // Port 0 is an invalid port to send to. return udpPacketInfo{}, &tcpip.ErrInvalidEndpointState{} } dst = *opts.To } else if !connected { return udpPacketInfo{}, &tcpip.ErrDestinationRequired{} } ctx, err := e.net.AcquireContextForWrite(opts) if err != nil { return udpPacketInfo{}, err } if p.Len() > header.UDPMaximumPacketSize { // Native linux behaviour differs for IPv4 and IPv6 packets; IPv4 packet // errors aren't report to the error queue at all. if ctx.PacketInfo().NetProto == header.IPv6ProtocolNumber { so := e.SocketOptions() if so.GetIPv6RecvError() { so.QueueLocalErr( &tcpip.ErrMessageTooLong{}, e.net.NetProto(), uint32(p.Len()), dst, nil, ) } } ctx.Release() return udpPacketInfo{}, &tcpip.ErrMessageTooLong{} } return udpPacketInfo{ ctx: ctx, localPort: e.localPort, remotePort: dst.Port, }, nil } func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { // Do not hold lock when sending as loopback is synchronous and if the UDP // datagram ends up generating an ICMP response then it can result in a // deadlock where the ICMP response handling ends up acquiring this endpoint's // mutex using e.mu.RLock() in endpoint.HandleControlPacket which can cause a // deadlock if another caller is trying to acquire e.mu in exclusive mode w/ // e.mu.Lock(). Since e.mu.Lock() prevents any new read locks to ensure the // lock can be eventually acquired. // // See: https://golang.org/pkg/sync/#RWMutex for details on why recursive read // locking is prohibited. 
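// As a rough caller-side sketch (illustrative; ep stands for a tcpip.Endpoint
// backed by this implementation, and payload, dstAddr and dstPort are assumed
// to exist in the caller), a datagram is sent by wrapping the payload in a
// tcpip.Payloader such as bytes.Reader:
//
//	var r bytes.Reader
//	r.Reset(payload)
//	n, err := ep.Write(&r, tcpip.WriteOptions{
//		To: &tcpip.FullAddress{Addr: dstAddr, Port: dstPort},
//	})
//	// err may be *tcpip.ErrMessageTooLong when the payload exceeds
//	// header.UDPMaximumPacketSize, or *tcpip.ErrWouldBlock when no packet
//	// buffer could be allocated for the payload.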
if err := e.LastError(); err != nil { return 0, err } udpInfo, err := e.prepareForWrite(p, opts) if err != nil { return 0, err } defer udpInfo.ctx.Release() dataSz := p.Len() pktInfo := udpInfo.ctx.PacketInfo() pkt := udpInfo.ctx.TryNewPacketBufferFromPayloader(header.UDPMinimumSize+int(pktInfo.MaxHeaderLength), p) if pkt == nil { return 0, &tcpip.ErrWouldBlock{} } defer pkt.DecRef() // Initialize the UDP header. udp := header.UDP(pkt.TransportHeader().Push(header.UDPMinimumSize)) pkt.TransportProtocolNumber = ProtocolNumber length := uint16(pkt.Size()) udp.Encode(&header.UDPFields{ SrcPort: udpInfo.localPort, DstPort: udpInfo.remotePort, Length: length, }) // Set the checksum field unless TX checksum offload is enabled. // On IPv4, UDP checksum is optional, and a zero value indicates the // transmitter skipped the checksum generation (RFC768). // On IPv6, UDP checksum is not optional (RFC2460 Section 8.1). if pktInfo.RequiresTXTransportChecksum && (!e.ops.GetNoChecksum() || pktInfo.NetProto == header.IPv6ProtocolNumber) { xsum := udp.CalculateChecksum(checksum.Combine( header.PseudoHeaderChecksum(ProtocolNumber, pktInfo.LocalAddress, pktInfo.RemoteAddress, length), pkt.Data().Checksum(), )) // As per RFC 768 page 2, // // Checksum is the 16-bit one's complement of the one's complement sum of // a pseudo header of information from the IP header, the UDP header, and // the data, padded with zero octets at the end (if necessary) to make a // multiple of two octets. // // The pseudo header conceptually prefixed to the UDP header contains the // source address, the destination address, the protocol, and the UDP // length. This information gives protection against misrouted datagrams. // This checksum procedure is the same as is used in TCP. // // If the computed checksum is zero, it is transmitted as all ones (the // equivalent in one's complement arithmetic). An all zero transmitted // checksum value means that the transmitter generated no checksum (for // debugging or for higher level protocols that don't care). // // To avoid the zero value, we only calculate the one's complement of the // one's complement sum if the sum is not all ones. if xsum != math.MaxUint16 { xsum = ^xsum } udp.SetChecksum(xsum) } if err := udpInfo.ctx.WritePacket(pkt, false /* headerIncluded */); err != nil { e.stack.Stats().UDP.PacketSendErrors.Increment() return 0, err } // Track count of packets sent. e.stack.Stats().UDP.PacketsSent.Increment() return int64(dataSz), nil } // OnReuseAddressSet implements tcpip.SocketOptionsHandler. func (e *endpoint) OnReuseAddressSet(v bool) { e.mu.Lock() e.portFlags.MostRecent = v e.mu.Unlock() } // OnReusePortSet implements tcpip.SocketOptionsHandler. func (e *endpoint) OnReusePortSet(v bool) { e.mu.Lock() e.portFlags.LoadBalanced = v e.mu.Unlock() } // SetSockOptInt implements tcpip.Endpoint. func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { return e.net.SetSockOptInt(opt, v) } var _ tcpip.SocketOptionsHandler = (*endpoint)(nil) // HasNIC implements tcpip.SocketOptionsHandler. func (e *endpoint) HasNIC(id int32) bool { return e.stack.HasNIC(tcpip.NICID(id)) } // SetSockOpt implements tcpip.Endpoint. func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { return e.net.SetSockOpt(opt) } // GetSockOptInt implements tcpip.Endpoint. 
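//
// For example, a caller might query the size of the datagram at the front of
// the receive queue (illustrative sketch; "ep" is a hypothetical caller-side
// endpoint):
//
//	if n, err := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption); err == nil {
//		// n is the number of payload bytes in the next pending datagram.
//	}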
func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: v := 0 e.rcvMu.Lock() if !e.rcvList.Empty() { p := e.rcvList.Front() v = p.pkt.Data().Size() } e.rcvMu.Unlock() return v, nil default: return e.net.GetSockOptInt(opt) } } // GetSockOpt implements tcpip.Endpoint. func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { return e.net.GetSockOpt(opt) } // udpPacketInfo holds information needed to send a UDP packet. type udpPacketInfo struct { ctx network.WriteContext localPort uint16 remotePort uint16 } // Disconnect implements tcpip.Endpoint. func (e *endpoint) Disconnect() tcpip.Error { e.mu.Lock() defer e.mu.Unlock() if e.net.State() != transport.DatagramEndpointStateConnected { return nil } var ( id stack.TransportEndpointID btd tcpip.NICID ) // We change this value below and we need the old value to unregister // the endpoint. boundPortFlags := e.boundPortFlags // Exclude ephemerally bound endpoints. info := e.net.Info() info.ID.LocalPort = e.localPort info.ID.RemotePort = e.remotePort if e.net.WasBound() { var err tcpip.Error id = stack.TransportEndpointID{ LocalPort: info.ID.LocalPort, LocalAddress: info.ID.LocalAddress, } id, btd, err = e.registerWithStack(e.effectiveNetProtos, id) if err != nil { return err } boundPortFlags = e.boundPortFlags } else { if info.ID.LocalPort != 0 { // Release the ephemeral port. portRes := ports.Reservation{ Networks: e.effectiveNetProtos, Transport: ProtocolNumber, Addr: info.ID.LocalAddress, Port: info.ID.LocalPort, Flags: boundPortFlags, BindToDevice: e.boundBindToDevice, Dest: tcpip.FullAddress{}, } e.stack.ReleasePort(portRes) e.boundPortFlags = ports.Flags{} } } e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, info.ID, e, boundPortFlags, e.boundBindToDevice) e.boundBindToDevice = btd e.localPort = id.LocalPort e.remotePort = id.RemotePort e.net.Disconnect() return nil } // Connect connects the endpoint to its peer. Specifying a NIC is optional. func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() err := e.net.ConnectAndThen(addr, func(netProto tcpip.NetworkProtocolNumber, previousID, nextID stack.TransportEndpointID) tcpip.Error { nextID.LocalPort = e.localPort nextID.RemotePort = addr.Port // Even if we're connected, this endpoint can still be used to send // packets on a different network protocol, so we register both even if // v6only is set to false and this is an ipv6 endpoint. netProtos := []tcpip.NetworkProtocolNumber{netProto} if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() && e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber) { netProtos = []tcpip.NetworkProtocolNumber{ header.IPv4ProtocolNumber, header.IPv6ProtocolNumber, } } oldPortFlags := e.boundPortFlags // Remove the old registration. if e.localPort != 0 { previousID.LocalPort = e.localPort previousID.RemotePort = e.remotePort e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, previousID, e, oldPortFlags, e.boundBindToDevice) } nextID, btd, err := e.registerWithStack(netProtos, nextID) if err != nil { return err } e.localPort = nextID.LocalPort e.remotePort = nextID.RemotePort e.boundBindToDevice = btd e.effectiveNetProtos = netProtos return nil }) if err != nil { return err } e.rcvMu.Lock() e.rcvReady = true e.rcvMu.Unlock() return nil } // ConnectEndpoint is not supported. 
func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error { return &tcpip.ErrInvalidEndpointState{} } // Shutdown closes the read and/or write end of the endpoint connection // to its peer. func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() switch state := e.net.State(); state { case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateClosed: return &tcpip.ErrNotConnected{} case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: default: panic(fmt.Sprintf("unhandled state = %s", state)) } if flags&tcpip.ShutdownWrite != 0 { if err := e.net.Shutdown(); err != nil { return err } } if flags&tcpip.ShutdownRead != 0 { e.readShutdown = true e.rcvMu.Lock() wasClosed := e.rcvClosed e.rcvClosed = true e.rcvMu.Unlock() if !wasClosed { e.waiterQueue.Notify(waiter.ReadableEvents) } } if e.net.State() == transport.DatagramEndpointStateBound { return &tcpip.ErrNotConnected{} } return nil } // Listen is not supported by UDP, it just fails. func (*endpoint) Listen(int) tcpip.Error { return &tcpip.ErrNotSupported{} } // Accept is not supported by UDP, it just fails. func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { return nil, nil, &tcpip.ErrNotSupported{} } func (e *endpoint) registerWithStack(netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, tcpip.Error) { bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) if e.localPort == 0 { portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, Addr: id.LocalAddress, Port: id.LocalPort, Flags: e.portFlags, BindToDevice: bindToDevice, Dest: tcpip.FullAddress{}, } port, err := e.stack.ReservePort(e.stack.SecureRNG(), portRes, nil /* testPort */) if err != nil { return id, bindToDevice, err } id.LocalPort = port } e.boundPortFlags = e.portFlags err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.boundPortFlags, bindToDevice) if err != nil { portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, Addr: id.LocalAddress, Port: id.LocalPort, Flags: e.boundPortFlags, BindToDevice: bindToDevice, Dest: tcpip.FullAddress{}, } e.stack.ReleasePort(portRes) e.boundPortFlags = ports.Flags{} } return id, bindToDevice, err } func (e *endpoint) bindLocked(addr tcpip.FullAddress) tcpip.Error { // Don't allow binding once endpoint is not in the initial state // anymore. if e.net.State() != transport.DatagramEndpointStateInitial { return &tcpip.ErrInvalidEndpointState{} } err := e.net.BindAndThen(addr, func(boundNetProto tcpip.NetworkProtocolNumber, boundAddr tcpip.Address) tcpip.Error { // Expand netProtos to include v4 and v6 if the caller is binding to a // wildcard (empty) address, and this is an IPv6 endpoint with v6only // set to false. 
netProtos := []tcpip.NetworkProtocolNumber{boundNetProto} if boundNetProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() && boundAddr == (tcpip.Address{}) && e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber) { netProtos = []tcpip.NetworkProtocolNumber{ header.IPv6ProtocolNumber, header.IPv4ProtocolNumber, } } id := stack.TransportEndpointID{ LocalPort: addr.Port, LocalAddress: boundAddr, } id, btd, err := e.registerWithStack(netProtos, id) if err != nil { return err } e.localPort = id.LocalPort e.boundBindToDevice = btd e.effectiveNetProtos = netProtos return nil }) if err != nil { return err } e.rcvMu.Lock() e.rcvReady = true e.rcvMu.Unlock() return nil } // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. func (e *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() err := e.bindLocked(addr) if err != nil { return err } return nil } // GetLocalAddress returns the address to which the endpoint is bound. func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() addr := e.net.GetLocalAddress() addr.Port = e.localPort return addr, nil } // GetRemoteAddress returns the address to which the endpoint is connected. func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() addr, connected := e.net.GetRemoteAddress() if !connected || e.remotePort == 0 { return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } addr.Port = e.remotePort return addr, nil } // Readiness returns the current readiness of the endpoint. For example, if // waiter.EventIn is set, the endpoint is immediately readable. func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { var result waiter.EventMask if e.net.HasSendSpace() { result |= waiter.WritableEvents & mask } // Determine if the endpoint is readable if requested. if mask&waiter.ReadableEvents != 0 { e.rcvMu.Lock() if !e.rcvList.Empty() || e.rcvClosed { result |= waiter.ReadableEvents } e.rcvMu.Unlock() } e.lastErrorMu.Lock() hasError := e.lastError != nil e.lastErrorMu.Unlock() if hasError { result |= waiter.EventErr } return result } // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) { // Get the header then trim it from the view. hdr := header.UDP(pkt.TransportHeader().Slice()) netHdr := pkt.Network() lengthValid, csumValid := header.UDPValid( hdr, func() uint16 { return pkt.Data().Checksum() }, uint16(pkt.Data().Size()), pkt.NetworkProtocolNumber, netHdr.SourceAddress(), netHdr.DestinationAddress(), pkt.RXChecksumValidated) if !lengthValid { // Malformed packet. e.stack.Stats().UDP.MalformedPacketsReceived.Increment() e.stats.ReceiveErrors.MalformedPacketsReceived.Increment() return } if !csumValid { e.stack.Stats().UDP.ChecksumErrors.Increment() e.stats.ReceiveErrors.ChecksumErrors.Increment() return } e.stack.Stats().UDP.PacketsReceived.Increment() e.stats.PacketsReceived.Increment() e.rcvMu.Lock() // Drop the packet if our buffer is not ready to receive packets. if !e.rcvReady || e.rcvClosed { e.rcvMu.Unlock() e.stack.Stats().UDP.ReceiveBufferErrors.Increment() e.stats.ReceiveErrors.ClosedReceiver.Increment() return } rcvBufSize := e.ops.GetReceiveBufferSize() // Drop the packet if our buffer is currently full. 
if e.frozen || e.rcvBufSize >= int(rcvBufSize) { e.rcvMu.Unlock() e.stack.Stats().UDP.ReceiveBufferErrors.Increment() e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() return } wasEmpty := e.rcvBufSize == 0 // Push new packet into receive list and increment the buffer size. packet := &udpPacket{ netProto: pkt.NetworkProtocolNumber, senderAddress: tcpip.FullAddress{ NIC: pkt.NICID, Addr: id.RemoteAddress, Port: hdr.SourcePort(), }, destinationAddress: tcpip.FullAddress{ NIC: pkt.NICID, Addr: id.LocalAddress, Port: hdr.DestinationPort(), }, pkt: pkt.IncRef(), } e.rcvList.PushBack(packet) e.rcvBufSize += pkt.Data().Size() // Save any useful information from the network header to the packet. packet.tosOrTClass, _ = pkt.Network().TOS() switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: packet.ttlOrHopLimit = header.IPv4(pkt.NetworkHeader().Slice()).TTL() case header.IPv6ProtocolNumber: packet.ttlOrHopLimit = header.IPv6(pkt.NetworkHeader().Slice()).HopLimit() } // TODO(gvisor.dev/issue/3556): r.LocalAddress may be a multicast or broadcast // address. packetInfo.LocalAddr should hold a unicast address that can be // used to respond to the incoming packet. localAddr := pkt.Network().DestinationAddress() packet.packetInfo.LocalAddr = localAddr packet.packetInfo.DestinationAddr = localAddr packet.packetInfo.NIC = pkt.NICID packet.receivedAt = e.stack.Clock().Now() e.rcvMu.Unlock() // Notify any waiters that there's data to be read now. if wasEmpty { e.waiterQueue.Notify(waiter.ReadableEvents) } } func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) { // Update last error first. e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() var recvErr bool switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: recvErr = e.SocketOptions().GetIPv4RecvError() case header.IPv6ProtocolNumber: recvErr = e.SocketOptions().GetIPv6RecvError() default: panic(fmt.Sprintf("unhandled network protocol number = %d", pkt.NetworkProtocolNumber)) } if recvErr { // Linux passes the payload without the UDP header. payload := pkt.Data().AsRange().ToView() udp := header.UDP(payload.AsSlice()) if len(udp) >= header.UDPMinimumSize { payload.TrimFront(header.UDPMinimumSize) } id := e.net.Info().ID e.mu.RLock() e.SocketOptions().QueueErr(&tcpip.SockError{ Err: err, Cause: transErr, Payload: payload, Dst: tcpip.FullAddress{ NIC: pkt.NICID, Addr: id.RemoteAddress, Port: e.remotePort, }, Offender: tcpip.FullAddress{ NIC: pkt.NICID, Addr: id.LocalAddress, Port: e.localPort, }, NetProto: pkt.NetworkProtocolNumber, }) e.mu.RUnlock() } // Notify of the error. e.waiterQueue.Notify(waiter.EventErr) } // HandleError implements stack.TransportEndpoint. func (e *endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) { // TODO(gvisor.dev/issues/5270): Handle all transport errors. switch transErr.Kind() { case stack.DestinationPortUnreachableTransportError: if e.net.State() == transport.DatagramEndpointStateConnected { e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt) } } } // State implements tcpip.Endpoint. func (e *endpoint) State() uint32 { return uint32(e.net.State()) } // Info returns a copy of the endpoint info. func (e *endpoint) Info() tcpip.EndpointInfo { e.mu.RLock() defer e.mu.RUnlock() info := e.net.Info() info.ID.LocalPort = e.localPort info.ID.RemotePort = e.remotePort return &info } // Stats returns a pointer to the endpoint stats. 
func (e *endpoint) Stats() tcpip.EndpointStats { return &e.stats } // Wait implements tcpip.Endpoint. func (*endpoint) Wait() {} // SetOwner implements tcpip.Endpoint. func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.net.SetOwner(owner) } // SocketOptions implements tcpip.Endpoint. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } // freeze prevents any more packets from being delivered to the endpoint. func (e *endpoint) freeze() { e.mu.Lock() e.frozen = true e.mu.Unlock() } // thaw unfreezes a previously frozen endpoint using endpoint.freeze() allows // new packets to be delivered again. func (e *endpoint) thaw() { e.mu.Lock() e.frozen = false e.mu.Unlock() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/udp/endpoint_state.go000066400000000000000000000045571465435605700267510ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package udp import ( "context" "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport" ) // saveReceivedAt is invoked by stateify. func (p *udpPacket) saveReceivedAt() int64 { return p.receivedAt.UnixNano() } // loadReceivedAt is invoked by stateify. func (p *udpPacket) loadReceivedAt(_ context.Context, nsec int64) { p.receivedAt = time.Unix(0, nsec) } // afterLoad is invoked by stateify. func (e *endpoint) afterLoad(ctx context.Context) { stack.RestoreStackFromContext(ctx).RegisterRestoredEndpoint(e) } // beforeSave is invoked by stateify. func (e *endpoint) beforeSave() { e.freeze() e.stack.RegisterResumableEndpoint(e) } // Restore implements tcpip.RestoredEndpoint.Restore. func (e *endpoint) Restore(s *stack.Stack) { e.thaw() e.mu.Lock() defer e.mu.Unlock() e.net.Resume(s) e.stack = s e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) switch state := e.net.State(); state { case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateClosed: case transport.DatagramEndpointStateBound, transport.DatagramEndpointStateConnected: // Our saved state had a port, but we don't actually have a // reservation. We need to remove the port from our state, but still // pass it to the reservation machinery. var err tcpip.Error id := e.net.Info().ID id.LocalPort = e.localPort id.RemotePort = e.remotePort id, e.boundBindToDevice, err = e.registerWithStack(e.effectiveNetProtos, id) if err != nil { panic(err) } e.localPort = id.LocalPort e.remotePort = id.RemotePort default: panic(fmt.Sprintf("unhandled state = %s", state)) } } // Resume implements tcpip.ResumableEndpoint.Resume. func (e *endpoint) Resume() { e.thaw() } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/udp/forwarder.go000066400000000000000000000062541465435605700257200ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package udp import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) // Forwarder is a session request forwarder, which allows clients to decide // what to do with a session request, for example: ignore it, or process it. // // The canonical way of using it is to pass the Forwarder.HandlePacket function // to stack.SetTransportProtocolHandler. type Forwarder struct { handler func(*ForwarderRequest) stack *stack.Stack } // NewForwarder allocates and initializes a new forwarder. func NewForwarder(s *stack.Stack, handler func(*ForwarderRequest)) *Forwarder { return &Forwarder{ stack: s, handler: handler, } } // HandlePacket handles all packets. // // This function is expected to be passed as an argument to the // stack.SetTransportProtocolHandler function. func (f *Forwarder) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool { f.handler(&ForwarderRequest{ stack: f.stack, id: id, pkt: pkt.IncRef(), }) return true } // ForwarderRequest represents a session request received by the forwarder and // passed to the client. Clients may optionally create an endpoint to represent // it via CreateEndpoint. type ForwarderRequest struct { stack *stack.Stack id stack.TransportEndpointID pkt *stack.PacketBuffer } // ID returns the 4-tuple (src address, src port, dst address, dst port) that // represents the session request. func (r *ForwarderRequest) ID() stack.TransportEndpointID { return r.id } // CreateEndpoint creates a connected UDP endpoint for the session request. func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { ep := newEndpoint(r.stack, r.pkt.NetworkProtocolNumber, queue) ep.mu.Lock() defer ep.mu.Unlock() netHdr := r.pkt.Network() if err := ep.net.Bind(tcpip.FullAddress{NIC: r.pkt.NICID, Addr: netHdr.DestinationAddress(), Port: r.id.LocalPort}); err != nil { return nil, err } if err := ep.net.Connect(tcpip.FullAddress{NIC: r.pkt.NICID, Addr: netHdr.SourceAddress(), Port: r.id.RemotePort}); err != nil { return nil, err } if err := r.stack.RegisterTransportEndpoint([]tcpip.NetworkProtocolNumber{r.pkt.NetworkProtocolNumber}, ProtocolNumber, r.id, ep, ep.portFlags, tcpip.NICID(ep.ops.GetBindToDevice())); err != nil { ep.Close() return nil, err } ep.localPort = r.id.LocalPort ep.remotePort = r.id.RemotePort ep.effectiveNetProtos = []tcpip.NetworkProtocolNumber{r.pkt.NetworkProtocolNumber} ep.boundPortFlags = ep.portFlags ep.rcvMu.Lock() ep.rcvReady = true ep.rcvMu.Unlock() ep.HandlePacket(r.id, r.pkt) return ep, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/udp/protocol.go000066400000000000000000000105761465435605700255700ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package udp contains the implementation of the UDP transport protocol. package udp import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/waiter" ) const ( // ProtocolNumber is the udp protocol number. ProtocolNumber = header.UDPProtocolNumber // MinBufferSize is the smallest size of a receive or send buffer. MinBufferSize = 4 << 10 // 4KiB bytes. // DefaultSendBufferSize is the default size of the send buffer for // an endpoint. DefaultSendBufferSize = 32 << 10 // 32KiB // DefaultReceiveBufferSize is the default size of the receive buffer // for an endpoint. DefaultReceiveBufferSize = 32 << 10 // 32KiB // MaxBufferSize is the largest size a receive/send buffer can grow to. MaxBufferSize = 4 << 20 // 4MiB ) // +stateify savable type protocol struct { stack *stack.Stack } // Number returns the udp protocol number. func (*protocol) Number() tcpip.TransportProtocolNumber { return ProtocolNumber } // NewEndpoint creates a new udp endpoint. func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return newEndpoint(p.stack, netProto, waiterQueue), nil } // NewRawEndpoint creates a new raw UDP endpoint. It implements // stack.TransportProtocol.NewRawEndpoint. func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return raw.NewEndpoint(p.stack, netProto, header.UDPProtocolNumber, waiterQueue) } // MinimumPacketSize returns the minimum valid udp packet size. func (*protocol) MinimumPacketSize() int { return header.UDPMinimumSize } // ParsePorts returns the source and destination ports stored in the given udp // packet. func (*protocol) ParsePorts(v []byte) (src, dst uint16, err tcpip.Error) { h := header.UDP(v) return h.SourcePort(), h.DestinationPort(), nil } // HandleUnknownDestinationPacket handles packets that are targeted at this // protocol but don't match any existing endpoint. func (p *protocol) HandleUnknownDestinationPacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { hdr := header.UDP(pkt.TransportHeader().Slice()) netHdr := pkt.Network() lengthValid, csumValid := header.UDPValid( hdr, func() uint16 { return pkt.Data().Checksum() }, uint16(pkt.Data().Size()), pkt.NetworkProtocolNumber, netHdr.SourceAddress(), netHdr.DestinationAddress(), pkt.RXChecksumValidated) if !lengthValid { p.stack.Stats().UDP.MalformedPacketsReceived.Increment() return stack.UnknownDestinationPacketMalformed } if !csumValid { p.stack.Stats().UDP.ChecksumErrors.Increment() return stack.UnknownDestinationPacketMalformed } return stack.UnknownDestinationPacketUnhandled } // SetOption implements stack.TransportProtocol.SetOption. func (*protocol) SetOption(tcpip.SettableTransportProtocolOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // Option implements stack.TransportProtocol.Option. 
func (*protocol) Option(tcpip.GettableTransportProtocolOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // Close implements stack.TransportProtocol.Close. func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} // Pause implements stack.TransportProtocol.Pause. func (*protocol) Pause() {} // Resume implements stack.TransportProtocol.Resume. func (*protocol) Resume() {} // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { return parse.UDP(pkt) } // NewProtocol returns a UDP transport protocol. func NewProtocol(s *stack.Stack) stack.TransportProtocol { return &protocol{stack: s} } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/udp/udp_packet_list.go000066400000000000000000000122541465435605700270740ustar00rootroot00000000000000package udp // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type udpPacketElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (udpPacketElementMapper) linkerFor(elem *udpPacket) *udpPacket { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type udpPacketList struct { head *udpPacket tail *udpPacket } // Reset resets list l to the empty state. func (l *udpPacketList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *udpPacketList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *udpPacketList) Front() *udpPacket { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *udpPacketList) Back() *udpPacket { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *udpPacketList) Len() (count int) { for e := l.Front(); e != nil; e = (udpPacketElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *udpPacketList) PushFront(e *udpPacket) { linker := udpPacketElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { udpPacketElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *udpPacketList) PushFrontList(m *udpPacketList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { udpPacketElementMapper{}.linkerFor(l.head).SetPrev(m.tail) udpPacketElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. 
// //go:nosplit func (l *udpPacketList) PushBack(e *udpPacket) { linker := udpPacketElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { udpPacketElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. // //go:nosplit func (l *udpPacketList) PushBackList(m *udpPacketList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { udpPacketElementMapper{}.linkerFor(l.tail).SetNext(m.head) udpPacketElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *udpPacketList) InsertAfter(b, e *udpPacket) { bLinker := udpPacketElementMapper{}.linkerFor(b) eLinker := udpPacketElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { udpPacketElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *udpPacketList) InsertBefore(a, e *udpPacket) { aLinker := udpPacketElementMapper{}.linkerFor(a) eLinker := udpPacketElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { udpPacketElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *udpPacketList) Remove(e *udpPacket) { linker := udpPacketElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { udpPacketElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { udpPacketElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type udpPacketEntry struct { next *udpPacket prev *udpPacket } // Next returns the entry that follows e in the list. // //go:nosplit func (e *udpPacketEntry) Next() *udpPacket { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *udpPacketEntry) Prev() *udpPacket { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *udpPacketEntry) SetNext(elem *udpPacket) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *udpPacketEntry) SetPrev(elem *udpPacket) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/tcpip/transport/udp/udp_state_autogen.go000066400000000000000000000133421465435605700274330ustar00rootroot00000000000000// automatically generated by stateify. 
package udp import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (p *udpPacket) StateTypeName() string { return "pkg/tcpip/transport/udp.udpPacket" } func (p *udpPacket) StateFields() []string { return []string{ "udpPacketEntry", "netProto", "senderAddress", "destinationAddress", "packetInfo", "pkt", "receivedAt", "tosOrTClass", "ttlOrHopLimit", } } func (p *udpPacket) beforeSave() {} // +checklocksignore func (p *udpPacket) StateSave(stateSinkObject state.Sink) { p.beforeSave() var receivedAtValue int64 receivedAtValue = p.saveReceivedAt() stateSinkObject.SaveValue(6, receivedAtValue) stateSinkObject.Save(0, &p.udpPacketEntry) stateSinkObject.Save(1, &p.netProto) stateSinkObject.Save(2, &p.senderAddress) stateSinkObject.Save(3, &p.destinationAddress) stateSinkObject.Save(4, &p.packetInfo) stateSinkObject.Save(5, &p.pkt) stateSinkObject.Save(7, &p.tosOrTClass) stateSinkObject.Save(8, &p.ttlOrHopLimit) } func (p *udpPacket) afterLoad(context.Context) {} // +checklocksignore func (p *udpPacket) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.udpPacketEntry) stateSourceObject.Load(1, &p.netProto) stateSourceObject.Load(2, &p.senderAddress) stateSourceObject.Load(3, &p.destinationAddress) stateSourceObject.Load(4, &p.packetInfo) stateSourceObject.Load(5, &p.pkt) stateSourceObject.Load(7, &p.tosOrTClass) stateSourceObject.Load(8, &p.ttlOrHopLimit) stateSourceObject.LoadValue(6, new(int64), func(y any) { p.loadReceivedAt(ctx, y.(int64)) }) } func (e *endpoint) StateTypeName() string { return "pkg/tcpip/transport/udp.endpoint" } func (e *endpoint) StateFields() []string { return []string{ "DefaultSocketOptionsHandler", "waiterQueue", "net", "stats", "ops", "rcvReady", "rcvList", "rcvBufSize", "rcvClosed", "lastError", "portFlags", "boundBindToDevice", "boundPortFlags", "readShutdown", "effectiveNetProtos", "frozen", "localPort", "remotePort", } } // +checklocksignore func (e *endpoint) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.DefaultSocketOptionsHandler) stateSinkObject.Save(1, &e.waiterQueue) stateSinkObject.Save(2, &e.net) stateSinkObject.Save(3, &e.stats) stateSinkObject.Save(4, &e.ops) stateSinkObject.Save(5, &e.rcvReady) stateSinkObject.Save(6, &e.rcvList) stateSinkObject.Save(7, &e.rcvBufSize) stateSinkObject.Save(8, &e.rcvClosed) stateSinkObject.Save(9, &e.lastError) stateSinkObject.Save(10, &e.portFlags) stateSinkObject.Save(11, &e.boundBindToDevice) stateSinkObject.Save(12, &e.boundPortFlags) stateSinkObject.Save(13, &e.readShutdown) stateSinkObject.Save(14, &e.effectiveNetProtos) stateSinkObject.Save(15, &e.frozen) stateSinkObject.Save(16, &e.localPort) stateSinkObject.Save(17, &e.remotePort) } // +checklocksignore func (e *endpoint) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.DefaultSocketOptionsHandler) stateSourceObject.Load(1, &e.waiterQueue) stateSourceObject.Load(2, &e.net) stateSourceObject.Load(3, &e.stats) stateSourceObject.Load(4, &e.ops) stateSourceObject.Load(5, &e.rcvReady) stateSourceObject.Load(6, &e.rcvList) stateSourceObject.Load(7, &e.rcvBufSize) stateSourceObject.Load(8, &e.rcvClosed) stateSourceObject.Load(9, &e.lastError) stateSourceObject.Load(10, &e.portFlags) stateSourceObject.Load(11, &e.boundBindToDevice) stateSourceObject.Load(12, &e.boundPortFlags) stateSourceObject.Load(13, &e.readShutdown) stateSourceObject.Load(14, &e.effectiveNetProtos) stateSourceObject.Load(15, &e.frozen) stateSourceObject.Load(16, &e.localPort) 
stateSourceObject.Load(17, &e.remotePort) stateSourceObject.AfterLoad(func() { e.afterLoad(ctx) }) } func (p *protocol) StateTypeName() string { return "pkg/tcpip/transport/udp.protocol" } func (p *protocol) StateFields() []string { return []string{ "stack", } } func (p *protocol) beforeSave() {} // +checklocksignore func (p *protocol) StateSave(stateSinkObject state.Sink) { p.beforeSave() stateSinkObject.Save(0, &p.stack) } func (p *protocol) afterLoad(context.Context) {} // +checklocksignore func (p *protocol) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &p.stack) } func (l *udpPacketList) StateTypeName() string { return "pkg/tcpip/transport/udp.udpPacketList" } func (l *udpPacketList) StateFields() []string { return []string{ "head", "tail", } } func (l *udpPacketList) beforeSave() {} // +checklocksignore func (l *udpPacketList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *udpPacketList) afterLoad(context.Context) {} // +checklocksignore func (l *udpPacketList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *udpPacketEntry) StateTypeName() string { return "pkg/tcpip/transport/udp.udpPacketEntry" } func (e *udpPacketEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *udpPacketEntry) beforeSave() {} // +checklocksignore func (e *udpPacketEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *udpPacketEntry) afterLoad(context.Context) {} // +checklocksignore func (e *udpPacketEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func init() { state.Register((*udpPacket)(nil)) state.Register((*endpoint)(nil)) state.Register((*protocol)(nil)) state.Register((*udpPacketList)(nil)) state.Register((*udpPacketEntry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/unet/000077500000000000000000000000001465435605700203775ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/unet/unet.go000066400000000000000000000307151465435605700217070ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package unet provides a minimal net package based on Unix Domain Sockets. // // This does no pooling, and should only be used for a limited number of // connections in a Go process. Don't use this package for arbitrary servers. package unet import ( "errors" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/eventfd" "gvisor.dev/gvisor/pkg/sync" ) // backlog is used for the listen request. const backlog = 16 // errClosing is returned by wait if the Socket is in the process of closing. 
var errClosing = errors.New("Socket is closing") // errMessageTruncated indicates that data was lost because the provided buffer // was too small. var errMessageTruncated = errors.New("message truncated") // socketType returns the appropriate type. func socketType(packet bool) int { if packet { return unix.SOCK_SEQPACKET } return unix.SOCK_STREAM } // socket creates a new host socket. func socket(packet bool) (int, error) { // Make a new socket. fd, err := unix.Socket(unix.AF_UNIX, socketType(packet), 0) if err != nil { return 0, err } return fd, nil } // Socket is a connected unix domain socket. type Socket struct { // gate protects use of fd. gate sync.Gate // fd is the bound socket. // // fd only remains valid if read while within gate. fd atomicbitops.Int32 // efd is an event FD that is signaled when the socket is closing. // // efd is immutable and remains valid until Close/Release. efd eventfd.Eventfd // race is an atomic variable used to avoid triggering the race // detector. See comment in SocketPair below. race *atomicbitops.Int32 } // NewSocket returns a socket from an existing FD. // // NewSocket takes ownership of fd. func NewSocket(fd int) (*Socket, error) { // fd must be non-blocking for non-blocking unix.Accept in // ServerSocket.Accept. if err := unix.SetNonblock(fd, true); err != nil { return nil, err } efd, err := eventfd.Create() if err != nil { return nil, err } return &Socket{ fd: atomicbitops.FromInt32(int32(fd)), efd: efd, }, nil } // finish completes use of s.fd by evicting any waiters, closing the gate, and // closing the event FD. func (s *Socket) finish() error { // Signal any blocked or future polls. if err := s.efd.Notify(); err != nil { return err } // Close the gate, blocking until all FD users leave. s.gate.Close() return s.efd.Close() } // Close closes the socket. func (s *Socket) Close() error { // Set the FD in the socket to -1, to ensure that all future calls to // FD/Release get nothing and Close calls return immediately. fd := int(s.fd.Swap(-1)) if fd < 0 { // Already closed or closing. return unix.EBADF } // Shutdown the socket to cancel any pending accepts. s.shutdown(fd) if err := s.finish(); err != nil { return err } return unix.Close(fd) } // Release releases ownership of the socket FD. // // The returned FD is non-blocking. // // Any concurrent or future callers of Socket methods will receive EBADF. func (s *Socket) Release() (int, error) { // Set the FD in the socket to -1, to ensure that all future calls to // FD/Release get nothing and Close calls return immediately. fd := int(s.fd.Swap(-1)) if fd < 0 { // Already closed or closing. return -1, unix.EBADF } if err := s.finish(); err != nil { return -1, err } return fd, nil } // FD returns the FD for this Socket. // // The FD is non-blocking and must not be made blocking. // // N.B. os.File.Fd makes the FD blocking. Use of Release instead of FD is // strongly preferred. // // The returned FD cannot be used safely if there may be concurrent callers to // Close or Release. // // Use Release to take ownership of the FD. func (s *Socket) FD() int { return int(s.fd.Load()) } // enterFD enters the FD gate and returns the FD value. // // If enterFD returns ok, s.gate.Leave must be called when done with the FD. // Callers may only block while within the gate using s.wait. // // The returned FD is guaranteed to remain valid until s.gate.Leave. 
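//
// The typical pattern, mirrored by the methods below, looks like this
// (illustrative sketch):
//
//	fd, ok := s.enterFD()
//	if !ok {
//		return unix.EBADF
//	}
//	defer s.gate.Leave()
//	// ... use fd; it remains valid until the gate is left ...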
func (s *Socket) enterFD() (int, bool) { if !s.gate.Enter() { return -1, false } fd := int(s.fd.Load()) if fd < 0 { s.gate.Leave() return -1, false } return fd, true } // SocketPair creates a pair of connected sockets. func SocketPair(packet bool) (*Socket, *Socket, error) { // Make a new pair. fds, err := unix.Socketpair(unix.AF_UNIX, socketType(packet)|unix.SOCK_CLOEXEC, 0) if err != nil { return nil, nil, err } // race is an atomic variable used to avoid triggering the race // detector. We have to fool TSAN into thinking there is a race // variable between our two sockets. We only use SocketPair in tests // anyway. // // NOTE(b/27107811): This is purely due to the fact that the raw // syscall does not serve as a boundary for the sanitizer. a, err := NewSocket(fds[0]) if err != nil { unix.Close(fds[0]) unix.Close(fds[1]) return nil, nil, err } var race atomicbitops.Int32 a.race = &race b, err := NewSocket(fds[1]) if err != nil { a.Close() unix.Close(fds[1]) return nil, nil, err } b.race = &race return a, b, nil } // Connect connects to a server. func Connect(addr string, packet bool) (*Socket, error) { fd, err := socket(packet) if err != nil { return nil, err } // Connect the socket. usa := &unix.SockaddrUnix{Name: addr} if err := unix.Connect(fd, usa); err != nil { unix.Close(fd) return nil, err } return NewSocket(fd) } // ControlMessage wraps around a byte array and provides functions for parsing // as a Unix Domain Socket control message. type ControlMessage []byte // EnableFDs enables receiving FDs via control message. // // This guarantees only a MINIMUM number of FDs received. You may receive MORE // than this due to the way FDs are packed. To be specific, the number of // receivable buffers will be rounded up to the nearest even number. // // This must be called prior to ReadVec if you want to receive FDs. func (c *ControlMessage) EnableFDs(count int) { *c = make([]byte, unix.CmsgSpace(count*4)) } // ExtractFDs returns the list of FDs in the control message. // // Either this or CloseFDs should be used after EnableFDs. func (c *ControlMessage) ExtractFDs() ([]int, error) { msgs, err := unix.ParseSocketControlMessage(*c) if err != nil { return nil, err } var fds []int for _, msg := range msgs { thisFds, err := unix.ParseUnixRights(&msg) if err != nil { // Different control message. return nil, err } for _, fd := range thisFds { if fd >= 0 { fds = append(fds, fd) } } } return fds, nil } // CloseFDs closes the list of FDs in the control message. // // Either this or ExtractFDs should be used after EnableFDs. func (c *ControlMessage) CloseFDs() { fds, _ := c.ExtractFDs() for _, fd := range fds { if fd >= 0 { unix.Close(fd) } } } // PackFDs packs the given list of FDs in the control message. // // This must be used prior to WriteVec. func (c *ControlMessage) PackFDs(fds ...int) { *c = ControlMessage(unix.UnixRights(fds...)) } // UnpackFDs clears the control message. func (c *ControlMessage) UnpackFDs() { *c = nil } // SocketWriter wraps an individual send operation. // // The normal entrypoint is WriteVec. type SocketWriter struct { socket *Socket to []byte blocking bool race *atomicbitops.Int32 ControlMessage } // Writer returns a writer for this socket. func (s *Socket) Writer(blocking bool) SocketWriter { return SocketWriter{socket: s, blocking: blocking, race: s.race} } // Write implements io.Writer.Write. func (s *Socket) Write(p []byte) (int, error) { r := s.Writer(true) return r.WriteVec([][]byte{p}) } // GetSockOpt gets the given socket option. 
func (s *Socket) GetSockOpt(level int, name int, b []byte) (uint32, error) { fd, ok := s.enterFD() if !ok { return 0, unix.EBADF } defer s.gate.Leave() return getsockopt(fd, level, name, b) } // SetSockOpt sets the given socket option. func (s *Socket) SetSockOpt(level, name int, b []byte) error { fd, ok := s.enterFD() if !ok { return unix.EBADF } defer s.gate.Leave() return setsockopt(fd, level, name, b) } // GetSockName returns the socket name. func (s *Socket) GetSockName() ([]byte, error) { fd, ok := s.enterFD() if !ok { return nil, unix.EBADF } defer s.gate.Leave() var buf []byte l := unix.SizeofSockaddrAny for { // If the buffer is not large enough, allocate a new one with the hint. buf = make([]byte, l) l, err := getsockname(fd, buf) if err != nil { return nil, err } if l <= uint32(len(buf)) { return buf[:l], nil } } } // GetPeerName returns the peer name. func (s *Socket) GetPeerName() ([]byte, error) { fd, ok := s.enterFD() if !ok { return nil, unix.EBADF } defer s.gate.Leave() var buf []byte l := unix.SizeofSockaddrAny for { // See above. buf = make([]byte, l) l, err := getpeername(fd, buf) if err != nil { return nil, err } if l <= uint32(len(buf)) { return buf[:l], nil } } } // SocketReader wraps an individual receive operation. // // This may be used for doing vectorized reads and/or sending additional // control messages (e.g. FDs). The normal entrypoint is ReadVec. // // One of ExtractFDs or DisposeFDs must be called if EnableFDs is used. type SocketReader struct { socket *Socket source []byte blocking bool race *atomicbitops.Int32 ControlMessage } // Reader returns a reader for this socket. func (s *Socket) Reader(blocking bool) SocketReader { return SocketReader{socket: s, blocking: blocking, race: s.race} } // Read implements io.Reader.Read. func (s *Socket) Read(p []byte) (int, error) { r := s.Reader(true) return r.ReadVec([][]byte{p}) } func (s *Socket) shutdown(fd int) error { // Shutdown the socket to cancel any pending accepts. return unix.Shutdown(fd, unix.SHUT_RDWR) } // Shutdown closes the socket for read and write. func (s *Socket) Shutdown() error { fd, ok := s.enterFD() if !ok { return unix.EBADF } defer s.gate.Leave() return s.shutdown(fd) } // ServerSocket is a bound unix domain socket. type ServerSocket struct { socket *Socket } // NewServerSocket returns a socket from an existing FD. func NewServerSocket(fd int) (*ServerSocket, error) { s, err := NewSocket(fd) if err != nil { return nil, err } return &ServerSocket{socket: s}, nil } // Bind creates and binds a new socket. func Bind(addr string, packet bool) (*ServerSocket, error) { fd, err := socket(packet) if err != nil { return nil, err } // Do the bind. usa := &unix.SockaddrUnix{Name: addr} if err := unix.Bind(fd, usa); err != nil { unix.Close(fd) return nil, err } return NewServerSocket(fd) } // BindAndListen creates, binds and listens on a new socket. func BindAndListen(addr string, packet bool) (*ServerSocket, error) { s, err := Bind(addr, packet) if err != nil { return nil, err } // Start listening. if err := s.Listen(); err != nil { s.Close() return nil, err } return s, nil } // Listen starts listening on the socket. func (s *ServerSocket) Listen() error { fd, ok := s.socket.enterFD() if !ok { return unix.EBADF } defer s.socket.gate.Leave() return unix.Listen(fd, backlog) } // Accept accepts a new connection. // // This is always blocking. // // Preconditions: // - ServerSocket is listening (Listen called). 
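//
// A minimal accept loop might look like this (illustrative sketch; the socket
// path and the handle function are hypothetical caller code):
//
//	srv, err := unet.BindAndListen("/tmp/example.sock", false /* packet */)
//	if err != nil {
//		return err
//	}
//	defer srv.Close()
//	for {
//		conn, err := srv.Accept()
//		if err != nil {
//			return err
//		}
//		go handle(conn)
//	}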
func (s *ServerSocket) Accept() (*Socket, error) { fd, ok := s.socket.enterFD() if !ok { return nil, unix.EBADF } defer s.socket.gate.Leave() for { nfd, _, err := unix.Accept(fd) switch err { case nil: return NewSocket(nfd) case unix.EAGAIN: err = s.socket.wait(false) if err == errClosing { err = unix.EBADF } } if err != nil { return nil, err } } } // Close closes the server socket. // // This must only be called once. func (s *ServerSocket) Close() error { return s.socket.Close() } // FD returns the socket's file descriptor. // // See Socket.FD. func (s *ServerSocket) FD() int { return s.socket.FD() } // Release releases ownership of the socket's file descriptor. // // See Socket.Release. func (s *ServerSocket) Release() (int, error) { return s.socket.Release() } golang-gvisor-gvisor-0.0~20240729.0/pkg/unet/unet_state_autogen.go000066400000000000000000000000661465435605700246250ustar00rootroot00000000000000// automatically generated by stateify. package unet golang-gvisor-gvisor-0.0~20240729.0/pkg/unet/unet_unsafe.go000066400000000000000000000152731465435605700232520ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package unet import ( "io" "unsafe" "golang.org/x/sys/unix" ) // wait blocks until the socket FD is ready for reading or writing, depending // on the value of write. // // Returns errClosing if the Socket is in the process of closing. func (s *Socket) wait(write bool) error { for { // Checking the FD on each loop is not strictly necessary, it // just avoids an extra poll call. fd := s.fd.Load() if fd < 0 { return errClosing } events := []unix.PollFd{ { // The actual socket FD. Fd: fd, Events: unix.POLLIN, }, { // The eventfd, signaled when we are closing. Fd: int32(s.efd.FD()), Events: unix.POLLIN, }, } if write { events[0].Events = unix.POLLOUT } _, _, e := unix.Syscall6(unix.SYS_PPOLL, uintptr(unsafe.Pointer(&events[0])), 2, 0, 0, 0, 0) if e == unix.EINTR { continue } if e != 0 { return e } if events[1].Revents&unix.POLLIN == unix.POLLIN { // eventfd signaled, we're closing. return errClosing } return nil } } // buildIovec builds an iovec slice from the given []byte slice. // // iovecs is used as an initial slice, to avoid excessive allocations. func buildIovec(bufs [][]byte, iovecs []unix.Iovec) ([]unix.Iovec, int) { var length int for i := range bufs { if l := len(bufs[i]); l > 0 { iovecs = append(iovecs, unix.Iovec{ Base: &bufs[i][0], Len: uint64(l), }) length += l } } return iovecs, length } // ReadVec reads into the pre-allocated bufs. Returns bytes read. // // The pre-allocatted space used by ReadVec is based upon slice lengths. // // This function is not guaranteed to read all available data, it // returns as soon as a single recvmsg call succeeds. 
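//
// Illustrative sketch of a read that also receives donated FDs ("s" is a
// hypothetical connected *Socket and process is hypothetical caller code):
//
//	r := s.Reader(true /* blocking */)
//	r.EnableFDs(1)
//	buf := make([]byte, 4096)
//	n, err := r.ReadVec([][]byte{buf})
//	if err != nil {
//		return err
//	}
//	fds, _ := r.ExtractFDs() // or r.CloseFDs() if the FDs are not wanted
//	process(buf[:n], fds)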
func (r *SocketReader) ReadVec(bufs [][]byte) (int, error) { iovecs, length := buildIovec(bufs, make([]unix.Iovec, 0, 2)) var msg unix.Msghdr if len(r.source) != 0 { msg.Name = &r.source[0] msg.Namelen = uint32(len(r.source)) } if len(r.ControlMessage) != 0 { msg.Control = &r.ControlMessage[0] msg.Controllen = uint64(len(r.ControlMessage)) } if len(iovecs) != 0 { msg.Iov = &iovecs[0] msg.Iovlen = uint64(len(iovecs)) } // n is the bytes received. var n uintptr fd, ok := r.socket.enterFD() if !ok { return 0, unix.EBADF } // Leave on returns below. for { var e unix.Errno // Try a non-blocking recv first, so we don't give up the go runtime M. n, _, e = unix.RawSyscall(unix.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), unix.MSG_DONTWAIT|unix.MSG_TRUNC) if e == 0 { break } if e == unix.EINTR { continue } if !r.blocking { r.socket.gate.Leave() return 0, e } if e != unix.EAGAIN && e != unix.EWOULDBLOCK { r.socket.gate.Leave() return 0, e } // Wait for the socket to become readable. err := r.socket.wait(false) if err == errClosing { err = unix.EBADF } if err != nil { r.socket.gate.Leave() return 0, err } } r.socket.gate.Leave() if msg.Controllen < uint64(len(r.ControlMessage)) { r.ControlMessage = r.ControlMessage[:msg.Controllen] } if msg.Namelen < uint32(len(r.source)) { r.source = r.source[:msg.Namelen] } // All unet sockets are SOCK_STREAM or SOCK_SEQPACKET, both of which // indicate that the other end is closed by returning a 0 length read // with no error. if n == 0 { return 0, io.EOF } if r.race != nil { // See comments on Socket.race. r.race.Add(1) } if int(n) > length { return length, errMessageTruncated } return int(n), nil } // WriteVec writes the bufs to the socket. Returns bytes written. // // This function is not guaranteed to send all data, it returns // as soon as a single sendmsg call succeeds. func (w *SocketWriter) WriteVec(bufs [][]byte) (int, error) { iovecs, _ := buildIovec(bufs, make([]unix.Iovec, 0, 2)) if w.race != nil { // See comments on Socket.race. w.race.Add(1) } var msg unix.Msghdr if len(w.to) != 0 { msg.Name = &w.to[0] msg.Namelen = uint32(len(w.to)) } if len(w.ControlMessage) != 0 { msg.Control = &w.ControlMessage[0] msg.Controllen = uint64(len(w.ControlMessage)) } if len(iovecs) > 0 { msg.Iov = &iovecs[0] msg.Iovlen = uint64(len(iovecs)) } fd, ok := w.socket.enterFD() if !ok { return 0, unix.EBADF } // Leave on returns below. for { // Try a non-blocking send first, so we don't give up the go runtime M. n, _, e := unix.RawSyscall(unix.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), unix.MSG_DONTWAIT|unix.MSG_NOSIGNAL) if e == 0 { w.socket.gate.Leave() return int(n), nil } if e == unix.EINTR { continue } if !w.blocking { w.socket.gate.Leave() return 0, e } if e != unix.EAGAIN && e != unix.EWOULDBLOCK { w.socket.gate.Leave() return 0, e } // Wait for the socket to become writeable. err := w.socket.wait(true) if err == errClosing { err = unix.EBADF } if err != nil { w.socket.gate.Leave() return 0, err } } // Unreachable, no s.gate.Leave needed. } // getsockopt issues a getsockopt unix. func getsockopt(fd int, level int, optname int, buf []byte) (uint32, error) { l := uint32(len(buf)) _, _, e := unix.RawSyscall6(unix.SYS_GETSOCKOPT, uintptr(fd), uintptr(level), uintptr(optname), uintptr(unsafe.Pointer(&buf[0])), uintptr(unsafe.Pointer(&l)), 0) if e != 0 { return 0, e } return l, nil } // setsockopt issues a setsockopt unix. 
func setsockopt(fd int, level int, optname int, buf []byte) error { _, _, e := unix.RawSyscall6(unix.SYS_SETSOCKOPT, uintptr(fd), uintptr(level), uintptr(optname), uintptr(unsafe.Pointer(&buf[0])), uintptr(len(buf)), 0) if e != 0 { return e } return nil } // getsockname issues a getsockname unix. func getsockname(fd int, buf []byte) (uint32, error) { l := uint32(len(buf)) _, _, e := unix.RawSyscall(unix.SYS_GETSOCKNAME, uintptr(fd), uintptr(unsafe.Pointer(&buf[0])), uintptr(unsafe.Pointer(&l))) if e != 0 { return 0, e } return l, nil } // getpeername issues a getpeername unix. func getpeername(fd int, buf []byte) (uint32, error) { l := uint32(len(buf)) _, _, e := unix.RawSyscall(unix.SYS_GETPEERNAME, uintptr(fd), uintptr(unsafe.Pointer(&buf[0])), uintptr(unsafe.Pointer(&l))) if e != 0 { return 0, e } return l, nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/unet/unet_unsafe_state_autogen.go000066400000000000000000000000661465435605700261660ustar00rootroot00000000000000// automatically generated by stateify. package unet golang-gvisor-gvisor-0.0~20240729.0/pkg/urpc/000077500000000000000000000000001465435605700203755ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/urpc/urpc.go000066400000000000000000000440771465435605700217110ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package urpc provides a minimal RPC package based on unet. // // RPC requests are _not_ concurrent and methods must be explicitly // registered. However, files may be send as part of the payload. package urpc import ( "bytes" "encoding/json" "errors" "fmt" "io" "os" "reflect" "runtime" "time" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) // maxFiles determines the maximum file payload. This limit is arbitrary. Linux // allows SCM_MAX_FD = 253 FDs to be donated in one sendmsg(2) call. const maxFiles = 128 // ErrTooManyFiles is returned when too many file descriptors are mapped. var ErrTooManyFiles = errors.New("too many files") // ErrUnknownMethod is returned when a method is not known. var ErrUnknownMethod = errors.New("unknown method") // errStopped is an internal error indicating the server has been stopped. var errStopped = errors.New("stopped") // RemoteError is an error returned by the remote invocation. // // This indicates that the RPC transport was correct, but that the called // function itself returned an error. type RemoteError struct { // Message is the result of calling Error() on the remote error. Message string } // Error returns the remote error string. func (r RemoteError) Error() string { return r.Message } // FilePayload may be _embedded_ in another type in order to send or receive a // file as a result of an RPC. These are not actually serialized, rather they // are sent via an accompanying SCM_RIGHTS message (plumbed through the unet // package). 
// // When embedding a FilePayload in an argument struct, the argument type _must_ // be a pointer to the struct rather than the struct type itself. This is // because the urpc package defines pointer methods on FilePayload. type FilePayload struct { Files []*os.File `json:"-"` } // ReleaseFD releases the FD at the specified index. func (f *FilePayload) ReleaseFD(index int) (*fd.FD, error) { return fd.NewFromFile(f.Files[index]) } // filePayload returns the file. It may be nil. func (f *FilePayload) filePayload() []*os.File { return f.Files } // setFilePayload sets the payload. func (f *FilePayload) setFilePayload(fs []*os.File) { f.Files = fs } // closeAll closes a slice of files. func closeAll(files []*os.File) { for _, f := range files { f.Close() } } // filePayloader is implemented only by FilePayload and will be implicitly // implemented by types that have the FilePayload embedded. Note that there is // no way to implement these methods other than by embedding FilePayload, due // to the way unexported method names are mangled. type filePayloader interface { filePayload() []*os.File setFilePayload([]*os.File) } // clientCall is the client=>server method call on the client side. type clientCall struct { Method string `json:"method"` Arg any `json:"arg"` } // serverCall is the client=>server method call on the server side. type serverCall struct { Method string `json:"method"` Arg json.RawMessage `json:"arg"` } // callResult is the server=>client method call result. type callResult struct { Success bool `json:"success"` Err string `json:"err"` Result any `json:"result"` } // registeredMethod is method registered with the server. type registeredMethod struct { // fn is the underlying function. fn reflect.Value // rcvr is the receiver value. rcvr reflect.Value // argType is a typed argument. argType reflect.Type // resultType is also a type result. resultType reflect.Type } // clientState is client metadata. // // The following are valid states: // // idle - not processing any requests, no close request. // processing - actively processing, no close request. // closeRequested - actively processing, pending close. // closed - client connection has been closed. // // The following transitions are possible: // // idle -> processing, closed // processing -> idle, closeRequested // closeRequested -> closed type clientState int // See clientState. const ( idle clientState = iota processing closeRequested closed ) // Server is an RPC server. type Server struct { // mu protects all fields, except wg. mu sync.Mutex // methods is the set of server methods. methods map[string]registeredMethod // stoppers are all registered stoppers. stoppers []Stopper // clients is a map of clients. clients map[*unet.Socket]clientState // wg is a wait group for all outstanding clients. wg sync.WaitGroup // afterRPCCallback is called after each RPC is successfully completed. afterRPCCallback func() } // NewServer returns a new server. func NewServer() *Server { return NewServerWithCallback(nil) } // NewServerWithCallback returns a new server, who upon completion of each RPC // calls the given function. func NewServerWithCallback(afterRPCCallback func()) *Server { return &Server{ methods: make(map[string]registeredMethod), clients: make(map[*unet.Socket]clientState), afterRPCCallback: afterRPCCallback, } } // Stopper is an optional interface, that when implemented, allows an object // to have a callback executed when the server is shutting down. 
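//
// An illustrative implementation sketch (the svc type and its done channel are
// hypothetical, not part of this package); registering such an object with
// Server.Register also records it as a stopper:
//
//	type svc struct {
//		done chan struct{}
//	}
//
//	// Stop signals background goroutines to exit.
//	func (s *svc) Stop() {
//		close(s.done)
//	}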
type Stopper interface { Stop() } // Register registers the given object as an RPC receiver. // // This functions is the same way as the built-in RPC package, but it does not // tolerate any object with non-conforming methods. Any non-confirming methods // will lead to an immediate panic, instead of being skipped or an error. // Panics will also be generated by anonymous objects and duplicate entries. func (s *Server) Register(obj any) { s.mu.Lock() defer s.mu.Unlock() typ := reflect.TypeOf(obj) stopper, hasStop := obj.(Stopper) // If we got a pointer, deref it to the underlying object. We need this to // obtain the name of the underlying type. typDeref := typ if typ.Kind() == reflect.Ptr { typDeref = typ.Elem() } for m := 0; m < typ.NumMethod(); m++ { method := typ.Method(m) if typDeref.Name() == "" { // Can't be anonymous. panic("type not named.") } if hasStop && method.Name == "Stop" { s.stoppers = append(s.stoppers, stopper) continue // Legal stop method. } prettyName := typDeref.Name() + "." + method.Name if _, ok := s.methods[prettyName]; ok { // Duplicate entry. panic(fmt.Sprintf("method %s is duplicated.", prettyName)) } if method.PkgPath != "" { // Must be exported. panic(fmt.Sprintf("method %s is not exported.", prettyName)) } mtype := method.Type if mtype.NumIn() != 3 { // Need exactly two arguments (+ receiver). panic(fmt.Sprintf("method %s has wrong number of arguments.", prettyName)) } argType := mtype.In(1) if argType.Kind() != reflect.Ptr { // Need arg pointer. panic(fmt.Sprintf("method %s has non-pointer first argument.", prettyName)) } resultType := mtype.In(2) if resultType.Kind() != reflect.Ptr { // Need result pointer. panic(fmt.Sprintf("method %s has non-pointer second argument.", prettyName)) } if mtype.NumOut() != 1 { // Need single return. panic(fmt.Sprintf("method %s has wrong number of returns.", prettyName)) } if returnType := mtype.Out(0); returnType != reflect.TypeOf((*error)(nil)).Elem() { // Need error return. panic(fmt.Sprintf("method %s has non-error return value.", prettyName)) } // Register the method. s.methods[prettyName] = registeredMethod{ fn: method.Func, rcvr: reflect.ValueOf(obj), argType: argType, resultType: resultType, } } } // lookup looks up the given method. func (s *Server) lookup(method string) (registeredMethod, bool) { s.mu.Lock() defer s.mu.Unlock() rm, ok := s.methods[method] return rm, ok } // handleOne handles a single call. func (s *Server) handleOne(client *unet.Socket) error { // Unmarshal the call. var c serverCall newFs, err := unmarshal(client, &c) if err != nil { // Client is dead. return err } if s.afterRPCCallback != nil { defer s.afterRPCCallback() } // Explicitly close all these files after the call. // // This is also explicitly a reference to the files after the call, // which means they are kept open for the duration of the call. defer closeAll(newFs) // Start the request. if !s.clientBeginRequest(client) { // Client is dead; don't process this call. return errStopped } defer s.clientEndRequest(client) // Lookup the method. rm, ok := s.lookup(c.Method) if !ok { // Try to serialize the error. return marshal(client, &callResult{Err: ErrUnknownMethod.Error()}, nil) } // Unmarshal the arguments now that we know the type. na := reflect.New(rm.argType.Elem()) if err := json.Unmarshal(c.Arg, na.Interface()); err != nil { return marshal(client, &callResult{Err: err.Error()}, nil) } // Set the file payload as an argument. if fp, ok := na.Interface().(filePayloader); ok { fp.setFilePayload(newFs) } // Call the method. 
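// Registered methods have the shape func (rcvr) M(arg *A, result *R) error
// (enforced by Register above), so the reflective call below passes
// [receiver, arg pointer, result pointer] and yields a single error value.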
re := reflect.New(rm.resultType.Elem()) rValues := rm.fn.Call([]reflect.Value{rm.rcvr, na, re}) if errVal := rValues[0].Interface(); errVal != nil { return marshal(client, &callResult{Err: errVal.(error).Error()}, nil) } // Set the resulting payload. var fs []*os.File if fp, ok := re.Interface().(filePayloader); ok { fs = fp.filePayload() if len(fs) > maxFiles { // Ugh. Send an error to the client, despite success. return marshal(client, &callResult{Err: ErrTooManyFiles.Error()}, nil) } } // Marshal the result. return marshal(client, &callResult{Success: true, Result: re.Interface()}, fs) } // clientBeginRequest begins a request. // // If true is returned, the request may be processed. If false is returned, // then the server has been stopped and the request should be skipped. func (s *Server) clientBeginRequest(client *unet.Socket) bool { s.mu.Lock() defer s.mu.Unlock() switch state := s.clients[client]; state { case idle: // Mark as processing. s.clients[client] = processing return true case closed: // Whoops, how did this happen? Must have closed immediately // following the deserialization. Don't let the RPC actually go // through, since we won't be able to serialize a proper // response. return false default: // Should not happen. panic(fmt.Sprintf("expected idle or closed, got %d", state)) } } // clientEndRequest ends a request. func (s *Server) clientEndRequest(client *unet.Socket) { s.mu.Lock() defer s.mu.Unlock() switch state := s.clients[client]; state { case processing: // Return to idle. s.clients[client] = idle case closeRequested: // Close the connection. client.Close() s.clients[client] = closed default: // Should not happen. panic(fmt.Sprintf("expected processing or requestClose, got %d", state)) } } // clientRegister registers a connection. // // See Stop for more context. func (s *Server) clientRegister(client *unet.Socket) { s.mu.Lock() defer s.mu.Unlock() s.clients[client] = idle s.wg.Add(1) } // clientUnregister unregisters and closes a connection if necessary. // // See Stop for more context. func (s *Server) clientUnregister(client *unet.Socket) { s.mu.Lock() defer s.mu.Unlock() switch state := s.clients[client]; state { case idle: // Close the connection. client.Close() case closed: // Already done. default: // Should not happen. panic(fmt.Sprintf("expected idle or closed, got %d", state)) } delete(s.clients, client) s.wg.Done() } // handleRegistered handles calls from a registered client. func (s *Server) handleRegistered(client *unet.Socket) error { for { // Handle one call. if err := s.handleOne(client); err != nil { // Client is dead. return err } } } // Handle synchronously handles a single client over a connection. func (s *Server) Handle(client *unet.Socket) error { s.clientRegister(client) defer s.clientUnregister(client) return s.handleRegistered(client) } // StartHandling creates a goroutine that handles a single client over a // connection. func (s *Server) StartHandling(client *unet.Socket) { s.clientRegister(client) go func() { // S/R-SAFE: out of scope defer s.clientUnregister(client) s.handleRegistered(client) }() } // Stop safely terminates outstanding clients. // // No new requests should be initiated after calling Stop. Existing clients // will be closed after completing any pending RPCs. This method will block // until all clients have disconnected. // // timeout is the time for clients to complete pending RPCs. After timeout // expires, all clients are drained (i.e. their ongoing RPC is allowed to // complete) and closed. 
Any new RPCs will not be processed. Note that ongoing // RPCs are *not* interrupted or cancelled. func (s *Server) Stop(timeout time.Duration) { // Call any Stop callbacks. for _, stopper := range s.stoppers { stopper.Stop() } done := make(chan bool, 1) go func() { if timeout != 0 { timer := time.NewTicker(timeout) defer timer.Stop() select { case <-done: return case <-timer.C: } } // Close all known clients. s.mu.Lock() defer s.mu.Unlock() for client, state := range s.clients { switch state { case idle: // Close connection now. client.Close() s.clients[client] = closed case processing: // Request close when done. s.clients[client] = closeRequested } } }() // Wait for all outstanding requests. s.wg.Wait() done <- true } // Client is a urpc client. type Client struct { // mu protects all members. // // It also enforces single-call semantics. mu sync.Mutex // Socket is the underlying socket for this client. // // This _must_ be provided and must be closed manually by calling // Close. Socket *unet.Socket } // NewClient returns a new client. func NewClient(socket *unet.Socket) *Client { return &Client{ Socket: socket, } } // marshal sends the given FD and json struct. func marshal(s *unet.Socket, v any, fs []*os.File) error { // Marshal to a buffer. data, err := json.Marshal(v) if err != nil { log.Warningf("urpc: error marshalling %s: %s", fmt.Sprintf("%v", v), err.Error()) return err } // Write to the socket. w := s.Writer(true) if fs != nil { var fds []int for _, f := range fs { fds = append(fds, int(f.Fd())) } w.PackFDs(fds...) } // Send. for n := 0; n < len(data); { cur, err := w.WriteVec([][]byte{data[n:]}) if n == 0 && cur < len(data) { // Don't send FDs anymore. This call is only made on // the first successful call to WriteVec, assuming cur // is not sufficient to fill the entire buffer. w.PackFDs() } n += cur if err != nil { log.Warningf("urpc: error writing %v: %s", data[n:], err.Error()) return err } } // We're done sending the fds to the client. Explicitly prevent fs from // being GCed until here. Urpc rpcs often unlink the file to send, relying // on the kernel to automatically delete it once the last reference is // dropped. Until we successfully call sendmsg(2), fs may contain the last // references to these files. Without this explicit reference to fs here, // the go runtime is free to assume we're done with fs after the fd // collection loop above, since it just sees us copying ints. runtime.KeepAlive(fs) log.Debugf("urpc: successfully marshalled %d bytes.", len(data)) return nil } // unmarshal receives an FD (optional) and unmarshals the given struct. func unmarshal(s *unet.Socket, v any) ([]*os.File, error) { // Receive a single byte. r := s.Reader(true) r.EnableFDs(maxFiles) firstByte := make([]byte, 1) // Extract any FDs that may be there. if _, err := r.ReadVec([][]byte{firstByte}); err != nil { return nil, err } fds, err := r.ExtractFDs() if err != nil { log.Warningf("urpc: error extracting fds: %s", err.Error()) return nil, err } var fs []*os.File for _, fd := range fds { fs = append(fs, os.NewFile(uintptr(fd), "urpc")) } // Read the rest. d := json.NewDecoder(io.MultiReader(bytes.NewBuffer(firstByte), s)) // urpc internally decodes / re-encodes the data with any as the // intermediate type. 
We have to unmarshal integers to json.Number type // instead of the default float type for those intermediate values, such // that when they get re-encoded, their values are not printed out in // floating-point formats such as 1e9, which could not be decoded to // explicitly typed integers later. d.UseNumber() if err := d.Decode(v); err != nil { log.Warningf("urpc: error decoding: %s", err.Error()) for _, f := range fs { f.Close() } return nil, err } // All set. log.Debugf("urpc: unmarshal success.") return fs, nil } // Call calls a function. func (c *Client) Call(method string, arg any, result any) error { c.mu.Lock() defer c.mu.Unlock() // If arg is a FilePayload, not a *FilePayload, files won't actually be // sent, so error out. if _, ok := arg.(FilePayload); ok { return fmt.Errorf("argument is a FilePayload, but should be a *FilePayload") } // Are there files to send? var fs []*os.File if fp, ok := arg.(filePayloader); ok { fs = fp.filePayload() if len(fs) > maxFiles { return ErrTooManyFiles } } // Marshal the data. if err := marshal(c.Socket, &clientCall{Method: method, Arg: arg}, fs); err != nil { return err } // Wait for the response. callR := callResult{Result: result} newFs, err := unmarshal(c.Socket, &callR) if err != nil { return fmt.Errorf("urpc method %q failed: %v", method, err) } // Set the file payload. if fp, ok := result.(filePayloader); ok { fp.setFilePayload(newFs) } else { closeAll(newFs) } // Did an error occur? if !callR.Success { return RemoteError{Message: callR.Err} } // All set. return nil } // Close closes the underlying socket. // // Further calls to the client may result in undefined behavior. func (c *Client) Close() error { c.mu.Lock() defer c.mu.Unlock() return c.Socket.Close() } golang-gvisor-gvisor-0.0~20240729.0/pkg/urpc/urpc_state_autogen.go000066400000000000000000000000661465435605700246210ustar00rootroot00000000000000// automatically generated by stateify. package urpc golang-gvisor-gvisor-0.0~20240729.0/pkg/usermem/000077500000000000000000000000001465435605700211015ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/usermem/bytes_io.go000066400000000000000000000100121465435605700232370ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package usermem import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" ) const maxInt = int(^uint(0) >> 1) // BytesIO implements IO using a byte slice. Addresses are interpreted as // offsets into the slice. Reads and writes beyond the end of the slice return // EFAULT. type BytesIO struct { Bytes []byte } // CopyOut implements IO.CopyOut. func (b *BytesIO) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts IOOpts) (int, error) { rngN, rngErr := b.rangeCheck(addr, len(src)) if rngN == 0 { return 0, rngErr } return copy(b.Bytes[int(addr):], src[:rngN]), rngErr } // CopyIn implements IO.CopyIn. 
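//
// A small round-trip sketch for BytesIO, where addresses are simply offsets
// into the backing slice (ctx is assumed to be a context.Context; byte counts
// and errors are elided):
//
//	b := &BytesIO{Bytes: make([]byte, 16)}
//	_, _ = b.CopyOut(ctx, 4, []byte("abcd"), IOOpts{}) // write at offset 4
//	dst := make([]byte, 4)
//	_, _ = b.CopyIn(ctx, 4, dst, IOOpts{}) // dst now holds "abcd"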
func (b *BytesIO) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts IOOpts) (int, error) { rngN, rngErr := b.rangeCheck(addr, len(dst)) if rngN == 0 { return 0, rngErr } return copy(dst[:rngN], b.Bytes[int(addr):]), rngErr } // ZeroOut implements IO.ZeroOut. func (b *BytesIO) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts IOOpts) (int64, error) { if toZero > int64(maxInt) { return 0, linuxerr.EINVAL } rngN, rngErr := b.rangeCheck(addr, int(toZero)) if rngN == 0 { return 0, rngErr } zeroSlice := b.Bytes[int(addr) : int(addr)+rngN] clear(zeroSlice) return int64(rngN), rngErr } // CopyOutFrom implements IO.CopyOutFrom. func (b *BytesIO) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) { dsts, rngErr := b.blocksFromAddrRanges(ars) n, err := src.ReadToBlocks(dsts) if err != nil { return int64(n), err } return int64(n), rngErr } // CopyInTo implements IO.CopyInTo. func (b *BytesIO) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) { srcs, rngErr := b.blocksFromAddrRanges(ars) n, err := dst.WriteFromBlocks(srcs) if err != nil { return int64(n), err } return int64(n), rngErr } func (b *BytesIO) rangeCheck(addr hostarch.Addr, length int) (int, error) { if length == 0 { return 0, nil } if length < 0 { return 0, linuxerr.EINVAL } max := hostarch.Addr(len(b.Bytes)) if addr >= max { return 0, linuxerr.EFAULT } end, ok := addr.AddLength(uint64(length)) if !ok || end > max { return int(max - addr), linuxerr.EFAULT } return length, nil } func (b *BytesIO) blocksFromAddrRanges(ars hostarch.AddrRangeSeq) (safemem.BlockSeq, error) { switch ars.NumRanges() { case 0: return safemem.BlockSeq{}, nil case 1: block, err := b.blockFromAddrRange(ars.Head()) return safemem.BlockSeqOf(block), err default: blocks := make([]safemem.Block, 0, ars.NumRanges()) for !ars.IsEmpty() { block, err := b.blockFromAddrRange(ars.Head()) if block.Len() != 0 { blocks = append(blocks, block) } if err != nil { return safemem.BlockSeqFromSlice(blocks), err } ars = ars.Tail() } return safemem.BlockSeqFromSlice(blocks), nil } } func (b *BytesIO) blockFromAddrRange(ar hostarch.AddrRange) (safemem.Block, error) { n, err := b.rangeCheck(ar.Start, int(ar.Length())) if n == 0 { return safemem.Block{}, err } return safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start) : int(ar.Start)+n]), err } // BytesIOSequence returns an IOSequence representing the given byte slice. func BytesIOSequence(buf []byte) IOSequence { return IOSequence{ IO: &BytesIO{buf}, Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(len(buf))}), } } golang-gvisor-gvisor-0.0~20240729.0/pkg/usermem/bytes_io_unsafe.go000066400000000000000000000033131465435605700246060ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package usermem import ( "unsafe" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" ) // SwapUint32 implements IO.SwapUint32. func (b *BytesIO) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts IOOpts) (uint32, error) { if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil { return 0, rngErr } return (*atomicbitops.Uint32)(unsafe.Pointer(&b.Bytes[int(addr)])).Swap(new), nil } // CompareAndSwapUint32 implements IO.CompareAndSwapUint32. func (b *BytesIO) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts IOOpts) (uint32, error) { if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil { return 0, rngErr } return atomicbitops.CompareAndSwapUint32((*atomicbitops.Uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), old, new), nil } // LoadUint32 implements IO.LoadUint32. func (b *BytesIO) LoadUint32(ctx context.Context, addr hostarch.Addr, opts IOOpts) (uint32, error) { if _, err := b.rangeCheck(addr, 4); err != nil { return 0, err } return (*atomicbitops.Uint32)(unsafe.Pointer(&b.Bytes[int(addr)])).Load(), nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/usermem/marshal.go000066400000000000000000000025721465435605700230650ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package usermem import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" ) // IOCopyContext wraps an object implementing hostarch.IO to implement // marshal.CopyContext. type IOCopyContext struct { Ctx context.Context IO IO Opts IOOpts } // CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. func (i *IOCopyContext) CopyScratchBuffer(size int) []byte { return make([]byte, size) } // CopyOutBytes implements marshal.CopyContext.CopyOutBytes. func (i *IOCopyContext) CopyOutBytes(addr hostarch.Addr, b []byte) (int, error) { return i.IO.CopyOut(i.Ctx, addr, b, i.Opts) } // CopyInBytes implements marshal.CopyContext.CopyInBytes. func (i *IOCopyContext) CopyInBytes(addr hostarch.Addr, b []byte) (int, error) { return i.IO.CopyIn(i.Ctx, addr, b, i.Opts) } golang-gvisor-gvisor-0.0~20240729.0/pkg/usermem/usermem.go000066400000000000000000000457561465435605700231260ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package usermem governs access to user memory. 
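//
// As a quick orientation, the IO interface defined below can be exercised
// against an in-memory buffer using BytesIO/BytesIOSequence from this package
// (a sketch; ctx is assumed to be a context.Context):
//
//	ioseq := BytesIOSequence(make([]byte, 8))
//	n, err := ioseq.CopyOut(ctx, []byte("hi")) // n == 2 on success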
package usermem import ( "bytes" "errors" "io" "strconv" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" ) // IO provides access to the contents of a virtual memory space. type IO interface { // CopyOut copies len(src) bytes from src to the memory mapped at addr. It // returns the number of bytes copied. If the number of bytes copied is < // len(src), it returns a non-nil error explaining why. // // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or // any following locks in the lock order. // // Postconditions: CopyOut does not retain src. CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts IOOpts) (int, error) // CopyIn copies len(dst) bytes from the memory mapped at addr to dst. // It returns the number of bytes copied. If the number of bytes copied is // < len(dst), it returns a non-nil error explaining why. // // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or // any following locks in the lock order. // // Postconditions: CopyIn does not retain dst. CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts IOOpts) (int, error) // ZeroOut sets toZero bytes to 0, starting at addr. It returns the number // of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a // non-nil error explaining why. // // Preconditions: // * The caller must not hold mm.MemoryManager.mappingMu or any // following locks in the lock order. // * toZero >= 0. ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts IOOpts) (int64, error) // CopyOutFrom copies ars.NumBytes() bytes from src to the memory mapped at // ars. It returns the number of bytes copied, which may be less than the // number of bytes read from src if copying fails. CopyOutFrom may return a // partial copy without an error iff src.ReadToBlocks returns a partial // read without an error. // // CopyOutFrom calls src.ReadToBlocks at most once. // // Preconditions: // * The caller must not hold mm.MemoryManager.mappingMu or any // following locks in the lock order. // * src.ReadToBlocks must not block on mm.MemoryManager.activeMu or // any preceding locks in the lock order. CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) // CopyInTo copies ars.NumBytes() bytes from the memory mapped at ars to // dst. It returns the number of bytes copied. CopyInTo may return a // partial copy without an error iff dst.WriteFromBlocks returns a partial // write without an error. // // CopyInTo calls dst.WriteFromBlocks at most once. // // Preconditions: // * The caller must not hold mm.MemoryManager.mappingMu or any // following locks in the lock order. // * dst.WriteFromBlocks must not block on mm.MemoryManager.activeMu or // any preceding locks in the lock order. CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) // TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst // at most once, which is unnecessary in most cases, forces implementations // to gather safemem.Blocks into a single slice to pass to src/dst. Add // CopyOutFromIter/CopyInToIter, which relaxes this restriction, to avoid // this allocation. // SwapUint32 atomically sets the uint32 value at addr to new and // returns the previous value. // // Preconditions: // * The caller must not hold mm.MemoryManager.mappingMu or any // following locks in the lock order. 
// * addr must be aligned to a 4-byte boundary. SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts IOOpts) (uint32, error) // CompareAndSwapUint32 atomically compares the uint32 value at addr to // old; if they are equal, the value in memory is replaced by new. In // either case, the previous value stored in memory is returned. // // Preconditions: // * The caller must not hold mm.MemoryManager.mappingMu or any // following locks in the lock order. // * addr must be aligned to a 4-byte boundary. CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts IOOpts) (uint32, error) // LoadUint32 atomically loads the uint32 value at addr and returns it. // // Preconditions: // * The caller must not hold mm.MemoryManager.mappingMu or any // following locks in the lock order. // * addr must be aligned to a 4-byte boundary. LoadUint32(ctx context.Context, addr hostarch.Addr, opts IOOpts) (uint32, error) } // IOOpts contains options applicable to all IO methods. type IOOpts struct { // If IgnorePermissions is true, application-defined memory protections set // by mmap(2) or mprotect(2) will be ignored. (Memory protections required // by the target of the mapping are never ignored.) IgnorePermissions bool // If AddressSpaceActive is true, the IO implementation may assume that it // has an active AddressSpace and can therefore use AddressSpace copying // without performing activation. See mm/io.go for details. AddressSpaceActive bool } // IOReadWriter is an io.ReadWriter that reads from / writes to addresses // starting at addr in IO. The preconditions that apply to IO.CopyIn and // IO.CopyOut also apply to IOReadWriter.Read and IOReadWriter.Write // respectively. type IOReadWriter struct { Ctx context.Context IO IO Addr hostarch.Addr Opts IOOpts } // Read implements io.Reader.Read. // // Note that an address space does not have an "end of file", so Read can only // return io.EOF if IO.CopyIn returns io.EOF. Attempts to read unmapped or // unreadable memory, or beyond the end of the address space, should return // EFAULT. func (rw *IOReadWriter) Read(dst []byte) (int, error) { n, err := rw.IO.CopyIn(rw.Ctx, rw.Addr, dst, rw.Opts) end, ok := rw.Addr.AddLength(uint64(n)) if ok { rw.Addr = end } else { // Disallow wraparound. rw.Addr = ^hostarch.Addr(0) if err != nil { err = linuxerr.EFAULT } } return n, err } // Write implements io.Writer.Write. func (rw *IOReadWriter) Write(src []byte) (int, error) { n, err := rw.IO.CopyOut(rw.Ctx, rw.Addr, src, rw.Opts) end, ok := rw.Addr.AddLength(uint64(n)) if ok { rw.Addr = end } else { // Disallow wraparound. rw.Addr = ^hostarch.Addr(0) if err != nil { err = linuxerr.EFAULT } } return n, err } // CopyStringIn tuning parameters, defined outside that function for tests. const ( copyStringIncrement = 64 copyStringMaxInitBufLen = 256 ) // CopyStringIn copies a NUL-terminated string of unknown length from the // memory mapped at addr in uio and returns it as a string (not including the // trailing NUL). If the length of the string, including the terminating NUL, // would exceed maxlen, CopyStringIn returns the string truncated to maxlen and // ENAMETOOLONG. // // Preconditions: Same as IO.CopyFromUser, plus: // - maxlen >= 0. 
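//
// A usage sketch (uio, addr and opts are assumed to be supplied by the caller,
// e.g. a task's memory manager):
//
//	name, err := CopyStringIn(ctx, uio, addr, 256, opts)
//	// err is ENAMETOOLONG if no NUL byte was found within 256 bytes.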
func CopyStringIn(ctx context.Context, uio IO, addr hostarch.Addr, maxlen int, opts IOOpts) (string, error) { initLen := maxlen if initLen > copyStringMaxInitBufLen { initLen = copyStringMaxInitBufLen } buf := make([]byte, initLen) var done int for done < maxlen { // Read up to copyStringIncrement bytes at a time. readlen := copyStringIncrement if readlen > maxlen-done { readlen = maxlen - done } end, ok := addr.AddLength(uint64(readlen)) if !ok { return gohacks.StringFromImmutableBytes(buf[:done]), linuxerr.EFAULT } // Shorten the read to avoid crossing page boundaries, since faulting // in a page unnecessarily is expensive. This also ensures that partial // copies up to the end of application-mappable memory succeed. if addr.RoundDown() != end.RoundDown() { end = end.RoundDown() readlen = int(end - addr) } // Ensure that our buffer is large enough to accommodate the read. if done+readlen > len(buf) { newBufLen := len(buf) * 2 if newBufLen > maxlen { newBufLen = maxlen } buf = append(buf, make([]byte, newBufLen-len(buf))...) } n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts) // Look for the terminating zero byte, which may have occurred before // hitting err. if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 { return gohacks.StringFromImmutableBytes(buf[:done+i]), nil } done += n if err != nil { return gohacks.StringFromImmutableBytes(buf[:done]), err } addr = end } return gohacks.StringFromImmutableBytes(buf), linuxerr.ENAMETOOLONG } // CopyOutVec copies bytes from src to the memory mapped at ars in uio. The // maximum number of bytes copied is ars.NumBytes() or len(src), whichever is // less. CopyOutVec returns the number of bytes copied; if this is less than // the maximum, it returns a non-nil error explaining why. // // Preconditions: Same as IO.CopyOut. func CopyOutVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSeq, src []byte, opts IOOpts) (int, error) { var done int for !ars.IsEmpty() && done < len(src) { ar := ars.Head() cplen := len(src) - done if hostarch.Addr(cplen) >= ar.Length() { cplen = int(ar.Length()) } n, err := uio.CopyOut(ctx, ar.Start, src[done:done+cplen], opts) done += n if err != nil { return done, err } ars = ars.DropFirst(n) } return done, nil } // CopyInVec copies bytes from the memory mapped at ars in uio to dst. The // maximum number of bytes copied is ars.NumBytes() or len(dst), whichever is // less. CopyInVec returns the number of bytes copied; if this is less than the // maximum, it returns a non-nil error explaining why. // // Preconditions: Same as IO.CopyIn. func CopyInVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSeq, dst []byte, opts IOOpts) (int, error) { var done int for !ars.IsEmpty() && done < len(dst) { ar := ars.Head() cplen := len(dst) - done if hostarch.Addr(cplen) >= ar.Length() { cplen = int(ar.Length()) } n, err := uio.CopyIn(ctx, ar.Start, dst[done:done+cplen], opts) done += n if err != nil { return done, err } ars = ars.DropFirst(n) } return done, nil } // ZeroOutVec writes zeroes to the memory mapped at ars in uio. The maximum // number of bytes written is ars.NumBytes() or toZero, whichever is less. // ZeroOutVec returns the number of bytes written; if this is less than the // maximum, it returns a non-nil error explaining why. // // Preconditions: Same as IO.ZeroOut. 
func ZeroOutVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSeq, toZero int64, opts IOOpts) (int64, error) { var done int64 for !ars.IsEmpty() && done < toZero { ar := ars.Head() cplen := toZero - done if hostarch.Addr(cplen) >= ar.Length() { cplen = int64(ar.Length()) } n, err := uio.ZeroOut(ctx, ar.Start, cplen, opts) done += n if err != nil { return done, err } ars = ars.DropFirst64(n) } return done, nil } func isASCIIWhitespace(b byte) bool { // Compare Linux include/linux/ctype.h, lib/ctype.c. // 9 => horizontal tab '\t' // 10 => line feed '\n' // 11 => vertical tab '\v' // 12 => form feed '\c' // 13 => carriage return '\r' return b == ' ' || (b >= 9 && b <= 13) } // CopyInt32StringsInVec copies up to len(dsts) whitespace-separated decimal // strings from the memory mapped at ars in uio and converts them to int32 // values in dsts. It returns the number of bytes read. // // CopyInt32StringsInVec shares the following properties with Linux's // kernel/sysctl.c:proc_dointvec(write=1): // // - If any read value overflows the range of int32, or any invalid characters // are encountered during the read, CopyInt32StringsInVec returns EINVAL. // // - If, upon reaching the end of ars, fewer than len(dsts) values have been // read, CopyInt32StringsInVec returns no error if at least 1 value was read // and EINVAL otherwise. // // - Trailing whitespace after the last successfully read value is counted in // the number of bytes read. // // Unlike proc_dointvec(): // // - CopyInt32StringsInVec does not implicitly limit ars.NumBytes() to // PageSize-1; callers that require this must do so explicitly. // // - CopyInt32StringsInVec returns EINVAL if ars.NumBytes() == 0. // // Preconditions: Same as CopyInVec. func CopyInt32StringsInVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSeq, dsts []int32, opts IOOpts) (int64, error) { if len(dsts) == 0 { return 0, nil } buf := make([]byte, ars.NumBytes()) n, cperr := CopyInVec(ctx, uio, ars, buf, opts) buf = buf[:n] var i, j int for ; j < len(dsts); j++ { // Skip leading whitespace. for i < len(buf) && isASCIIWhitespace(buf[i]) { i++ } if i == len(buf) { break } // Find the end of the value to be parsed (next whitespace or end of string). nextI := i + 1 for nextI < len(buf) && !isASCIIWhitespace(buf[nextI]) { nextI++ } // Parse a single value. val, err := strconv.ParseInt(string(buf[i:nextI]), 10, 32) if err != nil { return int64(i), linuxerr.EINVAL } dsts[j] = int32(val) i = nextI } // Skip trailing whitespace. for i < len(buf) && isASCIIWhitespace(buf[i]) { i++ } if cperr != nil { return int64(i), cperr } if j == 0 { return int64(i), linuxerr.EINVAL } return int64(i), nil } // CopyInt32StringInVec is equivalent to CopyInt32StringsInVec, but copies at // most one int32. func CopyInt32StringInVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSeq, dst *int32, opts IOOpts) (int64, error) { dsts := [1]int32{*dst} n, err := CopyInt32StringsInVec(ctx, uio, ars, dsts[:], opts) *dst = dsts[0] return n, err } // IOSequence holds arguments to IO methods. type IOSequence struct { IO IO Addrs hostarch.AddrRangeSeq Opts IOOpts } // NumBytes returns s.Addrs.NumBytes(). // // Note that NumBytes() may return 0 even if !s.Addrs.IsEmpty(), since // s.Addrs may contain a non-zero number of zero-length AddrRanges. 
// Many clients of // IOSequence currently do something like: // // if ioseq.NumBytes() == 0 { // return 0, nil // } // if f.availableBytes == 0 { // return 0, linuxerr.ErrWouldBlock // } // return ioseq.CopyOutFrom(..., reader) // // In such cases, using s.Addrs.IsEmpty() will cause them to have the wrong // behavior for zero-length I/O. However, using s.NumBytes() == 0 instead means // that we will return success for zero-length I/O in cases where Linux would // return EFAULT due to a failed access_ok() check, so in the long term we // should move checks for ErrWouldBlock etc. into the body of // reader.ReadToBlocks and use s.Addrs.IsEmpty() instead. func (s IOSequence) NumBytes() int64 { return s.Addrs.NumBytes() } // DropFirst returns a copy of s with s.Addrs.DropFirst(n). // // Preconditions: Same as hostarch.AddrRangeSeq.DropFirst. func (s IOSequence) DropFirst(n int) IOSequence { return IOSequence{s.IO, s.Addrs.DropFirst(n), s.Opts} } // DropFirst64 returns a copy of s with s.Addrs.DropFirst64(n). // // Preconditions: Same as hostarch.AddrRangeSeq.DropFirst64. func (s IOSequence) DropFirst64(n int64) IOSequence { return IOSequence{s.IO, s.Addrs.DropFirst64(n), s.Opts} } // TakeFirst returns a copy of s with s.Addrs.TakeFirst(n). // // Preconditions: Same as hostarch.AddrRangeSeq.TakeFirst. func (s IOSequence) TakeFirst(n int) IOSequence { return IOSequence{s.IO, s.Addrs.TakeFirst(n), s.Opts} } // TakeFirst64 returns a copy of s with s.Addrs.TakeFirst64(n). // // Preconditions: Same as hostarch.AddrRangeSeq.TakeFirst64. func (s IOSequence) TakeFirst64(n int64) IOSequence { return IOSequence{s.IO, s.Addrs.TakeFirst64(n), s.Opts} } // CopyOut invokes CopyOutVec over s.Addrs. // // As with CopyOutVec, if s.NumBytes() < len(src), the copy will be truncated // to s.NumBytes(), and a nil error will be returned. // // Preconditions: Same as CopyOutVec. func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) { return CopyOutVec(ctx, s.IO, s.Addrs, src, s.Opts) } // CopyIn invokes CopyInVec over s.Addrs. // // As with CopyInVec, if s.NumBytes() < len(dst), the copy will be truncated to // s.NumBytes(), and a nil error will be returned. // // Preconditions: Same as CopyInVec. func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) { return CopyInVec(ctx, s.IO, s.Addrs, dst, s.Opts) } // ZeroOut invokes ZeroOutVec over s.Addrs. // // As with ZeroOutVec, if s.NumBytes() < toZero, the write will be truncated // to s.NumBytes(), and a nil error will be returned. // // Preconditions: Same as ZeroOutVec. func (s IOSequence) ZeroOut(ctx context.Context, toZero int64) (int64, error) { return ZeroOutVec(ctx, s.IO, s.Addrs, toZero, s.Opts) } // CopyOutFrom invokes s.CopyOutFrom over s.Addrs. // // Preconditions: Same as IO.CopyOutFrom. func (s IOSequence) CopyOutFrom(ctx context.Context, src safemem.Reader) (int64, error) { return s.IO.CopyOutFrom(ctx, s.Addrs, src, s.Opts) } // CopyInTo invokes s.CopyInTo over s.Addrs. // // Preconditions: Same as IO.CopyInTo. func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, error) { return s.IO.CopyInTo(ctx, s.Addrs, dst, s.Opts) } // Reader returns an io.Reader that reads from s. Reads beyond the end of s // return io.EOF. The preconditions that apply to s.CopyIn also apply to the // returned io.Reader.Read. func (s IOSequence) Reader(ctx context.Context) *IOSequenceReadWriter { return &IOSequenceReadWriter{ctx, s} } // Writer returns an io.Writer that writes to s. 
Writes beyond the end of s // return ErrEndOfIOSequence. The preconditions that apply to s.CopyOut also // apply to the returned io.Writer.Write. func (s IOSequence) Writer(ctx context.Context) *IOSequenceReadWriter { return &IOSequenceReadWriter{ctx, s} } // ErrEndOfIOSequence is returned by IOSequence.Writer().Write() when // attempting to write beyond the end of the IOSequence. var ErrEndOfIOSequence = errors.New("write beyond end of IOSequence") // IOSequenceReadWriter implements io.Reader and io.Writer for an IOSequence. type IOSequenceReadWriter struct { ctx context.Context s IOSequence } // Init initializes the IOSequence. func (rw *IOSequenceReadWriter) Init(ctx context.Context, src IOSequence) { rw.ctx = ctx rw.s = src } // Read implements io.Reader.Read. func (rw *IOSequenceReadWriter) Read(dst []byte) (int, error) { n, err := rw.s.CopyIn(rw.ctx, dst) rw.s = rw.s.DropFirst(n) if err == nil && rw.s.NumBytes() == 0 { err = io.EOF } return n, err } // Len implements tcpip.Payloader. func (rw *IOSequenceReadWriter) Len() int { return int(rw.s.NumBytes()) } // Write implements io.Writer.Write. func (rw *IOSequenceReadWriter) Write(src []byte) (int, error) { n, err := rw.s.CopyOut(rw.ctx, src) rw.s = rw.s.DropFirst(n) if err == nil && n < len(src) { err = ErrEndOfIOSequence } return n, err } golang-gvisor-gvisor-0.0~20240729.0/pkg/usermem/usermem_state_autogen.go000066400000000000000000000000711465435605700260250ustar00rootroot00000000000000// automatically generated by stateify. package usermem golang-gvisor-gvisor-0.0~20240729.0/pkg/usermem/usermem_unsafe_state_autogen.go000066400000000000000000000000711465435605700273660ustar00rootroot00000000000000// automatically generated by stateify. package usermem golang-gvisor-gvisor-0.0~20240729.0/pkg/waiter/000077500000000000000000000000001465435605700207175ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/waiter/waiter.go000066400000000000000000000220421465435605700225410ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package waiter provides the implementation of a wait queue, where waiters can // be enqueued to be notified when an event of interest happens. // // Becoming readable and/or writable are examples of events. Waiters are // expected to use a pattern similar to this to make a blocking function out of // a non-blocking one: // // func (o *object) blockingRead(...) error { // err := o.nonBlockingRead(...) // if err != ErrAgain { // // Completed with no need to wait! // return err // } // // e := createOrGetWaiterEntry(...) // o.EventRegister(&e, waiter.EventIn) // defer o.EventUnregister(&e) // // // We need to try to read again after registration because the // // object may have become readable between the last attempt to // // read and read registration. // err = o.nonBlockingRead(...) // for err == ErrAgain { // wait() // err = o.nonBlockingRead(...) // } // // return err // } // // Another goroutine needs to notify waiters when events happen. 
For example: // // func (o *object) Write(...) ... { // // Do write work. // [...] // // if oldDataAvailableSize == 0 && dataAvailableSize > 0 { // // If no data was available and now some data is // // available, the object became readable, so notify // // potential waiters about this. // o.Notify(waiter.EventIn) // } // } package waiter import ( "gvisor.dev/gvisor/pkg/sync" ) // EventMask represents io events as used in the poll() syscall. type EventMask uint64 // Events that waiters can wait on. The meaning is the same as those in the // poll() syscall. const ( EventIn EventMask = 0x01 // POLLIN EventPri EventMask = 0x02 // POLLPRI EventOut EventMask = 0x04 // POLLOUT EventErr EventMask = 0x08 // POLLERR EventHUp EventMask = 0x10 // POLLHUP EventRdNorm EventMask = 0x0040 // POLLRDNORM EventWrNorm EventMask = 0x0100 // POLLWRNORM EventInternal EventMask = 0x1000 EventRdHUp EventMask = 0x2000 // POLLRDHUP AllEvents EventMask = 0x1f | EventRdNorm | EventWrNorm | EventRdHUp ReadableEvents EventMask = EventIn | EventRdNorm WritableEvents EventMask = EventOut | EventWrNorm ) // EventMaskFromLinux returns an EventMask representing the supported events // from the Linux events e, which is in the format used by poll(2). func EventMaskFromLinux(e uint32) EventMask { // Our flag definitions are currently identical to Linux. return EventMask(e) & AllEvents } // ToLinux returns e in the format used by Linux poll(2). func (e EventMask) ToLinux() uint32 { // Our flag definitions are currently identical to Linux. return uint32(e) } // Waitable contains the methods that need to be implemented by waitable // objects. type Waitable interface { // Readiness returns what the object is currently ready for. If it's // not ready for a desired purpose, the caller may use EventRegister and // EventUnregister to get notifications once the object becomes ready. // // Implementations should allow for events like EventHUp and EventErr // to be returned regardless of whether they are in the input EventMask. Readiness(mask EventMask) EventMask // EventRegister registers the given waiter entry to receive // notifications when an event occurs that makes the object ready for // at least one of the events in mask. EventRegister(e *Entry) error // EventUnregister unregisters a waiter entry previously registered with // EventRegister(). EventUnregister(e *Entry) } // EventListener provides a notify callback. type EventListener interface { // NotifyEvent is the function to be called when the waiter entry is // notified. It is responsible for doing whatever is needed to wake up // the waiter. // // The callback is supposed to perform minimal work, and cannot call // any method on the queue itself because it will be locked while the // callback is running. // // The mask indicates the events that occurred and that the entry is // interested in. NotifyEvent(mask EventMask) } // Entry represents a waiter that can be add to the a wait queue. It can // only be in one queue at a time, and is added "intrusively" to the queue with // no extra memory allocations. // // +stateify savable type Entry struct { waiterEntry // eventListener receives the notification. eventListener EventListener // mask should be immutable once queued. mask EventMask } // Init initializes the Entry. // // This must only be called when unregistered. func (e *Entry) Init(eventListener EventListener, mask EventMask) { e.eventListener = eventListener e.mask = mask } // Mask returns the entry mask. 
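//
// A minimal sketch of how an Entry is typically used with a Queue (both are
// defined in this package; ReadableEvents is the mask being waited for):
//
//	var q Queue
//	e, ch := NewChannelEntry(ReadableEvents)
//	q.EventRegister(&e)
//	q.Notify(ReadableEvents) // non-blocking send on ch
//	<-ch
//	q.EventUnregister(&e)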
func (e *Entry) Mask() EventMask { return e.mask } // NotifyEvent notifies the event listener. // // Mask should be the full set of active events. func (e *Entry) NotifyEvent(mask EventMask) { if m := mask & e.mask; m != 0 { e.eventListener.NotifyEvent(m) } } // ChannelNotifier is a simple channel-based notification. type ChannelNotifier chan struct{} // NotifyEvent implements waiter.EventListener.NotifyEvent. func (c ChannelNotifier) NotifyEvent(EventMask) { select { case c <- struct{}{}: default: } } // NewChannelEntry initializes a new Entry that does a non-blocking write to a // struct{} channel when the callback is called. It returns the new Entry // instance and the channel being used. func NewChannelEntry(mask EventMask) (e Entry, ch chan struct{}) { ch = make(chan struct{}, 1) e.Init(ChannelNotifier(ch), mask) return e, ch } type functionNotifier func(EventMask) // NotifyEvent implements waiter.EventListener.NotifyEvent. func (f functionNotifier) NotifyEvent(mask EventMask) { f(mask) } // NewFunctionEntry initializes a new Entry that calls the given function. func NewFunctionEntry(mask EventMask, fn func(EventMask)) (e Entry) { e.Init(functionNotifier(fn), mask) return e } // Queue represents the wait queue where waiters can be added and // notifiers can notify them when events happen. // // The zero value for waiter.Queue is an empty queue ready for use. // // +stateify savable type Queue struct { list waiterList mu sync.RWMutex `state:"nosave"` } // EventRegister adds a waiter to the wait queue. func (q *Queue) EventRegister(e *Entry) { q.mu.Lock() q.list.PushBack(e) q.mu.Unlock() } // EventUnregister removes the given waiter entry from the wait queue. func (q *Queue) EventUnregister(e *Entry) { q.mu.Lock() q.list.Remove(e) q.mu.Unlock() } // Notify notifies all waiters in the queue whose masks have at least one bit // in common with the notification mask. func (q *Queue) Notify(mask EventMask) { q.mu.RLock() for e := q.list.Front(); e != nil; e = e.Next() { m := mask & e.mask if m == 0 { continue } e.eventListener.NotifyEvent(m) // Skip intermediate call. } q.mu.RUnlock() } // Events returns the set of events being waited on. It is the union of the // masks of all registered entries. func (q *Queue) Events() EventMask { q.mu.RLock() defer q.mu.RUnlock() ret := EventMask(0) for e := q.list.Front(); e != nil; e = e.Next() { ret |= e.mask } return ret } // IsEmpty returns if the wait queue is empty or not. func (q *Queue) IsEmpty() bool { q.mu.RLock() defer q.mu.RUnlock() return q.list.Front() == nil } // AlwaysReady implements the Waitable interface but is always ready. Embedding // this struct into another struct makes it implement the boilerplate empty // functions automatically. type AlwaysReady struct { } // Readiness always returns the input mask because this object is always ready. func (*AlwaysReady) Readiness(mask EventMask) EventMask { return mask } // EventRegister doesn't do anything because this object doesn't need to issue // notifications because its readiness never changes. func (*AlwaysReady) EventRegister(*Entry) error { return nil } // EventUnregister doesn't do anything because this object doesn't need to issue // notifications because its readiness never changes. func (*AlwaysReady) EventUnregister(e *Entry) { } // NeverReady implements the Waitable interface but is never ready. Otherwise, // this is exactly the same as AlwaysReady. type NeverReady struct { } // Readiness always returns the input mask because this object is always ready. 
func (*NeverReady) Readiness(mask EventMask) EventMask { return mask } // EventRegister doesn't do anything because this object doesn't need to issue // notifications because its readiness never changes. func (*NeverReady) EventRegister(e *Entry) error { return nil } // EventUnregister doesn't do anything because this object doesn't need to issue // notifications because its readiness never changes. func (*NeverReady) EventUnregister(e *Entry) { } golang-gvisor-gvisor-0.0~20240729.0/pkg/waiter/waiter_list.go000066400000000000000000000117631465435605700236040ustar00rootroot00000000000000package waiter // ElementMapper provides an identity mapping by default. // // This can be replaced to provide a struct that maps elements to linker // objects, if they are not the same. An ElementMapper is not typically // required if: Linker is left as is, Element is left as is, or Linker and // Element are the same type. type waiterElementMapper struct{} // linkerFor maps an Element to a Linker. // // This default implementation should be inlined. // //go:nosplit func (waiterElementMapper) linkerFor(elem *Entry) *Entry { return elem } // List is an intrusive list. Entries can be added to or removed from the list // in O(1) time and with no additional memory allocations. // // The zero value for List is an empty list ready to use. // // To iterate over a list (where l is a List): // // for e := l.Front(); e != nil; e = e.Next() { // // do something with e. // } // // +stateify savable type waiterList struct { head *Entry tail *Entry } // Reset resets list l to the empty state. func (l *waiterList) Reset() { l.head = nil l.tail = nil } // Empty returns true iff the list is empty. // //go:nosplit func (l *waiterList) Empty() bool { return l.head == nil } // Front returns the first element of list l or nil. // //go:nosplit func (l *waiterList) Front() *Entry { return l.head } // Back returns the last element of list l or nil. // //go:nosplit func (l *waiterList) Back() *Entry { return l.tail } // Len returns the number of elements in the list. // // NOTE: This is an O(n) operation. // //go:nosplit func (l *waiterList) Len() (count int) { for e := l.Front(); e != nil; e = (waiterElementMapper{}.linkerFor(e)).Next() { count++ } return count } // PushFront inserts the element e at the front of list l. // //go:nosplit func (l *waiterList) PushFront(e *Entry) { linker := waiterElementMapper{}.linkerFor(e) linker.SetNext(l.head) linker.SetPrev(nil) if l.head != nil { waiterElementMapper{}.linkerFor(l.head).SetPrev(e) } else { l.tail = e } l.head = e } // PushFrontList inserts list m at the start of list l, emptying m. // //go:nosplit func (l *waiterList) PushFrontList(m *waiterList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { waiterElementMapper{}.linkerFor(l.head).SetPrev(m.tail) waiterElementMapper{}.linkerFor(m.tail).SetNext(l.head) l.head = m.head } m.head = nil m.tail = nil } // PushBack inserts the element e at the back of list l. // //go:nosplit func (l *waiterList) PushBack(e *Entry) { linker := waiterElementMapper{}.linkerFor(e) linker.SetNext(nil) linker.SetPrev(l.tail) if l.tail != nil { waiterElementMapper{}.linkerFor(l.tail).SetNext(e) } else { l.head = e } l.tail = e } // PushBackList inserts list m at the end of list l, emptying m. 
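//
// For example, appending one list to another is O(1) (e1 and e2 are
// illustrative *Entry values prepared by the caller):
//
//	var a, b waiterList
//	a.PushBack(e1)
//	b.PushBack(e2)
//	a.PushBackList(&b) // a now holds e1 then e2; b is left empty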
// //go:nosplit func (l *waiterList) PushBackList(m *waiterList) { if l.head == nil { l.head = m.head l.tail = m.tail } else if m.head != nil { waiterElementMapper{}.linkerFor(l.tail).SetNext(m.head) waiterElementMapper{}.linkerFor(m.head).SetPrev(l.tail) l.tail = m.tail } m.head = nil m.tail = nil } // InsertAfter inserts e after b. // //go:nosplit func (l *waiterList) InsertAfter(b, e *Entry) { bLinker := waiterElementMapper{}.linkerFor(b) eLinker := waiterElementMapper{}.linkerFor(e) a := bLinker.Next() eLinker.SetNext(a) eLinker.SetPrev(b) bLinker.SetNext(e) if a != nil { waiterElementMapper{}.linkerFor(a).SetPrev(e) } else { l.tail = e } } // InsertBefore inserts e before a. // //go:nosplit func (l *waiterList) InsertBefore(a, e *Entry) { aLinker := waiterElementMapper{}.linkerFor(a) eLinker := waiterElementMapper{}.linkerFor(e) b := aLinker.Prev() eLinker.SetNext(a) eLinker.SetPrev(b) aLinker.SetPrev(e) if b != nil { waiterElementMapper{}.linkerFor(b).SetNext(e) } else { l.head = e } } // Remove removes e from l. // //go:nosplit func (l *waiterList) Remove(e *Entry) { linker := waiterElementMapper{}.linkerFor(e) prev := linker.Prev() next := linker.Next() if prev != nil { waiterElementMapper{}.linkerFor(prev).SetNext(next) } else if l.head == e { l.head = next } if next != nil { waiterElementMapper{}.linkerFor(next).SetPrev(prev) } else if l.tail == e { l.tail = prev } linker.SetNext(nil) linker.SetPrev(nil) } // Entry is a default implementation of Linker. Users can add anonymous fields // of this type to their structs to make them automatically implement the // methods needed by List. // // +stateify savable type waiterEntry struct { next *Entry prev *Entry } // Next returns the entry that follows e in the list. // //go:nosplit func (e *waiterEntry) Next() *Entry { return e.next } // Prev returns the entry that precedes e in the list. // //go:nosplit func (e *waiterEntry) Prev() *Entry { return e.prev } // SetNext assigns 'entry' as the entry that follows e in the list. // //go:nosplit func (e *waiterEntry) SetNext(elem *Entry) { e.next = elem } // SetPrev assigns 'entry' as the entry that precedes e in the list. // //go:nosplit func (e *waiterEntry) SetPrev(elem *Entry) { e.prev = elem } golang-gvisor-gvisor-0.0~20240729.0/pkg/waiter/waiter_state_autogen.go000066400000000000000000000052101465435605700254610ustar00rootroot00000000000000// automatically generated by stateify. 
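// An illustrative usage sketch, not part of the gVisor sources above: it shows how the
// waiter.Queue, Entry, and NewChannelEntry API defined earlier is typically consumed. A
// caller registers a channel-backed Entry, the owner of the waited-on object calls Notify
// when readiness changes, and the waiter wakes up on the channel. waiter.ReadableEvents is
// assumed to be one of the EventMask constants defined elsewhere in the package; any
// EventMask value behaves the same way.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/waiter"
)

func main() {
	var q waiter.Queue

	// NewChannelEntry returns an Entry that performs a non-blocking send on a
	// buffered channel whenever a matching event is notified.
	e, ch := waiter.NewChannelEntry(waiter.ReadableEvents)
	q.EventRegister(&e)
	defer q.EventUnregister(&e)

	// Only entries whose mask overlaps the notification mask are woken.
	q.Notify(waiter.ReadableEvents)

	<-ch
	fmt.Println("woken by readable event")
}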
package waiter import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (e *Entry) StateTypeName() string { return "pkg/waiter.Entry" } func (e *Entry) StateFields() []string { return []string{ "waiterEntry", "eventListener", "mask", } } func (e *Entry) beforeSave() {} // +checklocksignore func (e *Entry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.waiterEntry) stateSinkObject.Save(1, &e.eventListener) stateSinkObject.Save(2, &e.mask) } func (e *Entry) afterLoad(context.Context) {} // +checklocksignore func (e *Entry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.waiterEntry) stateSourceObject.Load(1, &e.eventListener) stateSourceObject.Load(2, &e.mask) } func (q *Queue) StateTypeName() string { return "pkg/waiter.Queue" } func (q *Queue) StateFields() []string { return []string{ "list", } } func (q *Queue) beforeSave() {} // +checklocksignore func (q *Queue) StateSave(stateSinkObject state.Sink) { q.beforeSave() stateSinkObject.Save(0, &q.list) } func (q *Queue) afterLoad(context.Context) {} // +checklocksignore func (q *Queue) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &q.list) } func (l *waiterList) StateTypeName() string { return "pkg/waiter.waiterList" } func (l *waiterList) StateFields() []string { return []string{ "head", "tail", } } func (l *waiterList) beforeSave() {} // +checklocksignore func (l *waiterList) StateSave(stateSinkObject state.Sink) { l.beforeSave() stateSinkObject.Save(0, &l.head) stateSinkObject.Save(1, &l.tail) } func (l *waiterList) afterLoad(context.Context) {} // +checklocksignore func (l *waiterList) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &l.head) stateSourceObject.Load(1, &l.tail) } func (e *waiterEntry) StateTypeName() string { return "pkg/waiter.waiterEntry" } func (e *waiterEntry) StateFields() []string { return []string{ "next", "prev", } } func (e *waiterEntry) beforeSave() {} // +checklocksignore func (e *waiterEntry) StateSave(stateSinkObject state.Sink) { e.beforeSave() stateSinkObject.Save(0, &e.next) stateSinkObject.Save(1, &e.prev) } func (e *waiterEntry) afterLoad(context.Context) {} // +checklocksignore func (e *waiterEntry) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &e.next) stateSourceObject.Load(1, &e.prev) } func init() { state.Register((*Entry)(nil)) state.Register((*Queue)(nil)) state.Register((*waiterList)(nil)) state.Register((*waiterEntry)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/pkg/xdp/000077500000000000000000000000001465435605700202175ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/pkg/xdp/completionqueue.go000066400000000000000000000076671465435605700240040ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
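// An illustrative sketch, not part of the gVisor sources above: the generated state file
// above follows a fixed protocol for types marked "+stateify savable" — report a type name
// and field list, write each field to a state.Sink by slot index, read it back from a
// state.Source, and register the type in init(). The hand-written, hypothetical type below
// mirrors that shape; the exact interfaces the state package requires (including the
// beforeSave/afterLoad hooks) are assumed from the generated code shown here, and real
// gVisor types obtain these methods from the stateify generator rather than by hand.
package example

import (
	"context"

	"gvisor.dev/gvisor/pkg/state"
)

// counter is a hypothetical savable type with a single field.
type counter struct {
	n uint64
}

func (c *counter) StateTypeName() string { return "example.counter" }

func (c *counter) StateFields() []string { return []string{"n"} }

func (c *counter) beforeSave() {}

func (c *counter) StateSave(sink state.Sink) {
	c.beforeSave()
	sink.Save(0, &c.n) // slot index 0 corresponds to StateFields()[0].
}

func (c *counter) afterLoad(context.Context) {}

func (c *counter) StateLoad(ctx context.Context, source state.Source) {
	source.Load(0, &c.n)
}

func init() {
	state.Register((*counter)(nil))
}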
//go:build amd64 || arm64 // +build amd64 arm64 package xdp import ( "gvisor.dev/gvisor/pkg/atomicbitops" ) // The CompletionQueue is how the kernel tells a process which buffers have // been transmitted and can be reused. // // CompletionQueue is not thread-safe and requires external synchronization type CompletionQueue struct { // mem is the mmap'd area shared with the kernel. Many other fields of // this struct point into mem. mem []byte // ring is the actual ring buffer. It is a list of frame addresses // ready to be reused. // // len(ring) must be a power of 2. ring []uint64 // mask is used whenever indexing into ring. It is always len(ring)-1. // It prevents index out of bounds errors while allowing the producer // and consumer pointers to repeatedly "overflow" and loop back around // the ring. mask uint32 // producer points to the shared atomic value that indicates the last // produced descriptor. Only the kernel updates this value. producer *atomicbitops.Uint32 // consumer points to the shared atomic value that indicates the last // consumed descriptor. Only we update this value. consumer *atomicbitops.Uint32 // flags points to the shared atomic value that holds flags for the // queue. flags *atomicbitops.Uint32 // Cached values are used to avoid relatively expensive atomic // operations. They are used, incremented, and decremented multiple // times with non-atomic operations, and then "batch-updated" by // reading or writing atomically to synchronize with the kernel. // cachedProducer is updated when we atomically read *producer. cachedProducer uint32 // cachedConsumer is used to atomically write *consumer. cachedConsumer uint32 } // Peek returns the number of buffers available to reuse as well as the index // at which they start. Peek will only return a buffer once, so callers must // process any received buffers. func (cq *CompletionQueue) Peek() (nAvailable, index uint32) { // Get the number of available buffers and update cachedConsumer to // reflect that we're going to consume them. entries := cq.free() index = cq.cachedConsumer cq.cachedConsumer += entries return entries, index } func (cq *CompletionQueue) free() uint32 { // Return any buffers we know about without incurring an atomic // operation if possible. entries := cq.cachedProducer - cq.cachedConsumer // If we're not aware of any completed packets, refresh the producer // pointer to see whether the kernel enqueued anything. if entries == 0 { cq.cachedProducer = cq.producer.Load() entries = cq.cachedProducer - cq.cachedConsumer } return entries } // Release notifies the kernel that we have consumed nDone packets. func (cq *CompletionQueue) Release(nDone uint32) { // We don't have to use an atomic add because only we update this; the // kernel just reads it. cq.consumer.Store(cq.consumer.RacyLoad() + nDone) } // Get gets the descriptor at index. func (cq *CompletionQueue) Get(index uint32) uint64 { // Use mask to avoid overflowing and loop back around the ring. return cq.ring[index&cq.mask] } // FreeAll dequeues as many buffers as possible from the queue and returns them // to the UMEM. // // +checklocks:umem.mu func (cq *CompletionQueue) FreeAll(umem *UMEM) { available, index := cq.Peek() if available < 1 { return } for i := uint32(0); i < available; i++ { umem.FreeFrame(cq.Get(index + i)) } cq.Release(available) } golang-gvisor-gvisor-0.0~20240729.0/pkg/xdp/fillqueue.go000066400000000000000000000100721465435605700225410ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. 
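// An illustrative, self-contained sketch, not part of the gVisor sources above: the
// CompletionQueue just shown (and the other XDP rings below) relies on free-running uint32
// producer/consumer counters over a power-of-two ring, with mask = len(ring)-1, so the
// counters are never reduced modulo the ring size and the Peek/Get/Release arithmetic keeps
// working across wraparound. This standalone model demonstrates only that index arithmetic;
// the real queues share their counters with the kernel through mmap'd memory and cache them
// to avoid atomic operations.
package main

import "fmt"

// ring models a single-producer/single-consumer ring of uint64 entries.
type ring struct {
	buf      []uint64 // len(buf) must be a power of 2.
	mask     uint32   // always len(buf)-1.
	producer uint32   // free-running; wraps naturally.
	consumer uint32   // free-running; wraps naturally.
}

func newRing(size uint32) *ring {
	return &ring{buf: make([]uint64, size), mask: size - 1}
}

// push corresponds to what the kernel does when it completes a frame.
func (r *ring) push(v uint64) {
	r.buf[r.producer&r.mask] = v
	r.producer++
}

// peek mirrors CompletionQueue.Peek: how many entries are available and the
// index of the first one.
func (r *ring) peek() (n, index uint32) {
	return r.producer - r.consumer, r.consumer
}

// get and release mirror CompletionQueue.Get and Release.
func (r *ring) get(index uint32) uint64 { return r.buf[index&r.mask] }
func (r *ring) release(n uint32)        { r.consumer += n }

func main() {
	r := newRing(8)
	// Push more entries than the ring size to show the masked index wrapping.
	for i := uint64(0); i < 10; i++ {
		r.push(i)
		if n, idx := r.peek(); n > 0 {
			fmt.Println(r.get(idx))
			r.release(n)
		}
	}
}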
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || arm64 // +build amd64 arm64 package xdp import ( "gvisor.dev/gvisor/pkg/atomicbitops" ) // The FillQueue is how a process tells the kernel which buffers are available // to be filled by incoming packets. // // FillQueue is not thread-safe and requires external synchronization type FillQueue struct { // mem is the mmap'd area shared with the kernel. Many other fields of // this struct point into mem. mem []byte // ring is the actual ring buffer. It is a list of frame addresses // ready for incoming packets. // // len(ring) must be a power of 2. ring []uint64 // mask is used whenever indexing into ring. It is always len(ring)-1. // It prevents index out of bounds errors while allowing the producer // and consumer pointers to repeatedly "overflow" and loop back around // the ring. mask uint32 // producer points to the shared atomic value that indicates the last // produced descriptor. Only we update this value. producer *atomicbitops.Uint32 // consumer points to the shared atomic value that indicates the last // consumed descriptor. Only the kernel updates this value. consumer *atomicbitops.Uint32 // flags points to the shared atomic value that holds flags for the // queue. flags *atomicbitops.Uint32 // Cached values are used to avoid relatively expensive atomic // operations. They are used, incremented, and decremented multiple // times with non-atomic operations, and then "batch-updated" by // reading or writing atomically to synchronize with the kernel. // cachedProducer is used to atomically write *producer. cachedProducer uint32 // cachedConsumer is updated when we atomically read *consumer. // cachedConsumer is actually len(ring) larger than the real consumer // value. See free() for details. cachedConsumer uint32 } // free returns the number of free descriptors in the fill queue. func (fq *FillQueue) free(toReserve uint32) uint32 { // Try to find free descriptors without incurring an atomic operation. // // cachedConsumer is always len(fq.ring) larger than the real consumer // value. This lets us, in the common case, compute the number of free // descriptors simply via fq.cachedConsumer - fq.cachedProducer without // also adding len(fq.ring). if available := fq.cachedConsumer - fq.cachedProducer; available >= toReserve { return available } // If we didn't already have enough descriptors available, check // whether the kernel has returned some to us. fq.cachedConsumer = fq.consumer.Load() fq.cachedConsumer += uint32(len(fq.ring)) return fq.cachedConsumer - fq.cachedProducer } // Notify updates the producer such that it is visible to the kernel. func (fq *FillQueue) Notify() { fq.producer.Store(fq.cachedProducer) } // Set sets the fill queue's descriptor at index to addr. func (fq *FillQueue) Set(index uint32, addr uint64) { // Use mask to avoid overflowing and loop back around the ring. fq.ring[index&fq.mask] = addr } // FillAll posts as many empty buffers as possible for the kernel to fill, then // notifies the kernel. 
// // +checklocks:umem.mu func (fq *FillQueue) FillAll(umem *UMEM) { // Figure out how many buffers and queue slots are available. available := fq.free(umem.nFreeFrames) if available == 0 { return } if available > umem.nFreeFrames { available = umem.nFreeFrames } // Fill the queue as much as possible and notify the kernel. index := fq.cachedProducer fq.cachedProducer += available for i := uint32(0); i < available; i++ { fq.Set(index+i, umem.AllocFrame()) } fq.Notify() } golang-gvisor-gvisor-0.0~20240729.0/pkg/xdp/rxqueue.go000066400000000000000000000071111465435605700222440ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || arm64 // +build amd64 arm64 package xdp import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" ) // The RXQueue is how the kernel tells a process which buffers are full with // incoming packets. // // RXQueue is not thread-safe and requires external synchronization type RXQueue struct { // mem is the mmap'd area shared with the kernel. Many other fields of // this struct point into mem. mem []byte // ring is the actual ring buffer. It is a list of XDP descriptors // pointing to incoming packets. // // len(ring) must be a power of 2. ring []unix.XDPDesc // mask is used whenever indexing into ring. It is always len(ring)-1. // It prevents index out of bounds errors while allowing the producer // and consumer pointers to repeatedly "overflow" and loop back around // the ring. mask uint32 // producer points to the shared atomic value that indicates the last // produced descriptor. Only the kernel updates this value. producer *atomicbitops.Uint32 // consumer points to the shared atomic value that indicates the last // consumed descriptor. Only we update this value. consumer *atomicbitops.Uint32 // flags points to the shared atomic value that holds flags for the // queue. flags *atomicbitops.Uint32 // Cached values are used to avoid relatively expensive atomic // operations. They are used, incremented, and decremented multiple // times with non-atomic operations, and then "batch-updated" by // reading or writing atomically to synchronize with the kernel. // cachedProducer is updated when we atomically read *producer. cachedProducer uint32 // cachedConsumer is used to atomically write *consumer. cachedConsumer uint32 } // Peek returns the number of packets available to read as well as the index at // which they start. Peek will only return a packet once, so callers must // process any received packets. func (rq *RXQueue) Peek() (nReceived, index uint32) { // Get the number of available buffers and update cachedConsumer to // reflect that we're going to consume them. entries := rq.free() index = rq.cachedConsumer rq.cachedConsumer += entries return entries, index } func (rq *RXQueue) free() uint32 { // Return any buffers we know about without incurring an atomic // operation if possible. 
entries := rq.cachedProducer - rq.cachedConsumer // If we're not aware of any RX'd packets, refresh the producer pointer // to see whether the kernel enqueued anything. if entries == 0 { rq.cachedProducer = rq.producer.Load() entries = rq.cachedProducer - rq.cachedConsumer } return entries } // Release notifies the kernel that we have consumed nDone packets. func (rq *RXQueue) Release(nDone uint32) { // We don't have to use an atomic add because only we update this; the // kernel just reads it. rq.consumer.Store(rq.consumer.RacyLoad() + nDone) } // Get gets the descriptor at index. func (rq *RXQueue) Get(index uint32) unix.XDPDesc { // Use mask to avoid overflowing and loop back around the ring. return rq.ring[index&rq.mask] } golang-gvisor-gvisor-0.0~20240729.0/pkg/xdp/txqueue.go000066400000000000000000000077121465435605700222550ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || arm64 // +build amd64 arm64 package xdp import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" ) // The TXQueue is how a process tells the kernel which buffers are available to // be sent via the NIC. // // TXQueue is not thread-safe and requires external synchronization type TXQueue struct { // sockfd is the underlying AF_XDP socket. sockfd uint32 // mem is the mmap'd area shared with the kernel. Many other fields of // this struct point into mem. mem []byte // ring is the actual ring buffer. It is a list of XDP descriptors // pointing to ready-to-transmit packets. // // len(ring) must be a power of 2. ring []unix.XDPDesc // mask is used whenever indexing into ring. It is always len(ring)-1. // It prevents index out of bounds errors while allowing the producer // and consumer pointers to repeatedly "overflow" and loop back around // the ring. mask uint32 // producer points to the shared atomic value that indicates the last // produced descriptor. Only we update this value. producer *atomicbitops.Uint32 // consumer points to the shared atomic value that indicates the last // consumed descriptor. Only the kernel updates this value. consumer *atomicbitops.Uint32 // flags points to the shared atomic value that holds flags for the // queue. flags *atomicbitops.Uint32 // Cached values are used to avoid relatively expensive atomic // operations. They are used, incremented, and decremented multiple // times with non-atomic operations, and then "batch-updated" by // reading or writing atomically to synchronize with the kernel. // cachedProducer is used to atomically write *producer. cachedProducer uint32 // cachedConsumer is updated when we atomically read *consumer. // cachedConsumer is actually len(ring) larger than the real consumer // value. See free() for details. cachedConsumer uint32 } // Reserve reserves descriptors in the queue. If toReserve descriptors cannot // be reserved, none are reserved. 
// // +checklocks:umem.mu func (tq *TXQueue) Reserve(umem *UMEM, toReserve uint32) (nReserved, index uint32) { if umem.nFreeFrames < toReserve || tq.free(toReserve) < toReserve { return 0, 0 } idx := tq.cachedProducer tq.cachedProducer += toReserve return toReserve, idx } // free returns the number of free descriptors in the TX queue. func (tq *TXQueue) free(toReserve uint32) uint32 { // Try to find free descriptors without incurring an atomic operation. // // cachedConsumer is always len(tq.ring) larger than the real consumer // value. This lets us, in the common case, compute the number of free // descriptors simply via tq.cachedConsumer - tq.cachedProducer without // also addign len(tq.ring). if available := tq.cachedConsumer - tq.cachedProducer; available >= toReserve { return available } // If we didn't already have enough descriptors available, check // whether the kernel has returned some to us. tq.cachedConsumer = tq.consumer.Load() tq.cachedConsumer += uint32(len(tq.ring)) return tq.cachedConsumer - tq.cachedProducer } // Notify updates the producer such that it is visible to the kernel. func (tq *TXQueue) Notify() { tq.producer.Store(tq.cachedProducer) tq.kick() } // Set sets the TX queue's descriptor at index to addr. func (tq *TXQueue) Set(index uint32, desc unix.XDPDesc) { // Use mask to avoid overflowing and loop back around the ring. tq.ring[index&tq.mask] = desc } golang-gvisor-gvisor-0.0~20240729.0/pkg/xdp/umem.go000066400000000000000000000056701465435605700215210ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || arm64 // +build amd64 arm64 package xdp import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sync" ) // TODO(b/240191988): There's some kind of memory corruption bug that occurs // occasionally. This occurred even before TX was supported. // TODO(b/240191988): We can hold locks for less time if we accept a more // obtuse API. For example, CompletionQueue.FreeAll doesn't need to hold a // mutex for its entire duration. // UMEM is the shared memory area that the kernel and userspace put packets in. type UMEM struct { // mem is the mmap'd area shared with the kernel. mem []byte // sockfd is the underlying AF_XDP socket. sockfd uint32 // frameMask masks the lower bits of an address to get the frame's // address. frameMask uint64 // mu protects frameAddresses and nFreeFrames. mu sync.Mutex // frameAddresses is a stack of available frame addresses. // +checklocks:mu frameAddresses []uint64 // nFreeFrames is the number of frames available and is used to index // into frameAddresses. // +checklocks:mu nFreeFrames uint32 } // SockFD returns the underlying AF_XDP socket FD. func (um *UMEM) SockFD() uint32 { return um.sockfd } // Lock locks the UMEM. // // +checklocksacquire:um.mu func (um *UMEM) Lock() { um.mu.Lock() } // Unlock unlocks the UMEM. 
// // +checklocksrelease:um.mu func (um *UMEM) Unlock() { um.mu.Unlock() } // FreeFrame returns the frame containing addr to the set of free frames. // // The UMEM must be locked during the call to FreeFrame. // // +checklocks:um.mu func (um *UMEM) FreeFrame(addr uint64) { um.frameAddresses[um.nFreeFrames] = addr um.nFreeFrames++ } // AllocFrame returns the address of a frame that can be enqueued to the fill // or TX queue. It will panic if there are no frames left, so callers must call // it no more than the number of buffers reserved via TXQueue.Reserve(). // // The UMEM must be locked during the call to AllocFrame. // // +checklocks:um.mu func (um *UMEM) AllocFrame() uint64 { um.nFreeFrames-- return um.frameAddresses[um.nFreeFrames] & um.frameMask } // Get gets the bytes of the packet pointed to by desc. func (um *UMEM) Get(desc unix.XDPDesc) []byte { end := desc.Addr + uint64(desc.Len) if desc.Addr&um.frameMask != (end-1)&um.frameMask { panic(fmt.Sprintf("UMEM (%+v) access crosses frame boundaries: %+v", um, desc)) } return um.mem[desc.Addr:end] } golang-gvisor-gvisor-0.0~20240729.0/pkg/xdp/xdp.go000066400000000000000000000247751465435605700213600ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || arm64 // +build amd64 arm64 // Package xdp provides tools for working with AF_XDP sockets. // // AF_XDP shares a memory area (UMEM) with the kernel to pass packets // back and forth. Communication is done via a number of queues. // Briefly, the queues work as follows: // // - Receive: Userspace adds a descriptor to the fill queue. The // descriptor points to an area of the UMEM that the kernel should fill // with an incoming packet. The packet is filled by the kernel, which // places a descriptor to the same UMEM area in the RX queue, signifying // that userspace may read the packet. // - Transmit: Userspace adds a descriptor to TX queue. The kernel // sends the packet (stored in UMEM) pointed to by the descriptor. // Upon completion, the kernel places a descriptor in the completion // queue to notify userspace that the packet is sent and the UMEM // area can be reused. // // So in short: RX packets move from the fill to RX queue, and TX // packets move from the TX to completion queue. // // Note that the shared UMEM for RX and TX means that packet forwarding // can be done without copying; only the queues need to be updated to point to // the packet in UMEM. package xdp import ( "fmt" "math/bits" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" ) // A ControlBlock contains all the control structures necessary to use an // AF_XDP socket. // // The ControlBlock and the structures it contains are meant to be used with a // single RX goroutine and a single TX goroutine. type ControlBlock struct { UMEM UMEM Fill FillQueue RX RXQueue TX TXQueue Completion CompletionQueue } // Opts configure an AF_XDP socket. 
type Opts struct { NFrames uint32 FrameSize uint32 NDescriptors uint32 Bind bool UseNeedWakeup bool } // DefaultOpts provides recommended default options for initializing an AF_XDP // socket. AF_XDP setup is extremely finnicky and can fail if incorrect values // are used. func DefaultOpts() Opts { return Opts{ NFrames: 4096, // Frames must be 2048 or 4096 bytes, although not all drivers support // both. FrameSize: 4096, NDescriptors: 2048, } } // New returns an initialized AF_XDP socket bound to a particular interface and // queue. func New(ifaceIdx, queueID uint32, opts Opts) (*ControlBlock, error) { sockfd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) if err != nil { return nil, fmt.Errorf("failed to create AF_XDP socket: %v", err) } return NewFromSocket(sockfd, ifaceIdx, queueID, opts) } // NewFromSocket takes an AF_XDP socket, initializes it, and binds it to a // particular interface and queue. func NewFromSocket(sockfd int, ifaceIdx, queueID uint32, opts Opts) (*ControlBlock, error) { if opts.FrameSize != 2048 && opts.FrameSize != 4096 { return nil, fmt.Errorf("invalid frame size %d: must be either 2048 or 4096", opts.FrameSize) } if bits.OnesCount32(opts.NDescriptors) != 1 { return nil, fmt.Errorf("invalid number of descriptors %d: must be a power of 2", opts.NDescriptors) } var cb ControlBlock // Create the UMEM area. Use mmap instead of make([[]byte) to ensure // that the UMEM is page-aligned. Aligning the UMEM keeps individual // packets from spilling over between pages. var zerofd uintptr umemMemory, err := memutil.MapSlice( 0, uintptr(opts.NFrames*opts.FrameSize), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS, zerofd-1, 0, ) if err != nil { return nil, fmt.Errorf("failed to mmap umem: %v", err) } cleanup := cleanup.Make(func() { memutil.UnmapSlice(umemMemory) }) if sliceBackingPointer(umemMemory)%uintptr(unix.Getpagesize()) != 0 { return nil, fmt.Errorf("UMEM is not page aligned (address 0x%x)", sliceBackingPointer(umemMemory)) } cb.UMEM = UMEM{ mem: umemMemory, sockfd: uint32(sockfd), frameAddresses: make([]uint64, opts.NFrames), nFreeFrames: opts.NFrames, frameMask: ^(uint64(opts.FrameSize) - 1), } // Fill in each frame address. for i := range cb.UMEM.frameAddresses { cb.UMEM.frameAddresses[i] = uint64(i) * uint64(opts.FrameSize) } // Check whether we're likely to fail due to RLIMIT_MEMLOCK. var rlimit unix.Rlimit if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlimit); err != nil { return nil, fmt.Errorf("failed to get rlimit for memlock: %v", err) } if rlimit.Cur < uint64(len(cb.UMEM.mem)) { log.Infof("UMEM size (%d) may exceed RLIMIT_MEMLOCK (%+v) and cause registration to fail", len(cb.UMEM.mem), rlimit) } reg := unix.XDPUmemReg{ Addr: uint64(sliceBackingPointer(umemMemory)), Len: uint64(len(umemMemory)), Size: opts.FrameSize, // Not useful in the RX path. Headroom: 0, // TODO(b/240191988): Investigate use of SHARED flag. Flags: 0, } if err := registerUMEM(sockfd, reg); err != nil { return nil, fmt.Errorf("failed to register UMEM: %v", err) } // Set the number of descriptors in the fill queue. if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_UMEM_FILL_RING, int(opts.NDescriptors)); err != nil { return nil, fmt.Errorf("failed to register fill ring: %v", err) } // Set the number of descriptors in the completion queue. 
if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_UMEM_COMPLETION_RING, int(opts.NDescriptors)); err != nil { return nil, fmt.Errorf("failed to register completion ring: %v", err) } // Set the number of descriptors in the RX queue. if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_RX_RING, int(opts.NDescriptors)); err != nil { return nil, fmt.Errorf("failed to register RX queue: %v", err) } // Set the number of descriptors in the TX queue. if err := unix.SetsockoptInt(sockfd, unix.SOL_XDP, unix.XDP_TX_RING, int(opts.NDescriptors)); err != nil { return nil, fmt.Errorf("failed to register TX queue: %v", err) } // Get offset information for the queues. Offsets indicate where, once // we mmap space for each queue, values in the queue are. They give // offsets for the shared pointers, a shared flags value, and the // beginning of the ring of descriptors. off, err := getOffsets(sockfd) if err != nil { return nil, fmt.Errorf("failed to get offsets: %v", err) } // Allocate space for the fill queue. fillQueueMem, err := memutil.MapSlice( 0, uintptr(off.Fr.Desc+uint64(opts.NDescriptors)*sizeOfFillQueueDesc()), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED|unix.MAP_POPULATE, uintptr(sockfd), unix.XDP_UMEM_PGOFF_FILL_RING, ) if err != nil { return nil, fmt.Errorf("failed to mmap fill queue: %v", err) } cleanup.Add(func() { memutil.UnmapSlice(fillQueueMem) }) // Setup the fillQueue with offsets into allocated memory. cb.Fill = FillQueue{ mem: fillQueueMem, mask: opts.NDescriptors - 1, cachedConsumer: opts.NDescriptors, } cb.Fill.init(off, opts) // Allocate space for the completion queue. completionQueueMem, err := memutil.MapSlice( 0, uintptr(off.Cr.Desc+uint64(opts.NDescriptors)*sizeOfCompletionQueueDesc()), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED|unix.MAP_POPULATE, uintptr(sockfd), unix.XDP_UMEM_PGOFF_COMPLETION_RING, ) if err != nil { return nil, fmt.Errorf("failed to mmap completion queue: %v", err) } cleanup.Add(func() { memutil.UnmapSlice(completionQueueMem) }) // Setup the completionQueue with offsets into allocated memory. cb.Completion = CompletionQueue{ mem: completionQueueMem, mask: opts.NDescriptors - 1, } cb.Completion.init(off, opts) // Allocate space for the RX queue. rxQueueMem, err := memutil.MapSlice( 0, uintptr(off.Rx.Desc+uint64(opts.NDescriptors)*sizeOfRXQueueDesc()), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED|unix.MAP_POPULATE, uintptr(sockfd), unix.XDP_PGOFF_RX_RING, ) if err != nil { return nil, fmt.Errorf("failed to mmap RX queue: %v", err) } cleanup.Add(func() { memutil.UnmapSlice(rxQueueMem) }) // Setup the rxQueue with offsets into allocated memory. cb.RX = RXQueue{ mem: rxQueueMem, mask: opts.NDescriptors - 1, } cb.RX.init(off, opts) // Allocate space for the TX queue. txQueueMem, err := memutil.MapSlice( 0, uintptr(off.Tx.Desc+uint64(opts.NDescriptors)*sizeOfTXQueueDesc()), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED|unix.MAP_POPULATE, uintptr(sockfd), unix.XDP_PGOFF_TX_RING, ) if err != nil { return nil, fmt.Errorf("failed to mmap tx queue: %v", err) } cleanup.Add(func() { memutil.UnmapSlice(txQueueMem) }) // Setup the txQueue with offsets into allocated memory. cb.TX = TXQueue{ sockfd: uint32(sockfd), mem: txQueueMem, mask: opts.NDescriptors - 1, cachedConsumer: opts.NDescriptors, } cb.TX.init(off, opts) // In some cases we don't call bind, as we're not in the netns with the // device. In those cases, another process with the same socket will // bind for us. 
if opts.Bind { if err := Bind(sockfd, ifaceIdx, queueID, opts.UseNeedWakeup); err != nil { return nil, fmt.Errorf("failed to bind to interface %d: %v", ifaceIdx, err) } } cleanup.Release() return &cb, nil } // Bind binds a socket to a particular network interface and queue. func Bind(sockfd int, ifindex, queueID uint32, useNeedWakeup bool) error { var flags uint16 if useNeedWakeup { flags |= unix.XDP_USE_NEED_WAKEUP } addr := unix.SockaddrXDP{ // XDP_USE_NEED_WAKEUP lets the driver sleep if there is no // work to do. It will need to be woken by poll. It is expected // that this improves performance by preventing the driver from // burning cycles. // // By not setting either XDP_COPY or XDP_ZEROCOPY, we instruct // the kernel to use zerocopy if available and then fallback to // copy mode. Flags: flags, Ifindex: ifindex, // AF_XDP sockets are per device RX queue, although multiple // sockets on multiple queues (or devices) can share a single // UMEM. QueueID: queueID, // We're not using shared mode, so the value here is irrelevant. SharedUmemFD: 0, } return unix.Bind(sockfd, &addr) } golang-gvisor-gvisor-0.0~20240729.0/pkg/xdp/xdp_state_autogen.go000066400000000000000000000004721465435605700242660ustar00rootroot00000000000000// automatically generated by stateify. //go:build (amd64 || arm64) && (amd64 || arm64) && (amd64 || arm64) && (amd64 || arm64) && (amd64 || arm64) && (amd64 || arm64) // +build amd64 arm64 // +build amd64 arm64 // +build amd64 arm64 // +build amd64 arm64 // +build amd64 arm64 // +build amd64 arm64 package xdp golang-gvisor-gvisor-0.0~20240729.0/pkg/xdp/xdp_unsafe.go000066400000000000000000000115451465435605700227100ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
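// An illustrative sketch, not part of the gVisor sources above: one plausible receive loop
// built on the xdp package API defined in this directory. It assumes ifaceIdx and queueID
// (hypothetical constants below) refer to a real device queue, that the process has the
// privileges AF_XDP requires, and that an XDP program redirecting packets to this socket is
// already attached, which is outside this package's scope. Error handling and shutdown are
// elided, the loop busy-polls instead of blocking, and the frame-recycling strategy shown is
// deliberately simplistic.
package main

import (
	"log"

	"gvisor.dev/gvisor/pkg/xdp"
)

const (
	ifaceIdx = 2 // hypothetical interface index
	queueID  = 0 // hypothetical RX queue
)

func main() {
	opts := xdp.DefaultOpts()
	opts.Bind = true // have New bind the socket to the device queue
	cb, err := xdp.New(ifaceIdx, queueID, opts)
	if err != nil {
		log.Fatalf("xdp.New: %v", err)
	}

	// Give the kernel frames to fill with incoming packets.
	cb.UMEM.Lock()
	cb.Fill.FillAll(&cb.UMEM)
	cb.UMEM.Unlock()

	for {
		nReceived, index := cb.RX.Peek()
		if nReceived == 0 {
			continue // busy-poll; a real user would block or back off.
		}
		for i := uint32(0); i < nReceived; i++ {
			desc := cb.RX.Get(index + i)
			pkt := cb.UMEM.Get(desc)
			log.Printf("received %d byte packet", len(pkt))

			// Return the frame and repost it to the fill queue;
			// AllocFrame masks addresses back to the frame boundary
			// when the frame is reused.
			cb.UMEM.Lock()
			cb.UMEM.FreeFrame(desc.Addr)
			cb.Fill.FillAll(&cb.UMEM)
			cb.UMEM.Unlock()
		}
		cb.RX.Release(nReceived)
	}
}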
package xdp import ( "fmt" "reflect" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/atomicbitops" ) func registerUMEM(fd int, reg unix.XDPUmemReg) error { if _, _, errno := unix.Syscall6(unix.SYS_SETSOCKOPT, uintptr(fd), unix.SOL_XDP, unix.XDP_UMEM_REG, uintptr(unsafe.Pointer(®)), unsafe.Sizeof(reg), 0); errno != 0 { return fmt.Errorf("failed to setsockopt(XDP_UMEM_REG): errno %d", errno) } return nil } func getOffsets(fd int) (unix.XDPMmapOffsets, error) { var off unix.XDPMmapOffsets size := unsafe.Sizeof(off) if _, _, errno := unix.Syscall6(unix.SYS_GETSOCKOPT, uintptr(fd), unix.SOL_XDP, unix.XDP_MMAP_OFFSETS, uintptr(unsafe.Pointer(&off)), uintptr(unsafe.Pointer(&size)), 0); errno != 0 { return unix.XDPMmapOffsets{}, fmt.Errorf("failed to get offsets: %v", errno) } else if unsafe.Sizeof(off) != size { return unix.XDPMmapOffsets{}, fmt.Errorf("expected optlen of %d, but found %d", unsafe.Sizeof(off), size) } return off, nil } func sliceBackingPointer(slice []byte) uintptr { return uintptr(unsafe.Pointer(&slice[0])) } func sizeOfFillQueueDesc() uint64 { return uint64(unsafe.Sizeof(uint64(0))) } func sizeOfRXQueueDesc() uint64 { return uint64(unsafe.Sizeof(unix.XDPDesc{})) } func sizeOfCompletionQueueDesc() uint64 { return uint64(unsafe.Sizeof(uint64(0))) } func sizeOfTXQueueDesc() uint64 { return uint64(unsafe.Sizeof(unix.XDPDesc{})) } func (fq *FillQueue) init(off unix.XDPMmapOffsets, opts Opts) { fillQueueRingHdr := (*reflect.SliceHeader)(unsafe.Pointer(&fq.ring)) fillQueueRingHdr.Data = uintptr(unsafe.Pointer(&fq.mem[off.Fr.Desc])) fillQueueRingHdr.Len = int(opts.NDescriptors) fillQueueRingHdr.Cap = fillQueueRingHdr.Len fq.producer = (*atomicbitops.Uint32)(unsafe.Pointer(&fq.mem[off.Fr.Producer])) fq.consumer = (*atomicbitops.Uint32)(unsafe.Pointer(&fq.mem[off.Fr.Consumer])) fq.flags = (*atomicbitops.Uint32)(unsafe.Pointer(&fq.mem[off.Fr.Flags])) } func (rq *RXQueue) init(off unix.XDPMmapOffsets, opts Opts) { rxQueueRingHdr := (*reflect.SliceHeader)(unsafe.Pointer(&rq.ring)) rxQueueRingHdr.Data = uintptr(unsafe.Pointer(&rq.mem[off.Rx.Desc])) rxQueueRingHdr.Len = int(opts.NDescriptors) rxQueueRingHdr.Cap = rxQueueRingHdr.Len rq.producer = (*atomicbitops.Uint32)(unsafe.Pointer(&rq.mem[off.Rx.Producer])) rq.consumer = (*atomicbitops.Uint32)(unsafe.Pointer(&rq.mem[off.Rx.Consumer])) rq.flags = (*atomicbitops.Uint32)(unsafe.Pointer(&rq.mem[off.Rx.Flags])) // These probably don't have to be atomic, but we're only loading once // so better safe than sorry. rq.cachedProducer = rq.producer.Load() rq.cachedConsumer = rq.consumer.Load() } func (cq *CompletionQueue) init(off unix.XDPMmapOffsets, opts Opts) { completionQueueRingHdr := (*reflect.SliceHeader)(unsafe.Pointer(&cq.ring)) completionQueueRingHdr.Data = uintptr(unsafe.Pointer(&cq.mem[off.Cr.Desc])) completionQueueRingHdr.Len = int(opts.NDescriptors) completionQueueRingHdr.Cap = completionQueueRingHdr.Len cq.producer = (*atomicbitops.Uint32)(unsafe.Pointer(&cq.mem[off.Cr.Producer])) cq.consumer = (*atomicbitops.Uint32)(unsafe.Pointer(&cq.mem[off.Cr.Consumer])) cq.flags = (*atomicbitops.Uint32)(unsafe.Pointer(&cq.mem[off.Cr.Flags])) // These probably don't have to be atomic, but we're only loading once // so better safe than sorry. 
cq.cachedProducer = cq.producer.Load() cq.cachedConsumer = cq.consumer.Load() } func (tq *TXQueue) init(off unix.XDPMmapOffsets, opts Opts) { txQueueRingHdr := (*reflect.SliceHeader)(unsafe.Pointer(&tq.ring)) txQueueRingHdr.Data = uintptr(unsafe.Pointer(&tq.mem[off.Tx.Desc])) txQueueRingHdr.Len = int(opts.NDescriptors) txQueueRingHdr.Cap = txQueueRingHdr.Len tq.producer = (*atomicbitops.Uint32)(unsafe.Pointer(&tq.mem[off.Tx.Producer])) tq.consumer = (*atomicbitops.Uint32)(unsafe.Pointer(&tq.mem[off.Tx.Consumer])) tq.flags = (*atomicbitops.Uint32)(unsafe.Pointer(&tq.mem[off.Tx.Flags])) } // kick notifies the kernel that there are packets to transmit. func (tq *TXQueue) kick() error { if tq.flags.RacyLoad()&unix.XDP_RING_NEED_WAKEUP == 0 { return nil } var msg unix.Msghdr if _, _, errno := unix.Syscall6(unix.SYS_SENDMSG, uintptr(tq.sockfd), uintptr(unsafe.Pointer(&msg)), unix.MSG_DONTWAIT|unix.MSG_NOSIGNAL, 0, 0, 0); errno != 0 { return fmt.Errorf("failed to kick TX queue via sendmsg: errno %d", errno) } return nil } golang-gvisor-gvisor-0.0~20240729.0/pkg/xdp/xdp_unsafe_state_autogen.go000066400000000000000000000000651465435605700256250ustar00rootroot00000000000000// automatically generated by stateify. package xdp golang-gvisor-gvisor-0.0~20240729.0/runsc/000077500000000000000000000000001465435605700177755ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/000077500000000000000000000000001465435605700207405ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/autosave.go000066400000000000000000000076201465435605700231230ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "bytes" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/state" "gvisor.dev/gvisor/pkg/sentry/strace" "gvisor.dev/gvisor/pkg/sync" ) func getSaveOpts(l *Loader, k *kernel.Kernel, isResume bool) state.SaveOpts { t, _ := state.CPUTime() log.Infof("Before save CPU usage: %s", t.String()) saveOpts := state.SaveOpts{ Key: nil, Resume: isResume, Callback: func(err error) { t1, _ := state.CPUTime() log.Infof("Save CPU usage: %s", (t1 - t).String()) if err == nil { log.Infof("Save succeeded: exiting...") k.SetSaveSuccess(true) } else { log.Warningf("Save failed: exiting... %v", err) k.SetSaveError(err) } if !isResume { // Kill the sandbox. k.Kill(linux.WaitStatusExit(0)) } }, } return saveOpts } func getTargetForSaveResume(l *Loader) func(k *kernel.Kernel) { return func(k *kernel.Kernel) { saveOpts := getSaveOpts(l, k, true /* isResume */) // Store the state file contents in a buffer for save-resume. // There is no need to verify the state file, we just need the // sandbox to continue running after save. 
var buf bytes.Buffer saveOpts.Destination = &buf saveOpts.Save(k.SupervisorContext(), k, l.watchdog) } } func getTargetForSaveRestore(l *Loader, files []*fd.FD) func(k *kernel.Kernel) { if len(files) != 1 && len(files) != 3 { panic(fmt.Sprintf("Unexpected number of files: %v", len(files))) } var once sync.Once return func(k *kernel.Kernel) { once.Do(func() { saveOpts := getSaveOpts(l, k, false /* isResume */) saveOpts.Destination = files[0] if len(files) == 3 { saveOpts.PagesMetadata = files[1] saveOpts.PagesFile = files[2] } saveOpts.Save(k.SupervisorContext(), k, l.watchdog) }) } } // enableAutosave enables auto save restore in syscall tests. func enableAutosave(l *Loader, isResume bool, files []*fd.FD) error { var target func(k *kernel.Kernel) if isResume { target = getTargetForSaveResume(l) } else { target = getTargetForSaveRestore(l, files) } for _, table := range kernel.SyscallTables() { sys, ok := strace.Lookup(table.OS, table.Arch) if !ok { continue } if err := configureInitSyscall(table, sys, "init_module", kernel.ExternalAfterEnable); err != nil { return err } // Set external args to our closure above. table.External = target } return nil } // configureInitSyscall sets the trigger for the S/R syscall tests and the callback // method to be called after the sycall is executed. func configureInitSyscall(table *kernel.SyscallTable, sys strace.SyscallMap, initSyscall string, syscallFlag uint32) error { sl := make(map[uintptr]bool) sysno, ok := sys.ConvertToSysno(initSyscall) if !ok { return fmt.Errorf("syscall %q not found", initSyscall) } sl[sysno] = true log.Infof("sysno %v name %v", sysno, initSyscall) table.FeatureEnable.Enable(syscallFlag, sl, false) table.ExternalFilterBefore = func(*kernel.Task, uintptr, arch.SyscallArguments) bool { return false } // Sets ExternalFilterAfter to true which calls the closure assigned to // External after the syscall is executed. table.ExternalFilterAfter = func(*kernel.Task, uintptr, arch.SyscallArguments) bool { return true } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/boot_amd64_state_autogen.go000066400000000000000000000000661465435605700261510ustar00rootroot00000000000000// automatically generated by stateify. package boot golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/boot_arm64_state_autogen.go000066400000000000000000000000661465435605700261670ustar00rootroot00000000000000// automatically generated by stateify. package boot golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/boot_impl_state_autogen.go000066400000000000000000000001321465435605700261710ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package boot golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/boot_state_autogen.go000066400000000000000000000016451465435605700251620ustar00rootroot00000000000000// automatically generated by stateify. 
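// An illustrative sketch, not part of the gVisor sources above: given the trigger configured
// by enableAutosave/configureInitSyscall (the "init_module" syscall with an after-execution
// callback), a test running inside the sandbox could request an auto-save simply by issuing
// that syscall. This assumes the callback fires after the syscall completes regardless of its
// result; the syscall itself is expected to fail with an error, which the test ignores.
package main

import (
	"log"

	"golang.org/x/sys/unix"
)

func main() {
	// The arguments are irrelevant for triggering purposes; only the fact that
	// the syscall was executed matters to the sentry-side hook.
	if _, _, errno := unix.Syscall(unix.SYS_INIT_MODULE, 0, 0, 0); errno != 0 {
		log.Printf("init_module returned errno %d (expected; used only as a trigger)", errno)
	}
}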
package boot import ( "context" "gvisor.dev/gvisor/pkg/state" ) func (f *sandboxNetstackCreator) StateTypeName() string { return "runsc/boot.sandboxNetstackCreator" } func (f *sandboxNetstackCreator) StateFields() []string { return []string{ "clock", "allowPacketEndpointWrite", } } func (f *sandboxNetstackCreator) beforeSave() {} // +checklocksignore func (f *sandboxNetstackCreator) StateSave(stateSinkObject state.Sink) { f.beforeSave() stateSinkObject.Save(0, &f.clock) stateSinkObject.Save(1, &f.allowPacketEndpointWrite) } func (f *sandboxNetstackCreator) afterLoad(context.Context) {} // +checklocksignore func (f *sandboxNetstackCreator) StateLoad(ctx context.Context, stateSourceObject state.Source) { stateSourceObject.Load(0, &f.clock) stateSourceObject.Load(1, &f.allowPacketEndpointWrite) } func init() { state.Register((*sandboxNetstackCreator)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/compat.go000066400000000000000000000116031465435605700225530ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "fmt" "os" "golang.org/x/sys/unix" "google.golang.org/protobuf/proto" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" "gvisor.dev/gvisor/pkg/sentry/strace" spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" "gvisor.dev/gvisor/pkg/sync" ) func initCompatLogs(fd int) error { ce, err := newCompatEmitter(fd) if err != nil { return err } eventchannel.AddEmitter(ce) return nil } type compatEmitter struct { sink *log.BasicLogger nameMap strace.SyscallMap // mu protects the fields below. mu sync.Mutex // trackers map syscall number to the respective tracker instance. // Protected by 'mu'. trackers map[uint64]syscallTracker } func newCompatEmitter(logFD int) (*compatEmitter, error) { nameMap, ok := getSyscallNameMap() if !ok { return nil, fmt.Errorf("syscall table not found") } c := &compatEmitter{ // Always logs to default logger. sink: log.Log(), nameMap: nameMap, trackers: make(map[uint64]syscallTracker), } if logFD > 0 { f := os.NewFile(uintptr(logFD), "user log file") target := &log.MultiEmitter{c.sink, log.K8sJSONEmitter{&log.Writer{Next: f}}} c.sink = &log.BasicLogger{Level: log.Info, Emitter: target} } return c, nil } // Emit implements eventchannel.Emitter. func (c *compatEmitter) Emit(msg proto.Message) (bool, error) { switch m := msg.(type) { case *spb.UnimplementedSyscall: c.emitUnimplementedSyscall(m) } return false, nil } func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) { regs := us.Registers c.mu.Lock() defer c.mu.Unlock() sysnr := syscallNum(regs) tr := c.trackers[sysnr] if tr == nil { switch sysnr { case unix.SYS_PRCTL: // args: cmd, ... tr = newArgsTracker(0) case unix.SYS_IOCTL, unix.SYS_EPOLL_CTL, unix.SYS_SHMCTL, unix.SYS_FUTEX, unix.SYS_FALLOCATE: // args: fd/addr, cmd, ... 
tr = newArgsTracker(1) case unix.SYS_GETSOCKOPT, unix.SYS_SETSOCKOPT: // args: fd, level, name, ... tr = newArgsTracker(1, 2) case unix.SYS_SEMCTL: // args: semid, semnum, cmd, ... tr = newArgsTracker(2) default: tr = newArchArgsTracker(sysnr) if tr == nil { tr = &onceTracker{} } } c.trackers[sysnr] = tr } if tr.shouldReport(regs) { name := c.nameMap.Name(uintptr(sysnr)) c.sink.Infof("Unsupported syscall %s(%#x,%#x,%#x,%#x,%#x,%#x). It is "+ "likely that you can safely ignore this message and that this is not "+ "the cause of any error. Please, refer to %s/%s for more information.", name, argVal(0, regs), argVal(1, regs), argVal(2, regs), argVal(3, regs), argVal(4, regs), argVal(5, regs), syscallLink, name) tr.onReported(regs) } } // Close implements eventchannel.Emitter. func (c *compatEmitter) Close() error { c.sink = nil return nil } // syscallTracker interface allows filters to apply differently depending on // the syscall and arguments. type syscallTracker interface { // shouldReport returns true is the syscall should be reported. shouldReport(regs *rpb.Registers) bool // onReported marks the syscall as reported. onReported(regs *rpb.Registers) } // onceTracker reports only a single time, used for most syscalls. type onceTracker struct { reported bool } func (o *onceTracker) shouldReport(_ *rpb.Registers) bool { return !o.reported } func (o *onceTracker) onReported(_ *rpb.Registers) { o.reported = true } // argsTracker reports only once for each different combination of arguments. // It's used for generic syscalls like ioctl to report once per 'cmd'. type argsTracker struct { // argsIdx is the syscall arguments to use as unique ID. argsIdx []int reported map[string]struct{} count int } func newArgsTracker(argIdx ...int) *argsTracker { return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})} } // key returns the command based on the syscall argument index. func (a *argsTracker) key(regs *rpb.Registers) string { var rv string for _, idx := range a.argsIdx { rv += fmt.Sprintf("%d|", argVal(idx, regs)) } return rv } func (a *argsTracker) shouldReport(regs *rpb.Registers) bool { if a.count >= reportLimit { return false } _, ok := a.reported[a.key(regs)] return !ok } func (a *argsTracker) onReported(regs *rpb.Registers) { a.count++ a.reported[a.key(regs)] = struct{}{} } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/compat_amd64.go000066400000000000000000000045061465435605700235520ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/sentry/arch" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" "gvisor.dev/gvisor/pkg/sentry/strace" ) const ( // reportLimit is the max number of events that should be reported per // tracker. reportLimit = 100 syscallLink = "https://gvisor.dev/c/linux/amd64" ) // newRegs create a empty Registers instance. 
func newRegs() *rpb.Registers { return &rpb.Registers{ Arch: &rpb.Registers_Amd64{ Amd64: &rpb.AMD64Registers{}, }, } } func argVal(argIdx int, regs *rpb.Registers) uint64 { amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 switch argIdx { case 0: return amd64Regs.Rdi case 1: return amd64Regs.Rsi case 2: return amd64Regs.Rdx case 3: return amd64Regs.R10 case 4: return amd64Regs.R8 case 5: return amd64Regs.R9 } panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) { amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 switch argIdx { case 0: amd64Regs.Rdi = argVal case 1: amd64Regs.Rsi = argVal case 2: amd64Regs.Rdx = argVal case 3: amd64Regs.R10 = argVal case 4: amd64Regs.R8 = argVal case 5: amd64Regs.R9 = argVal default: panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } } func getSyscallNameMap() (strace.SyscallMap, bool) { return strace.Lookup(abi.Linux, arch.AMD64) } func syscallNum(regs *rpb.Registers) uint64 { amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64 return amd64Regs.OrigRax } func newArchArgsTracker(sysnr uint64) syscallTracker { switch sysnr { case unix.SYS_ARCH_PRCTL: // args: cmd, ... return newArgsTracker(0) } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/compat_arm64.go000066400000000000000000000044051465435605700235660ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "fmt" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/sentry/arch" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" "gvisor.dev/gvisor/pkg/sentry/strace" ) const ( // reportLimit is the max number of events that should be reported per // tracker. reportLimit = 100 syscallLink = "https://gvisor.dev/c/linux/arm64" ) // newRegs creates an empty Registers instance.
func newRegs() *rpb.Registers { return &rpb.Registers{ Arch: &rpb.Registers_Arm64{ Arm64: &rpb.ARM64Registers{}, }, } } func argVal(argIdx int, regs *rpb.Registers) uint64 { arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 switch argIdx { case 0: return arm64Regs.R0 case 1: return arm64Regs.R1 case 2: return arm64Regs.R2 case 3: return arm64Regs.R3 case 4: return arm64Regs.R4 case 5: return arm64Regs.R5 } panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) { arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 switch argIdx { case 0: arm64Regs.R0 = argVal case 1: arm64Regs.R1 = argVal case 2: arm64Regs.R2 = argVal case 3: arm64Regs.R3 = argVal case 4: arm64Regs.R4 = argVal case 5: arm64Regs.R5 = argVal default: panic(fmt.Sprintf("invalid syscall argument index %d", argIdx)) } } func getSyscallNameMap() (strace.SyscallMap, bool) { return strace.Lookup(abi.Linux, arch.ARM64) } func syscallNum(regs *rpb.Registers) uint64 { arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64 return arm64Regs.R8 } func newArchArgsTracker(sysnr uint64) syscallTracker { // currently, no arch specific syscalls need to be handled here. return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/controller.go000066400000000000000000000676761465435605700235000ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "errors" "fmt" "path" "strconv" "sync" gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/control/server" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/seccheck" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/state/statefile" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot/procfs" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) const ( // ContMgrCheckpoint checkpoints a container. ContMgrCheckpoint = "containerManager.Checkpoint" // ContMgrCreateSubcontainer creates a sub-container. ContMgrCreateSubcontainer = "containerManager.CreateSubcontainer" // ContMgrDestroySubcontainer is used to stop a sub-container and free all // associated resources in the sandbox. ContMgrDestroySubcontainer = "containerManager.DestroySubcontainer" // ContMgrEvent gets stats about the container used by "runsc events". ContMgrEvent = "containerManager.Event" // ContMgrExecuteAsync executes a command in a container. ContMgrExecuteAsync = "containerManager.ExecuteAsync" // ContMgrPortForward starts port forwarding with the sandbox. ContMgrPortForward = "containerManager.PortForward" // ContMgrProcesses lists processes running in a container. 
ContMgrProcesses = "containerManager.Processes" // ContMgrRestore restores a container from a statefile. ContMgrRestore = "containerManager.Restore" // ContMgrRestoreSubcontainer restores a container from a statefile. ContMgrRestoreSubcontainer = "containerManager.RestoreSubcontainer" // ContMgrPause pauses all tasks, blocking until they are stopped. ContMgrPause = "containerManager.Pause" // ContMgrResume resumes all tasks. ContMgrResume = "containerManager.Resume" // ContMgrSignal sends a signal to a container. ContMgrSignal = "containerManager.Signal" // ContMgrStartSubcontainer starts a sub-container inside a running sandbox. ContMgrStartSubcontainer = "containerManager.StartSubcontainer" // ContMgrWait waits on the init process of the container and returns its // ExitStatus. ContMgrWait = "containerManager.Wait" // ContMgrWaitPID waits on a process with a certain PID in the sandbox and // return its ExitStatus. ContMgrWaitPID = "containerManager.WaitPID" // ContMgrWaitCheckpoint waits for the Kernel to have been successfully // checkpointed n-1 times, then waits for either the n-th successful // checkpoint (in which case it returns nil) or any number of failed // checkpoints (in which case it returns an error returned by any such // failure). ContMgrWaitCheckpoint = "containerManager.WaitCheckpoint" // ContMgrRootContainerStart starts a new sandbox with a root container. ContMgrRootContainerStart = "containerManager.StartRoot" // ContMgrCreateTraceSession starts a trace session. ContMgrCreateTraceSession = "containerManager.CreateTraceSession" // ContMgrDeleteTraceSession deletes a trace session. ContMgrDeleteTraceSession = "containerManager.DeleteTraceSession" // ContMgrListTraceSessions lists a trace session. ContMgrListTraceSessions = "containerManager.ListTraceSessions" // ContMgrProcfsDump dumps sandbox procfs state. ContMgrProcfsDump = "containerManager.ProcfsDump" // ContMgrMount mounts a filesystem in a container. ContMgrMount = "containerManager.Mount" // ContMgrContainerRuntimeState returns the runtime state of a container. ContMgrContainerRuntimeState = "containerManager.ContainerRuntimeState" ) const ( // NetworkCreateLinksAndRoutes creates links and routes in a network stack. NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes" // DebugStacks collects sandbox stacks for debugging. DebugStacks = "debug.Stacks" ) // Profiling related commands (see pprof.go for more details). const ( ProfileCPU = "Profile.CPU" ProfileHeap = "Profile.Heap" ProfileBlock = "Profile.Block" ProfileMutex = "Profile.Mutex" ProfileTrace = "Profile.Trace" ) // Logging related commands (see logging.go for more details). const ( LoggingChange = "Logging.Change" ) // Usage related commands (see usage.go for more details). const ( UsageCollect = "Usage.Collect" UsageUsageFD = "Usage.UsageFD" ) // Metrics related commands (see metrics.go). const ( MetricsGetRegistered = "Metrics.GetRegisteredMetrics" MetricsExport = "Metrics.Export" ) // Commands for interacting with cgroupfs within the sandbox. const ( CgroupsReadControlFiles = "Cgroups.ReadControlFiles" CgroupsWriteControlFiles = "Cgroups.WriteControlFiles" ) // controller holds the control server, and is used for communication into the // sandbox. type controller struct { // srv is the control server. srv *server.Server // manager holds the containerManager methods. manager *containerManager } // newController creates a new controller. The caller must call // controller.srv.StartServing() to start the controller. 
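The ContMgr*, Profile*, Logging*, Usage*, Metrics* and Cgroups* constants above are control-server method selectors in "Receiver.Method" form, the same convention Go's standard net/rpc uses. A hedged sketch of that convention using only the standard library follows; the Manager type and its Processes method are invented for illustration, and the real sandbox speaks urpc over a donated FD rather than TCP.

package main

import (
	"fmt"
	"net"
	"net/rpc"
)

// Manager is a stand-in for containerManager: exported methods with the
// signature func(args *A, reply *R) error become callable as "Manager.<Name>".
type Manager struct{}

func (*Manager) Processes(cid *string, out *[]string) error {
	*out = []string{"pid 1 in " + *cid}
	return nil
}

func main() {
	srv := rpc.NewServer()
	srv.Register(&Manager{}) // registers "Manager.Processes"

	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		panic(err)
	}
	go srv.Accept(ln)

	client, err := rpc.Dial("tcp", ln.Addr().String())
	if err != nil {
		panic(err)
	}
	defer client.Close()

	var procs []string
	cid := "abc"
	// The method selector string plays the same role as the
	// "containerManager.Processes" constant above.
	if err := client.Call("Manager.Processes", &cid, &procs); err != nil {
		panic(err)
	}
	fmt.Println(procs)
}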
func newController(fd int, l *Loader) (*controller, error) { srv, err := server.CreateFromFD(fd) if err != nil { return nil, err } ctrl := &controller{ manager: &containerManager{ startChan: make(chan struct{}), startResultChan: make(chan error), l: l, }, srv: srv, } ctrl.registerHandlers() return ctrl, nil } func (c *controller) registerHandlers() { l := c.manager.l c.srv.Register(c.manager) c.srv.Register(&control.Cgroups{Kernel: l.k}) c.srv.Register(&control.Lifecycle{Kernel: l.k}) c.srv.Register(&control.Logging{}) c.srv.Register(&control.Proc{Kernel: l.k}) c.srv.Register(&control.State{Kernel: l.k}) c.srv.Register(&control.Usage{Kernel: l.k}) c.srv.Register(&control.Metrics{}) c.srv.Register(&debug{}) if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok { c.srv.Register(&Network{ Stack: eps.Stack, Kernel: l.k, }) } if l.root.conf.ProfileEnable { c.srv.Register(control.NewProfile(l.k)) } } // refreshHandlers resets the server and re-registers all handlers using l. // Useful when l.k has been replaced (e.g. during a restore). func (c *controller) refreshHandlers() { c.srv.ResetServer() c.registerHandlers() } // stopRPCTimeout is the time for clients to finish making any RPCs. Note that // ongoing RPCs after this timeout still run to completion. const stopRPCTimeout = 15 * gtime.Second func (c *controller) stop() { c.srv.Stop(stopRPCTimeout) } // containerManager manages sandbox containers. type containerManager struct { // startChan is used to signal when the root container process should // be started. startChan chan struct{} // startResultChan is used to signal when the root container has // started. Any errors encountered during startup will be sent to the // channel. A nil value indicates success. startResultChan chan error // l is the loader that creates containers and sandboxes. l *Loader // restorer is set when the sandbox in being restored. It stores the state // of all containers and perform all actions required by restore. restorer *restorer } // StartRoot will start the root container process. func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { log.Debugf("containerManager.StartRoot, cid: %s", *cid) // Tell the root container to start and wait for the result. return cm.onStart() } // onStart notifies that sandbox is ready to start and wait for the result. func (cm *containerManager) onStart() error { cm.startChan <- struct{}{} if err := <-cm.startResultChan; err != nil { return fmt.Errorf("starting sandbox: %v", err) } return nil } // Processes retrieves information about processes running in the sandbox. func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error { log.Debugf("containerManager.Processes, cid: %s", *cid) return control.Processes(cm.l.k, *cid, out) } // CreateArgs contains arguments to the Create method. type CreateArgs struct { // CID is the ID of the container to start. CID string // FilePayload may contain a TTY file for the terminal, if enabled. urpc.FilePayload } // CreateSubcontainer creates a container within a sandbox. 
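StartRoot and onStart above coordinate with the loader through two unbuffered channels: the controller signals startChan and then blocks on startResultChan for success or an error. A standalone sketch of the same handshake follows, with the loader goroutine faked for illustration.

package main

import (
	"errors"
	"fmt"
)

type manager struct {
	startChan       chan struct{}
	startResultChan chan error
}

// onStart mirrors containerManager.onStart: signal the waiting loader,
// then block for its result.
func (m *manager) onStart() error {
	m.startChan <- struct{}{}
	if err := <-m.startResultChan; err != nil {
		return fmt.Errorf("starting sandbox: %w", err)
	}
	return nil
}

func main() {
	m := &manager{
		startChan:       make(chan struct{}),
		startResultChan: make(chan error),
	}

	// Stand-in for the loader's main goroutine waiting on startChan.
	go func() {
		<-m.startChan
		m.startResultChan <- errors.New("root container failed to start")
	}()

	fmt.Println(m.onStart()) // prints the wrapped error
}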
func (cm *containerManager) CreateSubcontainer(args *CreateArgs, _ *struct{}) error { log.Debugf("containerManager.CreateSubcontainer: %s", args.CID) if len(args.Files) > 1 { return fmt.Errorf("start arguments must have at most 1 files for TTY") } var tty *fd.FD if len(args.Files) == 1 { var err error tty, err = fd.NewFromFile(args.Files[0]) if err != nil { return fmt.Errorf("error dup'ing TTY file: %w", err) } } return cm.l.createSubcontainer(args.CID, tty) } // StartArgs contains arguments to the Start method. type StartArgs struct { // Spec is the spec of the container to start. Spec *specs.Spec // Config is the runsc-specific configuration for the sandbox. Conf *config.Config // CID is the ID of the container to start. CID string // NumGoferFilestoreFDs is the number of gofer filestore FDs donated. NumGoferFilestoreFDs int // IsDevIoFilePresent indicates whether the dev gofer FD is present. IsDevIoFilePresent bool // GoferMountConfs contains information about how the gofer mounts have been // configured. The first entry is for rootfs and the following entries are // for bind mounts in Spec.Mounts (in the same order). GoferMountConfs []GoferMountConf // FilePayload contains, in order: // * stdin, stdout, and stderr (optional: if terminal is disabled). // * file descriptors to gofer-backing host files (optional). // * file descriptor for /dev gofer connection (optional) // * file descriptors to connect to gofer to serve the root filesystem. urpc.FilePayload } // StartSubcontainer runs a created container within a sandbox. func (cm *containerManager) StartSubcontainer(args *StartArgs, _ *struct{}) error { // Validate arguments. if args == nil { return errors.New("start missing arguments") } log.Debugf("containerManager.StartSubcontainer, cid: %s, args: %+v", args.CID, args) if args.Spec == nil { return errors.New("start arguments missing spec") } if args.Conf == nil { return errors.New("start arguments missing config") } if args.CID == "" { return errors.New("start argument missing container ID") } expectedFDs := 1 // At least one FD for the root filesystem. expectedFDs += args.NumGoferFilestoreFDs if args.IsDevIoFilePresent { expectedFDs++ } if !args.Spec.Process.Terminal { expectedFDs += 3 } if len(args.Files) < expectedFDs { return fmt.Errorf("start arguments must contain at least %d FDs, but only got %d", expectedFDs, len(args.Files)) } // All validation passed, logs the spec for debugging. specutils.LogSpecDebug(args.Spec, args.Conf.OCISeccomp) goferFiles := args.Files var stdios []*fd.FD if !args.Spec.Process.Terminal { // When not using a terminal, stdios come as the first 3 files in the // payload. 
var err error stdios, err = fd.NewFromFiles(goferFiles[:3]) if err != nil { return fmt.Errorf("error dup'ing stdio files: %w", err) } goferFiles = goferFiles[3:] } defer func() { for _, fd := range stdios { _ = fd.Close() } }() var goferFilestoreFDs []*fd.FD for i := 0; i < args.NumGoferFilestoreFDs; i++ { goferFilestoreFD, err := fd.NewFromFile(goferFiles[i]) if err != nil { return fmt.Errorf("error dup'ing gofer filestore file: %w", err) } goferFilestoreFDs = append(goferFilestoreFDs, goferFilestoreFD) } goferFiles = goferFiles[args.NumGoferFilestoreFDs:] defer func() { for _, fd := range goferFilestoreFDs { _ = fd.Close() } }() var devGoferFD *fd.FD if args.IsDevIoFilePresent { var err error devGoferFD, err = fd.NewFromFile(goferFiles[0]) if err != nil { return fmt.Errorf("error dup'ing dev gofer file: %w", err) } goferFiles = goferFiles[1:] defer devGoferFD.Close() } goferFDs, err := fd.NewFromFiles(goferFiles) if err != nil { return fmt.Errorf("error dup'ing gofer files: %w", err) } defer func() { for _, fd := range goferFDs { _ = fd.Close() } }() if err := cm.l.startSubcontainer(args.Spec, args.Conf, args.CID, stdios, goferFDs, goferFilestoreFDs, devGoferFD, args.GoferMountConfs); err != nil { log.Debugf("containerManager.StartSubcontainer failed, cid: %s, args: %+v, err: %v", args.CID, args, err) return err } log.Debugf("Container started, cid: %s", args.CID) return nil } // DestroySubcontainer stops a container if it is still running and cleans up // its filesystem. func (cm *containerManager) DestroySubcontainer(cid *string, _ *struct{}) error { log.Debugf("containerManager.DestroySubcontainer, cid: %s", *cid) return cm.l.destroySubcontainer(*cid) } // ExecuteAsync starts running a command on a created or running sandbox. It // returns the PID of the new process. func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error { log.Debugf("containerManager.ExecuteAsync, cid: %s, args: %+v", args.ContainerID, args) tgid, err := cm.l.executeAsync(args) if err != nil { log.Debugf("containerManager.ExecuteAsync failed, cid: %s, args: %+v, err: %v", args.ContainerID, args, err) return err } *pid = int32(tgid) return nil } // Checkpoint pauses a sandbox and saves its state. func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { log.Debugf("containerManager.Checkpoint") return cm.l.save(o) } // PortForwardOpts contains options for port forwarding to a port in a // container. type PortForwardOpts struct { // FilePayload contains one fd for a UDS (or local port) used for port // forwarding. urpc.FilePayload // ContainerID is the container for the process being executed. ContainerID string // Port is the port to to forward. Port uint16 } // PortForward initiates a port forward to the container. func (cm *containerManager) PortForward(opts *PortForwardOpts, _ *struct{}) error { log.Debugf("containerManager.PortForward, cid: %s, port: %d", opts.ContainerID, opts.Port) if err := cm.l.portForward(opts); err != nil { log.Debugf("containerManager.PortForward failed, opts: %+v, err: %v", opts, err) return err } return nil } // RestoreOpts contains options related to restoring a container's file system. type RestoreOpts struct { // FilePayload contains the state file to be restored, followed in order by: // 1. checkpoint state file. // 2. optional checkpoint pages metadata file. // 3. optional checkpoint pages file. // 4. optional platform device file. 
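StartSubcontainer consumes its donated files in a fixed order: three stdio files when the container has no terminal, then NumGoferFilestoreFDs filestore files, then an optional /dev gofer connection, with the remaining files serving the gofer mounts. A sketch of slicing such a payload follows, using plain *os.File slices as a stand-in for urpc.FilePayload.

package main

import (
	"fmt"
	"os"
)

type payload struct {
	files            []*os.File
	numFilestoreFDs  int
	devIoFilePresent bool
	terminal         bool
}

// split consumes the payload in the order documented above and returns
// (stdios, filestores, devGofer, goferFDs). Bounds checking is elided.
func split(p payload) (stdios, filestores []*os.File, dev *os.File, gofers []*os.File) {
	files := p.files
	if !p.terminal {
		stdios, files = files[:3], files[3:]
	}
	filestores, files = files[:p.numFilestoreFDs], files[p.numFilestoreFDs:]
	if p.devIoFilePresent {
		dev, files = files[0], files[1:]
	}
	return stdios, filestores, dev, files
}

func main() {
	// Six placeholder files: stdio x3, one filestore, one /dev gofer, one gofer.
	files := make([]*os.File, 6)
	for i := range files {
		files[i] = os.Stdin
	}
	s, fs, dev, g := split(payload{files: files, numFilestoreFDs: 1, devIoFilePresent: true})
	fmt.Println(len(s), len(fs), dev != nil, len(g)) // 3 1 true 1
}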
urpc.FilePayload HavePagesFile bool HaveDeviceFile bool } // Restore loads a container from a statefile. // The container's current kernel is destroyed, a restore environment is // created, and the kernel is recreated with the restore state file. The // container then sends the signal to start. func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { log.Debugf("containerManager.Restore") if cm.l.state == restoring { return fmt.Errorf("restore is already in progress") } if cm.l.state == started { return fmt.Errorf("cannot restore a started container") } if len(o.Files) == 0 { return fmt.Errorf("at least one file must be passed to Restore") } stateFile, err := o.ReleaseFD(0) if err != nil { return err } var stat unix.Stat_t if err := unix.Fstat(stateFile.FD(), &stat); err != nil { return err } if stat.Size == 0 { return fmt.Errorf("statefile cannot be empty") } cm.restorer = &restorer{restoreDone: cm.onRestoreDone, stateFile: stateFile} cm.l.restoreWaiters = sync.NewCond(&cm.l.mu) cm.l.state = restoring fileIdx := 1 if o.HavePagesFile { cm.restorer.pagesMetadata, err = o.ReleaseFD(fileIdx) if err != nil { return err } fileIdx++ cm.restorer.pagesFile, err = o.ReleaseFD(fileIdx) if err != nil { return err } fileIdx++ } if o.HaveDeviceFile { cm.restorer.deviceFile, err = o.ReleaseFD(fileIdx) if err != nil { return err } fileIdx++ } if fileIdx < len(o.Files) { return fmt.Errorf("more files passed to Restore than expected") } // Pause the kernel while we build a new one. cm.l.k.Pause() metadata, err := statefile.MetadataUnsafe(cm.restorer.stateFile) if err != nil { return fmt.Errorf("reading metadata from statefile: %w", err) } var count int countStr, ok := metadata["container_count"] if !ok { // TODO(gvisor.dev/issue/1956): Add container count with syscall save // trigger. For now, assume that only a single container exists if metadata // isn't present. // // -return errors.New("container count not present in state file") count = 1 } else { count, err = strconv.Atoi(countStr) if err != nil { return fmt.Errorf("invalid container count: %w", err) } if count < 1 { return fmt.Errorf("invalid container count value: %v", count) } } cm.restorer.totalContainers = count log.Infof("Restoring a total of %d containers", cm.restorer.totalContainers) if _, err := unix.Seek(stateFile.FD(), 0, 0); err != nil { return fmt.Errorf("rewinding state file: %w", err) } return cm.restorer.restoreContainerInfo(cm.l, &cm.l.root) } func (cm *containerManager) onRestoreDone() error { if err := cm.onStart(); err != nil { return err } cm.l.restoreWaiters.Broadcast() cm.restorer = nil return nil } func (cm *containerManager) RestoreSubcontainer(args *StartArgs, _ *struct{}) error { log.Debugf("containerManager.RestoreSubcontainer, cid: %s, args: %+v", args.CID, args) if cm.l.state != restoring { return fmt.Errorf("sandbox is not being restored, cannot restore subcontainer") } // Validate arguments. if args.Spec == nil { return errors.New("start arguments missing spec") } if args.Conf == nil { return errors.New("start arguments missing config") } if args.CID == "" { return errors.New("start argument missing container ID") } expectedFDs := 1 // At least one FD for the root filesystem. expectedFDs += args.NumGoferFilestoreFDs if !args.Spec.Process.Terminal { expectedFDs += 3 } if len(args.Files) < expectedFDs { return fmt.Errorf("restore arguments must contain at least %d FDs, but only got %d", expectedFDs, len(args.Files)) } // All validation passed, logs the spec for debugging. 
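Restore above rejects an empty state file (via fstat) and reads the container count from the statefile metadata, defaulting to a single container when the key is absent. A sketch of those two checks follows; the plain map stands in for statefile.MetadataUnsafe and /etc/hostname is only an example file to fstat.

package main

import (
	"fmt"
	"os"
	"strconv"

	"golang.org/x/sys/unix"
)

// containerCount mirrors the metadata handling above: a missing key means a
// single container; otherwise the value must parse to a positive integer.
func containerCount(metadata map[string]string) (int, error) {
	countStr, ok := metadata["container_count"]
	if !ok {
		return 1, nil
	}
	count, err := strconv.Atoi(countStr)
	if err != nil {
		return 0, fmt.Errorf("invalid container count: %w", err)
	}
	if count < 1 {
		return 0, fmt.Errorf("invalid container count value: %d", count)
	}
	return count, nil
}

func main() {
	f, err := os.Open("/etc/hostname") // stand-in for the donated statefile FD
	if err != nil {
		panic(err)
	}
	defer f.Close()

	var stat unix.Stat_t
	if err := unix.Fstat(int(f.Fd()), &stat); err != nil {
		panic(err)
	}
	if stat.Size == 0 {
		panic("statefile cannot be empty")
	}

	fmt.Println(containerCount(map[string]string{"container_count": "3"}))
	fmt.Println(containerCount(map[string]string{}))
}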
specutils.LogSpecDebug(args.Spec, args.Conf.OCISeccomp) goferFiles := args.Files var stdios []*fd.FD if !args.Spec.Process.Terminal { // When not using a terminal, stdios come as the first 3 files in the // payload. var err error stdios, err = fd.NewFromFiles(goferFiles[:3]) if err != nil { return fmt.Errorf("error dup'ing stdio files: %w", err) } goferFiles = goferFiles[3:] } var goferFilestoreFDs []*fd.FD for i := 0; i < args.NumGoferFilestoreFDs; i++ { overlayFilestoreFD, err := fd.NewFromFile(goferFiles[i]) if err != nil { return fmt.Errorf("error dup'ing overlay filestore file: %w", err) } goferFilestoreFDs = append(goferFilestoreFDs, overlayFilestoreFD) } goferFiles = goferFiles[args.NumGoferFilestoreFDs:] var devGoferFD *fd.FD if args.IsDevIoFilePresent { var err error devGoferFD, err = fd.NewFromFile(goferFiles[0]) if err != nil { return fmt.Errorf("error dup'ing dev gofer file: %w", err) } goferFiles = goferFiles[1:] } goferFDs, err := fd.NewFromFiles(goferFiles) if err != nil { return fmt.Errorf("error dup'ing gofer files: %w", err) } if err := cm.restorer.restoreSubcontainer(args.Spec, args.Conf, cm.l, args.CID, stdios, goferFDs, goferFilestoreFDs, devGoferFD, args.GoferMountConfs); err != nil { log.Debugf("containerManager.RestoreSubcontainer failed, cid: %s, args: %+v, err: %v", args.CID, args, err) return err } log.Debugf("Container restored, cid: %s", args.CID) return nil } // Pause pauses all tasks, blocking until they are stopped. func (cm *containerManager) Pause(_, _ *struct{}) error { cm.l.k.Pause() return nil } // Resume resumes all tasks. func (cm *containerManager) Resume(_, _ *struct{}) error { cm.l.k.Unpause() return postResumeImpl(cm.l.k) } // Wait waits for the init process in the given container. func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { log.Debugf("containerManager.Wait, cid: %s", *cid) err := cm.l.waitContainer(*cid, waitStatus) log.Debugf("containerManager.Wait returned, cid: %s, waitStatus: %#x, err: %v", *cid, *waitStatus, err) return err } // WaitPIDArgs are arguments to the WaitPID method. type WaitPIDArgs struct { // PID is the PID in the container's PID namespace. PID int32 // CID is the container ID. CID string } // WaitPID waits for the process with PID 'pid' in the sandbox. func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error { log.Debugf("containerManager.Wait, cid: %s, pid: %d", args.CID, args.PID) err := cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus) log.Debugf("containerManager.Wait, cid: %s, pid: %d, waitStatus: %#x, err: %v", args.CID, args.PID, *waitStatus, err) return err } // WaitCheckpoint waits for the Kernel to have been successfully checkpointed // n-1 times, then waits for either the n-th successful checkpoint (in which // case it returns nil) or any number of failed checkpoints (in which case it // returns an error returned by any such failure). func (cm *containerManager) WaitCheckpoint(n *uint32, _ *struct{}) error { err := cm.l.k.WaitCheckpoint(*n) log.Debugf("containerManager.WaitCheckpoint, n = %d, err = %v", *n, err) return err } // SignalDeliveryMode enumerates different signal delivery modes. type SignalDeliveryMode int const ( // DeliverToProcess delivers the signal to the container process with // the specified PID. If PID is 0, then the container init process is // signaled. DeliverToProcess SignalDeliveryMode = iota // DeliverToAllProcesses delivers the signal to all processes in the // container. PID must be 0. 
DeliverToAllProcesses // DeliverToForegroundProcessGroup delivers the signal to the // foreground process group in the same TTY session as the specified // process. If PID is 0, then the signal is delivered to the foreground // process group for the TTY for the init process. DeliverToForegroundProcessGroup ) func (s SignalDeliveryMode) String() string { switch s { case DeliverToProcess: return "Process" case DeliverToAllProcesses: return "All" case DeliverToForegroundProcessGroup: return "Foreground Process Group" } return fmt.Sprintf("unknown signal delivery mode: %d", s) } // SignalArgs are arguments to the Signal method. type SignalArgs struct { // CID is the container ID. CID string // Signo is the signal to send to the process. Signo int32 // PID is the process ID in the given container that will be signaled, // relative to the root PID namespace, not the container's. // If 0, the root container will be signalled. PID int32 // Mode is the signal delivery mode. Mode SignalDeliveryMode } // Signal sends a signal to one or more processes in a container. If args.PID // is 0, then the container init process is used. Depending on the // args.SignalDeliveryMode option, the signal may be sent directly to the // indicated process, to all processes in the container, or to the foreground // process group. func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { log.Debugf("containerManager.Signal: cid: %s, PID: %d, signal: %d, mode: %v", args.CID, args.PID, args.Signo, args.Mode) return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode) } // CreateTraceSessionArgs are arguments to the CreateTraceSession method. type CreateTraceSessionArgs struct { Config seccheck.SessionConfig Force bool urpc.FilePayload } // CreateTraceSession creates a new trace session. func (cm *containerManager) CreateTraceSession(args *CreateTraceSessionArgs, _ *struct{}) error { log.Debugf("containerManager.CreateTraceSession: config: %+v", args.Config) for i, sinkFile := range args.Files { if sinkFile != nil { fd, err := fd.NewFromFile(sinkFile) if err != nil { return err } args.Config.Sinks[i].FD = fd } } return seccheck.Create(&args.Config, args.Force) } // DeleteTraceSession deletes an existing trace session. func (cm *containerManager) DeleteTraceSession(name *string, _ *struct{}) error { log.Debugf("containerManager.DeleteTraceSession: name: %q", *name) return seccheck.Delete(*name) } // ListTraceSessions lists trace sessions. func (cm *containerManager) ListTraceSessions(_ *struct{}, out *[]seccheck.SessionConfig) error { log.Debugf("containerManager.ListTraceSessions") seccheck.List(out) return nil } // ProcfsDump dumps procfs state of the sandbox. func (cm *containerManager) ProcfsDump(_ *struct{}, out *[]procfs.ProcessProcfsDump) error { log.Debugf("containerManager.ProcfsDump") ts := cm.l.k.TaskSet() pidns := ts.Root *out = make([]procfs.ProcessProcfsDump, 0, len(cm.l.processes)) for _, tg := range pidns.ThreadGroups() { pid := pidns.IDOfThreadGroup(tg) procDump, err := procfs.Dump(tg.Leader(), pid, pidns) if err != nil { log.Warningf("skipping procfs dump for PID %s: %v", pid, err) continue } *out = append(*out, procDump) } return nil } // MountArgs contains arguments to the Mount method. type MountArgs struct { // ContainerID is the container in which we will mount the filesystem. ContainerID string // Source is the mount source. Source string // Destination is the mount target. Destination string // FsType is the filesystem type. 
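SignalDeliveryMode distinguishes signalling one process, every process in a container, or a foreground process group. On a plain Linux host the analogous distinction is kill(pid, sig) versus kill(-pgid, sig); the sketch below uses signal 0, which performs only permission checks, so it is safe to run.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	pid := unix.Getpid()
	pgid, err := unix.Getpgid(pid)
	if err != nil {
		panic(err)
	}

	// Single process: kill(pid, 0), the analogue of DeliverToProcess.
	if err := unix.Kill(pid, 0); err != nil {
		fmt.Println("process check failed:", err)
	}
	// Whole process group: kill(-pgid, 0), the host-side analogue of
	// DeliverToAllProcesses / DeliverToForegroundProcessGroup.
	if err := unix.Kill(-pgid, 0); err != nil {
		fmt.Println("process group check failed:", err)
	}
	fmt.Println("signalled pid", pid, "and pgid", pgid, "with signal 0")
}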
FsType string // FilePayload contains the source image FD, if required by the filesystem. urpc.FilePayload } const initTID kernel.ThreadID = 1 // Mount mounts a filesystem in a container. func (cm *containerManager) Mount(args *MountArgs, _ *struct{}) error { log.Debugf("containerManager.Mount, cid: %s, args: %+v", args.ContainerID, args) var cu cleanup.Cleanup defer cu.Clean() eid := execID{cid: args.ContainerID} ep, ok := cm.l.processes[eid] if !ok { return fmt.Errorf("container %v is deleted", args.ContainerID) } if ep.tg == nil { return fmt.Errorf("container %v isn't started", args.ContainerID) } t := ep.tg.PIDNamespace().TaskWithID(initTID) if t == nil { return fmt.Errorf("failed to find init process") } source := args.Source dest := path.Clean(args.Destination) fstype := args.FsType if dest[0] != '/' { return fmt.Errorf("absolute path must be provided for destination") } var opts vfs.MountOptions switch fstype { case erofs.Name: if len(args.FilePayload.Files) != 1 { return fmt.Errorf("exactly one image file must be provided") } imageFD, err := unix.Dup(int(args.FilePayload.Files[0].Fd())) if err != nil { return fmt.Errorf("failed to dup image FD: %v", err) } cu.Add(func() { unix.Close(imageFD) }) opts = vfs.MountOptions{ ReadOnly: true, GetFilesystemOptions: vfs.GetFilesystemOptions{ InternalMount: true, Data: fmt.Sprintf("ifd=%d", imageFD), }, } default: return fmt.Errorf("unsupported filesystem type: %v", fstype) } ctx := context.Background() root := t.FSContext().RootDirectory() defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(dest), } if _, err := t.Kernel().VFS().MountAt(ctx, t.Credentials(), source, &pop, fstype, &opts); err != nil { return err } log.Infof("Mounted %q to %q type: %s, internal-options: %q, in container %q", source, dest, fstype, opts.GetFilesystemOptions.Data, args.ContainerID) cu.Release() return nil } // ContainerRuntimeState returns the runtime state of a container. func (cm *containerManager) ContainerRuntimeState(cid *string, state *ContainerRuntimeState) error { log.Debugf("containerManager.ContainerRuntimeState: cid: %s", *cid) *state = cm.l.containerRuntimeState(*cid) return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/debug.go000066400000000000000000000015361465435605700223620ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "gvisor.dev/gvisor/pkg/log" ) type debug struct { } // Stacks collects all sandbox stacks and copies them to 'stacks'. func (*debug) Stacks(_ *struct{}, stacks *string) error { buf := log.Stacks(true) *stacks = string(buf) return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/events.go000066400000000000000000000132561465435605700226020ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
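Mount above cleans the destination, insists on an absolute path, and for erofs hands the duplicated image FD to the filesystem through an "ifd=<fd>" mount data string. A sketch of just that argument preparation follows, with no gVisor VFS involved and a placeholder FD value.

package main

import (
	"fmt"
	"path"
)

// erofsMountData validates the destination and builds the mount data string
// in the same "ifd=<fd>" form used above.
func erofsMountData(dest string, imageFD int) (cleanDest, data string, err error) {
	cleanDest = path.Clean(dest)
	if len(cleanDest) == 0 || cleanDest[0] != '/' {
		return "", "", fmt.Errorf("absolute path must be provided for destination")
	}
	return cleanDest, fmt.Sprintf("ifd=%d", imageFD), nil
}

func main() {
	dest, data, err := erofsMountData("/mnt/../srv/image/", 7)
	fmt.Println(dest, data, err) // /srv/image ifd=7 <nil>

	_, _, err = erofsMountData("relative/path", 7)
	fmt.Println(err)
}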
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "fmt" "strconv" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/usage" ) // NetworkInterface is the network statistics of the particular network interface type NetworkInterface struct { // Name is the name of the network interface. Name string RxBytes uint64 RxPackets uint64 RxErrors uint64 RxDropped uint64 TxBytes uint64 TxPackets uint64 TxErrors uint64 TxDropped uint64 } // EventOut is the return type of the Event command. type EventOut struct { Event Event `json:"event"` // ContainerUsage maps each container ID to its total CPU usage. ContainerUsage map[string]uint64 `json:"containerUsage"` } // Event struct for encoding the event data to JSON. Corresponds to runc's // main.event struct. type Event struct { Type string `json:"type"` ID string `json:"id"` Data Stats `json:"data"` } // Stats is the runc specific stats structure for stability when encoding and // decoding stats. type Stats struct { CPU CPU `json:"cpu"` Memory Memory `json:"memory"` Pids Pids `json:"pids"` NetworkInterfaces []*NetworkInterface `json:"network_interfaces"` } // Pids contains stats on processes. type Pids struct { Current uint64 `json:"current,omitempty"` Limit uint64 `json:"limit,omitempty"` } // MemoryEntry contains stats on a kind of memory. type MemoryEntry struct { Limit uint64 `json:"limit"` Usage uint64 `json:"usage,omitempty"` Max uint64 `json:"max,omitempty"` Failcnt uint64 `json:"failcnt"` } // Memory contains stats on memory. type Memory struct { Cache uint64 `json:"cache,omitempty"` Usage MemoryEntry `json:"usage,omitempty"` Swap MemoryEntry `json:"swap,omitempty"` Kernel MemoryEntry `json:"kernel,omitempty"` KernelTCP MemoryEntry `json:"kernelTCP,omitempty"` Raw map[string]uint64 `json:"raw,omitempty"` } // CPU contains stats on the CPU. type CPU struct { Usage CPUUsage `json:"usage"` } // CPUUsage contains stats on CPU usage. type CPUUsage struct { Kernel uint64 `json:"kernel,omitempty"` User uint64 `json:"user,omitempty"` Total uint64 `json:"total,omitempty"` PerCPU []uint64 `json:"percpu,omitempty"` } func (cm *containerManager) getUsageFromCgroups(file control.CgroupControlFile) (uint64, error) { var out control.CgroupsResults args := control.CgroupsReadArgs{ Args: []control.CgroupsReadArg{ { File: file, }, }, } cgroups := control.Cgroups{Kernel: cm.l.k} if err := cgroups.ReadControlFiles(&args, &out); err != nil { return 0, err } if len(out.Results) != 1 { return 0, fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out) } val, err := out.Results[0].Unpack() if err != nil { return 0, err } usage, err := strconv.ParseUint(val, 10, 64) if err != nil { return 0, err } return usage, nil } // Event gets the events from the container. func (cm *containerManager) Event(cid *string, out *EventOut) error { *out = EventOut{ Event: Event{ ID: *cid, Type: "stats", }, } // PIDs and check that container exists before going further. 
pids, err := cm.l.pidsCount(*cid) if err != nil { return err } out.Event.Data.Pids.Current = uint64(pids) networkStats, err := cm.l.networkStats() if err != nil { return err } out.Event.Data.NetworkInterfaces = networkStats numContainers := cm.l.containerCount() if numContainers == 0 { return fmt.Errorf("no container was found") } // Memory usage. memFile := control.CgroupControlFile{"memory", "/" + *cid, "memory.usage_in_bytes"} memUsage, err := cm.getUsageFromCgroups(memFile) if err != nil { // Cgroups is not installed or there was an error to get usage // from the cgroups. Fall back to the old method of getting the // usage from the sentry. log.Warningf("could not get container memory usage from cgroups, error: %v", err) mem := cm.l.k.MemoryFile() _ = mem.UpdateUsage(nil) // best effort to update. _, totalUsage := usage.MemoryAccounting.Copy() if numContainers == 1 { memUsage = totalUsage } else { // In the multi-container case, reports 0 for the root (pause) // container, since it's small and idle. Then equally split the // usage to the other containers. At least the sum of all // containers will correctly account for the memory used by the // sandbox. if *cid == cm.l.sandboxID { memUsage = 0 } else { memUsage = totalUsage / uint64(numContainers-1) } } } out.Event.Data.Memory.Usage.Usage = memUsage // CPU usage by container. cpuacctFile := control.CgroupControlFile{"cpuacct", "/" + *cid, "cpuacct.usage"} if cpuUsage, err := cm.getUsageFromCgroups(cpuacctFile); err != nil { // Cgroups is not installed or there was an error to get usage // from the cgroups. Fall back to the old method of getting the // usage from the sentry and host cgroups. log.Warningf("could not get container cpu usage from cgroups, error: %v", err) out.ContainerUsage = control.ContainerUsage(cm.l.k) } else { out.Event.Data.CPU.Usage.Total = cpuUsage } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/000077500000000000000000000000001465435605700222255ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/000077500000000000000000000000001465435605700234725ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/config.go000066400000000000000000000141421465435605700252700ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package config defines all syscalls the sandbox is allowed to make // to the host. package config import ( "fmt" "os" "runtime" "strings" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/seccomp/precompiledseccomp" "gvisor.dev/gvisor/pkg/sentry/devices/accel" "gvisor.dev/gvisor/pkg/sentry/devices/nvproxy" "gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy" "gvisor.dev/gvisor/pkg/sentry/platform" ) // Options are seccomp filter related options. 
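getUsageFromCgroups and Event above read single-value control files such as memory.usage_in_bytes and cpuacct.usage from the sandbox-internal cgroupfs, falling back to sentry accounting when that fails. A host-side sketch of the same read-and-parse step follows; the cgroup v1 path is an assumption and only exists where cgroup v1 is mounted.

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// readCgroupValue reads a single-value cgroup control file, e.g.
// /sys/fs/cgroup/memory/memory.usage_in_bytes on a cgroup v1 host.
func readCgroupValue(path string) (uint64, error) {
	b, err := os.ReadFile(path)
	if err != nil {
		return 0, err
	}
	return strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64)
}

func main() {
	usage, err := readCgroupValue("/sys/fs/cgroup/memory/memory.usage_in_bytes")
	if err != nil {
		// Matches the fallback behaviour above: on error, callers fall back to
		// sentry-internal accounting instead of failing the Event call.
		fmt.Println("falling back:", err)
		return
	}
	fmt.Println("memory usage:", usage)
}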
type Options struct { Platform platform.SeccompInfo HostNetwork bool HostNetworkRawSockets bool HostFilesystem bool ProfileEnable bool NVProxy bool TPUProxy bool ControllerFD uint32 } // isInstrumentationEnabled returns whether there are any // instrumentation-specific filters enabled. func isInstrumentationEnabled() bool { return instrumentationFilters().Size() > 0 } // ConfigKey returns a unique string representing this set of options. // This is used for matching a set of `Options` at seccomp precompile // time with the same set of `Options` at runtime. // As such, it should encompass all fields that change the structure of // the seccomp rules, but should not encompass fields that are only known // at runtime (e.g. `ControllerFD`). func (opt Options) ConfigKey() string { var sb strings.Builder sb.WriteString(fmt.Sprintf("GOARCH=%q ", runtime.GOARCH)) sb.WriteString(fmt.Sprintf("Platform=%q ", opt.Platform.ConfigKey())) sb.WriteString(fmt.Sprintf("HostNetwork=%t ", opt.HostNetwork)) sb.WriteString(fmt.Sprintf("HostNetworkRawSockets=%t ", opt.HostNetworkRawSockets)) sb.WriteString(fmt.Sprintf("HostFilesystem=%t ", opt.HostFilesystem)) sb.WriteString(fmt.Sprintf("ProfileEnable=%t ", opt.ProfileEnable)) sb.WriteString(fmt.Sprintf("Instrumentation=%t ", isInstrumentationEnabled())) sb.WriteString(fmt.Sprintf("NVProxy=%t ", opt.NVProxy)) sb.WriteString(fmt.Sprintf("TPUProxy=%t ", opt.TPUProxy)) return strings.TrimSpace(sb.String()) } // Warnings returns a set of warnings that may be useful to display to the // user when the given options are used. func Warnings(opt Options) []string { var warnings []string if opt.HostNetwork { if opt.HostNetworkRawSockets { warnings = append(warnings, "host networking (with raw sockets) enabled: syscall filters less restrictive!") } else { warnings = append(warnings, "host networking enabled: syscall filters less restrictive!") } } if opt.ProfileEnable { warnings = append(warnings, "profile enabled: syscall filters less restrictive!") } if opt.HostFilesystem { warnings = append(warnings, "host filesystem enabled: syscall filters less restrictive!") } if isInstrumentationEnabled() { warnings = append(warnings, "instrumentation enabled: syscall filters less restrictive!") } if opt.NVProxy { warnings = append(warnings, "Nvidia GPU driver proxy enabled: syscall filters less restrictive!") } if opt.TPUProxy { warnings = append(warnings, "TPU device proxy enabled: syscall filters less restrictive!") } return warnings } // Vars returns the values to use for rendering the precompiled seccomp // program. func (opt Options) Vars() precompiledseccomp.Values { vars := precompiledseccomp.Values{ controllerFDVarName: opt.ControllerFD, } vars.SetUint64(selfPIDVarName, uint64(os.Getpid())) for varName, value := range opt.Platform.Variables() { vars[varName] = value } return vars } // Rules returns the seccomp rules and denyRules to use for the Sentry. func Rules(opt Options) (seccomp.SyscallRules, seccomp.SyscallRules) { return rules(opt, opt.Vars()) } // rules returns the seccomp rules and denyRules to use for the Sentry, // using `vars` as override for variables defined during precompilation. func rules(opt Options, vars precompiledseccomp.Values) (seccomp.SyscallRules, seccomp.SyscallRules) { s := allowedSyscalls.Copy() s.Merge(selfPIDFilters(vars.GetUint64(selfPIDVarName))) s.Merge(controlServerFilters(vars[controllerFDVarName])) // Set of additional filters used by -race and -msan. Returns empty // when not enabled. 
s.Merge(instrumentationFilters()) if opt.HostNetwork { s.Merge(hostInetFilters(opt.HostNetworkRawSockets)) } if opt.ProfileEnable { s.Merge(profileFilters()) } if opt.HostFilesystem { s.Merge(hostFilesystemFilters()) } if opt.NVProxy { s.Merge(nvproxy.Filters()) } if opt.TPUProxy { s.Merge(accel.Filters()) s.Merge(tpuproxy.Filters()) } s.Merge(opt.Platform.SyscallFilters(vars)) return s, seccomp.DenyNewExecMappings } // SeccompOptions returns the seccomp program options to use for the filter. func SeccompOptions(opt Options) seccomp.ProgramOptions { // futex(2) is unequivocally the most-frequently-used syscall by the // Sentry across all platforms. hotSyscalls := []uintptr{unix.SYS_FUTEX} // ... Then comes the platform-specific hot syscalls which are typically // part of the syscall interception hot path. hotSyscalls = append(hotSyscalls, opt.Platform.HottestSyscalls()...) // ... Then come a few syscalls that are frequent just from workloads in // general. hotSyscalls = append(hotSyscalls, archSpecificHotSyscalls()...) // Now deduplicate them. sysnoMap := make(map[uintptr]struct{}, len(hotSyscalls)) uniqueHotSyscalls := make([]uintptr, 0, len(hotSyscalls)) for _, sysno := range hotSyscalls { if _, alreadyAdded := sysnoMap[sysno]; !alreadyAdded { sysnoMap[sysno] = struct{}{} uniqueHotSyscalls = append(uniqueHotSyscalls, sysno) } } opts := seccomp.DefaultProgramOptions() opts.HotSyscalls = uniqueHotSyscalls return opts } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/config_amd64.go000066400000000000000000000030521465435605700262610ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package config import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/seccomp" ) func init() { allowedSyscalls.Set(unix.SYS_CLONE, seccomp.PerArg{ // parent_tidptr and child_tidptr are always 0 because neither // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used. seccomp.EqualTo( unix.CLONE_VM | unix.CLONE_FS | unix.CLONE_FILES | unix.CLONE_SETTLS | unix.CLONE_SIGHAND | unix.CLONE_SYSVSEM | unix.CLONE_THREAD), seccomp.AnyValue{}, // newsp seccomp.EqualTo(0), // parent_tidptr seccomp.EqualTo(0), // child_tidptr seccomp.AnyValue{}, // tls }) } func archFstatAtSysNo() uintptr { return unix.SYS_NEWFSTATAT } func archSpecificHotSyscalls() []uintptr { return []uintptr{ unix.SYS_NANOSLEEP, // Used a bunch unix.SYS_SENDMMSG, // Used by network workloads unix.SYS_FSTAT, // Used for file I/O unix.SYS_PPOLL, // Used in general for I/O unix.SYS_EPOLL_WAIT, // Same } } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/config_amd64_state_autogen.go000066400000000000000000000001321465435605700311770ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 // +build amd64 package config golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/config_arm64.go000066400000000000000000000024611465435605700263020ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package config import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/seccomp" ) func init() { allowedSyscalls.Set(unix.SYS_CLONE, seccomp.PerArg{ seccomp.EqualTo( unix.CLONE_VM | unix.CLONE_FS | unix.CLONE_FILES | unix.CLONE_SIGHAND | unix.CLONE_SYSVSEM | unix.CLONE_THREAD), seccomp.AnyValue{}, // newsp // These arguments are left uninitialized by the Go // runtime, so they may be anything (and are unused by // the host). seccomp.AnyValue{}, // parent_tidptr seccomp.AnyValue{}, // tls seccomp.AnyValue{}, // child_tidptr }) } func archFstatAtSysNo() uintptr { return unix.SYS_FSTATAT } func archSpecificHotSyscalls() []uintptr { return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/config_arm64_state_autogen.go000066400000000000000000000001321465435605700312150ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package config golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/config_main.go000066400000000000000000000300511465435605700262710ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" ) // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS. 
var allowedSyscalls = seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_CLOCK_GETTIME: seccomp.MatchAll{}, unix.SYS_CLOSE: seccomp.MatchAll{}, unix.SYS_DUP: seccomp.MatchAll{}, unix.SYS_DUP3: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.O_CLOEXEC), }, unix.SYS_EPOLL_CREATE1: seccomp.MatchAll{}, unix.SYS_EPOLL_CTL: seccomp.MatchAll{}, unix.SYS_EPOLL_PWAIT: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(0), }, unix.SYS_EVENTFD2: seccomp.PerArg{ seccomp.EqualTo(0), seccomp.EqualTo(0), }, unix.SYS_EXIT: seccomp.MatchAll{}, unix.SYS_EXIT_GROUP: seccomp.MatchAll{}, unix.SYS_FALLOCATE: seccomp.MatchAll{}, unix.SYS_FCHMOD: seccomp.MatchAll{}, unix.SYS_FCNTL: seccomp.Or{ seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.F_GETFL), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.F_SETFL), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.F_GETFD), }, }, unix.SYS_FSTAT: seccomp.MatchAll{}, unix.SYS_FSYNC: seccomp.MatchAll{}, unix.SYS_FTRUNCATE: seccomp.MatchAll{}, unix.SYS_FUTEX: seccomp.PerArg{ seccomp.AnyValue{}, // Allow any combination of FUTEX_{WAIT,WAKE,PRIVATE_FLAG}, but no other. // Non-private variants are included for flipcall support. They are // otherwise unnecessary, as the sentry will use only private futexes // internally. seccomp.BitsAllowlist( linux.FUTEX_WAIT | linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG, ), }, // getcpu is used by some versions of the Go runtime and by the hostcpu // package on arm64. unix.SYS_GETCPU: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(0), seccomp.EqualTo(0), }, unix.SYS_GETPID: seccomp.MatchAll{}, unix.SYS_GETRANDOM: seccomp.MatchAll{}, unix.SYS_GETSOCKOPT: seccomp.Or{ seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.SOL_SOCKET), seccomp.EqualTo(unix.SO_DOMAIN), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.SOL_SOCKET), seccomp.EqualTo(unix.SO_TYPE), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.SOL_SOCKET), seccomp.EqualTo(unix.SO_ERROR), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.SOL_SOCKET), seccomp.EqualTo(unix.SO_SNDBUF), }, }, unix.SYS_GETTID: seccomp.MatchAll{}, unix.SYS_GETTIMEOFDAY: seccomp.MatchAll{}, unix.SYS_IOCTL: seccomp.Or{ // These commands are needed for host FD. seccomp.PerArg{ seccomp.NonNegativeFD{}, /* fd */ seccomp.EqualTo(linux.FIONREAD), seccomp.AnyValue{}, /* int* */ }, // These commands are needed for terminal support, but we only allow // setting/getting termios and winsize. 
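The futex rule above allowlists only the FUTEX_WAIT and FUTEX_WAKE bits plus FUTEX_PRIVATE_FLAG. As I read seccomp.BitsAllowlist, an argument passes when it contains no bits outside that allowlist; the sketch below models that predicate with the constants from <linux/futex.h>.

package main

import "fmt"

// Values from <linux/futex.h>.
const (
	futexWait        = 0
	futexWake        = 1
	futexRequeue     = 3
	futexPrivateFlag = 128
)

// allowedByBitsAllowlist models (one reading of) seccomp.BitsAllowlist: the
// argument may only contain bits that appear in the allowlist.
func allowedByBitsAllowlist(arg, allowlist uint64) bool {
	return arg&^allowlist == 0
}

func main() {
	allowlist := uint64(futexWait | futexWake | futexPrivateFlag)
	fmt.Println(allowedByBitsAllowlist(futexWake|futexPrivateFlag, allowlist)) // true
	fmt.Println(allowedByBitsAllowlist(futexWait, allowlist))                  // true (WAIT contributes no bits)
	fmt.Println(allowedByBitsAllowlist(futexRequeue, allowlist))               // false: blocked
}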
seccomp.PerArg{ seccomp.NonNegativeFD{}, /* fd */ seccomp.EqualTo(linux.TCGETS), seccomp.AnyValue{}, /* termios struct */ }, seccomp.PerArg{ seccomp.NonNegativeFD{}, /* fd */ seccomp.EqualTo(linux.TCSETS), seccomp.AnyValue{}, /* termios struct */ }, seccomp.PerArg{ seccomp.NonNegativeFD{}, /* fd */ seccomp.EqualTo(linux.TCSETSF), seccomp.AnyValue{}, /* termios struct */ }, seccomp.PerArg{ seccomp.NonNegativeFD{}, /* fd */ seccomp.EqualTo(linux.TCSETSW), seccomp.AnyValue{}, /* termios struct */ }, seccomp.PerArg{ seccomp.NonNegativeFD{}, /* fd */ seccomp.EqualTo(linux.TIOCSWINSZ), seccomp.AnyValue{}, /* winsize struct */ }, seccomp.PerArg{ seccomp.NonNegativeFD{}, /* fd */ seccomp.EqualTo(linux.TIOCGWINSZ), seccomp.AnyValue{}, /* winsize struct */ }, seccomp.PerArg{ seccomp.NonNegativeFD{}, /* fd */ seccomp.EqualTo(linux.SIOCGIFTXQLEN), seccomp.AnyValue{}, /* ifreq struct */ }, }, unix.SYS_LSEEK: seccomp.MatchAll{}, unix.SYS_MADVISE: seccomp.MatchAll{}, unix.SYS_MEMBARRIER: seccomp.PerArg{ seccomp.EqualTo(linux.MEMBARRIER_CMD_GLOBAL), seccomp.EqualTo(0), }, unix.SYS_MINCORE: seccomp.MatchAll{}, unix.SYS_MLOCK: seccomp.MatchAll{}, unix.SYS_MMAP: seccomp.Or{ seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MAP_SHARED), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MAP_SHARED | unix.MAP_FIXED), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MAP_PRIVATE), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_STACK), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_NORESERVE), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.PROT_WRITE | unix.PROT_READ), seccomp.EqualTo(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_FIXED), }, }, unix.SYS_MPROTECT: seccomp.MatchAll{}, unix.SYS_MUNLOCK: seccomp.MatchAll{}, unix.SYS_MUNMAP: seccomp.MatchAll{}, unix.SYS_NANOSLEEP: seccomp.MatchAll{}, unix.SYS_PPOLL: seccomp.MatchAll{}, unix.SYS_PREAD64: seccomp.MatchAll{}, unix.SYS_PREADV: seccomp.MatchAll{}, unix.SYS_PREADV2: seccomp.MatchAll{}, unix.SYS_PWRITE64: seccomp.MatchAll{}, unix.SYS_PWRITEV: seccomp.MatchAll{}, unix.SYS_PWRITEV2: seccomp.MatchAll{}, unix.SYS_READ: seccomp.MatchAll{}, unix.SYS_RECVMSG: seccomp.Or{ seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MSG_DONTWAIT | unix.MSG_TRUNC), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MSG_DONTWAIT | unix.MSG_TRUNC | unix.MSG_PEEK), }, }, unix.SYS_RECVMMSG: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(fdbased.MaxMsgsPerRecv), seccomp.EqualTo(unix.MSG_DONTWAIT), seccomp.EqualTo(0), }, unix.SYS_SENDMMSG: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MSG_DONTWAIT), }, unix.SYS_RESTART_SYSCALL: seccomp.MatchAll{}, unix.SYS_RT_SIGACTION: seccomp.MatchAll{}, unix.SYS_RT_SIGPROCMASK: seccomp.MatchAll{}, unix.SYS_RT_SIGRETURN: seccomp.MatchAll{}, unix.SYS_SCHED_YIELD: seccomp.MatchAll{}, unix.SYS_SENDMSG: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MSG_DONTWAIT | unix.MSG_NOSIGNAL), }, 
unix.SYS_SETITIMER: seccomp.MatchAll{}, unix.SYS_SHUTDOWN: seccomp.Or{ // Used by fs/host to shutdown host sockets. seccomp.PerArg{seccomp.AnyValue{}, seccomp.EqualTo(unix.SHUT_RD)}, seccomp.PerArg{seccomp.AnyValue{}, seccomp.EqualTo(unix.SHUT_WR)}, // Used by unet to shutdown connections. seccomp.PerArg{seccomp.AnyValue{}, seccomp.EqualTo(unix.SHUT_RDWR)}, }, unix.SYS_SIGALTSTACK: seccomp.MatchAll{}, unix.SYS_STATX: seccomp.MatchAll{}, unix.SYS_SYNC_FILE_RANGE: seccomp.MatchAll{}, unix.SYS_TEE: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(1), /* len */ seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */ }, unix.SYS_TIMER_CREATE: seccomp.PerArg{ seccomp.EqualTo(unix.CLOCK_THREAD_CPUTIME_ID), /* which */ seccomp.AnyValue{}, /* sevp */ seccomp.AnyValue{}, /* timerid */ }, unix.SYS_TIMER_DELETE: seccomp.MatchAll{}, unix.SYS_TIMER_SETTIME: seccomp.PerArg{ seccomp.AnyValue{}, /* timerid */ seccomp.EqualTo(0), /* flags */ seccomp.AnyValue{}, /* new_value */ seccomp.EqualTo(0), /* old_value */ }, unix.SYS_UTIMENSAT: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(0), /* null pathname */ seccomp.AnyValue{}, seccomp.EqualTo(0), /* flags */ }, unix.SYS_WRITE: seccomp.MatchAll{}, // For rawfile.NonBlockingWriteIovec. unix.SYS_WRITEV: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.GreaterThan(0), }, }) func controlServerFilters(fd uint32) seccomp.SyscallRules { return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_ACCEPT4: seccomp.PerArg{ seccomp.EqualTo(fd), }, unix.SYS_LISTEN: seccomp.PerArg{ seccomp.EqualTo(fd), seccomp.EqualTo(16 /* unet.backlog */), }, unix.SYS_GETSOCKOPT: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.SOL_SOCKET), seccomp.EqualTo(unix.SO_PEERCRED), }, }) } // selfPIDFilters contains syscall filters that depend on the process's PID. func selfPIDFilters(pid uint64) seccomp.SyscallRules { return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_TGKILL: seccomp.PerArg{ seccomp.EqualTo(pid), }, }) } // hostFilesystemFilters contains syscalls that are needed by directfs. func hostFilesystemFilters() seccomp.SyscallRules { // Directfs allows FD-based filesystem syscalls. We deny these syscalls with // negative FD values (like AT_FDCWD or invalid FD numbers). We try to be as // restrictive as possible because any restriction here improves security. We // don't know what set of arguments will trigger a future vulnerability. 
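controlServerFilters(fd) and selfPIDFilters(pid) above show the pattern used throughout this package: small rule sets parameterized by runtime values, which rules() merges into the base allowlist. A sketch in the same style follows, reusing only constructors that already appear in this file (MakeSyscallRules, PerArg, EqualTo, Or, MatchAll, Merge, Size); the pipeNotifyFilters rule set and FD value are invented for illustration, and the example assumes the gvisor.dev/gvisor module is on the import path.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/seccomp"
)

// pipeNotifyFilters allows read on a specific notification FD and write on
// either that FD or stderr, mirroring the controlServerFilters(fd) pattern.
func pipeNotifyFilters(fd uint32) seccomp.SyscallRules {
	return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{
		unix.SYS_READ: seccomp.PerArg{
			seccomp.EqualTo(fd),
		},
		unix.SYS_WRITE: seccomp.Or{
			seccomp.PerArg{seccomp.EqualTo(fd)},
			seccomp.PerArg{seccomp.EqualTo(2)}, // also allow stderr
		},
	})
}

func main() {
	base := seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{
		unix.SYS_EXIT_GROUP: seccomp.MatchAll{},
	})
	base.Merge(pipeNotifyFilters(42))
	fmt.Println("rules in set:", base.Size())
}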
return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_FCHOWNAT: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.AT_EMPTY_PATH | unix.AT_SYMLINK_NOFOLLOW), }, unix.SYS_FCHMODAT: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.AnyValue{}, }, unix.SYS_UNLINKAT: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.AnyValue{}, }, unix.SYS_GETDENTS64: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.AnyValue{}, }, unix.SYS_OPENAT: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.MaskedEqual(unix.O_NOFOLLOW, unix.O_NOFOLLOW), seccomp.AnyValue{}, }, unix.SYS_LINKAT: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.EqualTo(0), }, unix.SYS_MKDIRAT: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.AnyValue{}, }, unix.SYS_MKNODAT: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, }, unix.SYS_SYMLINKAT: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.NonNegativeFD{}, seccomp.AnyValue{}, }, unix.SYS_FSTATFS: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, }, unix.SYS_READLINKAT: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, }, unix.SYS_UTIMENSAT: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, }, unix.SYS_RENAMEAT: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.NonNegativeFD{}, seccomp.AnyValue{}, }, archFstatAtSysNo(): seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, }, unix.SYS_FGETXATTR: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, }, }) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/config_precompiled.go000066400000000000000000000112171465435605700276530ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "fmt" "golang.org/x/sync/errgroup" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/seccomp/precompiledseccomp" "gvisor.dev/gvisor/pkg/sentry/platform" // Import platforms that we need to precompile filters for. _ "gvisor.dev/gvisor/runsc/boot/platforms" ) // Variable names used in precompiled filters. const ( // controllerFDVarName is the variable name for `Options.ControllerFD` // used in the precompiled seccomp filters. controllerFDVarName = "controller_fd" // selfPIDVarName is the variable name for the current process ID. selfPIDVarName = "self_pid" ) // allPrecompiledPlatforms returns a list of `platform.SeccompInfo` instances // that should be precompiled into seccomp programs. 
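The directfs filters above only admit the *at() syscall family with a non-negative dirfd, i.e. lookups relative to an already-open directory FD rather than AT_FDCWD. A host-side sketch of that access style using golang.org/x/sys/unix follows; the /tmp directory and file name are just examples.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Open a directory FD first; all later access is relative to it, which is
	// what makes the NonNegativeFD{} restriction in the filters meaningful.
	dirFD, err := unix.Open("/tmp", unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(dirFD)

	// Create and stat a file relative to dirFD, never via AT_FDCWD.
	fd, err := unix.Openat(dirFD, "directfs-example", unix.O_CREAT|unix.O_WRONLY|unix.O_CLOEXEC, 0o600)
	if err != nil {
		panic(err)
	}
	unix.Close(fd)

	var st unix.Stat_t
	if err := unix.Fstatat(dirFD, "directfs-example", &st, unix.AT_SYMLINK_NOFOLLOW); err != nil {
		panic(err)
	}
	fmt.Printf("inode %d, mode %#o\n", st.Ino, st.Mode&0o7777)

	_ = unix.Unlinkat(dirFD, "directfs-example", 0)
}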
func allPrecompiledPlatforms() ([]platform.SeccompInfo, error) { var seccompInfos []platform.SeccompInfo for _, platformName := range platform.List() { constructor, err := platform.Lookup(platformName) if err != nil { return nil, fmt.Errorf("cannot lookup platform %q: %w", platformName, err) } for _, si := range constructor.PrecompiledSeccompInfo() { seccompInfos = append(seccompInfos, si) } } return seccompInfos, nil } // optionsToPrecompile returns the set of `Options` for which we should // precompile seccomp filters. func optionsToPrecompile() ([]Options, error) { type expandFn func(opt Options) ([]Options, error) opts := []Options{Options{}} for _, fn := range []expandFn{ // Expand all platforms. func(opt Options) ([]Options, error) { var newOpts []Options platforms, err := allPrecompiledPlatforms() if err != nil { return nil, err } for _, platform := range platforms { optCopy := opt optCopy.Platform = platform newOpts = append(newOpts, optCopy) } return newOpts, nil }, // Only precompile options with host networking disabled. func(opt Options) ([]Options, error) { opt.HostNetwork = false return []Options{opt}, nil }, // Only precompile options with DirectFS enabled. func(opt Options) ([]Options, error) { opt.HostFilesystem = true return []Options{opt}, nil }, // Expand NVProxy vs not. func(opt Options) ([]Options, error) { nvProxyYes := opt nvProxyYes.NVProxy = true nvProxyNo := opt nvProxyNo.NVProxy = false return []Options{nvProxyYes, nvProxyNo}, nil }, // Expand TPUProxy vs not. func(opt Options) ([]Options, error) { tpuProxyYes := opt tpuProxyYes.TPUProxy = true tpuProxyNo := opt tpuProxyNo.TPUProxy = false return []Options{tpuProxyYes, tpuProxyNo}, nil }, } { var newOpts []Options for _, opt := range opts { expanded, err := fn(opt) if err != nil { return nil, err } for _, newOpt := range expanded { newOpts = append(newOpts, newOpt) } } opts = newOpts } return opts, nil } // PrecompiledPrograms returns the set of seccomp programs to precompile. func PrecompiledPrograms() ([]precompiledseccomp.Program, error) { opts, err := optionsToPrecompile() if err != nil { return nil, err } programs := make([]precompiledseccomp.Program, len(opts)) var errGroup errgroup.Group for i, opt := range opts { i, opt := i, opt errGroup.Go(func() error { var varNames []string for varName := range opt.Vars() { varNames = append(varNames, varName) } program, err := precompiledseccomp.Precompile(opt.ConfigKey(), varNames, func(vars precompiledseccomp.Values) precompiledseccomp.ProgramDesc { opt := opt seccompOpts := SeccompOptions(opt) rules, denyRules := rules(opt, vars) return precompiledseccomp.ProgramDesc{ Rules: []seccomp.RuleSet{ { Rules: denyRules.Copy(), Action: seccompOpts.DefaultAction, }, { Rules: rules.Copy(), Action: linux.SECCOMP_RET_ALLOW, }, }, SeccompOptions: seccompOpts, } }) if err != nil { return fmt.Errorf("cannot precompile seccomp program for options %v: %w", opt.ConfigKey(), err) } programs[i] = program return nil }) } if err := errGroup.Wait(); err != nil { return nil, err } return programs, nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/config_profile.go000066400000000000000000000020511465435605700270040ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
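optionsToPrecompile above threads a seed Options value through a list of expansion functions, producing the cross product of platforms and NVProxy/TPUProxy settings (with host networking forced off and directfs forced on). A dependency-free sketch of that expansion pattern follows; the opts struct and the platform names are illustrative.

package main

import "fmt"

type opts struct {
	Platform string
	NVProxy  bool
	TPUProxy bool
}

type expandFn func(o opts) []opts

// expandAll applies each expansion function to every option set produced so
// far, mirroring the loop at the end of optionsToPrecompile.
func expandAll(seed opts, fns []expandFn) []opts {
	cur := []opts{seed}
	for _, fn := range fns {
		var next []opts
		for _, o := range cur {
			next = append(next, fn(o)...)
		}
		cur = next
	}
	return cur
}

func main() {
	fns := []expandFn{
		// Expand example platforms.
		func(o opts) []opts {
			var out []opts
			for _, p := range []string{"systrap", "kvm"} {
				c := o
				c.Platform = p
				out = append(out, c)
			}
			return out
		},
		// Expand NVProxy on/off.
		func(o opts) []opts {
			yes, no := o, o
			yes.NVProxy, no.NVProxy = true, false
			return []opts{yes, no}
		},
		// Expand TPUProxy on/off.
		func(o opts) []opts {
			yes, no := o, o
			yes.TPUProxy, no.TPUProxy = true, false
			return []opts{yes, no}
		},
	}
	all := expandAll(opts{}, fns)
	fmt.Println(len(all), "option sets") // 2 * 2 * 2 = 8
}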
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package config import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/seccomp" ) // profileFilters returns extra syscalls made by runtime/pprof package. func profileFilters() seccomp.SyscallRules { return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_OPENAT: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.O_RDONLY | unix.O_LARGEFILE | unix.O_CLOEXEC), }, }) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/config_race_amd64_state_autogen.go000066400000000000000000000001301465435605700321670ustar00rootroot00000000000000// automatically generated by stateify. //go:build race // +build race package config golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/config_race_arm64_state_autogen.go000066400000000000000000000001301465435605700322050ustar00rootroot00000000000000// automatically generated by stateify. //go:build race // +build race package config golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/config_race_state_autogen.go000066400000000000000000000001301465435605700311740ustar00rootroot00000000000000// automatically generated by stateify. //go:build race // +build race package config golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/config_state_autogen.go000066400000000000000000000002431465435605700302070ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false && !asan && !msan && !race && asan && msan // +build !false,!asan,!msan,!race,asan,msan package config golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/extra_filters.go000066400000000000000000000016511465435605700266770ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !asan && !msan && !race package config import ( "gvisor.dev/gvisor/pkg/seccomp" ) // instrumentationFilters returns additional filters for syscalls used by // Go instrumentation tools, e.g. -race, -msan. // Returns empty when disabled. func instrumentationFilters() seccomp.SyscallRules { return seccomp.NewSyscallRules() } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/extra_filters_asan.go000066400000000000000000000023011465435605700276720ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build asan // +build asan package config import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" ) // instrumentationFilters returns additional filters for syscalls used by ASAN. func instrumentationFilters() seccomp.SyscallRules { log.Warningf("ASAN is enabled: syscall filters less restrictive!") return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_CLONE: seccomp.MatchAll{}, unix.SYS_MMAP: seccomp.MatchAll{}, unix.SYS_SCHED_GETAFFINITY: seccomp.MatchAll{}, unix.SYS_SET_ROBUST_LIST: seccomp.MatchAll{}, }) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/extra_filters_hostinet.go000066400000000000000000000113431465435605700306130ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" ) // hostInetFilters contains syscalls that are needed by sentry/socket/hostinet. 
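//
// As a rough sketch of the shape of the returned rules (illustrative only,
// not an addition to the real filter set), each allowed syscall maps to a
// seccomp.SyscallRule, with per-argument restrictions expressed through
// seccomp.PerArg:
//
//	seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{
//		unix.SYS_LISTEN: seccomp.MatchAll{},
//		unix.SYS_SHUTDOWN: seccomp.PerArg{
//			seccomp.AnyValue{},
//			seccomp.EqualTo(unix.SHUT_RDWR),
//		},
//	})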
func hostInetFilters(allowRawSockets bool) seccomp.SyscallRules { rules := seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_ACCEPT4: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.SOCK_NONBLOCK | unix.SOCK_CLOEXEC), }, unix.SYS_BIND: seccomp.MatchAll{}, unix.SYS_CONNECT: seccomp.MatchAll{}, unix.SYS_GETPEERNAME: seccomp.MatchAll{}, unix.SYS_GETSOCKNAME: seccomp.MatchAll{}, unix.SYS_IOCTL: seccomp.Or{ seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(unix.SIOCGIFCONF), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(unix.SIOCETHTOOL), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(unix.SIOCGIFFLAGS), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(unix.SIOCGIFHWADDR), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(unix.SIOCGIFINDEX), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(unix.SIOCGIFMTU), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(unix.SIOCGIFNAME), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(unix.SIOCGIFNETMASK), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(unix.TIOCOUTQ), }, seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.EqualTo(unix.TIOCINQ), }, }, unix.SYS_LISTEN: seccomp.MatchAll{}, unix.SYS_READV: seccomp.MatchAll{}, unix.SYS_RECVFROM: seccomp.MatchAll{}, unix.SYS_RECVMSG: seccomp.MatchAll{}, unix.SYS_SENDMSG: seccomp.MatchAll{}, unix.SYS_SENDTO: seccomp.MatchAll{}, unix.SYS_SHUTDOWN: seccomp.Or{ seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.SHUT_RD), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.SHUT_WR), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.SHUT_RDWR), }, }, unix.SYS_WRITEV: seccomp.MatchAll{}, }) // Need NETLINK_ROUTE and stream sockets to query host interfaces and // routes. socketRules := seccomp.Or{ seccomp.PerArg{ seccomp.EqualTo(unix.AF_NETLINK), seccomp.EqualTo(unix.SOCK_RAW | unix.SOCK_CLOEXEC), seccomp.EqualTo(unix.NETLINK_ROUTE), }, seccomp.PerArg{ seccomp.EqualTo(unix.AF_INET), seccomp.EqualTo(unix.SOCK_STREAM), seccomp.EqualTo(0), }, seccomp.PerArg{ seccomp.EqualTo(unix.AF_INET6), seccomp.EqualTo(unix.SOCK_STREAM), seccomp.EqualTo(0), }, } // Generate rules for socket creation based on hostinet's supported // socket types. stypes := hostinet.AllowedSocketTypes if allowRawSockets { stypes = append(stypes, hostinet.AllowedRawSocketTypes...) } for _, sock := range stypes { rule := seccomp.PerArg{ seccomp.EqualTo(sock.Family), // We always set SOCK_NONBLOCK and SOCK_CLOEXEC. seccomp.EqualTo(sock.Type | linux.SOCK_NONBLOCK | linux.SOCK_CLOEXEC), // Match specific protocol by default. seccomp.EqualTo(sock.Protocol), } if sock.Protocol == hostinet.AllowAllProtocols { // Change protocol filter to MatchAny. rule[2] = seccomp.AnyValue{} } socketRules = append(socketRules, rule) } rules.Set(unix.SYS_SOCKET, socketRules) // Generate rules for socket options based on hostinet's supported // socket options. 
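// Each entry in hostinet.SockOpts expands into a getsockopt rule when
// AllowGet is set and a setsockopt rule when AllowSet is set; setsockopt
// rules additionally pin the option length when a fixed Size is known.
// For instance, an option allowed for set with a 4-byte payload yields a
// rule equivalent to (values illustrative):
//
//	seccomp.PerArg{
//		seccomp.AnyValue{},         // fd
//		seccomp.EqualTo(opt.Level),
//		seccomp.EqualTo(opt.Name),
//		seccomp.AnyValue{},         // optval
//		seccomp.EqualTo(4),         // optlen
//	}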
for _, opt := range hostinet.SockOpts { if opt.AllowGet { rules.Add(unix.SYS_GETSOCKOPT, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(opt.Level), seccomp.EqualTo(opt.Name), }) } if opt.AllowSet { if opt.Size > 0 { rules.Add(unix.SYS_SETSOCKOPT, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(opt.Level), seccomp.EqualTo(opt.Name), seccomp.AnyValue{}, seccomp.EqualTo(opt.Size), }) } else { rules.Add(unix.SYS_SETSOCKOPT, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(opt.Level), seccomp.EqualTo(opt.Name), }) } } } return rules } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/extra_filters_msan.go000066400000000000000000000023011465435605700277060ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build msan // +build msan package config import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" ) // instrumentationFilters returns additional filters for syscalls used by MSAN. func instrumentationFilters() seccomp.SyscallRules { log.Warningf("MSAN is enabled: syscall filters less restrictive!") return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_CLONE: seccomp.MatchAll{}, unix.SYS_MMAP: seccomp.MatchAll{}, unix.SYS_SCHED_GETAFFINITY: seccomp.MatchAll{}, unix.SYS_SET_ROBUST_LIST: seccomp.MatchAll{}, }) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/extra_filters_race.go000066400000000000000000000031551465435605700276720ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build race // +build race package config import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" ) // instrumentationFilters returns additional filters for syscalls used by TSAN. 
func instrumentationFilters() seccomp.SyscallRules { log.Warningf("TSAN is enabled: syscall filters less restrictive!") return archInstrumentationFilters(seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_BRK: seccomp.MatchAll{}, unix.SYS_CLOCK_NANOSLEEP: seccomp.MatchAll{}, unix.SYS_CLONE: seccomp.MatchAll{}, unix.SYS_CLONE3: seccomp.MatchAll{}, unix.SYS_FUTEX: seccomp.MatchAll{}, unix.SYS_MMAP: seccomp.MatchAll{}, unix.SYS_MUNLOCK: seccomp.MatchAll{}, unix.SYS_NANOSLEEP: seccomp.MatchAll{}, unix.SYS_OPENAT: seccomp.MatchAll{}, unix.SYS_RSEQ: seccomp.MatchAll{}, unix.SYS_SET_ROBUST_LIST: seccomp.MatchAll{}, unix.SYS_SCHED_GETAFFINITY: seccomp.MatchAll{}, })) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/extra_filters_race_amd64.go000066400000000000000000000016321465435605700306630ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build race // +build race package config import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/seccomp" ) func archInstrumentationFilters(f seccomp.SyscallRules) seccomp.SyscallRules { f.Set(unix.SYS_OPEN, seccomp.MatchAll{}) // Used within glibc's malloc. f.Set(unix.SYS_TIME, seccomp.MatchAll{}) return f } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/config/extra_filters_race_arm64.go000066400000000000000000000014151465435605700307000ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build race // +build race package config import ( "gvisor.dev/gvisor/pkg/seccomp" ) func archInstrumentationFilters(f seccomp.SyscallRules) seccomp.SyscallRules { return f } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/filter.go000066400000000000000000000050031465435605700240370ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package filter installs seccomp filters to prevent prohibited syscalls // in case it's compromised. 
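//
// A minimal usage sketch (the variable names below are assumptions, not part
// of this package):
//
//	opts := filter.Options{
//		Platform:     platformSeccompInfo, // from platform.Platform.SeccompInfo()
//		HostNetwork:  false,
//		ControllerFD: uint32(controllerFD),
//	}
//	if err := filter.Install(opts); err != nil {
//		// handle error
//	}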
package filter import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/runsc/boot/filter/config" ) // *** DEBUG TIP *** // If you suspect the Sentry is getting killed due to a seccomp violation, // change this to `true` to get a panic stack trace when there is a // violation. const debugFilter = false // Options is a re-export of the config Options type under this package. type Options = config.Options // Install seccomp filters based on the given platform. func Install(opt Options) error { for _, warning := range config.Warnings(opt) { log.Warningf("*** SECCOMP WARNING: %s", warning) } key := opt.ConfigKey() precompiled, usePrecompiled := GetPrecompiled(key) if usePrecompiled && !debugFilter { vars := opt.Vars() log.Debugf("Loaded precompiled seccomp instructions for options %v, using variables: %v", key, vars) insns, err := precompiled.RenderInstructions(vars) if err != nil { return fmt.Errorf("cannot render precompiled program for options %v / vars %v: %w", key, vars, err) } return seccomp.SetFilter(insns) } seccompOpts := config.SeccompOptions(opt) if debugFilter { log.Infof("Seccomp filter debugging is enabled; seccomp failures will result in a panic stack trace.") seccompOpts.DefaultAction = linux.SECCOMP_RET_TRAP } else { log.Infof("No precompiled program found for config options %v, building seccomp program from scratch. This may slow down container startup.", key) if log.IsLogging(log.Debug) { precompiledKeys := ListPrecompiled() log.Debugf("Precompiled seccomp-bpf program configuration option variants (%d):", len(precompiledKeys)) for k := range precompiledKeys { log.Debugf(" %v", k) } } } rules, denyRules := config.Rules(opt) return seccomp.Install(rules, denyRules, seccompOpts) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/filter_precompiled.go000066400000000000000000000035401465435605700264260ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package filter import ( "sort" "gvisor.dev/gvisor/pkg/seccomp/precompiledseccomp" "gvisor.dev/gvisor/pkg/sync" ) var ( // precompiledPrograms holds registered programs. // It is populated in `registerPrograms`. precompiledPrograms map[string]precompiledseccomp.Program = nil // registerPrecompiledProgramsOnce ensures that program registration // happens only once. registerPrecompiledProgramsOnce sync.Once ) // GetPrecompiled returns the precompiled program for the given name, // and whether that program name exists. func GetPrecompiled(programName string) (precompiledseccomp.Program, bool) { registerPrecompiledProgramsOnce.Do(registerPrograms) program, ok := precompiledPrograms[programName] return program, ok } // ListPrecompiled returns a list of all registered program names. 
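//
// Example (illustrative):
//
//	for _, name := range ListPrecompiled() {
//		log.Debugf("precompiled seccomp program variant: %s", name)
//	}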
func ListPrecompiled() []string { registerPrecompiledProgramsOnce.Do(registerPrograms) programNames := make([]string, 0, len(precompiledPrograms)) for name := range precompiledPrograms { programNames = append(programNames, name) } sort.Strings(programNames) return programNames } // registerPrograms registers available programs inside `precompiledPrograms`. func registerPrograms() { programs := make(map[string]precompiledseccomp.Program) precompiledPrograms = programs } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/filter/filter_state_autogen.go000066400000000000000000000000701465435605700267600ustar00rootroot00000000000000// automatically generated by stateify. package filter golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/gofer_conf.go000066400000000000000000000144131465435605700234010ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "fmt" "strings" "gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs" ) // GoferMountConfUpperType describes how upper layer is configured for the gofer mount. type GoferMountConfUpperType byte const ( // NoOverlay indicates that this gofer mount has no upper layer. In this case, // this gofer mount must have a lower layer (i.e. lower != NoneLower). NoOverlay GoferMountConfUpperType = iota // MemoryOverlay indicates that this gofer mount should be overlaid with a // tmpfs backed by application memory. MemoryOverlay // SelfOverlay indicates that this gofer mount should be overlaid with a // tmpfs backed by a host file in the mount's source directory. SelfOverlay // AnonOverlay indicates that this gofer mount should be overlaid with a // tmpfs backed by a host file in an anonymous directory. AnonOverlay // UpperMax indicates the number of the valid upper layer types. UpperMax ) // String returns a human-readable string representing the upper layer type. func (u GoferMountConfUpperType) String() string { switch u { case NoOverlay: return "none" case MemoryOverlay: return "memory" case SelfOverlay: return "self" case AnonOverlay: return "anon" } panic(fmt.Sprintf("Invalid gofer mount config upper layer type: %d", u)) } // Set sets the value. Set(String()) should be idempotent. func (u *GoferMountConfUpperType) Set(v string) error { switch v { case "none": *u = NoOverlay case "memory": *u = MemoryOverlay case "self": *u = SelfOverlay case "anon": *u = AnonOverlay default: return fmt.Errorf("invalid gofer mount config upper layer type: %s", v) } return nil } // GoferMountConfLowerType describes how lower layer is configured for the gofer mount. type GoferMountConfLowerType byte const ( // NoneLower indicates that this gofer mount has no lower layer. NoneLower GoferMountConfLowerType = iota // Lisafs indicates that this gofer mount has a LISAFS lower layer. Lisafs // Erofs indicates that this gofer mount has an EROFS lower layer. Erofs // LowerMax indicates the number of the valid lower layer types. LowerMax ) // String returns a human-readable string representing the lower layer type. 
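// The values mirror those accepted by Set below: "none", "lisafs", and
// erofs.Name.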
func (l GoferMountConfLowerType) String() string { switch l { case NoneLower: return "none" case Lisafs: return "lisafs" case Erofs: return erofs.Name } panic(fmt.Sprintf("Invalid gofer mount config lower layer type: %d", l)) } // Set sets the value. Set(String()) should be idempotent. func (l *GoferMountConfLowerType) Set(v string) error { switch v { case "none": *l = NoneLower case "lisafs": *l = Lisafs case erofs.Name: *l = Erofs default: return fmt.Errorf("invalid gofer mount config lower layer type: %s", v) } return nil } // GoferMountConf describes how a gofer mount is configured in the sandbox. type GoferMountConf struct { Upper GoferMountConfUpperType `json:"upper"` Lower GoferMountConfLowerType `json:"lower"` } // String returns a human-readable string representing the gofer mount config. func (g GoferMountConf) String() string { return fmt.Sprintf("%s:%s", g.Lower, g.Upper) } // Set sets the value. Set(String()) should be idempotent. func (g *GoferMountConf) Set(v string) error { parts := strings.Split(v, ":") if len(parts) != 2 { return fmt.Errorf("invalid gofer mount config format: %q", v) } if err := g.Lower.Set(parts[0]); err != nil { return err } if err := g.Upper.Set(parts[1]); err != nil { return err } if !g.valid() { return fmt.Errorf("invalid gofer mount config: %+v", g) } return nil } // IsFilestorePresent returns true if a filestore file was associated with this. func (g GoferMountConf) IsFilestorePresent() bool { return g.Upper == SelfOverlay || g.Upper == AnonOverlay } // IsSelfBacked returns true if this mount is backed by a filestore in itself. func (g GoferMountConf) IsSelfBacked() bool { return g.Upper == SelfOverlay } // ShouldUseOverlayfs returns true if an overlayfs should be applied. func (g GoferMountConf) ShouldUseOverlayfs() bool { return g.Lower != NoneLower && g.Upper != NoOverlay } // ShouldUseTmpfs returns true if a tmpfs should be applied. func (g GoferMountConf) ShouldUseTmpfs() bool { // g.valid() implies that g.Upper != NoOverlay. return g.Lower == NoneLower } // ShouldUseLisafs returns true if a lisafs client/server should be set up. func (g GoferMountConf) ShouldUseLisafs() bool { return g.Lower == Lisafs } // ShouldUseErofs returns true if an EROFS should be applied. func (g GoferMountConf) ShouldUseErofs() bool { return g.Lower == Erofs } // valid returns true if this is a valid gofer mount config. func (g GoferMountConf) valid() bool { return g.Lower < LowerMax && g.Upper < UpperMax && (g.Lower != NoneLower || g.Upper != NoOverlay) } // GoferMountConfFlags can be used with GoferMountConf flags that appear // multiple times. type GoferMountConfFlags []GoferMountConf // String implements flag.Value. func (g *GoferMountConfFlags) String() string { confs := make([]string, 0, len(*g)) for _, confVal := range *g { confs = append(confs, confVal.String()) } return strings.Join(confs, ",") } // Get implements flag.Value. func (g *GoferMountConfFlags) Get() any { return g } // GetArray returns an array of mappings. func (g *GoferMountConfFlags) GetArray() []GoferMountConf { return *g } // Set implements flag.Value and appends a gofer configuration from the command // line to the configs array. Set(String()) should be idempotent. 
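//
// For example (illustrative), the value "lisafs:memory,lisafs:none" yields
// two configs: the first with a LISAFS lower layer and a memory-backed
// overlay, the second with a LISAFS lower layer and no overlay.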
func (g *GoferMountConfFlags) Set(s string) error { confs := strings.Split(s, ",") for _, conf := range confs { var confVal GoferMountConf if err := confVal.Set(conf); err != nil { return fmt.Errorf("invalid GoferMountConf value (%s): %v", conf, err) } *g = append(*g, confVal) } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/limits.go000066400000000000000000000107131465435605700225720ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "fmt" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sync" ) func findName(lt limits.LimitType) string { for k, v := range limits.FromLinuxResourceName { if v == lt { return k } } return "unknown" } var defaults defs type defs struct { mu sync.Mutex set *limits.LimitSet err error } func (d *defs) get() (*limits.LimitSet, error) { d.mu.Lock() defer d.mu.Unlock() if d.err != nil { return nil, d.err } if d.set == nil { if err := d.initDefaults(); err != nil { d.err = err return nil, err } } return d.set, nil } func (d *defs) initDefaults() error { ls, err := limits.NewLinuxLimitSet() if err != nil { return err } // Set default limits based on what containers get by default, ex: // $ docker run --rm debian prlimit ls.SetUnchecked(limits.AS, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.Core, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.CPU, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.Data, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.FileSize, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.Locks, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: 65536, Max: 65536}) ls.SetUnchecked(limits.MessageQueueBytes, limits.Limit{Cur: 819200, Max: 819200}) ls.SetUnchecked(limits.Nice, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.NumberOfFiles, limits.Limit{Cur: 1048576, Max: 1048576}) ls.SetUnchecked(limits.ProcessCount, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.Rss, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.RealTimePriority, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.Rttime, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) ls.SetUnchecked(limits.SignalsPending, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.Stack, limits.Limit{Cur: 8388608, Max: limits.Infinity}) // Read host limits that directly affect the sandbox and adjust the defaults // based on them. 
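// For each of these resources, read the host's rlimit: warn when the host
// soft limit is below the recommended default, and adopt the host values
// whenever they differ from the defaults above.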
for _, res := range []int{unix.RLIMIT_FSIZE, unix.RLIMIT_NOFILE} { var hl unix.Rlimit if err := unix.Getrlimit(res, &hl); err != nil { return err } lt, ok := limits.FromLinuxResource[res] if !ok { return fmt.Errorf("unknown rlimit type %v", res) } hostLimit := limits.Limit{ Cur: limits.FromLinux(hl.Cur), Max: limits.FromLinux(hl.Max), } defaultLimit := ls.Get(lt) if hostLimit.Cur != limits.Infinity && hostLimit.Cur < defaultLimit.Cur { log.Warningf("Host limit is lower than recommended, resource: %q, host: %d, recommended: %d", findName(lt), hostLimit.Cur, defaultLimit.Cur) } if hostLimit.Cur != defaultLimit.Cur || hostLimit.Max != defaultLimit.Max { log.Infof("Setting limit from host, resource: %q {soft: %d, hard: %d}", findName(lt), hostLimit.Cur, hostLimit.Max) ls.SetUnchecked(lt, hostLimit) } } d.set = ls return nil } func createLimitSet(spec *specs.Spec, enableTPUProxy bool) (*limits.LimitSet, error) { ls, err := defaults.get() if err != nil { return nil, err } // Set RLIMIT_MEMLOCK's default value to unlimited when TPUProxy is enabled. // The value will be overwritten if the exact rlimit is provided. if enableTPUProxy { ls.SetUnchecked(limits.MemoryLocked, limits.Limit{Cur: limits.Infinity, Max: limits.Infinity}) } // Then apply overwrites on top of defaults. for _, rl := range spec.Process.Rlimits { lt, ok := limits.FromLinuxResourceName[rl.Type] if !ok { return nil, fmt.Errorf("unknown resource %q", rl.Type) } ls.SetUnchecked(lt, limits.Limit{ Cur: rl.Soft, Max: rl.Hard, }) } return ls, nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/loader.go000066400000000000000000001716151465435605700225500ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package boot loads the kernel and runs a container. 
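// The Loader type defined in this file keeps the state needed to start the
// kernel and run containers inside the sandbox.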
package boot import ( "errors" "fmt" mrand "math/rand" "os" "runtime" "strconv" gtime "time" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/syndtr/gocapability/capability" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/devices/nvproxy" "gvisor.dev/gvisor/pkg/sentry/fdimport" "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/user" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/seccheck" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/sighandling" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/runsc/boot/filter" _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. pf "gvisor.dev/gvisor/runsc/boot/portforward" "gvisor.dev/gvisor/runsc/boot/pprof" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/profile" "gvisor.dev/gvisor/runsc/specutils" "gvisor.dev/gvisor/runsc/specutils/seccomp" // Top-level inet providers. "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" // Include other supported socket providers. _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink" _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route" _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent" _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" ) // ContainerRuntimeState is the runtime state of a container. type ContainerRuntimeState int const ( // RuntimeStateInvalid used just in case of error. RuntimeStateInvalid ContainerRuntimeState = iota // RuntimeStateCreating indicates that the container is being // created, but has not started running yet. RuntimeStateCreating // RuntimeStateRunning indicates that the container is running. RuntimeStateRunning // RuntimeStateStopped indicates that the container has stopped. RuntimeStateStopped ) type containerInfo struct { cid string containerName string conf *config.Config // spec is the base configuration for the root container. spec *specs.Spec // procArgs refers to the container's init task. procArgs kernel.CreateProcessArgs // stdioFDs contains stdin, stdout, and stderr. stdioFDs []*fd.FD // passFDs are mappings of user-supplied host to guest file descriptors. 
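// For example, a mapping with host FD 7 and guest FD 5 exposes the
// host-owned descriptor 7 as descriptor 5 in the container's FD table
// (values illustrative).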
passFDs []fdMapping // execFD is the host file descriptor used for program execution. execFD *fd.FD // goferFDs are the FDs that attach the sandbox to the gofers. goferFDs []*fd.FD // devGoferFD is the FD to attach the sandbox to the dev gofer. devGoferFD *fd.FD // goferFilestoreFDs are FDs to the regular files that will back the tmpfs or // overlayfs mount for certain gofer mounts. goferFilestoreFDs []*fd.FD // goferMountConfs contains information about how the gofer mounts have been // configured. The first entry is for rootfs and the following entries are // for bind mounts in Spec.Mounts (in the same order). goferMountConfs []GoferMountConf // nvidiaUVMDevMajor is the device major number used for nvidia-uvm. nvidiaUVMDevMajor uint32 // nvidiaDriverVersion is the NVIDIA driver ABI version to use for // communicating with NVIDIA devices on the host. nvidiaDriverVersion string } type loaderState int const ( // created indicates that the Loader has been created, but not started yet. created loaderState = iota // started indicates that the Loader has been started. started // restoring indicates that the Loader has been created and is restoring // containers. It will change to started after restore is completed. restoring ) // Loader keeps state needed to start the kernel and run the container. type Loader struct { // k is the kernel. k *kernel.Kernel // ctrl is the control server. ctrl *controller // root contains information about the root container in the sandbox. root containerInfo watchdog *watchdog.Watchdog // stopSignalForwarding disables forwarding of signals to the sandboxed // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() // stopProfiling stops profiling started at container creation. It // should be called when a sandbox is destroyed. stopProfiling func() // PreSeccompCallback is called right before installing seccomp filters. PreSeccompCallback func() // restore is set to true if we are restoring a container. restore bool restoreWaiters *sync.Cond // sandboxID is the ID for the whole sandbox. sandboxID string // mountHints provides extra information about mounts for containers that // apply to the entire pod. mountHints *PodMountHints // productName is the value to show in // /sys/devices/virtual/dmi/id/product_name. productName string // hostShmemHuge is the host's value of // /sys/kernel/mm/transparent_hugepage/shmem_enabled. hostShmemHuge string // mu guards the fields below. mu sync.Mutex // state is guarded by mu. state loaderState // sharedMounts holds VFS mounts that may be shared between containers within // the same pod. It is mapped by mount source. // // sharedMounts is guarded by mu. sharedMounts map[string]*vfs.Mount // processes maps containers init process and invocation of exec. Root // processes are keyed with container ID and pid=0, while exec invocations // have the corresponding pid set. // // processes is guarded by mu. processes map[execID]*execProcess // containerIDs store container names and IDs to assist with restore and container // naming when user didn't provide one. // // Mapping: name -> cid. // processes is guarded by mu. containerIDs map[string]string // portForwardProxies is a list of active port forwarding connections. // // portForwardProxies is guarded by mu. portForwardProxies []*pf.Proxy saveFDs []*fd.FD } // execID uniquely identifies a sentry process that is executed in a container. 
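// The root (init) process of a container is keyed as {cid, pid: 0}; exec
// invocations in that container are keyed with their corresponding PID.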
type execID struct { cid string pid kernel.ThreadID } // execProcess contains the thread group and host TTY of a sentry process. type execProcess struct { // tg will be nil for containers that haven't started yet. tg *kernel.ThreadGroup // tty will be nil if the process is not attached to a terminal. tty *host.TTYFileDescription // pidnsPath is the pid namespace path in spec pidnsPath string // hostTTY is present when creating a sub-container with terminal enabled. // TTY file is passed during container create and must be saved until // container start. hostTTY *fd.FD } // fdMapping maps guest to host file descriptors. Guest file descriptors are // exposed to the application inside the sandbox through the FD table. type fdMapping struct { guest int host *fd.FD } // FDMapping is a helper type to represent a mapping from guest to host file // descriptors. In contrast to the unexported fdMapping type, it does not imply // file ownership. type FDMapping struct { Guest int Host int } func init() { // Initialize the random number generator. mrand.Seed(gtime.Now().UnixNano()) } // Args are the arguments for New(). type Args struct { // Id is the sandbox ID. ID string // Spec is the sandbox specification. Spec *specs.Spec // Conf is the system configuration. Conf *config.Config // ControllerFD is the FD to the URPC controller. The Loader takes ownership // of this FD and may close it at any time. ControllerFD int // Device is an optional argument that is passed to the platform. The Loader // takes ownership of this file and may close it at any time. Device *fd.FD // GoferFDs is an array of FDs used to connect with the Gofer. The Loader // takes ownership of these FDs and may close them at any time. GoferFDs []int // DevGoferFD is the FD for the dev gofer connection. The Loader takes // ownership of this FD and may close it at any time. DevGoferFD int // StdioFDs is the stdio for the application. The Loader takes ownership of // these FDs and may close them at any time. StdioFDs []int // PassFDs are user-supplied FD mappings from host to guest descriptors. // The Loader takes ownership of these FDs and may close them at any time. PassFDs []FDMapping // ExecFD is the host file descriptor used for program execution. ExecFD int // GoferFilestoreFDs are FDs to the regular files that will back the tmpfs or // overlayfs mount for certain gofer mounts. GoferFilestoreFDs []int // GoferMountConfs contains information about how the gofer mounts have been // configured. The first entry is for rootfs and the following entries are // for bind mounts in Spec.Mounts (in the same order). GoferMountConfs []GoferMountConf // NumCPU is the number of CPUs to create inside the sandbox. NumCPU int // TotalMem is the initial amount of total memory to report back to the // container. TotalMem uint64 // TotalHostMem is the total memory reported by host /proc/meminfo. TotalHostMem uint64 // UserLogFD is the file descriptor to write user logs to. UserLogFD int // ProductName is the value to show in // /sys/devices/virtual/dmi/id/product_name. ProductName string // PodInitConfigFD is the file descriptor to a file passed in the // --pod-init-config flag PodInitConfigFD int // SinkFDs is an ordered array of file descriptors to be used by seccheck // sinks configured from the --pod-init-config file. SinkFDs []int // ProfileOpts contains the set of profiles to enable and the // corresponding FDs where profile data will be written. 
ProfileOpts profile.Opts // NvidiaDriverVersion is the NVIDIA driver ABI version to use for // communicating with NVIDIA devices on the host. NvidiaDriverVersion string // HostShmemHuge is the host's value of // /sys/kernel/mm/transparent_hugepage/shmem_enabled, or empty if this is // unknown. HostShmemHuge string SaveFDs []*fd.FD } // make sure stdioFDs are always the same on initial start and on restore const startingStdioFD = 256 func getRootCredentials(spec *specs.Spec, conf *config.Config, userNs *auth.UserNamespace) *auth.Credentials { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { return nil } // Convert the spec's additional GIDs to KGIDs. extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) for _, GID := range spec.Process.User.AdditionalGids { extraKGIDs = append(extraKGIDs, auth.KGID(GID)) } if userNs == nil { userNs = auth.NewRootUserNamespace() } // Create credentials. creds := auth.NewUserCredentials( auth.KUID(spec.Process.User.UID), auth.KGID(spec.Process.User.GID), extraKGIDs, caps, userNs) return creds } // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. func New(args Args) (*Loader, error) { stopProfilingRuntime := profile.Start(args.ProfileOpts) stopProfiling := func() { stopProfilingRuntime() metric.StopProfilingMetrics() } // Initialize seccheck points. seccheck.Initialize() // We initialize the rand package now to make sure /dev/urandom is pre-opened // on kernels that do not support getrandom(2). if err := rand.Init(); err != nil { return nil, fmt.Errorf("setting up rand: %w", err) } if err := usage.Init(); err != nil { return nil, fmt.Errorf("setting up memory usage: %w", err) } if specutils.NVProxyEnabled(args.Spec, args.Conf) { nvproxy.Init() } kernel.IOUringEnabled = args.Conf.IOUring eid := execID{cid: args.ID} l := &Loader{ sandboxID: args.ID, processes: map[execID]*execProcess{eid: {}}, sharedMounts: make(map[string]*vfs.Mount), stopProfiling: stopProfiling, productName: args.ProductName, hostShmemHuge: args.HostShmemHuge, containerIDs: map[string]string{}, saveFDs: args.SaveFDs, } containerName := l.registerContainerLocked(args.Spec, args.ID) l.root = containerInfo{ cid: args.ID, containerName: containerName, conf: args.Conf, spec: args.Spec, goferMountConfs: args.GoferMountConfs, nvidiaDriverVersion: args.NvidiaDriverVersion, } // Make host FDs stable between invocations. Host FDs must map to the exact // same number when the sandbox is restored. Otherwise the wrong FD will be // used. newfd := startingStdioFD for _, stdioFD := range args.StdioFDs { // Check that newfd is unused to avoid clobbering over it. 
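// F_GETFD on an unused descriptor fails with EBADF; any other result means
// newfd is already open (or the probe itself failed), so bail out rather
// than clobber it.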
if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { if err != nil { return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err) } return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) } err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) if err != nil { return nil, fmt.Errorf("dup3 of stdios failed: %w", err) } l.root.stdioFDs = append(l.root.stdioFDs, fd.New(newfd)) _ = unix.Close(stdioFD) newfd++ } for _, goferFD := range args.GoferFDs { l.root.goferFDs = append(l.root.goferFDs, fd.New(goferFD)) } for _, filestoreFD := range args.GoferFilestoreFDs { l.root.goferFilestoreFDs = append(l.root.goferFilestoreFDs, fd.New(filestoreFD)) } if args.DevGoferFD >= 0 { l.root.devGoferFD = fd.New(args.DevGoferFD) } if args.ExecFD >= 0 { l.root.execFD = fd.New(args.ExecFD) } for _, customFD := range args.PassFDs { l.root.passFDs = append(l.root.passFDs, fdMapping{ host: fd.New(customFD.Host), guest: customFD.Guest, }) } // Create kernel and platform. p, err := createPlatform(args.Conf, args.Device) if err != nil { return nil, fmt.Errorf("creating platform: %w", err) } if specutils.NVProxyEnabled(args.Spec, args.Conf) && p.OwnsPageTables() { return nil, fmt.Errorf("--nvproxy is incompatible with platform %s: owns page tables", args.Conf.Platform) } l.k = &kernel.Kernel{Platform: p} // Create memory file. mf, err := createMemoryFile(args.Conf.AppHugePages, args.HostShmemHuge) if err != nil { return nil, fmt.Errorf("creating memory file: %w", err) } l.k.SetMemoryFile(mf) // Create VDSO. // // Pass k as the platform since it is savable, unlike the actual platform. vdso, err := loader.PrepareVDSO(l.k.MemoryFile()) if err != nil { return nil, fmt.Errorf("creating vdso: %w", err) } // Create timekeeper. tk := kernel.NewTimekeeper() params := kernel.NewVDSOParamPage(l.k.MemoryFile(), vdso.ParamPage.FileRange()) tk.SetClocks(time.NewCalibratedClocks(), params) if err := enableStrace(args.Conf); err != nil { return nil, fmt.Errorf("enabling strace: %w", err) } creds := getRootCredentials(args.Spec, args.Conf, nil /* UserNamespace */) if creds == nil { return nil, fmt.Errorf("getting root credentials") } // Create root network namespace/stack. netns, err := newRootNetworkNamespace(args.Conf, tk, creds.UserNamespace) if err != nil { return nil, fmt.Errorf("creating network: %w", err) } if args.NumCPU == 0 { args.NumCPU = runtime.NumCPU() } log.Infof("CPUs: %d", args.NumCPU) runtime.GOMAXPROCS(args.NumCPU) if args.TotalHostMem > 0 { // As per tmpfs(5), the default size limit is 50% of total physical RAM. // See mm/shmem.c:shmem_default_max_blocks(). tmpfs.SetDefaultSizeLimit(args.TotalHostMem / 2) } if args.TotalMem > 0 { // Adjust the total memory returned by the Sentry so that applications that // use /proc/meminfo can make allocations based on this limit. 
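// Setting both the minimum and maximum pins the reported total memory to
// exactly args.TotalMem.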
usage.MinimumTotalMemoryBytes = args.TotalMem usage.MaximumTotalMemoryBytes = args.TotalMem log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30)) } maxFDLimit := kernel.MaxFdLimit if args.Spec.Linux != nil && args.Spec.Linux.Sysctl != nil { if val, ok := args.Spec.Linux.Sysctl["fs.nr_open"]; ok { nrOpen, err := strconv.Atoi(val) if err != nil { return nil, fmt.Errorf("setting fs.nr_open=%s: %w", val, err) } if nrOpen <= 0 || nrOpen > int(kernel.MaxFdLimit) { return nil, fmt.Errorf("setting fs.nr_open=%s", val) } maxFDLimit = int32(nrOpen) } } // Initiate the Kernel object, which is required by the Context passed // to createVFS in order to mount (among other things) procfs. if err = l.k.Init(kernel.InitKernelArgs{ FeatureSet: cpuid.HostFeatureSet().Fixed(), Timekeeper: tk, RootUserNamespace: creds.UserNamespace, RootNetworkNamespace: netns, ApplicationCores: uint(args.NumCPU), Vdso: vdso, VdsoParams: params, RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace), RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace), MaxFDLimit: maxFDLimit, }); err != nil { return nil, fmt.Errorf("initializing kernel: %w", err) } if err := registerFilesystems(l.k, &l.root); err != nil { return nil, fmt.Errorf("registering filesystems: %w", err) } // Turn on packet logging if enabled. if args.Conf.LogPackets { log.Infof("Packet logging enabled") sniffer.LogPackets.Store(1) } else { log.Infof("Packet logging disabled") sniffer.LogPackets.Store(0) } // Create a watchdog. dogOpts := watchdog.DefaultOpts dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction l.watchdog = watchdog.New(l.k, dogOpts) procArgs, err := createProcessArgs(args.ID, args.Spec, args.Conf, creds, l.k, l.k.RootPIDNamespace()) if err != nil { return nil, fmt.Errorf("creating init process for root container: %w", err) } l.root.procArgs = procArgs if err := initCompatLogs(args.UserLogFD); err != nil { return nil, fmt.Errorf("initializing compat logs: %w", err) } l.mountHints, err = NewPodMountHints(args.Spec) if err != nil { return nil, fmt.Errorf("creating pod mount hints: %w", err) } // Set up host mount that will be used for imported fds. hostFilesystem, err := host.NewFilesystem(l.k.VFS()) if err != nil { return nil, fmt.Errorf("failed to create hostfs filesystem: %w", err) } defer hostFilesystem.DecRef(l.k.SupervisorContext()) l.k.SetHostMount(l.k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})) if args.PodInitConfigFD >= 0 { if err := setupSeccheck(args.PodInitConfigFD, args.SinkFDs); err != nil { log.Warningf("unable to configure event session: %v", err) } } l.k.RegisterContainerName(args.ID, l.root.containerName) // We don't care about child signals; some platforms can generate a // tremendous number of useless ones (I'm looking at you, ptrace). if err := sighandling.IgnoreChildStop(); err != nil { return nil, fmt.Errorf("ignore child stop signals failed: %w", err) } if len(args.Conf.TestOnlyAutosaveImagePath) != 0 { enableAutosave(l, args.Conf.TestOnlyAutosaveResume, l.saveFDs) } if err := l.kernelInitExtra(); err != nil { return nil, err } // Create the control server using the provided FD. // // This must be done *after* we have initialized the kernel since the // controller is used to configure the kernel's network stack. 
ctrl, err := newController(args.ControllerFD, l) if err != nil { return nil, fmt.Errorf("creating control server: %w", err) } l.ctrl = ctrl // Only start serving after Loader is set to controller and controller is set // to Loader, because they are both used in the urpc methods. if err := ctrl.srv.StartServing(); err != nil { return nil, fmt.Errorf("starting control server: %w", err) } return l, nil } // createProcessArgs creates args that can be used with kernel.CreateProcess. func createProcessArgs(id string, spec *specs.Spec, conf *config.Config, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec, specutils.TPUProxyIsEnabled(spec, conf)) if err != nil { return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %w", err) } env, err := specutils.ResolveEnvs(spec.Process.Env) if err != nil { return kernel.CreateProcessArgs{}, fmt.Errorf("resolving env: %w", err) } wd := spec.Process.Cwd if wd == "" { wd = "/" } // Create the process arguments. procArgs := kernel.CreateProcessArgs{ Argv: spec.Process.Args, Envv: env, WorkingDirectory: wd, Credentials: creds, Umask: 0022, Limits: ls, MaxSymlinkTraversals: linux.MaxSymlinkTraversals, UTSNamespace: k.RootUTSNamespace(), IPCNamespace: k.RootIPCNamespace(), ContainerID: id, PIDNamespace: pidns, } return procArgs, nil } // Destroy cleans up all resources used by the loader. // // Note that this will block until all open control server connections have // been closed. For that reason, this should NOT be called in a defer, because // a panic in a control server rpc would then hang forever. func (l *Loader) Destroy() { if l.stopSignalForwarding != nil { l.stopSignalForwarding() } l.watchdog.Stop() ctx := l.k.SupervisorContext() for _, m := range l.sharedMounts { m.DecRef(ctx) } // Stop the control server. This will indirectly stop any // long-running control operations that are in flight, e.g. // profiling operations. l.ctrl.stop() // Release all kernel resources. This is only safe after we can no longer // save/restore. l.k.Release() // Release any dangling tcp connections. tcpip.ReleaseDanglingEndpoints() // In the success case, all FDs in l.root will only contain released/closed // FDs whose ownership has been passed over to host FDs and gofer sessions. // Close them here in case of failure. for _, f := range l.root.stdioFDs { _ = f.Close() } for _, f := range l.root.passFDs { _ = f.host.Close() } for _, f := range l.root.goferFDs { _ = f.Close() } for _, f := range l.root.goferFilestoreFDs { _ = f.Close() } if l.root.devGoferFD != nil { _ = l.root.devGoferFD.Close() } l.stopProfiling() // Check all references. 
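// This reports any reference-counting leaks detected at shutdown.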
refs.OnExit() } func createPlatform(conf *config.Config, deviceFile *fd.FD) (platform.Platform, error) { p, err := platform.Lookup(conf.Platform) if err != nil { panic(fmt.Sprintf("invalid platform %s: %s", conf.Platform, err)) } log.Infof("Platform: %s", conf.Platform) return p.New(deviceFile) } func createMemoryFile(appHugePages bool, hostShmemHuge string) (*pgalloc.MemoryFile, error) { const memfileName = "runsc-memory" memfd, err := memutil.CreateMemFD(memfileName, 0) if err != nil { return nil, fmt.Errorf("error creating memfd: %w", err) } memfile := os.NewFile(uintptr(memfd), memfileName) mfopts := pgalloc.MemoryFileOpts{ // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if // there are memory cgroups specified, because at this point we're already // in a mount namespace in which the relevant cgroupfs is not visible. } if appHugePages { switch hostShmemHuge { case "": log.Infof("Disabling application huge pages: host shmem_huge is unknown") case "never", "deny": log.Infof("Disabling application huge pages: host shmem_huge is %q", hostShmemHuge) case "advise": log.Infof("Enabling application huge pages: host shmem_huge is %q", hostShmemHuge) mfopts.ExpectHugepages = true mfopts.AdviseHugepage = true case "always", "within_size": log.Infof("Enabling application huge pages: host shmem_huge is %q", hostShmemHuge) // In these cases, memfds will default to using huge pages, and we have to // explicitly ask for small pages. mfopts.ExpectHugepages = true mfopts.AdviseNoHugepage = true case "force": log.Infof("Enabling application huge pages: host shmem_huge is %q", hostShmemHuge) // The kernel will ignore MADV_NOHUGEPAGE, so don't bother. mfopts.ExpectHugepages = true default: log.Infof("Disabling application huge pages: host shmem_huge is unknown value %q", hostShmemHuge) } } mf, err := pgalloc.NewMemoryFile(memfile, mfopts) if err != nil { _ = memfile.Close() return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %w", err) } return mf, nil } // installSeccompFilters installs sandbox seccomp filters with the host. func (l *Loader) installSeccompFilters() error { if l.PreSeccompCallback != nil { l.PreSeccompCallback() } if l.root.conf.DisableSeccomp { log.Warningf("*** SECCOMP WARNING: syscall filter is DISABLED. Running in less secure mode.") } else { hostnet := l.root.conf.Network == config.NetworkHost opts := filter.Options{ Platform: l.k.Platform.SeccompInfo(), HostNetwork: hostnet, HostNetworkRawSockets: hostnet && l.root.conf.EnableRaw, HostFilesystem: l.root.conf.DirectFS, ProfileEnable: l.root.conf.ProfileEnable, NVProxy: specutils.NVProxyEnabled(l.root.spec, l.root.conf), TPUProxy: specutils.TPUProxyIsEnabled(l.root.spec, l.root.conf), ControllerFD: uint32(l.ctrl.srv.FD()), } if err := filter.Install(opts); err != nil { return fmt.Errorf("installing seccomp filters: %w", err) } } return nil } // Run runs the root container. func (l *Loader) Run() error { err := l.run() l.ctrl.manager.startResultChan <- err if err != nil { // Give the controller some time to send the error to the // runtime. If we return too quickly here the process will exit // and the control connection will be closed before the error // is returned. gtime.Sleep(2 * gtime.Second) return err } return nil } func (l *Loader) run() error { if l.root.conf.Network == config.NetworkHost { // Delay host network configuration to this point because network namespace // is configured after the loader is created and before Run() is called. 
log.Debugf("Configuring host network") s := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack) if err := s.Configure(l.root.conf.EnableRaw); err != nil { return err } } l.mu.Lock() defer l.mu.Unlock() eid := execID{cid: l.sandboxID} ep, ok := l.processes[eid] if !ok { return fmt.Errorf("trying to start deleted container %q", l.sandboxID) } // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. if !l.restore { if l.root.conf.ProfileEnable { pprof.Initialize() } // Finally done with all configuration. Setup filters before user code // is loaded. if err := l.installSeccompFilters(); err != nil { return err } // Create the root container init task. It will begin running // when the kernel is started. var ( tg *kernel.ThreadGroup err error ) tg, ep.tty, err = l.createContainerProcess(&l.root) if err != nil { return err } if seccheck.Global.Enabled(seccheck.PointContainerStart) { evt := pb.Start{ Id: l.sandboxID, Cwd: l.root.spec.Process.Cwd, Args: l.root.spec.Process.Args, Terminal: l.root.spec.Process.Terminal, } fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart) if fields.Local.Contains(seccheck.FieldContainerStartEnv) { evt.Env = l.root.spec.Process.Env } if !fields.Context.Empty() { evt.ContextData = &pb.ContextData{} kernel.LoadSeccheckData(tg.Leader(), fields.Context, evt.ContextData) } _ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error { return c.ContainerStart(context.Background(), fields, &evt) }) } } ep.tg = l.k.GlobalInit() if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok { ep.pidnsPath = ns.Path } // Handle signals by forwarding them to the root container process // (except for panic signal, which should cause a panic). l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { // Panic signal should cause a panic. if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) { panic("Signal-induced panic") } // Otherwise forward to root container. deliveryMode := DeliverToProcess if l.root.spec.Process.Terminal { // Since we are running with a console, we should forward the signal to // the foreground process group so that job control signals like ^C can // be handled properly. deliveryMode = DeliverToForegroundProcessGroup } log.Infof("Received external signal %d, mode: %s", sig, deliveryMode) if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil { log.Warningf("error sending signal %s to container %q: %s", sig, l.sandboxID, err) } }) log.Infof("Process should have started...") l.watchdog.Start() if err := l.k.Start(); err != nil { return err } l.state = started return nil } // createSubcontainer creates a new container inside the sandbox. func (l *Loader) createSubcontainer(cid string, tty *fd.FD) error { l.mu.Lock() defer l.mu.Unlock() eid := execID{cid: cid} if _, ok := l.processes[eid]; ok { return fmt.Errorf("container %q already exists", cid) } l.processes[eid] = &execProcess{hostTTY: tty} return nil } // startSubcontainer starts a child container. It returns the thread group ID of // the newly created process. Used FDs are either closed or released. It's safe // for the caller to close any remaining files upon return. 
func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs, goferFilestoreFDs []*fd.FD, devGoferFD *fd.FD, goferMountConfs []GoferMountConf) error { l.mu.Lock() defer l.mu.Unlock() ep := l.processes[execID{cid: cid}] if ep == nil { return fmt.Errorf("trying to start a deleted container %q", cid) } // Create credentials. We reuse the root user namespace because the // sentry currently supports only 1 mount namespace, which is tied to a // single user namespace. Thus we must run in the same user namespace // to access mounts. creds := getRootCredentials(spec, conf, l.k.RootUserNamespace()) if creds == nil { return fmt.Errorf("getting root credentials") } var pidns *kernel.PIDNamespace if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok { if ns.Path != "" { for _, p := range l.processes { if ns.Path == p.pidnsPath { log.Debugf("Joining PID namespace named %q", ns.Path) pidns = p.tg.PIDNamespace() break } } } if pidns == nil { log.Warningf("PID namespace %q not found, running in new PID namespace", ns.Path) pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace()) } ep.pidnsPath = ns.Path } else { pidns = l.k.RootPIDNamespace() } containerName := l.registerContainerLocked(spec, cid) info := &containerInfo{ cid: cid, containerName: containerName, conf: conf, spec: spec, goferFDs: goferFDs, devGoferFD: devGoferFD, goferFilestoreFDs: goferFilestoreFDs, goferMountConfs: goferMountConfs, nvidiaUVMDevMajor: l.root.nvidiaUVMDevMajor, nvidiaDriverVersion: l.root.nvidiaDriverVersion, } var err error info.procArgs, err = createProcessArgs(cid, spec, conf, creds, l.k, pidns) if err != nil { return fmt.Errorf("creating new process: %w", err) } // Use stdios or TTY depending on the spec configuration. if spec.Process.Terminal { if l := len(stdioFDs); l != 0 { return fmt.Errorf("using TTY, stdios not expected: %d", l) } if ep.hostTTY == nil { return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?") } info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY} ep.hostTTY = nil } else { info.stdioFDs = stdioFDs } var cu cleanup.Cleanup defer cu.Clean() if devGoferFD != nil { cu.Add(func() { // createContainerProcess() will consume devGoferFD and initialize a gofer // connection. This connection is owned by l.k. In case of failure, we want // to clean up this gofer connection so that the gofer process can exit. l.k.RemoveDevGofer(containerName) }) } ep.tg, ep.tty, err = l.createContainerProcess(info) if err != nil { return err } if seccheck.Global.Enabled(seccheck.PointContainerStart) { evt := pb.Start{ Id: cid, Cwd: spec.Process.Cwd, Args: spec.Process.Args, Terminal: spec.Process.Terminal, } fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart) if fields.Local.Contains(seccheck.FieldContainerStartEnv) { evt.Env = spec.Process.Env } if !fields.Context.Empty() { evt.ContextData = &pb.ContextData{} kernel.LoadSeccheckData(ep.tg.Leader(), fields.Context, evt.ContextData) } _ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error { return c.ContainerStart(context.Background(), fields, &evt) }) } l.k.RegisterContainerName(cid, info.containerName) l.k.StartProcess(ep.tg) // No more failures from this point on. cu.Release() return nil } // +checklocks:l.mu func (l *Loader) createContainerProcess(info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileDescription, error) { // Create the FD map, which will set stdin, stdout, and stderr. 
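// Host FDs passed through info.passFDs are installed at their requested guest
// FD numbers alongside the stdio FDs; see createFDTable later in this file.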
ctx := info.procArgs.NewContext(l.k) fdTable, ttyFile, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs, info.passFDs, info.spec.Process.User, info.containerName) if err != nil { return nil, nil, fmt.Errorf("importing fds: %w", err) } // CreateProcess takes a reference on fdTable if successful. We won't need // ours either way. info.procArgs.FDTable = fdTable if info.execFD != nil { if info.procArgs.Filename != "" { return nil, nil, fmt.Errorf("process must either be started from a file or a filename, not both") } file, err := host.NewFD(ctx, l.k.HostMount(), info.execFD.FD(), &host.NewFDOptions{ Readonly: true, Savable: true, VirtualOwner: true, UID: auth.KUID(info.spec.Process.User.UID), GID: auth.KGID(info.spec.Process.User.GID), }) if err != nil { return nil, nil, err } defer file.DecRef(ctx) info.execFD.Release() info.procArgs.File = file } // Gofer FDs must be ordered and the first FD is always the rootfs. if len(info.goferFDs) < 1 { return nil, nil, fmt.Errorf("rootfs gofer FD not found") } l.startGoferMonitor(info) if l.root.cid == l.sandboxID { // Mounts cgroups for all the controllers. if err := l.mountCgroupMounts(info.conf, info.procArgs.Credentials); err != nil { return nil, nil, err } } // We can share l.sharedMounts with containerMounter since l.mu is locked. // Hence, mntr must only be used within this function (while l.mu is locked). mntr := newContainerMounter(info, l.k, l.mountHints, l.sharedMounts, l.productName, l.sandboxID) if err := setupContainerVFS(ctx, info, mntr, &info.procArgs); err != nil { return nil, nil, err } defer func() { for cg := range info.procArgs.InitialCgroups { cg.Dentry.DecRef(ctx) } }() // Add the HOME environment variable if it is not already set. info.procArgs.Envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace, info.procArgs.Credentials.RealKUID, info.procArgs.Envv) if err != nil { return nil, nil, err } // Create and start the new process. tg, _, err := l.k.CreateProcess(info.procArgs) if err != nil { return nil, nil, fmt.Errorf("creating process: %w", err) } // CreateProcess takes a reference on FDTable if successful. info.procArgs.FDTable.DecRef(ctx) // Set the foreground process group on the TTY to the global init process // group, since that is what we are about to start running. if ttyFile != nil { ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) } // Install seccomp filters with the new task if there are any. if info.conf.OCISeccomp { if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp) if err != nil { return nil, nil, fmt.Errorf("building seccomp program: %w", err) } if log.IsLogging(log.Debug) { out, _ := bpf.DecodeProgram(program) log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out) } task := tg.Leader() // NOTE: It seems Flags are ignored by runc so we ignore them too. if err := task.AppendSyscallFilter(program, true); err != nil { return nil, nil, fmt.Errorf("appending seccomp filters: %w", err) } } } else { if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { log.Warningf("Seccomp spec is being ignored") } } return tg, ttyFile, nil } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on // the gofer FD looking for disconnects, and kills the container processes if // the gofer connection disconnects. func (l *Loader) startGoferMonitor(info *containerInfo) { // We need to pick a suitable gofer connection that is expected to be alive // for the entire container lifecycle. 
Only the following can be used: // 1. Rootfs gofer connection // 2. Device gofer connection // // Note that other gofer mounts are allowed to be unmounted and disconnected. goferFD := -1 if info.goferMountConfs[0].ShouldUseLisafs() { goferFD = info.goferFDs[0].FD() } else if info.devGoferFD != nil { goferFD = info.devGoferFD.FD() } if goferFD < 0 { log.Warningf("could not find a suitable gofer FD to monitor") return } go func() { log.Debugf("Monitoring gofer health for container %q", info.cid) events := []unix.PollFd{ { Fd: int32(goferFD), Events: unix.POLLHUP | unix.POLLRDHUP, }, } _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) { // Use ppoll instead of poll because it's already allowed in seccomp. n, err := unix.Ppoll(events, nil, nil) return uintptr(n), 0, err }) if err != nil { panic(fmt.Sprintf("Error monitoring gofer FDs: %s", err)) } l.mu.Lock() defer l.mu.Unlock() // The gofer could have been stopped due to a normal container shutdown. // Check if the container has not stopped yet. if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: info.cid}); tg != nil { log.Infof("Gofer socket disconnected, killing container %q", info.cid) if err := l.signalAllProcesses(info.cid, int32(linux.SIGKILL)); err != nil { log.Warningf("Error killing container %q after gofer stopped: %s", info.cid, err) } } }() } // destroySubcontainer stops a container if it is still running and cleans up // its filesystem. func (l *Loader) destroySubcontainer(cid string) error { l.mu.Lock() defer l.mu.Unlock() tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}) if err != nil { // Container doesn't exist. return err } // The container exists, but has it been started? if tg != nil { if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { return fmt.Errorf("sending SIGKILL to all container processes: %w", err) } // Wait for all processes that belong to the container to exit (including // exec'd processes). for _, t := range l.k.TaskSet().Root.Tasks() { if t.ContainerID() == cid { t.ThreadGroup().WaitExited() } } } // No more failure from this point on. // Remove all container thread groups from the map. for key := range l.processes { if key.cid == cid { delete(l.processes, key) } } // Cleanup the device gofer. l.k.RemoveDevGofer(l.k.ContainerName(cid)) log.Debugf("Container destroyed, cid: %s", cid) return nil } func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Hold the lock for the entire operation to ensure that exec'd process is // added to 'processes' in case it races with destroyContainer(). l.mu.Lock() defer l.mu.Unlock() tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID}) if err != nil { return 0, err } if tg == nil { return 0, fmt.Errorf("container %q not started", args.ContainerID) } // Get the container MountNamespace from the Task. Try to acquire ref may fail // in case it raced with task exit. // task.MountNamespace() does not take a ref, so we must do so ourselves. args.MountNamespace = tg.Leader().MountNamespace() if args.MountNamespace == nil || !args.MountNamespace.TryIncRef() { return 0, fmt.Errorf("container %q has stopped", args.ContainerID) } sctx := l.k.SupervisorContext() root := args.MountNamespace.Root(sctx) defer root.DecRef(sctx) ctx := vfs.WithRoot(sctx, root) defer args.MountNamespace.DecRef(ctx) args.Envv, err = specutils.ResolveEnvs(args.Envv) if err != nil { return 0, fmt.Errorf("resolving env: %w", err) } // Add the HOME environment variable if it is not already set. 
args.Envv, err = user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) if err != nil { return 0, err } args.PIDNamespace = tg.PIDNamespace() args.Limits, err = createLimitSet(l.root.spec, specutils.TPUProxyIsEnabled(l.root.spec, l.root.conf)) if err != nil { return 0, fmt.Errorf("creating limits: %w", err) } // Start the process. proc := control.Proc{Kernel: l.k} newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args) if err != nil { return 0, err } eid := execID{cid: args.ContainerID, pid: tgid} l.processes[eid] = &execProcess{ tg: newTG, tty: ttyFile, } log.Debugf("updated processes: %v", l.processes) return tgid, nil } // waitContainer waits for the init process of a container to exit. func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Don't defer unlock, as doing so would make it impossible for // multiple clients to wait on the same container. key := execID{cid: cid} tg, err := l.threadGroupFromID(key) if err != nil { l.mu.Lock() // Extra handling is needed if the container is restoring. if l.state != restoring { l.mu.Unlock() return err } // Container could be restoring, first check if container exists. if _, err := l.findProcessLocked(key); err != nil { l.mu.Unlock() return err } log.Infof("Waiting for container being restored, CID: %q", cid) l.restoreWaiters.Wait() l.mu.Unlock() log.Infof("Restore is completed, trying to wait for container %q again.", cid) return l.waitContainer(cid, waitStatus) } // If the thread either has already exited or exits during waiting, // consider the container exited. ws := l.wait(tg) *waitStatus = ws // Check for leaks and write coverage report after the root container has // exited. This guarantees that the report is written in cases where the // sandbox is killed by a signal after the ContMgrWait request is completed. if l.root.procArgs.ContainerID == cid { // All sentry-created resources should have been released at this point. _ = coverage.Report() } return nil } func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { if tgid <= 0 { return fmt.Errorf("PID (%d) must be positive", tgid) } // Try to find a process that was exec'd eid := execID{cid: cid, pid: tgid} execTG, err := l.threadGroupFromID(eid) if err == nil { ws := l.wait(execTG) *waitStatus = ws l.mu.Lock() delete(l.processes, eid) log.Debugf("updated processes (removal): %v", l.processes) l.mu.Unlock() return nil } // The caller may be waiting on a process not started directly via exec. // In this case, find the process in the container's PID namespace. initTG, err := l.threadGroupFromID(execID{cid: cid}) if err != nil { return fmt.Errorf("waiting for PID %d: %w", tgid, err) } tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) if tg == nil { return fmt.Errorf("waiting for PID %d: no such process", tgid) } if tg.Leader().ContainerID() != cid { return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) } ws := l.wait(tg) *waitStatus = ws return nil } // wait waits for the process with TGID 'tgid' in a container's PID namespace // to exit. func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 { tg.WaitExited() return uint32(tg.ExitStatus()) } // WaitForStartSignal waits for a start signal from the control server. func (l *Loader) WaitForStartSignal() { <-l.ctrl.manager.startChan } // WaitExit waits for the root container to exit, and returns its exit status. func (l *Loader) WaitExit() linux.WaitStatus { // Wait for container. 
l.k.WaitExited() return l.k.GlobalInit().ExitStatus() } func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, userns *auth.UserNamespace) (*inet.Namespace, error) { // Create an empty network stack because the network namespace may be empty at // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. Host network is configured inside // Run(). switch conf.Network { case config.NetworkHost: // If configured for raw socket support with host network // stack, make sure that we have CAP_NET_RAW the host, // otherwise we can't make raw sockets. if conf.EnableRaw && !specutils.HasCapabilities(capability.CAP_NET_RAW) { return nil, fmt.Errorf("configuring network=host with raw sockets requires CAP_NET_RAW capability") } // No network namespacing support for hostinet yet, hence creator is nil. return inet.NewRootNamespace(hostinet.NewStack(), nil, userns), nil case config.NetworkNone, config.NetworkSandbox: s, err := newEmptySandboxNetworkStack(clock, conf.AllowPacketEndpointWrite) if err != nil { return nil, err } creator := &sandboxNetstackCreator{ clock: clock, allowPacketEndpointWrite: conf.AllowPacketEndpointWrite, } return inet.NewRootNamespace(s, creator, userns), nil default: panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) } } func newEmptySandboxNetworkStack(clock tcpip.Clock, allowPacketEndpointWrite bool) (*netstack.Stack, error) { netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} transProtos := []stack.TransportProtocolFactory{ tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4, icmp.NewProtocol6, } s := netstack.Stack{Stack: stack.New(stack.Options{ NetworkProtocols: netProtos, TransportProtocols: transProtos, Clock: clock, Stats: netstack.Metrics, HandleLocal: true, // Enable raw sockets for users with sufficient // privileges. RawFactory: raw.EndpointFactory{}, AllowPacketEndpointWrite: allowPacketEndpointWrite, DefaultIPTables: netfilter.DefaultLinuxTables, })} // Enable SACK Recovery. { opt := tcpip.TCPSACKEnabled(true) if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) } } // Set default TTLs as required by socket/netstack. { opt := tcpip.DefaultTTLOption(netstack.DefaultTTL) if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil { return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err) } if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil { return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err) } } // Enable Receive Buffer Auto-Tuning. { opt := tcpip.TCPModerateReceiveBufferOption(true) if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) } } return &s, nil } // sandboxNetstackCreator implements kernel.NetworkStackCreator. // // +stateify savable type sandboxNetstackCreator struct { clock tcpip.Clock allowPacketEndpointWrite bool } // CreateStack implements kernel.NetworkStackCreator.CreateStack. func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { s, err := newEmptySandboxNetworkStack(f.clock, f.allowPacketEndpointWrite) if err != nil { return nil, err } // Setup loopback. 
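// The stack hands out NIC IDs sequentially, and the sentry assumes the
// loopback device sits at Linux's conventional interface index
// (linux.LOOPBACK_IFINDEX), so the first NIC created here must match it;
// the check below enforces that assumption.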
n := &Network{Stack: s.Stack} nicID := s.Stack.NextNICID() if nicID != linux.LOOPBACK_IFINDEX { return nil, fmt.Errorf("loopback device should always have index %d, got %d", linux.LOOPBACK_IFINDEX, nicID) } link := DefaultLoopbackLink linkEP := ethernet.New(loopback.New()) opts := stack.NICOptions{ Name: link.Name, DeliverLinkPackets: true, } if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil { return nil, err } return s, nil } // signal sends a signal to one or more processes in a container. If PID is 0, // then the container init process is used. Depending on the SignalDeliveryMode // option, the signal may be sent directly to the indicated process, to all // processes in the container, or to the foreground process group. pid is // relative to the root PID namespace, not the container's. func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error { if pid < 0 { return fmt.Errorf("PID (%d) must be positive", pid) } switch mode { case DeliverToProcess: if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil { return fmt.Errorf("signaling process in container %q PID %d: %w", cid, pid, err) } return nil case DeliverToForegroundProcessGroup: if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil { return fmt.Errorf("signaling foreground process group in container %q PID %d: %w", cid, pid, err) } return nil case DeliverToAllProcesses: if pid != 0 { return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid) } // Check that the container has actually started before signaling it. if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil { return err } if err := l.signalAllProcesses(cid, signo); err != nil { return fmt.Errorf("signaling all processes in container %q: %w", cid, err) } return nil default: panic(fmt.Sprintf("unknown signal delivery mode %v", mode)) } } // signalProcess sends signal to process in the given container. tgid is // relative to the root PID namespace, not the container's. func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error { execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) if err == nil { // Send signal directly to the identified process. return l.k.SendExternalSignalThreadGroup(execTG, &linux.SignalInfo{Signo: signo}) } // The caller may be signaling a process not started directly via exec. // In this case, find the process and check that the process belongs to the // container in question. tg := l.k.RootPIDNamespace().ThreadGroupWithID(tgid) if tg == nil { return fmt.Errorf("no such process with PID %d", tgid) } if tg.Leader().ContainerID() != cid { return fmt.Errorf("process %d belongs to a different container: %q", tgid, tg.Leader().ContainerID()) } return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo}) } // signalForegrondProcessGroup looks up foreground process group from the TTY // for the given "tgid" inside container "cid", and send the signal to it. 
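//
// For example, when the sandbox receives an external SIGINT and the root
// container was started with a terminal, the signal forwarder installed in
// run() delivers it roughly as:
//
//	l.signal(l.sandboxID, 0 /* container init */, int32(linux.SIGINT), DeliverToForegroundProcessGroup)
//
// which routes here so that job-control signals like ^C reach the TTY's
// foreground process group rather than the container's init process.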
func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error { l.mu.Lock() tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid}) if err != nil { l.mu.Unlock() return fmt.Errorf("no thread group found: %w", err) } if tg == nil { l.mu.Unlock() return fmt.Errorf("container %q not started", cid) } tty, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid}) l.mu.Unlock() if err != nil { return fmt.Errorf("no thread group found: %w", err) } if tty == nil { return fmt.Errorf("no TTY attached") } pg := tty.ForegroundProcessGroup() si := &linux.SignalInfo{Signo: signo} if pg == nil { // No foreground process group has been set. Signal the // original thread group. log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid) return l.k.SendExternalSignalThreadGroup(tg, si) } // Send the signal to all processes in the process group. return l.k.SendExternalSignalProcessGroup(pg, si) } // signalAllProcesses that belong to specified container. It's a noop if the // container hasn't started or has exited. func (l *Loader) signalAllProcesses(cid string, signo int32) error { // Pause the kernel to prevent new processes from being created while // the signal is delivered. This prevents process leaks when SIGKILL is // sent to the entire container. l.k.Pause() defer l.k.Unpause() return l.k.SendContainerSignal(cid, &linux.SignalInfo{Signo: signo}) } // threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it // acquires mutex before calling it and fails in case container hasn't started // yet. func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) { l.mu.Lock() defer l.mu.Unlock() tg, err := l.tryThreadGroupFromIDLocked(key) if err != nil { return nil, err } if tg == nil { return nil, fmt.Errorf("container %q not started", key.cid) } return tg, nil } // tryThreadGroupFromIDLocked returns the thread group for the given execution // ID. It may return nil in case the container has not started yet. Returns // error if execution ID is invalid or if the container cannot be found (maybe // it has been deleted). Caller must hold 'mu'. func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) { ep, err := l.findProcessLocked(key) if err != nil { return nil, err } return ep.tg, nil } // ttyFromIDLocked returns the TTY files for the given execution ID. It may // return nil in case the container has not started yet. Returns error if // execution ID is invalid or if the container cannot be found (maybe it has // been deleted). Caller must hold 'mu'. func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileDescription, error) { ep, err := l.findProcessLocked(key) if err != nil { return nil, err } return ep.tty, nil } func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD, passFDs []fdMapping, user specs.User, containerName string) (*kernel.FDTable, *host.TTYFileDescription, error) { if len(stdioFDs) != 3 { return nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } fdMap := map[int]*fd.FD{ 0: stdioFDs[0], 1: stdioFDs[1], 2: stdioFDs[2], } // Create the entries for the host files that were passed to our app. 
for _, customFD := range passFDs { if customFD.guest < 0 { return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater") } fdMap[customFD.guest] = customFD.host } k := kernel.KernelFromContext(ctx) fdTable := k.NewFDTable() ttyFile, err := fdimport.Import(ctx, fdTable, console, auth.KUID(user.UID), auth.KGID(user.GID), fdMap, containerName) if err != nil { fdTable.DecRef(ctx) return nil, nil, err } return fdTable, ttyFile, nil } // portForward implements initiating a portForward connection in the sandbox. portForwardProxies // represent a two connections each copying to each other (read ends to write ends) in goroutines. // The proxies are stored and can be cleaned up, or clean up after themselves if the connection // is broken. func (l *Loader) portForward(opts *PortForwardOpts) error { // Validate that we have a stream FD to write to. If this happens then // it means there is a misbehaved urpc client or a bug has occurred. if len(opts.Files) != 1 { return fmt.Errorf("stream FD is required for port forward") } l.mu.Lock() defer l.mu.Unlock() cid := opts.ContainerID tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}) if err != nil { return fmt.Errorf("failed to get threadgroup from %q: %w", cid, err) } if tg == nil { return fmt.Errorf("container %q not started", cid) } // Import the fd for the UDS. ctx := l.k.SupervisorContext() fd, err := l.importFD(ctx, opts.Files[0]) if err != nil { return fmt.Errorf("importing stream fd: %w", err) } cu := cleanup.Make(func() { fd.DecRef(ctx) }) defer cu.Clean() fdConn := pf.NewFileDescriptionConn(fd) // Create a proxy to forward data between the fdConn and the sandboxed application. pair := pf.ProxyPair{To: fdConn} switch l.root.conf.Network { case config.NetworkSandbox: stack := l.k.RootNetworkNamespace().Stack().(*netstack.Stack).Stack nsConn, err := pf.NewNetstackConn(stack, opts.Port) if err != nil { return fmt.Errorf("creating netstack port forward connection: %w", err) } pair.From = nsConn case config.NetworkHost: hConn, err := pf.NewHostInetConn(opts.Port) if err != nil { return fmt.Errorf("creating hostinet port forward connection: %w", err) } pair.From = hConn default: return fmt.Errorf("unsupported network type %q for container %q", l.root.conf.Network, cid) } cu.Release() proxy := pf.NewProxy(pair, opts.ContainerID) // Add to the list of port forward connections and remove when the // connection closes. l.portForwardProxies = append(l.portForwardProxies, proxy) proxy.AddCleanup(func() { l.mu.Lock() defer l.mu.Unlock() for i := range l.portForwardProxies { if l.portForwardProxies[i] == proxy { l.portForwardProxies = append(l.portForwardProxies[:i], l.portForwardProxies[i+1:]...) break } } }) // Start forwarding on the connection. proxy.Start(ctx) return nil } // importFD generically imports a host file descriptor without adding it to any // fd table. func (l *Loader) importFD(ctx context.Context, f *os.File) (*vfs.FileDescription, error) { hostFD, err := fd.NewFromFile(f) if err != nil { return nil, err } defer hostFD.Close() fd, err := host.NewFD(ctx, l.k.HostMount(), hostFD.FD(), &host.NewFDOptions{ Savable: false, // We disconnect and close on save. IsTTY: false, VirtualOwner: false, // FD not visible to the sandboxed app so user can't be changed. }) if err != nil { return nil, err } hostFD.Release() return fd, nil } func (l *Loader) containerCount() int { l.mu.Lock() defer l.mu.Unlock() containers := 0 for id := range l.processes { if id.pid == 0 { // pid==0 represents the init process of a container. 
There is // only one of such process per container. containers++ } } return containers } func (l *Loader) pidsCount(cid string) (int, error) { l.mu.Lock() defer l.mu.Unlock() if _, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}); err != nil { // Container doesn't exist. return 0, err } return l.k.TaskSet().Root.NumTasksPerContainer(cid), nil } func (l *Loader) networkStats() ([]*NetworkInterface, error) { var stats []*NetworkInterface stack := l.k.RootNetworkNamespace().Stack() for _, i := range stack.Interfaces() { var stat inet.StatDev if err := stack.Statistics(&stat, i.Name); err != nil { return nil, err } stats = append(stats, &NetworkInterface{ Name: i.Name, RxBytes: stat[0], RxPackets: stat[1], RxErrors: stat[2], RxDropped: stat[3], TxBytes: stat[8], TxPackets: stat[9], TxErrors: stat[10], TxDropped: stat[11], }) } return stats, nil } func (l *Loader) findProcessLocked(key execID) (*execProcess, error) { ep := l.processes[key] if ep == nil { return nil, fmt.Errorf("container %q not found", key.cid) } return ep, nil } func (l *Loader) registerContainer(spec *specs.Spec, cid string) string { l.mu.Lock() defer l.mu.Unlock() return l.registerContainerLocked(spec, cid) } func (l *Loader) registerContainerLocked(spec *specs.Spec, cid string) string { containerName := specutils.ContainerName(spec) if len(containerName) == 0 { // If no name was provided, require containers to be restored in the same order // they were created. containerName = "__no_name_" + strconv.Itoa(len(l.containerIDs)) } l.containerIDs[containerName] = cid return containerName } func (l *Loader) containerRuntimeState(cid string) ContainerRuntimeState { l.mu.Lock() defer l.mu.Unlock() exec, ok := l.processes[execID{cid: cid}] if !ok { // Can't distinguish between invalid CID and stopped container, assume that // CID is valid. return RuntimeStateStopped } if exec.tg == nil { // Container has no thread group assigned, so it has started yet. return RuntimeStateCreating } if exec.tg.Leader().ExitState() == kernel.TaskExitNone { // Init process is still running. return RuntimeStateRunning } // Init process has stopped, but no one has called wait on it yet. return RuntimeStateStopped } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/mount_hints.go000066400000000000000000000210251465435605700236360ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "fmt" "path/filepath" "strings" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) const ( // MountPrefix is the annotation prefix for mount hints applied at the pod level. MountPrefix = "dev.gvisor.spec.mount." // RootfsPrefix is the annotation prefix for rootfs hint applied at the container level. RootfsPrefix = "dev.gvisor.spec.rootfs." ) // ShareType indicates who can access/mutate the volume contents. 
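// It is parsed from the "share" field of a mount annotation (see
// MountHint.setShare); hints whose share type is still invalid after parsing
// are dropped during validation in NewPodMountHints.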
type ShareType int const ( invalid ShareType = iota // container shareType indicates that the mount is used by a single // container. There are no external observers. container // pod shareType indicates that the mount is used by more than one container // inside the pod. There are no external observers. pod // shared shareType indicates that the mount can also be shared with a process // outside the pod, e.g. NFS. shared ) func (s ShareType) String() string { switch s { case invalid: return "invalid" case container: return "container" case pod: return "pod" case shared: return "shared" default: return fmt.Sprintf("invalid share value %d", s) } } // PodMountHints contains a collection of mountHints for the pod. type PodMountHints struct { Mounts map[string]*MountHint `json:"mounts"` } // NewPodMountHints instantiates PodMountHints using spec. func NewPodMountHints(spec *specs.Spec) (*PodMountHints, error) { mnts := make(map[string]*MountHint) for k, v := range spec.Annotations { // Look for 'dev.gvisor.spec.mount' annotations and parse them. if strings.HasPrefix(k, MountPrefix) { // Remove the prefix and split the rest. parts := strings.Split(k[len(MountPrefix):], ".") if len(parts) != 2 { return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v) } name := parts[0] if len(name) == 0 { return nil, fmt.Errorf("invalid mount name: %s", name) } mnt := mnts[name] if mnt == nil { mnt = &MountHint{Name: name} mnts[name] = mnt } if err := mnt.setField(parts[1], v); err != nil { log.Warningf("ignoring invalid mount annotation (name = %q, key = %q, value = %q): %v", name, parts[1], v, err) } } } // Validate all the parsed hints. for name, m := range mnts { log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.Mount.Source, m.Mount.Type, m.Share) if m.Share == invalid || len(m.Mount.Source) == 0 || len(m.Mount.Type) == 0 { log.Warningf("ignoring mount annotations for %q because of missing required field(s)", name) delete(mnts, name) continue } // Check for duplicate mount sources. for name2, m2 := range mnts { if name != name2 && m.Mount.Source == m2.Mount.Source { return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.Name, m2.Name, m.Mount.Source) } } } return &PodMountHints{Mounts: mnts}, nil } // MountHint represents extra information about mounts that are provided via // annotations. They can override mount type, and provide sharing information // so that mounts can be correctly shared inside the pod. // It is part of the sandbox.Sandbox struct, so it must be serializable. 
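//
// For illustration, a pod-level annotation set that NewPodMountHints would
// parse into a single tmpfs hint shared across the pod could look like the
// following (the volume name "shared-tmp" and all values are hypothetical
// examples, not required values):
//
//	"dev.gvisor.spec.mount.shared-tmp.source":  "/tmp/shared"
//	"dev.gvisor.spec.mount.shared-tmp.type":    "tmpfs"
//	"dev.gvisor.spec.mount.shared-tmp.share":   "pod"
//	"dev.gvisor.spec.mount.shared-tmp.options": "rw,noexec"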
type MountHint struct { Name string `json:"name"` Share ShareType `json:"share"` Mount specs.Mount `json:"mount"` } func (m *MountHint) setField(key, val string) error { switch key { case "source": if len(val) == 0 { return fmt.Errorf("source cannot be empty") } m.Mount.Source = val case "type": return m.setType(val) case "share": return m.setShare(val) case "options": m.Mount.Options = specutils.FilterMountOptions(strings.Split(val, ",")) default: return fmt.Errorf("invalid mount annotation: %s=%s", key, val) } return nil } func (m *MountHint) setType(val string) error { switch val { case tmpfs.Name, Bind: m.Mount.Type = val default: return fmt.Errorf("invalid type %q", val) } return nil } func (m *MountHint) setShare(val string) error { switch val { case container.String(): m.Share = container case pod.String(): m.Share = pod case shared.String(): m.Share = shared default: return fmt.Errorf("invalid share value %q", val) } return nil } // ShouldShareMount returns true if this mount should be configured as a shared // mount that is shared among multiple containers in a pod. func (m *MountHint) ShouldShareMount() bool { // Only support tmpfs for now. Bind mounts require a common gofer to mount // all shared volumes. return m.Mount.Type == tmpfs.Name && // A shared mount should be configured for share=container too so: // 1. Restarting the container does not lose the tmpfs data. // 2. Repeated mounts in the container reuse the same tmpfs instance. (m.Share == container || m.Share == pod) } // checkCompatible verifies that shared mount is compatible with master. // Master options must be the same or less restrictive than the container mount, // e.g. master can be 'rw' while container mounts as 'ro'. func (m *MountHint) checkCompatible(replica *specs.Mount) error { masterOpts := ParseMountOptions(m.Mount.Options) replicaOpts := ParseMountOptions(replica.Options) if masterOpts.ReadOnly && !replicaOpts.ReadOnly { return fmt.Errorf("cannot mount read-write shared mount because master is read-only, mount: %+v", replica) } if masterOpts.Flags.NoExec && !replicaOpts.Flags.NoExec { return fmt.Errorf("cannot mount exec enabled shared mount because master is noexec, mount: %+v", replica) } if masterOpts.Flags.NoATime && !replicaOpts.Flags.NoATime { return fmt.Errorf("cannot mount atime enabled shared mount because master is noatime, mount: %+v", replica) } return nil } func (m *MountHint) fileAccessType() config.FileAccessType { if m.Share == shared { return config.FileAccessShared } if m.ShouldShareMount() { return config.FileAccessExclusive } if m.Share == container { return config.FileAccessExclusive } return config.FileAccessShared } // FindMount finds the MountHint that applies to this mount. func (p *PodMountHints) FindMount(mountSrc string) *MountHint { for _, m := range p.Mounts { if m.Mount.Source == mountSrc { return m } } return nil } // RootfsHint represents extra information about rootfs that are provided via // annotations. They can provide mount source, mount type and overlay config. 
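//
// For illustration, container-level annotations that NewRootfsHint would
// accept might look like the following (the image path is a hypothetical
// example, and "memory" is assumed to be a valid config.OverlayMedium value):
//
//	"dev.gvisor.spec.rootfs.source":  "/images/rootfs.img"
//	"dev.gvisor.spec.rootfs.type":    "erofs"
//	"dev.gvisor.spec.rootfs.overlay": "memory"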
type RootfsHint struct { Mount specs.Mount Overlay config.OverlayMedium } func (r *RootfsHint) setSource(val string) error { if !filepath.IsAbs(val) { return fmt.Errorf("source should be an absolute path, got %q", val) } r.Mount.Source = val return nil } func (r *RootfsHint) setType(val string) error { switch val { case erofs.Name, Bind: r.Mount.Type = val default: return fmt.Errorf("invalid type %q", val) } return nil } func (r *RootfsHint) setField(key, val string) error { switch key { case "source": return r.setSource(val) case "type": return r.setType(val) case "overlay": return r.Overlay.Set(val) default: return fmt.Errorf("invalid rootfs annotation: %s=%s", key, val) } } // NewRootfsHint instantiates RootfsHint using spec. func NewRootfsHint(spec *specs.Spec) (*RootfsHint, error) { var hint *RootfsHint for k, v := range spec.Annotations { // Look for 'dev.gvisor.spec.rootfs' annotations and parse them. if !strings.HasPrefix(k, RootfsPrefix) { continue } // Remove the prefix. k = k[len(RootfsPrefix):] if hint == nil { hint = &RootfsHint{} } if err := hint.setField(k, v); err != nil { return nil, fmt.Errorf("invalid rootfs annotation (key = %q, value = %q): %v", k, v, err) } } // Validate the parsed hint. if hint != nil { log.Infof("Rootfs annotations found, source: %q, type: %q, overlay: %q", hint.Mount.Source, hint.Mount.Type, hint.Overlay) if len(hint.Mount.Source) == 0 || len(hint.Mount.Type) == 0 { return nil, fmt.Errorf("rootfs annotations missing required field(s): %+v", hint) } } return hint, nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/network.go000066400000000000000000000402661465435605700227700ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "fmt" "io" "net" "os" "runtime" "strings" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/hostos" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" "gvisor.dev/gvisor/pkg/tcpip/link/fdbased" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" "gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/link/xdp" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/config" ) var ( // DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and // "::1/8" on "lo" interface. 
DefaultLoopbackLink = LoopbackLink{ Name: "lo", Addresses: []IPWithPrefix{ {Address: net.IP("\x7f\x00\x00\x01"), PrefixLen: 8}, {Address: net.IPv6loopback, PrefixLen: 128}, }, Routes: []Route{ { Destination: net.IPNet{ IP: net.IPv4(0x7f, 0, 0, 0), Mask: net.IPv4Mask(0xff, 0, 0, 0), }, }, { Destination: net.IPNet{ IP: net.IPv6loopback, Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)), }, }, }, } ) // Network exposes methods that can be used to configure a network stack. type Network struct { Stack *stack.Stack Kernel *kernel.Kernel } // Route represents a route in the network stack. type Route struct { Destination net.IPNet Gateway net.IP } // DefaultRoute represents a catch all route to the default gateway. type DefaultRoute struct { Route Route Name string } type Neighbor struct { IP net.IP HardwareAddr net.HardwareAddr } // FDBasedLink configures an fd-based link. type FDBasedLink struct { Name string InterfaceIndex int MTU int Addresses []IPWithPrefix Routes []Route GSOMaxSize uint32 GVisorGSOEnabled bool GVisorGRO bool TXChecksumOffload bool RXChecksumOffload bool LinkAddress net.HardwareAddr QDisc config.QueueingDiscipline Neighbors []Neighbor // NumChannels controls how many underlying FDs are to be used to // create this endpoint. NumChannels int // ProcessorsPerChannel controls how many goroutines are used to handle // packets on each channel. ProcessorsPerChannel int } // BindOpt indicates whether the sentry or runsc process is responsible for // binding the AF_XDP socket. type BindOpt int const ( // BindSentry indicates the sentry process must call bind. BindSentry BindOpt = iota // BindRunsc indicates the runsc process must call bind. BindRunsc ) // XDPLink configures an XDP link. type XDPLink struct { Name string InterfaceIndex int MTU int Addresses []IPWithPrefix Routes []Route TXChecksumOffload bool RXChecksumOffload bool LinkAddress net.HardwareAddr QDisc config.QueueingDiscipline Neighbors []Neighbor GVisorGRO bool Bind BindOpt // NumChannels controls how many underlying FDs are to be used to // create this endpoint. NumChannels int } // LoopbackLink configures a loopback link. type LoopbackLink struct { Name string Addresses []IPWithPrefix Routes []Route GVisorGRO bool } // CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes. type CreateLinksAndRoutesArgs struct { // FilePayload contains the fds associated with the FDBasedLinks. The // number of fd's should match the sum of the NumChannels field of the // FDBasedLink entries below. urpc.FilePayload LoopbackLinks []LoopbackLink FDBasedLinks []FDBasedLink XDPLinks []XDPLink Defaultv4Gateway DefaultRoute Defaultv6Gateway DefaultRoute // PCAP indicates that FilePayload also contains a PCAP log file. PCAP bool // LogPackets indicates that packets should be logged. LogPackets bool // NATBlob indicates whether FilePayload also contains an iptables NAT // ruleset. NATBlob bool // DisconnectOk indicates that link endpoints should have the capability // CapabilityDisconnectOk set. DisconnectOk bool } // IPWithPrefix is an address with its subnet prefix length. type IPWithPrefix struct { // Address is a network address. Address net.IP // PrefixLen is the subnet prefix length. PrefixLen int } func (ip IPWithPrefix) String() string { return fmt.Sprintf("%s/%d", ip.Address, ip.PrefixLen) } // Empty returns true if route hasn't been set. 
func (r *Route) Empty() bool { return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil } func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) { subnet, err := tcpip.NewSubnet(ipToAddress(r.Destination.IP), ipMaskToAddressMask(r.Destination.Mask)) if err != nil { return tcpip.Route{}, err } return tcpip.Route{ Destination: subnet, Gateway: ipToAddress(r.Gateway), NIC: id, }, nil } // CreateLinksAndRoutes creates links and routes in a network stack. It should // only be called once. func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error { if len(args.FDBasedLinks) > 0 && len(args.XDPLinks) > 0 { return fmt.Errorf("received both fdbased and XDP links, but only one can be used at a time") } wantFDs := 0 for _, l := range args.FDBasedLinks { wantFDs += l.NumChannels } for _, link := range args.XDPLinks { // We have to keep several FDs alive when the sentry is // responsible for binding, but when runsc binds we only expect // the AF_XDP socket itself. switch v := link.Bind; v { case BindSentry: wantFDs += 4 case BindRunsc: wantFDs++ default: return fmt.Errorf("unknown bind value: %d", v) } } if args.PCAP { wantFDs++ } if args.NATBlob { wantFDs++ } if got := len(args.FilePayload.Files); got != wantFDs { return fmt.Errorf("args.FilePayload.Files has %d FDs but we need %d entries based on FDBasedLinks, XDPLinks, and PCAP", got, wantFDs) } nicids := make(map[string]tcpip.NICID) // Collect routes from all links. var routes []tcpip.Route // Loopback normally appear before other interfaces. for _, link := range args.LoopbackLinks { nicID := n.Stack.NextNICID() nicids[link.Name] = nicID linkEP := ethernet.New(loopback.New()) log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) opts := stack.NICOptions{ Name: link.Name, DeliverLinkPackets: true, } if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil { return err } // Collect the routes from this link. for _, r := range link.Routes { route, err := r.toTcpipRoute(nicID) if err != nil { return err } routes = append(routes, route) } } // Setup fdbased or XDP links. fdOffset := 0 if len(args.FDBasedLinks) > 0 { // Choose a dispatch mode. dispatchMode := fdbased.RecvMMsg version, err := hostos.KernelVersion() if err != nil { return err } if version.AtLeast(5, 6) { // TODO(b/333120887): Switch back to using the packet mmap dispatcher when // we have the performance data to justify it. // dispatchMode = fdbased.PacketMMap // log.Infof("Host kernel version >= 5.6, using to packet mmap to dispatch") } else { log.Infof("Host kernel version < 5.6, using to RecvMMsg to dispatch") } for _, link := range args.FDBasedLinks { nicID := n.Stack.NextNICID() nicids[link.Name] = nicID FDs := make([]int, 0, link.NumChannels) for j := 0; j < link.NumChannels; j++ { // Copy the underlying FD. 
oldFD := args.FilePayload.Files[fdOffset].Fd() newFD, err := unix.Dup(int(oldFD)) if err != nil { return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) } FDs = append(FDs, newFD) fdOffset++ } mac := tcpip.LinkAddress(link.LinkAddress) log.Infof("gso max size is: %d", link.GSOMaxSize) linkEP, err := fdbased.New(&fdbased.Options{ FDs: FDs, MTU: uint32(link.MTU), EthernetHeader: mac != "", Address: mac, PacketDispatchMode: dispatchMode, GSOMaxSize: link.GSOMaxSize, GVisorGSOEnabled: link.GVisorGSOEnabled, TXChecksumOffload: link.TXChecksumOffload, RXChecksumOffload: link.RXChecksumOffload, GRO: link.GVisorGRO, ProcessorsPerChannel: link.ProcessorsPerChannel, DisconnectOk: args.DisconnectOk, }) if err != nil { return err } // Setup packet logging if requested. if args.PCAP { newFD, err := unix.Dup(int(args.FilePayload.Files[fdOffset].Fd())) if err != nil { return fmt.Errorf("failed to dup pcap FD: %v", err) } const packetTruncateSize = 4096 linkEP, err = sniffer.NewWithWriter(linkEP, os.NewFile(uintptr(newFD), "pcap-file"), packetTruncateSize) if err != nil { return fmt.Errorf("failed to create PCAP logger: %v", err) } fdOffset++ } else if args.LogPackets { linkEP = sniffer.New(linkEP) } var qDisc stack.QueueingDiscipline switch link.QDisc { case config.QDiscNone: case config.QDiscFIFO: log.Infof("Enabling FIFO QDisc on %q", link.Name) qDisc = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) } log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) opts := stack.NICOptions{ Name: link.Name, QDisc: qDisc, DeliverLinkPackets: true, } if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil { return err } // Collect the routes from this link. for _, r := range link.Routes { route, err := r.toTcpipRoute(nicID) if err != nil { return err } routes = append(routes, route) } for _, neigh := range link.Neighbors { proto, tcpipAddr := ipToAddressAndProto(neigh.IP) n.Stack.AddStaticNeighbor(nicID, proto, tcpipAddr, tcpip.LinkAddress(neigh.HardwareAddr)) } } } else if len(args.XDPLinks) > 0 { if nlinks := len(args.XDPLinks); nlinks > 1 { return fmt.Errorf("XDP only supports one link device, but got %d", nlinks) } link := args.XDPLinks[0] nicID := n.Stack.NextNICID() nicids[link.Name] = nicID // Get the AF_XDP socket. oldFD := args.FilePayload.Files[fdOffset].Fd() fd, err := unix.Dup(int(oldFD)) if err != nil { return fmt.Errorf("failed to dup AF_XDP fd %v: %v", oldFD, err) } fdOffset++ // When the sentry is responsible for binding, the runsc // process sends several other FDs in order to keep them open // and alive. These are for BPF programs and maps that, if // closed, will break the dispatcher. if link.Bind == BindSentry { for _, fdName := range []string{"program-fd", "sockmap-fd", "link-fd"} { oldFD := args.FilePayload.Files[fdOffset].Fd() if _, err := unix.Dup(int(oldFD)); err != nil { return fmt.Errorf("failed to dup %s with FD %d: %v", fdName, oldFD, err) } fdOffset++ } } // Setup packet logging if requested. 
mac := tcpip.LinkAddress(link.LinkAddress) linkEP, err := xdp.New(&xdp.Options{ FD: fd, Address: mac, TXChecksumOffload: link.TXChecksumOffload, RXChecksumOffload: link.RXChecksumOffload, InterfaceIndex: link.InterfaceIndex, Bind: link.Bind == BindSentry, GRO: link.GVisorGRO, DisconnectOk: args.DisconnectOk, }) if err != nil { return err } if args.PCAP { newFD, err := unix.Dup(int(args.FilePayload.Files[fdOffset].Fd())) if err != nil { return fmt.Errorf("failed to dup pcap FD: %v", err) } const packetTruncateSize = 4096 linkEP, err = sniffer.NewWithWriter(linkEP, os.NewFile(uintptr(newFD), "pcap-file"), packetTruncateSize) if err != nil { return fmt.Errorf("failed to create PCAP logger: %v", err) } fdOffset++ } else if args.LogPackets { linkEP = sniffer.New(linkEP) } var qDisc stack.QueueingDiscipline switch link.QDisc { case config.QDiscNone: case config.QDiscFIFO: log.Infof("Enabling FIFO QDisc on %q", link.Name) qDisc = fifo.New(linkEP, runtime.GOMAXPROCS(0), 1000) } log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels) opts := stack.NICOptions{ Name: link.Name, QDisc: qDisc, DeliverLinkPackets: true, } if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil { return err } // Collect the routes from this link. for _, r := range link.Routes { route, err := r.toTcpipRoute(nicID) if err != nil { return err } routes = append(routes, route) } for _, neigh := range link.Neighbors { proto, tcpipAddr := ipToAddressAndProto(neigh.IP) n.Stack.AddStaticNeighbor(nicID, proto, tcpipAddr, tcpip.LinkAddress(neigh.HardwareAddr)) } } if !args.Defaultv4Gateway.Route.Empty() { nicID, ok := nicids[args.Defaultv4Gateway.Name] if !ok { return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name) } route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID) if err != nil { return err } routes = append(routes, route) } if !args.Defaultv6Gateway.Route.Empty() { nicID, ok := nicids[args.Defaultv6Gateway.Name] if !ok { return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name) } route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID) if err != nil { return err } routes = append(routes, route) } log.Infof("Setting routes %+v", routes) n.Stack.SetRouteTable(routes) // Set NAT table rules if necessary. if args.NATBlob { log.Infof("Replacing NAT table") iptReplaceBlob, err := io.ReadAll(args.FilePayload.Files[fdOffset]) if err != nil { return fmt.Errorf("failed to read iptables blob: %v", err) } fdOffset++ if err := netfilter.SetEntries(n.Kernel.RootUserNamespace(), n.Stack, iptReplaceBlob, false); err != nil { return fmt.Errorf("failed to SetEntries: %v", err) } } return nil } // createNICWithAddrs creates a NIC in the network stack and adds the given // addresses. 
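//
// A typical call mirrors the loopback setup earlier in this file, for example:
//
//	nicID := n.Stack.NextNICID()
//	linkEP := ethernet.New(loopback.New())
//	opts := stack.NICOptions{Name: "lo", DeliverLinkPackets: true}
//	if err := n.createNICWithAddrs(nicID, linkEP, opts, DefaultLoopbackLink.Addresses); err != nil {
//		// handle error
//	}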
func (n *Network) createNICWithAddrs(id tcpip.NICID, ep stack.LinkEndpoint, opts stack.NICOptions, addrs []IPWithPrefix) error { if err := n.Stack.CreateNICWithOptions(id, ep, opts); err != nil { return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err) } for _, addr := range addrs { proto, tcpipAddr := ipToAddressAndProto(addr.Address) protocolAddr := tcpip.ProtocolAddress{ Protocol: proto, AddressWithPrefix: tcpip.AddressWithPrefix{ Address: tcpipAddr, PrefixLen: addr.PrefixLen, }, } if err := n.Stack.AddProtocolAddress(id, protocolAddr, stack.AddressProperties{}); err != nil { return fmt.Errorf("AddProtocolAddress(%d, %+v, {}) failed: %s", id, protocolAddr, err) } } return nil } // ipToAddressAndProto converts IP to tcpip.Address and a protocol number. // // Note: don't use 'len(ip)' to determine IP version because length is always 16. func ipToAddressAndProto(ip net.IP) (tcpip.NetworkProtocolNumber, tcpip.Address) { if i4 := ip.To4(); i4 != nil { return ipv4.ProtocolNumber, tcpip.AddrFromSlice(i4) } return ipv6.ProtocolNumber, tcpip.AddrFromSlice(ip) } // ipToAddress converts IP to tcpip.Address, ignoring the protocol. func ipToAddress(ip net.IP) tcpip.Address { _, addr := ipToAddressAndProto(ip) return addr } // ipMaskToAddressMask converts IPMask to tcpip.AddressMask, ignoring the // protocol. func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask { addr := ipToAddress(net.IP(ipMask)) return tcpip.MaskFromBytes(addr.AsSlice()) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/platforms/000077500000000000000000000000001465435605700227475ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/platforms/platforms.go000066400000000000000000000016431465435605700253110ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux && !debug // +build linux,!debug // Package platforms imports all available platform packages. package platforms import ( // Import platforms that runsc might use. _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" _ "gvisor.dev/gvisor/pkg/sentry/platform/systrap" ) golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/platforms/platforms_darwin.go000066400000000000000000000013121465435605700266460ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
//go:build darwin // +build darwin package platforms // This file makes the platforms package buildable on Darwin. golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/platforms/platforms_debug.go000066400000000000000000000017311465435605700264550ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux && debug // +build linux,debug package platforms import ( // Import platforms that runsc might use. // The KVM platform is not included because it's incompatible with debug // builds. Unoptimized functions grow the stack too much and fail the nosplit // check. _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" _ "gvisor.dev/gvisor/pkg/sentry/platform/systrap" ) golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/portforward/000077500000000000000000000000001465435605700233115ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/portforward/portforward.go000066400000000000000000000114231465435605700262120ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package portforward holds the infrastructure to support the port forward command. package portforward import ( "fmt" "sync" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" ) // proxyConn is a port forwarding connection. It is used to manage the // lifecycle of the connection and clean it up if necessary. type proxyConn interface { // Name returns a name for this proxyConn. Name() string // Write performs a write on this connection. Write should block on ErrWouldBlock, but it must // listen to 'cancel' to interrupt blocked calls. Write(ctx context.Context, buf []byte, cancel <-chan struct{}) (int, error) // Read performs a read on this connection. Read should block on ErrWouldBlock by the underlying // connection, but it must listen to `cancel` to interrupt blocked calls. Read(ctx context.Context, buf []byte, cancel <-chan struct{}) (int, error) // Close cleans up all resources owned by this proxyConn. Close(ctx context.Context) } // Proxy controls copying data between two proxyConnections. Proxy takes ownership over the two // connections and is responsible for cleaning up their resources (i.e. calling their Close method). // Proxy(s) all run internal to the sandbox on the supervisor context. type Proxy struct { // containerID for this proxy. cid string // "to" and "from" are the two connections on which this Proxy copies. 
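// Start launches one copying goroutine per direction; whichever direction
// fails first triggers Close, which tears down both connections exactly once.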
to proxyConn from proxyConn once sync.Once cancelFrom chan struct{} cancelTo chan struct{} wg sync.WaitGroup cu cleanup.Cleanup } // ProxyPair wraps the to/from arguments for NewProxy so that the user explicitly labels to/from. type ProxyPair struct { To proxyConn From proxyConn } // NewProxy returns a new Proxy. func NewProxy(pair ProxyPair, cid string) *Proxy { return &Proxy{ to: pair.To, from: pair.From, cid: cid, cancelTo: make(chan struct{}, 1), cancelFrom: make(chan struct{}, 1), } } // readFrom reads from the application's vfs.FileDescription and writes to the shim. func (pf *Proxy) readFrom(ctx context.Context) error { buf := make([]byte, 16384 /* 16kb buffer size */) for ctx.Err() == nil { if err := doCopy(ctx, pf.to, pf.from, buf, pf.cancelFrom); err != nil { return fmt.Errorf("readFrom failed on container %q: %v", pf.cid, err) } } return ctx.Err() } // writeTo writes to the application's vfs.FileDescription and reads from the shim. func (pf *Proxy) readTo(ctx context.Context) error { buf := make([]byte, 16384 /* 16kb buffer size */) for ctx.Err() == nil { if err := doCopy(ctx, pf.from, pf.to, buf, pf.cancelTo); err != nil { return fmt.Errorf("readTo failed on container %q: %v", pf.cid, err) } } return ctx.Err() } // doCopy is the shared copy code for each of 'readFrom' and 'readTo'. func doCopy(ctx context.Context, dst, src proxyConn, buf []byte, cancel chan struct{}) error { n, err := src.Read(ctx, buf, cancel) if err != nil { return fmt.Errorf("failed to read from %q: err %v", src.Name(), err) } _, err = dst.Write(ctx, buf[0:n], cancel) if err != nil { return fmt.Errorf("failed to write to %q: err %v", src.Name(), err) } return nil } // Start starts the proxy. On error on either end, the proxy cleans itself up by stopping both // connections. func (pf *Proxy) Start(ctx context.Context) { pf.cu.Add(func() { pf.to.Close(ctx) pf.from.Close(ctx) }) pf.wg.Add(1) go func() { if err := pf.readFrom(ctx); err != nil { ctx.Warningf("Shutting down copy from %q to %q on container %s: %v", pf.from.Name(), pf.to.Name(), pf.cid, err) } pf.wg.Done() pf.Close() }() pf.wg.Add(1) go func() { if err := pf.readTo(ctx); err != nil { ctx.Warningf("Shutting down copy from %q to %q on container %s: %v", pf.to.Name(), pf.from.Name(), pf.cid, err) } pf.wg.Done() pf.Close() }() } // AddCleanup adds a cleanup to this Proxy's cleanup. func (pf *Proxy) AddCleanup(cu func()) { pf.cu.Add(cu) } // Close cleans up the resources in this Proxy and blocks until all resources are cleaned up // and their goroutines exit. func (pf *Proxy) Close() { pf.once.Do(func() { pf.cu.Clean() pf.cancelFrom <- struct{}{} defer close(pf.cancelFrom) pf.cancelTo <- struct{}{} defer close(pf.cancelTo) }) pf.wg.Wait() } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/portforward/portforward_fd_rw.go000066400000000000000000000062361465435605700274010ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package portforward import ( "io" "sync" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // fileDescriptionConn type fileDescriptionConn struct { // file is the file to read and write from. file *vfs.FileDescription // once makes sure we release the owned FileDescription once. once sync.Once } // NewFileDescriptionConn initializes a fileDescriptionConn. func NewFileDescriptionConn(file *vfs.FileDescription) proxyConn { return &fileDescriptionConn{file: file} } // Name implements proxyConn.Name. func (r *fileDescriptionConn) Name() string { return "fileDescriptionConn" } // Read implements proxyConn.Read. func (r *fileDescriptionConn) Read(ctx context.Context, buf []byte, cancel <-chan struct{}) (int, error) { var ( notifyCh chan struct{} waitEntry waiter.Entry ) n, err := r.file.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) for linuxerr.Equals(linuxerr.ErrWouldBlock, err) { if notifyCh == nil { waitEntry, notifyCh = waiter.NewChannelEntry(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr) // Register for when the endpoint is readable or disconnected. r.file.EventRegister(&waitEntry) defer r.file.EventUnregister(&waitEntry) } select { case <-notifyCh: case <-cancel: return 0, io.EOF } n, err = r.file.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) } // host fd FileDescriptions use recvmsg which returns zero when the // peer has shutdown. When that happens return EOF. if n == 0 && err == nil { return 0, io.EOF } return int(n), err } // Write implements proxyConn.Write. func (r *fileDescriptionConn) Write(ctx context.Context, buf []byte, cancel <-chan struct{}) (int, error) { var notifyCh chan struct{} var waitEntry waiter.Entry n, err := r.file.Write(ctx, usermem.BytesIOSequence(buf), vfs.WriteOptions{}) for linuxerr.Equals(linuxerr.ErrWouldBlock, err) { if notifyCh == nil { waitEntry, notifyCh = waiter.NewChannelEntry(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr) // Register for when the endpoint is writable or disconnected. r.file.EventRegister(&waitEntry) defer r.file.EventUnregister(&waitEntry) } select { case <-notifyCh: case <-cancel: return 0, io.EOF } n, err = r.file.Write(ctx, usermem.BytesIOSequence(buf), vfs.WriteOptions{}) } return int(n), err } // Close implements proxyConn.Close. func (r *fileDescriptionConn) Close(ctx context.Context) { r.once.Do(func() { r.file.DecRef(ctx) }) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/portforward/portforward_hostinet.go000066400000000000000000000114641465435605700301340ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package portforward import ( "fmt" "io" "sync" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" fileDescriptor "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/waiter" ) var ( localHost = [4]byte{127, 0, 0, 1} ) // hostInetConn allows reading and writing to a local host socket for hostinet. // hostInetConn implments proxyConn. type hostInetConn struct { // wq is the WaitQueue registered with fdnotifier for this fd. wq waiter.Queue // fd is the file descriptor for the socket. fd *fileDescriptor.FD // port is the port on which to connect. port uint16 // once makes sure we close only once. once sync.Once } // NewHostInetConn creates a hostInetConn backed by a host socket on the localhost address. func NewHostInetConn(port uint16) (proxyConn, error) { // NOTE: Options must match sandbox seccomp filters. See filter/config.go fd, err := unix.Socket(unix.AF_INET, unix.SOCK_STREAM|unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC, unix.IPPROTO_TCP) if err != nil { return nil, err } s := hostInetConn{ fd: fileDescriptor.New(fd), port: port, } cu := cleanup.Make(func() { s.fd.Close() }) defer cu.Clean() if err := fdnotifier.AddFD(int32(s.fd.FD()), &s.wq); err != nil { return nil, err } cu.Add(func() { fdnotifier.RemoveFD(int32(s.fd.FD())) }) sockAddr := &unix.SockaddrInet4{ Addr: localHost, Port: int(s.port), } if err := unix.Connect(s.fd.FD(), sockAddr); err != nil { if err != unix.EINPROGRESS { return nil, fmt.Errorf("unix.Connect: %w", err) } // Connect is in progress. Wait for the socket to be writable. mask := waiter.WritableEvents waitEntry, notifyCh := waiter.NewChannelEntry(mask) s.eventRegister(&waitEntry) defer s.eventUnregister(&waitEntry) // Wait for connect to succeed. // Check the current socket state and if not ready, wait for the event. if fdnotifier.NonBlockingPoll(int32(s.fd.FD()), mask)&mask == 0 { <-notifyCh } // Call getsockopt to get the connection result. val, err := unix.GetsockoptInt(s.fd.FD(), unix.SOL_SOCKET, unix.SO_ERROR) if err != nil { return nil, fmt.Errorf("unix.GetSockoptInt: %w", err) } if val != 0 { return nil, fmt.Errorf("unix.GetSockoptInt: %w", unix.Errno(val)) } } cu.Release() return &s, nil } func (s *hostInetConn) Name() string { return fmt.Sprintf("localhost:port:%d", s.port) } // Read implements io.Reader.Read. It performs a blocking read on the fd. func (s *hostInetConn) Read(ctx context.Context, buf []byte, cancel <-chan struct{}) (int, error) { var ch chan struct{} var e waiter.Entry n, err := s.fd.Read(buf) for ctx.Err() == nil && linuxerr.Equals(linuxerr.ErrWouldBlock, err) { if ch == nil { e, ch = waiter.NewChannelEntry(waiter.ReadableEvents | waiter.EventHUp | waiter.EventErr) // Register for when the endpoint is writable or disconnected. s.eventRegister(&e) defer s.eventUnregister(&e) } select { case <-ch: case <-cancel: return 0, io.EOF case <-ctx.Done(): return 0, ctx.Err() } n, err = s.fd.Read(buf) } return n, err } // Write implements io.Writer.Write. It performs a blocking write on the fd. func (s *hostInetConn) Write(ctx context.Context, buf []byte, cancel <-chan struct{}) (int, error) { var ch chan struct{} var e waiter.Entry n, err := s.fd.Write(buf) for ctx.Err() == nil && linuxerr.Equals(linuxerr.ErrWouldBlock, err) { if ch == nil { e, ch = waiter.NewChannelEntry(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr) // Register for when the endpoint is writable or disconnected. 
s.eventRegister(&e) defer s.eventUnregister(&e) } select { case <-ch: case <-cancel: return 0, io.EOF case <-ctx.Done(): return 0, ctx.Err() } n, err = s.fd.Write(buf) } return n, err } func (s *hostInetConn) eventRegister(e *waiter.Entry) { s.wq.EventRegister(e) fdnotifier.UpdateFD(int32(s.fd.FD())) } func (s *hostInetConn) eventUnregister(e *waiter.Entry) { s.wq.EventUnregister(e) fdnotifier.UpdateFD(int32(s.fd.FD())) } // Close closes the host socket and removes it from notifications. func (s *hostInetConn) Close(_ context.Context) { s.once.Do(func() { fdnotifier.RemoveFD(int32(s.fd.FD())) s.fd.Close() }) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/portforward/portforward_netstack.go000066400000000000000000000103521465435605700301060ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package portforward import ( "bytes" "fmt" "io" "sync" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/waiter" ) // netstackConn allows reading and writing to a netstack endpoint. // netstackConn implements proxyConn. type netstackConn struct { // ep is the tcpip.Endpoint on which to read and write. ep tcpip.Endpoint // port is the port on which to connect. port uint16 // wq is the WaitQueue for this connection to wait on notifications. wq *waiter.Queue // once makes sure Close is called once. once sync.Once } // NewNetstackConn creates a new port forwarding connection to the given // port in netstack mode. func NewNetstackConn(stack *stack.Stack, port uint16) (proxyConn, error) { var wq waiter.Queue ep, tcpErr := stack.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &wq) if tcpErr != nil { return nil, fmt.Errorf("creating endpoint: %v", tcpErr) } n := &netstackConn{ ep: ep, port: port, wq: &wq, } waitEntry, notifyCh := waiter.NewChannelEntry(waiter.WritableEvents) n.wq.EventRegister(&waitEntry) defer n.wq.EventUnregister(&waitEntry) tcpErr = n.ep.Connect(tcpip.FullAddress{ Addr: tcpip.AddrFrom4([4]byte{0x7f, 0x00, 0x00, 0x01}), // 127.0.0.1 Port: n.port, }) if _, ok := tcpErr.(*tcpip.ErrConnectStarted); ok { <-notifyCh tcpErr = n.ep.LastError() } if tcpErr != nil { return nil, fmt.Errorf("connecting endpoint: %v", tcpErr) } return n, nil } // Name implements proxyConn.Name. func (n *netstackConn) Name() string { return fmt.Sprintf("netstack:port:%d", n.port) } // bufWriter is used as an io.Writer to read from tcpip.Endpoint. type bufWriter struct { buf []byte offset int64 } // Write implements io.Writer. func (b *bufWriter) Write(buf []byte) (int, error) { n := copy(b.buf[b.offset:], buf) b.offset += int64(n) return n, nil } // Read implements proxyConn.Read. 
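// Read retries reads that fail with *tcpip.ErrWouldBlock: on the first such
// error it registers a waiter entry on the endpoint's queue, then blocks on
// the notification channel, the cancel channel, or ctx before retrying. Any
// terminal tcpip error is reported to the caller as io.EOF.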
func (n *netstackConn) Read(ctx context.Context, buf []byte, cancel <-chan struct{}) (int, error) { var ch chan struct{} var e waiter.Entry b := &bufWriter{ buf: buf, } res, tcpErr := n.ep.Read(b, tcpip.ReadOptions{}) for _, ok := tcpErr.(*tcpip.ErrWouldBlock); ok && ctx.Err() == nil; _, ok = tcpErr.(*tcpip.ErrWouldBlock) { if ch == nil { e, ch = waiter.NewChannelEntry(waiter.ReadableEvents | waiter.EventIn | waiter.EventHUp | waiter.EventErr) n.wq.EventRegister(&e) defer n.wq.EventUnregister(&e) } select { case <-ch: case <-cancel: return 0, io.EOF case <-ctx.Done(): return 0, ctx.Err() } res, tcpErr = n.ep.Read(b, tcpip.ReadOptions{}) } if tcpErr != nil { return 0, io.EOF } return res.Total, nil } // Write implements proxyConn.Write. func (n *netstackConn) Write(ctx context.Context, buf []byte, cancel <-chan struct{}) (int, error) { var ch chan struct{} var e waiter.Entry var b bytes.Reader b.Reset(buf) res, tcpErr := n.ep.Write(&b, tcpip.WriteOptions{Atomic: true}) for _, ok := tcpErr.(*tcpip.ErrWouldBlock); ok && ctx.Err() == nil; _, ok = tcpErr.(*tcpip.ErrWouldBlock) { if ch == nil { e, ch = waiter.NewChannelEntry(waiter.WritableEvents | waiter.EventIn | waiter.EventHUp | waiter.EventErr) n.wq.EventRegister(&e) defer n.wq.EventUnregister(&e) } select { case <-ch: case <-cancel: return 0, io.EOF case <-ctx.Done(): return 0, ctx.Err() } res, tcpErr = n.ep.Write(&b, tcpip.WriteOptions{Atomic: true}) } if tcpErr != nil { return 0, io.EOF } return int(res), nil } // Close implements proxyConn.Close. func (n *netstackConn) Close(_ context.Context) { n.once.Do(func() { n.ep.Close() }) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/portforward/portforward_state_autogen.go000066400000000000000000000000751465435605700311350ustar00rootroot00000000000000// automatically generated by stateify. package portforward golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/portforward/portforward_test_util.go000066400000000000000000000244231465435605700303120ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package portforward import ( "bytes" "io" "sync" "time" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // mockEndpoint defines an endpoint that tests can read and write for validating portforwarders. type mockEndpoint interface { read(n int) ([]byte, error) write(buf []byte) (int, error) } // portforwarderTestHarness mocks both sides of the portforwarder connection so that behavior can be // validated between them. 
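// A typical test writes on one side (for example via appWrite), runs a Proxy
// between the two mock endpoints, and then asserts that the same bytes become
// readable on the other side via shimRead; doRead keeps polling through
// ErrWouldBlock until the requested number of bytes has been gathered.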
type portforwarderTestHarness struct { app mockEndpoint shim mockEndpoint } func (th *portforwarderTestHarness) appWrite(buf []byte) (int, error) { return th.app.write(buf) } func (th *portforwarderTestHarness) appRead(n int) ([]byte, error) { return th.doRead(n, th.app) } func (th *portforwarderTestHarness) shimWrite(buf []byte) (int, error) { return th.shim.write(buf) } func (th *portforwarderTestHarness) shimRead(n int) ([]byte, error) { return th.doRead(n, th.shim) } func (th *portforwarderTestHarness) doRead(n int, ep mockEndpoint) ([]byte, error) { buf := make([]byte, 0, n) for { out, err := ep.read(n - len(buf)) if err != nil && !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { return nil, err } buf = append(buf, out...) if len(buf) >= n { return buf, nil } } } // mockApplicationFDImpl mocks a VFS file description endpoint on which the sandboxed application // and the portforwarder will communicate. type mockApplicationFDImpl struct { vfs.FileDescriptionDefaultImpl vfs.NoLockFD vfs.DentryMetadataFileDescriptionImpl mu sync.Mutex readBuf bytes.Buffer writeBuf bytes.Buffer released bool queue waiter.Queue notifyStop chan struct{} } var _ vfs.FileDescriptionImpl = (*mockApplicationFDImpl)(nil) func newMockApplicationFDImpl() *mockApplicationFDImpl { app := &mockApplicationFDImpl{notifyStop: make(chan struct{})} go app.doNotify() return app } // Read implements vfs.FileDescriptionImpl.Read details for the parent mockFileDescription. func (s *mockApplicationFDImpl) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { s.mu.Lock() defer s.mu.Unlock() if s.released { return 0, io.EOF } if s.readBuf.Len() == 0 { return 0, linuxerr.ErrWouldBlock } buf := s.readBuf.Next(s.readBuf.Len()) n, err := dst.CopyOut(ctx, buf) return int64(n), err } // Write implements vfs.FileDescriptionImpl.Write details for the parent mockFileDescription. func (s *mockApplicationFDImpl) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { s.mu.Lock() defer s.mu.Unlock() if s.released { return 0, io.EOF } buf := make([]byte, src.NumBytes()) n, _ := src.CopyIn(ctx, buf) res, _ := s.writeBuf.Write(buf[:n]) return int64(res), nil } // write implements mockEndpoint.write. func (s *mockApplicationFDImpl) write(buf []byte) (int, error) { s.mu.Lock() defer s.mu.Unlock() if s.released { return 0, io.EOF } ret, err := s.readBuf.Write(buf) return ret, err } // read implements mockEndpoint.read. func (s *mockApplicationFDImpl) read(n int) ([]byte, error) { s.mu.Lock() defer s.mu.Unlock() if s.released { return nil, io.EOF } if s.writeBuf.Len() == 0 { return nil, linuxerr.ErrWouldBlock } ret := s.writeBuf.Next(n) return ret, nil } func (s *mockApplicationFDImpl) doNotify() { for { s.queue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp) select { case <-s.notifyStop: return default: time.Sleep(time.Millisecond * 50) } } } func (s *mockApplicationFDImpl) IsReadable() bool { s.mu.Lock() defer s.mu.Unlock() if s.released { return false } return s.readBuf.Len() > 0 } func (s *mockApplicationFDImpl) IsWritable() bool { s.mu.Lock() defer s.mu.Unlock() return !s.released } // EventRegister implements vfs.FileDescriptionImpl.EventRegister details for the parent mockFileDescription. func (s *mockApplicationFDImpl) EventRegister(we *waiter.Entry) error { s.mu.Lock() defer s.mu.Unlock() s.queue.EventRegister(we) return nil } // EventUnregister implements vfs.FileDescriptionImpl.Unregister details for the parent mockFileDescription. 
func (s *mockApplicationFDImpl) EventUnregister(we *waiter.Entry) { s.mu.Lock() defer s.mu.Unlock() s.queue.EventUnregister(we) } // Release implements vfs.FileDescriptionImpl.Release details for the parent mockFileDescription. func (s *mockApplicationFDImpl) Release(context.Context) { s.mu.Lock() defer s.mu.Unlock() s.released = true s.notifyStop <- struct{}{} } // mockTCPEndpointImpl is the subset of methods used by tests for the mockTCPEndpoint struct. This // is so we can quickly change implementations as needed. type mockTCPEndpointImpl interface { Close() Read(io.Writer, tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) Write(tcpip.Payloader, tcpip.WriteOptions) (int64, tcpip.Error) Shutdown(tcpip.ShutdownFlags) tcpip.Error } // mockTCPEndpoint mocks tcpip.Endpoint for tests. type mockTCPEndpoint struct { impl mockTCPEndpointImpl // impl implements the subset of methods needed for mockTCPEndpoints. wq *waiter.Queue notifyDone chan struct{} } func newMockTCPEndpoint(impl mockTCPEndpointImpl, wq *waiter.Queue) *mockTCPEndpoint { ret := &mockTCPEndpoint{ impl: impl, wq: wq, notifyDone: make(chan struct{}), } go ret.doNotify() return ret } func (m *mockTCPEndpoint) doNotify() { for { m.wq.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp) select { case <-m.notifyDone: return default: time.Sleep(time.Millisecond * 50) } } } // The below are trivial stub methods to get mockTCPEndpoint to implement tcpip.Endpoint. They // either panic or call the contained impl's methods. // Close implements tcpip.Endpoint.Close. func (m *mockTCPEndpoint) Close() { m.impl.Close() m.notifyDone <- struct{}{} } // Abort implements tcpip.Endpoint.Abort. func (m *mockTCPEndpoint) Abort() { m.panicWithNotImplementedMsg() } // Read implements tcpip.Endpoint.Read. func (m *mockTCPEndpoint) Read(w io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { return m.impl.Read(w, opts) } // Write implements tcpip.Endpoint.Write. func (m *mockTCPEndpoint) Write(payload tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { return m.impl.Write(payload, opts) } // Connect implements tcpip.Endpoint.Connect. func (m *mockTCPEndpoint) Connect(address tcpip.FullAddress) tcpip.Error { m.panicWithNotImplementedMsg() return nil } // Disconnect implements tcpip.Endpoint.Disconnect. func (m *mockTCPEndpoint) Disconnect() tcpip.Error { m.panicWithNotImplementedMsg() return nil } // Shutdown implements tcpip.Endpoint.Shutdown. func (m *mockTCPEndpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error { return m.impl.Shutdown(flags) } // Listen implements tcpip.Endpoint.Listen. func (m *mockTCPEndpoint) Listen(backlog int) tcpip.Error { m.panicWithNotImplementedMsg() return nil } // Accept implements tcpip.Endpoint.Accept. func (m *mockTCPEndpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { m.panicWithNotImplementedMsg() return nil, nil, nil } // Bind implements tcpip.Endpoint.Bind. func (m *mockTCPEndpoint) Bind(address tcpip.FullAddress) tcpip.Error { m.panicWithNotImplementedMsg() return nil } // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. func (m mockTCPEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { m.panicWithNotImplementedMsg() return tcpip.FullAddress{}, nil } // GetRemoteAddress implements tcpip.Endpoint.GetRemoreAddress. func (m *mockTCPEndpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { m.panicWithNotImplementedMsg() return tcpip.FullAddress{}, nil } // Readiness implements tcpip.Endpoint.Readiness. 
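// Like the other stubbed tcpip.Endpoint methods on mockTCPEndpoint, it is not
// exercised by the port forwarding code and panics if called.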
func (m *mockTCPEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask { m.panicWithNotImplementedMsg() return 0 } // SetSockOpt implements tcpip.Endpoint.SetSockOpt. func (m *mockTCPEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { m.panicWithNotImplementedMsg() return nil } // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. func (m *mockTCPEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { m.panicWithNotImplementedMsg() return nil } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (m *mockTCPEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { m.panicWithNotImplementedMsg() return nil } // GetSockOptInt implements tcpip.Endpoint.GetSockOpt. func (m *mockTCPEndpoint) GetSockOptInt(tcpip.SockOptInt) (int, tcpip.Error) { m.panicWithNotImplementedMsg() return 0, nil } // State implements tcpip.Endpoint.State. func (m *mockTCPEndpoint) State() uint32 { m.panicWithNotImplementedMsg() return 0 } // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf func (m *mockTCPEndpoint) ModerateRecvBuf(copied int) { m.panicWithNotImplementedMsg() } // Info implements tcpip.Endpoint.Info. func (m *mockTCPEndpoint) Info() tcpip.EndpointInfo { m.panicWithNotImplementedMsg() return nil } // Stats implements tcpip.Endpoint.Stats. func (m *mockTCPEndpoint) Stats() tcpip.EndpointStats { m.panicWithNotImplementedMsg() return nil } // SetOwner implements tcpip.Endpoint.SetOwner. func (m *mockTCPEndpoint) SetOwner(owner tcpip.PacketOwner) { m.panicWithNotImplementedMsg() } // LastError implements tcpip.Endpoint.LastError. func (m *mockTCPEndpoint) LastError() tcpip.Error { m.panicWithNotImplementedMsg() return nil } // SocketOptions implements tcpip.Endpoint.SocketOptions. func (m *mockTCPEndpoint) SocketOptions() *tcpip.SocketOptions { m.panicWithNotImplementedMsg() return nil } func (*mockTCPEndpoint) panicWithNotImplementedMsg() { panic("not implemented") } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/pprof/000077500000000000000000000000001465435605700220665ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/pprof/pprof.go000066400000000000000000000014471465435605700235510ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false // Package pprof provides a stub to initialize custom profilers. package pprof // Initialize will be called at boot for initializing custom profilers. func Initialize() { } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/pprof/pprof_state_autogen.go000066400000000000000000000001331465435605700264620ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build !false // +build !false package pprof golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/procfs/000077500000000000000000000000001465435605700222345ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/procfs/dump.go000066400000000000000000000242131465435605700235320ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package procfs holds utilities for getting procfs information for sandboxed // processes. package procfs import ( "bytes" "fmt" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // FDInfo contains information about an application file descriptor. type FDInfo struct { // Number is the FD number. Number int32 `json:"number"` // Path is the path of the file that FD represents. Path string `json:"path,omitempty"` // Mode is the file mode. Mode uint16 `json:"mode"` } // UIDGID contains information for /proc/[pid]/status/{uid,gid}. type UIDGID struct { Real uint32 `json:"real"` Effective uint32 `json:"effective"` Saved uint32 `json:"saved"` } // Status contains information for /proc/[pid]/status. type Status struct { Comm string `json:"comm,omitempty"` PID int32 `json:"pid"` PPID int32 `json:"ppid"` UID UIDGID `json:"uid,omitempty"` GID UIDGID `json:"gid,omitempty"` VMSize uint64 `json:"vm_size,omitempty"` VMRSS uint64 `json:"vm_rss,omitempty"` } // Stat contains information for /proc/[pid]/stat. type Stat struct { PGID int32 `json:"pgid"` SID int32 `json:"sid"` } // Mapping contains information for /proc/[pid]/maps. type Mapping struct { Address hostarch.AddrRange `json:"address,omitempty"` Permissions hostarch.AccessType `json:"permissions"` Private string `json:"private,omitempty"` Offset uint64 `json:"offset"` DevMajor uint32 `json:"deviceMajor,omitempty"` DevMinor uint32 `json:"deviceMinor,omitempty"` Inode uint64 `json:"inode,omitempty"` Pathname string `json:"pathname,omitempty"` } // ProcessProcfsDump contains the procfs dump for one process. For more details // on fields that directly correspond to /proc fields, see proc(5). type ProcessProcfsDump struct { // Exe is the symlink target of /proc/[pid]/exe. Exe string `json:"exe,omitempty"` // Args is /proc/[pid]/cmdline split into an array. Args []string `json:"args,omitempty"` // Env is /proc/[pid]/environ split into an array. Env []string `json:"env,omitempty"` // CWD is the symlink target of /proc/[pid]/cwd. CWD string `json:"cwd,omitempty"` // FDs contains the directory entries of /proc/[pid]/fd and also contains the // symlink target for each FD. FDs []FDInfo `json:"fdlist,omitempty"` // StartTime is the process start time in nanoseconds since Unix epoch. StartTime int64 `json:"clone_ts,omitempty"` // Root is /proc/[pid]/root. 
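// It is the path of the container's root directory, resolved the same way the
// /proc/[pid]/root symlink target would be.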
Root string `json:"root,omitempty"` // Limits constains resource limits for this process. Currently only // RLIMIT_NOFILE is supported. Limits map[string]limits.Limit `json:"limits,omitempty"` // Cgroup is /proc/[pid]/cgroup split into an array. Cgroup []kernel.TaskCgroupEntry `json:"cgroup,omitempty"` // Status is /proc/[pid]/status. Status Status `json:"status,omitempty"` // Stat is /proc/[pid]/stat. Stat Stat `json:"stat,omitempty"` // Maps is /proc/[pid]/maps. Maps []Mapping `json:"maps,omitempty"` } // getMM returns t's MemoryManager. On success, the MemoryManager's users count // is incremented, and must be decremented by the caller when it is no longer // in use. func getMM(t *kernel.Task) *mm.MemoryManager { var mm *mm.MemoryManager t.WithMuLocked(func(*kernel.Task) { mm = t.MemoryManager() }) if mm == nil || !mm.IncUsers() { return nil } return mm } func getExecutablePath(ctx context.Context, pid kernel.ThreadID, mm *mm.MemoryManager) string { exec := mm.Executable() if exec == nil { log.Warningf("No executable found for PID %s", pid) return "" } defer exec.DecRef(ctx) return exec.MappedName(ctx) } func getMetadataArray(ctx context.Context, pid kernel.ThreadID, mm *mm.MemoryManager, metaType proc.MetadataType) []string { buf := bytes.Buffer{} if err := proc.GetMetadata(ctx, mm, &buf, metaType); err != nil { log.Warningf("failed to get %v metadata for PID %s: %v", metaType, pid, err) return nil } // As per proc(5), /proc/[pid]/cmdline may have "a further null byte after // the last string". Similarly, for /proc/[pid]/environ "there may be a null // byte at the end". So trim off the last null byte if it exists. return strings.Split(strings.TrimSuffix(buf.String(), "\000"), "\000") } func getCWD(ctx context.Context, t *kernel.Task, pid kernel.ThreadID) string { cwdDentry := t.FSContext().WorkingDirectory() if !cwdDentry.Ok() { log.Warningf("No CWD dentry found for PID %s", pid) return "" } root := vfs.RootFromContext(ctx) if !root.Ok() { log.Warningf("no root could be found from context for PID %s", pid) return "" } defer root.DecRef(ctx) vfsObj := cwdDentry.Mount().Filesystem().VirtualFilesystem() name, err := vfsObj.PathnameWithDeleted(ctx, root, cwdDentry) if err != nil { log.Warningf("PathnameWithDeleted failed to find CWD: %v", err) } return name } func getFDs(ctx context.Context, t *kernel.Task, pid kernel.ThreadID) []FDInfo { type fdInfo struct { fd *vfs.FileDescription no int32 } var fds []fdInfo defer func() { for _, fd := range fds { fd.fd.DecRef(ctx) } }() t.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { fdNos := fdTable.GetFDs(ctx) fds = make([]fdInfo, 0, len(fdNos)) for _, fd := range fdNos { file, _ := fdTable.Get(fd) if file != nil { fds = append(fds, fdInfo{fd: file, no: fd}) } } } }) root := vfs.RootFromContext(ctx) defer root.DecRef(ctx) res := make([]FDInfo, 0, len(fds)) for _, fd := range fds { path, err := t.Kernel().VFS().PathnameWithDeleted(ctx, root, fd.fd.VirtualDentry()) if err != nil { log.Warningf("PathnameWithDeleted failed to find path for fd %d in PID %s: %v", fd.no, pid, err) path = "" } mode := uint16(0) if statx, err := fd.fd.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_MODE}); err != nil { log.Warningf("Stat(STATX_MODE) failed for fd %d in PID %s: %v", fd.no, pid, err) } else { mode = statx.Mode } res = append(res, FDInfo{Number: fd.no, Path: path, Mode: mode}) } return res } func getRoot(t *kernel.Task, pid kernel.ThreadID) string { realRoot := t.MountNamespace().Root(t) defer realRoot.DecRef(t) root := 
t.FSContext().RootDirectory() defer root.DecRef(t) path, err := t.Kernel().VFS().PathnameWithDeleted(t, realRoot, root) if err != nil { log.Warningf("PathnameWithDeleted failed to find root path for PID %s: %v", pid, err) return "" } return path } func getFDLimit(ctx context.Context, pid kernel.ThreadID) (limits.Limit, error) { if limitSet := limits.FromContext(ctx); limitSet != nil { return limitSet.Get(limits.NumberOfFiles), nil } return limits.Limit{}, fmt.Errorf("could not find limit set for pid %s", pid) } func getStatus(t *kernel.Task, mm *mm.MemoryManager, pid kernel.ThreadID, pidns *kernel.PIDNamespace) Status { creds := t.Credentials() uns := creds.UserNamespace ppid := kernel.ThreadID(0) if parent := t.Parent(); parent != nil { ppid = pidns.IDOfThreadGroup(parent.ThreadGroup()) } return Status{ Comm: t.Name(), PID: int32(pid), PPID: int32(ppid), UID: UIDGID{ Real: uint32(creds.RealKUID.In(uns).OrOverflow()), Effective: uint32(creds.EffectiveKUID.In(uns).OrOverflow()), Saved: uint32(creds.SavedKUID.In(uns).OrOverflow()), }, GID: UIDGID{ Real: uint32(creds.RealKGID.In(uns).OrOverflow()), Effective: uint32(creds.EffectiveKGID.In(uns).OrOverflow()), Saved: uint32(creds.SavedKGID.In(uns).OrOverflow()), }, VMSize: mm.VirtualMemorySize() >> 10, VMRSS: mm.ResidentSetSize() >> 10, } } func getStat(t *kernel.Task, pid kernel.ThreadID, pidns *kernel.PIDNamespace) Stat { return Stat{ PGID: int32(pidns.IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), SID: int32(pidns.IDOfSession(t.ThreadGroup().Session())), } } func getMappings(ctx context.Context, mm *mm.MemoryManager) []Mapping { var maps []Mapping mm.ReadMapsDataInto(ctx, func(start, end hostarch.Addr, permissions hostarch.AccessType, private string, offset uint64, devMajor, devMinor uint32, inode uint64, path string) { maps = append(maps, Mapping{ Address: hostarch.AddrRange{ Start: start, End: end, }, Permissions: permissions, Private: private, Offset: offset, DevMajor: devMajor, DevMinor: devMinor, Inode: inode, Pathname: path, }) }) return maps } // Dump returns a procfs dump for process pid. t must be a task in process pid. func Dump(t *kernel.Task, pid kernel.ThreadID, pidns *kernel.PIDNamespace) (ProcessProcfsDump, error) { ctx := t.AsyncContext() mm := getMM(t) if mm == nil { return ProcessProcfsDump{}, fmt.Errorf("no MM found for PID %s", pid) } defer mm.DecUsers(ctx) fdLimit, err := getFDLimit(ctx, pid) if err != nil { return ProcessProcfsDump{}, err } return ProcessProcfsDump{ Exe: getExecutablePath(ctx, pid, mm), Args: getMetadataArray(ctx, pid, mm, proc.Cmdline), Env: getMetadataArray(ctx, pid, mm, proc.Environ), CWD: getCWD(ctx, t, pid), FDs: getFDs(ctx, t, pid), StartTime: t.StartTime().Nanoseconds(), Root: getRoot(t, pid), Limits: map[string]limits.Limit{ "RLIMIT_NOFILE": fdLimit, }, // We don't need to worry about fake cgroup controllers as that is not // supported in runsc. Cgroup: t.GetCgroupEntries(), Status: getStatus(t, mm, pid, pidns), Stat: getStat(t, pid, pidns), Maps: getMappings(ctx, mm), }, nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/procfs/procfs_state_autogen.go000066400000000000000000000000701465435605700267760ustar00rootroot00000000000000// automatically generated by stateify. package procfs golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/restore.go000066400000000000000000000255071465435605700227630ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "errors" "fmt" "io" "strconv" time2 "time" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/devutil" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/state" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/runsc/boot/pprof" "gvisor.dev/gvisor/runsc/config" ) const ( // CheckpointStateFileName is the file within the given image-path's // directory which contains the container's saved state. CheckpointStateFileName = "checkpoint.img" // CheckpointPagesMetadataFileName is the file within the given image-path's // directory containing the container's MemoryFile metadata. CheckpointPagesMetadataFileName = "pages_meta.img" // CheckpointPagesFileName is the file within the given image-path's // directory containing the container's MemoryFile pages. CheckpointPagesFileName = "pages.img" ) // restorer manages a restore session for a sandbox. It stores information about // all containers and triggers the full sandbox restore after the last // container is restored. type restorer struct { mu sync.Mutex // totalContainers is the number of containers expected to be restored in // the sandbox. Sandbox restore can only happen, after all containers have // been restored. totalContainers int // containers is the list of containers restored so far. containers []*containerInfo // Files used by restore to rehydrate the state. stateFile io.ReadCloser pagesMetadata *fd.FD pagesFile *fd.FD // deviceFile is the required to start the platform. deviceFile *fd.FD // restoreDone is a callback triggered when restore is successful. 
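// It is invoked exactly once, by restore(), after the kernel state has been
// loaded and the loader has been updated to point at the restored kernel.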
restoreDone func() error } func (r *restorer) restoreSubcontainer(spec *specs.Spec, conf *config.Config, l *Loader, cid string, stdioFDs, goferFDs, goferFilestoreFDs []*fd.FD, devGoferFD *fd.FD, goferMountConfs []GoferMountConf) error { containerName := l.registerContainer(spec, cid) info := &containerInfo{ cid: cid, containerName: containerName, conf: conf, spec: spec, stdioFDs: stdioFDs, goferFDs: goferFDs, devGoferFD: devGoferFD, goferFilestoreFDs: goferFilestoreFDs, goferMountConfs: goferMountConfs, } return r.restoreContainerInfo(l, info) } func (r *restorer) restoreContainerInfo(l *Loader, info *containerInfo) error { r.mu.Lock() defer r.mu.Unlock() for _, container := range r.containers { if container.containerName == info.containerName { return fmt.Errorf("container %q already restored", info.containerName) } if container.cid == info.cid { return fmt.Errorf("container CID %q already belongs to container %q", info.cid, container.containerName) } } r.containers = append(r.containers, info) log.Infof("Restored container %d of %d", len(r.containers), r.totalContainers) if log.IsLogging(log.Debug) { for i, fd := range info.stdioFDs { log.Debugf("Restore app FD: %d host FD: %d", i, fd.FD()) } } if len(r.containers) == r.totalContainers { // Trigger the restore if this is the last container. return r.restore(l) } return nil } func createNetworStackForRestore(l *Loader) (*stack.Stack, inet.Stack) { // Save the current network stack to slap on top of the one that was restored. curNetwork := l.k.RootNetworkNamespace().Stack() if eps, ok := curNetwork.(*netstack.Stack); ok { return eps.Stack, curNetwork } return nil, hostinet.NewStack() } func (r *restorer) restore(l *Loader) error { log.Infof("Starting to restore %d containers", len(r.containers)) // Create a new root network namespace with the network stack of the // old kernel to preserve the existing network configuration. oldStack, oldInetStack := createNetworStackForRestore(l) // Reset the network stack in the network namespace to nil before // replacing the kernel. This will not free the network stack when this // old kernel is released. l.k.RootNetworkNamespace().ResetStack() p, err := createPlatform(l.root.conf, r.deviceFile) if err != nil { return fmt.Errorf("creating platform: %v", err) } // Start the old watchdog before replacing it with a new one below. l.watchdog.Start() // Release the kernel and replace it with a new one that will be restored into. if l.k != nil { l.k.Release() } l.k = &kernel.Kernel{ Platform: p, } mf, err := createMemoryFile(l.root.conf.AppHugePages, l.hostShmemHuge) if err != nil { return fmt.Errorf("creating memory file: %v", err) } l.k.SetMemoryFile(mf) if l.root.conf.ProfileEnable { // pprof.Initialize opens /proc/self/maps, so has to be called before // installing seccomp filters. pprof.Initialize() } // Seccomp filters have to be applied before vfs restore and before parsing // the state file. if err := l.installSeccompFilters(); err != nil { return err } // Set up the restore environment. 
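// The supervisor context is annotated with the previous network stack, an FD
// map keyed by vfs.RestoreID (stdio and passed FDs for every container), the
// MemoryFile map populated by each container's mounter, and the dev gofer
// client provider, so that state.LoadOpts.Load below can reconnect these
// external resources while loading the saved kernel.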
ctx := l.k.SupervisorContext() if oldStack != nil { ctx = context.WithValue(ctx, stack.CtxRestoreStack, oldStack) } fdmap := make(map[vfs.RestoreID]int) mfmap := make(map[string]*pgalloc.MemoryFile) for _, cont := range r.containers { // TODO(b/298078576): Need to process hints here probably mntr := newContainerMounter(cont, l.k, l.mountHints, l.sharedMounts, l.productName, cont.cid) if err = mntr.configureRestore(fdmap, mfmap); err != nil { return fmt.Errorf("configuring filesystem restore: %v", err) } for i, fd := range cont.stdioFDs { key := host.MakeRestoreID(cont.containerName, i) fdmap[key] = fd.Release() } for _, customFD := range cont.passFDs { key := host.MakeRestoreID(cont.containerName, customFD.guest) fdmap[key] = customFD.host.FD() } } log.Debugf("Restore using fdmap: %v", fdmap) ctx = context.WithValue(ctx, vfs.CtxRestoreFilesystemFDMap, fdmap) log.Debugf("Restore using mfmap: %v", mfmap) ctx = context.WithValue(ctx, pgalloc.CtxMemoryFileMap, mfmap) ctx = context.WithValue(ctx, devutil.CtxDevGoferClientProvider, l.k) // Load the state. loadOpts := state.LoadOpts{Source: r.stateFile, PagesMetadata: r.pagesMetadata, PagesFile: r.pagesFile} if err := loadOpts.Load(ctx, l.k, nil, oldInetStack, time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil { return err } // Since we have a new kernel we also must make a new watchdog. dogOpts := watchdog.DefaultOpts dogOpts.TaskTimeoutAction = l.root.conf.WatchdogAction dogOpts.StartupTimeout = 3 * time2.Minute // Give extra time for all containers to restore. dog := watchdog.New(l.k, dogOpts) // Change the loader fields to reflect the changes made when restoring. l.watchdog.Stop() l.watchdog = dog l.root.procArgs = kernel.CreateProcessArgs{} l.restore = true l.sandboxID = l.root.cid l.mu.Lock() cu := cleanup.Make(func() { l.mu.Unlock() }) defer cu.Clean() // Update all tasks in the system with their respective new container IDs. for _, task := range l.k.TaskSet().Root.Tasks() { oldCid := task.ContainerID() name := l.k.ContainerName(oldCid) newCid, ok := l.containerIDs[name] if !ok { return fmt.Errorf("unable to remap task with CID %q (name: %q). Available names: %v", task.ContainerID(), name, l.containerIDs) } task.RestoreContainerID(newCid) } // Rebuild `processes` map with containers' root process from the restored kernel. for _, tg := range l.k.RootPIDNamespace().ThreadGroups() { // Find all processes with no parent (root of execution), that were not started // via a call to `exec`. if tg.Leader().Parent() == nil && tg.Leader().Origin != kernel.OriginExec { cid := tg.Leader().ContainerID() proc := l.processes[execID{cid: cid}] if proc == nil { return fmt.Errorf("unable to find container root process with CID %q, processes: %v", cid, l.processes) } proc.tg = tg } } // Kill all processes that have been exec'd since they cannot be properly // restored -- the caller is no longer connected. for _, tg := range l.k.RootPIDNamespace().ThreadGroups() { if tg.Leader().Origin == kernel.OriginExec { if err := l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: int32(linux.SIGKILL)}); err != nil { log.Warningf("Failed to kill exec process after restore: %v", err) } } } l.k.RestoreContainerMapping(l.containerIDs) if err := l.kernelInitExtra(); err != nil { return err } // Refresh the control server with the newly created kernel. l.ctrl.refreshHandlers() // Release `l.mu` before calling into callbacks. cu.Clean() // r.restoreDone() signals and waits for the sandbox to start. 
if err := r.restoreDone(); err != nil { return err } r.stateFile.Close() if r.pagesFile != nil { r.pagesFile.Close() } if r.pagesMetadata != nil { r.pagesMetadata.Close() } if err := postRestoreImpl(l.k); err != nil { return err } // Restore was successful, so increment the checkpoint count manually. The // count was saved while the previous kernel was being saved and checkpoint // success was unknown at that time. Now we know the checkpoint succeeded. l.k.IncCheckpointCount() log.Infof("Restore successful") return nil } func (l *Loader) save(o *control.SaveOpts) (err error) { defer func() { // This closure is required to capture the final value of err. l.k.OnCheckpointAttempt(err) }() l.k.ResetCheckpointStatus() // TODO(gvisor.dev/issues/6243): save/restore not supported w/ hostinet if l.root.conf.Network == config.NetworkHost { return errors.New("checkpoint not supported when using hostinet") } if o.Metadata == nil { o.Metadata = make(map[string]string) } o.Metadata["container_count"] = strconv.Itoa(l.containerCount()) if err := preSaveImpl(l.k, o); err != nil { return err } state := control.State{ Kernel: l.k, Watchdog: l.watchdog, } if err := state.Save(o, nil); err != nil { return err } if o.Resume { if err := postResumeImpl(l.k); err != nil { return err } } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/restore_impl.go000066400000000000000000000023731465435605700240000ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package boot import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" "gvisor.dev/gvisor/pkg/sentry/kernel" ) func preSaveImpl(*kernel.Kernel, *control.SaveOpts) error { return nil } // Precondition: The kernel should be running. func postRestoreImpl(*kernel.Kernel) error { return nil } // Precondition: The kernel should be running. func postResumeImpl(*kernel.Kernel) error { return nil } func newProcInternalData(*specs.Spec) *proc.InternalData { return &proc.InternalData{} } func (l *Loader) kernelInitExtra() error { return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/seccheck.go000066400000000000000000000042221465435605700230370ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package boot import ( "encoding/json" "io" "os" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/seccheck" // Register supported of sinks. _ "gvisor.dev/gvisor/pkg/sentry/seccheck/sinks/null" _ "gvisor.dev/gvisor/pkg/sentry/seccheck/sinks/remote" ) // InitConfig represents the configuration to apply during pod creation. For // now, it supports setting up a seccheck session. type InitConfig struct { TraceSession seccheck.SessionConfig `json:"trace_session"` } func setupSeccheck(configFD int, sinkFDs []int) error { config := fd.New(configFD) defer config.Close() initConf, err := loadInitConfig(config) if err != nil { return err } return initConf.create(sinkFDs) } // LoadInitConfig loads an InitConfig struct from a json formatted file. func LoadInitConfig(path string) (*InitConfig, error) { config, err := os.Open(path) if err != nil { return nil, err } defer config.Close() return loadInitConfig(config) } func loadInitConfig(reader io.Reader) (*InitConfig, error) { decoder := json.NewDecoder(reader) decoder.DisallowUnknownFields() init := &InitConfig{} if err := decoder.Decode(init); err != nil { return nil, err } return init, nil } // Setup performs the actions defined in the InitConfig, e.g. setup seccheck // session. func (c *InitConfig) Setup() ([]*os.File, error) { return seccheck.SetupSinks(c.TraceSession.Sinks) } func (c *InitConfig) create(sinkFDs []int) error { for i, sinkFD := range sinkFDs { if sinkFD >= 0 { c.TraceSession.Sinks[i].FD = fd.New(sinkFD) } } return seccheck.Create(&c.TraceSession, false) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/strace.go000066400000000000000000000024221465435605700225500ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "strings" "gvisor.dev/gvisor/pkg/sentry/strace" "gvisor.dev/gvisor/runsc/config" ) func enableStrace(conf *config.Config) error { // We must initialize even if strace is not enabled. strace.Initialize() if !conf.Strace { return nil } // For now runsc always allows logging application buffers in strace logs. strace.LogAppDataAllowed = true max := conf.StraceLogSize if max == 0 { max = 1024 } strace.LogMaximumSize = max sink := strace.SinkTypeLog if conf.StraceEvent { sink = strace.SinkTypeEvent } if len(conf.StraceSyscalls) == 0 { strace.EnableAll(sink) return nil } return strace.Enable(strings.Split(conf.StraceSyscalls, ","), sink) } golang-gvisor-gvisor-0.0~20240729.0/runsc/boot/vfs.go000066400000000000000000001451511465435605700220740ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "fmt" "os" "path" "path/filepath" "regexp" "slices" "sort" "strconv" "strings" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/abi/nvgpu" "gvisor.dev/gvisor/pkg/abi/tpu" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/devutil" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/devices/accel" "gvisor.dev/gvisor/pkg/sentry/devices/memdev" "gvisor.dev/gvisor/pkg/sentry/devices/nvproxy" "gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy" "gvisor.dev/gvisor/pkg/sentry/devices/ttydev" "gvisor.dev/gvisor/pkg/sentry/devices/tundev" "gvisor.dev/gvisor/pkg/sentry/fsimpl/cgroupfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/dev" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse" "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" "gvisor.dev/gvisor/pkg/sentry/fsimpl/mqfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay" "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/user" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) // Supported filesystems that map to different internal filesystems. const ( Bind = "bind" Nonefs = "none" ) // SelfFilestorePrefix is the prefix of the self filestore file name. const SelfFilestorePrefix = ".gvisor.filestore." const ( pciPathGlobTPUv4 = "/sys/devices/pci0000:*/*/accel/accel*" pciPathGlobTPUv5 = "/sys/devices/pci0000:*/*/vfio-dev/vfio*" iommuGroupPathGlob = "/sys/kernel/iommu_groups/*/devices/*" ) // SelfFilestorePath returns the path at which the self filestore file is // stored for a given mount. func SelfFilestorePath(mountSrc, sandboxID string) string { // We will place the filestore file in a gVisor specific hidden file inside // the mount being overlaid itself. The same volume can be overlaid by // multiple sandboxes. So make the filestore file unique to a sandbox by // suffixing the sandbox ID. return path.Join(mountSrc, selfFilestoreName(sandboxID)) } func selfFilestoreName(sandboxID string) string { return SelfFilestorePrefix + sandboxID } // tmpfs has some extra supported options that we must pass through. 
var tmpfsAllowedData = []string{"mode", "size", "uid", "gid"} func registerFilesystems(k *kernel.Kernel, info *containerInfo) error { ctx := k.SupervisorContext() vfsObj := k.VFS() vfsObj.MustRegisterFilesystemType(cgroupfs.Name, &cgroupfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, // TODO(b/29356795): Users may mount this once the terminals are in a // usable state. AllowUserMount: true, }) vfsObj.MustRegisterFilesystemType(dev.Name, &dev.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{}) vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) vfsObj.MustRegisterFilesystemType(erofs.Name, &erofs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, }) vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, }) vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) vfsObj.MustRegisterFilesystemType(mqfs.Name, &mqfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) // Register devices. if err := memdev.Register(vfsObj); err != nil { return fmt.Errorf("registering memdev: %w", err) } if err := ttydev.Register(vfsObj); err != nil { return fmt.Errorf("registering ttydev: %w", err) } tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx)) if tunSupported { if err := tundev.Register(vfsObj); err != nil { return fmt.Errorf("registering tundev: %v", err) } } if err := fuse.Register(vfsObj); err != nil { return fmt.Errorf("registering fusedev: %w", err) } if err := nvproxyRegisterDevices(info, vfsObj); err != nil { return err } if err := tpuProxyRegisterDevices(info, vfsObj); err != nil { return err } return nil } func setupContainerVFS(ctx context.Context, info *containerInfo, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { // Create context with root credentials to mount the filesystem (the current // user may not be privileged enough). rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace) rootProcArgs := *procArgs rootProcArgs.WorkingDirectory = "/" rootProcArgs.Credentials = rootCreds rootProcArgs.Umask = 0022 rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals rootCtx := rootProcArgs.NewContext(mntr.k) mns, err := mntr.mountAll(rootCtx, rootCreds, info.spec, info.conf, &rootProcArgs) if err != nil { return fmt.Errorf("failed to setupFS: %w", err) } procArgs.MountNamespace = mns // If cgroups are mounted, then only check for the cgroup mounts per // container. Otherwise the root cgroups will be enabled. 
if mntr.cgroupsMounted { cgroupRegistry := mntr.k.CgroupRegistry() for _, ctrl := range kernel.CgroupCtrls { cg, err := cgroupRegistry.FindCgroup(ctx, ctrl, "/"+mntr.containerID) if err != nil { return fmt.Errorf("cgroup mount for controller %v not found", ctrl) } if procArgs.InitialCgroups == nil { procArgs.InitialCgroups = make(map[kernel.Cgroup]struct{}, len(kernel.CgroupCtrls)) } procArgs.InitialCgroups[cg] = struct{}{} } } mnsRoot := mns.Root(rootCtx) defer mnsRoot.DecRef(rootCtx) if err := createDeviceFiles(rootCtx, rootCreds, info, mntr.k.VFS(), mnsRoot); err != nil { return fmt.Errorf("failed to create device files: %w", err) } // We are executing a file directly. Do not resolve the executable path. if procArgs.File != nil { return nil } // Resolve the executable path from working dir and environment. resolved, err := user.ResolveExecutablePath(ctx, procArgs) if err != nil { return err } procArgs.Filename = resolved return nil } // compileMounts returns the supported mounts from the mount spec, adding any // mandatory mounts that are required by the OCI specification. // // This function must NOT add/remove any gofer mounts or change their order. func compileMounts(spec *specs.Spec, conf *config.Config, containerID string) []specs.Mount { // Keep track of whether proc and sys were mounted. var procMounted, sysMounted, devMounted, devptsMounted, cgroupsMounted bool var mounts []specs.Mount // Mount all submounts from the spec. for _, m := range spec.Mounts { // Mount all the cgroup controllers when "/sys/fs/cgroup" mount // is present. If any other cgroup controller mounts are there, // it will be a no-op, drop them. if m.Type == cgroupfs.Name && cgroupsMounted { continue } switch filepath.Clean(m.Destination) { case "/proc": procMounted = true case "/sys": sysMounted = true case "/dev": m.Type = dev.Name devMounted = true case "/dev/pts": m.Type = devpts.Name devptsMounted = true case "/sys/fs/cgroup": cgroupsMounted = true } mounts = append(mounts, m) } // Mount proc and sys even if the user did not ask for it, as the spec // says we SHOULD. var mandatoryMounts []specs.Mount if !procMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: proc.Name, Destination: "/proc", }) } if !sysMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: sys.Name, Destination: "/sys", }) } if !devMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: dev.Name, Destination: "/dev", }) } if !devptsMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: devpts.Name, Destination: "/dev/pts", }) } // The mandatory mounts should be ordered right after the root, in case // there are submounts of these mandatory mounts already in the spec. mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...) return mounts } // goferMountData creates a slice of gofer mount data. func goferMountData(fd int, fa config.FileAccessType, conf *config.Config) []string { opts := []string{ "trans=fd", "rfdno=" + strconv.Itoa(fd), "wfdno=" + strconv.Itoa(fd), } if fa == config.FileAccessShared { opts = append(opts, "cache=remote_revalidating") } if conf.DirectFS { opts = append(opts, "directfs") } if !conf.HostFifo.AllowOpen() { opts = append(opts, "disable_fifo_open") } return opts } // consumeMountOptions consumes mount options from opts based on allowedKeys // and returns the remaining and consumed options. 
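// For example (illustrative values):
//
//	rem, out, _ := consumeMountOptions([]string{"noexec", "size=64m"}, "mode", "size", "uid", "gid")
//	// rem == []string{"noexec"}, out == []string{"size=64m"}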
func consumeMountOptions(opts []string, allowedKeys ...string) ([]string, []string, error) { var rem, out []string for _, o := range opts { ok, err := parseMountOption(o, allowedKeys...) if err != nil { return nil, nil, err } if ok { out = append(out, o) } else { rem = append(rem, o) } } return rem, out, nil } func parseMountOption(opt string, allowedKeys ...string) (bool, error) { kv := strings.SplitN(opt, "=", 3) if len(kv) > 2 { return false, fmt.Errorf("invalid option %q", opt) } return slices.Contains(allowedKeys, kv[0]), nil } type fdDispenser struct { fds []*fd.FD } func (f *fdDispenser) remove() int { return f.removeAsFD().Release() } func (f *fdDispenser) removeAsFD() *fd.FD { if f.empty() { panic("fdDispenser out of fds") } rv := f.fds[0] f.fds = f.fds[1:] return rv } func (f *fdDispenser) empty() bool { return len(f.fds) == 0 } type containerMounter struct { root *specs.Root // mounts is the set of submounts for the container. It's a copy from the spec // that may be freely modified without affecting the original spec. mounts []specs.Mount // goferFDs is the list of FDs to be dispensed for gofer mounts. goferFDs fdDispenser // goferFilestoreFDs are FDs to the regular files that will back the tmpfs or // overlayfs mount for certain gofer mounts. goferFilestoreFDs fdDispenser // devGoferFD is the FD to attach the sandbox to the dev gofer. devGoferFD *fd.FD // goferMountConfs contains information about how the gofer mounts have been // configured. The first entry is for rootfs and the following entries are // for bind mounts in Spec.Mounts (in the same order). goferMountConfs []GoferMountConf k *kernel.Kernel // hints is the set of pod mount hints for the sandbox. hints *PodMountHints // sharedMounts is a map of shared mounts that can be reused across // containers. sharedMounts map[string]*vfs.Mount // productName is the value to show in // /sys/devices/virtual/dmi/id/product_name. productName string // containerID is the ID for the container. containerID string // sandboxID is the ID for the whole sandbox. sandboxID string containerName string // cgroupsMounted indicates if cgroups are mounted in the container. // This is used to set the InitialCgroups before starting the container // process. 
cgroupsMounted bool } func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *PodMountHints, sharedMounts map[string]*vfs.Mount, productName string, sandboxID string) *containerMounter { return &containerMounter{ root: info.spec.Root, mounts: compileMounts(info.spec, info.conf, info.procArgs.ContainerID), goferFDs: fdDispenser{fds: info.goferFDs}, goferFilestoreFDs: fdDispenser{fds: info.goferFilestoreFDs}, devGoferFD: info.devGoferFD, goferMountConfs: info.goferMountConfs, k: k, hints: hints, sharedMounts: sharedMounts, productName: productName, containerID: info.cid, sandboxID: sandboxID, containerName: info.containerName, } } func (c *containerMounter) checkDispenser() error { if !c.goferFDs.empty() { return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.goferFDs) } if !c.goferFilestoreFDs.empty() { return fmt.Errorf("not all gofer Filestore FDs were consumed, remaining: %v", c.goferFilestoreFDs) } if c.devGoferFD != nil && c.devGoferFD.FD() >= 0 { return fmt.Errorf("dev gofer FD was not consumed: %d", c.devGoferFD.FD()) } return nil } func getMountAccessType(conf *config.Config, hint *MountHint) config.FileAccessType { if hint != nil { return hint.fileAccessType() } return conf.FileAccessMounts } func (c *containerMounter) mountAll(rootCtx context.Context, rootCreds *auth.Credentials, spec *specs.Spec, conf *config.Config, rootProcArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { log.Infof("Configuring container's file system") mns, err := c.createMountNamespace(rootCtx, conf, rootCreds) if err != nil { return nil, fmt.Errorf("creating mount namespace: %w", err) } rootProcArgs.MountNamespace = mns root := mns.Root(rootCtx) defer root.DecRef(rootCtx) if root.Mount().ReadOnly() { // Switch to ReadWrite while we setup submounts. if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil { return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err) } // Restore back to ReadOnly at the end. defer func() { if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil { panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err)) } }() } // Mount submounts. if err := c.mountSubmounts(rootCtx, spec, conf, mns, rootCreds); err != nil { return nil, fmt.Errorf("mounting submounts: %w", err) } return mns, nil } // createMountNamespace creates the container's root mount and namespace. func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { ioFD := c.goferFDs.remove() rootfsConf := c.goferMountConfs[0] var ( fsName string opts *vfs.MountOptions ) switch { case rootfsConf.ShouldUseLisafs(): fsName = gofer.Name data := goferMountData(ioFD, conf.FileAccess, conf) // We can't check for overlayfs here because sandbox is chroot'ed and gofer // can only send mount options for specs.Mounts (specs.Root is missing // Options field). So assume root is always on top of overlayfs. data = append(data, "overlayfs_stale_read") // Configure the gofer dentry cache size. 
gofer.SetDentryCacheSize(conf.DCache) opts = &vfs.MountOptions{ ReadOnly: c.root.Readonly, GetFilesystemOptions: vfs.GetFilesystemOptions{ InternalMount: true, Data: strings.Join(data, ","), InternalData: gofer.InternalFilesystemOptions{ UniqueID: vfs.RestoreID{ ContainerName: c.containerName, Path: "/", }, }, }, } case rootfsConf.ShouldUseErofs(): fsName = erofs.Name opts = &vfs.MountOptions{ ReadOnly: c.root.Readonly, GetFilesystemOptions: vfs.GetFilesystemOptions{ InternalMount: true, Data: fmt.Sprintf("ifd=%d", ioFD), InternalData: erofs.InternalFilesystemOptions{ UniqueID: vfs.RestoreID{ ContainerName: c.containerName, Path: "/", }, }, }, } default: return nil, fmt.Errorf("unsupported rootfs config: %+v", rootfsConf) } log.Infof("Mounting root with %s, ioFD: %d", fsName, ioFD) if rootfsConf.ShouldUseOverlayfs() { log.Infof("Adding overlay on top of root") var ( err error cleanup func() filestoreFD *fd.FD ) if rootfsConf.IsFilestorePresent() { filestoreFD = c.goferFilestoreFDs.removeAsFD() } opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, filestoreFD, rootfsConf, "/") if err != nil { return nil, fmt.Errorf("mounting root with overlay: %w", err) } defer cleanup() fsName = overlay.Name } // The namespace root mount can't be changed, so let's mount a dummy // read-only tmpfs here. It simplifies creation of containers without // leaking the root file system. mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "rootfs", "tmpfs", &vfs.MountOptions{ReadOnly: true, Locked: true}, c.k) if err != nil { return nil, fmt.Errorf("setting up mount namespace: %w", err) } defer mns.DecRef(ctx) mnt, err := c.k.VFS().MountDisconnected(ctx, creds, "root", fsName, opts) if err != nil { return nil, fmt.Errorf("creating root file system: %w", err) } defer mnt.DecRef(ctx) root := mns.Root(ctx) defer root.DecRef(ctx) target := &vfs.PathOperation{ Root: root, Start: root, } if err := c.k.VFS().ConnectMountAt(ctx, creds, mnt, target); err != nil { return nil, fmt.Errorf("mounting root file system: %w", err) } mns.IncRef() return mns, nil } // configureOverlay mounts the lower layer using "lowerOpts", mounts the upper // layer using tmpfs, and return overlay mount options. "cleanup" must be called // after the options have been used to mount the overlay, to release refs on // lower and upper mounts. func (c *containerMounter) configureOverlay(ctx context.Context, conf *config.Config, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string, filestoreFD *fd.FD, mountConf GoferMountConf, dst string) (*vfs.MountOptions, func(), error) { // First copy options from lower layer to upper layer and overlay. Clear // filesystem specific options. upperOpts := *lowerOpts upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{InternalMount: true} overlayOpts := *lowerOpts overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{InternalMount: true} // All writes go to the upper layer, be paranoid and make lower readonly. lowerOpts.ReadOnly = true lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts) if err != nil { return nil, nil, err } cu := cleanup.Make(func() { lower.DecRef(ctx) }) defer cu.Clean() // Determine the lower layer's root's type. 
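// The root may be a directory (the common case) or a regular file (e.g. a
// single-file bind mount); only those two types are supported, and a regular
// file additionally needs its contents copied up into the tmpfs upper layer below.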
lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root()) stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{ Root: lowerRootVD, Start: lowerRootVD, }, &vfs.StatOptions{ Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE, }) if err != nil { return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err) } if stat.Mask&linux.STATX_TYPE == 0 { return nil, nil, fmt.Errorf("failed to get file type of lower layer's root") } rootType := stat.Mode & linux.S_IFMT if rootType != linux.S_IFDIR && rootType != linux.S_IFREG { return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType) } // Upper is a tmpfs mount to keep all modifications inside the sandbox. tmpfsOpts := tmpfs.FilesystemOpts{ RootFileType: uint16(rootType), // If a mount is being overlaid, it should not be limited by the default // tmpfs size limit. DisableDefaultSizeLimit: true, } if filestoreFD != nil { // Create memory file for disk-backed overlays. mf, err := createPrivateMemoryFile(filestoreFD.ReleaseToFile("overlay-filestore"), vfs.RestoreID{ContainerName: c.containerName, Path: dst}) if err != nil { return nil, nil, fmt.Errorf("failed to create memory file for overlay: %v", err) } tmpfsOpts.MemoryFile = mf } upperOpts.GetFilesystemOptions.InternalData = tmpfsOpts upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) if err != nil { return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) } cu.Add(func() { upper.DecRef(ctx) }) // If the overlay mount consists of a regular file, copy up its contents // from the lower layer, since in the overlay the otherwise-empty upper // layer file will take precedence. upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root()) if rootType == linux.S_IFREG { lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{ Root: lowerRootVD, Start: lowerRootVD, }, &vfs.OpenOptions{ Flags: linux.O_RDONLY, }) if err != nil { return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err) } defer lowerFD.DecRef(ctx) upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{ Root: upperRootVD, Start: upperRootVD, }, &vfs.OpenOptions{ Flags: linux.O_WRONLY, }) if err != nil { return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err) } defer upperFD.DecRef(ctx) if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil { return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err) } } // We need to hide the filestore from the containerized application. if mountConf.IsSelfBacked() { if err := overlay.CreateWhiteout(ctx, c.k.VFS(), creds, &vfs.PathOperation{ Root: upperRootVD, Start: upperRootVD, Path: fspath.Parse(selfFilestoreName(c.sandboxID)), }); err != nil { return nil, nil, fmt.Errorf("failed to create whiteout to hide self overlay filestore: %w", err) } } // Propagate the lower layer's root's owner, group, and mode to the upper // layer's root for consistency with VFS1. err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{ Root: upperRootVD, Start: upperRootVD, }, &vfs.SetStatOptions{ Stat: linux.Statx{ Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask, UID: stat.UID, GID: stat.GID, Mode: stat.Mode, }, }) if err != nil { return nil, nil, err } // Configure overlay with both layers. 
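// The writable tmpfs mount created above becomes the upper layer and the
// read-only lower mount becomes the sole lower layer.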
overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{ UpperRoot: upperRootVD, LowerRoots: []vfs.VirtualDentry{lowerRootVD}, } return &overlayOpts, cu.Release(), nil } func (c *containerMounter) mountSubmounts(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { mounts, err := c.prepareMounts() if err != nil { return err } for i := range mounts { submount := &mounts[i] log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.mount.Source, submount.mount.Destination, submount.mount.Type, submount.mount.Options) var ( mnt *vfs.Mount err error ) if submount.hint != nil && submount.hint.ShouldShareMount() { sharedMount, err := c.getSharedMount(ctx, spec, conf, submount, creds) if err != nil { return fmt.Errorf("getting shared mount %q: %w", submount.hint.Name, err) } mnt, err = c.mountSharedSubmount(ctx, conf, mns, creds, submount, sharedMount) if err != nil { return fmt.Errorf("mount shared mount %q to %q: %v", submount.hint.Name, submount.mount.Destination, err) } } else if submount.mount.Type == cgroupfs.Name { // Mount all the cgroups controllers. if err := c.mountCgroupSubmounts(ctx, spec, conf, mns, creds, submount); err != nil { return fmt.Errorf("mount cgroup %q: %w", submount.mount.Destination, err) } } else { mnt, err = c.mountSubmount(ctx, spec, conf, mns, creds, submount) if err != nil { return fmt.Errorf("mount submount %q: %w", submount.mount.Destination, err) } } if mnt != nil && mnt.ReadOnly() { // Switch to ReadWrite while we setup submounts. if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err) } // Restore back to ReadOnly at the end. defer func() { if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil { panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err)) } }() } } if err := c.mountTmp(ctx, spec, conf, creds, mns); err != nil { return fmt.Errorf(`mount submount "/tmp": %w`, err) } return nil } type mountInfo struct { mount *specs.Mount goferFD *fd.FD hint *MountHint goferMountConf GoferMountConf filestoreFD *fd.FD } func (c *containerMounter) prepareMounts() ([]mountInfo, error) { // If device gofer exists, connect to it. if c.devGoferFD != nil { if err := c.k.AddDevGofer(c.containerName, c.devGoferFD.Release()); err != nil { return nil, err } } // Associate bind mounts with their FDs before sorting since there is an // undocumented assumption that FDs are dispensed in the order in which // they are required by mounts. var mounts []mountInfo goferMntIdx := 1 // First index is for rootfs. for i := range c.mounts { info := mountInfo{ mount: &c.mounts[i], hint: c.hints.FindMount(c.mounts[i].Source), } specutils.MaybeConvertToBindMount(info.mount) if specutils.IsGoferMount(*info.mount) { info.goferMountConf = c.goferMountConfs[goferMntIdx] if info.goferMountConf.ShouldUseLisafs() { info.goferFD = c.goferFDs.removeAsFD() } if info.goferMountConf.IsFilestorePresent() { info.filestoreFD = c.goferFilestoreFDs.removeAsFD() } if info.goferMountConf.ShouldUseTmpfs() { specutils.ChangeMountType(info.mount, tmpfs.Name) } goferMntIdx++ } mounts = append(mounts, info) } if err := c.checkDispenser(); err != nil { return nil, err } // Sort the mounts so that we don't place children before parents. 
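// Sorting by destination length ensures that a parent such as "/foo" is placed
// before a child such as "/foo/bar" (paths are illustrative).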
sort.Slice(mounts, func(i, j int) bool { return len(mounts[i].mount.Destination) < len(mounts[j].mount.Destination) }) return mounts, nil } func (c *containerMounter) mountSubmount(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) (*vfs.Mount, error) { fsName, opts, err := getMountNameAndOptions(spec, conf, submount, c.productName, c.containerName) if err != nil { return nil, fmt.Errorf("mountOptions failed: %w", err) } if len(fsName) == 0 { // Filesystem is not supported (e.g. cgroup), just skip it. return nil, nil } if err := c.makeMountPoint(ctx, creds, mns, submount.mount.Destination); err != nil { return nil, fmt.Errorf("creating mount point %q: %w", submount.mount.Destination, err) } if submount.goferMountConf.ShouldUseOverlayfs() { log.Infof("Adding overlay on top of mount %q", submount.mount.Destination) var cleanup func() opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, submount.filestoreFD, submount.goferMountConf, submount.mount.Destination) if err != nil { return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.mount.Destination, err) } defer cleanup() fsName = overlay.Name } root := mns.Root(ctx) defer root.DecRef(ctx) target := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(submount.mount.Destination), } mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts) if err != nil { return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.mount.Destination, submount.mount.Type, err, opts) } log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.mount.Source, submount.mount.Destination, submount.mount.Type, opts.GetFilesystemOptions.Data) return mnt, nil } // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values // used for mounts. func getMountNameAndOptions(spec *specs.Spec, conf *config.Config, m *mountInfo, productName, containerName string) (string, *vfs.MountOptions, error) { fsName := m.mount.Type var ( mopts = m.mount.Options data []string internalData any ) // Find filesystem name and FS specific data field. switch m.mount.Type { case devpts.Name, dev.Name: // Nothing to do. case Nonefs: fsName = sys.Name case proc.Name: internalData = newProcInternalData(spec) case sys.Name: sysData := &sys.InternalData{EnableTPUProxyPaths: specutils.TPUProxyIsEnabled(spec, conf)} if len(productName) > 0 { sysData.ProductName = productName } internalData = sysData case tmpfs.Name: var err error mopts, data, err = consumeMountOptions(mopts, tmpfsAllowedData...) if err != nil { return "", nil, err } if m.filestoreFD != nil { mf, err := createPrivateMemoryFile(m.filestoreFD.ReleaseToFile("tmpfs-filestore"), vfs.RestoreID{ContainerName: containerName, Path: m.mount.Destination}) if err != nil { return "", nil, fmt.Errorf("failed to create memory file for tmpfs: %v", err) } internalData = tmpfs.FilesystemOpts{ MemoryFile: mf, // If a mount is being overlaid with tmpfs, it should not be limited by // the default tmpfs size limit. DisableDefaultSizeLimit: true, } } case Bind: fsName = gofer.Name if m.goferFD == nil { // Check that an FD was provided to fails fast. return "", nil, fmt.Errorf("gofer mount requires a connection FD") } var err error mopts, data, err = consumeMountOptions(mopts, gofer.SupportedMountOptions...) if err != nil { return "", nil, err } data = append(data, goferMountData(m.goferFD.Release(), getMountAccessType(conf, m.hint), conf)...) 
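// data now holds any consumed user options followed by the gofer transport
// options, e.g. "trans=fd,rfdno=7,wfdno=7" (the FD number is illustrative).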
internalData = gofer.InternalFilesystemOptions{ UniqueID: vfs.RestoreID{ ContainerName: containerName, Path: m.mount.Destination, }, } case cgroupfs.Name: var err error mopts, data, err = consumeMountOptions(mopts, cgroupfs.SupportedMountOptions...) if err != nil { return "", nil, err } default: log.Warningf("ignoring unknown filesystem type %q", m.mount.Type) return "", nil, nil } opts := ParseMountOptions(mopts) opts.GetFilesystemOptions = vfs.GetFilesystemOptions{ Data: strings.Join(data, ","), InternalData: internalData, InternalMount: true, } return fsName, opts, nil } // ParseMountOptions converts specs.Mount.Options to vfs.MountOptions. func ParseMountOptions(opts []string) *vfs.MountOptions { mountOpts := &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ InternalMount: true, }, } // Note: update mountHint.CheckCompatible when more options are added. for _, o := range opts { switch o { case "ro": mountOpts.ReadOnly = true case "noatime": mountOpts.Flags.NoATime = true case "noexec": mountOpts.Flags.NoExec = true case "rw", "atime", "exec": // These use the default value and don't need to be set. case "bind", "rbind": // These are the same as a mount with type="bind". default: log.Warningf("ignoring unknown mount option %q", o) } } return mountOpts } func parseKeyValue(s string) (string, string, bool) { tokens := strings.SplitN(s, "=", 2) if len(tokens) < 2 { return "", "", false } return strings.TrimSpace(tokens[0]), strings.TrimSpace(tokens[1]), true } func createPrivateMemoryFile(file *os.File, restoreID vfs.RestoreID) (*pgalloc.MemoryFile, error) { mfOpts := pgalloc.MemoryFileOpts{ // Private memory files are usually backed by files on disk. Ideally we // would confirm with fstatfs(2) but that is prohibited by seccomp. DiskBackedFile: true, // Disk backed files need to be decommited on destroy to release disk space. DecommitOnDestroy: true, // sentry's seccomp filters don't allow the mmap(2) syscalls that // pgalloc.IMAWorkAroundForMemFile() uses. Users of private memory files // are expected to have performed the work around outside the sandbox. DisableIMAWorkAround: true, // Private memory files need to be restored correctly using this ID. RestoreID: restoreID.String(), } return pgalloc.NewMemoryFile(file, mfOpts) } // mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so. // Technically we don't have to mount tmpfs at /tmp, as we could just rely on // the host /tmp, but this is a nice optimization, and fixes some apps that call // mknod in /tmp. It's unsafe to mount tmpfs if: // 1. /tmp is mounted explicitly: we should not override user's wish // 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp // // Note that when there are submounts inside of '/tmp', directories for the // mount points must be present, making '/tmp' not empty anymore. func (c *containerMounter) mountTmp(ctx context.Context, spec *specs.Spec, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error { for _, m := range c.mounts { // m.Destination has been cleaned, so it's to use equality here. 
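// (i.e. destinations such as "/tmp/" or "//tmp" are assumed to already be in
// canonical form, so a plain string comparison is sufficient).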
if m.Destination == "/tmp" { log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m) return nil } } root := mns.Root(ctx) defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse("/tmp"), } fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY}) switch { case err == nil: defer fd.DecRef(ctx) err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { if dirent.Name != "." && dirent.Name != ".." { return linuxerr.ENOTEMPTY } return nil })) switch { case err == nil: log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`) case linuxerr.Equals(linuxerr.ENOTEMPTY, err): // If more than "." and ".." is found, skip internal tmpfs to prevent // hiding existing files. log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`) return nil default: return fmt.Errorf("fd.IterDirents failed: %v", err) } fallthrough case linuxerr.Equals(linuxerr.ENOENT, err): // No '/tmp' found (or fallthrough from above). It's safe to mount internal // tmpfs. tmpMount := specs.Mount{ Type: tmpfs.Name, Destination: "/tmp", // Sticky bit is added to prevent accidental deletion of files from // another user. This is normally done for /tmp. Options: []string{"mode=01777"}, } if _, err := c.mountSubmount(ctx, spec, conf, mns, creds, &mountInfo{mount: &tmpMount}); err != nil { return fmt.Errorf("mountSubmount failed: %v", err) } return nil case linuxerr.Equals(linuxerr.ENOTDIR, err): // Not a dir?! Let it be. return nil default: return fmt.Errorf(`opening "/tmp" inside container: %w`, err) } } func (c *containerMounter) getSharedMount(ctx context.Context, spec *specs.Spec, conf *config.Config, mount *mountInfo, creds *auth.Credentials) (*vfs.Mount, error) { sharedMount, ok := c.sharedMounts[mount.hint.Mount.Source] if ok { log.Infof("Using existing shared mount %q from %q type %q", mount.hint.Name, mount.hint.Mount.Source, mount.hint.Mount.Type) if mount.goferFD != nil { panic(fmt.Errorf("extra goferFD provided for shared mount %q", mount.hint.Name)) } if mount.filestoreFD != nil { mount.filestoreFD.Close() } return sharedMount, nil } log.Infof("Mounting master of shared mount %q from %q type %q", mount.hint.Name, mount.hint.Mount.Source, mount.hint.Mount.Type) sharedMount, err := c.mountSharedMaster(ctx, spec, conf, mount, creds) if err != nil { return nil, fmt.Errorf("mounting shared master %q: %v", mount.hint.Name, err) } c.sharedMounts[mount.hint.Mount.Source] = sharedMount return sharedMount, nil } // mountCgroupMounts mounts the cgroups which are shared across all containers. // Postcondition: Initialized k.cgroupMounts on success. func (l *Loader) mountCgroupMounts(conf *config.Config, creds *auth.Credentials) error { ctx := l.k.SupervisorContext() for _, sopts := range kernel.CgroupCtrls { mopts := &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ Data: string(sopts), InternalMount: true, }, } fs, root, err := l.k.VFS().NewFilesystem(ctx, creds, "cgroup", cgroupfs.Name, mopts) if err != nil { return err } mount := l.k.VFS().NewDisconnectedMount(fs, root, mopts) // Private so that mounts created by containers do not appear // in other container's cgroup paths. 
l.k.VFS().SetMountPropagation(mount, linux.MS_PRIVATE, false) l.k.AddCgroupMount(string(sopts), &kernel.CgroupMount{ Fs: fs, Root: root, Mount: mount, }) } log.Infof("created cgroup mounts for controllers %v", kernel.CgroupCtrls) return nil } // mountCgroupSubmounts mounts all the cgroup controller submounts for the // container. The cgroup submounts are created under the root controller mount // with containerID as the directory name and then bind mounts this directory // inside the container's mount namespace. func (c *containerMounter) mountCgroupSubmounts(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) error { root := mns.Root(ctx) defer root.DecRef(ctx) // Mount "/sys/fs/cgroup" in the container's mount namespace. submount.mount.Type = tmpfs.Name mnt, err := c.mountSubmount(ctx, spec, conf, mns, creds, submount) if err != nil { return err } if mnt != nil && mnt.ReadOnly() { // Switch to ReadWrite while we setup submounts. if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err) } // Restore back to ReadOnly at the end. defer func() { if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil { panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err)) } }() } // Mount all the cgroup controllers in the container's mount namespace. mountCtx := vfs.WithRoot(vfs.WithMountNamespace(ctx, mns), root) for _, ctrl := range kernel.CgroupCtrls { ctrlName := string(ctrl) cgroupMnt := c.k.GetCgroupMount(ctrlName) if cgroupMnt == nil { return fmt.Errorf("cgroup mount for controller %s not found", ctrlName) } cgroupMntVD := vfs.MakeVirtualDentry(cgroupMnt.Mount, cgroupMnt.Root) sourcePop := vfs.PathOperation{ Root: cgroupMntVD, Start: cgroupMntVD, // Use the containerID as the cgroup path. Path: fspath.Parse(c.containerID), } if err := c.k.VFS().MkdirAt(mountCtx, creds, &sourcePop, &vfs.MkdirOptions{ Mode: 0755, }); err != nil { log.Infof("error in creating directory %v", err) return err } // Bind mount the new cgroup directory into the container's mount namespace. destination := "/sys/fs/cgroup/" + ctrlName if err := c.k.VFS().MakeSyntheticMountpoint(mountCtx, destination, root, creds); err != nil { // Log a warning, but attempt the mount anyway. log.Warningf("Failed to create mount point %q: %v", destination, err) } target := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(destination), } if err := c.k.VFS().BindAt(mountCtx, creds, &sourcePop, target, false); err != nil { log.Infof("error in bind mounting %v", err) return err } } c.cgroupsMounted = true return nil } // mountSharedMaster mounts the master of a volume that is shared among // containers in a pod. func (c *containerMounter) mountSharedMaster(ctx context.Context, spec *specs.Spec, conf *config.Config, mntInfo *mountInfo, creds *auth.Credentials) (*vfs.Mount, error) { // Mount the master using the options from the hint (mount annotations). 
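// The container's own mount options are saved and restored around this call;
// they are applied per mount point later in mountSharedSubmount.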
origOpts := mntInfo.mount.Options mntInfo.mount.Options = mntInfo.hint.Mount.Options fsName, opts, err := getMountNameAndOptions(spec, conf, mntInfo, c.productName, c.containerName) mntInfo.mount.Options = origOpts if err != nil { return nil, err } if len(fsName) == 0 { return nil, fmt.Errorf("mount type not supported %q", mntInfo.hint.Mount.Type) } return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts) } // mountSharedSubmount binds mount to a previously mounted volume that is shared // among containers in the same pod. func (c *containerMounter) mountSharedSubmount(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mntInfo *mountInfo, sharedMount *vfs.Mount) (*vfs.Mount, error) { if err := mntInfo.hint.checkCompatible(mntInfo.mount); err != nil { return nil, err } // Generate mount point specific opts using mntInfo.mount. opts := ParseMountOptions(mntInfo.mount.Options) newMnt := c.k.VFS().NewDisconnectedMount(sharedMount.Filesystem(), sharedMount.Root(), opts) defer newMnt.DecRef(ctx) root := mns.Root(ctx) defer root.DecRef(ctx) target := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(mntInfo.mount.Destination), } if err := c.makeMountPoint(ctx, creds, mns, mntInfo.mount.Destination); err != nil { return nil, fmt.Errorf("creating mount point %q: %w", mntInfo.mount.Destination, err) } if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { return nil, err } log.Infof("Mounted %q type shared bind to %q", mntInfo.mount.Destination, mntInfo.hint.Name) return newMnt, nil } func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error { root := mns.Root(ctx) defer root.DecRef(ctx) target := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(dest), } // First check if mount point exists. When overlay is enabled, gofer doesn't // allow changes to the FS, making MakeSytheticMountpoint() ineffective // because MkdirAt fails with EROFS even if file exists. vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{}) if err == nil { // File exists, we're done. vd.DecRef(ctx) return nil } return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds) } // configureRestore returns an updated context.Context including filesystem // state used by restore defined by conf. func (c *containerMounter) configureRestore(fdmap map[vfs.RestoreID]int, mfmap map[string]*pgalloc.MemoryFile) error { // Compare createMountNamespace(); rootfs always consumes a gofer FD and a // filestore FD is consumed if the rootfs GoferMountConf indicates so. rootKey := vfs.RestoreID{ContainerName: c.containerName, Path: "/"} fdmap[rootKey] = c.goferFDs.remove() if rootfsConf := c.goferMountConfs[0]; rootfsConf.IsFilestorePresent() { mf, err := createPrivateMemoryFile(c.goferFilestoreFDs.removeAsFD().ReleaseToFile("overlay-filestore"), rootKey) if err != nil { return fmt.Errorf("failed to create private memory file for mount rootfs: %w", err) } mfmap[rootKey.String()] = mf } // prepareMounts() consumes the remaining FDs for submounts. 
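// Each entry is keyed by a vfs.RestoreID (container name + mount destination)
// matching the UniqueID assigned when the mount was first created, e.g.
// {ContainerName: c.containerName, Path: "/mnt/vol"} for a hypothetical bind
// mount at /mnt/vol.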
mounts, err := c.prepareMounts() if err != nil { return err } for i := range mounts { submount := &mounts[i] if submount.goferFD != nil { key := vfs.RestoreID{ContainerName: c.containerName, Path: submount.mount.Destination} fdmap[key] = submount.goferFD.Release() } if submount.filestoreFD != nil { key := vfs.RestoreID{ContainerName: c.containerName, Path: submount.mount.Destination} mf, err := createPrivateMemoryFile(submount.filestoreFD.ReleaseToFile("overlay-filestore"), key) if err != nil { return fmt.Errorf("failed to create private memory file for mount %q: %w", submount.mount.Destination, err) } mfmap[key.String()] = mf } } return nil } func createDeviceFiles(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry) error { if info.spec.Linux != nil { // Create any device files specified in the spec. for _, dev := range info.spec.Linux.Devices { if err := createDeviceFile(ctx, creds, info, vfsObj, root, dev); err != nil { return err } } } if specutils.GPUFunctionalityRequestedViaHook(info.spec, info.conf) { // When using nvidia-container-runtime-hook, devices are not injected into // spec.Linux.Devices. So manually create appropriate device files. mode := os.FileMode(0666) nvidiaDevs := []specs.LinuxDevice{ specs.LinuxDevice{Path: "/dev/nvidiactl", Type: "c", Major: nvgpu.NV_MAJOR_DEVICE_NUMBER, Minor: nvgpu.NV_CONTROL_DEVICE_MINOR, FileMode: &mode}, specs.LinuxDevice{Path: "/dev/nvidia-uvm", Type: "c", Major: int64(info.nvidiaUVMDevMajor), Minor: nvgpu.NVIDIA_UVM_PRIMARY_MINOR_NUMBER, FileMode: &mode}, } devClient := devutil.GoferClientFromContext(ctx) if devClient == nil { return fmt.Errorf("dev gofer client not found in context") } names, err := devClient.DirentNames(ctx) if err != nil { return fmt.Errorf("failed to get names of dirents from dev gofer: %w", err) } nvidiaDeviceRegex := regexp.MustCompile(`^nvidia(\d+)$`) for _, name := range names { ms := nvidiaDeviceRegex.FindStringSubmatch(name) if ms == nil { continue } minor, err := strconv.ParseUint(ms[1], 10, 32) if err != nil { return fmt.Errorf("invalid nvidia device name %q: %w", name, err) } nvidiaDevs = append(nvidiaDevs, specs.LinuxDevice{Path: fmt.Sprintf("/dev/nvidia%d", minor), Type: "c", Major: nvgpu.NV_MAJOR_DEVICE_NUMBER, Minor: int64(minor), FileMode: &mode}) } for _, nvidiaDev := range nvidiaDevs { if err := createDeviceFile(ctx, creds, info, vfsObj, root, nvidiaDev); err != nil { return err } } } return nil } func createDeviceFile(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, devSpec specs.LinuxDevice) error { mode := linux.FileMode(devSpec.FileMode.Perm()) var major, minor uint32 // See https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#devices. switch devSpec.Type { case "b": mode |= linux.S_IFBLK major = uint32(devSpec.Major) minor = uint32(devSpec.Minor) case "c", "u": mode |= linux.S_IFCHR major = uint32(devSpec.Major) minor = uint32(devSpec.Minor) case "p": mode |= linux.S_IFIFO default: return fmt.Errorf("specified device at %q has invalid type %q", devSpec.Path, devSpec.Type) } if devSpec.Path == "/dev/nvidia-uvm" && info.nvidiaUVMDevMajor != 0 && major != info.nvidiaUVMDevMajor { // nvidia-uvm's major device number is dynamically assigned, so the // number that it has on the host may differ from the number that // it has in sentry VFS; switch from the former to the latter. 
log.Infof("Switching /dev/nvidia-uvm device major number from %d to %d", devSpec.Major, info.nvidiaUVMDevMajor) major = info.nvidiaUVMDevMajor } return dev.CreateDeviceFile(ctx, vfsObj, creds, root, devSpec.Path, major, minor, mode, devSpec.UID, devSpec.GID) } // registerTPUDevice registers a TPU device in vfsObj based on the given device ID. func registerTPUDevice(vfsObj *vfs.VirtualFilesystem, minor, deviceNum uint32, deviceID int64) error { switch deviceID { case tpu.TPUV4DeviceID, tpu.TPUV4liteDeviceID: return accel.RegisterTPUDevice(vfsObj, minor, deviceID == tpu.TPUV4liteDeviceID) case tpu.TPUV5eDeviceID, tpu.TPUV5pDeviceID: return tpuproxy.RegisterTPUDevice(vfsObj, minor, deviceNum) default: return fmt.Errorf("unsupported TPU device with ID: 0x%x", deviceID) } } // pathGlobToPathRegex is a map that points a TPU PCI path glob to its path regex. // TPU v4 devices are accessible via /sys/devices/pci0000:00//accel/accel# on the host. // TPU v5 devices are accessible via at /sys/devices/pci0000:00//vfio-dev/vfio# on the host. var pathGlobToPathRegex = map[string]string{ pciPathGlobTPUv4: `^/sys/devices/pci0000:[[:xdigit:]]{2}/\d+:\d+:\d+\.\d+/accel/accel(\d+)$`, pciPathGlobTPUv5: `^/sys/devices/pci0000:[[:xdigit:]]{2}/\d+:\d+:\d+\.\d+/vfio-dev/vfio(\d+)$`, } func tpuProxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem) error { if !specutils.TPUProxyIsEnabled(info.spec, info.conf) { return nil } // Enumerate all potential PCI paths where TPU devices are available and register the found TPU devices. for pciPathGlobal, pathRegex := range pathGlobToPathRegex { pciAddrs, err := filepath.Glob(pciPathGlobal) if err != nil { return fmt.Errorf("enumerating PCI device files: %w", err) } pciPathRegex := regexp.MustCompile(pathRegex) for _, pciPath := range pciAddrs { ms := pciPathRegex.FindStringSubmatch(pciPath) if ms == nil { continue } deviceNum, err := strconv.ParseUint(ms[1], 10, 32) if err != nil { return fmt.Errorf("parsing PCI device number: %w", err) } var deviceIDBytes []byte if deviceIDBytes, err = os.ReadFile(path.Join(pciPath, "device/device")); err != nil { return fmt.Errorf("reading PCI device ID: %w", err) } deviceIDStr := strings.Replace(string(deviceIDBytes), "0x", "", -1) deviceID, err := strconv.ParseInt(strings.TrimSpace(deviceIDStr), 16, 64) if err != nil { return fmt.Errorf("parsing PCI device ID: %w", err) } // VFIO iommu groups correspond to the device minor number. Use these // paths to get the correct minor number for the sentry-internal TPU // device files. 
var minorNum int switch deviceID { case tpu.TPUV4DeviceID, tpu.TPUV4liteDeviceID: minorNum = int(deviceNum) case tpu.TPUV5eDeviceID, tpu.TPUV5pDeviceID: groupPaths, err := filepath.Glob(iommuGroupPathGlob) if err != nil { return fmt.Errorf("enumerating IOMMU group files: %w", err) } for _, groupPath := range groupPaths { pci := path.Base(groupPath) if strings.Contains(pciPath, pci) { minor, err := strconv.Atoi(strings.Split(groupPath, "/")[4]) if err != nil { return fmt.Errorf("parsing IOMMU group minor number: %w", err) } minorNum = minor break } } default: return fmt.Errorf("unsupported TPU device with ID: 0x%x", deviceID) } if err := registerTPUDevice(vfsObj, uint32(minorNum), uint32(deviceNum), deviceID); err != nil { return fmt.Errorf("registering TPU driver: %w", err) } } } if err := tpuproxy.RegisterVfioDevice(vfsObj); err != nil { return fmt.Errorf("registering vfio driver: %w", err) } return nil } func nvproxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem) error { if !specutils.NVProxyEnabled(info.spec, info.conf) { return nil } uvmDevMajor, err := vfsObj.GetDynamicCharDevMajor() if err != nil { return fmt.Errorf("reserving device major number for nvidia-uvm: %w", err) } if err := nvproxy.Register(vfsObj, info.nvidiaDriverVersion, uvmDevMajor); err != nil { return fmt.Errorf("registering nvproxy driver: %w", err) } info.nvidiaUVMDevMajor = uvmDevMajor return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cgroup/000077500000000000000000000000001465435605700212745ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/cgroup/cgroup.go000066400000000000000000000647161465435605700231400ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package cgroup provides an interface to read and write configuration to // cgroup. package cgroup import ( "bufio" "context" "encoding/json" "errors" "fmt" "io" "io/ioutil" "os" "path/filepath" "strconv" "strings" "time" "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sync/errgroup" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/log" ) const ( cgroupv1FsName = "cgroup" cgroupv2FsName = "cgroup2" // procRoot is the procfs root this module uses. procRoot = "/proc" // cgroupRoot is the cgroupfs root this module uses. cgroupRoot = "/sys/fs/cgroup" ) var controllers = map[string]controller{ "blkio": &blockIO{}, "cpu": &cpu{}, "cpuset": &cpuSet{}, "hugetlb": &hugeTLB{}, "memory": &memory{}, "net_cls": &networkClass{}, "net_prio": &networkPrio{}, "pids": &pids{}, // These controllers either don't have anything in the OCI spec or is // irrelevant for a sandbox. "cpuacct": &noop{}, "devices": &noop{}, "freezer": &noop{}, "perf_event": &noop{}, "rdma": &noop{}, "systemd": &noop{}, } // IsOnlyV2 checks whether cgroups V2 is enabled and V1 is not. 
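// Detection is done with statfs(2) on cgroupRoot: on a unified (v2-only) host,
// /sys/fs/cgroup itself is a cgroup2 mount (CGROUP2_SUPER_MAGIC).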
func IsOnlyV2() bool { var stat unix.Statfs_t if err := unix.Statfs(cgroupRoot, &stat); err != nil { // It's not used for anything important, assume not V2 on failure. return false } return stat.Type == unix.CGROUP2_SUPER_MAGIC } func setOptionalValueInt(path, name string, val *int64) error { if val == nil || *val == 0 { return nil } str := strconv.FormatInt(*val, 10) return setValue(path, name, str) } func setOptionalValueUint(path, name string, val *uint64) error { if val == nil || *val == 0 { return nil } str := strconv.FormatUint(*val, 10) return setValue(path, name, str) } func setOptionalValueUint32(path, name string, val *uint32) error { if val == nil || *val == 0 { return nil } str := strconv.FormatUint(uint64(*val), 10) return setValue(path, name, str) } func setOptionalValueUint16(path, name string, val *uint16) error { if val == nil || *val == 0 { return nil } str := strconv.FormatUint(uint64(*val), 10) return setValue(path, name, str) } func setValue(path, name, data string) error { fullpath := filepath.Join(path, name) log.Debugf("Setting %q to %q", fullpath, data) return writeFile(fullpath, []byte(data), 0700) } // writeFile is similar to ioutil.WriteFile() but doesn't create the file if it // doesn't exist. func writeFile(path string, data []byte, perm os.FileMode) error { f, err := os.OpenFile(path, os.O_WRONLY|os.O_TRUNC, perm) if err != nil { return err } defer f.Close() _, err = f.Write(data) return err } func getValue(path, name string) (string, error) { fullpath := filepath.Join(path, name) out, err := ioutil.ReadFile(fullpath) if err != nil { return "", err } return string(out), nil } func getInt(path, name string) (int, error) { s, err := getValue(path, name) if err != nil { return 0, err } return strconv.Atoi(strings.TrimSpace(s)) } // fillFromAncestor sets the value of a cgroup file from the first ancestor // that has content. It does nothing if the file in 'path' has already been set. func fillFromAncestor(path string) (string, error) { out, err := ioutil.ReadFile(path) if err != nil { return "", err } val := strings.TrimSpace(string(out)) if val != "" { // File is set, stop here. return val, nil } // File is not set, recurse to parent and then set here. name := filepath.Base(path) parent := filepath.Dir(filepath.Dir(path)) val, err = fillFromAncestor(filepath.Join(parent, name)) if err != nil { return "", err } if err := writeFile(path, []byte(val), 0700); err != nil { return "", nil } return val, nil } // countCpuset returns the number of CPU in a string formatted like: // // "0-2,7,12-14 # bits 0, 1, 2, 7, 12, 13, and 14 set" - man 7 cpuset func countCpuset(cpuset string) (int, error) { var count int for _, p := range strings.Split(cpuset, ",") { interval := strings.Split(p, "-") switch len(interval) { case 1: if _, err := strconv.Atoi(interval[0]); err != nil { return 0, err } count++ case 2: start, err := strconv.Atoi(interval[0]) if err != nil { return 0, err } end, err := strconv.Atoi(interval[1]) if err != nil { return 0, err } if start < 0 || end < 0 || start > end { return 0, fmt.Errorf("invalid cpuset: %q", p) } count += end - start + 1 default: return 0, fmt.Errorf("invalid cpuset: %q", p) } } return count, nil } // loadPaths loads cgroup paths for given 'pid', may be set to 'self'. 
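// The result maps controller names to the process's cgroup path, e.g.
// {"cpu": "/user.slice", "memory": "/user.slice"} (paths are illustrative).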
func loadPaths(pid string) (map[string]string, error) { procCgroup, err := os.Open(filepath.Join(procRoot, pid, "cgroup")) if err != nil { return nil, err } defer procCgroup.Close() // Load mountinfo for the current process, because it's where cgroups is // being accessed from. mountinfo, err := os.Open(filepath.Join(procRoot, "self/mountinfo")) if err != nil { return nil, err } defer mountinfo.Close() return loadPathsHelper(procCgroup, mountinfo, IsOnlyV2()) } func loadPathsHelper(cgroup, mountinfo io.Reader, unified bool) (map[string]string, error) { paths := make(map[string]string) scanner := bufio.NewScanner(cgroup) for scanner.Scan() { // Format: ID:[name=]controller1,controller2:path // Example: 2:cpu,cpuacct:/user.slice tokens := strings.Split(scanner.Text(), ":") if len(tokens) != 3 { return nil, fmt.Errorf("invalid cgroups file, line: %q", scanner.Text()) } if len(tokens[1]) == 0 && unified { paths[cgroup2Key] = tokens[2] continue } if len(tokens[1]) == 0 { continue } for _, ctrlr := range strings.Split(tokens[1], ",") { // Remove prefix for cgroups with no controller, eg. systemd. ctrlr = strings.TrimPrefix(ctrlr, "name=") // Discard unknown controllers. if _, ok := controllers[ctrlr]; ok { paths[ctrlr] = tokens[2] } } } if err := scanner.Err(); err != nil { return nil, err } // For nested containers, in /proc/[pid]/cgroup we see paths from host, // which don't exist in container, so recover the container paths here by // double-checking with /proc/[pid]/mountinfo mountScanner := bufio.NewScanner(mountinfo) haveCg2Path := false for mountScanner.Scan() { // Format: ID parent major:minor root mount-point options opt-fields - fs-type source super-options // Example: 39 32 0:34 / /sys/fs/cgroup/devices rw,noexec shared:18 - cgroup cgroup rw,devices fields := strings.Fields(mountScanner.Text()) if len(fields) < 9 { // Skip mounts that are not cgroup mounts. continue } switch fields[len(fields)-3] { case cgroupv1FsName: // Cgroup controller type is in the super-options field. superOptions := strings.Split(fields[len(fields)-1], ",") for _, opt := range superOptions { // Remove prefix for cgroups with no controller, eg. systemd. opt = strings.TrimPrefix(opt, "name=") // Only considers cgroup controllers that are registered, and skip other // irrelevant options, e.g. rw. if cgroupPath, ok := paths[opt]; ok { rootDir := fields[3] if rootDir != "/" { // When cgroup is in submount, remove repeated path components from // cgroup path to avoid duplicating them. relCgroupPath, err := filepath.Rel(rootDir, cgroupPath) if err != nil { return nil, err } paths[opt] = relCgroupPath } } } case cgroupv2FsName: if cgroupPath, ok := paths[cgroup2Key]; !haveCg2Path && ok { root := fields[3] relCgroupPath, err := filepath.Rel(root, cgroupPath) if err != nil { return nil, err } haveCg2Path = true paths[cgroup2Key] = relCgroupPath } } } if err := mountScanner.Err(); err != nil { return nil, err } return paths, nil } // Cgroup represents a cgroup configuration. type Cgroup interface { Install(res *specs.LinuxResources) error Uninstall() error Join() (func(), error) CPUQuota() (float64, error) CPUUsage() (uint64, error) NumCPU() (int, error) MemoryLimit() (uint64, error) MakePath(controllerName string) string } // cgroupV1 represents a group inside all controllers. For example: // // Name='/foo/bar' maps to /sys/fs/cgroup//foo/bar on // all controllers. // // If Name is relative, it uses the parent cgroup path to determine the // location. 
For example: // // Name='foo/bar' and Parent[ctrl]="/user.slice", then it will map to // /sys/fs/cgroup//user.slice/foo/bar type cgroupV1 struct { Name string `json:"name"` Parents map[string]string `json:"parents"` Own map[string]bool `json:"own"` } // NewFromPath creates a new Cgroup instance from the specified relative path. // Cgroup paths are loaded based on the current process. // If useSystemd is true, the Cgroup will be created and managed with // systemd. This requires systemd (>=v244) to be running on the host and the // cgroup path to be in the form `slice:prefix:name`. func NewFromPath(cgroupsPath string, useSystemd bool) (Cgroup, error) { return new("self", cgroupsPath, useSystemd) } // NewFromPid loads cgroup for the given process. // If useSystemd is true, the Cgroup will be created and managed with // systemd. This requires systemd (>=v244) to be running on the host and the // cgroup path to be in the form `slice:prefix:name`. func NewFromPid(pid int, useSystemd bool) (Cgroup, error) { return new(strconv.Itoa(pid), "", useSystemd) } // LikelySystemdPath returns true if the path looks like a systemd path. This is // by no means an exhaustive check, it's just a useful proxy for logging a // warning. func LikelySystemdPath(path string) bool { parts := strings.SplitN(path, ":", 4) return len(parts) == 3 } // TransformSystemdPath transforms systemd path to be in the form // `slice:prefix:name`. It returns an error if path could not be parsed as a // valid systemd path. func TransformSystemdPath(path, cid string, rootless bool) (string, error) { if len(path) == 0 { path = fmt.Sprintf(":runsc:%s", cid) } parts := strings.SplitN(path, ":", 4) if len(parts) != 3 { return "", fmt.Errorf("invalid systemd path: %q", path) } slice, prefix, name := parts[0], parts[1], parts[2] if len(slice) == 0 { if rootless { slice = "user.slice" } else { slice = "system.slice" } } return fmt.Sprintf("%s:%s:%s", slice, prefix, name), nil } func new(pid, cgroupsPath string, useSystemd bool) (Cgroup, error) { var ( parents map[string]string err error cg Cgroup ) // If path is relative, load cgroup paths for the process to build the // relative paths. if !filepath.IsAbs(cgroupsPath) { parents, err = loadPaths(pid) if err != nil { return nil, fmt.Errorf("finding current cgroups: %w", err) } } if IsOnlyV2() { // The cgroupsPath is in a special `slice:prefix:name` format for systemd // that should not be modified. if p, ok := parents[cgroup2Key]; ok && !useSystemd { // The cgroup of current pid will have tasks in it and we can't use // that, instead, use the its parent which should not have tasks in it. cgroupsPath = filepath.Join(filepath.Dir(p), cgroupsPath) } // Assume that for v2, cgroup is always mounted at cgroupRoot. cg, err = newCgroupV2(cgroupRoot, cgroupsPath, useSystemd) if err != nil { return nil, err } } else { cg = &cgroupV1{ Name: cgroupsPath, Parents: parents, Own: make(map[string]bool), } } log.Debugf("New cgroup for pid: %s, %T: %+v", pid, cg, cg) return cg, nil } // CgroupJSON is a wrapper for Cgroup that can be encoded to JSON. 
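// The concrete type is encoded under a discriminating key, e.g. a cgroupV1 value
// is serialized as {"cgroupv1": {...}} and a cgroupV2 value as {"cgroupv2": {...}};
// UnmarshalJSON below uses that key to reconstruct the right type.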
type CgroupJSON struct { Cgroup Cgroup } type cgroupJSONv1 struct { Cgroup *cgroupV1 `json:"cgroupv1"` } type cgroupJSONv2 struct { Cgroup *cgroupV2 `json:"cgroupv2"` } type cgroupJSONSystemd struct { Cgroup *cgroupSystemd `json:"cgroupsystemd"` } type cgroupJSONUnknown struct { Cgroup any `json:"cgroupunknown"` } // UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON func (c *CgroupJSON) UnmarshalJSON(data []byte) error { m := map[string]json.RawMessage{} if err := json.Unmarshal(data, &m); err != nil { return err } var cg Cgroup if rm, ok := m["cgroupv1"]; ok { cg = &cgroupV1{} if err := json.Unmarshal(rm, cg); err != nil { return err } } else if rm, ok := m["cgroupv2"]; ok { cg = &cgroupV2{} if err := json.Unmarshal(rm, cg); err != nil { return err } } else if rm, ok := m["cgroupsystemd"]; ok { cg = &cgroupSystemd{} if err := json.Unmarshal(rm, cg); err != nil { return err } } c.Cgroup = cg return nil } // MarshalJSON implements json.Marshaler.MarshalJSON func (c *CgroupJSON) MarshalJSON() ([]byte, error) { if c.Cgroup == nil { return json.Marshal(cgroupJSONUnknown{}) } switch c.Cgroup.(type) { case *cgroupV1: return json.Marshal(cgroupJSONv1{Cgroup: c.Cgroup.(*cgroupV1)}) case *cgroupV2: return json.Marshal(cgroupJSONv2{Cgroup: c.Cgroup.(*cgroupV2)}) case *cgroupSystemd: return json.Marshal(cgroupJSONSystemd{Cgroup: c.Cgroup.(*cgroupSystemd)}) } return nil, nil } // Install creates and configures cgroups according to 'res'. If cgroup path // already exists, it means that the caller has already provided a // pre-configured cgroups, and 'res' is ignored. func (c *cgroupV1) Install(res *specs.LinuxResources) error { log.Debugf("Installing cgroup path %q", c.Name) // Clean up partially created cgroups on error. Errors during cleanup itself // are ignored. clean := cleanup.Make(func() { _ = c.Uninstall() }) defer clean.Clean() // Controllers can be symlinks to a group of controllers (e.g. cpu,cpuacct). // So first check what directories need to be created. Otherwise, when // the directory for one of the controllers in a group is created, it will // make it seem like the directory already existed and it's not owned by the // other controllers in the group. var missing []string for key := range controllers { path := c.MakePath(key) if _, err := os.Stat(path); err != nil { missing = append(missing, key) } else { log.Debugf("Using pre-created cgroup %q: %q", key, path) } } for _, key := range missing { ctrlr := controllers[key] if skip, err := createController(c, key); skip && ctrlr.optional() { if err := ctrlr.skip(res); err != nil { return err } log.Infof("Skipping cgroup %q, err: %v", key, err) continue } else if err != nil { return err } // Only set controllers that were created by me. c.Own[key] = true path := c.MakePath(key) if err := ctrlr.set(res, path); err != nil { return err } } clean.Release() return nil } // createController creates the controller directory, checking that the // controller is enabled in the system. It returns a boolean indicating whether // the controller should be skipped (e.g. controller is disabled). In case it // should be skipped, it also returns the error it got. 
func createController(c Cgroup, name string) (bool, error) { ctrlrPath := filepath.Join(cgroupRoot, name) if _, err := os.Stat(ctrlrPath); err != nil { return os.IsNotExist(err), err } path := c.MakePath(name) log.Debugf("Creating cgroup %q: %q", name, path) if err := os.MkdirAll(path, 0755); err != nil { return errors.Is(err, unix.EROFS), err } return false, nil } // Uninstall removes the settings done in Install(). If cgroup path already // existed when Install() was called, Uninstall is a noop. func (c *cgroupV1) Uninstall() error { log.Debugf("Deleting cgroup %q", c.Name) g, ctx := errgroup.WithContext(context.Background()) for key := range controllers { if !c.Own[key] { // cgroup is managed by caller, don't touch it. continue } path := c.MakePath(key) log.Debugf("Removing cgroup controller for key=%q path=%q", key, path) // If we try to remove the cgroup too soon after killing the sandbox we // might get EBUSY, so we retry for a few seconds until it succeeds. ctx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) fn := func() error { err := unix.Rmdir(path) if os.IsNotExist(err) { return nil } return err } // Run deletions in parallel to remove all directories even if there are // failures/timeouts in other directories. g.Go(func() error { if err := backoff.Retry(fn, b); err != nil { return fmt.Errorf("removing cgroup path %q: %w", path, err) } return nil }) } return g.Wait() } // Join adds the current process to the all controllers. Returns function that // restores cgroup to the original state. func (c *cgroupV1) Join() (func(), error) { // First save the current state so it can be restored. paths, err := loadPaths("self") if err != nil { return nil, err } var undoPaths []string for ctrlr, path := range paths { // Skip controllers we don't handle. if _, ok := controllers[ctrlr]; ok { fullPath := filepath.Join(cgroupRoot, ctrlr, path) undoPaths = append(undoPaths, fullPath) } } cu := cleanup.Make(func() { for _, path := range undoPaths { log.Debugf("Restoring cgroup %q", path) // Writing the value 0 to a cgroup.procs file causes // the writing process to be moved to the corresponding // cgroup. - cgroups(7). if err := setValue(path, "cgroup.procs", "0"); err != nil { log.Warningf("Error restoring cgroup %q: %v", path, err) } } }) defer cu.Clean() // Now join the cgroups. for key, ctrlr := range controllers { path := c.MakePath(key) log.Debugf("Joining cgroup %q", path) // Writing the value 0 to a cgroup.procs file causes the writing process to // be moved to the corresponding cgroup - cgroups(7). if err := setValue(path, "cgroup.procs", "0"); err != nil { if ctrlr.optional() && os.IsNotExist(err) { continue } return nil, err } } return cu.Release(), nil } // CPUQuota returns the CFS CPU quota. func (c *cgroupV1) CPUQuota() (float64, error) { path := c.MakePath("cpu") quota, err := getInt(path, "cpu.cfs_quota_us") if err != nil { return -1, err } period, err := getInt(path, "cpu.cfs_period_us") if err != nil { return -1, err } if quota <= 0 || period <= 0 { return -1, err } return float64(quota) / float64(period), nil } // CPUUsage returns the total CPU usage of the cgroup in nanoseconds. func (c *cgroupV1) CPUUsage() (uint64, error) { path := c.MakePath("cpuacct") usage, err := getValue(path, "cpuacct.usage") if err != nil { return 0, err } return strconv.ParseUint(strings.TrimSpace(usage), 10, 64) } // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. 
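// For example (hypothetical file contents), a cpuset.cpus value of "0-3,8"
// yields:
//
//	n, _ := c.NumCPU() // n == 5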
func (c *cgroupV1) NumCPU() (int, error) { path := c.MakePath("cpuset") cpuset, err := getValue(path, "cpuset.cpus") if err != nil { return 0, err } return countCpuset(strings.TrimSpace(cpuset)) } // MemoryLimit returns the memory limit. func (c *cgroupV1) MemoryLimit() (uint64, error) { path := c.MakePath("memory") limStr, err := getValue(path, "memory.limit_in_bytes") if err != nil { return 0, err } return strconv.ParseUint(strings.TrimSpace(limStr), 10, 64) } // MakePath builds a path to the given controller. func (c *cgroupV1) MakePath(controllerName string) string { path := c.Name if parent, ok := c.Parents[controllerName]; ok { path = filepath.Join(parent, c.Name) } return filepath.Join(cgroupRoot, controllerName, path) } type controller interface { // optional controllers don't fail if not found. optional() bool // set applies resource limits to controller. set(*specs.LinuxResources, string) error // skip is called when controller is not found to check if it can be safely // skipped or not based on the spec. skip(*specs.LinuxResources) error } type noop struct{} func (n *noop) optional() bool { return true } func (*noop) set(*specs.LinuxResources, string) error { return nil } func (n *noop) skip(*specs.LinuxResources) error { return nil } type mandatory struct{} func (*mandatory) optional() bool { return false } func (*mandatory) skip(*specs.LinuxResources) error { panic("cgroup controller is not optional") } type memory struct { mandatory } func (*memory) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Memory == nil { return nil } if err := setOptionalValueInt(path, "memory.limit_in_bytes", spec.Memory.Limit); err != nil { return err } if err := setOptionalValueInt(path, "memory.soft_limit_in_bytes", spec.Memory.Reservation); err != nil { return err } if err := setOptionalValueInt(path, "memory.memsw.limit_in_bytes", spec.Memory.Swap); err != nil { return err } if err := setOptionalValueInt(path, "memory.kmem.limit_in_bytes", spec.Memory.Kernel); err != nil { return err } if err := setOptionalValueInt(path, "memory.kmem.tcp.limit_in_bytes", spec.Memory.KernelTCP); err != nil { return err } if err := setOptionalValueUint(path, "memory.swappiness", spec.Memory.Swappiness); err != nil { return err } if spec.Memory.DisableOOMKiller != nil && *spec.Memory.DisableOOMKiller { if err := setValue(path, "memory.oom_control", "1"); err != nil { return err } } return nil } type cpu struct { mandatory } func (*cpu) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.CPU == nil { return nil } if err := setOptionalValueUint(path, "cpu.shares", spec.CPU.Shares); err != nil { return err } if err := setOptionalValueInt(path, "cpu.cfs_quota_us", spec.CPU.Quota); err != nil { return err } if err := setOptionalValueUint(path, "cpu.cfs_period_us", spec.CPU.Period); err != nil { return err } if err := setOptionalValueUint(path, "cpu.rt_period_us", spec.CPU.RealtimePeriod); err != nil { return err } return setOptionalValueInt(path, "cpu.rt_runtime_us", spec.CPU.RealtimeRuntime) } type cpuSet struct { mandatory } func (*cpuSet) set(spec *specs.LinuxResources, path string) error { // cpuset.cpus and mems are required fields, but are not set on a new cgroup. // If not set in the spec, get it from one of the ancestors cgroup. 
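// A sketch of the two branches below, with hypothetical values:
//
//	spec.CPU == nil or spec.CPU.Cpus == ""  ->  fillFromAncestor(path+"/cpuset.cpus") // inherit e.g. "0-3"
//	spec.CPU.Cpus == "0,2"                  ->  setValue(path, "cpuset.cpus", "0,2")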
if spec == nil || spec.CPU == nil || spec.CPU.Cpus == "" { if _, err := fillFromAncestor(filepath.Join(path, "cpuset.cpus")); err != nil { return err } } else { if err := setValue(path, "cpuset.cpus", spec.CPU.Cpus); err != nil { return err } } if spec == nil || spec.CPU == nil || spec.CPU.Mems == "" { _, err := fillFromAncestor(filepath.Join(path, "cpuset.mems")) return err } return setValue(path, "cpuset.mems", spec.CPU.Mems) } type blockIO struct { mandatory } func (*blockIO) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.BlockIO == nil { return nil } if err := setOptionalValueUint16(path, "blkio.weight", spec.BlockIO.Weight); err != nil { return err } if err := setOptionalValueUint16(path, "blkio.leaf_weight", spec.BlockIO.LeafWeight); err != nil { return err } for _, dev := range spec.BlockIO.WeightDevice { if dev.Weight != nil { val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight) if err := setValue(path, "blkio.weight_device", val); err != nil { return err } } if dev.LeafWeight != nil { val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.LeafWeight) if err := setValue(path, "blkio.leaf_weight_device", val); err != nil { return err } } } if err := setThrottle(path, "blkio.throttle.read_bps_device", spec.BlockIO.ThrottleReadBpsDevice); err != nil { return err } if err := setThrottle(path, "blkio.throttle.write_bps_device", spec.BlockIO.ThrottleWriteBpsDevice); err != nil { return err } if err := setThrottle(path, "blkio.throttle.read_iops_device", spec.BlockIO.ThrottleReadIOPSDevice); err != nil { return err } return setThrottle(path, "blkio.throttle.write_iops_device", spec.BlockIO.ThrottleWriteIOPSDevice) } func setThrottle(path, name string, devs []specs.LinuxThrottleDevice) error { for _, dev := range devs { val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Rate) if err := setValue(path, name, val); err != nil { return err } } return nil } type networkClass struct{} func (*networkClass) optional() bool { return true } func (*networkClass) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Network == nil { return nil } return setOptionalValueUint32(path, "net_cls.classid", spec.Network.ClassID) } func (*networkClass) skip(spec *specs.LinuxResources) error { if spec != nil && spec.Network != nil && spec.Network.ClassID != nil { return fmt.Errorf("Network.ClassID set but net_cls cgroup controller not found") } return nil } type networkPrio struct{} func (*networkPrio) optional() bool { return true } func (*networkPrio) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Network == nil { return nil } for _, prio := range spec.Network.Priorities { val := fmt.Sprintf("%s %d", prio.Name, prio.Priority) if err := setValue(path, "net_prio.ifpriomap", val); err != nil { return err } } return nil } func (*networkPrio) skip(spec *specs.LinuxResources) error { if spec != nil && spec.Network != nil && len(spec.Network.Priorities) > 0 { return fmt.Errorf("Network.Priorities set but net_prio cgroup controller not found") } return nil } type pids struct{} func (*pids) optional() bool { return true } func (*pids) skip(spec *specs.LinuxResources) error { if spec != nil && spec.Pids != nil && spec.Pids.Limit > 0 { return fmt.Errorf("Pids.Limit set but pids cgroup controller not found") } return nil } func (*pids) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Pids == nil || spec.Pids.Limit <= 0 { return nil } val := strconv.FormatInt(spec.Pids.Limit, 10) return 
setValue(path, "pids.max", val) } type hugeTLB struct{} func (*hugeTLB) optional() bool { return true } func (*hugeTLB) skip(spec *specs.LinuxResources) error { if spec != nil && len(spec.HugepageLimits) > 0 { return fmt.Errorf("HugepageLimits set but hugetlb cgroup controller not found") } return nil } func (*hugeTLB) set(spec *specs.LinuxResources, path string) error { if spec == nil { return nil } for _, limit := range spec.HugepageLimits { name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize) val := strconv.FormatUint(limit.Limit, 10) if err := setValue(path, name, val); err != nil { return err } } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cgroup/cgroup_state_autogen.go000066400000000000000000000000701465435605700260410ustar00rootroot00000000000000// automatically generated by stateify. package cgroup golang-gvisor-gvisor-0.0~20240729.0/runsc/cgroup/cgroup_v2.go000066400000000000000000000600261465435605700235350ustar00rootroot00000000000000// Copyright The runc Authors. // Copyright The containerd Authors. // Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroup import ( "bufio" "bytes" "context" "errors" "fmt" "io/ioutil" "math" "math/big" "os" "path/filepath" "strconv" "strings" "time" "github.com/cenkalti/backoff" "github.com/coreos/go-systemd/v22/dbus" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/log" ) const ( subtreeControl = "cgroup.subtree_control" controllersFile = "cgroup.controllers" cgroup2Key = "cgroup2" memoryLimitCgroup = "memory.max" cpuLimitCgroup = "cpu.max" maxLimitStr = "max" // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html defaultPeriod = 100000 ) var ( ErrInvalidFormat = errors.New("cgroup: parsing file with invalid format failed") ErrInvalidGroupPath = errors.New("cgroup: invalid group path") // controllers2 is the group of all supported cgroupv2 controllers controllers2 = map[string]controllerv2{ "cpu": &cpu2{}, "cpuset": &cpuset2{}, "io": &io2{}, "memory": &memory2{}, "pids": &pid2{}, "hugetlb": &hugeTLB2{}, } ) // cgroupV2 represents a cgroup inside supported all cgroupV2 controllers type cgroupV2 struct { // Mountpoint is the unified mount point of cgroupV2 Mountpoint string `json:"mountpoint"` // Path is the relative path to the unified mountpoint Path string `json:"path"` // Controllers is the list of supported controllers Controllers []string `json:"controllers"` // Own is the list of owned path created when install this cgroup Own []string `json:"own"` } func newCgroupV2(mountpoint, group string, useSystemd bool) (Cgroup, error) { data, err := ioutil.ReadFile(filepath.Join(mountpoint, "cgroup.controllers")) if err != nil { return nil, err } cg := &cgroupV2{ Mountpoint: mountpoint, Path: group, Controllers: strings.Fields(string(data)), } if useSystemd { return newCgroupV2Systemd(cg) } return cg, err } func (c *cgroupV2) createCgroupPaths() (bool, error) { // setup all known 
controllers for the current subtree // For example, given path /foo/bar and mount /sys/fs/cgroup, we need to write // the controllers to: // * /sys/fs/cgroup/cgroup.subtree_control // * /sys/fs/cgroup/foo/cgroup.subtree_control val := "+" + strings.Join(c.Controllers, " +") elements := strings.Split(c.Path, "/") current := c.Mountpoint created := false for i, e := range elements { current = filepath.Join(current, e) if i > 0 { if err := os.Mkdir(current, 0o755); err != nil { if !os.IsExist(err) { return false, err } } else { created = true c.Own = append(c.Own, current) } } // enable all known controllers for subtree if i < len(elements)-1 { if err := writeFile(filepath.Join(current, subtreeControl), []byte(val), 0700); err != nil { return false, err } } } return created, nil } // Install creates and configures cgroups. func (c *cgroupV2) Install(res *specs.LinuxResources) error { log.Debugf("Installing cgroup path %q", c.MakePath("")) // Clean up partially created cgroups on error. Errors during cleanup itself // are ignored. clean := cleanup.Make(func() { _ = c.Uninstall() }) defer clean.Clean() created, err := c.createCgroupPaths() if err != nil { return err } if created { // If we created our final cgroup path then we can set the resources. for controllerName, ctrlr := range controllers2 { // First check if our controller is found in the system. found := false for _, knownController := range c.Controllers { if controllerName == knownController { found = true } } // In case we don't have the controller. if found { if err := ctrlr.set(res, c.MakePath("")); err != nil { return err } continue } if ctrlr.optional() { if err := ctrlr.skip(res); err != nil { return err } } else { return fmt.Errorf("mandatory cgroup controller %q is missing for %q", controllerName, c.MakePath("")) } } } clean.Release() return nil } // Uninstall removes the settings done in Install(). If cgroup path already // existed when Install() was called, Uninstall is a noop. func (c *cgroupV2) Uninstall() error { log.Debugf("Deleting cgroup %q", c.MakePath("")) // If we try to remove the cgroup too soon after killing the sandbox we // might get EBUSY, so we retry for a few seconds until it succeeds. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) // Deletion must occur reverse order, because they may contain ancestors. for i := len(c.Own) - 1; i >= 0; i-- { current := c.Own[i] log.Debugf("Removing cgroup for path=%q", current) fn := func() error { err := unix.Rmdir(current) if os.IsNotExist(err) { return nil } return err } if err := backoff.Retry(fn, b); err != nil { return fmt.Errorf("removing cgroup path %q: %w", current, err) } } return nil } // Join adds the current process to the all controllers. Returns function that // restores cgroup to the original state. func (c *cgroupV2) Join() (func(), error) { // First save the current state so it can be restored. paths, err := loadPaths("self") if err != nil { return nil, err } // Since this is unified, get the first path of current process's cgroup is // enough. undoPath := filepath.Join(c.Mountpoint, paths[cgroup2Key]) cu := cleanup.Make(func() { log.Debugf("Restoring cgroup %q", undoPath) // Writing the value 0 to a cgroup.procs file causes // the writing process to be moved to the corresponding // cgroup. - cgroups(7). 
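// (Concretely, with a hypothetical undoPath of "/sys/fs/cgroup/user.slice",
// the call below amounts to setValue("/sys/fs/cgroup/user.slice",
// "cgroup.procs", "0"), i.e. moving this process back into its original
// cgroup.)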
if err := setValue(undoPath, "cgroup.procs", "0"); err != nil { log.Warningf("Error restoring cgroup %q: %v", undoPath, err) } }) defer cu.Clean() // now join the cgroup if err := setValue(c.MakePath(""), "cgroup.procs", "0"); err != nil { return nil, err } return cu.Release(), nil } func getCPUQuota(path string) (float64, error) { cpuMax, err := getValue(path, cpuLimitCgroup) if err != nil { return -1, err } return parseCPUQuota(cpuMax) } // CPUQuota returns the CFS CPU quota. func (c *cgroupV2) CPUQuota() (float64, error) { cpuQuota, err := getCPUQuota(c.MakePath("")) if err != nil { return -1, err } // In cgroupv2+systemd, limits are set in the parent slice rather // than the leaf node. Check the parent to see if this is the case. if cpuQuota == -1 { cpuQuota, err = getCPUQuota(filepath.Dir(c.MakePath(""))) if err != nil && errors.Is(err, os.ErrNotExist) { err = nil } } return cpuQuota, nil } func parseCPUQuota(cpuMax string) (float64, error) { data := strings.SplitN(strings.TrimSpace(cpuMax), " ", 2) if len(data) != 2 { return -1, fmt.Errorf("invalid cpu.max data %q", cpuMax) } // no cpu limit if quota is max if data[0] == maxLimitStr { return -1, nil } quota, err := strconv.ParseInt(data[0], 10, 64) if err != nil { return -1, err } period, err := strconv.ParseInt(data[1], 10, 64) if err != nil { return -1, err } if quota <= 0 || period <= 0 { return -1, err } return float64(quota) / float64(period), nil } // CPUUsage returns the total CPU usage of the cgroup in nanoseconds. func (c *cgroupV2) CPUUsage() (uint64, error) { cpuStat, err := getValue(c.MakePath(""), "cpu.stat") if err != nil { return 0, err } sc := bufio.NewScanner(strings.NewReader(cpuStat)) for sc.Scan() { key, value, err := parseKeyValue(sc.Text()) if err != nil { return 0, err } if key == "usage_usec" { return value * 1000, nil } } return 0, nil } // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'. func (c *cgroupV2) NumCPU() (int, error) { cpuset, err := getValue(c.MakePath(""), "cpuset.cpus.effective") if err != nil { return 0, err } return countCpuset(strings.TrimSpace(cpuset)) } func getMemoryLimit(path string) (string, error) { limStr, err := getValue(path, memoryLimitCgroup) if err != nil { return "", err } return strings.TrimSpace(limStr), nil } // MemoryLimit returns the memory limit. func (c *cgroupV2) MemoryLimit() (uint64, error) { limStr, err := getMemoryLimit(c.MakePath("")) if err != nil { return 0, err } // In cgroupv2+systemd, limits are set in the parent slice rather // than the leaf node. Check the parent to see if this is the case. if limStr == maxLimitStr { parentLimStr, err := getMemoryLimit(filepath.Dir(c.MakePath(""))) if err != nil && !errors.Is(err, os.ErrNotExist) { return 0, err } if parentLimStr != "" { limStr = parentLimStr } if limStr == maxLimitStr { return math.MaxUint64, nil } } return strconv.ParseUint(limStr, 10, 64) } // MakePath builds a path to the given controller. 
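// Unlike cgroupV1.MakePath, the controller argument is ignored here because
// cgroup v2 is a unified hierarchy; e.g. (hypothetical values):
//
//	(&cgroupV2{Mountpoint: "/sys/fs/cgroup", Path: "/foo/bar"}).MakePath("cpu")
//	// -> "/sys/fs/cgroup/foo/bar"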
func (c *cgroupV2) MakePath(string) string { return filepath.Join(c.Mountpoint, c.Path) } type controllerv2 interface { controller generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) } type cpu2 struct { mandatory } func (*cpu2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { props := []dbus.Property{} if spec == nil || spec.CPU == nil { return props, nil } cpu := spec.CPU if cpu.Shares != nil { weight := convertCPUSharesToCgroupV2Value(*cpu.Shares) if weight != 0 { props = append(props, newProp("CPUWeight", weight)) } } var ( period uint64 quota int64 ) if cpu.Period != nil { period = *cpu.Period } if cpu.Quota != nil { quota = *cpu.Quota } if period != 0 { props = append(props, newProp("CPUQuotaPeriodUSec", period)) } if quota != 0 || period != 0 { // Corresponds to USEC_INFINITY in systemd. cpuQuotaPerSecUSec := uint64(math.MaxUint64) if quota > 0 { if period == 0 { // Assume the default. period = defaultPeriod } // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to // CPUQuota (integer percentage of CPU) internally. This means that if a // fractional percent of CPU is indicated by spec.CPU.Quota, we need to // round up to the nearest 10ms (1% of a second) such that child cgroups // can set the cpu.cfs_quota_us they expect. cpuQuotaPerSecUSec = uint64(quota*1000000) / period if cpuQuotaPerSecUSec%10000 != 0 { cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 } } props = append(props, newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) } return props, nil } func (*cpu2) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.CPU == nil { return nil } if spec.CPU.Shares != nil { weight := convertCPUSharesToCgroupV2Value(*spec.CPU.Shares) if weight != 0 { if err := setValue(path, "cpu.weight", strconv.FormatUint(weight, 10)); err != nil { return err } } } if spec.CPU.Period != nil || spec.CPU.Quota != nil { v := maxLimitStr if spec.CPU.Quota != nil && *spec.CPU.Quota > 0 { v = strconv.FormatInt(*spec.CPU.Quota, 10) } var period uint64 if spec.CPU.Period != nil && *spec.CPU.Period != 0 { period = *spec.CPU.Period } else { period = defaultPeriod } v += " " + strconv.FormatUint(period, 10) if err := setValue(path, "cpu.max", v); err != nil { return err } } return nil } type cpuset2 struct { mandatory } func (*cpuset2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { props := []dbus.Property{} if spec == nil || spec.CPU == nil { return props, nil } cpu := spec.CPU if cpu.Cpus == "" && cpu.Mems == "" { return props, nil } cpus := cpu.Cpus mems := cpu.Mems if cpus != "" { bits, err := RangeToBits(cpus) if err != nil { return nil, fmt.Errorf("%w: cpus=%q conversion error: %v", ErrBadResourceSpec, cpus, err) } props = append(props, newProp("AllowedCPUs", bits)) } if mems != "" { bits, err := RangeToBits(mems) if err != nil { return nil, fmt.Errorf("%w: mems=%q conversion error: %v", ErrBadResourceSpec, mems, err) } props = append(props, newProp("AllowedMemoryNodes", bits)) } return props, nil } func (*cpuset2) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.CPU == nil { return nil } if spec.CPU.Cpus != "" { if err := setValue(path, "cpuset.cpus", spec.CPU.Cpus); err != nil { return err } } if spec.CPU.Mems != "" { if err := setValue(path, "cpuset.mems", spec.CPU.Mems); err != nil { return err } } return nil } type memory2 struct { mandatory } func (*memory2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { props := 
[]dbus.Property{} if spec == nil || spec.Memory == nil { return props, nil } mem := spec.Memory if mem.Swap != nil { if mem.Limit == nil { return nil, ErrBadResourceSpec } swap, err := convertMemorySwapToCgroupV2Value(*mem.Swap, *mem.Limit) if err != nil { return nil, err } props = append(props, newProp("MemorySwapMax", uint64(swap))) } if mem.Limit != nil { props = append(props, newProp("MemoryMax", uint64(*mem.Limit))) } if mem.Reservation != nil { props = append(props, newProp("MemoryLow", uint64(*mem.Reservation))) } return props, nil } func (*memory2) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Memory == nil { return nil } if spec.Memory.Swap != nil { // in cgroup v2, we set memory and swap separately, but the spec specifies // Swap field as memory+swap, so we need memory limit here to be set in // order to get the correct swap value. if spec.Memory.Limit == nil { return errors.New("cgroup: Memory.Swap is set without Memory.Limit") } swap, err := convertMemorySwapToCgroupV2Value(*spec.Memory.Swap, *spec.Memory.Limit) if err != nil { return nil } swapStr := numToStr(swap) // memory and memorySwap set to the same value -- disable swap if swapStr == "" && swap == 0 && *spec.Memory.Swap > 0 { swapStr = "0" } // never write empty string to `memory.swap.max`, it means set to 0. if swapStr != "" { if err := setValue(path, "memory.swap.max", swapStr); err != nil { return err } } } if spec.Memory.Limit != nil { if val := numToStr(*spec.Memory.Limit); val != "" { if err := setValue(path, "memory.max", val); err != nil { return err } } } if spec.Memory.Reservation != nil { if val := numToStr(*spec.Memory.Reservation); val != "" { if err := setValue(path, "memory.low", val); err != nil { return err } } } return nil } type pid2 struct { mandatory } func (*pid2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { if spec != nil && spec.Pids != nil { return []dbus.Property{newProp("TasksMax", uint64(spec.Pids.Limit))}, nil } return []dbus.Property{}, nil } func (*pid2) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.Pids == nil { return nil } if val := numToStr(spec.Pids.Limit); val != "" { return setValue(path, "pids.max", val) } return nil } type io2 struct { mandatory } func (*io2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { props := []dbus.Property{} if spec == nil || spec.BlockIO == nil { return props, nil } io := spec.BlockIO if io != nil { if io.Weight != nil && *io.Weight != 0 { ioWeight := convertBlkIOToIOWeightValue(*io.Weight) props = append(props, newProp("IOWeight", ioWeight)) } for _, dev := range io.WeightDevice { val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, *dev.Weight) props = append(props, newProp("IODeviceWeight", val)) } props = addIOProps(props, "IOReadBandwidthMax", io.ThrottleReadBpsDevice) props = addIOProps(props, "IOWriteBandwidthMax", io.ThrottleWriteBpsDevice) props = addIOProps(props, "IOReadIOPSMax", io.ThrottleReadIOPSDevice) props = addIOProps(props, "IOWriteIOPSMax", io.ThrottleWriteIOPSDevice) } return props, nil } func (*io2) set(spec *specs.LinuxResources, path string) error { if spec == nil || spec.BlockIO == nil { return nil } blkio := spec.BlockIO var ( err error bfq *os.File ) // If BFQ IO scheduler is available, use it. 
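// Illustrative numbers for the weight handling below (hypothetical spec
// values): BlockIO.Weight = 500 is written verbatim to io.bfq.weight when the
// BFQ scheduler is available, and otherwise converted for io.weight via
// convertBlkIOToIOWeightValue(500) == 4950 (the linear [10-1000] -> [1-10000]
// mapping defined later in this file).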
if blkio.Weight != nil || len(blkio.WeightDevice) > 0 { bfq, err = os.Open(filepath.Join(path, "io.bfq.weight")) if err == nil { defer bfq.Close() } else if !os.IsNotExist(err) { return err } } if blkio.Weight != nil && *blkio.Weight != 0 { if bfq != nil { if _, err := bfq.WriteString(strconv.FormatUint(uint64(*blkio.Weight), 10)); err != nil { return err } } else { // bfq io scheduler is not available, fallback to io.weight with // a conversion scheme ioWeight := convertBlkIOToIOWeightValue(*blkio.Weight) if err = setValue(path, "io.weight", strconv.FormatUint(ioWeight, 10)); err != nil { return err } } } if bfqDeviceWeightSupported(bfq) { // ignore leaf weight, does not apply to cgroupv2 for _, dev := range blkio.WeightDevice { if dev.Weight != nil { val := fmt.Sprintf("%d:%d %d\n", dev.Major, dev.Minor, *dev.Weight) if _, err := bfq.WriteString(val); err != nil { return fmt.Errorf("failed to set device weight %q: %w", val, err) } } } } if err := setThrottle2(path, "rbps", blkio.ThrottleReadBpsDevice); err != nil { return err } if err := setThrottle2(path, "wbps", blkio.ThrottleWriteBpsDevice); err != nil { return err } if err := setThrottle2(path, "riops", blkio.ThrottleReadIOPSDevice); err != nil { return err } if err := setThrottle2(path, "wiops", blkio.ThrottleWriteIOPSDevice); err != nil { return err } return nil } func setThrottle2(path, name string, devs []specs.LinuxThrottleDevice) error { for _, dev := range devs { val := fmt.Sprintf("%d:%d %s=%d", dev.Major, dev.Minor, name, dev.Rate) if err := setValue(path, "io.max", val); err != nil { return err } } return nil } type hugeTLB2 struct { } func (*hugeTLB2) optional() bool { return true } func (*hugeTLB2) skip(spec *specs.LinuxResources) error { if spec != nil && len(spec.HugepageLimits) > 0 { return fmt.Errorf("HugepageLimits set but hugetlb cgroup controller not found") } return nil } func (*hugeTLB2) generateProperties(spec *specs.LinuxResources) ([]dbus.Property, error) { return nil, nil } func (*hugeTLB2) set(spec *specs.LinuxResources, path string) error { if spec == nil { return nil } for _, limit := range spec.HugepageLimits { name := fmt.Sprintf("hugetlb.%s.limit_in_bytes", limit.Pagesize) val := strconv.FormatUint(limit.Limit, 10) if err := setValue(path, name, val); err != nil { return err } } return nil } // Since the OCI spec is designed for cgroup v1, in some cases // there is need to convert from the cgroup v1 configuration to cgroup v2 // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142) // convert from [2-262144] to [1-10000] // 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)" func convertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 { if cpuShares == 0 { return 0 } return (1 + ((cpuShares-2)*9999)/262142) } // convertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec // for use by cgroup v2 drivers. A conversion is needed since // Resources.MemorySwap is defined as memory+swap combined, while in cgroup v2 // swap is a separate value. func convertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) { // for compatibility with cgroup1 controller, set swap to unlimited in // case the memory is set to unlimited, and swap is not explicitly set, // treating the request as "set both memory and swap to unlimited". if memory == -1 && memorySwap == 0 { return -1, nil } if memorySwap == -1 || memorySwap == 0 { // -1 is "max", 0 is "unset", so treat as is. 
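// (E.g. a hypothetical spec with memory == -1 and memorySwap == -1 returns -1
// here, meaning "max"; the memory+swap -> swap-only subtraction for fully
// specified limits happens below.)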
return memorySwap, nil } // sanity checks if memory == 0 || memory == -1 { return 0, errors.New("unable to set swap limit without memory limit") } if memory < 0 { return 0, fmt.Errorf("invalid memory value: %d", memory) } if memorySwap < memory { return 0, errors.New("memory+swap limit should be >= memory limit") } return memorySwap - memory, nil } // Since the OCI spec is designed for cgroup v1, in some cases // there is need to convert from the cgroup v1 configuration to cgroup v2 // the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990) // convert linearly from [10-1000] to [1-10000] func convertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 { if blkIoWeight == 0 { return 0 } return 1 + (uint64(blkIoWeight)-10)*9999/990 } // numToStr converts an int64 value to a string for writing to a // cgroupv2 files with .min, .max, .low, or .high suffix. // The value of -1 is converted to "max" for cgroupv1 compatibility // (which used to write -1 to remove the limit). func numToStr(value int64) (ret string) { switch { case value == 0: ret = "" case value == -1: ret = maxLimitStr default: ret = strconv.FormatInt(value, 10) } return ret } // bfqDeviceWeightSupported checks for per-device BFQ weight support (added // in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight". func bfqDeviceWeightSupported(bfq *os.File) bool { if bfq == nil { return false } if _, err := bfq.Seek(0, 0); err != nil { return false } buf := make([]byte, 32) if _, err := bfq.Read(buf); err != nil { return false } // If only a single number (default weight) if read back, we have older // kernel. _, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64) return err != nil } // parseKeyValue parses a space-separated "name value" kind of cgroup // parameter and returns its key as a string, and its value as uint64 // (ParseUint is used to convert the value). For example, // "io_service_bytes 1234" will be returned as "io_service_bytes", 1234. func parseKeyValue(t string) (string, uint64, error) { parts := strings.SplitN(t, " ", 3) if len(parts) != 2 { return "", 0, fmt.Errorf("line %q is not in key value format", t) } value, err := parseUint(parts[1], 10, 64) if err != nil { return "", 0, err } return parts[0], value, nil } // parseUint converts a string to an uint64 integer. // Negative values are returned at zero as, due to kernel bugs, // some of the memory cgroup stats can be negative. func parseUint(s string, base, bitSize int) (uint64, error) { value, err := strconv.ParseUint(s, base, bitSize) if err != nil { intValue, intErr := strconv.ParseInt(s, base, bitSize) // 1. Handle negative values greater than MinInt64 (and) // 2. Handle negative values lesser than MinInt64 if intErr == nil && intValue < 0 { return 0, nil } else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 { return 0, nil } return value, err } return value, nil } // RangeToBits converts a text representation of a CPU mask (as written to // or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes // with the corresponding bits set (as consumed by systemd over dbus as // AllowedCPUs/AllowedMemoryNodes unit property value). // Copied from runc. 
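// For example (an illustrative mask): "1,3-5" sets bits 1, 3, 4 and 5, so
// RangeToBits("1,3-5") returns []byte{0x3a} (0b00111010), as produced by
// big.Int.Bytes.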
func RangeToBits(str string) ([]byte, error) { bits := &big.Int{} for _, r := range strings.Split(str, ",") { // allow extra spaces around r = strings.TrimSpace(r) // allow empty elements (extra commas) if r == "" { continue } ranges := strings.SplitN(r, "-", 2) if len(ranges) > 1 { start, err := strconv.ParseUint(ranges[0], 10, 32) if err != nil { return nil, err } end, err := strconv.ParseUint(ranges[1], 10, 32) if err != nil { return nil, err } if start > end { return nil, errors.New("invalid range: " + r) } for i := start; i <= end; i++ { bits.SetBit(bits, int(i), 1) } } else { val, err := strconv.ParseUint(ranges[0], 10, 32) if err != nil { return nil, err } bits.SetBit(bits, int(val), 1) } } ret := bits.Bytes() if len(ret) == 0 { // do not allow empty values return nil, errors.New("empty value") } return ret, nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cgroup/systemd.go000066400000000000000000000230621465435605700233160ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroup import ( "context" "errors" "fmt" "os" "path/filepath" "regexp" "strconv" "strings" "sync" "time" systemdDbus "github.com/coreos/go-systemd/v22/dbus" dbus "github.com/godbus/dbus/v5" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/log" ) var ( // ErrBadResourceSpec indicates that a cgroupSystemd function was // passed a specs.LinuxResources object that is impossible or illegal // to process. ErrBadResourceSpec = errors.New("misconfigured resource spec") // ErrInvalidSlice indicates that the slice name passed via cgroup.Path is // invalid. ErrInvalidSlice = errors.New("invalid slice name") ) // cgroupSystemd represents a cgroupv2 managed by systemd. type cgroupSystemd struct { cgroupV2 // Name is the name of the of the systemd scope that controls the cgroups. Name string // Parent is the encapsulating slice. Parent string // ScopePrefix is the prefix for the scope name. ScopePrefix string properties []systemdDbus.Property dbusConn *systemdDbus.Conn } func newCgroupV2Systemd(cgv2 *cgroupV2) (*cgroupSystemd, error) { if !isRunningSystemd() { return nil, fmt.Errorf("systemd not running on host") } ctx := context.Background() cg := &cgroupSystemd{cgroupV2: *cgv2} // Parse the path from expected "slice:prefix:name" // for e.g. "system.slice:docker:1234" parts := strings.Split(cg.Path, ":") if len(parts) != 3 { return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", cg.Path) } cg.Parent = parts[0] cg.ScopePrefix = parts[1] cg.Name = parts[2] if err := validSlice(cg.Parent); err != nil { return nil, fmt.Errorf("%w: %v", ErrInvalidGroupPath, err) } // Rewrite Path so that it is compatible with cgroupv2 methods. 
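// E.g. for a hypothetical cgroupsPath of "system.slice:docker:1234" this
// yields Path = "/system.slice/docker-1234.scope"; a nested parent such as
// "user-1000.slice" would first expand to "/user.slice/user-1000.slice".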
cg.Path = filepath.Join(expandSlice(cg.Parent), cg.unitName()) conn, err := systemdDbus.NewWithContext(ctx) if err != nil { return nil, err } var version int if version, err = systemdVersion(conn); err != nil { return nil, fmt.Errorf("error parsing systemd version: %v", err) } if version < 244 { return nil, fmt.Errorf("systemd version %d not supported, please upgrade to at least 244", version) } cg.dbusConn = conn return cg, err } // Install configures the properties for a scope unit but does not start the // unit. func (c *cgroupSystemd) Install(res *specs.LinuxResources) error { log.Debugf("Installing systemd cgroup resource controller under %v", c.Parent) c.properties = append(c.properties, systemdDbus.PropSlice(c.Parent)) c.properties = append(c.properties, systemdDbus.PropDescription("Secure container "+c.Name)) pid := os.Getpid() c.properties = append(c.properties, systemdDbus.PropPids(uint32(pid))) // We always want proper accounting for the container for reporting resource // usage. c.addProp("MemoryAccounting", true) c.addProp("CPUAccounting", true) c.addProp("TasksAccounting", true) c.addProp("IOAccounting", true) // Delegate must be true so that the container can manage its own cgroups. c.addProp("Delegate", true) // For compatibility with runc. c.addProp("DefaultDependencies", false) for controllerName, ctrlr := range controllers2 { // First check if our controller is found in the system. found := false for _, knownController := range c.Controllers { if controllerName == knownController { found = true } } if found { props, err := ctrlr.generateProperties(res) if err != nil { return err } c.properties = append(c.properties, props...) continue } if ctrlr.optional() { if err := ctrlr.skip(res); err != nil { return err } } else { return fmt.Errorf("mandatory cgroup controller %q is missing for %q", controllerName, c.Path) } } return nil } func (c *cgroupSystemd) unitName() string { return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name) } // MakePath builds a path to the given controller. func (c *cgroupSystemd) MakePath(string) string { fullSlicePath := expandSlice(c.Parent) path := filepath.Join(c.Mountpoint, fullSlicePath, c.unitName()) return path } // Join implements Cgroup.Join. func (c *cgroupSystemd) Join() (func(), error) { log.Debugf("Joining systemd cgroup %v", c.unitName()) timeout := 30 * time.Second ctx := context.Background() // Clean up partially created cgroups on error. Errors during cleanup itself // are ignored. clean := cleanup.Make(func() { _ = c.Uninstall() }) defer clean.Clean() conn, err := systemdDbus.NewWithContext(ctx) if err != nil { return nil, err } c.dbusConn = conn unitName := c.unitName() statusChan := make(chan string) timedCtx, cancel := context.WithTimeout(ctx, timeout) defer cancel() if _, err := c.dbusConn.StartTransientUnitContext(timedCtx, unitName, "replace", c.properties, statusChan); err == nil { s := <-statusChan close(statusChan) switch s { case "done": // All cases that are not "done" according to the dbus package. 
case "cancelled", "timeout", "failed", "dependency", "skipped": c.dbusConn.ResetFailedUnitContext(ctx, unitName) return nil, fmt.Errorf("error creating systemd unit `%s`: got %s", unitName, s) default: c.dbusConn.ResetFailedUnitContext(ctx, unitName) return nil, fmt.Errorf("unknown job completion status %q", s) } } else if unitAlreadyExists(err) { return clean.Release(), nil } else { return nil, fmt.Errorf("systemd error: %v", err) } if _, err = c.createCgroupPaths(); err != nil { return nil, err } return clean.Release(), nil } // unitAlreadyExists returns true if the error is that a systemd unit already // exists. func unitAlreadyExists(err error) bool { if err != nil { var derr dbus.Error if errors.As(err, &derr) { return strings.Contains(derr.Name, "org.freedesktop.systemd1.UnitExists") } } return false } // systemd represents slice hierarchy using `-`, so we need to follow suit when // generating the path of slice. Essentially, test-a-b.slice becomes // /test.slice/test-a.slice/test-a-b.slice. func expandSlice(slice string) string { var path, prefix string suffix := ".slice" sliceName := strings.TrimSuffix(slice, suffix) // If input was -.slice, we should just return root now. if sliceName == "-" { return "/" } for _, component := range strings.Split(sliceName, "-") { // Append the component to the path and to the prefix. path += "/" + prefix + component + suffix prefix += component + "-" } return path } func validSlice(slice string) error { suffix := ".slice" // Name has to end with ".slice", but can't be just ".slice". if slice == suffix || !strings.HasSuffix(slice, suffix) { return fmt.Errorf("%w: %s", ErrInvalidSlice, slice) } // Path-separators are not allowed. if strings.Contains(slice, "/") { return fmt.Errorf("%w: %s", ErrInvalidSlice, slice) } sliceName := strings.TrimSuffix(slice, suffix) // If input was -.slice, we should just return root now. if sliceName == "-" { return nil } for _, component := range strings.Split(sliceName, "-") { // test--a.slice isn't permitted, nor is -test.slice. if component == "" { return fmt.Errorf("%w: %s", ErrInvalidSlice, slice) } } return nil } var systemdCheck struct { once sync.Once cache bool } func isRunningSystemd() bool { systemdCheck.once.Do(func() { fi, err := os.Lstat("/run/systemd/system") systemdCheck.cache = err == nil && fi.IsDir() }) return systemdCheck.cache } func systemdVersion(conn *systemdDbus.Conn) (int, error) { vStr, err := conn.GetManagerProperty("Version") if err != nil { return -1, errors.New("unable to get systemd version") } // vStr should be of the form: // "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes). // The result for all of the above should be 245. // Thus, we unconditionally remove the "v" prefix // and then match on the first integer we can grab. 
re := regexp.MustCompile(`v?([0-9]+)`) matches := re.FindStringSubmatch(vStr) if len(matches) < 2 { return -1, fmt.Errorf("can't parse version %q: incorrect number of matches %d", vStr, len(matches)) } version, err := strconv.Atoi(matches[1]) if err != nil { return -1, fmt.Errorf("%w: can't parse version %q", err, vStr) } return version, nil } func addIOProps(props []systemdDbus.Property, name string, devs []specs.LinuxThrottleDevice) []systemdDbus.Property { for _, dev := range devs { val := fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, dev.Rate) props = append(props, newProp(name, val)) } return props } func (c *cgroupSystemd) addProp(name string, value any) { if value == nil { return } c.properties = append(c.properties, newProp(name, value)) } func newProp(name string, units any) systemdDbus.Property { return systemdDbus.Property{ Name: name, Value: dbus.MakeVariant(units), } } // CreateMockSystemdCgroup returns a mock Cgroup configured for systemd. This // is useful for testing. func CreateMockSystemdCgroup() Cgroup { return &cgroupSystemd{ Name: "test", ScopePrefix: "runsc", Parent: "system.slice", cgroupV2: cgroupV2{ Mountpoint: "/sys/fs/cgroup", Path: "/a/random/path", }, } } golang-gvisor-gvisor-0.0~20240729.0/runsc/cli/000077500000000000000000000000001465435605700205445ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/cli/cli_state_autogen.go000066400000000000000000000000651465435605700245650ustar00rootroot00000000000000// automatically generated by stateify. package cli golang-gvisor-gvisor-0.0~20240729.0/runsc/cli/main.go000066400000000000000000000255331465435605700220270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package cli is the main entrypoint for runsc. package cli import ( "context" "fmt" "io" "io/ioutil" "os" "os/signal" "runtime" "strconv" "strings" "time" "github.com/google/subcommands" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/runsc/cmd" "gvisor.dev/gvisor/runsc/cmd/nvproxy" "gvisor.dev/gvisor/runsc/cmd/trace" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" "gvisor.dev/gvisor/runsc/starttime" "gvisor.dev/gvisor/runsc/version" ) // versionFlagName is the name of a flag that triggers printing the version. // Although this flags is not part of the OCI spec, it is used by // Docker, and thus should not be removed. const versionFlagName = "version" var ( // These flags are unique to runsc, and are used to configure parts of the // system that are not covered by the runtime spec. // Debugging flags. logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.") debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. 
If set, the 'debug-log-dir' flag is ignored.") panicLogFD = flag.Int("panic-log-fd", -1, "file descriptor to write Go's runtime messages.") coverageFD = flag.Int("coverage-fd", -1, "file descriptor to write Go coverage output.") ) // Main is the main entrypoint. func Main() { // Register all commands. forEachCmd(subcommands.Register) // Register with the main command line. config.RegisterFlags(flag.CommandLine) // Register version flag if it is not already defined. if flag.Lookup(versionFlagName) == nil { flag.Bool(versionFlagName, false, "show version and exit.") } // All subcommands must be registered before flag parsing. flag.Parse() // Are we showing the version? if flag.Get(flag.Lookup(versionFlagName).Value).(bool) { // The format here is the same as runc. fmt.Fprintf(os.Stdout, "runsc version %s\n", version.Version()) fmt.Fprintf(os.Stdout, "spec: %s\n", specutils.Version) os.Exit(0) } // Create a new Config from the flags. conf, err := config.NewFromFlags(flag.CommandLine) if err != nil { util.Fatalf(err.Error()) } var errorLogger io.Writer if *logFD > -1 { errorLogger = os.NewFile(uintptr(*logFD), "error log file") } else if conf.LogFilename != "" { // We must set O_APPEND and not O_TRUNC because Docker passes // the same log file for all commands (and also parses these // log files), so we can't destroy them on each command. var err error errorLogger, err = os.OpenFile(conf.LogFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) if err != nil { util.Fatalf("error opening log file %q: %v", conf.LogFilename, err) } } util.ErrorLogger = errorLogger if _, err := platform.Lookup(conf.Platform); err != nil { util.Fatalf("%v", err) } // Sets the reference leak check mode. Also set it in config below to // propagate it to child processes. refs.SetLeakMode(conf.ReferenceLeak) subcommand := flag.CommandLine.Arg(0) // Set up logging. if conf.Debug && specutils.IsDebugCommand(conf, subcommand) { log.SetLevel(log.Debug) } // Logging will include the local date and time via the time package. // // On first use, time.Local initializes the local time zone, which // involves opening tzdata files on the host. Since this requires // opening host files, it must be done before syscall filter // installation. // // Generally there will be a log message before filter installation // that will force initialization, but force initialization here in // case that does not occur. _ = time.Local.String() // Set the start time as soon as possible. startTime := starttime.Get() var emitters log.MultiEmitter if *debugLogFD > -1 { f := os.NewFile(uintptr(*debugLogFD), "debug log file") emitters = append(emitters, newEmitter(conf.DebugLogFormat, f)) } else if len(conf.DebugLog) > 0 && specutils.IsDebugCommand(conf, subcommand) { f, err := specutils.DebugLogFile(conf.DebugLog, subcommand, "" /* name */, startTime) if err != nil { util.Fatalf("error opening debug log file in %q: %v", conf.DebugLog, err) } emitters = append(emitters, newEmitter(conf.DebugLogFormat, f)) } else { // Stderr is reserved for the application, just discard the logs if no debug // log is specified. emitters = append(emitters, newEmitter("text", ioutil.Discard)) } if *panicLogFD > -1 || *debugLogFD > -1 { fd := *panicLogFD if fd < 0 { fd = *debugLogFD } // Quick sanity check to make sure no other commands get passed // a log fd (they should use log dir instead). 
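// (For context, a hypothetical invocation such as `runsc --debug-log-fd=3
// boot ...` reaches this point with fd == 3, and the Dup3 below routes the
// boot process's panic output to that log file descriptor.)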
if subcommand != "boot" && subcommand != "gofer" { util.Fatalf("flags --debug-log-fd and --panic-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand) } // If we are the boot process, then we own our stdio FDs and can do what we // want with them. Since Docker and Containerd both eat boot's stderr, we // dup our stderr to the provided log FD so that panics will appear in the // logs, rather than just disappear. if err := unix.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil { util.Fatalf("error dup'ing fd %d to stderr: %v", fd, err) } } else if conf.AlsoLogToStderr { emitters = append(emitters, newEmitter(conf.DebugLogFormat, os.Stderr)) } if ulEmittter, add := userLogEmitter(conf, subcommand); add { emitters = append(emitters, ulEmittter) } switch len(emitters) { case 0: // Do nothing. case 1: // Use the singular emitter to avoid needless // `for` loop overhead when logging to a single place. log.SetTarget(emitters[0]) default: log.SetTarget(&emitters) } const delimString = `**************** gVisor ****************` log.Infof(delimString) log.Infof("Version %s, %s, %s, %d CPUs, %s, PID %d, PPID %d, UID %d, GID %d", version.Version(), runtime.Version(), runtime.GOARCH, runtime.NumCPU(), runtime.GOOS, os.Getpid(), os.Getppid(), os.Getuid(), os.Getgid()) log.Debugf("Page size: 0x%x (%d bytes)", os.Getpagesize(), os.Getpagesize()) log.Infof("Args: %v", os.Args) conf.Log() log.Infof(delimString) if *coverageFD >= 0 { f := os.NewFile(uintptr(*coverageFD), "coverage file") coverage.EnableReport(f) } if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { // SIGTERM is sent to all processes if a test exceeds its // timeout and this case is handled by syscall_test_runner. log.Warningf("Block the TERM signal. This is only safe in tests!") signal.Ignore(unix.SIGTERM) } linux.SetAFSSyscallPanic(conf.TestOnlyAFSSyscallPanic) // Call the subcommand and pass in the configuration. var ws unix.WaitStatus subcmdCode := subcommands.Execute(context.Background(), conf, &ws) // Check for leaks and write coverage report before os.Exit(). refs.DoLeakCheck() _ = coverage.Report() if subcmdCode == subcommands.ExitSuccess { log.Infof("Exiting with status: %v", ws) if ws.Signaled() { // No good way to return it, emulate what the shell does. Maybe raise // signal to self? os.Exit(128 + int(ws.Signal())) } os.Exit(ws.ExitStatus()) } // Return an error that is unlikely to be used by the application. log.Warningf("Failure to execute command, err: %v", subcmdCode) os.Exit(128) } // forEachCmd invokes the passed callback for each command supported by runsc. func forEachCmd(cb func(cmd subcommands.Command, group string)) { // Help and flags commands are generated automatically. help := cmd.NewHelp(subcommands.DefaultCommander) help.Register(new(cmd.Platforms)) help.Register(new(cmd.Syscalls)) cb(help, "") cb(subcommands.FlagsCommand(), "") // Register OCI user-facing runsc commands. cb(new(cmd.Checkpoint), "") cb(new(cmd.Create), "") cb(new(cmd.Delete), "") cb(new(cmd.Do), "") cb(new(cmd.Events), "") cb(new(cmd.Exec), "") cb(new(cmd.Kill), "") cb(new(cmd.List), "") cb(new(cmd.PS), "") cb(new(cmd.Pause), "") cb(new(cmd.PortForward), "") cb(new(cmd.Restore), "") cb(new(cmd.Resume), "") cb(new(cmd.Run), "") cb(new(cmd.Spec), "") cb(new(cmd.Start), "") cb(new(cmd.State), "") cb(new(cmd.Wait), "") // Helpers. 
const helperGroup = "helpers" cb(new(cmd.Install), helperGroup) cb(new(cmd.Mitigate), helperGroup) cb(new(cmd.Uninstall), helperGroup) cb(new(nvproxy.Nvproxy), helperGroup) cb(new(trace.Trace), helperGroup) const debugGroup = "debug" cb(new(cmd.Debug), debugGroup) cb(new(cmd.Statefile), debugGroup) cb(new(cmd.Symbolize), debugGroup) cb(new(cmd.Usage), debugGroup) cb(new(cmd.ReadControl), debugGroup) cb(new(cmd.WriteControl), debugGroup) const metricGroup = "metrics" cb(new(cmd.MetricMetadata), metricGroup) cb(new(cmd.MetricExport), metricGroup) cb(new(cmd.MetricServer), metricGroup) // Internal commands. const internalGroup = "internal use only" cb(new(cmd.Boot), internalGroup) cb(new(cmd.Gofer), internalGroup) cb(new(cmd.Umount), internalGroup) } func newEmitter(format string, logFile io.Writer) log.Emitter { switch format { case "text": return log.GoogleEmitter{&log.Writer{Next: logFile}} case "json": return log.JSONEmitter{&log.Writer{Next: logFile}} case "json-k8s": return log.K8sJSONEmitter{&log.Writer{Next: logFile}} } util.Fatalf("invalid log format %q, must be 'text', 'json', or 'json-k8s'", format) panic("unreachable") } // userLogEmitter returns an emitter to add logs to user logs if requested. func userLogEmitter(conf *config.Config, subcommand string) (log.Emitter, bool) { if subcommand != "boot" || !conf.DebugToUserLog { return nil, false } // We need to manually scan for `--user-log-fd` since it is a flag of the // `boot` subcommand. We know it is in `--user-log-fd=FD` format because // we control how arguments to `runsc boot` are formatted. const userLogFDFlagPrefix = "--user-log-fd=" var userLog *os.File for _, arg := range os.Args[1:] { if !strings.HasPrefix(arg, userLogFDFlagPrefix) { continue } if userLog != nil { util.Fatalf("duplicate %q flag", userLogFDFlagPrefix) } userLogFD, err := strconv.Atoi(arg[len(userLogFDFlagPrefix):]) if err != nil { util.Fatalf("invalid user log FD flag %q: %v", arg, err) } userLog = os.NewFile(uintptr(userLogFD), "user log file") } if userLog == nil { return nil, false } return log.K8sJSONEmitter{&log.Writer{Next: userLog}}, true } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/000077500000000000000000000000001465435605700205405ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/boot.go000066400000000000000000000573351465435605700220470ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package cmd import ( "context" "fmt" "io/ioutil" "os" "os/exec" "path/filepath" "runtime" "runtime/debug" "strconv" "strings" "time" "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/coretag" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/sentry/hostmm" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/profile" "gvisor.dev/gvisor/runsc/specutils" ) // Note that directfsSandboxCaps is the same as caps defined in gofer.go // except CAP_SYS_CHROOT because we don't need to chroot in directfs mode. var directfsSandboxCaps = []string{ "CAP_CHOWN", "CAP_DAC_OVERRIDE", "CAP_DAC_READ_SEARCH", "CAP_FOWNER", "CAP_FSETID", } // directfsSandboxLinuxCaps is the minimal set of capabilities needed by the // sandbox to operate on files in directfs mode. var directfsSandboxLinuxCaps = &specs.LinuxCapabilities{ Bounding: directfsSandboxCaps, Effective: directfsSandboxCaps, Permitted: directfsSandboxCaps, } // Boot implements subcommands.Command for the "boot" command which starts a // new sandbox. It should not be called directly. type Boot struct { // bundleDir is the directory containing the OCI spec. bundleDir string // specFD is the file descriptor that the spec will be read from. specFD int // controllerFD is the file descriptor of a stream socket for the // control server that is donated to this process. controllerFD int // deviceFD is the file descriptor for the platform device file. deviceFD int // ioFDs is the list of FDs used to connect to FS gofers. ioFDs intFlags // devIoFD is the FD to connect to dev gofer. devIoFD int // goferFilestoreFDs are FDs to the regular files that will back the tmpfs or // overlayfs mount for certain gofer mounts. goferFilestoreFDs intFlags // goferMountConfs contains information about how the gofer mounts have been // configured. The first entry is for rootfs and the following entries are // for bind mounts in Spec.Mounts (in the same order). goferMountConfs boot.GoferMountConfFlags // stdioFDs are the fds for stdin, stdout, and stderr. They must be // provided in that order. stdioFDs intFlags // passFDs are mappings of user-supplied host to guest file descriptors. passFDs fdMappings // execFD is the host file descriptor used for program execution. execFD int // applyCaps determines if capabilities defined in the spec should be applied // to the process. applyCaps bool // setUpChroot is set to true if the sandbox is started in an empty root. setUpRoot bool // cpuNum number of CPUs to create inside the sandbox. cpuNum int // totalMem sets the initial amount of total memory to report back to the // container. totalMem uint64 // totalHostMem is the total memory reported by host /proc/meminfo. totalHostMem uint64 // userLogFD is the file descriptor to write user logs to. userLogFD int // startSyncFD is the file descriptor to synchronize runsc and sandbox. startSyncFD int // mountsFD is the file descriptor to read list of mounts after they have // been resolved (direct paths, no symlinks). They are resolved outside the // sandbox (e.g. gofer) and sent through this FD. When mountsFD is not // provided, there is no cleaning required for mounts and the mounts in // the spec can be used as is. 
mountsFD int podInitConfigFD int sinkFDs intFlags saveFDs intFlags // pidns is set if the sandbox is in its own pid namespace. pidns bool // attached is set to true to kill the sandbox process when the parent process // terminates. This flag is set when the command execve's itself because // parent death signal doesn't propagate through execve when uid/gid changes. attached bool // productName is the value to show in // /sys/devices/virtual/dmi/id/product_name. productName string // Value of /sys/kernel/mm/transparent_hugepage/shmem_enabled on the host. hostShmemHuge string // FDs for profile data. profileFDs profile.FDArgs // profilingMetricsFD is a file descriptor to write Sentry metrics data to. profilingMetricsFD int // profilingMetricsLossy sets whether profilingMetricsFD is a lossy channel. // If so, the format used to write to it will contain a checksum. profilingMetricsLossy bool // procMountSyncFD is a file descriptor that has to be closed when the // procfs mount isn't needed anymore. procMountSyncFD int // syncUsernsFD is the file descriptor that has to be closed when the // boot process should invoke setuid/setgid for root user. This is mainly // used to synchronize rootless user namespace initialization. syncUsernsFD int // nvidiaDriverVersion is the Nvidia driver version on the host. nvidiaDriverVersion string } // Name implements subcommands.Command.Name. func (*Boot) Name() string { return "boot" } // Synopsis implements subcommands.Command.Synopsis. func (*Boot) Synopsis() string { return "launch a sandbox process" } // Usage implements subcommands.Command.Usage. func (*Boot) Usage() string { return `boot [flags] ` } // SetFlags implements subcommands.Command.SetFlags. func (b *Boot) SetFlags(f *flag.FlagSet) { f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process") f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace") f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") f.IntVar(&b.procMountSyncFD, "proc-mount-sync-fd", -1, "file descriptor that has to be written to when /proc isn't needed anymore and can be unmounted") f.IntVar(&b.syncUsernsFD, "sync-userns-fd", -1, "file descriptor used to synchronize rootless user namespace initialization.") f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") f.Uint64Var(&b.totalHostMem, "total-host-memory", 0, "total memory reported by host /proc/meminfo") f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates") f.StringVar(&b.productName, "product-name", "", "value to show in /sys/devices/virtual/dmi/id/product_name") f.StringVar(&b.nvidiaDriverVersion, "nvidia-driver-version", "", "Nvidia driver version on the host") f.StringVar(&b.hostShmemHuge, "host-shmem-huge", "", "value of /sys/kernel/mm/transparent_hugepage/shmem_enabled on the host") // Open FDs that are donated to the sandbox. 
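// Flags backed by intFlags accept comma-separated lists, so several FDs can be passed in one flag, e.g. --io-fds=3,4,5 (the FD numbers are illustrative).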
f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec") f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file") f.Var(&b.ioFDs, "io-fds", "list of image FDs and/or socket FDs to connect gofer clients. They must follow this order: root first, then mounts as defined in the spec") f.IntVar(&b.devIoFD, "dev-io-fd", -1, "FD to connect dev gofer client") f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") f.Var(&b.passFDs, "pass-fd", "mapping of host to guest FDs. They must be in M:N format. M is the host and N the guest descriptor.") f.IntVar(&b.execFD, "exec-fd", -1, "host file descriptor used for program execution.") f.Var(&b.goferFilestoreFDs, "gofer-filestore-fds", "FDs to the regular files that will back the overlayfs or tmpfs mount if a gofer mount is to be overlaid.") f.Var(&b.goferMountConfs, "gofer-mount-confs", "information about how the gofer mounts have been configured.") f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup") f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is an optional file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).") f.IntVar(&b.podInitConfigFD, "pod-init-config-fd", -1, "file descriptor to the pod init configuration file.") f.Var(&b.sinkFDs, "sink-fds", "ordered list of file descriptors to be used by the sinks defined in --pod-init-config.") f.Var(&b.saveFDs, "save-fds", "ordered list of file descriptors to be used save checkpoints. Order: kernel state, page metadata, page file") // Profiling flags. b.profileFDs.SetFromFlags(f) f.IntVar(&b.profilingMetricsFD, "profiling-metrics-fd", -1, "file descriptor to write sentry profiling metrics.") f.BoolVar(&b.profilingMetricsLossy, "profiling-metrics-fd-lossy", false, "if true, treat the sentry profiling metrics FD as lossy and write a checksum to it.") } // Execute implements subcommands.Command.Execute. It starts a sandbox in a // waiting state. func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } conf := args[0].(*config.Config) // Set traceback level debug.SetTraceback(conf.Traceback) // Initialize CPUID information. cpuid.Initialize() // Initialize ring0 library. ring0.InitDefault() argOverride := make(map[string]string) // Do these before chroot takes effect, otherwise we can't read /sys. 
if len(b.productName) == 0 { if product, err := ioutil.ReadFile("/sys/devices/virtual/dmi/id/product_name"); err != nil { log.Warningf("Not setting product_name: %v", err) } else { b.productName = strings.TrimSpace(string(product)) log.Infof("Setting product_name: %q", b.productName) argOverride["product-name"] = b.productName } } if conf.AppHugePages && len(b.hostShmemHuge) == 0 { hostShmemHuge, err := hostmm.GetTransparentHugepageEnum("shmem_enabled") if err != nil { log.Warningf("Failed to infer --host-shmem-huge: %v", err) } else { b.hostShmemHuge = hostShmemHuge log.Infof("Setting host-shmem-huge: %q", b.hostShmemHuge) argOverride["host-shmem-huge"] = b.hostShmemHuge } } if b.attached { // Ensure this process is killed after parent process terminates when // attached mode is enabled. In the unfortunate event that the parent // terminates before this point, this process leaks. if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil { util.Fatalf("error setting parent death signal: %v", err) } } if b.syncUsernsFD >= 0 { syncUsernsForRootless(b.syncUsernsFD) argOverride["sync-userns-fd"] = "-1" } // Get the spec from the specFD. We *must* keep this os.File alive past // the call setCapsAndCallSelf, otherwise the FD will be closed and the // child process cannot read it specFile := os.NewFile(uintptr(b.specFD), "spec file") spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf) if err != nil { util.Fatalf("reading spec: %v", err) } if b.setUpRoot { if err := setUpChroot(b.pidns, spec, conf); err != nil { util.Fatalf("error setting up chroot: %v", err) } argOverride["setup-root"] = "false" if !conf.Rootless { // /proc is umounted from a forked process, because the // current one is going to re-execute itself without // capabilities. cmd, w := execProcUmounter() defer cmd.Wait() defer w.Close() if b.procMountSyncFD != -1 { panic("procMountSyncFD is set") } b.procMountSyncFD = int(w.Fd()) argOverride["proc-mount-sync-fd"] = strconv.Itoa(b.procMountSyncFD) // Clear FD_CLOEXEC. Regardless of b.applyCaps, this process will be // re-executed. procMountSyncFD should remain open. if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 { util.Fatalf("error clearing CLOEXEC: %v", errno) } if !b.applyCaps { // Remove the args that have already been done before calling self. args := prepareArgs(b.Name(), f, argOverride) // Note that we've already read the spec from the spec FD, and // we will read it again after the exec call. This works // because the ReadSpecFromFile function seeks to the beginning // of the file before reading. util.Fatalf("callSelfAsNobody(%v): %v", args, callSelfAsNobody(args)) // This prevents the specFile finalizer from running and closed // the specFD, which we have passed to ourselves when // re-execing. runtime.KeepAlive(specFile) panic("unreachable") } } } specutils.LogSpecDebug(spec, conf.OCISeccomp) if b.applyCaps { caps := spec.Process.Capabilities if caps == nil { caps = &specs.LinuxCapabilities{} } gPlatform, err := platform.Lookup(conf.Platform) if err != nil { util.Fatalf("loading platform: %v", err) } if gPlatform.Requirements().RequiresCapSysPtrace { // Ptrace platform requires extra capabilities. 
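// The ptrace platform drives the sandboxed application via ptrace(2), which generally needs CAP_SYS_PTRACE once the process has changed credentials.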
const c = "CAP_SYS_PTRACE" caps.Bounding = append(caps.Bounding, c) caps.Effective = append(caps.Effective, c) caps.Permitted = append(caps.Permitted, c) } if conf.DirectFS { caps = specutils.MergeCapabilities(caps, directfsSandboxLinuxCaps) } argOverride["apply-caps"] = "false" // Remove the args that have already been done before calling self. args := prepareArgs(b.Name(), f, argOverride) // Note that we've already read the spec from the spec FD, and // we will read it again after the exec call. This works // because the ReadSpecFromFile function seeks to the beginning // of the file before reading. util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, caps, setCapsAndCallSelf(args, caps)) // This prevents the specFile finalizer from running and closed // the specFD, which we have passed to ourselves when // re-execing. runtime.KeepAlive(specFile) panic("unreachable") } if b.syncUsernsFD >= 0 { // syncUsernsFD is set, but runsc hasn't been re-executed with a new UID and GID. // We expect that setCapsAndCallSelf has to be called in this case. panic("unreachable") } // Close specFile to avoid exposing it to the sandbox. if err := specFile.Close(); err != nil { util.Fatalf("closing specFile: %v", err) } // At this point we won't re-execute, so it's safe to limit via rlimits. Any // limit >= 0 works. If the limit is lower than the current number of open // files, then Setrlimit will succeed, and the next open will fail. if conf.FDLimit > -1 { rlimit := unix.Rlimit{ Cur: uint64(conf.FDLimit), Max: uint64(conf.FDLimit), } switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err { case nil: case unix.EPERM: log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit) default: util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err) } } // When mountsFD is not provided, there is no cleaning required. if b.mountsFD >= 0 { // Read resolved mount list and replace the original one from the spec. mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file") cleanMounts, err := specutils.ReadMounts(mountsFile) if err != nil { mountsFile.Close() util.Fatalf("Error reading mounts file: %v", err) } mountsFile.Close() spec.Mounts = cleanMounts } if conf.DirectFS { // sandbox should run with a umask of 0, because we want to preserve file // modes exactly as sent by the sentry, which would have already applied // the application umask. unix.Umask(0) } if conf.EnableCoreTags { if err := coretag.Enable(); err != nil { util.Fatalf("Failed to core tag sentry: %v", err) } // Verify that all sentry threads are properly core tagged, and log // current core tag. coreTags, err := coretag.GetAllCoreTags(os.Getpid()) if err != nil { util.Fatalf("Failed read current core tags: %v", err) } if len(coreTags) != 1 { util.Fatalf("Not all child threads were core tagged the same. Tags=%v", coreTags) } log.Infof("Core tag enabled (core tag=%d)", coreTags[0]) } // Create the loader. 
bootArgs := boot.Args{ ID: f.Arg(0), Spec: spec, Conf: conf, ControllerFD: b.controllerFD, Device: fd.New(b.deviceFD), GoferFDs: b.ioFDs.GetArray(), DevGoferFD: b.devIoFD, StdioFDs: b.stdioFDs.GetArray(), PassFDs: b.passFDs.GetArray(), ExecFD: b.execFD, GoferFilestoreFDs: b.goferFilestoreFDs.GetArray(), GoferMountConfs: b.goferMountConfs.GetArray(), NumCPU: b.cpuNum, TotalMem: b.totalMem, TotalHostMem: b.totalHostMem, UserLogFD: b.userLogFD, ProductName: b.productName, PodInitConfigFD: b.podInitConfigFD, SinkFDs: b.sinkFDs.GetArray(), ProfileOpts: b.profileFDs.ToOpts(), NvidiaDriverVersion: b.nvidiaDriverVersion, HostShmemHuge: b.hostShmemHuge, SaveFDs: b.saveFDs.GetFDs(), } l, err := boot.New(bootArgs) if err != nil { util.Fatalf("creating loader: %v", err) } // Fatalf exits the process and doesn't run defers. // 'l' must be destroyed explicitly after this point! if b.procMountSyncFD != -1 { l.PreSeccompCallback = func() { // Call validateOpenFDs() before umounting /proc. validateOpenFDs(bootArgs.PassFDs) // Umount /proc right before installing seccomp filters. umountProc(b.procMountSyncFD) } } // Prepare metrics. // This needs to happen after the kernel is initialized (such that all metrics are registered) // but before the start-sync file is notified, as the parent process needs to query for // registered metrics prior to sending the start signal. metric.Initialize() if b.profilingMetricsFD != -1 { if err := metric.StartProfilingMetrics(metric.ProfilingMetricsOptions[*os.File]{ Sink: os.NewFile(uintptr(b.profilingMetricsFD), "metrics file"), Lossy: b.profilingMetricsLossy, Metrics: conf.ProfilingMetrics, Rate: time.Duration(conf.ProfilingMetricsRate) * time.Microsecond, }); err != nil { l.Destroy() util.Fatalf("unable to start profiling metrics: %v", err) } defer metric.StopProfilingMetrics() } // Notify the parent process the sandbox has booted (and that the controller // is up). startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file") buf := make([]byte, 1) if w, err := startSyncFile.Write(buf); err != nil || w != 1 { l.Destroy() util.Fatalf("unable to write into the start-sync descriptor: %v", err) } // Closes startSyncFile because 'l.Run()' only returns when the sandbox exits. startSyncFile.Close() // Wait for the start signal from runsc. l.WaitForStartSignal() // Run the application and wait for it to finish. if err := l.Run(); err != nil { l.Destroy() util.Fatalf("running sandbox: %v", err) } ws := l.WaitExit() log.Infof("application exiting with %+v", ws) waitStatus := args[1].(*unix.WaitStatus) *waitStatus = unix.WaitStatus(ws) l.Destroy() return subcommands.ExitSuccess } // prepareArgs returns the args that can be used to re-execute the current // program. It manipulates the flags of the subcommands.Command identified by // subCmdName and fSet is the flag.FlagSet of this subcommand. It applies the // flags specified by override map. In case of conflict, flag is overriden. // // Postcondition: prepareArgs() takes ownership of override map. func prepareArgs(subCmdName string, fSet *flag.FlagSet, override map[string]string) []string { var args []string // Add all args up until (and including) the sub command. for _, arg := range os.Args { args = append(args, arg) if arg == subCmdName { break } } // Set sub command flags. Iterate through all the explicitly set flags. fSet.Visit(func(gf *flag.Flag) { // If a conflict is found with override, then prefer override flag. 
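// For example, if the caller explicitly set --apply-caps=true but override contains {"apply-caps": "false"}, the re-exec args carry --apply-caps=false.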
if ov, ok := override[gf.Name]; ok { args = append(args, fmt.Sprintf("--%s=%s", gf.Name, ov)) delete(override, gf.Name) return } // Otherwise pass through the original flag. args = append(args, fmt.Sprintf("--%s=%s", gf.Name, gf.Value)) }) // Apply remaining override flags (that didn't conflict above). for of, ov := range override { args = append(args, fmt.Sprintf("--%s=%s", of, ov)) } // Add the non-flag arguments at the end. args = append(args, fSet.Args()...) return args } // execProcUmounter execute a child process that umounts /proc when the // returned pipe is closed. func execProcUmounter() (*exec.Cmd, *os.File) { r, w, err := os.Pipe() if err != nil { util.Fatalf("error creating a pipe: %v", err) } defer r.Close() cmd := exec.Command(specutils.ExePath) cmd.Args = append(cmd.Args, "umount", "--sync-fd=3", "/proc") cmd.ExtraFiles = append(cmd.ExtraFiles, r) cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if err := cmd.Start(); err != nil { util.Fatalf("error executing umounter: %v", err) } return cmd, w } // umountProc writes to syncFD signalling the process started by // execProcUmounter() to umount /proc. func umountProc(syncFD int) { syncFile := os.NewFile(uintptr(syncFD), "procfs umount sync FD") buf := make([]byte, 1) if w, err := syncFile.Write(buf); err != nil || w != 1 { util.Fatalf("unable to write into the proc umounter descriptor: %v", err) } syncFile.Close() var waitStatus unix.WaitStatus if _, err := unix.Wait4(0, &waitStatus, 0, nil); err != nil { util.Fatalf("error waiting for the proc umounter process: %v", err) } if !waitStatus.Exited() || waitStatus.ExitStatus() != 0 { util.Fatalf("the proc umounter process failed: %v", waitStatus) } if err := unix.Access("/proc/self", unix.F_OK); err != unix.ENOENT { util.Fatalf("/proc is still accessible") } } // validateOpenFDs checks that the sandbox process does not have any open // directory FDs. func validateOpenFDs(passFDs []boot.FDMapping) { passHostFDs := make(map[int]struct{}) for _, passFD := range passFDs { passHostFDs[passFD.Host] = struct{}{} } const selfFDDir = "/proc/self/fd" if err := filepath.WalkDir(selfFDDir, func(path string, d os.DirEntry, err error) error { if err != nil { return err } if d.Type() != os.ModeSymlink { // All entries are symlinks. Ignore the callback for fd directory itself. return nil } if fdInfo, err := os.Stat(path); err != nil { if os.IsNotExist(err) { // Ignore FDs that are now closed. For example, the FD to selfFDDir that // was opened by filepath.WalkDir() to read dirents. return nil } return fmt.Errorf("os.Stat(%s) failed: %v", path, err) } else if !fdInfo.IsDir() { return nil } // Uh-oh. This is a directory FD. fdNo, err := strconv.Atoi(d.Name()) if err != nil { return fmt.Errorf("strconv.Atoi(%s) failed: %v", d.Name(), err) } dirLink, err := os.Readlink(path) if err != nil { return fmt.Errorf("os.Readlink(%s) failed: %v", path, err) } if _, ok := passHostFDs[fdNo]; ok { // Passed FDs are allowed to be directories. The user must be knowing // what they are doing. Log a warning regardless. log.Warningf("Sandbox has access to FD %d, which is a directory for %s", fdNo, dirLink) return nil } return fmt.Errorf("FD %d is a directory for %s", fdNo, dirLink) }); err != nil { util.Fatalf("WalkDir(%s) failed: %v", selfFDDir, err) } } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/capability.go000066400000000000000000000123251465435605700232130ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "fmt" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/syndtr/gocapability/capability" "gvisor.dev/gvisor/pkg/log" ) var allCapTypes = []capability.CapType{ capability.BOUNDS, capability.EFFECTIVE, capability.PERMITTED, capability.INHERITABLE, capability.AMBIENT, } // applyCaps applies the capabilities in the spec to the current thread. // // Note that it must be called with current thread locked. func applyCaps(caps *specs.LinuxCapabilities) error { // Load current capabilities to trim the ones not permitted. curCaps, err := capability.NewPid2(0) if err != nil { return err } if err := curCaps.Load(); err != nil { return err } // Create an empty capability set to populate. newCaps, err := capability.NewPid2(0) if err != nil { return err } for _, c := range allCapTypes { if !newCaps.Empty(c) { panic("unloaded capabilities must be empty") } set, err := trimCaps(getCaps(c, caps), curCaps) if err != nil { return err } newCaps.Set(c, set...) } if err := newCaps.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS); err != nil { return err } log.Infof("Capabilities applied: %+v", newCaps) return nil } func getCaps(which capability.CapType, caps *specs.LinuxCapabilities) []string { switch which { case capability.BOUNDS: return caps.Bounding case capability.EFFECTIVE: return caps.Effective case capability.PERMITTED: return caps.Permitted case capability.INHERITABLE: return caps.Inheritable case capability.AMBIENT: return caps.Ambient } panic(fmt.Sprint("invalid capability type:", which)) } func trimCaps(names []string, setter capability.Capabilities) ([]capability.Cap, error) { wantedCaps, err := capsFromNames(names) if err != nil { return nil, err } // Trim down capabilities that aren't possible to acquire. var caps []capability.Cap for _, c := range wantedCaps { // Capability rules are more complicated than this, but this catches most // problems with tests running with non-privileged user. 
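// For example, a run as an unprivileged user typically lacks CAP_SYS_ADMIN in its permitted set, so that capability is dropped with a warning rather than failing Apply.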
if setter.Get(capability.PERMITTED, c) { caps = append(caps, c) } else { log.Warningf("Capability %q is not permitted, dropping it.", c) } } return caps, nil } func capsFromNames(names []string) ([]capability.Cap, error) { var caps []capability.Cap for _, name := range names { cap, ok := capFromName[name] if !ok { return nil, fmt.Errorf("invalid capability %q", name) } caps = append(caps, cap) } return caps, nil } var capFromName = map[string]capability.Cap{ "CAP_CHOWN": capability.CAP_CHOWN, "CAP_DAC_OVERRIDE": capability.CAP_DAC_OVERRIDE, "CAP_DAC_READ_SEARCH": capability.CAP_DAC_READ_SEARCH, "CAP_FOWNER": capability.CAP_FOWNER, "CAP_FSETID": capability.CAP_FSETID, "CAP_KILL": capability.CAP_KILL, "CAP_SETGID": capability.CAP_SETGID, "CAP_SETUID": capability.CAP_SETUID, "CAP_SETPCAP": capability.CAP_SETPCAP, "CAP_LINUX_IMMUTABLE": capability.CAP_LINUX_IMMUTABLE, "CAP_NET_BIND_SERVICE": capability.CAP_NET_BIND_SERVICE, "CAP_NET_BROADCAST": capability.CAP_NET_BROADCAST, "CAP_NET_ADMIN": capability.CAP_NET_ADMIN, "CAP_NET_RAW": capability.CAP_NET_RAW, "CAP_IPC_LOCK": capability.CAP_IPC_LOCK, "CAP_IPC_OWNER": capability.CAP_IPC_OWNER, "CAP_SYS_MODULE": capability.CAP_SYS_MODULE, "CAP_SYS_RAWIO": capability.CAP_SYS_RAWIO, "CAP_SYS_CHROOT": capability.CAP_SYS_CHROOT, "CAP_SYS_PTRACE": capability.CAP_SYS_PTRACE, "CAP_SYS_PACCT": capability.CAP_SYS_PACCT, "CAP_SYS_ADMIN": capability.CAP_SYS_ADMIN, "CAP_SYS_BOOT": capability.CAP_SYS_BOOT, "CAP_SYS_NICE": capability.CAP_SYS_NICE, "CAP_SYS_RESOURCE": capability.CAP_SYS_RESOURCE, "CAP_SYS_TIME": capability.CAP_SYS_TIME, "CAP_SYS_TTY_CONFIG": capability.CAP_SYS_TTY_CONFIG, "CAP_MKNOD": capability.CAP_MKNOD, "CAP_LEASE": capability.CAP_LEASE, "CAP_AUDIT_WRITE": capability.CAP_AUDIT_WRITE, "CAP_AUDIT_CONTROL": capability.CAP_AUDIT_CONTROL, "CAP_SETFCAP": capability.CAP_SETFCAP, "CAP_MAC_OVERRIDE": capability.CAP_MAC_OVERRIDE, "CAP_MAC_ADMIN": capability.CAP_MAC_ADMIN, "CAP_SYSLOG": capability.CAP_SYSLOG, "CAP_WAKE_ALARM": capability.CAP_WAKE_ALARM, "CAP_BLOCK_SUSPEND": capability.CAP_BLOCK_SUSPEND, "CAP_AUDIT_READ": capability.CAP_AUDIT_READ, "CAP_PERFMON": capability.CAP_PERFMON, "CAP_BPF": capability.CAP_BPF, "CAP_CHECKPOINT_RESTORE": capability.CAP_CHECKPOINT_RESTORE, } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/checkpoint.go000066400000000000000000000114161465435605700232210ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "fmt" "os" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/state/statefile" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // Checkpoint implements subcommands.Command for the "checkpoint" command. 
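// An illustrative invocation (the path is hypothetical): runsc checkpoint --image-path=/path/to/image-dir <container id>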
type Checkpoint struct { imagePath string leaveRunning bool compression CheckpointCompression excludeCommittedZeroPages bool // direct indicates whether O_DIRECT should be used for writing the // checkpoint pages file. It bypasses the kernel page cache. It is beneficial // if the checkpoint files are not expected to be read again on this host. // For example, if the checkpoint files will be stored on a network block // device, which will be detached after the checkpoint is done. direct bool } // Name implements subcommands.Command.Name. func (*Checkpoint) Name() string { return "checkpoint" } // Synopsis implements subcommands.Command.Synopsis. func (*Checkpoint) Synopsis() string { return "checkpoint current state of container (experimental)" } // Usage implements subcommands.Command.Usage. func (*Checkpoint) Usage() string { return `checkpoint [flags] - save current state of container. ` } // SetFlags implements subcommands.Command.SetFlags. func (c *Checkpoint) SetFlags(f *flag.FlagSet) { f.StringVar(&c.imagePath, "image-path", "", "directory path to saved container image") f.BoolVar(&c.leaveRunning, "leave-running", false, "restart the container after checkpointing") f.Var(newCheckpointCompressionValue(statefile.CompressionLevelDefault, &c.compression), "compression", "compress checkpoint image on disk. Values: none|flate-best-speed.") f.BoolVar(&c.excludeCommittedZeroPages, "exclude-committed-zero-pages", false, "exclude committed zero-filled pages from checkpoint") f.BoolVar(&c.direct, "direct", false, "use O_DIRECT for writing checkpoint pages file") // Unimplemented flags necessary for compatibility with docker. var wp string f.StringVar(&wp, "work-path", "", "ignored") } // Execute implements subcommands.Command.Execute. func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { util.Fatalf("loading container: %v", err) } if c.imagePath == "" { util.Fatalf("image-path flag must be provided") } if err := os.MkdirAll(c.imagePath, 0755); err != nil { util.Fatalf("making directories at path provided: %v", err) } sOpts := statefile.Options{ Compression: c.compression.Level(), } mfOpts := pgalloc.SaveOpts{ ExcludeCommittedZeroPages: c.excludeCommittedZeroPages, } if c.leaveRunning { // Do not destroy the sandbox after saving. sOpts.Resume = true } if err := cont.Checkpoint(c.imagePath, c.direct, sOpts, mfOpts); err != nil { util.Fatalf("checkpoint failed: %v", err) } return subcommands.ExitSuccess } // CheckpointCompression represents checkpoint image writer behavior. The // default behavior is to compress because the default behavior used to be to // always compress. type CheckpointCompression statefile.CompressionLevel func newCheckpointCompressionValue(val statefile.CompressionLevel, p *CheckpointCompression) *CheckpointCompression { *p = CheckpointCompression(val) return (*CheckpointCompression)(p) } // Set implements flag.Value. func (g *CheckpointCompression) Set(v string) error { t, err := statefile.CompressionLevelFromString(v) if err != nil { return fmt.Errorf("invalid checkpoint compression type %q", v) } *g = CheckpointCompression(t) return nil } // Get implements flag.Getter. func (g *CheckpointCompression) Get() any { return *g } // String implements flag.Value. 
func (g CheckpointCompression) String() string { return string(g) } // Level returns corresponding statefile.CompressionLevel value. func (g CheckpointCompression) Level() statefile.CompressionLevel { return statefile.CompressionLevel(g) } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/chroot.go000066400000000000000000000177031465435605700223750ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "fmt" "os" "path" "path/filepath" "regexp" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) // mountInChroot creates the destination mount point in the given chroot and // mounts the source. func mountInChroot(chroot, src, dst, typ string, flags uint32) error { chrootDst := filepath.Join(chroot, dst) log.Infof("Mounting %q at %q", src, chrootDst) if err := specutils.SafeSetupAndMount(src, chrootDst, typ, flags, "/proc"); err != nil { return fmt.Errorf("error mounting %q at %q: %v", src, chrootDst, err) } return nil } func pivotRoot(root string) error { if err := os.Chdir(root); err != nil { return fmt.Errorf("error changing working directory: %v", err) } // pivot_root(new_root, put_old) moves the root filesystem (old_root) // of the calling process to the directory put_old and makes new_root // the new root filesystem of the calling process. // // pivot_root(".", ".") makes a mount of the working directory the new // root filesystem, so it will be moved in "/" and then the old_root // will be moved to "/" too. The parent mount of the old_root will be // new_root, so after umounting the old_root, we will see only // the new_root in "/". if err := unix.PivotRoot(".", "."); err != nil { return fmt.Errorf("pivot_root failed, make sure that the root mount has a parent: %v", err) } if err := unix.Unmount(".", unix.MNT_DETACH); err != nil { return fmt.Errorf("error umounting the old root file system: %v", err) } return nil } func copyFile(dst, src string) error { in, err := os.Open(src) if err != nil { return err } defer in.Close() out, err := os.Create(dst) if err != nil { return err } defer out.Close() _, err = out.ReadFrom(in) return err } // setUpChroot creates an empty directory with runsc mounted at /runsc and proc // mounted at /proc. func setUpChroot(pidns bool, spec *specs.Spec, conf *config.Config) error { // We are a new mount namespace, so we can use /tmp as a directory to // construct a new root. chroot := os.TempDir() log.Infof("Setting up sandbox chroot in %q", chroot) // Convert all shared mounts into slave to be sure that nothing will be // propagated outside of our namespace. 
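// This is roughly equivalent to `mount --make-rslave /`: mounts created below stay private to this mount namespace.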
if err := specutils.SafeMount("", "/", "", unix.MS_SLAVE|unix.MS_REC, "", "/proc"); err != nil { return fmt.Errorf("error converting mounts: %v", err) } if err := specutils.SafeMount("runsc-root", chroot, "tmpfs", unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC, "", "/proc"); err != nil { return fmt.Errorf("error mounting tmpfs in chroot: %v", err) } if err := os.Mkdir(filepath.Join(chroot, "etc"), 0755); err != nil { return fmt.Errorf("error creating /etc in chroot: %v", err) } if err := copyFile(filepath.Join(chroot, "etc/localtime"), "/etc/localtime"); err != nil { log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err) } if pidns { flags := uint32(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC | unix.MS_RDONLY) if err := mountInChroot(chroot, "proc", "/proc", "proc", flags); err != nil { return fmt.Errorf("error mounting proc in chroot: %v", err) } } else { if err := mountInChroot(chroot, "/proc", "/proc", "bind", unix.MS_BIND|unix.MS_RDONLY|unix.MS_REC); err != nil { return fmt.Errorf("error mounting proc in chroot: %v", err) } } if err := tpuProxyUpdateChroot(chroot, spec, conf); err != nil { return fmt.Errorf("error configuring chroot for TPU devices: %w", err) } if err := specutils.SafeMount("", chroot, "", unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_BIND, "", "/proc"); err != nil { return fmt.Errorf("error remounting chroot in read-only: %v", err) } return pivotRoot(chroot) } // Mount the path that dest points to for TPU at chroot, the mounted path is returned in absolute form. func mountTPUSyslinkInChroot(chroot, dest, relativePath string, validator func(link string) bool) (string, error) { src, err := os.Readlink(dest) if err != nil { return "", fmt.Errorf("error reading %v: %v", src, err) } // Ensure the link is in the form we expect. if !validator(src) { return "", fmt.Errorf("unexpected link %q -> %q", dest, src) } path, err := filepath.Abs(path.Join(filepath.Dir(dest), src, relativePath)) if err != nil { return "", fmt.Errorf("error parsing path %q: %v", src, err) } if err := mountInChroot(chroot, path, path, "bind", unix.MS_BIND|unix.MS_RDONLY); err != nil { return "", fmt.Errorf("error mounting %q in chroot: %v", dest, err) } return path, nil } func mountTPUDeviceInfoInChroot(chroot, devicePath, sysfsFormat, pciDeviceFormat string) error { deviceMinor, valid, err := util.ExtractTPUDeviceMinor(devicePath) if err != nil { return fmt.Errorf("extracting TPU device minor: %w", err) } if !valid { return nil } // Multiple paths link to the /sys/devices// // directory that contains all relevant sysfs accel/vfio device info that we need // bind mounted into the sandbox chroot. We can construct this path by // reading the link below, which points to // * /sys/devices///accel/accel# // * or /sys/devices///vfio-dev/vfio# for VFIO-based TPU // and traversing up 2 directories. // The sysDevicePath itself is a soft link to the device directory. sysDevicePath := fmt.Sprintf(sysfsFormat, deviceMinor) sysPCIDeviceDir, err := mountTPUSyslinkInChroot(chroot, sysDevicePath, "../..", func(link string) bool { sysDeviceLinkMatcher := regexp.MustCompile(fmt.Sprintf(pciDeviceFormat, deviceMinor)) return sysDeviceLinkMatcher.MatchString(link) }) if err != nil { return err } // Mount the device's IOMMU group if available. 
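// The iommu_group entry is a symlink of the form ../../../kernel/iommu_groups/<N>; it is validated against that pattern before being bind-mounted.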
iommuGroupPath := path.Join(sysPCIDeviceDir, "iommu_group") if _, err := os.Stat(iommuGroupPath); err == nil { if _, err := mountTPUSyslinkInChroot(chroot, iommuGroupPath, "", func(link string) bool { iommuGroupPathMatcher := regexp.MustCompile(`../../../kernel/iommu_groups/\d+`) return iommuGroupPathMatcher.MatchString(link) }); err != nil { return err } } return nil } func tpuProxyUpdateChroot(chroot string, spec *specs.Spec, conf *config.Config) error { if !specutils.TPUProxyIsEnabled(spec, conf) { return nil } // When a path glob is added to pathGlobToSysfsFormat, the corresponding pciDeviceFormat has to be added to pathGlobToPciDeviceFormat. pathGlobToSysfsFormat := map[string]string{ "/dev/accel*": "/sys/class/accel/accel%d", "/dev/vfio/*": "/sys/class/vfio-dev/vfio%d"} pathGlobToPciDeviceFormat := map[string]string{ "/dev/accel*": `../../devices/pci0000:[[:xdigit:]]{2}/(\d+:\d+:\d+\.\d+)/accel/accel%d`, "/dev/vfio/*": `../../devices/pci0000:[[:xdigit:]]{2}/(\d+:\d+:\d+\.\d+)/vfio-dev/vfio%d`} // Bind mount device info directories for all TPU devices on the host. // For v4 TPU, the directory /sys/devices///accel/accel# is mounted; // For v5e TPU, the directory /sys/devices///vfio-dev/vfio# is mounted. for pathGlob, sysfsFormat := range pathGlobToSysfsFormat { paths, err := filepath.Glob(pathGlob) if err != nil { return fmt.Errorf("enumerating TPU device files: %w", err) } for _, devPath := range paths { if err := mountTPUDeviceInfoInChroot(chroot, devPath, sysfsFormat, pathGlobToPciDeviceFormat[pathGlob]); err != nil { return err } } } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/cmd.go000066400000000000000000000067001465435605700216350ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package cmd holds implementations of the runsc commands. package cmd import ( "fmt" "os" "runtime" "strconv" "strings" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/specutils" ) // intFlags can be used with int flags that appear multiple times. It supports // comma-separated lists too. type intFlags []int // String implements flag.Value. func (i *intFlags) String() string { sInts := make([]string, 0, len(*i)) for _, fd := range *i { sInts = append(sInts, strconv.Itoa(fd)) } return strings.Join(sInts, ",") } // Get implements flag.Value. func (i *intFlags) Get() any { return i } // GetArray returns an array of ints representing FDs. func (i *intFlags) GetArray() []int { return *i } // GetFDs returns an array of *fd.FD. func (i *intFlags) GetFDs() []*fd.FD { rv := make([]*fd.FD, 0, len(*i)) for _, val := range *i { rv = append(rv, fd.New(val)) } return rv } // Set implements flag.Value. Set(String()) should be idempotent. 
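// For example, Set("3,4,5") appends 3, 4 and 5; values below -1 are rejected.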
func (i *intFlags) Set(s string) error { for _, sFD := range strings.Split(s, ",") { fd, err := strconv.Atoi(sFD) if err != nil { return fmt.Errorf("invalid flag value: %v", err) } if fd < -1 { return fmt.Errorf("flag value must be >= -1: %d", fd) } *i = append(*i, fd) } return nil } // setCapsAndCallSelf sets capabilities to the current thread and then execve's // itself again with the arguments specified in 'args' to restart the process // with the desired capabilities. func setCapsAndCallSelf(args []string, caps *specs.LinuxCapabilities) error { // Keep thread locked while capabilities are changed. runtime.LockOSThread() defer runtime.UnlockOSThread() if err := applyCaps(caps); err != nil { return fmt.Errorf("applyCaps() failed: %v", err) } binPath := specutils.ExePath log.Infof("Execve %q again, bye!", binPath) err := unix.Exec(binPath, args, os.Environ()) return fmt.Errorf("error executing %s: %v", binPath, err) } // callSelfAsNobody sets UID and GID to nobody and then execve's itself again. func callSelfAsNobody(args []string) error { // Keep thread locked while user/group are changed. runtime.LockOSThread() defer runtime.UnlockOSThread() const nobody = 65534 if _, _, err := unix.RawSyscall(unix.SYS_SETGID, uintptr(nobody), 0, 0); err != 0 { return fmt.Errorf("error setting uid: %v", err) } if _, _, err := unix.RawSyscall(unix.SYS_SETUID, uintptr(nobody), 0, 0); err != 0 { return fmt.Errorf("error setting gid: %v", err) } // Drop all capabilities. if err := applyCaps(&specs.LinuxCapabilities{}); err != nil { return fmt.Errorf("error dropping capabilities: %w", err) } binPath := specutils.ExePath log.Infof("Execve %q again, bye!", binPath) err := unix.Exec(binPath, args, os.Environ()) return fmt.Errorf("error executing %s: %v", binPath, err) } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/cmd_state_autogen.go000066400000000000000000000001311465435605700245470ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package cmd golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/cmd_unsafe_state_autogen.go000066400000000000000000000000651465435605700261160ustar00rootroot00000000000000// automatically generated by stateify. package cmd golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/create.go000066400000000000000000000075131465435605700223400ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" ) // Create implements subcommands.Command for the "create" command. type Create struct { // bundleDir is the path to the bundle directory (defaults to the // current working directory). bundleDir string // pidFile is the filename that the sandbox pid will be written to. // This file should only be created once the container process inside // the sandbox is ready to use. 
pidFile string // consoleSocket is the path to an AF_UNIX socket which will receive a // file descriptor referencing the master end of the console's // pseudoterminal. This is ignored unless spec.Process.Terminal is // true. consoleSocket string // userLog is the path to send user-visible logs to. This log is different // from debug logs. The former is meant to be consumed by the users and should // contain only information that is relevant to the person running the // container, e.g. unsupported syscalls, while the later is more verbose and // consumed by developers. userLog string } // Name implements subcommands.Command.Name. func (*Create) Name() string { return "create" } // Synopsis implements subcommands.Command.Synopsis. func (*Create) Synopsis() string { return "create a secure container" } // Usage implements subcommands.Command.Usage. func (*Create) Usage() string { return `create [flags] - create a secure container ` } // SetFlags implements subcommands.Command.SetFlags. func (c *Create) SetFlags(f *flag.FlagSet) { f.StringVar(&c.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.StringVar(&c.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") f.StringVar(&c.pidFile, "pid-file", "", "filename that the container pid will be written to") f.StringVar(&c.userLog, "user-log", "", "filename to send user-visible logs to. Empty means no logging.") } // Execute implements subcommands.Command.Execute. func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) if conf.Rootless { return util.Errorf("Rootless mode not supported with %q", c.Name()) } bundleDir := c.bundleDir if bundleDir == "" { bundleDir = getwdOrDie() } spec, err := specutils.ReadSpec(bundleDir, conf) if err != nil { return util.Errorf("reading spec: %v", err) } specutils.LogSpecDebug(spec, conf.OCISeccomp) // Create the container. A new sandbox will be created for the // container unless the metadata specifies that it should be run in an // existing container. contArgs := container.Args{ ID: id, Spec: spec, BundleDir: bundleDir, ConsoleSocket: c.consoleSocket, PIDFile: c.pidFile, UserLog: c.userLog, } if _, err := container.New(conf, contArgs); err != nil { return util.Errorf("creating container: %v", err) } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/debug.go000066400000000000000000000257141465435605700221660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
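// Illustrative invocations (IDs and paths are hypothetical):
//   runsc debug --stacks <container-id>
//   runsc debug --pid=1234 --profile-cpu=/tmp/cpu.pprof --duration=30s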
package cmd import ( "context" "os" "os/signal" "strconv" "strings" "sync" "time" "github.com/google/subcommands" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // Debug implements subcommands.Command for the "debug" command. type Debug struct { pid int stacks bool signal int profileBlock string profileCPU string profileHeap string profileMutex string trace string strace string logLevel string logPackets string delay time.Duration duration time.Duration ps bool mount string } // Name implements subcommands.Command. func (*Debug) Name() string { return "debug" } // Synopsis implements subcommands.Command. func (*Debug) Synopsis() string { return "shows a variety of debug information" } // Usage implements subcommands.Command. func (*Debug) Usage() string { return `debug [flags] ` } // SetFlags implements subcommands.Command. func (d *Debug) SetFlags(f *flag.FlagSet) { f.IntVar(&d.pid, "pid", 0, "sandbox process ID. Container ID is not necessary if this is set") f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log") f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.") f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.") f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.") f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.") f.DurationVar(&d.delay, "delay", time.Hour, "amount of time to delay for collecting heap and goroutine profiles.") f.DurationVar(&d.duration, "duration", time.Hour, "amount of time to wait for CPU and trace profiles.") f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.") f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox") f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all.`) f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).") f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.") f.BoolVar(&d.ps, "ps", false, "lists processes") f.StringVar(&d.mount, "mount", "", "Mount a filesystem (-mount fstype:source:destination).") } // Execute implements subcommands.Command.Execute. func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { var c *container.Container conf := args[0].(*config.Config) if conf.ProfileBlock != "" || conf.ProfileCPU != "" || conf.ProfileHeap != "" || conf.ProfileMutex != "" { return util.Errorf("global -profile-{block,cpu,heap,mutex} flags have no effect on runsc debug. Pass runsc debug -profile-{block,cpu,heap,mutex} instead") } if conf.TraceFile != "" { return util.Errorf("global -trace flag has no effect on runsc debug. Pass runsc debug -trace instead") } if d.pid == 0 { // No pid, container ID must have been provided. if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) var err error c, err = container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{SkipCheck: true}) if err != nil { return util.Errorf("loading container %q: %v", f.Arg(0), err) } } else { if f.NArg() != 0 { f.Usage() return subcommands.ExitUsageError } // Go over all sandboxes and find the one that matches PID. 
ids, err := container.ListSandboxes(conf.RootDir) if err != nil { return util.Errorf("listing containers: %v", err) } for _, id := range ids { candidate, err := container.Load(conf.RootDir, id, container.LoadOpts{Exact: true, SkipCheck: true}) if err != nil { log.Warningf("Skipping container %q: %v", id, err) continue } if candidate.SandboxPid() == d.pid { c = candidate break } } if c == nil { return util.Errorf("container with PID %d not found", d.pid) } } if !c.IsSandboxRunning() { return util.Errorf("container sandbox is not running") } util.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Getpid()) // Perform synchronous actions. if d.signal > 0 { pid := c.Sandbox.Getpid() util.Infof("Sending signal %d to process: %d", d.signal, pid) if err := unix.Kill(pid, unix.Signal(d.signal)); err != nil { return util.Errorf("failed to send signal %d to processs %d", d.signal, pid) } } if d.stacks { util.Infof("Retrieving sandbox stacks") stacks, err := c.Sandbox.Stacks() if err != nil { return util.Errorf("retrieving stacks: %v", err) } util.Infof(" *** Stack dump ***\n%s", stacks) } if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 { args := control.LoggingArgs{} switch strings.ToLower(d.strace) { case "": // strace not set, nothing to do here. case "off": util.Infof("Disabling strace") args.SetStrace = true case "all": util.Infof("Enabling all straces") args.SetStrace = true args.EnableStrace = true default: util.Infof("Enabling strace for syscalls: %s", d.strace) args.SetStrace = true args.EnableStrace = true args.StraceAllowlist = strings.Split(d.strace, ",") } if len(d.logLevel) != 0 { args.SetLevel = true switch strings.ToLower(d.logLevel) { case "warning", "0": args.Level = log.Warning case "info", "1": args.Level = log.Info case "debug", "2": args.Level = log.Debug default: return util.Errorf("invalid log level %q", d.logLevel) } util.Infof("Setting log level %v", args.Level) } if len(d.logPackets) != 0 { args.SetLogPackets = true lp, err := strconv.ParseBool(d.logPackets) if err != nil { return util.Errorf("invalid value for log_packets %q", d.logPackets) } args.LogPackets = lp if args.LogPackets { util.Infof("Enabling packet logging") } else { util.Infof("Disabling packet logging") } } if err := c.Sandbox.ChangeLogging(args); err != nil { return util.Errorf(err.Error()) } util.Infof("Logging options changed") } if d.ps { util.Infof("Retrieving process list") pList, err := c.Processes() if err != nil { util.Fatalf("getting processes for container: %v", err) } o, err := control.ProcessListToJSON(pList) if err != nil { util.Fatalf("generating JSON: %v", err) } util.Infof("%s", o) } if d.mount != "" { opts := strings.Split(d.mount, ":") if len(opts) != 3 { util.Fatalf("Mount failed: invalid option: %v", d.mount) } fstype := opts[0] src := opts[1] dest := opts[2] if err := c.Sandbox.Mount(c.ID, fstype, src, dest); err != nil { util.Fatalf(err.Error()) } } // Open profiling files. 
var ( blockFile *os.File cpuFile *os.File heapFile *os.File mutexFile *os.File traceFile *os.File ) if d.profileBlock != "" { f, err := os.OpenFile(d.profileBlock, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { return util.Errorf("error opening blocking profile output: %v", err) } defer f.Close() blockFile = f } if d.profileCPU != "" { f, err := os.OpenFile(d.profileCPU, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { return util.Errorf("error opening cpu profile output: %v", err) } defer f.Close() cpuFile = f } if d.profileHeap != "" { f, err := os.OpenFile(d.profileHeap, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { return util.Errorf("error opening heap profile output: %v", err) } defer f.Close() heapFile = f } if d.profileMutex != "" { f, err := os.OpenFile(d.profileMutex, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { return util.Errorf("error opening mutex profile output: %v", err) } defer f.Close() mutexFile = f } if d.trace != "" { f, err := os.OpenFile(d.trace, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { return util.Errorf("error opening trace profile output: %v", err) } traceFile = f } // Collect profiles. var ( wg sync.WaitGroup blockErr error cpuErr error heapErr error mutexErr error traceErr error ) if blockFile != nil { wg.Add(1) go func() { defer wg.Done() blockErr = c.Sandbox.BlockProfile(blockFile, d.duration) }() } if cpuFile != nil { wg.Add(1) go func() { defer wg.Done() cpuErr = c.Sandbox.CPUProfile(cpuFile, d.duration) }() } if heapFile != nil { wg.Add(1) go func() { defer wg.Done() heapErr = c.Sandbox.HeapProfile(heapFile, d.delay) }() } if mutexFile != nil { wg.Add(1) go func() { defer wg.Done() mutexErr = c.Sandbox.MutexProfile(mutexFile, d.duration) }() } if traceFile != nil { wg.Add(1) go func() { defer wg.Done() traceErr = c.Sandbox.Trace(traceFile, d.duration) }() } // Before sleeping, allow us to catch signals and try to exit // gracefully before just exiting. If we can't wait for wg, then // we will not be able to read the errors below safely. readyChan := make(chan struct{}) go func() { defer close(readyChan) wg.Wait() }() signals := make(chan os.Signal, 1) signal.Notify(signals, unix.SIGTERM, unix.SIGINT) select { case <-readyChan: break // Safe to proceed. case <-signals: util.Infof("caught signal, waiting at most one more second.") select { case <-signals: util.Infof("caught second signal, exiting immediately.") os.Exit(1) // Not finished. case <-time.After(time.Second): util.Infof("timeout, exiting.") os.Exit(1) // Not finished. case <-readyChan: break // Safe to proceed. } } // Collect all errors. errorCount := 0 if blockErr != nil { errorCount++ util.Infof("error collecting block profile: %v", blockErr) os.Remove(blockFile.Name()) } if cpuErr != nil { errorCount++ util.Infof("error collecting cpu profile: %v", cpuErr) os.Remove(cpuFile.Name()) } if heapErr != nil { errorCount++ util.Infof("error collecting heap profile: %v", heapErr) os.Remove(heapFile.Name()) } if mutexErr != nil { errorCount++ util.Infof("error collecting mutex profile: %v", mutexErr) os.Remove(mutexFile.Name()) } if traceErr != nil { errorCount++ util.Infof("error collecting trace profile: %v", traceErr) os.Remove(traceFile.Name()) } if errorCount > 0 { return subcommands.ExitFailure } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/delete.go000066400000000000000000000050501465435605700223310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "fmt" "os" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // Delete implements subcommands.Command for the "delete" command. type Delete struct { // force indicates that the container should be terminated if running. force bool } // Name implements subcommands.Command.Name. func (*Delete) Name() string { return "delete" } // Synopsis implements subcommands.Command.Synopsis. func (*Delete) Synopsis() string { return "delete resources held by a container" } // Usage implements subcommands.Command.Usage. func (*Delete) Usage() string { return `delete [flags] ` } // SetFlags implements subcommands.Command.SetFlags. func (d *Delete) SetFlags(f *flag.FlagSet) { f.BoolVar(&d.force, "force", false, "terminate container if running") } // Execute implements subcommands.Command.Execute. func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() == 0 { f.Usage() return subcommands.ExitUsageError } conf := args[0].(*config.Config) if err := d.execute(f.Args(), conf); err != nil { util.Fatalf("%v", err) } return subcommands.ExitSuccess } func (d *Delete) execute(ids []string, conf *config.Config) error { for _, id := range ids { c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { if os.IsNotExist(err) && d.force { log.Warningf("couldn't find container %q: %v", id, err) return nil } return fmt.Errorf("loading container %q: %v", id, err) } if !d.force && c.Status != container.Created && c.Status != container.Stopped { return fmt.Errorf("cannot delete container that is not stopped without --force flag") } if err := c.Destroy(); err != nil { return fmt.Errorf("destroying container: %v", err) } } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/do.go000066400000000000000000000335511465435605700215000ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
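// Illustrative invocation: `runsc do ls /` runs `ls /` inside a sandbox with the host filesystem mounted read-only under a writable tmpfs overlay.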
package cmd import ( "context" "encoding/json" "errors" "fmt" "io/ioutil" "math/rand" "net" "os" "os/exec" "path/filepath" "strconv" "strings" "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/console" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" ) var errNoDefaultInterface = errors.New("no default interface found") // Do implements subcommands.Command for the "do" command. It sets up a simple // sandbox and executes the command inside it. See Usage() for more details. type Do struct { root string cwd string ip string quiet bool overlay bool uidMap idMapSlice gidMap idMapSlice } // Name implements subcommands.Command.Name. func (*Do) Name() string { return "do" } // Synopsis implements subcommands.Command.Synopsis. func (*Do) Synopsis() string { return "Simplistic way to execute a command inside the sandbox. It's to be used for testing only." } // Usage implements subcommands.Command.Usage. func (*Do) Usage() string { return `do [flags] - runs a command. This command starts a sandbox with host filesystem mounted inside as readonly, with a writable tmpfs overlay on top of it. The given command is executed inside the sandbox. It's to be used to quickly test applications without having to install or run docker. It doesn't give nearly as many options and it's to be used for testing only. ` } type idMapSlice []specs.LinuxIDMapping // String implements flag.Value.String. func (is *idMapSlice) String() string { idMappings := make([]string, 0, len(*is)) for _, m := range *is { idMappings = append(idMappings, fmt.Sprintf("%d %d %d", m.ContainerID, m.HostID, m.Size)) } return strings.Join(idMappings, ",") } // Get implements flag.Value.Get. func (is *idMapSlice) Get() any { return is } // Set implements flag.Value.Set. Set(String()) should be idempotent. func (is *idMapSlice) Set(s string) error { for _, idMap := range strings.Split(s, ",") { fs := strings.Fields(idMap) if len(fs) != 3 { return fmt.Errorf("invalid mapping: %s", idMap) } var cid, hid, size int var err error if cid, err = strconv.Atoi(fs[0]); err != nil { return fmt.Errorf("invalid mapping: %s", idMap) } if hid, err = strconv.Atoi(fs[1]); err != nil { return fmt.Errorf("invalid mapping: %s", idMap) } if size, err = strconv.Atoi(fs[2]); err != nil { return fmt.Errorf("invalid mapping: %s", idMap) } m := specs.LinuxIDMapping{ ContainerID: uint32(cid), HostID: uint32(hid), Size: uint32(size), } *is = append(*is, m) } return nil } // SetFlags implements subcommands.Command.SetFlags. func (c *Do) SetFlags(f *flag.FlagSet) { f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`) f.StringVar(&c.cwd, "cwd", ".", "path to the current directory, defaults to the current directory") f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox") f.BoolVar(&c.quiet, "quiet", false, "suppress runsc messages to stdout. Application output is still sent to stdout and stderr") f.BoolVar(&c.overlay, "force-overlay", true, "use an overlay. WARNING: disabling gives the command write access to the host") f.Var(&c.uidMap, "uid-map", "Add a user id mapping [ContainerID, HostID, Size]") f.Var(&c.gidMap, "gid-map", "Add a group id mapping [ContainerID, HostID, Size]") } // Execute implements subcommands.Command.Execute. 
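// Editor's note (illustrative sketch, not part of the original source): the
// -uid-map/-gid-map flags registered above are parsed by idMapSlice.Set,
// which expects one or more space-separated "ContainerID HostID Size"
// triples, joined by commas. For example:
//
//	var m idMapSlice
//	_ = m.Set("0 1000 1,1 100000 65536")
//	// m now holds {ContainerID: 0, HostID: 1000, Size: 1} and
//	// {ContainerID: 1, HostID: 100000, Size: 65536}.
//	// m.String() == "0 1000 1,1 100000 65536"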
func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if len(f.Args()) == 0 { f.Usage() return subcommands.ExitUsageError } conf := args[0].(*config.Config) waitStatus := args[1].(*unix.WaitStatus) if conf.Rootless { if err := specutils.MaybeRunAsRoot(); err != nil { return util.Errorf("Error executing inside namespace: %v", err) } // Execution will continue here if no more capabilities are needed... } hostname, err := os.Hostname() if err != nil { return util.Errorf("Error to retrieve hostname: %v", err) } // If c.overlay is set, then enable overlay. conf.Overlay = false // conf.Overlay is deprecated. if c.overlay { conf.Overlay2.Set("all:memory") } else { conf.Overlay2.Set("none") } absRoot, err := resolvePath(c.root) if err != nil { return util.Errorf("Error resolving root: %v", err) } absCwd, err := resolvePath(c.cwd) if err != nil { return util.Errorf("Error resolving current directory: %v", err) } spec := &specs.Spec{ Root: &specs.Root{ Path: absRoot, }, Process: &specs.Process{ Cwd: absCwd, Args: f.Args(), Env: os.Environ(), Capabilities: specutils.AllCapabilities(), Terminal: console.IsPty(os.Stdin.Fd()), }, Hostname: hostname, } cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000)) if c.uidMap != nil || c.gidMap != nil { addNamespace(spec, specs.LinuxNamespace{Type: specs.UserNamespace}) spec.Linux.UIDMappings = c.uidMap spec.Linux.GIDMappings = c.gidMap } if conf.Network == config.NetworkNone { addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace}) } else if conf.Rootless { if conf.Network == config.NetworkSandbox { c.notifyUser("*** Warning: sandbox network isn't supported with --rootless, switching to host ***") conf.Network = config.NetworkHost } } else { switch clean, err := c.setupNet(cid, spec); err { case errNoDefaultInterface: log.Warningf("Network interface not found, using internal network") addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace}) conf.Network = config.NetworkHost case nil: // Setup successfull. defer clean() default: return util.Errorf("Error setting up network: %v", err) } } return startContainerAndWait(spec, conf, cid, waitStatus) } func addNamespace(spec *specs.Spec, ns specs.LinuxNamespace) { if spec.Linux == nil { spec.Linux = &specs.Linux{} } spec.Linux.Namespaces = append(spec.Linux.Namespaces, ns) } func (c *Do) notifyUser(format string, v ...any) { if !c.quiet { fmt.Printf(format+"\n", v...) } log.Warningf(format, v...) } func resolvePath(path string) (string, error) { var err error path, err = filepath.Abs(path) if err != nil { return "", fmt.Errorf("resolving %q: %v", path, err) } path = filepath.Clean(path) if err := unix.Access(path, 0); err != nil { return "", fmt.Errorf("unable to access %q: %v", path, err) } return path, nil } // setupNet setups up the sandbox network, including the creation of a network // namespace, and iptable rules to redirect the traffic. Returns a cleanup // function to tear down the network. Returns errNoDefaultInterface when there // is no network interface available to setup the network. func (c *Do) setupNet(cid string, spec *specs.Spec) (func(), error) { dev, err := defaultDevice() if err != nil { return nil, errNoDefaultInterface } mtu, err := deviceMTU(dev) if err != nil { return nil, err } peerIP, err := calculatePeerIP(c.ip) if err != nil { return nil, err } veth, peer := deviceNames(cid) cmds := []string{ fmt.Sprintf("ip link add %s mtu %v type veth peer name %s", veth, mtu, peer), // Setup device outside the namespace. 
fmt.Sprintf("ip addr add %s/24 dev %s", peerIP, peer), fmt.Sprintf("ip link set %s up", peer), // Setup device inside the namespace. fmt.Sprintf("ip netns add %s", cid), fmt.Sprintf("ip link set %s netns %s", veth, cid), fmt.Sprintf("ip netns exec %s ip addr add %s/24 dev %s", cid, c.ip, veth), fmt.Sprintf("ip netns exec %s ip link set %s up", cid, veth), fmt.Sprintf("ip netns exec %s ip link set lo up", cid), fmt.Sprintf("ip netns exec %s ip route add default via %s", cid, peerIP), // Enable network access. "sysctl -w net.ipv4.ip_forward=1", fmt.Sprintf("iptables -t nat -A POSTROUTING -s %s -o %s -m comment --comment runsc-%s -j MASQUERADE", c.ip, dev, peer), fmt.Sprintf("iptables -A FORWARD -i %s -o %s -j ACCEPT", dev, peer), fmt.Sprintf("iptables -A FORWARD -o %s -i %s -j ACCEPT", dev, peer), } for _, cmd := range cmds { log.Debugf("Run %q", cmd) args := strings.Split(cmd, " ") cmd := exec.Command(args[0], args[1:]...) if err := cmd.Run(); err != nil { c.cleanupNet(cid, dev, "", "", "") return nil, fmt.Errorf("failed to run %q: %v", cmd, err) } } resolvPath, err := makeFile("/etc/resolv.conf", "nameserver 8.8.8.8\n", spec) if err != nil { c.cleanupNet(cid, dev, "", "", "") return nil, err } hostnamePath, err := makeFile("/etc/hostname", cid+"\n", spec) if err != nil { c.cleanupNet(cid, dev, resolvPath, "", "") return nil, err } hosts := fmt.Sprintf("127.0.0.1\tlocalhost\n%s\t%s\n", c.ip, cid) hostsPath, err := makeFile("/etc/hosts", hosts, spec) if err != nil { c.cleanupNet(cid, dev, resolvPath, hostnamePath, "") return nil, err } netns := specs.LinuxNamespace{ Type: specs.NetworkNamespace, Path: filepath.Join("/var/run/netns", cid), } addNamespace(spec, netns) return func() { c.cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath) }, nil } // cleanupNet tries to cleanup the network setup in setupNet. // // It may be called when setupNet is only partially complete, in which case it // will cleanup as much as possible, logging warnings for the rest. // // Unfortunately none of this can be automatically cleaned up on process exit, // we must do so explicitly. func (c *Do) cleanupNet(cid, dev, resolvPath, hostnamePath, hostsPath string) { _, peer := deviceNames(cid) cmds := []string{ fmt.Sprintf("ip link delete %s", peer), fmt.Sprintf("ip netns delete %s", cid), fmt.Sprintf("iptables -t nat -D POSTROUTING -s %s -o %s -m comment --comment runsc-%s -j MASQUERADE", c.ip, dev, peer), fmt.Sprintf("iptables -D FORWARD -i %s -o %s -j ACCEPT", dev, peer), fmt.Sprintf("iptables -D FORWARD -o %s -i %s -j ACCEPT", dev, peer), } for _, cmd := range cmds { log.Debugf("Run %q", cmd) args := strings.Split(cmd, " ") c := exec.Command(args[0], args[1:]...) if err := c.Run(); err != nil { log.Warningf("Failed to run %q: %v", cmd, err) } } tryRemove(resolvPath) tryRemove(hostnamePath) tryRemove(hostsPath) } func deviceNames(cid string) (string, string) { // Device name is limited to 15 letters. 
return "ve-" + cid, "vp-" + cid } func defaultDevice() (string, error) { out, err := exec.Command("ip", "route", "list", "default").CombinedOutput() if err != nil { return "", err } parts := strings.Split(string(out), " ") if len(parts) < 5 { return "", fmt.Errorf("malformed %q output: %q", "ip route list default", string(out)) } return parts[4], nil } func deviceMTU(dev string) (int, error) { intf, err := net.InterfaceByName(dev) if err != nil { return 0, err } return intf.MTU, nil } func makeFile(dest, content string, spec *specs.Spec) (string, error) { tmpFile, err := ioutil.TempFile("", filepath.Base(dest)) if err != nil { return "", err } if _, err := tmpFile.WriteString(content); err != nil { if err := os.Remove(tmpFile.Name()); err != nil { log.Warningf("Failed to remove %q: %v", tmpFile, err) } return "", err } spec.Mounts = append(spec.Mounts, specs.Mount{ Source: tmpFile.Name(), Destination: dest, Type: "bind", Options: []string{"ro"}, }) return tmpFile.Name(), nil } func tryRemove(path string) { if path == "" { return } if err := os.Remove(path); err != nil { log.Warningf("Failed to remove %q: %v", path, err) } } func calculatePeerIP(ip string) (string, error) { parts := strings.Split(ip, ".") if len(parts) != 4 { return "", fmt.Errorf("invalid IP format %q", ip) } n, err := strconv.Atoi(parts[3]) if err != nil { return "", fmt.Errorf("invalid IP format %q: %v", ip, err) } n++ if n > 255 { n = 1 } return fmt.Sprintf("%s.%s.%s.%d", parts[0], parts[1], parts[2], n), nil } func startContainerAndWait(spec *specs.Spec, conf *config.Config, cid string, waitStatus *unix.WaitStatus) subcommands.ExitStatus { specutils.LogSpecDebug(spec, conf.OCISeccomp) out, err := json.Marshal(spec) if err != nil { return util.Errorf("Error to marshal spec: %v", err) } tmpDir, err := ioutil.TempDir("", "runsc-do") if err != nil { return util.Errorf("Error to create tmp dir: %v", err) } defer os.RemoveAll(tmpDir) log.Infof("Changing configuration RootDir to %q", tmpDir) conf.RootDir = tmpDir cfgPath := filepath.Join(tmpDir, "config.json") if err := ioutil.WriteFile(cfgPath, out, 0755); err != nil { return util.Errorf("Error write spec: %v", err) } containerArgs := container.Args{ ID: cid, Spec: spec, BundleDir: tmpDir, Attached: true, } ct, err := container.New(conf, containerArgs) if err != nil { return util.Errorf("creating container: %v", err) } defer ct.Destroy() if err := ct.Start(conf); err != nil { return util.Errorf("starting container: %v", err) } // Forward signals to init in the container. Thus if we get SIGINT from // ^C, the container gracefully exit, and we can clean up. // // N.B. There is a still a window before this where a signal may kill // this process, skipping cleanup. stopForwarding := ct.ForwardSignals(0 /* pid */, spec.Process.Terminal /* fgProcess */) defer stopForwarding() ws, err := ct.Wait() if err != nil { return util.Errorf("waiting for container: %v", err) } *waitStatus = ws return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/events.go000066400000000000000000000062161465435605700224000ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "encoding/json" "os" "time" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // Events implements subcommands.Command for the "events" command. type Events struct { // The interval between stats reporting. intervalSec int // If true, events will print a single group of stats and exit. stats bool } // Name implements subcommands.Command.Name. func (*Events) Name() string { return "events" } // Synopsis implements subcommands.Command.Synopsis. func (*Events) Synopsis() string { return "display container events such as OOM notifications, cpu, memory, and IO usage statistics" } // Usage implements subcommands.Command.Usage. func (*Events) Usage() string { return ` Where "" is the name for the instance of the container. The events command displays information about the container. By default the information is displayed once every 5 seconds. OPTIONS: ` } // SetFlags implements subcommands.Command.SetFlags. func (evs *Events) SetFlags(f *flag.FlagSet) { f.IntVar(&evs.intervalSec, "interval", 5, "set the stats collection interval, in seconds") f.BoolVar(&evs.stats, "stats", false, "display the container's stats then exit") } // Execute implements subcommands.Command.Execute. func (evs *Events) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { util.Fatalf("loading sandbox: %v", err) } // Repeatedly get stats from the container. Sleep a bit after every loop // except the first one. for dur := time.Duration(evs.intervalSec) * time.Second; true; time.Sleep(dur) { // Get the event and print it as JSON. ev, err := c.Event() if err != nil { log.Warningf("Error getting events for container: %v", err) if evs.stats { return subcommands.ExitFailure } continue } log.Debugf("Events: %+v", ev) if err := json.NewEncoder(os.Stdout).Encode(ev.Event); err != nil { log.Warningf("Error encoding event %+v: %v", ev.Event, err) if evs.stats { return subcommands.ExitFailure } continue } // Break if we're only running once. If we got this far it was a success. if evs.stats { return subcommands.ExitSuccess } } panic("should never get here") } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/exec.go000066400000000000000000000402321465435605700220140ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "encoding/json" "fmt" "io/ioutil" "os" "os/exec" "path/filepath" "strconv" "strings" "time" "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/console" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" ) // Exec implements subcommands.Command for the "exec" command. type Exec struct { cwd string env stringSlice // user contains the UID and GID with which to run the new process. user user extraKGIDs stringSlice caps stringSlice detach bool processPath string pidFile string internalPidFile string // consoleSocket is the path to an AF_UNIX socket which will receive a // file descriptor referencing the master end of the console's // pseudoterminal. consoleSocket string // passFDs are user-supplied FDs from the host to be exposed to the // sandboxed app. passFDs fdMappings // execFD is the host file descriptor used for program execution. execFD int } // Name implements subcommands.Command.Name. func (*Exec) Name() string { return "exec" } // Synopsis implements subcommands.Command.Synopsis. func (*Exec) Synopsis() string { return "execute new process inside the container" } // Usage implements subcommands.Command.Usage. func (*Exec) Usage() string { return `exec [command options] [command options] || --process process.json Where "" is the name for the instance of the container and "" is the command to be executed in the container. "" can't be empty unless a "-process" flag provided. EXAMPLE: If the container is configured to run /bin/ps the following will output a list of processes running in the container: # runc exec ps OPTIONS: ` } // SetFlags implements subcommands.Command.SetFlags. func (ex *Exec) SetFlags(f *flag.FlagSet) { f.StringVar(&ex.cwd, "cwd", "", "current working directory") f.Var(&ex.env, "env", "set environment variables (e.g. '-env PATH=/bin -env TERM=xterm')") f.Var(&ex.user, "user", "UID (format: [:])") f.Var(&ex.extraKGIDs, "additional-gids", "additional gids") f.Var(&ex.caps, "cap", "add a capability to the bounding set for the process") f.BoolVar(&ex.detach, "detach", false, "detach from the container's process") f.StringVar(&ex.processPath, "process", "", "path to the process.json") f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to") f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to") f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") f.Var(&ex.passFDs, "pass-fd", "file descriptor passed to the container in M:N format, where M is the host and N is the guest descriptor (can be supplied multiple times)") f.IntVar(&ex.execFD, "exec-fd", -1, "host file descriptor used for program execution") } // Execute implements subcommands.Command.Execute. It starts a process in an // already created container. 
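// Editor's note (illustrative usage sketch, not part of the original source):
// combining the flags registered above, a typical invocation might look like
//
//	runsc exec -cwd /tmp -env FOO=bar -user 1000:1000 <container-id> /bin/sh -c "id"
//
// or, with a pre-built process spec:
//
//	runsc exec -process /path/to/process.json <container-id>
//
// The container ID and paths here are placeholders.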
func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { conf := args[0].(*config.Config) e, id, err := ex.parseArgs(f, conf.EnableRaw) if err != nil { util.Fatalf("parsing process spec: %v", err) } waitStatus := args[1].(*unix.WaitStatus) c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { util.Fatalf("loading sandbox: %v", err) } log.Debugf("Exec arguments: %+v", e) log.Debugf("Exec capabilities: %+v", e.Capabilities) // Replace empty settings with defaults from container. if e.WorkingDirectory == "" { e.WorkingDirectory = c.Spec.Process.Cwd } if e.Envv == nil { e.Envv, err = specutils.ResolveEnvs(c.Spec.Process.Env, ex.env) if err != nil { util.Fatalf("getting environment variables: %v", err) } } if e.Capabilities == nil { e.Capabilities, err = specutils.Capabilities(conf.EnableRaw, c.Spec.Process.Capabilities) if err != nil { util.Fatalf("creating capabilities: %v", err) } log.Infof("Using exec capabilities from container: %+v", e.Capabilities) } // Create the file descriptor map for the process in the container. fdMap := map[int]*os.File{ 0: os.Stdin, 1: os.Stdout, 2: os.Stderr, } // Add custom file descriptors to the map. for _, mapping := range ex.passFDs { file := os.NewFile(uintptr(mapping.Host), "") if file == nil { util.Fatalf("failed to create file from file descriptor %d", mapping.Host) } fdMap[mapping.Guest] = file } var execFile *os.File if ex.execFD >= 0 { execFile = os.NewFile(uintptr(ex.execFD), "exec-fd") } // Close the underlying file descriptors after we have passed them. defer func() { for _, file := range fdMap { fd := file.Fd() if file.Close() != nil { log.Debugf("Failed to close FD %d", fd) } } if execFile != nil && execFile.Close() != nil { log.Debugf("Failed to close exec FD") } }() e.FilePayload = control.NewFilePayload(fdMap, execFile) // containerd expects an actual process to represent the container being // executed. If detach was specified, starts a child in non-detach mode, // write the child's PID to the pid file. So when the container returns, the // child process will also return and signal containerd. if ex.detach { return ex.execChildAndWait(waitStatus) } return ex.exec(conf, c, e, waitStatus) } func (ex *Exec) exec(conf *config.Config, c *container.Container, e *control.ExecArgs, waitStatus *unix.WaitStatus) subcommands.ExitStatus { // Start the new process and get its pid. pid, err := c.Execute(conf, e) if err != nil { return util.Errorf("executing processes for container: %v", err) } if e.StdioIsPty { // Forward signals sent to this process to the foreground // process in the sandbox. stopForwarding := c.ForwardSignals(pid, true /* fgProcess */) defer stopForwarding() } // Write the sandbox-internal pid if required. if ex.internalPidFile != "" { pidStr := []byte(strconv.Itoa(int(pid))) if err := ioutil.WriteFile(ex.internalPidFile, pidStr, 0644); err != nil { return util.Errorf("writing internal pid file %q: %v", ex.internalPidFile, err) } } // Generate the pid file after the internal pid file is generated, so that // users can safely assume that the internal pid file is ready after // `runsc exec -d` returns. if ex.pidFile != "" { if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil { return util.Errorf("writing pid file: %v", err) } } // Wait for the process to exit. 
ws, err := c.WaitPID(pid) if err != nil { return util.Errorf("waiting on pid %d: %v", pid, err) } *waitStatus = ws return subcommands.ExitSuccess } func (ex *Exec) execChildAndWait(waitStatus *unix.WaitStatus) subcommands.ExitStatus { var args []string for _, a := range os.Args[1:] { if !strings.Contains(a, "detach") { args = append(args, a) } } // The command needs to write a pid file so that execChildAndWait can tell // when it has started. If no pid-file was provided, we should use a // filename in a temp directory. pidFile := ex.pidFile if pidFile == "" { tmpDir, err := ioutil.TempDir("", "exec-pid-") if err != nil { util.Fatalf("creating TempDir: %v", err) } defer os.RemoveAll(tmpDir) pidFile = filepath.Join(tmpDir, "pid") args = append(args, "--pid-file="+pidFile) } cmd := exec.Command(specutils.ExePath, args...) cmd.Args[0] = "runsc-exec" // Exec stdio defaults to current process stdio. cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr // If the console control socket file is provided, then create a new // pty master/replica pair and set the TTY on the sandbox process. if ex.consoleSocket != "" { // Create a new TTY pair and send the master on the provided socket. tty, err := console.NewWithSocket(ex.consoleSocket) if err != nil { util.Fatalf("setting up console with socket %q: %v", ex.consoleSocket, err) } defer tty.Close() // Set stdio to the new TTY replica. cmd.Stdin = tty cmd.Stdout = tty cmd.Stderr = tty cmd.SysProcAttr = &unix.SysProcAttr{ Setsid: true, Setctty: true, // The Ctty FD must be the FD in the child process's FD // table. Since we set cmd.Stdin/Stdout/Stderr to the // tty FD, we can use any of 0, 1, or 2 here. // See https://github.com/golang/go/issues/29458. Ctty: 0, } } if err := cmd.Start(); err != nil { util.Fatalf("failure to start child exec process, err: %v", err) } log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, specutils.ExePath, args) // Wait for PID file to ensure that child process has started. Otherwise, // '--process' file is deleted as soon as this process returns and the child // may fail to read it. ready := func() (bool, error) { pidb, err := ioutil.ReadFile(pidFile) if err == nil { // File appeared, check whether pid is fully written. pid, err := strconv.Atoi(string(pidb)) if err != nil { return false, nil } return pid == cmd.Process.Pid, nil } if pe, ok := err.(*os.PathError); !ok || pe.Err != unix.ENOENT { return false, err } // No file yet, continue to wait... return false, nil } if err := specutils.WaitForReady(cmd.Process.Pid, 10*time.Second, ready); err != nil { // Don't log fatal error here, otherwise it will override the error logged // by the child process that has failed to start. log.Warningf("Unexpected error waiting for PID file, err: %v", err) return subcommands.ExitFailure } *waitStatus = 0 return subcommands.ExitSuccess } // parseArgs parses exec information from the command line or a JSON file // depending on whether the --process flag was used. Returns an ExecArgs and // the ID of the container to be used. func (ex *Exec) parseArgs(f *flag.FlagSet, enableRaw bool) (*control.ExecArgs, string, error) { if ex.processPath == "" { // Requires at least a container ID and command. if f.NArg() < 2 { f.Usage() return nil, "", fmt.Errorf("both a container-id and command are required") } e, err := ex.argsFromCLI(f.Args()[1:], enableRaw) return e, f.Arg(0), err } // Requires only the container ID. 
if f.NArg() != 1 { f.Usage() return nil, "", fmt.Errorf("a container-id is required") } e, err := ex.argsFromProcessFile(enableRaw) return e, f.Arg(0), err } func (ex *Exec) argsFromCLI(argv []string, enableRaw bool) (*control.ExecArgs, error) { extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs)) for _, s := range ex.extraKGIDs { kgid, err := strconv.Atoi(s) if err != nil { util.Fatalf("parsing GID: %s, %v", s, err) } extraKGIDs = append(extraKGIDs, auth.KGID(kgid)) } var caps *auth.TaskCapabilities if len(ex.caps) > 0 { var err error caps, err = capabilities(ex.caps, enableRaw) if err != nil { return nil, fmt.Errorf("capabilities error: %v", err) } } return &control.ExecArgs{ Argv: argv, WorkingDirectory: ex.cwd, KUID: ex.user.kuid, KGID: ex.user.kgid, ExtraKGIDs: extraKGIDs, Capabilities: caps, StdioIsPty: ex.consoleSocket != "" || console.IsPty(os.Stdin.Fd()), FilePayload: control.NewFilePayload(map[int]*os.File{ 0: os.Stdin, 1: os.Stdout, 2: os.Stderr, }, nil), }, nil } func (ex *Exec) argsFromProcessFile(enableRaw bool) (*control.ExecArgs, error) { f, err := os.Open(ex.processPath) if err != nil { return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, err) } defer f.Close() var p specs.Process if err := json.NewDecoder(f).Decode(&p); err != nil { return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, err) } return argsFromProcess(&p, enableRaw) } // argsFromProcess performs all the non-IO conversion from the Process struct // to ExecArgs. func argsFromProcess(p *specs.Process, enableRaw bool) (*control.ExecArgs, error) { // Create capabilities. var caps *auth.TaskCapabilities if p.Capabilities != nil { var err error // Starting from Docker 19, capabilities are explicitly set for exec (instead // of nil like before). So we can't distinguish 'exec' from // 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter // CAP_NET_RAW in the same way as container start. caps, err = specutils.Capabilities(enableRaw, p.Capabilities) if err != nil { return nil, fmt.Errorf("error creating capabilities: %v", err) } } // Convert the spec's additional GIDs to KGIDs. extraKGIDs := make([]auth.KGID, 0, len(p.User.AdditionalGids)) for _, GID := range p.User.AdditionalGids { extraKGIDs = append(extraKGIDs, auth.KGID(GID)) } return &control.ExecArgs{ Argv: p.Args, Envv: p.Env, WorkingDirectory: p.Cwd, KUID: auth.KUID(p.User.UID), KGID: auth.KGID(p.User.GID), ExtraKGIDs: extraKGIDs, Capabilities: caps, StdioIsPty: p.Terminal, FilePayload: control.NewFilePayload(map[int]*os.File{ 0: os.Stdin, 1: os.Stdout, 2: os.Stderr, }, nil), }, nil } // capabilities takes a list of capabilities as strings and returns an // auth.TaskCapabilities struct with those capabilities in every capability set. // This mimics runc's behavior. func capabilities(cs []string, enableRaw bool) (*auth.TaskCapabilities, error) { var specCaps specs.LinuxCapabilities for _, cap := range cs { specCaps.Ambient = append(specCaps.Ambient, cap) specCaps.Bounding = append(specCaps.Bounding, cap) specCaps.Effective = append(specCaps.Effective, cap) specCaps.Inheritable = append(specCaps.Inheritable, cap) specCaps.Permitted = append(specCaps.Permitted, cap) } // Starting from Docker 19, capabilities are explicitly set for exec (instead // of nil like before). So we can't distinguish 'exec' from // 'exec --privileged', as both specify CAP_NET_RAW. Therefore, filter // CAP_NET_RAW in the same way as container start. 
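// Editor's note (illustrative, not part of the original source): given the
// loop above, a hypothetical call such as
//
//	caps, err := capabilities([]string{"CAP_NET_ADMIN"}, false /* enableRaw */)
//
// builds a specs.LinuxCapabilities whose Ambient, Bounding, Effective,
// Inheritable and Permitted sets all contain CAP_NET_ADMIN, which is then
// converted by specutils.Capabilities below (and, per the comment above, may
// have CAP_NET_RAW filtered when enableRaw is false).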
return specutils.Capabilities(enableRaw, &specCaps) } // stringSlice allows a flag to be used multiple times, where each occurrence // adds a value to the flag. For example, a flag called "x" could be invoked // via "runsc exec -x foo -x bar", and the corresponding stringSlice would be // {"x", "y"}. type stringSlice []string // String implements flag.Value.String. func (ss *stringSlice) String() string { return strings.Join(*ss, ",") } // Get implements flag.Value.Get. func (ss *stringSlice) Get() any { return ss } // Set implements flag.Value.Set. Set(String()) should be idempotent. func (ss *stringSlice) Set(s string) error { *ss = append(*ss, strings.Split(s, ",")...) return nil } // user allows -user to convey a UID and, optionally, a GID separated by a // colon. type user struct { kuid auth.KUID kgid auth.KGID } // String implements flag.Value.String. func (u *user) String() string { return fmt.Sprintf("%d:%d", u.kuid, u.kgid) } // Get implements flag.Value.Get. func (u *user) Get() any { return u } // Set implements flag.Value.Set. Set(String()) should be idempotent. func (u *user) Set(s string) error { parts := strings.SplitN(s, ":", 2) kuid, err := strconv.Atoi(parts[0]) if err != nil { return fmt.Errorf("couldn't parse UID: %s", parts[0]) } u.kuid = auth.KUID(kuid) if len(parts) > 1 { kgid, err := strconv.Atoi(parts[1]) if err != nil { return fmt.Errorf("couldn't parse GID: %s", parts[1]) } u.kgid = auth.KGID(kgid) } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/fd_mapping.go000066400000000000000000000047131465435605700232000ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "fmt" "strconv" "strings" "gvisor.dev/gvisor/runsc/boot" ) // fdMappings can be used with flags that appear multiple times. type fdMappings []boot.FDMapping // String implements flag.Value. func (i *fdMappings) String() string { var mappings []string for _, m := range *i { mappings = append(mappings, fmt.Sprintf("%v:%v", m.Host, m.Guest)) } return strings.Join(mappings, ",") } // Get implements flag.Value. func (i *fdMappings) Get() any { return i } // GetArray returns an array of mappings. func (i *fdMappings) GetArray() []boot.FDMapping { return *i } // Set implements flag.Value and appends a mapping from the command line to the // mappings array. Set(String()) should be idempotent. func (i *fdMappings) Set(s string) error { for _, m := range strings.Split(s, ",") { split := strings.Split(m, ":") if len(split) != 2 { // Split returns a slice of length 1 if its first argument does not // contain the separator. An additional length check is not necessary. // In case no separator is used and the argument is a valid integer, we // assume that host FD and guest FD should be identical. 
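// Editor's note (illustrative, not part of the original source): accepted
// -pass-fd values therefore come in both forms, e.g.
//
//	var m fdMappings
//	_ = m.Set("7")       // host FD 7 appears as guest FD 7
//	_ = m.Set("3:5,9:9") // host FD 3 as guest FD 5, host FD 9 as guest FD 9
//	// m.String() == "7:7,3:5,9:9"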
fd, err := strconv.Atoi(split[0]) if err != nil { return fmt.Errorf("invalid flag value: must be an integer or a mapping of format M:N") } *i = append(*i, boot.FDMapping{ Host: fd, Guest: fd, }) return nil } fdHost, err := strconv.Atoi(split[0]) if err != nil { return fmt.Errorf("invalid flag host value: %v", err) } if fdHost < 0 { return fmt.Errorf("flag host value must be >= 0: %d", fdHost) } fdGuest, err := strconv.Atoi(split[1]) if err != nil { return fmt.Errorf("invalid flag guest value: %v", err) } if fdGuest < 0 { return fmt.Errorf("flag guest value must be >= 0: %d", fdGuest) } *i = append(*i, boot.FDMapping{ Host: fdHost, Guest: fdGuest, }) } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/gofer.go000066400000000000000000000656731465435605700222120ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "encoding/json" "fmt" "io" "os" "path/filepath" "regexp" "runtime" "runtime/debug" "strings" "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/fsgofer" "gvisor.dev/gvisor/runsc/fsgofer/filter" "gvisor.dev/gvisor/runsc/profile" "gvisor.dev/gvisor/runsc/specutils" ) var caps = []string{ "CAP_CHOWN", "CAP_DAC_OVERRIDE", "CAP_DAC_READ_SEARCH", "CAP_FOWNER", "CAP_FSETID", "CAP_SYS_CHROOT", } // goferCaps is the minimal set of capabilities needed by the Gofer to operate // on files. var goferCaps = &specs.LinuxCapabilities{ Bounding: caps, Effective: caps, Permitted: caps, } // goferSyncFDs contains file descriptors that are used for synchronization // of the Gofer startup process against other processes. type goferSyncFDs struct { // nvproxyFD is a file descriptor that is used to wait until // nvproxy-related setup is done. This setup involves creating mounts in the // Gofer process's mount namespace. // If this is set, this FD is the first that the Gofer waits for. nvproxyFD int // usernsFD is a file descriptor that is used to wait until // user namespace ID mappings are established in the Gofer's userns. // If this is set, this FD is the second that the Gofer waits for. usernsFD int // procMountFD is a file descriptor that has to be closed when the // procfs mount isn't needed anymore. It is read by the procfs unmounter // process. // If this is set, this FD is the last that the Gofer interacts with and // closes. procMountFD int } // Gofer implements subcommands.Command for the "gofer" command, which starts a // filesystem gofer. This command should not be called directly. 
type Gofer struct { bundleDir string ioFDs intFlags devIoFD int applyCaps bool setUpRoot bool mountConfs boot.GoferMountConfFlags specFD int mountsFD int profileFDs profile.FDArgs syncFDs goferSyncFDs stopProfiling func() } // Name implements subcommands.Command. func (*Gofer) Name() string { return "gofer" } // Synopsis implements subcommands.Command. func (g *Gofer) Synopsis() string { return fmt.Sprintf("launch a gofer process that proxies access to container files") } // Usage implements subcommands.Command. func (*Gofer) Usage() string { return `gofer [flags]` } // SetFlags implements subcommands.Command. func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process") // Open FDs that are donated to the gofer. f.Var(&g.ioFDs, "io-fds", "list of FDs to connect gofer servers. Follows the same order as --gofer-mount-confs. FDs are only donated if the mount is backed by lisafs.") f.Var(&g.mountConfs, "gofer-mount-confs", "information about how the gofer mounts have been configured. They must follow this order: root first, then mounts as defined in the spec.") f.IntVar(&g.devIoFD, "dev-io-fd", -1, "optional FD to connect /dev gofer server") f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec") f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).") // Add synchronization FD flags. g.syncFDs.setFlags(f) // Profiling flags. g.profileFDs.SetFromFlags(f) } // Execute implements subcommands.Command. func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if g.bundleDir == "" || len(g.ioFDs) < 1 || g.specFD < 0 { f.Usage() return subcommands.ExitUsageError } conf := args[0].(*config.Config) // Set traceback level debug.SetTraceback(conf.Traceback) specFile := os.NewFile(uintptr(g.specFD), "spec file") defer specFile.Close() spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf) if err != nil { util.Fatalf("reading spec: %v", err) } g.syncFDs.syncNVProxy() g.syncFDs.syncUsernsForRootless() if g.setUpRoot { if err := g.setupRootFS(spec, conf); err != nil { util.Fatalf("Error setting up root FS: %v", err) } if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { cleanupUnmounter := g.syncFDs.spawnProcUnmounter() defer cleanupUnmounter() } } if g.applyCaps { overrides := g.syncFDs.flags() overrides["apply-caps"] = "false" overrides["setup-root"] = "false" args := prepareArgs(g.Name(), f, overrides) util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, goferCaps, setCapsAndCallSelf(args, goferCaps)) panic("unreachable") } // Start profiling. This will be a noop if no profiling arguments were passed. profileOpts := g.profileFDs.ToOpts() g.stopProfiling = profile.Start(profileOpts) // At this point we won't re-execute, so it's safe to limit via rlimits. Any // limit >= 0 works. If the limit is lower than the current number of open // files, then Setrlimit will succeed, and the next open will fail. 
if conf.FDLimit > -1 { rlimit := unix.Rlimit{ Cur: uint64(conf.FDLimit), Max: uint64(conf.FDLimit), } switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err { case nil: case unix.EPERM: log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit) default: util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err) } } // Find what path is going to be served by this gofer. root := spec.Root.Path if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { root = "/root" } // Resolve mount points paths, then replace mounts from our spec and send the // mount list over to the sandbox, so they are both in sync. // // Note that all mount points have been mounted in the proper location in // setupRootFS(). cleanMounts, err := g.resolveMounts(conf, spec.Mounts, root) if err != nil { util.Fatalf("Failure to resolve mounts: %v", err) } spec.Mounts = cleanMounts go func() { if err := g.writeMounts(cleanMounts); err != nil { panic(fmt.Sprintf("Failed to write mounts: %v", err)) } }() specutils.LogSpecDebug(spec, conf.OCISeccomp) // fsgofer should run with a umask of 0, because we want to preserve file // modes exactly as sent by the sandbox, which will have applied its own umask. unix.Umask(0) procFDPath := procFDBindMount if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { procFDPath = "/proc/self/fd" } if err := fsgofer.OpenProcSelfFD(procFDPath); err != nil { util.Fatalf("failed to open /proc/self/fd: %v", err) } // procfs isn't needed anymore. g.syncFDs.unmountProcfs() if err := unix.Chroot(root); err != nil { util.Fatalf("failed to chroot to %q: %v", root, err) } if err := unix.Chdir("/"); err != nil { util.Fatalf("changing working dir: %v", err) } log.Infof("Process chroot'd to %q", root) // Initialize filters. opts := filter.Options{ UDSOpenEnabled: conf.GetHostUDS().AllowOpen(), UDSCreateEnabled: conf.GetHostUDS().AllowCreate(), ProfileEnabled: len(profileOpts) > 0, DirectFS: conf.DirectFS, } if err := filter.Install(opts); err != nil { util.Fatalf("installing seccomp filters: %v", err) } return g.serve(spec, conf, root) } func newSocket(ioFD int) *unet.Socket { socket, err := unet.NewSocket(ioFD) if err != nil { util.Fatalf("creating server on FD %d: %v", ioFD, err) } return socket } func (g *Gofer) serve(spec *specs.Spec, conf *config.Config, root string) subcommands.ExitStatus { type connectionConfig struct { sock *unet.Socket mountPath string readonly bool } cfgs := make([]connectionConfig, 0, len(spec.Mounts)+1) server := fsgofer.NewLisafsServer(fsgofer.Config{ // These are global options. Ignore readonly configuration, that is set on // a per connection basis. HostUDS: conf.GetHostUDS(), HostFifo: conf.HostFifo, DonateMountPointFD: conf.DirectFS, }) ioFDs := g.ioFDs rootfsConf := g.mountConfs[0] if rootfsConf.ShouldUseLisafs() { // Start with root mount, then add any other additional mount as needed. cfgs = append(cfgs, connectionConfig{ sock: newSocket(ioFDs[0]), mountPath: "/", // fsgofer process is always chroot()ed. So serve root. 
readonly: spec.Root.Readonly || rootfsConf.ShouldUseOverlayfs(), }) log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, ioFDs[0], cfgs[0].readonly) ioFDs = ioFDs[1:] } mountIdx := 1 // first one is the root for _, m := range spec.Mounts { if !specutils.IsGoferMount(m) { continue } mountConf := g.mountConfs[mountIdx] mountIdx++ if !mountConf.ShouldUseLisafs() { continue } if !filepath.IsAbs(m.Destination) { util.Fatalf("mount destination must be absolute: %q", m.Destination) } if len(ioFDs) == 0 { util.Fatalf("no FD found for mount. Did you forget --io-fd? FDs: %d, Mount: %+v", len(g.ioFDs), m) } ioFD := ioFDs[0] ioFDs = ioFDs[1:] readonly := specutils.IsReadonlyMount(m.Options) || mountConf.ShouldUseOverlayfs() cfgs = append(cfgs, connectionConfig{ sock: newSocket(ioFD), mountPath: m.Destination, readonly: readonly, }) log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, ioFD, readonly) } if len(ioFDs) > 0 { util.Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", len(cfgs), len(g.ioFDs)) } if g.devIoFD >= 0 { cfgs = append(cfgs, connectionConfig{ sock: newSocket(g.devIoFD), mountPath: "/dev", }) log.Infof("Serving /dev mapped on FD %d (ro: false)", g.devIoFD) } for _, cfg := range cfgs { conn, err := server.CreateConnection(cfg.sock, cfg.mountPath, cfg.readonly) if err != nil { util.Fatalf("starting connection on FD %d for gofer mount failed: %v", cfg.sock.FD(), err) } server.StartConnection(conn) } server.Wait() server.Destroy() log.Infof("All lisafs servers exited.") if g.stopProfiling != nil { g.stopProfiling() } return subcommands.ExitSuccess } func (g *Gofer) writeMounts(mounts []specs.Mount) error { bytes, err := json.Marshal(mounts) if err != nil { return err } f := os.NewFile(uintptr(g.mountsFD), "mounts file") defer f.Close() for written := 0; written < len(bytes); { w, err := f.Write(bytes[written:]) if err != nil { return err } written += w } return nil } // Redhat distros don't allow to create bind-mounts in /proc/self directories. // It is protected by selinux rules. const procFDBindMount = "/proc/fs" func (g *Gofer) setupRootFS(spec *specs.Spec, conf *config.Config) error { // Convert all shared mounts into slaves to be sure that nothing will be // propagated outside of our namespace. procPath := "/proc" if err := specutils.SafeMount("", "/", "", unix.MS_SLAVE|unix.MS_REC, "", procPath); err != nil { util.Fatalf("error converting mounts: %v", err) } root := spec.Root.Path if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { // runsc can't be re-executed without /proc, so we create a tmpfs mount, // mount ./proc and ./root there, then move this mount to the root and after // setCapsAndCallSelf, runsc will chroot into /root. // // We need a directory to construct a new root and we know that // runsc can't start without /proc, so we can use it for this. flags := uintptr(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC) if err := specutils.SafeMount("runsc-root", "/proc/fs", "tmpfs", flags, "", procPath); err != nil { util.Fatalf("error mounting tmpfs: %v", err) } if err := unix.Mount("", "/proc/fs", "", unix.MS_UNBINDABLE, ""); err != nil { util.Fatalf("error setting MS_UNBINDABLE") } // Prepare tree structure for pivot_root(2). 
if err := os.Mkdir("/proc/fs/proc", 0755); err != nil { util.Fatalf("error creating /proc/fs/proc: %v", err) } if err := os.Mkdir("/proc/fs/root", 0755); err != nil { util.Fatalf("error creating /proc/fs/root: %v", err) } if err := os.Mkdir("/proc/fs/etc", 0755); err != nil { util.Fatalf("error creating /proc/fs/etc: %v", err) } // This cannot use SafeMount because there's no available procfs. But we // know that /proc/fs is an empty tmpfs mount, so this is safe. if err := unix.Mount("/proc", "/proc/fs/proc", "", flags|unix.MS_RDONLY|unix.MS_BIND|unix.MS_REC, ""); err != nil { util.Fatalf("error mounting /proc/fs/proc: %v", err) } // self/fd is bind-mounted, so that the FD return by // OpenProcSelfFD() does not allow escapes with walking ".." . if err := unix.Mount("/proc/fs/proc/self/fd", "/proc/fs/"+procFDBindMount, "", unix.MS_RDONLY|unix.MS_BIND|flags, ""); err != nil { util.Fatalf("error mounting proc/self/fd: %v", err) } if err := copyFile("/proc/fs/etc/localtime", "/etc/localtime"); err != nil { log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err) } root = "/proc/fs/root" procPath = "/proc/fs/proc" } rootfsConf := g.mountConfs[0] if rootfsConf.ShouldUseLisafs() { // Mount root path followed by submounts. if err := specutils.SafeMount(spec.Root.Path, root, "bind", unix.MS_BIND|unix.MS_REC, "", procPath); err != nil { return fmt.Errorf("mounting root on root (%q) err: %v", root, err) } flags := uint32(unix.MS_SLAVE | unix.MS_REC) if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation}) } if err := specutils.SafeMount("", root, "", uintptr(flags), "", procPath); err != nil { return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err) } } // Replace the current spec, with the clean spec with symlinks resolved. if err := g.setupMounts(conf, spec.Mounts, root, procPath); err != nil { util.Fatalf("error setting up FS: %v", err) } // Set up /dev directory is needed. if g.devIoFD >= 0 { g.setupDev(spec, conf, root, procPath) } // Create working directory if needed. if spec.Process.Cwd != "" { dst, err := resolveSymlinks(root, spec.Process.Cwd) if err != nil { return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err) } log.Infof("Create working directory %q if needed", spec.Process.Cwd) if err := os.MkdirAll(dst, 0755); err != nil { return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err) } } // Check if root needs to be remounted as readonly. if rootfsConf.ShouldUseLisafs() && (spec.Root.Readonly || rootfsConf.ShouldUseOverlayfs()) { // If root is a mount point but not read-only, we can change mount options // to make it read-only for extra safety. // unix.MS_NOSUID and unix.MS_NODEV are included here not only // for safety reasons but also because they can be locked and // any attempts to unset them will fail. See // mount_namespaces(7) for more details. 
log.Infof("Remounting root as readonly: %q", root) flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY | unix.MS_NOSUID | unix.MS_NODEV) if err := specutils.SafeMount(root, root, "bind", flags, "", procPath); err != nil { return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err) } } if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { if err := pivotRoot("/proc/fs"); err != nil { util.Fatalf("failed to change the root file system: %v", err) } if err := os.Chdir("/"); err != nil { util.Fatalf("failed to change working directory") } } return nil } // setupMounts bind mounts all mounts specified in the spec in their correct // location inside root. It will resolve relative paths and symlinks. It also // creates directories as needed. func (g *Gofer) setupMounts(conf *config.Config, mounts []specs.Mount, root, procPath string) error { mountIdx := 1 // First index is for rootfs. for _, m := range mounts { if !specutils.IsGoferMount(m) { continue } mountConf := g.mountConfs[mountIdx] mountIdx++ if !mountConf.ShouldUseLisafs() { continue } dst, err := resolveSymlinks(root, m.Destination) if err != nil { return fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err) } flags := specutils.OptionsToFlags(m.Options) | unix.MS_BIND if mountConf.ShouldUseOverlayfs() { // Force mount read-only if writes are not going to be sent to it. flags |= unix.MS_RDONLY } log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags) if err := specutils.SafeSetupAndMount(m.Source, dst, m.Type, flags, procPath); err != nil { return fmt.Errorf("mounting %+v: %v", m, err) } // Set propagation options that cannot be set together with other options. flags = specutils.PropOptionsToFlags(m.Options) if flags != 0 { if err := specutils.SafeMount("", dst, "", uintptr(flags), "", procPath); err != nil { return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err) } } } return nil } // shouldExposeNvidiaDevice returns true if path refers to an Nvidia device // which should be exposed to the container. // // Precondition: nvproxy is enabled. func shouldExposeNvidiaDevice(path string) bool { if !strings.HasPrefix(path, "/dev/nvidia") { return false } if path == "/dev/nvidiactl" || path == "/dev/nvidia-uvm" { return true } nvidiaDevPathReg := regexp.MustCompile(`^/dev/nvidia(\d+)$`) return nvidiaDevPathReg.MatchString(path) } // shouldExposeVfioDevice returns true if path refers to an VFIO device // which shuold be exposed to the container. func shouldExposeVFIODevice(path string) bool { return strings.HasPrefix(path, filepath.Dir(tpuproxy.VFIOPath)) } // shouldExposeTpuDevice returns true if path refers to a TPU device which // should be exposed to the container. // // Precondition: tpuproxy is enabled. func shouldExposeTpuDevice(path string) bool { _, valid, _ := util.ExtractTPUDeviceMinor(path) return valid || shouldExposeVFIODevice(path) } func (g *Gofer) setupDev(spec *specs.Spec, conf *config.Config, root, procPath string) error { if err := os.MkdirAll(filepath.Join(root, "dev"), 0777); err != nil { return fmt.Errorf("creating dev directory: %v", err) } // Mount any devices specified in the spec. 
if spec.Linux == nil { return nil } nvproxyEnabled := specutils.NVProxyEnabled(spec, conf) tpuproxyEnabled := specutils.TPUProxyIsEnabled(spec, conf) for _, dev := range spec.Linux.Devices { shouldMount := (nvproxyEnabled && shouldExposeNvidiaDevice(dev.Path)) || (tpuproxyEnabled && shouldExposeTpuDevice(dev.Path)) if !shouldMount { continue } dst := filepath.Join(root, dev.Path) log.Infof("Mounting device %q as bind mount at %q", dev.Path, dst) if err := specutils.SafeSetupAndMount(dev.Path, dst, "bind", unix.MS_BIND, procPath); err != nil { return fmt.Errorf("mounting %q: %v", dev.Path, err) } } return nil } // resolveMounts resolved relative paths and symlinks to mount points. // // Note: mount points must already be in place for resolution to work. // Otherwise, it may follow symlinks to locations that would be overwritten // with another mount point and return the wrong location. In short, make sure // setupMounts() has been called before. func (g *Gofer) resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) { mountIdx := 1 // First index is for rootfs. cleanMounts := make([]specs.Mount, 0, len(mounts)) for _, m := range mounts { if !specutils.IsGoferMount(m) { cleanMounts = append(cleanMounts, m) continue } mountConf := g.mountConfs[mountIdx] mountIdx++ if !mountConf.ShouldUseLisafs() { cleanMounts = append(cleanMounts, m) continue } dst, err := resolveSymlinks(root, m.Destination) if err != nil { return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err) } relDst, err := filepath.Rel(root, dst) if err != nil { panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err)) } opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options) if err != nil { return nil, err } cpy := m cpy.Destination = filepath.Join("/", relDst) cpy.Options = opts cleanMounts = append(cleanMounts, cpy) } return cleanMounts, nil } // ResolveSymlinks walks 'rel' having 'root' as the root directory. If there are // symlinks, they are evaluated relative to 'root' to ensure the end result is // the same as if the process was running inside the container. func resolveSymlinks(root, rel string) (string, error) { return resolveSymlinksImpl(root, root, rel, 255) } func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) { if followCount == 0 { return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel)) } rel = filepath.Clean(rel) for _, name := range strings.Split(rel, string(filepath.Separator)) { if name == "" { continue } // Note that Join() resolves things like ".." and returns a clean path. path := filepath.Join(base, name) if !strings.HasPrefix(path, root) { // One cannot '..' their way out of root. base = root continue } fi, err := os.Lstat(path) if err != nil { if !os.IsNotExist(err) { return "", err } // Not found means there is no symlink to check. Just keep walking dirs. base = path continue } if fi.Mode()&os.ModeSymlink != 0 { link, err := os.Readlink(path) if err != nil { return "", err } if filepath.IsAbs(link) { base = root } base, err = resolveSymlinksImpl(root, base, link, followCount-1) if err != nil { return "", err } continue } base = path } return base, nil } // adjustMountOptions adds filesystem-specific gofer mount options. 
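// Editor's note (illustrative, not part of the original source): the
// resolveSymlinks helper above never escapes the supplied root, even when the
// relative path tries to walk out of it. For a hypothetical root of
// "/var/lib/runsc/root":
//
//	dst, _ := resolveSymlinks("/var/lib/runsc/root", "../../etc/passwd")
//	// dst == "/var/lib/runsc/root/etc/passwd" (".." components are clamped to root)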
func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) { rv := make([]string, len(opts)) copy(rv, opts) statfs := unix.Statfs_t{} if err := unix.Statfs(path, &statfs); err != nil { return nil, err } switch statfs.Type { case unix.OVERLAYFS_SUPER_MAGIC: rv = append(rv, "overlayfs_stale_read") case unix.NFS_SUPER_MAGIC: // The gofer client implements remote file handle sharing for performance. // However, remote filesystems like NFS rely on close(2) syscall for // flushing file data to the server. Such handle sharing prevents the // application's close(2) syscall from being propagated to the host. Hence // disable file handle sharing, so NFS files are flushed correctly. rv = append(rv, "disable_file_handle_sharing") } return rv, nil } // setFlags sets sync FD flags on the given FlagSet. func (g *goferSyncFDs) setFlags(f *flag.FlagSet) { f.IntVar(&g.nvproxyFD, "sync-nvproxy-fd", -1, "file descriptor that the gofer waits on until nvproxy setup is done") f.IntVar(&g.usernsFD, "sync-userns-fd", -1, "file descriptor the gofer waits on until userns mappings are set up") f.IntVar(&g.procMountFD, "proc-mount-sync-fd", -1, "file descriptor that the gofer writes to when /proc isn't needed anymore and can be unmounted") } // flags returns the flags necessary to pass along the current sync FD values // to a re-executed version of this process. func (g *goferSyncFDs) flags() map[string]string { return map[string]string{ "sync-nvproxy-fd": fmt.Sprintf("%d", g.nvproxyFD), "sync-userns-fd": fmt.Sprintf("%d", g.usernsFD), "proc-mount-sync-fd": fmt.Sprintf("%d", g.procMountFD), } } // waitForFD waits for the other end of a given FD to be closed. // `fd` is closed unconditionally after that. // This should only be called for actual FDs (i.e. `fd` >= 0). func waitForFD(fd int, fdName string) error { log.Debugf("Waiting on %s %d...", fdName, fd) f := os.NewFile(uintptr(fd), fdName) defer f.Close() var b [1]byte if n, err := f.Read(b[:]); n != 0 || err != io.EOF { return fmt.Errorf("failed to sync on %s: %v: %v", fdName, n, err) } log.Debugf("Synced on %s %d.", fdName, fd) return nil } // spawnProcMounter executes the /proc unmounter process. // It returns a function to wait on the proc unmounter process, which // should be called (via defer) in case of errors in order to clean up the // unmounter process properly. // When procfs is no longer needed, `unmountProcfs` should be called. func (g *goferSyncFDs) spawnProcUnmounter() func() { if g.procMountFD != -1 { util.Fatalf("procMountFD is set") } // /proc is umounted from a forked process, because the // current one may re-execute itself without capabilities. cmd, w := execProcUmounter() // Clear FD_CLOEXEC. This process may be re-executed. procMountFD // should remain open. if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 { util.Fatalf("error clearing CLOEXEC: %v", errno) } g.procMountFD = int(w.Fd()) return func() { g.procMountFD = -1 w.Close() cmd.Wait() } } // unmountProcfs signals the proc unmounter process that procfs is no longer // needed. func (g *goferSyncFDs) unmountProcfs() { if g.procMountFD < 0 { return } umountProc(g.procMountFD) g.procMountFD = -1 } // syncUsernsForRootless waits on usernsFD to be closed and then sets // UID/GID to 0. Note that this function calls runtime.LockOSThread(). // This function is a no-op if usernsFD is -1. // // Postcondition: All callers must re-exec themselves after this returns, // unless usernsFD was -1. 
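// Editor's note (illustrative sketch, not part of the original source): the
// sync FDs above follow a close-to-signal convention: the parent keeps one
// end of a pipe, donates the other end as the sync FD, and signals the gofer
// simply by closing its end, which makes waitForFD observe EOF. A minimal
// sketch of the parent side, assuming the read end is donated:
//
//	r, w, _ := os.Pipe()
//	// ... start the gofer with r donated as, e.g., --sync-userns-fd ...
//	r.Close() // the parent does not need the donated end
//	// ... establish the userns ID mappings ...
//	w.Close() // EOF on the gofer side: waitForFD returns and startup proceeds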
func (g *goferSyncFDs) syncUsernsForRootless() { if g.usernsFD < 0 { return } syncUsernsForRootless(g.usernsFD) g.usernsFD = -1 } // syncUsernsForRootless waits on usernsFD to be closed and then sets // UID/GID to 0. Note that this function calls runtime.LockOSThread(). // // Postcondition: All callers must re-exec themselves after this returns. func syncUsernsForRootless(fd int) { if err := waitForFD(fd, "userns sync FD"); err != nil { util.Fatalf("failed to sync on userns FD: %v", err) } // SETUID changes UID on the current system thread, so we have // to re-execute current binary. runtime.LockOSThread() if _, _, errno := unix.RawSyscall(unix.SYS_SETUID, 0, 0, 0); errno != 0 { util.Fatalf("failed to set UID: %v", errno) } if _, _, errno := unix.RawSyscall(unix.SYS_SETGID, 0, 0, 0); errno != 0 { util.Fatalf("failed to set GID: %v", errno) } } // syncNVProxy waits on nvproxyFD to be closed. // Used for synchronization during nvproxy setup which is done from the // non-gofer process. // This function is a no-op if nvProxySyncFD is -1. func (g *goferSyncFDs) syncNVProxy() { if g.nvproxyFD < 0 { return } if err := waitForFD(g.nvproxyFD, "nvproxy sync FD"); err != nil { util.Fatalf("failed to sync on NVProxy FD: %v", err) } g.nvproxyFD = -1 } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/help.go000066400000000000000000000072351465435605700220260ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "fmt" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/flag" ) // NewHelp returns a help command for the given commander. func NewHelp(cdr *subcommands.Commander) *Help { return &Help{ cdr: cdr, } } // Help implements subcommands.Command for the "help" command. The 'help' // command prints help for commands registered to a Commander but also allows for // registering additional help commands that print other documentation. type Help struct { cdr *subcommands.Commander commands []subcommands.Command help bool } // Name implements subcommands.Command.Name. func (*Help) Name() string { return "help" } // Synopsis implements subcommands.Command.Synopsis. func (*Help) Synopsis() string { return "Print help documentation." } // Usage implements subcommands.Command.Usage. func (*Help) Usage() string { return `help []: With an argument, prints detailed information on the use of the specified topic or subcommand. With no argument, print a list of all commands and a brief description of each. ` } // SetFlags implements subcommands.Command.SetFlags. func (h *Help) SetFlags(*flag.FlagSet) {} // Execute implements subcommands.Command.Execute. func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { switch f.NArg() { case 0: fmt.Fprintf(h.cdr.Output, "Usage: %s \n\n", h.cdr.Name()) fmt.Fprintf(h.cdr.Output, `runsc is the gVisor container runtime. Functionality is provided by subcommands. For help with a specific subcommand, use "%s %s ". 
`, h.cdr.Name(), h.Name()) h.cdr.VisitGroups(func(g *subcommands.CommandGroup) { h.cdr.ExplainGroup(h.cdr.Output, g) }) fmt.Fprintf(h.cdr.Output, "Additional help topics (Use \"%s %s \" to see help on the topic):\n", h.cdr.Name(), h.Name()) for _, cmd := range h.commands { fmt.Fprintf(h.cdr.Output, "\t%-15s %s\n", cmd.Name(), cmd.Synopsis()) } fmt.Fprintf(h.cdr.Output, "\nUse \"%s flags\" for a list of top-level flags\n", h.cdr.Name()) return subcommands.ExitSuccess default: // Look for commands registered to the commander and print help explanation if found. found := false h.cdr.VisitCommands(func(g *subcommands.CommandGroup, cmd subcommands.Command) { if f.Arg(0) == cmd.Name() { h.cdr.ExplainCommand(h.cdr.Output, cmd) found = true } }) if found { return subcommands.ExitSuccess } // Next check commands registered to the help command. for _, cmd := range h.commands { if f.Arg(0) == cmd.Name() { fs := flag.NewFlagSet(f.Arg(0), flag.ContinueOnError) fs.Usage = func() { h.cdr.ExplainCommand(h.cdr.Error, cmd) } cmd.SetFlags(fs) if fs.Parse(f.Args()[1:]) != nil { return subcommands.ExitUsageError } return cmd.Execute(ctx, f, args...) } } fmt.Fprintf(h.cdr.Error, "Subcommand %s not understood\n", f.Arg(0)) } f.Usage() return subcommands.ExitUsageError } // Register registers a new help command. func (h *Help) Register(cmd subcommands.Command) { h.commands = append(h.commands, cmd) } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/install.go000066400000000000000000000204101465435605700225320ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "encoding/json" "fmt" "io/ioutil" "log" "os" "path" "regexp" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/flag" ) // Install implements subcommands.Command. type Install struct { ConfigFile string Runtime string Experimental bool Clobber bool CgroupDriver string executablePath string runtimeArgs []string } // Name implements subcommands.Command.Name. func (*Install) Name() string { return "install" } // Synopsis implements subcommands.Command.Synopsis. func (*Install) Synopsis() string { return "adds a runtime to docker daemon configuration" } // Usage implements subcommands.Command.Usage. func (*Install) Usage() string { return `install [--runtime=] [flags] [-- [args...]] -- if provided, args are passed to the runtime ` } // SetFlags implements subcommands.Command.SetFlags. func (i *Install) SetFlags(fs *flag.FlagSet) { fs.StringVar(&i.ConfigFile, "config_file", "/etc/docker/daemon.json", "path to Docker daemon config file") fs.StringVar(&i.Runtime, "runtime", "runsc", "runtime name") fs.BoolVar(&i.Experimental, "experimental", false, "enable/disable experimental features") fs.BoolVar(&i.Clobber, "clobber", true, "clobber existing runtime configuration") fs.StringVar(&i.CgroupDriver, "cgroupdriver", "", "docker cgroup driver") } // Execute implements subcommands.Command.Execute. 
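// Illustrative example (flags, path, and resulting file content are
// hypothetical, not taken from the original source): running
//
//	runsc install --runtime=runsc -- --platform=kvm
//
// typically leaves an entry shaped like the following in the Docker daemon
// configuration, matching the "path"/"runtimeArgs" keys written below:
//
//	{
//	  "runtimes": {
//	    "runsc": {
//	      "path": "/usr/local/bin/runsc",
//	      "runtimeArgs": ["--platform=kvm"]
//	    }
//	  }
//	}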
func (i *Install) Execute(_ context.Context, f *flag.FlagSet, _ ...any) subcommands.ExitStatus { // Grab the name and arguments. i.runtimeArgs = f.Args() testFlags := flag.NewFlagSet("test", flag.ContinueOnError) config.RegisterFlags(testFlags) testFlags.Parse(i.runtimeArgs) conf, err := config.NewFromFlags(testFlags) if err != nil { log.Fatalf("invalid runtime arguments: %v", err) } // Check the platform. p, err := platform.Lookup(conf.Platform) if err != nil { log.Fatalf("invalid platform: %v", err) } deviceFile, err := p.OpenDevice(conf.PlatformDevicePath) if err != nil { log.Printf("WARNING: unable to open platform, runsc may fail to start: %v", err) } if deviceFile != nil { deviceFile.Close() } // Extract the executable. path, err := os.Executable() if err != nil { log.Fatalf("Error reading current executable: %v", err) } i.executablePath = path installRW := configReaderWriter{ read: defaultReadConfig, write: defaultWriteConfig, } if err := doInstallConfig(i, installRW); err != nil { log.Fatalf("Install failed: %v", err) } // Success. log.Print("Successfully updated config.") return subcommands.ExitSuccess } func doInstallConfig(i *Install, rw configReaderWriter) error { // Load the configuration file. configBytes, err := rw.read(i.ConfigFile) if err != nil { return fmt.Errorf("error reading config file %q: %v", i.ConfigFile, err) } // Unmarshal the configuration. c := make(map[string]any) if len(configBytes) > 0 { if err := json.Unmarshal(configBytes, &c); err != nil { return err } } // Add the given runtime. var rts map[string]any if i, ok := c["runtimes"]; ok { rts = i.(map[string]any) } else { rts = make(map[string]any) c["runtimes"] = rts } updateRuntime := func() { rts[i.Runtime] = struct { Path string `json:"path,omitempty"` RuntimeArgs []string `json:"runtimeArgs,omitempty"` }{ Path: i.executablePath, RuntimeArgs: i.runtimeArgs, } } _, ok := rts[i.Runtime] switch { case !ok: log.Printf("Runtime %s not found: adding\n", i.Runtime) updateRuntime() case i.Clobber: log.Printf("Clobber is set. Overwriting runtime %s\n", i.Runtime) updateRuntime() default: log.Printf("Not overwriting runtime %s\n", i.Runtime) } // Set experimental if required. if i.Experimental { c["experimental"] = true } re := regexp.MustCompile(`^native.cgroupdriver=`) // Set the cgroupdriver if required. if i.CgroupDriver != "" { v, ok := c["exec-opts"] if !ok { c["exec-opts"] = []string{fmt.Sprintf("native.cgroupdriver=%s", i.CgroupDriver)} } else { opts := v.([]any) newOpts := []any{} for _, opt := range opts { if !i.Clobber { newOpts = opts break } o, ok := opt.(string) if !ok { continue } if !re.MatchString(o) { newOpts = append(newOpts, o) } } c["exec-opts"] = append(newOpts, fmt.Sprintf("native.cgroupdriver=%s", i.CgroupDriver)) } } // Write out the runtime. if err := rw.write(c, i.ConfigFile); err != nil { return fmt.Errorf("error writing config file %q: %v", i.ConfigFile, err) } return nil } // Uninstall implements subcommands.Command. type Uninstall struct { ConfigFile string Runtime string } // Name implements subcommands.Command.Name. func (*Uninstall) Name() string { return "uninstall" } // Synopsis implements subcommands.Command.Synopsis. func (*Uninstall) Synopsis() string { return "removes a runtime from docker daemon configuration" } // Usage implements subcommands.Command.Usage. func (*Uninstall) Usage() string { return `uninstall [flags] ` } // SetFlags implements subcommands.Command.SetFlags.
func (u *Uninstall) SetFlags(fs *flag.FlagSet) { fs.StringVar(&u.ConfigFile, "config_file", "/etc/docker/daemon.json", "path to Docker daemon config file") fs.StringVar(&u.Runtime, "runtime", "runsc", "runtime name") } // Execute implements subcommands.Command.Execute. func (u *Uninstall) Execute(context.Context, *flag.FlagSet, ...any) subcommands.ExitStatus { log.Printf("Removing runtime %q from %q.", u.Runtime, u.ConfigFile) if err := doUninstallConfig(u, configReaderWriter{ read: defaultReadConfig, write: defaultWriteConfig, }); err != nil { log.Fatalf("Uninstall failed: %v", err) } return subcommands.ExitSuccess } func doUninstallConfig(u *Uninstall, rw configReaderWriter) error { configBytes, err := rw.read(u.ConfigFile) if err != nil { return fmt.Errorf("error reading config file %q: %v", u.ConfigFile, err) } // Unmarshal the configuration. c := make(map[string]any) if len(configBytes) > 0 { if err := json.Unmarshal(configBytes, &c); err != nil { return err } } var rts map[string]any if i, ok := c["runtimes"]; ok { rts = i.(map[string]any) } else { return fmt.Errorf("runtime %q not found", u.Runtime) } if _, ok := rts[u.Runtime]; !ok { return fmt.Errorf("runtime %q not found", u.Runtime) } delete(rts, u.Runtime) if err := rw.write(c, u.ConfigFile); err != nil { return fmt.Errorf("error writing config file %q: %v", u.ConfigFile, err) } return nil } type configReaderWriter struct { read func(string) ([]byte, error) write func(map[string]any, string) error } func defaultReadConfig(path string) ([]byte, error) { // Read the configuration data. configBytes, err := ioutil.ReadFile(path) if err != nil && !os.IsNotExist(err) { return nil, err } return configBytes, nil } func defaultWriteConfig(c map[string]any, filename string) error { // Marshal the configuration. b, err := json.MarshalIndent(c, "", " ") if err != nil { return err } // Copy the old configuration. old, err := ioutil.ReadFile(filename) if err != nil { if !os.IsNotExist(err) { return fmt.Errorf("error reading config file %q: %v", filename, err) } } else { if err := ioutil.WriteFile(filename+"~", old, 0644); err != nil { return fmt.Errorf("error backing up config file %q: %v", filename, err) } } // Make the necessary directories. if err := os.MkdirAll(path.Dir(filename), 0755); err != nil { return fmt.Errorf("error creating config directory for %q: %v", filename, err) } // Write the new configuration. if err := ioutil.WriteFile(filename, b, 0644); err != nil { return fmt.Errorf("error writing config file %q: %v", filename, err) } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/kill.go000066400000000000000000000076661465435605700220410ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package cmd import ( "context" "fmt" "strconv" "strings" "github.com/google/subcommands" "golang.org/x/sys/unix" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // Kill implements subcommands.Command for the "kill" command. type Kill struct { all bool pid int } // Name implements subcommands.Command.Name. func (*Kill) Name() string { return "kill" } // Synopsis implements subcommands.Command.Synopsis. func (*Kill) Synopsis() string { return "sends a signal to the container" } // Usage implements subcommands.Command.Usage. func (*Kill) Usage() string { return `kill [signal]` } // SetFlags implements subcommands.Command.SetFlags. func (k *Kill) SetFlags(f *flag.FlagSet) { f.BoolVar(&k.all, "all", false, "send the specified signal to all processes inside the container") f.IntVar(&k.pid, "pid", 0, "send the specified signal to a specific process. pid is relative to the root PID namespace") } // Execute implements subcommands.Command.Execute. func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() == 0 || f.NArg() > 2 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) if k.pid != 0 && k.all { util.Fatalf("it is invalid to specify both --all and --pid") } c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { util.Fatalf("loading container: %v", err) } // The OCI command-line spec says that the signal should be specified // via a flag, but runc (and things that call runc) pass it as an // argument. signal := f.Arg(1) if signal == "" { signal = "TERM" } sig, err := parseSignal(signal) if err != nil { util.Fatalf("%v", err) } if k.pid != 0 { if err := c.SignalProcess(sig, int32(k.pid)); err != nil { util.Fatalf("failed to signal pid %d: %v", k.pid, err) } } else { if err := c.SignalContainer(sig, k.all); err != nil { util.Fatalf("%v", err) } } return subcommands.ExitSuccess } func parseSignal(s string) (unix.Signal, error) { n, err := strconv.Atoi(s) if err == nil { sig := unix.Signal(n) for _, msig := range signalMap { if sig == msig { return sig, nil } } return -1, fmt.Errorf("unknown signal %q", s) } if sig, ok := signalMap[strings.TrimPrefix(strings.ToUpper(s), "SIG")]; ok { return sig, nil } return -1, fmt.Errorf("unknown signal %q", s) } var signalMap = map[string]unix.Signal{ "ABRT": unix.SIGABRT, "ALRM": unix.SIGALRM, "BUS": unix.SIGBUS, "CHLD": unix.SIGCHLD, "CLD": unix.SIGCLD, "CONT": unix.SIGCONT, "FPE": unix.SIGFPE, "HUP": unix.SIGHUP, "ILL": unix.SIGILL, "INT": unix.SIGINT, "IO": unix.SIGIO, "IOT": unix.SIGIOT, "KILL": unix.SIGKILL, "PIPE": unix.SIGPIPE, "POLL": unix.SIGPOLL, "PROF": unix.SIGPROF, "PWR": unix.SIGPWR, "QUIT": unix.SIGQUIT, "SEGV": unix.SIGSEGV, "STKFLT": unix.SIGSTKFLT, "STOP": unix.SIGSTOP, "SYS": unix.SIGSYS, "TERM": unix.SIGTERM, "TRAP": unix.SIGTRAP, "TSTP": unix.SIGTSTP, "TTIN": unix.SIGTTIN, "TTOU": unix.SIGTTOU, "URG": unix.SIGURG, "USR1": unix.SIGUSR1, "USR2": unix.SIGUSR2, "VTALRM": unix.SIGVTALRM, "WINCH": unix.SIGWINCH, "XCPU": unix.SIGXCPU, "XFSZ": unix.SIGXFSZ, } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/list.go000066400000000000000000000066731465435605700220560ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "encoding/json" "fmt" "io" "os" "text/tabwriter" "time" "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // List implements subcommands.Command for the "list" command. type List struct { quiet bool format string sandbox bool } // Name implements subcommands.command.name. func (*List) Name() string { return "list" } // Synopsis implements subcommands.Command.Synopsis. func (*List) Synopsis() string { return "list containers started by runsc with the given root" } // Usage implements subcommands.Command.Usage. func (*List) Usage() string { return `list [flags]` } // SetFlags implements subcommands.Command.SetFlags. func (l *List) SetFlags(f *flag.FlagSet) { f.BoolVar(&l.quiet, "quiet", false, "only list container ids") f.StringVar(&l.format, "format", "text", "output format: 'text' (default) or 'json'") f.BoolVar(&l.sandbox, "sandbox", false, "limit output to sandboxes only") } // Execute implements subcommands.Command.Execute. func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 0 { f.Usage() return subcommands.ExitUsageError } conf := args[0].(*config.Config) if err := l.execute(conf.RootDir, os.Stdout); err != nil { util.Fatalf("%v", err) } return subcommands.ExitSuccess } func (l *List) execute(rootDir string, out io.Writer) error { var ids []container.FullID var err error if l.sandbox { ids, err = container.ListSandboxes(rootDir) } else { ids, err = container.List(rootDir) } if err != nil { return err } if l.quiet { for _, id := range ids { fmt.Fprintln(out, id.ContainerID) } return nil } // Collect the containers. var containers []*container.Container for _, id := range ids { c, err := container.Load(rootDir, id, container.LoadOpts{Exact: true}) if err != nil { log.Warningf("Skipping container %q: %v", id, err) continue } containers = append(containers, c) } switch l.format { case "text": // Print a nice table. w := tabwriter.NewWriter(out, 12, 1, 3, ' ', 0) fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n") for _, c := range containers { fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n", c.ID, c.SandboxPid(), c.Status, c.BundleDir, c.CreatedAt.Format(time.RFC3339Nano), c.Owner) } _ = w.Flush() case "json": // Print just the states. var states []specs.State for _, c := range containers { states = append(states, c.State()) } if err := json.NewEncoder(out).Encode(states); err != nil { return fmt.Errorf("marshaling container state: %w", err) } default: return fmt.Errorf("unknown list format %q", l.format) } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/metric_export.go000066400000000000000000000066641465435605700237670ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "fmt" "os" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/prometheus" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/metricserver/containermetrics" ) // MetricExport implements subcommands.Command for the "metric-export" command. type MetricExport struct { exporterPrefix string sandboxMetricsFilter string } // Name implements subcommands.Command.Name. func (*MetricExport) Name() string { return "export-metrics" } // Synopsis implements subcommands.Command.Synopsis. func (*MetricExport) Synopsis() string { return "export metric data for the sandbox" } // Usage implements subcommands.Command.Usage. func (*MetricExport) Usage() string { return `export-metrics [-exporter-prefix=] - prints sandbox metric data in Prometheus metric format ` } // SetFlags implements subcommands.Command.SetFlags. func (m *MetricExport) SetFlags(f *flag.FlagSet) { f.StringVar(&m.exporterPrefix, "exporter-prefix", "runsc_", "Prefix for all metric names, following Prometheus exporter convention") f.StringVar(&m.sandboxMetricsFilter, "sandbox-metrics-filter", "", "If set, filter exported metrics using the specified regular expression. This filtering is applied before adding --exporter-prefix.") } // Execute implements subcommands.Command.Execute. func (m *MetricExport) Execute(ctx context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() < 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { util.Fatalf("loading container: %v", err) } prometheusLabels, err := containermetrics.SandboxPrometheusLabels(cont) if err != nil { util.Fatalf("Cannot compute Prometheus labels of sandbox: %v", err) } snapshot, err := cont.Sandbox.ExportMetrics(control.MetricsExportOpts{ OnlyMetrics: m.sandboxMetricsFilter, }) if err != nil { util.Fatalf("ExportMetrics failed: %v", err) } commentHeader := fmt.Sprintf("Command-line export for sandbox %s", cont.Sandbox.ID) if m.sandboxMetricsFilter != "" { commentHeader = fmt.Sprintf("%s (filtered using regular expression: %q)", commentHeader, m.sandboxMetricsFilter) } written, err := prometheus.Write(os.Stdout, prometheus.ExportOptions{ CommentHeader: commentHeader, }, map[*prometheus.Snapshot]prometheus.SnapshotExportOptions{ snapshot: { ExporterPrefix: m.exporterPrefix, ExtraLabels: prometheusLabels, }, }) if err != nil { util.Fatalf("Cannot write metrics to stdout: %v", err) } util.Infof("Wrote %d bytes of Prometheus metric data to stdout", written) return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/metric_metadata.go000066400000000000000000000037061465435605700242200ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "fmt" "github.com/google/subcommands" "google.golang.org/protobuf/encoding/prototext" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/flag" ) // MetricMetadata implements subcommands.Command for the "metric-metadata" command. type MetricMetadata struct { } // Name implements subcommands.Command.Name. func (*MetricMetadata) Name() string { return "metric-metadata" } // Synopsis implements subcommands.Command.Synopsis. func (*MetricMetadata) Synopsis() string { return "export metric metadata of metrics registered in this build, in text proto format" } // Usage implements subcommands.Command.Usage. func (*MetricMetadata) Usage() string { return "metric-metadata" } // SetFlags implements subcommands.Command.SetFlags. func (m *MetricMetadata) SetFlags(f *flag.FlagSet) { } // Execute implements subcommands.Command.Execute. func (m *MetricMetadata) Execute(ctx context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if err := metric.Initialize(); err != nil { util.Fatalf("Cannot initialize metrics: %v", err) } registration, err := metric.GetMetricRegistration() if err != nil { util.Fatalf("Cannot get metric registration data: %v", err) } fmt.Println(prototext.MarshalOptions{Multiline: true, EmitASCII: true}.Format(registration)) return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/metric_server.go000066400000000000000000000022471465435605700237450ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/cmd/metricserver/metricservercmd" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/flag" ) // MetricServer implements subcommands.Command for the "metric-server" command. type MetricServer struct { metricservercmd.Cmd } // Execute implements subcommands.Command.Execute. 
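// In builds compiled without metric-server support, this implementation is a
// stub: Execute below always aborts with a fatal error reporting that the
// subcommand is unavailable.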
func (m *MetricServer) Execute(ctx context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { util.Fatalf("this build does not support the metric-server subcommand") return subcommands.ExitFailure } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/metricserver/000077500000000000000000000000001465435605700232525ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/metricserver/metricservercmd/000077500000000000000000000000001465435605700264505ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/metricserver/metricservercmd/metricservercmd.go000066400000000000000000000042141465435605700321760ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package metricservercmd partially implements the 'metric-server' subcommand. package metricservercmd import ( "gvisor.dev/gvisor/runsc/flag" ) // Cmd partially implements subcommands.Command for the metric-server command. type Cmd struct { ExporterPrefix string PIDFile string ExposeProfileEndpoints bool AllowUnknownRoot bool } // Name implements subcommands.Command.Name. func (*Cmd) Name() string { return "metric-server" } // Synopsis implements subcommands.Command.Synopsis. func (*Cmd) Synopsis() string { return "implements Prometheus metrics HTTP endpoint" } // Usage implements subcommands.Command.Usage. func (*Cmd) Usage() string { return `-root= -metric-server= metric-server [-exporter-prefix=] ` } // SetFlags implements subcommands.Command.SetFlags. func (c *Cmd) SetFlags(f *flag.FlagSet) { f.StringVar(&c.ExporterPrefix, "exporter-prefix", "runsc_", "Prefix for all metric names, following Prometheus exporter convention") f.StringVar(&c.PIDFile, "pid-file", "", "If set, write the metric server's own PID to this file after binding to the --metric-server address. The parent directory of this file must already exist.") f.BoolVar(&c.ExposeProfileEndpoints, "allow-profiling", false, "If true, expose /runsc-metrics/profile-cpu and /runsc-metrics/profile-heap to get profiling data about the metric server") f.BoolVar(&c.AllowUnknownRoot, "allow-unknown-root", false, "if set, the metric server will keep running regardless of the existence of --root or the metric server's ability to access it.") } metricservercmd_state_autogen.go000066400000000000000000000001011465435605700350300ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/metricserver/metricservercmd// automatically generated by stateify. package metricservercmd golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/mitigate.go000066400000000000000000000130531465435605700226740ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "fmt" "io/ioutil" "os" "runtime" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/mitigate" ) const ( // cpuInfo is the path used to parse CPU info. cpuInfo = "/proc/cpuinfo" // Path to enable/disable SMT. smtPath = "/sys/devices/system/cpu/smt/control" ) // Mitigate implements subcommands.Command for the "mitigate" command. type Mitigate struct { // Run the command without changing the underlying system. dryRun bool // Reverse mitigate by turning on all CPU cores. reverse bool // Extra data for post mitigate operations. data string // Control to mitigate/reverse smt. control machineControl } // Name implements subcommands.command.name. func (*Mitigate) Name() string { return "mitigate" } // Synopsis implements subcommands.Command.Synopsis. func (*Mitigate) Synopsis() string { return "mitigate mitigates the underlying system against side channel attacks" } // Usage implements Usage for cmd.Mitigate. func (m *Mitigate) Usage() string { return fmt.Sprintf(`mitigate [flags] mitigate mitigates a system to the "MDS" vulnerability by writing "off" to %q. CPUs can be restored by writing "on" to the same file or rebooting your system. The command can be reversed with --reverse, which writes "on" to the file above.%s`, smtPath, m.usage()) } // SetFlags sets flags for the command Mitigate. func (m *Mitigate) SetFlags(f *flag.FlagSet) { f.BoolVar(&m.dryRun, "dryrun", false, "run the command without changing system") f.BoolVar(&m.reverse, "reverse", false, "reverse mitigate by enabling all CPUs") m.setFlags(f) } // Execute implements subcommands.Command.Execute. func (m *Mitigate) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if runtime.GOARCH == "arm64" || runtime.GOARCH == "arm" { log.Warningf("As ARM is not affected by MDS, mitigate does not support ARM machines.") // Set reverse flag so that we still perform post mitigate operations. mitigate reverse is a noop in this case. m.reverse = true } if f.NArg() != 0 { f.Usage() return subcommands.ExitUsageError } m.control = &machineControlImpl{} return m.execute() } // execute executes mitigate operations. Seperate from Execute method for // easier mocking. func (m *Mitigate) execute() subcommands.ExitStatus { beforeSet, err := m.control.getCPUs() if err != nil { return util.Errorf("Get before CPUSet failed: %v", err) } log.Infof("CPUs before: %s", beforeSet.String()) if err := m.doEnableDisable(beforeSet); err != nil { return util.Errorf("Enabled/Disable action failed on %q: %v", smtPath, err) } afterSet, err := m.control.getCPUs() if err != nil { return util.Errorf("Get after CPUSet failed: %v", err) } log.Infof("CPUs after: %s", afterSet.String()) if err = m.postMitigate(afterSet); err != nil { return util.Errorf("Post Mitigate failed: %v", err) } return subcommands.ExitSuccess } // doEnableDisable does either enable or disable operation based on flags. 
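// Summary of the decision logic below, for readability:
//
//	--reverse with --dryrun  -> log only, no change
//	--reverse                -> enable all CPUs (write "on" to smtPath)
//	--dryrun                 -> log only, no change
//	CPU set is vulnerable    -> disable SMT (write "off" to smtPath)
//	otherwise                -> no action needed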
func (m *Mitigate) doEnableDisable(set mitigate.CPUSet) error { if m.reverse { if m.dryRun { log.Infof("Skipping reverse action because dryrun is set.") return nil } return m.control.enable() } if m.dryRun { log.Infof("Skipping mitigate action because dryrun is set.") return nil } if set.IsVulnerable() { return m.control.disable() } log.Infof("CPUs not vulnerable. Skipping disable call.") return nil } // Interface to wrap interactions with underlying machine. Done // so testing with mocks can be done hermetically. type machineControl interface { enable() error disable() error isEnabled() (bool, error) getCPUs() (mitigate.CPUSet, error) } // Implementation of SMT control interaction with the underlying machine. type machineControlImpl struct{} func (*machineControlImpl) enable() error { return checkFileExistsOnWrite("enable", "on") } func (*machineControlImpl) disable() error { return checkFileExistsOnWrite("disable", "off") } // Writes data to SMT control. If file not found, logs file not exist error and returns nil // error, which is done because machines without the file pointed to by smtPath only have one // thread per core in the first place. Otherwise returns error from ioutil.WriteFile. func checkFileExistsOnWrite(op, data string) error { err := ioutil.WriteFile(smtPath, []byte(data), 0644) if err != nil && os.IsExist(err) { log.Infof("File %q does not exist for operation %s. This machine probably has no smt control.", smtPath, op) return nil } return err } func (*machineControlImpl) isEnabled() (bool, error) { data, err := ioutil.ReadFile(cpuInfo) return string(data) == "on", err } func (*machineControlImpl) getCPUs() (mitigate.CPUSet, error) { data, err := ioutil.ReadFile(cpuInfo) if err != nil { return nil, fmt.Errorf("failed to read %s: %w", cpuInfo, err) } set, err := mitigate.NewCPUSet(string(data)) if err != nil { return nil, fmt.Errorf("getCPUs: %v", err) } return set, nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/mitigate_extras.go000066400000000000000000000020401465435605700242540ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package cmd import ( "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/mitigate" ) // usage returns any extra bits of the usage string. func (m *Mitigate) usage() string { return "" } // setFlags sets extra flags for the command Mitigate. func (m *Mitigate) setFlags(f *flag.FlagSet) {} // postMitigate handles any postMitigate actions. func (m *Mitigate) postMitigate(_ mitigate.CPUSet) error { return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/nvproxy/000077500000000000000000000000001465435605700222655ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/nvproxy/list_supported_drivers.go000066400000000000000000000035441465435605700274400ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package nvproxy provides subcommands for the nvproxy command. package nvproxy import ( "context" "fmt" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/sentry/devices/nvproxy" "gvisor.dev/gvisor/runsc/flag" ) // listSupportedDrivers implements subcommands.Command for the "nvproxy list-supported-drivers" command. type listSupportedDrivers struct{} // Name implements subcommands.Command. func (*listSupportedDrivers) Name() string { return "list-supported-drivers" } // Synopsis implements subcommands.Command. func (*listSupportedDrivers) Synopsis() string { return "list all nvidia driver versions supported by nvproxy" } // Usage implements subcommands.Command. func (*listSupportedDrivers) Usage() string { return `list-supported-drivers - list all nvidia driver versions supported by nvproxy ` } // SetFlags implements subcommands.Command. func (*listSupportedDrivers) SetFlags(*flag.FlagSet) {} // Execute implements subcommands.Command. func (*listSupportedDrivers) Execute(ctx context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 0 { f.Usage() return subcommands.ExitUsageError } nvproxy.ForEachSupportDriver(func(version nvproxy.DriverVersion, _ string) { fmt.Println(version) }) return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/nvproxy/nvproxy.go000066400000000000000000000033101465435605700243360ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package nvproxy provides subcommands for the nvproxy command. package nvproxy import ( "bytes" "context" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/sentry/devices/nvproxy" "gvisor.dev/gvisor/runsc/flag" ) type Nvproxy struct{} func (*Nvproxy) Name() string { return "nvproxy" } func (*Nvproxy) Synopsis() string { return "shows information about nvproxy support" } func (*Nvproxy) Usage() string { buf := bytes.Buffer{} buf.WriteString("Usage: nvproxy \n\n") cdr := createCommander(&flag.FlagSet{}) cdr.VisitGroups(func(grp *subcommands.CommandGroup) { cdr.ExplainGroup(&buf, grp) }) return buf.String() } func (*Nvproxy) SetFlags(*flag.FlagSet) {} func (*Nvproxy) Execute(ctx context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { nvproxy.Init() return createCommander(f).Execute(ctx, args...) 
} func createCommander(f *flag.FlagSet) *subcommands.Commander { cdr := subcommands.NewCommander(f, "nvproxy") cdr.Register(cdr.HelpCommand(), "") cdr.Register(cdr.FlagsCommand(), "") cdr.Register(new(listSupportedDrivers), "") return cdr } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/nvproxy/nvproxy_state_autogen.go000066400000000000000000000000711465435605700272610ustar00rootroot00000000000000// automatically generated by stateify. package nvproxy golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/path.go000066400000000000000000000015631465435605700220300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "os" "gvisor.dev/gvisor/runsc/cmd/util" ) // getwdOrDie returns the current working directory and dies if it cannot. func getwdOrDie() string { wd, err := os.Getwd() if err != nil { util.Fatalf("getting current working directory: %v", err) } return wd } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/pause.go000066400000000000000000000036121465435605700222060ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // Pause implements subcommands.Command for the "pause" command. type Pause struct{} // Name implements subcommands.Command.Name. func (*Pause) Name() string { return "pause" } // Synopsis implements subcommands.Command.Synopsis. func (*Pause) Synopsis() string { return "pause suspends all processes in a container" } // Usage implements subcommands.Command.Usage. func (*Pause) Usage() string { return `pause - pause process in instance of container.` } // SetFlags implements subcommands.Command.SetFlags. func (*Pause) SetFlags(*flag.FlagSet) { } // Execute implements subcommands.Command.Execute. 
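// Illustrative invocation: `runsc pause <container id>` suspends every
// process in the container; the matching `resume` subcommand lifts the
// suspension.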
func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { util.Fatalf("loading container: %v", err) } if err := cont.Pause(); err != nil { util.Fatalf("pause failed: %v", err) } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/platforms.go000066400000000000000000000031011465435605700230710ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "fmt" "os" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/runsc/flag" ) // Platforms implements subcommands.Command for the "platforms" command. type Platforms struct{} // Name implements subcommands.Command.Name. func (*Platforms) Name() string { return "platforms" } // Synopsis implements subcommands.Command.Synopsis. func (*Platforms) Synopsis() string { return "Print a list of available platforms." } // Usage implements subcommands.Command.Usage. func (*Platforms) Usage() string { return `platforms [options] - Print available platforms. ` } // SetFlags implements subcommands.Command.SetFlags. func (*Platforms) SetFlags(f *flag.FlagSet) {} // Execute implements subcommands.Command.Execute. func (*Platforms) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { for _, p := range platform.List() { fmt.Fprintf(os.Stdout, "%s\n", p) } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/portforward.go000066400000000000000000000233151465435605700234440ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "fmt" "io" "io/ioutil" "math" "net" "os" "os/signal" "strconv" "strings" "sync" "syscall" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // PortForward implements subcommands.Command for the "portforward" command. type PortForward struct { portNum int stream string } // Name implements subcommands.Command.Name. 
func (*PortForward) Name() string { return "port-forward" } // Synopsis implements subcommands.Command.Synopsis. func (*PortForward) Synopsis() string { return "port forward to a secure container" } // Usage implements subcommands.Command.Usage. func (*PortForward) Usage() string { return `port-forward CONTAINER_ID [LOCAL_PORT:]REMOTE_PORT - port forward to gvisor container. Port forwarding has two modes. Local mode opens a local port and forwards connections to another port inside the specified container. Stream mode forwards a single connection on a UDS to the specified port in the container. EXAMPLES: The following will forward connections on local port 8080 to port 80 in the container named 'nginx': # runsc port-forward nginx 8080:80 The following will forward a single new connection on the unix domain socket at /tmp/pipe to port 80 in the container named 'nginx': # runsc port-forward --stream /tmp/pipe nginx 80 OPTIONS: ` } // SetFlags implements subcommands.Command.SetFlags. func (p *PortForward) SetFlags(f *flag.FlagSet) { f.StringVar(&p.stream, "stream", "", "Stream mode - a Unix domain socket") } // Execute implements subcommands.Command.Execute. func (p *PortForward) Execute(ctx context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { conf := args[0].(*config.Config) // Requires at least the container id and port. if f.NArg() != 2 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) portStr := f.Arg(1) c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { util.Fatalf("loading container: %v", err) } if p.stream != "" { if err := p.doStream(ctx, portStr, c); err != nil { util.Fatalf("doStream: %v", err) } return subcommands.ExitSuccess } // Allow forwarding to a local port. ports := strings.Split(portStr, ":") if len(ports) != 2 { util.Fatalf("invalid port string %q", portStr) } localPort, err := strconv.Atoi(ports[0]) if err != nil { util.Fatalf("invalid port string %q: %v", portStr, err) } portNum, err := strconv.Atoi(ports[1]) if err != nil { util.Fatalf("invalid port string %q: %v", portStr, err) } if portNum <= 0 || portNum > math.MaxUint16 { util.Fatalf("invalid port %d: %v", portNum, err) } // Start port forwarding with the local port. var wg sync.WaitGroup ctx, cancel := context.WithCancel(ctx) wg.Add(3) go func(localPort, portNum int) { defer cancel() defer wg.Done() // Print message to local user. fmt.Printf("Forwarding local port %d to %d...\n", localPort, portNum) if err := localForward(ctx, c, localPort, uint16(portNum)); err != nil { log.Warningf("port forwarding: %v", err) } }(localPort, portNum) // Exit port forwarding if the container exits. go func() { defer wg.Done() // Cancel port forwarding after Wait returns regardless of return // value as err may indicate sandbox has terminated already. _, _ = c.Wait() fmt.Printf("Container %q stopped. Exiting...\n", c.ID) cancel() }() // Wait for ^C from the user. go func() { defer wg.Done() sig := waitSignal() fmt.Printf("Got %v, Exiting...\n", sig) cancel() }() // Wait on a WaitGroup for port forwarding to clean up before exiting. wg.Wait() return subcommands.ExitSuccess } // localForward starts port forwarding from the given local port.
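// It blocks until ctx is canceled (or the initial net.Listen fails); accept
// errors are only logged, and each accepted connection is relayed to the
// container port by portCopy on its own goroutine.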
func localForward(ctx context.Context, c *container.Container, localPort int, containerPort uint16) error { l, err := net.Listen("tcp", ":"+strconv.Itoa(localPort)) if err != nil { return err } defer l.Close() var localConnChan = make(chan net.Conn, 1) var errChan = make(chan error, 1) go func() { for { if ctx.Err() != nil { return } localConn, err := l.Accept() if err != nil { errChan <- err continue } localConnChan <- localConn } }() for { // Exit if the context is done. select { case <-ctx.Done(): return ctx.Err() case err := <-errChan: if err != nil { log.Warningf("accepting local connection: %v", err) } case localConn := <-localConnChan: // Dispatch a new goroutine to handle the new connection. go func() { defer localConn.Close() fmt.Println("Forwarding new connection...") err := portCopy(ctx, c, localConn, containerPort) if err != nil { log.Warningf("port forwarding: %v", err) } fmt.Println("Finished forwarding connection...") }() } } } // doStream does the stream version of the port-forward command. func (p *PortForward) doStream(ctx context.Context, port string, c *container.Container) error { var err error p.portNum, err = strconv.Atoi(port) if err != nil { return fmt.Errorf("invalid port string %q: %v", port, err) } if p.portNum < 0 || p.portNum > math.MaxUint16 { return fmt.Errorf("invalid port %d: %v", p.portNum, err) } f, err := openStream(p.stream) if err != nil { return fmt.Errorf("opening uds stream: %v", err) } defer f.Close() if err := c.PortForward(&boot.PortForwardOpts{ Port: uint16(p.portNum), ContainerID: c.ID, FilePayload: urpc.FilePayload{Files: []*os.File{f}}, }); err != nil { return fmt.Errorf("PortForward: %v", err) } return nil } // portCopy creates a UDS and begins copying data to and from the local // connection. func portCopy(ctx context.Context, c *container.Container, localConn net.Conn, port uint16) error { // Create a new path address for the UDS. addr, err := tmpUDSAddr() if err != nil { return err } // Create the UDS and Listen on it. l, err := net.Listen("unix", addr) if err != nil { return err } defer l.Close() // Open the UDS as a File so it can be donated to the sentry. streamFile, err := openStream(addr) if err != nil { return fmt.Errorf("opening uds stream: %v", err) } defer streamFile.Close() // Request port forwarding from the sentry. This request will return // immediately after port forwarding is started and connection state is // handled via the UDS from then on. if err := c.PortForward(&boot.PortForwardOpts{ Port: port, FilePayload: urpc.FilePayload{Files: []*os.File{streamFile}}, }); err != nil { return fmt.Errorf("PortForward: %v", err) } // We have already opened a single connection on the UDS and passed the // client end to the sentry. We accept the connection now in order to get // the other half of the connection. conn, err := l.Accept() if err != nil { return err } toErrCh := make(chan error) fromErrCh := make(chan error) // Copy data from the local port to the UDS. go func() { defer conn.Close() defer localConn.Close() log.Debugf("Start copying from %q to %q", localConn.LocalAddr().String(), conn.LocalAddr().String()) _, err := io.Copy(localConn, conn) log.Debugf("Stopped copying from %q to %q", localConn.LocalAddr().String(), conn.LocalAddr().String()) toErrCh <- err close(toErrCh) }() // Copy data from the UDS to the local port. 
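// Together with the goroutine above, this forms a bidirectional relay between
// localConn and the UDS connection; the select loop further down waits for
// both directions to finish (or for ctx cancellation) before returning.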
go func() { defer conn.Close() defer localConn.Close() log.Debugf("Start copying from %q to %q", conn.LocalAddr().String(), localConn.LocalAddr().String()) _, err := io.Copy(conn, localConn) log.Debugf("Stopped copying from %q to %q", conn.LocalAddr().String(), localConn.LocalAddr().String()) fromErrCh <- err close(fromErrCh) }() errMap := map[string]error{} for { if len(errMap) == 2 { return nil } select { case e := <-toErrCh: errMap["toChannel"] = e case e := <-fromErrCh: errMap["fromChannel"] = e case <-ctx.Done(): log.Debugf("Port forwarding connection canceled for %q: %v", localConn.LocalAddr().String(), ctx.Err()) return ctx.Err() } } } // tmpUDS generates a temporary UDS addr. func tmpUDSAddr() (string, error) { tmpFile, err := ioutil.TempFile("", "runsc-port-forward") if err != nil { return "", err } path := tmpFile.Name() // Remove the tempfile and just use its name. os.Remove(path) return path, nil } // openStream opens a UDS as a socket and returns the file descriptor in an // os.File object. func openStream(name string) (*os.File, error) { // The net package will abstract the fd, so we use raw syscalls. fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) if err != nil { return nil, err } // We are acting as a client so we will connect to the socket. if err = syscall.Connect(fd, &syscall.SockaddrUnix{Name: name}); err != nil { syscall.Close(fd) return nil, err } // Return a File so that we can pass it to the Sentry. return os.NewFile(uintptr(fd), name), nil } // waitSignal waits for SIGINT, SIGQUIT, or SIGTERM from the user. func waitSignal() os.Signal { ch := make(chan os.Signal, 2) signal.Notify( ch, syscall.SIGINT, syscall.SIGQUIT, syscall.SIGTERM, ) return <-ch } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/ps.go000066400000000000000000000045151465435605700215160ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "fmt" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // PS implements subcommands.Command for the "ps" command. type PS struct { format string } // Name implements subcommands.Command.Name. func (*PS) Name() string { return "ps" } // Synopsis implements subcommands.Command.Synopsis. func (*PS) Synopsis() string { return "ps displays the processes running inside a container" } // Usage implements subcommands.Command.Usage. func (*PS) Usage() string { return " [ps options]" } // SetFlags implements subcommands.Command.SetFlags. func (ps *PS) SetFlags(f *flag.FlagSet) { f.StringVar(&ps.format, "format", "table", "output format. Select one of: table or json (default: table)") } // Execute implements subcommands.Command.Execute. 
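// Illustrative invocation: `runsc ps --format=json <container id>` prints the
// process list as JSON instead of the default table.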
func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{SkipCheck: true}) if err != nil { util.Fatalf("loading sandbox: %v", err) } pList, err := c.Processes() if err != nil { util.Fatalf("getting processes for container: %v", err) } switch ps.format { case "table": fmt.Println(control.ProcessListToTable(pList)) case "json": o, err := control.PrintPIDsJSON(pList) if err != nil { util.Fatalf("generating JSON: %v", err) } fmt.Println(o) default: util.Fatalf("unsupported format: %s", ps.format) } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/read_control.go000066400000000000000000000050051465435605700235420ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "fmt" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // ReadControl implements subcommands.Command for the "read-control" command. type ReadControl struct{} // Name implements subcommands.Command.Name. func (*ReadControl) Name() string { return "read-control" } // Synopsis implements subcommands.Command.Synopsis. func (*ReadControl) Synopsis() string { return "read a cgroups control value inside the container" } // Usage implements subcommands.Command.Usage. func (*ReadControl) Usage() string { return `read-control Where "" is the name for the instance of the container, "" is the name of an active cgroupv1 controller, is the path to the cgroup to read and is the name of the control file to read. EXAMPLE: # runsc read-control cpuacct / cpuacct.usage ` } // SetFlags implements subcommands.Command.SetFlags. func (r *ReadControl) SetFlags(f *flag.FlagSet) {} // Execute implements subcommands.Command.Execute. func (r *ReadControl) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() < 4 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{SkipCheck: true}) if err != nil { util.Fatalf("loading sandbox: %v", err) } out, err := c.Sandbox.CgroupsReadControlFile(control.CgroupControlFile{ Controller: f.Arg(1), Path: f.Arg(2), Name: f.Arg(3), }) if err != nil { fmt.Printf("ERROR: %s\n", err) return subcommands.ExitFailure } fmt.Printf("%s\n", out) return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/restore.go000066400000000000000000000117371465435605700225630ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "os" "github.com/google/subcommands" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" ) // Restore implements subcommands.Command for the "restore" command. type Restore struct { // Restore flags are a super-set of those for Create. Create // imagePath is the path to the saved container image imagePath string // detach indicates that runsc has to start a process and exit without waiting it. detach bool // direct indicates whether O_DIRECT should be used for reading the // checkpoint pages file. It is faster if the checkpoint files are not // already in the page cache (for example if its coming from an untouched // network block device). Usually the restore is done only once, so the cost // of adding the checkpoint files to the page cache can be redundant. direct bool } // Name implements subcommands.Command.Name. func (*Restore) Name() string { return "restore" } // Synopsis implements subcommands.Command.Synopsis. func (*Restore) Synopsis() string { return "restore a saved state of container (experimental)" } // Usage implements subcommands.Command.Usage. func (*Restore) Usage() string { return `restore [flags] - restore saved state of container. ` } // SetFlags implements subcommands.Command.SetFlags. func (r *Restore) SetFlags(f *flag.FlagSet) { r.Create.SetFlags(f) f.StringVar(&r.imagePath, "image-path", "", "directory path to saved container image") f.BoolVar(&r.detach, "detach", false, "detach from the container's process") f.BoolVar(&r.direct, "direct", false, "use O_DIRECT for reading checkpoint pages file") // Unimplemented flags necessary for compatibility with docker. var nsr bool f.BoolVar(&nsr, "no-subreaper", false, "ignored") var wp string f.StringVar(&wp, "work-path", "", "ignored") } // Execute implements subcommands.Command.Execute. 
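//
// Illustrative invocation, using the flags registered in SetFlags above
// (the image path and container id are placeholders):
//
//	# runsc restore --image-path=/tmp/checkpoint --detach mycontainer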
func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) waitStatus := args[1].(*unix.WaitStatus) if conf.Rootless { return util.Errorf("Rootless mode not supported with %q", r.Name()) } bundleDir := r.bundleDir if bundleDir == "" { bundleDir = getwdOrDie() } if r.imagePath == "" { return util.Errorf("image-path flag must be provided") } var cu cleanup.Cleanup defer cu.Clean() runArgs := container.Args{ ID: id, Spec: nil, BundleDir: bundleDir, ConsoleSocket: r.consoleSocket, PIDFile: r.pidFile, UserLog: r.userLog, Attached: !r.detach, } log.Debugf("Restore container, cid: %s, rootDir: %q", id, conf.RootDir) c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { if err != os.ErrNotExist { return util.Errorf("loading container: %v", err) } log.Warningf("Container not found, creating new one, cid: %s, spec from: %s", id, bundleDir) // Read the spec again here to ensure flag annotations from the spec are // applied to "conf". if runArgs.Spec, err = specutils.ReadSpec(bundleDir, conf); err != nil { return util.Errorf("reading spec: %v", err) } specutils.LogSpecDebug(runArgs.Spec, conf.OCISeccomp) if c, err = container.New(conf, runArgs); err != nil { return util.Errorf("creating container: %v", err) } // Clean up partially created container if an error occurs. // Any errors returned by Destroy() itself are ignored. cu.Add(func() { c.Destroy() }) } else { runArgs.Spec = c.Spec } log.Debugf("Restore: %v", r.imagePath) if err := c.Restore(conf, r.imagePath, r.direct); err != nil { return util.Errorf("starting container: %v", err) } // If we allocate a terminal, forward signals to the sandbox process. // Otherwise, Ctrl+C will terminate this process and its children, // including the terminal. if c.Spec.Process.Terminal { stopForwarding := c.ForwardSignals(0, true /* fgProcess */) defer stopForwarding() } var ws unix.WaitStatus if runArgs.Attached { if ws, err = c.Wait(); err != nil { return util.Errorf("running container: %v", err) } } *waitStatus = ws cu.Release() return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/resume.go000066400000000000000000000036051465435605700223730ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // Resume implements subcommands.Command for the "resume" command. type Resume struct{} // Name implements subcommands.Command.Name. func (*Resume) Name() string { return "resume" } // Synopsis implements subcommands.Command.Synopsis. func (*Resume) Synopsis() string { return "Resume unpauses a paused container" } // Usage implements subcommands.Command.Usage. 
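//
// Resume is the counterpart of "runsc pause"; an illustrative sequence
// (the container id is a placeholder):
//
//	# runsc pause mycontainer
//	# runsc resume mycontainer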
func (*Resume) Usage() string { return `resume - resume a paused container. ` } // SetFlags implements subcommands.Command.SetFlags. func (r *Resume) SetFlags(*flag.FlagSet) { } // Execute implements subcommands.Command.Execute. func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { util.Fatalf("loading container: %v", err) } if err := cont.Resume(); err != nil { util.Fatalf("resume failed: %v", err) } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/run.go000066400000000000000000000102361465435605700216750ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "os" "github.com/google/subcommands" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" ) // Run implements subcommands.Command for the "run" command. type Run struct { // Run flags are a super-set of those for Create. Create // detach indicates that runsc has to start a process and exit without waiting it. detach bool // passFDs are user-supplied FDs from the host to be exposed to the // sandboxed app. passFDs fdMappings // execFD is the host file descriptor used for program execution. execFD int } // Name implements subcommands.Command.Name. func (*Run) Name() string { return "run" } // Synopsis implements subcommands.Command.Synopsis. func (*Run) Synopsis() string { return "create and run a secure container" } // Usage implements subcommands.Command.Usage. func (*Run) Usage() string { return `run [flags] - create and run a secure container. ` } // SetFlags implements subcommands.Command.SetFlags. func (r *Run) SetFlags(f *flag.FlagSet) { f.BoolVar(&r.detach, "detach", false, "detach from the container's process") f.Var(&r.passFDs, "pass-fd", "file descriptor passed to the container in M:N format, where M is the host and N is the guest descriptor (can be supplied multiple times)") f.IntVar(&r.execFD, "exec-fd", -1, "host file descriptor used for program execution") r.Create.SetFlags(f) } // Execute implements subcommands.Command.Execute. 
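//
// Illustrative invocation showing the FD-passing flags registered in
// SetFlags above (FD numbers, bundle path, and container id are
// placeholders); each --pass-fd maps host FD M to guest FD N:
//
//	# runsc run --bundle /path/to/bundle --pass-fd 5:3 --pass-fd 6:4 mycontainer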
func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) waitStatus := args[1].(*unix.WaitStatus) if conf.Rootless { if conf.Network == config.NetworkSandbox { return util.Errorf("sandbox network isn't supported with --rootless, use --network=none or --network=host") } if err := specutils.MaybeRunAsRoot(); err != nil { return util.Errorf("Error executing inside namespace: %v", err) } // Execution will continue here if no more capabilities are needed... } bundleDir := r.bundleDir if bundleDir == "" { bundleDir = getwdOrDie() } spec, err := specutils.ReadSpec(bundleDir, conf) if err != nil { return util.Errorf("reading spec: %v", err) } specutils.LogSpecDebug(spec, conf.OCISeccomp) // Create files from file descriptors. fdMap := make(map[int]*os.File) for _, mapping := range r.passFDs { file := os.NewFile(uintptr(mapping.Host), "") if file == nil { return util.Errorf("Failed to create file from file descriptor %d", mapping.Host) } fdMap[mapping.Guest] = file } var execFile *os.File if r.execFD >= 0 { execFile = os.NewFile(uintptr(r.execFD), "exec-fd") } // Close the underlying file descriptors after we have passed them. defer func() { for _, file := range fdMap { fd := file.Fd() if file.Close() != nil { log.Debugf("Failed to close FD %d", fd) } } if execFile != nil && execFile.Close() != nil { log.Debugf("Failed to close exec FD") } }() runArgs := container.Args{ ID: id, Spec: spec, BundleDir: bundleDir, ConsoleSocket: r.consoleSocket, PIDFile: r.pidFile, UserLog: r.userLog, Attached: !r.detach, PassFiles: fdMap, ExecFile: execFile, } ws, err := container.Run(conf, runArgs) if err != nil { return util.Errorf("running container: %v", err) } *waitStatus = ws return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/spec.go000066400000000000000000000116221465435605700220230ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
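// The spec command in this file generates a starter config.json. Trimmed to
// its most relevant fields, the generated file looks roughly like this
// (illustrative excerpt, not the full output):
//
//	{
//	  "ociVersion": "1.0.0",
//	  "process": {"args": ["sh"], "cwd": "/"},
//	  "root": {"path": "rootfs", "readonly": true},
//	  "hostname": "runsc"
//	}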
package cmd import ( "context" "encoding/json" "io" "os" "path/filepath" "github.com/google/subcommands" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/flag" ) func writeSpec(w io.Writer, cwd string, netns string, args []string) error { spec := &specs.Spec{ Version: "1.0.0", Process: &specs.Process{ User: specs.User{ UID: 0, GID: 0, }, Args: args, Env: []string{ "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "TERM=xterm", }, Cwd: cwd, Capabilities: &specs.LinuxCapabilities{ Bounding: []string{ "CAP_AUDIT_WRITE", "CAP_KILL", "CAP_NET_BIND_SERVICE", }, Effective: []string{ "CAP_AUDIT_WRITE", "CAP_KILL", "CAP_NET_BIND_SERVICE", }, Inheritable: []string{ "CAP_AUDIT_WRITE", "CAP_KILL", "CAP_NET_BIND_SERVICE", }, Permitted: []string{ "CAP_AUDIT_WRITE", "CAP_KILL", "CAP_NET_BIND_SERVICE", }, // TODO(gvisor.dev/issue/3166): support ambient capabilities }, Rlimits: []specs.POSIXRlimit{ { Type: "RLIMIT_NOFILE", Hard: 1024, Soft: 1024, }, }, }, Root: &specs.Root{ Path: "rootfs", Readonly: true, }, Hostname: "runsc", Mounts: []specs.Mount{ { Destination: "/proc", Type: "proc", Source: "proc", }, { Destination: "/dev", Type: "tmpfs", Source: "tmpfs", }, { Destination: "/sys", Type: "sysfs", Source: "sysfs", Options: []string{ "nosuid", "noexec", "nodev", "ro", }, }, }, Linux: &specs.Linux{ Namespaces: []specs.LinuxNamespace{ { Type: "pid", }, { Type: "network", Path: netns, }, { Type: "ipc", }, { Type: "uts", }, { Type: "mount", }, }, }, } e := json.NewEncoder(w) e.SetIndent("", " ") return e.Encode(spec) } // Spec implements subcommands.Command for the "spec" command. type Spec struct { bundle string cwd string netns string } // Name implements subcommands.Command.Name. func (*Spec) Name() string { return "spec" } // Synopsis implements subcommands.Command.Synopsis. func (*Spec) Synopsis() string { return "create a new OCI bundle specification file" } // Usage implements subcommands.Command.Usage. func (*Spec) Usage() string { return `spec [options] [-- args...] - create a new OCI bundle specification file. The spec command creates a new specification file (config.json) for a new OCI bundle. The specification file is a starter file that runs the command specified by 'args' in the container. If 'args' is not specified the default is to run the 'sh' program. While a number of flags are provided to change values in the specification, you can examine the file and edit it to suit your needs after this command runs. You can find out more about the format of the specification file by visiting the OCI runtime spec repository: https://github.com/opencontainers/runtime-spec/ EXAMPLE: $ mkdir -p bundle/rootfs $ cd bundle $ runsc spec -- /hello $ docker export $(docker create hello-world) | tar -xf - -C rootfs $ sudo runsc run hello ` } // SetFlags implements subcommands.Command.SetFlags. func (s *Spec) SetFlags(f *flag.FlagSet) { f.StringVar(&s.bundle, "bundle", ".", "path to the root of the OCI bundle") f.StringVar(&s.cwd, "cwd", "/", "working directory that will be set for the executable, "+ "this value MUST be an absolute path") f.StringVar(&s.netns, "netns", "", "network namespace path") } // Execute implements subcommands.Command.Execute. func (s *Spec) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { // Grab the arguments. 
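// If no arguments were supplied after "--", default to running "sh", as
// documented in Usage above.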
containerArgs := f.Args() if len(containerArgs) == 0 { containerArgs = []string{"sh"} } confPath := filepath.Join(s.bundle, "config.json") if _, err := os.Stat(confPath); !os.IsNotExist(err) { util.Fatalf("file %q already exists", confPath) } configFile, err := os.OpenFile(confPath, os.O_WRONLY|os.O_CREATE, 0664) if err != nil { util.Fatalf("opening file %q: %v", confPath, err) } err = writeSpec(configFile, s.cwd, s.netns, containerArgs) if err != nil { util.Fatalf("writing to %q: %v", confPath, err) } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/start.go000066400000000000000000000041351465435605700222270ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/specutils" ) // Start implements subcommands.Command for the "start" command. type Start struct{} // Name implements subcommands.Command.Name. func (*Start) Name() string { return "start" } // Synopsis implements subcommands.Command.Synopsis. func (*Start) Synopsis() string { return "start a secure container" } // Usage implements subcommands.Command.Usage. func (*Start) Usage() string { return `start - start a secure container.` } // SetFlags implements subcommands.Command.SetFlags. func (*Start) SetFlags(*flag.FlagSet) {} // Execute implements subcommands.Command.Execute. func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { util.Fatalf("loading container: %v", err) } // Read the spec again here to ensure flag annotations from the spec are // applied to "conf". if _, err := specutils.ReadSpec(c.BundleDir, conf); err != nil { util.Fatalf("reading spec: %v", err) } if err := c.Start(conf); err != nil { util.Fatalf("starting container: %v", err) } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/state.go000066400000000000000000000042201465435605700222050ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package cmd import ( "context" "encoding/json" "os" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // State implements subcommands.Command for the "state" command. type State struct{} // Name implements subcommands.Command.Name. func (*State) Name() string { return "state" } // Synopsis implements subcommands.Command.Synopsis. func (*State) Synopsis() string { return "get the state of a container" } // Usage implements subcommands.Command.Usage. func (*State) Usage() string { return `state [flags] - get the state of a container` } // SetFlags implements subcommands.Command.SetFlags. func (*State) SetFlags(*flag.FlagSet) {} // Execute implements subcommands.Command.Execute. func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { util.Fatalf("loading container: %v", err) } state := c.State() log.Debugf("Returning state for container %q: %+v", c.ID, state) // Write json-encoded state directly to stdout. encoder := json.NewEncoder(os.Stdout) encoder.SetIndent("", " ") if err := encoder.Encode(state); err != nil { util.Fatalf("error marshaling container state: %v", err) } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/statefile.go000066400000000000000000000074341465435605700230570ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "fmt" "os" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/state/pretty" "gvisor.dev/gvisor/pkg/state/statefile" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/flag" ) // Statefile implements subcommands.Command for the "statefile" command. type Statefile struct { list bool get string key string output string html bool } // Name implements subcommands.Command. func (*Statefile) Name() string { return "state" } // Synopsis implements subcommands.Command. func (*Statefile) Synopsis() string { return "shows information about a statefile" } // Usage implements subcommands.Command. func (*Statefile) Usage() string { return `statefile [flags] ` } // SetFlags implements subcommands.Command. func (s *Statefile) SetFlags(f *flag.FlagSet) { f.BoolVar(&s.list, "list", false, "lists the metdata in the statefile.") f.StringVar(&s.get, "get", "", "extracts the given metadata key.") f.StringVar(&s.key, "key", "", "the integrity key for the file.") f.StringVar(&s.output, "output", "", "target to write the result.") f.BoolVar(&s.html, "html", false, "outputs in HTML format.") } // Execute implements subcommands.Command.Execute. 
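//
// Illustrative invocations using the flags registered in SetFlags above
// (the statefile path and metadata key are placeholders):
//
//	# runsc statefile --list /tmp/checkpoint/checkpoint.img
//	# runsc statefile --get somekey /tmp/checkpoint/checkpoint.img
//	# runsc statefile --html --output=state.html /tmp/checkpoint/checkpoint.img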
func (s *Statefile) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { // Check arguments. if s.list && s.get != "" { util.Fatalf("error: can't specify -list and -get simultaneously.") } // Setup output. var output = os.Stdout // Default. if s.output != "" { f, err := os.OpenFile(s.output, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0644) if err != nil { util.Fatalf("error opening output: %v", err) } defer func() { if err := f.Close(); err != nil { util.Fatalf("error flushing output: %v", err) } }() output = f } // Open the file. if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } input, err := os.Open(f.Arg(0)) if err != nil { util.Fatalf("error opening input: %v\n", err) } if s.html { fmt.Fprintf(output, "\n") defer fmt.Fprintf(output, "\n") } // Dump the full file? if !s.list && s.get == "" { var key []byte if s.key != "" { key = []byte(s.key) } rc, _, err := statefile.NewReader(input, key) if err != nil { util.Fatalf("error parsing statefile: %v", err) } if s.html { if err := pretty.PrintHTML(output, rc); err != nil { util.Fatalf("error printing state: %v", err) } } else { if err := pretty.PrintText(output, rc); err != nil { util.Fatalf("error printing state: %v", err) } } return subcommands.ExitSuccess } // Load just the metadata. metadata, err := statefile.MetadataUnsafe(input) if err != nil { util.Fatalf("error reading metadata: %v", err) } // Is it a single key? if s.get != "" { val, ok := metadata[s.get] if !ok { util.Fatalf("metadata key %s: not found", s.get) } fmt.Fprintf(output, "%s\n", val) return subcommands.ExitSuccess } // List all keys. if s.html { fmt.Fprintf(output, "
    \n") defer fmt.Fprintf(output, "
\n") } for key := range metadata { if s.html { fmt.Fprintf(output, "
<li>%s</li>
  • \n", key) } else { fmt.Fprintf(output, "%s\n", key) } } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/symbolize.go000066400000000000000000000061351465435605700231110ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "bufio" "context" "os" "strconv" "strings" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/flag" ) // Symbolize implements subcommands.Command for the "symbolize" command. type Symbolize struct { dumpAll bool } // Name implements subcommands.Command.Name. func (*Symbolize) Name() string { return "symbolize" } // Synopsis implements subcommands.Command.Synopsis. func (*Symbolize) Synopsis() string { return "Convert synthetic instruction pointers from kcov into positions in the runsc source code. Only used when Go coverage is enabled." } // Usage implements subcommands.Command.Usage. func (*Symbolize) Usage() string { return `symbolize - converts synthetic instruction pointers into positions in the runsc source code. This command takes instruction pointers from stdin and converts them into their corresponding file names and line/column numbers in the runsc source code. The inputs are not interpreted as actual addresses, but as synthetic values that are exposed through /sys/kernel/debug/kcov. One can extract coverage information from kcov and translate those values into locations in the source code by running symbolize on the same runsc binary. ` } // SetFlags implements subcommands.Command.SetFlags. func (c *Symbolize) SetFlags(f *flag.FlagSet) { f.BoolVar(&c.dumpAll, "all", false, "dump information on all coverage blocks along with their synthetic PCs") } // Execute implements subcommands.Command.Execute. func (c *Symbolize) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 0 { f.Usage() return subcommands.ExitUsageError } if !coverage.Available() { return util.Errorf("symbolize can only be used when coverage is available.") } coverage.InitCoverageData() if c.dumpAll { if err := coverage.WriteAllBlocks(os.Stdout); err != nil { return util.Errorf("Failed to write out blocks: %v", err) } return subcommands.ExitSuccess } scanner := bufio.NewScanner(os.Stdin) for scanner.Scan() { // Input is always base 16, but may or may not have a leading "0x". str := strings.TrimPrefix(scanner.Text(), "0x") pc, err := strconv.ParseUint(str, 16 /* base */, 64 /* bitSize */) if err != nil { return util.Errorf("Failed to symbolize \"%s\": %v", scanner.Text(), err) } if err := coverage.Symbolize(os.Stdout, pc); err != nil { return util.Errorf("Failed to symbolize \"%s\": %v", scanner.Text(), err) } } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/syscalls.go000066400000000000000000000217611465435605700227330ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "encoding/csv" "encoding/json" "fmt" "io" "os" "sort" "strconv" "text/tabwriter" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/flag" ) // Syscalls implements subcommands.Command for the "syscalls" command. type Syscalls struct { format string os string arch string filename string } // CompatibilityInfo is a map of system and architecture to compatibility doc. // Maps operating system to architecture to ArchInfo. type CompatibilityInfo map[string]map[string]ArchInfo // ArchInfo is compatibility doc for an architecture. type ArchInfo struct { // Syscalls maps syscall number for the architecture to the doc. Syscalls map[uintptr]SyscallDoc `json:"syscalls"` } // SyscallDoc represents a single item of syscall documentation. type SyscallDoc struct { Name string `json:"name"` num uintptr Support string `json:"support"` Note string `json:"note,omitempty"` URLs []string `json:"urls,omitempty"` } type outputFunc func(io.Writer, CompatibilityInfo) error var ( // The string name to use for printing compatibility for all OSes. osAll = "all" // The string name to use for printing compatibility for all architectures. archAll = "all" // A map of OS name to map of architecture name to syscall table. syscallTableMap = make(map[string]map[string]*kernel.SyscallTable) // A map of output type names to output functions. outputMap = map[string]outputFunc{ "table": outputTable, "json": outputJSON, "csv": outputCSV, } ) // Name implements subcommands.Command.Name. func (*Syscalls) Name() string { return "syscalls" } // Synopsis implements subcommands.Command.Synopsis. func (*Syscalls) Synopsis() string { return "Print compatibility information for syscalls." } // Usage implements subcommands.Command.Usage. func (*Syscalls) Usage() string { return `syscalls [options] - Print compatibility information for syscalls. ` } // SetFlags implements subcommands.Command.SetFlags. func (s *Syscalls) SetFlags(f *flag.FlagSet) { f.StringVar(&s.format, "format", "table", "Output format (table, csv, json).") f.StringVar(&s.os, "os", osAll, "The OS (e.g. linux)") f.StringVar(&s.arch, "arch", archAll, "The CPU architecture (e.g. amd64).") f.StringVar(&s.filename, "filename", "", "Output filename (otherwise stdout).") } // Execute implements subcommands.Command.Execute. func (s *Syscalls) Execute(context.Context, *flag.FlagSet, ...any) subcommands.ExitStatus { out, ok := outputMap[s.format] if !ok { util.Fatalf("Unsupported output format %q", s.format) } // Build map of all supported architectures. tables := kernel.SyscallTables() for _, t := range tables { osMap, ok := syscallTableMap[t.OS.String()] if !ok { osMap = make(map[string]*kernel.SyscallTable) syscallTableMap[t.OS.String()] = osMap } osMap[t.Arch.String()] = t } // Build a map of the architectures we want to output. 
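// getCompatibilityInfo expands the special name "all" (for either the OS or
// the architecture) into every entry found in syscallTableMap above.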
info, err := getCompatibilityInfo(s.os, s.arch) if err != nil { util.Fatalf("%v", err) } w := os.Stdout // Default. if s.filename != "" { w, err = os.OpenFile(s.filename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) if err != nil { util.Fatalf("Error opening %q: %v", s.filename, err) } } if err := out(w, info); err != nil { util.Fatalf("Error writing output: %v", err) } return subcommands.ExitSuccess } // getCompatibilityInfo returns compatibility info for the given OS name and // architecture name. Supports the special name 'all' for OS and architecture that // specifies that all supported OSes or architectures should be included. func getCompatibilityInfo(osName string, archName string) (CompatibilityInfo, error) { info := CompatibilityInfo(make(map[string]map[string]ArchInfo)) if osName == osAll { // Special processing for the 'all' OS name. for osName := range syscallTableMap { info[osName] = make(map[string]ArchInfo) // osName is a specific OS name. if err := addToCompatibilityInfo(info, osName, archName); err != nil { return info, err } } } else { // osName is a specific OS name. info[osName] = make(map[string]ArchInfo) if err := addToCompatibilityInfo(info, osName, archName); err != nil { return info, err } } return info, nil } // addToCompatibilityInfo adds ArchInfo for the given specific OS name and // architecture name. Supports the special architecture name 'all' to specify // that all supported architectures for the OS should be included. func addToCompatibilityInfo(info CompatibilityInfo, osName string, archName string) error { if archName == archAll { // Special processing for the 'all' architecture name. for archName := range syscallTableMap[osName] { archInfo, err := getArchInfo(osName, archName) if err != nil { return err } info[osName][archName] = archInfo } } else { // archName is a specific architecture name. archInfo, err := getArchInfo(osName, archName) if err != nil { return err } info[osName][archName] = archInfo } return nil } // getArchInfo returns compatibility info for a specific OS and architecture. func getArchInfo(osName string, archName string) (ArchInfo, error) { info := ArchInfo{} info.Syscalls = make(map[uintptr]SyscallDoc) t, ok := syscallTableMap[osName][archName] if !ok { return info, fmt.Errorf("syscall table for %s/%s not found", osName, archName) } for num, sc := range t.Table { info.Syscalls[num] = SyscallDoc{ Name: sc.Name, num: num, Support: sc.SupportLevel.String(), Note: sc.Note, URLs: sc.URLs, } } return info, nil } // outputTable outputs the syscall info in tabular format. func outputTable(w io.Writer, info CompatibilityInfo) error { tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0) // Linux for osName, osInfo := range info { for archName, archInfo := range osInfo { // Print the OS/arch fmt.Fprintf(w, "%s/%s:\n\n", osName, archName) // Sort the syscalls for output in the table. sortedCalls := []SyscallDoc{} for _, sc := range archInfo.Syscalls { sortedCalls = append(sortedCalls, sc) } sort.Slice(sortedCalls, func(i, j int) bool { return sortedCalls[i].num < sortedCalls[j].num }) // Write the header _, err := fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n", "NUM", "NAME", "SUPPORT", "NOTE", ) if err != nil { return err } // Write each syscall entry for _, sc := range sortedCalls { _, err = fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n", strconv.FormatInt(int64(sc.num), 10), sc.Name, sc.Support, sc.Note, ) if err != nil { return err } // Add issue urls to note. 
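// Each URL gets its own row with the other columns left blank, so the
// references line up under the syscall they annotate.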
for _, url := range sc.URLs { _, err = fmt.Fprintf(tw, "%s\t%s\t%s\tSee: %s\t\n", "", "", "", url, ) if err != nil { return err } } } err = tw.Flush() if err != nil { return err } } } return nil } // outputJSON outputs the syscall info in JSON format. func outputJSON(w io.Writer, info CompatibilityInfo) error { e := json.NewEncoder(w) e.SetIndent("", " ") return e.Encode(info) } // numberedRow is aCSV row annotated by syscall number (used for sorting) type numberedRow struct { num uintptr row []string } // outputCSV outputs the syscall info in tabular format. func outputCSV(w io.Writer, info CompatibilityInfo) error { csvWriter := csv.NewWriter(w) // Linux for osName, osInfo := range info { for archName, archInfo := range osInfo { // Sort the syscalls for output in the table. sortedCalls := []numberedRow{} for _, sc := range archInfo.Syscalls { // Add issue urls to note. note := sc.Note for _, url := range sc.URLs { note = fmt.Sprintf("%s\nSee: %s", note, url) } sortedCalls = append(sortedCalls, numberedRow{ num: sc.num, row: []string{ osName, archName, strconv.FormatInt(int64(sc.num), 10), sc.Name, sc.Support, note, }, }) } sort.Slice(sortedCalls, func(i, j int) bool { return sortedCalls[i].num < sortedCalls[j].num }) // Write the header err := csvWriter.Write([]string{ "OS", "Arch", "Num", "Name", "Support", "Note", }) if err != nil { return err } // Write each syscall entry for _, sc := range sortedCalls { err = csvWriter.Write(sc.row) if err != nil { return err } } csvWriter.Flush() err = csvWriter.Error() if err != nil { return err } } } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/trace/000077500000000000000000000000001465435605700216365ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/trace/create.go000066400000000000000000000067421465435605700234410ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package trace import ( "context" "encoding/json" "fmt" "os" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/seccheck" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // create implements subcommands.Command for the "create" command. type create struct { config string force bool } // Name implements subcommands.Command. func (*create) Name() string { return "create" } // Synopsis implements subcommands.Command. func (*create) Synopsis() string { return "create a trace session" } // Usage implements subcommands.Command. func (*create) Usage() string { return `create [flags] - create a trace session ` } // SetFlags implements subcommands.Command. func (l *create) SetFlags(f *flag.FlagSet) { f.StringVar(&l.config, "config", "", "path to the JSON file that describes the session being created") f.BoolVar(&l.force, "force", false, "deletes a conflicting session, if one exists") } // Execute implements subcommands.Command. 
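//
// A minimal session configuration file might look like this (illustrative;
// the point and sink names are examples, see pkg/sentry/seccheck for the
// actual schema and the available names):
//
//	{
//	  "name": "Default",
//	  "points": [{"name": "container/start"}],
//	  "sinks": [{"name": "remote", "config": {"endpoint": "/tmp/gvisor_events.sock"}}]
//	}
//
// which would then be loaded with (paths and the sandbox id are placeholders):
//
//	# runsc trace create --config=/path/to/session.json mysandbox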
func (l *create) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } if len(l.config) == 0 { f.Usage() return util.Errorf("missing path to configuration file, please set --config=[path]") } sessionConfig, err := decodeTraceConfig(l.config) if err != nil { return util.Errorf("loading config file: %v", err) } id := f.Arg(0) conf := args[0].(*config.Config) opts := container.LoadOpts{ SkipCheck: true, RootContainer: true, } c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, opts) if err != nil { util.Fatalf("loading sandbox: %v", err) } if err := c.Sandbox.CreateTraceSession(sessionConfig, l.force); err != nil { util.Fatalf("creating session: %v", err) } fmt.Printf("Trace session %q created.\n", sessionConfig.Name) return subcommands.ExitSuccess } func decodeTraceConfig(path string) (*seccheck.SessionConfig, error) { file, err := os.Open(path) if err != nil { return nil, err } defer file.Close() decoder := json.NewDecoder(file) decoder.DisallowUnknownFields() sessionConfig := &seccheck.SessionConfig{} err = decoder.Decode(sessionConfig) if err == nil { // Success, we're done. return sessionConfig, nil } // If file cannot be decoded as a SessionConfig, try with InitConfig as // convenience in case the caller wants to reuse a trace session from // InitConfig file. log.Debugf("Config file is not a seccheck.SessionConfig, try with boot.InitConfig instead: %v", err) if _, err := file.Seek(0, 0); err != nil { return nil, err } initConfig := &boot.InitConfig{} if err := decoder.Decode(initConfig); err != nil { return nil, fmt.Errorf("invalid configuration file: %w", err) } return &initConfig.TraceSession, nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/trace/delete.go000066400000000000000000000043021465435605700234260ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package trace import ( "context" "fmt" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // delete implements subcommands.Command for the "delete" command. type delete struct { sessionName string } // Name implements subcommands.Command. func (*delete) Name() string { return "delete" } // Synopsis implements subcommands.Command. func (*delete) Synopsis() string { return "delete a trace session" } // Usage implements subcommands.Command. func (*delete) Usage() string { return `delete [flags] - delete a trace session ` } // SetFlags implements subcommands.Command. func (l *delete) SetFlags(f *flag.FlagSet) { f.StringVar(&l.sessionName, "name", "", "name of session to be deleted") } // Execute implements subcommands.Command. 
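//
// Illustrative invocation (session name and sandbox id are placeholders):
//
//	# runsc trace delete --name=Default mysandbox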
func (l *delete) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } if len(l.sessionName) == 0 { f.Usage() return util.Errorf("missing session name, please set --name") } id := f.Arg(0) conf := args[0].(*config.Config) opts := container.LoadOpts{ SkipCheck: true, RootContainer: true, } c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, opts) if err != nil { util.Fatalf("loading sandbox: %v", err) } if err := c.Sandbox.DeleteTraceSession(l.sessionName); err != nil { util.Fatalf("deleting session: %v", err) } fmt.Printf("Trace session %q deleted.\n", l.sessionName) return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/trace/list.go000066400000000000000000000041751465435605700231470ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package trace import ( "context" "fmt" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // list implements subcommands.Command for the "list" command. type list struct{} // Name implements subcommands.Command. func (*list) Name() string { return "list" } // Synopsis implements subcommands.Command. func (*list) Synopsis() string { return "list all trace sessions" } // Usage implements subcommands.Command. func (*list) Usage() string { return `list - list all trace sessions ` } // SetFlags implements subcommands.Command. func (*list) SetFlags(*flag.FlagSet) {} // Execute implements subcommands.Command. func (l *list) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) opts := container.LoadOpts{ SkipCheck: true, RootContainer: true, } c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, opts) if err != nil { util.Fatalf("loading sandbox: %v", err) } sessions, err := c.Sandbox.ListTraceSessions() if err != nil { util.Fatalf("listing sessions: %v", err) } fmt.Printf("SESSIONS (%d)\n", len(sessions)) for _, session := range sessions { fmt.Printf("%q\n", session.Name) for _, sink := range session.Sinks { fmt.Printf("\tSink: %q, dropped: %d\n", sink.Name, sink.Status.DroppedCount) } } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/trace/metadata.go000066400000000000000000000045341465435605700237530ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package trace import ( "context" "fmt" "sort" "strings" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/sentry/seccheck" "gvisor.dev/gvisor/runsc/flag" ) // metadata implements subcommands.Command for the "metadata" command. type metadata struct{} // Name implements subcommands.Command. func (*metadata) Name() string { return "metadata" } // Synopsis implements subcommands.Command. func (*metadata) Synopsis() string { return "list all trace points configuration information" } // Usage implements subcommands.Command. func (*metadata) Usage() string { return `metadata - list all trace points configuration information ` } // SetFlags implements subcommands.Command. func (*metadata) SetFlags(*flag.FlagSet) {} // Execute implements subcommands.Command. func (l *metadata) Execute(context.Context, *flag.FlagSet, ...any) subcommands.ExitStatus { // Sort to keep related points together. points := make([]seccheck.PointDesc, 0, len(seccheck.Points)) for _, pt := range seccheck.Points { points = append(points, pt) } sort.Slice(points, func(i int, j int) bool { return points[i].Name < points[j].Name }) fmt.Printf("POINTS (%d)\n", len(seccheck.Points)) for _, pt := range points { optFields := fieldNames(pt.OptionalFields) ctxFields := fieldNames(pt.ContextFields) fmt.Printf("Name: %s, optional fields: [%s], context fields: [%s]\n", pt.Name, strings.Join(optFields, "|"), strings.Join(ctxFields, "|")) } fmt.Printf("\nSINKS (%d)\n", len(seccheck.Sinks)) for _, sink := range seccheck.Sinks { fmt.Printf("Name: %s\n", sink.Name) } return subcommands.ExitSuccess } func fieldNames(fields []seccheck.FieldDesc) []string { names := make([]string, 0, len(fields)) for _, f := range fields { names = append(names, f.Name) } return names } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/trace/procfs.go000066400000000000000000000043321465435605700234630ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package trace import ( "context" "encoding/json" "fmt" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // procfs implements subcommands.Command for the "procfs" command. type procfs struct { } // Name implements subcommands.Command. func (*procfs) Name() string { return "procfs" } // Synopsis implements subcommands.Command. func (*procfs) Synopsis() string { return "dump procfs state for sandbox" } // Usage implements subcommands.Command. 
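//
// The output is one JSON document per process in the sandbox, printed to
// stdout after a "PROCFS DUMP" header (see Execute below).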
func (*procfs) Usage() string { return `procfs - get procfs dump for a trace session ` } // SetFlags implements subcommands.Command. func (*procfs) SetFlags(*flag.FlagSet) {} // Execute implements subcommands.Command. func (*procfs) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) opts := container.LoadOpts{ SkipCheck: true, RootContainer: true, } c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, opts) if err != nil { util.Fatalf("loading sandbox: %v", err) } dump, err := c.Sandbox.ProcfsDump() if err != nil { util.Fatalf("procfs dump: %v", err) } fmt.Println("PROCFS DUMP") for _, procDump := range dump { out, err := json.Marshal(procDump) if err != nil { log.Warningf("json.Marshal failed to marshal %+v: %v", procDump, err) continue } fmt.Println("") fmt.Println(string(out)) } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/trace/trace.go000066400000000000000000000040701465435605700232640ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package trace provides subcommands for the trace command. package trace import ( "bytes" "context" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/sentry/seccheck" "gvisor.dev/gvisor/runsc/flag" ) // Trace implements subcommands.Command for the "trace" command. type Trace struct{} // Name implements subcommands.Command. func (*Trace) Name() string { return "trace" } // Synopsis implements subcommands.Command. func (*Trace) Synopsis() string { return "manages trace sessions for a given sandbox" } // Usage implements subcommands.Command. func (*Trace) Usage() string { buf := bytes.Buffer{} buf.WriteString("Usage: trace \n\n") cdr := createCommander(&flag.FlagSet{}) cdr.VisitGroups(func(grp *subcommands.CommandGroup) { cdr.ExplainGroup(&buf, grp) }) return buf.String() } // SetFlags implements subcommands.Command. func (*Trace) SetFlags(f *flag.FlagSet) {} // Execute implements subcommands.Command. func (*Trace) Execute(ctx context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { seccheck.Initialize() return createCommander(f).Execute(ctx, args...) } func createCommander(f *flag.FlagSet) *subcommands.Commander { cdr := subcommands.NewCommander(f, "trace") cdr.Register(cdr.HelpCommand(), "") cdr.Register(cdr.FlagsCommand(), "") cdr.Register(new(create), "") cdr.Register(new(delete), "") cdr.Register(new(list), "") cdr.Register(new(metadata), "") cdr.Register(new(procfs), "") return cdr } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/trace/trace_state_autogen.go000066400000000000000000000000671465435605700262100ustar00rootroot00000000000000// automatically generated by stateify. package trace golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/umount_unsafe.go000066400000000000000000000042701465435605700237620ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "os" "syscall" "unsafe" "github.com/google/subcommands" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/flag" ) // Umount implements subcommands.Command for the "umount" command. type Umount struct { syncFD int } // Name implements subcommands.Command.Name. func (*Umount) Name() string { return "umount" } // Synopsis implements subcommands.Command.Synopsis. func (*Umount) Synopsis() string { return "umount the specified directory lazily when one byte is read from sync-fd" } // Usage implements subcommands.Command.Usage. func (*Umount) Usage() string { return `umount --sync-fd=FD ` } // SetFlags implements subcommands.Command.SetFlags. func (u *Umount) SetFlags(f *flag.FlagSet) { f.IntVar(&u.syncFD, "sync-fd", -1, "") } // Execute implements subcommands.Command.Execute. func (u *Umount) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() == 0 || f.NArg() > 1 { f.Usage() return subcommands.ExitUsageError } dirPath := f.Arg(0) syncFile := os.NewFile(uintptr(u.syncFD), "sync file") defer syncFile.Close() buf := make([]byte, 1) if l, err := syncFile.Read(buf); err != nil || l != 1 { util.Fatalf("unable to read from the sync descriptor: %v, error %v", l, err) } if _, _, errno := unix.RawSyscall( unix.SYS_UMOUNT2, uintptr(unsafe.Pointer(syscall.StringBytePtr(dirPath))), uintptr(linux.MNT_DETACH), 0); errno != 0 { util.Fatalf("Unable to umount %s: errno %v", dirPath, errno) } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/usage.go000066400000000000000000000052251465435605700221770ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "encoding/json" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // Usage implements subcommands.Command for the "usage" command. type Usage struct { full bool fd bool } // Name implements subcommands.Command.Name. func (*Usage) Name() string { return "usage" } // Synopsis implements subcommands.Command.Synopsis. func (*Usage) Synopsis() string { return "Usage shows application memory usage across various categories in bytes." } // Usage implements subcommands.Command.Usage. 
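//
// Illustrative invocations (the container id is a placeholder):
//
//	# runsc usage --full mycontainer
//	# runsc usage --fd mycontainer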
func (*Usage) Usage() string { return `usage [flags] - print memory usages to standard output. ` } // SetFlags implements subcommands.Command.SetFlags. func (u *Usage) SetFlags(f *flag.FlagSet) { f.BoolVar(&u.full, "full", false, "enumerate all usage by categories") f.BoolVar(&u.fd, "fd", false, "retrieves a subset of usage through the established usage FD") } // Execute implements subcommands.Command.Execute. func (u *Usage) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() < 1 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{SkipCheck: true}) if err != nil { util.Fatalf("loading container: %v", err) } if u.fd { m, err := cont.Sandbox.UsageFD() if err != nil { util.Fatalf("usagefd failed: %v", err) } mapped, unknown, total, err := m.Fetch() if err != nil { util.Fatalf("Fetch memory usage failed: %v", err) } util.Infof("Mapped %v, Unknown %v, Total %v\n", mapped, unknown, total) } else { m, err := cont.Sandbox.Usage(u.full) if err != nil { util.Fatalf("usage failed: %v", err) } encoder := json.NewEncoder(&util.Writer{}) encoder.SetIndent("", " ") if err := encoder.Encode(m); err != nil { util.Fatalf("Encode MemoryUsage failed: %v", err) } } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/util/000077500000000000000000000000001465435605700215155ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/util/tpu.go000066400000000000000000000070211465435605700226540ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package util import ( "fmt" "os" "regexp" "strconv" "strings" "syscall" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/tpu" ) const ( googleVendorID = 0x1AE0 accelDevicePathRegex = `^/dev/accel(\d+)$` accelSysfsFormat = "/sys/class/accel/accel%d/device/%s" vfioDevicePathRegex = `^/dev/vfio/(\d+)$` vfioSysfsFormat = "/sys/class/vfio-dev/vfio%d/device/%s" vendorFile = "vendor" deviceFile = "device" ) var tpuV4DeviceIDs = map[uint64]any{tpu.TPUV4DeviceID: nil, tpu.TPUV4liteDeviceID: nil} var tpuV5DeviceIDs = map[uint64]any{tpu.TPUV5eDeviceID: nil, tpu.TPUV5pDeviceID: nil} // ExtractTPUDeviceMinor returns the accelerator device minor number for that // the passed device path. If the passed device is not a valid TPU device, then // it returns false. func ExtractTPUDeviceMinor(path string) (uint32, bool, error) { devNum, valid, err := tpuV4DeviceMinor(path) if err != nil { return 0, false, err } if valid { return devNum, valid, err } return tpuV5DeviceMinor(path) } // tpuDeviceMinor returns the accelerator device minor number for that // the passed device path. If the passed device is not a valid TPU device, then // it returns false. 
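// For illustration only (the device node here is hypothetical, not from the source): with the accel sysfs format defined above, devicePath "/dev/accel0" whose device minor is 0 is validated by reading /sys/class/accel/accel0/device/vendor, which must contain googleVendorID (0x1ae0), and /sys/class/accel/accel0/device/device, which must contain an ID present in allowedDeviceIDs; only then is (0, true, nil) returned.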
func tpuDeviceMinor(devicePath, devicePathRegex, sysfsFormat string, allowedDeviceIDs map[uint64]any) (uint32, bool, error) { deviceRegex := regexp.MustCompile(devicePathRegex) matches := deviceRegex.FindStringSubmatch(devicePath) if matches == nil { return 0, false, nil } var st syscall.Stat_t if err := syscall.Stat(devicePath, &st); err != nil { return 0, false, err } minor := unix.Minor(st.Rdev) vendor, err := readHexInt(fmt.Sprintf(sysfsFormat, minor, vendorFile)) if err != nil { return 0, false, err } if vendor != googleVendorID { return 0, false, nil } deviceID, err := readHexInt(fmt.Sprintf(sysfsFormat, minor, deviceFile)) if err != nil { return 0, false, err } if _, ok := allowedDeviceIDs[deviceID]; !ok { return 0, false, nil } return minor, true, nil } // tpuv4DeviceMinor returns v4 and v4lite TPU device minor number for the given path. // A valid v4 TPU device is defined as: // * Path is /dev/accel#. // * Vendor is googleVendorID. // * Device ID is one of tpuV4DeviceIDs. func tpuV4DeviceMinor(path string) (uint32, bool, error) { return tpuDeviceMinor(path, accelDevicePathRegex, accelSysfsFormat, tpuV4DeviceIDs) } // tpuV5DeviceMinor returns the v5e TPU device minor number for te given path. // A valid v5 TPU device is defined as: // * Path is /dev/vfio/#. // * Vendor is googleVendorID. // * Device ID is one of tpuV5DeviceIDs. func tpuV5DeviceMinor(path string) (uint32, bool, error) { return tpuDeviceMinor(path, vfioDevicePathRegex, vfioSysfsFormat, tpuV5DeviceIDs) } func readHexInt(path string) (uint64, error) { data, err := os.ReadFile(path) if err != nil { return 0, err } numStr := strings.Trim(strings.TrimSpace(strings.TrimPrefix(string(data), "0x")), "\x00") return strconv.ParseUint(numStr, 16, 64) } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/util/util.go000066400000000000000000000046671465435605700230360ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package util groups a bunch of common helper functions used by commands. package util import ( "encoding/json" "fmt" "io" "os" "time" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/log" ) // ErrorLogger is where error messages should be written to. These messages are // consumed by containerd and show up to users of command line tools, // like docker/kubectl. var ErrorLogger io.Writer type jsonError struct { Msg string `json:"msg"` Level string `json:"level"` Time time.Time `json:"time"` } // Writer writes to log and stdout. type Writer struct{} // Write implements io.Writer. func (i *Writer) Write(data []byte) (n int, err error) { log.Infof("%s", data) return os.Stdout.Write(data) } // Infof writes message to log and stdout. func Infof(format string, args ...any) { log.Infof(format, args...) fmt.Printf(format+"\n", args...) } // Errorf logs error to containerd log (--log), to stderr, and debug logs. It // returns subcommands.ExitFailure for convenience with subcommand.Execute() // methods: // // return Errorf("Danger! 
Danger!") func Errorf(format string, args ...any) subcommands.ExitStatus { // If runsc is being invoked by docker or cri-o, then we might not have // access to stderr, so we log a serious-looking warning in addition to // writing to stderr. log.Warningf("FATAL ERROR: "+format, args...) fmt.Fprintf(os.Stderr, format+"\n", args...) j := jsonError{ Msg: fmt.Sprintf(format, args...), Level: "error", Time: time.Now(), } b, err := json.Marshal(j) if err != nil { panic(err) } if ErrorLogger != nil { _, _ = ErrorLogger.Write(b) } return subcommands.ExitFailure } // Fatalf logs the same way as Errorf() does, plus *exits* the process. func Fatalf(format string, args ...any) { Errorf(format, args...) // Return an error that is unlikely to be used by the application. os.Exit(128) } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/util/util_state_autogen.go000066400000000000000000000000661465435605700257450ustar00rootroot00000000000000// automatically generated by stateify. package util golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/wait.go000066400000000000000000000104301465435605700220310ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "encoding/json" "os" "github.com/google/subcommands" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) const ( unsetPID = -1 ) // Wait implements subcommands.Command for the "wait" command. type Wait struct { rootPID int pid int checkpoint uint } // Name implements subcommands.Command.Name. func (*Wait) Name() string { return "wait" } // Synopsis implements subcommands.Command.Synopsis. func (*Wait) Synopsis() string { return "wait on a process inside a container" } // Usage implements subcommands.Command.Usage. func (*Wait) Usage() string { return `wait [flags] ` } // SetFlags implements subcommands.Command.SetFlags. func (wt *Wait) SetFlags(f *flag.FlagSet) { f.IntVar(&wt.rootPID, "rootpid", unsetPID, "select a PID in the sandbox root PID namespace to wait on instead of the container's root process") f.IntVar(&wt.pid, "pid", unsetPID, "select a PID in the container's PID namespace to wait on instead of the container's root process") f.UintVar(&wt.checkpoint, "checkpoint", 0, "wait for (n-1)th checkpoint to complete successfully, then waits for the next checkpoint attempt and returns its status. When set to 0, it disables checkpoint waiting.") } // Execute implements subcommands.Command.Execute. It waits for a process in a // container to exit before returning. func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() != 1 { f.Usage() return subcommands.ExitUsageError } // You can't specify both -pid and -rootpid. 
if wt.rootPID != unsetPID && wt.pid != unsetPID { util.Fatalf("only one of -pid and -rootPid can be set") } id := f.Arg(0) conf := args[0].(*config.Config) c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{}) if err != nil { util.Fatalf("loading container: %v", err) } if wt.checkpoint > 0 { if wt.rootPID != unsetPID || wt.pid != unsetPID { log.Warningf("waiting for checkpoint to complete, ignoring -pid and -rootpid") } if err := c.WaitCheckpoint(uint32(wt.checkpoint)); err != nil { util.Fatalf("waiting for %d-th checkpoint to complete: %v", wt.checkpoint, err) } return subcommands.ExitSuccess } var waitStatus unix.WaitStatus switch { // Wait on the whole container. case wt.rootPID == unsetPID && wt.pid == unsetPID: ws, err := c.Wait() if err != nil { util.Fatalf("waiting on container %q: %v", c.ID, err) } waitStatus = ws // Wait on a PID in the root PID namespace. case wt.rootPID != unsetPID: ws, err := c.WaitRootPID(int32(wt.rootPID)) if err != nil { util.Fatalf("waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err) } waitStatus = ws // Wait on a PID in the container's PID namespace. case wt.pid != unsetPID: ws, err := c.WaitPID(int32(wt.pid)) if err != nil { util.Fatalf("waiting on PID %d in container %q: %v", wt.pid, c.ID, err) } waitStatus = ws } result := waitResult{ ID: id, ExitStatus: exitStatus(waitStatus), } // Write json-encoded wait result directly to stdout. if err := json.NewEncoder(os.Stdout).Encode(result); err != nil { util.Fatalf("marshaling wait result: %v", err) } return subcommands.ExitSuccess } type waitResult struct { ID string `json:"id"` ExitStatus int `json:"exitStatus"` } // exitStatus returns the correct exit status for a process based on if it // was signaled or exited cleanly. func exitStatus(status unix.WaitStatus) int { if status.Signaled() { return 128 + int(status.Signal()) } return status.ExitStatus() } golang-gvisor-gvisor-0.0~20240729.0/runsc/cmd/write_control.go000066400000000000000000000050401465435605700237600ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" "fmt" "github.com/google/subcommands" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/runsc/cmd/util" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/flag" ) // WriteControl implements subcommands.Command for the "write-control" command. type WriteControl struct{} // Name implements subcommands.Command.Name. func (*WriteControl) Name() string { return "write-control" } // Synopsis implements subcommands.Command.Synopsis. func (*WriteControl) Synopsis() string { return "write a cgroups control value inside the container" } // Usage implements subcommands.Command.Usage. 
func (*WriteControl) Usage() string { return `write-control Where "" is the name for the instance of the container, "" is the name of an active cgroupv1 controller, is the path to the cgroup to write and is the name of the control file to write. EXAMPLE: # runsc write-control memory / memory.limit_in_bytes 536870912 ` } // SetFlags implements subcommands.Command.SetFlags. func (r *WriteControl) SetFlags(f *flag.FlagSet) {} // Execute implements subcommands.Command.Execute. func (r *WriteControl) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { if f.NArg() < 5 { f.Usage() return subcommands.ExitUsageError } id := f.Arg(0) conf := args[0].(*config.Config) c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{SkipCheck: true}) if err != nil { util.Fatalf("loading sandbox: %v", err) } err = c.Sandbox.CgroupsWriteControlFile(control.CgroupControlFile{ Controller: f.Arg(1), Path: f.Arg(2), Name: f.Arg(3), }, f.Arg(4)) if err != nil { fmt.Printf("ERROR: %s\n", err) return subcommands.ExitFailure } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/runsc/config/000077500000000000000000000000001465435605700212425ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/config/config.go000066400000000000000000000777401465435605700230550ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package config provides basic infrastructure to set configuration settings // for runsc. The configuration is set by flags to the command line. They can // also propagate to a different process using the same flags. package config import ( "fmt" "path/filepath" "reflect" "runtime" "strconv" "strings" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/runsc/flag" "gvisor.dev/gvisor/runsc/version" ) // Config holds configuration that is not part of the runtime spec. // // Follow these steps to add a new flag: // 1. Create a new field in Config. // 2. Add a field tag with the flag name // 3. Register a new flag in flags.go, with same name and add a description // 4. Add any necessary validation into validate() // 5. If adding an enum, follow the same pattern as FileAccessType // 6. Evaluate if the flag can be changed with OCI annotations. See // overrideAllowlist for more details type Config struct { // RootDir is the runtime root directory. RootDir string `flag:"root"` // Traceback changes the Go runtime's traceback level. Traceback string `flag:"traceback"` // Debug indicates that debug logging should be enabled. Debug bool `flag:"debug"` // LogFilename is the filename to log to, if not empty. LogFilename string `flag:"log"` // LogFormat is the log format. LogFormat string `flag:"log-format"` // DebugLog is the path to log debug information to, if not empty. // If specified together with `DebugToUserLog`, debug logs are emitted // to both. 
DebugLog string `flag:"debug-log"` // DebugToUserLog indicates that Sentry debug logs should be emitted // to user-visible logs. // If specified together with `DebugLog`, debug logs are emitted // to both. DebugToUserLog bool `flag:"debug-to-user-log"` // DebugCommand is a comma-separated list of commands to be debugged if // --debug-log is also set. Empty means debug all. "!" negates the expression. // E.g. "create,start" or "!boot,events". DebugCommand string `flag:"debug-command"` // PanicLog is the path to log GO's runtime messages, if not empty. PanicLog string `flag:"panic-log"` // CoverageReport is the path to write Go coverage information, if not empty. CoverageReport string `flag:"coverage-report"` // DebugLogFormat is the log format for debug. DebugLogFormat string `flag:"debug-log-format"` // FileAccess indicates how the root filesystem is accessed. FileAccess FileAccessType `flag:"file-access"` // FileAccessMounts indicates how non-root volumes are accessed. FileAccessMounts FileAccessType `flag:"file-access-mounts"` // Overlay is whether to wrap all mounts in an overlay. The upper tmpfs layer // will be backed by application memory. Overlay bool `flag:"overlay"` // Overlay2 holds configuration about wrapping mounts in overlayfs. // DO NOT call it directly, use GetOverlay2() instead. Overlay2 Overlay2 `flag:"overlay2"` // FSGoferHostUDS is deprecated: use host-uds=all. FSGoferHostUDS bool `flag:"fsgofer-host-uds"` // HostUDS controls permission to access host Unix-domain sockets. // DO NOT call it directly, use GetHostUDS() instead. HostUDS HostUDS `flag:"host-uds"` // HostFifo controls permission to access host FIFO (or named pipes). HostFifo HostFifo `flag:"host-fifo"` // Network indicates what type of network to use. Network NetworkType `flag:"network"` // EnableRaw indicates whether raw sockets should be enabled. Raw // sockets are disabled by stripping CAP_NET_RAW from the list of // capabilities. EnableRaw bool `flag:"net-raw"` // AllowPacketEndpointWrite enables write operations on packet endpoints. AllowPacketEndpointWrite bool `flag:"TESTONLY-allow-packet-endpoint-write"` // HostGSO indicates that host segmentation offload is enabled. HostGSO bool `flag:"gso"` // GVisorGSO indicates that gVisor segmentation offload is enabled. The flag // retains its old name of "software" GSO for API consistency. GVisorGSO bool `flag:"software-gso"` // GVisorGRO enables gVisor's generic receive offload. GVisorGRO bool `flag:"gvisor-gro"` // TXChecksumOffload indicates that TX Checksum Offload is enabled. TXChecksumOffload bool `flag:"tx-checksum-offload"` // RXChecksumOffload indicates that RX Checksum Offload is enabled. RXChecksumOffload bool `flag:"rx-checksum-offload"` // QDisc indicates the type of queuening discipline to use by default // for non-loopback interfaces. QDisc QueueingDiscipline `flag:"qdisc"` // LogPackets indicates that all network packets should be logged. LogPackets bool `flag:"log-packets"` // PCAP is a file to which network packets should be logged in PCAP format. PCAP string `flag:"pcap-log"` // Platform is the platform to run on. Platform string `flag:"platform"` // PlatformDevicePath is the path to the device file used by the platform. // e.g. "/dev/kvm" for the KVM platform. // If unset, a sane platform-specific default will be used. PlatformDevicePath string `flag:"platform_device_path"` // MetricServer, if set, indicates that metrics should be exported on this address. 
// This may either be 1) "addr:port" to export metrics on a specific network interface address, // 2) ":port" for exporting metrics on all addresses, or 3) an absolute path to a Unix Domain // Socket. // The substring "%ID%" will be replaced by the container ID, and "%RUNTIME_ROOT%" by the root. // This flag must be specified *both* as part of the `runsc metric-server` arguments (so that the // metric server knows which address to bind to), and as part of the `runsc create` arguments (as // an indication that the container being created wishes that its metrics should be exported). // The value of this flag must also match across the two command lines. MetricServer string `flag:"metric-server"` // ProfilingMetrics is a comma separated list of metric names which are // going to be written to the ProfilingMetricsLog file from within the // sentry in CSV format. ProfilingMetrics will be snapshotted at a rate // specified by ProfilingMetricsRate. Requires ProfilingMetricsLog to be // set. ProfilingMetrics string `flag:"profiling-metrics"` // ProfilingMetricsLog is the file name to use for ProfilingMetrics // output. ProfilingMetricsLog string `flag:"profiling-metrics-log"` // ProfilingMetricsRate is the target rate (in microseconds) at which // profiling metrics will be snapshotted. ProfilingMetricsRate int `flag:"profiling-metrics-rate-us"` // Strace indicates that strace should be enabled. Strace bool `flag:"strace"` // StraceSyscalls is the set of syscalls to trace (comma-separated values). // If StraceEnable is true and this string is empty, then all syscalls will // be traced. StraceSyscalls string `flag:"strace-syscalls"` // StraceLogSize is the max size of data blobs to display. StraceLogSize uint `flag:"strace-log-size"` // StraceEvent indicates sending strace to events if true. Strace is // sent to log if false. StraceEvent bool `flag:"strace-event"` // DisableSeccomp indicates whether seccomp syscall filters should be // disabled. Pardon the double negation, but default to enabled is important. DisableSeccomp bool // EnableCoreTags indicates whether the Sentry process and children will be // run in a core tagged process. This isolates the sentry from sharing // physical cores with other core tagged processes. This is useful as a // mitigation for hyperthreading side channel based attacks. Requires host // linux kernel >= 5.14. EnableCoreTags bool `flag:"enable-core-tags"` // WatchdogAction sets what action the watchdog takes when triggered. WatchdogAction watchdog.Action `flag:"watchdog-action"` // PanicSignal registers signal handling that panics. Usually set to // SIGUSR2(12) to troubleshoot hangs. -1 disables it. PanicSignal int `flag:"panic-signal"` // ProfileEnable is set to prepare the sandbox to be profiled. ProfileEnable bool `flag:"profile"` // ProfileBlock collects a block profile to the passed file for the // duration of the container execution. Requires ProfileEnabled. ProfileBlock string `flag:"profile-block"` // ProfileCPU collects a CPU profile to the passed file for the // duration of the container execution. Requires ProfileEnabled. ProfileCPU string `flag:"profile-cpu"` // ProfileHeap collects a heap profile to the passed file for the // duration of the container execution. Requires ProfileEnabled. ProfileHeap string `flag:"profile-heap"` // ProfileMutex collects a mutex profile to the passed file for the // duration of the container execution. Requires ProfileEnabled. 
ProfileMutex string `flag:"profile-mutex"` // TraceFile collects a Go runtime execution trace to the passed file // for the duration of the container execution. TraceFile string `flag:"trace"` // NumNetworkChannels controls the number of AF_PACKET sockets that map // to the same underlying network device. This allows netstack to better // scale for high throughput use cases. NumNetworkChannels int `flag:"num-network-channels"` // NetworkProcessorsPerChannel controls the number of goroutines used to // handle packets on a single network channel. A higher number can help handle // many simultaneous connections. If this is 0, runsc will divide GOMAXPROCS // evenly among each network channel. NetworkProcessorsPerChannel int `flag:"network-processors-per-channel"` // Rootless allows the sandbox to be started with a user that is not root. // Defense in depth measures are weaker in rootless mode. Specifically, the // sandbox and Gofer process run as root inside a user namespace with root // mapped to the caller's user. When using rootless, the container root path // should not have a symlink. Rootless bool `flag:"rootless"` // AlsoLogToStderr allows to send log messages to stderr. AlsoLogToStderr bool `flag:"alsologtostderr"` // ReferenceLeakMode sets reference leak check mode ReferenceLeak refs.LeakMode `flag:"ref-leak-mode"` // CPUNumFromQuota sets CPU number count to available CPU quota, using // least integer value greater than or equal to quota. // // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2. CPUNumFromQuota bool `flag:"cpu-num-from-quota"` // Allows overriding of flags in OCI annotations. AllowFlagOverride bool `flag:"allow-flag-override"` // Enables seccomp inside the sandbox. OCISeccomp bool `flag:"oci-seccomp"` // Don't configure cgroups. IgnoreCgroups bool `flag:"ignore-cgroups"` // Use systemd to configure cgroups. SystemdCgroup bool `flag:"systemd-cgroup"` // PodInitConfig is the path to configuration file with additional steps to // take during pod creation. PodInitConfig string `flag:"pod-init-config"` // Use pools to manage buffer memory instead of heap. BufferPooling bool `flag:"buffer-pooling"` // XDP controls Whether and how to use XDP. XDP XDP `flag:"EXPERIMENTAL-xdp"` // AFXDPUseNeedWakeup determines whether XDP_USE_NEED_WAKEUP is set // when using AF_XDP sockets. AFXDPUseNeedWakeup bool `flag:"EXPERIMENTAL-xdp-need-wakeup"` // FDLimit specifies a limit on the number of host file descriptors that can // be open simultaneously by the sentry and gofer. It applies separately to // each. FDLimit int `flag:"fdlimit"` // DCache sets the global dirent cache size. If negative, per-mount caches are // used. DCache int `flag:"dcache"` // IOUring enables support for the IO_URING API calls to perform // asynchronous I/O operations. IOUring bool `flag:"iouring"` // DirectFS sets up the sandbox to directly access/mutate the filesystem from // the sentry. Sentry runs with escalated privileges. Gofer process still // exists, but is mostly idle. Not supported in rootless mode. DirectFS bool `flag:"directfs"` // AppHugePages enables support for application huge pages. AppHugePages bool `flag:"app-huge-pages"` // NVProxy enables support for Nvidia GPUs. NVProxy bool `flag:"nvproxy"` // NVProxyDocker is deprecated. Please use nvidia-container-runtime or // `docker run --gpus` directly. For backward compatibility, this has the // effect of injecting nvidia-container-runtime-hook as a prestart hook. 
NVProxyDocker bool `flag:"nvproxy-docker"` // NVProxyDriverVersion is the version of the NVIDIA driver ABI to use. // If empty, it is autodetected from the installed NVIDIA driver. // It can also be set to the special value "latest" to force the use of // the latest supported NVIDIA driver ABI. NVProxyDriverVersion string `flag:"nvproxy-driver-version"` // TPUProxy enables support for TPUs. TPUProxy bool `flag:"tpuproxy"` // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in // tests. It allows runsc to start the sandbox process as the current // user, and without chrooting the sandbox process. This can be // necessary in test environments that have limited capabilities. When // disabling chroot, the container root path should not have a symlink. TestOnlyAllowRunAsCurrentUserWithoutChroot bool `flag:"TESTONLY-unsafe-nonroot"` // TestOnlyTestNameEnv should only be used in tests. It looks up for the // test name in the container environment variables and adds it to the debug // log file name. This is done to help identify the log with the test when // multiple tests are run in parallel, since there is no way to pass // parameters to the runtime from docker. TestOnlyTestNameEnv string `flag:"TESTONLY-test-name-env"` // TestOnlyAFSSyscallPanic should only be used in tests. It enables the // alternate behaviour for afs_syscall to trigger a Go-runtime panic upon being // called. This is useful for tests exercising gVisor panic-reporting. TestOnlyAFSSyscallPanic bool `flag:"TESTONLY-afs-syscall-panic"` // explicitlySet contains whether a flag was explicitly set on the command-line from which this // Config was constructed. Nil when the Config was not initialized from a FlagSet. explicitlySet map[string]struct{} // ReproduceNAT, when true, tells runsc to scrape the host network // namespace's NAT iptables and reproduce it inside the sandbox. ReproduceNAT bool `flag:"reproduce-nat"` // ReproduceNftables attempts to scrape nftables routing rules if // present, and reproduce them in the sandbox. ReproduceNftables bool `flag:"reproduce-nftables"` // NetDisconnectOk indicates whether the link endpoint capability // CapabilityDisconnectOk should be set. This allows open connections to be // disconnected upon save. NetDisconnectOk bool `flag:"net-disconnect-ok"` // TestOnlyAutosaveImagePath if not empty enables auto save for syscall tests // and stores the directory path to the saved state file. TestOnlyAutosaveImagePath string `flag:"TESTONLY-autosave-image-path"` // TestOnlyAutosaveResume indicates save resume for syscall tests. TestOnlyAutosaveResume bool `flag:"TESTONLY-autosave-resume"` } func (c *Config) validate() error { if c.Overlay && c.Overlay2.Enabled() { // Deprecated flag was used together with flag that replaced it. return fmt.Errorf("overlay flag has been replaced with overlay2 flag") } if overlay2 := c.GetOverlay2(); c.FileAccess == FileAccessShared && overlay2.Enabled() { return fmt.Errorf("overlay flag is incompatible with shared file access for rootfs") } if c.NumNetworkChannels <= 0 { return fmt.Errorf("num_network_channels must be > 0, got: %d", c.NumNetworkChannels) } // Require profile flags to explicitly opt-in to profiling with // -profile rather than implying it since these options have security // implications. 
if c.ProfileBlock != "" && !c.ProfileEnable { return fmt.Errorf("profile-block flag requires enabling profiling with profile flag") } if c.ProfileCPU != "" && !c.ProfileEnable { return fmt.Errorf("profile-cpu flag requires enabling profiling with profile flag") } if c.ProfileHeap != "" && !c.ProfileEnable { return fmt.Errorf("profile-heap flag requires enabling profiling with profile flag") } if c.ProfileMutex != "" && !c.ProfileEnable { return fmt.Errorf("profile-mutex flag requires enabling profiling with profile flag") } if c.FSGoferHostUDS && c.HostUDS != HostUDSNone { // Deprecated flag was used together with flag that replaced it. return fmt.Errorf("fsgofer-host-uds has been replaced with host-uds flag") } if len(c.ProfilingMetrics) > 0 && len(c.ProfilingMetricsLog) == 0 { return fmt.Errorf("profiling-metrics flag requires defining a profiling-metrics-log for output") } return nil } // Log logs important aspects of the configuration to the given log function. func (c *Config) Log() { log.Infof("Platform: %v", c.Platform) log.Infof("RootDir: %s", c.RootDir) log.Infof("FileAccess: %v / Directfs: %t / Overlay: %v", c.FileAccess, c.DirectFS, c.GetOverlay2()) log.Infof("Network: %v", c.Network) if c.Debug || c.Strace { log.Infof("Debug: %t. Strace: %t, max size: %d, syscalls: %s", c.Debug, c.Strace, c.StraceLogSize, c.StraceSyscalls) } if c.Debug { obj := reflect.ValueOf(c).Elem() st := obj.Type() for i := 0; i < st.NumField(); i++ { f := st.Field(i) var val any if strVal := obj.Field(i).String(); strVal == "" { val = "(empty)" } else if !f.IsExported() { // Cannot convert to `interface{}` for non-exported fields, // so just use `strVal`. val = fmt.Sprintf("%s (unexported)", strVal) } else { val = obj.Field(i).Interface() } if flagName, hasFlag := f.Tag.Lookup("flag"); hasFlag { log.Debugf("Config.%s (--%s): %v", f.Name, flagName, val) } else { log.Debugf("Config.%s: %v", f.Name, val) } } } } // GetHostUDS returns the FS gofer communication that is allowed, taking into // consideration all flags what affect the result. func (c *Config) GetHostUDS() HostUDS { if c.FSGoferHostUDS { if c.HostUDS != HostUDSNone { panic(fmt.Sprintf("HostUDS cannot be set when --fsgofer-host-uds=true")) } // Using deprecated flag, honor it to avoid breaking users. return HostUDSOpen } return c.HostUDS } // GetOverlay2 returns the overlay configuration, taking into consideration all // flags that affect the result. func (c *Config) GetOverlay2() Overlay2 { if c.Overlay { if c.Overlay2.Enabled() { panic(fmt.Sprintf("Overlay2 cannot be set when --overlay=true")) } // Using a deprecated flag, honor it to avoid breaking users. return Overlay2{rootMount: true, subMounts: true, medium: "memory"} } return c.Overlay2 } // Bundle is a set of flag name-value pairs. type Bundle map[string]string // BundleName is a human-friendly name for a Bundle. // It is used as part of an annotation to specify that the user wants to apply a Bundle. type BundleName string // Validate validates that given flag string values map to actual flags in runsc. func (b Bundle) Validate() error { flagSet := flag.NewFlagSet("tmp", flag.ContinueOnError) RegisterFlags(flagSet) for key, val := range b { flag := flagSet.Lookup(key) if flag == nil { return fmt.Errorf("unknown flag %q", key) } if err := flagSet.Set(key, val); err != nil { return err } } return nil } // MetricMetadataKeys is the set of keys of metric metadata labels // as returned by `Config.MetricMetadata`. 
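// A purely illustrative label set as produced by MetricMetadata below (all values are hypothetical and depend on the build and flags): version="release-20240729.0", platform="systrap", network="sandbox", numcores="8", coretags="false", overlay="root:self", fsmode="directfs", cpuarch="amd64", go="go1.22.4", experiment="".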
var MetricMetadataKeys = []string{ "version", "platform", "network", "numcores", "coretags", "overlay", "fsmode", "cpuarch", "go", "experiment", } // MetricMetadata returns key-value pairs that are useful to include in metrics // exported about the sandbox this config represents. // It must return the same set of labels as listed in `MetricMetadataKeys`. func (c *Config) MetricMetadata() map[string]string { var fsMode = "goferfs" if c.DirectFS { fsMode = "directfs" } return map[string]string{ "version": version.Version(), "platform": c.Platform, "network": c.Network.String(), "numcores": strconv.Itoa(runtime.NumCPU()), "coretags": strconv.FormatBool(c.EnableCoreTags), "overlay": c.Overlay2.String(), "fsmode": fsMode, "cpuarch": runtime.GOARCH, "go": runtime.Version(), // The "experiment" label is currently unused, but may be used to contain // extra information about e.g. an experiment that may be enabled. "experiment": "", } } // FileAccessType tells how the filesystem is accessed. type FileAccessType int const ( // FileAccessExclusive gives the sandbox exclusive access over files and // directories in the filesystem. No external modifications are permitted and // can lead to undefined behavior. // // Exclusive filesystem access enables more aggressive caching and offers // significantly better performance. This is the default mode for the root // volume. FileAccessExclusive FileAccessType = iota // FileAccessShared is used for volumes that can have external changes. It // requires revalidation on every filesystem access to detect external // changes, and reduces the amount of caching that can be done. This is the // default mode for non-root volumes. FileAccessShared ) func fileAccessTypePtr(v FileAccessType) *FileAccessType { return &v } // Set implements flag.Value. Set(String()) should be idempotent. func (f *FileAccessType) Set(v string) error { switch v { case "shared": *f = FileAccessShared case "exclusive": *f = FileAccessExclusive default: return fmt.Errorf("invalid file access type %q", v) } return nil } // Get implements flag.Value. func (f *FileAccessType) Get() any { return *f } // String implements flag.Value. func (f FileAccessType) String() string { switch f { case FileAccessShared: return "shared" case FileAccessExclusive: return "exclusive" } panic(fmt.Sprintf("Invalid file access type %d", f)) } // NetworkType tells which network stack to use. type NetworkType int const ( // NetworkSandbox uses internal network stack, isolated from the host. NetworkSandbox NetworkType = iota // NetworkHost redirects network related syscalls to the host network. NetworkHost // NetworkNone sets up just loopback using netstack. NetworkNone ) func networkTypePtr(v NetworkType) *NetworkType { return &v } // Set implements flag.Value. Set(String()) should be idempotent. func (n *NetworkType) Set(v string) error { switch v { case "sandbox": *n = NetworkSandbox case "host": *n = NetworkHost case "none": *n = NetworkNone default: return fmt.Errorf("invalid network type %q", v) } return nil } // Get implements flag.Value. func (n *NetworkType) Get() any { return *n } // String implements flag.Value. func (n NetworkType) String() string { switch n { case NetworkSandbox: return "sandbox" case NetworkHost: return "host" case NetworkNone: return "none" } panic(fmt.Sprintf("Invalid network type %d", n)) } // QueueingDiscipline is used to specify the kind of Queueing Discipline to // apply for a give FDBasedLink. 
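// The values accepted by Set below are "none" and "fifo"; for example (illustrative), --qdisc=fifo selects QDiscFIFO, which is also the default registered in flags.go, while --qdisc=none disables queueing for the underlying FD.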
type QueueingDiscipline int const ( // QDiscNone disables any queueing for the underlying FD. QDiscNone QueueingDiscipline = iota // QDiscFIFO applies a simple fifo based queue to the underlying FD. QDiscFIFO ) func queueingDisciplinePtr(v QueueingDiscipline) *QueueingDiscipline { return &v } // Set implements flag.Value. Set(String()) should be idempotent. func (q *QueueingDiscipline) Set(v string) error { switch v { case "none": *q = QDiscNone case "fifo": *q = QDiscFIFO default: return fmt.Errorf("invalid qdisc %q", v) } return nil } // Get implements flag.Value. func (q *QueueingDiscipline) Get() any { return *q } // String implements flag.Value. func (q QueueingDiscipline) String() string { switch q { case QDiscNone: return "none" case QDiscFIFO: return "fifo" } panic(fmt.Sprintf("Invalid qdisc %d", q)) } func leakModePtr(v refs.LeakMode) *refs.LeakMode { return &v } func watchdogActionPtr(v watchdog.Action) *watchdog.Action { return &v } // HostUDS tells how much of the host UDS the file system has access to. type HostUDS int const ( // HostUDSNone doesn't allows UDS from the host to be manipulated. HostUDSNone HostUDS = 0x0 // HostUDSOpen allows UDS from the host to be opened, e.g. connect(2). HostUDSOpen HostUDS = 0x1 // HostUDSCreate allows UDS from the host to be created, e.g. bind(2). HostUDSCreate HostUDS = 0x2 // HostUDSAll allows all form of communication with the host through UDS. HostUDSAll = HostUDSOpen | HostUDSCreate ) func hostUDSPtr(v HostUDS) *HostUDS { return &v } // Set implements flag.Value. Set(String()) should be idempotent. func (g *HostUDS) Set(v string) error { switch v { case "", "none": *g = HostUDSNone case "open": *g = HostUDSOpen case "create": *g = HostUDSCreate case "all": *g = HostUDSAll default: return fmt.Errorf("invalid host UDS type %q", v) } return nil } // Get implements flag.Value. func (g *HostUDS) Get() any { return *g } // String implements flag.Value. func (g HostUDS) String() string { switch g { case HostUDSNone: return "none" case HostUDSOpen: return "open" case HostUDSCreate: return "create" case HostUDSAll: return "all" default: panic(fmt.Sprintf("Invalid host UDS type %d", g)) } } // AllowOpen returns true if it can consume UDS from the host. func (g HostUDS) AllowOpen() bool { return g&HostUDSOpen != 0 } // AllowCreate returns true if it can create UDS in the host. func (g HostUDS) AllowCreate() bool { return g&HostUDSCreate != 0 } // HostFifo tells how much of the host FIFO (or named pipes) the file system has // access to. type HostFifo int const ( // HostFifoNone doesn't allow FIFO from the host to be manipulated. HostFifoNone HostFifo = 0x0 // HostFifoOpen allows FIFOs from the host to be opened. HostFifoOpen HostFifo = 0x1 ) func hostFifoPtr(v HostFifo) *HostFifo { return &v } // Set implements flag.Value. Set(String()) should be idempotent. func (g *HostFifo) Set(v string) error { switch v { case "", "none": *g = HostFifoNone case "open": *g = HostFifoOpen default: return fmt.Errorf("invalid host fifo type %q", v) } return nil } // Get implements flag.Value. func (g *HostFifo) Get() any { return *g } // String implements flag.Value. func (g HostFifo) String() string { switch g { case HostFifoNone: return "none" case HostFifoOpen: return "open" default: panic(fmt.Sprintf("Invalid host fifo type %d", g)) } } // AllowOpen returns true if it can consume FIFOs from the host. func (g HostFifo) AllowOpen() bool { return g&HostFifoOpen != 0 } // OverlayMedium describes how overlay medium is configured. 
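// Accepted values, as parsed by Set below, are: "" (no overlay), "memory" (backed by application memory), "self" (backed by the overlaid mount itself), and "dir=" followed by an absolute host directory, e.g. the illustrative "dir=/tmp/overlay-store", in which the backing host file is created.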
type OverlayMedium string const ( // NoOverlay indicates that no overlay will be applied. NoOverlay = OverlayMedium("") // MemoryOverlay indicates that the overlay is backed by app memory. MemoryOverlay = OverlayMedium("memory") // SelfOverlay indicates that the overlaid mount is backed by itself. SelfOverlay = OverlayMedium("self") // AnonOverlayPrefix is the prefix that users should specify in the // config for the anonymous overlay. AnonOverlayPrefix = "dir=" ) // String returns a human-readable string representing the overlay medium config. func (m OverlayMedium) String() string { return string(m) } // Set sets the value. Set(String()) should be idempotent. func (m *OverlayMedium) Set(v string) error { switch OverlayMedium(v) { case NoOverlay, MemoryOverlay, SelfOverlay: // OK default: if !strings.HasPrefix(v, AnonOverlayPrefix) { return fmt.Errorf("unexpected medium: %q", v) } if hostFileDir := strings.TrimPrefix(v, AnonOverlayPrefix); !filepath.IsAbs(hostFileDir) { return fmt.Errorf("overlay host file directory should be an absolute path, got %q", hostFileDir) } } *m = OverlayMedium(v) return nil } // IsBackedByAnon indicates whether the overlaid mount is backed by a host file // in an anonymous directory. func (m OverlayMedium) IsBackedByAnon() bool { return strings.HasPrefix(string(m), AnonOverlayPrefix) } // HostFileDir indicates the directory in which the overlay-backing host file // should be created. // // Precondition: m.IsBackedByAnon(). func (m OverlayMedium) HostFileDir() string { if !m.IsBackedByAnon() { panic(fmt.Sprintf("anonymous overlay medium = %q does not have %v prefix", m, AnonOverlayPrefix)) } return strings.TrimPrefix(string(m), AnonOverlayPrefix) } // Overlay2 holds the configuration for setting up overlay filesystems for the // container. type Overlay2 struct { rootMount bool subMounts bool medium OverlayMedium } func defaultOverlay2() *Overlay2 { // Rootfs overlay is enabled by default and backed by a file in rootfs itself. return &Overlay2{rootMount: true, subMounts: false, medium: SelfOverlay} } // Set implements flag.Value. Set(String()) should be idempotent. func (o *Overlay2) Set(v string) error { if v == "none" { o.rootMount = false o.subMounts = false o.medium = NoOverlay return nil } vs := strings.Split(v, ":") if len(vs) != 2 { return fmt.Errorf("expected format is --overlay2={mount}:{medium}, got %q", v) } switch mount := vs[0]; mount { case "root": o.rootMount = true case "all": o.rootMount = true o.subMounts = true default: return fmt.Errorf("unexpected mount specifier for --overlay2: %q", mount) } return o.medium.Set(vs[1]) } // Get implements flag.Value. func (o *Overlay2) Get() any { return *o } // String implements flag.Value. func (o Overlay2) String() string { if !o.rootMount && !o.subMounts { return "none" } res := "" switch { case o.rootMount && o.subMounts: res = "all" case o.rootMount: res = "root" default: panic("invalid state of subMounts = true and rootMount = false") } return res + ":" + o.medium.String() } // Enabled returns true if the overlay option is enabled for any mounts. func (o *Overlay2) Enabled() bool { return o.medium != NoOverlay } // RootOverlayMedium returns the overlay medium config of the root mount. func (o *Overlay2) RootOverlayMedium() OverlayMedium { if !o.rootMount { return NoOverlay } return o.medium } // SubMountOverlayMedium returns the overlay medium config of submounts. 
func (o *Overlay2) SubMountOverlayMedium() OverlayMedium { if !o.subMounts { return NoOverlay } return o.medium } // XDP holds configuration for whether and how to use XDP. type XDP struct { Mode XDPMode IfaceName string } // XDPMode specifies a particular use of XDP. type XDPMode int const ( // XDPModeOff doesn't use XDP. XDPModeOff XDPMode = iota // XDPModeNS uses an AF_XDP socket to read from the VETH device inside // the container's network namespace. XDPModeNS // XDPModeRedirect uses an AF_XDP socket on the host NIC to bypass the // Linux network stack. XDPModeRedirect // XDPModeTunnel uses XDP_REDIRECT to redirect packets directy from the // host NIC to the VETH device inside the container's network // namespace. Packets are read from the VETH via AF_XDP, as in // XDPModeNS. XDPModeTunnel ) const ( xdpModeStrOff = "off" xdpModeStrNS = "ns" xdpModeStrRedirect = "redirect" xdpModeStrTunnel = "tunnel" ) var xdpConfig XDP // Get implements flag.Getter. func (xd *XDP) Get() any { return *xd } // String implements flag.Getter. func (xd *XDP) String() string { switch xd.Mode { case XDPModeOff: return xdpModeStrOff case XDPModeNS: return xdpModeStrNS case XDPModeRedirect: return fmt.Sprintf("%s:%s", xdpModeStrRedirect, xd.IfaceName) case XDPModeTunnel: return fmt.Sprintf("%s:%s", xdpModeStrTunnel, xd.IfaceName) default: panic(fmt.Sprintf("unknown mode %d", xd.Mode)) } } // Set implements flag.Getter. func (xd *XDP) Set(input string) error { parts := strings.Split(input, ":") if len(parts) > 2 { return fmt.Errorf("invalid --xdp value: %q", input) } switch { case input == xdpModeStrOff: xd.Mode = XDPModeOff xd.IfaceName = "" case input == xdpModeStrNS: xd.Mode = XDPModeNS xd.IfaceName = "" case len(parts) == 2 && parts[0] == xdpModeStrRedirect && parts[1] != "": xd.Mode = XDPModeRedirect xd.IfaceName = parts[1] case len(parts) == 2 && parts[0] == xdpModeStrTunnel && parts[1] != "": xd.Mode = XDPModeTunnel xd.IfaceName = parts[1] default: return fmt.Errorf("invalid --xdp value: %q", input) } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/config/config_bundles.go000066400000000000000000000021261465435605700245530ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package config // Bundles is the set of each Bundle. // Each bundle is a named set of flag names and flag values. // Bundles may be turned on using pod annotations. // Bundles have lower precedence than flag pod annotation and command-line flags. // Bundles are mutually exclusive iff their flag values overlap and differ. var Bundles = map[BundleName]Bundle{ "experimental-high-performance": { "directfs": "true", "overlay2": "root:self", "platform": "systrap", }, } golang-gvisor-gvisor-0.0~20240729.0/runsc/config/config_state_autogen.go000066400000000000000000000001341465435605700257560ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build !false // +build !false package config golang-gvisor-gvisor-0.0~20240729.0/runsc/config/flags.go000066400000000000000000000572311465435605700226750ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "bytes" "fmt" "os" "path/filepath" "reflect" "sort" "strconv" "strings" "text/template" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/runsc/flag" ) // RegisterFlags registers flags used to populate Config. func RegisterFlags(flagSet *flag.FlagSet) { // Although these flags are not part of the OCI spec, they are used by // Docker, and thus should not be changed. flagSet.String("root", "", "root directory for storage of container state.") flagSet.String("log", "", "file path where internal debug information is written, default is stdout.") flagSet.String("log-format", "text", "log format: text (default), json, or json-k8s.") flagSet.Bool("debug", false, "enable debug logging.") flagSet.Bool("systemd-cgroup", false, "EXPERIMENTAL. Use systemd for cgroups.") // These flags are unique to runsc, and are used to configure parts of the // system that are not covered by the runtime spec. // Debugging flags. flagSet.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.") flagSet.String("debug-command", "", `comma-separated list of commands to be debugged if --debug-log is also set. Empty means debug all. "!" negates the expression. E.g. "create,start" or "!boot,events"`) flagSet.String("panic-log", "", "file path where panic reports and other Go's runtime messages are written.") flagSet.String("coverage-report", "", "file path where Go coverage reports are written. Reports will only be generated if runsc is built with --collect_code_coverage and --instrumentation_filter Bazel flags.") flagSet.Bool("log-packets", false, "enable network packet logging.") flagSet.String("pcap-log", "", "location of PCAP log file.") flagSet.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.") flagSet.Bool("debug-to-user-log", false, "also emit Sentry logs to user-visible logs") // Only register -alsologtostderr flag if it is not already defined on this flagSet. if flagSet.Lookup("alsologtostderr") == nil { flagSet.Bool("alsologtostderr", false, "send log messages to stderr.") } flagSet.Bool("allow-flag-override", false, "allow OCI annotations (dev.gvisor.flag.) to override flags for debugging.") flagSet.String("traceback", "system", "golang runtime's traceback level") // Metrics flags. flagSet.String("metric-server", "", "if set, export metrics on this address. This may either be 1) 'addr:port' to export metrics on a specific network interface address, 2) ':port' for exporting metrics on all interfaces, or 3) an absolute path to a Unix Domain Socket. 
The substring '%ID%' will be replaced by the container ID, and '%RUNTIME_ROOT%' by the root. This flag must be specified in both `runsc metric-server` and `runsc create`, and their values must match.") flagSet.String("profiling-metrics", "", "comma separated list of metric names which are going to be written to the profiling-metrics-log file from within the sentry in CSV format. profiling-metrics will be snapshotted at a rate specified by profiling-metrics-rate-us. Requires profiling-metrics-log to be set. (DO NOT USE IN PRODUCTION).") flagSet.String("profiling-metrics-log", "", "file name to use for profiling-metrics output; use the special value '-' to write to the user-visible logs. (DO NOT USE IN PRODUCTION)") flagSet.Int("profiling-metrics-rate-us", 1000, "the target rate (in microseconds) at which profiling metrics will be snapshotted.") // Debugging flags: strace related flagSet.Bool("strace", false, "enable strace.") flagSet.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.") flagSet.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs.") flagSet.Bool("strace-event", false, "send strace to event.") // Flags that control sandbox runtime behavior. flagSet.String("platform", "systrap", "specifies which platform to use: systrap (default), ptrace, kvm.") flagSet.String("platform_device_path", "", "path to a platform-specific device file (e.g. /dev/kvm for KVM platform). If unset, will use a sane platform-specific default.") flagSet.Var(watchdogActionPtr(watchdog.LogWarning), "watchdog-action", "sets what action the watchdog takes when triggered: log (default), panic.") flagSet.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") flagSet.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") flagSet.String("profile-block", "", "collects a block profile to this file path for the duration of the container execution. Requires -profile=true.") flagSet.String("profile-cpu", "", "collects a CPU profile to this file path for the duration of the container execution. Requires -profile=true.") flagSet.String("profile-heap", "", "collects a heap profile to this file path for the duration of the container execution. Requires -profile=true.") flagSet.String("profile-mutex", "", "collects a mutex profile to this file path for the duration of the container execution. Requires -profile=true.") flagSet.String("trace", "", "collects a Go runtime execution trace to this file path for the duration of the container execution.") flagSet.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") flagSet.Var(leakModePtr(refs.NoLeakChecking), "ref-leak-mode", "sets reference leak check mode: disabled (default), log-names, log-traces.") flagSet.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") flagSet.Bool("oci-seccomp", false, "Enables loading OCI seccomp filters inside the sandbox.") flagSet.Bool("enable-core-tags", false, "enables core tagging. 
Requires host linux kernel >= 5.14.") flagSet.String("pod-init-config", "", "path to configuration file with additional steps to take during pod creation.") // Flags that control sandbox runtime behavior: MM related. flagSet.Bool("app-huge-pages", true, "enable use of huge pages for application memory; requires /sys/kernel/mm/transparent_hugepage/shmem_enabled = advise") // Flags that control sandbox runtime behavior: FS related. flagSet.Var(fileAccessTypePtr(FileAccessExclusive), "file-access", "specifies which filesystem validation to use for the root mount: exclusive (default), shared.") flagSet.Var(fileAccessTypePtr(FileAccessShared), "file-access-mounts", "specifies which filesystem validation to use for volumes other than the root mount: shared (default), exclusive.") flagSet.Bool("overlay", false, "DEPRECATED: use --overlay2=all:memory to achieve the same effect") flagSet.Var(defaultOverlay2(), "overlay2", "wrap mounts with overlayfs. Format is {mount}:{medium}, where 'mount' can be 'root' or 'all' and medium can be 'memory', 'self' or 'dir=/abs/dir/path' in which filestore will be created. 'none' will turn overlay mode off.") flagSet.Bool("fsgofer-host-uds", false, "DEPRECATED: use host-uds=all") flagSet.Var(hostUDSPtr(HostUDSNone), "host-uds", "controls permission to access host Unix-domain sockets. Values: none|open|create|all, default: none") flagSet.Var(hostFifoPtr(HostFifoNone), "host-fifo", "controls permission to access host FIFOs (or named pipes). Values: none|open, default: none") flagSet.Bool("vfs2", true, "DEPRECATED: this flag has no effect.") flagSet.Bool("fuse", true, "DEPRECATED: this flag has no effect.") flagSet.Bool("lisafs", true, "DEPRECATED: this flag has no effect.") flagSet.Bool("cgroupfs", false, "Automatically mount cgroupfs.") flagSet.Bool("ignore-cgroups", false, "don't configure cgroups.") flagSet.Int("fdlimit", -1, "Specifies a limit on the number of host file descriptors that can be open. Applies separately to the sentry and gofer. Note: each file in the sandbox holds more than one host FD open.") flagSet.Int("dcache", -1, "Set the global dentry cache size. This acts as a coarse-grained control on the number of host FDs simultaneously open by the sentry. If negative, per-mount caches are used.") flagSet.Bool("iouring", false, "TEST ONLY; Enables io_uring syscalls in the sentry. Support is experimental and very limited.") flagSet.Bool("directfs", true, "directly access the container filesystems from the sentry. Sentry runs with higher privileges.") // Flags that control sandbox runtime behavior: network related. flagSet.Var(networkTypePtr(NetworkSandbox), "network", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") flagSet.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). 
Raw sockets allow malicious containers to craft packets and potentially attack the network.") flagSet.Bool("gso", true, "enable host segmentation offload if it is supported by a network device.") flagSet.Bool("software-gso", true, "enable gVisor segmentation offload when host offload can't be enabled.") flagSet.Bool("gvisor-gro", false, "enable gVisor generic receive offload") flagSet.Bool("tx-checksum-offload", false, "enable TX checksum offload.") flagSet.Bool("rx-checksum-offload", true, "enable RX checksum offload.") flagSet.Var(queueingDisciplinePtr(QDiscFIFO), "qdisc", "specifies which queueing discipline to apply by default to the non loopback nics used by the sandbox.") flagSet.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.") flagSet.Int("network-processors-per-channel", 0, "number of goroutines in each channel for processng inbound packets. If 0, the link endpoint will divide GOMAXPROCS evenly among the number of channels specified by num-network-channels.") flagSet.Bool("buffer-pooling", true, "DEPRECATED: this flag has no effect. Buffer pooling is always enabled.") flagSet.Var(&xdpConfig, "EXPERIMENTAL-xdp", `whether and how to use XDP. Can be one of: "off" (default), "ns", "redirect:", or "tunnel:"`) flagSet.Bool("EXPERIMENTAL-xdp-need-wakeup", true, "EXPERIMENTAL. Use XDP_USE_NEED_WAKEUP with XDP sockets.") // TODO(b/240191988): Figure out whether this helps and remove it as a flag. flagSet.Bool("reproduce-nat", false, "Scrape the host netns NAT table and reproduce it in the sandbox.") flagSet.Bool("reproduce-nftables", false, "Attempt to scrape and reproduce nftable rules inside the sandbox. Overrides reproduce-nat when true.") flagSet.Bool("net-disconnect-ok", false, "Indicates whether the link endpoint capability CapabilityDisconnectOk should be set. This allows open connections to be disconnected upon save.") // Flags that control sandbox runtime behavior: accelerator related. flagSet.Bool("nvproxy", false, "EXPERIMENTAL: enable support for Nvidia GPUs") flagSet.Bool("nvproxy-docker", false, "DEPRECATED: use nvidia-container-runtime or `docker run --gpus` directly. Or manually add nvidia-container-runtime-hook as a prestart hook and set up NVIDIA_VISIBLE_DEVICES container environment variable.") flagSet.String("nvproxy-driver-version", "", "NVIDIA driver ABI version to use. If empty, autodetect installed driver version. The special value 'latest' may also be used to use the latest ABI.") flagSet.Bool("tpuproxy", false, "EXPERIMENTAL: enable support for TPU device passthrough.") // Test flags, not to be used outside tests, ever. flagSet.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") flagSet.String("TESTONLY-test-name-env", "", "TEST ONLY; do not ever use! Used for automated tests to improve logging.") flagSet.Bool("TESTONLY-allow-packet-endpoint-write", false, "TEST ONLY; do not ever use! Used for tests to allow writes on packet sockets.") flagSet.Bool("TESTONLY-afs-syscall-panic", false, "TEST ONLY; do not ever use! 
Used for tests exercising gVisor panic reporting.") flagSet.String("TESTONLY-autosave-image-path", "", "TEST ONLY; enable auto save for syscall tests and set path for state file.") flagSet.Bool("TESTONLY-autosave-resume", false, "TEST ONLY; enable auto save and resume for syscall tests and set path for state file.") } // overrideAllowlist lists all flags that can be changed using OCI // annotations without an administrator setting `--allow-flag-override` on the // runtime. Flags in this list can be set by container authors and should not // make the sandbox less secure. var overrideAllowlist = map[string]struct { check func(name string, value string) error }{ "debug": {}, "debug-to-user-log": {}, "strace": {}, "strace-syscalls": {}, "strace-log-size": {}, "host-uds": {}, "net-disconnect-ok": {}, "oci-seccomp": {check: checkOciSeccomp}, } // checkOciSeccomp ensures that seccomp can be enabled but not disabled. func checkOciSeccomp(name string, value string) error { enable, err := strconv.ParseBool(value) if err != nil { return err } if !enable { return fmt.Errorf("disabling %q requires flag %q to be enabled", name, "allow-flag-override") } return nil } // isFlagExplicitlySet returns whether the given flag name is explicitly set. // Doesn't check for flag existence; returns `false` for flags that don't exist. func isFlagExplicitlySet(flagSet *flag.FlagSet, name string) bool { explicit := false // The FlagSet.Visit function only visits flags that are explicitly set, as opposed to VisitAll. flagSet.Visit(func(fl *flag.Flag) { explicit = explicit || fl.Name == name }) return explicit } // NewFromFlags creates a new Config with values coming from command line flags. func NewFromFlags(flagSet *flag.FlagSet) (*Config, error) { conf := &Config{explicitlySet: map[string]struct{}{}} obj := reflect.ValueOf(conf).Elem() st := obj.Type() for i := 0; i < st.NumField(); i++ { f := st.Field(i) name, ok := f.Tag.Lookup("flag") if !ok { // No flag set for this field. continue } fl := flagSet.Lookup(name) if fl == nil { panic(fmt.Sprintf("Flag %q not found", name)) } x := reflect.ValueOf(flag.Get(fl.Value)) obj.Field(i).Set(x) if isFlagExplicitlySet(flagSet, name) { conf.explicitlySet[name] = struct{}{} } } if len(conf.RootDir) == 0 { // If not set, set default root dir to something (hopefully) user-writeable. conf.RootDir = "/var/run/runsc" // NOTE: empty values for XDG_RUNTIME_DIR should be ignored. if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" { conf.RootDir = filepath.Join(runtimeDir, "runsc") } } if err := conf.validate(); err != nil { return nil, err } return conf, nil } // NewFromBundle makes a new config from a Bundle. func NewFromBundle(bundle Bundle) (*Config, error) { if err := bundle.Validate(); err != nil { return nil, err } flagSet := flag.NewFlagSet("tmp", flag.ContinueOnError) RegisterFlags(flagSet) conf := &Config{explicitlySet: map[string]struct{}{}} obj := reflect.ValueOf(conf).Elem() st := obj.Type() for i := 0; i < st.NumField(); i++ { f := st.Field(i) name, ok := f.Tag.Lookup("flag") if !ok { continue } fl := flagSet.Lookup(name) if fl == nil { return nil, fmt.Errorf("flag %q not found", name) } val, ok := bundle[name] if !ok { continue } if err := flagSet.Set(name, val); err != nil { return nil, fmt.Errorf("error setting flag %s=%q: %w", name, val, err) } conf.Override(flagSet, name, val, true) conf.explicitlySet[name] = struct{}{} } return conf, nil } // ToFlags returns a slice of flags that correspond to the given Config. 
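// Editor's note: the reflection-based plumbing used by NewFromFlags above
// (struct fields tagged `flag:"name"` filled in from a flag.FlagSet) can be
// hard to follow in isolation. The following is a minimal, self-contained
// sketch of the same pattern and is not part of the gVisor source. It uses
// the standard library "flag" package and its flag.Getter interface in place
// of gVisor's flag wrapper; the package and identifier names are hypothetical.

package flagplumbingexample

import (
	"flag"
	"fmt"
	"reflect"
)

// miniConfig mirrors the Config pattern above: each field names the flag
// that populates it via a struct tag.
type miniConfig struct {
	Debug   bool   `flag:"debug"`
	RootDir string `flag:"root"`
}

// newFromFlags copies parsed flag values into the tagged struct fields.
func newFromFlags(fs *flag.FlagSet) (*miniConfig, error) {
	c := &miniConfig{}
	obj := reflect.ValueOf(c).Elem()
	st := obj.Type()
	for i := 0; i < st.NumField(); i++ {
		name, ok := st.Field(i).Tag.Lookup("flag")
		if !ok {
			continue // Field not backed by a flag.
		}
		fl := fs.Lookup(name)
		if fl == nil {
			return nil, fmt.Errorf("flag %q not registered", name)
		}
		// Standard-library flag values implement flag.Getter, which exposes
		// the typed value, much like flag.Get does in the code above.
		getter, ok := fl.Value.(flag.Getter)
		if !ok {
			return nil, fmt.Errorf("flag %q has no typed getter", name)
		}
		obj.Field(i).Set(reflect.ValueOf(getter.Get()))
	}
	return c, nil
}

// Example usage (error handling elided):
//
//	fs := flag.NewFlagSet("example", flag.ContinueOnError)
//	fs.Bool("debug", false, "enable debugging")
//	fs.String("root", "/var/run/example", "root directory")
//	_ = fs.Parse([]string{"--debug=true"})
//	conf, _ := newFromFlags(fs) // conf.Debug == true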
func (c *Config) ToFlags() []string { flagSet := flag.NewFlagSet("tmp", flag.ContinueOnError) RegisterFlags(flagSet) var rv []string keyVals := c.keyVals(flagSet, false /*onlyIfSet*/) for name, val := range keyVals { rv = append(rv, fmt.Sprintf("--%s=%s", name, val)) } // Construct a temporary set for default plumbing. return rv } // KeyVal is a key value pair. It is used so ToContainerdConfigTOML returns // predictable ordering for runsc flags. type KeyVal struct { Key string Val string } // ContainerdConfigOptions contains arguments for ToContainerdConfigTOML. type ContainerdConfigOptions struct { BinaryPath string RootPath string Options map[string]string RunscFlags []KeyVal } // ToContainerdConfigTOML turns a given config into a format for a k8s containerd config.toml file. // See: https://gvisor.dev/docs/user_guide/containerd/quick_start/ func (c *Config) ToContainerdConfigTOML(opts ContainerdConfigOptions) (string, error) { flagSet := flag.NewFlagSet("tmp", flag.ContinueOnError) RegisterFlags(flagSet) keyVals := c.keyVals(flagSet, true /*onlyIfSet*/) keys := []string{} for k := range keyVals { keys = append(keys, k) } sort.Strings(keys) for _, k := range keys { opts.RunscFlags = append(opts.RunscFlags, KeyVal{k, keyVals[k]}) } const temp = `{{if .BinaryPath}}binary_name = "{{.BinaryPath}}"{{end}} {{if .RootPath}}root = "{{.RootPath}}"{{end}} {{if .Options}}{{ range $key, $value := .Options}}{{$key}} = "{{$value}}" {{end}}{{end}}{{if .RunscFlags}}[runsc_config] {{ range $fl:= .RunscFlags}} {{$fl.Key}} = "{{$fl.Val}}" {{end}}{{end}}` t := template.New("temp") t, err := t.Parse(temp) if err != nil { return "", err } var buf bytes.Buffer if err := t.Execute(&buf, opts); err != nil { return "", err } return buf.String(), nil } func (c *Config) keyVals(flagSet *flag.FlagSet, onlyIfSet bool) map[string]string { keyVals := make(map[string]string) obj := reflect.ValueOf(c).Elem() st := obj.Type() for i := 0; i < st.NumField(); i++ { f := st.Field(i) name, ok := f.Tag.Lookup("flag") if !ok { // No flag set for this field. continue } val := getVal(obj.Field(i)) fl := flagSet.Lookup(name) if fl == nil { panic(fmt.Sprintf("Flag %q not found", name)) } if val == fl.DefValue || onlyIfSet { // If this config wasn't populated from a FlagSet, don't plumb through default flags. if c.explicitlySet == nil { continue } // If this config was populated from a FlagSet, plumb through only default flags which were // explicitly specified. if _, explicit := c.explicitlySet[name]; !explicit { continue } } keyVals[fl.Name] = val } return keyVals } // Override writes a new value to a flag. func (c *Config) Override(flagSet *flag.FlagSet, name string, value string, force bool) error { obj := reflect.ValueOf(c).Elem() st := obj.Type() for i := 0; i < st.NumField(); i++ { f := st.Field(i) fieldName, ok := f.Tag.Lookup("flag") if !ok || fieldName != name { // Not a flag field, or flag name doesn't match. continue } fl := flagSet.Lookup(name) if fl == nil { // Flag must exist if there is a field match above. panic(fmt.Sprintf("Flag %q not found", name)) } if !force { if err := c.isOverrideAllowed(name, value); err != nil { return fmt.Errorf("error setting flag %s=%q: %w", name, value, err) } } // Use flag to convert the string value to the underlying flag type, using // the same rules as the command-line for consistency. 
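// Editor's note on ToContainerdConfigTOML above: the template it renders is
// the snippet dropped into containerd's config.toml for the runsc runtime
// handler. With hypothetical options (BinaryPath "/usr/local/bin/runsc",
// RootPath "/run/containerd/runsc", and the flags debug=true, platform=kvm
// explicitly set), the output looks roughly like:
//
//	binary_name = "/usr/local/bin/runsc"
//	root = "/run/containerd/runsc"
//	[runsc_config]
//	  debug = "true"
//	  platform = "kvm"
//
// Exact whitespace depends on the template; only explicitly-set flags are
// emitted because keyVals is called with onlyIfSet=true.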
if err := fl.Value.Set(value); err != nil { return fmt.Errorf("error setting flag %s=%q: %w", name, value, err) } x := reflect.ValueOf(flag.Get(fl.Value)) obj.Field(i).Set(x) // Validates the config again to ensure it's left in a consistent state. return c.validate() } return fmt.Errorf("flag %q not found. Cannot set it to %q", name, value) } func (c *Config) isOverrideAllowed(name string, value string) error { if c.AllowFlagOverride { return nil } // If the global override flag is not enabled, check if the individual flag is // safe to apply. if allow, ok := overrideAllowlist[name]; ok { if allow.check != nil { if err := allow.check(name, value); err != nil { return err } } return nil } return fmt.Errorf("flag override disabled, use --allow-flag-override to enable it") } // ApplyBundles applies the given bundles by name. // It returns an error if a bundle doesn't exist, or if the given // bundles have conflicting flag values. // Config values which are already specified prior to calling ApplyBundles are overridden. func (c *Config) ApplyBundles(flagSet *flag.FlagSet, bundleNames ...BundleName) error { // Populate a map from flag name to flag value to bundle name. flagToValueToBundleName := make(map[string]map[string]BundleName) for _, bundleName := range bundleNames { b := Bundles[bundleName] if b == nil { return fmt.Errorf("no such bundle: %q", bundleName) } for flagName, val := range b { valueToBundleName := flagToValueToBundleName[flagName] if valueToBundleName == nil { valueToBundleName = make(map[string]BundleName) flagToValueToBundleName[flagName] = valueToBundleName } valueToBundleName[val] = bundleName } } // Check for conflicting flag values between the bundles. for flagName, valueToBundleName := range flagToValueToBundleName { if len(valueToBundleName) == 1 { continue } bundleNameToValue := make(map[string]string) for val, bundleName := range valueToBundleName { bundleNameToValue[string(bundleName)] = val } var sb strings.Builder first := true for _, bundleName := range bundleNames { if val, ok := bundleNameToValue[string(bundleName)]; ok { if !first { sb.WriteString(", ") } sb.WriteString(fmt.Sprintf("bundle %q sets --%s=%q", bundleName, flagName, val)) first = false } } return fmt.Errorf("flag --%s is specified by multiple bundles: %s", flagName, sb.String()) } // Actually apply flag values. for flagName, valueToBundleName := range flagToValueToBundleName { fl := flagSet.Lookup(flagName) if fl == nil { return fmt.Errorf("flag --%s not found", flagName) } prevValue := fl.Value.String() // Note: We verified earlier that valueToBundleName has length 1, // so this loop executes exactly once per flag. 
for val, bundleName := range valueToBundleName { if prevValue == val { continue } if isFlagExplicitlySet(flagSet, flagName) { log.Infof("Flag --%s has explicitly-set value %q, but bundle %s takes precedence and is overriding its value to --%s=%q.", flagName, prevValue, bundleName, flagName, val) } else { log.Infof("Overriding flag --%s=%q from applying bundle %s.", flagName, val, bundleName) } if err := c.Override(flagSet, flagName, val /* force= */, true); err != nil { return err } } } return c.validate() } func getVal(field reflect.Value) string { if str, ok := field.Addr().Interface().(fmt.Stringer); ok { return str.String() } switch field.Kind() { case reflect.Bool: return strconv.FormatBool(field.Bool()) case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: return strconv.FormatInt(field.Int(), 10) case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: return strconv.FormatUint(field.Uint(), 10) case reflect.String: return field.String() default: panic("unknown type " + field.Kind().String()) } } golang-gvisor-gvisor-0.0~20240729.0/runsc/console/000077500000000000000000000000001465435605700214375ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/console/console.go000066400000000000000000000036361465435605700234400ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package console contains utilities for working with pty consols in runsc. package console import ( "fmt" "net" "os" "github.com/kr/pty" "golang.org/x/sys/unix" ) // NewWithSocket creates pty master/replica pair, sends the master FD over the // given socket, and returns the replica. func NewWithSocket(socketPath string) (*os.File, error) { // Create a new pty master and replica. ptyMaster, ptyReplica, err := pty.Open() if err != nil { return nil, fmt.Errorf("opening pty: %v", err) } defer ptyMaster.Close() // Get a connection to the socket path. conn, err := net.Dial("unix", socketPath) if err != nil { ptyReplica.Close() return nil, fmt.Errorf("dialing socket %q: %v", socketPath, err) } defer conn.Close() uc, ok := conn.(*net.UnixConn) if !ok { ptyReplica.Close() return nil, fmt.Errorf("connection is not a UnixConn: %T", conn) } socket, err := uc.File() if err != nil { ptyReplica.Close() return nil, fmt.Errorf("getting file for unix socket %v: %v", uc, err) } defer socket.Close() // Send the master FD over the connection. msg := unix.UnixRights(int(ptyMaster.Fd())) if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil { ptyReplica.Close() return nil, fmt.Errorf("sending console over unix socket %q: %v", socketPath, err) } return ptyReplica, nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/console/console_linux_state_autogen.go000066400000000000000000000001331465435605700275660ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build linux // +build linux package console golang-gvisor-gvisor-0.0~20240729.0/runsc/console/console_state_autogen.go000066400000000000000000000000711465435605700263500ustar00rootroot00000000000000// automatically generated by stateify. package console golang-gvisor-gvisor-0.0~20240729.0/runsc/console/pty_linux.go000066400000000000000000000014671465435605700240310ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build linux // +build linux package console import "golang.org/x/sys/unix" // IsPty returns true if FD is a PTY. func IsPty(fd uintptr) bool { _, err := unix.IoctlGetTermios(int(fd), unix.TCGETS) return err == nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/container/000077500000000000000000000000001465435605700217575ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/container/container.go000066400000000000000000002154501465435605700242770ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package container creates and manipulates containers. package container import ( "bufio" "context" "errors" "fmt" "io/ioutil" "os" "os/exec" "path" "regexp" "strconv" "strings" "syscall" "time" "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sighandling" "gvisor.dev/gvisor/pkg/state/statefile" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/cgroup" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/console" "gvisor.dev/gvisor/runsc/donation" "gvisor.dev/gvisor/runsc/profile" "gvisor.dev/gvisor/runsc/sandbox" "gvisor.dev/gvisor/runsc/specutils" "gvisor.dev/gvisor/runsc/starttime" ) const cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent" // validateID validates the container id. func validateID(id string) error { // See libcontainer/factory_linux.go. idRegex := regexp.MustCompile(`^[\w+\.-]+$`) if !idRegex.MatchString(id) { return fmt.Errorf("invalid container id: %v", id) } return nil } // Container represents a containerized application. When running, the // container is associated with a single Sandbox. // // Container metadata can be saved and loaded to disk. 
Within a root directory, // we maintain subdirectories for each container named with the container id. // The container metadata is stored as a json within the container directory // in a file named "meta.json". This metadata format is defined by us and is // not part of the OCI spec. // // Containers must write their metadata files after any change to their internal // states. The entire container directory is deleted when the container is // destroyed. // // When the container is stopped, all processes that belong to the container // must be stopped before Destroy() returns. containerd makes roughly the // following calls to stop a container: // - First it attempts to kill the container process with // 'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a // separate thread, it's waiting on the container. As soon as the wait // returns, it moves on to the next step: // - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to // the container. 'kill --all SIGKILL' waits for all processes before // returning. // - Containerd waits for stdin, stdout and stderr to drain and be closed. // - It calls 'runsc delete'. runc implementation kills --all SIGKILL once // again just to be sure, waits, and then proceeds with remaining teardown. // // Container is thread-unsafe. type Container struct { // ID is the container ID. ID string `json:"id"` // Spec is the OCI runtime spec that configures this container. Spec *specs.Spec `json:"spec"` // BundleDir is the directory containing the container bundle. BundleDir string `json:"bundleDir"` // CreatedAt is the time the container was created. CreatedAt time.Time `json:"createdAt"` // Owner is the container owner. Owner string `json:"owner"` // ConsoleSocket is the path to a unix domain socket that will receive // the console FD. ConsoleSocket string `json:"consoleSocket"` // Status is the current container Status. Status Status `json:"status"` // GoferPid is the PID of the gofer running along side the sandbox. May // be 0 if the gofer has been killed. GoferPid int `json:"goferPid"` // Sandbox is the sandbox this container is running in. It's set when the // container is created and reset when the sandbox is destroyed. Sandbox *sandbox.Sandbox `json:"sandbox"` // CompatCgroup has the cgroup configuration for the container. For the single // container case, container cgroup is set in `c.Sandbox` only. CompactCgroup // is only set for multi-container, where the `c.Sandbox` cgroup represents // the entire pod. // // Note that CompatCgroup is created only for compatibility with tools // that expect container cgroups to exist. Setting limits here makes no change // to the container in question. CompatCgroup cgroup.CgroupJSON `json:"compatCgroup"` // Saver handles load from/save to the state file safely from multiple // processes. Saver StateFile `json:"saver"` // GoferMountConfs contains information about how the gofer mounts have been // overlaid (with tmpfs or overlayfs). The first entry is for rootfs and the // following entries are for bind mounts in Spec.Mounts (in the same order). GoferMountConfs boot.GoferMountConfFlags `json:"goferMountConfs"` // // Fields below this line are not saved in the state file and will not // be preserved across commands. // // goferIsChild is set if a gofer process is a child of the current process. // // This field isn't saved to json, because only a creator of a gofer // process will have it as a child process. 
goferIsChild bool `nojson:"true"` } // Args is used to configure a new container. type Args struct { // ID is the container unique identifier. ID string // Spec is the OCI spec that describes the container. Spec *specs.Spec // BundleDir is the directory containing the container bundle. BundleDir string // ConsoleSocket is the path to a unix domain socket that will receive // the console FD. It may be empty. ConsoleSocket string // PIDFile is the filename where the container's root process PID will be // written to. It may be empty. PIDFile string // UserLog is the filename to send user-visible logs to. It may be empty. // // It only applies for the init container. UserLog string // Attached indicates that the sandbox lifecycle is attached with the caller. // If the caller exits, the sandbox should exit too. // // It only applies for the init container. Attached bool // PassFiles are user-supplied files from the host to be exposed to the // sandboxed app. PassFiles map[int]*os.File // ExecFile is the host file used for program execution. ExecFile *os.File } // New creates the container in a new Sandbox process, unless the metadata // indicates that an existing Sandbox should be used. The caller must call // Destroy() on the container. func New(conf *config.Config, args Args) (*Container, error) { log.Debugf("Create container, cid: %s, rootDir: %q", args.ID, conf.RootDir) if err := validateID(args.ID); err != nil { return nil, err } if err := os.MkdirAll(conf.RootDir, 0711); err != nil { return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err) } if err := modifySpecForDirectfs(conf, args.Spec); err != nil { return nil, fmt.Errorf("failed to modify spec for directfs: %v", err) } sandboxID := args.ID if !isRoot(args.Spec) { var ok bool sandboxID, ok = specutils.SandboxID(args.Spec) if !ok { return nil, fmt.Errorf("no sandbox ID found when creating container") } } c := &Container{ ID: args.ID, Spec: args.Spec, ConsoleSocket: args.ConsoleSocket, BundleDir: args.BundleDir, Status: Creating, CreatedAt: time.Now(), Owner: os.Getenv("USER"), Saver: StateFile{ RootDir: conf.RootDir, ID: FullID{ SandboxID: sandboxID, ContainerID: args.ID, }, }, } // The Cleanup object cleans up partially created containers when an error // occurs. Any errors occurring during cleanup itself are ignored. cu := cleanup.Make(func() { _ = c.Destroy() }) defer cu.Clean() // Lock the container metadata file to prevent concurrent creations of // containers with the same id. if err := c.Saver.LockForNew(); err != nil { return nil, fmt.Errorf("cannot lock container metadata file: %w", err) } defer c.Saver.UnlockOrDie() // If the metadata annotations indicate that this container should be started // in an existing sandbox, we must do so. These are the possible metadata // annotation states: // 1. No annotations: it means that there is a single container and this // container is obviously the root. Both container and sandbox share the // ID. // 2. Container type == sandbox: it means this is the root container // starting the sandbox. Both container and sandbox share the same ID. // 3. Container type == container: it means this is a subcontainer of an // already started sandbox. In this case, container ID is different than // the sandbox ID. if isRoot(args.Spec) { log.Debugf("Creating new sandbox for container, cid: %s", args.ID) if args.Spec.Linux == nil { args.Spec.Linux = &specs.Linux{} } // Don't force the use of cgroups in tests because they lack permission to do so. 
if args.Spec.Linux.CgroupsPath == "" && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { args.Spec.Linux.CgroupsPath = "/" + args.ID } var subCgroup, parentCgroup, containerCgroup cgroup.Cgroup if !conf.IgnoreCgroups { var err error // Create and join cgroup before processes are created to ensure they are // part of the cgroup from the start (and all their children processes). parentCgroup, subCgroup, err = c.setupCgroupForRoot(conf, args.Spec) if err != nil { return nil, fmt.Errorf("cannot set up cgroup for root: %w", err) } // Join the child cgroup when using cgroupfs. Joining non leaf-node // cgroups is illegal in cgroupsv2 and will return EBUSY. if subCgroup != nil && !conf.SystemdCgroup && cgroup.IsOnlyV2() { containerCgroup = subCgroup } else { containerCgroup = parentCgroup } } c.CompatCgroup = cgroup.CgroupJSON{Cgroup: subCgroup} mountHints, err := boot.NewPodMountHints(args.Spec) if err != nil { return nil, fmt.Errorf("error creating pod mount hints: %w", err) } rootfsHint, err := boot.NewRootfsHint(args.Spec) if err != nil { return nil, fmt.Errorf("error creating rootfs hint: %w", err) } goferFilestores, goferConfs, err := c.createGoferFilestores(conf.GetOverlay2(), mountHints, rootfsHint) if err != nil { return nil, err } if !goferConfs[0].ShouldUseLisafs() && specutils.GPUFunctionalityRequestedViaHook(args.Spec, conf) { // nvidia-container-runtime-hook attempts to populate the container // rootfs with NVIDIA libraries and devices. With EROFS, spec.Root.Path // points to an empty directory and populating that has no effect. return nil, fmt.Errorf("nvidia-container-runtime-hook cannot be used together with non-lisafs backed root mount") } c.GoferMountConfs = goferConfs if err := nvProxyPreGoferHostSetup(args.Spec, conf); err != nil { return nil, err } if err := runInCgroup(containerCgroup, func() error { ioFiles, devIOFile, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached, rootfsHint) if err != nil { return fmt.Errorf("cannot create gofer process: %w", err) } // Start a new sandbox for this container. Any errors after this point // must destroy the container. sandArgs := &sandbox.Args{ ID: sandboxID, Spec: args.Spec, BundleDir: args.BundleDir, ConsoleSocket: args.ConsoleSocket, UserLog: args.UserLog, IOFiles: ioFiles, DevIOFile: devIOFile, MountsFile: specFile, Cgroup: containerCgroup, Attached: args.Attached, GoferFilestoreFiles: goferFilestores, GoferMountConfs: goferConfs, MountHints: mountHints, PassFiles: args.PassFiles, ExecFile: args.ExecFile, } sand, err := sandbox.New(conf, sandArgs) if err != nil { return fmt.Errorf("cannot create sandbox: %w", err) } c.Sandbox = sand return nil }); err != nil { return nil, err } } else { log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sandboxID) // Find the sandbox associated with this ID. fullID := FullID{ SandboxID: sandboxID, ContainerID: sandboxID, } sb, err := Load(conf.RootDir, fullID, LoadOpts{Exact: true}) if err != nil { return nil, fmt.Errorf("cannot load sandbox: %w", err) } c.Sandbox = sb.Sandbox subCgroup, err := c.setupCgroupForSubcontainer(conf, args.Spec) if err != nil { return nil, err } c.CompatCgroup = cgroup.CgroupJSON{Cgroup: subCgroup} // If the console control socket file is provided, then create a new // pty master/slave pair and send the TTY to the sandbox process. var tty *os.File if c.ConsoleSocket != "" { // Create a new TTY pair and send the master on the provided socket. 
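// Editor's note: console.NewWithSocket (runsc/console/console.go above) sends
// the pty master as an SCM_RIGHTS control message over the provided unix
// socket. For reference, a hypothetical receiver on the other end of
// ConsoleSocket would recover the FD along these lines (names assumed, error
// handling elided):
//
//	buf, oob := make([]byte, 64), make([]byte, unix.CmsgSpace(4))
//	_, oobn, _, _, _ := unix.Recvmsg(socketFD, buf, oob, 0)
//	scms, _ := unix.ParseSocketControlMessage(oob[:oobn])
//	fds, _ := unix.ParseUnixRights(&scms[0])
//	master := os.NewFile(uintptr(fds[0]), "pty-master")
//
// This only illustrates the FD-passing protocol; the actual receiver is
// implemented by whatever process created ConsoleSocket.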
var err error tty, err = console.NewWithSocket(c.ConsoleSocket) if err != nil { return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err) } // tty file is transferred to the sandbox, then it can be closed here. defer tty.Close() } if err := c.Sandbox.CreateSubcontainer(conf, c.ID, tty); err != nil { return nil, fmt.Errorf("cannot create subcontainer: %w", err) } } c.changeStatus(Created) // Save the metadata file. if err := c.saveLocked(); err != nil { return nil, err } // "If any prestart hook fails, the runtime MUST generate an error, // stop and destroy the container" -OCI spec. if c.Spec.Hooks != nil { // Even though the hook name is Prestart, runc used to call it from create. // For this reason, it's now deprecated, but the spec requires it to be // called *before* CreateRuntime and CreateRuntime must be called in create. // // "For runtimes that implement the deprecated prestart hooks as // createRuntime hooks, createRuntime hooks MUST be called after the // prestart hooks." if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil { return nil, err } if err := executeHooks(c.Spec.Hooks.CreateRuntime, c.State()); err != nil { return nil, err } if len(c.Spec.Hooks.CreateContainer) > 0 { log.Warningf("CreateContainer hook skipped because running inside container namespace is not supported") } } // Write the PID file. Containerd considers the call to create complete after // this file is created, so it must be the last thing we do. if args.PIDFile != "" { if err := ioutil.WriteFile(args.PIDFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil { return nil, fmt.Errorf("error writing PID file: %v", err) } } cu.Release() return c, nil } // Start starts running the containerized process inside the sandbox. func (c *Container) Start(conf *config.Config) error { log.Debugf("Start container, cid: %s", c.ID) return c.startImpl(conf, "start", c.Sandbox.StartRoot, c.Sandbox.StartSubcontainer) } // Restore takes a container and replaces its kernel and file system // to restore a container from its state file. func (c *Container) Restore(conf *config.Config, imagePath string, direct bool) error { log.Debugf("Restore container, cid: %s", c.ID) restore := func(conf *config.Config) error { return c.Sandbox.Restore(conf, c.ID, imagePath, direct) } return c.startImpl(conf, "restore", restore, c.Sandbox.RestoreSubcontainer) } func (c *Container) startImpl(conf *config.Config, action string, startRoot func(conf *config.Config) error, startSub func(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, goferFilestores []*os.File, devIOFile *os.File, goferConfs []boot.GoferMountConf) error) error { if err := c.Saver.lock(BlockAcquire); err != nil { return err } unlock := cleanup.Make(c.Saver.UnlockOrDie) defer unlock.Clean() if err := c.requireStatus(action, Created); err != nil { return err } // "If any prestart hook fails, the runtime MUST generate an error, // stop and destroy the container" -OCI spec. 
if c.Spec.Hooks != nil && len(c.Spec.Hooks.StartContainer) > 0 { log.Warningf("StartContainer hook skipped because running inside container namespace is not supported") } if isRoot(c.Spec) { if err := startRoot(conf); err != nil { return err } } else { rootfsHint, err := boot.NewRootfsHint(c.Spec) if err != nil { return fmt.Errorf("error creating rootfs hint: %w", err) } goferFilestores, goferConfs, err := c.createGoferFilestores(conf.GetOverlay2(), c.Sandbox.MountHints, rootfsHint) if err != nil { return err } c.GoferMountConfs = goferConfs // Join cgroup to start gofer process to ensure it's part of the cgroup from // the start (and all their children processes). if err := runInCgroup(c.Sandbox.CgroupJSON.Cgroup, func() error { // Create the gofer process. goferFiles, devIOFile, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false, rootfsHint) if err != nil { return err } defer func() { if mountsFile != nil { _ = mountsFile.Close() } if devIOFile != nil { _ = devIOFile.Close() } for _, f := range goferFiles { _ = f.Close() } for _, f := range goferFilestores { _ = f.Close() } }() if mountsFile != nil { cleanMounts, err := specutils.ReadMounts(mountsFile) if err != nil { return fmt.Errorf("reading mounts file: %v", err) } c.Spec.Mounts = cleanMounts } // Setup stdios if the container is not using terminal. Otherwise TTY was // already setup in create. var stdios []*os.File if !c.Spec.Process.Terminal { stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr} } return startSub(c.Spec, conf, c.ID, stdios, goferFiles, goferFilestores, devIOFile, goferConfs) }); err != nil { return err } } // "If any poststart hook fails, the runtime MUST log a warning, but // the remaining hooks and lifecycle continue as if the hook had // succeeded" -OCI spec. if c.Spec.Hooks != nil { executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State()) } c.changeStatus(Running) if err := c.saveLocked(); err != nil { return err } // Release lock before adjusting OOM score because the lock is acquired there. unlock.Clean() // Adjust the oom_score_adj for sandbox. This must be done after saveLocked(). if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Spec, c.Saver.RootDir, false); err != nil { return err } // Set container's oom_score_adj to the gofer since it is dedicated to // the container, in case the gofer uses up too much memory. return c.adjustGoferOOMScoreAdj() } // Run is a helper that calls Create + Start + Wait. func Run(conf *config.Config, args Args) (unix.WaitStatus, error) { log.Debugf("Run container, cid: %s, rootDir: %q", args.ID, conf.RootDir) c, err := New(conf, args) if err != nil { return 0, fmt.Errorf("creating container: %v", err) } // Clean up partially created container if an error occurs. // Any errors returned by Destroy() itself are ignored. cu := cleanup.Make(func() { c.Destroy() }) defer cu.Clean() if err := c.Start(conf); err != nil { return 0, fmt.Errorf("starting container: %v", err) } // If we allocate a terminal, forward signals to the sandbox process. // Otherwise, Ctrl+C will terminate this process and its children, // including the terminal. if c.Spec.Process.Terminal { stopForwarding := c.ForwardSignals(0, true /* fgProcess */) defer stopForwarding() } if args.Attached { return c.Wait() } cu.Release() return 0, nil } // Execute runs the specified command in the container. It returns the PID of // the newly created process. 
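// Editor's note: a minimal, self-contained sketch (not part of the gVisor
// source) of how a caller might drive the create/start/wait lifecycle through
// the Run helper above. The Args fields and the Run signature match the code
// in this file; the spec, bundle directory, and function names here are
// hypothetical.

package containerrunexample

import (
	"fmt"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"gvisor.dev/gvisor/runsc/config"
	"gvisor.dev/gvisor/runsc/container"
)

// runAttached creates the container, starts it, and blocks until the workload
// exits, mirroring Run's Create + Start + Wait composition.
func runAttached(conf *config.Config, spec *specs.Spec, bundleDir string) error {
	args := container.Args{
		ID:        "example-container",
		Spec:      spec,
		BundleDir: bundleDir,
		Attached:  true, // Tie the sandbox lifetime to this process and wait for exit.
	}
	ws, err := container.Run(conf, args)
	if err != nil {
		return fmt.Errorf("running container: %w", err)
	}
	fmt.Printf("container exited with status %d\n", ws.ExitStatus())
	return nil
}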
func (c *Container) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) { log.Debugf("Execute in container, cid: %s, args: %+v", c.ID, args) if err := c.requireStatus("execute in", Created, Running); err != nil { return 0, err } args.ContainerID = c.ID return c.Sandbox.Execute(conf, args) } // Event returns events for the container. func (c *Container) Event() (*boot.EventOut, error) { log.Debugf("Getting events for container, cid: %s", c.ID) if err := c.requireStatus("get events for", Created, Running, Paused); err != nil { return nil, err } event, err := c.Sandbox.Event(c.ID) if err != nil { return nil, err } if len(event.ContainerUsage) > 0 { // Some stats can utilize host cgroups for accuracy. c.populateStats(event) } return event, nil } // PortForward starts port forwarding to the container. func (c *Container) PortForward(opts *boot.PortForwardOpts) error { if err := c.requireStatus("port forward", Running); err != nil { return err } opts.ContainerID = c.ID return c.Sandbox.PortForward(opts) } // SandboxPid returns the Getpid of the sandbox the container is running in, or -1 if the // container is not running. func (c *Container) SandboxPid() int { if err := c.requireStatus("get PID", Created, Running, Paused); err != nil { return -1 } return c.Sandbox.Getpid() } // Wait waits for the container to exit, and returns its WaitStatus. // Call to wait on a stopped container is needed to retrieve the exit status // and wait returns immediately. func (c *Container) Wait() (unix.WaitStatus, error) { log.Debugf("Wait on container, cid: %s", c.ID) ws, err := c.Sandbox.Wait(c.ID) if err == nil { // Wait succeeded, container is not running anymore. c.changeStatus(Stopped) } return ws, err } // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and // returns its WaitStatus. func (c *Container) WaitRootPID(pid int32) (unix.WaitStatus, error) { log.Debugf("Wait on process %d in sandbox, cid: %s", pid, c.Sandbox.ID) if !c.IsSandboxRunning() { return 0, fmt.Errorf("sandbox is not running") } return c.Sandbox.WaitPID(c.Sandbox.ID, pid) } // WaitPID waits for process 'pid' in the container's PID namespace and returns // its WaitStatus. func (c *Container) WaitPID(pid int32) (unix.WaitStatus, error) { log.Debugf("Wait on process %d in container, cid: %s", pid, c.ID) if !c.IsSandboxRunning() { return 0, fmt.Errorf("sandbox is not running") } return c.Sandbox.WaitPID(c.ID, pid) } // WaitCheckpoint waits for the Kernel to have been successfully checkpointed // n-1 times, then waits for either the n-th successful checkpoint (in which // case it returns nil) or any number of failed checkpoints (in which case it // returns an error returned by any such failure). func (c *Container) WaitCheckpoint(n uint32) error { log.Debugf("Wait on %d-th checkpoint to complete in container, cid: %s", n, c.ID) if !c.IsSandboxRunning() { return fmt.Errorf("sandbox is not running") } return c.Sandbox.WaitCheckpoint(n) } // SignalContainer sends the signal to the container. If all is true and signal // is SIGKILL, then waits for all processes to exit before returning. // SignalContainer returns an error if the container is already stopped. // TODO(b/113680494): Distinguish different error types. func (c *Container) SignalContainer(sig unix.Signal, all bool) error { log.Debugf("Signal container, cid: %s, signal: %v (%d)", c.ID, sig, sig) // Signaling container in Stopped state is allowed. 
When all=false, // an error will be returned anyway; when all=true, this allows // sending signal to other processes inside the container even // after the init process exits. This is especially useful for // container cleanup. if err := c.requireStatus("signal", Running, Stopped); err != nil { return err } if !c.IsSandboxRunning() { return fmt.Errorf("sandbox is not running") } return c.Sandbox.SignalContainer(c.ID, sig, all) } // SignalProcess sends sig to a specific process in the container. func (c *Container) SignalProcess(sig unix.Signal, pid int32) error { log.Debugf("Signal process %d in container, cid: %s, signal: %v (%d)", pid, c.ID, sig, sig) if err := c.requireStatus("signal a process inside", Running); err != nil { return err } if !c.IsSandboxRunning() { return fmt.Errorf("sandbox is not running") } return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false) } // ForwardSignals forwards all signals received by the current process to the // container process inside the sandbox. It returns a function that will stop // forwarding signals. func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() { log.Debugf("Forwarding all signals to container, cid: %s, PIDPID: %d, fgProcess: %t", c.ID, pid, fgProcess) stop := sighandling.StartSignalForwarding(func(sig linux.Signal) { log.Debugf("Forwarding signal %d to container, cid: %s, PID: %d, fgProcess: %t", sig, c.ID, pid, fgProcess) if err := c.Sandbox.SignalProcess(c.ID, pid, unix.Signal(sig), fgProcess); err != nil { log.Warningf("error forwarding signal %d to container %q: %v", sig, c.ID, err) } }) return func() { log.Debugf("Done forwarding signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess) stop() } } // Checkpoint sends the checkpoint call to the container. // The statefile will be written to f, the file at the specified image-path. func (c *Container) Checkpoint(imagePath string, direct bool, sfOpts statefile.Options, mfOpts pgalloc.SaveOpts) error { log.Debugf("Checkpoint container, cid: %s", c.ID) if err := c.requireStatus("checkpoint", Created, Running, Paused); err != nil { return err } return c.Sandbox.Checkpoint(c.ID, imagePath, direct, sfOpts, mfOpts) } // Pause suspends the container and its kernel. // The call only succeeds if the container's status is created or running. func (c *Container) Pause() error { log.Debugf("Pausing container, cid: %s", c.ID) if err := c.Saver.lock(BlockAcquire); err != nil { return err } defer c.Saver.UnlockOrDie() if c.Status != Created && c.Status != Running { return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status) } if err := c.Sandbox.Pause(c.ID); err != nil { return fmt.Errorf("pausing container %q: %v", c.ID, err) } c.changeStatus(Paused) return c.saveLocked() } // Resume unpauses the container and its kernel. // The call only succeeds if the container's status is paused. func (c *Container) Resume() error { log.Debugf("Resuming container, cid: %s", c.ID) if err := c.Saver.lock(BlockAcquire); err != nil { return err } defer c.Saver.UnlockOrDie() if c.Status != Paused { return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status) } if err := c.Sandbox.Resume(c.ID); err != nil { return fmt.Errorf("resuming container: %v", err) } c.changeStatus(Running) return c.saveLocked() } // State returns the metadata of the container. 
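// Editor's note: a small illustrative sketch (not part of the gVisor source)
// of driving the Checkpoint method above from Go. Zero-valued statefile and
// pgalloc options are used purely for brevity; real callers typically
// populate them from runsc's checkpoint flags. Names are hypothetical.

package checkpointexample

import (
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/state/statefile"
	"gvisor.dev/gvisor/runsc/container"
)

// checkpointToDir asks the sandbox to write the container's state under
// imageDir. The direct argument is passed through unchanged; false matches
// the default behavior.
func checkpointToDir(c *container.Container, imageDir string) error {
	return c.Checkpoint(imageDir, false /* direct */, statefile.Options{}, pgalloc.SaveOpts{})
}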
func (c *Container) State() specs.State { return specs.State{ Version: specs.Version, ID: c.ID, Status: c.Status, Pid: c.SandboxPid(), Bundle: c.BundleDir, Annotations: c.Spec.Annotations, } } // Processes retrieves the list of processes and associated metadata inside a // container. func (c *Container) Processes() ([]*control.Process, error) { if err := c.requireStatus("get processes of", Running, Paused); err != nil { return nil, err } return c.Sandbox.Processes(c.ID) } // Destroy stops all processes and frees all resources associated with the // container. func (c *Container) Destroy() error { log.Debugf("Destroy container, cid: %s", c.ID) if err := c.Saver.lock(BlockAcquire); err != nil { return err } defer func() { c.Saver.UnlockOrDie() _ = c.Saver.close() }() // Stored for later use as stop() sets c.Sandbox to nil. sb := c.Sandbox // We must perform the following cleanup steps: // * stop the container and gofer processes, // * remove the container filesystem on the host, and // * delete the container metadata directory. // // It's possible for one or more of these steps to fail, but we should // do our best to perform all of the cleanups. Hence, we keep a slice // of errors return their concatenation. var errs []string if err := c.stop(); err != nil { err = fmt.Errorf("stopping container: %v", err) log.Warningf("%v", err) errs = append(errs, err.Error()) } if err := c.Saver.Destroy(); err != nil { err = fmt.Errorf("deleting container state files: %v", err) log.Warningf("%v", err) errs = append(errs, err.Error()) } // Clean up self-backed filestore files created in their respective mounts. c.forEachSelfMount(func(mountSrc string) { if sb != nil { if hint := sb.MountHints.FindMount(mountSrc); hint != nil && hint.ShouldShareMount() { // Don't delete filestore file for shared mounts. The sandbox owns a // shared master mount which uses this filestore and is shared with // multiple mount points. return } } filestorePath := boot.SelfFilestorePath(mountSrc, c.sandboxID()) if err := os.Remove(filestorePath); err != nil { err = fmt.Errorf("failed to delete filestore file %q: %v", filestorePath, err) log.Warningf("%v", err) errs = append(errs, err.Error()) } }) if sb != nil && sb.IsRootContainer(c.ID) { // When the root container is being destroyed, we can clean up filestores // used by shared mounts. for _, hint := range sb.MountHints.Mounts { if !hint.ShouldShareMount() { continue } // Assume this is a self-backed shared mount and try to delete the // filestore. Subsequently ignore the ENOENT if the assumption is wrong. filestorePath := boot.SelfFilestorePath(hint.Mount.Source, c.sandboxID()) if err := os.Remove(filestorePath); err != nil && !os.IsNotExist(err) { err = fmt.Errorf("failed to delete shared filestore file %q: %v", filestorePath, err) log.Warningf("%v", err) errs = append(errs, err.Error()) } } } c.changeStatus(Stopped) // Adjust oom_score_adj for the sandbox. This must be done after the container // is stopped and the directory at c.Root is removed. // // Use 'sb' to tell whether it has been executed before because Destroy must // be idempotent. if sb != nil { if err := adjustSandboxOOMScoreAdj(sb, c.Spec, c.Saver.RootDir, true); err != nil { errs = append(errs, err.Error()) } } // "If any poststop hook fails, the runtime MUST log a warning, but the // remaining hooks and lifecycle continue as if the hook had // succeeded" - OCI spec. 
// // Based on the OCI, "The post-stop hooks MUST be called after the container // is deleted but before the delete operation returns" // Run it here to: // 1) Conform to the OCI. // 2) Make sure it only runs once, because the root has been deleted, the // container can't be loaded again. if c.Spec.Hooks != nil { executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) } if len(errs) == 0 { return nil } return fmt.Errorf(strings.Join(errs, "\n")) } func (c *Container) sandboxID() string { return c.Saver.ID.SandboxID } func (c *Container) forEachSelfMount(fn func(mountSrc string)) { if c.GoferMountConfs == nil { // Container not started? Skip. return } if c.GoferMountConfs[0].IsSelfBacked() { fn(c.Spec.Root.Path) } goferMntIdx := 1 // First index is for rootfs. for i := range c.Spec.Mounts { if !specutils.IsGoferMount(c.Spec.Mounts[i]) { continue } if c.GoferMountConfs[goferMntIdx].IsSelfBacked() { fn(c.Spec.Mounts[i].Source) } goferMntIdx++ } } // createGoferFilestores creates the regular files that will back the // tmpfs/overlayfs mounts that will overlay some gofer mounts. It also returns // information about how each gofer mount is configured. func (c *Container) createGoferFilestores(ovlConf config.Overlay2, mountHints *boot.PodMountHints, rootfsHint *boot.RootfsHint) ([]*os.File, []boot.GoferMountConf, error) { var goferFilestores []*os.File var goferConfs []boot.GoferMountConf // Handle root mount first. overlayMedium := ovlConf.RootOverlayMedium() mountType := boot.Bind if rootfsHint != nil { overlayMedium = rootfsHint.Overlay if !specutils.IsGoferMount(rootfsHint.Mount) { mountType = rootfsHint.Mount.Type } } if c.Spec.Root.Readonly { overlayMedium = config.NoOverlay } filestore, goferConf, err := c.createGoferFilestore(overlayMedium, c.Spec.Root.Path, mountType, false /* isShared */) if err != nil { return nil, nil, err } if filestore != nil { goferFilestores = append(goferFilestores, filestore) } goferConfs = append(goferConfs, goferConf) // Handle bind mounts. for i := range c.Spec.Mounts { if !specutils.IsGoferMount(c.Spec.Mounts[i]) { continue } overlayMedium = ovlConf.SubMountOverlayMedium() mountType = boot.Bind isShared := false if specutils.IsReadonlyMount(c.Spec.Mounts[i].Options) { overlayMedium = config.NoOverlay } if hint := mountHints.FindMount(c.Spec.Mounts[i].Source); hint != nil { // Note that we want overlayMedium=self even if this is a read-only mount so that // the shared mount is created correctly. Future containers may mount this writably. overlayMedium = config.SelfOverlay if !specutils.IsGoferMount(hint.Mount) { mountType = hint.Mount.Type } isShared = hint.ShouldShareMount() } filestore, goferConf, err := c.createGoferFilestore(overlayMedium, c.Spec.Mounts[i].Source, mountType, isShared) if err != nil { return nil, nil, err } if filestore != nil { goferFilestores = append(goferFilestores, filestore) } goferConfs = append(goferConfs, goferConf) } for _, filestore := range goferFilestores { // Perform this work around outside the sandbox. The sandbox may already be // running with seccomp filters that do not allow this. 
pgalloc.IMAWorkAroundForMemFile(filestore.Fd()) } return goferFilestores, goferConfs, nil } func (c *Container) createGoferFilestore(overlayMedium config.OverlayMedium, mountSrc string, mountType string, isShared bool) (*os.File, boot.GoferMountConf, error) { var lower boot.GoferMountConfLowerType switch mountType { case boot.Bind: lower = boot.Lisafs case tmpfs.Name: lower = boot.NoneLower case erofs.Name: lower = boot.Erofs default: return nil, boot.GoferMountConf{}, fmt.Errorf("unsupported mount type %q in mount hint", mountType) } switch overlayMedium { case config.NoOverlay: return nil, boot.GoferMountConf{Lower: lower, Upper: boot.NoOverlay}, nil case config.MemoryOverlay: return nil, boot.GoferMountConf{Lower: lower, Upper: boot.MemoryOverlay}, nil case config.SelfOverlay: return c.createGoferFilestoreInSelf(mountSrc, isShared, boot.GoferMountConf{Lower: lower, Upper: boot.SelfOverlay}) default: if overlayMedium.IsBackedByAnon() { return c.createGoferFilestoreInDir(overlayMedium.HostFileDir(), boot.GoferMountConf{Lower: lower, Upper: boot.AnonOverlay}) } return nil, boot.GoferMountConf{}, fmt.Errorf("unexpected overlay medium %q", overlayMedium) } } func (c *Container) createGoferFilestoreInSelf(mountSrc string, isShared bool, successConf boot.GoferMountConf) (*os.File, boot.GoferMountConf, error) { mountSrcInfo, err := os.Stat(mountSrc) if err != nil { return nil, boot.GoferMountConf{}, fmt.Errorf("failed to stat mount %q to see if it were a directory: %v", mountSrc, err) } if !mountSrcInfo.IsDir() { log.Warningf("self filestore is only supported for directory mounts, but mount %q is not a directory, falling back to memory", mountSrc) return nil, boot.GoferMountConf{Lower: successConf.Lower, Upper: boot.MemoryOverlay}, nil } // Create the self filestore file. createFlags := unix.O_RDWR | unix.O_CREAT | unix.O_CLOEXEC if !isShared { // Allow shared mounts to reuse existing filestore. A previous shared user // may have already set up the filestore. createFlags |= unix.O_EXCL } filestorePath := boot.SelfFilestorePath(mountSrc, c.sandboxID()) filestoreFD, err := unix.Open(filestorePath, createFlags, 0666) if err != nil { if err == unix.EEXIST { // Note that if the same submount is mounted multiple times within the // same sandbox, and is not shared, then the overlay option doesn't work // correctly. Because each overlay mount is independent and changes to // one are not visible to the other. return nil, boot.GoferMountConf{}, fmt.Errorf("%q mount source already has a filestore file at %q; repeated submounts are not supported with overlay optimizations", mountSrc, filestorePath) } return nil, boot.GoferMountConf{}, fmt.Errorf("failed to create filestore file inside %q: %v", mountSrc, err) } log.Debugf("Created filestore file at %q for mount source %q", filestorePath, mountSrc) // Filestore in self should be a named path because it needs to be // discoverable via path traversal so that k8s can scan the filesystem // and apply any limits appropriately (like local ephemeral storage // limits). So don't delete it. These files will be unlinked when the // container is destroyed. This makes self medium appropriate for k8s. 
return os.NewFile(uintptr(filestoreFD), filestorePath), successConf, nil } func (c *Container) createGoferFilestoreInDir(filestoreDir string, successConf boot.GoferMountConf) (*os.File, boot.GoferMountConf, error) { fileInfo, err := os.Stat(filestoreDir) if err != nil { return nil, boot.GoferMountConf{}, fmt.Errorf("failed to stat filestore directory %q: %v", filestoreDir, err) } if !fileInfo.IsDir() { return nil, boot.GoferMountConf{}, fmt.Errorf("overlay2 flag should specify an existing directory") } // Create an unnamed temporary file in filestore directory which will be // deleted when the last FD on it is closed. We don't use O_TMPFILE because // it is not supported on all filesystems. So we simulate it by creating a // named file and then immediately unlinking it while keeping an FD on it. // This file will be deleted when the container exits. filestoreFile, err := os.CreateTemp(filestoreDir, "runsc-filestore-") if err != nil { return nil, boot.GoferMountConf{}, fmt.Errorf("failed to create a temporary file inside %q: %v", filestoreDir, err) } if err := unix.Unlink(filestoreFile.Name()); err != nil { return nil, boot.GoferMountConf{}, fmt.Errorf("failed to unlink temporary file %q: %v", filestoreFile.Name(), err) } log.Debugf("Created an unnamed filestore file at %q", filestoreDir) return filestoreFile, successConf, nil } // saveLocked saves the container metadata to a file. // // Precondition: container must be locked with container.lock(). func (c *Container) saveLocked() error { log.Debugf("Save container, cid: %s", c.ID) if err := c.Saver.SaveLocked(c); err != nil { return fmt.Errorf("saving container metadata: %v", err) } return nil } // stop stops the container (for regular containers) or the sandbox (for // root containers), and waits for the container or sandbox and the gofer // to stop. If any of them doesn't stop before timeout, an error is returned. func (c *Container) stop() error { var parentCgroup cgroup.Cgroup if c.Sandbox != nil { log.Debugf("Destroying container, cid: %s", c.ID) if err := c.Sandbox.DestroyContainer(c.ID); err != nil { return fmt.Errorf("destroying container %q: %v", c.ID, err) } // Only uninstall parentCgroup for sandbox stop. if c.Sandbox.IsRootContainer(c.ID) { parentCgroup = c.Sandbox.CgroupJSON.Cgroup } // Only set sandbox to nil after it has been told to destroy the container. c.Sandbox = nil } // Try killing gofer if it does not exit with container. if c.GoferPid != 0 { log.Debugf("Killing gofer for container, cid: %s, PID: %d", c.ID, c.GoferPid) if err := unix.Kill(c.GoferPid, unix.SIGKILL); err != nil { // The gofer may already be stopped, log the error. log.Warningf("Error sending signal %d to gofer %d: %v", unix.SIGKILL, c.GoferPid, err) } } if err := c.waitForStopped(); err != nil { return err } // Delete container cgroup if any. if c.CompatCgroup.Cgroup != nil { if err := c.CompatCgroup.Cgroup.Uninstall(); err != nil { return err } } // Gofer is running inside parentCgroup, so Cgroup.Uninstall has to be called // after the gofer has stopped. if parentCgroup != nil { if err := parentCgroup.Uninstall(); err != nil { return err } } return nil } func (c *Container) waitForStopped() error { if c.GoferPid == 0 { return nil } if c.IsSandboxRunning() { if err := c.SignalContainer(unix.Signal(0), false); err == nil { return fmt.Errorf("container is still running") } } if c.goferIsChild { // The gofer process is a child of the current process, // so we can wait it and collect its zombie. 
if _, err := unix.Wait4(int(c.GoferPid), nil, 0, nil); err != nil { return fmt.Errorf("error waiting the gofer process: %v", err) } c.GoferPid = 0 return nil } ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { if err := unix.Kill(c.GoferPid, 0); err == nil { return fmt.Errorf("gofer is still running") } c.GoferPid = 0 return nil } return backoff.Retry(op, b) } // shouldCreateDeviceGofer indicates whether a device gofer connection should // be created. func shouldCreateDeviceGofer(spec *specs.Spec, conf *config.Config) bool { return specutils.GPUFunctionalityRequested(spec, conf) || specutils.TPUFunctionalityRequested(spec, conf) } // shouldSpawnGofer indicates whether the gofer process should be spawned. func shouldSpawnGofer(spec *specs.Spec, conf *config.Config, goferConfs []boot.GoferMountConf) bool { // Lisafs mounts need the gofer. for _, cfg := range goferConfs { if cfg.ShouldUseLisafs() { return true } } // Device gofer needs a gofer process. return shouldCreateDeviceGofer(spec, conf) } // createGoferProcess returns an IO file list and a mounts file on success. // The IO file list consists of image files and/or socket files to connect to // a gofer endpoint for the mount points using Gofers. The mounts file is the // file to read list of mounts after they have been resolved (direct paths, // no symlinks), and will be nil if there is no cleaning required for mounts. func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bundleDir string, attached bool, rootfsHint *boot.RootfsHint) ([]*os.File, *os.File, *os.File, error) { if !shouldSpawnGofer(spec, conf, c.GoferMountConfs) { if !c.GoferMountConfs[0].ShouldUseErofs() { panic("goferless mode is only possible with EROFS rootfs") } ioFile, err := os.Open(rootfsHint.Mount.Source) if err != nil { return nil, nil, nil, fmt.Errorf("opening rootfs image %q: %v", rootfsHint.Mount.Source, err) } return []*os.File{ioFile}, nil, nil, nil } // Ensure we don't leak FDs to the gofer process. if err := sandbox.SetCloExeOnAllFDs(); err != nil { return nil, nil, nil, fmt.Errorf("setting CLOEXEC on all FDs: %w", err) } donations := donation.Agency{} defer donations.Close() if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil { return nil, nil, nil, err } if conf.DebugLog != "" { test := "" if len(conf.TestOnlyTestNameEnv) != 0 { // Fetch test name if one is provided and the test only flag was set. if t, ok := specutils.EnvVar(spec.Process.Env, conf.TestOnlyTestNameEnv); ok { test = t } } if specutils.IsDebugCommand(conf, "gofer") { // The startTime here can mean one of two things: // - If this is the first gofer started at the same time as the sandbox, // then this starttime will exactly match the one used by the sandbox // itself (i.e. `Sandbox.StartTime`). This is desirable, such that the // first gofer's log filename will have the exact same timestamp as // the sandbox's log filename timestamp. // - If this is not the first gofer, then this starttime will be later // than the sandbox start time; this is desirable such that we can // distinguish the gofer log filenames between each other. // In either case, `starttime.Get` gets us the timestamp we want. 
startTime := starttime.Get() if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "gofer", test, startTime); err != nil { return nil, nil, nil, err } } } // Start with the general config flags. cmd := exec.Command(specutils.ExePath, conf.ToFlags()...) cmd.SysProcAttr = &unix.SysProcAttr{ // Detach from session. Otherwise, signals sent to the foreground process // will also be forwarded by this process, resulting in duplicate signals. Setsid: true, } // Set Args[0] to make it easier to spot the gofer process. Otherwise it's // shown as `exe`. cmd.Args[0] = "runsc-gofer" // Transfer FDs that need to be present before the "gofer" command. // Start at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := donations.Transfer(cmd, 3) cmd.Args = append(cmd.Args, "gofer", "--bundle", bundleDir) cmd.Args = append(cmd.Args, "--gofer-mount-confs="+c.GoferMountConfs.String()) // Open the spec file to donate to the sandbox. specFile, err := specutils.OpenSpec(bundleDir) if err != nil { return nil, nil, nil, fmt.Errorf("opening spec file: %v", err) } donations.DonateAndClose("spec-fd", specFile) // Donate any profile FDs to the gofer. if err := c.donateGoferProfileFDs(conf, &donations); err != nil { return nil, nil, nil, fmt.Errorf("donating gofer profile fds: %w", err) } // Create pipe that allows gofer to send mount list to sandbox after all paths // have been resolved. mountsSand, mountsGofer, err := os.Pipe() if err != nil { return nil, nil, nil, err } donations.DonateAndClose("mounts-fd", mountsGofer) // Count the number of mounts that need an IO file. ioFileCount := 0 for _, cfg := range c.GoferMountConfs { if cfg.ShouldUseLisafs() || cfg.ShouldUseErofs() { ioFileCount++ } } sandEnds := make([]*os.File, 0, ioFileCount) for i, cfg := range c.GoferMountConfs { switch { case cfg.ShouldUseLisafs(): fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil { return nil, nil, nil, err } sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD")) goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD") donations.DonateAndClose("io-fds", goferEnd) case cfg.ShouldUseErofs(): if i > 0 { return nil, nil, nil, fmt.Errorf("EROFS lower layer is only supported for root mount") } f, err := os.Open(rootfsHint.Mount.Source) if err != nil { return nil, nil, nil, fmt.Errorf("opening rootfs image %q: %v", rootfsHint.Mount.Source, err) } sandEnds = append(sandEnds, f) } } var devSandEnd *os.File if shouldCreateDeviceGofer(spec, conf) { fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil { return nil, nil, nil, err } devSandEnd = os.NewFile(uintptr(fds[0]), "sandbox dev IO FD") donations.DonateAndClose("dev-io-fd", os.NewFile(uintptr(fds[1]), "gofer dev IO FD")) } if attached { // The gofer is attached to the lifetime of this process, so it // should synchronously die when this process dies. cmd.SysProcAttr.Pdeathsig = unix.SIGKILL } // Enter new namespaces to isolate from the rest of the system. Don't unshare // cgroup because gofer is added to a cgroup in the caller's namespace. nss := []specs.LinuxNamespace{ {Type: specs.IPCNamespace}, {Type: specs.MountNamespace}, {Type: specs.NetworkNamespace}, {Type: specs.PIDNamespace}, {Type: specs.UTSNamespace}, } rootlessEUID := unix.Geteuid() != 0 // Set up any uid/gid mappings, and create or join the configured user // namespace so the gofer's view of the filesystem aligns with the // users in the sandbox.
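// Two cases follow: when runsc itself runs as root, the gofer joins the user
// namespace from the spec if one is present (keeping UID/GID 0 inside it so
// it retains capabilities); when running rootless, a user namespace is
// mandatory and the UID/GID mappings are written from outside the namespace
// only after the gofer has started (see sandbox.SetUserMappings below).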
if !rootlessEUID { if userNS, ok := specutils.GetNS(specs.UserNamespace, spec); ok { nss = append(nss, userNS) specutils.SetUIDGIDMappings(cmd, spec) // We need to set UID and GID to have capabilities in a new user namespace. cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} } } else { userNS, ok := specutils.GetNS(specs.UserNamespace, spec) if !ok { return nil, nil, nil, fmt.Errorf("unable to run a rootless container without userns") } nss = append(nss, userNS) syncFile, err := sandbox.ConfigureCmdForRootless(cmd, &donations) if err != nil { return nil, nil, nil, err } defer syncFile.Close() } nvProxySetup, err := nvproxySetupAfterGoferUserns(spec, conf, cmd, &donations) if err != nil { return nil, nil, nil, fmt.Errorf("setting up nvproxy for gofer: %w", err) } donations.Transfer(cmd, nextFD) // Start the gofer in the given namespace. donation.LogDonations(cmd) log.Debugf("Starting gofer: %s %v", cmd.Path, cmd.Args) if err := specutils.StartInNS(cmd, nss); err != nil { return nil, nil, nil, fmt.Errorf("gofer: %v", err) } log.Infof("Gofer started, PID: %d", cmd.Process.Pid) c.GoferPid = cmd.Process.Pid c.goferIsChild = true // Set up and synchronize rootless mode userns mappings. if rootlessEUID { if err := sandbox.SetUserMappings(spec, cmd.Process.Pid); err != nil { return nil, nil, nil, err } } // Set up nvproxy within the Gofer namespace. if err := nvProxySetup(); err != nil { return nil, nil, nil, fmt.Errorf("nvproxy setup: %w", err) } return sandEnds, devSandEnd, mountsSand, nil } // changeStatus transitions from one status to another ensuring that the // transition is valid. func (c *Container) changeStatus(s Status) { switch s { case Creating: // Initial state, never transitions to it. panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) case Created: if c.Status != Creating { panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) } if c.Sandbox == nil { panic("sandbox cannot be nil") } case Paused: if c.Status != Running { panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) } if c.Sandbox == nil { panic("sandbox cannot be nil") } case Running: if c.Status != Created && c.Status != Paused { panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) } if c.Sandbox == nil { panic("sandbox cannot be nil") } case Stopped: // All states can transition to Stopped. default: panic(fmt.Sprintf("invalid new state: %v", s)) } c.Status = s } // IsSandboxRunning returns true if the sandbox exists and is running. func (c *Container) IsSandboxRunning() bool { return c.Sandbox != nil && c.Sandbox.IsRunning() } // HasCapabilityInAnySet returns true if the given capability is in any of the // capability sets of the container process. func (c *Container) HasCapabilityInAnySet(capability linux.Capability) bool { capString := capability.String() for _, set := range [5][]string{ c.Spec.Process.Capabilities.Bounding, c.Spec.Process.Capabilities.Effective, c.Spec.Process.Capabilities.Inheritable, c.Spec.Process.Capabilities.Permitted, c.Spec.Process.Capabilities.Ambient, } { for _, c := range set { if c == capString { return true } } } return false } // RunsAsUID0 returns true if the container process runs with UID 0 (root). 
func (c *Container) RunsAsUID0() bool { return c.Spec.Process.User.UID == 0 } func (c *Container) requireStatus(action string, statuses ...Status) error { for _, s := range statuses { if c.Status == s { return nil } } return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status) } // IsSandboxRoot returns true if this container is its sandbox's root container. func (c *Container) IsSandboxRoot() bool { return isRoot(c.Spec) } func isRoot(spec *specs.Spec) bool { return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer } // runInCgroup executes fn inside the specified cgroup. If cg is nil, fn is // executed in the current context. func runInCgroup(cg cgroup.Cgroup, fn func() error) error { if cg == nil { return fn() } restore, err := cg.Join() if err != nil { return err } defer restore() return fn() } // adjustGoferOOMScoreAdj sets the oom_score_adj for the container's gofer. func (c *Container) adjustGoferOOMScoreAdj() error { if c.GoferPid == 0 || c.Spec.Process.OOMScoreAdj == nil { return nil } return setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj) } // adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox. // oom_score_adj is set to the lowest oom_score_adj among the containers // running in the sandbox. // // TODO(gvisor.dev/issue/238): This call could race with other containers being // created at the same time and end up setting the wrong oom_score_adj to the // sandbox. Use rpc client to synchronize. func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, spec *specs.Spec, rootDir string, destroy bool) error { // Adjustment can be skipped if the root container is exiting, because it // brings down the entire sandbox. if isRoot(spec) && destroy { return nil } containers, err := LoadSandbox(rootDir, s.ID, LoadOpts{}) if err != nil { return fmt.Errorf("loading sandbox containers: %v", err) } // Do nothing if the sandbox has been terminated. if len(containers) == 0 { return nil } // Get the lowest score for all containers. var lowScore int scoreFound := false for _, container := range containers { // Special multi-container support for CRI. Ignore the root container when // calculating oom_score_adj for the sandbox because it is the // infrastructure (pause) container and always has a very low oom_score_adj. // // We will use OOMScoreAdj in the single-container case where the // containerd container-type annotation is not present. if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox { continue } if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) { scoreFound = true lowScore = *container.Spec.Process.OOMScoreAdj } } // If the container is destroyed and remaining containers have no // oomScoreAdj specified then we must revert to the original oom_score_adj // saved with the root container. if !scoreFound && destroy { lowScore = containers[0].Sandbox.OriginalOOMScoreAdj scoreFound = true } // Only set oom_score_adj if one of the containers has oom_score_adj set. If // not, oom_score_adj is inherited from the parent process. // // See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process if !scoreFound { return nil } // Set the lowest of all containers' oom_score_adj to the sandbox. return setOOMScoreAdj(s.Getpid(), lowScore) } // setOOMScoreAdj sets oom_score_adj to the given value for the given PID. // /proc must be available and mounted read-write. scoreAdj should be between // -1000 and 1000.
It's a noop if the process has already exited. func setOOMScoreAdj(pid int, scoreAdj int) error { f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644) if err != nil { // Ignore NotExist errors because it can race with process exit. if os.IsNotExist(err) { log.Warningf("Process (%d) not found setting oom_score_adj", pid) return nil } return err } defer f.Close() if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil { if errors.Is(err, unix.ESRCH) { log.Warningf("Process (%d) exited while setting oom_score_adj", pid) return nil } return fmt.Errorf("setting oom_score_adj to %q: %v", scoreAdj, err) } return nil } // populateStats populates event with stats estimates based on cgroups and the // sentry's accounting. func (c *Container) populateStats(event *boot.EventOut) { // The events command, when run for all running containers, should // account for the full cgroup CPU usage. We split cgroup usage // proportionally according to the sentry-internal usage measurements, // only counting Running containers. log.Debugf("event.ContainerUsage: %v", event.ContainerUsage) numContainers := uint64(len(event.ContainerUsage)) if numContainers == 0 { log.Warningf("events: no containers listed in usage, returning zero CPU usage") event.Event.Data.CPU.Usage.Total = 0 return } var containerUsage uint64 var allContainersUsage uint64 for ID, usage := range event.ContainerUsage { allContainersUsage += usage if ID == c.ID { containerUsage = usage } } cgroup, err := c.Sandbox.NewCGroup() if err != nil { // No cgroup, so rely purely on the sentry's accounting. log.Warningf("events: no cgroups") event.Event.Data.CPU.Usage.Total = containerUsage return } // Get the host cgroup CPU usage. cgroupsUsage, err := cgroup.CPUUsage() if err != nil || cgroupsUsage == 0 { // No cgroup usage, so rely purely on the sentry's accounting. log.Warningf("events: failed when getting cgroup CPU usage for container: usage=%d, err: %v", cgroupsUsage, err) event.Event.Data.CPU.Usage.Total = containerUsage return } // If the sentry reports no CPU usage, fall back on cgroups and split usage // equally across containers. if allContainersUsage == 0 { log.Warningf("events: no sentry CPU usage reported") allContainersUsage = cgroupsUsage containerUsage = cgroupsUsage / numContainers } // Scaling can easily overflow a uint64 (e.g. a containerUsage and // cgroupsUsage of 16 seconds each will overflow), so use floats. total := float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage)) log.Debugf("Usage, container: %d, cgroups: %d, all: %d, total: %.0f", containerUsage, cgroupsUsage, allContainersUsage, total) event.Event.Data.CPU.Usage.Total = uint64(total) return } func (c *Container) createParentCgroup(parentPath string, conf *config.Config) (cgroup.Cgroup, error) { var err error if conf.SystemdCgroup { parentPath, err = cgroup.TransformSystemdPath(parentPath, c.ID, conf.Rootless) if err != nil { return nil, err } } else if cgroup.LikelySystemdPath(parentPath) { log.Warningf("cgroup parent path is set to %q which looks like a systemd path. Please set --systemd-cgroup=true if you intend to use systemd to manage container cgroups", parentPath) } parentCgroup, err := cgroup.NewFromPath(parentPath, conf.SystemdCgroup) if err != nil { return nil, err } return parentCgroup, nil } // setupCgroupForRoot configures and returns cgroup for the sandbox and the // root container. 
If `cgroupParentAnnotation` is set, use that path as the // sandbox cgroup and use Spec.Linux.CgroupsPath as the root container cgroup. func (c *Container) setupCgroupForRoot(conf *config.Config, spec *specs.Spec) (cgroup.Cgroup, cgroup.Cgroup, error) { var parentCgroup cgroup.Cgroup if parentPath, ok := spec.Annotations[cgroupParentAnnotation]; ok { var err error parentCgroup, err = c.createParentCgroup(parentPath, conf) if err != nil { return nil, nil, err } } else { var err error if spec.Linux == nil || spec.Linux.CgroupsPath == "" { return nil, nil, nil } parentCgroup, err = c.createParentCgroup(spec.Linux.CgroupsPath, conf) if parentCgroup == nil || err != nil { return nil, nil, err } } var err error parentCgroup, err = cgroupInstall(conf, parentCgroup, spec.Linux.Resources) if parentCgroup == nil || err != nil { return nil, nil, err } subCgroup, err := c.setupCgroupForSubcontainer(conf, spec) if err != nil { _ = parentCgroup.Uninstall() return nil, nil, err } return parentCgroup, subCgroup, nil } // setupCgroupForSubcontainer sets up empty cgroups for subcontainers. Since // subcontainers run exclusively inside the sandbox, subcontainer cgroups on the // host have no effect on them. However, some tools (e.g. cAdvisor) uses cgroups // paths to discover new containers and report stats for them. func (c *Container) setupCgroupForSubcontainer(conf *config.Config, spec *specs.Spec) (cgroup.Cgroup, error) { if isRoot(spec) { if _, ok := spec.Annotations[cgroupParentAnnotation]; !ok { return nil, nil } } if spec.Linux == nil || spec.Linux.CgroupsPath == "" { return nil, nil } cg, err := c.createParentCgroup(spec.Linux.CgroupsPath, conf) if cg == nil || err != nil { return nil, err } // Use empty resources, just want the directory structure created. return cgroupInstall(conf, cg, &specs.LinuxResources{}) } // donateGoferProfileFDs will open profile files and donate their FDs to the // gofer. func (c *Container) donateGoferProfileFDs(conf *config.Config, donations *donation.Agency) error { // The gofer profile files are named based on the provided flag, but // suffixed with "gofer" and the container ID to avoid collisions with // sentry profile files or profile files from other gofers. // // TODO(b/243183772): Merge gofer profile data with sentry profile data // into a single file. profSuffix := ".gofer." + c.ID const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC profile.UpdatePaths(conf, starttime.Get()) if conf.ProfileBlock != "" { if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock+profSuffix, profFlags); err != nil { return err } } if conf.ProfileCPU != "" { if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU+profSuffix, profFlags); err != nil { return err } } if conf.ProfileHeap != "" { if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap+profSuffix, profFlags); err != nil { return err } } if conf.ProfileMutex != "" { if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex+profSuffix, profFlags); err != nil { return err } } if conf.TraceFile != "" { if err := donations.OpenAndDonate("trace-fd", conf.TraceFile+profSuffix, profFlags); err != nil { return err } } return nil } // cgroupInstall creates cgroups dir structure and sets their respective // resources. In case of success, returns the cgroups instance and nil error. // For rootless, it's possible that cgroups operations fail, in this case the // error is suppressed and a nil cgroups instance is returned to indicate that // no cgroups was configured. 
func cgroupInstall(conf *config.Config, cg cgroup.Cgroup, res *specs.LinuxResources) (cgroup.Cgroup, error) { if err := cg.Install(res); err != nil { switch { case (errors.Is(err, unix.EACCES) || errors.Is(err, unix.EROFS)) && conf.Rootless: log.Warningf("Skipping cgroup configuration in rootless mode: %v", err) return nil, nil default: return nil, fmt.Errorf("configuring cgroup: %v", err) } } return cg, nil } func modifySpecForDirectfs(conf *config.Config, spec *specs.Spec) error { if !conf.DirectFS || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { return nil } if conf.Network == config.NetworkHost { // Hostnet feature requires the sandbox to run in the current user // namespace, in which the network namespace is configured. return nil } if _, ok := specutils.GetNS(specs.UserNamespace, spec); ok { // If the spec already defines a userns, use that. return nil } if spec.Linux == nil { spec.Linux = &specs.Linux{} } if len(spec.Linux.UIDMappings) > 0 || len(spec.Linux.GIDMappings) > 0 { // The spec can only define UID/GID mappings with a userns (checked above). return fmt.Errorf("spec defines UID/GID mappings without defining userns") } // Run the sandbox in a new user namespace with identity UID/GID mappings. log.Debugf("Configuring container with a new userns with identity user mappings into current userns") spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{Type: specs.UserNamespace}) uidMappings, err := getIdentityMapping("uid_map") if err != nil { return err } spec.Linux.UIDMappings = uidMappings logIDMappings(uidMappings, "UID") gidMappings, err := getIdentityMapping("gid_map") if err != nil { return err } spec.Linux.GIDMappings = gidMappings logIDMappings(gidMappings, "GID") return nil } func getIdentityMapping(mapFileName string) ([]specs.LinuxIDMapping, error) { // See user_namespaces(7) to understand how /proc/self/{uid/gid}_map files // are organized. mapFile := path.Join("/proc/self", mapFileName) file, err := os.Open(mapFile) if err != nil { return nil, fmt.Errorf("failed to open %s: %v", mapFile, err) } defer file.Close() var mappings []specs.LinuxIDMapping scanner := bufio.NewScanner(file) for scanner.Scan() { line := scanner.Text() var myStart, parentStart, rangeLen uint32 numParsed, err := fmt.Sscanf(line, "%d %d %d", &myStart, &parentStart, &rangeLen) if err != nil { return nil, fmt.Errorf("failed to parse line %q in file %s: %v", line, mapFile, err) } if numParsed != 3 { return nil, fmt.Errorf("failed to parse 3 integers from line %q in file %s", line, mapFile) } // Create an identity mapping with the current userns. mappings = append(mappings, specs.LinuxIDMapping{ ContainerID: myStart, HostID: myStart, Size: rangeLen, }) } if err := scanner.Err(); err != nil { return nil, fmt.Errorf("failed to scan file %s: %v", mapFile, err) } return mappings, nil } func logIDMappings(mappings []specs.LinuxIDMapping, idType string) { if !log.IsLogging(log.Debug) { return } log.Debugf("%s Mappings:", idType) for _, m := range mappings { log.Debugf("\tContainer ID: %d, Host ID: %d, Range Length: %d", m.ContainerID, m.HostID, m.Size) } } // nvProxyPreGoferHostSetup does host setup work so that `nvidia-container-cli // configure` can be run in the future. It runs before any Gofers start. // It verifies that all the required dependencies are in place, loads kernel // modules, and ensures the correct device files exist and are accessible. // This should only be necessary once on the host. 
It should be run during the // root container setup sequence to make sure it has run at least once. func nvProxyPreGoferHostSetup(spec *specs.Spec, conf *config.Config) error { if !specutils.GPUFunctionalityRequestedViaHook(spec, conf) { return nil } // Locate binaries. For security reasons, unlike // nvidia-container-runtime-hook, we don't add the container's filesystem // to the search path. We also don't support // /etc/nvidia-container-runtime/config.toml to avoid importing a TOML // parser. cliPath, err := exec.LookPath("nvidia-container-cli") if err != nil { return fmt.Errorf("failed to locate nvidia-container-cli in PATH: %w", err) } // nvidia-container-cli --load-kmods seems to be a noop; load kernel modules ourselves. nvproxyLoadKernelModules() if _, err := os.Stat("/dev/nvidiactl"); err != nil { if !os.IsNotExist(err) { return fmt.Errorf("stat(2) for /dev/nvidiactl failed: %w", err) } // Run `nvidia-container-cli info`. // This has the side-effect of automatically creating GPU device files. argv := []string{cliPath, "--load-kmods", "info"} log.Debugf("Executing %q", argv) var infoOut, infoErr strings.Builder cmd := exec.Cmd{ Path: argv[0], Args: argv, Env: os.Environ(), Stdout: &infoOut, Stderr: &infoErr, } if err := cmd.Run(); err != nil { return fmt.Errorf("nvidia-container-cli info failed, err: %v\nstdout: %s\nstderr: %s", err, infoOut.String(), infoErr.String()) } log.Debugf("nvidia-container-cli info: %v", infoOut.String()) } return nil } // nvproxyLoadKernelModules loads NVIDIA-related kernel modules with modprobe. func nvproxyLoadKernelModules() { for _, mod := range [...]string{ "nvidia", "nvidia-uvm", } { argv := []string{ "/sbin/modprobe", mod, } log.Debugf("Executing %q", argv) var stdout, stderr strings.Builder cmd := exec.Cmd{ Path: argv[0], Args: argv, Env: os.Environ(), Stdout: &stdout, Stderr: &stderr, } if err := cmd.Run(); err != nil { // This might not be fatal since modules may already be loaded. Log // the failure but continue. log.Warningf("modprobe %s failed, err: %v\nstdout: %s\nstderr: %s", mod, err, stdout.String(), stderr.String()) } } } // nvproxySetupAfterGoferUserns runs `nvidia-container-cli configure`. // This sets up the container filesystem with bind mounts that allow it to // use NVIDIA devices. // // This should be called during the Gofer setup process, as the bind mounts // are created in the Gofer's mount namespace. // If successful, it returns a callback function that must be called once the // Gofer process has started. // This function has no effect if nvproxy functionality is not requested. // // This function essentially replicates // nvidia-container-toolkit:cmd/nvidia-container-runtime-hook, i.e. the // binary that executeHook() is hard-coded to skip, with differences noted // inline. We do this rather than move the prestart hook because the // "runtime environment" in which prestart hooks execute is vaguely // defined, such that nvidia-container-runtime-hook and existing runsc // hooks differ in their expected environment. // // Note that nvidia-container-cli will set up files in /dev and /proc which // are useless, since they will be hidden by sentry devtmpfs and procfs // respectively (and some device files will have the wrong device numbers // from the application's perspective since nvproxy may register device // numbers in sentry VFS that differ from those on the host, e.g. for // nvidia-uvm). These files are separately created during sandbox VFS // construction. 
For this reason, we don't need to parse // NVIDIA_VISIBLE_DEVICES or pass --device to nvidia-container-cli. func nvproxySetupAfterGoferUserns(spec *specs.Spec, conf *config.Config, goferCmd *exec.Cmd, goferDonations *donation.Agency) (func() error, error) { if !specutils.GPUFunctionalityRequestedViaHook(spec, conf) { return func() error { return nil }, nil } if spec.Root == nil { return nil, fmt.Errorf("spec missing root filesystem") } // nvidia-container-cli does not create this directory. if err := os.MkdirAll(path.Join(spec.Root.Path, "proc", "driver", "nvidia"), 0555); err != nil { return nil, fmt.Errorf("failed to create /proc/driver/nvidia in app filesystem: %w", err) } cliPath, err := exec.LookPath("nvidia-container-cli") if err != nil { return nil, fmt.Errorf("failed to locate nvidia-container-cli in PATH: %w", err) } // On Ubuntu, ldconfig is a wrapper around ldconfig.real, and we need the latter. var ldconfigPath string if _, err := os.Stat("/sbin/ldconfig.real"); err == nil { ldconfigPath = "/sbin/ldconfig.real" } else { ldconfigPath = "/sbin/ldconfig" } devices, err := specutils.ParseNvidiaVisibleDevices(spec) if err != nil { return nil, fmt.Errorf("failed to get nvidia device numbers: %w", err) } // Create synchronization FD for nvproxy. fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil { return nil, err } ourEnd := os.NewFile(uintptr(fds[0]), "nvproxy sync runsc FD") goferEnd := os.NewFile(uintptr(fds[1]), "nvproxy sync gofer FD") goferDonations.DonateAndClose("sync-nvproxy-fd", goferEnd) return func() error { defer ourEnd.Close() argv := []string{ cliPath, "--load-kmods", "configure", fmt.Sprintf("--ldconfig=@%s", ldconfigPath), "--no-cgroups", // runsc doesn't configure device cgroups yet "--utility", "--compute", fmt.Sprintf("--pid=%d", goferCmd.Process.Pid), fmt.Sprintf("--device=%s", devices), spec.Root.Path, } log.Debugf("Executing %q", argv) var stdout, stderr strings.Builder cmd := exec.Cmd{ Path: argv[0], Args: argv, Env: os.Environ(), Stdout: &stdout, Stderr: &stderr, } if err := cmd.Run(); err != nil { return fmt.Errorf("nvidia-container-cli configure failed, err: %v\nstdout: %s\nstderr: %s", err, stdout.String(), stderr.String()) } return nil }, nil } // CheckStopped checks if the container is stopped and updates its status. func (c *Container) CheckStopped() { if state, err := c.Sandbox.ContainerRuntimeState(c.ID); err != nil { log.Warningf("Cannot find if container %v exists, checking if sandbox %v is running, err: %v", c.ID, c.Sandbox.ID, err) if !c.IsSandboxRunning() { log.Warningf("Sandbox isn't running anymore, marking container %v as stopped:", c.ID) c.changeStatus(Stopped) } } else { if state == boot.RuntimeStateStopped { log.Warningf("Container %v is stopped", c.ID) c.changeStatus(Stopped) } } } golang-gvisor-gvisor-0.0~20240729.0/runsc/container/container_state_autogen.go000066400000000000000000000000731465435605700272120ustar00rootroot00000000000000// automatically generated by stateify. package container golang-gvisor-gvisor-0.0~20240729.0/runsc/container/hook.go000066400000000000000000000061641465435605700232550ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package container import ( "bytes" "encoding/json" "fmt" "os/exec" "path/filepath" "strings" "time" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/log" ) // This file implements hooks as defined in OCI spec: // https://github.com/opencontainers/runtime-spec/blob/master/config.md#toc22 // // "hooks":{ // "prestart":[{ // "path":"/usr/bin/dockerd", // "args":[ // "libnetwork-setkey", "arg2", // ] // }] // }, // executeHooksBestEffort executes hooks and logs warning in case they fail. // Runs all hooks, always. func executeHooksBestEffort(hooks []specs.Hook, s specs.State) { for _, h := range hooks { if err := executeHook(h, s); err != nil { log.Warningf("Failure to execute hook %+v, err: %v", h, err) } } } // executeHooks executes hooks until the first one fails or they all execute. func executeHooks(hooks []specs.Hook, s specs.State) error { for _, h := range hooks { if err := executeHook(h, s); err != nil { return err } } return nil } func executeHook(h specs.Hook, s specs.State) error { log.Debugf("Executing hook %+v, state: %+v", h, s) if strings.TrimSpace(h.Path) == "" { return fmt.Errorf("empty path for hook") } if !filepath.IsAbs(h.Path) { return fmt.Errorf("path for hook is not absolute: %q", h.Path) } // Don't invoke nvidia-container-runtime-hook at prestart, which may be // configured by e.g. Docker's --gpus flag, since // nvidia-container-runtime-hook doesn't understand gVisor's bifurcation // between sentry and application filesystems. if strings.HasSuffix(h.Path, "/nvidia-container-runtime-hook") { log.Infof("Skipping nvidia-container-runtime-hook") return nil } b, err := json.Marshal(s) if err != nil { return err } var stdout, stderr bytes.Buffer cmd := exec.Cmd{ Path: h.Path, Args: h.Args, Env: h.Env, Stdin: bytes.NewReader(b), Stdout: &stdout, Stderr: &stderr, } if err := cmd.Start(); err != nil { return err } c := make(chan error, 1) go func() { c <- cmd.Wait() }() var timer <-chan time.Time if h.Timeout != nil { timer = time.After(time.Duration(*h.Timeout) * time.Second) } select { case err := <-c: if err != nil { return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String()) } case <-timer: _ = cmd.Process.Kill() _ = cmd.Wait() return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String()) } log.Debugf("Execute hook %q success!", h.Path) return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/container/state_file.go000066400000000000000000000303341465435605700244300ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package container import ( "encoding/json" "errors" "fmt" "io/ioutil" "os" "path/filepath" "regexp" "strings" "github.com/gofrs/flock" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" ) const stateFileExtension = "state" // ErrStateFileLocked is returned by Load() when the state file is locked // and TryLock is enabled. var ErrStateFileLocked = errors.New("state file locked") // TryLock represents whether we should block waiting for the lock to be acquired or not. type TryLock bool const ( // BlockAcquire means we will block until the lock can be acquired. BlockAcquire TryLock = false // TryAcquire means we will fail fast if the lock cannot be acquired. TryAcquire TryLock = true ) // LoadOpts provides options for Load()ing a container. type LoadOpts struct { // Exact tells whether the search should be exact. See Load() for more. Exact bool // SkipCheck tells Load() to skip checking if the container is running. SkipCheck bool // TryLock tells Load() to fail if the container state file cannot be locked, // as opposed to blocking until it is available. // When the state file cannot be locked, it will error with ErrStateFileLocked. TryLock TryLock // RootContainer when true matches the search only with the root container of // a sandbox. This is used when looking for a sandbox given that root // container and sandbox share the same ID. RootContainer bool } // Load loads a container with the given id from a metadata file. "id" may // be an abbreviation of the full container id in case LoadOpts.Exact is not // set. It also checks if the container is still running, in order to return // an error to the caller earlier. This check is skipped if LoadOpts.SkipCheck // is set. // // Returns ErrNotExist if no container is found. Returns an error in case more than // one container matching the ID prefix is found. func Load(rootDir string, id FullID, opts LoadOpts) (*Container, error) { log.Debugf("Load container, rootDir: %q, id: %+v, opts: %+v", rootDir, id, opts) if !opts.Exact { var err error id, err = findContainerID(rootDir, id.ContainerID) if err != nil { // Preserve error so that callers can distinguish 'not found' errors. return nil, err } } if err := id.validate(); err != nil { return nil, fmt.Errorf("invalid container id: %v", err) } state := StateFile{ RootDir: rootDir, ID: id, } defer state.close() c := &Container{} if err := state.load(c, opts); err != nil { if os.IsNotExist(err) { // Preserve error so that callers can distinguish 'not found' errors. return nil, err } return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err) } if opts.RootContainer && c.ID != c.Sandbox.ID { return nil, fmt.Errorf("ID %q doesn't belong to a sandbox", id) } if !opts.SkipCheck { // If the status is "Running" or "Created", check that the sandbox/container // is still running, setting it to Stopped if not. // // This is inherently racy. switch c.Status { case Created, Running: c.CheckStopped() } } return c, nil } // List returns all container ids in the given root directory. func List(rootDir string) ([]FullID, error) { log.Debugf("List containers %q", rootDir) return listMatch(rootDir, FullID{}) } // ListSandboxes returns all sandbox ids in the given root directory.
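// Sandbox IDs are recovered from the state file names themselves: every state
// file is named "<container-id>_sandbox:<sandbox-id>.state" (see buildPath),
// so this lists all containers and de-duplicates the sandbox half of each
// FullID.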
func ListSandboxes(rootDir string) ([]FullID, error) { log.Debugf("List containers %q", rootDir) ids, err := List(rootDir) if err != nil { return nil, err } sandboxes := make(map[string]struct{}, len(ids)) for _, id := range ids { sandboxes[id.SandboxID] = struct{}{} } // Reset ids to list only sandboxes. ids = nil for id := range sandboxes { ids = append(ids, FullID{SandboxID: id, ContainerID: id}) } return ids, nil } // listMatch returns all container ids that match the provided id. func listMatch(rootDir string, id FullID) ([]FullID, error) { id.SandboxID += "*" id.ContainerID += "*" pattern := buildPath(rootDir, id, stateFileExtension) list, err := filepath.Glob(pattern) if err != nil { return nil, err } var out []FullID for _, path := range list { id, err := parseFileName(filepath.Base(path)) if err == nil { out = append(out, id) } } return out, nil } // LoadSandbox loads all containers that belong to the sandbox with the given // ID. func LoadSandbox(rootDir, id string, opts LoadOpts) ([]*Container, error) { cids, err := listMatch(rootDir, FullID{SandboxID: id}) if err != nil { return nil, err } // Override load options that don't make sense in the context of this function. opts.SkipCheck = true // We're loading all containers irrespective of status. opts.RootContainer = false // We're loading all containers, not just the root one. opts.Exact = true // We'll iterate over exact container IDs below. // Load the container metadata. var containers []*Container for _, cid := range cids { container, err := Load(rootDir, cid, opts) if err != nil { // Container file may not exist if it raced with creation/deletion or // directory was left behind. Load provides a snapshot in time, so it's // fine to skip it. if os.IsNotExist(err) { continue } return nil, fmt.Errorf("loading sandbox %q, failed to load container %q: %v", id, cid, err) } containers = append(containers, container) } return containers, nil } func findContainerID(rootDir, partialID string) (FullID, error) { // Check whether the id fully specifies an existing container. pattern := buildPath(rootDir, FullID{SandboxID: "*", ContainerID: partialID + "*"}, stateFileExtension) list, err := filepath.Glob(pattern) if err != nil { return FullID{}, err } switch len(list) { case 0: return FullID{}, os.ErrNotExist case 1: return parseFileName(filepath.Base(list[0])) } // Now see whether id could be an abbreviation of exactly 1 of the // container ids. If id is ambiguous (it could match more than 1 // container), it is an error. ids, err := List(rootDir) if err != nil { return FullID{}, err } var rv *FullID for _, id := range ids { if strings.HasPrefix(id.ContainerID, partialID) { if rv != nil { return FullID{}, fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id) } rv = &id } } if rv == nil { return FullID{}, os.ErrNotExist } log.Debugf("abbreviated id %q resolves to full id %v", partialID, *rv) return *rv, nil } func parseFileName(name string) (FullID, error) { re := regexp.MustCompile(`([\w+-\.]+)_sandbox:([\w+-\.]+)\.` + stateFileExtension) groups := re.FindStringSubmatch(name) if len(groups) != 3 { return FullID{}, fmt.Errorf("invalid state file name format: %q", name) } id := FullID{ SandboxID: groups[2], ContainerID: groups[1], } if err := id.validate(); err != nil { return FullID{}, fmt.Errorf("invalid state file name %q: %w", name, err) } return id, nil } // FullID combines sandbox and container ID to identify a container. 
Sandbox ID // is used to allow all containers for a given sandbox to be loaded by matching // sandbox ID in the file name. type FullID struct { SandboxID string `json:"sandboxId"` ContainerID string `json:"containerId"` } func (f *FullID) String() string { return f.SandboxID + "/" + f.ContainerID } func (f *FullID) validate() error { if err := validateID(f.SandboxID); err != nil { return err } return validateID(f.ContainerID) } // StateFile handles load from/save to container state safely from multiple // processes. It uses a lock file to provide synchronization between operations. // // The lock file is located at: "${s.RootDir}/${container-id}_sandbox:${sandbox-id}.lock". // The state file is located at: "${s.RootDir}/${container-id}_sandbox:${sandbox-id}.state". type StateFile struct { // RootDir is the directory containing the container metadata file. RootDir string `json:"rootDir"` // ID is the sandbox+container ID. ID FullID `json:"id"` // // Fields below this line are not saved in the state file and will not // be preserved across commands. // once sync.Once `nojson:"true"` flock *flock.Flock `nojson:"true"` } // lock globally locks all locking operations for the container. func (s *StateFile) lock(tryLock TryLock) error { s.once.Do(func() { s.flock = flock.New(s.lockPath()) }) if tryLock { gotLock, err := s.flock.TryLock() if err != nil { return fmt.Errorf("acquiring lock on %q: %v", s.flock, err) } if !gotLock { return ErrStateFileLocked } } else { if err := s.flock.Lock(); err != nil { return fmt.Errorf("acquiring lock on %q: %v", s.flock, err) } } return nil } // LockForNew acquires the lock and checks that the state file doesn't exist. This // is done to ensure that more than one creation didn't race to create // containers with the same ID. func (s *StateFile) LockForNew() error { if err := s.lock(BlockAcquire); err != nil { return err } // Checks if the container already exists by looking for the metadata file. if _, err := os.Stat(s.statePath()); err == nil { s.UnlockOrDie() return fmt.Errorf("container already exists") } else if !os.IsNotExist(err) { s.UnlockOrDie() return fmt.Errorf("looking for existing container: %v", err) } return nil } // unlock globally unlocks all locking operations for the container. func (s *StateFile) unlock() error { if !s.flock.Locked() { panic("unlock called without lock held") } if err := s.flock.Unlock(); err != nil { log.Warningf("Error to release lock on %q: %v", s.flock, err) return fmt.Errorf("releasing lock on %q: %v", s.flock, err) } return nil } // UnlockOrDie is the same as unlock() but panics in case of failure. func (s *StateFile) UnlockOrDie() { if !s.flock.Locked() { panic("unlock called without lock held") } if err := s.flock.Unlock(); err != nil { panic(fmt.Sprintf("Error releasing lock on %q: %v", s.flock, err)) } } // SaveLocked saves 'v' to the state file. // // Preconditions: lock(*) must have been called before. func (s *StateFile) SaveLocked(v any) error { if !s.flock.Locked() { panic("saveLocked called without lock held") } meta, err := json.Marshal(v) if err != nil { return err } if err := ioutil.WriteFile(s.statePath(), meta, 0640); err != nil { return fmt.Errorf("writing json file: %v", err) } return nil } // Stat returns the result of calling stat() on the state file. // Doing so does not require locking.
func (s *StateFile) Stat() (os.FileInfo, error) { return os.Stat(s.statePath()) } func (s *StateFile) load(v any, opts LoadOpts) error { if err := s.lock(opts.TryLock); err != nil { return err } defer s.UnlockOrDie() metaBytes, err := ioutil.ReadFile(s.statePath()) if err != nil { return err } return json.Unmarshal(metaBytes, &v) } func (s *StateFile) close() error { if s.flock == nil { return nil } if s.flock.Locked() { panic("Closing locked file") } return s.flock.Close() } func buildPath(rootDir string, id FullID, extension string) string { // Note: "_" and ":" are not valid in IDs. name := fmt.Sprintf("%s_sandbox:%s.%s", id.ContainerID, id.SandboxID, extension) return filepath.Join(rootDir, name) } // statePath is the full path to the state file. func (s *StateFile) statePath() string { return buildPath(s.RootDir, s.ID, stateFileExtension) } // lockPath is the full path to the lock file. func (s *StateFile) lockPath() string { return buildPath(s.RootDir, s.ID, "lock") } // Destroy deletes all state created by the stateFile. It may be called with the // lock file held. In that case, the lock file must still be unlocked and // properly closed after destroy returns. func (s *StateFile) Destroy() error { if err := os.Remove(s.statePath()); err != nil && !os.IsNotExist(err) { return err } if err := os.Remove(s.lockPath()); err != nil && !os.IsNotExist(err) { return err } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/container/status.go000066400000000000000000000026321465435605700236340ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package container import ( specs "github.com/opencontainers/runtime-spec/specs-go" ) // Status is a local type alias. type Status = specs.ContainerState const ( // Created indicates "the runtime has finished the create operation and // the container process has neither exited nor executed the // user-specified program" Created = specs.StateCreated // Creating indicates "the container is being created". Creating = specs.StateCreating // Running indicates "the container process has executed the // user-specified program but has not exited". Running = specs.StateRunning // Stopped indicates "the container process has exited". Stopped = specs.StateStopped // Paused indicates that the process within the container has been // suspended. This is a local status, not part of the spec. Paused = Status("paused") ) golang-gvisor-gvisor-0.0~20240729.0/runsc/donation/000077500000000000000000000000001465435605700216105ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/donation/donation.go000066400000000000000000000076251465435605700237640ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package donation tracks files that are being donated to a child process and // uses flags to notify the child process where the FDs are. package donation import ( "fmt" "os" "os/exec" "time" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/runsc/specutils" ) // LogDonations logs the FDs we are donating in the command. func LogDonations(cmd *exec.Cmd) { for i, f := range cmd.ExtraFiles { log.Debugf("Donating FD %d: %q", i+3, f.Name()) } } // Agency keeps track of files that need to be donated to a child process. type Agency struct { donations []donation closePending []*os.File } type donation struct { flag string files []*os.File } // Donate sets up the given files to be donated to another process. The FD // in which the new file will appear in the child process is added as a flag to // the child process, e.g. --flag=3. In case the file is nil, -1 is used for the // flag value and no file is donated to the next process. func (f *Agency) Donate(flag string, files ...*os.File) { f.donations = append(f.donations, donation{flag: flag, files: files}) } // DonateAndClose does the same as Donate, but takes ownership of the files // passed in. func (f *Agency) DonateAndClose(flag string, files ...*os.File) { f.Donate(flag, files...) f.closePending = append(f.closePending, files...) } // OpenAndDonate is similar to DonateAndClose but handles the opening of the // file for convenience. It's a noop if path is empty. func (f *Agency) OpenAndDonate(flag, path string, flags int) error { if len(path) == 0 { return nil } file, err := os.OpenFile(path, flags, 0644) if err != nil { return err } f.DonateAndClose(flag, file) return nil } // DonateDebugLogFile is similar to DonateAndClose but handles the opening of // the file using specutils.DebugLogFile() for convenience. It's a noop if // path is empty. func (f *Agency) DonateDebugLogFile(flag, logPattern, command, test string, timestamp time.Time) error { if len(logPattern) == 0 { return nil } file, err := specutils.DebugLogFile(logPattern, command, test, timestamp) if err != nil { return fmt.Errorf("opening debug log file in %q: %v", logPattern, err) } f.DonateAndClose(flag, file) return nil } // Transfer sets up all files and flags to cmd. It can be called multiple times // to partially transfer files to cmd. func (f *Agency) Transfer(cmd *exec.Cmd, nextFD int) int { for _, d := range f.donations { for _, file := range d.files { fd := -1 if file != nil { cmd.ExtraFiles = append(cmd.ExtraFiles, file) fd = nextFD nextFD++ } cmd.Args = append(cmd.Args, fmt.Sprintf("--%s=%d", d.flag, fd)) } } // Reset donations made so far in case more transfers are needed. f.donations = nil return nextFD } // DonateAndTransferCustomFiles sets up the flags for passing file descriptors from the // host to the sandbox.
Making use of the agency is not necessary, func DonateAndTransferCustomFiles(cmd *exec.Cmd, nextFD int, files map[int]*os.File) int { for fd, file := range files { cmd.Args = append(cmd.Args, fmt.Sprintf("--pass-fd=%d:%d", nextFD, fd)) cmd.ExtraFiles = append(cmd.ExtraFiles, file) nextFD++ } return nextFD } // Close closes any files the agency has taken ownership over. func (f *Agency) Close() { for _, file := range f.closePending { if file != nil { _ = file.Close() } } f.closePending = nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/donation/donation_state_autogen.go000066400000000000000000000000721465435605700266730ustar00rootroot00000000000000// automatically generated by stateify. package donation golang-gvisor-gvisor-0.0~20240729.0/runsc/flag/000077500000000000000000000000001465435605700207065ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/flag/flag.go000066400000000000000000000026701465435605700221530ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false // Package flag wraps flag primitives. package flag import ( "flag" ) // FlagSet is an alias for flag.FlagSet. type FlagSet = flag.FlagSet // Flag is an alias for flag.Flag. type Flag = flag.Flag // Aliases for flag functions. var ( Bool = flag.Bool CommandLine = flag.CommandLine Duration = flag.Duration Float64 = flag.Float64 Int = flag.Int Int64 = flag.Int64 Lookup = flag.Lookup NewFlagSet = flag.NewFlagSet Parse = flag.Parse String = flag.String StringVar = flag.StringVar Uint = flag.Uint Uint64 = flag.Uint64 Var = flag.Var ) // ContinueOnError is an alias for flag.ContinueOnError. const ContinueOnError = flag.ContinueOnError // Get returns the flag's underlying object. func Get(v flag.Value) any { return v.(flag.Getter).Get() } golang-gvisor-gvisor-0.0~20240729.0/runsc/flag/flag_state_autogen.go000066400000000000000000000001321465435605700250640ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package flag golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/000077500000000000000000000000001465435605700214305ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/000077500000000000000000000000001465435605700227155ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/config.go000066400000000000000000000156151465435605700245210ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package filter import ( "os" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" ) // allowedSyscalls is the set of syscalls executed by the gofer. var allowedSyscalls = seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_ACCEPT: seccomp.MatchAll{}, unix.SYS_CLOCK_GETTIME: seccomp.MatchAll{}, unix.SYS_CLOSE: seccomp.MatchAll{}, unix.SYS_DUP: seccomp.MatchAll{}, unix.SYS_EPOLL_CTL: seccomp.MatchAll{}, unix.SYS_EPOLL_PWAIT: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(0), }, unix.SYS_EVENTFD2: seccomp.PerArg{ seccomp.EqualTo(0), seccomp.EqualTo(0), }, unix.SYS_EXIT: seccomp.MatchAll{}, unix.SYS_EXIT_GROUP: seccomp.MatchAll{}, unix.SYS_FCHMOD: seccomp.MatchAll{}, unix.SYS_FCHOWNAT: seccomp.MatchAll{}, unix.SYS_FCNTL: seccomp.Or{ seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.F_GETFL), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.F_SETFL), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.F_GETFD), }, // Used by flipcall.PacketWindowAllocator.Init(). seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.F_ADD_SEALS), }, }, unix.SYS_FSTAT: seccomp.MatchAll{}, unix.SYS_FSYNC: seccomp.MatchAll{}, unix.SYS_FUTEX: seccomp.Or{ seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(0), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(0), }, // Non-private futex used for flipcall. seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(linux.FUTEX_WAIT), seccomp.AnyValue{}, seccomp.AnyValue{}, }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(linux.FUTEX_WAKE), seccomp.AnyValue{}, seccomp.AnyValue{}, }, }, // getcpu is used by some versions of the Go runtime and by the hostcpu // package on arm64. unix.SYS_GETCPU: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(0), seccomp.EqualTo(0), }, unix.SYS_GETPID: seccomp.MatchAll{}, unix.SYS_GETRANDOM: seccomp.MatchAll{}, unix.SYS_GETTID: seccomp.MatchAll{}, unix.SYS_GETTIMEOFDAY: seccomp.MatchAll{}, unix.SYS_LSEEK: seccomp.MatchAll{}, unix.SYS_MADVISE: seccomp.MatchAll{}, unix.SYS_MEMFD_CREATE: seccomp.MatchAll{}, // Used by flipcall.PacketWindowAllocator.Init(). 
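// The MAP_SHARED case below is what allows mapping those memfd-backed
// flipcall packet windows, while the MAP_PRIVATE|MAP_ANONYMOUS variants cover
// ordinary Go runtime allocations.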
unix.SYS_MMAP: seccomp.Or{ seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MAP_SHARED), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_FIXED), }, }, unix.SYS_MPROTECT: seccomp.MatchAll{}, unix.SYS_MUNMAP: seccomp.MatchAll{}, unix.SYS_NANOSLEEP: seccomp.MatchAll{}, unix.SYS_OPENAT: seccomp.MatchAll{}, unix.SYS_PPOLL: seccomp.MatchAll{}, unix.SYS_PREAD64: seccomp.MatchAll{}, unix.SYS_PWRITE64: seccomp.MatchAll{}, unix.SYS_READ: seccomp.MatchAll{}, unix.SYS_RECVMSG: seccomp.Or{ seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MSG_DONTWAIT | unix.MSG_TRUNC), }, seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MSG_DONTWAIT | unix.MSG_TRUNC | unix.MSG_PEEK), }, }, unix.SYS_RESTART_SYSCALL: seccomp.MatchAll{}, // May be used by the runtime during panic(). unix.SYS_RT_SIGACTION: seccomp.MatchAll{}, unix.SYS_RT_SIGPROCMASK: seccomp.MatchAll{}, unix.SYS_RT_SIGRETURN: seccomp.MatchAll{}, unix.SYS_SCHED_YIELD: seccomp.MatchAll{}, unix.SYS_SENDMSG: seccomp.Or{ // Used by fdchannel.Endpoint.SendFD(). seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(0), }, // Used by unet.SocketWriter.WriteVec(). seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.MSG_DONTWAIT | unix.MSG_NOSIGNAL), }, }, unix.SYS_SHUTDOWN: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(unix.SHUT_RDWR), }, unix.SYS_SIGALTSTACK: seccomp.MatchAll{}, // Used by fdchannel.NewConnectedSockets(). unix.SYS_SOCKETPAIR: seccomp.PerArg{ seccomp.EqualTo(unix.AF_UNIX), seccomp.EqualTo(unix.SOCK_SEQPACKET | unix.SOCK_CLOEXEC), seccomp.EqualTo(0), }, unix.SYS_TGKILL: seccomp.PerArg{ seccomp.EqualTo(uint64(os.Getpid())), }, unix.SYS_WRITE: seccomp.MatchAll{}, }) var udsCommonSyscalls = seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_SOCKET: seccomp.Or{ seccomp.PerArg{ seccomp.EqualTo(unix.AF_UNIX), seccomp.EqualTo(unix.SOCK_STREAM), seccomp.EqualTo(0), }, seccomp.PerArg{ seccomp.EqualTo(unix.AF_UNIX), seccomp.EqualTo(unix.SOCK_DGRAM), seccomp.EqualTo(0), }, seccomp.PerArg{ seccomp.EqualTo(unix.AF_UNIX), seccomp.EqualTo(unix.SOCK_SEQPACKET), seccomp.EqualTo(0), }, }, }) var udsOpenSyscalls = seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_CONNECT: seccomp.MatchAll{}, }) var udsCreateSyscalls = seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_ACCEPT4: seccomp.MatchAll{}, unix.SYS_BIND: seccomp.MatchAll{}, unix.SYS_LISTEN: seccomp.MatchAll{}, }) var lisafsFilters = seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_FALLOCATE: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.EqualTo(0), }, unix.SYS_FCHMODAT: seccomp.MatchAll{}, unix.SYS_FGETXATTR: seccomp.MatchAll{}, unix.SYS_FSTATFS: seccomp.MatchAll{}, unix.SYS_GETDENTS64: seccomp.MatchAll{}, unix.SYS_LINKAT: seccomp.PerArg{ seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.NonNegativeFD{}, seccomp.AnyValue{}, seccomp.EqualTo(0), }, unix.SYS_MKDIRAT: seccomp.MatchAll{}, unix.SYS_MKNODAT: seccomp.MatchAll{}, unix.SYS_READLINKAT: seccomp.MatchAll{}, unix.SYS_RENAMEAT: seccomp.MatchAll{}, unix.SYS_SYMLINKAT: seccomp.MatchAll{}, unix.SYS_FTRUNCATE: seccomp.MatchAll{}, unix.SYS_UNLINKAT: seccomp.MatchAll{}, unix.SYS_UTIMENSAT: seccomp.MatchAll{}, }) 
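// Note: the rule sets above are deliberately split by feature. The UDS and
// lisafs sets (and profileFilters, defined separately) are only merged into
// the installed seccomp filter when the corresponding functionality is
// enabled, so the gofer's syscall surface stays as narrow as the
// configuration allows; the merge itself happens in this package's filter
// construction code, outside this file.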
golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/config_amd64.go000066400000000000000000000024131465435605700255040ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package filter import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/seccomp" ) func init() { allowedSyscalls.Set(unix.SYS_CLONE, seccomp.PerArg{ // parent_tidptr and child_tidptr are always 0 because neither // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used. seccomp.EqualTo( unix.CLONE_VM | unix.CLONE_FS | unix.CLONE_FILES | unix.CLONE_SETTLS | unix.CLONE_SIGHAND | unix.CLONE_SYSVSEM | unix.CLONE_THREAD), seccomp.AnyValue{}, // newsp seccomp.EqualTo(0), // parent_tidptr seccomp.EqualTo(0), // child_tidptr seccomp.AnyValue{}, // tls }) allowedSyscalls.Set(unix.SYS_NEWFSTATAT, seccomp.MatchAll{}) } golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/config_arm64.go000066400000000000000000000025571465435605700255330ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package filter import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/seccomp" ) func init() { allowedSyscalls.Set(unix.SYS_CLONE, seccomp.PerArg{ // parent_tidptr and child_tidptr are always 0 because neither // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used. seccomp.EqualTo( unix.CLONE_VM | unix.CLONE_FS | unix.CLONE_FILES | unix.CLONE_SIGHAND | unix.CLONE_SYSVSEM | unix.CLONE_THREAD), seccomp.AnyValue{}, // newsp // These arguments are left uninitialized by the Go // runtime, so they may be anything (and are unused by // the host). seccomp.AnyValue{}, // parent_tidptr seccomp.AnyValue{}, // tls seccomp.AnyValue{}, // child_tidptr }) allowedSyscalls.Set(unix.SYS_FSTATAT, seccomp.MatchAll{}) } golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/config_profile.go000066400000000000000000000026151465435605700262350ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package filter import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/seccomp" ) var profileFilters = seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_OPENAT: seccomp.PerArg{ seccomp.AnyValue{}, seccomp.AnyValue{}, seccomp.EqualTo(unix.O_RDONLY | unix.O_LARGEFILE | unix.O_CLOEXEC), }, unix.SYS_SETITIMER: seccomp.MatchAll{}, unix.SYS_TIMER_CREATE: seccomp.PerArg{ seccomp.EqualTo(unix.CLOCK_THREAD_CPUTIME_ID), /* which */ seccomp.AnyValue{}, /* sevp */ seccomp.AnyValue{}, /* timerid */ }, unix.SYS_TIMER_DELETE: seccomp.MatchAll{}, unix.SYS_TIMER_SETTIME: seccomp.PerArg{ seccomp.AnyValue{}, /* timerid */ seccomp.EqualTo(0), /* flags */ seccomp.AnyValue{}, /* new_value */ seccomp.EqualTo(0), /* old_value */ }, }) golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/extra_filters.go000066400000000000000000000016661465435605700261300ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !msan && !race // +build !msan,!race package filter import ( "gvisor.dev/gvisor/pkg/seccomp" ) // instrumentationFilters returns additional filters for syscalls used by // Go instrumentation tools, e.g. -race, -msan. // Returns empty when disabled. func instrumentationFilters() seccomp.SyscallRules { return seccomp.NewSyscallRules() } golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/extra_filters_msan.go000066400000000000000000000021621465435605700271360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build msan // +build msan package filter import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" ) // instrumentationFilters returns additional filters for syscalls used by MSAN. func instrumentationFilters() seccomp.SyscallRules { log.Warningf("*** SECCOMP WARNING: MSAN is enabled: syscall filters less restrictive!") return seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_SCHED_GETAFFINITY: seccomp.MatchAll{}, unix.SYS_SET_ROBUST_LIST: seccomp.MatchAll{}, }) } golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/extra_filters_race.go000066400000000000000000000032641465435605700271160ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build race // +build race package filter import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" ) // instrumentationFilters returns additional filters for syscalls used by TSAN. func instrumentationFilters() seccomp.SyscallRules { log.Warningf("*** SECCOMP WARNING: TSAN is enabled: syscall filters less restrictive!") return archInstrumentationFilters(seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ unix.SYS_BRK: seccomp.MatchAll{}, unix.SYS_CLOCK_NANOSLEEP: seccomp.MatchAll{}, unix.SYS_CLONE: seccomp.MatchAll{}, unix.SYS_CLONE3: seccomp.MatchAll{}, unix.SYS_FUTEX: seccomp.MatchAll{}, unix.SYS_MADVISE: seccomp.MatchAll{}, unix.SYS_MMAP: seccomp.MatchAll{}, unix.SYS_MUNLOCK: seccomp.MatchAll{}, unix.SYS_NANOSLEEP: seccomp.MatchAll{}, unix.SYS_OPENAT: seccomp.MatchAll{}, unix.SYS_RSEQ: seccomp.MatchAll{}, unix.SYS_SET_ROBUST_LIST: seccomp.MatchAll{}, unix.SYS_SCHED_GETAFFINITY: seccomp.MatchAll{}, })) } golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/extra_filters_race_amd64.go000066400000000000000000000016321465435605700301060ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build race // +build race package filter import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/seccomp" ) func archInstrumentationFilters(f seccomp.SyscallRules) seccomp.SyscallRules { f.Set(unix.SYS_OPEN, seccomp.MatchAll{}) // Used within glibc's malloc. f.Set(unix.SYS_TIME, seccomp.MatchAll{}) return f } golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/extra_filters_race_arm64.go000066400000000000000000000014151465435605700301230ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
//go:build race // +build race package filter import ( "gvisor.dev/gvisor/pkg/seccomp" ) func archInstrumentationFilters(f seccomp.SyscallRules) seccomp.SyscallRules { return f } golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/filter.go000066400000000000000000000036331465435605700245360ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package filter defines all syscalls the gofer is allowed to make, and // installs seccomp filters to prevent prohibited syscalls in case it's // compromised. package filter import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" ) // Options are seccomp filter related options. type Options struct { UDSOpenEnabled bool UDSCreateEnabled bool ProfileEnabled bool DirectFS bool } // Install installs seccomp filters. func Install(opt Options) error { s := allowedSyscalls if opt.ProfileEnabled { report("profile enabled: syscall filters less restrictive!") s.Merge(profileFilters) } if opt.UDSOpenEnabled || opt.UDSCreateEnabled { report("host UDS enabled: syscall filters less restrictive!") s.Merge(udsCommonSyscalls) if opt.UDSOpenEnabled { s.Merge(udsOpenSyscalls) } if opt.UDSCreateEnabled { s.Merge(udsCreateSyscalls) } } // Set of additional filters used by -race and -msan. Returns empty // when not enabled. s.Merge(instrumentationFilters()) // When DirectFS is not enabled, filters for LisaFS are installed. if !opt.DirectFS { s.Merge(lisafsFilters) } return seccomp.Install(s, seccomp.DenyNewExecMappings, seccomp.DefaultProgramOptions()) } // report writes a warning message to the log. func report(msg string) { log.Warningf("*** SECCOMP WARNING: %s", msg) } golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/filter_amd64_state_autogen.go000066400000000000000000000001321465435605700304420ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 // +build amd64 package filter golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/filter_arm64_state_autogen.go000066400000000000000000000001321465435605700304600ustar00rootroot00000000000000// automatically generated by stateify. //go:build arm64 // +build arm64 package filter golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/filter_race_amd64_state_autogen.go000066400000000000000000000001301465435605700314320ustar00rootroot00000000000000// automatically generated by stateify. //go:build race // +build race package filter golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/filter_race_arm64_state_autogen.go000066400000000000000000000001301465435605700314500ustar00rootroot00000000000000// automatically generated by stateify. //go:build race // +build race package filter golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/filter_race_state_autogen.go000066400000000000000000000001301465435605700304370ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build race // +build race package filter golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/filter/filter_state_autogen.go000066400000000000000000000001661465435605700274560ustar00rootroot00000000000000// automatically generated by stateify. //go:build !msan && !race && msan // +build !msan,!race,msan package filter golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/fsgofer_state_autogen.go000066400000000000000000000000711465435605700263320ustar00rootroot00000000000000// automatically generated by stateify. package fsgofer golang-gvisor-gvisor-0.0~20240729.0/runsc/fsgofer/lisafs.go000066400000000000000000001107411465435605700232440ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package fsgofer provides a lisafs server implementation which gives access // to local files. package fsgofer import ( "fmt" "io" "math" "os" "path" "path/filepath" "strconv" "sync" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/cleanup" rwfd "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fsutil" "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/runsc/config" ) // LINT.IfChange const ( openFlags = unix.O_NOFOLLOW | unix.O_CLOEXEC // UNIX_PATH_MAX as defined in include/uapi/linux/un.h. unixPathMax = 108 ) // Config sets configuration options for each attach point. type Config struct { // ROMount is set to true if this is a readonly mount. ROMount bool // PanicOnWrite panics on attempts to write to RO mounts. PanicOnWrite bool // HostUDS signals whether the gofer can connect to host unix domain sockets. HostUDS config.HostUDS // HostFifo signals whether the gofer can connect to host FIFOs. HostFifo config.HostFifo // DonateMountPointFD indicates whether a host FD to the mount point should // be donated to the client on Mount RPC. DonateMountPointFD bool } var procSelfFD *rwfd.FD // OpenProcSelfFD opens the /proc/self/fd directory, which will be used to // reopen file descriptors. func OpenProcSelfFD(path string) error { d, err := unix.Open(path, unix.O_RDONLY|unix.O_DIRECTORY, 0) if err != nil { return fmt.Errorf("error opening /proc/self/fd: %v", err) } procSelfFD = rwfd.New(d) return nil } // LisafsServer implements lisafs.ServerImpl for fsgofer. type LisafsServer struct { lisafs.Server config Config } var _ lisafs.ServerImpl = (*LisafsServer)(nil) // NewLisafsServer initializes a new lisafs server for fsgofer. func NewLisafsServer(config Config) *LisafsServer { s := &LisafsServer{config: config} s.Server.Init(s, lisafs.ServerOpts{ WalkStatSupported: true, SetAttrOnDeleted: true, AllocateOnDeleted: true, }) return s } // Mount implements lisafs.ServerImpl.Mount. 
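// Mount opens the attach point named by mountNode with tryOpen, rejects
// unsupported file types, and, when Config.DonateMountPointFD is set, dups
// the resulting host FD so that it can be donated to the client alongside
// the stat result.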
func (s *LisafsServer) Mount(c *lisafs.Connection, mountNode *lisafs.Node) (*lisafs.ControlFD, linux.Statx, int, error) { mountPath := mountNode.FilePath() rootHostFD, err := tryOpen(func(flags int) (int, error) { return unix.Open(mountPath, flags, 0) }) if err != nil { return nil, linux.Statx{}, -1, err } cu := cleanup.Make(func() { _ = unix.Close(rootHostFD) }) defer cu.Clean() stat, err := fstatTo(rootHostFD) if err != nil { return nil, linux.Statx{}, -1, err } if err := checkSupportedFileType(uint32(stat.Mode)); err != nil { log.Warningf("Mount: checkSupportedFileType() failed for file %q with mode %o: %v", mountPath, stat.Mode, err) return nil, linux.Statx{}, -1, err } clientHostFD := -1 if s.config.DonateMountPointFD { clientHostFD, err = unix.Dup(rootHostFD) if err != nil { return nil, linux.Statx{}, -1, err } } cu.Release() rootFD := &controlFDLisa{ hostFD: rootHostFD, writableHostFD: atomicbitops.FromInt32(-1), isMountPoint: true, } mountNode.IncRef() // Ref is transferred to ControlFD. rootFD.ControlFD.Init(c, mountNode, linux.FileMode(stat.Mode), rootFD) return rootFD.FD(), stat, clientHostFD, nil } // MaxMessageSize implements lisafs.ServerImpl.MaxMessageSize. func (s *LisafsServer) MaxMessageSize() uint32 { return lisafs.MaxMessageSize() } // SupportedMessages implements lisafs.ServerImpl.SupportedMessages. func (s *LisafsServer) SupportedMessages() []lisafs.MID { // Note that Flush, FListXattr and FRemoveXattr are not supported. return []lisafs.MID{ lisafs.Mount, lisafs.Channel, lisafs.FStat, lisafs.SetStat, lisafs.Walk, lisafs.WalkStat, lisafs.OpenAt, lisafs.OpenCreateAt, lisafs.Close, lisafs.FSync, lisafs.PWrite, lisafs.PRead, lisafs.MkdirAt, lisafs.MknodAt, lisafs.SymlinkAt, lisafs.LinkAt, lisafs.FStatFS, lisafs.FAllocate, lisafs.ReadLinkAt, lisafs.Connect, lisafs.UnlinkAt, lisafs.RenameAt, lisafs.Getdents64, lisafs.FGetXattr, lisafs.FSetXattr, lisafs.BindAt, lisafs.Listen, lisafs.Accept, } } // controlFDLisa implements lisafs.ControlFDImpl. type controlFDLisa struct { lisafs.ControlFD // hostFD is the file descriptor which can be used to make host syscalls. hostFD int // writableHostFD is the file descriptor number for a writable FD opened on // the same FD as `hostFD`. It is initialized to -1, and can change in value // exactly once. writableHostFD atomicbitops.Int32 // isMountpoint indicates whether this FD represents the mount point for its // owning connection. isMountPoint is immutable. isMountPoint bool } var _ lisafs.ControlFDImpl = (*controlFDLisa)(nil) func newControlFDLisa(hostFD int, parent *controlFDLisa, name string, mode linux.FileMode) *controlFDLisa { var ( childFD *controlFDLisa childNode *lisafs.Node parentNode = parent.Node() ) parentNode.WithChildrenMu(func() { childNode = parentNode.LookupChildLocked(name) if childNode == nil { // Common case. Performance hack which is used to allocate the node and // its control FD together in the heap. For a well-behaving client, there // will be a 1:1 mapping between control FD and node and their lifecycle // will be similar too. This will help reduce allocations and memory // fragmentation. This is more cache friendly too. 
temp := struct { node lisafs.Node fd controlFDLisa }{} childFD = &temp.fd childNode = &temp.node childNode.InitLocked(name, parentNode) } else { childNode.IncRef() childFD = &controlFDLisa{} } }) childFD.hostFD = hostFD childFD.writableHostFD = atomicbitops.FromInt32(-1) childFD.ControlFD.Init(parent.Conn(), childNode, mode, childFD) return childFD } func (fd *controlFDLisa) getWritableFD() (int, error) { if writableFD := fd.writableHostFD.Load(); writableFD != -1 { return int(writableFD), nil } writableFD, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(fd.hostFD), (unix.O_WRONLY|openFlags)&^unix.O_NOFOLLOW, 0) if err != nil { return -1, err } if !fd.writableHostFD.CompareAndSwap(-1, int32(writableFD)) { // Race detected, use the new value and clean this up. unix.Close(writableFD) return int(fd.writableHostFD.Load()), nil } return writableFD, nil } func (fd *controlFDLisa) getParentFD() (int, string, error) { filePath := fd.Node().FilePath() if filePath == "/" { log.Warningf("getParentFD() call on the root") return -1, "", unix.EINVAL } parent, err := unix.Open(path.Dir(filePath), openFlags|unix.O_PATH, 0) return parent, path.Base(filePath), err } // FD implements lisafs.ControlFDImpl.FD. func (fd *controlFDLisa) FD() *lisafs.ControlFD { if fd == nil { return nil } return &fd.ControlFD } // Close implements lisafs.ControlFDImpl.Close. func (fd *controlFDLisa) Close() { if fd.hostFD >= 0 { _ = unix.Close(fd.hostFD) fd.hostFD = -1 } // No concurrent access is possible so no need to use atomics. if fd.writableHostFD.RacyLoad() >= 0 { _ = unix.Close(int(fd.writableHostFD.RacyLoad())) fd.writableHostFD = atomicbitops.FromInt32(-1) } } // Stat implements lisafs.ControlFDImpl.Stat. func (fd *controlFDLisa) Stat() (linux.Statx, error) { return fstatTo(fd.hostFD) } // SetStat implements lisafs.ControlFDImpl.SetStat. func (fd *controlFDLisa) SetStat(stat lisafs.SetStatReq) (failureMask uint32, failureErr error) { if stat.Mask&unix.STATX_MODE != 0 { if fd.IsSocket() { // fchmod(2) on socket files created via bind(2) fails. We need to // fchmodat(2) it from its parent. parent, sockName, err := fd.getParentFD() if err == nil { // Note that AT_SYMLINK_NOFOLLOW flag is not currently supported. err = unix.Fchmodat(parent, sockName, stat.Mode&^unix.S_IFMT, 0 /* flags */) unix.Close(parent) } if err != nil { log.Warningf("SetStat fchmod failed on socket %q, err: %v", fd.Node().FilePath(), err) failureMask |= unix.STATX_MODE failureErr = err } } else { if err := unix.Fchmod(fd.hostFD, stat.Mode&^unix.S_IFMT); err != nil { log.Warningf("SetStat fchmod failed %q, err: %v", fd.Node().FilePath(), err) failureMask |= unix.STATX_MODE failureErr = err } } } if stat.Mask&unix.STATX_SIZE != 0 { // ftruncate(2) requires the FD to be open for writing. writableFD, err := fd.getWritableFD() if err == nil { err = unix.Ftruncate(writableFD, int64(stat.Size)) } if err != nil { log.Warningf("SetStat ftruncate failed %q, err: %v", fd.Node().FilePath(), err) failureMask |= unix.STATX_SIZE failureErr = err } } if stat.Mask&(unix.STATX_ATIME|unix.STATX_MTIME) != 0 { utimes := [2]unix.Timespec{ {Sec: 0, Nsec: unix.UTIME_OMIT}, {Sec: 0, Nsec: unix.UTIME_OMIT}, } if stat.Mask&unix.STATX_ATIME != 0 { utimes[0].Sec = stat.Atime.Sec utimes[0].Nsec = stat.Atime.Nsec } if stat.Mask&unix.STATX_MTIME != 0 { utimes[1].Sec = stat.Mtime.Sec utimes[1].Nsec = stat.Mtime.Nsec } if fd.IsSymlink() { // utimensat operates different that other syscalls. 
To operate on a // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty // name. We need the parent FD. parent, symlinkName, err := fd.getParentFD() if err == nil { err = fsutil.Utimensat(parent, symlinkName, utimes, unix.AT_SYMLINK_NOFOLLOW) unix.Close(parent) } if err != nil { failureMask |= (stat.Mask & (unix.STATX_ATIME | unix.STATX_MTIME)) failureErr = err } } else { hostFD := fd.hostFD if fd.IsRegular() { // For regular files, utimensat(2) requires the FD to be open for // writing, see BUGS section. if writableFD, err := fd.getWritableFD(); err == nil { hostFD = writableFD } else { log.Warningf("SetStat getWritableFD failed %q, err: %v", fd.Node().FilePath(), err) } } // Directories and regular files can operate directly on the fd // using empty name. err := fsutil.Utimensat(hostFD, "", utimes, 0) if err != nil { log.Warningf("SetStat utimens failed %q, err: %v", fd.Node().FilePath(), err) failureMask |= (stat.Mask & (unix.STATX_ATIME | unix.STATX_MTIME)) failureErr = err } } } if stat.Mask&(unix.STATX_UID|unix.STATX_GID) != 0 { // "If the owner or group is specified as -1, then that ID is not changed" // - chown(2) uid := -1 if stat.Mask&unix.STATX_UID != 0 { uid = int(stat.UID) } gid := -1 if stat.Mask&unix.STATX_GID != 0 { gid = int(stat.GID) } if err := unix.Fchownat(fd.hostFD, "", uid, gid, unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil { log.Warningf("SetStat fchown failed %q, err: %v", fd.Node().FilePath(), err) failureMask |= stat.Mask & (unix.STATX_UID | unix.STATX_GID) failureErr = err } } return } // Walk implements lisafs.ControlFDImpl.Walk. func (fd *controlFDLisa) Walk(name string) (*lisafs.ControlFD, linux.Statx, error) { childHostFD, err := tryOpen(func(flags int) (int, error) { return unix.Openat(fd.hostFD, name, flags, 0) }) if err != nil { return nil, linux.Statx{}, err } stat, err := fstatTo(childHostFD) if err != nil { _ = unix.Close(childHostFD) return nil, linux.Statx{}, err } if err := checkSupportedFileType(uint32(stat.Mode)); err != nil { _ = unix.Close(childHostFD) log.Warningf("Walk: checkSupportedFileType() failed for %q with mode %o: %v", name, stat.Mode, err) return nil, linux.Statx{}, err } return newControlFDLisa(childHostFD, fd, name, linux.FileMode(stat.Mode)).FD(), stat, nil } // WalkStat implements lisafs.ControlFDImpl.WalkStat. func (fd *controlFDLisa) WalkStat(path lisafs.StringArray, recordStat func(linux.Statx)) error { // Note that while performing the walk below, we do not have read concurrency // guarantee for any descendants. So files can be created/deleted inside fd // while the walk is being performed. However, this should be fine from a // security perspective as we are using host FDs to walk and checking that // each opened path component is not a symlink. curDirFD := fd.hostFD closeCurDirFD := func() { if curDirFD != fd.hostFD { unix.Close(curDirFD) } } defer closeCurDirFD() if len(path) > 0 && len(path[0]) == 0 { // Write stat results for dirFD if the first path component is "". stat, err := fstatTo(fd.hostFD) if err != nil { return err } recordStat(stat) path = path[1:] } // Don't attempt walking if parent is a symlink. if fd.IsSymlink() { return nil } for _, name := range path { curFD, err := unix.Openat(curDirFD, name, unix.O_PATH|openFlags, 0) if err == unix.ENOENT { // No more path components exist on the filesystem. Return the partial // walk to the client. 
break } if err != nil { return err } closeCurDirFD() curDirFD = curFD stat, err := fstatTo(curFD) if err != nil { return err } if err := checkSupportedFileType(uint32(stat.Mode)); err != nil { log.Warningf("WalkStat: checkSupportedFileType() failed for file %q with mode %o while walking path %+v: %v", name, stat.Mode, path, err) return err } recordStat(stat) // Symlinks terminate walk. This client gets the symlink stat result, but // will have to invoke Walk again with the resolved path. if stat.Mode&unix.S_IFMT == unix.S_IFLNK { break } } return nil } // Used to log rejected fifo/uds operations, one time each. var ( logRejectedFifoOpenOnce sync.Once logRejectedUdsOpenOnce sync.Once logRejectedUdsCreateOnce sync.Once logRejectedUdsConnectOnce sync.Once ) // Open implements lisafs.ControlFDImpl.Open. func (fd *controlFDLisa) Open(flags uint32) (*lisafs.OpenFD, int, error) { ftype := fd.FileType() server := fd.Conn().ServerImpl().(*LisafsServer) switch ftype { case unix.S_IFIFO: if !server.config.HostFifo.AllowOpen() { logRejectedFifoOpenOnce.Do(func() { log.Warningf("Rejecting attempt to open fifo/pipe from host filesystem: %q. If you want to allow this, set flag --host-fifo=open", fd.ControlFD.Node().FilePath()) }) return nil, -1, unix.EPERM } case unix.S_IFSOCK: if !server.config.HostUDS.AllowOpen() { logRejectedUdsOpenOnce.Do(func() { log.Warningf("Rejecting attempt to open unix domain socket from host filesystem. If you want to allow this, set flag --host-uds=open", fd.ControlFD.Node().FilePath()) }) return nil, -1, unix.EPERM } } flags |= openFlags openHostFD, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(fd.hostFD), int(flags)&^unix.O_NOFOLLOW, 0) if err != nil { return nil, -1, err } hostFDToDonate := -1 switch { case ftype == unix.S_IFREG: // Best effort to donate file to the Sentry (for performance only). hostFDToDonate, _ = unix.Dup(openHostFD) case ftype == unix.S_IFIFO, ftype == unix.S_IFCHR, fd.isMountPoint && fd.Conn().ServerImpl().(*LisafsServer).config.DonateMountPointFD: // Character devices and pipes can block indefinitely during reads/writes, // which is not allowed for gofer operations. Ensure that it donates an FD // back to the caller, so it can wait on the FD when reads/writes return // EWOULDBLOCK. For mount points, if DonateMountPointFD option is set, an // FD must be donated. var err error hostFDToDonate, err = unix.Dup(openHostFD) if err != nil { return nil, 0, err } } openFD := fd.newOpenFDLisa(openHostFD, flags) return openFD.FD(), hostFDToDonate, nil } // OpenCreate implements lisafs.ControlFDImpl.OpenCreate. func (fd *controlFDLisa) OpenCreate(mode linux.FileMode, uid lisafs.UID, gid lisafs.GID, name string, flags uint32) (*lisafs.ControlFD, linux.Statx, *lisafs.OpenFD, int, error) { createFlags := unix.O_CREAT | unix.O_EXCL | unix.O_RDONLY | unix.O_NONBLOCK | openFlags childHostFD, err := unix.Openat(fd.hostFD, name, createFlags, uint32(mode&^linux.FileTypeMask)) if err != nil { return nil, linux.Statx{}, nil, -1, err } cu := cleanup.Make(func() { // Best effort attempt to remove the file in case of failure. if err := unix.Unlinkat(fd.hostFD, name, 0); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(fd.Node().FilePath(), name), err) } unix.Close(childHostFD) }) defer cu.Clean() // Set the owners as requested by the client. if err := unix.Fchownat(childHostFD, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil { return nil, linux.Statx{}, nil, -1, err } // Get stat results. 
childStat, err := fstatTo(childHostFD) if err != nil { return nil, linux.Statx{}, nil, -1, err } // Now open an FD to the newly created file with the flags requested by the client. flags |= openFlags newHostFD, err := unix.Openat(int(procSelfFD.FD()), strconv.Itoa(childHostFD), int(flags)&^unix.O_NOFOLLOW, 0) if err != nil { return nil, linux.Statx{}, nil, -1, err } cu.Release() childFD := newControlFDLisa(childHostFD, fd, name, linux.ModeRegular) newFD := childFD.newOpenFDLisa(newHostFD, uint32(flags)) // Donate FD because open(O_CREAT|O_EXCL) always creates a regular file. // Since FD donation is a destructive operation, we should duplicate the // to-be-donated FD. Eat the error if one occurs, it is better to have an FD // without a host FD, than failing the Open attempt. hostOpenFD := -1 if dupFD, err := unix.Dup(newFD.hostFD); err == nil { hostOpenFD = dupFD } return childFD.FD(), childStat, newFD.FD(), hostOpenFD, nil } // Mkdir implements lisafs.ControlFDImpl.Mkdir. func (fd *controlFDLisa) Mkdir(mode linux.FileMode, uid lisafs.UID, gid lisafs.GID, name string) (*lisafs.ControlFD, linux.Statx, error) { if err := unix.Mkdirat(fd.hostFD, name, uint32(mode&^linux.FileTypeMask)); err != nil { return nil, linux.Statx{}, err } cu := cleanup.Make(func() { // Best effort attempt to remove the dir in case of failure. if err := unix.Unlinkat(fd.hostFD, name, unix.AT_REMOVEDIR); err != nil { log.Warningf("error unlinking dir %q after failure: %v", path.Join(fd.Node().FilePath(), name), err) } }) defer cu.Clean() // Open directory to change ownership. childDirFd, err := tryOpen(func(flags int) (int, error) { return unix.Openat(fd.hostFD, name, flags|unix.O_DIRECTORY, 0) }) if err != nil { return nil, linux.Statx{}, err } if err := unix.Fchownat(childDirFd, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil { unix.Close(childDirFd) return nil, linux.Statx{}, err } // Get stat results. childDirStat, err := fstatTo(childDirFd) if err != nil { unix.Close(childDirFd) return nil, linux.Statx{}, err } cu.Release() return newControlFDLisa(childDirFd, fd, name, linux.ModeDirectory).FD(), childDirStat, nil } // Mknod implements lisafs.ControlFDImpl.Mknod. func (fd *controlFDLisa) Mknod(mode linux.FileMode, uid lisafs.UID, gid lisafs.GID, name string, minor uint32, major uint32) (*lisafs.ControlFD, linux.Statx, error) { // From mknod(2) man page: // "EPERM: [...] if the filesystem containing pathname does not support // the type of node requested." if mode.FileType() != linux.ModeRegular { return nil, linux.Statx{}, unix.EPERM } if err := unix.Mknodat(fd.hostFD, name, uint32(mode), 0); err != nil { return nil, linux.Statx{}, err } cu := cleanup.Make(func() { // Best effort attempt to remove the file in case of failure. if err := unix.Unlinkat(fd.hostFD, name, 0); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(fd.Node().FilePath(), name), err) } }) defer cu.Clean() // Open file to change ownership. childFD, err := tryOpen(func(flags int) (int, error) { return unix.Openat(fd.hostFD, name, flags, 0) }) if err != nil { return nil, linux.Statx{}, err } if err := unix.Fchownat(childFD, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil { unix.Close(childFD) return nil, linux.Statx{}, err } // Get stat results. 
childStat, err := fstatTo(childFD) if err != nil { unix.Close(childFD) return nil, linux.Statx{}, err } cu.Release() return newControlFDLisa(childFD, fd, name, mode).FD(), childStat, nil } // Symlink implements lisafs.ControlFDImpl.Symlink. func (fd *controlFDLisa) Symlink(name string, target string, uid lisafs.UID, gid lisafs.GID) (*lisafs.ControlFD, linux.Statx, error) { if err := unix.Symlinkat(target, fd.hostFD, name); err != nil { return nil, linux.Statx{}, err } cu := cleanup.Make(func() { // Best effort attempt to remove the symlink in case of failure. if err := unix.Unlinkat(fd.hostFD, name, 0); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(fd.Node().FilePath(), name), err) } }) defer cu.Clean() // Open symlink to change ownership. symlinkFD, err := unix.Openat(fd.hostFD, name, unix.O_PATH|openFlags, 0) if err != nil { return nil, linux.Statx{}, err } if err := unix.Fchownat(symlinkFD, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil { unix.Close(symlinkFD) return nil, linux.Statx{}, err } symlinkStat, err := fstatTo(symlinkFD) if err != nil { unix.Close(symlinkFD) return nil, linux.Statx{}, err } cu.Release() return newControlFDLisa(symlinkFD, fd, name, linux.ModeSymlink).FD(), symlinkStat, nil } // Link implements lisafs.ControlFDImpl.Link. func (fd *controlFDLisa) Link(dir lisafs.ControlFDImpl, name string) (*lisafs.ControlFD, linux.Statx, error) { // Using linkat(targetFD, "", newdirfd, name, AT_EMPTY_PATH) requires // CAP_DAC_READ_SEARCH in the *root* userns. The gofer process has // CAP_DAC_READ_SEARCH in its own userns. But sometimes the gofer may be // running in a different userns. So we can't use AT_EMPTY_PATH. Fallback // to using olddirfd to call linkat(2). oldDirFD, oldName, err := fd.getParentFD() if err != nil { return nil, linux.Statx{}, err } dirFD := dir.(*controlFDLisa) if err := unix.Linkat(oldDirFD, oldName, dirFD.hostFD, name, 0); err != nil { return nil, linux.Statx{}, err } cu := cleanup.Make(func() { // Best effort attempt to remove the hard link in case of failure. if err := unix.Unlinkat(dirFD.hostFD, name, 0); err != nil { log.Warningf("error unlinking file %q after failure: %v", path.Join(dirFD.Node().FilePath(), name), err) } }) defer cu.Clean() linkFD, err := tryOpen(func(flags int) (int, error) { return unix.Openat(dirFD.hostFD, name, flags, 0) }) if err != nil { return nil, linux.Statx{}, err } linkStat, err := fstatTo(linkFD) if err != nil { return nil, linux.Statx{}, err } cu.Release() return newControlFDLisa(linkFD, dirFD, name, linux.FileMode(linkStat.Mode)).FD(), linkStat, nil } // StatFS implements lisafs.ControlFDImpl.StatFS. func (fd *controlFDLisa) StatFS() (lisafs.StatFS, error) { var s unix.Statfs_t if err := unix.Fstatfs(fd.hostFD, &s); err != nil { return lisafs.StatFS{}, err } return lisafs.StatFS{ Type: uint64(s.Type), BlockSize: s.Bsize, Blocks: s.Blocks, BlocksFree: s.Bfree, BlocksAvailable: s.Bavail, Files: s.Files, FilesFree: s.Ffree, NameLength: uint64(s.Namelen), }, nil } // Readlink implements lisafs.ControlFDImpl.Readlink. func (fd *controlFDLisa) Readlink(getLinkBuf func(uint32) []byte) (uint16, error) { // This is similar to what os.Readlink does. 
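// The target length is not known in advance, so read into geometrically
// growing buffers (starting at 128 bytes and capped below MaxUint16);
// readlinkat truncates silently, so only a result strictly shorter than the
// buffer is known to be complete.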
for linkLen := 128; linkLen < math.MaxUint16; linkLen *= 2 { b := getLinkBuf(uint32(linkLen)) n, err := unix.Readlinkat(fd.hostFD, "", b) if err != nil { return 0, err } if n < int(linkLen) { return uint16(n), nil } } return 0, unix.ENOMEM } func isSockTypeSupported(sockType uint32) bool { switch sockType { case unix.SOCK_STREAM, unix.SOCK_DGRAM, unix.SOCK_SEQPACKET: return true default: log.Debugf("socket type %d is not supported", sockType) return false } } // Connect implements lisafs.ControlFDImpl.Connect. func (fd *controlFDLisa) Connect(sockType uint32) (int, error) { if !fd.Conn().ServerImpl().(*LisafsServer).config.HostUDS.AllowOpen() { logRejectedUdsConnectOnce.Do(func() { log.Warningf("Rejecting attempt to connect to unix domain socket from host filesystem: %q. If you want to allow this, set flag --host-uds=open", fd.ControlFD.Node().FilePath()) }) return -1, unix.EPERM } // TODO(gvisor.dev/issue/1003): Due to different app vs replacement // mappings, the app path may have fit in the sockaddr, but we can't fit // hostPath in our sockaddr. We'd need to redirect through a shorter path // in order to actually connect to this socket. hostPath := fd.Node().FilePath() if len(hostPath) >= linux.UnixPathMax { return -1, unix.EINVAL } if !isSockTypeSupported(sockType) { return -1, unix.ENXIO } sock, err := unix.Socket(unix.AF_UNIX, int(sockType), 0) if err != nil { return -1, err } sa := unix.SockaddrUnix{Name: hostPath} if err := unix.Connect(sock, &sa); err != nil { unix.Close(sock) return -1, err } return sock, nil } // BindAt implements lisafs.ControlFDImpl.BindAt. func (fd *controlFDLisa) BindAt(name string, sockType uint32, mode linux.FileMode, uid lisafs.UID, gid lisafs.GID) (*lisafs.ControlFD, linux.Statx, *lisafs.BoundSocketFD, int, error) { if !fd.Conn().ServerImpl().(*LisafsServer).config.HostUDS.AllowCreate() { logRejectedUdsCreateOnce.Do(func() { log.Warningf("Rejecting attempt to create unix domain socket from host filesystem: %q. If you want to allow this, set flag --host-uds=create", name) }) return nil, linux.Statx{}, nil, -1, unix.EPERM } // Because there is no "bindat" syscall in Linux, we must create an // absolute path to the socket we are creating, socketPath := filepath.Join(fd.Node().FilePath(), name) // TODO(gvisor.dev/issue/1003): Due to different app vs replacement // mappings, the app path may have fit in the sockaddr, but we can't fit // hostPath in our sockaddr. We'd need to redirect through a shorter path // in order to actually connect to this socket. if len(socketPath) >= linux.UnixPathMax { log.Warningf("BindAt called with name too long: %q (len=%d)", socketPath, len(socketPath)) return nil, linux.Statx{}, nil, -1, unix.EINVAL } // Only the following types are supported. if !isSockTypeSupported(sockType) { return nil, linux.Statx{}, nil, -1, unix.ENXIO } // Create and bind the socket using the sockPath which may be a // symlink. sockFD, err := unix.Socket(unix.AF_UNIX, int(sockType), 0) if err != nil { return nil, linux.Statx{}, nil, -1, err } cu := cleanup.Make(func() { _ = unix.Close(sockFD) }) defer cu.Clean() // fchmod(2) has to happen *before* the bind(2). sockFD's file mode will // be used in creating the filesystem-object in bind(2). 
if err := unix.Fchmod(sockFD, uint32(mode&^linux.FileTypeMask)); err != nil { return nil, linux.Statx{}, nil, -1, err } if err := unix.Bind(sockFD, &unix.SockaddrUnix{Name: socketPath}); err != nil { return nil, linux.Statx{}, nil, -1, err } cu.Add(func() { _ = unix.Unlink(socketPath) }) sockFileFD, err := tryOpen(func(flags int) (int, error) { return unix.Openat(fd.hostFD, name, flags, 0) }) if err != nil { return nil, linux.Statx{}, nil, -1, err } cu.Add(func() { _ = unix.Close(sockFileFD) }) if err := unix.Fchownat(sockFileFD, "", int(uid), int(gid), unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW); err != nil { return nil, linux.Statx{}, nil, -1, err } // Stat the socket. sockStat, err := fstatTo(sockFileFD) if err != nil { return nil, linux.Statx{}, nil, -1, err } // Create an FD that will be donated to the sandbox. sockFDToDonate, err := unix.Dup(sockFD) if err != nil { return nil, linux.Statx{}, nil, -1, err } cu.Release() socketControlFD := newControlFDLisa(sockFD, fd, name, linux.ModeSocket) boundSocketFD := &boundSocketFDLisa{ sock: os.NewFile(uintptr(sockFD), socketPath), } boundSocketFD.Init(socketControlFD.FD(), boundSocketFD) return socketControlFD.FD(), sockStat, boundSocketFD.FD(), sockFDToDonate, nil } // Unlink implements lisafs.ControlFDImpl.Unlink. func (fd *controlFDLisa) Unlink(name string, flags uint32) error { return unix.Unlinkat(fd.hostFD, name, int(flags)) } // RenameAt implements lisafs.ControlFDImpl.RenameAt. func (fd *controlFDLisa) RenameAt(oldName string, newDir lisafs.ControlFDImpl, newName string) error { return fsutil.RenameAt(fd.hostFD, oldName, newDir.(*controlFDLisa).hostFD, newName) } // Renamed implements lisafs.ControlFDImpl.Renamed. func (fd *controlFDLisa) Renamed() { // controlFDLisa does not have any state to update on rename. } // GetXattr implements lisafs.ControlFDImpl.GetXattr. func (fd *controlFDLisa) GetXattr(name string, size uint32, getValueBuf func(uint32) []byte) (uint16, error) { data := getValueBuf(size) xattrSize, err := unix.Fgetxattr(fd.hostFD, name, data) return uint16(xattrSize), err } // SetXattr implements lisafs.ControlFDImpl.SetXattr. func (fd *controlFDLisa) SetXattr(name string, value string, flags uint32) error { return unix.EOPNOTSUPP } // ListXattr implements lisafs.ControlFDImpl.ListXattr. func (fd *controlFDLisa) ListXattr(size uint64) (lisafs.StringArray, error) { return nil, unix.EOPNOTSUPP } // RemoveXattr implements lisafs.ControlFDImpl.RemoveXattr. func (fd *controlFDLisa) RemoveXattr(name string) error { return unix.EOPNOTSUPP } // openFDLisa implements lisafs.OpenFDImpl. type openFDLisa struct { lisafs.OpenFD // hostFD is the host file descriptor which can be used to make syscalls. hostFD int } var _ lisafs.OpenFDImpl = (*openFDLisa)(nil) func (fd *controlFDLisa) newOpenFDLisa(hostFD int, flags uint32) *openFDLisa { newFD := &openFDLisa{ hostFD: hostFD, } newFD.OpenFD.Init(fd.FD(), flags, newFD) return newFD } // FD implements lisafs.OpenFDImpl.FD. func (fd *openFDLisa) FD() *lisafs.OpenFD { if fd == nil { return nil } return &fd.OpenFD } // Close implements lisafs.OpenFDImpl.Close. func (fd *openFDLisa) Close() { if fd.hostFD >= 0 { _ = unix.Close(fd.hostFD) fd.hostFD = -1 } } // Stat implements lisafs.OpenFDImpl.Stat. func (fd *openFDLisa) Stat() (linux.Statx, error) { return fstatTo(fd.hostFD) } // Sync implements lisafs.OpenFDImpl.Sync. func (fd *openFDLisa) Sync() error { return unix.Fsync(fd.hostFD) } // Write implements lisafs.OpenFDImpl.Write. 
func (fd *openFDLisa) Write(buf []byte, off uint64) (uint64, error) { rw := rwfd.NewReadWriter(fd.hostFD) n, err := rw.WriteAt(buf, int64(off)) return uint64(n), err } // Read implements lisafs.OpenFDImpl.Read. func (fd *openFDLisa) Read(buf []byte, off uint64) (uint64, error) { rw := rwfd.NewReadWriter(fd.hostFD) n, err := rw.ReadAt(buf, int64(off)) if err != nil && err != io.EOF { return 0, err } return uint64(n), nil } // Allocate implements lisafs.OpenFDImpl.Allocate. func (fd *openFDLisa) Allocate(mode, off, length uint64) error { return unix.Fallocate(fd.hostFD, uint32(mode), int64(off), int64(length)) } // Flush implements lisafs.OpenFDImpl.Flush. func (fd *openFDLisa) Flush() error { return nil } // Getdent64 implements lisafs.OpenFDImpl.Getdent64. func (fd *openFDLisa) Getdent64(count uint32, seek0 bool, recordDirent func(lisafs.Dirent64)) error { if seek0 { if _, err := unix.Seek(fd.hostFD, 0, 0); err != nil { return err } } var direntsBuf [8192]byte var bytesRead int for bytesRead < int(count) { bufEnd := len(direntsBuf) if remaining := int(count) - bytesRead; remaining < bufEnd { bufEnd = remaining } n, err := unix.Getdents(fd.hostFD, direntsBuf[:bufEnd]) if err != nil { if err == unix.EINVAL && bufEnd < fsutil.UnixDirentMaxSize { // getdents64(2) returns EINVAL is returned when the result // buffer is too small. If bufEnd is smaller than the max // size of unix.Dirent, then just break here to return all // dirents collected till now. break } return err } if n <= 0 { break } fsutil.ParseDirents(direntsBuf[:n], func(ino uint64, off int64, ftype uint8, name string, reclen uint16) { dirent := lisafs.Dirent64{ Ino: primitive.Uint64(ino), Off: primitive.Uint64(off), Type: primitive.Uint8(ftype), Name: lisafs.SizedString(name), } // The client also wants the device ID, which annoyingly incurs an // additional syscall per dirent. // TODO(gvisor.dev/issue/6665): Get rid of per-dirent stat. stat, err := fsutil.StatAt(fd.hostFD, name) if err != nil { log.Warningf("Getdent64: skipping file %q with failed stat, err: %v", path.Join(fd.ControlFD().FD().Node().FilePath(), name), err) return } dirent.DevMinor = primitive.Uint32(unix.Minor(stat.Dev)) dirent.DevMajor = primitive.Uint32(unix.Major(stat.Dev)) recordDirent(dirent) bytesRead += int(reclen) }) } return nil } // Renamed implements lisafs.OpenFDImpl.Renamed. func (fd *openFDLisa) Renamed() { // openFDLisa does not have any state to update on rename. } type boundSocketFDLisa struct { lisafs.BoundSocketFD sock *os.File } var _ lisafs.BoundSocketFDImpl = (*boundSocketFDLisa)(nil) // Close implements lisafs.BoundSocketFD.Close. func (fd *boundSocketFDLisa) Close() { fd.sock.Close() } // FD implements lisafs.BoundSocketFD.FD. func (fd *boundSocketFDLisa) FD() *lisafs.BoundSocketFD { if fd == nil { return nil } return &fd.BoundSocketFD } // Listen implements lisafs.BoundSocketFD.Listen. func (fd *boundSocketFDLisa) Listen(backlog int32) error { return unix.Listen(int(fd.sock.Fd()), int(backlog)) } // Listen implements lisafs.BoundSocketFD.Accept. func (fd *boundSocketFDLisa) Accept() (int, string, error) { flags := unix.O_NONBLOCK | unix.O_CLOEXEC nfd, _, err := unix.Accept4(int(fd.sock.Fd()), flags) if err != nil { return -1, "", err } // Return an empty peer address so that we don't leak the actual host // address. return nfd, "", err } // tryOpen tries to open() with different modes as documented. func tryOpen(open func(int) (int, error)) (hostFD int, err error) { // Attempt to open file in the following in order: // 1. 
RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs. // Use non-blocking to prevent getting stuck inside open(2) for // FIFOs. This option has no effect on regular files. // 2. PATH: for symlinks, sockets. flags := []int{ unix.O_RDONLY | unix.O_NONBLOCK, unix.O_PATH, } for _, flag := range flags { hostFD, err = open(flag | openFlags) if err == nil { return } if e := extractErrno(err); e == unix.ENOENT { // File doesn't exist, no point in retrying. return -1, e } } return } func fstatTo(hostFD int) (linux.Statx, error) { var stat unix.Stat_t if err := unix.Fstat(hostFD, &stat); err != nil { return linux.Statx{}, err } return linux.Statx{ Mask: unix.STATX_TYPE | unix.STATX_MODE | unix.STATX_INO | unix.STATX_NLINK | unix.STATX_UID | unix.STATX_GID | unix.STATX_SIZE | unix.STATX_BLOCKS | unix.STATX_ATIME | unix.STATX_MTIME | unix.STATX_CTIME, Mode: uint16(stat.Mode), DevMinor: unix.Minor(stat.Dev), DevMajor: unix.Major(stat.Dev), Ino: stat.Ino, Nlink: uint32(stat.Nlink), UID: stat.Uid, GID: stat.Gid, RdevMinor: unix.Minor(stat.Rdev), RdevMajor: unix.Major(stat.Rdev), Size: uint64(stat.Size), Blksize: uint32(stat.Blksize), Blocks: uint64(stat.Blocks), Atime: linux.StatxTimestamp{ Sec: stat.Atim.Sec, Nsec: uint32(stat.Atim.Nsec), }, Mtime: linux.StatxTimestamp{ Sec: stat.Mtim.Sec, Nsec: uint32(stat.Mtim.Nsec), }, Ctime: linux.StatxTimestamp{ Sec: stat.Ctim.Sec, Nsec: uint32(stat.Ctim.Nsec), }, }, nil } func checkSupportedFileType(mode uint32) error { switch mode & unix.S_IFMT { case unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK, unix.S_IFCHR, unix.S_IFSOCK, unix.S_IFIFO: return nil default: return unix.EPERM } } // extractErrno tries to determine the errno. func extractErrno(err error) unix.Errno { if err == nil { // This should never happen. The likely result will be that // some user gets the frustrating "error: SUCCESS" message. log.Warningf("extractErrno called with nil error!") return 0 } switch err { case os.ErrNotExist: return unix.ENOENT case os.ErrExist: return unix.EEXIST case os.ErrPermission: return unix.EACCES case os.ErrInvalid: return unix.EINVAL } // See if it's an errno or a common wrapped error. switch e := err.(type) { case unix.Errno: return e case *os.PathError: return extractErrno(e.Err) case *os.LinkError: return extractErrno(e.Err) case *os.SyscallError: return extractErrno(e.Err) } // Fall back to EIO. log.Debugf("Unknown error: %v, defaulting to EIO", err) return unix.EIO } // LINT.ThenChange(../../pkg/sentry/fsimpl/gofer/directfs_dentry.go) golang-gvisor-gvisor-0.0~20240729.0/runsc/main.go000066400000000000000000000016501465435605700212520ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false // Binary runsc implements the OCI runtime interface. package main import ( "gvisor.dev/gvisor/runsc/cli" "gvisor.dev/gvisor/runsc/version" ) // version.Version is set dynamically, but needs to be // linked in the binary, so reference it here. 
var _ = version.Version() func main() { cli.Main() } golang-gvisor-gvisor-0.0~20240729.0/runsc/metricserver/000077500000000000000000000000001465435605700225075ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/metricserver/containermetrics/000077500000000000000000000000001465435605700260605ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/metricserver/containermetrics/containermetrics.go000066400000000000000000000061251465435605700317640ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package containermetrics returns metrics and labels interesting to export // about a container or sandbox. package containermetrics import ( "crypto/sha256" "encoding/binary" "io" "strconv" "gvisor.dev/gvisor/pkg/prometheus" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/specutils" ) // SandboxPrometheusLabels returns a set of Prometheus labels that identifies the sandbox running // the given root container. func SandboxPrometheusLabels(rootContainer *container.Container) (map[string]string, error) { s := rootContainer.Sandbox labels := make(map[string]string, 4) labels[prometheus.SandboxIDLabel] = s.ID // Compute iteration ID label in a stable manner. // This uses sha256(ID + ":" + creation time). h := sha256.New() if _, err := io.WriteString(h, s.ID); err != nil { return nil, err } if _, err := io.WriteString(h, ":"); err != nil { return nil, err } if _, err := io.WriteString(h, rootContainer.CreatedAt.UTC().String()); err != nil { return nil, err } labels[prometheus.IterationIDLabel] = strconv.FormatUint(binary.BigEndian.Uint64(h.Sum(nil)[:8]), 36) if s.PodName != "" { labels[prometheus.PodNameLabel] = s.PodName } if s.Namespace != "" { labels[prometheus.NamespaceLabel] = s.Namespace } return labels, nil } // ComputeSpecMetadata returns the labels for the `spec_metadata` metric. // It merges data from the Specs of multiple containers running within the // same sandbox. // This function must support being called with `allContainers` being nil. // It must return the same set of label keys regardless of how many containers // are in `allContainers`. 
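// When allContainers is nil or empty, the returned map still carries the full
// key set (hasuid0, ociversion, nvproxy, tpuproxy), with hasuid0, nvproxy and
// tpuproxy reporting "false" and ociversion reporting "UNKNOWN".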
func ComputeSpecMetadata(allContainers []*container.Container) map[string]string { const ( unknownOCIVersion = "UNKNOWN" inconsistentOCIVersion = "INCONSISTENT" ) hasUID0Container := false ociVersion := unknownOCIVersion hasNVProxy := false hasTPUProxy := false for _, cont := range allContainers { if cont.RunsAsUID0() { hasUID0Container = true } if ociVersion == unknownOCIVersion { ociVersion = cont.Spec.Version } else if ociVersion != cont.Spec.Version { ociVersion = inconsistentOCIVersion } hasNVProxy = hasNVProxy || cont.Spec.Annotations[specutils.AnnotationNVProxy] == "true" hasTPUProxy = hasTPUProxy || cont.Spec.Annotations[specutils.AnnotationTPU] == "true" } return map[string]string{ "hasuid0": strconv.FormatBool(hasUID0Container), "ociversion": ociVersion, "nvproxy": strconv.FormatBool(hasNVProxy), "tpuproxy": strconv.FormatBool(hasTPUProxy), } } containermetrics_state_autogen.go000066400000000000000000000001021465435605700346140ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/metricserver/containermetrics// automatically generated by stateify. package containermetrics golang-gvisor-gvisor-0.0~20240729.0/runsc/mitigate/000077500000000000000000000000001465435605700216005ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/mitigate/mitigate.go000066400000000000000000000161611465435605700237370ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package mitigate provides libraries for the mitigate command. The // mitigate command mitigates side channel attacks such as MDS. Mitigate // shuts down CPUs via /sys/devices/system/cpu/cpu{N}/online. package mitigate import ( "fmt" "regexp" "strconv" "strings" ) const ( // mds is the only bug we care about. mds = "mds" // Constants for parsing /proc/cpuinfo. processorKey = "processor" vendorIDKey = "vendor_id" cpuFamilyKey = "cpu family" modelKey = "model" physicalIDKey = "physical id" coreIDKey = "core id" bugsKey = "bugs" ) // CPUSet contains a map of all CPUs on the system, mapped // by Physical ID and CoreIDs. threads with the same // Core and Physical ID are Hyperthread pairs. type CPUSet []*CPU // NewCPUSet creates a CPUSet from data read from /proc/cpuinfo. func NewCPUSet(data string) (CPUSet, error) { // Each processor entry should start with the // processor key. Find the beginings of each. r := buildRegex(processorKey) indices := r.FindAllStringIndex(data, -1) if len(indices) < 1 { return nil, fmt.Errorf("no cpus found for: %q", data) } // Add the ending index for last entry. indices = append(indices, []int{len(data), -1}) // Valid cpus are now defined by strings in between // indexes (e.g. data[index[i], index[i+1]]). // There should be len(indicies) - 1 CPUs // since the last index is the end of the string. var set CPUSet // Find each string that represents a CPU. These begin "processor". for i := 1; i < len(indices); i++ { start := indices[i-1][0] end := indices[i][0] // Parse the CPU entry, which should be between start/end. 
c, err := newCPU(data[start:end]) if err != nil { return nil, err } set = append(set, c) } return set, nil } // IsVulnerable checks if this CPUSet is vulnerable to MDS. func (c CPUSet) IsVulnerable() bool { for _, cpu := range c { if cpu.IsVulnerable() { return true } } return false } // String implements the String method for CPUSet. func (c CPUSet) String() string { parts := make([]string, len(c)) for i, cpu := range c { parts[i] = cpu.String() } return strings.Join(parts, "\n") } // CPU represents pertinent info about a single hyperthread in a pair. type CPU struct { processorNumber int64 // the processor number of this CPU. vendorID string // the vendorID of CPU (e.g. AuthenticAMD). cpuFamily int64 // CPU family number (e.g. 6 for CascadeLake/Skylake). model int64 // CPU model number (e.g. 85 for CascadeLake/Skylake). physicalID int64 // Physical ID of this CPU. coreID int64 // Core ID of this CPU. bugs map[string]struct{} // map of vulnerabilities parsed from the 'bugs' field. } func newCPU(data string) (*CPU, error) { processor, err := parseProcessor(data) if err != nil { return nil, err } vendorID, err := parseVendorID(data) if err != nil { return nil, err } cpuFamily, err := parseCPUFamily(data) if err != nil { return nil, err } model, err := parseModel(data) if err != nil { return nil, err } physicalID, err := parsePhysicalID(data) if err != nil { return nil, err } coreID, err := parseCoreID(data) if err != nil { return nil, err } bugs, err := parseBugs(data) if err != nil { return nil, err } return &CPU{ processorNumber: processor, vendorID: vendorID, cpuFamily: cpuFamily, model: model, physicalID: physicalID, coreID: coreID, bugs: bugs, }, nil } // String implements the String method for CPU. func (t *CPU) String() string { template := `%s: %d %s: %s %s: %d %s: %d %s: %d %s: %d %s: %s ` var bugs []string for bug := range t.bugs { bugs = append(bugs, bug) } return fmt.Sprintf(template, processorKey, t.processorNumber, vendorIDKey, t.vendorID, cpuFamilyKey, t.cpuFamily, modelKey, t.model, physicalIDKey, t.physicalID, coreIDKey, t.coreID, bugsKey, strings.Join(bugs, " ")) } // IsVulnerable checks if a CPU is vulnerable to mds. func (t *CPU) IsVulnerable() bool { _, ok := t.bugs[mds] return ok } // SimilarTo checks family/model/bugs fields for equality of two // processors. func (t *CPU) SimilarTo(other *CPU) bool { if t.vendorID != other.vendorID { return false } if other.cpuFamily != t.cpuFamily { return false } if other.model != t.model { return false } if len(other.bugs) != len(t.bugs) { return false } for bug := range t.bugs { if _, ok := other.bugs[bug]; !ok { return false } } return true } // parseProcessor grabs the processor field from /proc/cpuinfo output. func parseProcessor(data string) (int64, error) { return parseIntegerResult(data, processorKey) } // parseVendorID grabs the vendor_id field from /proc/cpuinfo output. func parseVendorID(data string) (string, error) { return parseRegex(data, vendorIDKey, `[\w\d]+`) } // parseCPUFamily grabs the cpu family field from /proc/cpuinfo output. func parseCPUFamily(data string) (int64, error) { return parseIntegerResult(data, cpuFamilyKey) } // parseModel grabs the model field from /proc/cpuinfo output. func parseModel(data string) (int64, error) { return parseIntegerResult(data, modelKey) } // parsePhysicalID parses the physical id field. func parsePhysicalID(data string) (int64, error) { return parseIntegerResult(data, physicalIDKey) } // parseCoreID parses the core id field. 
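// Like the other integer fields, the value is extracted by matching a line of
// the form "core id : <value>" (see buildRegex and parseIntegerResult below).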
func parseCoreID(data string) (int64, error) { return parseIntegerResult(data, coreIDKey) } // parseBugs grabs the bugs field from /proc/cpuinfo output. func parseBugs(data string) (map[string]struct{}, error) { result, err := parseRegex(data, bugsKey, `[\d\w\s]*`) if err != nil { return nil, err } bugs := strings.Split(result, " ") ret := make(map[string]struct{}, len(bugs)) for _, bug := range bugs { ret[bug] = struct{}{} } return ret, nil } // parseIntegerResult parses fields expecting an integer. func parseIntegerResult(data, key string) (int64, error) { result, err := parseRegex(data, key, `\d+`) if err != nil { return 0, err } return strconv.ParseInt(result, 0, 64) } // buildRegex builds a regex for parsing each CPU field. func buildRegex(key string) *regexp.Regexp { reg := fmt.Sprintf(`(?m)^%s\s*:\s*(.*)$`, key) return regexp.MustCompile(reg) } // parseRegex parses data with key inserted into a standard regex template. func parseRegex(data, key, match string) (string, error) { r := buildRegex(key) matches := r.FindStringSubmatch(data) if len(matches) < 2 { return "", fmt.Errorf("failed to match key %q: %q", key, data) } return matches[1], nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/mitigate/mitigate_state_autogen.go000066400000000000000000000000721465435605700266530ustar00rootroot00000000000000// automatically generated by stateify. package mitigate golang-gvisor-gvisor-0.0~20240729.0/runsc/mitigate/mock.go000066400000000000000000000077021465435605700230660ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mitigate import "strings" // MockCPU represents data from CPUs that will be mitigated. type MockCPU struct { Name string VendorID string Family int64 Model int64 ModelName string Bugs string PhysicalCores int64 Cores int64 ThreadsPerCore int64 } // CascadeLake2 is a two core Intel CascadeLake machine. var CascadeLake2 = MockCPU{ Name: "CascadeLake", VendorID: "GenuineIntel", Family: 6, Model: 85, ModelName: "Intel(R) Xeon(R) CPU", Bugs: "spectre_v1 spectre_v2 spec_store_bypass mds swapgs taa", PhysicalCores: 1, Cores: 1, ThreadsPerCore: 2, } // CascadeLake4 is a four core Intel CascadeLake machine. var CascadeLake4 = MockCPU{ Name: "CascadeLake", VendorID: "GenuineIntel", Family: 6, Model: 85, ModelName: "Intel(R) Xeon(R) CPU", Bugs: "spectre_v1 spectre_v2 spec_store_bypass mds swapgs taa", PhysicalCores: 1, Cores: 2, ThreadsPerCore: 2, } // Haswell2 is a two core Intel Haswell machine. var Haswell2 = MockCPU{ Name: "Haswell", VendorID: "GenuineIntel", Family: 6, Model: 63, ModelName: "Intel(R) Xeon(R) CPU", Bugs: "cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs", PhysicalCores: 1, Cores: 1, ThreadsPerCore: 2, } // Haswell2core is a 2 core Intel Haswell machine with no hyperthread pairs. 
var Haswell2core = MockCPU{ Name: "Haswell2Physical", VendorID: "GenuineIntel", Family: 6, Model: 63, ModelName: "Intel(R) Xeon(R) CPU", Bugs: "cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs", PhysicalCores: 2, Cores: 1, ThreadsPerCore: 1, } // AMD2 is an two core AMD machine. var AMD2 = MockCPU{ Name: "AMD", VendorID: "AuthenticAMD", Family: 23, Model: 49, ModelName: "AMD EPYC 7B12", Bugs: "sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass", PhysicalCores: 1, Cores: 1, ThreadsPerCore: 2, } // AMD8 is an eight core AMD machine. var AMD8 = MockCPU{ Name: "AMD", VendorID: "AuthenticAMD", Family: 23, Model: 49, ModelName: "AMD EPYC 7B12", Bugs: "sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass", PhysicalCores: 4, Cores: 1, ThreadsPerCore: 2, } // Empty is an empty CPU set. var Empty = MockCPU{ Name: "Empty", } // MakeCPUSet makes a cpuSet from a MockCPU. func (tc MockCPU) MakeCPUSet() CPUSet { bugs := make(map[string]struct{}) for _, bug := range strings.Split(tc.Bugs, " ") { bugs[bug] = struct{}{} } var cpus CPUSet = []*CPU{} for i := int64(0); i < tc.PhysicalCores; i++ { for j := int64(0); j < tc.Cores; j++ { for k := int64(0); k < tc.ThreadsPerCore; k++ { processorNum := (i*tc.Cores+j)*tc.ThreadsPerCore + k cpu := &CPU{ processorNumber: processorNum, vendorID: tc.VendorID, cpuFamily: tc.Family, model: tc.Model, physicalID: i, coreID: j, bugs: bugs, } cpus = append(cpus, cpu) } } } return cpus } // NumCPUs returns the number of CPUs for this CPU. func (tc MockCPU) NumCPUs() int { return int(tc.PhysicalCores * tc.Cores * tc.ThreadsPerCore) } golang-gvisor-gvisor-0.0~20240729.0/runsc/profile/000077500000000000000000000000001465435605700214355ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/profile/profile.go000066400000000000000000000137651465435605700234400ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package profile contains profiling utils. package profile import ( "fmt" "os" "runtime" "runtime/pprof" "runtime/trace" "strings" "time" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/flag" ) // Kind is the kind of profiling to perform. type Kind int const ( // Block profile. Block Kind = iota // CPU profile. CPU // Heap profile. Heap // Mutex profile. Mutex // Trace profile. Trace ) // FDArgs are the arguments that describe which profiles to enable and which // FDs to write them to. Profiling of a given type will only be enabled if the // corresponding FD is >=0. type FDArgs struct { // BlockFD is the file descriptor to write a block profile to. // Valid if >=0. BlockFD int // CPUFD is the file descriptor to write a CPU profile to. // Valid if >=0. CPUFD int // HeapFD is the file descriptor to write a heap profile to. // Valid if >=0. HeapFD int // MutexFD is the file descriptor to write a mutex profile to. // Valid if >=0. 
MutexFD int // TraceFD is the file descriptor to write a Go execution trace to. // Valid if >=0. TraceFD int } // SetFromFlags sets the FDArgs from the given flags. The default value for // each FD is -1. func (fds *FDArgs) SetFromFlags(f *flag.FlagSet) { f.IntVar(&fds.BlockFD, "profile-block-fd", -1, "file descriptor to write block profile to. -1 disables profiling.") f.IntVar(&fds.CPUFD, "profile-cpu-fd", -1, "file descriptor to write CPU profile to. -1 disables profiling.") f.IntVar(&fds.HeapFD, "profile-heap-fd", -1, "file descriptor to write heap profile to. -1 disables profiling.") f.IntVar(&fds.MutexFD, "profile-mutex-fd", -1, "file descriptor to write mutex profile to. -1 disables profiling.") f.IntVar(&fds.TraceFD, "trace-fd", -1, "file descriptor to write Go execution trace to. -1 disables tracing.") } // Opts is a map of profile Kind to FD. type Opts map[Kind]uintptr // ToOpts turns FDArgs into an Opts struct which can be passed to Start. func (fds *FDArgs) ToOpts() Opts { o := Opts{} if fds.BlockFD >= 0 { o[Block] = uintptr(fds.BlockFD) } if fds.CPUFD >= 0 { o[CPU] = uintptr(fds.CPUFD) } if fds.HeapFD >= 0 { o[Heap] = uintptr(fds.HeapFD) } if fds.MutexFD >= 0 { o[Mutex] = uintptr(fds.MutexFD) } if fds.TraceFD >= 0 { o[Trace] = uintptr(fds.TraceFD) } return o } // Start starts profiling for the given Kinds in opts, and writes the profile // data to the corresponding FDs in opts. It returns a function which will stop // profiling. func Start(opts Opts) func() { var onStopProfiling []func() stopProfiling := func() { for _, f := range onStopProfiling { f() } } if fd, ok := opts[Block]; ok { log.Infof("Block profiling enabled") file := os.NewFile(fd, "profile-block") runtime.SetBlockProfileRate(control.DefaultBlockProfileRate) onStopProfiling = append(onStopProfiling, func() { if err := pprof.Lookup("block").WriteTo(file, 0); err != nil { log.Warningf("Error writing block profile: %v", err) } file.Close() runtime.SetBlockProfileRate(0) log.Infof("Block profiling stopped") }) } if fd, ok := opts[CPU]; ok { log.Infof("CPU profiling enabled") file := os.NewFile(fd, "profile-cpu") pprof.StartCPUProfile(file) onStopProfiling = append(onStopProfiling, func() { pprof.StopCPUProfile() file.Close() log.Infof("CPU profiling stopped") }) } if fd, ok := opts[Heap]; ok { log.Infof("Heap profiling enabled") file := os.NewFile(fd, "profile-heap") onStopProfiling = append(onStopProfiling, func() { if err := pprof.Lookup("heap").WriteTo(file, 0); err != nil { log.Warningf("Error writing heap profile: %v", err) } file.Close() log.Infof("Heap profiling stopped") }) } if fd, ok := opts[Mutex]; ok { log.Infof("Mutex profiling enabled") file := os.NewFile(fd, "profile-mutex") prev := runtime.SetMutexProfileFraction(control.DefaultMutexProfileRate) onStopProfiling = append(onStopProfiling, func() { if err := pprof.Lookup("mutex").WriteTo(file, 0); err != nil { log.Warningf("Error writing mutex profile: %v", err) } file.Close() runtime.SetMutexProfileFraction(prev) log.Infof("Mutex profiling stopped") }) } if fd, ok := opts[Trace]; ok { log.Infof("Tracing enabled") file := os.NewFile(fd, "trace") trace.Start(file) onStopProfiling = append(onStopProfiling, func() { trace.Stop() file.Close() log.Infof("Tracing stopped") }) } return stopProfiling } // UpdatePaths updates profiling-related file paths in the given config. 
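// Profile paths may contain placeholders such as %TIMESTAMP%, %YYYY%, %MM%,
// %DD%, %HH%, %II%, %SS% and %NN%, which are substituted using the given
// timestamp (see updatePath below). As an illustrative example, a configured
// value of
//
//	/tmp/cpu-%YYYY%%MM%%DD%-%HH%%II%%SS%.pprof
//
// would be rewritten to something like /tmp/cpu-20240729-153045.pprof.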
func UpdatePaths(conf *config.Config, timestamp time.Time) { if !conf.ProfileEnable { return } conf.ProfileCPU = updatePath(conf.ProfileCPU, timestamp) conf.ProfileHeap = updatePath(conf.ProfileHeap, timestamp) conf.ProfileMutex = updatePath(conf.ProfileMutex, timestamp) conf.ProfileBlock = updatePath(conf.ProfileBlock, timestamp) } func updatePath(path string, now time.Time) string { path = strings.ReplaceAll(path, "%TIMESTAMP%", fmt.Sprintf("%d", now.Unix())) path = strings.ReplaceAll(path, "%YYYY%", now.Format("2006")) path = strings.ReplaceAll(path, "%MM%", now.Format("01")) path = strings.ReplaceAll(path, "%DD%", now.Format("02")) path = strings.ReplaceAll(path, "%HH%", now.Format("15")) path = strings.ReplaceAll(path, "%II%", now.Format("04")) path = strings.ReplaceAll(path, "%SS%", now.Format("05")) path = strings.ReplaceAll(path, "%NN%", fmt.Sprintf("%09d", now.Nanosecond())) return path } golang-gvisor-gvisor-0.0~20240729.0/runsc/profile/profile_state_autogen.go000066400000000000000000000000711465435605700263440ustar00rootroot00000000000000// automatically generated by stateify. package profile golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/000077500000000000000000000000001465435605700214335ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/bpf/000077500000000000000000000000001465435605700222025ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/bpf/BUILD000066400000000000000000000012161465435605700227640ustar00rootroot00000000000000load("//tools:defs.bzl", "bpf_program", "go_library") package( default_applicable_licenses = ["//:license"], licenses = ["notice"], ) go_library( name = "bpf", srcs = ["bpf.go"], embedsrcs = [ "af_xdp_ebpf.o", # keep "tunnel_veth_ebpf.o", # keep ], visibility = ["//visibility:public"], ) bpf_program( name = "af_xdp_ebpf", src = "af_xdp.ebpf.c", hdrs = [], bpf_object = "af_xdp_ebpf.o", visibility = ["//:sandbox"], ) bpf_program( name = "tunnel_veth_ebpf", src = "tunnel_veth.ebpf.c", hdrs = [], bpf_object = "tunnel_veth_ebpf.o", visibility = ["//:sandbox"], ) golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/bpf/af_xdp.ebpf.c000066400000000000000000000036021465435605700245230ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // clang-format off // Contains types needed by later headers. #include // clang-format on #include #include #define section(secname) __attribute__((section(secname), used)) char __license[] section("license") = "Apache-2.0"; // Note: bpf_helpers.h includes a struct definition for bpf_map_def in some, but // not all, environments. Define our own equivalent struct to avoid issues with // multiple declarations. struct gvisor_bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; unsigned int map_flags; }; // A map of RX queue number to AF_XDP socket. We only ever use one key: 0. 
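// The map is expected to be populated from userspace, with the AF_XDP socket
// inserted at key 0 before packets arrive; lookups that miss fall back to
// XDP_PASS in xdp_prog below, handing the packet to the kernel stack.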
struct gvisor_bpf_map_def section("maps") sock_map = { .type = BPF_MAP_TYPE_XSKMAP, // Note: "XSK" means AF_XDP socket. .key_size = sizeof(int), .value_size = sizeof(int), .max_entries = 1, }; section("xdp") int xdp_prog(struct xdp_md *ctx) { // Lookup the socket for the current RX queue. Veth devices by default have // only one RX queue. If one is found, redirect the packet to that socket. // Otherwise pass it on to the kernel network stack. // // TODO: We can support multiple sockets with a fancier hash-based handoff. return bpf_redirect_map(&sock_map, ctx->rx_queue_index, XDP_PASS); } golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/bpf/bpf.go000066400000000000000000000020131465435605700232740ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package bpf provides compiled bpf programs as byte slices. package bpf import _ "embed" // AFXDPProgram is a BPF program that, when attached to a device, redirects all // packets to a single AF_XDP socket unconditionally. // //go:embed af_xdp_ebpf.o var AFXDPProgram []byte // TunnelVethProgram is a BPF program that redirects all packets to exit via // another device. // //go:embed tunnel_veth_ebpf.o var TunnelVethProgram []byte golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/bpf/bpf_state_autogen.go000066400000000000000000000000651465435605700262230ustar00rootroot00000000000000// automatically generated by stateify. package bpf golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/bpf/tunnel_veth.ebpf.c000066400000000000000000000030571465435605700256210ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // clang-format off // Contains types needed by later headers. #include // clang-format on #include #include #define section(secname) __attribute__((section(secname), used)) char __license[] section("license") = "Apache-2.0"; // Note: bpf_helpers.h includes a struct definition for bpf_map_def in some, but // not all, environments. Define our own equivalent struct to avoid issues with // multiple declarations. struct gvisor_bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; unsigned int map_flags; }; struct gvisor_bpf_map_def section("maps") dev_map = { .type = BPF_MAP_TYPE_DEVMAP, .key_size = sizeof(__u32), .value_size = sizeof(__u32), .max_entries = 1, }; // Redirect all incoming traffic to go out another device. 
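// (dev_map above is expected to hold the target interface index, presumably at
// key 0 since veth devices default to a single RX queue; a failed lookup falls
// back to XDP_PASS so the packet is handled by the kernel stack instead.)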
section("xdp") int xdp_veth_prog(struct xdp_md *ctx) { return bpf_redirect_map(&dev_map, ctx->rx_queue_index, XDP_PASS); } golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/memory.go000066400000000000000000000034441465435605700232770ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sandbox import ( "bufio" "fmt" "io" "os" "strconv" "strings" ) // totalSystemMemory extracts "MemTotal" from "/proc/meminfo". func totalSystemMemory() (uint64, error) { f, err := os.Open("/proc/meminfo") if err != nil { return 0, err } defer f.Close() return parseTotalSystemMemory(f) } func parseTotalSystemMemory(r io.Reader) (uint64, error) { for scanner := bufio.NewScanner(r); scanner.Scan(); { line := scanner.Text() totalStr := strings.TrimPrefix(line, "MemTotal:") if len(totalStr) < len(line) { fields := strings.Fields(totalStr) if len(fields) == 0 || len(fields) > 2 { return 0, fmt.Errorf(`malformed "MemTotal": %q`, line) } totalStr = fields[0] unit := "" if len(fields) == 2 { unit = fields[1] } mem, err := strconv.ParseUint(totalStr, 10, 64) if err != nil { return 0, err } switch unit { case "": // do nothing. case "kB": memKb := mem mem = memKb * 1024 if mem < memKb { return 0, fmt.Errorf(`"MemTotal" too large: %d`, memKb) } default: return 0, fmt.Errorf("unknown unit %q: %q", unit, line) } return mem, nil } } return 0, fmt.Errorf(`malformed "/proc/meminfo": "MemTotal" not found`) } golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/network.go000066400000000000000000000523061465435605700234610ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sandbox import ( "fmt" "net" "os" "os/exec" "path/filepath" "runtime" "strconv" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/vishvananda/netlink" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" ) // setupNetwork configures the network stack to mimic the local network // configuration. Docker uses network namespaces with vnets to configure the // network for the container. The untrusted app expects to see the same network // inside the sandbox. Routing and port mapping is handled directly by docker // with most of network information not even available to the runtime. 
// // Netstack inside the sandbox speaks directly to the device using a raw socket. // All IP addresses assigned to the NIC, are removed and passed on to netstack's // device. // // If 'conf.Network' is NoNetwork, skips local configuration and creates a // loopback interface only. // // Run the following container to test it: // // docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4 func setupNetwork(conn *urpc.Client, pid int, conf *config.Config) error { log.Infof("Setting up network") switch conf.Network { case config.NetworkNone: log.Infof("Network is disabled, create loopback interface only") if err := createDefaultLoopbackInterface(conf, conn); err != nil { return fmt.Errorf("creating default loopback interface: %v", err) } case config.NetworkSandbox: // Build the path to the net namespace of the sandbox process. // This is what we will copy. nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf); err != nil { return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) } case config.NetworkHost: // Nothing to do here. default: return fmt.Errorf("invalid network type: %v", conf.Network) } return nil } func createDefaultLoopbackInterface(conf *config.Config, conn *urpc.Client) error { link := boot.DefaultLoopbackLink link.GVisorGRO = conf.GVisorGRO if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{ LoopbackLinks: []boot.LoopbackLink{link}, DisconnectOk: conf.NetDisconnectOk, }, nil); err != nil { return fmt.Errorf("creating loopback link and routes: %v", err) } return nil } func joinNetNS(nsPath string) (func(), error) { runtime.LockOSThread() restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{ Type: specs.NetworkNamespace, Path: nsPath, }) if err != nil { runtime.UnlockOSThread() return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err) } return func() { restoreNS() runtime.UnlockOSThread() }, nil } // isRootNS determines whether we are running in the root net namespace. // /proc/sys/net/core/rmem_default only exists in root network namespace. func isRootNS() (bool, error) { err := unix.Access("/proc/sys/net/core/rmem_default", unix.F_OK) switch err { case nil: return true, nil case unix.ENOENT: return false, nil default: return false, fmt.Errorf("failed to access /proc/sys/net/core/rmem_default: %v", err) } } // createInterfacesAndRoutesFromNS scrapes the interface and routes from the // net namespace with the given path, creates them in the sandbox, and removes // them from the host. func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *config.Config) error { switch conf.XDP.Mode { case config.XDPModeOff: case config.XDPModeNS: case config.XDPModeRedirect: if err := createRedirectInterfacesAndRoutes(conn, conf); err != nil { return fmt.Errorf("failed to create XDP redirect interface: %w", err) } return nil case config.XDPModeTunnel: if err := createXDPTunnel(conn, nsPath, conf); err != nil { return fmt.Errorf("failed to create XDP tunnel: %w", err) } return nil default: return fmt.Errorf("unknown XDP mode: %v", conf.XDP.Mode) } // Join the network namespace that we will be copying. restore, err := joinNetNS(nsPath) if err != nil { return err } defer restore() // Get all interfaces in the namespace. 
ifaces, err := net.Interfaces() if err != nil { return fmt.Errorf("querying interfaces: %w", err) } isRoot, err := isRootNS() if err != nil { return err } if isRoot { return fmt.Errorf("cannot run with network enabled in root network namespace") } // Collect addresses and routes from the interfaces. args := boot.CreateLinksAndRoutesArgs{ DisconnectOk: conf.NetDisconnectOk, } for _, iface := range ifaces { if iface.Flags&net.FlagUp == 0 { log.Infof("Skipping down interface: %+v", iface) continue } allAddrs, err := iface.Addrs() if err != nil { return fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err) } // We build our own loopback device. if iface.Flags&net.FlagLoopback != 0 { link, err := loopbackLink(conf, iface, allAddrs) if err != nil { return fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err) } args.LoopbackLinks = append(args.LoopbackLinks, link) continue } var ipAddrs []*net.IPNet for _, ifaddr := range allAddrs { ipNet, ok := ifaddr.(*net.IPNet) if !ok { return fmt.Errorf("address is not IPNet: %+v", ifaddr) } ipAddrs = append(ipAddrs, ipNet) } if len(ipAddrs) == 0 { log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name) continue } // Collect data from the ARP table. dump, err := netlink.NeighList(iface.Index, 0) if err != nil { return fmt.Errorf("fetching ARP table for %q: %w", iface.Name, err) } var neighbors []boot.Neighbor for _, n := range dump { // There are only two "good" states NUD_PERMANENT and NUD_REACHABLE, // but NUD_REACHABLE is fully dynamic and will be re-probed anyway. if n.State == netlink.NUD_PERMANENT { log.Debugf("Copying a static ARP entry: %+v %+v", n.IP, n.HardwareAddr) // No flags are copied because Stack.AddStaticNeighbor does not support flags right now. neighbors = append(neighbors, boot.Neighbor{IP: n.IP, HardwareAddr: n.HardwareAddr}) } } // Scrape the routes before removing the address, since that // will remove the routes as well. routes, defv4, defv6, err := routesForIface(iface) if err != nil { return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) } if defv4 != nil { if !args.Defaultv4Gateway.Route.Empty() { return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway) } args.Defaultv4Gateway.Route = *defv4 args.Defaultv4Gateway.Name = iface.Name } if defv6 != nil { if !args.Defaultv6Gateway.Route.Empty() { return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway) } args.Defaultv6Gateway.Route = *defv6 args.Defaultv6Gateway.Name = iface.Name } // Get the link for the interface. ifaceLink, err := netlink.LinkByName(iface.Name) if err != nil { return fmt.Errorf("getting link for interface %q: %w", iface.Name, err) } linkAddress := ifaceLink.Attrs().HardwareAddr // Collect the addresses for the interface, enable forwarding, // and remove them from the host. var addresses []boot.IPWithPrefix for _, addr := range ipAddrs { prefix, _ := addr.Mask.Size() addresses = append(addresses, boot.IPWithPrefix{Address: addr.IP, PrefixLen: prefix}) // Steal IP address from NIC. if err := removeAddress(ifaceLink, addr.String()); err != nil { // If we encounter an error while deleting the ip, // verify the ip is still present on the interface. 
if present, err := isAddressOnInterface(iface.Name, addr); err != nil { return fmt.Errorf("checking if address %v is on interface %q: %w", addr, iface.Name, err) } else if !present { continue } return fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err) } } if conf.XDP.Mode == config.XDPModeNS { xdpSockFDs, err := createSocketXDP(iface) if err != nil { return fmt.Errorf("failed to create XDP socket: %v", err) } args.FilePayload.Files = append(args.FilePayload.Files, xdpSockFDs...) args.XDPLinks = append(args.XDPLinks, boot.XDPLink{ Name: iface.Name, InterfaceIndex: iface.Index, Routes: routes, TXChecksumOffload: conf.TXChecksumOffload, RXChecksumOffload: conf.RXChecksumOffload, NumChannels: conf.NumNetworkChannels, QDisc: conf.QDisc, Neighbors: neighbors, LinkAddress: linkAddress, Addresses: addresses, GVisorGRO: conf.GVisorGRO, }) } else { link := boot.FDBasedLink{ Name: iface.Name, MTU: iface.MTU, Routes: routes, TXChecksumOffload: conf.TXChecksumOffload, RXChecksumOffload: conf.RXChecksumOffload, NumChannels: conf.NumNetworkChannels, ProcessorsPerChannel: conf.NetworkProcessorsPerChannel, QDisc: conf.QDisc, Neighbors: neighbors, LinkAddress: linkAddress, Addresses: addresses, } log.Debugf("Setting up network channels") // Create the socket for the device. for i := 0; i < link.NumChannels; i++ { log.Debugf("Creating Channel %d", i) socketEntry, err := createSocket(iface, ifaceLink, conf.HostGSO) if err != nil { return fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err) } if i == 0 { link.GSOMaxSize = socketEntry.gsoMaxSize } else { if link.GSOMaxSize != socketEntry.gsoMaxSize { return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s", link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name) } } args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile) } if link.GSOMaxSize == 0 && conf.GVisorGSO { // Host GSO is disabled. Let's enable gVisor GSO. link.GSOMaxSize = stack.GVisorGSOMaxSize link.GVisorGSOEnabled = true } link.GVisorGRO = conf.GVisorGRO args.FDBasedLinks = append(args.FDBasedLinks, link) } } if err := pcapAndNAT(&args, conf); err != nil { return err } log.Debugf("Setting up network, config: %+v", args) if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { return fmt.Errorf("creating links and routes: %w", err) } return nil } // isAddressOnInterface checks if an address is on an interface func isAddressOnInterface(ifaceName string, addr *net.IPNet) (bool, error) { iface, err := net.InterfaceByName(ifaceName) if err != nil { return false, fmt.Errorf("getting interface by name %q: %w", ifaceName, err) } ifaceAddrs, err := iface.Addrs() if err != nil { return false, fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err) } for _, ifaceAddr := range ifaceAddrs { ipNet, ok := ifaceAddr.(*net.IPNet) if !ok { log.Warningf("Can't cast address to *net.IPNet, skipping: %+v", ifaceAddr) continue } if ipNet.String() == addr.String() { return true, nil } } return false, nil } type socketEntry struct { deviceFile *os.File gsoMaxSize uint32 } // createSocket creates an underlying AF_PACKET socket and configures it for // use by the sentry and returns an *os.File that wraps the underlying socket // fd. func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) { // Create the socket. 
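	// ETH_P_ALL is 0x0003; the sockaddr_ll protocol field is in network byte
	// order, so on little-endian hosts the constant below is its htons()
	// equivalent. The socket itself is created with protocol 0, and the
	// protocol is only supplied at bind() time (see the comment on the
	// Socket call below).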
const protocol = 0x0300 // htons(ETH_P_ALL) fd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, 0) // pass protocol 0 to avoid slow bind() if err != nil { return nil, fmt.Errorf("unable to create raw socket: %v", err) } deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") // Bind to the appropriate device. ll := unix.SockaddrLinklayer{ Protocol: protocol, Ifindex: iface.Index, } if err := unix.Bind(fd, &ll); err != nil { return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err) } gsoMaxSize := uint32(0) if enableGSO { gso, err := isGSOEnabled(fd, iface.Name) if err != nil { return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err) } if gso { if err := unix.SetsockoptInt(fd, unix.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil { return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err) } gsoMaxSize = ifaceLink.Attrs().GSOMaxSize } else { log.Infof("GSO not available in host.") } } // Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on linux the receive/send buffer // for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max". // wmem_max/rmem_max default to a unusually low value of 208KB. This is too // low for gVisor to be able to receive packets at high throughputs without // incurring packet drops. const bufSize = 4 << 20 // 4MB. if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, bufSize); err != nil { _ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF, bufSize) sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_RCVBUF) if sz < bufSize { log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err) } } if err := unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUFFORCE, bufSize); err != nil { _ = unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF, bufSize) sz, _ := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_SNDBUF) if sz < bufSize { log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err) } } return &socketEntry{deviceFile, gsoMaxSize}, nil } // loopbackLink returns the link with addresses and routes for a loopback // interface. func loopbackLink(conf *config.Config, iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, error) { link := boot.LoopbackLink{ Name: iface.Name, GVisorGRO: conf.GVisorGRO, } for _, addr := range addrs { ipNet, ok := addr.(*net.IPNet) if !ok { return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr) } prefix, _ := ipNet.Mask.Size() link.Addresses = append(link.Addresses, boot.IPWithPrefix{ Address: ipNet.IP, PrefixLen: prefix, }) dst := *ipNet dst.IP = dst.IP.Mask(dst.Mask) link.Routes = append(link.Routes, boot.Route{ Destination: dst, }) } return link, nil } // routesForIface iterates over all routes for the given interface and converts // them to boot.Routes. It also returns the a default v4/v6 route if found. func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) { link, err := netlink.LinkByIndex(iface.Index) if err != nil { return nil, nil, nil, err } rs, err := netlink.RouteList(link, netlink.FAMILY_ALL) if err != nil { return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err) } var defv4, defv6 *boot.Route var routes []boot.Route for _, r := range rs { // Is it a default route? if r.Dst == nil { if r.Gw == nil { return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) } // Create a catch all route to the gateway. 
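			// The length of the gateway address distinguishes an IPv4 default
			// route (4 bytes) from an IPv6 one (16 bytes).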
switch len(r.Gw) { case header.IPv4AddressSize: if defv4 != nil { return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r) } defv4 = &boot.Route{ Destination: net.IPNet{ IP: net.IPv4zero, Mask: net.IPMask(net.IPv4zero), }, Gateway: r.Gw, } case header.IPv6AddressSize: if defv6 != nil { return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r) } defv6 = &boot.Route{ Destination: net.IPNet{ IP: net.IPv6zero, Mask: net.IPMask(net.IPv6zero), }, Gateway: r.Gw, } default: return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r) } continue } dst := *r.Dst dst.IP = dst.IP.Mask(dst.Mask) routes = append(routes, boot.Route{ Destination: dst, Gateway: r.Gw, }) } return routes, defv4, defv6, nil } // removeAddress removes IP address from network device. It's equivalent to: // // ip addr del dev func removeAddress(source netlink.Link, ipAndMask string) error { addr, err := netlink.ParseAddr(ipAndMask) if err != nil { return err } return netlink.AddrDel(source, addr) } func pcapAndNAT(args *boot.CreateLinksAndRoutesArgs, conf *config.Config) error { // Possibly enable packet logging. args.LogPackets = conf.LogPackets // Pass PCAP log file if present. if conf.PCAP != "" { args.PCAP = true pcap, err := os.OpenFile(conf.PCAP, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0664) if err != nil { return fmt.Errorf("failed to open PCAP file %s: %v", conf.PCAP, err) } args.FilePayload.Files = append(args.FilePayload.Files, pcap) } // Pass the host's NAT table if requested. if conf.ReproduceNftables || conf.ReproduceNAT { var f *os.File var err error if conf.ReproduceNftables { log.Infof("reproing nftables") f, err = checkNftables() } else if conf.ReproduceNAT { log.Infof("reproing legacy tables") f, err = writeNATBlob() } if err != nil { return fmt.Errorf("failed to write NAT blob: %v", err) } args.NATBlob = true args.FilePayload.Files = append(args.FilePayload.Files, f) } return nil } // The below is a work around to generate iptables-legacy rules on machines // that use iptables-nftables. The logic goes something like this: // // start // | // v no // are legacy tables empty? -----> scrape rules -----> done <----+ // | ^ | // | yes | | // v yes | | // are nft tables empty? -------------------------------+ | // | | // | no | // v | // pipe iptables-nft-save -t nat to iptables-legacy-restore | // scrape rules | // delete iptables-legacy rules | // | | // +---------------------------------------------------+ // // If we fail at some point (e.g. to find a binary), we just try to scrape the // legacy rules. const emptyNatRules = `-P PREROUTING ACCEPT -P INPUT ACCEPT -P OUTPUT ACCEPT -P POSTROUTING ACCEPT ` func checkNftables() (*os.File, error) { // Use iptables (not iptables-save) to test table emptiness because it // gives predictable results: no counters and no comments. // Is the legacy table empty? if out, err := exec.Command("iptables-legacy", "-t", "nat", "-S").Output(); err != nil || string(out) != emptyNatRules { return writeNATBlob() } // Is the nftables table empty? if out, err := exec.Command("iptables-nft", "-t", "nat", "-S").Output(); err != nil || string(out) == emptyNatRules { return nil, fmt.Errorf("no rules to scrape: %v", err) } // Get the current (empty) legacy rules. 
currLegacy, err := exec.Command("iptables-legacy-save", "-t", "nat").Output() if err != nil { return nil, fmt.Errorf("failed to save existing rules with error (%v) and output: %s", err, currLegacy) } // Restore empty legacy rules. defer func() { cmd := exec.Command("iptables-legacy-restore") stdin, err := cmd.StdinPipe() if err != nil { log.Warningf("failed to get stdin pipe: %v", err) return } go func() { defer stdin.Close() stdin.Write(currLegacy) }() if out, err := cmd.CombinedOutput(); err != nil { log.Warningf("failed to restore iptables error (%v) with output: %s", err, out) } }() // Pipe the output of iptables-nft-save to iptables-legacy-restore. nftOut, err := exec.Command("iptables-nft-save", "-t", "nat").Output() if err != nil { return nil, fmt.Errorf("failed to run iptables-nft-save: %v", err) } cmd := exec.Command("iptables-legacy-restore") stdin, err := cmd.StdinPipe() if err != nil { return nil, fmt.Errorf("failed to get stdin pipe: %v", err) } go func() { defer stdin.Close() stdin.Write(nftOut) }() if out, err := cmd.CombinedOutput(); err != nil { return nil, fmt.Errorf("failed to restore iptables error (%v) with output: %s", err, out) } return writeNATBlob() } golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/network_unsafe.go000066400000000000000000000074121465435605700250200ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sandbox import ( "fmt" "os" "unsafe" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" ) type ethtoolValue struct { cmd uint32 val uint32 } type ifreq struct { ifrName [unix.IFNAMSIZ]byte ifrData *ethtoolValue } const ( _ETHTOOL_GGSO = 0x00000023 ) func isGSOEnabled(fd int, intf string) (bool, error) { val := ethtoolValue{ cmd: _ETHTOOL_GGSO, } var name [unix.IFNAMSIZ]byte copy(name[:], []byte(intf)) ifr := ifreq{ ifrName: name, ifrData: &val, } if _, _, err := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), unix.SIOCETHTOOL, uintptr(unsafe.Pointer(&ifr))); err != 0 { return false, err } return val.val != 0, nil } func writeNATBlob() (*os.File, error) { // Open a socket to use with iptables. iptSock, err := unix.Socket(unix.AF_INET, unix.SOCK_RAW, unix.IPPROTO_ICMP) if err != nil { return nil, fmt.Errorf("failed to open socket for iptables: %v", err) } defer unix.Close(iptSock) // Get the iptables info. var NATName [linux.XT_TABLE_MAXNAMELEN]byte copy(NATName[:], []byte("nat\x00")) natInfo := linux.IPTGetinfo{Name: NATName} natInfoLen := int32(unsafe.Sizeof(linux.IPTGetinfo{})) _, _, errno := unix.Syscall6(unix.SYS_GETSOCKOPT, uintptr(iptSock), unix.SOL_IP, linux.IPT_SO_GET_INFO, uintptr(unsafe.Pointer(&natInfo)), uintptr(unsafe.Pointer(&natInfoLen)), 0) if errno != 0 { return nil, fmt.Errorf("failed to call IPT_SO_GET_INFO: %v", err) } // Get the iptables entries. 
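	// IPT_SO_GET_ENTRIES fills the buffer with an IPTGetEntries header followed
	// by natInfo.Size bytes of rule entries, so the buffer below is sized to
	// hold both; the Size field we marshal in tells the kernel how much entry
	// data we expect back.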
entries := linux.IPTGetEntries{Name: NATName, Size: natInfo.Size} entriesBufLen := uint32(unsafe.Sizeof(entries)) + natInfo.Size entriesBuf := make([]byte, entriesBufLen) entries.MarshalUnsafe(entriesBuf[:unsafe.Sizeof(entries)]) _, _, errno = unix.Syscall6(unix.SYS_GETSOCKOPT, uintptr(iptSock), unix.SOL_IP, linux.IPT_SO_GET_ENTRIES, uintptr(unsafe.Pointer(&entriesBuf[0])), uintptr(unsafe.Pointer(&entriesBufLen)), 0) if errno != 0 { return nil, fmt.Errorf("failed to call IPT_SO_GET_ENTRIES: %v", errno) } var gotEntries linux.IPTGetEntries gotEntries.UnmarshalUnsafe(entriesBuf[:unsafe.Sizeof(entries)]) // Construct an IPTReplace that can be used to set rules. replace := linux.IPTReplace{ Name: NATName, ValidHooks: natInfo.ValidHooks, NumEntries: natInfo.NumEntries, Size: natInfo.Size, HookEntry: natInfo.HookEntry, Underflow: natInfo.Underflow, // We don't implement counters yet. NumCounters: 0, Counters: 0, } // Marshal into a blob. replaceBuf := make([]byte, unsafe.Sizeof(replace)+uintptr(natInfo.Size)) replace.MarshalUnsafe(replaceBuf[:unsafe.Sizeof(replace)]) if n := copy(replaceBuf[unsafe.Sizeof(replace):], entriesBuf[unsafe.Sizeof(entries):]); uint32(n) != natInfo.Size { panic(fmt.Sprintf("failed to populate entry table: copied %d bytes, but wanted to copy %d", n, natInfo.Size)) } // Write blob to a pipe. reader, writer, err := os.Pipe() if err != nil { return nil, fmt.Errorf("failed to create iptables blob pipe: %v", err) } defer writer.Close() if n, err := writer.Write(replaceBuf); n != len(replaceBuf) || err != nil { return nil, fmt.Errorf("failed to write iptables blob: wrote %d bytes (%d expected) and got error: %v", n, len(replaceBuf), err) } return reader, nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/sandbox.go000066400000000000000000002026201465435605700234220ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package sandbox creates and manipulates sandboxes. 
package sandbox import ( "context" "encoding/json" "errors" "fmt" "io" "math" "os" "os/exec" "path" "path/filepath" "strconv" "strings" "syscall" "time" "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/syndtr/gocapability/capability" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/control/client" "gvisor.dev/gvisor/pkg/control/server" "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" metricpb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" "gvisor.dev/gvisor/pkg/prometheus" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/devices/nvproxy" "gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/seccheck" "gvisor.dev/gvisor/pkg/state/statefile" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/boot/procfs" "gvisor.dev/gvisor/runsc/cgroup" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/console" "gvisor.dev/gvisor/runsc/donation" "gvisor.dev/gvisor/runsc/profile" "gvisor.dev/gvisor/runsc/specutils" "gvisor.dev/gvisor/runsc/starttime" ) const ( // namespaceAnnotation is a pod annotation populated by containerd. // It contains the name of the pod that a sandbox is in when running in Kubernetes. podNameAnnotation = "io.kubernetes.cri.sandbox-name" // namespaceAnnotation is a pod annotation populated by containerd. // It contains the namespace of the pod that a sandbox is in when running in Kubernetes. namespaceAnnotation = "io.kubernetes.cri.sandbox-namespace" ) // createControlSocket finds a location and creates the socket used to // communicate with the sandbox. The socket is a UDS on the host filesystem. // // Note that abstract sockets are *not* used, because any user can connect to // them. There is no file mode protecting abstract sockets. func createControlSocket(rootDir, id string) (string, int, error) { name := fmt.Sprintf("runsc-%s.sock", id) // Only use absolute paths to guarantee resolution from anywhere. for _, dir := range []string{rootDir, "/var/run", "/run", "/tmp"} { path := filepath.Join(dir, name) log.Debugf("Attempting to create socket file %q", path) fd, err := server.CreateSocket(path) if err == nil { log.Debugf("Using socket file %q", path) return path, fd, nil } log.Debugf("Failed to create socket file %q: %v", path, err) } return "", -1, fmt.Errorf("unable to find location to write socket file") } // pid is an atomic type that implements JSON marshal/unmarshal interfaces. type pid struct { val atomicbitops.Int64 } func (p *pid) store(pid int) { p.val.Store(int64(pid)) } func (p *pid) load() int { return int(p.val.Load()) } // UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON. func (p *pid) UnmarshalJSON(b []byte) error { var pid int if err := json.Unmarshal(b, &pid); err != nil { return err } p.store(pid) return nil } // MarshalJSON implements json.Marshaler.MarshalJSON func (p *pid) MarshalJSON() ([]byte, error) { return json.Marshal(p.load()) } // Sandbox wraps a sandbox process. // // It is used to start/stop sandbox process (and associated processes like // gofers), as well as for running and manipulating containers inside a running // sandbox. // // Note: Sandbox must be immutable because a copy of it is saved for each // container and changes would not be synchronized to all of them. 
type Sandbox struct { // ID is the id of the sandbox (immutable). By convention, this is the same // ID as the first container run in the sandbox. ID string `json:"id"` // PodName is the name of the Kubernetes Pod (if any) that this sandbox // represents. Unset if not running under containerd or Kubernetes. PodName string `json:"podName"` // Namespace is the Kubernetes namespace (if any) of the pod that this // sandbox represents. Unset if not running under containerd or Kubernetes. Namespace string `json:"namespace"` // Pid is the pid of the running sandbox. May be 0 if the sandbox // is not running. Pid pid `json:"pid"` // UID is the user ID in the parent namespace that the sandbox is running as. UID int `json:"uid"` // GID is the group ID in the parent namespace that the sandbox is running as. GID int `json:"gid"` // CgroupJSON contains the cgroup configuration that the sandbox is part of // and allow serialization of the configuration into json CgroupJSON cgroup.CgroupJSON `json:"cgroup"` // OriginalOOMScoreAdj stores the value of oom_score_adj when the sandbox // started, before it may be modified. OriginalOOMScoreAdj int `json:"originalOomScoreAdj"` // RegisteredMetrics is the set of metrics registered in the sandbox. // Used for verifying metric data integrity after containers are started. // Only populated if exporting metrics was requested when the sandbox was // created. RegisteredMetrics *metricpb.MetricRegistration `json:"registeredMetrics"` // MetricMetadata are key-value pairs that are useful to export about this // sandbox, but not part of the set of labels that uniquely identify it. // They are static once initialized, and typically contain high-level // configuration information about the sandbox. MetricMetadata map[string]string `json:"metricMetadata"` // MetricServerAddress is the address of the metric server that this sandbox // intends to export metrics for. // Only populated if exporting metrics was requested when the sandbox was // created. MetricServerAddress string `json:"metricServerAddress"` // ControlSocketPath is the path to the sandbox's uRPC server socket. // Connections to the sandbox are made through this. ControlSocketPath string `json:"controlSocketPath"` // MountHints provides extra information about container mounts that apply // to the entire pod. MountHints *boot.PodMountHints `json:"mountHints"` // StartTime is the time the sandbox was started. StartTime time.Time `json:"startTime"` // child is set if a sandbox process is a child of the current process. // // This field isn't saved to json, because only a creator of sandbox // will have it as a child process. child bool `nojson:"true"` // statusMu protects status. statusMu sync.Mutex `nojson:"true"` // status is the exit status of a sandbox process. It's only set if the // child==true and the sandbox was waited on. This field allows for multiple // threads to wait on sandbox and get the exit code, since Linux will return // WaitStatus to one of the waiters only. status unix.WaitStatus `nojson:"true"` } // Getpid returns the process ID of the sandbox process. func (s *Sandbox) Getpid() int { return s.Pid.load() } // Args is used to configure a new sandbox. type Args struct { // ID is the sandbox unique identifier. ID string // Spec is the OCI spec that describes the container. Spec *specs.Spec // BundleDir is the directory containing the container bundle. BundleDir string // ConsoleSocket is the path to a unix domain socket that will receive // the console FD. It may be empty. 
ConsoleSocket string // UserLog is the filename to send user-visible logs to. It may be empty. UserLog string // IOFiles is the list of image files and/or socket files that connect to // a gofer endpoint for the mount points using Gofers. They must be in the // same order as mounts appear in the spec. IOFiles []*os.File // File that connects to a gofer endpoint for a device mount point at /dev. DevIOFile *os.File // GoferFilestoreFiles are the regular files that will back the overlayfs or // tmpfs mount if a gofer mount is to be overlaid. GoferFilestoreFiles []*os.File // GoferMountConfs contains information about how the gofer mounts have been // configured. The first entry is for rootfs and the following entries are // for bind mounts in Spec.Mounts (in the same order). GoferMountConfs boot.GoferMountConfFlags // MountHints provides extra information about containers mounts that apply // to the entire pod. MountHints *boot.PodMountHints // MountsFile is a file container mount information from the spec. It's // equivalent to the mounts from the spec, except that all paths have been // resolved to their final absolute location. MountsFile *os.File // Gcgroup is the cgroup that the sandbox is part of. Cgroup cgroup.Cgroup // Attached indicates that the sandbox lifecycle is attached with the caller. // If the caller exits, the sandbox should exit too. Attached bool // SinkFiles is the an ordered array of files to be used by seccheck sinks // configured from the --pod-init-config file. SinkFiles []*os.File // PassFiles are user-supplied files from the host to be exposed to the // sandboxed app. PassFiles map[int]*os.File // ExecFile is the file from the host used for program execution. ExecFile *os.File } // New creates the sandbox process. The caller must call Destroy() on the // sandbox. func New(conf *config.Config, args *Args) (*Sandbox, error) { s := &Sandbox{ ID: args.ID, CgroupJSON: cgroup.CgroupJSON{ Cgroup: args.Cgroup, }, UID: -1, // prevent usage before it's set. GID: -1, // prevent usage before it's set. MetricMetadata: conf.MetricMetadata(), MetricServerAddress: conf.MetricServer, MountHints: args.MountHints, StartTime: starttime.Get(), } if args.Spec != nil && args.Spec.Annotations != nil { s.PodName = args.Spec.Annotations[podNameAnnotation] s.Namespace = args.Spec.Annotations[namespaceAnnotation] } // The Cleanup object cleans up partially created sandboxes when an error // occurs. Any errors occurring during cleanup itself are ignored. c := cleanup.Make(func() { if err := s.destroy(); err != nil { log.Warningf("error destroying sandbox: %v", err) } }) defer c.Clean() if len(conf.PodInitConfig) > 0 { initConf, err := boot.LoadInitConfig(conf.PodInitConfig) if err != nil { return nil, fmt.Errorf("loading init config file: %w", err) } args.SinkFiles, err = initConf.Setup() if err != nil { return nil, fmt.Errorf("cannot init config: %w", err) } } // Create pipe to synchronize when sandbox process has been booted. clientSyncFile, sandboxSyncFile, err := os.Pipe() if err != nil { return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err) } defer clientSyncFile.Close() // Create the sandbox process. err = s.createSandboxProcess(conf, args, sandboxSyncFile) // sandboxSyncFile has to be closed to be able to detect when the sandbox // process exits unexpectedly. sandboxSyncFile.Close() if err != nil { return nil, fmt.Errorf("cannot create sandbox process: %w", err) } // Wait until the sandbox has booted. 
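	// The sandbox is expected to write a single byte on its end of the pipe
	// once it has finished booting; if it exits before doing so, the pipe is
	// closed and the read below returns EOF, which is reported as a startup
	// failure.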
b := make([]byte, 1) if l, err := clientSyncFile.Read(b); err != nil || l != 1 { err := fmt.Errorf("waiting for sandbox to start: %v", err) // If the sandbox failed to start, it may be because the binary // permissions were incorrect. Check the bits and return a more helpful // error message. // // NOTE: The error message is checked because error types are lost over // rpc calls. if strings.Contains(err.Error(), io.EOF.Error()) { if permsErr := checkBinaryPermissions(conf); permsErr != nil { return nil, fmt.Errorf("%v: %v", err, permsErr) } } return nil, fmt.Errorf("cannot read client sync file: %w", err) } if conf.MetricServer != "" { // The control server is up and the sandbox was configured to export metrics. // We must gather data about registered metrics prior to any process starting in the sandbox. log.Debugf("Getting metric registration information from sandbox %q", s.ID) var registeredMetrics control.MetricsRegistrationResponse if err := s.call(boot.MetricsGetRegistered, nil, &registeredMetrics); err != nil { return nil, fmt.Errorf("cannot get registered metrics: %v", err) } s.RegisteredMetrics = registeredMetrics.RegisteredMetrics } c.Release() return s, nil } // CreateSubcontainer creates a container inside the sandbox. func (s *Sandbox) CreateSubcontainer(conf *config.Config, cid string, tty *os.File) error { log.Debugf("Create sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load()) var files []*os.File if tty != nil { files = []*os.File{tty} } if err := s.configureStdios(conf, files); err != nil { return err } args := boot.CreateArgs{ CID: cid, FilePayload: urpc.FilePayload{Files: files}, } if err := s.call(boot.ContMgrCreateSubcontainer, &args, nil); err != nil { return fmt.Errorf("creating sub-container %q: %w", cid, err) } return nil } // StartRoot starts running the root container process inside the sandbox. func (s *Sandbox) StartRoot(conf *config.Config) error { pid := s.Pid.load() log.Debugf("Start root sandbox %q, PID: %d", s.ID, pid) conn, err := s.sandboxConnect() if err != nil { return err } defer conn.Close() // Configure the network. if err := setupNetwork(conn, pid, conf); err != nil { return fmt.Errorf("setting up network: %w", err) } // Send a message to the sandbox control server to start the root container. if err := conn.Call(boot.ContMgrRootContainerStart, &s.ID, nil); err != nil { return fmt.Errorf("starting root container: %w", err) } return nil } // StartSubcontainer starts running a sub-container inside the sandbox. func (s *Sandbox) StartSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, goferFilestores []*os.File, devIOFile *os.File, goferConfs []boot.GoferMountConf) error { log.Debugf("Start sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load()) if err := s.configureStdios(conf, stdios); err != nil { return err } s.fixPidns(spec) // The payload contains (in this specific order): // * stdin/stdout/stderr (optional: only present when not using TTY) // * The subcontainer's gofer filestore files (optional) // * The subcontainer's dev gofer file (optional) // * Gofer files. payload := urpc.FilePayload{} payload.Files = append(payload.Files, stdios...) payload.Files = append(payload.Files, goferFilestores...) if devIOFile != nil { payload.Files = append(payload.Files, devIOFile) } payload.Files = append(payload.Files, goferFiles...) // Start running the container.
args := boot.StartArgs{ Spec: spec, Conf: conf, CID: cid, NumGoferFilestoreFDs: len(goferFilestores), IsDevIoFilePresent: devIOFile != nil, GoferMountConfs: goferConfs, FilePayload: payload, } if err := s.call(boot.ContMgrStartSubcontainer, &args, nil); err != nil { return fmt.Errorf("starting sub-container %v: %w", spec.Process.Args, err) } return nil } // Restore sends the restore call for a container in the sandbox. func (s *Sandbox) Restore(conf *config.Config, cid string, imagePath string, direct bool) error { log.Debugf("Restore sandbox %q from path %q", s.ID, imagePath) stateFileName := path.Join(imagePath, boot.CheckpointStateFileName) sf, err := os.Open(stateFileName) if err != nil { return fmt.Errorf("opening state file %q failed: %v", stateFileName, err) } defer sf.Close() opt := boot.RestoreOpts{ FilePayload: urpc.FilePayload{ Files: []*os.File{sf}, }, } // If the pages file exists, we must pass it in. pagesFileName := path.Join(imagePath, boot.CheckpointPagesFileName) pagesReadFlags := os.O_RDONLY if direct { // The contents are page-aligned, so it can be opened with O_DIRECT. pagesReadFlags |= syscall.O_DIRECT } if pf, err := os.OpenFile(pagesFileName, pagesReadFlags, 0); err == nil { defer pf.Close() pagesMetadataFileName := path.Join(imagePath, boot.CheckpointPagesMetadataFileName) pmf, err := os.Open(pagesMetadataFileName) if err != nil { return fmt.Errorf("opening restore image file %q failed: %v", pagesMetadataFileName, err) } defer pmf.Close() opt.HavePagesFile = true opt.FilePayload.Files = append(opt.FilePayload.Files, pmf, pf) log.Infof("Found page files for sandbox %q. Page metadata: %q, pages: %q", s.ID, pagesMetadataFileName, pagesFileName) } else if !os.IsNotExist(err) { return fmt.Errorf("opening restore pages file %q failed: %v", pagesFileName, err) } else { log.Infof("Using single checkpoint file for sandbox %q", s.ID) } // If the platform needs a device FD we must pass it in. if deviceFile, err := deviceFileForPlatform(conf.Platform, conf.PlatformDevicePath); err != nil { return err } else if deviceFile != nil { defer deviceFile.Close() opt.HaveDeviceFile = true opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile.ReleaseToFile("device file")) } conn, err := s.sandboxConnect() if err != nil { return err } defer conn.Close() // Configure the network. if err := setupNetwork(conn, s.Pid.load(), conf); err != nil { return fmt.Errorf("setting up network: %v", err) } // Restore the container and start the root container. if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil { return fmt.Errorf("restoring container %q: %v", cid, err) } return nil } // RestoreSubcontainer sends the restore call for a sub-container in the sandbox. func (s *Sandbox) RestoreSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, goferFilestoreFiles []*os.File, devIOFile *os.File, goferMountConf []boot.GoferMountConf) error { log.Debugf("Restore sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load()) if err := s.configureStdios(conf, stdios); err != nil { return err } s.fixPidns(spec) // The payload contains (in this specific order): // * stdin/stdout/stderr (optional: only present when not using TTY) // * The subcontainer's overlay filestore files (optional: only present when // host file backed overlay is configured) // * Gofer files. payload := urpc.FilePayload{} payload.Files = append(payload.Files, stdios...) payload.Files = append(payload.Files, goferFilestoreFiles...) 
if devIOFile != nil { payload.Files = append(payload.Files, devIOFile) } payload.Files = append(payload.Files, goferFiles...) // Start running the container. args := boot.StartArgs{ Spec: spec, Conf: conf, CID: cid, NumGoferFilestoreFDs: len(goferFilestoreFiles), IsDevIoFilePresent: devIOFile != nil, GoferMountConfs: goferMountConf, FilePayload: payload, } if err := s.call(boot.ContMgrRestoreSubcontainer, &args, nil); err != nil { return fmt.Errorf("starting sub-container %v: %w", spec.Process.Args, err) } return nil } // Processes retrieves the list of processes and associated metadata for a // given container in this sandbox. func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID) var pl []*control.Process if err := s.call(boot.ContMgrProcesses, &cid, &pl); err != nil { return nil, fmt.Errorf("retrieving process data from sandbox: %v", err) } return pl, nil } // CreateTraceSession creates a new trace session. func (s *Sandbox) CreateTraceSession(config *seccheck.SessionConfig, force bool) error { log.Debugf("Creating trace session in sandbox %q", s.ID) sinkFiles, err := seccheck.SetupSinks(config.Sinks) if err != nil { return err } defer func() { for _, f := range sinkFiles { _ = f.Close() } }() arg := boot.CreateTraceSessionArgs{ Config: *config, Force: force, FilePayload: urpc.FilePayload{ Files: sinkFiles, }, } if err := s.call(boot.ContMgrCreateTraceSession, &arg, nil); err != nil { return fmt.Errorf("creating trace session: %w", err) } return nil } // DeleteTraceSession deletes an existing trace session. func (s *Sandbox) DeleteTraceSession(name string) error { log.Debugf("Deleting trace session %q in sandbox %q", name, s.ID) if err := s.call(boot.ContMgrDeleteTraceSession, name, nil); err != nil { return fmt.Errorf("deleting trace session: %w", err) } return nil } // ListTraceSessions lists all trace sessions. func (s *Sandbox) ListTraceSessions() ([]seccheck.SessionConfig, error) { log.Debugf("Listing trace sessions in sandbox %q", s.ID) var sessions []seccheck.SessionConfig if err := s.call(boot.ContMgrListTraceSessions, nil, &sessions); err != nil { return nil, fmt.Errorf("listing trace session: %w", err) } return sessions, nil } // ProcfsDump collects and returns a procfs dump for the sandbox. func (s *Sandbox) ProcfsDump() ([]procfs.ProcessProcfsDump, error) { log.Debugf("Procfs dump %q", s.ID) var procfsDump []procfs.ProcessProcfsDump if err := s.call(boot.ContMgrProcfsDump, nil, &procfsDump); err != nil { return nil, fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err) } return procfsDump, nil } // NewCGroup returns the sandbox's Cgroup, or an error if it does not have one. func (s *Sandbox) NewCGroup() (cgroup.Cgroup, error) { return cgroup.NewFromPid(s.Pid.load(), false /* useSystemd */) } // Execute runs the specified command in the container. It returns the PID of // the newly created process. func (s *Sandbox) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) { log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID) // Stdios are those files which have an FD <= 2 in the process. We do not // want the ownership of other files to be changed by configureStdios. 
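// args.GuestFDs[i] holds the in-sandbox FD number for args.Files[i], so only entries that map to guest FDs 0-2 are collected here.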
var stdios []*os.File for i, fd := range args.GuestFDs { if fd > 2 || i >= len(args.Files) { continue } stdios = append(stdios, args.Files[i]) } if err := s.configureStdios(conf, stdios); err != nil { return 0, err } // Send a message to the sandbox control server to start the container. var pid int32 if err := s.call(boot.ContMgrExecuteAsync, args, &pid); err != nil { return 0, fmt.Errorf("executing command %q in sandbox: %w", args, err) } return pid, nil } // Event retrieves stats about the sandbox such as memory and CPU utilization. func (s *Sandbox) Event(cid string) (*boot.EventOut, error) { log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID) var e boot.EventOut if err := s.call(boot.ContMgrEvent, &cid, &e); err != nil { return nil, fmt.Errorf("retrieving event data from sandbox: %w", err) } return &e, nil } // PortForward starts port forwarding to the sandbox. func (s *Sandbox) PortForward(opts *boot.PortForwardOpts) error { log.Debugf("Requesting port forward for container %q in sandbox %q: %+v", opts.ContainerID, s.ID, opts) conn, err := s.sandboxConnect() if err != nil { return err } defer conn.Close() if err := conn.Call(boot.ContMgrPortForward, opts, nil); err != nil { return fmt.Errorf("port forwarding to sandbox: %v", err) } return nil } func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { log.Debugf("Connecting to sandbox %q", s.ID) path := s.ControlSocketPath if len(path) >= linux.UnixPathMax { // This is not an abstract socket path. It is a filesystem path. // UDS connect fails when the len(socket path) >= UNIX_PATH_MAX. Instead // open the socket using open(2) and use /proc to refer to the open FD. sockFD, err := unix.Open(path, unix.O_PATH, 0) if err != nil { return nil, fmt.Errorf("failed to open socket at %q", path) } defer unix.Close(sockFD) path = filepath.Join("/proc/self/fd", fmt.Sprintf("%d", sockFD)) } conn, err := client.ConnectTo(path) if err != nil { return nil, s.connError(err) } return conn, nil } func (s *Sandbox) call(method string, arg, result any) error { conn, err := s.sandboxConnect() if err != nil { return err } defer conn.Close() return conn.Call(method, arg, result) } func (s *Sandbox) connError(err error) error { return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid.load(), err) } // createSandboxProcess starts the sandbox as a subprocess by running the "boot" // command, passing in the bundle dir. func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error { // Ensure we don't leak FDs to the sandbox process. if err := SetCloExeOnAllFDs(); err != nil { return fmt.Errorf("setting CLOEXEC on all FDs: %w", err) } donations := donation.Agency{} defer donations.Close() // pgalloc.MemoryFile (which provides application memory) sometimes briefly // mlock(2)s ranges of memory in order to fault in a large number of pages at // a time. Try to make RLIMIT_MEMLOCK unlimited so that it can do so. runsc // expects to run in a memory cgroup that limits its memory usage as // required. // This needs to be done before exec'ing `runsc boot`, as that subcommand // runs as an unprivileged user that will not be able to call `setrlimit` // by itself. Calling `setrlimit` here will have the side-effect of setting // the limit on the currently-running `runsc` process as well, but that // should be OK too. 
var rlim unix.Rlimit if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil { log.Warningf("Failed to get RLIMIT_MEMLOCK: %v", err) } else if rlim.Cur != unix.RLIM_INFINITY || rlim.Max != unix.RLIM_INFINITY { rlim.Cur = unix.RLIM_INFINITY rlim.Max = unix.RLIM_INFINITY if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil { // We may not have CAP_SYS_RESOURCE, so this failure may be expected. log.Infof("Failed to set RLIMIT_MEMLOCK: %v", err) } } // // These flags must come BEFORE the "boot" command in cmd.Args. // // Open the log files to pass to the sandbox as FDs. if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil { return err } test := "" if len(conf.TestOnlyTestNameEnv) != 0 { // Fetch test name if one is provided and the test only flag was set. if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok { test = t } } if specutils.IsDebugCommand(conf, "boot") { if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "boot", test, s.StartTime); err != nil { return err } } if err := donations.DonateDebugLogFile("panic-log-fd", conf.PanicLog, "panic", test, s.StartTime); err != nil { return err } covFilename := conf.CoverageReport if covFilename == "" { covFilename = os.Getenv("GO_COVERAGE_FILE") } if covFilename != "" && coverage.Available() { if err := donations.DonateDebugLogFile("coverage-fd", covFilename, "cov", test, s.StartTime); err != nil { return err } } // Relay all the config flags to the sandbox process. cmd := exec.Command(specutils.ExePath, conf.ToFlags()...) cmd.SysProcAttr = &unix.SysProcAttr{ // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT // when re-parented. Setsid: true, } // Set Args[0] to make it easier to spot the sandbox process. Otherwise it's // shown as `exe`. cmd.Args[0] = "runsc-sandbox" // Transfer FDs that need to be present before the "boot" command. // Start at 3 because 0, 1, and 2 are taken by stdin/out/err. nextFD := donations.Transfer(cmd, 3) // Add the "boot" command to the args. // // All flags after this must be for the boot command. cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir) // Clear environment variables, unless --TESTONLY-unsafe-nonroot is set. if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { // Setting cmd.Env = nil causes cmd to inherit the current process's env. cmd.Env = []string{} // runsc-race with glibc needs to disable rseq. glibcTunables := os.Getenv("GLIBC_TUNABLES") if glibcTunables != "" { cmd.Env = append(cmd.Env, fmt.Sprintf("GLIBC_TUNABLES=%s", glibcTunables)) } } // If there is a gofer, send all socket ends to the sandbox. donations.DonateAndClose("io-fds", args.IOFiles...) donations.DonateAndClose("dev-io-fd", args.DevIOFile) donations.DonateAndClose("gofer-filestore-fds", args.GoferFilestoreFiles...)
donations.DonateAndClose("mounts-fd", args.MountsFile) donations.Donate("start-sync-fd", startSyncFile) if err := donations.OpenAndDonate("user-log-fd", args.UserLog, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil { return err } const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC profile.UpdatePaths(conf, s.StartTime) if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock, profFlags); err != nil { return err } if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU, profFlags); err != nil { return err } if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap, profFlags); err != nil { return err } if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex, profFlags); err != nil { return err } if err := donations.OpenAndDonate("trace-fd", conf.TraceFile, profFlags); err != nil { return err } // Pass gofer mount configs. cmd.Args = append(cmd.Args, "--gofer-mount-confs="+args.GoferMountConfs.String()) // Create a socket for the control server and donate it to the sandbox. controlSocketPath, sockFD, err := createControlSocket(conf.RootDir, s.ID) if err != nil { return fmt.Errorf("failed to create control socket: %v", err) } s.ControlSocketPath = controlSocketPath log.Infof("Control socket path: %q", s.ControlSocketPath) donations.DonateAndClose("controller-fd", os.NewFile(uintptr(sockFD), "control_server_socket")) specFile, err := specutils.OpenSpec(args.BundleDir) if err != nil { return fmt.Errorf("cannot open spec file in bundle dir %v: %w", args.BundleDir, err) } donations.DonateAndClose("spec-fd", specFile) if err := donations.OpenAndDonate("pod-init-config-fd", conf.PodInitConfig, os.O_RDONLY); err != nil { return err } donations.DonateAndClose("sink-fds", args.SinkFiles...) if len(conf.TestOnlyAutosaveImagePath) != 0 { files, err := createSaveFiles(conf.TestOnlyAutosaveImagePath, false, statefile.CompressionLevelFlateBestSpeed) if err != nil { return fmt.Errorf("failed to create auto save files: %w", err) } donations.DonateAndClose("save-fds", files...) } if err := createSandboxProcessExtra(conf, args, &donations); err != nil { return err } gPlatform, err := platform.Lookup(conf.Platform) if err != nil { return fmt.Errorf("cannot look up platform: %w", err) } if deviceFile, err := gPlatform.OpenDevice(conf.PlatformDevicePath); err != nil { return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err) } else if deviceFile != nil { donations.DonateAndClose("device-fd", deviceFile.ReleaseToFile("device file")) } // TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff // isn't set. if conf.Platform == "kvm" { cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1") } // nss is the set of namespaces to join or create before starting the sandbox // process. Mount, IPC and UTS namespaces from the host are not used as they // are virtualized inside the sandbox. Be paranoid and run inside an empty // namespace for these. Don't unshare cgroup because sandbox is added to a // cgroup in the caller's namespace. log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces") nss := []specs.LinuxNamespace{ {Type: specs.IPCNamespace}, {Type: specs.MountNamespace}, {Type: specs.UTSNamespace}, } if gPlatform.Requirements().RequiresCurrentPIDNS { // TODO(b/75837838): Also set a new PID namespace so that we limit // access to other host processes. 
log.Infof("Sandbox will be started in the current PID namespace") } else { log.Infof("Sandbox will be started in a new PID namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) cmd.Args = append(cmd.Args, "--pidns=true") } if specutils.NVProxyEnabled(args.Spec, conf) { version, err := getNvproxyDriverVersion(conf) if err != nil { return fmt.Errorf("failed to get Nvidia driver version: %w", err) } cmd.Args = append(cmd.Args, "--nvidia-driver-version="+version) } // Joins the network namespace if network is enabled. the sandbox talks // directly to the host network, which may have been configured in the // namespace. if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone { log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) nss = append(nss, ns) } else if conf.Network == config.NetworkHost { log.Infof("Sandbox will be started in the host network namespace") } else { log.Infof("Sandbox will be started in new network namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) } // These are set to the uid/gid that the sandbox process will use. May be // overriden below. s.UID = os.Getuid() s.GID = os.Getgid() // User namespace depends on the network type or whether access to the host // filesystem is required. These features require to run inside the user // namespace specified in the spec or the current namespace if none is // configured. rootlessEUID := unix.Geteuid() != 0 setUserMappings := false if conf.Network == config.NetworkHost || conf.DirectFS { if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok { log.Infof("Sandbox will be started in container's user namespace: %+v", userns) nss = append(nss, userns) if rootlessEUID { syncFile, err := ConfigureCmdForRootless(cmd, &donations) if err != nil { return err } defer syncFile.Close() setUserMappings = true } else { specutils.SetUIDGIDMappings(cmd, args.Spec) // We need to set UID and GID to have capabilities in a new user namespace. cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} } } else { if rootlessEUID { return fmt.Errorf("unable to run a rootless container without userns") } log.Infof("Sandbox will be started in the current user namespace") } // When running in the caller's defined user namespace, apply the same // capabilities to the sandbox process to ensure it abides to the same // rules. cmd.Args = append(cmd.Args, "--apply-caps=true") // If we have CAP_SYS_ADMIN, we can create an empty chroot and // bind-mount the executable inside it. if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") } else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) || rootlessEUID { log.Infof("Sandbox will be started in minimal chroot") cmd.Args = append(cmd.Args, "--setup-root") } else { return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN") } } else { // If we have CAP_SETUID and CAP_SETGID, then we can also run // as user nobody. if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid()) log.Warningf("Running sandbox in test mode without chroot. 
This is only safe in tests!") } else if rootlessEUID || specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) { log.Infof("Sandbox will be started in new user namespace") nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) cmd.Args = append(cmd.Args, "--setup-root") const nobody = 65534 if rootlessEUID || conf.Rootless { log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid()) } else { // Map nobody in the new namespace to nobody in the parent namespace. s.UID = nobody s.GID = nobody } // Set credentials to run as user and group nobody. cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody} cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ { ContainerID: nobody, HostID: s.UID, Size: 1, }, } cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ { ContainerID: nobody, HostID: s.GID, Size: 1, }, } // A sandbox process will construct an empty root for itself, so it has // to have CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities. cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps, uintptr(capability.CAP_SYS_ADMIN), uintptr(capability.CAP_SYS_CHROOT), // CAP_SETPCAP is required to clear the bounding set. uintptr(capability.CAP_SETPCAP), ) } else { return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID") } } // The current process' stdio must be passed to the application via the // --stdio-fds flag. The stdio of the sandbox process itself must not // be connected to the same FDs, otherwise we risk leaking sandbox // errors to the application, so we set the sandbox stdio to nil, // causing them to read/write from the null device. cmd.Stdin = nil cmd.Stdout = nil cmd.Stderr = nil var stdios [3]*os.File // If the console control socket file is provided, then create a new // pty master/replica pair and set the TTY on the sandbox process. if args.Spec.Process.Terminal && args.ConsoleSocket != "" { // console.NewWithSocket will send the master on the given // socket, and return the replica. tty, err := console.NewWithSocket(args.ConsoleSocket) if err != nil { return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err) } defer tty.Close() // Set the TTY as a controlling TTY on the sandbox process. cmd.SysProcAttr.Setctty = true // Inconveniently, the Ctty must be the FD in the *child* process's FD // table. So transfer all files we have so far and make sure the next file // added to donations is stdin. // // See https://github.com/golang/go/issues/29458. nextFD = donations.Transfer(cmd, nextFD) cmd.SysProcAttr.Ctty = nextFD // Pass the tty as all stdio fds to sandbox. stdios[0] = tty stdios[1] = tty stdios[2] = tty if conf.Debug { // If debugging, send the boot process stdio to the // TTY, so that it is easier to find. cmd.Stdin = tty cmd.Stdout = tty cmd.Stderr = tty } } else { // If not using a console, pass our current stdio as the // container stdio via flags. stdios[0] = os.Stdin stdios[1] = os.Stdout stdios[2] = os.Stderr if conf.Debug { // If debugging, send the boot process stdio to the // this process' stdio, so that is is easier to find. cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr } } if err := s.configureStdios(conf, stdios[:]); err != nil { return fmt.Errorf("configuring stdios: %w", err) } // Note: this must be done right after "cmd.SysProcAttr.Ctty" is set above // because it relies on stdin being the next FD donated. 
donations.Donate("stdio-fds", stdios[:]...) if conf.ProfilingMetricsLog == "-" { donations.Donate("profiling-metrics-fd", stdios[1]) cmd.Args = append(cmd.Args, "--profiling-metrics-fd-lossy=true") } else if conf.ProfilingMetricsLog != "" { if err := donations.DonateDebugLogFile("profiling-metrics-fd", conf.ProfilingMetricsLog, "metrics", test, s.StartTime); err != nil { return err } cmd.Args = append(cmd.Args, "--profiling-metrics-fd-lossy=false") } totalSysMem, err := totalSystemMemory() if err != nil { return err } cmd.Args = append(cmd.Args, "--total-host-memory", strconv.FormatUint(totalSysMem, 10)) mem := totalSysMem if s.CgroupJSON.Cgroup != nil { cpuNum, err := s.CgroupJSON.Cgroup.NumCPU() if err != nil { return fmt.Errorf("getting cpu count from cgroups: %v", err) } if conf.CPUNumFromQuota { // Dropping below 2 CPUs can trigger application to disable // locks that can lead do hard to debug errors, so just // leaving two cores as reasonable default. const minCPUs = 2 quota, err := s.CgroupJSON.Cgroup.CPUQuota() if err != nil { return fmt.Errorf("getting cpu quota from cgroups: %v", err) } if n := int(math.Ceil(quota)); n > 0 { if n < minCPUs { n = minCPUs } if n < cpuNum { // Only lower the cpu number. cpuNum = n } } } cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum)) memLimit, err := s.CgroupJSON.Cgroup.MemoryLimit() if err != nil { return fmt.Errorf("getting memory limit from cgroups: %v", err) } if memLimit < mem { mem = memLimit } } cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10)) if args.Attached { // Kill sandbox if parent process exits in attached mode. cmd.SysProcAttr.Pdeathsig = unix.SIGKILL // Tells boot that any process it creates must have pdeathsig set. cmd.Args = append(cmd.Args, "--attached") } if args.ExecFile != nil { donations.Donate("exec-fd", args.ExecFile) } nextFD = donations.Transfer(cmd, nextFD) _ = donation.DonateAndTransferCustomFiles(cmd, nextFD, args.PassFiles) // Add container ID as the last argument. cmd.Args = append(cmd.Args, s.ID) donation.LogDonations(cmd) log.Debugf("Starting sandbox: %s %v", cmd.Path, cmd.Args) log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr) if err := specutils.StartInNS(cmd, nss); err != nil { err := fmt.Errorf("starting sandbox: %v", err) // If the sandbox failed to start, it may be because the binary // permissions were incorrect. Check the bits and return a more helpful // error message. // // NOTE: The error message is checked because error types are lost over // rpc calls. if strings.Contains(err.Error(), unix.EACCES.Error()) { if permsErr := checkBinaryPermissions(conf); permsErr != nil { return fmt.Errorf("%v: %v", err, permsErr) } } return err } s.OriginalOOMScoreAdj, err = specutils.GetOOMScoreAdj(cmd.Process.Pid) if err != nil { return err } if setUserMappings { if err := SetUserMappings(args.Spec, cmd.Process.Pid); err != nil { return err } } s.child = true s.Pid.store(cmd.Process.Pid) log.Infof("Sandbox started, PID: %d", cmd.Process.Pid) return nil } // Wait waits for the containerized process to exit, and returns its WaitStatus. func (s *Sandbox) Wait(cid string) (unix.WaitStatus, error) { log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) if conn, err := s.sandboxConnect(); err != nil { // The sandbox may have exited while before we had a chance to wait on it. // There is nothing we can do for subcontainers. For the init container, we // can try to get the sandbox exit code. 
if !s.IsRootContainer(cid) { return unix.WaitStatus(0), err } log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) } else { defer conn.Close() // Try the Wait RPC to the sandbox. var ws unix.WaitStatus err = conn.Call(boot.ContMgrWait, &cid, &ws) conn.Close() if err == nil { if s.IsRootContainer(cid) { if err := s.waitForStopped(); err != nil { return unix.WaitStatus(0), err } } // It worked! return ws, nil } // See comment above. if !s.IsRootContainer(cid) { return unix.WaitStatus(0), err } // The sandbox may have exited after we connected, but before // or during the Wait RPC. log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) } // The sandbox may have already exited, or exited while handling the Wait RPC. // The best we can do is ask Linux what the sandbox exit status was, since in // most cases that will be the same as the container exit status. if err := s.waitForStopped(); err != nil { return unix.WaitStatus(0), err } if !s.child { return unix.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable") } s.statusMu.Lock() defer s.statusMu.Unlock() return s.status, nil } // WaitPID waits for process 'pid' in the container's sandbox and returns its // WaitStatus. func (s *Sandbox) WaitPID(cid string, pid int32) (unix.WaitStatus, error) { log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) var ws unix.WaitStatus args := &boot.WaitPIDArgs{ PID: pid, CID: cid, } if err := s.call(boot.ContMgrWaitPID, args, &ws); err != nil { return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %w", pid, s.ID, err) } return ws, nil } // WaitCheckpoint waits for the Kernel to have been successfully checkpointed // n-1 times, then waits for either the n-th successful checkpoint (in which // case it returns nil) or any number of failed checkpoints (in which case it // returns an error returned by any such failure). func (s *Sandbox) WaitCheckpoint(n uint32) error { log.Debugf("Waiting for %d-th checkpoint to complete in sandbox %q", n, s.ID) return s.call(boot.ContMgrWaitCheckpoint, &n, nil) } // IsRootContainer returns true if the specified container ID belongs to the // root container. func (s *Sandbox) IsRootContainer(cid string) bool { return s.ID == cid } // Destroy frees all resources associated with the sandbox. It fails fast and // is idempotent. func (s *Sandbox) destroy() error { log.Debugf("Destroying sandbox %q", s.ID) // Only delete the control file if it exists. if len(s.ControlSocketPath) > 0 { if err := os.Remove(s.ControlSocketPath); err != nil { log.Warningf("failed to delete control socket file %q: %v", s.ControlSocketPath, err) } } pid := s.Pid.load() if pid != 0 { log.Debugf("Killing sandbox %q", s.ID) if err := unix.Kill(pid, unix.SIGKILL); err != nil && err != unix.ESRCH { return fmt.Errorf("killing sandbox %q PID %q: %w", s.ID, pid, err) } if err := s.waitForStopped(); err != nil { return fmt.Errorf("waiting sandbox %q stop: %w", s.ID, err) } } return nil } // SignalContainer sends the signal to a container in the sandbox. If all is // true and signal is SIGKILL, then waits for all processes to exit before // returning. 
func (s *Sandbox) SignalContainer(cid string, sig unix.Signal, all bool) error { log.Debugf("Signal sandbox %q", s.ID) mode := boot.DeliverToProcess if all { mode = boot.DeliverToAllProcesses } args := boot.SignalArgs{ CID: cid, Signo: int32(sig), Mode: mode, } if err := s.call(boot.ContMgrSignal, &args, nil); err != nil { return fmt.Errorf("signaling container %q: %w", cid, err) } return nil } // SignalProcess sends the signal to a particular process in the container. If // fgProcess is true, then the signal is sent to the foreground process group // in the same session that PID belongs to. This is only valid if the process // is attached to a host TTY. func (s *Sandbox) SignalProcess(cid string, pid int32, sig unix.Signal, fgProcess bool) error { log.Debugf("Signal sandbox %q", s.ID) mode := boot.DeliverToProcess if fgProcess { mode = boot.DeliverToForegroundProcessGroup } args := boot.SignalArgs{ CID: cid, Signo: int32(sig), PID: pid, Mode: mode, } if err := s.call(boot.ContMgrSignal, &args, nil); err != nil { return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err) } return nil } // Checkpoint sends the checkpoint call for a container in the sandbox. // The statefile will be written to f. func (s *Sandbox) Checkpoint(cid string, imagePath string, direct bool, sfOpts statefile.Options, mfOpts pgalloc.SaveOpts) error { log.Debugf("Checkpoint sandbox %q, statefile options %+v, MemoryFile options %+v", s.ID, sfOpts, mfOpts) files, err := createSaveFiles(imagePath, direct, sfOpts.Compression) if err != nil { return err } defer func() { for _, f := range files { _ = f.Close() } }() opt := control.SaveOpts{ Metadata: sfOpts.WriteToMetadata(map[string]string{}), MemoryFileSaveOpts: mfOpts, FilePayload: urpc.FilePayload{ Files: files, }, HavePagesFile: len(files) > 1, Resume: sfOpts.Resume, } if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil { return fmt.Errorf("checkpointing container %q: %w", cid, err) } return nil } // createSaveFiles creates the files used by checkpoint to save the state. They are returned in // the following order: sentry state, page metadata, page file. This is the same order expected by // RPCs and argument passing to the sandbox. func createSaveFiles(path string, direct bool, compression statefile.CompressionLevel) ([]*os.File, error) { var files []*os.File stateFilePath := filepath.Join(path, boot.CheckpointStateFileName) f, err := os.OpenFile(stateFilePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) if err != nil { return nil, fmt.Errorf("creating checkpoint state file %q: %w", stateFilePath, err) } files = append(files, f) // When there is no compression, MemoryFile contents are page-aligned. // It is beneficial to store them separately so certain optimizations can be // applied during restore. See Restore(). if compression == statefile.CompressionLevelNone { pagesMetadataFilePath := filepath.Join(path, boot.CheckpointPagesMetadataFileName) f, err = os.OpenFile(pagesMetadataFilePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) if err != nil { return nil, fmt.Errorf("creating checkpoint pages metadata file %q: %w", pagesMetadataFilePath, err) } files = append(files, f) pagesFilePath := filepath.Join(path, boot.CheckpointPagesFileName) pagesWriteFlags := os.O_CREATE | os.O_EXCL | os.O_RDWR if direct { // The writes will be page-aligned, so it can be opened with O_DIRECT. 
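// Note that O_DIRECT also constrains I/O buffer and offset alignment (typically to the filesystem block size); this works here because, as noted above, the uncompressed MemoryFile contents are written page-aligned.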
pagesWriteFlags |= syscall.O_DIRECT } f, err := os.OpenFile(pagesFilePath, pagesWriteFlags, 0644) if err != nil { return nil, fmt.Errorf("creating checkpoint pages file %q: %w", pagesFilePath, err) } files = append(files, f) } return files, nil } // Pause sends the pause call for a container in the sandbox. func (s *Sandbox) Pause(cid string) error { log.Debugf("Pause sandbox %q", s.ID) if err := s.call(boot.ContMgrPause, nil, nil); err != nil { return fmt.Errorf("pausing container %q: %w", cid, err) } return nil } // Resume sends the resume call for a container in the sandbox. func (s *Sandbox) Resume(cid string) error { log.Debugf("Resume sandbox %q", s.ID) if err := s.call(boot.ContMgrResume, nil, nil); err != nil { return fmt.Errorf("resuming container %q: %w", cid, err) } return nil } // Usage sends the collect call for a container in the sandbox. func (s *Sandbox) Usage(Full bool) (control.MemoryUsage, error) { log.Debugf("Usage sandbox %q", s.ID) opts := control.MemoryUsageOpts{Full: Full} var m control.MemoryUsage if err := s.call(boot.UsageCollect, &opts, &m); err != nil { return control.MemoryUsage{}, fmt.Errorf("collecting usage: %w", err) } return m, nil } // UsageFD sends the usagefd call for a container in the sandbox. func (s *Sandbox) UsageFD() (*control.MemoryUsageRecord, error) { log.Debugf("Usage sandbox %q", s.ID) opts := control.MemoryUsageFileOpts{Version: 1} var m control.MemoryUsageFile if err := s.call(boot.UsageUsageFD, &opts, &m); err != nil { return nil, fmt.Errorf("collecting usage FD: %w", err) } if len(m.FilePayload.Files) != 2 { return nil, fmt.Errorf("wants exactly two fds") } return control.NewMemoryUsageRecord(*m.FilePayload.Files[0], *m.FilePayload.Files[1]) } // GetRegisteredMetrics returns metric registration data from the sandbox. // This data is meant to be used as a way to sanity-check any exported metrics data during the // lifetime of the sandbox in order to avoid a compromised sandbox from being able to produce // bogus metrics. // This returns an error if the sandbox has not requested instrumentation during creation time. func (s *Sandbox) GetRegisteredMetrics() (*metricpb.MetricRegistration, error) { if s.RegisteredMetrics == nil { return nil, errors.New("sandbox did not request instrumentation when it was created") } return s.RegisteredMetrics, nil } // ExportMetrics returns a snapshot of metric values from the sandbox in Prometheus format. func (s *Sandbox) ExportMetrics(opts control.MetricsExportOpts) (*prometheus.Snapshot, error) { log.Debugf("Metrics export sandbox %q", s.ID) var data control.MetricsExportData if err := s.call(boot.MetricsExport, &opts, &data); err != nil { return nil, err } // Since we do not trust the output of the sandbox as-is, double-check that the options were // respected. if err := opts.Verify(&data); err != nil { return nil, err } return data.Snapshot, nil } // IsRunning returns true if the sandbox or gofer process is running. func (s *Sandbox) IsRunning() bool { pid := s.Pid.load() if pid == 0 { return false } // Send a signal 0 to the sandbox process. If it succeeds, the sandbox // process is running. return unix.Kill(pid, 0) == nil } // Stacks collects and returns all stacks for the sandbox. func (s *Sandbox) Stacks() (string, error) { log.Debugf("Stacks sandbox %q", s.ID) var stacks string if err := s.call(boot.DebugStacks, nil, &stacks); err != nil { return "", fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err) } return stacks, nil } // HeapProfile writes a heap profile to the given file. 
func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error { log.Debugf("Heap profile %q", s.ID) opts := control.HeapProfileOpts{ FilePayload: urpc.FilePayload{Files: []*os.File{f}}, Delay: delay, } return s.call(boot.ProfileHeap, &opts, nil) } // CPUProfile collects a CPU profile. func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error { log.Debugf("CPU profile %q", s.ID) opts := control.CPUProfileOpts{ FilePayload: urpc.FilePayload{Files: []*os.File{f}}, Duration: duration, } return s.call(boot.ProfileCPU, &opts, nil) } // BlockProfile writes a block profile to the given file. func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error { log.Debugf("Block profile %q", s.ID) opts := control.BlockProfileOpts{ FilePayload: urpc.FilePayload{Files: []*os.File{f}}, Duration: duration, } return s.call(boot.ProfileBlock, &opts, nil) } // MutexProfile writes a mutex profile to the given file. func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error { log.Debugf("Mutex profile %q", s.ID) opts := control.MutexProfileOpts{ FilePayload: urpc.FilePayload{Files: []*os.File{f}}, Duration: duration, } return s.call(boot.ProfileMutex, &opts, nil) } // Trace collects an execution trace. func (s *Sandbox) Trace(f *os.File, duration time.Duration) error { log.Debugf("Trace %q", s.ID) opts := control.TraceProfileOpts{ FilePayload: urpc.FilePayload{Files: []*os.File{f}}, Duration: duration, } return s.call(boot.ProfileTrace, &opts, nil) } // ChangeLogging changes logging options. func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error { log.Debugf("Change logging start %q", s.ID) if err := s.call(boot.LoggingChange, &args, nil); err != nil { return fmt.Errorf("changing sandbox %q logging: %w", s.ID, err) } return nil } // DestroyContainer destroys the given container. If it is the root container, // then the entire sandbox is destroyed. func (s *Sandbox) DestroyContainer(cid string) error { if err := s.destroyContainer(cid); err != nil { // If the sandbox isn't running, the container has already been destroyed, // ignore the error in this case. if s.IsRunning() { return err } } return nil } func (s *Sandbox) destroyContainer(cid string) error { if s.IsRootContainer(cid) { log.Debugf("Destroying root container by destroying sandbox, cid: %s", cid) return s.destroy() } log.Debugf("Destroying container, cid: %s, sandbox: %s", cid, s.ID) if err := s.call(boot.ContMgrDestroySubcontainer, &cid, nil); err != nil { return fmt.Errorf("destroying container %q: %w", cid, err) } return nil } // waitForStopped waits for the sandbox to actually stop. // This should only be called when the sandbox is known to be shutting down. func (s *Sandbox) waitForStopped() error { const waitTimeout = 2 * time.Minute if s.child { s.statusMu.Lock() defer s.statusMu.Unlock() pid := s.Pid.load() if pid == 0 { return nil } // The sandbox process is a child of the current process, // so we can wait on it to terminate and collect its zombie. if _, err := unix.Wait4(int(pid), &s.status, 0, nil); err != nil { return fmt.Errorf("error waiting the sandbox process: %v", err) } s.Pid.store(0) return nil } ctx, cancel := context.WithTimeout(context.Background(), waitTimeout) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { if s.IsRunning() { return fmt.Errorf("sandbox is still running") } return nil } return backoff.Retry(op, b) } // configureStdios change stdios ownership to give access to the sandbox // process. 
This may be skipped depending on the configuration. func (s *Sandbox) configureStdios(conf *config.Config, stdios []*os.File) error { if conf.Rootless || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { // Cannot change ownership without CAP_CHOWN. return nil } if s.UID < 0 || s.GID < 0 { panic(fmt.Sprintf("sandbox UID/GID is not set: %d/%d", s.UID, s.GID)) } for _, file := range stdios { log.Debugf("Changing %q ownership to %d/%d", file.Name(), s.UID, s.GID) if err := file.Chown(s.UID, s.GID); err != nil { if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) { log.Warningf("can't change an owner of %s: %s", file.Name(), err) continue } return err } } return nil } // deviceFileForPlatform opens the device file for the given platform. If the // platform does not need a device file, then nil is returned. // devicePath may be empty to use a sane platform-specific default. func deviceFileForPlatform(name, devicePath string) (*fd.FD, error) { p, err := platform.Lookup(name) if err != nil { return nil, err } f, err := p.OpenDevice(devicePath) if err != nil { return nil, fmt.Errorf("opening device file for platform %q: %w", name, err) } return f, nil } // getNvproxyDriverVersion returns the NVIDIA driver ABI version to use by // nvproxy. func getNvproxyDriverVersion(conf *config.Config) (string, error) { switch conf.NVProxyDriverVersion { case "": return nvproxy.HostDriverVersion() case "latest": nvproxy.Init() return nvproxy.LatestDriver().String(), nil default: version, err := nvproxy.DriverVersionFrom(conf.NVProxyDriverVersion) return version.String(), err } } // checkBinaryPermissions verifies that the required binary bits are set on // the runsc executable. func checkBinaryPermissions(conf *config.Config) error { // All platforms need the other exe bit neededBits := os.FileMode(0001) if conf.Platform == "ptrace" { // Ptrace needs the other read bit neededBits |= os.FileMode(0004) } exePath, err := os.Executable() if err != nil { return fmt.Errorf("getting exe path: %v", err) } // Check the permissions of the runsc binary and print an error if it // doesn't match expectations. info, err := os.Stat(exePath) if err != nil { return fmt.Errorf("stat file: %v", err) } if info.Mode().Perm()&neededBits != neededBits { return fmt.Errorf(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath))) } return nil } // CgroupsReadControlFile reads a single cgroupfs control file in the sandbox. func (s *Sandbox) CgroupsReadControlFile(file control.CgroupControlFile) (string, error) { log.Debugf("CgroupsReadControlFiles sandbox %q", s.ID) args := control.CgroupsReadArgs{ Args: []control.CgroupsReadArg{ { File: file, }, }, } var out control.CgroupsResults if err := s.call(boot.CgroupsReadControlFiles, &args, &out); err != nil { return "", err } if len(out.Results) != 1 { return "", fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out) } return out.Results[0].Unpack() } // CgroupsWriteControlFile writes a single cgroupfs control file in the sandbox. 
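// The value is written verbatim to the named control file inside the sandbox's own (sentry-emulated) cgroupfs, not to the host's cgroup hierarchy.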
func (s *Sandbox) CgroupsWriteControlFile(file control.CgroupControlFile, value string) error { log.Debugf("CgroupsReadControlFiles sandbox %q", s.ID) args := control.CgroupsWriteArgs{ Args: []control.CgroupsWriteArg{ { File: file, Value: value, }, }, } var out control.CgroupsResults if err := s.call(boot.CgroupsWriteControlFiles, &args, &out); err != nil { return err } if len(out.Results) != 1 { return fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out) } return out.Results[0].AsError() } // fixPidns looks at the PID namespace path. If that path corresponds to the // sandbox process PID namespace, then change the spec so that the container // joins the sandbox root namespace. func (s *Sandbox) fixPidns(spec *specs.Spec) { pidns, ok := specutils.GetNS(specs.PIDNamespace, spec) if !ok { // pidns was not set, nothing to fix. return } if pidns.Path != fmt.Sprintf("/proc/%d/ns/pid", s.Pid.load()) { // Fix only if the PID namespace corresponds to the sandbox's. return } for i := range spec.Linux.Namespaces { if spec.Linux.Namespaces[i].Type == specs.PIDNamespace { // Removing the namespace makes the container join the sandbox root // namespace. log.Infof("Fixing PID namespace in spec from %q to make the container join the sandbox root namespace", pidns.Path) spec.Linux.Namespaces = append(spec.Linux.Namespaces[:i], spec.Linux.Namespaces[i+1:]...) return } } panic("unreachable") } // ConfigureCmdForRootless configures cmd to donate a socket FD that can be // used to synchronize userns configuration. func ConfigureCmdForRootless(cmd *exec.Cmd, donations *donation.Agency) (*os.File, error) { fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil { return nil, err } f := os.NewFile(uintptr(fds[1]), "userns sync other FD") donations.DonateAndClose("sync-userns-fd", f) if cmd.SysProcAttr == nil { cmd.SysProcAttr = &unix.SysProcAttr{} } cmd.SysProcAttr.AmbientCaps = []uintptr{ // Same as `cap` in cmd/gofer.go. unix.CAP_CHOWN, unix.CAP_DAC_OVERRIDE, unix.CAP_DAC_READ_SEARCH, unix.CAP_FOWNER, unix.CAP_FSETID, unix.CAP_SYS_CHROOT, // Needed for setuid(2)/setgid(2). unix.CAP_SETUID, unix.CAP_SETGID, // Needed for chroot. unix.CAP_SYS_ADMIN, // Needed to be able to clear bounding set (PR_CAPBSET_DROP). unix.CAP_SETPCAP, } return os.NewFile(uintptr(fds[0]), "userns sync FD"), nil } // SetUserMappings uses newuidmap/newgidmap programs to set up user ID mappings // for process pid. 
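// The newuidmap/newgidmap helpers are used, rather than writing /proc/<pid>/uid_map and gid_map directly, because installing multiple mappings from an unprivileged process requires privileges in the target namespace's parent; the setuid helpers validate the requested ranges against /etc/subuid and /etc/subgid (see user_namespaces(7)).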
func SetUserMappings(spec *specs.Spec, pid int) error { log.Debugf("Setting user mappings") args := []string{strconv.Itoa(pid)} for _, idMap := range spec.Linux.UIDMappings { log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) args = append(args, strconv.Itoa(int(idMap.ContainerID)), strconv.Itoa(int(idMap.HostID)), strconv.Itoa(int(idMap.Size)), ) } out, err := exec.Command("newuidmap", args...).CombinedOutput() log.Debugf("newuidmap: %#v\n%s", args, out) if err != nil { return fmt.Errorf("newuidmap failed: %w", err) } args = []string{strconv.Itoa(pid)} for _, idMap := range spec.Linux.GIDMappings { log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) args = append(args, strconv.Itoa(int(idMap.ContainerID)), strconv.Itoa(int(idMap.HostID)), strconv.Itoa(int(idMap.Size)), ) } out, err = exec.Command("newgidmap", args...).CombinedOutput() log.Debugf("newgidmap: %#v\n%s", args, out) if err != nil { return fmt.Errorf("newgidmap failed: %w", err) } return nil } // Mount mounts a filesystem in a container. func (s *Sandbox) Mount(cid, fstype, src, dest string) error { var files []*os.File switch fstype { case erofs.Name: if imageFile, err := os.Open(src); err != nil { return fmt.Errorf("opening %s: %v", src, err) } else { files = append(files, imageFile) } default: return fmt.Errorf("unsupported filesystem type: %v", fstype) } args := boot.MountArgs{ ContainerID: cid, Source: src, Destination: dest, FsType: fstype, FilePayload: urpc.FilePayload{Files: files}, } return s.call(boot.ContMgrMount, &args, nil) } // ContainerRuntimeState returns the runtime state of a container. func (s *Sandbox) ContainerRuntimeState(cid string) (boot.ContainerRuntimeState, error) { log.Debugf("ContainerRuntimeState, sandbox: %q, cid: %q", s.ID, cid) var state boot.ContainerRuntimeState if err := s.call(boot.ContMgrContainerRuntimeState, &cid, &state); err != nil { return boot.RuntimeStateInvalid, fmt.Errorf("getting container state (CID: %q): %w", cid, err) } log.Debugf("ContainerRuntimeState, sandbox: %q, cid: %q, state: %v", s.ID, cid, state) return state, nil } func setCloExeOnAllFDs() error { f, err := os.Open("/proc/self/fd") if err != nil { return fmt.Errorf("failed to open /proc/self/fd: %w", err) } defer f.Close() for { dents, err := f.Readdirnames(256) if err == io.EOF { break } else if err != nil { return fmt.Errorf("failed to read /proc/self/fd: %w", err) } for _, dent := range dents { fd, err := strconv.Atoi(dent) if err != nil { return fmt.Errorf("failed to convert /proc/self/fd entry %q to int: %w", dent, err) } if fd == int(f.Fd()) { continue } flags, _, errno := unix.RawSyscall(unix.SYS_FCNTL, uintptr(fd), unix.F_GETFD, 0) if errno != 0 { return fmt.Errorf("error getting descriptor flags: %w", errno) } if flags&unix.FD_CLOEXEC != 0 { continue } flags |= unix.FD_CLOEXEC if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, uintptr(fd), unix.F_SETFD, flags); errno != 0 { return fmt.Errorf("error setting CLOEXEC: %w", errno) } } } return nil } var setCloseExecOnce sync.Once // SetCloExeOnAllFDs sets CLOEXEC on all FDs in /proc/self/fd. This avoids // leaking inherited FDs from the parent (caller) to subprocesses created. func SetCloExeOnAllFDs() (retErr error) { // Sufficient to do this only once per runsc invocation. Avoid double work. 
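// Because sync.Once runs the function at most once, later callers observe a nil retErr even if the first attempt returned an error.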
setCloseExecOnce.Do(func() { retErr = setCloExeOnAllFDs() }) return } golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/sandbox_impl.go000066400000000000000000000015161465435605700244440ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false package sandbox import ( "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/donation" ) func createSandboxProcessExtra(conf *config.Config, args *Args, donations *donation.Agency) error { return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/sandbox_impl_state_autogen.go000066400000000000000000000001351465435605700273620ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package sandbox golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/sandbox_state_autogen.go000066400000000000000000000000711465435605700263400ustar00rootroot00000000000000// automatically generated by stateify. package sandbox golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/sandbox_unsafe_state_autogen.go000066400000000000000000000000711465435605700277010ustar00rootroot00000000000000// automatically generated by stateify. package sandbox golang-gvisor-gvisor-0.0~20240729.0/runsc/sandbox/xdp.go000066400000000000000000000460311465435605700225610ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sandbox import ( "bytes" "fmt" "net" "os" "strings" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "github.com/vishvananda/netlink" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/pkg/xdp" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/sandbox/bpf" xdpcmd "gvisor.dev/gvisor/tools/xdp/cmd" ) // createRedirectInterfacesAndRoutes initializes the network using an AF_XDP // socket on a *host* device, not a device in the container netns. It: // // - scrapes the address, interface, and routes of the device and recreates // them in the sandbox // - does *not* remove them from the host device // - creates an AF_XDP socket bound to the device // // In effect, this takes over the host device for the duration of the sentry's // lifetime. This also means only one container can run at a time, as it // monopolizes the device. // // TODO(b/240191988): Enbable device sharing via XDP_SHARED_UMEM. // TODO(b/240191988): IPv6 support. 
// TODO(b/240191988): Merge redundant code with CreateLinksAndRoutes once // features are finalized. func createRedirectInterfacesAndRoutes(conn *urpc.Client, conf *config.Config) error { args, iface, err := prepareRedirectInterfaceArgs(boot.BindRunsc, conf) if err != nil { return fmt.Errorf("failed to generate redirect interface args: %w", err) } // Create an XDP socket. The sentry will mmap the rings. xdpSockFD, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) if err != nil { return fmt.Errorf("unable to create AF_XDP socket: %w", err) } xdpSock := os.NewFile(uintptr(xdpSockFD), "xdp-sock-fd") // Dup to ensure os.File doesn't close it prematurely. if _, err := unix.Dup(xdpSockFD); err != nil { return fmt.Errorf("failed to dup XDP sock: %w", err) } args.FilePayload.Files = append(args.FilePayload.Files, xdpSock) if err := pcapAndNAT(&args, conf); err != nil { return err } log.Infof("Setting up network, config: %+v", args) if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { return fmt.Errorf("creating links and routes: %w", err) } // Insert socket into eBPF map. Note that sockets are automatically // removed from eBPF maps when released. See net/xdp/xsk.c:xsk_release // and net/xdp/xsk.c:xsk_delete_from_maps. mapPath := xdpcmd.RedirectMapPath(iface.Name) pinnedMap, err := ebpf.LoadPinnedMap(mapPath, nil) if err != nil { return fmt.Errorf("failed to load pinned map %s: %w", mapPath, err) } // TODO(b/240191988): Updating of pinned maps should be sychronized and // check for the existence of the key. mapKey := uint32(0) mapVal := uint32(xdpSockFD) if err := pinnedMap.Update(&mapKey, &mapVal, ebpf.UpdateAny); err != nil { return fmt.Errorf("failed to insert socket into map %s: %w", mapPath, err) } // Bind to the device. // TODO(b/240191988): We can't assume there's only one queue, but this // appears to be the case on gVNIC instances. if err := xdp.Bind(xdpSockFD, uint32(iface.Index), 0 /* queueID */, conf.AFXDPUseNeedWakeup); err != nil { return fmt.Errorf("failed to bind to interface %q: %v", iface.Name, err) } return nil } // Collect addresses, routes, and neighbors from the interfaces. We only // process two interfaces: the loopback and the interface we've been told to // bind to. This all takes place in the netns where the runsc binary is run, // *not* the netns passed to the container. func prepareRedirectInterfaceArgs(bind boot.BindOpt, conf *config.Config) (boot.CreateLinksAndRoutesArgs, net.Interface, error) { ifaces, err := net.Interfaces() if err != nil { return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("querying interfaces: %w", err) } args := boot.CreateLinksAndRoutesArgs{ DisconnectOk: conf.NetDisconnectOk, } var netIface net.Interface for _, iface := range ifaces { if iface.Flags&net.FlagUp == 0 { log.Infof("Skipping down interface: %+v", iface) continue } allAddrs, err := iface.Addrs() if err != nil { return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err) } // We build our own loopback device. 
if iface.Flags&net.FlagLoopback != 0 { link, err := loopbackLink(conf, iface, allAddrs) if err != nil { return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err) } args.LoopbackLinks = append(args.LoopbackLinks, link) continue } if iface.Name != conf.XDP.IfaceName { log.Infof("Skipping interface %q", iface.Name) continue } var ipAddrs []*net.IPNet for _, ifaddr := range allAddrs { ipNet, ok := ifaddr.(*net.IPNet) if !ok { return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("address is not IPNet: %+v", ifaddr) } if ipNet.IP.To4() == nil { log.Infof("Skipping non-IPv4 address %s", ipNet.IP) continue } ipAddrs = append(ipAddrs, ipNet) } if len(ipAddrs) != 1 { return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("we only handle a single IPv4 address, but interface %q has %d: %v", iface.Name, len(ipAddrs), ipAddrs) } prefix, _ := ipAddrs[0].Mask.Size() addr := boot.IPWithPrefix{Address: ipAddrs[0].IP, PrefixLen: prefix} // Collect data from the ARP table. dump, err := netlink.NeighList(iface.Index, 0) if err != nil { return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("fetching ARP table for %q: %w", iface.Name, err) } var neighbors []boot.Neighbor for _, n := range dump { // There are only two "good" states NUD_PERMANENT and NUD_REACHABLE, // but NUD_REACHABLE is fully dynamic and will be re-probed anyway. if n.State == netlink.NUD_PERMANENT { log.Debugf("Copying a static ARP entry: %+v %+v", n.IP, n.HardwareAddr) // No flags are copied because Stack.AddStaticNeighbor does not support flags right now. neighbors = append(neighbors, boot.Neighbor{IP: n.IP, HardwareAddr: n.HardwareAddr}) } } // Scrape routes. routes, defv4, defv6, err := routesForIface(iface) if err != nil { return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) } if defv4 != nil { if !args.Defaultv4Gateway.Route.Empty() { return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway) } args.Defaultv4Gateway.Route = *defv4 args.Defaultv4Gateway.Name = iface.Name } if defv6 != nil { if !args.Defaultv6Gateway.Route.Empty() { return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway) } args.Defaultv6Gateway.Route = *defv6 args.Defaultv6Gateway.Name = iface.Name } // Get the link address of the interface. 
ifaceLink, err := netlink.LinkByName(iface.Name) if err != nil { return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("getting link for interface %q: %w", iface.Name, err) } linkAddress := ifaceLink.Attrs().HardwareAddr xdplink := boot.XDPLink{ Name: iface.Name, InterfaceIndex: iface.Index, Routes: routes, TXChecksumOffload: conf.TXChecksumOffload, RXChecksumOffload: conf.RXChecksumOffload, NumChannels: conf.NumNetworkChannels, QDisc: conf.QDisc, Neighbors: neighbors, LinkAddress: linkAddress, Addresses: []boot.IPWithPrefix{addr}, GVisorGRO: conf.GVisorGRO, Bind: bind, } args.XDPLinks = append(args.XDPLinks, xdplink) netIface = iface } if len(args.XDPLinks) != 1 { return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, fmt.Errorf("expected 1 XDP link, but found %d", len(args.XDPLinks)) } return args, netIface, nil } func createSocketXDP(iface net.Interface) ([]*os.File, error) { // Create an XDP socket. The sentry will mmap memory for the various // rings and bind to the device. fd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) if err != nil { return nil, fmt.Errorf("unable to create AF_XDP socket: %v", err) } // We also need to, before dropping privileges, attach a program to the // device and insert our socket into its map. // Load into the kernel. spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.AFXDPProgram)) if err != nil { return nil, fmt.Errorf("failed to load spec: %v", err) } var objects struct { Program *ebpf.Program `ebpf:"xdp_prog"` SockMap *ebpf.Map `ebpf:"sock_map"` } if err := spec.LoadAndAssign(&objects, nil); err != nil { return nil, fmt.Errorf("failed to load program: %v", err) } rawLink, err := link.AttachRawLink(link.RawLinkOptions{ Program: objects.Program, Attach: ebpf.AttachXDP, Target: iface.Index, // By not setting the Flag field, the kernel will choose the // fastest mode. In order those are: // - Offloaded onto the NIC. // - Running directly in the driver. // - Generic mode, which works with any NIC/driver but lacks // much of the XDP performance boost. }) if err != nil { return nil, fmt.Errorf("failed to attach BPF program: %v", err) } // Insert our AF_XDP socket into the BPF map that dictates where // packets are redirected to. // TODO(b/240191988): Updating of pinned maps should be sychronized and // check for the existence of the key. key := uint32(0) val := uint32(fd) if err := objects.SockMap.Update(&key, &val, 0 /* flags */); err != nil { return nil, fmt.Errorf("failed to insert socket into BPF map: %v", err) } // We need to keep the Program, SockMap, and link FDs open until they // can be passed to the sandbox process. progFD, err := unix.Dup(objects.Program.FD()) if err != nil { return nil, fmt.Errorf("failed to dup BPF program: %v", err) } sockMapFD, err := unix.Dup(objects.SockMap.FD()) if err != nil { return nil, fmt.Errorf("failed to dup BPF map: %v", err) } linkFD, err := unix.Dup(rawLink.FD()) if err != nil { return nil, fmt.Errorf("failed to dup BPF link: %v", err) } return []*os.File{ os.NewFile(uintptr(fd), "xdp-fd"), // The socket. os.NewFile(uintptr(progFD), "program-fd"), // The XDP program. os.NewFile(uintptr(sockMapFD), "sockmap-fd"), // The XDP map. os.NewFile(uintptr(linkFD), "link-fd"), // The XDP link. }, nil } // TODO(b/240191988): Merge redundant code with CreateLinksAndRoutes once // features are finalized. // TODO(b/240191988): Cleanup / GC of pinned BPF objects. func createXDPTunnel(conn *urpc.Client, nsPath string, conf *config.Config) error { // Get the setup for the sentry nic. 
We need the host neighbors and routes. args, hostIface, err := prepareRedirectInterfaceArgs(boot.BindSentry, conf) if err != nil { return fmt.Errorf("failed to generate tunnel interface args: %w", err) } // Setup the XDP socket on the gVisor nic. files, err := func() ([]*os.File, error) { // Join the network namespace that we will be copying. restore, err := joinNetNS(nsPath) if err != nil { return nil, err } defer restore() // Create an XDP socket. The sentry will mmap memory for the various // rings and bind to the device. fd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) if err != nil { return nil, fmt.Errorf("unable to create AF_XDP socket: %v", err) } // We also need to, before dropping privileges, attach a program to the // device and insert our socket into its map. // Load into the kernel. spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.AFXDPProgram)) if err != nil { return nil, fmt.Errorf("failed to load spec: %v", err) } var objects struct { Program *ebpf.Program `ebpf:"xdp_prog"` SockMap *ebpf.Map `ebpf:"sock_map"` } if err := spec.LoadAndAssign(&objects, nil); err != nil { return nil, fmt.Errorf("failed to load program: %v", err) } // We assume there are two interfaces in the netns: a loopback and veth. ifaces, err := net.Interfaces() if err != nil { return nil, fmt.Errorf("querying interfaces in ns: %w", err) } var iface *net.Interface for _, netIface := range ifaces { if netIface.Flags&net.FlagLoopback == 0 { iface = &netIface break } } if iface == nil { return nil, fmt.Errorf("unable to find non-loopback interface in the ns") } args.XDPLinks[0].InterfaceIndex = iface.Index rawLink, err := link.AttachRawLink(link.RawLinkOptions{ Program: objects.Program, Attach: ebpf.AttachXDP, Target: iface.Index, // By not setting the Flag field, the kernel will choose the // fastest mode. In order those are: // - Offloaded onto the NIC. // - Running directly in the driver. // - Generic mode, which works with any NIC/driver but lacks // much of the XDP performance boost. }) if err != nil { return nil, fmt.Errorf("failed to attach BPF program to interface %q: %v", iface.Name, err) } // Insert our AF_XDP socket into the BPF map that dictates where // packets are redirected to. // TODO(b/240191988): Updating of pinned maps should be // sychronized and check for the existence of the key. key := uint32(0) val := uint32(fd) if err := objects.SockMap.Update(&key, &val, 0 /* flags */); err != nil { return nil, fmt.Errorf("failed to insert socket into BPF map: %v", err) } // We need to keep the Program, SockMap, and link FDs open until they // can be passed to the sandbox process. progFD, err := unix.Dup(objects.Program.FD()) if err != nil { return nil, fmt.Errorf("failed to dup BPF program: %v", err) } sockMapFD, err := unix.Dup(objects.SockMap.FD()) if err != nil { return nil, fmt.Errorf("failed to dup BPF map: %v", err) } linkFD, err := unix.Dup(rawLink.FD()) if err != nil { return nil, fmt.Errorf("failed to dup BPF link: %v", err) } return []*os.File{ os.NewFile(uintptr(fd), "xdp-fd"), // The socket. os.NewFile(uintptr(progFD), "program-fd"), // The XDP program. os.NewFile(uintptr(sockMapFD), "sockmap-fd"), // The XDP map. os.NewFile(uintptr(linkFD), "link-fd"), // The XDP link. }, nil }() if err != nil { return fmt.Errorf("failed to create AF_XDP socket for container: %w", err) } args.FilePayload.Files = append(args.FilePayload.Files, files...) // We're back in the parent netns. Get all interfaces. 
ifaces, err := net.Interfaces() if err != nil { return fmt.Errorf("querying interfaces: %w", err) } // TODO(b/240191988): Find a better way to identify the other end of the veth. var vethIface *net.Interface for _, iface := range ifaces { if strings.HasPrefix(iface.Name, "veth") { vethIface = &iface break } } if vethIface == nil { return fmt.Errorf("unable to find veth interface") } // Insert veth into host eBPF map. hostMapPath := xdpcmd.TunnelHostMapPath(hostIface.Name) pinnedHostMap, err := ebpf.LoadPinnedMap(hostMapPath, nil) if err != nil { return fmt.Errorf("failed to load pinned host map %s: %w", hostMapPath, err) } // TODO(b/240191988): Updating of pinned maps should be sychronized and // check for the existence of the key. mapKey := uint32(0) mapVal := uint32(vethIface.Index) if err := pinnedHostMap.Update(&mapKey, &mapVal, ebpf.UpdateAny); err != nil { return fmt.Errorf("failed to insert veth into host map %s: %w", hostMapPath, err) } // Attach a program to the veth. spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.TunnelVethProgram)) if err != nil { return fmt.Errorf("failed to load spec: %v", err) } var objects struct { Program *ebpf.Program `ebpf:"xdp_veth_prog"` DevMap *ebpf.Map `ebpf:"dev_map"` } if err := spec.LoadAndAssign(&objects, nil); err != nil { return fmt.Errorf("failed to load program: %v", err) } defer func() { if err := objects.Program.Close(); err != nil { log.Infof("failed to close program: %v", err) } if err := objects.DevMap.Close(); err != nil { log.Infof("failed to close sock map: %v", err) } }() attached, err := link.AttachXDP(link.XDPOptions{ Program: objects.Program, Interface: vethIface.Index, // By not setting the Flag field, the kernel will choose the // fastest mode. In order those are: // - Offloaded onto the NIC. // - Running directly in the driver. // - Generic mode, which works with any NIC/driver but lacks // much of the XDP performance boost. }) if err != nil { return fmt.Errorf("failed to attach: %w", err) } var ( vethPinDir = xdpcmd.RedirectPinDir(vethIface.Name) vethMapPath = xdpcmd.TunnelVethMapPath(vethIface.Name) vethProgramPath = xdpcmd.TunnelVethProgramPath(vethIface.Name) vethLinkPath = xdpcmd.TunnelVethLinkPath(vethIface.Name) ) // Create directory /sys/fs/bpf//. if err := os.Mkdir(vethPinDir, 0700); err != nil && !os.IsExist(err) { return fmt.Errorf("failed to create directory for pinning at %s: %v", vethPinDir, err) } // Pin the map at /sys/fs/bpf//tunnel_host_map. if err := objects.DevMap.Pin(vethMapPath); err != nil { return fmt.Errorf("failed to pin map at %s", vethMapPath) } log.Infof("Pinned map at %s", vethMapPath) // Pin the program at /sys/fs/bpf//tunnel_host_program. if err := objects.Program.Pin(vethProgramPath); err != nil { return fmt.Errorf("failed to pin program at %s", vethProgramPath) } log.Infof("Pinned program at %s", vethProgramPath) // Make everything persistent by pinning the link. Otherwise, the XDP // program would detach when this process exits. if err := attached.Pin(vethLinkPath); err != nil { return fmt.Errorf("failed to pin link at %s", vethLinkPath) } log.Infof("Pinned link at %s", vethLinkPath) // Insert host into veth eBPF map. // TODO(b/240191988): We should be able to use the existing map instead // of opening a pinned copy. 
pinnedVethMap, err := ebpf.LoadPinnedMap(vethMapPath, nil) if err != nil { return fmt.Errorf("failed to load pinned veth map %s: %w", vethMapPath, err) } // TODO(b/240191988): Updating of pinned maps should be sychronized and // check for the existence of the key. mapKey = uint32(0) mapVal = uint32(hostIface.Index) if err := pinnedVethMap.Update(&mapKey, &mapVal, ebpf.UpdateAny); err != nil { return fmt.Errorf("failed to insert host into veth map %s: %w", vethMapPath, err) } if err := pcapAndNAT(&args, conf); err != nil { return err } log.Debugf("Setting up network, config: %+v", args) if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { return fmt.Errorf("creating links and routes: %w", err) } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/000077500000000000000000000000001465435605700220105ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/cri.go000066400000000000000000000100761465435605700231200ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package specutils import ( specs "github.com/opencontainers/runtime-spec/specs-go" ) const ( // ContainerdContainerTypeAnnotation is the OCI annotation set by // containerd to indicate whether the container to create should have // its own sandbox or a container within an existing sandbox. ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type" // ContainerdContainerTypeContainer is the container type value // indicating the container should be created in an existing sandbox. ContainerdContainerTypeContainer = "container" // ContainerdContainerTypeSandbox is the container type value // indicating the container should be created in a new sandbox. ContainerdContainerTypeSandbox = "sandbox" // ContainerdSandboxIDAnnotation is the OCI annotation set to indicate // which sandbox the container should be created in when the container // is not the first container in the sandbox. ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id" // CRIOContainerTypeAnnotation is the OCI annotation set by // CRI-O to indicate whether the container to create should have // its own sandbox or a container within an existing sandbox. CRIOContainerTypeAnnotation = "io.kubernetes.cri-o.ContainerType" // CRIOContainerTypeContainer is the container type value // indicating the container should be created in an existing sandbox. CRIOContainerTypeContainer = "container" // CRIOContainerTypeSandbox is the container type value // indicating the container should be created in a new sandbox. CRIOContainerTypeSandbox = "sandbox" // CRIOSandboxIDAnnotation is the OCI annotation set to indicate // which sandbox the container should be created in when the container // is not the first container in the sandbox. CRIOSandboxIDAnnotation = "io.kubernetes.cri-o.SandboxID" ) // ContainerType represents the type of container requested by the calling container manager. 
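// As an illustration (hypothetical annotation value): a spec created by
// containerd carrying
//
//	"io.kubernetes.cri.container-type": "sandbox"
//
// is classified by SpecContainerType below as ContainerTypeSandbox, the value
// "container" yields ContainerTypeContainer, and any other value yields
// ContainerTypeUnknown.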
type ContainerType int const ( // ContainerTypeUnspecified indicates that no known container type // annotation was found in the spec. ContainerTypeUnspecified ContainerType = iota // ContainerTypeUnknown indicates that a container type was specified // but is unknown to us. ContainerTypeUnknown // ContainerTypeSandbox indicates that the container should be run in a // new sandbox. ContainerTypeSandbox // ContainerTypeContainer indicates that the container should be run in // an existing sandbox. ContainerTypeContainer ) // SpecContainerType tries to determine the type of container specified by the // container manager using well-known container annotations. func SpecContainerType(spec *specs.Spec) ContainerType { if t, ok := spec.Annotations[ContainerdContainerTypeAnnotation]; ok { switch t { case ContainerdContainerTypeSandbox: return ContainerTypeSandbox case ContainerdContainerTypeContainer: return ContainerTypeContainer default: return ContainerTypeUnknown } } if t, ok := spec.Annotations[CRIOContainerTypeAnnotation]; ok { switch t { case CRIOContainerTypeSandbox: return ContainerTypeSandbox case CRIOContainerTypeContainer: return ContainerTypeContainer default: return ContainerTypeUnknown } } return ContainerTypeUnspecified } // SandboxID returns the ID of the sandbox to join and whether an ID was found // in the spec. func SandboxID(spec *specs.Spec) (string, bool) { if id, ok := spec.Annotations[ContainerdSandboxIDAnnotation]; ok { return id, true } if id, ok := spec.Annotations[CRIOSandboxIDAnnotation]; ok { return id, true } return "", false } golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/fs.go000066400000000000000000000137231465435605700227550ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package specutils import ( "fmt" "math/bits" "path" "slices" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" ) type mapping struct { set bool val uint32 } // optionsMap maps mount propagation-related OCI filesystem options to mount(2) // syscall flags. 
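// These mappings are consumed by OptionsToFlags below. A minimal sketch of
// the expected behavior (hypothetical option list):
//
//	flags := OptionsToFlags([]string{"ro", "nosuid", "noexec"})
//	// flags == unix.MS_RDONLY|unix.MS_NOSUID|unix.MS_NOEXEC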
var optionsMap = map[string]mapping{ "acl": {set: true, val: unix.MS_POSIXACL}, "async": {set: false, val: unix.MS_SYNCHRONOUS}, "atime": {set: false, val: unix.MS_NOATIME}, "bind": {set: true, val: unix.MS_BIND}, "defaults": {set: true, val: 0}, "dev": {set: false, val: unix.MS_NODEV}, "diratime": {set: false, val: unix.MS_NODIRATIME}, "dirsync": {set: true, val: unix.MS_DIRSYNC}, "exec": {set: false, val: unix.MS_NOEXEC}, "noexec": {set: true, val: unix.MS_NOEXEC}, "iversion": {set: true, val: unix.MS_I_VERSION}, "loud": {set: false, val: unix.MS_SILENT}, "mand": {set: true, val: unix.MS_MANDLOCK}, "noacl": {set: false, val: unix.MS_POSIXACL}, "noatime": {set: true, val: unix.MS_NOATIME}, "nodev": {set: true, val: unix.MS_NODEV}, "nodiratime": {set: true, val: unix.MS_NODIRATIME}, "noiversion": {set: false, val: unix.MS_I_VERSION}, "nomand": {set: false, val: unix.MS_MANDLOCK}, "norelatime": {set: false, val: unix.MS_RELATIME}, "nostrictatime": {set: false, val: unix.MS_STRICTATIME}, "nosuid": {set: true, val: unix.MS_NOSUID}, "rbind": {set: true, val: unix.MS_BIND | unix.MS_REC}, "relatime": {set: true, val: unix.MS_RELATIME}, "remount": {set: true, val: unix.MS_REMOUNT}, "ro": {set: true, val: unix.MS_RDONLY}, "rw": {set: false, val: unix.MS_RDONLY}, "silent": {set: true, val: unix.MS_SILENT}, "strictatime": {set: true, val: unix.MS_STRICTATIME}, "suid": {set: false, val: unix.MS_NOSUID}, "sync": {set: true, val: unix.MS_SYNCHRONOUS}, } // propOptionsMap is similar to optionsMap, but it lists propagation options // that cannot be used together with other flags. var propOptionsMap = map[string]mapping{ "private": {set: true, val: unix.MS_PRIVATE}, "rprivate": {set: true, val: unix.MS_PRIVATE | unix.MS_REC}, "slave": {set: true, val: unix.MS_SLAVE}, "rslave": {set: true, val: unix.MS_SLAVE | unix.MS_REC}, "unbindable": {set: true, val: unix.MS_UNBINDABLE}, "runbindable": {set: true, val: unix.MS_UNBINDABLE | unix.MS_REC}, } // invalidOptions list options not allowed. // - shared: sandbox must be isolated from the host. Propagating mount changes // from the sandbox to the host breaks the isolation. The sandbox's mount // table is maintained in sentry memory. Mount operations from the application // are not propagated to the host. var invalidOptions = []string{"shared", "rshared"} // OptionsToFlags converts mount options to syscall flags. func OptionsToFlags(opts []string) uint32 { return optionsToFlags(opts, optionsMap) } // PropOptionsToFlags converts propagation mount options to syscall flags. // Propagation options cannot be set other with other options and must be // handled separately. func PropOptionsToFlags(opts []string) uint32 { return optionsToFlags(opts, propOptionsMap) } func optionsToFlags(opts []string, source map[string]mapping) uint32 { var rv uint32 for _, opt := range opts { if m, ok := source[opt]; ok { if m.set { rv |= m.val } else { rv ^= m.val } } } return rv } // IsReadonlyMount returns true if the mount options has read only option. func IsReadonlyMount(opts []string) bool { for _, o := range opts { if o == "ro" { return true } } return false } // validateMount validates that spec mounts are correct. func validateMount(mnt *specs.Mount) error { if !path.IsAbs(mnt.Destination) { return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt) } if IsGoferMount(*mnt) { return ValidateMountOptions(mnt.Options) } return nil } // FilterMountOptions filters out all invalid mount options. 
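// For example (hypothetical option list), unsupported options such as
// "shared" are dropped with a warning while valid ones are kept:
//
//	FilterMountOptions([]string{"ro", "shared", "rprivate"})
//	// returns []string{"ro", "rprivate"}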
func FilterMountOptions(opts []string) []string { out := make([]string, 0, len(opts)) for _, o := range opts { if err := validateMountOption(o); err == nil { out = append(out, o) } else { log.Warningf("mount option skipped %q: %v", o, err) } } return out } // ValidateMountOptions validates that mount options are correct. func ValidateMountOptions(opts []string) error { for _, o := range opts { if err := validateMountOption(o); err != nil { return err } } return nil } func validateMountOption(o string) error { if slices.Contains(invalidOptions, o) { return fmt.Errorf("mount option %q is not supported", o) } return validatePropagation(o) } // ValidateRootfsPropagation validates that rootfs propagation options are // correct. func validateRootfsPropagation(opt string) error { flags := PropOptionsToFlags([]string{opt}) if flags&(unix.MS_SLAVE|unix.MS_PRIVATE) == 0 { return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt) } return validatePropagation(opt) } func validatePropagation(opt string) error { flags := PropOptionsToFlags([]string{opt}) exclusive := flags & (unix.MS_SLAVE | unix.MS_PRIVATE | unix.MS_SHARED | unix.MS_UNBINDABLE) if bits.OnesCount32(exclusive) > 1 { return fmt.Errorf("mount propagation options are mutually exclusive: %q", opt) } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/namespace.go000066400000000000000000000223571465435605700243040ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package specutils import ( "fmt" "os" "os/exec" "os/signal" "path/filepath" "runtime" "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/syndtr/gocapability/capability" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" ) // nsCloneFlag returns the clone flag that can be used to set a namespace of // the given type. func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr { switch nst { case specs.IPCNamespace: return unix.CLONE_NEWIPC case specs.MountNamespace: return unix.CLONE_NEWNS case specs.NetworkNamespace: return unix.CLONE_NEWNET case specs.PIDNamespace: return unix.CLONE_NEWPID case specs.UTSNamespace: return unix.CLONE_NEWUTS case specs.UserNamespace: return unix.CLONE_NEWUSER case specs.CgroupNamespace: return unix.CLONE_NEWCGROUP default: panic(fmt.Sprintf("unknown namespace %v", nst)) } } // nsPath returns the path of the namespace for the current process and the // given namespace. 
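// For example, nsPath(specs.NetworkNamespace) returns "/proc/self/ns/net",
// and nsPath(specs.PIDNamespace) returns "/proc/self/ns/pid".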
func nsPath(nst specs.LinuxNamespaceType) string { base := "/proc/self/ns" switch nst { case specs.CgroupNamespace: return filepath.Join(base, "cgroup") case specs.IPCNamespace: return filepath.Join(base, "ipc") case specs.MountNamespace: return filepath.Join(base, "mnt") case specs.NetworkNamespace: return filepath.Join(base, "net") case specs.PIDNamespace: return filepath.Join(base, "pid") case specs.UserNamespace: return filepath.Join(base, "user") case specs.UTSNamespace: return filepath.Join(base, "uts") default: panic(fmt.Sprintf("unknown namespace %v", nst)) } } // GetNS returns true and the namespace with the given type from the slice of // namespaces in the spec. It returns false if the slice does not contain a // namespace with the type. func GetNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { if s.Linux == nil { return specs.LinuxNamespace{}, false } for _, ns := range s.Linux.Namespaces { if ns.Type == nst { return ns, true } } return specs.LinuxNamespace{}, false } // setNS sets the namespace of the given type. It must be called with // OSThreadLocked. func setNS(fd, nsType uintptr) error { if _, _, err := unix.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { return err } return nil } // ApplyNS applies the namespace on the current thread and returns a function // that will restore the namespace to the original value. // // Preconditions: Must be called with os thread locked. func ApplyNS(ns specs.LinuxNamespace) (func() error, error) { log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path) newNS, err := os.Open(ns.Path) if err != nil { return nil, fmt.Errorf("error opening %q: %v", ns.Path, err) } defer newNS.Close() // Store current namespace to restore back. curPath := nsPath(ns.Type) oldNS, err := os.Open(curPath) if err != nil { return nil, fmt.Errorf("error opening %q: %v", curPath, err) } // Set namespace to the one requested and setup function to restore it back. flag := nsCloneFlag(ns.Type) if err := setNS(newNS.Fd(), flag); err != nil { oldNS.Close() return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) } return func() error { log.Infof("Restoring namespace %v", ns.Type) defer oldNS.Close() if err := setNS(oldNS.Fd(), flag); err != nil { return fmt.Errorf("error restoring namespace: of type %v: %v", ns.Type, err) } return nil }, nil } // StartInNS joins or creates the given namespaces and calls cmd.Start before // restoring the namespaces to the original values. func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { errChan := make(chan error) go func() { runtime.LockOSThread() defer runtime.UnlockOSThread() rstFuncs, err := startInNS(cmd, nss) errChan <- err for _, rstFunc := range rstFuncs { err := rstFunc() if err == nil { continue } // One or more namespaces have not been restored, but // we can't destroy the current system thread, because // a child process is execited with Pdeathsig. log.Debugf("Block the current system thread due to: %s", err) c := make(chan any) <-c } }() return <-errChan } func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) ([]func() error, error) { if cmd.SysProcAttr == nil { cmd.SysProcAttr = &unix.SysProcAttr{} } var deferFuncs []func() error for _, ns := range nss { if ns.Path == "" { // No path. Just set a flag to create a new namespace. cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type) continue } // Join the given namespace, and restore the current namespace // before exiting. 
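// As a rough sketch of what ApplyNS does here for an entry such as
// (hypothetical path)
//
//	specs.LinuxNamespace{Type: specs.NetworkNamespace, Path: "/proc/1234/ns/net"}
//
// it opens that path, setns(2)s the current thread into it, and returns a
// closure that rejoins the namespace previously recorded from
// /proc/self/ns/net.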
restoreNS, err := ApplyNS(ns) if err != nil { return deferFuncs, err } deferFuncs = append(deferFuncs, restoreNS) } err := cmd.Start() if err != nil && cmd.SysProcAttr.Cloneflags&unix.CLONE_NEWUSER != 0 { err = fmt.Errorf("%v: check whether /proc/sys/user/max_user_namespaces is set too low (gvisor.dev/issue/5964)", err) } return deferFuncs, err } // SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { if s.Linux == nil { return } if cmd.SysProcAttr == nil { cmd.SysProcAttr = &unix.SysProcAttr{} } for _, idMap := range s.Linux.UIDMappings { log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{ ContainerID: int(idMap.ContainerID), HostID: int(idMap.HostID), Size: int(idMap.Size), }) } for _, idMap := range s.Linux.GIDMappings { log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{ ContainerID: int(idMap.ContainerID), HostID: int(idMap.HostID), Size: int(idMap.Size), }) } } // HasCapabilities returns true if the user has all capabilities in 'cs'. func HasCapabilities(cs ...capability.Cap) bool { caps, err := capability.NewPid2(os.Getpid()) if err != nil { return false } if err := caps.Load(); err != nil { return false } for _, c := range cs { if !caps.Get(capability.EFFECTIVE, c) { return false } } return true } // MaybeRunAsRoot ensures the process runs with capabilities needed to create a // sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed, // it will create a new user namespace and re-execute the process as root // inside the namespace with the same arguments and environment. // // This function returns immediately when no new capability is needed. If // another process is executed, it returns straight from here with the same exit // code as the child. func MaybeRunAsRoot() error { if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) { return nil } // Current process doesn't have required capabilities, create user namespace // and run as root inside the namespace to acquire capabilities. log.Infof("*** Re-running as root in new user namespace ***") cmd := exec.Command("/proc/self/exe", os.Args[1:]...) cmd.SysProcAttr = &unix.SysProcAttr{ Cloneflags: unix.CLONE_NEWUSER | unix.CLONE_NEWNS, // Set current user/group as root inside the namespace. Since we may not // have CAP_SETUID/CAP_SETGID, just map root to the current user/group. UidMappings: []syscall.SysProcIDMap{ {ContainerID: 0, HostID: os.Getuid(), Size: 1}, }, GidMappings: []syscall.SysProcIDMap{ {ContainerID: 0, HostID: os.Getgid(), Size: 1}, }, Credential: &syscall.Credential{Uid: 0, Gid: 0}, GidMappingsEnableSetgroups: false, // Make sure child is killed when the parent terminates. Pdeathsig: unix.SIGKILL, // Detach from session. Otherwise, signals sent to the foreground process // will also be forwarded by this process, resulting in duplicate signals. Setsid: true, } cmd.Env = os.Environ() cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if err := cmd.Start(); err != nil { return fmt.Errorf("re-executing self: %w", err) } ch := make(chan os.Signal, 1) signal.Notify(ch) go func() { for { // Forward all signals to child process. 
sig := <-ch if err := cmd.Process.Signal(sig); err != nil { log.Warningf("Error forwarding signal %v to child (PID %d)", sig, cmd.Process.Pid) } } }() if err := cmd.Wait(); err != nil { if exit, ok := err.(*exec.ExitError); ok { if ws, ok := exit.Sys().(syscall.WaitStatus); ok { os.Exit(ws.ExitStatus()) } log.Warningf("No wait status provided, exiting with -1: %v", err) os.Exit(-1) } return err } // Child completed with success. os.Exit(0) panic("unreachable") } golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/nvidia.go000066400000000000000000000100631465435605700236110ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package specutils import ( "fmt" "strconv" "strings" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/runsc/config" ) const nvdEnvVar = "NVIDIA_VISIBLE_DEVICES" // AnnotationNVProxy enables nvproxy. const AnnotationNVProxy = "dev.gvisor.internal.nvproxy" // NVProxyEnabled checks both the nvproxy annotation and conf.NVProxy to see if nvproxy is enabled. func NVProxyEnabled(spec *specs.Spec, conf *config.Config) bool { if conf.NVProxy { return true } return AnnotationToBool(spec, AnnotationNVProxy) } // GPUFunctionalityRequested returns true if the container should have access // to GPU functionality. func GPUFunctionalityRequested(spec *specs.Spec, conf *config.Config) bool { if !NVProxyEnabled(spec, conf) { // nvproxy disabled. return false } // In GKE, the nvidia_gpu device plugin injects NVIDIA devices into // spec.Linux.Devices when GPUs are allocated to a container. if spec.Linux != nil { for _, dev := range spec.Linux.Devices { if dev.Path == "/dev/nvidiactl" { return true } } } return gpuFunctionalityRequestedViaHook(spec, conf) } // GPUFunctionalityRequestedViaHook returns true if the container should have // access to GPU functionality configured via nvidia-container-runtime-hook. // This hook is used by: // - Docker when using `--gpus` flag from the CLI. // - nvidia-container-runtime when using its legacy mode. func GPUFunctionalityRequestedViaHook(spec *specs.Spec, conf *config.Config) bool { if !NVProxyEnabled(spec, conf) { // nvproxy disabled. return false } return gpuFunctionalityRequestedViaHook(spec, conf) } // Precondition: NVProxyEnabled(spec, conf). func gpuFunctionalityRequestedViaHook(spec *specs.Spec, conf *config.Config) bool { if !isNvidiaHookPresent(spec, conf) { return false } // In Docker mode, GPU access is only requested if NVIDIA_VISIBLE_DEVICES is // non-empty and set to a value that doesn't mean "no GPU". if spec.Process == nil { return false } nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar) // A value of "none" means "no GPU device, but still access to driver // functionality", so it is not a value we check for here. return nvd != "" && nvd != "void" } func isNvidiaHookPresent(spec *specs.Spec, conf *config.Config) bool { if conf.NVProxyDocker { // This has the effect of injecting the nvidia-container-runtime-hook. 
return true } if spec.Hooks != nil { for _, h := range spec.Hooks.Prestart { if strings.HasSuffix(h.Path, "/nvidia-container-runtime-hook") { return true } } } return false } // ParseNvidiaVisibleDevices parses NVIDIA_VISIBLE_DEVICES env var and returns // the devices specified in it. This can be passed to nvidia-container-cli. // // Precondition: conf.NVProxyDocker && GPUFunctionalityRequested(spec, conf). func ParseNvidiaVisibleDevices(spec *specs.Spec) (string, error) { nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar) if nvd == "none" { return "", nil } if nvd == "all" { return "all", nil } // Expect nvd to be a list of indices; UUIDs aren't supported // yet. for _, gpuDev := range strings.Split(nvd, ",") { // Validate gpuDev. We only support the following formats for now: // * GPU indices (e.g. 0,1,2) // * GPU UUIDs (e.g. GPU-fef8089b) // // We do not support MIG devices yet. if strings.HasPrefix(gpuDev, "GPU-") { continue } _, err := strconv.ParseUint(gpuDev, 10, 32) if err != nil { return "", fmt.Errorf("invalid %q in NVIDIA_VISIBLE_DEVICES %q: %w", gpuDev, nvd, err) } } return nvd, nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/seccomp/000077500000000000000000000000001465435605700234415ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/seccomp/audit_amd64.go000066400000000000000000000013561465435605700260760ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 // +build amd64 package seccomp import ( "gvisor.dev/gvisor/pkg/abi/linux" ) const ( nativeArchAuditNo = linux.AUDIT_ARCH_X86_64 ) golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/seccomp/audit_arm64.go000066400000000000000000000013571465435605700261150ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build arm64 // +build arm64 package seccomp import ( "gvisor.dev/gvisor/pkg/abi/linux" ) const ( nativeArchAuditNo = linux.AUDIT_ARCH_AARCH64 ) golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/seccomp/seccomp.go000066400000000000000000000152301465435605700254220ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package seccomp implements some features of libseccomp in order to support // OCI. package seccomp import ( "fmt" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/kernel" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" ) var ( killThreadAction = linux.SECCOMP_RET_KILL_THREAD trapAction = linux.SECCOMP_RET_TRAP // runc always returns EPERM as the errorcode for SECCOMP_RET_ERRNO errnoAction = linux.SECCOMP_RET_ERRNO.WithReturnCode(uint16(unix.EPERM)) // runc always returns EPERM as the errorcode for SECCOMP_RET_TRACE traceAction = linux.SECCOMP_RET_TRACE.WithReturnCode(uint16(unix.EPERM)) allowAction = linux.SECCOMP_RET_ALLOW ) // BuildProgram generates a bpf program based on the given OCI seccomp // config. func BuildProgram(s *specs.LinuxSeccomp) (bpf.Program, error) { defaultAction, err := convertAction(s.DefaultAction) if err != nil { return bpf.Program{}, fmt.Errorf("secomp default action: %w", err) } ruleset, err := convertRules(s) if err != nil { return bpf.Program{}, fmt.Errorf("invalid seccomp rules: %w", err) } instrs, _, err := seccomp.BuildProgram(ruleset, seccomp.ProgramOptions{ DefaultAction: defaultAction, BadArchAction: killThreadAction, }) if err != nil { return bpf.Program{}, fmt.Errorf("building seccomp program: %w", err) } program, err := bpf.Compile(instrs, true /* optimize */) if err != nil { return bpf.Program{}, fmt.Errorf("compiling seccomp program: %w", err) } return program, nil } // lookupSyscallNo gets the syscall number for the syscall with the given name // for the given architecture. func lookupSyscallNo(arch uint32, name string) (uint32, error) { var table *kernel.SyscallTable switch arch { case linux.AUDIT_ARCH_X86_64: table = slinux.AMD64 case linux.AUDIT_ARCH_AARCH64: table = slinux.ARM64 } if table == nil { return 0, fmt.Errorf("unsupported architecture: %d", arch) } n, err := table.LookupNo(name) if err != nil { return 0, err } return uint32(n), nil } // convertAction converts a LinuxSeccompAction to BPFAction func convertAction(act specs.LinuxSeccompAction) (linux.BPFAction, error) { // TODO(gvisor.dev/issue/3124): Update specs package to include ActLog and ActKillProcess. switch act { case specs.ActKill: return killThreadAction, nil case specs.ActTrap: return trapAction, nil case specs.ActErrno: return errnoAction, nil case specs.ActTrace: return traceAction, nil case specs.ActAllow: return allowAction, nil default: return 0, fmt.Errorf("invalid action: %v", act) } } // convertRules converts OCI linux seccomp rules into RuleSets that can be used by // the seccomp package to build a seccomp program. func convertRules(s *specs.LinuxSeccomp) ([]seccomp.RuleSet, error) { // NOTE: Architectures are only really relevant when calling 32bit syscalls // on a 64bit system. Since we don't support that in gVisor anyway, we // ignore Architectures and only test against the native architecture. 
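// As a minimal illustration (hypothetical OCI input), an entry such as
//
//	specs.LinuxSyscall{Names: []string{"openat"}, Action: specs.ActErrno}
//
// becomes a single seccomp.RuleSet matching the native openat syscall number
// with errnoAction (SECCOMP_RET_ERRNO returning EPERM, mirroring runc).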
ruleset := []seccomp.RuleSet{} for _, syscall := range s.Syscalls { sysRules := seccomp.NewSyscallRules() action, err := convertAction(syscall.Action) if err != nil { return nil, err } // Args rule, err := convertArgs(syscall.Args) if err != nil { return nil, err } for _, name := range syscall.Names { syscallNo, err := lookupSyscallNo(nativeArchAuditNo, name) if err != nil { // If there is an error looking up the syscall number, assume it is // not supported on this architecture and ignore it. This is, for // better or worse, what runc does. log.Warningf("OCI seccomp: ignoring syscall %q", name) continue } sysRules.Add(uintptr(syscallNo), rule) } ruleset = append(ruleset, seccomp.RuleSet{ Rules: sysRules, Action: action, }) } return ruleset, nil } // convertArgs converts an OCI seccomp argument rule to a list of seccomp.Rule. func convertArgs(args []specs.LinuxSeccompArg) (seccomp.SyscallRule, error) { argCounts := make([]uint, 6) for _, arg := range args { if arg.Index > 6 { return nil, fmt.Errorf("invalid index: %d", arg.Index) } argCounts[arg.Index]++ } // NOTE: If multiple rules apply to the same argument (same index) the // action is triggered if any one of the rules matches (OR). If not, then // all rules much match in order to trigger the action (AND). This appears to // be some kind of legacy behavior of runc that nevertheless needs to be // supported to maintain compatibility. hasMultipleArgs := false for _, count := range argCounts { if count > 1 { hasMultipleArgs = true break } } if hasMultipleArgs { rules := seccomp.Or{} // Old runc behavior - do this for compatibility. // Add rules as ORs by adding separate Rules. for _, arg := range args { rule := seccomp.PerArg{nil, nil, nil, nil, nil, nil} if err := convertRule(arg, &rule); err != nil { return nil, err } rules = append(rules, rule) } return rules, nil } // Add rules as ANDs by adding to the same Rule. rule := seccomp.PerArg{nil, nil, nil, nil, nil, nil} for _, arg := range args { if err := convertRule(arg, &rule); err != nil { return nil, err } } return rule, nil } // convertRule converts and adds the arg to a PerArg rule. func convertRule(arg specs.LinuxSeccompArg, perArg *seccomp.PerArg) error { switch arg.Op { case specs.OpEqualTo: perArg[arg.Index] = seccomp.EqualTo(arg.Value) case specs.OpNotEqual: perArg[arg.Index] = seccomp.NotEqual(arg.Value) case specs.OpGreaterThan: perArg[arg.Index] = seccomp.GreaterThan(arg.Value) case specs.OpGreaterEqual: perArg[arg.Index] = seccomp.GreaterThanOrEqual(arg.Value) case specs.OpLessThan: perArg[arg.Index] = seccomp.LessThan(arg.Value) case specs.OpLessEqual: perArg[arg.Index] = seccomp.LessThanOrEqual(arg.Value) case specs.OpMaskedEqual: perArg[arg.Index] = seccomp.MaskedEqual(uintptr(arg.Value), uintptr(arg.ValueTwo)) default: return fmt.Errorf("unsupported operand: %q", arg.Op) } return nil } golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/seccomp/seccomp_amd64_state_autogen.go000066400000000000000000000001331465435605700313330ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 // +build amd64 package seccomp golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/seccomp/seccomp_arm64_state_autogen.go000066400000000000000000000001331465435605700313510ustar00rootroot00000000000000// automatically generated by stateify. 
//go:build arm64 // +build arm64 package seccomp golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/seccomp/seccomp_state_autogen.go000066400000000000000000000000711465435605700303410ustar00rootroot00000000000000// automatically generated by stateify. package seccomp golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/specutils.go000066400000000000000000000621671465435605700243660ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package specutils contains utility functions for working with OCI runtime // specs. package specutils import ( "encoding/json" "fmt" "io" "io/ioutil" "os" "path" "path/filepath" "strconv" "strings" "time" "github.com/cenkalti/backoff" "github.com/mohae/deepcopy" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/flag" ) const ( annotationFlagPrefix = "dev.gvisor.flag." annotationSeccomp = "dev.gvisor.internal.seccomp." annotationSeccompRuntimeDefault = "RuntimeDefault" annotationContainerName = "io.kubernetes.cri.container-name" ) const ( // AnnotationTPU is the annotation used to enable TPU proxy on a pod. AnnotationTPU = "dev.gvisor.internal.tpuproxy" ) // ExePath must point to runsc binary, which is normally the same binary. It's // changed in tests that aren't linked in the same binary. var ExePath = "/proc/self/exe" // Version is the supported spec version. var Version = specs.Version // LogSpecDebug writes the spec in a human-friendly format to the debug log. func LogSpecDebug(orig *specs.Spec, logSeccomp bool) { if !log.IsLogging(log.Debug) { return } // Strip down parts of the spec that are not interesting. spec := deepcopy.Copy(orig).(*specs.Spec) if spec.Process != nil { spec.Process.Capabilities = nil } if spec.Linux != nil { if !logSeccomp { spec.Linux.Seccomp = nil } spec.Linux.MaskedPaths = nil spec.Linux.ReadonlyPaths = nil if spec.Linux.Resources != nil { spec.Linux.Resources.Devices = nil } } out, err := json.MarshalIndent(spec, "", " ") if err != nil { log.Debugf("Failed to marshal spec: %v", err) return } log.Debugf("Spec:\n%s", out) } // ValidateSpec validates that the spec is compatible with runsc. func ValidateSpec(spec *specs.Spec) error { // Mandatory fields. if spec.Process == nil { return fmt.Errorf("Spec.Process must be defined: %+v", spec) } if len(spec.Process.Args) == 0 { return fmt.Errorf("Spec.Process.Arg must be defined: %+v", spec.Process) } if spec.Root == nil { return fmt.Errorf("Spec.Root must be defined: %+v", spec) } if len(spec.Root.Path) == 0 { return fmt.Errorf("Spec.Root.Path must be defined: %+v", spec.Root) } // Unsupported fields. 
if spec.Solaris != nil { return fmt.Errorf("Spec.Solaris is not supported: %+v", spec) } if spec.Windows != nil { return fmt.Errorf("Spec.Windows is not supported: %+v", spec) } if len(spec.Process.SelinuxLabel) != 0 { return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel) } // Docker uses AppArmor by default, so just log that it's being ignored. if spec.Process.ApparmorProfile != "" { log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile) } // PR_SET_NO_NEW_PRIVS is assumed to always be set. // See kernel.Task.updateCredsForExecLocked. if !spec.Process.NoNewPrivileges { log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.") } if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil { return err } } for _, m := range spec.Mounts { if err := validateMount(&m); err != nil { return err } } // CRI specifies whether a container should start a new sandbox, or run // another container in an existing sandbox. switch SpecContainerType(spec) { case ContainerTypeContainer: // When starting a container in an existing sandbox, the // sandbox ID must be set. if _, ok := SandboxID(spec); !ok { return fmt.Errorf("spec has container-type of container, but no sandbox ID set") } case ContainerTypeUnknown: return fmt.Errorf("unknown container-type") default: } return nil } // absPath turns the given path into an absolute path (if it is not already // absolute) by prepending the base path. func absPath(base, rel string) string { if filepath.IsAbs(rel) { return rel } return filepath.Join(base, rel) } // OpenSpec opens an OCI runtime spec from the given bundle directory. func OpenSpec(bundleDir string) (*os.File, error) { // The spec file must be named "config.json" inside the bundle directory. return os.Open(filepath.Join(bundleDir, "config.json")) } // ReadSpec reads an OCI runtime spec from the given bundle directory. // ReadSpec also normalizes all potential relative paths into absolute // path, e.g. spec.Root.Path, mount.Source. func ReadSpec(bundleDir string, conf *config.Config) (*specs.Spec, error) { specFile, err := OpenSpec(bundleDir) if err != nil { return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err) } defer specFile.Close() return ReadSpecFromFile(bundleDir, specFile, conf) } // ReadSpecFromFile reads an OCI runtime spec from the given file. It also fixes // up the spec so that the rest of the code doesn't need to worry about it. // 1. Normalizes all relative paths into absolute by prepending the bundle // dir to them. // 2. Looks for flag overrides and applies them if any. // 3. Removes seccomp rules if `RuntimeDefault` was used. 
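// As an illustration of (2), an annotation such as (hypothetical value)
//
//	"dev.gvisor.flag.debug": "true"
//
// is applied as if --debug=true had been passed to runsc for this sandbox.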
func ReadSpecFromFile(bundleDir string, specFile *os.File, conf *config.Config) (*specs.Spec, error) { if _, err := specFile.Seek(0, io.SeekStart); err != nil { return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err) } specBytes, err := ioutil.ReadAll(specFile) if err != nil { return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err) } var spec specs.Spec if err := json.Unmarshal(specBytes, &spec); err != nil { return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes)) } if err := ValidateSpec(&spec); err != nil { return nil, err } if err := fixSpec(&spec, bundleDir, conf); err != nil { return nil, err } return &spec, nil } func fixSpec(spec *specs.Spec, bundleDir string, conf *config.Config) error { // Turn any relative paths in the spec to absolute by prepending the bundleDir. spec.Root.Path = absPath(bundleDir, spec.Root.Path) for i := range spec.Mounts { m := &spec.Mounts[i] if m.Source != "" { m.Source = absPath(bundleDir, m.Source) } } // Look for config bundle annotations and verify that they exist. const configBundlePrefix = "dev.gvisor.bundle." var bundles []config.BundleName for annotation, val := range spec.Annotations { if !strings.HasPrefix(annotation, configBundlePrefix) { continue } if val != "true" { return fmt.Errorf("invalid value %q for annotation %q (must be set to 'true' or removed entirely)", val, annotation) } bundleName := config.BundleName(annotation[len(configBundlePrefix):]) if _, exists := config.Bundles[bundleName]; !exists { log.Warningf("Bundle name %q (from annotation %q=%q) does not exist; this bundle may have been deprecated. Skipping.", bundleName, annotation, val) continue } bundles = append(bundles, bundleName) } // Apply config bundles, if any. if len(bundles) > 0 { log.Infof("Applying config bundles: %v", bundles) if err := conf.ApplyBundles(flag.CommandLine, bundles...); err != nil { return err } } containerName := ContainerName(spec) for annotation, val := range spec.Annotations { if strings.HasPrefix(annotation, annotationFlagPrefix) { // Override flags using annotation to allow customization per sandbox // instance. name := annotation[len(annotationFlagPrefix):] log.Infof("Overriding flag from flag annotation: --%s=%q", name, val) if err := conf.Override(flag.CommandLine, name, val /* force= */, false); err != nil { return err } } else if len(containerName) > 0 { // If we know the container name, then check to see if seccomp // instructions were given to the container. if annotation == annotationSeccomp+containerName && val == annotationSeccompRuntimeDefault { // Container seccomp rules are redundant when using gVisor, so remove // them when seccomp is set to RuntimeDefault. if spec.Linux != nil && spec.Linux.Seccomp != nil { log.Debugf("Seccomp is being ignored because annotation %q is set to default.", annotationSeccomp) spec.Linux.Seccomp = nil } } } } return nil } // ReadMounts reads mount list from a file. func ReadMounts(f *os.File) ([]specs.Mount, error) { bytes, err := ioutil.ReadAll(f) if err != nil { return nil, fmt.Errorf("error reading mounts: %v", err) } var mounts []specs.Mount if err := json.Unmarshal(bytes, &mounts); err != nil { return nil, fmt.Errorf("error unmarshaling mounts: %v\nJSON bytes:\n%s", err, string(bytes)) } return mounts, nil } // ChangeMountType changes m.Type to the specified type. It may do necessary // amends to m.Options. 
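// For example (hypothetical mount), changing a mount with Type "bind" and
// Options []string{"rbind", "ro"} to newType "tmpfs" also strips "rbind",
// leaving Options []string{"ro"}; changing it to "bind" leaves the options
// untouched.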
func ChangeMountType(m *specs.Mount, newType string) { m.Type = newType // OCI spec allows bind mounts to be specified in options only. So if new type // is not bind, remove bind/rbind from options. // // "For bind mounts (when options include either bind or rbind), the type is // a dummy, often "none" (not listed in /proc/filesystems)." if newType != "bind" { newOpts := make([]string, 0, len(m.Options)) for _, opt := range m.Options { if opt != "rbind" && opt != "bind" { newOpts = append(newOpts, opt) } } m.Options = newOpts } } // Capabilities takes in spec and returns a TaskCapabilities corresponding to // the spec. func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { // Strip CAP_NET_RAW from all capability sets if necessary. skipSet := map[linux.Capability]struct{}{} if !enableRaw { skipSet[linux.CAP_NET_RAW] = struct{}{} } var caps auth.TaskCapabilities if specCaps != nil { var err error if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil { return nil, err } if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil { return nil, err } if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil { return nil, err } if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil { return nil, err } // TODO(gvisor.dev/issue/3166): Support ambient capabilities. } return &caps, nil } // AllCapabilities returns a LinuxCapabilities struct with all capabilities. func AllCapabilities() *specs.LinuxCapabilities { var names []string for n := range capFromName { names = append(names, n) } return &specs.LinuxCapabilities{ Bounding: names, Effective: names, Inheritable: names, Permitted: names, Ambient: names, } } // AllCapabilitiesUint64 returns a bitmask containing all capabilities set. func AllCapabilitiesUint64() uint64 { var rv uint64 for _, cap := range capFromName { rv |= bits.MaskOf64(int(cap)) } return rv } // MergeCapabilities merges the capabilities from first and second. func MergeCapabilities(first, second *specs.LinuxCapabilities) *specs.LinuxCapabilities { return &specs.LinuxCapabilities{ Bounding: mergeUnique(first.Bounding, second.Bounding), Effective: mergeUnique(first.Effective, second.Effective), Inheritable: mergeUnique(first.Inheritable, second.Inheritable), Permitted: mergeUnique(first.Permitted, second.Permitted), Ambient: mergeUnique(first.Ambient, second.Ambient), } } // DropCapability removes the specified capability from all capability sets. 
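// A minimal usage sketch (hypothetical caller):
//
//	caps := AllCapabilities()
//	DropCapability(caps, "CAP_NET_RAW")
//	// The Bounding, Effective, Inheritable, Permitted, and Ambient sets no
//	// longer contain "CAP_NET_RAW".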
func DropCapability(caps *specs.LinuxCapabilities, drop string) { caps.Bounding = remove(caps.Bounding, drop) caps.Effective = remove(caps.Effective, drop) caps.Inheritable = remove(caps.Inheritable, drop) caps.Permitted = remove(caps.Permitted, drop) caps.Ambient = remove(caps.Ambient, drop) } func mergeUnique(strSlices ...[]string) []string { common := make(map[string]struct{}) for _, strSlice := range strSlices { for _, s := range strSlice { common[s] = struct{}{} } } res := make([]string, 0, len(common)) for s := range common { res = append(res, s) } return res } func remove(ss []string, rem string) []string { var out []string for _, s := range ss { if s == rem { continue } out = append(out, s) } return out } var capFromName = map[string]linux.Capability{ "CAP_CHOWN": linux.CAP_CHOWN, "CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE, "CAP_DAC_READ_SEARCH": linux.CAP_DAC_READ_SEARCH, "CAP_FOWNER": linux.CAP_FOWNER, "CAP_FSETID": linux.CAP_FSETID, "CAP_KILL": linux.CAP_KILL, "CAP_SETGID": linux.CAP_SETGID, "CAP_SETUID": linux.CAP_SETUID, "CAP_SETPCAP": linux.CAP_SETPCAP, "CAP_LINUX_IMMUTABLE": linux.CAP_LINUX_IMMUTABLE, "CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE, "CAP_NET_BROADCAST": linux.CAP_NET_BROADCAST, "CAP_NET_ADMIN": linux.CAP_NET_ADMIN, "CAP_NET_RAW": linux.CAP_NET_RAW, "CAP_IPC_LOCK": linux.CAP_IPC_LOCK, "CAP_IPC_OWNER": linux.CAP_IPC_OWNER, "CAP_SYS_MODULE": linux.CAP_SYS_MODULE, "CAP_SYS_RAWIO": linux.CAP_SYS_RAWIO, "CAP_SYS_CHROOT": linux.CAP_SYS_CHROOT, "CAP_SYS_PTRACE": linux.CAP_SYS_PTRACE, "CAP_SYS_PACCT": linux.CAP_SYS_PACCT, "CAP_SYS_ADMIN": linux.CAP_SYS_ADMIN, "CAP_SYS_BOOT": linux.CAP_SYS_BOOT, "CAP_SYS_NICE": linux.CAP_SYS_NICE, "CAP_SYS_RESOURCE": linux.CAP_SYS_RESOURCE, "CAP_SYS_TIME": linux.CAP_SYS_TIME, "CAP_SYS_TTY_CONFIG": linux.CAP_SYS_TTY_CONFIG, "CAP_MKNOD": linux.CAP_MKNOD, "CAP_LEASE": linux.CAP_LEASE, "CAP_AUDIT_WRITE": linux.CAP_AUDIT_WRITE, "CAP_AUDIT_CONTROL": linux.CAP_AUDIT_CONTROL, "CAP_SETFCAP": linux.CAP_SETFCAP, "CAP_MAC_OVERRIDE": linux.CAP_MAC_OVERRIDE, "CAP_MAC_ADMIN": linux.CAP_MAC_ADMIN, "CAP_SYSLOG": linux.CAP_SYSLOG, "CAP_WAKE_ALARM": linux.CAP_WAKE_ALARM, "CAP_BLOCK_SUSPEND": linux.CAP_BLOCK_SUSPEND, "CAP_AUDIT_READ": linux.CAP_AUDIT_READ, "CAP_PERFMON": linux.CAP_PERFMON, "CAP_BPF": linux.CAP_BPF, "CAP_CHECKPOINT_RESTORE": linux.CAP_CHECKPOINT_RESTORE, } func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) { var caps []linux.Capability for _, n := range names { c, ok := capFromName[n] if !ok { return 0, fmt.Errorf("unknown capability %q", n) } // Should we skip this capability? if _, ok := skipSet[c]; ok { continue } caps = append(caps, c) } return auth.CapabilitySetOfMany(caps), nil } // IsGoferMount returns true if the given mount can be mounted as an external // gofer. func IsGoferMount(m specs.Mount) bool { MaybeConvertToBindMount(&m) return m.Type == "bind" && m.Source != "" } // MaybeConvertToBindMount converts mount type to "bind" in case any of the // mount options are either "bind" or "rbind" as required by the OCI spec. // // "For bind mounts (when options include either bind or rbind), the type is a // dummy, often "none" (not listed in /proc/filesystems)." func MaybeConvertToBindMount(m *specs.Mount) { if m.Type == "bind" { return } for _, opt := range m.Options { if opt == "bind" || opt == "rbind" { m.Type = "bind" return } } } // WaitForReady waits for a process to become ready. The process is ready when // the 'ready' function returns true. 
It continues to wait if 'ready' returns // false. It returns an error on timeout, if the process stops or if 'ready' fails. func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error { b := backoff.NewExponentialBackOff() b.InitialInterval = 1 * time.Millisecond b.MaxInterval = 1 * time.Second b.MaxElapsedTime = timeout op := func() error { if ok, err := ready(); err != nil { return backoff.Permanent(err) } else if ok { return nil } // Check if the process is still running. // If the process is alive, child is 0 because of the NOHANG option. // If the process has terminated, child equals the process id. var ws unix.WaitStatus var ru unix.Rusage child, err := unix.Wait4(pid, &ws, unix.WNOHANG, &ru) if err != nil { return backoff.Permanent(fmt.Errorf("error waiting for process: %v", err)) } else if child == pid { return backoff.Permanent(fmt.Errorf("process %d has terminated", pid)) } return fmt.Errorf("process %d not running yet", pid) } return backoff.Retry(op, b) } // DebugLogFile opens a log file using 'logPattern' as location. If 'logPattern' // ends with '/', it's used as a directory with default file name. // 'logPattern' can contain variables that are substituted: // - %TIMESTAMP%: is replaced with a timestamp in the format yyyymmdd-hhmmss.uuuuuu // - %COMMAND%: is replaced with 'command' // - %TEST%: is replaced with 'test' (omitted by default) func DebugLogFile(logPattern, command, test string, timestamp time.Time) (*os.File, error) { if strings.HasSuffix(logPattern, "/") { // Default file name: runsc.log.%TIMESTAMP%.%COMMAND%.txt logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%.txt" } logPattern = strings.Replace(logPattern, "%TIMESTAMP%", timestamp.Format("20060102-150405.000000"), -1) logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1) logPattern = strings.Replace(logPattern, "%TEST%", test, -1) dir := filepath.Dir(logPattern) if err := os.MkdirAll(dir, 0775); err != nil { return nil, fmt.Errorf("error creating dir %q: %v", dir, err) } return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) } // IsDebugCommand returns whether the command should be debugged, based // on the current configuration. func IsDebugCommand(conf *config.Config, command string) bool { if len(conf.DebugCommand) == 0 { // Debug everything by default. return true } filter := conf.DebugCommand rv := true if filter[0] == '!' { // Negate the match, e.g. !boot should log all, but "boot". filter = filter[1:] rv = false } for _, cmd := range strings.Split(filter, ",") { if cmd == command { return rv } } return !rv } // TPUProxyIsEnabled checks if tpuproxy is enabled in the config or annotations. func TPUProxyIsEnabled(spec *specs.Spec, conf *config.Config) bool { if conf.TPUProxy { return true } return AnnotationToBool(spec, AnnotationTPU) } // VFIOFunctionalityRequested returns true if the container should have access // to VFIO functionality. func VFIOFunctionalityRequested(dev *specs.LinuxDevice) bool { return strings.HasPrefix(dev.Path, filepath.Dir(tpuproxy.VFIOPath)) } // AcceleratorFunctionalityRequested returns true if the container should have // access to compute accelerators. Compute accelerators are different from GPUs // by using a different major number and different device char files. func AcceleratorFunctionalityRequested(dev *specs.LinuxDevice) bool { return strings.HasPrefix(dev.Path, "/dev/accel") } // TPUFunctionalityRequested returns true if the container should have access // to TPU functionality. 
func TPUFunctionalityRequested(spec *specs.Spec, conf *config.Config) bool { if !TPUProxyIsEnabled(spec, conf) { return false } if spec.Linux != nil { for _, dev := range spec.Linux.Devices { if AcceleratorFunctionalityRequested(&dev) || VFIOFunctionalityRequested(&dev) { return true } } } return false } // SafeSetupAndMount creates the mount point and calls Mount with the given // flags. procPath is the path to procfs. If it is "", procfs is assumed to be // mounted at /proc. func SafeSetupAndMount(src, dst, typ string, flags uint32, procPath string) error { // Create the mount point inside. The type must be the same as the source // (file or directory). var isDir bool if typ == "proc" { // Special case, as there is no source directory for proc mounts. isDir = true } else if fi, err := os.Stat(src); err != nil { return fmt.Errorf("stat(%q) failed: %v", src, err) } else { isDir = fi.IsDir() } if isDir { // Create the destination directory. if err := os.MkdirAll(dst, 0777); err != nil { return fmt.Errorf("mkdir(%q) failed: %v", dst, err) } } else { // Create the parent destination directory. parent := path.Dir(dst) if err := os.MkdirAll(parent, 0777); err != nil { return fmt.Errorf("mkdir(%q) failed: %v", parent, err) } // Create the destination file if it does not exist. f, err := os.OpenFile(dst, unix.O_CREAT, 0777) if err != nil { return fmt.Errorf("open(%q) failed: %v", dst, err) } f.Close() } // Do the mount. if err := SafeMount(src, dst, typ, uintptr(flags), "", procPath); err != nil { return fmt.Errorf("mount(%q, %q, %d) failed: %v", src, dst, flags, err) } return nil } // ErrSymlinkMount is returned by SafeMount when the mount destination is found // to be a symlink. type ErrSymlinkMount struct { error } // SafeMount is like unix.Mount, but will fail if dst is a symlink. procPath is // the path to procfs. If it is "", procfs is assumed to be mounted at /proc. // // SafeMount can fail when dst contains a symlink. However, it is called in the // normal case with a destination consisting of a known root (/proc/root) and // symlink-free path (from resolveSymlink). func SafeMount(src, dst, fstype string, flags uintptr, data, procPath string) error { // Open the destination. fd, err := unix.Open(dst, unix.O_PATH|unix.O_CLOEXEC, 0) if err != nil { return fmt.Errorf("failed to safely mount: Open(%s, _, _): %w", dst, err) } defer unix.Close(fd) // Use /proc/self/fd/ to verify that we opened the intended destination. This // guards against dst being a symlink, in which case we could accidentally // mount over the symlink's target. if procPath == "" { procPath = "/proc" } safePath := filepath.Join(procPath, "self/fd", strconv.Itoa(fd)) target, err := os.Readlink(safePath) if err != nil { return fmt.Errorf("failed to safely mount: Readlink(%s): %w", safePath, err) } if dst != target { return &ErrSymlinkMount{fmt.Errorf("failed to safely mount: expected to open %s, but found %s", dst, target)} } return unix.Mount(src, safePath, fstype, flags, data) } // RetryEintr retries the function until an error different than EINTR is // returned. 
func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) { for { r1, r2, err := f() if err != unix.EINTR { return r1, r2, err } } } // GetOOMScoreAdj reads the given process' oom_score_adj func GetOOMScoreAdj(pid int) (int, error) { data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid)) if err != nil { return 0, err } return strconv.Atoi(strings.TrimSpace(string(data))) } // EnvVar looks for a variable value in the env slice assuming the following // format: "NAME=VALUE". If a variable is defined multiple times, the last // value is used. func EnvVar(env []string, name string) (string, bool) { var err error env, err = ResolveEnvs(env) if err != nil { return "", false } prefix := name + "=" for _, e := range env { if strings.HasPrefix(e, prefix) { return strings.TrimPrefix(e, prefix), true } } return "", false } // ResolveEnvs transforms lists of environment variables into a single list of // environment variables. If a variable is defined multiple times, the last // value is used. func ResolveEnvs(envs ...[]string) ([]string, error) { // First create a map of variable names to values. This removes any // duplicates. envMap := make(map[string]string) for _, env := range envs { for _, str := range env { parts := strings.SplitN(str, "=", 2) if len(parts) != 2 { return nil, fmt.Errorf("invalid variable: %s", str) } envMap[parts[0]] = parts[1] } } // Reassemble envMap into a list of environment variables of the form // NAME=VALUE. env := make([]string, 0, len(envMap)) for k, v := range envMap { env = append(env, fmt.Sprintf("%s=%s", k, v)) } return env, nil } // FaqErrorMsg returns an error message pointing to the FAQ. func FaqErrorMsg(anchor, msg string) string { return fmt.Sprintf("%s; see https://gvisor.dev/faq#%s for more details", msg, anchor) } // ContainerName looks for an annotation in the spec with the container name. Returns empty string // if no annotation is found. func ContainerName(spec *specs.Spec) string { return spec.Annotations[annotationContainerName] } // AnnotationToBool parses the annotation value as a bool. On failure, it logs a warning and // returns false. func AnnotationToBool(spec *specs.Spec, annotation string) bool { val, ok := spec.Annotations[annotation] if !ok { return false } ret, err := strconv.ParseBool(val) if err != nil { log.Warningf("Failed to parse annotation %q=%q as a bool: %v", annotation, val, err) return false } return ret } golang-gvisor-gvisor-0.0~20240729.0/runsc/specutils/specutils_state_autogen.go000066400000000000000000000000731465435605700272740ustar00rootroot00000000000000// automatically generated by stateify. package specutils golang-gvisor-gvisor-0.0~20240729.0/runsc/starttime/000077500000000000000000000000001465435605700220115ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/starttime/starttime.go000066400000000000000000000017071465435605700243610ustar00rootroot00000000000000// Copyright 2024 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// Package starttime holds the time the `runsc` command started. // It is useful in order to plumb this time wherever needed. package starttime import ( "sync" "time" ) var ( setOnce sync.Once startTime time.Time ) // Get returns the time the `runsc` command started. func Get() time.Time { setOnce.Do(func() { startTime = time.Now() }) return startTime } golang-gvisor-gvisor-0.0~20240729.0/runsc/starttime/starttime_state_autogen.go000066400000000000000000000000731465435605700272760ustar00rootroot00000000000000// automatically generated by stateify. package starttime golang-gvisor-gvisor-0.0~20240729.0/runsc/version/000077500000000000000000000000001465435605700214625ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/runsc/version/version.go000066400000000000000000000017651465435605700235070ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !false // +build !false // Package version holds a string containing version information for runsc. // Other packages may import it to get this information while avoiding // import loops. package version // version is the version string. // It is initialized by the runsc main() function. var version = "VERSION_MISSING" // Version returns the version string. func Version() string { return version } golang-gvisor-gvisor-0.0~20240729.0/runsc/version/version_state_autogen.go000066400000000000000000000001351465435605700264170ustar00rootroot00000000000000// automatically generated by stateify. //go:build !false // +build !false package version golang-gvisor-gvisor-0.0~20240729.0/shim/000077500000000000000000000000001465435605700176035ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/shim/cli/000077500000000000000000000000001465435605700203525ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/shim/cli/cli.go000066400000000000000000000016461465435605700214570ustar00rootroot00000000000000// Copyright 2018 The containerd Authors. // Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package cli defines the command line interface for the V2 shim. package cli import ( containerdshim "github.com/containerd/containerd/runtime/v2/shim" "gvisor.dev/gvisor/pkg/shim" ) // Main is the main entrypoint. 
func Main() { containerdshim.Run("io.containerd.runsc.v1", shim.New) } golang-gvisor-gvisor-0.0~20240729.0/shim/cli/cli_state_autogen.go000066400000000000000000000000651465435605700243730ustar00rootroot00000000000000// automatically generated by stateify. package cli golang-gvisor-gvisor-0.0~20240729.0/shim/main.go000066400000000000000000000014111465435605700210530ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Binary containerd-shim-runsc-v1 is the v2 containerd shim (implementing the formal v1 API). package main import ( "gvisor.dev/gvisor/shim/cli" ) func main() { cli.Main() } golang-gvisor-gvisor-0.0~20240729.0/tools/000077500000000000000000000000001465435605700200035ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/checklocks/000077500000000000000000000000001465435605700221145ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/checklocks/analysis.go000066400000000000000000000703061465435605700242740ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package checklocks import ( "go/token" "go/types" "strings" "golang.org/x/tools/go/ssa" ) func gcd(a, b atomicAlignment) atomicAlignment { for b != 0 { a, b = b, a%b } return a } // typeAlignment returns the type alignment for the given type. func (pc *passContext) typeAlignment(pkg *types.Package, obj types.Object) atomicAlignment { requiredOffset := atomicAlignment(1) if pc.pass.ImportObjectFact(obj, &requiredOffset) { return requiredOffset } switch x := obj.Type().Underlying().(type) { case *types.Struct: fields := make([]*types.Var, x.NumFields()) for i := 0; i < x.NumFields(); i++ { fields[i] = x.Field(i) } offsets := pc.pass.TypesSizes.Offsetsof(fields) for i := 0; i < x.NumFields(); i++ { // Check the offset, and then assuming that this offset // aligns with the offset for the broader type. fieldRequired := pc.typeAlignment(pkg, fields[i]) if offsets[i]%int64(fieldRequired) != 0 { // The offset of this field is not compatible. pc.maybeFail(fields[i].Pos(), "have alignment %d, need %d", offsets[i], fieldRequired) } // Ensure the requiredOffset is the LCM of the offset. requiredOffset *= fieldRequired / gcd(requiredOffset, fieldRequired) } case *types.Array: // Export direct alignment requirements. 
if named, ok := x.Elem().(*types.Named); ok && !hasTypeParams(named) { requiredOffset = pc.typeAlignment(pkg, named.Obj()) } default: // Use the compiler's underlying alignment. requiredOffset = atomicAlignment(pc.pass.TypesSizes.Alignof(obj.Type().Underlying())) } if pkg == obj.Pkg() { // Cache as an object fact, to subsequent calls. Note that we // can only export object facts for the package that we are // currently analyzing. There may be no exported facts for // array types or alias types, for example. pc.pass.ExportObjectFact(obj, &requiredOffset) } return requiredOffset } // hasTypeParams returns true iff the named type has type parameters. func hasTypeParams(typ *types.Named) bool { return typ.TypeParams() != nil && typ.TypeParams().Len() > 0 } // checkTypeAlignment checks the alignment of the given type. // // This calls typeAlignment, which resolves all types recursively. This method // should be called for all types individual to ensure full coverage. func (pc *passContext) checkTypeAlignment(pkg *types.Package, typ *types.Named) { if !hasTypeParams(typ) { _ = pc.typeAlignment(pkg, typ.Obj()) } } // atomicRules specify read constraints. type atomicRules int const ( nonAtomic atomicRules = iota readWriteAtomic readOnlyAtomic mixedAtomic ) // checkAtomicCall checks for an atomic access. // // inst is the instruction analyzed, obj is used only for maybeFail. func (pc *passContext) checkAtomicCall(inst ssa.Instruction, obj types.Object, ar atomicRules) { switch x := inst.(type) { case *ssa.Call: if x.Common().IsInvoke() { if ar != nonAtomic { // This is an illegal interface dispatch. pc.maybeFail(inst.Pos(), "dynamic dispatch with atomic-only field") } return } fn, ok := x.Common().Value.(*ssa.Function) if !ok { if ar != nonAtomic { // This is an illegal call to a non-static function. pc.maybeFail(inst.Pos(), "dispatch to non-static function with atomic-only field") } return } pkg := fn.Package() if pkg == nil { if ar != nonAtomic { // This is a call to some shared wrapper function. pc.maybeFail(inst.Pos(), "dispatch to shared function or wrapper") } return } var lff lockFunctionFacts // Check for exemption. if obj := fn.Object(); obj != nil && pc.pass.ImportObjectFact(obj, &lff) && lff.Ignore { return } if name := pkg.Pkg.Name(); name != "atomic" && name != "atomicbitops" { if ar != nonAtomic { // This is an illegal call to a non-atomic package function. pc.maybeFail(inst.Pos(), "dispatch to non-atomic function with atomic-only field") } return } if fn.Signature.Recv() != nil { // always allow calls to methods of atomic wrappers such as atomic.Int32 introduced in Go 1.19 return } if ar == nonAtomic { // We are *not* expecting an atomic dispatch. if _, ok := pc.forced[pc.positionKey(inst.Pos())]; !ok { pc.maybeFail(inst.Pos(), "unexpected call to atomic function") } } if !strings.HasPrefix(fn.Name(), "Load") && ar == readOnlyAtomic { // We are not allowing any reads in this context. if _, ok := pc.forced[pc.positionKey(inst.Pos())]; !ok { pc.maybeFail(inst.Pos(), "unexpected call to atomic write function, is a lock missing?") } return } return // Don't hit common case. case *ssa.ChangeType: // Allow casts for atomic values, but nothing else. if refs := x.Referrers(); refs != nil && len(*refs) == 1 { pc.checkAtomicCall((*refs)[0], obj, ar) return } case *ssa.UnOp: if x.Op == token.MUL && ar == mixedAtomic { // This is allowed; this is a strict reading. return } } if ar != nonAtomic { // This is something else entirely. 
if _, ok := pc.forced[pc.positionKey(inst.Pos())]; !ok { pc.maybeFail(inst.Pos(), "illegal use of atomic-only field by %T instruction", inst) } return } } func resolveStruct(typ types.Type) (*types.Struct, bool) { structType, ok := typ.Underlying().(*types.Struct) if ok { return structType, true } ptrType, ok := typ.Underlying().(*types.Pointer) if ok { return resolveStruct(ptrType.Elem()) } return nil, false } func findField(typ types.Type, field int) (types.Object, bool) { structType, ok := resolveStruct(typ) if !ok || field >= structType.NumFields() { return nil, false } return structType.Field(field), true } // almostInst is a generalization over ssa.Field, ssa.FieldAddr, ssa.Global. type almostInst interface { Pos() token.Pos Referrers() *[]ssa.Instruction } // checkGuards checks the guards held. // // This also enforces atomicity constraints for fields that must be accessed // atomically. The parameter isWrite indicates whether this field is used // downstream for a write operation. // // Note that this function is not called if lff.Ignore is true, since it cannot // discover any local anonymous functions or closures. func (pc *passContext) checkGuards(inst almostInst, from ssa.Value, accessObj types.Object, ls *lockState, isWrite bool) { var ( lgf lockGuardFacts guardsFound int guardsHeld = make(map[string]struct{}) // Keyed by resolved string. ) // Load the facts for the object accessed. pc.pass.ImportObjectFact(accessObj, &lgf) // Check guards held. for guardName, fgr := range lgf.GuardedBy { guardsFound++ r := fgr.resolveField(pc, ls, from) if !r.valid() { // See above; this cannot be forced. pc.maybeFail(inst.Pos(), "field %s cannot be resolved", guardName) continue } s, ok := ls.isHeld(r, isWrite) if ok { guardsHeld[s] = struct{}{} continue } if _, ok := pc.forced[pc.positionKey(inst.Pos())]; ok { // Mark this as locked, since it has been forced. All // forces are treated as an exclusive lock. s, _ := ls.lockField(r, true /* exclusive */) guardsHeld[s] = struct{}{} continue } // Note that we may allow this if the disposition is atomic, // and we are allowing atomic reads only. This will fall into // the atomic disposition check below, which asserts that the // access is atomic. Further, len(guardsHeld) < guardsFound // will be true for this case, so we require it to be // read-only. if lgf.AtomicDisposition != atomicRequired { // There is no force key, no atomic access and no lock held. pc.maybeFail(inst.Pos(), "invalid field access, %s (%s) must be locked when accessing %s (locks: %s)", guardName, s, accessObj.Name(), ls.String()) } } // Check the atomic access for this field. switch lgf.AtomicDisposition { case atomicRequired: // Check that this is used safely as an input. ar := readWriteAtomic if guardsFound > 0 { if len(guardsHeld) < guardsFound { ar = readOnlyAtomic } else { ar = mixedAtomic } } if refs := inst.Referrers(); refs != nil { for _, otherInst := range *refs { pc.checkAtomicCall(otherInst, accessObj, ar) } } // Check that this is not otherwise written non-atomically, // even if we do hold all the locks. if isWrite { pc.maybeFail(inst.Pos(), "non-atomic write of field %s, writes must still be atomic with locks held (locks: %s)", accessObj.Name(), ls.String()) } case atomicDisallow: // If atomic analysis is not enabled, skip. if !enableAtomic { break } // Check that this is *not* used atomically. if refs := inst.Referrers(); refs != nil { for _, otherInst := range *refs { pc.checkAtomicCall(otherInst, accessObj, nonAtomic) } } } // Check inferred locks. 
if accessObj.Pkg() == pc.pass.Pkg { oo := pc.observationsFor(accessObj) oo.total++ for s, info := range ls.lockedMutexes { // Is this an object for which we have facts? If there // is no ability to name this object, then we don't // bother with any inference. We also ignore any self // references (e.g. accessing a mutex while you are // holding that exact mutex). if info.object == nil || accessObj == info.object { continue } // Has this already been held? if _, ok := guardsHeld[s]; ok { oo.counts[info.object]++ continue } // Is this a global? Record directly. if _, ok := from.(*ssa.Global); ok { oo.counts[info.object]++ continue } // Is the object a sibling to the accessObj? We need to // check all fields and see if they match. We accept // only siblings and globals for this recommendation. structType, ok := resolveStruct(from.Type()) if !ok { continue } for i := 0; i < structType.NumFields(); i++ { if fieldObj := structType.Field(i); fieldObj == info.object { // Add to the maybe list. oo.counts[info.object]++ } } } } } // checkFieldAccess checks the validity of a field access. func (pc *passContext) checkFieldAccess(inst almostInst, structObj ssa.Value, field int, ls *lockState, isWrite bool) { fieldObj, _ := findField(structObj.Type(), field) pc.checkGuards(inst, structObj, fieldObj, ls, isWrite) } // noReferrers wraps an instruction as an almostInst. type noReferrers struct { ssa.Instruction } // Referrers implements almostInst.Referrers. func (noReferrers) Referrers() *[]ssa.Instruction { return nil } // checkGlobalAccess checks the validity of a global access. func (pc *passContext) checkGlobalAccess(inst ssa.Instruction, g *ssa.Global, ls *lockState, isWrite bool) { pc.checkGuards(noReferrers{inst}, g, g.Object(), ls, isWrite) } func (pc *passContext) checkCall(call callCommon, lff *lockFunctionFacts, ls *lockState) { // See: https://godoc.org/golang.org/x/tools/go/ssa#CallCommon // // "invoke" mode: Method is non-nil, and Value is the underlying value. if fn := call.Common().Method; fn != nil { var nlff lockFunctionFacts pc.pass.ImportObjectFact(fn, &nlff) nlff.Ignore = nlff.Ignore || lff.Ignore // Inherit ignore. pc.checkFunctionCall(call, fn, &nlff, ls) return } // "call" mode: when Method is nil (!IsInvoke), a CallCommon represents an ordinary // function call of the value in Value, which may be a *Builtin, a *Function or any // other value of kind 'func'. // // Value may be one of: // (a) a *Function, indicating a statically dispatched call // to a package-level function, an anonymous function, or // a method of a named type. // // (b) a *MakeClosure, indicating an immediately applied // function literal with free variables. // // (c) a *Builtin, indicating a statically dispatched call // to a built-in function. // // (d) any other value, indicating a dynamically dispatched // function call. switch fn := call.Common().Value.(type) { case *ssa.Function: nlff := lockFunctionFacts{ Ignore: lff.Ignore, // Inherit ignore. } if obj := fn.Object(); obj != nil { pc.pass.ImportObjectFact(obj, &nlff) nlff.Ignore = nlff.Ignore || lff.Ignore // See above. pc.checkFunctionCall(call, obj.(*types.Func), &nlff, ls) } else { // Anonymous functions have no facts, and cannot be // annotated. We don't check for violations using the // function facts, since they cannot exist. Instead, we // do a fresh analysis using the current lock state. 
fnls := ls.fork() for i, arg := range call.Common().Args { fnls.store(fn.Params[i], arg) } pc.checkFunction(call, fn, &nlff, fnls, true /* force */) } case *ssa.MakeClosure: // Note that creating and then invoking closures locally is // allowed, but analysis of passing closures is done when // checking individual instructions. pc.checkClosure(call, fn, lff, ls) default: return } } // postFunctionCallUpdate updates all conditions. func (pc *passContext) postFunctionCallUpdate(call callCommon, lff *lockFunctionFacts, ls *lockState, aliases bool) { // Release all locks not still held. for fieldName, fg := range lff.HeldOnEntry { if _, ok := lff.HeldOnExit[fieldName]; ok { continue } if fg.IsAlias && !aliases { continue } r := fg.Resolver.resolveCall(pc, ls, call.Common().Args, call.Value()) if !r.valid() { // See above: this cannot be forced. pc.maybeFail(call.Pos(), "field %s cannot be resolved", fieldName) continue } if s, ok := ls.unlockField(r, fg.Exclusive); !ok && !lff.Ignore { if _, ok := pc.forced[pc.positionKey(call.Pos())]; !ok && !lff.Ignore { pc.maybeFail(call.Pos(), "attempt to release %s (%s), but not held (locks: %s)", fieldName, s, ls.String()) } } } // Update all held locks if acquired. for fieldName, fg := range lff.HeldOnExit { if _, ok := lff.HeldOnEntry[fieldName]; ok { continue } if fg.IsAlias && !aliases { continue } // Acquire the lock per the annotation. r := fg.Resolver.resolveCall(pc, ls, call.Common().Args, call.Value()) if s, ok := ls.lockField(r, fg.Exclusive); !ok && !lff.Ignore { if _, ok := pc.forced[pc.positionKey(call.Pos())]; !ok && !lff.Ignore { pc.maybeFail(call.Pos(), "attempt to acquire %s (%s), but already held (locks: %s)", fieldName, s, ls.String()) } } } } // exclusiveStr returns a string describing exclusive requirements. func exclusiveStr(exclusive bool) string { if exclusive { return "exclusively" } return "non-exclusively" } // checkFunctionCall checks preconditions for function calls, and tracks the // lock state by recording relevant calls to sync functions. Note that calls to // atomic functions are tracked by checkFieldAccess by looking directly at the // referrers (because ordering doesn't matter there, so we need not scan in // instruction order). func (pc *passContext) checkFunctionCall(call callCommon, fn *types.Func, lff *lockFunctionFacts, ls *lockState) { // Extract the "receiver" properly. var args []ssa.Value if call.Common().Method != nil { // This is an interface dispatch for sync.Locker. args = append([]ssa.Value{call.Common().Value}, call.Common().Args...) } else { // This matches the signature for the relevant // sync.Lock/sync.Unlock functions below. args = call.Common().Args } // Check all guards required are held. Note that this explicitly does // not include aliases, hence false being passed below. for fieldName, fg := range lff.HeldOnEntry { if fg.IsAlias { continue } r := fg.Resolver.resolveCall(pc, ls, args, call.Value()) if s, ok := ls.isHeld(r, fg.Exclusive); !ok { if _, ok := pc.forced[pc.positionKey(call.Pos())]; !ok && !lff.Ignore { pc.maybeFail(call.Pos(), "must hold %s %s (%s) to call %s, but not held (locks: %s)", fieldName, exclusiveStr(fg.Exclusive), s, fn.Name(), ls.String()) } else { // Force the lock to be acquired. ls.lockField(r, fg.Exclusive) } } } // Update all lock state accordingly. pc.postFunctionCallUpdate(call, lff, ls, false /* aliases */) // Check if it's a method dispatch for something in the sync package. 
// See: https://godoc.org/golang.org/x/tools/go/ssa#Function if (lockerRE.MatchString(fn.FullName()) || mutexRE.MatchString(fn.FullName())) && len(args) > 0 { rv := makeResolvedValue(args[0], nil) isExclusive := false switch fn.Name() { case "Lock", "NestedLock": isExclusive = true fallthrough case "RLock": if s, ok := ls.lockField(rv, isExclusive); !ok && !lff.Ignore { if _, ok := pc.forced[pc.positionKey(call.Pos())]; !ok { // Double locking a mutex that is already locked. pc.maybeFail(call.Pos(), "%s already locked (locks: %s)", s, ls.String()) } } case "Unlock", "NestedUnlock": isExclusive = true fallthrough case "RUnlock": if s, ok := ls.unlockField(rv, isExclusive); !ok && !lff.Ignore { if _, ok := pc.forced[pc.positionKey(call.Pos())]; !ok { // Unlocking something that is already unlocked. pc.maybeFail(call.Pos(), "%s already unlocked or locked differently (locks: %s)", s, ls.String()) } } case "DowngradeLock": if s, ok := ls.downgradeField(rv); !ok { if _, ok := pc.forced[pc.positionKey(call.Pos())]; !ok && !lff.Ignore { // Downgrading something that may not be downgraded. pc.maybeFail(call.Pos(), "%s already unlocked or not exclusive (locks: %s)", s, ls.String()) } } } } } // checkClosure forks the lock state, and creates a binding for the FreeVars of // the closure. This allows the analysis to resolve the closure. func (pc *passContext) checkClosure(call callCommon, fn *ssa.MakeClosure, lff *lockFunctionFacts, ls *lockState) { clls := ls.fork() clfn := fn.Fn.(*ssa.Function) for i, fv := range clfn.FreeVars { clls.store(fv, fn.Bindings[i]) } // Note that this is *not* a call to check function call, which checks // against the function preconditions. Instead, this does a fresh // analysis of the function from source code with a different state. nlff := lockFunctionFacts{ Ignore: lff.Ignore, // Inherit ignore. } pc.checkFunction(call, clfn, &nlff, clls, true /* force */) } // freshAlloc indicates that v has been allocated within the local scope. There // is no lock checking done on objects that are freshly allocated. func freshAlloc(v ssa.Value) bool { switch x := v.(type) { case *ssa.Alloc: return true case *ssa.FieldAddr: return freshAlloc(x.X) case *ssa.Field: return freshAlloc(x.X) case *ssa.IndexAddr: return freshAlloc(x.X) case *ssa.Index: return freshAlloc(x.X) case *ssa.Convert: return freshAlloc(x.X) case *ssa.ChangeType: return freshAlloc(x.X) default: return false } } // isWrite indicates that this value is used as the addr field in a store. // // Note that this may still be used for a write. The return here is optimistic // but sufficient for basic analysis. func isWrite(v ssa.Value) bool { refs := v.Referrers() if refs == nil { return false } for _, ref := range *refs { if s, ok := ref.(*ssa.Store); ok && s.Addr == v { return true } } return false } // callCommon is an ssa.Value that also implements Common. type callCommon interface { Pos() token.Pos Common() *ssa.CallCommon Value() *ssa.Call } // checkInstruction checks the legality of the single instruction based on the // current lockState. func (pc *passContext) checkInstruction(inst ssa.Instruction, lff *lockFunctionFacts, ls *lockState) (*ssa.Return, *lockState) { // Record any observed globals, and check for violations. The global // value is not itself an instruction, but we check all referrers to // see where they are consumed. 
var stackLocal [16]*ssa.Value ops := inst.Operands(stackLocal[:]) for _, v := range ops { if v == nil { continue } g, ok := (*v).(*ssa.Global) if !ok { continue } _, isWrite := inst.(*ssa.Store) pc.checkGlobalAccess(inst, g, ls, isWrite) } // Process the instruction. switch x := inst.(type) { case *ssa.Store: // Record that this value is holding this other value. This is // because at the beginning of each ssa execution, there is a // series of assignments of parameter values to alloc objects. // This allows us to trace these back to the original // parameters as aliases above. // // Note that this may overwrite an existing value in the lock // state, but this is intentional. ls.store(x.Addr, x.Val) case *ssa.Field: if !freshAlloc(x.X) && !lff.Ignore { pc.checkFieldAccess(x, x.X, x.Field, ls, false) } case *ssa.FieldAddr: if !freshAlloc(x.X) && !lff.Ignore { pc.checkFieldAccess(x, x.X, x.Field, ls, isWrite(x)) } case *ssa.Call: pc.checkCall(x, lff, ls) case *ssa.Defer: ls.pushDefer(x) case *ssa.RunDefers: for d := ls.popDefer(); d != nil; d = ls.popDefer() { pc.checkCall(d, lff, ls) } case *ssa.MakeClosure: if refs := x.Referrers(); refs != nil { var ( calls int nonCalls int ) for _, ref := range *refs { switch ref.(type) { case *ssa.Call, *ssa.Defer: // Analysis will be done on the call // itself subsequently, including the // lock state at the time of the call. calls++ default: // We need to analyze separately. Per // below, this means that we'll analyze // at closure construction time no zero // assumptions about when it will be // called. nonCalls++ } } if calls > 0 && nonCalls == 0 { return nil, nil } } // Analyze the closure without bindings. This means that we // assume no lock facts or have any existing lock state. Only // trivial closures are acceptable in this case. clfn := x.Fn.(*ssa.Function) nlff := lockFunctionFacts{ Ignore: lff.Ignore, // Inherit ignore. } pc.checkFunction(nil, clfn, &nlff, nil, false /* force */) case *ssa.Return: return x, ls // Valid return state. } return nil, nil } // checkBasicBlock traverses the control flow graph starting at a set of given // block and checks each instruction for allowed operations. func (pc *passContext) checkBasicBlock(fn *ssa.Function, block *ssa.BasicBlock, lff *lockFunctionFacts, parent *lockState, seen map[*ssa.BasicBlock]*lockState, rg map[*ssa.BasicBlock]struct{}) *lockState { // Check for cached results from entering this block from a *different* // execution path. Note that this is not the same path, which is // checked with the recursion guard below. if oldLS, ok := seen[block]; ok && oldLS.isCompatible(parent) { return nil } // Prevent recursion. If the lock state is constantly changing and we // are a recursive path, then there will never be a return block. if rg == nil { rg = make(map[*ssa.BasicBlock]struct{}) } if _, ok := rg[block]; ok { return nil } rg[block] = struct{}{} defer func() { delete(rg, block) }() // If the lock state is not compatible, then we need to do the // recursive analysis to ensure that it is still sane. For example, the // following is guaranteed to generate incompatible locking states: // // if foo { // mu.Lock() // } // other stuff ... // if foo { // mu.Unlock() // } var ( rv *ssa.Return rls *lockState ) // Analyze this block. seen[block] = parent ls := parent.fork() for _, inst := range block.Instrs { rv, rls = pc.checkInstruction(inst, lff, ls) if rls != nil { failed := false // Validate held locks. 
for fieldName, fg := range lff.HeldOnExit { r := fg.Resolver.resolveStatic(pc, ls, fn, rv) if !r.valid() { // This cannot be forced, since we have no reference. pc.maybeFail(rv.Pos(), "lock %s cannot be resolved", fieldName) continue } if s, ok := rls.isHeld(r, fg.Exclusive); !ok { if _, ok := pc.forced[pc.positionKey(rv.Pos())]; !ok && !lff.Ignore { pc.maybeFail(rv.Pos(), "lock %s (%s) not held %s (locks: %s)", fieldName, s, exclusiveStr(fg.Exclusive), rls.String()) failed = true } else { // Force the lock to be acquired. rls.lockField(r, fg.Exclusive) } } } // Check for other locks, but only if the above didn't trip. if !failed && rls.count() != len(lff.HeldOnExit) && !lff.Ignore { pc.maybeFail(rv.Pos(), "return with unexpected locks held (locks: %s)", rls.String()) } } } // Analyze all successors. for _, succ := range block.Succs { // Collect possible return values, and make sure that the lock // state aligns with any return value that we may have found // above. Note that checkBasicBlock will recursively analyze // the lock state to ensure that Releases and Acquires are // respected. if pls := pc.checkBasicBlock(fn, succ, lff, ls, seen, rg); pls != nil { if rls != nil && !rls.isCompatible(pls) { if _, ok := pc.forced[pc.positionKey(fn.Pos())]; !ok && !lff.Ignore { pc.maybeFail(fn.Pos(), "incompatible return states (first: %s, second: %s)", rls.String(), pls.String()) } } rls = pls } } return rls } // checkFunction checks a function invocation, typically starting with nil lockState. func (pc *passContext) checkFunction(call callCommon, fn *ssa.Function, lff *lockFunctionFacts, parent *lockState, force bool) { defer func() { // Mark this function as checked. This is used by the top-level // loop to ensure that all anonymous functions are scanned, if // they are not explicitly invoked here. Note that this can // happen if the anonymous functions are e.g. passed only as // parameters or used to initialize some structure. pc.functions[fn] = struct{}{} }() if _, ok := pc.functions[fn]; !force && ok { // This function has already been analyzed at least once. // That's all we permit for each function, although this may // cause some anonymous functions to be analyzed in only one // context. return } // If no return value is provided, then synthesize one. This is used // below only to check against the lock preconditions, which may // include return values. if call == nil { call = &ssa.Call{Call: ssa.CallCommon{Value: fn}} } // Initialize ls with any preconditions that require locks to be held // for the method to be invoked. Note that in the overwhelming majority // of cases, parent will be nil. However, in the case of closures and // anonymous functions, we may start with a non-nil lock state. // // Note that this will include all aliases, which are also released // appropriately below. ls := parent.fork() for fieldName, fg := range lff.HeldOnEntry { // The first is the method object itself so we skip that when looking // for receiver/function parameters. r := fg.Resolver.resolveStatic(pc, ls, fn, call.Value()) if !r.valid() { // See above: this cannot be forced. pc.maybeFail(fn.Pos(), "lock %s cannot be resolved", fieldName) continue } if s, ok := ls.lockField(r, fg.Exclusive); !ok && !lff.Ignore { // This can only happen if the same value is declared // multiple times, and should be caught by the earlier // fact scanning. Keep it here as a sanity check. 
pc.maybeFail(fn.Pos(), "lock %s (%s) acquired multiple times or differently (locks: %s)", fieldName, s, ls.String()) } } // Scan the blocks. seen := make(map[*ssa.BasicBlock]*lockState) if len(fn.Blocks) > 0 { pc.checkBasicBlock(fn, fn.Blocks[0], lff, ls, seen, nil) } // Scan the recover block. if fn.Recover != nil { pc.checkBasicBlock(fn, fn.Recover, lff, ls, seen, nil) } // Update all lock state accordingly. This will be called only if we // are doing inline analysis for e.g. an anonymous function. if call != nil && parent != nil { pc.postFunctionCallUpdate(call, lff, parent, true /* aliases */) } } // checkInferred checks for any inferred lock annotations. func (pc *passContext) checkInferred() { for obj, oo := range pc.observations { var lgf lockGuardFacts pc.pass.ImportObjectFact(obj, &lgf) for other, count := range oo.counts { // Is this already a guard? if _, ok := lgf.GuardedBy[other.Name()]; ok { continue } // Check to see if this field is used with a given lock // held above the threshold. If yes, provide a helpful // hint that this may something you wish to annotate. const threshold = 0.9 if usage := float64(count) / float64(oo.total); usage >= threshold { pc.maybeFail(obj.Pos(), "may require checklocks annotation for %s, used with lock held %2.0f%% of the time", other.Name(), usage*100) } } } } golang-gvisor-gvisor-0.0~20240729.0/tools/checklocks/annotations.go000066400000000000000000000102741465435605700250040ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package checklocks import ( "fmt" "go/token" "strconv" "strings" ) const ( checkLocksAnnotation = "// +checklocks:" checkLocksAnnotationRead = "// +checklocksread:" checkLocksAcquires = "// +checklocksacquire:" checkLocksAcquiresRead = "// +checklocksacquireread:" checkLocksReleases = "// +checklocksrelease:" checkLocksReleasesRead = "// +checklocksreleaseread:" checkLocksIgnore = "// +checklocksignore" checkLocksForce = "// +checklocksforce" checkLocksFail = "// +checklocksfail" checkLocksAlias = "// +checklocksalias:" checkAtomicAnnotation = "// +checkatomic" ) // failData indicates an expected failure. type failData struct { pos token.Pos count int seen int } // positionKey is a simple position string. type positionKey string // positionKey converts from a token.Pos to a key we can use to track failures // as the position of the failure annotation is not the same as the position of // the actual failure (different column/offsets). Hence we ignore these fields // and only use the file/line numbers to track failures. func (pc *passContext) positionKey(pos token.Pos) positionKey { position := pc.pass.Fset.Position(pos) return positionKey(fmt.Sprintf("%s:%d", position.Filename, position.Line)) } // addFailures adds an expected failure. 
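//
// The expected-failure annotation parsed here optionally carries a count
// suffix. For illustration (a sketch, not part of the original sources):
//
//	mu.Unlock() // +checklocksfail:2
//
// registers that two analysis failures are expected at that position; without
// the ":2" suffix a single failure is expected.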
func (pc *passContext) addFailures(pos token.Pos, s string) { count := 1 if len(s) > 0 && s[0] == ':' { parsedCount, err := strconv.Atoi(s[1:]) if err != nil { pc.pass.Reportf(pos, "unable to parse failure annotation %q: %v", s[1:], err) return } count = parsedCount } pc.failures[pc.positionKey(pos)] = &failData{ pos: pos, count: count, } } // addExemption adds an exemption. func (pc *passContext) addExemption(pos token.Pos) { pc.exemptions[pc.positionKey(pos)] = struct{}{} } // addForce adds a force annotation. func (pc *passContext) addForce(pos token.Pos) { pc.forced[pc.positionKey(pos)] = struct{}{} } // maybeFail checks a potential failure against a specific failure map. func (pc *passContext) maybeFail(pos token.Pos, fmtStr string, args ...any) { if fd, ok := pc.failures[pc.positionKey(pos)]; ok { fd.seen++ return } if _, ok := pc.exemptions[pc.positionKey(pos)]; ok { return // Ignored, not counted. } if !enableWrappers && !pos.IsValid() { return // Ignored, implicit. } pc.pass.Reportf(pos, fmtStr, args...) } // checkFailure checks for the expected failure counts. func (pc *passContext) checkFailures() { for _, fd := range pc.failures { if fd.count != fd.seen { // We are missing expect failures, report as much as possible. pc.pass.Reportf(fd.pos, "got %d failures, want %d failures", fd.seen, fd.count) } } } // extractAnnotations extracts annotations from text. func (pc *passContext) extractAnnotations(s string, fns map[string]func(p string)) { for prefix, fn := range fns { if strings.HasPrefix(s, prefix) { fn(s[len(prefix):]) } } } // extractLineFailures extracts all line-based exceptions. // // Note that this applies only to individual line exemptions, and does not // consider function-wide exemptions, or specific field exemptions, which are // extracted separately as part of the saved facts for those objects. func (pc *passContext) extractLineFailures() { for _, f := range pc.pass.Files { for _, cg := range f.Comments { for _, c := range cg.List { pc.extractAnnotations(c.Text, map[string]func(string){ checkLocksFail: func(p string) { pc.addFailures(c.Pos(), p) }, checkLocksIgnore: func(string) { pc.addExemption(c.Pos()) }, checkLocksForce: func(string) { pc.addForce(c.Pos()) }, }) } } } } golang-gvisor-gvisor-0.0~20240729.0/tools/checklocks/checklocks.go000066400000000000000000000142371465435605700245630ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package checklocks performs lock analysis to identify and flag unprotected // access to annotated fields. // // For detailed usage refer to README.md in the same directory. // // Note that this package uses the built-in atomics, in order to avoid the use // of our own atomic package. This is because our own atomic package depends on // our own sync package, which includes lock dependency analysis. This in turn // requires goid, which introduces a dependency cycle. To avoid this, we simply // use the simpler, built-in sync package. 
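//
// As a minimal illustration of the annotations this analyzer consumes (a
// sketch only; see README.md in this directory for the authoritative syntax),
// a guarded field and a method that requires the guard might look like:
//
//	type counter struct {
//		mu sync.Mutex
//		// +checklocks:mu
//		val int
//	}
//
//	// +checklocks:c.mu
//	func (c *counter) bump() {
//		c.val++
//	}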
// // +checkalignedignore package checklocks import ( "go/ast" "go/token" "go/types" "golang.org/x/tools/go/analysis" "golang.org/x/tools/go/analysis/passes/buildssa" "golang.org/x/tools/go/ssa" ) // Analyzer is the main entrypoint. var Analyzer = &analysis.Analyzer{ Name: "checklocks", Doc: "checks lock preconditions on functions and fields", Run: run, Requires: []*analysis.Analyzer{buildssa.Analyzer}, FactTypes: []analysis.Fact{ (*atomicAlignment)(nil), (*lockGuardFacts)(nil), (*lockFunctionFacts)(nil), }, } var ( enableInferred = true enableAtomic = true enableWrappers = true ) func init() { Analyzer.Flags.BoolVar(&enableInferred, "inferred", true, "enable inferred locks") Analyzer.Flags.BoolVar(&enableAtomic, "atomic", true, "enable atomic checks") Analyzer.Flags.BoolVar(&enableWrappers, "wrappers", true, "enable analysis of wrappers") } // objectObservations tracks lock correlations. type objectObservations struct { counts map[types.Object]int total int } // passContext is a pass with additional expected failures. type passContext struct { pass *analysis.Pass failures map[positionKey]*failData exemptions map[positionKey]struct{} forced map[positionKey]struct{} functions map[*ssa.Function]struct{} observations map[types.Object]*objectObservations } // observationsFor retrieves observations for the given object. func (pc *passContext) observationsFor(obj types.Object) *objectObservations { if pc.observations == nil { pc.observations = make(map[types.Object]*objectObservations) } oo, ok := pc.observations[obj] if !ok { oo = &objectObservations{ counts: make(map[types.Object]int), } pc.observations[obj] = oo } return oo } // forAllGlobals applies the given function to all globals. func (pc *passContext) forAllGlobals(fn func(ts *ast.ValueSpec)) { for _, f := range pc.pass.Files { for _, decl := range f.Decls { d, ok := decl.(*ast.GenDecl) if !ok || d.Tok != token.VAR { continue } for _, gs := range d.Specs { fn(gs.(*ast.ValueSpec)) } } } } // forAllTypes applies the given function over all types. func (pc *passContext) forAllTypes(fn func(ts *ast.TypeSpec)) { for _, f := range pc.pass.Files { for _, decl := range f.Decls { d, ok := decl.(*ast.GenDecl) if !ok || d.Tok != token.TYPE { continue } for _, gs := range d.Specs { fn(gs.(*ast.TypeSpec)) } } } } // forAllFunctions applies the given function over all functions. func (pc *passContext) forAllFunctions(fn func(fn *ast.FuncDecl)) { for _, f := range pc.pass.Files { for _, decl := range f.Decls { d, ok := decl.(*ast.FuncDecl) if !ok { continue } fn(d) } } } // run is the main entrypoint. func run(pass *analysis.Pass) (any, error) { pc := &passContext{ pass: pass, failures: make(map[positionKey]*failData), exemptions: make(map[positionKey]struct{}), forced: make(map[positionKey]struct{}), functions: make(map[*ssa.Function]struct{}), } // Find all line failure annotations. pc.extractLineFailures() // Find all struct declarations and export relevant facts. pc.forAllGlobals(func(vs *ast.ValueSpec) { if ss, ok := vs.Type.(*ast.StructType); ok { structType := pc.pass.TypesInfo.TypeOf(vs.Type).Underlying().(*types.Struct) pc.structLockGuardFacts(structType, ss) } pc.globalLockGuardFacts(vs) }) pc.forAllTypes(func(ts *ast.TypeSpec) { if ss, ok := ts.Type.(*ast.StructType); ok { structType := pc.pass.TypesInfo.TypeOf(ts.Name).Underlying().(*types.Struct) pc.structLockGuardFacts(structType, ss) } }) // Check all alignments. 
pc.forAllTypes(func(ts *ast.TypeSpec) { typ, ok := pass.TypesInfo.TypeOf(ts.Name).(*types.Named) if !ok { return } pc.checkTypeAlignment(pass.Pkg, typ) }) // Find all function declarations and export relevant facts. pc.forAllFunctions(func(fn *ast.FuncDecl) { pc.functionFacts(fn) }) // Scan all code looking for invalid accesses. state := pass.ResultOf[buildssa.Analyzer].(*buildssa.SSA) for _, fn := range state.SrcFuncs { // Import function facts generated above. // // Note that anonymous(closures) functions do not have an // object but do show up in the SSA. They can only be invoked // by named functions in the package, and they are analyzing // inline on every call. Thus we skip the analysis here. They // will be hit on calls, or picked up in the pass below. if obj := fn.Object(); obj == nil { continue } var lff lockFunctionFacts pc.pass.ImportObjectFact(fn.Object(), &lff) // Check the basic blocks in the function. pc.checkFunction(nil, fn, &lff, nil, false /* force */) } for _, fn := range state.SrcFuncs { // Ensure all anonymous functions are hit. They are not // permitted to have any lock preconditions. if obj := fn.Object(); obj != nil { continue } var nolff lockFunctionFacts pc.checkFunction(nil, fn, &nolff, nil, false /* force */) } // Check for inferred checklocks annotations. if enableInferred { pc.checkInferred() } // Check for expected failures. pc.checkFailures() return nil, nil } golang-gvisor-gvisor-0.0~20240729.0/tools/checklocks/checklocks_state_autogen.go000066400000000000000000000000741465435605700274770ustar00rootroot00000000000000// automatically generated by stateify. package checklocks golang-gvisor-gvisor-0.0~20240729.0/tools/checklocks/cmd/000077500000000000000000000000001465435605700226575ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/checklocks/cmd/checklocks/000077500000000000000000000000001465435605700247705ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/checklocks/cmd/checklocks/main.go000066400000000000000000000014571465435605700262520ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Binary checklocks is a `vettool` for `go vet`. package main import ( "golang.org/x/tools/go/analysis/singlechecker" "gvisor.dev/gvisor/tools/checklocks" ) func main() { singlechecker.Main(checklocks.Analyzer) } golang-gvisor-gvisor-0.0~20240729.0/tools/checklocks/facts.go000066400000000000000000000666551465435605700235650ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package checklocks import ( "encoding/gob" "fmt" "go/ast" "go/token" "go/types" "regexp" "strings" "golang.org/x/tools/go/analysis/passes/buildssa" "golang.org/x/tools/go/ssa" ) // atomicAlignment is saved per type. // // This represents the alignment required for the type, which may // be implied and imposed by other types within the aggregate type. type atomicAlignment int // AFact implements analysis.Fact.AFact. func (*atomicAlignment) AFact() {} // atomicDisposition is saved per field. // // This represents how the field must be accessed. It must either // be non-atomic (default), atomic or ignored. type atomicDisposition int const ( atomicDisallow atomicDisposition = iota atomicIgnore atomicRequired ) // fieldEntry is a single field type. type fieldEntry interface { // synthesize produces a string that is compatible with valueAndObject, // along with the same object that should be produced in that case. // // Note that it is called synthesize because this is produced only the // type information, and not with any ssa.Value objects. synthesize(s string, typ types.Type) (string, types.Object) } // fieldStruct is a non-pointer struct element. type fieldStruct int // synthesize implements fieldEntry.synthesize. func (f fieldStruct) synthesize(s string, typ types.Type) (string, types.Object) { field, ok := findField(typ, int(f)) if !ok { // Should not happen as long as fieldList construction is correct. panic(fmt.Sprintf("unable to resolve field %d in %s", int(f), typ.String())) } return fmt.Sprintf("&(%s.%s)", s, field.Name()), field } // fieldStructPtr is a pointer struct element. type fieldStructPtr int // synthesize implements fieldEntry.synthesize. func (f fieldStructPtr) synthesize(s string, typ types.Type) (string, types.Object) { field, ok := findField(typ, int(f)) if !ok { // See above, this should not happen. panic(fmt.Sprintf("unable to resolve ptr field %d in %s", int(f), typ.String())) } return fmt.Sprintf("*(&(%s.%s))", s, field.Name()), field } // fieldList is a simple list of fields, used in two types below. type fieldList []fieldEntry // resolvedValue is an ssa.Value with additional fields. // // This can be resolved to a string as part of a lock state. type resolvedValue struct { value ssa.Value fieldList fieldList } // makeResolvedValue makes a new resolvedValue. func makeResolvedValue(v ssa.Value, fl fieldList) resolvedValue { return resolvedValue{ value: v, fieldList: fl, } } // valid indicates whether this is a valid resolvedValue. func (rv *resolvedValue) valid() bool { return rv.value != nil } // valueAndObject returns a string and object. // // This uses the lockState valueAndObject in order to produce a string and // object for the base ssa.Value, then synthesizes a string representation // based on the fieldList. func (rv *resolvedValue) valueAndObject(ls *lockState) (string, types.Object) { // N.B. obj.Type() and typ should be equal, but a check is omitted // since, 1) we automatically chase through pointers during field // resolution, and 2) obj may be nil if there is no source object. s, obj := ls.valueAndObject(rv.value) typ := rv.value.Type() for _, entry := range rv.fieldList { s, obj = entry.synthesize(s, typ) typ = obj.Type() } return s, obj } // fieldGuardResolver details a guard for a field. type fieldGuardResolver interface { // resolveField is used to resolve a guard during a field access. 
The // parent structure is available, as well as the current lock state. resolveField(pc *passContext, ls *lockState, parent ssa.Value) resolvedValue } // functionGuardResolver details a guard for a function. type functionGuardResolver interface { // resolveStatic is used to resolve a guard during static analysis, // e.g. based on static annotations applied to a method. The function's // ssa object is available, as well as the return value. resolveStatic(pc *passContext, ls *lockState, fn *ssa.Function, rv any) resolvedValue // resolveCall is used to resolve a guard during a call. The ssa // return value is available from the instruction context where the // call occurs, but the target's ssa representation is not available. resolveCall(pc *passContext, ls *lockState, args []ssa.Value, rv ssa.Value) resolvedValue } // lockGuardFacts contains guard information. type lockGuardFacts struct { // GuardedBy is the set of locks that are guarding this field. The key // is the original annotation value, and the field list is the object // traversal path. GuardedBy map[string]fieldGuardResolver // AtomicDisposition is the disposition for this field. Note that this // can affect the interpretation of the GuardedBy field above, see the // relevant comment. AtomicDisposition atomicDisposition } // AFact implements analysis.Fact.AFact. func (*lockGuardFacts) AFact() {} // globalGuard is a global value. type globalGuard struct { // ObjectName indicates the object from which resolution should occur. ObjectName string // PackageName is the package where the object lives. PackageName string // FieldList is the traversal path from object. FieldList fieldList } // ssaPackager returns the ssa package. type ssaPackager interface { Package() *ssa.Package } // resolveCommon implements resolution for all cases. func (g *globalGuard) resolveCommon(pc *passContext, ls *lockState) resolvedValue { state := pc.pass.ResultOf[buildssa.Analyzer].(*buildssa.SSA) pkg := state.Pkg if g.PackageName != "" && g.PackageName != state.Pkg.Pkg.Path() { pkg = state.Pkg.Prog.ImportedPackage(g.PackageName) } v := pkg.Members[g.ObjectName].(ssa.Value) return makeResolvedValue(v, g.FieldList) } // resolveStatic implements functionGuardResolver.resolveStatic. func (g *globalGuard) resolveStatic(pc *passContext, ls *lockState, _ *ssa.Function, v any) resolvedValue { return g.resolveCommon(pc, ls) } // resolveCall implements functionGuardResolver.resolveCall. func (g *globalGuard) resolveCall(pc *passContext, ls *lockState, _ []ssa.Value, v ssa.Value) resolvedValue { return g.resolveCommon(pc, ls) } // resolveField implements fieldGuardResolver.resolveField. func (g *globalGuard) resolveField(pc *passContext, ls *lockState, parent ssa.Value) resolvedValue { return g.resolveCommon(pc, ls) } // fieldGuard is a field-based guard. type fieldGuard struct { // FieldList is the traversal path from the parent. FieldList fieldList } // resolveField implements fieldGuardResolver.resolveField. func (f *fieldGuard) resolveField(_ *passContext, _ *lockState, parent ssa.Value) resolvedValue { return makeResolvedValue(parent, f.FieldList) } // parameterGuard is a parameter-based guard. type parameterGuard struct { // Index is the parameter index of the object that contains the // guarding mutex. Index int // fieldList is the traversal path from the parameter. FieldList fieldList } // resolveStatic implements functionGuardResolver.resolveStatic. 
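//
// For a parameter guard, static resolution is simply the function's own
// parameter at Index (with the receiver, when present, counting as the
// first parameter), followed by the recorded field traversal. For example
// (illustrative, mirroring the lockFunctionFacts comment below), the
// annotation "+checklocks:a.mu" on "func xyz(a *A)" resolves to
// fn.Params[0] and then the field "mu".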
func (p *parameterGuard) resolveStatic(_ *passContext, _ *lockState, fn *ssa.Function, _ any) resolvedValue { return makeResolvedValue(fn.Params[p.Index], p.FieldList) } // resolveCall implements functionGuardResolver.resolveCall. func (p *parameterGuard) resolveCall(_ *passContext, _ *lockState, args []ssa.Value, _ ssa.Value) resolvedValue { return makeResolvedValue(args[p.Index], p.FieldList) } // returnGuard is a return-based guard. type returnGuard struct { // Index is the index of the return value. Index int // NeedsExtract is used in the case of a return value, and indicates // that the field must be extracted from a tuple. NeedsExtract bool // FieldList is the traversal path from the return value. FieldList fieldList } // resolveCommon implements resolution for both cases. func (r *returnGuard) resolveCommon(rv any) resolvedValue { if rv == nil { // For defers and other objects, this may be nil. This is // handled in state.go in the actual lock checking logic. This // means that there is no resolvedValue available. return resolvedValue{} } // If this is a *ssa.Return object, i.e. we are analyzing the function // and not the call site, then we can just pull the result directly. if ret, ok := rv.(*ssa.Return); ok { return makeResolvedValue(ret.Results[r.Index], r.FieldList) } if r.NeedsExtract { // Resolve on the extracted field, this is necessary if the // type here is not an explicit return. Note that rv must be an // ssa.Value, since it is not an *ssa.Return. v := rv.(ssa.Value) if refs := v.Referrers(); refs != nil { for _, inst := range *refs { if x, ok := inst.(*ssa.Extract); ok && x.Tuple == v && x.Index == r.Index { return makeResolvedValue(x, r.FieldList) } } } // Nothing resolved. return resolvedValue{} } if r.Index != 0 { // This should not happen, NeedsExtract should always be set. panic("NeedsExtract is false, but return value index is non-zero") } // Resolve on the single return. return makeResolvedValue(rv.(ssa.Value), r.FieldList) } // resolveStatic implements functionGuardResolver.resolveStatic. func (r *returnGuard) resolveStatic(_ *passContext, _ *lockState, _ *ssa.Function, rv any) resolvedValue { return r.resolveCommon(rv) } // resolveCall implements functionGuardResolver.resolveCall. func (r *returnGuard) resolveCall(_ *passContext, _ *lockState, _ []ssa.Value, rv ssa.Value) resolvedValue { return r.resolveCommon(rv) } // functionGuardInfo is information about a method guard. type functionGuardInfo struct { // Resolver is the resolver for this guard. Resolver functionGuardResolver // IsAlias indicates that this guard is an alias. IsAlias bool // Exclusive indicates an exclusive lock is required. Exclusive bool } // lockFunctionFacts apply to every method. type lockFunctionFacts struct { // HeldOnEntry tracks the names and number of parameter (including receiver) // fields that guard calls to this function. // // The key is the name specified in the checklocks annotation. e.g. given // the following code: // // ``` // type A struct { // mu sync.Mutex // a int // } // // // +checklocks:a.mu // func xyz(a *A) {..} // ``` // // '+checklocks:a.mu' will result in an entry in this map as shown below. // HeldOnEntry: {"a.mu" => {Resolver: &parameterGuard{Index: 0}}} HeldOnEntry map[string]functionGuardInfo // HeldOnExit tracks the locks that are expected to be held on exit. HeldOnExit map[string]functionGuardInfo // Ignore means this function has local analysis ignores. // // This is not used outside the local package.
Ignore bool } // AFact implements analysis.Fact.AFact. func (*lockFunctionFacts) AFact() {} // checkGuard validates the guardName. func (lff *lockFunctionFacts) checkGuard(pc *passContext, d *ast.FuncDecl, guardName string, exclusive bool, allowReturn bool) (functionGuardInfo, bool) { if _, ok := lff.HeldOnEntry[guardName]; ok { pc.maybeFail(d.Pos(), "annotation %s specified more than once, already required", guardName) return functionGuardInfo{}, false } if _, ok := lff.HeldOnExit[guardName]; ok { pc.maybeFail(d.Pos(), "annotation %s specified more than once, already acquired", guardName) return functionGuardInfo{}, false } fg, ok := pc.findFunctionGuard(d, guardName, exclusive, allowReturn) return fg, ok } // addGuardedBy adds a field to both HeldOnEntry and HeldOnExit. func (lff *lockFunctionFacts) addGuardedBy(pc *passContext, d *ast.FuncDecl, guardName string, exclusive bool) { if fg, ok := lff.checkGuard(pc, d, guardName, exclusive, false /* allowReturn */); ok { if lff.HeldOnEntry == nil { lff.HeldOnEntry = make(map[string]functionGuardInfo) } if lff.HeldOnExit == nil { lff.HeldOnExit = make(map[string]functionGuardInfo) } lff.HeldOnEntry[guardName] = fg lff.HeldOnExit[guardName] = fg } } // addAcquires adds a field to HeldOnExit. func (lff *lockFunctionFacts) addAcquires(pc *passContext, d *ast.FuncDecl, guardName string, exclusive bool) { if fg, ok := lff.checkGuard(pc, d, guardName, exclusive, true /* allowReturn */); ok { if lff.HeldOnExit == nil { lff.HeldOnExit = make(map[string]functionGuardInfo) } lff.HeldOnExit[guardName] = fg } } // addReleases adds a field to HeldOnEntry. func (lff *lockFunctionFacts) addReleases(pc *passContext, d *ast.FuncDecl, guardName string, exclusive bool) { if fg, ok := lff.checkGuard(pc, d, guardName, exclusive, false /* allowReturn */); ok { if lff.HeldOnEntry == nil { lff.HeldOnEntry = make(map[string]functionGuardInfo) } lff.HeldOnEntry[guardName] = fg } } // addAlias adds an alias. func (lff *lockFunctionFacts) addAlias(pc *passContext, d *ast.FuncDecl, guardName string) { // Parse the alias. parts := strings.Split(guardName, "=") if len(parts) != 2 { pc.maybeFail(d.Pos(), "invalid annotation %s for alias", guardName) return } // Parse the actual guard. fg, ok := lff.checkGuard(pc, d, parts[0], true /* exclusive */, true /* allowReturn */) if !ok { return } fg.IsAlias = true // Find the existing specification. _, entryOk := lff.HeldOnEntry[parts[1]] if entryOk { lff.HeldOnEntry[guardName] = fg } _, exitOk := lff.HeldOnExit[parts[1]] if exitOk { lff.HeldOnExit[guardName] = fg } if !entryOk && !exitOk { pc.maybeFail(d.Pos(), "alias annotation %s does not refer to an existing guard", guardName) } } // fieldEntryFor returns the fieldList value for the given object. func (pc *passContext) fieldEntryFor(fieldObj types.Object, index int) fieldEntry { // Return the resolution path. if _, ok := fieldObj.Type().Underlying().(*types.Pointer); ok { return fieldStructPtr(index) } if _, ok := fieldObj.Type().Underlying().(*types.Interface); ok { return fieldStructPtr(index) } return fieldStruct(index) } // findField resolves a field in a single struct. func (pc *passContext) findField(structType *types.Struct, fieldName string) (fl fieldList, fieldObj types.Object, ok bool) { // Scan to match the next field. for i := 0; i < structType.NumFields(); i++ { fieldObj := structType.Field(i) if fieldObj.Name() != fieldName { continue } fl = append(fl, pc.fieldEntryFor(fieldObj, i)) return fl, fieldObj, true } // Is this an embed? 
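// If the field was not declared directly on this struct, walk the embedded
// struct fields and splice their traversal path in front of the nested
// result. For example (illustrative), given "type B struct { A }" where A
// declares "mu sync.Mutex", looking up "mu" on B yields the path through A
// and then mu.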
for i := 0; i < structType.NumFields(); i++ { fieldObj := structType.Field(i) if !fieldObj.Embedded() { continue } // Is this an embedded struct? structType, ok := resolveStruct(fieldObj.Type()) if !ok { continue } // Need to check that there is a resolution path. If there is // no resolution path that's not a failure: we just continue // scanning the next embed to find a match. flEmbed := pc.fieldEntryFor(fieldObj, i) flNext, fieldObjNext, ok := pc.findField(structType, fieldName) if !ok { continue } // Found an embedded chain. fl = append(fl, flEmbed) fl = append(fl, flNext...) return fl, fieldObjNext, true } return nil, nil, false } var ( mutexRE = regexp.MustCompile(".*Mutex") rwMutexRE = regexp.MustCompile(".*RWMutex") lockerRE = regexp.MustCompile(".*sync.Locker") ) // validateMutex validates the mutex type. // // This function returns true iff the object is a valid mutex with an error // reported at the given position if necessary. func (pc *passContext) validateMutex(pos token.Pos, obj types.Object, exclusive bool) bool { // Check that it is indeed a mutex. s := obj.Type().String() switch { case rwMutexRE.MatchString(s): // Safe for all cases. return true case mutexRE.MatchString(s), lockerRE.MatchString(s): // Safe for exclusive cases. if !exclusive { pc.maybeFail(pos, "field %s must be a RWMutex", obj.Name()) return false } return true default: // Not a mutex at all? pc.maybeFail(pos, "field %s is not a Mutex or an RWMutex", obj.Name()) return false } } // findFieldList resolves a set of fields given a string, such a 'a.b.c'. // // Note that parts must be non-zero in length. If it may be zero, then // maybeFindFieldList should be used instead with an appropriate object. func (pc *passContext) findFieldList(pos token.Pos, structType *types.Struct, parts []string, exclusive bool) (fl fieldList, ok bool) { var obj types.Object // This loop requires at least one iteration in order to ensure that // obj above is non-nil, and the type can be validated. for i, fieldName := range parts { flOne, fieldObj, ok := pc.findField(structType, fieldName) if !ok { return nil, false } fl = append(fl, flOne...) obj = fieldObj if i < len(parts)-1 { structType, ok = resolveStruct(obj.Type()) if !ok { // N.B. This is associated with the original position. pc.maybeFail(pos, "field %s expected to be struct", fieldName) return nil, false } } } // Validate the final field. This reports the field to the caller // anyways, since the error will be reported only once. _ = pc.validateMutex(pos, obj, exclusive) return fl, true } // maybeFindFieldList resolves the given object. // // Parts may be the empty list, unlike findFieldList. func (pc *passContext) maybeFindFieldList(pos token.Pos, obj types.Object, parts []string, exclusive bool) (fl fieldList, ok bool) { if len(parts) > 0 { structType, ok := resolveStruct(obj.Type()) if !ok { // This does not have any fields; the access is not allowed. pc.maybeFail(pos, "attempted field access on non-struct") return nil, false } return pc.findFieldList(pos, structType, parts, exclusive) } // See above. _ = pc.validateMutex(pos, obj, exclusive) return nil, true } // findFieldGuardResolver finds a symbol resolver. type findFieldGuardResolver func(pos token.Pos, guardName string) (fieldGuardResolver, bool) // findFunctionGuardResolver finds a symbol resolver. type findFunctionGuardResolver func(pos token.Pos, guardName string) (functionGuardResolver, bool) // fillLockGuardFacts fills the facts with guard information. 
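//
// It scans the attached comment group for the checkAtomicAnnotation,
// checkLocksIgnore and checkLocksAnnotation markers, recording the atomic
// disposition and any guards. As a sketch (reusing the annotation form
// shown for functions in this file), a struct field declared as
//
//	// +checklocks:mu
//	count int
//
// gains a GuardedBy entry for "mu", resolved through the supplied find
// callback; for struct fields, structLockGuardFacts tries the local struct
// first and then falls back to global resolution.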
func (pc *passContext) fillLockGuardFacts(obj types.Object, cg *ast.CommentGroup, find findFieldGuardResolver, lgf *lockGuardFacts) { if cg == nil { return } for _, l := range cg.List { pc.extractAnnotations(l.Text, map[string]func(string){ checkAtomicAnnotation: func(string) { switch lgf.AtomicDisposition { case atomicRequired: pc.maybeFail(obj.Pos(), "annotation is redundant, already atomic required") case atomicIgnore: pc.maybeFail(obj.Pos(), "annotation is contradictory, already atomic ignored") } lgf.AtomicDisposition = atomicRequired }, checkLocksIgnore: func(string) { switch lgf.AtomicDisposition { case atomicIgnore: pc.maybeFail(obj.Pos(), "annotation is redundant, already atomic ignored") case atomicRequired: pc.maybeFail(obj.Pos(), "annotation is contradictory, already atomic required") } lgf.AtomicDisposition = atomicIgnore }, checkLocksAnnotation: func(guardName string) { // Check for a duplicate annotation. if _, ok := lgf.GuardedBy[guardName]; ok { pc.maybeFail(obj.Pos(), "annotation %s specified more than once", guardName) return } // Add the item. if lgf.GuardedBy == nil { lgf.GuardedBy = make(map[string]fieldGuardResolver) } fr, ok := find(obj.Pos(), guardName) if !ok { pc.maybeFail(obj.Pos(), "annotation %s cannot be resolved", guardName) return } lgf.GuardedBy[guardName] = fr }, // N.B. We support only the vanilla annotation on // individual fields. If the field is a read lock, then // we will allow read access by default. checkLocksAnnotationRead: func(guardName string) { pc.maybeFail(obj.Pos(), "annotation %s not legal on fields", guardName) }, }) } // Save only if there is something meaningful. if len(lgf.GuardedBy) > 0 || lgf.AtomicDisposition != atomicDisallow { pc.pass.ExportObjectFact(obj, lgf) } } // findGlobalGuard attempts to resolve a name globally. func (pc *passContext) findGlobalGuard(pos token.Pos, guardName string) (*globalGuard, bool) { // Attempt to resolve the object. parts := strings.Split(guardName, ".") globalObj := pc.pass.Pkg.Scope().Lookup(parts[0]) if globalObj == nil { // No global object. return nil, false } fl, ok := pc.maybeFindFieldList(pos, globalObj, parts[1:], true /* exclusive */) if !ok { // Invalid fields. return nil, false } return &globalGuard{ ObjectName: parts[0], PackageName: pc.pass.Pkg.Path(), FieldList: fl, }, true } // findGlobalFieldGuard is compatible with findFieldGuardResolver. func (pc *passContext) findGlobalFieldGuard(pos token.Pos, guardName string) (fieldGuardResolver, bool) { g, ok := pc.findGlobalGuard(pos, guardName) return g, ok } // findGlobalFunctionGuard is compatible with findFunctionGuardResolver. func (pc *passContext) findGlobalFunctionGuard(pos token.Pos, guardName string) (functionGuardResolver, bool) { g, ok := pc.findGlobalGuard(pos, guardName) return g, ok } // structLockGuardFacts finds all relevant guard information for structures. func (pc *passContext) structLockGuardFacts(structType *types.Struct, ss *ast.StructType) { var fieldObj *types.Var findLocal := func(pos token.Pos, guardName string) (fieldGuardResolver, bool) { // Try to resolve from the local structure first. fl, ok := pc.findFieldList(pos, structType, strings.Split(guardName, "."), true /* exclusive */) if ok { // Found a valid resolution. return &fieldGuard{ FieldList: fl, }, true } // Attempt a global resolution. return pc.findGlobalFieldGuard(pos, guardName) } for i, field := range ss.Fields.List { var lgf lockGuardFacts fieldObj = structType.Field(i) // N.B. Captured above. 
if field.Doc != nil { pc.fillLockGuardFacts(fieldObj, field.Doc, findLocal, &lgf) } else if field.Comment != nil { pc.fillLockGuardFacts(fieldObj, field.Comment, findLocal, &lgf) } // See above, for anonymous structure fields. if ss, ok := field.Type.(*ast.StructType); ok { if st, ok := fieldObj.Type().(*types.Struct); ok { pc.structLockGuardFacts(st, ss) } } } } // globalLockGuardFacts finds all relevant guard information for globals. // // Note that the Type is checked in checklocks.go at the top-level. func (pc *passContext) globalLockGuardFacts(vs *ast.ValueSpec) { var lgf lockGuardFacts globalObj := pc.pass.TypesInfo.ObjectOf(vs.Names[0]) pc.fillLockGuardFacts(globalObj, vs.Doc, pc.findGlobalFieldGuard, &lgf) } // countFields gives an accurate field count, according for unnamed arguments // and return values and the compact identifier format. func countFields(fl []*ast.Field) (count int) { for _, field := range fl { if len(field.Names) == 0 { count++ continue } count += len(field.Names) } return } // matchFieldList attempts to match the given field. // // This function may or may not report an error. This is indicated in the // reported return value. If reported is true, then the specification is // ambiguous or not valid, and should be propagated. func (pc *passContext) matchFieldList(pos token.Pos, fields []*ast.Field, guardName string, exclusive bool) (number int, fl fieldList, reported, ok bool) { parts := strings.Split(guardName, ".") firstName := parts[0] index := 0 for _, field := range fields { // See countFields, above. if len(field.Names) == 0 { index++ continue } for _, name := range field.Names { if name.Name != firstName { index++ continue } obj := pc.pass.TypesInfo.ObjectOf(name) fl, ok := pc.maybeFindFieldList(pos, obj, parts[1:], exclusive) if !ok { // Some intermediate name does not match. The // resolveField function will not report. pc.maybeFail(pos, "name %s does not resolve to a field", guardName) return 0, nil, true, false } // Successfully found a field. return index, fl, false, true } } // Nothing matching. return 0, nil, false, false } // findFunctionGuard identifies the parameter number and field number for a // particular string of the 'a.b'. // // This function will report any errors directly. func (pc *passContext) findFunctionGuard(d *ast.FuncDecl, guardName string, exclusive bool, allowReturn bool) (functionGuardInfo, bool) { // Match against receiver & parameters. var parameterList []*ast.Field if d.Recv != nil { parameterList = append(parameterList, d.Recv.List...) } if d.Type.Params != nil { parameterList = append(parameterList, d.Type.Params.List...) } if index, fl, reported, ok := pc.matchFieldList(d.Pos(), parameterList, guardName, exclusive); reported || ok { if !ok { return functionGuardInfo{}, false } return functionGuardInfo{ Resolver: ¶meterGuard{ Index: index, FieldList: fl, }, Exclusive: exclusive, }, true } // Match against return values, if allowed. if allowReturn { var returnList []*ast.Field if d.Type.Results != nil { returnList = append(returnList, d.Type.Results.List...) } if index, fl, reported, ok := pc.matchFieldList(d.Pos(), returnList, guardName, exclusive); reported || ok { if !ok { return functionGuardInfo{}, false } return functionGuardInfo{ Resolver: &returnGuard{ Index: index, FieldList: fl, NeedsExtract: countFields(returnList) > 1, }, Exclusive: exclusive, }, true } } // Match against globals. 
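// A guard that names neither a receiver, a parameter nor (when allowed) a
// return value may still refer to a package-level variable, e.g. an
// annotation of the form "+checklocks:globalMu" (illustrative name); such
// guards are resolved through findGlobalFunctionGuard and recorded as
// globalGuard facts, which carry the package path so they stay resolvable
// from other packages.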
if g, ok := pc.findGlobalFunctionGuard(d.Pos(), guardName); ok { return functionGuardInfo{ Resolver: g, Exclusive: exclusive, }, true } // No match found. pc.maybeFail(d.Pos(), "annotation %s does not have a match any parameter, return value or global", guardName) return functionGuardInfo{}, false } // functionFacts exports relevant function findings. func (pc *passContext) functionFacts(d *ast.FuncDecl) { // Extract guard information. if d.Doc == nil || d.Doc.List == nil { return } var lff lockFunctionFacts for _, l := range d.Doc.List { pc.extractAnnotations(l.Text, map[string]func(string){ checkLocksIgnore: func(string) { // Note that this applies to all atomic // analysis as well. There is no provided way // to selectively ignore only lock analysis or // atomic analysis, as we expect this use to be // extremely rare. lff.Ignore = true }, checkLocksAnnotation: func(guardName string) { lff.addGuardedBy(pc, d, guardName, true /* exclusive */) }, checkLocksAnnotationRead: func(guardName string) { lff.addGuardedBy(pc, d, guardName, false /* exclusive */) }, checkLocksAcquires: func(guardName string) { lff.addAcquires(pc, d, guardName, true /* exclusive */) }, checkLocksAcquiresRead: func(guardName string) { lff.addAcquires(pc, d, guardName, false /* exclusive */) }, checkLocksReleases: func(guardName string) { lff.addReleases(pc, d, guardName, true /* exclusive */) }, checkLocksReleasesRead: func(guardName string) { lff.addReleases(pc, d, guardName, false /* exclusive */) }, checkLocksAlias: func(guardName string) { lff.addAlias(pc, d, guardName) }, }) } // Export the function facts if there is anything to save. if lff.Ignore || len(lff.HeldOnEntry) > 0 || len(lff.HeldOnExit) > 0 { funcObj := pc.pass.TypesInfo.Defs[d.Name].(*types.Func) pc.pass.ExportObjectFact(funcObj, &lff) } } func init() { gob.Register((*returnGuard)(nil)) gob.Register((*globalGuard)(nil)) gob.Register((*parameterGuard)(nil)) gob.Register((*fieldGuard)(nil)) gob.Register((*fieldStructPtr)(nil)) gob.Register((*fieldStruct)(nil)) } golang-gvisor-gvisor-0.0~20240729.0/tools/checklocks/state.go000066400000000000000000000243701465435605700235710ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package checklocks import ( "fmt" "go/token" "go/types" "strings" "sync/atomic" "golang.org/x/tools/go/ssa" ) // lockInfo describes a held lock. type lockInfo struct { exclusive bool object types.Object } // lockState tracks the locking state and aliases. type lockState struct { // lockedMutexes is used to track which mutexes in a given struct are // currently locked. Note that most of the heavy lifting is done by // valueAndObject below, which maps to specific structure fields, etc. // // The value indicates whether this is an exclusive lock. lockedMutexes map[string]lockInfo // stored stores values that have been stored in memory, bound to // FreeVars or passed as Parameterse. stored map[ssa.Value]ssa.Value // used is a temporary map, used only for valueAndObject. 
It prevents // multiple use of the same memory location. used map[ssa.Value]struct{} // defers are the stack of defers that have been pushed. defers []*ssa.Defer // refs indicates the number of references on this structure. If it's // greater than one, we will do copy-on-write. refs *int32 } // newLockState makes a new lockState. func newLockState() *lockState { refs := int32(1) // Not shared. return &lockState{ lockedMutexes: make(map[string]lockInfo), used: make(map[ssa.Value]struct{}), stored: make(map[ssa.Value]ssa.Value), defers: make([]*ssa.Defer, 0), refs: &refs, } } // fork forks the locking state. When a lockState is forked, any modifications // will cause maps to be copied. func (l *lockState) fork() *lockState { if l == nil { return newLockState() } atomic.AddInt32(l.refs, 1) return &lockState{ lockedMutexes: l.lockedMutexes, used: make(map[ssa.Value]struct{}), stored: l.stored, defers: l.defers, refs: l.refs, } } // modify indicates that this state will be modified. func (l *lockState) modify() { if atomic.LoadInt32(l.refs) > 1 { // Copy the lockedMutexes. lm := make(map[string]lockInfo) for k, v := range l.lockedMutexes { lm[k] = v } l.lockedMutexes = lm // Copy the stored values. s := make(map[ssa.Value]ssa.Value) for k, v := range l.stored { s[k] = v } l.stored = s // Reset the used values. clear(l.used) // Copy the defers. ds := make([]*ssa.Defer, len(l.defers)) copy(ds, l.defers) l.defers = ds // Drop our reference. atomic.AddInt32(l.refs, -1) newRefs := int32(1) // Not shared. l.refs = &newRefs } } // isHeld indicates whether the field is held is not. // // Precondition: rv must be valid. func (l *lockState) isHeld(rv resolvedValue, exclusiveRequired bool) (string, bool) { if !rv.valid() { panic("invalid resolvedValue passed to isHeld") } s, _ := rv.valueAndObject(l) info, ok := l.lockedMutexes[s] if !ok { return s, false } // Accept a weaker lock if exclusiveRequired is false. if exclusiveRequired && !info.exclusive { return s, false } return s, true } // lockField locks the given field. // // If false is returned, the field was already locked. // // Precondition: rv must be valid. func (l *lockState) lockField(rv resolvedValue, exclusive bool) (string, bool) { if !rv.valid() { panic("invalid resolvedValue passed to isHeld") } s, obj := rv.valueAndObject(l) if _, ok := l.lockedMutexes[s]; ok { return s, false } l.modify() l.lockedMutexes[s] = lockInfo{ exclusive: exclusive, object: obj, } return s, true } // unlockField unlocks the given field. // // If false is returned, the field was not locked. // // Precondition: rv must be valid. func (l *lockState) unlockField(rv resolvedValue, exclusive bool) (string, bool) { if !rv.valid() { panic("invalid resolvedValue passed to isHeld") } s, _ := rv.valueAndObject(l) info, ok := l.lockedMutexes[s] if !ok { return s, false } if info.exclusive != exclusive { return s, false } l.modify() delete(l.lockedMutexes, s) return s, true } // downgradeField downgrades the given field. // // If false was returned, the field was not downgraded. // // Precondition: rv must be valid. func (l *lockState) downgradeField(rv resolvedValue) (string, bool) { if !rv.valid() { panic("invalid resolvedValue passed to isHeld") } s, _ := rv.valueAndObject(l) info, ok := l.lockedMutexes[s] if !ok { return s, false } if !info.exclusive { return s, false } l.modify() info.exclusive = false l.lockedMutexes[s] = info // Downgraded. return s, true } // store records an alias. 
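//
// Once stored, subsequent valueAndObject resolutions of addr follow the
// alias to v; this is how parameters and free variables bound by anonymous
// functions are traced back to the caller's values.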
func (l *lockState) store(addr ssa.Value, v ssa.Value) { l.modify() l.stored[addr] = v } // isSubset indicates other holds all the locks held by l. func (l *lockState) isSubset(other *lockState) bool { for k, info := range l.lockedMutexes { otherInfo, otherOk := other.lockedMutexes[k] if !otherOk { return false } // Accept weaker locks as a subset. if info.exclusive && !otherInfo.exclusive { return false } } return true } // count indicates the number of locks held. func (l *lockState) count() int { return len(l.lockedMutexes) } // isCompatible returns true if the states are compatible. func (l *lockState) isCompatible(other *lockState) bool { return l.isSubset(other) && other.isSubset(l) } // elemType is a type that implements the Elem function. type elemType interface { Elem() types.Type } // valueAndObject returns a string for a given value, along with a source level // object (if available and relevant). // // This decomposes the value into the simplest possible representation in terms // of parameters, free variables and globals. During resolution, stored values // may be transferred, as well as bound free variables. // // Nil may not be passed here. func (l *lockState) valueAndObject(v ssa.Value) (string, types.Object) { switch x := v.(type) { case *ssa.Parameter: // Was this provided as a paramter for a local anonymous // function invocation? v, ok := l.stored[x] if ok { return l.valueAndObject(v) } return fmt.Sprintf("{param:%s}", x.Name()), x.Object() case *ssa.Global: return fmt.Sprintf("{global:%s}", x.Name()), x.Object() case *ssa.FreeVar: // Attempt to resolve this, in case we are being invoked in a // scope where all the variables are bound. v, ok := l.stored[x] if ok { // The FreeVar is typically bound to a location, so we // check what's been stored there. Note that the second // may map to the same FreeVar, which we can check. stored, ok := l.stored[v] if ok { return l.valueAndObject(stored) } } // FreeVar does not have a corresponding source-level object // that we can return here. return fmt.Sprintf("{freevar:%s}", x.Name()), nil case *ssa.Convert: // Just disregard conversion. return l.valueAndObject(x.X) case *ssa.ChangeType: // Ditto, disregard. return l.valueAndObject(x.X) case *ssa.UnOp: if x.Op != token.MUL { break } // Is this loading a free variable? If yes, then this can be // resolved in the original isAlias function. if fv, ok := x.X.(*ssa.FreeVar); ok { return l.valueAndObject(fv) } // Should be try to resolve via a memory address? This needs to // be done since a memory location can hold its own value. if _, ok := l.used[x.X]; !ok { // Check if we know what the accessed location holds. // This is used to disambiguate memory locations. v, ok := l.stored[x.X] if ok { l.used[x.X] = struct{}{} defer func() { delete(l.used, x.X) }() return l.valueAndObject(v) } } // x.X.Type is pointer. We must construct this type // dynamically, since the ssa.Value could be synthetic. s, obj := l.valueAndObject(x.X) return fmt.Sprintf("*(%s)", s), obj case *ssa.Field: structType, ok := resolveStruct(x.X.Type()) if !ok { // This should not happen. panic(fmt.Sprintf("structType not available for struct: %#v", x.X)) } fieldObj := structType.Field(x.Field) s, _ := l.valueAndObject(x.X) return fmt.Sprintf("%s.%s", s, fieldObj.Name()), fieldObj case *ssa.FieldAddr: structType, ok := resolveStruct(x.X.Type()) if !ok { // This should not happen. 
panic(fmt.Sprintf("structType not available for struct: %#v", x.X)) } fieldObj := structType.Field(x.Field) s, _ := l.valueAndObject(x.X) return fmt.Sprintf("&(%s.%s)", s, fieldObj.Name()), fieldObj case *ssa.Index: s, _ := l.valueAndObject(x.X) i, _ := l.valueAndObject(x.Index) return fmt.Sprintf("%s[%s]", s, i), nil case *ssa.IndexAddr: s, _ := l.valueAndObject(x.X) i, _ := l.valueAndObject(x.Index) return fmt.Sprintf("&(%s[%s])", s, i), nil case *ssa.Lookup: s, _ := l.valueAndObject(x.X) i, _ := l.valueAndObject(x.Index) return fmt.Sprintf("%s[%s]", s, i), nil case *ssa.Extract: s, _ := l.valueAndObject(x.Tuple) return fmt.Sprintf("%s[%d]", s, x.Index), nil } // In the case of any other type (e.g. this may be an alloc, a return // value, etc.), just return the literal pointer value to the Value. // This will be unique within the ssa graph, and so if two values are // equal, they are from the same type. return fmt.Sprintf("{%T:%p}", v, v), nil } // String returns the full lock state. func (l *lockState) String() string { if l.count() == 0 { return "no locks held" } keys := make([]string, 0, len(l.lockedMutexes)) for k, info := range l.lockedMutexes { // Include the exclusive status of each lock. keys = append(keys, fmt.Sprintf("%s %s", k, exclusiveStr(info.exclusive))) } return strings.Join(keys, ",") } // pushDefer pushes a defer onto the stack. func (l *lockState) pushDefer(d *ssa.Defer) { l.modify() l.defers = append(l.defers, d) } // popDefer pops a defer from the stack. func (l *lockState) popDefer() *ssa.Defer { // Does not technically modify the underlying slice. count := len(l.defers) if count == 0 { return nil } d := l.defers[count-1] l.defers = l.defers[:count-1] return d } golang-gvisor-gvisor-0.0~20240729.0/tools/constraintutil/000077500000000000000000000000001465435605700230655ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/constraintutil/BUILD000066400000000000000000000006741465435605700236560ustar00rootroot00000000000000load("//tools:defs.bzl", "go_library", "go_test") package( default_applicable_licenses = ["//:license"], licenses = ["notice"], ) go_library( name = "constraintutil", srcs = ["constraintutil.go"], marshal = False, stateify = False, visibility = ["//tools:__subpackages__"], ) go_test( name = "constraintutil_test", size = "small", srcs = ["constraintutil_test.go"], library = ":constraintutil", ) golang-gvisor-gvisor-0.0~20240729.0/tools/constraintutil/constraintutil.go000066400000000000000000000112101465435605700264710ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package constraintutil provides utilities for working with Go build // constraints. package constraintutil import ( "bufio" "bytes" "fmt" "go/build/constraint" "io" "os" "strings" ) // FromReader extracts the build constraint from the Go source or assembly file // whose contents are read by r. 
func FromReader(r io.Reader) (constraint.Expr, error) { // See go/build.parseFileHeader() for the "official" logic that this is // derived from. const ( slashStar = "/*" starSlash = "*/" gobuildPrefix = "//go:build" ) s := bufio.NewScanner(r) var ( inSlashStar = false // between /* and */ haveGobuild = false e constraint.Expr ) Lines: for s.Scan() { line := bytes.TrimSpace(s.Bytes()) if !inSlashStar && constraint.IsGoBuild(string(line)) { if haveGobuild { return nil, fmt.Errorf("multiple go:build directives") } haveGobuild = true var err error e, err = constraint.Parse(string(line)) if err != nil { return nil, err } } ThisLine: for len(line) > 0 { if inSlashStar { if i := bytes.Index(line, []byte(starSlash)); i >= 0 { inSlashStar = false line = bytes.TrimSpace(line[i+len(starSlash):]) continue ThisLine } continue Lines } if bytes.HasPrefix(line, []byte("//")) { continue Lines } // Note that if /* appears in the line, but not at the beginning, // then the line is still non-empty, so skipping this and // terminating below is correct. if bytes.HasPrefix(line, []byte(slashStar)) { inSlashStar = true line = bytes.TrimSpace(line[len(slashStar):]) continue ThisLine } // A non-empty non-comment line terminates scanning for go:build. break Lines } } return e, s.Err() } // FromString extracts the build constraint from the Go source or assembly file // containing the given data. If no build constraint applies to the file, it // returns nil. func FromString(str string) (constraint.Expr, error) { return FromReader(strings.NewReader(str)) } // FromFile extracts the build constraint from the Go source or assembly file // at the given path. If no build constraint applies to the file, it returns // nil. func FromFile(path string) (constraint.Expr, error) { f, err := os.Open(path) if err != nil { return nil, err } defer f.Close() return FromReader(f) } // Combine returns a constraint.Expr that evaluates to true iff all expressions // in es evaluate to true. If es is empty, Combine returns nil. // // Preconditions: All constraint.Exprs in es are non-nil. func Combine(es []constraint.Expr) constraint.Expr { switch len(es) { case 0: return nil case 1: return es[0] default: a := &constraint.AndExpr{es[0], es[1]} for i := 2; i < len(es); i++ { a = &constraint.AndExpr{a, es[i]} } return a } } // CombineFromFiles returns a build constraint expression that evaluates to // true iff the build constraints from all of the given Go source or assembly // files evaluate to true. If no build constraints apply to any of the given // files, it returns nil. func CombineFromFiles(paths []string) (constraint.Expr, error) { var es []constraint.Expr for _, path := range paths { e, err := FromFile(path) if err != nil { return nil, fmt.Errorf("failed to read build constraints from %q: %v", path, err) } if e != nil { es = append(es, e) } } return Combine(es), nil } // Lines returns a string containing build constraint directives for the given // constraint.Expr, including two trailing newlines, as appropriate for a Go // source or assembly file. At least a go:build directive will be emitted; if // the constraint is expressible using +build directives as well, then +build // directives will also be emitted. // // If e is nil, Lines returns the empty string. 
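//
// For example (assuming a constraint parsed from "//go:build amd64 || arm64"),
// Lines returns
//
//	//go:build amd64 || arm64
//	// +build amd64 arm64
//
// followed by a blank line.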
func Lines(e constraint.Expr) string { if e == nil { return "" } var b strings.Builder b.WriteString("//go:build ") b.WriteString(e.String()) b.WriteByte('\n') if pblines, err := constraint.PlusBuildLines(e); err == nil { for _, line := range pblines { b.WriteString(line) b.WriteByte('\n') } } b.WriteByte('\n') return b.String() } golang-gvisor-gvisor-0.0~20240729.0/tools/constraintutil/constraintutil_test.go000066400000000000000000000066321465435605700275440ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package constraintutil import ( "go/build/constraint" "testing" ) func TestFileParsing(t *testing.T) { for _, test := range []struct { name string data string expr string }{ { name: "Empty", }, { name: "NoConstraint", data: "// copyright header\n\npackage main", }, { name: "ConstraintOnFirstLine", data: "//go:build amd64\n#include \"textflag.h\"", expr: "amd64", }, { name: "ConstraintAfterSlashSlashComment", data: "// copyright header\n\n//go:build linux\n\npackage newlib", expr: "linux", }, { name: "ConstraintAfterSlashStarComment", data: "/*\ncopyright header\n*/\n\n//go:build !race\n\npackage oldlib", expr: "!race", }, { name: "ConstraintInSlashSlashComment", data: "// blah blah //go:build windows", }, { name: "ConstraintInSlashStarComment", data: "/*\n//go:build windows\n*/", }, { name: "ConstraintAfterPackageClause", data: "package oops\n//go:build race", }, { name: "ConstraintAfterCppInclude", data: "#include \"textflag.h\"\n//go:build arm64", }, } { t.Run(test.name, func(t *testing.T) { e, err := FromString(test.data) if err != nil { t.Fatalf("FromString(%q) failed: %v", test.data, err) } if e == nil { if len(test.expr) != 0 { t.Errorf("FromString(%q): got no constraint, wanted %q", test.data, test.expr) } } else { got := e.String() if len(test.expr) == 0 { t.Errorf("FromString(%q): got %q, wanted no constraint", test.data, got) } else if got != test.expr { t.Errorf("FromString(%q): got %q, wanted %q", test.data, got, test.expr) } } }) } } func TestCombine(t *testing.T) { for _, test := range []struct { name string in []string out string }{ { name: "0", }, { name: "1", in: []string{"amd64 || arm64"}, out: "amd64 || arm64", }, { name: "2", in: []string{"amd64", "amd64 && linux"}, out: "amd64 && amd64 && linux", }, { name: "3", in: []string{"amd64", "amd64 || arm64", "amd64 || riscv64"}, out: "amd64 && (amd64 || arm64) && (amd64 || riscv64)", }, } { t.Run(test.name, func(t *testing.T) { inexprs := make([]constraint.Expr, 0, len(test.in)) for _, estr := range test.in { line := "//go:build " + estr e, err := constraint.Parse(line) if err != nil { t.Fatalf("constraint.Parse(%q) failed: %v", line, err) } inexprs = append(inexprs, e) } outexpr := Combine(inexprs) if outexpr == nil { if len(test.out) != 0 { t.Errorf("Combine(%v): got no constraint, wanted %q", test.in, test.out) } } else { got := outexpr.String() if len(test.out) == 0 { t.Errorf("Combine(%v): got %q, wanted no constraint", test.in, got) } else if got != 
test.out { t.Errorf("Combine(%v): got %q, wanted %q", test.in, got, test.out) } } }) } } golang-gvisor-gvisor-0.0~20240729.0/tools/go_fieldenum/000077500000000000000000000000001465435605700224405ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/go_fieldenum/BUILD000066400000000000000000000005141465435605700232220ustar00rootroot00000000000000load("//tools:defs.bzl", "bzl_library", "go_binary") package(default_applicable_licenses = ["//:license"]) licenses(["notice"]) go_binary( name = "fieldenum", srcs = ["main.go"], visibility = ["//:sandbox"], ) bzl_library( name = "defs_bzl", srcs = ["defs.bzl"], visibility = ["//visibility:private"], ) golang-gvisor-gvisor-0.0~20240729.0/tools/go_fieldenum/defs.bzl000066400000000000000000000020651465435605700240750ustar00rootroot00000000000000"""The go_fieldenum target infers Field, Fields, and FieldSet types for each struct in an input source file marked +fieldenum. """ def _go_fieldenum_impl(ctx): output = ctx.outputs.out args = ["-pkg=%s" % ctx.attr.package, "-out=%s" % output.path] for src in ctx.attr.srcs: args += [f.path for f in src.files.to_list()] ctx.actions.run( inputs = ctx.files.srcs, outputs = [output], mnemonic = "GoFieldenum", progress_message = "Generating Go field enumerators %s" % ctx.label, arguments = args, executable = ctx.executable._tool, ) go_fieldenum = rule( implementation = _go_fieldenum_impl, attrs = { "srcs": attr.label_list(doc = "input source files", mandatory = True, allow_files = True), "package": attr.string(doc = "the package for the generated source file", mandatory = True), "out": attr.output(doc = "output file", mandatory = True), "_tool": attr.label(executable = True, cfg = "exec", default = Label("//tools/go_fieldenum:fieldenum")), }, ) golang-gvisor-gvisor-0.0~20240729.0/tools/go_fieldenum/main.go000066400000000000000000000255041465435605700237210ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Binary fieldenum emits field bitmasks for all structs in a package marked // "+fieldenum". package main import ( "flag" "fmt" "go/ast" "go/parser" "go/token" "log" "os" "strings" ) var ( outputPkg = flag.String("pkg", "", "output package") outputFilename = flag.String("out", "-", "output filename") ) func main() { // Parse command line arguments. flag.Parse() if len(*outputPkg) == 0 { log.Fatalf("-pkg must be provided") } if len(flag.Args()) == 0 { log.Fatalf("Input files must be provided") } // Parse input files. inputFiles := make([]*ast.File, 0, len(flag.Args())) fset := token.NewFileSet() for _, filename := range flag.Args() { f, err := parser.ParseFile(fset, filename, nil, parser.ParseComments) if err != nil { log.Fatalf("Failed to parse input file %q: %v", filename, err) } inputFiles = append(inputFiles, f) } // Determine which types are marked "+fieldenum" and will consequently have // code generated. 
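// A struct opts in with a doc comment of the form "// +fieldenum" or
// "// +fieldenum <Prefix>"; the prefix (defaulting to the type name) is
// prepended to the generated Field, Fields and FieldSet types. For example
// (illustrative), "// +fieldenum Seg" above "type segment struct" would
// produce SegField, SegFields and SegFieldSet.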
var typeNames []string fieldEnumTypes := make(map[string]fieldEnumTypeInfo) for _, f := range inputFiles { for _, decl := range f.Decls { d, ok := decl.(*ast.GenDecl) if !ok || d.Tok != token.TYPE || d.Doc == nil || len(d.Specs) == 0 { continue } for _, l := range d.Doc.List { const fieldenumPrefixWithSpace = "// +fieldenum " if l.Text == "// +fieldenum" || strings.HasPrefix(l.Text, fieldenumPrefixWithSpace) { spec := d.Specs[0].(*ast.TypeSpec) name := spec.Name.Name prefix := name if len(l.Text) > len(fieldenumPrefixWithSpace) { prefix = strings.TrimSpace(l.Text[len(fieldenumPrefixWithSpace):]) } st, ok := spec.Type.(*ast.StructType) if !ok { log.Fatalf("Type %s is marked +fieldenum, but is not a struct", name) } typeNames = append(typeNames, name) fieldEnumTypes[name] = fieldEnumTypeInfo{ prefix: prefix, structType: st, } break } } } } // Collect information for each type for which code is being generated. structInfos := make([]structInfo, 0, len(typeNames)) needAtomic := false for _, typeName := range typeNames { typeInfo := fieldEnumTypes[typeName] var si structInfo si.name = typeName si.prefix = typeInfo.prefix for _, field := range typeInfo.structType.Fields.List { name := structFieldName(field) // If the field's type is a type that is also marked +fieldenum, // include a FieldSet for that type in this one's. The field must // be a struct by value, since if it's a pointer then that struct // might also point to or include this one (which would make // FieldSet inclusion circular). It must also be a type defined in // this package, since otherwise we don't know whether it's marked // +fieldenum. Thus, field.Type must be an identifier (rather than // an ast.StarExpr or SelectorExpr). if tident, ok := field.Type.(*ast.Ident); ok { if fieldTypeInfo, ok := fieldEnumTypes[tident.Name]; ok { fsf := fieldSetField{ fieldName: name, typePrefix: fieldTypeInfo.prefix, } si.reprByFieldSet = append(si.reprByFieldSet, fsf) si.allFields = append(si.allFields, fsf) continue } } si.reprByBit = append(si.reprByBit, name) si.allFields = append(si.allFields, fieldSetField{ fieldName: name, }) // atomicbitops import will be needed for FieldSet.Load(). needAtomic = true } structInfos = append(structInfos, si) } // Build the output file. var b strings.Builder fmt.Fprintf(&b, "// Generated by go_fieldenum.\n\n") fmt.Fprintf(&b, "package %s\n\n", *outputPkg) if needAtomic { fmt.Fprintf(&b, `import "gvisor.dev/gvisor/pkg/atomicbitops"`) fmt.Fprintf(&b, "\n\n") } for _, si := range structInfos { si.writeTo(&b) } if *outputFilename == "-" { // Write output to stdout. fmt.Printf("%s", b.String()) } else { // Write output to file. f, err := os.OpenFile(*outputFilename, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0644) if err != nil { log.Fatalf("Failed to open output file %q: %v", *outputFilename, err) } if _, err := f.WriteString(b.String()); err != nil { log.Fatalf("Failed to write output file %q: %v", *outputFilename, err) } f.Close() } } type fieldEnumTypeInfo struct { prefix string structType *ast.StructType } // structInfo contains information about the code generated for a given struct. type structInfo struct { // name is the name of the represented struct. name string // prefix is the prefix X applied to the name of each generated type and // constant, referred to as X in the comments below for convenience. prefix string // reprByBit contains the names of fields in X that should be represented // by a bit in the bit mask XFieldSet.fields, and by a bool in XFields. 
reprByBit []string // reprByFieldSet contains fields in X whose type is a named struct (e.g. // Y) that has a corresponding FieldSet type YFieldSet, and which should // therefore be represented by including a value of type YFieldSet in // XFieldSet, and a value of type YFields in XFields. reprByFieldSet []fieldSetField // allFields contains all fields in X in order of declaration. Fields in // reprByBit have fieldSetField.typePrefix == "". allFields []fieldSetField } type fieldSetField struct { fieldName string typePrefix string } func structFieldName(f *ast.Field) string { if len(f.Names) != 0 { return f.Names[0].Name } // For embedded struct fields, the field name is the unqualified type name. texpr := f.Type for { switch t := texpr.(type) { case *ast.StarExpr: texpr = t.X case *ast.SelectorExpr: texpr = t.Sel case *ast.Ident: return t.Name default: panic(fmt.Sprintf("unexpected %T", texpr)) } } } func (si *structInfo) writeTo(b *strings.Builder) { fmt.Fprintf(b, "// A %sField represents a field in %s.\n", si.prefix, si.name) fmt.Fprintf(b, "type %sField uint\n\n", si.prefix) if len(si.reprByBit) != 0 { fmt.Fprintf(b, "// %sFieldX represents %s field X.\n", si.prefix, si.name) fmt.Fprintf(b, "const (\n") fmt.Fprintf(b, "\t%sField%s %sField = iota\n", si.prefix, si.reprByBit[0], si.prefix) for _, fieldName := range si.reprByBit[1:] { fmt.Fprintf(b, "\t%sField%s\n", si.prefix, fieldName) } fmt.Fprintf(b, ")\n\n") } fmt.Fprintf(b, "// %sFields represents a set of fields in %s in a literal-friendly form.\n", si.prefix, si.name) fmt.Fprintf(b, "// The zero value of %sFields represents an empty set.\n", si.prefix) fmt.Fprintf(b, "type %sFields struct {\n", si.prefix) for _, fieldSetField := range si.allFields { if fieldSetField.typePrefix == "" { fmt.Fprintf(b, "\t%s bool\n", fieldSetField.fieldName) } else { fmt.Fprintf(b, "\t%s %sFields\n", fieldSetField.fieldName, fieldSetField.typePrefix) } } fmt.Fprintf(b, "}\n\n") fmt.Fprintf(b, "// %sFieldSet represents a set of fields in %s in a compact form.\n", si.prefix, si.name) fmt.Fprintf(b, "// The zero value of %sFieldSet represents an empty set.\n", si.prefix) fmt.Fprintf(b, "type %sFieldSet struct {\n", si.prefix) numBitmaskUint32s := (len(si.reprByBit) + 31) / 32 for _, fieldSetField := range si.reprByFieldSet { fmt.Fprintf(b, "\t%s %sFieldSet\n", fieldSetField.fieldName, fieldSetField.typePrefix) } if len(si.reprByBit) != 0 { fmt.Fprintf(b, "\tfields [%d]atomicbitops.Uint32\n", numBitmaskUint32s) } fmt.Fprintf(b, "}\n\n") if len(si.reprByBit) != 0 { fmt.Fprintf(b, "// Contains returns true if f is present in the %sFieldSet.\n", si.prefix) fmt.Fprintf(b, "func (fs *%sFieldSet) Contains(f %sField) bool {\n", si.prefix, si.prefix) if numBitmaskUint32s == 1 { fmt.Fprintf(b, "\treturn fs.fields[0].RacyLoad() & (uint32(1) << uint(f)) != 0\n") } else { fmt.Fprintf(b, "\treturn fs.fields[f/32].RacyLoad() & (uint32(1) << (f%%32)) != 0\n") } fmt.Fprintf(b, "}\n\n") fmt.Fprintf(b, "// Add adds f to the %sFieldSet.\n", si.prefix) fmt.Fprintf(b, "func (fs *%sFieldSet) Add(f %sField) {\n", si.prefix, si.prefix) if numBitmaskUint32s == 1 { fmt.Fprintf(b, "\tfs.fields[0] = atomicbitops.FromUint32(fs.fields[0].RacyLoad() | (uint32(1) << uint(f)))\n") } else { fmt.Fprintf(b, "\tfs.fields[f/32] = atomicbitops.FromUint32(fs.fields[f/32].RacyLoad() | (uint32(1) << (f%%32))\n") } fmt.Fprintf(b, "}\n\n") fmt.Fprintf(b, "// Remove removes f from the %sFieldSet.\n", si.prefix) fmt.Fprintf(b, "func (fs *%sFieldSet) Remove(f %sField) {\n", si.prefix, si.prefix) if 
numBitmaskUint32s == 1 { fmt.Fprintf(b, "\tfs.fields[0] = atomicbitops.FromUint32(fs.fields[0].RacyLoad() &^ (uint32(1) << uint(f)))\n") } else { fmt.Fprintf(b, "\tfs.fields[f/32] = atomicbitops.FromUint32(fs.fields[f/32].RacyLoad() &^ (uint32(1) << uint(f%%32)))\n") } fmt.Fprintf(b, "}\n\n") } fmt.Fprintf(b, "// Load returns a copy of the %sFieldSet.\n", si.prefix) fmt.Fprintf(b, "// Load is safe to call concurrently with AddFieldsLoadable, but not Add or Remove.\n") fmt.Fprintf(b, "func (fs *%sFieldSet) Load() (copied %sFieldSet) {\n", si.prefix, si.prefix) for _, fieldSetField := range si.reprByFieldSet { fmt.Fprintf(b, "\tcopied.%s = fs.%s.Load()\n", fieldSetField.fieldName, fieldSetField.fieldName) } for i := 0; i < numBitmaskUint32s; i++ { fmt.Fprintf(b, "\tcopied.fields[%d] = atomicbitops.FromUint32(fs.fields[%d].Load())\n", i, i) } fmt.Fprintf(b, "\treturn\n") fmt.Fprintf(b, "}\n\n") fmt.Fprintf(b, "// AddFieldsLoadable adds the given fields to the %sFieldSet.\n", si.prefix) fmt.Fprintf(b, "// AddFieldsLoadable is safe to call concurrently with Load, but not other methods (including other calls to AddFieldsLoadable).\n") fmt.Fprintf(b, "func (fs *%sFieldSet) AddFieldsLoadable(fields %sFields) {\n", si.prefix, si.prefix) for _, fieldSetField := range si.reprByFieldSet { fmt.Fprintf(b, "\tfs.%s.AddFieldsLoadable(fields.%s)\n", fieldSetField.fieldName, fieldSetField.fieldName) } for _, fieldName := range si.reprByBit { fieldConstName := fmt.Sprintf("%sField%s", si.prefix, fieldName) fmt.Fprintf(b, "\tif fields.%s {\n", fieldName) if numBitmaskUint32s == 1 { fmt.Fprintf(b, "\t\tfs.fields[0].Store(fs.fields[0].RacyLoad() | (uint32(1) << uint(%s)))\n", fieldConstName) } else { fmt.Fprintf(b, "\t\tword, bit := %s/32, %s%%32\n", fieldConstName, fieldConstName) fmt.Fprintf(b, "\t\tfs.fields[word].Store(fs.fields[word].RacyLoad() | (uint32(1) << bit))\n") } fmt.Fprintf(b, "\t}\n") } fmt.Fprintf(b, "}\n\n") } golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/000077500000000000000000000000001465435605700222675ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/BUILD000066400000000000000000000007001465435605700230460ustar00rootroot00000000000000load("//tools:defs.bzl", "bzl_library", "go_binary") package( default_applicable_licenses = ["//:license"], licenses = ["notice"], ) go_binary( name = "go_generics", srcs = [ "imports.go", "main.go", "remove.go", ], visibility = ["//:sandbox"], deps = ["//tools/go_generics/globals"], ) bzl_library( name = "defs_bzl", srcs = ["defs.bzl"], visibility = ["//visibility:private"], ) golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/defs.bzl000066400000000000000000000133271465435605700237270ustar00rootroot00000000000000"""Generics support via go_generics. A Go template is similar to a go library, except that it has certain types that can be replaced before usage. For example, one could define a templatized List struct, whose elements are of type T, then instantiate that template for T=segment, where "segment" is the concrete type. 
""" TemplateInfo = provider( "Information about a go_generics template.", fields = { "unsafe": "whether the template requires unsafe code", "types": "required types", "opt_types": "optional types", "consts": "required consts", "opt_consts": "optional consts", "deps": "package dependencies", "template": "merged template source file", }, ) def _go_template_impl(ctx): srcs = ctx.files.srcs template = ctx.actions.declare_file(ctx.label.name + "_template.go") args = ["-o=%s" % template.path] + [f.path for f in srcs] ctx.actions.run( inputs = srcs, outputs = [template], mnemonic = "GoGenericsTemplate", progress_message = "Building Go template %s" % ctx.label, arguments = args, executable = ctx.executable._tool, ) return [TemplateInfo( types = ctx.attr.types, opt_types = ctx.attr.opt_types, consts = ctx.attr.consts, opt_consts = ctx.attr.opt_consts, deps = ctx.attr.deps, template = template, )] go_template = rule( implementation = _go_template_impl, attrs = { "srcs": attr.label_list(doc = "the list of source files that comprise the template", mandatory = True, allow_files = True), "deps": attr.label_list(doc = "the standard dependency list", allow_files = True, cfg = "target"), "types": attr.string_list(doc = "the list of generic types in the template that are required to be specified"), "opt_types": attr.string_list(doc = "the list of generic types in the template that can but aren't required to be specified"), "consts": attr.string_list(doc = "the list of constants in the template that are required to be specified"), "opt_consts": attr.string_list(doc = "the list of constants in the template that can but aren't required to be specified"), "_tool": attr.label(executable = True, cfg = "exec", default = Label("//tools/go_generics/go_merge")), }, ) def _go_template_instance_impl(ctx): info = ctx.attr.template[TemplateInfo] output = ctx.outputs.out # Check that all required types are defined. for t in info.types: if t not in ctx.attr.types: fail("Missing value for type %s in %s" % (t, ctx.attr.template.label)) # Check that all defined types are expected by the template. for t in ctx.attr.types: if (t not in info.types) and (t not in info.opt_types): fail("Type %s is not a parameter to %s" % (t, ctx.attr.template.label)) # Check that all required consts are defined. for t in info.consts: if t not in ctx.attr.consts: fail("Missing value for constant %s in %s" % (t, ctx.attr.template.label)) # Check that all defined consts are expected by the template. for t in ctx.attr.consts: if (t not in info.consts) and (t not in info.opt_consts): fail("Const %s is not a parameter to %s" % (t, ctx.attr.template.label)) # Build the argument list. 
args = ["-i=%s" % info.template.path, "-o=%s" % output.path] if ctx.attr.package: args.append("-p=%s" % ctx.attr.package) if len(ctx.attr.prefix) > 0: args.append("-prefix=%s" % ctx.attr.prefix) if len(ctx.attr.suffix) > 0: args.append("-suffix=%s" % ctx.attr.suffix) args += [("-t=%s=%s" % (p[0], p[1])) for p in ctx.attr.types.items()] args += [("-c=%s=%s" % (p[0], p[1])) for p in ctx.attr.consts.items()] args += [("-import=%s=%s" % (p[0], p[1])) for p in ctx.attr.imports.items()] args += [("-in-substr=%s=%s" % (p[0], p[1])) for p in ctx.attr.input_substrs.items()] args += [("-out-substr=%s=%s" % (p[0], p[1])) for p in ctx.attr.substrs.items()] if ctx.attr.anon: args.append("-anon") ctx.actions.run( inputs = [info.template], outputs = [output], mnemonic = "GoGenericsInstance", progress_message = "Building Go template instance %s" % ctx.label, arguments = args, executable = ctx.executable._tool, ) return [DefaultInfo( files = depset([output]), )] go_template_instance = rule( implementation = _go_template_instance_impl, attrs = { "template": attr.label(doc = "the label of the template to be instantiated", mandatory = True), "prefix": attr.string(doc = "a prefix to be added to globals in the template"), "suffix": attr.string(doc = "a suffix to be added to globals in the template"), "types": attr.string_dict(doc = "the map from generic type names to concrete ones"), "consts": attr.string_dict(doc = "the map from constant names to their values"), "imports": attr.string_dict(doc = "the map from imports used in types/consts to their import paths"), "input_substrs": attr.string_dict(doc = "the map from sub-strings to their replacements, applied just after reading the template code"), "substrs": attr.string_dict(doc = "the map from sub-strings to their replacements, applied just before writing the template instance code"), "anon": attr.bool(doc = "whether anoymous fields should be processed", mandatory = False, default = False), "package": attr.string(doc = "the package for the generated source file", mandatory = False), "out": attr.output(doc = "output file", mandatory = True), "_tool": attr.label(executable = True, cfg = "exec", default = Label("//tools/go_generics")), }, ) golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/globals/000077500000000000000000000000001465435605700237125ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/globals/BUILD000066400000000000000000000004631465435605700244770ustar00rootroot00000000000000load("//tools:defs.bzl", "go_library") package( default_applicable_licenses = ["//:license"], licenses = ["notice"], ) go_library( name = "globals", srcs = [ "globals_visitor.go", "scope.go", ], stateify = False, visibility = ["//tools/go_generics:__pkg__"], ) golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/globals/globals_visitor.go000066400000000000000000000345041465435605700274510ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// Package globals provides an AST visitor that calls the visit function for all // global identifiers. package globals import ( "fmt" "go/ast" "go/token" "path/filepath" "strconv" ) // globalsVisitor holds the state used while traversing the nodes of a file in // search of globals. // // The visitor does two passes on the global declarations: the first one adds // all globals to the global scope (since Go allows references to globals that // haven't been declared yet), and the second one calls f() for the definition // and uses of globals found in the first pass. // // The implementation correctly handles cases when globals are aliased by // locals; in such cases, f() is not called. type globalsVisitor struct { // file is the file whose nodes are being visited. file *ast.File // fset is the file set the file being visited belongs to. fset *token.FileSet // f is the visit function to be called when a global symbol is reached. f func(*ast.Ident, SymKind) // scope is the current scope as nodes are visited. scope *scope // processAnon indicates whether we should process anonymous struct fields. // It does not perform strict checking on parameter types that share the same name // as the global type and therefore will rename them as well. processAnon bool } // unexpected is called when an unexpected node appears in the AST. It dumps // the location of the associated token and panics because this should only // happen when there is a bug in the traversal code. func (v *globalsVisitor) unexpected(p token.Pos) { panic(fmt.Sprintf("Unable to parse at %v", v.fset.Position(p))) } // pushScope creates a new scope and pushes it to the top of the scope stack. func (v *globalsVisitor) pushScope() { v.scope = newScope(v.scope) } // popScope removes the scope created by the last call to pushScope. func (v *globalsVisitor) popScope() { v.scope = v.scope.outer } // visitType is called when an expression is known to be a type, for example, // on the first argument of make(). It visits all children nodes and reports // any globals. func (v *globalsVisitor) visitType(ge ast.Expr) { switch e := ge.(type) { case *ast.Ident: if s := v.scope.deepLookup(e.Name); s != nil && s.scope.isGlobal() { v.f(e, s.kind) } case *ast.SelectorExpr: id := GetIdent(e.X) if id == nil { v.unexpected(e.X.Pos()) } case *ast.StarExpr: v.visitType(e.X) case *ast.ParenExpr: v.visitType(e.X) case *ast.ChanType: v.visitType(e.Value) case *ast.Ellipsis: v.visitType(e.Elt) case *ast.ArrayType: v.visitExpr(e.Len) v.visitType(e.Elt) case *ast.MapType: v.visitType(e.Key) v.visitType(e.Value) case *ast.StructType: v.visitFields(e.Fields, KindUnknown) case *ast.FuncType: v.visitFields(e.Params, KindUnknown) v.visitFields(e.Results, KindUnknown) case *ast.InterfaceType: v.visitFields(e.Methods, KindUnknown) default: v.unexpected(ge.Pos()) } } // visitFields visits all fields, and add symbols if kind isn't KindUnknown. func (v *globalsVisitor) visitFields(l *ast.FieldList, kind SymKind) { if l == nil { return } for _, f := range l.List { if kind != KindUnknown { for _, n := range f.Names { v.scope.add(n.Name, kind, n.Pos()) } } v.visitType(f.Type) if f.Tag != nil { tag := ast.NewIdent(f.Tag.Value) v.f(tag, KindTag) // Replace the tag if updated. if tag.Name != f.Tag.Value { f.Tag.Value = tag.Name } } } } // visitGenDecl is called when a generic declaration is encountered, for example, // on variable, constant and type declarations. It adds all newly defined // symbols to the current scope and reports them if the current scope is the // global one. 
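//
// As an illustrative sketch added during editing (the declaration below is
// hypothetical): for a global
//
//	var counter = limit + 1
//
// visitGenDecl first visits the value expression (reporting "limit" if it
// resolves to a known global), and only then records "counter" as a KindVar
// symbol and reports it, since the enclosing scope is the global one.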
func (v *globalsVisitor) visitGenDecl(d *ast.GenDecl) { switch d.Tok { case token.IMPORT: case token.TYPE: for _, gs := range d.Specs { s := gs.(*ast.TypeSpec) v.scope.add(s.Name.Name, KindType, s.Name.Pos()) if v.scope.isGlobal() { v.f(s.Name, KindType) } v.visitType(s.Type) } case token.CONST, token.VAR: kind := KindConst if d.Tok == token.VAR { kind = KindVar } for _, gs := range d.Specs { s := gs.(*ast.ValueSpec) if s.Type != nil { v.visitType(s.Type) } for _, e := range s.Values { v.visitExpr(e) } for _, n := range s.Names { if v.scope.isGlobal() { v.f(n, kind) } v.scope.add(n.Name, kind, n.Pos()) } } default: v.unexpected(d.Pos()) } } // isViableType determines if the given expression is a viable type expression, // that is, if it could be interpreted as a type, for example, sync.Mutex, // myType, func(int)int, as opposed to -1, 2 * 2, a + b, etc. func (v *globalsVisitor) isViableType(expr ast.Expr) bool { switch e := expr.(type) { case *ast.Ident: // This covers the plain identifier case. When we see it, we // have to check if it resolves to a type; if the symbol is not // known, we'll claim it's viable as a type. s := v.scope.deepLookup(e.Name) return s == nil || s.kind == KindType case *ast.ChanType, *ast.ArrayType, *ast.MapType, *ast.StructType, *ast.FuncType, *ast.InterfaceType, *ast.Ellipsis: // This covers the following cases: // 1. ChanType: // chan T // <-chan T // chan<- T // 2. ArrayType: // [Expr]T // 3. MapType: // map[T]U // 4. StructType: // struct { Fields } // 5. FuncType: // func(Fields)Returns // 6. Interface: // interface { Fields } // 7. Ellipsis: // ...T return true case *ast.SelectorExpr: // The only case in which an expression involving a selector can // be a type is if it has the following form X.T, where X is an // import, and T is a type exported by X. // // There's no way to know whether T is a type because we don't // parse imports. So we just claim that this is a viable type; // it doesn't affect the general result because we don't visit // imported symbols. id := GetIdent(e.X) if id == nil { return false } s := v.scope.deepLookup(id.Name) return s != nil && s.kind == KindImport case *ast.StarExpr: // This covers the *T case. The expression is a viable type if // T is. return v.isViableType(e.X) case *ast.ParenExpr: // This covers the (T) case. The expression is a viable type if // T is. return v.isViableType(e.X) default: return false } } // visitCallExpr visits a "call expression" which can be either a // function/method call (e.g., f(), pkg.f(), obj.f(), etc.) call or a type // conversion (e.g., int32(1), (*sync.Mutex)(ptr), etc.). func (v *globalsVisitor) visitCallExpr(e *ast.CallExpr) { if v.isViableType(e.Fun) { v.visitType(e.Fun) } else { v.visitExpr(e.Fun) } // If the function being called is new or make, the first argument is // a type, so it needs to be visited as such. first := 0 if id := GetIdent(e.Fun); id != nil && (id.Name == "make" || id.Name == "new") { if len(e.Args) > 0 { v.visitType(e.Args[0]) } first = 1 } for i := first; i < len(e.Args); i++ { v.visitExpr(e.Args[i]) } } // visitExpr visits all nodes of an expression, and reports any globals that it // finds. 
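//
// Illustrative sketch added during editing (identifiers here are
// hypothetical): for the expression "x + limit", where "limit" is a global
// variable and "x" is a local, only "limit" is reported, because the deep
// scope lookup for "x" resolves to a non-global scope.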
func (v *globalsVisitor) visitExpr(ge ast.Expr) { switch e := ge.(type) { case nil: case *ast.Ident: if s := v.scope.deepLookup(e.Name); s != nil && s.scope.isGlobal() { v.f(e, s.kind) } case *ast.BasicLit: case *ast.CompositeLit: v.visitType(e.Type) for _, ne := range e.Elts { v.visitExpr(ne) } case *ast.FuncLit: v.pushScope() v.visitFields(e.Type.Params, KindParameter) v.visitFields(e.Type.Results, KindResult) v.visitBlockStmt(e.Body) v.popScope() case *ast.BinaryExpr: v.visitExpr(e.X) v.visitExpr(e.Y) case *ast.CallExpr: v.visitCallExpr(e) case *ast.IndexExpr: v.visitExpr(e.X) v.visitExpr(e.Index) case *ast.KeyValueExpr: v.visitExpr(e.Value) case *ast.ParenExpr: v.visitExpr(e.X) case *ast.SelectorExpr: v.visitExpr(e.X) if v.processAnon { v.visitExpr(e.Sel) } case *ast.SliceExpr: v.visitExpr(e.X) v.visitExpr(e.Low) v.visitExpr(e.High) v.visitExpr(e.Max) case *ast.StarExpr: v.visitExpr(e.X) case *ast.TypeAssertExpr: v.visitExpr(e.X) if e.Type != nil { v.visitType(e.Type) } case *ast.UnaryExpr: v.visitExpr(e.X) default: v.unexpected(ge.Pos()) } } // GetIdent returns the identifier associated with the given expression by // removing parentheses if needed. func GetIdent(expr ast.Expr) *ast.Ident { switch e := expr.(type) { case *ast.Ident: return e case *ast.ParenExpr: return GetIdent(e.X) default: return nil } } // visitStmt visits all nodes of a statement, and reports any globals that it // finds. It also adds to the current scope new symbols defined/declared. func (v *globalsVisitor) visitStmt(gs ast.Stmt) { switch s := gs.(type) { case nil, *ast.BranchStmt, *ast.EmptyStmt: case *ast.AssignStmt: for _, e := range s.Rhs { v.visitExpr(e) } // We visit the LHS after the RHS because the symbols we'll // potentially add to the table aren't meant to be visible to // the RHS. 
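		// Editor-added sketch: in a statement like "b := g(b)", the "b"
		// passed to g still resolves to the outer (possibly global) b,
		// because the RHS is visited before the new local b is added here.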
for _, e := range s.Lhs { if s.Tok == token.DEFINE { if n := GetIdent(e); n != nil { v.scope.add(n.Name, KindVar, n.Pos()) } } v.visitExpr(e) } case *ast.BlockStmt: v.visitBlockStmt(s) case *ast.DeclStmt: v.visitGenDecl(s.Decl.(*ast.GenDecl)) case *ast.DeferStmt: v.visitCallExpr(s.Call) case *ast.ExprStmt: v.visitExpr(s.X) case *ast.ForStmt: v.pushScope() v.visitStmt(s.Init) v.visitExpr(s.Cond) v.visitStmt(s.Post) v.visitBlockStmt(s.Body) v.popScope() case *ast.GoStmt: v.visitCallExpr(s.Call) case *ast.IfStmt: v.pushScope() v.visitStmt(s.Init) v.visitExpr(s.Cond) v.visitBlockStmt(s.Body) v.visitStmt(s.Else) v.popScope() case *ast.IncDecStmt: v.visitExpr(s.X) case *ast.LabeledStmt: v.visitStmt(s.Stmt) case *ast.RangeStmt: v.pushScope() v.visitExpr(s.X) if s.Tok == token.DEFINE { if n := GetIdent(s.Key); n != nil { v.scope.add(n.Name, KindVar, n.Pos()) } if n := GetIdent(s.Value); n != nil { v.scope.add(n.Name, KindVar, n.Pos()) } } v.visitExpr(s.Key) v.visitExpr(s.Value) v.visitBlockStmt(s.Body) v.popScope() case *ast.ReturnStmt: for _, r := range s.Results { v.visitExpr(r) } case *ast.SelectStmt: for _, ns := range s.Body.List { c := ns.(*ast.CommClause) v.pushScope() v.visitStmt(c.Comm) for _, bs := range c.Body { v.visitStmt(bs) } v.popScope() } case *ast.SendStmt: v.visitExpr(s.Chan) v.visitExpr(s.Value) case *ast.SwitchStmt: v.pushScope() v.visitStmt(s.Init) v.visitExpr(s.Tag) for _, ns := range s.Body.List { c := ns.(*ast.CaseClause) v.pushScope() for _, ce := range c.List { v.visitExpr(ce) } for _, bs := range c.Body { v.visitStmt(bs) } v.popScope() } v.popScope() case *ast.TypeSwitchStmt: v.pushScope() v.visitStmt(s.Init) v.visitStmt(s.Assign) for _, ns := range s.Body.List { c := ns.(*ast.CaseClause) v.pushScope() for _, ce := range c.List { v.visitType(ce) } for _, bs := range c.Body { v.visitStmt(bs) } v.popScope() } v.popScope() default: v.unexpected(gs.Pos()) } } // visitBlockStmt visits all statements in the block, adding symbols to a newly // created scope. func (v *globalsVisitor) visitBlockStmt(s *ast.BlockStmt) { v.pushScope() for _, c := range s.List { v.visitStmt(c) } v.popScope() } // visitFuncDecl is called when a function or method declaration is encountered. // it creates a new scope for the function [optional] receiver, parameters and // results, and visits all children nodes. func (v *globalsVisitor) visitFuncDecl(d *ast.FuncDecl) { // We don't report methods. if d.Recv == nil { v.f(d.Name, KindFunction) } v.pushScope() v.visitFields(d.Recv, KindReceiver) v.visitFields(d.Type.Params, KindParameter) v.visitFields(d.Type.Results, KindResult) if d.Body != nil { v.visitBlockStmt(d.Body) } v.popScope() } // globalsFromDecl is called in the first, and adds symbols to global scope. func (v *globalsVisitor) globalsFromGenDecl(d *ast.GenDecl) { switch d.Tok { case token.IMPORT: for _, gs := range d.Specs { s := gs.(*ast.ImportSpec) if s.Name == nil { str, _ := strconv.Unquote(s.Path.Value) v.scope.add(filepath.Base(str), KindImport, s.Path.Pos()) } else if s.Name.Name != "_" { v.scope.add(s.Name.Name, KindImport, s.Name.Pos()) } } case token.TYPE: for _, gs := range d.Specs { s := gs.(*ast.TypeSpec) v.scope.add(s.Name.Name, KindType, s.Name.Pos()) } case token.CONST, token.VAR: kind := KindConst if d.Tok == token.VAR { kind = KindVar } for _, s := range d.Specs { for _, n := range s.(*ast.ValueSpec).Names { v.scope.add(n.Name, kind, n.Pos()) } } default: v.unexpected(d.Pos()) } } // visit implements the visiting of globals. 
It does performs the two passes // described in the description of the globalsVisitor struct. func (v *globalsVisitor) visit() { // Gather all symbols in the global scope. This excludes methods. v.pushScope() for _, gd := range v.file.Decls { switch d := gd.(type) { case *ast.GenDecl: v.globalsFromGenDecl(d) case *ast.FuncDecl: if d.Recv == nil { v.scope.add(d.Name.Name, KindFunction, d.Name.Pos()) } default: v.unexpected(gd.Pos()) } } // Go through the contents of the declarations. for _, gd := range v.file.Decls { switch d := gd.(type) { case *ast.GenDecl: v.visitGenDecl(d) case *ast.FuncDecl: v.visitFuncDecl(d) } } } // Visit traverses the provided AST and calls f() for each identifier that // refers to global names. The global name must be defined in the file itself. // // The function f() is allowed to modify the identifier, for example, to rename // uses of global references. func Visit(fset *token.FileSet, file *ast.File, f func(*ast.Ident, SymKind), processAnon bool) { v := globalsVisitor{ fset: fset, file: file, f: f, processAnon: processAnon, } v.visit() } golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/globals/scope.go000066400000000000000000000031541465435605700253550ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package globals import ( "go/token" ) // SymKind specifies the kind of a global symbol. For example, a variable, const // function, etc. type SymKind int // Constants for different kinds of symbols. const ( KindUnknown SymKind = iota KindImport KindType KindVar KindConst KindFunction KindReceiver KindParameter KindResult KindTag ) type symbol struct { kind SymKind pos token.Pos scope *scope } type scope struct { outer *scope syms map[string]*symbol } func newScope(outer *scope) *scope { return &scope{ outer: outer, syms: make(map[string]*symbol), } } func (s *scope) isGlobal() bool { return s.outer == nil } func (s *scope) lookup(n string) *symbol { return s.syms[n] } func (s *scope) deepLookup(n string) *symbol { for x := s; x != nil; x = x.outer { if sym := x.lookup(n); sym != nil { return sym } } return nil } func (s *scope) add(name string, kind SymKind, pos token.Pos) { if s.syms[name] != nil { return } s.syms[name] = &symbol{ kind: kind, pos: pos, scope: s, } } golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/go_merge/000077500000000000000000000000001465435605700240535ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/go_merge/BUILD000066400000000000000000000004231465435605700246340ustar00rootroot00000000000000load("//tools:defs.bzl", "go_binary") package( default_applicable_licenses = ["//:license"], licenses = ["notice"], ) go_binary( name = "go_merge", srcs = ["main.go"], visibility = ["//:sandbox"], deps = [ "//tools/constraintutil", ], ) golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/go_merge/main.go000066400000000000000000000077341465435605700253410ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package main import ( "bytes" "flag" "fmt" "go/ast" "go/format" "go/parser" "go/token" "os" "path/filepath" "strconv" "gvisor.dev/gvisor/tools/constraintutil" ) var ( output = flag.String("o", "", "output `file`") ) func fatalf(s string, args ...any) { fmt.Fprintf(os.Stderr, s, args...) os.Exit(1) } func main() { flag.Usage = func() { fmt.Fprintf(os.Stderr, "Usage: %s [options] [ ...]\n", os.Args[0]) flag.PrintDefaults() } flag.Parse() if *output == "" || len(flag.Args()) == 0 { flag.Usage() os.Exit(1) } // Load all files. files := make(map[string]*ast.File) fset := token.NewFileSet() var name string for _, fname := range flag.Args() { f, err := parser.ParseFile(fset, fname, nil, parser.ParseComments|parser.DeclarationErrors|parser.SpuriousErrors) if err != nil { fatalf("%v\n", err) } files[fname] = f if name == "" { name = f.Name.Name } else if name != f.Name.Name { fatalf("Expected '%s' for package name instead of '%s'.\n", name, f.Name.Name) } } // Merge all files into one. pkg := &ast.Package{ Name: name, Files: files, } f := ast.MergePackageFiles(pkg, ast.FilterUnassociatedComments|ast.FilterFuncDuplicates|ast.FilterImportDuplicates) // Create a new declaration slice with all imports at the top, merging any // redundant imports. imports := make(map[string]*ast.ImportSpec) var importNames []string // Keep imports in the original order to get deterministic output. var anonImports []*ast.ImportSpec for _, d := range f.Decls { if g, ok := d.(*ast.GenDecl); ok && g.Tok == token.IMPORT { for _, s := range g.Specs { i := s.(*ast.ImportSpec) p, _ := strconv.Unquote(i.Path.Value) var n string if i.Name == nil { n = filepath.Base(p) } else { n = i.Name.Name } if n == "_" { anonImports = append(anonImports, i) } else { if i2, ok := imports[n]; ok { if first, second := i.Path.Value, i2.Path.Value; first != second { fatalf("Conflicting paths for import name '%s': '%s' vs. '%s'\n", n, first, second) } } else { imports[n] = i importNames = append(importNames, n) } } } } } newDecls := make([]ast.Decl, 0, len(f.Decls)) if l := len(imports) + len(anonImports); l > 0 { // Non-NoPos Lparen is needed for Go to recognize more than one spec in // ast.GenDecl.Specs. d := &ast.GenDecl{ Tok: token.IMPORT, Lparen: token.NoPos + 1, Specs: make([]ast.Spec, 0, l), } for _, i := range importNames { d.Specs = append(d.Specs, imports[i]) } for _, i := range anonImports { d.Specs = append(d.Specs, i) } newDecls = append(newDecls, d) } for _, d := range f.Decls { if g, ok := d.(*ast.GenDecl); !ok || g.Tok != token.IMPORT { newDecls = append(newDecls, d) } } f.Decls = newDecls // Infer build constraints for the output file. bcexpr, err := constraintutil.CombineFromFiles(flag.Args()) if err != nil { fatalf("Failed to read build constraints: %v\n", err) } // Write the output file. 
var buf bytes.Buffer if err := format.Node(&buf, fset, f); err != nil { fatalf("fomatting: %v\n", err) } outf, err := os.OpenFile(*output, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { fatalf("opening output: %v\n", err) } defer outf.Close() outf.WriteString(constraintutil.Lines(bcexpr)) if _, err := outf.Write(buf.Bytes()); err != nil { fatalf("write: %v\n", err) } } golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/imports.go000066400000000000000000000105711465435605700243170ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package main import ( "bytes" "fmt" "go/ast" "go/format" "go/parser" "go/token" "sort" "strconv" "gvisor.dev/gvisor/tools/go_generics/globals" ) type importedPackage struct { newName string path string } // updateImportIdent modifies the given import identifier with the new name // stored in the used map. If the identifier doesn't exist in the used map yet, // a new name is generated and inserted into the map. func updateImportIdent(orig string, imports mapValue, id *ast.Ident, used map[string]*importedPackage) error { importName := id.Name // If the name is already in the table, just use the new name. m := used[importName] if m != nil { id.Name = m.newName return nil } // Create a new entry in the used map. path := imports[importName] if path == "" { return fmt.Errorf("unknown path to package '%s', used in '%s'", importName, orig) } m = &importedPackage{ newName: fmt.Sprintf("__generics_imported%d", len(used)), path: strconv.Quote(path), } used[importName] = m id.Name = m.newName return nil } // convertExpression creates a new string that is a copy of the input one with // all imports references renamed to the names in the "used" map. If the // referenced import isn't in "used" yet, a new one is created based on the path // in "imports" and stored in "used". For example, if string s is // "math.MaxUint32-math.MaxUint16+10", it would be converted to // "x.MaxUint32-x.MathUint16+10", where x is a generated name. func convertExpression(s string, imports mapValue, used map[string]*importedPackage) (string, error) { // Parse the expression in the input string. expr, err := parser.ParseExpr(s) if err != nil { return "", fmt.Errorf("unable to parse \"%s\": %v", s, err) } // Go through the AST and update references. var retErr error ast.Inspect(expr, func(n ast.Node) bool { switch x := n.(type) { case *ast.SelectorExpr: if id := globals.GetIdent(x.X); id != nil { if err := updateImportIdent(s, imports, id, used); err != nil { retErr = err } return false } } return true }) if retErr != nil { return "", retErr } // Convert the modified AST back to a string. fset := token.NewFileSet() var buf bytes.Buffer if err := format.Node(&buf, fset, expr); err != nil { return "", err } return string(buf.Bytes()), nil } // updateImports replaces all maps in the input slice with copies where the // mapped values have had all references to imported packages renamed to // generated names. 
It also returns an import declaration for all the renamed // import packages. // // For example, if the input maps contains A=math.B and C=math.D, the updated // maps will instead contain A=__generics_imported0.B and // C=__generics_imported0.C, and the 'import __generics_imported0 "math"' would // be returned as the import declaration. func updateImports(maps []mapValue, imports mapValue) (ast.Decl, error) { importsUsed := make(map[string]*importedPackage) // Update all maps. for i, m := range maps { newMap := make(mapValue) for n, e := range m { updated, err := convertExpression(e, imports, importsUsed) if err != nil { return nil, err } newMap[n] = updated } maps[i] = newMap } // Nothing else to do if no imports are used in the expressions. if len(importsUsed) == 0 { return nil, nil } var names []string for n := range importsUsed { names = append(names, n) } // Sort the new imports for deterministic build outputs. sort.Strings(names) // Create spec array for each new import. specs := make([]ast.Spec, 0, len(importsUsed)) for _, n := range names { i := importsUsed[n] specs = append(specs, &ast.ImportSpec{ Name: &ast.Ident{Name: i.newName}, Path: &ast.BasicLit{Value: i.path}, }) } return &ast.GenDecl{ Tok: token.IMPORT, Specs: specs, Lparen: token.NoPos + 1, }, nil } golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/main.go000066400000000000000000000212241465435605700235430ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // go_generics reads a Go source file and writes a new version of that file with // a few transformations applied to each. Namely: // // 1. Global types can be explicitly renamed with the -t option. For example, // if -t=A=B is passed in, all references to A will be replaced with // references to B; a function declaration like: // // func f(arg *A) // // would be renamed to: // // func f(arg *B) // // 2. Global type definitions and their method sets will be removed when they're // being renamed with -t. For example, if -t=A=B is passed in, the following // definition and methods that existed in the input file wouldn't exist at // all in the output file: // // type A struct{} // // func (*A) f() {} // // 3. All global types, variables, constants and functions (not methods) are // prefixed and suffixed based on the option -prefix and -suffix arguments. // For example, if -suffix=A is passed in, the following globals: // // func f() // type t struct{} // // would be renamed to: // // func fA() // type tA struct{} // // Some special tags are also modified. For example: // // "state:.(t)" // // would become: // // "state:.(tA)" // // 4. The package is renamed to the value via the -p argument. // 5. Value of constants can be modified with -c argument. // // Note that not just the top-level declarations are renamed, all references to // them are also properly renamed as well, taking into account visibility rules // and shadowing. 
For example, if -suffix=A is passed in, the following: // // var b = 100 // // func f() { // g(b) // b := 0 // g(b) // } // // Would be replaced with: // // var bA = 100 // // func f() { // g(bA) // b := 0 // g(b) // } // // Note that the second call to g() kept "b" as an argument because it refers to // the local variable "b". // // Note that go_generics can handle anonymous fields with renamed types if // -anon is passed in, however it does not perform strict checking on parameter // types that share the same name as the global type and therefore will rename // them as well. // // You can see an example in the tools/go_generics/generics_tests/interface test. package main import ( "bytes" "flag" "fmt" "go/ast" "go/format" "go/parser" "go/token" "io/ioutil" "os" "regexp" "strings" "gvisor.dev/gvisor/tools/go_generics/globals" ) var ( input = flag.String("i", "", "input `file`") output = flag.String("o", "", "output `file`") suffix = flag.String("suffix", "", "`suffix` to add to each global symbol") prefix = flag.String("prefix", "", "`prefix` to add to each global symbol") packageName = flag.String("p", "main", "output package `name`") printAST = flag.Bool("ast", false, "prints the AST") processAnon = flag.Bool("anon", false, "process anonymous fields") types = make(mapValue) consts = make(mapValue) imports = make(mapValue) inputSubstr = make(mapValue) outputSubstr = make(mapValue) ) // mapValue implements flag.Value. We use a mapValue flag instead of a regular // string flag when we want to allow more than one instance of the flag. For // example, we allow several "-t A=B" arguments, and will rename them all. type mapValue map[string]string func (m mapValue) String() string { var b bytes.Buffer first := true for k, v := range m { if !first { b.WriteRune(',') } else { first = false } b.WriteString(k) b.WriteRune('=') b.WriteString(v) } return b.String() } func (m mapValue) Set(s string) error { sep := strings.Index(s, "=") if sep == -1 { return fmt.Errorf("missing '=' from '%s'", s) } m[s[:sep]] = s[sep+1:] return nil } // stateTagRegexp matches against the 'typed' state tags. var stateTagRegexp = regexp.MustCompile(`^(.*[^a-z0-9_])state:"\.\(([^\)]*)\)"(.*)$`) var identifierRegexp = regexp.MustCompile(`^(.*[^a-zA-Z_])([a-zA-Z_][a-zA-Z0-9_]*)(.*)$`) func main() { flag.Usage = func() { fmt.Fprintf(os.Stderr, "Usage: %s [options]\n", os.Args[0]) flag.PrintDefaults() } flag.Var(types, "t", "rename type A to B when `A=B` is passed in. Multiple such mappings are allowed.") flag.Var(consts, "c", "reassign constant A to value B when `A=B` is passed in. Multiple such mappings are allowed.") flag.Var(imports, "import", "specifies the import libraries to use when types are not local. `name=path` specifies that 'name', used in types as name.type, refers to the package living in 'path'.") flag.Var(inputSubstr, "in-substr", "replace input sub-string A with B when `A=B` is passed in. Multiple such mappings are allowed.") flag.Var(outputSubstr, "out-substr", "replace output sub-string A with B when `A=B` is passed in. Multiple such mappings are allowed.") flag.Parse() if *input == "" || *output == "" { flag.Usage() os.Exit(1) } // Parse the input file. 
fset := token.NewFileSet() inputBytes, err := os.ReadFile(*input) if err != nil { fmt.Fprintf(os.Stderr, "%v\n", err) os.Exit(1) } for old, new := range inputSubstr { inputBytes = bytes.ReplaceAll(inputBytes, []byte(old), []byte(new)) } f, err := parser.ParseFile(fset, *input, inputBytes, parser.ParseComments|parser.DeclarationErrors|parser.SpuriousErrors) if err != nil { fmt.Fprintf(os.Stderr, "%v\n", err) os.Exit(1) } // Print the AST if requested. if *printAST { ast.Print(fset, f) } cmap := ast.NewCommentMap(fset, f, f.Comments) // Update imports based on what's used in types and consts. maps := []mapValue{types, consts} importDecl, err := updateImports(maps, imports) if err != nil { fmt.Fprintf(os.Stderr, "%v\n", err) os.Exit(1) } types = maps[0] consts = maps[1] // Reassign all specified constants. for _, decl := range f.Decls { d, ok := decl.(*ast.GenDecl) if !ok || d.Tok != token.CONST { continue } for _, gs := range d.Specs { s := gs.(*ast.ValueSpec) for i, id := range s.Names { if n, ok := consts[id.Name]; ok { s.Values[i] = &ast.BasicLit{Value: n} } } } } // Go through all globals and their uses in the AST and rename the types // with explicitly provided names, and rename all types, variables, // consts and functions with the provided prefix and suffix. globals.Visit(fset, f, func(ident *ast.Ident, kind globals.SymKind) { if n, ok := types[ident.Name]; ok && kind == globals.KindType { ident.Name = n } else { switch kind { case globals.KindType, globals.KindVar, globals.KindConst, globals.KindFunction: if ident.Name != "_" && !(ident.Name == "init" && kind == globals.KindFunction) { ident.Name = *prefix + ident.Name + *suffix } case globals.KindTag: // Modify the state tag appropriately. if m := stateTagRegexp.FindStringSubmatch(ident.Name); m != nil { if t := identifierRegexp.FindStringSubmatch(m[2]); t != nil { typeName := *prefix + t[2] + *suffix if n, ok := types[t[2]]; ok { typeName = n } ident.Name = m[1] + `state:".(` + t[1] + typeName + t[3] + `)"` + m[3] } } } } }, *processAnon) // Remove the definition of all types that are being remapped. set := make(typeSet) for _, v := range types { set[v] = struct{}{} } removeTypes(set, f) // Add the new imports, if any, to the top. if importDecl != nil { newDecls := make([]ast.Decl, 0, len(f.Decls)+1) newDecls = append(newDecls, importDecl) newDecls = append(newDecls, f.Decls...) f.Decls = newDecls } // Update comments to remove the ones potentially associated with the // type T that we removed. f.Comments = cmap.Filter(f).Comments() // If there are file (package) comments, delete them. if f.Doc != nil { for i, cg := range f.Comments { if cg == f.Doc { f.Comments = append(f.Comments[:i], f.Comments[i+1:]...) break } } } // Write the output file. f.Name.Name = *packageName var buf bytes.Buffer if err := format.Node(&buf, fset, f); err != nil { fmt.Fprintf(os.Stderr, "%v\n", err) os.Exit(1) } byteBuf := buf.Bytes() for old, new := range outputSubstr { byteBuf = bytes.ReplaceAll(byteBuf, []byte(old), []byte(new)) } if err := ioutil.WriteFile(*output, byteBuf, 0644); err != nil { fmt.Fprintf(os.Stderr, "%v\n", err) os.Exit(1) } } golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/remove.go000066400000000000000000000057121465435605700241200ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package main import ( "go/ast" "go/token" ) type typeSet map[string]struct{} // isTypeOrPointerToType determines if the given AST expression represents a // type or a pointer to a type that exists in the provided type set. func isTypeOrPointerToType(set typeSet, expr ast.Expr, starCount int) bool { switch e := expr.(type) { case *ast.Ident: _, ok := set[e.Name] return ok case *ast.StarExpr: if starCount > 1 { return false } return isTypeOrPointerToType(set, e.X, starCount+1) case *ast.ParenExpr: return isTypeOrPointerToType(set, e.X, starCount) default: return false } } // isMethodOf determines if the given function declaration is a method of one // of the types in the provided type set. To do that, it checks if the function // has a receiver and that its type is either T or *T, where T is a type that // exists in the set. This is per the spec: // // That parameter section must declare a single parameter, the receiver. Its // type must be of the form T or *T (possibly using parentheses) where T is a // type name. The type denoted by T is called the receiver base type; it must // not be a pointer or interface type and it must be declared in the same // package as the method. func isMethodOf(set typeSet, f *ast.FuncDecl) bool { // If the function doesn't have exactly one receiver, then it's // definitely not a method. if f.Recv == nil || len(f.Recv.List) != 1 { return false } return isTypeOrPointerToType(set, f.Recv.List[0].Type, 0) } // removeTypeDefinitions removes the definition of all types contained in the // provided type set. func removeTypeDefinitions(set typeSet, d *ast.GenDecl) { if d.Tok != token.TYPE { return } i := 0 for _, gs := range d.Specs { s := gs.(*ast.TypeSpec) if _, ok := set[s.Name.Name]; !ok { d.Specs[i] = gs i++ } } d.Specs = d.Specs[:i] } // removeTypes removes from the AST the definition of all types and their // method sets that are contained in the provided type set. func removeTypes(set typeSet, f *ast.File) { // Go through the top-level declarations. 
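	// Editor-added sketch: if the provided set contains "A", the loop below
	// drops a top-level "type A struct{}" declaration as well as any function
	// whose receiver is A or *A (per isMethodOf above), and keeps every other
	// declaration in place.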
i := 0 for _, decl := range f.Decls { keep := true switch d := decl.(type) { case *ast.GenDecl: countBefore := len(d.Specs) removeTypeDefinitions(set, d) keep = countBefore == 0 || len(d.Specs) > 0 case *ast.FuncDecl: keep = !isMethodOf(set, d) } if keep { f.Decls[i] = decl i++ } } f.Decls = f.Decls[:i] } golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/rules_tests/000077500000000000000000000000001465435605700246435ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/rules_tests/BUILD000066400000000000000000000014631465435605700254310ustar00rootroot00000000000000load("//tools:defs.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package( default_applicable_licenses = ["//:license"], licenses = ["notice"], ) go_template_instance( name = "instance", out = "instance_test.go", consts = { "n": "20", "m": "\"test\"", "o": "math.MaxUint64", }, imports = { "math": "math", }, package = "template_test", template = ":test_template", types = { "t": "int", }, ) go_template( name = "test_template", srcs = [ "template.go", ], opt_consts = [ "n", "m", "o", ], opt_types = ["t"], ) go_test( name = "template_test", srcs = [ "instance_test.go", "template_test.go", ], ) golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/rules_tests/template.go000066400000000000000000000015111465435605700270030ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package template type t float const ( n t = 10.1 m = "abc" o = 0 ) func max(a, b t) t { if a > b { return a } return b } func add(a t) t { return a + n } func getName() string { return m } func getMax() uint64 { return o } golang-gvisor-gvisor-0.0~20240729.0/tools/go_generics/rules_tests/template_test.go000066400000000000000000000022511465435605700300440ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package template_test import ( "math" "testing" ) func TestMax(t *testing.T) { var a int a = max(10, 20) if a != 20 { t.Errorf("Bad result of max, got %v, want %v", a, 20) } } func TestIntConst(t *testing.T) { var a int a = add(10) if a != 30 { t.Errorf("Bad result of add, got %v, want %v", a, 30) } } func TestStrConst(t *testing.T) { v := getName() if v != "test" { t.Errorf("Bad name, got %v, want %v", v, "test") } } func TestImport(t *testing.T) { v := getMax() if v != math.MaxUint64 { t.Errorf("Bad max value, got %v, want %v", v, uint64(math.MaxUint64)) } } golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/000077500000000000000000000000001465435605700221175ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/BUILD000066400000000000000000000010201465435605700226720ustar00rootroot00000000000000load("//tools:defs.bzl", "bzl_library", "go_binary") package(default_applicable_licenses = ["//:license"]) licenses(["notice"]) go_binary( name = "go_marshal", srcs = ["main.go"], visibility = ["//:sandbox"], deps = [ "//tools/go_marshal/gomarshal", ], ) config_setting( name = "marshal_config_verbose", values = {"define": "gomarshal=verbose"}, visibility = ["//:sandbox"], ) bzl_library( name = "defs_bzl", srcs = ["defs.bzl"], visibility = ["//visibility:private"], ) golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/README.md000066400000000000000000000134501465435605700234010ustar00rootroot00000000000000This package implements the go_marshal utility. # Overview `go_marshal` is a code generation utility similar to `go_stateify` for marshalling go data structures to and from memory. `go_marshal` attempts to improve on `binary.Write` and the sentry's `binary.Marshal` by moving the expensive use of reflection from runtime to compile-time. `go_marshal` automatically generates implementations for `marshal.Marshallable` interface. Data structures that require custom serialization can be accomodated through a manual implementation this interface. Data structures can be flagged for code generation by adding a struct-level comment `// +marshal`. For additional details and options, see the documentation for the `marshal.Marshallable` interface. # Usage See `defs.bzl`: a new rule is provided, `go_marshal`. Under the hood, the `go_marshal` rule is used to generate a file that will appear in a Go target; the output file should appear explicitly in a srcs list. For example (note that the above is the preferred method): ``` load("/gvisor/tools/go_marshal:defs.bzl", "go_marshal") go_marshal( name = "foo_abi", srcs = ["foo.go"], out = "foo_abi.go", package = "foo", ) go_library( name = "foo", srcs = [ "foo.go", "foo_abi.go", ], ... ) ``` As part of the interface generation, `go_marshal` also generates some tests for sanity checking the struct definitions for potential alignment issues, and a simple round-trip test through Marshal/Unmarshal to verify the implementation. These tests use reflection to verify properties of the ABI struct, and should be considered part of the generated interfaces (but are too expensive to execute at runtime). Ensure these tests run at some point. # Restrictions Not all valid go type definitions can be used with `go_marshal`. `go_marshal` is intended for ABI structs, which have these additional restrictions: - At the moment, `go_marshal` only supports struct declarations. - Structs are marshalled as packed types. This means no implicit padding is inserted between fields shorter than the platform register size. For alignment, manually insert padding fields. 
- Structs used with `go_marshal` must have a compile-time static size. This means no dynamically sizes fields like slices or strings. Use statically sized array (byte arrays for strings) instead. - No pointers, channel, map or function pointer fields, and no fields that are arrays of these types. These don't make sense in an ABI data structure. - We could support opaque pointers as `uintptr`, but this is currently not implemented. Implementing this would require handling the architecture dependent native pointer size. - Fields must either be a primitive integer type (`byte`, `[u]int{8,16,32,64}`), or of a type that implements `marshal.Marshallable`. - `int` and `uint` fields are not allowed. Use an explicitly-sized numeric type. - `float*` fields are currently not supported, but could be if necessary. # Appendix ## Working with Non-Packed Structs ABI structs must generally be packed types, meaning they should have no implicit padding between short fields. However, if a field is tagged `marshal:"unaligned"`, `go_marshal` will fall back to a safer but slower mechanism to deal with potentially unaligned fields. Note that the non-packed property is inheritted by any other struct that embeds this struct, since the `go_marshal` tool currently can't reason about alignments for embedded structs that are not aligned. Because of this, it's generally best to avoid using `marshal:"unaligned"` and insert explicit padding fields instead. ## Working with dynamically sized structs While `go_marshal` seamlessly supports statically sized structs (which most ABI structs are), it can also used for other uses cases where marshalling is required. There is some provision to partially support dynamically sized structs that may not be ABI structs. A user can define a dynamic struct and define `SizeBytes()`, `MarshalBytes(dst)` and `UnmarshalBytes(src)` for it. Then user can then add a comment above the struct like `// +marshal dynamic` while will make `go_marshal` autogenerate the remaining methods required to complete the `Marshallable` interface. This feature is currently only available for structs and can not be used alongside the Slice API. ## Modifying the `go_marshal` Tool The following are some guidelines for modifying the `go_marshal` tool: - The `go_marshal` tool currently does a single pass over all types requesting code generation, in arbitrary order. This means the generated code can't directly obtain information about embedded marshallable types at compile-time. One way to work around this restriction is to add a new Marshallable interface method providing this piece of information, and calling it from the generated code. Use this sparingly, as we want to rely on compile-time information as much as possible for performance. - No runtime reflection in the code generated for the marshallable interface. The entire point of the tool is to avoid runtime reflection. The generated tests may use reflection. ## Debugging To enable debugging output from the go-marshal tool, use one of the following options, depending on how go-marshal is being invoked: - Pass `--define gomarshal=verbose` to the bazel command. Note that this can generate a lot of output depending on what's being compiled, as this will enable debugging for all packages built by the command. - Set `marshal_debug = True` on the top-level `go_library` BUILD rule. - Set `debug = True` on the `go_marshal` BUILD rule. - Pass `-debug` to the go-marshal tool invocation. 
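As a rough sketch of the BUILD-level knobs above (target names are
placeholders, mirroring the usage example from the Usage section):

```
go_marshal(
    name = "foo_abi",
    srcs = ["foo.go"],
    out = "foo_abi.go",
    package = "foo",
    debug = True,
)
```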
If bazel complains about stdout output being too large, set a larger value through `--experimental_ui_max_stdouterr_bytes`, or `-1` for unlimited output. golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/analysis/000077500000000000000000000000001465435605700237425ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/analysis/BUILD000066400000000000000000000003771465435605700245330ustar00rootroot00000000000000load("//tools:defs.bzl", "go_library") package(default_applicable_licenses = ["//:license"]) licenses(["notice"]) go_library( name = "analysis", testonly = 1, srcs = ["analysis_unsafe.go"], visibility = [ "//:sandbox", ], ) golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/analysis/analysis_unsafe.go000066400000000000000000000157701465435605700274670ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package analysis implements common functionality used by generated // go_marshal tests. package analysis // All functions in this package are unsafe and are not intended for general // consumption. They contain sharp edge cases and the caller is responsible for // ensuring none of them are hit. Callers must be carefully to pass in only sane // arguments. Failure to do so may cause panics at best and arbitrary memory // corruption at worst. // // Never use outside of tests. import ( "fmt" "math/rand" "reflect" "testing" "unsafe" ) // RandomizeValue assigns random value(s) to an abitrary type. This is intended // for used with ABI structs from go_marshal, meaning the typical restrictions // apply (fixed-size types, no pointers, maps, channels, etc), and should only // be used on zeroed values to avoid overwriting pointers to active go objects. // // Internally, we populate the type with random data by doing an unsafe cast to // access the underlying memory of the type and filling it as if it were a byte // slice. This almost gets us what we want, but padding fields named "_" are // normally not accessible, so we walk the type and recursively zero all "_" // fields. // // Precondition: x must be a pointer. x must not contain any valid // pointers to active go objects (pointer fields aren't allowed in ABI // structs anyways), or we'd be violating the go runtime contract and // the GC may malfunction. func RandomizeValue(x any) { v := reflect.Indirect(reflect.ValueOf(x)) if !v.CanSet() { panic("RandomizeType() called with an unaddressable value. You probably need to pass a pointer to the argument") } // Cast the underlying memory for the type into a byte slice. var b []byte hdr := (*reflect.SliceHeader)(unsafe.Pointer(&b)) // Note: v.UnsafeAddr panics if x is passed by value. x should be a pointer. hdr.Data = v.UnsafeAddr() hdr.Len = int(v.Type().Size()) hdr.Cap = hdr.Len // Fill the byte slice with random data, which in effect fills the type with // random values. 
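	// Editor-added usage sketch (test-only; the struct type Foo is
	// hypothetical):
	//
	//	var f Foo
	//	analysis.RandomizeValue(&f)
	//
	// Note that a pointer must be passed, per the precondition above.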
n, err := rand.Read(b) if err != nil || n != len(b) { panic("unreachable") } // Normally, padding fields are not accessible, so zero them out. reflectZeroPaddingFields(v.Type(), b, false) } // reflectZeroPaddingFields assigns zero values to padding fields for the value // of type r, represented by the memory in data. Padding fields are defined as // fields with the name "_". If zero is true, the immediate value itself is // zeroed. In addition, the type is recursively scanned for padding fields in // inner types. // // This is used for zeroing padding fields after calling RandomizeValue. func reflectZeroPaddingFields(r reflect.Type, data []byte, zero bool) { if zero { clear(data) } switch r.Kind() { case reflect.Int8, reflect.Uint8, reflect.Int16, reflect.Uint16, reflect.Int32, reflect.Uint32, reflect.Int64, reflect.Uint64: // These types are explicitly allowed in an ABI type, but we don't need // to recurse further as they're scalar types. case reflect.Struct: for i, numFields := 0, r.NumField(); i < numFields; i++ { f := r.Field(i) off := f.Offset len := f.Type.Size() window := data[off : off+len] reflectZeroPaddingFields(f.Type, window, f.Name == "_") } case reflect.Array: eLen := int(r.Elem().Size()) if int(r.Size()) != eLen*r.Len() { panic("Array has unexpected size?") } for i, n := 0, r.Len(); i < n; i++ { reflectZeroPaddingFields(r.Elem(), data[i*eLen:(i+1)*eLen], false) } default: panic(fmt.Sprintf("Type %v not allowed in ABI struct", r.Kind())) } } // AlignmentCheck ensures the definition of the type represented by typ doesn't // cause the go compiler to emit implicit padding between elements of the type // (i.e. fields in a struct). // // AlignmentCheck doesn't explicitly recurse for embedded structs because any // struct present in an ABI struct must also be Marshallable, and therefore // they're aligned by definition (or their alignment check would have failed). func AlignmentCheck(t *testing.T, typ reflect.Type) (ok bool, delta uint64) { switch typ.Kind() { case reflect.Int8, reflect.Uint8, reflect.Int16, reflect.Uint16, reflect.Int32, reflect.Uint32, reflect.Int64, reflect.Uint64: // Primitive types are always considered well aligned. Primitive types // that are fields in structs are checked independently, this branch // exists to handle recursive calls to alignmentCheck. case reflect.Struct: xOff := 0 nextXOff := 0 skipNext := false for i, numFields := 0, typ.NumField(); i < numFields; i++ { xOff = nextXOff f := typ.Field(i) fmt.Printf("Checking alignment of %s.%s @ %d [+%d]...\n", typ.Name(), f.Name, f.Offset, f.Type.Size()) nextXOff = int(f.Offset + f.Type.Size()) if f.Name == "_" { // Padding fields need not be aligned. fmt.Printf("Padding field of type %v\n", f.Type) continue } if tag, ok := f.Tag.Lookup("marshal"); ok && tag == "unaligned" { skipNext = true continue } if skipNext { skipNext = false fmt.Printf("Skipping alignment check for field %s.%s explicitly marked as unaligned.\n", typ.Name(), f.Name) continue } if xOff != int(f.Offset) { implicitPad := int(f.Offset) - xOff t.Fatalf("Suspect offset for field %s.%s, detected an implicit %d byte padding from offset %d to %d; either add %d bytes of explicit padding before this field or tag it as `marshal:\"unaligned\"`.", typ.Name(), f.Name, implicitPad, xOff, f.Offset, implicitPad) } } // Ensure structs end on a byte explicitly defined by the type. 
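	// Editor-added sketch (hypothetical type): for
	//
	//	type S struct {
	//		A uint64
	//		B uint8
	//	}
	//
	// the struct size is 16 on 64-bit platforms while the last field ends at
	// offset 9, so the check below reports 7 bytes of implicit trailing
	// padding unless B is tagged `marshal:"unaligned"` or explicit padding is
	// appended.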
if typ.NumField() > 0 && nextXOff != int(typ.Size()) { implicitPad := int(typ.Size()) - nextXOff f := typ.Field(typ.NumField() - 1) // Final field if tag, ok := f.Tag.Lookup("marshal"); ok && tag == "unaligned" { // Final field explicitly marked unaligned. break } t.Fatalf("Suspect offset for field %s.%s at the end of %s, detected an implicit %d byte padding from offset %d to %d at the end of the struct; either add %d bytes of explict padding at end of the struct or tag the final field %s as `marshal:\"unaligned\"`.", typ.Name(), f.Name, typ.Name(), implicitPad, nextXOff, typ.Size(), implicitPad, f.Name) } case reflect.Array: // Independent arrays are also always considered well aligned. We only // need to worry about their alignment when they're embedded in structs, // which we handle above. default: t.Fatalf("Unsupported type in ABI struct while checking for field alignment for type: %v", typ.Kind()) } return true, uint64(typ.Size()) } golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/defs.bzl000066400000000000000000000045001465435605700235500ustar00rootroot00000000000000"""Marshal is a tool for generating marshalling interfaces for Go types.""" def _go_marshal_impl(ctx): """Execute the go_marshal tool.""" output = ctx.outputs.lib output_test = ctx.outputs.test output_test_unconditional = ctx.outputs.test_unconditional # Run the marshal command. args = ["-output=%s" % output.path] args.append("-pkg=%s" % ctx.attr.package) args.append("-output_test=%s" % output_test.path) args.append("-output_test_unconditional=%s" % output_test_unconditional.path) if ctx.attr.debug: args += ["-debug"] args += ["--"] for src in ctx.attr.srcs: args += [f.path for f in src.files.to_list()] ctx.actions.run( inputs = ctx.files.srcs, outputs = [output, output_test, output_test_unconditional], mnemonic = "GoMarshal", progress_message = "go_marshal: %s" % ctx.label, arguments = args, executable = ctx.executable._tool, ) # Generates save and restore logic from a set of Go files. # # Args: # name: the name of the rule. # srcs: the input source files. These files should include all structs in the # package that need to be saved. # imports: an optional list of extra, non-aliased, Go-style absolute import # paths. # out: the name of the generated file output. This must not conflict with any # other files and must be added to the srcs of the relevant go_library. # package: the package name for the input sources. go_marshal = rule( implementation = _go_marshal_impl, attrs = { "srcs": attr.label_list(mandatory = True, allow_files = True), "imports": attr.string_list(mandatory = False), "package": attr.string(mandatory = True), "debug": attr.bool(doc = "enable debugging output from the go_marshal tool"), "_tool": attr.label(executable = True, cfg = "exec", default = Label("//tools/go_marshal:go_marshal")), }, outputs = { "lib": "%{name}_unsafe.go", "test": "%{name}_test.go", "test_unconditional": "%{name}_unconditional_test.go", }, ) # marshal_deps are the dependencies requied by generated code. marshal_deps = [ "//pkg/gohacks", "//pkg/hostarch", "//pkg/marshal", ] # marshal_test_deps are required by test targets. 
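#
# As an aside, a minimal illustrative use of the go_marshal rule documented
# above (target and file names are hypothetical, not taken from this tree):
#
#   load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps")
#
#   go_marshal(
#       name = "foo_abi",
#       srcs = ["foo.go"],
#       package = "foo",
#   )
#
#   go_library(
#       name = "foo",
#       srcs = [
#           "foo.go",
#           "foo_abi_unsafe.go",  # the "lib" output declared by go_marshal
#       ],
#       deps = marshal_deps,
#   )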
marshal_test_deps = [ "//tools/go_marshal/analysis", ] golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/gomarshal/000077500000000000000000000000001465435605700240745ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/gomarshal/BUILD000066400000000000000000000011031465435605700246510ustar00rootroot00000000000000load("//tools:defs.bzl", "go_library") package(default_applicable_licenses = ["//:license"]) licenses(["notice"]) go_library( name = "gomarshal", srcs = [ "generator.go", "generator_interfaces.go", "generator_interfaces_array_newtype.go", "generator_interfaces_dynamic.go", "generator_interfaces_primitive_newtype.go", "generator_interfaces_struct.go", "generator_tests.go", "util.go", ], stateify = False, visibility = [ "//:sandbox", ], deps = ["//tools/constraintutil"], ) golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/gomarshal/generator.go000066400000000000000000000476361465435605700264310ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package gomarshal implements the go_marshal code generator. See README.md. package gomarshal import ( "bytes" "fmt" "go/ast" "go/parser" "go/token" "os" "sort" "strings" "gvisor.dev/gvisor/tools/constraintutil" ) // List of identifiers we use in generated code that may conflict with a // similarly-named source identifier. Abort gracefully when we see these to // avoid potentially confusing compilation failures in generated code. // // This only applies to import aliases at the moment. All other identifiers // are qualified by a receiver argument, since they're struct fields. // // All recievers are single letters, so we don't allow import aliases to be a // single letter. var badIdents = []string{ "addr", "blk", "buf", "cc", "dst", "dsts", "count", "err", "hdr", "idx", "inner", "length", "limit", "ptr", "size", "src", "srcs", "val", // All single-letter identifiers. } // Constructed fromt badIdents in init(). var badIdentsMap map[string]struct{} func init() { badIdentsMap = make(map[string]struct{}) for _, ident := range badIdents { badIdentsMap[ident] = struct{}{} } } // Generator drives code generation for a single invocation of the go_marshal // utility. // // The Generator holds arguments passed to the tool, and drives parsing, // processing and code Generator for all types marked with +marshal declared in // the input files. // // See Generator.run() as the entry point. type Generator struct { // Paths to input go source files. inputs []string // Output file to write generated go source. output *os.File // Output file to write generated tests. outputTest *os.File // Output file to write unconditionally generated tests. outputTestUC *os.File // Package name for the generated file. pkg string // Set of extra packages to import in the generated file. imports *importTable } // NewGenerator creates a new code Generator. 
func NewGenerator(srcs []string, out, outTest, outTestUnconditional, pkg string, imports []string) (*Generator, error) { f, err := os.OpenFile(out, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { return nil, fmt.Errorf("couldn't open output file %q: %w", out, err) } fTest, err := os.OpenFile(outTest, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { return nil, fmt.Errorf("couldn't open test output file %q: %w", out, err) } fTestUC, err := os.OpenFile(outTestUnconditional, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { return nil, fmt.Errorf("couldn't open unconditional test output file %q: %w", out, err) } g := Generator{ inputs: srcs, output: f, outputTest: fTest, outputTestUC: fTestUC, pkg: pkg, imports: newImportTable(), } for _, i := range imports { // All imports on the extra imports list are unconditionally marked as // used, so that they're always added to the generated code. g.imports.add(i).markUsed() } // The following imports may or may not be used by the generated code, // depending on what's required for the target types. Don't mark these as // used by default. g.imports.add("io") g.imports.add("reflect") g.imports.add("runtime") g.imports.add("unsafe") g.imports.add("gvisor.dev/gvisor/pkg/gohacks") g.imports.add("gvisor.dev/gvisor/pkg/hostarch") g.imports.add("gvisor.dev/gvisor/pkg/marshal") return &g, nil } // writeHeader writes the header for the generated source file. The header // includes the package name, package level comments and import statements. func (g *Generator) writeHeader() error { var b sourceBuffer b.emit("// Automatically generated marshal implementation. See tools/go_marshal.\n\n") bcexpr, err := constraintutil.CombineFromFiles(g.inputs) if err != nil { return err } if bcexpr != nil { // Emit build constraints. b.emit("// If there are issues with build constraint aggregation, see\n") b.emit("// tools/go_marshal/gomarshal/generator.go:writeHeader(). The constraints here\n") b.emit("// come from the input set of files used to generate this file. This input set\n") b.emit("// is filtered based on pre-defined file suffixes related to build constraints,\n") b.emit("// see tools/defs.bzl:calculate_sets().\n\n") b.emit(constraintutil.Lines(bcexpr)) } // Package header. b.emit("package %s\n\n", g.pkg) if err := b.write(g.output); err != nil { return err } return g.imports.write(g.output) } // writeTypeChecks writes a statement to force the compiler to perform a type // check for all Marshallable types referenced by the generated code. func (g *Generator) writeTypeChecks(ms map[string]struct{}) error { if len(ms) == 0 { return nil } msl := make([]string, 0, len(ms)) for m := range ms { msl = append(msl, m) } sort.Strings(msl) var buf bytes.Buffer fmt.Fprint(&buf, "// Marshallable types used by this file.\n") for _, m := range msl { fmt.Fprintf(&buf, "var _ marshal.Marshallable = (*%s)(nil)\n", m) } fmt.Fprint(&buf, "\n") _, err := fmt.Fprint(g.output, buf.String()) return err } // parse processes all input files passed this generator and produces a set of // parsed go ASTs. func (g *Generator) parse() ([]*ast.File, []*token.FileSet, error) { debugf("go_marshal invoked with %d input files:\n", len(g.inputs)) for _, path := range g.inputs { debugf(" %s\n", path) } files := make([]*ast.File, 0, len(g.inputs)) fsets := make([]*token.FileSet, 0, len(g.inputs)) for _, path := range g.inputs { fset := token.NewFileSet() f, err := parser.ParseFile(fset, path, nil, parser.ParseComments) if err != nil { // Not a valid input file? 
return nil, nil, fmt.Errorf("input %q can't be parsed: %w", path, err) } if debugEnabled() { debugf("AST for %q:\n", path) ast.Print(fset, f) } files = append(files, f) fsets = append(fsets, fset) } return files, fsets, nil } // sliceAPI carries information about the '+marshal slice' directive. type sliceAPI struct { // Comment node in the AST containing the +marshal tag. comment *ast.Comment // Identifier fragment to use when naming generated functions for the slice // API. ident string // Whether the generated functions should reference the newtype name, or the // inner type name. Only meaningful on newtype declarations on primitives. inner bool } // marshallableType carries information about a type marked with the '+marshal' // directive. type marshallableType struct { spec *ast.TypeSpec slice *sliceAPI recv string dynamic bool boundCheck bool } func newMarshallableType(fset *token.FileSet, tagLine *ast.Comment, spec *ast.TypeSpec) *marshallableType { mt := &marshallableType{ spec: spec, slice: nil, } var unhandledTags []string for _, tag := range strings.Fields(strings.TrimPrefix(tagLine.Text, "// +marshal")) { if strings.HasPrefix(tag, "slice:") { tokens := strings.Split(tag, ":") if len(tokens) < 2 || len(tokens) > 3 { abortAt(fset.Position(tagLine.Slash), fmt.Sprintf("+marshal directive has invalid 'slice' clause. Expecting format 'slice:[:inner]', got '%v'", tag)) } if len(tokens[1]) == 0 { abortAt(fset.Position(tagLine.Slash), "+marshal slice directive has empty identifier argument. Expecting '+marshal slice:identifier'") } sa := &sliceAPI{ comment: tagLine, ident: tokens[1], } mt.slice = sa if len(tokens) == 3 { if tokens[2] != "inner" { abortAt(fset.Position(tagLine.Slash), "+marshal slice directive has an invalid argument. Expecting '+marshal slice:[:inner]'") } sa.inner = true } continue } else if tag == "dynamic" { mt.dynamic = true continue } else if tag == "boundCheck" { mt.boundCheck = true continue } unhandledTags = append(unhandledTags, tag) } if len(unhandledTags) > 0 { abortAt(fset.Position(tagLine.Slash), fmt.Sprintf("+marshal directive contained the following unknown clauses: %v", strings.Join(unhandledTags, " "))) } return mt } // collectMarshallableTypes walks the parsed AST and collects a list of type // declarations for which we need to generate the Marshallable interface. func (g *Generator) collectMarshallableTypes(a *ast.File, f *token.FileSet) map[*ast.TypeSpec]*marshallableType { recv := make(map[string]string) // Type name to recevier name. types := make(map[*ast.TypeSpec]*marshallableType) for _, decl := range a.Decls { gdecl, ok := decl.(*ast.GenDecl) // Type declaration? if !ok || gdecl.Tok != token.TYPE { // Is this a function declaration? We remember receiver names. d, ok := decl.(*ast.FuncDecl) if ok && d.Recv != nil && len(d.Recv.List) == 1 { // Accept concrete methods & pointer methods. ident, ok := d.Recv.List[0].Type.(*ast.Ident) if !ok { var st *ast.StarExpr st, ok = d.Recv.List[0].Type.(*ast.StarExpr) if ok { ident, ok = st.X.(*ast.Ident) } } // The receiver name may be not present. if ok && len(d.Recv.List[0].Names) == 1 { // Recover the type receiver name in this case. recv[ident.Name] = d.Recv.List[0].Names[0].Name } } debugfAt(f.Position(decl.Pos()), "Skipping declaration since it's not a type declaration.\n") continue } // Does it have a comment? if gdecl.Doc == nil { debugfAt(f.Position(gdecl.Pos()), "Skipping declaration since it doesn't have a comment.\n") continue } // Does the comment contain a "+marshal" line? 
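// For reference, the directive forms recognized by newMarshallableType above
// look like the following (type and identifier names are illustrative):
//
//   // +marshal
//   // +marshal boundCheck
//   // +marshal dynamic
//   // +marshal slice:FooSlice
//   // +marshal slice:FooSlice:inner
//
// and several clauses may share one line, e.g.
// "// +marshal boundCheck slice:FooSlice".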
marked := false var tagLine *ast.Comment for _, c := range gdecl.Doc.List { if strings.HasPrefix(c.Text, "// +marshal") { marked = true tagLine = c break } } if !marked { debugfAt(f.Position(gdecl.Pos()), "Skipping declaration since it doesn't have a comment containing +marshal line.\n") continue } for _, spec := range gdecl.Specs { // We already confirmed we're in a type declaration earlier, so this // cast will succeed. t := spec.(*ast.TypeSpec) switch t.Type.(type) { case *ast.StructType: debugfAt(f.Position(t.Pos()), "Collected marshallable struct %s.\n", t.Name.Name) case *ast.Ident: // Newtype on primitive. debugfAt(f.Position(t.Pos()), "Collected marshallable newtype on primitive %s.\n", t.Name.Name) case *ast.ArrayType: // Newtype on array. debugfAt(f.Position(t.Pos()), "Collected marshallable newtype on array %s.\n", t.Name.Name) default: // A user specifically requested marshalling on this type, but we // don't support it. abortAt(f.Position(t.Pos()), fmt.Sprintf("Marshalling codegen was requested on type '%s', but go-marshal doesn't support this kind of declaration.\n", t.Name)) } types[t] = newMarshallableType(f, tagLine, t) } } // Update the types with the last seen receiver. As long as the // receiver name is consistent for the type, then we will generate // code that is still consistent with itself. for t, mt := range types { r, ok := recv[t.Name.Name] if !ok { mt.recv = receiverName(t) // Default. continue } mt.recv = r // Last seen. } return types } // collectImports collects all imports from all input source files. Some of // these imports are copied to the generated output, if they're referenced by // the generated code. // // collectImports de-duplicates imports while building the list, and ensures // identifiers in the generated code don't conflict with any imported package // names. func (g *Generator) collectImports(a *ast.File, f *token.FileSet) map[string]importStmt { is := make(map[string]importStmt) for _, decl := range a.Decls { gdecl, ok := decl.(*ast.GenDecl) // Import statement? if !ok || gdecl.Tok != token.IMPORT { continue } for _, spec := range gdecl.Specs { i := g.imports.addFromSpec(spec.(*ast.ImportSpec), f) debugf("Collected import '%s' as '%s'\n", i.path, i.name) // Make sure we have an import that doesn't use any local names that // would conflict with identifiers in the generated code. if len(i.name) == 1 && i.name != "_" { abortAt(f.Position(spec.Pos()), fmt.Sprintf("Import has a single character local name '%s'; this may conflict with code generated by go_marshal, use a multi-character import alias", i.name)) } if _, ok := badIdentsMap[i.name]; ok { abortAt(f.Position(spec.Pos()), fmt.Sprintf("Import name '%s' is likely to conflict with code generated by go_marshal, use a different import alias", i.name)) } } } return is } func (g *Generator) generateOne(t *marshallableType, fset *token.FileSet) *interfaceGenerator { i := newInterfaceGenerator(t.spec, t.recv, fset) if t.dynamic { if t.slice != nil { abortAt(fset.Position(t.slice.comment.Slash), "Slice API is not supported for dynamic types because it assumes that each slice element is statically sized.") } if t.boundCheck { abortAt(fset.Position(t.slice.comment.Slash), "Can not generate Checked methods for dynamic types. Has to be implemented manually.") } // No validation needed, assume the user knows what they are doing. 
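// To make the dynamic case concrete (a sketch; the type and methods below are
// hypothetical): a "// +marshal dynamic" type hand-writes SizeBytes,
// MarshalBytes and UnmarshalBytes itself, e.g.
//
//   // +marshal dynamic
//   type Filename struct{ name string }
//
//   func (f *Filename) SizeBytes() int { return len(f.name) + 1 }
//   func (f *Filename) MarshalBytes(dst []byte) []byte { ... }
//   func (f *Filename) UnmarshalBytes(src []byte) []byte { ... }
//
// and emitMarshallableForDynamicType below derives the remaining
// marshal.Marshallable methods (Packed, MarshalUnsafe/UnmarshalUnsafe, the
// CopyIn/CopyOut family and WriteTo) from those three.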
i.emitMarshallableForDynamicType() return i } switch ty := t.spec.Type.(type) { case *ast.StructType: i.validateStruct(t.spec, ty) i.emitMarshallableForStruct(ty) if t.boundCheck { i.emitCheckedMarshallableForStruct() } if t.slice != nil { i.emitMarshallableSliceForStruct(ty, t.slice) } case *ast.Ident: i.validatePrimitiveNewtype(ty) i.emitMarshallableForPrimitiveNewtype(ty) if t.boundCheck { i.emitCheckedMarshallableForPrimitiveNewtype() } if t.slice != nil { i.emitMarshallableSliceForPrimitiveNewtype(ty, t.slice) } case *ast.ArrayType: i.validateArrayNewtype(t.spec.Name, ty) // After validate, we can safely call arrayLen. i.emitMarshallableForArrayNewtype(t.spec.Name, ty, ty.Elt.(*ast.Ident)) if t.boundCheck { i.emitCheckedMarshallableForArrayNewtype() } if t.slice != nil { abortAt(fset.Position(t.slice.comment.Slash), "Array type marked as '+marshal slice:...', but this is not supported. Perhaps fold one of the dimensions?") } default: // This should've been filtered out by collectMarshallabeTypes. panic(fmt.Sprintf("Unexpected type %+v", ty)) } return i } // generateOneTestSuite generates a test suite for the automatically generated // implementations type t. func (g *Generator) generateOneTestSuite(t *marshallableType) *testGenerator { i := newTestGenerator(t.spec, t.recv) i.emitTests(t.slice, t.boundCheck) return i } // Run is the entry point to code generation using g. // // Run parses all input source files specified in g and emits generated code. func (g *Generator) Run() error { // Parse our input source files into ASTs and token sets. asts, fsets, err := g.parse() if err != nil { return err } if len(asts) != len(fsets) { panic("ASTs and FileSets don't match") } // Map of imports in source files; key = local package name, value = import // path. is := make(map[string]importStmt) for i, a := range asts { // Collect all imports from the source files. We may need to copy some // of these to the generated code if they're referenced. This has to be // done before the loop below because we need to process all ASTs before // we start requesting imports to be copied one by one as we encounter // them in each generated source. for name, i := range g.collectImports(a, fsets[i]) { is[name] = i } } var impls []*interfaceGenerator var ts []*testGenerator // Set of Marshallable types referenced by generated code. ms := make(map[string]struct{}) for i, a := range asts { // Collect type declarations marked for code generation and generate // Marshallable interfaces. var sortedTypes []*marshallableType for _, t := range g.collectMarshallableTypes(a, fsets[i]) { sortedTypes = append(sortedTypes, t) } sort.Slice(sortedTypes, func(x, y int) bool { // Sort by type name, which should be unique within a package. return sortedTypes[x].spec.Name.String() < sortedTypes[y].spec.Name.String() }) for _, t := range sortedTypes { impl := g.generateOne(t, fsets[i]) // Collect Marshallable types referenced by the generated code. for ref := range impl.ms { ms[ref] = struct{}{} } impls = append(impls, impl) // Collect imports referenced by the generated code and add them to // the list of imports we need to copy to the generated code. for name := range impl.is { if !g.imports.markUsed(name) { panic(fmt.Sprintf("Generated code for '%s' referenced a non-existent import with local name '%s'. 
Either go-marshal needs to add an import to the generated file, or a package in an input source file has a package name differ from the final component of its path, which go-marshal doesn't know how to detect; use an import alias to work around this limitation.", impl.typeName(), name)) } } // Do not generate tests for dynamic types because they inherently // violate some go_marshal requirements. if !t.dynamic { ts = append(ts, g.generateOneTestSuite(t)) } } } // Write output file header. These include things like package name and // import statements. if err := g.writeHeader(); err != nil { return err } // Write type checks for referenced marshallable types to output file. if err := g.writeTypeChecks(ms); err != nil { return err } // Write generated interfaces to output file. for _, i := range impls { if err := i.write(g.output); err != nil { return err } } // Write generated tests to test file. return g.writeTests(ts) } // writeTests outputs tests for the generated interface implementations to a go // source file. func (g *Generator) writeTests(ts []*testGenerator) error { var b sourceBuffer // Write the unconditional test file. This file is always compiled, // regardless of what build tags were specified on the original input // files. We use this file to guarantee we never end up with an empty test // file, as that causes the build to fail with "no tests/benchmarks/examples // found". // // There's no easy way to determine ahead of time if we'll end up with an // empty build file since build constraints can arbitrarily cause some of // the original types to be not defined. We also have no way to tell bazel // to omit the entire test suite since the output files are already defined // before go-marshal is called. b.emit("// Automatically generated marshal tests. See tools/go_marshal.\n\n") b.emit("package %s\n\n", g.pkg) b.emit("func Example() {\n") b.inIndent(func() { b.emit("// This example is intentionally empty, and ensures this package contains at\n") b.emit("// least one testable entity. go-marshal is forced to emit a test package if the\n") b.emit("// input package is marked marshallable, but emitting no testable entities \n") b.emit("// results in a build failure.\n") }) b.emit("}\n") if err := b.write(g.outputTestUC); err != nil { return err } // Now generate the real test file that contains the real types we // processed. These need to be conditionally compiled according to the build // tags, as the original types may not be defined under all build // configurations. b.reset() b.emit("// Automatically generated marshal tests. See tools/go_marshal.\n\n") // Emit build constraints. bcexpr, err := constraintutil.CombineFromFiles(g.inputs) if err != nil { return err } b.emit(constraintutil.Lines(bcexpr)) b.emit("package %s\n\n", g.pkg) if err := b.write(g.outputTest); err != nil { return err } // Collect and write test import statements. imports := newImportTable() for _, t := range ts { imports.merge(t.imports) } if err := imports.write(g.outputTest); err != nil { return err } // Write test functions. for _, t := range ts { if err := t.write(g.outputTest); err != nil { return err } } return nil } golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/gomarshal/generator_interfaces.go000066400000000000000000000233631465435605700306230ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gomarshal import ( "fmt" "go/ast" "go/token" "strings" ) // interfaceGenerator generates marshalling interfaces for a single type. // // getState is not thread-safe. type interfaceGenerator struct { sourceBuffer // The type we're serializing. t *ast.TypeSpec // Receiver argument for generated methods. r string // FileSet containing the tokens for the type we're processing. f *token.FileSet // is records external packages referenced by the generated implementation. is map[string]struct{} // ms records Marshallable types referenced by the generated implementation // of t's interfaces. ms map[string]struct{} // as records fields in t that are potentially not packed. The key is the // accessor for the field. as map[string]struct{} } // typeName returns the name of the type this g represents. func (g *interfaceGenerator) typeName() string { return g.t.Name.Name } // newinterfaceGenerator creates a new interface generator. func newInterfaceGenerator(t *ast.TypeSpec, r string, fset *token.FileSet) *interfaceGenerator { g := &interfaceGenerator{ t: t, r: r, f: fset, is: make(map[string]struct{}), ms: make(map[string]struct{}), as: make(map[string]struct{}), } g.recordUsedMarshallable(g.typeName()) return g } func (g *interfaceGenerator) recordUsedMarshallable(m string) { g.ms[m] = struct{}{} } func (g *interfaceGenerator) recordUsedImport(i string) { g.is[i] = struct{}{} } func (g *interfaceGenerator) recordPotentiallyNonPackedField(fieldName string) { // Some calls to g.unmarshalScalar() occur in emitted loops that use "idx" // as a loop variable, passing "field[idx]" as the accessor. When // g.unmarshalScalar() calls this function, we need to convert such cases // to "field[0]" for g.areFieldsPackedExpression(), which is used in // contexts where "idx" is not defined. fieldName = strings.ReplaceAll(fieldName, "[idx]", "[0]") g.as[fieldName] = struct{}{} } // abortAt aborts the go_marshal tool with the given error message, with a // reference position to the input source. Same as abortAt, but uses g to // resolve p to position. func (g *interfaceGenerator) abortAt(p token.Pos, msg string) { abortAt(g.f.Position(p), msg) } // scalarSize returns the size of type identified by t. If t isn't a primitive // type, the size isn't known at code generation time, and must be resolved via // the marshal.Marshallable interface. func (g *interfaceGenerator) scalarSize(t *ast.Ident) (size int, unknownSize bool) { switch t.Name { case "int8", "uint8", "byte": return 1, false case "int16", "uint16": return 2, false case "int32", "uint32": return 4, false case "int64", "uint64": return 8, false default: return 0, true } } func (g *interfaceGenerator) shift(bufVar string, n int) { g.emit("%s = %s[%d:]\n", bufVar, bufVar, n) } func (g *interfaceGenerator) shiftDynamic(bufVar, name string) { g.emit("%s = %s[%s.SizeBytes():]\n", bufVar, bufVar, name) } // marshalScalar writes a single scalar to a byte slice. 
func (g *interfaceGenerator) marshalScalar(accessor, typ, bufVar string) { switch typ { case "int8", "uint8", "byte": g.emit("%s[0] = byte(%s)\n", bufVar, accessor) g.shift(bufVar, 1) case "int16", "uint16": g.recordUsedImport("hostarch") g.emit("hostarch.ByteOrder.PutUint16(%s[:2], uint16(%s))\n", bufVar, accessor) g.shift(bufVar, 2) case "int32", "uint32": g.recordUsedImport("hostarch") g.emit("hostarch.ByteOrder.PutUint32(%s[:4], uint32(%s))\n", bufVar, accessor) g.shift(bufVar, 4) case "int64", "uint64": g.recordUsedImport("hostarch") g.emit("hostarch.ByteOrder.PutUint64(%s[:8], uint64(%s))\n", bufVar, accessor) g.shift(bufVar, 8) default: g.emit("%s = %s.MarshalUnsafe(%s)\n", bufVar, accessor, bufVar) } } // unmarshalScalar reads a single scalar from a byte slice. func (g *interfaceGenerator) unmarshalScalar(accessor, typ, bufVar string) { switch typ { case "byte": g.emit("%s = %s[0]\n", accessor, bufVar) g.shift(bufVar, 1) case "int8", "uint8": g.emit("%s = %s(%s[0])\n", accessor, typ, bufVar) g.shift(bufVar, 1) case "int16", "uint16": g.recordUsedImport("hostarch") g.emit("%s = %s(hostarch.ByteOrder.Uint16(%s[:2]))\n", accessor, typ, bufVar) g.shift(bufVar, 2) case "int32", "uint32": g.recordUsedImport("hostarch") g.emit("%s = %s(hostarch.ByteOrder.Uint32(%s[:4]))\n", accessor, typ, bufVar) g.shift(bufVar, 4) case "int64", "uint64": g.recordUsedImport("hostarch") g.emit("%s = %s(hostarch.ByteOrder.Uint64(%s[:8]))\n", accessor, typ, bufVar) g.shift(bufVar, 8) default: g.emit("%s = %s.UnmarshalUnsafe(%s)\n", bufVar, accessor, bufVar) g.recordPotentiallyNonPackedField(accessor) } } // emitCastToByteSlice unsafely casts an arbitrary type's underlying memory to a // byte slice, bypassing escape analysis. The caller is responsible for ensuring // srcPtr lives until they're done with dstVar, the runtime does not consider // dstVar dependent on srcPtr due to the escape analysis bypass. // // srcPtr must be a pointer. // // This function uses internally uses the identifier "hdr", and cannot be used // in a context where it is already bound. func (g *interfaceGenerator) emitCastToByteSlice(srcPtr, dstVar, lenExpr string) { g.recordUsedImport("gohacks") g.emit("// Construct a slice backed by dst's underlying memory.\n") g.emit("var %s []byte\n", dstVar) g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&%s))\n", dstVar) g.emit("hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(%s)))\n", srcPtr) g.emit("hdr.Len = %s\n", lenExpr) g.emit("hdr.Cap = %s\n\n", lenExpr) } // emitCastToByteSlice unsafely casts a slice with elements of an abitrary type // to a byte slice. As part of the cast, the byte slice is made to look // independent of the src slice by bypassing escape analysis. This means the // byte slice can be used without causing the source to escape. The caller is // responsible for ensuring srcPtr lives until they're done with dstVar, as the // runtime no longer considers dstVar dependent on srcPtr and is free to GC it. // // srcPtr must be a pointer. // // This function uses internally uses the identifiers "ptr", "val" and "hdr", // and cannot be used in a context where these identifiers are already bound. 
func (g *interfaceGenerator) emitCastSliceToByteSlice(srcPtr, dstVar, lenExpr string) { g.emitNoEscapeSliceDataPointer(srcPtr, "val") g.emit("// Construct a slice backed by dst's underlying memory.\n") g.emit("var %s []byte\n", dstVar) g.emit("hdr := (*reflect.SliceHeader)(unsafe.Pointer(&%s))\n", dstVar) g.emit("hdr.Data = uintptr(val)\n") g.emit("hdr.Len = %s\n", lenExpr) g.emit("hdr.Cap = %s\n\n", lenExpr) } // emitNoEscapeSliceDataPointer unsafely casts a slice's data pointer to an // unsafe.Pointer, bypassing escape analysis. The caller is responsible for // ensuring srcPtr lives until they're done with dstVar, as the runtime no // longer considers dstVar dependent on srcPtr and is free to GC it. // // srcPtr must be a pointer. // // This function uses internally uses the identifier "ptr" cannot be used in a // context where this identifier is already bound. func (g *interfaceGenerator) emitNoEscapeSliceDataPointer(srcPtr, dstVar string) { g.recordUsedImport("gohacks") g.emit("ptr := unsafe.Pointer(%s)\n", srcPtr) g.emit("%s := gohacks.Noescape(unsafe.Pointer((*reflect.SliceHeader)(ptr).Data))\n\n", dstVar) } func (g *interfaceGenerator) emitKeepAlive(ptrVar string) { g.emit("// Since we bypassed the compiler's escape analysis, indicate that %s\n", ptrVar) g.emit("// must live until the use above.\n") g.emit("runtime.KeepAlive(%s) // escapes: replaced by intrinsic.\n", ptrVar) } func (g *interfaceGenerator) expandBinaryExpr(b *strings.Builder, e *ast.BinaryExpr) { switch x := e.X.(type) { case *ast.BinaryExpr: // Recursively expand sub-expression. g.expandBinaryExpr(b, x) case *ast.Ident: fmt.Fprintf(b, "%s", x.Name) case *ast.BasicLit: fmt.Fprintf(b, "%s", x.Value) default: g.abortAt(e.Pos(), "Cannot convert binary expression to output code. Go-marshal currently only handles simple expressions of literals, constants and basic identifiers") } fmt.Fprintf(b, "%s", e.Op) switch y := e.Y.(type) { case *ast.BinaryExpr: // Recursively expand sub-expression. g.expandBinaryExpr(b, y) case *ast.Ident: fmt.Fprintf(b, "%s", y.Name) case *ast.BasicLit: fmt.Fprintf(b, "%s", y.Value) default: g.abortAt(e.Pos(), "Cannot convert binary expression to output code. Go-marshal currently only handles simple expressions of literals, constants and basic identifiers") } } // arrayLenExpr returns a string containing a valid golang expression // representing the length of array a. The returned expression should be treated // as a single value, and will be already parenthesized as required. func (g *interfaceGenerator) arrayLenExpr(a *ast.ArrayType) string { var b strings.Builder switch l := a.Len.(type) { case *ast.Ident: fmt.Fprintf(&b, "%s", l.Name) case *ast.BasicLit: fmt.Fprintf(&b, "%s", l.Value) case *ast.BinaryExpr: g.expandBinaryExpr(&b, l) return fmt.Sprintf("(%s)", b.String()) default: g.abortAt(l.Pos(), "Cannot convert this array len expression to output code. Go-marshal currently only handles simple expressions of literals, constants and basic identifiers") } return b.String() } golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/gomarshal/generator_interfaces_array_newtype.go000066400000000000000000000147701465435605700335760ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // This file contains the bits of the code generator specific to marshalling // newtypes on arrays. package gomarshal import ( "fmt" "go/ast" ) func (g *interfaceGenerator) validateArrayNewtype(n *ast.Ident, a *ast.ArrayType) { if a.Len == nil { g.abortAt(a.Pos(), fmt.Sprintf("Dynamically sized slice '%s' cannot be marshalled, arrays must be statically sized", n.Name)) } if _, ok := a.Elt.(*ast.Ident); !ok { g.abortAt(a.Elt.Pos(), fmt.Sprintf("Marshalling not supported for arrays with %s elements, array elements must be primitive types", kindString(a.Elt))) } } func (g *interfaceGenerator) emitMarshallableForArrayNewtype(n *ast.Ident, a *ast.ArrayType, elt *ast.Ident) { g.recordUsedImport("gohacks") g.recordUsedImport("hostarch") g.recordUsedImport("io") g.recordUsedImport("marshal") g.recordUsedImport("reflect") g.recordUsedImport("runtime") g.recordUsedImport("unsafe") lenExpr := g.arrayLenExpr(a) g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n") g.emit("//go:nosplit\n") g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName()) g.inIndent(func() { if size, dynamic := g.scalarSize(elt); !dynamic { g.emit("return %d * %s\n", size, lenExpr) } else { g.emit("return (*%s)(nil).SizeBytes() * %s\n", n.Name, lenExpr) } }) g.emit("}\n\n") g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n") g.emit("func (%s *%s) MarshalBytes(dst []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("for idx := 0; idx < %s; idx++ {\n", lenExpr) g.inIndent(func() { g.marshalScalar(fmt.Sprintf("%s[idx]", g.r), elt.Name, "dst") }) g.emit("}\n") g.emit("return dst\n") }) g.emit("}\n\n") g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n") g.emit("func (%s *%s) UnmarshalBytes(src []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("for idx := 0; idx < %s; idx++ {\n", lenExpr) g.inIndent(func() { g.unmarshalScalar(fmt.Sprintf("%s[idx]", g.r), elt.Name, "src") }) g.emit("}\n") g.emit("return src\n") }) g.emit("}\n\n") g.emit("// Packed implements marshal.Marshallable.Packed.\n") g.emit("//go:nosplit\n") g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("// Array newtypes are always packed.\n") g.emit("return true\n") }) g.emit("}\n\n") g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n") g.emit("func (%s *%s) MarshalUnsafe(dst []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("size := %s.SizeBytes()\n", g.r) g.emit("gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&%s[0]), uintptr(size))\n", g.r) g.emit("return dst[size:]\n") }) g.emit("}\n\n") g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n") g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("size := %s.SizeBytes()\n", g.r) g.emit("gohacks.Memmove(unsafe.Pointer(%s), unsafe.Pointer(&src[0]), uintptr(size))\n", g.r) g.emit("return src[size:]\n") }) g.emit("}\n\n") g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n") g.emit("func (%s *%s) CopyOutN(cc marshal.CopyContext, addr 
hostarch.Addr, limit int) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) g.emit("length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n") g.emitKeepAlive(g.r) g.emit("return length, err\n") }) g.emit("}\n\n") g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n") g.emit("func (%s *%s) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("return %s.CopyOutN(cc, addr, %s.SizeBytes())\n", g.r, g.r) }) g.emit("}\n\n") g.emit("// CopyInN implements marshal.Marshallable.CopyInN.\n") g.emit("func (%s *%s) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) g.emit("length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay.\n") g.emitKeepAlive(g.r) g.emit("return length, err\n") }) g.emit("}\n\n") g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n") g.emit("func (%s *%s) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("return %s.CopyInN(cc, addr, %s.SizeBytes())\n", g.r, g.r) }) g.emit("}\n\n") g.emit("// WriteTo implements io.WriterTo.WriteTo.\n") g.emit("func (%s *%s) WriteTo(writer io.Writer) (int64, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) g.emit("length, err := writer.Write(buf)\n") g.emitKeepAlive(g.r) g.emit("return int64(length), err\n") }) g.emit("}\n\n") } func (g *interfaceGenerator) emitCheckedMarshallableForArrayNewtype() { g.emit("// CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal.\n") g.emit("func (%s *%s) CheckedMarshal(dst []byte) ([]byte, bool) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("size := %s.SizeBytes()\n", g.r) g.emit("if size > len(dst) {\n") g.inIndent(func() { g.emit("return dst, false\n") }) g.emit("}\n") g.emit("gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&%s[0]), uintptr(size))\n", g.r) g.emit("return dst[size:], true\n") }) g.emit("}\n\n") g.emit("// CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal.\n") g.emit("func (%s *%s) CheckedUnmarshal(src []byte) ([]byte, bool) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("size := %s.SizeBytes()\n", g.r) g.emit("if size > len(src) {\n") g.inIndent(func() { g.emit("return src, false\n") }) g.emit("}\n") g.emit("gohacks.Memmove(unsafe.Pointer(%s), unsafe.Pointer(&src[0]), uintptr(size))\n", g.r) g.emit("return src[size:], true\n") }) g.emit("}\n\n") } golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/gomarshal/generator_interfaces_dynamic.go000066400000000000000000000106741465435605700323300ustar00rootroot00000000000000// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package gomarshal func (g *interfaceGenerator) emitMarshallableForDynamicType() { // The user writes their own MarshalBytes, UnmarshalBytes and SizeBytes for // dynamic types. Generate the rest using these definitions. g.emit("// Packed implements marshal.Marshallable.Packed.\n") g.emit("//go:nosplit\n") g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("// Type %s is dynamic so it might have slice/string headers. Hence, it is not packed.\n", g.typeName()) g.emit("return false\n") }) g.emit("}\n\n") g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n") g.emit("func (%s *%s) MarshalUnsafe(dst []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("// Type %s doesn't have a packed layout in memory, fallback to MarshalBytes.\n", g.typeName()) g.emit("return %s.MarshalBytes(dst)\n", g.r) }) g.emit("}\n\n") g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n") g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("// Type %s doesn't have a packed layout in memory, fallback to UnmarshalBytes.\n", g.typeName()) g.emit("return %s.UnmarshalBytes(src)\n", g.r) }) g.emit("}\n\n") g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n") g.emit("//go:nosplit\n") g.recordUsedImport("marshal") g.recordUsedImport("hostarch") g.emit("func (%s *%s) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName()) g.emit("buf := cc.CopyScratchBuffer(%s.SizeBytes()) // escapes: okay.\n", g.r) g.emit("%s.MarshalBytes(buf) // escapes: fallback.\n", g.r) g.emit("return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n") }) g.emit("}\n\n") g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n") g.recordUsedImport("marshal") g.recordUsedImport("hostarch") g.emit("func (%s *%s) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("return %s.CopyOutN(cc, addr, %s.SizeBytes())\n", g.r, g.r) }) g.emit("}\n\n") g.emit("// CopyInN implements marshal.Marshallable.CopyInN.\n") g.emit("//go:nosplit\n") g.recordUsedImport("marshal") g.recordUsedImport("hostarch") g.emit("func (%s *%s) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName()) g.emit("buf := cc.CopyScratchBuffer(%s.SizeBytes()) // escapes: okay.\n", g.r) g.emit("length, err := cc.CopyInBytes(addr, buf) // escapes: okay.\n") g.emit("// Unmarshal unconditionally. 
If we had a short copy-in, this results in a\n") g.emit("// partially unmarshalled struct.\n") g.emit("%s.UnmarshalBytes(buf) // escapes: fallback.\n", g.r) g.emit("return length, err\n") }) g.emit("}\n\n") g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n") g.recordUsedImport("marshal") g.recordUsedImport("hostarch") g.emit("func (%s *%s) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("return %s.CopyInN(cc, addr, %s.SizeBytes())\n", g.r, g.r) }) g.emit("}\n\n") g.emit("// WriteTo implements io.WriterTo.WriteTo.\n") g.recordUsedImport("io") g.emit("func (%s *%s) WriteTo(writer io.Writer) (int64, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName()) g.emit("buf := make([]byte, %s.SizeBytes())\n", g.r) g.emit("%s.MarshalBytes(buf)\n", g.r) g.emit("length, err := writer.Write(buf)\n") g.emit("return int64(length), err\n") }) g.emit("}\n\n") } generator_interfaces_primitive_newtype.go000066400000000000000000000302521465435605700344020ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/gomarshal// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // This file contains the bits of the code generator specific to marshalling // newtypes on primitives. package gomarshal import ( "fmt" "go/ast" ) // marshalPrimitiveScalar writes a single primitive variable to a byte // slice. func (g *interfaceGenerator) marshalPrimitiveScalar(accessor, typ, bufVar string) { switch typ { case "int8", "uint8", "byte": g.emit("%s[0] = byte(*%s)\n", bufVar, accessor) case "int16", "uint16": g.recordUsedImport("hostarch") g.emit("hostarch.ByteOrder.PutUint16(%s[:2], uint16(*%s))\n", bufVar, accessor) case "int32", "uint32": g.recordUsedImport("hostarch") g.emit("hostarch.ByteOrder.PutUint32(%s[:4], uint32(*%s))\n", bufVar, accessor) case "int64", "uint64": g.recordUsedImport("hostarch") g.emit("hostarch.ByteOrder.PutUint64(%s[:8], uint64(*%s))\n", bufVar, accessor) default: g.emit("// Explicilty cast to the underlying type before dispatching to\n") g.emit("// MarshalBytes, so we don't recursively call %s.MarshalBytes\n", accessor) g.emit("inner := (*%s)(%s)\n", typ, accessor) g.emit("inner.MarshalBytes(%s[:%s.SizeBytes()])\n", bufVar, accessor) } } // unmarshalPrimitiveScalar read a single primitive variable from a byte slice. 
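//
// As an illustrative sketch (the newtype below is hypothetical): for
//
//   // +marshal
//   type FD int32
//
// the helpers above and below reduce the generated MarshalBytes and
// UnmarshalBytes bodies to byte-order calls of the form
//
//   hostarch.ByteOrder.PutUint32(dst[:4], uint32(*f))
//   *f = FD(int32(hostarch.ByteOrder.Uint32(src[:4])))
//
// where f is the receiver chosen for FD.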
func (g *interfaceGenerator) unmarshalPrimitiveScalar(accessor, typ, bufVar, typeCast string) { switch typ { case "byte": g.emit("*%s = %s(%s[0])\n", accessor, typeCast, bufVar) case "int8", "uint8": g.emit("*%s = %s(%s(%s[0]))\n", accessor, typeCast, typ, bufVar) case "int16", "uint16": g.recordUsedImport("hostarch") g.emit("*%s = %s(%s(hostarch.ByteOrder.Uint16(%s[:2])))\n", accessor, typeCast, typ, bufVar) case "int32", "uint32": g.recordUsedImport("hostarch") g.emit("*%s = %s(%s(hostarch.ByteOrder.Uint32(%s[:4])))\n", accessor, typeCast, typ, bufVar) case "int64", "uint64": g.recordUsedImport("hostarch") g.emit("*%s = %s(%s(hostarch.ByteOrder.Uint64(%s[:8])))\n", accessor, typeCast, typ, bufVar) default: g.emit("// Explicilty cast to the underlying type before dispatching to\n") g.emit("// UnmarshalBytes, so we don't recursively call %s.UnmarshalBytes\n", accessor) g.emit("inner := (*%s)(%s)\n", typ, accessor) g.emit("inner.UnmarshalBytes(%s[:%s.SizeBytes()])\n", bufVar, accessor) } } func (g *interfaceGenerator) validatePrimitiveNewtype(t *ast.Ident) { switch t.Name { case "int8", "uint8", "byte", "int16", "uint16", "int32", "uint32", "int64", "uint64": // These are the only primitive types we're allow. Below, we provide // suggestions for some disallowed types and reject them, then attempt // to marshal any remaining types by invoking the marshal.Marshallable // interface on them. If these types don't actually implement // marshal.Marshallable, compilation of the generated code will fail // with an appropriate error message. return case "int": g.abortAt(t.Pos(), "Type 'int' has ambiguous width, use int32 or int64") case "uint": g.abortAt(t.Pos(), "Type 'uint' has ambiguous width, use uint32 or uint64") case "string": g.abortAt(t.Pos(), "Type 'string' is dynamically-sized and cannot be marshalled, use a fixed size byte array '[...]byte' instead") default: debugfAt(g.f.Position(t.Pos()), fmt.Sprintf("Found derived type '%s', will attempt dispatch via marshal.Marshallable.\n", t.Name)) } } // emitMarshallableForPrimitiveNewtype outputs code to implement the // marshal.Marshallable interface for a newtype on a primitive. Primitive // newtypes are always packed, so we can omit the various fallbacks required for // non-packed structs. 
func (g *interfaceGenerator) emitMarshallableForPrimitiveNewtype(nt *ast.Ident) { g.recordUsedImport("gohacks") g.recordUsedImport("hostarch") g.recordUsedImport("io") g.recordUsedImport("marshal") g.recordUsedImport("reflect") g.recordUsedImport("runtime") g.recordUsedImport("unsafe") g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n") g.emit("//go:nosplit\n") g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName()) g.inIndent(func() { if size, dynamic := g.scalarSize(nt); !dynamic { g.emit("return %d\n", size) } else { g.emit("return (*%s)(nil).SizeBytes()\n", nt.Name) } }) g.emit("}\n\n") g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n") g.emit("func (%s *%s) MarshalBytes(dst []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { g.marshalPrimitiveScalar(g.r, nt.Name, "dst") if size, dynamic := g.scalarSize(nt); !dynamic { g.emit("return dst[%d:]\n", size) } else { g.emit("return dst[(*%s)(nil).SizeBytes():]\n", nt.Name) } }) g.emit("}\n\n") g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n") g.emit("func (%s *%s) UnmarshalBytes(src []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { g.unmarshalPrimitiveScalar(g.r, nt.Name, "src", g.typeName()) if size, dynamic := g.scalarSize(nt); !dynamic { g.emit("return src[%d:]\n", size) } else { g.emit("return src[(*%s)(nil).SizeBytes():]\n", nt.Name) } }) g.emit("}\n\n") g.emit("// Packed implements marshal.Marshallable.Packed.\n") g.emit("//go:nosplit\n") g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("// Scalar newtypes are always packed.\n") g.emit("return true\n") }) g.emit("}\n\n") g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n") g.emit("func (%s *%s) MarshalUnsafe(dst []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("size := %s.SizeBytes()\n", g.r) g.emit("gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(%s), uintptr(size))\n", g.r) g.emit("return dst[size:]\n") }) g.emit("}\n\n") g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n") g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("size := %s.SizeBytes()\n", g.r) g.emit("gohacks.Memmove(unsafe.Pointer(%s), unsafe.Pointer(&src[0]), uintptr(size))\n", g.r) g.emit("return src[size:]\n") }) g.emit("}\n\n") g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n") g.emit("func (%s *%s) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) g.emit("length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n") g.emitKeepAlive(g.r) g.emit("return length, err\n") }) g.emit("}\n\n") g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n") g.emit("func (%s *%s) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("return %s.CopyOutN(cc, addr, %s.SizeBytes())\n", g.r, g.r) }) g.emit("}\n\n") g.emit("// CopyInN implements marshal.Marshallable.CopyInN.\n") g.emit("func (%s *%s) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) g.emit("length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay.\n") g.emitKeepAlive(g.r) g.emit("return length, err\n") }) g.emit("}\n\n") g.emit("// 
CopyIn implements marshal.Marshallable.CopyIn.\n") g.emit("func (%s *%s) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("return %s.CopyInN(cc, addr, %s.SizeBytes())\n", g.r, g.r) }) g.emit("}\n\n") g.emit("// WriteTo implements io.WriterTo.WriteTo.\n") g.emit("func (%s *%s) WriteTo(writer io.Writer) (int64, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) g.emit("length, err := writer.Write(buf)\n") g.emitKeepAlive(g.r) g.emit("return int64(length), err\n") }) g.emit("}\n\n") } func (g *interfaceGenerator) emitCheckedMarshallableForPrimitiveNewtype() { g.emit("// CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal.\n") g.emit("func (%s *%s) CheckedMarshal(dst []byte) ([]byte, bool) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("size := %s.SizeBytes()\n", g.r) g.emit("if size > len(dst) {\n") g.inIndent(func() { g.emit("return dst, false\n") }) g.emit("}\n") g.emit("gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(%s), uintptr(size))\n", g.r) g.emit("return dst[size:], true\n") }) g.emit("}\n\n") g.emit("// CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal.\n") g.emit("func (%s *%s) CheckedUnmarshal(src []byte) ([]byte, bool) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("size := %s.SizeBytes()\n", g.r) g.emit("if size > len(src) {\n") g.inIndent(func() { g.emit("return src, false\n") }) g.emit("}\n") g.emit("gohacks.Memmove(unsafe.Pointer(%s), unsafe.Pointer(&src[0]), uintptr(size))\n", g.r) g.emit("return src[size:], true\n") }) g.emit("}\n\n") } func (g *interfaceGenerator) emitMarshallableSliceForPrimitiveNewtype(nt *ast.Ident, slice *sliceAPI) { g.recordUsedImport("marshal") g.recordUsedImport("hostarch") g.recordUsedImport("reflect") g.recordUsedImport("runtime") g.recordUsedImport("unsafe") eltType := g.typeName() if slice.inner { eltType = nt.Name } g.emit("// Copy%sIn copies in a slice of %s objects from the task's memory.\n", slice.ident, eltType) g.emit("func Copy%sIn(cc marshal.CopyContext, addr hostarch.Addr, dst []%s) (int, error) {\n", slice.ident, eltType) g.inIndent(func() { g.emit("count := len(dst)\n") g.emit("if count == 0 {\n") g.inIndent(func() { g.emit("return 0, nil\n") }) g.emit("}\n") g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) g.emitCastSliceToByteSlice("&dst", "buf", "size * count") g.emit("length, err := cc.CopyInBytes(addr, buf) // escapes: okay.\n") g.emitKeepAlive("dst") g.emit("return length, err\n") }) g.emit("}\n\n") g.emit("// Copy%sOut copies a slice of %s objects to the task's memory.\n", slice.ident, eltType) g.emit("func Copy%sOut(cc marshal.CopyContext, addr hostarch.Addr, src []%s) (int, error) {\n", slice.ident, eltType) g.inIndent(func() { g.emit("count := len(src)\n") g.emit("if count == 0 {\n") g.inIndent(func() { g.emit("return 0, nil\n") }) g.emit("}\n") g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) g.emitCastSliceToByteSlice("&src", "buf", "size * count") g.emit("length, err := cc.CopyOutBytes(addr, buf) // escapes: okay.\n") g.emitKeepAlive("src") g.emit("return length, err\n") }) g.emit("}\n\n") g.emit("// MarshalUnsafe%s is like %s.MarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName()) g.emit("func MarshalUnsafe%s(src []%s, dst []byte) []byte {\n", slice.ident, g.typeName()) g.inIndent(func() { g.emit("count := len(src)\n") g.emit("if count == 0 {\n") g.inIndent(func() { g.emit("return dst\n") 
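// Naming note (illustrative, assuming a hypothetical "// +marshal slice:FDSlice"
// directive on "type FD int32"): the free functions emitted by this generator are
//
//   func CopyFDSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst []FD) (int, error)
//   func CopyFDSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []FD) (int, error)
//   func MarshalUnsafeFDSlice(src []FD, dst []byte) []byte
//   func UnmarshalUnsafeFDSlice(dst []FD, src []byte) []byte
//
// i.e. the identifier from the slice clause becomes part of each name.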
}) g.emit("}\n") g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) g.emit("buf := dst[:size*count]\n") g.emit("gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf)))\n") g.emit("return dst[size*count:]\n") }) g.emit("}\n\n") g.emit("// UnmarshalUnsafe%s is like %s.UnmarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName()) g.emit("func UnmarshalUnsafe%s(dst []%s, src []byte) []byte {\n", slice.ident, g.typeName()) g.inIndent(func() { g.emit("count := len(dst)\n") g.emit("if count == 0 {\n") g.inIndent(func() { g.emit("return src\n") }) g.emit("}\n") g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) g.emit("buf := src[:size*count]\n") g.emit("gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf)))\n") g.emit("return src[size*count:]\n") }) g.emit("}\n\n") } golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/gomarshal/generator_interfaces_struct.go000066400000000000000000000535161465435605700322320ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // This file contains the bits of the code generator specific to marshalling // structs. package gomarshal import ( "fmt" "go/ast" "sort" "strings" ) func (g *interfaceGenerator) fieldAccessor(n *ast.Ident) string { return fmt.Sprintf("%s.%s", g.r, n.Name) } // areFieldsPackedExpression returns a go expression checking whether g.t's fields are // packed. Returns "", false if g.t has no fields that may be potentially not // packed, otherwise returns , true, where is an expression // like "t.a.Packed() && t.b.Packed() && t.c.Packed()". func (g *interfaceGenerator) areFieldsPackedExpression() (string, bool) { if len(g.as) == 0 { return "", false } cs := make([]string, 0, len(g.as)) for accessor := range g.as { cs = append(cs, fmt.Sprintf("%s.Packed()", accessor)) } // Sort expressions for determinstic build outputs. sort.Strings(cs) return strings.Join(cs, " && "), true } // validateStruct ensures the type we're working with can be marshalled. These // checks are done ahead of time and in one place so we can make assumptions // later. func (g *interfaceGenerator) validateStruct(ts *ast.TypeSpec, st *ast.StructType) { forEachStructField(st, func(f *ast.Field) { fieldDispatcher{ primitive: func(_, t *ast.Ident) { g.validatePrimitiveNewtype(t) }, selector: func(_, _, _ *ast.Ident) { // No validation to perform on selector fields. However this // callback must still be provided. 
}, array: func(n *ast.Ident, a *ast.ArrayType, _ *ast.Ident) { g.validateArrayNewtype(n, a) }, unhandled: func(_ *ast.Ident) { g.abortAt(f.Pos(), fmt.Sprintf("Marshalling not supported for %s fields", kindString(f.Type))) }, }.dispatch(f) }) } func (g *interfaceGenerator) isStructPacked(st *ast.StructType) bool { packed := true forEachStructField(st, func(f *ast.Field) { if f.Tag != nil { if f.Tag.Value == "`marshal:\"unaligned\"`" { if packed { debugfAt(g.f.Position(g.t.Pos()), fmt.Sprintf("Marking type '%s' as not packed due to tag `marshal:\"unaligned\"`.\n", g.t.Name)) packed = false } } } }) return packed } func (g *interfaceGenerator) emitMarshallableForStruct(st *ast.StructType) { thisPacked := g.isStructPacked(st) g.emit("// SizeBytes implements marshal.Marshallable.SizeBytes.\n") g.emit("func (%s *%s) SizeBytes() int {\n", g.r, g.typeName()) g.inIndent(func() { primitiveSize := 0 var dynamicSizeTerms []string forEachStructField(st, fieldDispatcher{ primitive: func(_, t *ast.Ident) { if size, dynamic := g.scalarSize(t); !dynamic { primitiveSize += size } else { g.recordUsedMarshallable(t.Name) dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", t.Name)) } }, selector: func(_, tX, tSel *ast.Ident) { tName := fmt.Sprintf("%s.%s", tX.Name, tSel.Name) g.recordUsedImport(tX.Name) g.recordUsedMarshallable(tName) dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()", tName)) }, array: func(_ *ast.Ident, a *ast.ArrayType, t *ast.Ident) { lenExpr := g.arrayLenExpr(a) if size, dynamic := g.scalarSize(t); !dynamic { dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("%d*%s", size, lenExpr)) } else { g.recordUsedMarshallable(t.Name) dynamicSizeTerms = append(dynamicSizeTerms, fmt.Sprintf("(*%s)(nil).SizeBytes()*%s", t.Name, lenExpr)) } }, }.dispatch) g.emit("return %d", primitiveSize) if len(dynamicSizeTerms) > 0 { g.incIndent() } { for _, d := range dynamicSizeTerms { g.emitNoIndent(" +\n") g.emit(d) } } if len(dynamicSizeTerms) > 0 { g.decIndent() } }) g.emit("\n}\n\n") g.emit("// MarshalBytes implements marshal.Marshallable.MarshalBytes.\n") g.emit("func (%s *%s) MarshalBytes(dst []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { forEachStructField(st, fieldDispatcher{ primitive: func(n, t *ast.Ident) { if n.Name == "_" { g.emit("// Padding: dst[:sizeof(%s)] ~= %s(0)\n", t.Name, t.Name) if len, dynamic := g.scalarSize(t); !dynamic { g.shift("dst", len) } else { // We can't use shiftDynamic here because we don't have // an instance of the dynamic type we can reference here // (since the version in this struct is anonymous). Use // a typed nil pointer to call SizeBytes() instead. 
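// (Illustrative sketch, for a hypothetical padding field of dynamic type Foo:
// the line emitted below reads `dst = dst[(*Foo)(nil).SizeBytes():]`, which
// advances the destination buffer without needing a Foo instance.)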
g.emit("dst = dst[(*%s)(nil).SizeBytes():]\n", t.Name) } return } g.marshalScalar(g.fieldAccessor(n), t.Name, "dst") }, selector: func(n, tX, tSel *ast.Ident) { if n.Name == "_" { g.emit("// Padding: dst[:sizeof(%s)] ~= %s(0)\n", tX.Name, tSel.Name) g.emit("dst = dst[(*%s.%s)(nil).SizeBytes():]\n", tX.Name, tSel.Name) return } g.marshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "dst") }, array: func(n *ast.Ident, a *ast.ArrayType, t *ast.Ident) { lenExpr := g.arrayLenExpr(a) if n.Name == "_" { g.emit("// Padding: dst[:sizeof(%s)*%s] ~= [%s]%s{0}\n", t.Name, lenExpr, lenExpr, t.Name) if size, dynamic := g.scalarSize(t); !dynamic { g.emit("dst = dst[%d*(%s):]\n", size, lenExpr) } else { // We can't use shiftDynamic here because we don't have // an instance of the dynamic type we can reference here // (since the version in this struct is anonymous). Use // a typed nil pointer to call SizeBytes() instead. g.emit("dst = dst[(*%s)(nil).SizeBytes()*(%s):]\n", t.Name, lenExpr) } return } g.emit("for idx := 0; idx < %s; idx++ {\n", lenExpr) g.inIndent(func() { g.marshalScalar(fmt.Sprintf("%s[idx]", g.fieldAccessor(n)), t.Name, "dst") }) g.emit("}\n") }, }.dispatch) // All cases above shift the buffer appropriately. g.emit("return dst\n") }) g.emit("}\n\n") g.emit("// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.\n") g.emit("func (%s *%s) UnmarshalBytes(src []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { forEachStructField(st, fieldDispatcher{ primitive: func(n, t *ast.Ident) { if n.Name == "_" { g.emit("// Padding: var _ %s ~= src[:sizeof(%s)]\n", t.Name, t.Name) if len, dynamic := g.scalarSize(t); !dynamic { g.shift("src", len) } else { // We don't have an instance of the dynamic type we can // reference here (since the version in this struct is // anonymous). Use a typed nil pointer to call // SizeBytes() instead. g.shiftDynamic("src", fmt.Sprintf("(*%s)(nil)", t.Name)) g.recordPotentiallyNonPackedField(fmt.Sprintf("(*%s)(nil)", t.Name)) } return } g.unmarshalScalar(g.fieldAccessor(n), t.Name, "src") }, selector: func(n, tX, tSel *ast.Ident) { if n.Name == "_" { g.emit("// Padding: %s ~= src[:sizeof(%s.%s)]\n", g.fieldAccessor(n), tX.Name, tSel.Name) g.emit("src = src[(*%s.%s)(nil).SizeBytes():]\n", tX.Name, tSel.Name) g.recordPotentiallyNonPackedField(fmt.Sprintf("(*%s.%s)(nil)", tX.Name, tSel.Name)) return } g.unmarshalScalar(g.fieldAccessor(n), fmt.Sprintf("%s.%s", tX.Name, tSel.Name), "src") }, array: func(n *ast.Ident, a *ast.ArrayType, t *ast.Ident) { lenExpr := g.arrayLenExpr(a) if n.Name == "_" { g.emit("// Padding: ~ copy([%s]%s(%s), src[:sizeof(%s)*%s])\n", lenExpr, t.Name, g.fieldAccessor(n), t.Name, lenExpr) if size, dynamic := g.scalarSize(t); !dynamic { g.emit("src = src[%d*(%s):]\n", size, lenExpr) } else { // We can't use shiftDynamic here because we don't have // an instance of the dynamic type we can referece here // (since the version in this struct is anonymous). Use // a typed nil pointer to call SizeBytes() instead. g.emit("src = src[(*%s)(nil).SizeBytes()*(%s):]\n", t.Name, lenExpr) } return } g.emit("for idx := 0; idx < %s; idx++ {\n", lenExpr) g.inIndent(func() { g.unmarshalScalar(fmt.Sprintf("%s[idx]", g.fieldAccessor(n)), t.Name, "src") }) g.emit("}\n") }, }.dispatch) // All cases above shift the buffer appropriately. 
g.emit("return src\n") }) g.emit("}\n\n") g.emit("// Packed implements marshal.Marshallable.Packed.\n") g.emit("//go:nosplit\n") g.emit("func (%s *%s) Packed() bool {\n", g.r, g.typeName()) g.inIndent(func() { expr, fieldsMaybePacked := g.areFieldsPackedExpression() switch { case !thisPacked: g.emit("return false\n") case fieldsMaybePacked: g.emit("return %s\n", expr) default: g.emit("return true\n") } }) g.emit("}\n\n") g.emit("// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.\n") g.emit("func (%s *%s) MarshalUnsafe(dst []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fallback to MarshalBytes.\n", g.typeName()) g.emit("return %s.MarshalBytes(dst)\n", g.r) } if thisPacked { g.recordUsedImport("gohacks") g.recordUsedImport("unsafe") fastMarshal := func() { g.emit("size := %s.SizeBytes()\n", g.r) g.emit("gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(%s), uintptr(size))\n", g.r) g.emit("return dst[size:]\n") } if cond, ok := g.areFieldsPackedExpression(); ok { g.emit("if %s {\n", cond) g.inIndent(fastMarshal) g.emit("}\n") fallback() } else { fastMarshal() } } else { fallback() } }) g.emit("}\n\n") g.emit("// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.\n") g.emit("func (%s *%s) UnmarshalUnsafe(src []byte) []byte {\n", g.r, g.typeName()) g.inIndent(func() { fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fallback to UnmarshalBytes.\n", g.typeName()) g.emit("return %s.UnmarshalBytes(src)\n", g.r) } if thisPacked { g.recordUsedImport("gohacks") g.recordUsedImport("unsafe") fastUnmarshal := func() { g.emit("size := %s.SizeBytes()\n", g.r) g.emit("gohacks.Memmove(unsafe.Pointer(%s), unsafe.Pointer(&src[0]), uintptr(size))\n", g.r) g.emit("return src[size:]\n") } if cond, ok := g.areFieldsPackedExpression(); ok { g.emit("if %s {\n", cond) g.inIndent(fastUnmarshal) g.emit("}\n") fallback() } else { fastUnmarshal() } } else { fallback() } }) g.emit("}\n\n") g.emit("// CopyOutN implements marshal.Marshallable.CopyOutN.\n") g.recordUsedImport("marshal") g.recordUsedImport("hostarch") g.emit("func (%s *%s) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName()) g.emit("buf := cc.CopyScratchBuffer(%s.SizeBytes()) // escapes: okay.\n", g.r) g.emit("%s.MarshalBytes(buf) // escapes: fallback.\n", g.r) g.emit("return cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n") } if thisPacked { g.recordUsedImport("reflect") g.recordUsedImport("runtime") g.recordUsedImport("unsafe") if cond, ok := g.areFieldsPackedExpression(); ok { g.emit("if !%s {\n", cond) g.inIndent(fallback) g.emit("}\n\n") } // Fast serialization. 
g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) g.emit("length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay.\n") g.emitKeepAlive(g.r) g.emit("return length, err\n") } else { fallback() } }) g.emit("}\n\n") g.emit("// CopyOut implements marshal.Marshallable.CopyOut.\n") g.recordUsedImport("marshal") g.recordUsedImport("hostarch") g.emit("func (%s *%s) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("return %s.CopyOutN(cc, addr, %s.SizeBytes())\n", g.r, g.r) }) g.emit("}\n\n") g.emit("// CopyInN implements marshal.Marshallable.CopyInN.\n") g.recordUsedImport("marshal") g.recordUsedImport("hostarch") g.emit("func (%s *%s) CopyInN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName()) g.emit("buf := cc.CopyScratchBuffer(%s.SizeBytes()) // escapes: okay.\n", g.r) g.emit("length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay.\n") g.emit("// Unmarshal unconditionally. If we had a short copy-in, this results in a\n") g.emit("// partially unmarshalled struct.\n") g.emit("%s.UnmarshalBytes(buf) // escapes: fallback.\n", g.r) g.emit("return length, err\n") } if thisPacked { g.recordUsedImport("reflect") g.recordUsedImport("runtime") g.recordUsedImport("unsafe") if cond, ok := g.areFieldsPackedExpression(); ok { g.emit("if !%s {\n", cond) g.inIndent(fallback) g.emit("}\n\n") } // Fast deserialization. g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) g.emit("length, err := cc.CopyInBytes(addr, buf[:limit]) // escapes: okay.\n") g.emitKeepAlive(g.r) g.emit("return length, err\n") } else { fallback() } }) g.emit("}\n\n") g.emit("// CopyIn implements marshal.Marshallable.CopyIn.\n") g.recordUsedImport("marshal") g.recordUsedImport("hostarch") g.emit("func (%s *%s) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("return %s.CopyInN(cc, addr, %s.SizeBytes())\n", g.r, g.r) }) g.emit("}\n\n") g.emit("// WriteTo implements io.WriterTo.WriteTo.\n") g.recordUsedImport("io") g.emit("func (%s *%s) WriteTo(writer io.Writer) (int64, error) {\n", g.r, g.typeName()) g.inIndent(func() { fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName()) g.emit("buf := make([]byte, %s.SizeBytes())\n", g.r) g.emit("%s.MarshalBytes(buf)\n", g.r) g.emit("length, err := writer.Write(buf)\n") g.emit("return int64(length), err\n") } if thisPacked { g.recordUsedImport("reflect") g.recordUsedImport("runtime") g.recordUsedImport("unsafe") if cond, ok := g.areFieldsPackedExpression(); ok { g.emit("if !%s {\n", cond) g.inIndent(fallback) g.emit("}\n\n") } // Fast serialization. 
g.emitCastToByteSlice(g.r, "buf", fmt.Sprintf("%s.SizeBytes()", g.r)) g.emit("length, err := writer.Write(buf)\n") g.emitKeepAlive(g.r) g.emit("return int64(length), err\n") } else { fallback() } }) g.emit("}\n\n") } func (g *interfaceGenerator) emitCheckedMarshallableForStruct() { g.emit("// CheckedMarshal implements marshal.CheckedMarshallable.CheckedMarshal.\n") g.emit("func (%s *%s) CheckedMarshal(dst []byte) ([]byte, bool) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("if %s.SizeBytes() > len(dst) {\n", g.r) g.inIndent(func() { g.emit("return dst, false\n") }) g.emit("}\n") g.emit("return %s.MarshalUnsafe(dst), true\n", g.r) }) g.emit("}\n\n") g.emit("// CheckedUnmarshal implements marshal.CheckedMarshallable.CheckedUnmarshal.\n") g.emit("func (%s *%s) CheckedUnmarshal(src []byte) ([]byte, bool) {\n", g.r, g.typeName()) g.inIndent(func() { g.emit("if %s.SizeBytes() > len(src) {\n", g.r) g.inIndent(func() { g.emit("return src, false\n") }) g.emit("}\n") g.emit("return %s.UnmarshalUnsafe(src), true\n", g.r) }) g.emit("}\n\n") } func (g *interfaceGenerator) emitMarshallableSliceForStruct(st *ast.StructType, slice *sliceAPI) { thisPacked := g.isStructPacked(st) if slice.inner { abortAt(g.f.Position(slice.comment.Slash), fmt.Sprintf("The ':inner' argument to '+marshal slice:%s:inner' is only applicable to newtypes on primitives. Remove it from this struct declaration.", slice.ident)) } g.recordUsedImport("marshal") g.recordUsedImport("hostarch") g.emit("// Copy%sIn copies in a slice of %s objects from the task's memory.\n", slice.ident, g.typeName()) g.emit("func Copy%sIn(cc marshal.CopyContext, addr hostarch.Addr, dst []%s) (int, error) {\n", slice.ident, g.typeName()) g.inIndent(func() { g.emit("count := len(dst)\n") g.emit("if count == 0 {\n") g.inIndent(func() { g.emit("return 0, nil\n") }) g.emit("}\n") g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName()) g.emit("buf := cc.CopyScratchBuffer(size * count)\n") g.emit("length, err := cc.CopyInBytes(addr, buf)\n\n") g.emit("// Unmarshal as much as possible, even on error. First handle full objects.\n") g.emit("limit := length/size\n") g.emit("for idx := 0; idx < limit; idx++ {\n") g.inIndent(func() { g.emit("buf = dst[idx].UnmarshalBytes(buf)\n") }) g.emit("}\n\n") g.emit("// Handle any final partial object. buf is guaranteed to be long enough for the\n") g.emit("// final element, but may not contain valid data for the entire range. This may\n") g.emit("// result in unmarshalling zero values for some parts of the object.\n") g.emit("if length%size != 0 {\n") g.inIndent(func() { g.emit("dst[limit].UnmarshalBytes(buf)\n") }) g.emit("}\n\n") g.emit("return length, err\n") } if thisPacked { g.recordUsedImport("reflect") g.recordUsedImport("runtime") g.recordUsedImport("unsafe") if _, ok := g.areFieldsPackedExpression(); ok { g.emit("if !dst[0].Packed() {\n") g.inIndent(fallback) g.emit("}\n\n") } // Fast deserialization. 
g.emitCastSliceToByteSlice("&dst", "buf", "size * count") g.emit("length, err := cc.CopyInBytes(addr, buf)\n") g.emitKeepAlive("dst") g.emit("return length, err\n") } else { fallback() } }) g.emit("}\n\n") g.emit("// Copy%sOut copies a slice of %s objects to the task's memory.\n", slice.ident, g.typeName()) g.emit("func Copy%sOut(cc marshal.CopyContext, addr hostarch.Addr, src []%s) (int, error) {\n", slice.ident, g.typeName()) g.inIndent(func() { g.emit("count := len(src)\n") g.emit("if count == 0 {\n") g.inIndent(func() { g.emit("return 0, nil\n") }) g.emit("}\n") g.emit("size := (*%s)(nil).SizeBytes()\n\n", g.typeName()) fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName()) g.emit("buf := cc.CopyScratchBuffer(size * count)\n") g.emit("curBuf := buf\n") g.emit("for idx := 0; idx < count; idx++ {\n") g.inIndent(func() { g.emit("curBuf = src[idx].MarshalBytes(curBuf)\n") }) g.emit("}\n") g.emit("return cc.CopyOutBytes(addr, buf)\n") } if thisPacked { g.recordUsedImport("reflect") g.recordUsedImport("runtime") g.recordUsedImport("unsafe") if _, ok := g.areFieldsPackedExpression(); ok { g.emit("if !src[0].Packed() {\n") g.inIndent(fallback) g.emit("}\n\n") } // Fast serialization. g.emitCastSliceToByteSlice("&src", "buf", "size * count") g.emit("length, err := cc.CopyOutBytes(addr, buf)\n") g.emitKeepAlive("src") g.emit("return length, err\n") } else { fallback() } }) g.emit("}\n\n") g.emit("// MarshalUnsafe%s is like %s.MarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName()) g.emit("func MarshalUnsafe%s(src []%s, dst []byte) []byte {\n", slice.ident, g.typeName()) g.inIndent(func() { g.emit("count := len(src)\n") g.emit("if count == 0 {\n") g.inIndent(func() { g.emit("return dst\n") }) g.emit("}\n\n") fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to MarshalBytes.\n", g.typeName()) g.emit("for idx := 0; idx < count; idx++ {\n") g.inIndent(func() { g.emit("dst = src[idx].MarshalBytes(dst)\n") }) g.emit("}\n") g.emit("return dst\n") } if thisPacked { g.recordUsedImport("reflect") g.recordUsedImport("runtime") g.recordUsedImport("unsafe") g.recordUsedImport("gohacks") if _, ok := g.areFieldsPackedExpression(); ok { g.emit("if !src[0].Packed() {\n") g.inIndent(fallback) g.emit("}\n\n") } g.emit("size := (*%s)(nil).SizeBytes()\n", g.typeName()) g.emit("buf := dst[:size*count]\n") g.emit("gohacks.Memmove(unsafe.Pointer(&buf[0]), unsafe.Pointer(&src[0]), uintptr(len(buf)))\n") g.emit("return dst[size*count:]\n") } else { fallback() } }) g.emit("}\n\n") g.emit("// UnmarshalUnsafe%s is like %s.UnmarshalUnsafe, but for a []%s.\n", slice.ident, g.typeName(), g.typeName()) g.emit("func UnmarshalUnsafe%s(dst []%s, src []byte) []byte {\n", slice.ident, g.typeName()) g.inIndent(func() { g.emit("count := len(dst)\n") g.emit("if count == 0 {\n") g.inIndent(func() { g.emit("return src\n") }) g.emit("}\n\n") fallback := func() { g.emit("// Type %s doesn't have a packed layout in memory, fall back to UnmarshalBytes.\n", g.typeName()) g.emit("for idx := 0; idx < count; idx++ {\n") g.inIndent(func() { g.emit("src = dst[idx].UnmarshalBytes(src)\n") }) g.emit("}\n") g.emit("return src\n") } if thisPacked { g.recordUsedImport("gohacks") g.recordUsedImport("reflect") g.recordUsedImport("runtime") if _, ok := g.areFieldsPackedExpression(); ok { g.emit("if !dst[0].Packed() {\n") g.inIndent(fallback) g.emit("}\n\n") } g.emit("size := (*%s)(nil).SizeBytes()\n", g.typeName()) g.emit("buf 
:= src[:size*count]\n") g.emit("gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(&buf[0]), uintptr(len(buf)))\n") g.emit("return src[size*count:]\n") } else { fallback() } }) g.emit("}\n\n") } golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/gomarshal/generator_tests.go000066400000000000000000000204121465435605700276320ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gomarshal import ( "fmt" "go/ast" "io" "strings" ) var standardImports = []string{ "bytes", "fmt", "reflect", "testing", "gvisor.dev/gvisor/tools/go_marshal/analysis", } var sliceAPIImports = []string{ "encoding/binary", "gvisor.dev/gvisor/pkg/hostarch", } type testGenerator struct { sourceBuffer // The type we're serializing. t *ast.TypeSpec // Receiver argument for generated methods. r string // Imports used by generated code. imports *importTable // Import statement for the package declaring the type we generated code // for. We need this to construct test instances for the type, since the // tests aren't written in the same package. decl *importStmt } func newTestGenerator(t *ast.TypeSpec, r string) *testGenerator { g := &testGenerator{ t: t, r: r, imports: newImportTable(), } for _, i := range standardImports { g.imports.add(i).markUsed() } // These imports are used if a type requests the slice API. Don't // mark them as used by default. 
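// (Illustrative note: these imports become used only for types that request the
// slice API, e.g. via a `+marshal slice:FooSlice` comment, in which case the
// slice test emitted by emitTestMarshalUnmarshalSlicePreservesData below marks
// "binary" and "hostarch" as used. "FooSlice" is a hypothetical name.)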
for _, i := range sliceAPIImports { g.imports.add(i) } return g } func (g *testGenerator) typeName() string { return g.t.Name.Name } func (g *testGenerator) testFuncName(base string) string { return fmt.Sprintf("%s%s", base, strings.Title(g.t.Name.Name)) } func (g *testGenerator) inTestFunction(name string, body func()) { g.emit("func %s(t *testing.T) {\n", g.testFuncName(name)) g.inIndent(body) g.emit("}\n\n") } func (g *testGenerator) emitTestNonZeroSize() { g.inTestFunction("TestSizeNonZero", func() { g.emit("var x %v\n", g.typeName()) g.emit("if x.SizeBytes() == 0 {\n") g.inIndent(func() { g.emit("t.Fatal(\"Marshallable.SizeBytes() should not return zero\")\n") }) g.emit("}\n") }) } func (g *testGenerator) emitTestSuspectAlignment() { g.inTestFunction("TestSuspectAlignment", func() { g.emit("var x %v\n", g.typeName()) g.emit("analysis.AlignmentCheck(t, reflect.TypeOf(x))\n") }) } func (g *testGenerator) emitTestMarshalUnmarshalPreservesData() { g.inTestFunction("TestSafeMarshalUnmarshalPreservesData", func() { g.emit("var x, y, z, yUnsafe, zUnsafe %s\n", g.typeName()) g.emit("analysis.RandomizeValue(&x)\n\n") g.emit("buf := make([]byte, x.SizeBytes())\n") g.emit("x.MarshalBytes(buf)\n") g.emit("bufUnsafe := make([]byte, x.SizeBytes())\n") g.emit("x.MarshalUnsafe(bufUnsafe)\n\n") g.emit("y.UnmarshalBytes(buf)\n") g.emit("if !reflect.DeepEqual(x, y) {\n") g.inIndent(func() { g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalBytes/UnmarshalBytes cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, y))\n") }) g.emit("}\n") g.emit("yUnsafe.UnmarshalBytes(bufUnsafe)\n") g.emit("if !reflect.DeepEqual(x, yUnsafe) {\n") g.inIndent(func() { g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafe/UnmarshalBytes cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, yUnsafe))\n") }) g.emit("}\n\n") g.emit("z.UnmarshalUnsafe(buf)\n") g.emit("if !reflect.DeepEqual(x, z) {\n") g.inIndent(func() { g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalBytes/UnmarshalUnsafe cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, z))\n") }) g.emit("}\n") g.emit("zUnsafe.UnmarshalUnsafe(bufUnsafe)\n") g.emit("if !reflect.DeepEqual(x, zUnsafe) {\n") g.inIndent(func() { g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafe/UnmarshalUnsafe cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, zUnsafe))\n") }) g.emit("}\n") }) } func (g *testGenerator) emitTestMarshalUnmarshalSlicePreservesData(slice *sliceAPI) { for _, name := range []string{"binary", "hostarch"} { if !g.imports.markUsed(name) { panic(fmt.Sprintf("Generated test for '%s' referenced a non-existent import with local name '%s'", g.typeName(), name)) } } g.inTestFunction("TestSafeMarshalUnmarshalSlicePreservesData", func() { g.emit("var x, y, yUnsafe [8]%s\n", g.typeName()) g.emit("analysis.RandomizeValue(&x)\n\n") g.emit("size := (*%s)(nil).SizeBytes() * len(x)\n", g.typeName()) g.emit("buf := bytes.NewBuffer(make([]byte, size))\n") g.emit("buf.Reset()\n") g.emit("if err := binary.Write(buf, hostarch.ByteOrder, x[:]); err != nil {\n") g.inIndent(func() { g.emit("t.Fatal(fmt.Sprintf(\"binary.Write failed: %v\", err))\n") }) g.emit("}\n") g.emit("bufUnsafe := make([]byte, size)\n") g.emit("MarshalUnsafe%s(x[:], bufUnsafe)\n\n", slice.ident) g.emit("UnmarshalUnsafe%s(y[:], buf.Bytes())\n", slice.ident) g.emit("if !reflect.DeepEqual(x, y) {\n") g.inIndent(func() { g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across binary.Write/UnmarshalUnsafeSlice cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, y))\n") }) g.emit("}\n") g.emit("UnmarshalUnsafe%s(yUnsafe[:], 
bufUnsafe)\n", slice.ident) g.emit("if !reflect.DeepEqual(x, yUnsafe) {\n") g.inIndent(func() { g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across MarshalUnsafeSlice/UnmarshalUnsafeSlice cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, yUnsafe))\n") }) g.emit("}\n\n") }) } func (g *testGenerator) emitTestWriteToUnmarshalPreservesData() { g.inTestFunction("TestWriteToUnmarshalPreservesData", func() { g.emit("var x, y, yUnsafe %s\n", g.typeName()) g.emit("analysis.RandomizeValue(&x)\n\n") g.emit("var buf bytes.Buffer\n\n") g.emit("x.WriteTo(&buf)\n") g.emit("y.UnmarshalBytes(buf.Bytes())\n\n") g.emit("yUnsafe.UnmarshalUnsafe(buf.Bytes())\n\n") g.emit("if !reflect.DeepEqual(x, y) {\n") g.inIndent(func() { g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across WriteTo/UnmarshalBytes cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, y))\n") }) g.emit("}\n") g.emit("if !reflect.DeepEqual(x, yUnsafe) {\n") g.inIndent(func() { g.emit("t.Fatal(fmt.Sprintf(\"Data corrupted across WriteTo/UnmarshalUnsafe cycle:\\nBefore: %+v\\nAfter: %+v\\n\", x, yUnsafe))\n") }) g.emit("}\n") }) } func (g *testGenerator) emitTestSizeBytesOnTypedNilPtr() { g.inTestFunction("TestSizeBytesOnTypedNilPtr", func() { g.emit("var x %s\n", g.typeName()) g.emit("sizeFromConcrete := x.SizeBytes()\n") g.emit("sizeFromTypedNilPtr := (*%s)(nil).SizeBytes()\n\n", g.typeName()) g.emit("if sizeFromTypedNilPtr != sizeFromConcrete {\n") g.inIndent(func() { g.emit("t.Fatalf(\"SizeBytes() on typed nil pointer (%v) doesn't match size returned by a concrete object (%v).\\n\", sizeFromTypedNilPtr, sizeFromConcrete)\n") }) g.emit("}\n") }) } func (g *testGenerator) emitTestBoundCheck() { g.inTestFunction("TestCheckedMethods", func() { g.emit("var x %s\n", g.typeName()) g.emit("size := x.SizeBytes()\n") g.emit("b := make([]byte, size)\n\n") g.emit("if _, ok := x.CheckedMarshal(b[:size-1]); ok {\n") g.inIndent(func() { g.emit("t.Errorf(\"CheckedMarshal should have failed because buffer is small\")\n") }) g.emit("}\n") g.emit("if _, ok := x.CheckedMarshal(b); !ok {\n") g.inIndent(func() { g.emit("t.Errorf(\"CheckedMarshal should have succeeded because buffer size is okay\")\n") }) g.emit("}\n\n") g.emit("if _, ok := x.CheckedUnmarshal(b[:size-1]); ok {\n") g.inIndent(func() { g.emit("t.Errorf(\"CheckedUnmarshal should have failed because buffer is small\")\n") }) g.emit("}\n") g.emit("if _, ok := x.CheckedUnmarshal(b); !ok {\n") g.inIndent(func() { g.emit("t.Errorf(\"CheckedUnmarshal should have succeeded because buffer size is okay\")\n") }) g.emit("}\n") }) } func (g *testGenerator) emitTests(slice *sliceAPI, boundCheck bool) { g.emitTestNonZeroSize() g.emitTestSuspectAlignment() g.emitTestMarshalUnmarshalPreservesData() g.emitTestWriteToUnmarshalPreservesData() g.emitTestSizeBytesOnTypedNilPtr() if slice != nil { g.emitTestMarshalUnmarshalSlicePreservesData(slice) } if boundCheck { g.emitTestBoundCheck() } } func (g *testGenerator) write(out io.Writer) error { return g.sourceBuffer.write(out) } golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/gomarshal/util.go000066400000000000000000000331121465435605700254000ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gomarshal import ( "bytes" "flag" "fmt" "go/ast" "go/token" "io" "os" "path" "reflect" "sort" "strings" ) var debug = flag.Bool("debug", false, "enables debugging output") // receiverName returns an appropriate receiver name given a type spec. func receiverName(t *ast.TypeSpec) string { if len(t.Name.Name) < 1 { // Zero length type name? panic("unreachable") } return strings.ToLower(t.Name.Name[:1]) } // kindString returns a user-friendly representation of an AST expr type. func kindString(e ast.Expr) string { switch e.(type) { case *ast.Ident: return "scalar" case *ast.ArrayType: return "array" case *ast.StructType: return "struct" case *ast.StarExpr: return "pointer" case *ast.FuncType: return "function" case *ast.InterfaceType: return "interface" case *ast.MapType: return "map" case *ast.ChanType: return "channel" default: return reflect.TypeOf(e).String() } } func forEachStructField(st *ast.StructType, fn func(f *ast.Field)) { for _, field := range st.Fields.List { fn(field) } } // fieldDispatcher is a collection of callbacks for handling different types of // fields in a struct declaration. type fieldDispatcher struct { primitive func(n, t *ast.Ident) selector func(n, tX, tSel *ast.Ident) array func(n *ast.Ident, a *ast.ArrayType, t *ast.Ident) unhandled func(n *ast.Ident) } // Precondition: All dispatch callbacks that will be invoked must be // provided. func (fd fieldDispatcher) dispatch(f *ast.Field) { // Each field declaration may actually be multiple declarations of the same // type. For example, consider: // // type Point struct { // x, y, z int // } // // We invoke the call-backs once per such instance. // Handle embedded fields. Embedded fields have no names, but can be // referenced by the type name. if len(f.Names) < 1 { switch v := f.Type.(type) { case *ast.Ident: fd.primitive(v, v) case *ast.SelectorExpr: fd.selector(v.Sel, v.X.(*ast.Ident), v.Sel) default: // Note: Arrays can't be embedded, which is handled here. panic(fmt.Sprintf("Attempted to dispatch on embedded field of unsupported kind: %#v", f.Type)) } return } // Non-embedded field. for _, name := range f.Names { switch v := f.Type.(type) { case *ast.Ident: fd.primitive(name, v) case *ast.SelectorExpr: fd.selector(name, v.X.(*ast.Ident), v.Sel) case *ast.ArrayType: switch t := v.Elt.(type) { case *ast.Ident: fd.array(name, v, t) default: // Should be handled with a better error message during validate. panic(fmt.Sprintf("Array element type is of unsupported kind. Expected *ast.Ident, got %v", t)) } default: fd.unhandled(name) } } } // debugEnabled indicates whether debugging is enabled for gomarshal. func debugEnabled() bool { return *debug } // abort aborts the go_marshal tool with the given error message. func abort(msg string) { if !strings.HasSuffix(msg, "\n") { msg += "\n" } fmt.Print(msg) os.Exit(1) } // abortAt aborts the go_marshal tool with the given error message, with // a reference position to the input source. func abortAt(p token.Position, msg string) { abort(fmt.Sprintf("%v:\n %s\n", p, msg)) } // debugf conditionally prints a debug message. 
func debugf(f string, a ...any) { if debugEnabled() { fmt.Printf(f, a...) } } // debugfAt conditionally prints a debug message with a reference to a position // in the input source. func debugfAt(p token.Position, f string, a ...any) { if debugEnabled() { fmt.Printf("%s:\n %s", p, fmt.Sprintf(f, a...)) } } // emit generates a line of code in the output file. // // emit is a wrapper around writing a formatted string to the output // buffer. emit can be invoked in one of two ways: // // (1) emit("some string") // // When emit is called with a single string argument, it is simply copied to // the output buffer without any further formatting. // // (2) emit(fmtString, args...) // // emit can also be invoked in a similar fashion to *Printf() functions, // where the first argument is a format string. // // Calling emit with a single argument that is not a string will result in a // panic, as the caller's intent is ambiguous. func emit(out io.Writer, indent int, a ...any) { const spacesPerIndentLevel = 4 if len(a) < 1 { panic("emit() called with no arguments") } if indent > 0 { if _, err := fmt.Fprint(out, strings.Repeat(" ", indent*spacesPerIndentLevel)); err != nil { // Writing to the emit output should not fail. Typically the output // is a byte.Buffer; writes to these never fail. panic(err) } } first, ok := a[0].(string) if !ok { // First argument must be either the string to emit (case 1 from // function-level comment), or a format string (case 2). panic(fmt.Sprintf("First argument to emit() is not a string: %+v", a[0])) } if len(a) == 1 { // Single string argument. Assume no formatting requested. if _, err := fmt.Fprint(out, first); err != nil { // Writing to out should not fail. panic(err) } return } // Formatting requested. if _, err := fmt.Fprintf(out, first, a[1:]...); err != nil { // Writing to out should not fail. panic(err) } } // sourceBuffer represents fragments of generated go source code. // // sourceBuffer provides a convenient way to build up go souce fragments in // memory. May be safely zero-value initialized. Not thread-safe. type sourceBuffer struct { // Current indentation level. indent int // Memory buffer containing contents while they're being generated. b bytes.Buffer } func (b *sourceBuffer) reset() { b.indent = 0 b.b.Reset() } func (b *sourceBuffer) incIndent() { b.indent++ } func (b *sourceBuffer) decIndent() { if b.indent <= 0 { panic("decIndent() without matching incIndent()") } b.indent-- } func (b *sourceBuffer) emit(a ...any) { emit(&b.b, b.indent, a...) } func (b *sourceBuffer) emitNoIndent(a ...any) { emit(&b.b, 0 /*indent*/, a...) } func (b *sourceBuffer) inIndent(body func()) { b.incIndent() body() b.decIndent() } func (b *sourceBuffer) write(out io.Writer) error { _, err := fmt.Fprint(out, b.b.String()) return err } // Write implements io.Writer.Write. func (b *sourceBuffer) Write(buf []byte) (int, error) { return (b.b.Write(buf)) } // importStmt represents a single import statement. type importStmt struct { // Local name of the imported package. name string // Import path. path string // Indicates whether the local name is an alias, or simply the final // component of the path. aliased bool // Indicates whether this import was referenced by generated code. used bool // AST node and file set representing the import statement, if any. These // are only non-nil if the import statement originates from an input source // file. 
spec *ast.ImportSpec fset *token.FileSet } func newImport(p string) *importStmt { name := path.Base(p) return &importStmt{ name: name, path: p, aliased: false, } } func newImportFromSpec(spec *ast.ImportSpec, f *token.FileSet) *importStmt { p := spec.Path.Value[1 : len(spec.Path.Value)-1] // Strip the " quotes around path. name := path.Base(p) if name == "" || name == "/" || name == "." { panic(fmt.Sprintf("Couldn't process local package name for import at %s, (processed as %s)", f.Position(spec.Path.Pos()), name)) } if spec.Name != nil { name = spec.Name.Name } return &importStmt{ name: name, path: p, aliased: spec.Name != nil, spec: spec, fset: f, } } // String implements fmt.Stringer.String. This generates a string for the import // statement appropriate for writing directly to generated code. func (i *importStmt) String() string { if i.aliased { return fmt.Sprintf("%s %q", i.name, i.path) } return fmt.Sprintf("%q", i.path) } // debugString returns a debug string representing an import statement. This // representation is not valid golang code and is used for debugging output. func (i *importStmt) debugString() string { if i.spec != nil && i.fset != nil { return fmt.Sprintf("%s: %s", i.fset.Position(i.spec.Path.Pos()), i) } return fmt.Sprintf("(go-marshal import): %s", i) } func (i *importStmt) markUsed() { i.used = true } func (i *importStmt) equivalent(other *importStmt) bool { return i.name == other.name && i.path == other.path && i.aliased == other.aliased } // importTable represents a collection of importStmts. // // An importTable may contain multiple import statements referencing the same // local name. All import statements aliasing to the same local name are // technically ambiguous, as if such an import name is used in the generated // code, it's not clear which import statement it refers to. We ignore any // potential collisions until actually writing the import table to the generated // source file. See importTable.write. // // Given the following import statements across all the files comprising a // package marshalled: // // "sync" // "pkg/sync" // "pkg/sentry/kernel" // ktime "pkg/sentry/kernel/time" // // An importTable representing them would look like this: // // importTable { // is: map[string][]*importStmt { // "sync": []*importStmt{ // importStmt{name:"sync", path:"sync", aliased:false} // importStmt{name:"sync", path:"pkg/sync", aliased:false} // }, // "kernel": []*importStmt{importStmt{ // name: "kernel", // path: "pkg/sentry/kernel", // aliased: false // }}, // "ktime": []*importStmt{importStmt{ // name: "ktime", // path: "pkg/sentry/kernel/time", // aliased: true, // }}, // } // } // // Note that the local name "sync" is assigned to two different import // statements. This is possible if the import statements are from different // source files in the same package. // // Since go-marshal generates a single output file per package regardless of the // number of input files, if "sync" is referenced by any generated code, it's // unclear which import statement "sync" refers to. While it's theoretically // possible to resolve this by assigning a unique local alias to each instance // of the sync package, go-marshal currently aborts when it encounters such an // ambiguity. // // TODO(b/151478251): importTable considers the final component of an import // path to be the package name, but this is only a convention. The actual // package name is determined by the package statement in the source files for // the package. 
type importTable struct { // Map of imports and whether they should be copied to the output. is map[string][]*importStmt } func newImportTable() *importTable { return &importTable{ is: make(map[string][]*importStmt), } } // Merges import statements from other into i. func (i *importTable) merge(other *importTable) { for name, ims := range other.is { i.is[name] = append(i.is[name], ims...) } } func (i *importTable) addStmt(s *importStmt) *importStmt { i.is[s.name] = append(i.is[s.name], s) return s } func (i *importTable) add(s string) *importStmt { n := newImport(s) return i.addStmt(n) } func (i *importTable) addFromSpec(spec *ast.ImportSpec, f *token.FileSet) *importStmt { return i.addStmt(newImportFromSpec(spec, f)) } // Marks the import named n as used. If no such import is in the table, returns // false. func (i *importTable) markUsed(n string) bool { if ns, ok := i.is[n]; ok { for _, n := range ns { n.markUsed() } return true } return false } func (i *importTable) clear() { for _, is := range i.is { for _, i := range is { i.used = false } } } func (i *importTable) write(out io.Writer) error { if len(i.is) == 0 { // Nothing to import, we're done. return nil } imports := make([]string, 0, len(i.is)) for name, is := range i.is { var lastUsed *importStmt var ambiguous bool for _, i := range is { if i.used { if lastUsed != nil { if !i.equivalent(lastUsed) { ambiguous = true } } lastUsed = i } } if ambiguous { // We have two or more import statements across the different source // files that share a local name, and at least one of these imports // are used by the generated code. This ambiguity can't be resolved // by go-marshal and requires the user intervention. Dump a list of // the colliding import statements and let the user modify the input // files as appropriate. var b strings.Builder fmt.Fprintf(&b, "The imported name %q is used by one of the types marked for marshalling, and which import statement the code refers to is ambiguous. Perhaps give the imports unique local names?\n\n", name) fmt.Fprintf(&b, "The following %d import statements are ambiguous for the local name %q:\n", len(is), name) // Note: len(is) is guaranteed to be 1 or greater or ambiguous can't // be true. Therefore the slicing below is safe. for _, i := range is[:len(is)-1] { fmt.Fprintf(&b, " %v\n", i.debugString()) } fmt.Fprintf(&b, " %v", is[len(is)-1].debugString()) panic(b.String()) } if lastUsed != nil { imports = append(imports, lastUsed.String()) } } sort.Strings(imports) var b sourceBuffer b.emit("import (\n") b.incIndent() for _, i := range imports { b.emit("%s\n", i) } b.decIndent() b.emit(")\n\n") return b.write(out) } golang-gvisor-gvisor-0.0~20240729.0/tools/go_marshal/main.go000066400000000000000000000042621465435605700233760ustar00rootroot00000000000000// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // go_marshal is a code generation utility for automatically generating code to // marshal go data structures to memory. 
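// (Hypothetical invocation, using the flags defined below; the real arguments
// are normally supplied by the bazel rule rather than typed by hand:
//
//	go_marshal -pkg=linux -output=linux_abi_autogen_unsafe.go \
//	    -output_test=linux_abi_autogen_test.go defs.go
//
// The package and file names here are placeholders.)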
// // This binary is typically run as part of the build process, and is invoked by // the go_marshal bazel rule defined in defs.bzl. // // See README.md. package main import ( "flag" "fmt" "os" "strings" "gvisor.dev/gvisor/tools/go_marshal/gomarshal" ) var ( pkg = flag.String("pkg", "", "output package") output = flag.String("output", "", "output file") outputTest = flag.String("output_test", "", "output file for tests") outputTestUnconditional = flag.String("output_test_unconditional", "", "output file for unconditional tests") imports = flag.String("imports", "", "comma-separated list of extra packages to import in generated code") ) func main() { flag.Usage = func() { fmt.Fprintf(os.Stderr, "Usage: %s \n", os.Args[0]) flag.PrintDefaults() } flag.Parse() if len(flag.Args()) == 0 { flag.Usage() os.Exit(1) } if *pkg == "" { flag.Usage() fmt.Fprint(os.Stderr, "Flag -pkg must be provided.\n") os.Exit(1) } var extraImports []string if len(*imports) > 0 { // Note: strings.Split(s, sep) returns s if sep doesn't exist in s. Thus // we check for an empty imports list to avoid emitting an empty string // as an import. extraImports = strings.Split(*imports, ",") } g, err := gomarshal.NewGenerator(flag.Args(), *output, *outputTest, *outputTestUnconditional, *pkg, extraImports) if err != nil { panic(err) } if err := g.Run(); err != nil { panic(err) } } golang-gvisor-gvisor-0.0~20240729.0/tools/go_stateify/000077500000000000000000000000001465435605700223205ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/go_stateify/BUILD000066400000000000000000000005761465435605700231120ustar00rootroot00000000000000load("//tools:defs.bzl", "bzl_library", "go_binary") package( default_applicable_licenses = ["//:license"], licenses = ["notice"], ) go_binary( name = "stateify", srcs = ["main.go"], visibility = ["//:sandbox"], deps = ["//tools/constraintutil"], ) bzl_library( name = "defs_bzl", srcs = ["defs.bzl"], visibility = ["//visibility:private"], ) golang-gvisor-gvisor-0.0~20240729.0/tools/go_stateify/defs.bzl000066400000000000000000000036231465435605700237560ustar00rootroot00000000000000"""Stateify is a tool for generating state wrappers for Go types.""" def _go_stateify_impl(ctx): """Implementation for the stateify tool.""" output = ctx.outputs.out # Run the stateify command. args = ["-output=%s" % output.path] args.append("-fullpkg=%s" % ctx.attr.package) if ctx.attr._statepkg: args.append("-statepkg=%s" % ctx.attr._statepkg) if ctx.attr.imports: args.append("-imports=%s" % ",".join(ctx.attr.imports)) args.append("--") for src in ctx.attr.srcs: args += [f.path for f in src.files.to_list()] ctx.actions.run( inputs = ctx.files.srcs, outputs = [output], mnemonic = "GoStateify", progress_message = "Generating state library %s" % ctx.label, arguments = args, executable = ctx.executable._tool, ) go_stateify = rule( implementation = _go_stateify_impl, doc = "Generates save and restore logic from a set of Go files.", attrs = { "srcs": attr.label_list( doc = """ The input source files. These files should include all structs in the package that need to be saved. """, mandatory = True, allow_files = True, ), "imports": attr.string_list( doc = """ An optional list of extra non-aliased, Go-style absolute import paths required for statified types. 
""", mandatory = False, ), "package": attr.string( doc = "The fully qualified package name for the input sources.", mandatory = True, ), "out": attr.output( doc = "Name of the generator output file.", mandatory = True, ), "_tool": attr.label( executable = True, cfg = "exec", default = Label("//tools/go_stateify:stateify"), ), "_statepkg": attr.string(default = "gvisor.dev/gvisor/pkg/state"), }, ) golang-gvisor-gvisor-0.0~20240729.0/tools/go_stateify/main.go000066400000000000000000000405101465435605700235730ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Stateify provides a simple way to generate Load/Save methods based on // existing types and struct tags. package main import ( "flag" "fmt" "go/ast" "go/parser" "go/token" "os" "path/filepath" "reflect" "strings" "sync" "gvisor.dev/gvisor/tools/constraintutil" ) var ( fullPkg = flag.String("fullpkg", "", "fully qualified output package") imports = flag.String("imports", "", "extra imports for the output file") output = flag.String("output", "", "output file") statePkg = flag.String("statepkg", "", "state import package; defaults to empty") ) // resolveTypeName returns a qualified type name. func resolveTypeName(typ ast.Expr) (field string, qualified string) { for done := false; !done; { // Resolve star expressions. switch rs := typ.(type) { case *ast.StarExpr: qualified += "*" typ = rs.X case *ast.ArrayType: if rs.Len == nil { // Slice type declaration. qualified += "[]" } else { // Array type declaration. qualified += "[" + rs.Len.(*ast.BasicLit).Value + "]" } typ = rs.Elt default: // No more descent. done = true } } // Resolve a package selector. sel, ok := typ.(*ast.SelectorExpr) if ok { qualified = qualified + sel.X.(*ast.Ident).Name + "." typ = sel.Sel } // Figure out actual type name. field = typ.(*ast.Ident).Name qualified = qualified + field return } // extractStateTag pulls the relevant state tag. func extractStateTag(tag *ast.BasicLit) string { if tag == nil { return "" } if len(tag.Value) < 2 { return "" } return reflect.StructTag(tag.Value[1 : len(tag.Value)-1]).Get("state") } // scanFunctions is a set of functions passed to scanFields. type scanFunctions struct { zerovalue func(name string) normal func(name string) wait func(name string) value func(name, typName string) } // scanFields scans the fields of a struct. // // Each provided function will be applied to appropriately tagged fields, or // skipped if nil. // // Fields tagged nosave are skipped. func scanFields(ss *ast.StructType, fn scanFunctions) { if ss.Fields.List == nil { // No fields. return } // Scan all fields. for _, field := range ss.Fields.List { if field.Names == nil { // Anonymous types can't be embedded, so we don't need // to worry about providing a useful name here. name, _ := resolveTypeName(field.Type) scanField(name, field, fn) continue } // Iterate over potentially multiple fields defined on the same line. for _, nameI := range field.Names { name := nameI.Name // Skip _ fields. 
if name == "_" { continue } scanField(name, field, fn) } } } // scanField scans a single struct field with a resolved name. func scanField(name string, field *ast.Field, fn scanFunctions) { // Is this a anonymous struct? If yes, then continue the // recursion with the given prefix. We don't pay attention to // any tags on the top-level struct field. tag := extractStateTag(field.Tag) if anon, ok := field.Type.(*ast.StructType); ok && tag == "" { scanFields(anon, fn) return } switch tag { case "zerovalue": if fn.zerovalue != nil { fn.zerovalue(name) } case "": if fn.normal != nil { fn.normal(name) } case "wait": if fn.wait != nil { fn.wait(name) } case "manual", "nosave", "ignore": // Do nothing. default: if strings.HasPrefix(tag, ".(") && strings.HasSuffix(tag, ")") { if fn.value != nil { fn.value(name, tag[2:len(tag)-1]) } } } } func camelCased(name string) string { return strings.ToUpper(name[:1]) + name[1:] } func main() { // Parse flags. flag.Usage = func() { fmt.Fprintf(os.Stderr, "Usage: %s [options]\n", os.Args[0]) flag.PrintDefaults() } flag.Parse() if len(flag.Args()) == 0 { flag.Usage() os.Exit(1) } if *fullPkg == "" { fmt.Fprintf(os.Stderr, "Error: package required.") os.Exit(1) } // Open the output file. var ( outputFile *os.File err error ) if *output == "" || *output == "-" { outputFile = os.Stdout } else { outputFile, err = os.OpenFile(*output, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { fmt.Fprintf(os.Stderr, "Error opening output %q: %v", *output, err) } defer outputFile.Close() } // Set the statePrefix for below, depending on the import. statePrefix := "" if *statePkg != "" { parts := strings.Split(*statePkg, "/") statePrefix = parts[len(parts)-1] + "." } // initCalls is dumped at the end. var initCalls []string // Common closures. emitRegister := func(name string) { initCalls = append(initCalls, fmt.Sprintf("%sRegister((*%s)(nil))", statePrefix, name)) } // Automated warning. fmt.Fprint(outputFile, "// automatically generated by stateify.\n\n") // Emit build constraints. bcexpr, err := constraintutil.CombineFromFiles(flag.Args()) if err != nil { fmt.Fprintf(os.Stderr, "Failed to infer build constraints: %v", err) os.Exit(1) } outputFile.WriteString(constraintutil.Lines(bcexpr)) // Emit the package name. _, pkg := filepath.Split(*fullPkg) fmt.Fprintf(outputFile, "package %s\n\n", pkg) // Emit the imports lazily. var once sync.Once maybeEmitImports := func() { once.Do(func() { // Emit the imports. fmt.Fprint(outputFile, "import (\n") fmt.Fprint(outputFile, " \"context\"\n") if *statePkg != "" { fmt.Fprintf(outputFile, " \"%s\"\n", *statePkg) } if *imports != "" { for _, i := range strings.Split(*imports, ",") { fmt.Fprintf(outputFile, " \"%s\"\n", i) } } fmt.Fprint(outputFile, ")\n\n") }) } files := make([]*ast.File, 0, len(flag.Args())) // Parse the input files. for _, filename := range flag.Args() { // Parse the file. fset := token.NewFileSet() f, err := parser.ParseFile(fset, filename, nil, parser.ParseComments) if err != nil { // Not a valid input file? fmt.Fprintf(os.Stderr, "Input %q can't be parsed: %v\n", filename, err) os.Exit(1) } files = append(files, f) } type method struct { typeName string methodName string } // Search for and add all method to a set. We auto-detecting several // different methods (and insert them if we don't find them, in order // to ensure that expectations match reality). // // While we do this, figure out the right receiver name. If there are // multiple distinct receivers, then we will just pick the last one. 
simpleMethods := make(map[method]struct{}) receiverNames := make(map[string]string) for _, f := range files { // Go over all functions. for _, decl := range f.Decls { d, ok := decl.(*ast.FuncDecl) if !ok { continue } if d.Recv == nil || len(d.Recv.List) != 1 { // Not a named method. continue } // Save the method and the receiver. name, _ := resolveTypeName(d.Recv.List[0].Type) simpleMethods[method{ typeName: name, methodName: d.Name.Name, }] = struct{}{} if len(d.Recv.List[0].Names) > 0 { receiverNames[name] = d.Recv.List[0].Names[0].Name } } } for _, f := range files { // Go over all named types. for _, decl := range f.Decls { d, ok := decl.(*ast.GenDecl) if !ok || d.Tok != token.TYPE { continue } // Only generate code for types marked "// +stateify // savable" in one of the proceeding comment lines. If // the line is marked "// +stateify type" then only // generate type information and register the type. // If the type also has a "// +stateify identtype" // comment, the functions are instead generated to refer to // the type that this newly-defined type is identical to, rather // than about the newly-defined type itself. if d.Doc == nil { continue } var ( generateTypeInfo = false generateSaverLoader = false isIdentType = false ) for _, l := range d.Doc.List { if l.Text == "// +stateify savable" { generateTypeInfo = true generateSaverLoader = true } if l.Text == "// +stateify type" { generateTypeInfo = true } if l.Text == "// +stateify identtype" { isIdentType = true } } if !generateTypeInfo && !generateSaverLoader { continue } for _, gs := range d.Specs { ts := gs.(*ast.TypeSpec) recv, ok := receiverNames[ts.Name.Name] if !ok { // Maybe no methods were defined? recv = strings.ToLower(ts.Name.Name[:1]) } switch x := ts.Type.(type) { case *ast.StructType: maybeEmitImports() if isIdentType { fmt.Fprintf(os.Stderr, "Cannot use `+stateify identtype` on a struct type (%v); must be a type definition of an identical type.", ts.Name.Name) os.Exit(1) } // Record the slot for each field. fieldCount := 0 fields := make(map[string]int) emitField := func(name string) { fmt.Fprintf(outputFile, " \"%s\",\n", name) fields[name] = fieldCount fieldCount++ } emitFieldValue := func(name string, _ string) { emitField(name) } emitLoadValue := func(name, typName string) { fmt.Fprintf(outputFile, " stateSourceObject.LoadValue(%d, new(%s), func(y any) { %s.load%s(ctx, y.(%s)) })\n", fields[name], typName, recv, camelCased(name), typName) } emitLoad := func(name string) { fmt.Fprintf(outputFile, " stateSourceObject.Load(%d, &%s.%s)\n", fields[name], recv, name) } emitLoadWait := func(name string) { fmt.Fprintf(outputFile, " stateSourceObject.LoadWait(%d, &%s.%s)\n", fields[name], recv, name) } emitSaveValue := func(name, typName string) { // Emit typName to be more robust against code generation bugs, // but instead of one line make two lines to silence ST1023 // finding (i.e. 
avoid nogo finding: "should omit type $typName // from declaration; it will be inferred from the right-hand side") fmt.Fprintf(outputFile, " var %sValue %s\n", name, typName) fmt.Fprintf(outputFile, " %sValue = %s.save%s()\n", name, recv, camelCased(name)) fmt.Fprintf(outputFile, " stateSinkObject.SaveValue(%d, %sValue)\n", fields[name], name) } emitSave := func(name string) { fmt.Fprintf(outputFile, " stateSinkObject.Save(%d, &%s.%s)\n", fields[name], recv, name) } emitZeroCheck := func(name string) { fmt.Fprintf(outputFile, " if !%sIsZeroValue(&%s.%s) { %sFailf(\"%s is %%#v, expected zero\", &%s.%s) }\n", statePrefix, recv, name, statePrefix, name, recv, name) } // Generate the type name method. fmt.Fprintf(outputFile, "func (%s *%s) StateTypeName() string {\n", recv, ts.Name.Name) fmt.Fprintf(outputFile, " return \"%s.%s\"\n", *fullPkg, ts.Name.Name) fmt.Fprintf(outputFile, "}\n\n") // Generate the fields method. fmt.Fprintf(outputFile, "func (%s *%s) StateFields() []string {\n", recv, ts.Name.Name) fmt.Fprintf(outputFile, " return []string{\n") scanFields(x, scanFunctions{ normal: emitField, wait: emitField, value: emitFieldValue, }) fmt.Fprintf(outputFile, " }\n") fmt.Fprintf(outputFile, "}\n\n") // Define beforeSave if a definition was not found. This prevents // the code from compiling if a custom beforeSave was defined in a // file not provided to this binary and prevents inherited methods // from being called multiple times by overriding them. if _, ok := simpleMethods[method{ typeName: ts.Name.Name, methodName: "beforeSave", }]; !ok && generateSaverLoader { fmt.Fprintf(outputFile, "func (%s *%s) beforeSave() {}\n\n", recv, ts.Name.Name) } // Generate the save method. // // N.B. For historical reasons, we perform the value saves first, // and perform the value loads last. There should be no dependency // on this specific behavior, but the ability to specify slots // allows a manual implementation to be order-dependent. if generateSaverLoader { fmt.Fprintf(outputFile, "// +checklocksignore\n") fmt.Fprintf(outputFile, "func (%s *%s) StateSave(stateSinkObject %sSink) {\n", recv, ts.Name.Name, statePrefix) fmt.Fprintf(outputFile, " %s.beforeSave()\n", recv) scanFields(x, scanFunctions{zerovalue: emitZeroCheck}) scanFields(x, scanFunctions{value: emitSaveValue}) scanFields(x, scanFunctions{normal: emitSave, wait: emitSave}) fmt.Fprintf(outputFile, "}\n\n") } // Define afterLoad if a definition was not found. We do this for // the same reason that we do it for beforeSave. _, hasAfterLoad := simpleMethods[method{ typeName: ts.Name.Name, methodName: "afterLoad", }] if !hasAfterLoad && generateSaverLoader { fmt.Fprintf(outputFile, "func (%s *%s) afterLoad(context.Context) {}\n\n", recv, ts.Name.Name) } // Generate the load method. // // N.B. See the comment above for the save method. if generateSaverLoader { fmt.Fprintf(outputFile, "// +checklocksignore\n") fmt.Fprintf(outputFile, "func (%s *%s) StateLoad(ctx context.Context, stateSourceObject %sSource) {\n", recv, ts.Name.Name, statePrefix) scanFields(x, scanFunctions{normal: emitLoad, wait: emitLoadWait}) scanFields(x, scanFunctions{value: emitLoadValue}) if hasAfterLoad { // The call to afterLoad is made conditionally, because when // AfterLoad is called, the object encodes a dependency on // referred objects (i.e. fields). This means that afterLoad // will not be called until the other afterLoads are called. 
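// (Illustrative sketch, for a hypothetical type with receiver f: the line
// emitted below reads `stateSourceObject.AfterLoad(func () { f.afterLoad(ctx) })`,
// so f.afterLoad runs only after the objects it depends on have been loaded.)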
fmt.Fprintf(outputFile, " stateSourceObject.AfterLoad(func () { %s.afterLoad(ctx) })\n", recv) } fmt.Fprintf(outputFile, "}\n\n") } // Add to our registration. emitRegister(ts.Name.Name) case *ast.Ident, *ast.SelectorExpr, *ast.ArrayType: maybeEmitImports() // Generate the info methods. fmt.Fprintf(outputFile, "func (%s *%s) StateTypeName() string {\n", recv, ts.Name.Name) fmt.Fprintf(outputFile, " return \"%s.%s\"\n", *fullPkg, ts.Name.Name) fmt.Fprintf(outputFile, "}\n\n") if !isIdentType { fmt.Fprintf(outputFile, "func (%s *%s) StateFields() []string {\n", recv, ts.Name.Name) fmt.Fprintf(outputFile, " return nil\n") fmt.Fprintf(outputFile, "}\n\n") } else { var typeName string switch y := x.(type) { case *ast.Ident: typeName = y.Name case *ast.SelectorExpr: expIdent, ok := y.X.(*ast.Ident) if !ok { fmt.Fprintf(os.Stderr, "Cannot use non-ident %v (type %T) in type selector expression %v", y.X, y.X, y) os.Exit(1) } typeName = fmt.Sprintf("%s.%s", expIdent.Name, y.Sel.Name) default: fmt.Fprintf(os.Stderr, "Cannot use `+stateify identtype` on a non-identifier/non-selector type definition (%v => %v of type %T); must be a type definition of an identical type.", ts.Name.Name, x, x) os.Exit(1) } fmt.Fprintf(outputFile, "func (%s *%s) StateFields() []string {\n", recv, ts.Name.Name) fmt.Fprintf(outputFile, " return (*%s)(%s).StateFields()\n", typeName, recv) fmt.Fprintf(outputFile, "}\n\n") if generateSaverLoader { fmt.Fprintf(outputFile, "// +checklocksignore\n") fmt.Fprintf(outputFile, "func (%s *%s) StateSave(stateSinkObject %sSink) {\n", recv, ts.Name.Name, statePrefix) fmt.Fprintf(outputFile, " (*%s)(%s).StateSave(stateSinkObject)\n", typeName, recv) fmt.Fprintf(outputFile, "}\n\n") fmt.Fprintf(outputFile, "// +checklocksignore\n") fmt.Fprintf(outputFile, "func (%s *%s) StateLoad(ctx context.Context, stateSourceObject %sSource) {\n", recv, ts.Name.Name, statePrefix) fmt.Fprintf(outputFile, " (*%s)(%s).StateLoad(ctx, stateSourceObject)\n", typeName, recv) fmt.Fprintf(outputFile, "}\n\n") } } // See above. emitRegister(ts.Name.Name) } } } } if len(initCalls) > 0 { // Emit the init() function. fmt.Fprintf(outputFile, "func init() {\n") for _, ic := range initCalls { fmt.Fprintf(outputFile, " %s\n", ic) } fmt.Fprintf(outputFile, "}\n") } } golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/000077500000000000000000000000001465435605700205765ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/000077500000000000000000000000001465435605700213415ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/bpf/000077500000000000000000000000001465435605700221105ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/bpf/drop.ebpf.c000066400000000000000000000016631465435605700241410ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
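// Note on the XDP programs in this directory (drop, pass, tcpdump, redirect,
// tunnel): an XDP program runs once per received frame and its return value
// decides the frame's fate. XDP_DROP discards the frame in the driver,
// XDP_PASS hands it to the normal Linux network stack, and
// bpf_redirect_map() returns XDP_REDIRECT on a successful map lookup (sending
// the frame to the AF_XDP socket or device stored in the map) or the fallback
// action passed as its last argument (XDP_PASS in these programs) when the
// lookup fails.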
#include #define section(secname) __attribute__((section(secname), used)) char __license[] section("license") = "Apache-2.0"; // You probably shouldn't change the section or function name. Each is used by // BPF tooling, and so changes can cause runtime failures. section("xdp") int xdp_prog(struct xdp_md *ctx) { return XDP_DROP; } golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/bpf/pass.ebpf.c000066400000000000000000000016631465435605700241430ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #define section(secname) __attribute__((section(secname), used)) char __license[] section("license") = "Apache-2.0"; // You probably shouldn't change the section or function name. Each is used by // BPF tooling, and so changes can cause runtime failures. section("xdp") int xdp_prog(struct xdp_md *ctx) { return XDP_PASS; } golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/bpf/redirect_host.ebpf.c000066400000000000000000000054061465435605700260320ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // clang-format off // Contains types needed by later headers. #include // clang-format on #include #include #include #include #include #include #include #define section(secname) __attribute__((section(secname), used)) char __license[] section("license") = "Apache-2.0"; // Note: bpf_helpers.h includes a struct definition for bpf_map_def in some, but // not all, environments. Define our own equivalent struct to avoid issues with // multiple declarations. struct gvisor_bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; unsigned int map_flags; }; struct gvisor_bpf_map_def section("maps") sock_map = { .type = BPF_MAP_TYPE_XSKMAP, // Note: "XSK" means AF_XDP socket. .key_size = sizeof(__u32), .value_size = sizeof(__u32), .max_entries = 1, }; // Redirect almost all incoming traffic to an AF_XDP socket. Certain packets are // allowed through to the Linux network stack: // // - SSH (IPv4 TCP port 22) traffic. // - Some obviously broken packets. section("xdp") int xdp_prog(struct xdp_md *ctx) { void *cursor = (void *)(long)ctx->data; void *data_end = (void *)(long)ctx->data_end; // Ensure there's space for an ethernet header. struct ethhdr *eth = cursor; if ((void *)(eth + 1) > data_end) { return XDP_PASS; } cursor += sizeof(*eth); // Send all non-IPv4 traffic to the socket. 
if (eth->h_proto != bpf_htons(ETH_P_IP)) { return bpf_redirect_map(&sock_map, ctx->rx_queue_index, XDP_PASS); } // IP packets get inspected to allow SSH traffic to the host. struct iphdr *ip = cursor; if ((void *)(ip + 1) > data_end) { return XDP_PASS; } cursor += sizeof(*ip); if (ip->protocol != IPPROTO_TCP) { return bpf_redirect_map(&sock_map, ctx->rx_queue_index, XDP_PASS); } struct tcphdr *tcp = cursor; if ((void *)(tcp + 1) > data_end) { return XDP_PASS; } // Allow port 22 traffic for SSH debugging. if (tcp->th_dport == bpf_htons(22)) { return XDP_PASS; } return bpf_redirect_map(&sock_map, ctx->rx_queue_index, XDP_PASS); } golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/bpf/tcpdump.ebpf.c000066400000000000000000000034571465435605700246540ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // clang-format off // Contains types needed by later headers. #include // clang-format on #include #include #define section(secname) __attribute__((section(secname), used)) char __license[] section("license") = "Apache-2.0"; // Note: bpf_helpers.h includes a struct definition for bpf_map_def in some, but // not all, environments. Define our own equivalent struct to avoid issues with // multiple declarations. struct gvisor_bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; unsigned int map_flags; }; // A map of RX queue number to AF_XDP socket. We only ever use one key: 0. struct gvisor_bpf_map_def section("maps") sock_map = { .type = BPF_MAP_TYPE_XSKMAP, // Note: "XSK" means AF_XDP socket. .key_size = sizeof(int), .value_size = sizeof(int), .max_entries = 1, }; section("xdp") int xdp_prog(struct xdp_md *ctx) { // Lookup the socket for the current RX queue. Veth devices by default have // only one RX queue. If one is found, redirect the packet to that socket. // Otherwise pass it on to the kernel network stack. return bpf_redirect_map(&sock_map, ctx->rx_queue_index, XDP_PASS); } golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/bpf/tunnel_host.ebpf.c000066400000000000000000000053421465435605700255350ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // clang-format off // Contains types needed by later headers. 
#include // clang-format on #include #include #include #include #include #include #include #define section(secname) __attribute__((section(secname), used)) char __license[] section("license") = "Apache-2.0"; // Note: bpf_helpers.h includes a struct definition for bpf_map_def in some, but // not all, environments. Define our own equivalent struct to avoid issues with // multiple declarations. struct gvisor_bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; unsigned int map_flags; }; struct gvisor_bpf_map_def section("maps") dev_map = { .type = BPF_MAP_TYPE_DEVMAP, .key_size = sizeof(__u32), .value_size = sizeof(__u32), .max_entries = 1, }; // Redirect almost all incoming traffic to go out another device. Certain // packets are allowed through to the Linux network stack: // // - SSH (IPv4 TCP port 22) traffic. // - Some obviously broken packets. section("xdp") int xdp_prog(struct xdp_md *ctx) { void *cursor = (void *)(long)ctx->data; void *data_end = (void *)(long)ctx->data_end; // Ensure there's space for an ethernet header. struct ethhdr *eth = cursor; if ((void *)(eth + 1) > data_end) { return XDP_PASS; } cursor += sizeof(*eth); // Send all non-IPv4 traffic to the socket. if (eth->h_proto != bpf_htons(ETH_P_IP)) { return bpf_redirect_map(&dev_map, ctx->rx_queue_index, XDP_PASS); } // IP packets get inspected to allow SSH traffic to the host. struct iphdr *ip = cursor; if ((void *)(ip + 1) > data_end) { return XDP_PASS; } cursor += sizeof(*ip); if (ip->protocol != IPPROTO_TCP) { return bpf_redirect_map(&dev_map, ctx->rx_queue_index, XDP_PASS); } struct tcphdr *tcp = cursor; if ((void *)(tcp + 1) > data_end) { return XDP_PASS; } // Allow port 22 traffic for SSH debugging. if (tcp->th_dport == bpf_htons(22)) { return XDP_PASS; } return bpf_redirect_map(&dev_map, ctx->rx_queue_index, XDP_PASS); } golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/cmd.go000066400000000000000000000063501465435605700224370ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build amd64 || arm64 // +build amd64 arm64 // Package cmd implements the subcommands of xdp_loader. package cmd import ( "bytes" _ "embed" "fmt" "log" "net" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "golang.org/x/sys/unix" ) func runBasicProgram(progData []byte, device string, deviceIndex int) error { iface, err := getIface(device, deviceIndex) if err != nil { return fmt.Errorf("%v", err) } // Load into the kernel. 
spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(progData)) if err != nil { return fmt.Errorf("failed to load spec: %v", err) } var objects struct { Program *ebpf.Program `ebpf:"xdp_prog"` } if err := spec.LoadAndAssign(&objects, nil); err != nil { return fmt.Errorf("failed to load program: %v", err) } defer func() { if err := objects.Program.Close(); err != nil { fmt.Printf("failed to close program: %v", err) } }() _, cleanup, err := attach(objects.Program, iface) if err != nil { return fmt.Errorf("failed to attach: %v", err) } defer cleanup() waitForever() return nil } func getIface(device string, deviceIndex int) (*net.Interface, error) { switch { case device != "" && deviceIndex != 0: return nil, fmt.Errorf("device specified twice") case device != "": iface, err := net.InterfaceByName(device) if err != nil { return nil, fmt.Errorf("unknown device %q: %v", device, err) } return iface, nil case deviceIndex != 0: iface, err := net.InterfaceByIndex(deviceIndex) if err != nil { return nil, fmt.Errorf("unknown device with index %d: %v", deviceIndex, err) } return iface, nil default: return nil, fmt.Errorf("no device specified") } } func attach(program *ebpf.Program, iface *net.Interface) (link.Link, func(), error) { // Attach the program to the XDP hook on the device. Fallback from best // to worst mode. modes := []struct { name string flag link.XDPAttachFlags }{ {name: "offload", flag: link.XDPOffloadMode}, {name: "driver", flag: link.XDPDriverMode}, {name: "generic", flag: link.XDPGenericMode}, } var attached link.Link var err error for _, mode := range modes { attached, err = link.AttachXDP(link.XDPOptions{ Program: program, Interface: iface.Index, Flags: mode.flag, }) if err == nil { log.Printf("attached with mode %q", mode.name) break } log.Printf("failed to attach with mode %q: %v", mode.name, err) } if attached == nil { return nil, nil, fmt.Errorf("failed to attach program") } return attached, func() { attached.Close() }, nil } func waitForever() { log.Printf("Successfully attached! Press CTRL-C to quit and remove the program from the device.") for { unix.Pause() } } golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/cmd_state_autogen.go000066400000000000000000000001461465435605700253560ustar00rootroot00000000000000// automatically generated by stateify. //go:build amd64 || arm64 // +build amd64 arm64 package cmd golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/drop.go000066400000000000000000000034741465435605700226440ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" _ "embed" "log" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/flag" ) //go:embed bpf/drop_ebpf.o var dropProgram []byte // DropCommand is a subcommand for dropping packets. type DropCommand struct { device string deviceIndex int } // Name implements subcommands.Command.Name. func (*DropCommand) Name() string { return "drop" } // Synopsis implements subcommands.Command.Synopsis. 
func (*DropCommand) Synopsis() string { return "Drop all packets to the kernel network stack." } // Usage implements subcommands.Command.Usage. func (*DropCommand) Usage() string { return "drop -device or -devidx " } // SetFlags implements subcommands.Command.SetFlags. func (pc *DropCommand) SetFlags(fs *flag.FlagSet) { fs.StringVar(&pc.device, "device", "", "which device to attach to") fs.IntVar(&pc.deviceIndex, "devidx", 0, "which device to attach to") } // Execute implements subcommands.Command.Execute. func (pc *DropCommand) Execute(context.Context, *flag.FlagSet, ...any) subcommands.ExitStatus { if err := runBasicProgram(dropProgram, pc.device, pc.deviceIndex); err != nil { log.Printf("%v", err) return subcommands.ExitFailure } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/pass.go000066400000000000000000000035271465435605700226450ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" _ "embed" "log" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/flag" ) //go:embed bpf/pass_ebpf.o var passProgram []byte // PassCommand is a subcommand for passing packets to the kernel network stack. type PassCommand struct { device string deviceIndex int } // Name implements subcommands.Command.Name. func (*PassCommand) Name() string { return "pass" } // Synopsis implements subcommands.Command.Synopsis. func (*PassCommand) Synopsis() string { return "Pass all packets to the kernel network stack." } // Usage implements subcommands.Command.Usage. func (*PassCommand) Usage() string { return "pass -device or -devidx " } // SetFlags implements subcommands.Command.SetFlags. func (pc *PassCommand) SetFlags(fs *flag.FlagSet) { fs.StringVar(&pc.device, "device", "", "which device to attach to") fs.IntVar(&pc.deviceIndex, "devidx", 0, "which device to attach to") } // Execute implements subcommands.Command.Execute. func (pc *PassCommand) Execute(context.Context, *flag.FlagSet, ...any) subcommands.ExitStatus { if err := runBasicProgram(passProgram, pc.device, pc.deviceIndex); err != nil { log.Printf("%v", err) return subcommands.ExitFailure } return subcommands.ExitSuccess } golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/redirect_host.go000066400000000000000000000161411465435605700245310ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
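// All of the commands in this package follow the same subcommands.Command
// pattern. As a hedged sketch (the loader's main package is not part of this
// file, so the wiring below is illustrative only), a binary could register
// and run them roughly like:
//
//	func main() {
//		subcommands.Register(subcommands.HelpCommand(), "")
//		subcommands.Register(new(cmd.DropCommand), "")
//		subcommands.Register(new(cmd.PassCommand), "")
//		subcommands.Register(new(cmd.TcpdumpCommand), "")
//		subcommands.Register(new(cmd.RedirectHostCommand), "")
//		subcommands.Register(new(cmd.TunnelCommand), "")
//		flag.Parse()
//		os.Exit(int(subcommands.Execute(context.Background())))
//	}
//
// after which an invocation such as `xdp_loader drop -device eth0` attaches
// the corresponding program to eth0.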
package cmd import ( "bytes" "context" _ "embed" "errors" "fmt" "log" "net" "os" "path/filepath" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/flag" ) // bpffsDirName is the path at which BPFFS is expected to be mounted. const bpffsDirPath = "/sys/fs/bpf/" // RedirectPinDir returns the directory to which eBPF objects will be pinned // when xdp_loader is run against iface. func RedirectPinDir(iface string) string { return filepath.Join(bpffsDirPath, iface) } // RedirectMapPath returns the path where the eBPF map will be pinned when // xdp_loader is run against iface. func RedirectMapPath(iface string) string { return filepath.Join(RedirectPinDir(iface), "redirect_ip_map") } // RedirectProgramPath returns the path where the eBPF program will be pinned // when xdp_loader is run against iface. func RedirectProgramPath(iface string) string { return filepath.Join(RedirectPinDir(iface), "redirect_program") } // RedirectLinkPath returns the path where the eBPF link will be pinned when // xdp_loader is run against iface. func RedirectLinkPath(iface string) string { return filepath.Join(RedirectPinDir(iface), "redirect_link") } //go:embed bpf/redirect_host_ebpf.o var redirectProgram []byte // RedirectHostCommand is a subcommand for redirecting incoming packets based // on a pinned eBPF map. It redirects all non-SSH traffic to a single AF_XDP // socket. type RedirectHostCommand struct { device string deviceIndex int unpin bool } // Name implements subcommands.Command.Name. func (*RedirectHostCommand) Name() string { return "redirect" } // Synopsis implements subcommands.Command.Synopsis. func (*RedirectHostCommand) Synopsis() string { return "Redirect incoming packets to an AF_XDP socket. Pins eBPF objects in /sys/fs/bpf//." } // Usage implements subcommands.Command.Usage. func (*RedirectHostCommand) Usage() string { return "redirect {-device | -device-idx } [--unpin]" } // SetFlags implements subcommands.Command.SetFlags. func (rc *RedirectHostCommand) SetFlags(fs *flag.FlagSet) { fs.StringVar(&rc.device, "device", "", "which device to attach to") fs.IntVar(&rc.deviceIndex, "device-idx", 0, "which device to attach to") fs.BoolVar(&rc.unpin, "unpin", false, "unpin the map and program instead of pinning new ones; useful to reset state") } // Execute implements subcommands.Command.Execute. func (rc *RedirectHostCommand) Execute(context.Context, *flag.FlagSet, ...any) subcommands.ExitStatus { if err := rc.execute(); err != nil { fmt.Printf("%v\n", err) return subcommands.ExitFailure } return subcommands.ExitSuccess } func (rc *RedirectHostCommand) execute() error { iface, err := getIface(rc.device, rc.deviceIndex) if err != nil { return fmt.Errorf("%v", err) } return installProgramAndMap(installProgramAndMapOpts{ program: redirectProgram, iface: iface, unpin: rc.unpin, pinDir: RedirectPinDir(iface.Name), mapPath: RedirectMapPath(iface.Name), programPath: RedirectProgramPath(iface.Name), linkPath: RedirectLinkPath(iface.Name), }) } type installProgramAndMapOpts struct { program []byte iface *net.Interface unpin bool pinDir string mapPath string programPath string linkPath string } func installProgramAndMap(opts installProgramAndMapOpts) error { // User just wants to unpin things. if opts.unpin { return unpin(opts.mapPath, opts.programPath, opts.linkPath) } // Load into the kernel. 
spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(opts.program)) if err != nil { return fmt.Errorf("failed to load spec: %v", err) } var objects struct { Program *ebpf.Program `ebpf:"xdp_prog"` SockMap *ebpf.Map `ebpf:"sock_map"` } if err := spec.LoadAndAssign(&objects, nil); err != nil { return fmt.Errorf("failed to load program: %v", err) } defer func() { if err := objects.Program.Close(); err != nil { log.Printf("failed to close program: %v", err) } if err := objects.SockMap.Close(); err != nil { log.Printf("failed to close sock map: %v", err) } }() attachedLink, cleanup, err := attach(objects.Program, opts.iface) if err != nil { return fmt.Errorf("failed to attach: %v", err) } defer cleanup() // Create directory /sys/fs/bpf//. if err := os.Mkdir(opts.pinDir, 0700); err != nil && !os.IsExist(err) { return fmt.Errorf("failed to create directory for pinning at %s: %v", opts.pinDir, err) } // Pin the map at /sys/fs/bpf//ip_map. if err := objects.SockMap.Pin(opts.mapPath); err != nil { return fmt.Errorf("failed to pin map at %s", opts.mapPath) } log.Printf("Pinned map at %s", opts.mapPath) // Pin the program at /sys/fs/bpf//program. if err := objects.Program.Pin(opts.programPath); err != nil { return fmt.Errorf("failed to pin program at %s", opts.programPath) } log.Printf("Pinned program at %s", opts.programPath) // Make everything persistent by pinning the link. Otherwise, the XDP // program would detach when this process exits. if err := attachedLink.Pin(opts.linkPath); err != nil { return fmt.Errorf("failed to pin link at %s", opts.linkPath) } log.Printf("Pinned link at %s", opts.linkPath) return nil } func unpin(mapPath, programPath, linkPath string) error { // Try to unpin both the map and program even if only one is found. mapErr := func() error { pinnedMap, err := ebpf.LoadPinnedMap(mapPath, nil) if err != nil { return fmt.Errorf("failed to load pinned map at %s for unpinning: %v", mapPath, err) } if err := pinnedMap.Unpin(); err != nil { return fmt.Errorf("failed to unpin map %s: %v", mapPath, err) } log.Printf("Unpinned map at %s", mapPath) return nil }() programErr := func() error { pinnedProgram, err := ebpf.LoadPinnedProgram(programPath, nil) if err != nil { return fmt.Errorf("failed to load pinned program at %s for unpinning: %v", programPath, err) } if err := pinnedProgram.Unpin(); err != nil { return fmt.Errorf("failed to unpin program %s: %v", programPath, err) } log.Printf("Unpinned program at %s", programPath) return nil }() linkErr := func() error { pinnedLink, err := link.LoadPinnedLink(linkPath, nil) if err != nil { return fmt.Errorf("failed to load pinned link at %s for unpinning: %v", linkPath, err) } if err := pinnedLink.Unpin(); err != nil { return fmt.Errorf("failed to unpin link %s: %v", linkPath, err) } log.Printf("Unpinned link at %s", linkPath) return nil }() return errors.Join(mapErr, programErr, linkErr) } golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/tcpdump.go000066400000000000000000000117741465435605700233560ustar00rootroot00000000000000// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "bytes" "context" _ "embed" "errors" "fmt" "log" "github.com/cilium/ebpf" "github.com/google/subcommands" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/xdp" "gvisor.dev/gvisor/runsc/flag" ) //go:embed bpf/tcpdump_ebpf.o var tcpdumpProgram []byte // TcpdumpCommand is a subcommand for capturing incoming packets. type TcpdumpCommand struct { device string deviceIndex int } // Name implements subcommands.Command.Name. func (*TcpdumpCommand) Name() string { return "tcpdump" } // Synopsis implements subcommands.Command.Synopsis. func (*TcpdumpCommand) Synopsis() string { return "Run tcpdump-like program that blocks incoming packets." } // Usage implements subcommands.Command.Usage. func (*TcpdumpCommand) Usage() string { return "tcpdump -device or -devidx " } // SetFlags implements subcommands.Command.SetFlags. func (pc *TcpdumpCommand) SetFlags(fs *flag.FlagSet) { fs.StringVar(&pc.device, "device", "", "which device to attach to") fs.IntVar(&pc.deviceIndex, "devidx", 0, "which device to attach to") } // Execute implements subcommands.Command.Execute. func (pc *TcpdumpCommand) Execute(context.Context, *flag.FlagSet, ...any) subcommands.ExitStatus { if err := pc.execute(); err != nil { fmt.Printf("%v", err) return subcommands.ExitFailure } return subcommands.ExitSuccess } func (pc *TcpdumpCommand) execute() error { iface, err := getIface(pc.device, pc.deviceIndex) if err != nil { return fmt.Errorf("%v", err) } // Load into the kernel. spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(tcpdumpProgram)) if err != nil { return fmt.Errorf("failed to load spec: %v", err) } var objects struct { Program *ebpf.Program `ebpf:"xdp_prog"` SockMap *ebpf.Map `ebpf:"sock_map"` } if err := spec.LoadAndAssign(&objects, nil); err != nil { return fmt.Errorf("failed to load program: %v", err) } defer func() { if err := objects.Program.Close(); err != nil { log.Printf("failed to close program: %v", err) } if err := objects.SockMap.Close(); err != nil { log.Printf("failed to close sock map: %v", err) } }() _, cleanup, err := attach(objects.Program, iface) if err != nil { return fmt.Errorf("failed to attach: %v", err) } defer cleanup() controlBlock, err := xdp.New( uint32(iface.Index), 0 /* queueID */, xdp.DefaultOpts()) if err != nil { return fmt.Errorf("failed to create socket: %v", err) } // Insert our AF_XDP socket into the BPF map that dictates where // packets are redirected to. key := uint32(0) val := controlBlock.UMEM.SockFD() if err := objects.SockMap.Update(&key, &val, 0 /* flags */); err != nil { return fmt.Errorf("failed to insert socket into BPF map: %v", err) } log.Printf("updated key %d to value %d", key, val) // Put as many UMEM buffers into the fill queue as possible. controlBlock.UMEM.Lock() controlBlock.Fill.FillAll(&controlBlock.UMEM) controlBlock.UMEM.Unlock() go func() { controlBlock.UMEM.Lock() defer controlBlock.UMEM.Unlock() for { pfds := []unix.PollFd{{Fd: int32(controlBlock.UMEM.SockFD()), Events: unix.POLLIN}} _, err := unix.Poll(pfds, -1) if err != nil { if errors.Is(err, unix.EINTR) { continue } panic(fmt.Sprintf("poll failed: %v", err)) } // How many packets did we get? nReceived, rxIndex := controlBlock.RX.Peek() if nReceived == 0 { continue } // Keep the fill queue full. 
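// (Explanatory note at this step: the surrounding loop implements the AF_XDP
// ring dance. The fill ring lends free UMEM frames to the kernel; the kernel
// hands received frames back through the RX ring; FreeFrame below returns
// each frame to the free list once it has been logged; and Release gives the
// consumed RX descriptors back to the kernel. Keeping the fill ring topped up
// is what keeps packets arriving.)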
controlBlock.Fill.FillAll(&controlBlock.UMEM) // Read packets one-by-one and log them. for i := uint32(0); i < nReceived; i++ { // Wrap the packet in a PacketBuffer. descriptor := controlBlock.RX.Get(rxIndex + i) data := controlBlock.UMEM.Get(descriptor) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: buffer.MakeWithData(data[header.EthernetMinimumSize:]), }) sniffer.LogPacket("", sniffer.DirectionRecv, // XDP operates only on ingress. header.Ethernet(data).Type(), pkt) // NOTE: the address is always 256 bytes offset // from a page boundary. The kernel masks the // address to the frame size, so this isn't a // problem. // // Note that this limits MTU to 4096-256 bytes. controlBlock.UMEM.FreeFrame(descriptor.Addr) } controlBlock.RX.Release(nReceived) } }() waitForever() return nil } golang-gvisor-gvisor-0.0~20240729.0/tools/xdp/cmd/tunnel.go000066400000000000000000000104121465435605700231730ustar00rootroot00000000000000// Copyright 2023 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cmd import ( "context" _ "embed" "fmt" "path/filepath" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/flag" ) // TunnelPinDir returns the directory to which eBPF objects will be pinned when // xdp_loader is run against iface. func TunnelPinDir(iface string) string { return filepath.Join(bpffsDirPath, iface) } // TunnelHostMapPath returns the path where the eBPF map will be pinned when // xdp_loader is run against iface. func TunnelHostMapPath(iface string) string { return filepath.Join(TunnelPinDir(iface), "tunnel_host_map") } // TunnelHostProgramPath returns the path where the eBPF program will be pinned // when xdp_loader is run against iface. func TunnelHostProgramPath(iface string) string { return filepath.Join(TunnelPinDir(iface), "tunnel_host_program") } // TunnelHostLinkPath returns the path where the eBPF link will be pinned when // xdp_loader is run against iface. func TunnelHostLinkPath(iface string) string { return filepath.Join(TunnelPinDir(iface), "tunnel_host_link") } // TunnelVethMapPath returns the path where the eBPF map should be pinned when // xdp_loader is run against iface. func TunnelVethMapPath(iface string) string { return filepath.Join(TunnelPinDir(iface), "tunnel_veth_map") } // TunnelVethProgramPath returns the path where the eBPF program should be pinned // when xdp_loader is run against iface. func TunnelVethProgramPath(iface string) string { return filepath.Join(TunnelPinDir(iface), "tunnel_veth_program") } // TunnelVethLinkPath returns the path where the eBPF link should be pinned when // xdp_loader is run against iface. func TunnelVethLinkPath(iface string) string { return filepath.Join(TunnelPinDir(iface), "tunnel_veth_link") } //go:embed bpf/tunnel_host_ebpf.o var tunnelHostProgram []byte // TunnelCommand is a subcommand for tunneling traffic between two NICs. It is // intended as a fast path between the host NIC and the veth of a container. // // SSH traffic is not tunneled. It is passed through to the Linux network stack. 
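//
// For example, an illustrative invocation `xdp_loader tunnel -device eth0`
// attaches the host-side program to eth0 and, per the path helpers above,
// pins its objects at /sys/fs/bpf/eth0/tunnel_host_map,
// /sys/fs/bpf/eth0/tunnel_host_program and /sys/fs/bpf/eth0/tunnel_host_link;
// --unpin removes them again.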
type TunnelCommand struct { device string deviceIndex int unpin bool } // Name implements subcommands.Command.Name. func (*TunnelCommand) Name() string { return "tunnel" } // Synopsis implements subcommands.Command.Synopsis. func (*TunnelCommand) Synopsis() string { return "Tunnel packets between two interfaces using AF_XDP. Pins eBPF objects in /sys/fs/bpf//." } // Usage implements subcommands.Command.Usage. func (*TunnelCommand) Usage() string { return "tunnel {-device | -device-idx } [--unpin]" } // SetFlags implements subcommands.Command.SetFlags. func (tn *TunnelCommand) SetFlags(fs *flag.FlagSet) { fs.StringVar(&tn.device, "device", "", "which host device to attach to") fs.IntVar(&tn.deviceIndex, "device-idx", 0, "which host device to attach to") fs.BoolVar(&tn.unpin, "unpin", false, "unpin the map and program instead of pinning new ones; useful to reset state") } // Execute implements subcommands.Command.Execute. func (tn *TunnelCommand) Execute(context.Context, *flag.FlagSet, ...any) subcommands.ExitStatus { if err := tn.execute(); err != nil { fmt.Printf("%v\n", err) return subcommands.ExitFailure } return subcommands.ExitSuccess } func (tn *TunnelCommand) execute() error { iface, err := getIface(tn.device, tn.deviceIndex) if err != nil { return fmt.Errorf("failed to get host iface: %v", err) } return installProgramAndMap(installProgramAndMapOpts{ program: tunnelHostProgram, iface: iface, unpin: tn.unpin, pinDir: RedirectPinDir(iface.Name), mapPath: TunnelHostMapPath(iface.Name), programPath: TunnelHostProgramPath(iface.Name), linkPath: TunnelHostLinkPath(iface.Name), }) } golang-gvisor-gvisor-0.0~20240729.0/vdso/000077500000000000000000000000001465435605700176165ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/vdso/BUILD000066400000000000000000000042361465435605700204050ustar00rootroot00000000000000# Description: # This VDSO is a shared library that provides the same interfaces as the # normal system VDSO (time, gettimeofday, clock_gettimeofday) but which uses # timekeeping parameters managed by the sandbox kernel. # Placeholder: load py_test load("//tools:arch.bzl", "select_arch") load("//tools:defs.bzl", "cc_flags_supplier", "cc_toolchain", "vdso_linker_option") package( default_applicable_licenses = ["//:license"], licenses = ["notice"], ) exports_files(["check_vdso.py"]) genrule( name = "vdso", srcs = [ "barrier.h", "compiler.h", "cycle_clock.h", "seqlock.h", "syscalls.h", "vdso.cc", "vdso_amd64.lds", "vdso_arm64.lds", "vdso_time.h", "vdso_time.cc", ], outs = [ "vdso.so", ], cmd = "$(CC) $(CC_FLAGS) " + "-I. " + "-O2 " + "-std=c++11 " + "-fPIC " + "-fno-sanitize=all " + # Some toolchains enable stack protector by default. Disable it, the # VDSO has no hooks to handle failures. 
"-fno-stack-protector " + vdso_linker_option + "-shared " + "-nostdlib " + "-Wl,-soname=linux-vdso.so.1 " + "-Wl,--hash-style=sysv " + "-Wl,--no-undefined " + "-Wl,-Bsymbolic " + "-Wl,-z,max-page-size=4096 " + "-Wl,-z,common-page-size=4096 " + select_arch( amd64 = "-Wl,-T$(location vdso_amd64.lds) ", arm64 = "-Wl,-T$(location vdso_arm64.lds) ", no_match_error = "unsupported architecture", ) + "-o $(location vdso.so) " + "$(location vdso.cc) " + "$(location vdso_time.cc)", features = ["-pie"], toolchains = [ cc_toolchain, ":no_pie_cc_flags", ], visibility = ["//:sandbox"], ) cc_flags_supplier( name = "no_pie_cc_flags", features = ["-pie"], ) py_test( name = "vdso_test", srcs = ["check_vdso.py"], args = [ "--check-data", "--vdso=$(location :vdso)", ], data = [":vdso"], main = "check_vdso.py", python_version = "PY3", ) golang-gvisor-gvisor-0.0~20240729.0/vdso/barrier.h000066400000000000000000000024671465435605700214260ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef VDSO_BARRIER_H_ #define VDSO_BARRIER_H_ namespace vdso { // Compiler Optimization barrier. inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); } #if __x86_64__ inline void memory_barrier(void) { __asm__ __volatile__("mfence" ::: "memory"); } inline void read_barrier(void) { barrier(); } inline void write_barrier(void) { barrier(); } #elif __aarch64__ inline void memory_barrier(void) { __asm__ __volatile__("dmb ish" ::: "memory"); } inline void read_barrier(void) { __asm__ __volatile__("dmb ishld" ::: "memory"); } inline void write_barrier(void) { __asm__ __volatile__("dmb ishst" ::: "memory"); } #else #error "unsupported architecture" #endif } // namespace vdso #endif // VDSO_BARRIER_H_ golang-gvisor-gvisor-0.0~20240729.0/vdso/check_vdso.py000066400000000000000000000140701465435605700223020ustar00rootroot00000000000000# Copyright 2018 The gVisor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Verify VDSO ELF does not contain any relocations and is directly mmappable. """ import argparse import logging import re import subprocess PAGE_SIZE = 4096 def PageRoundDown(addr): """Rounds down to the nearest page. Args: addr: An address. Returns: The address rounded down to the nearest page. """ return addr & ~(PAGE_SIZE - 1) def Fatal(*args, **kwargs): """Logs a critical message and exits with code 1. Args: *args: Args to pass to logging.critical. **kwargs: Keyword args to pass to logging.critical. 
""" logging.critical(*args, **kwargs) exit(1) def CheckSegments(vdso_path): """Verifies layout of PT_LOAD segments. PT_LOAD segments must be laid out such that the ELF is directly mmappable. Specifically, check that: * PT_LOAD file offsets are equivalent to the memory offset from the first segment. * No extra zeroed space (memsz) is required. * PT_LOAD segments are in order (required for any ELF). * No two PT_LOAD segments share part of the same page. The readelf line format looks like: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align LOAD 0x000000 0xffffffffff700000 0xffffffffff700000 0x000e68 0x000e68 R E 0x1000 Args: vdso_path: Path to VDSO binary. """ output = subprocess.check_output(["readelf", "-lW", vdso_path]).decode() lines = output.split("\n") segments = [] for line in lines: if not line.startswith(" LOAD"): continue components = line.split() segments.append({ "offset": int(components[1], 16), "addr": int(components[2], 16), "filesz": int(components[4], 16), "memsz": int(components[5], 16), }) if not segments: Fatal("No PT_LOAD segments in VDSO") first = segments[0] if first["offset"] != 0: Fatal("First PT_LOAD segment has non-zero file offset: %s", first) for i, segment in enumerate(segments): memoff = segment["addr"] - first["addr"] if memoff != segment["offset"]: Fatal("PT_LOAD segment has different memory and file offsets: %s", segments) if segment["memsz"] != segment["filesz"]: Fatal("PT_LOAD segment memsz != filesz: %s", segment) if i > 0: last_end = segments[i-1]["addr"] + segments[i-1]["memsz"] if segment["addr"] < last_end: Fatal("PT_LOAD segments out of order") last_page = PageRoundDown(last_end) start_page = PageRoundDown(segment["addr"]) if last_page >= start_page: Fatal("PT_LOAD segments share a page: %s and %s", segment, segments[i - 1]) # Matches the section name in readelf -SW output. _SECTION_NAME_RE = re.compile(r"""^\s+\[\ ?\d+\]\s+ (?P\.\S+)\s+ (?P\S+)\s+ (?P[0-9a-f]+)\s+ (?P[0-9a-f]+)\s+ (?P[0-9a-f]+)""", re.VERBOSE) def CheckData(vdso_path): """Verifies the VDSO contains no .data or .bss sections. The readelf line format looks like: There are 15 section headers, starting at offset 0x15f0: Section Headers: [Nr] Name Type Address Off Size ES Flg Lk Inf Al [ 0] NULL 0000000000000000 000000 000000 00 0 0 0 [ 1] .hash HASH ffffffffff700120 000120 000040 04 A 2 0 8 [ 2] .dynsym DYNSYM ffffffffff700160 000160 000108 18 A 3 1 8 ... [13] .strtab STRTAB 0000000000000000 001448 000123 00 0 0 1 [14] .shstrtab STRTAB 0000000000000000 00156b 000083 00 0 0 1 Key to Flags: W (write), A (alloc), X (execute), M (merge), S (strings), I (info), L (link order), O (extra OS processing required), G (group), T (TLS), C (compressed), x (unknown), o (OS specific), E (exclude), l (large), p (processor specific) Args: vdso_path: Path to VDSO binary. """ output = subprocess.check_output(["readelf", "-SW", vdso_path]).decode() lines = output.split("\n") found_text = False for line in lines: m = re.search(_SECTION_NAME_RE, line) if not m: continue if not line.startswith(" ["): continue name = m.group("name") size = int(m.group("size"), 16) if name == ".text" and size != 0: found_text = True # Clang will typically omit these sections entirely; gcc will include them # but with size 0. if name.startswith(".data") and size != 0: Fatal("VDSO contains non-empty .data section:\n%s" % output) if name.startswith(".bss") and size != 0: Fatal("VDSO contains non-empty .bss section:\n%s" % output) if not found_text: Fatal("VDSO contains no/empty .text section? 
Bad parsing?:\n%s" % output) def CheckRelocs(vdso_path): """Verifies that the VDSO includes no relocations. Args: vdso_path: Path to VDSO binary. """ output = subprocess.check_output(["readelf", "-r", vdso_path]).decode() if output.strip() != "There are no relocations in this file.": Fatal("VDSO contains relocations: %s", output) def main(): parser = argparse.ArgumentParser(description="Verify VDSO ELF.") parser.add_argument("--vdso", required=True, help="Path to VDSO ELF") parser.add_argument( "--check-data", action="store_true", help="Check that the ELF contains no .data or .bss sections") args = parser.parse_args() CheckSegments(args.vdso) CheckRelocs(args.vdso) if args.check_data: CheckData(args.vdso) if __name__ == "__main__": main() golang-gvisor-gvisor-0.0~20240729.0/vdso/compiler.h000066400000000000000000000016401465435605700216020ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef VDSO_COMPILER_H_ #define VDSO_COMPILER_H_ #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #ifndef __section #define __section(S) __attribute__((__section__(#S))) #endif #ifndef __aligned #define __aligned(N) __attribute__((__aligned__(N))) #endif #endif // VDSO_COMPILER_H_ golang-gvisor-gvisor-0.0~20240729.0/vdso/cycle_clock.h000066400000000000000000000025721465435605700222470ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef VDSO_CYCLE_CLOCK_H_ #define VDSO_CYCLE_CLOCK_H_ #include #include "vdso/barrier.h" namespace vdso { #if __x86_64__ // TODO(b/74613497): The appropriate barrier instruction to use with rdtsc on // x86_64 depends on the vendor. Intel processors can use lfence but AMD may // need mfence, depending on MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT. static inline uint64_t cycle_clock(void) { uint32_t lo, hi; asm volatile("lfence" : : : "memory"); asm volatile("rdtsc" : "=a"(lo), "=d"(hi)); return ((uint64_t)hi << 32) | lo; } #elif __aarch64__ static inline uint64_t cycle_clock(void) { uint64_t val; asm volatile("mrs %0, CNTVCT_EL0" : "=r"(val)::"memory"); return val; } #else #error "unsupported architecture" #endif } // namespace vdso #endif // VDSO_CYCLE_CLOCK_H_ golang-gvisor-gvisor-0.0~20240729.0/vdso/seqlock.h000066400000000000000000000021041465435605700214250ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Low level raw interfaces to the sequence counter used by the VDSO. #ifndef VDSO_SEQLOCK_H_ #define VDSO_SEQLOCK_H_ #include #include "vdso/barrier.h" #include "vdso/compiler.h" namespace vdso { inline int32_t read_seqcount_begin(const uint64_t* s) { uint64_t seq = *s; read_barrier(); return seq & ~1; } inline int read_seqcount_retry(const uint64_t* s, uint64_t seq) { read_barrier(); return unlikely(*s != seq); } } // namespace vdso #endif // VDSO_SEQLOCK_H_ golang-gvisor-gvisor-0.0~20240729.0/vdso/syscalls.h000066400000000000000000000053171465435605700216320ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // System call support for the VDSO. // // Provides fallback system call interfaces for getcpu() // and clock_gettime(). #ifndef VDSO_SYSCALLS_H_ #define VDSO_SYSCALLS_H_ #include #include #include #include #include #define __stringify_1(x...) #x #define __stringify(x...) 
__stringify_1(x) namespace vdso { #if __x86_64__ struct getcpu_cache; static inline int sys_clock_gettime(clockid_t clock, struct timespec* ts) { int num = __NR_clock_gettime; asm volatile("syscall\n" : "+a"(num) : "D"(clock), "S"(ts) : "rcx", "r11", "memory"); return num; } static inline int sys_getcpu(unsigned* cpu, unsigned* node, struct getcpu_cache* cache) { int num = __NR_getcpu; asm volatile("syscall\n" : "+a"(num) : "D"(cpu), "S"(node), "d"(cache) : "rcx", "r11", "memory"); return num; } static inline void sys_rt_sigreturn(void) { asm volatile("movl $" __stringify(__NR_rt_sigreturn)", %eax \n" "syscall \n"); } #elif __aarch64__ static inline int sys_clock_gettime(clockid_t _clkid, struct timespec* _ts) { register struct timespec* ts asm("x1") = _ts; register clockid_t clkid asm("x0") = _clkid; register long ret asm("x0"); register long nr asm("x8") = __NR_clock_gettime; asm volatile("svc #0\n" : "=r"(ret) : "r"(clkid), "r"(ts), "r"(nr) : "memory"); return ret; } static inline int sys_clock_getres(clockid_t _clkid, struct timespec* _ts) { register struct timespec* ts asm("x1") = _ts; register clockid_t clkid asm("x0") = _clkid; register long ret asm("x0"); register long nr asm("x8") = __NR_clock_getres; asm volatile("svc #0\n" : "=r"(ret) : "r"(clkid), "r"(ts), "r"(nr) : "memory"); return ret; } static inline void sys_rt_sigreturn(void) { asm volatile("mov x8, #" __stringify(__NR_rt_sigreturn)" \n" "svc #0 \n"); } #else #error "unsupported architecture" #endif } // namespace vdso #endif // VDSO_SYSCALLS_H_ golang-gvisor-gvisor-0.0~20240729.0/vdso/vdso.cc000066400000000000000000000106751465435605700211110ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // This is the VDSO for sandboxed binaries. This file just contains the entry // points to the VDSO. All of the real work is done in vdso_time.cc #define _DEFAULT_SOURCE // ensure glibc provides struct timezone. #include #include #include "vdso/syscalls.h" #include "vdso/vdso_time.h" namespace vdso { namespace { int __common_clock_gettime(clockid_t clock, struct timespec* ts) { int ret; switch (clock) { case CLOCK_REALTIME_COARSE: // Fallthrough, CLOCK_REALTIME_COARSE is an alias for CLOCK_REALTIME case CLOCK_REALTIME: ret = ClockRealtime(ts); break; case CLOCK_BOOTTIME: // Fallthrough, CLOCK_BOOTTIME is an alias for CLOCK_MONOTONIC case CLOCK_MONOTONIC_RAW: // Fallthrough, CLOCK_MONOTONIC_RAW is an alias for CLOCK_MONOTONIC case CLOCK_MONOTONIC_COARSE: // Fallthrough, CLOCK_MONOTONIC_COARSE is an alias for CLOCK_MONOTONIC case CLOCK_MONOTONIC: ret = ClockMonotonic(ts); break; default: ret = sys_clock_gettime(clock, ts); break; } return ret; } int __common_gettimeofday(struct timeval* tv, struct timezone* tz) { if (tv) { struct timespec ts; int ret = ClockRealtime(&ts); if (ret) { return ret; } tv->tv_sec = ts.tv_sec; tv->tv_usec = ts.tv_nsec / 1000; } // Nobody should be calling gettimeofday() with a non-NULL // timezone pointer. If they do then they will get zeros. 
if (tz) { tz->tz_minuteswest = 0; tz->tz_dsttime = 0; } return 0; } } // namespace // __kernel_rt_sigreturn() implements rt_sigreturn() extern "C" void __kernel_rt_sigreturn(unsigned long unused) { // No optimizations yet, just make the real system call. sys_rt_sigreturn(); } #if __x86_64__ // __vdso_clock_gettime() implements clock_gettime() extern "C" int __vdso_clock_gettime(clockid_t clock, struct timespec* ts) { return __common_clock_gettime(clock, ts); } extern "C" int clock_gettime(clockid_t clock, struct timespec* ts) __attribute__((weak, alias("__vdso_clock_gettime"))); // __vdso_gettimeofday() implements gettimeofday() extern "C" int __vdso_gettimeofday(struct timeval* tv, struct timezone* tz) { return __common_gettimeofday(tv, tz); } extern "C" int gettimeofday(struct timeval* tv, struct timezone* tz) __attribute__((weak, alias("__vdso_gettimeofday"))); // __vdso_time() implements time() extern "C" time_t __vdso_time(time_t* t) { struct timespec ts; ClockRealtime(&ts); if (t) { *t = ts.tv_sec; } return ts.tv_sec; } extern "C" time_t time(time_t* t) __attribute__((weak, alias("__vdso_time"))); // __vdso_getcpu() implements getcpu() extern "C" long __vdso_getcpu(unsigned* cpu, unsigned* node, struct getcpu_cache* cache) { // No optimizations yet, just make the real system call. return sys_getcpu(cpu, node, cache); } extern "C" long getcpu(unsigned* cpu, unsigned* node, struct getcpu_cache* cache) __attribute__((weak, alias("__vdso_getcpu"))); #elif __aarch64__ // __kernel_clock_gettime() implements clock_gettime() extern "C" int __kernel_clock_gettime(clockid_t clock, struct timespec* ts) { return __common_clock_gettime(clock, ts); } // __kernel_gettimeofday() implements gettimeofday() extern "C" int __kernel_gettimeofday(struct timeval* tv, struct timezone* tz) { return __common_gettimeofday(tv, tz); } // __kernel_clock_getres() implements clock_getres() extern "C" int __kernel_clock_getres(clockid_t clock, struct timespec* res) { int ret = 0; switch (clock) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: { if (res == nullptr) { return 0; } res->tv_sec = 0; res->tv_nsec = 1; break; } default: ret = sys_clock_getres(clock, res); break; } return ret; } #else #error "unsupported architecture" #endif } // namespace vdso golang-gvisor-gvisor-0.0~20240729.0/vdso/vdso_amd64.lds000066400000000000000000000063301465435605700222720ustar00rootroot00000000000000/* * Linker script for the VDSO. * * The VDSO is essentially a normal ELF shared library that is mapped into the * address space of the process that is going to use it. The address of the * VDSO is passed to the runtime linker in the AT_SYSINFO_EHDR entry of the aux * vector. * * There are, however, three ways in which the VDSO differs from a normal * shared library: * * - The runtime linker does not attempt to process any relocations for the * VDSO so it is the responsibility of whoever loads the VDSO into the * address space to do this if necessary. Because of this restriction we are * careful to ensure that the VDSO does not need to have any relocations * applied to it. * * - Although the VDSO is position independent and would normally be linked at * virtual address 0, the Linux kernel VDSO is actually linked at a non zero * virtual address and the code in the system runtime linker that handles the * VDSO expects this to be the case so we have to explicitly link this VDSO * at a non zero address. The actual address is arbitrary, but we use the * same one as the Linux kernel VDSO. 
* * - The VDSO will be directly mmapped by the sentry, rather than going through * a normal ELF loading process. The VDSO must be carefully constructed such * that the layout in the ELF file is identical to the layout in memory. */ VDSO_PRELINK = 0xffffffffff700000; SECTIONS { /* The parameter page is mapped just before the VDSO. */ _params = VDSO_PRELINK - 0x1000; . = VDSO_PRELINK + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr .eh_frame : { KEEP (*(.eh_frame)) } :text .dynamic : { *(.dynamic) } :text :dynamic .rodata : { *(.rodata*) } :text .altinstructions : { *(.altinstructions) } .altinstr_replacement : { *(.altinstr_replacement) } /* * TODO(gvisor.dev/issue/157): Remove this alignment? Then the VDSO would fit * in a single page. */ . = ALIGN(0x1000); .text : { *(.text*) } :text =0x90909090 /* * N.B. There is no data/bss section. This VDSO neither needs nor uses a data * section. We omit it entirely because some gcc/clang and gold/bfd version * combinations struggle to handle an empty data PHDR segment (internal * linker assertion failures result). * * If the VDSO does incorrectly include a data section, the linker will * include it in the text segment. check_vdso.py looks for this degenerate * case. */ } PHDRS { text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R | PF_X */ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ eh_frame_hdr PT_GNU_EH_FRAME; } /* * Define the symbols that are to be exported. */ VERSION { LINUX_2.6 { global: clock_gettime; __vdso_clock_gettime; gettimeofday; __vdso_gettimeofday; getcpu; __vdso_getcpu; time; __vdso_time; __kernel_rt_sigreturn; local: *; }; } golang-gvisor-gvisor-0.0~20240729.0/vdso/vdso_arm64.lds000066400000000000000000000063741465435605700223200ustar00rootroot00000000000000/* * Linker script for the VDSO. * * The VDSO is essentially a normal ELF shared library that is mapped into the * address space of the process that is going to use it. The address of the * VDSO is passed to the runtime linker in the AT_SYSINFO_EHDR entry of the aux * vector. * * There are, however, three ways in which the VDSO differs from a normal * shared library: * * - The runtime linker does not attempt to process any relocations for the * VDSO so it is the responsibility of whoever loads the VDSO into the * address space to do this if necessary. Because of this restriction we are * careful to ensure that the VDSO does not need to have any relocations * applied to it. * * - Although the VDSO is position independent and would normally be linked at * virtual address 0, the Linux kernel VDSO is actually linked at a non zero * virtual address and the code in the system runtime linker that handles the * VDSO expects this to be the case so we have to explicitly link this VDSO * at a non zero address. The actual address is arbitrary, but we use the * same one as the Linux kernel VDSO. * * - The VDSO will be directly mmapped by the sentry, rather than going through * a normal ELF loading process. The VDSO must be carefully constructed such * that the layout in the ELF file is identical to the layout in memory. */ VDSO_PRELINK = 0xffffffffff700000; OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") OUTPUT_ARCH(aarch64) SECTIONS { /* The parameter page is mapped just before the VDSO. */ _params = VDSO_PRELINK - 0x1000; . 
= VDSO_PRELINK + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr .eh_frame : { KEEP (*(.eh_frame)) } :text .dynamic : { *(.dynamic) } :text :dynamic .rodata : { *(.rodata*) } :text .altinstructions : { *(.altinstructions) } .altinstr_replacement : { *(.altinstr_replacement) } /* * TODO(gvisor.dev/issue/157): Remove this alignment? Then the VDSO would fit * in a single page. */ . = ALIGN(0x1000); .text : { *(.text*) } :text =0xd503201f /* * N.B. There is no data/bss section. This VDSO neither needs nor uses a data * section. We omit it entirely because some gcc/clang and gold/bfd version * combinations struggle to handle an empty data PHDR segment (internal * linker assertion failures result). * * If the VDSO does incorrectly include a data section, the linker will * include it in the text segment. check_vdso.py looks for this degenerate * case. */ } PHDRS { text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R | PF_X */ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ eh_frame_hdr PT_GNU_EH_FRAME; } /* * Define the symbols that are to be exported. */ VERSION { LINUX_2.6.39 { global: __kernel_clock_getres; __kernel_clock_gettime; __kernel_gettimeofday; __kernel_rt_sigreturn; local: *; }; } golang-gvisor-gvisor-0.0~20240729.0/vdso/vdso_time.cc000066400000000000000000000110561465435605700221210ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "vdso/vdso_time.h" #include <stdint.h> #include <sys/time.h> #include <time.h> #include "vdso/cycle_clock.h" #include "vdso/seqlock.h" #include "vdso/syscalls.h" // struct params defines the layout of the parameter page maintained by the // kernel (i.e., sentry). // // This is similar to the VVAR page maintained by the normal Linux kernel for // its VDSO, but it has a different layout. // // It must be kept in sync with VDSOParamPage in pkg/sentry/kernel/vdso.go. struct params { uint64_t seq_count; uint64_t monotonic_ready; int64_t monotonic_base_cycles; int64_t monotonic_base_ref; uint64_t monotonic_frequency; uint64_t realtime_ready; int64_t realtime_base_cycles; int64_t realtime_base_ref; uint64_t realtime_frequency; }; // Returns a pointer to the global parameter page. // // This page lives in the page just before the VDSO binary itself. The linker // defines _params as the page before the VDSO. // // Ideally, we'd simply declare _params as an extern struct params. // Unfortunately various combinations of old/new versions of gcc/clang and // gold/bfd struggle to generate references to such a global without generating // relocations. // // So instead, we use inline assembly with a construct that seems to have wide // compatibility across many toolchains.
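// --- Illustrative note, not part of the original source ---
// The "obvious" alternative ruled out by the comment above would look like:
//
//   extern struct params _params;                        // hypothetical
//   inline struct params* get_params() { return &_params; }
//
// In position-independent code, taking the address of an extern global like
// this tends to be emitted as a GOT load or another dynamic relocation, and
// nothing ever applies relocations to this VDSO. The inline-assembly variants
// below ("leaq _params(%rip)" on x86_64, "adr" on arm64) compute the address
// purely PC-relatively, so the reference needs no relocation at load time.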
#if __x86_64__ inline struct params* get_params() { struct params* p = nullptr; asm("leaq _params(%%rip), %0" : "=r"(p) : :); return p; } #elif __aarch64__ inline struct params* get_params() { struct params* p = nullptr; asm("adr %0, _params" : "=r"(p) : :); return p; } #else #error "unsupported architecture" #endif namespace vdso { const uint64_t kNsecsPerSec = 1000000000UL; inline struct timespec ns_to_timespec(uint64_t ns) { struct timespec ts; ts.tv_sec = ns / kNsecsPerSec; ts.tv_nsec = ns % kNsecsPerSec; return ts; } inline uint64_t cycles_to_ns(uint64_t frequency, uint64_t cycles) { uint64_t mult = (kNsecsPerSec << 32) / frequency; return ((unsigned __int128)cycles * mult) >> 32; } // ClockRealtime() is the VDSO implementation of clock_gettime(CLOCK_REALTIME). int ClockRealtime(struct timespec* ts) { struct params* params = get_params(); uint64_t seq; uint64_t ready; int64_t base_ref; int64_t base_cycles; uint64_t frequency; int64_t now_cycles; do { seq = read_seqcount_begin(¶ms->seq_count); ready = params->realtime_ready; base_ref = params->realtime_base_ref; base_cycles = params->realtime_base_cycles; frequency = params->realtime_frequency; now_cycles = cycle_clock(); } while (read_seqcount_retry(¶ms->seq_count, seq)); if (!ready) { // The sandbox kernel ensures that we won't compute a time later than this // once the params are ready. return sys_clock_gettime(CLOCK_REALTIME, ts); } int64_t delta_cycles = (now_cycles < base_cycles) ? 0 : now_cycles - base_cycles; int64_t now_ns = base_ref + cycles_to_ns(frequency, delta_cycles); *ts = ns_to_timespec(now_ns); return 0; } // ClockMonotonic() is the VDSO implementation of // clock_gettime(CLOCK_MONOTONIC). int ClockMonotonic(struct timespec* ts) { struct params* params = get_params(); uint64_t seq; uint64_t ready; int64_t base_ref; int64_t base_cycles; uint64_t frequency; int64_t now_cycles; do { seq = read_seqcount_begin(¶ms->seq_count); ready = params->monotonic_ready; base_ref = params->monotonic_base_ref; base_cycles = params->monotonic_base_cycles; frequency = params->monotonic_frequency; now_cycles = cycle_clock(); } while (read_seqcount_retry(¶ms->seq_count, seq)); if (!ready) { // The sandbox kernel ensures that we won't compute a time later than this // once the params are ready. return sys_clock_gettime(CLOCK_MONOTONIC, ts); } int64_t delta_cycles = (now_cycles < base_cycles) ? 0 : now_cycles - base_cycles; int64_t now_ns = base_ref + cycles_to_ns(frequency, delta_cycles); *ts = ns_to_timespec(now_ns); return 0; } } // namespace vdso golang-gvisor-gvisor-0.0~20240729.0/vdso/vdso_time.h000066400000000000000000000014641465435605700217650ustar00rootroot00000000000000// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
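// --- Illustrative sketch, not part of the original source ---
// cycles_to_ns() in vdso_time.cc above converts cycle counts using 32.32
// fixed-point arithmetic: mult = (10^9 << 32) / frequency, then
// ns = (cycles * mult) >> 32. A self-contained restatement with a couple of
// worked values (the frequencies here are made up for the example):
//
//   #include <cassert>
//   #include <cstdint>
//
//   static uint64_t example_cycles_to_ns(uint64_t frequency, uint64_t cycles) {
//     uint64_t mult = (1000000000ULL << 32) / frequency;
//     return ((unsigned __int128)cycles * mult) >> 32;
//   }
//
//   // At 1 GHz, mult == 1 << 32, so cycles map 1:1 to nanoseconds.
//   // At 2 GHz, mult == 1 << 31, so 4e9 cycles -> 2e9 ns (two seconds).
//   static void example_check() {
//     assert(example_cycles_to_ns(1000000000ULL, 12345ULL) == 12345ULL);
//     assert(example_cycles_to_ns(2000000000ULL, 4000000000ULL) == 2000000000ULL);
//   }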
#ifndef VDSO_VDSO_TIME_H_ #define VDSO_VDSO_TIME_H_ #include <time.h> namespace vdso { int ClockRealtime(struct timespec* ts); int ClockMonotonic(struct timespec* ts); } // namespace vdso #endif // VDSO_VDSO_TIME_H_ golang-gvisor-gvisor-0.0~20240729.0/webhook/000077500000000000000000000000001465435605700203015ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/webhook/main.go000066400000000000000000000013451465435605700215570ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Binary main serves a mutating Kubernetes webhook. package main import ( "gvisor.dev/gvisor/webhook/pkg/cli" ) func main() { cli.Main() } golang-gvisor-gvisor-0.0~20240729.0/webhook/pkg/000077500000000000000000000000001465435605700210625ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/webhook/pkg/cli/000077500000000000000000000000001465435605700216315ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/webhook/pkg/cli/cli.go000066400000000000000000000062631465435605700227360ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package cli provides a CLI interface for a mutating Kubernetes webhook. package cli import ( "flag" "fmt" "net" "net/http" "os" "strconv" "strings" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/webhook/pkg/injector" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" k8snet "k8s.io/apimachinery/pkg/util/net" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" ) var ( address = flag.String("address", "", "The ip address the admission webhook serves on. If unspecified, a public address is selected automatically.") port = flag.Int("port", 0, "The port the admission webhook serves on.") podLabels = flag.String("pod-namespace-labels", "", "A comma-separated namespace label selector, the admission webhook will only take effect on pods in selected namespaces, e.g. `label1,label2`.") ) // Main runs the webhook. func Main() { flag.Parse() if err := run(); err != nil { log.Warningf("%v", err) os.Exit(1) } } func run() error { log.Infof("Starting %s\n", injector.Name) // Create client config. cfg, err := rest.InClusterConfig() if err != nil { return fmt.Errorf("create in cluster config: %w", err) } // Create clientset.
clientset, err := kubernetes.NewForConfig(cfg) if err != nil { return fmt.Errorf("create kubernetes client: %w", err) } if err := injector.CreateConfiguration(clientset, parsePodLabels()); err != nil { return fmt.Errorf("create webhook configuration: %w", err) } if err := startWebhookHTTPS(clientset); err != nil { return fmt.Errorf("start webhook https server: %w", err) } return nil } func parsePodLabels() *metav1.LabelSelector { rv := &metav1.LabelSelector{} for _, s := range strings.Split(*podLabels, ",") { req := metav1.LabelSelectorRequirement{ Key: strings.TrimSpace(s), Operator: "Exists", } rv.MatchExpressions = append(rv.MatchExpressions, req) } return rv } func startWebhookHTTPS(clientset kubernetes.Interface) error { log.Infof("Starting HTTPS handler") defer log.Infof("Stopping HTTPS handler") if *address == "" { ip, err := k8snet.ChooseHostInterface() if err != nil { return fmt.Errorf("select ip address: %w", err) } *address = ip.String() } mux := http.NewServeMux() mux.Handle("/", http.HandlerFunc( func(w http.ResponseWriter, r *http.Request) { injector.Admit(w, r) })) server := &http.Server{ // Listen on all addresses. Addr: net.JoinHostPort(*address, strconv.Itoa(*port)), TLSConfig: injector.GetTLSConfig(), Handler: mux, } if err := server.ListenAndServeTLS("", ""); err != http.ErrServerClosed { return fmt.Errorf("start HTTPS handler: %w", err) } return nil } golang-gvisor-gvisor-0.0~20240729.0/webhook/pkg/cli/cli_state_autogen.go000066400000000000000000000000651465435605700256520ustar00rootroot00000000000000// automatically generated by stateify. package cli golang-gvisor-gvisor-0.0~20240729.0/webhook/pkg/injector/000077500000000000000000000000001465435605700226775ustar00rootroot00000000000000golang-gvisor-gvisor-0.0~20240729.0/webhook/pkg/injector/certs.go000066400000000000000000000023251465435605700243500ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package injector import ( "fmt" "io/ioutil" ) var ( caKey []byte caCert []byte serverKey []byte serverCert []byte ) func init() { var ( caKeyErr error caCertErr error serverKeyErr error serverCertErr error ) caKey, caKeyErr = ioutil.ReadFile("caKey.pem") caCert, caCertErr = ioutil.ReadFile("caCert.pem") serverKey, serverKeyErr = ioutil.ReadFile("serverKey.pem") serverCert, serverCertErr = ioutil.ReadFile("serverCert.pem") for _, err := range []error{caKeyErr, caCertErr, serverKeyErr, serverCertErr} { if err != nil { panic(fmt.Errorf("unable to create certificates: %v", err)) } } } golang-gvisor-gvisor-0.0~20240729.0/webhook/pkg/injector/injector_state_autogen.go000066400000000000000000000000721465435605700277640ustar00rootroot00000000000000// automatically generated by stateify. package injector golang-gvisor-gvisor-0.0~20240729.0/webhook/pkg/injector/webhook.go000066400000000000000000000150401465435605700246640ustar00rootroot00000000000000// Copyright 2020 The gVisor Authors. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package injector handles mutating webhook operations. package injector import ( "context" "crypto/tls" "encoding/json" "fmt" "net/http" "os" "github.com/mattbaird/jsonpatch" "gvisor.dev/gvisor/pkg/log" admv1beta1 "k8s.io/api/admission/v1beta1" admregv1beta1 "k8s.io/api/admissionregistration/v1beta1" v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kubeclientset "k8s.io/client-go/kubernetes" ) const ( // Name is the name of the admission webhook service. The admission // webhook must be exposed in the following service; this is mainly for // the server certificate. Name = "gvisor-injection-admission-webhook" // serviceNamespace is the namespace of the admission webhook service. serviceNamespace = "e2e" fullName = Name + "." + serviceNamespace + ".svc" ) // CreateConfiguration creates MutatingWebhookConfiguration and registers the // webhook admission controller with the kube-apiserver. The webhook will only // take effect on pods in the namespaces selected by `podNsSelector`. If `podNsSelector` // is empty, the webhook will take effect on all pods. func CreateConfiguration(clientset kubeclientset.Interface, selector *metav1.LabelSelector) error { fail := admregv1beta1.Fail config := &admregv1beta1.MutatingWebhookConfiguration{ ObjectMeta: metav1.ObjectMeta{ Name: Name, }, Webhooks: []admregv1beta1.MutatingWebhook{ { Name: fullName, ClientConfig: admregv1beta1.WebhookClientConfig{ Service: &admregv1beta1.ServiceReference{ Name: Name, Namespace: serviceNamespace, }, CABundle: caCert, }, Rules: []admregv1beta1.RuleWithOperations{ { Operations: []admregv1beta1.OperationType{ admregv1beta1.Create, }, Rule: admregv1beta1.Rule{ APIGroups: []string{"*"}, APIVersions: []string{"*"}, Resources: []string{"pods"}, }, }, }, FailurePolicy: &fail, NamespaceSelector: selector, }, }, } log.Infof("Creating MutatingWebhookConfiguration %q", config.Name) if _, err := clientset.AdmissionregistrationV1beta1().MutatingWebhookConfigurations().Create(context.TODO(), config, metav1.CreateOptions{}); err != nil { if !apierrors.IsAlreadyExists(err) { return fmt.Errorf("failed to create MutatingWebhookConfiguration %q: %s", config.Name, err) } log.Infof("MutatingWebhookConfiguration %q already exists; use the existing one", config.Name) } return nil } // GetTLSConfig retrieves the CA cert that signed the cert used by the webhook. func GetTLSConfig() *tls.Config { sc, err := tls.X509KeyPair(serverCert, serverKey) if err != nil { log.Warningf("Failed to generate X509 key pair: %v", err) os.Exit(1) } return &tls.Config{ Certificates: []tls.Certificate{sc}, } } // Admit performs admission checks and mutations on Pods. 
func Admit(writer http.ResponseWriter, req *http.Request) { review := &admv1beta1.AdmissionReview{} if err := json.NewDecoder(req.Body).Decode(review); err != nil { log.Infof("Failed with error (%v) to decode Admit request: %+v", err, *req) writer.WriteHeader(http.StatusBadRequest) return } log.Debugf("admitPod: %+v", review) var err error review.Response, err = admitPod(review.Request) if err != nil { log.Warningf("admitPod failed: %v", err) review.Response = &admv1beta1.AdmissionResponse{ Result: &metav1.Status{ Reason: metav1.StatusReasonInvalid, Message: err.Error(), }, } sendResponse(writer, review) return } log.Debugf("Processed admission review: %+v", review) sendResponse(writer, review) } func sendResponse(writer http.ResponseWriter, response any) { b, err := json.Marshal(response) if err != nil { log.Warningf("Failed with error (%v) to marshal response: %+v", err, response) writer.WriteHeader(http.StatusInternalServerError) return } writer.WriteHeader(http.StatusOK) writer.Write(b) } func admitPod(req *admv1beta1.AdmissionRequest) (*admv1beta1.AdmissionResponse, error) { // Verify that the request is indeed a Pod. resource := metav1.GroupVersionResource{Group: "", Version: "v1", Resource: "pods"} if req.Resource != resource { return nil, fmt.Errorf("unexpected resource %+v in pod admission", req.Resource) } // Decode the request into a Pod. pod := &v1.Pod{} if err := json.Unmarshal(req.Object.Raw, pod); err != nil { return nil, fmt.Errorf("failed to decode pod object %s/%s", req.Namespace, req.Name) } // Copy first to change it. podCopy := pod.DeepCopy() updatePod(podCopy) patch, err := createPatch(req.Object.Raw, podCopy) if err != nil { return nil, fmt.Errorf("failed to create patch for pod %s/%s (generatedName: %s)", pod.Namespace, pod.Name, pod.GenerateName) } log.Debugf("Patched pod %s/%s (generateName: %s): %+v", pod.Namespace, pod.Name, pod.GenerateName, podCopy) patchType := admv1beta1.PatchTypeJSONPatch return &admv1beta1.AdmissionResponse{ Allowed: true, Patch: patch, PatchType: &patchType, }, nil } func updatePod(pod *v1.Pod) { gvisor := "gvisor" pod.Spec.RuntimeClassName = &gvisor // We don't run SELinux test for gvisor. // If SELinuxOptions are specified, this is usually for volume test to pass // on SELinux. This can be safely ignored. if pod.Spec.SecurityContext != nil && pod.Spec.SecurityContext.SELinuxOptions != nil { pod.Spec.SecurityContext.SELinuxOptions = nil } for i := range pod.Spec.Containers { c := &pod.Spec.Containers[i] if c.SecurityContext != nil && c.SecurityContext.SELinuxOptions != nil { c.SecurityContext.SELinuxOptions = nil } } for i := range pod.Spec.InitContainers { c := &pod.Spec.InitContainers[i] if c.SecurityContext != nil && c.SecurityContext.SELinuxOptions != nil { c.SecurityContext.SELinuxOptions = nil } } } func createPatch(old []byte, newObj any) ([]byte, error) { new, err := json.Marshal(newObj) if err != nil { return nil, err } patch, err := jsonpatch.CreatePatch(old, new) if err != nil { return nil, err } return json.Marshal(patch) }
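// --- Illustrative example, not part of the original source ---
// A minimal, self-contained sketch of the mutation and patch flow implemented
// by updatePod() and createPatch() above, using only packages this webhook
// already imports. The pod contents below are made up for the example, and
// errors are elided for brevity.
package main

import (
	"encoding/json"
	"fmt"

	"github.com/mattbaird/jsonpatch"
	v1 "k8s.io/api/core/v1"
)

func main() {
	// A pod as it might arrive in the AdmissionRequest (no runtime class set).
	pod := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{Name: "app", Image: "example"}}}}
	old, _ := json.Marshal(pod)

	// Mirror what updatePod() does: force the gVisor runtime class.
	mutated := pod.DeepCopy()
	gvisor := "gvisor"
	mutated.Spec.RuntimeClassName = &gvisor

	// Diff the original and mutated objects into a JSON patch, as createPatch() does.
	raw, _ := json.Marshal(mutated)
	ops, _ := jsonpatch.CreatePatch(old, raw)
	out, _ := json.Marshal(ops)

	// Expected to print something like:
	//   [{"op":"add","path":"/spec/runtimeClassName","value":"gvisor"}]
	fmt.Println(string(out))
}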